diff --git "a/checkpoint-8500/trainer_state.json" "b/checkpoint-8500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-8500/trainer_state.json" @@ -0,0 +1,368729 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.37266734158648673, + "eval_steps": 1000, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.5677195921819358e-05, + "grad_norm": 4.09375, + "learning_rate": 0.00019981493881728382, + "loss": 3.8085, + "step": 1 + }, + { + "epoch": 5.1354391843638716e-05, + "grad_norm": 3.796875, + "learning_rate": 0.00019981466725181818, + "loss": 3.4522, + "step": 2 + }, + { + "epoch": 7.703158776545808e-05, + "grad_norm": 2.765625, + "learning_rate": 0.0001998143954874309, + "loss": 3.3251, + "step": 3 + }, + { + "epoch": 0.00010270878368727743, + "grad_norm": 2.296875, + "learning_rate": 0.00019981412352412255, + "loss": 3.3513, + "step": 4 + }, + { + "epoch": 0.0001283859796090968, + "grad_norm": 2.59375, + "learning_rate": 0.00019981385136189368, + "loss": 3.4416, + "step": 5 + }, + { + "epoch": 0.00015406317553091616, + "grad_norm": 2.125, + "learning_rate": 0.00019981357900074486, + "loss": 3.2113, + "step": 6 + }, + { + "epoch": 0.0001797403714527355, + "grad_norm": 2.28125, + "learning_rate": 0.00019981330644067654, + "loss": 2.8963, + "step": 7 + }, + { + "epoch": 0.00020541756737455487, + "grad_norm": 2.078125, + "learning_rate": 0.00019981303368168932, + "loss": 3.1162, + "step": 8 + }, + { + "epoch": 0.0002310947632963742, + "grad_norm": 2.171875, + "learning_rate": 0.00019981276072378375, + "loss": 3.271, + "step": 9 + }, + { + "epoch": 0.0002567719592181936, + "grad_norm": 1.890625, + "learning_rate": 0.00019981248756696033, + "loss": 2.723, + "step": 10 + }, + { + "epoch": 0.0002824491551400129, + "grad_norm": 2.171875, + "learning_rate": 0.0001998122142112197, + "loss": 2.9188, + "step": 11 + }, + { + "epoch": 0.0003081263510618323, + "grad_norm": 2.1875, + "learning_rate": 0.00019981194065656232, + "loss": 2.8855, + "step": 12 + }, + { + "epoch": 0.00033380354698365165, + "grad_norm": 1.921875, + "learning_rate": 0.00019981166690298877, + "loss": 2.9555, + "step": 13 + }, + { + "epoch": 0.000359480742905471, + "grad_norm": 2.15625, + "learning_rate": 0.00019981139295049952, + "loss": 2.779, + "step": 14 + }, + { + "epoch": 0.0003851579388272904, + "grad_norm": 2.078125, + "learning_rate": 0.00019981111879909525, + "loss": 2.8852, + "step": 15 + }, + { + "epoch": 0.00041083513474910973, + "grad_norm": 2.046875, + "learning_rate": 0.00019981084444877642, + "loss": 2.8717, + "step": 16 + }, + { + "epoch": 0.00043651233067092907, + "grad_norm": 2.203125, + "learning_rate": 0.00019981056989954357, + "loss": 3.0705, + "step": 17 + }, + { + "epoch": 0.0004621895265927484, + "grad_norm": 1.90625, + "learning_rate": 0.0001998102951513973, + "loss": 2.8481, + "step": 18 + }, + { + "epoch": 0.0004878667225145678, + "grad_norm": 1.96875, + "learning_rate": 0.0001998100202043381, + "loss": 2.8879, + "step": 19 + }, + { + "epoch": 0.0005135439184363871, + "grad_norm": 1.7109375, + "learning_rate": 0.00019980974505836655, + "loss": 2.616, + "step": 20 + }, + { + "epoch": 0.0005392211143582065, + "grad_norm": 2.265625, + "learning_rate": 0.00019980946971348323, + "loss": 2.8716, + "step": 21 + }, + { + "epoch": 0.0005648983102800258, + "grad_norm": 1.875, + "learning_rate": 0.00019980919416968859, + "loss": 2.4833, + "step": 22 + }, + { + "epoch": 0.0005905755062018452, + "grad_norm": 6.25, + "learning_rate": 0.0001998089184269833, + "loss": 2.8268, + "step": 23 + }, + { + "epoch": 0.0006162527021236646, + "grad_norm": 1.8359375, + "learning_rate": 0.00019980864248536783, + "loss": 2.7042, + "step": 24 + }, + { + "epoch": 0.000641929898045484, + "grad_norm": 9.5625, + "learning_rate": 0.00019980836634484275, + "loss": 2.6445, + "step": 25 + }, + { + "epoch": 0.0006676070939673033, + "grad_norm": 1.90625, + "learning_rate": 0.00019980809000540858, + "loss": 2.6366, + "step": 26 + }, + { + "epoch": 0.0006932842898891226, + "grad_norm": 1.7578125, + "learning_rate": 0.00019980781346706596, + "loss": 2.9186, + "step": 27 + }, + { + "epoch": 0.000718961485810942, + "grad_norm": 2.125, + "learning_rate": 0.00019980753672981533, + "loss": 2.5831, + "step": 28 + }, + { + "epoch": 0.0007446386817327613, + "grad_norm": 1.8984375, + "learning_rate": 0.00019980725979365732, + "loss": 2.5846, + "step": 29 + }, + { + "epoch": 0.0007703158776545808, + "grad_norm": 1.7265625, + "learning_rate": 0.00019980698265859248, + "loss": 2.5265, + "step": 30 + }, + { + "epoch": 0.0007959930735764001, + "grad_norm": 1.625, + "learning_rate": 0.00019980670532462132, + "loss": 2.612, + "step": 31 + }, + { + "epoch": 0.0008216702694982195, + "grad_norm": 1.78125, + "learning_rate": 0.0001998064277917444, + "loss": 2.4593, + "step": 32 + }, + { + "epoch": 0.0008473474654200388, + "grad_norm": 1.890625, + "learning_rate": 0.0001998061500599623, + "loss": 2.5988, + "step": 33 + }, + { + "epoch": 0.0008730246613418581, + "grad_norm": 1.8828125, + "learning_rate": 0.00019980587212927554, + "loss": 2.737, + "step": 34 + }, + { + "epoch": 0.0008987018572636775, + "grad_norm": 1.6875, + "learning_rate": 0.0001998055939996847, + "loss": 2.6832, + "step": 35 + }, + { + "epoch": 0.0009243790531854968, + "grad_norm": 1.6796875, + "learning_rate": 0.00019980531567119033, + "loss": 2.3475, + "step": 36 + }, + { + "epoch": 0.0009500562491073163, + "grad_norm": 1.78125, + "learning_rate": 0.000199805037143793, + "loss": 2.4184, + "step": 37 + }, + { + "epoch": 0.0009757334450291356, + "grad_norm": 2.203125, + "learning_rate": 0.0001998047584174932, + "loss": 2.5031, + "step": 38 + }, + { + "epoch": 0.001001410640950955, + "grad_norm": 1.75, + "learning_rate": 0.00019980447949229156, + "loss": 2.6544, + "step": 39 + }, + { + "epoch": 0.0010270878368727743, + "grad_norm": 1.9375, + "learning_rate": 0.00019980420036818863, + "loss": 2.534, + "step": 40 + }, + { + "epoch": 0.0010527650327945936, + "grad_norm": 1.578125, + "learning_rate": 0.00019980392104518492, + "loss": 2.3989, + "step": 41 + }, + { + "epoch": 0.001078442228716413, + "grad_norm": 1.5390625, + "learning_rate": 0.00019980364152328103, + "loss": 2.4021, + "step": 42 + }, + { + "epoch": 0.0011041194246382323, + "grad_norm": 3.265625, + "learning_rate": 0.00019980336180247746, + "loss": 2.6585, + "step": 43 + }, + { + "epoch": 0.0011297966205600517, + "grad_norm": 1.7578125, + "learning_rate": 0.00019980308188277484, + "loss": 2.6566, + "step": 44 + }, + { + "epoch": 0.001155473816481871, + "grad_norm": 1.4921875, + "learning_rate": 0.00019980280176417367, + "loss": 2.3926, + "step": 45 + }, + { + "epoch": 0.0011811510124036903, + "grad_norm": 1.53125, + "learning_rate": 0.00019980252144667456, + "loss": 2.5069, + "step": 46 + }, + { + "epoch": 0.00120682820832551, + "grad_norm": 1.6640625, + "learning_rate": 0.000199802240930278, + "loss": 2.5065, + "step": 47 + }, + { + "epoch": 0.0012325054042473292, + "grad_norm": 1.6640625, + "learning_rate": 0.0001998019602149846, + "loss": 2.3828, + "step": 48 + }, + { + "epoch": 0.0012581826001691486, + "grad_norm": 1.5, + "learning_rate": 0.00019980167930079494, + "loss": 2.4286, + "step": 49 + }, + { + "epoch": 0.001283859796090968, + "grad_norm": 1.65625, + "learning_rate": 0.0001998013981877095, + "loss": 2.5382, + "step": 50 + }, + { + "epoch": 0.0013095369920127873, + "grad_norm": 1.5234375, + "learning_rate": 0.00019980111687572892, + "loss": 2.4564, + "step": 51 + }, + { + "epoch": 0.0013352141879346066, + "grad_norm": 1.640625, + "learning_rate": 0.0001998008353648537, + "loss": 2.5861, + "step": 52 + }, + { + "epoch": 0.001360891383856426, + "grad_norm": 1.5234375, + "learning_rate": 0.00019980055365508448, + "loss": 2.4832, + "step": 53 + }, + { + "epoch": 0.0013865685797782453, + "grad_norm": 1.46875, + "learning_rate": 0.00019980027174642176, + "loss": 2.1409, + "step": 54 + }, + { + "epoch": 0.0014122457757000646, + "grad_norm": 1.6015625, + "learning_rate": 0.00019979998963886607, + "loss": 2.4651, + "step": 55 + }, + { + "epoch": 0.001437922971621884, + "grad_norm": 1.5234375, + "learning_rate": 0.00019979970733241805, + "loss": 2.6257, + "step": 56 + }, + { + "epoch": 0.0014636001675437033, + "grad_norm": 1.484375, + "learning_rate": 0.0001997994248270782, + "loss": 2.4016, + "step": 57 + }, + { + "epoch": 0.0014892773634655227, + "grad_norm": 1.546875, + "learning_rate": 0.00019979914212284717, + "loss": 2.4857, + "step": 58 + }, + { + "epoch": 0.001514954559387342, + "grad_norm": 1.8515625, + "learning_rate": 0.0001997988592197254, + "loss": 2.5518, + "step": 59 + }, + { + "epoch": 0.0015406317553091616, + "grad_norm": 1.671875, + "learning_rate": 0.00019979857611771357, + "loss": 2.4377, + "step": 60 + }, + { + "epoch": 0.001566308951230981, + "grad_norm": 1.515625, + "learning_rate": 0.00019979829281681214, + "loss": 2.3236, + "step": 61 + }, + { + "epoch": 0.0015919861471528002, + "grad_norm": 1.40625, + "learning_rate": 0.00019979800931702176, + "loss": 2.4122, + "step": 62 + }, + { + "epoch": 0.0016176633430746196, + "grad_norm": 1.4921875, + "learning_rate": 0.00019979772561834298, + "loss": 2.4593, + "step": 63 + }, + { + "epoch": 0.001643340538996439, + "grad_norm": 1.6796875, + "learning_rate": 0.00019979744172077631, + "loss": 2.4078, + "step": 64 + }, + { + "epoch": 0.0016690177349182583, + "grad_norm": 1.59375, + "learning_rate": 0.00019979715762432235, + "loss": 2.2705, + "step": 65 + }, + { + "epoch": 0.0016946949308400776, + "grad_norm": 1.5390625, + "learning_rate": 0.00019979687332898172, + "loss": 2.2347, + "step": 66 + }, + { + "epoch": 0.001720372126761897, + "grad_norm": 1.4453125, + "learning_rate": 0.00019979658883475488, + "loss": 2.254, + "step": 67 + }, + { + "epoch": 0.0017460493226837163, + "grad_norm": 1.4296875, + "learning_rate": 0.0001997963041416425, + "loss": 2.1997, + "step": 68 + }, + { + "epoch": 0.0017717265186055356, + "grad_norm": 1.375, + "learning_rate": 0.00019979601924964506, + "loss": 2.1757, + "step": 69 + }, + { + "epoch": 0.001797403714527355, + "grad_norm": 1.3671875, + "learning_rate": 0.00019979573415876317, + "loss": 2.3028, + "step": 70 + }, + { + "epoch": 0.0018230809104491743, + "grad_norm": 1.390625, + "learning_rate": 0.00019979544886899743, + "loss": 2.3168, + "step": 71 + }, + { + "epoch": 0.0018487581063709937, + "grad_norm": 1.4609375, + "learning_rate": 0.00019979516338034835, + "loss": 2.2942, + "step": 72 + }, + { + "epoch": 0.0018744353022928132, + "grad_norm": 1.46875, + "learning_rate": 0.00019979487769281653, + "loss": 2.3045, + "step": 73 + }, + { + "epoch": 0.0019001124982146326, + "grad_norm": 1.4921875, + "learning_rate": 0.00019979459180640254, + "loss": 2.2598, + "step": 74 + }, + { + "epoch": 0.001925789694136452, + "grad_norm": 1.5859375, + "learning_rate": 0.00019979430572110692, + "loss": 2.2965, + "step": 75 + }, + { + "epoch": 0.0019514668900582712, + "grad_norm": 1.4609375, + "learning_rate": 0.00019979401943693027, + "loss": 2.1196, + "step": 76 + }, + { + "epoch": 0.0019771440859800904, + "grad_norm": 1.53125, + "learning_rate": 0.0001997937329538732, + "loss": 2.166, + "step": 77 + }, + { + "epoch": 0.00200282128190191, + "grad_norm": 1.65625, + "learning_rate": 0.00019979344627193618, + "loss": 2.4396, + "step": 78 + }, + { + "epoch": 0.0020284984778237295, + "grad_norm": 1.4140625, + "learning_rate": 0.00019979315939111985, + "loss": 2.2294, + "step": 79 + }, + { + "epoch": 0.0020541756737455486, + "grad_norm": 1.4453125, + "learning_rate": 0.00019979287231142477, + "loss": 2.1996, + "step": 80 + }, + { + "epoch": 0.002079852869667368, + "grad_norm": 1.359375, + "learning_rate": 0.00019979258503285152, + "loss": 2.0635, + "step": 81 + }, + { + "epoch": 0.0021055300655891873, + "grad_norm": 1.3984375, + "learning_rate": 0.00019979229755540066, + "loss": 2.2769, + "step": 82 + }, + { + "epoch": 0.002131207261511007, + "grad_norm": 1.546875, + "learning_rate": 0.00019979200987907273, + "loss": 2.3905, + "step": 83 + }, + { + "epoch": 0.002156884457432826, + "grad_norm": 1.4140625, + "learning_rate": 0.0001997917220038684, + "loss": 2.2679, + "step": 84 + }, + { + "epoch": 0.0021825616533546455, + "grad_norm": 1.3828125, + "learning_rate": 0.00019979143392978812, + "loss": 2.1311, + "step": 85 + }, + { + "epoch": 0.0022082388492764646, + "grad_norm": 1.4765625, + "learning_rate": 0.00019979114565683257, + "loss": 2.4163, + "step": 86 + }, + { + "epoch": 0.002233916045198284, + "grad_norm": 1.4609375, + "learning_rate": 0.00019979085718500227, + "loss": 2.3178, + "step": 87 + }, + { + "epoch": 0.0022595932411201033, + "grad_norm": 1.5546875, + "learning_rate": 0.0001997905685142978, + "loss": 2.2761, + "step": 88 + }, + { + "epoch": 0.002285270437041923, + "grad_norm": 1.484375, + "learning_rate": 0.00019979027964471974, + "loss": 2.268, + "step": 89 + }, + { + "epoch": 0.002310947632963742, + "grad_norm": 1.3671875, + "learning_rate": 0.00019978999057626866, + "loss": 2.3155, + "step": 90 + }, + { + "epoch": 0.0023366248288855616, + "grad_norm": 1.390625, + "learning_rate": 0.00019978970130894516, + "loss": 2.1695, + "step": 91 + }, + { + "epoch": 0.0023623020248073807, + "grad_norm": 1.5625, + "learning_rate": 0.00019978941184274978, + "loss": 2.0385, + "step": 92 + }, + { + "epoch": 0.0023879792207292003, + "grad_norm": 1.453125, + "learning_rate": 0.00019978912217768314, + "loss": 2.1725, + "step": 93 + }, + { + "epoch": 0.00241365641665102, + "grad_norm": 1.453125, + "learning_rate": 0.0001997888323137458, + "loss": 2.1515, + "step": 94 + }, + { + "epoch": 0.002439333612572839, + "grad_norm": 1.4375, + "learning_rate": 0.00019978854225093828, + "loss": 2.2315, + "step": 95 + }, + { + "epoch": 0.0024650108084946585, + "grad_norm": 1.625, + "learning_rate": 0.00019978825198926125, + "loss": 2.113, + "step": 96 + }, + { + "epoch": 0.0024906880044164776, + "grad_norm": 1.515625, + "learning_rate": 0.00019978796152871523, + "loss": 2.3379, + "step": 97 + }, + { + "epoch": 0.002516365200338297, + "grad_norm": 1.5390625, + "learning_rate": 0.00019978767086930083, + "loss": 2.0741, + "step": 98 + }, + { + "epoch": 0.0025420423962601163, + "grad_norm": 1.3984375, + "learning_rate": 0.0001997873800110186, + "loss": 2.2404, + "step": 99 + }, + { + "epoch": 0.002567719592181936, + "grad_norm": 1.5, + "learning_rate": 0.00019978708895386916, + "loss": 2.1393, + "step": 100 + }, + { + "epoch": 0.002593396788103755, + "grad_norm": 1.375, + "learning_rate": 0.00019978679769785306, + "loss": 2.1815, + "step": 101 + }, + { + "epoch": 0.0026190739840255745, + "grad_norm": 1.3828125, + "learning_rate": 0.00019978650624297086, + "loss": 2.1449, + "step": 102 + }, + { + "epoch": 0.0026447511799473937, + "grad_norm": 1.4921875, + "learning_rate": 0.0001997862145892232, + "loss": 2.2345, + "step": 103 + }, + { + "epoch": 0.0026704283758692132, + "grad_norm": 1.40625, + "learning_rate": 0.0001997859227366106, + "loss": 2.119, + "step": 104 + }, + { + "epoch": 0.0026961055717910323, + "grad_norm": 1.4609375, + "learning_rate": 0.0001997856306851337, + "loss": 2.2419, + "step": 105 + }, + { + "epoch": 0.002721782767712852, + "grad_norm": 1.3515625, + "learning_rate": 0.000199785338434793, + "loss": 2.0232, + "step": 106 + }, + { + "epoch": 0.0027474599636346715, + "grad_norm": 1.390625, + "learning_rate": 0.00019978504598558918, + "loss": 2.214, + "step": 107 + }, + { + "epoch": 0.0027731371595564906, + "grad_norm": 1.3125, + "learning_rate": 0.00019978475333752279, + "loss": 2.0411, + "step": 108 + }, + { + "epoch": 0.00279881435547831, + "grad_norm": 1.3828125, + "learning_rate": 0.00019978446049059436, + "loss": 2.0352, + "step": 109 + }, + { + "epoch": 0.0028244915514001293, + "grad_norm": 1.4375, + "learning_rate": 0.00019978416744480452, + "loss": 2.2534, + "step": 110 + }, + { + "epoch": 0.002850168747321949, + "grad_norm": 1.4765625, + "learning_rate": 0.00019978387420015387, + "loss": 2.3404, + "step": 111 + }, + { + "epoch": 0.002875845943243768, + "grad_norm": 1.3984375, + "learning_rate": 0.00019978358075664295, + "loss": 2.3274, + "step": 112 + }, + { + "epoch": 0.0029015231391655875, + "grad_norm": 1.2890625, + "learning_rate": 0.00019978328711427236, + "loss": 2.1794, + "step": 113 + }, + { + "epoch": 0.0029272003350874066, + "grad_norm": 1.5, + "learning_rate": 0.00019978299327304272, + "loss": 2.3423, + "step": 114 + }, + { + "epoch": 0.002952877531009226, + "grad_norm": 1.4921875, + "learning_rate": 0.00019978269923295457, + "loss": 2.0839, + "step": 115 + }, + { + "epoch": 0.0029785547269310453, + "grad_norm": 1.21875, + "learning_rate": 0.00019978240499400854, + "loss": 1.8905, + "step": 116 + }, + { + "epoch": 0.003004231922852865, + "grad_norm": 1.3515625, + "learning_rate": 0.00019978211055620516, + "loss": 2.1838, + "step": 117 + }, + { + "epoch": 0.003029909118774684, + "grad_norm": 1.40625, + "learning_rate": 0.00019978181591954507, + "loss": 2.1985, + "step": 118 + }, + { + "epoch": 0.0030555863146965036, + "grad_norm": 1.328125, + "learning_rate": 0.00019978152108402882, + "loss": 2.0684, + "step": 119 + }, + { + "epoch": 0.003081263510618323, + "grad_norm": 1.4765625, + "learning_rate": 0.000199781226049657, + "loss": 2.0323, + "step": 120 + }, + { + "epoch": 0.0031069407065401422, + "grad_norm": 1.3671875, + "learning_rate": 0.00019978093081643021, + "loss": 1.9744, + "step": 121 + }, + { + "epoch": 0.003132617902461962, + "grad_norm": 1.3203125, + "learning_rate": 0.00019978063538434905, + "loss": 2.0766, + "step": 122 + }, + { + "epoch": 0.003158295098383781, + "grad_norm": 1.3203125, + "learning_rate": 0.0001997803397534141, + "loss": 2.1999, + "step": 123 + }, + { + "epoch": 0.0031839722943056005, + "grad_norm": 1.328125, + "learning_rate": 0.00019978004392362595, + "loss": 1.9197, + "step": 124 + }, + { + "epoch": 0.0032096494902274196, + "grad_norm": 1.3828125, + "learning_rate": 0.00019977974789498517, + "loss": 2.1476, + "step": 125 + }, + { + "epoch": 0.003235326686149239, + "grad_norm": 1.3203125, + "learning_rate": 0.00019977945166749236, + "loss": 2.1268, + "step": 126 + }, + { + "epoch": 0.0032610038820710583, + "grad_norm": 1.28125, + "learning_rate": 0.00019977915524114814, + "loss": 2.0392, + "step": 127 + }, + { + "epoch": 0.003286681077992878, + "grad_norm": 1.359375, + "learning_rate": 0.00019977885861595306, + "loss": 2.0557, + "step": 128 + }, + { + "epoch": 0.003312358273914697, + "grad_norm": 1.4765625, + "learning_rate": 0.00019977856179190774, + "loss": 2.2573, + "step": 129 + }, + { + "epoch": 0.0033380354698365165, + "grad_norm": 1.4375, + "learning_rate": 0.00019977826476901277, + "loss": 2.1071, + "step": 130 + }, + { + "epoch": 0.0033637126657583357, + "grad_norm": 1.2734375, + "learning_rate": 0.0001997779675472687, + "loss": 1.8169, + "step": 131 + }, + { + "epoch": 0.003389389861680155, + "grad_norm": 1.3046875, + "learning_rate": 0.00019977767012667618, + "loss": 2.4099, + "step": 132 + }, + { + "epoch": 0.0034150670576019748, + "grad_norm": 1.34375, + "learning_rate": 0.00019977737250723575, + "loss": 1.929, + "step": 133 + }, + { + "epoch": 0.003440744253523794, + "grad_norm": 1.2734375, + "learning_rate": 0.00019977707468894806, + "loss": 1.9606, + "step": 134 + }, + { + "epoch": 0.0034664214494456135, + "grad_norm": 1.3828125, + "learning_rate": 0.00019977677667181365, + "loss": 2.0147, + "step": 135 + }, + { + "epoch": 0.0034920986453674326, + "grad_norm": 1.34375, + "learning_rate": 0.00019977647845583315, + "loss": 2.0981, + "step": 136 + }, + { + "epoch": 0.003517775841289252, + "grad_norm": 1.40625, + "learning_rate": 0.00019977618004100712, + "loss": 2.3351, + "step": 137 + }, + { + "epoch": 0.0035434530372110713, + "grad_norm": 1.2890625, + "learning_rate": 0.0001997758814273362, + "loss": 1.9093, + "step": 138 + }, + { + "epoch": 0.003569130233132891, + "grad_norm": 1.484375, + "learning_rate": 0.00019977558261482093, + "loss": 1.9715, + "step": 139 + }, + { + "epoch": 0.00359480742905471, + "grad_norm": 1.296875, + "learning_rate": 0.00019977528360346197, + "loss": 2.0464, + "step": 140 + }, + { + "epoch": 0.0036204846249765295, + "grad_norm": 1.3125, + "learning_rate": 0.0001997749843932599, + "loss": 2.1074, + "step": 141 + }, + { + "epoch": 0.0036461618208983486, + "grad_norm": 1.3359375, + "learning_rate": 0.00019977468498421527, + "loss": 2.0343, + "step": 142 + }, + { + "epoch": 0.003671839016820168, + "grad_norm": 1.203125, + "learning_rate": 0.00019977438537632867, + "loss": 2.0005, + "step": 143 + }, + { + "epoch": 0.0036975162127419873, + "grad_norm": 1.3828125, + "learning_rate": 0.00019977408556960078, + "loss": 2.0801, + "step": 144 + }, + { + "epoch": 0.003723193408663807, + "grad_norm": 1.2421875, + "learning_rate": 0.00019977378556403213, + "loss": 2.2685, + "step": 145 + }, + { + "epoch": 0.0037488706045856264, + "grad_norm": 1.328125, + "learning_rate": 0.00019977348535962336, + "loss": 1.9362, + "step": 146 + }, + { + "epoch": 0.0037745478005074455, + "grad_norm": 1.21875, + "learning_rate": 0.000199773184956375, + "loss": 1.9542, + "step": 147 + }, + { + "epoch": 0.003800224996429265, + "grad_norm": 1.3359375, + "learning_rate": 0.00019977288435428774, + "loss": 2.0299, + "step": 148 + }, + { + "epoch": 0.0038259021923510842, + "grad_norm": 1.328125, + "learning_rate": 0.00019977258355336212, + "loss": 2.0426, + "step": 149 + }, + { + "epoch": 0.003851579388272904, + "grad_norm": 1.453125, + "learning_rate": 0.00019977228255359876, + "loss": 2.096, + "step": 150 + }, + { + "epoch": 0.003877256584194723, + "grad_norm": 1.59375, + "learning_rate": 0.00019977198135499825, + "loss": 2.1197, + "step": 151 + }, + { + "epoch": 0.0039029337801165425, + "grad_norm": 1.296875, + "learning_rate": 0.00019977167995756119, + "loss": 2.2025, + "step": 152 + }, + { + "epoch": 0.003928610976038362, + "grad_norm": 1.234375, + "learning_rate": 0.0001997713783612882, + "loss": 1.8936, + "step": 153 + }, + { + "epoch": 0.003954288171960181, + "grad_norm": 1.3359375, + "learning_rate": 0.0001997710765661798, + "loss": 2.1434, + "step": 154 + }, + { + "epoch": 0.003979965367882, + "grad_norm": 1.21875, + "learning_rate": 0.00019977077457223672, + "loss": 2.0932, + "step": 155 + }, + { + "epoch": 0.00400564256380382, + "grad_norm": 1.2421875, + "learning_rate": 0.0001997704723794595, + "loss": 1.9591, + "step": 156 + }, + { + "epoch": 0.004031319759725639, + "grad_norm": 1.3125, + "learning_rate": 0.00019977016998784871, + "loss": 2.0054, + "step": 157 + }, + { + "epoch": 0.004056996955647459, + "grad_norm": 1.2109375, + "learning_rate": 0.00019976986739740502, + "loss": 2.1367, + "step": 158 + }, + { + "epoch": 0.004082674151569278, + "grad_norm": 1.25, + "learning_rate": 0.00019976956460812897, + "loss": 1.909, + "step": 159 + }, + { + "epoch": 0.004108351347491097, + "grad_norm": 1.3125, + "learning_rate": 0.0001997692616200212, + "loss": 2.1406, + "step": 160 + }, + { + "epoch": 0.004134028543412917, + "grad_norm": 1.2421875, + "learning_rate": 0.0001997689584330823, + "loss": 1.8451, + "step": 161 + }, + { + "epoch": 0.004159705739334736, + "grad_norm": 1.7109375, + "learning_rate": 0.00019976865504731288, + "loss": 2.2586, + "step": 162 + }, + { + "epoch": 0.004185382935256555, + "grad_norm": 1.359375, + "learning_rate": 0.00019976835146271352, + "loss": 1.8745, + "step": 163 + }, + { + "epoch": 0.0042110601311783746, + "grad_norm": 1.3984375, + "learning_rate": 0.00019976804767928488, + "loss": 1.9142, + "step": 164 + }, + { + "epoch": 0.004236737327100194, + "grad_norm": 1.5234375, + "learning_rate": 0.00019976774369702752, + "loss": 2.0619, + "step": 165 + }, + { + "epoch": 0.004262414523022014, + "grad_norm": 1.3515625, + "learning_rate": 0.00019976743951594206, + "loss": 1.9781, + "step": 166 + }, + { + "epoch": 0.004288091718943832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001997671351360291, + "loss": 1.8087, + "step": 167 + }, + { + "epoch": 0.004313768914865652, + "grad_norm": 1.3203125, + "learning_rate": 0.00019976683055728924, + "loss": 2.0817, + "step": 168 + }, + { + "epoch": 0.0043394461107874715, + "grad_norm": 1.390625, + "learning_rate": 0.00019976652577972313, + "loss": 2.1426, + "step": 169 + }, + { + "epoch": 0.004365123306709291, + "grad_norm": 1.28125, + "learning_rate": 0.00019976622080333133, + "loss": 1.9369, + "step": 170 + }, + { + "epoch": 0.004390800502631111, + "grad_norm": 1.2734375, + "learning_rate": 0.00019976591562811448, + "loss": 2.0434, + "step": 171 + }, + { + "epoch": 0.004416477698552929, + "grad_norm": 1.2578125, + "learning_rate": 0.00019976561025407315, + "loss": 2.2161, + "step": 172 + }, + { + "epoch": 0.004442154894474749, + "grad_norm": 1.390625, + "learning_rate": 0.00019976530468120797, + "loss": 1.977, + "step": 173 + }, + { + "epoch": 0.004467832090396568, + "grad_norm": 1.3125, + "learning_rate": 0.00019976499890951956, + "loss": 1.9103, + "step": 174 + }, + { + "epoch": 0.004493509286318388, + "grad_norm": 1.3046875, + "learning_rate": 0.0001997646929390085, + "loss": 1.8608, + "step": 175 + }, + { + "epoch": 0.004519186482240207, + "grad_norm": 1.1953125, + "learning_rate": 0.00019976438676967543, + "loss": 1.8129, + "step": 176 + }, + { + "epoch": 0.004544863678162026, + "grad_norm": 1.3828125, + "learning_rate": 0.00019976408040152098, + "loss": 1.9564, + "step": 177 + }, + { + "epoch": 0.004570540874083846, + "grad_norm": 1.2890625, + "learning_rate": 0.0001997637738345457, + "loss": 2.0602, + "step": 178 + }, + { + "epoch": 0.004596218070005665, + "grad_norm": 1.421875, + "learning_rate": 0.00019976346706875023, + "loss": 2.0533, + "step": 179 + }, + { + "epoch": 0.004621895265927484, + "grad_norm": 1.2890625, + "learning_rate": 0.00019976316010413515, + "loss": 2.0526, + "step": 180 + }, + { + "epoch": 0.004647572461849304, + "grad_norm": 1.203125, + "learning_rate": 0.00019976285294070117, + "loss": 1.7198, + "step": 181 + }, + { + "epoch": 0.004673249657771123, + "grad_norm": 2.078125, + "learning_rate": 0.00019976254557844877, + "loss": 2.0407, + "step": 182 + }, + { + "epoch": 0.004698926853692943, + "grad_norm": 1.2421875, + "learning_rate": 0.00019976223801737868, + "loss": 1.9106, + "step": 183 + }, + { + "epoch": 0.004724604049614761, + "grad_norm": 1.3203125, + "learning_rate": 0.00019976193025749144, + "loss": 1.9606, + "step": 184 + }, + { + "epoch": 0.004750281245536581, + "grad_norm": 1.3828125, + "learning_rate": 0.0001997616222987877, + "loss": 2.0454, + "step": 185 + }, + { + "epoch": 0.0047759584414584005, + "grad_norm": 1.3125, + "learning_rate": 0.00019976131414126805, + "loss": 1.7913, + "step": 186 + }, + { + "epoch": 0.00480163563738022, + "grad_norm": 1.3984375, + "learning_rate": 0.00019976100578493312, + "loss": 1.7453, + "step": 187 + }, + { + "epoch": 0.00482731283330204, + "grad_norm": 1.2890625, + "learning_rate": 0.00019976069722978348, + "loss": 1.9642, + "step": 188 + }, + { + "epoch": 0.004852990029223858, + "grad_norm": 1.1953125, + "learning_rate": 0.00019976038847581982, + "loss": 1.9041, + "step": 189 + }, + { + "epoch": 0.004878667225145678, + "grad_norm": 1.328125, + "learning_rate": 0.00019976007952304266, + "loss": 2.0635, + "step": 190 + }, + { + "epoch": 0.004904344421067497, + "grad_norm": 1.4453125, + "learning_rate": 0.00019975977037145276, + "loss": 1.7887, + "step": 191 + }, + { + "epoch": 0.004930021616989317, + "grad_norm": 1.2578125, + "learning_rate": 0.00019975946102105058, + "loss": 1.9701, + "step": 192 + }, + { + "epoch": 0.004955698812911136, + "grad_norm": 1.1953125, + "learning_rate": 0.00019975915147183685, + "loss": 1.8339, + "step": 193 + }, + { + "epoch": 0.004981376008832955, + "grad_norm": 1.21875, + "learning_rate": 0.0001997588417238121, + "loss": 2.1332, + "step": 194 + }, + { + "epoch": 0.005007053204754775, + "grad_norm": 1.2421875, + "learning_rate": 0.00019975853177697702, + "loss": 2.1316, + "step": 195 + }, + { + "epoch": 0.005032730400676594, + "grad_norm": 1.296875, + "learning_rate": 0.0001997582216313322, + "loss": 1.8423, + "step": 196 + }, + { + "epoch": 0.005058407596598413, + "grad_norm": 1.234375, + "learning_rate": 0.00019975791128687824, + "loss": 1.775, + "step": 197 + }, + { + "epoch": 0.005084084792520233, + "grad_norm": 1.1640625, + "learning_rate": 0.00019975760074361577, + "loss": 1.8544, + "step": 198 + }, + { + "epoch": 0.005109761988442052, + "grad_norm": 1.3203125, + "learning_rate": 0.0001997572900015454, + "loss": 2.0779, + "step": 199 + }, + { + "epoch": 0.005135439184363872, + "grad_norm": 1.2890625, + "learning_rate": 0.00019975697906066781, + "loss": 1.9575, + "step": 200 + }, + { + "epoch": 0.005161116380285691, + "grad_norm": 1.2265625, + "learning_rate": 0.00019975666792098357, + "loss": 1.9392, + "step": 201 + }, + { + "epoch": 0.00518679357620751, + "grad_norm": 1.3203125, + "learning_rate": 0.00019975635658249326, + "loss": 2.1059, + "step": 202 + }, + { + "epoch": 0.0052124707721293295, + "grad_norm": 1.234375, + "learning_rate": 0.00019975604504519754, + "loss": 1.9351, + "step": 203 + }, + { + "epoch": 0.005238147968051149, + "grad_norm": 1.2421875, + "learning_rate": 0.00019975573330909704, + "loss": 1.7031, + "step": 204 + }, + { + "epoch": 0.005263825163972969, + "grad_norm": 1.265625, + "learning_rate": 0.0001997554213741924, + "loss": 1.8233, + "step": 205 + }, + { + "epoch": 0.005289502359894787, + "grad_norm": 1.1796875, + "learning_rate": 0.00019975510924048423, + "loss": 1.828, + "step": 206 + }, + { + "epoch": 0.005315179555816607, + "grad_norm": 1.265625, + "learning_rate": 0.0001997547969079731, + "loss": 1.8282, + "step": 207 + }, + { + "epoch": 0.0053408567517384264, + "grad_norm": 1.25, + "learning_rate": 0.00019975448437665967, + "loss": 1.8393, + "step": 208 + }, + { + "epoch": 0.005366533947660246, + "grad_norm": 1.28125, + "learning_rate": 0.00019975417164654456, + "loss": 1.9413, + "step": 209 + }, + { + "epoch": 0.005392211143582065, + "grad_norm": 1.28125, + "learning_rate": 0.00019975385871762844, + "loss": 1.9984, + "step": 210 + }, + { + "epoch": 0.005417888339503884, + "grad_norm": 1.3046875, + "learning_rate": 0.00019975354558991187, + "loss": 2.0349, + "step": 211 + }, + { + "epoch": 0.005443565535425704, + "grad_norm": 1.15625, + "learning_rate": 0.0001997532322633955, + "loss": 1.9123, + "step": 212 + }, + { + "epoch": 0.005469242731347523, + "grad_norm": 1.2109375, + "learning_rate": 0.0001997529187380799, + "loss": 2.1437, + "step": 213 + }, + { + "epoch": 0.005494919927269343, + "grad_norm": 1.265625, + "learning_rate": 0.0001997526050139658, + "loss": 1.9852, + "step": 214 + }, + { + "epoch": 0.005520597123191162, + "grad_norm": 1.2109375, + "learning_rate": 0.00019975229109105372, + "loss": 2.0391, + "step": 215 + }, + { + "epoch": 0.005546274319112981, + "grad_norm": 1.1015625, + "learning_rate": 0.0001997519769693444, + "loss": 1.8174, + "step": 216 + }, + { + "epoch": 0.005571951515034801, + "grad_norm": 1.203125, + "learning_rate": 0.00019975166264883833, + "loss": 1.8886, + "step": 217 + }, + { + "epoch": 0.00559762871095662, + "grad_norm": 1.3828125, + "learning_rate": 0.00019975134812953623, + "loss": 2.1096, + "step": 218 + }, + { + "epoch": 0.005623305906878439, + "grad_norm": 1.21875, + "learning_rate": 0.00019975103341143873, + "loss": 2.1911, + "step": 219 + }, + { + "epoch": 0.0056489831028002585, + "grad_norm": 1.25, + "learning_rate": 0.00019975071849454641, + "loss": 1.9302, + "step": 220 + }, + { + "epoch": 0.005674660298722078, + "grad_norm": 1.3125, + "learning_rate": 0.00019975040337885994, + "loss": 2.026, + "step": 221 + }, + { + "epoch": 0.005700337494643898, + "grad_norm": 1.265625, + "learning_rate": 0.0001997500880643799, + "loss": 1.8796, + "step": 222 + }, + { + "epoch": 0.005726014690565716, + "grad_norm": 1.234375, + "learning_rate": 0.00019974977255110695, + "loss": 1.7137, + "step": 223 + }, + { + "epoch": 0.005751691886487536, + "grad_norm": 1.28125, + "learning_rate": 0.0001997494568390417, + "loss": 1.8575, + "step": 224 + }, + { + "epoch": 0.0057773690824093555, + "grad_norm": 1.3203125, + "learning_rate": 0.00019974914092818483, + "loss": 2.0677, + "step": 225 + }, + { + "epoch": 0.005803046278331175, + "grad_norm": 1.265625, + "learning_rate": 0.0001997488248185369, + "loss": 1.7926, + "step": 226 + }, + { + "epoch": 0.005828723474252995, + "grad_norm": 1.390625, + "learning_rate": 0.0001997485085100986, + "loss": 1.9124, + "step": 227 + }, + { + "epoch": 0.005854400670174813, + "grad_norm": 1.3671875, + "learning_rate": 0.00019974819200287052, + "loss": 1.905, + "step": 228 + }, + { + "epoch": 0.005880077866096633, + "grad_norm": 1.4140625, + "learning_rate": 0.0001997478752968533, + "loss": 2.0241, + "step": 229 + }, + { + "epoch": 0.005905755062018452, + "grad_norm": 1.359375, + "learning_rate": 0.00019974755839204756, + "loss": 2.0559, + "step": 230 + }, + { + "epoch": 0.005931432257940272, + "grad_norm": 1.3359375, + "learning_rate": 0.00019974724128845396, + "loss": 2.0247, + "step": 231 + }, + { + "epoch": 0.005957109453862091, + "grad_norm": 1.25, + "learning_rate": 0.00019974692398607314, + "loss": 2.1181, + "step": 232 + }, + { + "epoch": 0.00598278664978391, + "grad_norm": 1.328125, + "learning_rate": 0.0001997466064849057, + "loss": 2.0184, + "step": 233 + }, + { + "epoch": 0.00600846384570573, + "grad_norm": 1.296875, + "learning_rate": 0.00019974628878495225, + "loss": 1.8364, + "step": 234 + }, + { + "epoch": 0.006034141041627549, + "grad_norm": 1.34375, + "learning_rate": 0.0001997459708862135, + "loss": 1.6873, + "step": 235 + }, + { + "epoch": 0.006059818237549368, + "grad_norm": 1.28125, + "learning_rate": 0.00019974565278869003, + "loss": 1.798, + "step": 236 + }, + { + "epoch": 0.0060854954334711876, + "grad_norm": 1.234375, + "learning_rate": 0.00019974533449238245, + "loss": 1.842, + "step": 237 + }, + { + "epoch": 0.006111172629393007, + "grad_norm": 1.296875, + "learning_rate": 0.00019974501599729147, + "loss": 1.7427, + "step": 238 + }, + { + "epoch": 0.006136849825314827, + "grad_norm": 1.265625, + "learning_rate": 0.00019974469730341766, + "loss": 1.9307, + "step": 239 + }, + { + "epoch": 0.006162527021236646, + "grad_norm": 1.203125, + "learning_rate": 0.00019974437841076167, + "loss": 1.8584, + "step": 240 + }, + { + "epoch": 0.006188204217158465, + "grad_norm": 1.2421875, + "learning_rate": 0.00019974405931932418, + "loss": 1.8383, + "step": 241 + }, + { + "epoch": 0.0062138814130802845, + "grad_norm": 1.2734375, + "learning_rate": 0.00019974374002910573, + "loss": 1.9619, + "step": 242 + }, + { + "epoch": 0.006239558609002104, + "grad_norm": 1.3046875, + "learning_rate": 0.00019974342054010708, + "loss": 1.8814, + "step": 243 + }, + { + "epoch": 0.006265235804923924, + "grad_norm": 1.171875, + "learning_rate": 0.00019974310085232877, + "loss": 1.9053, + "step": 244 + }, + { + "epoch": 0.006290913000845742, + "grad_norm": 1.2578125, + "learning_rate": 0.00019974278096577148, + "loss": 1.9791, + "step": 245 + }, + { + "epoch": 0.006316590196767562, + "grad_norm": 1.1796875, + "learning_rate": 0.00019974246088043582, + "loss": 1.8957, + "step": 246 + }, + { + "epoch": 0.006342267392689381, + "grad_norm": 1.2421875, + "learning_rate": 0.0001997421405963225, + "loss": 2.16, + "step": 247 + }, + { + "epoch": 0.006367944588611201, + "grad_norm": 1.3671875, + "learning_rate": 0.00019974182011343205, + "loss": 2.0619, + "step": 248 + }, + { + "epoch": 0.00639362178453302, + "grad_norm": 1.1171875, + "learning_rate": 0.00019974149943176517, + "loss": 1.8667, + "step": 249 + }, + { + "epoch": 0.006419298980454839, + "grad_norm": 1.1484375, + "learning_rate": 0.0001997411785513225, + "loss": 1.7649, + "step": 250 + }, + { + "epoch": 0.006444976176376659, + "grad_norm": 1.171875, + "learning_rate": 0.00019974085747210464, + "loss": 1.8126, + "step": 251 + }, + { + "epoch": 0.006470653372298478, + "grad_norm": 1.21875, + "learning_rate": 0.0001997405361941123, + "loss": 1.7649, + "step": 252 + }, + { + "epoch": 0.006496330568220298, + "grad_norm": 1.21875, + "learning_rate": 0.00019974021471734607, + "loss": 1.9819, + "step": 253 + }, + { + "epoch": 0.006522007764142117, + "grad_norm": 1.234375, + "learning_rate": 0.0001997398930418066, + "loss": 1.8951, + "step": 254 + }, + { + "epoch": 0.006547684960063936, + "grad_norm": 1.1953125, + "learning_rate": 0.00019973957116749453, + "loss": 1.8966, + "step": 255 + }, + { + "epoch": 0.006573362155985756, + "grad_norm": 1.40625, + "learning_rate": 0.0001997392490944105, + "loss": 1.6778, + "step": 256 + }, + { + "epoch": 0.006599039351907575, + "grad_norm": 1.1484375, + "learning_rate": 0.00019973892682255518, + "loss": 1.8668, + "step": 257 + }, + { + "epoch": 0.006624716547829394, + "grad_norm": 1.1796875, + "learning_rate": 0.00019973860435192916, + "loss": 1.8748, + "step": 258 + }, + { + "epoch": 0.0066503937437512135, + "grad_norm": 1.234375, + "learning_rate": 0.00019973828168253312, + "loss": 1.857, + "step": 259 + }, + { + "epoch": 0.006676070939673033, + "grad_norm": 1.4296875, + "learning_rate": 0.0001997379588143677, + "loss": 1.9194, + "step": 260 + }, + { + "epoch": 0.006701748135594853, + "grad_norm": 1.140625, + "learning_rate": 0.00019973763574743353, + "loss": 1.7043, + "step": 261 + }, + { + "epoch": 0.006727425331516671, + "grad_norm": 1.21875, + "learning_rate": 0.00019973731248173126, + "loss": 1.9553, + "step": 262 + }, + { + "epoch": 0.006753102527438491, + "grad_norm": 1.2109375, + "learning_rate": 0.00019973698901726153, + "loss": 1.8621, + "step": 263 + }, + { + "epoch": 0.00677877972336031, + "grad_norm": 1.1875, + "learning_rate": 0.000199736665354025, + "loss": 2.0353, + "step": 264 + }, + { + "epoch": 0.00680445691928213, + "grad_norm": 1.2109375, + "learning_rate": 0.00019973634149202232, + "loss": 1.925, + "step": 265 + }, + { + "epoch": 0.0068301341152039495, + "grad_norm": 1.2890625, + "learning_rate": 0.00019973601743125409, + "loss": 1.8143, + "step": 266 + }, + { + "epoch": 0.006855811311125768, + "grad_norm": 1.234375, + "learning_rate": 0.00019973569317172102, + "loss": 2.0033, + "step": 267 + }, + { + "epoch": 0.006881488507047588, + "grad_norm": 1.21875, + "learning_rate": 0.0001997353687134237, + "loss": 1.684, + "step": 268 + }, + { + "epoch": 0.006907165702969407, + "grad_norm": 1.34375, + "learning_rate": 0.0001997350440563628, + "loss": 2.0488, + "step": 269 + }, + { + "epoch": 0.006932842898891227, + "grad_norm": 1.1796875, + "learning_rate": 0.00019973471920053896, + "loss": 1.7987, + "step": 270 + }, + { + "epoch": 0.006958520094813046, + "grad_norm": 1.1875, + "learning_rate": 0.00019973439414595285, + "loss": 1.9595, + "step": 271 + }, + { + "epoch": 0.006984197290734865, + "grad_norm": 1.2109375, + "learning_rate": 0.00019973406889260508, + "loss": 1.8901, + "step": 272 + }, + { + "epoch": 0.007009874486656685, + "grad_norm": 1.2734375, + "learning_rate": 0.00019973374344049635, + "loss": 1.9371, + "step": 273 + }, + { + "epoch": 0.007035551682578504, + "grad_norm": 1.203125, + "learning_rate": 0.00019973341778962726, + "loss": 1.7949, + "step": 274 + }, + { + "epoch": 0.007061228878500323, + "grad_norm": 1.2890625, + "learning_rate": 0.0001997330919399985, + "loss": 2.1239, + "step": 275 + }, + { + "epoch": 0.0070869060744221425, + "grad_norm": 1.140625, + "learning_rate": 0.00019973276589161065, + "loss": 1.8516, + "step": 276 + }, + { + "epoch": 0.007112583270343962, + "grad_norm": 1.1875, + "learning_rate": 0.00019973243964446443, + "loss": 1.7713, + "step": 277 + }, + { + "epoch": 0.007138260466265782, + "grad_norm": 1.203125, + "learning_rate": 0.00019973211319856046, + "loss": 1.8969, + "step": 278 + }, + { + "epoch": 0.007163937662187601, + "grad_norm": 1.171875, + "learning_rate": 0.0001997317865538994, + "loss": 1.8497, + "step": 279 + }, + { + "epoch": 0.00718961485810942, + "grad_norm": 1.140625, + "learning_rate": 0.00019973145971048192, + "loss": 1.6959, + "step": 280 + }, + { + "epoch": 0.0072152920540312394, + "grad_norm": 1.1484375, + "learning_rate": 0.00019973113266830863, + "loss": 1.8457, + "step": 281 + }, + { + "epoch": 0.007240969249953059, + "grad_norm": 1.25, + "learning_rate": 0.00019973080542738024, + "loss": 1.7543, + "step": 282 + }, + { + "epoch": 0.0072666464458748786, + "grad_norm": 1.203125, + "learning_rate": 0.00019973047798769732, + "loss": 1.8588, + "step": 283 + }, + { + "epoch": 0.007292323641796697, + "grad_norm": 1.203125, + "learning_rate": 0.00019973015034926056, + "loss": 1.9748, + "step": 284 + }, + { + "epoch": 0.007318000837718517, + "grad_norm": 1.1171875, + "learning_rate": 0.00019972982251207063, + "loss": 1.663, + "step": 285 + }, + { + "epoch": 0.007343678033640336, + "grad_norm": 1.15625, + "learning_rate": 0.0001997294944761282, + "loss": 1.8601, + "step": 286 + }, + { + "epoch": 0.007369355229562156, + "grad_norm": 1.1875, + "learning_rate": 0.00019972916624143384, + "loss": 1.9841, + "step": 287 + }, + { + "epoch": 0.007395032425483975, + "grad_norm": 1.1015625, + "learning_rate": 0.0001997288378079883, + "loss": 1.7883, + "step": 288 + }, + { + "epoch": 0.007420709621405794, + "grad_norm": 1.1953125, + "learning_rate": 0.0001997285091757922, + "loss": 1.997, + "step": 289 + }, + { + "epoch": 0.007446386817327614, + "grad_norm": 1.1796875, + "learning_rate": 0.00019972818034484616, + "loss": 1.7257, + "step": 290 + }, + { + "epoch": 0.007472064013249433, + "grad_norm": 1.234375, + "learning_rate": 0.0001997278513151509, + "loss": 1.9261, + "step": 291 + }, + { + "epoch": 0.007497741209171253, + "grad_norm": 1.1796875, + "learning_rate": 0.00019972752208670703, + "loss": 1.9084, + "step": 292 + }, + { + "epoch": 0.0075234184050930715, + "grad_norm": 1.1015625, + "learning_rate": 0.00019972719265951518, + "loss": 1.8128, + "step": 293 + }, + { + "epoch": 0.007549095601014891, + "grad_norm": 1.28125, + "learning_rate": 0.0001997268630335761, + "loss": 1.9687, + "step": 294 + }, + { + "epoch": 0.007574772796936711, + "grad_norm": 1.734375, + "learning_rate": 0.00019972653320889035, + "loss": 1.9329, + "step": 295 + }, + { + "epoch": 0.00760044999285853, + "grad_norm": 1.3828125, + "learning_rate": 0.00019972620318545862, + "loss": 1.7815, + "step": 296 + }, + { + "epoch": 0.007626127188780349, + "grad_norm": 1.4140625, + "learning_rate": 0.0001997258729632816, + "loss": 1.8117, + "step": 297 + }, + { + "epoch": 0.0076518043847021685, + "grad_norm": 1.2265625, + "learning_rate": 0.00019972554254235993, + "loss": 1.7553, + "step": 298 + }, + { + "epoch": 0.007677481580623988, + "grad_norm": 1.25, + "learning_rate": 0.00019972521192269425, + "loss": 1.9957, + "step": 299 + }, + { + "epoch": 0.007703158776545808, + "grad_norm": 1.1875, + "learning_rate": 0.00019972488110428524, + "loss": 1.9145, + "step": 300 + }, + { + "epoch": 0.007728835972467626, + "grad_norm": 1.2109375, + "learning_rate": 0.00019972455008713353, + "loss": 1.9651, + "step": 301 + }, + { + "epoch": 0.007754513168389446, + "grad_norm": 1.2890625, + "learning_rate": 0.00019972421887123982, + "loss": 1.7477, + "step": 302 + }, + { + "epoch": 0.007780190364311265, + "grad_norm": 1.203125, + "learning_rate": 0.00019972388745660474, + "loss": 1.8351, + "step": 303 + }, + { + "epoch": 0.007805867560233085, + "grad_norm": 1.203125, + "learning_rate": 0.00019972355584322896, + "loss": 1.9175, + "step": 304 + }, + { + "epoch": 0.007831544756154904, + "grad_norm": 1.1875, + "learning_rate": 0.00019972322403111314, + "loss": 1.8567, + "step": 305 + }, + { + "epoch": 0.007857221952076724, + "grad_norm": 1.171875, + "learning_rate": 0.00019972289202025795, + "loss": 1.6569, + "step": 306 + }, + { + "epoch": 0.007882899147998543, + "grad_norm": 1.28125, + "learning_rate": 0.00019972255981066402, + "loss": 1.6634, + "step": 307 + }, + { + "epoch": 0.007908576343920361, + "grad_norm": 1.2578125, + "learning_rate": 0.00019972222740233207, + "loss": 1.7974, + "step": 308 + }, + { + "epoch": 0.007934253539842182, + "grad_norm": 1.2421875, + "learning_rate": 0.0001997218947952627, + "loss": 1.6202, + "step": 309 + }, + { + "epoch": 0.007959930735764, + "grad_norm": 1.2578125, + "learning_rate": 0.0001997215619894566, + "loss": 1.9131, + "step": 310 + }, + { + "epoch": 0.007985607931685821, + "grad_norm": 1.109375, + "learning_rate": 0.00019972122898491448, + "loss": 1.7266, + "step": 311 + }, + { + "epoch": 0.00801128512760764, + "grad_norm": 1.109375, + "learning_rate": 0.00019972089578163693, + "loss": 1.9187, + "step": 312 + }, + { + "epoch": 0.008036962323529458, + "grad_norm": 1.1328125, + "learning_rate": 0.00019972056237962463, + "loss": 1.6832, + "step": 313 + }, + { + "epoch": 0.008062639519451279, + "grad_norm": 1.2421875, + "learning_rate": 0.0001997202287788783, + "loss": 1.8736, + "step": 314 + }, + { + "epoch": 0.008088316715373097, + "grad_norm": 1.234375, + "learning_rate": 0.00019971989497939848, + "loss": 1.9551, + "step": 315 + }, + { + "epoch": 0.008113993911294918, + "grad_norm": 1.109375, + "learning_rate": 0.000199719560981186, + "loss": 1.7719, + "step": 316 + }, + { + "epoch": 0.008139671107216737, + "grad_norm": 1.1640625, + "learning_rate": 0.0001997192267842414, + "loss": 1.9235, + "step": 317 + }, + { + "epoch": 0.008165348303138555, + "grad_norm": 1.1171875, + "learning_rate": 0.0001997188923885654, + "loss": 1.9036, + "step": 318 + }, + { + "epoch": 0.008191025499060376, + "grad_norm": 1.1796875, + "learning_rate": 0.00019971855779415867, + "loss": 1.8696, + "step": 319 + }, + { + "epoch": 0.008216702694982194, + "grad_norm": 1.15625, + "learning_rate": 0.00019971822300102182, + "loss": 1.7377, + "step": 320 + }, + { + "epoch": 0.008242379890904013, + "grad_norm": 1.0703125, + "learning_rate": 0.0001997178880091556, + "loss": 1.8273, + "step": 321 + }, + { + "epoch": 0.008268057086825834, + "grad_norm": 1.15625, + "learning_rate": 0.00019971755281856062, + "loss": 1.7127, + "step": 322 + }, + { + "epoch": 0.008293734282747652, + "grad_norm": 1.1796875, + "learning_rate": 0.00019971721742923756, + "loss": 1.8927, + "step": 323 + }, + { + "epoch": 0.008319411478669473, + "grad_norm": 1.1875, + "learning_rate": 0.0001997168818411871, + "loss": 1.7798, + "step": 324 + }, + { + "epoch": 0.008345088674591291, + "grad_norm": 1.15625, + "learning_rate": 0.00019971654605440987, + "loss": 1.7128, + "step": 325 + }, + { + "epoch": 0.00837076587051311, + "grad_norm": 1.1484375, + "learning_rate": 0.00019971621006890664, + "loss": 1.5813, + "step": 326 + }, + { + "epoch": 0.00839644306643493, + "grad_norm": 1.171875, + "learning_rate": 0.00019971587388467797, + "loss": 1.6393, + "step": 327 + }, + { + "epoch": 0.008422120262356749, + "grad_norm": 1.1328125, + "learning_rate": 0.00019971553750172455, + "loss": 1.8425, + "step": 328 + }, + { + "epoch": 0.00844779745827857, + "grad_norm": 1.1484375, + "learning_rate": 0.0001997152009200471, + "loss": 1.7632, + "step": 329 + }, + { + "epoch": 0.008473474654200388, + "grad_norm": 1.171875, + "learning_rate": 0.00019971486413964627, + "loss": 1.8557, + "step": 330 + }, + { + "epoch": 0.008499151850122207, + "grad_norm": 1.203125, + "learning_rate": 0.0001997145271605227, + "loss": 1.8226, + "step": 331 + }, + { + "epoch": 0.008524829046044027, + "grad_norm": 1.15625, + "learning_rate": 0.0001997141899826771, + "loss": 1.6952, + "step": 332 + }, + { + "epoch": 0.008550506241965846, + "grad_norm": 1.140625, + "learning_rate": 0.0001997138526061101, + "loss": 1.7493, + "step": 333 + }, + { + "epoch": 0.008576183437887665, + "grad_norm": 1.265625, + "learning_rate": 0.00019971351503082242, + "loss": 1.7465, + "step": 334 + }, + { + "epoch": 0.008601860633809485, + "grad_norm": 1.1875, + "learning_rate": 0.00019971317725681474, + "loss": 1.7524, + "step": 335 + }, + { + "epoch": 0.008627537829731304, + "grad_norm": 1.296875, + "learning_rate": 0.00019971283928408765, + "loss": 1.817, + "step": 336 + }, + { + "epoch": 0.008653215025653124, + "grad_norm": 1.203125, + "learning_rate": 0.00019971250111264192, + "loss": 1.8899, + "step": 337 + }, + { + "epoch": 0.008678892221574943, + "grad_norm": 1.1015625, + "learning_rate": 0.00019971216274247816, + "loss": 1.6186, + "step": 338 + }, + { + "epoch": 0.008704569417496762, + "grad_norm": 1.1875, + "learning_rate": 0.00019971182417359706, + "loss": 1.81, + "step": 339 + }, + { + "epoch": 0.008730246613418582, + "grad_norm": 1.1484375, + "learning_rate": 0.00019971148540599934, + "loss": 1.8437, + "step": 340 + }, + { + "epoch": 0.0087559238093404, + "grad_norm": 1.1328125, + "learning_rate": 0.0001997111464396856, + "loss": 1.7811, + "step": 341 + }, + { + "epoch": 0.008781601005262221, + "grad_norm": 1.2421875, + "learning_rate": 0.00019971080727465657, + "loss": 1.9023, + "step": 342 + }, + { + "epoch": 0.00880727820118404, + "grad_norm": 1.2109375, + "learning_rate": 0.00019971046791091287, + "loss": 1.8833, + "step": 343 + }, + { + "epoch": 0.008832955397105859, + "grad_norm": 1.1015625, + "learning_rate": 0.00019971012834845526, + "loss": 1.7821, + "step": 344 + }, + { + "epoch": 0.008858632593027679, + "grad_norm": 1.1328125, + "learning_rate": 0.00019970978858728435, + "loss": 1.6469, + "step": 345 + }, + { + "epoch": 0.008884309788949498, + "grad_norm": 1.25, + "learning_rate": 0.00019970944862740083, + "loss": 1.9876, + "step": 346 + }, + { + "epoch": 0.008909986984871316, + "grad_norm": 1.1328125, + "learning_rate": 0.00019970910846880543, + "loss": 1.6967, + "step": 347 + }, + { + "epoch": 0.008935664180793137, + "grad_norm": 1.25, + "learning_rate": 0.00019970876811149871, + "loss": 1.8077, + "step": 348 + }, + { + "epoch": 0.008961341376714956, + "grad_norm": 1.1953125, + "learning_rate": 0.00019970842755548146, + "loss": 1.7945, + "step": 349 + }, + { + "epoch": 0.008987018572636776, + "grad_norm": 1.2734375, + "learning_rate": 0.00019970808680075435, + "loss": 1.7071, + "step": 350 + }, + { + "epoch": 0.009012695768558595, + "grad_norm": 1.1015625, + "learning_rate": 0.000199707745847318, + "loss": 1.7332, + "step": 351 + }, + { + "epoch": 0.009038372964480413, + "grad_norm": 1.1171875, + "learning_rate": 0.00019970740469517313, + "loss": 1.838, + "step": 352 + }, + { + "epoch": 0.009064050160402234, + "grad_norm": 1.1484375, + "learning_rate": 0.0001997070633443204, + "loss": 1.6384, + "step": 353 + }, + { + "epoch": 0.009089727356324052, + "grad_norm": 1.1953125, + "learning_rate": 0.00019970672179476048, + "loss": 1.8566, + "step": 354 + }, + { + "epoch": 0.009115404552245871, + "grad_norm": 1.1953125, + "learning_rate": 0.0001997063800464941, + "loss": 1.7528, + "step": 355 + }, + { + "epoch": 0.009141081748167692, + "grad_norm": 1.1171875, + "learning_rate": 0.00019970603809952193, + "loss": 1.7903, + "step": 356 + }, + { + "epoch": 0.00916675894408951, + "grad_norm": 1.171875, + "learning_rate": 0.00019970569595384458, + "loss": 1.845, + "step": 357 + }, + { + "epoch": 0.00919243614001133, + "grad_norm": 1.1875, + "learning_rate": 0.0001997053536094628, + "loss": 1.7661, + "step": 358 + }, + { + "epoch": 0.00921811333593315, + "grad_norm": 1.1484375, + "learning_rate": 0.00019970501106637728, + "loss": 1.7161, + "step": 359 + }, + { + "epoch": 0.009243790531854968, + "grad_norm": 1.125, + "learning_rate": 0.00019970466832458866, + "loss": 1.7202, + "step": 360 + }, + { + "epoch": 0.009269467727776788, + "grad_norm": 1.25, + "learning_rate": 0.00019970432538409763, + "loss": 1.7726, + "step": 361 + }, + { + "epoch": 0.009295144923698607, + "grad_norm": 1.203125, + "learning_rate": 0.00019970398224490493, + "loss": 1.7279, + "step": 362 + }, + { + "epoch": 0.009320822119620428, + "grad_norm": 1.0859375, + "learning_rate": 0.00019970363890701117, + "loss": 1.6867, + "step": 363 + }, + { + "epoch": 0.009346499315542246, + "grad_norm": 1.140625, + "learning_rate": 0.00019970329537041709, + "loss": 1.5235, + "step": 364 + }, + { + "epoch": 0.009372176511464065, + "grad_norm": 1.03125, + "learning_rate": 0.0001997029516351233, + "loss": 1.6704, + "step": 365 + }, + { + "epoch": 0.009397853707385885, + "grad_norm": 1.1640625, + "learning_rate": 0.00019970260770113055, + "loss": 1.8011, + "step": 366 + }, + { + "epoch": 0.009423530903307704, + "grad_norm": 1.140625, + "learning_rate": 0.00019970226356843954, + "loss": 1.7362, + "step": 367 + }, + { + "epoch": 0.009449208099229523, + "grad_norm": 1.1640625, + "learning_rate": 0.0001997019192370509, + "loss": 1.5848, + "step": 368 + }, + { + "epoch": 0.009474885295151343, + "grad_norm": 1.1875, + "learning_rate": 0.00019970157470696533, + "loss": 1.8278, + "step": 369 + }, + { + "epoch": 0.009500562491073162, + "grad_norm": 1.09375, + "learning_rate": 0.00019970122997818354, + "loss": 1.8764, + "step": 370 + }, + { + "epoch": 0.009526239686994982, + "grad_norm": 1.1484375, + "learning_rate": 0.0001997008850507062, + "loss": 1.9372, + "step": 371 + }, + { + "epoch": 0.009551916882916801, + "grad_norm": 1.1484375, + "learning_rate": 0.000199700539924534, + "loss": 1.8385, + "step": 372 + }, + { + "epoch": 0.00957759407883862, + "grad_norm": 1.0859375, + "learning_rate": 0.00019970019459966765, + "loss": 1.8667, + "step": 373 + }, + { + "epoch": 0.00960327127476044, + "grad_norm": 1.171875, + "learning_rate": 0.0001996998490761078, + "loss": 1.6162, + "step": 374 + }, + { + "epoch": 0.009628948470682259, + "grad_norm": 1.0859375, + "learning_rate": 0.00019969950335385517, + "loss": 1.5868, + "step": 375 + }, + { + "epoch": 0.00965462566660408, + "grad_norm": 1.84375, + "learning_rate": 0.00019969915743291044, + "loss": 1.8162, + "step": 376 + }, + { + "epoch": 0.009680302862525898, + "grad_norm": 1.25, + "learning_rate": 0.0001996988113132743, + "loss": 2.1175, + "step": 377 + }, + { + "epoch": 0.009705980058447717, + "grad_norm": 1.1953125, + "learning_rate": 0.00019969846499494738, + "loss": 1.793, + "step": 378 + }, + { + "epoch": 0.009731657254369537, + "grad_norm": 1.171875, + "learning_rate": 0.00019969811847793048, + "loss": 1.6211, + "step": 379 + }, + { + "epoch": 0.009757334450291356, + "grad_norm": 1.1953125, + "learning_rate": 0.0001996977717622242, + "loss": 1.7763, + "step": 380 + }, + { + "epoch": 0.009783011646213174, + "grad_norm": 1.1953125, + "learning_rate": 0.0001996974248478293, + "loss": 1.6399, + "step": 381 + }, + { + "epoch": 0.009808688842134995, + "grad_norm": 1.171875, + "learning_rate": 0.0001996970777347464, + "loss": 1.7743, + "step": 382 + }, + { + "epoch": 0.009834366038056814, + "grad_norm": 1.078125, + "learning_rate": 0.00019969673042297627, + "loss": 1.6742, + "step": 383 + }, + { + "epoch": 0.009860043233978634, + "grad_norm": 1.1484375, + "learning_rate": 0.00019969638291251953, + "loss": 1.7875, + "step": 384 + }, + { + "epoch": 0.009885720429900453, + "grad_norm": 1.1796875, + "learning_rate": 0.00019969603520337694, + "loss": 1.7759, + "step": 385 + }, + { + "epoch": 0.009911397625822271, + "grad_norm": 1.203125, + "learning_rate": 0.00019969568729554912, + "loss": 1.9311, + "step": 386 + }, + { + "epoch": 0.009937074821744092, + "grad_norm": 1.1484375, + "learning_rate": 0.00019969533918903683, + "loss": 1.8094, + "step": 387 + }, + { + "epoch": 0.00996275201766591, + "grad_norm": 1.046875, + "learning_rate": 0.0001996949908838407, + "loss": 1.7112, + "step": 388 + }, + { + "epoch": 0.009988429213587731, + "grad_norm": 1.03125, + "learning_rate": 0.0001996946423799615, + "loss": 1.55, + "step": 389 + }, + { + "epoch": 0.01001410640950955, + "grad_norm": 1.09375, + "learning_rate": 0.00019969429367739988, + "loss": 1.4763, + "step": 390 + }, + { + "epoch": 0.010039783605431368, + "grad_norm": 1.15625, + "learning_rate": 0.0001996939447761565, + "loss": 1.6551, + "step": 391 + }, + { + "epoch": 0.010065460801353189, + "grad_norm": 1.1640625, + "learning_rate": 0.0001996935956762321, + "loss": 1.9569, + "step": 392 + }, + { + "epoch": 0.010091137997275007, + "grad_norm": 1.09375, + "learning_rate": 0.00019969324637762742, + "loss": 1.7528, + "step": 393 + }, + { + "epoch": 0.010116815193196826, + "grad_norm": 1.046875, + "learning_rate": 0.00019969289688034306, + "loss": 1.6146, + "step": 394 + }, + { + "epoch": 0.010142492389118647, + "grad_norm": 1.078125, + "learning_rate": 0.00019969254718437974, + "loss": 1.7426, + "step": 395 + }, + { + "epoch": 0.010168169585040465, + "grad_norm": 1.15625, + "learning_rate": 0.00019969219728973821, + "loss": 1.7774, + "step": 396 + }, + { + "epoch": 0.010193846780962286, + "grad_norm": 1.1640625, + "learning_rate": 0.00019969184719641913, + "loss": 1.7462, + "step": 397 + }, + { + "epoch": 0.010219523976884104, + "grad_norm": 1.046875, + "learning_rate": 0.00019969149690442323, + "loss": 1.6467, + "step": 398 + }, + { + "epoch": 0.010245201172805923, + "grad_norm": 1.046875, + "learning_rate": 0.00019969114641375116, + "loss": 1.7966, + "step": 399 + }, + { + "epoch": 0.010270878368727743, + "grad_norm": 1.1953125, + "learning_rate": 0.00019969079572440364, + "loss": 1.7212, + "step": 400 + }, + { + "epoch": 0.010296555564649562, + "grad_norm": 1.140625, + "learning_rate": 0.00019969044483638136, + "loss": 1.8428, + "step": 401 + }, + { + "epoch": 0.010322232760571383, + "grad_norm": 1.1640625, + "learning_rate": 0.00019969009374968505, + "loss": 1.8797, + "step": 402 + }, + { + "epoch": 0.010347909956493201, + "grad_norm": 1.0859375, + "learning_rate": 0.0001996897424643154, + "loss": 1.7703, + "step": 403 + }, + { + "epoch": 0.01037358715241502, + "grad_norm": 1.1640625, + "learning_rate": 0.00019968939098027305, + "loss": 1.6691, + "step": 404 + }, + { + "epoch": 0.01039926434833684, + "grad_norm": 1.1015625, + "learning_rate": 0.00019968903929755877, + "loss": 1.7057, + "step": 405 + }, + { + "epoch": 0.010424941544258659, + "grad_norm": 1.1796875, + "learning_rate": 0.00019968868741617325, + "loss": 1.583, + "step": 406 + }, + { + "epoch": 0.010450618740180478, + "grad_norm": 1.1484375, + "learning_rate": 0.00019968833533611714, + "loss": 1.6487, + "step": 407 + }, + { + "epoch": 0.010476295936102298, + "grad_norm": 1.1015625, + "learning_rate": 0.00019968798305739123, + "loss": 1.6445, + "step": 408 + }, + { + "epoch": 0.010501973132024117, + "grad_norm": 1.09375, + "learning_rate": 0.00019968763057999617, + "loss": 1.7771, + "step": 409 + }, + { + "epoch": 0.010527650327945937, + "grad_norm": 1.125, + "learning_rate": 0.00019968727790393266, + "loss": 1.7675, + "step": 410 + }, + { + "epoch": 0.010553327523867756, + "grad_norm": 1.0234375, + "learning_rate": 0.0001996869250292014, + "loss": 1.6663, + "step": 411 + }, + { + "epoch": 0.010579004719789575, + "grad_norm": 1.1953125, + "learning_rate": 0.0001996865719558031, + "loss": 1.7841, + "step": 412 + }, + { + "epoch": 0.010604681915711395, + "grad_norm": 1.15625, + "learning_rate": 0.0001996862186837385, + "loss": 1.8568, + "step": 413 + }, + { + "epoch": 0.010630359111633214, + "grad_norm": 1.1875, + "learning_rate": 0.00019968586521300824, + "loss": 1.8833, + "step": 414 + }, + { + "epoch": 0.010656036307555034, + "grad_norm": 1.15625, + "learning_rate": 0.00019968551154361306, + "loss": 1.8665, + "step": 415 + }, + { + "epoch": 0.010681713503476853, + "grad_norm": 1.1171875, + "learning_rate": 0.00019968515767555367, + "loss": 1.688, + "step": 416 + }, + { + "epoch": 0.010707390699398672, + "grad_norm": 1.1171875, + "learning_rate": 0.00019968480360883076, + "loss": 1.7266, + "step": 417 + }, + { + "epoch": 0.010733067895320492, + "grad_norm": 1.15625, + "learning_rate": 0.00019968444934344504, + "loss": 1.6336, + "step": 418 + }, + { + "epoch": 0.01075874509124231, + "grad_norm": 1.0546875, + "learning_rate": 0.00019968409487939722, + "loss": 1.5974, + "step": 419 + }, + { + "epoch": 0.01078442228716413, + "grad_norm": 1.140625, + "learning_rate": 0.000199683740216688, + "loss": 1.6831, + "step": 420 + }, + { + "epoch": 0.01081009948308595, + "grad_norm": 1.1015625, + "learning_rate": 0.00019968338535531808, + "loss": 1.5546, + "step": 421 + }, + { + "epoch": 0.010835776679007769, + "grad_norm": 1.125, + "learning_rate": 0.00019968303029528818, + "loss": 1.8141, + "step": 422 + }, + { + "epoch": 0.010861453874929589, + "grad_norm": 1.0703125, + "learning_rate": 0.00019968267503659904, + "loss": 1.7929, + "step": 423 + }, + { + "epoch": 0.010887131070851408, + "grad_norm": 1.1015625, + "learning_rate": 0.0001996823195792513, + "loss": 1.8732, + "step": 424 + }, + { + "epoch": 0.010912808266773226, + "grad_norm": 1.1875, + "learning_rate": 0.00019968196392324567, + "loss": 1.7545, + "step": 425 + }, + { + "epoch": 0.010938485462695047, + "grad_norm": 1.1484375, + "learning_rate": 0.00019968160806858295, + "loss": 1.7932, + "step": 426 + }, + { + "epoch": 0.010964162658616865, + "grad_norm": 1.171875, + "learning_rate": 0.00019968125201526377, + "loss": 1.7704, + "step": 427 + }, + { + "epoch": 0.010989839854538686, + "grad_norm": 1.2109375, + "learning_rate": 0.00019968089576328883, + "loss": 1.7187, + "step": 428 + }, + { + "epoch": 0.011015517050460505, + "grad_norm": 1.109375, + "learning_rate": 0.00019968053931265892, + "loss": 1.4602, + "step": 429 + }, + { + "epoch": 0.011041194246382323, + "grad_norm": 1.2109375, + "learning_rate": 0.00019968018266337465, + "loss": 1.7117, + "step": 430 + }, + { + "epoch": 0.011066871442304144, + "grad_norm": 1.1953125, + "learning_rate": 0.00019967982581543684, + "loss": 1.7176, + "step": 431 + }, + { + "epoch": 0.011092548638225962, + "grad_norm": 1.171875, + "learning_rate": 0.00019967946876884608, + "loss": 1.7788, + "step": 432 + }, + { + "epoch": 0.011118225834147781, + "grad_norm": 1.1015625, + "learning_rate": 0.00019967911152360317, + "loss": 1.7759, + "step": 433 + }, + { + "epoch": 0.011143903030069601, + "grad_norm": 1.0546875, + "learning_rate": 0.00019967875407970879, + "loss": 1.5139, + "step": 434 + }, + { + "epoch": 0.01116958022599142, + "grad_norm": 1.203125, + "learning_rate": 0.00019967839643716365, + "loss": 1.7944, + "step": 435 + }, + { + "epoch": 0.01119525742191324, + "grad_norm": 1.1328125, + "learning_rate": 0.0001996780385959685, + "loss": 1.6012, + "step": 436 + }, + { + "epoch": 0.01122093461783506, + "grad_norm": 1.1328125, + "learning_rate": 0.00019967768055612398, + "loss": 1.8449, + "step": 437 + }, + { + "epoch": 0.011246611813756878, + "grad_norm": 1.140625, + "learning_rate": 0.00019967732231763088, + "loss": 1.5959, + "step": 438 + }, + { + "epoch": 0.011272289009678698, + "grad_norm": 1.140625, + "learning_rate": 0.00019967696388048986, + "loss": 1.6453, + "step": 439 + }, + { + "epoch": 0.011297966205600517, + "grad_norm": 1.1640625, + "learning_rate": 0.00019967660524470166, + "loss": 1.634, + "step": 440 + }, + { + "epoch": 0.011323643401522338, + "grad_norm": 1.21875, + "learning_rate": 0.000199676246410267, + "loss": 1.671, + "step": 441 + }, + { + "epoch": 0.011349320597444156, + "grad_norm": 1.1640625, + "learning_rate": 0.00019967588737718658, + "loss": 1.7569, + "step": 442 + }, + { + "epoch": 0.011374997793365975, + "grad_norm": 1.0625, + "learning_rate": 0.0001996755281454611, + "loss": 1.7299, + "step": 443 + }, + { + "epoch": 0.011400674989287795, + "grad_norm": 1.09375, + "learning_rate": 0.00019967516871509134, + "loss": 1.7621, + "step": 444 + }, + { + "epoch": 0.011426352185209614, + "grad_norm": 1.0546875, + "learning_rate": 0.00019967480908607792, + "loss": 1.5903, + "step": 445 + }, + { + "epoch": 0.011452029381131433, + "grad_norm": 1.0390625, + "learning_rate": 0.00019967444925842164, + "loss": 1.7768, + "step": 446 + }, + { + "epoch": 0.011477706577053253, + "grad_norm": 1.2265625, + "learning_rate": 0.00019967408923212318, + "loss": 1.7113, + "step": 447 + }, + { + "epoch": 0.011503383772975072, + "grad_norm": 1.1171875, + "learning_rate": 0.00019967372900718327, + "loss": 1.5659, + "step": 448 + }, + { + "epoch": 0.011529060968896892, + "grad_norm": 1.140625, + "learning_rate": 0.00019967336858360261, + "loss": 1.7802, + "step": 449 + }, + { + "epoch": 0.011554738164818711, + "grad_norm": 1.1328125, + "learning_rate": 0.00019967300796138194, + "loss": 1.7782, + "step": 450 + }, + { + "epoch": 0.01158041536074053, + "grad_norm": 1.1484375, + "learning_rate": 0.00019967264714052196, + "loss": 1.7433, + "step": 451 + }, + { + "epoch": 0.01160609255666235, + "grad_norm": 1.203125, + "learning_rate": 0.0001996722861210234, + "loss": 1.7191, + "step": 452 + }, + { + "epoch": 0.011631769752584169, + "grad_norm": 1.203125, + "learning_rate": 0.00019967192490288698, + "loss": 1.7041, + "step": 453 + }, + { + "epoch": 0.01165744694850599, + "grad_norm": 1.140625, + "learning_rate": 0.00019967156348611343, + "loss": 1.7039, + "step": 454 + }, + { + "epoch": 0.011683124144427808, + "grad_norm": 1.0859375, + "learning_rate": 0.00019967120187070343, + "loss": 1.6203, + "step": 455 + }, + { + "epoch": 0.011708801340349627, + "grad_norm": 1.0546875, + "learning_rate": 0.00019967084005665774, + "loss": 1.7507, + "step": 456 + }, + { + "epoch": 0.011734478536271447, + "grad_norm": 1.1328125, + "learning_rate": 0.0001996704780439771, + "loss": 1.6025, + "step": 457 + }, + { + "epoch": 0.011760155732193266, + "grad_norm": 1.1953125, + "learning_rate": 0.00019967011583266216, + "loss": 1.646, + "step": 458 + }, + { + "epoch": 0.011785832928115084, + "grad_norm": 1.2578125, + "learning_rate": 0.00019966975342271368, + "loss": 1.6973, + "step": 459 + }, + { + "epoch": 0.011811510124036905, + "grad_norm": 1.1640625, + "learning_rate": 0.00019966939081413241, + "loss": 1.6053, + "step": 460 + }, + { + "epoch": 0.011837187319958723, + "grad_norm": 1.125, + "learning_rate": 0.00019966902800691902, + "loss": 1.5869, + "step": 461 + }, + { + "epoch": 0.011862864515880544, + "grad_norm": 1.1484375, + "learning_rate": 0.0001996686650010743, + "loss": 1.7187, + "step": 462 + }, + { + "epoch": 0.011888541711802363, + "grad_norm": 1.0546875, + "learning_rate": 0.00019966830179659887, + "loss": 1.839, + "step": 463 + }, + { + "epoch": 0.011914218907724181, + "grad_norm": 1.1328125, + "learning_rate": 0.00019966793839349358, + "loss": 1.7399, + "step": 464 + }, + { + "epoch": 0.011939896103646002, + "grad_norm": 1.0625, + "learning_rate": 0.00019966757479175905, + "loss": 1.5158, + "step": 465 + }, + { + "epoch": 0.01196557329956782, + "grad_norm": 1.0703125, + "learning_rate": 0.0001996672109913961, + "loss": 1.6339, + "step": 466 + }, + { + "epoch": 0.01199125049548964, + "grad_norm": 1.0859375, + "learning_rate": 0.00019966684699240534, + "loss": 1.7077, + "step": 467 + }, + { + "epoch": 0.01201692769141146, + "grad_norm": 1.1015625, + "learning_rate": 0.00019966648279478758, + "loss": 1.7155, + "step": 468 + }, + { + "epoch": 0.012042604887333278, + "grad_norm": 1.0625, + "learning_rate": 0.00019966611839854356, + "loss": 1.6848, + "step": 469 + }, + { + "epoch": 0.012068282083255099, + "grad_norm": 1.203125, + "learning_rate": 0.0001996657538036739, + "loss": 1.6422, + "step": 470 + }, + { + "epoch": 0.012093959279176917, + "grad_norm": 1.1015625, + "learning_rate": 0.00019966538901017945, + "loss": 1.8272, + "step": 471 + }, + { + "epoch": 0.012119636475098736, + "grad_norm": 1.1015625, + "learning_rate": 0.00019966502401806086, + "loss": 1.5085, + "step": 472 + }, + { + "epoch": 0.012145313671020556, + "grad_norm": 1.0625, + "learning_rate": 0.00019966465882731885, + "loss": 1.7875, + "step": 473 + }, + { + "epoch": 0.012170990866942375, + "grad_norm": 1.234375, + "learning_rate": 0.00019966429343795422, + "loss": 1.8757, + "step": 474 + }, + { + "epoch": 0.012196668062864196, + "grad_norm": 1.1484375, + "learning_rate": 0.00019966392784996766, + "loss": 1.7758, + "step": 475 + }, + { + "epoch": 0.012222345258786014, + "grad_norm": 1.046875, + "learning_rate": 0.00019966356206335987, + "loss": 1.6779, + "step": 476 + }, + { + "epoch": 0.012248022454707833, + "grad_norm": 1.0546875, + "learning_rate": 0.0001996631960781316, + "loss": 1.7749, + "step": 477 + }, + { + "epoch": 0.012273699650629653, + "grad_norm": 1.140625, + "learning_rate": 0.0001996628298942836, + "loss": 1.8974, + "step": 478 + }, + { + "epoch": 0.012299376846551472, + "grad_norm": 1.171875, + "learning_rate": 0.00019966246351181658, + "loss": 1.6213, + "step": 479 + }, + { + "epoch": 0.012325054042473292, + "grad_norm": 1.109375, + "learning_rate": 0.0001996620969307313, + "loss": 1.6703, + "step": 480 + }, + { + "epoch": 0.012350731238395111, + "grad_norm": 1.1484375, + "learning_rate": 0.0001996617301510284, + "loss": 1.7559, + "step": 481 + }, + { + "epoch": 0.01237640843431693, + "grad_norm": 1.0, + "learning_rate": 0.00019966136317270871, + "loss": 1.6991, + "step": 482 + }, + { + "epoch": 0.01240208563023875, + "grad_norm": 1.0625, + "learning_rate": 0.0001996609959957729, + "loss": 1.8081, + "step": 483 + }, + { + "epoch": 0.012427762826160569, + "grad_norm": 1.203125, + "learning_rate": 0.0001996606286202218, + "loss": 1.85, + "step": 484 + }, + { + "epoch": 0.012453440022082388, + "grad_norm": 1.0390625, + "learning_rate": 0.00019966026104605602, + "loss": 1.5634, + "step": 485 + }, + { + "epoch": 0.012479117218004208, + "grad_norm": 1.140625, + "learning_rate": 0.00019965989327327632, + "loss": 1.7484, + "step": 486 + }, + { + "epoch": 0.012504794413926027, + "grad_norm": 1.140625, + "learning_rate": 0.0001996595253018835, + "loss": 1.5248, + "step": 487 + }, + { + "epoch": 0.012530471609847847, + "grad_norm": 1.09375, + "learning_rate": 0.0001996591571318782, + "loss": 1.6394, + "step": 488 + }, + { + "epoch": 0.012556148805769666, + "grad_norm": 1.1328125, + "learning_rate": 0.00019965878876326124, + "loss": 1.6277, + "step": 489 + }, + { + "epoch": 0.012581826001691485, + "grad_norm": 1.0859375, + "learning_rate": 0.00019965842019603331, + "loss": 1.6765, + "step": 490 + }, + { + "epoch": 0.012607503197613305, + "grad_norm": 1.0859375, + "learning_rate": 0.00019965805143019516, + "loss": 1.5707, + "step": 491 + }, + { + "epoch": 0.012633180393535124, + "grad_norm": 1.140625, + "learning_rate": 0.0001996576824657475, + "loss": 1.7858, + "step": 492 + }, + { + "epoch": 0.012658857589456944, + "grad_norm": 1.125, + "learning_rate": 0.00019965731330269106, + "loss": 1.6784, + "step": 493 + }, + { + "epoch": 0.012684534785378763, + "grad_norm": 1.03125, + "learning_rate": 0.00019965694394102662, + "loss": 1.7451, + "step": 494 + }, + { + "epoch": 0.012710211981300581, + "grad_norm": 1.109375, + "learning_rate": 0.0001996565743807549, + "loss": 1.6593, + "step": 495 + }, + { + "epoch": 0.012735889177222402, + "grad_norm": 1.0703125, + "learning_rate": 0.0001996562046218766, + "loss": 1.5761, + "step": 496 + }, + { + "epoch": 0.01276156637314422, + "grad_norm": 1.15625, + "learning_rate": 0.00019965583466439253, + "loss": 1.5775, + "step": 497 + }, + { + "epoch": 0.01278724356906604, + "grad_norm": 1.2265625, + "learning_rate": 0.00019965546450830335, + "loss": 1.782, + "step": 498 + }, + { + "epoch": 0.01281292076498786, + "grad_norm": 1.0546875, + "learning_rate": 0.00019965509415360985, + "loss": 1.6366, + "step": 499 + }, + { + "epoch": 0.012838597960909678, + "grad_norm": 1.125, + "learning_rate": 0.00019965472360031274, + "loss": 1.6308, + "step": 500 + }, + { + "epoch": 0.012864275156831499, + "grad_norm": 1.2265625, + "learning_rate": 0.00019965435284841277, + "loss": 1.7294, + "step": 501 + }, + { + "epoch": 0.012889952352753318, + "grad_norm": 1.1484375, + "learning_rate": 0.00019965398189791066, + "loss": 1.8646, + "step": 502 + }, + { + "epoch": 0.012915629548675136, + "grad_norm": 1.1484375, + "learning_rate": 0.00019965361074880718, + "loss": 1.6965, + "step": 503 + }, + { + "epoch": 0.012941306744596957, + "grad_norm": 1.1015625, + "learning_rate": 0.00019965323940110308, + "loss": 1.588, + "step": 504 + }, + { + "epoch": 0.012966983940518775, + "grad_norm": 1.1484375, + "learning_rate": 0.00019965286785479904, + "loss": 1.6604, + "step": 505 + }, + { + "epoch": 0.012992661136440596, + "grad_norm": 1.234375, + "learning_rate": 0.00019965249610989582, + "loss": 1.738, + "step": 506 + }, + { + "epoch": 0.013018338332362414, + "grad_norm": 1.0546875, + "learning_rate": 0.00019965212416639424, + "loss": 1.5191, + "step": 507 + }, + { + "epoch": 0.013044015528284233, + "grad_norm": 1.2421875, + "learning_rate": 0.0001996517520242949, + "loss": 1.7419, + "step": 508 + }, + { + "epoch": 0.013069692724206054, + "grad_norm": 1.0859375, + "learning_rate": 0.00019965137968359868, + "loss": 1.5536, + "step": 509 + }, + { + "epoch": 0.013095369920127872, + "grad_norm": 1.109375, + "learning_rate": 0.00019965100714430624, + "loss": 1.7777, + "step": 510 + }, + { + "epoch": 0.013121047116049691, + "grad_norm": 1.1328125, + "learning_rate": 0.00019965063440641835, + "loss": 1.6255, + "step": 511 + }, + { + "epoch": 0.013146724311971511, + "grad_norm": 1.1875, + "learning_rate": 0.00019965026146993574, + "loss": 1.8239, + "step": 512 + }, + { + "epoch": 0.01317240150789333, + "grad_norm": 1.15625, + "learning_rate": 0.00019964988833485916, + "loss": 1.6318, + "step": 513 + }, + { + "epoch": 0.01319807870381515, + "grad_norm": 1.1953125, + "learning_rate": 0.00019964951500118936, + "loss": 1.8425, + "step": 514 + }, + { + "epoch": 0.01322375589973697, + "grad_norm": 1.1484375, + "learning_rate": 0.00019964914146892708, + "loss": 1.874, + "step": 515 + }, + { + "epoch": 0.013249433095658788, + "grad_norm": 1.15625, + "learning_rate": 0.00019964876773807306, + "loss": 1.6595, + "step": 516 + }, + { + "epoch": 0.013275110291580608, + "grad_norm": 1.078125, + "learning_rate": 0.00019964839380862807, + "loss": 1.826, + "step": 517 + }, + { + "epoch": 0.013300787487502427, + "grad_norm": 1.03125, + "learning_rate": 0.0001996480196805928, + "loss": 1.6304, + "step": 518 + }, + { + "epoch": 0.013326464683424247, + "grad_norm": 1.0, + "learning_rate": 0.00019964764535396803, + "loss": 1.4722, + "step": 519 + }, + { + "epoch": 0.013352141879346066, + "grad_norm": 1.046875, + "learning_rate": 0.0001996472708287545, + "loss": 1.6412, + "step": 520 + }, + { + "epoch": 0.013377819075267885, + "grad_norm": 1.046875, + "learning_rate": 0.00019964689610495298, + "loss": 1.6124, + "step": 521 + }, + { + "epoch": 0.013403496271189705, + "grad_norm": 1.078125, + "learning_rate": 0.00019964652118256417, + "loss": 1.6357, + "step": 522 + }, + { + "epoch": 0.013429173467111524, + "grad_norm": 1.1328125, + "learning_rate": 0.00019964614606158887, + "loss": 1.6674, + "step": 523 + }, + { + "epoch": 0.013454850663033343, + "grad_norm": 1.125, + "learning_rate": 0.0001996457707420278, + "loss": 1.8987, + "step": 524 + }, + { + "epoch": 0.013480527858955163, + "grad_norm": 1.0859375, + "learning_rate": 0.0001996453952238817, + "loss": 1.7633, + "step": 525 + }, + { + "epoch": 0.013506205054876982, + "grad_norm": 1.1796875, + "learning_rate": 0.00019964501950715132, + "loss": 1.731, + "step": 526 + }, + { + "epoch": 0.013531882250798802, + "grad_norm": 1.046875, + "learning_rate": 0.00019964464359183745, + "loss": 1.5419, + "step": 527 + }, + { + "epoch": 0.01355755944672062, + "grad_norm": 1.1328125, + "learning_rate": 0.00019964426747794077, + "loss": 1.7823, + "step": 528 + }, + { + "epoch": 0.01358323664264244, + "grad_norm": 1.0625, + "learning_rate": 0.00019964389116546206, + "loss": 1.6951, + "step": 529 + }, + { + "epoch": 0.01360891383856426, + "grad_norm": 1.03125, + "learning_rate": 0.00019964351465440213, + "loss": 1.5224, + "step": 530 + }, + { + "epoch": 0.013634591034486079, + "grad_norm": 1.046875, + "learning_rate": 0.00019964313794476165, + "loss": 1.677, + "step": 531 + }, + { + "epoch": 0.013660268230407899, + "grad_norm": 1.1171875, + "learning_rate": 0.00019964276103654136, + "loss": 1.6735, + "step": 532 + }, + { + "epoch": 0.013685945426329718, + "grad_norm": 1.109375, + "learning_rate": 0.0001996423839297421, + "loss": 1.9353, + "step": 533 + }, + { + "epoch": 0.013711622622251536, + "grad_norm": 1.125, + "learning_rate": 0.00019964200662436453, + "loss": 1.4975, + "step": 534 + }, + { + "epoch": 0.013737299818173357, + "grad_norm": 1.046875, + "learning_rate": 0.00019964162912040946, + "loss": 1.5632, + "step": 535 + }, + { + "epoch": 0.013762977014095176, + "grad_norm": 1.09375, + "learning_rate": 0.00019964125141787764, + "loss": 1.719, + "step": 536 + }, + { + "epoch": 0.013788654210016994, + "grad_norm": 1.0625, + "learning_rate": 0.00019964087351676977, + "loss": 1.705, + "step": 537 + }, + { + "epoch": 0.013814331405938815, + "grad_norm": 1.109375, + "learning_rate": 0.00019964049541708665, + "loss": 1.7028, + "step": 538 + }, + { + "epoch": 0.013840008601860633, + "grad_norm": 1.1015625, + "learning_rate": 0.00019964011711882905, + "loss": 1.5913, + "step": 539 + }, + { + "epoch": 0.013865685797782454, + "grad_norm": 1.1171875, + "learning_rate": 0.00019963973862199768, + "loss": 1.6848, + "step": 540 + }, + { + "epoch": 0.013891362993704272, + "grad_norm": 1.09375, + "learning_rate": 0.0001996393599265933, + "loss": 1.6935, + "step": 541 + }, + { + "epoch": 0.013917040189626091, + "grad_norm": 1.09375, + "learning_rate": 0.0001996389810326167, + "loss": 1.5764, + "step": 542 + }, + { + "epoch": 0.013942717385547912, + "grad_norm": 1.0703125, + "learning_rate": 0.0001996386019400686, + "loss": 1.7204, + "step": 543 + }, + { + "epoch": 0.01396839458146973, + "grad_norm": 1.0390625, + "learning_rate": 0.00019963822264894976, + "loss": 1.6484, + "step": 544 + }, + { + "epoch": 0.01399407177739155, + "grad_norm": 1.203125, + "learning_rate": 0.00019963784315926093, + "loss": 1.575, + "step": 545 + }, + { + "epoch": 0.01401974897331337, + "grad_norm": 1.0703125, + "learning_rate": 0.00019963746347100288, + "loss": 1.5641, + "step": 546 + }, + { + "epoch": 0.014045426169235188, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996370835841764, + "loss": 1.5865, + "step": 547 + }, + { + "epoch": 0.014071103365157009, + "grad_norm": 1.109375, + "learning_rate": 0.00019963670349878218, + "loss": 1.7775, + "step": 548 + }, + { + "epoch": 0.014096780561078827, + "grad_norm": 1.046875, + "learning_rate": 0.000199636323214821, + "loss": 1.706, + "step": 549 + }, + { + "epoch": 0.014122457757000646, + "grad_norm": 1.03125, + "learning_rate": 0.00019963594273229366, + "loss": 1.7055, + "step": 550 + }, + { + "epoch": 0.014148134952922466, + "grad_norm": 1.0859375, + "learning_rate": 0.0001996355620512009, + "loss": 1.4927, + "step": 551 + }, + { + "epoch": 0.014173812148844285, + "grad_norm": 1.0703125, + "learning_rate": 0.00019963518117154343, + "loss": 1.5922, + "step": 552 + }, + { + "epoch": 0.014199489344766105, + "grad_norm": 1.15625, + "learning_rate": 0.00019963480009332205, + "loss": 1.5685, + "step": 553 + }, + { + "epoch": 0.014225166540687924, + "grad_norm": 1.0625, + "learning_rate": 0.00019963441881653754, + "loss": 1.6281, + "step": 554 + }, + { + "epoch": 0.014250843736609743, + "grad_norm": 1.171875, + "learning_rate": 0.0001996340373411906, + "loss": 1.6265, + "step": 555 + }, + { + "epoch": 0.014276520932531563, + "grad_norm": 1.0625, + "learning_rate": 0.00019963365566728203, + "loss": 1.6162, + "step": 556 + }, + { + "epoch": 0.014302198128453382, + "grad_norm": 1.1015625, + "learning_rate": 0.0001996332737948126, + "loss": 1.4966, + "step": 557 + }, + { + "epoch": 0.014327875324375202, + "grad_norm": 1.0546875, + "learning_rate": 0.00019963289172378305, + "loss": 1.4199, + "step": 558 + }, + { + "epoch": 0.014353552520297021, + "grad_norm": 1.1796875, + "learning_rate": 0.00019963250945419415, + "loss": 1.7109, + "step": 559 + }, + { + "epoch": 0.01437922971621884, + "grad_norm": 1.2109375, + "learning_rate": 0.00019963212698604664, + "loss": 1.6364, + "step": 560 + }, + { + "epoch": 0.01440490691214066, + "grad_norm": 1.1328125, + "learning_rate": 0.0001996317443193413, + "loss": 1.6938, + "step": 561 + }, + { + "epoch": 0.014430584108062479, + "grad_norm": 1.109375, + "learning_rate": 0.00019963136145407893, + "loss": 1.7324, + "step": 562 + }, + { + "epoch": 0.014456261303984298, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996309783902602, + "loss": 1.534, + "step": 563 + }, + { + "epoch": 0.014481938499906118, + "grad_norm": 1.0, + "learning_rate": 0.00019963059512788598, + "loss": 1.5947, + "step": 564 + }, + { + "epoch": 0.014507615695827937, + "grad_norm": 1.09375, + "learning_rate": 0.00019963021166695698, + "loss": 1.7233, + "step": 565 + }, + { + "epoch": 0.014533292891749757, + "grad_norm": 1.046875, + "learning_rate": 0.00019962982800747394, + "loss": 1.6019, + "step": 566 + }, + { + "epoch": 0.014558970087671576, + "grad_norm": 1.0625, + "learning_rate": 0.00019962944414943766, + "loss": 1.759, + "step": 567 + }, + { + "epoch": 0.014584647283593394, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996290600928489, + "loss": 1.6707, + "step": 568 + }, + { + "epoch": 0.014610324479515215, + "grad_norm": 1.046875, + "learning_rate": 0.00019962867583770842, + "loss": 1.6581, + "step": 569 + }, + { + "epoch": 0.014636001675437034, + "grad_norm": 1.03125, + "learning_rate": 0.00019962829138401697, + "loss": 1.6053, + "step": 570 + }, + { + "epoch": 0.014661678871358854, + "grad_norm": 1.078125, + "learning_rate": 0.00019962790673177535, + "loss": 1.6717, + "step": 571 + }, + { + "epoch": 0.014687356067280673, + "grad_norm": 1.1171875, + "learning_rate": 0.00019962752188098436, + "loss": 1.6913, + "step": 572 + }, + { + "epoch": 0.014713033263202491, + "grad_norm": 1.1015625, + "learning_rate": 0.00019962713683164464, + "loss": 1.534, + "step": 573 + }, + { + "epoch": 0.014738710459124312, + "grad_norm": 1.1171875, + "learning_rate": 0.0001996267515837571, + "loss": 1.5554, + "step": 574 + }, + { + "epoch": 0.01476438765504613, + "grad_norm": 1.0703125, + "learning_rate": 0.00019962636613732242, + "loss": 1.6612, + "step": 575 + }, + { + "epoch": 0.01479006485096795, + "grad_norm": 1.265625, + "learning_rate": 0.00019962598049234136, + "loss": 1.5195, + "step": 576 + }, + { + "epoch": 0.01481574204688977, + "grad_norm": 1.0625, + "learning_rate": 0.00019962559464881477, + "loss": 1.7079, + "step": 577 + }, + { + "epoch": 0.014841419242811588, + "grad_norm": 1.1640625, + "learning_rate": 0.00019962520860674333, + "loss": 1.7328, + "step": 578 + }, + { + "epoch": 0.014867096438733409, + "grad_norm": 1.0859375, + "learning_rate": 0.00019962482236612788, + "loss": 1.7201, + "step": 579 + }, + { + "epoch": 0.014892773634655227, + "grad_norm": 1.015625, + "learning_rate": 0.00019962443592696914, + "loss": 1.6359, + "step": 580 + }, + { + "epoch": 0.014918450830577046, + "grad_norm": 1.2421875, + "learning_rate": 0.0001996240492892679, + "loss": 1.496, + "step": 581 + }, + { + "epoch": 0.014944128026498867, + "grad_norm": 1.078125, + "learning_rate": 0.0001996236624530249, + "loss": 1.7146, + "step": 582 + }, + { + "epoch": 0.014969805222420685, + "grad_norm": 1.1171875, + "learning_rate": 0.00019962327541824098, + "loss": 1.7726, + "step": 583 + }, + { + "epoch": 0.014995482418342506, + "grad_norm": 1.1328125, + "learning_rate": 0.00019962288818491688, + "loss": 1.5442, + "step": 584 + }, + { + "epoch": 0.015021159614264324, + "grad_norm": 1.125, + "learning_rate": 0.00019962250075305333, + "loss": 1.4369, + "step": 585 + }, + { + "epoch": 0.015046836810186143, + "grad_norm": 1.0859375, + "learning_rate": 0.00019962211312265113, + "loss": 1.7224, + "step": 586 + }, + { + "epoch": 0.015072514006107963, + "grad_norm": 1.125, + "learning_rate": 0.0001996217252937111, + "loss": 1.5585, + "step": 587 + }, + { + "epoch": 0.015098191202029782, + "grad_norm": 1.0546875, + "learning_rate": 0.00019962133726623393, + "loss": 1.8022, + "step": 588 + }, + { + "epoch": 0.015123868397951601, + "grad_norm": 1.4296875, + "learning_rate": 0.00019962094904022045, + "loss": 1.4852, + "step": 589 + }, + { + "epoch": 0.015149545593873421, + "grad_norm": 1.1328125, + "learning_rate": 0.00019962056061567142, + "loss": 1.597, + "step": 590 + }, + { + "epoch": 0.01517522278979524, + "grad_norm": 1.078125, + "learning_rate": 0.0001996201719925876, + "loss": 1.6516, + "step": 591 + }, + { + "epoch": 0.01520089998571706, + "grad_norm": 1.078125, + "learning_rate": 0.00019961978317096978, + "loss": 1.6635, + "step": 592 + }, + { + "epoch": 0.015226577181638879, + "grad_norm": 1.1640625, + "learning_rate": 0.00019961939415081873, + "loss": 1.6924, + "step": 593 + }, + { + "epoch": 0.015252254377560698, + "grad_norm": 1.0859375, + "learning_rate": 0.00019961900493213524, + "loss": 1.5009, + "step": 594 + }, + { + "epoch": 0.015277931573482518, + "grad_norm": 1.0703125, + "learning_rate": 0.00019961861551492008, + "loss": 1.6952, + "step": 595 + }, + { + "epoch": 0.015303608769404337, + "grad_norm": 1.09375, + "learning_rate": 0.00019961822589917397, + "loss": 1.4915, + "step": 596 + }, + { + "epoch": 0.015329285965326157, + "grad_norm": 0.9609375, + "learning_rate": 0.0001996178360848978, + "loss": 1.505, + "step": 597 + }, + { + "epoch": 0.015354963161247976, + "grad_norm": 1.0859375, + "learning_rate": 0.00019961744607209224, + "loss": 1.7452, + "step": 598 + }, + { + "epoch": 0.015380640357169795, + "grad_norm": 1.1875, + "learning_rate": 0.0001996170558607581, + "loss": 1.5945, + "step": 599 + }, + { + "epoch": 0.015406317553091615, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996166654508962, + "loss": 1.625, + "step": 600 + }, + { + "epoch": 0.015431994749013434, + "grad_norm": 1.2109375, + "learning_rate": 0.00019961627484250724, + "loss": 1.6757, + "step": 601 + }, + { + "epoch": 0.015457671944935253, + "grad_norm": 1.109375, + "learning_rate": 0.0001996158840355921, + "loss": 1.6766, + "step": 602 + }, + { + "epoch": 0.015483349140857073, + "grad_norm": 1.0625, + "learning_rate": 0.00019961549303015145, + "loss": 1.7553, + "step": 603 + }, + { + "epoch": 0.015509026336778892, + "grad_norm": 1.0234375, + "learning_rate": 0.00019961510182618612, + "loss": 1.5654, + "step": 604 + }, + { + "epoch": 0.015534703532700712, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996147104236969, + "loss": 1.7169, + "step": 605 + }, + { + "epoch": 0.01556038072862253, + "grad_norm": 1.15625, + "learning_rate": 0.00019961431882268457, + "loss": 1.6962, + "step": 606 + }, + { + "epoch": 0.01558605792454435, + "grad_norm": 1.09375, + "learning_rate": 0.00019961392702314988, + "loss": 1.6041, + "step": 607 + }, + { + "epoch": 0.01561173512046617, + "grad_norm": 1.125, + "learning_rate": 0.0001996135350250937, + "loss": 1.7155, + "step": 608 + }, + { + "epoch": 0.01563741231638799, + "grad_norm": 0.984375, + "learning_rate": 0.00019961314282851666, + "loss": 1.6277, + "step": 609 + }, + { + "epoch": 0.015663089512309807, + "grad_norm": 1.0703125, + "learning_rate": 0.00019961275043341967, + "loss": 1.7286, + "step": 610 + }, + { + "epoch": 0.015688766708231626, + "grad_norm": 1.15625, + "learning_rate": 0.00019961235783980344, + "loss": 1.7794, + "step": 611 + }, + { + "epoch": 0.015714443904153448, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996119650476688, + "loss": 1.6337, + "step": 612 + }, + { + "epoch": 0.015740121100075267, + "grad_norm": 1.1015625, + "learning_rate": 0.0001996115720570165, + "loss": 1.5625, + "step": 613 + }, + { + "epoch": 0.015765798295997085, + "grad_norm": 1.1171875, + "learning_rate": 0.00019961117886784736, + "loss": 1.6635, + "step": 614 + }, + { + "epoch": 0.015791475491918904, + "grad_norm": 1.1328125, + "learning_rate": 0.0001996107854801621, + "loss": 1.6771, + "step": 615 + }, + { + "epoch": 0.015817152687840723, + "grad_norm": 1.046875, + "learning_rate": 0.00019961039189396159, + "loss": 1.5333, + "step": 616 + }, + { + "epoch": 0.015842829883762545, + "grad_norm": 1.0546875, + "learning_rate": 0.00019960999810924652, + "loss": 1.4451, + "step": 617 + }, + { + "epoch": 0.015868507079684364, + "grad_norm": 1.0859375, + "learning_rate": 0.00019960960412601772, + "loss": 1.5741, + "step": 618 + }, + { + "epoch": 0.015894184275606182, + "grad_norm": 1.1171875, + "learning_rate": 0.000199609209944276, + "loss": 1.632, + "step": 619 + }, + { + "epoch": 0.015919861471528, + "grad_norm": 1.109375, + "learning_rate": 0.00019960881556402212, + "loss": 1.6803, + "step": 620 + }, + { + "epoch": 0.01594553866744982, + "grad_norm": 1.1015625, + "learning_rate": 0.00019960842098525687, + "loss": 1.5647, + "step": 621 + }, + { + "epoch": 0.015971215863371642, + "grad_norm": 1.0859375, + "learning_rate": 0.00019960802620798103, + "loss": 1.7056, + "step": 622 + }, + { + "epoch": 0.01599689305929346, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996076312321954, + "loss": 1.4768, + "step": 623 + }, + { + "epoch": 0.01602257025521528, + "grad_norm": 1.109375, + "learning_rate": 0.00019960723605790074, + "loss": 1.6978, + "step": 624 + }, + { + "epoch": 0.016048247451137098, + "grad_norm": 1.1171875, + "learning_rate": 0.00019960684068509786, + "loss": 1.5838, + "step": 625 + }, + { + "epoch": 0.016073924647058917, + "grad_norm": 1.1015625, + "learning_rate": 0.00019960644511378756, + "loss": 1.6863, + "step": 626 + }, + { + "epoch": 0.01609960184298074, + "grad_norm": 1.0546875, + "learning_rate": 0.0001996060493439706, + "loss": 1.6294, + "step": 627 + }, + { + "epoch": 0.016125279038902558, + "grad_norm": 1.046875, + "learning_rate": 0.00019960565337564777, + "loss": 1.6868, + "step": 628 + }, + { + "epoch": 0.016150956234824376, + "grad_norm": 1.0859375, + "learning_rate": 0.0001996052572088199, + "loss": 1.5448, + "step": 629 + }, + { + "epoch": 0.016176633430746195, + "grad_norm": 1.0703125, + "learning_rate": 0.00019960486084348773, + "loss": 1.6703, + "step": 630 + }, + { + "epoch": 0.016202310626668014, + "grad_norm": 1.0859375, + "learning_rate": 0.00019960446427965206, + "loss": 1.6312, + "step": 631 + }, + { + "epoch": 0.016227987822589836, + "grad_norm": 1.15625, + "learning_rate": 0.00019960406751731372, + "loss": 1.5482, + "step": 632 + }, + { + "epoch": 0.016253665018511654, + "grad_norm": 1.09375, + "learning_rate": 0.00019960367055647343, + "loss": 1.5546, + "step": 633 + }, + { + "epoch": 0.016279342214433473, + "grad_norm": 1.1875, + "learning_rate": 0.00019960327339713205, + "loss": 1.6941, + "step": 634 + }, + { + "epoch": 0.016305019410355292, + "grad_norm": 1.0, + "learning_rate": 0.00019960287603929032, + "loss": 1.3535, + "step": 635 + }, + { + "epoch": 0.01633069660627711, + "grad_norm": 1.0546875, + "learning_rate": 0.0001996024784829491, + "loss": 1.474, + "step": 636 + }, + { + "epoch": 0.01635637380219893, + "grad_norm": 1.0390625, + "learning_rate": 0.0001996020807281091, + "loss": 1.3116, + "step": 637 + }, + { + "epoch": 0.01638205099812075, + "grad_norm": 1.0859375, + "learning_rate": 0.00019960168277477117, + "loss": 1.3909, + "step": 638 + }, + { + "epoch": 0.01640772819404257, + "grad_norm": 1.0859375, + "learning_rate": 0.00019960128462293608, + "loss": 1.5804, + "step": 639 + }, + { + "epoch": 0.01643340538996439, + "grad_norm": 1.1171875, + "learning_rate": 0.0001996008862726046, + "loss": 1.6854, + "step": 640 + }, + { + "epoch": 0.016459082585886207, + "grad_norm": 1.09375, + "learning_rate": 0.0001996004877237776, + "loss": 1.7849, + "step": 641 + }, + { + "epoch": 0.016484759781808026, + "grad_norm": 1.0234375, + "learning_rate": 0.00019960008897645576, + "loss": 1.5514, + "step": 642 + }, + { + "epoch": 0.01651043697772985, + "grad_norm": 1.1640625, + "learning_rate": 0.00019959969003064, + "loss": 1.4694, + "step": 643 + }, + { + "epoch": 0.016536114173651667, + "grad_norm": 1.078125, + "learning_rate": 0.00019959929088633104, + "loss": 1.6664, + "step": 644 + }, + { + "epoch": 0.016561791369573486, + "grad_norm": 1.03125, + "learning_rate": 0.00019959889154352967, + "loss": 1.5039, + "step": 645 + }, + { + "epoch": 0.016587468565495304, + "grad_norm": 1.1015625, + "learning_rate": 0.0001995984920022367, + "loss": 1.6584, + "step": 646 + }, + { + "epoch": 0.016613145761417123, + "grad_norm": 1.1171875, + "learning_rate": 0.00019959809226245296, + "loss": 1.3829, + "step": 647 + }, + { + "epoch": 0.016638822957338945, + "grad_norm": 1.015625, + "learning_rate": 0.00019959769232417922, + "loss": 1.6191, + "step": 648 + }, + { + "epoch": 0.016664500153260764, + "grad_norm": 0.96875, + "learning_rate": 0.00019959729218741625, + "loss": 1.5378, + "step": 649 + }, + { + "epoch": 0.016690177349182583, + "grad_norm": 1.0625, + "learning_rate": 0.0001995968918521649, + "loss": 1.664, + "step": 650 + }, + { + "epoch": 0.0167158545451044, + "grad_norm": 1.1171875, + "learning_rate": 0.0001995964913184259, + "loss": 1.4757, + "step": 651 + }, + { + "epoch": 0.01674153174102622, + "grad_norm": 1.0625, + "learning_rate": 0.00019959609058620013, + "loss": 1.5318, + "step": 652 + }, + { + "epoch": 0.016767208936948042, + "grad_norm": 0.9921875, + "learning_rate": 0.00019959568965548835, + "loss": 1.7507, + "step": 653 + }, + { + "epoch": 0.01679288613286986, + "grad_norm": 1.2265625, + "learning_rate": 0.00019959528852629132, + "loss": 1.772, + "step": 654 + }, + { + "epoch": 0.01681856332879168, + "grad_norm": 1.03125, + "learning_rate": 0.0001995948871986099, + "loss": 1.5272, + "step": 655 + }, + { + "epoch": 0.016844240524713498, + "grad_norm": 1.09375, + "learning_rate": 0.00019959448567244485, + "loss": 1.705, + "step": 656 + }, + { + "epoch": 0.016869917720635317, + "grad_norm": 1.0234375, + "learning_rate": 0.000199594083947797, + "loss": 1.7199, + "step": 657 + }, + { + "epoch": 0.01689559491655714, + "grad_norm": 1.0546875, + "learning_rate": 0.00019959368202466715, + "loss": 1.5247, + "step": 658 + }, + { + "epoch": 0.016921272112478958, + "grad_norm": 1.1640625, + "learning_rate": 0.00019959327990305608, + "loss": 1.7985, + "step": 659 + }, + { + "epoch": 0.016946949308400776, + "grad_norm": 1.0234375, + "learning_rate": 0.0001995928775829646, + "loss": 1.5109, + "step": 660 + }, + { + "epoch": 0.016972626504322595, + "grad_norm": 1.0, + "learning_rate": 0.00019959247506439349, + "loss": 1.4539, + "step": 661 + }, + { + "epoch": 0.016998303700244414, + "grad_norm": 1.171875, + "learning_rate": 0.0001995920723473436, + "loss": 1.5993, + "step": 662 + }, + { + "epoch": 0.017023980896166233, + "grad_norm": 1.0859375, + "learning_rate": 0.0001995916694318157, + "loss": 1.7412, + "step": 663 + }, + { + "epoch": 0.017049658092088055, + "grad_norm": 1.078125, + "learning_rate": 0.00019959126631781057, + "loss": 1.4544, + "step": 664 + }, + { + "epoch": 0.017075335288009873, + "grad_norm": 1.0703125, + "learning_rate": 0.00019959086300532907, + "loss": 1.5902, + "step": 665 + }, + { + "epoch": 0.017101012483931692, + "grad_norm": 1.0390625, + "learning_rate": 0.00019959045949437197, + "loss": 1.5491, + "step": 666 + }, + { + "epoch": 0.01712668967985351, + "grad_norm": 1.0390625, + "learning_rate": 0.00019959005578494007, + "loss": 1.7015, + "step": 667 + }, + { + "epoch": 0.01715236687577533, + "grad_norm": 1.078125, + "learning_rate": 0.00019958965187703422, + "loss": 1.6744, + "step": 668 + }, + { + "epoch": 0.01717804407169715, + "grad_norm": 1.0234375, + "learning_rate": 0.00019958924777065516, + "loss": 1.4934, + "step": 669 + }, + { + "epoch": 0.01720372126761897, + "grad_norm": 0.99609375, + "learning_rate": 0.00019958884346580374, + "loss": 1.6639, + "step": 670 + }, + { + "epoch": 0.01722939846354079, + "grad_norm": 1.2265625, + "learning_rate": 0.00019958843896248073, + "loss": 1.6003, + "step": 671 + }, + { + "epoch": 0.017255075659462608, + "grad_norm": 1.109375, + "learning_rate": 0.00019958803426068698, + "loss": 1.5673, + "step": 672 + }, + { + "epoch": 0.017280752855384426, + "grad_norm": 1.1015625, + "learning_rate": 0.00019958762936042324, + "loss": 1.7155, + "step": 673 + }, + { + "epoch": 0.01730643005130625, + "grad_norm": 1.1484375, + "learning_rate": 0.0001995872242616904, + "loss": 1.8157, + "step": 674 + }, + { + "epoch": 0.017332107247228067, + "grad_norm": 1.03125, + "learning_rate": 0.00019958681896448918, + "loss": 1.6662, + "step": 675 + }, + { + "epoch": 0.017357784443149886, + "grad_norm": 1.1484375, + "learning_rate": 0.00019958641346882043, + "loss": 1.6613, + "step": 676 + }, + { + "epoch": 0.017383461639071705, + "grad_norm": 1.1171875, + "learning_rate": 0.00019958600777468497, + "loss": 1.4046, + "step": 677 + }, + { + "epoch": 0.017409138834993523, + "grad_norm": 1.0390625, + "learning_rate": 0.0001995856018820836, + "loss": 1.6197, + "step": 678 + }, + { + "epoch": 0.017434816030915345, + "grad_norm": 1.0703125, + "learning_rate": 0.0001995851957910171, + "loss": 1.6473, + "step": 679 + }, + { + "epoch": 0.017460493226837164, + "grad_norm": 1.2421875, + "learning_rate": 0.0001995847895014863, + "loss": 1.6546, + "step": 680 + }, + { + "epoch": 0.017486170422758983, + "grad_norm": 1.0625, + "learning_rate": 0.000199584383013492, + "loss": 1.4897, + "step": 681 + }, + { + "epoch": 0.0175118476186808, + "grad_norm": 1.0625, + "learning_rate": 0.00019958397632703504, + "loss": 1.4463, + "step": 682 + }, + { + "epoch": 0.01753752481460262, + "grad_norm": 1.0859375, + "learning_rate": 0.0001995835694421162, + "loss": 1.6788, + "step": 683 + }, + { + "epoch": 0.017563202010524442, + "grad_norm": 1.1875, + "learning_rate": 0.00019958316235873632, + "loss": 1.6394, + "step": 684 + }, + { + "epoch": 0.01758887920644626, + "grad_norm": 1.0234375, + "learning_rate": 0.00019958275507689618, + "loss": 1.5224, + "step": 685 + }, + { + "epoch": 0.01761455640236808, + "grad_norm": 1.0, + "learning_rate": 0.0001995823475965966, + "loss": 1.5324, + "step": 686 + }, + { + "epoch": 0.0176402335982899, + "grad_norm": 1.1171875, + "learning_rate": 0.00019958193991783838, + "loss": 1.6197, + "step": 687 + }, + { + "epoch": 0.017665910794211717, + "grad_norm": 1.109375, + "learning_rate": 0.00019958153204062239, + "loss": 1.5688, + "step": 688 + }, + { + "epoch": 0.017691587990133536, + "grad_norm": 1.0390625, + "learning_rate": 0.00019958112396494936, + "loss": 1.4585, + "step": 689 + }, + { + "epoch": 0.017717265186055358, + "grad_norm": 1.109375, + "learning_rate": 0.00019958071569082018, + "loss": 1.5427, + "step": 690 + }, + { + "epoch": 0.017742942381977177, + "grad_norm": 1.0, + "learning_rate": 0.00019958030721823562, + "loss": 1.5665, + "step": 691 + }, + { + "epoch": 0.017768619577898995, + "grad_norm": 0.91796875, + "learning_rate": 0.00019957989854719647, + "loss": 1.4858, + "step": 692 + }, + { + "epoch": 0.017794296773820814, + "grad_norm": 1.0703125, + "learning_rate": 0.0001995794896777036, + "loss": 1.4827, + "step": 693 + }, + { + "epoch": 0.017819973969742633, + "grad_norm": 1.078125, + "learning_rate": 0.0001995790806097578, + "loss": 1.626, + "step": 694 + }, + { + "epoch": 0.017845651165664455, + "grad_norm": 1.0703125, + "learning_rate": 0.00019957867134335992, + "loss": 1.7552, + "step": 695 + }, + { + "epoch": 0.017871328361586274, + "grad_norm": 1.1796875, + "learning_rate": 0.0001995782618785107, + "loss": 1.5762, + "step": 696 + }, + { + "epoch": 0.017897005557508092, + "grad_norm": 1.140625, + "learning_rate": 0.00019957785221521102, + "loss": 1.7809, + "step": 697 + }, + { + "epoch": 0.01792268275342991, + "grad_norm": 0.94140625, + "learning_rate": 0.00019957744235346168, + "loss": 1.4838, + "step": 698 + }, + { + "epoch": 0.01794835994935173, + "grad_norm": 0.98046875, + "learning_rate": 0.00019957703229326349, + "loss": 1.429, + "step": 699 + }, + { + "epoch": 0.017974037145273552, + "grad_norm": 1.0390625, + "learning_rate": 0.00019957662203461726, + "loss": 1.6377, + "step": 700 + }, + { + "epoch": 0.01799971434119537, + "grad_norm": 1.0859375, + "learning_rate": 0.00019957621157752379, + "loss": 1.5479, + "step": 701 + }, + { + "epoch": 0.01802539153711719, + "grad_norm": 1.0703125, + "learning_rate": 0.00019957580092198397, + "loss": 1.5915, + "step": 702 + }, + { + "epoch": 0.018051068733039008, + "grad_norm": 1.0546875, + "learning_rate": 0.00019957539006799856, + "loss": 1.5117, + "step": 703 + }, + { + "epoch": 0.018076745928960827, + "grad_norm": 1.1328125, + "learning_rate": 0.0001995749790155684, + "loss": 1.3898, + "step": 704 + }, + { + "epoch": 0.01810242312488265, + "grad_norm": 1.09375, + "learning_rate": 0.00019957456776469428, + "loss": 1.6842, + "step": 705 + }, + { + "epoch": 0.018128100320804467, + "grad_norm": 1.0859375, + "learning_rate": 0.00019957415631537706, + "loss": 1.6016, + "step": 706 + }, + { + "epoch": 0.018153777516726286, + "grad_norm": 1.125, + "learning_rate": 0.00019957374466761754, + "loss": 1.4697, + "step": 707 + }, + { + "epoch": 0.018179454712648105, + "grad_norm": 1.0546875, + "learning_rate": 0.0001995733328214165, + "loss": 1.7838, + "step": 708 + }, + { + "epoch": 0.018205131908569924, + "grad_norm": 1.015625, + "learning_rate": 0.00019957292077677486, + "loss": 1.6722, + "step": 709 + }, + { + "epoch": 0.018230809104491742, + "grad_norm": 1.0859375, + "learning_rate": 0.00019957250853369334, + "loss": 1.6596, + "step": 710 + }, + { + "epoch": 0.018256486300413564, + "grad_norm": 1.2109375, + "learning_rate": 0.00019957209609217284, + "loss": 1.5763, + "step": 711 + }, + { + "epoch": 0.018282163496335383, + "grad_norm": 0.98046875, + "learning_rate": 0.0001995716834522141, + "loss": 1.7074, + "step": 712 + }, + { + "epoch": 0.018307840692257202, + "grad_norm": 1.0546875, + "learning_rate": 0.00019957127061381803, + "loss": 1.5918, + "step": 713 + }, + { + "epoch": 0.01833351788817902, + "grad_norm": 0.9921875, + "learning_rate": 0.0001995708575769854, + "loss": 1.4897, + "step": 714 + }, + { + "epoch": 0.01835919508410084, + "grad_norm": 1.09375, + "learning_rate": 0.00019957044434171705, + "loss": 1.6304, + "step": 715 + }, + { + "epoch": 0.01838487228002266, + "grad_norm": 1.0, + "learning_rate": 0.0001995700309080138, + "loss": 1.5881, + "step": 716 + }, + { + "epoch": 0.01841054947594448, + "grad_norm": 1.0859375, + "learning_rate": 0.0001995696172758765, + "loss": 1.7474, + "step": 717 + }, + { + "epoch": 0.0184362266718663, + "grad_norm": 0.99609375, + "learning_rate": 0.0001995692034453059, + "loss": 1.4496, + "step": 718 + }, + { + "epoch": 0.018461903867788117, + "grad_norm": 0.96484375, + "learning_rate": 0.00019956878941630287, + "loss": 1.4815, + "step": 719 + }, + { + "epoch": 0.018487581063709936, + "grad_norm": 1.2109375, + "learning_rate": 0.00019956837518886827, + "loss": 1.5042, + "step": 720 + }, + { + "epoch": 0.018513258259631758, + "grad_norm": 1.078125, + "learning_rate": 0.00019956796076300287, + "loss": 1.708, + "step": 721 + }, + { + "epoch": 0.018538935455553577, + "grad_norm": 1.0546875, + "learning_rate": 0.00019956754613870752, + "loss": 1.5954, + "step": 722 + }, + { + "epoch": 0.018564612651475396, + "grad_norm": 1.0625, + "learning_rate": 0.00019956713131598304, + "loss": 1.5881, + "step": 723 + }, + { + "epoch": 0.018590289847397214, + "grad_norm": 1.03125, + "learning_rate": 0.0001995667162948303, + "loss": 1.6546, + "step": 724 + }, + { + "epoch": 0.018615967043319033, + "grad_norm": 1.078125, + "learning_rate": 0.00019956630107525002, + "loss": 1.7816, + "step": 725 + }, + { + "epoch": 0.018641644239240855, + "grad_norm": 1.0390625, + "learning_rate": 0.00019956588565724315, + "loss": 1.5616, + "step": 726 + }, + { + "epoch": 0.018667321435162674, + "grad_norm": 1.015625, + "learning_rate": 0.00019956547004081046, + "loss": 1.5658, + "step": 727 + }, + { + "epoch": 0.018692998631084493, + "grad_norm": 1.0546875, + "learning_rate": 0.00019956505422595274, + "loss": 1.5498, + "step": 728 + }, + { + "epoch": 0.01871867582700631, + "grad_norm": 1.109375, + "learning_rate": 0.00019956463821267092, + "loss": 1.4889, + "step": 729 + }, + { + "epoch": 0.01874435302292813, + "grad_norm": 1.046875, + "learning_rate": 0.0001995642220009657, + "loss": 1.4059, + "step": 730 + }, + { + "epoch": 0.018770030218849952, + "grad_norm": 1.1171875, + "learning_rate": 0.00019956380559083803, + "loss": 1.665, + "step": 731 + }, + { + "epoch": 0.01879570741477177, + "grad_norm": 1.078125, + "learning_rate": 0.00019956338898228867, + "loss": 1.6749, + "step": 732 + }, + { + "epoch": 0.01882138461069359, + "grad_norm": 1.046875, + "learning_rate": 0.0001995629721753185, + "loss": 1.4719, + "step": 733 + }, + { + "epoch": 0.018847061806615408, + "grad_norm": 1.03125, + "learning_rate": 0.00019956255516992828, + "loss": 1.7207, + "step": 734 + }, + { + "epoch": 0.018872739002537227, + "grad_norm": 1.1171875, + "learning_rate": 0.00019956213796611891, + "loss": 1.5775, + "step": 735 + }, + { + "epoch": 0.018898416198459046, + "grad_norm": 1.125, + "learning_rate": 0.00019956172056389117, + "loss": 1.6392, + "step": 736 + }, + { + "epoch": 0.018924093394380868, + "grad_norm": 1.0078125, + "learning_rate": 0.00019956130296324594, + "loss": 1.6005, + "step": 737 + }, + { + "epoch": 0.018949770590302686, + "grad_norm": 1.1015625, + "learning_rate": 0.000199560885164184, + "loss": 1.5195, + "step": 738 + }, + { + "epoch": 0.018975447786224505, + "grad_norm": 1.1015625, + "learning_rate": 0.00019956046716670622, + "loss": 1.4799, + "step": 739 + }, + { + "epoch": 0.019001124982146324, + "grad_norm": 1.09375, + "learning_rate": 0.00019956004897081343, + "loss": 1.6152, + "step": 740 + }, + { + "epoch": 0.019026802178068142, + "grad_norm": 1.015625, + "learning_rate": 0.00019955963057650643, + "loss": 1.4489, + "step": 741 + }, + { + "epoch": 0.019052479373989965, + "grad_norm": 1.0625, + "learning_rate": 0.0001995592119837861, + "loss": 1.7323, + "step": 742 + }, + { + "epoch": 0.019078156569911783, + "grad_norm": 1.078125, + "learning_rate": 0.00019955879319265326, + "loss": 1.5552, + "step": 743 + }, + { + "epoch": 0.019103833765833602, + "grad_norm": 1.09375, + "learning_rate": 0.00019955837420310873, + "loss": 1.5793, + "step": 744 + }, + { + "epoch": 0.01912951096175542, + "grad_norm": 1.0703125, + "learning_rate": 0.00019955795501515336, + "loss": 1.5655, + "step": 745 + }, + { + "epoch": 0.01915518815767724, + "grad_norm": 1.046875, + "learning_rate": 0.00019955753562878796, + "loss": 1.6547, + "step": 746 + }, + { + "epoch": 0.01918086535359906, + "grad_norm": 1.0078125, + "learning_rate": 0.0001995571160440134, + "loss": 1.4486, + "step": 747 + }, + { + "epoch": 0.01920654254952088, + "grad_norm": 1.03125, + "learning_rate": 0.0001995566962608305, + "loss": 1.764, + "step": 748 + }, + { + "epoch": 0.0192322197454427, + "grad_norm": 1.0546875, + "learning_rate": 0.00019955627627924008, + "loss": 1.6718, + "step": 749 + }, + { + "epoch": 0.019257896941364518, + "grad_norm": 0.9609375, + "learning_rate": 0.000199555856099243, + "loss": 1.4823, + "step": 750 + }, + { + "epoch": 0.019283574137286336, + "grad_norm": 0.9609375, + "learning_rate": 0.0001995554357208401, + "loss": 1.6049, + "step": 751 + }, + { + "epoch": 0.01930925133320816, + "grad_norm": 1.0546875, + "learning_rate": 0.00019955501514403224, + "loss": 1.6845, + "step": 752 + }, + { + "epoch": 0.019334928529129977, + "grad_norm": 1.109375, + "learning_rate": 0.00019955459436882017, + "loss": 1.7104, + "step": 753 + }, + { + "epoch": 0.019360605725051796, + "grad_norm": 1.0234375, + "learning_rate": 0.00019955417339520483, + "loss": 1.5796, + "step": 754 + }, + { + "epoch": 0.019386282920973615, + "grad_norm": 1.0234375, + "learning_rate": 0.000199553752223187, + "loss": 1.6223, + "step": 755 + }, + { + "epoch": 0.019411960116895433, + "grad_norm": 1.015625, + "learning_rate": 0.0001995533308527675, + "loss": 1.6281, + "step": 756 + }, + { + "epoch": 0.019437637312817255, + "grad_norm": 1.0703125, + "learning_rate": 0.00019955290928394727, + "loss": 1.5164, + "step": 757 + }, + { + "epoch": 0.019463314508739074, + "grad_norm": 1.0703125, + "learning_rate": 0.00019955248751672705, + "loss": 1.4484, + "step": 758 + }, + { + "epoch": 0.019488991704660893, + "grad_norm": 1.09375, + "learning_rate": 0.0001995520655511077, + "loss": 1.7047, + "step": 759 + }, + { + "epoch": 0.01951466890058271, + "grad_norm": 1.078125, + "learning_rate": 0.0001995516433870901, + "loss": 1.6287, + "step": 760 + }, + { + "epoch": 0.01954034609650453, + "grad_norm": 0.9296875, + "learning_rate": 0.00019955122102467506, + "loss": 1.5167, + "step": 761 + }, + { + "epoch": 0.01956602329242635, + "grad_norm": 1.078125, + "learning_rate": 0.0001995507984638634, + "loss": 1.6206, + "step": 762 + }, + { + "epoch": 0.01959170048834817, + "grad_norm": 1.03125, + "learning_rate": 0.00019955037570465605, + "loss": 1.5878, + "step": 763 + }, + { + "epoch": 0.01961737768426999, + "grad_norm": 1.1328125, + "learning_rate": 0.00019954995274705376, + "loss": 1.5512, + "step": 764 + }, + { + "epoch": 0.01964305488019181, + "grad_norm": 1.0625, + "learning_rate": 0.0001995495295910574, + "loss": 1.5201, + "step": 765 + }, + { + "epoch": 0.019668732076113627, + "grad_norm": 1.0625, + "learning_rate": 0.00019954910623666783, + "loss": 1.6177, + "step": 766 + }, + { + "epoch": 0.019694409272035446, + "grad_norm": 1.2265625, + "learning_rate": 0.0001995486826838859, + "loss": 1.6516, + "step": 767 + }, + { + "epoch": 0.019720086467957268, + "grad_norm": 1.03125, + "learning_rate": 0.0001995482589327124, + "loss": 1.4382, + "step": 768 + }, + { + "epoch": 0.019745763663879087, + "grad_norm": 1.125, + "learning_rate": 0.00019954783498314825, + "loss": 1.5313, + "step": 769 + }, + { + "epoch": 0.019771440859800905, + "grad_norm": 1.0234375, + "learning_rate": 0.00019954741083519424, + "loss": 1.5431, + "step": 770 + }, + { + "epoch": 0.019797118055722724, + "grad_norm": 1.0703125, + "learning_rate": 0.0001995469864888512, + "loss": 1.5872, + "step": 771 + }, + { + "epoch": 0.019822795251644543, + "grad_norm": 1.09375, + "learning_rate": 0.00019954656194412005, + "loss": 1.582, + "step": 772 + }, + { + "epoch": 0.019848472447566365, + "grad_norm": 1.0390625, + "learning_rate": 0.00019954613720100158, + "loss": 1.5987, + "step": 773 + }, + { + "epoch": 0.019874149643488184, + "grad_norm": 1.109375, + "learning_rate": 0.00019954571225949666, + "loss": 1.3921, + "step": 774 + }, + { + "epoch": 0.019899826839410002, + "grad_norm": 0.9140625, + "learning_rate": 0.00019954528711960611, + "loss": 1.4783, + "step": 775 + }, + { + "epoch": 0.01992550403533182, + "grad_norm": 1.046875, + "learning_rate": 0.00019954486178133083, + "loss": 1.4535, + "step": 776 + }, + { + "epoch": 0.01995118123125364, + "grad_norm": 1.03125, + "learning_rate": 0.00019954443624467158, + "loss": 1.5987, + "step": 777 + }, + { + "epoch": 0.019976858427175462, + "grad_norm": 1.078125, + "learning_rate": 0.00019954401050962929, + "loss": 1.6735, + "step": 778 + }, + { + "epoch": 0.02000253562309728, + "grad_norm": 1.171875, + "learning_rate": 0.0001995435845762048, + "loss": 1.3754, + "step": 779 + }, + { + "epoch": 0.0200282128190191, + "grad_norm": 0.98828125, + "learning_rate": 0.0001995431584443989, + "loss": 1.5412, + "step": 780 + }, + { + "epoch": 0.020053890014940918, + "grad_norm": 1.0625, + "learning_rate": 0.00019954273211421247, + "loss": 1.4165, + "step": 781 + }, + { + "epoch": 0.020079567210862737, + "grad_norm": 0.90234375, + "learning_rate": 0.00019954230558564642, + "loss": 1.4773, + "step": 782 + }, + { + "epoch": 0.02010524440678456, + "grad_norm": 0.9765625, + "learning_rate": 0.0001995418788587015, + "loss": 1.5705, + "step": 783 + }, + { + "epoch": 0.020130921602706377, + "grad_norm": 1.0546875, + "learning_rate": 0.00019954145193337862, + "loss": 1.6192, + "step": 784 + }, + { + "epoch": 0.020156598798628196, + "grad_norm": 1.0625, + "learning_rate": 0.0001995410248096786, + "loss": 1.393, + "step": 785 + }, + { + "epoch": 0.020182275994550015, + "grad_norm": 0.94921875, + "learning_rate": 0.00019954059748760234, + "loss": 1.3434, + "step": 786 + }, + { + "epoch": 0.020207953190471833, + "grad_norm": 1.1015625, + "learning_rate": 0.00019954016996715068, + "loss": 1.4503, + "step": 787 + }, + { + "epoch": 0.020233630386393652, + "grad_norm": 1.046875, + "learning_rate": 0.00019953974224832442, + "loss": 1.6668, + "step": 788 + }, + { + "epoch": 0.020259307582315474, + "grad_norm": 1.0703125, + "learning_rate": 0.00019953931433112443, + "loss": 1.6038, + "step": 789 + }, + { + "epoch": 0.020284984778237293, + "grad_norm": 1.015625, + "learning_rate": 0.0001995388862155516, + "loss": 1.4964, + "step": 790 + }, + { + "epoch": 0.02031066197415911, + "grad_norm": 1.109375, + "learning_rate": 0.00019953845790160677, + "loss": 1.6327, + "step": 791 + }, + { + "epoch": 0.02033633917008093, + "grad_norm": 1.015625, + "learning_rate": 0.00019953802938929075, + "loss": 1.6404, + "step": 792 + }, + { + "epoch": 0.02036201636600275, + "grad_norm": 1.0078125, + "learning_rate": 0.00019953760067860444, + "loss": 1.5059, + "step": 793 + }, + { + "epoch": 0.02038769356192457, + "grad_norm": 0.96875, + "learning_rate": 0.0001995371717695487, + "loss": 1.5551, + "step": 794 + }, + { + "epoch": 0.02041337075784639, + "grad_norm": 1.0234375, + "learning_rate": 0.00019953674266212437, + "loss": 1.5828, + "step": 795 + }, + { + "epoch": 0.02043904795376821, + "grad_norm": 1.0703125, + "learning_rate": 0.0001995363133563323, + "loss": 1.6665, + "step": 796 + }, + { + "epoch": 0.020464725149690027, + "grad_norm": 1.0625, + "learning_rate": 0.00019953588385217334, + "loss": 1.4506, + "step": 797 + }, + { + "epoch": 0.020490402345611846, + "grad_norm": 1.0390625, + "learning_rate": 0.00019953545414964837, + "loss": 1.5594, + "step": 798 + }, + { + "epoch": 0.020516079541533668, + "grad_norm": 1.0703125, + "learning_rate": 0.00019953502424875817, + "loss": 1.5167, + "step": 799 + }, + { + "epoch": 0.020541756737455487, + "grad_norm": 0.99609375, + "learning_rate": 0.00019953459414950373, + "loss": 1.3748, + "step": 800 + }, + { + "epoch": 0.020567433933377306, + "grad_norm": 1.09375, + "learning_rate": 0.00019953416385188583, + "loss": 1.4606, + "step": 801 + }, + { + "epoch": 0.020593111129299124, + "grad_norm": 1.0546875, + "learning_rate": 0.0001995337333559053, + "loss": 1.5209, + "step": 802 + }, + { + "epoch": 0.020618788325220943, + "grad_norm": 1.0703125, + "learning_rate": 0.00019953330266156302, + "loss": 1.6689, + "step": 803 + }, + { + "epoch": 0.020644465521142765, + "grad_norm": 0.9765625, + "learning_rate": 0.0001995328717688599, + "loss": 1.4536, + "step": 804 + }, + { + "epoch": 0.020670142717064584, + "grad_norm": 0.96484375, + "learning_rate": 0.00019953244067779673, + "loss": 1.4721, + "step": 805 + }, + { + "epoch": 0.020695819912986402, + "grad_norm": 1.078125, + "learning_rate": 0.0001995320093883744, + "loss": 1.7268, + "step": 806 + }, + { + "epoch": 0.02072149710890822, + "grad_norm": 1.0, + "learning_rate": 0.00019953157790059378, + "loss": 1.3641, + "step": 807 + }, + { + "epoch": 0.02074717430483004, + "grad_norm": 1.0859375, + "learning_rate": 0.0001995311462144557, + "loss": 1.5842, + "step": 808 + }, + { + "epoch": 0.020772851500751862, + "grad_norm": 0.98828125, + "learning_rate": 0.00019953071432996105, + "loss": 1.4441, + "step": 809 + }, + { + "epoch": 0.02079852869667368, + "grad_norm": 0.99609375, + "learning_rate": 0.00019953028224711065, + "loss": 1.5264, + "step": 810 + }, + { + "epoch": 0.0208242058925955, + "grad_norm": 1.109375, + "learning_rate": 0.00019952984996590543, + "loss": 1.3372, + "step": 811 + }, + { + "epoch": 0.020849883088517318, + "grad_norm": 1.0078125, + "learning_rate": 0.0001995294174863462, + "loss": 1.6151, + "step": 812 + }, + { + "epoch": 0.020875560284439137, + "grad_norm": 1.0625, + "learning_rate": 0.0001995289848084338, + "loss": 1.4193, + "step": 813 + }, + { + "epoch": 0.020901237480360955, + "grad_norm": 1.046875, + "learning_rate": 0.00019952855193216916, + "loss": 1.4436, + "step": 814 + }, + { + "epoch": 0.020926914676282778, + "grad_norm": 0.9921875, + "learning_rate": 0.00019952811885755306, + "loss": 1.6566, + "step": 815 + }, + { + "epoch": 0.020952591872204596, + "grad_norm": 1.0703125, + "learning_rate": 0.00019952768558458646, + "loss": 1.5506, + "step": 816 + }, + { + "epoch": 0.020978269068126415, + "grad_norm": 0.99609375, + "learning_rate": 0.00019952725211327014, + "loss": 1.5133, + "step": 817 + }, + { + "epoch": 0.021003946264048234, + "grad_norm": 0.9921875, + "learning_rate": 0.000199526818443605, + "loss": 1.4206, + "step": 818 + }, + { + "epoch": 0.021029623459970052, + "grad_norm": 1.015625, + "learning_rate": 0.0001995263845755919, + "loss": 1.5456, + "step": 819 + }, + { + "epoch": 0.021055300655891875, + "grad_norm": 0.9765625, + "learning_rate": 0.00019952595050923172, + "loss": 1.4414, + "step": 820 + }, + { + "epoch": 0.021080977851813693, + "grad_norm": 1.0234375, + "learning_rate": 0.0001995255162445253, + "loss": 1.4631, + "step": 821 + }, + { + "epoch": 0.021106655047735512, + "grad_norm": 0.9453125, + "learning_rate": 0.00019952508178147353, + "loss": 1.5066, + "step": 822 + }, + { + "epoch": 0.02113233224365733, + "grad_norm": 1.0390625, + "learning_rate": 0.00019952464712007725, + "loss": 1.5715, + "step": 823 + }, + { + "epoch": 0.02115800943957915, + "grad_norm": 1.0234375, + "learning_rate": 0.00019952421226033734, + "loss": 1.3491, + "step": 824 + }, + { + "epoch": 0.02118368663550097, + "grad_norm": 0.9453125, + "learning_rate": 0.00019952377720225468, + "loss": 1.4982, + "step": 825 + }, + { + "epoch": 0.02120936383142279, + "grad_norm": 1.109375, + "learning_rate": 0.0001995233419458301, + "loss": 1.6414, + "step": 826 + }, + { + "epoch": 0.02123504102734461, + "grad_norm": 1.0, + "learning_rate": 0.00019952290649106452, + "loss": 1.5785, + "step": 827 + }, + { + "epoch": 0.021260718223266428, + "grad_norm": 1.0859375, + "learning_rate": 0.00019952247083795873, + "loss": 1.4548, + "step": 828 + }, + { + "epoch": 0.021286395419188246, + "grad_norm": 1.0390625, + "learning_rate": 0.00019952203498651368, + "loss": 1.4499, + "step": 829 + }, + { + "epoch": 0.02131207261511007, + "grad_norm": 1.0546875, + "learning_rate": 0.00019952159893673018, + "loss": 1.4731, + "step": 830 + }, + { + "epoch": 0.021337749811031887, + "grad_norm": 1.0078125, + "learning_rate": 0.00019952116268860914, + "loss": 1.377, + "step": 831 + }, + { + "epoch": 0.021363427006953706, + "grad_norm": 0.95703125, + "learning_rate": 0.00019952072624215142, + "loss": 1.4716, + "step": 832 + }, + { + "epoch": 0.021389104202875524, + "grad_norm": 0.94921875, + "learning_rate": 0.00019952028959735788, + "loss": 1.4359, + "step": 833 + }, + { + "epoch": 0.021414781398797343, + "grad_norm": 1.0625, + "learning_rate": 0.00019951985275422937, + "loss": 1.6344, + "step": 834 + }, + { + "epoch": 0.021440458594719165, + "grad_norm": 1.0703125, + "learning_rate": 0.0001995194157127668, + "loss": 1.5806, + "step": 835 + }, + { + "epoch": 0.021466135790640984, + "grad_norm": 1.0625, + "learning_rate": 0.00019951897847297103, + "loss": 1.6625, + "step": 836 + }, + { + "epoch": 0.021491812986562803, + "grad_norm": 1.0625, + "learning_rate": 0.00019951854103484294, + "loss": 1.3793, + "step": 837 + }, + { + "epoch": 0.02151749018248462, + "grad_norm": 1.015625, + "learning_rate": 0.00019951810339838334, + "loss": 1.4103, + "step": 838 + }, + { + "epoch": 0.02154316737840644, + "grad_norm": 1.078125, + "learning_rate": 0.00019951766556359316, + "loss": 1.5541, + "step": 839 + }, + { + "epoch": 0.02156884457432826, + "grad_norm": 0.9765625, + "learning_rate": 0.0001995172275304733, + "loss": 1.5021, + "step": 840 + }, + { + "epoch": 0.02159452177025008, + "grad_norm": 0.94140625, + "learning_rate": 0.00019951678929902458, + "loss": 1.6555, + "step": 841 + }, + { + "epoch": 0.0216201989661719, + "grad_norm": 1.0546875, + "learning_rate": 0.00019951635086924786, + "loss": 1.5905, + "step": 842 + }, + { + "epoch": 0.02164587616209372, + "grad_norm": 1.140625, + "learning_rate": 0.00019951591224114408, + "loss": 1.6506, + "step": 843 + }, + { + "epoch": 0.021671553358015537, + "grad_norm": 1.0390625, + "learning_rate": 0.00019951547341471405, + "loss": 1.699, + "step": 844 + }, + { + "epoch": 0.021697230553937356, + "grad_norm": 0.97265625, + "learning_rate": 0.0001995150343899587, + "loss": 1.5775, + "step": 845 + }, + { + "epoch": 0.021722907749859178, + "grad_norm": 1.0, + "learning_rate": 0.00019951459516687884, + "loss": 1.4354, + "step": 846 + }, + { + "epoch": 0.021748584945780997, + "grad_norm": 0.953125, + "learning_rate": 0.00019951415574547538, + "loss": 1.5518, + "step": 847 + }, + { + "epoch": 0.021774262141702815, + "grad_norm": 1.5234375, + "learning_rate": 0.0001995137161257492, + "loss": 1.6481, + "step": 848 + }, + { + "epoch": 0.021799939337624634, + "grad_norm": 1.0859375, + "learning_rate": 0.0001995132763077012, + "loss": 1.5445, + "step": 849 + }, + { + "epoch": 0.021825616533546453, + "grad_norm": 1.0078125, + "learning_rate": 0.00019951283629133222, + "loss": 1.3666, + "step": 850 + }, + { + "epoch": 0.021851293729468275, + "grad_norm": 1.6640625, + "learning_rate": 0.00019951239607664313, + "loss": 1.6768, + "step": 851 + }, + { + "epoch": 0.021876970925390093, + "grad_norm": 0.96875, + "learning_rate": 0.00019951195566363482, + "loss": 1.4034, + "step": 852 + }, + { + "epoch": 0.021902648121311912, + "grad_norm": 1.046875, + "learning_rate": 0.00019951151505230818, + "loss": 1.4673, + "step": 853 + }, + { + "epoch": 0.02192832531723373, + "grad_norm": 1.046875, + "learning_rate": 0.0001995110742426641, + "loss": 1.566, + "step": 854 + }, + { + "epoch": 0.02195400251315555, + "grad_norm": 0.96875, + "learning_rate": 0.00019951063323470344, + "loss": 1.3887, + "step": 855 + }, + { + "epoch": 0.02197967970907737, + "grad_norm": 1.046875, + "learning_rate": 0.00019951019202842703, + "loss": 1.424, + "step": 856 + }, + { + "epoch": 0.02200535690499919, + "grad_norm": 1.0234375, + "learning_rate": 0.00019950975062383582, + "loss": 1.533, + "step": 857 + }, + { + "epoch": 0.02203103410092101, + "grad_norm": 0.9765625, + "learning_rate": 0.0001995093090209307, + "loss": 1.4614, + "step": 858 + }, + { + "epoch": 0.022056711296842828, + "grad_norm": 1.0078125, + "learning_rate": 0.00019950886721971248, + "loss": 1.6161, + "step": 859 + }, + { + "epoch": 0.022082388492764646, + "grad_norm": 1.0234375, + "learning_rate": 0.00019950842522018208, + "loss": 1.4764, + "step": 860 + }, + { + "epoch": 0.02210806568868647, + "grad_norm": 0.95703125, + "learning_rate": 0.00019950798302234037, + "loss": 1.4672, + "step": 861 + }, + { + "epoch": 0.022133742884608287, + "grad_norm": 1.0859375, + "learning_rate": 0.00019950754062618824, + "loss": 1.8103, + "step": 862 + }, + { + "epoch": 0.022159420080530106, + "grad_norm": 1.046875, + "learning_rate": 0.0001995070980317266, + "loss": 1.5803, + "step": 863 + }, + { + "epoch": 0.022185097276451925, + "grad_norm": 0.9375, + "learning_rate": 0.00019950665523895626, + "loss": 1.4217, + "step": 864 + }, + { + "epoch": 0.022210774472373743, + "grad_norm": 1.078125, + "learning_rate": 0.00019950621224787816, + "loss": 1.5597, + "step": 865 + }, + { + "epoch": 0.022236451668295562, + "grad_norm": 0.98046875, + "learning_rate": 0.00019950576905849318, + "loss": 1.4953, + "step": 866 + }, + { + "epoch": 0.022262128864217384, + "grad_norm": 1.046875, + "learning_rate": 0.0001995053256708022, + "loss": 1.4599, + "step": 867 + }, + { + "epoch": 0.022287806060139203, + "grad_norm": 1.0390625, + "learning_rate": 0.00019950488208480606, + "loss": 1.3956, + "step": 868 + }, + { + "epoch": 0.02231348325606102, + "grad_norm": 0.99609375, + "learning_rate": 0.0001995044383005057, + "loss": 1.6099, + "step": 869 + }, + { + "epoch": 0.02233916045198284, + "grad_norm": 1.046875, + "learning_rate": 0.00019950399431790196, + "loss": 1.6576, + "step": 870 + }, + { + "epoch": 0.02236483764790466, + "grad_norm": 0.94921875, + "learning_rate": 0.00019950355013699576, + "loss": 1.2904, + "step": 871 + }, + { + "epoch": 0.02239051484382648, + "grad_norm": 0.93359375, + "learning_rate": 0.000199503105757788, + "loss": 1.5904, + "step": 872 + }, + { + "epoch": 0.0224161920397483, + "grad_norm": 1.0078125, + "learning_rate": 0.00019950266118027953, + "loss": 1.4324, + "step": 873 + }, + { + "epoch": 0.02244186923567012, + "grad_norm": 1.03125, + "learning_rate": 0.00019950221640447122, + "loss": 1.458, + "step": 874 + }, + { + "epoch": 0.022467546431591937, + "grad_norm": 1.046875, + "learning_rate": 0.00019950177143036396, + "loss": 1.6217, + "step": 875 + }, + { + "epoch": 0.022493223627513756, + "grad_norm": 1.265625, + "learning_rate": 0.00019950132625795873, + "loss": 1.4882, + "step": 876 + }, + { + "epoch": 0.022518900823435578, + "grad_norm": 1.03125, + "learning_rate": 0.00019950088088725626, + "loss": 1.6471, + "step": 877 + }, + { + "epoch": 0.022544578019357397, + "grad_norm": 1.078125, + "learning_rate": 0.0001995004353182576, + "loss": 1.484, + "step": 878 + }, + { + "epoch": 0.022570255215279215, + "grad_norm": 1.09375, + "learning_rate": 0.0001994999895509635, + "loss": 1.4079, + "step": 879 + }, + { + "epoch": 0.022595932411201034, + "grad_norm": 1.0234375, + "learning_rate": 0.00019949954358537494, + "loss": 1.4707, + "step": 880 + }, + { + "epoch": 0.022621609607122853, + "grad_norm": 1.015625, + "learning_rate": 0.00019949909742149278, + "loss": 1.6398, + "step": 881 + }, + { + "epoch": 0.022647286803044675, + "grad_norm": 1.015625, + "learning_rate": 0.0001994986510593179, + "loss": 1.4026, + "step": 882 + }, + { + "epoch": 0.022672963998966494, + "grad_norm": 0.9375, + "learning_rate": 0.0001994982044988512, + "loss": 1.4445, + "step": 883 + }, + { + "epoch": 0.022698641194888312, + "grad_norm": 0.9296875, + "learning_rate": 0.00019949775774009355, + "loss": 1.3877, + "step": 884 + }, + { + "epoch": 0.02272431839081013, + "grad_norm": 0.984375, + "learning_rate": 0.00019949731078304587, + "loss": 1.58, + "step": 885 + }, + { + "epoch": 0.02274999558673195, + "grad_norm": 0.9921875, + "learning_rate": 0.00019949686362770902, + "loss": 1.3893, + "step": 886 + }, + { + "epoch": 0.022775672782653772, + "grad_norm": 1.125, + "learning_rate": 0.00019949641627408393, + "loss": 1.5333, + "step": 887 + }, + { + "epoch": 0.02280134997857559, + "grad_norm": 0.9453125, + "learning_rate": 0.00019949596872217146, + "loss": 1.3745, + "step": 888 + }, + { + "epoch": 0.02282702717449741, + "grad_norm": 1.015625, + "learning_rate": 0.00019949552097197254, + "loss": 1.4622, + "step": 889 + }, + { + "epoch": 0.022852704370419228, + "grad_norm": 1.1328125, + "learning_rate": 0.00019949507302348797, + "loss": 1.4962, + "step": 890 + }, + { + "epoch": 0.022878381566341047, + "grad_norm": 1.0234375, + "learning_rate": 0.00019949462487671874, + "loss": 1.4499, + "step": 891 + }, + { + "epoch": 0.022904058762262865, + "grad_norm": 1.03125, + "learning_rate": 0.00019949417653166573, + "loss": 1.5165, + "step": 892 + }, + { + "epoch": 0.022929735958184688, + "grad_norm": 0.98046875, + "learning_rate": 0.00019949372798832982, + "loss": 1.4468, + "step": 893 + }, + { + "epoch": 0.022955413154106506, + "grad_norm": 1.078125, + "learning_rate": 0.00019949327924671185, + "loss": 1.5104, + "step": 894 + }, + { + "epoch": 0.022981090350028325, + "grad_norm": 1.0078125, + "learning_rate": 0.00019949283030681279, + "loss": 1.3732, + "step": 895 + }, + { + "epoch": 0.023006767545950144, + "grad_norm": 1.0859375, + "learning_rate": 0.0001994923811686335, + "loss": 1.4353, + "step": 896 + }, + { + "epoch": 0.023032444741871962, + "grad_norm": 1.0078125, + "learning_rate": 0.0001994919318321749, + "loss": 1.4338, + "step": 897 + }, + { + "epoch": 0.023058121937793784, + "grad_norm": 1.015625, + "learning_rate": 0.00019949148229743785, + "loss": 1.4381, + "step": 898 + }, + { + "epoch": 0.023083799133715603, + "grad_norm": 1.0859375, + "learning_rate": 0.00019949103256442326, + "loss": 1.6771, + "step": 899 + }, + { + "epoch": 0.023109476329637422, + "grad_norm": 1.0703125, + "learning_rate": 0.00019949058263313204, + "loss": 1.8138, + "step": 900 + }, + { + "epoch": 0.02313515352555924, + "grad_norm": 1.0390625, + "learning_rate": 0.00019949013250356506, + "loss": 1.5203, + "step": 901 + }, + { + "epoch": 0.02316083072148106, + "grad_norm": 1.0625, + "learning_rate": 0.00019948968217572323, + "loss": 1.6572, + "step": 902 + }, + { + "epoch": 0.02318650791740288, + "grad_norm": 0.87890625, + "learning_rate": 0.00019948923164960747, + "loss": 1.5407, + "step": 903 + }, + { + "epoch": 0.0232121851133247, + "grad_norm": 0.921875, + "learning_rate": 0.00019948878092521867, + "loss": 1.5301, + "step": 904 + }, + { + "epoch": 0.02323786230924652, + "grad_norm": 0.96875, + "learning_rate": 0.00019948833000255767, + "loss": 1.4248, + "step": 905 + }, + { + "epoch": 0.023263539505168337, + "grad_norm": 0.93359375, + "learning_rate": 0.00019948787888162545, + "loss": 1.4084, + "step": 906 + }, + { + "epoch": 0.023289216701090156, + "grad_norm": 0.98046875, + "learning_rate": 0.00019948742756242287, + "loss": 1.3673, + "step": 907 + }, + { + "epoch": 0.02331489389701198, + "grad_norm": 1.0703125, + "learning_rate": 0.00019948697604495083, + "loss": 1.5057, + "step": 908 + }, + { + "epoch": 0.023340571092933797, + "grad_norm": 0.9921875, + "learning_rate": 0.00019948652432921022, + "loss": 1.2903, + "step": 909 + }, + { + "epoch": 0.023366248288855616, + "grad_norm": 1.09375, + "learning_rate": 0.00019948607241520198, + "loss": 1.5024, + "step": 910 + }, + { + "epoch": 0.023391925484777434, + "grad_norm": 0.97265625, + "learning_rate": 0.00019948562030292698, + "loss": 1.6616, + "step": 911 + }, + { + "epoch": 0.023417602680699253, + "grad_norm": 1.0078125, + "learning_rate": 0.0001994851679923861, + "loss": 1.4012, + "step": 912 + }, + { + "epoch": 0.023443279876621075, + "grad_norm": 1.0234375, + "learning_rate": 0.0001994847154835803, + "loss": 1.5844, + "step": 913 + }, + { + "epoch": 0.023468957072542894, + "grad_norm": 1.0, + "learning_rate": 0.00019948426277651042, + "loss": 1.3435, + "step": 914 + }, + { + "epoch": 0.023494634268464713, + "grad_norm": 1.0546875, + "learning_rate": 0.00019948380987117742, + "loss": 1.5494, + "step": 915 + }, + { + "epoch": 0.02352031146438653, + "grad_norm": 0.9765625, + "learning_rate": 0.00019948335676758214, + "loss": 1.5103, + "step": 916 + }, + { + "epoch": 0.02354598866030835, + "grad_norm": 1.015625, + "learning_rate": 0.0001994829034657255, + "loss": 1.4475, + "step": 917 + }, + { + "epoch": 0.02357166585623017, + "grad_norm": 1.046875, + "learning_rate": 0.0001994824499656085, + "loss": 1.6603, + "step": 918 + }, + { + "epoch": 0.02359734305215199, + "grad_norm": 0.984375, + "learning_rate": 0.0001994819962672319, + "loss": 1.4902, + "step": 919 + }, + { + "epoch": 0.02362302024807381, + "grad_norm": 0.96484375, + "learning_rate": 0.00019948154237059667, + "loss": 1.7293, + "step": 920 + }, + { + "epoch": 0.023648697443995628, + "grad_norm": 0.98828125, + "learning_rate": 0.00019948108827570372, + "loss": 1.6226, + "step": 921 + }, + { + "epoch": 0.023674374639917447, + "grad_norm": 1.0546875, + "learning_rate": 0.00019948063398255394, + "loss": 1.4017, + "step": 922 + }, + { + "epoch": 0.023700051835839266, + "grad_norm": 0.96484375, + "learning_rate": 0.00019948017949114827, + "loss": 1.2714, + "step": 923 + }, + { + "epoch": 0.023725729031761088, + "grad_norm": 1.0078125, + "learning_rate": 0.00019947972480148756, + "loss": 1.6549, + "step": 924 + }, + { + "epoch": 0.023751406227682906, + "grad_norm": 0.9765625, + "learning_rate": 0.00019947926991357274, + "loss": 1.667, + "step": 925 + }, + { + "epoch": 0.023777083423604725, + "grad_norm": 1.03125, + "learning_rate": 0.00019947881482740477, + "loss": 1.4544, + "step": 926 + }, + { + "epoch": 0.023802760619526544, + "grad_norm": 0.94921875, + "learning_rate": 0.00019947835954298447, + "loss": 1.4719, + "step": 927 + }, + { + "epoch": 0.023828437815448363, + "grad_norm": 1.0390625, + "learning_rate": 0.0001994779040603128, + "loss": 1.5881, + "step": 928 + }, + { + "epoch": 0.023854115011370185, + "grad_norm": 0.9375, + "learning_rate": 0.00019947744837939065, + "loss": 1.4513, + "step": 929 + }, + { + "epoch": 0.023879792207292003, + "grad_norm": 1.09375, + "learning_rate": 0.0001994769925002189, + "loss": 1.5569, + "step": 930 + }, + { + "epoch": 0.023905469403213822, + "grad_norm": 0.97265625, + "learning_rate": 0.00019947653642279854, + "loss": 1.2876, + "step": 931 + }, + { + "epoch": 0.02393114659913564, + "grad_norm": 1.0546875, + "learning_rate": 0.0001994760801471304, + "loss": 1.426, + "step": 932 + }, + { + "epoch": 0.02395682379505746, + "grad_norm": 1.0703125, + "learning_rate": 0.0001994756236732154, + "loss": 1.4106, + "step": 933 + }, + { + "epoch": 0.02398250099097928, + "grad_norm": 1.1015625, + "learning_rate": 0.0001994751670010545, + "loss": 1.6208, + "step": 934 + }, + { + "epoch": 0.0240081781869011, + "grad_norm": 1.0625, + "learning_rate": 0.00019947471013064856, + "loss": 1.4117, + "step": 935 + }, + { + "epoch": 0.02403385538282292, + "grad_norm": 1.0546875, + "learning_rate": 0.00019947425306199852, + "loss": 1.4862, + "step": 936 + }, + { + "epoch": 0.024059532578744738, + "grad_norm": 0.90625, + "learning_rate": 0.00019947379579510525, + "loss": 1.3705, + "step": 937 + }, + { + "epoch": 0.024085209774666556, + "grad_norm": 0.9765625, + "learning_rate": 0.00019947333832996973, + "loss": 1.4999, + "step": 938 + }, + { + "epoch": 0.02411088697058838, + "grad_norm": 0.98046875, + "learning_rate": 0.0001994728806665928, + "loss": 1.4521, + "step": 939 + }, + { + "epoch": 0.024136564166510197, + "grad_norm": 0.9453125, + "learning_rate": 0.00019947242280497546, + "loss": 1.4601, + "step": 940 + }, + { + "epoch": 0.024162241362432016, + "grad_norm": 1.015625, + "learning_rate": 0.0001994719647451185, + "loss": 1.5144, + "step": 941 + }, + { + "epoch": 0.024187918558353835, + "grad_norm": 1.0234375, + "learning_rate": 0.00019947150648702292, + "loss": 1.4534, + "step": 942 + }, + { + "epoch": 0.024213595754275653, + "grad_norm": 1.015625, + "learning_rate": 0.00019947104803068964, + "loss": 1.4953, + "step": 943 + }, + { + "epoch": 0.024239272950197472, + "grad_norm": 0.96484375, + "learning_rate": 0.0001994705893761195, + "loss": 1.5119, + "step": 944 + }, + { + "epoch": 0.024264950146119294, + "grad_norm": 0.984375, + "learning_rate": 0.0001994701305233135, + "loss": 1.3314, + "step": 945 + }, + { + "epoch": 0.024290627342041113, + "grad_norm": 1.0, + "learning_rate": 0.0001994696714722725, + "loss": 1.374, + "step": 946 + }, + { + "epoch": 0.02431630453796293, + "grad_norm": 1.03125, + "learning_rate": 0.00019946921222299745, + "loss": 1.4049, + "step": 947 + }, + { + "epoch": 0.02434198173388475, + "grad_norm": 0.96484375, + "learning_rate": 0.00019946875277548922, + "loss": 1.5508, + "step": 948 + }, + { + "epoch": 0.02436765892980657, + "grad_norm": 1.046875, + "learning_rate": 0.00019946829312974875, + "loss": 1.531, + "step": 949 + }, + { + "epoch": 0.02439333612572839, + "grad_norm": 0.984375, + "learning_rate": 0.00019946783328577696, + "loss": 1.592, + "step": 950 + }, + { + "epoch": 0.02441901332165021, + "grad_norm": 1.046875, + "learning_rate": 0.00019946737324357477, + "loss": 1.2547, + "step": 951 + }, + { + "epoch": 0.02444469051757203, + "grad_norm": 0.96484375, + "learning_rate": 0.0001994669130031431, + "loss": 1.5151, + "step": 952 + }, + { + "epoch": 0.024470367713493847, + "grad_norm": 1.0703125, + "learning_rate": 0.00019946645256448286, + "loss": 1.53, + "step": 953 + }, + { + "epoch": 0.024496044909415666, + "grad_norm": 1.0859375, + "learning_rate": 0.00019946599192759493, + "loss": 1.4622, + "step": 954 + }, + { + "epoch": 0.024521722105337488, + "grad_norm": 0.9921875, + "learning_rate": 0.00019946553109248028, + "loss": 1.4093, + "step": 955 + }, + { + "epoch": 0.024547399301259307, + "grad_norm": 0.96484375, + "learning_rate": 0.0001994650700591398, + "loss": 1.5277, + "step": 956 + }, + { + "epoch": 0.024573076497181125, + "grad_norm": 0.9765625, + "learning_rate": 0.00019946460882757443, + "loss": 1.4143, + "step": 957 + }, + { + "epoch": 0.024598753693102944, + "grad_norm": 0.953125, + "learning_rate": 0.00019946414739778509, + "loss": 1.5181, + "step": 958 + }, + { + "epoch": 0.024624430889024763, + "grad_norm": 1.0078125, + "learning_rate": 0.00019946368576977266, + "loss": 1.6324, + "step": 959 + }, + { + "epoch": 0.024650108084946585, + "grad_norm": 0.94921875, + "learning_rate": 0.00019946322394353812, + "loss": 1.4392, + "step": 960 + }, + { + "epoch": 0.024675785280868404, + "grad_norm": 0.9765625, + "learning_rate": 0.00019946276191908235, + "loss": 1.5506, + "step": 961 + }, + { + "epoch": 0.024701462476790222, + "grad_norm": 0.91015625, + "learning_rate": 0.00019946229969640625, + "loss": 1.5007, + "step": 962 + }, + { + "epoch": 0.02472713967271204, + "grad_norm": 1.0546875, + "learning_rate": 0.0001994618372755108, + "loss": 1.3606, + "step": 963 + }, + { + "epoch": 0.02475281686863386, + "grad_norm": 0.984375, + "learning_rate": 0.00019946137465639692, + "loss": 1.3803, + "step": 964 + }, + { + "epoch": 0.024778494064555682, + "grad_norm": 0.99609375, + "learning_rate": 0.00019946091183906548, + "loss": 1.3568, + "step": 965 + }, + { + "epoch": 0.0248041712604775, + "grad_norm": 1.0546875, + "learning_rate": 0.00019946044882351743, + "loss": 1.5297, + "step": 966 + }, + { + "epoch": 0.02482984845639932, + "grad_norm": 0.9921875, + "learning_rate": 0.00019945998560975367, + "loss": 1.5713, + "step": 967 + }, + { + "epoch": 0.024855525652321138, + "grad_norm": 1.0234375, + "learning_rate": 0.00019945952219777518, + "loss": 1.3684, + "step": 968 + }, + { + "epoch": 0.024881202848242957, + "grad_norm": 1.015625, + "learning_rate": 0.0001994590585875828, + "loss": 1.5556, + "step": 969 + }, + { + "epoch": 0.024906880044164775, + "grad_norm": 0.95703125, + "learning_rate": 0.00019945859477917753, + "loss": 1.5417, + "step": 970 + }, + { + "epoch": 0.024932557240086597, + "grad_norm": 1.0234375, + "learning_rate": 0.00019945813077256025, + "loss": 1.6196, + "step": 971 + }, + { + "epoch": 0.024958234436008416, + "grad_norm": 1.0390625, + "learning_rate": 0.00019945766656773192, + "loss": 1.608, + "step": 972 + }, + { + "epoch": 0.024983911631930235, + "grad_norm": 1.0859375, + "learning_rate": 0.00019945720216469343, + "loss": 1.4669, + "step": 973 + }, + { + "epoch": 0.025009588827852054, + "grad_norm": 1.0, + "learning_rate": 0.0001994567375634457, + "loss": 1.6096, + "step": 974 + }, + { + "epoch": 0.025035266023773872, + "grad_norm": 1.0390625, + "learning_rate": 0.0001994562727639897, + "loss": 1.5775, + "step": 975 + }, + { + "epoch": 0.025060943219695694, + "grad_norm": 0.9921875, + "learning_rate": 0.00019945580776632634, + "loss": 1.5278, + "step": 976 + }, + { + "epoch": 0.025086620415617513, + "grad_norm": 1.03125, + "learning_rate": 0.00019945534257045653, + "loss": 1.4514, + "step": 977 + }, + { + "epoch": 0.025112297611539332, + "grad_norm": 0.92578125, + "learning_rate": 0.0001994548771763812, + "loss": 1.4368, + "step": 978 + }, + { + "epoch": 0.02513797480746115, + "grad_norm": 1.0, + "learning_rate": 0.0001994544115841013, + "loss": 1.6954, + "step": 979 + }, + { + "epoch": 0.02516365200338297, + "grad_norm": 1.09375, + "learning_rate": 0.0001994539457936177, + "loss": 1.4419, + "step": 980 + }, + { + "epoch": 0.02518932919930479, + "grad_norm": 0.9375, + "learning_rate": 0.0001994534798049314, + "loss": 1.4578, + "step": 981 + }, + { + "epoch": 0.02521500639522661, + "grad_norm": 1.03125, + "learning_rate": 0.00019945301361804331, + "loss": 1.4524, + "step": 982 + }, + { + "epoch": 0.02524068359114843, + "grad_norm": 0.99609375, + "learning_rate": 0.00019945254723295435, + "loss": 1.4384, + "step": 983 + }, + { + "epoch": 0.025266360787070247, + "grad_norm": 0.9921875, + "learning_rate": 0.00019945208064966544, + "loss": 1.6247, + "step": 984 + }, + { + "epoch": 0.025292037982992066, + "grad_norm": 0.90625, + "learning_rate": 0.0001994516138681775, + "loss": 1.391, + "step": 985 + }, + { + "epoch": 0.025317715178913888, + "grad_norm": 1.0234375, + "learning_rate": 0.00019945114688849148, + "loss": 1.5169, + "step": 986 + }, + { + "epoch": 0.025343392374835707, + "grad_norm": 0.953125, + "learning_rate": 0.00019945067971060832, + "loss": 1.2809, + "step": 987 + }, + { + "epoch": 0.025369069570757526, + "grad_norm": 1.0859375, + "learning_rate": 0.00019945021233452894, + "loss": 1.7627, + "step": 988 + }, + { + "epoch": 0.025394746766679344, + "grad_norm": 1.0703125, + "learning_rate": 0.00019944974476025426, + "loss": 1.6699, + "step": 989 + }, + { + "epoch": 0.025420423962601163, + "grad_norm": 1.0234375, + "learning_rate": 0.00019944927698778523, + "loss": 1.4578, + "step": 990 + }, + { + "epoch": 0.025446101158522985, + "grad_norm": 0.984375, + "learning_rate": 0.00019944880901712276, + "loss": 1.4202, + "step": 991 + }, + { + "epoch": 0.025471778354444804, + "grad_norm": 1.03125, + "learning_rate": 0.0001994483408482678, + "loss": 1.5926, + "step": 992 + }, + { + "epoch": 0.025497455550366623, + "grad_norm": 0.99609375, + "learning_rate": 0.00019944787248122128, + "loss": 1.4173, + "step": 993 + }, + { + "epoch": 0.02552313274628844, + "grad_norm": 1.0234375, + "learning_rate": 0.00019944740391598416, + "loss": 1.5358, + "step": 994 + }, + { + "epoch": 0.02554880994221026, + "grad_norm": 1.0703125, + "learning_rate": 0.0001994469351525573, + "loss": 1.4454, + "step": 995 + }, + { + "epoch": 0.02557448713813208, + "grad_norm": 1.015625, + "learning_rate": 0.00019944646619094169, + "loss": 1.5302, + "step": 996 + }, + { + "epoch": 0.0256001643340539, + "grad_norm": 1.1015625, + "learning_rate": 0.00019944599703113829, + "loss": 1.4566, + "step": 997 + }, + { + "epoch": 0.02562584152997572, + "grad_norm": 1.015625, + "learning_rate": 0.000199445527673148, + "loss": 1.5232, + "step": 998 + }, + { + "epoch": 0.025651518725897538, + "grad_norm": 1.0, + "learning_rate": 0.00019944505811697174, + "loss": 1.5513, + "step": 999 + }, + { + "epoch": 0.025677195921819357, + "grad_norm": 1.078125, + "learning_rate": 0.00019944458836261043, + "loss": 1.5472, + "step": 1000 + }, + { + "epoch": 0.025677195921819357, + "eval_loss": 1.4854185581207275, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 406.1321, + "eval_samples_per_second": 24.623, + "eval_steps_per_second": 0.771, + "step": 1000 + }, + { + "epoch": 0.025702873117741176, + "grad_norm": 0.98828125, + "learning_rate": 0.0001994441184100651, + "loss": 1.4708, + "step": 1001 + }, + { + "epoch": 0.025728550313662998, + "grad_norm": 0.921875, + "learning_rate": 0.00019944364825933656, + "loss": 1.4844, + "step": 1002 + }, + { + "epoch": 0.025754227509584816, + "grad_norm": 1.03125, + "learning_rate": 0.00019944317791042586, + "loss": 1.4611, + "step": 1003 + }, + { + "epoch": 0.025779904705506635, + "grad_norm": 0.98046875, + "learning_rate": 0.00019944270736333387, + "loss": 1.5811, + "step": 1004 + }, + { + "epoch": 0.025805581901428454, + "grad_norm": 0.9921875, + "learning_rate": 0.00019944223661806155, + "loss": 1.6801, + "step": 1005 + }, + { + "epoch": 0.025831259097350272, + "grad_norm": 0.94921875, + "learning_rate": 0.00019944176567460983, + "loss": 1.5402, + "step": 1006 + }, + { + "epoch": 0.025856936293272095, + "grad_norm": 1.0546875, + "learning_rate": 0.00019944129453297967, + "loss": 1.6332, + "step": 1007 + }, + { + "epoch": 0.025882613489193913, + "grad_norm": 1.0078125, + "learning_rate": 0.000199440823193172, + "loss": 1.3894, + "step": 1008 + }, + { + "epoch": 0.025908290685115732, + "grad_norm": 1.0625, + "learning_rate": 0.00019944035165518772, + "loss": 1.7039, + "step": 1009 + }, + { + "epoch": 0.02593396788103755, + "grad_norm": 1.03125, + "learning_rate": 0.00019943987991902784, + "loss": 1.4799, + "step": 1010 + }, + { + "epoch": 0.02595964507695937, + "grad_norm": 0.98046875, + "learning_rate": 0.00019943940798469322, + "loss": 1.5618, + "step": 1011 + }, + { + "epoch": 0.02598532227288119, + "grad_norm": 0.9765625, + "learning_rate": 0.00019943893585218486, + "loss": 1.4429, + "step": 1012 + }, + { + "epoch": 0.02601099946880301, + "grad_norm": 0.96484375, + "learning_rate": 0.00019943846352150367, + "loss": 1.5287, + "step": 1013 + }, + { + "epoch": 0.02603667666472483, + "grad_norm": 1.0078125, + "learning_rate": 0.00019943799099265063, + "loss": 1.5508, + "step": 1014 + }, + { + "epoch": 0.026062353860646648, + "grad_norm": 1.0234375, + "learning_rate": 0.00019943751826562663, + "loss": 1.572, + "step": 1015 + }, + { + "epoch": 0.026088031056568466, + "grad_norm": 0.9765625, + "learning_rate": 0.00019943704534043267, + "loss": 1.4846, + "step": 1016 + }, + { + "epoch": 0.02611370825249029, + "grad_norm": 1.0, + "learning_rate": 0.00019943657221706965, + "loss": 1.5034, + "step": 1017 + }, + { + "epoch": 0.026139385448412107, + "grad_norm": 1.0234375, + "learning_rate": 0.0001994360988955385, + "loss": 1.4575, + "step": 1018 + }, + { + "epoch": 0.026165062644333926, + "grad_norm": 0.97265625, + "learning_rate": 0.00019943562537584023, + "loss": 1.5424, + "step": 1019 + }, + { + "epoch": 0.026190739840255745, + "grad_norm": 1.078125, + "learning_rate": 0.00019943515165797568, + "loss": 1.4437, + "step": 1020 + }, + { + "epoch": 0.026216417036177563, + "grad_norm": 0.94921875, + "learning_rate": 0.0001994346777419459, + "loss": 1.5223, + "step": 1021 + }, + { + "epoch": 0.026242094232099382, + "grad_norm": 1.0859375, + "learning_rate": 0.00019943420362775177, + "loss": 1.4361, + "step": 1022 + }, + { + "epoch": 0.026267771428021204, + "grad_norm": 0.99609375, + "learning_rate": 0.0001994337293153943, + "loss": 1.3712, + "step": 1023 + }, + { + "epoch": 0.026293448623943023, + "grad_norm": 0.953125, + "learning_rate": 0.00019943325480487433, + "loss": 1.3299, + "step": 1024 + }, + { + "epoch": 0.02631912581986484, + "grad_norm": 1.0234375, + "learning_rate": 0.0001994327800961929, + "loss": 1.5511, + "step": 1025 + }, + { + "epoch": 0.02634480301578666, + "grad_norm": 0.98046875, + "learning_rate": 0.00019943230518935088, + "loss": 1.412, + "step": 1026 + }, + { + "epoch": 0.02637048021170848, + "grad_norm": 1.078125, + "learning_rate": 0.0001994318300843493, + "loss": 1.395, + "step": 1027 + }, + { + "epoch": 0.0263961574076303, + "grad_norm": 0.984375, + "learning_rate": 0.00019943135478118904, + "loss": 1.4236, + "step": 1028 + }, + { + "epoch": 0.02642183460355212, + "grad_norm": 0.94140625, + "learning_rate": 0.0001994308792798711, + "loss": 1.5184, + "step": 1029 + }, + { + "epoch": 0.02644751179947394, + "grad_norm": 0.9453125, + "learning_rate": 0.00019943040358039635, + "loss": 1.3486, + "step": 1030 + }, + { + "epoch": 0.026473188995395757, + "grad_norm": 1.0078125, + "learning_rate": 0.00019942992768276583, + "loss": 1.4837, + "step": 1031 + }, + { + "epoch": 0.026498866191317576, + "grad_norm": 1.0234375, + "learning_rate": 0.00019942945158698042, + "loss": 1.6237, + "step": 1032 + }, + { + "epoch": 0.026524543387239398, + "grad_norm": 1.1796875, + "learning_rate": 0.00019942897529304113, + "loss": 1.4504, + "step": 1033 + }, + { + "epoch": 0.026550220583161217, + "grad_norm": 1.0625, + "learning_rate": 0.00019942849880094884, + "loss": 1.5517, + "step": 1034 + }, + { + "epoch": 0.026575897779083035, + "grad_norm": 1.015625, + "learning_rate": 0.00019942802211070455, + "loss": 1.45, + "step": 1035 + }, + { + "epoch": 0.026601574975004854, + "grad_norm": 1.0703125, + "learning_rate": 0.00019942754522230918, + "loss": 1.637, + "step": 1036 + }, + { + "epoch": 0.026627252170926673, + "grad_norm": 1.125, + "learning_rate": 0.00019942706813576367, + "loss": 1.4628, + "step": 1037 + }, + { + "epoch": 0.026652929366848495, + "grad_norm": 1.0390625, + "learning_rate": 0.00019942659085106905, + "loss": 1.3764, + "step": 1038 + }, + { + "epoch": 0.026678606562770314, + "grad_norm": 0.98046875, + "learning_rate": 0.0001994261133682262, + "loss": 1.6435, + "step": 1039 + }, + { + "epoch": 0.026704283758692132, + "grad_norm": 1.03125, + "learning_rate": 0.00019942563568723607, + "loss": 1.4869, + "step": 1040 + }, + { + "epoch": 0.02672996095461395, + "grad_norm": 1.125, + "learning_rate": 0.00019942515780809963, + "loss": 1.4512, + "step": 1041 + }, + { + "epoch": 0.02675563815053577, + "grad_norm": 1.046875, + "learning_rate": 0.00019942467973081785, + "loss": 1.4988, + "step": 1042 + }, + { + "epoch": 0.026781315346457592, + "grad_norm": 0.9765625, + "learning_rate": 0.00019942420145539164, + "loss": 1.5283, + "step": 1043 + }, + { + "epoch": 0.02680699254237941, + "grad_norm": 0.98828125, + "learning_rate": 0.00019942372298182198, + "loss": 1.4573, + "step": 1044 + }, + { + "epoch": 0.02683266973830123, + "grad_norm": 1.0703125, + "learning_rate": 0.00019942324431010983, + "loss": 1.5492, + "step": 1045 + }, + { + "epoch": 0.026858346934223048, + "grad_norm": 1.0625, + "learning_rate": 0.00019942276544025614, + "loss": 1.3878, + "step": 1046 + }, + { + "epoch": 0.026884024130144867, + "grad_norm": 1.03125, + "learning_rate": 0.00019942228637226186, + "loss": 1.5424, + "step": 1047 + }, + { + "epoch": 0.026909701326066685, + "grad_norm": 1.09375, + "learning_rate": 0.00019942180710612794, + "loss": 1.5653, + "step": 1048 + }, + { + "epoch": 0.026935378521988507, + "grad_norm": 1.15625, + "learning_rate": 0.00019942132764185535, + "loss": 1.4343, + "step": 1049 + }, + { + "epoch": 0.026961055717910326, + "grad_norm": 1.03125, + "learning_rate": 0.00019942084797944503, + "loss": 1.4759, + "step": 1050 + }, + { + "epoch": 0.026986732913832145, + "grad_norm": 1.0703125, + "learning_rate": 0.00019942036811889792, + "loss": 1.5807, + "step": 1051 + }, + { + "epoch": 0.027012410109753963, + "grad_norm": 0.9609375, + "learning_rate": 0.00019941988806021502, + "loss": 1.3789, + "step": 1052 + }, + { + "epoch": 0.027038087305675782, + "grad_norm": 1.0, + "learning_rate": 0.00019941940780339725, + "loss": 1.3312, + "step": 1053 + }, + { + "epoch": 0.027063764501597604, + "grad_norm": 1.0546875, + "learning_rate": 0.00019941892734844557, + "loss": 1.4798, + "step": 1054 + }, + { + "epoch": 0.027089441697519423, + "grad_norm": 1.0625, + "learning_rate": 0.000199418446695361, + "loss": 1.5944, + "step": 1055 + }, + { + "epoch": 0.02711511889344124, + "grad_norm": 0.9453125, + "learning_rate": 0.00019941796584414437, + "loss": 1.3328, + "step": 1056 + }, + { + "epoch": 0.02714079608936306, + "grad_norm": 1.046875, + "learning_rate": 0.00019941748479479677, + "loss": 1.4425, + "step": 1057 + }, + { + "epoch": 0.02716647328528488, + "grad_norm": 1.0546875, + "learning_rate": 0.0001994170035473191, + "loss": 1.533, + "step": 1058 + }, + { + "epoch": 0.0271921504812067, + "grad_norm": 0.98828125, + "learning_rate": 0.00019941652210171232, + "loss": 1.404, + "step": 1059 + }, + { + "epoch": 0.02721782767712852, + "grad_norm": 0.94921875, + "learning_rate": 0.00019941604045797736, + "loss": 1.4773, + "step": 1060 + }, + { + "epoch": 0.02724350487305034, + "grad_norm": 1.046875, + "learning_rate": 0.00019941555861611524, + "loss": 1.667, + "step": 1061 + }, + { + "epoch": 0.027269182068972157, + "grad_norm": 1.015625, + "learning_rate": 0.00019941507657612688, + "loss": 1.5115, + "step": 1062 + }, + { + "epoch": 0.027294859264893976, + "grad_norm": 1.0, + "learning_rate": 0.00019941459433801324, + "loss": 1.6065, + "step": 1063 + }, + { + "epoch": 0.027320536460815798, + "grad_norm": 1.0625, + "learning_rate": 0.00019941411190177533, + "loss": 1.4143, + "step": 1064 + }, + { + "epoch": 0.027346213656737617, + "grad_norm": 1.0859375, + "learning_rate": 0.00019941362926741404, + "loss": 1.4215, + "step": 1065 + }, + { + "epoch": 0.027371890852659436, + "grad_norm": 0.93359375, + "learning_rate": 0.00019941314643493038, + "loss": 1.3561, + "step": 1066 + }, + { + "epoch": 0.027397568048581254, + "grad_norm": 1.1171875, + "learning_rate": 0.0001994126634043253, + "loss": 1.4143, + "step": 1067 + }, + { + "epoch": 0.027423245244503073, + "grad_norm": 0.98046875, + "learning_rate": 0.00019941218017559978, + "loss": 1.4436, + "step": 1068 + }, + { + "epoch": 0.027448922440424895, + "grad_norm": 1.0234375, + "learning_rate": 0.00019941169674875475, + "loss": 1.387, + "step": 1069 + }, + { + "epoch": 0.027474599636346714, + "grad_norm": 0.984375, + "learning_rate": 0.00019941121312379116, + "loss": 1.5323, + "step": 1070 + }, + { + "epoch": 0.027500276832268532, + "grad_norm": 0.96484375, + "learning_rate": 0.00019941072930071006, + "loss": 1.5316, + "step": 1071 + }, + { + "epoch": 0.02752595402819035, + "grad_norm": 1.0, + "learning_rate": 0.00019941024527951232, + "loss": 1.4795, + "step": 1072 + }, + { + "epoch": 0.02755163122411217, + "grad_norm": 1.078125, + "learning_rate": 0.00019940976106019894, + "loss": 1.5218, + "step": 1073 + }, + { + "epoch": 0.02757730842003399, + "grad_norm": 1.734375, + "learning_rate": 0.0001994092766427709, + "loss": 1.279, + "step": 1074 + }, + { + "epoch": 0.02760298561595581, + "grad_norm": 0.984375, + "learning_rate": 0.00019940879202722913, + "loss": 1.5853, + "step": 1075 + }, + { + "epoch": 0.02762866281187763, + "grad_norm": 1.078125, + "learning_rate": 0.0001994083072135746, + "loss": 1.4069, + "step": 1076 + }, + { + "epoch": 0.027654340007799448, + "grad_norm": 1.0625, + "learning_rate": 0.00019940782220180837, + "loss": 1.5102, + "step": 1077 + }, + { + "epoch": 0.027680017203721267, + "grad_norm": 1.0, + "learning_rate": 0.0001994073369919313, + "loss": 1.4527, + "step": 1078 + }, + { + "epoch": 0.027705694399643085, + "grad_norm": 1.015625, + "learning_rate": 0.00019940685158394432, + "loss": 1.5117, + "step": 1079 + }, + { + "epoch": 0.027731371595564908, + "grad_norm": 1.03125, + "learning_rate": 0.00019940636597784854, + "loss": 1.396, + "step": 1080 + }, + { + "epoch": 0.027757048791486726, + "grad_norm": 1.0546875, + "learning_rate": 0.00019940588017364482, + "loss": 1.5491, + "step": 1081 + }, + { + "epoch": 0.027782725987408545, + "grad_norm": 1.0703125, + "learning_rate": 0.00019940539417133418, + "loss": 1.3757, + "step": 1082 + }, + { + "epoch": 0.027808403183330364, + "grad_norm": 0.98046875, + "learning_rate": 0.00019940490797091753, + "loss": 1.3879, + "step": 1083 + }, + { + "epoch": 0.027834080379252182, + "grad_norm": 0.99609375, + "learning_rate": 0.0001994044215723959, + "loss": 1.4296, + "step": 1084 + }, + { + "epoch": 0.027859757575174005, + "grad_norm": 0.9453125, + "learning_rate": 0.00019940393497577024, + "loss": 1.352, + "step": 1085 + }, + { + "epoch": 0.027885434771095823, + "grad_norm": 1.1015625, + "learning_rate": 0.00019940344818104153, + "loss": 1.391, + "step": 1086 + }, + { + "epoch": 0.027911111967017642, + "grad_norm": 1.0546875, + "learning_rate": 0.0001994029611882107, + "loss": 1.4751, + "step": 1087 + }, + { + "epoch": 0.02793678916293946, + "grad_norm": 0.953125, + "learning_rate": 0.00019940247399727876, + "loss": 1.465, + "step": 1088 + }, + { + "epoch": 0.02796246635886128, + "grad_norm": 1.046875, + "learning_rate": 0.00019940198660824666, + "loss": 1.3967, + "step": 1089 + }, + { + "epoch": 0.0279881435547831, + "grad_norm": 1.0546875, + "learning_rate": 0.0001994014990211154, + "loss": 1.4257, + "step": 1090 + }, + { + "epoch": 0.02801382075070492, + "grad_norm": 0.96484375, + "learning_rate": 0.0001994010112358859, + "loss": 1.4247, + "step": 1091 + }, + { + "epoch": 0.02803949794662674, + "grad_norm": 1.0078125, + "learning_rate": 0.0001994005232525592, + "loss": 1.5331, + "step": 1092 + }, + { + "epoch": 0.028065175142548558, + "grad_norm": 0.9140625, + "learning_rate": 0.0001994000350711362, + "loss": 1.411, + "step": 1093 + }, + { + "epoch": 0.028090852338470376, + "grad_norm": 1.0234375, + "learning_rate": 0.00019939954669161795, + "loss": 1.3371, + "step": 1094 + }, + { + "epoch": 0.0281165295343922, + "grad_norm": 1.0625, + "learning_rate": 0.00019939905811400537, + "loss": 1.5818, + "step": 1095 + }, + { + "epoch": 0.028142206730314017, + "grad_norm": 1.0390625, + "learning_rate": 0.00019939856933829942, + "loss": 1.3699, + "step": 1096 + }, + { + "epoch": 0.028167883926235836, + "grad_norm": 1.0859375, + "learning_rate": 0.00019939808036450112, + "loss": 1.4681, + "step": 1097 + }, + { + "epoch": 0.028193561122157654, + "grad_norm": 1.0234375, + "learning_rate": 0.00019939759119261141, + "loss": 1.4487, + "step": 1098 + }, + { + "epoch": 0.028219238318079473, + "grad_norm": 1.1328125, + "learning_rate": 0.0001993971018226313, + "loss": 1.4987, + "step": 1099 + }, + { + "epoch": 0.028244915514001292, + "grad_norm": 1.109375, + "learning_rate": 0.00019939661225456173, + "loss": 1.5479, + "step": 1100 + }, + { + "epoch": 0.028270592709923114, + "grad_norm": 1.0390625, + "learning_rate": 0.00019939612248840368, + "loss": 1.3609, + "step": 1101 + }, + { + "epoch": 0.028296269905844933, + "grad_norm": 1.046875, + "learning_rate": 0.00019939563252415816, + "loss": 1.4547, + "step": 1102 + }, + { + "epoch": 0.02832194710176675, + "grad_norm": 0.94140625, + "learning_rate": 0.0001993951423618261, + "loss": 1.5048, + "step": 1103 + }, + { + "epoch": 0.02834762429768857, + "grad_norm": 0.91015625, + "learning_rate": 0.00019939465200140855, + "loss": 1.4501, + "step": 1104 + }, + { + "epoch": 0.02837330149361039, + "grad_norm": 1.0546875, + "learning_rate": 0.00019939416144290636, + "loss": 1.6549, + "step": 1105 + }, + { + "epoch": 0.02839897868953221, + "grad_norm": 0.9921875, + "learning_rate": 0.00019939367068632066, + "loss": 1.3473, + "step": 1106 + }, + { + "epoch": 0.02842465588545403, + "grad_norm": 1.0546875, + "learning_rate": 0.0001993931797316523, + "loss": 1.3497, + "step": 1107 + }, + { + "epoch": 0.02845033308137585, + "grad_norm": 0.97265625, + "learning_rate": 0.00019939268857890232, + "loss": 1.5023, + "step": 1108 + }, + { + "epoch": 0.028476010277297667, + "grad_norm": 0.9765625, + "learning_rate": 0.0001993921972280717, + "loss": 1.4863, + "step": 1109 + }, + { + "epoch": 0.028501687473219486, + "grad_norm": 1.0234375, + "learning_rate": 0.00019939170567916142, + "loss": 1.4593, + "step": 1110 + }, + { + "epoch": 0.028527364669141308, + "grad_norm": 1.0625, + "learning_rate": 0.00019939121393217245, + "loss": 1.5566, + "step": 1111 + }, + { + "epoch": 0.028553041865063127, + "grad_norm": 1.28125, + "learning_rate": 0.0001993907219871057, + "loss": 1.3723, + "step": 1112 + }, + { + "epoch": 0.028578719060984945, + "grad_norm": 1.0078125, + "learning_rate": 0.0001993902298439623, + "loss": 1.5394, + "step": 1113 + }, + { + "epoch": 0.028604396256906764, + "grad_norm": 0.96484375, + "learning_rate": 0.00019938973750274313, + "loss": 1.3245, + "step": 1114 + }, + { + "epoch": 0.028630073452828583, + "grad_norm": 0.9375, + "learning_rate": 0.00019938924496344917, + "loss": 1.5583, + "step": 1115 + }, + { + "epoch": 0.028655750648750405, + "grad_norm": 1.0390625, + "learning_rate": 0.00019938875222608144, + "loss": 1.4821, + "step": 1116 + }, + { + "epoch": 0.028681427844672223, + "grad_norm": 0.9921875, + "learning_rate": 0.0001993882592906409, + "loss": 1.406, + "step": 1117 + }, + { + "epoch": 0.028707105040594042, + "grad_norm": 1.0078125, + "learning_rate": 0.00019938776615712852, + "loss": 1.4598, + "step": 1118 + }, + { + "epoch": 0.02873278223651586, + "grad_norm": 0.9765625, + "learning_rate": 0.00019938727282554534, + "loss": 1.5509, + "step": 1119 + }, + { + "epoch": 0.02875845943243768, + "grad_norm": 0.91015625, + "learning_rate": 0.00019938677929589227, + "loss": 1.4516, + "step": 1120 + }, + { + "epoch": 0.0287841366283595, + "grad_norm": 0.98046875, + "learning_rate": 0.00019938628556817034, + "loss": 1.3546, + "step": 1121 + }, + { + "epoch": 0.02880981382428132, + "grad_norm": 0.9296875, + "learning_rate": 0.0001993857916423805, + "loss": 1.3877, + "step": 1122 + }, + { + "epoch": 0.02883549102020314, + "grad_norm": 0.98046875, + "learning_rate": 0.0001993852975185238, + "loss": 1.2831, + "step": 1123 + }, + { + "epoch": 0.028861168216124958, + "grad_norm": 1.0546875, + "learning_rate": 0.00019938480319660112, + "loss": 1.5056, + "step": 1124 + }, + { + "epoch": 0.028886845412046776, + "grad_norm": 0.99609375, + "learning_rate": 0.00019938430867661355, + "loss": 1.3605, + "step": 1125 + }, + { + "epoch": 0.028912522607968595, + "grad_norm": 0.98828125, + "learning_rate": 0.00019938381395856204, + "loss": 1.4964, + "step": 1126 + }, + { + "epoch": 0.028938199803890417, + "grad_norm": 0.9921875, + "learning_rate": 0.00019938331904244754, + "loss": 1.499, + "step": 1127 + }, + { + "epoch": 0.028963876999812236, + "grad_norm": 0.99609375, + "learning_rate": 0.00019938282392827102, + "loss": 1.536, + "step": 1128 + }, + { + "epoch": 0.028989554195734055, + "grad_norm": 0.91015625, + "learning_rate": 0.0001993823286160336, + "loss": 1.4503, + "step": 1129 + }, + { + "epoch": 0.029015231391655873, + "grad_norm": 0.98046875, + "learning_rate": 0.0001993818331057361, + "loss": 1.5386, + "step": 1130 + }, + { + "epoch": 0.029040908587577692, + "grad_norm": 1.0, + "learning_rate": 0.00019938133739737964, + "loss": 1.4607, + "step": 1131 + }, + { + "epoch": 0.029066585783499514, + "grad_norm": 1.1171875, + "learning_rate": 0.00019938084149096513, + "loss": 1.2683, + "step": 1132 + }, + { + "epoch": 0.029092262979421333, + "grad_norm": 0.92578125, + "learning_rate": 0.00019938034538649355, + "loss": 1.4044, + "step": 1133 + }, + { + "epoch": 0.02911794017534315, + "grad_norm": 0.93359375, + "learning_rate": 0.00019937984908396594, + "loss": 1.5974, + "step": 1134 + }, + { + "epoch": 0.02914361737126497, + "grad_norm": 0.97265625, + "learning_rate": 0.00019937935258338327, + "loss": 1.3665, + "step": 1135 + }, + { + "epoch": 0.02916929456718679, + "grad_norm": 0.98828125, + "learning_rate": 0.0001993788558847465, + "loss": 1.4654, + "step": 1136 + }, + { + "epoch": 0.02919497176310861, + "grad_norm": 1.0546875, + "learning_rate": 0.0001993783589880567, + "loss": 1.2685, + "step": 1137 + }, + { + "epoch": 0.02922064895903043, + "grad_norm": 0.96484375, + "learning_rate": 0.0001993778618933148, + "loss": 1.401, + "step": 1138 + }, + { + "epoch": 0.02924632615495225, + "grad_norm": 1.0234375, + "learning_rate": 0.00019937736460052177, + "loss": 1.3639, + "step": 1139 + }, + { + "epoch": 0.029272003350874067, + "grad_norm": 1.046875, + "learning_rate": 0.00019937686710967865, + "loss": 1.498, + "step": 1140 + }, + { + "epoch": 0.029297680546795886, + "grad_norm": 1.0625, + "learning_rate": 0.00019937636942078638, + "loss": 1.5416, + "step": 1141 + }, + { + "epoch": 0.029323357742717708, + "grad_norm": 1.0625, + "learning_rate": 0.00019937587153384602, + "loss": 1.6077, + "step": 1142 + }, + { + "epoch": 0.029349034938639527, + "grad_norm": 1.0390625, + "learning_rate": 0.00019937537344885852, + "loss": 1.4465, + "step": 1143 + }, + { + "epoch": 0.029374712134561345, + "grad_norm": 1.1015625, + "learning_rate": 0.00019937487516582484, + "loss": 1.5702, + "step": 1144 + }, + { + "epoch": 0.029400389330483164, + "grad_norm": 0.9765625, + "learning_rate": 0.00019937437668474602, + "loss": 1.2581, + "step": 1145 + }, + { + "epoch": 0.029426066526404983, + "grad_norm": 1.0234375, + "learning_rate": 0.00019937387800562307, + "loss": 1.5457, + "step": 1146 + }, + { + "epoch": 0.029451743722326805, + "grad_norm": 0.92578125, + "learning_rate": 0.00019937337912845696, + "loss": 1.5024, + "step": 1147 + }, + { + "epoch": 0.029477420918248624, + "grad_norm": 0.9921875, + "learning_rate": 0.00019937288005324866, + "loss": 1.3323, + "step": 1148 + }, + { + "epoch": 0.029503098114170442, + "grad_norm": 0.9765625, + "learning_rate": 0.0001993723807799992, + "loss": 1.5063, + "step": 1149 + }, + { + "epoch": 0.02952877531009226, + "grad_norm": 0.93359375, + "learning_rate": 0.00019937188130870955, + "loss": 1.4807, + "step": 1150 + }, + { + "epoch": 0.02955445250601408, + "grad_norm": 0.93359375, + "learning_rate": 0.00019937138163938074, + "loss": 1.4422, + "step": 1151 + }, + { + "epoch": 0.0295801297019359, + "grad_norm": 0.98046875, + "learning_rate": 0.0001993708817720137, + "loss": 1.6101, + "step": 1152 + }, + { + "epoch": 0.02960580689785772, + "grad_norm": 1.0, + "learning_rate": 0.0001993703817066095, + "loss": 1.2915, + "step": 1153 + }, + { + "epoch": 0.02963148409377954, + "grad_norm": 0.9375, + "learning_rate": 0.0001993698814431691, + "loss": 1.4729, + "step": 1154 + }, + { + "epoch": 0.029657161289701358, + "grad_norm": 1.1171875, + "learning_rate": 0.00019936938098169348, + "loss": 1.3154, + "step": 1155 + }, + { + "epoch": 0.029682838485623177, + "grad_norm": 1.0078125, + "learning_rate": 0.0001993688803221837, + "loss": 1.1856, + "step": 1156 + }, + { + "epoch": 0.029708515681544995, + "grad_norm": 1.0390625, + "learning_rate": 0.00019936837946464068, + "loss": 1.4448, + "step": 1157 + }, + { + "epoch": 0.029734192877466818, + "grad_norm": 0.95703125, + "learning_rate": 0.00019936787840906547, + "loss": 1.5734, + "step": 1158 + }, + { + "epoch": 0.029759870073388636, + "grad_norm": 1.5234375, + "learning_rate": 0.0001993673771554591, + "loss": 1.5457, + "step": 1159 + }, + { + "epoch": 0.029785547269310455, + "grad_norm": 0.96484375, + "learning_rate": 0.00019936687570382247, + "loss": 1.5326, + "step": 1160 + }, + { + "epoch": 0.029811224465232274, + "grad_norm": 0.96875, + "learning_rate": 0.00019936637405415663, + "loss": 1.4062, + "step": 1161 + }, + { + "epoch": 0.029836901661154092, + "grad_norm": 1.015625, + "learning_rate": 0.00019936587220646259, + "loss": 1.3567, + "step": 1162 + }, + { + "epoch": 0.029862578857075914, + "grad_norm": 0.9921875, + "learning_rate": 0.00019936537016074137, + "loss": 1.3598, + "step": 1163 + }, + { + "epoch": 0.029888256052997733, + "grad_norm": 1.828125, + "learning_rate": 0.00019936486791699391, + "loss": 1.3307, + "step": 1164 + }, + { + "epoch": 0.029913933248919552, + "grad_norm": 0.984375, + "learning_rate": 0.00019936436547522126, + "loss": 1.4853, + "step": 1165 + }, + { + "epoch": 0.02993961044484137, + "grad_norm": 1.0, + "learning_rate": 0.0001993638628354244, + "loss": 1.3955, + "step": 1166 + }, + { + "epoch": 0.02996528764076319, + "grad_norm": 0.91796875, + "learning_rate": 0.00019936335999760433, + "loss": 1.2615, + "step": 1167 + }, + { + "epoch": 0.02999096483668501, + "grad_norm": 1.0390625, + "learning_rate": 0.0001993628569617621, + "loss": 1.4572, + "step": 1168 + }, + { + "epoch": 0.03001664203260683, + "grad_norm": 0.9453125, + "learning_rate": 0.0001993623537278986, + "loss": 1.3467, + "step": 1169 + }, + { + "epoch": 0.03004231922852865, + "grad_norm": 0.97265625, + "learning_rate": 0.00019936185029601494, + "loss": 1.551, + "step": 1170 + }, + { + "epoch": 0.030067996424450467, + "grad_norm": 0.92578125, + "learning_rate": 0.0001993613466661121, + "loss": 1.5532, + "step": 1171 + }, + { + "epoch": 0.030093673620372286, + "grad_norm": 0.92578125, + "learning_rate": 0.00019936084283819105, + "loss": 1.3497, + "step": 1172 + }, + { + "epoch": 0.03011935081629411, + "grad_norm": 1.0390625, + "learning_rate": 0.00019936033881225285, + "loss": 1.5198, + "step": 1173 + }, + { + "epoch": 0.030145028012215927, + "grad_norm": 1.0078125, + "learning_rate": 0.00019935983458829843, + "loss": 1.4712, + "step": 1174 + }, + { + "epoch": 0.030170705208137746, + "grad_norm": 1.15625, + "learning_rate": 0.00019935933016632887, + "loss": 1.4757, + "step": 1175 + }, + { + "epoch": 0.030196382404059564, + "grad_norm": 0.92578125, + "learning_rate": 0.0001993588255463451, + "loss": 1.3571, + "step": 1176 + }, + { + "epoch": 0.030222059599981383, + "grad_norm": 1.0546875, + "learning_rate": 0.00019935832072834822, + "loss": 1.4206, + "step": 1177 + }, + { + "epoch": 0.030247736795903202, + "grad_norm": 1.2890625, + "learning_rate": 0.00019935781571233911, + "loss": 1.4748, + "step": 1178 + }, + { + "epoch": 0.030273413991825024, + "grad_norm": 0.97265625, + "learning_rate": 0.0001993573104983189, + "loss": 1.5958, + "step": 1179 + }, + { + "epoch": 0.030299091187746843, + "grad_norm": 1.015625, + "learning_rate": 0.00019935680508628852, + "loss": 1.3775, + "step": 1180 + }, + { + "epoch": 0.03032476838366866, + "grad_norm": 0.921875, + "learning_rate": 0.00019935629947624904, + "loss": 1.3504, + "step": 1181 + }, + { + "epoch": 0.03035044557959048, + "grad_norm": 0.95703125, + "learning_rate": 0.00019935579366820138, + "loss": 1.6669, + "step": 1182 + }, + { + "epoch": 0.0303761227755123, + "grad_norm": 0.96875, + "learning_rate": 0.00019935528766214664, + "loss": 1.347, + "step": 1183 + }, + { + "epoch": 0.03040179997143412, + "grad_norm": 1.078125, + "learning_rate": 0.00019935478145808577, + "loss": 1.3578, + "step": 1184 + }, + { + "epoch": 0.03042747716735594, + "grad_norm": 0.9609375, + "learning_rate": 0.0001993542750560198, + "loss": 1.2549, + "step": 1185 + }, + { + "epoch": 0.030453154363277758, + "grad_norm": 0.95703125, + "learning_rate": 0.0001993537684559497, + "loss": 1.3755, + "step": 1186 + }, + { + "epoch": 0.030478831559199577, + "grad_norm": 1.0234375, + "learning_rate": 0.00019935326165787656, + "loss": 1.3527, + "step": 1187 + }, + { + "epoch": 0.030504508755121396, + "grad_norm": 0.984375, + "learning_rate": 0.00019935275466180134, + "loss": 1.3499, + "step": 1188 + }, + { + "epoch": 0.030530185951043218, + "grad_norm": 0.9296875, + "learning_rate": 0.00019935224746772502, + "loss": 1.3158, + "step": 1189 + }, + { + "epoch": 0.030555863146965036, + "grad_norm": 0.9921875, + "learning_rate": 0.00019935174007564867, + "loss": 1.3525, + "step": 1190 + }, + { + "epoch": 0.030581540342886855, + "grad_norm": 1.0703125, + "learning_rate": 0.00019935123248557328, + "loss": 1.3885, + "step": 1191 + }, + { + "epoch": 0.030607217538808674, + "grad_norm": 0.91796875, + "learning_rate": 0.00019935072469749984, + "loss": 1.3659, + "step": 1192 + }, + { + "epoch": 0.030632894734730493, + "grad_norm": 1.0, + "learning_rate": 0.0001993502167114294, + "loss": 1.3813, + "step": 1193 + }, + { + "epoch": 0.030658571930652315, + "grad_norm": 1.0703125, + "learning_rate": 0.00019934970852736295, + "loss": 1.3151, + "step": 1194 + }, + { + "epoch": 0.030684249126574133, + "grad_norm": 1.015625, + "learning_rate": 0.0001993492001453015, + "loss": 1.526, + "step": 1195 + }, + { + "epoch": 0.030709926322495952, + "grad_norm": 1.0625, + "learning_rate": 0.00019934869156524606, + "loss": 1.4802, + "step": 1196 + }, + { + "epoch": 0.03073560351841777, + "grad_norm": 0.96875, + "learning_rate": 0.00019934818278719764, + "loss": 1.4433, + "step": 1197 + }, + { + "epoch": 0.03076128071433959, + "grad_norm": 0.94921875, + "learning_rate": 0.00019934767381115732, + "loss": 1.3783, + "step": 1198 + }, + { + "epoch": 0.03078695791026141, + "grad_norm": 1.1875, + "learning_rate": 0.000199347164637126, + "loss": 1.3718, + "step": 1199 + }, + { + "epoch": 0.03081263510618323, + "grad_norm": 1.046875, + "learning_rate": 0.00019934665526510478, + "loss": 1.6825, + "step": 1200 + }, + { + "epoch": 0.03083831230210505, + "grad_norm": 1.0234375, + "learning_rate": 0.00019934614569509465, + "loss": 1.3985, + "step": 1201 + }, + { + "epoch": 0.030863989498026868, + "grad_norm": 0.9765625, + "learning_rate": 0.00019934563592709662, + "loss": 1.2761, + "step": 1202 + }, + { + "epoch": 0.030889666693948686, + "grad_norm": 1.03125, + "learning_rate": 0.00019934512596111174, + "loss": 1.447, + "step": 1203 + }, + { + "epoch": 0.030915343889870505, + "grad_norm": 0.91015625, + "learning_rate": 0.00019934461579714093, + "loss": 1.2719, + "step": 1204 + }, + { + "epoch": 0.030941021085792327, + "grad_norm": 1.171875, + "learning_rate": 0.00019934410543518536, + "loss": 1.4181, + "step": 1205 + }, + { + "epoch": 0.030966698281714146, + "grad_norm": 1.1875, + "learning_rate": 0.00019934359487524592, + "loss": 1.4599, + "step": 1206 + }, + { + "epoch": 0.030992375477635965, + "grad_norm": 0.94140625, + "learning_rate": 0.00019934308411732364, + "loss": 1.4054, + "step": 1207 + }, + { + "epoch": 0.031018052673557783, + "grad_norm": 1.0703125, + "learning_rate": 0.00019934257316141961, + "loss": 1.411, + "step": 1208 + }, + { + "epoch": 0.031043729869479602, + "grad_norm": 1.0234375, + "learning_rate": 0.0001993420620075348, + "loss": 1.3816, + "step": 1209 + }, + { + "epoch": 0.031069407065401424, + "grad_norm": 0.96484375, + "learning_rate": 0.00019934155065567022, + "loss": 1.6254, + "step": 1210 + }, + { + "epoch": 0.031095084261323243, + "grad_norm": 1.1015625, + "learning_rate": 0.0001993410391058269, + "loss": 1.5864, + "step": 1211 + }, + { + "epoch": 0.03112076145724506, + "grad_norm": 1.078125, + "learning_rate": 0.00019934052735800587, + "loss": 1.6024, + "step": 1212 + }, + { + "epoch": 0.03114643865316688, + "grad_norm": 1.1484375, + "learning_rate": 0.00019934001541220816, + "loss": 1.614, + "step": 1213 + }, + { + "epoch": 0.0311721158490887, + "grad_norm": 1.03125, + "learning_rate": 0.00019933950326843472, + "loss": 1.4872, + "step": 1214 + }, + { + "epoch": 0.03119779304501052, + "grad_norm": 0.97265625, + "learning_rate": 0.00019933899092668667, + "loss": 1.4049, + "step": 1215 + }, + { + "epoch": 0.03122347024093234, + "grad_norm": 0.97265625, + "learning_rate": 0.000199338478386965, + "loss": 1.3268, + "step": 1216 + }, + { + "epoch": 0.03124914743685416, + "grad_norm": 0.9765625, + "learning_rate": 0.00019933796564927068, + "loss": 1.366, + "step": 1217 + }, + { + "epoch": 0.03127482463277598, + "grad_norm": 0.984375, + "learning_rate": 0.00019933745271360478, + "loss": 1.3945, + "step": 1218 + }, + { + "epoch": 0.0313005018286978, + "grad_norm": 0.94140625, + "learning_rate": 0.00019933693957996832, + "loss": 1.437, + "step": 1219 + }, + { + "epoch": 0.031326179024619615, + "grad_norm": 0.91796875, + "learning_rate": 0.00019933642624836228, + "loss": 1.3241, + "step": 1220 + }, + { + "epoch": 0.03135185622054144, + "grad_norm": 1.09375, + "learning_rate": 0.00019933591271878776, + "loss": 1.7029, + "step": 1221 + }, + { + "epoch": 0.03137753341646325, + "grad_norm": 1.0, + "learning_rate": 0.0001993353989912457, + "loss": 1.3072, + "step": 1222 + }, + { + "epoch": 0.031403210612385074, + "grad_norm": 1.015625, + "learning_rate": 0.00019933488506573716, + "loss": 1.4956, + "step": 1223 + }, + { + "epoch": 0.031428887808306896, + "grad_norm": 0.94921875, + "learning_rate": 0.0001993343709422632, + "loss": 1.3802, + "step": 1224 + }, + { + "epoch": 0.03145456500422871, + "grad_norm": 0.96484375, + "learning_rate": 0.0001993338566208248, + "loss": 1.4746, + "step": 1225 + }, + { + "epoch": 0.031480242200150534, + "grad_norm": 0.984375, + "learning_rate": 0.00019933334210142298, + "loss": 1.34, + "step": 1226 + }, + { + "epoch": 0.03150591939607235, + "grad_norm": 0.94140625, + "learning_rate": 0.00019933282738405882, + "loss": 1.4646, + "step": 1227 + }, + { + "epoch": 0.03153159659199417, + "grad_norm": 1.1015625, + "learning_rate": 0.00019933231246873324, + "loss": 1.3813, + "step": 1228 + }, + { + "epoch": 0.03155727378791599, + "grad_norm": 1.0390625, + "learning_rate": 0.00019933179735544737, + "loss": 1.5668, + "step": 1229 + }, + { + "epoch": 0.03158295098383781, + "grad_norm": 0.96484375, + "learning_rate": 0.00019933128204420222, + "loss": 1.3167, + "step": 1230 + }, + { + "epoch": 0.03160862817975963, + "grad_norm": 0.9765625, + "learning_rate": 0.00019933076653499878, + "loss": 1.5303, + "step": 1231 + }, + { + "epoch": 0.031634305375681446, + "grad_norm": 0.96875, + "learning_rate": 0.0001993302508278381, + "loss": 1.4316, + "step": 1232 + }, + { + "epoch": 0.03165998257160327, + "grad_norm": 0.91796875, + "learning_rate": 0.0001993297349227212, + "loss": 1.418, + "step": 1233 + }, + { + "epoch": 0.03168565976752509, + "grad_norm": 1.0078125, + "learning_rate": 0.00019932921881964913, + "loss": 1.415, + "step": 1234 + }, + { + "epoch": 0.031711336963446905, + "grad_norm": 0.91015625, + "learning_rate": 0.0001993287025186229, + "loss": 1.298, + "step": 1235 + }, + { + "epoch": 0.03173701415936873, + "grad_norm": 1.0625, + "learning_rate": 0.00019932818601964348, + "loss": 1.6523, + "step": 1236 + }, + { + "epoch": 0.03176269135529054, + "grad_norm": 1.09375, + "learning_rate": 0.000199327669322712, + "loss": 1.3831, + "step": 1237 + }, + { + "epoch": 0.031788368551212365, + "grad_norm": 1.0390625, + "learning_rate": 0.00019932715242782946, + "loss": 1.3538, + "step": 1238 + }, + { + "epoch": 0.03181404574713419, + "grad_norm": 1.0546875, + "learning_rate": 0.00019932663533499685, + "loss": 1.4682, + "step": 1239 + }, + { + "epoch": 0.031839722943056, + "grad_norm": 1.140625, + "learning_rate": 0.00019932611804421524, + "loss": 1.486, + "step": 1240 + }, + { + "epoch": 0.031865400138977824, + "grad_norm": 0.9765625, + "learning_rate": 0.00019932560055548564, + "loss": 1.5256, + "step": 1241 + }, + { + "epoch": 0.03189107733489964, + "grad_norm": 0.98828125, + "learning_rate": 0.0001993250828688091, + "loss": 1.4685, + "step": 1242 + }, + { + "epoch": 0.03191675453082146, + "grad_norm": 0.9140625, + "learning_rate": 0.00019932456498418662, + "loss": 1.4535, + "step": 1243 + }, + { + "epoch": 0.031942431726743284, + "grad_norm": 1.0078125, + "learning_rate": 0.00019932404690161927, + "loss": 1.4444, + "step": 1244 + }, + { + "epoch": 0.0319681089226651, + "grad_norm": 0.9609375, + "learning_rate": 0.0001993235286211081, + "loss": 1.3452, + "step": 1245 + }, + { + "epoch": 0.03199378611858692, + "grad_norm": 0.9140625, + "learning_rate": 0.00019932301014265405, + "loss": 1.3219, + "step": 1246 + }, + { + "epoch": 0.032019463314508737, + "grad_norm": 0.95703125, + "learning_rate": 0.00019932249146625825, + "loss": 1.527, + "step": 1247 + }, + { + "epoch": 0.03204514051043056, + "grad_norm": 0.9375, + "learning_rate": 0.00019932197259192168, + "loss": 1.3103, + "step": 1248 + }, + { + "epoch": 0.03207081770635238, + "grad_norm": 0.9765625, + "learning_rate": 0.00019932145351964542, + "loss": 1.5292, + "step": 1249 + }, + { + "epoch": 0.032096494902274196, + "grad_norm": 1.0, + "learning_rate": 0.00019932093424943043, + "loss": 1.5054, + "step": 1250 + }, + { + "epoch": 0.03212217209819602, + "grad_norm": 0.921875, + "learning_rate": 0.00019932041478127783, + "loss": 1.4028, + "step": 1251 + }, + { + "epoch": 0.03214784929411783, + "grad_norm": 0.90234375, + "learning_rate": 0.0001993198951151886, + "loss": 1.395, + "step": 1252 + }, + { + "epoch": 0.032173526490039656, + "grad_norm": 0.953125, + "learning_rate": 0.00019931937525116377, + "loss": 1.3956, + "step": 1253 + }, + { + "epoch": 0.03219920368596148, + "grad_norm": 1.0, + "learning_rate": 0.00019931885518920442, + "loss": 1.461, + "step": 1254 + }, + { + "epoch": 0.03222488088188329, + "grad_norm": 0.9921875, + "learning_rate": 0.00019931833492931156, + "loss": 1.5444, + "step": 1255 + }, + { + "epoch": 0.032250558077805115, + "grad_norm": 1.046875, + "learning_rate": 0.00019931781447148623, + "loss": 1.519, + "step": 1256 + }, + { + "epoch": 0.03227623527372693, + "grad_norm": 1.046875, + "learning_rate": 0.00019931729381572947, + "loss": 1.3063, + "step": 1257 + }, + { + "epoch": 0.03230191246964875, + "grad_norm": 1.0703125, + "learning_rate": 0.0001993167729620423, + "loss": 1.4971, + "step": 1258 + }, + { + "epoch": 0.032327589665570575, + "grad_norm": 0.91796875, + "learning_rate": 0.0001993162519104258, + "loss": 1.3789, + "step": 1259 + }, + { + "epoch": 0.03235326686149239, + "grad_norm": 0.9375, + "learning_rate": 0.00019931573066088096, + "loss": 1.3854, + "step": 1260 + }, + { + "epoch": 0.03237894405741421, + "grad_norm": 0.98828125, + "learning_rate": 0.00019931520921340882, + "loss": 1.2787, + "step": 1261 + }, + { + "epoch": 0.03240462125333603, + "grad_norm": 0.9921875, + "learning_rate": 0.00019931468756801047, + "loss": 1.3239, + "step": 1262 + }, + { + "epoch": 0.03243029844925785, + "grad_norm": 1.0234375, + "learning_rate": 0.0001993141657246869, + "loss": 1.3758, + "step": 1263 + }, + { + "epoch": 0.03245597564517967, + "grad_norm": 1.9375, + "learning_rate": 0.0001993136436834392, + "loss": 1.5797, + "step": 1264 + }, + { + "epoch": 0.03248165284110149, + "grad_norm": 1.0078125, + "learning_rate": 0.00019931312144426836, + "loss": 1.4412, + "step": 1265 + }, + { + "epoch": 0.03250733003702331, + "grad_norm": 0.90625, + "learning_rate": 0.00019931259900717545, + "loss": 1.3212, + "step": 1266 + }, + { + "epoch": 0.032533007232945124, + "grad_norm": 0.92578125, + "learning_rate": 0.00019931207637216146, + "loss": 1.4087, + "step": 1267 + }, + { + "epoch": 0.032558684428866946, + "grad_norm": 0.91015625, + "learning_rate": 0.00019931155353922753, + "loss": 1.3294, + "step": 1268 + }, + { + "epoch": 0.03258436162478876, + "grad_norm": 0.98046875, + "learning_rate": 0.0001993110305083746, + "loss": 1.5901, + "step": 1269 + }, + { + "epoch": 0.032610038820710584, + "grad_norm": 0.98828125, + "learning_rate": 0.00019931050727960378, + "loss": 1.4147, + "step": 1270 + }, + { + "epoch": 0.032635716016632406, + "grad_norm": 0.94921875, + "learning_rate": 0.00019930998385291607, + "loss": 1.4479, + "step": 1271 + }, + { + "epoch": 0.03266139321255422, + "grad_norm": 0.984375, + "learning_rate": 0.00019930946022831257, + "loss": 1.4087, + "step": 1272 + }, + { + "epoch": 0.03268707040847604, + "grad_norm": 0.9765625, + "learning_rate": 0.00019930893640579425, + "loss": 1.4766, + "step": 1273 + }, + { + "epoch": 0.03271274760439786, + "grad_norm": 1.0390625, + "learning_rate": 0.00019930841238536222, + "loss": 1.6565, + "step": 1274 + }, + { + "epoch": 0.03273842480031968, + "grad_norm": 0.9140625, + "learning_rate": 0.00019930788816701747, + "loss": 1.5196, + "step": 1275 + }, + { + "epoch": 0.0327641019962415, + "grad_norm": 0.96875, + "learning_rate": 0.00019930736375076106, + "loss": 1.4316, + "step": 1276 + }, + { + "epoch": 0.03278977919216332, + "grad_norm": 0.9375, + "learning_rate": 0.00019930683913659407, + "loss": 1.4696, + "step": 1277 + }, + { + "epoch": 0.03281545638808514, + "grad_norm": 0.98046875, + "learning_rate": 0.0001993063143245175, + "loss": 1.3563, + "step": 1278 + }, + { + "epoch": 0.032841133584006955, + "grad_norm": 0.97265625, + "learning_rate": 0.00019930578931453243, + "loss": 1.3368, + "step": 1279 + }, + { + "epoch": 0.03286681077992878, + "grad_norm": 1.09375, + "learning_rate": 0.0001993052641066399, + "loss": 1.49, + "step": 1280 + }, + { + "epoch": 0.0328924879758506, + "grad_norm": 1.0234375, + "learning_rate": 0.00019930473870084095, + "loss": 1.2795, + "step": 1281 + }, + { + "epoch": 0.032918165171772415, + "grad_norm": 0.9140625, + "learning_rate": 0.00019930421309713662, + "loss": 1.4165, + "step": 1282 + }, + { + "epoch": 0.03294384236769424, + "grad_norm": 0.921875, + "learning_rate": 0.00019930368729552793, + "loss": 1.46, + "step": 1283 + }, + { + "epoch": 0.03296951956361605, + "grad_norm": 1.0234375, + "learning_rate": 0.000199303161296016, + "loss": 1.4022, + "step": 1284 + }, + { + "epoch": 0.032995196759537875, + "grad_norm": 0.9765625, + "learning_rate": 0.00019930263509860183, + "loss": 1.3569, + "step": 1285 + }, + { + "epoch": 0.0330208739554597, + "grad_norm": 0.96484375, + "learning_rate": 0.00019930210870328647, + "loss": 1.362, + "step": 1286 + }, + { + "epoch": 0.03304655115138151, + "grad_norm": 0.99609375, + "learning_rate": 0.00019930158211007098, + "loss": 1.2735, + "step": 1287 + }, + { + "epoch": 0.033072228347303334, + "grad_norm": 1.0703125, + "learning_rate": 0.00019930105531895643, + "loss": 1.4294, + "step": 1288 + }, + { + "epoch": 0.03309790554322515, + "grad_norm": 0.91015625, + "learning_rate": 0.00019930052832994383, + "loss": 1.4341, + "step": 1289 + }, + { + "epoch": 0.03312358273914697, + "grad_norm": 0.96875, + "learning_rate": 0.00019930000114303425, + "loss": 1.5677, + "step": 1290 + }, + { + "epoch": 0.033149259935068794, + "grad_norm": 0.921875, + "learning_rate": 0.00019929947375822872, + "loss": 1.387, + "step": 1291 + }, + { + "epoch": 0.03317493713099061, + "grad_norm": 0.9296875, + "learning_rate": 0.00019929894617552832, + "loss": 1.3053, + "step": 1292 + }, + { + "epoch": 0.03320061432691243, + "grad_norm": 0.94140625, + "learning_rate": 0.00019929841839493408, + "loss": 1.5284, + "step": 1293 + }, + { + "epoch": 0.033226291522834246, + "grad_norm": 1.015625, + "learning_rate": 0.0001992978904164471, + "loss": 1.2499, + "step": 1294 + }, + { + "epoch": 0.03325196871875607, + "grad_norm": 1.1015625, + "learning_rate": 0.00019929736224006834, + "loss": 1.4412, + "step": 1295 + }, + { + "epoch": 0.03327764591467789, + "grad_norm": 0.9140625, + "learning_rate": 0.00019929683386579893, + "loss": 1.3622, + "step": 1296 + }, + { + "epoch": 0.033303323110599706, + "grad_norm": 0.94921875, + "learning_rate": 0.0001992963052936399, + "loss": 1.397, + "step": 1297 + }, + { + "epoch": 0.03332900030652153, + "grad_norm": 1.0, + "learning_rate": 0.00019929577652359233, + "loss": 1.3953, + "step": 1298 + }, + { + "epoch": 0.03335467750244334, + "grad_norm": 0.90625, + "learning_rate": 0.0001992952475556572, + "loss": 1.2782, + "step": 1299 + }, + { + "epoch": 0.033380354698365165, + "grad_norm": 1.0, + "learning_rate": 0.0001992947183898356, + "loss": 1.4231, + "step": 1300 + }, + { + "epoch": 0.03340603189428699, + "grad_norm": 0.9296875, + "learning_rate": 0.00019929418902612866, + "loss": 1.3707, + "step": 1301 + }, + { + "epoch": 0.0334317090902088, + "grad_norm": 0.9609375, + "learning_rate": 0.00019929365946453733, + "loss": 1.3749, + "step": 1302 + }, + { + "epoch": 0.033457386286130625, + "grad_norm": 0.98046875, + "learning_rate": 0.00019929312970506273, + "loss": 1.3626, + "step": 1303 + }, + { + "epoch": 0.03348306348205244, + "grad_norm": 1.0078125, + "learning_rate": 0.00019929259974770587, + "loss": 1.3875, + "step": 1304 + }, + { + "epoch": 0.03350874067797426, + "grad_norm": 0.9375, + "learning_rate": 0.00019929206959246783, + "loss": 1.5559, + "step": 1305 + }, + { + "epoch": 0.033534417873896084, + "grad_norm": 1.0234375, + "learning_rate": 0.00019929153923934967, + "loss": 1.5799, + "step": 1306 + }, + { + "epoch": 0.0335600950698179, + "grad_norm": 1.1484375, + "learning_rate": 0.00019929100868835243, + "loss": 1.5069, + "step": 1307 + }, + { + "epoch": 0.03358577226573972, + "grad_norm": 1.0078125, + "learning_rate": 0.00019929047793947722, + "loss": 1.4231, + "step": 1308 + }, + { + "epoch": 0.03361144946166154, + "grad_norm": 0.97265625, + "learning_rate": 0.000199289946992725, + "loss": 1.397, + "step": 1309 + }, + { + "epoch": 0.03363712665758336, + "grad_norm": 0.9453125, + "learning_rate": 0.0001992894158480969, + "loss": 1.4448, + "step": 1310 + }, + { + "epoch": 0.03366280385350518, + "grad_norm": 0.984375, + "learning_rate": 0.00019928888450559398, + "loss": 1.3748, + "step": 1311 + }, + { + "epoch": 0.033688481049426997, + "grad_norm": 0.9609375, + "learning_rate": 0.0001992883529652173, + "loss": 1.2605, + "step": 1312 + }, + { + "epoch": 0.03371415824534882, + "grad_norm": 0.9375, + "learning_rate": 0.00019928782122696786, + "loss": 1.3911, + "step": 1313 + }, + { + "epoch": 0.033739835441270634, + "grad_norm": 0.87109375, + "learning_rate": 0.00019928728929084676, + "loss": 1.3722, + "step": 1314 + }, + { + "epoch": 0.033765512637192456, + "grad_norm": 0.99609375, + "learning_rate": 0.00019928675715685508, + "loss": 1.5363, + "step": 1315 + }, + { + "epoch": 0.03379118983311428, + "grad_norm": 0.9765625, + "learning_rate": 0.00019928622482499387, + "loss": 1.4025, + "step": 1316 + }, + { + "epoch": 0.03381686702903609, + "grad_norm": 1.0078125, + "learning_rate": 0.00019928569229526417, + "loss": 1.5471, + "step": 1317 + }, + { + "epoch": 0.033842544224957916, + "grad_norm": 0.9140625, + "learning_rate": 0.00019928515956766705, + "loss": 1.294, + "step": 1318 + }, + { + "epoch": 0.03386822142087973, + "grad_norm": 0.9765625, + "learning_rate": 0.00019928462664220358, + "loss": 1.3571, + "step": 1319 + }, + { + "epoch": 0.03389389861680155, + "grad_norm": 1.0234375, + "learning_rate": 0.0001992840935188748, + "loss": 1.4216, + "step": 1320 + }, + { + "epoch": 0.03391957581272337, + "grad_norm": 1.0078125, + "learning_rate": 0.00019928356019768183, + "loss": 1.3485, + "step": 1321 + }, + { + "epoch": 0.03394525300864519, + "grad_norm": 0.93359375, + "learning_rate": 0.00019928302667862567, + "loss": 1.4083, + "step": 1322 + }, + { + "epoch": 0.03397093020456701, + "grad_norm": 1.0390625, + "learning_rate": 0.00019928249296170738, + "loss": 1.4517, + "step": 1323 + }, + { + "epoch": 0.03399660740048883, + "grad_norm": 1.2890625, + "learning_rate": 0.0001992819590469281, + "loss": 1.4167, + "step": 1324 + }, + { + "epoch": 0.03402228459641065, + "grad_norm": 1.03125, + "learning_rate": 0.0001992814249342888, + "loss": 1.3819, + "step": 1325 + }, + { + "epoch": 0.034047961792332465, + "grad_norm": 0.98046875, + "learning_rate": 0.00019928089062379062, + "loss": 1.4575, + "step": 1326 + }, + { + "epoch": 0.03407363898825429, + "grad_norm": 0.99609375, + "learning_rate": 0.00019928035611543456, + "loss": 1.4305, + "step": 1327 + }, + { + "epoch": 0.03409931618417611, + "grad_norm": 0.95703125, + "learning_rate": 0.00019927982140922174, + "loss": 1.3553, + "step": 1328 + }, + { + "epoch": 0.034124993380097925, + "grad_norm": 0.984375, + "learning_rate": 0.00019927928650515322, + "loss": 1.3282, + "step": 1329 + }, + { + "epoch": 0.03415067057601975, + "grad_norm": 1.046875, + "learning_rate": 0.00019927875140323003, + "loss": 1.3842, + "step": 1330 + }, + { + "epoch": 0.03417634777194156, + "grad_norm": 0.98046875, + "learning_rate": 0.00019927821610345325, + "loss": 1.3942, + "step": 1331 + }, + { + "epoch": 0.034202024967863384, + "grad_norm": 0.91015625, + "learning_rate": 0.00019927768060582396, + "loss": 1.2644, + "step": 1332 + }, + { + "epoch": 0.034227702163785206, + "grad_norm": 0.9609375, + "learning_rate": 0.00019927714491034323, + "loss": 1.3022, + "step": 1333 + }, + { + "epoch": 0.03425337935970702, + "grad_norm": 0.91796875, + "learning_rate": 0.0001992766090170121, + "loss": 1.4074, + "step": 1334 + }, + { + "epoch": 0.034279056555628844, + "grad_norm": 1.03125, + "learning_rate": 0.00019927607292583168, + "loss": 1.3799, + "step": 1335 + }, + { + "epoch": 0.03430473375155066, + "grad_norm": 0.89453125, + "learning_rate": 0.000199275536636803, + "loss": 1.218, + "step": 1336 + }, + { + "epoch": 0.03433041094747248, + "grad_norm": 1.125, + "learning_rate": 0.00019927500014992714, + "loss": 1.4376, + "step": 1337 + }, + { + "epoch": 0.0343560881433943, + "grad_norm": 0.953125, + "learning_rate": 0.00019927446346520517, + "loss": 1.358, + "step": 1338 + }, + { + "epoch": 0.03438176533931612, + "grad_norm": 0.95703125, + "learning_rate": 0.00019927392658263817, + "loss": 1.3068, + "step": 1339 + }, + { + "epoch": 0.03440744253523794, + "grad_norm": 1.03125, + "learning_rate": 0.00019927338950222718, + "loss": 1.3437, + "step": 1340 + }, + { + "epoch": 0.034433119731159756, + "grad_norm": 0.98046875, + "learning_rate": 0.00019927285222397334, + "loss": 1.5072, + "step": 1341 + }, + { + "epoch": 0.03445879692708158, + "grad_norm": 1.0546875, + "learning_rate": 0.00019927231474787762, + "loss": 1.388, + "step": 1342 + }, + { + "epoch": 0.0344844741230034, + "grad_norm": 1.015625, + "learning_rate": 0.0001992717770739412, + "loss": 1.3923, + "step": 1343 + }, + { + "epoch": 0.034510151318925215, + "grad_norm": 1.0, + "learning_rate": 0.00019927123920216504, + "loss": 1.5415, + "step": 1344 + }, + { + "epoch": 0.03453582851484704, + "grad_norm": 1.03125, + "learning_rate": 0.00019927070113255027, + "loss": 1.3741, + "step": 1345 + }, + { + "epoch": 0.03456150571076885, + "grad_norm": 0.87109375, + "learning_rate": 0.00019927016286509802, + "loss": 1.2864, + "step": 1346 + }, + { + "epoch": 0.034587182906690675, + "grad_norm": 1.1171875, + "learning_rate": 0.00019926962439980925, + "loss": 1.4656, + "step": 1347 + }, + { + "epoch": 0.0346128601026125, + "grad_norm": 0.93359375, + "learning_rate": 0.0001992690857366851, + "loss": 1.1963, + "step": 1348 + }, + { + "epoch": 0.03463853729853431, + "grad_norm": 0.89453125, + "learning_rate": 0.00019926854687572662, + "loss": 1.3047, + "step": 1349 + }, + { + "epoch": 0.034664214494456135, + "grad_norm": 1.1484375, + "learning_rate": 0.00019926800781693487, + "loss": 1.4785, + "step": 1350 + }, + { + "epoch": 0.03468989169037795, + "grad_norm": 0.8984375, + "learning_rate": 0.000199267468560311, + "loss": 1.2921, + "step": 1351 + }, + { + "epoch": 0.03471556888629977, + "grad_norm": 0.8984375, + "learning_rate": 0.00019926692910585603, + "loss": 1.2401, + "step": 1352 + }, + { + "epoch": 0.034741246082221594, + "grad_norm": 1.1484375, + "learning_rate": 0.00019926638945357098, + "loss": 1.5164, + "step": 1353 + }, + { + "epoch": 0.03476692327814341, + "grad_norm": 1.0, + "learning_rate": 0.00019926584960345704, + "loss": 1.3102, + "step": 1354 + }, + { + "epoch": 0.03479260047406523, + "grad_norm": 1.015625, + "learning_rate": 0.0001992653095555152, + "loss": 1.3392, + "step": 1355 + }, + { + "epoch": 0.03481827766998705, + "grad_norm": 0.9453125, + "learning_rate": 0.00019926476930974657, + "loss": 1.4317, + "step": 1356 + }, + { + "epoch": 0.03484395486590887, + "grad_norm": 0.9296875, + "learning_rate": 0.00019926422886615223, + "loss": 1.5288, + "step": 1357 + }, + { + "epoch": 0.03486963206183069, + "grad_norm": 1.03125, + "learning_rate": 0.00019926368822473323, + "loss": 1.2037, + "step": 1358 + }, + { + "epoch": 0.034895309257752506, + "grad_norm": 0.90625, + "learning_rate": 0.00019926314738549067, + "loss": 1.2477, + "step": 1359 + }, + { + "epoch": 0.03492098645367433, + "grad_norm": 0.9453125, + "learning_rate": 0.00019926260634842567, + "loss": 1.2366, + "step": 1360 + }, + { + "epoch": 0.034946663649596144, + "grad_norm": 1.0390625, + "learning_rate": 0.0001992620651135392, + "loss": 1.5529, + "step": 1361 + }, + { + "epoch": 0.034972340845517966, + "grad_norm": 1.0625, + "learning_rate": 0.00019926152368083242, + "loss": 1.5609, + "step": 1362 + }, + { + "epoch": 0.03499801804143979, + "grad_norm": 1.046875, + "learning_rate": 0.0001992609820503064, + "loss": 1.6152, + "step": 1363 + }, + { + "epoch": 0.0350236952373616, + "grad_norm": 0.9453125, + "learning_rate": 0.0001992604402219622, + "loss": 1.3876, + "step": 1364 + }, + { + "epoch": 0.035049372433283425, + "grad_norm": 0.90234375, + "learning_rate": 0.0001992598981958009, + "loss": 1.36, + "step": 1365 + }, + { + "epoch": 0.03507504962920524, + "grad_norm": 1.078125, + "learning_rate": 0.0001992593559718236, + "loss": 1.3105, + "step": 1366 + }, + { + "epoch": 0.03510072682512706, + "grad_norm": 0.98828125, + "learning_rate": 0.0001992588135500314, + "loss": 1.354, + "step": 1367 + }, + { + "epoch": 0.035126404021048885, + "grad_norm": 0.90234375, + "learning_rate": 0.0001992582709304253, + "loss": 1.3792, + "step": 1368 + }, + { + "epoch": 0.0351520812169707, + "grad_norm": 0.93359375, + "learning_rate": 0.00019925772811300646, + "loss": 1.3436, + "step": 1369 + }, + { + "epoch": 0.03517775841289252, + "grad_norm": 0.97265625, + "learning_rate": 0.0001992571850977759, + "loss": 1.2874, + "step": 1370 + }, + { + "epoch": 0.03520343560881434, + "grad_norm": 1.03125, + "learning_rate": 0.00019925664188473477, + "loss": 1.5504, + "step": 1371 + }, + { + "epoch": 0.03522911280473616, + "grad_norm": 1.0390625, + "learning_rate": 0.0001992560984738841, + "loss": 1.6633, + "step": 1372 + }, + { + "epoch": 0.035254790000657975, + "grad_norm": 1.0, + "learning_rate": 0.00019925555486522502, + "loss": 1.4889, + "step": 1373 + }, + { + "epoch": 0.0352804671965798, + "grad_norm": 0.99609375, + "learning_rate": 0.00019925501105875855, + "loss": 1.4307, + "step": 1374 + }, + { + "epoch": 0.03530614439250162, + "grad_norm": 0.94140625, + "learning_rate": 0.00019925446705448585, + "loss": 1.2615, + "step": 1375 + }, + { + "epoch": 0.035331821588423434, + "grad_norm": 0.9453125, + "learning_rate": 0.00019925392285240792, + "loss": 1.3757, + "step": 1376 + }, + { + "epoch": 0.035357498784345257, + "grad_norm": 0.96484375, + "learning_rate": 0.0001992533784525259, + "loss": 1.3837, + "step": 1377 + }, + { + "epoch": 0.03538317598026707, + "grad_norm": 1.0625, + "learning_rate": 0.00019925283385484086, + "loss": 1.3426, + "step": 1378 + }, + { + "epoch": 0.035408853176188894, + "grad_norm": 0.9296875, + "learning_rate": 0.00019925228905935392, + "loss": 1.3465, + "step": 1379 + }, + { + "epoch": 0.035434530372110716, + "grad_norm": 0.9140625, + "learning_rate": 0.0001992517440660661, + "loss": 1.3799, + "step": 1380 + }, + { + "epoch": 0.03546020756803253, + "grad_norm": 0.9765625, + "learning_rate": 0.0001992511988749785, + "loss": 1.3472, + "step": 1381 + }, + { + "epoch": 0.03548588476395435, + "grad_norm": 0.93359375, + "learning_rate": 0.0001992506534860923, + "loss": 1.427, + "step": 1382 + }, + { + "epoch": 0.03551156195987617, + "grad_norm": 0.97265625, + "learning_rate": 0.00019925010789940845, + "loss": 1.5358, + "step": 1383 + }, + { + "epoch": 0.03553723915579799, + "grad_norm": 1.0234375, + "learning_rate": 0.00019924956211492812, + "loss": 1.5095, + "step": 1384 + }, + { + "epoch": 0.03556291635171981, + "grad_norm": 0.9296875, + "learning_rate": 0.00019924901613265237, + "loss": 1.2929, + "step": 1385 + }, + { + "epoch": 0.03558859354764163, + "grad_norm": 0.890625, + "learning_rate": 0.0001992484699525823, + "loss": 1.4175, + "step": 1386 + }, + { + "epoch": 0.03561427074356345, + "grad_norm": 1.03125, + "learning_rate": 0.00019924792357471898, + "loss": 1.4011, + "step": 1387 + }, + { + "epoch": 0.035639947939485266, + "grad_norm": 0.953125, + "learning_rate": 0.00019924737699906353, + "loss": 1.4942, + "step": 1388 + }, + { + "epoch": 0.03566562513540709, + "grad_norm": 1.0, + "learning_rate": 0.00019924683022561702, + "loss": 1.5811, + "step": 1389 + }, + { + "epoch": 0.03569130233132891, + "grad_norm": 0.984375, + "learning_rate": 0.00019924628325438055, + "loss": 1.5009, + "step": 1390 + }, + { + "epoch": 0.035716979527250725, + "grad_norm": 0.98828125, + "learning_rate": 0.0001992457360853552, + "loss": 1.5456, + "step": 1391 + }, + { + "epoch": 0.03574265672317255, + "grad_norm": 0.94140625, + "learning_rate": 0.00019924518871854206, + "loss": 1.4641, + "step": 1392 + }, + { + "epoch": 0.03576833391909436, + "grad_norm": 0.93359375, + "learning_rate": 0.00019924464115394223, + "loss": 1.3502, + "step": 1393 + }, + { + "epoch": 0.035794011115016185, + "grad_norm": 0.92578125, + "learning_rate": 0.00019924409339155678, + "loss": 1.4687, + "step": 1394 + }, + { + "epoch": 0.03581968831093801, + "grad_norm": 0.92578125, + "learning_rate": 0.00019924354543138684, + "loss": 1.4876, + "step": 1395 + }, + { + "epoch": 0.03584536550685982, + "grad_norm": 0.921875, + "learning_rate": 0.00019924299727343346, + "loss": 1.4132, + "step": 1396 + }, + { + "epoch": 0.035871042702781644, + "grad_norm": 0.96484375, + "learning_rate": 0.00019924244891769775, + "loss": 1.3422, + "step": 1397 + }, + { + "epoch": 0.03589671989870346, + "grad_norm": 0.98046875, + "learning_rate": 0.00019924190036418077, + "loss": 1.4029, + "step": 1398 + }, + { + "epoch": 0.03592239709462528, + "grad_norm": 0.9921875, + "learning_rate": 0.0001992413516128837, + "loss": 1.4894, + "step": 1399 + }, + { + "epoch": 0.035948074290547104, + "grad_norm": 0.94921875, + "learning_rate": 0.00019924080266380757, + "loss": 1.2839, + "step": 1400 + }, + { + "epoch": 0.03597375148646892, + "grad_norm": 0.91796875, + "learning_rate": 0.00019924025351695347, + "loss": 1.3967, + "step": 1401 + }, + { + "epoch": 0.03599942868239074, + "grad_norm": 0.9375, + "learning_rate": 0.00019923970417232254, + "loss": 1.3999, + "step": 1402 + }, + { + "epoch": 0.036025105878312556, + "grad_norm": 0.890625, + "learning_rate": 0.0001992391546299158, + "loss": 1.332, + "step": 1403 + }, + { + "epoch": 0.03605078307423438, + "grad_norm": 0.89453125, + "learning_rate": 0.00019923860488973443, + "loss": 1.2533, + "step": 1404 + }, + { + "epoch": 0.0360764602701562, + "grad_norm": 0.94921875, + "learning_rate": 0.00019923805495177947, + "loss": 1.58, + "step": 1405 + }, + { + "epoch": 0.036102137466078016, + "grad_norm": 0.98046875, + "learning_rate": 0.000199237504816052, + "loss": 1.2631, + "step": 1406 + }, + { + "epoch": 0.03612781466199984, + "grad_norm": 0.9609375, + "learning_rate": 0.0001992369544825532, + "loss": 1.389, + "step": 1407 + }, + { + "epoch": 0.03615349185792165, + "grad_norm": 0.9765625, + "learning_rate": 0.00019923640395128409, + "loss": 1.5878, + "step": 1408 + }, + { + "epoch": 0.036179169053843475, + "grad_norm": 0.97265625, + "learning_rate": 0.00019923585322224576, + "loss": 1.3869, + "step": 1409 + }, + { + "epoch": 0.0362048462497653, + "grad_norm": 0.96875, + "learning_rate": 0.00019923530229543938, + "loss": 1.3539, + "step": 1410 + }, + { + "epoch": 0.03623052344568711, + "grad_norm": 0.8515625, + "learning_rate": 0.000199234751170866, + "loss": 1.4121, + "step": 1411 + }, + { + "epoch": 0.036256200641608935, + "grad_norm": 1.171875, + "learning_rate": 0.0001992341998485267, + "loss": 1.3528, + "step": 1412 + }, + { + "epoch": 0.03628187783753075, + "grad_norm": 0.88671875, + "learning_rate": 0.00019923364832842263, + "loss": 1.3281, + "step": 1413 + }, + { + "epoch": 0.03630755503345257, + "grad_norm": 0.94921875, + "learning_rate": 0.00019923309661055484, + "loss": 1.3571, + "step": 1414 + }, + { + "epoch": 0.036333232229374395, + "grad_norm": 0.9296875, + "learning_rate": 0.0001992325446949245, + "loss": 1.5198, + "step": 1415 + }, + { + "epoch": 0.03635890942529621, + "grad_norm": 0.91015625, + "learning_rate": 0.0001992319925815326, + "loss": 1.4966, + "step": 1416 + }, + { + "epoch": 0.03638458662121803, + "grad_norm": 1.0234375, + "learning_rate": 0.00019923144027038034, + "loss": 1.4983, + "step": 1417 + }, + { + "epoch": 0.03641026381713985, + "grad_norm": 1.0546875, + "learning_rate": 0.00019923088776146878, + "loss": 1.3515, + "step": 1418 + }, + { + "epoch": 0.03643594101306167, + "grad_norm": 0.97265625, + "learning_rate": 0.000199230335054799, + "loss": 1.3502, + "step": 1419 + }, + { + "epoch": 0.036461618208983484, + "grad_norm": 0.9921875, + "learning_rate": 0.00019922978215037215, + "loss": 1.4101, + "step": 1420 + }, + { + "epoch": 0.03648729540490531, + "grad_norm": 0.96875, + "learning_rate": 0.0001992292290481893, + "loss": 1.2212, + "step": 1421 + }, + { + "epoch": 0.03651297260082713, + "grad_norm": 0.8671875, + "learning_rate": 0.00019922867574825157, + "loss": 1.2241, + "step": 1422 + }, + { + "epoch": 0.036538649796748944, + "grad_norm": 1.0234375, + "learning_rate": 0.00019922812225056004, + "loss": 1.456, + "step": 1423 + }, + { + "epoch": 0.036564326992670766, + "grad_norm": 0.953125, + "learning_rate": 0.00019922756855511584, + "loss": 1.4287, + "step": 1424 + }, + { + "epoch": 0.03659000418859258, + "grad_norm": 0.98046875, + "learning_rate": 0.00019922701466192004, + "loss": 1.3166, + "step": 1425 + }, + { + "epoch": 0.036615681384514404, + "grad_norm": 0.984375, + "learning_rate": 0.00019922646057097377, + "loss": 1.5949, + "step": 1426 + }, + { + "epoch": 0.036641358580436226, + "grad_norm": 0.9765625, + "learning_rate": 0.00019922590628227812, + "loss": 1.3839, + "step": 1427 + }, + { + "epoch": 0.03666703577635804, + "grad_norm": 0.875, + "learning_rate": 0.00019922535179583422, + "loss": 1.4304, + "step": 1428 + }, + { + "epoch": 0.03669271297227986, + "grad_norm": 1.0546875, + "learning_rate": 0.00019922479711164315, + "loss": 1.333, + "step": 1429 + }, + { + "epoch": 0.03671839016820168, + "grad_norm": 0.99609375, + "learning_rate": 0.000199224242229706, + "loss": 1.3465, + "step": 1430 + }, + { + "epoch": 0.0367440673641235, + "grad_norm": 0.8828125, + "learning_rate": 0.00019922368715002392, + "loss": 1.4738, + "step": 1431 + }, + { + "epoch": 0.03676974456004532, + "grad_norm": 1.015625, + "learning_rate": 0.000199223131872598, + "loss": 1.3818, + "step": 1432 + }, + { + "epoch": 0.03679542175596714, + "grad_norm": 0.953125, + "learning_rate": 0.00019922257639742934, + "loss": 1.3826, + "step": 1433 + }, + { + "epoch": 0.03682109895188896, + "grad_norm": 0.9765625, + "learning_rate": 0.00019922202072451904, + "loss": 1.3152, + "step": 1434 + }, + { + "epoch": 0.036846776147810775, + "grad_norm": 1.0546875, + "learning_rate": 0.0001992214648538682, + "loss": 1.3862, + "step": 1435 + }, + { + "epoch": 0.0368724533437326, + "grad_norm": 0.95703125, + "learning_rate": 0.00019922090878547798, + "loss": 1.3916, + "step": 1436 + }, + { + "epoch": 0.03689813053965442, + "grad_norm": 1.0703125, + "learning_rate": 0.0001992203525193494, + "loss": 1.379, + "step": 1437 + }, + { + "epoch": 0.036923807735576235, + "grad_norm": 1.046875, + "learning_rate": 0.00019921979605548368, + "loss": 1.3768, + "step": 1438 + }, + { + "epoch": 0.03694948493149806, + "grad_norm": 0.9921875, + "learning_rate": 0.00019921923939388182, + "loss": 1.4391, + "step": 1439 + }, + { + "epoch": 0.03697516212741987, + "grad_norm": 0.9765625, + "learning_rate": 0.000199218682534545, + "loss": 1.4632, + "step": 1440 + }, + { + "epoch": 0.037000839323341694, + "grad_norm": 0.94921875, + "learning_rate": 0.0001992181254774743, + "loss": 1.4478, + "step": 1441 + }, + { + "epoch": 0.037026516519263517, + "grad_norm": 0.98046875, + "learning_rate": 0.00019921756822267086, + "loss": 1.3356, + "step": 1442 + }, + { + "epoch": 0.03705219371518533, + "grad_norm": 0.9296875, + "learning_rate": 0.00019921701077013575, + "loss": 1.5418, + "step": 1443 + }, + { + "epoch": 0.037077870911107154, + "grad_norm": 0.9609375, + "learning_rate": 0.0001992164531198701, + "loss": 1.4791, + "step": 1444 + }, + { + "epoch": 0.03710354810702897, + "grad_norm": 1.0078125, + "learning_rate": 0.00019921589527187504, + "loss": 1.5326, + "step": 1445 + }, + { + "epoch": 0.03712922530295079, + "grad_norm": 1.015625, + "learning_rate": 0.00019921533722615164, + "loss": 1.551, + "step": 1446 + }, + { + "epoch": 0.03715490249887261, + "grad_norm": 1.0546875, + "learning_rate": 0.000199214778982701, + "loss": 1.4105, + "step": 1447 + }, + { + "epoch": 0.03718057969479443, + "grad_norm": 1.28125, + "learning_rate": 0.00019921422054152435, + "loss": 1.328, + "step": 1448 + }, + { + "epoch": 0.03720625689071625, + "grad_norm": 0.94921875, + "learning_rate": 0.00019921366190262267, + "loss": 1.3933, + "step": 1449 + }, + { + "epoch": 0.037231934086638066, + "grad_norm": 0.97265625, + "learning_rate": 0.00019921310306599712, + "loss": 1.36, + "step": 1450 + }, + { + "epoch": 0.03725761128255989, + "grad_norm": 1.0546875, + "learning_rate": 0.00019921254403164885, + "loss": 1.5009, + "step": 1451 + }, + { + "epoch": 0.03728328847848171, + "grad_norm": 1.0625, + "learning_rate": 0.0001992119847995789, + "loss": 1.521, + "step": 1452 + }, + { + "epoch": 0.037308965674403526, + "grad_norm": 1.015625, + "learning_rate": 0.00019921142536978844, + "loss": 1.5055, + "step": 1453 + }, + { + "epoch": 0.03733464287032535, + "grad_norm": 1.015625, + "learning_rate": 0.0001992108657422786, + "loss": 1.5727, + "step": 1454 + }, + { + "epoch": 0.03736032006624716, + "grad_norm": 1.0546875, + "learning_rate": 0.00019921030591705047, + "loss": 1.4005, + "step": 1455 + }, + { + "epoch": 0.037385997262168985, + "grad_norm": 0.92578125, + "learning_rate": 0.00019920974589410513, + "loss": 1.4962, + "step": 1456 + }, + { + "epoch": 0.03741167445809081, + "grad_norm": 0.9921875, + "learning_rate": 0.0001992091856734437, + "loss": 1.4774, + "step": 1457 + }, + { + "epoch": 0.03743735165401262, + "grad_norm": 1.0859375, + "learning_rate": 0.00019920862525506737, + "loss": 1.4959, + "step": 1458 + }, + { + "epoch": 0.037463028849934445, + "grad_norm": 0.94140625, + "learning_rate": 0.0001992080646389772, + "loss": 1.3618, + "step": 1459 + }, + { + "epoch": 0.03748870604585626, + "grad_norm": 1.015625, + "learning_rate": 0.00019920750382517434, + "loss": 1.3206, + "step": 1460 + }, + { + "epoch": 0.03751438324177808, + "grad_norm": 0.9921875, + "learning_rate": 0.00019920694281365986, + "loss": 1.1625, + "step": 1461 + }, + { + "epoch": 0.037540060437699904, + "grad_norm": 0.97265625, + "learning_rate": 0.0001992063816044349, + "loss": 1.3956, + "step": 1462 + }, + { + "epoch": 0.03756573763362172, + "grad_norm": 0.95703125, + "learning_rate": 0.00019920582019750062, + "loss": 1.3738, + "step": 1463 + }, + { + "epoch": 0.03759141482954354, + "grad_norm": 0.96484375, + "learning_rate": 0.00019920525859285805, + "loss": 1.2921, + "step": 1464 + }, + { + "epoch": 0.03761709202546536, + "grad_norm": 0.9453125, + "learning_rate": 0.0001992046967905084, + "loss": 1.3562, + "step": 1465 + }, + { + "epoch": 0.03764276922138718, + "grad_norm": 0.96875, + "learning_rate": 0.00019920413479045275, + "loss": 1.3119, + "step": 1466 + }, + { + "epoch": 0.037668446417309, + "grad_norm": 0.89453125, + "learning_rate": 0.00019920357259269218, + "loss": 1.3312, + "step": 1467 + }, + { + "epoch": 0.037694123613230816, + "grad_norm": 0.92578125, + "learning_rate": 0.0001992030101972279, + "loss": 1.3349, + "step": 1468 + }, + { + "epoch": 0.03771980080915264, + "grad_norm": 0.97265625, + "learning_rate": 0.00019920244760406096, + "loss": 1.4584, + "step": 1469 + }, + { + "epoch": 0.037745478005074454, + "grad_norm": 0.8984375, + "learning_rate": 0.00019920188481319247, + "loss": 1.2528, + "step": 1470 + }, + { + "epoch": 0.037771155200996276, + "grad_norm": 0.921875, + "learning_rate": 0.00019920132182462362, + "loss": 1.4069, + "step": 1471 + }, + { + "epoch": 0.03779683239691809, + "grad_norm": 0.9453125, + "learning_rate": 0.00019920075863835552, + "loss": 1.1422, + "step": 1472 + }, + { + "epoch": 0.03782250959283991, + "grad_norm": 0.94921875, + "learning_rate": 0.00019920019525438925, + "loss": 1.495, + "step": 1473 + }, + { + "epoch": 0.037848186788761735, + "grad_norm": 1.0390625, + "learning_rate": 0.0001991996316727259, + "loss": 1.5764, + "step": 1474 + }, + { + "epoch": 0.03787386398468355, + "grad_norm": 0.91796875, + "learning_rate": 0.0001991990678933667, + "loss": 1.4822, + "step": 1475 + }, + { + "epoch": 0.03789954118060537, + "grad_norm": 0.953125, + "learning_rate": 0.00019919850391631272, + "loss": 1.1688, + "step": 1476 + }, + { + "epoch": 0.03792521837652719, + "grad_norm": 0.9296875, + "learning_rate": 0.00019919793974156505, + "loss": 1.3047, + "step": 1477 + }, + { + "epoch": 0.03795089557244901, + "grad_norm": 0.9609375, + "learning_rate": 0.00019919737536912489, + "loss": 1.4364, + "step": 1478 + }, + { + "epoch": 0.03797657276837083, + "grad_norm": 1.015625, + "learning_rate": 0.00019919681079899327, + "loss": 1.3653, + "step": 1479 + }, + { + "epoch": 0.03800224996429265, + "grad_norm": 0.890625, + "learning_rate": 0.0001991962460311714, + "loss": 1.3193, + "step": 1480 + }, + { + "epoch": 0.03802792716021447, + "grad_norm": 0.984375, + "learning_rate": 0.00019919568106566038, + "loss": 1.3126, + "step": 1481 + }, + { + "epoch": 0.038053604356136285, + "grad_norm": 0.94921875, + "learning_rate": 0.0001991951159024613, + "loss": 1.4965, + "step": 1482 + }, + { + "epoch": 0.03807928155205811, + "grad_norm": 1.7109375, + "learning_rate": 0.00019919455054157533, + "loss": 1.4155, + "step": 1483 + }, + { + "epoch": 0.03810495874797993, + "grad_norm": 0.98046875, + "learning_rate": 0.00019919398498300357, + "loss": 1.2981, + "step": 1484 + }, + { + "epoch": 0.038130635943901744, + "grad_norm": 0.96875, + "learning_rate": 0.0001991934192267472, + "loss": 1.4848, + "step": 1485 + }, + { + "epoch": 0.03815631313982357, + "grad_norm": 0.94140625, + "learning_rate": 0.00019919285327280726, + "loss": 1.3882, + "step": 1486 + }, + { + "epoch": 0.03818199033574538, + "grad_norm": 0.9609375, + "learning_rate": 0.00019919228712118493, + "loss": 1.3575, + "step": 1487 + }, + { + "epoch": 0.038207667531667204, + "grad_norm": 1.03125, + "learning_rate": 0.00019919172077188132, + "loss": 1.3743, + "step": 1488 + }, + { + "epoch": 0.038233344727589026, + "grad_norm": 1.0390625, + "learning_rate": 0.0001991911542248976, + "loss": 1.3553, + "step": 1489 + }, + { + "epoch": 0.03825902192351084, + "grad_norm": 1.0625, + "learning_rate": 0.00019919058748023484, + "loss": 1.3861, + "step": 1490 + }, + { + "epoch": 0.038284699119432664, + "grad_norm": 1.0, + "learning_rate": 0.00019919002053789422, + "loss": 1.4908, + "step": 1491 + }, + { + "epoch": 0.03831037631535448, + "grad_norm": 1.0, + "learning_rate": 0.00019918945339787687, + "loss": 1.4496, + "step": 1492 + }, + { + "epoch": 0.0383360535112763, + "grad_norm": 0.921875, + "learning_rate": 0.00019918888606018387, + "loss": 1.3719, + "step": 1493 + }, + { + "epoch": 0.03836173070719812, + "grad_norm": 0.859375, + "learning_rate": 0.00019918831852481638, + "loss": 1.2127, + "step": 1494 + }, + { + "epoch": 0.03838740790311994, + "grad_norm": 0.96875, + "learning_rate": 0.0001991877507917755, + "loss": 1.4935, + "step": 1495 + }, + { + "epoch": 0.03841308509904176, + "grad_norm": 0.91796875, + "learning_rate": 0.00019918718286106245, + "loss": 1.3855, + "step": 1496 + }, + { + "epoch": 0.038438762294963576, + "grad_norm": 1.0234375, + "learning_rate": 0.00019918661473267827, + "loss": 1.376, + "step": 1497 + }, + { + "epoch": 0.0384644394908854, + "grad_norm": 0.90234375, + "learning_rate": 0.00019918604640662416, + "loss": 1.4583, + "step": 1498 + }, + { + "epoch": 0.03849011668680722, + "grad_norm": 0.8984375, + "learning_rate": 0.0001991854778829012, + "loss": 1.42, + "step": 1499 + }, + { + "epoch": 0.038515793882729035, + "grad_norm": 0.921875, + "learning_rate": 0.00019918490916151052, + "loss": 1.52, + "step": 1500 + }, + { + "epoch": 0.03854147107865086, + "grad_norm": 0.92578125, + "learning_rate": 0.00019918434024245328, + "loss": 1.401, + "step": 1501 + }, + { + "epoch": 0.03856714827457267, + "grad_norm": 0.9296875, + "learning_rate": 0.00019918377112573065, + "loss": 1.5658, + "step": 1502 + }, + { + "epoch": 0.038592825470494495, + "grad_norm": 0.9140625, + "learning_rate": 0.0001991832018113437, + "loss": 1.3486, + "step": 1503 + }, + { + "epoch": 0.03861850266641632, + "grad_norm": 0.92578125, + "learning_rate": 0.00019918263229929358, + "loss": 1.3445, + "step": 1504 + }, + { + "epoch": 0.03864417986233813, + "grad_norm": 0.9375, + "learning_rate": 0.00019918206258958142, + "loss": 1.2173, + "step": 1505 + }, + { + "epoch": 0.038669857058259954, + "grad_norm": 0.91796875, + "learning_rate": 0.0001991814926822084, + "loss": 1.5226, + "step": 1506 + }, + { + "epoch": 0.03869553425418177, + "grad_norm": 0.92578125, + "learning_rate": 0.0001991809225771756, + "loss": 1.3405, + "step": 1507 + }, + { + "epoch": 0.03872121145010359, + "grad_norm": 1.0546875, + "learning_rate": 0.0001991803522744842, + "loss": 1.317, + "step": 1508 + }, + { + "epoch": 0.038746888646025414, + "grad_norm": 0.8984375, + "learning_rate": 0.0001991797817741353, + "loss": 1.1821, + "step": 1509 + }, + { + "epoch": 0.03877256584194723, + "grad_norm": 1.09375, + "learning_rate": 0.00019917921107613006, + "loss": 1.6162, + "step": 1510 + }, + { + "epoch": 0.03879824303786905, + "grad_norm": 0.921875, + "learning_rate": 0.0001991786401804696, + "loss": 1.4272, + "step": 1511 + }, + { + "epoch": 0.038823920233790866, + "grad_norm": 0.9375, + "learning_rate": 0.0001991780690871551, + "loss": 1.3422, + "step": 1512 + }, + { + "epoch": 0.03884959742971269, + "grad_norm": 0.92578125, + "learning_rate": 0.00019917749779618763, + "loss": 1.4964, + "step": 1513 + }, + { + "epoch": 0.03887527462563451, + "grad_norm": 0.84765625, + "learning_rate": 0.0001991769263075684, + "loss": 1.2891, + "step": 1514 + }, + { + "epoch": 0.038900951821556326, + "grad_norm": 0.9453125, + "learning_rate": 0.0001991763546212985, + "loss": 1.2648, + "step": 1515 + }, + { + "epoch": 0.03892662901747815, + "grad_norm": 1.0625, + "learning_rate": 0.00019917578273737907, + "loss": 1.3042, + "step": 1516 + }, + { + "epoch": 0.03895230621339996, + "grad_norm": 0.8671875, + "learning_rate": 0.00019917521065581127, + "loss": 1.2898, + "step": 1517 + }, + { + "epoch": 0.038977983409321786, + "grad_norm": 0.921875, + "learning_rate": 0.00019917463837659626, + "loss": 1.3854, + "step": 1518 + }, + { + "epoch": 0.03900366060524361, + "grad_norm": 0.984375, + "learning_rate": 0.00019917406589973512, + "loss": 1.3566, + "step": 1519 + }, + { + "epoch": 0.03902933780116542, + "grad_norm": 0.96875, + "learning_rate": 0.00019917349322522902, + "loss": 1.2936, + "step": 1520 + }, + { + "epoch": 0.039055014997087245, + "grad_norm": 0.9609375, + "learning_rate": 0.00019917292035307913, + "loss": 1.3708, + "step": 1521 + }, + { + "epoch": 0.03908069219300906, + "grad_norm": 0.9296875, + "learning_rate": 0.00019917234728328658, + "loss": 1.5504, + "step": 1522 + }, + { + "epoch": 0.03910636938893088, + "grad_norm": 1.0078125, + "learning_rate": 0.00019917177401585246, + "loss": 1.3721, + "step": 1523 + }, + { + "epoch": 0.0391320465848527, + "grad_norm": 0.90625, + "learning_rate": 0.000199171200550778, + "loss": 1.3206, + "step": 1524 + }, + { + "epoch": 0.03915772378077452, + "grad_norm": 0.92578125, + "learning_rate": 0.00019917062688806425, + "loss": 1.3414, + "step": 1525 + }, + { + "epoch": 0.03918340097669634, + "grad_norm": 1.0078125, + "learning_rate": 0.00019917005302771244, + "loss": 1.4972, + "step": 1526 + }, + { + "epoch": 0.03920907817261816, + "grad_norm": 1.015625, + "learning_rate": 0.00019916947896972365, + "loss": 1.4944, + "step": 1527 + }, + { + "epoch": 0.03923475536853998, + "grad_norm": 0.875, + "learning_rate": 0.00019916890471409905, + "loss": 1.4231, + "step": 1528 + }, + { + "epoch": 0.039260432564461795, + "grad_norm": 0.9296875, + "learning_rate": 0.00019916833026083975, + "loss": 1.4347, + "step": 1529 + }, + { + "epoch": 0.03928610976038362, + "grad_norm": 1.0078125, + "learning_rate": 0.00019916775560994697, + "loss": 1.3661, + "step": 1530 + }, + { + "epoch": 0.03931178695630544, + "grad_norm": 1.0078125, + "learning_rate": 0.0001991671807614218, + "loss": 1.5283, + "step": 1531 + }, + { + "epoch": 0.039337464152227254, + "grad_norm": 0.94140625, + "learning_rate": 0.00019916660571526538, + "loss": 1.4445, + "step": 1532 + }, + { + "epoch": 0.039363141348149076, + "grad_norm": 1.5390625, + "learning_rate": 0.00019916603047147888, + "loss": 1.6612, + "step": 1533 + }, + { + "epoch": 0.03938881854407089, + "grad_norm": 0.9765625, + "learning_rate": 0.00019916545503006344, + "loss": 1.3085, + "step": 1534 + }, + { + "epoch": 0.039414495739992714, + "grad_norm": 0.9296875, + "learning_rate": 0.00019916487939102023, + "loss": 1.3679, + "step": 1535 + }, + { + "epoch": 0.039440172935914536, + "grad_norm": 0.95703125, + "learning_rate": 0.00019916430355435032, + "loss": 1.2591, + "step": 1536 + }, + { + "epoch": 0.03946585013183635, + "grad_norm": 1.0078125, + "learning_rate": 0.00019916372752005495, + "loss": 1.3744, + "step": 1537 + }, + { + "epoch": 0.03949152732775817, + "grad_norm": 0.95703125, + "learning_rate": 0.0001991631512881352, + "loss": 1.3794, + "step": 1538 + }, + { + "epoch": 0.03951720452367999, + "grad_norm": 0.8671875, + "learning_rate": 0.00019916257485859228, + "loss": 1.3393, + "step": 1539 + }, + { + "epoch": 0.03954288171960181, + "grad_norm": 0.9140625, + "learning_rate": 0.0001991619982314273, + "loss": 1.3299, + "step": 1540 + }, + { + "epoch": 0.03956855891552363, + "grad_norm": 0.9609375, + "learning_rate": 0.0001991614214066414, + "loss": 1.51, + "step": 1541 + }, + { + "epoch": 0.03959423611144545, + "grad_norm": 0.93359375, + "learning_rate": 0.00019916084438423575, + "loss": 1.314, + "step": 1542 + }, + { + "epoch": 0.03961991330736727, + "grad_norm": 0.92578125, + "learning_rate": 0.0001991602671642115, + "loss": 1.3667, + "step": 1543 + }, + { + "epoch": 0.039645590503289085, + "grad_norm": 0.9921875, + "learning_rate": 0.00019915968974656978, + "loss": 1.3948, + "step": 1544 + }, + { + "epoch": 0.03967126769921091, + "grad_norm": 1.0, + "learning_rate": 0.00019915911213131177, + "loss": 1.3902, + "step": 1545 + }, + { + "epoch": 0.03969694489513273, + "grad_norm": 0.98046875, + "learning_rate": 0.0001991585343184386, + "loss": 1.4541, + "step": 1546 + }, + { + "epoch": 0.039722622091054545, + "grad_norm": 0.96484375, + "learning_rate": 0.00019915795630795144, + "loss": 1.3656, + "step": 1547 + }, + { + "epoch": 0.03974829928697637, + "grad_norm": 0.94921875, + "learning_rate": 0.0001991573780998514, + "loss": 1.3135, + "step": 1548 + }, + { + "epoch": 0.03977397648289818, + "grad_norm": 1.015625, + "learning_rate": 0.00019915679969413969, + "loss": 1.2685, + "step": 1549 + }, + { + "epoch": 0.039799653678820004, + "grad_norm": 0.98046875, + "learning_rate": 0.00019915622109081745, + "loss": 1.4244, + "step": 1550 + }, + { + "epoch": 0.03982533087474183, + "grad_norm": 0.984375, + "learning_rate": 0.0001991556422898858, + "loss": 1.4732, + "step": 1551 + }, + { + "epoch": 0.03985100807066364, + "grad_norm": 0.9765625, + "learning_rate": 0.00019915506329134586, + "loss": 1.3535, + "step": 1552 + }, + { + "epoch": 0.039876685266585464, + "grad_norm": 0.96484375, + "learning_rate": 0.00019915448409519893, + "loss": 1.2659, + "step": 1553 + }, + { + "epoch": 0.03990236246250728, + "grad_norm": 0.94140625, + "learning_rate": 0.000199153904701446, + "loss": 1.3599, + "step": 1554 + }, + { + "epoch": 0.0399280396584291, + "grad_norm": 0.953125, + "learning_rate": 0.00019915332511008833, + "loss": 1.3674, + "step": 1555 + }, + { + "epoch": 0.039953716854350924, + "grad_norm": 0.98828125, + "learning_rate": 0.00019915274532112702, + "loss": 1.5626, + "step": 1556 + }, + { + "epoch": 0.03997939405027274, + "grad_norm": 0.9375, + "learning_rate": 0.00019915216533456325, + "loss": 1.4518, + "step": 1557 + }, + { + "epoch": 0.04000507124619456, + "grad_norm": 1.28125, + "learning_rate": 0.00019915158515039817, + "loss": 1.4206, + "step": 1558 + }, + { + "epoch": 0.040030748442116376, + "grad_norm": 0.9296875, + "learning_rate": 0.00019915100476863295, + "loss": 1.2435, + "step": 1559 + }, + { + "epoch": 0.0400564256380382, + "grad_norm": 0.94140625, + "learning_rate": 0.00019915042418926871, + "loss": 1.3289, + "step": 1560 + }, + { + "epoch": 0.04008210283396002, + "grad_norm": 1.0625, + "learning_rate": 0.00019914984341230666, + "loss": 1.4664, + "step": 1561 + }, + { + "epoch": 0.040107780029881836, + "grad_norm": 0.875, + "learning_rate": 0.0001991492624377479, + "loss": 1.3392, + "step": 1562 + }, + { + "epoch": 0.04013345722580366, + "grad_norm": 0.95703125, + "learning_rate": 0.00019914868126559365, + "loss": 1.4754, + "step": 1563 + }, + { + "epoch": 0.04015913442172547, + "grad_norm": 0.96875, + "learning_rate": 0.000199148099895845, + "loss": 1.1943, + "step": 1564 + }, + { + "epoch": 0.040184811617647295, + "grad_norm": 0.953125, + "learning_rate": 0.00019914751832850316, + "loss": 1.3792, + "step": 1565 + }, + { + "epoch": 0.04021048881356912, + "grad_norm": 0.98046875, + "learning_rate": 0.00019914693656356927, + "loss": 1.4542, + "step": 1566 + }, + { + "epoch": 0.04023616600949093, + "grad_norm": 1.0234375, + "learning_rate": 0.0001991463546010445, + "loss": 1.3897, + "step": 1567 + }, + { + "epoch": 0.040261843205412755, + "grad_norm": 0.98046875, + "learning_rate": 0.00019914577244092998, + "loss": 1.3639, + "step": 1568 + }, + { + "epoch": 0.04028752040133457, + "grad_norm": 0.93359375, + "learning_rate": 0.0001991451900832269, + "loss": 1.4334, + "step": 1569 + }, + { + "epoch": 0.04031319759725639, + "grad_norm": 0.94140625, + "learning_rate": 0.0001991446075279364, + "loss": 1.3039, + "step": 1570 + }, + { + "epoch": 0.040338874793178214, + "grad_norm": 0.98828125, + "learning_rate": 0.00019914402477505967, + "loss": 1.3307, + "step": 1571 + }, + { + "epoch": 0.04036455198910003, + "grad_norm": 0.98828125, + "learning_rate": 0.00019914344182459786, + "loss": 1.3686, + "step": 1572 + }, + { + "epoch": 0.04039022918502185, + "grad_norm": 0.8359375, + "learning_rate": 0.00019914285867655212, + "loss": 1.245, + "step": 1573 + }, + { + "epoch": 0.04041590638094367, + "grad_norm": 1.015625, + "learning_rate": 0.0001991422753309236, + "loss": 1.388, + "step": 1574 + }, + { + "epoch": 0.04044158357686549, + "grad_norm": 0.96484375, + "learning_rate": 0.0001991416917877135, + "loss": 1.365, + "step": 1575 + }, + { + "epoch": 0.040467260772787304, + "grad_norm": 0.96484375, + "learning_rate": 0.00019914110804692295, + "loss": 1.3769, + "step": 1576 + }, + { + "epoch": 0.040492937968709126, + "grad_norm": 0.84375, + "learning_rate": 0.00019914052410855315, + "loss": 1.1926, + "step": 1577 + }, + { + "epoch": 0.04051861516463095, + "grad_norm": 0.9453125, + "learning_rate": 0.00019913993997260524, + "loss": 1.1941, + "step": 1578 + }, + { + "epoch": 0.040544292360552764, + "grad_norm": 0.99609375, + "learning_rate": 0.00019913935563908034, + "loss": 1.3306, + "step": 1579 + }, + { + "epoch": 0.040569969556474586, + "grad_norm": 0.95703125, + "learning_rate": 0.0001991387711079797, + "loss": 1.3418, + "step": 1580 + }, + { + "epoch": 0.0405956467523964, + "grad_norm": 1.0, + "learning_rate": 0.00019913818637930445, + "loss": 1.3973, + "step": 1581 + }, + { + "epoch": 0.04062132394831822, + "grad_norm": 1.0, + "learning_rate": 0.0001991376014530557, + "loss": 1.426, + "step": 1582 + }, + { + "epoch": 0.040647001144240046, + "grad_norm": 1.046875, + "learning_rate": 0.0001991370163292347, + "loss": 1.5343, + "step": 1583 + }, + { + "epoch": 0.04067267834016186, + "grad_norm": 1.2109375, + "learning_rate": 0.0001991364310078426, + "loss": 1.4846, + "step": 1584 + }, + { + "epoch": 0.04069835553608368, + "grad_norm": 0.9140625, + "learning_rate": 0.00019913584548888054, + "loss": 1.4194, + "step": 1585 + }, + { + "epoch": 0.0407240327320055, + "grad_norm": 0.8984375, + "learning_rate": 0.00019913525977234968, + "loss": 1.1677, + "step": 1586 + }, + { + "epoch": 0.04074970992792732, + "grad_norm": 0.94921875, + "learning_rate": 0.00019913467385825122, + "loss": 1.4102, + "step": 1587 + }, + { + "epoch": 0.04077538712384914, + "grad_norm": 0.921875, + "learning_rate": 0.0001991340877465863, + "loss": 1.276, + "step": 1588 + }, + { + "epoch": 0.04080106431977096, + "grad_norm": 1.0234375, + "learning_rate": 0.00019913350143735607, + "loss": 1.3191, + "step": 1589 + }, + { + "epoch": 0.04082674151569278, + "grad_norm": 0.91015625, + "learning_rate": 0.00019913291493056176, + "loss": 1.3794, + "step": 1590 + }, + { + "epoch": 0.040852418711614595, + "grad_norm": 0.8203125, + "learning_rate": 0.0001991323282262045, + "loss": 1.3338, + "step": 1591 + }, + { + "epoch": 0.04087809590753642, + "grad_norm": 1.015625, + "learning_rate": 0.00019913174132428547, + "loss": 1.4016, + "step": 1592 + }, + { + "epoch": 0.04090377310345824, + "grad_norm": 0.96484375, + "learning_rate": 0.00019913115422480582, + "loss": 1.3537, + "step": 1593 + }, + { + "epoch": 0.040929450299380055, + "grad_norm": 1.1015625, + "learning_rate": 0.00019913056692776672, + "loss": 1.4297, + "step": 1594 + }, + { + "epoch": 0.04095512749530188, + "grad_norm": 0.98828125, + "learning_rate": 0.00019912997943316937, + "loss": 1.3011, + "step": 1595 + }, + { + "epoch": 0.04098080469122369, + "grad_norm": 0.9296875, + "learning_rate": 0.00019912939174101494, + "loss": 1.3994, + "step": 1596 + }, + { + "epoch": 0.041006481887145514, + "grad_norm": 0.9609375, + "learning_rate": 0.00019912880385130458, + "loss": 1.4406, + "step": 1597 + }, + { + "epoch": 0.041032159083067336, + "grad_norm": 0.9765625, + "learning_rate": 0.00019912821576403947, + "loss": 1.4418, + "step": 1598 + }, + { + "epoch": 0.04105783627898915, + "grad_norm": 0.98828125, + "learning_rate": 0.00019912762747922077, + "loss": 1.595, + "step": 1599 + }, + { + "epoch": 0.041083513474910974, + "grad_norm": 0.97265625, + "learning_rate": 0.00019912703899684965, + "loss": 1.4109, + "step": 1600 + }, + { + "epoch": 0.04110919067083279, + "grad_norm": 0.9375, + "learning_rate": 0.00019912645031692732, + "loss": 1.4194, + "step": 1601 + }, + { + "epoch": 0.04113486786675461, + "grad_norm": 1.0234375, + "learning_rate": 0.00019912586143945493, + "loss": 1.4314, + "step": 1602 + }, + { + "epoch": 0.04116054506267643, + "grad_norm": 1.015625, + "learning_rate": 0.00019912527236443363, + "loss": 1.3965, + "step": 1603 + }, + { + "epoch": 0.04118622225859825, + "grad_norm": 0.8828125, + "learning_rate": 0.00019912468309186463, + "loss": 1.205, + "step": 1604 + }, + { + "epoch": 0.04121189945452007, + "grad_norm": 0.8984375, + "learning_rate": 0.0001991240936217491, + "loss": 1.3464, + "step": 1605 + }, + { + "epoch": 0.041237576650441886, + "grad_norm": 0.91015625, + "learning_rate": 0.00019912350395408817, + "loss": 1.3733, + "step": 1606 + }, + { + "epoch": 0.04126325384636371, + "grad_norm": 0.9609375, + "learning_rate": 0.0001991229140888831, + "loss": 1.3432, + "step": 1607 + }, + { + "epoch": 0.04128893104228553, + "grad_norm": 0.8828125, + "learning_rate": 0.00019912232402613498, + "loss": 1.4234, + "step": 1608 + }, + { + "epoch": 0.041314608238207345, + "grad_norm": 0.87890625, + "learning_rate": 0.00019912173376584503, + "loss": 1.3509, + "step": 1609 + }, + { + "epoch": 0.04134028543412917, + "grad_norm": 0.99609375, + "learning_rate": 0.0001991211433080144, + "loss": 1.266, + "step": 1610 + }, + { + "epoch": 0.04136596263005098, + "grad_norm": 0.8828125, + "learning_rate": 0.00019912055265264433, + "loss": 1.3016, + "step": 1611 + }, + { + "epoch": 0.041391639825972805, + "grad_norm": 1.015625, + "learning_rate": 0.00019911996179973593, + "loss": 1.4575, + "step": 1612 + }, + { + "epoch": 0.04141731702189463, + "grad_norm": 0.96875, + "learning_rate": 0.0001991193707492904, + "loss": 1.5211, + "step": 1613 + }, + { + "epoch": 0.04144299421781644, + "grad_norm": 0.94140625, + "learning_rate": 0.0001991187795013089, + "loss": 1.3238, + "step": 1614 + }, + { + "epoch": 0.041468671413738264, + "grad_norm": 0.91796875, + "learning_rate": 0.00019911818805579265, + "loss": 1.4201, + "step": 1615 + }, + { + "epoch": 0.04149434860966008, + "grad_norm": 0.90234375, + "learning_rate": 0.0001991175964127428, + "loss": 1.3077, + "step": 1616 + }, + { + "epoch": 0.0415200258055819, + "grad_norm": 0.9375, + "learning_rate": 0.0001991170045721605, + "loss": 1.4672, + "step": 1617 + }, + { + "epoch": 0.041545703001503724, + "grad_norm": 0.98828125, + "learning_rate": 0.000199116412534047, + "loss": 1.4139, + "step": 1618 + }, + { + "epoch": 0.04157138019742554, + "grad_norm": 0.87109375, + "learning_rate": 0.00019911582029840346, + "loss": 1.4184, + "step": 1619 + }, + { + "epoch": 0.04159705739334736, + "grad_norm": 0.89453125, + "learning_rate": 0.00019911522786523103, + "loss": 1.4186, + "step": 1620 + }, + { + "epoch": 0.04162273458926918, + "grad_norm": 0.9375, + "learning_rate": 0.00019911463523453088, + "loss": 1.4161, + "step": 1621 + }, + { + "epoch": 0.041648411785191, + "grad_norm": 0.83984375, + "learning_rate": 0.00019911404240630424, + "loss": 1.2604, + "step": 1622 + }, + { + "epoch": 0.04167408898111282, + "grad_norm": 0.93359375, + "learning_rate": 0.00019911344938055222, + "loss": 1.3052, + "step": 1623 + }, + { + "epoch": 0.041699766177034636, + "grad_norm": 1.03125, + "learning_rate": 0.0001991128561572761, + "loss": 1.5536, + "step": 1624 + }, + { + "epoch": 0.04172544337295646, + "grad_norm": 0.94140625, + "learning_rate": 0.00019911226273647698, + "loss": 1.1958, + "step": 1625 + }, + { + "epoch": 0.041751120568878274, + "grad_norm": 0.96484375, + "learning_rate": 0.0001991116691181561, + "loss": 1.3935, + "step": 1626 + }, + { + "epoch": 0.041776797764800096, + "grad_norm": 0.97265625, + "learning_rate": 0.0001991110753023146, + "loss": 1.296, + "step": 1627 + }, + { + "epoch": 0.04180247496072191, + "grad_norm": 0.98046875, + "learning_rate": 0.00019911048128895366, + "loss": 1.3806, + "step": 1628 + }, + { + "epoch": 0.04182815215664373, + "grad_norm": 0.91796875, + "learning_rate": 0.0001991098870780745, + "loss": 1.2721, + "step": 1629 + }, + { + "epoch": 0.041853829352565555, + "grad_norm": 0.98046875, + "learning_rate": 0.0001991092926696783, + "loss": 1.7051, + "step": 1630 + }, + { + "epoch": 0.04187950654848737, + "grad_norm": 0.99609375, + "learning_rate": 0.00019910869806376625, + "loss": 1.5318, + "step": 1631 + }, + { + "epoch": 0.04190518374440919, + "grad_norm": 0.97265625, + "learning_rate": 0.00019910810326033947, + "loss": 1.2405, + "step": 1632 + }, + { + "epoch": 0.04193086094033101, + "grad_norm": 0.875, + "learning_rate": 0.0001991075082593992, + "loss": 1.2528, + "step": 1633 + }, + { + "epoch": 0.04195653813625283, + "grad_norm": 0.90625, + "learning_rate": 0.00019910691306094665, + "loss": 1.4062, + "step": 1634 + }, + { + "epoch": 0.04198221533217465, + "grad_norm": 0.93359375, + "learning_rate": 0.00019910631766498294, + "loss": 1.3453, + "step": 1635 + }, + { + "epoch": 0.04200789252809647, + "grad_norm": 0.89453125, + "learning_rate": 0.00019910572207150931, + "loss": 1.3607, + "step": 1636 + }, + { + "epoch": 0.04203356972401829, + "grad_norm": 0.90234375, + "learning_rate": 0.00019910512628052693, + "loss": 1.2142, + "step": 1637 + }, + { + "epoch": 0.042059246919940105, + "grad_norm": 1.0, + "learning_rate": 0.000199104530292037, + "loss": 1.2022, + "step": 1638 + }, + { + "epoch": 0.04208492411586193, + "grad_norm": 0.96875, + "learning_rate": 0.00019910393410604068, + "loss": 1.5272, + "step": 1639 + }, + { + "epoch": 0.04211060131178375, + "grad_norm": 0.93359375, + "learning_rate": 0.00019910333772253915, + "loss": 1.3595, + "step": 1640 + }, + { + "epoch": 0.042136278507705564, + "grad_norm": 1.0078125, + "learning_rate": 0.00019910274114153363, + "loss": 1.4432, + "step": 1641 + }, + { + "epoch": 0.042161955703627386, + "grad_norm": 0.9453125, + "learning_rate": 0.0001991021443630253, + "loss": 1.225, + "step": 1642 + }, + { + "epoch": 0.0421876328995492, + "grad_norm": 0.9296875, + "learning_rate": 0.00019910154738701536, + "loss": 1.6048, + "step": 1643 + }, + { + "epoch": 0.042213310095471024, + "grad_norm": 0.94140625, + "learning_rate": 0.00019910095021350498, + "loss": 1.3443, + "step": 1644 + }, + { + "epoch": 0.042238987291392846, + "grad_norm": 0.91015625, + "learning_rate": 0.00019910035284249536, + "loss": 1.4495, + "step": 1645 + }, + { + "epoch": 0.04226466448731466, + "grad_norm": 0.9140625, + "learning_rate": 0.0001990997552739877, + "loss": 1.3786, + "step": 1646 + }, + { + "epoch": 0.04229034168323648, + "grad_norm": 0.93359375, + "learning_rate": 0.00019909915750798316, + "loss": 1.3464, + "step": 1647 + }, + { + "epoch": 0.0423160188791583, + "grad_norm": 0.91015625, + "learning_rate": 0.00019909855954448297, + "loss": 1.5096, + "step": 1648 + }, + { + "epoch": 0.04234169607508012, + "grad_norm": 0.95703125, + "learning_rate": 0.00019909796138348828, + "loss": 1.461, + "step": 1649 + }, + { + "epoch": 0.04236737327100194, + "grad_norm": 0.94140625, + "learning_rate": 0.00019909736302500033, + "loss": 1.3549, + "step": 1650 + }, + { + "epoch": 0.04239305046692376, + "grad_norm": 0.96875, + "learning_rate": 0.00019909676446902027, + "loss": 1.4585, + "step": 1651 + }, + { + "epoch": 0.04241872766284558, + "grad_norm": 0.90234375, + "learning_rate": 0.0001990961657155493, + "loss": 1.4361, + "step": 1652 + }, + { + "epoch": 0.042444404858767396, + "grad_norm": 0.91015625, + "learning_rate": 0.00019909556676458862, + "loss": 1.3659, + "step": 1653 + }, + { + "epoch": 0.04247008205468922, + "grad_norm": 1.046875, + "learning_rate": 0.00019909496761613945, + "loss": 1.5284, + "step": 1654 + }, + { + "epoch": 0.04249575925061104, + "grad_norm": 0.91796875, + "learning_rate": 0.00019909436827020294, + "loss": 1.3675, + "step": 1655 + }, + { + "epoch": 0.042521436446532855, + "grad_norm": 0.8984375, + "learning_rate": 0.00019909376872678035, + "loss": 1.5367, + "step": 1656 + }, + { + "epoch": 0.04254711364245468, + "grad_norm": 0.9921875, + "learning_rate": 0.00019909316898587278, + "loss": 1.3617, + "step": 1657 + }, + { + "epoch": 0.04257279083837649, + "grad_norm": 0.94921875, + "learning_rate": 0.0001990925690474815, + "loss": 1.3759, + "step": 1658 + }, + { + "epoch": 0.042598468034298315, + "grad_norm": 1.03125, + "learning_rate": 0.00019909196891160767, + "loss": 1.3926, + "step": 1659 + }, + { + "epoch": 0.04262414523022014, + "grad_norm": 1.0234375, + "learning_rate": 0.0001990913685782525, + "loss": 1.4042, + "step": 1660 + }, + { + "epoch": 0.04264982242614195, + "grad_norm": 1.0, + "learning_rate": 0.00019909076804741716, + "loss": 1.5368, + "step": 1661 + }, + { + "epoch": 0.042675499622063774, + "grad_norm": 0.8359375, + "learning_rate": 0.0001990901673191029, + "loss": 1.2533, + "step": 1662 + }, + { + "epoch": 0.04270117681798559, + "grad_norm": 0.87109375, + "learning_rate": 0.0001990895663933109, + "loss": 1.2974, + "step": 1663 + }, + { + "epoch": 0.04272685401390741, + "grad_norm": 0.91015625, + "learning_rate": 0.00019908896527004232, + "loss": 1.2865, + "step": 1664 + }, + { + "epoch": 0.042752531209829234, + "grad_norm": 0.921875, + "learning_rate": 0.00019908836394929837, + "loss": 1.2297, + "step": 1665 + }, + { + "epoch": 0.04277820840575105, + "grad_norm": 0.9453125, + "learning_rate": 0.0001990877624310803, + "loss": 1.3115, + "step": 1666 + }, + { + "epoch": 0.04280388560167287, + "grad_norm": 0.8984375, + "learning_rate": 0.00019908716071538924, + "loss": 1.3543, + "step": 1667 + }, + { + "epoch": 0.042829562797594686, + "grad_norm": 0.98828125, + "learning_rate": 0.00019908655880222643, + "loss": 1.4775, + "step": 1668 + }, + { + "epoch": 0.04285523999351651, + "grad_norm": 0.87890625, + "learning_rate": 0.00019908595669159307, + "loss": 1.2797, + "step": 1669 + }, + { + "epoch": 0.04288091718943833, + "grad_norm": 0.93359375, + "learning_rate": 0.00019908535438349032, + "loss": 1.4395, + "step": 1670 + }, + { + "epoch": 0.042906594385360146, + "grad_norm": 0.86328125, + "learning_rate": 0.00019908475187791944, + "loss": 1.3158, + "step": 1671 + }, + { + "epoch": 0.04293227158128197, + "grad_norm": 0.91015625, + "learning_rate": 0.00019908414917488157, + "loss": 1.4196, + "step": 1672 + }, + { + "epoch": 0.04295794877720378, + "grad_norm": 0.96875, + "learning_rate": 0.00019908354627437796, + "loss": 1.3912, + "step": 1673 + }, + { + "epoch": 0.042983625973125605, + "grad_norm": 0.9609375, + "learning_rate": 0.00019908294317640978, + "loss": 1.3021, + "step": 1674 + }, + { + "epoch": 0.04300930316904743, + "grad_norm": 0.94921875, + "learning_rate": 0.00019908233988097824, + "loss": 1.2208, + "step": 1675 + }, + { + "epoch": 0.04303498036496924, + "grad_norm": 0.9453125, + "learning_rate": 0.00019908173638808458, + "loss": 1.3677, + "step": 1676 + }, + { + "epoch": 0.043060657560891065, + "grad_norm": 0.91796875, + "learning_rate": 0.00019908113269772993, + "loss": 1.3349, + "step": 1677 + }, + { + "epoch": 0.04308633475681288, + "grad_norm": 0.93359375, + "learning_rate": 0.00019908052880991556, + "loss": 1.4401, + "step": 1678 + }, + { + "epoch": 0.0431120119527347, + "grad_norm": 0.94921875, + "learning_rate": 0.00019907992472464265, + "loss": 1.2382, + "step": 1679 + }, + { + "epoch": 0.04313768914865652, + "grad_norm": 0.875, + "learning_rate": 0.0001990793204419124, + "loss": 1.3762, + "step": 1680 + }, + { + "epoch": 0.04316336634457834, + "grad_norm": 0.984375, + "learning_rate": 0.000199078715961726, + "loss": 1.2757, + "step": 1681 + }, + { + "epoch": 0.04318904354050016, + "grad_norm": 0.9140625, + "learning_rate": 0.00019907811128408467, + "loss": 1.4355, + "step": 1682 + }, + { + "epoch": 0.04321472073642198, + "grad_norm": 0.8984375, + "learning_rate": 0.0001990775064089896, + "loss": 1.4188, + "step": 1683 + }, + { + "epoch": 0.0432403979323438, + "grad_norm": 0.91796875, + "learning_rate": 0.00019907690133644202, + "loss": 1.2681, + "step": 1684 + }, + { + "epoch": 0.043266075128265614, + "grad_norm": 0.859375, + "learning_rate": 0.00019907629606644313, + "loss": 1.2036, + "step": 1685 + }, + { + "epoch": 0.04329175232418744, + "grad_norm": 0.85546875, + "learning_rate": 0.00019907569059899412, + "loss": 1.2254, + "step": 1686 + }, + { + "epoch": 0.04331742952010926, + "grad_norm": 1.03125, + "learning_rate": 0.00019907508493409624, + "loss": 1.6581, + "step": 1687 + }, + { + "epoch": 0.043343106716031074, + "grad_norm": 0.828125, + "learning_rate": 0.00019907447907175062, + "loss": 1.3669, + "step": 1688 + }, + { + "epoch": 0.043368783911952896, + "grad_norm": 0.9296875, + "learning_rate": 0.00019907387301195854, + "loss": 1.2608, + "step": 1689 + }, + { + "epoch": 0.04339446110787471, + "grad_norm": 0.93359375, + "learning_rate": 0.00019907326675472117, + "loss": 1.4503, + "step": 1690 + }, + { + "epoch": 0.043420138303796534, + "grad_norm": 0.94921875, + "learning_rate": 0.00019907266030003975, + "loss": 1.3387, + "step": 1691 + }, + { + "epoch": 0.043445815499718356, + "grad_norm": 0.9375, + "learning_rate": 0.00019907205364791546, + "loss": 1.3771, + "step": 1692 + }, + { + "epoch": 0.04347149269564017, + "grad_norm": 0.86328125, + "learning_rate": 0.0001990714467983495, + "loss": 1.4836, + "step": 1693 + }, + { + "epoch": 0.04349716989156199, + "grad_norm": 0.98046875, + "learning_rate": 0.00019907083975134312, + "loss": 1.2994, + "step": 1694 + }, + { + "epoch": 0.04352284708748381, + "grad_norm": 0.87109375, + "learning_rate": 0.00019907023250689747, + "loss": 1.2572, + "step": 1695 + }, + { + "epoch": 0.04354852428340563, + "grad_norm": 0.8671875, + "learning_rate": 0.00019906962506501384, + "loss": 1.3051, + "step": 1696 + }, + { + "epoch": 0.04357420147932745, + "grad_norm": 0.94921875, + "learning_rate": 0.00019906901742569336, + "loss": 1.1798, + "step": 1697 + }, + { + "epoch": 0.04359987867524927, + "grad_norm": 0.92578125, + "learning_rate": 0.00019906840958893728, + "loss": 1.3665, + "step": 1698 + }, + { + "epoch": 0.04362555587117109, + "grad_norm": 0.890625, + "learning_rate": 0.00019906780155474682, + "loss": 1.273, + "step": 1699 + }, + { + "epoch": 0.043651233067092905, + "grad_norm": 0.953125, + "learning_rate": 0.00019906719332312317, + "loss": 1.4202, + "step": 1700 + }, + { + "epoch": 0.04367691026301473, + "grad_norm": 1.0, + "learning_rate": 0.00019906658489406755, + "loss": 1.3708, + "step": 1701 + }, + { + "epoch": 0.04370258745893655, + "grad_norm": 0.87890625, + "learning_rate": 0.0001990659762675812, + "loss": 1.3034, + "step": 1702 + }, + { + "epoch": 0.043728264654858365, + "grad_norm": 0.8828125, + "learning_rate": 0.0001990653674436653, + "loss": 1.28, + "step": 1703 + }, + { + "epoch": 0.04375394185078019, + "grad_norm": 0.84375, + "learning_rate": 0.00019906475842232103, + "loss": 1.3439, + "step": 1704 + }, + { + "epoch": 0.043779619046702, + "grad_norm": 1.0078125, + "learning_rate": 0.0001990641492035497, + "loss": 1.3427, + "step": 1705 + }, + { + "epoch": 0.043805296242623824, + "grad_norm": 0.875, + "learning_rate": 0.00019906353978735246, + "loss": 1.5143, + "step": 1706 + }, + { + "epoch": 0.043830973438545646, + "grad_norm": 0.93359375, + "learning_rate": 0.00019906293017373053, + "loss": 1.4082, + "step": 1707 + }, + { + "epoch": 0.04385665063446746, + "grad_norm": 0.8828125, + "learning_rate": 0.0001990623203626851, + "loss": 1.1209, + "step": 1708 + }, + { + "epoch": 0.043882327830389284, + "grad_norm": 0.91015625, + "learning_rate": 0.00019906171035421742, + "loss": 1.4365, + "step": 1709 + }, + { + "epoch": 0.0439080050263111, + "grad_norm": 0.9375, + "learning_rate": 0.00019906110014832873, + "loss": 1.379, + "step": 1710 + }, + { + "epoch": 0.04393368222223292, + "grad_norm": 0.953125, + "learning_rate": 0.00019906048974502018, + "loss": 1.3833, + "step": 1711 + }, + { + "epoch": 0.04395935941815474, + "grad_norm": 0.94140625, + "learning_rate": 0.00019905987914429306, + "loss": 1.235, + "step": 1712 + }, + { + "epoch": 0.04398503661407656, + "grad_norm": 0.98046875, + "learning_rate": 0.00019905926834614852, + "loss": 1.4068, + "step": 1713 + }, + { + "epoch": 0.04401071380999838, + "grad_norm": 0.87890625, + "learning_rate": 0.0001990586573505878, + "loss": 1.4553, + "step": 1714 + }, + { + "epoch": 0.044036391005920196, + "grad_norm": 0.9765625, + "learning_rate": 0.00019905804615761213, + "loss": 1.5087, + "step": 1715 + }, + { + "epoch": 0.04406206820184202, + "grad_norm": 1.0, + "learning_rate": 0.00019905743476722275, + "loss": 1.3817, + "step": 1716 + }, + { + "epoch": 0.04408774539776384, + "grad_norm": 0.86328125, + "learning_rate": 0.00019905682317942084, + "loss": 1.3351, + "step": 1717 + }, + { + "epoch": 0.044113422593685656, + "grad_norm": 0.9140625, + "learning_rate": 0.0001990562113942076, + "loss": 1.345, + "step": 1718 + }, + { + "epoch": 0.04413909978960748, + "grad_norm": 0.87890625, + "learning_rate": 0.00019905559941158432, + "loss": 1.3873, + "step": 1719 + }, + { + "epoch": 0.04416477698552929, + "grad_norm": 0.91015625, + "learning_rate": 0.00019905498723155215, + "loss": 1.2417, + "step": 1720 + }, + { + "epoch": 0.044190454181451115, + "grad_norm": 0.9375, + "learning_rate": 0.00019905437485411235, + "loss": 1.5311, + "step": 1721 + }, + { + "epoch": 0.04421613137737294, + "grad_norm": 1.0078125, + "learning_rate": 0.00019905376227926614, + "loss": 1.2231, + "step": 1722 + }, + { + "epoch": 0.04424180857329475, + "grad_norm": 0.90234375, + "learning_rate": 0.0001990531495070147, + "loss": 1.3523, + "step": 1723 + }, + { + "epoch": 0.044267485769216575, + "grad_norm": 0.91015625, + "learning_rate": 0.0001990525365373593, + "loss": 1.2911, + "step": 1724 + }, + { + "epoch": 0.04429316296513839, + "grad_norm": 0.93359375, + "learning_rate": 0.00019905192337030112, + "loss": 1.3291, + "step": 1725 + }, + { + "epoch": 0.04431884016106021, + "grad_norm": 0.8828125, + "learning_rate": 0.00019905131000584142, + "loss": 1.3328, + "step": 1726 + }, + { + "epoch": 0.044344517356982034, + "grad_norm": 0.9765625, + "learning_rate": 0.0001990506964439814, + "loss": 1.4742, + "step": 1727 + }, + { + "epoch": 0.04437019455290385, + "grad_norm": 0.921875, + "learning_rate": 0.0001990500826847223, + "loss": 1.5323, + "step": 1728 + }, + { + "epoch": 0.04439587174882567, + "grad_norm": 0.8828125, + "learning_rate": 0.00019904946872806534, + "loss": 1.3256, + "step": 1729 + }, + { + "epoch": 0.04442154894474749, + "grad_norm": 0.95703125, + "learning_rate": 0.00019904885457401172, + "loss": 1.4141, + "step": 1730 + }, + { + "epoch": 0.04444722614066931, + "grad_norm": 0.91015625, + "learning_rate": 0.0001990482402225627, + "loss": 1.4072, + "step": 1731 + }, + { + "epoch": 0.044472903336591124, + "grad_norm": 1.015625, + "learning_rate": 0.00019904762567371945, + "loss": 1.396, + "step": 1732 + }, + { + "epoch": 0.044498580532512946, + "grad_norm": 0.94921875, + "learning_rate": 0.00019904701092748325, + "loss": 1.2789, + "step": 1733 + }, + { + "epoch": 0.04452425772843477, + "grad_norm": 0.88671875, + "learning_rate": 0.00019904639598385528, + "loss": 1.2924, + "step": 1734 + }, + { + "epoch": 0.044549934924356584, + "grad_norm": 0.8671875, + "learning_rate": 0.00019904578084283683, + "loss": 1.2515, + "step": 1735 + }, + { + "epoch": 0.044575612120278406, + "grad_norm": 0.890625, + "learning_rate": 0.00019904516550442905, + "loss": 1.3907, + "step": 1736 + }, + { + "epoch": 0.04460128931620022, + "grad_norm": 0.87109375, + "learning_rate": 0.00019904454996863322, + "loss": 1.3173, + "step": 1737 + }, + { + "epoch": 0.04462696651212204, + "grad_norm": 0.90234375, + "learning_rate": 0.00019904393423545057, + "loss": 1.4315, + "step": 1738 + }, + { + "epoch": 0.044652643708043865, + "grad_norm": 0.88671875, + "learning_rate": 0.00019904331830488227, + "loss": 1.3145, + "step": 1739 + }, + { + "epoch": 0.04467832090396568, + "grad_norm": 0.90625, + "learning_rate": 0.00019904270217692958, + "loss": 1.3054, + "step": 1740 + }, + { + "epoch": 0.0447039980998875, + "grad_norm": 0.89453125, + "learning_rate": 0.00019904208585159373, + "loss": 1.3834, + "step": 1741 + }, + { + "epoch": 0.04472967529580932, + "grad_norm": 0.9453125, + "learning_rate": 0.00019904146932887598, + "loss": 1.2632, + "step": 1742 + }, + { + "epoch": 0.04475535249173114, + "grad_norm": 0.8828125, + "learning_rate": 0.00019904085260877752, + "loss": 1.3033, + "step": 1743 + }, + { + "epoch": 0.04478102968765296, + "grad_norm": 0.87109375, + "learning_rate": 0.00019904023569129956, + "loss": 1.343, + "step": 1744 + }, + { + "epoch": 0.04480670688357478, + "grad_norm": 0.8828125, + "learning_rate": 0.0001990396185764434, + "loss": 1.2395, + "step": 1745 + }, + { + "epoch": 0.0448323840794966, + "grad_norm": 1.0078125, + "learning_rate": 0.0001990390012642102, + "loss": 1.34, + "step": 1746 + }, + { + "epoch": 0.044858061275418415, + "grad_norm": 0.984375, + "learning_rate": 0.00019903838375460122, + "loss": 1.3444, + "step": 1747 + }, + { + "epoch": 0.04488373847134024, + "grad_norm": 0.97265625, + "learning_rate": 0.0001990377660476177, + "loss": 1.3753, + "step": 1748 + }, + { + "epoch": 0.04490941566726206, + "grad_norm": 0.94921875, + "learning_rate": 0.00019903714814326081, + "loss": 1.2913, + "step": 1749 + }, + { + "epoch": 0.044935092863183874, + "grad_norm": 0.9921875, + "learning_rate": 0.0001990365300415319, + "loss": 1.452, + "step": 1750 + }, + { + "epoch": 0.0449607700591057, + "grad_norm": 0.91015625, + "learning_rate": 0.0001990359117424321, + "loss": 1.381, + "step": 1751 + }, + { + "epoch": 0.04498644725502751, + "grad_norm": 1.046875, + "learning_rate": 0.00019903529324596266, + "loss": 1.3718, + "step": 1752 + }, + { + "epoch": 0.045012124450949334, + "grad_norm": 0.953125, + "learning_rate": 0.00019903467455212483, + "loss": 1.3817, + "step": 1753 + }, + { + "epoch": 0.045037801646871156, + "grad_norm": 1.015625, + "learning_rate": 0.00019903405566091986, + "loss": 1.384, + "step": 1754 + }, + { + "epoch": 0.04506347884279297, + "grad_norm": 1.0546875, + "learning_rate": 0.00019903343657234895, + "loss": 1.364, + "step": 1755 + }, + { + "epoch": 0.045089156038714794, + "grad_norm": 2.953125, + "learning_rate": 0.00019903281728641332, + "loss": 1.138, + "step": 1756 + }, + { + "epoch": 0.04511483323463661, + "grad_norm": 0.97265625, + "learning_rate": 0.00019903219780311428, + "loss": 1.3119, + "step": 1757 + }, + { + "epoch": 0.04514051043055843, + "grad_norm": 0.9140625, + "learning_rate": 0.000199031578122453, + "loss": 1.3028, + "step": 1758 + }, + { + "epoch": 0.04516618762648025, + "grad_norm": 0.8828125, + "learning_rate": 0.0001990309582444307, + "loss": 1.3991, + "step": 1759 + }, + { + "epoch": 0.04519186482240207, + "grad_norm": 0.9296875, + "learning_rate": 0.00019903033816904867, + "loss": 1.196, + "step": 1760 + }, + { + "epoch": 0.04521754201832389, + "grad_norm": 0.890625, + "learning_rate": 0.00019902971789630814, + "loss": 1.3355, + "step": 1761 + }, + { + "epoch": 0.045243219214245706, + "grad_norm": 0.96484375, + "learning_rate": 0.0001990290974262103, + "loss": 1.2933, + "step": 1762 + }, + { + "epoch": 0.04526889641016753, + "grad_norm": 0.890625, + "learning_rate": 0.0001990284767587564, + "loss": 1.3024, + "step": 1763 + }, + { + "epoch": 0.04529457360608935, + "grad_norm": 0.98828125, + "learning_rate": 0.00019902785589394773, + "loss": 1.382, + "step": 1764 + }, + { + "epoch": 0.045320250802011165, + "grad_norm": 0.90625, + "learning_rate": 0.00019902723483178548, + "loss": 1.2414, + "step": 1765 + }, + { + "epoch": 0.04534592799793299, + "grad_norm": 0.9453125, + "learning_rate": 0.0001990266135722709, + "loss": 1.1616, + "step": 1766 + }, + { + "epoch": 0.0453716051938548, + "grad_norm": 0.875, + "learning_rate": 0.00019902599211540518, + "loss": 1.2809, + "step": 1767 + }, + { + "epoch": 0.045397282389776625, + "grad_norm": 0.94140625, + "learning_rate": 0.00019902537046118964, + "loss": 1.4541, + "step": 1768 + }, + { + "epoch": 0.04542295958569845, + "grad_norm": 0.9609375, + "learning_rate": 0.00019902474860962544, + "loss": 1.3831, + "step": 1769 + }, + { + "epoch": 0.04544863678162026, + "grad_norm": 0.96484375, + "learning_rate": 0.00019902412656071393, + "loss": 1.3633, + "step": 1770 + }, + { + "epoch": 0.045474313977542084, + "grad_norm": 0.90234375, + "learning_rate": 0.00019902350431445624, + "loss": 1.2061, + "step": 1771 + }, + { + "epoch": 0.0454999911734639, + "grad_norm": 0.9140625, + "learning_rate": 0.00019902288187085364, + "loss": 1.3202, + "step": 1772 + }, + { + "epoch": 0.04552566836938572, + "grad_norm": 0.91015625, + "learning_rate": 0.0001990222592299074, + "loss": 1.4322, + "step": 1773 + }, + { + "epoch": 0.045551345565307544, + "grad_norm": 0.875, + "learning_rate": 0.00019902163639161876, + "loss": 1.2942, + "step": 1774 + }, + { + "epoch": 0.04557702276122936, + "grad_norm": 0.953125, + "learning_rate": 0.00019902101335598894, + "loss": 1.5953, + "step": 1775 + }, + { + "epoch": 0.04560269995715118, + "grad_norm": 0.94921875, + "learning_rate": 0.00019902039012301915, + "loss": 1.2818, + "step": 1776 + }, + { + "epoch": 0.045628377153072996, + "grad_norm": 0.94921875, + "learning_rate": 0.00019901976669271066, + "loss": 1.4459, + "step": 1777 + }, + { + "epoch": 0.04565405434899482, + "grad_norm": 0.94140625, + "learning_rate": 0.00019901914306506475, + "loss": 1.4129, + "step": 1778 + }, + { + "epoch": 0.04567973154491664, + "grad_norm": 2.640625, + "learning_rate": 0.00019901851924008262, + "loss": 1.2153, + "step": 1779 + }, + { + "epoch": 0.045705408740838456, + "grad_norm": 0.9375, + "learning_rate": 0.00019901789521776554, + "loss": 1.3318, + "step": 1780 + }, + { + "epoch": 0.04573108593676028, + "grad_norm": 0.85546875, + "learning_rate": 0.00019901727099811475, + "loss": 1.3562, + "step": 1781 + }, + { + "epoch": 0.04575676313268209, + "grad_norm": 0.8359375, + "learning_rate": 0.00019901664658113146, + "loss": 1.2037, + "step": 1782 + }, + { + "epoch": 0.045782440328603916, + "grad_norm": 0.9296875, + "learning_rate": 0.00019901602196681695, + "loss": 1.2838, + "step": 1783 + }, + { + "epoch": 0.04580811752452573, + "grad_norm": 0.99609375, + "learning_rate": 0.00019901539715517244, + "loss": 1.3768, + "step": 1784 + }, + { + "epoch": 0.04583379472044755, + "grad_norm": 0.97265625, + "learning_rate": 0.00019901477214619918, + "loss": 1.5139, + "step": 1785 + }, + { + "epoch": 0.045859471916369375, + "grad_norm": 0.8125, + "learning_rate": 0.00019901414693989844, + "loss": 1.325, + "step": 1786 + }, + { + "epoch": 0.04588514911229119, + "grad_norm": 0.91796875, + "learning_rate": 0.00019901352153627145, + "loss": 1.3502, + "step": 1787 + }, + { + "epoch": 0.04591082630821301, + "grad_norm": 0.83203125, + "learning_rate": 0.00019901289593531943, + "loss": 1.3468, + "step": 1788 + }, + { + "epoch": 0.04593650350413483, + "grad_norm": 0.8671875, + "learning_rate": 0.0001990122701370437, + "loss": 1.3555, + "step": 1789 + }, + { + "epoch": 0.04596218070005665, + "grad_norm": 0.91015625, + "learning_rate": 0.00019901164414144542, + "loss": 1.4775, + "step": 1790 + }, + { + "epoch": 0.04598785789597847, + "grad_norm": 0.99609375, + "learning_rate": 0.00019901101794852588, + "loss": 1.4308, + "step": 1791 + }, + { + "epoch": 0.04601353509190029, + "grad_norm": 0.94921875, + "learning_rate": 0.00019901039155828634, + "loss": 1.0866, + "step": 1792 + }, + { + "epoch": 0.04603921228782211, + "grad_norm": 1.0078125, + "learning_rate": 0.000199009764970728, + "loss": 1.4743, + "step": 1793 + }, + { + "epoch": 0.046064889483743925, + "grad_norm": 0.9765625, + "learning_rate": 0.00019900913818585218, + "loss": 1.2047, + "step": 1794 + }, + { + "epoch": 0.04609056667966575, + "grad_norm": 0.96875, + "learning_rate": 0.00019900851120366008, + "loss": 1.4747, + "step": 1795 + }, + { + "epoch": 0.04611624387558757, + "grad_norm": 0.87109375, + "learning_rate": 0.00019900788402415297, + "loss": 1.235, + "step": 1796 + }, + { + "epoch": 0.046141921071509384, + "grad_norm": 0.98828125, + "learning_rate": 0.0001990072566473321, + "loss": 1.4638, + "step": 1797 + }, + { + "epoch": 0.046167598267431206, + "grad_norm": 1.0078125, + "learning_rate": 0.00019900662907319867, + "loss": 1.3177, + "step": 1798 + }, + { + "epoch": 0.04619327546335302, + "grad_norm": 0.99609375, + "learning_rate": 0.00019900600130175398, + "loss": 1.5346, + "step": 1799 + }, + { + "epoch": 0.046218952659274844, + "grad_norm": 0.84765625, + "learning_rate": 0.0001990053733329993, + "loss": 1.3965, + "step": 1800 + }, + { + "epoch": 0.046244629855196666, + "grad_norm": 1.0390625, + "learning_rate": 0.00019900474516693583, + "loss": 1.4277, + "step": 1801 + }, + { + "epoch": 0.04627030705111848, + "grad_norm": 0.87890625, + "learning_rate": 0.00019900411680356486, + "loss": 1.3192, + "step": 1802 + }, + { + "epoch": 0.0462959842470403, + "grad_norm": 0.89453125, + "learning_rate": 0.0001990034882428876, + "loss": 1.4739, + "step": 1803 + }, + { + "epoch": 0.04632166144296212, + "grad_norm": 1.0625, + "learning_rate": 0.00019900285948490538, + "loss": 1.4226, + "step": 1804 + }, + { + "epoch": 0.04634733863888394, + "grad_norm": 1.0078125, + "learning_rate": 0.00019900223052961936, + "loss": 1.4224, + "step": 1805 + }, + { + "epoch": 0.04637301583480576, + "grad_norm": 0.9296875, + "learning_rate": 0.00019900160137703088, + "loss": 1.2938, + "step": 1806 + }, + { + "epoch": 0.04639869303072758, + "grad_norm": 0.8984375, + "learning_rate": 0.0001990009720271411, + "loss": 1.2583, + "step": 1807 + }, + { + "epoch": 0.0464243702266494, + "grad_norm": 0.93359375, + "learning_rate": 0.00019900034247995137, + "loss": 1.258, + "step": 1808 + }, + { + "epoch": 0.046450047422571215, + "grad_norm": 0.8828125, + "learning_rate": 0.00019899971273546288, + "loss": 1.2772, + "step": 1809 + }, + { + "epoch": 0.04647572461849304, + "grad_norm": 1.0234375, + "learning_rate": 0.00019899908279367689, + "loss": 1.3402, + "step": 1810 + }, + { + "epoch": 0.04650140181441486, + "grad_norm": 1.015625, + "learning_rate": 0.0001989984526545947, + "loss": 1.4393, + "step": 1811 + }, + { + "epoch": 0.046527079010336675, + "grad_norm": 0.875, + "learning_rate": 0.00019899782231821753, + "loss": 1.2326, + "step": 1812 + }, + { + "epoch": 0.0465527562062585, + "grad_norm": 0.94140625, + "learning_rate": 0.00019899719178454662, + "loss": 1.2685, + "step": 1813 + }, + { + "epoch": 0.04657843340218031, + "grad_norm": 1.015625, + "learning_rate": 0.0001989965610535833, + "loss": 1.4393, + "step": 1814 + }, + { + "epoch": 0.046604110598102134, + "grad_norm": 0.9296875, + "learning_rate": 0.00019899593012532871, + "loss": 1.2085, + "step": 1815 + }, + { + "epoch": 0.04662978779402396, + "grad_norm": 0.9296875, + "learning_rate": 0.00019899529899978423, + "loss": 1.2921, + "step": 1816 + }, + { + "epoch": 0.04665546498994577, + "grad_norm": 1.484375, + "learning_rate": 0.000198994667676951, + "loss": 1.4882, + "step": 1817 + }, + { + "epoch": 0.046681142185867594, + "grad_norm": 1.0703125, + "learning_rate": 0.00019899403615683038, + "loss": 1.463, + "step": 1818 + }, + { + "epoch": 0.04670681938178941, + "grad_norm": 0.89453125, + "learning_rate": 0.00019899340443942359, + "loss": 1.3184, + "step": 1819 + }, + { + "epoch": 0.04673249657771123, + "grad_norm": 0.9453125, + "learning_rate": 0.0001989927725247319, + "loss": 1.3515, + "step": 1820 + }, + { + "epoch": 0.046758173773633054, + "grad_norm": 0.96875, + "learning_rate": 0.0001989921404127565, + "loss": 1.2751, + "step": 1821 + }, + { + "epoch": 0.04678385096955487, + "grad_norm": 0.94140625, + "learning_rate": 0.00019899150810349876, + "loss": 1.361, + "step": 1822 + }, + { + "epoch": 0.04680952816547669, + "grad_norm": 0.9296875, + "learning_rate": 0.0001989908755969599, + "loss": 1.3211, + "step": 1823 + }, + { + "epoch": 0.046835205361398506, + "grad_norm": 0.9375, + "learning_rate": 0.0001989902428931411, + "loss": 1.3466, + "step": 1824 + }, + { + "epoch": 0.04686088255732033, + "grad_norm": 1.03125, + "learning_rate": 0.00019898960999204372, + "loss": 1.2442, + "step": 1825 + }, + { + "epoch": 0.04688655975324215, + "grad_norm": 1.015625, + "learning_rate": 0.00019898897689366903, + "loss": 1.433, + "step": 1826 + }, + { + "epoch": 0.046912236949163966, + "grad_norm": 0.953125, + "learning_rate": 0.0001989883435980182, + "loss": 1.4258, + "step": 1827 + }, + { + "epoch": 0.04693791414508579, + "grad_norm": 1.0078125, + "learning_rate": 0.00019898771010509254, + "loss": 1.2149, + "step": 1828 + }, + { + "epoch": 0.0469635913410076, + "grad_norm": 0.93359375, + "learning_rate": 0.00019898707641489334, + "loss": 1.3916, + "step": 1829 + }, + { + "epoch": 0.046989268536929425, + "grad_norm": 1.0078125, + "learning_rate": 0.00019898644252742183, + "loss": 1.2549, + "step": 1830 + }, + { + "epoch": 0.04701494573285125, + "grad_norm": 0.92578125, + "learning_rate": 0.00019898580844267928, + "loss": 1.1896, + "step": 1831 + }, + { + "epoch": 0.04704062292877306, + "grad_norm": 0.96875, + "learning_rate": 0.00019898517416066695, + "loss": 1.4658, + "step": 1832 + }, + { + "epoch": 0.047066300124694885, + "grad_norm": 0.92578125, + "learning_rate": 0.00019898453968138612, + "loss": 1.2441, + "step": 1833 + }, + { + "epoch": 0.0470919773206167, + "grad_norm": 0.97265625, + "learning_rate": 0.00019898390500483807, + "loss": 1.3928, + "step": 1834 + }, + { + "epoch": 0.04711765451653852, + "grad_norm": 0.92578125, + "learning_rate": 0.00019898327013102398, + "loss": 1.3401, + "step": 1835 + }, + { + "epoch": 0.04714333171246034, + "grad_norm": 1.0078125, + "learning_rate": 0.00019898263505994522, + "loss": 1.4061, + "step": 1836 + }, + { + "epoch": 0.04716900890838216, + "grad_norm": 0.94140625, + "learning_rate": 0.00019898199979160298, + "loss": 1.5672, + "step": 1837 + }, + { + "epoch": 0.04719468610430398, + "grad_norm": 0.83203125, + "learning_rate": 0.00019898136432599855, + "loss": 1.1383, + "step": 1838 + }, + { + "epoch": 0.0472203633002258, + "grad_norm": 0.91796875, + "learning_rate": 0.00019898072866313325, + "loss": 1.2283, + "step": 1839 + }, + { + "epoch": 0.04724604049614762, + "grad_norm": 0.8671875, + "learning_rate": 0.00019898009280300825, + "loss": 1.221, + "step": 1840 + }, + { + "epoch": 0.047271717692069434, + "grad_norm": 0.97265625, + "learning_rate": 0.0001989794567456249, + "loss": 1.3311, + "step": 1841 + }, + { + "epoch": 0.047297394887991256, + "grad_norm": 0.9296875, + "learning_rate": 0.0001989788204909844, + "loss": 1.4348, + "step": 1842 + }, + { + "epoch": 0.04732307208391308, + "grad_norm": 0.87890625, + "learning_rate": 0.00019897818403908805, + "loss": 1.407, + "step": 1843 + }, + { + "epoch": 0.047348749279834894, + "grad_norm": 0.87890625, + "learning_rate": 0.00019897754738993715, + "loss": 1.3012, + "step": 1844 + }, + { + "epoch": 0.047374426475756716, + "grad_norm": 0.9453125, + "learning_rate": 0.00019897691054353295, + "loss": 1.1661, + "step": 1845 + }, + { + "epoch": 0.04740010367167853, + "grad_norm": 0.9296875, + "learning_rate": 0.00019897627349987668, + "loss": 1.5119, + "step": 1846 + }, + { + "epoch": 0.04742578086760035, + "grad_norm": 0.8359375, + "learning_rate": 0.00019897563625896964, + "loss": 1.2502, + "step": 1847 + }, + { + "epoch": 0.047451458063522176, + "grad_norm": 0.87109375, + "learning_rate": 0.00019897499882081307, + "loss": 1.3346, + "step": 1848 + }, + { + "epoch": 0.04747713525944399, + "grad_norm": 0.99609375, + "learning_rate": 0.0001989743611854083, + "loss": 1.3553, + "step": 1849 + }, + { + "epoch": 0.04750281245536581, + "grad_norm": 0.90625, + "learning_rate": 0.00019897372335275657, + "loss": 1.4586, + "step": 1850 + }, + { + "epoch": 0.04752848965128763, + "grad_norm": 0.91796875, + "learning_rate": 0.00019897308532285915, + "loss": 1.3451, + "step": 1851 + }, + { + "epoch": 0.04755416684720945, + "grad_norm": 0.9140625, + "learning_rate": 0.0001989724470957173, + "loss": 1.4275, + "step": 1852 + }, + { + "epoch": 0.04757984404313127, + "grad_norm": 0.9453125, + "learning_rate": 0.0001989718086713323, + "loss": 1.3249, + "step": 1853 + }, + { + "epoch": 0.04760552123905309, + "grad_norm": 0.8828125, + "learning_rate": 0.00019897117004970545, + "loss": 1.446, + "step": 1854 + }, + { + "epoch": 0.04763119843497491, + "grad_norm": 0.9453125, + "learning_rate": 0.00019897053123083797, + "loss": 1.2704, + "step": 1855 + }, + { + "epoch": 0.047656875630896725, + "grad_norm": 0.9453125, + "learning_rate": 0.00019896989221473117, + "loss": 1.328, + "step": 1856 + }, + { + "epoch": 0.04768255282681855, + "grad_norm": 1.0, + "learning_rate": 0.00019896925300138635, + "loss": 1.3396, + "step": 1857 + }, + { + "epoch": 0.04770823002274037, + "grad_norm": 0.91015625, + "learning_rate": 0.0001989686135908047, + "loss": 1.2413, + "step": 1858 + }, + { + "epoch": 0.047733907218662185, + "grad_norm": 0.8828125, + "learning_rate": 0.00019896797398298753, + "loss": 1.3542, + "step": 1859 + }, + { + "epoch": 0.04775958441458401, + "grad_norm": 0.87890625, + "learning_rate": 0.00019896733417793617, + "loss": 1.2678, + "step": 1860 + }, + { + "epoch": 0.04778526161050582, + "grad_norm": 0.92578125, + "learning_rate": 0.00019896669417565185, + "loss": 1.4454, + "step": 1861 + }, + { + "epoch": 0.047810938806427644, + "grad_norm": 0.84375, + "learning_rate": 0.00019896605397613584, + "loss": 1.159, + "step": 1862 + }, + { + "epoch": 0.047836616002349466, + "grad_norm": 0.95703125, + "learning_rate": 0.00019896541357938943, + "loss": 1.3332, + "step": 1863 + }, + { + "epoch": 0.04786229319827128, + "grad_norm": 0.87890625, + "learning_rate": 0.00019896477298541386, + "loss": 1.3755, + "step": 1864 + }, + { + "epoch": 0.047887970394193104, + "grad_norm": 0.93359375, + "learning_rate": 0.00019896413219421048, + "loss": 1.3697, + "step": 1865 + }, + { + "epoch": 0.04791364759011492, + "grad_norm": 0.9375, + "learning_rate": 0.0001989634912057805, + "loss": 1.2226, + "step": 1866 + }, + { + "epoch": 0.04793932478603674, + "grad_norm": 0.8828125, + "learning_rate": 0.00019896285002012523, + "loss": 1.2025, + "step": 1867 + }, + { + "epoch": 0.04796500198195856, + "grad_norm": 0.83203125, + "learning_rate": 0.00019896220863724592, + "loss": 1.2845, + "step": 1868 + }, + { + "epoch": 0.04799067917788038, + "grad_norm": 0.97265625, + "learning_rate": 0.00019896156705714388, + "loss": 1.1383, + "step": 1869 + }, + { + "epoch": 0.0480163563738022, + "grad_norm": 0.90234375, + "learning_rate": 0.0001989609252798204, + "loss": 1.1869, + "step": 1870 + }, + { + "epoch": 0.048042033569724016, + "grad_norm": 0.96484375, + "learning_rate": 0.0001989602833052767, + "loss": 1.3871, + "step": 1871 + }, + { + "epoch": 0.04806771076564584, + "grad_norm": 1.1328125, + "learning_rate": 0.0001989596411335141, + "loss": 1.2186, + "step": 1872 + }, + { + "epoch": 0.04809338796156766, + "grad_norm": 0.8515625, + "learning_rate": 0.0001989589987645339, + "loss": 1.1982, + "step": 1873 + }, + { + "epoch": 0.048119065157489475, + "grad_norm": 0.90625, + "learning_rate": 0.00019895835619833732, + "loss": 1.3458, + "step": 1874 + }, + { + "epoch": 0.0481447423534113, + "grad_norm": 0.94140625, + "learning_rate": 0.00019895771343492568, + "loss": 1.3958, + "step": 1875 + }, + { + "epoch": 0.04817041954933311, + "grad_norm": 0.8046875, + "learning_rate": 0.0001989570704743003, + "loss": 1.1922, + "step": 1876 + }, + { + "epoch": 0.048196096745254935, + "grad_norm": 0.87109375, + "learning_rate": 0.00019895642731646235, + "loss": 1.3391, + "step": 1877 + }, + { + "epoch": 0.04822177394117676, + "grad_norm": 0.89453125, + "learning_rate": 0.0001989557839614132, + "loss": 1.3069, + "step": 1878 + }, + { + "epoch": 0.04824745113709857, + "grad_norm": 0.94140625, + "learning_rate": 0.00019895514040915412, + "loss": 1.388, + "step": 1879 + }, + { + "epoch": 0.048273128333020394, + "grad_norm": 0.96484375, + "learning_rate": 0.0001989544966596864, + "loss": 1.3724, + "step": 1880 + }, + { + "epoch": 0.04829880552894221, + "grad_norm": 0.98828125, + "learning_rate": 0.0001989538527130113, + "loss": 1.4909, + "step": 1881 + }, + { + "epoch": 0.04832448272486403, + "grad_norm": 0.953125, + "learning_rate": 0.00019895320856913012, + "loss": 1.2209, + "step": 1882 + }, + { + "epoch": 0.048350159920785854, + "grad_norm": 0.9765625, + "learning_rate": 0.0001989525642280441, + "loss": 1.4584, + "step": 1883 + }, + { + "epoch": 0.04837583711670767, + "grad_norm": 0.9140625, + "learning_rate": 0.00019895191968975458, + "loss": 1.2348, + "step": 1884 + }, + { + "epoch": 0.04840151431262949, + "grad_norm": 0.91796875, + "learning_rate": 0.00019895127495426284, + "loss": 1.3479, + "step": 1885 + }, + { + "epoch": 0.04842719150855131, + "grad_norm": 0.94140625, + "learning_rate": 0.0001989506300215701, + "loss": 1.2382, + "step": 1886 + }, + { + "epoch": 0.04845286870447313, + "grad_norm": 0.87890625, + "learning_rate": 0.00019894998489167773, + "loss": 1.232, + "step": 1887 + }, + { + "epoch": 0.048478545900394944, + "grad_norm": 0.84765625, + "learning_rate": 0.00019894933956458695, + "loss": 1.3108, + "step": 1888 + }, + { + "epoch": 0.048504223096316766, + "grad_norm": 0.93359375, + "learning_rate": 0.0001989486940402991, + "loss": 1.3861, + "step": 1889 + }, + { + "epoch": 0.04852990029223859, + "grad_norm": 0.92578125, + "learning_rate": 0.00019894804831881543, + "loss": 1.3553, + "step": 1890 + }, + { + "epoch": 0.048555577488160404, + "grad_norm": 0.91796875, + "learning_rate": 0.00019894740240013726, + "loss": 1.2326, + "step": 1891 + }, + { + "epoch": 0.048581254684082226, + "grad_norm": 0.9921875, + "learning_rate": 0.00019894675628426582, + "loss": 1.4462, + "step": 1892 + }, + { + "epoch": 0.04860693188000404, + "grad_norm": 0.88671875, + "learning_rate": 0.00019894610997120245, + "loss": 1.2508, + "step": 1893 + }, + { + "epoch": 0.04863260907592586, + "grad_norm": 0.890625, + "learning_rate": 0.0001989454634609484, + "loss": 1.3946, + "step": 1894 + }, + { + "epoch": 0.048658286271847685, + "grad_norm": 0.90234375, + "learning_rate": 0.000198944816753505, + "loss": 1.5199, + "step": 1895 + }, + { + "epoch": 0.0486839634677695, + "grad_norm": 0.9375, + "learning_rate": 0.00019894416984887352, + "loss": 1.3076, + "step": 1896 + }, + { + "epoch": 0.04870964066369132, + "grad_norm": 0.92578125, + "learning_rate": 0.00019894352274705523, + "loss": 1.2816, + "step": 1897 + }, + { + "epoch": 0.04873531785961314, + "grad_norm": 0.9453125, + "learning_rate": 0.00019894287544805145, + "loss": 1.2654, + "step": 1898 + }, + { + "epoch": 0.04876099505553496, + "grad_norm": 1.03125, + "learning_rate": 0.00019894222795186346, + "loss": 1.4038, + "step": 1899 + }, + { + "epoch": 0.04878667225145678, + "grad_norm": 0.92578125, + "learning_rate": 0.00019894158025849255, + "loss": 1.3736, + "step": 1900 + }, + { + "epoch": 0.0488123494473786, + "grad_norm": 0.97265625, + "learning_rate": 0.00019894093236794, + "loss": 1.4918, + "step": 1901 + }, + { + "epoch": 0.04883802664330042, + "grad_norm": 0.875, + "learning_rate": 0.0001989402842802071, + "loss": 1.2634, + "step": 1902 + }, + { + "epoch": 0.048863703839222235, + "grad_norm": 1.1171875, + "learning_rate": 0.00019893963599529517, + "loss": 1.4167, + "step": 1903 + }, + { + "epoch": 0.04888938103514406, + "grad_norm": 0.89453125, + "learning_rate": 0.00019893898751320544, + "loss": 1.2536, + "step": 1904 + }, + { + "epoch": 0.04891505823106588, + "grad_norm": 0.8984375, + "learning_rate": 0.0001989383388339393, + "loss": 1.2672, + "step": 1905 + }, + { + "epoch": 0.048940735426987694, + "grad_norm": 0.9140625, + "learning_rate": 0.00019893768995749795, + "loss": 1.3471, + "step": 1906 + }, + { + "epoch": 0.048966412622909516, + "grad_norm": 0.8671875, + "learning_rate": 0.00019893704088388273, + "loss": 1.4547, + "step": 1907 + }, + { + "epoch": 0.04899208981883133, + "grad_norm": 0.93359375, + "learning_rate": 0.00019893639161309493, + "loss": 1.3532, + "step": 1908 + }, + { + "epoch": 0.049017767014753154, + "grad_norm": 0.90234375, + "learning_rate": 0.00019893574214513584, + "loss": 1.3043, + "step": 1909 + }, + { + "epoch": 0.049043444210674976, + "grad_norm": 0.84375, + "learning_rate": 0.00019893509248000677, + "loss": 1.0792, + "step": 1910 + }, + { + "epoch": 0.04906912140659679, + "grad_norm": 0.9296875, + "learning_rate": 0.00019893444261770898, + "loss": 1.0859, + "step": 1911 + }, + { + "epoch": 0.04909479860251861, + "grad_norm": 0.9765625, + "learning_rate": 0.0001989337925582438, + "loss": 1.4225, + "step": 1912 + }, + { + "epoch": 0.04912047579844043, + "grad_norm": 1.0, + "learning_rate": 0.00019893314230161246, + "loss": 1.2433, + "step": 1913 + }, + { + "epoch": 0.04914615299436225, + "grad_norm": 0.94921875, + "learning_rate": 0.00019893249184781633, + "loss": 1.3691, + "step": 1914 + }, + { + "epoch": 0.04917183019028407, + "grad_norm": 0.94921875, + "learning_rate": 0.00019893184119685667, + "loss": 1.3235, + "step": 1915 + }, + { + "epoch": 0.04919750738620589, + "grad_norm": 0.85546875, + "learning_rate": 0.00019893119034873483, + "loss": 1.1618, + "step": 1916 + }, + { + "epoch": 0.04922318458212771, + "grad_norm": 0.88671875, + "learning_rate": 0.00019893053930345202, + "loss": 1.2079, + "step": 1917 + }, + { + "epoch": 0.049248861778049526, + "grad_norm": 0.97265625, + "learning_rate": 0.00019892988806100958, + "loss": 1.2573, + "step": 1918 + }, + { + "epoch": 0.04927453897397135, + "grad_norm": 0.96484375, + "learning_rate": 0.00019892923662140883, + "loss": 1.2765, + "step": 1919 + }, + { + "epoch": 0.04930021616989317, + "grad_norm": 0.9453125, + "learning_rate": 0.00019892858498465107, + "loss": 1.3863, + "step": 1920 + }, + { + "epoch": 0.049325893365814985, + "grad_norm": 1.0625, + "learning_rate": 0.0001989279331507375, + "loss": 1.3919, + "step": 1921 + }, + { + "epoch": 0.04935157056173681, + "grad_norm": 0.85546875, + "learning_rate": 0.0001989272811196696, + "loss": 1.3087, + "step": 1922 + }, + { + "epoch": 0.04937724775765862, + "grad_norm": 0.98046875, + "learning_rate": 0.00019892662889144848, + "loss": 1.2897, + "step": 1923 + }, + { + "epoch": 0.049402924953580445, + "grad_norm": 0.86328125, + "learning_rate": 0.00019892597646607556, + "loss": 1.2133, + "step": 1924 + }, + { + "epoch": 0.04942860214950227, + "grad_norm": 0.85546875, + "learning_rate": 0.0001989253238435521, + "loss": 1.3948, + "step": 1925 + }, + { + "epoch": 0.04945427934542408, + "grad_norm": 0.90625, + "learning_rate": 0.00019892467102387943, + "loss": 1.4706, + "step": 1926 + }, + { + "epoch": 0.049479956541345904, + "grad_norm": 0.89453125, + "learning_rate": 0.00019892401800705877, + "loss": 1.2848, + "step": 1927 + }, + { + "epoch": 0.04950563373726772, + "grad_norm": 0.92578125, + "learning_rate": 0.00019892336479309153, + "loss": 1.2711, + "step": 1928 + }, + { + "epoch": 0.04953131093318954, + "grad_norm": 0.8671875, + "learning_rate": 0.00019892271138197894, + "loss": 1.1948, + "step": 1929 + }, + { + "epoch": 0.049556988129111364, + "grad_norm": 0.8671875, + "learning_rate": 0.0001989220577737223, + "loss": 1.2033, + "step": 1930 + }, + { + "epoch": 0.04958266532503318, + "grad_norm": 0.8984375, + "learning_rate": 0.00019892140396832297, + "loss": 1.4853, + "step": 1931 + }, + { + "epoch": 0.049608342520955, + "grad_norm": 0.87890625, + "learning_rate": 0.00019892074996578218, + "loss": 1.4383, + "step": 1932 + }, + { + "epoch": 0.049634019716876816, + "grad_norm": 0.83203125, + "learning_rate": 0.00019892009576610132, + "loss": 1.1994, + "step": 1933 + }, + { + "epoch": 0.04965969691279864, + "grad_norm": 0.87890625, + "learning_rate": 0.0001989194413692816, + "loss": 1.3445, + "step": 1934 + }, + { + "epoch": 0.04968537410872046, + "grad_norm": 0.93359375, + "learning_rate": 0.00019891878677532438, + "loss": 1.3417, + "step": 1935 + }, + { + "epoch": 0.049711051304642276, + "grad_norm": 0.90234375, + "learning_rate": 0.00019891813198423094, + "loss": 1.2185, + "step": 1936 + }, + { + "epoch": 0.0497367285005641, + "grad_norm": 0.95703125, + "learning_rate": 0.00019891747699600263, + "loss": 1.3291, + "step": 1937 + }, + { + "epoch": 0.04976240569648591, + "grad_norm": 0.90234375, + "learning_rate": 0.00019891682181064072, + "loss": 1.2572, + "step": 1938 + }, + { + "epoch": 0.049788082892407735, + "grad_norm": 0.97265625, + "learning_rate": 0.0001989161664281465, + "loss": 1.2999, + "step": 1939 + }, + { + "epoch": 0.04981376008832955, + "grad_norm": 0.90234375, + "learning_rate": 0.0001989155108485213, + "loss": 1.1443, + "step": 1940 + }, + { + "epoch": 0.04983943728425137, + "grad_norm": 0.87890625, + "learning_rate": 0.00019891485507176647, + "loss": 1.3962, + "step": 1941 + }, + { + "epoch": 0.049865114480173195, + "grad_norm": 0.9296875, + "learning_rate": 0.0001989141990978832, + "loss": 1.2528, + "step": 1942 + }, + { + "epoch": 0.04989079167609501, + "grad_norm": 0.9453125, + "learning_rate": 0.0001989135429268729, + "loss": 1.3721, + "step": 1943 + }, + { + "epoch": 0.04991646887201683, + "grad_norm": 0.98828125, + "learning_rate": 0.00019891288655873683, + "loss": 1.3572, + "step": 1944 + }, + { + "epoch": 0.04994214606793865, + "grad_norm": 0.890625, + "learning_rate": 0.0001989122299934763, + "loss": 1.2663, + "step": 1945 + }, + { + "epoch": 0.04996782326386047, + "grad_norm": 0.96484375, + "learning_rate": 0.00019891157323109266, + "loss": 1.3917, + "step": 1946 + }, + { + "epoch": 0.04999350045978229, + "grad_norm": 0.86328125, + "learning_rate": 0.0001989109162715872, + "loss": 1.311, + "step": 1947 + }, + { + "epoch": 0.05001917765570411, + "grad_norm": 0.921875, + "learning_rate": 0.00019891025911496118, + "loss": 1.2017, + "step": 1948 + }, + { + "epoch": 0.05004485485162593, + "grad_norm": 0.90625, + "learning_rate": 0.000198909601761216, + "loss": 1.2428, + "step": 1949 + }, + { + "epoch": 0.050070532047547744, + "grad_norm": 0.89453125, + "learning_rate": 0.00019890894421035284, + "loss": 1.3438, + "step": 1950 + }, + { + "epoch": 0.05009620924346957, + "grad_norm": 0.98046875, + "learning_rate": 0.00019890828646237314, + "loss": 1.3809, + "step": 1951 + }, + { + "epoch": 0.05012188643939139, + "grad_norm": 1.0390625, + "learning_rate": 0.00019890762851727814, + "loss": 1.3385, + "step": 1952 + }, + { + "epoch": 0.050147563635313204, + "grad_norm": 0.91015625, + "learning_rate": 0.00019890697037506917, + "loss": 1.4331, + "step": 1953 + }, + { + "epoch": 0.050173240831235026, + "grad_norm": 0.8828125, + "learning_rate": 0.00019890631203574758, + "loss": 1.2484, + "step": 1954 + }, + { + "epoch": 0.05019891802715684, + "grad_norm": 0.92578125, + "learning_rate": 0.0001989056534993146, + "loss": 1.3241, + "step": 1955 + }, + { + "epoch": 0.050224595223078664, + "grad_norm": 0.97265625, + "learning_rate": 0.0001989049947657716, + "loss": 1.3935, + "step": 1956 + }, + { + "epoch": 0.050250272419000486, + "grad_norm": 0.9296875, + "learning_rate": 0.0001989043358351199, + "loss": 1.3358, + "step": 1957 + }, + { + "epoch": 0.0502759496149223, + "grad_norm": 0.90625, + "learning_rate": 0.00019890367670736078, + "loss": 1.2177, + "step": 1958 + }, + { + "epoch": 0.05030162681084412, + "grad_norm": 0.84375, + "learning_rate": 0.00019890301738249554, + "loss": 1.2815, + "step": 1959 + }, + { + "epoch": 0.05032730400676594, + "grad_norm": 0.91796875, + "learning_rate": 0.00019890235786052557, + "loss": 1.2568, + "step": 1960 + }, + { + "epoch": 0.05035298120268776, + "grad_norm": 0.8515625, + "learning_rate": 0.0001989016981414521, + "loss": 1.2639, + "step": 1961 + }, + { + "epoch": 0.05037865839860958, + "grad_norm": 0.93359375, + "learning_rate": 0.00019890103822527649, + "loss": 1.2524, + "step": 1962 + }, + { + "epoch": 0.0504043355945314, + "grad_norm": 0.95703125, + "learning_rate": 0.00019890037811200003, + "loss": 1.3949, + "step": 1963 + }, + { + "epoch": 0.05043001279045322, + "grad_norm": 0.91015625, + "learning_rate": 0.00019889971780162407, + "loss": 1.3615, + "step": 1964 + }, + { + "epoch": 0.050455689986375035, + "grad_norm": 0.94140625, + "learning_rate": 0.00019889905729414991, + "loss": 1.516, + "step": 1965 + }, + { + "epoch": 0.05048136718229686, + "grad_norm": 0.9765625, + "learning_rate": 0.00019889839658957884, + "loss": 1.3248, + "step": 1966 + }, + { + "epoch": 0.05050704437821868, + "grad_norm": 0.91015625, + "learning_rate": 0.00019889773568791222, + "loss": 1.4225, + "step": 1967 + }, + { + "epoch": 0.050532721574140495, + "grad_norm": 0.9609375, + "learning_rate": 0.00019889707458915133, + "loss": 1.5194, + "step": 1968 + }, + { + "epoch": 0.05055839877006232, + "grad_norm": 0.95703125, + "learning_rate": 0.00019889641329329748, + "loss": 1.4552, + "step": 1969 + }, + { + "epoch": 0.05058407596598413, + "grad_norm": 0.92578125, + "learning_rate": 0.00019889575180035205, + "loss": 1.4266, + "step": 1970 + }, + { + "epoch": 0.050609753161905954, + "grad_norm": 0.96875, + "learning_rate": 0.0001988950901103163, + "loss": 1.3355, + "step": 1971 + }, + { + "epoch": 0.050635430357827776, + "grad_norm": 0.9140625, + "learning_rate": 0.00019889442822319158, + "loss": 1.4212, + "step": 1972 + }, + { + "epoch": 0.05066110755374959, + "grad_norm": 0.90625, + "learning_rate": 0.0001988937661389792, + "loss": 1.369, + "step": 1973 + }, + { + "epoch": 0.050686784749671414, + "grad_norm": 0.90234375, + "learning_rate": 0.00019889310385768047, + "loss": 1.3675, + "step": 1974 + }, + { + "epoch": 0.05071246194559323, + "grad_norm": 0.8984375, + "learning_rate": 0.00019889244137929674, + "loss": 1.3959, + "step": 1975 + }, + { + "epoch": 0.05073813914151505, + "grad_norm": 0.9609375, + "learning_rate": 0.00019889177870382926, + "loss": 1.2925, + "step": 1976 + }, + { + "epoch": 0.05076381633743687, + "grad_norm": 0.87109375, + "learning_rate": 0.00019889111583127944, + "loss": 1.2357, + "step": 1977 + }, + { + "epoch": 0.05078949353335869, + "grad_norm": 1.046875, + "learning_rate": 0.00019889045276164855, + "loss": 1.164, + "step": 1978 + }, + { + "epoch": 0.05081517072928051, + "grad_norm": 0.9765625, + "learning_rate": 0.0001988897894949379, + "loss": 1.2161, + "step": 1979 + }, + { + "epoch": 0.050840847925202326, + "grad_norm": 0.95703125, + "learning_rate": 0.00019888912603114887, + "loss": 1.4026, + "step": 1980 + }, + { + "epoch": 0.05086652512112415, + "grad_norm": 0.87890625, + "learning_rate": 0.00019888846237028272, + "loss": 1.31, + "step": 1981 + }, + { + "epoch": 0.05089220231704597, + "grad_norm": 0.9453125, + "learning_rate": 0.00019888779851234077, + "loss": 1.3797, + "step": 1982 + }, + { + "epoch": 0.050917879512967786, + "grad_norm": 0.9296875, + "learning_rate": 0.00019888713445732442, + "loss": 1.3592, + "step": 1983 + }, + { + "epoch": 0.05094355670888961, + "grad_norm": 0.9140625, + "learning_rate": 0.00019888647020523492, + "loss": 1.34, + "step": 1984 + }, + { + "epoch": 0.05096923390481142, + "grad_norm": 0.89453125, + "learning_rate": 0.00019888580575607362, + "loss": 1.2588, + "step": 1985 + }, + { + "epoch": 0.050994911100733245, + "grad_norm": 0.9765625, + "learning_rate": 0.00019888514110984185, + "loss": 1.2873, + "step": 1986 + }, + { + "epoch": 0.05102058829665507, + "grad_norm": 0.88671875, + "learning_rate": 0.0001988844762665409, + "loss": 1.2102, + "step": 1987 + }, + { + "epoch": 0.05104626549257688, + "grad_norm": 0.91015625, + "learning_rate": 0.00019888381122617213, + "loss": 1.402, + "step": 1988 + }, + { + "epoch": 0.051071942688498705, + "grad_norm": 0.84765625, + "learning_rate": 0.00019888314598873687, + "loss": 1.2647, + "step": 1989 + }, + { + "epoch": 0.05109761988442052, + "grad_norm": 0.98046875, + "learning_rate": 0.00019888248055423643, + "loss": 1.4236, + "step": 1990 + }, + { + "epoch": 0.05112329708034234, + "grad_norm": 0.9375, + "learning_rate": 0.00019888181492267216, + "loss": 1.3762, + "step": 1991 + }, + { + "epoch": 0.05114897427626416, + "grad_norm": 0.91796875, + "learning_rate": 0.0001988811490940453, + "loss": 1.431, + "step": 1992 + }, + { + "epoch": 0.05117465147218598, + "grad_norm": 0.921875, + "learning_rate": 0.00019888048306835728, + "loss": 1.1737, + "step": 1993 + }, + { + "epoch": 0.0512003286681078, + "grad_norm": 0.87890625, + "learning_rate": 0.0001988798168456094, + "loss": 1.2038, + "step": 1994 + }, + { + "epoch": 0.05122600586402962, + "grad_norm": 0.95703125, + "learning_rate": 0.00019887915042580295, + "loss": 1.368, + "step": 1995 + }, + { + "epoch": 0.05125168305995144, + "grad_norm": 0.88671875, + "learning_rate": 0.00019887848380893935, + "loss": 1.4637, + "step": 1996 + }, + { + "epoch": 0.051277360255873254, + "grad_norm": 0.96484375, + "learning_rate": 0.0001988778169950198, + "loss": 1.3273, + "step": 1997 + }, + { + "epoch": 0.051303037451795076, + "grad_norm": 0.94140625, + "learning_rate": 0.0001988771499840457, + "loss": 1.3009, + "step": 1998 + }, + { + "epoch": 0.0513287146477169, + "grad_norm": 0.953125, + "learning_rate": 0.0001988764827760184, + "loss": 1.4057, + "step": 1999 + }, + { + "epoch": 0.051354391843638714, + "grad_norm": 0.875, + "learning_rate": 0.00019887581537093917, + "loss": 1.35, + "step": 2000 + }, + { + "epoch": 0.051354391843638714, + "eval_loss": 1.3078194856643677, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 408.4975, + "eval_samples_per_second": 24.48, + "eval_steps_per_second": 0.766, + "step": 2000 + }, + { + "epoch": 0.051380069039560536, + "grad_norm": 0.875, + "learning_rate": 0.00019887514776880936, + "loss": 1.0943, + "step": 2001 + }, + { + "epoch": 0.05140574623548235, + "grad_norm": 0.9375, + "learning_rate": 0.00019887447996963035, + "loss": 1.3075, + "step": 2002 + }, + { + "epoch": 0.05143142343140417, + "grad_norm": 0.8984375, + "learning_rate": 0.0001988738119734034, + "loss": 1.4501, + "step": 2003 + }, + { + "epoch": 0.051457100627325995, + "grad_norm": 0.94140625, + "learning_rate": 0.00019887314378012988, + "loss": 1.3165, + "step": 2004 + }, + { + "epoch": 0.05148277782324781, + "grad_norm": 0.98828125, + "learning_rate": 0.00019887247538981115, + "loss": 1.38, + "step": 2005 + }, + { + "epoch": 0.05150845501916963, + "grad_norm": 0.91796875, + "learning_rate": 0.00019887180680244849, + "loss": 1.3203, + "step": 2006 + }, + { + "epoch": 0.05153413221509145, + "grad_norm": 0.87109375, + "learning_rate": 0.00019887113801804324, + "loss": 1.2588, + "step": 2007 + }, + { + "epoch": 0.05155980941101327, + "grad_norm": 0.91015625, + "learning_rate": 0.00019887046903659675, + "loss": 1.4877, + "step": 2008 + }, + { + "epoch": 0.05158548660693509, + "grad_norm": 1.0078125, + "learning_rate": 0.00019886979985811034, + "loss": 1.4762, + "step": 2009 + }, + { + "epoch": 0.05161116380285691, + "grad_norm": 0.9375, + "learning_rate": 0.00019886913048258533, + "loss": 1.3109, + "step": 2010 + }, + { + "epoch": 0.05163684099877873, + "grad_norm": 0.8984375, + "learning_rate": 0.00019886846091002311, + "loss": 1.0714, + "step": 2011 + }, + { + "epoch": 0.051662518194700545, + "grad_norm": 0.890625, + "learning_rate": 0.00019886779114042496, + "loss": 1.2042, + "step": 2012 + }, + { + "epoch": 0.05168819539062237, + "grad_norm": 0.9296875, + "learning_rate": 0.00019886712117379225, + "loss": 1.4327, + "step": 2013 + }, + { + "epoch": 0.05171387258654419, + "grad_norm": 0.890625, + "learning_rate": 0.00019886645101012626, + "loss": 1.2262, + "step": 2014 + }, + { + "epoch": 0.051739549782466004, + "grad_norm": 0.953125, + "learning_rate": 0.00019886578064942843, + "loss": 1.3222, + "step": 2015 + }, + { + "epoch": 0.05176522697838783, + "grad_norm": 0.8984375, + "learning_rate": 0.00019886511009169997, + "loss": 1.2282, + "step": 2016 + }, + { + "epoch": 0.05179090417430964, + "grad_norm": 0.9296875, + "learning_rate": 0.00019886443933694227, + "loss": 1.1666, + "step": 2017 + }, + { + "epoch": 0.051816581370231464, + "grad_norm": 0.921875, + "learning_rate": 0.00019886376838515671, + "loss": 1.4997, + "step": 2018 + }, + { + "epoch": 0.051842258566153286, + "grad_norm": 0.8984375, + "learning_rate": 0.0001988630972363446, + "loss": 1.3359, + "step": 2019 + }, + { + "epoch": 0.0518679357620751, + "grad_norm": 0.8984375, + "learning_rate": 0.00019886242589050722, + "loss": 1.2791, + "step": 2020 + }, + { + "epoch": 0.051893612957996924, + "grad_norm": 0.89453125, + "learning_rate": 0.000198861754347646, + "loss": 1.2563, + "step": 2021 + }, + { + "epoch": 0.05191929015391874, + "grad_norm": 0.91015625, + "learning_rate": 0.0001988610826077622, + "loss": 1.3144, + "step": 2022 + }, + { + "epoch": 0.05194496734984056, + "grad_norm": 0.90625, + "learning_rate": 0.00019886041067085721, + "loss": 1.2859, + "step": 2023 + }, + { + "epoch": 0.05197064454576238, + "grad_norm": 0.8671875, + "learning_rate": 0.00019885973853693236, + "loss": 1.1412, + "step": 2024 + }, + { + "epoch": 0.0519963217416842, + "grad_norm": 1.0078125, + "learning_rate": 0.00019885906620598895, + "loss": 1.0788, + "step": 2025 + }, + { + "epoch": 0.05202199893760602, + "grad_norm": 1.03125, + "learning_rate": 0.00019885839367802838, + "loss": 1.3189, + "step": 2026 + }, + { + "epoch": 0.052047676133527836, + "grad_norm": 0.94921875, + "learning_rate": 0.00019885772095305196, + "loss": 1.3785, + "step": 2027 + }, + { + "epoch": 0.05207335332944966, + "grad_norm": 0.875, + "learning_rate": 0.00019885704803106102, + "loss": 1.301, + "step": 2028 + }, + { + "epoch": 0.05209903052537148, + "grad_norm": 0.92578125, + "learning_rate": 0.0001988563749120569, + "loss": 1.3255, + "step": 2029 + }, + { + "epoch": 0.052124707721293295, + "grad_norm": 0.93359375, + "learning_rate": 0.00019885570159604095, + "loss": 1.2634, + "step": 2030 + }, + { + "epoch": 0.05215038491721512, + "grad_norm": 0.94140625, + "learning_rate": 0.00019885502808301457, + "loss": 1.3346, + "step": 2031 + }, + { + "epoch": 0.05217606211313693, + "grad_norm": 0.85546875, + "learning_rate": 0.000198854354372979, + "loss": 1.2621, + "step": 2032 + }, + { + "epoch": 0.052201739309058755, + "grad_norm": 0.87890625, + "learning_rate": 0.00019885368046593566, + "loss": 1.2375, + "step": 2033 + }, + { + "epoch": 0.05222741650498058, + "grad_norm": 0.91796875, + "learning_rate": 0.00019885300636188587, + "loss": 1.2985, + "step": 2034 + }, + { + "epoch": 0.05225309370090239, + "grad_norm": 0.90234375, + "learning_rate": 0.00019885233206083094, + "loss": 1.3735, + "step": 2035 + }, + { + "epoch": 0.052278770896824214, + "grad_norm": 0.96875, + "learning_rate": 0.00019885165756277224, + "loss": 1.3123, + "step": 2036 + }, + { + "epoch": 0.05230444809274603, + "grad_norm": 0.984375, + "learning_rate": 0.00019885098286771114, + "loss": 1.2665, + "step": 2037 + }, + { + "epoch": 0.05233012528866785, + "grad_norm": 0.9140625, + "learning_rate": 0.00019885030797564893, + "loss": 1.1628, + "step": 2038 + }, + { + "epoch": 0.052355802484589674, + "grad_norm": 0.9609375, + "learning_rate": 0.00019884963288658702, + "loss": 1.2593, + "step": 2039 + }, + { + "epoch": 0.05238147968051149, + "grad_norm": 0.98046875, + "learning_rate": 0.00019884895760052668, + "loss": 1.306, + "step": 2040 + }, + { + "epoch": 0.05240715687643331, + "grad_norm": 0.95703125, + "learning_rate": 0.00019884828211746934, + "loss": 1.5319, + "step": 2041 + }, + { + "epoch": 0.052432834072355126, + "grad_norm": 0.9609375, + "learning_rate": 0.00019884760643741628, + "loss": 1.4409, + "step": 2042 + }, + { + "epoch": 0.05245851126827695, + "grad_norm": 0.9375, + "learning_rate": 0.00019884693056036886, + "loss": 1.1899, + "step": 2043 + }, + { + "epoch": 0.052484188464198764, + "grad_norm": 1.015625, + "learning_rate": 0.00019884625448632844, + "loss": 1.1932, + "step": 2044 + }, + { + "epoch": 0.052509865660120586, + "grad_norm": 0.921875, + "learning_rate": 0.0001988455782152964, + "loss": 1.0321, + "step": 2045 + }, + { + "epoch": 0.05253554285604241, + "grad_norm": 0.87109375, + "learning_rate": 0.000198844901747274, + "loss": 1.2805, + "step": 2046 + }, + { + "epoch": 0.05256122005196422, + "grad_norm": 1.0078125, + "learning_rate": 0.00019884422508226267, + "loss": 1.1858, + "step": 2047 + }, + { + "epoch": 0.052586897247886046, + "grad_norm": 0.921875, + "learning_rate": 0.0001988435482202637, + "loss": 1.3124, + "step": 2048 + }, + { + "epoch": 0.05261257444380786, + "grad_norm": 1.0625, + "learning_rate": 0.00019884287116127852, + "loss": 1.3196, + "step": 2049 + }, + { + "epoch": 0.05263825163972968, + "grad_norm": 0.90234375, + "learning_rate": 0.00019884219390530836, + "loss": 1.3815, + "step": 2050 + }, + { + "epoch": 0.052663928835651505, + "grad_norm": 0.98046875, + "learning_rate": 0.00019884151645235468, + "loss": 1.2959, + "step": 2051 + }, + { + "epoch": 0.05268960603157332, + "grad_norm": 0.875, + "learning_rate": 0.00019884083880241876, + "loss": 1.3616, + "step": 2052 + }, + { + "epoch": 0.05271528322749514, + "grad_norm": 0.9453125, + "learning_rate": 0.000198840160955502, + "loss": 1.3333, + "step": 2053 + }, + { + "epoch": 0.05274096042341696, + "grad_norm": 0.8828125, + "learning_rate": 0.00019883948291160572, + "loss": 1.3365, + "step": 2054 + }, + { + "epoch": 0.05276663761933878, + "grad_norm": 0.953125, + "learning_rate": 0.0001988388046707313, + "loss": 1.4535, + "step": 2055 + }, + { + "epoch": 0.0527923148152606, + "grad_norm": 0.91796875, + "learning_rate": 0.00019883812623288003, + "loss": 1.3846, + "step": 2056 + }, + { + "epoch": 0.05281799201118242, + "grad_norm": 0.91796875, + "learning_rate": 0.00019883744759805331, + "loss": 1.2507, + "step": 2057 + }, + { + "epoch": 0.05284366920710424, + "grad_norm": 0.9375, + "learning_rate": 0.0001988367687662525, + "loss": 1.4706, + "step": 2058 + }, + { + "epoch": 0.052869346403026055, + "grad_norm": 0.98828125, + "learning_rate": 0.00019883608973747893, + "loss": 1.4118, + "step": 2059 + }, + { + "epoch": 0.05289502359894788, + "grad_norm": 0.94921875, + "learning_rate": 0.00019883541051173394, + "loss": 1.3776, + "step": 2060 + }, + { + "epoch": 0.0529207007948697, + "grad_norm": 0.9453125, + "learning_rate": 0.00019883473108901897, + "loss": 1.2741, + "step": 2061 + }, + { + "epoch": 0.052946377990791514, + "grad_norm": 0.9375, + "learning_rate": 0.00019883405146933525, + "loss": 1.3325, + "step": 2062 + }, + { + "epoch": 0.052972055186713336, + "grad_norm": 0.90625, + "learning_rate": 0.0001988333716526842, + "loss": 1.3921, + "step": 2063 + }, + { + "epoch": 0.05299773238263515, + "grad_norm": 0.89453125, + "learning_rate": 0.00019883269163906717, + "loss": 1.2357, + "step": 2064 + }, + { + "epoch": 0.053023409578556974, + "grad_norm": 0.79296875, + "learning_rate": 0.00019883201142848554, + "loss": 1.2261, + "step": 2065 + }, + { + "epoch": 0.053049086774478796, + "grad_norm": 1.0, + "learning_rate": 0.0001988313310209406, + "loss": 1.2858, + "step": 2066 + }, + { + "epoch": 0.05307476397040061, + "grad_norm": 0.89453125, + "learning_rate": 0.00019883065041643378, + "loss": 1.37, + "step": 2067 + }, + { + "epoch": 0.05310044116632243, + "grad_norm": 0.88671875, + "learning_rate": 0.0001988299696149664, + "loss": 1.2699, + "step": 2068 + }, + { + "epoch": 0.05312611836224425, + "grad_norm": 0.90625, + "learning_rate": 0.0001988292886165398, + "loss": 1.332, + "step": 2069 + }, + { + "epoch": 0.05315179555816607, + "grad_norm": 0.91796875, + "learning_rate": 0.00019882860742115535, + "loss": 1.2524, + "step": 2070 + }, + { + "epoch": 0.05317747275408789, + "grad_norm": 0.8671875, + "learning_rate": 0.00019882792602881442, + "loss": 1.1156, + "step": 2071 + }, + { + "epoch": 0.05320314995000971, + "grad_norm": 0.94921875, + "learning_rate": 0.00019882724443951838, + "loss": 1.5621, + "step": 2072 + }, + { + "epoch": 0.05322882714593153, + "grad_norm": 1.0, + "learning_rate": 0.00019882656265326852, + "loss": 1.3806, + "step": 2073 + }, + { + "epoch": 0.053254504341853345, + "grad_norm": 0.94140625, + "learning_rate": 0.0001988258806700663, + "loss": 1.2464, + "step": 2074 + }, + { + "epoch": 0.05328018153777517, + "grad_norm": 0.85546875, + "learning_rate": 0.00019882519848991298, + "loss": 1.2882, + "step": 2075 + }, + { + "epoch": 0.05330585873369699, + "grad_norm": 0.89453125, + "learning_rate": 0.00019882451611281, + "loss": 1.2245, + "step": 2076 + }, + { + "epoch": 0.053331535929618805, + "grad_norm": 0.90625, + "learning_rate": 0.00019882383353875868, + "loss": 1.235, + "step": 2077 + }, + { + "epoch": 0.05335721312554063, + "grad_norm": 0.9375, + "learning_rate": 0.00019882315076776038, + "loss": 1.3184, + "step": 2078 + }, + { + "epoch": 0.05338289032146244, + "grad_norm": 0.91796875, + "learning_rate": 0.00019882246779981646, + "loss": 1.4397, + "step": 2079 + }, + { + "epoch": 0.053408567517384264, + "grad_norm": 0.921875, + "learning_rate": 0.0001988217846349283, + "loss": 1.1651, + "step": 2080 + }, + { + "epoch": 0.05343424471330609, + "grad_norm": 0.875, + "learning_rate": 0.00019882110127309726, + "loss": 1.235, + "step": 2081 + }, + { + "epoch": 0.0534599219092279, + "grad_norm": 0.91015625, + "learning_rate": 0.00019882041771432466, + "loss": 1.1791, + "step": 2082 + }, + { + "epoch": 0.053485599105149724, + "grad_norm": 0.96484375, + "learning_rate": 0.00019881973395861194, + "loss": 1.3552, + "step": 2083 + }, + { + "epoch": 0.05351127630107154, + "grad_norm": 0.96875, + "learning_rate": 0.0001988190500059604, + "loss": 1.3097, + "step": 2084 + }, + { + "epoch": 0.05353695349699336, + "grad_norm": 0.9453125, + "learning_rate": 0.0001988183658563714, + "loss": 1.2065, + "step": 2085 + }, + { + "epoch": 0.053562630692915184, + "grad_norm": 0.91015625, + "learning_rate": 0.0001988176815098463, + "loss": 1.3288, + "step": 2086 + }, + { + "epoch": 0.053588307888837, + "grad_norm": 0.875, + "learning_rate": 0.00019881699696638655, + "loss": 1.3044, + "step": 2087 + }, + { + "epoch": 0.05361398508475882, + "grad_norm": 0.89453125, + "learning_rate": 0.0001988163122259934, + "loss": 1.1959, + "step": 2088 + }, + { + "epoch": 0.053639662280680636, + "grad_norm": 0.91796875, + "learning_rate": 0.0001988156272886683, + "loss": 1.2977, + "step": 2089 + }, + { + "epoch": 0.05366533947660246, + "grad_norm": 0.890625, + "learning_rate": 0.00019881494215441258, + "loss": 1.3833, + "step": 2090 + }, + { + "epoch": 0.05369101667252428, + "grad_norm": 0.859375, + "learning_rate": 0.00019881425682322757, + "loss": 1.2247, + "step": 2091 + }, + { + "epoch": 0.053716693868446096, + "grad_norm": 0.92578125, + "learning_rate": 0.0001988135712951147, + "loss": 1.319, + "step": 2092 + }, + { + "epoch": 0.05374237106436792, + "grad_norm": 0.9140625, + "learning_rate": 0.0001988128855700753, + "loss": 1.2458, + "step": 2093 + }, + { + "epoch": 0.05376804826028973, + "grad_norm": 0.84375, + "learning_rate": 0.00019881219964811074, + "loss": 1.1517, + "step": 2094 + }, + { + "epoch": 0.053793725456211555, + "grad_norm": 0.91796875, + "learning_rate": 0.00019881151352922242, + "loss": 1.1995, + "step": 2095 + }, + { + "epoch": 0.05381940265213337, + "grad_norm": 0.94921875, + "learning_rate": 0.00019881082721341168, + "loss": 1.2963, + "step": 2096 + }, + { + "epoch": 0.05384507984805519, + "grad_norm": 0.984375, + "learning_rate": 0.00019881014070067984, + "loss": 1.4053, + "step": 2097 + }, + { + "epoch": 0.053870757043977015, + "grad_norm": 0.9453125, + "learning_rate": 0.00019880945399102835, + "loss": 1.1891, + "step": 2098 + }, + { + "epoch": 0.05389643423989883, + "grad_norm": 0.87890625, + "learning_rate": 0.00019880876708445856, + "loss": 1.3646, + "step": 2099 + }, + { + "epoch": 0.05392211143582065, + "grad_norm": 0.87109375, + "learning_rate": 0.00019880807998097177, + "loss": 1.1036, + "step": 2100 + }, + { + "epoch": 0.05394778863174247, + "grad_norm": 1.0, + "learning_rate": 0.00019880739268056944, + "loss": 1.3872, + "step": 2101 + }, + { + "epoch": 0.05397346582766429, + "grad_norm": 0.9375, + "learning_rate": 0.00019880670518325288, + "loss": 1.2082, + "step": 2102 + }, + { + "epoch": 0.05399914302358611, + "grad_norm": 0.8828125, + "learning_rate": 0.0001988060174890235, + "loss": 1.2335, + "step": 2103 + }, + { + "epoch": 0.05402482021950793, + "grad_norm": 0.92578125, + "learning_rate": 0.00019880532959788263, + "loss": 1.46, + "step": 2104 + }, + { + "epoch": 0.05405049741542975, + "grad_norm": 0.96875, + "learning_rate": 0.00019880464150983168, + "loss": 1.3003, + "step": 2105 + }, + { + "epoch": 0.054076174611351564, + "grad_norm": 0.875, + "learning_rate": 0.000198803953224872, + "loss": 1.4065, + "step": 2106 + }, + { + "epoch": 0.054101851807273386, + "grad_norm": 0.93359375, + "learning_rate": 0.00019880326474300497, + "loss": 1.3131, + "step": 2107 + }, + { + "epoch": 0.05412752900319521, + "grad_norm": 0.89453125, + "learning_rate": 0.00019880257606423197, + "loss": 1.3857, + "step": 2108 + }, + { + "epoch": 0.054153206199117024, + "grad_norm": 0.89453125, + "learning_rate": 0.00019880188718855433, + "loss": 1.2329, + "step": 2109 + }, + { + "epoch": 0.054178883395038846, + "grad_norm": 0.87890625, + "learning_rate": 0.0001988011981159735, + "loss": 1.1935, + "step": 2110 + }, + { + "epoch": 0.05420456059096066, + "grad_norm": 0.89453125, + "learning_rate": 0.00019880050884649078, + "loss": 1.383, + "step": 2111 + }, + { + "epoch": 0.05423023778688248, + "grad_norm": 0.89453125, + "learning_rate": 0.00019879981938010754, + "loss": 1.2846, + "step": 2112 + }, + { + "epoch": 0.054255914982804306, + "grad_norm": 0.875, + "learning_rate": 0.00019879912971682524, + "loss": 1.5588, + "step": 2113 + }, + { + "epoch": 0.05428159217872612, + "grad_norm": 0.91015625, + "learning_rate": 0.00019879843985664514, + "loss": 1.3921, + "step": 2114 + }, + { + "epoch": 0.05430726937464794, + "grad_norm": 0.93359375, + "learning_rate": 0.00019879774979956872, + "loss": 1.2027, + "step": 2115 + }, + { + "epoch": 0.05433294657056976, + "grad_norm": 0.95703125, + "learning_rate": 0.00019879705954559728, + "loss": 1.3014, + "step": 2116 + }, + { + "epoch": 0.05435862376649158, + "grad_norm": 0.9921875, + "learning_rate": 0.00019879636909473226, + "loss": 1.3817, + "step": 2117 + }, + { + "epoch": 0.0543843009624134, + "grad_norm": 0.859375, + "learning_rate": 0.000198795678446975, + "loss": 1.3864, + "step": 2118 + }, + { + "epoch": 0.05440997815833522, + "grad_norm": 0.94140625, + "learning_rate": 0.00019879498760232684, + "loss": 1.3812, + "step": 2119 + }, + { + "epoch": 0.05443565535425704, + "grad_norm": 0.9140625, + "learning_rate": 0.0001987942965607892, + "loss": 1.3025, + "step": 2120 + }, + { + "epoch": 0.054461332550178855, + "grad_norm": 0.83984375, + "learning_rate": 0.00019879360532236347, + "loss": 1.1075, + "step": 2121 + }, + { + "epoch": 0.05448700974610068, + "grad_norm": 0.9375, + "learning_rate": 0.000198792913887051, + "loss": 1.1759, + "step": 2122 + }, + { + "epoch": 0.0545126869420225, + "grad_norm": 0.796875, + "learning_rate": 0.00019879222225485318, + "loss": 1.2904, + "step": 2123 + }, + { + "epoch": 0.054538364137944315, + "grad_norm": 0.921875, + "learning_rate": 0.0001987915304257714, + "loss": 1.1638, + "step": 2124 + }, + { + "epoch": 0.05456404133386614, + "grad_norm": 0.94921875, + "learning_rate": 0.00019879083839980698, + "loss": 1.3074, + "step": 2125 + }, + { + "epoch": 0.05458971852978795, + "grad_norm": 0.9609375, + "learning_rate": 0.00019879014617696136, + "loss": 1.4983, + "step": 2126 + }, + { + "epoch": 0.054615395725709774, + "grad_norm": 0.9140625, + "learning_rate": 0.00019878945375723594, + "loss": 1.2858, + "step": 2127 + }, + { + "epoch": 0.054641072921631596, + "grad_norm": 0.8828125, + "learning_rate": 0.000198788761140632, + "loss": 1.2752, + "step": 2128 + }, + { + "epoch": 0.05466675011755341, + "grad_norm": 0.91796875, + "learning_rate": 0.00019878806832715102, + "loss": 1.2038, + "step": 2129 + }, + { + "epoch": 0.054692427313475234, + "grad_norm": 0.90625, + "learning_rate": 0.00019878737531679437, + "loss": 1.4201, + "step": 2130 + }, + { + "epoch": 0.05471810450939705, + "grad_norm": 0.85546875, + "learning_rate": 0.00019878668210956336, + "loss": 1.2794, + "step": 2131 + }, + { + "epoch": 0.05474378170531887, + "grad_norm": 0.98046875, + "learning_rate": 0.00019878598870545942, + "loss": 1.3725, + "step": 2132 + }, + { + "epoch": 0.05476945890124069, + "grad_norm": 0.87890625, + "learning_rate": 0.00019878529510448397, + "loss": 1.3015, + "step": 2133 + }, + { + "epoch": 0.05479513609716251, + "grad_norm": 1.046875, + "learning_rate": 0.0001987846013066383, + "loss": 1.52, + "step": 2134 + }, + { + "epoch": 0.05482081329308433, + "grad_norm": 0.83984375, + "learning_rate": 0.00019878390731192385, + "loss": 1.3116, + "step": 2135 + }, + { + "epoch": 0.054846490489006146, + "grad_norm": 0.9375, + "learning_rate": 0.00019878321312034202, + "loss": 1.4708, + "step": 2136 + }, + { + "epoch": 0.05487216768492797, + "grad_norm": 0.95703125, + "learning_rate": 0.00019878251873189416, + "loss": 1.313, + "step": 2137 + }, + { + "epoch": 0.05489784488084979, + "grad_norm": 0.875, + "learning_rate": 0.00019878182414658166, + "loss": 1.2716, + "step": 2138 + }, + { + "epoch": 0.054923522076771605, + "grad_norm": 0.859375, + "learning_rate": 0.00019878112936440589, + "loss": 1.2439, + "step": 2139 + }, + { + "epoch": 0.05494919927269343, + "grad_norm": 1.03125, + "learning_rate": 0.00019878043438536827, + "loss": 1.4015, + "step": 2140 + }, + { + "epoch": 0.05497487646861524, + "grad_norm": 0.859375, + "learning_rate": 0.00019877973920947017, + "loss": 1.2873, + "step": 2141 + }, + { + "epoch": 0.055000553664537065, + "grad_norm": 0.85546875, + "learning_rate": 0.00019877904383671296, + "loss": 1.318, + "step": 2142 + }, + { + "epoch": 0.05502623086045888, + "grad_norm": 0.91015625, + "learning_rate": 0.00019877834826709804, + "loss": 1.4446, + "step": 2143 + }, + { + "epoch": 0.0550519080563807, + "grad_norm": 0.9140625, + "learning_rate": 0.0001987776525006268, + "loss": 1.1837, + "step": 2144 + }, + { + "epoch": 0.055077585252302524, + "grad_norm": 0.90625, + "learning_rate": 0.00019877695653730063, + "loss": 1.3495, + "step": 2145 + }, + { + "epoch": 0.05510326244822434, + "grad_norm": 0.92578125, + "learning_rate": 0.00019877626037712091, + "loss": 1.3372, + "step": 2146 + }, + { + "epoch": 0.05512893964414616, + "grad_norm": 0.90625, + "learning_rate": 0.000198775564020089, + "loss": 1.067, + "step": 2147 + }, + { + "epoch": 0.05515461684006798, + "grad_norm": 0.890625, + "learning_rate": 0.00019877486746620635, + "loss": 1.0175, + "step": 2148 + }, + { + "epoch": 0.0551802940359898, + "grad_norm": 0.9609375, + "learning_rate": 0.00019877417071547426, + "loss": 1.3964, + "step": 2149 + }, + { + "epoch": 0.05520597123191162, + "grad_norm": 0.94140625, + "learning_rate": 0.0001987734737678942, + "loss": 1.1235, + "step": 2150 + }, + { + "epoch": 0.05523164842783344, + "grad_norm": 0.92578125, + "learning_rate": 0.00019877277662346755, + "loss": 1.2778, + "step": 2151 + }, + { + "epoch": 0.05525732562375526, + "grad_norm": 0.9296875, + "learning_rate": 0.00019877207928219566, + "loss": 1.3542, + "step": 2152 + }, + { + "epoch": 0.055283002819677074, + "grad_norm": 0.87890625, + "learning_rate": 0.0001987713817440799, + "loss": 1.304, + "step": 2153 + }, + { + "epoch": 0.055308680015598896, + "grad_norm": 0.89453125, + "learning_rate": 0.00019877068400912175, + "loss": 1.3343, + "step": 2154 + }, + { + "epoch": 0.05533435721152072, + "grad_norm": 0.95703125, + "learning_rate": 0.00019876998607732253, + "loss": 1.3362, + "step": 2155 + }, + { + "epoch": 0.055360034407442534, + "grad_norm": 0.91015625, + "learning_rate": 0.00019876928794868365, + "loss": 1.34, + "step": 2156 + }, + { + "epoch": 0.055385711603364356, + "grad_norm": 0.8671875, + "learning_rate": 0.0001987685896232065, + "loss": 1.1004, + "step": 2157 + }, + { + "epoch": 0.05541138879928617, + "grad_norm": 0.97265625, + "learning_rate": 0.0001987678911008925, + "loss": 1.289, + "step": 2158 + }, + { + "epoch": 0.05543706599520799, + "grad_norm": 0.85546875, + "learning_rate": 0.000198767192381743, + "loss": 1.265, + "step": 2159 + }, + { + "epoch": 0.055462743191129815, + "grad_norm": 0.859375, + "learning_rate": 0.00019876649346575937, + "loss": 1.2621, + "step": 2160 + }, + { + "epoch": 0.05548842038705163, + "grad_norm": 0.90625, + "learning_rate": 0.00019876579435294307, + "loss": 1.1374, + "step": 2161 + }, + { + "epoch": 0.05551409758297345, + "grad_norm": 0.94921875, + "learning_rate": 0.00019876509504329544, + "loss": 1.2725, + "step": 2162 + }, + { + "epoch": 0.05553977477889527, + "grad_norm": 0.8671875, + "learning_rate": 0.00019876439553681794, + "loss": 1.2743, + "step": 2163 + }, + { + "epoch": 0.05556545197481709, + "grad_norm": 0.9296875, + "learning_rate": 0.0001987636958335119, + "loss": 1.2597, + "step": 2164 + }, + { + "epoch": 0.05559112917073891, + "grad_norm": 0.93359375, + "learning_rate": 0.00019876299593337871, + "loss": 1.4844, + "step": 2165 + }, + { + "epoch": 0.05561680636666073, + "grad_norm": 0.921875, + "learning_rate": 0.00019876229583641982, + "loss": 1.2919, + "step": 2166 + }, + { + "epoch": 0.05564248356258255, + "grad_norm": 0.90234375, + "learning_rate": 0.0001987615955426366, + "loss": 1.2401, + "step": 2167 + }, + { + "epoch": 0.055668160758504365, + "grad_norm": 0.875, + "learning_rate": 0.00019876089505203043, + "loss": 1.346, + "step": 2168 + }, + { + "epoch": 0.05569383795442619, + "grad_norm": 0.91796875, + "learning_rate": 0.00019876019436460273, + "loss": 1.3235, + "step": 2169 + }, + { + "epoch": 0.05571951515034801, + "grad_norm": 1.0390625, + "learning_rate": 0.00019875949348035485, + "loss": 1.3286, + "step": 2170 + }, + { + "epoch": 0.055745192346269824, + "grad_norm": 0.90234375, + "learning_rate": 0.00019875879239928827, + "loss": 1.3121, + "step": 2171 + }, + { + "epoch": 0.055770869542191646, + "grad_norm": 0.85546875, + "learning_rate": 0.00019875809112140428, + "loss": 1.3647, + "step": 2172 + }, + { + "epoch": 0.05579654673811346, + "grad_norm": 1.015625, + "learning_rate": 0.0001987573896467044, + "loss": 1.4047, + "step": 2173 + }, + { + "epoch": 0.055822223934035284, + "grad_norm": 0.90234375, + "learning_rate": 0.0001987566879751899, + "loss": 1.2958, + "step": 2174 + }, + { + "epoch": 0.055847901129957106, + "grad_norm": 0.93359375, + "learning_rate": 0.00019875598610686227, + "loss": 1.1011, + "step": 2175 + }, + { + "epoch": 0.05587357832587892, + "grad_norm": 1.0078125, + "learning_rate": 0.00019875528404172287, + "loss": 1.3185, + "step": 2176 + }, + { + "epoch": 0.05589925552180074, + "grad_norm": 0.828125, + "learning_rate": 0.00019875458177977313, + "loss": 1.1743, + "step": 2177 + }, + { + "epoch": 0.05592493271772256, + "grad_norm": 0.88671875, + "learning_rate": 0.0001987538793210144, + "loss": 1.305, + "step": 2178 + }, + { + "epoch": 0.05595060991364438, + "grad_norm": 0.94140625, + "learning_rate": 0.00019875317666544814, + "loss": 1.4606, + "step": 2179 + }, + { + "epoch": 0.0559762871095662, + "grad_norm": 0.89453125, + "learning_rate": 0.0001987524738130757, + "loss": 1.2737, + "step": 2180 + }, + { + "epoch": 0.05600196430548802, + "grad_norm": 0.90234375, + "learning_rate": 0.00019875177076389853, + "loss": 1.32, + "step": 2181 + }, + { + "epoch": 0.05602764150140984, + "grad_norm": 0.8828125, + "learning_rate": 0.00019875106751791796, + "loss": 1.3141, + "step": 2182 + }, + { + "epoch": 0.056053318697331656, + "grad_norm": 0.83203125, + "learning_rate": 0.00019875036407513544, + "loss": 1.2447, + "step": 2183 + }, + { + "epoch": 0.05607899589325348, + "grad_norm": 0.87109375, + "learning_rate": 0.0001987496604355524, + "loss": 1.2184, + "step": 2184 + }, + { + "epoch": 0.0561046730891753, + "grad_norm": 0.90625, + "learning_rate": 0.00019874895659917019, + "loss": 1.3967, + "step": 2185 + }, + { + "epoch": 0.056130350285097115, + "grad_norm": 0.95703125, + "learning_rate": 0.0001987482525659902, + "loss": 1.2631, + "step": 2186 + }, + { + "epoch": 0.05615602748101894, + "grad_norm": 0.9765625, + "learning_rate": 0.0001987475483360139, + "loss": 1.4103, + "step": 2187 + }, + { + "epoch": 0.05618170467694075, + "grad_norm": 0.89453125, + "learning_rate": 0.00019874684390924264, + "loss": 1.2202, + "step": 2188 + }, + { + "epoch": 0.056207381872862575, + "grad_norm": 0.94140625, + "learning_rate": 0.00019874613928567785, + "loss": 1.2875, + "step": 2189 + }, + { + "epoch": 0.0562330590687844, + "grad_norm": 0.90625, + "learning_rate": 0.0001987454344653209, + "loss": 1.0618, + "step": 2190 + }, + { + "epoch": 0.05625873626470621, + "grad_norm": 0.83203125, + "learning_rate": 0.00019874472944817324, + "loss": 1.3171, + "step": 2191 + }, + { + "epoch": 0.056284413460628034, + "grad_norm": 0.9296875, + "learning_rate": 0.00019874402423423625, + "loss": 1.2348, + "step": 2192 + }, + { + "epoch": 0.05631009065654985, + "grad_norm": 0.9140625, + "learning_rate": 0.00019874331882351132, + "loss": 1.2365, + "step": 2193 + }, + { + "epoch": 0.05633576785247167, + "grad_norm": 0.9296875, + "learning_rate": 0.00019874261321599989, + "loss": 1.4614, + "step": 2194 + }, + { + "epoch": 0.05636144504839349, + "grad_norm": 1.0234375, + "learning_rate": 0.00019874190741170337, + "loss": 1.3108, + "step": 2195 + }, + { + "epoch": 0.05638712224431531, + "grad_norm": 1.1640625, + "learning_rate": 0.00019874120141062312, + "loss": 1.1305, + "step": 2196 + }, + { + "epoch": 0.05641279944023713, + "grad_norm": 0.96484375, + "learning_rate": 0.0001987404952127606, + "loss": 1.3475, + "step": 2197 + }, + { + "epoch": 0.056438476636158946, + "grad_norm": 1.0546875, + "learning_rate": 0.00019873978881811716, + "loss": 1.1946, + "step": 2198 + }, + { + "epoch": 0.05646415383208077, + "grad_norm": 0.9453125, + "learning_rate": 0.00019873908222669425, + "loss": 1.6327, + "step": 2199 + }, + { + "epoch": 0.056489831028002584, + "grad_norm": 0.94921875, + "learning_rate": 0.00019873837543849326, + "loss": 1.2018, + "step": 2200 + }, + { + "epoch": 0.056515508223924406, + "grad_norm": 0.84765625, + "learning_rate": 0.00019873766845351565, + "loss": 1.1654, + "step": 2201 + }, + { + "epoch": 0.05654118541984623, + "grad_norm": 0.8515625, + "learning_rate": 0.00019873696127176277, + "loss": 1.3326, + "step": 2202 + }, + { + "epoch": 0.05656686261576804, + "grad_norm": 0.90625, + "learning_rate": 0.000198736253893236, + "loss": 1.2287, + "step": 2203 + }, + { + "epoch": 0.056592539811689865, + "grad_norm": 0.984375, + "learning_rate": 0.00019873554631793684, + "loss": 1.2828, + "step": 2204 + }, + { + "epoch": 0.05661821700761168, + "grad_norm": 0.953125, + "learning_rate": 0.00019873483854586664, + "loss": 1.2957, + "step": 2205 + }, + { + "epoch": 0.0566438942035335, + "grad_norm": 0.89453125, + "learning_rate": 0.0001987341305770268, + "loss": 1.2704, + "step": 2206 + }, + { + "epoch": 0.056669571399455325, + "grad_norm": 0.9296875, + "learning_rate": 0.0001987334224114188, + "loss": 1.4488, + "step": 2207 + }, + { + "epoch": 0.05669524859537714, + "grad_norm": 0.92578125, + "learning_rate": 0.00019873271404904398, + "loss": 1.285, + "step": 2208 + }, + { + "epoch": 0.05672092579129896, + "grad_norm": 0.88671875, + "learning_rate": 0.0001987320054899038, + "loss": 1.367, + "step": 2209 + }, + { + "epoch": 0.05674660298722078, + "grad_norm": 0.8828125, + "learning_rate": 0.00019873129673399963, + "loss": 1.2773, + "step": 2210 + }, + { + "epoch": 0.0567722801831426, + "grad_norm": 0.80078125, + "learning_rate": 0.00019873058778133293, + "loss": 1.1573, + "step": 2211 + }, + { + "epoch": 0.05679795737906442, + "grad_norm": 0.87109375, + "learning_rate": 0.00019872987863190508, + "loss": 1.2695, + "step": 2212 + }, + { + "epoch": 0.05682363457498624, + "grad_norm": 0.93359375, + "learning_rate": 0.00019872916928571747, + "loss": 1.2685, + "step": 2213 + }, + { + "epoch": 0.05684931177090806, + "grad_norm": 0.8828125, + "learning_rate": 0.00019872845974277156, + "loss": 1.23, + "step": 2214 + }, + { + "epoch": 0.056874988966829874, + "grad_norm": 0.82421875, + "learning_rate": 0.0001987277500030688, + "loss": 1.3185, + "step": 2215 + }, + { + "epoch": 0.0569006661627517, + "grad_norm": 0.87109375, + "learning_rate": 0.00019872704006661048, + "loss": 1.2539, + "step": 2216 + }, + { + "epoch": 0.05692634335867352, + "grad_norm": 0.89453125, + "learning_rate": 0.00019872632993339813, + "loss": 1.2184, + "step": 2217 + }, + { + "epoch": 0.056952020554595334, + "grad_norm": 0.91015625, + "learning_rate": 0.0001987256196034331, + "loss": 1.2051, + "step": 2218 + }, + { + "epoch": 0.056977697750517156, + "grad_norm": 0.91796875, + "learning_rate": 0.00019872490907671685, + "loss": 1.3704, + "step": 2219 + }, + { + "epoch": 0.05700337494643897, + "grad_norm": 0.84765625, + "learning_rate": 0.00019872419835325074, + "loss": 1.2852, + "step": 2220 + }, + { + "epoch": 0.057029052142360794, + "grad_norm": 0.9140625, + "learning_rate": 0.00019872348743303625, + "loss": 1.3746, + "step": 2221 + }, + { + "epoch": 0.057054729338282616, + "grad_norm": 0.953125, + "learning_rate": 0.00019872277631607474, + "loss": 1.2533, + "step": 2222 + }, + { + "epoch": 0.05708040653420443, + "grad_norm": 0.87890625, + "learning_rate": 0.00019872206500236766, + "loss": 1.221, + "step": 2223 + }, + { + "epoch": 0.05710608373012625, + "grad_norm": 0.9921875, + "learning_rate": 0.00019872135349191644, + "loss": 1.2732, + "step": 2224 + }, + { + "epoch": 0.05713176092604807, + "grad_norm": 0.90625, + "learning_rate": 0.00019872064178472247, + "loss": 1.281, + "step": 2225 + }, + { + "epoch": 0.05715743812196989, + "grad_norm": 0.95703125, + "learning_rate": 0.00019871992988078718, + "loss": 1.2581, + "step": 2226 + }, + { + "epoch": 0.05718311531789171, + "grad_norm": 0.8984375, + "learning_rate": 0.000198719217780112, + "loss": 1.4676, + "step": 2227 + }, + { + "epoch": 0.05720879251381353, + "grad_norm": 0.92578125, + "learning_rate": 0.00019871850548269833, + "loss": 1.1525, + "step": 2228 + }, + { + "epoch": 0.05723446970973535, + "grad_norm": 0.91796875, + "learning_rate": 0.00019871779298854758, + "loss": 1.2833, + "step": 2229 + }, + { + "epoch": 0.057260146905657165, + "grad_norm": 1.0234375, + "learning_rate": 0.00019871708029766118, + "loss": 1.4007, + "step": 2230 + }, + { + "epoch": 0.05728582410157899, + "grad_norm": 0.98828125, + "learning_rate": 0.00019871636741004058, + "loss": 1.2364, + "step": 2231 + }, + { + "epoch": 0.05731150129750081, + "grad_norm": 0.8828125, + "learning_rate": 0.00019871565432568716, + "loss": 1.2648, + "step": 2232 + }, + { + "epoch": 0.057337178493422625, + "grad_norm": 0.87890625, + "learning_rate": 0.00019871494104460239, + "loss": 1.2885, + "step": 2233 + }, + { + "epoch": 0.05736285568934445, + "grad_norm": 0.89453125, + "learning_rate": 0.0001987142275667876, + "loss": 1.4951, + "step": 2234 + }, + { + "epoch": 0.05738853288526626, + "grad_norm": 0.88671875, + "learning_rate": 0.00019871351389224432, + "loss": 1.4172, + "step": 2235 + }, + { + "epoch": 0.057414210081188084, + "grad_norm": 0.859375, + "learning_rate": 0.00019871280002097392, + "loss": 1.2632, + "step": 2236 + }, + { + "epoch": 0.057439887277109906, + "grad_norm": 0.91015625, + "learning_rate": 0.0001987120859529778, + "loss": 1.2752, + "step": 2237 + }, + { + "epoch": 0.05746556447303172, + "grad_norm": 0.8671875, + "learning_rate": 0.00019871137168825744, + "loss": 1.2693, + "step": 2238 + }, + { + "epoch": 0.057491241668953544, + "grad_norm": 0.90234375, + "learning_rate": 0.0001987106572268142, + "loss": 1.3259, + "step": 2239 + }, + { + "epoch": 0.05751691886487536, + "grad_norm": 0.9609375, + "learning_rate": 0.00019870994256864955, + "loss": 1.3788, + "step": 2240 + }, + { + "epoch": 0.05754259606079718, + "grad_norm": 0.90234375, + "learning_rate": 0.00019870922771376496, + "loss": 1.2492, + "step": 2241 + }, + { + "epoch": 0.057568273256719, + "grad_norm": 0.91015625, + "learning_rate": 0.00019870851266216171, + "loss": 1.364, + "step": 2242 + }, + { + "epoch": 0.05759395045264082, + "grad_norm": 0.9296875, + "learning_rate": 0.00019870779741384135, + "loss": 1.4743, + "step": 2243 + }, + { + "epoch": 0.05761962764856264, + "grad_norm": 0.99609375, + "learning_rate": 0.00019870708196880526, + "loss": 1.239, + "step": 2244 + }, + { + "epoch": 0.057645304844484456, + "grad_norm": 0.9375, + "learning_rate": 0.00019870636632705486, + "loss": 1.2229, + "step": 2245 + }, + { + "epoch": 0.05767098204040628, + "grad_norm": 0.9140625, + "learning_rate": 0.0001987056504885916, + "loss": 1.3021, + "step": 2246 + }, + { + "epoch": 0.05769665923632809, + "grad_norm": 1.0078125, + "learning_rate": 0.0001987049344534169, + "loss": 1.3069, + "step": 2247 + }, + { + "epoch": 0.057722336432249916, + "grad_norm": 0.8984375, + "learning_rate": 0.00019870421822153217, + "loss": 1.3793, + "step": 2248 + }, + { + "epoch": 0.05774801362817174, + "grad_norm": 0.8515625, + "learning_rate": 0.00019870350179293885, + "loss": 1.3197, + "step": 2249 + }, + { + "epoch": 0.05777369082409355, + "grad_norm": 0.9375, + "learning_rate": 0.00019870278516763835, + "loss": 1.3622, + "step": 2250 + }, + { + "epoch": 0.057799368020015375, + "grad_norm": 0.98046875, + "learning_rate": 0.00019870206834563217, + "loss": 1.2994, + "step": 2251 + }, + { + "epoch": 0.05782504521593719, + "grad_norm": 0.85546875, + "learning_rate": 0.00019870135132692162, + "loss": 1.3788, + "step": 2252 + }, + { + "epoch": 0.05785072241185901, + "grad_norm": 0.875, + "learning_rate": 0.00019870063411150823, + "loss": 1.3912, + "step": 2253 + }, + { + "epoch": 0.057876399607780835, + "grad_norm": 0.95703125, + "learning_rate": 0.00019869991669939339, + "loss": 1.4139, + "step": 2254 + }, + { + "epoch": 0.05790207680370265, + "grad_norm": 0.80859375, + "learning_rate": 0.0001986991990905785, + "loss": 1.0621, + "step": 2255 + }, + { + "epoch": 0.05792775399962447, + "grad_norm": 0.91796875, + "learning_rate": 0.00019869848128506505, + "loss": 1.1829, + "step": 2256 + }, + { + "epoch": 0.05795343119554629, + "grad_norm": 0.9296875, + "learning_rate": 0.00019869776328285444, + "loss": 1.2203, + "step": 2257 + }, + { + "epoch": 0.05797910839146811, + "grad_norm": 0.9296875, + "learning_rate": 0.00019869704508394814, + "loss": 1.2501, + "step": 2258 + }, + { + "epoch": 0.05800478558738993, + "grad_norm": 0.796875, + "learning_rate": 0.00019869632668834749, + "loss": 0.9786, + "step": 2259 + }, + { + "epoch": 0.05803046278331175, + "grad_norm": 0.9375, + "learning_rate": 0.000198695608096054, + "loss": 1.2736, + "step": 2260 + }, + { + "epoch": 0.05805613997923357, + "grad_norm": 0.94140625, + "learning_rate": 0.00019869488930706906, + "loss": 1.2117, + "step": 2261 + }, + { + "epoch": 0.058081817175155384, + "grad_norm": 0.90234375, + "learning_rate": 0.00019869417032139413, + "loss": 1.2015, + "step": 2262 + }, + { + "epoch": 0.058107494371077206, + "grad_norm": 0.98046875, + "learning_rate": 0.00019869345113903065, + "loss": 1.2175, + "step": 2263 + }, + { + "epoch": 0.05813317156699903, + "grad_norm": 0.9375, + "learning_rate": 0.00019869273175998, + "loss": 1.3594, + "step": 2264 + }, + { + "epoch": 0.058158848762920844, + "grad_norm": 0.92578125, + "learning_rate": 0.00019869201218424367, + "loss": 1.1208, + "step": 2265 + }, + { + "epoch": 0.058184525958842666, + "grad_norm": 0.9296875, + "learning_rate": 0.00019869129241182308, + "loss": 1.2713, + "step": 2266 + }, + { + "epoch": 0.05821020315476448, + "grad_norm": 0.921875, + "learning_rate": 0.00019869057244271967, + "loss": 1.1953, + "step": 2267 + }, + { + "epoch": 0.0582358803506863, + "grad_norm": 0.94140625, + "learning_rate": 0.00019868985227693484, + "loss": 1.4039, + "step": 2268 + }, + { + "epoch": 0.058261557546608125, + "grad_norm": 0.82421875, + "learning_rate": 0.00019868913191447004, + "loss": 1.2397, + "step": 2269 + }, + { + "epoch": 0.05828723474252994, + "grad_norm": 1.0234375, + "learning_rate": 0.00019868841135532673, + "loss": 1.3064, + "step": 2270 + }, + { + "epoch": 0.05831291193845176, + "grad_norm": 0.83984375, + "learning_rate": 0.00019868769059950636, + "loss": 1.1607, + "step": 2271 + }, + { + "epoch": 0.05833858913437358, + "grad_norm": 0.890625, + "learning_rate": 0.0001986869696470103, + "loss": 1.1993, + "step": 2272 + }, + { + "epoch": 0.0583642663302954, + "grad_norm": 0.9296875, + "learning_rate": 0.00019868624849784004, + "loss": 1.4712, + "step": 2273 + }, + { + "epoch": 0.05838994352621722, + "grad_norm": 1.0546875, + "learning_rate": 0.000198685527151997, + "loss": 1.2852, + "step": 2274 + }, + { + "epoch": 0.05841562072213904, + "grad_norm": 0.94921875, + "learning_rate": 0.0001986848056094826, + "loss": 1.4401, + "step": 2275 + }, + { + "epoch": 0.05844129791806086, + "grad_norm": 0.8359375, + "learning_rate": 0.00019868408387029832, + "loss": 1.2137, + "step": 2276 + }, + { + "epoch": 0.058466975113982675, + "grad_norm": 0.91015625, + "learning_rate": 0.00019868336193444556, + "loss": 1.3219, + "step": 2277 + }, + { + "epoch": 0.0584926523099045, + "grad_norm": 0.85546875, + "learning_rate": 0.0001986826398019258, + "loss": 1.2851, + "step": 2278 + }, + { + "epoch": 0.05851832950582632, + "grad_norm": 0.9140625, + "learning_rate": 0.00019868191747274042, + "loss": 1.3395, + "step": 2279 + }, + { + "epoch": 0.058544006701748134, + "grad_norm": 0.921875, + "learning_rate": 0.00019868119494689088, + "loss": 1.3317, + "step": 2280 + }, + { + "epoch": 0.05856968389766996, + "grad_norm": 0.91796875, + "learning_rate": 0.00019868047222437868, + "loss": 1.3788, + "step": 2281 + }, + { + "epoch": 0.05859536109359177, + "grad_norm": 0.90234375, + "learning_rate": 0.00019867974930520518, + "loss": 1.2817, + "step": 2282 + }, + { + "epoch": 0.058621038289513594, + "grad_norm": 0.9453125, + "learning_rate": 0.00019867902618937185, + "loss": 1.3305, + "step": 2283 + }, + { + "epoch": 0.058646715485435416, + "grad_norm": 0.8359375, + "learning_rate": 0.00019867830287688016, + "loss": 1.2652, + "step": 2284 + }, + { + "epoch": 0.05867239268135723, + "grad_norm": 0.86328125, + "learning_rate": 0.0001986775793677315, + "loss": 1.2706, + "step": 2285 + }, + { + "epoch": 0.058698069877279054, + "grad_norm": 0.98828125, + "learning_rate": 0.00019867685566192736, + "loss": 1.5038, + "step": 2286 + }, + { + "epoch": 0.05872374707320087, + "grad_norm": 0.93359375, + "learning_rate": 0.00019867613175946915, + "loss": 1.2135, + "step": 2287 + }, + { + "epoch": 0.05874942426912269, + "grad_norm": 0.87109375, + "learning_rate": 0.00019867540766035831, + "loss": 1.1866, + "step": 2288 + }, + { + "epoch": 0.05877510146504451, + "grad_norm": 0.84375, + "learning_rate": 0.0001986746833645963, + "loss": 1.2882, + "step": 2289 + }, + { + "epoch": 0.05880077866096633, + "grad_norm": 0.96875, + "learning_rate": 0.00019867395887218457, + "loss": 1.4522, + "step": 2290 + }, + { + "epoch": 0.05882645585688815, + "grad_norm": 1.0625, + "learning_rate": 0.00019867323418312457, + "loss": 1.3224, + "step": 2291 + }, + { + "epoch": 0.058852133052809966, + "grad_norm": 0.92578125, + "learning_rate": 0.0001986725092974177, + "loss": 1.1956, + "step": 2292 + }, + { + "epoch": 0.05887781024873179, + "grad_norm": 0.9375, + "learning_rate": 0.00019867178421506545, + "loss": 1.4472, + "step": 2293 + }, + { + "epoch": 0.05890348744465361, + "grad_norm": 0.91796875, + "learning_rate": 0.00019867105893606922, + "loss": 1.4312, + "step": 2294 + }, + { + "epoch": 0.058929164640575425, + "grad_norm": 0.91015625, + "learning_rate": 0.0001986703334604305, + "loss": 1.2157, + "step": 2295 + }, + { + "epoch": 0.05895484183649725, + "grad_norm": 0.859375, + "learning_rate": 0.00019866960778815073, + "loss": 1.268, + "step": 2296 + }, + { + "epoch": 0.05898051903241906, + "grad_norm": 0.84765625, + "learning_rate": 0.00019866888191923134, + "loss": 1.0882, + "step": 2297 + }, + { + "epoch": 0.059006196228340885, + "grad_norm": 0.890625, + "learning_rate": 0.0001986681558536738, + "loss": 1.2015, + "step": 2298 + }, + { + "epoch": 0.0590318734242627, + "grad_norm": 0.890625, + "learning_rate": 0.00019866742959147947, + "loss": 1.2758, + "step": 2299 + }, + { + "epoch": 0.05905755062018452, + "grad_norm": 0.89453125, + "learning_rate": 0.00019866670313264993, + "loss": 1.4245, + "step": 2300 + }, + { + "epoch": 0.059083227816106344, + "grad_norm": 0.890625, + "learning_rate": 0.00019866597647718654, + "loss": 1.1496, + "step": 2301 + }, + { + "epoch": 0.05910890501202816, + "grad_norm": 0.91796875, + "learning_rate": 0.00019866524962509078, + "loss": 1.3092, + "step": 2302 + }, + { + "epoch": 0.05913458220794998, + "grad_norm": 0.9765625, + "learning_rate": 0.00019866452257636408, + "loss": 1.3171, + "step": 2303 + }, + { + "epoch": 0.0591602594038718, + "grad_norm": 0.88671875, + "learning_rate": 0.00019866379533100789, + "loss": 1.0103, + "step": 2304 + }, + { + "epoch": 0.05918593659979362, + "grad_norm": 0.9375, + "learning_rate": 0.00019866306788902369, + "loss": 1.3435, + "step": 2305 + }, + { + "epoch": 0.05921161379571544, + "grad_norm": 0.859375, + "learning_rate": 0.00019866234025041292, + "loss": 1.218, + "step": 2306 + }, + { + "epoch": 0.059237290991637256, + "grad_norm": 0.86328125, + "learning_rate": 0.00019866161241517696, + "loss": 1.1686, + "step": 2307 + }, + { + "epoch": 0.05926296818755908, + "grad_norm": 0.91015625, + "learning_rate": 0.00019866088438331736, + "loss": 1.309, + "step": 2308 + }, + { + "epoch": 0.059288645383480894, + "grad_norm": 0.8203125, + "learning_rate": 0.00019866015615483553, + "loss": 0.9594, + "step": 2309 + }, + { + "epoch": 0.059314322579402716, + "grad_norm": 0.8515625, + "learning_rate": 0.00019865942772973288, + "loss": 1.2569, + "step": 2310 + }, + { + "epoch": 0.05933999977532454, + "grad_norm": 0.9140625, + "learning_rate": 0.00019865869910801095, + "loss": 1.2087, + "step": 2311 + }, + { + "epoch": 0.05936567697124635, + "grad_norm": 0.91796875, + "learning_rate": 0.00019865797028967115, + "loss": 1.2365, + "step": 2312 + }, + { + "epoch": 0.059391354167168176, + "grad_norm": 0.88671875, + "learning_rate": 0.0001986572412747149, + "loss": 1.2693, + "step": 2313 + }, + { + "epoch": 0.05941703136308999, + "grad_norm": 0.8828125, + "learning_rate": 0.0001986565120631437, + "loss": 1.226, + "step": 2314 + }, + { + "epoch": 0.05944270855901181, + "grad_norm": 0.91015625, + "learning_rate": 0.00019865578265495896, + "loss": 1.2406, + "step": 2315 + }, + { + "epoch": 0.059468385754933635, + "grad_norm": 0.83984375, + "learning_rate": 0.00019865505305016216, + "loss": 1.2012, + "step": 2316 + }, + { + "epoch": 0.05949406295085545, + "grad_norm": 0.953125, + "learning_rate": 0.00019865432324875476, + "loss": 1.3467, + "step": 2317 + }, + { + "epoch": 0.05951974014677727, + "grad_norm": 0.98046875, + "learning_rate": 0.0001986535932507382, + "loss": 1.2173, + "step": 2318 + }, + { + "epoch": 0.05954541734269909, + "grad_norm": 0.88671875, + "learning_rate": 0.00019865286305611394, + "loss": 1.3724, + "step": 2319 + }, + { + "epoch": 0.05957109453862091, + "grad_norm": 0.91015625, + "learning_rate": 0.00019865213266488345, + "loss": 1.3275, + "step": 2320 + }, + { + "epoch": 0.05959677173454273, + "grad_norm": 0.8203125, + "learning_rate": 0.00019865140207704816, + "loss": 1.2891, + "step": 2321 + }, + { + "epoch": 0.05962244893046455, + "grad_norm": 0.90625, + "learning_rate": 0.00019865067129260956, + "loss": 1.192, + "step": 2322 + }, + { + "epoch": 0.05964812612638637, + "grad_norm": 0.94921875, + "learning_rate": 0.00019864994031156902, + "loss": 1.2708, + "step": 2323 + }, + { + "epoch": 0.059673803322308185, + "grad_norm": 0.8984375, + "learning_rate": 0.00019864920913392813, + "loss": 1.2473, + "step": 2324 + }, + { + "epoch": 0.05969948051823001, + "grad_norm": 0.984375, + "learning_rate": 0.00019864847775968823, + "loss": 1.116, + "step": 2325 + }, + { + "epoch": 0.05972515771415183, + "grad_norm": 0.8984375, + "learning_rate": 0.00019864774618885082, + "loss": 1.1229, + "step": 2326 + }, + { + "epoch": 0.059750834910073644, + "grad_norm": 0.859375, + "learning_rate": 0.00019864701442141739, + "loss": 1.1474, + "step": 2327 + }, + { + "epoch": 0.059776512105995466, + "grad_norm": 0.859375, + "learning_rate": 0.00019864628245738937, + "loss": 1.3466, + "step": 2328 + }, + { + "epoch": 0.05980218930191728, + "grad_norm": 0.89453125, + "learning_rate": 0.0001986455502967682, + "loss": 1.2491, + "step": 2329 + }, + { + "epoch": 0.059827866497839104, + "grad_norm": 0.8984375, + "learning_rate": 0.00019864481793955538, + "loss": 1.2859, + "step": 2330 + }, + { + "epoch": 0.059853543693760926, + "grad_norm": 0.9296875, + "learning_rate": 0.00019864408538575233, + "loss": 1.2313, + "step": 2331 + }, + { + "epoch": 0.05987922088968274, + "grad_norm": 0.8515625, + "learning_rate": 0.00019864335263536056, + "loss": 1.3255, + "step": 2332 + }, + { + "epoch": 0.05990489808560456, + "grad_norm": 0.9296875, + "learning_rate": 0.0001986426196883815, + "loss": 1.3362, + "step": 2333 + }, + { + "epoch": 0.05993057528152638, + "grad_norm": 0.90234375, + "learning_rate": 0.00019864188654481656, + "loss": 1.2798, + "step": 2334 + }, + { + "epoch": 0.0599562524774482, + "grad_norm": 0.91796875, + "learning_rate": 0.0001986411532046673, + "loss": 1.4122, + "step": 2335 + }, + { + "epoch": 0.05998192967337002, + "grad_norm": 0.8828125, + "learning_rate": 0.0001986404196679351, + "loss": 1.3648, + "step": 2336 + }, + { + "epoch": 0.06000760686929184, + "grad_norm": 0.91796875, + "learning_rate": 0.0001986396859346215, + "loss": 1.3549, + "step": 2337 + }, + { + "epoch": 0.06003328406521366, + "grad_norm": 0.90234375, + "learning_rate": 0.00019863895200472785, + "loss": 1.2714, + "step": 2338 + }, + { + "epoch": 0.060058961261135475, + "grad_norm": 0.87109375, + "learning_rate": 0.00019863821787825573, + "loss": 1.2863, + "step": 2339 + }, + { + "epoch": 0.0600846384570573, + "grad_norm": 0.890625, + "learning_rate": 0.0001986374835552065, + "loss": 1.3616, + "step": 2340 + }, + { + "epoch": 0.06011031565297912, + "grad_norm": 0.8828125, + "learning_rate": 0.0001986367490355817, + "loss": 1.2421, + "step": 2341 + }, + { + "epoch": 0.060135992848900935, + "grad_norm": 1.0390625, + "learning_rate": 0.00019863601431938278, + "loss": 1.354, + "step": 2342 + }, + { + "epoch": 0.06016167004482276, + "grad_norm": 0.94140625, + "learning_rate": 0.00019863527940661117, + "loss": 1.3127, + "step": 2343 + }, + { + "epoch": 0.06018734724074457, + "grad_norm": 0.94140625, + "learning_rate": 0.0001986345442972684, + "loss": 1.1301, + "step": 2344 + }, + { + "epoch": 0.060213024436666394, + "grad_norm": 1.03125, + "learning_rate": 0.00019863380899135587, + "loss": 1.2882, + "step": 2345 + }, + { + "epoch": 0.06023870163258822, + "grad_norm": 0.9296875, + "learning_rate": 0.00019863307348887507, + "loss": 1.3148, + "step": 2346 + }, + { + "epoch": 0.06026437882851003, + "grad_norm": 0.98046875, + "learning_rate": 0.00019863233778982746, + "loss": 1.2027, + "step": 2347 + }, + { + "epoch": 0.060290056024431854, + "grad_norm": 0.9375, + "learning_rate": 0.0001986316018942145, + "loss": 1.1576, + "step": 2348 + }, + { + "epoch": 0.06031573322035367, + "grad_norm": 0.9609375, + "learning_rate": 0.00019863086580203766, + "loss": 1.2426, + "step": 2349 + }, + { + "epoch": 0.06034141041627549, + "grad_norm": 0.93359375, + "learning_rate": 0.00019863012951329846, + "loss": 1.2944, + "step": 2350 + }, + { + "epoch": 0.06036708761219731, + "grad_norm": 0.85546875, + "learning_rate": 0.00019862939302799827, + "loss": 1.2725, + "step": 2351 + }, + { + "epoch": 0.06039276480811913, + "grad_norm": 0.88671875, + "learning_rate": 0.00019862865634613864, + "loss": 1.3806, + "step": 2352 + }, + { + "epoch": 0.06041844200404095, + "grad_norm": 0.87890625, + "learning_rate": 0.000198627919467721, + "loss": 1.2009, + "step": 2353 + }, + { + "epoch": 0.060444119199962766, + "grad_norm": 0.95703125, + "learning_rate": 0.00019862718239274681, + "loss": 1.3075, + "step": 2354 + }, + { + "epoch": 0.06046979639588459, + "grad_norm": 0.8359375, + "learning_rate": 0.0001986264451212176, + "loss": 1.292, + "step": 2355 + }, + { + "epoch": 0.060495473591806403, + "grad_norm": 0.8203125, + "learning_rate": 0.00019862570765313476, + "loss": 1.0848, + "step": 2356 + }, + { + "epoch": 0.060521150787728226, + "grad_norm": 0.87890625, + "learning_rate": 0.0001986249699884998, + "loss": 1.2704, + "step": 2357 + }, + { + "epoch": 0.06054682798365005, + "grad_norm": 0.94140625, + "learning_rate": 0.00019862423212731417, + "loss": 1.282, + "step": 2358 + }, + { + "epoch": 0.06057250517957186, + "grad_norm": 0.83203125, + "learning_rate": 0.00019862349406957934, + "loss": 1.1477, + "step": 2359 + }, + { + "epoch": 0.060598182375493685, + "grad_norm": 1.046875, + "learning_rate": 0.00019862275581529685, + "loss": 1.3214, + "step": 2360 + }, + { + "epoch": 0.0606238595714155, + "grad_norm": 0.87890625, + "learning_rate": 0.0001986220173644681, + "loss": 1.1193, + "step": 2361 + }, + { + "epoch": 0.06064953676733732, + "grad_norm": 0.8828125, + "learning_rate": 0.00019862127871709452, + "loss": 1.4294, + "step": 2362 + }, + { + "epoch": 0.060675213963259145, + "grad_norm": 1.25, + "learning_rate": 0.0001986205398731777, + "loss": 1.2488, + "step": 2363 + }, + { + "epoch": 0.06070089115918096, + "grad_norm": 0.87890625, + "learning_rate": 0.00019861980083271902, + "loss": 1.2793, + "step": 2364 + }, + { + "epoch": 0.06072656835510278, + "grad_norm": 0.98046875, + "learning_rate": 0.00019861906159572, + "loss": 1.0481, + "step": 2365 + }, + { + "epoch": 0.0607522455510246, + "grad_norm": 0.97265625, + "learning_rate": 0.0001986183221621821, + "loss": 1.3362, + "step": 2366 + }, + { + "epoch": 0.06077792274694642, + "grad_norm": 1.078125, + "learning_rate": 0.00019861758253210678, + "loss": 1.3308, + "step": 2367 + }, + { + "epoch": 0.06080359994286824, + "grad_norm": 0.90234375, + "learning_rate": 0.00019861684270549557, + "loss": 1.3751, + "step": 2368 + }, + { + "epoch": 0.06082927713879006, + "grad_norm": 0.80859375, + "learning_rate": 0.00019861610268234986, + "loss": 1.0264, + "step": 2369 + }, + { + "epoch": 0.06085495433471188, + "grad_norm": 0.94921875, + "learning_rate": 0.00019861536246267117, + "loss": 1.2502, + "step": 2370 + }, + { + "epoch": 0.060880631530633694, + "grad_norm": 0.78515625, + "learning_rate": 0.00019861462204646095, + "loss": 1.0756, + "step": 2371 + }, + { + "epoch": 0.060906308726555516, + "grad_norm": 0.90234375, + "learning_rate": 0.00019861388143372073, + "loss": 1.2922, + "step": 2372 + }, + { + "epoch": 0.06093198592247734, + "grad_norm": 0.97265625, + "learning_rate": 0.00019861314062445193, + "loss": 1.2408, + "step": 2373 + }, + { + "epoch": 0.060957663118399154, + "grad_norm": 0.98828125, + "learning_rate": 0.00019861239961865607, + "loss": 1.1707, + "step": 2374 + }, + { + "epoch": 0.060983340314320976, + "grad_norm": 0.95703125, + "learning_rate": 0.0001986116584163346, + "loss": 1.0758, + "step": 2375 + }, + { + "epoch": 0.06100901751024279, + "grad_norm": 0.91015625, + "learning_rate": 0.000198610917017489, + "loss": 1.3627, + "step": 2376 + }, + { + "epoch": 0.06103469470616461, + "grad_norm": 1.0, + "learning_rate": 0.00019861017542212076, + "loss": 1.1715, + "step": 2377 + }, + { + "epoch": 0.061060371902086436, + "grad_norm": 1.0234375, + "learning_rate": 0.0001986094336302313, + "loss": 1.2798, + "step": 2378 + }, + { + "epoch": 0.06108604909800825, + "grad_norm": 0.921875, + "learning_rate": 0.0001986086916418222, + "loss": 1.2541, + "step": 2379 + }, + { + "epoch": 0.06111172629393007, + "grad_norm": 0.9140625, + "learning_rate": 0.00019860794945689486, + "loss": 1.3726, + "step": 2380 + }, + { + "epoch": 0.06113740348985189, + "grad_norm": 0.92578125, + "learning_rate": 0.00019860720707545079, + "loss": 1.3819, + "step": 2381 + }, + { + "epoch": 0.06116308068577371, + "grad_norm": 0.91015625, + "learning_rate": 0.00019860646449749148, + "loss": 1.1901, + "step": 2382 + }, + { + "epoch": 0.06118875788169553, + "grad_norm": 0.91015625, + "learning_rate": 0.00019860572172301834, + "loss": 1.3441, + "step": 2383 + }, + { + "epoch": 0.06121443507761735, + "grad_norm": 0.92578125, + "learning_rate": 0.00019860497875203294, + "loss": 1.2996, + "step": 2384 + }, + { + "epoch": 0.06124011227353917, + "grad_norm": 0.97265625, + "learning_rate": 0.00019860423558453673, + "loss": 1.2187, + "step": 2385 + }, + { + "epoch": 0.061265789469460985, + "grad_norm": 0.9921875, + "learning_rate": 0.00019860349222053118, + "loss": 1.5184, + "step": 2386 + }, + { + "epoch": 0.06129146666538281, + "grad_norm": 0.90625, + "learning_rate": 0.0001986027486600178, + "loss": 1.3676, + "step": 2387 + }, + { + "epoch": 0.06131714386130463, + "grad_norm": 0.9453125, + "learning_rate": 0.00019860200490299802, + "loss": 1.3625, + "step": 2388 + }, + { + "epoch": 0.061342821057226445, + "grad_norm": 0.8125, + "learning_rate": 0.00019860126094947337, + "loss": 1.0733, + "step": 2389 + }, + { + "epoch": 0.06136849825314827, + "grad_norm": 0.88671875, + "learning_rate": 0.00019860051679944528, + "loss": 1.1093, + "step": 2390 + }, + { + "epoch": 0.06139417544907008, + "grad_norm": 0.96484375, + "learning_rate": 0.00019859977245291529, + "loss": 1.3244, + "step": 2391 + }, + { + "epoch": 0.061419852644991904, + "grad_norm": 0.86328125, + "learning_rate": 0.00019859902790988484, + "loss": 1.2411, + "step": 2392 + }, + { + "epoch": 0.061445529840913726, + "grad_norm": 0.8828125, + "learning_rate": 0.00019859828317035543, + "loss": 1.302, + "step": 2393 + }, + { + "epoch": 0.06147120703683554, + "grad_norm": 0.8984375, + "learning_rate": 0.0001985975382343286, + "loss": 1.2498, + "step": 2394 + }, + { + "epoch": 0.061496884232757364, + "grad_norm": 1.40625, + "learning_rate": 0.00019859679310180574, + "loss": 1.1964, + "step": 2395 + }, + { + "epoch": 0.06152256142867918, + "grad_norm": 0.87109375, + "learning_rate": 0.0001985960477727884, + "loss": 1.3167, + "step": 2396 + }, + { + "epoch": 0.061548238624601, + "grad_norm": 0.921875, + "learning_rate": 0.00019859530224727802, + "loss": 1.337, + "step": 2397 + }, + { + "epoch": 0.06157391582052282, + "grad_norm": 0.80859375, + "learning_rate": 0.00019859455652527613, + "loss": 1.2062, + "step": 2398 + }, + { + "epoch": 0.06159959301644464, + "grad_norm": 0.88671875, + "learning_rate": 0.00019859381060678417, + "loss": 1.2106, + "step": 2399 + }, + { + "epoch": 0.06162527021236646, + "grad_norm": 0.9765625, + "learning_rate": 0.00019859306449180366, + "loss": 1.2547, + "step": 2400 + }, + { + "epoch": 0.061650947408288276, + "grad_norm": 0.89453125, + "learning_rate": 0.0001985923181803361, + "loss": 1.2912, + "step": 2401 + }, + { + "epoch": 0.0616766246042101, + "grad_norm": 1.0, + "learning_rate": 0.00019859157167238293, + "loss": 1.298, + "step": 2402 + }, + { + "epoch": 0.06170230180013191, + "grad_norm": 1.171875, + "learning_rate": 0.0001985908249679457, + "loss": 1.5087, + "step": 2403 + }, + { + "epoch": 0.061727978996053735, + "grad_norm": 1.0, + "learning_rate": 0.0001985900780670258, + "loss": 1.2972, + "step": 2404 + }, + { + "epoch": 0.06175365619197556, + "grad_norm": 0.9375, + "learning_rate": 0.00019858933096962483, + "loss": 1.435, + "step": 2405 + }, + { + "epoch": 0.06177933338789737, + "grad_norm": 0.8671875, + "learning_rate": 0.00019858858367574421, + "loss": 1.3473, + "step": 2406 + }, + { + "epoch": 0.061805010583819195, + "grad_norm": 1.0078125, + "learning_rate": 0.00019858783618538546, + "loss": 1.3174, + "step": 2407 + }, + { + "epoch": 0.06183068777974101, + "grad_norm": 0.9296875, + "learning_rate": 0.00019858708849855003, + "loss": 1.2668, + "step": 2408 + }, + { + "epoch": 0.06185636497566283, + "grad_norm": 0.89453125, + "learning_rate": 0.00019858634061523946, + "loss": 1.3767, + "step": 2409 + }, + { + "epoch": 0.061882042171584654, + "grad_norm": 0.96484375, + "learning_rate": 0.0001985855925354552, + "loss": 1.3085, + "step": 2410 + }, + { + "epoch": 0.06190771936750647, + "grad_norm": 0.9375, + "learning_rate": 0.0001985848442591988, + "loss": 1.2159, + "step": 2411 + }, + { + "epoch": 0.06193339656342829, + "grad_norm": 0.94921875, + "learning_rate": 0.0001985840957864717, + "loss": 1.2202, + "step": 2412 + }, + { + "epoch": 0.06195907375935011, + "grad_norm": 0.98828125, + "learning_rate": 0.00019858334711727539, + "loss": 1.2913, + "step": 2413 + }, + { + "epoch": 0.06198475095527193, + "grad_norm": 0.953125, + "learning_rate": 0.00019858259825161135, + "loss": 1.4344, + "step": 2414 + }, + { + "epoch": 0.06201042815119375, + "grad_norm": 0.890625, + "learning_rate": 0.00019858184918948113, + "loss": 1.2394, + "step": 2415 + }, + { + "epoch": 0.06203610534711557, + "grad_norm": 0.8515625, + "learning_rate": 0.00019858109993088619, + "loss": 1.3483, + "step": 2416 + }, + { + "epoch": 0.06206178254303739, + "grad_norm": 0.96484375, + "learning_rate": 0.00019858035047582798, + "loss": 1.1542, + "step": 2417 + }, + { + "epoch": 0.062087459738959204, + "grad_norm": 0.83984375, + "learning_rate": 0.00019857960082430808, + "loss": 1.1704, + "step": 2418 + }, + { + "epoch": 0.062113136934881026, + "grad_norm": 0.8125, + "learning_rate": 0.00019857885097632793, + "loss": 1.1617, + "step": 2419 + }, + { + "epoch": 0.06213881413080285, + "grad_norm": 0.8828125, + "learning_rate": 0.00019857810093188905, + "loss": 1.1978, + "step": 2420 + }, + { + "epoch": 0.062164491326724663, + "grad_norm": 0.93359375, + "learning_rate": 0.0001985773506909929, + "loss": 1.3687, + "step": 2421 + }, + { + "epoch": 0.062190168522646486, + "grad_norm": 0.9140625, + "learning_rate": 0.00019857660025364099, + "loss": 1.3078, + "step": 2422 + }, + { + "epoch": 0.0622158457185683, + "grad_norm": 0.96484375, + "learning_rate": 0.0001985758496198348, + "loss": 1.1902, + "step": 2423 + }, + { + "epoch": 0.06224152291449012, + "grad_norm": 0.83984375, + "learning_rate": 0.0001985750987895759, + "loss": 1.2965, + "step": 2424 + }, + { + "epoch": 0.062267200110411945, + "grad_norm": 0.890625, + "learning_rate": 0.0001985743477628657, + "loss": 1.288, + "step": 2425 + }, + { + "epoch": 0.06229287730633376, + "grad_norm": 0.87109375, + "learning_rate": 0.00019857359653970572, + "loss": 1.271, + "step": 2426 + }, + { + "epoch": 0.06231855450225558, + "grad_norm": 0.92578125, + "learning_rate": 0.0001985728451200975, + "loss": 1.259, + "step": 2427 + }, + { + "epoch": 0.0623442316981774, + "grad_norm": 0.921875, + "learning_rate": 0.00019857209350404248, + "loss": 1.254, + "step": 2428 + }, + { + "epoch": 0.06236990889409922, + "grad_norm": 0.85546875, + "learning_rate": 0.00019857134169154217, + "loss": 1.1353, + "step": 2429 + }, + { + "epoch": 0.06239558609002104, + "grad_norm": 0.94140625, + "learning_rate": 0.0001985705896825981, + "loss": 1.2271, + "step": 2430 + }, + { + "epoch": 0.06242126328594286, + "grad_norm": 0.93359375, + "learning_rate": 0.00019856983747721174, + "loss": 1.3734, + "step": 2431 + }, + { + "epoch": 0.06244694048186468, + "grad_norm": 0.8359375, + "learning_rate": 0.00019856908507538462, + "loss": 1.072, + "step": 2432 + }, + { + "epoch": 0.062472617677786495, + "grad_norm": 0.84765625, + "learning_rate": 0.0001985683324771182, + "loss": 1.1394, + "step": 2433 + }, + { + "epoch": 0.06249829487370832, + "grad_norm": 0.91796875, + "learning_rate": 0.00019856757968241397, + "loss": 1.2484, + "step": 2434 + }, + { + "epoch": 0.06252397206963013, + "grad_norm": 0.95703125, + "learning_rate": 0.0001985668266912735, + "loss": 1.3193, + "step": 2435 + }, + { + "epoch": 0.06254964926555195, + "grad_norm": 0.92578125, + "learning_rate": 0.00019856607350369823, + "loss": 1.35, + "step": 2436 + }, + { + "epoch": 0.06257532646147378, + "grad_norm": 0.875, + "learning_rate": 0.00019856532011968967, + "loss": 1.2797, + "step": 2437 + }, + { + "epoch": 0.0626010036573956, + "grad_norm": 0.91796875, + "learning_rate": 0.00019856456653924933, + "loss": 1.1851, + "step": 2438 + }, + { + "epoch": 0.0626266808533174, + "grad_norm": 0.83984375, + "learning_rate": 0.00019856381276237874, + "loss": 1.2445, + "step": 2439 + }, + { + "epoch": 0.06265235804923923, + "grad_norm": 1.0, + "learning_rate": 0.00019856305878907936, + "loss": 1.4098, + "step": 2440 + }, + { + "epoch": 0.06267803524516105, + "grad_norm": 0.83984375, + "learning_rate": 0.0001985623046193527, + "loss": 1.0661, + "step": 2441 + }, + { + "epoch": 0.06270371244108287, + "grad_norm": 0.85546875, + "learning_rate": 0.00019856155025320025, + "loss": 1.2973, + "step": 2442 + }, + { + "epoch": 0.0627293896370047, + "grad_norm": 0.9453125, + "learning_rate": 0.00019856079569062355, + "loss": 1.329, + "step": 2443 + }, + { + "epoch": 0.0627550668329265, + "grad_norm": 0.875, + "learning_rate": 0.00019856004093162413, + "loss": 1.3051, + "step": 2444 + }, + { + "epoch": 0.06278074402884833, + "grad_norm": 0.83984375, + "learning_rate": 0.0001985592859762034, + "loss": 1.3607, + "step": 2445 + }, + { + "epoch": 0.06280642122477015, + "grad_norm": 0.8984375, + "learning_rate": 0.00019855853082436292, + "loss": 1.3604, + "step": 2446 + }, + { + "epoch": 0.06283209842069197, + "grad_norm": 0.83203125, + "learning_rate": 0.00019855777547610418, + "loss": 1.3384, + "step": 2447 + }, + { + "epoch": 0.06285777561661379, + "grad_norm": 0.96875, + "learning_rate": 0.00019855701993142874, + "loss": 1.2217, + "step": 2448 + }, + { + "epoch": 0.0628834528125356, + "grad_norm": 0.875, + "learning_rate": 0.00019855626419033805, + "loss": 1.1331, + "step": 2449 + }, + { + "epoch": 0.06290913000845742, + "grad_norm": 0.90234375, + "learning_rate": 0.0001985555082528336, + "loss": 1.4944, + "step": 2450 + }, + { + "epoch": 0.06293480720437925, + "grad_norm": 0.90234375, + "learning_rate": 0.00019855475211891696, + "loss": 1.1773, + "step": 2451 + }, + { + "epoch": 0.06296048440030107, + "grad_norm": 0.87109375, + "learning_rate": 0.00019855399578858957, + "loss": 1.1546, + "step": 2452 + }, + { + "epoch": 0.06298616159622289, + "grad_norm": 0.9140625, + "learning_rate": 0.00019855323926185297, + "loss": 1.3887, + "step": 2453 + }, + { + "epoch": 0.0630118387921447, + "grad_norm": 0.875, + "learning_rate": 0.0001985524825387087, + "loss": 1.2761, + "step": 2454 + }, + { + "epoch": 0.06303751598806652, + "grad_norm": 0.87109375, + "learning_rate": 0.0001985517256191582, + "loss": 1.3046, + "step": 2455 + }, + { + "epoch": 0.06306319318398834, + "grad_norm": 0.9453125, + "learning_rate": 0.00019855096850320304, + "loss": 1.1865, + "step": 2456 + }, + { + "epoch": 0.06308887037991016, + "grad_norm": 0.9609375, + "learning_rate": 0.00019855021119084468, + "loss": 1.4052, + "step": 2457 + }, + { + "epoch": 0.06311454757583199, + "grad_norm": 0.98828125, + "learning_rate": 0.00019854945368208467, + "loss": 1.2281, + "step": 2458 + }, + { + "epoch": 0.0631402247717538, + "grad_norm": 0.91796875, + "learning_rate": 0.0001985486959769245, + "loss": 1.3421, + "step": 2459 + }, + { + "epoch": 0.06316590196767562, + "grad_norm": 0.8359375, + "learning_rate": 0.00019854793807536566, + "loss": 1.2832, + "step": 2460 + }, + { + "epoch": 0.06319157916359744, + "grad_norm": 0.87890625, + "learning_rate": 0.0001985471799774097, + "loss": 1.3739, + "step": 2461 + }, + { + "epoch": 0.06321725635951926, + "grad_norm": 0.921875, + "learning_rate": 0.0001985464216830581, + "loss": 1.3954, + "step": 2462 + }, + { + "epoch": 0.06324293355544108, + "grad_norm": 0.8828125, + "learning_rate": 0.00019854566319231238, + "loss": 1.4157, + "step": 2463 + }, + { + "epoch": 0.06326861075136289, + "grad_norm": 0.90625, + "learning_rate": 0.00019854490450517406, + "loss": 1.2791, + "step": 2464 + }, + { + "epoch": 0.06329428794728471, + "grad_norm": 0.86328125, + "learning_rate": 0.00019854414562164464, + "loss": 1.3336, + "step": 2465 + }, + { + "epoch": 0.06331996514320654, + "grad_norm": 0.8203125, + "learning_rate": 0.00019854338654172566, + "loss": 1.0102, + "step": 2466 + }, + { + "epoch": 0.06334564233912836, + "grad_norm": 0.89453125, + "learning_rate": 0.0001985426272654186, + "loss": 1.2214, + "step": 2467 + }, + { + "epoch": 0.06337131953505018, + "grad_norm": 0.97265625, + "learning_rate": 0.000198541867792725, + "loss": 1.2247, + "step": 2468 + }, + { + "epoch": 0.06339699673097199, + "grad_norm": 1.015625, + "learning_rate": 0.00019854110812364634, + "loss": 1.3206, + "step": 2469 + }, + { + "epoch": 0.06342267392689381, + "grad_norm": 0.90234375, + "learning_rate": 0.00019854034825818415, + "loss": 1.3076, + "step": 2470 + }, + { + "epoch": 0.06344835112281563, + "grad_norm": 0.875, + "learning_rate": 0.00019853958819633996, + "loss": 1.2237, + "step": 2471 + }, + { + "epoch": 0.06347402831873745, + "grad_norm": 0.890625, + "learning_rate": 0.00019853882793811526, + "loss": 1.2567, + "step": 2472 + }, + { + "epoch": 0.06349970551465928, + "grad_norm": 0.8203125, + "learning_rate": 0.0001985380674835116, + "loss": 1.102, + "step": 2473 + }, + { + "epoch": 0.06352538271058109, + "grad_norm": 0.8515625, + "learning_rate": 0.00019853730683253043, + "loss": 1.2833, + "step": 2474 + }, + { + "epoch": 0.06355105990650291, + "grad_norm": 0.9375, + "learning_rate": 0.00019853654598517336, + "loss": 1.2836, + "step": 2475 + }, + { + "epoch": 0.06357673710242473, + "grad_norm": 0.8359375, + "learning_rate": 0.0001985357849414418, + "loss": 1.1835, + "step": 2476 + }, + { + "epoch": 0.06360241429834655, + "grad_norm": 0.7890625, + "learning_rate": 0.00019853502370133734, + "loss": 1.1182, + "step": 2477 + }, + { + "epoch": 0.06362809149426837, + "grad_norm": 0.92578125, + "learning_rate": 0.00019853426226486151, + "loss": 1.3237, + "step": 2478 + }, + { + "epoch": 0.06365376869019018, + "grad_norm": 0.875, + "learning_rate": 0.00019853350063201577, + "loss": 1.159, + "step": 2479 + }, + { + "epoch": 0.063679445886112, + "grad_norm": 0.84375, + "learning_rate": 0.00019853273880280167, + "loss": 1.1632, + "step": 2480 + }, + { + "epoch": 0.06370512308203383, + "grad_norm": 0.96484375, + "learning_rate": 0.00019853197677722074, + "loss": 1.2326, + "step": 2481 + }, + { + "epoch": 0.06373080027795565, + "grad_norm": 0.9765625, + "learning_rate": 0.00019853121455527443, + "loss": 1.3331, + "step": 2482 + }, + { + "epoch": 0.06375647747387747, + "grad_norm": 0.8984375, + "learning_rate": 0.00019853045213696433, + "loss": 1.3287, + "step": 2483 + }, + { + "epoch": 0.06378215466979928, + "grad_norm": 0.81640625, + "learning_rate": 0.00019852968952229196, + "loss": 1.1219, + "step": 2484 + }, + { + "epoch": 0.0638078318657211, + "grad_norm": 0.796875, + "learning_rate": 0.0001985289267112588, + "loss": 1.0858, + "step": 2485 + }, + { + "epoch": 0.06383350906164292, + "grad_norm": 0.94140625, + "learning_rate": 0.00019852816370386642, + "loss": 1.1566, + "step": 2486 + }, + { + "epoch": 0.06385918625756475, + "grad_norm": 0.92578125, + "learning_rate": 0.00019852740050011627, + "loss": 1.3007, + "step": 2487 + }, + { + "epoch": 0.06388486345348657, + "grad_norm": 0.8828125, + "learning_rate": 0.00019852663710000993, + "loss": 1.3479, + "step": 2488 + }, + { + "epoch": 0.06391054064940838, + "grad_norm": 0.93359375, + "learning_rate": 0.00019852587350354889, + "loss": 1.1577, + "step": 2489 + }, + { + "epoch": 0.0639362178453302, + "grad_norm": 0.80078125, + "learning_rate": 0.0001985251097107347, + "loss": 1.1898, + "step": 2490 + }, + { + "epoch": 0.06396189504125202, + "grad_norm": 0.875, + "learning_rate": 0.00019852434572156886, + "loss": 1.2055, + "step": 2491 + }, + { + "epoch": 0.06398757223717384, + "grad_norm": 0.85546875, + "learning_rate": 0.0001985235815360529, + "loss": 1.1815, + "step": 2492 + }, + { + "epoch": 0.06401324943309566, + "grad_norm": 0.84765625, + "learning_rate": 0.00019852281715418833, + "loss": 1.2676, + "step": 2493 + }, + { + "epoch": 0.06403892662901747, + "grad_norm": 0.84375, + "learning_rate": 0.0001985220525759767, + "loss": 1.1579, + "step": 2494 + }, + { + "epoch": 0.0640646038249393, + "grad_norm": 0.92578125, + "learning_rate": 0.00019852128780141953, + "loss": 1.2977, + "step": 2495 + }, + { + "epoch": 0.06409028102086112, + "grad_norm": 0.84765625, + "learning_rate": 0.0001985205228305183, + "loss": 1.2611, + "step": 2496 + }, + { + "epoch": 0.06411595821678294, + "grad_norm": 0.953125, + "learning_rate": 0.0001985197576632746, + "loss": 1.2658, + "step": 2497 + }, + { + "epoch": 0.06414163541270476, + "grad_norm": 0.87109375, + "learning_rate": 0.00019851899229968989, + "loss": 1.4224, + "step": 2498 + }, + { + "epoch": 0.06416731260862657, + "grad_norm": 0.859375, + "learning_rate": 0.00019851822673976576, + "loss": 1.1335, + "step": 2499 + }, + { + "epoch": 0.06419298980454839, + "grad_norm": 0.8671875, + "learning_rate": 0.00019851746098350365, + "loss": 1.293, + "step": 2500 + }, + { + "epoch": 0.06421866700047021, + "grad_norm": 0.9609375, + "learning_rate": 0.0001985166950309052, + "loss": 1.1102, + "step": 2501 + }, + { + "epoch": 0.06424434419639204, + "grad_norm": 0.86328125, + "learning_rate": 0.00019851592888197185, + "loss": 1.2894, + "step": 2502 + }, + { + "epoch": 0.06427002139231386, + "grad_norm": 0.98046875, + "learning_rate": 0.00019851516253670517, + "loss": 1.3982, + "step": 2503 + }, + { + "epoch": 0.06429569858823567, + "grad_norm": 0.9140625, + "learning_rate": 0.00019851439599510664, + "loss": 1.4808, + "step": 2504 + }, + { + "epoch": 0.06432137578415749, + "grad_norm": 0.8359375, + "learning_rate": 0.00019851362925717782, + "loss": 1.0581, + "step": 2505 + }, + { + "epoch": 0.06434705298007931, + "grad_norm": 0.9375, + "learning_rate": 0.00019851286232292026, + "loss": 1.3049, + "step": 2506 + }, + { + "epoch": 0.06437273017600113, + "grad_norm": 0.91015625, + "learning_rate": 0.00019851209519233546, + "loss": 1.295, + "step": 2507 + }, + { + "epoch": 0.06439840737192296, + "grad_norm": 1.4140625, + "learning_rate": 0.00019851132786542492, + "loss": 1.1319, + "step": 2508 + }, + { + "epoch": 0.06442408456784476, + "grad_norm": 0.8203125, + "learning_rate": 0.00019851056034219023, + "loss": 1.1906, + "step": 2509 + }, + { + "epoch": 0.06444976176376659, + "grad_norm": 0.859375, + "learning_rate": 0.0001985097926226329, + "loss": 1.0929, + "step": 2510 + }, + { + "epoch": 0.06447543895968841, + "grad_norm": 0.87890625, + "learning_rate": 0.00019850902470675444, + "loss": 1.112, + "step": 2511 + }, + { + "epoch": 0.06450111615561023, + "grad_norm": 1.0078125, + "learning_rate": 0.00019850825659455638, + "loss": 1.332, + "step": 2512 + }, + { + "epoch": 0.06452679335153205, + "grad_norm": 1.0390625, + "learning_rate": 0.00019850748828604027, + "loss": 1.3611, + "step": 2513 + }, + { + "epoch": 0.06455247054745386, + "grad_norm": 0.8359375, + "learning_rate": 0.00019850671978120766, + "loss": 1.1321, + "step": 2514 + }, + { + "epoch": 0.06457814774337568, + "grad_norm": 0.87890625, + "learning_rate": 0.00019850595108006003, + "loss": 1.1118, + "step": 2515 + }, + { + "epoch": 0.0646038249392975, + "grad_norm": 1.1015625, + "learning_rate": 0.00019850518218259895, + "loss": 1.4464, + "step": 2516 + }, + { + "epoch": 0.06462950213521933, + "grad_norm": 0.890625, + "learning_rate": 0.00019850441308882593, + "loss": 1.0672, + "step": 2517 + }, + { + "epoch": 0.06465517933114115, + "grad_norm": 0.87109375, + "learning_rate": 0.0001985036437987425, + "loss": 1.3362, + "step": 2518 + }, + { + "epoch": 0.06468085652706296, + "grad_norm": 0.8984375, + "learning_rate": 0.00019850287431235023, + "loss": 1.2466, + "step": 2519 + }, + { + "epoch": 0.06470653372298478, + "grad_norm": 0.93359375, + "learning_rate": 0.0001985021046296506, + "loss": 1.138, + "step": 2520 + }, + { + "epoch": 0.0647322109189066, + "grad_norm": 0.86328125, + "learning_rate": 0.00019850133475064522, + "loss": 1.1753, + "step": 2521 + }, + { + "epoch": 0.06475788811482842, + "grad_norm": 0.91796875, + "learning_rate": 0.00019850056467533552, + "loss": 1.2398, + "step": 2522 + }, + { + "epoch": 0.06478356531075025, + "grad_norm": 1.015625, + "learning_rate": 0.00019849979440372313, + "loss": 1.3125, + "step": 2523 + }, + { + "epoch": 0.06480924250667205, + "grad_norm": 0.83984375, + "learning_rate": 0.00019849902393580954, + "loss": 1.2109, + "step": 2524 + }, + { + "epoch": 0.06483491970259388, + "grad_norm": 0.87109375, + "learning_rate": 0.0001984982532715963, + "loss": 1.2724, + "step": 2525 + }, + { + "epoch": 0.0648605968985157, + "grad_norm": 0.84765625, + "learning_rate": 0.00019849748241108494, + "loss": 1.2012, + "step": 2526 + }, + { + "epoch": 0.06488627409443752, + "grad_norm": 0.8828125, + "learning_rate": 0.00019849671135427696, + "loss": 1.3287, + "step": 2527 + }, + { + "epoch": 0.06491195129035934, + "grad_norm": 0.9140625, + "learning_rate": 0.00019849594010117394, + "loss": 1.1965, + "step": 2528 + }, + { + "epoch": 0.06493762848628115, + "grad_norm": 0.85546875, + "learning_rate": 0.00019849516865177744, + "loss": 1.2634, + "step": 2529 + }, + { + "epoch": 0.06496330568220297, + "grad_norm": 0.87890625, + "learning_rate": 0.00019849439700608893, + "loss": 1.1146, + "step": 2530 + }, + { + "epoch": 0.0649889828781248, + "grad_norm": 1.171875, + "learning_rate": 0.00019849362516411004, + "loss": 1.2529, + "step": 2531 + }, + { + "epoch": 0.06501466007404662, + "grad_norm": 0.95703125, + "learning_rate": 0.0001984928531258422, + "loss": 1.0763, + "step": 2532 + }, + { + "epoch": 0.06504033726996843, + "grad_norm": 0.91796875, + "learning_rate": 0.000198492080891287, + "loss": 1.314, + "step": 2533 + }, + { + "epoch": 0.06506601446589025, + "grad_norm": 0.94921875, + "learning_rate": 0.000198491308460446, + "loss": 1.2941, + "step": 2534 + }, + { + "epoch": 0.06509169166181207, + "grad_norm": 0.96875, + "learning_rate": 0.0001984905358333207, + "loss": 1.2532, + "step": 2535 + }, + { + "epoch": 0.06511736885773389, + "grad_norm": 0.82421875, + "learning_rate": 0.0001984897630099127, + "loss": 1.1975, + "step": 2536 + }, + { + "epoch": 0.06514304605365571, + "grad_norm": 0.8828125, + "learning_rate": 0.00019848898999022348, + "loss": 1.0975, + "step": 2537 + }, + { + "epoch": 0.06516872324957752, + "grad_norm": 0.890625, + "learning_rate": 0.0001984882167742546, + "loss": 1.3534, + "step": 2538 + }, + { + "epoch": 0.06519440044549935, + "grad_norm": 0.8828125, + "learning_rate": 0.00019848744336200757, + "loss": 1.2956, + "step": 2539 + }, + { + "epoch": 0.06522007764142117, + "grad_norm": 0.9375, + "learning_rate": 0.000198486669753484, + "loss": 1.2228, + "step": 2540 + }, + { + "epoch": 0.06524575483734299, + "grad_norm": 0.828125, + "learning_rate": 0.00019848589594868537, + "loss": 1.2763, + "step": 2541 + }, + { + "epoch": 0.06527143203326481, + "grad_norm": 0.9296875, + "learning_rate": 0.00019848512194761323, + "loss": 1.3866, + "step": 2542 + }, + { + "epoch": 0.06529710922918662, + "grad_norm": 0.953125, + "learning_rate": 0.0001984843477502692, + "loss": 1.2577, + "step": 2543 + }, + { + "epoch": 0.06532278642510844, + "grad_norm": 0.890625, + "learning_rate": 0.00019848357335665472, + "loss": 1.2339, + "step": 2544 + }, + { + "epoch": 0.06534846362103026, + "grad_norm": 0.90234375, + "learning_rate": 0.00019848279876677136, + "loss": 1.2251, + "step": 2545 + }, + { + "epoch": 0.06537414081695209, + "grad_norm": 0.97265625, + "learning_rate": 0.0001984820239806207, + "loss": 1.2382, + "step": 2546 + }, + { + "epoch": 0.06539981801287391, + "grad_norm": 0.94140625, + "learning_rate": 0.00019848124899820424, + "loss": 1.4308, + "step": 2547 + }, + { + "epoch": 0.06542549520879572, + "grad_norm": 0.9453125, + "learning_rate": 0.00019848047381952358, + "loss": 1.3339, + "step": 2548 + }, + { + "epoch": 0.06545117240471754, + "grad_norm": 0.8828125, + "learning_rate": 0.00019847969844458022, + "loss": 1.3099, + "step": 2549 + }, + { + "epoch": 0.06547684960063936, + "grad_norm": 0.8359375, + "learning_rate": 0.0001984789228733757, + "loss": 1.1981, + "step": 2550 + }, + { + "epoch": 0.06550252679656118, + "grad_norm": 0.84765625, + "learning_rate": 0.0001984781471059116, + "loss": 1.169, + "step": 2551 + }, + { + "epoch": 0.065528203992483, + "grad_norm": 0.85546875, + "learning_rate": 0.00019847737114218945, + "loss": 1.3198, + "step": 2552 + }, + { + "epoch": 0.06555388118840481, + "grad_norm": 0.859375, + "learning_rate": 0.00019847659498221077, + "loss": 1.108, + "step": 2553 + }, + { + "epoch": 0.06557955838432664, + "grad_norm": 0.8984375, + "learning_rate": 0.00019847581862597716, + "loss": 1.1996, + "step": 2554 + }, + { + "epoch": 0.06560523558024846, + "grad_norm": 0.9453125, + "learning_rate": 0.00019847504207349013, + "loss": 1.2712, + "step": 2555 + }, + { + "epoch": 0.06563091277617028, + "grad_norm": 0.8046875, + "learning_rate": 0.0001984742653247512, + "loss": 1.2102, + "step": 2556 + }, + { + "epoch": 0.0656565899720921, + "grad_norm": 0.9453125, + "learning_rate": 0.00019847348837976201, + "loss": 1.1677, + "step": 2557 + }, + { + "epoch": 0.06568226716801391, + "grad_norm": 0.88671875, + "learning_rate": 0.00019847271123852402, + "loss": 1.2714, + "step": 2558 + }, + { + "epoch": 0.06570794436393573, + "grad_norm": 0.90234375, + "learning_rate": 0.0001984719339010388, + "loss": 1.1635, + "step": 2559 + }, + { + "epoch": 0.06573362155985756, + "grad_norm": 0.890625, + "learning_rate": 0.00019847115636730794, + "loss": 1.4198, + "step": 2560 + }, + { + "epoch": 0.06575929875577938, + "grad_norm": 0.85546875, + "learning_rate": 0.00019847037863733298, + "loss": 1.2765, + "step": 2561 + }, + { + "epoch": 0.0657849759517012, + "grad_norm": 0.9140625, + "learning_rate": 0.0001984696007111154, + "loss": 1.2182, + "step": 2562 + }, + { + "epoch": 0.06581065314762301, + "grad_norm": 0.8515625, + "learning_rate": 0.00019846882258865684, + "loss": 1.3611, + "step": 2563 + }, + { + "epoch": 0.06583633034354483, + "grad_norm": 0.921875, + "learning_rate": 0.00019846804426995878, + "loss": 1.1917, + "step": 2564 + }, + { + "epoch": 0.06586200753946665, + "grad_norm": 0.9296875, + "learning_rate": 0.00019846726575502283, + "loss": 1.1543, + "step": 2565 + }, + { + "epoch": 0.06588768473538847, + "grad_norm": 0.90625, + "learning_rate": 0.0001984664870438505, + "loss": 1.3033, + "step": 2566 + }, + { + "epoch": 0.0659133619313103, + "grad_norm": 0.89453125, + "learning_rate": 0.00019846570813644333, + "loss": 1.2841, + "step": 2567 + }, + { + "epoch": 0.0659390391272321, + "grad_norm": 1.046875, + "learning_rate": 0.00019846492903280295, + "loss": 1.2552, + "step": 2568 + }, + { + "epoch": 0.06596471632315393, + "grad_norm": 0.9921875, + "learning_rate": 0.00019846414973293084, + "loss": 1.2325, + "step": 2569 + }, + { + "epoch": 0.06599039351907575, + "grad_norm": 0.8515625, + "learning_rate": 0.00019846337023682854, + "loss": 1.2113, + "step": 2570 + }, + { + "epoch": 0.06601607071499757, + "grad_norm": 0.8828125, + "learning_rate": 0.00019846259054449766, + "loss": 1.3021, + "step": 2571 + }, + { + "epoch": 0.0660417479109194, + "grad_norm": 0.90625, + "learning_rate": 0.00019846181065593976, + "loss": 1.2946, + "step": 2572 + }, + { + "epoch": 0.0660674251068412, + "grad_norm": 0.98828125, + "learning_rate": 0.00019846103057115635, + "loss": 1.134, + "step": 2573 + }, + { + "epoch": 0.06609310230276302, + "grad_norm": 0.921875, + "learning_rate": 0.00019846025029014898, + "loss": 1.2628, + "step": 2574 + }, + { + "epoch": 0.06611877949868485, + "grad_norm": 0.84765625, + "learning_rate": 0.00019845946981291926, + "loss": 1.1249, + "step": 2575 + }, + { + "epoch": 0.06614445669460667, + "grad_norm": 0.8515625, + "learning_rate": 0.00019845868913946868, + "loss": 1.2511, + "step": 2576 + }, + { + "epoch": 0.06617013389052849, + "grad_norm": 0.8359375, + "learning_rate": 0.0001984579082697988, + "loss": 1.2161, + "step": 2577 + }, + { + "epoch": 0.0661958110864503, + "grad_norm": 0.89453125, + "learning_rate": 0.00019845712720391125, + "loss": 1.3211, + "step": 2578 + }, + { + "epoch": 0.06622148828237212, + "grad_norm": 0.8671875, + "learning_rate": 0.00019845634594180752, + "loss": 1.2536, + "step": 2579 + }, + { + "epoch": 0.06624716547829394, + "grad_norm": 0.86328125, + "learning_rate": 0.00019845556448348922, + "loss": 1.3096, + "step": 2580 + }, + { + "epoch": 0.06627284267421577, + "grad_norm": 0.94921875, + "learning_rate": 0.00019845478282895783, + "loss": 1.1871, + "step": 2581 + }, + { + "epoch": 0.06629851987013759, + "grad_norm": 0.89453125, + "learning_rate": 0.000198454000978215, + "loss": 1.3047, + "step": 2582 + }, + { + "epoch": 0.0663241970660594, + "grad_norm": 0.86328125, + "learning_rate": 0.0001984532189312622, + "loss": 1.156, + "step": 2583 + }, + { + "epoch": 0.06634987426198122, + "grad_norm": 0.9140625, + "learning_rate": 0.000198452436688101, + "loss": 1.3385, + "step": 2584 + }, + { + "epoch": 0.06637555145790304, + "grad_norm": 0.9140625, + "learning_rate": 0.00019845165424873303, + "loss": 1.195, + "step": 2585 + }, + { + "epoch": 0.06640122865382486, + "grad_norm": 0.90234375, + "learning_rate": 0.0001984508716131598, + "loss": 1.1762, + "step": 2586 + }, + { + "epoch": 0.06642690584974668, + "grad_norm": 0.9140625, + "learning_rate": 0.00019845008878138288, + "loss": 1.1865, + "step": 2587 + }, + { + "epoch": 0.06645258304566849, + "grad_norm": 0.83984375, + "learning_rate": 0.0001984493057534038, + "loss": 1.2385, + "step": 2588 + }, + { + "epoch": 0.06647826024159031, + "grad_norm": 0.93359375, + "learning_rate": 0.00019844852252922418, + "loss": 1.206, + "step": 2589 + }, + { + "epoch": 0.06650393743751214, + "grad_norm": 0.92578125, + "learning_rate": 0.00019844773910884553, + "loss": 1.1809, + "step": 2590 + }, + { + "epoch": 0.06652961463343396, + "grad_norm": 0.87890625, + "learning_rate": 0.00019844695549226944, + "loss": 1.2973, + "step": 2591 + }, + { + "epoch": 0.06655529182935578, + "grad_norm": 0.89453125, + "learning_rate": 0.00019844617167949746, + "loss": 1.1044, + "step": 2592 + }, + { + "epoch": 0.06658096902527759, + "grad_norm": 0.88671875, + "learning_rate": 0.00019844538767053112, + "loss": 1.309, + "step": 2593 + }, + { + "epoch": 0.06660664622119941, + "grad_norm": 0.8828125, + "learning_rate": 0.00019844460346537206, + "loss": 1.3412, + "step": 2594 + }, + { + "epoch": 0.06663232341712123, + "grad_norm": 0.85546875, + "learning_rate": 0.00019844381906402176, + "loss": 1.2417, + "step": 2595 + }, + { + "epoch": 0.06665800061304306, + "grad_norm": 0.9453125, + "learning_rate": 0.00019844303446648186, + "loss": 1.2381, + "step": 2596 + }, + { + "epoch": 0.06668367780896488, + "grad_norm": 0.85546875, + "learning_rate": 0.00019844224967275383, + "loss": 1.2439, + "step": 2597 + }, + { + "epoch": 0.06670935500488669, + "grad_norm": 0.890625, + "learning_rate": 0.00019844146468283932, + "loss": 1.3912, + "step": 2598 + }, + { + "epoch": 0.06673503220080851, + "grad_norm": 0.98828125, + "learning_rate": 0.0001984406794967399, + "loss": 1.3404, + "step": 2599 + }, + { + "epoch": 0.06676070939673033, + "grad_norm": 0.88671875, + "learning_rate": 0.00019843989411445705, + "loss": 1.4781, + "step": 2600 + }, + { + "epoch": 0.06678638659265215, + "grad_norm": 0.96875, + "learning_rate": 0.00019843910853599238, + "loss": 1.2234, + "step": 2601 + }, + { + "epoch": 0.06681206378857397, + "grad_norm": 0.7734375, + "learning_rate": 0.0001984383227613475, + "loss": 1.2055, + "step": 2602 + }, + { + "epoch": 0.06683774098449578, + "grad_norm": 0.87890625, + "learning_rate": 0.00019843753679052388, + "loss": 1.3055, + "step": 2603 + }, + { + "epoch": 0.0668634181804176, + "grad_norm": 0.96484375, + "learning_rate": 0.00019843675062352319, + "loss": 1.2958, + "step": 2604 + }, + { + "epoch": 0.06688909537633943, + "grad_norm": 0.87890625, + "learning_rate": 0.00019843596426034692, + "loss": 1.19, + "step": 2605 + }, + { + "epoch": 0.06691477257226125, + "grad_norm": 0.90234375, + "learning_rate": 0.0001984351777009967, + "loss": 1.387, + "step": 2606 + }, + { + "epoch": 0.06694044976818307, + "grad_norm": 0.87890625, + "learning_rate": 0.000198434390945474, + "loss": 1.253, + "step": 2607 + }, + { + "epoch": 0.06696612696410488, + "grad_norm": 1.015625, + "learning_rate": 0.0001984336039937805, + "loss": 1.1235, + "step": 2608 + }, + { + "epoch": 0.0669918041600267, + "grad_norm": 0.87109375, + "learning_rate": 0.0001984328168459177, + "loss": 1.2749, + "step": 2609 + }, + { + "epoch": 0.06701748135594852, + "grad_norm": 0.8515625, + "learning_rate": 0.0001984320295018872, + "loss": 1.2765, + "step": 2610 + }, + { + "epoch": 0.06704315855187035, + "grad_norm": 0.84375, + "learning_rate": 0.00019843124196169054, + "loss": 1.2251, + "step": 2611 + }, + { + "epoch": 0.06706883574779217, + "grad_norm": 0.82421875, + "learning_rate": 0.00019843045422532932, + "loss": 1.1463, + "step": 2612 + }, + { + "epoch": 0.06709451294371398, + "grad_norm": 0.86328125, + "learning_rate": 0.00019842966629280508, + "loss": 1.0884, + "step": 2613 + }, + { + "epoch": 0.0671201901396358, + "grad_norm": 0.91015625, + "learning_rate": 0.00019842887816411942, + "loss": 1.2868, + "step": 2614 + }, + { + "epoch": 0.06714586733555762, + "grad_norm": 0.90625, + "learning_rate": 0.0001984280898392739, + "loss": 1.1814, + "step": 2615 + }, + { + "epoch": 0.06717154453147944, + "grad_norm": 0.88671875, + "learning_rate": 0.00019842730131827007, + "loss": 1.2702, + "step": 2616 + }, + { + "epoch": 0.06719722172740127, + "grad_norm": 0.89453125, + "learning_rate": 0.0001984265126011095, + "loss": 1.2337, + "step": 2617 + }, + { + "epoch": 0.06722289892332307, + "grad_norm": 0.84765625, + "learning_rate": 0.00019842572368779384, + "loss": 1.0591, + "step": 2618 + }, + { + "epoch": 0.0672485761192449, + "grad_norm": 0.90234375, + "learning_rate": 0.00019842493457832458, + "loss": 1.1172, + "step": 2619 + }, + { + "epoch": 0.06727425331516672, + "grad_norm": 0.95703125, + "learning_rate": 0.00019842414527270331, + "loss": 1.2634, + "step": 2620 + }, + { + "epoch": 0.06729993051108854, + "grad_norm": 0.86328125, + "learning_rate": 0.0001984233557709316, + "loss": 1.154, + "step": 2621 + }, + { + "epoch": 0.06732560770701036, + "grad_norm": 0.90234375, + "learning_rate": 0.00019842256607301108, + "loss": 1.2741, + "step": 2622 + }, + { + "epoch": 0.06735128490293217, + "grad_norm": 0.9296875, + "learning_rate": 0.00019842177617894322, + "loss": 1.2866, + "step": 2623 + }, + { + "epoch": 0.06737696209885399, + "grad_norm": 0.8984375, + "learning_rate": 0.00019842098608872965, + "loss": 1.2716, + "step": 2624 + }, + { + "epoch": 0.06740263929477582, + "grad_norm": 0.890625, + "learning_rate": 0.00019842019580237197, + "loss": 1.2493, + "step": 2625 + }, + { + "epoch": 0.06742831649069764, + "grad_norm": 0.92578125, + "learning_rate": 0.0001984194053198717, + "loss": 1.3161, + "step": 2626 + }, + { + "epoch": 0.06745399368661946, + "grad_norm": 0.9375, + "learning_rate": 0.00019841861464123047, + "loss": 1.2649, + "step": 2627 + }, + { + "epoch": 0.06747967088254127, + "grad_norm": 0.89453125, + "learning_rate": 0.00019841782376644985, + "loss": 1.2744, + "step": 2628 + }, + { + "epoch": 0.06750534807846309, + "grad_norm": 0.91015625, + "learning_rate": 0.0001984170326955314, + "loss": 1.1168, + "step": 2629 + }, + { + "epoch": 0.06753102527438491, + "grad_norm": 0.890625, + "learning_rate": 0.00019841624142847666, + "loss": 1.0659, + "step": 2630 + }, + { + "epoch": 0.06755670247030673, + "grad_norm": 0.8828125, + "learning_rate": 0.00019841544996528724, + "loss": 1.3606, + "step": 2631 + }, + { + "epoch": 0.06758237966622856, + "grad_norm": 0.83984375, + "learning_rate": 0.00019841465830596473, + "loss": 1.0744, + "step": 2632 + }, + { + "epoch": 0.06760805686215036, + "grad_norm": 0.92578125, + "learning_rate": 0.0001984138664505107, + "loss": 1.1769, + "step": 2633 + }, + { + "epoch": 0.06763373405807219, + "grad_norm": 0.84765625, + "learning_rate": 0.0001984130743989267, + "loss": 1.4478, + "step": 2634 + }, + { + "epoch": 0.06765941125399401, + "grad_norm": 0.97265625, + "learning_rate": 0.0001984122821512144, + "loss": 1.2657, + "step": 2635 + }, + { + "epoch": 0.06768508844991583, + "grad_norm": 0.85546875, + "learning_rate": 0.00019841148970737525, + "loss": 1.2108, + "step": 2636 + }, + { + "epoch": 0.06771076564583764, + "grad_norm": 0.84765625, + "learning_rate": 0.0001984106970674109, + "loss": 1.1042, + "step": 2637 + }, + { + "epoch": 0.06773644284175946, + "grad_norm": 0.87109375, + "learning_rate": 0.0001984099042313229, + "loss": 1.238, + "step": 2638 + }, + { + "epoch": 0.06776212003768128, + "grad_norm": 0.984375, + "learning_rate": 0.00019840911119911288, + "loss": 1.2059, + "step": 2639 + }, + { + "epoch": 0.0677877972336031, + "grad_norm": 0.91015625, + "learning_rate": 0.0001984083179707824, + "loss": 1.2614, + "step": 2640 + }, + { + "epoch": 0.06781347442952493, + "grad_norm": 0.91796875, + "learning_rate": 0.000198407524546333, + "loss": 1.2741, + "step": 2641 + }, + { + "epoch": 0.06783915162544674, + "grad_norm": 0.9140625, + "learning_rate": 0.00019840673092576632, + "loss": 1.1073, + "step": 2642 + }, + { + "epoch": 0.06786482882136856, + "grad_norm": 0.9765625, + "learning_rate": 0.0001984059371090839, + "loss": 1.3456, + "step": 2643 + }, + { + "epoch": 0.06789050601729038, + "grad_norm": 0.97265625, + "learning_rate": 0.00019840514309628734, + "loss": 1.2727, + "step": 2644 + }, + { + "epoch": 0.0679161832132122, + "grad_norm": 1.0234375, + "learning_rate": 0.00019840434888737822, + "loss": 1.2734, + "step": 2645 + }, + { + "epoch": 0.06794186040913403, + "grad_norm": 0.92578125, + "learning_rate": 0.00019840355448235813, + "loss": 1.1624, + "step": 2646 + }, + { + "epoch": 0.06796753760505583, + "grad_norm": 0.94921875, + "learning_rate": 0.00019840275988122863, + "loss": 1.5552, + "step": 2647 + }, + { + "epoch": 0.06799321480097766, + "grad_norm": 0.953125, + "learning_rate": 0.0001984019650839913, + "loss": 1.2936, + "step": 2648 + }, + { + "epoch": 0.06801889199689948, + "grad_norm": 0.8671875, + "learning_rate": 0.0001984011700906478, + "loss": 1.2829, + "step": 2649 + }, + { + "epoch": 0.0680445691928213, + "grad_norm": 0.9296875, + "learning_rate": 0.0001984003749011996, + "loss": 1.3205, + "step": 2650 + }, + { + "epoch": 0.06807024638874312, + "grad_norm": 0.94140625, + "learning_rate": 0.00019839957951564837, + "loss": 1.3008, + "step": 2651 + }, + { + "epoch": 0.06809592358466493, + "grad_norm": 0.91796875, + "learning_rate": 0.00019839878393399567, + "loss": 1.4079, + "step": 2652 + }, + { + "epoch": 0.06812160078058675, + "grad_norm": 0.88671875, + "learning_rate": 0.00019839798815624306, + "loss": 1.1295, + "step": 2653 + }, + { + "epoch": 0.06814727797650857, + "grad_norm": 0.90625, + "learning_rate": 0.00019839719218239216, + "loss": 1.4476, + "step": 2654 + }, + { + "epoch": 0.0681729551724304, + "grad_norm": 0.8984375, + "learning_rate": 0.00019839639601244456, + "loss": 1.3034, + "step": 2655 + }, + { + "epoch": 0.06819863236835222, + "grad_norm": 0.9140625, + "learning_rate": 0.0001983955996464018, + "loss": 1.1643, + "step": 2656 + }, + { + "epoch": 0.06822430956427403, + "grad_norm": 0.8203125, + "learning_rate": 0.0001983948030842655, + "loss": 1.0973, + "step": 2657 + }, + { + "epoch": 0.06824998676019585, + "grad_norm": 0.94140625, + "learning_rate": 0.00019839400632603725, + "loss": 1.2163, + "step": 2658 + }, + { + "epoch": 0.06827566395611767, + "grad_norm": 0.875, + "learning_rate": 0.00019839320937171866, + "loss": 1.1652, + "step": 2659 + }, + { + "epoch": 0.0683013411520395, + "grad_norm": 0.91015625, + "learning_rate": 0.00019839241222131127, + "loss": 1.2527, + "step": 2660 + }, + { + "epoch": 0.06832701834796132, + "grad_norm": 0.87109375, + "learning_rate": 0.0001983916148748167, + "loss": 1.1421, + "step": 2661 + }, + { + "epoch": 0.06835269554388312, + "grad_norm": 0.90625, + "learning_rate": 0.00019839081733223654, + "loss": 1.2499, + "step": 2662 + }, + { + "epoch": 0.06837837273980495, + "grad_norm": 0.890625, + "learning_rate": 0.00019839001959357232, + "loss": 1.2608, + "step": 2663 + }, + { + "epoch": 0.06840404993572677, + "grad_norm": 0.91796875, + "learning_rate": 0.00019838922165882572, + "loss": 1.2241, + "step": 2664 + }, + { + "epoch": 0.06842972713164859, + "grad_norm": 0.8125, + "learning_rate": 0.0001983884235279983, + "loss": 1.1213, + "step": 2665 + }, + { + "epoch": 0.06845540432757041, + "grad_norm": 0.80078125, + "learning_rate": 0.0001983876252010916, + "loss": 1.0645, + "step": 2666 + }, + { + "epoch": 0.06848108152349222, + "grad_norm": 0.97265625, + "learning_rate": 0.00019838682667810726, + "loss": 1.2441, + "step": 2667 + }, + { + "epoch": 0.06850675871941404, + "grad_norm": 0.87109375, + "learning_rate": 0.00019838602795904686, + "loss": 1.2493, + "step": 2668 + }, + { + "epoch": 0.06853243591533587, + "grad_norm": 0.91015625, + "learning_rate": 0.00019838522904391202, + "loss": 1.2599, + "step": 2669 + }, + { + "epoch": 0.06855811311125769, + "grad_norm": 0.9375, + "learning_rate": 0.00019838442993270428, + "loss": 1.1604, + "step": 2670 + }, + { + "epoch": 0.06858379030717951, + "grad_norm": 1.0703125, + "learning_rate": 0.00019838363062542525, + "loss": 1.3253, + "step": 2671 + }, + { + "epoch": 0.06860946750310132, + "grad_norm": 0.828125, + "learning_rate": 0.00019838283112207656, + "loss": 1.0599, + "step": 2672 + }, + { + "epoch": 0.06863514469902314, + "grad_norm": 0.96875, + "learning_rate": 0.00019838203142265976, + "loss": 1.3177, + "step": 2673 + }, + { + "epoch": 0.06866082189494496, + "grad_norm": 0.87109375, + "learning_rate": 0.00019838123152717647, + "loss": 1.1481, + "step": 2674 + }, + { + "epoch": 0.06868649909086678, + "grad_norm": 0.8984375, + "learning_rate": 0.00019838043143562825, + "loss": 1.286, + "step": 2675 + }, + { + "epoch": 0.0687121762867886, + "grad_norm": 0.98046875, + "learning_rate": 0.00019837963114801674, + "loss": 1.3614, + "step": 2676 + }, + { + "epoch": 0.06873785348271041, + "grad_norm": 0.93359375, + "learning_rate": 0.0001983788306643435, + "loss": 1.2611, + "step": 2677 + }, + { + "epoch": 0.06876353067863224, + "grad_norm": 0.80859375, + "learning_rate": 0.00019837802998461012, + "loss": 1.2055, + "step": 2678 + }, + { + "epoch": 0.06878920787455406, + "grad_norm": 0.91015625, + "learning_rate": 0.00019837722910881828, + "loss": 1.3022, + "step": 2679 + }, + { + "epoch": 0.06881488507047588, + "grad_norm": 0.90625, + "learning_rate": 0.00019837642803696942, + "loss": 1.448, + "step": 2680 + }, + { + "epoch": 0.0688405622663977, + "grad_norm": 0.9609375, + "learning_rate": 0.00019837562676906526, + "loss": 1.2462, + "step": 2681 + }, + { + "epoch": 0.06886623946231951, + "grad_norm": 0.83984375, + "learning_rate": 0.00019837482530510737, + "loss": 1.1572, + "step": 2682 + }, + { + "epoch": 0.06889191665824133, + "grad_norm": 0.93359375, + "learning_rate": 0.00019837402364509735, + "loss": 1.3512, + "step": 2683 + }, + { + "epoch": 0.06891759385416316, + "grad_norm": 0.95703125, + "learning_rate": 0.00019837322178903677, + "loss": 1.2779, + "step": 2684 + }, + { + "epoch": 0.06894327105008498, + "grad_norm": 0.91796875, + "learning_rate": 0.00019837241973692723, + "loss": 1.1687, + "step": 2685 + }, + { + "epoch": 0.0689689482460068, + "grad_norm": 0.84765625, + "learning_rate": 0.00019837161748877038, + "loss": 1.1639, + "step": 2686 + }, + { + "epoch": 0.06899462544192861, + "grad_norm": 0.88671875, + "learning_rate": 0.00019837081504456777, + "loss": 1.1801, + "step": 2687 + }, + { + "epoch": 0.06902030263785043, + "grad_norm": 0.8828125, + "learning_rate": 0.000198370012404321, + "loss": 1.3209, + "step": 2688 + }, + { + "epoch": 0.06904597983377225, + "grad_norm": 0.8359375, + "learning_rate": 0.00019836920956803165, + "loss": 1.276, + "step": 2689 + }, + { + "epoch": 0.06907165702969408, + "grad_norm": 0.82421875, + "learning_rate": 0.0001983684065357014, + "loss": 1.1796, + "step": 2690 + }, + { + "epoch": 0.0690973342256159, + "grad_norm": 0.91015625, + "learning_rate": 0.0001983676033073318, + "loss": 1.3116, + "step": 2691 + }, + { + "epoch": 0.0691230114215377, + "grad_norm": 0.87109375, + "learning_rate": 0.00019836679988292444, + "loss": 1.3379, + "step": 2692 + }, + { + "epoch": 0.06914868861745953, + "grad_norm": 0.890625, + "learning_rate": 0.00019836599626248095, + "loss": 1.134, + "step": 2693 + }, + { + "epoch": 0.06917436581338135, + "grad_norm": 0.875, + "learning_rate": 0.0001983651924460029, + "loss": 1.2175, + "step": 2694 + }, + { + "epoch": 0.06920004300930317, + "grad_norm": 0.88671875, + "learning_rate": 0.0001983643884334919, + "loss": 1.0849, + "step": 2695 + }, + { + "epoch": 0.069225720205225, + "grad_norm": 0.90625, + "learning_rate": 0.00019836358422494957, + "loss": 1.2731, + "step": 2696 + }, + { + "epoch": 0.0692513974011468, + "grad_norm": 0.91015625, + "learning_rate": 0.00019836277982037748, + "loss": 1.1828, + "step": 2697 + }, + { + "epoch": 0.06927707459706862, + "grad_norm": 0.84765625, + "learning_rate": 0.00019836197521977725, + "loss": 1.1325, + "step": 2698 + }, + { + "epoch": 0.06930275179299045, + "grad_norm": 0.94921875, + "learning_rate": 0.00019836117042315053, + "loss": 1.2362, + "step": 2699 + }, + { + "epoch": 0.06932842898891227, + "grad_norm": 0.9140625, + "learning_rate": 0.00019836036543049884, + "loss": 1.377, + "step": 2700 + }, + { + "epoch": 0.06935410618483409, + "grad_norm": 0.953125, + "learning_rate": 0.00019835956024182388, + "loss": 1.0676, + "step": 2701 + }, + { + "epoch": 0.0693797833807559, + "grad_norm": 0.8828125, + "learning_rate": 0.00019835875485712715, + "loss": 1.0473, + "step": 2702 + }, + { + "epoch": 0.06940546057667772, + "grad_norm": 0.984375, + "learning_rate": 0.00019835794927641032, + "loss": 1.3782, + "step": 2703 + }, + { + "epoch": 0.06943113777259954, + "grad_norm": 0.88671875, + "learning_rate": 0.000198357143499675, + "loss": 1.3416, + "step": 2704 + }, + { + "epoch": 0.06945681496852137, + "grad_norm": 0.92578125, + "learning_rate": 0.00019835633752692276, + "loss": 1.3109, + "step": 2705 + }, + { + "epoch": 0.06948249216444319, + "grad_norm": 0.875, + "learning_rate": 0.00019835553135815522, + "loss": 1.2434, + "step": 2706 + }, + { + "epoch": 0.069508169360365, + "grad_norm": 0.9296875, + "learning_rate": 0.00019835472499337402, + "loss": 1.1841, + "step": 2707 + }, + { + "epoch": 0.06953384655628682, + "grad_norm": 0.81640625, + "learning_rate": 0.0001983539184325807, + "loss": 1.217, + "step": 2708 + }, + { + "epoch": 0.06955952375220864, + "grad_norm": 0.81640625, + "learning_rate": 0.00019835311167577696, + "loss": 1.3279, + "step": 2709 + }, + { + "epoch": 0.06958520094813046, + "grad_norm": 0.875, + "learning_rate": 0.00019835230472296428, + "loss": 1.1702, + "step": 2710 + }, + { + "epoch": 0.06961087814405229, + "grad_norm": 0.828125, + "learning_rate": 0.00019835149757414436, + "loss": 1.1429, + "step": 2711 + }, + { + "epoch": 0.0696365553399741, + "grad_norm": 0.91015625, + "learning_rate": 0.00019835069022931882, + "loss": 1.3895, + "step": 2712 + }, + { + "epoch": 0.06966223253589592, + "grad_norm": 0.8515625, + "learning_rate": 0.00019834988268848923, + "loss": 1.2791, + "step": 2713 + }, + { + "epoch": 0.06968790973181774, + "grad_norm": 0.92578125, + "learning_rate": 0.0001983490749516572, + "loss": 1.1623, + "step": 2714 + }, + { + "epoch": 0.06971358692773956, + "grad_norm": 0.89453125, + "learning_rate": 0.00019834826701882436, + "loss": 1.1019, + "step": 2715 + }, + { + "epoch": 0.06973926412366138, + "grad_norm": 0.90625, + "learning_rate": 0.00019834745888999228, + "loss": 1.2098, + "step": 2716 + }, + { + "epoch": 0.06976494131958319, + "grad_norm": 0.8828125, + "learning_rate": 0.00019834665056516263, + "loss": 1.3257, + "step": 2717 + }, + { + "epoch": 0.06979061851550501, + "grad_norm": 0.90234375, + "learning_rate": 0.00019834584204433698, + "loss": 1.1319, + "step": 2718 + }, + { + "epoch": 0.06981629571142683, + "grad_norm": 0.8828125, + "learning_rate": 0.00019834503332751693, + "loss": 1.1939, + "step": 2719 + }, + { + "epoch": 0.06984197290734866, + "grad_norm": 0.94921875, + "learning_rate": 0.00019834422441470412, + "loss": 1.2615, + "step": 2720 + }, + { + "epoch": 0.06986765010327048, + "grad_norm": 0.84765625, + "learning_rate": 0.00019834341530590015, + "loss": 1.1781, + "step": 2721 + }, + { + "epoch": 0.06989332729919229, + "grad_norm": 1.015625, + "learning_rate": 0.00019834260600110667, + "loss": 1.2718, + "step": 2722 + }, + { + "epoch": 0.06991900449511411, + "grad_norm": 0.8984375, + "learning_rate": 0.00019834179650032524, + "loss": 1.2442, + "step": 2723 + }, + { + "epoch": 0.06994468169103593, + "grad_norm": 0.85546875, + "learning_rate": 0.00019834098680355746, + "loss": 1.1262, + "step": 2724 + }, + { + "epoch": 0.06997035888695775, + "grad_norm": 0.8984375, + "learning_rate": 0.000198340176910805, + "loss": 1.1685, + "step": 2725 + }, + { + "epoch": 0.06999603608287958, + "grad_norm": 0.9921875, + "learning_rate": 0.00019833936682206948, + "loss": 1.2928, + "step": 2726 + }, + { + "epoch": 0.07002171327880138, + "grad_norm": 0.94921875, + "learning_rate": 0.00019833855653735244, + "loss": 1.2885, + "step": 2727 + }, + { + "epoch": 0.0700473904747232, + "grad_norm": 0.84765625, + "learning_rate": 0.00019833774605665556, + "loss": 1.0235, + "step": 2728 + }, + { + "epoch": 0.07007306767064503, + "grad_norm": 0.8359375, + "learning_rate": 0.00019833693537998044, + "loss": 1.3848, + "step": 2729 + }, + { + "epoch": 0.07009874486656685, + "grad_norm": 0.94140625, + "learning_rate": 0.00019833612450732868, + "loss": 1.2387, + "step": 2730 + }, + { + "epoch": 0.07012442206248867, + "grad_norm": 0.9140625, + "learning_rate": 0.00019833531343870192, + "loss": 1.3113, + "step": 2731 + }, + { + "epoch": 0.07015009925841048, + "grad_norm": 0.859375, + "learning_rate": 0.00019833450217410174, + "loss": 1.261, + "step": 2732 + }, + { + "epoch": 0.0701757764543323, + "grad_norm": 0.79296875, + "learning_rate": 0.0001983336907135298, + "loss": 1.2007, + "step": 2733 + }, + { + "epoch": 0.07020145365025413, + "grad_norm": 0.83984375, + "learning_rate": 0.0001983328790569877, + "loss": 1.3723, + "step": 2734 + }, + { + "epoch": 0.07022713084617595, + "grad_norm": 0.81640625, + "learning_rate": 0.00019833206720447705, + "loss": 1.2698, + "step": 2735 + }, + { + "epoch": 0.07025280804209777, + "grad_norm": 1.0703125, + "learning_rate": 0.00019833125515599946, + "loss": 1.1595, + "step": 2736 + }, + { + "epoch": 0.07027848523801958, + "grad_norm": 0.9453125, + "learning_rate": 0.00019833044291155659, + "loss": 1.2174, + "step": 2737 + }, + { + "epoch": 0.0703041624339414, + "grad_norm": 0.96875, + "learning_rate": 0.00019832963047115, + "loss": 1.3904, + "step": 2738 + }, + { + "epoch": 0.07032983962986322, + "grad_norm": 0.88671875, + "learning_rate": 0.00019832881783478134, + "loss": 1.3716, + "step": 2739 + }, + { + "epoch": 0.07035551682578504, + "grad_norm": 0.92578125, + "learning_rate": 0.0001983280050024522, + "loss": 1.2501, + "step": 2740 + }, + { + "epoch": 0.07038119402170685, + "grad_norm": 0.91796875, + "learning_rate": 0.00019832719197416428, + "loss": 1.1694, + "step": 2741 + }, + { + "epoch": 0.07040687121762867, + "grad_norm": 0.79296875, + "learning_rate": 0.00019832637874991912, + "loss": 1.2991, + "step": 2742 + }, + { + "epoch": 0.0704325484135505, + "grad_norm": 0.84375, + "learning_rate": 0.0001983255653297184, + "loss": 1.1516, + "step": 2743 + }, + { + "epoch": 0.07045822560947232, + "grad_norm": 0.84375, + "learning_rate": 0.00019832475171356368, + "loss": 1.3292, + "step": 2744 + }, + { + "epoch": 0.07048390280539414, + "grad_norm": 0.86328125, + "learning_rate": 0.00019832393790145662, + "loss": 1.2701, + "step": 2745 + }, + { + "epoch": 0.07050958000131595, + "grad_norm": 1.015625, + "learning_rate": 0.00019832312389339885, + "loss": 1.1789, + "step": 2746 + }, + { + "epoch": 0.07053525719723777, + "grad_norm": 0.859375, + "learning_rate": 0.00019832230968939193, + "loss": 1.2139, + "step": 2747 + }, + { + "epoch": 0.0705609343931596, + "grad_norm": 1.0078125, + "learning_rate": 0.00019832149528943758, + "loss": 1.3393, + "step": 2748 + }, + { + "epoch": 0.07058661158908142, + "grad_norm": 0.921875, + "learning_rate": 0.00019832068069353736, + "loss": 1.272, + "step": 2749 + }, + { + "epoch": 0.07061228878500324, + "grad_norm": 1.0, + "learning_rate": 0.0001983198659016929, + "loss": 1.2665, + "step": 2750 + }, + { + "epoch": 0.07063796598092505, + "grad_norm": 0.98828125, + "learning_rate": 0.00019831905091390582, + "loss": 1.1569, + "step": 2751 + }, + { + "epoch": 0.07066364317684687, + "grad_norm": 0.875, + "learning_rate": 0.00019831823573017774, + "loss": 1.3752, + "step": 2752 + }, + { + "epoch": 0.07068932037276869, + "grad_norm": 0.8671875, + "learning_rate": 0.0001983174203505103, + "loss": 1.3193, + "step": 2753 + }, + { + "epoch": 0.07071499756869051, + "grad_norm": 0.91015625, + "learning_rate": 0.00019831660477490515, + "loss": 1.2465, + "step": 2754 + }, + { + "epoch": 0.07074067476461234, + "grad_norm": 0.953125, + "learning_rate": 0.00019831578900336388, + "loss": 1.2383, + "step": 2755 + }, + { + "epoch": 0.07076635196053414, + "grad_norm": 0.890625, + "learning_rate": 0.0001983149730358881, + "loss": 1.2919, + "step": 2756 + }, + { + "epoch": 0.07079202915645597, + "grad_norm": 0.8828125, + "learning_rate": 0.00019831415687247947, + "loss": 1.4165, + "step": 2757 + }, + { + "epoch": 0.07081770635237779, + "grad_norm": 0.859375, + "learning_rate": 0.00019831334051313961, + "loss": 1.2657, + "step": 2758 + }, + { + "epoch": 0.07084338354829961, + "grad_norm": 0.86328125, + "learning_rate": 0.00019831252395787017, + "loss": 1.1052, + "step": 2759 + }, + { + "epoch": 0.07086906074422143, + "grad_norm": 0.97265625, + "learning_rate": 0.0001983117072066727, + "loss": 1.3421, + "step": 2760 + }, + { + "epoch": 0.07089473794014324, + "grad_norm": 0.9921875, + "learning_rate": 0.00019831089025954893, + "loss": 1.4382, + "step": 2761 + }, + { + "epoch": 0.07092041513606506, + "grad_norm": 0.953125, + "learning_rate": 0.0001983100731165004, + "loss": 1.2161, + "step": 2762 + }, + { + "epoch": 0.07094609233198688, + "grad_norm": 0.87890625, + "learning_rate": 0.00019830925577752877, + "loss": 1.4057, + "step": 2763 + }, + { + "epoch": 0.0709717695279087, + "grad_norm": 0.7890625, + "learning_rate": 0.00019830843824263566, + "loss": 1.1795, + "step": 2764 + }, + { + "epoch": 0.07099744672383053, + "grad_norm": 0.90625, + "learning_rate": 0.00019830762051182278, + "loss": 1.0756, + "step": 2765 + }, + { + "epoch": 0.07102312391975234, + "grad_norm": 0.953125, + "learning_rate": 0.00019830680258509166, + "loss": 1.0941, + "step": 2766 + }, + { + "epoch": 0.07104880111567416, + "grad_norm": 0.96484375, + "learning_rate": 0.00019830598446244393, + "loss": 1.2399, + "step": 2767 + }, + { + "epoch": 0.07107447831159598, + "grad_norm": 0.98046875, + "learning_rate": 0.00019830516614388128, + "loss": 1.1577, + "step": 2768 + }, + { + "epoch": 0.0711001555075178, + "grad_norm": 0.84765625, + "learning_rate": 0.0001983043476294053, + "loss": 1.1403, + "step": 2769 + }, + { + "epoch": 0.07112583270343963, + "grad_norm": 0.890625, + "learning_rate": 0.00019830352891901765, + "loss": 1.0915, + "step": 2770 + }, + { + "epoch": 0.07115150989936143, + "grad_norm": 0.8828125, + "learning_rate": 0.00019830271001271995, + "loss": 1.2511, + "step": 2771 + }, + { + "epoch": 0.07117718709528326, + "grad_norm": 0.89453125, + "learning_rate": 0.0001983018909105138, + "loss": 1.1923, + "step": 2772 + }, + { + "epoch": 0.07120286429120508, + "grad_norm": 0.8984375, + "learning_rate": 0.0001983010716124009, + "loss": 1.2925, + "step": 2773 + }, + { + "epoch": 0.0712285414871269, + "grad_norm": 0.84375, + "learning_rate": 0.0001983002521183828, + "loss": 1.1815, + "step": 2774 + }, + { + "epoch": 0.07125421868304872, + "grad_norm": 0.83984375, + "learning_rate": 0.00019829943242846124, + "loss": 1.0761, + "step": 2775 + }, + { + "epoch": 0.07127989587897053, + "grad_norm": 0.93359375, + "learning_rate": 0.00019829861254263772, + "loss": 1.2138, + "step": 2776 + }, + { + "epoch": 0.07130557307489235, + "grad_norm": 0.92578125, + "learning_rate": 0.000198297792460914, + "loss": 1.1049, + "step": 2777 + }, + { + "epoch": 0.07133125027081418, + "grad_norm": 0.84765625, + "learning_rate": 0.00019829697218329165, + "loss": 1.1479, + "step": 2778 + }, + { + "epoch": 0.071356927466736, + "grad_norm": 1.09375, + "learning_rate": 0.00019829615170977228, + "loss": 1.3467, + "step": 2779 + }, + { + "epoch": 0.07138260466265782, + "grad_norm": 1.09375, + "learning_rate": 0.00019829533104035759, + "loss": 1.2629, + "step": 2780 + }, + { + "epoch": 0.07140828185857963, + "grad_norm": 0.9140625, + "learning_rate": 0.00019829451017504916, + "loss": 1.3317, + "step": 2781 + }, + { + "epoch": 0.07143395905450145, + "grad_norm": 0.85546875, + "learning_rate": 0.00019829368911384866, + "loss": 1.2623, + "step": 2782 + }, + { + "epoch": 0.07145963625042327, + "grad_norm": 1.21875, + "learning_rate": 0.0001982928678567577, + "loss": 1.271, + "step": 2783 + }, + { + "epoch": 0.0714853134463451, + "grad_norm": 0.89453125, + "learning_rate": 0.000198292046403778, + "loss": 1.3831, + "step": 2784 + }, + { + "epoch": 0.07151099064226692, + "grad_norm": 0.84765625, + "learning_rate": 0.00019829122475491104, + "loss": 1.2126, + "step": 2785 + }, + { + "epoch": 0.07153666783818872, + "grad_norm": 0.96484375, + "learning_rate": 0.0001982904029101586, + "loss": 1.3015, + "step": 2786 + }, + { + "epoch": 0.07156234503411055, + "grad_norm": 0.81640625, + "learning_rate": 0.00019828958086952225, + "loss": 1.1366, + "step": 2787 + }, + { + "epoch": 0.07158802223003237, + "grad_norm": 0.97265625, + "learning_rate": 0.00019828875863300366, + "loss": 1.1911, + "step": 2788 + }, + { + "epoch": 0.07161369942595419, + "grad_norm": 1.578125, + "learning_rate": 0.00019828793620060443, + "loss": 1.1233, + "step": 2789 + }, + { + "epoch": 0.07163937662187601, + "grad_norm": 0.921875, + "learning_rate": 0.00019828711357232625, + "loss": 1.1768, + "step": 2790 + }, + { + "epoch": 0.07166505381779782, + "grad_norm": 1.109375, + "learning_rate": 0.0001982862907481707, + "loss": 1.2874, + "step": 2791 + }, + { + "epoch": 0.07169073101371964, + "grad_norm": 0.8828125, + "learning_rate": 0.00019828546772813948, + "loss": 1.1802, + "step": 2792 + }, + { + "epoch": 0.07171640820964147, + "grad_norm": 1.0546875, + "learning_rate": 0.00019828464451223418, + "loss": 1.079, + "step": 2793 + }, + { + "epoch": 0.07174208540556329, + "grad_norm": 1.015625, + "learning_rate": 0.00019828382110045648, + "loss": 1.447, + "step": 2794 + }, + { + "epoch": 0.07176776260148511, + "grad_norm": 1.09375, + "learning_rate": 0.00019828299749280798, + "loss": 1.1174, + "step": 2795 + }, + { + "epoch": 0.07179343979740692, + "grad_norm": 1.0859375, + "learning_rate": 0.00019828217368929036, + "loss": 1.4195, + "step": 2796 + }, + { + "epoch": 0.07181911699332874, + "grad_norm": 0.95703125, + "learning_rate": 0.00019828134968990523, + "loss": 1.2582, + "step": 2797 + }, + { + "epoch": 0.07184479418925056, + "grad_norm": 0.90625, + "learning_rate": 0.00019828052549465425, + "loss": 1.311, + "step": 2798 + }, + { + "epoch": 0.07187047138517239, + "grad_norm": 0.90234375, + "learning_rate": 0.0001982797011035391, + "loss": 1.2783, + "step": 2799 + }, + { + "epoch": 0.07189614858109421, + "grad_norm": 0.8671875, + "learning_rate": 0.00019827887651656135, + "loss": 1.197, + "step": 2800 + }, + { + "epoch": 0.07192182577701602, + "grad_norm": 0.9921875, + "learning_rate": 0.00019827805173372264, + "loss": 1.2337, + "step": 2801 + }, + { + "epoch": 0.07194750297293784, + "grad_norm": 0.953125, + "learning_rate": 0.0001982772267550247, + "loss": 1.3409, + "step": 2802 + }, + { + "epoch": 0.07197318016885966, + "grad_norm": 0.9609375, + "learning_rate": 0.00019827640158046912, + "loss": 1.36, + "step": 2803 + }, + { + "epoch": 0.07199885736478148, + "grad_norm": 0.96484375, + "learning_rate": 0.00019827557621005751, + "loss": 1.3119, + "step": 2804 + }, + { + "epoch": 0.0720245345607033, + "grad_norm": 0.9375, + "learning_rate": 0.0001982747506437916, + "loss": 1.2796, + "step": 2805 + }, + { + "epoch": 0.07205021175662511, + "grad_norm": 0.84375, + "learning_rate": 0.000198273924881673, + "loss": 1.1467, + "step": 2806 + }, + { + "epoch": 0.07207588895254693, + "grad_norm": 0.8984375, + "learning_rate": 0.0001982730989237033, + "loss": 1.2538, + "step": 2807 + }, + { + "epoch": 0.07210156614846876, + "grad_norm": 0.921875, + "learning_rate": 0.0001982722727698842, + "loss": 1.1173, + "step": 2808 + }, + { + "epoch": 0.07212724334439058, + "grad_norm": 0.91015625, + "learning_rate": 0.0001982714464202174, + "loss": 1.3651, + "step": 2809 + }, + { + "epoch": 0.0721529205403124, + "grad_norm": 0.86328125, + "learning_rate": 0.0001982706198747044, + "loss": 1.237, + "step": 2810 + }, + { + "epoch": 0.07217859773623421, + "grad_norm": 0.90625, + "learning_rate": 0.00019826979313334694, + "loss": 1.1621, + "step": 2811 + }, + { + "epoch": 0.07220427493215603, + "grad_norm": 0.95703125, + "learning_rate": 0.0001982689661961467, + "loss": 1.3059, + "step": 2812 + }, + { + "epoch": 0.07222995212807785, + "grad_norm": 0.90625, + "learning_rate": 0.00019826813906310528, + "loss": 1.2354, + "step": 2813 + }, + { + "epoch": 0.07225562932399968, + "grad_norm": 0.94921875, + "learning_rate": 0.0001982673117342243, + "loss": 1.3089, + "step": 2814 + }, + { + "epoch": 0.0722813065199215, + "grad_norm": 0.91796875, + "learning_rate": 0.00019826648420950547, + "loss": 1.2351, + "step": 2815 + }, + { + "epoch": 0.0723069837158433, + "grad_norm": 0.90625, + "learning_rate": 0.0001982656564889504, + "loss": 1.1956, + "step": 2816 + }, + { + "epoch": 0.07233266091176513, + "grad_norm": 0.94921875, + "learning_rate": 0.0001982648285725608, + "loss": 1.2215, + "step": 2817 + }, + { + "epoch": 0.07235833810768695, + "grad_norm": 0.94921875, + "learning_rate": 0.00019826400046033823, + "loss": 1.3396, + "step": 2818 + }, + { + "epoch": 0.07238401530360877, + "grad_norm": 0.88671875, + "learning_rate": 0.00019826317215228438, + "loss": 1.1436, + "step": 2819 + }, + { + "epoch": 0.0724096924995306, + "grad_norm": 0.90234375, + "learning_rate": 0.00019826234364840093, + "loss": 1.22, + "step": 2820 + }, + { + "epoch": 0.0724353696954524, + "grad_norm": 0.91015625, + "learning_rate": 0.0001982615149486895, + "loss": 1.2626, + "step": 2821 + }, + { + "epoch": 0.07246104689137423, + "grad_norm": 0.94921875, + "learning_rate": 0.00019826068605315172, + "loss": 1.1073, + "step": 2822 + }, + { + "epoch": 0.07248672408729605, + "grad_norm": 0.88671875, + "learning_rate": 0.0001982598569617893, + "loss": 1.0957, + "step": 2823 + }, + { + "epoch": 0.07251240128321787, + "grad_norm": 0.96484375, + "learning_rate": 0.00019825902767460382, + "loss": 1.1902, + "step": 2824 + }, + { + "epoch": 0.07253807847913969, + "grad_norm": 0.84765625, + "learning_rate": 0.000198258198191597, + "loss": 1.2485, + "step": 2825 + }, + { + "epoch": 0.0725637556750615, + "grad_norm": 0.92578125, + "learning_rate": 0.00019825736851277045, + "loss": 1.2015, + "step": 2826 + }, + { + "epoch": 0.07258943287098332, + "grad_norm": 1.109375, + "learning_rate": 0.0001982565386381259, + "loss": 1.302, + "step": 2827 + }, + { + "epoch": 0.07261511006690514, + "grad_norm": 0.91796875, + "learning_rate": 0.00019825570856766487, + "loss": 1.3781, + "step": 2828 + }, + { + "epoch": 0.07264078726282697, + "grad_norm": 0.98046875, + "learning_rate": 0.0001982548783013891, + "loss": 1.3897, + "step": 2829 + }, + { + "epoch": 0.07266646445874879, + "grad_norm": 1.21875, + "learning_rate": 0.00019825404783930026, + "loss": 1.1313, + "step": 2830 + }, + { + "epoch": 0.0726921416546706, + "grad_norm": 0.875, + "learning_rate": 0.00019825321718139997, + "loss": 1.2727, + "step": 2831 + }, + { + "epoch": 0.07271781885059242, + "grad_norm": 0.91796875, + "learning_rate": 0.0001982523863276899, + "loss": 1.2827, + "step": 2832 + }, + { + "epoch": 0.07274349604651424, + "grad_norm": 0.88671875, + "learning_rate": 0.00019825155527817168, + "loss": 1.2233, + "step": 2833 + }, + { + "epoch": 0.07276917324243606, + "grad_norm": 0.94921875, + "learning_rate": 0.00019825072403284702, + "loss": 1.313, + "step": 2834 + }, + { + "epoch": 0.07279485043835789, + "grad_norm": 0.8515625, + "learning_rate": 0.0001982498925917175, + "loss": 1.2505, + "step": 2835 + }, + { + "epoch": 0.0728205276342797, + "grad_norm": 0.91015625, + "learning_rate": 0.00019824906095478484, + "loss": 1.2942, + "step": 2836 + }, + { + "epoch": 0.07284620483020152, + "grad_norm": 0.90625, + "learning_rate": 0.00019824822912205067, + "loss": 1.195, + "step": 2837 + }, + { + "epoch": 0.07287188202612334, + "grad_norm": 0.87109375, + "learning_rate": 0.00019824739709351665, + "loss": 1.3502, + "step": 2838 + }, + { + "epoch": 0.07289755922204516, + "grad_norm": 0.90625, + "learning_rate": 0.00019824656486918448, + "loss": 1.2646, + "step": 2839 + }, + { + "epoch": 0.07292323641796697, + "grad_norm": 0.859375, + "learning_rate": 0.00019824573244905572, + "loss": 1.3529, + "step": 2840 + }, + { + "epoch": 0.07294891361388879, + "grad_norm": 0.8671875, + "learning_rate": 0.00019824489983313211, + "loss": 1.2687, + "step": 2841 + }, + { + "epoch": 0.07297459080981061, + "grad_norm": 0.77734375, + "learning_rate": 0.0001982440670214153, + "loss": 1.1307, + "step": 2842 + }, + { + "epoch": 0.07300026800573244, + "grad_norm": 0.875, + "learning_rate": 0.00019824323401390696, + "loss": 1.1441, + "step": 2843 + }, + { + "epoch": 0.07302594520165426, + "grad_norm": 0.85546875, + "learning_rate": 0.0001982424008106087, + "loss": 1.2633, + "step": 2844 + }, + { + "epoch": 0.07305162239757607, + "grad_norm": 0.8671875, + "learning_rate": 0.00019824156741152222, + "loss": 1.1579, + "step": 2845 + }, + { + "epoch": 0.07307729959349789, + "grad_norm": 0.86328125, + "learning_rate": 0.00019824073381664915, + "loss": 1.0947, + "step": 2846 + }, + { + "epoch": 0.07310297678941971, + "grad_norm": 0.875, + "learning_rate": 0.0001982399000259912, + "loss": 1.3037, + "step": 2847 + }, + { + "epoch": 0.07312865398534153, + "grad_norm": 0.8515625, + "learning_rate": 0.00019823906603954998, + "loss": 1.2909, + "step": 2848 + }, + { + "epoch": 0.07315433118126335, + "grad_norm": 0.82421875, + "learning_rate": 0.0001982382318573272, + "loss": 1.3853, + "step": 2849 + }, + { + "epoch": 0.07318000837718516, + "grad_norm": 0.91015625, + "learning_rate": 0.00019823739747932447, + "loss": 1.2858, + "step": 2850 + }, + { + "epoch": 0.07320568557310698, + "grad_norm": 0.98828125, + "learning_rate": 0.0001982365629055435, + "loss": 1.3219, + "step": 2851 + }, + { + "epoch": 0.07323136276902881, + "grad_norm": 0.87890625, + "learning_rate": 0.00019823572813598594, + "loss": 1.1728, + "step": 2852 + }, + { + "epoch": 0.07325703996495063, + "grad_norm": 0.88671875, + "learning_rate": 0.00019823489317065342, + "loss": 1.2258, + "step": 2853 + }, + { + "epoch": 0.07328271716087245, + "grad_norm": 0.8515625, + "learning_rate": 0.00019823405800954767, + "loss": 1.1069, + "step": 2854 + }, + { + "epoch": 0.07330839435679426, + "grad_norm": 0.88671875, + "learning_rate": 0.00019823322265267028, + "loss": 1.1905, + "step": 2855 + }, + { + "epoch": 0.07333407155271608, + "grad_norm": 0.81640625, + "learning_rate": 0.00019823238710002297, + "loss": 1.2566, + "step": 2856 + }, + { + "epoch": 0.0733597487486379, + "grad_norm": 0.87109375, + "learning_rate": 0.00019823155135160737, + "loss": 1.0591, + "step": 2857 + }, + { + "epoch": 0.07338542594455973, + "grad_norm": 0.859375, + "learning_rate": 0.00019823071540742517, + "loss": 1.1, + "step": 2858 + }, + { + "epoch": 0.07341110314048155, + "grad_norm": 0.9140625, + "learning_rate": 0.00019822987926747802, + "loss": 1.1854, + "step": 2859 + }, + { + "epoch": 0.07343678033640336, + "grad_norm": 0.92578125, + "learning_rate": 0.0001982290429317676, + "loss": 1.0551, + "step": 2860 + }, + { + "epoch": 0.07346245753232518, + "grad_norm": 0.80859375, + "learning_rate": 0.0001982282064002956, + "loss": 1.0582, + "step": 2861 + }, + { + "epoch": 0.073488134728247, + "grad_norm": 0.875, + "learning_rate": 0.0001982273696730636, + "loss": 1.1562, + "step": 2862 + }, + { + "epoch": 0.07351381192416882, + "grad_norm": 0.859375, + "learning_rate": 0.00019822653275007338, + "loss": 1.3228, + "step": 2863 + }, + { + "epoch": 0.07353948912009065, + "grad_norm": 0.8359375, + "learning_rate": 0.0001982256956313265, + "loss": 1.2372, + "step": 2864 + }, + { + "epoch": 0.07356516631601245, + "grad_norm": 0.87890625, + "learning_rate": 0.00019822485831682472, + "loss": 1.1474, + "step": 2865 + }, + { + "epoch": 0.07359084351193428, + "grad_norm": 0.8515625, + "learning_rate": 0.00019822402080656967, + "loss": 1.3195, + "step": 2866 + }, + { + "epoch": 0.0736165207078561, + "grad_norm": 0.8828125, + "learning_rate": 0.00019822318310056303, + "loss": 1.1602, + "step": 2867 + }, + { + "epoch": 0.07364219790377792, + "grad_norm": 0.83984375, + "learning_rate": 0.00019822234519880643, + "loss": 1.1606, + "step": 2868 + }, + { + "epoch": 0.07366787509969974, + "grad_norm": 0.921875, + "learning_rate": 0.00019822150710130157, + "loss": 1.3146, + "step": 2869 + }, + { + "epoch": 0.07369355229562155, + "grad_norm": 0.88671875, + "learning_rate": 0.00019822066880805013, + "loss": 1.2999, + "step": 2870 + }, + { + "epoch": 0.07371922949154337, + "grad_norm": 0.90625, + "learning_rate": 0.00019821983031905377, + "loss": 1.1915, + "step": 2871 + }, + { + "epoch": 0.0737449066874652, + "grad_norm": 0.87890625, + "learning_rate": 0.00019821899163431414, + "loss": 1.2481, + "step": 2872 + }, + { + "epoch": 0.07377058388338702, + "grad_norm": 0.88671875, + "learning_rate": 0.00019821815275383294, + "loss": 1.3105, + "step": 2873 + }, + { + "epoch": 0.07379626107930884, + "grad_norm": 0.859375, + "learning_rate": 0.00019821731367761184, + "loss": 1.0851, + "step": 2874 + }, + { + "epoch": 0.07382193827523065, + "grad_norm": 0.84375, + "learning_rate": 0.00019821647440565249, + "loss": 1.2549, + "step": 2875 + }, + { + "epoch": 0.07384761547115247, + "grad_norm": 0.88671875, + "learning_rate": 0.0001982156349379566, + "loss": 1.301, + "step": 2876 + }, + { + "epoch": 0.07387329266707429, + "grad_norm": 0.84765625, + "learning_rate": 0.0001982147952745258, + "loss": 1.3145, + "step": 2877 + }, + { + "epoch": 0.07389896986299611, + "grad_norm": 0.890625, + "learning_rate": 0.00019821395541536182, + "loss": 1.2252, + "step": 2878 + }, + { + "epoch": 0.07392464705891794, + "grad_norm": 0.9140625, + "learning_rate": 0.00019821311536046628, + "loss": 1.273, + "step": 2879 + }, + { + "epoch": 0.07395032425483974, + "grad_norm": 0.85546875, + "learning_rate": 0.00019821227510984088, + "loss": 1.2575, + "step": 2880 + }, + { + "epoch": 0.07397600145076157, + "grad_norm": 0.875, + "learning_rate": 0.00019821143466348725, + "loss": 1.1918, + "step": 2881 + }, + { + "epoch": 0.07400167864668339, + "grad_norm": 0.8984375, + "learning_rate": 0.00019821059402140716, + "loss": 1.2132, + "step": 2882 + }, + { + "epoch": 0.07402735584260521, + "grad_norm": 0.9453125, + "learning_rate": 0.0001982097531836022, + "loss": 1.3432, + "step": 2883 + }, + { + "epoch": 0.07405303303852703, + "grad_norm": 0.91015625, + "learning_rate": 0.00019820891215007407, + "loss": 1.3749, + "step": 2884 + }, + { + "epoch": 0.07407871023444884, + "grad_norm": 0.86328125, + "learning_rate": 0.00019820807092082445, + "loss": 1.1522, + "step": 2885 + }, + { + "epoch": 0.07410438743037066, + "grad_norm": 0.98828125, + "learning_rate": 0.000198207229495855, + "loss": 1.3482, + "step": 2886 + }, + { + "epoch": 0.07413006462629249, + "grad_norm": 0.84765625, + "learning_rate": 0.00019820638787516743, + "loss": 1.2224, + "step": 2887 + }, + { + "epoch": 0.07415574182221431, + "grad_norm": 0.8984375, + "learning_rate": 0.0001982055460587634, + "loss": 1.168, + "step": 2888 + }, + { + "epoch": 0.07418141901813613, + "grad_norm": 0.90625, + "learning_rate": 0.0001982047040466446, + "loss": 1.2771, + "step": 2889 + }, + { + "epoch": 0.07420709621405794, + "grad_norm": 0.875, + "learning_rate": 0.00019820386183881268, + "loss": 1.243, + "step": 2890 + }, + { + "epoch": 0.07423277340997976, + "grad_norm": 0.9140625, + "learning_rate": 0.0001982030194352693, + "loss": 1.3289, + "step": 2891 + }, + { + "epoch": 0.07425845060590158, + "grad_norm": 0.890625, + "learning_rate": 0.00019820217683601623, + "loss": 1.0944, + "step": 2892 + }, + { + "epoch": 0.0742841278018234, + "grad_norm": 0.8515625, + "learning_rate": 0.00019820133404105508, + "loss": 1.1939, + "step": 2893 + }, + { + "epoch": 0.07430980499774523, + "grad_norm": 0.9765625, + "learning_rate": 0.00019820049105038755, + "loss": 1.2049, + "step": 2894 + }, + { + "epoch": 0.07433548219366704, + "grad_norm": 0.95703125, + "learning_rate": 0.00019819964786401526, + "loss": 1.1444, + "step": 2895 + }, + { + "epoch": 0.07436115938958886, + "grad_norm": 0.80859375, + "learning_rate": 0.00019819880448193998, + "loss": 1.0059, + "step": 2896 + }, + { + "epoch": 0.07438683658551068, + "grad_norm": 0.84375, + "learning_rate": 0.00019819796090416337, + "loss": 1.2008, + "step": 2897 + }, + { + "epoch": 0.0744125137814325, + "grad_norm": 0.83203125, + "learning_rate": 0.00019819711713068707, + "loss": 1.0023, + "step": 2898 + }, + { + "epoch": 0.07443819097735432, + "grad_norm": 0.91015625, + "learning_rate": 0.0001981962731615128, + "loss": 1.1831, + "step": 2899 + }, + { + "epoch": 0.07446386817327613, + "grad_norm": 0.875, + "learning_rate": 0.00019819542899664221, + "loss": 1.1112, + "step": 2900 + }, + { + "epoch": 0.07448954536919795, + "grad_norm": 0.8203125, + "learning_rate": 0.000198194584636077, + "loss": 1.2917, + "step": 2901 + }, + { + "epoch": 0.07451522256511978, + "grad_norm": 0.8671875, + "learning_rate": 0.00019819374007981884, + "loss": 1.2404, + "step": 2902 + }, + { + "epoch": 0.0745408997610416, + "grad_norm": 0.8203125, + "learning_rate": 0.00019819289532786946, + "loss": 1.1728, + "step": 2903 + }, + { + "epoch": 0.07456657695696342, + "grad_norm": 0.91015625, + "learning_rate": 0.0001981920503802305, + "loss": 1.3043, + "step": 2904 + }, + { + "epoch": 0.07459225415288523, + "grad_norm": 0.84765625, + "learning_rate": 0.00019819120523690363, + "loss": 1.2243, + "step": 2905 + }, + { + "epoch": 0.07461793134880705, + "grad_norm": 1.0078125, + "learning_rate": 0.00019819035989789057, + "loss": 1.2427, + "step": 2906 + }, + { + "epoch": 0.07464360854472887, + "grad_norm": 0.87109375, + "learning_rate": 0.00019818951436319303, + "loss": 1.0878, + "step": 2907 + }, + { + "epoch": 0.0746692857406507, + "grad_norm": 0.8359375, + "learning_rate": 0.0001981886686328126, + "loss": 1.1846, + "step": 2908 + }, + { + "epoch": 0.07469496293657252, + "grad_norm": 0.84375, + "learning_rate": 0.00019818782270675104, + "loss": 1.2901, + "step": 2909 + }, + { + "epoch": 0.07472064013249433, + "grad_norm": 0.8984375, + "learning_rate": 0.00019818697658501003, + "loss": 1.2502, + "step": 2910 + }, + { + "epoch": 0.07474631732841615, + "grad_norm": 0.90234375, + "learning_rate": 0.00019818613026759122, + "loss": 1.2001, + "step": 2911 + }, + { + "epoch": 0.07477199452433797, + "grad_norm": 0.921875, + "learning_rate": 0.00019818528375449635, + "loss": 1.2826, + "step": 2912 + }, + { + "epoch": 0.07479767172025979, + "grad_norm": 0.8671875, + "learning_rate": 0.0001981844370457271, + "loss": 1.3017, + "step": 2913 + }, + { + "epoch": 0.07482334891618161, + "grad_norm": 0.828125, + "learning_rate": 0.0001981835901412851, + "loss": 1.1525, + "step": 2914 + }, + { + "epoch": 0.07484902611210342, + "grad_norm": 0.9453125, + "learning_rate": 0.00019818274304117206, + "loss": 1.3293, + "step": 2915 + }, + { + "epoch": 0.07487470330802524, + "grad_norm": 0.9140625, + "learning_rate": 0.00019818189574538972, + "loss": 1.2715, + "step": 2916 + }, + { + "epoch": 0.07490038050394707, + "grad_norm": 1.03125, + "learning_rate": 0.0001981810482539397, + "loss": 1.1311, + "step": 2917 + }, + { + "epoch": 0.07492605769986889, + "grad_norm": 0.9609375, + "learning_rate": 0.00019818020056682373, + "loss": 1.1759, + "step": 2918 + }, + { + "epoch": 0.07495173489579071, + "grad_norm": 0.93359375, + "learning_rate": 0.0001981793526840435, + "loss": 1.1866, + "step": 2919 + }, + { + "epoch": 0.07497741209171252, + "grad_norm": 0.97265625, + "learning_rate": 0.00019817850460560068, + "loss": 1.0909, + "step": 2920 + }, + { + "epoch": 0.07500308928763434, + "grad_norm": 0.91796875, + "learning_rate": 0.00019817765633149697, + "loss": 1.1822, + "step": 2921 + }, + { + "epoch": 0.07502876648355616, + "grad_norm": 1.0, + "learning_rate": 0.00019817680786173405, + "loss": 1.1825, + "step": 2922 + }, + { + "epoch": 0.07505444367947799, + "grad_norm": 0.89453125, + "learning_rate": 0.0001981759591963136, + "loss": 1.2731, + "step": 2923 + }, + { + "epoch": 0.07508012087539981, + "grad_norm": 0.92578125, + "learning_rate": 0.0001981751103352374, + "loss": 1.2962, + "step": 2924 + }, + { + "epoch": 0.07510579807132162, + "grad_norm": 0.890625, + "learning_rate": 0.000198174261278507, + "loss": 1.0693, + "step": 2925 + }, + { + "epoch": 0.07513147526724344, + "grad_norm": 0.8203125, + "learning_rate": 0.00019817341202612424, + "loss": 1.1638, + "step": 2926 + }, + { + "epoch": 0.07515715246316526, + "grad_norm": 0.8984375, + "learning_rate": 0.00019817256257809068, + "loss": 1.1487, + "step": 2927 + }, + { + "epoch": 0.07518282965908708, + "grad_norm": 0.828125, + "learning_rate": 0.0001981717129344081, + "loss": 1.1848, + "step": 2928 + }, + { + "epoch": 0.0752085068550089, + "grad_norm": 0.921875, + "learning_rate": 0.00019817086309507814, + "loss": 1.3097, + "step": 2929 + }, + { + "epoch": 0.07523418405093071, + "grad_norm": 0.91796875, + "learning_rate": 0.00019817001306010251, + "loss": 1.2523, + "step": 2930 + }, + { + "epoch": 0.07525986124685254, + "grad_norm": 0.90234375, + "learning_rate": 0.00019816916282948294, + "loss": 1.3066, + "step": 2931 + }, + { + "epoch": 0.07528553844277436, + "grad_norm": 0.86328125, + "learning_rate": 0.00019816831240322108, + "loss": 1.2895, + "step": 2932 + }, + { + "epoch": 0.07531121563869618, + "grad_norm": 1.921875, + "learning_rate": 0.00019816746178131863, + "loss": 1.3618, + "step": 2933 + }, + { + "epoch": 0.075336892834618, + "grad_norm": 0.86328125, + "learning_rate": 0.00019816661096377735, + "loss": 1.2693, + "step": 2934 + }, + { + "epoch": 0.07536257003053981, + "grad_norm": 0.9609375, + "learning_rate": 0.00019816575995059883, + "loss": 1.1944, + "step": 2935 + }, + { + "epoch": 0.07538824722646163, + "grad_norm": 0.94140625, + "learning_rate": 0.00019816490874178483, + "loss": 1.194, + "step": 2936 + }, + { + "epoch": 0.07541392442238345, + "grad_norm": 0.83203125, + "learning_rate": 0.00019816405733733702, + "loss": 1.1929, + "step": 2937 + }, + { + "epoch": 0.07543960161830528, + "grad_norm": 0.94921875, + "learning_rate": 0.00019816320573725715, + "loss": 1.1457, + "step": 2938 + }, + { + "epoch": 0.0754652788142271, + "grad_norm": 0.88671875, + "learning_rate": 0.00019816235394154685, + "loss": 1.1833, + "step": 2939 + }, + { + "epoch": 0.07549095601014891, + "grad_norm": 0.85546875, + "learning_rate": 0.00019816150195020783, + "loss": 1.236, + "step": 2940 + }, + { + "epoch": 0.07551663320607073, + "grad_norm": 0.8515625, + "learning_rate": 0.00019816064976324184, + "loss": 1.2829, + "step": 2941 + }, + { + "epoch": 0.07554231040199255, + "grad_norm": 0.91796875, + "learning_rate": 0.00019815979738065055, + "loss": 1.1427, + "step": 2942 + }, + { + "epoch": 0.07556798759791437, + "grad_norm": 0.875, + "learning_rate": 0.0001981589448024356, + "loss": 1.0539, + "step": 2943 + }, + { + "epoch": 0.07559366479383618, + "grad_norm": 0.859375, + "learning_rate": 0.00019815809202859875, + "loss": 1.095, + "step": 2944 + }, + { + "epoch": 0.075619341989758, + "grad_norm": 0.83984375, + "learning_rate": 0.00019815723905914175, + "loss": 1.3226, + "step": 2945 + }, + { + "epoch": 0.07564501918567983, + "grad_norm": 0.96875, + "learning_rate": 0.00019815638589406616, + "loss": 1.1504, + "step": 2946 + }, + { + "epoch": 0.07567069638160165, + "grad_norm": 0.83203125, + "learning_rate": 0.0001981555325333738, + "loss": 1.1752, + "step": 2947 + }, + { + "epoch": 0.07569637357752347, + "grad_norm": 0.9375, + "learning_rate": 0.00019815467897706636, + "loss": 1.1789, + "step": 2948 + }, + { + "epoch": 0.07572205077344528, + "grad_norm": 0.90625, + "learning_rate": 0.00019815382522514545, + "loss": 1.3232, + "step": 2949 + }, + { + "epoch": 0.0757477279693671, + "grad_norm": 0.8828125, + "learning_rate": 0.00019815297127761286, + "loss": 1.2176, + "step": 2950 + }, + { + "epoch": 0.07577340516528892, + "grad_norm": 0.96875, + "learning_rate": 0.00019815211713447025, + "loss": 1.1768, + "step": 2951 + }, + { + "epoch": 0.07579908236121075, + "grad_norm": 0.8515625, + "learning_rate": 0.00019815126279571933, + "loss": 1.1881, + "step": 2952 + }, + { + "epoch": 0.07582475955713257, + "grad_norm": 0.8828125, + "learning_rate": 0.00019815040826136183, + "loss": 1.2897, + "step": 2953 + }, + { + "epoch": 0.07585043675305438, + "grad_norm": 0.96875, + "learning_rate": 0.00019814955353139943, + "loss": 1.2902, + "step": 2954 + }, + { + "epoch": 0.0758761139489762, + "grad_norm": 0.95703125, + "learning_rate": 0.00019814869860583384, + "loss": 1.2628, + "step": 2955 + }, + { + "epoch": 0.07590179114489802, + "grad_norm": 0.859375, + "learning_rate": 0.00019814784348466677, + "loss": 1.2888, + "step": 2956 + }, + { + "epoch": 0.07592746834081984, + "grad_norm": 0.9375, + "learning_rate": 0.0001981469881678999, + "loss": 1.2296, + "step": 2957 + }, + { + "epoch": 0.07595314553674166, + "grad_norm": 0.91796875, + "learning_rate": 0.00019814613265553495, + "loss": 1.3699, + "step": 2958 + }, + { + "epoch": 0.07597882273266347, + "grad_norm": 0.93359375, + "learning_rate": 0.00019814527694757362, + "loss": 1.2137, + "step": 2959 + }, + { + "epoch": 0.0760044999285853, + "grad_norm": 0.921875, + "learning_rate": 0.00019814442104401764, + "loss": 1.2538, + "step": 2960 + }, + { + "epoch": 0.07603017712450712, + "grad_norm": 0.9453125, + "learning_rate": 0.00019814356494486863, + "loss": 1.1658, + "step": 2961 + }, + { + "epoch": 0.07605585432042894, + "grad_norm": 0.89453125, + "learning_rate": 0.0001981427086501284, + "loss": 1.1207, + "step": 2962 + }, + { + "epoch": 0.07608153151635076, + "grad_norm": 0.85546875, + "learning_rate": 0.00019814185215979863, + "loss": 1.1761, + "step": 2963 + }, + { + "epoch": 0.07610720871227257, + "grad_norm": 1.0390625, + "learning_rate": 0.000198140995473881, + "loss": 1.1451, + "step": 2964 + }, + { + "epoch": 0.07613288590819439, + "grad_norm": 0.9609375, + "learning_rate": 0.00019814013859237726, + "loss": 1.1935, + "step": 2965 + }, + { + "epoch": 0.07615856310411621, + "grad_norm": 0.87109375, + "learning_rate": 0.00019813928151528903, + "loss": 1.1859, + "step": 2966 + }, + { + "epoch": 0.07618424030003804, + "grad_norm": 0.8359375, + "learning_rate": 0.00019813842424261813, + "loss": 1.1416, + "step": 2967 + }, + { + "epoch": 0.07620991749595986, + "grad_norm": 0.99609375, + "learning_rate": 0.00019813756677436619, + "loss": 1.1775, + "step": 2968 + }, + { + "epoch": 0.07623559469188167, + "grad_norm": 0.8984375, + "learning_rate": 0.00019813670911053495, + "loss": 1.1036, + "step": 2969 + }, + { + "epoch": 0.07626127188780349, + "grad_norm": 0.83984375, + "learning_rate": 0.00019813585125112612, + "loss": 1.2163, + "step": 2970 + }, + { + "epoch": 0.07628694908372531, + "grad_norm": 0.859375, + "learning_rate": 0.00019813499319614136, + "loss": 1.2171, + "step": 2971 + }, + { + "epoch": 0.07631262627964713, + "grad_norm": 0.86328125, + "learning_rate": 0.0001981341349455825, + "loss": 1.1192, + "step": 2972 + }, + { + "epoch": 0.07633830347556896, + "grad_norm": 1.015625, + "learning_rate": 0.0001981332764994511, + "loss": 1.4285, + "step": 2973 + }, + { + "epoch": 0.07636398067149076, + "grad_norm": 0.8984375, + "learning_rate": 0.000198132417857749, + "loss": 1.2152, + "step": 2974 + }, + { + "epoch": 0.07638965786741259, + "grad_norm": 0.9375, + "learning_rate": 0.00019813155902047785, + "loss": 1.1429, + "step": 2975 + }, + { + "epoch": 0.07641533506333441, + "grad_norm": 0.82421875, + "learning_rate": 0.00019813069998763933, + "loss": 1.0398, + "step": 2976 + }, + { + "epoch": 0.07644101225925623, + "grad_norm": 2.078125, + "learning_rate": 0.00019812984075923522, + "loss": 1.4037, + "step": 2977 + }, + { + "epoch": 0.07646668945517805, + "grad_norm": 0.91796875, + "learning_rate": 0.0001981289813352672, + "loss": 1.215, + "step": 2978 + }, + { + "epoch": 0.07649236665109986, + "grad_norm": 1.0703125, + "learning_rate": 0.00019812812171573696, + "loss": 1.2431, + "step": 2979 + }, + { + "epoch": 0.07651804384702168, + "grad_norm": 1.1015625, + "learning_rate": 0.00019812726190064623, + "loss": 1.1523, + "step": 2980 + }, + { + "epoch": 0.0765437210429435, + "grad_norm": 0.83203125, + "learning_rate": 0.00019812640188999675, + "loss": 1.1995, + "step": 2981 + }, + { + "epoch": 0.07656939823886533, + "grad_norm": 0.9140625, + "learning_rate": 0.00019812554168379022, + "loss": 1.118, + "step": 2982 + }, + { + "epoch": 0.07659507543478715, + "grad_norm": 0.90625, + "learning_rate": 0.00019812468128202837, + "loss": 1.0956, + "step": 2983 + }, + { + "epoch": 0.07662075263070896, + "grad_norm": 0.875, + "learning_rate": 0.00019812382068471284, + "loss": 1.1611, + "step": 2984 + }, + { + "epoch": 0.07664642982663078, + "grad_norm": 0.921875, + "learning_rate": 0.00019812295989184544, + "loss": 1.3343, + "step": 2985 + }, + { + "epoch": 0.0766721070225526, + "grad_norm": 0.9375, + "learning_rate": 0.00019812209890342783, + "loss": 1.3106, + "step": 2986 + }, + { + "epoch": 0.07669778421847442, + "grad_norm": 0.8515625, + "learning_rate": 0.00019812123771946173, + "loss": 1.2097, + "step": 2987 + }, + { + "epoch": 0.07672346141439625, + "grad_norm": 0.87890625, + "learning_rate": 0.00019812037633994888, + "loss": 1.1538, + "step": 2988 + }, + { + "epoch": 0.07674913861031805, + "grad_norm": 0.83203125, + "learning_rate": 0.000198119514764891, + "loss": 1.1401, + "step": 2989 + }, + { + "epoch": 0.07677481580623988, + "grad_norm": 0.96875, + "learning_rate": 0.00019811865299428977, + "loss": 1.2764, + "step": 2990 + }, + { + "epoch": 0.0768004930021617, + "grad_norm": 0.8515625, + "learning_rate": 0.00019811779102814692, + "loss": 1.1098, + "step": 2991 + }, + { + "epoch": 0.07682617019808352, + "grad_norm": 0.90234375, + "learning_rate": 0.0001981169288664642, + "loss": 1.2977, + "step": 2992 + }, + { + "epoch": 0.07685184739400534, + "grad_norm": 0.91015625, + "learning_rate": 0.0001981160665092433, + "loss": 1.0798, + "step": 2993 + }, + { + "epoch": 0.07687752458992715, + "grad_norm": 0.94921875, + "learning_rate": 0.00019811520395648593, + "loss": 1.3404, + "step": 2994 + }, + { + "epoch": 0.07690320178584897, + "grad_norm": 0.94140625, + "learning_rate": 0.0001981143412081938, + "loss": 1.3571, + "step": 2995 + }, + { + "epoch": 0.0769288789817708, + "grad_norm": 1.015625, + "learning_rate": 0.00019811347826436868, + "loss": 1.3444, + "step": 2996 + }, + { + "epoch": 0.07695455617769262, + "grad_norm": 1.359375, + "learning_rate": 0.00019811261512501225, + "loss": 1.2398, + "step": 2997 + }, + { + "epoch": 0.07698023337361444, + "grad_norm": 0.80078125, + "learning_rate": 0.00019811175179012625, + "loss": 1.1922, + "step": 2998 + }, + { + "epoch": 0.07700591056953625, + "grad_norm": 0.875, + "learning_rate": 0.00019811088825971238, + "loss": 1.2819, + "step": 2999 + }, + { + "epoch": 0.07703158776545807, + "grad_norm": 0.875, + "learning_rate": 0.00019811002453377235, + "loss": 1.1615, + "step": 3000 + }, + { + "epoch": 0.07703158776545807, + "eval_loss": 1.2214481830596924, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 401.4544, + "eval_samples_per_second": 24.909, + "eval_steps_per_second": 0.78, + "step": 3000 + }, + { + "epoch": 0.07705726496137989, + "grad_norm": 0.84765625, + "learning_rate": 0.00019810916061230796, + "loss": 1.1882, + "step": 3001 + }, + { + "epoch": 0.07708294215730171, + "grad_norm": 0.921875, + "learning_rate": 0.00019810829649532084, + "loss": 1.1589, + "step": 3002 + }, + { + "epoch": 0.07710861935322354, + "grad_norm": 0.8984375, + "learning_rate": 0.00019810743218281275, + "loss": 1.2782, + "step": 3003 + }, + { + "epoch": 0.07713429654914535, + "grad_norm": 0.8515625, + "learning_rate": 0.0001981065676747854, + "loss": 1.1591, + "step": 3004 + }, + { + "epoch": 0.07715997374506717, + "grad_norm": 0.828125, + "learning_rate": 0.00019810570297124056, + "loss": 1.2506, + "step": 3005 + }, + { + "epoch": 0.07718565094098899, + "grad_norm": 0.96484375, + "learning_rate": 0.0001981048380721799, + "loss": 1.324, + "step": 3006 + }, + { + "epoch": 0.07721132813691081, + "grad_norm": 0.9453125, + "learning_rate": 0.00019810397297760513, + "loss": 1.1495, + "step": 3007 + }, + { + "epoch": 0.07723700533283263, + "grad_norm": 0.8515625, + "learning_rate": 0.00019810310768751804, + "loss": 1.1966, + "step": 3008 + }, + { + "epoch": 0.07726268252875444, + "grad_norm": 0.82421875, + "learning_rate": 0.0001981022422019203, + "loss": 1.198, + "step": 3009 + }, + { + "epoch": 0.07728835972467626, + "grad_norm": 0.890625, + "learning_rate": 0.00019810137652081364, + "loss": 1.3172, + "step": 3010 + }, + { + "epoch": 0.07731403692059809, + "grad_norm": 0.921875, + "learning_rate": 0.00019810051064419982, + "loss": 1.0432, + "step": 3011 + }, + { + "epoch": 0.07733971411651991, + "grad_norm": 0.73828125, + "learning_rate": 0.00019809964457208055, + "loss": 1.1597, + "step": 3012 + }, + { + "epoch": 0.07736539131244173, + "grad_norm": 0.87109375, + "learning_rate": 0.00019809877830445754, + "loss": 0.8613, + "step": 3013 + }, + { + "epoch": 0.07739106850836354, + "grad_norm": 0.8125, + "learning_rate": 0.0001980979118413325, + "loss": 1.1473, + "step": 3014 + }, + { + "epoch": 0.07741674570428536, + "grad_norm": 0.91015625, + "learning_rate": 0.0001980970451827072, + "loss": 1.2458, + "step": 3015 + }, + { + "epoch": 0.07744242290020718, + "grad_norm": 0.8828125, + "learning_rate": 0.00019809617832858335, + "loss": 1.2877, + "step": 3016 + }, + { + "epoch": 0.077468100096129, + "grad_norm": 0.87890625, + "learning_rate": 0.0001980953112789627, + "loss": 1.2058, + "step": 3017 + }, + { + "epoch": 0.07749377729205083, + "grad_norm": 0.85546875, + "learning_rate": 0.00019809444403384694, + "loss": 1.1571, + "step": 3018 + }, + { + "epoch": 0.07751945448797264, + "grad_norm": 0.91796875, + "learning_rate": 0.00019809357659323783, + "loss": 1.1565, + "step": 3019 + }, + { + "epoch": 0.07754513168389446, + "grad_norm": 0.765625, + "learning_rate": 0.00019809270895713703, + "loss": 1.1104, + "step": 3020 + }, + { + "epoch": 0.07757080887981628, + "grad_norm": 0.85546875, + "learning_rate": 0.00019809184112554634, + "loss": 1.2429, + "step": 3021 + }, + { + "epoch": 0.0775964860757381, + "grad_norm": 1.0, + "learning_rate": 0.00019809097309846752, + "loss": 1.1766, + "step": 3022 + }, + { + "epoch": 0.07762216327165992, + "grad_norm": 0.83203125, + "learning_rate": 0.0001980901048759022, + "loss": 1.2028, + "step": 3023 + }, + { + "epoch": 0.07764784046758173, + "grad_norm": 0.90625, + "learning_rate": 0.0001980892364578522, + "loss": 1.2844, + "step": 3024 + }, + { + "epoch": 0.07767351766350356, + "grad_norm": 1.0, + "learning_rate": 0.00019808836784431918, + "loss": 1.1129, + "step": 3025 + }, + { + "epoch": 0.07769919485942538, + "grad_norm": 0.8984375, + "learning_rate": 0.00019808749903530492, + "loss": 1.0673, + "step": 3026 + }, + { + "epoch": 0.0777248720553472, + "grad_norm": 0.91015625, + "learning_rate": 0.00019808663003081112, + "loss": 1.0591, + "step": 3027 + }, + { + "epoch": 0.07775054925126902, + "grad_norm": 0.8828125, + "learning_rate": 0.00019808576083083957, + "loss": 1.2624, + "step": 3028 + }, + { + "epoch": 0.07777622644719083, + "grad_norm": 0.96484375, + "learning_rate": 0.0001980848914353919, + "loss": 1.3165, + "step": 3029 + }, + { + "epoch": 0.07780190364311265, + "grad_norm": 0.9921875, + "learning_rate": 0.00019808402184446993, + "loss": 1.2443, + "step": 3030 + }, + { + "epoch": 0.07782758083903447, + "grad_norm": 0.78125, + "learning_rate": 0.00019808315205807535, + "loss": 1.2104, + "step": 3031 + }, + { + "epoch": 0.0778532580349563, + "grad_norm": 0.9375, + "learning_rate": 0.00019808228207620992, + "loss": 1.2484, + "step": 3032 + }, + { + "epoch": 0.07787893523087812, + "grad_norm": 0.8671875, + "learning_rate": 0.00019808141189887538, + "loss": 1.1419, + "step": 3033 + }, + { + "epoch": 0.07790461242679993, + "grad_norm": 0.86328125, + "learning_rate": 0.00019808054152607341, + "loss": 1.221, + "step": 3034 + }, + { + "epoch": 0.07793028962272175, + "grad_norm": 0.953125, + "learning_rate": 0.0001980796709578058, + "loss": 1.4304, + "step": 3035 + }, + { + "epoch": 0.07795596681864357, + "grad_norm": 0.87109375, + "learning_rate": 0.00019807880019407427, + "loss": 1.1546, + "step": 3036 + }, + { + "epoch": 0.0779816440145654, + "grad_norm": 0.9609375, + "learning_rate": 0.00019807792923488054, + "loss": 1.1852, + "step": 3037 + }, + { + "epoch": 0.07800732121048722, + "grad_norm": 0.9921875, + "learning_rate": 0.00019807705808022635, + "loss": 1.1732, + "step": 3038 + }, + { + "epoch": 0.07803299840640902, + "grad_norm": 0.859375, + "learning_rate": 0.00019807618673011346, + "loss": 1.1273, + "step": 3039 + }, + { + "epoch": 0.07805867560233085, + "grad_norm": 1.3125, + "learning_rate": 0.0001980753151845436, + "loss": 1.3679, + "step": 3040 + }, + { + "epoch": 0.07808435279825267, + "grad_norm": 0.8359375, + "learning_rate": 0.00019807444344351844, + "loss": 1.1571, + "step": 3041 + }, + { + "epoch": 0.07811002999417449, + "grad_norm": 0.90234375, + "learning_rate": 0.00019807357150703983, + "loss": 1.2467, + "step": 3042 + }, + { + "epoch": 0.07813570719009631, + "grad_norm": 1.0, + "learning_rate": 0.0001980726993751094, + "loss": 1.2703, + "step": 3043 + }, + { + "epoch": 0.07816138438601812, + "grad_norm": 0.9296875, + "learning_rate": 0.000198071827047729, + "loss": 1.3975, + "step": 3044 + }, + { + "epoch": 0.07818706158193994, + "grad_norm": 0.8984375, + "learning_rate": 0.00019807095452490026, + "loss": 1.0815, + "step": 3045 + }, + { + "epoch": 0.07821273877786176, + "grad_norm": 0.9296875, + "learning_rate": 0.00019807008180662498, + "loss": 1.1755, + "step": 3046 + }, + { + "epoch": 0.07823841597378359, + "grad_norm": 0.91015625, + "learning_rate": 0.0001980692088929049, + "loss": 1.0729, + "step": 3047 + }, + { + "epoch": 0.0782640931697054, + "grad_norm": 0.89453125, + "learning_rate": 0.00019806833578374174, + "loss": 1.3352, + "step": 3048 + }, + { + "epoch": 0.07828977036562722, + "grad_norm": 0.953125, + "learning_rate": 0.00019806746247913722, + "loss": 1.4354, + "step": 3049 + }, + { + "epoch": 0.07831544756154904, + "grad_norm": 0.828125, + "learning_rate": 0.00019806658897909314, + "loss": 1.2832, + "step": 3050 + }, + { + "epoch": 0.07834112475747086, + "grad_norm": 1.25, + "learning_rate": 0.00019806571528361119, + "loss": 1.2254, + "step": 3051 + }, + { + "epoch": 0.07836680195339268, + "grad_norm": 0.86328125, + "learning_rate": 0.00019806484139269311, + "loss": 1.1258, + "step": 3052 + }, + { + "epoch": 0.07839247914931449, + "grad_norm": 0.9609375, + "learning_rate": 0.0001980639673063407, + "loss": 1.3175, + "step": 3053 + }, + { + "epoch": 0.07841815634523631, + "grad_norm": 0.8359375, + "learning_rate": 0.00019806309302455565, + "loss": 1.1119, + "step": 3054 + }, + { + "epoch": 0.07844383354115814, + "grad_norm": 0.87109375, + "learning_rate": 0.0001980622185473397, + "loss": 1.2497, + "step": 3055 + }, + { + "epoch": 0.07846951073707996, + "grad_norm": 0.98046875, + "learning_rate": 0.0001980613438746946, + "loss": 1.2851, + "step": 3056 + }, + { + "epoch": 0.07849518793300178, + "grad_norm": 0.95703125, + "learning_rate": 0.00019806046900662212, + "loss": 1.3508, + "step": 3057 + }, + { + "epoch": 0.07852086512892359, + "grad_norm": 0.94140625, + "learning_rate": 0.00019805959394312395, + "loss": 1.182, + "step": 3058 + }, + { + "epoch": 0.07854654232484541, + "grad_norm": 0.87109375, + "learning_rate": 0.0001980587186842019, + "loss": 1.1706, + "step": 3059 + }, + { + "epoch": 0.07857221952076723, + "grad_norm": 0.92578125, + "learning_rate": 0.00019805784322985765, + "loss": 1.3305, + "step": 3060 + }, + { + "epoch": 0.07859789671668906, + "grad_norm": 0.87109375, + "learning_rate": 0.000198056967580093, + "loss": 1.0204, + "step": 3061 + }, + { + "epoch": 0.07862357391261088, + "grad_norm": 0.875, + "learning_rate": 0.00019805609173490968, + "loss": 1.096, + "step": 3062 + }, + { + "epoch": 0.07864925110853269, + "grad_norm": 0.8671875, + "learning_rate": 0.0001980552156943094, + "loss": 1.0732, + "step": 3063 + }, + { + "epoch": 0.07867492830445451, + "grad_norm": 0.82421875, + "learning_rate": 0.00019805433945829396, + "loss": 1.2371, + "step": 3064 + }, + { + "epoch": 0.07870060550037633, + "grad_norm": 0.85546875, + "learning_rate": 0.00019805346302686507, + "loss": 1.0855, + "step": 3065 + }, + { + "epoch": 0.07872628269629815, + "grad_norm": 0.85546875, + "learning_rate": 0.00019805258640002445, + "loss": 1.1087, + "step": 3066 + }, + { + "epoch": 0.07875195989221997, + "grad_norm": 0.8671875, + "learning_rate": 0.00019805170957777396, + "loss": 1.2245, + "step": 3067 + }, + { + "epoch": 0.07877763708814178, + "grad_norm": 0.8984375, + "learning_rate": 0.00019805083256011519, + "loss": 1.1586, + "step": 3068 + }, + { + "epoch": 0.0788033142840636, + "grad_norm": 0.98828125, + "learning_rate": 0.00019804995534705003, + "loss": 1.2733, + "step": 3069 + }, + { + "epoch": 0.07882899147998543, + "grad_norm": 0.85546875, + "learning_rate": 0.0001980490779385801, + "loss": 1.1908, + "step": 3070 + }, + { + "epoch": 0.07885466867590725, + "grad_norm": 0.80859375, + "learning_rate": 0.00019804820033470725, + "loss": 1.2225, + "step": 3071 + }, + { + "epoch": 0.07888034587182907, + "grad_norm": 0.88671875, + "learning_rate": 0.0001980473225354332, + "loss": 1.2142, + "step": 3072 + }, + { + "epoch": 0.07890602306775088, + "grad_norm": 0.88671875, + "learning_rate": 0.00019804644454075968, + "loss": 1.1509, + "step": 3073 + }, + { + "epoch": 0.0789317002636727, + "grad_norm": 0.90234375, + "learning_rate": 0.00019804556635068848, + "loss": 1.2113, + "step": 3074 + }, + { + "epoch": 0.07895737745959452, + "grad_norm": 0.88671875, + "learning_rate": 0.00019804468796522128, + "loss": 1.1386, + "step": 3075 + }, + { + "epoch": 0.07898305465551635, + "grad_norm": 0.9140625, + "learning_rate": 0.00019804380938435992, + "loss": 1.2329, + "step": 3076 + }, + { + "epoch": 0.07900873185143817, + "grad_norm": 0.8828125, + "learning_rate": 0.00019804293060810606, + "loss": 1.381, + "step": 3077 + }, + { + "epoch": 0.07903440904735998, + "grad_norm": 0.8359375, + "learning_rate": 0.00019804205163646152, + "loss": 1.247, + "step": 3078 + }, + { + "epoch": 0.0790600862432818, + "grad_norm": 0.91015625, + "learning_rate": 0.00019804117246942802, + "loss": 1.3649, + "step": 3079 + }, + { + "epoch": 0.07908576343920362, + "grad_norm": 0.8671875, + "learning_rate": 0.00019804029310700733, + "loss": 1.2954, + "step": 3080 + }, + { + "epoch": 0.07911144063512544, + "grad_norm": 0.796875, + "learning_rate": 0.00019803941354920116, + "loss": 1.1696, + "step": 3081 + }, + { + "epoch": 0.07913711783104727, + "grad_norm": 0.82421875, + "learning_rate": 0.00019803853379601134, + "loss": 1.1372, + "step": 3082 + }, + { + "epoch": 0.07916279502696907, + "grad_norm": 0.94921875, + "learning_rate": 0.00019803765384743954, + "loss": 1.3965, + "step": 3083 + }, + { + "epoch": 0.0791884722228909, + "grad_norm": 0.8984375, + "learning_rate": 0.00019803677370348758, + "loss": 1.2579, + "step": 3084 + }, + { + "epoch": 0.07921414941881272, + "grad_norm": 0.92578125, + "learning_rate": 0.00019803589336415714, + "loss": 1.2296, + "step": 3085 + }, + { + "epoch": 0.07923982661473454, + "grad_norm": 0.87890625, + "learning_rate": 0.00019803501282945006, + "loss": 1.1712, + "step": 3086 + }, + { + "epoch": 0.07926550381065636, + "grad_norm": 0.875, + "learning_rate": 0.00019803413209936807, + "loss": 1.3424, + "step": 3087 + }, + { + "epoch": 0.07929118100657817, + "grad_norm": 0.94140625, + "learning_rate": 0.00019803325117391288, + "loss": 1.1522, + "step": 3088 + }, + { + "epoch": 0.07931685820249999, + "grad_norm": 0.84375, + "learning_rate": 0.00019803237005308627, + "loss": 1.1266, + "step": 3089 + }, + { + "epoch": 0.07934253539842182, + "grad_norm": 0.8984375, + "learning_rate": 0.00019803148873689, + "loss": 1.394, + "step": 3090 + }, + { + "epoch": 0.07936821259434364, + "grad_norm": 0.94140625, + "learning_rate": 0.00019803060722532588, + "loss": 1.2796, + "step": 3091 + }, + { + "epoch": 0.07939388979026546, + "grad_norm": 0.88671875, + "learning_rate": 0.00019802972551839556, + "loss": 1.2322, + "step": 3092 + }, + { + "epoch": 0.07941956698618727, + "grad_norm": 0.921875, + "learning_rate": 0.00019802884361610088, + "loss": 1.2058, + "step": 3093 + }, + { + "epoch": 0.07944524418210909, + "grad_norm": 0.82421875, + "learning_rate": 0.00019802796151844357, + "loss": 1.214, + "step": 3094 + }, + { + "epoch": 0.07947092137803091, + "grad_norm": 0.8671875, + "learning_rate": 0.00019802707922542535, + "loss": 1.1646, + "step": 3095 + }, + { + "epoch": 0.07949659857395273, + "grad_norm": 0.94140625, + "learning_rate": 0.00019802619673704806, + "loss": 1.2468, + "step": 3096 + }, + { + "epoch": 0.07952227576987456, + "grad_norm": 0.84765625, + "learning_rate": 0.0001980253140533134, + "loss": 1.1527, + "step": 3097 + }, + { + "epoch": 0.07954795296579636, + "grad_norm": 0.92578125, + "learning_rate": 0.00019802443117422313, + "loss": 1.2369, + "step": 3098 + }, + { + "epoch": 0.07957363016171819, + "grad_norm": 0.87890625, + "learning_rate": 0.00019802354809977905, + "loss": 1.2662, + "step": 3099 + }, + { + "epoch": 0.07959930735764001, + "grad_norm": 1.03125, + "learning_rate": 0.00019802266482998285, + "loss": 1.1835, + "step": 3100 + }, + { + "epoch": 0.07962498455356183, + "grad_norm": 0.90625, + "learning_rate": 0.00019802178136483638, + "loss": 1.2441, + "step": 3101 + }, + { + "epoch": 0.07965066174948365, + "grad_norm": 0.859375, + "learning_rate": 0.00019802089770434134, + "loss": 1.2645, + "step": 3102 + }, + { + "epoch": 0.07967633894540546, + "grad_norm": 0.83203125, + "learning_rate": 0.0001980200138484995, + "loss": 1.103, + "step": 3103 + }, + { + "epoch": 0.07970201614132728, + "grad_norm": 0.86328125, + "learning_rate": 0.00019801912979731262, + "loss": 1.2253, + "step": 3104 + }, + { + "epoch": 0.0797276933372491, + "grad_norm": 0.8984375, + "learning_rate": 0.00019801824555078248, + "loss": 1.1634, + "step": 3105 + }, + { + "epoch": 0.07975337053317093, + "grad_norm": 0.83984375, + "learning_rate": 0.0001980173611089108, + "loss": 1.31, + "step": 3106 + }, + { + "epoch": 0.07977904772909275, + "grad_norm": 0.8515625, + "learning_rate": 0.0001980164764716994, + "loss": 1.213, + "step": 3107 + }, + { + "epoch": 0.07980472492501456, + "grad_norm": 0.90625, + "learning_rate": 0.00019801559163915003, + "loss": 1.207, + "step": 3108 + }, + { + "epoch": 0.07983040212093638, + "grad_norm": 0.87890625, + "learning_rate": 0.00019801470661126442, + "loss": 1.2007, + "step": 3109 + }, + { + "epoch": 0.0798560793168582, + "grad_norm": 0.98046875, + "learning_rate": 0.00019801382138804436, + "loss": 1.2503, + "step": 3110 + }, + { + "epoch": 0.07988175651278002, + "grad_norm": 0.90234375, + "learning_rate": 0.0001980129359694916, + "loss": 1.1615, + "step": 3111 + }, + { + "epoch": 0.07990743370870185, + "grad_norm": 0.85546875, + "learning_rate": 0.00019801205035560794, + "loss": 1.1175, + "step": 3112 + }, + { + "epoch": 0.07993311090462366, + "grad_norm": 0.79296875, + "learning_rate": 0.0001980111645463951, + "loss": 1.1527, + "step": 3113 + }, + { + "epoch": 0.07995878810054548, + "grad_norm": 0.8359375, + "learning_rate": 0.00019801027854185482, + "loss": 1.0649, + "step": 3114 + }, + { + "epoch": 0.0799844652964673, + "grad_norm": 0.8203125, + "learning_rate": 0.00019800939234198897, + "loss": 1.3412, + "step": 3115 + }, + { + "epoch": 0.08001014249238912, + "grad_norm": 0.84375, + "learning_rate": 0.00019800850594679922, + "loss": 1.2746, + "step": 3116 + }, + { + "epoch": 0.08003581968831094, + "grad_norm": 0.98046875, + "learning_rate": 0.00019800761935628738, + "loss": 1.2061, + "step": 3117 + }, + { + "epoch": 0.08006149688423275, + "grad_norm": 0.83203125, + "learning_rate": 0.0001980067325704552, + "loss": 1.2752, + "step": 3118 + }, + { + "epoch": 0.08008717408015457, + "grad_norm": 0.90625, + "learning_rate": 0.00019800584558930445, + "loss": 1.3231, + "step": 3119 + }, + { + "epoch": 0.0801128512760764, + "grad_norm": 0.83984375, + "learning_rate": 0.00019800495841283694, + "loss": 1.1057, + "step": 3120 + }, + { + "epoch": 0.08013852847199822, + "grad_norm": 0.8671875, + "learning_rate": 0.00019800407104105436, + "loss": 1.1631, + "step": 3121 + }, + { + "epoch": 0.08016420566792004, + "grad_norm": 0.94140625, + "learning_rate": 0.00019800318347395855, + "loss": 1.1801, + "step": 3122 + }, + { + "epoch": 0.08018988286384185, + "grad_norm": 1.0, + "learning_rate": 0.0001980022957115512, + "loss": 1.2896, + "step": 3123 + }, + { + "epoch": 0.08021556005976367, + "grad_norm": 0.90625, + "learning_rate": 0.00019800140775383417, + "loss": 1.2108, + "step": 3124 + }, + { + "epoch": 0.0802412372556855, + "grad_norm": 0.83984375, + "learning_rate": 0.0001980005196008092, + "loss": 1.2214, + "step": 3125 + }, + { + "epoch": 0.08026691445160732, + "grad_norm": 1.0234375, + "learning_rate": 0.00019799963125247802, + "loss": 1.1521, + "step": 3126 + }, + { + "epoch": 0.08029259164752914, + "grad_norm": 0.88671875, + "learning_rate": 0.0001979987427088424, + "loss": 1.1673, + "step": 3127 + }, + { + "epoch": 0.08031826884345095, + "grad_norm": 0.9140625, + "learning_rate": 0.0001979978539699042, + "loss": 1.2036, + "step": 3128 + }, + { + "epoch": 0.08034394603937277, + "grad_norm": 0.83984375, + "learning_rate": 0.0001979969650356651, + "loss": 1.1122, + "step": 3129 + }, + { + "epoch": 0.08036962323529459, + "grad_norm": 0.80859375, + "learning_rate": 0.00019799607590612688, + "loss": 1.0159, + "step": 3130 + }, + { + "epoch": 0.08039530043121641, + "grad_norm": 0.83203125, + "learning_rate": 0.00019799518658129137, + "loss": 1.1261, + "step": 3131 + }, + { + "epoch": 0.08042097762713823, + "grad_norm": 0.8828125, + "learning_rate": 0.00019799429706116026, + "loss": 1.2971, + "step": 3132 + }, + { + "epoch": 0.08044665482306004, + "grad_norm": 0.79296875, + "learning_rate": 0.00019799340734573542, + "loss": 1.2817, + "step": 3133 + }, + { + "epoch": 0.08047233201898187, + "grad_norm": 0.8828125, + "learning_rate": 0.00019799251743501852, + "loss": 1.3342, + "step": 3134 + }, + { + "epoch": 0.08049800921490369, + "grad_norm": 0.83203125, + "learning_rate": 0.00019799162732901142, + "loss": 1.2005, + "step": 3135 + }, + { + "epoch": 0.08052368641082551, + "grad_norm": 0.953125, + "learning_rate": 0.00019799073702771584, + "loss": 1.2515, + "step": 3136 + }, + { + "epoch": 0.08054936360674733, + "grad_norm": 0.8828125, + "learning_rate": 0.00019798984653113358, + "loss": 1.1952, + "step": 3137 + }, + { + "epoch": 0.08057504080266914, + "grad_norm": 0.9140625, + "learning_rate": 0.00019798895583926642, + "loss": 1.0856, + "step": 3138 + }, + { + "epoch": 0.08060071799859096, + "grad_norm": 0.84375, + "learning_rate": 0.0001979880649521161, + "loss": 1.2187, + "step": 3139 + }, + { + "epoch": 0.08062639519451278, + "grad_norm": 0.91796875, + "learning_rate": 0.00019798717386968444, + "loss": 1.1139, + "step": 3140 + }, + { + "epoch": 0.0806520723904346, + "grad_norm": 0.8359375, + "learning_rate": 0.00019798628259197316, + "loss": 1.1245, + "step": 3141 + }, + { + "epoch": 0.08067774958635643, + "grad_norm": 0.83203125, + "learning_rate": 0.0001979853911189841, + "loss": 1.1896, + "step": 3142 + }, + { + "epoch": 0.08070342678227824, + "grad_norm": 0.8203125, + "learning_rate": 0.000197984499450719, + "loss": 1.233, + "step": 3143 + }, + { + "epoch": 0.08072910397820006, + "grad_norm": 0.8828125, + "learning_rate": 0.00019798360758717962, + "loss": 1.1989, + "step": 3144 + }, + { + "epoch": 0.08075478117412188, + "grad_norm": 0.8203125, + "learning_rate": 0.0001979827155283678, + "loss": 1.1448, + "step": 3145 + }, + { + "epoch": 0.0807804583700437, + "grad_norm": 0.92578125, + "learning_rate": 0.00019798182327428524, + "loss": 1.166, + "step": 3146 + }, + { + "epoch": 0.08080613556596553, + "grad_norm": 0.85546875, + "learning_rate": 0.00019798093082493379, + "loss": 1.224, + "step": 3147 + }, + { + "epoch": 0.08083181276188733, + "grad_norm": 0.85546875, + "learning_rate": 0.0001979800381803152, + "loss": 1.3312, + "step": 3148 + }, + { + "epoch": 0.08085748995780916, + "grad_norm": 0.8828125, + "learning_rate": 0.00019797914534043121, + "loss": 1.147, + "step": 3149 + }, + { + "epoch": 0.08088316715373098, + "grad_norm": 0.875, + "learning_rate": 0.00019797825230528365, + "loss": 1.2704, + "step": 3150 + }, + { + "epoch": 0.0809088443496528, + "grad_norm": 0.7734375, + "learning_rate": 0.00019797735907487428, + "loss": 1.0012, + "step": 3151 + }, + { + "epoch": 0.08093452154557461, + "grad_norm": 0.9609375, + "learning_rate": 0.0001979764656492049, + "loss": 1.1913, + "step": 3152 + }, + { + "epoch": 0.08096019874149643, + "grad_norm": 0.890625, + "learning_rate": 0.00019797557202827726, + "loss": 1.2702, + "step": 3153 + }, + { + "epoch": 0.08098587593741825, + "grad_norm": 0.8515625, + "learning_rate": 0.00019797467821209313, + "loss": 1.0569, + "step": 3154 + }, + { + "epoch": 0.08101155313334008, + "grad_norm": 0.9921875, + "learning_rate": 0.00019797378420065436, + "loss": 1.1963, + "step": 3155 + }, + { + "epoch": 0.0810372303292619, + "grad_norm": 0.84375, + "learning_rate": 0.00019797288999396265, + "loss": 1.1176, + "step": 3156 + }, + { + "epoch": 0.0810629075251837, + "grad_norm": 0.85546875, + "learning_rate": 0.00019797199559201984, + "loss": 1.1668, + "step": 3157 + }, + { + "epoch": 0.08108858472110553, + "grad_norm": 0.90625, + "learning_rate": 0.00019797110099482773, + "loss": 1.272, + "step": 3158 + }, + { + "epoch": 0.08111426191702735, + "grad_norm": 0.91015625, + "learning_rate": 0.00019797020620238799, + "loss": 1.0964, + "step": 3159 + }, + { + "epoch": 0.08113993911294917, + "grad_norm": 0.87109375, + "learning_rate": 0.00019796931121470252, + "loss": 1.1456, + "step": 3160 + }, + { + "epoch": 0.081165616308871, + "grad_norm": 0.97265625, + "learning_rate": 0.00019796841603177308, + "loss": 1.3371, + "step": 3161 + }, + { + "epoch": 0.0811912935047928, + "grad_norm": 0.8203125, + "learning_rate": 0.0001979675206536014, + "loss": 1.121, + "step": 3162 + }, + { + "epoch": 0.08121697070071462, + "grad_norm": 0.9453125, + "learning_rate": 0.00019796662508018932, + "loss": 1.32, + "step": 3163 + }, + { + "epoch": 0.08124264789663645, + "grad_norm": 0.8515625, + "learning_rate": 0.00019796572931153863, + "loss": 1.2187, + "step": 3164 + }, + { + "epoch": 0.08126832509255827, + "grad_norm": 0.85546875, + "learning_rate": 0.00019796483334765105, + "loss": 1.2247, + "step": 3165 + }, + { + "epoch": 0.08129400228848009, + "grad_norm": 0.88671875, + "learning_rate": 0.00019796393718852842, + "loss": 1.0933, + "step": 3166 + }, + { + "epoch": 0.0813196794844019, + "grad_norm": 0.859375, + "learning_rate": 0.00019796304083417253, + "loss": 1.4081, + "step": 3167 + }, + { + "epoch": 0.08134535668032372, + "grad_norm": 0.9453125, + "learning_rate": 0.0001979621442845851, + "loss": 1.183, + "step": 3168 + }, + { + "epoch": 0.08137103387624554, + "grad_norm": 0.9296875, + "learning_rate": 0.00019796124753976798, + "loss": 1.2247, + "step": 3169 + }, + { + "epoch": 0.08139671107216737, + "grad_norm": 1.15625, + "learning_rate": 0.00019796035059972298, + "loss": 1.1926, + "step": 3170 + }, + { + "epoch": 0.08142238826808919, + "grad_norm": 0.90625, + "learning_rate": 0.00019795945346445183, + "loss": 1.2139, + "step": 3171 + }, + { + "epoch": 0.081448065464011, + "grad_norm": 0.8125, + "learning_rate": 0.00019795855613395633, + "loss": 1.2054, + "step": 3172 + }, + { + "epoch": 0.08147374265993282, + "grad_norm": 0.83984375, + "learning_rate": 0.0001979576586082383, + "loss": 1.076, + "step": 3173 + }, + { + "epoch": 0.08149941985585464, + "grad_norm": 0.84765625, + "learning_rate": 0.00019795676088729946, + "loss": 1.2338, + "step": 3174 + }, + { + "epoch": 0.08152509705177646, + "grad_norm": 0.9140625, + "learning_rate": 0.0001979558629711417, + "loss": 1.1816, + "step": 3175 + }, + { + "epoch": 0.08155077424769828, + "grad_norm": 0.87890625, + "learning_rate": 0.00019795496485976673, + "loss": 1.2586, + "step": 3176 + }, + { + "epoch": 0.0815764514436201, + "grad_norm": 0.90625, + "learning_rate": 0.00019795406655317632, + "loss": 1.1768, + "step": 3177 + }, + { + "epoch": 0.08160212863954192, + "grad_norm": 0.91015625, + "learning_rate": 0.00019795316805137236, + "loss": 1.2037, + "step": 3178 + }, + { + "epoch": 0.08162780583546374, + "grad_norm": 0.91015625, + "learning_rate": 0.00019795226935435655, + "loss": 1.0858, + "step": 3179 + }, + { + "epoch": 0.08165348303138556, + "grad_norm": 0.875, + "learning_rate": 0.00019795137046213076, + "loss": 1.1768, + "step": 3180 + }, + { + "epoch": 0.08167916022730738, + "grad_norm": 0.83203125, + "learning_rate": 0.0001979504713746967, + "loss": 1.3245, + "step": 3181 + }, + { + "epoch": 0.08170483742322919, + "grad_norm": 0.98046875, + "learning_rate": 0.0001979495720920562, + "loss": 1.2267, + "step": 3182 + }, + { + "epoch": 0.08173051461915101, + "grad_norm": 0.8359375, + "learning_rate": 0.00019794867261421106, + "loss": 1.1539, + "step": 3183 + }, + { + "epoch": 0.08175619181507283, + "grad_norm": 0.88671875, + "learning_rate": 0.00019794777294116305, + "loss": 1.2724, + "step": 3184 + }, + { + "epoch": 0.08178186901099466, + "grad_norm": 0.859375, + "learning_rate": 0.00019794687307291396, + "loss": 1.2629, + "step": 3185 + }, + { + "epoch": 0.08180754620691648, + "grad_norm": 0.86328125, + "learning_rate": 0.00019794597300946566, + "loss": 1.1838, + "step": 3186 + }, + { + "epoch": 0.08183322340283829, + "grad_norm": 0.8671875, + "learning_rate": 0.00019794507275081984, + "loss": 1.1786, + "step": 3187 + }, + { + "epoch": 0.08185890059876011, + "grad_norm": 0.859375, + "learning_rate": 0.00019794417229697833, + "loss": 1.2303, + "step": 3188 + }, + { + "epoch": 0.08188457779468193, + "grad_norm": 0.87890625, + "learning_rate": 0.00019794327164794298, + "loss": 1.1794, + "step": 3189 + }, + { + "epoch": 0.08191025499060375, + "grad_norm": 0.84375, + "learning_rate": 0.0001979423708037155, + "loss": 1.0404, + "step": 3190 + }, + { + "epoch": 0.08193593218652558, + "grad_norm": 0.86328125, + "learning_rate": 0.0001979414697642977, + "loss": 1.1206, + "step": 3191 + }, + { + "epoch": 0.08196160938244738, + "grad_norm": 0.8125, + "learning_rate": 0.00019794056852969142, + "loss": 1.1658, + "step": 3192 + }, + { + "epoch": 0.0819872865783692, + "grad_norm": 0.8125, + "learning_rate": 0.00019793966709989844, + "loss": 1.2676, + "step": 3193 + }, + { + "epoch": 0.08201296377429103, + "grad_norm": 0.81640625, + "learning_rate": 0.00019793876547492055, + "loss": 1.1234, + "step": 3194 + }, + { + "epoch": 0.08203864097021285, + "grad_norm": 0.8515625, + "learning_rate": 0.00019793786365475955, + "loss": 1.2049, + "step": 3195 + }, + { + "epoch": 0.08206431816613467, + "grad_norm": 0.875, + "learning_rate": 0.00019793696163941725, + "loss": 1.1663, + "step": 3196 + }, + { + "epoch": 0.08208999536205648, + "grad_norm": 0.86328125, + "learning_rate": 0.00019793605942889538, + "loss": 1.236, + "step": 3197 + }, + { + "epoch": 0.0821156725579783, + "grad_norm": 0.890625, + "learning_rate": 0.00019793515702319585, + "loss": 1.1726, + "step": 3198 + }, + { + "epoch": 0.08214134975390013, + "grad_norm": 0.94921875, + "learning_rate": 0.00019793425442232035, + "loss": 1.3263, + "step": 3199 + }, + { + "epoch": 0.08216702694982195, + "grad_norm": 0.87890625, + "learning_rate": 0.00019793335162627073, + "loss": 1.3517, + "step": 3200 + }, + { + "epoch": 0.08219270414574377, + "grad_norm": 0.8203125, + "learning_rate": 0.00019793244863504882, + "loss": 1.2659, + "step": 3201 + }, + { + "epoch": 0.08221838134166558, + "grad_norm": 0.8828125, + "learning_rate": 0.0001979315454486564, + "loss": 1.3489, + "step": 3202 + }, + { + "epoch": 0.0822440585375874, + "grad_norm": 0.9296875, + "learning_rate": 0.0001979306420670952, + "loss": 1.158, + "step": 3203 + }, + { + "epoch": 0.08226973573350922, + "grad_norm": 0.87890625, + "learning_rate": 0.00019792973849036712, + "loss": 1.1653, + "step": 3204 + }, + { + "epoch": 0.08229541292943104, + "grad_norm": 0.9921875, + "learning_rate": 0.00019792883471847389, + "loss": 1.2814, + "step": 3205 + }, + { + "epoch": 0.08232109012535287, + "grad_norm": 0.859375, + "learning_rate": 0.00019792793075141732, + "loss": 1.2513, + "step": 3206 + }, + { + "epoch": 0.08234676732127467, + "grad_norm": 0.86328125, + "learning_rate": 0.00019792702658919927, + "loss": 1.0929, + "step": 3207 + }, + { + "epoch": 0.0823724445171965, + "grad_norm": 0.89453125, + "learning_rate": 0.0001979261222318215, + "loss": 1.2315, + "step": 3208 + }, + { + "epoch": 0.08239812171311832, + "grad_norm": 0.83984375, + "learning_rate": 0.0001979252176792858, + "loss": 1.1424, + "step": 3209 + }, + { + "epoch": 0.08242379890904014, + "grad_norm": 0.87109375, + "learning_rate": 0.00019792431293159398, + "loss": 1.2429, + "step": 3210 + }, + { + "epoch": 0.08244947610496196, + "grad_norm": 0.8125, + "learning_rate": 0.00019792340798874787, + "loss": 1.2138, + "step": 3211 + }, + { + "epoch": 0.08247515330088377, + "grad_norm": 0.87109375, + "learning_rate": 0.00019792250285074925, + "loss": 1.162, + "step": 3212 + }, + { + "epoch": 0.0825008304968056, + "grad_norm": 1.015625, + "learning_rate": 0.00019792159751759992, + "loss": 1.1434, + "step": 3213 + }, + { + "epoch": 0.08252650769272742, + "grad_norm": 0.9296875, + "learning_rate": 0.0001979206919893017, + "loss": 1.2324, + "step": 3214 + }, + { + "epoch": 0.08255218488864924, + "grad_norm": 0.82421875, + "learning_rate": 0.00019791978626585636, + "loss": 1.1201, + "step": 3215 + }, + { + "epoch": 0.08257786208457106, + "grad_norm": 0.91796875, + "learning_rate": 0.00019791888034726576, + "loss": 1.2766, + "step": 3216 + }, + { + "epoch": 0.08260353928049287, + "grad_norm": 0.890625, + "learning_rate": 0.00019791797423353167, + "loss": 1.1285, + "step": 3217 + }, + { + "epoch": 0.08262921647641469, + "grad_norm": 0.875, + "learning_rate": 0.0001979170679246559, + "loss": 0.9866, + "step": 3218 + }, + { + "epoch": 0.08265489367233651, + "grad_norm": 0.95703125, + "learning_rate": 0.00019791616142064022, + "loss": 1.2461, + "step": 3219 + }, + { + "epoch": 0.08268057086825834, + "grad_norm": 1.1796875, + "learning_rate": 0.00019791525472148653, + "loss": 1.1953, + "step": 3220 + }, + { + "epoch": 0.08270624806418016, + "grad_norm": 0.890625, + "learning_rate": 0.00019791434782719659, + "loss": 1.3099, + "step": 3221 + }, + { + "epoch": 0.08273192526010197, + "grad_norm": 0.8671875, + "learning_rate": 0.00019791344073777213, + "loss": 1.1243, + "step": 3222 + }, + { + "epoch": 0.08275760245602379, + "grad_norm": 0.85546875, + "learning_rate": 0.00019791253345321507, + "loss": 1.318, + "step": 3223 + }, + { + "epoch": 0.08278327965194561, + "grad_norm": 0.83203125, + "learning_rate": 0.0001979116259735272, + "loss": 1.0865, + "step": 3224 + }, + { + "epoch": 0.08280895684786743, + "grad_norm": 0.78125, + "learning_rate": 0.00019791071829871026, + "loss": 1.1332, + "step": 3225 + }, + { + "epoch": 0.08283463404378925, + "grad_norm": 0.91796875, + "learning_rate": 0.00019790981042876613, + "loss": 1.2503, + "step": 3226 + }, + { + "epoch": 0.08286031123971106, + "grad_norm": 1.0390625, + "learning_rate": 0.0001979089023636966, + "loss": 1.3376, + "step": 3227 + }, + { + "epoch": 0.08288598843563288, + "grad_norm": 0.9296875, + "learning_rate": 0.00019790799410350344, + "loss": 1.2412, + "step": 3228 + }, + { + "epoch": 0.0829116656315547, + "grad_norm": 0.921875, + "learning_rate": 0.0001979070856481885, + "loss": 1.2392, + "step": 3229 + }, + { + "epoch": 0.08293734282747653, + "grad_norm": 0.84765625, + "learning_rate": 0.00019790617699775358, + "loss": 1.0747, + "step": 3230 + }, + { + "epoch": 0.08296302002339835, + "grad_norm": 0.91796875, + "learning_rate": 0.0001979052681522005, + "loss": 1.1245, + "step": 3231 + }, + { + "epoch": 0.08298869721932016, + "grad_norm": 0.984375, + "learning_rate": 0.00019790435911153106, + "loss": 1.2486, + "step": 3232 + }, + { + "epoch": 0.08301437441524198, + "grad_norm": 0.8515625, + "learning_rate": 0.0001979034498757471, + "loss": 1.0596, + "step": 3233 + }, + { + "epoch": 0.0830400516111638, + "grad_norm": 0.80078125, + "learning_rate": 0.00019790254044485037, + "loss": 1.1315, + "step": 3234 + }, + { + "epoch": 0.08306572880708563, + "grad_norm": 0.90625, + "learning_rate": 0.00019790163081884275, + "loss": 1.2317, + "step": 3235 + }, + { + "epoch": 0.08309140600300745, + "grad_norm": 0.8359375, + "learning_rate": 0.00019790072099772603, + "loss": 1.2138, + "step": 3236 + }, + { + "epoch": 0.08311708319892926, + "grad_norm": 0.94140625, + "learning_rate": 0.00019789981098150197, + "loss": 1.264, + "step": 3237 + }, + { + "epoch": 0.08314276039485108, + "grad_norm": 0.890625, + "learning_rate": 0.0001978989007701725, + "loss": 1.124, + "step": 3238 + }, + { + "epoch": 0.0831684375907729, + "grad_norm": 0.91796875, + "learning_rate": 0.0001978979903637393, + "loss": 1.1994, + "step": 3239 + }, + { + "epoch": 0.08319411478669472, + "grad_norm": 1.4140625, + "learning_rate": 0.00019789707976220427, + "loss": 1.3064, + "step": 3240 + }, + { + "epoch": 0.08321979198261654, + "grad_norm": 0.96875, + "learning_rate": 0.00019789616896556921, + "loss": 1.2342, + "step": 3241 + }, + { + "epoch": 0.08324546917853835, + "grad_norm": 0.80078125, + "learning_rate": 0.0001978952579738359, + "loss": 1.0331, + "step": 3242 + }, + { + "epoch": 0.08327114637446018, + "grad_norm": 1.078125, + "learning_rate": 0.00019789434678700623, + "loss": 1.4082, + "step": 3243 + }, + { + "epoch": 0.083296823570382, + "grad_norm": 1.8125, + "learning_rate": 0.00019789343540508193, + "loss": 1.31, + "step": 3244 + }, + { + "epoch": 0.08332250076630382, + "grad_norm": 0.83984375, + "learning_rate": 0.0001978925238280649, + "loss": 1.1408, + "step": 3245 + }, + { + "epoch": 0.08334817796222564, + "grad_norm": 0.8125, + "learning_rate": 0.00019789161205595687, + "loss": 1.2393, + "step": 3246 + }, + { + "epoch": 0.08337385515814745, + "grad_norm": 0.86328125, + "learning_rate": 0.00019789070008875971, + "loss": 1.181, + "step": 3247 + }, + { + "epoch": 0.08339953235406927, + "grad_norm": 0.9609375, + "learning_rate": 0.00019788978792647523, + "loss": 1.2992, + "step": 3248 + }, + { + "epoch": 0.0834252095499911, + "grad_norm": 0.95703125, + "learning_rate": 0.00019788887556910524, + "loss": 1.356, + "step": 3249 + }, + { + "epoch": 0.08345088674591292, + "grad_norm": 1.0625, + "learning_rate": 0.00019788796301665158, + "loss": 1.1172, + "step": 3250 + }, + { + "epoch": 0.08347656394183474, + "grad_norm": 2.1875, + "learning_rate": 0.00019788705026911604, + "loss": 1.448, + "step": 3251 + }, + { + "epoch": 0.08350224113775655, + "grad_norm": 0.96484375, + "learning_rate": 0.00019788613732650044, + "loss": 1.1733, + "step": 3252 + }, + { + "epoch": 0.08352791833367837, + "grad_norm": 0.921875, + "learning_rate": 0.00019788522418880665, + "loss": 1.1427, + "step": 3253 + }, + { + "epoch": 0.08355359552960019, + "grad_norm": 0.8671875, + "learning_rate": 0.00019788431085603642, + "loss": 1.2393, + "step": 3254 + }, + { + "epoch": 0.08357927272552201, + "grad_norm": 0.9375, + "learning_rate": 0.00019788339732819158, + "loss": 1.1973, + "step": 3255 + }, + { + "epoch": 0.08360494992144382, + "grad_norm": 1.5390625, + "learning_rate": 0.00019788248360527402, + "loss": 1.2135, + "step": 3256 + }, + { + "epoch": 0.08363062711736564, + "grad_norm": 1.21875, + "learning_rate": 0.00019788156968728545, + "loss": 1.3322, + "step": 3257 + }, + { + "epoch": 0.08365630431328747, + "grad_norm": 1.1484375, + "learning_rate": 0.00019788065557422782, + "loss": 1.3762, + "step": 3258 + }, + { + "epoch": 0.08368198150920929, + "grad_norm": 0.95703125, + "learning_rate": 0.00019787974126610284, + "loss": 1.1702, + "step": 3259 + }, + { + "epoch": 0.08370765870513111, + "grad_norm": 0.83203125, + "learning_rate": 0.0001978788267629124, + "loss": 1.1563, + "step": 3260 + }, + { + "epoch": 0.08373333590105292, + "grad_norm": 0.8984375, + "learning_rate": 0.00019787791206465828, + "loss": 1.2337, + "step": 3261 + }, + { + "epoch": 0.08375901309697474, + "grad_norm": 0.98046875, + "learning_rate": 0.00019787699717134234, + "loss": 1.2594, + "step": 3262 + }, + { + "epoch": 0.08378469029289656, + "grad_norm": 0.7734375, + "learning_rate": 0.0001978760820829664, + "loss": 1.1183, + "step": 3263 + }, + { + "epoch": 0.08381036748881839, + "grad_norm": 0.9140625, + "learning_rate": 0.00019787516679953225, + "loss": 1.1413, + "step": 3264 + }, + { + "epoch": 0.08383604468474021, + "grad_norm": 0.83203125, + "learning_rate": 0.0001978742513210417, + "loss": 0.9059, + "step": 3265 + }, + { + "epoch": 0.08386172188066202, + "grad_norm": 0.828125, + "learning_rate": 0.00019787333564749666, + "loss": 1.1896, + "step": 3266 + }, + { + "epoch": 0.08388739907658384, + "grad_norm": 1.0234375, + "learning_rate": 0.0001978724197788989, + "loss": 1.1307, + "step": 3267 + }, + { + "epoch": 0.08391307627250566, + "grad_norm": 1.0, + "learning_rate": 0.00019787150371525022, + "loss": 1.2027, + "step": 3268 + }, + { + "epoch": 0.08393875346842748, + "grad_norm": 0.921875, + "learning_rate": 0.00019787058745655248, + "loss": 1.2061, + "step": 3269 + }, + { + "epoch": 0.0839644306643493, + "grad_norm": 0.95703125, + "learning_rate": 0.0001978696710028075, + "loss": 1.1829, + "step": 3270 + }, + { + "epoch": 0.08399010786027111, + "grad_norm": 0.9375, + "learning_rate": 0.0001978687543540171, + "loss": 1.425, + "step": 3271 + }, + { + "epoch": 0.08401578505619293, + "grad_norm": 0.87109375, + "learning_rate": 0.00019786783751018315, + "loss": 1.1366, + "step": 3272 + }, + { + "epoch": 0.08404146225211476, + "grad_norm": 0.8984375, + "learning_rate": 0.0001978669204713074, + "loss": 1.1734, + "step": 3273 + }, + { + "epoch": 0.08406713944803658, + "grad_norm": 1.046875, + "learning_rate": 0.00019786600323739174, + "loss": 1.2034, + "step": 3274 + }, + { + "epoch": 0.0840928166439584, + "grad_norm": 0.87109375, + "learning_rate": 0.00019786508580843797, + "loss": 1.1262, + "step": 3275 + }, + { + "epoch": 0.08411849383988021, + "grad_norm": 0.9296875, + "learning_rate": 0.00019786416818444794, + "loss": 1.1436, + "step": 3276 + }, + { + "epoch": 0.08414417103580203, + "grad_norm": 0.9375, + "learning_rate": 0.0001978632503654234, + "loss": 1.1173, + "step": 3277 + }, + { + "epoch": 0.08416984823172385, + "grad_norm": 0.9453125, + "learning_rate": 0.0001978623323513663, + "loss": 1.3724, + "step": 3278 + }, + { + "epoch": 0.08419552542764568, + "grad_norm": 0.859375, + "learning_rate": 0.0001978614141422784, + "loss": 1.3188, + "step": 3279 + }, + { + "epoch": 0.0842212026235675, + "grad_norm": 0.87109375, + "learning_rate": 0.00019786049573816157, + "loss": 1.2234, + "step": 3280 + }, + { + "epoch": 0.0842468798194893, + "grad_norm": 0.8203125, + "learning_rate": 0.00019785957713901758, + "loss": 1.1338, + "step": 3281 + }, + { + "epoch": 0.08427255701541113, + "grad_norm": 0.9140625, + "learning_rate": 0.00019785865834484828, + "loss": 1.2793, + "step": 3282 + }, + { + "epoch": 0.08429823421133295, + "grad_norm": 0.859375, + "learning_rate": 0.00019785773935565552, + "loss": 1.3279, + "step": 3283 + }, + { + "epoch": 0.08432391140725477, + "grad_norm": 0.921875, + "learning_rate": 0.00019785682017144115, + "loss": 1.0464, + "step": 3284 + }, + { + "epoch": 0.0843495886031766, + "grad_norm": 0.98828125, + "learning_rate": 0.00019785590079220695, + "loss": 1.0603, + "step": 3285 + }, + { + "epoch": 0.0843752657990984, + "grad_norm": 0.88671875, + "learning_rate": 0.00019785498121795478, + "loss": 1.2792, + "step": 3286 + }, + { + "epoch": 0.08440094299502023, + "grad_norm": 0.8984375, + "learning_rate": 0.0001978540614486865, + "loss": 1.1293, + "step": 3287 + }, + { + "epoch": 0.08442662019094205, + "grad_norm": 0.890625, + "learning_rate": 0.00019785314148440387, + "loss": 1.144, + "step": 3288 + }, + { + "epoch": 0.08445229738686387, + "grad_norm": 1.3125, + "learning_rate": 0.00019785222132510878, + "loss": 1.1997, + "step": 3289 + }, + { + "epoch": 0.08447797458278569, + "grad_norm": 0.921875, + "learning_rate": 0.00019785130097080308, + "loss": 1.4469, + "step": 3290 + }, + { + "epoch": 0.0845036517787075, + "grad_norm": 0.9296875, + "learning_rate": 0.00019785038042148857, + "loss": 1.1522, + "step": 3291 + }, + { + "epoch": 0.08452932897462932, + "grad_norm": 0.94921875, + "learning_rate": 0.0001978494596771671, + "loss": 1.3213, + "step": 3292 + }, + { + "epoch": 0.08455500617055114, + "grad_norm": 0.89453125, + "learning_rate": 0.00019784853873784045, + "loss": 1.2236, + "step": 3293 + }, + { + "epoch": 0.08458068336647297, + "grad_norm": 0.91015625, + "learning_rate": 0.00019784761760351053, + "loss": 1.0752, + "step": 3294 + }, + { + "epoch": 0.08460636056239479, + "grad_norm": 0.84375, + "learning_rate": 0.00019784669627417913, + "loss": 1.1213, + "step": 3295 + }, + { + "epoch": 0.0846320377583166, + "grad_norm": 1.09375, + "learning_rate": 0.00019784577474984814, + "loss": 1.1066, + "step": 3296 + }, + { + "epoch": 0.08465771495423842, + "grad_norm": 0.88671875, + "learning_rate": 0.00019784485303051935, + "loss": 1.3931, + "step": 3297 + }, + { + "epoch": 0.08468339215016024, + "grad_norm": 0.86328125, + "learning_rate": 0.00019784393111619457, + "loss": 1.2324, + "step": 3298 + }, + { + "epoch": 0.08470906934608206, + "grad_norm": 0.83203125, + "learning_rate": 0.00019784300900687569, + "loss": 1.0495, + "step": 3299 + }, + { + "epoch": 0.08473474654200389, + "grad_norm": 0.95703125, + "learning_rate": 0.00019784208670256456, + "loss": 1.185, + "step": 3300 + }, + { + "epoch": 0.0847604237379257, + "grad_norm": 0.875, + "learning_rate": 0.00019784116420326294, + "loss": 1.209, + "step": 3301 + }, + { + "epoch": 0.08478610093384752, + "grad_norm": 0.921875, + "learning_rate": 0.00019784024150897275, + "loss": 1.1104, + "step": 3302 + }, + { + "epoch": 0.08481177812976934, + "grad_norm": 0.86328125, + "learning_rate": 0.00019783931861969581, + "loss": 1.2195, + "step": 3303 + }, + { + "epoch": 0.08483745532569116, + "grad_norm": 0.83203125, + "learning_rate": 0.00019783839553543393, + "loss": 1.1154, + "step": 3304 + }, + { + "epoch": 0.08486313252161298, + "grad_norm": 0.89453125, + "learning_rate": 0.00019783747225618896, + "loss": 1.1747, + "step": 3305 + }, + { + "epoch": 0.08488880971753479, + "grad_norm": 0.875, + "learning_rate": 0.00019783654878196275, + "loss": 1.2178, + "step": 3306 + }, + { + "epoch": 0.08491448691345661, + "grad_norm": 0.9296875, + "learning_rate": 0.0001978356251127571, + "loss": 1.2756, + "step": 3307 + }, + { + "epoch": 0.08494016410937844, + "grad_norm": 0.890625, + "learning_rate": 0.00019783470124857394, + "loss": 1.0409, + "step": 3308 + }, + { + "epoch": 0.08496584130530026, + "grad_norm": 0.890625, + "learning_rate": 0.00019783377718941503, + "loss": 1.2614, + "step": 3309 + }, + { + "epoch": 0.08499151850122208, + "grad_norm": 0.9609375, + "learning_rate": 0.00019783285293528224, + "loss": 1.3144, + "step": 3310 + }, + { + "epoch": 0.08501719569714389, + "grad_norm": 0.8984375, + "learning_rate": 0.0001978319284861774, + "loss": 1.2056, + "step": 3311 + }, + { + "epoch": 0.08504287289306571, + "grad_norm": 0.90625, + "learning_rate": 0.00019783100384210243, + "loss": 1.3258, + "step": 3312 + }, + { + "epoch": 0.08506855008898753, + "grad_norm": 0.8671875, + "learning_rate": 0.00019783007900305903, + "loss": 1.215, + "step": 3313 + }, + { + "epoch": 0.08509422728490935, + "grad_norm": 0.86328125, + "learning_rate": 0.00019782915396904916, + "loss": 1.1779, + "step": 3314 + }, + { + "epoch": 0.08511990448083118, + "grad_norm": 0.92578125, + "learning_rate": 0.00019782822874007461, + "loss": 1.099, + "step": 3315 + }, + { + "epoch": 0.08514558167675298, + "grad_norm": 0.8828125, + "learning_rate": 0.00019782730331613724, + "loss": 1.1973, + "step": 3316 + }, + { + "epoch": 0.08517125887267481, + "grad_norm": 0.88671875, + "learning_rate": 0.0001978263776972389, + "loss": 1.1025, + "step": 3317 + }, + { + "epoch": 0.08519693606859663, + "grad_norm": 0.828125, + "learning_rate": 0.0001978254518833814, + "loss": 1.24, + "step": 3318 + }, + { + "epoch": 0.08522261326451845, + "grad_norm": 0.87890625, + "learning_rate": 0.0001978245258745666, + "loss": 1.2816, + "step": 3319 + }, + { + "epoch": 0.08524829046044027, + "grad_norm": 0.953125, + "learning_rate": 0.0001978235996707964, + "loss": 1.1593, + "step": 3320 + }, + { + "epoch": 0.08527396765636208, + "grad_norm": 0.828125, + "learning_rate": 0.00019782267327207256, + "loss": 1.2219, + "step": 3321 + }, + { + "epoch": 0.0852996448522839, + "grad_norm": 0.96484375, + "learning_rate": 0.00019782174667839703, + "loss": 1.1216, + "step": 3322 + }, + { + "epoch": 0.08532532204820573, + "grad_norm": 0.95703125, + "learning_rate": 0.00019782081988977155, + "loss": 1.4018, + "step": 3323 + }, + { + "epoch": 0.08535099924412755, + "grad_norm": 0.84375, + "learning_rate": 0.00019781989290619802, + "loss": 1.2271, + "step": 3324 + }, + { + "epoch": 0.08537667644004937, + "grad_norm": 1.1328125, + "learning_rate": 0.0001978189657276783, + "loss": 1.2062, + "step": 3325 + }, + { + "epoch": 0.08540235363597118, + "grad_norm": 0.9375, + "learning_rate": 0.00019781803835421417, + "loss": 1.4166, + "step": 3326 + }, + { + "epoch": 0.085428030831893, + "grad_norm": 0.8671875, + "learning_rate": 0.00019781711078580756, + "loss": 1.1514, + "step": 3327 + }, + { + "epoch": 0.08545370802781482, + "grad_norm": 0.90234375, + "learning_rate": 0.00019781618302246027, + "loss": 1.2469, + "step": 3328 + }, + { + "epoch": 0.08547938522373665, + "grad_norm": 0.96875, + "learning_rate": 0.00019781525506417418, + "loss": 1.3499, + "step": 3329 + }, + { + "epoch": 0.08550506241965847, + "grad_norm": 0.91015625, + "learning_rate": 0.0001978143269109511, + "loss": 1.3263, + "step": 3330 + }, + { + "epoch": 0.08553073961558028, + "grad_norm": 0.828125, + "learning_rate": 0.00019781339856279293, + "loss": 1.0786, + "step": 3331 + }, + { + "epoch": 0.0855564168115021, + "grad_norm": 0.9296875, + "learning_rate": 0.00019781247001970145, + "loss": 1.2106, + "step": 3332 + }, + { + "epoch": 0.08558209400742392, + "grad_norm": 0.859375, + "learning_rate": 0.0001978115412816786, + "loss": 1.32, + "step": 3333 + }, + { + "epoch": 0.08560777120334574, + "grad_norm": 0.94921875, + "learning_rate": 0.00019781061234872615, + "loss": 1.2388, + "step": 3334 + }, + { + "epoch": 0.08563344839926756, + "grad_norm": 0.87890625, + "learning_rate": 0.000197809683220846, + "loss": 1.1623, + "step": 3335 + }, + { + "epoch": 0.08565912559518937, + "grad_norm": 0.890625, + "learning_rate": 0.00019780875389804, + "loss": 1.2454, + "step": 3336 + }, + { + "epoch": 0.0856848027911112, + "grad_norm": 0.8515625, + "learning_rate": 0.00019780782438030996, + "loss": 1.2003, + "step": 3337 + }, + { + "epoch": 0.08571047998703302, + "grad_norm": 0.83203125, + "learning_rate": 0.00019780689466765777, + "loss": 1.2733, + "step": 3338 + }, + { + "epoch": 0.08573615718295484, + "grad_norm": 0.88671875, + "learning_rate": 0.00019780596476008525, + "loss": 1.4233, + "step": 3339 + }, + { + "epoch": 0.08576183437887666, + "grad_norm": 0.78125, + "learning_rate": 0.00019780503465759432, + "loss": 1.2411, + "step": 3340 + }, + { + "epoch": 0.08578751157479847, + "grad_norm": 0.84765625, + "learning_rate": 0.00019780410436018677, + "loss": 1.2151, + "step": 3341 + }, + { + "epoch": 0.08581318877072029, + "grad_norm": 0.8671875, + "learning_rate": 0.00019780317386786447, + "loss": 1.1143, + "step": 3342 + }, + { + "epoch": 0.08583886596664211, + "grad_norm": 0.9375, + "learning_rate": 0.0001978022431806293, + "loss": 1.1736, + "step": 3343 + }, + { + "epoch": 0.08586454316256394, + "grad_norm": 0.91015625, + "learning_rate": 0.00019780131229848305, + "loss": 1.1963, + "step": 3344 + }, + { + "epoch": 0.08589022035848576, + "grad_norm": 0.9140625, + "learning_rate": 0.00019780038122142767, + "loss": 1.188, + "step": 3345 + }, + { + "epoch": 0.08591589755440757, + "grad_norm": 0.91015625, + "learning_rate": 0.00019779944994946494, + "loss": 1.3588, + "step": 3346 + }, + { + "epoch": 0.08594157475032939, + "grad_norm": 0.8984375, + "learning_rate": 0.00019779851848259676, + "loss": 1.1109, + "step": 3347 + }, + { + "epoch": 0.08596725194625121, + "grad_norm": 0.83203125, + "learning_rate": 0.00019779758682082497, + "loss": 1.2063, + "step": 3348 + }, + { + "epoch": 0.08599292914217303, + "grad_norm": 0.97265625, + "learning_rate": 0.00019779665496415138, + "loss": 1.2861, + "step": 3349 + }, + { + "epoch": 0.08601860633809486, + "grad_norm": 0.84375, + "learning_rate": 0.00019779572291257793, + "loss": 1.235, + "step": 3350 + }, + { + "epoch": 0.08604428353401666, + "grad_norm": 0.85546875, + "learning_rate": 0.0001977947906661064, + "loss": 1.1216, + "step": 3351 + }, + { + "epoch": 0.08606996072993849, + "grad_norm": 0.82421875, + "learning_rate": 0.00019779385822473873, + "loss": 1.2477, + "step": 3352 + }, + { + "epoch": 0.08609563792586031, + "grad_norm": 0.87890625, + "learning_rate": 0.0001977929255884767, + "loss": 1.2243, + "step": 3353 + }, + { + "epoch": 0.08612131512178213, + "grad_norm": 0.87109375, + "learning_rate": 0.00019779199275732224, + "loss": 1.2065, + "step": 3354 + }, + { + "epoch": 0.08614699231770395, + "grad_norm": 0.88671875, + "learning_rate": 0.00019779105973127714, + "loss": 1.1895, + "step": 3355 + }, + { + "epoch": 0.08617266951362576, + "grad_norm": 0.8671875, + "learning_rate": 0.0001977901265103433, + "loss": 1.2017, + "step": 3356 + }, + { + "epoch": 0.08619834670954758, + "grad_norm": 0.8125, + "learning_rate": 0.00019778919309452255, + "loss": 1.0589, + "step": 3357 + }, + { + "epoch": 0.0862240239054694, + "grad_norm": 0.87890625, + "learning_rate": 0.0001977882594838168, + "loss": 1.1282, + "step": 3358 + }, + { + "epoch": 0.08624970110139123, + "grad_norm": 0.875, + "learning_rate": 0.0001977873256782279, + "loss": 1.3407, + "step": 3359 + }, + { + "epoch": 0.08627537829731304, + "grad_norm": 0.8046875, + "learning_rate": 0.00019778639167775764, + "loss": 1.1199, + "step": 3360 + }, + { + "epoch": 0.08630105549323486, + "grad_norm": 0.8984375, + "learning_rate": 0.00019778545748240799, + "loss": 1.1068, + "step": 3361 + }, + { + "epoch": 0.08632673268915668, + "grad_norm": 0.84375, + "learning_rate": 0.00019778452309218068, + "loss": 1.1324, + "step": 3362 + }, + { + "epoch": 0.0863524098850785, + "grad_norm": 0.8828125, + "learning_rate": 0.00019778358850707772, + "loss": 1.3116, + "step": 3363 + }, + { + "epoch": 0.08637808708100032, + "grad_norm": 0.87890625, + "learning_rate": 0.0001977826537271009, + "loss": 1.1414, + "step": 3364 + }, + { + "epoch": 0.08640376427692213, + "grad_norm": 0.796875, + "learning_rate": 0.00019778171875225203, + "loss": 1.0908, + "step": 3365 + }, + { + "epoch": 0.08642944147284395, + "grad_norm": 0.92578125, + "learning_rate": 0.00019778078358253308, + "loss": 1.1971, + "step": 3366 + }, + { + "epoch": 0.08645511866876578, + "grad_norm": 0.9765625, + "learning_rate": 0.0001977798482179458, + "loss": 1.2653, + "step": 3367 + }, + { + "epoch": 0.0864807958646876, + "grad_norm": 0.8984375, + "learning_rate": 0.00019777891265849217, + "loss": 1.1879, + "step": 3368 + }, + { + "epoch": 0.08650647306060942, + "grad_norm": 0.921875, + "learning_rate": 0.000197777976904174, + "loss": 1.1848, + "step": 3369 + }, + { + "epoch": 0.08653215025653123, + "grad_norm": 0.859375, + "learning_rate": 0.00019777704095499314, + "loss": 1.2273, + "step": 3370 + }, + { + "epoch": 0.08655782745245305, + "grad_norm": 0.8984375, + "learning_rate": 0.0001977761048109515, + "loss": 1.2121, + "step": 3371 + }, + { + "epoch": 0.08658350464837487, + "grad_norm": 0.90234375, + "learning_rate": 0.00019777516847205084, + "loss": 1.1381, + "step": 3372 + }, + { + "epoch": 0.0866091818442967, + "grad_norm": 0.8359375, + "learning_rate": 0.00019777423193829315, + "loss": 1.1687, + "step": 3373 + }, + { + "epoch": 0.08663485904021852, + "grad_norm": 0.8671875, + "learning_rate": 0.00019777329520968023, + "loss": 1.1278, + "step": 3374 + }, + { + "epoch": 0.08666053623614033, + "grad_norm": 0.91015625, + "learning_rate": 0.000197772358286214, + "loss": 1.1706, + "step": 3375 + }, + { + "epoch": 0.08668621343206215, + "grad_norm": 0.90625, + "learning_rate": 0.00019777142116789625, + "loss": 1.2967, + "step": 3376 + }, + { + "epoch": 0.08671189062798397, + "grad_norm": 0.921875, + "learning_rate": 0.0001977704838547289, + "loss": 1.068, + "step": 3377 + }, + { + "epoch": 0.08673756782390579, + "grad_norm": 0.875, + "learning_rate": 0.00019776954634671384, + "loss": 1.2258, + "step": 3378 + }, + { + "epoch": 0.08676324501982761, + "grad_norm": 1.0, + "learning_rate": 0.00019776860864385287, + "loss": 1.2474, + "step": 3379 + }, + { + "epoch": 0.08678892221574942, + "grad_norm": 0.86328125, + "learning_rate": 0.0001977676707461479, + "loss": 1.0664, + "step": 3380 + }, + { + "epoch": 0.08681459941167124, + "grad_norm": 0.84765625, + "learning_rate": 0.00019776673265360078, + "loss": 1.1986, + "step": 3381 + }, + { + "epoch": 0.08684027660759307, + "grad_norm": 0.84375, + "learning_rate": 0.00019776579436621343, + "loss": 1.1121, + "step": 3382 + }, + { + "epoch": 0.08686595380351489, + "grad_norm": 0.83203125, + "learning_rate": 0.00019776485588398766, + "loss": 1.081, + "step": 3383 + }, + { + "epoch": 0.08689163099943671, + "grad_norm": 0.8125, + "learning_rate": 0.00019776391720692534, + "loss": 1.1419, + "step": 3384 + }, + { + "epoch": 0.08691730819535852, + "grad_norm": 0.90625, + "learning_rate": 0.00019776297833502842, + "loss": 1.2476, + "step": 3385 + }, + { + "epoch": 0.08694298539128034, + "grad_norm": 0.90234375, + "learning_rate": 0.0001977620392682987, + "loss": 1.2424, + "step": 3386 + }, + { + "epoch": 0.08696866258720216, + "grad_norm": 0.7890625, + "learning_rate": 0.00019776110000673805, + "loss": 1.1877, + "step": 3387 + }, + { + "epoch": 0.08699433978312399, + "grad_norm": 0.90234375, + "learning_rate": 0.00019776016055034833, + "loss": 1.2428, + "step": 3388 + }, + { + "epoch": 0.08702001697904581, + "grad_norm": 0.890625, + "learning_rate": 0.0001977592208991315, + "loss": 1.103, + "step": 3389 + }, + { + "epoch": 0.08704569417496762, + "grad_norm": 0.828125, + "learning_rate": 0.00019775828105308933, + "loss": 1.1544, + "step": 3390 + }, + { + "epoch": 0.08707137137088944, + "grad_norm": 0.97265625, + "learning_rate": 0.00019775734101222376, + "loss": 1.3028, + "step": 3391 + }, + { + "epoch": 0.08709704856681126, + "grad_norm": 0.88671875, + "learning_rate": 0.00019775640077653663, + "loss": 1.4075, + "step": 3392 + }, + { + "epoch": 0.08712272576273308, + "grad_norm": 0.93359375, + "learning_rate": 0.00019775546034602983, + "loss": 1.1422, + "step": 3393 + }, + { + "epoch": 0.0871484029586549, + "grad_norm": 0.8046875, + "learning_rate": 0.0001977545197207052, + "loss": 1.0718, + "step": 3394 + }, + { + "epoch": 0.08717408015457671, + "grad_norm": 0.83203125, + "learning_rate": 0.00019775357890056467, + "loss": 1.2613, + "step": 3395 + }, + { + "epoch": 0.08719975735049854, + "grad_norm": 0.82421875, + "learning_rate": 0.0001977526378856101, + "loss": 1.099, + "step": 3396 + }, + { + "epoch": 0.08722543454642036, + "grad_norm": 0.92578125, + "learning_rate": 0.00019775169667584331, + "loss": 1.2679, + "step": 3397 + }, + { + "epoch": 0.08725111174234218, + "grad_norm": 0.87890625, + "learning_rate": 0.00019775075527126625, + "loss": 1.2637, + "step": 3398 + }, + { + "epoch": 0.087276788938264, + "grad_norm": 0.84375, + "learning_rate": 0.00019774981367188076, + "loss": 1.3208, + "step": 3399 + }, + { + "epoch": 0.08730246613418581, + "grad_norm": 0.8203125, + "learning_rate": 0.0001977488718776887, + "loss": 1.073, + "step": 3400 + }, + { + "epoch": 0.08732814333010763, + "grad_norm": 0.91796875, + "learning_rate": 0.000197747929888692, + "loss": 1.294, + "step": 3401 + }, + { + "epoch": 0.08735382052602945, + "grad_norm": 0.94921875, + "learning_rate": 0.0001977469877048925, + "loss": 1.2477, + "step": 3402 + }, + { + "epoch": 0.08737949772195128, + "grad_norm": 0.87109375, + "learning_rate": 0.0001977460453262921, + "loss": 1.109, + "step": 3403 + }, + { + "epoch": 0.0874051749178731, + "grad_norm": 0.89453125, + "learning_rate": 0.00019774510275289263, + "loss": 1.1977, + "step": 3404 + }, + { + "epoch": 0.08743085211379491, + "grad_norm": 0.80859375, + "learning_rate": 0.000197744159984696, + "loss": 1.2558, + "step": 3405 + }, + { + "epoch": 0.08745652930971673, + "grad_norm": 0.8828125, + "learning_rate": 0.00019774321702170408, + "loss": 1.3438, + "step": 3406 + }, + { + "epoch": 0.08748220650563855, + "grad_norm": 0.8359375, + "learning_rate": 0.00019774227386391876, + "loss": 0.9025, + "step": 3407 + }, + { + "epoch": 0.08750788370156037, + "grad_norm": 0.86328125, + "learning_rate": 0.00019774133051134193, + "loss": 1.1197, + "step": 3408 + }, + { + "epoch": 0.0875335608974822, + "grad_norm": 0.9375, + "learning_rate": 0.00019774038696397547, + "loss": 1.3373, + "step": 3409 + }, + { + "epoch": 0.087559238093404, + "grad_norm": 0.88671875, + "learning_rate": 0.00019773944322182122, + "loss": 1.2295, + "step": 3410 + }, + { + "epoch": 0.08758491528932583, + "grad_norm": 1.1328125, + "learning_rate": 0.0001977384992848811, + "loss": 1.236, + "step": 3411 + }, + { + "epoch": 0.08761059248524765, + "grad_norm": 0.83203125, + "learning_rate": 0.000197737555153157, + "loss": 1.0346, + "step": 3412 + }, + { + "epoch": 0.08763626968116947, + "grad_norm": 0.91015625, + "learning_rate": 0.00019773661082665077, + "loss": 1.377, + "step": 3413 + }, + { + "epoch": 0.08766194687709129, + "grad_norm": 0.8359375, + "learning_rate": 0.00019773566630536425, + "loss": 1.1271, + "step": 3414 + }, + { + "epoch": 0.0876876240730131, + "grad_norm": 0.89453125, + "learning_rate": 0.00019773472158929946, + "loss": 1.1339, + "step": 3415 + }, + { + "epoch": 0.08771330126893492, + "grad_norm": 0.77734375, + "learning_rate": 0.00019773377667845816, + "loss": 1.0705, + "step": 3416 + }, + { + "epoch": 0.08773897846485675, + "grad_norm": 0.77734375, + "learning_rate": 0.00019773283157284227, + "loss": 1.0183, + "step": 3417 + }, + { + "epoch": 0.08776465566077857, + "grad_norm": 0.84765625, + "learning_rate": 0.00019773188627245368, + "loss": 1.1152, + "step": 3418 + }, + { + "epoch": 0.08779033285670039, + "grad_norm": 0.90234375, + "learning_rate": 0.00019773094077729425, + "loss": 1.116, + "step": 3419 + }, + { + "epoch": 0.0878160100526222, + "grad_norm": 0.875, + "learning_rate": 0.0001977299950873659, + "loss": 1.2162, + "step": 3420 + }, + { + "epoch": 0.08784168724854402, + "grad_norm": 0.84765625, + "learning_rate": 0.00019772904920267048, + "loss": 1.1196, + "step": 3421 + }, + { + "epoch": 0.08786736444446584, + "grad_norm": 0.88671875, + "learning_rate": 0.00019772810312320994, + "loss": 1.2709, + "step": 3422 + }, + { + "epoch": 0.08789304164038766, + "grad_norm": 0.96875, + "learning_rate": 0.00019772715684898606, + "loss": 1.3344, + "step": 3423 + }, + { + "epoch": 0.08791871883630949, + "grad_norm": 0.83984375, + "learning_rate": 0.0001977262103800008, + "loss": 1.1656, + "step": 3424 + }, + { + "epoch": 0.0879443960322313, + "grad_norm": 0.84375, + "learning_rate": 0.00019772526371625605, + "loss": 1.075, + "step": 3425 + }, + { + "epoch": 0.08797007322815312, + "grad_norm": 0.8515625, + "learning_rate": 0.00019772431685775366, + "loss": 1.1003, + "step": 3426 + }, + { + "epoch": 0.08799575042407494, + "grad_norm": 0.87890625, + "learning_rate": 0.00019772336980449556, + "loss": 1.2277, + "step": 3427 + }, + { + "epoch": 0.08802142761999676, + "grad_norm": 0.859375, + "learning_rate": 0.0001977224225564836, + "loss": 1.2231, + "step": 3428 + }, + { + "epoch": 0.08804710481591858, + "grad_norm": 0.859375, + "learning_rate": 0.00019772147511371965, + "loss": 1.2706, + "step": 3429 + }, + { + "epoch": 0.08807278201184039, + "grad_norm": 0.94921875, + "learning_rate": 0.00019772052747620564, + "loss": 1.3283, + "step": 3430 + }, + { + "epoch": 0.08809845920776221, + "grad_norm": 1.0390625, + "learning_rate": 0.00019771957964394345, + "loss": 1.1445, + "step": 3431 + }, + { + "epoch": 0.08812413640368404, + "grad_norm": 0.8984375, + "learning_rate": 0.000197718631616935, + "loss": 1.3331, + "step": 3432 + }, + { + "epoch": 0.08814981359960586, + "grad_norm": 0.796875, + "learning_rate": 0.0001977176833951821, + "loss": 1.2168, + "step": 3433 + }, + { + "epoch": 0.08817549079552768, + "grad_norm": 0.9140625, + "learning_rate": 0.0001977167349786867, + "loss": 1.0645, + "step": 3434 + }, + { + "epoch": 0.08820116799144949, + "grad_norm": 0.88671875, + "learning_rate": 0.00019771578636745068, + "loss": 1.1214, + "step": 3435 + }, + { + "epoch": 0.08822684518737131, + "grad_norm": 0.94140625, + "learning_rate": 0.00019771483756147592, + "loss": 1.1401, + "step": 3436 + }, + { + "epoch": 0.08825252238329313, + "grad_norm": 0.875, + "learning_rate": 0.0001977138885607643, + "loss": 1.0407, + "step": 3437 + }, + { + "epoch": 0.08827819957921496, + "grad_norm": 0.9296875, + "learning_rate": 0.00019771293936531774, + "loss": 1.278, + "step": 3438 + }, + { + "epoch": 0.08830387677513678, + "grad_norm": 0.87890625, + "learning_rate": 0.0001977119899751381, + "loss": 1.2765, + "step": 3439 + }, + { + "epoch": 0.08832955397105859, + "grad_norm": 0.90234375, + "learning_rate": 0.00019771104039022731, + "loss": 1.2111, + "step": 3440 + }, + { + "epoch": 0.08835523116698041, + "grad_norm": 0.75390625, + "learning_rate": 0.00019771009061058723, + "loss": 1.0282, + "step": 3441 + }, + { + "epoch": 0.08838090836290223, + "grad_norm": 0.8359375, + "learning_rate": 0.00019770914063621978, + "loss": 1.3079, + "step": 3442 + }, + { + "epoch": 0.08840658555882405, + "grad_norm": 0.85546875, + "learning_rate": 0.00019770819046712682, + "loss": 1.2296, + "step": 3443 + }, + { + "epoch": 0.08843226275474587, + "grad_norm": 0.89453125, + "learning_rate": 0.0001977072401033103, + "loss": 1.3098, + "step": 3444 + }, + { + "epoch": 0.08845793995066768, + "grad_norm": 0.90234375, + "learning_rate": 0.00019770628954477205, + "loss": 1.4024, + "step": 3445 + }, + { + "epoch": 0.0884836171465895, + "grad_norm": 0.8984375, + "learning_rate": 0.00019770533879151397, + "loss": 1.0906, + "step": 3446 + }, + { + "epoch": 0.08850929434251133, + "grad_norm": 0.90234375, + "learning_rate": 0.000197704387843538, + "loss": 1.1841, + "step": 3447 + }, + { + "epoch": 0.08853497153843315, + "grad_norm": 0.92578125, + "learning_rate": 0.00019770343670084603, + "loss": 1.1031, + "step": 3448 + }, + { + "epoch": 0.08856064873435497, + "grad_norm": 0.80078125, + "learning_rate": 0.00019770248536343988, + "loss": 1.2232, + "step": 3449 + }, + { + "epoch": 0.08858632593027678, + "grad_norm": 0.85546875, + "learning_rate": 0.00019770153383132154, + "loss": 1.2045, + "step": 3450 + }, + { + "epoch": 0.0886120031261986, + "grad_norm": 0.859375, + "learning_rate": 0.00019770058210449287, + "loss": 1.1571, + "step": 3451 + }, + { + "epoch": 0.08863768032212042, + "grad_norm": 0.921875, + "learning_rate": 0.00019769963018295575, + "loss": 1.2859, + "step": 3452 + }, + { + "epoch": 0.08866335751804225, + "grad_norm": 0.890625, + "learning_rate": 0.00019769867806671213, + "loss": 1.095, + "step": 3453 + }, + { + "epoch": 0.08868903471396407, + "grad_norm": 0.890625, + "learning_rate": 0.00019769772575576385, + "loss": 1.2418, + "step": 3454 + }, + { + "epoch": 0.08871471190988588, + "grad_norm": 0.8828125, + "learning_rate": 0.0001976967732501128, + "loss": 1.2457, + "step": 3455 + }, + { + "epoch": 0.0887403891058077, + "grad_norm": 0.9296875, + "learning_rate": 0.00019769582054976093, + "loss": 1.2137, + "step": 3456 + }, + { + "epoch": 0.08876606630172952, + "grad_norm": 0.94140625, + "learning_rate": 0.00019769486765471012, + "loss": 1.204, + "step": 3457 + }, + { + "epoch": 0.08879174349765134, + "grad_norm": 0.91015625, + "learning_rate": 0.00019769391456496223, + "loss": 1.1165, + "step": 3458 + }, + { + "epoch": 0.08881742069357317, + "grad_norm": 0.82421875, + "learning_rate": 0.0001976929612805192, + "loss": 1.1577, + "step": 3459 + }, + { + "epoch": 0.08884309788949497, + "grad_norm": 0.84375, + "learning_rate": 0.00019769200780138298, + "loss": 1.2467, + "step": 3460 + }, + { + "epoch": 0.0888687750854168, + "grad_norm": 0.7890625, + "learning_rate": 0.00019769105412755536, + "loss": 1.2336, + "step": 3461 + }, + { + "epoch": 0.08889445228133862, + "grad_norm": 0.80859375, + "learning_rate": 0.00019769010025903828, + "loss": 1.1852, + "step": 3462 + }, + { + "epoch": 0.08892012947726044, + "grad_norm": 0.8203125, + "learning_rate": 0.00019768914619583373, + "loss": 1.0817, + "step": 3463 + }, + { + "epoch": 0.08894580667318225, + "grad_norm": 0.87890625, + "learning_rate": 0.00019768819193794347, + "loss": 1.0664, + "step": 3464 + }, + { + "epoch": 0.08897148386910407, + "grad_norm": 0.83984375, + "learning_rate": 0.00019768723748536947, + "loss": 1.1639, + "step": 3465 + }, + { + "epoch": 0.08899716106502589, + "grad_norm": 0.875, + "learning_rate": 0.00019768628283811364, + "loss": 1.1187, + "step": 3466 + }, + { + "epoch": 0.08902283826094771, + "grad_norm": 0.9609375, + "learning_rate": 0.0001976853279961779, + "loss": 1.2422, + "step": 3467 + }, + { + "epoch": 0.08904851545686954, + "grad_norm": 0.875, + "learning_rate": 0.00019768437295956408, + "loss": 1.2136, + "step": 3468 + }, + { + "epoch": 0.08907419265279135, + "grad_norm": 0.83203125, + "learning_rate": 0.00019768341772827416, + "loss": 1.0745, + "step": 3469 + }, + { + "epoch": 0.08909986984871317, + "grad_norm": 0.875, + "learning_rate": 0.00019768246230231, + "loss": 1.183, + "step": 3470 + }, + { + "epoch": 0.08912554704463499, + "grad_norm": 0.84765625, + "learning_rate": 0.0001976815066816735, + "loss": 1.3272, + "step": 3471 + }, + { + "epoch": 0.08915122424055681, + "grad_norm": 0.859375, + "learning_rate": 0.00019768055086636662, + "loss": 1.1612, + "step": 3472 + }, + { + "epoch": 0.08917690143647863, + "grad_norm": 0.8203125, + "learning_rate": 0.0001976795948563912, + "loss": 1.2123, + "step": 3473 + }, + { + "epoch": 0.08920257863240044, + "grad_norm": 0.90234375, + "learning_rate": 0.00019767863865174915, + "loss": 1.3453, + "step": 3474 + }, + { + "epoch": 0.08922825582832226, + "grad_norm": 0.90234375, + "learning_rate": 0.00019767768225244242, + "loss": 1.2323, + "step": 3475 + }, + { + "epoch": 0.08925393302424409, + "grad_norm": 0.875, + "learning_rate": 0.00019767672565847294, + "loss": 1.2274, + "step": 3476 + }, + { + "epoch": 0.08927961022016591, + "grad_norm": 0.91796875, + "learning_rate": 0.0001976757688698425, + "loss": 1.2835, + "step": 3477 + }, + { + "epoch": 0.08930528741608773, + "grad_norm": 0.859375, + "learning_rate": 0.0001976748118865531, + "loss": 1.167, + "step": 3478 + }, + { + "epoch": 0.08933096461200954, + "grad_norm": 0.84765625, + "learning_rate": 0.0001976738547086066, + "loss": 1.3696, + "step": 3479 + }, + { + "epoch": 0.08935664180793136, + "grad_norm": 0.88671875, + "learning_rate": 0.00019767289733600495, + "loss": 1.2541, + "step": 3480 + }, + { + "epoch": 0.08938231900385318, + "grad_norm": 0.8515625, + "learning_rate": 0.00019767193976875005, + "loss": 1.1841, + "step": 3481 + }, + { + "epoch": 0.089407996199775, + "grad_norm": 0.8671875, + "learning_rate": 0.00019767098200684378, + "loss": 1.3088, + "step": 3482 + }, + { + "epoch": 0.08943367339569683, + "grad_norm": 0.81640625, + "learning_rate": 0.0001976700240502881, + "loss": 1.1758, + "step": 3483 + }, + { + "epoch": 0.08945935059161864, + "grad_norm": 0.91015625, + "learning_rate": 0.00019766906589908483, + "loss": 1.2384, + "step": 3484 + }, + { + "epoch": 0.08948502778754046, + "grad_norm": 0.86328125, + "learning_rate": 0.00019766810755323598, + "loss": 1.2955, + "step": 3485 + }, + { + "epoch": 0.08951070498346228, + "grad_norm": 0.87890625, + "learning_rate": 0.0001976671490127434, + "loss": 1.2288, + "step": 3486 + }, + { + "epoch": 0.0895363821793841, + "grad_norm": 1.09375, + "learning_rate": 0.000197666190277609, + "loss": 1.0241, + "step": 3487 + }, + { + "epoch": 0.08956205937530592, + "grad_norm": 0.8671875, + "learning_rate": 0.0001976652313478347, + "loss": 1.3169, + "step": 3488 + }, + { + "epoch": 0.08958773657122773, + "grad_norm": 0.87890625, + "learning_rate": 0.00019766427222342244, + "loss": 1.2537, + "step": 3489 + }, + { + "epoch": 0.08961341376714956, + "grad_norm": 0.79296875, + "learning_rate": 0.00019766331290437406, + "loss": 1.2139, + "step": 3490 + }, + { + "epoch": 0.08963909096307138, + "grad_norm": 0.8671875, + "learning_rate": 0.00019766235339069158, + "loss": 1.1276, + "step": 3491 + }, + { + "epoch": 0.0896647681589932, + "grad_norm": 0.81640625, + "learning_rate": 0.00019766139368237684, + "loss": 1.1526, + "step": 3492 + }, + { + "epoch": 0.08969044535491502, + "grad_norm": 0.86328125, + "learning_rate": 0.00019766043377943176, + "loss": 1.1978, + "step": 3493 + }, + { + "epoch": 0.08971612255083683, + "grad_norm": 0.93359375, + "learning_rate": 0.0001976594736818582, + "loss": 1.2248, + "step": 3494 + }, + { + "epoch": 0.08974179974675865, + "grad_norm": 0.81640625, + "learning_rate": 0.0001976585133896582, + "loss": 1.1323, + "step": 3495 + }, + { + "epoch": 0.08976747694268047, + "grad_norm": 0.828125, + "learning_rate": 0.00019765755290283358, + "loss": 1.137, + "step": 3496 + }, + { + "epoch": 0.0897931541386023, + "grad_norm": 0.85546875, + "learning_rate": 0.00019765659222138632, + "loss": 1.2348, + "step": 3497 + }, + { + "epoch": 0.08981883133452412, + "grad_norm": 0.859375, + "learning_rate": 0.0001976556313453182, + "loss": 1.1223, + "step": 3498 + }, + { + "epoch": 0.08984450853044593, + "grad_norm": 0.87890625, + "learning_rate": 0.0001976546702746313, + "loss": 1.252, + "step": 3499 + }, + { + "epoch": 0.08987018572636775, + "grad_norm": 0.90234375, + "learning_rate": 0.00019765370900932746, + "loss": 1.2062, + "step": 3500 + }, + { + "epoch": 0.08989586292228957, + "grad_norm": 0.80078125, + "learning_rate": 0.0001976527475494086, + "loss": 1.1739, + "step": 3501 + }, + { + "epoch": 0.0899215401182114, + "grad_norm": 0.9296875, + "learning_rate": 0.0001976517858948766, + "loss": 1.244, + "step": 3502 + }, + { + "epoch": 0.08994721731413322, + "grad_norm": 0.9453125, + "learning_rate": 0.00019765082404573346, + "loss": 1.1425, + "step": 3503 + }, + { + "epoch": 0.08997289451005502, + "grad_norm": 0.8359375, + "learning_rate": 0.00019764986200198102, + "loss": 1.1702, + "step": 3504 + }, + { + "epoch": 0.08999857170597685, + "grad_norm": 0.875, + "learning_rate": 0.0001976488997636212, + "loss": 1.2471, + "step": 3505 + }, + { + "epoch": 0.09002424890189867, + "grad_norm": 1.1796875, + "learning_rate": 0.00019764793733065598, + "loss": 1.2444, + "step": 3506 + }, + { + "epoch": 0.09004992609782049, + "grad_norm": 0.9140625, + "learning_rate": 0.00019764697470308723, + "loss": 1.2147, + "step": 3507 + }, + { + "epoch": 0.09007560329374231, + "grad_norm": 0.90234375, + "learning_rate": 0.00019764601188091688, + "loss": 1.2127, + "step": 3508 + }, + { + "epoch": 0.09010128048966412, + "grad_norm": 0.85546875, + "learning_rate": 0.00019764504886414684, + "loss": 1.1834, + "step": 3509 + }, + { + "epoch": 0.09012695768558594, + "grad_norm": 0.90625, + "learning_rate": 0.00019764408565277903, + "loss": 1.2221, + "step": 3510 + }, + { + "epoch": 0.09015263488150776, + "grad_norm": 0.890625, + "learning_rate": 0.0001976431222468154, + "loss": 1.1683, + "step": 3511 + }, + { + "epoch": 0.09017831207742959, + "grad_norm": 0.91015625, + "learning_rate": 0.00019764215864625785, + "loss": 1.0982, + "step": 3512 + }, + { + "epoch": 0.09020398927335141, + "grad_norm": 0.890625, + "learning_rate": 0.00019764119485110828, + "loss": 1.2546, + "step": 3513 + }, + { + "epoch": 0.09022966646927322, + "grad_norm": 0.87890625, + "learning_rate": 0.00019764023086136864, + "loss": 1.1547, + "step": 3514 + }, + { + "epoch": 0.09025534366519504, + "grad_norm": 0.86328125, + "learning_rate": 0.00019763926667704085, + "loss": 1.2131, + "step": 3515 + }, + { + "epoch": 0.09028102086111686, + "grad_norm": 0.8828125, + "learning_rate": 0.0001976383022981268, + "loss": 1.2215, + "step": 3516 + }, + { + "epoch": 0.09030669805703868, + "grad_norm": 0.83984375, + "learning_rate": 0.00019763733772462845, + "loss": 1.1932, + "step": 3517 + }, + { + "epoch": 0.0903323752529605, + "grad_norm": 0.87890625, + "learning_rate": 0.0001976363729565477, + "loss": 1.3536, + "step": 3518 + }, + { + "epoch": 0.09035805244888231, + "grad_norm": 0.88671875, + "learning_rate": 0.00019763540799388647, + "loss": 1.1482, + "step": 3519 + }, + { + "epoch": 0.09038372964480414, + "grad_norm": 0.89453125, + "learning_rate": 0.0001976344428366467, + "loss": 1.3066, + "step": 3520 + }, + { + "epoch": 0.09040940684072596, + "grad_norm": 0.828125, + "learning_rate": 0.0001976334774848303, + "loss": 1.0399, + "step": 3521 + }, + { + "epoch": 0.09043508403664778, + "grad_norm": 0.9296875, + "learning_rate": 0.0001976325119384392, + "loss": 1.193, + "step": 3522 + }, + { + "epoch": 0.0904607612325696, + "grad_norm": 0.90625, + "learning_rate": 0.00019763154619747533, + "loss": 1.1619, + "step": 3523 + }, + { + "epoch": 0.09048643842849141, + "grad_norm": 1.0546875, + "learning_rate": 0.0001976305802619406, + "loss": 1.178, + "step": 3524 + }, + { + "epoch": 0.09051211562441323, + "grad_norm": 0.8359375, + "learning_rate": 0.00019762961413183694, + "loss": 1.0072, + "step": 3525 + }, + { + "epoch": 0.09053779282033506, + "grad_norm": 0.87890625, + "learning_rate": 0.00019762864780716627, + "loss": 1.1181, + "step": 3526 + }, + { + "epoch": 0.09056347001625688, + "grad_norm": 0.86328125, + "learning_rate": 0.00019762768128793055, + "loss": 1.2408, + "step": 3527 + }, + { + "epoch": 0.0905891472121787, + "grad_norm": 0.87890625, + "learning_rate": 0.00019762671457413165, + "loss": 1.2244, + "step": 3528 + }, + { + "epoch": 0.09061482440810051, + "grad_norm": 0.9609375, + "learning_rate": 0.00019762574766577154, + "loss": 1.2748, + "step": 3529 + }, + { + "epoch": 0.09064050160402233, + "grad_norm": 0.9296875, + "learning_rate": 0.00019762478056285215, + "loss": 1.2724, + "step": 3530 + }, + { + "epoch": 0.09066617879994415, + "grad_norm": 0.87890625, + "learning_rate": 0.0001976238132653754, + "loss": 1.095, + "step": 3531 + }, + { + "epoch": 0.09069185599586597, + "grad_norm": 1.015625, + "learning_rate": 0.00019762284577334315, + "loss": 1.1536, + "step": 3532 + }, + { + "epoch": 0.0907175331917878, + "grad_norm": 7.0625, + "learning_rate": 0.0001976218780867574, + "loss": 1.2239, + "step": 3533 + }, + { + "epoch": 0.0907432103877096, + "grad_norm": 0.93359375, + "learning_rate": 0.0001976209102056201, + "loss": 1.2654, + "step": 3534 + }, + { + "epoch": 0.09076888758363143, + "grad_norm": 0.84375, + "learning_rate": 0.00019761994212993313, + "loss": 1.0803, + "step": 3535 + }, + { + "epoch": 0.09079456477955325, + "grad_norm": 0.8359375, + "learning_rate": 0.00019761897385969844, + "loss": 1.2197, + "step": 3536 + }, + { + "epoch": 0.09082024197547507, + "grad_norm": 0.83984375, + "learning_rate": 0.00019761800539491796, + "loss": 1.051, + "step": 3537 + }, + { + "epoch": 0.0908459191713969, + "grad_norm": 1.3203125, + "learning_rate": 0.00019761703673559357, + "loss": 1.0659, + "step": 3538 + }, + { + "epoch": 0.0908715963673187, + "grad_norm": 0.91015625, + "learning_rate": 0.0001976160678817273, + "loss": 1.1237, + "step": 3539 + }, + { + "epoch": 0.09089727356324052, + "grad_norm": 0.91796875, + "learning_rate": 0.00019761509883332097, + "loss": 1.342, + "step": 3540 + }, + { + "epoch": 0.09092295075916235, + "grad_norm": 0.95703125, + "learning_rate": 0.0001976141295903766, + "loss": 1.2189, + "step": 3541 + }, + { + "epoch": 0.09094862795508417, + "grad_norm": 0.99609375, + "learning_rate": 0.0001976131601528961, + "loss": 1.244, + "step": 3542 + }, + { + "epoch": 0.09097430515100599, + "grad_norm": 0.98046875, + "learning_rate": 0.00019761219052088133, + "loss": 1.2537, + "step": 3543 + }, + { + "epoch": 0.0909999823469278, + "grad_norm": 0.97265625, + "learning_rate": 0.00019761122069433435, + "loss": 1.2231, + "step": 3544 + }, + { + "epoch": 0.09102565954284962, + "grad_norm": 0.91796875, + "learning_rate": 0.00019761025067325697, + "loss": 1.2404, + "step": 3545 + }, + { + "epoch": 0.09105133673877144, + "grad_norm": 0.8203125, + "learning_rate": 0.0001976092804576512, + "loss": 1.3409, + "step": 3546 + }, + { + "epoch": 0.09107701393469327, + "grad_norm": 0.94921875, + "learning_rate": 0.00019760831004751896, + "loss": 1.2185, + "step": 3547 + }, + { + "epoch": 0.09110269113061509, + "grad_norm": 0.90625, + "learning_rate": 0.00019760733944286217, + "loss": 1.3951, + "step": 3548 + }, + { + "epoch": 0.0911283683265369, + "grad_norm": 0.9296875, + "learning_rate": 0.0001976063686436828, + "loss": 1.2014, + "step": 3549 + }, + { + "epoch": 0.09115404552245872, + "grad_norm": 0.8828125, + "learning_rate": 0.0001976053976499827, + "loss": 1.2398, + "step": 3550 + }, + { + "epoch": 0.09117972271838054, + "grad_norm": 0.921875, + "learning_rate": 0.00019760442646176387, + "loss": 1.2559, + "step": 3551 + }, + { + "epoch": 0.09120539991430236, + "grad_norm": 0.84375, + "learning_rate": 0.00019760345507902828, + "loss": 1.2502, + "step": 3552 + }, + { + "epoch": 0.09123107711022418, + "grad_norm": 0.90234375, + "learning_rate": 0.00019760248350177774, + "loss": 1.2942, + "step": 3553 + }, + { + "epoch": 0.09125675430614599, + "grad_norm": 0.88671875, + "learning_rate": 0.00019760151173001434, + "loss": 1.2311, + "step": 3554 + }, + { + "epoch": 0.09128243150206782, + "grad_norm": 0.84765625, + "learning_rate": 0.0001976005397637399, + "loss": 1.1235, + "step": 3555 + }, + { + "epoch": 0.09130810869798964, + "grad_norm": 0.92578125, + "learning_rate": 0.0001975995676029564, + "loss": 1.2001, + "step": 3556 + }, + { + "epoch": 0.09133378589391146, + "grad_norm": 0.85546875, + "learning_rate": 0.00019759859524766583, + "loss": 1.2287, + "step": 3557 + }, + { + "epoch": 0.09135946308983328, + "grad_norm": 0.84375, + "learning_rate": 0.00019759762269787, + "loss": 1.268, + "step": 3558 + }, + { + "epoch": 0.09138514028575509, + "grad_norm": 0.89453125, + "learning_rate": 0.000197596649953571, + "loss": 1.2827, + "step": 3559 + }, + { + "epoch": 0.09141081748167691, + "grad_norm": 0.94140625, + "learning_rate": 0.00019759567701477064, + "loss": 1.3658, + "step": 3560 + }, + { + "epoch": 0.09143649467759873, + "grad_norm": 1.125, + "learning_rate": 0.00019759470388147095, + "loss": 1.241, + "step": 3561 + }, + { + "epoch": 0.09146217187352056, + "grad_norm": 0.8671875, + "learning_rate": 0.0001975937305536738, + "loss": 1.0729, + "step": 3562 + }, + { + "epoch": 0.09148784906944236, + "grad_norm": 0.7890625, + "learning_rate": 0.00019759275703138115, + "loss": 1.1098, + "step": 3563 + }, + { + "epoch": 0.09151352626536419, + "grad_norm": 0.875, + "learning_rate": 0.000197591783314595, + "loss": 1.2655, + "step": 3564 + }, + { + "epoch": 0.09153920346128601, + "grad_norm": 0.8046875, + "learning_rate": 0.0001975908094033172, + "loss": 1.2426, + "step": 3565 + }, + { + "epoch": 0.09156488065720783, + "grad_norm": 0.8203125, + "learning_rate": 0.00019758983529754974, + "loss": 1.0285, + "step": 3566 + }, + { + "epoch": 0.09159055785312965, + "grad_norm": 0.87890625, + "learning_rate": 0.00019758886099729454, + "loss": 1.1256, + "step": 3567 + }, + { + "epoch": 0.09161623504905146, + "grad_norm": 0.96875, + "learning_rate": 0.00019758788650255357, + "loss": 1.0601, + "step": 3568 + }, + { + "epoch": 0.09164191224497328, + "grad_norm": 2.15625, + "learning_rate": 0.00019758691181332875, + "loss": 1.244, + "step": 3569 + }, + { + "epoch": 0.0916675894408951, + "grad_norm": 0.9921875, + "learning_rate": 0.00019758593692962206, + "loss": 1.2478, + "step": 3570 + }, + { + "epoch": 0.09169326663681693, + "grad_norm": 0.8203125, + "learning_rate": 0.00019758496185143538, + "loss": 1.1113, + "step": 3571 + }, + { + "epoch": 0.09171894383273875, + "grad_norm": 0.87109375, + "learning_rate": 0.0001975839865787707, + "loss": 1.1232, + "step": 3572 + }, + { + "epoch": 0.09174462102866056, + "grad_norm": 0.96484375, + "learning_rate": 0.00019758301111162996, + "loss": 1.2238, + "step": 3573 + }, + { + "epoch": 0.09177029822458238, + "grad_norm": 0.88671875, + "learning_rate": 0.00019758203545001507, + "loss": 1.2019, + "step": 3574 + }, + { + "epoch": 0.0917959754205042, + "grad_norm": 0.9296875, + "learning_rate": 0.000197581059593928, + "loss": 1.3095, + "step": 3575 + }, + { + "epoch": 0.09182165261642602, + "grad_norm": 0.828125, + "learning_rate": 0.00019758008354337072, + "loss": 1.0971, + "step": 3576 + }, + { + "epoch": 0.09184732981234785, + "grad_norm": 0.859375, + "learning_rate": 0.00019757910729834512, + "loss": 1.2007, + "step": 3577 + }, + { + "epoch": 0.09187300700826966, + "grad_norm": 0.8515625, + "learning_rate": 0.00019757813085885317, + "loss": 1.3019, + "step": 3578 + }, + { + "epoch": 0.09189868420419148, + "grad_norm": 1.2421875, + "learning_rate": 0.00019757715422489685, + "loss": 1.1733, + "step": 3579 + }, + { + "epoch": 0.0919243614001133, + "grad_norm": 0.90234375, + "learning_rate": 0.00019757617739647808, + "loss": 1.1918, + "step": 3580 + }, + { + "epoch": 0.09195003859603512, + "grad_norm": 0.8984375, + "learning_rate": 0.00019757520037359875, + "loss": 1.3057, + "step": 3581 + }, + { + "epoch": 0.09197571579195694, + "grad_norm": 0.859375, + "learning_rate": 0.00019757422315626092, + "loss": 1.2066, + "step": 3582 + }, + { + "epoch": 0.09200139298787875, + "grad_norm": 0.8671875, + "learning_rate": 0.00019757324574446643, + "loss": 1.2865, + "step": 3583 + }, + { + "epoch": 0.09202707018380057, + "grad_norm": 0.83984375, + "learning_rate": 0.00019757226813821728, + "loss": 1.2038, + "step": 3584 + }, + { + "epoch": 0.0920527473797224, + "grad_norm": 0.93359375, + "learning_rate": 0.00019757129033751543, + "loss": 1.1713, + "step": 3585 + }, + { + "epoch": 0.09207842457564422, + "grad_norm": 0.8359375, + "learning_rate": 0.0001975703123423628, + "loss": 1.2662, + "step": 3586 + }, + { + "epoch": 0.09210410177156604, + "grad_norm": 0.9609375, + "learning_rate": 0.00019756933415276135, + "loss": 1.0773, + "step": 3587 + }, + { + "epoch": 0.09212977896748785, + "grad_norm": 0.96484375, + "learning_rate": 0.00019756835576871304, + "loss": 1.309, + "step": 3588 + }, + { + "epoch": 0.09215545616340967, + "grad_norm": 0.84375, + "learning_rate": 0.00019756737719021981, + "loss": 1.1691, + "step": 3589 + }, + { + "epoch": 0.0921811333593315, + "grad_norm": 0.85546875, + "learning_rate": 0.0001975663984172836, + "loss": 1.1459, + "step": 3590 + }, + { + "epoch": 0.09220681055525332, + "grad_norm": 0.90625, + "learning_rate": 0.00019756541944990642, + "loss": 1.1238, + "step": 3591 + }, + { + "epoch": 0.09223248775117514, + "grad_norm": 0.87109375, + "learning_rate": 0.00019756444028809012, + "loss": 1.2424, + "step": 3592 + }, + { + "epoch": 0.09225816494709695, + "grad_norm": 0.86328125, + "learning_rate": 0.00019756346093183672, + "loss": 1.1603, + "step": 3593 + }, + { + "epoch": 0.09228384214301877, + "grad_norm": 0.84375, + "learning_rate": 0.00019756248138114814, + "loss": 0.9949, + "step": 3594 + }, + { + "epoch": 0.09230951933894059, + "grad_norm": 0.8359375, + "learning_rate": 0.00019756150163602635, + "loss": 1.1268, + "step": 3595 + }, + { + "epoch": 0.09233519653486241, + "grad_norm": 1.078125, + "learning_rate": 0.00019756052169647332, + "loss": 1.2449, + "step": 3596 + }, + { + "epoch": 0.09236087373078423, + "grad_norm": 0.9296875, + "learning_rate": 0.00019755954156249093, + "loss": 1.1496, + "step": 3597 + }, + { + "epoch": 0.09238655092670604, + "grad_norm": 0.77734375, + "learning_rate": 0.00019755856123408126, + "loss": 1.2095, + "step": 3598 + }, + { + "epoch": 0.09241222812262787, + "grad_norm": 1.09375, + "learning_rate": 0.00019755758071124616, + "loss": 1.169, + "step": 3599 + }, + { + "epoch": 0.09243790531854969, + "grad_norm": 0.9375, + "learning_rate": 0.00019755659999398762, + "loss": 1.4576, + "step": 3600 + }, + { + "epoch": 0.09246358251447151, + "grad_norm": 0.921875, + "learning_rate": 0.00019755561908230758, + "loss": 1.1245, + "step": 3601 + }, + { + "epoch": 0.09248925971039333, + "grad_norm": 0.83984375, + "learning_rate": 0.00019755463797620798, + "loss": 1.2389, + "step": 3602 + }, + { + "epoch": 0.09251493690631514, + "grad_norm": 0.90625, + "learning_rate": 0.00019755365667569085, + "loss": 1.2031, + "step": 3603 + }, + { + "epoch": 0.09254061410223696, + "grad_norm": 0.94921875, + "learning_rate": 0.00019755267518075807, + "loss": 1.1247, + "step": 3604 + }, + { + "epoch": 0.09256629129815878, + "grad_norm": 0.84375, + "learning_rate": 0.0001975516934914116, + "loss": 1.1493, + "step": 3605 + }, + { + "epoch": 0.0925919684940806, + "grad_norm": 1.1640625, + "learning_rate": 0.00019755071160765342, + "loss": 1.1034, + "step": 3606 + }, + { + "epoch": 0.09261764569000243, + "grad_norm": 0.80078125, + "learning_rate": 0.00019754972952948553, + "loss": 1.19, + "step": 3607 + }, + { + "epoch": 0.09264332288592424, + "grad_norm": 0.93359375, + "learning_rate": 0.0001975487472569098, + "loss": 1.1307, + "step": 3608 + }, + { + "epoch": 0.09266900008184606, + "grad_norm": 0.9140625, + "learning_rate": 0.0001975477647899282, + "loss": 1.1305, + "step": 3609 + }, + { + "epoch": 0.09269467727776788, + "grad_norm": 0.8125, + "learning_rate": 0.00019754678212854275, + "loss": 1.283, + "step": 3610 + }, + { + "epoch": 0.0927203544736897, + "grad_norm": 0.953125, + "learning_rate": 0.0001975457992727554, + "loss": 1.1449, + "step": 3611 + }, + { + "epoch": 0.09274603166961153, + "grad_norm": 0.89453125, + "learning_rate": 0.00019754481622256807, + "loss": 1.1921, + "step": 3612 + }, + { + "epoch": 0.09277170886553333, + "grad_norm": 0.90625, + "learning_rate": 0.0001975438329779827, + "loss": 1.3327, + "step": 3613 + }, + { + "epoch": 0.09279738606145516, + "grad_norm": 0.859375, + "learning_rate": 0.0001975428495390013, + "loss": 1.08, + "step": 3614 + }, + { + "epoch": 0.09282306325737698, + "grad_norm": 0.77734375, + "learning_rate": 0.00019754186590562582, + "loss": 1.114, + "step": 3615 + }, + { + "epoch": 0.0928487404532988, + "grad_norm": 0.875, + "learning_rate": 0.0001975408820778582, + "loss": 1.3341, + "step": 3616 + }, + { + "epoch": 0.09287441764922062, + "grad_norm": 1.046875, + "learning_rate": 0.00019753989805570044, + "loss": 1.3621, + "step": 3617 + }, + { + "epoch": 0.09290009484514243, + "grad_norm": 1.0546875, + "learning_rate": 0.00019753891383915445, + "loss": 1.1447, + "step": 3618 + }, + { + "epoch": 0.09292577204106425, + "grad_norm": 0.82421875, + "learning_rate": 0.0001975379294282222, + "loss": 1.0748, + "step": 3619 + }, + { + "epoch": 0.09295144923698608, + "grad_norm": 0.86328125, + "learning_rate": 0.0001975369448229057, + "loss": 1.1988, + "step": 3620 + }, + { + "epoch": 0.0929771264329079, + "grad_norm": 0.91796875, + "learning_rate": 0.00019753596002320686, + "loss": 1.289, + "step": 3621 + }, + { + "epoch": 0.09300280362882972, + "grad_norm": 0.83203125, + "learning_rate": 0.0001975349750291277, + "loss": 1.2003, + "step": 3622 + }, + { + "epoch": 0.09302848082475153, + "grad_norm": 0.85546875, + "learning_rate": 0.00019753398984067008, + "loss": 1.266, + "step": 3623 + }, + { + "epoch": 0.09305415802067335, + "grad_norm": 0.93359375, + "learning_rate": 0.00019753300445783607, + "loss": 1.1414, + "step": 3624 + }, + { + "epoch": 0.09307983521659517, + "grad_norm": 0.98828125, + "learning_rate": 0.00019753201888062758, + "loss": 1.3385, + "step": 3625 + }, + { + "epoch": 0.093105512412517, + "grad_norm": 0.87109375, + "learning_rate": 0.0001975310331090466, + "loss": 1.139, + "step": 3626 + }, + { + "epoch": 0.09313118960843882, + "grad_norm": 0.91015625, + "learning_rate": 0.00019753004714309505, + "loss": 1.1768, + "step": 3627 + }, + { + "epoch": 0.09315686680436062, + "grad_norm": 0.86328125, + "learning_rate": 0.00019752906098277495, + "loss": 1.1279, + "step": 3628 + }, + { + "epoch": 0.09318254400028245, + "grad_norm": 0.859375, + "learning_rate": 0.00019752807462808825, + "loss": 1.1893, + "step": 3629 + }, + { + "epoch": 0.09320822119620427, + "grad_norm": 0.859375, + "learning_rate": 0.00019752708807903687, + "loss": 1.1989, + "step": 3630 + }, + { + "epoch": 0.09323389839212609, + "grad_norm": 0.87109375, + "learning_rate": 0.00019752610133562287, + "loss": 1.2225, + "step": 3631 + }, + { + "epoch": 0.09325957558804791, + "grad_norm": 0.875, + "learning_rate": 0.00019752511439784815, + "loss": 1.223, + "step": 3632 + }, + { + "epoch": 0.09328525278396972, + "grad_norm": 0.89453125, + "learning_rate": 0.00019752412726571462, + "loss": 1.1011, + "step": 3633 + }, + { + "epoch": 0.09331092997989154, + "grad_norm": 0.82421875, + "learning_rate": 0.00019752313993922437, + "loss": 1.1251, + "step": 3634 + }, + { + "epoch": 0.09333660717581337, + "grad_norm": 0.8203125, + "learning_rate": 0.0001975221524183793, + "loss": 1.1548, + "step": 3635 + }, + { + "epoch": 0.09336228437173519, + "grad_norm": 0.8203125, + "learning_rate": 0.00019752116470318139, + "loss": 1.2051, + "step": 3636 + }, + { + "epoch": 0.09338796156765701, + "grad_norm": 0.8828125, + "learning_rate": 0.0001975201767936326, + "loss": 1.2406, + "step": 3637 + }, + { + "epoch": 0.09341363876357882, + "grad_norm": 0.8671875, + "learning_rate": 0.0001975191886897349, + "loss": 1.1784, + "step": 3638 + }, + { + "epoch": 0.09343931595950064, + "grad_norm": 0.90234375, + "learning_rate": 0.00019751820039149028, + "loss": 1.066, + "step": 3639 + }, + { + "epoch": 0.09346499315542246, + "grad_norm": 0.9921875, + "learning_rate": 0.00019751721189890072, + "loss": 1.1528, + "step": 3640 + }, + { + "epoch": 0.09349067035134428, + "grad_norm": 0.94921875, + "learning_rate": 0.00019751622321196815, + "loss": 1.2577, + "step": 3641 + }, + { + "epoch": 0.09351634754726611, + "grad_norm": 0.83203125, + "learning_rate": 0.00019751523433069452, + "loss": 1.0231, + "step": 3642 + }, + { + "epoch": 0.09354202474318792, + "grad_norm": 0.94921875, + "learning_rate": 0.00019751424525508187, + "loss": 1.0922, + "step": 3643 + }, + { + "epoch": 0.09356770193910974, + "grad_norm": 0.828125, + "learning_rate": 0.00019751325598513212, + "loss": 1.1894, + "step": 3644 + }, + { + "epoch": 0.09359337913503156, + "grad_norm": 0.91796875, + "learning_rate": 0.00019751226652084728, + "loss": 1.2717, + "step": 3645 + }, + { + "epoch": 0.09361905633095338, + "grad_norm": 0.90625, + "learning_rate": 0.00019751127686222928, + "loss": 1.2493, + "step": 3646 + }, + { + "epoch": 0.0936447335268752, + "grad_norm": 0.9375, + "learning_rate": 0.00019751028700928014, + "loss": 1.2309, + "step": 3647 + }, + { + "epoch": 0.09367041072279701, + "grad_norm": 0.8828125, + "learning_rate": 0.00019750929696200176, + "loss": 1.104, + "step": 3648 + }, + { + "epoch": 0.09369608791871883, + "grad_norm": 0.90625, + "learning_rate": 0.00019750830672039623, + "loss": 1.1416, + "step": 3649 + }, + { + "epoch": 0.09372176511464066, + "grad_norm": 0.84765625, + "learning_rate": 0.0001975073162844654, + "loss": 1.0922, + "step": 3650 + }, + { + "epoch": 0.09374744231056248, + "grad_norm": 0.97265625, + "learning_rate": 0.0001975063256542113, + "loss": 1.2211, + "step": 3651 + }, + { + "epoch": 0.0937731195064843, + "grad_norm": 0.90234375, + "learning_rate": 0.00019750533482963591, + "loss": 1.2065, + "step": 3652 + }, + { + "epoch": 0.09379879670240611, + "grad_norm": 0.86328125, + "learning_rate": 0.0001975043438107412, + "loss": 1.109, + "step": 3653 + }, + { + "epoch": 0.09382447389832793, + "grad_norm": 0.90234375, + "learning_rate": 0.00019750335259752913, + "loss": 1.2152, + "step": 3654 + }, + { + "epoch": 0.09385015109424975, + "grad_norm": 0.91015625, + "learning_rate": 0.0001975023611900017, + "loss": 1.159, + "step": 3655 + }, + { + "epoch": 0.09387582829017158, + "grad_norm": 0.9296875, + "learning_rate": 0.00019750136958816085, + "loss": 1.2625, + "step": 3656 + }, + { + "epoch": 0.0939015054860934, + "grad_norm": 0.91796875, + "learning_rate": 0.0001975003777920086, + "loss": 1.2882, + "step": 3657 + }, + { + "epoch": 0.0939271826820152, + "grad_norm": 0.80859375, + "learning_rate": 0.00019749938580154687, + "loss": 1.0964, + "step": 3658 + }, + { + "epoch": 0.09395285987793703, + "grad_norm": 0.875, + "learning_rate": 0.0001974983936167777, + "loss": 1.3513, + "step": 3659 + }, + { + "epoch": 0.09397853707385885, + "grad_norm": 0.89453125, + "learning_rate": 0.00019749740123770304, + "loss": 1.2628, + "step": 3660 + }, + { + "epoch": 0.09400421426978067, + "grad_norm": 0.93359375, + "learning_rate": 0.00019749640866432487, + "loss": 1.1465, + "step": 3661 + }, + { + "epoch": 0.0940298914657025, + "grad_norm": 0.85546875, + "learning_rate": 0.00019749541589664514, + "loss": 1.2172, + "step": 3662 + }, + { + "epoch": 0.0940555686616243, + "grad_norm": 0.81640625, + "learning_rate": 0.00019749442293466588, + "loss": 1.1048, + "step": 3663 + }, + { + "epoch": 0.09408124585754613, + "grad_norm": 0.89453125, + "learning_rate": 0.00019749342977838904, + "loss": 1.1641, + "step": 3664 + }, + { + "epoch": 0.09410692305346795, + "grad_norm": 0.94140625, + "learning_rate": 0.00019749243642781656, + "loss": 1.1637, + "step": 3665 + }, + { + "epoch": 0.09413260024938977, + "grad_norm": 0.89453125, + "learning_rate": 0.0001974914428829505, + "loss": 1.2744, + "step": 3666 + }, + { + "epoch": 0.09415827744531158, + "grad_norm": 0.87109375, + "learning_rate": 0.00019749044914379278, + "loss": 1.1789, + "step": 3667 + }, + { + "epoch": 0.0941839546412334, + "grad_norm": 0.859375, + "learning_rate": 0.0001974894552103454, + "loss": 1.18, + "step": 3668 + }, + { + "epoch": 0.09420963183715522, + "grad_norm": 0.9921875, + "learning_rate": 0.00019748846108261036, + "loss": 1.139, + "step": 3669 + }, + { + "epoch": 0.09423530903307704, + "grad_norm": 0.95703125, + "learning_rate": 0.0001974874667605896, + "loss": 1.1923, + "step": 3670 + }, + { + "epoch": 0.09426098622899887, + "grad_norm": 0.89453125, + "learning_rate": 0.00019748647224428515, + "loss": 1.2413, + "step": 3671 + }, + { + "epoch": 0.09428666342492067, + "grad_norm": 0.859375, + "learning_rate": 0.00019748547753369898, + "loss": 1.1761, + "step": 3672 + }, + { + "epoch": 0.0943123406208425, + "grad_norm": 0.77734375, + "learning_rate": 0.00019748448262883303, + "loss": 1.1459, + "step": 3673 + }, + { + "epoch": 0.09433801781676432, + "grad_norm": 0.90625, + "learning_rate": 0.00019748348752968931, + "loss": 1.3845, + "step": 3674 + }, + { + "epoch": 0.09436369501268614, + "grad_norm": 0.890625, + "learning_rate": 0.00019748249223626983, + "loss": 1.2405, + "step": 3675 + }, + { + "epoch": 0.09438937220860796, + "grad_norm": 0.88671875, + "learning_rate": 0.00019748149674857652, + "loss": 1.2064, + "step": 3676 + }, + { + "epoch": 0.09441504940452977, + "grad_norm": 0.86328125, + "learning_rate": 0.00019748050106661142, + "loss": 1.1008, + "step": 3677 + }, + { + "epoch": 0.0944407266004516, + "grad_norm": 0.87109375, + "learning_rate": 0.00019747950519037645, + "loss": 1.2904, + "step": 3678 + }, + { + "epoch": 0.09446640379637342, + "grad_norm": 0.9140625, + "learning_rate": 0.00019747850911987366, + "loss": 1.174, + "step": 3679 + }, + { + "epoch": 0.09449208099229524, + "grad_norm": 0.86328125, + "learning_rate": 0.000197477512855105, + "loss": 1.1252, + "step": 3680 + }, + { + "epoch": 0.09451775818821706, + "grad_norm": 0.8671875, + "learning_rate": 0.00019747651639607247, + "loss": 1.2278, + "step": 3681 + }, + { + "epoch": 0.09454343538413887, + "grad_norm": 0.8671875, + "learning_rate": 0.00019747551974277805, + "loss": 1.2764, + "step": 3682 + }, + { + "epoch": 0.09456911258006069, + "grad_norm": 0.9375, + "learning_rate": 0.0001974745228952237, + "loss": 1.0539, + "step": 3683 + }, + { + "epoch": 0.09459478977598251, + "grad_norm": 0.93359375, + "learning_rate": 0.00019747352585341145, + "loss": 1.2078, + "step": 3684 + }, + { + "epoch": 0.09462046697190434, + "grad_norm": 0.88671875, + "learning_rate": 0.00019747252861734325, + "loss": 1.1393, + "step": 3685 + }, + { + "epoch": 0.09464614416782616, + "grad_norm": 0.8828125, + "learning_rate": 0.00019747153118702113, + "loss": 1.2904, + "step": 3686 + }, + { + "epoch": 0.09467182136374797, + "grad_norm": 0.88671875, + "learning_rate": 0.00019747053356244704, + "loss": 1.1368, + "step": 3687 + }, + { + "epoch": 0.09469749855966979, + "grad_norm": 0.84375, + "learning_rate": 0.00019746953574362297, + "loss": 1.1926, + "step": 3688 + }, + { + "epoch": 0.09472317575559161, + "grad_norm": 0.83203125, + "learning_rate": 0.0001974685377305509, + "loss": 1.0769, + "step": 3689 + }, + { + "epoch": 0.09474885295151343, + "grad_norm": 0.8515625, + "learning_rate": 0.00019746753952323292, + "loss": 1.1879, + "step": 3690 + }, + { + "epoch": 0.09477453014743525, + "grad_norm": 0.9140625, + "learning_rate": 0.00019746654112167086, + "loss": 1.3894, + "step": 3691 + }, + { + "epoch": 0.09480020734335706, + "grad_norm": 0.92578125, + "learning_rate": 0.0001974655425258668, + "loss": 1.1965, + "step": 3692 + }, + { + "epoch": 0.09482588453927888, + "grad_norm": 0.89453125, + "learning_rate": 0.00019746454373582278, + "loss": 1.2488, + "step": 3693 + }, + { + "epoch": 0.0948515617352007, + "grad_norm": 0.84765625, + "learning_rate": 0.00019746354475154066, + "loss": 1.2538, + "step": 3694 + }, + { + "epoch": 0.09487723893112253, + "grad_norm": 0.82421875, + "learning_rate": 0.00019746254557302252, + "loss": 1.0912, + "step": 3695 + }, + { + "epoch": 0.09490291612704435, + "grad_norm": 0.95703125, + "learning_rate": 0.00019746154620027033, + "loss": 1.1559, + "step": 3696 + }, + { + "epoch": 0.09492859332296616, + "grad_norm": 0.91015625, + "learning_rate": 0.00019746054663328608, + "loss": 1.116, + "step": 3697 + }, + { + "epoch": 0.09495427051888798, + "grad_norm": 0.83203125, + "learning_rate": 0.00019745954687207178, + "loss": 1.2935, + "step": 3698 + }, + { + "epoch": 0.0949799477148098, + "grad_norm": 0.94140625, + "learning_rate": 0.00019745854691662938, + "loss": 1.3225, + "step": 3699 + }, + { + "epoch": 0.09500562491073163, + "grad_norm": 0.84375, + "learning_rate": 0.00019745754676696092, + "loss": 1.2331, + "step": 3700 + }, + { + "epoch": 0.09503130210665345, + "grad_norm": 0.82421875, + "learning_rate": 0.00019745654642306835, + "loss": 1.1397, + "step": 3701 + }, + { + "epoch": 0.09505697930257526, + "grad_norm": 1.0859375, + "learning_rate": 0.00019745554588495372, + "loss": 1.1509, + "step": 3702 + }, + { + "epoch": 0.09508265649849708, + "grad_norm": 0.86328125, + "learning_rate": 0.00019745454515261897, + "loss": 1.3661, + "step": 3703 + }, + { + "epoch": 0.0951083336944189, + "grad_norm": 0.83984375, + "learning_rate": 0.0001974535442260661, + "loss": 1.2357, + "step": 3704 + }, + { + "epoch": 0.09513401089034072, + "grad_norm": 0.875, + "learning_rate": 0.00019745254310529715, + "loss": 1.2762, + "step": 3705 + }, + { + "epoch": 0.09515968808626254, + "grad_norm": 0.88671875, + "learning_rate": 0.00019745154179031407, + "loss": 1.2749, + "step": 3706 + }, + { + "epoch": 0.09518536528218435, + "grad_norm": 0.85546875, + "learning_rate": 0.00019745054028111886, + "loss": 1.1469, + "step": 3707 + }, + { + "epoch": 0.09521104247810618, + "grad_norm": 0.87890625, + "learning_rate": 0.00019744953857771353, + "loss": 1.2373, + "step": 3708 + }, + { + "epoch": 0.095236719674028, + "grad_norm": 0.8828125, + "learning_rate": 0.00019744853668010007, + "loss": 1.3669, + "step": 3709 + }, + { + "epoch": 0.09526239686994982, + "grad_norm": 0.83984375, + "learning_rate": 0.00019744753458828053, + "loss": 1.0996, + "step": 3710 + }, + { + "epoch": 0.09528807406587164, + "grad_norm": 0.82421875, + "learning_rate": 0.0001974465323022568, + "loss": 1.1206, + "step": 3711 + }, + { + "epoch": 0.09531375126179345, + "grad_norm": 0.859375, + "learning_rate": 0.00019744552982203092, + "loss": 1.3149, + "step": 3712 + }, + { + "epoch": 0.09533942845771527, + "grad_norm": 0.88671875, + "learning_rate": 0.00019744452714760496, + "loss": 1.3466, + "step": 3713 + }, + { + "epoch": 0.0953651056536371, + "grad_norm": 0.828125, + "learning_rate": 0.0001974435242789808, + "loss": 1.1655, + "step": 3714 + }, + { + "epoch": 0.09539078284955892, + "grad_norm": 0.7265625, + "learning_rate": 0.00019744252121616055, + "loss": 1.1534, + "step": 3715 + }, + { + "epoch": 0.09541646004548074, + "grad_norm": 0.875, + "learning_rate": 0.0001974415179591461, + "loss": 1.157, + "step": 3716 + }, + { + "epoch": 0.09544213724140255, + "grad_norm": 0.984375, + "learning_rate": 0.00019744051450793957, + "loss": 1.1664, + "step": 3717 + }, + { + "epoch": 0.09546781443732437, + "grad_norm": 0.80859375, + "learning_rate": 0.00019743951086254285, + "loss": 0.9518, + "step": 3718 + }, + { + "epoch": 0.09549349163324619, + "grad_norm": 0.92578125, + "learning_rate": 0.000197438507022958, + "loss": 1.3556, + "step": 3719 + }, + { + "epoch": 0.09551916882916801, + "grad_norm": 0.8515625, + "learning_rate": 0.00019743750298918702, + "loss": 1.1636, + "step": 3720 + }, + { + "epoch": 0.09554484602508984, + "grad_norm": 0.859375, + "learning_rate": 0.00019743649876123186, + "loss": 1.3551, + "step": 3721 + }, + { + "epoch": 0.09557052322101164, + "grad_norm": 0.94140625, + "learning_rate": 0.0001974354943390946, + "loss": 1.3426, + "step": 3722 + }, + { + "epoch": 0.09559620041693347, + "grad_norm": 0.9375, + "learning_rate": 0.0001974344897227772, + "loss": 1.1872, + "step": 3723 + }, + { + "epoch": 0.09562187761285529, + "grad_norm": 0.7734375, + "learning_rate": 0.00019743348491228164, + "loss": 1.1276, + "step": 3724 + }, + { + "epoch": 0.09564755480877711, + "grad_norm": 0.890625, + "learning_rate": 0.00019743247990760995, + "loss": 1.0758, + "step": 3725 + }, + { + "epoch": 0.09567323200469893, + "grad_norm": 0.92578125, + "learning_rate": 0.00019743147470876413, + "loss": 1.0971, + "step": 3726 + }, + { + "epoch": 0.09569890920062074, + "grad_norm": 0.8515625, + "learning_rate": 0.0001974304693157462, + "loss": 1.2424, + "step": 3727 + }, + { + "epoch": 0.09572458639654256, + "grad_norm": 0.92578125, + "learning_rate": 0.00019742946372855813, + "loss": 1.1445, + "step": 3728 + }, + { + "epoch": 0.09575026359246439, + "grad_norm": 0.84375, + "learning_rate": 0.00019742845794720192, + "loss": 1.2338, + "step": 3729 + }, + { + "epoch": 0.09577594078838621, + "grad_norm": 0.890625, + "learning_rate": 0.0001974274519716796, + "loss": 1.2138, + "step": 3730 + }, + { + "epoch": 0.09580161798430803, + "grad_norm": 0.875, + "learning_rate": 0.0001974264458019932, + "loss": 1.1351, + "step": 3731 + }, + { + "epoch": 0.09582729518022984, + "grad_norm": 0.91015625, + "learning_rate": 0.0001974254394381447, + "loss": 1.0728, + "step": 3732 + }, + { + "epoch": 0.09585297237615166, + "grad_norm": 0.87109375, + "learning_rate": 0.0001974244328801361, + "loss": 1.357, + "step": 3733 + }, + { + "epoch": 0.09587864957207348, + "grad_norm": 0.90234375, + "learning_rate": 0.00019742342612796935, + "loss": 1.2212, + "step": 3734 + }, + { + "epoch": 0.0959043267679953, + "grad_norm": 0.79296875, + "learning_rate": 0.00019742241918164656, + "loss": 1.0806, + "step": 3735 + }, + { + "epoch": 0.09593000396391713, + "grad_norm": 0.90625, + "learning_rate": 0.00019742141204116966, + "loss": 1.2332, + "step": 3736 + }, + { + "epoch": 0.09595568115983893, + "grad_norm": 0.90234375, + "learning_rate": 0.0001974204047065407, + "loss": 1.1527, + "step": 3737 + }, + { + "epoch": 0.09598135835576076, + "grad_norm": 0.875, + "learning_rate": 0.00019741939717776168, + "loss": 1.1615, + "step": 3738 + }, + { + "epoch": 0.09600703555168258, + "grad_norm": 0.84765625, + "learning_rate": 0.0001974183894548346, + "loss": 1.3667, + "step": 3739 + }, + { + "epoch": 0.0960327127476044, + "grad_norm": 0.91015625, + "learning_rate": 0.00019741738153776145, + "loss": 1.2184, + "step": 3740 + }, + { + "epoch": 0.09605838994352622, + "grad_norm": 0.89453125, + "learning_rate": 0.00019741637342654428, + "loss": 1.1657, + "step": 3741 + }, + { + "epoch": 0.09608406713944803, + "grad_norm": 0.828125, + "learning_rate": 0.00019741536512118507, + "loss": 1.0794, + "step": 3742 + }, + { + "epoch": 0.09610974433536985, + "grad_norm": 0.7734375, + "learning_rate": 0.00019741435662168585, + "loss": 1.1298, + "step": 3743 + }, + { + "epoch": 0.09613542153129168, + "grad_norm": 0.9765625, + "learning_rate": 0.0001974133479280486, + "loss": 1.2634, + "step": 3744 + }, + { + "epoch": 0.0961610987272135, + "grad_norm": 0.8125, + "learning_rate": 0.00019741233904027535, + "loss": 1.187, + "step": 3745 + }, + { + "epoch": 0.09618677592313532, + "grad_norm": 0.8125, + "learning_rate": 0.00019741132995836812, + "loss": 1.1484, + "step": 3746 + }, + { + "epoch": 0.09621245311905713, + "grad_norm": 0.828125, + "learning_rate": 0.00019741032068232889, + "loss": 1.1486, + "step": 3747 + }, + { + "epoch": 0.09623813031497895, + "grad_norm": 0.86328125, + "learning_rate": 0.00019740931121215967, + "loss": 1.2594, + "step": 3748 + }, + { + "epoch": 0.09626380751090077, + "grad_norm": 0.86328125, + "learning_rate": 0.00019740830154786254, + "loss": 1.1114, + "step": 3749 + }, + { + "epoch": 0.0962894847068226, + "grad_norm": 0.92578125, + "learning_rate": 0.00019740729168943942, + "loss": 0.9505, + "step": 3750 + }, + { + "epoch": 0.09631516190274442, + "grad_norm": 0.859375, + "learning_rate": 0.0001974062816368924, + "loss": 1.04, + "step": 3751 + }, + { + "epoch": 0.09634083909866623, + "grad_norm": 0.84765625, + "learning_rate": 0.00019740527139022347, + "loss": 1.2937, + "step": 3752 + }, + { + "epoch": 0.09636651629458805, + "grad_norm": 0.87890625, + "learning_rate": 0.0001974042609494346, + "loss": 1.1996, + "step": 3753 + }, + { + "epoch": 0.09639219349050987, + "grad_norm": 0.7890625, + "learning_rate": 0.00019740325031452783, + "loss": 1.0522, + "step": 3754 + }, + { + "epoch": 0.09641787068643169, + "grad_norm": 0.89453125, + "learning_rate": 0.0001974022394855052, + "loss": 1.2138, + "step": 3755 + }, + { + "epoch": 0.09644354788235351, + "grad_norm": 0.84375, + "learning_rate": 0.0001974012284623687, + "loss": 1.0155, + "step": 3756 + }, + { + "epoch": 0.09646922507827532, + "grad_norm": 0.8984375, + "learning_rate": 0.00019740021724512036, + "loss": 1.1274, + "step": 3757 + }, + { + "epoch": 0.09649490227419714, + "grad_norm": 0.90234375, + "learning_rate": 0.00019739920583376218, + "loss": 1.2105, + "step": 3758 + }, + { + "epoch": 0.09652057947011897, + "grad_norm": 0.859375, + "learning_rate": 0.00019739819422829618, + "loss": 1.2578, + "step": 3759 + }, + { + "epoch": 0.09654625666604079, + "grad_norm": 0.84375, + "learning_rate": 0.00019739718242872437, + "loss": 1.2383, + "step": 3760 + }, + { + "epoch": 0.09657193386196261, + "grad_norm": 0.984375, + "learning_rate": 0.0001973961704350488, + "loss": 1.5368, + "step": 3761 + }, + { + "epoch": 0.09659761105788442, + "grad_norm": 0.87109375, + "learning_rate": 0.0001973951582472714, + "loss": 1.1066, + "step": 3762 + }, + { + "epoch": 0.09662328825380624, + "grad_norm": 0.9375, + "learning_rate": 0.00019739414586539432, + "loss": 1.1013, + "step": 3763 + }, + { + "epoch": 0.09664896544972806, + "grad_norm": 0.84765625, + "learning_rate": 0.00019739313328941943, + "loss": 1.0953, + "step": 3764 + }, + { + "epoch": 0.09667464264564989, + "grad_norm": 0.85546875, + "learning_rate": 0.00019739212051934888, + "loss": 1.1213, + "step": 3765 + }, + { + "epoch": 0.09670031984157171, + "grad_norm": 0.94140625, + "learning_rate": 0.00019739110755518462, + "loss": 1.1256, + "step": 3766 + }, + { + "epoch": 0.09672599703749352, + "grad_norm": 0.90625, + "learning_rate": 0.00019739009439692867, + "loss": 1.1779, + "step": 3767 + }, + { + "epoch": 0.09675167423341534, + "grad_norm": 0.83203125, + "learning_rate": 0.00019738908104458306, + "loss": 1.0783, + "step": 3768 + }, + { + "epoch": 0.09677735142933716, + "grad_norm": 0.80078125, + "learning_rate": 0.0001973880674981498, + "loss": 1.1011, + "step": 3769 + }, + { + "epoch": 0.09680302862525898, + "grad_norm": 0.94921875, + "learning_rate": 0.00019738705375763095, + "loss": 1.2036, + "step": 3770 + }, + { + "epoch": 0.09682870582118079, + "grad_norm": 0.9296875, + "learning_rate": 0.00019738603982302847, + "loss": 1.2927, + "step": 3771 + }, + { + "epoch": 0.09685438301710261, + "grad_norm": 1.0234375, + "learning_rate": 0.00019738502569434442, + "loss": 1.4425, + "step": 3772 + }, + { + "epoch": 0.09688006021302444, + "grad_norm": 0.87890625, + "learning_rate": 0.00019738401137158083, + "loss": 1.2649, + "step": 3773 + }, + { + "epoch": 0.09690573740894626, + "grad_norm": 0.9453125, + "learning_rate": 0.0001973829968547397, + "loss": 1.1621, + "step": 3774 + }, + { + "epoch": 0.09693141460486808, + "grad_norm": 0.85546875, + "learning_rate": 0.00019738198214382303, + "loss": 0.9427, + "step": 3775 + }, + { + "epoch": 0.09695709180078989, + "grad_norm": 0.89453125, + "learning_rate": 0.00019738096723883285, + "loss": 1.1759, + "step": 3776 + }, + { + "epoch": 0.09698276899671171, + "grad_norm": 0.8828125, + "learning_rate": 0.00019737995213977124, + "loss": 1.1048, + "step": 3777 + }, + { + "epoch": 0.09700844619263353, + "grad_norm": 0.890625, + "learning_rate": 0.00019737893684664015, + "loss": 1.1817, + "step": 3778 + }, + { + "epoch": 0.09703412338855535, + "grad_norm": 0.9375, + "learning_rate": 0.00019737792135944164, + "loss": 1.2181, + "step": 3779 + }, + { + "epoch": 0.09705980058447718, + "grad_norm": 0.8515625, + "learning_rate": 0.00019737690567817775, + "loss": 1.1999, + "step": 3780 + }, + { + "epoch": 0.09708547778039898, + "grad_norm": 0.8671875, + "learning_rate": 0.00019737588980285047, + "loss": 1.1426, + "step": 3781 + }, + { + "epoch": 0.09711115497632081, + "grad_norm": 0.890625, + "learning_rate": 0.00019737487373346183, + "loss": 1.2667, + "step": 3782 + }, + { + "epoch": 0.09713683217224263, + "grad_norm": 1.609375, + "learning_rate": 0.00019737385747001387, + "loss": 1.0507, + "step": 3783 + }, + { + "epoch": 0.09716250936816445, + "grad_norm": 0.93359375, + "learning_rate": 0.0001973728410125086, + "loss": 1.2371, + "step": 3784 + }, + { + "epoch": 0.09718818656408627, + "grad_norm": 0.8359375, + "learning_rate": 0.00019737182436094805, + "loss": 1.1599, + "step": 3785 + }, + { + "epoch": 0.09721386376000808, + "grad_norm": 0.8515625, + "learning_rate": 0.00019737080751533427, + "loss": 1.3813, + "step": 3786 + }, + { + "epoch": 0.0972395409559299, + "grad_norm": 0.8671875, + "learning_rate": 0.00019736979047566924, + "loss": 1.3358, + "step": 3787 + }, + { + "epoch": 0.09726521815185173, + "grad_norm": 0.80859375, + "learning_rate": 0.00019736877324195504, + "loss": 1.264, + "step": 3788 + }, + { + "epoch": 0.09729089534777355, + "grad_norm": 0.82421875, + "learning_rate": 0.00019736775581419367, + "loss": 0.9728, + "step": 3789 + }, + { + "epoch": 0.09731657254369537, + "grad_norm": 0.80078125, + "learning_rate": 0.00019736673819238713, + "loss": 1.2481, + "step": 3790 + }, + { + "epoch": 0.09734224973961718, + "grad_norm": 0.9453125, + "learning_rate": 0.0001973657203765375, + "loss": 1.1691, + "step": 3791 + }, + { + "epoch": 0.097367926935539, + "grad_norm": 0.8203125, + "learning_rate": 0.00019736470236664678, + "loss": 1.3541, + "step": 3792 + }, + { + "epoch": 0.09739360413146082, + "grad_norm": 0.921875, + "learning_rate": 0.00019736368416271697, + "loss": 1.2615, + "step": 3793 + }, + { + "epoch": 0.09741928132738265, + "grad_norm": 0.80859375, + "learning_rate": 0.00019736266576475019, + "loss": 1.1612, + "step": 3794 + }, + { + "epoch": 0.09744495852330447, + "grad_norm": 0.8828125, + "learning_rate": 0.00019736164717274837, + "loss": 1.1731, + "step": 3795 + }, + { + "epoch": 0.09747063571922628, + "grad_norm": 0.8828125, + "learning_rate": 0.0001973606283867136, + "loss": 1.2255, + "step": 3796 + }, + { + "epoch": 0.0974963129151481, + "grad_norm": 0.8671875, + "learning_rate": 0.00019735960940664787, + "loss": 1.2039, + "step": 3797 + }, + { + "epoch": 0.09752199011106992, + "grad_norm": 0.859375, + "learning_rate": 0.00019735859023255328, + "loss": 1.0716, + "step": 3798 + }, + { + "epoch": 0.09754766730699174, + "grad_norm": 0.8203125, + "learning_rate": 0.0001973575708644318, + "loss": 1.2124, + "step": 3799 + }, + { + "epoch": 0.09757334450291356, + "grad_norm": 0.90625, + "learning_rate": 0.00019735655130228544, + "loss": 1.1593, + "step": 3800 + }, + { + "epoch": 0.09759902169883537, + "grad_norm": 0.86328125, + "learning_rate": 0.00019735553154611628, + "loss": 1.2432, + "step": 3801 + }, + { + "epoch": 0.0976246988947572, + "grad_norm": 0.84765625, + "learning_rate": 0.00019735451159592636, + "loss": 1.2401, + "step": 3802 + }, + { + "epoch": 0.09765037609067902, + "grad_norm": 0.8203125, + "learning_rate": 0.00019735349145171767, + "loss": 1.1835, + "step": 3803 + }, + { + "epoch": 0.09767605328660084, + "grad_norm": 0.9140625, + "learning_rate": 0.0001973524711134923, + "loss": 1.2016, + "step": 3804 + }, + { + "epoch": 0.09770173048252266, + "grad_norm": 0.94921875, + "learning_rate": 0.0001973514505812522, + "loss": 1.0517, + "step": 3805 + }, + { + "epoch": 0.09772740767844447, + "grad_norm": 0.921875, + "learning_rate": 0.00019735042985499947, + "loss": 1.2813, + "step": 3806 + }, + { + "epoch": 0.09775308487436629, + "grad_norm": 0.83203125, + "learning_rate": 0.00019734940893473615, + "loss": 1.055, + "step": 3807 + }, + { + "epoch": 0.09777876207028811, + "grad_norm": 0.8671875, + "learning_rate": 0.00019734838782046424, + "loss": 1.1992, + "step": 3808 + }, + { + "epoch": 0.09780443926620994, + "grad_norm": 0.91015625, + "learning_rate": 0.00019734736651218576, + "loss": 1.1972, + "step": 3809 + }, + { + "epoch": 0.09783011646213176, + "grad_norm": 0.87890625, + "learning_rate": 0.0001973463450099028, + "loss": 1.2179, + "step": 3810 + }, + { + "epoch": 0.09785579365805357, + "grad_norm": 0.828125, + "learning_rate": 0.00019734532331361736, + "loss": 0.8147, + "step": 3811 + }, + { + "epoch": 0.09788147085397539, + "grad_norm": 0.8828125, + "learning_rate": 0.00019734430142333148, + "loss": 1.1601, + "step": 3812 + }, + { + "epoch": 0.09790714804989721, + "grad_norm": 1.0625, + "learning_rate": 0.00019734327933904723, + "loss": 1.0455, + "step": 3813 + }, + { + "epoch": 0.09793282524581903, + "grad_norm": 1.3203125, + "learning_rate": 0.0001973422570607666, + "loss": 1.0758, + "step": 3814 + }, + { + "epoch": 0.09795850244174086, + "grad_norm": 0.94140625, + "learning_rate": 0.0001973412345884916, + "loss": 1.2291, + "step": 3815 + }, + { + "epoch": 0.09798417963766266, + "grad_norm": 0.87890625, + "learning_rate": 0.00019734021192222438, + "loss": 1.2439, + "step": 3816 + }, + { + "epoch": 0.09800985683358449, + "grad_norm": 0.84375, + "learning_rate": 0.00019733918906196686, + "loss": 1.2264, + "step": 3817 + }, + { + "epoch": 0.09803553402950631, + "grad_norm": 0.84375, + "learning_rate": 0.00019733816600772116, + "loss": 1.1595, + "step": 3818 + }, + { + "epoch": 0.09806121122542813, + "grad_norm": 0.7734375, + "learning_rate": 0.00019733714275948927, + "loss": 1.0938, + "step": 3819 + }, + { + "epoch": 0.09808688842134995, + "grad_norm": 0.859375, + "learning_rate": 0.00019733611931727326, + "loss": 1.1814, + "step": 3820 + }, + { + "epoch": 0.09811256561727176, + "grad_norm": 0.8515625, + "learning_rate": 0.00019733509568107516, + "loss": 1.2993, + "step": 3821 + }, + { + "epoch": 0.09813824281319358, + "grad_norm": 0.80078125, + "learning_rate": 0.000197334071850897, + "loss": 1.2435, + "step": 3822 + }, + { + "epoch": 0.0981639200091154, + "grad_norm": 0.90234375, + "learning_rate": 0.00019733304782674083, + "loss": 1.3087, + "step": 3823 + }, + { + "epoch": 0.09818959720503723, + "grad_norm": 0.859375, + "learning_rate": 0.00019733202360860865, + "loss": 1.1817, + "step": 3824 + }, + { + "epoch": 0.09821527440095905, + "grad_norm": 0.91015625, + "learning_rate": 0.00019733099919650257, + "loss": 1.2764, + "step": 3825 + }, + { + "epoch": 0.09824095159688086, + "grad_norm": 0.8125, + "learning_rate": 0.0001973299745904246, + "loss": 1.1193, + "step": 3826 + }, + { + "epoch": 0.09826662879280268, + "grad_norm": 0.8671875, + "learning_rate": 0.0001973289497903768, + "loss": 1.27, + "step": 3827 + }, + { + "epoch": 0.0982923059887245, + "grad_norm": 0.83984375, + "learning_rate": 0.00019732792479636117, + "loss": 1.2051, + "step": 3828 + }, + { + "epoch": 0.09831798318464632, + "grad_norm": 0.7890625, + "learning_rate": 0.0001973268996083798, + "loss": 1.2228, + "step": 3829 + }, + { + "epoch": 0.09834366038056815, + "grad_norm": 0.8828125, + "learning_rate": 0.0001973258742264347, + "loss": 1.1197, + "step": 3830 + }, + { + "epoch": 0.09836933757648995, + "grad_norm": 0.83984375, + "learning_rate": 0.0001973248486505279, + "loss": 1.0543, + "step": 3831 + }, + { + "epoch": 0.09839501477241178, + "grad_norm": 0.8515625, + "learning_rate": 0.0001973238228806615, + "loss": 1.2091, + "step": 3832 + }, + { + "epoch": 0.0984206919683336, + "grad_norm": 0.8671875, + "learning_rate": 0.0001973227969168375, + "loss": 1.1645, + "step": 3833 + }, + { + "epoch": 0.09844636916425542, + "grad_norm": 0.89453125, + "learning_rate": 0.00019732177075905795, + "loss": 1.2682, + "step": 3834 + }, + { + "epoch": 0.09847204636017724, + "grad_norm": 0.90234375, + "learning_rate": 0.00019732074440732491, + "loss": 1.1757, + "step": 3835 + }, + { + "epoch": 0.09849772355609905, + "grad_norm": 0.86328125, + "learning_rate": 0.00019731971786164044, + "loss": 1.0185, + "step": 3836 + }, + { + "epoch": 0.09852340075202087, + "grad_norm": 0.875, + "learning_rate": 0.0001973186911220065, + "loss": 1.2896, + "step": 3837 + }, + { + "epoch": 0.0985490779479427, + "grad_norm": 0.8671875, + "learning_rate": 0.00019731766418842526, + "loss": 1.03, + "step": 3838 + }, + { + "epoch": 0.09857475514386452, + "grad_norm": 0.82421875, + "learning_rate": 0.00019731663706089867, + "loss": 1.1162, + "step": 3839 + }, + { + "epoch": 0.09860043233978634, + "grad_norm": 0.97265625, + "learning_rate": 0.00019731560973942883, + "loss": 1.0856, + "step": 3840 + }, + { + "epoch": 0.09862610953570815, + "grad_norm": 0.87890625, + "learning_rate": 0.00019731458222401778, + "loss": 1.0444, + "step": 3841 + }, + { + "epoch": 0.09865178673162997, + "grad_norm": 0.89453125, + "learning_rate": 0.00019731355451466754, + "loss": 1.2106, + "step": 3842 + }, + { + "epoch": 0.09867746392755179, + "grad_norm": 0.84375, + "learning_rate": 0.00019731252661138018, + "loss": 1.0946, + "step": 3843 + }, + { + "epoch": 0.09870314112347361, + "grad_norm": 0.8515625, + "learning_rate": 0.00019731149851415775, + "loss": 1.0713, + "step": 3844 + }, + { + "epoch": 0.09872881831939544, + "grad_norm": 0.81640625, + "learning_rate": 0.0001973104702230023, + "loss": 1.0242, + "step": 3845 + }, + { + "epoch": 0.09875449551531724, + "grad_norm": 0.83203125, + "learning_rate": 0.00019730944173791586, + "loss": 1.2818, + "step": 3846 + }, + { + "epoch": 0.09878017271123907, + "grad_norm": 0.9375, + "learning_rate": 0.0001973084130589005, + "loss": 1.3578, + "step": 3847 + }, + { + "epoch": 0.09880584990716089, + "grad_norm": 0.92578125, + "learning_rate": 0.00019730738418595825, + "loss": 1.2066, + "step": 3848 + }, + { + "epoch": 0.09883152710308271, + "grad_norm": 0.91796875, + "learning_rate": 0.00019730635511909118, + "loss": 1.2333, + "step": 3849 + }, + { + "epoch": 0.09885720429900453, + "grad_norm": 0.88671875, + "learning_rate": 0.00019730532585830136, + "loss": 1.1222, + "step": 3850 + }, + { + "epoch": 0.09888288149492634, + "grad_norm": 0.9296875, + "learning_rate": 0.00019730429640359075, + "loss": 1.1469, + "step": 3851 + }, + { + "epoch": 0.09890855869084816, + "grad_norm": 0.8515625, + "learning_rate": 0.0001973032667549615, + "loss": 1.2401, + "step": 3852 + }, + { + "epoch": 0.09893423588676999, + "grad_norm": 0.8046875, + "learning_rate": 0.00019730223691241566, + "loss": 1.0836, + "step": 3853 + }, + { + "epoch": 0.09895991308269181, + "grad_norm": 0.83984375, + "learning_rate": 0.0001973012068759552, + "loss": 1.2489, + "step": 3854 + }, + { + "epoch": 0.09898559027861363, + "grad_norm": 0.8515625, + "learning_rate": 0.00019730017664558227, + "loss": 1.1381, + "step": 3855 + }, + { + "epoch": 0.09901126747453544, + "grad_norm": 0.8828125, + "learning_rate": 0.00019729914622129886, + "loss": 1.316, + "step": 3856 + }, + { + "epoch": 0.09903694467045726, + "grad_norm": 0.94921875, + "learning_rate": 0.00019729811560310702, + "loss": 1.2471, + "step": 3857 + }, + { + "epoch": 0.09906262186637908, + "grad_norm": 0.859375, + "learning_rate": 0.00019729708479100885, + "loss": 1.128, + "step": 3858 + }, + { + "epoch": 0.0990882990623009, + "grad_norm": 0.7890625, + "learning_rate": 0.00019729605378500639, + "loss": 1.1239, + "step": 3859 + }, + { + "epoch": 0.09911397625822273, + "grad_norm": 0.7734375, + "learning_rate": 0.00019729502258510165, + "loss": 1.1208, + "step": 3860 + }, + { + "epoch": 0.09913965345414454, + "grad_norm": 0.890625, + "learning_rate": 0.00019729399119129673, + "loss": 1.2815, + "step": 3861 + }, + { + "epoch": 0.09916533065006636, + "grad_norm": 0.77734375, + "learning_rate": 0.0001972929596035937, + "loss": 1.1725, + "step": 3862 + }, + { + "epoch": 0.09919100784598818, + "grad_norm": 0.875, + "learning_rate": 0.00019729192782199455, + "loss": 1.2149, + "step": 3863 + }, + { + "epoch": 0.09921668504191, + "grad_norm": 0.8203125, + "learning_rate": 0.0001972908958465014, + "loss": 1.1932, + "step": 3864 + }, + { + "epoch": 0.09924236223783182, + "grad_norm": 0.91015625, + "learning_rate": 0.00019728986367711628, + "loss": 1.2499, + "step": 3865 + }, + { + "epoch": 0.09926803943375363, + "grad_norm": 0.8359375, + "learning_rate": 0.00019728883131384125, + "loss": 1.1591, + "step": 3866 + }, + { + "epoch": 0.09929371662967545, + "grad_norm": 0.9140625, + "learning_rate": 0.00019728779875667835, + "loss": 1.1903, + "step": 3867 + }, + { + "epoch": 0.09931939382559728, + "grad_norm": 0.87109375, + "learning_rate": 0.00019728676600562966, + "loss": 1.1919, + "step": 3868 + }, + { + "epoch": 0.0993450710215191, + "grad_norm": 0.875, + "learning_rate": 0.00019728573306069727, + "loss": 1.0361, + "step": 3869 + }, + { + "epoch": 0.09937074821744092, + "grad_norm": 0.91015625, + "learning_rate": 0.00019728469992188316, + "loss": 1.2384, + "step": 3870 + }, + { + "epoch": 0.09939642541336273, + "grad_norm": 0.8828125, + "learning_rate": 0.00019728366658918947, + "loss": 1.0797, + "step": 3871 + }, + { + "epoch": 0.09942210260928455, + "grad_norm": 0.9140625, + "learning_rate": 0.00019728263306261818, + "loss": 1.1279, + "step": 3872 + }, + { + "epoch": 0.09944777980520637, + "grad_norm": 0.76953125, + "learning_rate": 0.0001972815993421714, + "loss": 1.1114, + "step": 3873 + }, + { + "epoch": 0.0994734570011282, + "grad_norm": 0.82421875, + "learning_rate": 0.0001972805654278512, + "loss": 1.1236, + "step": 3874 + }, + { + "epoch": 0.09949913419705, + "grad_norm": 0.92578125, + "learning_rate": 0.0001972795313196596, + "loss": 1.1101, + "step": 3875 + }, + { + "epoch": 0.09952481139297183, + "grad_norm": 0.828125, + "learning_rate": 0.00019727849701759866, + "loss": 1.1454, + "step": 3876 + }, + { + "epoch": 0.09955048858889365, + "grad_norm": 0.83984375, + "learning_rate": 0.0001972774625216705, + "loss": 1.1309, + "step": 3877 + }, + { + "epoch": 0.09957616578481547, + "grad_norm": 0.87890625, + "learning_rate": 0.00019727642783187713, + "loss": 1.2586, + "step": 3878 + }, + { + "epoch": 0.09960184298073729, + "grad_norm": 0.8125, + "learning_rate": 0.00019727539294822063, + "loss": 0.9967, + "step": 3879 + }, + { + "epoch": 0.0996275201766591, + "grad_norm": 0.8203125, + "learning_rate": 0.00019727435787070306, + "loss": 1.0989, + "step": 3880 + }, + { + "epoch": 0.09965319737258092, + "grad_norm": 0.9765625, + "learning_rate": 0.0001972733225993265, + "loss": 1.2495, + "step": 3881 + }, + { + "epoch": 0.09967887456850275, + "grad_norm": 0.89453125, + "learning_rate": 0.00019727228713409297, + "loss": 1.1493, + "step": 3882 + }, + { + "epoch": 0.09970455176442457, + "grad_norm": 0.921875, + "learning_rate": 0.00019727125147500456, + "loss": 1.0373, + "step": 3883 + }, + { + "epoch": 0.09973022896034639, + "grad_norm": 0.78125, + "learning_rate": 0.00019727021562206335, + "loss": 1.128, + "step": 3884 + }, + { + "epoch": 0.0997559061562682, + "grad_norm": 0.83203125, + "learning_rate": 0.00019726917957527134, + "loss": 1.1978, + "step": 3885 + }, + { + "epoch": 0.09978158335219002, + "grad_norm": 0.7734375, + "learning_rate": 0.00019726814333463069, + "loss": 1.2215, + "step": 3886 + }, + { + "epoch": 0.09980726054811184, + "grad_norm": 0.87109375, + "learning_rate": 0.0001972671069001434, + "loss": 1.1919, + "step": 3887 + }, + { + "epoch": 0.09983293774403366, + "grad_norm": 0.90625, + "learning_rate": 0.00019726607027181154, + "loss": 1.1808, + "step": 3888 + }, + { + "epoch": 0.09985861493995549, + "grad_norm": 0.8515625, + "learning_rate": 0.0001972650334496372, + "loss": 1.1526, + "step": 3889 + }, + { + "epoch": 0.0998842921358773, + "grad_norm": 0.84765625, + "learning_rate": 0.00019726399643362242, + "loss": 1.2486, + "step": 3890 + }, + { + "epoch": 0.09990996933179912, + "grad_norm": 0.8671875, + "learning_rate": 0.0001972629592237693, + "loss": 1.226, + "step": 3891 + }, + { + "epoch": 0.09993564652772094, + "grad_norm": 0.89453125, + "learning_rate": 0.00019726192182007988, + "loss": 1.1797, + "step": 3892 + }, + { + "epoch": 0.09996132372364276, + "grad_norm": 0.87890625, + "learning_rate": 0.00019726088422255625, + "loss": 1.195, + "step": 3893 + }, + { + "epoch": 0.09998700091956458, + "grad_norm": 0.86328125, + "learning_rate": 0.00019725984643120047, + "loss": 1.1244, + "step": 3894 + }, + { + "epoch": 0.10001267811548639, + "grad_norm": 0.859375, + "learning_rate": 0.00019725880844601457, + "loss": 1.2434, + "step": 3895 + }, + { + "epoch": 0.10003835531140821, + "grad_norm": 0.9140625, + "learning_rate": 0.00019725777026700065, + "loss": 1.2431, + "step": 3896 + }, + { + "epoch": 0.10006403250733004, + "grad_norm": 0.78515625, + "learning_rate": 0.00019725673189416082, + "loss": 1.1388, + "step": 3897 + }, + { + "epoch": 0.10008970970325186, + "grad_norm": 0.875, + "learning_rate": 0.00019725569332749708, + "loss": 1.1536, + "step": 3898 + }, + { + "epoch": 0.10011538689917368, + "grad_norm": 0.90234375, + "learning_rate": 0.00019725465456701152, + "loss": 1.1473, + "step": 3899 + }, + { + "epoch": 0.10014106409509549, + "grad_norm": 0.8671875, + "learning_rate": 0.00019725361561270623, + "loss": 1.2019, + "step": 3900 + }, + { + "epoch": 0.10016674129101731, + "grad_norm": 0.86328125, + "learning_rate": 0.00019725257646458326, + "loss": 1.1907, + "step": 3901 + }, + { + "epoch": 0.10019241848693913, + "grad_norm": 0.83203125, + "learning_rate": 0.00019725153712264471, + "loss": 1.2144, + "step": 3902 + }, + { + "epoch": 0.10021809568286096, + "grad_norm": 0.81640625, + "learning_rate": 0.0001972504975868926, + "loss": 1.0477, + "step": 3903 + }, + { + "epoch": 0.10024377287878278, + "grad_norm": 0.80859375, + "learning_rate": 0.00019724945785732904, + "loss": 1.1883, + "step": 3904 + }, + { + "epoch": 0.10026945007470459, + "grad_norm": 0.875, + "learning_rate": 0.00019724841793395613, + "loss": 1.1531, + "step": 3905 + }, + { + "epoch": 0.10029512727062641, + "grad_norm": 0.8515625, + "learning_rate": 0.00019724737781677586, + "loss": 1.2203, + "step": 3906 + }, + { + "epoch": 0.10032080446654823, + "grad_norm": 0.890625, + "learning_rate": 0.00019724633750579037, + "loss": 1.375, + "step": 3907 + }, + { + "epoch": 0.10034648166247005, + "grad_norm": 0.87109375, + "learning_rate": 0.00019724529700100173, + "loss": 1.2131, + "step": 3908 + }, + { + "epoch": 0.10037215885839187, + "grad_norm": 0.87890625, + "learning_rate": 0.00019724425630241198, + "loss": 1.4079, + "step": 3909 + }, + { + "epoch": 0.10039783605431368, + "grad_norm": 0.84375, + "learning_rate": 0.00019724321541002323, + "loss": 1.0882, + "step": 3910 + }, + { + "epoch": 0.1004235132502355, + "grad_norm": 0.81640625, + "learning_rate": 0.00019724217432383752, + "loss": 1.276, + "step": 3911 + }, + { + "epoch": 0.10044919044615733, + "grad_norm": 0.83984375, + "learning_rate": 0.00019724113304385693, + "loss": 0.9549, + "step": 3912 + }, + { + "epoch": 0.10047486764207915, + "grad_norm": 0.9765625, + "learning_rate": 0.00019724009157008358, + "loss": 1.3537, + "step": 3913 + }, + { + "epoch": 0.10050054483800097, + "grad_norm": 0.84375, + "learning_rate": 0.00019723904990251947, + "loss": 1.189, + "step": 3914 + }, + { + "epoch": 0.10052622203392278, + "grad_norm": 0.84765625, + "learning_rate": 0.00019723800804116674, + "loss": 1.2783, + "step": 3915 + }, + { + "epoch": 0.1005518992298446, + "grad_norm": 0.796875, + "learning_rate": 0.00019723696598602744, + "loss": 1.055, + "step": 3916 + }, + { + "epoch": 0.10057757642576642, + "grad_norm": 0.85546875, + "learning_rate": 0.00019723592373710363, + "loss": 1.1478, + "step": 3917 + }, + { + "epoch": 0.10060325362168825, + "grad_norm": 0.9296875, + "learning_rate": 0.00019723488129439742, + "loss": 1.2657, + "step": 3918 + }, + { + "epoch": 0.10062893081761007, + "grad_norm": 1.0390625, + "learning_rate": 0.00019723383865791089, + "loss": 1.0859, + "step": 3919 + }, + { + "epoch": 0.10065460801353188, + "grad_norm": 0.8515625, + "learning_rate": 0.0001972327958276461, + "loss": 1.0215, + "step": 3920 + }, + { + "epoch": 0.1006802852094537, + "grad_norm": 0.890625, + "learning_rate": 0.00019723175280360512, + "loss": 1.227, + "step": 3921 + }, + { + "epoch": 0.10070596240537552, + "grad_norm": 0.87890625, + "learning_rate": 0.00019723070958579006, + "loss": 1.3137, + "step": 3922 + }, + { + "epoch": 0.10073163960129734, + "grad_norm": 0.91015625, + "learning_rate": 0.00019722966617420295, + "loss": 1.2887, + "step": 3923 + }, + { + "epoch": 0.10075731679721917, + "grad_norm": 0.83984375, + "learning_rate": 0.0001972286225688459, + "loss": 1.159, + "step": 3924 + }, + { + "epoch": 0.10078299399314097, + "grad_norm": 0.82421875, + "learning_rate": 0.000197227578769721, + "loss": 1.181, + "step": 3925 + }, + { + "epoch": 0.1008086711890628, + "grad_norm": 0.87109375, + "learning_rate": 0.00019722653477683032, + "loss": 1.1311, + "step": 3926 + }, + { + "epoch": 0.10083434838498462, + "grad_norm": 0.87890625, + "learning_rate": 0.00019722549059017594, + "loss": 1.2102, + "step": 3927 + }, + { + "epoch": 0.10086002558090644, + "grad_norm": 0.85546875, + "learning_rate": 0.00019722444620975992, + "loss": 1.2588, + "step": 3928 + }, + { + "epoch": 0.10088570277682826, + "grad_norm": 0.875, + "learning_rate": 0.0001972234016355844, + "loss": 1.1197, + "step": 3929 + }, + { + "epoch": 0.10091137997275007, + "grad_norm": 0.8046875, + "learning_rate": 0.0001972223568676514, + "loss": 1.1064, + "step": 3930 + }, + { + "epoch": 0.10093705716867189, + "grad_norm": 0.93359375, + "learning_rate": 0.00019722131190596303, + "loss": 1.204, + "step": 3931 + }, + { + "epoch": 0.10096273436459371, + "grad_norm": 0.9375, + "learning_rate": 0.00019722026675052134, + "loss": 1.1626, + "step": 3932 + }, + { + "epoch": 0.10098841156051554, + "grad_norm": 0.8203125, + "learning_rate": 0.00019721922140132847, + "loss": 1.2264, + "step": 3933 + }, + { + "epoch": 0.10101408875643736, + "grad_norm": 0.8671875, + "learning_rate": 0.00019721817585838645, + "loss": 1.1217, + "step": 3934 + }, + { + "epoch": 0.10103976595235917, + "grad_norm": 0.9375, + "learning_rate": 0.0001972171301216974, + "loss": 1.1828, + "step": 3935 + }, + { + "epoch": 0.10106544314828099, + "grad_norm": 0.90625, + "learning_rate": 0.0001972160841912634, + "loss": 1.1752, + "step": 3936 + }, + { + "epoch": 0.10109112034420281, + "grad_norm": 0.9453125, + "learning_rate": 0.00019721503806708653, + "loss": 1.1304, + "step": 3937 + }, + { + "epoch": 0.10111679754012463, + "grad_norm": 0.921875, + "learning_rate": 0.00019721399174916883, + "loss": 1.1356, + "step": 3938 + }, + { + "epoch": 0.10114247473604646, + "grad_norm": 0.89453125, + "learning_rate": 0.00019721294523751247, + "loss": 1.1137, + "step": 3939 + }, + { + "epoch": 0.10116815193196826, + "grad_norm": 0.91796875, + "learning_rate": 0.00019721189853211948, + "loss": 1.1894, + "step": 3940 + }, + { + "epoch": 0.10119382912789009, + "grad_norm": 0.8671875, + "learning_rate": 0.00019721085163299196, + "loss": 1.0769, + "step": 3941 + }, + { + "epoch": 0.10121950632381191, + "grad_norm": 0.84375, + "learning_rate": 0.00019720980454013197, + "loss": 1.2087, + "step": 3942 + }, + { + "epoch": 0.10124518351973373, + "grad_norm": 0.82421875, + "learning_rate": 0.00019720875725354165, + "loss": 1.1389, + "step": 3943 + }, + { + "epoch": 0.10127086071565555, + "grad_norm": 0.7890625, + "learning_rate": 0.00019720770977322305, + "loss": 1.2012, + "step": 3944 + }, + { + "epoch": 0.10129653791157736, + "grad_norm": 0.92578125, + "learning_rate": 0.00019720666209917825, + "loss": 1.21, + "step": 3945 + }, + { + "epoch": 0.10132221510749918, + "grad_norm": 0.96875, + "learning_rate": 0.00019720561423140938, + "loss": 1.2259, + "step": 3946 + }, + { + "epoch": 0.101347892303421, + "grad_norm": 0.83984375, + "learning_rate": 0.00019720456616991846, + "loss": 1.1361, + "step": 3947 + }, + { + "epoch": 0.10137356949934283, + "grad_norm": 0.91796875, + "learning_rate": 0.00019720351791470766, + "loss": 1.2027, + "step": 3948 + }, + { + "epoch": 0.10139924669526465, + "grad_norm": 0.8203125, + "learning_rate": 0.000197202469465779, + "loss": 1.1332, + "step": 3949 + }, + { + "epoch": 0.10142492389118646, + "grad_norm": 2.078125, + "learning_rate": 0.00019720142082313462, + "loss": 1.1719, + "step": 3950 + }, + { + "epoch": 0.10145060108710828, + "grad_norm": 0.8828125, + "learning_rate": 0.00019720037198677658, + "loss": 1.063, + "step": 3951 + }, + { + "epoch": 0.1014762782830301, + "grad_norm": 0.9296875, + "learning_rate": 0.00019719932295670698, + "loss": 1.2618, + "step": 3952 + }, + { + "epoch": 0.10150195547895192, + "grad_norm": 0.84765625, + "learning_rate": 0.00019719827373292792, + "loss": 1.1946, + "step": 3953 + }, + { + "epoch": 0.10152763267487375, + "grad_norm": 0.8828125, + "learning_rate": 0.00019719722431544144, + "loss": 1.2925, + "step": 3954 + }, + { + "epoch": 0.10155330987079556, + "grad_norm": 0.90234375, + "learning_rate": 0.00019719617470424972, + "loss": 1.2858, + "step": 3955 + }, + { + "epoch": 0.10157898706671738, + "grad_norm": 0.9453125, + "learning_rate": 0.0001971951248993548, + "loss": 1.1032, + "step": 3956 + }, + { + "epoch": 0.1016046642626392, + "grad_norm": 0.90625, + "learning_rate": 0.00019719407490075873, + "loss": 1.1132, + "step": 3957 + }, + { + "epoch": 0.10163034145856102, + "grad_norm": 0.890625, + "learning_rate": 0.00019719302470846368, + "loss": 1.3238, + "step": 3958 + }, + { + "epoch": 0.10165601865448284, + "grad_norm": 0.94921875, + "learning_rate": 0.0001971919743224717, + "loss": 1.1035, + "step": 3959 + }, + { + "epoch": 0.10168169585040465, + "grad_norm": 0.8203125, + "learning_rate": 0.00019719092374278493, + "loss": 0.961, + "step": 3960 + }, + { + "epoch": 0.10170737304632647, + "grad_norm": 0.87109375, + "learning_rate": 0.0001971898729694054, + "loss": 1.142, + "step": 3961 + }, + { + "epoch": 0.1017330502422483, + "grad_norm": 0.8984375, + "learning_rate": 0.00019718882200233523, + "loss": 1.2882, + "step": 3962 + }, + { + "epoch": 0.10175872743817012, + "grad_norm": 0.9296875, + "learning_rate": 0.00019718777084157653, + "loss": 1.2722, + "step": 3963 + }, + { + "epoch": 0.10178440463409194, + "grad_norm": 0.87890625, + "learning_rate": 0.00019718671948713138, + "loss": 1.2335, + "step": 3964 + }, + { + "epoch": 0.10181008183001375, + "grad_norm": 0.8046875, + "learning_rate": 0.00019718566793900185, + "loss": 1.0988, + "step": 3965 + }, + { + "epoch": 0.10183575902593557, + "grad_norm": 0.8671875, + "learning_rate": 0.0001971846161971901, + "loss": 1.2057, + "step": 3966 + }, + { + "epoch": 0.1018614362218574, + "grad_norm": 0.8671875, + "learning_rate": 0.00019718356426169816, + "loss": 1.0664, + "step": 3967 + }, + { + "epoch": 0.10188711341777922, + "grad_norm": 0.80078125, + "learning_rate": 0.00019718251213252815, + "loss": 1.1221, + "step": 3968 + }, + { + "epoch": 0.10191279061370104, + "grad_norm": 0.84765625, + "learning_rate": 0.0001971814598096822, + "loss": 1.1503, + "step": 3969 + }, + { + "epoch": 0.10193846780962285, + "grad_norm": 0.81640625, + "learning_rate": 0.00019718040729316236, + "loss": 1.1884, + "step": 3970 + }, + { + "epoch": 0.10196414500554467, + "grad_norm": 0.859375, + "learning_rate": 0.00019717935458297078, + "loss": 1.1146, + "step": 3971 + }, + { + "epoch": 0.10198982220146649, + "grad_norm": 1.0234375, + "learning_rate": 0.00019717830167910946, + "loss": 1.0595, + "step": 3972 + }, + { + "epoch": 0.10201549939738831, + "grad_norm": 0.859375, + "learning_rate": 0.00019717724858158063, + "loss": 1.0063, + "step": 3973 + }, + { + "epoch": 0.10204117659331013, + "grad_norm": 0.921875, + "learning_rate": 0.0001971761952903863, + "loss": 1.1256, + "step": 3974 + }, + { + "epoch": 0.10206685378923194, + "grad_norm": 1.0, + "learning_rate": 0.0001971751418055286, + "loss": 1.1509, + "step": 3975 + }, + { + "epoch": 0.10209253098515376, + "grad_norm": 0.85546875, + "learning_rate": 0.00019717408812700956, + "loss": 1.2308, + "step": 3976 + }, + { + "epoch": 0.10211820818107559, + "grad_norm": 0.8359375, + "learning_rate": 0.00019717303425483137, + "loss": 0.9604, + "step": 3977 + }, + { + "epoch": 0.10214388537699741, + "grad_norm": 0.89453125, + "learning_rate": 0.00019717198018899613, + "loss": 1.25, + "step": 3978 + }, + { + "epoch": 0.10216956257291922, + "grad_norm": 0.92578125, + "learning_rate": 0.0001971709259295059, + "loss": 1.1132, + "step": 3979 + }, + { + "epoch": 0.10219523976884104, + "grad_norm": 0.92578125, + "learning_rate": 0.00019716987147636278, + "loss": 1.3292, + "step": 3980 + }, + { + "epoch": 0.10222091696476286, + "grad_norm": 0.828125, + "learning_rate": 0.0001971688168295689, + "loss": 1.0939, + "step": 3981 + }, + { + "epoch": 0.10224659416068468, + "grad_norm": 0.83984375, + "learning_rate": 0.00019716776198912636, + "loss": 1.1368, + "step": 3982 + }, + { + "epoch": 0.1022722713566065, + "grad_norm": 0.91015625, + "learning_rate": 0.00019716670695503718, + "loss": 1.2929, + "step": 3983 + }, + { + "epoch": 0.10229794855252831, + "grad_norm": 0.8359375, + "learning_rate": 0.0001971656517273036, + "loss": 1.242, + "step": 3984 + }, + { + "epoch": 0.10232362574845014, + "grad_norm": 0.91015625, + "learning_rate": 0.0001971645963059276, + "loss": 1.2308, + "step": 3985 + }, + { + "epoch": 0.10234930294437196, + "grad_norm": 0.8046875, + "learning_rate": 0.00019716354069091135, + "loss": 1.1404, + "step": 3986 + }, + { + "epoch": 0.10237498014029378, + "grad_norm": 0.9375, + "learning_rate": 0.00019716248488225698, + "loss": 1.2387, + "step": 3987 + }, + { + "epoch": 0.1024006573362156, + "grad_norm": 0.890625, + "learning_rate": 0.00019716142887996654, + "loss": 1.315, + "step": 3988 + }, + { + "epoch": 0.10242633453213741, + "grad_norm": 0.91015625, + "learning_rate": 0.00019716037268404212, + "loss": 1.2723, + "step": 3989 + }, + { + "epoch": 0.10245201172805923, + "grad_norm": 0.96875, + "learning_rate": 0.00019715931629448588, + "loss": 1.217, + "step": 3990 + }, + { + "epoch": 0.10247768892398106, + "grad_norm": 0.86328125, + "learning_rate": 0.0001971582597112999, + "loss": 1.2611, + "step": 3991 + }, + { + "epoch": 0.10250336611990288, + "grad_norm": 0.91796875, + "learning_rate": 0.00019715720293448624, + "loss": 1.1691, + "step": 3992 + }, + { + "epoch": 0.1025290433158247, + "grad_norm": 0.84375, + "learning_rate": 0.0001971561459640471, + "loss": 1.2786, + "step": 3993 + }, + { + "epoch": 0.10255472051174651, + "grad_norm": 1.0078125, + "learning_rate": 0.00019715508879998453, + "loss": 1.0315, + "step": 3994 + }, + { + "epoch": 0.10258039770766833, + "grad_norm": 0.83984375, + "learning_rate": 0.00019715403144230062, + "loss": 1.2126, + "step": 3995 + }, + { + "epoch": 0.10260607490359015, + "grad_norm": 0.89453125, + "learning_rate": 0.00019715297389099752, + "loss": 1.2427, + "step": 3996 + }, + { + "epoch": 0.10263175209951197, + "grad_norm": 0.91015625, + "learning_rate": 0.0001971519161460773, + "loss": 1.2623, + "step": 3997 + }, + { + "epoch": 0.1026574292954338, + "grad_norm": 0.7890625, + "learning_rate": 0.0001971508582075421, + "loss": 1.1187, + "step": 3998 + }, + { + "epoch": 0.1026831064913556, + "grad_norm": 0.78125, + "learning_rate": 0.00019714980007539404, + "loss": 1.0484, + "step": 3999 + }, + { + "epoch": 0.10270878368727743, + "grad_norm": 0.80078125, + "learning_rate": 0.00019714874174963515, + "loss": 1.2014, + "step": 4000 + }, + { + "epoch": 0.10270878368727743, + "eval_loss": 1.1637061834335327, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 533.0197, + "eval_samples_per_second": 18.761, + "eval_steps_per_second": 0.587, + "step": 4000 + }, + { + "epoch": 0.10273446088319925, + "grad_norm": 0.83984375, + "learning_rate": 0.00019714768323026764, + "loss": 1.2368, + "step": 4001 + }, + { + "epoch": 0.10276013807912107, + "grad_norm": 0.859375, + "learning_rate": 0.00019714662451729358, + "loss": 1.221, + "step": 4002 + }, + { + "epoch": 0.1027858152750429, + "grad_norm": 0.85546875, + "learning_rate": 0.00019714556561071503, + "loss": 1.1553, + "step": 4003 + }, + { + "epoch": 0.1028114924709647, + "grad_norm": 0.86328125, + "learning_rate": 0.00019714450651053418, + "loss": 1.1304, + "step": 4004 + }, + { + "epoch": 0.10283716966688652, + "grad_norm": 0.8125, + "learning_rate": 0.00019714344721675307, + "loss": 1.1614, + "step": 4005 + }, + { + "epoch": 0.10286284686280835, + "grad_norm": 0.87890625, + "learning_rate": 0.00019714238772937387, + "loss": 1.2005, + "step": 4006 + }, + { + "epoch": 0.10288852405873017, + "grad_norm": 0.8203125, + "learning_rate": 0.00019714132804839866, + "loss": 1.1286, + "step": 4007 + }, + { + "epoch": 0.10291420125465199, + "grad_norm": 0.859375, + "learning_rate": 0.00019714026817382952, + "loss": 0.9923, + "step": 4008 + }, + { + "epoch": 0.1029398784505738, + "grad_norm": 0.84375, + "learning_rate": 0.00019713920810566867, + "loss": 1.2218, + "step": 4009 + }, + { + "epoch": 0.10296555564649562, + "grad_norm": 0.859375, + "learning_rate": 0.0001971381478439181, + "loss": 1.2054, + "step": 4010 + }, + { + "epoch": 0.10299123284241744, + "grad_norm": 0.80859375, + "learning_rate": 0.00019713708738858, + "loss": 1.2934, + "step": 4011 + }, + { + "epoch": 0.10301691003833927, + "grad_norm": 0.8203125, + "learning_rate": 0.00019713602673965643, + "loss": 1.2388, + "step": 4012 + }, + { + "epoch": 0.10304258723426109, + "grad_norm": 0.82421875, + "learning_rate": 0.00019713496589714958, + "loss": 1.3784, + "step": 4013 + }, + { + "epoch": 0.1030682644301829, + "grad_norm": 0.83984375, + "learning_rate": 0.00019713390486106146, + "loss": 1.1266, + "step": 4014 + }, + { + "epoch": 0.10309394162610472, + "grad_norm": 0.86328125, + "learning_rate": 0.0001971328436313943, + "loss": 1.1238, + "step": 4015 + }, + { + "epoch": 0.10311961882202654, + "grad_norm": 0.85546875, + "learning_rate": 0.0001971317822081501, + "loss": 1.21, + "step": 4016 + }, + { + "epoch": 0.10314529601794836, + "grad_norm": 0.82421875, + "learning_rate": 0.00019713072059133108, + "loss": 1.2284, + "step": 4017 + }, + { + "epoch": 0.10317097321387018, + "grad_norm": 0.890625, + "learning_rate": 0.0001971296587809393, + "loss": 1.151, + "step": 4018 + }, + { + "epoch": 0.10319665040979199, + "grad_norm": 0.8203125, + "learning_rate": 0.00019712859677697688, + "loss": 1.1792, + "step": 4019 + }, + { + "epoch": 0.10322232760571382, + "grad_norm": 0.90234375, + "learning_rate": 0.00019712753457944593, + "loss": 1.0885, + "step": 4020 + }, + { + "epoch": 0.10324800480163564, + "grad_norm": 0.86328125, + "learning_rate": 0.00019712647218834856, + "loss": 1.3272, + "step": 4021 + }, + { + "epoch": 0.10327368199755746, + "grad_norm": 0.8984375, + "learning_rate": 0.00019712540960368694, + "loss": 1.2536, + "step": 4022 + }, + { + "epoch": 0.10329935919347928, + "grad_norm": 0.89453125, + "learning_rate": 0.00019712434682546314, + "loss": 1.1381, + "step": 4023 + }, + { + "epoch": 0.10332503638940109, + "grad_norm": 0.95703125, + "learning_rate": 0.00019712328385367928, + "loss": 1.2882, + "step": 4024 + }, + { + "epoch": 0.10335071358532291, + "grad_norm": 0.828125, + "learning_rate": 0.0001971222206883375, + "loss": 1.0902, + "step": 4025 + }, + { + "epoch": 0.10337639078124473, + "grad_norm": 0.90625, + "learning_rate": 0.0001971211573294399, + "loss": 1.1666, + "step": 4026 + }, + { + "epoch": 0.10340206797716656, + "grad_norm": 0.83984375, + "learning_rate": 0.00019712009377698863, + "loss": 1.081, + "step": 4027 + }, + { + "epoch": 0.10342774517308838, + "grad_norm": 0.87109375, + "learning_rate": 0.00019711903003098575, + "loss": 1.1606, + "step": 4028 + }, + { + "epoch": 0.10345342236901019, + "grad_norm": 0.90234375, + "learning_rate": 0.00019711796609143343, + "loss": 1.3279, + "step": 4029 + }, + { + "epoch": 0.10347909956493201, + "grad_norm": 0.828125, + "learning_rate": 0.0001971169019583338, + "loss": 1.0596, + "step": 4030 + }, + { + "epoch": 0.10350477676085383, + "grad_norm": 0.82421875, + "learning_rate": 0.0001971158376316889, + "loss": 1.0855, + "step": 4031 + }, + { + "epoch": 0.10353045395677565, + "grad_norm": 0.8984375, + "learning_rate": 0.00019711477311150095, + "loss": 1.0916, + "step": 4032 + }, + { + "epoch": 0.10355613115269748, + "grad_norm": 0.828125, + "learning_rate": 0.000197113708397772, + "loss": 1.1296, + "step": 4033 + }, + { + "epoch": 0.10358180834861928, + "grad_norm": 0.8359375, + "learning_rate": 0.00019711264349050426, + "loss": 1.4485, + "step": 4034 + }, + { + "epoch": 0.1036074855445411, + "grad_norm": 0.796875, + "learning_rate": 0.00019711157838969975, + "loss": 1.0813, + "step": 4035 + }, + { + "epoch": 0.10363316274046293, + "grad_norm": 0.83984375, + "learning_rate": 0.00019711051309536065, + "loss": 1.1685, + "step": 4036 + }, + { + "epoch": 0.10365883993638475, + "grad_norm": 0.875, + "learning_rate": 0.00019710944760748905, + "loss": 1.074, + "step": 4037 + }, + { + "epoch": 0.10368451713230657, + "grad_norm": 0.8515625, + "learning_rate": 0.0001971083819260871, + "loss": 0.9926, + "step": 4038 + }, + { + "epoch": 0.10371019432822838, + "grad_norm": 0.87890625, + "learning_rate": 0.00019710731605115692, + "loss": 1.0473, + "step": 4039 + }, + { + "epoch": 0.1037358715241502, + "grad_norm": 0.921875, + "learning_rate": 0.00019710624998270064, + "loss": 1.2334, + "step": 4040 + }, + { + "epoch": 0.10376154872007202, + "grad_norm": 0.89453125, + "learning_rate": 0.00019710518372072038, + "loss": 1.1836, + "step": 4041 + }, + { + "epoch": 0.10378722591599385, + "grad_norm": 0.87890625, + "learning_rate": 0.00019710411726521824, + "loss": 1.1147, + "step": 4042 + }, + { + "epoch": 0.10381290311191567, + "grad_norm": 0.88671875, + "learning_rate": 0.0001971030506161964, + "loss": 1.251, + "step": 4043 + }, + { + "epoch": 0.10383858030783748, + "grad_norm": 0.83984375, + "learning_rate": 0.0001971019837736569, + "loss": 1.1352, + "step": 4044 + }, + { + "epoch": 0.1038642575037593, + "grad_norm": 0.8359375, + "learning_rate": 0.00019710091673760195, + "loss": 0.9884, + "step": 4045 + }, + { + "epoch": 0.10388993469968112, + "grad_norm": 0.9140625, + "learning_rate": 0.0001970998495080336, + "loss": 1.237, + "step": 4046 + }, + { + "epoch": 0.10391561189560294, + "grad_norm": 0.91796875, + "learning_rate": 0.00019709878208495408, + "loss": 1.1057, + "step": 4047 + }, + { + "epoch": 0.10394128909152477, + "grad_norm": 0.77734375, + "learning_rate": 0.00019709771446836544, + "loss": 1.1478, + "step": 4048 + }, + { + "epoch": 0.10396696628744657, + "grad_norm": 0.9609375, + "learning_rate": 0.0001970966466582698, + "loss": 1.1186, + "step": 4049 + }, + { + "epoch": 0.1039926434833684, + "grad_norm": 0.91015625, + "learning_rate": 0.00019709557865466937, + "loss": 1.1813, + "step": 4050 + }, + { + "epoch": 0.10401832067929022, + "grad_norm": 0.94140625, + "learning_rate": 0.00019709451045756617, + "loss": 1.2502, + "step": 4051 + }, + { + "epoch": 0.10404399787521204, + "grad_norm": 0.84375, + "learning_rate": 0.00019709344206696237, + "loss": 1.1374, + "step": 4052 + }, + { + "epoch": 0.10406967507113386, + "grad_norm": 0.828125, + "learning_rate": 0.00019709237348286015, + "loss": 1.1375, + "step": 4053 + }, + { + "epoch": 0.10409535226705567, + "grad_norm": 0.859375, + "learning_rate": 0.0001970913047052616, + "loss": 1.0803, + "step": 4054 + }, + { + "epoch": 0.1041210294629775, + "grad_norm": 0.828125, + "learning_rate": 0.00019709023573416884, + "loss": 1.1532, + "step": 4055 + }, + { + "epoch": 0.10414670665889932, + "grad_norm": 0.8671875, + "learning_rate": 0.00019708916656958404, + "loss": 1.1514, + "step": 4056 + }, + { + "epoch": 0.10417238385482114, + "grad_norm": 0.8984375, + "learning_rate": 0.00019708809721150924, + "loss": 1.1007, + "step": 4057 + }, + { + "epoch": 0.10419806105074296, + "grad_norm": 0.75, + "learning_rate": 0.00019708702765994666, + "loss": 1.0736, + "step": 4058 + }, + { + "epoch": 0.10422373824666477, + "grad_norm": 0.89453125, + "learning_rate": 0.00019708595791489845, + "loss": 1.086, + "step": 4059 + }, + { + "epoch": 0.10424941544258659, + "grad_norm": 0.86328125, + "learning_rate": 0.00019708488797636663, + "loss": 1.0189, + "step": 4060 + }, + { + "epoch": 0.10427509263850841, + "grad_norm": 0.8828125, + "learning_rate": 0.0001970838178443534, + "loss": 1.0995, + "step": 4061 + }, + { + "epoch": 0.10430076983443023, + "grad_norm": 0.87109375, + "learning_rate": 0.00019708274751886095, + "loss": 1.0462, + "step": 4062 + }, + { + "epoch": 0.10432644703035206, + "grad_norm": 0.84765625, + "learning_rate": 0.00019708167699989128, + "loss": 1.0996, + "step": 4063 + }, + { + "epoch": 0.10435212422627387, + "grad_norm": 0.94140625, + "learning_rate": 0.00019708060628744663, + "loss": 1.1328, + "step": 4064 + }, + { + "epoch": 0.10437780142219569, + "grad_norm": 0.890625, + "learning_rate": 0.00019707953538152914, + "loss": 1.2203, + "step": 4065 + }, + { + "epoch": 0.10440347861811751, + "grad_norm": 0.90625, + "learning_rate": 0.00019707846428214084, + "loss": 1.2283, + "step": 4066 + }, + { + "epoch": 0.10442915581403933, + "grad_norm": 0.82421875, + "learning_rate": 0.00019707739298928395, + "loss": 1.0266, + "step": 4067 + }, + { + "epoch": 0.10445483300996115, + "grad_norm": 0.84375, + "learning_rate": 0.00019707632150296062, + "loss": 1.2558, + "step": 4068 + }, + { + "epoch": 0.10448051020588296, + "grad_norm": 0.890625, + "learning_rate": 0.0001970752498231729, + "loss": 1.1707, + "step": 4069 + }, + { + "epoch": 0.10450618740180478, + "grad_norm": 0.9765625, + "learning_rate": 0.000197074177949923, + "loss": 1.1693, + "step": 4070 + }, + { + "epoch": 0.1045318645977266, + "grad_norm": 0.875, + "learning_rate": 0.000197073105883213, + "loss": 1.2569, + "step": 4071 + }, + { + "epoch": 0.10455754179364843, + "grad_norm": 0.87890625, + "learning_rate": 0.00019707203362304512, + "loss": 1.2623, + "step": 4072 + }, + { + "epoch": 0.10458321898957025, + "grad_norm": 0.86328125, + "learning_rate": 0.00019707096116942143, + "loss": 1.0573, + "step": 4073 + }, + { + "epoch": 0.10460889618549206, + "grad_norm": 0.87890625, + "learning_rate": 0.0001970698885223441, + "loss": 1.1099, + "step": 4074 + }, + { + "epoch": 0.10463457338141388, + "grad_norm": 0.921875, + "learning_rate": 0.0001970688156818152, + "loss": 1.1789, + "step": 4075 + }, + { + "epoch": 0.1046602505773357, + "grad_norm": 1.484375, + "learning_rate": 0.00019706774264783693, + "loss": 1.1941, + "step": 4076 + }, + { + "epoch": 0.10468592777325753, + "grad_norm": 0.953125, + "learning_rate": 0.00019706666942041146, + "loss": 1.2064, + "step": 4077 + }, + { + "epoch": 0.10471160496917935, + "grad_norm": 0.80859375, + "learning_rate": 0.00019706559599954083, + "loss": 1.1178, + "step": 4078 + }, + { + "epoch": 0.10473728216510116, + "grad_norm": 0.90234375, + "learning_rate": 0.0001970645223852273, + "loss": 1.2218, + "step": 4079 + }, + { + "epoch": 0.10476295936102298, + "grad_norm": 0.828125, + "learning_rate": 0.0001970634485774729, + "loss": 1.1767, + "step": 4080 + }, + { + "epoch": 0.1047886365569448, + "grad_norm": 0.8828125, + "learning_rate": 0.00019706237457627982, + "loss": 1.2904, + "step": 4081 + }, + { + "epoch": 0.10481431375286662, + "grad_norm": 0.90625, + "learning_rate": 0.0001970613003816502, + "loss": 1.1515, + "step": 4082 + }, + { + "epoch": 0.10483999094878843, + "grad_norm": 0.94921875, + "learning_rate": 0.0001970602259935862, + "loss": 1.339, + "step": 4083 + }, + { + "epoch": 0.10486566814471025, + "grad_norm": 0.84375, + "learning_rate": 0.00019705915141208987, + "loss": 1.1567, + "step": 4084 + }, + { + "epoch": 0.10489134534063208, + "grad_norm": 0.9921875, + "learning_rate": 0.00019705807663716348, + "loss": 1.2901, + "step": 4085 + }, + { + "epoch": 0.1049170225365539, + "grad_norm": 0.90625, + "learning_rate": 0.0001970570016688091, + "loss": 1.0884, + "step": 4086 + }, + { + "epoch": 0.10494269973247572, + "grad_norm": 1.1796875, + "learning_rate": 0.00019705592650702886, + "loss": 1.1537, + "step": 4087 + }, + { + "epoch": 0.10496837692839753, + "grad_norm": 0.92578125, + "learning_rate": 0.00019705485115182494, + "loss": 1.0711, + "step": 4088 + }, + { + "epoch": 0.10499405412431935, + "grad_norm": 0.96875, + "learning_rate": 0.00019705377560319946, + "loss": 1.0863, + "step": 4089 + }, + { + "epoch": 0.10501973132024117, + "grad_norm": 0.84765625, + "learning_rate": 0.00019705269986115457, + "loss": 1.1477, + "step": 4090 + }, + { + "epoch": 0.105045408516163, + "grad_norm": 0.8828125, + "learning_rate": 0.00019705162392569245, + "loss": 1.1294, + "step": 4091 + }, + { + "epoch": 0.10507108571208482, + "grad_norm": 0.79296875, + "learning_rate": 0.00019705054779681514, + "loss": 1.2142, + "step": 4092 + }, + { + "epoch": 0.10509676290800662, + "grad_norm": 0.84765625, + "learning_rate": 0.00019704947147452492, + "loss": 1.0024, + "step": 4093 + }, + { + "epoch": 0.10512244010392845, + "grad_norm": 0.8046875, + "learning_rate": 0.00019704839495882385, + "loss": 1.0419, + "step": 4094 + }, + { + "epoch": 0.10514811729985027, + "grad_norm": 0.7890625, + "learning_rate": 0.00019704731824971412, + "loss": 0.9872, + "step": 4095 + }, + { + "epoch": 0.10517379449577209, + "grad_norm": 1.3828125, + "learning_rate": 0.0001970462413471978, + "loss": 1.2478, + "step": 4096 + }, + { + "epoch": 0.10519947169169391, + "grad_norm": 0.85546875, + "learning_rate": 0.00019704516425127712, + "loss": 1.2509, + "step": 4097 + }, + { + "epoch": 0.10522514888761572, + "grad_norm": 0.83984375, + "learning_rate": 0.00019704408696195416, + "loss": 1.0603, + "step": 4098 + }, + { + "epoch": 0.10525082608353754, + "grad_norm": 0.82421875, + "learning_rate": 0.00019704300947923117, + "loss": 1.1827, + "step": 4099 + }, + { + "epoch": 0.10527650327945937, + "grad_norm": 0.828125, + "learning_rate": 0.00019704193180311016, + "loss": 1.2632, + "step": 4100 + }, + { + "epoch": 0.10530218047538119, + "grad_norm": 0.86328125, + "learning_rate": 0.00019704085393359338, + "loss": 1.0198, + "step": 4101 + }, + { + "epoch": 0.10532785767130301, + "grad_norm": 0.859375, + "learning_rate": 0.00019703977587068295, + "loss": 1.1305, + "step": 4102 + }, + { + "epoch": 0.10535353486722482, + "grad_norm": 0.8671875, + "learning_rate": 0.000197038697614381, + "loss": 1.1249, + "step": 4103 + }, + { + "epoch": 0.10537921206314664, + "grad_norm": 0.8984375, + "learning_rate": 0.00019703761916468968, + "loss": 1.1659, + "step": 4104 + }, + { + "epoch": 0.10540488925906846, + "grad_norm": 0.875, + "learning_rate": 0.00019703654052161116, + "loss": 1.2034, + "step": 4105 + }, + { + "epoch": 0.10543056645499028, + "grad_norm": 0.8671875, + "learning_rate": 0.00019703546168514758, + "loss": 1.0881, + "step": 4106 + }, + { + "epoch": 0.10545624365091211, + "grad_norm": 0.9453125, + "learning_rate": 0.00019703438265530107, + "loss": 1.2385, + "step": 4107 + }, + { + "epoch": 0.10548192084683392, + "grad_norm": 0.8125, + "learning_rate": 0.00019703330343207381, + "loss": 1.1872, + "step": 4108 + }, + { + "epoch": 0.10550759804275574, + "grad_norm": 0.89453125, + "learning_rate": 0.00019703222401546797, + "loss": 1.1455, + "step": 4109 + }, + { + "epoch": 0.10553327523867756, + "grad_norm": 0.859375, + "learning_rate": 0.00019703114440548566, + "loss": 1.0228, + "step": 4110 + }, + { + "epoch": 0.10555895243459938, + "grad_norm": 0.90234375, + "learning_rate": 0.00019703006460212901, + "loss": 1.1986, + "step": 4111 + }, + { + "epoch": 0.1055846296305212, + "grad_norm": 0.84375, + "learning_rate": 0.00019702898460540025, + "loss": 1.0507, + "step": 4112 + }, + { + "epoch": 0.10561030682644301, + "grad_norm": 0.84765625, + "learning_rate": 0.00019702790441530148, + "loss": 1.1151, + "step": 4113 + }, + { + "epoch": 0.10563598402236483, + "grad_norm": 0.8046875, + "learning_rate": 0.00019702682403183484, + "loss": 1.0182, + "step": 4114 + }, + { + "epoch": 0.10566166121828666, + "grad_norm": 0.84765625, + "learning_rate": 0.0001970257434550025, + "loss": 1.0623, + "step": 4115 + }, + { + "epoch": 0.10568733841420848, + "grad_norm": 0.83984375, + "learning_rate": 0.00019702466268480665, + "loss": 1.0832, + "step": 4116 + }, + { + "epoch": 0.1057130156101303, + "grad_norm": 0.8515625, + "learning_rate": 0.0001970235817212494, + "loss": 1.0784, + "step": 4117 + }, + { + "epoch": 0.10573869280605211, + "grad_norm": 0.953125, + "learning_rate": 0.00019702250056433292, + "loss": 1.1267, + "step": 4118 + }, + { + "epoch": 0.10576437000197393, + "grad_norm": 0.8359375, + "learning_rate": 0.00019702141921405935, + "loss": 1.1872, + "step": 4119 + }, + { + "epoch": 0.10579004719789575, + "grad_norm": 0.82421875, + "learning_rate": 0.00019702033767043085, + "loss": 1.184, + "step": 4120 + }, + { + "epoch": 0.10581572439381758, + "grad_norm": 0.90234375, + "learning_rate": 0.00019701925593344958, + "loss": 1.2718, + "step": 4121 + }, + { + "epoch": 0.1058414015897394, + "grad_norm": 0.8359375, + "learning_rate": 0.00019701817400311772, + "loss": 1.1725, + "step": 4122 + }, + { + "epoch": 0.1058670787856612, + "grad_norm": 0.85546875, + "learning_rate": 0.00019701709187943737, + "loss": 1.2423, + "step": 4123 + }, + { + "epoch": 0.10589275598158303, + "grad_norm": 0.8046875, + "learning_rate": 0.00019701600956241073, + "loss": 1.201, + "step": 4124 + }, + { + "epoch": 0.10591843317750485, + "grad_norm": 0.8359375, + "learning_rate": 0.00019701492705203997, + "loss": 1.1324, + "step": 4125 + }, + { + "epoch": 0.10594411037342667, + "grad_norm": 0.8984375, + "learning_rate": 0.00019701384434832718, + "loss": 1.1606, + "step": 4126 + }, + { + "epoch": 0.1059697875693485, + "grad_norm": 0.84765625, + "learning_rate": 0.0001970127614512746, + "loss": 1.1497, + "step": 4127 + }, + { + "epoch": 0.1059954647652703, + "grad_norm": 0.83984375, + "learning_rate": 0.00019701167836088434, + "loss": 1.118, + "step": 4128 + }, + { + "epoch": 0.10602114196119213, + "grad_norm": 0.89453125, + "learning_rate": 0.00019701059507715857, + "loss": 1.2946, + "step": 4129 + }, + { + "epoch": 0.10604681915711395, + "grad_norm": 0.828125, + "learning_rate": 0.00019700951160009944, + "loss": 1.0507, + "step": 4130 + }, + { + "epoch": 0.10607249635303577, + "grad_norm": 0.875, + "learning_rate": 0.00019700842792970912, + "loss": 1.0937, + "step": 4131 + }, + { + "epoch": 0.10609817354895759, + "grad_norm": 0.81640625, + "learning_rate": 0.00019700734406598978, + "loss": 1.1826, + "step": 4132 + }, + { + "epoch": 0.1061238507448794, + "grad_norm": 0.86328125, + "learning_rate": 0.00019700626000894352, + "loss": 1.0784, + "step": 4133 + }, + { + "epoch": 0.10614952794080122, + "grad_norm": 0.8125, + "learning_rate": 0.00019700517575857261, + "loss": 1.0642, + "step": 4134 + }, + { + "epoch": 0.10617520513672304, + "grad_norm": 0.81640625, + "learning_rate": 0.00019700409131487908, + "loss": 1.0279, + "step": 4135 + }, + { + "epoch": 0.10620088233264487, + "grad_norm": 0.8828125, + "learning_rate": 0.0001970030066778652, + "loss": 1.2014, + "step": 4136 + }, + { + "epoch": 0.10622655952856669, + "grad_norm": 0.83203125, + "learning_rate": 0.0001970019218475331, + "loss": 1.1502, + "step": 4137 + }, + { + "epoch": 0.1062522367244885, + "grad_norm": 0.84375, + "learning_rate": 0.00019700083682388492, + "loss": 1.1667, + "step": 4138 + }, + { + "epoch": 0.10627791392041032, + "grad_norm": 0.953125, + "learning_rate": 0.0001969997516069228, + "loss": 1.07, + "step": 4139 + }, + { + "epoch": 0.10630359111633214, + "grad_norm": 0.8515625, + "learning_rate": 0.00019699866619664897, + "loss": 1.1264, + "step": 4140 + }, + { + "epoch": 0.10632926831225396, + "grad_norm": 0.86328125, + "learning_rate": 0.00019699758059306557, + "loss": 1.106, + "step": 4141 + }, + { + "epoch": 0.10635494550817579, + "grad_norm": 0.8828125, + "learning_rate": 0.00019699649479617474, + "loss": 1.2117, + "step": 4142 + }, + { + "epoch": 0.1063806227040976, + "grad_norm": 0.91796875, + "learning_rate": 0.00019699540880597865, + "loss": 1.115, + "step": 4143 + }, + { + "epoch": 0.10640629990001942, + "grad_norm": 1.1171875, + "learning_rate": 0.00019699432262247946, + "loss": 1.231, + "step": 4144 + }, + { + "epoch": 0.10643197709594124, + "grad_norm": 0.91796875, + "learning_rate": 0.00019699323624567937, + "loss": 1.2128, + "step": 4145 + }, + { + "epoch": 0.10645765429186306, + "grad_norm": 0.8828125, + "learning_rate": 0.00019699214967558052, + "loss": 1.2207, + "step": 4146 + }, + { + "epoch": 0.10648333148778488, + "grad_norm": 0.859375, + "learning_rate": 0.00019699106291218508, + "loss": 1.1579, + "step": 4147 + }, + { + "epoch": 0.10650900868370669, + "grad_norm": 0.90625, + "learning_rate": 0.00019698997595549522, + "loss": 1.304, + "step": 4148 + }, + { + "epoch": 0.10653468587962851, + "grad_norm": 0.9375, + "learning_rate": 0.00019698888880551307, + "loss": 1.1799, + "step": 4149 + }, + { + "epoch": 0.10656036307555034, + "grad_norm": 0.86328125, + "learning_rate": 0.00019698780146224084, + "loss": 1.2193, + "step": 4150 + }, + { + "epoch": 0.10658604027147216, + "grad_norm": 0.9765625, + "learning_rate": 0.00019698671392568069, + "loss": 1.2396, + "step": 4151 + }, + { + "epoch": 0.10661171746739398, + "grad_norm": 0.84375, + "learning_rate": 0.00019698562619583477, + "loss": 1.1806, + "step": 4152 + }, + { + "epoch": 0.10663739466331579, + "grad_norm": 0.89453125, + "learning_rate": 0.00019698453827270525, + "loss": 1.1655, + "step": 4153 + }, + { + "epoch": 0.10666307185923761, + "grad_norm": 0.84765625, + "learning_rate": 0.00019698345015629434, + "loss": 1.0916, + "step": 4154 + }, + { + "epoch": 0.10668874905515943, + "grad_norm": 0.87890625, + "learning_rate": 0.00019698236184660414, + "loss": 1.2267, + "step": 4155 + }, + { + "epoch": 0.10671442625108125, + "grad_norm": 0.91015625, + "learning_rate": 0.00019698127334363689, + "loss": 1.1578, + "step": 4156 + }, + { + "epoch": 0.10674010344700308, + "grad_norm": 0.82421875, + "learning_rate": 0.00019698018464739468, + "loss": 1.3012, + "step": 4157 + }, + { + "epoch": 0.10676578064292488, + "grad_norm": 0.85546875, + "learning_rate": 0.00019697909575787975, + "loss": 1.0101, + "step": 4158 + }, + { + "epoch": 0.1067914578388467, + "grad_norm": 0.80078125, + "learning_rate": 0.0001969780066750942, + "loss": 1.2097, + "step": 4159 + }, + { + "epoch": 0.10681713503476853, + "grad_norm": 0.89453125, + "learning_rate": 0.0001969769173990403, + "loss": 1.0567, + "step": 4160 + }, + { + "epoch": 0.10684281223069035, + "grad_norm": 0.93359375, + "learning_rate": 0.00019697582792972012, + "loss": 1.2141, + "step": 4161 + }, + { + "epoch": 0.10686848942661217, + "grad_norm": 0.83203125, + "learning_rate": 0.00019697473826713592, + "loss": 1.0135, + "step": 4162 + }, + { + "epoch": 0.10689416662253398, + "grad_norm": 0.80859375, + "learning_rate": 0.00019697364841128975, + "loss": 1.0774, + "step": 4163 + }, + { + "epoch": 0.1069198438184558, + "grad_norm": 0.84375, + "learning_rate": 0.00019697255836218395, + "loss": 1.2163, + "step": 4164 + }, + { + "epoch": 0.10694552101437763, + "grad_norm": 0.8203125, + "learning_rate": 0.00019697146811982052, + "loss": 1.0807, + "step": 4165 + }, + { + "epoch": 0.10697119821029945, + "grad_norm": 0.83203125, + "learning_rate": 0.00019697037768420174, + "loss": 1.0981, + "step": 4166 + }, + { + "epoch": 0.10699687540622127, + "grad_norm": 0.83203125, + "learning_rate": 0.00019696928705532977, + "loss": 1.0605, + "step": 4167 + }, + { + "epoch": 0.10702255260214308, + "grad_norm": 0.85546875, + "learning_rate": 0.0001969681962332068, + "loss": 1.3017, + "step": 4168 + }, + { + "epoch": 0.1070482297980649, + "grad_norm": 0.83984375, + "learning_rate": 0.00019696710521783492, + "loss": 1.0304, + "step": 4169 + }, + { + "epoch": 0.10707390699398672, + "grad_norm": 0.81640625, + "learning_rate": 0.0001969660140092164, + "loss": 1.2588, + "step": 4170 + }, + { + "epoch": 0.10709958418990854, + "grad_norm": 0.85546875, + "learning_rate": 0.00019696492260735333, + "loss": 1.1059, + "step": 4171 + }, + { + "epoch": 0.10712526138583037, + "grad_norm": 0.8515625, + "learning_rate": 0.00019696383101224795, + "loss": 1.1526, + "step": 4172 + }, + { + "epoch": 0.10715093858175218, + "grad_norm": 0.8046875, + "learning_rate": 0.00019696273922390243, + "loss": 1.0475, + "step": 4173 + }, + { + "epoch": 0.107176615777674, + "grad_norm": 0.8125, + "learning_rate": 0.0001969616472423189, + "loss": 1.1704, + "step": 4174 + }, + { + "epoch": 0.10720229297359582, + "grad_norm": 0.92578125, + "learning_rate": 0.0001969605550674996, + "loss": 1.1565, + "step": 4175 + }, + { + "epoch": 0.10722797016951764, + "grad_norm": 0.8125, + "learning_rate": 0.00019695946269944665, + "loss": 1.2409, + "step": 4176 + }, + { + "epoch": 0.10725364736543946, + "grad_norm": 0.78515625, + "learning_rate": 0.00019695837013816223, + "loss": 1.0847, + "step": 4177 + }, + { + "epoch": 0.10727932456136127, + "grad_norm": 0.7578125, + "learning_rate": 0.00019695727738364854, + "loss": 1.0485, + "step": 4178 + }, + { + "epoch": 0.1073050017572831, + "grad_norm": 0.86328125, + "learning_rate": 0.0001969561844359078, + "loss": 1.2524, + "step": 4179 + }, + { + "epoch": 0.10733067895320492, + "grad_norm": 0.8515625, + "learning_rate": 0.0001969550912949421, + "loss": 1.2708, + "step": 4180 + }, + { + "epoch": 0.10735635614912674, + "grad_norm": 0.9140625, + "learning_rate": 0.00019695399796075368, + "loss": 1.2876, + "step": 4181 + }, + { + "epoch": 0.10738203334504856, + "grad_norm": 0.84375, + "learning_rate": 0.0001969529044333447, + "loss": 1.1211, + "step": 4182 + }, + { + "epoch": 0.10740771054097037, + "grad_norm": 0.80859375, + "learning_rate": 0.00019695181071271734, + "loss": 1.098, + "step": 4183 + }, + { + "epoch": 0.10743338773689219, + "grad_norm": 0.9296875, + "learning_rate": 0.00019695071679887377, + "loss": 1.2863, + "step": 4184 + }, + { + "epoch": 0.10745906493281401, + "grad_norm": 0.8984375, + "learning_rate": 0.00019694962269181616, + "loss": 1.114, + "step": 4185 + }, + { + "epoch": 0.10748474212873584, + "grad_norm": 0.890625, + "learning_rate": 0.00019694852839154673, + "loss": 1.2584, + "step": 4186 + }, + { + "epoch": 0.10751041932465764, + "grad_norm": 0.85546875, + "learning_rate": 0.00019694743389806765, + "loss": 1.1944, + "step": 4187 + }, + { + "epoch": 0.10753609652057947, + "grad_norm": 0.8359375, + "learning_rate": 0.0001969463392113811, + "loss": 1.2462, + "step": 4188 + }, + { + "epoch": 0.10756177371650129, + "grad_norm": 0.890625, + "learning_rate": 0.00019694524433148922, + "loss": 1.0818, + "step": 4189 + }, + { + "epoch": 0.10758745091242311, + "grad_norm": 0.859375, + "learning_rate": 0.00019694414925839425, + "loss": 1.2357, + "step": 4190 + }, + { + "epoch": 0.10761312810834493, + "grad_norm": 0.90234375, + "learning_rate": 0.00019694305399209832, + "loss": 1.0332, + "step": 4191 + }, + { + "epoch": 0.10763880530426674, + "grad_norm": 0.91015625, + "learning_rate": 0.0001969419585326037, + "loss": 1.4245, + "step": 4192 + }, + { + "epoch": 0.10766448250018856, + "grad_norm": 0.8359375, + "learning_rate": 0.00019694086287991246, + "loss": 1.0368, + "step": 4193 + }, + { + "epoch": 0.10769015969611039, + "grad_norm": 0.890625, + "learning_rate": 0.00019693976703402682, + "loss": 1.3702, + "step": 4194 + }, + { + "epoch": 0.10771583689203221, + "grad_norm": 0.82421875, + "learning_rate": 0.00019693867099494902, + "loss": 1.1786, + "step": 4195 + }, + { + "epoch": 0.10774151408795403, + "grad_norm": 0.88671875, + "learning_rate": 0.0001969375747626812, + "loss": 1.1637, + "step": 4196 + }, + { + "epoch": 0.10776719128387584, + "grad_norm": 0.828125, + "learning_rate": 0.00019693647833722553, + "loss": 1.1198, + "step": 4197 + }, + { + "epoch": 0.10779286847979766, + "grad_norm": 0.890625, + "learning_rate": 0.00019693538171858426, + "loss": 1.3122, + "step": 4198 + }, + { + "epoch": 0.10781854567571948, + "grad_norm": 0.77734375, + "learning_rate": 0.00019693428490675946, + "loss": 1.1678, + "step": 4199 + }, + { + "epoch": 0.1078442228716413, + "grad_norm": 0.8203125, + "learning_rate": 0.00019693318790175345, + "loss": 1.0395, + "step": 4200 + }, + { + "epoch": 0.10786990006756313, + "grad_norm": 0.80078125, + "learning_rate": 0.00019693209070356833, + "loss": 1.1122, + "step": 4201 + }, + { + "epoch": 0.10789557726348493, + "grad_norm": 0.8515625, + "learning_rate": 0.0001969309933122063, + "loss": 1.2243, + "step": 4202 + }, + { + "epoch": 0.10792125445940676, + "grad_norm": 0.9765625, + "learning_rate": 0.00019692989572766957, + "loss": 1.1006, + "step": 4203 + }, + { + "epoch": 0.10794693165532858, + "grad_norm": 0.87890625, + "learning_rate": 0.00019692879794996028, + "loss": 1.0665, + "step": 4204 + }, + { + "epoch": 0.1079726088512504, + "grad_norm": 0.91796875, + "learning_rate": 0.0001969276999790807, + "loss": 1.1095, + "step": 4205 + }, + { + "epoch": 0.10799828604717222, + "grad_norm": 0.83984375, + "learning_rate": 0.00019692660181503294, + "loss": 1.2285, + "step": 4206 + }, + { + "epoch": 0.10802396324309403, + "grad_norm": 0.84375, + "learning_rate": 0.00019692550345781925, + "loss": 1.1914, + "step": 4207 + }, + { + "epoch": 0.10804964043901585, + "grad_norm": 0.83203125, + "learning_rate": 0.00019692440490744176, + "loss": 1.3717, + "step": 4208 + }, + { + "epoch": 0.10807531763493768, + "grad_norm": 0.859375, + "learning_rate": 0.0001969233061639027, + "loss": 1.2085, + "step": 4209 + }, + { + "epoch": 0.1081009948308595, + "grad_norm": 0.86328125, + "learning_rate": 0.0001969222072272042, + "loss": 1.0979, + "step": 4210 + }, + { + "epoch": 0.10812667202678132, + "grad_norm": 0.89453125, + "learning_rate": 0.00019692110809734853, + "loss": 1.0729, + "step": 4211 + }, + { + "epoch": 0.10815234922270313, + "grad_norm": 0.87109375, + "learning_rate": 0.00019692000877433787, + "loss": 1.1807, + "step": 4212 + }, + { + "epoch": 0.10817802641862495, + "grad_norm": 0.87890625, + "learning_rate": 0.00019691890925817437, + "loss": 1.1357, + "step": 4213 + }, + { + "epoch": 0.10820370361454677, + "grad_norm": 0.921875, + "learning_rate": 0.0001969178095488602, + "loss": 1.141, + "step": 4214 + }, + { + "epoch": 0.1082293808104686, + "grad_norm": 0.8671875, + "learning_rate": 0.00019691670964639763, + "loss": 1.2523, + "step": 4215 + }, + { + "epoch": 0.10825505800639042, + "grad_norm": 0.8046875, + "learning_rate": 0.00019691560955078884, + "loss": 1.0799, + "step": 4216 + }, + { + "epoch": 0.10828073520231223, + "grad_norm": 0.765625, + "learning_rate": 0.0001969145092620359, + "loss": 1.0311, + "step": 4217 + }, + { + "epoch": 0.10830641239823405, + "grad_norm": 0.8203125, + "learning_rate": 0.0001969134087801412, + "loss": 1.1026, + "step": 4218 + }, + { + "epoch": 0.10833208959415587, + "grad_norm": 0.8671875, + "learning_rate": 0.00019691230810510676, + "loss": 1.1307, + "step": 4219 + }, + { + "epoch": 0.10835776679007769, + "grad_norm": 0.85546875, + "learning_rate": 0.0001969112072369349, + "loss": 1.0967, + "step": 4220 + }, + { + "epoch": 0.10838344398599951, + "grad_norm": 0.890625, + "learning_rate": 0.00019691010617562773, + "loss": 0.9239, + "step": 4221 + }, + { + "epoch": 0.10840912118192132, + "grad_norm": 0.83984375, + "learning_rate": 0.00019690900492118748, + "loss": 1.2922, + "step": 4222 + }, + { + "epoch": 0.10843479837784314, + "grad_norm": 0.88671875, + "learning_rate": 0.0001969079034736163, + "loss": 1.1514, + "step": 4223 + }, + { + "epoch": 0.10846047557376497, + "grad_norm": 0.94140625, + "learning_rate": 0.00019690680183291644, + "loss": 1.1005, + "step": 4224 + }, + { + "epoch": 0.10848615276968679, + "grad_norm": 0.80078125, + "learning_rate": 0.0001969056999990901, + "loss": 1.0267, + "step": 4225 + }, + { + "epoch": 0.10851182996560861, + "grad_norm": 0.890625, + "learning_rate": 0.00019690459797213944, + "loss": 1.0677, + "step": 4226 + }, + { + "epoch": 0.10853750716153042, + "grad_norm": 0.87109375, + "learning_rate": 0.00019690349575206666, + "loss": 1.2553, + "step": 4227 + }, + { + "epoch": 0.10856318435745224, + "grad_norm": 0.90234375, + "learning_rate": 0.000196902393338874, + "loss": 1.3177, + "step": 4228 + }, + { + "epoch": 0.10858886155337406, + "grad_norm": 0.8515625, + "learning_rate": 0.00019690129073256357, + "loss": 1.2617, + "step": 4229 + }, + { + "epoch": 0.10861453874929589, + "grad_norm": 0.8828125, + "learning_rate": 0.00019690018793313768, + "loss": 1.1725, + "step": 4230 + }, + { + "epoch": 0.10864021594521771, + "grad_norm": 0.859375, + "learning_rate": 0.00019689908494059842, + "loss": 1.093, + "step": 4231 + }, + { + "epoch": 0.10866589314113952, + "grad_norm": 0.859375, + "learning_rate": 0.00019689798175494806, + "loss": 1.1172, + "step": 4232 + }, + { + "epoch": 0.10869157033706134, + "grad_norm": 0.8203125, + "learning_rate": 0.00019689687837618877, + "loss": 1.2404, + "step": 4233 + }, + { + "epoch": 0.10871724753298316, + "grad_norm": 0.90234375, + "learning_rate": 0.00019689577480432274, + "loss": 1.1613, + "step": 4234 + }, + { + "epoch": 0.10874292472890498, + "grad_norm": 0.81640625, + "learning_rate": 0.0001968946710393522, + "loss": 1.0342, + "step": 4235 + }, + { + "epoch": 0.1087686019248268, + "grad_norm": 0.91796875, + "learning_rate": 0.00019689356708127936, + "loss": 1.2458, + "step": 4236 + }, + { + "epoch": 0.10879427912074861, + "grad_norm": 0.8515625, + "learning_rate": 0.00019689246293010636, + "loss": 1.1499, + "step": 4237 + }, + { + "epoch": 0.10881995631667044, + "grad_norm": 0.87890625, + "learning_rate": 0.00019689135858583543, + "loss": 1.4176, + "step": 4238 + }, + { + "epoch": 0.10884563351259226, + "grad_norm": 0.79296875, + "learning_rate": 0.00019689025404846878, + "loss": 1.113, + "step": 4239 + }, + { + "epoch": 0.10887131070851408, + "grad_norm": 0.85546875, + "learning_rate": 0.00019688914931800862, + "loss": 1.079, + "step": 4240 + }, + { + "epoch": 0.1088969879044359, + "grad_norm": 0.78515625, + "learning_rate": 0.00019688804439445712, + "loss": 1.2135, + "step": 4241 + }, + { + "epoch": 0.10892266510035771, + "grad_norm": 0.87109375, + "learning_rate": 0.0001968869392778165, + "loss": 1.1051, + "step": 4242 + }, + { + "epoch": 0.10894834229627953, + "grad_norm": 0.828125, + "learning_rate": 0.00019688583396808897, + "loss": 1.2087, + "step": 4243 + }, + { + "epoch": 0.10897401949220135, + "grad_norm": 0.83984375, + "learning_rate": 0.00019688472846527671, + "loss": 1.069, + "step": 4244 + }, + { + "epoch": 0.10899969668812318, + "grad_norm": 0.87890625, + "learning_rate": 0.00019688362276938195, + "loss": 1.2823, + "step": 4245 + }, + { + "epoch": 0.109025373884045, + "grad_norm": 0.85546875, + "learning_rate": 0.0001968825168804069, + "loss": 1.07, + "step": 4246 + }, + { + "epoch": 0.10905105107996681, + "grad_norm": 0.8671875, + "learning_rate": 0.0001968814107983537, + "loss": 1.2791, + "step": 4247 + }, + { + "epoch": 0.10907672827588863, + "grad_norm": 0.90625, + "learning_rate": 0.00019688030452322464, + "loss": 1.0723, + "step": 4248 + }, + { + "epoch": 0.10910240547181045, + "grad_norm": 0.87109375, + "learning_rate": 0.00019687919805502188, + "loss": 1.1862, + "step": 4249 + }, + { + "epoch": 0.10912808266773227, + "grad_norm": 0.890625, + "learning_rate": 0.00019687809139374763, + "loss": 1.1178, + "step": 4250 + }, + { + "epoch": 0.1091537598636541, + "grad_norm": 0.87109375, + "learning_rate": 0.00019687698453940407, + "loss": 1.0189, + "step": 4251 + }, + { + "epoch": 0.1091794370595759, + "grad_norm": 0.8671875, + "learning_rate": 0.00019687587749199343, + "loss": 1.094, + "step": 4252 + }, + { + "epoch": 0.10920511425549773, + "grad_norm": 0.8671875, + "learning_rate": 0.00019687477025151794, + "loss": 0.9701, + "step": 4253 + }, + { + "epoch": 0.10923079145141955, + "grad_norm": 0.90625, + "learning_rate": 0.00019687366281797978, + "loss": 1.05, + "step": 4254 + }, + { + "epoch": 0.10925646864734137, + "grad_norm": 0.94921875, + "learning_rate": 0.00019687255519138116, + "loss": 1.2778, + "step": 4255 + }, + { + "epoch": 0.10928214584326319, + "grad_norm": 0.94140625, + "learning_rate": 0.00019687144737172427, + "loss": 1.1626, + "step": 4256 + }, + { + "epoch": 0.109307823039185, + "grad_norm": 0.8984375, + "learning_rate": 0.00019687033935901136, + "loss": 1.1104, + "step": 4257 + }, + { + "epoch": 0.10933350023510682, + "grad_norm": 0.828125, + "learning_rate": 0.0001968692311532446, + "loss": 1.0555, + "step": 4258 + }, + { + "epoch": 0.10935917743102865, + "grad_norm": 0.8359375, + "learning_rate": 0.0001968681227544262, + "loss": 1.1678, + "step": 4259 + }, + { + "epoch": 0.10938485462695047, + "grad_norm": 0.91796875, + "learning_rate": 0.00019686701416255843, + "loss": 1.1462, + "step": 4260 + }, + { + "epoch": 0.10941053182287229, + "grad_norm": 0.8671875, + "learning_rate": 0.0001968659053776434, + "loss": 1.1267, + "step": 4261 + }, + { + "epoch": 0.1094362090187941, + "grad_norm": 0.82421875, + "learning_rate": 0.0001968647963996834, + "loss": 1.166, + "step": 4262 + }, + { + "epoch": 0.10946188621471592, + "grad_norm": 0.87890625, + "learning_rate": 0.00019686368722868057, + "loss": 1.1063, + "step": 4263 + }, + { + "epoch": 0.10948756341063774, + "grad_norm": 0.7578125, + "learning_rate": 0.0001968625778646372, + "loss": 1.0315, + "step": 4264 + }, + { + "epoch": 0.10951324060655956, + "grad_norm": 0.828125, + "learning_rate": 0.00019686146830755547, + "loss": 1.1777, + "step": 4265 + }, + { + "epoch": 0.10953891780248139, + "grad_norm": 0.76953125, + "learning_rate": 0.00019686035855743755, + "loss": 1.0883, + "step": 4266 + }, + { + "epoch": 0.1095645949984032, + "grad_norm": 0.875, + "learning_rate": 0.00019685924861428567, + "loss": 1.2199, + "step": 4267 + }, + { + "epoch": 0.10959027219432502, + "grad_norm": 0.8671875, + "learning_rate": 0.00019685813847810212, + "loss": 1.1438, + "step": 4268 + }, + { + "epoch": 0.10961594939024684, + "grad_norm": 0.87109375, + "learning_rate": 0.000196857028148889, + "loss": 1.2208, + "step": 4269 + }, + { + "epoch": 0.10964162658616866, + "grad_norm": 0.8359375, + "learning_rate": 0.0001968559176266486, + "loss": 1.1701, + "step": 4270 + }, + { + "epoch": 0.10966730378209048, + "grad_norm": 0.84375, + "learning_rate": 0.00019685480691138308, + "loss": 1.1392, + "step": 4271 + }, + { + "epoch": 0.10969298097801229, + "grad_norm": 0.83203125, + "learning_rate": 0.00019685369600309468, + "loss": 0.9918, + "step": 4272 + }, + { + "epoch": 0.10971865817393411, + "grad_norm": 0.8359375, + "learning_rate": 0.0001968525849017856, + "loss": 1.1647, + "step": 4273 + }, + { + "epoch": 0.10974433536985594, + "grad_norm": 0.85546875, + "learning_rate": 0.00019685147360745813, + "loss": 1.0358, + "step": 4274 + }, + { + "epoch": 0.10977001256577776, + "grad_norm": 0.8359375, + "learning_rate": 0.00019685036212011437, + "loss": 1.0561, + "step": 4275 + }, + { + "epoch": 0.10979568976169958, + "grad_norm": 0.85546875, + "learning_rate": 0.0001968492504397566, + "loss": 1.1177, + "step": 4276 + }, + { + "epoch": 0.10982136695762139, + "grad_norm": 0.8671875, + "learning_rate": 0.00019684813856638705, + "loss": 1.0948, + "step": 4277 + }, + { + "epoch": 0.10984704415354321, + "grad_norm": 0.84765625, + "learning_rate": 0.00019684702650000786, + "loss": 1.1372, + "step": 4278 + }, + { + "epoch": 0.10987272134946503, + "grad_norm": 0.83984375, + "learning_rate": 0.00019684591424062132, + "loss": 1.0938, + "step": 4279 + }, + { + "epoch": 0.10989839854538686, + "grad_norm": 0.796875, + "learning_rate": 0.00019684480178822962, + "loss": 1.1892, + "step": 4280 + }, + { + "epoch": 0.10992407574130868, + "grad_norm": 0.8046875, + "learning_rate": 0.00019684368914283496, + "loss": 1.0607, + "step": 4281 + }, + { + "epoch": 0.10994975293723049, + "grad_norm": 0.87109375, + "learning_rate": 0.00019684257630443958, + "loss": 1.1411, + "step": 4282 + }, + { + "epoch": 0.10997543013315231, + "grad_norm": 0.80859375, + "learning_rate": 0.00019684146327304572, + "loss": 1.1025, + "step": 4283 + }, + { + "epoch": 0.11000110732907413, + "grad_norm": 0.8203125, + "learning_rate": 0.00019684035004865553, + "loss": 1.0764, + "step": 4284 + }, + { + "epoch": 0.11002678452499595, + "grad_norm": 0.82421875, + "learning_rate": 0.0001968392366312713, + "loss": 1.2267, + "step": 4285 + }, + { + "epoch": 0.11005246172091776, + "grad_norm": 0.7890625, + "learning_rate": 0.00019683812302089522, + "loss": 1.0203, + "step": 4286 + }, + { + "epoch": 0.11007813891683958, + "grad_norm": 0.88671875, + "learning_rate": 0.0001968370092175295, + "loss": 1.2115, + "step": 4287 + }, + { + "epoch": 0.1101038161127614, + "grad_norm": 0.96875, + "learning_rate": 0.00019683589522117635, + "loss": 1.1495, + "step": 4288 + }, + { + "epoch": 0.11012949330868323, + "grad_norm": 0.9375, + "learning_rate": 0.00019683478103183803, + "loss": 1.2718, + "step": 4289 + }, + { + "epoch": 0.11015517050460505, + "grad_norm": 0.84375, + "learning_rate": 0.00019683366664951675, + "loss": 1.2033, + "step": 4290 + }, + { + "epoch": 0.11018084770052686, + "grad_norm": 0.890625, + "learning_rate": 0.0001968325520742147, + "loss": 1.1354, + "step": 4291 + }, + { + "epoch": 0.11020652489644868, + "grad_norm": 0.8984375, + "learning_rate": 0.00019683143730593413, + "loss": 1.1274, + "step": 4292 + }, + { + "epoch": 0.1102322020923705, + "grad_norm": 1.015625, + "learning_rate": 0.00019683032234467723, + "loss": 1.0337, + "step": 4293 + }, + { + "epoch": 0.11025787928829232, + "grad_norm": 0.77734375, + "learning_rate": 0.00019682920719044626, + "loss": 1.0921, + "step": 4294 + }, + { + "epoch": 0.11028355648421415, + "grad_norm": 0.90234375, + "learning_rate": 0.00019682809184324343, + "loss": 1.0738, + "step": 4295 + }, + { + "epoch": 0.11030923368013595, + "grad_norm": 0.890625, + "learning_rate": 0.00019682697630307096, + "loss": 1.192, + "step": 4296 + }, + { + "epoch": 0.11033491087605778, + "grad_norm": 0.8671875, + "learning_rate": 0.00019682586056993107, + "loss": 1.096, + "step": 4297 + }, + { + "epoch": 0.1103605880719796, + "grad_norm": 0.90234375, + "learning_rate": 0.00019682474464382597, + "loss": 1.0653, + "step": 4298 + }, + { + "epoch": 0.11038626526790142, + "grad_norm": 0.89453125, + "learning_rate": 0.0001968236285247579, + "loss": 1.2672, + "step": 4299 + }, + { + "epoch": 0.11041194246382324, + "grad_norm": 0.90234375, + "learning_rate": 0.0001968225122127291, + "loss": 1.2925, + "step": 4300 + }, + { + "epoch": 0.11043761965974505, + "grad_norm": 0.80078125, + "learning_rate": 0.0001968213957077418, + "loss": 0.9333, + "step": 4301 + }, + { + "epoch": 0.11046329685566687, + "grad_norm": 0.75390625, + "learning_rate": 0.00019682027900979818, + "loss": 1.155, + "step": 4302 + }, + { + "epoch": 0.1104889740515887, + "grad_norm": 0.8046875, + "learning_rate": 0.00019681916211890049, + "loss": 1.029, + "step": 4303 + }, + { + "epoch": 0.11051465124751052, + "grad_norm": 0.87890625, + "learning_rate": 0.00019681804503505096, + "loss": 1.2651, + "step": 4304 + }, + { + "epoch": 0.11054032844343234, + "grad_norm": 0.83984375, + "learning_rate": 0.0001968169277582518, + "loss": 0.922, + "step": 4305 + }, + { + "epoch": 0.11056600563935415, + "grad_norm": 0.90234375, + "learning_rate": 0.00019681581028850527, + "loss": 1.2378, + "step": 4306 + }, + { + "epoch": 0.11059168283527597, + "grad_norm": 0.89453125, + "learning_rate": 0.00019681469262581354, + "loss": 0.8911, + "step": 4307 + }, + { + "epoch": 0.11061736003119779, + "grad_norm": 0.8984375, + "learning_rate": 0.0001968135747701789, + "loss": 1.2841, + "step": 4308 + }, + { + "epoch": 0.11064303722711961, + "grad_norm": 0.890625, + "learning_rate": 0.00019681245672160356, + "loss": 0.9897, + "step": 4309 + }, + { + "epoch": 0.11066871442304144, + "grad_norm": 0.93359375, + "learning_rate": 0.00019681133848008974, + "loss": 1.1518, + "step": 4310 + }, + { + "epoch": 0.11069439161896324, + "grad_norm": 0.8359375, + "learning_rate": 0.00019681022004563965, + "loss": 1.1558, + "step": 4311 + }, + { + "epoch": 0.11072006881488507, + "grad_norm": 0.859375, + "learning_rate": 0.0001968091014182555, + "loss": 1.1975, + "step": 4312 + }, + { + "epoch": 0.11074574601080689, + "grad_norm": 0.87890625, + "learning_rate": 0.00019680798259793963, + "loss": 1.1078, + "step": 4313 + }, + { + "epoch": 0.11077142320672871, + "grad_norm": 0.89453125, + "learning_rate": 0.00019680686358469415, + "loss": 0.9968, + "step": 4314 + }, + { + "epoch": 0.11079710040265053, + "grad_norm": 0.8671875, + "learning_rate": 0.00019680574437852134, + "loss": 1.0437, + "step": 4315 + }, + { + "epoch": 0.11082277759857234, + "grad_norm": 0.92578125, + "learning_rate": 0.00019680462497942343, + "loss": 1.2728, + "step": 4316 + }, + { + "epoch": 0.11084845479449416, + "grad_norm": 0.8359375, + "learning_rate": 0.00019680350538740267, + "loss": 1.1297, + "step": 4317 + }, + { + "epoch": 0.11087413199041599, + "grad_norm": 1.0390625, + "learning_rate": 0.00019680238560246122, + "loss": 1.1562, + "step": 4318 + }, + { + "epoch": 0.11089980918633781, + "grad_norm": 0.88671875, + "learning_rate": 0.0001968012656246014, + "loss": 1.0835, + "step": 4319 + }, + { + "epoch": 0.11092548638225963, + "grad_norm": 0.83984375, + "learning_rate": 0.0001968001454538254, + "loss": 1.2329, + "step": 4320 + }, + { + "epoch": 0.11095116357818144, + "grad_norm": 1.1015625, + "learning_rate": 0.00019679902509013542, + "loss": 1.0764, + "step": 4321 + }, + { + "epoch": 0.11097684077410326, + "grad_norm": 0.84375, + "learning_rate": 0.00019679790453353376, + "loss": 1.2063, + "step": 4322 + }, + { + "epoch": 0.11100251797002508, + "grad_norm": 0.89453125, + "learning_rate": 0.0001967967837840226, + "loss": 1.0785, + "step": 4323 + }, + { + "epoch": 0.1110281951659469, + "grad_norm": 0.8671875, + "learning_rate": 0.0001967956628416042, + "loss": 1.2346, + "step": 4324 + }, + { + "epoch": 0.11105387236186873, + "grad_norm": 0.84765625, + "learning_rate": 0.0001967945417062808, + "loss": 1.1748, + "step": 4325 + }, + { + "epoch": 0.11107954955779054, + "grad_norm": 0.84765625, + "learning_rate": 0.0001967934203780546, + "loss": 1.1521, + "step": 4326 + }, + { + "epoch": 0.11110522675371236, + "grad_norm": 0.75390625, + "learning_rate": 0.00019679229885692786, + "loss": 1.0934, + "step": 4327 + }, + { + "epoch": 0.11113090394963418, + "grad_norm": 0.85546875, + "learning_rate": 0.00019679117714290284, + "loss": 1.0805, + "step": 4328 + }, + { + "epoch": 0.111156581145556, + "grad_norm": 0.85546875, + "learning_rate": 0.00019679005523598174, + "loss": 1.116, + "step": 4329 + }, + { + "epoch": 0.11118225834147782, + "grad_norm": 0.9140625, + "learning_rate": 0.00019678893313616677, + "loss": 1.0481, + "step": 4330 + }, + { + "epoch": 0.11120793553739963, + "grad_norm": 0.8671875, + "learning_rate": 0.00019678781084346022, + "loss": 1.1977, + "step": 4331 + }, + { + "epoch": 0.11123361273332145, + "grad_norm": 0.8203125, + "learning_rate": 0.00019678668835786432, + "loss": 1.1401, + "step": 4332 + }, + { + "epoch": 0.11125928992924328, + "grad_norm": 0.85546875, + "learning_rate": 0.00019678556567938123, + "loss": 1.2352, + "step": 4333 + }, + { + "epoch": 0.1112849671251651, + "grad_norm": 1.0, + "learning_rate": 0.00019678444280801333, + "loss": 1.0979, + "step": 4334 + }, + { + "epoch": 0.11131064432108692, + "grad_norm": 0.83203125, + "learning_rate": 0.00019678331974376273, + "loss": 1.0854, + "step": 4335 + }, + { + "epoch": 0.11133632151700873, + "grad_norm": 0.921875, + "learning_rate": 0.00019678219648663173, + "loss": 1.0301, + "step": 4336 + }, + { + "epoch": 0.11136199871293055, + "grad_norm": 0.86328125, + "learning_rate": 0.00019678107303662255, + "loss": 1.1317, + "step": 4337 + }, + { + "epoch": 0.11138767590885237, + "grad_norm": 0.88671875, + "learning_rate": 0.00019677994939373746, + "loss": 1.2101, + "step": 4338 + }, + { + "epoch": 0.1114133531047742, + "grad_norm": 0.859375, + "learning_rate": 0.00019677882555797863, + "loss": 1.0731, + "step": 4339 + }, + { + "epoch": 0.11143903030069602, + "grad_norm": 0.81640625, + "learning_rate": 0.00019677770152934837, + "loss": 1.1332, + "step": 4340 + }, + { + "epoch": 0.11146470749661783, + "grad_norm": 0.89453125, + "learning_rate": 0.0001967765773078489, + "loss": 1.1881, + "step": 4341 + }, + { + "epoch": 0.11149038469253965, + "grad_norm": 1.0234375, + "learning_rate": 0.00019677545289348243, + "loss": 1.2111, + "step": 4342 + }, + { + "epoch": 0.11151606188846147, + "grad_norm": 0.984375, + "learning_rate": 0.00019677432828625125, + "loss": 1.1788, + "step": 4343 + }, + { + "epoch": 0.11154173908438329, + "grad_norm": 0.90234375, + "learning_rate": 0.00019677320348615755, + "loss": 1.1187, + "step": 4344 + }, + { + "epoch": 0.11156741628030512, + "grad_norm": 0.83984375, + "learning_rate": 0.00019677207849320361, + "loss": 1.0968, + "step": 4345 + }, + { + "epoch": 0.11159309347622692, + "grad_norm": 0.89453125, + "learning_rate": 0.00019677095330739166, + "loss": 0.9367, + "step": 4346 + }, + { + "epoch": 0.11161877067214875, + "grad_norm": 0.78515625, + "learning_rate": 0.00019676982792872394, + "loss": 1.1507, + "step": 4347 + }, + { + "epoch": 0.11164444786807057, + "grad_norm": 0.83984375, + "learning_rate": 0.0001967687023572027, + "loss": 1.0224, + "step": 4348 + }, + { + "epoch": 0.11167012506399239, + "grad_norm": 0.87890625, + "learning_rate": 0.00019676757659283016, + "loss": 1.2165, + "step": 4349 + }, + { + "epoch": 0.11169580225991421, + "grad_norm": 0.86328125, + "learning_rate": 0.00019676645063560857, + "loss": 1.1883, + "step": 4350 + }, + { + "epoch": 0.11172147945583602, + "grad_norm": 0.84765625, + "learning_rate": 0.00019676532448554022, + "loss": 1.1294, + "step": 4351 + }, + { + "epoch": 0.11174715665175784, + "grad_norm": 0.84765625, + "learning_rate": 0.0001967641981426273, + "loss": 1.1722, + "step": 4352 + }, + { + "epoch": 0.11177283384767966, + "grad_norm": 0.93359375, + "learning_rate": 0.0001967630716068721, + "loss": 1.1263, + "step": 4353 + }, + { + "epoch": 0.11179851104360149, + "grad_norm": 1.0390625, + "learning_rate": 0.0001967619448782768, + "loss": 1.066, + "step": 4354 + }, + { + "epoch": 0.11182418823952331, + "grad_norm": 1.265625, + "learning_rate": 0.0001967608179568437, + "loss": 1.1096, + "step": 4355 + }, + { + "epoch": 0.11184986543544512, + "grad_norm": 0.890625, + "learning_rate": 0.000196759690842575, + "loss": 1.3427, + "step": 4356 + }, + { + "epoch": 0.11187554263136694, + "grad_norm": 0.85546875, + "learning_rate": 0.00019675856353547304, + "loss": 1.1269, + "step": 4357 + }, + { + "epoch": 0.11190121982728876, + "grad_norm": 0.859375, + "learning_rate": 0.00019675743603553996, + "loss": 1.0286, + "step": 4358 + }, + { + "epoch": 0.11192689702321058, + "grad_norm": 0.8515625, + "learning_rate": 0.00019675630834277804, + "loss": 1.085, + "step": 4359 + }, + { + "epoch": 0.1119525742191324, + "grad_norm": 1.0, + "learning_rate": 0.00019675518045718957, + "loss": 1.2142, + "step": 4360 + }, + { + "epoch": 0.11197825141505421, + "grad_norm": 0.87890625, + "learning_rate": 0.00019675405237877677, + "loss": 0.9917, + "step": 4361 + }, + { + "epoch": 0.11200392861097604, + "grad_norm": 0.78125, + "learning_rate": 0.00019675292410754186, + "loss": 1.1529, + "step": 4362 + }, + { + "epoch": 0.11202960580689786, + "grad_norm": 0.875, + "learning_rate": 0.00019675179564348713, + "loss": 1.2478, + "step": 4363 + }, + { + "epoch": 0.11205528300281968, + "grad_norm": 0.83203125, + "learning_rate": 0.00019675066698661477, + "loss": 1.0844, + "step": 4364 + }, + { + "epoch": 0.1120809601987415, + "grad_norm": 0.828125, + "learning_rate": 0.0001967495381369271, + "loss": 1.1216, + "step": 4365 + }, + { + "epoch": 0.11210663739466331, + "grad_norm": 0.80078125, + "learning_rate": 0.00019674840909442637, + "loss": 1.0905, + "step": 4366 + }, + { + "epoch": 0.11213231459058513, + "grad_norm": 0.88671875, + "learning_rate": 0.00019674727985911474, + "loss": 1.2591, + "step": 4367 + }, + { + "epoch": 0.11215799178650696, + "grad_norm": 0.91796875, + "learning_rate": 0.00019674615043099457, + "loss": 1.1337, + "step": 4368 + }, + { + "epoch": 0.11218366898242878, + "grad_norm": 0.98828125, + "learning_rate": 0.00019674502081006804, + "loss": 1.3088, + "step": 4369 + }, + { + "epoch": 0.1122093461783506, + "grad_norm": 0.953125, + "learning_rate": 0.0001967438909963374, + "loss": 1.2213, + "step": 4370 + }, + { + "epoch": 0.11223502337427241, + "grad_norm": 0.890625, + "learning_rate": 0.00019674276098980497, + "loss": 1.1936, + "step": 4371 + }, + { + "epoch": 0.11226070057019423, + "grad_norm": 0.93359375, + "learning_rate": 0.00019674163079047295, + "loss": 1.0259, + "step": 4372 + }, + { + "epoch": 0.11228637776611605, + "grad_norm": 0.8125, + "learning_rate": 0.00019674050039834357, + "loss": 1.1096, + "step": 4373 + }, + { + "epoch": 0.11231205496203787, + "grad_norm": 0.92578125, + "learning_rate": 0.00019673936981341912, + "loss": 1.1379, + "step": 4374 + }, + { + "epoch": 0.1123377321579597, + "grad_norm": 0.93359375, + "learning_rate": 0.00019673823903570186, + "loss": 1.0037, + "step": 4375 + }, + { + "epoch": 0.1123634093538815, + "grad_norm": 0.86328125, + "learning_rate": 0.000196737108065194, + "loss": 1.1866, + "step": 4376 + }, + { + "epoch": 0.11238908654980333, + "grad_norm": 0.79296875, + "learning_rate": 0.00019673597690189786, + "loss": 1.1614, + "step": 4377 + }, + { + "epoch": 0.11241476374572515, + "grad_norm": 0.8828125, + "learning_rate": 0.0001967348455458156, + "loss": 1.0223, + "step": 4378 + }, + { + "epoch": 0.11244044094164697, + "grad_norm": 0.875, + "learning_rate": 0.00019673371399694958, + "loss": 1.1668, + "step": 4379 + }, + { + "epoch": 0.1124661181375688, + "grad_norm": 0.91015625, + "learning_rate": 0.00019673258225530197, + "loss": 1.0138, + "step": 4380 + }, + { + "epoch": 0.1124917953334906, + "grad_norm": 0.890625, + "learning_rate": 0.00019673145032087508, + "loss": 1.101, + "step": 4381 + }, + { + "epoch": 0.11251747252941242, + "grad_norm": 0.9453125, + "learning_rate": 0.00019673031819367117, + "loss": 1.1531, + "step": 4382 + }, + { + "epoch": 0.11254314972533425, + "grad_norm": 0.828125, + "learning_rate": 0.00019672918587369242, + "loss": 1.1534, + "step": 4383 + }, + { + "epoch": 0.11256882692125607, + "grad_norm": 0.8046875, + "learning_rate": 0.0001967280533609412, + "loss": 1.1434, + "step": 4384 + }, + { + "epoch": 0.11259450411717789, + "grad_norm": 0.86328125, + "learning_rate": 0.00019672692065541967, + "loss": 1.1098, + "step": 4385 + }, + { + "epoch": 0.1126201813130997, + "grad_norm": 0.8046875, + "learning_rate": 0.00019672578775713014, + "loss": 1.1106, + "step": 4386 + }, + { + "epoch": 0.11264585850902152, + "grad_norm": 0.86328125, + "learning_rate": 0.00019672465466607483, + "loss": 1.3226, + "step": 4387 + }, + { + "epoch": 0.11267153570494334, + "grad_norm": 0.87109375, + "learning_rate": 0.00019672352138225605, + "loss": 1.2213, + "step": 4388 + }, + { + "epoch": 0.11269721290086517, + "grad_norm": 0.91796875, + "learning_rate": 0.000196722387905676, + "loss": 1.1093, + "step": 4389 + }, + { + "epoch": 0.11272289009678697, + "grad_norm": 1.0234375, + "learning_rate": 0.000196721254236337, + "loss": 1.1128, + "step": 4390 + }, + { + "epoch": 0.1127485672927088, + "grad_norm": 0.83984375, + "learning_rate": 0.00019672012037424126, + "loss": 1.1407, + "step": 4391 + }, + { + "epoch": 0.11277424448863062, + "grad_norm": 0.859375, + "learning_rate": 0.00019671898631939104, + "loss": 1.0139, + "step": 4392 + }, + { + "epoch": 0.11279992168455244, + "grad_norm": 0.859375, + "learning_rate": 0.00019671785207178862, + "loss": 1.1739, + "step": 4393 + }, + { + "epoch": 0.11282559888047426, + "grad_norm": 0.7578125, + "learning_rate": 0.0001967167176314363, + "loss": 0.9689, + "step": 4394 + }, + { + "epoch": 0.11285127607639607, + "grad_norm": 0.859375, + "learning_rate": 0.00019671558299833627, + "loss": 1.2206, + "step": 4395 + }, + { + "epoch": 0.11287695327231789, + "grad_norm": 0.8046875, + "learning_rate": 0.0001967144481724908, + "loss": 1.1817, + "step": 4396 + }, + { + "epoch": 0.11290263046823971, + "grad_norm": 0.83203125, + "learning_rate": 0.0001967133131539022, + "loss": 1.0623, + "step": 4397 + }, + { + "epoch": 0.11292830766416154, + "grad_norm": 0.7734375, + "learning_rate": 0.0001967121779425727, + "loss": 0.9711, + "step": 4398 + }, + { + "epoch": 0.11295398486008336, + "grad_norm": 0.77734375, + "learning_rate": 0.00019671104253850453, + "loss": 1.1326, + "step": 4399 + }, + { + "epoch": 0.11297966205600517, + "grad_norm": 0.86328125, + "learning_rate": 0.00019670990694170003, + "loss": 1.26, + "step": 4400 + }, + { + "epoch": 0.11300533925192699, + "grad_norm": 0.8671875, + "learning_rate": 0.0001967087711521614, + "loss": 1.101, + "step": 4401 + }, + { + "epoch": 0.11303101644784881, + "grad_norm": 0.8671875, + "learning_rate": 0.00019670763516989095, + "loss": 1.1738, + "step": 4402 + }, + { + "epoch": 0.11305669364377063, + "grad_norm": 0.90625, + "learning_rate": 0.0001967064989948909, + "loss": 1.0968, + "step": 4403 + }, + { + "epoch": 0.11308237083969246, + "grad_norm": 0.8671875, + "learning_rate": 0.00019670536262716352, + "loss": 1.2035, + "step": 4404 + }, + { + "epoch": 0.11310804803561426, + "grad_norm": 0.8515625, + "learning_rate": 0.0001967042260667111, + "loss": 1.2, + "step": 4405 + }, + { + "epoch": 0.11313372523153609, + "grad_norm": 1.2578125, + "learning_rate": 0.0001967030893135359, + "loss": 1.1326, + "step": 4406 + }, + { + "epoch": 0.11315940242745791, + "grad_norm": 0.875, + "learning_rate": 0.00019670195236764018, + "loss": 1.0324, + "step": 4407 + }, + { + "epoch": 0.11318507962337973, + "grad_norm": 0.88671875, + "learning_rate": 0.0001967008152290262, + "loss": 1.1243, + "step": 4408 + }, + { + "epoch": 0.11321075681930155, + "grad_norm": 0.92578125, + "learning_rate": 0.0001966996778976962, + "loss": 1.2886, + "step": 4409 + }, + { + "epoch": 0.11323643401522336, + "grad_norm": 0.9140625, + "learning_rate": 0.00019669854037365251, + "loss": 1.3565, + "step": 4410 + }, + { + "epoch": 0.11326211121114518, + "grad_norm": 0.8515625, + "learning_rate": 0.00019669740265689737, + "loss": 1.0922, + "step": 4411 + }, + { + "epoch": 0.113287788407067, + "grad_norm": 0.8828125, + "learning_rate": 0.00019669626474743304, + "loss": 1.1928, + "step": 4412 + }, + { + "epoch": 0.11331346560298883, + "grad_norm": 0.91796875, + "learning_rate": 0.00019669512664526178, + "loss": 1.0842, + "step": 4413 + }, + { + "epoch": 0.11333914279891065, + "grad_norm": 0.80859375, + "learning_rate": 0.00019669398835038584, + "loss": 1.1473, + "step": 4414 + }, + { + "epoch": 0.11336481999483246, + "grad_norm": 0.890625, + "learning_rate": 0.00019669284986280754, + "loss": 1.072, + "step": 4415 + }, + { + "epoch": 0.11339049719075428, + "grad_norm": 0.90234375, + "learning_rate": 0.00019669171118252913, + "loss": 1.0855, + "step": 4416 + }, + { + "epoch": 0.1134161743866761, + "grad_norm": 0.84765625, + "learning_rate": 0.00019669057230955285, + "loss": 1.1206, + "step": 4417 + }, + { + "epoch": 0.11344185158259792, + "grad_norm": 0.90234375, + "learning_rate": 0.00019668943324388104, + "loss": 1.1287, + "step": 4418 + }, + { + "epoch": 0.11346752877851975, + "grad_norm": 0.96484375, + "learning_rate": 0.00019668829398551587, + "loss": 0.9827, + "step": 4419 + }, + { + "epoch": 0.11349320597444156, + "grad_norm": 0.84375, + "learning_rate": 0.0001966871545344597, + "loss": 1.1079, + "step": 4420 + }, + { + "epoch": 0.11351888317036338, + "grad_norm": 1.1328125, + "learning_rate": 0.00019668601489071475, + "loss": 1.1244, + "step": 4421 + }, + { + "epoch": 0.1135445603662852, + "grad_norm": 0.80078125, + "learning_rate": 0.0001966848750542833, + "loss": 1.0181, + "step": 4422 + }, + { + "epoch": 0.11357023756220702, + "grad_norm": 0.8125, + "learning_rate": 0.00019668373502516767, + "loss": 1.0085, + "step": 4423 + }, + { + "epoch": 0.11359591475812884, + "grad_norm": 0.8203125, + "learning_rate": 0.00019668259480337002, + "loss": 1.2269, + "step": 4424 + }, + { + "epoch": 0.11362159195405065, + "grad_norm": 0.98828125, + "learning_rate": 0.00019668145438889273, + "loss": 1.4377, + "step": 4425 + }, + { + "epoch": 0.11364726914997247, + "grad_norm": 0.90234375, + "learning_rate": 0.00019668031378173803, + "loss": 1.0735, + "step": 4426 + }, + { + "epoch": 0.1136729463458943, + "grad_norm": 0.9921875, + "learning_rate": 0.0001966791729819082, + "loss": 1.3712, + "step": 4427 + }, + { + "epoch": 0.11369862354181612, + "grad_norm": 0.859375, + "learning_rate": 0.00019667803198940553, + "loss": 0.9928, + "step": 4428 + }, + { + "epoch": 0.11372430073773794, + "grad_norm": 0.87109375, + "learning_rate": 0.00019667689080423223, + "loss": 1.2855, + "step": 4429 + }, + { + "epoch": 0.11374997793365975, + "grad_norm": 0.9609375, + "learning_rate": 0.00019667574942639065, + "loss": 1.0825, + "step": 4430 + }, + { + "epoch": 0.11377565512958157, + "grad_norm": 0.8984375, + "learning_rate": 0.00019667460785588304, + "loss": 1.0747, + "step": 4431 + }, + { + "epoch": 0.1138013323255034, + "grad_norm": 0.85546875, + "learning_rate": 0.00019667346609271165, + "loss": 1.0347, + "step": 4432 + }, + { + "epoch": 0.11382700952142522, + "grad_norm": 0.94140625, + "learning_rate": 0.0001966723241368788, + "loss": 1.2007, + "step": 4433 + }, + { + "epoch": 0.11385268671734704, + "grad_norm": 0.8984375, + "learning_rate": 0.00019667118198838672, + "loss": 1.0939, + "step": 4434 + }, + { + "epoch": 0.11387836391326885, + "grad_norm": 0.8046875, + "learning_rate": 0.00019667003964723772, + "loss": 1.1073, + "step": 4435 + }, + { + "epoch": 0.11390404110919067, + "grad_norm": 0.953125, + "learning_rate": 0.00019666889711343407, + "loss": 1.029, + "step": 4436 + }, + { + "epoch": 0.11392971830511249, + "grad_norm": 0.98046875, + "learning_rate": 0.000196667754386978, + "loss": 1.1443, + "step": 4437 + }, + { + "epoch": 0.11395539550103431, + "grad_norm": 0.82421875, + "learning_rate": 0.00019666661146787188, + "loss": 1.0515, + "step": 4438 + }, + { + "epoch": 0.11398107269695613, + "grad_norm": 1.015625, + "learning_rate": 0.0001966654683561179, + "loss": 1.2751, + "step": 4439 + }, + { + "epoch": 0.11400674989287794, + "grad_norm": 1.1171875, + "learning_rate": 0.00019666432505171836, + "loss": 1.1691, + "step": 4440 + }, + { + "epoch": 0.11403242708879976, + "grad_norm": 0.86328125, + "learning_rate": 0.00019666318155467556, + "loss": 1.1901, + "step": 4441 + }, + { + "epoch": 0.11405810428472159, + "grad_norm": 0.87890625, + "learning_rate": 0.0001966620378649918, + "loss": 0.9891, + "step": 4442 + }, + { + "epoch": 0.11408378148064341, + "grad_norm": 0.83984375, + "learning_rate": 0.00019666089398266928, + "loss": 1.0345, + "step": 4443 + }, + { + "epoch": 0.11410945867656523, + "grad_norm": 0.8984375, + "learning_rate": 0.00019665974990771038, + "loss": 1.0991, + "step": 4444 + }, + { + "epoch": 0.11413513587248704, + "grad_norm": 0.91796875, + "learning_rate": 0.00019665860564011732, + "loss": 1.3314, + "step": 4445 + }, + { + "epoch": 0.11416081306840886, + "grad_norm": 0.9140625, + "learning_rate": 0.00019665746117989235, + "loss": 1.1289, + "step": 4446 + }, + { + "epoch": 0.11418649026433068, + "grad_norm": 0.79296875, + "learning_rate": 0.00019665631652703784, + "loss": 1.1266, + "step": 4447 + }, + { + "epoch": 0.1142121674602525, + "grad_norm": 0.8671875, + "learning_rate": 0.00019665517168155602, + "loss": 1.0861, + "step": 4448 + }, + { + "epoch": 0.11423784465617433, + "grad_norm": 1.0390625, + "learning_rate": 0.00019665402664344916, + "loss": 1.401, + "step": 4449 + }, + { + "epoch": 0.11426352185209614, + "grad_norm": 0.921875, + "learning_rate": 0.00019665288141271954, + "loss": 1.0166, + "step": 4450 + }, + { + "epoch": 0.11428919904801796, + "grad_norm": 0.82421875, + "learning_rate": 0.0001966517359893695, + "loss": 1.0911, + "step": 4451 + }, + { + "epoch": 0.11431487624393978, + "grad_norm": 0.921875, + "learning_rate": 0.00019665059037340124, + "loss": 1.0896, + "step": 4452 + }, + { + "epoch": 0.1143405534398616, + "grad_norm": 0.9140625, + "learning_rate": 0.00019664944456481708, + "loss": 1.2043, + "step": 4453 + }, + { + "epoch": 0.11436623063578343, + "grad_norm": 0.91796875, + "learning_rate": 0.00019664829856361934, + "loss": 1.3098, + "step": 4454 + }, + { + "epoch": 0.11439190783170523, + "grad_norm": 0.8828125, + "learning_rate": 0.00019664715236981023, + "loss": 1.096, + "step": 4455 + }, + { + "epoch": 0.11441758502762706, + "grad_norm": 0.90234375, + "learning_rate": 0.00019664600598339211, + "loss": 1.1094, + "step": 4456 + }, + { + "epoch": 0.11444326222354888, + "grad_norm": 0.90234375, + "learning_rate": 0.00019664485940436722, + "loss": 1.2067, + "step": 4457 + }, + { + "epoch": 0.1144689394194707, + "grad_norm": 0.8828125, + "learning_rate": 0.00019664371263273785, + "loss": 1.2284, + "step": 4458 + }, + { + "epoch": 0.11449461661539252, + "grad_norm": 0.94140625, + "learning_rate": 0.0001966425656685063, + "loss": 1.3295, + "step": 4459 + }, + { + "epoch": 0.11452029381131433, + "grad_norm": 0.87890625, + "learning_rate": 0.00019664141851167486, + "loss": 1.2329, + "step": 4460 + }, + { + "epoch": 0.11454597100723615, + "grad_norm": 0.875, + "learning_rate": 0.00019664027116224578, + "loss": 1.1829, + "step": 4461 + }, + { + "epoch": 0.11457164820315797, + "grad_norm": 0.87109375, + "learning_rate": 0.00019663912362022137, + "loss": 0.9928, + "step": 4462 + }, + { + "epoch": 0.1145973253990798, + "grad_norm": 0.97265625, + "learning_rate": 0.0001966379758856039, + "loss": 1.1516, + "step": 4463 + }, + { + "epoch": 0.11462300259500162, + "grad_norm": 1.0703125, + "learning_rate": 0.0001966368279583957, + "loss": 1.1388, + "step": 4464 + }, + { + "epoch": 0.11464867979092343, + "grad_norm": 0.8359375, + "learning_rate": 0.00019663567983859903, + "loss": 0.9383, + "step": 4465 + }, + { + "epoch": 0.11467435698684525, + "grad_norm": 0.84765625, + "learning_rate": 0.0001966345315262162, + "loss": 1.3203, + "step": 4466 + }, + { + "epoch": 0.11470003418276707, + "grad_norm": 0.8359375, + "learning_rate": 0.00019663338302124944, + "loss": 1.0794, + "step": 4467 + }, + { + "epoch": 0.1147257113786889, + "grad_norm": 0.85546875, + "learning_rate": 0.0001966322343237011, + "loss": 1.0802, + "step": 4468 + }, + { + "epoch": 0.11475138857461072, + "grad_norm": 0.91015625, + "learning_rate": 0.0001966310854335734, + "loss": 1.0863, + "step": 4469 + }, + { + "epoch": 0.11477706577053252, + "grad_norm": 0.89453125, + "learning_rate": 0.0001966299363508687, + "loss": 1.0766, + "step": 4470 + }, + { + "epoch": 0.11480274296645435, + "grad_norm": 0.85546875, + "learning_rate": 0.00019662878707558926, + "loss": 1.0904, + "step": 4471 + }, + { + "epoch": 0.11482842016237617, + "grad_norm": 0.9921875, + "learning_rate": 0.00019662763760773741, + "loss": 1.0656, + "step": 4472 + }, + { + "epoch": 0.11485409735829799, + "grad_norm": 0.89453125, + "learning_rate": 0.0001966264879473154, + "loss": 1.1398, + "step": 4473 + }, + { + "epoch": 0.11487977455421981, + "grad_norm": 0.87890625, + "learning_rate": 0.0001966253380943255, + "loss": 1.212, + "step": 4474 + }, + { + "epoch": 0.11490545175014162, + "grad_norm": 0.921875, + "learning_rate": 0.00019662418804877006, + "loss": 1.1083, + "step": 4475 + }, + { + "epoch": 0.11493112894606344, + "grad_norm": 0.8828125, + "learning_rate": 0.00019662303781065133, + "loss": 1.078, + "step": 4476 + }, + { + "epoch": 0.11495680614198527, + "grad_norm": 0.78125, + "learning_rate": 0.00019662188737997157, + "loss": 1.1017, + "step": 4477 + }, + { + "epoch": 0.11498248333790709, + "grad_norm": 0.8515625, + "learning_rate": 0.00019662073675673317, + "loss": 1.1055, + "step": 4478 + }, + { + "epoch": 0.11500816053382891, + "grad_norm": 0.84765625, + "learning_rate": 0.00019661958594093836, + "loss": 1.0462, + "step": 4479 + }, + { + "epoch": 0.11503383772975072, + "grad_norm": 0.82421875, + "learning_rate": 0.00019661843493258942, + "loss": 1.1504, + "step": 4480 + }, + { + "epoch": 0.11505951492567254, + "grad_norm": 0.890625, + "learning_rate": 0.0001966172837316887, + "loss": 1.1799, + "step": 4481 + }, + { + "epoch": 0.11508519212159436, + "grad_norm": 0.89453125, + "learning_rate": 0.00019661613233823845, + "loss": 1.1978, + "step": 4482 + }, + { + "epoch": 0.11511086931751618, + "grad_norm": 0.82421875, + "learning_rate": 0.00019661498075224096, + "loss": 1.0961, + "step": 4483 + }, + { + "epoch": 0.115136546513438, + "grad_norm": 0.85546875, + "learning_rate": 0.00019661382897369855, + "loss": 1.1549, + "step": 4484 + }, + { + "epoch": 0.11516222370935982, + "grad_norm": 0.9296875, + "learning_rate": 0.0001966126770026135, + "loss": 1.2465, + "step": 4485 + }, + { + "epoch": 0.11518790090528164, + "grad_norm": 0.80078125, + "learning_rate": 0.00019661152483898813, + "loss": 0.987, + "step": 4486 + }, + { + "epoch": 0.11521357810120346, + "grad_norm": 0.82421875, + "learning_rate": 0.0001966103724828247, + "loss": 1.0413, + "step": 4487 + }, + { + "epoch": 0.11523925529712528, + "grad_norm": 0.8359375, + "learning_rate": 0.00019660921993412553, + "loss": 1.0656, + "step": 4488 + }, + { + "epoch": 0.1152649324930471, + "grad_norm": 0.875, + "learning_rate": 0.00019660806719289293, + "loss": 1.313, + "step": 4489 + }, + { + "epoch": 0.11529060968896891, + "grad_norm": 0.77734375, + "learning_rate": 0.00019660691425912915, + "loss": 1.0155, + "step": 4490 + }, + { + "epoch": 0.11531628688489073, + "grad_norm": 0.90625, + "learning_rate": 0.0001966057611328365, + "loss": 1.2718, + "step": 4491 + }, + { + "epoch": 0.11534196408081256, + "grad_norm": 0.8828125, + "learning_rate": 0.00019660460781401734, + "loss": 1.143, + "step": 4492 + }, + { + "epoch": 0.11536764127673438, + "grad_norm": 0.91015625, + "learning_rate": 0.0001966034543026739, + "loss": 1.2992, + "step": 4493 + }, + { + "epoch": 0.11539331847265619, + "grad_norm": 0.84375, + "learning_rate": 0.0001966023005988085, + "loss": 1.2056, + "step": 4494 + }, + { + "epoch": 0.11541899566857801, + "grad_norm": 0.84765625, + "learning_rate": 0.00019660114670242345, + "loss": 1.0985, + "step": 4495 + }, + { + "epoch": 0.11544467286449983, + "grad_norm": 0.8359375, + "learning_rate": 0.00019659999261352105, + "loss": 1.129, + "step": 4496 + }, + { + "epoch": 0.11547035006042165, + "grad_norm": 0.85546875, + "learning_rate": 0.00019659883833210358, + "loss": 1.0225, + "step": 4497 + }, + { + "epoch": 0.11549602725634348, + "grad_norm": 0.921875, + "learning_rate": 0.00019659768385817334, + "loss": 1.2011, + "step": 4498 + }, + { + "epoch": 0.11552170445226528, + "grad_norm": 0.91796875, + "learning_rate": 0.00019659652919173267, + "loss": 1.268, + "step": 4499 + }, + { + "epoch": 0.1155473816481871, + "grad_norm": 0.7890625, + "learning_rate": 0.0001965953743327838, + "loss": 1.0652, + "step": 4500 + }, + { + "epoch": 0.11557305884410893, + "grad_norm": 0.82421875, + "learning_rate": 0.00019659421928132912, + "loss": 1.123, + "step": 4501 + }, + { + "epoch": 0.11559873604003075, + "grad_norm": 0.86328125, + "learning_rate": 0.00019659306403737084, + "loss": 1.1257, + "step": 4502 + }, + { + "epoch": 0.11562441323595257, + "grad_norm": 0.86328125, + "learning_rate": 0.00019659190860091134, + "loss": 1.131, + "step": 4503 + }, + { + "epoch": 0.11565009043187438, + "grad_norm": 0.89453125, + "learning_rate": 0.0001965907529719529, + "loss": 1.1464, + "step": 4504 + }, + { + "epoch": 0.1156757676277962, + "grad_norm": 0.875, + "learning_rate": 0.00019658959715049778, + "loss": 1.2847, + "step": 4505 + }, + { + "epoch": 0.11570144482371802, + "grad_norm": 0.8828125, + "learning_rate": 0.00019658844113654836, + "loss": 1.0838, + "step": 4506 + }, + { + "epoch": 0.11572712201963985, + "grad_norm": 0.8671875, + "learning_rate": 0.00019658728493010685, + "loss": 1.2122, + "step": 4507 + }, + { + "epoch": 0.11575279921556167, + "grad_norm": 0.8203125, + "learning_rate": 0.00019658612853117564, + "loss": 1.2119, + "step": 4508 + }, + { + "epoch": 0.11577847641148348, + "grad_norm": 0.87890625, + "learning_rate": 0.000196584971939757, + "loss": 1.1499, + "step": 4509 + }, + { + "epoch": 0.1158041536074053, + "grad_norm": 0.83984375, + "learning_rate": 0.00019658381515585323, + "loss": 0.9901, + "step": 4510 + }, + { + "epoch": 0.11582983080332712, + "grad_norm": 0.95703125, + "learning_rate": 0.00019658265817946663, + "loss": 1.092, + "step": 4511 + }, + { + "epoch": 0.11585550799924894, + "grad_norm": 0.78125, + "learning_rate": 0.00019658150101059952, + "loss": 1.0425, + "step": 4512 + }, + { + "epoch": 0.11588118519517077, + "grad_norm": 0.84375, + "learning_rate": 0.00019658034364925423, + "loss": 1.0898, + "step": 4513 + }, + { + "epoch": 0.11590686239109257, + "grad_norm": 0.84375, + "learning_rate": 0.000196579186095433, + "loss": 1.1453, + "step": 4514 + }, + { + "epoch": 0.1159325395870144, + "grad_norm": 0.91015625, + "learning_rate": 0.0001965780283491382, + "loss": 1.2162, + "step": 4515 + }, + { + "epoch": 0.11595821678293622, + "grad_norm": 0.8046875, + "learning_rate": 0.0001965768704103721, + "loss": 1.0376, + "step": 4516 + }, + { + "epoch": 0.11598389397885804, + "grad_norm": 0.80859375, + "learning_rate": 0.00019657571227913706, + "loss": 1.0301, + "step": 4517 + }, + { + "epoch": 0.11600957117477986, + "grad_norm": 0.9140625, + "learning_rate": 0.0001965745539554353, + "loss": 1.2176, + "step": 4518 + }, + { + "epoch": 0.11603524837070167, + "grad_norm": 0.84375, + "learning_rate": 0.00019657339543926918, + "loss": 1.103, + "step": 4519 + }, + { + "epoch": 0.1160609255666235, + "grad_norm": 0.81640625, + "learning_rate": 0.00019657223673064103, + "loss": 1.2105, + "step": 4520 + }, + { + "epoch": 0.11608660276254532, + "grad_norm": 0.88671875, + "learning_rate": 0.00019657107782955312, + "loss": 1.1352, + "step": 4521 + }, + { + "epoch": 0.11611227995846714, + "grad_norm": 0.96875, + "learning_rate": 0.0001965699187360078, + "loss": 1.3183, + "step": 4522 + }, + { + "epoch": 0.11613795715438896, + "grad_norm": 0.8359375, + "learning_rate": 0.00019656875945000732, + "loss": 1.2554, + "step": 4523 + }, + { + "epoch": 0.11616363435031077, + "grad_norm": 0.78515625, + "learning_rate": 0.00019656759997155405, + "loss": 1.1059, + "step": 4524 + }, + { + "epoch": 0.11618931154623259, + "grad_norm": 0.8671875, + "learning_rate": 0.00019656644030065028, + "loss": 1.181, + "step": 4525 + }, + { + "epoch": 0.11621498874215441, + "grad_norm": 0.92578125, + "learning_rate": 0.0001965652804372983, + "loss": 1.1281, + "step": 4526 + }, + { + "epoch": 0.11624066593807623, + "grad_norm": 0.890625, + "learning_rate": 0.00019656412038150043, + "loss": 1.1935, + "step": 4527 + }, + { + "epoch": 0.11626634313399806, + "grad_norm": 0.81640625, + "learning_rate": 0.000196562960133259, + "loss": 1.1323, + "step": 4528 + }, + { + "epoch": 0.11629202032991987, + "grad_norm": 0.83203125, + "learning_rate": 0.00019656179969257634, + "loss": 1.2133, + "step": 4529 + }, + { + "epoch": 0.11631769752584169, + "grad_norm": 0.8828125, + "learning_rate": 0.00019656063905945472, + "loss": 1.2033, + "step": 4530 + }, + { + "epoch": 0.11634337472176351, + "grad_norm": 0.8359375, + "learning_rate": 0.00019655947823389645, + "loss": 1.3328, + "step": 4531 + }, + { + "epoch": 0.11636905191768533, + "grad_norm": 0.8671875, + "learning_rate": 0.00019655831721590388, + "loss": 1.1042, + "step": 4532 + }, + { + "epoch": 0.11639472911360715, + "grad_norm": 0.87109375, + "learning_rate": 0.0001965571560054793, + "loss": 1.1009, + "step": 4533 + }, + { + "epoch": 0.11642040630952896, + "grad_norm": 0.83984375, + "learning_rate": 0.000196555994602625, + "loss": 1.094, + "step": 4534 + }, + { + "epoch": 0.11644608350545078, + "grad_norm": 0.8046875, + "learning_rate": 0.00019655483300734338, + "loss": 1.1352, + "step": 4535 + }, + { + "epoch": 0.1164717607013726, + "grad_norm": 0.7734375, + "learning_rate": 0.00019655367121963668, + "loss": 0.9495, + "step": 4536 + }, + { + "epoch": 0.11649743789729443, + "grad_norm": 0.80078125, + "learning_rate": 0.00019655250923950724, + "loss": 1.1809, + "step": 4537 + }, + { + "epoch": 0.11652311509321625, + "grad_norm": 0.8125, + "learning_rate": 0.00019655134706695734, + "loss": 1.1889, + "step": 4538 + }, + { + "epoch": 0.11654879228913806, + "grad_norm": 0.88671875, + "learning_rate": 0.00019655018470198936, + "loss": 1.0143, + "step": 4539 + }, + { + "epoch": 0.11657446948505988, + "grad_norm": 0.92578125, + "learning_rate": 0.00019654902214460558, + "loss": 1.3549, + "step": 4540 + }, + { + "epoch": 0.1166001466809817, + "grad_norm": 0.80859375, + "learning_rate": 0.0001965478593948083, + "loss": 1.1045, + "step": 4541 + }, + { + "epoch": 0.11662582387690353, + "grad_norm": 0.84765625, + "learning_rate": 0.00019654669645259985, + "loss": 1.17, + "step": 4542 + }, + { + "epoch": 0.11665150107282535, + "grad_norm": 0.94140625, + "learning_rate": 0.00019654553331798259, + "loss": 1.2805, + "step": 4543 + }, + { + "epoch": 0.11667717826874716, + "grad_norm": 0.890625, + "learning_rate": 0.0001965443699909588, + "loss": 1.293, + "step": 4544 + }, + { + "epoch": 0.11670285546466898, + "grad_norm": 0.86328125, + "learning_rate": 0.00019654320647153077, + "loss": 1.1195, + "step": 4545 + }, + { + "epoch": 0.1167285326605908, + "grad_norm": 0.7421875, + "learning_rate": 0.00019654204275970088, + "loss": 1.114, + "step": 4546 + }, + { + "epoch": 0.11675420985651262, + "grad_norm": 0.90625, + "learning_rate": 0.0001965408788554714, + "loss": 1.1609, + "step": 4547 + }, + { + "epoch": 0.11677988705243444, + "grad_norm": 0.91015625, + "learning_rate": 0.00019653971475884466, + "loss": 1.1413, + "step": 4548 + }, + { + "epoch": 0.11680556424835625, + "grad_norm": 0.94140625, + "learning_rate": 0.00019653855046982303, + "loss": 1.2011, + "step": 4549 + }, + { + "epoch": 0.11683124144427808, + "grad_norm": 0.87890625, + "learning_rate": 0.00019653738598840876, + "loss": 1.1532, + "step": 4550 + }, + { + "epoch": 0.1168569186401999, + "grad_norm": 0.80078125, + "learning_rate": 0.0001965362213146042, + "loss": 1.0138, + "step": 4551 + }, + { + "epoch": 0.11688259583612172, + "grad_norm": 0.859375, + "learning_rate": 0.0001965350564484117, + "loss": 1.1257, + "step": 4552 + }, + { + "epoch": 0.11690827303204354, + "grad_norm": 0.83984375, + "learning_rate": 0.00019653389138983348, + "loss": 1.2371, + "step": 4553 + }, + { + "epoch": 0.11693395022796535, + "grad_norm": 0.8203125, + "learning_rate": 0.00019653272613887198, + "loss": 1.1708, + "step": 4554 + }, + { + "epoch": 0.11695962742388717, + "grad_norm": 0.87109375, + "learning_rate": 0.00019653156069552948, + "loss": 1.2279, + "step": 4555 + }, + { + "epoch": 0.116985304619809, + "grad_norm": 0.8828125, + "learning_rate": 0.00019653039505980832, + "loss": 1.264, + "step": 4556 + }, + { + "epoch": 0.11701098181573082, + "grad_norm": 0.8515625, + "learning_rate": 0.00019652922923171076, + "loss": 1.1824, + "step": 4557 + }, + { + "epoch": 0.11703665901165264, + "grad_norm": 0.84765625, + "learning_rate": 0.0001965280632112392, + "loss": 1.0857, + "step": 4558 + }, + { + "epoch": 0.11706233620757445, + "grad_norm": 0.90625, + "learning_rate": 0.0001965268969983959, + "loss": 1.3233, + "step": 4559 + }, + { + "epoch": 0.11708801340349627, + "grad_norm": 0.9921875, + "learning_rate": 0.00019652573059318325, + "loss": 1.1407, + "step": 4560 + }, + { + "epoch": 0.11711369059941809, + "grad_norm": 0.890625, + "learning_rate": 0.0001965245639956035, + "loss": 1.0737, + "step": 4561 + }, + { + "epoch": 0.11713936779533991, + "grad_norm": 0.90234375, + "learning_rate": 0.00019652339720565903, + "loss": 1.1253, + "step": 4562 + }, + { + "epoch": 0.11716504499126174, + "grad_norm": 0.8515625, + "learning_rate": 0.00019652223022335216, + "loss": 0.9734, + "step": 4563 + }, + { + "epoch": 0.11719072218718354, + "grad_norm": 0.9140625, + "learning_rate": 0.00019652106304868518, + "loss": 1.3048, + "step": 4564 + }, + { + "epoch": 0.11721639938310537, + "grad_norm": 0.8828125, + "learning_rate": 0.00019651989568166047, + "loss": 1.2064, + "step": 4565 + }, + { + "epoch": 0.11724207657902719, + "grad_norm": 0.80078125, + "learning_rate": 0.0001965187281222803, + "loss": 1.0314, + "step": 4566 + }, + { + "epoch": 0.11726775377494901, + "grad_norm": 0.83203125, + "learning_rate": 0.00019651756037054703, + "loss": 1.1929, + "step": 4567 + }, + { + "epoch": 0.11729343097087083, + "grad_norm": 0.89453125, + "learning_rate": 0.00019651639242646298, + "loss": 1.2844, + "step": 4568 + }, + { + "epoch": 0.11731910816679264, + "grad_norm": 0.8046875, + "learning_rate": 0.00019651522429003048, + "loss": 1.075, + "step": 4569 + }, + { + "epoch": 0.11734478536271446, + "grad_norm": 0.8125, + "learning_rate": 0.00019651405596125186, + "loss": 1.2406, + "step": 4570 + }, + { + "epoch": 0.11737046255863628, + "grad_norm": 0.8203125, + "learning_rate": 0.00019651288744012944, + "loss": 1.0709, + "step": 4571 + }, + { + "epoch": 0.11739613975455811, + "grad_norm": 0.91796875, + "learning_rate": 0.00019651171872666555, + "loss": 1.1167, + "step": 4572 + }, + { + "epoch": 0.11742181695047993, + "grad_norm": 0.85546875, + "learning_rate": 0.00019651054982086253, + "loss": 0.9901, + "step": 4573 + }, + { + "epoch": 0.11744749414640174, + "grad_norm": 0.7578125, + "learning_rate": 0.0001965093807227227, + "loss": 1.0638, + "step": 4574 + }, + { + "epoch": 0.11747317134232356, + "grad_norm": 0.83984375, + "learning_rate": 0.0001965082114322484, + "loss": 1.1583, + "step": 4575 + }, + { + "epoch": 0.11749884853824538, + "grad_norm": 1.2890625, + "learning_rate": 0.00019650704194944192, + "loss": 1.2706, + "step": 4576 + }, + { + "epoch": 0.1175245257341672, + "grad_norm": 1.296875, + "learning_rate": 0.00019650587227430566, + "loss": 1.1636, + "step": 4577 + }, + { + "epoch": 0.11755020293008903, + "grad_norm": 0.8046875, + "learning_rate": 0.00019650470240684188, + "loss": 1.3368, + "step": 4578 + }, + { + "epoch": 0.11757588012601083, + "grad_norm": 0.7578125, + "learning_rate": 0.000196503532347053, + "loss": 0.9642, + "step": 4579 + }, + { + "epoch": 0.11760155732193266, + "grad_norm": 0.9140625, + "learning_rate": 0.00019650236209494123, + "loss": 1.2445, + "step": 4580 + }, + { + "epoch": 0.11762723451785448, + "grad_norm": 0.80078125, + "learning_rate": 0.00019650119165050902, + "loss": 1.1156, + "step": 4581 + }, + { + "epoch": 0.1176529117137763, + "grad_norm": 0.78515625, + "learning_rate": 0.00019650002101375864, + "loss": 0.9912, + "step": 4582 + }, + { + "epoch": 0.11767858890969812, + "grad_norm": 0.88671875, + "learning_rate": 0.0001964988501846924, + "loss": 1.0878, + "step": 4583 + }, + { + "epoch": 0.11770426610561993, + "grad_norm": 0.84375, + "learning_rate": 0.00019649767916331273, + "loss": 1.0455, + "step": 4584 + }, + { + "epoch": 0.11772994330154175, + "grad_norm": 0.8828125, + "learning_rate": 0.00019649650794962184, + "loss": 1.3307, + "step": 4585 + }, + { + "epoch": 0.11775562049746358, + "grad_norm": 0.82421875, + "learning_rate": 0.0001964953365436222, + "loss": 0.9464, + "step": 4586 + }, + { + "epoch": 0.1177812976933854, + "grad_norm": 0.78125, + "learning_rate": 0.00019649416494531597, + "loss": 1.086, + "step": 4587 + }, + { + "epoch": 0.11780697488930722, + "grad_norm": 0.90625, + "learning_rate": 0.00019649299315470565, + "loss": 1.2387, + "step": 4588 + }, + { + "epoch": 0.11783265208522903, + "grad_norm": 0.84765625, + "learning_rate": 0.0001964918211717935, + "loss": 1.2032, + "step": 4589 + }, + { + "epoch": 0.11785832928115085, + "grad_norm": 0.84765625, + "learning_rate": 0.00019649064899658188, + "loss": 1.1448, + "step": 4590 + }, + { + "epoch": 0.11788400647707267, + "grad_norm": 0.8125, + "learning_rate": 0.00019648947662907312, + "loss": 1.1207, + "step": 4591 + }, + { + "epoch": 0.1179096836729945, + "grad_norm": 0.8515625, + "learning_rate": 0.0001964883040692695, + "loss": 1.1616, + "step": 4592 + }, + { + "epoch": 0.11793536086891632, + "grad_norm": 0.85546875, + "learning_rate": 0.00019648713131717343, + "loss": 1.1442, + "step": 4593 + }, + { + "epoch": 0.11796103806483813, + "grad_norm": 0.8125, + "learning_rate": 0.00019648595837278726, + "loss": 1.2898, + "step": 4594 + }, + { + "epoch": 0.11798671526075995, + "grad_norm": 0.86328125, + "learning_rate": 0.00019648478523611324, + "loss": 1.2665, + "step": 4595 + }, + { + "epoch": 0.11801239245668177, + "grad_norm": 0.8203125, + "learning_rate": 0.00019648361190715375, + "loss": 1.0897, + "step": 4596 + }, + { + "epoch": 0.11803806965260359, + "grad_norm": 0.87109375, + "learning_rate": 0.00019648243838591117, + "loss": 1.2045, + "step": 4597 + }, + { + "epoch": 0.1180637468485254, + "grad_norm": 0.8828125, + "learning_rate": 0.00019648126467238783, + "loss": 1.2847, + "step": 4598 + }, + { + "epoch": 0.11808942404444722, + "grad_norm": 0.87109375, + "learning_rate": 0.000196480090766586, + "loss": 1.1481, + "step": 4599 + }, + { + "epoch": 0.11811510124036904, + "grad_norm": 0.87890625, + "learning_rate": 0.0001964789166685081, + "loss": 1.197, + "step": 4600 + }, + { + "epoch": 0.11814077843629087, + "grad_norm": 0.91015625, + "learning_rate": 0.0001964777423781564, + "loss": 1.1816, + "step": 4601 + }, + { + "epoch": 0.11816645563221269, + "grad_norm": 0.890625, + "learning_rate": 0.00019647656789553329, + "loss": 1.2544, + "step": 4602 + }, + { + "epoch": 0.1181921328281345, + "grad_norm": 0.8515625, + "learning_rate": 0.00019647539322064107, + "loss": 1.1822, + "step": 4603 + }, + { + "epoch": 0.11821781002405632, + "grad_norm": 0.90625, + "learning_rate": 0.00019647421835348214, + "loss": 1.102, + "step": 4604 + }, + { + "epoch": 0.11824348721997814, + "grad_norm": 0.859375, + "learning_rate": 0.00019647304329405878, + "loss": 1.0123, + "step": 4605 + }, + { + "epoch": 0.11826916441589996, + "grad_norm": 0.97265625, + "learning_rate": 0.0001964718680423734, + "loss": 1.2426, + "step": 4606 + }, + { + "epoch": 0.11829484161182179, + "grad_norm": 0.953125, + "learning_rate": 0.00019647069259842828, + "loss": 1.19, + "step": 4607 + }, + { + "epoch": 0.1183205188077436, + "grad_norm": 0.87109375, + "learning_rate": 0.00019646951696222575, + "loss": 1.0298, + "step": 4608 + }, + { + "epoch": 0.11834619600366542, + "grad_norm": 0.78515625, + "learning_rate": 0.00019646834113376822, + "loss": 0.9986, + "step": 4609 + }, + { + "epoch": 0.11837187319958724, + "grad_norm": 0.85546875, + "learning_rate": 0.000196467165113058, + "loss": 1.0611, + "step": 4610 + }, + { + "epoch": 0.11839755039550906, + "grad_norm": 0.98046875, + "learning_rate": 0.00019646598890009744, + "loss": 1.1771, + "step": 4611 + }, + { + "epoch": 0.11842322759143088, + "grad_norm": 0.88671875, + "learning_rate": 0.00019646481249488887, + "loss": 1.1579, + "step": 4612 + }, + { + "epoch": 0.11844890478735269, + "grad_norm": 0.93359375, + "learning_rate": 0.00019646363589743464, + "loss": 1.124, + "step": 4613 + }, + { + "epoch": 0.11847458198327451, + "grad_norm": 0.921875, + "learning_rate": 0.00019646245910773707, + "loss": 1.1439, + "step": 4614 + }, + { + "epoch": 0.11850025917919634, + "grad_norm": 0.890625, + "learning_rate": 0.00019646128212579857, + "loss": 1.1502, + "step": 4615 + }, + { + "epoch": 0.11852593637511816, + "grad_norm": 0.87109375, + "learning_rate": 0.00019646010495162145, + "loss": 0.9812, + "step": 4616 + }, + { + "epoch": 0.11855161357103998, + "grad_norm": 0.875, + "learning_rate": 0.00019645892758520802, + "loss": 1.1556, + "step": 4617 + }, + { + "epoch": 0.11857729076696179, + "grad_norm": 0.86328125, + "learning_rate": 0.0001964577500265607, + "loss": 1.1621, + "step": 4618 + }, + { + "epoch": 0.11860296796288361, + "grad_norm": 0.79296875, + "learning_rate": 0.00019645657227568178, + "loss": 1.1799, + "step": 4619 + }, + { + "epoch": 0.11862864515880543, + "grad_norm": 0.85546875, + "learning_rate": 0.00019645539433257362, + "loss": 0.9683, + "step": 4620 + }, + { + "epoch": 0.11865432235472725, + "grad_norm": 0.83203125, + "learning_rate": 0.00019645421619723856, + "loss": 1.1017, + "step": 4621 + }, + { + "epoch": 0.11867999955064908, + "grad_norm": 0.90625, + "learning_rate": 0.000196453037869679, + "loss": 1.197, + "step": 4622 + }, + { + "epoch": 0.11870567674657088, + "grad_norm": 0.8828125, + "learning_rate": 0.0001964518593498972, + "loss": 0.9648, + "step": 4623 + }, + { + "epoch": 0.1187313539424927, + "grad_norm": 0.875, + "learning_rate": 0.00019645068063789558, + "loss": 1.1391, + "step": 4624 + }, + { + "epoch": 0.11875703113841453, + "grad_norm": 0.87890625, + "learning_rate": 0.00019644950173367649, + "loss": 1.178, + "step": 4625 + }, + { + "epoch": 0.11878270833433635, + "grad_norm": 0.91015625, + "learning_rate": 0.00019644832263724223, + "loss": 1.1734, + "step": 4626 + }, + { + "epoch": 0.11880838553025817, + "grad_norm": 0.87109375, + "learning_rate": 0.00019644714334859516, + "loss": 1.224, + "step": 4627 + }, + { + "epoch": 0.11883406272617998, + "grad_norm": 0.84375, + "learning_rate": 0.00019644596386773767, + "loss": 1.1104, + "step": 4628 + }, + { + "epoch": 0.1188597399221018, + "grad_norm": 0.79296875, + "learning_rate": 0.00019644478419467208, + "loss": 1.1939, + "step": 4629 + }, + { + "epoch": 0.11888541711802363, + "grad_norm": 0.8828125, + "learning_rate": 0.00019644360432940073, + "loss": 1.0348, + "step": 4630 + }, + { + "epoch": 0.11891109431394545, + "grad_norm": 0.8203125, + "learning_rate": 0.000196442424271926, + "loss": 1.2221, + "step": 4631 + }, + { + "epoch": 0.11893677150986727, + "grad_norm": 0.85546875, + "learning_rate": 0.00019644124402225024, + "loss": 0.9793, + "step": 4632 + }, + { + "epoch": 0.11896244870578908, + "grad_norm": 0.83203125, + "learning_rate": 0.00019644006358037576, + "loss": 1.2145, + "step": 4633 + }, + { + "epoch": 0.1189881259017109, + "grad_norm": 0.9140625, + "learning_rate": 0.000196438882946305, + "loss": 1.1688, + "step": 4634 + }, + { + "epoch": 0.11901380309763272, + "grad_norm": 0.84375, + "learning_rate": 0.0001964377021200402, + "loss": 0.9982, + "step": 4635 + }, + { + "epoch": 0.11903948029355454, + "grad_norm": 0.90625, + "learning_rate": 0.00019643652110158381, + "loss": 1.298, + "step": 4636 + }, + { + "epoch": 0.11906515748947637, + "grad_norm": 0.87890625, + "learning_rate": 0.0001964353398909381, + "loss": 1.1926, + "step": 4637 + }, + { + "epoch": 0.11909083468539818, + "grad_norm": 0.8828125, + "learning_rate": 0.00019643415848810553, + "loss": 1.2176, + "step": 4638 + }, + { + "epoch": 0.11911651188132, + "grad_norm": 0.81640625, + "learning_rate": 0.00019643297689308836, + "loss": 1.0444, + "step": 4639 + }, + { + "epoch": 0.11914218907724182, + "grad_norm": 0.8359375, + "learning_rate": 0.00019643179510588898, + "loss": 1.1821, + "step": 4640 + }, + { + "epoch": 0.11916786627316364, + "grad_norm": 0.953125, + "learning_rate": 0.00019643061312650976, + "loss": 1.3395, + "step": 4641 + }, + { + "epoch": 0.11919354346908546, + "grad_norm": 0.83203125, + "learning_rate": 0.00019642943095495304, + "loss": 1.1629, + "step": 4642 + }, + { + "epoch": 0.11921922066500727, + "grad_norm": 0.9375, + "learning_rate": 0.00019642824859122113, + "loss": 1.2726, + "step": 4643 + }, + { + "epoch": 0.1192448978609291, + "grad_norm": 0.84375, + "learning_rate": 0.00019642706603531645, + "loss": 1.2212, + "step": 4644 + }, + { + "epoch": 0.11927057505685092, + "grad_norm": 0.90234375, + "learning_rate": 0.00019642588328724135, + "loss": 1.1225, + "step": 4645 + }, + { + "epoch": 0.11929625225277274, + "grad_norm": 0.8671875, + "learning_rate": 0.00019642470034699817, + "loss": 1.1385, + "step": 4646 + }, + { + "epoch": 0.11932192944869456, + "grad_norm": 0.921875, + "learning_rate": 0.00019642351721458926, + "loss": 1.1913, + "step": 4647 + }, + { + "epoch": 0.11934760664461637, + "grad_norm": 0.7890625, + "learning_rate": 0.00019642233389001703, + "loss": 1.096, + "step": 4648 + }, + { + "epoch": 0.11937328384053819, + "grad_norm": 0.80859375, + "learning_rate": 0.00019642115037328377, + "loss": 1.1037, + "step": 4649 + }, + { + "epoch": 0.11939896103646001, + "grad_norm": 0.87109375, + "learning_rate": 0.00019641996666439187, + "loss": 1.1913, + "step": 4650 + }, + { + "epoch": 0.11942463823238184, + "grad_norm": 0.8359375, + "learning_rate": 0.0001964187827633437, + "loss": 1.1182, + "step": 4651 + }, + { + "epoch": 0.11945031542830366, + "grad_norm": 0.90234375, + "learning_rate": 0.0001964175986701416, + "loss": 1.3625, + "step": 4652 + }, + { + "epoch": 0.11947599262422547, + "grad_norm": 0.93359375, + "learning_rate": 0.0001964164143847879, + "loss": 1.0771, + "step": 4653 + }, + { + "epoch": 0.11950166982014729, + "grad_norm": 0.875, + "learning_rate": 0.000196415229907285, + "loss": 1.1757, + "step": 4654 + }, + { + "epoch": 0.11952734701606911, + "grad_norm": 0.93359375, + "learning_rate": 0.00019641404523763529, + "loss": 1.2092, + "step": 4655 + }, + { + "epoch": 0.11955302421199093, + "grad_norm": 0.8515625, + "learning_rate": 0.00019641286037584108, + "loss": 1.269, + "step": 4656 + }, + { + "epoch": 0.11957870140791275, + "grad_norm": 0.8828125, + "learning_rate": 0.00019641167532190475, + "loss": 1.2182, + "step": 4657 + }, + { + "epoch": 0.11960437860383456, + "grad_norm": 0.8046875, + "learning_rate": 0.00019641049007582866, + "loss": 1.2165, + "step": 4658 + }, + { + "epoch": 0.11963005579975639, + "grad_norm": 0.82421875, + "learning_rate": 0.00019640930463761517, + "loss": 1.1502, + "step": 4659 + }, + { + "epoch": 0.11965573299567821, + "grad_norm": 0.77734375, + "learning_rate": 0.00019640811900726665, + "loss": 1.216, + "step": 4660 + }, + { + "epoch": 0.11968141019160003, + "grad_norm": 0.87890625, + "learning_rate": 0.00019640693318478546, + "loss": 1.1196, + "step": 4661 + }, + { + "epoch": 0.11970708738752185, + "grad_norm": 0.75, + "learning_rate": 0.00019640574717017396, + "loss": 1.0478, + "step": 4662 + }, + { + "epoch": 0.11973276458344366, + "grad_norm": 0.82421875, + "learning_rate": 0.00019640456096343452, + "loss": 1.1307, + "step": 4663 + }, + { + "epoch": 0.11975844177936548, + "grad_norm": 0.9375, + "learning_rate": 0.0001964033745645695, + "loss": 1.1884, + "step": 4664 + }, + { + "epoch": 0.1197841189752873, + "grad_norm": 0.87890625, + "learning_rate": 0.00019640218797358123, + "loss": 1.1349, + "step": 4665 + }, + { + "epoch": 0.11980979617120913, + "grad_norm": 1.234375, + "learning_rate": 0.00019640100119047214, + "loss": 1.1094, + "step": 4666 + }, + { + "epoch": 0.11983547336713095, + "grad_norm": 0.890625, + "learning_rate": 0.00019639981421524453, + "loss": 1.1159, + "step": 4667 + }, + { + "epoch": 0.11986115056305276, + "grad_norm": 0.88671875, + "learning_rate": 0.00019639862704790085, + "loss": 1.2426, + "step": 4668 + }, + { + "epoch": 0.11988682775897458, + "grad_norm": 0.85546875, + "learning_rate": 0.00019639743968844338, + "loss": 1.0214, + "step": 4669 + }, + { + "epoch": 0.1199125049548964, + "grad_norm": 0.859375, + "learning_rate": 0.00019639625213687452, + "loss": 1.0353, + "step": 4670 + }, + { + "epoch": 0.11993818215081822, + "grad_norm": 0.7734375, + "learning_rate": 0.00019639506439319662, + "loss": 1.0138, + "step": 4671 + }, + { + "epoch": 0.11996385934674005, + "grad_norm": 0.8828125, + "learning_rate": 0.00019639387645741207, + "loss": 1.1544, + "step": 4672 + }, + { + "epoch": 0.11998953654266185, + "grad_norm": 0.8984375, + "learning_rate": 0.00019639268832952325, + "loss": 1.1938, + "step": 4673 + }, + { + "epoch": 0.12001521373858368, + "grad_norm": 0.828125, + "learning_rate": 0.00019639150000953253, + "loss": 1.0858, + "step": 4674 + }, + { + "epoch": 0.1200408909345055, + "grad_norm": 0.8125, + "learning_rate": 0.00019639031149744222, + "loss": 1.0626, + "step": 4675 + }, + { + "epoch": 0.12006656813042732, + "grad_norm": 0.8203125, + "learning_rate": 0.00019638912279325475, + "loss": 1.0218, + "step": 4676 + }, + { + "epoch": 0.12009224532634914, + "grad_norm": 1.0625, + "learning_rate": 0.00019638793389697244, + "loss": 1.4618, + "step": 4677 + }, + { + "epoch": 0.12011792252227095, + "grad_norm": 0.859375, + "learning_rate": 0.0001963867448085977, + "loss": 1.1097, + "step": 4678 + }, + { + "epoch": 0.12014359971819277, + "grad_norm": 0.875, + "learning_rate": 0.00019638555552813286, + "loss": 1.1767, + "step": 4679 + }, + { + "epoch": 0.1201692769141146, + "grad_norm": 0.8671875, + "learning_rate": 0.00019638436605558035, + "loss": 1.0498, + "step": 4680 + }, + { + "epoch": 0.12019495411003642, + "grad_norm": 0.84765625, + "learning_rate": 0.0001963831763909425, + "loss": 1.1342, + "step": 4681 + }, + { + "epoch": 0.12022063130595824, + "grad_norm": 0.80078125, + "learning_rate": 0.00019638198653422167, + "loss": 1.1374, + "step": 4682 + }, + { + "epoch": 0.12024630850188005, + "grad_norm": 0.796875, + "learning_rate": 0.00019638079648542025, + "loss": 1.194, + "step": 4683 + }, + { + "epoch": 0.12027198569780187, + "grad_norm": 0.8515625, + "learning_rate": 0.00019637960624454062, + "loss": 1.2359, + "step": 4684 + }, + { + "epoch": 0.12029766289372369, + "grad_norm": 0.81640625, + "learning_rate": 0.00019637841581158512, + "loss": 1.2062, + "step": 4685 + }, + { + "epoch": 0.12032334008964551, + "grad_norm": 0.8828125, + "learning_rate": 0.00019637722518655614, + "loss": 1.0843, + "step": 4686 + }, + { + "epoch": 0.12034901728556734, + "grad_norm": 0.82421875, + "learning_rate": 0.0001963760343694561, + "loss": 1.0391, + "step": 4687 + }, + { + "epoch": 0.12037469448148914, + "grad_norm": 0.91796875, + "learning_rate": 0.00019637484336028731, + "loss": 1.209, + "step": 4688 + }, + { + "epoch": 0.12040037167741097, + "grad_norm": 0.87890625, + "learning_rate": 0.00019637365215905214, + "loss": 1.2479, + "step": 4689 + }, + { + "epoch": 0.12042604887333279, + "grad_norm": 0.84765625, + "learning_rate": 0.000196372460765753, + "loss": 1.0535, + "step": 4690 + }, + { + "epoch": 0.12045172606925461, + "grad_norm": 0.890625, + "learning_rate": 0.00019637126918039223, + "loss": 1.2301, + "step": 4691 + }, + { + "epoch": 0.12047740326517643, + "grad_norm": 0.8359375, + "learning_rate": 0.00019637007740297226, + "loss": 1.0723, + "step": 4692 + }, + { + "epoch": 0.12050308046109824, + "grad_norm": 0.87109375, + "learning_rate": 0.00019636888543349542, + "loss": 1.0701, + "step": 4693 + }, + { + "epoch": 0.12052875765702006, + "grad_norm": 0.8828125, + "learning_rate": 0.0001963676932719641, + "loss": 1.0935, + "step": 4694 + }, + { + "epoch": 0.12055443485294189, + "grad_norm": 0.83203125, + "learning_rate": 0.00019636650091838065, + "loss": 1.0964, + "step": 4695 + }, + { + "epoch": 0.12058011204886371, + "grad_norm": 0.8125, + "learning_rate": 0.00019636530837274752, + "loss": 1.0492, + "step": 4696 + }, + { + "epoch": 0.12060578924478553, + "grad_norm": 0.91015625, + "learning_rate": 0.00019636411563506698, + "loss": 1.2571, + "step": 4697 + }, + { + "epoch": 0.12063146644070734, + "grad_norm": 0.875, + "learning_rate": 0.0001963629227053415, + "loss": 1.1297, + "step": 4698 + }, + { + "epoch": 0.12065714363662916, + "grad_norm": 0.78515625, + "learning_rate": 0.0001963617295835734, + "loss": 1.0579, + "step": 4699 + }, + { + "epoch": 0.12068282083255098, + "grad_norm": 0.8671875, + "learning_rate": 0.00019636053626976506, + "loss": 1.2001, + "step": 4700 + }, + { + "epoch": 0.1207084980284728, + "grad_norm": 0.76171875, + "learning_rate": 0.0001963593427639189, + "loss": 1.0797, + "step": 4701 + }, + { + "epoch": 0.12073417522439461, + "grad_norm": 0.890625, + "learning_rate": 0.00019635814906603726, + "loss": 1.0497, + "step": 4702 + }, + { + "epoch": 0.12075985242031644, + "grad_norm": 0.8515625, + "learning_rate": 0.00019635695517612254, + "loss": 1.2485, + "step": 4703 + }, + { + "epoch": 0.12078552961623826, + "grad_norm": 0.86328125, + "learning_rate": 0.00019635576109417713, + "loss": 1.0424, + "step": 4704 + }, + { + "epoch": 0.12081120681216008, + "grad_norm": 0.85546875, + "learning_rate": 0.00019635456682020336, + "loss": 1.24, + "step": 4705 + }, + { + "epoch": 0.1208368840080819, + "grad_norm": 0.90625, + "learning_rate": 0.00019635337235420365, + "loss": 1.1381, + "step": 4706 + }, + { + "epoch": 0.12086256120400371, + "grad_norm": 0.84375, + "learning_rate": 0.0001963521776961804, + "loss": 1.0498, + "step": 4707 + }, + { + "epoch": 0.12088823839992553, + "grad_norm": 0.80078125, + "learning_rate": 0.00019635098284613594, + "loss": 0.9854, + "step": 4708 + }, + { + "epoch": 0.12091391559584735, + "grad_norm": 0.8125, + "learning_rate": 0.00019634978780407267, + "loss": 1.0578, + "step": 4709 + }, + { + "epoch": 0.12093959279176918, + "grad_norm": 0.859375, + "learning_rate": 0.000196348592569993, + "loss": 1.235, + "step": 4710 + }, + { + "epoch": 0.120965269987691, + "grad_norm": 0.76171875, + "learning_rate": 0.00019634739714389924, + "loss": 1.0469, + "step": 4711 + }, + { + "epoch": 0.12099094718361281, + "grad_norm": 0.796875, + "learning_rate": 0.00019634620152579385, + "loss": 1.0881, + "step": 4712 + }, + { + "epoch": 0.12101662437953463, + "grad_norm": 0.87109375, + "learning_rate": 0.0001963450057156792, + "loss": 1.095, + "step": 4713 + }, + { + "epoch": 0.12104230157545645, + "grad_norm": 0.94921875, + "learning_rate": 0.00019634380971355762, + "loss": 1.2729, + "step": 4714 + }, + { + "epoch": 0.12106797877137827, + "grad_norm": 0.9140625, + "learning_rate": 0.00019634261351943159, + "loss": 1.2617, + "step": 4715 + }, + { + "epoch": 0.1210936559673001, + "grad_norm": 0.8671875, + "learning_rate": 0.00019634141713330337, + "loss": 1.0808, + "step": 4716 + }, + { + "epoch": 0.1211193331632219, + "grad_norm": 0.79296875, + "learning_rate": 0.00019634022055517542, + "loss": 1.138, + "step": 4717 + }, + { + "epoch": 0.12114501035914373, + "grad_norm": 0.77734375, + "learning_rate": 0.00019633902378505015, + "loss": 1.0744, + "step": 4718 + }, + { + "epoch": 0.12117068755506555, + "grad_norm": 0.89453125, + "learning_rate": 0.0001963378268229299, + "loss": 1.0494, + "step": 4719 + }, + { + "epoch": 0.12119636475098737, + "grad_norm": 0.87890625, + "learning_rate": 0.00019633662966881703, + "loss": 1.225, + "step": 4720 + }, + { + "epoch": 0.12122204194690919, + "grad_norm": 0.93359375, + "learning_rate": 0.00019633543232271396, + "loss": 1.1865, + "step": 4721 + }, + { + "epoch": 0.121247719142831, + "grad_norm": 0.90625, + "learning_rate": 0.0001963342347846231, + "loss": 1.2493, + "step": 4722 + }, + { + "epoch": 0.12127339633875282, + "grad_norm": 0.91796875, + "learning_rate": 0.00019633303705454683, + "loss": 1.1765, + "step": 4723 + }, + { + "epoch": 0.12129907353467465, + "grad_norm": 0.796875, + "learning_rate": 0.00019633183913248748, + "loss": 1.2002, + "step": 4724 + }, + { + "epoch": 0.12132475073059647, + "grad_norm": 0.83984375, + "learning_rate": 0.00019633064101844748, + "loss": 1.0999, + "step": 4725 + }, + { + "epoch": 0.12135042792651829, + "grad_norm": 0.859375, + "learning_rate": 0.00019632944271242924, + "loss": 1.1491, + "step": 4726 + }, + { + "epoch": 0.1213761051224401, + "grad_norm": 0.875, + "learning_rate": 0.0001963282442144351, + "loss": 1.1947, + "step": 4727 + }, + { + "epoch": 0.12140178231836192, + "grad_norm": 0.84765625, + "learning_rate": 0.00019632704552446746, + "loss": 1.0792, + "step": 4728 + }, + { + "epoch": 0.12142745951428374, + "grad_norm": 0.8203125, + "learning_rate": 0.00019632584664252875, + "loss": 1.1731, + "step": 4729 + }, + { + "epoch": 0.12145313671020556, + "grad_norm": 0.875, + "learning_rate": 0.0001963246475686213, + "loss": 1.1002, + "step": 4730 + }, + { + "epoch": 0.12147881390612739, + "grad_norm": 0.79296875, + "learning_rate": 0.00019632344830274753, + "loss": 1.0687, + "step": 4731 + }, + { + "epoch": 0.1215044911020492, + "grad_norm": 0.80078125, + "learning_rate": 0.00019632224884490987, + "loss": 0.905, + "step": 4732 + }, + { + "epoch": 0.12153016829797102, + "grad_norm": 0.86328125, + "learning_rate": 0.00019632104919511064, + "loss": 1.0447, + "step": 4733 + }, + { + "epoch": 0.12155584549389284, + "grad_norm": 0.88671875, + "learning_rate": 0.00019631984935335225, + "loss": 1.046, + "step": 4734 + }, + { + "epoch": 0.12158152268981466, + "grad_norm": 0.8828125, + "learning_rate": 0.0001963186493196371, + "loss": 1.2548, + "step": 4735 + }, + { + "epoch": 0.12160719988573648, + "grad_norm": 0.84375, + "learning_rate": 0.0001963174490939676, + "loss": 1.1166, + "step": 4736 + }, + { + "epoch": 0.12163287708165829, + "grad_norm": 0.8515625, + "learning_rate": 0.00019631624867634613, + "loss": 1.0353, + "step": 4737 + }, + { + "epoch": 0.12165855427758011, + "grad_norm": 0.8515625, + "learning_rate": 0.00019631504806677503, + "loss": 1.1417, + "step": 4738 + }, + { + "epoch": 0.12168423147350194, + "grad_norm": 0.76171875, + "learning_rate": 0.00019631384726525678, + "loss": 1.0503, + "step": 4739 + }, + { + "epoch": 0.12170990866942376, + "grad_norm": 0.9140625, + "learning_rate": 0.00019631264627179373, + "loss": 1.2069, + "step": 4740 + }, + { + "epoch": 0.12173558586534558, + "grad_norm": 0.90234375, + "learning_rate": 0.00019631144508638828, + "loss": 1.0997, + "step": 4741 + }, + { + "epoch": 0.12176126306126739, + "grad_norm": 0.9375, + "learning_rate": 0.0001963102437090428, + "loss": 1.2427, + "step": 4742 + }, + { + "epoch": 0.12178694025718921, + "grad_norm": 0.81640625, + "learning_rate": 0.0001963090421397597, + "loss": 1.0631, + "step": 4743 + }, + { + "epoch": 0.12181261745311103, + "grad_norm": 1.375, + "learning_rate": 0.0001963078403785414, + "loss": 1.2697, + "step": 4744 + }, + { + "epoch": 0.12183829464903285, + "grad_norm": 0.84765625, + "learning_rate": 0.00019630663842539029, + "loss": 1.3218, + "step": 4745 + }, + { + "epoch": 0.12186397184495468, + "grad_norm": 0.89453125, + "learning_rate": 0.0001963054362803087, + "loss": 1.1738, + "step": 4746 + }, + { + "epoch": 0.12188964904087649, + "grad_norm": 0.8828125, + "learning_rate": 0.00019630423394329912, + "loss": 1.259, + "step": 4747 + }, + { + "epoch": 0.12191532623679831, + "grad_norm": 0.86328125, + "learning_rate": 0.00019630303141436386, + "loss": 1.156, + "step": 4748 + }, + { + "epoch": 0.12194100343272013, + "grad_norm": 0.95703125, + "learning_rate": 0.0001963018286935054, + "loss": 1.0389, + "step": 4749 + }, + { + "epoch": 0.12196668062864195, + "grad_norm": 0.8359375, + "learning_rate": 0.00019630062578072607, + "loss": 1.033, + "step": 4750 + }, + { + "epoch": 0.12199235782456377, + "grad_norm": 0.84765625, + "learning_rate": 0.00019629942267602828, + "loss": 1.2311, + "step": 4751 + }, + { + "epoch": 0.12201803502048558, + "grad_norm": 0.875, + "learning_rate": 0.00019629821937941445, + "loss": 1.2289, + "step": 4752 + }, + { + "epoch": 0.1220437122164074, + "grad_norm": 0.86328125, + "learning_rate": 0.00019629701589088698, + "loss": 1.2037, + "step": 4753 + }, + { + "epoch": 0.12206938941232923, + "grad_norm": 0.83203125, + "learning_rate": 0.00019629581221044826, + "loss": 1.1617, + "step": 4754 + }, + { + "epoch": 0.12209506660825105, + "grad_norm": 0.828125, + "learning_rate": 0.00019629460833810067, + "loss": 1.0427, + "step": 4755 + }, + { + "epoch": 0.12212074380417287, + "grad_norm": 0.890625, + "learning_rate": 0.0001962934042738466, + "loss": 1.1316, + "step": 4756 + }, + { + "epoch": 0.12214642100009468, + "grad_norm": 0.81640625, + "learning_rate": 0.0001962922000176885, + "loss": 1.1493, + "step": 4757 + }, + { + "epoch": 0.1221720981960165, + "grad_norm": 0.83203125, + "learning_rate": 0.00019629099556962875, + "loss": 1.1921, + "step": 4758 + }, + { + "epoch": 0.12219777539193832, + "grad_norm": 0.84765625, + "learning_rate": 0.00019628979092966976, + "loss": 1.3526, + "step": 4759 + }, + { + "epoch": 0.12222345258786015, + "grad_norm": 0.95703125, + "learning_rate": 0.00019628858609781387, + "loss": 1.1265, + "step": 4760 + }, + { + "epoch": 0.12224912978378197, + "grad_norm": 0.84375, + "learning_rate": 0.00019628738107406354, + "loss": 1.0477, + "step": 4761 + }, + { + "epoch": 0.12227480697970378, + "grad_norm": 0.890625, + "learning_rate": 0.00019628617585842117, + "loss": 1.1547, + "step": 4762 + }, + { + "epoch": 0.1223004841756256, + "grad_norm": 0.84765625, + "learning_rate": 0.00019628497045088913, + "loss": 1.2688, + "step": 4763 + }, + { + "epoch": 0.12232616137154742, + "grad_norm": 0.93359375, + "learning_rate": 0.00019628376485146987, + "loss": 1.0956, + "step": 4764 + }, + { + "epoch": 0.12235183856746924, + "grad_norm": 0.828125, + "learning_rate": 0.00019628255906016574, + "loss": 1.1677, + "step": 4765 + }, + { + "epoch": 0.12237751576339106, + "grad_norm": 0.8046875, + "learning_rate": 0.0001962813530769792, + "loss": 1.1376, + "step": 4766 + }, + { + "epoch": 0.12240319295931287, + "grad_norm": 0.8046875, + "learning_rate": 0.00019628014690191257, + "loss": 1.0033, + "step": 4767 + }, + { + "epoch": 0.1224288701552347, + "grad_norm": 0.92578125, + "learning_rate": 0.0001962789405349683, + "loss": 1.1906, + "step": 4768 + }, + { + "epoch": 0.12245454735115652, + "grad_norm": 0.88671875, + "learning_rate": 0.00019627773397614887, + "loss": 1.1202, + "step": 4769 + }, + { + "epoch": 0.12248022454707834, + "grad_norm": 0.859375, + "learning_rate": 0.00019627652722545655, + "loss": 1.1682, + "step": 4770 + }, + { + "epoch": 0.12250590174300016, + "grad_norm": 0.8359375, + "learning_rate": 0.00019627532028289383, + "loss": 1.2021, + "step": 4771 + }, + { + "epoch": 0.12253157893892197, + "grad_norm": 0.80078125, + "learning_rate": 0.0001962741131484631, + "loss": 1.0485, + "step": 4772 + }, + { + "epoch": 0.12255725613484379, + "grad_norm": 0.7578125, + "learning_rate": 0.00019627290582216675, + "loss": 1.0002, + "step": 4773 + }, + { + "epoch": 0.12258293333076561, + "grad_norm": 0.78515625, + "learning_rate": 0.0001962716983040072, + "loss": 1.1375, + "step": 4774 + }, + { + "epoch": 0.12260861052668744, + "grad_norm": 0.85546875, + "learning_rate": 0.00019627049059398684, + "loss": 1.1885, + "step": 4775 + }, + { + "epoch": 0.12263428772260926, + "grad_norm": 0.875, + "learning_rate": 0.00019626928269210809, + "loss": 1.0155, + "step": 4776 + }, + { + "epoch": 0.12265996491853107, + "grad_norm": 0.92578125, + "learning_rate": 0.00019626807459837336, + "loss": 1.2558, + "step": 4777 + }, + { + "epoch": 0.12268564211445289, + "grad_norm": 0.78125, + "learning_rate": 0.00019626686631278505, + "loss": 1.2798, + "step": 4778 + }, + { + "epoch": 0.12271131931037471, + "grad_norm": 0.81640625, + "learning_rate": 0.0001962656578353456, + "loss": 1.0618, + "step": 4779 + }, + { + "epoch": 0.12273699650629653, + "grad_norm": 0.84375, + "learning_rate": 0.00019626444916605732, + "loss": 1.0471, + "step": 4780 + }, + { + "epoch": 0.12276267370221836, + "grad_norm": 0.8203125, + "learning_rate": 0.00019626324030492276, + "loss": 1.1483, + "step": 4781 + }, + { + "epoch": 0.12278835089814016, + "grad_norm": 0.82421875, + "learning_rate": 0.00019626203125194423, + "loss": 1.1086, + "step": 4782 + }, + { + "epoch": 0.12281402809406199, + "grad_norm": 0.8515625, + "learning_rate": 0.00019626082200712417, + "loss": 1.2572, + "step": 4783 + }, + { + "epoch": 0.12283970528998381, + "grad_norm": 0.9375, + "learning_rate": 0.00019625961257046498, + "loss": 1.111, + "step": 4784 + }, + { + "epoch": 0.12286538248590563, + "grad_norm": 0.87890625, + "learning_rate": 0.0001962584029419691, + "loss": 1.1688, + "step": 4785 + }, + { + "epoch": 0.12289105968182745, + "grad_norm": 0.72265625, + "learning_rate": 0.00019625719312163885, + "loss": 1.0926, + "step": 4786 + }, + { + "epoch": 0.12291673687774926, + "grad_norm": 0.8046875, + "learning_rate": 0.0001962559831094768, + "loss": 1.0221, + "step": 4787 + }, + { + "epoch": 0.12294241407367108, + "grad_norm": 0.87890625, + "learning_rate": 0.0001962547729054852, + "loss": 1.066, + "step": 4788 + }, + { + "epoch": 0.1229680912695929, + "grad_norm": 0.84765625, + "learning_rate": 0.00019625356250966658, + "loss": 1.2192, + "step": 4789 + }, + { + "epoch": 0.12299376846551473, + "grad_norm": 0.83984375, + "learning_rate": 0.00019625235192202327, + "loss": 1.0265, + "step": 4790 + }, + { + "epoch": 0.12301944566143655, + "grad_norm": 0.859375, + "learning_rate": 0.00019625114114255773, + "loss": 1.3292, + "step": 4791 + }, + { + "epoch": 0.12304512285735836, + "grad_norm": 0.8359375, + "learning_rate": 0.00019624993017127235, + "loss": 1.2613, + "step": 4792 + }, + { + "epoch": 0.12307080005328018, + "grad_norm": 0.85546875, + "learning_rate": 0.0001962487190081696, + "loss": 1.2086, + "step": 4793 + }, + { + "epoch": 0.123096477249202, + "grad_norm": 0.84375, + "learning_rate": 0.00019624750765325183, + "loss": 1.122, + "step": 4794 + }, + { + "epoch": 0.12312215444512382, + "grad_norm": 0.84765625, + "learning_rate": 0.0001962462961065214, + "loss": 1.0497, + "step": 4795 + }, + { + "epoch": 0.12314783164104565, + "grad_norm": 0.84375, + "learning_rate": 0.0001962450843679809, + "loss": 1.1935, + "step": 4796 + }, + { + "epoch": 0.12317350883696745, + "grad_norm": 0.79296875, + "learning_rate": 0.00019624387243763258, + "loss": 1.1587, + "step": 4797 + }, + { + "epoch": 0.12319918603288928, + "grad_norm": 0.8125, + "learning_rate": 0.0001962426603154789, + "loss": 1.0522, + "step": 4798 + }, + { + "epoch": 0.1232248632288111, + "grad_norm": 0.90625, + "learning_rate": 0.00019624144800152232, + "loss": 1.1987, + "step": 4799 + }, + { + "epoch": 0.12325054042473292, + "grad_norm": 0.8671875, + "learning_rate": 0.00019624023549576523, + "loss": 1.1843, + "step": 4800 + }, + { + "epoch": 0.12327621762065474, + "grad_norm": 0.8203125, + "learning_rate": 0.00019623902279821006, + "loss": 1.1285, + "step": 4801 + }, + { + "epoch": 0.12330189481657655, + "grad_norm": 0.88671875, + "learning_rate": 0.00019623780990885917, + "loss": 1.2404, + "step": 4802 + }, + { + "epoch": 0.12332757201249837, + "grad_norm": 0.83984375, + "learning_rate": 0.00019623659682771504, + "loss": 1.2721, + "step": 4803 + }, + { + "epoch": 0.1233532492084202, + "grad_norm": 0.8359375, + "learning_rate": 0.00019623538355478007, + "loss": 1.0888, + "step": 4804 + }, + { + "epoch": 0.12337892640434202, + "grad_norm": 0.7734375, + "learning_rate": 0.00019623417009005667, + "loss": 1.115, + "step": 4805 + }, + { + "epoch": 0.12340460360026383, + "grad_norm": 0.89453125, + "learning_rate": 0.00019623295643354726, + "loss": 1.2433, + "step": 4806 + }, + { + "epoch": 0.12343028079618565, + "grad_norm": 0.91796875, + "learning_rate": 0.00019623174258525425, + "loss": 1.1933, + "step": 4807 + }, + { + "epoch": 0.12345595799210747, + "grad_norm": 0.82421875, + "learning_rate": 0.00019623052854518007, + "loss": 1.1017, + "step": 4808 + }, + { + "epoch": 0.12348163518802929, + "grad_norm": 0.93359375, + "learning_rate": 0.00019622931431332715, + "loss": 1.0321, + "step": 4809 + }, + { + "epoch": 0.12350731238395111, + "grad_norm": 0.89453125, + "learning_rate": 0.0001962280998896979, + "loss": 1.1081, + "step": 4810 + }, + { + "epoch": 0.12353298957987292, + "grad_norm": 0.8359375, + "learning_rate": 0.00019622688527429474, + "loss": 1.0228, + "step": 4811 + }, + { + "epoch": 0.12355866677579475, + "grad_norm": 1.0, + "learning_rate": 0.0001962256704671201, + "loss": 1.1021, + "step": 4812 + }, + { + "epoch": 0.12358434397171657, + "grad_norm": 0.81640625, + "learning_rate": 0.00019622445546817638, + "loss": 0.9878, + "step": 4813 + }, + { + "epoch": 0.12361002116763839, + "grad_norm": 0.8359375, + "learning_rate": 0.00019622324027746598, + "loss": 1.248, + "step": 4814 + }, + { + "epoch": 0.12363569836356021, + "grad_norm": 0.859375, + "learning_rate": 0.0001962220248949914, + "loss": 1.2059, + "step": 4815 + }, + { + "epoch": 0.12366137555948202, + "grad_norm": 0.80859375, + "learning_rate": 0.000196220809320755, + "loss": 1.2142, + "step": 4816 + }, + { + "epoch": 0.12368705275540384, + "grad_norm": 0.77734375, + "learning_rate": 0.00019621959355475922, + "loss": 1.1784, + "step": 4817 + }, + { + "epoch": 0.12371272995132566, + "grad_norm": 0.83203125, + "learning_rate": 0.00019621837759700646, + "loss": 1.1787, + "step": 4818 + }, + { + "epoch": 0.12373840714724749, + "grad_norm": 0.84375, + "learning_rate": 0.0001962171614474992, + "loss": 1.2129, + "step": 4819 + }, + { + "epoch": 0.12376408434316931, + "grad_norm": 0.9296875, + "learning_rate": 0.0001962159451062398, + "loss": 1.1869, + "step": 4820 + }, + { + "epoch": 0.12378976153909112, + "grad_norm": 0.7578125, + "learning_rate": 0.00019621472857323072, + "loss": 0.9934, + "step": 4821 + }, + { + "epoch": 0.12381543873501294, + "grad_norm": 0.8828125, + "learning_rate": 0.0001962135118484744, + "loss": 1.2491, + "step": 4822 + }, + { + "epoch": 0.12384111593093476, + "grad_norm": 0.87109375, + "learning_rate": 0.0001962122949319732, + "loss": 1.1788, + "step": 4823 + }, + { + "epoch": 0.12386679312685658, + "grad_norm": 0.79296875, + "learning_rate": 0.00019621107782372965, + "loss": 1.0768, + "step": 4824 + }, + { + "epoch": 0.1238924703227784, + "grad_norm": 0.87109375, + "learning_rate": 0.00019620986052374607, + "loss": 1.1419, + "step": 4825 + }, + { + "epoch": 0.12391814751870021, + "grad_norm": 0.875, + "learning_rate": 0.00019620864303202491, + "loss": 1.295, + "step": 4826 + }, + { + "epoch": 0.12394382471462204, + "grad_norm": 0.86328125, + "learning_rate": 0.00019620742534856865, + "loss": 1.0448, + "step": 4827 + }, + { + "epoch": 0.12396950191054386, + "grad_norm": 0.859375, + "learning_rate": 0.00019620620747337966, + "loss": 0.9929, + "step": 4828 + }, + { + "epoch": 0.12399517910646568, + "grad_norm": 0.8515625, + "learning_rate": 0.00019620498940646041, + "loss": 1.1223, + "step": 4829 + }, + { + "epoch": 0.1240208563023875, + "grad_norm": 0.83203125, + "learning_rate": 0.0001962037711478133, + "loss": 1.1307, + "step": 4830 + }, + { + "epoch": 0.12404653349830931, + "grad_norm": 0.8828125, + "learning_rate": 0.00019620255269744074, + "loss": 1.2484, + "step": 4831 + }, + { + "epoch": 0.12407221069423113, + "grad_norm": 0.9375, + "learning_rate": 0.0001962013340553452, + "loss": 1.201, + "step": 4832 + }, + { + "epoch": 0.12409788789015296, + "grad_norm": 0.9375, + "learning_rate": 0.0001962001152215291, + "loss": 1.1945, + "step": 4833 + }, + { + "epoch": 0.12412356508607478, + "grad_norm": 0.93359375, + "learning_rate": 0.00019619889619599485, + "loss": 1.2281, + "step": 4834 + }, + { + "epoch": 0.1241492422819966, + "grad_norm": 0.88671875, + "learning_rate": 0.0001961976769787449, + "loss": 1.0452, + "step": 4835 + }, + { + "epoch": 0.12417491947791841, + "grad_norm": 0.8359375, + "learning_rate": 0.00019619645756978164, + "loss": 1.2555, + "step": 4836 + }, + { + "epoch": 0.12420059667384023, + "grad_norm": 0.8359375, + "learning_rate": 0.00019619523796910758, + "loss": 1.1461, + "step": 4837 + }, + { + "epoch": 0.12422627386976205, + "grad_norm": 0.8203125, + "learning_rate": 0.00019619401817672506, + "loss": 1.132, + "step": 4838 + }, + { + "epoch": 0.12425195106568387, + "grad_norm": 0.859375, + "learning_rate": 0.00019619279819263653, + "loss": 1.0668, + "step": 4839 + }, + { + "epoch": 0.1242776282616057, + "grad_norm": 0.87890625, + "learning_rate": 0.0001961915780168445, + "loss": 1.2022, + "step": 4840 + }, + { + "epoch": 0.1243033054575275, + "grad_norm": 0.859375, + "learning_rate": 0.0001961903576493513, + "loss": 0.9463, + "step": 4841 + }, + { + "epoch": 0.12432898265344933, + "grad_norm": 0.9296875, + "learning_rate": 0.00019618913709015944, + "loss": 1.126, + "step": 4842 + }, + { + "epoch": 0.12435465984937115, + "grad_norm": 0.90234375, + "learning_rate": 0.0001961879163392713, + "loss": 1.1874, + "step": 4843 + }, + { + "epoch": 0.12438033704529297, + "grad_norm": 0.80078125, + "learning_rate": 0.00019618669539668934, + "loss": 1.051, + "step": 4844 + }, + { + "epoch": 0.1244060142412148, + "grad_norm": 0.8046875, + "learning_rate": 0.00019618547426241597, + "loss": 1.1356, + "step": 4845 + }, + { + "epoch": 0.1244316914371366, + "grad_norm": 0.8046875, + "learning_rate": 0.00019618425293645365, + "loss": 0.9883, + "step": 4846 + }, + { + "epoch": 0.12445736863305842, + "grad_norm": 0.9453125, + "learning_rate": 0.0001961830314188048, + "loss": 1.3745, + "step": 4847 + }, + { + "epoch": 0.12448304582898025, + "grad_norm": 0.890625, + "learning_rate": 0.00019618180970947183, + "loss": 1.1823, + "step": 4848 + }, + { + "epoch": 0.12450872302490207, + "grad_norm": 0.8359375, + "learning_rate": 0.00019618058780845722, + "loss": 1.157, + "step": 4849 + }, + { + "epoch": 0.12453440022082389, + "grad_norm": 0.7890625, + "learning_rate": 0.0001961793657157634, + "loss": 0.9198, + "step": 4850 + }, + { + "epoch": 0.1245600774167457, + "grad_norm": 0.875, + "learning_rate": 0.00019617814343139277, + "loss": 1.1475, + "step": 4851 + }, + { + "epoch": 0.12458575461266752, + "grad_norm": 0.8984375, + "learning_rate": 0.00019617692095534782, + "loss": 1.3017, + "step": 4852 + }, + { + "epoch": 0.12461143180858934, + "grad_norm": 0.875, + "learning_rate": 0.00019617569828763094, + "loss": 1.2691, + "step": 4853 + }, + { + "epoch": 0.12463710900451117, + "grad_norm": 0.86328125, + "learning_rate": 0.0001961744754282446, + "loss": 1.1431, + "step": 4854 + }, + { + "epoch": 0.12466278620043299, + "grad_norm": 0.8515625, + "learning_rate": 0.00019617325237719117, + "loss": 1.1782, + "step": 4855 + }, + { + "epoch": 0.1246884633963548, + "grad_norm": 0.796875, + "learning_rate": 0.00019617202913447317, + "loss": 1.1749, + "step": 4856 + }, + { + "epoch": 0.12471414059227662, + "grad_norm": 0.796875, + "learning_rate": 0.00019617080570009297, + "loss": 0.9621, + "step": 4857 + }, + { + "epoch": 0.12473981778819844, + "grad_norm": 0.88671875, + "learning_rate": 0.0001961695820740531, + "loss": 1.2087, + "step": 4858 + }, + { + "epoch": 0.12476549498412026, + "grad_norm": 0.88671875, + "learning_rate": 0.00019616835825635588, + "loss": 1.3314, + "step": 4859 + }, + { + "epoch": 0.12479117218004208, + "grad_norm": 1.0, + "learning_rate": 0.00019616713424700383, + "loss": 1.1065, + "step": 4860 + }, + { + "epoch": 0.12481684937596389, + "grad_norm": 0.81640625, + "learning_rate": 0.00019616591004599938, + "loss": 0.99, + "step": 4861 + }, + { + "epoch": 0.12484252657188571, + "grad_norm": 0.859375, + "learning_rate": 0.00019616468565334495, + "loss": 1.0928, + "step": 4862 + }, + { + "epoch": 0.12486820376780754, + "grad_norm": 0.99609375, + "learning_rate": 0.000196163461069043, + "loss": 1.2424, + "step": 4863 + }, + { + "epoch": 0.12489388096372936, + "grad_norm": 0.84765625, + "learning_rate": 0.00019616223629309593, + "loss": 1.1048, + "step": 4864 + }, + { + "epoch": 0.12491955815965118, + "grad_norm": 0.8359375, + "learning_rate": 0.00019616101132550621, + "loss": 0.9482, + "step": 4865 + }, + { + "epoch": 0.12494523535557299, + "grad_norm": 0.98046875, + "learning_rate": 0.00019615978616627632, + "loss": 1.1092, + "step": 4866 + }, + { + "epoch": 0.12497091255149481, + "grad_norm": 0.84765625, + "learning_rate": 0.0001961585608154086, + "loss": 1.1406, + "step": 4867 + }, + { + "epoch": 0.12499658974741663, + "grad_norm": 0.8984375, + "learning_rate": 0.00019615733527290563, + "loss": 0.9892, + "step": 4868 + }, + { + "epoch": 0.12502226694333846, + "grad_norm": 0.83984375, + "learning_rate": 0.00019615610953876972, + "loss": 1.1247, + "step": 4869 + }, + { + "epoch": 0.12504794413926026, + "grad_norm": 0.7578125, + "learning_rate": 0.0001961548836130034, + "loss": 0.8932, + "step": 4870 + }, + { + "epoch": 0.1250736213351821, + "grad_norm": 1.3046875, + "learning_rate": 0.00019615365749560905, + "loss": 1.0679, + "step": 4871 + }, + { + "epoch": 0.1250992985311039, + "grad_norm": 0.85546875, + "learning_rate": 0.0001961524311865892, + "loss": 1.1911, + "step": 4872 + }, + { + "epoch": 0.12512497572702572, + "grad_norm": 0.83984375, + "learning_rate": 0.00019615120468594618, + "loss": 1.2696, + "step": 4873 + }, + { + "epoch": 0.12515065292294755, + "grad_norm": 0.875, + "learning_rate": 0.00019614997799368252, + "loss": 1.1514, + "step": 4874 + }, + { + "epoch": 0.12517633011886936, + "grad_norm": 0.91796875, + "learning_rate": 0.00019614875110980068, + "loss": 1.204, + "step": 4875 + }, + { + "epoch": 0.1252020073147912, + "grad_norm": 0.921875, + "learning_rate": 0.00019614752403430302, + "loss": 1.0626, + "step": 4876 + }, + { + "epoch": 0.125227684510713, + "grad_norm": 0.81640625, + "learning_rate": 0.000196146296767192, + "loss": 1.2106, + "step": 4877 + }, + { + "epoch": 0.1252533617066348, + "grad_norm": 0.828125, + "learning_rate": 0.00019614506930847015, + "loss": 1.0541, + "step": 4878 + }, + { + "epoch": 0.12527903890255665, + "grad_norm": 0.9609375, + "learning_rate": 0.00019614384165813983, + "loss": 1.301, + "step": 4879 + }, + { + "epoch": 0.12530471609847846, + "grad_norm": 0.890625, + "learning_rate": 0.00019614261381620355, + "loss": 1.0051, + "step": 4880 + }, + { + "epoch": 0.1253303932944003, + "grad_norm": 0.80078125, + "learning_rate": 0.00019614138578266367, + "loss": 1.1753, + "step": 4881 + }, + { + "epoch": 0.1253560704903221, + "grad_norm": 0.82421875, + "learning_rate": 0.00019614015755752275, + "loss": 1.0177, + "step": 4882 + }, + { + "epoch": 0.1253817476862439, + "grad_norm": 0.8203125, + "learning_rate": 0.00019613892914078316, + "loss": 1.0777, + "step": 4883 + }, + { + "epoch": 0.12540742488216575, + "grad_norm": 0.8671875, + "learning_rate": 0.00019613770053244735, + "loss": 1.1055, + "step": 4884 + }, + { + "epoch": 0.12543310207808755, + "grad_norm": 0.84375, + "learning_rate": 0.00019613647173251782, + "loss": 1.0318, + "step": 4885 + }, + { + "epoch": 0.1254587792740094, + "grad_norm": 0.80078125, + "learning_rate": 0.00019613524274099696, + "loss": 1.162, + "step": 4886 + }, + { + "epoch": 0.1254844564699312, + "grad_norm": 0.8203125, + "learning_rate": 0.00019613401355788725, + "loss": 1.0601, + "step": 4887 + }, + { + "epoch": 0.125510133665853, + "grad_norm": 0.83203125, + "learning_rate": 0.00019613278418319113, + "loss": 1.0929, + "step": 4888 + }, + { + "epoch": 0.12553581086177484, + "grad_norm": 0.78515625, + "learning_rate": 0.0001961315546169111, + "loss": 1.2317, + "step": 4889 + }, + { + "epoch": 0.12556148805769665, + "grad_norm": 0.9375, + "learning_rate": 0.0001961303248590495, + "loss": 1.1623, + "step": 4890 + }, + { + "epoch": 0.1255871652536185, + "grad_norm": 0.78125, + "learning_rate": 0.0001961290949096089, + "loss": 1.0933, + "step": 4891 + }, + { + "epoch": 0.1256128424495403, + "grad_norm": 0.9375, + "learning_rate": 0.00019612786476859166, + "loss": 1.4328, + "step": 4892 + }, + { + "epoch": 0.1256385196454621, + "grad_norm": 0.8984375, + "learning_rate": 0.00019612663443600026, + "loss": 1.1117, + "step": 4893 + }, + { + "epoch": 0.12566419684138394, + "grad_norm": 0.87890625, + "learning_rate": 0.0001961254039118372, + "loss": 1.1505, + "step": 4894 + }, + { + "epoch": 0.12568987403730575, + "grad_norm": 0.859375, + "learning_rate": 0.00019612417319610486, + "loss": 1.1488, + "step": 4895 + }, + { + "epoch": 0.12571555123322758, + "grad_norm": 0.83984375, + "learning_rate": 0.00019612294228880576, + "loss": 1.1893, + "step": 4896 + }, + { + "epoch": 0.1257412284291494, + "grad_norm": 0.80859375, + "learning_rate": 0.00019612171118994228, + "loss": 1.0535, + "step": 4897 + }, + { + "epoch": 0.1257669056250712, + "grad_norm": 0.87109375, + "learning_rate": 0.00019612047989951692, + "loss": 1.2221, + "step": 4898 + }, + { + "epoch": 0.12579258282099304, + "grad_norm": 0.82421875, + "learning_rate": 0.00019611924841753215, + "loss": 1.1187, + "step": 4899 + }, + { + "epoch": 0.12581826001691485, + "grad_norm": 0.80859375, + "learning_rate": 0.0001961180167439904, + "loss": 1.1011, + "step": 4900 + }, + { + "epoch": 0.12584393721283668, + "grad_norm": 0.90234375, + "learning_rate": 0.00019611678487889411, + "loss": 1.1543, + "step": 4901 + }, + { + "epoch": 0.1258696144087585, + "grad_norm": 0.80078125, + "learning_rate": 0.00019611555282224573, + "loss": 1.0529, + "step": 4902 + }, + { + "epoch": 0.1258952916046803, + "grad_norm": 0.8046875, + "learning_rate": 0.00019611432057404778, + "loss": 1.125, + "step": 4903 + }, + { + "epoch": 0.12592096880060213, + "grad_norm": 0.88671875, + "learning_rate": 0.00019611308813430266, + "loss": 1.2201, + "step": 4904 + }, + { + "epoch": 0.12594664599652394, + "grad_norm": 0.89453125, + "learning_rate": 0.0001961118555030128, + "loss": 1.163, + "step": 4905 + }, + { + "epoch": 0.12597232319244578, + "grad_norm": 0.90234375, + "learning_rate": 0.0001961106226801807, + "loss": 1.1689, + "step": 4906 + }, + { + "epoch": 0.1259980003883676, + "grad_norm": 0.77734375, + "learning_rate": 0.00019610938966580885, + "loss": 1.0845, + "step": 4907 + }, + { + "epoch": 0.1260236775842894, + "grad_norm": 1.015625, + "learning_rate": 0.00019610815645989966, + "loss": 1.1709, + "step": 4908 + }, + { + "epoch": 0.12604935478021123, + "grad_norm": 0.84375, + "learning_rate": 0.00019610692306245558, + "loss": 1.0393, + "step": 4909 + }, + { + "epoch": 0.12607503197613304, + "grad_norm": 0.84375, + "learning_rate": 0.0001961056894734791, + "loss": 1.0965, + "step": 4910 + }, + { + "epoch": 0.12610070917205488, + "grad_norm": 0.7734375, + "learning_rate": 0.00019610445569297262, + "loss": 1.0246, + "step": 4911 + }, + { + "epoch": 0.12612638636797668, + "grad_norm": 0.87890625, + "learning_rate": 0.0001961032217209387, + "loss": 1.134, + "step": 4912 + }, + { + "epoch": 0.1261520635638985, + "grad_norm": 0.81640625, + "learning_rate": 0.0001961019875573797, + "loss": 1.1532, + "step": 4913 + }, + { + "epoch": 0.12617774075982033, + "grad_norm": 0.82421875, + "learning_rate": 0.00019610075320229814, + "loss": 1.0805, + "step": 4914 + }, + { + "epoch": 0.12620341795574214, + "grad_norm": 0.82421875, + "learning_rate": 0.00019609951865569641, + "loss": 1.0931, + "step": 4915 + }, + { + "epoch": 0.12622909515166397, + "grad_norm": 0.81640625, + "learning_rate": 0.00019609828391757708, + "loss": 1.101, + "step": 4916 + }, + { + "epoch": 0.12625477234758578, + "grad_norm": 0.890625, + "learning_rate": 0.00019609704898794253, + "loss": 1.0719, + "step": 4917 + }, + { + "epoch": 0.1262804495435076, + "grad_norm": 0.8515625, + "learning_rate": 0.00019609581386679522, + "loss": 1.1631, + "step": 4918 + }, + { + "epoch": 0.12630612673942943, + "grad_norm": 1.1640625, + "learning_rate": 0.00019609457855413766, + "loss": 1.2496, + "step": 4919 + }, + { + "epoch": 0.12633180393535123, + "grad_norm": 0.796875, + "learning_rate": 0.0001960933430499723, + "loss": 1.0863, + "step": 4920 + }, + { + "epoch": 0.12635748113127307, + "grad_norm": 0.91015625, + "learning_rate": 0.00019609210735430156, + "loss": 1.3187, + "step": 4921 + }, + { + "epoch": 0.12638315832719488, + "grad_norm": 0.91015625, + "learning_rate": 0.00019609087146712795, + "loss": 1.1718, + "step": 4922 + }, + { + "epoch": 0.12640883552311669, + "grad_norm": 0.90234375, + "learning_rate": 0.00019608963538845389, + "loss": 1.0247, + "step": 4923 + }, + { + "epoch": 0.12643451271903852, + "grad_norm": 0.83203125, + "learning_rate": 0.00019608839911828188, + "loss": 1.1957, + "step": 4924 + }, + { + "epoch": 0.12646018991496033, + "grad_norm": 0.828125, + "learning_rate": 0.00019608716265661434, + "loss": 1.0755, + "step": 4925 + }, + { + "epoch": 0.12648586711088217, + "grad_norm": 0.89453125, + "learning_rate": 0.00019608592600345382, + "loss": 1.1619, + "step": 4926 + }, + { + "epoch": 0.12651154430680397, + "grad_norm": 0.7890625, + "learning_rate": 0.00019608468915880269, + "loss": 0.9376, + "step": 4927 + }, + { + "epoch": 0.12653722150272578, + "grad_norm": 0.86328125, + "learning_rate": 0.00019608345212266344, + "loss": 1.1747, + "step": 4928 + }, + { + "epoch": 0.12656289869864762, + "grad_norm": 0.859375, + "learning_rate": 0.0001960822148950386, + "loss": 1.0719, + "step": 4929 + }, + { + "epoch": 0.12658857589456943, + "grad_norm": 0.76953125, + "learning_rate": 0.00019608097747593055, + "loss": 0.9465, + "step": 4930 + }, + { + "epoch": 0.12661425309049126, + "grad_norm": 0.84375, + "learning_rate": 0.0001960797398653418, + "loss": 1.2008, + "step": 4931 + }, + { + "epoch": 0.12663993028641307, + "grad_norm": 0.875, + "learning_rate": 0.0001960785020632748, + "loss": 1.1796, + "step": 4932 + }, + { + "epoch": 0.12666560748233488, + "grad_norm": 0.85546875, + "learning_rate": 0.00019607726406973202, + "loss": 1.0526, + "step": 4933 + }, + { + "epoch": 0.12669128467825672, + "grad_norm": 0.84765625, + "learning_rate": 0.00019607602588471597, + "loss": 1.2164, + "step": 4934 + }, + { + "epoch": 0.12671696187417852, + "grad_norm": 0.9375, + "learning_rate": 0.00019607478750822903, + "loss": 1.0326, + "step": 4935 + }, + { + "epoch": 0.12674263907010036, + "grad_norm": 0.8671875, + "learning_rate": 0.00019607354894027375, + "loss": 0.9984, + "step": 4936 + }, + { + "epoch": 0.12676831626602217, + "grad_norm": 0.80078125, + "learning_rate": 0.00019607231018085254, + "loss": 1.1475, + "step": 4937 + }, + { + "epoch": 0.12679399346194398, + "grad_norm": 0.81640625, + "learning_rate": 0.00019607107122996791, + "loss": 1.1168, + "step": 4938 + }, + { + "epoch": 0.1268196706578658, + "grad_norm": 0.796875, + "learning_rate": 0.00019606983208762232, + "loss": 1.186, + "step": 4939 + }, + { + "epoch": 0.12684534785378762, + "grad_norm": 0.91796875, + "learning_rate": 0.0001960685927538182, + "loss": 1.0917, + "step": 4940 + }, + { + "epoch": 0.12687102504970946, + "grad_norm": 0.80078125, + "learning_rate": 0.0001960673532285581, + "loss": 1.1358, + "step": 4941 + }, + { + "epoch": 0.12689670224563127, + "grad_norm": 0.78515625, + "learning_rate": 0.0001960661135118444, + "loss": 1.0844, + "step": 4942 + }, + { + "epoch": 0.12692237944155307, + "grad_norm": 0.859375, + "learning_rate": 0.00019606487360367966, + "loss": 1.169, + "step": 4943 + }, + { + "epoch": 0.1269480566374749, + "grad_norm": 0.88671875, + "learning_rate": 0.00019606363350406625, + "loss": 1.1809, + "step": 4944 + }, + { + "epoch": 0.12697373383339672, + "grad_norm": 0.78125, + "learning_rate": 0.00019606239321300672, + "loss": 1.0028, + "step": 4945 + }, + { + "epoch": 0.12699941102931855, + "grad_norm": 0.85546875, + "learning_rate": 0.00019606115273050354, + "loss": 1.1537, + "step": 4946 + }, + { + "epoch": 0.12702508822524036, + "grad_norm": 0.83203125, + "learning_rate": 0.0001960599120565591, + "loss": 1.1579, + "step": 4947 + }, + { + "epoch": 0.12705076542116217, + "grad_norm": 0.828125, + "learning_rate": 0.000196058671191176, + "loss": 1.1091, + "step": 4948 + }, + { + "epoch": 0.127076442617084, + "grad_norm": 0.79296875, + "learning_rate": 0.0001960574301343566, + "loss": 1.0389, + "step": 4949 + }, + { + "epoch": 0.12710211981300581, + "grad_norm": 0.87109375, + "learning_rate": 0.00019605618888610344, + "loss": 1.0951, + "step": 4950 + }, + { + "epoch": 0.12712779700892765, + "grad_norm": 0.76953125, + "learning_rate": 0.00019605494744641896, + "loss": 1.0382, + "step": 4951 + }, + { + "epoch": 0.12715347420484946, + "grad_norm": 0.8046875, + "learning_rate": 0.00019605370581530566, + "loss": 1.048, + "step": 4952 + }, + { + "epoch": 0.12717915140077127, + "grad_norm": 0.7734375, + "learning_rate": 0.000196052463992766, + "loss": 1.1654, + "step": 4953 + }, + { + "epoch": 0.1272048285966931, + "grad_norm": 0.87109375, + "learning_rate": 0.00019605122197880243, + "loss": 1.1932, + "step": 4954 + }, + { + "epoch": 0.1272305057926149, + "grad_norm": 0.8359375, + "learning_rate": 0.0001960499797734175, + "loss": 1.1239, + "step": 4955 + }, + { + "epoch": 0.12725618298853675, + "grad_norm": 0.91015625, + "learning_rate": 0.0001960487373766136, + "loss": 1.0339, + "step": 4956 + }, + { + "epoch": 0.12728186018445856, + "grad_norm": 0.86328125, + "learning_rate": 0.00019604749478839323, + "loss": 1.0654, + "step": 4957 + }, + { + "epoch": 0.12730753738038036, + "grad_norm": 1.2734375, + "learning_rate": 0.00019604625200875893, + "loss": 1.2851, + "step": 4958 + }, + { + "epoch": 0.1273332145763022, + "grad_norm": 0.97265625, + "learning_rate": 0.00019604500903771307, + "loss": 1.0842, + "step": 4959 + }, + { + "epoch": 0.127358891772224, + "grad_norm": 0.90234375, + "learning_rate": 0.0001960437658752582, + "loss": 1.1312, + "step": 4960 + }, + { + "epoch": 0.12738456896814584, + "grad_norm": 0.94921875, + "learning_rate": 0.0001960425225213968, + "loss": 1.1016, + "step": 4961 + }, + { + "epoch": 0.12741024616406765, + "grad_norm": 0.84765625, + "learning_rate": 0.00019604127897613133, + "loss": 0.9743, + "step": 4962 + }, + { + "epoch": 0.12743592335998946, + "grad_norm": 0.8359375, + "learning_rate": 0.00019604003523946423, + "loss": 1.1589, + "step": 4963 + }, + { + "epoch": 0.1274616005559113, + "grad_norm": 0.84765625, + "learning_rate": 0.00019603879131139805, + "loss": 0.9992, + "step": 4964 + }, + { + "epoch": 0.1274872777518331, + "grad_norm": 0.81640625, + "learning_rate": 0.00019603754719193522, + "loss": 1.1546, + "step": 4965 + }, + { + "epoch": 0.12751295494775494, + "grad_norm": 0.76171875, + "learning_rate": 0.00019603630288107823, + "loss": 1.1153, + "step": 4966 + }, + { + "epoch": 0.12753863214367675, + "grad_norm": 0.765625, + "learning_rate": 0.00019603505837882955, + "loss": 1.0006, + "step": 4967 + }, + { + "epoch": 0.12756430933959856, + "grad_norm": 0.76953125, + "learning_rate": 0.0001960338136851917, + "loss": 0.9941, + "step": 4968 + }, + { + "epoch": 0.1275899865355204, + "grad_norm": 0.9375, + "learning_rate": 0.00019603256880016713, + "loss": 1.1484, + "step": 4969 + }, + { + "epoch": 0.1276156637314422, + "grad_norm": 0.8359375, + "learning_rate": 0.00019603132372375832, + "loss": 1.0487, + "step": 4970 + }, + { + "epoch": 0.12764134092736404, + "grad_norm": 0.921875, + "learning_rate": 0.00019603007845596775, + "loss": 1.0894, + "step": 4971 + }, + { + "epoch": 0.12766701812328585, + "grad_norm": 1.3046875, + "learning_rate": 0.0001960288329967979, + "loss": 0.9929, + "step": 4972 + }, + { + "epoch": 0.12769269531920766, + "grad_norm": 0.87109375, + "learning_rate": 0.0001960275873462513, + "loss": 1.0875, + "step": 4973 + }, + { + "epoch": 0.1277183725151295, + "grad_norm": 0.87890625, + "learning_rate": 0.00019602634150433037, + "loss": 1.0438, + "step": 4974 + }, + { + "epoch": 0.1277440497110513, + "grad_norm": 0.83203125, + "learning_rate": 0.0001960250954710376, + "loss": 1.0956, + "step": 4975 + }, + { + "epoch": 0.12776972690697314, + "grad_norm": 0.80078125, + "learning_rate": 0.0001960238492463755, + "loss": 1.0332, + "step": 4976 + }, + { + "epoch": 0.12779540410289494, + "grad_norm": 0.82421875, + "learning_rate": 0.00019602260283034656, + "loss": 1.0329, + "step": 4977 + }, + { + "epoch": 0.12782108129881675, + "grad_norm": 0.90234375, + "learning_rate": 0.0001960213562229532, + "loss": 1.0597, + "step": 4978 + }, + { + "epoch": 0.1278467584947386, + "grad_norm": 0.8984375, + "learning_rate": 0.00019602010942419798, + "loss": 1.3101, + "step": 4979 + }, + { + "epoch": 0.1278724356906604, + "grad_norm": 0.87109375, + "learning_rate": 0.00019601886243408336, + "loss": 1.2395, + "step": 4980 + }, + { + "epoch": 0.12789811288658223, + "grad_norm": 0.87890625, + "learning_rate": 0.00019601761525261181, + "loss": 1.2007, + "step": 4981 + }, + { + "epoch": 0.12792379008250404, + "grad_norm": 1.1953125, + "learning_rate": 0.00019601636787978585, + "loss": 1.1747, + "step": 4982 + }, + { + "epoch": 0.12794946727842585, + "grad_norm": 0.7890625, + "learning_rate": 0.0001960151203156079, + "loss": 1.1068, + "step": 4983 + }, + { + "epoch": 0.12797514447434769, + "grad_norm": 0.796875, + "learning_rate": 0.0001960138725600805, + "loss": 1.1262, + "step": 4984 + }, + { + "epoch": 0.1280008216702695, + "grad_norm": 0.88671875, + "learning_rate": 0.00019601262461320617, + "loss": 1.2648, + "step": 4985 + }, + { + "epoch": 0.12802649886619133, + "grad_norm": 0.9140625, + "learning_rate": 0.0001960113764749873, + "loss": 1.0303, + "step": 4986 + }, + { + "epoch": 0.12805217606211314, + "grad_norm": 0.828125, + "learning_rate": 0.00019601012814542647, + "loss": 1.0886, + "step": 4987 + }, + { + "epoch": 0.12807785325803495, + "grad_norm": 0.82421875, + "learning_rate": 0.0001960088796245261, + "loss": 0.9259, + "step": 4988 + }, + { + "epoch": 0.12810353045395678, + "grad_norm": 0.88671875, + "learning_rate": 0.00019600763091228872, + "loss": 1.1856, + "step": 4989 + }, + { + "epoch": 0.1281292076498786, + "grad_norm": 0.87890625, + "learning_rate": 0.0001960063820087168, + "loss": 1.0781, + "step": 4990 + }, + { + "epoch": 0.12815488484580043, + "grad_norm": 0.7421875, + "learning_rate": 0.0001960051329138128, + "loss": 1.063, + "step": 4991 + }, + { + "epoch": 0.12818056204172223, + "grad_norm": 0.88671875, + "learning_rate": 0.00019600388362757927, + "loss": 1.2022, + "step": 4992 + }, + { + "epoch": 0.12820623923764404, + "grad_norm": 0.87109375, + "learning_rate": 0.00019600263415001868, + "loss": 1.1342, + "step": 4993 + }, + { + "epoch": 0.12823191643356588, + "grad_norm": 1.0546875, + "learning_rate": 0.0001960013844811335, + "loss": 1.0395, + "step": 4994 + }, + { + "epoch": 0.1282575936294877, + "grad_norm": 0.81640625, + "learning_rate": 0.00019600013462092624, + "loss": 1.0326, + "step": 4995 + }, + { + "epoch": 0.12828327082540952, + "grad_norm": 0.83203125, + "learning_rate": 0.00019599888456939935, + "loss": 1.1271, + "step": 4996 + }, + { + "epoch": 0.12830894802133133, + "grad_norm": 0.96484375, + "learning_rate": 0.0001959976343265554, + "loss": 1.2464, + "step": 4997 + }, + { + "epoch": 0.12833462521725314, + "grad_norm": 1.0703125, + "learning_rate": 0.00019599638389239683, + "loss": 1.0253, + "step": 4998 + }, + { + "epoch": 0.12836030241317498, + "grad_norm": 0.89453125, + "learning_rate": 0.00019599513326692614, + "loss": 1.1934, + "step": 4999 + }, + { + "epoch": 0.12838597960909678, + "grad_norm": 0.83203125, + "learning_rate": 0.0001959938824501458, + "loss": 0.9688, + "step": 5000 + }, + { + "epoch": 0.12838597960909678, + "eval_loss": 1.1211611032485962, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 407.3182, + "eval_samples_per_second": 24.551, + "eval_steps_per_second": 0.768, + "step": 5000 + }, + { + "epoch": 0.12841165680501862, + "grad_norm": 0.8515625, + "learning_rate": 0.00019599263144205834, + "loss": 0.9984, + "step": 5001 + }, + { + "epoch": 0.12843733400094043, + "grad_norm": 0.8515625, + "learning_rate": 0.0001959913802426662, + "loss": 1.1311, + "step": 5002 + }, + { + "epoch": 0.12846301119686224, + "grad_norm": 0.89453125, + "learning_rate": 0.00019599012885197198, + "loss": 1.1383, + "step": 5003 + }, + { + "epoch": 0.12848868839278407, + "grad_norm": 0.8828125, + "learning_rate": 0.00019598887726997804, + "loss": 1.1312, + "step": 5004 + }, + { + "epoch": 0.12851436558870588, + "grad_norm": 0.8515625, + "learning_rate": 0.00019598762549668695, + "loss": 1.2542, + "step": 5005 + }, + { + "epoch": 0.12854004278462772, + "grad_norm": 0.8203125, + "learning_rate": 0.0001959863735321012, + "loss": 0.9978, + "step": 5006 + }, + { + "epoch": 0.12856571998054953, + "grad_norm": 0.84375, + "learning_rate": 0.00019598512137622328, + "loss": 1.1598, + "step": 5007 + }, + { + "epoch": 0.12859139717647133, + "grad_norm": 0.8984375, + "learning_rate": 0.0001959838690290557, + "loss": 1.2139, + "step": 5008 + }, + { + "epoch": 0.12861707437239317, + "grad_norm": 0.8046875, + "learning_rate": 0.00019598261649060093, + "loss": 1.2369, + "step": 5009 + }, + { + "epoch": 0.12864275156831498, + "grad_norm": 0.80078125, + "learning_rate": 0.00019598136376086146, + "loss": 0.9782, + "step": 5010 + }, + { + "epoch": 0.12866842876423681, + "grad_norm": 0.96875, + "learning_rate": 0.0001959801108398398, + "loss": 1.1326, + "step": 5011 + }, + { + "epoch": 0.12869410596015862, + "grad_norm": 0.83984375, + "learning_rate": 0.00019597885772753846, + "loss": 1.1471, + "step": 5012 + }, + { + "epoch": 0.12871978315608043, + "grad_norm": 0.83203125, + "learning_rate": 0.00019597760442395995, + "loss": 1.0486, + "step": 5013 + }, + { + "epoch": 0.12874546035200227, + "grad_norm": 0.828125, + "learning_rate": 0.0001959763509291067, + "loss": 1.125, + "step": 5014 + }, + { + "epoch": 0.12877113754792407, + "grad_norm": 0.79296875, + "learning_rate": 0.00019597509724298128, + "loss": 0.9806, + "step": 5015 + }, + { + "epoch": 0.1287968147438459, + "grad_norm": 0.7734375, + "learning_rate": 0.00019597384336558616, + "loss": 0.896, + "step": 5016 + }, + { + "epoch": 0.12882249193976772, + "grad_norm": 0.953125, + "learning_rate": 0.00019597258929692383, + "loss": 1.2608, + "step": 5017 + }, + { + "epoch": 0.12884816913568953, + "grad_norm": 0.84765625, + "learning_rate": 0.00019597133503699681, + "loss": 1.266, + "step": 5018 + }, + { + "epoch": 0.12887384633161136, + "grad_norm": 0.86328125, + "learning_rate": 0.00019597008058580757, + "loss": 1.0676, + "step": 5019 + }, + { + "epoch": 0.12889952352753317, + "grad_norm": 0.8359375, + "learning_rate": 0.00019596882594335863, + "loss": 1.07, + "step": 5020 + }, + { + "epoch": 0.128925200723455, + "grad_norm": 0.828125, + "learning_rate": 0.0001959675711096525, + "loss": 1.0196, + "step": 5021 + }, + { + "epoch": 0.12895087791937682, + "grad_norm": 0.890625, + "learning_rate": 0.00019596631608469168, + "loss": 1.0639, + "step": 5022 + }, + { + "epoch": 0.12897655511529862, + "grad_norm": 0.83984375, + "learning_rate": 0.00019596506086847864, + "loss": 1.173, + "step": 5023 + }, + { + "epoch": 0.12900223231122046, + "grad_norm": 0.9921875, + "learning_rate": 0.00019596380546101593, + "loss": 1.1084, + "step": 5024 + }, + { + "epoch": 0.12902790950714227, + "grad_norm": 0.796875, + "learning_rate": 0.000195962549862306, + "loss": 1.0465, + "step": 5025 + }, + { + "epoch": 0.1290535867030641, + "grad_norm": 0.76953125, + "learning_rate": 0.00019596129407235138, + "loss": 1.247, + "step": 5026 + }, + { + "epoch": 0.1290792638989859, + "grad_norm": 0.8671875, + "learning_rate": 0.00019596003809115454, + "loss": 1.2399, + "step": 5027 + }, + { + "epoch": 0.12910494109490772, + "grad_norm": 0.8203125, + "learning_rate": 0.00019595878191871804, + "loss": 1.1822, + "step": 5028 + }, + { + "epoch": 0.12913061829082956, + "grad_norm": 0.94140625, + "learning_rate": 0.00019595752555504436, + "loss": 1.1985, + "step": 5029 + }, + { + "epoch": 0.12915629548675137, + "grad_norm": 0.84765625, + "learning_rate": 0.00019595626900013603, + "loss": 1.1947, + "step": 5030 + }, + { + "epoch": 0.1291819726826732, + "grad_norm": 0.87890625, + "learning_rate": 0.00019595501225399547, + "loss": 1.0086, + "step": 5031 + }, + { + "epoch": 0.129207649878595, + "grad_norm": 0.8125, + "learning_rate": 0.00019595375531662526, + "loss": 1.3009, + "step": 5032 + }, + { + "epoch": 0.12923332707451682, + "grad_norm": 0.859375, + "learning_rate": 0.00019595249818802788, + "loss": 1.1247, + "step": 5033 + }, + { + "epoch": 0.12925900427043865, + "grad_norm": 0.91796875, + "learning_rate": 0.00019595124086820581, + "loss": 1.2115, + "step": 5034 + }, + { + "epoch": 0.12928468146636046, + "grad_norm": 0.83203125, + "learning_rate": 0.00019594998335716163, + "loss": 1.2046, + "step": 5035 + }, + { + "epoch": 0.1293103586622823, + "grad_norm": 0.85546875, + "learning_rate": 0.00019594872565489779, + "loss": 1.2124, + "step": 5036 + }, + { + "epoch": 0.1293360358582041, + "grad_norm": 0.83203125, + "learning_rate": 0.0001959474677614168, + "loss": 1.092, + "step": 5037 + }, + { + "epoch": 0.12936171305412592, + "grad_norm": 0.8125, + "learning_rate": 0.00019594620967672117, + "loss": 1.127, + "step": 5038 + }, + { + "epoch": 0.12938739025004775, + "grad_norm": 0.76171875, + "learning_rate": 0.00019594495140081342, + "loss": 1.0552, + "step": 5039 + }, + { + "epoch": 0.12941306744596956, + "grad_norm": 0.89453125, + "learning_rate": 0.00019594369293369606, + "loss": 0.9447, + "step": 5040 + }, + { + "epoch": 0.1294387446418914, + "grad_norm": 0.84765625, + "learning_rate": 0.00019594243427537155, + "loss": 1.1884, + "step": 5041 + }, + { + "epoch": 0.1294644218378132, + "grad_norm": 0.8515625, + "learning_rate": 0.00019594117542584246, + "loss": 1.1712, + "step": 5042 + }, + { + "epoch": 0.129490099033735, + "grad_norm": 0.81640625, + "learning_rate": 0.00019593991638511128, + "loss": 1.1149, + "step": 5043 + }, + { + "epoch": 0.12951577622965685, + "grad_norm": 0.87890625, + "learning_rate": 0.0001959386571531805, + "loss": 1.2869, + "step": 5044 + }, + { + "epoch": 0.12954145342557866, + "grad_norm": 0.84765625, + "learning_rate": 0.00019593739773005262, + "loss": 1.2338, + "step": 5045 + }, + { + "epoch": 0.1295671306215005, + "grad_norm": 0.9921875, + "learning_rate": 0.0001959361381157302, + "loss": 1.0597, + "step": 5046 + }, + { + "epoch": 0.1295928078174223, + "grad_norm": 0.8203125, + "learning_rate": 0.00019593487831021572, + "loss": 1.0304, + "step": 5047 + }, + { + "epoch": 0.1296184850133441, + "grad_norm": 0.921875, + "learning_rate": 0.0001959336183135117, + "loss": 1.2443, + "step": 5048 + }, + { + "epoch": 0.12964416220926595, + "grad_norm": 0.8125, + "learning_rate": 0.00019593235812562064, + "loss": 1.1094, + "step": 5049 + }, + { + "epoch": 0.12966983940518775, + "grad_norm": 0.94921875, + "learning_rate": 0.00019593109774654503, + "loss": 1.2836, + "step": 5050 + }, + { + "epoch": 0.1296955166011096, + "grad_norm": 0.88671875, + "learning_rate": 0.00019592983717628746, + "loss": 1.186, + "step": 5051 + }, + { + "epoch": 0.1297211937970314, + "grad_norm": 0.80078125, + "learning_rate": 0.00019592857641485037, + "loss": 1.1957, + "step": 5052 + }, + { + "epoch": 0.1297468709929532, + "grad_norm": 0.83203125, + "learning_rate": 0.00019592731546223626, + "loss": 1.0788, + "step": 5053 + }, + { + "epoch": 0.12977254818887504, + "grad_norm": 0.75390625, + "learning_rate": 0.0001959260543184477, + "loss": 1.2097, + "step": 5054 + }, + { + "epoch": 0.12979822538479685, + "grad_norm": 1.375, + "learning_rate": 0.0001959247929834872, + "loss": 1.0955, + "step": 5055 + }, + { + "epoch": 0.1298239025807187, + "grad_norm": 0.83203125, + "learning_rate": 0.0001959235314573572, + "loss": 1.1163, + "step": 5056 + }, + { + "epoch": 0.1298495797766405, + "grad_norm": 0.89453125, + "learning_rate": 0.00019592226974006032, + "loss": 1.2465, + "step": 5057 + }, + { + "epoch": 0.1298752569725623, + "grad_norm": 0.859375, + "learning_rate": 0.000195921007831599, + "loss": 1.0545, + "step": 5058 + }, + { + "epoch": 0.12990093416848414, + "grad_norm": 0.8671875, + "learning_rate": 0.00019591974573197573, + "loss": 1.1332, + "step": 5059 + }, + { + "epoch": 0.12992661136440595, + "grad_norm": 1.0, + "learning_rate": 0.00019591848344119311, + "loss": 1.0457, + "step": 5060 + }, + { + "epoch": 0.12995228856032776, + "grad_norm": 0.828125, + "learning_rate": 0.00019591722095925365, + "loss": 1.0803, + "step": 5061 + }, + { + "epoch": 0.1299779657562496, + "grad_norm": 0.90234375, + "learning_rate": 0.00019591595828615977, + "loss": 1.0663, + "step": 5062 + }, + { + "epoch": 0.1300036429521714, + "grad_norm": 0.86328125, + "learning_rate": 0.00019591469542191411, + "loss": 1.2551, + "step": 5063 + }, + { + "epoch": 0.13002932014809324, + "grad_norm": 0.77734375, + "learning_rate": 0.00019591343236651909, + "loss": 1.0218, + "step": 5064 + }, + { + "epoch": 0.13005499734401504, + "grad_norm": 0.828125, + "learning_rate": 0.00019591216911997726, + "loss": 1.1068, + "step": 5065 + }, + { + "epoch": 0.13008067453993685, + "grad_norm": 0.80078125, + "learning_rate": 0.00019591090568229117, + "loss": 1.0894, + "step": 5066 + }, + { + "epoch": 0.1301063517358587, + "grad_norm": 0.76171875, + "learning_rate": 0.00019590964205346326, + "loss": 1.0297, + "step": 5067 + }, + { + "epoch": 0.1301320289317805, + "grad_norm": 0.828125, + "learning_rate": 0.00019590837823349615, + "loss": 1.0539, + "step": 5068 + }, + { + "epoch": 0.13015770612770233, + "grad_norm": 0.875, + "learning_rate": 0.00019590711422239228, + "loss": 1.1416, + "step": 5069 + }, + { + "epoch": 0.13018338332362414, + "grad_norm": 0.88671875, + "learning_rate": 0.00019590585002015421, + "loss": 1.1291, + "step": 5070 + }, + { + "epoch": 0.13020906051954595, + "grad_norm": 0.859375, + "learning_rate": 0.00019590458562678445, + "loss": 1.2295, + "step": 5071 + }, + { + "epoch": 0.13023473771546779, + "grad_norm": 0.90625, + "learning_rate": 0.00019590332104228546, + "loss": 1.1084, + "step": 5072 + }, + { + "epoch": 0.1302604149113896, + "grad_norm": 0.796875, + "learning_rate": 0.00019590205626665984, + "loss": 1.2252, + "step": 5073 + }, + { + "epoch": 0.13028609210731143, + "grad_norm": 0.8671875, + "learning_rate": 0.0001959007912999101, + "loss": 1.1078, + "step": 5074 + }, + { + "epoch": 0.13031176930323324, + "grad_norm": 0.79296875, + "learning_rate": 0.00019589952614203874, + "loss": 1.1248, + "step": 5075 + }, + { + "epoch": 0.13033744649915505, + "grad_norm": 0.86328125, + "learning_rate": 0.0001958982607930483, + "loss": 1.039, + "step": 5076 + }, + { + "epoch": 0.13036312369507688, + "grad_norm": 0.90234375, + "learning_rate": 0.00019589699525294127, + "loss": 1.1286, + "step": 5077 + }, + { + "epoch": 0.1303888008909987, + "grad_norm": 0.84765625, + "learning_rate": 0.0001958957295217202, + "loss": 1.1536, + "step": 5078 + }, + { + "epoch": 0.13041447808692053, + "grad_norm": 0.875, + "learning_rate": 0.00019589446359938762, + "loss": 1.2547, + "step": 5079 + }, + { + "epoch": 0.13044015528284233, + "grad_norm": 0.765625, + "learning_rate": 0.00019589319748594602, + "loss": 1.1203, + "step": 5080 + }, + { + "epoch": 0.13046583247876414, + "grad_norm": 0.90234375, + "learning_rate": 0.0001958919311813979, + "loss": 1.0358, + "step": 5081 + }, + { + "epoch": 0.13049150967468598, + "grad_norm": 0.87109375, + "learning_rate": 0.00019589066468574587, + "loss": 1.2583, + "step": 5082 + }, + { + "epoch": 0.1305171868706078, + "grad_norm": 0.875, + "learning_rate": 0.00019588939799899239, + "loss": 1.1808, + "step": 5083 + }, + { + "epoch": 0.13054286406652962, + "grad_norm": 0.8203125, + "learning_rate": 0.00019588813112114, + "loss": 1.0646, + "step": 5084 + }, + { + "epoch": 0.13056854126245143, + "grad_norm": 0.8125, + "learning_rate": 0.00019588686405219122, + "loss": 1.1804, + "step": 5085 + }, + { + "epoch": 0.13059421845837324, + "grad_norm": 0.85546875, + "learning_rate": 0.00019588559679214863, + "loss": 1.0969, + "step": 5086 + }, + { + "epoch": 0.13061989565429508, + "grad_norm": 0.80859375, + "learning_rate": 0.00019588432934101465, + "loss": 1.0216, + "step": 5087 + }, + { + "epoch": 0.13064557285021688, + "grad_norm": 0.82421875, + "learning_rate": 0.0001958830616987919, + "loss": 0.9896, + "step": 5088 + }, + { + "epoch": 0.13067125004613872, + "grad_norm": 0.9296875, + "learning_rate": 0.00019588179386548284, + "loss": 1.0272, + "step": 5089 + }, + { + "epoch": 0.13069692724206053, + "grad_norm": 0.80859375, + "learning_rate": 0.00019588052584109005, + "loss": 1.0275, + "step": 5090 + }, + { + "epoch": 0.13072260443798234, + "grad_norm": 0.80078125, + "learning_rate": 0.000195879257625616, + "loss": 1.1623, + "step": 5091 + }, + { + "epoch": 0.13074828163390417, + "grad_norm": 0.875, + "learning_rate": 0.00019587798921906328, + "loss": 1.2704, + "step": 5092 + }, + { + "epoch": 0.13077395882982598, + "grad_norm": 0.80859375, + "learning_rate": 0.00019587672062143437, + "loss": 1.13, + "step": 5093 + }, + { + "epoch": 0.13079963602574782, + "grad_norm": 0.78125, + "learning_rate": 0.00019587545183273185, + "loss": 1.0334, + "step": 5094 + }, + { + "epoch": 0.13082531322166963, + "grad_norm": 0.828125, + "learning_rate": 0.0001958741828529582, + "loss": 1.0931, + "step": 5095 + }, + { + "epoch": 0.13085099041759143, + "grad_norm": 0.80078125, + "learning_rate": 0.00019587291368211593, + "loss": 1.1827, + "step": 5096 + }, + { + "epoch": 0.13087666761351327, + "grad_norm": 0.85546875, + "learning_rate": 0.00019587164432020764, + "loss": 1.1037, + "step": 5097 + }, + { + "epoch": 0.13090234480943508, + "grad_norm": 0.86328125, + "learning_rate": 0.00019587037476723583, + "loss": 1.1701, + "step": 5098 + }, + { + "epoch": 0.13092802200535691, + "grad_norm": 0.83984375, + "learning_rate": 0.00019586910502320296, + "loss": 1.15, + "step": 5099 + }, + { + "epoch": 0.13095369920127872, + "grad_norm": 0.83984375, + "learning_rate": 0.0001958678350881117, + "loss": 1.0661, + "step": 5100 + }, + { + "epoch": 0.13097937639720053, + "grad_norm": 0.9140625, + "learning_rate": 0.00019586656496196447, + "loss": 1.1498, + "step": 5101 + }, + { + "epoch": 0.13100505359312237, + "grad_norm": 0.9375, + "learning_rate": 0.00019586529464476384, + "loss": 1.0938, + "step": 5102 + }, + { + "epoch": 0.13103073078904418, + "grad_norm": 2.125, + "learning_rate": 0.00019586402413651234, + "loss": 1.1892, + "step": 5103 + }, + { + "epoch": 0.131056407984966, + "grad_norm": 1.0859375, + "learning_rate": 0.00019586275343721248, + "loss": 1.0318, + "step": 5104 + }, + { + "epoch": 0.13108208518088782, + "grad_norm": 0.859375, + "learning_rate": 0.00019586148254686683, + "loss": 1.1539, + "step": 5105 + }, + { + "epoch": 0.13110776237680963, + "grad_norm": 1.2109375, + "learning_rate": 0.0001958602114654779, + "loss": 1.0735, + "step": 5106 + }, + { + "epoch": 0.13113343957273146, + "grad_norm": 0.8828125, + "learning_rate": 0.00019585894019304823, + "loss": 0.9837, + "step": 5107 + }, + { + "epoch": 0.13115911676865327, + "grad_norm": 0.7890625, + "learning_rate": 0.00019585766872958033, + "loss": 1.0502, + "step": 5108 + }, + { + "epoch": 0.1311847939645751, + "grad_norm": 0.8359375, + "learning_rate": 0.0001958563970750768, + "loss": 0.9333, + "step": 5109 + }, + { + "epoch": 0.13121047116049692, + "grad_norm": 0.84375, + "learning_rate": 0.0001958551252295401, + "loss": 1.0465, + "step": 5110 + }, + { + "epoch": 0.13123614835641872, + "grad_norm": 1.0703125, + "learning_rate": 0.00019585385319297277, + "loss": 0.9225, + "step": 5111 + }, + { + "epoch": 0.13126182555234056, + "grad_norm": 0.890625, + "learning_rate": 0.00019585258096537742, + "loss": 1.1357, + "step": 5112 + }, + { + "epoch": 0.13128750274826237, + "grad_norm": 0.92578125, + "learning_rate": 0.0001958513085467565, + "loss": 1.1693, + "step": 5113 + }, + { + "epoch": 0.1313131799441842, + "grad_norm": 0.921875, + "learning_rate": 0.0001958500359371126, + "loss": 1.1308, + "step": 5114 + }, + { + "epoch": 0.131338857140106, + "grad_norm": 0.87109375, + "learning_rate": 0.00019584876313644823, + "loss": 1.1126, + "step": 5115 + }, + { + "epoch": 0.13136453433602782, + "grad_norm": 0.81640625, + "learning_rate": 0.00019584749014476592, + "loss": 1.1646, + "step": 5116 + }, + { + "epoch": 0.13139021153194966, + "grad_norm": 7.5625, + "learning_rate": 0.00019584621696206825, + "loss": 1.0343, + "step": 5117 + }, + { + "epoch": 0.13141588872787147, + "grad_norm": 0.81640625, + "learning_rate": 0.00019584494358835768, + "loss": 1.0828, + "step": 5118 + }, + { + "epoch": 0.1314415659237933, + "grad_norm": 0.8359375, + "learning_rate": 0.00019584367002363685, + "loss": 1.185, + "step": 5119 + }, + { + "epoch": 0.1314672431197151, + "grad_norm": 0.82421875, + "learning_rate": 0.00019584239626790822, + "loss": 1.18, + "step": 5120 + }, + { + "epoch": 0.13149292031563692, + "grad_norm": 0.9375, + "learning_rate": 0.00019584112232117433, + "loss": 0.9678, + "step": 5121 + }, + { + "epoch": 0.13151859751155875, + "grad_norm": 1.109375, + "learning_rate": 0.00019583984818343777, + "loss": 1.1613, + "step": 5122 + }, + { + "epoch": 0.13154427470748056, + "grad_norm": 1.0390625, + "learning_rate": 0.000195838573854701, + "loss": 1.1942, + "step": 5123 + }, + { + "epoch": 0.1315699519034024, + "grad_norm": 0.8359375, + "learning_rate": 0.00019583729933496667, + "loss": 1.2156, + "step": 5124 + }, + { + "epoch": 0.1315956290993242, + "grad_norm": 0.765625, + "learning_rate": 0.00019583602462423723, + "loss": 1.2716, + "step": 5125 + }, + { + "epoch": 0.13162130629524602, + "grad_norm": 0.83984375, + "learning_rate": 0.00019583474972251525, + "loss": 1.1275, + "step": 5126 + }, + { + "epoch": 0.13164698349116785, + "grad_norm": 0.8203125, + "learning_rate": 0.0001958334746298033, + "loss": 1.1052, + "step": 5127 + }, + { + "epoch": 0.13167266068708966, + "grad_norm": 0.90625, + "learning_rate": 0.00019583219934610386, + "loss": 1.1944, + "step": 5128 + }, + { + "epoch": 0.1316983378830115, + "grad_norm": 0.90234375, + "learning_rate": 0.0001958309238714195, + "loss": 1.1052, + "step": 5129 + }, + { + "epoch": 0.1317240150789333, + "grad_norm": 0.87109375, + "learning_rate": 0.0001958296482057528, + "loss": 1.1581, + "step": 5130 + }, + { + "epoch": 0.1317496922748551, + "grad_norm": 0.796875, + "learning_rate": 0.00019582837234910624, + "loss": 1.2337, + "step": 5131 + }, + { + "epoch": 0.13177536947077695, + "grad_norm": 0.95703125, + "learning_rate": 0.00019582709630148237, + "loss": 1.1687, + "step": 5132 + }, + { + "epoch": 0.13180104666669876, + "grad_norm": 0.84765625, + "learning_rate": 0.00019582582006288379, + "loss": 1.147, + "step": 5133 + }, + { + "epoch": 0.1318267238626206, + "grad_norm": 0.96484375, + "learning_rate": 0.000195824543633313, + "loss": 1.1434, + "step": 5134 + }, + { + "epoch": 0.1318524010585424, + "grad_norm": 0.8203125, + "learning_rate": 0.00019582326701277254, + "loss": 1.0363, + "step": 5135 + }, + { + "epoch": 0.1318780782544642, + "grad_norm": 0.83984375, + "learning_rate": 0.00019582199020126497, + "loss": 1.175, + "step": 5136 + }, + { + "epoch": 0.13190375545038605, + "grad_norm": 0.81640625, + "learning_rate": 0.00019582071319879283, + "loss": 1.1337, + "step": 5137 + }, + { + "epoch": 0.13192943264630785, + "grad_norm": 0.8046875, + "learning_rate": 0.00019581943600535865, + "loss": 1.2228, + "step": 5138 + }, + { + "epoch": 0.1319551098422297, + "grad_norm": 0.81640625, + "learning_rate": 0.000195818158620965, + "loss": 1.0055, + "step": 5139 + }, + { + "epoch": 0.1319807870381515, + "grad_norm": 0.8125, + "learning_rate": 0.00019581688104561442, + "loss": 1.0445, + "step": 5140 + }, + { + "epoch": 0.1320064642340733, + "grad_norm": 0.86328125, + "learning_rate": 0.00019581560327930946, + "loss": 1.1624, + "step": 5141 + }, + { + "epoch": 0.13203214142999514, + "grad_norm": 0.87109375, + "learning_rate": 0.00019581432532205263, + "loss": 1.2063, + "step": 5142 + }, + { + "epoch": 0.13205781862591695, + "grad_norm": 0.890625, + "learning_rate": 0.00019581304717384654, + "loss": 1.1939, + "step": 5143 + }, + { + "epoch": 0.1320834958218388, + "grad_norm": 0.8046875, + "learning_rate": 0.00019581176883469366, + "loss": 0.9114, + "step": 5144 + }, + { + "epoch": 0.1321091730177606, + "grad_norm": 0.90625, + "learning_rate": 0.00019581049030459663, + "loss": 1.1243, + "step": 5145 + }, + { + "epoch": 0.1321348502136824, + "grad_norm": 0.9453125, + "learning_rate": 0.0001958092115835579, + "loss": 1.1779, + "step": 5146 + }, + { + "epoch": 0.13216052740960424, + "grad_norm": 0.984375, + "learning_rate": 0.0001958079326715801, + "loss": 1.084, + "step": 5147 + }, + { + "epoch": 0.13218620460552605, + "grad_norm": 0.8515625, + "learning_rate": 0.0001958066535686657, + "loss": 1.1193, + "step": 5148 + }, + { + "epoch": 0.13221188180144788, + "grad_norm": 0.8359375, + "learning_rate": 0.00019580537427481734, + "loss": 1.1418, + "step": 5149 + }, + { + "epoch": 0.1322375589973697, + "grad_norm": 0.87890625, + "learning_rate": 0.0001958040947900375, + "loss": 1.0332, + "step": 5150 + }, + { + "epoch": 0.1322632361932915, + "grad_norm": 0.88671875, + "learning_rate": 0.00019580281511432876, + "loss": 1.0631, + "step": 5151 + }, + { + "epoch": 0.13228891338921334, + "grad_norm": 0.84765625, + "learning_rate": 0.00019580153524769367, + "loss": 1.2038, + "step": 5152 + }, + { + "epoch": 0.13231459058513514, + "grad_norm": 0.8125, + "learning_rate": 0.00019580025519013475, + "loss": 0.9595, + "step": 5153 + }, + { + "epoch": 0.13234026778105698, + "grad_norm": 0.91015625, + "learning_rate": 0.0001957989749416546, + "loss": 1.1635, + "step": 5154 + }, + { + "epoch": 0.1323659449769788, + "grad_norm": 0.890625, + "learning_rate": 0.00019579769450225572, + "loss": 1.2595, + "step": 5155 + }, + { + "epoch": 0.1323916221729006, + "grad_norm": 0.76953125, + "learning_rate": 0.0001957964138719407, + "loss": 1.1027, + "step": 5156 + }, + { + "epoch": 0.13241729936882243, + "grad_norm": 0.82421875, + "learning_rate": 0.0001957951330507121, + "loss": 1.0621, + "step": 5157 + }, + { + "epoch": 0.13244297656474424, + "grad_norm": 0.92578125, + "learning_rate": 0.00019579385203857244, + "loss": 1.1161, + "step": 5158 + }, + { + "epoch": 0.13246865376066608, + "grad_norm": 0.91015625, + "learning_rate": 0.0001957925708355243, + "loss": 1.2299, + "step": 5159 + }, + { + "epoch": 0.13249433095658789, + "grad_norm": 0.96484375, + "learning_rate": 0.0001957912894415702, + "loss": 1.1955, + "step": 5160 + }, + { + "epoch": 0.1325200081525097, + "grad_norm": 0.8828125, + "learning_rate": 0.0001957900078567127, + "loss": 1.0943, + "step": 5161 + }, + { + "epoch": 0.13254568534843153, + "grad_norm": 0.91796875, + "learning_rate": 0.00019578872608095438, + "loss": 1.2024, + "step": 5162 + }, + { + "epoch": 0.13257136254435334, + "grad_norm": 0.84765625, + "learning_rate": 0.0001957874441142978, + "loss": 0.9645, + "step": 5163 + }, + { + "epoch": 0.13259703974027517, + "grad_norm": 0.89453125, + "learning_rate": 0.00019578616195674547, + "loss": 1.0736, + "step": 5164 + }, + { + "epoch": 0.13262271693619698, + "grad_norm": 0.859375, + "learning_rate": 0.0001957848796083, + "loss": 1.1656, + "step": 5165 + }, + { + "epoch": 0.1326483941321188, + "grad_norm": 0.86328125, + "learning_rate": 0.00019578359706896387, + "loss": 1.2073, + "step": 5166 + }, + { + "epoch": 0.13267407132804063, + "grad_norm": 1.0078125, + "learning_rate": 0.00019578231433873972, + "loss": 0.9344, + "step": 5167 + }, + { + "epoch": 0.13269974852396244, + "grad_norm": 0.875, + "learning_rate": 0.0001957810314176301, + "loss": 1.1713, + "step": 5168 + }, + { + "epoch": 0.13272542571988427, + "grad_norm": 0.86328125, + "learning_rate": 0.0001957797483056375, + "loss": 1.0037, + "step": 5169 + }, + { + "epoch": 0.13275110291580608, + "grad_norm": 0.87109375, + "learning_rate": 0.00019577846500276448, + "loss": 1.1763, + "step": 5170 + }, + { + "epoch": 0.1327767801117279, + "grad_norm": 0.90625, + "learning_rate": 0.00019577718150901366, + "loss": 1.026, + "step": 5171 + }, + { + "epoch": 0.13280245730764972, + "grad_norm": 0.83984375, + "learning_rate": 0.0001957758978243876, + "loss": 1.0096, + "step": 5172 + }, + { + "epoch": 0.13282813450357153, + "grad_norm": 0.85546875, + "learning_rate": 0.00019577461394888878, + "loss": 1.1129, + "step": 5173 + }, + { + "epoch": 0.13285381169949337, + "grad_norm": 0.85546875, + "learning_rate": 0.00019577332988251982, + "loss": 1.2449, + "step": 5174 + }, + { + "epoch": 0.13287948889541518, + "grad_norm": 0.78125, + "learning_rate": 0.00019577204562528326, + "loss": 1.0788, + "step": 5175 + }, + { + "epoch": 0.13290516609133698, + "grad_norm": 0.8125, + "learning_rate": 0.0001957707611771817, + "loss": 1.1226, + "step": 5176 + }, + { + "epoch": 0.13293084328725882, + "grad_norm": 1.0546875, + "learning_rate": 0.00019576947653821762, + "loss": 1.1135, + "step": 5177 + }, + { + "epoch": 0.13295652048318063, + "grad_norm": 0.890625, + "learning_rate": 0.00019576819170839365, + "loss": 1.0929, + "step": 5178 + }, + { + "epoch": 0.13298219767910247, + "grad_norm": 0.859375, + "learning_rate": 0.00019576690668771232, + "loss": 1.0637, + "step": 5179 + }, + { + "epoch": 0.13300787487502427, + "grad_norm": 0.8125, + "learning_rate": 0.0001957656214761762, + "loss": 1.1208, + "step": 5180 + }, + { + "epoch": 0.13303355207094608, + "grad_norm": 0.875, + "learning_rate": 0.00019576433607378782, + "loss": 1.1096, + "step": 5181 + }, + { + "epoch": 0.13305922926686792, + "grad_norm": 0.85546875, + "learning_rate": 0.00019576305048054978, + "loss": 1.1014, + "step": 5182 + }, + { + "epoch": 0.13308490646278973, + "grad_norm": 0.8515625, + "learning_rate": 0.00019576176469646462, + "loss": 1.0457, + "step": 5183 + }, + { + "epoch": 0.13311058365871156, + "grad_norm": 0.91796875, + "learning_rate": 0.00019576047872153494, + "loss": 1.2353, + "step": 5184 + }, + { + "epoch": 0.13313626085463337, + "grad_norm": 0.97265625, + "learning_rate": 0.00019575919255576327, + "loss": 1.0463, + "step": 5185 + }, + { + "epoch": 0.13316193805055518, + "grad_norm": 0.8359375, + "learning_rate": 0.00019575790619915217, + "loss": 1.0417, + "step": 5186 + }, + { + "epoch": 0.13318761524647701, + "grad_norm": 0.96875, + "learning_rate": 0.00019575661965170422, + "loss": 1.0662, + "step": 5187 + }, + { + "epoch": 0.13321329244239882, + "grad_norm": 1.0546875, + "learning_rate": 0.000195755332913422, + "loss": 1.2509, + "step": 5188 + }, + { + "epoch": 0.13323896963832066, + "grad_norm": 0.84765625, + "learning_rate": 0.00019575404598430802, + "loss": 1.0449, + "step": 5189 + }, + { + "epoch": 0.13326464683424247, + "grad_norm": 1.46875, + "learning_rate": 0.0001957527588643649, + "loss": 1.0214, + "step": 5190 + }, + { + "epoch": 0.13329032403016428, + "grad_norm": 1.0078125, + "learning_rate": 0.00019575147155359514, + "loss": 1.148, + "step": 5191 + }, + { + "epoch": 0.1333160012260861, + "grad_norm": 0.84765625, + "learning_rate": 0.00019575018405200138, + "loss": 1.0627, + "step": 5192 + }, + { + "epoch": 0.13334167842200792, + "grad_norm": 0.90234375, + "learning_rate": 0.00019574889635958612, + "loss": 1.2881, + "step": 5193 + }, + { + "epoch": 0.13336735561792976, + "grad_norm": 0.82421875, + "learning_rate": 0.000195747608476352, + "loss": 1.1318, + "step": 5194 + }, + { + "epoch": 0.13339303281385156, + "grad_norm": 0.87109375, + "learning_rate": 0.0001957463204023015, + "loss": 1.1032, + "step": 5195 + }, + { + "epoch": 0.13341871000977337, + "grad_norm": 0.8515625, + "learning_rate": 0.00019574503213743726, + "loss": 1.0418, + "step": 5196 + }, + { + "epoch": 0.1334443872056952, + "grad_norm": 0.90234375, + "learning_rate": 0.00019574374368176183, + "loss": 0.9743, + "step": 5197 + }, + { + "epoch": 0.13347006440161702, + "grad_norm": 0.8828125, + "learning_rate": 0.00019574245503527777, + "loss": 0.9677, + "step": 5198 + }, + { + "epoch": 0.13349574159753885, + "grad_norm": 0.87109375, + "learning_rate": 0.00019574116619798762, + "loss": 1.1408, + "step": 5199 + }, + { + "epoch": 0.13352141879346066, + "grad_norm": 0.875, + "learning_rate": 0.000195739877169894, + "loss": 1.3369, + "step": 5200 + }, + { + "epoch": 0.13354709598938247, + "grad_norm": 0.84765625, + "learning_rate": 0.00019573858795099943, + "loss": 1.1653, + "step": 5201 + }, + { + "epoch": 0.1335727731853043, + "grad_norm": 0.78515625, + "learning_rate": 0.0001957372985413065, + "loss": 1.0455, + "step": 5202 + }, + { + "epoch": 0.1335984503812261, + "grad_norm": 0.859375, + "learning_rate": 0.0001957360089408178, + "loss": 1.0755, + "step": 5203 + }, + { + "epoch": 0.13362412757714795, + "grad_norm": 0.8046875, + "learning_rate": 0.00019573471914953588, + "loss": 0.993, + "step": 5204 + }, + { + "epoch": 0.13364980477306976, + "grad_norm": 0.80078125, + "learning_rate": 0.00019573342916746328, + "loss": 1.1053, + "step": 5205 + }, + { + "epoch": 0.13367548196899157, + "grad_norm": 0.83203125, + "learning_rate": 0.00019573213899460264, + "loss": 1.149, + "step": 5206 + }, + { + "epoch": 0.1337011591649134, + "grad_norm": 0.7734375, + "learning_rate": 0.00019573084863095647, + "loss": 1.117, + "step": 5207 + }, + { + "epoch": 0.1337268363608352, + "grad_norm": 0.84375, + "learning_rate": 0.0001957295580765274, + "loss": 1.1055, + "step": 5208 + }, + { + "epoch": 0.13375251355675705, + "grad_norm": 0.84375, + "learning_rate": 0.00019572826733131794, + "loss": 1.2711, + "step": 5209 + }, + { + "epoch": 0.13377819075267885, + "grad_norm": 0.93359375, + "learning_rate": 0.00019572697639533068, + "loss": 1.1236, + "step": 5210 + }, + { + "epoch": 0.13380386794860066, + "grad_norm": 0.8515625, + "learning_rate": 0.00019572568526856824, + "loss": 1.1375, + "step": 5211 + }, + { + "epoch": 0.1338295451445225, + "grad_norm": 0.89453125, + "learning_rate": 0.00019572439395103313, + "loss": 1.0199, + "step": 5212 + }, + { + "epoch": 0.1338552223404443, + "grad_norm": 0.78515625, + "learning_rate": 0.00019572310244272796, + "loss": 1.0353, + "step": 5213 + }, + { + "epoch": 0.13388089953636614, + "grad_norm": 0.8671875, + "learning_rate": 0.00019572181074365528, + "loss": 1.0148, + "step": 5214 + }, + { + "epoch": 0.13390657673228795, + "grad_norm": 0.9296875, + "learning_rate": 0.0001957205188538177, + "loss": 1.1784, + "step": 5215 + }, + { + "epoch": 0.13393225392820976, + "grad_norm": 0.875, + "learning_rate": 0.00019571922677321776, + "loss": 1.1138, + "step": 5216 + }, + { + "epoch": 0.1339579311241316, + "grad_norm": 0.85546875, + "learning_rate": 0.00019571793450185804, + "loss": 1.1439, + "step": 5217 + }, + { + "epoch": 0.1339836083200534, + "grad_norm": 0.81640625, + "learning_rate": 0.0001957166420397411, + "loss": 1.1541, + "step": 5218 + }, + { + "epoch": 0.13400928551597524, + "grad_norm": 0.85546875, + "learning_rate": 0.0001957153493868696, + "loss": 1.1751, + "step": 5219 + }, + { + "epoch": 0.13403496271189705, + "grad_norm": 0.83203125, + "learning_rate": 0.00019571405654324601, + "loss": 1.0572, + "step": 5220 + }, + { + "epoch": 0.13406063990781886, + "grad_norm": 0.828125, + "learning_rate": 0.00019571276350887295, + "loss": 1.1265, + "step": 5221 + }, + { + "epoch": 0.1340863171037407, + "grad_norm": 0.765625, + "learning_rate": 0.00019571147028375302, + "loss": 1.1137, + "step": 5222 + }, + { + "epoch": 0.1341119942996625, + "grad_norm": 0.8671875, + "learning_rate": 0.00019571017686788878, + "loss": 1.0535, + "step": 5223 + }, + { + "epoch": 0.13413767149558434, + "grad_norm": 0.87890625, + "learning_rate": 0.00019570888326128278, + "loss": 1.1433, + "step": 5224 + }, + { + "epoch": 0.13416334869150615, + "grad_norm": 0.8359375, + "learning_rate": 0.00019570758946393764, + "loss": 1.1044, + "step": 5225 + }, + { + "epoch": 0.13418902588742795, + "grad_norm": 0.859375, + "learning_rate": 0.0001957062954758559, + "loss": 1.1386, + "step": 5226 + }, + { + "epoch": 0.1342147030833498, + "grad_norm": 0.8671875, + "learning_rate": 0.0001957050012970402, + "loss": 1.1625, + "step": 5227 + }, + { + "epoch": 0.1342403802792716, + "grad_norm": 0.8203125, + "learning_rate": 0.00019570370692749303, + "loss": 1.2971, + "step": 5228 + }, + { + "epoch": 0.13426605747519343, + "grad_norm": 0.8515625, + "learning_rate": 0.00019570241236721702, + "loss": 1.0787, + "step": 5229 + }, + { + "epoch": 0.13429173467111524, + "grad_norm": 0.80078125, + "learning_rate": 0.00019570111761621476, + "loss": 1.1955, + "step": 5230 + }, + { + "epoch": 0.13431741186703705, + "grad_norm": 0.85546875, + "learning_rate": 0.00019569982267448883, + "loss": 1.0607, + "step": 5231 + }, + { + "epoch": 0.1343430890629589, + "grad_norm": 0.83984375, + "learning_rate": 0.00019569852754204177, + "loss": 1.0817, + "step": 5232 + }, + { + "epoch": 0.1343687662588807, + "grad_norm": 0.84765625, + "learning_rate": 0.0001956972322188762, + "loss": 1.0206, + "step": 5233 + }, + { + "epoch": 0.13439444345480253, + "grad_norm": 0.82421875, + "learning_rate": 0.0001956959367049947, + "loss": 1.0286, + "step": 5234 + }, + { + "epoch": 0.13442012065072434, + "grad_norm": 0.8359375, + "learning_rate": 0.00019569464100039984, + "loss": 1.0387, + "step": 5235 + }, + { + "epoch": 0.13444579784664615, + "grad_norm": 0.875, + "learning_rate": 0.0001956933451050942, + "loss": 1.1679, + "step": 5236 + }, + { + "epoch": 0.13447147504256798, + "grad_norm": 0.8984375, + "learning_rate": 0.00019569204901908038, + "loss": 1.1502, + "step": 5237 + }, + { + "epoch": 0.1344971522384898, + "grad_norm": 0.83203125, + "learning_rate": 0.00019569075274236095, + "loss": 1.2262, + "step": 5238 + }, + { + "epoch": 0.13452282943441163, + "grad_norm": 0.84765625, + "learning_rate": 0.00019568945627493846, + "loss": 1.141, + "step": 5239 + }, + { + "epoch": 0.13454850663033344, + "grad_norm": 0.7734375, + "learning_rate": 0.00019568815961681555, + "loss": 0.9807, + "step": 5240 + }, + { + "epoch": 0.13457418382625524, + "grad_norm": 0.75, + "learning_rate": 0.00019568686276799478, + "loss": 0.9869, + "step": 5241 + }, + { + "epoch": 0.13459986102217708, + "grad_norm": 0.90234375, + "learning_rate": 0.00019568556572847876, + "loss": 1.3204, + "step": 5242 + }, + { + "epoch": 0.1346255382180989, + "grad_norm": 0.859375, + "learning_rate": 0.00019568426849826998, + "loss": 1.1125, + "step": 5243 + }, + { + "epoch": 0.13465121541402073, + "grad_norm": 0.80859375, + "learning_rate": 0.00019568297107737119, + "loss": 1.0567, + "step": 5244 + }, + { + "epoch": 0.13467689260994253, + "grad_norm": 1.0703125, + "learning_rate": 0.00019568167346578483, + "loss": 1.1174, + "step": 5245 + }, + { + "epoch": 0.13470256980586434, + "grad_norm": 0.83984375, + "learning_rate": 0.00019568037566351353, + "loss": 1.1333, + "step": 5246 + }, + { + "epoch": 0.13472824700178618, + "grad_norm": 0.796875, + "learning_rate": 0.00019567907767055988, + "loss": 0.9702, + "step": 5247 + }, + { + "epoch": 0.13475392419770799, + "grad_norm": 0.90625, + "learning_rate": 0.0001956777794869265, + "loss": 1.1605, + "step": 5248 + }, + { + "epoch": 0.13477960139362982, + "grad_norm": 0.86328125, + "learning_rate": 0.00019567648111261593, + "loss": 1.2818, + "step": 5249 + }, + { + "epoch": 0.13480527858955163, + "grad_norm": 0.80078125, + "learning_rate": 0.0001956751825476308, + "loss": 1.1231, + "step": 5250 + }, + { + "epoch": 0.13483095578547344, + "grad_norm": 0.88671875, + "learning_rate": 0.00019567388379197365, + "loss": 1.1095, + "step": 5251 + }, + { + "epoch": 0.13485663298139527, + "grad_norm": 0.76953125, + "learning_rate": 0.0001956725848456471, + "loss": 1.0195, + "step": 5252 + }, + { + "epoch": 0.13488231017731708, + "grad_norm": 0.83203125, + "learning_rate": 0.00019567128570865372, + "loss": 1.1228, + "step": 5253 + }, + { + "epoch": 0.13490798737323892, + "grad_norm": 0.828125, + "learning_rate": 0.00019566998638099613, + "loss": 1.2502, + "step": 5254 + }, + { + "epoch": 0.13493366456916073, + "grad_norm": 0.8359375, + "learning_rate": 0.00019566868686267687, + "loss": 1.1333, + "step": 5255 + }, + { + "epoch": 0.13495934176508254, + "grad_norm": 0.8515625, + "learning_rate": 0.00019566738715369857, + "loss": 1.1295, + "step": 5256 + }, + { + "epoch": 0.13498501896100437, + "grad_norm": 0.8515625, + "learning_rate": 0.00019566608725406383, + "loss": 1.1675, + "step": 5257 + }, + { + "epoch": 0.13501069615692618, + "grad_norm": 0.859375, + "learning_rate": 0.0001956647871637752, + "loss": 1.2221, + "step": 5258 + }, + { + "epoch": 0.13503637335284802, + "grad_norm": 0.875, + "learning_rate": 0.0001956634868828353, + "loss": 1.16, + "step": 5259 + }, + { + "epoch": 0.13506205054876982, + "grad_norm": 0.80078125, + "learning_rate": 0.0001956621864112467, + "loss": 1.0909, + "step": 5260 + }, + { + "epoch": 0.13508772774469163, + "grad_norm": 0.92578125, + "learning_rate": 0.000195660885749012, + "loss": 1.2334, + "step": 5261 + }, + { + "epoch": 0.13511340494061347, + "grad_norm": 0.83984375, + "learning_rate": 0.0001956595848961338, + "loss": 1.0181, + "step": 5262 + }, + { + "epoch": 0.13513908213653528, + "grad_norm": 0.8359375, + "learning_rate": 0.00019565828385261467, + "loss": 1.0997, + "step": 5263 + }, + { + "epoch": 0.1351647593324571, + "grad_norm": 0.84375, + "learning_rate": 0.00019565698261845728, + "loss": 1.0426, + "step": 5264 + }, + { + "epoch": 0.13519043652837892, + "grad_norm": 0.8125, + "learning_rate": 0.00019565568119366412, + "loss": 1.0119, + "step": 5265 + }, + { + "epoch": 0.13521611372430073, + "grad_norm": 0.890625, + "learning_rate": 0.0001956543795782378, + "loss": 1.1663, + "step": 5266 + }, + { + "epoch": 0.13524179092022257, + "grad_norm": 1.0625, + "learning_rate": 0.000195653077772181, + "loss": 1.0827, + "step": 5267 + }, + { + "epoch": 0.13526746811614437, + "grad_norm": 0.85546875, + "learning_rate": 0.00019565177577549625, + "loss": 1.0295, + "step": 5268 + }, + { + "epoch": 0.13529314531206618, + "grad_norm": 2.203125, + "learning_rate": 0.00019565047358818612, + "loss": 1.0522, + "step": 5269 + }, + { + "epoch": 0.13531882250798802, + "grad_norm": 0.8359375, + "learning_rate": 0.00019564917121025327, + "loss": 1.1734, + "step": 5270 + }, + { + "epoch": 0.13534449970390983, + "grad_norm": 0.83203125, + "learning_rate": 0.00019564786864170023, + "loss": 1.1503, + "step": 5271 + }, + { + "epoch": 0.13537017689983166, + "grad_norm": 0.80859375, + "learning_rate": 0.00019564656588252964, + "loss": 0.9812, + "step": 5272 + }, + { + "epoch": 0.13539585409575347, + "grad_norm": 0.78515625, + "learning_rate": 0.0001956452629327441, + "loss": 1.0963, + "step": 5273 + }, + { + "epoch": 0.13542153129167528, + "grad_norm": 0.765625, + "learning_rate": 0.00019564395979234614, + "loss": 0.9288, + "step": 5274 + }, + { + "epoch": 0.13544720848759711, + "grad_norm": 0.82421875, + "learning_rate": 0.0001956426564613385, + "loss": 0.9904, + "step": 5275 + }, + { + "epoch": 0.13547288568351892, + "grad_norm": 0.76171875, + "learning_rate": 0.0001956413529397236, + "loss": 1.0333, + "step": 5276 + }, + { + "epoch": 0.13549856287944076, + "grad_norm": 0.8671875, + "learning_rate": 0.00019564004922750417, + "loss": 1.1128, + "step": 5277 + }, + { + "epoch": 0.13552424007536257, + "grad_norm": 0.79296875, + "learning_rate": 0.0001956387453246827, + "loss": 1.0214, + "step": 5278 + }, + { + "epoch": 0.13554991727128438, + "grad_norm": 0.859375, + "learning_rate": 0.00019563744123126192, + "loss": 1.1385, + "step": 5279 + }, + { + "epoch": 0.1355755944672062, + "grad_norm": 0.95703125, + "learning_rate": 0.00019563613694724432, + "loss": 0.9838, + "step": 5280 + }, + { + "epoch": 0.13560127166312802, + "grad_norm": 0.828125, + "learning_rate": 0.00019563483247263258, + "loss": 1.014, + "step": 5281 + }, + { + "epoch": 0.13562694885904986, + "grad_norm": 0.91796875, + "learning_rate": 0.00019563352780742918, + "loss": 1.206, + "step": 5282 + }, + { + "epoch": 0.13565262605497166, + "grad_norm": 0.79296875, + "learning_rate": 0.0001956322229516369, + "loss": 0.98, + "step": 5283 + }, + { + "epoch": 0.13567830325089347, + "grad_norm": 0.84765625, + "learning_rate": 0.00019563091790525815, + "loss": 1.1209, + "step": 5284 + }, + { + "epoch": 0.1357039804468153, + "grad_norm": 0.8203125, + "learning_rate": 0.00019562961266829566, + "loss": 1.2444, + "step": 5285 + }, + { + "epoch": 0.13572965764273712, + "grad_norm": 0.84765625, + "learning_rate": 0.00019562830724075196, + "loss": 1.1827, + "step": 5286 + }, + { + "epoch": 0.13575533483865895, + "grad_norm": 0.76953125, + "learning_rate": 0.0001956270016226297, + "loss": 1.0943, + "step": 5287 + }, + { + "epoch": 0.13578101203458076, + "grad_norm": 0.9140625, + "learning_rate": 0.00019562569581393148, + "loss": 1.0363, + "step": 5288 + }, + { + "epoch": 0.13580668923050257, + "grad_norm": 0.90625, + "learning_rate": 0.00019562438981465986, + "loss": 1.2217, + "step": 5289 + }, + { + "epoch": 0.1358323664264244, + "grad_norm": 0.86328125, + "learning_rate": 0.0001956230836248175, + "loss": 1.005, + "step": 5290 + }, + { + "epoch": 0.13585804362234621, + "grad_norm": 0.921875, + "learning_rate": 0.00019562177724440693, + "loss": 1.2815, + "step": 5291 + }, + { + "epoch": 0.13588372081826805, + "grad_norm": 0.81640625, + "learning_rate": 0.00019562047067343081, + "loss": 1.0745, + "step": 5292 + }, + { + "epoch": 0.13590939801418986, + "grad_norm": 0.88671875, + "learning_rate": 0.00019561916391189175, + "loss": 1.1021, + "step": 5293 + }, + { + "epoch": 0.13593507521011167, + "grad_norm": 0.875, + "learning_rate": 0.0001956178569597923, + "loss": 1.2195, + "step": 5294 + }, + { + "epoch": 0.1359607524060335, + "grad_norm": 0.82421875, + "learning_rate": 0.00019561654981713512, + "loss": 1.1256, + "step": 5295 + }, + { + "epoch": 0.1359864296019553, + "grad_norm": 0.79296875, + "learning_rate": 0.0001956152424839228, + "loss": 1.0482, + "step": 5296 + }, + { + "epoch": 0.13601210679787715, + "grad_norm": 0.94140625, + "learning_rate": 0.00019561393496015793, + "loss": 1.164, + "step": 5297 + }, + { + "epoch": 0.13603778399379896, + "grad_norm": 1.0703125, + "learning_rate": 0.00019561262724584314, + "loss": 1.1224, + "step": 5298 + }, + { + "epoch": 0.13606346118972076, + "grad_norm": 1.234375, + "learning_rate": 0.000195611319340981, + "loss": 0.9536, + "step": 5299 + }, + { + "epoch": 0.1360891383856426, + "grad_norm": 0.89453125, + "learning_rate": 0.00019561001124557414, + "loss": 1.084, + "step": 5300 + }, + { + "epoch": 0.1361148155815644, + "grad_norm": 0.875, + "learning_rate": 0.0001956087029596252, + "loss": 1.0999, + "step": 5301 + }, + { + "epoch": 0.13614049277748624, + "grad_norm": 0.8984375, + "learning_rate": 0.0001956073944831367, + "loss": 1.1511, + "step": 5302 + }, + { + "epoch": 0.13616616997340805, + "grad_norm": 0.84765625, + "learning_rate": 0.00019560608581611132, + "loss": 1.1255, + "step": 5303 + }, + { + "epoch": 0.13619184716932986, + "grad_norm": 0.890625, + "learning_rate": 0.00019560477695855163, + "loss": 1.0949, + "step": 5304 + }, + { + "epoch": 0.1362175243652517, + "grad_norm": 0.84765625, + "learning_rate": 0.00019560346791046027, + "loss": 1.0007, + "step": 5305 + }, + { + "epoch": 0.1362432015611735, + "grad_norm": 0.83203125, + "learning_rate": 0.00019560215867183984, + "loss": 1.2987, + "step": 5306 + }, + { + "epoch": 0.13626887875709534, + "grad_norm": 0.8203125, + "learning_rate": 0.00019560084924269292, + "loss": 1.0661, + "step": 5307 + }, + { + "epoch": 0.13629455595301715, + "grad_norm": 0.84765625, + "learning_rate": 0.00019559953962302218, + "loss": 0.9379, + "step": 5308 + }, + { + "epoch": 0.13632023314893896, + "grad_norm": 0.859375, + "learning_rate": 0.00019559822981283018, + "loss": 1.2353, + "step": 5309 + }, + { + "epoch": 0.1363459103448608, + "grad_norm": 0.89453125, + "learning_rate": 0.0001955969198121195, + "loss": 1.154, + "step": 5310 + }, + { + "epoch": 0.1363715875407826, + "grad_norm": 0.8046875, + "learning_rate": 0.00019559560962089284, + "loss": 1.1146, + "step": 5311 + }, + { + "epoch": 0.13639726473670444, + "grad_norm": 0.82421875, + "learning_rate": 0.00019559429923915277, + "loss": 0.9957, + "step": 5312 + }, + { + "epoch": 0.13642294193262625, + "grad_norm": 0.8828125, + "learning_rate": 0.0001955929886669019, + "loss": 1.0216, + "step": 5313 + }, + { + "epoch": 0.13644861912854805, + "grad_norm": 0.8828125, + "learning_rate": 0.00019559167790414282, + "loss": 1.2073, + "step": 5314 + }, + { + "epoch": 0.1364742963244699, + "grad_norm": 0.78125, + "learning_rate": 0.00019559036695087815, + "loss": 1.0172, + "step": 5315 + }, + { + "epoch": 0.1364999735203917, + "grad_norm": 0.79296875, + "learning_rate": 0.0001955890558071105, + "loss": 1.1338, + "step": 5316 + }, + { + "epoch": 0.13652565071631353, + "grad_norm": 0.91796875, + "learning_rate": 0.00019558774447284253, + "loss": 1.2025, + "step": 5317 + }, + { + "epoch": 0.13655132791223534, + "grad_norm": 0.890625, + "learning_rate": 0.0001955864329480768, + "loss": 1.1881, + "step": 5318 + }, + { + "epoch": 0.13657700510815715, + "grad_norm": 0.828125, + "learning_rate": 0.000195585121232816, + "loss": 1.0948, + "step": 5319 + }, + { + "epoch": 0.136602682304079, + "grad_norm": 1.515625, + "learning_rate": 0.00019558380932706263, + "loss": 1.0107, + "step": 5320 + }, + { + "epoch": 0.1366283595000008, + "grad_norm": 0.875, + "learning_rate": 0.0001955824972308194, + "loss": 1.2289, + "step": 5321 + }, + { + "epoch": 0.13665403669592263, + "grad_norm": 0.875, + "learning_rate": 0.0001955811849440888, + "loss": 1.1472, + "step": 5322 + }, + { + "epoch": 0.13667971389184444, + "grad_norm": 0.86328125, + "learning_rate": 0.00019557987246687363, + "loss": 1.2444, + "step": 5323 + }, + { + "epoch": 0.13670539108776625, + "grad_norm": 0.98828125, + "learning_rate": 0.00019557855979917635, + "loss": 0.9302, + "step": 5324 + }, + { + "epoch": 0.13673106828368808, + "grad_norm": 0.828125, + "learning_rate": 0.0001955772469409997, + "loss": 1.1297, + "step": 5325 + }, + { + "epoch": 0.1367567454796099, + "grad_norm": 0.76953125, + "learning_rate": 0.00019557593389234613, + "loss": 0.9862, + "step": 5326 + }, + { + "epoch": 0.13678242267553173, + "grad_norm": 0.79296875, + "learning_rate": 0.00019557462065321844, + "loss": 1.1493, + "step": 5327 + }, + { + "epoch": 0.13680809987145354, + "grad_norm": 0.92578125, + "learning_rate": 0.00019557330722361912, + "loss": 1.0205, + "step": 5328 + }, + { + "epoch": 0.13683377706737535, + "grad_norm": 0.78125, + "learning_rate": 0.00019557199360355085, + "loss": 1.0462, + "step": 5329 + }, + { + "epoch": 0.13685945426329718, + "grad_norm": 0.8046875, + "learning_rate": 0.00019557067979301623, + "loss": 1.0353, + "step": 5330 + }, + { + "epoch": 0.136885131459219, + "grad_norm": 0.80078125, + "learning_rate": 0.0001955693657920179, + "loss": 1.0965, + "step": 5331 + }, + { + "epoch": 0.13691080865514083, + "grad_norm": 0.85546875, + "learning_rate": 0.00019556805160055843, + "loss": 1.0338, + "step": 5332 + }, + { + "epoch": 0.13693648585106263, + "grad_norm": 0.90625, + "learning_rate": 0.00019556673721864045, + "loss": 1.0623, + "step": 5333 + }, + { + "epoch": 0.13696216304698444, + "grad_norm": 0.8671875, + "learning_rate": 0.0001955654226462666, + "loss": 1.1404, + "step": 5334 + }, + { + "epoch": 0.13698784024290628, + "grad_norm": 0.86328125, + "learning_rate": 0.00019556410788343953, + "loss": 0.9778, + "step": 5335 + }, + { + "epoch": 0.1370135174388281, + "grad_norm": 0.87109375, + "learning_rate": 0.0001955627929301618, + "loss": 1.0776, + "step": 5336 + }, + { + "epoch": 0.13703919463474992, + "grad_norm": 0.8515625, + "learning_rate": 0.00019556147778643609, + "loss": 1.0616, + "step": 5337 + }, + { + "epoch": 0.13706487183067173, + "grad_norm": 0.80078125, + "learning_rate": 0.00019556016245226494, + "loss": 0.9994, + "step": 5338 + }, + { + "epoch": 0.13709054902659354, + "grad_norm": 0.890625, + "learning_rate": 0.00019555884692765103, + "loss": 1.1858, + "step": 5339 + }, + { + "epoch": 0.13711622622251537, + "grad_norm": 0.8359375, + "learning_rate": 0.00019555753121259698, + "loss": 1.2145, + "step": 5340 + }, + { + "epoch": 0.13714190341843718, + "grad_norm": 0.75390625, + "learning_rate": 0.00019555621530710537, + "loss": 1.0513, + "step": 5341 + }, + { + "epoch": 0.13716758061435902, + "grad_norm": 0.75390625, + "learning_rate": 0.0001955548992111789, + "loss": 1.1226, + "step": 5342 + }, + { + "epoch": 0.13719325781028083, + "grad_norm": 0.8046875, + "learning_rate": 0.00019555358292482012, + "loss": 1.0786, + "step": 5343 + }, + { + "epoch": 0.13721893500620264, + "grad_norm": 0.8671875, + "learning_rate": 0.00019555226644803168, + "loss": 1.0746, + "step": 5344 + }, + { + "epoch": 0.13724461220212447, + "grad_norm": 0.8359375, + "learning_rate": 0.00019555094978081623, + "loss": 1.1921, + "step": 5345 + }, + { + "epoch": 0.13727028939804628, + "grad_norm": 0.88671875, + "learning_rate": 0.00019554963292317635, + "loss": 1.1459, + "step": 5346 + }, + { + "epoch": 0.13729596659396812, + "grad_norm": 0.99609375, + "learning_rate": 0.00019554831587511467, + "loss": 1.0717, + "step": 5347 + }, + { + "epoch": 0.13732164378988992, + "grad_norm": 0.828125, + "learning_rate": 0.00019554699863663384, + "loss": 0.9574, + "step": 5348 + }, + { + "epoch": 0.13734732098581173, + "grad_norm": 0.86328125, + "learning_rate": 0.0001955456812077365, + "loss": 1.2309, + "step": 5349 + }, + { + "epoch": 0.13737299818173357, + "grad_norm": 0.875, + "learning_rate": 0.0001955443635884252, + "loss": 1.2162, + "step": 5350 + }, + { + "epoch": 0.13739867537765538, + "grad_norm": 0.86328125, + "learning_rate": 0.00019554304577870264, + "loss": 1.2666, + "step": 5351 + }, + { + "epoch": 0.1374243525735772, + "grad_norm": 0.8359375, + "learning_rate": 0.00019554172777857143, + "loss": 1.0966, + "step": 5352 + }, + { + "epoch": 0.13745002976949902, + "grad_norm": 0.8671875, + "learning_rate": 0.00019554040958803416, + "loss": 1.224, + "step": 5353 + }, + { + "epoch": 0.13747570696542083, + "grad_norm": 0.84765625, + "learning_rate": 0.00019553909120709347, + "loss": 1.1977, + "step": 5354 + }, + { + "epoch": 0.13750138416134267, + "grad_norm": 1.0390625, + "learning_rate": 0.00019553777263575206, + "loss": 1.0661, + "step": 5355 + }, + { + "epoch": 0.13752706135726447, + "grad_norm": 0.828125, + "learning_rate": 0.00019553645387401244, + "loss": 1.2073, + "step": 5356 + }, + { + "epoch": 0.1375527385531863, + "grad_norm": 0.83203125, + "learning_rate": 0.00019553513492187734, + "loss": 1.117, + "step": 5357 + }, + { + "epoch": 0.13757841574910812, + "grad_norm": 0.83984375, + "learning_rate": 0.00019553381577934933, + "loss": 1.2928, + "step": 5358 + }, + { + "epoch": 0.13760409294502993, + "grad_norm": 0.84375, + "learning_rate": 0.00019553249644643105, + "loss": 1.207, + "step": 5359 + }, + { + "epoch": 0.13762977014095176, + "grad_norm": 0.93359375, + "learning_rate": 0.00019553117692312514, + "loss": 0.9513, + "step": 5360 + }, + { + "epoch": 0.13765544733687357, + "grad_norm": 1.015625, + "learning_rate": 0.00019552985720943422, + "loss": 1.0005, + "step": 5361 + }, + { + "epoch": 0.1376811245327954, + "grad_norm": 0.84765625, + "learning_rate": 0.00019552853730536095, + "loss": 1.1137, + "step": 5362 + }, + { + "epoch": 0.13770680172871722, + "grad_norm": 0.8203125, + "learning_rate": 0.0001955272172109079, + "loss": 1.1863, + "step": 5363 + }, + { + "epoch": 0.13773247892463902, + "grad_norm": 0.8046875, + "learning_rate": 0.00019552589692607776, + "loss": 1.0071, + "step": 5364 + }, + { + "epoch": 0.13775815612056086, + "grad_norm": 0.8359375, + "learning_rate": 0.0001955245764508731, + "loss": 1.1096, + "step": 5365 + }, + { + "epoch": 0.13778383331648267, + "grad_norm": 0.83984375, + "learning_rate": 0.00019552325578529663, + "loss": 1.0189, + "step": 5366 + }, + { + "epoch": 0.1378095105124045, + "grad_norm": 0.79296875, + "learning_rate": 0.00019552193492935088, + "loss": 1.1871, + "step": 5367 + }, + { + "epoch": 0.1378351877083263, + "grad_norm": 0.859375, + "learning_rate": 0.00019552061388303858, + "loss": 1.1915, + "step": 5368 + }, + { + "epoch": 0.13786086490424812, + "grad_norm": 0.76171875, + "learning_rate": 0.00019551929264636234, + "loss": 1.0877, + "step": 5369 + }, + { + "epoch": 0.13788654210016996, + "grad_norm": 0.8671875, + "learning_rate": 0.00019551797121932478, + "loss": 1.1508, + "step": 5370 + }, + { + "epoch": 0.13791221929609176, + "grad_norm": 0.80078125, + "learning_rate": 0.0001955166496019285, + "loss": 1.1478, + "step": 5371 + }, + { + "epoch": 0.1379378964920136, + "grad_norm": 0.765625, + "learning_rate": 0.0001955153277941762, + "loss": 0.9269, + "step": 5372 + }, + { + "epoch": 0.1379635736879354, + "grad_norm": 0.82421875, + "learning_rate": 0.00019551400579607045, + "loss": 0.9681, + "step": 5373 + }, + { + "epoch": 0.13798925088385722, + "grad_norm": 0.8125, + "learning_rate": 0.0001955126836076139, + "loss": 1.1745, + "step": 5374 + }, + { + "epoch": 0.13801492807977905, + "grad_norm": 0.82421875, + "learning_rate": 0.00019551136122880925, + "loss": 1.1993, + "step": 5375 + }, + { + "epoch": 0.13804060527570086, + "grad_norm": 0.8203125, + "learning_rate": 0.00019551003865965905, + "loss": 1.1238, + "step": 5376 + }, + { + "epoch": 0.1380662824716227, + "grad_norm": 0.76953125, + "learning_rate": 0.00019550871590016597, + "loss": 1.0545, + "step": 5377 + }, + { + "epoch": 0.1380919596675445, + "grad_norm": 0.8203125, + "learning_rate": 0.00019550739295033266, + "loss": 1.0024, + "step": 5378 + }, + { + "epoch": 0.13811763686346631, + "grad_norm": 0.83203125, + "learning_rate": 0.00019550606981016174, + "loss": 1.079, + "step": 5379 + }, + { + "epoch": 0.13814331405938815, + "grad_norm": 0.82421875, + "learning_rate": 0.00019550474647965586, + "loss": 1.0615, + "step": 5380 + }, + { + "epoch": 0.13816899125530996, + "grad_norm": 0.8828125, + "learning_rate": 0.00019550342295881763, + "loss": 1.0768, + "step": 5381 + }, + { + "epoch": 0.1381946684512318, + "grad_norm": 0.8125, + "learning_rate": 0.00019550209924764972, + "loss": 0.9945, + "step": 5382 + }, + { + "epoch": 0.1382203456471536, + "grad_norm": 0.8671875, + "learning_rate": 0.00019550077534615472, + "loss": 1.2699, + "step": 5383 + }, + { + "epoch": 0.1382460228430754, + "grad_norm": 0.8125, + "learning_rate": 0.00019549945125433532, + "loss": 1.0786, + "step": 5384 + }, + { + "epoch": 0.13827170003899725, + "grad_norm": 0.78515625, + "learning_rate": 0.00019549812697219413, + "loss": 1.0885, + "step": 5385 + }, + { + "epoch": 0.13829737723491906, + "grad_norm": 0.77734375, + "learning_rate": 0.00019549680249973382, + "loss": 1.0153, + "step": 5386 + }, + { + "epoch": 0.1383230544308409, + "grad_norm": 0.83203125, + "learning_rate": 0.000195495477836957, + "loss": 1.0412, + "step": 5387 + }, + { + "epoch": 0.1383487316267627, + "grad_norm": 0.81640625, + "learning_rate": 0.00019549415298386635, + "loss": 1.0159, + "step": 5388 + }, + { + "epoch": 0.1383744088226845, + "grad_norm": 0.796875, + "learning_rate": 0.00019549282794046444, + "loss": 1.0028, + "step": 5389 + }, + { + "epoch": 0.13840008601860634, + "grad_norm": 0.875, + "learning_rate": 0.00019549150270675397, + "loss": 1.0898, + "step": 5390 + }, + { + "epoch": 0.13842576321452815, + "grad_norm": 0.8515625, + "learning_rate": 0.00019549017728273755, + "loss": 1.1698, + "step": 5391 + }, + { + "epoch": 0.13845144041045, + "grad_norm": 0.8203125, + "learning_rate": 0.00019548885166841782, + "loss": 1.0326, + "step": 5392 + }, + { + "epoch": 0.1384771176063718, + "grad_norm": 0.90234375, + "learning_rate": 0.00019548752586379746, + "loss": 1.247, + "step": 5393 + }, + { + "epoch": 0.1385027948022936, + "grad_norm": 0.85546875, + "learning_rate": 0.00019548619986887905, + "loss": 1.1687, + "step": 5394 + }, + { + "epoch": 0.13852847199821544, + "grad_norm": 0.8828125, + "learning_rate": 0.0001954848736836653, + "loss": 1.0303, + "step": 5395 + }, + { + "epoch": 0.13855414919413725, + "grad_norm": 0.8125, + "learning_rate": 0.00019548354730815882, + "loss": 1.0941, + "step": 5396 + }, + { + "epoch": 0.13857982639005909, + "grad_norm": 0.77734375, + "learning_rate": 0.00019548222074236224, + "loss": 1.0911, + "step": 5397 + }, + { + "epoch": 0.1386055035859809, + "grad_norm": 0.89453125, + "learning_rate": 0.00019548089398627823, + "loss": 1.22, + "step": 5398 + }, + { + "epoch": 0.1386311807819027, + "grad_norm": 0.8671875, + "learning_rate": 0.00019547956703990944, + "loss": 1.0926, + "step": 5399 + }, + { + "epoch": 0.13865685797782454, + "grad_norm": 0.8359375, + "learning_rate": 0.00019547823990325846, + "loss": 1.1337, + "step": 5400 + }, + { + "epoch": 0.13868253517374635, + "grad_norm": 0.921875, + "learning_rate": 0.00019547691257632802, + "loss": 1.1356, + "step": 5401 + }, + { + "epoch": 0.13870821236966818, + "grad_norm": 0.8359375, + "learning_rate": 0.00019547558505912064, + "loss": 1.0225, + "step": 5402 + }, + { + "epoch": 0.13873388956559, + "grad_norm": 0.91796875, + "learning_rate": 0.0001954742573516391, + "loss": 1.13, + "step": 5403 + }, + { + "epoch": 0.1387595667615118, + "grad_norm": 0.828125, + "learning_rate": 0.000195472929453886, + "loss": 1.1526, + "step": 5404 + }, + { + "epoch": 0.13878524395743363, + "grad_norm": 0.765625, + "learning_rate": 0.00019547160136586397, + "loss": 1.0308, + "step": 5405 + }, + { + "epoch": 0.13881092115335544, + "grad_norm": 0.828125, + "learning_rate": 0.0001954702730875756, + "loss": 1.0768, + "step": 5406 + }, + { + "epoch": 0.13883659834927728, + "grad_norm": 0.87109375, + "learning_rate": 0.00019546894461902367, + "loss": 1.1163, + "step": 5407 + }, + { + "epoch": 0.1388622755451991, + "grad_norm": 0.828125, + "learning_rate": 0.00019546761596021074, + "loss": 1.0551, + "step": 5408 + }, + { + "epoch": 0.1388879527411209, + "grad_norm": 0.7890625, + "learning_rate": 0.00019546628711113948, + "loss": 1.0128, + "step": 5409 + }, + { + "epoch": 0.13891362993704273, + "grad_norm": 0.90625, + "learning_rate": 0.00019546495807181252, + "loss": 1.1514, + "step": 5410 + }, + { + "epoch": 0.13893930713296454, + "grad_norm": 0.8828125, + "learning_rate": 0.0001954636288422325, + "loss": 1.1856, + "step": 5411 + }, + { + "epoch": 0.13896498432888638, + "grad_norm": 0.81640625, + "learning_rate": 0.00019546229942240216, + "loss": 1.0659, + "step": 5412 + }, + { + "epoch": 0.13899066152480818, + "grad_norm": 0.7890625, + "learning_rate": 0.00019546096981232404, + "loss": 1.1529, + "step": 5413 + }, + { + "epoch": 0.13901633872073, + "grad_norm": 0.8984375, + "learning_rate": 0.00019545964001200082, + "loss": 1.1262, + "step": 5414 + }, + { + "epoch": 0.13904201591665183, + "grad_norm": 0.81640625, + "learning_rate": 0.00019545831002143514, + "loss": 1.1008, + "step": 5415 + }, + { + "epoch": 0.13906769311257364, + "grad_norm": 0.85546875, + "learning_rate": 0.00019545697984062972, + "loss": 1.107, + "step": 5416 + }, + { + "epoch": 0.13909337030849547, + "grad_norm": 0.7734375, + "learning_rate": 0.0001954556494695871, + "loss": 0.9361, + "step": 5417 + }, + { + "epoch": 0.13911904750441728, + "grad_norm": 0.8046875, + "learning_rate": 0.00019545431890831004, + "loss": 1.0991, + "step": 5418 + }, + { + "epoch": 0.1391447247003391, + "grad_norm": 0.921875, + "learning_rate": 0.00019545298815680116, + "loss": 0.9903, + "step": 5419 + }, + { + "epoch": 0.13917040189626093, + "grad_norm": 0.88671875, + "learning_rate": 0.0001954516572150631, + "loss": 1.1735, + "step": 5420 + }, + { + "epoch": 0.13919607909218273, + "grad_norm": 0.984375, + "learning_rate": 0.00019545032608309846, + "loss": 1.1363, + "step": 5421 + }, + { + "epoch": 0.13922175628810457, + "grad_norm": 0.79296875, + "learning_rate": 0.00019544899476090996, + "loss": 1.0746, + "step": 5422 + }, + { + "epoch": 0.13924743348402638, + "grad_norm": 0.7890625, + "learning_rate": 0.00019544766324850024, + "loss": 1.2291, + "step": 5423 + }, + { + "epoch": 0.1392731106799482, + "grad_norm": 0.83984375, + "learning_rate": 0.00019544633154587197, + "loss": 0.9952, + "step": 5424 + }, + { + "epoch": 0.13929878787587002, + "grad_norm": 0.80078125, + "learning_rate": 0.00019544499965302775, + "loss": 1.1094, + "step": 5425 + }, + { + "epoch": 0.13932446507179183, + "grad_norm": 0.8828125, + "learning_rate": 0.00019544366756997028, + "loss": 1.0566, + "step": 5426 + }, + { + "epoch": 0.13935014226771367, + "grad_norm": 0.8125, + "learning_rate": 0.0001954423352967022, + "loss": 1.1296, + "step": 5427 + }, + { + "epoch": 0.13937581946363548, + "grad_norm": 0.88671875, + "learning_rate": 0.00019544100283322617, + "loss": 0.999, + "step": 5428 + }, + { + "epoch": 0.13940149665955728, + "grad_norm": 0.8828125, + "learning_rate": 0.00019543967017954486, + "loss": 1.0786, + "step": 5429 + }, + { + "epoch": 0.13942717385547912, + "grad_norm": 0.7265625, + "learning_rate": 0.00019543833733566088, + "loss": 0.9039, + "step": 5430 + }, + { + "epoch": 0.13945285105140093, + "grad_norm": 0.7890625, + "learning_rate": 0.0001954370043015769, + "loss": 1.0596, + "step": 5431 + }, + { + "epoch": 0.13947852824732276, + "grad_norm": 0.828125, + "learning_rate": 0.00019543567107729564, + "loss": 1.0116, + "step": 5432 + }, + { + "epoch": 0.13950420544324457, + "grad_norm": 0.82421875, + "learning_rate": 0.0001954343376628197, + "loss": 1.1336, + "step": 5433 + }, + { + "epoch": 0.13952988263916638, + "grad_norm": 0.76953125, + "learning_rate": 0.0001954330040581517, + "loss": 1.0049, + "step": 5434 + }, + { + "epoch": 0.13955555983508822, + "grad_norm": 0.83203125, + "learning_rate": 0.0001954316702632944, + "loss": 1.0605, + "step": 5435 + }, + { + "epoch": 0.13958123703101002, + "grad_norm": 0.81640625, + "learning_rate": 0.00019543033627825035, + "loss": 1.0209, + "step": 5436 + }, + { + "epoch": 0.13960691422693186, + "grad_norm": 0.93359375, + "learning_rate": 0.0001954290021030223, + "loss": 1.076, + "step": 5437 + }, + { + "epoch": 0.13963259142285367, + "grad_norm": 0.80859375, + "learning_rate": 0.00019542766773761284, + "loss": 0.9794, + "step": 5438 + }, + { + "epoch": 0.13965826861877548, + "grad_norm": 0.78125, + "learning_rate": 0.00019542633318202468, + "loss": 1.1235, + "step": 5439 + }, + { + "epoch": 0.1396839458146973, + "grad_norm": 0.828125, + "learning_rate": 0.00019542499843626043, + "loss": 1.2405, + "step": 5440 + }, + { + "epoch": 0.13970962301061912, + "grad_norm": 0.8984375, + "learning_rate": 0.00019542366350032282, + "loss": 1.1309, + "step": 5441 + }, + { + "epoch": 0.13973530020654096, + "grad_norm": 0.875, + "learning_rate": 0.00019542232837421443, + "loss": 1.2183, + "step": 5442 + }, + { + "epoch": 0.13976097740246277, + "grad_norm": 0.8359375, + "learning_rate": 0.000195420993057938, + "loss": 1.1083, + "step": 5443 + }, + { + "epoch": 0.13978665459838457, + "grad_norm": 0.81640625, + "learning_rate": 0.00019541965755149612, + "loss": 1.2822, + "step": 5444 + }, + { + "epoch": 0.1398123317943064, + "grad_norm": 0.80859375, + "learning_rate": 0.00019541832185489147, + "loss": 1.0442, + "step": 5445 + }, + { + "epoch": 0.13983800899022822, + "grad_norm": 0.8828125, + "learning_rate": 0.00019541698596812674, + "loss": 1.1899, + "step": 5446 + }, + { + "epoch": 0.13986368618615005, + "grad_norm": 0.90234375, + "learning_rate": 0.00019541564989120457, + "loss": 1.0519, + "step": 5447 + }, + { + "epoch": 0.13988936338207186, + "grad_norm": 0.82421875, + "learning_rate": 0.00019541431362412763, + "loss": 1.0387, + "step": 5448 + }, + { + "epoch": 0.13991504057799367, + "grad_norm": 0.79296875, + "learning_rate": 0.0001954129771668986, + "loss": 1.1742, + "step": 5449 + }, + { + "epoch": 0.1399407177739155, + "grad_norm": 0.8671875, + "learning_rate": 0.0001954116405195201, + "loss": 1.1443, + "step": 5450 + }, + { + "epoch": 0.13996639496983732, + "grad_norm": 0.86328125, + "learning_rate": 0.00019541030368199484, + "loss": 1.1125, + "step": 5451 + }, + { + "epoch": 0.13999207216575915, + "grad_norm": 0.83984375, + "learning_rate": 0.00019540896665432546, + "loss": 1.1144, + "step": 5452 + }, + { + "epoch": 0.14001774936168096, + "grad_norm": 0.87890625, + "learning_rate": 0.0001954076294365146, + "loss": 1.0587, + "step": 5453 + }, + { + "epoch": 0.14004342655760277, + "grad_norm": 0.84375, + "learning_rate": 0.000195406292028565, + "loss": 1.0972, + "step": 5454 + }, + { + "epoch": 0.1400691037535246, + "grad_norm": 0.95703125, + "learning_rate": 0.00019540495443047924, + "loss": 1.2129, + "step": 5455 + }, + { + "epoch": 0.1400947809494464, + "grad_norm": 0.9453125, + "learning_rate": 0.00019540361664226004, + "loss": 1.1816, + "step": 5456 + }, + { + "epoch": 0.14012045814536825, + "grad_norm": 0.9453125, + "learning_rate": 0.00019540227866391002, + "loss": 1.2356, + "step": 5457 + }, + { + "epoch": 0.14014613534129006, + "grad_norm": 0.8046875, + "learning_rate": 0.00019540094049543192, + "loss": 0.8519, + "step": 5458 + }, + { + "epoch": 0.14017181253721187, + "grad_norm": 0.82421875, + "learning_rate": 0.00019539960213682832, + "loss": 1.0375, + "step": 5459 + }, + { + "epoch": 0.1401974897331337, + "grad_norm": 0.8671875, + "learning_rate": 0.00019539826358810198, + "loss": 1.1814, + "step": 5460 + }, + { + "epoch": 0.1402231669290555, + "grad_norm": 0.81640625, + "learning_rate": 0.0001953969248492555, + "loss": 1.0826, + "step": 5461 + }, + { + "epoch": 0.14024884412497735, + "grad_norm": 1.3203125, + "learning_rate": 0.00019539558592029154, + "loss": 1.0599, + "step": 5462 + }, + { + "epoch": 0.14027452132089915, + "grad_norm": 0.8671875, + "learning_rate": 0.00019539424680121284, + "loss": 1.1228, + "step": 5463 + }, + { + "epoch": 0.14030019851682096, + "grad_norm": 0.80078125, + "learning_rate": 0.000195392907492022, + "loss": 1.0403, + "step": 5464 + }, + { + "epoch": 0.1403258757127428, + "grad_norm": 0.80078125, + "learning_rate": 0.0001953915679927217, + "loss": 1.0189, + "step": 5465 + }, + { + "epoch": 0.1403515529086646, + "grad_norm": 0.96875, + "learning_rate": 0.0001953902283033146, + "loss": 1.1393, + "step": 5466 + }, + { + "epoch": 0.14037723010458644, + "grad_norm": 0.7578125, + "learning_rate": 0.00019538888842380344, + "loss": 0.9994, + "step": 5467 + }, + { + "epoch": 0.14040290730050825, + "grad_norm": 0.83984375, + "learning_rate": 0.00019538754835419082, + "loss": 1.1218, + "step": 5468 + }, + { + "epoch": 0.14042858449643006, + "grad_norm": 0.8515625, + "learning_rate": 0.00019538620809447945, + "loss": 1.1614, + "step": 5469 + }, + { + "epoch": 0.1404542616923519, + "grad_norm": 0.828125, + "learning_rate": 0.00019538486764467195, + "loss": 0.9373, + "step": 5470 + }, + { + "epoch": 0.1404799388882737, + "grad_norm": 0.80078125, + "learning_rate": 0.00019538352700477105, + "loss": 1.0548, + "step": 5471 + }, + { + "epoch": 0.14050561608419554, + "grad_norm": 0.921875, + "learning_rate": 0.0001953821861747794, + "loss": 1.1274, + "step": 5472 + }, + { + "epoch": 0.14053129328011735, + "grad_norm": 0.765625, + "learning_rate": 0.00019538084515469966, + "loss": 1.1416, + "step": 5473 + }, + { + "epoch": 0.14055697047603916, + "grad_norm": 0.8203125, + "learning_rate": 0.0001953795039445345, + "loss": 1.2499, + "step": 5474 + }, + { + "epoch": 0.140582647671961, + "grad_norm": 0.83984375, + "learning_rate": 0.00019537816254428664, + "loss": 1.1749, + "step": 5475 + }, + { + "epoch": 0.1406083248678828, + "grad_norm": 0.78515625, + "learning_rate": 0.00019537682095395868, + "loss": 1.0407, + "step": 5476 + }, + { + "epoch": 0.1406340020638046, + "grad_norm": 0.890625, + "learning_rate": 0.0001953754791735533, + "loss": 1.0606, + "step": 5477 + }, + { + "epoch": 0.14065967925972644, + "grad_norm": 0.84375, + "learning_rate": 0.0001953741372030733, + "loss": 1.152, + "step": 5478 + }, + { + "epoch": 0.14068535645564825, + "grad_norm": 0.80859375, + "learning_rate": 0.00019537279504252118, + "loss": 1.0811, + "step": 5479 + }, + { + "epoch": 0.1407110336515701, + "grad_norm": 0.8671875, + "learning_rate": 0.00019537145269189974, + "loss": 1.0313, + "step": 5480 + }, + { + "epoch": 0.1407367108474919, + "grad_norm": 0.85546875, + "learning_rate": 0.00019537011015121158, + "loss": 1.0857, + "step": 5481 + }, + { + "epoch": 0.1407623880434137, + "grad_norm": 0.96484375, + "learning_rate": 0.00019536876742045945, + "loss": 1.0292, + "step": 5482 + }, + { + "epoch": 0.14078806523933554, + "grad_norm": 0.97265625, + "learning_rate": 0.0001953674244996459, + "loss": 1.1686, + "step": 5483 + }, + { + "epoch": 0.14081374243525735, + "grad_norm": 0.83203125, + "learning_rate": 0.00019536608138877374, + "loss": 1.1719, + "step": 5484 + }, + { + "epoch": 0.14083941963117919, + "grad_norm": 0.87890625, + "learning_rate": 0.0001953647380878456, + "loss": 1.0346, + "step": 5485 + }, + { + "epoch": 0.140865096827101, + "grad_norm": 0.8125, + "learning_rate": 0.00019536339459686416, + "loss": 1.0725, + "step": 5486 + }, + { + "epoch": 0.1408907740230228, + "grad_norm": 0.9453125, + "learning_rate": 0.00019536205091583207, + "loss": 1.3894, + "step": 5487 + }, + { + "epoch": 0.14091645121894464, + "grad_norm": 0.86328125, + "learning_rate": 0.000195360707044752, + "loss": 1.1301, + "step": 5488 + }, + { + "epoch": 0.14094212841486645, + "grad_norm": 0.87890625, + "learning_rate": 0.0001953593629836267, + "loss": 1.0653, + "step": 5489 + }, + { + "epoch": 0.14096780561078828, + "grad_norm": 0.86328125, + "learning_rate": 0.0001953580187324588, + "loss": 0.9406, + "step": 5490 + }, + { + "epoch": 0.1409934828067101, + "grad_norm": 0.8125, + "learning_rate": 0.00019535667429125095, + "loss": 1.0241, + "step": 5491 + }, + { + "epoch": 0.1410191600026319, + "grad_norm": 0.859375, + "learning_rate": 0.00019535532966000587, + "loss": 1.0028, + "step": 5492 + }, + { + "epoch": 0.14104483719855374, + "grad_norm": 0.91796875, + "learning_rate": 0.00019535398483872625, + "loss": 0.9644, + "step": 5493 + }, + { + "epoch": 0.14107051439447554, + "grad_norm": 0.81640625, + "learning_rate": 0.00019535263982741477, + "loss": 1.077, + "step": 5494 + }, + { + "epoch": 0.14109619159039738, + "grad_norm": 0.78125, + "learning_rate": 0.00019535129462607405, + "loss": 1.1183, + "step": 5495 + }, + { + "epoch": 0.1411218687863192, + "grad_norm": 1.1171875, + "learning_rate": 0.00019534994923470683, + "loss": 1.2411, + "step": 5496 + }, + { + "epoch": 0.141147545982241, + "grad_norm": 0.8359375, + "learning_rate": 0.00019534860365331578, + "loss": 1.1714, + "step": 5497 + }, + { + "epoch": 0.14117322317816283, + "grad_norm": 0.87109375, + "learning_rate": 0.00019534725788190356, + "loss": 1.1835, + "step": 5498 + }, + { + "epoch": 0.14119890037408464, + "grad_norm": 0.84375, + "learning_rate": 0.00019534591192047287, + "loss": 1.1452, + "step": 5499 + }, + { + "epoch": 0.14122457757000648, + "grad_norm": 2.46875, + "learning_rate": 0.0001953445657690264, + "loss": 1.0825, + "step": 5500 + }, + { + "epoch": 0.14125025476592828, + "grad_norm": 0.84765625, + "learning_rate": 0.0001953432194275668, + "loss": 1.1382, + "step": 5501 + }, + { + "epoch": 0.1412759319618501, + "grad_norm": 0.86328125, + "learning_rate": 0.0001953418728960968, + "loss": 1.174, + "step": 5502 + }, + { + "epoch": 0.14130160915777193, + "grad_norm": 0.88671875, + "learning_rate": 0.00019534052617461908, + "loss": 1.0755, + "step": 5503 + }, + { + "epoch": 0.14132728635369374, + "grad_norm": 0.859375, + "learning_rate": 0.00019533917926313626, + "loss": 1.1035, + "step": 5504 + }, + { + "epoch": 0.14135296354961557, + "grad_norm": 0.8359375, + "learning_rate": 0.00019533783216165107, + "loss": 1.1292, + "step": 5505 + }, + { + "epoch": 0.14137864074553738, + "grad_norm": 0.859375, + "learning_rate": 0.0001953364848701662, + "loss": 0.9693, + "step": 5506 + }, + { + "epoch": 0.1414043179414592, + "grad_norm": 0.88671875, + "learning_rate": 0.00019533513738868434, + "loss": 1.1054, + "step": 5507 + }, + { + "epoch": 0.14142999513738103, + "grad_norm": 0.8203125, + "learning_rate": 0.00019533378971720814, + "loss": 1.1826, + "step": 5508 + }, + { + "epoch": 0.14145567233330283, + "grad_norm": 0.859375, + "learning_rate": 0.00019533244185574033, + "loss": 1.0236, + "step": 5509 + }, + { + "epoch": 0.14148134952922467, + "grad_norm": 0.88671875, + "learning_rate": 0.00019533109380428357, + "loss": 1.2192, + "step": 5510 + }, + { + "epoch": 0.14150702672514648, + "grad_norm": 0.89453125, + "learning_rate": 0.00019532974556284058, + "loss": 1.2624, + "step": 5511 + }, + { + "epoch": 0.1415327039210683, + "grad_norm": 0.89453125, + "learning_rate": 0.00019532839713141397, + "loss": 1.2218, + "step": 5512 + }, + { + "epoch": 0.14155838111699012, + "grad_norm": 0.81640625, + "learning_rate": 0.0001953270485100065, + "loss": 1.0664, + "step": 5513 + }, + { + "epoch": 0.14158405831291193, + "grad_norm": 0.84765625, + "learning_rate": 0.00019532569969862084, + "loss": 1.1519, + "step": 5514 + }, + { + "epoch": 0.14160973550883377, + "grad_norm": 0.83203125, + "learning_rate": 0.00019532435069725965, + "loss": 1.1531, + "step": 5515 + }, + { + "epoch": 0.14163541270475558, + "grad_norm": 0.8203125, + "learning_rate": 0.00019532300150592566, + "loss": 1.0435, + "step": 5516 + }, + { + "epoch": 0.14166108990067738, + "grad_norm": 0.84765625, + "learning_rate": 0.0001953216521246215, + "loss": 1.0469, + "step": 5517 + }, + { + "epoch": 0.14168676709659922, + "grad_norm": 0.8828125, + "learning_rate": 0.00019532030255334993, + "loss": 1.1594, + "step": 5518 + }, + { + "epoch": 0.14171244429252103, + "grad_norm": 0.7734375, + "learning_rate": 0.00019531895279211364, + "loss": 0.9564, + "step": 5519 + }, + { + "epoch": 0.14173812148844286, + "grad_norm": 0.921875, + "learning_rate": 0.00019531760284091524, + "loss": 1.2537, + "step": 5520 + }, + { + "epoch": 0.14176379868436467, + "grad_norm": 0.83203125, + "learning_rate": 0.00019531625269975745, + "loss": 1.076, + "step": 5521 + }, + { + "epoch": 0.14178947588028648, + "grad_norm": 0.8125, + "learning_rate": 0.000195314902368643, + "loss": 1.049, + "step": 5522 + }, + { + "epoch": 0.14181515307620832, + "grad_norm": 0.81640625, + "learning_rate": 0.00019531355184757457, + "loss": 1.0817, + "step": 5523 + }, + { + "epoch": 0.14184083027213013, + "grad_norm": 0.8125, + "learning_rate": 0.00019531220113655487, + "loss": 1.2074, + "step": 5524 + }, + { + "epoch": 0.14186650746805196, + "grad_norm": 0.828125, + "learning_rate": 0.0001953108502355865, + "loss": 1.0882, + "step": 5525 + }, + { + "epoch": 0.14189218466397377, + "grad_norm": 0.75390625, + "learning_rate": 0.00019530949914467225, + "loss": 0.8971, + "step": 5526 + }, + { + "epoch": 0.14191786185989558, + "grad_norm": 0.8515625, + "learning_rate": 0.00019530814786381477, + "loss": 1.1371, + "step": 5527 + }, + { + "epoch": 0.1419435390558174, + "grad_norm": 0.8125, + "learning_rate": 0.00019530679639301676, + "loss": 1.0552, + "step": 5528 + }, + { + "epoch": 0.14196921625173922, + "grad_norm": 0.77734375, + "learning_rate": 0.00019530544473228092, + "loss": 1.0434, + "step": 5529 + }, + { + "epoch": 0.14199489344766106, + "grad_norm": 0.8203125, + "learning_rate": 0.00019530409288160994, + "loss": 1.0911, + "step": 5530 + }, + { + "epoch": 0.14202057064358287, + "grad_norm": 0.84765625, + "learning_rate": 0.0001953027408410065, + "loss": 1.1388, + "step": 5531 + }, + { + "epoch": 0.14204624783950467, + "grad_norm": 0.84765625, + "learning_rate": 0.0001953013886104733, + "loss": 1.1968, + "step": 5532 + }, + { + "epoch": 0.1420719250354265, + "grad_norm": 0.859375, + "learning_rate": 0.00019530003619001307, + "loss": 0.9786, + "step": 5533 + }, + { + "epoch": 0.14209760223134832, + "grad_norm": 0.90234375, + "learning_rate": 0.00019529868357962845, + "loss": 1.1405, + "step": 5534 + }, + { + "epoch": 0.14212327942727015, + "grad_norm": 0.84765625, + "learning_rate": 0.00019529733077932218, + "loss": 1.1069, + "step": 5535 + }, + { + "epoch": 0.14214895662319196, + "grad_norm": 0.7578125, + "learning_rate": 0.0001952959777890969, + "loss": 1.0187, + "step": 5536 + }, + { + "epoch": 0.14217463381911377, + "grad_norm": 0.82421875, + "learning_rate": 0.00019529462460895537, + "loss": 1.0191, + "step": 5537 + }, + { + "epoch": 0.1422003110150356, + "grad_norm": 0.8359375, + "learning_rate": 0.00019529327123890027, + "loss": 1.1948, + "step": 5538 + }, + { + "epoch": 0.14222598821095742, + "grad_norm": 0.87109375, + "learning_rate": 0.00019529191767893428, + "loss": 1.0144, + "step": 5539 + }, + { + "epoch": 0.14225166540687925, + "grad_norm": 0.765625, + "learning_rate": 0.00019529056392906011, + "loss": 1.2352, + "step": 5540 + }, + { + "epoch": 0.14227734260280106, + "grad_norm": 0.90234375, + "learning_rate": 0.00019528920998928046, + "loss": 1.1419, + "step": 5541 + }, + { + "epoch": 0.14230301979872287, + "grad_norm": 0.85546875, + "learning_rate": 0.000195287855859598, + "loss": 1.1513, + "step": 5542 + }, + { + "epoch": 0.1423286969946447, + "grad_norm": 0.82421875, + "learning_rate": 0.00019528650154001546, + "loss": 1.1626, + "step": 5543 + }, + { + "epoch": 0.1423543741905665, + "grad_norm": 0.875, + "learning_rate": 0.00019528514703053554, + "loss": 1.1747, + "step": 5544 + }, + { + "epoch": 0.14238005138648835, + "grad_norm": 0.828125, + "learning_rate": 0.00019528379233116092, + "loss": 1.1148, + "step": 5545 + }, + { + "epoch": 0.14240572858241016, + "grad_norm": 0.87890625, + "learning_rate": 0.0001952824374418943, + "loss": 1.1333, + "step": 5546 + }, + { + "epoch": 0.14243140577833197, + "grad_norm": 0.86328125, + "learning_rate": 0.0001952810823627384, + "loss": 1.2867, + "step": 5547 + }, + { + "epoch": 0.1424570829742538, + "grad_norm": 0.8359375, + "learning_rate": 0.0001952797270936959, + "loss": 1.0196, + "step": 5548 + }, + { + "epoch": 0.1424827601701756, + "grad_norm": 0.90625, + "learning_rate": 0.00019527837163476954, + "loss": 1.1678, + "step": 5549 + }, + { + "epoch": 0.14250843736609745, + "grad_norm": 0.87109375, + "learning_rate": 0.00019527701598596195, + "loss": 1.107, + "step": 5550 + }, + { + "epoch": 0.14253411456201925, + "grad_norm": 0.8359375, + "learning_rate": 0.00019527566014727592, + "loss": 1.2786, + "step": 5551 + }, + { + "epoch": 0.14255979175794106, + "grad_norm": 0.77734375, + "learning_rate": 0.00019527430411871407, + "loss": 1.0924, + "step": 5552 + }, + { + "epoch": 0.1425854689538629, + "grad_norm": 0.9296875, + "learning_rate": 0.00019527294790027915, + "loss": 1.1034, + "step": 5553 + }, + { + "epoch": 0.1426111461497847, + "grad_norm": 0.8359375, + "learning_rate": 0.00019527159149197384, + "loss": 1.1034, + "step": 5554 + }, + { + "epoch": 0.14263682334570654, + "grad_norm": 0.78125, + "learning_rate": 0.00019527023489380088, + "loss": 1.0599, + "step": 5555 + }, + { + "epoch": 0.14266250054162835, + "grad_norm": 0.8515625, + "learning_rate": 0.00019526887810576292, + "loss": 1.1967, + "step": 5556 + }, + { + "epoch": 0.14268817773755016, + "grad_norm": 0.93359375, + "learning_rate": 0.0001952675211278627, + "loss": 1.0405, + "step": 5557 + }, + { + "epoch": 0.142713854933472, + "grad_norm": 0.796875, + "learning_rate": 0.0001952661639601029, + "loss": 0.942, + "step": 5558 + }, + { + "epoch": 0.1427395321293938, + "grad_norm": 0.875, + "learning_rate": 0.0001952648066024863, + "loss": 1.1026, + "step": 5559 + }, + { + "epoch": 0.14276520932531564, + "grad_norm": 0.78125, + "learning_rate": 0.0001952634490550155, + "loss": 1.0713, + "step": 5560 + }, + { + "epoch": 0.14279088652123745, + "grad_norm": 0.77734375, + "learning_rate": 0.00019526209131769324, + "loss": 0.9305, + "step": 5561 + }, + { + "epoch": 0.14281656371715926, + "grad_norm": 0.92578125, + "learning_rate": 0.00019526073339052227, + "loss": 1.1161, + "step": 5562 + }, + { + "epoch": 0.1428422409130811, + "grad_norm": 0.9609375, + "learning_rate": 0.00019525937527350523, + "loss": 1.0683, + "step": 5563 + }, + { + "epoch": 0.1428679181090029, + "grad_norm": 0.93359375, + "learning_rate": 0.00019525801696664488, + "loss": 1.1693, + "step": 5564 + }, + { + "epoch": 0.14289359530492474, + "grad_norm": 0.87109375, + "learning_rate": 0.0001952566584699439, + "loss": 1.0183, + "step": 5565 + }, + { + "epoch": 0.14291927250084654, + "grad_norm": 0.796875, + "learning_rate": 0.000195255299783405, + "loss": 1.0909, + "step": 5566 + }, + { + "epoch": 0.14294494969676835, + "grad_norm": 0.9140625, + "learning_rate": 0.00019525394090703091, + "loss": 1.0479, + "step": 5567 + }, + { + "epoch": 0.1429706268926902, + "grad_norm": 0.92578125, + "learning_rate": 0.0001952525818408243, + "loss": 1.2945, + "step": 5568 + }, + { + "epoch": 0.142996304088612, + "grad_norm": 0.88671875, + "learning_rate": 0.0001952512225847879, + "loss": 1.12, + "step": 5569 + }, + { + "epoch": 0.14302198128453383, + "grad_norm": 0.8125, + "learning_rate": 0.00019524986313892444, + "loss": 1.1044, + "step": 5570 + }, + { + "epoch": 0.14304765848045564, + "grad_norm": 0.859375, + "learning_rate": 0.00019524850350323658, + "loss": 1.058, + "step": 5571 + }, + { + "epoch": 0.14307333567637745, + "grad_norm": 0.83984375, + "learning_rate": 0.00019524714367772706, + "loss": 1.0635, + "step": 5572 + }, + { + "epoch": 0.14309901287229929, + "grad_norm": 0.80859375, + "learning_rate": 0.0001952457836623986, + "loss": 1.3089, + "step": 5573 + }, + { + "epoch": 0.1431246900682211, + "grad_norm": 0.890625, + "learning_rate": 0.0001952444234572539, + "loss": 1.1019, + "step": 5574 + }, + { + "epoch": 0.14315036726414293, + "grad_norm": 0.8046875, + "learning_rate": 0.00019524306306229565, + "loss": 1.1909, + "step": 5575 + }, + { + "epoch": 0.14317604446006474, + "grad_norm": 0.828125, + "learning_rate": 0.0001952417024775266, + "loss": 1.1139, + "step": 5576 + }, + { + "epoch": 0.14320172165598655, + "grad_norm": 0.7578125, + "learning_rate": 0.0001952403417029494, + "loss": 0.9468, + "step": 5577 + }, + { + "epoch": 0.14322739885190838, + "grad_norm": 0.8046875, + "learning_rate": 0.00019523898073856683, + "loss": 1.1197, + "step": 5578 + }, + { + "epoch": 0.1432530760478302, + "grad_norm": 0.8671875, + "learning_rate": 0.0001952376195843816, + "loss": 1.116, + "step": 5579 + }, + { + "epoch": 0.14327875324375203, + "grad_norm": 0.80859375, + "learning_rate": 0.00019523625824039635, + "loss": 0.9357, + "step": 5580 + }, + { + "epoch": 0.14330443043967384, + "grad_norm": 0.82421875, + "learning_rate": 0.00019523489670661385, + "loss": 1.1744, + "step": 5581 + }, + { + "epoch": 0.14333010763559564, + "grad_norm": 0.8203125, + "learning_rate": 0.00019523353498303683, + "loss": 1.0431, + "step": 5582 + }, + { + "epoch": 0.14335578483151748, + "grad_norm": 0.8984375, + "learning_rate": 0.00019523217306966796, + "loss": 1.208, + "step": 5583 + }, + { + "epoch": 0.1433814620274393, + "grad_norm": 0.8359375, + "learning_rate": 0.00019523081096650997, + "loss": 1.1407, + "step": 5584 + }, + { + "epoch": 0.14340713922336112, + "grad_norm": 0.8203125, + "learning_rate": 0.00019522944867356556, + "loss": 1.1713, + "step": 5585 + }, + { + "epoch": 0.14343281641928293, + "grad_norm": 0.7578125, + "learning_rate": 0.0001952280861908375, + "loss": 1.0407, + "step": 5586 + }, + { + "epoch": 0.14345849361520474, + "grad_norm": 0.81640625, + "learning_rate": 0.00019522672351832845, + "loss": 1.0366, + "step": 5587 + }, + { + "epoch": 0.14348417081112658, + "grad_norm": 0.8046875, + "learning_rate": 0.0001952253606560411, + "loss": 1.0726, + "step": 5588 + }, + { + "epoch": 0.14350984800704839, + "grad_norm": 0.86328125, + "learning_rate": 0.00019522399760397828, + "loss": 1.1699, + "step": 5589 + }, + { + "epoch": 0.14353552520297022, + "grad_norm": 0.7734375, + "learning_rate": 0.0001952226343621426, + "loss": 1.0147, + "step": 5590 + }, + { + "epoch": 0.14356120239889203, + "grad_norm": 0.8046875, + "learning_rate": 0.00019522127093053683, + "loss": 1.0671, + "step": 5591 + }, + { + "epoch": 0.14358687959481384, + "grad_norm": 0.91796875, + "learning_rate": 0.00019521990730916362, + "loss": 1.0492, + "step": 5592 + }, + { + "epoch": 0.14361255679073567, + "grad_norm": 0.90234375, + "learning_rate": 0.0001952185434980258, + "loss": 1.1005, + "step": 5593 + }, + { + "epoch": 0.14363823398665748, + "grad_norm": 0.8515625, + "learning_rate": 0.00019521717949712596, + "loss": 1.1159, + "step": 5594 + }, + { + "epoch": 0.14366391118257932, + "grad_norm": 0.8125, + "learning_rate": 0.0001952158153064669, + "loss": 1.2665, + "step": 5595 + }, + { + "epoch": 0.14368958837850113, + "grad_norm": 0.84375, + "learning_rate": 0.00019521445092605133, + "loss": 1.0412, + "step": 5596 + }, + { + "epoch": 0.14371526557442293, + "grad_norm": 0.77734375, + "learning_rate": 0.00019521308635588196, + "loss": 0.845, + "step": 5597 + }, + { + "epoch": 0.14374094277034477, + "grad_norm": 0.78515625, + "learning_rate": 0.00019521172159596152, + "loss": 1.1118, + "step": 5598 + }, + { + "epoch": 0.14376661996626658, + "grad_norm": 0.8203125, + "learning_rate": 0.0001952103566462927, + "loss": 1.0553, + "step": 5599 + }, + { + "epoch": 0.14379229716218841, + "grad_norm": 0.76953125, + "learning_rate": 0.00019520899150687825, + "loss": 1.0366, + "step": 5600 + }, + { + "epoch": 0.14381797435811022, + "grad_norm": 0.85546875, + "learning_rate": 0.00019520762617772087, + "loss": 1.1655, + "step": 5601 + }, + { + "epoch": 0.14384365155403203, + "grad_norm": 0.81640625, + "learning_rate": 0.00019520626065882332, + "loss": 1.2953, + "step": 5602 + }, + { + "epoch": 0.14386932874995387, + "grad_norm": 0.78125, + "learning_rate": 0.00019520489495018827, + "loss": 1.0098, + "step": 5603 + }, + { + "epoch": 0.14389500594587568, + "grad_norm": 0.83984375, + "learning_rate": 0.00019520352905181846, + "loss": 1.0901, + "step": 5604 + }, + { + "epoch": 0.1439206831417975, + "grad_norm": 0.875, + "learning_rate": 0.00019520216296371664, + "loss": 1.1233, + "step": 5605 + }, + { + "epoch": 0.14394636033771932, + "grad_norm": 0.87890625, + "learning_rate": 0.00019520079668588548, + "loss": 1.0524, + "step": 5606 + }, + { + "epoch": 0.14397203753364113, + "grad_norm": 0.91015625, + "learning_rate": 0.00019519943021832775, + "loss": 1.2018, + "step": 5607 + }, + { + "epoch": 0.14399771472956296, + "grad_norm": 0.875, + "learning_rate": 0.00019519806356104613, + "loss": 1.0047, + "step": 5608 + }, + { + "epoch": 0.14402339192548477, + "grad_norm": 0.93359375, + "learning_rate": 0.00019519669671404338, + "loss": 1.1565, + "step": 5609 + }, + { + "epoch": 0.1440490691214066, + "grad_norm": 0.9296875, + "learning_rate": 0.0001951953296773222, + "loss": 0.8961, + "step": 5610 + }, + { + "epoch": 0.14407474631732842, + "grad_norm": 1.3671875, + "learning_rate": 0.0001951939624508854, + "loss": 1.0768, + "step": 5611 + }, + { + "epoch": 0.14410042351325023, + "grad_norm": 0.83203125, + "learning_rate": 0.00019519259503473558, + "loss": 1.1685, + "step": 5612 + }, + { + "epoch": 0.14412610070917206, + "grad_norm": 0.83984375, + "learning_rate": 0.0001951912274288755, + "loss": 1.2515, + "step": 5613 + }, + { + "epoch": 0.14415177790509387, + "grad_norm": 0.83203125, + "learning_rate": 0.0001951898596333079, + "loss": 1.0383, + "step": 5614 + }, + { + "epoch": 0.1441774551010157, + "grad_norm": 0.8125, + "learning_rate": 0.00019518849164803554, + "loss": 1.0626, + "step": 5615 + }, + { + "epoch": 0.14420313229693751, + "grad_norm": 0.9765625, + "learning_rate": 0.0001951871234730611, + "loss": 1.2082, + "step": 5616 + }, + { + "epoch": 0.14422880949285932, + "grad_norm": 0.82421875, + "learning_rate": 0.0001951857551083873, + "loss": 1.0263, + "step": 5617 + }, + { + "epoch": 0.14425448668878116, + "grad_norm": 0.83203125, + "learning_rate": 0.00019518438655401692, + "loss": 0.9861, + "step": 5618 + }, + { + "epoch": 0.14428016388470297, + "grad_norm": 0.83203125, + "learning_rate": 0.00019518301780995265, + "loss": 0.9827, + "step": 5619 + }, + { + "epoch": 0.1443058410806248, + "grad_norm": 0.828125, + "learning_rate": 0.00019518164887619724, + "loss": 1.0145, + "step": 5620 + }, + { + "epoch": 0.1443315182765466, + "grad_norm": 0.8515625, + "learning_rate": 0.00019518027975275336, + "loss": 0.9783, + "step": 5621 + }, + { + "epoch": 0.14435719547246842, + "grad_norm": 1.1640625, + "learning_rate": 0.0001951789104396238, + "loss": 1.1259, + "step": 5622 + }, + { + "epoch": 0.14438287266839026, + "grad_norm": 0.859375, + "learning_rate": 0.00019517754093681126, + "loss": 1.1222, + "step": 5623 + }, + { + "epoch": 0.14440854986431206, + "grad_norm": 0.80078125, + "learning_rate": 0.00019517617124431848, + "loss": 1.0373, + "step": 5624 + }, + { + "epoch": 0.1444342270602339, + "grad_norm": 0.7421875, + "learning_rate": 0.00019517480136214821, + "loss": 1.0217, + "step": 5625 + }, + { + "epoch": 0.1444599042561557, + "grad_norm": 0.8671875, + "learning_rate": 0.00019517343129030312, + "loss": 1.0884, + "step": 5626 + }, + { + "epoch": 0.14448558145207752, + "grad_norm": 0.8046875, + "learning_rate": 0.00019517206102878602, + "loss": 1.1342, + "step": 5627 + }, + { + "epoch": 0.14451125864799935, + "grad_norm": 0.81640625, + "learning_rate": 0.00019517069057759957, + "loss": 1.0671, + "step": 5628 + }, + { + "epoch": 0.14453693584392116, + "grad_norm": 0.80859375, + "learning_rate": 0.00019516931993674653, + "loss": 0.9796, + "step": 5629 + }, + { + "epoch": 0.144562613039843, + "grad_norm": 0.765625, + "learning_rate": 0.00019516794910622967, + "loss": 1.0974, + "step": 5630 + }, + { + "epoch": 0.1445882902357648, + "grad_norm": 0.8125, + "learning_rate": 0.00019516657808605167, + "loss": 1.0711, + "step": 5631 + }, + { + "epoch": 0.1446139674316866, + "grad_norm": 0.828125, + "learning_rate": 0.00019516520687621523, + "loss": 1.1833, + "step": 5632 + }, + { + "epoch": 0.14463964462760845, + "grad_norm": 0.93359375, + "learning_rate": 0.00019516383547672317, + "loss": 1.2363, + "step": 5633 + }, + { + "epoch": 0.14466532182353026, + "grad_norm": 0.83984375, + "learning_rate": 0.00019516246388757817, + "loss": 1.1509, + "step": 5634 + }, + { + "epoch": 0.1446909990194521, + "grad_norm": 0.7890625, + "learning_rate": 0.00019516109210878296, + "loss": 1.0785, + "step": 5635 + }, + { + "epoch": 0.1447166762153739, + "grad_norm": 0.78125, + "learning_rate": 0.00019515972014034035, + "loss": 1.0525, + "step": 5636 + }, + { + "epoch": 0.1447423534112957, + "grad_norm": 0.85546875, + "learning_rate": 0.00019515834798225298, + "loss": 1.1325, + "step": 5637 + }, + { + "epoch": 0.14476803060721755, + "grad_norm": 0.80859375, + "learning_rate": 0.0001951569756345236, + "loss": 1.0622, + "step": 5638 + }, + { + "epoch": 0.14479370780313935, + "grad_norm": 0.83984375, + "learning_rate": 0.00019515560309715495, + "loss": 1.1208, + "step": 5639 + }, + { + "epoch": 0.1448193849990612, + "grad_norm": 0.86328125, + "learning_rate": 0.0001951542303701498, + "loss": 1.0576, + "step": 5640 + }, + { + "epoch": 0.144845062194983, + "grad_norm": 0.8671875, + "learning_rate": 0.0001951528574535109, + "loss": 0.9965, + "step": 5641 + }, + { + "epoch": 0.1448707393909048, + "grad_norm": 0.83203125, + "learning_rate": 0.00019515148434724088, + "loss": 1.072, + "step": 5642 + }, + { + "epoch": 0.14489641658682664, + "grad_norm": 0.890625, + "learning_rate": 0.0001951501110513426, + "loss": 1.0219, + "step": 5643 + }, + { + "epoch": 0.14492209378274845, + "grad_norm": 0.83203125, + "learning_rate": 0.00019514873756581872, + "loss": 1.103, + "step": 5644 + }, + { + "epoch": 0.1449477709786703, + "grad_norm": 0.80859375, + "learning_rate": 0.000195147363890672, + "loss": 1.0814, + "step": 5645 + }, + { + "epoch": 0.1449734481745921, + "grad_norm": 0.7890625, + "learning_rate": 0.0001951459900259052, + "loss": 1.1072, + "step": 5646 + }, + { + "epoch": 0.1449991253705139, + "grad_norm": 0.8203125, + "learning_rate": 0.00019514461597152104, + "loss": 1.0545, + "step": 5647 + }, + { + "epoch": 0.14502480256643574, + "grad_norm": 0.80078125, + "learning_rate": 0.00019514324172752222, + "loss": 1.0316, + "step": 5648 + }, + { + "epoch": 0.14505047976235755, + "grad_norm": 0.79296875, + "learning_rate": 0.00019514186729391154, + "loss": 1.0307, + "step": 5649 + }, + { + "epoch": 0.14507615695827938, + "grad_norm": 0.80859375, + "learning_rate": 0.00019514049267069168, + "loss": 1.0068, + "step": 5650 + }, + { + "epoch": 0.1451018341542012, + "grad_norm": 0.8046875, + "learning_rate": 0.00019513911785786544, + "loss": 1.011, + "step": 5651 + }, + { + "epoch": 0.145127511350123, + "grad_norm": 0.85546875, + "learning_rate": 0.00019513774285543554, + "loss": 1.0049, + "step": 5652 + }, + { + "epoch": 0.14515318854604484, + "grad_norm": 0.8046875, + "learning_rate": 0.00019513636766340468, + "loss": 1.0593, + "step": 5653 + }, + { + "epoch": 0.14517886574196665, + "grad_norm": 0.8125, + "learning_rate": 0.00019513499228177565, + "loss": 1.0958, + "step": 5654 + }, + { + "epoch": 0.14520454293788848, + "grad_norm": 0.74609375, + "learning_rate": 0.00019513361671055117, + "loss": 1.0921, + "step": 5655 + }, + { + "epoch": 0.1452302201338103, + "grad_norm": 0.890625, + "learning_rate": 0.000195132240949734, + "loss": 1.2288, + "step": 5656 + }, + { + "epoch": 0.1452558973297321, + "grad_norm": 0.8046875, + "learning_rate": 0.00019513086499932687, + "loss": 0.9877, + "step": 5657 + }, + { + "epoch": 0.14528157452565393, + "grad_norm": 0.87890625, + "learning_rate": 0.0001951294888593325, + "loss": 1.2084, + "step": 5658 + }, + { + "epoch": 0.14530725172157574, + "grad_norm": 0.81640625, + "learning_rate": 0.00019512811252975367, + "loss": 1.1838, + "step": 5659 + }, + { + "epoch": 0.14533292891749758, + "grad_norm": 0.79296875, + "learning_rate": 0.0001951267360105931, + "loss": 0.9294, + "step": 5660 + }, + { + "epoch": 0.1453586061134194, + "grad_norm": 0.859375, + "learning_rate": 0.00019512535930185353, + "loss": 1.0799, + "step": 5661 + }, + { + "epoch": 0.1453842833093412, + "grad_norm": 0.828125, + "learning_rate": 0.0001951239824035377, + "loss": 1.0515, + "step": 5662 + }, + { + "epoch": 0.14540996050526303, + "grad_norm": 0.7890625, + "learning_rate": 0.0001951226053156484, + "loss": 1.0885, + "step": 5663 + }, + { + "epoch": 0.14543563770118484, + "grad_norm": 0.8515625, + "learning_rate": 0.0001951212280381883, + "loss": 1.0888, + "step": 5664 + }, + { + "epoch": 0.14546131489710667, + "grad_norm": 0.90234375, + "learning_rate": 0.0001951198505711602, + "loss": 1.1584, + "step": 5665 + }, + { + "epoch": 0.14548699209302848, + "grad_norm": 0.85546875, + "learning_rate": 0.00019511847291456684, + "loss": 1.0818, + "step": 5666 + }, + { + "epoch": 0.1455126692889503, + "grad_norm": 0.82421875, + "learning_rate": 0.00019511709506841093, + "loss": 0.9998, + "step": 5667 + }, + { + "epoch": 0.14553834648487213, + "grad_norm": 0.92578125, + "learning_rate": 0.00019511571703269528, + "loss": 1.0985, + "step": 5668 + }, + { + "epoch": 0.14556402368079394, + "grad_norm": 0.80859375, + "learning_rate": 0.00019511433880742258, + "loss": 0.9619, + "step": 5669 + }, + { + "epoch": 0.14558970087671577, + "grad_norm": 0.79296875, + "learning_rate": 0.00019511296039259558, + "loss": 1.1056, + "step": 5670 + }, + { + "epoch": 0.14561537807263758, + "grad_norm": 0.84375, + "learning_rate": 0.00019511158178821706, + "loss": 1.2011, + "step": 5671 + }, + { + "epoch": 0.1456410552685594, + "grad_norm": 0.96875, + "learning_rate": 0.00019511020299428972, + "loss": 1.2402, + "step": 5672 + }, + { + "epoch": 0.14566673246448122, + "grad_norm": 0.84375, + "learning_rate": 0.00019510882401081638, + "loss": 1.0843, + "step": 5673 + }, + { + "epoch": 0.14569240966040303, + "grad_norm": 0.88671875, + "learning_rate": 0.00019510744483779973, + "loss": 1.1654, + "step": 5674 + }, + { + "epoch": 0.14571808685632487, + "grad_norm": 0.828125, + "learning_rate": 0.00019510606547524254, + "loss": 1.0859, + "step": 5675 + }, + { + "epoch": 0.14574376405224668, + "grad_norm": 0.84375, + "learning_rate": 0.00019510468592314753, + "loss": 1.2175, + "step": 5676 + }, + { + "epoch": 0.14576944124816849, + "grad_norm": 0.890625, + "learning_rate": 0.00019510330618151747, + "loss": 1.0833, + "step": 5677 + }, + { + "epoch": 0.14579511844409032, + "grad_norm": 0.84765625, + "learning_rate": 0.00019510192625035513, + "loss": 1.0054, + "step": 5678 + }, + { + "epoch": 0.14582079564001213, + "grad_norm": 0.75390625, + "learning_rate": 0.00019510054612966325, + "loss": 1.0538, + "step": 5679 + }, + { + "epoch": 0.14584647283593394, + "grad_norm": 0.85546875, + "learning_rate": 0.00019509916581944453, + "loss": 1.2728, + "step": 5680 + }, + { + "epoch": 0.14587215003185577, + "grad_norm": 0.890625, + "learning_rate": 0.0001950977853197018, + "loss": 1.2422, + "step": 5681 + }, + { + "epoch": 0.14589782722777758, + "grad_norm": 0.8203125, + "learning_rate": 0.00019509640463043776, + "loss": 1.0464, + "step": 5682 + }, + { + "epoch": 0.14592350442369942, + "grad_norm": 0.87890625, + "learning_rate": 0.00019509502375165519, + "loss": 1.2628, + "step": 5683 + }, + { + "epoch": 0.14594918161962123, + "grad_norm": 0.86328125, + "learning_rate": 0.0001950936426833568, + "loss": 1.1298, + "step": 5684 + }, + { + "epoch": 0.14597485881554303, + "grad_norm": 1.2890625, + "learning_rate": 0.0001950922614255454, + "loss": 1.1637, + "step": 5685 + }, + { + "epoch": 0.14600053601146487, + "grad_norm": 0.8203125, + "learning_rate": 0.0001950908799782237, + "loss": 1.1942, + "step": 5686 + }, + { + "epoch": 0.14602621320738668, + "grad_norm": 0.8515625, + "learning_rate": 0.00019508949834139445, + "loss": 1.0772, + "step": 5687 + }, + { + "epoch": 0.14605189040330852, + "grad_norm": 0.828125, + "learning_rate": 0.00019508811651506046, + "loss": 0.9831, + "step": 5688 + }, + { + "epoch": 0.14607756759923032, + "grad_norm": 0.8515625, + "learning_rate": 0.00019508673449922438, + "loss": 1.058, + "step": 5689 + }, + { + "epoch": 0.14610324479515213, + "grad_norm": 0.7890625, + "learning_rate": 0.00019508535229388908, + "loss": 1.0737, + "step": 5690 + }, + { + "epoch": 0.14612892199107397, + "grad_norm": 0.9140625, + "learning_rate": 0.00019508396989905723, + "loss": 1.194, + "step": 5691 + }, + { + "epoch": 0.14615459918699578, + "grad_norm": 0.80859375, + "learning_rate": 0.00019508258731473167, + "loss": 1.0551, + "step": 5692 + }, + { + "epoch": 0.1461802763829176, + "grad_norm": 0.76953125, + "learning_rate": 0.00019508120454091505, + "loss": 1.1063, + "step": 5693 + }, + { + "epoch": 0.14620595357883942, + "grad_norm": 1.0078125, + "learning_rate": 0.0001950798215776102, + "loss": 1.1183, + "step": 5694 + }, + { + "epoch": 0.14623163077476123, + "grad_norm": 0.8515625, + "learning_rate": 0.00019507843842481984, + "loss": 1.1871, + "step": 5695 + }, + { + "epoch": 0.14625730797068306, + "grad_norm": 0.78125, + "learning_rate": 0.00019507705508254675, + "loss": 1.2376, + "step": 5696 + }, + { + "epoch": 0.14628298516660487, + "grad_norm": 0.80859375, + "learning_rate": 0.00019507567155079366, + "loss": 1.1538, + "step": 5697 + }, + { + "epoch": 0.1463086623625267, + "grad_norm": 0.828125, + "learning_rate": 0.00019507428782956338, + "loss": 1.0638, + "step": 5698 + }, + { + "epoch": 0.14633433955844852, + "grad_norm": 0.83984375, + "learning_rate": 0.00019507290391885862, + "loss": 1.1533, + "step": 5699 + }, + { + "epoch": 0.14636001675437033, + "grad_norm": 0.78125, + "learning_rate": 0.00019507151981868213, + "loss": 1.0632, + "step": 5700 + }, + { + "epoch": 0.14638569395029216, + "grad_norm": 0.859375, + "learning_rate": 0.0001950701355290367, + "loss": 1.1593, + "step": 5701 + }, + { + "epoch": 0.14641137114621397, + "grad_norm": 0.81640625, + "learning_rate": 0.0001950687510499251, + "loss": 0.893, + "step": 5702 + }, + { + "epoch": 0.1464370483421358, + "grad_norm": 0.8203125, + "learning_rate": 0.00019506736638135003, + "loss": 1.1319, + "step": 5703 + }, + { + "epoch": 0.14646272553805761, + "grad_norm": 0.83984375, + "learning_rate": 0.0001950659815233143, + "loss": 1.1954, + "step": 5704 + }, + { + "epoch": 0.14648840273397942, + "grad_norm": 0.7578125, + "learning_rate": 0.00019506459647582064, + "loss": 1.0523, + "step": 5705 + }, + { + "epoch": 0.14651407992990126, + "grad_norm": 0.77734375, + "learning_rate": 0.00019506321123887187, + "loss": 1.0812, + "step": 5706 + }, + { + "epoch": 0.14653975712582307, + "grad_norm": 0.8046875, + "learning_rate": 0.0001950618258124707, + "loss": 0.9249, + "step": 5707 + }, + { + "epoch": 0.1465654343217449, + "grad_norm": 0.8671875, + "learning_rate": 0.00019506044019661987, + "loss": 1.0555, + "step": 5708 + }, + { + "epoch": 0.1465911115176667, + "grad_norm": 0.8671875, + "learning_rate": 0.00019505905439132218, + "loss": 1.0876, + "step": 5709 + }, + { + "epoch": 0.14661678871358852, + "grad_norm": 0.8828125, + "learning_rate": 0.00019505766839658038, + "loss": 1.1076, + "step": 5710 + }, + { + "epoch": 0.14664246590951036, + "grad_norm": 0.76953125, + "learning_rate": 0.00019505628221239723, + "loss": 1.1632, + "step": 5711 + }, + { + "epoch": 0.14666814310543216, + "grad_norm": 0.78515625, + "learning_rate": 0.0001950548958387755, + "loss": 1.2013, + "step": 5712 + }, + { + "epoch": 0.146693820301354, + "grad_norm": 0.81640625, + "learning_rate": 0.00019505350927571791, + "loss": 1.1852, + "step": 5713 + }, + { + "epoch": 0.1467194974972758, + "grad_norm": 0.79296875, + "learning_rate": 0.00019505212252322732, + "loss": 1.1725, + "step": 5714 + }, + { + "epoch": 0.14674517469319762, + "grad_norm": 0.76953125, + "learning_rate": 0.00019505073558130645, + "loss": 0.9762, + "step": 5715 + }, + { + "epoch": 0.14677085188911945, + "grad_norm": 0.8359375, + "learning_rate": 0.000195049348449958, + "loss": 0.9923, + "step": 5716 + }, + { + "epoch": 0.14679652908504126, + "grad_norm": 0.80078125, + "learning_rate": 0.0001950479611291848, + "loss": 1.1498, + "step": 5717 + }, + { + "epoch": 0.1468222062809631, + "grad_norm": 0.82421875, + "learning_rate": 0.00019504657361898962, + "loss": 1.082, + "step": 5718 + }, + { + "epoch": 0.1468478834768849, + "grad_norm": 0.87109375, + "learning_rate": 0.00019504518591937514, + "loss": 1.0857, + "step": 5719 + }, + { + "epoch": 0.1468735606728067, + "grad_norm": 0.828125, + "learning_rate": 0.00019504379803034425, + "loss": 1.0475, + "step": 5720 + }, + { + "epoch": 0.14689923786872855, + "grad_norm": 0.890625, + "learning_rate": 0.00019504240995189966, + "loss": 1.2142, + "step": 5721 + }, + { + "epoch": 0.14692491506465036, + "grad_norm": 0.890625, + "learning_rate": 0.0001950410216840441, + "loss": 1.1217, + "step": 5722 + }, + { + "epoch": 0.1469505922605722, + "grad_norm": 0.90625, + "learning_rate": 0.00019503963322678038, + "loss": 1.2274, + "step": 5723 + }, + { + "epoch": 0.146976269456494, + "grad_norm": 1.0390625, + "learning_rate": 0.00019503824458011126, + "loss": 1.1303, + "step": 5724 + }, + { + "epoch": 0.1470019466524158, + "grad_norm": 0.8359375, + "learning_rate": 0.00019503685574403948, + "loss": 1.096, + "step": 5725 + }, + { + "epoch": 0.14702762384833765, + "grad_norm": 0.85546875, + "learning_rate": 0.00019503546671856788, + "loss": 0.9993, + "step": 5726 + }, + { + "epoch": 0.14705330104425945, + "grad_norm": 0.87109375, + "learning_rate": 0.00019503407750369914, + "loss": 1.0674, + "step": 5727 + }, + { + "epoch": 0.1470789782401813, + "grad_norm": 0.80078125, + "learning_rate": 0.00019503268809943606, + "loss": 1.059, + "step": 5728 + }, + { + "epoch": 0.1471046554361031, + "grad_norm": 0.96875, + "learning_rate": 0.00019503129850578145, + "loss": 1.25, + "step": 5729 + }, + { + "epoch": 0.1471303326320249, + "grad_norm": 0.8359375, + "learning_rate": 0.00019502990872273803, + "loss": 1.0524, + "step": 5730 + }, + { + "epoch": 0.14715600982794674, + "grad_norm": 0.76953125, + "learning_rate": 0.0001950285187503086, + "loss": 1.0442, + "step": 5731 + }, + { + "epoch": 0.14718168702386855, + "grad_norm": 0.9140625, + "learning_rate": 0.0001950271285884959, + "loss": 1.125, + "step": 5732 + }, + { + "epoch": 0.1472073642197904, + "grad_norm": 0.828125, + "learning_rate": 0.00019502573823730273, + "loss": 1.2309, + "step": 5733 + }, + { + "epoch": 0.1472330414157122, + "grad_norm": 0.8515625, + "learning_rate": 0.00019502434769673183, + "loss": 1.0122, + "step": 5734 + }, + { + "epoch": 0.147258718611634, + "grad_norm": 0.84375, + "learning_rate": 0.000195022956966786, + "loss": 1.0008, + "step": 5735 + }, + { + "epoch": 0.14728439580755584, + "grad_norm": 0.80078125, + "learning_rate": 0.000195021566047468, + "loss": 1.0745, + "step": 5736 + }, + { + "epoch": 0.14731007300347765, + "grad_norm": 0.828125, + "learning_rate": 0.0001950201749387806, + "loss": 1.0711, + "step": 5737 + }, + { + "epoch": 0.14733575019939948, + "grad_norm": 1.34375, + "learning_rate": 0.00019501878364072656, + "loss": 1.0719, + "step": 5738 + }, + { + "epoch": 0.1473614273953213, + "grad_norm": 0.7578125, + "learning_rate": 0.00019501739215330868, + "loss": 1.089, + "step": 5739 + }, + { + "epoch": 0.1473871045912431, + "grad_norm": 0.83984375, + "learning_rate": 0.00019501600047652974, + "loss": 1.1924, + "step": 5740 + }, + { + "epoch": 0.14741278178716494, + "grad_norm": 0.82421875, + "learning_rate": 0.00019501460861039247, + "loss": 1.0791, + "step": 5741 + }, + { + "epoch": 0.14743845898308675, + "grad_norm": 0.82421875, + "learning_rate": 0.00019501321655489965, + "loss": 1.2021, + "step": 5742 + }, + { + "epoch": 0.14746413617900858, + "grad_norm": 0.7890625, + "learning_rate": 0.0001950118243100541, + "loss": 1.0447, + "step": 5743 + }, + { + "epoch": 0.1474898133749304, + "grad_norm": 0.8046875, + "learning_rate": 0.00019501043187585858, + "loss": 1.0357, + "step": 5744 + }, + { + "epoch": 0.1475154905708522, + "grad_norm": 0.79296875, + "learning_rate": 0.0001950090392523158, + "loss": 1.0568, + "step": 5745 + }, + { + "epoch": 0.14754116776677403, + "grad_norm": 0.83203125, + "learning_rate": 0.00019500764643942865, + "loss": 1.0999, + "step": 5746 + }, + { + "epoch": 0.14756684496269584, + "grad_norm": 0.75, + "learning_rate": 0.00019500625343719978, + "loss": 1.0064, + "step": 5747 + }, + { + "epoch": 0.14759252215861768, + "grad_norm": 0.89453125, + "learning_rate": 0.00019500486024563206, + "loss": 1.1481, + "step": 5748 + }, + { + "epoch": 0.1476181993545395, + "grad_norm": 0.80859375, + "learning_rate": 0.00019500346686472826, + "loss": 0.9884, + "step": 5749 + }, + { + "epoch": 0.1476438765504613, + "grad_norm": 0.84765625, + "learning_rate": 0.00019500207329449108, + "loss": 1.1011, + "step": 5750 + }, + { + "epoch": 0.14766955374638313, + "grad_norm": 0.8984375, + "learning_rate": 0.0001950006795349234, + "loss": 1.1909, + "step": 5751 + }, + { + "epoch": 0.14769523094230494, + "grad_norm": 0.81640625, + "learning_rate": 0.00019499928558602792, + "loss": 1.2041, + "step": 5752 + }, + { + "epoch": 0.14772090813822678, + "grad_norm": 0.8515625, + "learning_rate": 0.00019499789144780745, + "loss": 1.1551, + "step": 5753 + }, + { + "epoch": 0.14774658533414858, + "grad_norm": 0.83984375, + "learning_rate": 0.00019499649712026478, + "loss": 1.0723, + "step": 5754 + }, + { + "epoch": 0.1477722625300704, + "grad_norm": 0.84765625, + "learning_rate": 0.00019499510260340263, + "loss": 1.1008, + "step": 5755 + }, + { + "epoch": 0.14779793972599223, + "grad_norm": 0.7890625, + "learning_rate": 0.00019499370789722385, + "loss": 1.0906, + "step": 5756 + }, + { + "epoch": 0.14782361692191404, + "grad_norm": 0.9375, + "learning_rate": 0.0001949923130017312, + "loss": 1.0534, + "step": 5757 + }, + { + "epoch": 0.14784929411783587, + "grad_norm": 0.80078125, + "learning_rate": 0.00019499091791692742, + "loss": 1.0691, + "step": 5758 + }, + { + "epoch": 0.14787497131375768, + "grad_norm": 0.796875, + "learning_rate": 0.00019498952264281532, + "loss": 1.0925, + "step": 5759 + }, + { + "epoch": 0.1479006485096795, + "grad_norm": 0.82421875, + "learning_rate": 0.0001949881271793977, + "loss": 1.0203, + "step": 5760 + }, + { + "epoch": 0.14792632570560132, + "grad_norm": 0.890625, + "learning_rate": 0.00019498673152667733, + "loss": 1.0748, + "step": 5761 + }, + { + "epoch": 0.14795200290152313, + "grad_norm": 0.859375, + "learning_rate": 0.000194985335684657, + "loss": 1.1232, + "step": 5762 + }, + { + "epoch": 0.14797768009744497, + "grad_norm": 0.8203125, + "learning_rate": 0.00019498393965333946, + "loss": 1.1366, + "step": 5763 + }, + { + "epoch": 0.14800335729336678, + "grad_norm": 0.87109375, + "learning_rate": 0.00019498254343272749, + "loss": 1.0376, + "step": 5764 + }, + { + "epoch": 0.14802903448928859, + "grad_norm": 0.75, + "learning_rate": 0.0001949811470228239, + "loss": 1.042, + "step": 5765 + }, + { + "epoch": 0.14805471168521042, + "grad_norm": 0.78515625, + "learning_rate": 0.00019497975042363148, + "loss": 1.1531, + "step": 5766 + }, + { + "epoch": 0.14808038888113223, + "grad_norm": 0.8046875, + "learning_rate": 0.00019497835363515298, + "loss": 1.0636, + "step": 5767 + }, + { + "epoch": 0.14810606607705407, + "grad_norm": 0.859375, + "learning_rate": 0.00019497695665739121, + "loss": 1.0666, + "step": 5768 + }, + { + "epoch": 0.14813174327297587, + "grad_norm": 0.78125, + "learning_rate": 0.00019497555949034894, + "loss": 1.0637, + "step": 5769 + }, + { + "epoch": 0.14815742046889768, + "grad_norm": 0.765625, + "learning_rate": 0.00019497416213402896, + "loss": 1.101, + "step": 5770 + }, + { + "epoch": 0.14818309766481952, + "grad_norm": 0.83203125, + "learning_rate": 0.00019497276458843406, + "loss": 1.0145, + "step": 5771 + }, + { + "epoch": 0.14820877486074133, + "grad_norm": 0.92578125, + "learning_rate": 0.000194971366853567, + "loss": 0.9899, + "step": 5772 + }, + { + "epoch": 0.14823445205666316, + "grad_norm": 0.79296875, + "learning_rate": 0.0001949699689294306, + "loss": 0.9206, + "step": 5773 + }, + { + "epoch": 0.14826012925258497, + "grad_norm": 0.875, + "learning_rate": 0.0001949685708160276, + "loss": 1.0852, + "step": 5774 + }, + { + "epoch": 0.14828580644850678, + "grad_norm": 0.8203125, + "learning_rate": 0.00019496717251336085, + "loss": 1.1082, + "step": 5775 + }, + { + "epoch": 0.14831148364442862, + "grad_norm": 0.8125, + "learning_rate": 0.0001949657740214331, + "loss": 1.1009, + "step": 5776 + }, + { + "epoch": 0.14833716084035042, + "grad_norm": 0.8203125, + "learning_rate": 0.0001949643753402471, + "loss": 1.1027, + "step": 5777 + }, + { + "epoch": 0.14836283803627226, + "grad_norm": 0.7421875, + "learning_rate": 0.00019496297646980575, + "loss": 0.974, + "step": 5778 + }, + { + "epoch": 0.14838851523219407, + "grad_norm": 0.7578125, + "learning_rate": 0.00019496157741011173, + "loss": 1.0451, + "step": 5779 + }, + { + "epoch": 0.14841419242811588, + "grad_norm": 0.8046875, + "learning_rate": 0.00019496017816116786, + "loss": 1.0561, + "step": 5780 + }, + { + "epoch": 0.1484398696240377, + "grad_norm": 0.828125, + "learning_rate": 0.00019495877872297693, + "loss": 0.9441, + "step": 5781 + }, + { + "epoch": 0.14846554681995952, + "grad_norm": 0.80078125, + "learning_rate": 0.00019495737909554174, + "loss": 0.9701, + "step": 5782 + }, + { + "epoch": 0.14849122401588136, + "grad_norm": 0.84765625, + "learning_rate": 0.00019495597927886508, + "loss": 1.0793, + "step": 5783 + }, + { + "epoch": 0.14851690121180317, + "grad_norm": 0.91015625, + "learning_rate": 0.00019495457927294973, + "loss": 1.214, + "step": 5784 + }, + { + "epoch": 0.14854257840772497, + "grad_norm": 0.91796875, + "learning_rate": 0.00019495317907779845, + "loss": 1.213, + "step": 5785 + }, + { + "epoch": 0.1485682556036468, + "grad_norm": 0.8125, + "learning_rate": 0.00019495177869341407, + "loss": 1.0544, + "step": 5786 + }, + { + "epoch": 0.14859393279956862, + "grad_norm": 0.8125, + "learning_rate": 0.00019495037811979938, + "loss": 1.0301, + "step": 5787 + }, + { + "epoch": 0.14861960999549045, + "grad_norm": 0.8203125, + "learning_rate": 0.00019494897735695718, + "loss": 1.172, + "step": 5788 + }, + { + "epoch": 0.14864528719141226, + "grad_norm": 0.7421875, + "learning_rate": 0.00019494757640489026, + "loss": 0.943, + "step": 5789 + }, + { + "epoch": 0.14867096438733407, + "grad_norm": 0.8203125, + "learning_rate": 0.00019494617526360136, + "loss": 1.0238, + "step": 5790 + }, + { + "epoch": 0.1486966415832559, + "grad_norm": 0.86328125, + "learning_rate": 0.00019494477393309332, + "loss": 1.2207, + "step": 5791 + }, + { + "epoch": 0.14872231877917771, + "grad_norm": 0.8515625, + "learning_rate": 0.0001949433724133689, + "loss": 1.0595, + "step": 5792 + }, + { + "epoch": 0.14874799597509955, + "grad_norm": 0.83203125, + "learning_rate": 0.00019494197070443094, + "loss": 1.0757, + "step": 5793 + }, + { + "epoch": 0.14877367317102136, + "grad_norm": 0.89453125, + "learning_rate": 0.00019494056880628219, + "loss": 1.1335, + "step": 5794 + }, + { + "epoch": 0.14879935036694317, + "grad_norm": 1.15625, + "learning_rate": 0.0001949391667189255, + "loss": 1.0925, + "step": 5795 + }, + { + "epoch": 0.148825027562865, + "grad_norm": 0.859375, + "learning_rate": 0.0001949377644423636, + "loss": 1.0902, + "step": 5796 + }, + { + "epoch": 0.1488507047587868, + "grad_norm": 0.89453125, + "learning_rate": 0.00019493636197659932, + "loss": 1.2355, + "step": 5797 + }, + { + "epoch": 0.14887638195470865, + "grad_norm": 0.8125, + "learning_rate": 0.0001949349593216354, + "loss": 1.1798, + "step": 5798 + }, + { + "epoch": 0.14890205915063046, + "grad_norm": 0.796875, + "learning_rate": 0.00019493355647747475, + "loss": 1.0503, + "step": 5799 + }, + { + "epoch": 0.14892773634655226, + "grad_norm": 0.79296875, + "learning_rate": 0.00019493215344412005, + "loss": 1.0008, + "step": 5800 + }, + { + "epoch": 0.1489534135424741, + "grad_norm": 0.8671875, + "learning_rate": 0.00019493075022157416, + "loss": 1.089, + "step": 5801 + }, + { + "epoch": 0.1489790907383959, + "grad_norm": 0.84765625, + "learning_rate": 0.00019492934680983987, + "loss": 1.0759, + "step": 5802 + }, + { + "epoch": 0.14900476793431774, + "grad_norm": 0.81640625, + "learning_rate": 0.00019492794320891997, + "loss": 1.053, + "step": 5803 + }, + { + "epoch": 0.14903044513023955, + "grad_norm": 0.796875, + "learning_rate": 0.00019492653941881725, + "loss": 1.085, + "step": 5804 + }, + { + "epoch": 0.14905612232616136, + "grad_norm": 0.8203125, + "learning_rate": 0.0001949251354395345, + "loss": 1.0976, + "step": 5805 + }, + { + "epoch": 0.1490817995220832, + "grad_norm": 0.8203125, + "learning_rate": 0.00019492373127107454, + "loss": 1.128, + "step": 5806 + }, + { + "epoch": 0.149107476718005, + "grad_norm": 0.84375, + "learning_rate": 0.00019492232691344015, + "loss": 1.1151, + "step": 5807 + }, + { + "epoch": 0.14913315391392684, + "grad_norm": 0.8203125, + "learning_rate": 0.00019492092236663412, + "loss": 1.0438, + "step": 5808 + }, + { + "epoch": 0.14915883110984865, + "grad_norm": 0.8046875, + "learning_rate": 0.0001949195176306593, + "loss": 1.0795, + "step": 5809 + }, + { + "epoch": 0.14918450830577046, + "grad_norm": 0.8203125, + "learning_rate": 0.00019491811270551843, + "loss": 1.1134, + "step": 5810 + }, + { + "epoch": 0.1492101855016923, + "grad_norm": 0.890625, + "learning_rate": 0.00019491670759121433, + "loss": 1.0996, + "step": 5811 + }, + { + "epoch": 0.1492358626976141, + "grad_norm": 0.89453125, + "learning_rate": 0.00019491530228774983, + "loss": 1.1189, + "step": 5812 + }, + { + "epoch": 0.14926153989353594, + "grad_norm": 0.76953125, + "learning_rate": 0.0001949138967951277, + "loss": 0.9576, + "step": 5813 + }, + { + "epoch": 0.14928721708945775, + "grad_norm": 0.765625, + "learning_rate": 0.00019491249111335075, + "loss": 1.0389, + "step": 5814 + }, + { + "epoch": 0.14931289428537955, + "grad_norm": 0.80859375, + "learning_rate": 0.00019491108524242177, + "loss": 1.0922, + "step": 5815 + }, + { + "epoch": 0.1493385714813014, + "grad_norm": 0.86328125, + "learning_rate": 0.00019490967918234359, + "loss": 1.0053, + "step": 5816 + }, + { + "epoch": 0.1493642486772232, + "grad_norm": 0.8828125, + "learning_rate": 0.00019490827293311896, + "loss": 1.2209, + "step": 5817 + }, + { + "epoch": 0.14938992587314504, + "grad_norm": 0.8125, + "learning_rate": 0.00019490686649475074, + "loss": 1.0546, + "step": 5818 + }, + { + "epoch": 0.14941560306906684, + "grad_norm": 0.80859375, + "learning_rate": 0.0001949054598672417, + "loss": 1.1419, + "step": 5819 + }, + { + "epoch": 0.14944128026498865, + "grad_norm": 0.87109375, + "learning_rate": 0.00019490405305059465, + "loss": 1.103, + "step": 5820 + }, + { + "epoch": 0.1494669574609105, + "grad_norm": 0.84765625, + "learning_rate": 0.00019490264604481237, + "loss": 1.2194, + "step": 5821 + }, + { + "epoch": 0.1494926346568323, + "grad_norm": 0.88671875, + "learning_rate": 0.00019490123884989772, + "loss": 1.2272, + "step": 5822 + }, + { + "epoch": 0.14951831185275413, + "grad_norm": 0.828125, + "learning_rate": 0.00019489983146585348, + "loss": 1.0524, + "step": 5823 + }, + { + "epoch": 0.14954398904867594, + "grad_norm": 0.9453125, + "learning_rate": 0.00019489842389268242, + "loss": 1.1712, + "step": 5824 + }, + { + "epoch": 0.14956966624459775, + "grad_norm": 0.83203125, + "learning_rate": 0.0001948970161303874, + "loss": 1.1924, + "step": 5825 + }, + { + "epoch": 0.14959534344051958, + "grad_norm": 0.828125, + "learning_rate": 0.00019489560817897118, + "loss": 1.1005, + "step": 5826 + }, + { + "epoch": 0.1496210206364414, + "grad_norm": 0.984375, + "learning_rate": 0.0001948942000384366, + "loss": 1.1643, + "step": 5827 + }, + { + "epoch": 0.14964669783236323, + "grad_norm": 0.81640625, + "learning_rate": 0.00019489279170878642, + "loss": 0.9992, + "step": 5828 + }, + { + "epoch": 0.14967237502828504, + "grad_norm": 0.83203125, + "learning_rate": 0.0001948913831900235, + "loss": 1.1019, + "step": 5829 + }, + { + "epoch": 0.14969805222420685, + "grad_norm": 0.82421875, + "learning_rate": 0.00019488997448215064, + "loss": 0.9314, + "step": 5830 + }, + { + "epoch": 0.14972372942012868, + "grad_norm": 1.3515625, + "learning_rate": 0.00019488856558517062, + "loss": 1.2072, + "step": 5831 + }, + { + "epoch": 0.1497494066160505, + "grad_norm": 0.91796875, + "learning_rate": 0.00019488715649908627, + "loss": 1.1818, + "step": 5832 + }, + { + "epoch": 0.14977508381197233, + "grad_norm": 0.890625, + "learning_rate": 0.00019488574722390035, + "loss": 1.1798, + "step": 5833 + }, + { + "epoch": 0.14980076100789413, + "grad_norm": 0.8125, + "learning_rate": 0.00019488433775961575, + "loss": 1.0033, + "step": 5834 + }, + { + "epoch": 0.14982643820381594, + "grad_norm": 0.79296875, + "learning_rate": 0.0001948829281062352, + "loss": 1.0686, + "step": 5835 + }, + { + "epoch": 0.14985211539973778, + "grad_norm": 0.7890625, + "learning_rate": 0.00019488151826376157, + "loss": 1.222, + "step": 5836 + }, + { + "epoch": 0.1498777925956596, + "grad_norm": 0.87109375, + "learning_rate": 0.00019488010823219765, + "loss": 1.2059, + "step": 5837 + }, + { + "epoch": 0.14990346979158142, + "grad_norm": 0.73828125, + "learning_rate": 0.00019487869801154624, + "loss": 0.9916, + "step": 5838 + }, + { + "epoch": 0.14992914698750323, + "grad_norm": 0.82421875, + "learning_rate": 0.00019487728760181014, + "loss": 1.08, + "step": 5839 + }, + { + "epoch": 0.14995482418342504, + "grad_norm": 0.9609375, + "learning_rate": 0.00019487587700299218, + "loss": 1.225, + "step": 5840 + }, + { + "epoch": 0.14998050137934688, + "grad_norm": 0.8828125, + "learning_rate": 0.00019487446621509517, + "loss": 1.2521, + "step": 5841 + }, + { + "epoch": 0.15000617857526868, + "grad_norm": 0.80859375, + "learning_rate": 0.00019487305523812194, + "loss": 1.1042, + "step": 5842 + }, + { + "epoch": 0.15003185577119052, + "grad_norm": 0.76953125, + "learning_rate": 0.00019487164407207527, + "loss": 0.9846, + "step": 5843 + }, + { + "epoch": 0.15005753296711233, + "grad_norm": 0.79296875, + "learning_rate": 0.00019487023271695794, + "loss": 1.1247, + "step": 5844 + }, + { + "epoch": 0.15008321016303414, + "grad_norm": 0.78515625, + "learning_rate": 0.00019486882117277285, + "loss": 1.0209, + "step": 5845 + }, + { + "epoch": 0.15010888735895597, + "grad_norm": 1.46875, + "learning_rate": 0.00019486740943952275, + "loss": 1.1093, + "step": 5846 + }, + { + "epoch": 0.15013456455487778, + "grad_norm": 0.8359375, + "learning_rate": 0.00019486599751721047, + "loss": 1.0277, + "step": 5847 + }, + { + "epoch": 0.15016024175079962, + "grad_norm": 0.76953125, + "learning_rate": 0.00019486458540583885, + "loss": 1.0026, + "step": 5848 + }, + { + "epoch": 0.15018591894672143, + "grad_norm": 0.84375, + "learning_rate": 0.00019486317310541068, + "loss": 1.072, + "step": 5849 + }, + { + "epoch": 0.15021159614264323, + "grad_norm": 0.796875, + "learning_rate": 0.00019486176061592875, + "loss": 1.135, + "step": 5850 + }, + { + "epoch": 0.15023727333856507, + "grad_norm": 0.83203125, + "learning_rate": 0.00019486034793739592, + "loss": 1.0572, + "step": 5851 + }, + { + "epoch": 0.15026295053448688, + "grad_norm": 0.828125, + "learning_rate": 0.000194858935069815, + "loss": 1.0883, + "step": 5852 + }, + { + "epoch": 0.1502886277304087, + "grad_norm": 0.80859375, + "learning_rate": 0.00019485752201318875, + "loss": 1.0477, + "step": 5853 + }, + { + "epoch": 0.15031430492633052, + "grad_norm": 0.85546875, + "learning_rate": 0.00019485610876752003, + "loss": 1.0004, + "step": 5854 + }, + { + "epoch": 0.15033998212225233, + "grad_norm": 0.8359375, + "learning_rate": 0.00019485469533281168, + "loss": 1.0961, + "step": 5855 + }, + { + "epoch": 0.15036565931817417, + "grad_norm": 0.8828125, + "learning_rate": 0.00019485328170906646, + "loss": 1.0497, + "step": 5856 + }, + { + "epoch": 0.15039133651409597, + "grad_norm": 0.81640625, + "learning_rate": 0.00019485186789628726, + "loss": 1.2042, + "step": 5857 + }, + { + "epoch": 0.1504170137100178, + "grad_norm": 0.875, + "learning_rate": 0.00019485045389447682, + "loss": 1.0901, + "step": 5858 + }, + { + "epoch": 0.15044269090593962, + "grad_norm": 0.8671875, + "learning_rate": 0.000194849039703638, + "loss": 1.064, + "step": 5859 + }, + { + "epoch": 0.15046836810186143, + "grad_norm": 0.8515625, + "learning_rate": 0.00019484762532377364, + "loss": 1.2158, + "step": 5860 + }, + { + "epoch": 0.15049404529778326, + "grad_norm": 0.80078125, + "learning_rate": 0.0001948462107548865, + "loss": 1.0467, + "step": 5861 + }, + { + "epoch": 0.15051972249370507, + "grad_norm": 0.79296875, + "learning_rate": 0.0001948447959969794, + "loss": 0.9769, + "step": 5862 + }, + { + "epoch": 0.1505453996896269, + "grad_norm": 0.87109375, + "learning_rate": 0.00019484338105005527, + "loss": 1.0628, + "step": 5863 + }, + { + "epoch": 0.15057107688554872, + "grad_norm": 0.8203125, + "learning_rate": 0.0001948419659141168, + "loss": 0.8864, + "step": 5864 + }, + { + "epoch": 0.15059675408147052, + "grad_norm": 0.8671875, + "learning_rate": 0.00019484055058916687, + "loss": 0.9558, + "step": 5865 + }, + { + "epoch": 0.15062243127739236, + "grad_norm": 0.8125, + "learning_rate": 0.00019483913507520827, + "loss": 1.0137, + "step": 5866 + }, + { + "epoch": 0.15064810847331417, + "grad_norm": 0.8125, + "learning_rate": 0.00019483771937224385, + "loss": 1.0777, + "step": 5867 + }, + { + "epoch": 0.150673785669236, + "grad_norm": 0.76953125, + "learning_rate": 0.00019483630348027643, + "loss": 1.021, + "step": 5868 + }, + { + "epoch": 0.1506994628651578, + "grad_norm": 0.89453125, + "learning_rate": 0.0001948348873993088, + "loss": 1.1762, + "step": 5869 + }, + { + "epoch": 0.15072514006107962, + "grad_norm": 0.78515625, + "learning_rate": 0.00019483347112934384, + "loss": 1.0083, + "step": 5870 + }, + { + "epoch": 0.15075081725700146, + "grad_norm": 0.73828125, + "learning_rate": 0.00019483205467038436, + "loss": 0.975, + "step": 5871 + }, + { + "epoch": 0.15077649445292327, + "grad_norm": 0.86328125, + "learning_rate": 0.0001948306380224331, + "loss": 1.1614, + "step": 5872 + }, + { + "epoch": 0.1508021716488451, + "grad_norm": 0.84375, + "learning_rate": 0.00019482922118549295, + "loss": 0.9835, + "step": 5873 + }, + { + "epoch": 0.1508278488447669, + "grad_norm": 0.81640625, + "learning_rate": 0.00019482780415956677, + "loss": 1.0859, + "step": 5874 + }, + { + "epoch": 0.15085352604068872, + "grad_norm": 0.796875, + "learning_rate": 0.0001948263869446573, + "loss": 1.06, + "step": 5875 + }, + { + "epoch": 0.15087920323661055, + "grad_norm": 0.75390625, + "learning_rate": 0.00019482496954076743, + "loss": 1.0264, + "step": 5876 + }, + { + "epoch": 0.15090488043253236, + "grad_norm": 0.8359375, + "learning_rate": 0.00019482355194789996, + "loss": 1.0576, + "step": 5877 + }, + { + "epoch": 0.1509305576284542, + "grad_norm": 0.84375, + "learning_rate": 0.00019482213416605773, + "loss": 1.2093, + "step": 5878 + }, + { + "epoch": 0.150956234824376, + "grad_norm": 0.7421875, + "learning_rate": 0.0001948207161952435, + "loss": 1.0242, + "step": 5879 + }, + { + "epoch": 0.15098191202029781, + "grad_norm": 0.87890625, + "learning_rate": 0.0001948192980354602, + "loss": 1.1742, + "step": 5880 + }, + { + "epoch": 0.15100758921621965, + "grad_norm": 0.859375, + "learning_rate": 0.0001948178796867106, + "loss": 1.1378, + "step": 5881 + }, + { + "epoch": 0.15103326641214146, + "grad_norm": 0.8984375, + "learning_rate": 0.0001948164611489975, + "loss": 1.201, + "step": 5882 + }, + { + "epoch": 0.1510589436080633, + "grad_norm": 0.8125, + "learning_rate": 0.0001948150424223238, + "loss": 1.3102, + "step": 5883 + }, + { + "epoch": 0.1510846208039851, + "grad_norm": 0.8203125, + "learning_rate": 0.00019481362350669225, + "loss": 1.1804, + "step": 5884 + }, + { + "epoch": 0.1511102979999069, + "grad_norm": 0.875, + "learning_rate": 0.00019481220440210572, + "loss": 1.3638, + "step": 5885 + }, + { + "epoch": 0.15113597519582875, + "grad_norm": 0.85546875, + "learning_rate": 0.00019481078510856704, + "loss": 1.0594, + "step": 5886 + }, + { + "epoch": 0.15116165239175056, + "grad_norm": 0.80078125, + "learning_rate": 0.00019480936562607903, + "loss": 1.0398, + "step": 5887 + }, + { + "epoch": 0.15118732958767236, + "grad_norm": 0.83203125, + "learning_rate": 0.0001948079459546445, + "loss": 1.1567, + "step": 5888 + }, + { + "epoch": 0.1512130067835942, + "grad_norm": 0.8359375, + "learning_rate": 0.0001948065260942663, + "loss": 1.1217, + "step": 5889 + }, + { + "epoch": 0.151238683979516, + "grad_norm": 0.8046875, + "learning_rate": 0.00019480510604494725, + "loss": 0.9918, + "step": 5890 + }, + { + "epoch": 0.15126436117543784, + "grad_norm": 0.9609375, + "learning_rate": 0.00019480368580669023, + "loss": 1.0968, + "step": 5891 + }, + { + "epoch": 0.15129003837135965, + "grad_norm": 0.8125, + "learning_rate": 0.000194802265379498, + "loss": 1.0929, + "step": 5892 + }, + { + "epoch": 0.15131571556728146, + "grad_norm": 0.84375, + "learning_rate": 0.00019480084476337345, + "loss": 1.0799, + "step": 5893 + }, + { + "epoch": 0.1513413927632033, + "grad_norm": 0.76171875, + "learning_rate": 0.00019479942395831933, + "loss": 0.8989, + "step": 5894 + }, + { + "epoch": 0.1513670699591251, + "grad_norm": 0.76953125, + "learning_rate": 0.00019479800296433853, + "loss": 1.1352, + "step": 5895 + }, + { + "epoch": 0.15139274715504694, + "grad_norm": 0.8671875, + "learning_rate": 0.00019479658178143388, + "loss": 1.1677, + "step": 5896 + }, + { + "epoch": 0.15141842435096875, + "grad_norm": 0.83203125, + "learning_rate": 0.00019479516040960823, + "loss": 1.0731, + "step": 5897 + }, + { + "epoch": 0.15144410154689056, + "grad_norm": 0.875, + "learning_rate": 0.00019479373884886435, + "loss": 0.9052, + "step": 5898 + }, + { + "epoch": 0.1514697787428124, + "grad_norm": 0.75390625, + "learning_rate": 0.00019479231709920513, + "loss": 0.9209, + "step": 5899 + }, + { + "epoch": 0.1514954559387342, + "grad_norm": 0.8515625, + "learning_rate": 0.0001947908951606334, + "loss": 1.1112, + "step": 5900 + }, + { + "epoch": 0.15152113313465604, + "grad_norm": 0.890625, + "learning_rate": 0.00019478947303315196, + "loss": 1.2, + "step": 5901 + }, + { + "epoch": 0.15154681033057785, + "grad_norm": 0.75390625, + "learning_rate": 0.00019478805071676365, + "loss": 1.0994, + "step": 5902 + }, + { + "epoch": 0.15157248752649966, + "grad_norm": 0.86328125, + "learning_rate": 0.00019478662821147133, + "loss": 1.1451, + "step": 5903 + }, + { + "epoch": 0.1515981647224215, + "grad_norm": 0.875, + "learning_rate": 0.00019478520551727784, + "loss": 1.2253, + "step": 5904 + }, + { + "epoch": 0.1516238419183433, + "grad_norm": 0.859375, + "learning_rate": 0.00019478378263418597, + "loss": 1.1253, + "step": 5905 + }, + { + "epoch": 0.15164951911426514, + "grad_norm": 0.81640625, + "learning_rate": 0.00019478235956219863, + "loss": 1.0973, + "step": 5906 + }, + { + "epoch": 0.15167519631018694, + "grad_norm": 0.83203125, + "learning_rate": 0.00019478093630131852, + "loss": 1.0235, + "step": 5907 + }, + { + "epoch": 0.15170087350610875, + "grad_norm": 0.80078125, + "learning_rate": 0.00019477951285154865, + "loss": 1.0398, + "step": 5908 + }, + { + "epoch": 0.1517265507020306, + "grad_norm": 0.7734375, + "learning_rate": 0.00019477808921289174, + "loss": 1.0783, + "step": 5909 + }, + { + "epoch": 0.1517522278979524, + "grad_norm": 0.8046875, + "learning_rate": 0.00019477666538535065, + "loss": 1.2295, + "step": 5910 + }, + { + "epoch": 0.15177790509387423, + "grad_norm": 0.82421875, + "learning_rate": 0.00019477524136892824, + "loss": 1.0956, + "step": 5911 + }, + { + "epoch": 0.15180358228979604, + "grad_norm": 0.84375, + "learning_rate": 0.00019477381716362735, + "loss": 0.9847, + "step": 5912 + }, + { + "epoch": 0.15182925948571785, + "grad_norm": 0.828125, + "learning_rate": 0.0001947723927694508, + "loss": 1.0965, + "step": 5913 + }, + { + "epoch": 0.15185493668163969, + "grad_norm": 0.8203125, + "learning_rate": 0.0001947709681864014, + "loss": 1.1403, + "step": 5914 + }, + { + "epoch": 0.1518806138775615, + "grad_norm": 0.90234375, + "learning_rate": 0.00019476954341448202, + "loss": 1.0677, + "step": 5915 + }, + { + "epoch": 0.15190629107348333, + "grad_norm": 0.875, + "learning_rate": 0.00019476811845369553, + "loss": 1.1182, + "step": 5916 + }, + { + "epoch": 0.15193196826940514, + "grad_norm": 0.8359375, + "learning_rate": 0.0001947666933040447, + "loss": 1.117, + "step": 5917 + }, + { + "epoch": 0.15195764546532695, + "grad_norm": 0.91015625, + "learning_rate": 0.00019476526796553248, + "loss": 1.2004, + "step": 5918 + }, + { + "epoch": 0.15198332266124878, + "grad_norm": 0.82421875, + "learning_rate": 0.00019476384243816158, + "loss": 1.2507, + "step": 5919 + }, + { + "epoch": 0.1520089998571706, + "grad_norm": 0.84765625, + "learning_rate": 0.00019476241672193494, + "loss": 1.0971, + "step": 5920 + }, + { + "epoch": 0.15203467705309243, + "grad_norm": 0.828125, + "learning_rate": 0.00019476099081685533, + "loss": 1.2621, + "step": 5921 + }, + { + "epoch": 0.15206035424901423, + "grad_norm": 0.8828125, + "learning_rate": 0.00019475956472292565, + "loss": 1.1016, + "step": 5922 + }, + { + "epoch": 0.15208603144493604, + "grad_norm": 0.85546875, + "learning_rate": 0.00019475813844014873, + "loss": 1.051, + "step": 5923 + }, + { + "epoch": 0.15211170864085788, + "grad_norm": 0.80078125, + "learning_rate": 0.00019475671196852735, + "loss": 1.112, + "step": 5924 + }, + { + "epoch": 0.1521373858367797, + "grad_norm": 0.84375, + "learning_rate": 0.00019475528530806443, + "loss": 1.0857, + "step": 5925 + }, + { + "epoch": 0.15216306303270152, + "grad_norm": 0.87109375, + "learning_rate": 0.0001947538584587628, + "loss": 1.0712, + "step": 5926 + }, + { + "epoch": 0.15218874022862333, + "grad_norm": 0.8359375, + "learning_rate": 0.00019475243142062527, + "loss": 1.0456, + "step": 5927 + }, + { + "epoch": 0.15221441742454514, + "grad_norm": 0.8359375, + "learning_rate": 0.0001947510041936547, + "loss": 0.9992, + "step": 5928 + }, + { + "epoch": 0.15224009462046698, + "grad_norm": 0.765625, + "learning_rate": 0.00019474957677785394, + "loss": 0.9702, + "step": 5929 + }, + { + "epoch": 0.15226577181638878, + "grad_norm": 0.9296875, + "learning_rate": 0.00019474814917322584, + "loss": 1.168, + "step": 5930 + }, + { + "epoch": 0.15229144901231062, + "grad_norm": 0.875, + "learning_rate": 0.00019474672137977325, + "loss": 1.0427, + "step": 5931 + }, + { + "epoch": 0.15231712620823243, + "grad_norm": 0.80078125, + "learning_rate": 0.000194745293397499, + "loss": 1.1387, + "step": 5932 + }, + { + "epoch": 0.15234280340415424, + "grad_norm": 0.80859375, + "learning_rate": 0.00019474386522640592, + "loss": 1.034, + "step": 5933 + }, + { + "epoch": 0.15236848060007607, + "grad_norm": 0.796875, + "learning_rate": 0.00019474243686649686, + "loss": 1.1741, + "step": 5934 + }, + { + "epoch": 0.15239415779599788, + "grad_norm": 0.80859375, + "learning_rate": 0.0001947410083177747, + "loss": 1.1665, + "step": 5935 + }, + { + "epoch": 0.15241983499191972, + "grad_norm": 0.9296875, + "learning_rate": 0.00019473957958024229, + "loss": 1.0716, + "step": 5936 + }, + { + "epoch": 0.15244551218784153, + "grad_norm": 0.84765625, + "learning_rate": 0.00019473815065390243, + "loss": 1.0451, + "step": 5937 + }, + { + "epoch": 0.15247118938376333, + "grad_norm": 0.81640625, + "learning_rate": 0.000194736721538758, + "loss": 1.0837, + "step": 5938 + }, + { + "epoch": 0.15249686657968517, + "grad_norm": 0.875, + "learning_rate": 0.00019473529223481184, + "loss": 1.1885, + "step": 5939 + }, + { + "epoch": 0.15252254377560698, + "grad_norm": 0.80859375, + "learning_rate": 0.00019473386274206683, + "loss": 1.0767, + "step": 5940 + }, + { + "epoch": 0.15254822097152881, + "grad_norm": 0.80078125, + "learning_rate": 0.00019473243306052573, + "loss": 1.0671, + "step": 5941 + }, + { + "epoch": 0.15257389816745062, + "grad_norm": 0.875, + "learning_rate": 0.00019473100319019148, + "loss": 1.2469, + "step": 5942 + }, + { + "epoch": 0.15259957536337243, + "grad_norm": 0.91015625, + "learning_rate": 0.0001947295731310669, + "loss": 1.0783, + "step": 5943 + }, + { + "epoch": 0.15262525255929427, + "grad_norm": 0.76953125, + "learning_rate": 0.00019472814288315485, + "loss": 1.0396, + "step": 5944 + }, + { + "epoch": 0.15265092975521607, + "grad_norm": 0.921875, + "learning_rate": 0.00019472671244645816, + "loss": 1.1789, + "step": 5945 + }, + { + "epoch": 0.1526766069511379, + "grad_norm": 1.0546875, + "learning_rate": 0.00019472528182097967, + "loss": 1.1704, + "step": 5946 + }, + { + "epoch": 0.15270228414705972, + "grad_norm": 0.82421875, + "learning_rate": 0.00019472385100672223, + "loss": 1.0395, + "step": 5947 + }, + { + "epoch": 0.15272796134298153, + "grad_norm": 0.80859375, + "learning_rate": 0.00019472242000368874, + "loss": 1.025, + "step": 5948 + }, + { + "epoch": 0.15275363853890336, + "grad_norm": 0.8125, + "learning_rate": 0.00019472098881188204, + "loss": 1.0492, + "step": 5949 + }, + { + "epoch": 0.15277931573482517, + "grad_norm": 0.8515625, + "learning_rate": 0.00019471955743130494, + "loss": 0.975, + "step": 5950 + }, + { + "epoch": 0.152804992930747, + "grad_norm": 0.83203125, + "learning_rate": 0.00019471812586196034, + "loss": 1.0289, + "step": 5951 + }, + { + "epoch": 0.15283067012666882, + "grad_norm": 0.82421875, + "learning_rate": 0.00019471669410385105, + "loss": 0.9706, + "step": 5952 + }, + { + "epoch": 0.15285634732259062, + "grad_norm": 0.80078125, + "learning_rate": 0.00019471526215697997, + "loss": 1.1338, + "step": 5953 + }, + { + "epoch": 0.15288202451851246, + "grad_norm": 0.7890625, + "learning_rate": 0.00019471383002134991, + "loss": 1.0879, + "step": 5954 + }, + { + "epoch": 0.15290770171443427, + "grad_norm": 0.78125, + "learning_rate": 0.00019471239769696376, + "loss": 1.0088, + "step": 5955 + }, + { + "epoch": 0.1529333789103561, + "grad_norm": 0.78125, + "learning_rate": 0.00019471096518382431, + "loss": 1.0458, + "step": 5956 + }, + { + "epoch": 0.1529590561062779, + "grad_norm": 0.875, + "learning_rate": 0.00019470953248193452, + "loss": 1.048, + "step": 5957 + }, + { + "epoch": 0.15298473330219972, + "grad_norm": 0.765625, + "learning_rate": 0.00019470809959129716, + "loss": 0.9338, + "step": 5958 + }, + { + "epoch": 0.15301041049812156, + "grad_norm": 0.82421875, + "learning_rate": 0.00019470666651191508, + "loss": 1.0274, + "step": 5959 + }, + { + "epoch": 0.15303608769404337, + "grad_norm": 0.7890625, + "learning_rate": 0.0001947052332437912, + "loss": 0.9167, + "step": 5960 + }, + { + "epoch": 0.1530617648899652, + "grad_norm": 0.796875, + "learning_rate": 0.00019470379978692837, + "loss": 1.0083, + "step": 5961 + }, + { + "epoch": 0.153087442085887, + "grad_norm": 0.79296875, + "learning_rate": 0.00019470236614132938, + "loss": 1.0598, + "step": 5962 + }, + { + "epoch": 0.15311311928180882, + "grad_norm": 0.81640625, + "learning_rate": 0.00019470093230699716, + "loss": 1.0752, + "step": 5963 + }, + { + "epoch": 0.15313879647773065, + "grad_norm": 0.77734375, + "learning_rate": 0.0001946994982839345, + "loss": 1.0394, + "step": 5964 + }, + { + "epoch": 0.15316447367365246, + "grad_norm": 0.80859375, + "learning_rate": 0.00019469806407214433, + "loss": 0.9591, + "step": 5965 + }, + { + "epoch": 0.1531901508695743, + "grad_norm": 0.8203125, + "learning_rate": 0.00019469662967162946, + "loss": 1.0596, + "step": 5966 + }, + { + "epoch": 0.1532158280654961, + "grad_norm": 0.9296875, + "learning_rate": 0.00019469519508239275, + "loss": 0.9725, + "step": 5967 + }, + { + "epoch": 0.15324150526141792, + "grad_norm": 0.85546875, + "learning_rate": 0.00019469376030443707, + "loss": 1.1805, + "step": 5968 + }, + { + "epoch": 0.15326718245733975, + "grad_norm": 0.8984375, + "learning_rate": 0.00019469232533776533, + "loss": 1.21, + "step": 5969 + }, + { + "epoch": 0.15329285965326156, + "grad_norm": 0.890625, + "learning_rate": 0.00019469089018238028, + "loss": 1.1231, + "step": 5970 + }, + { + "epoch": 0.1533185368491834, + "grad_norm": 0.80859375, + "learning_rate": 0.00019468945483828485, + "loss": 1.187, + "step": 5971 + }, + { + "epoch": 0.1533442140451052, + "grad_norm": 0.8125, + "learning_rate": 0.0001946880193054819, + "loss": 1.0387, + "step": 5972 + }, + { + "epoch": 0.153369891241027, + "grad_norm": 0.80078125, + "learning_rate": 0.00019468658358397426, + "loss": 1.0667, + "step": 5973 + }, + { + "epoch": 0.15339556843694885, + "grad_norm": 0.83984375, + "learning_rate": 0.00019468514767376482, + "loss": 1.0722, + "step": 5974 + }, + { + "epoch": 0.15342124563287066, + "grad_norm": 0.83984375, + "learning_rate": 0.00019468371157485646, + "loss": 0.9845, + "step": 5975 + }, + { + "epoch": 0.1534469228287925, + "grad_norm": 0.8515625, + "learning_rate": 0.000194682275287252, + "loss": 1.0668, + "step": 5976 + }, + { + "epoch": 0.1534726000247143, + "grad_norm": 0.94140625, + "learning_rate": 0.00019468083881095432, + "loss": 1.0804, + "step": 5977 + }, + { + "epoch": 0.1534982772206361, + "grad_norm": 0.71875, + "learning_rate": 0.00019467940214596628, + "loss": 0.9507, + "step": 5978 + }, + { + "epoch": 0.15352395441655795, + "grad_norm": 0.8515625, + "learning_rate": 0.00019467796529229071, + "loss": 1.03, + "step": 5979 + }, + { + "epoch": 0.15354963161247975, + "grad_norm": 0.82421875, + "learning_rate": 0.00019467652824993055, + "loss": 1.0918, + "step": 5980 + }, + { + "epoch": 0.1535753088084016, + "grad_norm": 0.8515625, + "learning_rate": 0.00019467509101888864, + "loss": 1.1139, + "step": 5981 + }, + { + "epoch": 0.1536009860043234, + "grad_norm": 0.7890625, + "learning_rate": 0.0001946736535991678, + "loss": 1.2347, + "step": 5982 + }, + { + "epoch": 0.1536266632002452, + "grad_norm": 0.828125, + "learning_rate": 0.00019467221599077088, + "loss": 1.1614, + "step": 5983 + }, + { + "epoch": 0.15365234039616704, + "grad_norm": 0.86328125, + "learning_rate": 0.00019467077819370084, + "loss": 1.1518, + "step": 5984 + }, + { + "epoch": 0.15367801759208885, + "grad_norm": 0.87890625, + "learning_rate": 0.00019466934020796045, + "loss": 1.0268, + "step": 5985 + }, + { + "epoch": 0.1537036947880107, + "grad_norm": 0.8359375, + "learning_rate": 0.00019466790203355265, + "loss": 1.0554, + "step": 5986 + }, + { + "epoch": 0.1537293719839325, + "grad_norm": 0.80859375, + "learning_rate": 0.00019466646367048023, + "loss": 1.0793, + "step": 5987 + }, + { + "epoch": 0.1537550491798543, + "grad_norm": 0.8046875, + "learning_rate": 0.00019466502511874615, + "loss": 1.046, + "step": 5988 + }, + { + "epoch": 0.15378072637577614, + "grad_norm": 0.828125, + "learning_rate": 0.0001946635863783532, + "loss": 0.9658, + "step": 5989 + }, + { + "epoch": 0.15380640357169795, + "grad_norm": 0.859375, + "learning_rate": 0.0001946621474493043, + "loss": 1.0275, + "step": 5990 + }, + { + "epoch": 0.15383208076761978, + "grad_norm": 0.828125, + "learning_rate": 0.00019466070833160226, + "loss": 1.1074, + "step": 5991 + }, + { + "epoch": 0.1538577579635416, + "grad_norm": 0.80859375, + "learning_rate": 0.00019465926902525001, + "loss": 1.1391, + "step": 5992 + }, + { + "epoch": 0.1538834351594634, + "grad_norm": 0.82421875, + "learning_rate": 0.00019465782953025036, + "loss": 1.1955, + "step": 5993 + }, + { + "epoch": 0.15390911235538524, + "grad_norm": 0.86328125, + "learning_rate": 0.0001946563898466062, + "loss": 1.1746, + "step": 5994 + }, + { + "epoch": 0.15393478955130704, + "grad_norm": 0.84765625, + "learning_rate": 0.00019465494997432042, + "loss": 1.0656, + "step": 5995 + }, + { + "epoch": 0.15396046674722888, + "grad_norm": 0.8671875, + "learning_rate": 0.00019465350991339586, + "loss": 1.1727, + "step": 5996 + }, + { + "epoch": 0.1539861439431507, + "grad_norm": 0.8515625, + "learning_rate": 0.00019465206966383543, + "loss": 1.0244, + "step": 5997 + }, + { + "epoch": 0.1540118211390725, + "grad_norm": 0.8828125, + "learning_rate": 0.00019465062922564196, + "loss": 1.0508, + "step": 5998 + }, + { + "epoch": 0.15403749833499433, + "grad_norm": 0.81640625, + "learning_rate": 0.00019464918859881834, + "loss": 1.0789, + "step": 5999 + }, + { + "epoch": 0.15406317553091614, + "grad_norm": 0.8046875, + "learning_rate": 0.00019464774778336742, + "loss": 1.2009, + "step": 6000 + }, + { + "epoch": 0.15406317553091614, + "eval_loss": 1.0851809978485107, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 404.3626, + "eval_samples_per_second": 24.73, + "eval_steps_per_second": 0.774, + "step": 6000 + }, + { + "epoch": 0.15408885272683798, + "grad_norm": 0.828125, + "learning_rate": 0.0001946463067792921, + "loss": 0.9998, + "step": 6001 + }, + { + "epoch": 0.15411452992275979, + "grad_norm": 0.80078125, + "learning_rate": 0.00019464486558659524, + "loss": 1.0888, + "step": 6002 + }, + { + "epoch": 0.1541402071186816, + "grad_norm": 0.8515625, + "learning_rate": 0.00019464342420527972, + "loss": 1.1131, + "step": 6003 + }, + { + "epoch": 0.15416588431460343, + "grad_norm": 0.86328125, + "learning_rate": 0.0001946419826353484, + "loss": 1.1634, + "step": 6004 + }, + { + "epoch": 0.15419156151052524, + "grad_norm": 0.8203125, + "learning_rate": 0.00019464054087680417, + "loss": 1.0768, + "step": 6005 + }, + { + "epoch": 0.15421723870644707, + "grad_norm": 0.83984375, + "learning_rate": 0.00019463909892964988, + "loss": 1.2743, + "step": 6006 + }, + { + "epoch": 0.15424291590236888, + "grad_norm": 0.83984375, + "learning_rate": 0.0001946376567938884, + "loss": 1.1455, + "step": 6007 + }, + { + "epoch": 0.1542685930982907, + "grad_norm": 0.8125, + "learning_rate": 0.00019463621446952264, + "loss": 1.1447, + "step": 6008 + }, + { + "epoch": 0.15429427029421253, + "grad_norm": 0.8359375, + "learning_rate": 0.0001946347719565554, + "loss": 1.0407, + "step": 6009 + }, + { + "epoch": 0.15431994749013433, + "grad_norm": 0.875, + "learning_rate": 0.0001946333292549897, + "loss": 1.0727, + "step": 6010 + }, + { + "epoch": 0.15434562468605617, + "grad_norm": 0.828125, + "learning_rate": 0.00019463188636482826, + "loss": 1.0912, + "step": 6011 + }, + { + "epoch": 0.15437130188197798, + "grad_norm": 0.796875, + "learning_rate": 0.00019463044328607402, + "loss": 1.2039, + "step": 6012 + }, + { + "epoch": 0.1543969790778998, + "grad_norm": 0.79296875, + "learning_rate": 0.00019462900001872987, + "loss": 1.0206, + "step": 6013 + }, + { + "epoch": 0.15442265627382162, + "grad_norm": 0.80078125, + "learning_rate": 0.00019462755656279867, + "loss": 1.1256, + "step": 6014 + }, + { + "epoch": 0.15444833346974343, + "grad_norm": 0.875, + "learning_rate": 0.0001946261129182833, + "loss": 1.1931, + "step": 6015 + }, + { + "epoch": 0.15447401066566527, + "grad_norm": 0.875, + "learning_rate": 0.00019462466908518663, + "loss": 1.0245, + "step": 6016 + }, + { + "epoch": 0.15449968786158708, + "grad_norm": 0.84375, + "learning_rate": 0.00019462322506351153, + "loss": 0.9794, + "step": 6017 + }, + { + "epoch": 0.15452536505750888, + "grad_norm": 0.83984375, + "learning_rate": 0.00019462178085326092, + "loss": 1.1259, + "step": 6018 + }, + { + "epoch": 0.15455104225343072, + "grad_norm": 0.90234375, + "learning_rate": 0.0001946203364544376, + "loss": 1.2062, + "step": 6019 + }, + { + "epoch": 0.15457671944935253, + "grad_norm": 0.859375, + "learning_rate": 0.00019461889186704454, + "loss": 1.0224, + "step": 6020 + }, + { + "epoch": 0.15460239664527436, + "grad_norm": 0.82421875, + "learning_rate": 0.00019461744709108457, + "loss": 0.9961, + "step": 6021 + }, + { + "epoch": 0.15462807384119617, + "grad_norm": 0.8671875, + "learning_rate": 0.00019461600212656056, + "loss": 1.0231, + "step": 6022 + }, + { + "epoch": 0.15465375103711798, + "grad_norm": 0.859375, + "learning_rate": 0.00019461455697347541, + "loss": 0.9687, + "step": 6023 + }, + { + "epoch": 0.15467942823303982, + "grad_norm": 0.8359375, + "learning_rate": 0.000194613111631832, + "loss": 1.0788, + "step": 6024 + }, + { + "epoch": 0.15470510542896163, + "grad_norm": 0.83203125, + "learning_rate": 0.00019461166610163319, + "loss": 1.1093, + "step": 6025 + }, + { + "epoch": 0.15473078262488346, + "grad_norm": 0.85546875, + "learning_rate": 0.0001946102203828819, + "loss": 1.0648, + "step": 6026 + }, + { + "epoch": 0.15475645982080527, + "grad_norm": 0.83984375, + "learning_rate": 0.00019460877447558097, + "loss": 1.0379, + "step": 6027 + }, + { + "epoch": 0.15478213701672708, + "grad_norm": 0.8125, + "learning_rate": 0.0001946073283797333, + "loss": 1.0047, + "step": 6028 + }, + { + "epoch": 0.15480781421264891, + "grad_norm": 0.83984375, + "learning_rate": 0.00019460588209534177, + "loss": 1.1892, + "step": 6029 + }, + { + "epoch": 0.15483349140857072, + "grad_norm": 0.9296875, + "learning_rate": 0.00019460443562240926, + "loss": 1.1795, + "step": 6030 + }, + { + "epoch": 0.15485916860449256, + "grad_norm": 0.796875, + "learning_rate": 0.00019460298896093866, + "loss": 1.0261, + "step": 6031 + }, + { + "epoch": 0.15488484580041437, + "grad_norm": 0.80078125, + "learning_rate": 0.00019460154211093283, + "loss": 1.1103, + "step": 6032 + }, + { + "epoch": 0.15491052299633618, + "grad_norm": 0.81640625, + "learning_rate": 0.0001946000950723947, + "loss": 0.9444, + "step": 6033 + }, + { + "epoch": 0.154936200192258, + "grad_norm": 0.75, + "learning_rate": 0.00019459864784532714, + "loss": 0.783, + "step": 6034 + }, + { + "epoch": 0.15496187738817982, + "grad_norm": 0.77734375, + "learning_rate": 0.000194597200429733, + "loss": 1.1611, + "step": 6035 + }, + { + "epoch": 0.15498755458410166, + "grad_norm": 0.78125, + "learning_rate": 0.00019459575282561518, + "loss": 1.1165, + "step": 6036 + }, + { + "epoch": 0.15501323178002346, + "grad_norm": 0.76953125, + "learning_rate": 0.00019459430503297655, + "loss": 1.0885, + "step": 6037 + }, + { + "epoch": 0.15503890897594527, + "grad_norm": 0.8203125, + "learning_rate": 0.00019459285705182003, + "loss": 1.1182, + "step": 6038 + }, + { + "epoch": 0.1550645861718671, + "grad_norm": 0.7734375, + "learning_rate": 0.0001945914088821485, + "loss": 0.9116, + "step": 6039 + }, + { + "epoch": 0.15509026336778892, + "grad_norm": 0.8046875, + "learning_rate": 0.00019458996052396484, + "loss": 1.0503, + "step": 6040 + }, + { + "epoch": 0.15511594056371075, + "grad_norm": 0.83984375, + "learning_rate": 0.0001945885119772719, + "loss": 1.1879, + "step": 6041 + }, + { + "epoch": 0.15514161775963256, + "grad_norm": 0.90625, + "learning_rate": 0.00019458706324207263, + "loss": 1.1278, + "step": 6042 + }, + { + "epoch": 0.15516729495555437, + "grad_norm": 0.85546875, + "learning_rate": 0.00019458561431836988, + "loss": 1.0797, + "step": 6043 + }, + { + "epoch": 0.1551929721514762, + "grad_norm": 0.78125, + "learning_rate": 0.00019458416520616657, + "loss": 1.0581, + "step": 6044 + }, + { + "epoch": 0.155218649347398, + "grad_norm": 0.86328125, + "learning_rate": 0.00019458271590546553, + "loss": 1.0967, + "step": 6045 + }, + { + "epoch": 0.15524432654331985, + "grad_norm": 0.7421875, + "learning_rate": 0.00019458126641626967, + "loss": 1.0004, + "step": 6046 + }, + { + "epoch": 0.15527000373924166, + "grad_norm": 0.828125, + "learning_rate": 0.0001945798167385819, + "loss": 1.1525, + "step": 6047 + }, + { + "epoch": 0.15529568093516347, + "grad_norm": 0.92578125, + "learning_rate": 0.00019457836687240514, + "loss": 1.1435, + "step": 6048 + }, + { + "epoch": 0.1553213581310853, + "grad_norm": 0.8046875, + "learning_rate": 0.00019457691681774218, + "loss": 1.0954, + "step": 6049 + }, + { + "epoch": 0.1553470353270071, + "grad_norm": 0.88671875, + "learning_rate": 0.000194575466574596, + "loss": 0.9812, + "step": 6050 + }, + { + "epoch": 0.15537271252292895, + "grad_norm": 0.84375, + "learning_rate": 0.00019457401614296943, + "loss": 1.1133, + "step": 6051 + }, + { + "epoch": 0.15539838971885075, + "grad_norm": 0.8515625, + "learning_rate": 0.00019457256552286544, + "loss": 1.1167, + "step": 6052 + }, + { + "epoch": 0.15542406691477256, + "grad_norm": 0.8515625, + "learning_rate": 0.00019457111471428683, + "loss": 1.1115, + "step": 6053 + }, + { + "epoch": 0.1554497441106944, + "grad_norm": 0.85546875, + "learning_rate": 0.00019456966371723654, + "loss": 1.0732, + "step": 6054 + }, + { + "epoch": 0.1554754213066162, + "grad_norm": 0.83203125, + "learning_rate": 0.00019456821253171746, + "loss": 1.1798, + "step": 6055 + }, + { + "epoch": 0.15550109850253804, + "grad_norm": 0.85546875, + "learning_rate": 0.00019456676115773246, + "loss": 1.0509, + "step": 6056 + }, + { + "epoch": 0.15552677569845985, + "grad_norm": 0.77734375, + "learning_rate": 0.00019456530959528445, + "loss": 1.0832, + "step": 6057 + }, + { + "epoch": 0.15555245289438166, + "grad_norm": 0.890625, + "learning_rate": 0.00019456385784437635, + "loss": 1.2718, + "step": 6058 + }, + { + "epoch": 0.1555781300903035, + "grad_norm": 0.8046875, + "learning_rate": 0.00019456240590501098, + "loss": 0.9842, + "step": 6059 + }, + { + "epoch": 0.1556038072862253, + "grad_norm": 0.83984375, + "learning_rate": 0.0001945609537771913, + "loss": 1.1404, + "step": 6060 + }, + { + "epoch": 0.15562948448214714, + "grad_norm": 0.79296875, + "learning_rate": 0.00019455950146092017, + "loss": 1.0205, + "step": 6061 + }, + { + "epoch": 0.15565516167806895, + "grad_norm": 0.8515625, + "learning_rate": 0.00019455804895620048, + "loss": 1.1504, + "step": 6062 + }, + { + "epoch": 0.15568083887399076, + "grad_norm": 0.796875, + "learning_rate": 0.00019455659626303517, + "loss": 1.035, + "step": 6063 + }, + { + "epoch": 0.1557065160699126, + "grad_norm": 0.90234375, + "learning_rate": 0.0001945551433814271, + "loss": 1.1498, + "step": 6064 + }, + { + "epoch": 0.1557321932658344, + "grad_norm": 0.82421875, + "learning_rate": 0.00019455369031137916, + "loss": 1.1355, + "step": 6065 + }, + { + "epoch": 0.15575787046175624, + "grad_norm": 0.91015625, + "learning_rate": 0.00019455223705289426, + "loss": 1.1004, + "step": 6066 + }, + { + "epoch": 0.15578354765767805, + "grad_norm": 0.8359375, + "learning_rate": 0.0001945507836059753, + "loss": 1.0897, + "step": 6067 + }, + { + "epoch": 0.15580922485359985, + "grad_norm": 0.94140625, + "learning_rate": 0.00019454932997062517, + "loss": 1.1273, + "step": 6068 + }, + { + "epoch": 0.1558349020495217, + "grad_norm": 0.82421875, + "learning_rate": 0.00019454787614684674, + "loss": 1.0991, + "step": 6069 + }, + { + "epoch": 0.1558605792454435, + "grad_norm": 0.80859375, + "learning_rate": 0.00019454642213464295, + "loss": 0.9566, + "step": 6070 + }, + { + "epoch": 0.15588625644136533, + "grad_norm": 0.8359375, + "learning_rate": 0.00019454496793401668, + "loss": 1.0151, + "step": 6071 + }, + { + "epoch": 0.15591193363728714, + "grad_norm": 0.84375, + "learning_rate": 0.00019454351354497085, + "loss": 1.1543, + "step": 6072 + }, + { + "epoch": 0.15593761083320895, + "grad_norm": 0.8984375, + "learning_rate": 0.00019454205896750833, + "loss": 1.0971, + "step": 6073 + }, + { + "epoch": 0.1559632880291308, + "grad_norm": 0.828125, + "learning_rate": 0.000194540604201632, + "loss": 1.1455, + "step": 6074 + }, + { + "epoch": 0.1559889652250526, + "grad_norm": 0.76953125, + "learning_rate": 0.00019453914924734482, + "loss": 1.0705, + "step": 6075 + }, + { + "epoch": 0.15601464242097443, + "grad_norm": 0.83203125, + "learning_rate": 0.00019453769410464965, + "loss": 1.2085, + "step": 6076 + }, + { + "epoch": 0.15604031961689624, + "grad_norm": 0.8671875, + "learning_rate": 0.00019453623877354936, + "loss": 1.232, + "step": 6077 + }, + { + "epoch": 0.15606599681281805, + "grad_norm": 0.81640625, + "learning_rate": 0.0001945347832540469, + "loss": 1.0325, + "step": 6078 + }, + { + "epoch": 0.15609167400873988, + "grad_norm": 0.765625, + "learning_rate": 0.00019453332754614517, + "loss": 1.0719, + "step": 6079 + }, + { + "epoch": 0.1561173512046617, + "grad_norm": 0.90234375, + "learning_rate": 0.00019453187164984707, + "loss": 1.15, + "step": 6080 + }, + { + "epoch": 0.15614302840058353, + "grad_norm": 0.90625, + "learning_rate": 0.00019453041556515544, + "loss": 0.9665, + "step": 6081 + }, + { + "epoch": 0.15616870559650534, + "grad_norm": 0.8515625, + "learning_rate": 0.00019452895929207329, + "loss": 1.0646, + "step": 6082 + }, + { + "epoch": 0.15619438279242714, + "grad_norm": 0.81640625, + "learning_rate": 0.0001945275028306034, + "loss": 0.9959, + "step": 6083 + }, + { + "epoch": 0.15622005998834898, + "grad_norm": 0.87890625, + "learning_rate": 0.0001945260461807488, + "loss": 1.0477, + "step": 6084 + }, + { + "epoch": 0.1562457371842708, + "grad_norm": 0.78515625, + "learning_rate": 0.00019452458934251227, + "loss": 1.1142, + "step": 6085 + }, + { + "epoch": 0.15627141438019262, + "grad_norm": 0.8984375, + "learning_rate": 0.00019452313231589684, + "loss": 1.0748, + "step": 6086 + }, + { + "epoch": 0.15629709157611443, + "grad_norm": 0.8671875, + "learning_rate": 0.0001945216751009053, + "loss": 1.147, + "step": 6087 + }, + { + "epoch": 0.15632276877203624, + "grad_norm": 0.9375, + "learning_rate": 0.0001945202176975406, + "loss": 1.0541, + "step": 6088 + }, + { + "epoch": 0.15634844596795808, + "grad_norm": 0.83203125, + "learning_rate": 0.00019451876010580566, + "loss": 1.1023, + "step": 6089 + }, + { + "epoch": 0.15637412316387989, + "grad_norm": 0.79296875, + "learning_rate": 0.00019451730232570333, + "loss": 1.1736, + "step": 6090 + }, + { + "epoch": 0.15639980035980172, + "grad_norm": 0.91015625, + "learning_rate": 0.00019451584435723658, + "loss": 1.2036, + "step": 6091 + }, + { + "epoch": 0.15642547755572353, + "grad_norm": 0.8046875, + "learning_rate": 0.0001945143862004083, + "loss": 1.0188, + "step": 6092 + }, + { + "epoch": 0.15645115475164534, + "grad_norm": 0.8125, + "learning_rate": 0.0001945129278552214, + "loss": 1.007, + "step": 6093 + }, + { + "epoch": 0.15647683194756717, + "grad_norm": 0.87890625, + "learning_rate": 0.00019451146932167874, + "loss": 1.1962, + "step": 6094 + }, + { + "epoch": 0.15650250914348898, + "grad_norm": 0.828125, + "learning_rate": 0.00019451001059978324, + "loss": 1.1459, + "step": 6095 + }, + { + "epoch": 0.1565281863394108, + "grad_norm": 0.83984375, + "learning_rate": 0.00019450855168953789, + "loss": 0.9963, + "step": 6096 + }, + { + "epoch": 0.15655386353533263, + "grad_norm": 0.859375, + "learning_rate": 0.00019450709259094548, + "loss": 1.1184, + "step": 6097 + }, + { + "epoch": 0.15657954073125444, + "grad_norm": 0.75, + "learning_rate": 0.000194505633304009, + "loss": 1.0024, + "step": 6098 + }, + { + "epoch": 0.15660521792717627, + "grad_norm": 0.8515625, + "learning_rate": 0.0001945041738287313, + "loss": 0.9962, + "step": 6099 + }, + { + "epoch": 0.15663089512309808, + "grad_norm": 0.87109375, + "learning_rate": 0.00019450271416511535, + "loss": 1.2535, + "step": 6100 + }, + { + "epoch": 0.1566565723190199, + "grad_norm": 0.8125, + "learning_rate": 0.000194501254313164, + "loss": 1.0959, + "step": 6101 + }, + { + "epoch": 0.15668224951494172, + "grad_norm": 0.78125, + "learning_rate": 0.0001944997942728802, + "loss": 0.9177, + "step": 6102 + }, + { + "epoch": 0.15670792671086353, + "grad_norm": 0.8359375, + "learning_rate": 0.00019449833404426684, + "loss": 1.0809, + "step": 6103 + }, + { + "epoch": 0.15673360390678537, + "grad_norm": 0.8515625, + "learning_rate": 0.00019449687362732684, + "loss": 1.1151, + "step": 6104 + }, + { + "epoch": 0.15675928110270718, + "grad_norm": 0.8828125, + "learning_rate": 0.00019449541302206312, + "loss": 0.9837, + "step": 6105 + }, + { + "epoch": 0.15678495829862898, + "grad_norm": 0.84375, + "learning_rate": 0.00019449395222847859, + "loss": 1.1523, + "step": 6106 + }, + { + "epoch": 0.15681063549455082, + "grad_norm": 0.7890625, + "learning_rate": 0.00019449249124657613, + "loss": 1.2253, + "step": 6107 + }, + { + "epoch": 0.15683631269047263, + "grad_norm": 0.8203125, + "learning_rate": 0.00019449103007635865, + "loss": 1.0471, + "step": 6108 + }, + { + "epoch": 0.15686198988639447, + "grad_norm": 0.73828125, + "learning_rate": 0.0001944895687178291, + "loss": 1.2108, + "step": 6109 + }, + { + "epoch": 0.15688766708231627, + "grad_norm": 0.80859375, + "learning_rate": 0.00019448810717099038, + "loss": 1.0581, + "step": 6110 + }, + { + "epoch": 0.15691334427823808, + "grad_norm": 0.84375, + "learning_rate": 0.00019448664543584538, + "loss": 1.0152, + "step": 6111 + }, + { + "epoch": 0.15693902147415992, + "grad_norm": 0.7734375, + "learning_rate": 0.00019448518351239703, + "loss": 1.0241, + "step": 6112 + }, + { + "epoch": 0.15696469867008173, + "grad_norm": 0.9453125, + "learning_rate": 0.00019448372140064824, + "loss": 1.315, + "step": 6113 + }, + { + "epoch": 0.15699037586600356, + "grad_norm": 0.84375, + "learning_rate": 0.00019448225910060195, + "loss": 1.1319, + "step": 6114 + }, + { + "epoch": 0.15701605306192537, + "grad_norm": 0.86328125, + "learning_rate": 0.0001944807966122611, + "loss": 1.0867, + "step": 6115 + }, + { + "epoch": 0.15704173025784718, + "grad_norm": 0.8125, + "learning_rate": 0.00019447933393562848, + "loss": 1.041, + "step": 6116 + }, + { + "epoch": 0.15706740745376901, + "grad_norm": 0.83984375, + "learning_rate": 0.0001944778710707071, + "loss": 1.0734, + "step": 6117 + }, + { + "epoch": 0.15709308464969082, + "grad_norm": 0.81640625, + "learning_rate": 0.00019447640801749985, + "loss": 1.0504, + "step": 6118 + }, + { + "epoch": 0.15711876184561266, + "grad_norm": 0.8125, + "learning_rate": 0.00019447494477600966, + "loss": 1.1174, + "step": 6119 + }, + { + "epoch": 0.15714443904153447, + "grad_norm": 0.796875, + "learning_rate": 0.00019447348134623944, + "loss": 1.1113, + "step": 6120 + }, + { + "epoch": 0.15717011623745628, + "grad_norm": 0.828125, + "learning_rate": 0.00019447201772819213, + "loss": 1.0023, + "step": 6121 + }, + { + "epoch": 0.1571957934333781, + "grad_norm": 0.79296875, + "learning_rate": 0.00019447055392187058, + "loss": 0.995, + "step": 6122 + }, + { + "epoch": 0.15722147062929992, + "grad_norm": 0.8359375, + "learning_rate": 0.00019446908992727777, + "loss": 1.1349, + "step": 6123 + }, + { + "epoch": 0.15724714782522176, + "grad_norm": 0.82421875, + "learning_rate": 0.0001944676257444166, + "loss": 1.0351, + "step": 6124 + }, + { + "epoch": 0.15727282502114356, + "grad_norm": 0.92578125, + "learning_rate": 0.00019446616137329, + "loss": 1.08, + "step": 6125 + }, + { + "epoch": 0.15729850221706537, + "grad_norm": 0.8828125, + "learning_rate": 0.00019446469681390084, + "loss": 1.1271, + "step": 6126 + }, + { + "epoch": 0.1573241794129872, + "grad_norm": 0.8359375, + "learning_rate": 0.0001944632320662521, + "loss": 0.9418, + "step": 6127 + }, + { + "epoch": 0.15734985660890902, + "grad_norm": 0.88671875, + "learning_rate": 0.00019446176713034668, + "loss": 1.2227, + "step": 6128 + }, + { + "epoch": 0.15737553380483085, + "grad_norm": 0.85546875, + "learning_rate": 0.00019446030200618742, + "loss": 1.0116, + "step": 6129 + }, + { + "epoch": 0.15740121100075266, + "grad_norm": 0.84375, + "learning_rate": 0.0001944588366937774, + "loss": 1.1094, + "step": 6130 + }, + { + "epoch": 0.15742688819667447, + "grad_norm": 0.8359375, + "learning_rate": 0.00019445737119311939, + "loss": 1.0561, + "step": 6131 + }, + { + "epoch": 0.1574525653925963, + "grad_norm": 0.81640625, + "learning_rate": 0.00019445590550421643, + "loss": 1.0, + "step": 6132 + }, + { + "epoch": 0.1574782425885181, + "grad_norm": 0.81640625, + "learning_rate": 0.00019445443962707134, + "loss": 1.0785, + "step": 6133 + }, + { + "epoch": 0.15750391978443995, + "grad_norm": 0.83203125, + "learning_rate": 0.0001944529735616871, + "loss": 1.2712, + "step": 6134 + }, + { + "epoch": 0.15752959698036176, + "grad_norm": 0.80078125, + "learning_rate": 0.00019445150730806657, + "loss": 1.0988, + "step": 6135 + }, + { + "epoch": 0.15755527417628357, + "grad_norm": 0.8125, + "learning_rate": 0.00019445004086621274, + "loss": 1.0208, + "step": 6136 + }, + { + "epoch": 0.1575809513722054, + "grad_norm": 0.91015625, + "learning_rate": 0.00019444857423612854, + "loss": 1.1941, + "step": 6137 + }, + { + "epoch": 0.1576066285681272, + "grad_norm": 0.8125, + "learning_rate": 0.00019444710741781687, + "loss": 1.0027, + "step": 6138 + }, + { + "epoch": 0.15763230576404905, + "grad_norm": 0.8203125, + "learning_rate": 0.0001944456404112806, + "loss": 0.9189, + "step": 6139 + }, + { + "epoch": 0.15765798295997085, + "grad_norm": 0.87109375, + "learning_rate": 0.00019444417321652272, + "loss": 1.0851, + "step": 6140 + }, + { + "epoch": 0.15768366015589266, + "grad_norm": 0.7578125, + "learning_rate": 0.00019444270583354614, + "loss": 0.9577, + "step": 6141 + }, + { + "epoch": 0.1577093373518145, + "grad_norm": 0.78125, + "learning_rate": 0.00019444123826235375, + "loss": 1.1332, + "step": 6142 + }, + { + "epoch": 0.1577350145477363, + "grad_norm": 0.7734375, + "learning_rate": 0.00019443977050294855, + "loss": 1.003, + "step": 6143 + }, + { + "epoch": 0.15776069174365814, + "grad_norm": 0.81640625, + "learning_rate": 0.0001944383025553334, + "loss": 1.1188, + "step": 6144 + }, + { + "epoch": 0.15778636893957995, + "grad_norm": 0.8359375, + "learning_rate": 0.00019443683441951125, + "loss": 1.0885, + "step": 6145 + }, + { + "epoch": 0.15781204613550176, + "grad_norm": 0.78515625, + "learning_rate": 0.000194435366095485, + "loss": 1.1755, + "step": 6146 + }, + { + "epoch": 0.1578377233314236, + "grad_norm": 0.80078125, + "learning_rate": 0.0001944338975832576, + "loss": 1.0333, + "step": 6147 + }, + { + "epoch": 0.1578634005273454, + "grad_norm": 0.84375, + "learning_rate": 0.00019443242888283197, + "loss": 1.0598, + "step": 6148 + }, + { + "epoch": 0.15788907772326724, + "grad_norm": 0.80078125, + "learning_rate": 0.00019443095999421104, + "loss": 1.1494, + "step": 6149 + }, + { + "epoch": 0.15791475491918905, + "grad_norm": 0.87890625, + "learning_rate": 0.00019442949091739778, + "loss": 1.1391, + "step": 6150 + }, + { + "epoch": 0.15794043211511086, + "grad_norm": 0.80859375, + "learning_rate": 0.000194428021652395, + "loss": 0.962, + "step": 6151 + }, + { + "epoch": 0.1579661093110327, + "grad_norm": 0.828125, + "learning_rate": 0.00019442655219920573, + "loss": 1.2519, + "step": 6152 + }, + { + "epoch": 0.1579917865069545, + "grad_norm": 0.765625, + "learning_rate": 0.0001944250825578329, + "loss": 1.088, + "step": 6153 + }, + { + "epoch": 0.15801746370287634, + "grad_norm": 0.85546875, + "learning_rate": 0.00019442361272827937, + "loss": 1.1046, + "step": 6154 + }, + { + "epoch": 0.15804314089879815, + "grad_norm": 0.81640625, + "learning_rate": 0.00019442214271054816, + "loss": 1.1516, + "step": 6155 + }, + { + "epoch": 0.15806881809471995, + "grad_norm": 0.87109375, + "learning_rate": 0.00019442067250464211, + "loss": 1.0381, + "step": 6156 + }, + { + "epoch": 0.1580944952906418, + "grad_norm": 0.76953125, + "learning_rate": 0.0001944192021105642, + "loss": 1.06, + "step": 6157 + }, + { + "epoch": 0.1581201724865636, + "grad_norm": 0.83203125, + "learning_rate": 0.00019441773152831734, + "loss": 1.0494, + "step": 6158 + }, + { + "epoch": 0.15814584968248543, + "grad_norm": 0.8359375, + "learning_rate": 0.0001944162607579045, + "loss": 1.1321, + "step": 6159 + }, + { + "epoch": 0.15817152687840724, + "grad_norm": 0.87109375, + "learning_rate": 0.00019441478979932852, + "loss": 1.0787, + "step": 6160 + }, + { + "epoch": 0.15819720407432905, + "grad_norm": 0.87109375, + "learning_rate": 0.00019441331865259246, + "loss": 1.0384, + "step": 6161 + }, + { + "epoch": 0.1582228812702509, + "grad_norm": 0.8359375, + "learning_rate": 0.00019441184731769915, + "loss": 1.0395, + "step": 6162 + }, + { + "epoch": 0.1582485584661727, + "grad_norm": 0.83984375, + "learning_rate": 0.00019441037579465156, + "loss": 1.11, + "step": 6163 + }, + { + "epoch": 0.15827423566209453, + "grad_norm": 0.8515625, + "learning_rate": 0.0001944089040834526, + "loss": 1.0986, + "step": 6164 + }, + { + "epoch": 0.15829991285801634, + "grad_norm": 0.81640625, + "learning_rate": 0.00019440743218410525, + "loss": 1.0757, + "step": 6165 + }, + { + "epoch": 0.15832559005393815, + "grad_norm": 0.83203125, + "learning_rate": 0.0001944059600966124, + "loss": 1.0748, + "step": 6166 + }, + { + "epoch": 0.15835126724985998, + "grad_norm": 0.8203125, + "learning_rate": 0.00019440448782097702, + "loss": 1.1537, + "step": 6167 + }, + { + "epoch": 0.1583769444457818, + "grad_norm": 0.80859375, + "learning_rate": 0.000194403015357202, + "loss": 1.1438, + "step": 6168 + }, + { + "epoch": 0.15840262164170363, + "grad_norm": 0.78515625, + "learning_rate": 0.00019440154270529032, + "loss": 1.028, + "step": 6169 + }, + { + "epoch": 0.15842829883762544, + "grad_norm": 0.96875, + "learning_rate": 0.00019440006986524486, + "loss": 1.1326, + "step": 6170 + }, + { + "epoch": 0.15845397603354724, + "grad_norm": 0.84375, + "learning_rate": 0.0001943985968370686, + "loss": 1.1927, + "step": 6171 + }, + { + "epoch": 0.15847965322946908, + "grad_norm": 0.7890625, + "learning_rate": 0.00019439712362076447, + "loss": 0.9654, + "step": 6172 + }, + { + "epoch": 0.1585053304253909, + "grad_norm": 0.84375, + "learning_rate": 0.0001943956502163354, + "loss": 1.0672, + "step": 6173 + }, + { + "epoch": 0.15853100762131273, + "grad_norm": 0.828125, + "learning_rate": 0.00019439417662378433, + "loss": 1.1257, + "step": 6174 + }, + { + "epoch": 0.15855668481723453, + "grad_norm": 0.8046875, + "learning_rate": 0.00019439270284311419, + "loss": 1.0827, + "step": 6175 + }, + { + "epoch": 0.15858236201315634, + "grad_norm": 0.78515625, + "learning_rate": 0.0001943912288743279, + "loss": 1.1645, + "step": 6176 + }, + { + "epoch": 0.15860803920907818, + "grad_norm": 0.828125, + "learning_rate": 0.00019438975471742847, + "loss": 1.1818, + "step": 6177 + }, + { + "epoch": 0.15863371640499999, + "grad_norm": 0.8515625, + "learning_rate": 0.00019438828037241873, + "loss": 1.1555, + "step": 6178 + }, + { + "epoch": 0.15865939360092182, + "grad_norm": 0.80859375, + "learning_rate": 0.00019438680583930168, + "loss": 1.0079, + "step": 6179 + }, + { + "epoch": 0.15868507079684363, + "grad_norm": 0.828125, + "learning_rate": 0.0001943853311180803, + "loss": 1.0836, + "step": 6180 + }, + { + "epoch": 0.15871074799276544, + "grad_norm": 0.7890625, + "learning_rate": 0.00019438385620875744, + "loss": 1.006, + "step": 6181 + }, + { + "epoch": 0.15873642518868727, + "grad_norm": 0.828125, + "learning_rate": 0.00019438238111133606, + "loss": 1.083, + "step": 6182 + }, + { + "epoch": 0.15876210238460908, + "grad_norm": 0.8359375, + "learning_rate": 0.00019438090582581916, + "loss": 0.9426, + "step": 6183 + }, + { + "epoch": 0.15878777958053092, + "grad_norm": 0.8515625, + "learning_rate": 0.00019437943035220966, + "loss": 1.064, + "step": 6184 + }, + { + "epoch": 0.15881345677645273, + "grad_norm": 0.79296875, + "learning_rate": 0.0001943779546905104, + "loss": 1.0255, + "step": 6185 + }, + { + "epoch": 0.15883913397237454, + "grad_norm": 0.78515625, + "learning_rate": 0.00019437647884072446, + "loss": 0.9664, + "step": 6186 + }, + { + "epoch": 0.15886481116829637, + "grad_norm": 0.88671875, + "learning_rate": 0.00019437500280285473, + "loss": 1.028, + "step": 6187 + }, + { + "epoch": 0.15889048836421818, + "grad_norm": 0.80078125, + "learning_rate": 0.00019437352657690414, + "loss": 1.1007, + "step": 6188 + }, + { + "epoch": 0.15891616556014002, + "grad_norm": 0.84375, + "learning_rate": 0.0001943720501628756, + "loss": 1.118, + "step": 6189 + }, + { + "epoch": 0.15894184275606182, + "grad_norm": 0.7734375, + "learning_rate": 0.00019437057356077212, + "loss": 1.0241, + "step": 6190 + }, + { + "epoch": 0.15896751995198363, + "grad_norm": 0.84765625, + "learning_rate": 0.0001943690967705966, + "loss": 1.1271, + "step": 6191 + }, + { + "epoch": 0.15899319714790547, + "grad_norm": 0.91796875, + "learning_rate": 0.000194367619792352, + "loss": 1.1453, + "step": 6192 + }, + { + "epoch": 0.15901887434382728, + "grad_norm": 0.8125, + "learning_rate": 0.00019436614262604126, + "loss": 1.1376, + "step": 6193 + }, + { + "epoch": 0.1590445515397491, + "grad_norm": 0.83203125, + "learning_rate": 0.0001943646652716673, + "loss": 1.1731, + "step": 6194 + }, + { + "epoch": 0.15907022873567092, + "grad_norm": 0.91796875, + "learning_rate": 0.0001943631877292331, + "loss": 1.0898, + "step": 6195 + }, + { + "epoch": 0.15909590593159273, + "grad_norm": 0.828125, + "learning_rate": 0.0001943617099987416, + "loss": 1.1441, + "step": 6196 + }, + { + "epoch": 0.15912158312751457, + "grad_norm": 0.8359375, + "learning_rate": 0.00019436023208019572, + "loss": 1.1081, + "step": 6197 + }, + { + "epoch": 0.15914726032343637, + "grad_norm": 0.80078125, + "learning_rate": 0.00019435875397359845, + "loss": 1.1328, + "step": 6198 + }, + { + "epoch": 0.1591729375193582, + "grad_norm": 0.7578125, + "learning_rate": 0.00019435727567895268, + "loss": 1.0016, + "step": 6199 + }, + { + "epoch": 0.15919861471528002, + "grad_norm": 0.859375, + "learning_rate": 0.00019435579719626137, + "loss": 1.2292, + "step": 6200 + }, + { + "epoch": 0.15922429191120183, + "grad_norm": 0.8515625, + "learning_rate": 0.0001943543185255275, + "loss": 1.2627, + "step": 6201 + }, + { + "epoch": 0.15924996910712366, + "grad_norm": 0.8046875, + "learning_rate": 0.000194352839666754, + "loss": 0.9791, + "step": 6202 + }, + { + "epoch": 0.15927564630304547, + "grad_norm": 0.83984375, + "learning_rate": 0.00019435136061994382, + "loss": 0.9924, + "step": 6203 + }, + { + "epoch": 0.1593013234989673, + "grad_norm": 0.84375, + "learning_rate": 0.00019434988138509985, + "loss": 1.0551, + "step": 6204 + }, + { + "epoch": 0.15932700069488911, + "grad_norm": 0.80078125, + "learning_rate": 0.00019434840196222515, + "loss": 1.1417, + "step": 6205 + }, + { + "epoch": 0.15935267789081092, + "grad_norm": 0.77734375, + "learning_rate": 0.00019434692235132257, + "loss": 1.0267, + "step": 6206 + }, + { + "epoch": 0.15937835508673276, + "grad_norm": 0.88671875, + "learning_rate": 0.00019434544255239513, + "loss": 1.0968, + "step": 6207 + }, + { + "epoch": 0.15940403228265457, + "grad_norm": 0.81640625, + "learning_rate": 0.00019434396256544573, + "loss": 1.0393, + "step": 6208 + }, + { + "epoch": 0.1594297094785764, + "grad_norm": 0.83203125, + "learning_rate": 0.00019434248239047733, + "loss": 1.0693, + "step": 6209 + }, + { + "epoch": 0.1594553866744982, + "grad_norm": 0.8125, + "learning_rate": 0.00019434100202749287, + "loss": 1.0262, + "step": 6210 + }, + { + "epoch": 0.15948106387042002, + "grad_norm": 0.90625, + "learning_rate": 0.0001943395214764953, + "loss": 1.1673, + "step": 6211 + }, + { + "epoch": 0.15950674106634186, + "grad_norm": 0.84375, + "learning_rate": 0.00019433804073748764, + "loss": 1.0051, + "step": 6212 + }, + { + "epoch": 0.15953241826226366, + "grad_norm": 0.85546875, + "learning_rate": 0.00019433655981047278, + "loss": 1.0838, + "step": 6213 + }, + { + "epoch": 0.1595580954581855, + "grad_norm": 0.90234375, + "learning_rate": 0.00019433507869545365, + "loss": 1.084, + "step": 6214 + }, + { + "epoch": 0.1595837726541073, + "grad_norm": 0.84765625, + "learning_rate": 0.00019433359739243325, + "loss": 0.9971, + "step": 6215 + }, + { + "epoch": 0.15960944985002912, + "grad_norm": 0.828125, + "learning_rate": 0.0001943321159014145, + "loss": 1.1289, + "step": 6216 + }, + { + "epoch": 0.15963512704595095, + "grad_norm": 0.95703125, + "learning_rate": 0.00019433063422240036, + "loss": 1.1298, + "step": 6217 + }, + { + "epoch": 0.15966080424187276, + "grad_norm": 0.8203125, + "learning_rate": 0.0001943291523553938, + "loss": 1.133, + "step": 6218 + }, + { + "epoch": 0.1596864814377946, + "grad_norm": 0.84765625, + "learning_rate": 0.00019432767030039773, + "loss": 1.2181, + "step": 6219 + }, + { + "epoch": 0.1597121586337164, + "grad_norm": 0.83984375, + "learning_rate": 0.00019432618805741516, + "loss": 1.0111, + "step": 6220 + }, + { + "epoch": 0.15973783582963821, + "grad_norm": 0.8515625, + "learning_rate": 0.000194324705626449, + "loss": 1.1041, + "step": 6221 + }, + { + "epoch": 0.15976351302556005, + "grad_norm": 0.7890625, + "learning_rate": 0.00019432322300750227, + "loss": 1.1576, + "step": 6222 + }, + { + "epoch": 0.15978919022148186, + "grad_norm": 0.828125, + "learning_rate": 0.00019432174020057782, + "loss": 1.0478, + "step": 6223 + }, + { + "epoch": 0.1598148674174037, + "grad_norm": 0.71875, + "learning_rate": 0.00019432025720567868, + "loss": 1.1809, + "step": 6224 + }, + { + "epoch": 0.1598405446133255, + "grad_norm": 0.7734375, + "learning_rate": 0.0001943187740228078, + "loss": 0.9984, + "step": 6225 + }, + { + "epoch": 0.1598662218092473, + "grad_norm": 0.8203125, + "learning_rate": 0.0001943172906519681, + "loss": 0.9325, + "step": 6226 + }, + { + "epoch": 0.15989189900516915, + "grad_norm": 0.8671875, + "learning_rate": 0.0001943158070931626, + "loss": 0.9927, + "step": 6227 + }, + { + "epoch": 0.15991757620109096, + "grad_norm": 0.828125, + "learning_rate": 0.00019431432334639416, + "loss": 1.1364, + "step": 6228 + }, + { + "epoch": 0.1599432533970128, + "grad_norm": 0.8828125, + "learning_rate": 0.00019431283941166583, + "loss": 1.0296, + "step": 6229 + }, + { + "epoch": 0.1599689305929346, + "grad_norm": 0.8359375, + "learning_rate": 0.00019431135528898052, + "loss": 1.131, + "step": 6230 + }, + { + "epoch": 0.1599946077888564, + "grad_norm": 0.85546875, + "learning_rate": 0.00019430987097834122, + "loss": 1.1306, + "step": 6231 + }, + { + "epoch": 0.16002028498477824, + "grad_norm": 0.87109375, + "learning_rate": 0.00019430838647975084, + "loss": 1.0486, + "step": 6232 + }, + { + "epoch": 0.16004596218070005, + "grad_norm": 0.84375, + "learning_rate": 0.0001943069017932124, + "loss": 1.1732, + "step": 6233 + }, + { + "epoch": 0.1600716393766219, + "grad_norm": 0.796875, + "learning_rate": 0.00019430541691872874, + "loss": 0.9618, + "step": 6234 + }, + { + "epoch": 0.1600973165725437, + "grad_norm": 0.87890625, + "learning_rate": 0.00019430393185630298, + "loss": 1.0072, + "step": 6235 + }, + { + "epoch": 0.1601229937684655, + "grad_norm": 0.75, + "learning_rate": 0.00019430244660593798, + "loss": 1.0817, + "step": 6236 + }, + { + "epoch": 0.16014867096438734, + "grad_norm": 0.734375, + "learning_rate": 0.00019430096116763673, + "loss": 1.1069, + "step": 6237 + }, + { + "epoch": 0.16017434816030915, + "grad_norm": 0.78515625, + "learning_rate": 0.00019429947554140217, + "loss": 1.2286, + "step": 6238 + }, + { + "epoch": 0.16020002535623099, + "grad_norm": 0.859375, + "learning_rate": 0.00019429798972723727, + "loss": 1.1716, + "step": 6239 + }, + { + "epoch": 0.1602257025521528, + "grad_norm": 0.76953125, + "learning_rate": 0.000194296503725145, + "loss": 1.1649, + "step": 6240 + }, + { + "epoch": 0.1602513797480746, + "grad_norm": 0.78515625, + "learning_rate": 0.0001942950175351283, + "loss": 0.9757, + "step": 6241 + }, + { + "epoch": 0.16027705694399644, + "grad_norm": 0.828125, + "learning_rate": 0.00019429353115719018, + "loss": 1.0162, + "step": 6242 + }, + { + "epoch": 0.16030273413991825, + "grad_norm": 0.828125, + "learning_rate": 0.00019429204459133357, + "loss": 1.254, + "step": 6243 + }, + { + "epoch": 0.16032841133584008, + "grad_norm": 0.82421875, + "learning_rate": 0.0001942905578375614, + "loss": 1.0868, + "step": 6244 + }, + { + "epoch": 0.1603540885317619, + "grad_norm": 0.8125, + "learning_rate": 0.0001942890708958767, + "loss": 1.0188, + "step": 6245 + }, + { + "epoch": 0.1603797657276837, + "grad_norm": 0.8828125, + "learning_rate": 0.00019428758376628238, + "loss": 1.1057, + "step": 6246 + }, + { + "epoch": 0.16040544292360553, + "grad_norm": 0.78515625, + "learning_rate": 0.00019428609644878142, + "loss": 0.98, + "step": 6247 + }, + { + "epoch": 0.16043112011952734, + "grad_norm": 0.81640625, + "learning_rate": 0.0001942846089433768, + "loss": 1.0562, + "step": 6248 + }, + { + "epoch": 0.16045679731544918, + "grad_norm": 0.83984375, + "learning_rate": 0.00019428312125007144, + "loss": 1.112, + "step": 6249 + }, + { + "epoch": 0.160482474511371, + "grad_norm": 0.83984375, + "learning_rate": 0.00019428163336886834, + "loss": 1.0981, + "step": 6250 + }, + { + "epoch": 0.1605081517072928, + "grad_norm": 0.7734375, + "learning_rate": 0.0001942801452997705, + "loss": 0.9996, + "step": 6251 + }, + { + "epoch": 0.16053382890321463, + "grad_norm": 0.84765625, + "learning_rate": 0.00019427865704278082, + "loss": 1.0372, + "step": 6252 + }, + { + "epoch": 0.16055950609913644, + "grad_norm": 0.8203125, + "learning_rate": 0.0001942771685979023, + "loss": 1.0138, + "step": 6253 + }, + { + "epoch": 0.16058518329505828, + "grad_norm": 0.82421875, + "learning_rate": 0.00019427567996513793, + "loss": 1.1162, + "step": 6254 + }, + { + "epoch": 0.16061086049098008, + "grad_norm": 0.87890625, + "learning_rate": 0.0001942741911444906, + "loss": 1.1036, + "step": 6255 + }, + { + "epoch": 0.1606365376869019, + "grad_norm": 0.76953125, + "learning_rate": 0.00019427270213596333, + "loss": 0.994, + "step": 6256 + }, + { + "epoch": 0.16066221488282373, + "grad_norm": 0.8046875, + "learning_rate": 0.0001942712129395591, + "loss": 0.9784, + "step": 6257 + }, + { + "epoch": 0.16068789207874554, + "grad_norm": 0.84375, + "learning_rate": 0.00019426972355528083, + "loss": 1.107, + "step": 6258 + }, + { + "epoch": 0.16071356927466737, + "grad_norm": 0.8515625, + "learning_rate": 0.00019426823398313153, + "loss": 1.0503, + "step": 6259 + }, + { + "epoch": 0.16073924647058918, + "grad_norm": 0.80859375, + "learning_rate": 0.00019426674422311414, + "loss": 1.166, + "step": 6260 + }, + { + "epoch": 0.160764923666511, + "grad_norm": 0.796875, + "learning_rate": 0.00019426525427523167, + "loss": 1.1391, + "step": 6261 + }, + { + "epoch": 0.16079060086243283, + "grad_norm": 0.765625, + "learning_rate": 0.00019426376413948706, + "loss": 1.1458, + "step": 6262 + }, + { + "epoch": 0.16081627805835463, + "grad_norm": 0.8203125, + "learning_rate": 0.0001942622738158833, + "loss": 1.0929, + "step": 6263 + }, + { + "epoch": 0.16084195525427647, + "grad_norm": 0.8046875, + "learning_rate": 0.0001942607833044233, + "loss": 1.1892, + "step": 6264 + }, + { + "epoch": 0.16086763245019828, + "grad_norm": 0.80078125, + "learning_rate": 0.00019425929260511007, + "loss": 1.2022, + "step": 6265 + }, + { + "epoch": 0.1608933096461201, + "grad_norm": 0.78125, + "learning_rate": 0.0001942578017179466, + "loss": 1.0443, + "step": 6266 + }, + { + "epoch": 0.16091898684204192, + "grad_norm": 0.83203125, + "learning_rate": 0.00019425631064293585, + "loss": 0.9666, + "step": 6267 + }, + { + "epoch": 0.16094466403796373, + "grad_norm": 0.84375, + "learning_rate": 0.0001942548193800808, + "loss": 1.051, + "step": 6268 + }, + { + "epoch": 0.16097034123388557, + "grad_norm": 0.73828125, + "learning_rate": 0.0001942533279293844, + "loss": 0.8988, + "step": 6269 + }, + { + "epoch": 0.16099601842980737, + "grad_norm": 0.8984375, + "learning_rate": 0.00019425183629084963, + "loss": 1.0332, + "step": 6270 + }, + { + "epoch": 0.16102169562572918, + "grad_norm": 0.77734375, + "learning_rate": 0.00019425034446447946, + "loss": 1.1481, + "step": 6271 + }, + { + "epoch": 0.16104737282165102, + "grad_norm": 0.8515625, + "learning_rate": 0.0001942488524502769, + "loss": 1.109, + "step": 6272 + }, + { + "epoch": 0.16107305001757283, + "grad_norm": 0.81640625, + "learning_rate": 0.00019424736024824485, + "loss": 1.0792, + "step": 6273 + }, + { + "epoch": 0.16109872721349466, + "grad_norm": 0.91796875, + "learning_rate": 0.00019424586785838632, + "loss": 1.1161, + "step": 6274 + }, + { + "epoch": 0.16112440440941647, + "grad_norm": 0.84765625, + "learning_rate": 0.0001942443752807043, + "loss": 0.976, + "step": 6275 + }, + { + "epoch": 0.16115008160533828, + "grad_norm": 0.89453125, + "learning_rate": 0.00019424288251520175, + "loss": 1.1484, + "step": 6276 + }, + { + "epoch": 0.16117575880126012, + "grad_norm": 0.8203125, + "learning_rate": 0.00019424138956188167, + "loss": 1.0702, + "step": 6277 + }, + { + "epoch": 0.16120143599718192, + "grad_norm": 0.8125, + "learning_rate": 0.00019423989642074698, + "loss": 1.0562, + "step": 6278 + }, + { + "epoch": 0.16122711319310376, + "grad_norm": 0.81640625, + "learning_rate": 0.00019423840309180072, + "loss": 1.0498, + "step": 6279 + }, + { + "epoch": 0.16125279038902557, + "grad_norm": 0.7734375, + "learning_rate": 0.00019423690957504582, + "loss": 1.0373, + "step": 6280 + }, + { + "epoch": 0.16127846758494738, + "grad_norm": 0.8203125, + "learning_rate": 0.00019423541587048528, + "loss": 1.1354, + "step": 6281 + }, + { + "epoch": 0.1613041447808692, + "grad_norm": 0.84375, + "learning_rate": 0.00019423392197812206, + "loss": 1.1892, + "step": 6282 + }, + { + "epoch": 0.16132982197679102, + "grad_norm": 0.83984375, + "learning_rate": 0.00019423242789795914, + "loss": 1.028, + "step": 6283 + }, + { + "epoch": 0.16135549917271286, + "grad_norm": 0.796875, + "learning_rate": 0.00019423093362999953, + "loss": 0.9083, + "step": 6284 + }, + { + "epoch": 0.16138117636863467, + "grad_norm": 0.81640625, + "learning_rate": 0.00019422943917424618, + "loss": 1.1995, + "step": 6285 + }, + { + "epoch": 0.16140685356455647, + "grad_norm": 0.87890625, + "learning_rate": 0.00019422794453070202, + "loss": 1.2902, + "step": 6286 + }, + { + "epoch": 0.1614325307604783, + "grad_norm": 0.81640625, + "learning_rate": 0.00019422644969937012, + "loss": 0.9506, + "step": 6287 + }, + { + "epoch": 0.16145820795640012, + "grad_norm": 0.828125, + "learning_rate": 0.0001942249546802534, + "loss": 1.0282, + "step": 6288 + }, + { + "epoch": 0.16148388515232195, + "grad_norm": 0.91015625, + "learning_rate": 0.00019422345947335489, + "loss": 1.037, + "step": 6289 + }, + { + "epoch": 0.16150956234824376, + "grad_norm": 0.75, + "learning_rate": 0.00019422196407867747, + "loss": 1.1191, + "step": 6290 + }, + { + "epoch": 0.16153523954416557, + "grad_norm": 0.88671875, + "learning_rate": 0.00019422046849622423, + "loss": 1.1482, + "step": 6291 + }, + { + "epoch": 0.1615609167400874, + "grad_norm": 0.82421875, + "learning_rate": 0.00019421897272599813, + "loss": 1.0987, + "step": 6292 + }, + { + "epoch": 0.16158659393600922, + "grad_norm": 1.6015625, + "learning_rate": 0.00019421747676800209, + "loss": 1.1847, + "step": 6293 + }, + { + "epoch": 0.16161227113193105, + "grad_norm": 0.84765625, + "learning_rate": 0.00019421598062223914, + "loss": 1.1041, + "step": 6294 + }, + { + "epoch": 0.16163794832785286, + "grad_norm": 0.83984375, + "learning_rate": 0.00019421448428871225, + "loss": 1.1547, + "step": 6295 + }, + { + "epoch": 0.16166362552377467, + "grad_norm": 0.85546875, + "learning_rate": 0.00019421298776742442, + "loss": 1.1676, + "step": 6296 + }, + { + "epoch": 0.1616893027196965, + "grad_norm": 0.77734375, + "learning_rate": 0.0001942114910583786, + "loss": 1.0102, + "step": 6297 + }, + { + "epoch": 0.1617149799156183, + "grad_norm": 0.7734375, + "learning_rate": 0.00019420999416157778, + "loss": 0.9418, + "step": 6298 + }, + { + "epoch": 0.16174065711154015, + "grad_norm": 0.78515625, + "learning_rate": 0.00019420849707702496, + "loss": 1.1251, + "step": 6299 + }, + { + "epoch": 0.16176633430746196, + "grad_norm": 0.76953125, + "learning_rate": 0.00019420699980472313, + "loss": 1.083, + "step": 6300 + }, + { + "epoch": 0.16179201150338376, + "grad_norm": 0.81640625, + "learning_rate": 0.00019420550234467522, + "loss": 0.9916, + "step": 6301 + }, + { + "epoch": 0.1618176886993056, + "grad_norm": 0.8125, + "learning_rate": 0.00019420400469688428, + "loss": 1.1267, + "step": 6302 + }, + { + "epoch": 0.1618433658952274, + "grad_norm": 0.84765625, + "learning_rate": 0.00019420250686135327, + "loss": 1.3763, + "step": 6303 + }, + { + "epoch": 0.16186904309114922, + "grad_norm": 0.7890625, + "learning_rate": 0.00019420100883808518, + "loss": 0.9073, + "step": 6304 + }, + { + "epoch": 0.16189472028707105, + "grad_norm": 0.7890625, + "learning_rate": 0.00019419951062708298, + "loss": 1.1359, + "step": 6305 + }, + { + "epoch": 0.16192039748299286, + "grad_norm": 0.8125, + "learning_rate": 0.00019419801222834965, + "loss": 0.9859, + "step": 6306 + }, + { + "epoch": 0.1619460746789147, + "grad_norm": 0.81640625, + "learning_rate": 0.00019419651364188821, + "loss": 1.0763, + "step": 6307 + }, + { + "epoch": 0.1619717518748365, + "grad_norm": 0.8359375, + "learning_rate": 0.0001941950148677016, + "loss": 1.0639, + "step": 6308 + }, + { + "epoch": 0.16199742907075831, + "grad_norm": 0.828125, + "learning_rate": 0.00019419351590579286, + "loss": 1.1248, + "step": 6309 + }, + { + "epoch": 0.16202310626668015, + "grad_norm": 1.1171875, + "learning_rate": 0.00019419201675616496, + "loss": 1.1127, + "step": 6310 + }, + { + "epoch": 0.16204878346260196, + "grad_norm": 0.76171875, + "learning_rate": 0.00019419051741882085, + "loss": 0.9315, + "step": 6311 + }, + { + "epoch": 0.1620744606585238, + "grad_norm": 0.8046875, + "learning_rate": 0.00019418901789376359, + "loss": 0.972, + "step": 6312 + }, + { + "epoch": 0.1621001378544456, + "grad_norm": 0.859375, + "learning_rate": 0.00019418751818099607, + "loss": 1.0294, + "step": 6313 + }, + { + "epoch": 0.1621258150503674, + "grad_norm": 0.8046875, + "learning_rate": 0.00019418601828052136, + "loss": 1.1021, + "step": 6314 + }, + { + "epoch": 0.16215149224628925, + "grad_norm": 1.09375, + "learning_rate": 0.0001941845181923424, + "loss": 1.2379, + "step": 6315 + }, + { + "epoch": 0.16217716944221106, + "grad_norm": 0.765625, + "learning_rate": 0.00019418301791646227, + "loss": 0.8657, + "step": 6316 + }, + { + "epoch": 0.1622028466381329, + "grad_norm": 0.90625, + "learning_rate": 0.00019418151745288385, + "loss": 1.0596, + "step": 6317 + }, + { + "epoch": 0.1622285238340547, + "grad_norm": 0.859375, + "learning_rate": 0.00019418001680161017, + "loss": 1.0519, + "step": 6318 + }, + { + "epoch": 0.1622542010299765, + "grad_norm": 0.921875, + "learning_rate": 0.00019417851596264423, + "loss": 0.9655, + "step": 6319 + }, + { + "epoch": 0.16227987822589834, + "grad_norm": 0.83984375, + "learning_rate": 0.000194177014935989, + "loss": 0.9116, + "step": 6320 + }, + { + "epoch": 0.16230555542182015, + "grad_norm": 0.82421875, + "learning_rate": 0.00019417551372164751, + "loss": 1.1511, + "step": 6321 + }, + { + "epoch": 0.162331232617742, + "grad_norm": 0.8125, + "learning_rate": 0.0001941740123196227, + "loss": 1.0437, + "step": 6322 + }, + { + "epoch": 0.1623569098136638, + "grad_norm": 0.98046875, + "learning_rate": 0.0001941725107299176, + "loss": 0.9798, + "step": 6323 + }, + { + "epoch": 0.1623825870095856, + "grad_norm": 0.8984375, + "learning_rate": 0.00019417100895253523, + "loss": 0.9242, + "step": 6324 + }, + { + "epoch": 0.16240826420550744, + "grad_norm": 0.828125, + "learning_rate": 0.00019416950698747848, + "loss": 1.1557, + "step": 6325 + }, + { + "epoch": 0.16243394140142925, + "grad_norm": 0.84375, + "learning_rate": 0.00019416800483475045, + "loss": 1.0456, + "step": 6326 + }, + { + "epoch": 0.16245961859735109, + "grad_norm": 0.82421875, + "learning_rate": 0.0001941665024943541, + "loss": 1.0852, + "step": 6327 + }, + { + "epoch": 0.1624852957932729, + "grad_norm": 0.9375, + "learning_rate": 0.0001941649999662924, + "loss": 1.162, + "step": 6328 + }, + { + "epoch": 0.1625109729891947, + "grad_norm": 2.765625, + "learning_rate": 0.00019416349725056838, + "loss": 0.9654, + "step": 6329 + }, + { + "epoch": 0.16253665018511654, + "grad_norm": 0.8828125, + "learning_rate": 0.00019416199434718498, + "loss": 1.3226, + "step": 6330 + }, + { + "epoch": 0.16256232738103835, + "grad_norm": 0.98046875, + "learning_rate": 0.00019416049125614527, + "loss": 1.0373, + "step": 6331 + }, + { + "epoch": 0.16258800457696018, + "grad_norm": 0.8828125, + "learning_rate": 0.00019415898797745218, + "loss": 1.0616, + "step": 6332 + }, + { + "epoch": 0.162613681772882, + "grad_norm": 0.875, + "learning_rate": 0.0001941574845111087, + "loss": 1.0933, + "step": 6333 + }, + { + "epoch": 0.1626393589688038, + "grad_norm": 0.8046875, + "learning_rate": 0.0001941559808571179, + "loss": 1.0688, + "step": 6334 + }, + { + "epoch": 0.16266503616472563, + "grad_norm": 0.81640625, + "learning_rate": 0.00019415447701548276, + "loss": 0.9489, + "step": 6335 + }, + { + "epoch": 0.16269071336064744, + "grad_norm": 0.89453125, + "learning_rate": 0.0001941529729862062, + "loss": 1.0447, + "step": 6336 + }, + { + "epoch": 0.16271639055656928, + "grad_norm": 0.83203125, + "learning_rate": 0.0001941514687692913, + "loss": 1.0753, + "step": 6337 + }, + { + "epoch": 0.1627420677524911, + "grad_norm": 0.78125, + "learning_rate": 0.00019414996436474104, + "loss": 1.1647, + "step": 6338 + }, + { + "epoch": 0.1627677449484129, + "grad_norm": 0.85546875, + "learning_rate": 0.00019414845977255838, + "loss": 0.9797, + "step": 6339 + }, + { + "epoch": 0.16279342214433473, + "grad_norm": 0.796875, + "learning_rate": 0.00019414695499274636, + "loss": 0.9883, + "step": 6340 + }, + { + "epoch": 0.16281909934025654, + "grad_norm": 0.82421875, + "learning_rate": 0.00019414545002530796, + "loss": 1.0721, + "step": 6341 + }, + { + "epoch": 0.16284477653617838, + "grad_norm": 0.76953125, + "learning_rate": 0.00019414394487024617, + "loss": 1.0202, + "step": 6342 + }, + { + "epoch": 0.16287045373210018, + "grad_norm": 0.83984375, + "learning_rate": 0.000194142439527564, + "loss": 1.1866, + "step": 6343 + }, + { + "epoch": 0.162896130928022, + "grad_norm": 0.7734375, + "learning_rate": 0.00019414093399726444, + "loss": 1.08, + "step": 6344 + }, + { + "epoch": 0.16292180812394383, + "grad_norm": 0.84375, + "learning_rate": 0.00019413942827935053, + "loss": 1.1168, + "step": 6345 + }, + { + "epoch": 0.16294748531986564, + "grad_norm": 0.83203125, + "learning_rate": 0.00019413792237382526, + "loss": 1.0864, + "step": 6346 + }, + { + "epoch": 0.16297316251578747, + "grad_norm": 0.890625, + "learning_rate": 0.00019413641628069156, + "loss": 1.0358, + "step": 6347 + }, + { + "epoch": 0.16299883971170928, + "grad_norm": 0.76953125, + "learning_rate": 0.00019413490999995253, + "loss": 0.9823, + "step": 6348 + }, + { + "epoch": 0.1630245169076311, + "grad_norm": 0.83203125, + "learning_rate": 0.0001941334035316111, + "loss": 1.0927, + "step": 6349 + }, + { + "epoch": 0.16305019410355293, + "grad_norm": 0.75, + "learning_rate": 0.00019413189687567033, + "loss": 1.0451, + "step": 6350 + }, + { + "epoch": 0.16307587129947473, + "grad_norm": 0.8203125, + "learning_rate": 0.00019413039003213318, + "loss": 1.0946, + "step": 6351 + }, + { + "epoch": 0.16310154849539657, + "grad_norm": 0.86328125, + "learning_rate": 0.00019412888300100267, + "loss": 1.0891, + "step": 6352 + }, + { + "epoch": 0.16312722569131838, + "grad_norm": 0.7890625, + "learning_rate": 0.00019412737578228178, + "loss": 1.0825, + "step": 6353 + }, + { + "epoch": 0.1631529028872402, + "grad_norm": 1.0859375, + "learning_rate": 0.00019412586837597352, + "loss": 0.9314, + "step": 6354 + }, + { + "epoch": 0.16317858008316202, + "grad_norm": 0.80078125, + "learning_rate": 0.00019412436078208094, + "loss": 1.189, + "step": 6355 + }, + { + "epoch": 0.16320425727908383, + "grad_norm": 0.8671875, + "learning_rate": 0.000194122853000607, + "loss": 1.0216, + "step": 6356 + }, + { + "epoch": 0.16322993447500567, + "grad_norm": 0.83203125, + "learning_rate": 0.0001941213450315547, + "loss": 1.0096, + "step": 6357 + }, + { + "epoch": 0.16325561167092748, + "grad_norm": 0.80078125, + "learning_rate": 0.0001941198368749271, + "loss": 1.1286, + "step": 6358 + }, + { + "epoch": 0.16328128886684928, + "grad_norm": 0.80859375, + "learning_rate": 0.00019411832853072713, + "loss": 1.1185, + "step": 6359 + }, + { + "epoch": 0.16330696606277112, + "grad_norm": 0.8203125, + "learning_rate": 0.00019411681999895785, + "loss": 1.1582, + "step": 6360 + }, + { + "epoch": 0.16333264325869293, + "grad_norm": 0.80078125, + "learning_rate": 0.00019411531127962227, + "loss": 1.0717, + "step": 6361 + }, + { + "epoch": 0.16335832045461476, + "grad_norm": 0.828125, + "learning_rate": 0.00019411380237272332, + "loss": 1.0748, + "step": 6362 + }, + { + "epoch": 0.16338399765053657, + "grad_norm": 0.81640625, + "learning_rate": 0.0001941122932782641, + "loss": 1.1038, + "step": 6363 + }, + { + "epoch": 0.16340967484645838, + "grad_norm": 1.0859375, + "learning_rate": 0.00019411078399624758, + "loss": 1.1703, + "step": 6364 + }, + { + "epoch": 0.16343535204238022, + "grad_norm": 0.84375, + "learning_rate": 0.0001941092745266768, + "loss": 0.8899, + "step": 6365 + }, + { + "epoch": 0.16346102923830202, + "grad_norm": 0.7734375, + "learning_rate": 0.00019410776486955466, + "loss": 1.0852, + "step": 6366 + }, + { + "epoch": 0.16348670643422386, + "grad_norm": 0.87890625, + "learning_rate": 0.0001941062550248843, + "loss": 1.075, + "step": 6367 + }, + { + "epoch": 0.16351238363014567, + "grad_norm": 0.83203125, + "learning_rate": 0.00019410474499266867, + "loss": 0.9424, + "step": 6368 + }, + { + "epoch": 0.16353806082606748, + "grad_norm": 0.80078125, + "learning_rate": 0.00019410323477291077, + "loss": 0.9926, + "step": 6369 + }, + { + "epoch": 0.1635637380219893, + "grad_norm": 0.98046875, + "learning_rate": 0.00019410172436561362, + "loss": 0.9391, + "step": 6370 + }, + { + "epoch": 0.16358941521791112, + "grad_norm": 0.83984375, + "learning_rate": 0.0001941002137707802, + "loss": 1.0382, + "step": 6371 + }, + { + "epoch": 0.16361509241383296, + "grad_norm": 0.74609375, + "learning_rate": 0.00019409870298841363, + "loss": 1.1562, + "step": 6372 + }, + { + "epoch": 0.16364076960975477, + "grad_norm": 0.77734375, + "learning_rate": 0.00019409719201851682, + "loss": 1.0703, + "step": 6373 + }, + { + "epoch": 0.16366644680567657, + "grad_norm": 1.140625, + "learning_rate": 0.0001940956808610928, + "loss": 0.9631, + "step": 6374 + }, + { + "epoch": 0.1636921240015984, + "grad_norm": 0.85546875, + "learning_rate": 0.0001940941695161446, + "loss": 1.1208, + "step": 6375 + }, + { + "epoch": 0.16371780119752022, + "grad_norm": 0.85546875, + "learning_rate": 0.00019409265798367518, + "loss": 1.1185, + "step": 6376 + }, + { + "epoch": 0.16374347839344205, + "grad_norm": 0.90234375, + "learning_rate": 0.00019409114626368763, + "loss": 1.0542, + "step": 6377 + }, + { + "epoch": 0.16376915558936386, + "grad_norm": 0.84375, + "learning_rate": 0.0001940896343561849, + "loss": 1.081, + "step": 6378 + }, + { + "epoch": 0.16379483278528567, + "grad_norm": 0.78515625, + "learning_rate": 0.00019408812226117005, + "loss": 0.9049, + "step": 6379 + }, + { + "epoch": 0.1638205099812075, + "grad_norm": 0.8125, + "learning_rate": 0.00019408660997864601, + "loss": 1.0037, + "step": 6380 + }, + { + "epoch": 0.16384618717712932, + "grad_norm": 0.84375, + "learning_rate": 0.00019408509750861594, + "loss": 1.0733, + "step": 6381 + }, + { + "epoch": 0.16387186437305115, + "grad_norm": 0.8203125, + "learning_rate": 0.00019408358485108274, + "loss": 1.2632, + "step": 6382 + }, + { + "epoch": 0.16389754156897296, + "grad_norm": 0.82421875, + "learning_rate": 0.00019408207200604944, + "loss": 0.9607, + "step": 6383 + }, + { + "epoch": 0.16392321876489477, + "grad_norm": 0.8671875, + "learning_rate": 0.00019408055897351907, + "loss": 1.2046, + "step": 6384 + }, + { + "epoch": 0.1639488959608166, + "grad_norm": 0.80859375, + "learning_rate": 0.00019407904575349466, + "loss": 1.1603, + "step": 6385 + }, + { + "epoch": 0.1639745731567384, + "grad_norm": 0.81640625, + "learning_rate": 0.0001940775323459792, + "loss": 1.1114, + "step": 6386 + }, + { + "epoch": 0.16400025035266025, + "grad_norm": 0.828125, + "learning_rate": 0.0001940760187509757, + "loss": 1.034, + "step": 6387 + }, + { + "epoch": 0.16402592754858206, + "grad_norm": 0.83203125, + "learning_rate": 0.00019407450496848722, + "loss": 1.0424, + "step": 6388 + }, + { + "epoch": 0.16405160474450386, + "grad_norm": 0.84765625, + "learning_rate": 0.00019407299099851675, + "loss": 1.0374, + "step": 6389 + }, + { + "epoch": 0.1640772819404257, + "grad_norm": 0.78125, + "learning_rate": 0.0001940714768410673, + "loss": 1.129, + "step": 6390 + }, + { + "epoch": 0.1641029591363475, + "grad_norm": 0.8046875, + "learning_rate": 0.00019406996249614187, + "loss": 0.9241, + "step": 6391 + }, + { + "epoch": 0.16412863633226935, + "grad_norm": 0.7734375, + "learning_rate": 0.00019406844796374353, + "loss": 1.0626, + "step": 6392 + }, + { + "epoch": 0.16415431352819115, + "grad_norm": 0.72265625, + "learning_rate": 0.00019406693324387525, + "loss": 1.0975, + "step": 6393 + }, + { + "epoch": 0.16417999072411296, + "grad_norm": 0.90234375, + "learning_rate": 0.0001940654183365401, + "loss": 1.199, + "step": 6394 + }, + { + "epoch": 0.1642056679200348, + "grad_norm": 0.984375, + "learning_rate": 0.00019406390324174105, + "loss": 1.0193, + "step": 6395 + }, + { + "epoch": 0.1642313451159566, + "grad_norm": 0.7734375, + "learning_rate": 0.00019406238795948114, + "loss": 1.0841, + "step": 6396 + }, + { + "epoch": 0.16425702231187844, + "grad_norm": 0.91015625, + "learning_rate": 0.00019406087248976338, + "loss": 1.1184, + "step": 6397 + }, + { + "epoch": 0.16428269950780025, + "grad_norm": 0.984375, + "learning_rate": 0.0001940593568325908, + "loss": 1.0929, + "step": 6398 + }, + { + "epoch": 0.16430837670372206, + "grad_norm": 0.83984375, + "learning_rate": 0.0001940578409879664, + "loss": 1.1008, + "step": 6399 + }, + { + "epoch": 0.1643340538996439, + "grad_norm": 0.84375, + "learning_rate": 0.00019405632495589324, + "loss": 1.0957, + "step": 6400 + }, + { + "epoch": 0.1643597310955657, + "grad_norm": 0.86328125, + "learning_rate": 0.00019405480873637432, + "loss": 1.0054, + "step": 6401 + }, + { + "epoch": 0.16438540829148754, + "grad_norm": 0.8671875, + "learning_rate": 0.00019405329232941268, + "loss": 1.0646, + "step": 6402 + }, + { + "epoch": 0.16441108548740935, + "grad_norm": 0.84375, + "learning_rate": 0.0001940517757350113, + "loss": 1.1465, + "step": 6403 + }, + { + "epoch": 0.16443676268333116, + "grad_norm": 0.7890625, + "learning_rate": 0.00019405025895317325, + "loss": 0.9406, + "step": 6404 + }, + { + "epoch": 0.164462439879253, + "grad_norm": 0.90234375, + "learning_rate": 0.00019404874198390148, + "loss": 1.0478, + "step": 6405 + }, + { + "epoch": 0.1644881170751748, + "grad_norm": 0.83203125, + "learning_rate": 0.0001940472248271991, + "loss": 1.0065, + "step": 6406 + }, + { + "epoch": 0.16451379427109664, + "grad_norm": 0.8203125, + "learning_rate": 0.0001940457074830691, + "loss": 1.0311, + "step": 6407 + }, + { + "epoch": 0.16453947146701844, + "grad_norm": 0.82421875, + "learning_rate": 0.00019404418995151447, + "loss": 1.0518, + "step": 6408 + }, + { + "epoch": 0.16456514866294025, + "grad_norm": 0.984375, + "learning_rate": 0.00019404267223253828, + "loss": 1.1086, + "step": 6409 + }, + { + "epoch": 0.1645908258588621, + "grad_norm": 0.828125, + "learning_rate": 0.00019404115432614355, + "loss": 0.938, + "step": 6410 + }, + { + "epoch": 0.1646165030547839, + "grad_norm": 0.8046875, + "learning_rate": 0.0001940396362323333, + "loss": 0.9732, + "step": 6411 + }, + { + "epoch": 0.16464218025070573, + "grad_norm": 0.93359375, + "learning_rate": 0.00019403811795111052, + "loss": 1.0483, + "step": 6412 + }, + { + "epoch": 0.16466785744662754, + "grad_norm": 0.8828125, + "learning_rate": 0.0001940365994824783, + "loss": 1.1147, + "step": 6413 + }, + { + "epoch": 0.16469353464254935, + "grad_norm": 0.875, + "learning_rate": 0.00019403508082643963, + "loss": 0.9717, + "step": 6414 + }, + { + "epoch": 0.16471921183847119, + "grad_norm": 0.87109375, + "learning_rate": 0.00019403356198299752, + "loss": 1.2198, + "step": 6415 + }, + { + "epoch": 0.164744889034393, + "grad_norm": 0.93359375, + "learning_rate": 0.000194032042952155, + "loss": 1.0888, + "step": 6416 + }, + { + "epoch": 0.16477056623031483, + "grad_norm": 0.9609375, + "learning_rate": 0.00019403052373391516, + "loss": 1.1721, + "step": 6417 + }, + { + "epoch": 0.16479624342623664, + "grad_norm": 0.8515625, + "learning_rate": 0.00019402900432828092, + "loss": 0.9075, + "step": 6418 + }, + { + "epoch": 0.16482192062215845, + "grad_norm": 1.0, + "learning_rate": 0.0001940274847352554, + "loss": 1.1783, + "step": 6419 + }, + { + "epoch": 0.16484759781808028, + "grad_norm": 0.91796875, + "learning_rate": 0.0001940259649548416, + "loss": 1.0849, + "step": 6420 + }, + { + "epoch": 0.1648732750140021, + "grad_norm": 0.8671875, + "learning_rate": 0.00019402444498704252, + "loss": 1.1229, + "step": 6421 + }, + { + "epoch": 0.16489895220992393, + "grad_norm": 0.78125, + "learning_rate": 0.00019402292483186123, + "loss": 1.1491, + "step": 6422 + }, + { + "epoch": 0.16492462940584574, + "grad_norm": 0.84765625, + "learning_rate": 0.00019402140448930078, + "loss": 1.0217, + "step": 6423 + }, + { + "epoch": 0.16495030660176754, + "grad_norm": 0.89453125, + "learning_rate": 0.0001940198839593641, + "loss": 1.1031, + "step": 6424 + }, + { + "epoch": 0.16497598379768938, + "grad_norm": 0.85546875, + "learning_rate": 0.00019401836324205434, + "loss": 1.2159, + "step": 6425 + }, + { + "epoch": 0.1650016609936112, + "grad_norm": 0.91796875, + "learning_rate": 0.00019401684233737445, + "loss": 0.9826, + "step": 6426 + }, + { + "epoch": 0.16502733818953302, + "grad_norm": 0.9140625, + "learning_rate": 0.00019401532124532748, + "loss": 1.0865, + "step": 6427 + }, + { + "epoch": 0.16505301538545483, + "grad_norm": 0.8515625, + "learning_rate": 0.00019401379996591647, + "loss": 1.1564, + "step": 6428 + }, + { + "epoch": 0.16507869258137664, + "grad_norm": 0.81640625, + "learning_rate": 0.00019401227849914445, + "loss": 1.0156, + "step": 6429 + }, + { + "epoch": 0.16510436977729848, + "grad_norm": 0.85546875, + "learning_rate": 0.00019401075684501447, + "loss": 1.1359, + "step": 6430 + }, + { + "epoch": 0.16513004697322028, + "grad_norm": 0.8046875, + "learning_rate": 0.0001940092350035295, + "loss": 1.003, + "step": 6431 + }, + { + "epoch": 0.16515572416914212, + "grad_norm": 0.81640625, + "learning_rate": 0.00019400771297469266, + "loss": 1.1651, + "step": 6432 + }, + { + "epoch": 0.16518140136506393, + "grad_norm": 0.7890625, + "learning_rate": 0.0001940061907585069, + "loss": 1.1641, + "step": 6433 + }, + { + "epoch": 0.16520707856098574, + "grad_norm": 0.88671875, + "learning_rate": 0.00019400466835497532, + "loss": 1.2577, + "step": 6434 + }, + { + "epoch": 0.16523275575690757, + "grad_norm": 1.21875, + "learning_rate": 0.00019400314576410095, + "loss": 0.9743, + "step": 6435 + }, + { + "epoch": 0.16525843295282938, + "grad_norm": 0.80859375, + "learning_rate": 0.00019400162298588674, + "loss": 1.1442, + "step": 6436 + }, + { + "epoch": 0.16528411014875122, + "grad_norm": 0.8515625, + "learning_rate": 0.00019400010002033584, + "loss": 1.1145, + "step": 6437 + }, + { + "epoch": 0.16530978734467303, + "grad_norm": 0.84375, + "learning_rate": 0.0001939985768674512, + "loss": 1.029, + "step": 6438 + }, + { + "epoch": 0.16533546454059483, + "grad_norm": 0.79296875, + "learning_rate": 0.00019399705352723593, + "loss": 0.9281, + "step": 6439 + }, + { + "epoch": 0.16536114173651667, + "grad_norm": 0.86328125, + "learning_rate": 0.000193995529999693, + "loss": 1.0946, + "step": 6440 + }, + { + "epoch": 0.16538681893243848, + "grad_norm": 0.875, + "learning_rate": 0.00019399400628482543, + "loss": 1.0941, + "step": 6441 + }, + { + "epoch": 0.16541249612836031, + "grad_norm": 0.8203125, + "learning_rate": 0.00019399248238263634, + "loss": 1.0129, + "step": 6442 + }, + { + "epoch": 0.16543817332428212, + "grad_norm": 0.7890625, + "learning_rate": 0.0001939909582931287, + "loss": 1.2948, + "step": 6443 + }, + { + "epoch": 0.16546385052020393, + "grad_norm": 0.84375, + "learning_rate": 0.00019398943401630558, + "loss": 1.1079, + "step": 6444 + }, + { + "epoch": 0.16548952771612577, + "grad_norm": 0.84375, + "learning_rate": 0.00019398790955217003, + "loss": 0.9616, + "step": 6445 + }, + { + "epoch": 0.16551520491204758, + "grad_norm": 0.859375, + "learning_rate": 0.00019398638490072503, + "loss": 0.966, + "step": 6446 + }, + { + "epoch": 0.1655408821079694, + "grad_norm": 0.859375, + "learning_rate": 0.00019398486006197367, + "loss": 1.0149, + "step": 6447 + }, + { + "epoch": 0.16556655930389122, + "grad_norm": 0.84765625, + "learning_rate": 0.000193983335035919, + "loss": 1.0742, + "step": 6448 + }, + { + "epoch": 0.16559223649981303, + "grad_norm": 0.90625, + "learning_rate": 0.00019398180982256399, + "loss": 1.0885, + "step": 6449 + }, + { + "epoch": 0.16561791369573486, + "grad_norm": 0.87109375, + "learning_rate": 0.00019398028442191173, + "loss": 1.1131, + "step": 6450 + }, + { + "epoch": 0.16564359089165667, + "grad_norm": 0.84375, + "learning_rate": 0.00019397875883396526, + "loss": 1.0122, + "step": 6451 + }, + { + "epoch": 0.1656692680875785, + "grad_norm": 0.86328125, + "learning_rate": 0.00019397723305872763, + "loss": 1.1066, + "step": 6452 + }, + { + "epoch": 0.16569494528350032, + "grad_norm": 0.99609375, + "learning_rate": 0.00019397570709620183, + "loss": 1.0553, + "step": 6453 + }, + { + "epoch": 0.16572062247942212, + "grad_norm": 0.74609375, + "learning_rate": 0.00019397418094639095, + "loss": 0.9962, + "step": 6454 + }, + { + "epoch": 0.16574629967534396, + "grad_norm": 0.80859375, + "learning_rate": 0.00019397265460929802, + "loss": 1.0074, + "step": 6455 + }, + { + "epoch": 0.16577197687126577, + "grad_norm": 0.79296875, + "learning_rate": 0.00019397112808492608, + "loss": 1.0063, + "step": 6456 + }, + { + "epoch": 0.1657976540671876, + "grad_norm": 0.76171875, + "learning_rate": 0.0001939696013732782, + "loss": 1.1077, + "step": 6457 + }, + { + "epoch": 0.1658233312631094, + "grad_norm": 0.83203125, + "learning_rate": 0.00019396807447435733, + "loss": 1.2133, + "step": 6458 + }, + { + "epoch": 0.16584900845903122, + "grad_norm": 0.7578125, + "learning_rate": 0.00019396654738816662, + "loss": 1.1652, + "step": 6459 + }, + { + "epoch": 0.16587468565495306, + "grad_norm": 0.84765625, + "learning_rate": 0.00019396502011470904, + "loss": 1.0141, + "step": 6460 + }, + { + "epoch": 0.16590036285087487, + "grad_norm": 0.859375, + "learning_rate": 0.00019396349265398769, + "loss": 1.0401, + "step": 6461 + }, + { + "epoch": 0.1659260400467967, + "grad_norm": 0.7734375, + "learning_rate": 0.00019396196500600556, + "loss": 0.9747, + "step": 6462 + }, + { + "epoch": 0.1659517172427185, + "grad_norm": 0.7890625, + "learning_rate": 0.00019396043717076573, + "loss": 0.9153, + "step": 6463 + }, + { + "epoch": 0.16597739443864032, + "grad_norm": 0.796875, + "learning_rate": 0.00019395890914827125, + "loss": 1.041, + "step": 6464 + }, + { + "epoch": 0.16600307163456215, + "grad_norm": 0.83984375, + "learning_rate": 0.00019395738093852514, + "loss": 1.1705, + "step": 6465 + }, + { + "epoch": 0.16602874883048396, + "grad_norm": 0.7890625, + "learning_rate": 0.00019395585254153045, + "loss": 0.9217, + "step": 6466 + }, + { + "epoch": 0.1660544260264058, + "grad_norm": 0.80078125, + "learning_rate": 0.00019395432395729025, + "loss": 1.0481, + "step": 6467 + }, + { + "epoch": 0.1660801032223276, + "grad_norm": 0.8671875, + "learning_rate": 0.00019395279518580756, + "loss": 1.0491, + "step": 6468 + }, + { + "epoch": 0.16610578041824942, + "grad_norm": 0.859375, + "learning_rate": 0.00019395126622708545, + "loss": 1.2177, + "step": 6469 + }, + { + "epoch": 0.16613145761417125, + "grad_norm": 0.77734375, + "learning_rate": 0.0001939497370811269, + "loss": 1.0234, + "step": 6470 + }, + { + "epoch": 0.16615713481009306, + "grad_norm": 0.8359375, + "learning_rate": 0.0001939482077479351, + "loss": 1.1803, + "step": 6471 + }, + { + "epoch": 0.1661828120060149, + "grad_norm": 0.86328125, + "learning_rate": 0.00019394667822751293, + "loss": 1.136, + "step": 6472 + }, + { + "epoch": 0.1662084892019367, + "grad_norm": 0.875, + "learning_rate": 0.00019394514851986356, + "loss": 1.1039, + "step": 6473 + }, + { + "epoch": 0.1662341663978585, + "grad_norm": 0.84375, + "learning_rate": 0.00019394361862498996, + "loss": 1.0588, + "step": 6474 + }, + { + "epoch": 0.16625984359378035, + "grad_norm": 0.8828125, + "learning_rate": 0.00019394208854289525, + "loss": 1.0235, + "step": 6475 + }, + { + "epoch": 0.16628552078970216, + "grad_norm": 0.84375, + "learning_rate": 0.00019394055827358242, + "loss": 1.1215, + "step": 6476 + }, + { + "epoch": 0.166311197985624, + "grad_norm": 0.90234375, + "learning_rate": 0.00019393902781705456, + "loss": 1.0981, + "step": 6477 + }, + { + "epoch": 0.1663368751815458, + "grad_norm": 0.8046875, + "learning_rate": 0.00019393749717331468, + "loss": 0.9029, + "step": 6478 + }, + { + "epoch": 0.1663625523774676, + "grad_norm": 0.74609375, + "learning_rate": 0.0001939359663423659, + "loss": 1.1119, + "step": 6479 + }, + { + "epoch": 0.16638822957338945, + "grad_norm": 0.90625, + "learning_rate": 0.00019393443532421116, + "loss": 1.1811, + "step": 6480 + }, + { + "epoch": 0.16641390676931125, + "grad_norm": 0.8046875, + "learning_rate": 0.0001939329041188536, + "loss": 1.2285, + "step": 6481 + }, + { + "epoch": 0.1664395839652331, + "grad_norm": 0.86328125, + "learning_rate": 0.00019393137272629628, + "loss": 0.9814, + "step": 6482 + }, + { + "epoch": 0.1664652611611549, + "grad_norm": 0.87109375, + "learning_rate": 0.00019392984114654217, + "loss": 1.1091, + "step": 6483 + }, + { + "epoch": 0.1664909383570767, + "grad_norm": 0.8359375, + "learning_rate": 0.0001939283093795944, + "loss": 1.1673, + "step": 6484 + }, + { + "epoch": 0.16651661555299854, + "grad_norm": 0.9296875, + "learning_rate": 0.000193926777425456, + "loss": 0.9268, + "step": 6485 + }, + { + "epoch": 0.16654229274892035, + "grad_norm": 0.88671875, + "learning_rate": 0.00019392524528413, + "loss": 1.3653, + "step": 6486 + }, + { + "epoch": 0.1665679699448422, + "grad_norm": 0.80859375, + "learning_rate": 0.00019392371295561946, + "loss": 1.0497, + "step": 6487 + }, + { + "epoch": 0.166593647140764, + "grad_norm": 0.76953125, + "learning_rate": 0.00019392218043992748, + "loss": 1.1183, + "step": 6488 + }, + { + "epoch": 0.1666193243366858, + "grad_norm": 0.984375, + "learning_rate": 0.00019392064773705704, + "loss": 1.0809, + "step": 6489 + }, + { + "epoch": 0.16664500153260764, + "grad_norm": 0.8828125, + "learning_rate": 0.00019391911484701124, + "loss": 1.1659, + "step": 6490 + }, + { + "epoch": 0.16667067872852945, + "grad_norm": 0.8828125, + "learning_rate": 0.00019391758176979317, + "loss": 1.1248, + "step": 6491 + }, + { + "epoch": 0.16669635592445128, + "grad_norm": 1.0078125, + "learning_rate": 0.00019391604850540578, + "loss": 1.0933, + "step": 6492 + }, + { + "epoch": 0.1667220331203731, + "grad_norm": 0.97265625, + "learning_rate": 0.00019391451505385224, + "loss": 1.088, + "step": 6493 + }, + { + "epoch": 0.1667477103162949, + "grad_norm": 0.77734375, + "learning_rate": 0.00019391298141513552, + "loss": 1.1473, + "step": 6494 + }, + { + "epoch": 0.16677338751221674, + "grad_norm": 0.921875, + "learning_rate": 0.00019391144758925871, + "loss": 1.0634, + "step": 6495 + }, + { + "epoch": 0.16679906470813854, + "grad_norm": 0.77734375, + "learning_rate": 0.0001939099135762249, + "loss": 0.9573, + "step": 6496 + }, + { + "epoch": 0.16682474190406038, + "grad_norm": 0.89453125, + "learning_rate": 0.0001939083793760371, + "loss": 1.1798, + "step": 6497 + }, + { + "epoch": 0.1668504190999822, + "grad_norm": 0.7890625, + "learning_rate": 0.0001939068449886984, + "loss": 1.1331, + "step": 6498 + }, + { + "epoch": 0.166876096295904, + "grad_norm": 0.8203125, + "learning_rate": 0.0001939053104142118, + "loss": 1.0612, + "step": 6499 + }, + { + "epoch": 0.16690177349182583, + "grad_norm": 0.79296875, + "learning_rate": 0.00019390377565258043, + "loss": 1.091, + "step": 6500 + }, + { + "epoch": 0.16692745068774764, + "grad_norm": 0.79296875, + "learning_rate": 0.0001939022407038073, + "loss": 1.0298, + "step": 6501 + }, + { + "epoch": 0.16695312788366948, + "grad_norm": 0.81640625, + "learning_rate": 0.0001939007055678955, + "loss": 0.9919, + "step": 6502 + }, + { + "epoch": 0.16697880507959129, + "grad_norm": 0.83984375, + "learning_rate": 0.00019389917024484807, + "loss": 1.0407, + "step": 6503 + }, + { + "epoch": 0.1670044822755131, + "grad_norm": 0.80078125, + "learning_rate": 0.00019389763473466807, + "loss": 1.1524, + "step": 6504 + }, + { + "epoch": 0.16703015947143493, + "grad_norm": 0.83984375, + "learning_rate": 0.00019389609903735857, + "loss": 1.238, + "step": 6505 + }, + { + "epoch": 0.16705583666735674, + "grad_norm": 0.85546875, + "learning_rate": 0.00019389456315292263, + "loss": 0.951, + "step": 6506 + }, + { + "epoch": 0.16708151386327855, + "grad_norm": 0.890625, + "learning_rate": 0.0001938930270813633, + "loss": 0.9859, + "step": 6507 + }, + { + "epoch": 0.16710719105920038, + "grad_norm": 0.82421875, + "learning_rate": 0.00019389149082268365, + "loss": 1.0221, + "step": 6508 + }, + { + "epoch": 0.1671328682551222, + "grad_norm": 0.7890625, + "learning_rate": 0.00019388995437688675, + "loss": 1.0569, + "step": 6509 + }, + { + "epoch": 0.16715854545104403, + "grad_norm": 0.82421875, + "learning_rate": 0.00019388841774397567, + "loss": 1.004, + "step": 6510 + }, + { + "epoch": 0.16718422264696584, + "grad_norm": 0.80078125, + "learning_rate": 0.0001938868809239534, + "loss": 1.2186, + "step": 6511 + }, + { + "epoch": 0.16720989984288764, + "grad_norm": 0.80078125, + "learning_rate": 0.0001938853439168231, + "loss": 0.991, + "step": 6512 + }, + { + "epoch": 0.16723557703880948, + "grad_norm": 0.81640625, + "learning_rate": 0.0001938838067225878, + "loss": 1.1458, + "step": 6513 + }, + { + "epoch": 0.1672612542347313, + "grad_norm": 0.8046875, + "learning_rate": 0.0001938822693412505, + "loss": 1.035, + "step": 6514 + }, + { + "epoch": 0.16728693143065312, + "grad_norm": 0.796875, + "learning_rate": 0.00019388073177281436, + "loss": 0.9739, + "step": 6515 + }, + { + "epoch": 0.16731260862657493, + "grad_norm": 0.8671875, + "learning_rate": 0.00019387919401728237, + "loss": 1.2579, + "step": 6516 + }, + { + "epoch": 0.16733828582249674, + "grad_norm": 0.7421875, + "learning_rate": 0.00019387765607465767, + "loss": 1.0661, + "step": 6517 + }, + { + "epoch": 0.16736396301841858, + "grad_norm": 0.875, + "learning_rate": 0.00019387611794494325, + "loss": 1.0749, + "step": 6518 + }, + { + "epoch": 0.16738964021434038, + "grad_norm": 0.77734375, + "learning_rate": 0.00019387457962814222, + "loss": 1.2006, + "step": 6519 + }, + { + "epoch": 0.16741531741026222, + "grad_norm": 0.8046875, + "learning_rate": 0.00019387304112425762, + "loss": 1.2715, + "step": 6520 + }, + { + "epoch": 0.16744099460618403, + "grad_norm": 0.79296875, + "learning_rate": 0.00019387150243329254, + "loss": 1.1168, + "step": 6521 + }, + { + "epoch": 0.16746667180210584, + "grad_norm": 0.78515625, + "learning_rate": 0.00019386996355525002, + "loss": 1.0753, + "step": 6522 + }, + { + "epoch": 0.16749234899802767, + "grad_norm": 0.7890625, + "learning_rate": 0.00019386842449013316, + "loss": 1.1082, + "step": 6523 + }, + { + "epoch": 0.16751802619394948, + "grad_norm": 0.89453125, + "learning_rate": 0.00019386688523794502, + "loss": 1.2608, + "step": 6524 + }, + { + "epoch": 0.16754370338987132, + "grad_norm": 0.7890625, + "learning_rate": 0.00019386534579868864, + "loss": 0.9255, + "step": 6525 + }, + { + "epoch": 0.16756938058579313, + "grad_norm": 0.796875, + "learning_rate": 0.00019386380617236707, + "loss": 1.0905, + "step": 6526 + }, + { + "epoch": 0.16759505778171493, + "grad_norm": 0.75390625, + "learning_rate": 0.00019386226635898345, + "loss": 0.9231, + "step": 6527 + }, + { + "epoch": 0.16762073497763677, + "grad_norm": 0.77734375, + "learning_rate": 0.0001938607263585408, + "loss": 1.0436, + "step": 6528 + }, + { + "epoch": 0.16764641217355858, + "grad_norm": 0.80078125, + "learning_rate": 0.00019385918617104223, + "loss": 1.0361, + "step": 6529 + }, + { + "epoch": 0.16767208936948041, + "grad_norm": 0.76171875, + "learning_rate": 0.00019385764579649074, + "loss": 1.1418, + "step": 6530 + }, + { + "epoch": 0.16769776656540222, + "grad_norm": 0.921875, + "learning_rate": 0.00019385610523488947, + "loss": 1.2061, + "step": 6531 + }, + { + "epoch": 0.16772344376132403, + "grad_norm": 0.8671875, + "learning_rate": 0.00019385456448624147, + "loss": 1.1858, + "step": 6532 + }, + { + "epoch": 0.16774912095724587, + "grad_norm": 0.83203125, + "learning_rate": 0.00019385302355054976, + "loss": 1.1093, + "step": 6533 + }, + { + "epoch": 0.16777479815316768, + "grad_norm": 0.93359375, + "learning_rate": 0.00019385148242781749, + "loss": 1.1656, + "step": 6534 + }, + { + "epoch": 0.1678004753490895, + "grad_norm": 0.796875, + "learning_rate": 0.00019384994111804765, + "loss": 0.9884, + "step": 6535 + }, + { + "epoch": 0.16782615254501132, + "grad_norm": 0.80078125, + "learning_rate": 0.00019384839962124337, + "loss": 0.9514, + "step": 6536 + }, + { + "epoch": 0.16785182974093313, + "grad_norm": 0.91796875, + "learning_rate": 0.00019384685793740774, + "loss": 1.347, + "step": 6537 + }, + { + "epoch": 0.16787750693685496, + "grad_norm": 0.78125, + "learning_rate": 0.00019384531606654376, + "loss": 1.0769, + "step": 6538 + }, + { + "epoch": 0.16790318413277677, + "grad_norm": 0.8671875, + "learning_rate": 0.00019384377400865455, + "loss": 0.9756, + "step": 6539 + }, + { + "epoch": 0.1679288613286986, + "grad_norm": 0.8125, + "learning_rate": 0.0001938422317637432, + "loss": 1.0467, + "step": 6540 + }, + { + "epoch": 0.16795453852462042, + "grad_norm": 0.79296875, + "learning_rate": 0.00019384068933181274, + "loss": 0.8997, + "step": 6541 + }, + { + "epoch": 0.16798021572054223, + "grad_norm": 0.796875, + "learning_rate": 0.00019383914671286626, + "loss": 1.0795, + "step": 6542 + }, + { + "epoch": 0.16800589291646406, + "grad_norm": 0.7890625, + "learning_rate": 0.00019383760390690683, + "loss": 1.052, + "step": 6543 + }, + { + "epoch": 0.16803157011238587, + "grad_norm": 0.78125, + "learning_rate": 0.00019383606091393757, + "loss": 1.0528, + "step": 6544 + }, + { + "epoch": 0.1680572473083077, + "grad_norm": 0.79296875, + "learning_rate": 0.00019383451773396146, + "loss": 1.0503, + "step": 6545 + }, + { + "epoch": 0.16808292450422951, + "grad_norm": 0.83203125, + "learning_rate": 0.0001938329743669817, + "loss": 0.9549, + "step": 6546 + }, + { + "epoch": 0.16810860170015132, + "grad_norm": 0.7578125, + "learning_rate": 0.00019383143081300123, + "loss": 1.088, + "step": 6547 + }, + { + "epoch": 0.16813427889607316, + "grad_norm": 0.8828125, + "learning_rate": 0.00019382988707202322, + "loss": 1.0002, + "step": 6548 + }, + { + "epoch": 0.16815995609199497, + "grad_norm": 0.953125, + "learning_rate": 0.00019382834314405075, + "loss": 1.1272, + "step": 6549 + }, + { + "epoch": 0.1681856332879168, + "grad_norm": 0.796875, + "learning_rate": 0.00019382679902908682, + "loss": 1.058, + "step": 6550 + }, + { + "epoch": 0.1682113104838386, + "grad_norm": 0.828125, + "learning_rate": 0.0001938252547271346, + "loss": 1.0989, + "step": 6551 + }, + { + "epoch": 0.16823698767976042, + "grad_norm": 0.84765625, + "learning_rate": 0.0001938237102381971, + "loss": 1.1215, + "step": 6552 + }, + { + "epoch": 0.16826266487568226, + "grad_norm": 0.79296875, + "learning_rate": 0.0001938221655622774, + "loss": 1.0972, + "step": 6553 + }, + { + "epoch": 0.16828834207160406, + "grad_norm": 0.8203125, + "learning_rate": 0.00019382062069937862, + "loss": 0.9396, + "step": 6554 + }, + { + "epoch": 0.1683140192675259, + "grad_norm": 0.890625, + "learning_rate": 0.00019381907564950383, + "loss": 1.1721, + "step": 6555 + }, + { + "epoch": 0.1683396964634477, + "grad_norm": 0.8046875, + "learning_rate": 0.00019381753041265606, + "loss": 1.1349, + "step": 6556 + }, + { + "epoch": 0.16836537365936952, + "grad_norm": 0.8671875, + "learning_rate": 0.00019381598498883847, + "loss": 1.0598, + "step": 6557 + }, + { + "epoch": 0.16839105085529135, + "grad_norm": 1.4140625, + "learning_rate": 0.00019381443937805407, + "loss": 1.0701, + "step": 6558 + }, + { + "epoch": 0.16841672805121316, + "grad_norm": 0.83203125, + "learning_rate": 0.000193812893580306, + "loss": 0.9166, + "step": 6559 + }, + { + "epoch": 0.168442405247135, + "grad_norm": 0.8515625, + "learning_rate": 0.00019381134759559725, + "loss": 1.0586, + "step": 6560 + }, + { + "epoch": 0.1684680824430568, + "grad_norm": 0.9296875, + "learning_rate": 0.000193809801423931, + "loss": 1.1598, + "step": 6561 + }, + { + "epoch": 0.1684937596389786, + "grad_norm": 0.82421875, + "learning_rate": 0.00019380825506531027, + "loss": 1.0685, + "step": 6562 + }, + { + "epoch": 0.16851943683490045, + "grad_norm": 0.828125, + "learning_rate": 0.00019380670851973817, + "loss": 0.9097, + "step": 6563 + }, + { + "epoch": 0.16854511403082226, + "grad_norm": 0.8828125, + "learning_rate": 0.00019380516178721778, + "loss": 1.097, + "step": 6564 + }, + { + "epoch": 0.1685707912267441, + "grad_norm": 0.85546875, + "learning_rate": 0.00019380361486775216, + "loss": 1.0805, + "step": 6565 + }, + { + "epoch": 0.1685964684226659, + "grad_norm": 0.87890625, + "learning_rate": 0.0001938020677613444, + "loss": 1.233, + "step": 6566 + }, + { + "epoch": 0.1686221456185877, + "grad_norm": 0.8046875, + "learning_rate": 0.0001938005204679976, + "loss": 1.1098, + "step": 6567 + }, + { + "epoch": 0.16864782281450955, + "grad_norm": 0.84765625, + "learning_rate": 0.00019379897298771488, + "loss": 1.0828, + "step": 6568 + }, + { + "epoch": 0.16867350001043135, + "grad_norm": 0.82421875, + "learning_rate": 0.00019379742532049923, + "loss": 1.1666, + "step": 6569 + }, + { + "epoch": 0.1686991772063532, + "grad_norm": 0.90625, + "learning_rate": 0.0001937958774663538, + "loss": 1.0104, + "step": 6570 + }, + { + "epoch": 0.168724854402275, + "grad_norm": 0.828125, + "learning_rate": 0.00019379432942528166, + "loss": 1.076, + "step": 6571 + }, + { + "epoch": 0.1687505315981968, + "grad_norm": 0.90625, + "learning_rate": 0.0001937927811972859, + "loss": 1.0833, + "step": 6572 + }, + { + "epoch": 0.16877620879411864, + "grad_norm": 0.86328125, + "learning_rate": 0.0001937912327823696, + "loss": 1.0496, + "step": 6573 + }, + { + "epoch": 0.16880188599004045, + "grad_norm": 0.91796875, + "learning_rate": 0.00019378968418053583, + "loss": 1.0618, + "step": 6574 + }, + { + "epoch": 0.1688275631859623, + "grad_norm": 1.078125, + "learning_rate": 0.00019378813539178767, + "loss": 1.1769, + "step": 6575 + }, + { + "epoch": 0.1688532403818841, + "grad_norm": 0.83984375, + "learning_rate": 0.00019378658641612827, + "loss": 1.0452, + "step": 6576 + }, + { + "epoch": 0.1688789175778059, + "grad_norm": 0.84765625, + "learning_rate": 0.00019378503725356064, + "loss": 1.1125, + "step": 6577 + }, + { + "epoch": 0.16890459477372774, + "grad_norm": 0.77734375, + "learning_rate": 0.00019378348790408792, + "loss": 1.0271, + "step": 6578 + }, + { + "epoch": 0.16893027196964955, + "grad_norm": 0.97265625, + "learning_rate": 0.00019378193836771322, + "loss": 1.1298, + "step": 6579 + }, + { + "epoch": 0.16895594916557138, + "grad_norm": 0.83984375, + "learning_rate": 0.00019378038864443955, + "loss": 1.0857, + "step": 6580 + }, + { + "epoch": 0.1689816263614932, + "grad_norm": 0.8359375, + "learning_rate": 0.00019377883873427003, + "loss": 1.1192, + "step": 6581 + }, + { + "epoch": 0.169007303557415, + "grad_norm": 0.8359375, + "learning_rate": 0.00019377728863720776, + "loss": 1.0918, + "step": 6582 + }, + { + "epoch": 0.16903298075333684, + "grad_norm": 0.98046875, + "learning_rate": 0.00019377573835325582, + "loss": 1.0605, + "step": 6583 + }, + { + "epoch": 0.16905865794925864, + "grad_norm": 1.0390625, + "learning_rate": 0.00019377418788241733, + "loss": 0.935, + "step": 6584 + }, + { + "epoch": 0.16908433514518048, + "grad_norm": 0.8125, + "learning_rate": 0.00019377263722469533, + "loss": 1.0552, + "step": 6585 + }, + { + "epoch": 0.1691100123411023, + "grad_norm": 0.953125, + "learning_rate": 0.00019377108638009294, + "loss": 1.0483, + "step": 6586 + }, + { + "epoch": 0.1691356895370241, + "grad_norm": 0.88671875, + "learning_rate": 0.00019376953534861325, + "loss": 0.9702, + "step": 6587 + }, + { + "epoch": 0.16916136673294593, + "grad_norm": 0.79296875, + "learning_rate": 0.00019376798413025934, + "loss": 1.186, + "step": 6588 + }, + { + "epoch": 0.16918704392886774, + "grad_norm": 0.86328125, + "learning_rate": 0.00019376643272503433, + "loss": 1.059, + "step": 6589 + }, + { + "epoch": 0.16921272112478958, + "grad_norm": 0.8359375, + "learning_rate": 0.00019376488113294124, + "loss": 1.1336, + "step": 6590 + }, + { + "epoch": 0.1692383983207114, + "grad_norm": 0.8203125, + "learning_rate": 0.00019376332935398325, + "loss": 1.0705, + "step": 6591 + }, + { + "epoch": 0.1692640755166332, + "grad_norm": 0.80859375, + "learning_rate": 0.0001937617773881634, + "loss": 1.0757, + "step": 6592 + }, + { + "epoch": 0.16928975271255503, + "grad_norm": 1.453125, + "learning_rate": 0.0001937602252354848, + "loss": 1.1859, + "step": 6593 + }, + { + "epoch": 0.16931542990847684, + "grad_norm": 0.82421875, + "learning_rate": 0.00019375867289595053, + "loss": 1.1671, + "step": 6594 + }, + { + "epoch": 0.16934110710439867, + "grad_norm": 0.9375, + "learning_rate": 0.00019375712036956373, + "loss": 0.9687, + "step": 6595 + }, + { + "epoch": 0.16936678430032048, + "grad_norm": 0.82421875, + "learning_rate": 0.00019375556765632742, + "loss": 1.1042, + "step": 6596 + }, + { + "epoch": 0.1693924614962423, + "grad_norm": 0.8515625, + "learning_rate": 0.00019375401475624476, + "loss": 1.1346, + "step": 6597 + }, + { + "epoch": 0.16941813869216413, + "grad_norm": 0.8203125, + "learning_rate": 0.0001937524616693188, + "loss": 1.1375, + "step": 6598 + }, + { + "epoch": 0.16944381588808594, + "grad_norm": 0.93359375, + "learning_rate": 0.00019375090839555265, + "loss": 1.0177, + "step": 6599 + }, + { + "epoch": 0.16946949308400777, + "grad_norm": 0.84765625, + "learning_rate": 0.00019374935493494944, + "loss": 1.0061, + "step": 6600 + }, + { + "epoch": 0.16949517027992958, + "grad_norm": 0.91796875, + "learning_rate": 0.0001937478012875122, + "loss": 1.0844, + "step": 6601 + }, + { + "epoch": 0.1695208474758514, + "grad_norm": 0.8125, + "learning_rate": 0.00019374624745324405, + "loss": 1.0074, + "step": 6602 + }, + { + "epoch": 0.16954652467177322, + "grad_norm": 0.91015625, + "learning_rate": 0.0001937446934321481, + "loss": 1.1918, + "step": 6603 + }, + { + "epoch": 0.16957220186769503, + "grad_norm": 0.76953125, + "learning_rate": 0.0001937431392242275, + "loss": 1.0323, + "step": 6604 + }, + { + "epoch": 0.16959787906361687, + "grad_norm": 0.84375, + "learning_rate": 0.00019374158482948523, + "loss": 0.9703, + "step": 6605 + }, + { + "epoch": 0.16962355625953868, + "grad_norm": 0.79296875, + "learning_rate": 0.00019374003024792447, + "loss": 1.048, + "step": 6606 + }, + { + "epoch": 0.16964923345546049, + "grad_norm": 0.84765625, + "learning_rate": 0.00019373847547954828, + "loss": 1.1344, + "step": 6607 + }, + { + "epoch": 0.16967491065138232, + "grad_norm": 0.73828125, + "learning_rate": 0.0001937369205243598, + "loss": 0.922, + "step": 6608 + }, + { + "epoch": 0.16970058784730413, + "grad_norm": 0.82421875, + "learning_rate": 0.0001937353653823621, + "loss": 1.1824, + "step": 6609 + }, + { + "epoch": 0.16972626504322597, + "grad_norm": 0.9921875, + "learning_rate": 0.0001937338100535583, + "loss": 0.9519, + "step": 6610 + }, + { + "epoch": 0.16975194223914777, + "grad_norm": 0.8203125, + "learning_rate": 0.00019373225453795146, + "loss": 1.0503, + "step": 6611 + }, + { + "epoch": 0.16977761943506958, + "grad_norm": 0.84765625, + "learning_rate": 0.00019373069883554472, + "loss": 1.1883, + "step": 6612 + }, + { + "epoch": 0.16980329663099142, + "grad_norm": 0.83984375, + "learning_rate": 0.00019372914294634114, + "loss": 1.1278, + "step": 6613 + }, + { + "epoch": 0.16982897382691323, + "grad_norm": 0.859375, + "learning_rate": 0.00019372758687034386, + "loss": 1.1289, + "step": 6614 + }, + { + "epoch": 0.16985465102283506, + "grad_norm": 0.8828125, + "learning_rate": 0.00019372603060755595, + "loss": 1.0247, + "step": 6615 + }, + { + "epoch": 0.16988032821875687, + "grad_norm": 0.81640625, + "learning_rate": 0.00019372447415798053, + "loss": 1.0035, + "step": 6616 + }, + { + "epoch": 0.16990600541467868, + "grad_norm": 0.94140625, + "learning_rate": 0.00019372291752162073, + "loss": 1.1155, + "step": 6617 + }, + { + "epoch": 0.16993168261060052, + "grad_norm": 0.8125, + "learning_rate": 0.00019372136069847963, + "loss": 1.0547, + "step": 6618 + }, + { + "epoch": 0.16995735980652232, + "grad_norm": 0.82421875, + "learning_rate": 0.00019371980368856028, + "loss": 1.2446, + "step": 6619 + }, + { + "epoch": 0.16998303700244416, + "grad_norm": 0.87109375, + "learning_rate": 0.00019371824649186582, + "loss": 1.0506, + "step": 6620 + }, + { + "epoch": 0.17000871419836597, + "grad_norm": 0.9296875, + "learning_rate": 0.00019371668910839939, + "loss": 1.0506, + "step": 6621 + }, + { + "epoch": 0.17003439139428778, + "grad_norm": 0.81640625, + "learning_rate": 0.00019371513153816405, + "loss": 1.2007, + "step": 6622 + }, + { + "epoch": 0.1700600685902096, + "grad_norm": 0.86328125, + "learning_rate": 0.00019371357378116292, + "loss": 1.0279, + "step": 6623 + }, + { + "epoch": 0.17008574578613142, + "grad_norm": 0.875, + "learning_rate": 0.00019371201583739912, + "loss": 1.1031, + "step": 6624 + }, + { + "epoch": 0.17011142298205326, + "grad_norm": 0.859375, + "learning_rate": 0.00019371045770687572, + "loss": 1.0328, + "step": 6625 + }, + { + "epoch": 0.17013710017797506, + "grad_norm": 0.73046875, + "learning_rate": 0.00019370889938959588, + "loss": 0.8935, + "step": 6626 + }, + { + "epoch": 0.17016277737389687, + "grad_norm": 0.84375, + "learning_rate": 0.0001937073408855626, + "loss": 1.0113, + "step": 6627 + }, + { + "epoch": 0.1701884545698187, + "grad_norm": 0.84375, + "learning_rate": 0.0001937057821947791, + "loss": 0.9002, + "step": 6628 + }, + { + "epoch": 0.17021413176574052, + "grad_norm": 0.78515625, + "learning_rate": 0.00019370422331724843, + "loss": 1.0393, + "step": 6629 + }, + { + "epoch": 0.17023980896166235, + "grad_norm": 0.8046875, + "learning_rate": 0.0001937026642529737, + "loss": 1.1059, + "step": 6630 + }, + { + "epoch": 0.17026548615758416, + "grad_norm": 0.8046875, + "learning_rate": 0.000193701105001958, + "loss": 0.9743, + "step": 6631 + }, + { + "epoch": 0.17029116335350597, + "grad_norm": 0.80078125, + "learning_rate": 0.00019369954556420452, + "loss": 0.9792, + "step": 6632 + }, + { + "epoch": 0.1703168405494278, + "grad_norm": 0.7890625, + "learning_rate": 0.0001936979859397163, + "loss": 1.1112, + "step": 6633 + }, + { + "epoch": 0.17034251774534961, + "grad_norm": 0.83203125, + "learning_rate": 0.0001936964261284964, + "loss": 0.962, + "step": 6634 + }, + { + "epoch": 0.17036819494127145, + "grad_norm": 0.8125, + "learning_rate": 0.00019369486613054804, + "loss": 0.9813, + "step": 6635 + }, + { + "epoch": 0.17039387213719326, + "grad_norm": 0.82421875, + "learning_rate": 0.00019369330594587428, + "loss": 1.1589, + "step": 6636 + }, + { + "epoch": 0.17041954933311507, + "grad_norm": 0.80078125, + "learning_rate": 0.0001936917455744782, + "loss": 1.1057, + "step": 6637 + }, + { + "epoch": 0.1704452265290369, + "grad_norm": 0.77734375, + "learning_rate": 0.00019369018501636293, + "loss": 1.0364, + "step": 6638 + }, + { + "epoch": 0.1704709037249587, + "grad_norm": 0.83984375, + "learning_rate": 0.0001936886242715316, + "loss": 1.1432, + "step": 6639 + }, + { + "epoch": 0.17049658092088055, + "grad_norm": 0.875, + "learning_rate": 0.00019368706333998728, + "loss": 1.2336, + "step": 6640 + }, + { + "epoch": 0.17052225811680236, + "grad_norm": 0.890625, + "learning_rate": 0.00019368550222173317, + "loss": 1.171, + "step": 6641 + }, + { + "epoch": 0.17054793531272416, + "grad_norm": 0.87890625, + "learning_rate": 0.00019368394091677225, + "loss": 1.0212, + "step": 6642 + }, + { + "epoch": 0.170573612508646, + "grad_norm": 0.80859375, + "learning_rate": 0.0001936823794251077, + "loss": 1.0734, + "step": 6643 + }, + { + "epoch": 0.1705992897045678, + "grad_norm": 0.76953125, + "learning_rate": 0.00019368081774674266, + "loss": 1.19, + "step": 6644 + }, + { + "epoch": 0.17062496690048964, + "grad_norm": 0.8203125, + "learning_rate": 0.0001936792558816802, + "loss": 1.1057, + "step": 6645 + }, + { + "epoch": 0.17065064409641145, + "grad_norm": 0.79296875, + "learning_rate": 0.00019367769382992346, + "loss": 1.1274, + "step": 6646 + }, + { + "epoch": 0.17067632129233326, + "grad_norm": 0.859375, + "learning_rate": 0.0001936761315914755, + "loss": 1.029, + "step": 6647 + }, + { + "epoch": 0.1707019984882551, + "grad_norm": 1.1484375, + "learning_rate": 0.00019367456916633954, + "loss": 1.0015, + "step": 6648 + }, + { + "epoch": 0.1707276756841769, + "grad_norm": 0.87890625, + "learning_rate": 0.00019367300655451857, + "loss": 1.056, + "step": 6649 + }, + { + "epoch": 0.17075335288009874, + "grad_norm": 0.85546875, + "learning_rate": 0.00019367144375601576, + "loss": 0.9782, + "step": 6650 + }, + { + "epoch": 0.17077903007602055, + "grad_norm": 0.8515625, + "learning_rate": 0.00019366988077083424, + "loss": 1.0874, + "step": 6651 + }, + { + "epoch": 0.17080470727194236, + "grad_norm": 0.8203125, + "learning_rate": 0.0001936683175989771, + "loss": 1.0458, + "step": 6652 + }, + { + "epoch": 0.1708303844678642, + "grad_norm": 0.7890625, + "learning_rate": 0.00019366675424044747, + "loss": 0.9765, + "step": 6653 + }, + { + "epoch": 0.170856061663786, + "grad_norm": 0.8828125, + "learning_rate": 0.00019366519069524847, + "loss": 0.978, + "step": 6654 + }, + { + "epoch": 0.17088173885970784, + "grad_norm": 1.109375, + "learning_rate": 0.00019366362696338318, + "loss": 1.0501, + "step": 6655 + }, + { + "epoch": 0.17090741605562965, + "grad_norm": 0.91015625, + "learning_rate": 0.00019366206304485475, + "loss": 1.1146, + "step": 6656 + }, + { + "epoch": 0.17093309325155145, + "grad_norm": 0.85546875, + "learning_rate": 0.0001936604989396663, + "loss": 1.1466, + "step": 6657 + }, + { + "epoch": 0.1709587704474733, + "grad_norm": 0.83203125, + "learning_rate": 0.00019365893464782092, + "loss": 0.9967, + "step": 6658 + }, + { + "epoch": 0.1709844476433951, + "grad_norm": 0.86328125, + "learning_rate": 0.00019365737016932176, + "loss": 1.0576, + "step": 6659 + }, + { + "epoch": 0.17101012483931693, + "grad_norm": 0.953125, + "learning_rate": 0.0001936558055041719, + "loss": 1.1051, + "step": 6660 + }, + { + "epoch": 0.17103580203523874, + "grad_norm": 0.87890625, + "learning_rate": 0.00019365424065237452, + "loss": 1.1596, + "step": 6661 + }, + { + "epoch": 0.17106147923116055, + "grad_norm": 0.87890625, + "learning_rate": 0.00019365267561393266, + "loss": 1.0675, + "step": 6662 + }, + { + "epoch": 0.1710871564270824, + "grad_norm": 0.83203125, + "learning_rate": 0.0001936511103888495, + "loss": 0.989, + "step": 6663 + }, + { + "epoch": 0.1711128336230042, + "grad_norm": 0.88671875, + "learning_rate": 0.0001936495449771281, + "loss": 1.0491, + "step": 6664 + }, + { + "epoch": 0.17113851081892603, + "grad_norm": 0.79296875, + "learning_rate": 0.00019364797937877163, + "loss": 1.0776, + "step": 6665 + }, + { + "epoch": 0.17116418801484784, + "grad_norm": 0.8046875, + "learning_rate": 0.00019364641359378323, + "loss": 1.0204, + "step": 6666 + }, + { + "epoch": 0.17118986521076965, + "grad_norm": 0.91015625, + "learning_rate": 0.00019364484762216598, + "loss": 1.1543, + "step": 6667 + }, + { + "epoch": 0.17121554240669148, + "grad_norm": 0.765625, + "learning_rate": 0.00019364328146392299, + "loss": 1.012, + "step": 6668 + }, + { + "epoch": 0.1712412196026133, + "grad_norm": 0.83203125, + "learning_rate": 0.0001936417151190574, + "loss": 1.0743, + "step": 6669 + }, + { + "epoch": 0.17126689679853513, + "grad_norm": 0.73828125, + "learning_rate": 0.00019364014858757234, + "loss": 1.0581, + "step": 6670 + }, + { + "epoch": 0.17129257399445694, + "grad_norm": 0.859375, + "learning_rate": 0.0001936385818694709, + "loss": 1.1541, + "step": 6671 + }, + { + "epoch": 0.17131825119037875, + "grad_norm": 0.859375, + "learning_rate": 0.00019363701496475624, + "loss": 1.0562, + "step": 6672 + }, + { + "epoch": 0.17134392838630058, + "grad_norm": 0.8359375, + "learning_rate": 0.00019363544787343146, + "loss": 1.2816, + "step": 6673 + }, + { + "epoch": 0.1713696055822224, + "grad_norm": 0.84375, + "learning_rate": 0.00019363388059549973, + "loss": 0.9891, + "step": 6674 + }, + { + "epoch": 0.17139528277814423, + "grad_norm": 0.75, + "learning_rate": 0.0001936323131309641, + "loss": 1.1793, + "step": 6675 + }, + { + "epoch": 0.17142095997406603, + "grad_norm": 0.86328125, + "learning_rate": 0.00019363074547982772, + "loss": 1.1919, + "step": 6676 + }, + { + "epoch": 0.17144663716998784, + "grad_norm": 0.78515625, + "learning_rate": 0.00019362917764209373, + "loss": 1.0974, + "step": 6677 + }, + { + "epoch": 0.17147231436590968, + "grad_norm": 0.7890625, + "learning_rate": 0.00019362760961776524, + "loss": 0.9887, + "step": 6678 + }, + { + "epoch": 0.1714979915618315, + "grad_norm": 0.88671875, + "learning_rate": 0.00019362604140684537, + "loss": 1.2771, + "step": 6679 + }, + { + "epoch": 0.17152366875775332, + "grad_norm": 0.86328125, + "learning_rate": 0.00019362447300933728, + "loss": 1.017, + "step": 6680 + }, + { + "epoch": 0.17154934595367513, + "grad_norm": 0.875, + "learning_rate": 0.00019362290442524407, + "loss": 1.0573, + "step": 6681 + }, + { + "epoch": 0.17157502314959694, + "grad_norm": 0.76953125, + "learning_rate": 0.00019362133565456886, + "loss": 1.0829, + "step": 6682 + }, + { + "epoch": 0.17160070034551878, + "grad_norm": 0.8515625, + "learning_rate": 0.00019361976669731478, + "loss": 1.124, + "step": 6683 + }, + { + "epoch": 0.17162637754144058, + "grad_norm": 0.80078125, + "learning_rate": 0.000193618197553485, + "loss": 1.0134, + "step": 6684 + }, + { + "epoch": 0.17165205473736242, + "grad_norm": 0.80078125, + "learning_rate": 0.00019361662822308255, + "loss": 1.0446, + "step": 6685 + }, + { + "epoch": 0.17167773193328423, + "grad_norm": 0.8671875, + "learning_rate": 0.00019361505870611065, + "loss": 1.103, + "step": 6686 + }, + { + "epoch": 0.17170340912920604, + "grad_norm": 0.92578125, + "learning_rate": 0.00019361348900257238, + "loss": 0.949, + "step": 6687 + }, + { + "epoch": 0.17172908632512787, + "grad_norm": 0.82421875, + "learning_rate": 0.00019361191911247092, + "loss": 1.0354, + "step": 6688 + }, + { + "epoch": 0.17175476352104968, + "grad_norm": 0.91796875, + "learning_rate": 0.00019361034903580933, + "loss": 1.094, + "step": 6689 + }, + { + "epoch": 0.17178044071697152, + "grad_norm": 0.8359375, + "learning_rate": 0.00019360877877259077, + "loss": 1.1202, + "step": 6690 + }, + { + "epoch": 0.17180611791289332, + "grad_norm": 0.828125, + "learning_rate": 0.00019360720832281838, + "loss": 1.0428, + "step": 6691 + }, + { + "epoch": 0.17183179510881513, + "grad_norm": 0.81640625, + "learning_rate": 0.00019360563768649526, + "loss": 0.9898, + "step": 6692 + }, + { + "epoch": 0.17185747230473697, + "grad_norm": 0.8125, + "learning_rate": 0.00019360406686362458, + "loss": 1.1242, + "step": 6693 + }, + { + "epoch": 0.17188314950065878, + "grad_norm": 0.859375, + "learning_rate": 0.00019360249585420944, + "loss": 1.1775, + "step": 6694 + }, + { + "epoch": 0.1719088266965806, + "grad_norm": 0.75, + "learning_rate": 0.000193600924658253, + "loss": 0.9785, + "step": 6695 + }, + { + "epoch": 0.17193450389250242, + "grad_norm": 0.92578125, + "learning_rate": 0.00019359935327575834, + "loss": 1.198, + "step": 6696 + }, + { + "epoch": 0.17196018108842423, + "grad_norm": 0.81640625, + "learning_rate": 0.00019359778170672864, + "loss": 1.1357, + "step": 6697 + }, + { + "epoch": 0.17198585828434607, + "grad_norm": 0.828125, + "learning_rate": 0.000193596209951167, + "loss": 1.1356, + "step": 6698 + }, + { + "epoch": 0.17201153548026787, + "grad_norm": 0.8125, + "learning_rate": 0.0001935946380090766, + "loss": 1.0544, + "step": 6699 + }, + { + "epoch": 0.1720372126761897, + "grad_norm": 0.89453125, + "learning_rate": 0.00019359306588046053, + "loss": 1.0363, + "step": 6700 + }, + { + "epoch": 0.17206288987211152, + "grad_norm": 0.83984375, + "learning_rate": 0.00019359149356532192, + "loss": 1.219, + "step": 6701 + }, + { + "epoch": 0.17208856706803333, + "grad_norm": 0.8125, + "learning_rate": 0.00019358992106366393, + "loss": 1.1402, + "step": 6702 + }, + { + "epoch": 0.17211424426395516, + "grad_norm": 0.8359375, + "learning_rate": 0.00019358834837548966, + "loss": 1.1689, + "step": 6703 + }, + { + "epoch": 0.17213992145987697, + "grad_norm": 0.87890625, + "learning_rate": 0.00019358677550080231, + "loss": 1.1342, + "step": 6704 + }, + { + "epoch": 0.1721655986557988, + "grad_norm": 0.8515625, + "learning_rate": 0.00019358520243960495, + "loss": 1.2576, + "step": 6705 + }, + { + "epoch": 0.17219127585172062, + "grad_norm": 0.82421875, + "learning_rate": 0.00019358362919190072, + "loss": 1.1264, + "step": 6706 + }, + { + "epoch": 0.17221695304764242, + "grad_norm": 0.78515625, + "learning_rate": 0.00019358205575769277, + "loss": 0.9777, + "step": 6707 + }, + { + "epoch": 0.17224263024356426, + "grad_norm": 0.76171875, + "learning_rate": 0.00019358048213698425, + "loss": 1.1237, + "step": 6708 + }, + { + "epoch": 0.17226830743948607, + "grad_norm": 0.765625, + "learning_rate": 0.00019357890832977828, + "loss": 0.9438, + "step": 6709 + }, + { + "epoch": 0.1722939846354079, + "grad_norm": 0.80859375, + "learning_rate": 0.00019357733433607798, + "loss": 1.0556, + "step": 6710 + }, + { + "epoch": 0.1723196618313297, + "grad_norm": 0.77734375, + "learning_rate": 0.0001935757601558865, + "loss": 0.951, + "step": 6711 + }, + { + "epoch": 0.17234533902725152, + "grad_norm": 0.8203125, + "learning_rate": 0.00019357418578920703, + "loss": 1.1262, + "step": 6712 + }, + { + "epoch": 0.17237101622317336, + "grad_norm": 0.81640625, + "learning_rate": 0.0001935726112360426, + "loss": 1.0621, + "step": 6713 + }, + { + "epoch": 0.17239669341909516, + "grad_norm": 0.8046875, + "learning_rate": 0.00019357103649639647, + "loss": 0.9973, + "step": 6714 + }, + { + "epoch": 0.17242237061501697, + "grad_norm": 0.8203125, + "learning_rate": 0.00019356946157027168, + "loss": 1.1047, + "step": 6715 + }, + { + "epoch": 0.1724480478109388, + "grad_norm": 0.83984375, + "learning_rate": 0.00019356788645767144, + "loss": 1.1604, + "step": 6716 + }, + { + "epoch": 0.17247372500686062, + "grad_norm": 0.84375, + "learning_rate": 0.0001935663111585988, + "loss": 0.9497, + "step": 6717 + }, + { + "epoch": 0.17249940220278245, + "grad_norm": 0.83984375, + "learning_rate": 0.000193564735673057, + "loss": 1.2195, + "step": 6718 + }, + { + "epoch": 0.17252507939870426, + "grad_norm": 0.74609375, + "learning_rate": 0.00019356316000104912, + "loss": 1.0018, + "step": 6719 + }, + { + "epoch": 0.17255075659462607, + "grad_norm": 1.078125, + "learning_rate": 0.0001935615841425783, + "loss": 1.1726, + "step": 6720 + }, + { + "epoch": 0.1725764337905479, + "grad_norm": 0.80859375, + "learning_rate": 0.00019356000809764773, + "loss": 1.0026, + "step": 6721 + }, + { + "epoch": 0.17260211098646971, + "grad_norm": 0.91015625, + "learning_rate": 0.00019355843186626045, + "loss": 1.1006, + "step": 6722 + }, + { + "epoch": 0.17262778818239155, + "grad_norm": 0.8515625, + "learning_rate": 0.00019355685544841973, + "loss": 1.0561, + "step": 6723 + }, + { + "epoch": 0.17265346537831336, + "grad_norm": 0.83984375, + "learning_rate": 0.0001935552788441286, + "loss": 1.0954, + "step": 6724 + }, + { + "epoch": 0.17267914257423517, + "grad_norm": 0.81640625, + "learning_rate": 0.00019355370205339028, + "loss": 1.0302, + "step": 6725 + }, + { + "epoch": 0.172704819770157, + "grad_norm": 0.89453125, + "learning_rate": 0.00019355212507620787, + "loss": 1.0387, + "step": 6726 + }, + { + "epoch": 0.1727304969660788, + "grad_norm": 0.875, + "learning_rate": 0.00019355054791258454, + "loss": 1.0704, + "step": 6727 + }, + { + "epoch": 0.17275617416200065, + "grad_norm": 0.83203125, + "learning_rate": 0.0001935489705625234, + "loss": 1.3437, + "step": 6728 + }, + { + "epoch": 0.17278185135792246, + "grad_norm": 0.828125, + "learning_rate": 0.00019354739302602763, + "loss": 1.1673, + "step": 6729 + }, + { + "epoch": 0.17280752855384426, + "grad_norm": 0.96484375, + "learning_rate": 0.00019354581530310033, + "loss": 1.1677, + "step": 6730 + }, + { + "epoch": 0.1728332057497661, + "grad_norm": 0.890625, + "learning_rate": 0.0001935442373937447, + "loss": 0.993, + "step": 6731 + }, + { + "epoch": 0.1728588829456879, + "grad_norm": 0.8125, + "learning_rate": 0.00019354265929796385, + "loss": 0.9717, + "step": 6732 + }, + { + "epoch": 0.17288456014160974, + "grad_norm": 0.8203125, + "learning_rate": 0.00019354108101576092, + "loss": 0.9796, + "step": 6733 + }, + { + "epoch": 0.17291023733753155, + "grad_norm": 0.8046875, + "learning_rate": 0.00019353950254713905, + "loss": 0.9648, + "step": 6734 + }, + { + "epoch": 0.17293591453345336, + "grad_norm": 0.7421875, + "learning_rate": 0.00019353792389210141, + "loss": 0.9523, + "step": 6735 + }, + { + "epoch": 0.1729615917293752, + "grad_norm": 0.81640625, + "learning_rate": 0.00019353634505065115, + "loss": 1.1512, + "step": 6736 + }, + { + "epoch": 0.172987268925297, + "grad_norm": 0.79296875, + "learning_rate": 0.0001935347660227914, + "loss": 1.0439, + "step": 6737 + }, + { + "epoch": 0.17301294612121884, + "grad_norm": 0.828125, + "learning_rate": 0.0001935331868085253, + "loss": 1.105, + "step": 6738 + }, + { + "epoch": 0.17303862331714065, + "grad_norm": 0.8359375, + "learning_rate": 0.00019353160740785601, + "loss": 0.9373, + "step": 6739 + }, + { + "epoch": 0.17306430051306246, + "grad_norm": 0.79296875, + "learning_rate": 0.0001935300278207867, + "loss": 1.0477, + "step": 6740 + }, + { + "epoch": 0.1730899777089843, + "grad_norm": 0.80078125, + "learning_rate": 0.00019352844804732046, + "loss": 0.9785, + "step": 6741 + }, + { + "epoch": 0.1731156549049061, + "grad_norm": 0.77734375, + "learning_rate": 0.00019352686808746047, + "loss": 1.15, + "step": 6742 + }, + { + "epoch": 0.17314133210082794, + "grad_norm": 0.80859375, + "learning_rate": 0.00019352528794120989, + "loss": 0.9869, + "step": 6743 + }, + { + "epoch": 0.17316700929674975, + "grad_norm": 0.78515625, + "learning_rate": 0.00019352370760857186, + "loss": 0.9945, + "step": 6744 + }, + { + "epoch": 0.17319268649267155, + "grad_norm": 0.8515625, + "learning_rate": 0.00019352212708954953, + "loss": 1.0822, + "step": 6745 + }, + { + "epoch": 0.1732183636885934, + "grad_norm": 0.76171875, + "learning_rate": 0.00019352054638414607, + "loss": 0.9807, + "step": 6746 + }, + { + "epoch": 0.1732440408845152, + "grad_norm": 0.90625, + "learning_rate": 0.00019351896549236458, + "loss": 0.9452, + "step": 6747 + }, + { + "epoch": 0.17326971808043704, + "grad_norm": 0.78125, + "learning_rate": 0.00019351738441420827, + "loss": 1.1442, + "step": 6748 + }, + { + "epoch": 0.17329539527635884, + "grad_norm": 0.92578125, + "learning_rate": 0.0001935158031496802, + "loss": 1.061, + "step": 6749 + }, + { + "epoch": 0.17332107247228065, + "grad_norm": 0.84375, + "learning_rate": 0.00019351422169878367, + "loss": 0.9948, + "step": 6750 + }, + { + "epoch": 0.1733467496682025, + "grad_norm": 0.8046875, + "learning_rate": 0.00019351264006152165, + "loss": 1.1056, + "step": 6751 + }, + { + "epoch": 0.1733724268641243, + "grad_norm": 0.8203125, + "learning_rate": 0.00019351105823789744, + "loss": 0.8882, + "step": 6752 + }, + { + "epoch": 0.17339810406004613, + "grad_norm": 0.87890625, + "learning_rate": 0.00019350947622791414, + "loss": 1.165, + "step": 6753 + }, + { + "epoch": 0.17342378125596794, + "grad_norm": 0.82421875, + "learning_rate": 0.0001935078940315749, + "loss": 0.9235, + "step": 6754 + }, + { + "epoch": 0.17344945845188975, + "grad_norm": 0.85546875, + "learning_rate": 0.00019350631164888283, + "loss": 1.056, + "step": 6755 + }, + { + "epoch": 0.17347513564781158, + "grad_norm": 0.83203125, + "learning_rate": 0.00019350472907984117, + "loss": 1.0048, + "step": 6756 + }, + { + "epoch": 0.1735008128437334, + "grad_norm": 0.8828125, + "learning_rate": 0.00019350314632445302, + "loss": 0.9998, + "step": 6757 + }, + { + "epoch": 0.17352649003965523, + "grad_norm": 0.93359375, + "learning_rate": 0.00019350156338272153, + "loss": 1.1614, + "step": 6758 + }, + { + "epoch": 0.17355216723557704, + "grad_norm": 0.8671875, + "learning_rate": 0.0001934999802546499, + "loss": 1.0921, + "step": 6759 + }, + { + "epoch": 0.17357784443149885, + "grad_norm": 0.78125, + "learning_rate": 0.00019349839694024127, + "loss": 0.9653, + "step": 6760 + }, + { + "epoch": 0.17360352162742068, + "grad_norm": 0.86328125, + "learning_rate": 0.00019349681343949873, + "loss": 1.2791, + "step": 6761 + }, + { + "epoch": 0.1736291988233425, + "grad_norm": 0.81640625, + "learning_rate": 0.00019349522975242552, + "loss": 1.1495, + "step": 6762 + }, + { + "epoch": 0.17365487601926433, + "grad_norm": 0.77734375, + "learning_rate": 0.00019349364587902473, + "loss": 1.0708, + "step": 6763 + }, + { + "epoch": 0.17368055321518613, + "grad_norm": 0.921875, + "learning_rate": 0.00019349206181929954, + "loss": 1.0043, + "step": 6764 + }, + { + "epoch": 0.17370623041110794, + "grad_norm": 0.82421875, + "learning_rate": 0.00019349047757325315, + "loss": 1.0484, + "step": 6765 + }, + { + "epoch": 0.17373190760702978, + "grad_norm": 0.83203125, + "learning_rate": 0.00019348889314088868, + "loss": 0.9964, + "step": 6766 + }, + { + "epoch": 0.1737575848029516, + "grad_norm": 0.87890625, + "learning_rate": 0.0001934873085222093, + "loss": 0.9944, + "step": 6767 + }, + { + "epoch": 0.17378326199887342, + "grad_norm": 0.83203125, + "learning_rate": 0.00019348572371721815, + "loss": 1.0373, + "step": 6768 + }, + { + "epoch": 0.17380893919479523, + "grad_norm": 0.79296875, + "learning_rate": 0.0001934841387259184, + "loss": 0.9951, + "step": 6769 + }, + { + "epoch": 0.17383461639071704, + "grad_norm": 0.8046875, + "learning_rate": 0.0001934825535483132, + "loss": 1.1941, + "step": 6770 + }, + { + "epoch": 0.17386029358663888, + "grad_norm": 0.76171875, + "learning_rate": 0.00019348096818440572, + "loss": 0.9891, + "step": 6771 + }, + { + "epoch": 0.17388597078256068, + "grad_norm": 0.86328125, + "learning_rate": 0.0001934793826341991, + "loss": 1.0242, + "step": 6772 + }, + { + "epoch": 0.17391164797848252, + "grad_norm": 0.796875, + "learning_rate": 0.00019347779689769654, + "loss": 0.9416, + "step": 6773 + }, + { + "epoch": 0.17393732517440433, + "grad_norm": 0.76953125, + "learning_rate": 0.00019347621097490114, + "loss": 1.1338, + "step": 6774 + }, + { + "epoch": 0.17396300237032614, + "grad_norm": 0.76171875, + "learning_rate": 0.00019347462486581614, + "loss": 0.9457, + "step": 6775 + }, + { + "epoch": 0.17398867956624797, + "grad_norm": 0.83984375, + "learning_rate": 0.0001934730385704446, + "loss": 1.0424, + "step": 6776 + }, + { + "epoch": 0.17401435676216978, + "grad_norm": 0.8671875, + "learning_rate": 0.0001934714520887898, + "loss": 1.1899, + "step": 6777 + }, + { + "epoch": 0.17404003395809162, + "grad_norm": 0.81640625, + "learning_rate": 0.00019346986542085478, + "loss": 1.0432, + "step": 6778 + }, + { + "epoch": 0.17406571115401342, + "grad_norm": 0.87890625, + "learning_rate": 0.00019346827856664281, + "loss": 1.0451, + "step": 6779 + }, + { + "epoch": 0.17409138834993523, + "grad_norm": 0.76171875, + "learning_rate": 0.00019346669152615697, + "loss": 1.0255, + "step": 6780 + }, + { + "epoch": 0.17411706554585707, + "grad_norm": 0.88671875, + "learning_rate": 0.00019346510429940045, + "loss": 1.1141, + "step": 6781 + }, + { + "epoch": 0.17414274274177888, + "grad_norm": 0.82421875, + "learning_rate": 0.00019346351688637645, + "loss": 1.0172, + "step": 6782 + }, + { + "epoch": 0.1741684199377007, + "grad_norm": 0.828125, + "learning_rate": 0.00019346192928708811, + "loss": 1.0813, + "step": 6783 + }, + { + "epoch": 0.17419409713362252, + "grad_norm": 0.92578125, + "learning_rate": 0.00019346034150153856, + "loss": 1.1609, + "step": 6784 + }, + { + "epoch": 0.17421977432954433, + "grad_norm": 1.0546875, + "learning_rate": 0.000193458753529731, + "loss": 1.0149, + "step": 6785 + }, + { + "epoch": 0.17424545152546617, + "grad_norm": 0.83984375, + "learning_rate": 0.00019345716537166858, + "loss": 1.0983, + "step": 6786 + }, + { + "epoch": 0.17427112872138797, + "grad_norm": 0.796875, + "learning_rate": 0.00019345557702735448, + "loss": 0.9373, + "step": 6787 + }, + { + "epoch": 0.1742968059173098, + "grad_norm": 0.84765625, + "learning_rate": 0.00019345398849679186, + "loss": 1.1779, + "step": 6788 + }, + { + "epoch": 0.17432248311323162, + "grad_norm": 0.859375, + "learning_rate": 0.00019345239977998387, + "loss": 1.0711, + "step": 6789 + }, + { + "epoch": 0.17434816030915343, + "grad_norm": 0.90234375, + "learning_rate": 0.00019345081087693367, + "loss": 1.0844, + "step": 6790 + }, + { + "epoch": 0.17437383750507526, + "grad_norm": 0.91015625, + "learning_rate": 0.0001934492217876445, + "loss": 1.1015, + "step": 6791 + }, + { + "epoch": 0.17439951470099707, + "grad_norm": 0.75, + "learning_rate": 0.00019344763251211947, + "loss": 0.9054, + "step": 6792 + }, + { + "epoch": 0.1744251918969189, + "grad_norm": 0.87109375, + "learning_rate": 0.0001934460430503617, + "loss": 1.049, + "step": 6793 + }, + { + "epoch": 0.17445086909284072, + "grad_norm": 0.80859375, + "learning_rate": 0.00019344445340237443, + "loss": 1.2361, + "step": 6794 + }, + { + "epoch": 0.17447654628876252, + "grad_norm": 0.87109375, + "learning_rate": 0.0001934428635681608, + "loss": 1.1661, + "step": 6795 + }, + { + "epoch": 0.17450222348468436, + "grad_norm": 0.875, + "learning_rate": 0.00019344127354772398, + "loss": 1.0409, + "step": 6796 + }, + { + "epoch": 0.17452790068060617, + "grad_norm": 0.8671875, + "learning_rate": 0.00019343968334106716, + "loss": 1.0993, + "step": 6797 + }, + { + "epoch": 0.174553577876528, + "grad_norm": 0.85546875, + "learning_rate": 0.00019343809294819348, + "loss": 1.0362, + "step": 6798 + }, + { + "epoch": 0.1745792550724498, + "grad_norm": 0.88671875, + "learning_rate": 0.00019343650236910612, + "loss": 1.181, + "step": 6799 + }, + { + "epoch": 0.17460493226837162, + "grad_norm": 0.8046875, + "learning_rate": 0.00019343491160380826, + "loss": 1.1184, + "step": 6800 + }, + { + "epoch": 0.17463060946429346, + "grad_norm": 0.9609375, + "learning_rate": 0.00019343332065230307, + "loss": 1.1051, + "step": 6801 + }, + { + "epoch": 0.17465628666021527, + "grad_norm": 0.85546875, + "learning_rate": 0.00019343172951459368, + "loss": 1.1491, + "step": 6802 + }, + { + "epoch": 0.1746819638561371, + "grad_norm": 0.81640625, + "learning_rate": 0.0001934301381906833, + "loss": 1.0645, + "step": 6803 + }, + { + "epoch": 0.1747076410520589, + "grad_norm": 0.92578125, + "learning_rate": 0.00019342854668057512, + "loss": 1.1471, + "step": 6804 + }, + { + "epoch": 0.17473331824798072, + "grad_norm": 0.8203125, + "learning_rate": 0.00019342695498427229, + "loss": 1.0741, + "step": 6805 + }, + { + "epoch": 0.17475899544390255, + "grad_norm": 0.87109375, + "learning_rate": 0.00019342536310177795, + "loss": 1.1709, + "step": 6806 + }, + { + "epoch": 0.17478467263982436, + "grad_norm": 0.796875, + "learning_rate": 0.0001934237710330953, + "loss": 1.0157, + "step": 6807 + }, + { + "epoch": 0.1748103498357462, + "grad_norm": 0.8203125, + "learning_rate": 0.00019342217877822753, + "loss": 1.0223, + "step": 6808 + }, + { + "epoch": 0.174836027031668, + "grad_norm": 0.8515625, + "learning_rate": 0.00019342058633717779, + "loss": 1.2611, + "step": 6809 + }, + { + "epoch": 0.17486170422758981, + "grad_norm": 0.88671875, + "learning_rate": 0.00019341899370994926, + "loss": 1.1252, + "step": 6810 + }, + { + "epoch": 0.17488738142351165, + "grad_norm": 0.8046875, + "learning_rate": 0.0001934174008965451, + "loss": 0.9692, + "step": 6811 + }, + { + "epoch": 0.17491305861943346, + "grad_norm": 0.8828125, + "learning_rate": 0.0001934158078969685, + "loss": 1.0555, + "step": 6812 + }, + { + "epoch": 0.1749387358153553, + "grad_norm": 0.8671875, + "learning_rate": 0.00019341421471122264, + "loss": 1.0173, + "step": 6813 + }, + { + "epoch": 0.1749644130112771, + "grad_norm": 0.84375, + "learning_rate": 0.00019341262133931072, + "loss": 1.1623, + "step": 6814 + }, + { + "epoch": 0.1749900902071989, + "grad_norm": 0.78515625, + "learning_rate": 0.00019341102778123586, + "loss": 1.0314, + "step": 6815 + }, + { + "epoch": 0.17501576740312075, + "grad_norm": 0.95703125, + "learning_rate": 0.00019340943403700124, + "loss": 1.1775, + "step": 6816 + }, + { + "epoch": 0.17504144459904256, + "grad_norm": 0.79296875, + "learning_rate": 0.00019340784010661008, + "loss": 1.0872, + "step": 6817 + }, + { + "epoch": 0.1750671217949644, + "grad_norm": 0.859375, + "learning_rate": 0.00019340624599006553, + "loss": 1.0823, + "step": 6818 + }, + { + "epoch": 0.1750927989908862, + "grad_norm": 0.78125, + "learning_rate": 0.00019340465168737075, + "loss": 0.9142, + "step": 6819 + }, + { + "epoch": 0.175118476186808, + "grad_norm": 0.86328125, + "learning_rate": 0.00019340305719852893, + "loss": 1.0186, + "step": 6820 + }, + { + "epoch": 0.17514415338272984, + "grad_norm": 0.796875, + "learning_rate": 0.0001934014625235433, + "loss": 1.1951, + "step": 6821 + }, + { + "epoch": 0.17516983057865165, + "grad_norm": 0.8671875, + "learning_rate": 0.00019339986766241695, + "loss": 1.0925, + "step": 6822 + }, + { + "epoch": 0.1751955077745735, + "grad_norm": 0.90625, + "learning_rate": 0.00019339827261515314, + "loss": 1.0987, + "step": 6823 + }, + { + "epoch": 0.1752211849704953, + "grad_norm": 0.83984375, + "learning_rate": 0.00019339667738175497, + "loss": 1.1113, + "step": 6824 + }, + { + "epoch": 0.1752468621664171, + "grad_norm": 0.87890625, + "learning_rate": 0.00019339508196222568, + "loss": 1.0786, + "step": 6825 + }, + { + "epoch": 0.17527253936233894, + "grad_norm": 0.8125, + "learning_rate": 0.00019339348635656842, + "loss": 1.0009, + "step": 6826 + }, + { + "epoch": 0.17529821655826075, + "grad_norm": 0.87109375, + "learning_rate": 0.00019339189056478636, + "loss": 1.1608, + "step": 6827 + }, + { + "epoch": 0.17532389375418259, + "grad_norm": 0.84375, + "learning_rate": 0.0001933902945868827, + "loss": 1.0461, + "step": 6828 + }, + { + "epoch": 0.1753495709501044, + "grad_norm": 0.828125, + "learning_rate": 0.00019338869842286066, + "loss": 1.1072, + "step": 6829 + }, + { + "epoch": 0.1753752481460262, + "grad_norm": 0.81640625, + "learning_rate": 0.00019338710207272337, + "loss": 1.128, + "step": 6830 + }, + { + "epoch": 0.17540092534194804, + "grad_norm": 0.8125, + "learning_rate": 0.00019338550553647398, + "loss": 1.1523, + "step": 6831 + }, + { + "epoch": 0.17542660253786985, + "grad_norm": 0.83984375, + "learning_rate": 0.00019338390881411575, + "loss": 1.0719, + "step": 6832 + }, + { + "epoch": 0.17545227973379168, + "grad_norm": 0.82421875, + "learning_rate": 0.0001933823119056518, + "loss": 1.1472, + "step": 6833 + }, + { + "epoch": 0.1754779569297135, + "grad_norm": 0.80078125, + "learning_rate": 0.00019338071481108537, + "loss": 1.0532, + "step": 6834 + }, + { + "epoch": 0.1755036341256353, + "grad_norm": 0.76953125, + "learning_rate": 0.0001933791175304196, + "loss": 1.0777, + "step": 6835 + }, + { + "epoch": 0.17552931132155714, + "grad_norm": 0.8203125, + "learning_rate": 0.00019337752006365767, + "loss": 0.9368, + "step": 6836 + }, + { + "epoch": 0.17555498851747894, + "grad_norm": 0.828125, + "learning_rate": 0.00019337592241080277, + "loss": 1.0373, + "step": 6837 + }, + { + "epoch": 0.17558066571340078, + "grad_norm": 0.9609375, + "learning_rate": 0.00019337432457185814, + "loss": 1.2867, + "step": 6838 + }, + { + "epoch": 0.1756063429093226, + "grad_norm": 0.81640625, + "learning_rate": 0.00019337272654682686, + "loss": 1.1301, + "step": 6839 + }, + { + "epoch": 0.1756320201052444, + "grad_norm": 0.78515625, + "learning_rate": 0.0001933711283357122, + "loss": 1.1365, + "step": 6840 + }, + { + "epoch": 0.17565769730116623, + "grad_norm": 1.140625, + "learning_rate": 0.0001933695299385173, + "loss": 1.0841, + "step": 6841 + }, + { + "epoch": 0.17568337449708804, + "grad_norm": 0.78515625, + "learning_rate": 0.00019336793135524535, + "loss": 1.0111, + "step": 6842 + }, + { + "epoch": 0.17570905169300988, + "grad_norm": 0.81640625, + "learning_rate": 0.00019336633258589957, + "loss": 1.1585, + "step": 6843 + }, + { + "epoch": 0.17573472888893168, + "grad_norm": 0.80859375, + "learning_rate": 0.00019336473363048312, + "loss": 1.1928, + "step": 6844 + }, + { + "epoch": 0.1757604060848535, + "grad_norm": 0.8515625, + "learning_rate": 0.0001933631344889992, + "loss": 1.0859, + "step": 6845 + }, + { + "epoch": 0.17578608328077533, + "grad_norm": 0.9609375, + "learning_rate": 0.00019336153516145096, + "loss": 1.0583, + "step": 6846 + }, + { + "epoch": 0.17581176047669714, + "grad_norm": 0.86328125, + "learning_rate": 0.00019335993564784164, + "loss": 1.1701, + "step": 6847 + }, + { + "epoch": 0.17583743767261897, + "grad_norm": 0.75, + "learning_rate": 0.00019335833594817436, + "loss": 0.9903, + "step": 6848 + }, + { + "epoch": 0.17586311486854078, + "grad_norm": 0.85546875, + "learning_rate": 0.00019335673606245239, + "loss": 1.1605, + "step": 6849 + }, + { + "epoch": 0.1758887920644626, + "grad_norm": 0.84765625, + "learning_rate": 0.00019335513599067888, + "loss": 1.0804, + "step": 6850 + }, + { + "epoch": 0.17591446926038443, + "grad_norm": 0.83984375, + "learning_rate": 0.000193353535732857, + "loss": 1.0618, + "step": 6851 + }, + { + "epoch": 0.17594014645630623, + "grad_norm": 0.7578125, + "learning_rate": 0.00019335193528898997, + "loss": 1.1191, + "step": 6852 + }, + { + "epoch": 0.17596582365222807, + "grad_norm": 0.8125, + "learning_rate": 0.00019335033465908095, + "loss": 0.9843, + "step": 6853 + }, + { + "epoch": 0.17599150084814988, + "grad_norm": 0.7890625, + "learning_rate": 0.00019334873384313316, + "loss": 1.0119, + "step": 6854 + }, + { + "epoch": 0.1760171780440717, + "grad_norm": 0.80859375, + "learning_rate": 0.00019334713284114975, + "loss": 1.1407, + "step": 6855 + }, + { + "epoch": 0.17604285523999352, + "grad_norm": 0.81640625, + "learning_rate": 0.00019334553165313394, + "loss": 1.1278, + "step": 6856 + }, + { + "epoch": 0.17606853243591533, + "grad_norm": 0.76953125, + "learning_rate": 0.00019334393027908893, + "loss": 1.0517, + "step": 6857 + }, + { + "epoch": 0.17609420963183717, + "grad_norm": 0.79296875, + "learning_rate": 0.0001933423287190179, + "loss": 0.9945, + "step": 6858 + }, + { + "epoch": 0.17611988682775898, + "grad_norm": 0.9140625, + "learning_rate": 0.000193340726972924, + "loss": 1.1166, + "step": 6859 + }, + { + "epoch": 0.17614556402368078, + "grad_norm": 0.8359375, + "learning_rate": 0.00019333912504081052, + "loss": 1.1933, + "step": 6860 + }, + { + "epoch": 0.17617124121960262, + "grad_norm": 0.78125, + "learning_rate": 0.0001933375229226806, + "loss": 1.0141, + "step": 6861 + }, + { + "epoch": 0.17619691841552443, + "grad_norm": 0.8125, + "learning_rate": 0.00019333592061853737, + "loss": 1.0988, + "step": 6862 + }, + { + "epoch": 0.17622259561144626, + "grad_norm": 0.828125, + "learning_rate": 0.00019333431812838413, + "loss": 1.0464, + "step": 6863 + }, + { + "epoch": 0.17624827280736807, + "grad_norm": 0.859375, + "learning_rate": 0.000193332715452224, + "loss": 1.0949, + "step": 6864 + }, + { + "epoch": 0.17627395000328988, + "grad_norm": 0.78515625, + "learning_rate": 0.00019333111259006022, + "loss": 0.8652, + "step": 6865 + }, + { + "epoch": 0.17629962719921172, + "grad_norm": 0.78125, + "learning_rate": 0.00019332950954189596, + "loss": 1.1865, + "step": 6866 + }, + { + "epoch": 0.17632530439513353, + "grad_norm": 0.76953125, + "learning_rate": 0.00019332790630773437, + "loss": 0.9806, + "step": 6867 + }, + { + "epoch": 0.17635098159105536, + "grad_norm": 0.84375, + "learning_rate": 0.00019332630288757875, + "loss": 0.9773, + "step": 6868 + }, + { + "epoch": 0.17637665878697717, + "grad_norm": 0.79296875, + "learning_rate": 0.00019332469928143222, + "loss": 0.9864, + "step": 6869 + }, + { + "epoch": 0.17640233598289898, + "grad_norm": 0.87109375, + "learning_rate": 0.000193323095489298, + "loss": 1.1813, + "step": 6870 + }, + { + "epoch": 0.17642801317882081, + "grad_norm": 0.76171875, + "learning_rate": 0.00019332149151117927, + "loss": 1.1337, + "step": 6871 + }, + { + "epoch": 0.17645369037474262, + "grad_norm": 0.80078125, + "learning_rate": 0.00019331988734707923, + "loss": 1.0493, + "step": 6872 + }, + { + "epoch": 0.17647936757066446, + "grad_norm": 0.8046875, + "learning_rate": 0.0001933182829970011, + "loss": 1.0701, + "step": 6873 + }, + { + "epoch": 0.17650504476658627, + "grad_norm": 0.921875, + "learning_rate": 0.00019331667846094806, + "loss": 0.937, + "step": 6874 + }, + { + "epoch": 0.17653072196250807, + "grad_norm": 0.80859375, + "learning_rate": 0.00019331507373892333, + "loss": 1.1971, + "step": 6875 + }, + { + "epoch": 0.1765563991584299, + "grad_norm": 0.8046875, + "learning_rate": 0.00019331346883093005, + "loss": 1.0248, + "step": 6876 + }, + { + "epoch": 0.17658207635435172, + "grad_norm": 0.859375, + "learning_rate": 0.00019331186373697148, + "loss": 0.9741, + "step": 6877 + }, + { + "epoch": 0.17660775355027356, + "grad_norm": 0.75390625, + "learning_rate": 0.00019331025845705074, + "loss": 1.0236, + "step": 6878 + }, + { + "epoch": 0.17663343074619536, + "grad_norm": 0.96875, + "learning_rate": 0.00019330865299117114, + "loss": 1.0245, + "step": 6879 + }, + { + "epoch": 0.17665910794211717, + "grad_norm": 0.796875, + "learning_rate": 0.00019330704733933583, + "loss": 1.091, + "step": 6880 + }, + { + "epoch": 0.176684785138039, + "grad_norm": 0.7890625, + "learning_rate": 0.00019330544150154797, + "loss": 1.1133, + "step": 6881 + }, + { + "epoch": 0.17671046233396082, + "grad_norm": 0.76171875, + "learning_rate": 0.00019330383547781082, + "loss": 1.1252, + "step": 6882 + }, + { + "epoch": 0.17673613952988265, + "grad_norm": 0.8203125, + "learning_rate": 0.00019330222926812753, + "loss": 1.0666, + "step": 6883 + }, + { + "epoch": 0.17676181672580446, + "grad_norm": 0.734375, + "learning_rate": 0.00019330062287250133, + "loss": 1.192, + "step": 6884 + }, + { + "epoch": 0.17678749392172627, + "grad_norm": 0.8515625, + "learning_rate": 0.00019329901629093542, + "loss": 1.0749, + "step": 6885 + }, + { + "epoch": 0.1768131711176481, + "grad_norm": 0.81640625, + "learning_rate": 0.000193297409523433, + "loss": 0.9735, + "step": 6886 + }, + { + "epoch": 0.1768388483135699, + "grad_norm": 0.81640625, + "learning_rate": 0.0001932958025699973, + "loss": 1.106, + "step": 6887 + }, + { + "epoch": 0.17686452550949175, + "grad_norm": 0.90625, + "learning_rate": 0.00019329419543063145, + "loss": 1.0728, + "step": 6888 + }, + { + "epoch": 0.17689020270541356, + "grad_norm": 0.8046875, + "learning_rate": 0.0001932925881053387, + "loss": 0.8729, + "step": 6889 + }, + { + "epoch": 0.17691587990133537, + "grad_norm": 0.921875, + "learning_rate": 0.00019329098059412227, + "loss": 1.1179, + "step": 6890 + }, + { + "epoch": 0.1769415570972572, + "grad_norm": 0.7578125, + "learning_rate": 0.00019328937289698532, + "loss": 1.1201, + "step": 6891 + }, + { + "epoch": 0.176967234293179, + "grad_norm": 0.8515625, + "learning_rate": 0.00019328776501393108, + "loss": 1.1873, + "step": 6892 + }, + { + "epoch": 0.17699291148910085, + "grad_norm": 1.7421875, + "learning_rate": 0.00019328615694496278, + "loss": 1.1393, + "step": 6893 + }, + { + "epoch": 0.17701858868502265, + "grad_norm": 0.8359375, + "learning_rate": 0.00019328454869008358, + "loss": 1.0055, + "step": 6894 + }, + { + "epoch": 0.17704426588094446, + "grad_norm": 0.875, + "learning_rate": 0.00019328294024929669, + "loss": 1.239, + "step": 6895 + }, + { + "epoch": 0.1770699430768663, + "grad_norm": 0.8671875, + "learning_rate": 0.00019328133162260533, + "loss": 1.1768, + "step": 6896 + }, + { + "epoch": 0.1770956202727881, + "grad_norm": 0.89453125, + "learning_rate": 0.00019327972281001267, + "loss": 0.8635, + "step": 6897 + }, + { + "epoch": 0.17712129746870994, + "grad_norm": 0.81640625, + "learning_rate": 0.00019327811381152202, + "loss": 0.9252, + "step": 6898 + }, + { + "epoch": 0.17714697466463175, + "grad_norm": 0.83984375, + "learning_rate": 0.00019327650462713645, + "loss": 1.1695, + "step": 6899 + }, + { + "epoch": 0.17717265186055356, + "grad_norm": 0.7734375, + "learning_rate": 0.00019327489525685927, + "loss": 0.9398, + "step": 6900 + }, + { + "epoch": 0.1771983290564754, + "grad_norm": 0.8203125, + "learning_rate": 0.00019327328570069362, + "loss": 1.0617, + "step": 6901 + }, + { + "epoch": 0.1772240062523972, + "grad_norm": 0.80859375, + "learning_rate": 0.00019327167595864278, + "loss": 1.0113, + "step": 6902 + }, + { + "epoch": 0.17724968344831904, + "grad_norm": 0.80078125, + "learning_rate": 0.0001932700660307099, + "loss": 1.0467, + "step": 6903 + }, + { + "epoch": 0.17727536064424085, + "grad_norm": 0.8515625, + "learning_rate": 0.00019326845591689817, + "loss": 1.0777, + "step": 6904 + }, + { + "epoch": 0.17730103784016266, + "grad_norm": 0.84765625, + "learning_rate": 0.00019326684561721087, + "loss": 1.0711, + "step": 6905 + }, + { + "epoch": 0.1773267150360845, + "grad_norm": 0.859375, + "learning_rate": 0.00019326523513165116, + "loss": 1.1526, + "step": 6906 + }, + { + "epoch": 0.1773523922320063, + "grad_norm": 0.7734375, + "learning_rate": 0.00019326362446022227, + "loss": 0.9643, + "step": 6907 + }, + { + "epoch": 0.17737806942792814, + "grad_norm": 0.8203125, + "learning_rate": 0.00019326201360292736, + "loss": 0.9558, + "step": 6908 + }, + { + "epoch": 0.17740374662384994, + "grad_norm": 0.87109375, + "learning_rate": 0.00019326040255976973, + "loss": 1.1264, + "step": 6909 + }, + { + "epoch": 0.17742942381977175, + "grad_norm": 0.8203125, + "learning_rate": 0.0001932587913307525, + "loss": 1.1566, + "step": 6910 + }, + { + "epoch": 0.1774551010156936, + "grad_norm": 0.91015625, + "learning_rate": 0.00019325717991587894, + "loss": 1.0336, + "step": 6911 + }, + { + "epoch": 0.1774807782116154, + "grad_norm": 0.81640625, + "learning_rate": 0.00019325556831515227, + "loss": 1.1755, + "step": 6912 + }, + { + "epoch": 0.17750645540753723, + "grad_norm": 0.8046875, + "learning_rate": 0.00019325395652857564, + "loss": 0.974, + "step": 6913 + }, + { + "epoch": 0.17753213260345904, + "grad_norm": 0.80859375, + "learning_rate": 0.0001932523445561523, + "loss": 0.9632, + "step": 6914 + }, + { + "epoch": 0.17755780979938085, + "grad_norm": 0.80859375, + "learning_rate": 0.00019325073239788548, + "loss": 1.2544, + "step": 6915 + }, + { + "epoch": 0.1775834869953027, + "grad_norm": 0.8359375, + "learning_rate": 0.00019324912005377838, + "loss": 1.1179, + "step": 6916 + }, + { + "epoch": 0.1776091641912245, + "grad_norm": 0.82421875, + "learning_rate": 0.0001932475075238342, + "loss": 0.9038, + "step": 6917 + }, + { + "epoch": 0.17763484138714633, + "grad_norm": 0.75390625, + "learning_rate": 0.00019324589480805614, + "loss": 0.9963, + "step": 6918 + }, + { + "epoch": 0.17766051858306814, + "grad_norm": 0.921875, + "learning_rate": 0.00019324428190644745, + "loss": 1.1402, + "step": 6919 + }, + { + "epoch": 0.17768619577898995, + "grad_norm": 0.80078125, + "learning_rate": 0.00019324266881901134, + "loss": 1.1729, + "step": 6920 + }, + { + "epoch": 0.17771187297491178, + "grad_norm": 0.87109375, + "learning_rate": 0.00019324105554575097, + "loss": 1.0017, + "step": 6921 + }, + { + "epoch": 0.1777375501708336, + "grad_norm": 0.87109375, + "learning_rate": 0.00019323944208666965, + "loss": 1.2143, + "step": 6922 + }, + { + "epoch": 0.1777632273667554, + "grad_norm": 0.828125, + "learning_rate": 0.00019323782844177055, + "loss": 1.1533, + "step": 6923 + }, + { + "epoch": 0.17778890456267724, + "grad_norm": 0.90625, + "learning_rate": 0.00019323621461105685, + "loss": 1.186, + "step": 6924 + }, + { + "epoch": 0.17781458175859904, + "grad_norm": 0.90625, + "learning_rate": 0.0001932346005945318, + "loss": 1.0948, + "step": 6925 + }, + { + "epoch": 0.17784025895452088, + "grad_norm": 0.85546875, + "learning_rate": 0.0001932329863921986, + "loss": 1.0888, + "step": 6926 + }, + { + "epoch": 0.1778659361504427, + "grad_norm": 0.83203125, + "learning_rate": 0.00019323137200406048, + "loss": 1.0565, + "step": 6927 + }, + { + "epoch": 0.1778916133463645, + "grad_norm": 0.84375, + "learning_rate": 0.0001932297574301207, + "loss": 0.9835, + "step": 6928 + }, + { + "epoch": 0.17791729054228633, + "grad_norm": 0.81640625, + "learning_rate": 0.0001932281426703824, + "loss": 1.0773, + "step": 6929 + }, + { + "epoch": 0.17794296773820814, + "grad_norm": 0.80859375, + "learning_rate": 0.00019322652772484883, + "loss": 0.9982, + "step": 6930 + }, + { + "epoch": 0.17796864493412998, + "grad_norm": 0.79296875, + "learning_rate": 0.00019322491259352322, + "loss": 0.8881, + "step": 6931 + }, + { + "epoch": 0.17799432213005179, + "grad_norm": 1.8515625, + "learning_rate": 0.0001932232972764088, + "loss": 1.045, + "step": 6932 + }, + { + "epoch": 0.1780199993259736, + "grad_norm": 1.0, + "learning_rate": 0.00019322168177350873, + "loss": 1.0553, + "step": 6933 + }, + { + "epoch": 0.17804567652189543, + "grad_norm": 0.83984375, + "learning_rate": 0.00019322006608482627, + "loss": 1.0622, + "step": 6934 + }, + { + "epoch": 0.17807135371781724, + "grad_norm": 0.859375, + "learning_rate": 0.00019321845021036467, + "loss": 0.9724, + "step": 6935 + }, + { + "epoch": 0.17809703091373907, + "grad_norm": 0.80859375, + "learning_rate": 0.00019321683415012713, + "loss": 1.0094, + "step": 6936 + }, + { + "epoch": 0.17812270810966088, + "grad_norm": 1.015625, + "learning_rate": 0.00019321521790411681, + "loss": 1.0273, + "step": 6937 + }, + { + "epoch": 0.1781483853055827, + "grad_norm": 0.8203125, + "learning_rate": 0.00019321360147233702, + "loss": 0.981, + "step": 6938 + }, + { + "epoch": 0.17817406250150453, + "grad_norm": 0.765625, + "learning_rate": 0.00019321198485479092, + "loss": 1.0205, + "step": 6939 + }, + { + "epoch": 0.17819973969742633, + "grad_norm": 0.73828125, + "learning_rate": 0.00019321036805148176, + "loss": 0.9976, + "step": 6940 + }, + { + "epoch": 0.17822541689334817, + "grad_norm": 0.91015625, + "learning_rate": 0.00019320875106241275, + "loss": 1.0071, + "step": 6941 + }, + { + "epoch": 0.17825109408926998, + "grad_norm": 0.91015625, + "learning_rate": 0.00019320713388758714, + "loss": 1.0806, + "step": 6942 + }, + { + "epoch": 0.1782767712851918, + "grad_norm": 0.86328125, + "learning_rate": 0.00019320551652700813, + "loss": 1.092, + "step": 6943 + }, + { + "epoch": 0.17830244848111362, + "grad_norm": 0.8359375, + "learning_rate": 0.00019320389898067893, + "loss": 1.1022, + "step": 6944 + }, + { + "epoch": 0.17832812567703543, + "grad_norm": 0.8515625, + "learning_rate": 0.0001932022812486028, + "loss": 0.7357, + "step": 6945 + }, + { + "epoch": 0.17835380287295727, + "grad_norm": 0.83203125, + "learning_rate": 0.00019320066333078291, + "loss": 1.2401, + "step": 6946 + }, + { + "epoch": 0.17837948006887908, + "grad_norm": 0.875, + "learning_rate": 0.00019319904522722254, + "loss": 1.0205, + "step": 6947 + }, + { + "epoch": 0.17840515726480088, + "grad_norm": 0.890625, + "learning_rate": 0.00019319742693792486, + "loss": 1.0763, + "step": 6948 + }, + { + "epoch": 0.17843083446072272, + "grad_norm": 0.81640625, + "learning_rate": 0.00019319580846289319, + "loss": 1.0401, + "step": 6949 + }, + { + "epoch": 0.17845651165664453, + "grad_norm": 0.84765625, + "learning_rate": 0.00019319418980213063, + "loss": 0.9397, + "step": 6950 + }, + { + "epoch": 0.17848218885256636, + "grad_norm": 0.75, + "learning_rate": 0.0001931925709556405, + "loss": 1.1207, + "step": 6951 + }, + { + "epoch": 0.17850786604848817, + "grad_norm": 0.76953125, + "learning_rate": 0.00019319095192342597, + "loss": 0.9345, + "step": 6952 + }, + { + "epoch": 0.17853354324440998, + "grad_norm": 0.828125, + "learning_rate": 0.00019318933270549032, + "loss": 1.1094, + "step": 6953 + }, + { + "epoch": 0.17855922044033182, + "grad_norm": 0.9921875, + "learning_rate": 0.00019318771330183672, + "loss": 1.0865, + "step": 6954 + }, + { + "epoch": 0.17858489763625363, + "grad_norm": 0.8046875, + "learning_rate": 0.00019318609371246845, + "loss": 0.9268, + "step": 6955 + }, + { + "epoch": 0.17861057483217546, + "grad_norm": 0.77734375, + "learning_rate": 0.0001931844739373887, + "loss": 1.1092, + "step": 6956 + }, + { + "epoch": 0.17863625202809727, + "grad_norm": 0.796875, + "learning_rate": 0.0001931828539766007, + "loss": 1.0499, + "step": 6957 + }, + { + "epoch": 0.17866192922401908, + "grad_norm": 0.84375, + "learning_rate": 0.00019318123383010768, + "loss": 1.1733, + "step": 6958 + }, + { + "epoch": 0.17868760641994091, + "grad_norm": 0.86328125, + "learning_rate": 0.00019317961349791293, + "loss": 1.0692, + "step": 6959 + }, + { + "epoch": 0.17871328361586272, + "grad_norm": 0.8671875, + "learning_rate": 0.00019317799298001958, + "loss": 1.0162, + "step": 6960 + }, + { + "epoch": 0.17873896081178456, + "grad_norm": 0.828125, + "learning_rate": 0.00019317637227643094, + "loss": 0.8428, + "step": 6961 + }, + { + "epoch": 0.17876463800770637, + "grad_norm": 0.7890625, + "learning_rate": 0.00019317475138715018, + "loss": 1.0475, + "step": 6962 + }, + { + "epoch": 0.17879031520362818, + "grad_norm": 0.921875, + "learning_rate": 0.00019317313031218055, + "loss": 1.1562, + "step": 6963 + }, + { + "epoch": 0.17881599239955, + "grad_norm": 0.80078125, + "learning_rate": 0.00019317150905152534, + "loss": 1.1192, + "step": 6964 + }, + { + "epoch": 0.17884166959547182, + "grad_norm": 0.80859375, + "learning_rate": 0.00019316988760518768, + "loss": 1.0898, + "step": 6965 + }, + { + "epoch": 0.17886734679139366, + "grad_norm": 0.8828125, + "learning_rate": 0.00019316826597317083, + "loss": 0.9094, + "step": 6966 + }, + { + "epoch": 0.17889302398731546, + "grad_norm": 0.796875, + "learning_rate": 0.0001931666441554781, + "loss": 1.0306, + "step": 6967 + }, + { + "epoch": 0.17891870118323727, + "grad_norm": 0.87109375, + "learning_rate": 0.00019316502215211263, + "loss": 1.0748, + "step": 6968 + }, + { + "epoch": 0.1789443783791591, + "grad_norm": 0.7734375, + "learning_rate": 0.0001931633999630777, + "loss": 1.0108, + "step": 6969 + }, + { + "epoch": 0.17897005557508092, + "grad_norm": 0.85546875, + "learning_rate": 0.0001931617775883765, + "loss": 1.068, + "step": 6970 + }, + { + "epoch": 0.17899573277100275, + "grad_norm": 0.90234375, + "learning_rate": 0.00019316015502801235, + "loss": 1.1516, + "step": 6971 + }, + { + "epoch": 0.17902140996692456, + "grad_norm": 0.80859375, + "learning_rate": 0.0001931585322819884, + "loss": 0.8707, + "step": 6972 + }, + { + "epoch": 0.17904708716284637, + "grad_norm": 2.03125, + "learning_rate": 0.00019315690935030788, + "loss": 1.1376, + "step": 6973 + }, + { + "epoch": 0.1790727643587682, + "grad_norm": 0.8671875, + "learning_rate": 0.00019315528623297407, + "loss": 1.2153, + "step": 6974 + }, + { + "epoch": 0.17909844155469, + "grad_norm": 0.84375, + "learning_rate": 0.0001931536629299902, + "loss": 1.0765, + "step": 6975 + }, + { + "epoch": 0.17912411875061185, + "grad_norm": 0.7890625, + "learning_rate": 0.00019315203944135948, + "loss": 1.1195, + "step": 6976 + }, + { + "epoch": 0.17914979594653366, + "grad_norm": 0.890625, + "learning_rate": 0.00019315041576708518, + "loss": 1.0646, + "step": 6977 + }, + { + "epoch": 0.17917547314245547, + "grad_norm": 0.8046875, + "learning_rate": 0.0001931487919071705, + "loss": 1.1108, + "step": 6978 + }, + { + "epoch": 0.1792011503383773, + "grad_norm": 0.83203125, + "learning_rate": 0.00019314716786161871, + "loss": 1.1002, + "step": 6979 + }, + { + "epoch": 0.1792268275342991, + "grad_norm": 0.859375, + "learning_rate": 0.00019314554363043302, + "loss": 1.0576, + "step": 6980 + }, + { + "epoch": 0.17925250473022095, + "grad_norm": 0.80859375, + "learning_rate": 0.00019314391921361669, + "loss": 1.105, + "step": 6981 + }, + { + "epoch": 0.17927818192614275, + "grad_norm": 0.74609375, + "learning_rate": 0.0001931422946111729, + "loss": 1.1338, + "step": 6982 + }, + { + "epoch": 0.17930385912206456, + "grad_norm": 0.85546875, + "learning_rate": 0.00019314066982310495, + "loss": 1.0731, + "step": 6983 + }, + { + "epoch": 0.1793295363179864, + "grad_norm": 0.78515625, + "learning_rate": 0.00019313904484941606, + "loss": 0.9885, + "step": 6984 + }, + { + "epoch": 0.1793552135139082, + "grad_norm": 0.78515625, + "learning_rate": 0.00019313741969010945, + "loss": 0.9538, + "step": 6985 + }, + { + "epoch": 0.17938089070983004, + "grad_norm": 0.79296875, + "learning_rate": 0.0001931357943451884, + "loss": 1.0677, + "step": 6986 + }, + { + "epoch": 0.17940656790575185, + "grad_norm": 0.87890625, + "learning_rate": 0.00019313416881465613, + "loss": 1.0397, + "step": 6987 + }, + { + "epoch": 0.17943224510167366, + "grad_norm": 0.8515625, + "learning_rate": 0.00019313254309851586, + "loss": 1.1038, + "step": 6988 + }, + { + "epoch": 0.1794579222975955, + "grad_norm": 0.8046875, + "learning_rate": 0.00019313091719677086, + "loss": 1.0292, + "step": 6989 + }, + { + "epoch": 0.1794835994935173, + "grad_norm": 0.8828125, + "learning_rate": 0.00019312929110942433, + "loss": 0.9131, + "step": 6990 + }, + { + "epoch": 0.17950927668943914, + "grad_norm": 0.875, + "learning_rate": 0.00019312766483647957, + "loss": 1.1145, + "step": 6991 + }, + { + "epoch": 0.17953495388536095, + "grad_norm": 0.8671875, + "learning_rate": 0.00019312603837793978, + "loss": 1.0569, + "step": 6992 + }, + { + "epoch": 0.17956063108128276, + "grad_norm": 0.73046875, + "learning_rate": 0.00019312441173380816, + "loss": 0.9411, + "step": 6993 + }, + { + "epoch": 0.1795863082772046, + "grad_norm": 0.8828125, + "learning_rate": 0.00019312278490408804, + "loss": 1.0825, + "step": 6994 + }, + { + "epoch": 0.1796119854731264, + "grad_norm": 1.0703125, + "learning_rate": 0.0001931211578887826, + "loss": 1.0772, + "step": 6995 + }, + { + "epoch": 0.17963766266904824, + "grad_norm": 0.890625, + "learning_rate": 0.0001931195306878951, + "loss": 1.0048, + "step": 6996 + }, + { + "epoch": 0.17966333986497005, + "grad_norm": 0.85546875, + "learning_rate": 0.00019311790330142883, + "loss": 1.1726, + "step": 6997 + }, + { + "epoch": 0.17968901706089185, + "grad_norm": 0.8203125, + "learning_rate": 0.00019311627572938695, + "loss": 1.0056, + "step": 6998 + }, + { + "epoch": 0.1797146942568137, + "grad_norm": 0.81640625, + "learning_rate": 0.00019311464797177276, + "loss": 1.0246, + "step": 6999 + }, + { + "epoch": 0.1797403714527355, + "grad_norm": 0.78515625, + "learning_rate": 0.00019311302002858948, + "loss": 0.9698, + "step": 7000 + }, + { + "epoch": 0.1797403714527355, + "eval_loss": 1.0615228414535522, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 402.9474, + "eval_samples_per_second": 24.817, + "eval_steps_per_second": 0.777, + "step": 7000 + }, + { + "epoch": 0.17976604864865733, + "grad_norm": 0.8203125, + "learning_rate": 0.00019311139189984037, + "loss": 1.1696, + "step": 7001 + }, + { + "epoch": 0.17979172584457914, + "grad_norm": 0.8125, + "learning_rate": 0.00019310976358552867, + "loss": 1.1284, + "step": 7002 + }, + { + "epoch": 0.17981740304050095, + "grad_norm": 0.81640625, + "learning_rate": 0.0001931081350856576, + "loss": 1.1105, + "step": 7003 + }, + { + "epoch": 0.1798430802364228, + "grad_norm": 0.82421875, + "learning_rate": 0.00019310650640023047, + "loss": 0.9855, + "step": 7004 + }, + { + "epoch": 0.1798687574323446, + "grad_norm": 0.85546875, + "learning_rate": 0.00019310487752925045, + "loss": 1.0835, + "step": 7005 + }, + { + "epoch": 0.17989443462826643, + "grad_norm": 0.859375, + "learning_rate": 0.00019310324847272083, + "loss": 1.1517, + "step": 7006 + }, + { + "epoch": 0.17992011182418824, + "grad_norm": 0.83203125, + "learning_rate": 0.00019310161923064484, + "loss": 1.0414, + "step": 7007 + }, + { + "epoch": 0.17994578902011005, + "grad_norm": 1.9765625, + "learning_rate": 0.00019309998980302572, + "loss": 1.1861, + "step": 7008 + }, + { + "epoch": 0.17997146621603188, + "grad_norm": 0.875, + "learning_rate": 0.00019309836018986673, + "loss": 1.1194, + "step": 7009 + }, + { + "epoch": 0.1799971434119537, + "grad_norm": 0.796875, + "learning_rate": 0.00019309673039117112, + "loss": 0.9878, + "step": 7010 + }, + { + "epoch": 0.18002282060787553, + "grad_norm": 1.015625, + "learning_rate": 0.00019309510040694215, + "loss": 1.0351, + "step": 7011 + }, + { + "epoch": 0.18004849780379734, + "grad_norm": 0.83203125, + "learning_rate": 0.00019309347023718307, + "loss": 1.1684, + "step": 7012 + }, + { + "epoch": 0.18007417499971914, + "grad_norm": 0.828125, + "learning_rate": 0.0001930918398818971, + "loss": 1.0114, + "step": 7013 + }, + { + "epoch": 0.18009985219564098, + "grad_norm": 0.83984375, + "learning_rate": 0.00019309020934108747, + "loss": 1.1173, + "step": 7014 + }, + { + "epoch": 0.1801255293915628, + "grad_norm": 0.84765625, + "learning_rate": 0.0001930885786147575, + "loss": 1.1324, + "step": 7015 + }, + { + "epoch": 0.18015120658748462, + "grad_norm": 0.83984375, + "learning_rate": 0.00019308694770291037, + "loss": 1.0613, + "step": 7016 + }, + { + "epoch": 0.18017688378340643, + "grad_norm": 0.76953125, + "learning_rate": 0.0001930853166055494, + "loss": 1.0185, + "step": 7017 + }, + { + "epoch": 0.18020256097932824, + "grad_norm": 0.890625, + "learning_rate": 0.0001930836853226778, + "loss": 1.0693, + "step": 7018 + }, + { + "epoch": 0.18022823817525008, + "grad_norm": 0.83203125, + "learning_rate": 0.00019308205385429878, + "loss": 1.2065, + "step": 7019 + }, + { + "epoch": 0.18025391537117189, + "grad_norm": 0.84375, + "learning_rate": 0.00019308042220041568, + "loss": 0.9957, + "step": 7020 + }, + { + "epoch": 0.18027959256709372, + "grad_norm": 0.80859375, + "learning_rate": 0.0001930787903610317, + "loss": 0.9307, + "step": 7021 + }, + { + "epoch": 0.18030526976301553, + "grad_norm": 0.8671875, + "learning_rate": 0.00019307715833615007, + "loss": 1.1348, + "step": 7022 + }, + { + "epoch": 0.18033094695893734, + "grad_norm": 0.8828125, + "learning_rate": 0.0001930755261257741, + "loss": 1.1561, + "step": 7023 + }, + { + "epoch": 0.18035662415485917, + "grad_norm": 1.234375, + "learning_rate": 0.000193073893729907, + "loss": 1.0737, + "step": 7024 + }, + { + "epoch": 0.18038230135078098, + "grad_norm": 0.875, + "learning_rate": 0.00019307226114855203, + "loss": 1.0615, + "step": 7025 + }, + { + "epoch": 0.18040797854670282, + "grad_norm": 0.84765625, + "learning_rate": 0.00019307062838171243, + "loss": 1.0305, + "step": 7026 + }, + { + "epoch": 0.18043365574262463, + "grad_norm": 0.81640625, + "learning_rate": 0.00019306899542939152, + "loss": 0.9947, + "step": 7027 + }, + { + "epoch": 0.18045933293854644, + "grad_norm": 0.84375, + "learning_rate": 0.00019306736229159248, + "loss": 1.2872, + "step": 7028 + }, + { + "epoch": 0.18048501013446827, + "grad_norm": 0.8125, + "learning_rate": 0.0001930657289683186, + "loss": 1.035, + "step": 7029 + }, + { + "epoch": 0.18051068733039008, + "grad_norm": 0.796875, + "learning_rate": 0.00019306409545957314, + "loss": 1.0835, + "step": 7030 + }, + { + "epoch": 0.18053636452631192, + "grad_norm": 0.8125, + "learning_rate": 0.00019306246176535935, + "loss": 1.0925, + "step": 7031 + }, + { + "epoch": 0.18056204172223372, + "grad_norm": 0.765625, + "learning_rate": 0.00019306082788568047, + "loss": 1.091, + "step": 7032 + }, + { + "epoch": 0.18058771891815553, + "grad_norm": 0.76953125, + "learning_rate": 0.00019305919382053976, + "loss": 0.9454, + "step": 7033 + }, + { + "epoch": 0.18061339611407737, + "grad_norm": 0.859375, + "learning_rate": 0.0001930575595699405, + "loss": 1.0808, + "step": 7034 + }, + { + "epoch": 0.18063907330999918, + "grad_norm": 0.8125, + "learning_rate": 0.0001930559251338859, + "loss": 1.0616, + "step": 7035 + }, + { + "epoch": 0.180664750505921, + "grad_norm": 3.703125, + "learning_rate": 0.00019305429051237927, + "loss": 1.0407, + "step": 7036 + }, + { + "epoch": 0.18069042770184282, + "grad_norm": 0.85546875, + "learning_rate": 0.00019305265570542383, + "loss": 1.0811, + "step": 7037 + }, + { + "epoch": 0.18071610489776463, + "grad_norm": 0.81640625, + "learning_rate": 0.00019305102071302287, + "loss": 1.2382, + "step": 7038 + }, + { + "epoch": 0.18074178209368646, + "grad_norm": 0.90234375, + "learning_rate": 0.00019304938553517964, + "loss": 1.1468, + "step": 7039 + }, + { + "epoch": 0.18076745928960827, + "grad_norm": 0.7890625, + "learning_rate": 0.00019304775017189735, + "loss": 1.0609, + "step": 7040 + }, + { + "epoch": 0.1807931364855301, + "grad_norm": 0.7578125, + "learning_rate": 0.00019304611462317932, + "loss": 0.9621, + "step": 7041 + }, + { + "epoch": 0.18081881368145192, + "grad_norm": 0.7578125, + "learning_rate": 0.0001930444788890288, + "loss": 0.981, + "step": 7042 + }, + { + "epoch": 0.18084449087737373, + "grad_norm": 0.76171875, + "learning_rate": 0.00019304284296944903, + "loss": 0.9671, + "step": 7043 + }, + { + "epoch": 0.18087016807329556, + "grad_norm": 0.86328125, + "learning_rate": 0.00019304120686444327, + "loss": 1.0816, + "step": 7044 + }, + { + "epoch": 0.18089584526921737, + "grad_norm": 0.86328125, + "learning_rate": 0.0001930395705740148, + "loss": 1.2648, + "step": 7045 + }, + { + "epoch": 0.1809215224651392, + "grad_norm": 0.796875, + "learning_rate": 0.00019303793409816686, + "loss": 1.1498, + "step": 7046 + }, + { + "epoch": 0.18094719966106101, + "grad_norm": 0.80859375, + "learning_rate": 0.00019303629743690272, + "loss": 0.9724, + "step": 7047 + }, + { + "epoch": 0.18097287685698282, + "grad_norm": 0.8359375, + "learning_rate": 0.00019303466059022566, + "loss": 1.0057, + "step": 7048 + }, + { + "epoch": 0.18099855405290466, + "grad_norm": 0.7734375, + "learning_rate": 0.00019303302355813893, + "loss": 1.0168, + "step": 7049 + }, + { + "epoch": 0.18102423124882647, + "grad_norm": 0.8203125, + "learning_rate": 0.00019303138634064576, + "loss": 1.0586, + "step": 7050 + }, + { + "epoch": 0.1810499084447483, + "grad_norm": 0.76171875, + "learning_rate": 0.00019302974893774946, + "loss": 1.0247, + "step": 7051 + }, + { + "epoch": 0.1810755856406701, + "grad_norm": 0.859375, + "learning_rate": 0.00019302811134945324, + "loss": 1.0708, + "step": 7052 + }, + { + "epoch": 0.18110126283659192, + "grad_norm": 0.796875, + "learning_rate": 0.00019302647357576045, + "loss": 1.053, + "step": 7053 + }, + { + "epoch": 0.18112694003251376, + "grad_norm": 0.83984375, + "learning_rate": 0.00019302483561667427, + "loss": 1.0707, + "step": 7054 + }, + { + "epoch": 0.18115261722843556, + "grad_norm": 0.78515625, + "learning_rate": 0.000193023197472198, + "loss": 1.1359, + "step": 7055 + }, + { + "epoch": 0.1811782944243574, + "grad_norm": 0.84765625, + "learning_rate": 0.0001930215591423349, + "loss": 1.1007, + "step": 7056 + }, + { + "epoch": 0.1812039716202792, + "grad_norm": 0.88671875, + "learning_rate": 0.0001930199206270882, + "loss": 1.0364, + "step": 7057 + }, + { + "epoch": 0.18122964881620102, + "grad_norm": 0.96484375, + "learning_rate": 0.00019301828192646125, + "loss": 1.2782, + "step": 7058 + }, + { + "epoch": 0.18125532601212285, + "grad_norm": 0.78515625, + "learning_rate": 0.0001930166430404573, + "loss": 0.9833, + "step": 7059 + }, + { + "epoch": 0.18128100320804466, + "grad_norm": 0.90625, + "learning_rate": 0.0001930150039690795, + "loss": 0.9594, + "step": 7060 + }, + { + "epoch": 0.1813066804039665, + "grad_norm": 0.77734375, + "learning_rate": 0.00019301336471233123, + "loss": 1.0811, + "step": 7061 + }, + { + "epoch": 0.1813323575998883, + "grad_norm": 0.84765625, + "learning_rate": 0.00019301172527021575, + "loss": 0.9756, + "step": 7062 + }, + { + "epoch": 0.1813580347958101, + "grad_norm": 0.72265625, + "learning_rate": 0.00019301008564273628, + "loss": 1.0798, + "step": 7063 + }, + { + "epoch": 0.18138371199173195, + "grad_norm": 0.890625, + "learning_rate": 0.0001930084458298961, + "loss": 1.1568, + "step": 7064 + }, + { + "epoch": 0.18140938918765376, + "grad_norm": 0.78515625, + "learning_rate": 0.0001930068058316985, + "loss": 0.9642, + "step": 7065 + }, + { + "epoch": 0.1814350663835756, + "grad_norm": 0.75, + "learning_rate": 0.00019300516564814676, + "loss": 0.951, + "step": 7066 + }, + { + "epoch": 0.1814607435794974, + "grad_norm": 0.84765625, + "learning_rate": 0.00019300352527924411, + "loss": 1.2233, + "step": 7067 + }, + { + "epoch": 0.1814864207754192, + "grad_norm": 0.78125, + "learning_rate": 0.00019300188472499383, + "loss": 1.073, + "step": 7068 + }, + { + "epoch": 0.18151209797134105, + "grad_norm": 0.828125, + "learning_rate": 0.0001930002439853992, + "loss": 1.048, + "step": 7069 + }, + { + "epoch": 0.18153777516726285, + "grad_norm": 0.91796875, + "learning_rate": 0.0001929986030604635, + "loss": 1.0955, + "step": 7070 + }, + { + "epoch": 0.1815634523631847, + "grad_norm": 0.84765625, + "learning_rate": 0.00019299696195018993, + "loss": 1.1104, + "step": 7071 + }, + { + "epoch": 0.1815891295591065, + "grad_norm": 0.81640625, + "learning_rate": 0.00019299532065458186, + "loss": 1.0022, + "step": 7072 + }, + { + "epoch": 0.1816148067550283, + "grad_norm": 0.8984375, + "learning_rate": 0.0001929936791736425, + "loss": 1.2496, + "step": 7073 + }, + { + "epoch": 0.18164048395095014, + "grad_norm": 0.80859375, + "learning_rate": 0.00019299203750737515, + "loss": 1.0139, + "step": 7074 + }, + { + "epoch": 0.18166616114687195, + "grad_norm": 0.78515625, + "learning_rate": 0.00019299039565578307, + "loss": 1.088, + "step": 7075 + }, + { + "epoch": 0.1816918383427938, + "grad_norm": 0.859375, + "learning_rate": 0.00019298875361886951, + "loss": 1.106, + "step": 7076 + }, + { + "epoch": 0.1817175155387156, + "grad_norm": 0.8671875, + "learning_rate": 0.00019298711139663778, + "loss": 1.1174, + "step": 7077 + }, + { + "epoch": 0.1817431927346374, + "grad_norm": 0.81640625, + "learning_rate": 0.00019298546898909114, + "loss": 1.0379, + "step": 7078 + }, + { + "epoch": 0.18176886993055924, + "grad_norm": 0.796875, + "learning_rate": 0.00019298382639623286, + "loss": 0.9673, + "step": 7079 + }, + { + "epoch": 0.18179454712648105, + "grad_norm": 0.80078125, + "learning_rate": 0.00019298218361806622, + "loss": 1.076, + "step": 7080 + }, + { + "epoch": 0.18182022432240288, + "grad_norm": 0.7890625, + "learning_rate": 0.00019298054065459446, + "loss": 1.0146, + "step": 7081 + }, + { + "epoch": 0.1818459015183247, + "grad_norm": 0.78515625, + "learning_rate": 0.0001929788975058209, + "loss": 1.1071, + "step": 7082 + }, + { + "epoch": 0.1818715787142465, + "grad_norm": 1.1484375, + "learning_rate": 0.0001929772541717488, + "loss": 1.1165, + "step": 7083 + }, + { + "epoch": 0.18189725591016834, + "grad_norm": 0.87109375, + "learning_rate": 0.0001929756106523814, + "loss": 1.0616, + "step": 7084 + }, + { + "epoch": 0.18192293310609015, + "grad_norm": 0.83203125, + "learning_rate": 0.00019297396694772204, + "loss": 0.9986, + "step": 7085 + }, + { + "epoch": 0.18194861030201198, + "grad_norm": 0.8203125, + "learning_rate": 0.00019297232305777397, + "loss": 1.0319, + "step": 7086 + }, + { + "epoch": 0.1819742874979338, + "grad_norm": 0.84375, + "learning_rate": 0.00019297067898254043, + "loss": 1.0889, + "step": 7087 + }, + { + "epoch": 0.1819999646938556, + "grad_norm": 0.828125, + "learning_rate": 0.0001929690347220247, + "loss": 1.0895, + "step": 7088 + }, + { + "epoch": 0.18202564188977743, + "grad_norm": 0.87890625, + "learning_rate": 0.00019296739027623017, + "loss": 1.1306, + "step": 7089 + }, + { + "epoch": 0.18205131908569924, + "grad_norm": 0.84765625, + "learning_rate": 0.00019296574564515995, + "loss": 1.0832, + "step": 7090 + }, + { + "epoch": 0.18207699628162108, + "grad_norm": 0.8671875, + "learning_rate": 0.00019296410082881744, + "loss": 1.0668, + "step": 7091 + }, + { + "epoch": 0.1821026734775429, + "grad_norm": 0.80078125, + "learning_rate": 0.00019296245582720584, + "loss": 1.0769, + "step": 7092 + }, + { + "epoch": 0.1821283506734647, + "grad_norm": 0.8125, + "learning_rate": 0.0001929608106403285, + "loss": 1.0686, + "step": 7093 + }, + { + "epoch": 0.18215402786938653, + "grad_norm": 0.8359375, + "learning_rate": 0.00019295916526818863, + "loss": 1.146, + "step": 7094 + }, + { + "epoch": 0.18217970506530834, + "grad_norm": 0.80078125, + "learning_rate": 0.00019295751971078954, + "loss": 1.0867, + "step": 7095 + }, + { + "epoch": 0.18220538226123018, + "grad_norm": 0.828125, + "learning_rate": 0.00019295587396813451, + "loss": 1.0328, + "step": 7096 + }, + { + "epoch": 0.18223105945715198, + "grad_norm": 0.8359375, + "learning_rate": 0.00019295422804022685, + "loss": 0.9967, + "step": 7097 + }, + { + "epoch": 0.1822567366530738, + "grad_norm": 0.77734375, + "learning_rate": 0.0001929525819270698, + "loss": 0.978, + "step": 7098 + }, + { + "epoch": 0.18228241384899563, + "grad_norm": 0.83203125, + "learning_rate": 0.00019295093562866664, + "loss": 1.0472, + "step": 7099 + }, + { + "epoch": 0.18230809104491744, + "grad_norm": 0.80078125, + "learning_rate": 0.00019294928914502066, + "loss": 1.088, + "step": 7100 + }, + { + "epoch": 0.18233376824083927, + "grad_norm": 0.84765625, + "learning_rate": 0.00019294764247613515, + "loss": 1.1663, + "step": 7101 + }, + { + "epoch": 0.18235944543676108, + "grad_norm": 0.94140625, + "learning_rate": 0.00019294599562201338, + "loss": 1.2472, + "step": 7102 + }, + { + "epoch": 0.1823851226326829, + "grad_norm": 0.73828125, + "learning_rate": 0.00019294434858265863, + "loss": 1.0875, + "step": 7103 + }, + { + "epoch": 0.18241079982860472, + "grad_norm": 0.80859375, + "learning_rate": 0.0001929427013580742, + "loss": 1.0857, + "step": 7104 + }, + { + "epoch": 0.18243647702452653, + "grad_norm": 0.8359375, + "learning_rate": 0.00019294105394826336, + "loss": 1.12, + "step": 7105 + }, + { + "epoch": 0.18246215422044837, + "grad_norm": 0.8046875, + "learning_rate": 0.0001929394063532294, + "loss": 1.172, + "step": 7106 + }, + { + "epoch": 0.18248783141637018, + "grad_norm": 0.828125, + "learning_rate": 0.00019293775857297558, + "loss": 0.9966, + "step": 7107 + }, + { + "epoch": 0.18251350861229199, + "grad_norm": 0.92578125, + "learning_rate": 0.0001929361106075052, + "loss": 1.1352, + "step": 7108 + }, + { + "epoch": 0.18253918580821382, + "grad_norm": 0.76953125, + "learning_rate": 0.00019293446245682158, + "loss": 1.0731, + "step": 7109 + }, + { + "epoch": 0.18256486300413563, + "grad_norm": 0.74609375, + "learning_rate": 0.00019293281412092794, + "loss": 0.9768, + "step": 7110 + }, + { + "epoch": 0.18259054020005747, + "grad_norm": 0.828125, + "learning_rate": 0.0001929311655998276, + "loss": 1.0918, + "step": 7111 + }, + { + "epoch": 0.18261621739597927, + "grad_norm": 0.84375, + "learning_rate": 0.00019292951689352386, + "loss": 1.2249, + "step": 7112 + }, + { + "epoch": 0.18264189459190108, + "grad_norm": 0.8359375, + "learning_rate": 0.00019292786800202, + "loss": 1.0513, + "step": 7113 + }, + { + "epoch": 0.18266757178782292, + "grad_norm": 0.83984375, + "learning_rate": 0.00019292621892531923, + "loss": 1.1333, + "step": 7114 + }, + { + "epoch": 0.18269324898374473, + "grad_norm": 0.828125, + "learning_rate": 0.00019292456966342496, + "loss": 1.2115, + "step": 7115 + }, + { + "epoch": 0.18271892617966656, + "grad_norm": 0.76953125, + "learning_rate": 0.00019292292021634038, + "loss": 0.9803, + "step": 7116 + }, + { + "epoch": 0.18274460337558837, + "grad_norm": 0.7734375, + "learning_rate": 0.00019292127058406883, + "loss": 1.125, + "step": 7117 + }, + { + "epoch": 0.18277028057151018, + "grad_norm": 0.8046875, + "learning_rate": 0.0001929196207666136, + "loss": 1.1373, + "step": 7118 + }, + { + "epoch": 0.18279595776743202, + "grad_norm": 0.8203125, + "learning_rate": 0.00019291797076397793, + "loss": 1.0231, + "step": 7119 + }, + { + "epoch": 0.18282163496335382, + "grad_norm": 0.8671875, + "learning_rate": 0.00019291632057616512, + "loss": 1.0733, + "step": 7120 + }, + { + "epoch": 0.18284731215927566, + "grad_norm": 0.7734375, + "learning_rate": 0.00019291467020317852, + "loss": 0.982, + "step": 7121 + }, + { + "epoch": 0.18287298935519747, + "grad_norm": 0.79296875, + "learning_rate": 0.00019291301964502138, + "loss": 1.1271, + "step": 7122 + }, + { + "epoch": 0.18289866655111928, + "grad_norm": 0.82421875, + "learning_rate": 0.00019291136890169693, + "loss": 1.0211, + "step": 7123 + }, + { + "epoch": 0.1829243437470411, + "grad_norm": 0.8125, + "learning_rate": 0.00019290971797320856, + "loss": 1.0397, + "step": 7124 + }, + { + "epoch": 0.18295002094296292, + "grad_norm": 0.80078125, + "learning_rate": 0.0001929080668595595, + "loss": 1.1406, + "step": 7125 + }, + { + "epoch": 0.18297569813888473, + "grad_norm": 0.86328125, + "learning_rate": 0.00019290641556075306, + "loss": 1.1256, + "step": 7126 + }, + { + "epoch": 0.18300137533480657, + "grad_norm": 0.72265625, + "learning_rate": 0.0001929047640767925, + "loss": 0.9563, + "step": 7127 + }, + { + "epoch": 0.18302705253072837, + "grad_norm": 0.890625, + "learning_rate": 0.00019290311240768116, + "loss": 0.974, + "step": 7128 + }, + { + "epoch": 0.1830527297266502, + "grad_norm": 0.77734375, + "learning_rate": 0.00019290146055342232, + "loss": 1.0726, + "step": 7129 + }, + { + "epoch": 0.18307840692257202, + "grad_norm": 0.84765625, + "learning_rate": 0.00019289980851401928, + "loss": 0.9987, + "step": 7130 + }, + { + "epoch": 0.18310408411849383, + "grad_norm": 0.875, + "learning_rate": 0.00019289815628947525, + "loss": 1.0743, + "step": 7131 + }, + { + "epoch": 0.18312976131441566, + "grad_norm": 0.890625, + "learning_rate": 0.00019289650387979362, + "loss": 1.1024, + "step": 7132 + }, + { + "epoch": 0.18315543851033747, + "grad_norm": 0.77734375, + "learning_rate": 0.00019289485128497766, + "loss": 1.2485, + "step": 7133 + }, + { + "epoch": 0.1831811157062593, + "grad_norm": 0.7890625, + "learning_rate": 0.00019289319850503063, + "loss": 1.1168, + "step": 7134 + }, + { + "epoch": 0.18320679290218111, + "grad_norm": 0.86328125, + "learning_rate": 0.00019289154553995584, + "loss": 1.0531, + "step": 7135 + }, + { + "epoch": 0.18323247009810292, + "grad_norm": 0.78125, + "learning_rate": 0.00019288989238975664, + "loss": 1.0684, + "step": 7136 + }, + { + "epoch": 0.18325814729402476, + "grad_norm": 0.7109375, + "learning_rate": 0.00019288823905443624, + "loss": 0.9193, + "step": 7137 + }, + { + "epoch": 0.18328382448994657, + "grad_norm": 0.796875, + "learning_rate": 0.000192886585533998, + "loss": 1.0242, + "step": 7138 + }, + { + "epoch": 0.1833095016858684, + "grad_norm": 0.8046875, + "learning_rate": 0.00019288493182844513, + "loss": 0.9409, + "step": 7139 + }, + { + "epoch": 0.1833351788817902, + "grad_norm": 0.76953125, + "learning_rate": 0.000192883277937781, + "loss": 0.9727, + "step": 7140 + }, + { + "epoch": 0.18336085607771202, + "grad_norm": 0.796875, + "learning_rate": 0.0001928816238620089, + "loss": 1.0923, + "step": 7141 + }, + { + "epoch": 0.18338653327363386, + "grad_norm": 0.80078125, + "learning_rate": 0.00019287996960113213, + "loss": 1.0756, + "step": 7142 + }, + { + "epoch": 0.18341221046955566, + "grad_norm": 0.8671875, + "learning_rate": 0.00019287831515515393, + "loss": 1.1563, + "step": 7143 + }, + { + "epoch": 0.1834378876654775, + "grad_norm": 0.90625, + "learning_rate": 0.00019287666052407765, + "loss": 1.0208, + "step": 7144 + }, + { + "epoch": 0.1834635648613993, + "grad_norm": 0.8046875, + "learning_rate": 0.0001928750057079066, + "loss": 0.9609, + "step": 7145 + }, + { + "epoch": 0.18348924205732112, + "grad_norm": 0.875, + "learning_rate": 0.00019287335070664402, + "loss": 1.0793, + "step": 7146 + }, + { + "epoch": 0.18351491925324295, + "grad_norm": 0.890625, + "learning_rate": 0.00019287169552029327, + "loss": 1.0699, + "step": 7147 + }, + { + "epoch": 0.18354059644916476, + "grad_norm": 0.8515625, + "learning_rate": 0.00019287004014885762, + "loss": 1.0508, + "step": 7148 + }, + { + "epoch": 0.1835662736450866, + "grad_norm": 0.7890625, + "learning_rate": 0.00019286838459234033, + "loss": 0.949, + "step": 7149 + }, + { + "epoch": 0.1835919508410084, + "grad_norm": 0.7578125, + "learning_rate": 0.00019286672885074477, + "loss": 0.9347, + "step": 7150 + }, + { + "epoch": 0.1836176280369302, + "grad_norm": 0.94140625, + "learning_rate": 0.0001928650729240742, + "loss": 1.0136, + "step": 7151 + }, + { + "epoch": 0.18364330523285205, + "grad_norm": 0.90234375, + "learning_rate": 0.00019286341681233195, + "loss": 1.1008, + "step": 7152 + }, + { + "epoch": 0.18366898242877386, + "grad_norm": 0.78125, + "learning_rate": 0.00019286176051552128, + "loss": 1.0155, + "step": 7153 + }, + { + "epoch": 0.1836946596246957, + "grad_norm": 0.80078125, + "learning_rate": 0.00019286010403364552, + "loss": 1.0366, + "step": 7154 + }, + { + "epoch": 0.1837203368206175, + "grad_norm": 0.79296875, + "learning_rate": 0.00019285844736670793, + "loss": 1.0281, + "step": 7155 + }, + { + "epoch": 0.1837460140165393, + "grad_norm": 0.85546875, + "learning_rate": 0.00019285679051471186, + "loss": 1.028, + "step": 7156 + }, + { + "epoch": 0.18377169121246115, + "grad_norm": 0.921875, + "learning_rate": 0.0001928551334776606, + "loss": 1.0353, + "step": 7157 + }, + { + "epoch": 0.18379736840838296, + "grad_norm": 0.83203125, + "learning_rate": 0.00019285347625555744, + "loss": 1.0957, + "step": 7158 + }, + { + "epoch": 0.1838230456043048, + "grad_norm": 0.76171875, + "learning_rate": 0.00019285181884840568, + "loss": 1.2185, + "step": 7159 + }, + { + "epoch": 0.1838487228002266, + "grad_norm": 0.8046875, + "learning_rate": 0.00019285016125620866, + "loss": 0.9623, + "step": 7160 + }, + { + "epoch": 0.1838743999961484, + "grad_norm": 0.83203125, + "learning_rate": 0.00019284850347896964, + "loss": 0.9444, + "step": 7161 + }, + { + "epoch": 0.18390007719207024, + "grad_norm": 0.82421875, + "learning_rate": 0.00019284684551669194, + "loss": 1.0395, + "step": 7162 + }, + { + "epoch": 0.18392575438799205, + "grad_norm": 0.80859375, + "learning_rate": 0.00019284518736937886, + "loss": 1.0039, + "step": 7163 + }, + { + "epoch": 0.1839514315839139, + "grad_norm": 0.8515625, + "learning_rate": 0.00019284352903703371, + "loss": 0.8838, + "step": 7164 + }, + { + "epoch": 0.1839771087798357, + "grad_norm": 0.828125, + "learning_rate": 0.0001928418705196598, + "loss": 1.243, + "step": 7165 + }, + { + "epoch": 0.1840027859757575, + "grad_norm": 0.80859375, + "learning_rate": 0.00019284021181726043, + "loss": 1.0762, + "step": 7166 + }, + { + "epoch": 0.18402846317167934, + "grad_norm": 0.80859375, + "learning_rate": 0.00019283855292983888, + "loss": 1.049, + "step": 7167 + }, + { + "epoch": 0.18405414036760115, + "grad_norm": 0.8046875, + "learning_rate": 0.0001928368938573985, + "loss": 0.9824, + "step": 7168 + }, + { + "epoch": 0.18407981756352298, + "grad_norm": 0.9140625, + "learning_rate": 0.00019283523459994256, + "loss": 1.2787, + "step": 7169 + }, + { + "epoch": 0.1841054947594448, + "grad_norm": 0.796875, + "learning_rate": 0.0001928335751574744, + "loss": 1.1944, + "step": 7170 + }, + { + "epoch": 0.1841311719553666, + "grad_norm": 0.8984375, + "learning_rate": 0.0001928319155299973, + "loss": 1.0327, + "step": 7171 + }, + { + "epoch": 0.18415684915128844, + "grad_norm": 0.83203125, + "learning_rate": 0.0001928302557175146, + "loss": 0.9691, + "step": 7172 + }, + { + "epoch": 0.18418252634721025, + "grad_norm": 0.796875, + "learning_rate": 0.00019282859572002957, + "loss": 0.9827, + "step": 7173 + }, + { + "epoch": 0.18420820354313208, + "grad_norm": 0.9140625, + "learning_rate": 0.00019282693553754554, + "loss": 1.1087, + "step": 7174 + }, + { + "epoch": 0.1842338807390539, + "grad_norm": 0.85546875, + "learning_rate": 0.00019282527517006578, + "loss": 1.1001, + "step": 7175 + }, + { + "epoch": 0.1842595579349757, + "grad_norm": 0.86328125, + "learning_rate": 0.0001928236146175937, + "loss": 1.16, + "step": 7176 + }, + { + "epoch": 0.18428523513089753, + "grad_norm": 0.83984375, + "learning_rate": 0.0001928219538801325, + "loss": 1.0234, + "step": 7177 + }, + { + "epoch": 0.18431091232681934, + "grad_norm": 0.953125, + "learning_rate": 0.00019282029295768552, + "loss": 1.2025, + "step": 7178 + }, + { + "epoch": 0.18433658952274118, + "grad_norm": 0.96875, + "learning_rate": 0.00019281863185025607, + "loss": 1.0963, + "step": 7179 + }, + { + "epoch": 0.184362266718663, + "grad_norm": 0.82421875, + "learning_rate": 0.00019281697055784747, + "loss": 0.9719, + "step": 7180 + }, + { + "epoch": 0.1843879439145848, + "grad_norm": 0.890625, + "learning_rate": 0.0001928153090804631, + "loss": 1.0496, + "step": 7181 + }, + { + "epoch": 0.18441362111050663, + "grad_norm": 0.796875, + "learning_rate": 0.00019281364741810614, + "loss": 1.023, + "step": 7182 + }, + { + "epoch": 0.18443929830642844, + "grad_norm": 0.7890625, + "learning_rate": 0.00019281198557077997, + "loss": 1.0248, + "step": 7183 + }, + { + "epoch": 0.18446497550235028, + "grad_norm": 0.8125, + "learning_rate": 0.0001928103235384879, + "loss": 1.0558, + "step": 7184 + }, + { + "epoch": 0.18449065269827208, + "grad_norm": 0.83984375, + "learning_rate": 0.00019280866132123325, + "loss": 1.153, + "step": 7185 + }, + { + "epoch": 0.1845163298941939, + "grad_norm": 0.8515625, + "learning_rate": 0.00019280699891901932, + "loss": 1.0176, + "step": 7186 + }, + { + "epoch": 0.18454200709011573, + "grad_norm": 1.40625, + "learning_rate": 0.00019280533633184944, + "loss": 0.9979, + "step": 7187 + }, + { + "epoch": 0.18456768428603754, + "grad_norm": 0.859375, + "learning_rate": 0.00019280367355972686, + "loss": 1.0273, + "step": 7188 + }, + { + "epoch": 0.18459336148195937, + "grad_norm": 0.9609375, + "learning_rate": 0.00019280201060265499, + "loss": 1.0985, + "step": 7189 + }, + { + "epoch": 0.18461903867788118, + "grad_norm": 0.7890625, + "learning_rate": 0.00019280034746063706, + "loss": 0.9411, + "step": 7190 + }, + { + "epoch": 0.184644715873803, + "grad_norm": 0.8203125, + "learning_rate": 0.00019279868413367646, + "loss": 0.9774, + "step": 7191 + }, + { + "epoch": 0.18467039306972483, + "grad_norm": 0.91796875, + "learning_rate": 0.00019279702062177643, + "loss": 1.0248, + "step": 7192 + }, + { + "epoch": 0.18469607026564663, + "grad_norm": 0.7734375, + "learning_rate": 0.00019279535692494034, + "loss": 0.9225, + "step": 7193 + }, + { + "epoch": 0.18472174746156847, + "grad_norm": 0.76953125, + "learning_rate": 0.00019279369304317148, + "loss": 0.9754, + "step": 7194 + }, + { + "epoch": 0.18474742465749028, + "grad_norm": 0.80078125, + "learning_rate": 0.00019279202897647316, + "loss": 1.1526, + "step": 7195 + }, + { + "epoch": 0.1847731018534121, + "grad_norm": 0.89453125, + "learning_rate": 0.00019279036472484871, + "loss": 1.1293, + "step": 7196 + }, + { + "epoch": 0.18479877904933392, + "grad_norm": 0.80859375, + "learning_rate": 0.00019278870028830144, + "loss": 1.0922, + "step": 7197 + }, + { + "epoch": 0.18482445624525573, + "grad_norm": 0.78515625, + "learning_rate": 0.0001927870356668347, + "loss": 0.972, + "step": 7198 + }, + { + "epoch": 0.18485013344117757, + "grad_norm": 0.84765625, + "learning_rate": 0.00019278537086045176, + "loss": 0.9926, + "step": 7199 + }, + { + "epoch": 0.18487581063709937, + "grad_norm": 0.88671875, + "learning_rate": 0.00019278370586915596, + "loss": 1.1193, + "step": 7200 + }, + { + "epoch": 0.18490148783302118, + "grad_norm": 0.83203125, + "learning_rate": 0.00019278204069295062, + "loss": 1.006, + "step": 7201 + }, + { + "epoch": 0.18492716502894302, + "grad_norm": 0.859375, + "learning_rate": 0.00019278037533183905, + "loss": 1.1176, + "step": 7202 + }, + { + "epoch": 0.18495284222486483, + "grad_norm": 0.90625, + "learning_rate": 0.00019277870978582457, + "loss": 1.0001, + "step": 7203 + }, + { + "epoch": 0.18497851942078666, + "grad_norm": 0.88671875, + "learning_rate": 0.0001927770440549105, + "loss": 0.9396, + "step": 7204 + }, + { + "epoch": 0.18500419661670847, + "grad_norm": 0.796875, + "learning_rate": 0.00019277537813910017, + "loss": 1.0216, + "step": 7205 + }, + { + "epoch": 0.18502987381263028, + "grad_norm": 1.265625, + "learning_rate": 0.0001927737120383969, + "loss": 1.2669, + "step": 7206 + }, + { + "epoch": 0.18505555100855212, + "grad_norm": 0.796875, + "learning_rate": 0.000192772045752804, + "loss": 1.0766, + "step": 7207 + }, + { + "epoch": 0.18508122820447392, + "grad_norm": 0.76953125, + "learning_rate": 0.00019277037928232477, + "loss": 0.9506, + "step": 7208 + }, + { + "epoch": 0.18510690540039576, + "grad_norm": 0.796875, + "learning_rate": 0.00019276871262696258, + "loss": 1.2871, + "step": 7209 + }, + { + "epoch": 0.18513258259631757, + "grad_norm": 0.90234375, + "learning_rate": 0.00019276704578672068, + "loss": 1.1571, + "step": 7210 + }, + { + "epoch": 0.18515825979223938, + "grad_norm": 0.78515625, + "learning_rate": 0.00019276537876160247, + "loss": 1.0586, + "step": 7211 + }, + { + "epoch": 0.1851839369881612, + "grad_norm": 0.8046875, + "learning_rate": 0.00019276371155161126, + "loss": 1.1069, + "step": 7212 + }, + { + "epoch": 0.18520961418408302, + "grad_norm": 0.8125, + "learning_rate": 0.00019276204415675033, + "loss": 0.9842, + "step": 7213 + }, + { + "epoch": 0.18523529138000486, + "grad_norm": 0.71875, + "learning_rate": 0.00019276037657702303, + "loss": 1.1145, + "step": 7214 + }, + { + "epoch": 0.18526096857592667, + "grad_norm": 0.80859375, + "learning_rate": 0.00019275870881243264, + "loss": 1.0712, + "step": 7215 + }, + { + "epoch": 0.18528664577184847, + "grad_norm": 0.82421875, + "learning_rate": 0.0001927570408629826, + "loss": 0.9964, + "step": 7216 + }, + { + "epoch": 0.1853123229677703, + "grad_norm": 0.8046875, + "learning_rate": 0.0001927553727286761, + "loss": 0.983, + "step": 7217 + }, + { + "epoch": 0.18533800016369212, + "grad_norm": 0.76171875, + "learning_rate": 0.00019275370440951652, + "loss": 1.1615, + "step": 7218 + }, + { + "epoch": 0.18536367735961395, + "grad_norm": 0.7890625, + "learning_rate": 0.00019275203590550722, + "loss": 1.0426, + "step": 7219 + }, + { + "epoch": 0.18538935455553576, + "grad_norm": 0.7421875, + "learning_rate": 0.00019275036721665148, + "loss": 0.9977, + "step": 7220 + }, + { + "epoch": 0.18541503175145757, + "grad_norm": 0.80859375, + "learning_rate": 0.0001927486983429526, + "loss": 1.0675, + "step": 7221 + }, + { + "epoch": 0.1854407089473794, + "grad_norm": 0.8203125, + "learning_rate": 0.00019274702928441398, + "loss": 1.0039, + "step": 7222 + }, + { + "epoch": 0.18546638614330122, + "grad_norm": 0.84765625, + "learning_rate": 0.0001927453600410389, + "loss": 1.1128, + "step": 7223 + }, + { + "epoch": 0.18549206333922305, + "grad_norm": 0.85546875, + "learning_rate": 0.0001927436906128307, + "loss": 0.891, + "step": 7224 + }, + { + "epoch": 0.18551774053514486, + "grad_norm": 0.8515625, + "learning_rate": 0.00019274202099979268, + "loss": 0.8705, + "step": 7225 + }, + { + "epoch": 0.18554341773106667, + "grad_norm": 0.83984375, + "learning_rate": 0.00019274035120192822, + "loss": 1.0289, + "step": 7226 + }, + { + "epoch": 0.1855690949269885, + "grad_norm": 0.83984375, + "learning_rate": 0.00019273868121924057, + "loss": 0.9676, + "step": 7227 + }, + { + "epoch": 0.1855947721229103, + "grad_norm": 0.77734375, + "learning_rate": 0.00019273701105173316, + "loss": 1.0815, + "step": 7228 + }, + { + "epoch": 0.18562044931883215, + "grad_norm": 0.984375, + "learning_rate": 0.00019273534069940923, + "loss": 0.9336, + "step": 7229 + }, + { + "epoch": 0.18564612651475396, + "grad_norm": 0.81640625, + "learning_rate": 0.00019273367016227214, + "loss": 0.9934, + "step": 7230 + }, + { + "epoch": 0.18567180371067576, + "grad_norm": 0.8359375, + "learning_rate": 0.00019273199944032525, + "loss": 1.167, + "step": 7231 + }, + { + "epoch": 0.1856974809065976, + "grad_norm": 0.81640625, + "learning_rate": 0.00019273032853357185, + "loss": 0.9286, + "step": 7232 + }, + { + "epoch": 0.1857231581025194, + "grad_norm": 0.91015625, + "learning_rate": 0.00019272865744201527, + "loss": 1.0635, + "step": 7233 + }, + { + "epoch": 0.18574883529844124, + "grad_norm": 0.78515625, + "learning_rate": 0.00019272698616565887, + "loss": 1.0363, + "step": 7234 + }, + { + "epoch": 0.18577451249436305, + "grad_norm": 0.98046875, + "learning_rate": 0.00019272531470450595, + "loss": 1.0323, + "step": 7235 + }, + { + "epoch": 0.18580018969028486, + "grad_norm": 0.81640625, + "learning_rate": 0.00019272364305855986, + "loss": 1.1195, + "step": 7236 + }, + { + "epoch": 0.1858258668862067, + "grad_norm": 0.87109375, + "learning_rate": 0.00019272197122782391, + "loss": 1.18, + "step": 7237 + }, + { + "epoch": 0.1858515440821285, + "grad_norm": 0.796875, + "learning_rate": 0.00019272029921230146, + "loss": 1.0663, + "step": 7238 + }, + { + "epoch": 0.18587722127805034, + "grad_norm": 0.85546875, + "learning_rate": 0.00019271862701199583, + "loss": 1.0943, + "step": 7239 + }, + { + "epoch": 0.18590289847397215, + "grad_norm": 0.796875, + "learning_rate": 0.00019271695462691035, + "loss": 1.1705, + "step": 7240 + }, + { + "epoch": 0.18592857566989396, + "grad_norm": 0.796875, + "learning_rate": 0.00019271528205704836, + "loss": 1.1508, + "step": 7241 + }, + { + "epoch": 0.1859542528658158, + "grad_norm": 0.8515625, + "learning_rate": 0.00019271360930241317, + "loss": 1.0598, + "step": 7242 + }, + { + "epoch": 0.1859799300617376, + "grad_norm": 1.2578125, + "learning_rate": 0.00019271193636300816, + "loss": 0.8719, + "step": 7243 + }, + { + "epoch": 0.18600560725765944, + "grad_norm": 0.84375, + "learning_rate": 0.00019271026323883662, + "loss": 1.1475, + "step": 7244 + }, + { + "epoch": 0.18603128445358125, + "grad_norm": 0.8359375, + "learning_rate": 0.0001927085899299019, + "loss": 1.1726, + "step": 7245 + }, + { + "epoch": 0.18605696164950306, + "grad_norm": 0.87109375, + "learning_rate": 0.00019270691643620734, + "loss": 1.1236, + "step": 7246 + }, + { + "epoch": 0.1860826388454249, + "grad_norm": 0.86328125, + "learning_rate": 0.00019270524275775624, + "loss": 1.0411, + "step": 7247 + }, + { + "epoch": 0.1861083160413467, + "grad_norm": 0.83203125, + "learning_rate": 0.00019270356889455198, + "loss": 1.1508, + "step": 7248 + }, + { + "epoch": 0.18613399323726854, + "grad_norm": 0.88671875, + "learning_rate": 0.0001927018948465979, + "loss": 1.087, + "step": 7249 + }, + { + "epoch": 0.18615967043319034, + "grad_norm": 0.7734375, + "learning_rate": 0.0001927002206138973, + "loss": 0.9906, + "step": 7250 + }, + { + "epoch": 0.18618534762911215, + "grad_norm": 0.9453125, + "learning_rate": 0.00019269854619645353, + "loss": 0.9821, + "step": 7251 + }, + { + "epoch": 0.186211024825034, + "grad_norm": 0.8515625, + "learning_rate": 0.00019269687159426994, + "loss": 0.8603, + "step": 7252 + }, + { + "epoch": 0.1862367020209558, + "grad_norm": 0.83984375, + "learning_rate": 0.00019269519680734987, + "loss": 1.0325, + "step": 7253 + }, + { + "epoch": 0.18626237921687763, + "grad_norm": 0.7734375, + "learning_rate": 0.0001926935218356966, + "loss": 1.0374, + "step": 7254 + }, + { + "epoch": 0.18628805641279944, + "grad_norm": 0.83203125, + "learning_rate": 0.0001926918466793136, + "loss": 1.0827, + "step": 7255 + }, + { + "epoch": 0.18631373360872125, + "grad_norm": 0.80859375, + "learning_rate": 0.00019269017133820407, + "loss": 1.0298, + "step": 7256 + }, + { + "epoch": 0.18633941080464309, + "grad_norm": 0.765625, + "learning_rate": 0.00019268849581237138, + "loss": 1.1179, + "step": 7257 + }, + { + "epoch": 0.1863650880005649, + "grad_norm": 0.8671875, + "learning_rate": 0.0001926868201018189, + "loss": 1.1323, + "step": 7258 + }, + { + "epoch": 0.18639076519648673, + "grad_norm": 0.828125, + "learning_rate": 0.00019268514420655, + "loss": 1.0862, + "step": 7259 + }, + { + "epoch": 0.18641644239240854, + "grad_norm": 0.85546875, + "learning_rate": 0.00019268346812656794, + "loss": 1.0203, + "step": 7260 + }, + { + "epoch": 0.18644211958833035, + "grad_norm": 0.8828125, + "learning_rate": 0.0001926817918618761, + "loss": 1.0118, + "step": 7261 + }, + { + "epoch": 0.18646779678425218, + "grad_norm": 0.81640625, + "learning_rate": 0.00019268011541247787, + "loss": 1.0126, + "step": 7262 + }, + { + "epoch": 0.186493473980174, + "grad_norm": 0.8515625, + "learning_rate": 0.00019267843877837648, + "loss": 1.199, + "step": 7263 + }, + { + "epoch": 0.18651915117609583, + "grad_norm": 0.75, + "learning_rate": 0.00019267676195957535, + "loss": 0.8663, + "step": 7264 + }, + { + "epoch": 0.18654482837201763, + "grad_norm": 0.78125, + "learning_rate": 0.00019267508495607782, + "loss": 0.9906, + "step": 7265 + }, + { + "epoch": 0.18657050556793944, + "grad_norm": 0.77734375, + "learning_rate": 0.0001926734077678872, + "loss": 0.9581, + "step": 7266 + }, + { + "epoch": 0.18659618276386128, + "grad_norm": 0.83203125, + "learning_rate": 0.00019267173039500687, + "loss": 0.9714, + "step": 7267 + }, + { + "epoch": 0.1866218599597831, + "grad_norm": 0.83203125, + "learning_rate": 0.00019267005283744016, + "loss": 0.9488, + "step": 7268 + }, + { + "epoch": 0.18664753715570492, + "grad_norm": 0.81640625, + "learning_rate": 0.00019266837509519035, + "loss": 0.9545, + "step": 7269 + }, + { + "epoch": 0.18667321435162673, + "grad_norm": 0.8828125, + "learning_rate": 0.0001926666971682609, + "loss": 1.1111, + "step": 7270 + }, + { + "epoch": 0.18669889154754854, + "grad_norm": 0.92578125, + "learning_rate": 0.00019266501905665506, + "loss": 0.9965, + "step": 7271 + }, + { + "epoch": 0.18672456874347038, + "grad_norm": 0.8359375, + "learning_rate": 0.00019266334076037622, + "loss": 1.0164, + "step": 7272 + }, + { + "epoch": 0.18675024593939218, + "grad_norm": 0.79296875, + "learning_rate": 0.00019266166227942773, + "loss": 0.9913, + "step": 7273 + }, + { + "epoch": 0.18677592313531402, + "grad_norm": 0.859375, + "learning_rate": 0.00019265998361381288, + "loss": 1.0392, + "step": 7274 + }, + { + "epoch": 0.18680160033123583, + "grad_norm": 0.765625, + "learning_rate": 0.00019265830476353508, + "loss": 1.1915, + "step": 7275 + }, + { + "epoch": 0.18682727752715764, + "grad_norm": 0.84375, + "learning_rate": 0.0001926566257285976, + "loss": 1.0255, + "step": 7276 + }, + { + "epoch": 0.18685295472307947, + "grad_norm": 0.83984375, + "learning_rate": 0.0001926549465090039, + "loss": 1.1962, + "step": 7277 + }, + { + "epoch": 0.18687863191900128, + "grad_norm": 0.859375, + "learning_rate": 0.00019265326710475722, + "loss": 1.1343, + "step": 7278 + }, + { + "epoch": 0.18690430911492312, + "grad_norm": 0.87890625, + "learning_rate": 0.00019265158751586098, + "loss": 1.0692, + "step": 7279 + }, + { + "epoch": 0.18692998631084493, + "grad_norm": 0.82421875, + "learning_rate": 0.00019264990774231846, + "loss": 1.0187, + "step": 7280 + }, + { + "epoch": 0.18695566350676673, + "grad_norm": 0.80859375, + "learning_rate": 0.00019264822778413304, + "loss": 0.9874, + "step": 7281 + }, + { + "epoch": 0.18698134070268857, + "grad_norm": 0.828125, + "learning_rate": 0.00019264654764130813, + "loss": 0.9966, + "step": 7282 + }, + { + "epoch": 0.18700701789861038, + "grad_norm": 0.78515625, + "learning_rate": 0.00019264486731384697, + "loss": 1.1516, + "step": 7283 + }, + { + "epoch": 0.18703269509453221, + "grad_norm": 0.78515625, + "learning_rate": 0.00019264318680175296, + "loss": 1.0105, + "step": 7284 + }, + { + "epoch": 0.18705837229045402, + "grad_norm": 0.90234375, + "learning_rate": 0.00019264150610502945, + "loss": 1.0893, + "step": 7285 + }, + { + "epoch": 0.18708404948637583, + "grad_norm": 0.96875, + "learning_rate": 0.0001926398252236798, + "loss": 1.0928, + "step": 7286 + }, + { + "epoch": 0.18710972668229767, + "grad_norm": 0.80859375, + "learning_rate": 0.00019263814415770733, + "loss": 0.8376, + "step": 7287 + }, + { + "epoch": 0.18713540387821948, + "grad_norm": 0.79296875, + "learning_rate": 0.0001926364629071154, + "loss": 1.041, + "step": 7288 + }, + { + "epoch": 0.1871610810741413, + "grad_norm": 0.86328125, + "learning_rate": 0.00019263478147190738, + "loss": 1.0813, + "step": 7289 + }, + { + "epoch": 0.18718675827006312, + "grad_norm": 0.84375, + "learning_rate": 0.0001926330998520866, + "loss": 1.0645, + "step": 7290 + }, + { + "epoch": 0.18721243546598493, + "grad_norm": 0.85546875, + "learning_rate": 0.00019263141804765646, + "loss": 1.1034, + "step": 7291 + }, + { + "epoch": 0.18723811266190676, + "grad_norm": 0.81640625, + "learning_rate": 0.00019262973605862024, + "loss": 1.1094, + "step": 7292 + }, + { + "epoch": 0.18726378985782857, + "grad_norm": 0.90234375, + "learning_rate": 0.0001926280538849813, + "loss": 1.1591, + "step": 7293 + }, + { + "epoch": 0.1872894670537504, + "grad_norm": 0.88671875, + "learning_rate": 0.00019262637152674307, + "loss": 1.1695, + "step": 7294 + }, + { + "epoch": 0.18731514424967222, + "grad_norm": 0.76953125, + "learning_rate": 0.00019262468898390882, + "loss": 0.9694, + "step": 7295 + }, + { + "epoch": 0.18734082144559402, + "grad_norm": 0.77734375, + "learning_rate": 0.0001926230062564819, + "loss": 1.0242, + "step": 7296 + }, + { + "epoch": 0.18736649864151586, + "grad_norm": 0.8359375, + "learning_rate": 0.00019262132334446573, + "loss": 0.8934, + "step": 7297 + }, + { + "epoch": 0.18739217583743767, + "grad_norm": 0.87890625, + "learning_rate": 0.00019261964024786364, + "loss": 1.0467, + "step": 7298 + }, + { + "epoch": 0.1874178530333595, + "grad_norm": 0.83984375, + "learning_rate": 0.00019261795696667896, + "loss": 1.1557, + "step": 7299 + }, + { + "epoch": 0.1874435302292813, + "grad_norm": 0.76953125, + "learning_rate": 0.00019261627350091506, + "loss": 0.9663, + "step": 7300 + }, + { + "epoch": 0.18746920742520312, + "grad_norm": 0.8359375, + "learning_rate": 0.00019261458985057525, + "loss": 1.1337, + "step": 7301 + }, + { + "epoch": 0.18749488462112496, + "grad_norm": 0.78125, + "learning_rate": 0.00019261290601566298, + "loss": 1.0442, + "step": 7302 + }, + { + "epoch": 0.18752056181704677, + "grad_norm": 0.82421875, + "learning_rate": 0.00019261122199618153, + "loss": 0.9733, + "step": 7303 + }, + { + "epoch": 0.1875462390129686, + "grad_norm": 0.7734375, + "learning_rate": 0.00019260953779213428, + "loss": 1.0547, + "step": 7304 + }, + { + "epoch": 0.1875719162088904, + "grad_norm": 0.80078125, + "learning_rate": 0.00019260785340352463, + "loss": 1.0853, + "step": 7305 + }, + { + "epoch": 0.18759759340481222, + "grad_norm": 0.75, + "learning_rate": 0.00019260616883035584, + "loss": 0.9217, + "step": 7306 + }, + { + "epoch": 0.18762327060073405, + "grad_norm": 0.90234375, + "learning_rate": 0.00019260448407263137, + "loss": 1.0132, + "step": 7307 + }, + { + "epoch": 0.18764894779665586, + "grad_norm": 0.8359375, + "learning_rate": 0.00019260279913035447, + "loss": 1.0158, + "step": 7308 + }, + { + "epoch": 0.1876746249925777, + "grad_norm": 0.80078125, + "learning_rate": 0.0001926011140035286, + "loss": 1.0518, + "step": 7309 + }, + { + "epoch": 0.1877003021884995, + "grad_norm": 0.8515625, + "learning_rate": 0.00019259942869215703, + "loss": 0.929, + "step": 7310 + }, + { + "epoch": 0.18772597938442132, + "grad_norm": 0.8359375, + "learning_rate": 0.0001925977431962432, + "loss": 0.8723, + "step": 7311 + }, + { + "epoch": 0.18775165658034315, + "grad_norm": 0.8046875, + "learning_rate": 0.00019259605751579043, + "loss": 1.1093, + "step": 7312 + }, + { + "epoch": 0.18777733377626496, + "grad_norm": 0.9765625, + "learning_rate": 0.00019259437165080207, + "loss": 1.0334, + "step": 7313 + }, + { + "epoch": 0.1878030109721868, + "grad_norm": 1.1328125, + "learning_rate": 0.0001925926856012815, + "loss": 1.0187, + "step": 7314 + }, + { + "epoch": 0.1878286881681086, + "grad_norm": 0.73828125, + "learning_rate": 0.0001925909993672321, + "loss": 0.9141, + "step": 7315 + }, + { + "epoch": 0.1878543653640304, + "grad_norm": 0.828125, + "learning_rate": 0.00019258931294865715, + "loss": 0.9645, + "step": 7316 + }, + { + "epoch": 0.18788004255995225, + "grad_norm": 0.84765625, + "learning_rate": 0.00019258762634556006, + "loss": 1.1121, + "step": 7317 + }, + { + "epoch": 0.18790571975587406, + "grad_norm": 0.78125, + "learning_rate": 0.00019258593955794425, + "loss": 1.074, + "step": 7318 + }, + { + "epoch": 0.1879313969517959, + "grad_norm": 0.81640625, + "learning_rate": 0.00019258425258581298, + "loss": 0.9116, + "step": 7319 + }, + { + "epoch": 0.1879570741477177, + "grad_norm": 0.890625, + "learning_rate": 0.00019258256542916967, + "loss": 1.1059, + "step": 7320 + }, + { + "epoch": 0.1879827513436395, + "grad_norm": 0.77734375, + "learning_rate": 0.00019258087808801766, + "loss": 1.0636, + "step": 7321 + }, + { + "epoch": 0.18800842853956135, + "grad_norm": 0.85546875, + "learning_rate": 0.00019257919056236037, + "loss": 1.1475, + "step": 7322 + }, + { + "epoch": 0.18803410573548315, + "grad_norm": 0.8828125, + "learning_rate": 0.00019257750285220105, + "loss": 1.1447, + "step": 7323 + }, + { + "epoch": 0.188059782931405, + "grad_norm": 0.82421875, + "learning_rate": 0.00019257581495754319, + "loss": 1.0258, + "step": 7324 + }, + { + "epoch": 0.1880854601273268, + "grad_norm": 0.859375, + "learning_rate": 0.00019257412687839005, + "loss": 1.0755, + "step": 7325 + }, + { + "epoch": 0.1881111373232486, + "grad_norm": 0.875, + "learning_rate": 0.00019257243861474504, + "loss": 1.1173, + "step": 7326 + }, + { + "epoch": 0.18813681451917044, + "grad_norm": 0.765625, + "learning_rate": 0.00019257075016661154, + "loss": 0.9694, + "step": 7327 + }, + { + "epoch": 0.18816249171509225, + "grad_norm": 0.7890625, + "learning_rate": 0.00019256906153399287, + "loss": 1.1132, + "step": 7328 + }, + { + "epoch": 0.1881881689110141, + "grad_norm": 0.828125, + "learning_rate": 0.00019256737271689244, + "loss": 1.0062, + "step": 7329 + }, + { + "epoch": 0.1882138461069359, + "grad_norm": 0.765625, + "learning_rate": 0.0001925656837153136, + "loss": 0.9185, + "step": 7330 + }, + { + "epoch": 0.1882395233028577, + "grad_norm": 0.87109375, + "learning_rate": 0.0001925639945292597, + "loss": 0.9631, + "step": 7331 + }, + { + "epoch": 0.18826520049877954, + "grad_norm": 0.77734375, + "learning_rate": 0.00019256230515873416, + "loss": 1.0779, + "step": 7332 + }, + { + "epoch": 0.18829087769470135, + "grad_norm": 0.7578125, + "learning_rate": 0.00019256061560374025, + "loss": 1.1849, + "step": 7333 + }, + { + "epoch": 0.18831655489062316, + "grad_norm": 0.94140625, + "learning_rate": 0.00019255892586428144, + "loss": 1.0803, + "step": 7334 + }, + { + "epoch": 0.188342232086545, + "grad_norm": 0.8046875, + "learning_rate": 0.00019255723594036105, + "loss": 0.9991, + "step": 7335 + }, + { + "epoch": 0.1883679092824668, + "grad_norm": 0.77734375, + "learning_rate": 0.0001925555458319824, + "loss": 1.1113, + "step": 7336 + }, + { + "epoch": 0.18839358647838864, + "grad_norm": 0.79296875, + "learning_rate": 0.00019255385553914895, + "loss": 1.1248, + "step": 7337 + }, + { + "epoch": 0.18841926367431044, + "grad_norm": 0.7890625, + "learning_rate": 0.00019255216506186402, + "loss": 1.1272, + "step": 7338 + }, + { + "epoch": 0.18844494087023225, + "grad_norm": 0.83984375, + "learning_rate": 0.00019255047440013099, + "loss": 0.9631, + "step": 7339 + }, + { + "epoch": 0.1884706180661541, + "grad_norm": 0.80859375, + "learning_rate": 0.00019254878355395323, + "loss": 1.0256, + "step": 7340 + }, + { + "epoch": 0.1884962952620759, + "grad_norm": 0.74609375, + "learning_rate": 0.00019254709252333407, + "loss": 1.0039, + "step": 7341 + }, + { + "epoch": 0.18852197245799773, + "grad_norm": 0.84765625, + "learning_rate": 0.00019254540130827694, + "loss": 1.1042, + "step": 7342 + }, + { + "epoch": 0.18854764965391954, + "grad_norm": 0.8203125, + "learning_rate": 0.00019254370990878518, + "loss": 1.1243, + "step": 7343 + }, + { + "epoch": 0.18857332684984135, + "grad_norm": 0.7890625, + "learning_rate": 0.00019254201832486216, + "loss": 0.9947, + "step": 7344 + }, + { + "epoch": 0.18859900404576319, + "grad_norm": 0.83984375, + "learning_rate": 0.0001925403265565113, + "loss": 1.1229, + "step": 7345 + }, + { + "epoch": 0.188624681241685, + "grad_norm": 0.82421875, + "learning_rate": 0.00019253863460373584, + "loss": 1.0673, + "step": 7346 + }, + { + "epoch": 0.18865035843760683, + "grad_norm": 0.77734375, + "learning_rate": 0.00019253694246653931, + "loss": 1.1391, + "step": 7347 + }, + { + "epoch": 0.18867603563352864, + "grad_norm": 0.80078125, + "learning_rate": 0.00019253525014492498, + "loss": 1.0791, + "step": 7348 + }, + { + "epoch": 0.18870171282945045, + "grad_norm": 0.8046875, + "learning_rate": 0.0001925335576388963, + "loss": 0.9558, + "step": 7349 + }, + { + "epoch": 0.18872739002537228, + "grad_norm": 0.765625, + "learning_rate": 0.00019253186494845657, + "loss": 0.9704, + "step": 7350 + }, + { + "epoch": 0.1887530672212941, + "grad_norm": 0.80859375, + "learning_rate": 0.00019253017207360916, + "loss": 0.988, + "step": 7351 + }, + { + "epoch": 0.18877874441721593, + "grad_norm": 0.8671875, + "learning_rate": 0.0001925284790143575, + "loss": 1.0431, + "step": 7352 + }, + { + "epoch": 0.18880442161313774, + "grad_norm": 0.83984375, + "learning_rate": 0.00019252678577070498, + "loss": 1.2003, + "step": 7353 + }, + { + "epoch": 0.18883009880905954, + "grad_norm": 0.85546875, + "learning_rate": 0.00019252509234265488, + "loss": 1.001, + "step": 7354 + }, + { + "epoch": 0.18885577600498138, + "grad_norm": 0.828125, + "learning_rate": 0.00019252339873021063, + "loss": 1.048, + "step": 7355 + }, + { + "epoch": 0.1888814532009032, + "grad_norm": 0.796875, + "learning_rate": 0.0001925217049333756, + "loss": 1.0897, + "step": 7356 + }, + { + "epoch": 0.18890713039682502, + "grad_norm": 0.77734375, + "learning_rate": 0.00019252001095215323, + "loss": 1.0339, + "step": 7357 + }, + { + "epoch": 0.18893280759274683, + "grad_norm": 0.9296875, + "learning_rate": 0.00019251831678654679, + "loss": 1.0525, + "step": 7358 + }, + { + "epoch": 0.18895848478866864, + "grad_norm": 0.86328125, + "learning_rate": 0.0001925166224365597, + "loss": 1.0202, + "step": 7359 + }, + { + "epoch": 0.18898416198459048, + "grad_norm": 0.81640625, + "learning_rate": 0.00019251492790219534, + "loss": 0.9823, + "step": 7360 + }, + { + "epoch": 0.18900983918051228, + "grad_norm": 0.80078125, + "learning_rate": 0.0001925132331834571, + "loss": 0.9724, + "step": 7361 + }, + { + "epoch": 0.18903551637643412, + "grad_norm": 0.890625, + "learning_rate": 0.00019251153828034833, + "loss": 1.0214, + "step": 7362 + }, + { + "epoch": 0.18906119357235593, + "grad_norm": 0.93359375, + "learning_rate": 0.00019250984319287243, + "loss": 1.0783, + "step": 7363 + }, + { + "epoch": 0.18908687076827774, + "grad_norm": 0.828125, + "learning_rate": 0.00019250814792103274, + "loss": 1.0184, + "step": 7364 + }, + { + "epoch": 0.18911254796419957, + "grad_norm": 0.953125, + "learning_rate": 0.0001925064524648327, + "loss": 1.0069, + "step": 7365 + }, + { + "epoch": 0.18913822516012138, + "grad_norm": 0.83203125, + "learning_rate": 0.00019250475682427567, + "loss": 1.003, + "step": 7366 + }, + { + "epoch": 0.18916390235604322, + "grad_norm": 0.828125, + "learning_rate": 0.000192503060999365, + "loss": 1.0069, + "step": 7367 + }, + { + "epoch": 0.18918957955196503, + "grad_norm": 0.90234375, + "learning_rate": 0.00019250136499010404, + "loss": 1.104, + "step": 7368 + }, + { + "epoch": 0.18921525674788683, + "grad_norm": 0.83984375, + "learning_rate": 0.00019249966879649627, + "loss": 0.9509, + "step": 7369 + }, + { + "epoch": 0.18924093394380867, + "grad_norm": 0.80078125, + "learning_rate": 0.000192497972418545, + "loss": 1.1125, + "step": 7370 + }, + { + "epoch": 0.18926661113973048, + "grad_norm": 0.84765625, + "learning_rate": 0.00019249627585625362, + "loss": 1.0897, + "step": 7371 + }, + { + "epoch": 0.18929228833565231, + "grad_norm": 1.28125, + "learning_rate": 0.00019249457910962555, + "loss": 1.1876, + "step": 7372 + }, + { + "epoch": 0.18931796553157412, + "grad_norm": 0.828125, + "learning_rate": 0.0001924928821786641, + "loss": 1.0014, + "step": 7373 + }, + { + "epoch": 0.18934364272749593, + "grad_norm": 0.87109375, + "learning_rate": 0.0001924911850633727, + "loss": 1.133, + "step": 7374 + }, + { + "epoch": 0.18936931992341777, + "grad_norm": 0.93359375, + "learning_rate": 0.00019248948776375474, + "loss": 1.0279, + "step": 7375 + }, + { + "epoch": 0.18939499711933958, + "grad_norm": 0.76171875, + "learning_rate": 0.00019248779027981354, + "loss": 0.8717, + "step": 7376 + }, + { + "epoch": 0.1894206743152614, + "grad_norm": 0.84765625, + "learning_rate": 0.00019248609261155255, + "loss": 1.0225, + "step": 7377 + }, + { + "epoch": 0.18944635151118322, + "grad_norm": 0.78515625, + "learning_rate": 0.00019248439475897514, + "loss": 1.0779, + "step": 7378 + }, + { + "epoch": 0.18947202870710503, + "grad_norm": 0.84375, + "learning_rate": 0.0001924826967220847, + "loss": 1.1464, + "step": 7379 + }, + { + "epoch": 0.18949770590302686, + "grad_norm": 1.15625, + "learning_rate": 0.0001924809985008846, + "loss": 0.9914, + "step": 7380 + }, + { + "epoch": 0.18952338309894867, + "grad_norm": 0.71875, + "learning_rate": 0.0001924793000953782, + "loss": 1.0232, + "step": 7381 + }, + { + "epoch": 0.1895490602948705, + "grad_norm": 0.78125, + "learning_rate": 0.00019247760150556894, + "loss": 1.1286, + "step": 7382 + }, + { + "epoch": 0.18957473749079232, + "grad_norm": 0.90234375, + "learning_rate": 0.00019247590273146012, + "loss": 0.9517, + "step": 7383 + }, + { + "epoch": 0.18960041468671412, + "grad_norm": 1.921875, + "learning_rate": 0.00019247420377305522, + "loss": 1.1183, + "step": 7384 + }, + { + "epoch": 0.18962609188263596, + "grad_norm": 0.796875, + "learning_rate": 0.00019247250463035758, + "loss": 1.1047, + "step": 7385 + }, + { + "epoch": 0.18965176907855777, + "grad_norm": 0.80078125, + "learning_rate": 0.00019247080530337057, + "loss": 1.0489, + "step": 7386 + }, + { + "epoch": 0.1896774462744796, + "grad_norm": 0.81640625, + "learning_rate": 0.00019246910579209762, + "loss": 1.0127, + "step": 7387 + }, + { + "epoch": 0.1897031234704014, + "grad_norm": 0.8515625, + "learning_rate": 0.00019246740609654207, + "loss": 0.9344, + "step": 7388 + }, + { + "epoch": 0.18972880066632322, + "grad_norm": 0.8125, + "learning_rate": 0.00019246570621670735, + "loss": 0.9804, + "step": 7389 + }, + { + "epoch": 0.18975447786224506, + "grad_norm": 0.9140625, + "learning_rate": 0.00019246400615259684, + "loss": 1.1161, + "step": 7390 + }, + { + "epoch": 0.18978015505816687, + "grad_norm": 0.76171875, + "learning_rate": 0.00019246230590421392, + "loss": 0.9662, + "step": 7391 + }, + { + "epoch": 0.1898058322540887, + "grad_norm": 0.76171875, + "learning_rate": 0.00019246060547156195, + "loss": 0.9851, + "step": 7392 + }, + { + "epoch": 0.1898315094500105, + "grad_norm": 0.86328125, + "learning_rate": 0.00019245890485464437, + "loss": 0.9861, + "step": 7393 + }, + { + "epoch": 0.18985718664593232, + "grad_norm": 0.7578125, + "learning_rate": 0.00019245720405346455, + "loss": 1.0263, + "step": 7394 + }, + { + "epoch": 0.18988286384185415, + "grad_norm": 0.84375, + "learning_rate": 0.00019245550306802583, + "loss": 1.0054, + "step": 7395 + }, + { + "epoch": 0.18990854103777596, + "grad_norm": 0.77734375, + "learning_rate": 0.00019245380189833168, + "loss": 1.032, + "step": 7396 + }, + { + "epoch": 0.1899342182336978, + "grad_norm": 0.87890625, + "learning_rate": 0.00019245210054438547, + "loss": 1.0359, + "step": 7397 + }, + { + "epoch": 0.1899598954296196, + "grad_norm": 0.9375, + "learning_rate": 0.00019245039900619054, + "loss": 1.1514, + "step": 7398 + }, + { + "epoch": 0.18998557262554142, + "grad_norm": 0.83203125, + "learning_rate": 0.00019244869728375034, + "loss": 1.0537, + "step": 7399 + }, + { + "epoch": 0.19001124982146325, + "grad_norm": 0.84375, + "learning_rate": 0.0001924469953770682, + "loss": 1.1326, + "step": 7400 + }, + { + "epoch": 0.19003692701738506, + "grad_norm": 0.8984375, + "learning_rate": 0.00019244529328614758, + "loss": 1.1064, + "step": 7401 + }, + { + "epoch": 0.1900626042133069, + "grad_norm": 0.73828125, + "learning_rate": 0.00019244359101099184, + "loss": 0.874, + "step": 7402 + }, + { + "epoch": 0.1900882814092287, + "grad_norm": 0.875, + "learning_rate": 0.00019244188855160437, + "loss": 1.0421, + "step": 7403 + }, + { + "epoch": 0.1901139586051505, + "grad_norm": 0.9375, + "learning_rate": 0.00019244018590798857, + "loss": 1.1166, + "step": 7404 + }, + { + "epoch": 0.19013963580107235, + "grad_norm": 0.86328125, + "learning_rate": 0.0001924384830801478, + "loss": 1.1961, + "step": 7405 + }, + { + "epoch": 0.19016531299699416, + "grad_norm": 0.86328125, + "learning_rate": 0.00019243678006808552, + "loss": 1.0177, + "step": 7406 + }, + { + "epoch": 0.190190990192916, + "grad_norm": 0.78125, + "learning_rate": 0.00019243507687180506, + "loss": 1.1522, + "step": 7407 + }, + { + "epoch": 0.1902166673888378, + "grad_norm": 0.9296875, + "learning_rate": 0.00019243337349130987, + "loss": 0.9353, + "step": 7408 + }, + { + "epoch": 0.1902423445847596, + "grad_norm": 0.8671875, + "learning_rate": 0.0001924316699266033, + "loss": 1.0069, + "step": 7409 + }, + { + "epoch": 0.19026802178068145, + "grad_norm": 0.80078125, + "learning_rate": 0.00019242996617768876, + "loss": 0.9012, + "step": 7410 + }, + { + "epoch": 0.19029369897660325, + "grad_norm": 0.9140625, + "learning_rate": 0.00019242826224456967, + "loss": 1.0586, + "step": 7411 + }, + { + "epoch": 0.1903193761725251, + "grad_norm": 0.8359375, + "learning_rate": 0.00019242655812724938, + "loss": 1.0621, + "step": 7412 + }, + { + "epoch": 0.1903450533684469, + "grad_norm": 0.7578125, + "learning_rate": 0.0001924248538257313, + "loss": 1.0018, + "step": 7413 + }, + { + "epoch": 0.1903707305643687, + "grad_norm": 0.87890625, + "learning_rate": 0.00019242314934001886, + "loss": 1.1351, + "step": 7414 + }, + { + "epoch": 0.19039640776029054, + "grad_norm": 0.82421875, + "learning_rate": 0.0001924214446701154, + "loss": 1.0927, + "step": 7415 + }, + { + "epoch": 0.19042208495621235, + "grad_norm": 0.92578125, + "learning_rate": 0.00019241973981602438, + "loss": 1.155, + "step": 7416 + }, + { + "epoch": 0.1904477621521342, + "grad_norm": 0.828125, + "learning_rate": 0.00019241803477774915, + "loss": 0.9822, + "step": 7417 + }, + { + "epoch": 0.190473439348056, + "grad_norm": 0.70703125, + "learning_rate": 0.00019241632955529313, + "loss": 1.0451, + "step": 7418 + }, + { + "epoch": 0.1904991165439778, + "grad_norm": 1.015625, + "learning_rate": 0.0001924146241486597, + "loss": 1.179, + "step": 7419 + }, + { + "epoch": 0.19052479373989964, + "grad_norm": 0.8359375, + "learning_rate": 0.00019241291855785227, + "loss": 1.129, + "step": 7420 + }, + { + "epoch": 0.19055047093582145, + "grad_norm": 0.83984375, + "learning_rate": 0.00019241121278287427, + "loss": 0.9891, + "step": 7421 + }, + { + "epoch": 0.19057614813174328, + "grad_norm": 0.8046875, + "learning_rate": 0.00019240950682372906, + "loss": 0.8937, + "step": 7422 + }, + { + "epoch": 0.1906018253276651, + "grad_norm": 0.84765625, + "learning_rate": 0.00019240780068042004, + "loss": 1.0518, + "step": 7423 + }, + { + "epoch": 0.1906275025235869, + "grad_norm": 0.9609375, + "learning_rate": 0.0001924060943529506, + "loss": 1.2029, + "step": 7424 + }, + { + "epoch": 0.19065317971950874, + "grad_norm": 3.921875, + "learning_rate": 0.00019240438784132418, + "loss": 1.0961, + "step": 7425 + }, + { + "epoch": 0.19067885691543054, + "grad_norm": 0.765625, + "learning_rate": 0.0001924026811455442, + "loss": 0.9043, + "step": 7426 + }, + { + "epoch": 0.19070453411135238, + "grad_norm": 0.77734375, + "learning_rate": 0.000192400974265614, + "loss": 1.1205, + "step": 7427 + }, + { + "epoch": 0.1907302113072742, + "grad_norm": 0.79296875, + "learning_rate": 0.000192399267201537, + "loss": 1.0739, + "step": 7428 + }, + { + "epoch": 0.190755888503196, + "grad_norm": 0.80859375, + "learning_rate": 0.0001923975599533166, + "loss": 1.1429, + "step": 7429 + }, + { + "epoch": 0.19078156569911783, + "grad_norm": 0.8359375, + "learning_rate": 0.0001923958525209562, + "loss": 1.1212, + "step": 7430 + }, + { + "epoch": 0.19080724289503964, + "grad_norm": 0.796875, + "learning_rate": 0.00019239414490445923, + "loss": 1.2941, + "step": 7431 + }, + { + "epoch": 0.19083292009096148, + "grad_norm": 0.875, + "learning_rate": 0.00019239243710382907, + "loss": 1.069, + "step": 7432 + }, + { + "epoch": 0.19085859728688329, + "grad_norm": 0.80078125, + "learning_rate": 0.00019239072911906912, + "loss": 1.0045, + "step": 7433 + }, + { + "epoch": 0.1908842744828051, + "grad_norm": 0.8984375, + "learning_rate": 0.0001923890209501828, + "loss": 1.1633, + "step": 7434 + }, + { + "epoch": 0.19090995167872693, + "grad_norm": 0.80859375, + "learning_rate": 0.00019238731259717351, + "loss": 0.9809, + "step": 7435 + }, + { + "epoch": 0.19093562887464874, + "grad_norm": 0.84765625, + "learning_rate": 0.00019238560406004466, + "loss": 0.9719, + "step": 7436 + }, + { + "epoch": 0.19096130607057057, + "grad_norm": 0.91796875, + "learning_rate": 0.00019238389533879962, + "loss": 1.0556, + "step": 7437 + }, + { + "epoch": 0.19098698326649238, + "grad_norm": 0.8515625, + "learning_rate": 0.00019238218643344185, + "loss": 1.1331, + "step": 7438 + }, + { + "epoch": 0.1910126604624142, + "grad_norm": 0.80859375, + "learning_rate": 0.0001923804773439747, + "loss": 1.0724, + "step": 7439 + }, + { + "epoch": 0.19103833765833603, + "grad_norm": 0.90234375, + "learning_rate": 0.00019237876807040163, + "loss": 1.1073, + "step": 7440 + }, + { + "epoch": 0.19106401485425784, + "grad_norm": 0.99609375, + "learning_rate": 0.00019237705861272598, + "loss": 1.1212, + "step": 7441 + }, + { + "epoch": 0.19108969205017967, + "grad_norm": 0.84375, + "learning_rate": 0.00019237534897095123, + "loss": 1.0882, + "step": 7442 + }, + { + "epoch": 0.19111536924610148, + "grad_norm": 0.8671875, + "learning_rate": 0.00019237363914508074, + "loss": 1.1237, + "step": 7443 + }, + { + "epoch": 0.1911410464420233, + "grad_norm": 1.1015625, + "learning_rate": 0.00019237192913511796, + "loss": 1.0441, + "step": 7444 + }, + { + "epoch": 0.19116672363794512, + "grad_norm": 0.828125, + "learning_rate": 0.00019237021894106623, + "loss": 1.0807, + "step": 7445 + }, + { + "epoch": 0.19119240083386693, + "grad_norm": 0.8515625, + "learning_rate": 0.00019236850856292902, + "loss": 0.9731, + "step": 7446 + }, + { + "epoch": 0.19121807802978877, + "grad_norm": 0.859375, + "learning_rate": 0.00019236679800070972, + "loss": 0.9587, + "step": 7447 + }, + { + "epoch": 0.19124375522571058, + "grad_norm": 0.859375, + "learning_rate": 0.0001923650872544117, + "loss": 1.0137, + "step": 7448 + }, + { + "epoch": 0.19126943242163238, + "grad_norm": 0.8203125, + "learning_rate": 0.00019236337632403845, + "loss": 1.1476, + "step": 7449 + }, + { + "epoch": 0.19129510961755422, + "grad_norm": 0.84375, + "learning_rate": 0.00019236166520959332, + "loss": 0.989, + "step": 7450 + }, + { + "epoch": 0.19132078681347603, + "grad_norm": 0.8359375, + "learning_rate": 0.00019235995391107973, + "loss": 1.1055, + "step": 7451 + }, + { + "epoch": 0.19134646400939787, + "grad_norm": 0.7578125, + "learning_rate": 0.00019235824242850108, + "loss": 0.8599, + "step": 7452 + }, + { + "epoch": 0.19137214120531967, + "grad_norm": 0.82421875, + "learning_rate": 0.00019235653076186079, + "loss": 1.0123, + "step": 7453 + }, + { + "epoch": 0.19139781840124148, + "grad_norm": 0.84375, + "learning_rate": 0.00019235481891116232, + "loss": 1.1529, + "step": 7454 + }, + { + "epoch": 0.19142349559716332, + "grad_norm": 0.765625, + "learning_rate": 0.00019235310687640902, + "loss": 1.0034, + "step": 7455 + }, + { + "epoch": 0.19144917279308513, + "grad_norm": 0.7890625, + "learning_rate": 0.0001923513946576043, + "loss": 0.9173, + "step": 7456 + }, + { + "epoch": 0.19147484998900696, + "grad_norm": 0.875, + "learning_rate": 0.0001923496822547516, + "loss": 0.9185, + "step": 7457 + }, + { + "epoch": 0.19150052718492877, + "grad_norm": 0.77734375, + "learning_rate": 0.00019234796966785433, + "loss": 1.0483, + "step": 7458 + }, + { + "epoch": 0.19152620438085058, + "grad_norm": 0.83203125, + "learning_rate": 0.0001923462568969159, + "loss": 1.0307, + "step": 7459 + }, + { + "epoch": 0.19155188157677241, + "grad_norm": 0.82421875, + "learning_rate": 0.00019234454394193975, + "loss": 1.0942, + "step": 7460 + }, + { + "epoch": 0.19157755877269422, + "grad_norm": 0.83984375, + "learning_rate": 0.00019234283080292924, + "loss": 1.0592, + "step": 7461 + }, + { + "epoch": 0.19160323596861606, + "grad_norm": 0.84375, + "learning_rate": 0.00019234111747988778, + "loss": 0.9467, + "step": 7462 + }, + { + "epoch": 0.19162891316453787, + "grad_norm": 1.453125, + "learning_rate": 0.00019233940397281885, + "loss": 1.2077, + "step": 7463 + }, + { + "epoch": 0.19165459036045968, + "grad_norm": 0.7890625, + "learning_rate": 0.00019233769028172582, + "loss": 1.0777, + "step": 7464 + }, + { + "epoch": 0.1916802675563815, + "grad_norm": 0.81640625, + "learning_rate": 0.0001923359764066121, + "loss": 1.1108, + "step": 7465 + }, + { + "epoch": 0.19170594475230332, + "grad_norm": 0.8046875, + "learning_rate": 0.00019233426234748112, + "loss": 1.0858, + "step": 7466 + }, + { + "epoch": 0.19173162194822516, + "grad_norm": 0.765625, + "learning_rate": 0.0001923325481043363, + "loss": 0.9374, + "step": 7467 + }, + { + "epoch": 0.19175729914414696, + "grad_norm": 0.83203125, + "learning_rate": 0.0001923308336771811, + "loss": 1.1393, + "step": 7468 + }, + { + "epoch": 0.19178297634006877, + "grad_norm": 0.90234375, + "learning_rate": 0.00019232911906601885, + "loss": 0.9453, + "step": 7469 + }, + { + "epoch": 0.1918086535359906, + "grad_norm": 0.8359375, + "learning_rate": 0.000192327404270853, + "loss": 1.0593, + "step": 7470 + }, + { + "epoch": 0.19183433073191242, + "grad_norm": 0.89453125, + "learning_rate": 0.00019232568929168696, + "loss": 1.0635, + "step": 7471 + }, + { + "epoch": 0.19186000792783425, + "grad_norm": 1.0546875, + "learning_rate": 0.00019232397412852418, + "loss": 1.157, + "step": 7472 + }, + { + "epoch": 0.19188568512375606, + "grad_norm": 0.81640625, + "learning_rate": 0.00019232225878136806, + "loss": 1.0754, + "step": 7473 + }, + { + "epoch": 0.19191136231967787, + "grad_norm": 0.7890625, + "learning_rate": 0.00019232054325022202, + "loss": 1.0596, + "step": 7474 + }, + { + "epoch": 0.1919370395155997, + "grad_norm": 0.7578125, + "learning_rate": 0.00019231882753508947, + "loss": 0.9173, + "step": 7475 + }, + { + "epoch": 0.1919627167115215, + "grad_norm": 0.875, + "learning_rate": 0.00019231711163597383, + "loss": 1.0152, + "step": 7476 + }, + { + "epoch": 0.19198839390744335, + "grad_norm": 0.87890625, + "learning_rate": 0.00019231539555287857, + "loss": 1.0574, + "step": 7477 + }, + { + "epoch": 0.19201407110336516, + "grad_norm": 0.82421875, + "learning_rate": 0.00019231367928580702, + "loss": 1.0128, + "step": 7478 + }, + { + "epoch": 0.19203974829928697, + "grad_norm": 0.82421875, + "learning_rate": 0.00019231196283476265, + "loss": 0.9983, + "step": 7479 + }, + { + "epoch": 0.1920654254952088, + "grad_norm": 0.875, + "learning_rate": 0.00019231024619974888, + "loss": 1.0603, + "step": 7480 + }, + { + "epoch": 0.1920911026911306, + "grad_norm": 0.8125, + "learning_rate": 0.00019230852938076912, + "loss": 1.0774, + "step": 7481 + }, + { + "epoch": 0.19211677988705245, + "grad_norm": 0.80859375, + "learning_rate": 0.00019230681237782682, + "loss": 1.008, + "step": 7482 + }, + { + "epoch": 0.19214245708297426, + "grad_norm": 0.83984375, + "learning_rate": 0.00019230509519092536, + "loss": 1.0848, + "step": 7483 + }, + { + "epoch": 0.19216813427889606, + "grad_norm": 0.98828125, + "learning_rate": 0.0001923033778200682, + "loss": 1.1044, + "step": 7484 + }, + { + "epoch": 0.1921938114748179, + "grad_norm": 0.87109375, + "learning_rate": 0.00019230166026525875, + "loss": 1.0889, + "step": 7485 + }, + { + "epoch": 0.1922194886707397, + "grad_norm": 0.87890625, + "learning_rate": 0.00019229994252650042, + "loss": 1.0538, + "step": 7486 + }, + { + "epoch": 0.19224516586666154, + "grad_norm": 0.79296875, + "learning_rate": 0.00019229822460379662, + "loss": 1.089, + "step": 7487 + }, + { + "epoch": 0.19227084306258335, + "grad_norm": 0.828125, + "learning_rate": 0.00019229650649715084, + "loss": 1.0796, + "step": 7488 + }, + { + "epoch": 0.19229652025850516, + "grad_norm": 0.83984375, + "learning_rate": 0.00019229478820656642, + "loss": 1.3061, + "step": 7489 + }, + { + "epoch": 0.192322197454427, + "grad_norm": 0.8046875, + "learning_rate": 0.0001922930697320468, + "loss": 0.9724, + "step": 7490 + }, + { + "epoch": 0.1923478746503488, + "grad_norm": 0.90234375, + "learning_rate": 0.0001922913510735955, + "loss": 1.1723, + "step": 7491 + }, + { + "epoch": 0.19237355184627064, + "grad_norm": 0.890625, + "learning_rate": 0.00019228963223121587, + "loss": 1.1512, + "step": 7492 + }, + { + "epoch": 0.19239922904219245, + "grad_norm": 0.6796875, + "learning_rate": 0.0001922879132049113, + "loss": 0.9012, + "step": 7493 + }, + { + "epoch": 0.19242490623811426, + "grad_norm": 0.78125, + "learning_rate": 0.00019228619399468526, + "loss": 1.0602, + "step": 7494 + }, + { + "epoch": 0.1924505834340361, + "grad_norm": 0.8828125, + "learning_rate": 0.00019228447460054116, + "loss": 1.1031, + "step": 7495 + }, + { + "epoch": 0.1924762606299579, + "grad_norm": 0.9609375, + "learning_rate": 0.00019228275502248247, + "loss": 1.1691, + "step": 7496 + }, + { + "epoch": 0.19250193782587974, + "grad_norm": 0.83203125, + "learning_rate": 0.00019228103526051257, + "loss": 1.0169, + "step": 7497 + }, + { + "epoch": 0.19252761502180155, + "grad_norm": 0.82421875, + "learning_rate": 0.0001922793153146349, + "loss": 0.9026, + "step": 7498 + }, + { + "epoch": 0.19255329221772335, + "grad_norm": 0.76953125, + "learning_rate": 0.00019227759518485288, + "loss": 0.9215, + "step": 7499 + }, + { + "epoch": 0.1925789694136452, + "grad_norm": 0.8671875, + "learning_rate": 0.00019227587487116997, + "loss": 1.0534, + "step": 7500 + }, + { + "epoch": 0.192604646609567, + "grad_norm": 0.765625, + "learning_rate": 0.00019227415437358958, + "loss": 0.9593, + "step": 7501 + }, + { + "epoch": 0.19263032380548883, + "grad_norm": 0.76953125, + "learning_rate": 0.00019227243369211513, + "loss": 1.0144, + "step": 7502 + }, + { + "epoch": 0.19265600100141064, + "grad_norm": 0.79296875, + "learning_rate": 0.00019227071282675002, + "loss": 1.2012, + "step": 7503 + }, + { + "epoch": 0.19268167819733245, + "grad_norm": 0.78515625, + "learning_rate": 0.00019226899177749774, + "loss": 1.0291, + "step": 7504 + }, + { + "epoch": 0.1927073553932543, + "grad_norm": 0.8046875, + "learning_rate": 0.00019226727054436171, + "loss": 1.0438, + "step": 7505 + }, + { + "epoch": 0.1927330325891761, + "grad_norm": 0.796875, + "learning_rate": 0.00019226554912734532, + "loss": 1.0195, + "step": 7506 + }, + { + "epoch": 0.19275870978509793, + "grad_norm": 0.89453125, + "learning_rate": 0.00019226382752645204, + "loss": 1.0862, + "step": 7507 + }, + { + "epoch": 0.19278438698101974, + "grad_norm": 0.84765625, + "learning_rate": 0.00019226210574168527, + "loss": 1.127, + "step": 7508 + }, + { + "epoch": 0.19281006417694155, + "grad_norm": 0.75390625, + "learning_rate": 0.00019226038377304848, + "loss": 1.0158, + "step": 7509 + }, + { + "epoch": 0.19283574137286338, + "grad_norm": 0.8125, + "learning_rate": 0.00019225866162054505, + "loss": 0.9793, + "step": 7510 + }, + { + "epoch": 0.1928614185687852, + "grad_norm": 0.79296875, + "learning_rate": 0.00019225693928417848, + "loss": 1.0821, + "step": 7511 + }, + { + "epoch": 0.19288709576470703, + "grad_norm": 0.91796875, + "learning_rate": 0.00019225521676395215, + "loss": 1.2316, + "step": 7512 + }, + { + "epoch": 0.19291277296062884, + "grad_norm": 0.82421875, + "learning_rate": 0.0001922534940598695, + "loss": 1.1652, + "step": 7513 + }, + { + "epoch": 0.19293845015655064, + "grad_norm": 0.7890625, + "learning_rate": 0.00019225177117193395, + "loss": 0.992, + "step": 7514 + }, + { + "epoch": 0.19296412735247248, + "grad_norm": 0.83203125, + "learning_rate": 0.00019225004810014898, + "loss": 0.9657, + "step": 7515 + }, + { + "epoch": 0.1929898045483943, + "grad_norm": 0.84375, + "learning_rate": 0.000192248324844518, + "loss": 1.044, + "step": 7516 + }, + { + "epoch": 0.19301548174431613, + "grad_norm": 0.796875, + "learning_rate": 0.0001922466014050444, + "loss": 0.9417, + "step": 7517 + }, + { + "epoch": 0.19304115894023793, + "grad_norm": 0.77734375, + "learning_rate": 0.0001922448777817317, + "loss": 0.9535, + "step": 7518 + }, + { + "epoch": 0.19306683613615974, + "grad_norm": 0.8671875, + "learning_rate": 0.00019224315397458328, + "loss": 1.2015, + "step": 7519 + }, + { + "epoch": 0.19309251333208158, + "grad_norm": 0.9296875, + "learning_rate": 0.00019224142998360257, + "loss": 1.0224, + "step": 7520 + }, + { + "epoch": 0.1931181905280034, + "grad_norm": 0.85546875, + "learning_rate": 0.00019223970580879306, + "loss": 1.0589, + "step": 7521 + }, + { + "epoch": 0.19314386772392522, + "grad_norm": 0.76953125, + "learning_rate": 0.00019223798145015812, + "loss": 0.9402, + "step": 7522 + }, + { + "epoch": 0.19316954491984703, + "grad_norm": 0.81640625, + "learning_rate": 0.0001922362569077012, + "loss": 0.9866, + "step": 7523 + }, + { + "epoch": 0.19319522211576884, + "grad_norm": 0.81640625, + "learning_rate": 0.00019223453218142578, + "loss": 1.1004, + "step": 7524 + }, + { + "epoch": 0.19322089931169067, + "grad_norm": 0.8203125, + "learning_rate": 0.00019223280727133525, + "loss": 1.1023, + "step": 7525 + }, + { + "epoch": 0.19324657650761248, + "grad_norm": 0.83203125, + "learning_rate": 0.00019223108217743308, + "loss": 1.1153, + "step": 7526 + }, + { + "epoch": 0.19327225370353432, + "grad_norm": 0.8671875, + "learning_rate": 0.0001922293568997227, + "loss": 1.1758, + "step": 7527 + }, + { + "epoch": 0.19329793089945613, + "grad_norm": 0.87890625, + "learning_rate": 0.00019222763143820752, + "loss": 1.2769, + "step": 7528 + }, + { + "epoch": 0.19332360809537794, + "grad_norm": 0.78515625, + "learning_rate": 0.00019222590579289102, + "loss": 0.9506, + "step": 7529 + }, + { + "epoch": 0.19334928529129977, + "grad_norm": 0.83203125, + "learning_rate": 0.0001922241799637766, + "loss": 1.089, + "step": 7530 + }, + { + "epoch": 0.19337496248722158, + "grad_norm": 1.0625, + "learning_rate": 0.00019222245395086776, + "loss": 1.0955, + "step": 7531 + }, + { + "epoch": 0.19340063968314342, + "grad_norm": 0.79296875, + "learning_rate": 0.00019222072775416785, + "loss": 1.1937, + "step": 7532 + }, + { + "epoch": 0.19342631687906522, + "grad_norm": 0.85546875, + "learning_rate": 0.0001922190013736804, + "loss": 1.1282, + "step": 7533 + }, + { + "epoch": 0.19345199407498703, + "grad_norm": 0.79296875, + "learning_rate": 0.0001922172748094088, + "loss": 1.0045, + "step": 7534 + }, + { + "epoch": 0.19347767127090887, + "grad_norm": 0.8046875, + "learning_rate": 0.0001922155480613565, + "loss": 1.0708, + "step": 7535 + }, + { + "epoch": 0.19350334846683068, + "grad_norm": 0.79296875, + "learning_rate": 0.00019221382112952692, + "loss": 1.0665, + "step": 7536 + }, + { + "epoch": 0.1935290256627525, + "grad_norm": 0.80859375, + "learning_rate": 0.00019221209401392355, + "loss": 1.1013, + "step": 7537 + }, + { + "epoch": 0.19355470285867432, + "grad_norm": 0.8203125, + "learning_rate": 0.0001922103667145498, + "loss": 1.1453, + "step": 7538 + }, + { + "epoch": 0.19358038005459613, + "grad_norm": 0.8046875, + "learning_rate": 0.00019220863923140912, + "loss": 0.9894, + "step": 7539 + }, + { + "epoch": 0.19360605725051797, + "grad_norm": 0.9140625, + "learning_rate": 0.00019220691156450495, + "loss": 1.0909, + "step": 7540 + }, + { + "epoch": 0.19363173444643977, + "grad_norm": 0.86328125, + "learning_rate": 0.00019220518371384072, + "loss": 1.2175, + "step": 7541 + }, + { + "epoch": 0.19365741164236158, + "grad_norm": 0.875, + "learning_rate": 0.0001922034556794199, + "loss": 1.0685, + "step": 7542 + }, + { + "epoch": 0.19368308883828342, + "grad_norm": 0.87109375, + "learning_rate": 0.00019220172746124594, + "loss": 1.0722, + "step": 7543 + }, + { + "epoch": 0.19370876603420523, + "grad_norm": 0.76171875, + "learning_rate": 0.00019219999905932222, + "loss": 1.032, + "step": 7544 + }, + { + "epoch": 0.19373444323012706, + "grad_norm": 0.828125, + "learning_rate": 0.00019219827047365227, + "loss": 1.1192, + "step": 7545 + }, + { + "epoch": 0.19376012042604887, + "grad_norm": 0.890625, + "learning_rate": 0.0001921965417042395, + "loss": 0.9928, + "step": 7546 + }, + { + "epoch": 0.19378579762197068, + "grad_norm": 0.8359375, + "learning_rate": 0.0001921948127510873, + "loss": 1.0096, + "step": 7547 + }, + { + "epoch": 0.19381147481789252, + "grad_norm": 0.8046875, + "learning_rate": 0.0001921930836141992, + "loss": 1.0563, + "step": 7548 + }, + { + "epoch": 0.19383715201381432, + "grad_norm": 0.83984375, + "learning_rate": 0.0001921913542935786, + "loss": 0.9704, + "step": 7549 + }, + { + "epoch": 0.19386282920973616, + "grad_norm": 0.828125, + "learning_rate": 0.000192189624789229, + "loss": 1.1676, + "step": 7550 + }, + { + "epoch": 0.19388850640565797, + "grad_norm": 0.89453125, + "learning_rate": 0.00019218789510115376, + "loss": 0.9635, + "step": 7551 + }, + { + "epoch": 0.19391418360157978, + "grad_norm": 0.828125, + "learning_rate": 0.0001921861652293564, + "loss": 1.1194, + "step": 7552 + }, + { + "epoch": 0.1939398607975016, + "grad_norm": 0.765625, + "learning_rate": 0.00019218443517384032, + "loss": 0.994, + "step": 7553 + }, + { + "epoch": 0.19396553799342342, + "grad_norm": 0.796875, + "learning_rate": 0.000192182704934609, + "loss": 1.0627, + "step": 7554 + }, + { + "epoch": 0.19399121518934526, + "grad_norm": 0.828125, + "learning_rate": 0.00019218097451166588, + "loss": 1.138, + "step": 7555 + }, + { + "epoch": 0.19401689238526706, + "grad_norm": 0.79296875, + "learning_rate": 0.00019217924390501438, + "loss": 1.032, + "step": 7556 + }, + { + "epoch": 0.19404256958118887, + "grad_norm": 0.78125, + "learning_rate": 0.000192177513114658, + "loss": 1.0214, + "step": 7557 + }, + { + "epoch": 0.1940682467771107, + "grad_norm": 0.8046875, + "learning_rate": 0.00019217578214060014, + "loss": 1.1026, + "step": 7558 + }, + { + "epoch": 0.19409392397303252, + "grad_norm": 0.84375, + "learning_rate": 0.00019217405098284428, + "loss": 1.0291, + "step": 7559 + }, + { + "epoch": 0.19411960116895435, + "grad_norm": 0.8125, + "learning_rate": 0.00019217231964139387, + "loss": 1.0473, + "step": 7560 + }, + { + "epoch": 0.19414527836487616, + "grad_norm": 0.84375, + "learning_rate": 0.00019217058811625236, + "loss": 1.0014, + "step": 7561 + }, + { + "epoch": 0.19417095556079797, + "grad_norm": 0.84375, + "learning_rate": 0.0001921688564074232, + "loss": 1.0568, + "step": 7562 + }, + { + "epoch": 0.1941966327567198, + "grad_norm": 0.77734375, + "learning_rate": 0.00019216712451490978, + "loss": 1.0506, + "step": 7563 + }, + { + "epoch": 0.19422230995264161, + "grad_norm": 0.828125, + "learning_rate": 0.00019216539243871567, + "loss": 1.0603, + "step": 7564 + }, + { + "epoch": 0.19424798714856345, + "grad_norm": 0.71484375, + "learning_rate": 0.0001921636601788442, + "loss": 0.915, + "step": 7565 + }, + { + "epoch": 0.19427366434448526, + "grad_norm": 1.0234375, + "learning_rate": 0.00019216192773529892, + "loss": 1.0717, + "step": 7566 + }, + { + "epoch": 0.19429934154040707, + "grad_norm": 0.75, + "learning_rate": 0.00019216019510808326, + "loss": 1.0453, + "step": 7567 + }, + { + "epoch": 0.1943250187363289, + "grad_norm": 0.84765625, + "learning_rate": 0.00019215846229720063, + "loss": 1.0035, + "step": 7568 + }, + { + "epoch": 0.1943506959322507, + "grad_norm": 0.91015625, + "learning_rate": 0.0001921567293026545, + "loss": 1.0443, + "step": 7569 + }, + { + "epoch": 0.19437637312817255, + "grad_norm": 0.8203125, + "learning_rate": 0.00019215499612444835, + "loss": 1.0909, + "step": 7570 + }, + { + "epoch": 0.19440205032409436, + "grad_norm": 0.8359375, + "learning_rate": 0.00019215326276258562, + "loss": 0.9698, + "step": 7571 + }, + { + "epoch": 0.19442772752001616, + "grad_norm": 0.87890625, + "learning_rate": 0.00019215152921706974, + "loss": 0.99, + "step": 7572 + }, + { + "epoch": 0.194453404715938, + "grad_norm": 0.90625, + "learning_rate": 0.0001921497954879042, + "loss": 1.149, + "step": 7573 + }, + { + "epoch": 0.1944790819118598, + "grad_norm": 0.890625, + "learning_rate": 0.00019214806157509244, + "loss": 1.0165, + "step": 7574 + }, + { + "epoch": 0.19450475910778164, + "grad_norm": 0.77734375, + "learning_rate": 0.00019214632747863793, + "loss": 0.9598, + "step": 7575 + }, + { + "epoch": 0.19453043630370345, + "grad_norm": 0.80078125, + "learning_rate": 0.00019214459319854407, + "loss": 0.9942, + "step": 7576 + }, + { + "epoch": 0.19455611349962526, + "grad_norm": 0.8125, + "learning_rate": 0.00019214285873481442, + "loss": 1.0083, + "step": 7577 + }, + { + "epoch": 0.1945817906955471, + "grad_norm": 0.90234375, + "learning_rate": 0.0001921411240874523, + "loss": 1.1178, + "step": 7578 + }, + { + "epoch": 0.1946074678914689, + "grad_norm": 0.796875, + "learning_rate": 0.00019213938925646132, + "loss": 0.9725, + "step": 7579 + }, + { + "epoch": 0.19463314508739074, + "grad_norm": 0.8125, + "learning_rate": 0.0001921376542418448, + "loss": 0.9906, + "step": 7580 + }, + { + "epoch": 0.19465882228331255, + "grad_norm": 0.8828125, + "learning_rate": 0.00019213591904360632, + "loss": 1.1178, + "step": 7581 + }, + { + "epoch": 0.19468449947923436, + "grad_norm": 1.0234375, + "learning_rate": 0.00019213418366174922, + "loss": 1.0544, + "step": 7582 + }, + { + "epoch": 0.1947101766751562, + "grad_norm": 0.81640625, + "learning_rate": 0.00019213244809627704, + "loss": 1.1267, + "step": 7583 + }, + { + "epoch": 0.194735853871078, + "grad_norm": 0.8125, + "learning_rate": 0.0001921307123471932, + "loss": 1.0107, + "step": 7584 + }, + { + "epoch": 0.19476153106699984, + "grad_norm": 0.77734375, + "learning_rate": 0.0001921289764145012, + "loss": 1.0085, + "step": 7585 + }, + { + "epoch": 0.19478720826292165, + "grad_norm": 0.84375, + "learning_rate": 0.00019212724029820443, + "loss": 0.9379, + "step": 7586 + }, + { + "epoch": 0.19481288545884345, + "grad_norm": 0.84375, + "learning_rate": 0.00019212550399830644, + "loss": 0.9993, + "step": 7587 + }, + { + "epoch": 0.1948385626547653, + "grad_norm": 0.83984375, + "learning_rate": 0.0001921237675148106, + "loss": 1.1011, + "step": 7588 + }, + { + "epoch": 0.1948642398506871, + "grad_norm": 0.74609375, + "learning_rate": 0.00019212203084772043, + "loss": 1.0272, + "step": 7589 + }, + { + "epoch": 0.19488991704660893, + "grad_norm": 0.859375, + "learning_rate": 0.00019212029399703936, + "loss": 0.9974, + "step": 7590 + }, + { + "epoch": 0.19491559424253074, + "grad_norm": 1.2734375, + "learning_rate": 0.0001921185569627709, + "loss": 1.0642, + "step": 7591 + }, + { + "epoch": 0.19494127143845255, + "grad_norm": 0.8046875, + "learning_rate": 0.00019211681974491845, + "loss": 1.1888, + "step": 7592 + }, + { + "epoch": 0.1949669486343744, + "grad_norm": 1.015625, + "learning_rate": 0.0001921150823434855, + "loss": 1.1542, + "step": 7593 + }, + { + "epoch": 0.1949926258302962, + "grad_norm": 0.84765625, + "learning_rate": 0.00019211334475847553, + "loss": 1.1031, + "step": 7594 + }, + { + "epoch": 0.19501830302621803, + "grad_norm": 0.84375, + "learning_rate": 0.00019211160698989196, + "loss": 0.9894, + "step": 7595 + }, + { + "epoch": 0.19504398022213984, + "grad_norm": 0.81640625, + "learning_rate": 0.0001921098690377383, + "loss": 1.1069, + "step": 7596 + }, + { + "epoch": 0.19506965741806165, + "grad_norm": 0.828125, + "learning_rate": 0.00019210813090201798, + "loss": 0.9711, + "step": 7597 + }, + { + "epoch": 0.19509533461398348, + "grad_norm": 0.74609375, + "learning_rate": 0.0001921063925827345, + "loss": 1.0334, + "step": 7598 + }, + { + "epoch": 0.1951210118099053, + "grad_norm": 0.80078125, + "learning_rate": 0.00019210465407989123, + "loss": 0.9441, + "step": 7599 + }, + { + "epoch": 0.19514668900582713, + "grad_norm": 0.8203125, + "learning_rate": 0.00019210291539349176, + "loss": 1.0081, + "step": 7600 + }, + { + "epoch": 0.19517236620174894, + "grad_norm": 0.8359375, + "learning_rate": 0.0001921011765235395, + "loss": 0.9565, + "step": 7601 + }, + { + "epoch": 0.19519804339767075, + "grad_norm": 0.8046875, + "learning_rate": 0.0001920994374700379, + "loss": 1.0724, + "step": 7602 + }, + { + "epoch": 0.19522372059359258, + "grad_norm": 0.83203125, + "learning_rate": 0.00019209769823299043, + "loss": 1.1451, + "step": 7603 + }, + { + "epoch": 0.1952493977895144, + "grad_norm": 0.83203125, + "learning_rate": 0.00019209595881240058, + "loss": 1.0401, + "step": 7604 + }, + { + "epoch": 0.19527507498543623, + "grad_norm": 0.80859375, + "learning_rate": 0.0001920942192082718, + "loss": 1.0445, + "step": 7605 + }, + { + "epoch": 0.19530075218135803, + "grad_norm": 0.8515625, + "learning_rate": 0.0001920924794206076, + "loss": 1.0629, + "step": 7606 + }, + { + "epoch": 0.19532642937727984, + "grad_norm": 0.80078125, + "learning_rate": 0.00019209073944941136, + "loss": 0.9773, + "step": 7607 + }, + { + "epoch": 0.19535210657320168, + "grad_norm": 0.94140625, + "learning_rate": 0.0001920889992946866, + "loss": 1.0073, + "step": 7608 + }, + { + "epoch": 0.1953777837691235, + "grad_norm": 0.83203125, + "learning_rate": 0.00019208725895643677, + "loss": 0.9887, + "step": 7609 + }, + { + "epoch": 0.19540346096504532, + "grad_norm": 0.83984375, + "learning_rate": 0.00019208551843466538, + "loss": 1.0415, + "step": 7610 + }, + { + "epoch": 0.19542913816096713, + "grad_norm": 0.8046875, + "learning_rate": 0.00019208377772937584, + "loss": 0.9429, + "step": 7611 + }, + { + "epoch": 0.19545481535688894, + "grad_norm": 1.1875, + "learning_rate": 0.00019208203684057169, + "loss": 0.9213, + "step": 7612 + }, + { + "epoch": 0.19548049255281078, + "grad_norm": 0.83203125, + "learning_rate": 0.0001920802957682563, + "loss": 1.0642, + "step": 7613 + }, + { + "epoch": 0.19550616974873258, + "grad_norm": 0.8125, + "learning_rate": 0.00019207855451243327, + "loss": 1.1378, + "step": 7614 + }, + { + "epoch": 0.19553184694465442, + "grad_norm": 0.765625, + "learning_rate": 0.00019207681307310598, + "loss": 0.9514, + "step": 7615 + }, + { + "epoch": 0.19555752414057623, + "grad_norm": 0.85546875, + "learning_rate": 0.00019207507145027787, + "loss": 1.0506, + "step": 7616 + }, + { + "epoch": 0.19558320133649804, + "grad_norm": 0.7734375, + "learning_rate": 0.0001920733296439525, + "loss": 0.9173, + "step": 7617 + }, + { + "epoch": 0.19560887853241987, + "grad_norm": 0.72265625, + "learning_rate": 0.0001920715876541333, + "loss": 0.9706, + "step": 7618 + }, + { + "epoch": 0.19563455572834168, + "grad_norm": 0.87109375, + "learning_rate": 0.0001920698454808237, + "loss": 1.1007, + "step": 7619 + }, + { + "epoch": 0.19566023292426352, + "grad_norm": 0.8359375, + "learning_rate": 0.00019206810312402727, + "loss": 1.0734, + "step": 7620 + }, + { + "epoch": 0.19568591012018532, + "grad_norm": 0.796875, + "learning_rate": 0.00019206636058374742, + "loss": 1.1431, + "step": 7621 + }, + { + "epoch": 0.19571158731610713, + "grad_norm": 0.7578125, + "learning_rate": 0.0001920646178599876, + "loss": 0.9817, + "step": 7622 + }, + { + "epoch": 0.19573726451202897, + "grad_norm": 0.765625, + "learning_rate": 0.00019206287495275132, + "loss": 1.1095, + "step": 7623 + }, + { + "epoch": 0.19576294170795078, + "grad_norm": 0.77734375, + "learning_rate": 0.00019206113186204208, + "loss": 1.1722, + "step": 7624 + }, + { + "epoch": 0.1957886189038726, + "grad_norm": 1.1328125, + "learning_rate": 0.0001920593885878633, + "loss": 1.0129, + "step": 7625 + }, + { + "epoch": 0.19581429609979442, + "grad_norm": 0.8828125, + "learning_rate": 0.00019205764513021846, + "loss": 1.0865, + "step": 7626 + }, + { + "epoch": 0.19583997329571623, + "grad_norm": 0.8203125, + "learning_rate": 0.00019205590148911107, + "loss": 0.9872, + "step": 7627 + }, + { + "epoch": 0.19586565049163807, + "grad_norm": 0.8203125, + "learning_rate": 0.00019205415766454455, + "loss": 1.0414, + "step": 7628 + }, + { + "epoch": 0.19589132768755987, + "grad_norm": 0.8515625, + "learning_rate": 0.00019205241365652246, + "loss": 1.1834, + "step": 7629 + }, + { + "epoch": 0.1959170048834817, + "grad_norm": 0.80859375, + "learning_rate": 0.00019205066946504818, + "loss": 1.2094, + "step": 7630 + }, + { + "epoch": 0.19594268207940352, + "grad_norm": 0.78125, + "learning_rate": 0.00019204892509012522, + "loss": 0.9233, + "step": 7631 + }, + { + "epoch": 0.19596835927532533, + "grad_norm": 0.8203125, + "learning_rate": 0.0001920471805317571, + "loss": 1.249, + "step": 7632 + }, + { + "epoch": 0.19599403647124716, + "grad_norm": 0.75390625, + "learning_rate": 0.00019204543578994727, + "loss": 0.9657, + "step": 7633 + }, + { + "epoch": 0.19601971366716897, + "grad_norm": 0.76953125, + "learning_rate": 0.00019204369086469917, + "loss": 1.0467, + "step": 7634 + }, + { + "epoch": 0.1960453908630908, + "grad_norm": 0.8125, + "learning_rate": 0.00019204194575601636, + "loss": 1.2782, + "step": 7635 + }, + { + "epoch": 0.19607106805901262, + "grad_norm": 0.84765625, + "learning_rate": 0.00019204020046390222, + "loss": 1.1342, + "step": 7636 + }, + { + "epoch": 0.19609674525493442, + "grad_norm": 0.79296875, + "learning_rate": 0.00019203845498836025, + "loss": 1.0114, + "step": 7637 + }, + { + "epoch": 0.19612242245085626, + "grad_norm": 0.78125, + "learning_rate": 0.00019203670932939403, + "loss": 1.0036, + "step": 7638 + }, + { + "epoch": 0.19614809964677807, + "grad_norm": 0.83984375, + "learning_rate": 0.0001920349634870069, + "loss": 1.0056, + "step": 7639 + }, + { + "epoch": 0.1961737768426999, + "grad_norm": 0.87109375, + "learning_rate": 0.00019203321746120243, + "loss": 1.0721, + "step": 7640 + }, + { + "epoch": 0.1961994540386217, + "grad_norm": 0.7578125, + "learning_rate": 0.00019203147125198406, + "loss": 0.9489, + "step": 7641 + }, + { + "epoch": 0.19622513123454352, + "grad_norm": 0.8515625, + "learning_rate": 0.0001920297248593553, + "loss": 0.9313, + "step": 7642 + }, + { + "epoch": 0.19625080843046536, + "grad_norm": 0.82421875, + "learning_rate": 0.0001920279782833196, + "loss": 1.0216, + "step": 7643 + }, + { + "epoch": 0.19627648562638716, + "grad_norm": 0.83984375, + "learning_rate": 0.00019202623152388042, + "loss": 1.1178, + "step": 7644 + }, + { + "epoch": 0.196302162822309, + "grad_norm": 0.79296875, + "learning_rate": 0.0001920244845810413, + "loss": 1.0677, + "step": 7645 + }, + { + "epoch": 0.1963278400182308, + "grad_norm": 0.90625, + "learning_rate": 0.00019202273745480572, + "loss": 1.1172, + "step": 7646 + }, + { + "epoch": 0.19635351721415262, + "grad_norm": 0.7734375, + "learning_rate": 0.0001920209901451771, + "loss": 0.9195, + "step": 7647 + }, + { + "epoch": 0.19637919441007445, + "grad_norm": 0.77734375, + "learning_rate": 0.00019201924265215898, + "loss": 1.0706, + "step": 7648 + }, + { + "epoch": 0.19640487160599626, + "grad_norm": 0.86328125, + "learning_rate": 0.0001920174949757548, + "loss": 1.0856, + "step": 7649 + }, + { + "epoch": 0.1964305488019181, + "grad_norm": 0.78515625, + "learning_rate": 0.00019201574711596807, + "loss": 0.9847, + "step": 7650 + }, + { + "epoch": 0.1964562259978399, + "grad_norm": 0.8125, + "learning_rate": 0.0001920139990728023, + "loss": 1.1923, + "step": 7651 + }, + { + "epoch": 0.19648190319376171, + "grad_norm": 0.890625, + "learning_rate": 0.00019201225084626092, + "loss": 1.0995, + "step": 7652 + }, + { + "epoch": 0.19650758038968355, + "grad_norm": 0.76953125, + "learning_rate": 0.0001920105024363474, + "loss": 1.1649, + "step": 7653 + }, + { + "epoch": 0.19653325758560536, + "grad_norm": 0.7734375, + "learning_rate": 0.0001920087538430653, + "loss": 1.0038, + "step": 7654 + }, + { + "epoch": 0.1965589347815272, + "grad_norm": 0.83203125, + "learning_rate": 0.00019200700506641808, + "loss": 1.004, + "step": 7655 + }, + { + "epoch": 0.196584611977449, + "grad_norm": 0.84765625, + "learning_rate": 0.0001920052561064092, + "loss": 1.0871, + "step": 7656 + }, + { + "epoch": 0.1966102891733708, + "grad_norm": 0.85546875, + "learning_rate": 0.0001920035069630421, + "loss": 1.0398, + "step": 7657 + }, + { + "epoch": 0.19663596636929265, + "grad_norm": 0.94921875, + "learning_rate": 0.00019200175763632037, + "loss": 0.986, + "step": 7658 + }, + { + "epoch": 0.19666164356521446, + "grad_norm": 0.875, + "learning_rate": 0.00019200000812624746, + "loss": 1.0629, + "step": 7659 + }, + { + "epoch": 0.1966873207611363, + "grad_norm": 0.875, + "learning_rate": 0.00019199825843282682, + "loss": 1.0544, + "step": 7660 + }, + { + "epoch": 0.1967129979570581, + "grad_norm": 0.86328125, + "learning_rate": 0.00019199650855606195, + "loss": 0.9222, + "step": 7661 + }, + { + "epoch": 0.1967386751529799, + "grad_norm": 0.80078125, + "learning_rate": 0.00019199475849595636, + "loss": 1.0001, + "step": 7662 + }, + { + "epoch": 0.19676435234890174, + "grad_norm": 0.78515625, + "learning_rate": 0.00019199300825251353, + "loss": 1.0623, + "step": 7663 + }, + { + "epoch": 0.19679002954482355, + "grad_norm": 0.91015625, + "learning_rate": 0.00019199125782573697, + "loss": 1.0394, + "step": 7664 + }, + { + "epoch": 0.1968157067407454, + "grad_norm": 0.859375, + "learning_rate": 0.0001919895072156301, + "loss": 1.0688, + "step": 7665 + }, + { + "epoch": 0.1968413839366672, + "grad_norm": 0.84375, + "learning_rate": 0.0001919877564221965, + "loss": 1.1134, + "step": 7666 + }, + { + "epoch": 0.196867061132589, + "grad_norm": 0.8046875, + "learning_rate": 0.00019198600544543956, + "loss": 1.0966, + "step": 7667 + }, + { + "epoch": 0.19689273832851084, + "grad_norm": 0.80078125, + "learning_rate": 0.00019198425428536284, + "loss": 0.9432, + "step": 7668 + }, + { + "epoch": 0.19691841552443265, + "grad_norm": 0.7890625, + "learning_rate": 0.00019198250294196983, + "loss": 0.9428, + "step": 7669 + }, + { + "epoch": 0.19694409272035449, + "grad_norm": 0.8046875, + "learning_rate": 0.00019198075141526398, + "loss": 1.1586, + "step": 7670 + }, + { + "epoch": 0.1969697699162763, + "grad_norm": 0.87890625, + "learning_rate": 0.00019197899970524877, + "loss": 1.0008, + "step": 7671 + }, + { + "epoch": 0.1969954471121981, + "grad_norm": 0.81640625, + "learning_rate": 0.00019197724781192777, + "loss": 0.9556, + "step": 7672 + }, + { + "epoch": 0.19702112430811994, + "grad_norm": 0.8203125, + "learning_rate": 0.0001919754957353044, + "loss": 1.1311, + "step": 7673 + }, + { + "epoch": 0.19704680150404175, + "grad_norm": 1.2734375, + "learning_rate": 0.00019197374347538218, + "loss": 1.0303, + "step": 7674 + }, + { + "epoch": 0.19707247869996358, + "grad_norm": 0.8046875, + "learning_rate": 0.0001919719910321646, + "loss": 1.1035, + "step": 7675 + }, + { + "epoch": 0.1970981558958854, + "grad_norm": 0.7890625, + "learning_rate": 0.00019197023840565516, + "loss": 0.9916, + "step": 7676 + }, + { + "epoch": 0.1971238330918072, + "grad_norm": 0.73828125, + "learning_rate": 0.00019196848559585732, + "loss": 0.969, + "step": 7677 + }, + { + "epoch": 0.19714951028772904, + "grad_norm": 0.80859375, + "learning_rate": 0.0001919667326027746, + "loss": 1.0664, + "step": 7678 + }, + { + "epoch": 0.19717518748365084, + "grad_norm": 0.828125, + "learning_rate": 0.0001919649794264105, + "loss": 0.8708, + "step": 7679 + }, + { + "epoch": 0.19720086467957268, + "grad_norm": 0.8515625, + "learning_rate": 0.0001919632260667685, + "loss": 1.0862, + "step": 7680 + }, + { + "epoch": 0.1972265418754945, + "grad_norm": 0.85546875, + "learning_rate": 0.0001919614725238521, + "loss": 1.118, + "step": 7681 + }, + { + "epoch": 0.1972522190714163, + "grad_norm": 0.83203125, + "learning_rate": 0.0001919597187976648, + "loss": 1.105, + "step": 7682 + }, + { + "epoch": 0.19727789626733813, + "grad_norm": 0.79296875, + "learning_rate": 0.00019195796488821008, + "loss": 0.9046, + "step": 7683 + }, + { + "epoch": 0.19730357346325994, + "grad_norm": 0.86328125, + "learning_rate": 0.00019195621079549143, + "loss": 1.1209, + "step": 7684 + }, + { + "epoch": 0.19732925065918178, + "grad_norm": 0.796875, + "learning_rate": 0.00019195445651951236, + "loss": 0.9748, + "step": 7685 + }, + { + "epoch": 0.19735492785510358, + "grad_norm": 0.765625, + "learning_rate": 0.00019195270206027635, + "loss": 0.8894, + "step": 7686 + }, + { + "epoch": 0.1973806050510254, + "grad_norm": 0.8359375, + "learning_rate": 0.00019195094741778697, + "loss": 1.0724, + "step": 7687 + }, + { + "epoch": 0.19740628224694723, + "grad_norm": 0.8125, + "learning_rate": 0.0001919491925920476, + "loss": 0.9117, + "step": 7688 + }, + { + "epoch": 0.19743195944286904, + "grad_norm": 0.79296875, + "learning_rate": 0.0001919474375830618, + "loss": 1.1814, + "step": 7689 + }, + { + "epoch": 0.19745763663879087, + "grad_norm": 0.8359375, + "learning_rate": 0.00019194568239083308, + "loss": 1.0038, + "step": 7690 + }, + { + "epoch": 0.19748331383471268, + "grad_norm": 0.84765625, + "learning_rate": 0.00019194392701536494, + "loss": 0.9369, + "step": 7691 + }, + { + "epoch": 0.1975089910306345, + "grad_norm": 0.89453125, + "learning_rate": 0.0001919421714566608, + "loss": 1.1964, + "step": 7692 + }, + { + "epoch": 0.19753466822655633, + "grad_norm": 0.7734375, + "learning_rate": 0.0001919404157147243, + "loss": 0.9805, + "step": 7693 + }, + { + "epoch": 0.19756034542247813, + "grad_norm": 0.796875, + "learning_rate": 0.0001919386597895588, + "loss": 1.114, + "step": 7694 + }, + { + "epoch": 0.19758602261839997, + "grad_norm": 0.8125, + "learning_rate": 0.00019193690368116785, + "loss": 1.119, + "step": 7695 + }, + { + "epoch": 0.19761169981432178, + "grad_norm": 0.7578125, + "learning_rate": 0.00019193514738955497, + "loss": 0.9277, + "step": 7696 + }, + { + "epoch": 0.1976373770102436, + "grad_norm": 0.71875, + "learning_rate": 0.00019193339091472367, + "loss": 1.0693, + "step": 7697 + }, + { + "epoch": 0.19766305420616542, + "grad_norm": 0.8671875, + "learning_rate": 0.0001919316342566774, + "loss": 1.2216, + "step": 7698 + }, + { + "epoch": 0.19768873140208723, + "grad_norm": 0.84765625, + "learning_rate": 0.00019192987741541967, + "loss": 1.0876, + "step": 7699 + }, + { + "epoch": 0.19771440859800907, + "grad_norm": 0.85546875, + "learning_rate": 0.00019192812039095402, + "loss": 0.9911, + "step": 7700 + }, + { + "epoch": 0.19774008579393088, + "grad_norm": 0.81640625, + "learning_rate": 0.00019192636318328393, + "loss": 1.0883, + "step": 7701 + }, + { + "epoch": 0.19776576298985268, + "grad_norm": 0.78515625, + "learning_rate": 0.0001919246057924129, + "loss": 1.1521, + "step": 7702 + }, + { + "epoch": 0.19779144018577452, + "grad_norm": 0.78515625, + "learning_rate": 0.00019192284821834444, + "loss": 1.0761, + "step": 7703 + }, + { + "epoch": 0.19781711738169633, + "grad_norm": 0.82421875, + "learning_rate": 0.00019192109046108205, + "loss": 1.0734, + "step": 7704 + }, + { + "epoch": 0.19784279457761816, + "grad_norm": 0.875, + "learning_rate": 0.00019191933252062923, + "loss": 1.0013, + "step": 7705 + }, + { + "epoch": 0.19786847177353997, + "grad_norm": 0.94140625, + "learning_rate": 0.00019191757439698947, + "loss": 1.098, + "step": 7706 + }, + { + "epoch": 0.19789414896946178, + "grad_norm": 0.80078125, + "learning_rate": 0.00019191581609016628, + "loss": 1.1605, + "step": 7707 + }, + { + "epoch": 0.19791982616538362, + "grad_norm": 0.7890625, + "learning_rate": 0.0001919140576001632, + "loss": 1.0756, + "step": 7708 + }, + { + "epoch": 0.19794550336130542, + "grad_norm": 0.7421875, + "learning_rate": 0.00019191229892698367, + "loss": 0.9418, + "step": 7709 + }, + { + "epoch": 0.19797118055722726, + "grad_norm": 0.80078125, + "learning_rate": 0.00019191054007063125, + "loss": 0.9664, + "step": 7710 + }, + { + "epoch": 0.19799685775314907, + "grad_norm": 0.7734375, + "learning_rate": 0.00019190878103110944, + "loss": 1.1381, + "step": 7711 + }, + { + "epoch": 0.19802253494907088, + "grad_norm": 0.78515625, + "learning_rate": 0.00019190702180842172, + "loss": 0.8941, + "step": 7712 + }, + { + "epoch": 0.1980482121449927, + "grad_norm": 0.76953125, + "learning_rate": 0.0001919052624025716, + "loss": 1.1102, + "step": 7713 + }, + { + "epoch": 0.19807388934091452, + "grad_norm": 0.87890625, + "learning_rate": 0.0001919035028135626, + "loss": 1.0846, + "step": 7714 + }, + { + "epoch": 0.19809956653683636, + "grad_norm": 0.83203125, + "learning_rate": 0.0001919017430413982, + "loss": 1.0907, + "step": 7715 + }, + { + "epoch": 0.19812524373275817, + "grad_norm": 0.8203125, + "learning_rate": 0.00019189998308608196, + "loss": 1.1139, + "step": 7716 + }, + { + "epoch": 0.19815092092867997, + "grad_norm": 0.80859375, + "learning_rate": 0.00019189822294761734, + "loss": 0.9555, + "step": 7717 + }, + { + "epoch": 0.1981765981246018, + "grad_norm": 0.859375, + "learning_rate": 0.00019189646262600784, + "loss": 1.0702, + "step": 7718 + }, + { + "epoch": 0.19820227532052362, + "grad_norm": 0.93359375, + "learning_rate": 0.000191894702121257, + "loss": 1.0462, + "step": 7719 + }, + { + "epoch": 0.19822795251644545, + "grad_norm": 1.0390625, + "learning_rate": 0.00019189294143336837, + "loss": 1.0005, + "step": 7720 + }, + { + "epoch": 0.19825362971236726, + "grad_norm": 0.7578125, + "learning_rate": 0.00019189118056234537, + "loss": 0.9765, + "step": 7721 + }, + { + "epoch": 0.19827930690828907, + "grad_norm": 0.79296875, + "learning_rate": 0.00019188941950819155, + "loss": 0.9385, + "step": 7722 + }, + { + "epoch": 0.1983049841042109, + "grad_norm": 0.77734375, + "learning_rate": 0.00019188765827091042, + "loss": 1.0848, + "step": 7723 + }, + { + "epoch": 0.19833066130013272, + "grad_norm": 0.76171875, + "learning_rate": 0.0001918858968505055, + "loss": 1.0876, + "step": 7724 + }, + { + "epoch": 0.19835633849605455, + "grad_norm": 0.78125, + "learning_rate": 0.00019188413524698025, + "loss": 1.1984, + "step": 7725 + }, + { + "epoch": 0.19838201569197636, + "grad_norm": 0.80078125, + "learning_rate": 0.00019188237346033825, + "loss": 0.9838, + "step": 7726 + }, + { + "epoch": 0.19840769288789817, + "grad_norm": 0.83984375, + "learning_rate": 0.00019188061149058295, + "loss": 1.0618, + "step": 7727 + }, + { + "epoch": 0.19843337008382, + "grad_norm": 0.82421875, + "learning_rate": 0.00019187884933771795, + "loss": 0.9696, + "step": 7728 + }, + { + "epoch": 0.1984590472797418, + "grad_norm": 0.828125, + "learning_rate": 0.00019187708700174665, + "loss": 1.0715, + "step": 7729 + }, + { + "epoch": 0.19848472447566365, + "grad_norm": 0.8203125, + "learning_rate": 0.00019187532448267263, + "loss": 1.1038, + "step": 7730 + }, + { + "epoch": 0.19851040167158546, + "grad_norm": 0.8671875, + "learning_rate": 0.00019187356178049938, + "loss": 1.0831, + "step": 7731 + }, + { + "epoch": 0.19853607886750727, + "grad_norm": 0.77734375, + "learning_rate": 0.0001918717988952304, + "loss": 1.0951, + "step": 7732 + }, + { + "epoch": 0.1985617560634291, + "grad_norm": 0.87890625, + "learning_rate": 0.0001918700358268693, + "loss": 1.1308, + "step": 7733 + }, + { + "epoch": 0.1985874332593509, + "grad_norm": 0.86328125, + "learning_rate": 0.00019186827257541944, + "loss": 1.0417, + "step": 7734 + }, + { + "epoch": 0.19861311045527275, + "grad_norm": 0.8125, + "learning_rate": 0.00019186650914088444, + "loss": 1.0841, + "step": 7735 + }, + { + "epoch": 0.19863878765119455, + "grad_norm": 0.859375, + "learning_rate": 0.0001918647455232678, + "loss": 1.0289, + "step": 7736 + }, + { + "epoch": 0.19866446484711636, + "grad_norm": 0.80859375, + "learning_rate": 0.000191862981722573, + "loss": 1.1111, + "step": 7737 + }, + { + "epoch": 0.1986901420430382, + "grad_norm": 0.8359375, + "learning_rate": 0.00019186121773880356, + "loss": 1.1366, + "step": 7738 + }, + { + "epoch": 0.19871581923896, + "grad_norm": 0.8359375, + "learning_rate": 0.000191859453571963, + "loss": 1.059, + "step": 7739 + }, + { + "epoch": 0.19874149643488184, + "grad_norm": 0.85546875, + "learning_rate": 0.00019185768922205492, + "loss": 1.0734, + "step": 7740 + }, + { + "epoch": 0.19876717363080365, + "grad_norm": 0.87890625, + "learning_rate": 0.0001918559246890827, + "loss": 1.1045, + "step": 7741 + }, + { + "epoch": 0.19879285082672546, + "grad_norm": 0.7734375, + "learning_rate": 0.00019185415997304995, + "loss": 0.8498, + "step": 7742 + }, + { + "epoch": 0.1988185280226473, + "grad_norm": 0.87109375, + "learning_rate": 0.00019185239507396012, + "loss": 1.2322, + "step": 7743 + }, + { + "epoch": 0.1988442052185691, + "grad_norm": 0.734375, + "learning_rate": 0.0001918506299918168, + "loss": 0.9482, + "step": 7744 + }, + { + "epoch": 0.19886988241449094, + "grad_norm": 0.74609375, + "learning_rate": 0.0001918488647266235, + "loss": 1.0335, + "step": 7745 + }, + { + "epoch": 0.19889555961041275, + "grad_norm": 0.8125, + "learning_rate": 0.00019184709927838365, + "loss": 0.9778, + "step": 7746 + }, + { + "epoch": 0.19892123680633456, + "grad_norm": 0.84375, + "learning_rate": 0.00019184533364710084, + "loss": 1.108, + "step": 7747 + }, + { + "epoch": 0.1989469140022564, + "grad_norm": 0.8359375, + "learning_rate": 0.00019184356783277857, + "loss": 1.2696, + "step": 7748 + }, + { + "epoch": 0.1989725911981782, + "grad_norm": 0.765625, + "learning_rate": 0.0001918418018354204, + "loss": 0.9185, + "step": 7749 + }, + { + "epoch": 0.1989982683941, + "grad_norm": 0.91796875, + "learning_rate": 0.0001918400356550298, + "loss": 1.169, + "step": 7750 + }, + { + "epoch": 0.19902394559002184, + "grad_norm": 0.76953125, + "learning_rate": 0.00019183826929161028, + "loss": 0.881, + "step": 7751 + }, + { + "epoch": 0.19904962278594365, + "grad_norm": 0.7734375, + "learning_rate": 0.0001918365027451654, + "loss": 1.0453, + "step": 7752 + }, + { + "epoch": 0.1990752999818655, + "grad_norm": 0.82421875, + "learning_rate": 0.00019183473601569869, + "loss": 0.9221, + "step": 7753 + }, + { + "epoch": 0.1991009771777873, + "grad_norm": 0.76953125, + "learning_rate": 0.0001918329691032136, + "loss": 0.9788, + "step": 7754 + }, + { + "epoch": 0.1991266543737091, + "grad_norm": 0.78125, + "learning_rate": 0.00019183120200771375, + "loss": 1.0323, + "step": 7755 + }, + { + "epoch": 0.19915233156963094, + "grad_norm": 0.7734375, + "learning_rate": 0.00019182943472920258, + "loss": 1.1532, + "step": 7756 + }, + { + "epoch": 0.19917800876555275, + "grad_norm": 0.859375, + "learning_rate": 0.00019182766726768364, + "loss": 1.1824, + "step": 7757 + }, + { + "epoch": 0.19920368596147459, + "grad_norm": 0.85546875, + "learning_rate": 0.00019182589962316046, + "loss": 1.0943, + "step": 7758 + }, + { + "epoch": 0.1992293631573964, + "grad_norm": 0.86328125, + "learning_rate": 0.00019182413179563655, + "loss": 1.2022, + "step": 7759 + }, + { + "epoch": 0.1992550403533182, + "grad_norm": 0.84765625, + "learning_rate": 0.00019182236378511544, + "loss": 1.0797, + "step": 7760 + }, + { + "epoch": 0.19928071754924004, + "grad_norm": 0.7421875, + "learning_rate": 0.00019182059559160066, + "loss": 1.0551, + "step": 7761 + }, + { + "epoch": 0.19930639474516185, + "grad_norm": 0.93359375, + "learning_rate": 0.00019181882721509572, + "loss": 1.0555, + "step": 7762 + }, + { + "epoch": 0.19933207194108368, + "grad_norm": 0.8046875, + "learning_rate": 0.00019181705865560416, + "loss": 1.0656, + "step": 7763 + }, + { + "epoch": 0.1993577491370055, + "grad_norm": 0.8515625, + "learning_rate": 0.0001918152899131295, + "loss": 1.2318, + "step": 7764 + }, + { + "epoch": 0.1993834263329273, + "grad_norm": 0.79296875, + "learning_rate": 0.00019181352098767524, + "loss": 1.0931, + "step": 7765 + }, + { + "epoch": 0.19940910352884914, + "grad_norm": 0.83203125, + "learning_rate": 0.00019181175187924496, + "loss": 1.0366, + "step": 7766 + }, + { + "epoch": 0.19943478072477094, + "grad_norm": 0.8359375, + "learning_rate": 0.0001918099825878421, + "loss": 0.9426, + "step": 7767 + }, + { + "epoch": 0.19946045792069278, + "grad_norm": 0.8046875, + "learning_rate": 0.00019180821311347028, + "loss": 1.1029, + "step": 7768 + }, + { + "epoch": 0.1994861351166146, + "grad_norm": 0.8046875, + "learning_rate": 0.000191806443456133, + "loss": 1.102, + "step": 7769 + }, + { + "epoch": 0.1995118123125364, + "grad_norm": 0.78125, + "learning_rate": 0.00019180467361583375, + "loss": 0.9352, + "step": 7770 + }, + { + "epoch": 0.19953748950845823, + "grad_norm": 0.796875, + "learning_rate": 0.00019180290359257604, + "loss": 0.9876, + "step": 7771 + }, + { + "epoch": 0.19956316670438004, + "grad_norm": 0.83984375, + "learning_rate": 0.0001918011333863635, + "loss": 0.9737, + "step": 7772 + }, + { + "epoch": 0.19958884390030188, + "grad_norm": 0.8125, + "learning_rate": 0.00019179936299719956, + "loss": 1.082, + "step": 7773 + }, + { + "epoch": 0.19961452109622368, + "grad_norm": 0.921875, + "learning_rate": 0.00019179759242508778, + "loss": 1.0201, + "step": 7774 + }, + { + "epoch": 0.1996401982921455, + "grad_norm": 0.74609375, + "learning_rate": 0.00019179582167003169, + "loss": 1.1005, + "step": 7775 + }, + { + "epoch": 0.19966587548806733, + "grad_norm": 0.76953125, + "learning_rate": 0.00019179405073203483, + "loss": 1.1122, + "step": 7776 + }, + { + "epoch": 0.19969155268398914, + "grad_norm": 0.86328125, + "learning_rate": 0.0001917922796111007, + "loss": 0.9648, + "step": 7777 + }, + { + "epoch": 0.19971722987991097, + "grad_norm": 0.8515625, + "learning_rate": 0.00019179050830723287, + "loss": 1.0956, + "step": 7778 + }, + { + "epoch": 0.19974290707583278, + "grad_norm": 0.79296875, + "learning_rate": 0.00019178873682043483, + "loss": 1.0512, + "step": 7779 + }, + { + "epoch": 0.1997685842717546, + "grad_norm": 0.73046875, + "learning_rate": 0.00019178696515071014, + "loss": 0.9835, + "step": 7780 + }, + { + "epoch": 0.19979426146767643, + "grad_norm": 0.921875, + "learning_rate": 0.00019178519329806232, + "loss": 0.9894, + "step": 7781 + }, + { + "epoch": 0.19981993866359823, + "grad_norm": 0.8671875, + "learning_rate": 0.0001917834212624949, + "loss": 1.1552, + "step": 7782 + }, + { + "epoch": 0.19984561585952007, + "grad_norm": 0.8203125, + "learning_rate": 0.0001917816490440114, + "loss": 1.0802, + "step": 7783 + }, + { + "epoch": 0.19987129305544188, + "grad_norm": 0.78515625, + "learning_rate": 0.00019177987664261537, + "loss": 1.0596, + "step": 7784 + }, + { + "epoch": 0.1998969702513637, + "grad_norm": 0.76171875, + "learning_rate": 0.00019177810405831034, + "loss": 0.9165, + "step": 7785 + }, + { + "epoch": 0.19992264744728552, + "grad_norm": 0.80859375, + "learning_rate": 0.00019177633129109983, + "loss": 1.0946, + "step": 7786 + }, + { + "epoch": 0.19994832464320733, + "grad_norm": 0.7734375, + "learning_rate": 0.00019177455834098737, + "loss": 0.9073, + "step": 7787 + }, + { + "epoch": 0.19997400183912917, + "grad_norm": 0.80859375, + "learning_rate": 0.00019177278520797657, + "loss": 1.0705, + "step": 7788 + }, + { + "epoch": 0.19999967903505098, + "grad_norm": 0.87890625, + "learning_rate": 0.00019177101189207086, + "loss": 0.9632, + "step": 7789 + }, + { + "epoch": 0.20002535623097278, + "grad_norm": 0.78515625, + "learning_rate": 0.0001917692383932738, + "loss": 0.9494, + "step": 7790 + }, + { + "epoch": 0.20005103342689462, + "grad_norm": 0.77734375, + "learning_rate": 0.00019176746471158892, + "loss": 1.005, + "step": 7791 + }, + { + "epoch": 0.20007671062281643, + "grad_norm": 0.80078125, + "learning_rate": 0.00019176569084701983, + "loss": 0.9687, + "step": 7792 + }, + { + "epoch": 0.20010238781873826, + "grad_norm": 0.7578125, + "learning_rate": 0.00019176391679956995, + "loss": 1.0203, + "step": 7793 + }, + { + "epoch": 0.20012806501466007, + "grad_norm": 0.7890625, + "learning_rate": 0.0001917621425692429, + "loss": 1.0254, + "step": 7794 + }, + { + "epoch": 0.20015374221058188, + "grad_norm": 0.78515625, + "learning_rate": 0.00019176036815604219, + "loss": 1.1491, + "step": 7795 + }, + { + "epoch": 0.20017941940650372, + "grad_norm": 0.84375, + "learning_rate": 0.00019175859355997137, + "loss": 1.1864, + "step": 7796 + }, + { + "epoch": 0.20020509660242553, + "grad_norm": 0.828125, + "learning_rate": 0.0001917568187810339, + "loss": 0.9288, + "step": 7797 + }, + { + "epoch": 0.20023077379834736, + "grad_norm": 0.8046875, + "learning_rate": 0.00019175504381923345, + "loss": 0.9888, + "step": 7798 + }, + { + "epoch": 0.20025645099426917, + "grad_norm": 0.79296875, + "learning_rate": 0.00019175326867457347, + "loss": 1.0744, + "step": 7799 + }, + { + "epoch": 0.20028212819019098, + "grad_norm": 0.82421875, + "learning_rate": 0.00019175149334705748, + "loss": 1.0676, + "step": 7800 + }, + { + "epoch": 0.2003078053861128, + "grad_norm": 0.765625, + "learning_rate": 0.00019174971783668908, + "loss": 1.1545, + "step": 7801 + }, + { + "epoch": 0.20033348258203462, + "grad_norm": 0.77734375, + "learning_rate": 0.00019174794214347175, + "loss": 1.0596, + "step": 7802 + }, + { + "epoch": 0.20035915977795646, + "grad_norm": 0.80859375, + "learning_rate": 0.00019174616626740912, + "loss": 1.1052, + "step": 7803 + }, + { + "epoch": 0.20038483697387827, + "grad_norm": 0.86328125, + "learning_rate": 0.0001917443902085046, + "loss": 1.0088, + "step": 7804 + }, + { + "epoch": 0.20041051416980007, + "grad_norm": 0.8359375, + "learning_rate": 0.00019174261396676183, + "loss": 1.0261, + "step": 7805 + }, + { + "epoch": 0.2004361913657219, + "grad_norm": 0.80859375, + "learning_rate": 0.0001917408375421843, + "loss": 1.0019, + "step": 7806 + }, + { + "epoch": 0.20046186856164372, + "grad_norm": 0.828125, + "learning_rate": 0.0001917390609347756, + "loss": 1.1267, + "step": 7807 + }, + { + "epoch": 0.20048754575756556, + "grad_norm": 0.79296875, + "learning_rate": 0.0001917372841445392, + "loss": 0.8947, + "step": 7808 + }, + { + "epoch": 0.20051322295348736, + "grad_norm": 0.78515625, + "learning_rate": 0.00019173550717147873, + "loss": 0.9682, + "step": 7809 + }, + { + "epoch": 0.20053890014940917, + "grad_norm": 0.79296875, + "learning_rate": 0.00019173373001559764, + "loss": 1.0702, + "step": 7810 + }, + { + "epoch": 0.200564577345331, + "grad_norm": 0.77734375, + "learning_rate": 0.00019173195267689952, + "loss": 1.1214, + "step": 7811 + }, + { + "epoch": 0.20059025454125282, + "grad_norm": 0.8359375, + "learning_rate": 0.00019173017515538792, + "loss": 1.013, + "step": 7812 + }, + { + "epoch": 0.20061593173717465, + "grad_norm": 0.83984375, + "learning_rate": 0.00019172839745106634, + "loss": 0.9856, + "step": 7813 + }, + { + "epoch": 0.20064160893309646, + "grad_norm": 0.8046875, + "learning_rate": 0.00019172661956393835, + "loss": 1.0073, + "step": 7814 + }, + { + "epoch": 0.20066728612901827, + "grad_norm": 0.76171875, + "learning_rate": 0.0001917248414940075, + "loss": 0.9271, + "step": 7815 + }, + { + "epoch": 0.2006929633249401, + "grad_norm": 0.76171875, + "learning_rate": 0.00019172306324127735, + "loss": 1.0241, + "step": 7816 + }, + { + "epoch": 0.2007186405208619, + "grad_norm": 0.8125, + "learning_rate": 0.00019172128480575141, + "loss": 0.9467, + "step": 7817 + }, + { + "epoch": 0.20074431771678375, + "grad_norm": 0.8125, + "learning_rate": 0.00019171950618743325, + "loss": 0.9448, + "step": 7818 + }, + { + "epoch": 0.20076999491270556, + "grad_norm": 0.81640625, + "learning_rate": 0.00019171772738632635, + "loss": 1.031, + "step": 7819 + }, + { + "epoch": 0.20079567210862737, + "grad_norm": 0.7578125, + "learning_rate": 0.00019171594840243432, + "loss": 1.1801, + "step": 7820 + }, + { + "epoch": 0.2008213493045492, + "grad_norm": 0.83984375, + "learning_rate": 0.00019171416923576074, + "loss": 1.0011, + "step": 7821 + }, + { + "epoch": 0.200847026500471, + "grad_norm": 0.88671875, + "learning_rate": 0.00019171238988630905, + "loss": 0.9688, + "step": 7822 + }, + { + "epoch": 0.20087270369639285, + "grad_norm": 0.80078125, + "learning_rate": 0.00019171061035408283, + "loss": 1.2182, + "step": 7823 + }, + { + "epoch": 0.20089838089231465, + "grad_norm": 0.85546875, + "learning_rate": 0.00019170883063908571, + "loss": 1.1073, + "step": 7824 + }, + { + "epoch": 0.20092405808823646, + "grad_norm": 0.77734375, + "learning_rate": 0.00019170705074132114, + "loss": 1.0262, + "step": 7825 + }, + { + "epoch": 0.2009497352841583, + "grad_norm": 0.76953125, + "learning_rate": 0.00019170527066079274, + "loss": 0.9106, + "step": 7826 + }, + { + "epoch": 0.2009754124800801, + "grad_norm": 0.78515625, + "learning_rate": 0.00019170349039750398, + "loss": 1.1274, + "step": 7827 + }, + { + "epoch": 0.20100108967600194, + "grad_norm": 0.84375, + "learning_rate": 0.00019170170995145848, + "loss": 1.1453, + "step": 7828 + }, + { + "epoch": 0.20102676687192375, + "grad_norm": 0.765625, + "learning_rate": 0.0001916999293226597, + "loss": 1.0972, + "step": 7829 + }, + { + "epoch": 0.20105244406784556, + "grad_norm": 0.76953125, + "learning_rate": 0.00019169814851111128, + "loss": 1.0009, + "step": 7830 + }, + { + "epoch": 0.2010781212637674, + "grad_norm": 0.80078125, + "learning_rate": 0.00019169636751681672, + "loss": 1.0869, + "step": 7831 + }, + { + "epoch": 0.2011037984596892, + "grad_norm": 0.7734375, + "learning_rate": 0.0001916945863397796, + "loss": 1.0535, + "step": 7832 + }, + { + "epoch": 0.20112947565561104, + "grad_norm": 0.78125, + "learning_rate": 0.00019169280498000342, + "loss": 0.942, + "step": 7833 + }, + { + "epoch": 0.20115515285153285, + "grad_norm": 0.8203125, + "learning_rate": 0.00019169102343749182, + "loss": 0.9414, + "step": 7834 + }, + { + "epoch": 0.20118083004745466, + "grad_norm": 0.74609375, + "learning_rate": 0.00019168924171224824, + "loss": 1.1162, + "step": 7835 + }, + { + "epoch": 0.2012065072433765, + "grad_norm": 0.82421875, + "learning_rate": 0.00019168745980427631, + "loss": 1.0903, + "step": 7836 + }, + { + "epoch": 0.2012321844392983, + "grad_norm": 0.83203125, + "learning_rate": 0.00019168567771357954, + "loss": 0.985, + "step": 7837 + }, + { + "epoch": 0.20125786163522014, + "grad_norm": 0.80859375, + "learning_rate": 0.00019168389544016153, + "loss": 0.9505, + "step": 7838 + }, + { + "epoch": 0.20128353883114194, + "grad_norm": 0.8984375, + "learning_rate": 0.00019168211298402574, + "loss": 1.1574, + "step": 7839 + }, + { + "epoch": 0.20130921602706375, + "grad_norm": 0.80078125, + "learning_rate": 0.00019168033034517584, + "loss": 1.0243, + "step": 7840 + }, + { + "epoch": 0.2013348932229856, + "grad_norm": 0.8984375, + "learning_rate": 0.00019167854752361526, + "loss": 1.0456, + "step": 7841 + }, + { + "epoch": 0.2013605704189074, + "grad_norm": 0.79296875, + "learning_rate": 0.00019167676451934765, + "loss": 1.1959, + "step": 7842 + }, + { + "epoch": 0.20138624761482923, + "grad_norm": 0.78515625, + "learning_rate": 0.00019167498133237655, + "loss": 1.0562, + "step": 7843 + }, + { + "epoch": 0.20141192481075104, + "grad_norm": 0.7890625, + "learning_rate": 0.00019167319796270548, + "loss": 0.9569, + "step": 7844 + }, + { + "epoch": 0.20143760200667285, + "grad_norm": 0.78125, + "learning_rate": 0.000191671414410338, + "loss": 1.1482, + "step": 7845 + }, + { + "epoch": 0.2014632792025947, + "grad_norm": 0.875, + "learning_rate": 0.00019166963067527768, + "loss": 1.0247, + "step": 7846 + }, + { + "epoch": 0.2014889563985165, + "grad_norm": 0.83984375, + "learning_rate": 0.00019166784675752805, + "loss": 1.0248, + "step": 7847 + }, + { + "epoch": 0.20151463359443833, + "grad_norm": 0.85546875, + "learning_rate": 0.0001916660626570927, + "loss": 0.9521, + "step": 7848 + }, + { + "epoch": 0.20154031079036014, + "grad_norm": 0.80078125, + "learning_rate": 0.00019166427837397516, + "loss": 1.0837, + "step": 7849 + }, + { + "epoch": 0.20156598798628195, + "grad_norm": 0.9140625, + "learning_rate": 0.000191662493908179, + "loss": 1.062, + "step": 7850 + }, + { + "epoch": 0.20159166518220378, + "grad_norm": 0.86328125, + "learning_rate": 0.00019166070925970776, + "loss": 1.1165, + "step": 7851 + }, + { + "epoch": 0.2016173423781256, + "grad_norm": 0.82421875, + "learning_rate": 0.00019165892442856501, + "loss": 1.0127, + "step": 7852 + }, + { + "epoch": 0.20164301957404743, + "grad_norm": 0.83984375, + "learning_rate": 0.0001916571394147543, + "loss": 1.1172, + "step": 7853 + }, + { + "epoch": 0.20166869676996924, + "grad_norm": 0.765625, + "learning_rate": 0.0001916553542182792, + "loss": 1.1527, + "step": 7854 + }, + { + "epoch": 0.20169437396589104, + "grad_norm": 0.87109375, + "learning_rate": 0.00019165356883914323, + "loss": 0.9721, + "step": 7855 + }, + { + "epoch": 0.20172005116181288, + "grad_norm": 0.7890625, + "learning_rate": 0.00019165178327734998, + "loss": 1.0866, + "step": 7856 + }, + { + "epoch": 0.2017457283577347, + "grad_norm": 0.7265625, + "learning_rate": 0.00019164999753290305, + "loss": 1.0578, + "step": 7857 + }, + { + "epoch": 0.20177140555365652, + "grad_norm": 0.79296875, + "learning_rate": 0.0001916482116058059, + "loss": 1.1387, + "step": 7858 + }, + { + "epoch": 0.20179708274957833, + "grad_norm": 0.80078125, + "learning_rate": 0.00019164642549606217, + "loss": 1.0841, + "step": 7859 + }, + { + "epoch": 0.20182275994550014, + "grad_norm": 0.8515625, + "learning_rate": 0.00019164463920367538, + "loss": 0.9487, + "step": 7860 + }, + { + "epoch": 0.20184843714142198, + "grad_norm": 0.8203125, + "learning_rate": 0.00019164285272864912, + "loss": 1.0793, + "step": 7861 + }, + { + "epoch": 0.20187411433734379, + "grad_norm": 0.8203125, + "learning_rate": 0.00019164106607098694, + "loss": 0.9776, + "step": 7862 + }, + { + "epoch": 0.20189979153326562, + "grad_norm": 0.85546875, + "learning_rate": 0.00019163927923069237, + "loss": 1.1705, + "step": 7863 + }, + { + "epoch": 0.20192546872918743, + "grad_norm": 0.73828125, + "learning_rate": 0.00019163749220776902, + "loss": 0.8113, + "step": 7864 + }, + { + "epoch": 0.20195114592510924, + "grad_norm": 0.79296875, + "learning_rate": 0.0001916357050022204, + "loss": 1.0344, + "step": 7865 + }, + { + "epoch": 0.20197682312103107, + "grad_norm": 0.8203125, + "learning_rate": 0.0001916339176140501, + "loss": 0.9799, + "step": 7866 + }, + { + "epoch": 0.20200250031695288, + "grad_norm": 0.79296875, + "learning_rate": 0.00019163213004326168, + "loss": 0.9744, + "step": 7867 + }, + { + "epoch": 0.20202817751287472, + "grad_norm": 0.78125, + "learning_rate": 0.0001916303422898587, + "loss": 1.0554, + "step": 7868 + }, + { + "epoch": 0.20205385470879653, + "grad_norm": 0.80078125, + "learning_rate": 0.00019162855435384476, + "loss": 1.0664, + "step": 7869 + }, + { + "epoch": 0.20207953190471833, + "grad_norm": 0.8125, + "learning_rate": 0.00019162676623522333, + "loss": 1.152, + "step": 7870 + }, + { + "epoch": 0.20210520910064017, + "grad_norm": 0.8515625, + "learning_rate": 0.00019162497793399807, + "loss": 0.9417, + "step": 7871 + }, + { + "epoch": 0.20213088629656198, + "grad_norm": 0.80859375, + "learning_rate": 0.00019162318945017251, + "loss": 1.0648, + "step": 7872 + }, + { + "epoch": 0.20215656349248382, + "grad_norm": 0.8125, + "learning_rate": 0.00019162140078375018, + "loss": 0.9962, + "step": 7873 + }, + { + "epoch": 0.20218224068840562, + "grad_norm": 0.86328125, + "learning_rate": 0.0001916196119347347, + "loss": 1.0128, + "step": 7874 + }, + { + "epoch": 0.20220791788432743, + "grad_norm": 0.8203125, + "learning_rate": 0.00019161782290312958, + "loss": 0.9306, + "step": 7875 + }, + { + "epoch": 0.20223359508024927, + "grad_norm": 0.80078125, + "learning_rate": 0.00019161603368893844, + "loss": 1.0321, + "step": 7876 + }, + { + "epoch": 0.20225927227617108, + "grad_norm": 0.83203125, + "learning_rate": 0.0001916142442921648, + "loss": 1.1142, + "step": 7877 + }, + { + "epoch": 0.2022849494720929, + "grad_norm": 0.7890625, + "learning_rate": 0.0001916124547128123, + "loss": 0.9569, + "step": 7878 + }, + { + "epoch": 0.20231062666801472, + "grad_norm": 0.80078125, + "learning_rate": 0.00019161066495088438, + "loss": 1.0135, + "step": 7879 + }, + { + "epoch": 0.20233630386393653, + "grad_norm": 0.8203125, + "learning_rate": 0.00019160887500638474, + "loss": 1.0358, + "step": 7880 + }, + { + "epoch": 0.20236198105985836, + "grad_norm": 0.80859375, + "learning_rate": 0.00019160708487931684, + "loss": 1.0282, + "step": 7881 + }, + { + "epoch": 0.20238765825578017, + "grad_norm": 0.82421875, + "learning_rate": 0.0001916052945696843, + "loss": 1.0157, + "step": 7882 + }, + { + "epoch": 0.202413335451702, + "grad_norm": 0.859375, + "learning_rate": 0.0001916035040774907, + "loss": 1.1417, + "step": 7883 + }, + { + "epoch": 0.20243901264762382, + "grad_norm": 0.7578125, + "learning_rate": 0.0001916017134027396, + "loss": 0.8907, + "step": 7884 + }, + { + "epoch": 0.20246468984354563, + "grad_norm": 0.7578125, + "learning_rate": 0.00019159992254543456, + "loss": 0.9851, + "step": 7885 + }, + { + "epoch": 0.20249036703946746, + "grad_norm": 0.8125, + "learning_rate": 0.00019159813150557912, + "loss": 1.1479, + "step": 7886 + }, + { + "epoch": 0.20251604423538927, + "grad_norm": 0.75, + "learning_rate": 0.0001915963402831769, + "loss": 0.9464, + "step": 7887 + }, + { + "epoch": 0.2025417214313111, + "grad_norm": 0.79296875, + "learning_rate": 0.00019159454887823145, + "loss": 0.9196, + "step": 7888 + }, + { + "epoch": 0.20256739862723291, + "grad_norm": 0.76953125, + "learning_rate": 0.00019159275729074634, + "loss": 1.045, + "step": 7889 + }, + { + "epoch": 0.20259307582315472, + "grad_norm": 0.8125, + "learning_rate": 0.00019159096552072514, + "loss": 0.9648, + "step": 7890 + }, + { + "epoch": 0.20261875301907656, + "grad_norm": 0.765625, + "learning_rate": 0.00019158917356817138, + "loss": 1.034, + "step": 7891 + }, + { + "epoch": 0.20264443021499837, + "grad_norm": 0.8125, + "learning_rate": 0.0001915873814330887, + "loss": 0.94, + "step": 7892 + }, + { + "epoch": 0.2026701074109202, + "grad_norm": 0.765625, + "learning_rate": 0.00019158558911548066, + "loss": 1.0179, + "step": 7893 + }, + { + "epoch": 0.202695784606842, + "grad_norm": 0.8984375, + "learning_rate": 0.00019158379661535078, + "loss": 1.0122, + "step": 7894 + }, + { + "epoch": 0.20272146180276382, + "grad_norm": 0.8046875, + "learning_rate": 0.00019158200393270267, + "loss": 1.0074, + "step": 7895 + }, + { + "epoch": 0.20274713899868566, + "grad_norm": 0.98828125, + "learning_rate": 0.00019158021106753992, + "loss": 0.9826, + "step": 7896 + }, + { + "epoch": 0.20277281619460746, + "grad_norm": 0.8125, + "learning_rate": 0.00019157841801986606, + "loss": 0.9461, + "step": 7897 + }, + { + "epoch": 0.2027984933905293, + "grad_norm": 0.76171875, + "learning_rate": 0.00019157662478968472, + "loss": 1.0955, + "step": 7898 + }, + { + "epoch": 0.2028241705864511, + "grad_norm": 0.8671875, + "learning_rate": 0.00019157483137699938, + "loss": 1.0174, + "step": 7899 + }, + { + "epoch": 0.20284984778237292, + "grad_norm": 0.7109375, + "learning_rate": 0.00019157303778181373, + "loss": 1.0404, + "step": 7900 + }, + { + "epoch": 0.20287552497829475, + "grad_norm": 1.03125, + "learning_rate": 0.00019157124400413127, + "loss": 1.0849, + "step": 7901 + }, + { + "epoch": 0.20290120217421656, + "grad_norm": 0.73046875, + "learning_rate": 0.0001915694500439556, + "loss": 0.8527, + "step": 7902 + }, + { + "epoch": 0.2029268793701384, + "grad_norm": 0.81640625, + "learning_rate": 0.00019156765590129028, + "loss": 1.194, + "step": 7903 + }, + { + "epoch": 0.2029525565660602, + "grad_norm": 0.84765625, + "learning_rate": 0.00019156586157613887, + "loss": 1.1809, + "step": 7904 + }, + { + "epoch": 0.202978233761982, + "grad_norm": 0.8359375, + "learning_rate": 0.000191564067068505, + "loss": 1.1018, + "step": 7905 + }, + { + "epoch": 0.20300391095790385, + "grad_norm": 0.79296875, + "learning_rate": 0.00019156227237839216, + "loss": 1.1169, + "step": 7906 + }, + { + "epoch": 0.20302958815382566, + "grad_norm": 0.80859375, + "learning_rate": 0.00019156047750580406, + "loss": 1.0017, + "step": 7907 + }, + { + "epoch": 0.2030552653497475, + "grad_norm": 0.7890625, + "learning_rate": 0.00019155868245074415, + "loss": 1.1513, + "step": 7908 + }, + { + "epoch": 0.2030809425456693, + "grad_norm": 0.796875, + "learning_rate": 0.00019155688721321607, + "loss": 0.9601, + "step": 7909 + }, + { + "epoch": 0.2031066197415911, + "grad_norm": 1.25, + "learning_rate": 0.0001915550917932234, + "loss": 0.9789, + "step": 7910 + }, + { + "epoch": 0.20313229693751295, + "grad_norm": 0.80078125, + "learning_rate": 0.00019155329619076968, + "loss": 0.9556, + "step": 7911 + }, + { + "epoch": 0.20315797413343475, + "grad_norm": 0.7578125, + "learning_rate": 0.0001915515004058585, + "loss": 0.9977, + "step": 7912 + }, + { + "epoch": 0.2031836513293566, + "grad_norm": 0.8125, + "learning_rate": 0.00019154970443849346, + "loss": 0.9915, + "step": 7913 + }, + { + "epoch": 0.2032093285252784, + "grad_norm": 0.87109375, + "learning_rate": 0.00019154790828867816, + "loss": 1.0272, + "step": 7914 + }, + { + "epoch": 0.2032350057212002, + "grad_norm": 0.81640625, + "learning_rate": 0.0001915461119564161, + "loss": 1.1953, + "step": 7915 + }, + { + "epoch": 0.20326068291712204, + "grad_norm": 0.7890625, + "learning_rate": 0.00019154431544171095, + "loss": 1.0026, + "step": 7916 + }, + { + "epoch": 0.20328636011304385, + "grad_norm": 0.8359375, + "learning_rate": 0.0001915425187445662, + "loss": 0.9967, + "step": 7917 + }, + { + "epoch": 0.2033120373089657, + "grad_norm": 0.93359375, + "learning_rate": 0.00019154072186498554, + "loss": 1.03, + "step": 7918 + }, + { + "epoch": 0.2033377145048875, + "grad_norm": 0.86328125, + "learning_rate": 0.00019153892480297245, + "loss": 1.0953, + "step": 7919 + }, + { + "epoch": 0.2033633917008093, + "grad_norm": 0.8671875, + "learning_rate": 0.00019153712755853056, + "loss": 0.9283, + "step": 7920 + }, + { + "epoch": 0.20338906889673114, + "grad_norm": 0.90234375, + "learning_rate": 0.00019153533013166344, + "loss": 1.1464, + "step": 7921 + }, + { + "epoch": 0.20341474609265295, + "grad_norm": 0.765625, + "learning_rate": 0.00019153353252237466, + "loss": 1.0918, + "step": 7922 + }, + { + "epoch": 0.20344042328857478, + "grad_norm": 0.7578125, + "learning_rate": 0.00019153173473066785, + "loss": 0.9842, + "step": 7923 + }, + { + "epoch": 0.2034661004844966, + "grad_norm": 0.8046875, + "learning_rate": 0.00019152993675654653, + "loss": 1.1335, + "step": 7924 + }, + { + "epoch": 0.2034917776804184, + "grad_norm": 0.73046875, + "learning_rate": 0.00019152813860001436, + "loss": 1.0045, + "step": 7925 + }, + { + "epoch": 0.20351745487634024, + "grad_norm": 0.84375, + "learning_rate": 0.00019152634026107485, + "loss": 0.9708, + "step": 7926 + }, + { + "epoch": 0.20354313207226205, + "grad_norm": 1.65625, + "learning_rate": 0.0001915245417397316, + "loss": 0.9682, + "step": 7927 + }, + { + "epoch": 0.20356880926818388, + "grad_norm": 0.84765625, + "learning_rate": 0.00019152274303598822, + "loss": 1.0, + "step": 7928 + }, + { + "epoch": 0.2035944864641057, + "grad_norm": 0.828125, + "learning_rate": 0.00019152094414984826, + "loss": 1.117, + "step": 7929 + }, + { + "epoch": 0.2036201636600275, + "grad_norm": 0.7890625, + "learning_rate": 0.00019151914508131536, + "loss": 0.9556, + "step": 7930 + }, + { + "epoch": 0.20364584085594933, + "grad_norm": 0.77734375, + "learning_rate": 0.00019151734583039304, + "loss": 1.0057, + "step": 7931 + }, + { + "epoch": 0.20367151805187114, + "grad_norm": 0.86328125, + "learning_rate": 0.00019151554639708492, + "loss": 0.9593, + "step": 7932 + }, + { + "epoch": 0.20369719524779298, + "grad_norm": 0.87890625, + "learning_rate": 0.0001915137467813946, + "loss": 1.1147, + "step": 7933 + }, + { + "epoch": 0.2037228724437148, + "grad_norm": 0.91796875, + "learning_rate": 0.00019151194698332564, + "loss": 1.1885, + "step": 7934 + }, + { + "epoch": 0.2037485496396366, + "grad_norm": 0.80859375, + "learning_rate": 0.00019151014700288164, + "loss": 1.1313, + "step": 7935 + }, + { + "epoch": 0.20377422683555843, + "grad_norm": 0.8125, + "learning_rate": 0.0001915083468400662, + "loss": 1.1751, + "step": 7936 + }, + { + "epoch": 0.20379990403148024, + "grad_norm": 0.79296875, + "learning_rate": 0.00019150654649488285, + "loss": 1.0364, + "step": 7937 + }, + { + "epoch": 0.20382558122740208, + "grad_norm": 0.77734375, + "learning_rate": 0.00019150474596733524, + "loss": 0.9206, + "step": 7938 + }, + { + "epoch": 0.20385125842332388, + "grad_norm": 0.828125, + "learning_rate": 0.00019150294525742694, + "loss": 1.2113, + "step": 7939 + }, + { + "epoch": 0.2038769356192457, + "grad_norm": 0.921875, + "learning_rate": 0.00019150114436516154, + "loss": 1.0569, + "step": 7940 + }, + { + "epoch": 0.20390261281516753, + "grad_norm": 0.8515625, + "learning_rate": 0.0001914993432905426, + "loss": 1.1269, + "step": 7941 + }, + { + "epoch": 0.20392829001108934, + "grad_norm": 0.79296875, + "learning_rate": 0.00019149754203357376, + "loss": 0.9416, + "step": 7942 + }, + { + "epoch": 0.20395396720701117, + "grad_norm": 0.7421875, + "learning_rate": 0.00019149574059425858, + "loss": 0.98, + "step": 7943 + }, + { + "epoch": 0.20397964440293298, + "grad_norm": 0.79296875, + "learning_rate": 0.00019149393897260062, + "loss": 1.2279, + "step": 7944 + }, + { + "epoch": 0.2040053215988548, + "grad_norm": 0.84375, + "learning_rate": 0.00019149213716860354, + "loss": 1.0358, + "step": 7945 + }, + { + "epoch": 0.20403099879477662, + "grad_norm": 0.84765625, + "learning_rate": 0.0001914903351822709, + "loss": 0.9995, + "step": 7946 + }, + { + "epoch": 0.20405667599069843, + "grad_norm": 0.76953125, + "learning_rate": 0.00019148853301360625, + "loss": 1.1279, + "step": 7947 + }, + { + "epoch": 0.20408235318662027, + "grad_norm": 0.85546875, + "learning_rate": 0.00019148673066261327, + "loss": 1.1481, + "step": 7948 + }, + { + "epoch": 0.20410803038254208, + "grad_norm": 0.7734375, + "learning_rate": 0.00019148492812929546, + "loss": 1.0003, + "step": 7949 + }, + { + "epoch": 0.20413370757846389, + "grad_norm": 0.75390625, + "learning_rate": 0.00019148312541365648, + "loss": 0.8517, + "step": 7950 + }, + { + "epoch": 0.20415938477438572, + "grad_norm": 0.73046875, + "learning_rate": 0.00019148132251569985, + "loss": 1.0507, + "step": 7951 + }, + { + "epoch": 0.20418506197030753, + "grad_norm": 0.8359375, + "learning_rate": 0.00019147951943542925, + "loss": 1.1222, + "step": 7952 + }, + { + "epoch": 0.20421073916622934, + "grad_norm": 0.7734375, + "learning_rate": 0.00019147771617284822, + "loss": 1.1932, + "step": 7953 + }, + { + "epoch": 0.20423641636215117, + "grad_norm": 0.9296875, + "learning_rate": 0.00019147591272796036, + "loss": 1.0079, + "step": 7954 + }, + { + "epoch": 0.20426209355807298, + "grad_norm": 0.8359375, + "learning_rate": 0.00019147410910076926, + "loss": 1.1816, + "step": 7955 + }, + { + "epoch": 0.20428777075399482, + "grad_norm": 0.875, + "learning_rate": 0.00019147230529127854, + "loss": 1.0376, + "step": 7956 + }, + { + "epoch": 0.20431344794991663, + "grad_norm": 0.78125, + "learning_rate": 0.00019147050129949175, + "loss": 1.0426, + "step": 7957 + }, + { + "epoch": 0.20433912514583844, + "grad_norm": 0.82421875, + "learning_rate": 0.00019146869712541252, + "loss": 1.0556, + "step": 7958 + }, + { + "epoch": 0.20436480234176027, + "grad_norm": 0.80078125, + "learning_rate": 0.00019146689276904448, + "loss": 0.9565, + "step": 7959 + }, + { + "epoch": 0.20439047953768208, + "grad_norm": 0.796875, + "learning_rate": 0.00019146508823039116, + "loss": 1.0127, + "step": 7960 + }, + { + "epoch": 0.20441615673360392, + "grad_norm": 0.79296875, + "learning_rate": 0.00019146328350945618, + "loss": 1.0808, + "step": 7961 + }, + { + "epoch": 0.20444183392952572, + "grad_norm": 0.86328125, + "learning_rate": 0.00019146147860624315, + "loss": 0.9713, + "step": 7962 + }, + { + "epoch": 0.20446751112544753, + "grad_norm": 0.875, + "learning_rate": 0.00019145967352075563, + "loss": 1.1327, + "step": 7963 + }, + { + "epoch": 0.20449318832136937, + "grad_norm": 0.828125, + "learning_rate": 0.00019145786825299724, + "loss": 1.0573, + "step": 7964 + }, + { + "epoch": 0.20451886551729118, + "grad_norm": 0.7734375, + "learning_rate": 0.00019145606280297162, + "loss": 1.0264, + "step": 7965 + }, + { + "epoch": 0.204544542713213, + "grad_norm": 0.85546875, + "learning_rate": 0.0001914542571706823, + "loss": 1.0834, + "step": 7966 + }, + { + "epoch": 0.20457021990913482, + "grad_norm": 0.83984375, + "learning_rate": 0.00019145245135613288, + "loss": 1.0374, + "step": 7967 + }, + { + "epoch": 0.20459589710505663, + "grad_norm": 0.84375, + "learning_rate": 0.00019145064535932703, + "loss": 1.2052, + "step": 7968 + }, + { + "epoch": 0.20462157430097846, + "grad_norm": 0.87109375, + "learning_rate": 0.00019144883918026829, + "loss": 1.0319, + "step": 7969 + }, + { + "epoch": 0.20464725149690027, + "grad_norm": 0.83984375, + "learning_rate": 0.00019144703281896028, + "loss": 1.1907, + "step": 7970 + }, + { + "epoch": 0.2046729286928221, + "grad_norm": 0.8046875, + "learning_rate": 0.00019144522627540658, + "loss": 0.9617, + "step": 7971 + }, + { + "epoch": 0.20469860588874392, + "grad_norm": 0.8046875, + "learning_rate": 0.0001914434195496108, + "loss": 0.8927, + "step": 7972 + }, + { + "epoch": 0.20472428308466573, + "grad_norm": 0.76953125, + "learning_rate": 0.00019144161264157656, + "loss": 0.9021, + "step": 7973 + }, + { + "epoch": 0.20474996028058756, + "grad_norm": 0.78125, + "learning_rate": 0.00019143980555130744, + "loss": 0.8647, + "step": 7974 + }, + { + "epoch": 0.20477563747650937, + "grad_norm": 0.8125, + "learning_rate": 0.00019143799827880703, + "loss": 1.0244, + "step": 7975 + }, + { + "epoch": 0.2048013146724312, + "grad_norm": 0.8984375, + "learning_rate": 0.00019143619082407897, + "loss": 1.2023, + "step": 7976 + }, + { + "epoch": 0.20482699186835301, + "grad_norm": 0.88671875, + "learning_rate": 0.00019143438318712684, + "loss": 1.0182, + "step": 7977 + }, + { + "epoch": 0.20485266906427482, + "grad_norm": 0.8359375, + "learning_rate": 0.00019143257536795427, + "loss": 1.1297, + "step": 7978 + }, + { + "epoch": 0.20487834626019666, + "grad_norm": 0.77734375, + "learning_rate": 0.0001914307673665648, + "loss": 0.9236, + "step": 7979 + }, + { + "epoch": 0.20490402345611847, + "grad_norm": 0.78125, + "learning_rate": 0.00019142895918296206, + "loss": 1.0644, + "step": 7980 + }, + { + "epoch": 0.2049297006520403, + "grad_norm": 0.83203125, + "learning_rate": 0.0001914271508171497, + "loss": 1.0794, + "step": 7981 + }, + { + "epoch": 0.2049553778479621, + "grad_norm": 0.83984375, + "learning_rate": 0.00019142534226913128, + "loss": 1.1889, + "step": 7982 + }, + { + "epoch": 0.20498105504388392, + "grad_norm": 0.83203125, + "learning_rate": 0.00019142353353891037, + "loss": 0.9369, + "step": 7983 + }, + { + "epoch": 0.20500673223980576, + "grad_norm": 0.83984375, + "learning_rate": 0.00019142172462649067, + "loss": 1.1351, + "step": 7984 + }, + { + "epoch": 0.20503240943572756, + "grad_norm": 0.796875, + "learning_rate": 0.0001914199155318757, + "loss": 0.9375, + "step": 7985 + }, + { + "epoch": 0.2050580866316494, + "grad_norm": 0.80078125, + "learning_rate": 0.00019141810625506912, + "loss": 1.0098, + "step": 7986 + }, + { + "epoch": 0.2050837638275712, + "grad_norm": 0.8359375, + "learning_rate": 0.00019141629679607448, + "loss": 1.0062, + "step": 7987 + }, + { + "epoch": 0.20510944102349302, + "grad_norm": 0.80078125, + "learning_rate": 0.00019141448715489544, + "loss": 1.0156, + "step": 7988 + }, + { + "epoch": 0.20513511821941485, + "grad_norm": 0.82421875, + "learning_rate": 0.00019141267733153558, + "loss": 1.0619, + "step": 7989 + }, + { + "epoch": 0.20516079541533666, + "grad_norm": 0.8203125, + "learning_rate": 0.0001914108673259985, + "loss": 0.9761, + "step": 7990 + }, + { + "epoch": 0.2051864726112585, + "grad_norm": 0.8984375, + "learning_rate": 0.00019140905713828783, + "loss": 1.1458, + "step": 7991 + }, + { + "epoch": 0.2052121498071803, + "grad_norm": 0.78515625, + "learning_rate": 0.00019140724676840716, + "loss": 1.1126, + "step": 7992 + }, + { + "epoch": 0.2052378270031021, + "grad_norm": 0.87890625, + "learning_rate": 0.0001914054362163601, + "loss": 1.0954, + "step": 7993 + }, + { + "epoch": 0.20526350419902395, + "grad_norm": 0.7265625, + "learning_rate": 0.00019140362548215027, + "loss": 0.8008, + "step": 7994 + }, + { + "epoch": 0.20528918139494576, + "grad_norm": 0.80078125, + "learning_rate": 0.0001914018145657813, + "loss": 1.0457, + "step": 7995 + }, + { + "epoch": 0.2053148585908676, + "grad_norm": 0.82421875, + "learning_rate": 0.00019140000346725672, + "loss": 1.123, + "step": 7996 + }, + { + "epoch": 0.2053405357867894, + "grad_norm": 0.75390625, + "learning_rate": 0.00019139819218658022, + "loss": 0.9748, + "step": 7997 + }, + { + "epoch": 0.2053662129827112, + "grad_norm": 0.86328125, + "learning_rate": 0.00019139638072375537, + "loss": 1.0205, + "step": 7998 + }, + { + "epoch": 0.20539189017863305, + "grad_norm": 0.953125, + "learning_rate": 0.00019139456907878578, + "loss": 1.1858, + "step": 7999 + }, + { + "epoch": 0.20541756737455485, + "grad_norm": 0.7578125, + "learning_rate": 0.00019139275725167508, + "loss": 1.131, + "step": 8000 + }, + { + "epoch": 0.20541756737455485, + "eval_loss": 1.038010835647583, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 404.8423, + "eval_samples_per_second": 24.701, + "eval_steps_per_second": 0.773, + "step": 8000 + }, + { + "epoch": 0.2054432445704767, + "grad_norm": 0.81640625, + "learning_rate": 0.00019139094524242686, + "loss": 1.116, + "step": 8001 + }, + { + "epoch": 0.2054689217663985, + "grad_norm": 0.81640625, + "learning_rate": 0.00019138913305104474, + "loss": 1.0927, + "step": 8002 + }, + { + "epoch": 0.2054945989623203, + "grad_norm": 0.8203125, + "learning_rate": 0.00019138732067753233, + "loss": 1.0554, + "step": 8003 + }, + { + "epoch": 0.20552027615824214, + "grad_norm": 0.875, + "learning_rate": 0.00019138550812189327, + "loss": 1.0614, + "step": 8004 + }, + { + "epoch": 0.20554595335416395, + "grad_norm": 0.81640625, + "learning_rate": 0.00019138369538413113, + "loss": 0.9675, + "step": 8005 + }, + { + "epoch": 0.2055716305500858, + "grad_norm": 0.796875, + "learning_rate": 0.00019138188246424956, + "loss": 0.996, + "step": 8006 + }, + { + "epoch": 0.2055973077460076, + "grad_norm": 0.89453125, + "learning_rate": 0.00019138006936225214, + "loss": 1.1084, + "step": 8007 + }, + { + "epoch": 0.2056229849419294, + "grad_norm": 0.89453125, + "learning_rate": 0.0001913782560781425, + "loss": 1.1995, + "step": 8008 + }, + { + "epoch": 0.20564866213785124, + "grad_norm": 0.796875, + "learning_rate": 0.00019137644261192424, + "loss": 0.9029, + "step": 8009 + }, + { + "epoch": 0.20567433933377305, + "grad_norm": 0.82421875, + "learning_rate": 0.00019137462896360097, + "loss": 0.9828, + "step": 8010 + }, + { + "epoch": 0.20570001652969488, + "grad_norm": 0.83203125, + "learning_rate": 0.00019137281513317633, + "loss": 0.8547, + "step": 8011 + }, + { + "epoch": 0.2057256937256167, + "grad_norm": 0.84375, + "learning_rate": 0.00019137100112065394, + "loss": 1.15, + "step": 8012 + }, + { + "epoch": 0.2057513709215385, + "grad_norm": 0.796875, + "learning_rate": 0.00019136918692603736, + "loss": 1.0153, + "step": 8013 + }, + { + "epoch": 0.20577704811746034, + "grad_norm": 0.78125, + "learning_rate": 0.0001913673725493303, + "loss": 0.9804, + "step": 8014 + }, + { + "epoch": 0.20580272531338215, + "grad_norm": 0.796875, + "learning_rate": 0.00019136555799053627, + "loss": 0.8899, + "step": 8015 + }, + { + "epoch": 0.20582840250930398, + "grad_norm": 0.8046875, + "learning_rate": 0.00019136374324965895, + "loss": 1.036, + "step": 8016 + }, + { + "epoch": 0.2058540797052258, + "grad_norm": 0.75, + "learning_rate": 0.00019136192832670194, + "loss": 1.0029, + "step": 8017 + }, + { + "epoch": 0.2058797569011476, + "grad_norm": 0.76171875, + "learning_rate": 0.00019136011322166887, + "loss": 1.0905, + "step": 8018 + }, + { + "epoch": 0.20590543409706943, + "grad_norm": 0.78125, + "learning_rate": 0.00019135829793456334, + "loss": 1.0147, + "step": 8019 + }, + { + "epoch": 0.20593111129299124, + "grad_norm": 0.83984375, + "learning_rate": 0.00019135648246538899, + "loss": 0.9724, + "step": 8020 + }, + { + "epoch": 0.20595678848891308, + "grad_norm": 0.78515625, + "learning_rate": 0.00019135466681414939, + "loss": 0.8668, + "step": 8021 + }, + { + "epoch": 0.2059824656848349, + "grad_norm": 0.78125, + "learning_rate": 0.00019135285098084823, + "loss": 1.0945, + "step": 8022 + }, + { + "epoch": 0.2060081428807567, + "grad_norm": 0.88671875, + "learning_rate": 0.00019135103496548909, + "loss": 1.0605, + "step": 8023 + }, + { + "epoch": 0.20603382007667853, + "grad_norm": 0.8671875, + "learning_rate": 0.00019134921876807557, + "loss": 1.0285, + "step": 8024 + }, + { + "epoch": 0.20605949727260034, + "grad_norm": 0.7890625, + "learning_rate": 0.0001913474023886113, + "loss": 1.0314, + "step": 8025 + }, + { + "epoch": 0.20608517446852218, + "grad_norm": 0.8203125, + "learning_rate": 0.0001913455858270999, + "loss": 1.0328, + "step": 8026 + }, + { + "epoch": 0.20611085166444398, + "grad_norm": 0.7578125, + "learning_rate": 0.00019134376908354505, + "loss": 0.9257, + "step": 8027 + }, + { + "epoch": 0.2061365288603658, + "grad_norm": 0.765625, + "learning_rate": 0.00019134195215795026, + "loss": 1.1135, + "step": 8028 + }, + { + "epoch": 0.20616220605628763, + "grad_norm": 0.8125, + "learning_rate": 0.00019134013505031926, + "loss": 1.1573, + "step": 8029 + }, + { + "epoch": 0.20618788325220944, + "grad_norm": 0.75, + "learning_rate": 0.0001913383177606556, + "loss": 0.9861, + "step": 8030 + }, + { + "epoch": 0.20621356044813127, + "grad_norm": 0.8515625, + "learning_rate": 0.0001913365002889629, + "loss": 1.0478, + "step": 8031 + }, + { + "epoch": 0.20623923764405308, + "grad_norm": 0.84765625, + "learning_rate": 0.00019133468263524483, + "loss": 1.0244, + "step": 8032 + }, + { + "epoch": 0.2062649148399749, + "grad_norm": 0.72265625, + "learning_rate": 0.000191332864799505, + "loss": 1.0029, + "step": 8033 + }, + { + "epoch": 0.20629059203589672, + "grad_norm": 0.7734375, + "learning_rate": 0.00019133104678174698, + "loss": 1.3331, + "step": 8034 + }, + { + "epoch": 0.20631626923181853, + "grad_norm": 0.84375, + "learning_rate": 0.00019132922858197446, + "loss": 1.1568, + "step": 8035 + }, + { + "epoch": 0.20634194642774037, + "grad_norm": 0.8203125, + "learning_rate": 0.00019132741020019104, + "loss": 0.9361, + "step": 8036 + }, + { + "epoch": 0.20636762362366218, + "grad_norm": 0.76171875, + "learning_rate": 0.00019132559163640033, + "loss": 0.8697, + "step": 8037 + }, + { + "epoch": 0.20639330081958399, + "grad_norm": 0.79296875, + "learning_rate": 0.00019132377289060598, + "loss": 1.0241, + "step": 8038 + }, + { + "epoch": 0.20641897801550582, + "grad_norm": 0.8046875, + "learning_rate": 0.0001913219539628116, + "loss": 1.0121, + "step": 8039 + }, + { + "epoch": 0.20644465521142763, + "grad_norm": 0.75, + "learning_rate": 0.0001913201348530208, + "loss": 0.8954, + "step": 8040 + }, + { + "epoch": 0.20647033240734947, + "grad_norm": 0.79296875, + "learning_rate": 0.00019131831556123722, + "loss": 1.0843, + "step": 8041 + }, + { + "epoch": 0.20649600960327127, + "grad_norm": 0.72265625, + "learning_rate": 0.00019131649608746448, + "loss": 1.1213, + "step": 8042 + }, + { + "epoch": 0.20652168679919308, + "grad_norm": 0.74609375, + "learning_rate": 0.00019131467643170624, + "loss": 0.9842, + "step": 8043 + }, + { + "epoch": 0.20654736399511492, + "grad_norm": 0.82421875, + "learning_rate": 0.0001913128565939661, + "loss": 1.0779, + "step": 8044 + }, + { + "epoch": 0.20657304119103673, + "grad_norm": 0.82421875, + "learning_rate": 0.00019131103657424764, + "loss": 1.0418, + "step": 8045 + }, + { + "epoch": 0.20659871838695856, + "grad_norm": 0.7734375, + "learning_rate": 0.00019130921637255458, + "loss": 0.9214, + "step": 8046 + }, + { + "epoch": 0.20662439558288037, + "grad_norm": 0.859375, + "learning_rate": 0.00019130739598889046, + "loss": 1.1524, + "step": 8047 + }, + { + "epoch": 0.20665007277880218, + "grad_norm": 0.8359375, + "learning_rate": 0.00019130557542325897, + "loss": 1.0605, + "step": 8048 + }, + { + "epoch": 0.20667574997472402, + "grad_norm": 0.83984375, + "learning_rate": 0.0001913037546756637, + "loss": 1.0532, + "step": 8049 + }, + { + "epoch": 0.20670142717064582, + "grad_norm": 0.828125, + "learning_rate": 0.00019130193374610835, + "loss": 1.0683, + "step": 8050 + }, + { + "epoch": 0.20672710436656766, + "grad_norm": 0.80078125, + "learning_rate": 0.00019130011263459643, + "loss": 0.9846, + "step": 8051 + }, + { + "epoch": 0.20675278156248947, + "grad_norm": 0.83203125, + "learning_rate": 0.00019129829134113165, + "loss": 1.097, + "step": 8052 + }, + { + "epoch": 0.20677845875841128, + "grad_norm": 0.8359375, + "learning_rate": 0.00019129646986571763, + "loss": 1.1127, + "step": 8053 + }, + { + "epoch": 0.2068041359543331, + "grad_norm": 0.76171875, + "learning_rate": 0.00019129464820835798, + "loss": 0.8658, + "step": 8054 + }, + { + "epoch": 0.20682981315025492, + "grad_norm": 0.8359375, + "learning_rate": 0.00019129282636905638, + "loss": 1.0536, + "step": 8055 + }, + { + "epoch": 0.20685549034617676, + "grad_norm": 0.7890625, + "learning_rate": 0.00019129100434781636, + "loss": 1.0836, + "step": 8056 + }, + { + "epoch": 0.20688116754209857, + "grad_norm": 0.8203125, + "learning_rate": 0.00019128918214464163, + "loss": 0.9535, + "step": 8057 + }, + { + "epoch": 0.20690684473802037, + "grad_norm": 0.73828125, + "learning_rate": 0.00019128735975953586, + "loss": 1.0701, + "step": 8058 + }, + { + "epoch": 0.2069325219339422, + "grad_norm": 0.83203125, + "learning_rate": 0.0001912855371925026, + "loss": 1.1281, + "step": 8059 + }, + { + "epoch": 0.20695819912986402, + "grad_norm": 0.75390625, + "learning_rate": 0.0001912837144435455, + "loss": 0.9238, + "step": 8060 + }, + { + "epoch": 0.20698387632578585, + "grad_norm": 0.74609375, + "learning_rate": 0.0001912818915126682, + "loss": 0.8374, + "step": 8061 + }, + { + "epoch": 0.20700955352170766, + "grad_norm": 0.8359375, + "learning_rate": 0.00019128006839987433, + "loss": 0.9854, + "step": 8062 + }, + { + "epoch": 0.20703523071762947, + "grad_norm": 0.76953125, + "learning_rate": 0.00019127824510516755, + "loss": 1.007, + "step": 8063 + }, + { + "epoch": 0.2070609079135513, + "grad_norm": 0.75, + "learning_rate": 0.00019127642162855145, + "loss": 0.9887, + "step": 8064 + }, + { + "epoch": 0.20708658510947311, + "grad_norm": 0.87890625, + "learning_rate": 0.0001912745979700297, + "loss": 1.047, + "step": 8065 + }, + { + "epoch": 0.20711226230539495, + "grad_norm": 0.8046875, + "learning_rate": 0.0001912727741296059, + "loss": 1.0986, + "step": 8066 + }, + { + "epoch": 0.20713793950131676, + "grad_norm": 0.78515625, + "learning_rate": 0.00019127095010728376, + "loss": 0.8369, + "step": 8067 + }, + { + "epoch": 0.20716361669723857, + "grad_norm": 0.71875, + "learning_rate": 0.00019126912590306682, + "loss": 1.0421, + "step": 8068 + }, + { + "epoch": 0.2071892938931604, + "grad_norm": 0.8046875, + "learning_rate": 0.00019126730151695876, + "loss": 1.1018, + "step": 8069 + }, + { + "epoch": 0.2072149710890822, + "grad_norm": 0.83984375, + "learning_rate": 0.0001912654769489632, + "loss": 0.9782, + "step": 8070 + }, + { + "epoch": 0.20724064828500405, + "grad_norm": 0.8125, + "learning_rate": 0.00019126365219908383, + "loss": 0.9386, + "step": 8071 + }, + { + "epoch": 0.20726632548092586, + "grad_norm": 0.83203125, + "learning_rate": 0.0001912618272673242, + "loss": 1.0366, + "step": 8072 + }, + { + "epoch": 0.20729200267684766, + "grad_norm": 0.859375, + "learning_rate": 0.00019126000215368802, + "loss": 0.9391, + "step": 8073 + }, + { + "epoch": 0.2073176798727695, + "grad_norm": 0.8203125, + "learning_rate": 0.00019125817685817888, + "loss": 0.9942, + "step": 8074 + }, + { + "epoch": 0.2073433570686913, + "grad_norm": 1.03125, + "learning_rate": 0.00019125635138080046, + "loss": 1.1366, + "step": 8075 + }, + { + "epoch": 0.20736903426461314, + "grad_norm": 0.82421875, + "learning_rate": 0.00019125452572155637, + "loss": 1.0851, + "step": 8076 + }, + { + "epoch": 0.20739471146053495, + "grad_norm": 0.84375, + "learning_rate": 0.00019125269988045024, + "loss": 0.946, + "step": 8077 + }, + { + "epoch": 0.20742038865645676, + "grad_norm": 0.85546875, + "learning_rate": 0.00019125087385748574, + "loss": 1.0769, + "step": 8078 + }, + { + "epoch": 0.2074460658523786, + "grad_norm": 0.85546875, + "learning_rate": 0.00019124904765266648, + "loss": 1.0885, + "step": 8079 + }, + { + "epoch": 0.2074717430483004, + "grad_norm": 0.75390625, + "learning_rate": 0.0001912472212659961, + "loss": 1.063, + "step": 8080 + }, + { + "epoch": 0.20749742024422224, + "grad_norm": 0.8828125, + "learning_rate": 0.00019124539469747825, + "loss": 1.1091, + "step": 8081 + }, + { + "epoch": 0.20752309744014405, + "grad_norm": 0.89453125, + "learning_rate": 0.0001912435679471166, + "loss": 1.0789, + "step": 8082 + }, + { + "epoch": 0.20754877463606586, + "grad_norm": 0.796875, + "learning_rate": 0.00019124174101491472, + "loss": 0.9917, + "step": 8083 + }, + { + "epoch": 0.2075744518319877, + "grad_norm": 0.77734375, + "learning_rate": 0.0001912399139008763, + "loss": 0.9004, + "step": 8084 + }, + { + "epoch": 0.2076001290279095, + "grad_norm": 0.734375, + "learning_rate": 0.00019123808660500503, + "loss": 0.9719, + "step": 8085 + }, + { + "epoch": 0.20762580622383134, + "grad_norm": 0.86328125, + "learning_rate": 0.00019123625912730442, + "loss": 1.1041, + "step": 8086 + }, + { + "epoch": 0.20765148341975315, + "grad_norm": 0.8515625, + "learning_rate": 0.00019123443146777823, + "loss": 1.1403, + "step": 8087 + }, + { + "epoch": 0.20767716061567496, + "grad_norm": 0.7734375, + "learning_rate": 0.00019123260362643004, + "loss": 1.0173, + "step": 8088 + }, + { + "epoch": 0.2077028378115968, + "grad_norm": 0.75, + "learning_rate": 0.0001912307756032635, + "loss": 1.0861, + "step": 8089 + }, + { + "epoch": 0.2077285150075186, + "grad_norm": 0.8203125, + "learning_rate": 0.0001912289473982823, + "loss": 1.0214, + "step": 8090 + }, + { + "epoch": 0.20775419220344044, + "grad_norm": 0.86328125, + "learning_rate": 0.00019122711901149004, + "loss": 1.1676, + "step": 8091 + }, + { + "epoch": 0.20777986939936224, + "grad_norm": 0.84375, + "learning_rate": 0.00019122529044289036, + "loss": 1.0264, + "step": 8092 + }, + { + "epoch": 0.20780554659528405, + "grad_norm": 0.91015625, + "learning_rate": 0.00019122346169248692, + "loss": 1.0472, + "step": 8093 + }, + { + "epoch": 0.2078312237912059, + "grad_norm": 0.89453125, + "learning_rate": 0.00019122163276028337, + "loss": 1.0199, + "step": 8094 + }, + { + "epoch": 0.2078569009871277, + "grad_norm": 0.8515625, + "learning_rate": 0.00019121980364628332, + "loss": 1.0681, + "step": 8095 + }, + { + "epoch": 0.20788257818304953, + "grad_norm": 0.80078125, + "learning_rate": 0.00019121797435049046, + "loss": 0.9338, + "step": 8096 + }, + { + "epoch": 0.20790825537897134, + "grad_norm": 0.73046875, + "learning_rate": 0.00019121614487290845, + "loss": 1.0921, + "step": 8097 + }, + { + "epoch": 0.20793393257489315, + "grad_norm": 0.875, + "learning_rate": 0.00019121431521354086, + "loss": 1.0198, + "step": 8098 + }, + { + "epoch": 0.20795960977081498, + "grad_norm": 0.84765625, + "learning_rate": 0.00019121248537239138, + "loss": 1.1139, + "step": 8099 + }, + { + "epoch": 0.2079852869667368, + "grad_norm": 0.76171875, + "learning_rate": 0.00019121065534946364, + "loss": 1.0422, + "step": 8100 + }, + { + "epoch": 0.20801096416265863, + "grad_norm": 0.76953125, + "learning_rate": 0.00019120882514476136, + "loss": 0.9961, + "step": 8101 + }, + { + "epoch": 0.20803664135858044, + "grad_norm": 0.83984375, + "learning_rate": 0.00019120699475828808, + "loss": 1.0864, + "step": 8102 + }, + { + "epoch": 0.20806231855450225, + "grad_norm": 0.80859375, + "learning_rate": 0.0001912051641900475, + "loss": 1.1226, + "step": 8103 + }, + { + "epoch": 0.20808799575042408, + "grad_norm": 0.8515625, + "learning_rate": 0.0001912033334400433, + "loss": 1.0398, + "step": 8104 + }, + { + "epoch": 0.2081136729463459, + "grad_norm": 0.80859375, + "learning_rate": 0.00019120150250827908, + "loss": 0.9449, + "step": 8105 + }, + { + "epoch": 0.20813935014226773, + "grad_norm": 0.87109375, + "learning_rate": 0.0001911996713947585, + "loss": 0.9563, + "step": 8106 + }, + { + "epoch": 0.20816502733818953, + "grad_norm": 0.82421875, + "learning_rate": 0.0001911978400994852, + "loss": 1.066, + "step": 8107 + }, + { + "epoch": 0.20819070453411134, + "grad_norm": 0.84375, + "learning_rate": 0.00019119600862246285, + "loss": 1.1004, + "step": 8108 + }, + { + "epoch": 0.20821638173003318, + "grad_norm": 0.86328125, + "learning_rate": 0.00019119417696369513, + "loss": 1.0249, + "step": 8109 + }, + { + "epoch": 0.208242058925955, + "grad_norm": 0.796875, + "learning_rate": 0.0001911923451231856, + "loss": 1.1011, + "step": 8110 + }, + { + "epoch": 0.20826773612187682, + "grad_norm": 0.796875, + "learning_rate": 0.00019119051310093798, + "loss": 1.0533, + "step": 8111 + }, + { + "epoch": 0.20829341331779863, + "grad_norm": 0.78125, + "learning_rate": 0.0001911886808969559, + "loss": 1.0502, + "step": 8112 + }, + { + "epoch": 0.20831909051372044, + "grad_norm": 0.81640625, + "learning_rate": 0.00019118684851124305, + "loss": 1.1615, + "step": 8113 + }, + { + "epoch": 0.20834476770964228, + "grad_norm": 0.75, + "learning_rate": 0.00019118501594380302, + "loss": 0.9136, + "step": 8114 + }, + { + "epoch": 0.20837044490556408, + "grad_norm": 0.80859375, + "learning_rate": 0.0001911831831946395, + "loss": 1.0915, + "step": 8115 + }, + { + "epoch": 0.20839612210148592, + "grad_norm": 0.9375, + "learning_rate": 0.0001911813502637561, + "loss": 1.0078, + "step": 8116 + }, + { + "epoch": 0.20842179929740773, + "grad_norm": 0.80078125, + "learning_rate": 0.00019117951715115657, + "loss": 1.1058, + "step": 8117 + }, + { + "epoch": 0.20844747649332954, + "grad_norm": 1.03125, + "learning_rate": 0.00019117768385684442, + "loss": 1.0897, + "step": 8118 + }, + { + "epoch": 0.20847315368925137, + "grad_norm": 0.80859375, + "learning_rate": 0.00019117585038082343, + "loss": 0.9739, + "step": 8119 + }, + { + "epoch": 0.20849883088517318, + "grad_norm": 0.86328125, + "learning_rate": 0.0001911740167230972, + "loss": 1.0145, + "step": 8120 + }, + { + "epoch": 0.20852450808109502, + "grad_norm": 0.84375, + "learning_rate": 0.0001911721828836694, + "loss": 1.0702, + "step": 8121 + }, + { + "epoch": 0.20855018527701683, + "grad_norm": 0.7578125, + "learning_rate": 0.00019117034886254364, + "loss": 0.9201, + "step": 8122 + }, + { + "epoch": 0.20857586247293863, + "grad_norm": 0.8359375, + "learning_rate": 0.00019116851465972366, + "loss": 1.0679, + "step": 8123 + }, + { + "epoch": 0.20860153966886047, + "grad_norm": 0.9375, + "learning_rate": 0.00019116668027521302, + "loss": 0.9671, + "step": 8124 + }, + { + "epoch": 0.20862721686478228, + "grad_norm": 0.78125, + "learning_rate": 0.00019116484570901545, + "loss": 0.9311, + "step": 8125 + }, + { + "epoch": 0.2086528940607041, + "grad_norm": 0.84375, + "learning_rate": 0.00019116301096113456, + "loss": 1.0572, + "step": 8126 + }, + { + "epoch": 0.20867857125662592, + "grad_norm": 0.8046875, + "learning_rate": 0.00019116117603157405, + "loss": 1.0568, + "step": 8127 + }, + { + "epoch": 0.20870424845254773, + "grad_norm": 0.81640625, + "learning_rate": 0.00019115934092033752, + "loss": 0.9447, + "step": 8128 + }, + { + "epoch": 0.20872992564846957, + "grad_norm": 1.0, + "learning_rate": 0.00019115750562742867, + "loss": 1.0254, + "step": 8129 + }, + { + "epoch": 0.20875560284439137, + "grad_norm": 0.8515625, + "learning_rate": 0.00019115567015285114, + "loss": 1.1939, + "step": 8130 + }, + { + "epoch": 0.2087812800403132, + "grad_norm": 0.78515625, + "learning_rate": 0.0001911538344966086, + "loss": 1.0571, + "step": 8131 + }, + { + "epoch": 0.20880695723623502, + "grad_norm": 0.765625, + "learning_rate": 0.0001911519986587047, + "loss": 0.9996, + "step": 8132 + }, + { + "epoch": 0.20883263443215683, + "grad_norm": 0.921875, + "learning_rate": 0.0001911501626391431, + "loss": 1.0065, + "step": 8133 + }, + { + "epoch": 0.20885831162807866, + "grad_norm": 0.859375, + "learning_rate": 0.00019114832643792745, + "loss": 1.1657, + "step": 8134 + }, + { + "epoch": 0.20888398882400047, + "grad_norm": 0.82421875, + "learning_rate": 0.00019114649005506142, + "loss": 1.0293, + "step": 8135 + }, + { + "epoch": 0.2089096660199223, + "grad_norm": 0.85546875, + "learning_rate": 0.00019114465349054867, + "loss": 1.2605, + "step": 8136 + }, + { + "epoch": 0.20893534321584412, + "grad_norm": 0.8203125, + "learning_rate": 0.0001911428167443929, + "loss": 1.0798, + "step": 8137 + }, + { + "epoch": 0.20896102041176592, + "grad_norm": 0.81640625, + "learning_rate": 0.0001911409798165977, + "loss": 1.0367, + "step": 8138 + }, + { + "epoch": 0.20898669760768776, + "grad_norm": 0.921875, + "learning_rate": 0.00019113914270716675, + "loss": 0.9337, + "step": 8139 + }, + { + "epoch": 0.20901237480360957, + "grad_norm": 0.7421875, + "learning_rate": 0.00019113730541610373, + "loss": 1.1258, + "step": 8140 + }, + { + "epoch": 0.2090380519995314, + "grad_norm": 0.8515625, + "learning_rate": 0.00019113546794341228, + "loss": 1.0588, + "step": 8141 + }, + { + "epoch": 0.2090637291954532, + "grad_norm": 0.796875, + "learning_rate": 0.00019113363028909608, + "loss": 1.1393, + "step": 8142 + }, + { + "epoch": 0.20908940639137502, + "grad_norm": 0.890625, + "learning_rate": 0.0001911317924531588, + "loss": 1.0997, + "step": 8143 + }, + { + "epoch": 0.20911508358729686, + "grad_norm": 0.8984375, + "learning_rate": 0.0001911299544356041, + "loss": 1.2124, + "step": 8144 + }, + { + "epoch": 0.20914076078321867, + "grad_norm": 0.83984375, + "learning_rate": 0.0001911281162364356, + "loss": 1.0024, + "step": 8145 + }, + { + "epoch": 0.2091664379791405, + "grad_norm": 0.75390625, + "learning_rate": 0.000191126277855657, + "loss": 1.0379, + "step": 8146 + }, + { + "epoch": 0.2091921151750623, + "grad_norm": 0.796875, + "learning_rate": 0.00019112443929327196, + "loss": 1.0996, + "step": 8147 + }, + { + "epoch": 0.20921779237098412, + "grad_norm": 0.87890625, + "learning_rate": 0.00019112260054928413, + "loss": 1.0635, + "step": 8148 + }, + { + "epoch": 0.20924346956690595, + "grad_norm": 0.89453125, + "learning_rate": 0.00019112076162369724, + "loss": 0.8901, + "step": 8149 + }, + { + "epoch": 0.20926914676282776, + "grad_norm": 0.76953125, + "learning_rate": 0.00019111892251651486, + "loss": 0.9742, + "step": 8150 + }, + { + "epoch": 0.2092948239587496, + "grad_norm": 0.8359375, + "learning_rate": 0.00019111708322774073, + "loss": 1.0481, + "step": 8151 + }, + { + "epoch": 0.2093205011546714, + "grad_norm": 0.765625, + "learning_rate": 0.00019111524375737843, + "loss": 0.9652, + "step": 8152 + }, + { + "epoch": 0.20934617835059322, + "grad_norm": 0.8671875, + "learning_rate": 0.00019111340410543176, + "loss": 1.1146, + "step": 8153 + }, + { + "epoch": 0.20937185554651505, + "grad_norm": 0.78515625, + "learning_rate": 0.00019111156427190423, + "loss": 0.9694, + "step": 8154 + }, + { + "epoch": 0.20939753274243686, + "grad_norm": 0.87890625, + "learning_rate": 0.00019110972425679962, + "loss": 1.0426, + "step": 8155 + }, + { + "epoch": 0.2094232099383587, + "grad_norm": 0.78125, + "learning_rate": 0.00019110788406012156, + "loss": 1.0383, + "step": 8156 + }, + { + "epoch": 0.2094488871342805, + "grad_norm": 0.80078125, + "learning_rate": 0.0001911060436818737, + "loss": 1.0423, + "step": 8157 + }, + { + "epoch": 0.2094745643302023, + "grad_norm": 0.83203125, + "learning_rate": 0.00019110420312205975, + "loss": 1.0382, + "step": 8158 + }, + { + "epoch": 0.20950024152612415, + "grad_norm": 0.81640625, + "learning_rate": 0.0001911023623806833, + "loss": 1.0015, + "step": 8159 + }, + { + "epoch": 0.20952591872204596, + "grad_norm": 0.7734375, + "learning_rate": 0.00019110052145774814, + "loss": 1.0658, + "step": 8160 + }, + { + "epoch": 0.20955159591796776, + "grad_norm": 0.7734375, + "learning_rate": 0.00019109868035325781, + "loss": 1.0888, + "step": 8161 + }, + { + "epoch": 0.2095772731138896, + "grad_norm": 0.9921875, + "learning_rate": 0.0001910968390672161, + "loss": 1.2058, + "step": 8162 + }, + { + "epoch": 0.2096029503098114, + "grad_norm": 0.80859375, + "learning_rate": 0.00019109499759962657, + "loss": 1.0726, + "step": 8163 + }, + { + "epoch": 0.20962862750573324, + "grad_norm": 0.8515625, + "learning_rate": 0.00019109315595049295, + "loss": 1.0799, + "step": 8164 + }, + { + "epoch": 0.20965430470165505, + "grad_norm": 0.84375, + "learning_rate": 0.00019109131411981892, + "loss": 1.1652, + "step": 8165 + }, + { + "epoch": 0.20967998189757686, + "grad_norm": 0.78125, + "learning_rate": 0.0001910894721076081, + "loss": 0.9864, + "step": 8166 + }, + { + "epoch": 0.2097056590934987, + "grad_norm": 0.78125, + "learning_rate": 0.0001910876299138642, + "loss": 1.0531, + "step": 8167 + }, + { + "epoch": 0.2097313362894205, + "grad_norm": 0.8359375, + "learning_rate": 0.00019108578753859086, + "loss": 1.1386, + "step": 8168 + }, + { + "epoch": 0.20975701348534234, + "grad_norm": 0.78125, + "learning_rate": 0.00019108394498179183, + "loss": 0.9706, + "step": 8169 + }, + { + "epoch": 0.20978269068126415, + "grad_norm": 0.8125, + "learning_rate": 0.00019108210224347069, + "loss": 1.163, + "step": 8170 + }, + { + "epoch": 0.20980836787718596, + "grad_norm": 0.7421875, + "learning_rate": 0.0001910802593236311, + "loss": 0.99, + "step": 8171 + }, + { + "epoch": 0.2098340450731078, + "grad_norm": 0.79296875, + "learning_rate": 0.00019107841622227688, + "loss": 0.9514, + "step": 8172 + }, + { + "epoch": 0.2098597222690296, + "grad_norm": 0.8046875, + "learning_rate": 0.00019107657293941156, + "loss": 0.8824, + "step": 8173 + }, + { + "epoch": 0.20988539946495144, + "grad_norm": 0.8125, + "learning_rate": 0.0001910747294750388, + "loss": 1.0957, + "step": 8174 + }, + { + "epoch": 0.20991107666087325, + "grad_norm": 0.73828125, + "learning_rate": 0.00019107288582916239, + "loss": 0.9548, + "step": 8175 + }, + { + "epoch": 0.20993675385679506, + "grad_norm": 0.80078125, + "learning_rate": 0.00019107104200178596, + "loss": 1.0483, + "step": 8176 + }, + { + "epoch": 0.2099624310527169, + "grad_norm": 1.09375, + "learning_rate": 0.00019106919799291312, + "loss": 1.047, + "step": 8177 + }, + { + "epoch": 0.2099881082486387, + "grad_norm": 0.8203125, + "learning_rate": 0.00019106735380254762, + "loss": 1.113, + "step": 8178 + }, + { + "epoch": 0.21001378544456054, + "grad_norm": 0.9453125, + "learning_rate": 0.00019106550943069313, + "loss": 1.0991, + "step": 8179 + }, + { + "epoch": 0.21003946264048234, + "grad_norm": 0.80078125, + "learning_rate": 0.00019106366487735328, + "loss": 0.955, + "step": 8180 + }, + { + "epoch": 0.21006513983640415, + "grad_norm": 0.77734375, + "learning_rate": 0.0001910618201425318, + "loss": 1.0214, + "step": 8181 + }, + { + "epoch": 0.210090817032326, + "grad_norm": 0.828125, + "learning_rate": 0.0001910599752262323, + "loss": 1.0913, + "step": 8182 + }, + { + "epoch": 0.2101164942282478, + "grad_norm": 0.9140625, + "learning_rate": 0.0001910581301284585, + "loss": 1.1212, + "step": 8183 + }, + { + "epoch": 0.21014217142416963, + "grad_norm": 0.75390625, + "learning_rate": 0.00019105628484921406, + "loss": 0.99, + "step": 8184 + }, + { + "epoch": 0.21016784862009144, + "grad_norm": 0.73046875, + "learning_rate": 0.0001910544393885027, + "loss": 0.7977, + "step": 8185 + }, + { + "epoch": 0.21019352581601325, + "grad_norm": 0.77734375, + "learning_rate": 0.00019105259374632808, + "loss": 1.0291, + "step": 8186 + }, + { + "epoch": 0.21021920301193509, + "grad_norm": 0.796875, + "learning_rate": 0.0001910507479226938, + "loss": 1.077, + "step": 8187 + }, + { + "epoch": 0.2102448802078569, + "grad_norm": 0.76171875, + "learning_rate": 0.00019104890191760368, + "loss": 1.001, + "step": 8188 + }, + { + "epoch": 0.21027055740377873, + "grad_norm": 0.796875, + "learning_rate": 0.0001910470557310613, + "loss": 0.9952, + "step": 8189 + }, + { + "epoch": 0.21029623459970054, + "grad_norm": 0.875, + "learning_rate": 0.00019104520936307035, + "loss": 1.0625, + "step": 8190 + }, + { + "epoch": 0.21032191179562235, + "grad_norm": 0.84765625, + "learning_rate": 0.0001910433628136345, + "loss": 1.1011, + "step": 8191 + }, + { + "epoch": 0.21034758899154418, + "grad_norm": 0.80078125, + "learning_rate": 0.00019104151608275748, + "loss": 0.9899, + "step": 8192 + }, + { + "epoch": 0.210373266187466, + "grad_norm": 0.8125, + "learning_rate": 0.00019103966917044296, + "loss": 1.0156, + "step": 8193 + }, + { + "epoch": 0.21039894338338783, + "grad_norm": 0.8359375, + "learning_rate": 0.00019103782207669458, + "loss": 1.1437, + "step": 8194 + }, + { + "epoch": 0.21042462057930963, + "grad_norm": 0.8046875, + "learning_rate": 0.00019103597480151603, + "loss": 1.0661, + "step": 8195 + }, + { + "epoch": 0.21045029777523144, + "grad_norm": 0.73046875, + "learning_rate": 0.00019103412734491104, + "loss": 1.0757, + "step": 8196 + }, + { + "epoch": 0.21047597497115328, + "grad_norm": 0.8671875, + "learning_rate": 0.00019103227970688322, + "loss": 0.9137, + "step": 8197 + }, + { + "epoch": 0.2105016521670751, + "grad_norm": 0.77734375, + "learning_rate": 0.0001910304318874363, + "loss": 1.207, + "step": 8198 + }, + { + "epoch": 0.21052732936299692, + "grad_norm": 0.80859375, + "learning_rate": 0.00019102858388657395, + "loss": 0.9845, + "step": 8199 + }, + { + "epoch": 0.21055300655891873, + "grad_norm": 0.79296875, + "learning_rate": 0.0001910267357042999, + "loss": 1.1136, + "step": 8200 + }, + { + "epoch": 0.21057868375484054, + "grad_norm": 0.86328125, + "learning_rate": 0.00019102488734061774, + "loss": 1.195, + "step": 8201 + }, + { + "epoch": 0.21060436095076238, + "grad_norm": 0.953125, + "learning_rate": 0.00019102303879553123, + "loss": 0.8757, + "step": 8202 + }, + { + "epoch": 0.21063003814668418, + "grad_norm": 0.8984375, + "learning_rate": 0.00019102119006904399, + "loss": 1.0022, + "step": 8203 + }, + { + "epoch": 0.21065571534260602, + "grad_norm": 0.80859375, + "learning_rate": 0.00019101934116115977, + "loss": 0.9648, + "step": 8204 + }, + { + "epoch": 0.21068139253852783, + "grad_norm": 0.8046875, + "learning_rate": 0.00019101749207188221, + "loss": 0.9186, + "step": 8205 + }, + { + "epoch": 0.21070706973444964, + "grad_norm": 0.78125, + "learning_rate": 0.00019101564280121503, + "loss": 1.0405, + "step": 8206 + }, + { + "epoch": 0.21073274693037147, + "grad_norm": 0.8046875, + "learning_rate": 0.0001910137933491619, + "loss": 1.0491, + "step": 8207 + }, + { + "epoch": 0.21075842412629328, + "grad_norm": 0.83203125, + "learning_rate": 0.00019101194371572646, + "loss": 0.9023, + "step": 8208 + }, + { + "epoch": 0.21078410132221512, + "grad_norm": 0.7421875, + "learning_rate": 0.00019101009390091244, + "loss": 0.9838, + "step": 8209 + }, + { + "epoch": 0.21080977851813693, + "grad_norm": 0.78125, + "learning_rate": 0.00019100824390472356, + "loss": 1.0019, + "step": 8210 + }, + { + "epoch": 0.21083545571405873, + "grad_norm": 0.70703125, + "learning_rate": 0.00019100639372716344, + "loss": 0.8686, + "step": 8211 + }, + { + "epoch": 0.21086113290998057, + "grad_norm": 0.828125, + "learning_rate": 0.00019100454336823584, + "loss": 1.0825, + "step": 8212 + }, + { + "epoch": 0.21088681010590238, + "grad_norm": 0.8515625, + "learning_rate": 0.00019100269282794433, + "loss": 0.9149, + "step": 8213 + }, + { + "epoch": 0.21091248730182421, + "grad_norm": 0.8203125, + "learning_rate": 0.00019100084210629275, + "loss": 1.0566, + "step": 8214 + }, + { + "epoch": 0.21093816449774602, + "grad_norm": 0.84375, + "learning_rate": 0.00019099899120328466, + "loss": 1.0499, + "step": 8215 + }, + { + "epoch": 0.21096384169366783, + "grad_norm": 0.80078125, + "learning_rate": 0.00019099714011892386, + "loss": 0.9924, + "step": 8216 + }, + { + "epoch": 0.21098951888958967, + "grad_norm": 0.74609375, + "learning_rate": 0.00019099528885321393, + "loss": 1.0475, + "step": 8217 + }, + { + "epoch": 0.21101519608551147, + "grad_norm": 0.796875, + "learning_rate": 0.0001909934374061586, + "loss": 1.0938, + "step": 8218 + }, + { + "epoch": 0.2110408732814333, + "grad_norm": 0.83203125, + "learning_rate": 0.00019099158577776158, + "loss": 1.1154, + "step": 8219 + }, + { + "epoch": 0.21106655047735512, + "grad_norm": 0.76171875, + "learning_rate": 0.00019098973396802657, + "loss": 0.8962, + "step": 8220 + }, + { + "epoch": 0.21109222767327693, + "grad_norm": 0.78515625, + "learning_rate": 0.00019098788197695723, + "loss": 1.0575, + "step": 8221 + }, + { + "epoch": 0.21111790486919876, + "grad_norm": 0.8359375, + "learning_rate": 0.00019098602980455724, + "loss": 1.1799, + "step": 8222 + }, + { + "epoch": 0.21114358206512057, + "grad_norm": 0.74609375, + "learning_rate": 0.00019098417745083034, + "loss": 1.0576, + "step": 8223 + }, + { + "epoch": 0.2111692592610424, + "grad_norm": 0.80859375, + "learning_rate": 0.00019098232491578018, + "loss": 1.188, + "step": 8224 + }, + { + "epoch": 0.21119493645696422, + "grad_norm": 0.71484375, + "learning_rate": 0.00019098047219941048, + "loss": 1.0117, + "step": 8225 + }, + { + "epoch": 0.21122061365288602, + "grad_norm": 0.79296875, + "learning_rate": 0.00019097861930172488, + "loss": 1.0158, + "step": 8226 + }, + { + "epoch": 0.21124629084880786, + "grad_norm": 0.8125, + "learning_rate": 0.00019097676622272715, + "loss": 1.0968, + "step": 8227 + }, + { + "epoch": 0.21127196804472967, + "grad_norm": 0.7734375, + "learning_rate": 0.00019097491296242095, + "loss": 0.902, + "step": 8228 + }, + { + "epoch": 0.2112976452406515, + "grad_norm": 0.86328125, + "learning_rate": 0.0001909730595208099, + "loss": 1.0233, + "step": 8229 + }, + { + "epoch": 0.2113233224365733, + "grad_norm": 0.7578125, + "learning_rate": 0.0001909712058978978, + "loss": 1.0726, + "step": 8230 + }, + { + "epoch": 0.21134899963249512, + "grad_norm": 0.7734375, + "learning_rate": 0.00019096935209368833, + "loss": 0.997, + "step": 8231 + }, + { + "epoch": 0.21137467682841696, + "grad_norm": 0.7734375, + "learning_rate": 0.00019096749810818513, + "loss": 0.9907, + "step": 8232 + }, + { + "epoch": 0.21140035402433877, + "grad_norm": 0.78125, + "learning_rate": 0.00019096564394139193, + "loss": 0.9467, + "step": 8233 + }, + { + "epoch": 0.2114260312202606, + "grad_norm": 0.83203125, + "learning_rate": 0.0001909637895933124, + "loss": 1.078, + "step": 8234 + }, + { + "epoch": 0.2114517084161824, + "grad_norm": 0.74609375, + "learning_rate": 0.0001909619350639503, + "loss": 0.9067, + "step": 8235 + }, + { + "epoch": 0.21147738561210422, + "grad_norm": 0.7890625, + "learning_rate": 0.00019096008035330924, + "loss": 1.0115, + "step": 8236 + }, + { + "epoch": 0.21150306280802605, + "grad_norm": 0.80859375, + "learning_rate": 0.00019095822546139296, + "loss": 1.0764, + "step": 8237 + }, + { + "epoch": 0.21152874000394786, + "grad_norm": 0.875, + "learning_rate": 0.00019095637038820518, + "loss": 1.0517, + "step": 8238 + }, + { + "epoch": 0.2115544171998697, + "grad_norm": 0.8046875, + "learning_rate": 0.00019095451513374953, + "loss": 1.027, + "step": 8239 + }, + { + "epoch": 0.2115800943957915, + "grad_norm": 0.8515625, + "learning_rate": 0.00019095265969802978, + "loss": 0.9968, + "step": 8240 + }, + { + "epoch": 0.21160577159171332, + "grad_norm": 0.84375, + "learning_rate": 0.00019095080408104958, + "loss": 1.0555, + "step": 8241 + }, + { + "epoch": 0.21163144878763515, + "grad_norm": 0.8359375, + "learning_rate": 0.00019094894828281263, + "loss": 1.0032, + "step": 8242 + }, + { + "epoch": 0.21165712598355696, + "grad_norm": 0.84375, + "learning_rate": 0.00019094709230332266, + "loss": 1.1045, + "step": 8243 + }, + { + "epoch": 0.2116828031794788, + "grad_norm": 0.7734375, + "learning_rate": 0.00019094523614258336, + "loss": 1.0081, + "step": 8244 + }, + { + "epoch": 0.2117084803754006, + "grad_norm": 0.73046875, + "learning_rate": 0.0001909433798005984, + "loss": 1.12, + "step": 8245 + }, + { + "epoch": 0.2117341575713224, + "grad_norm": 0.7421875, + "learning_rate": 0.00019094152327737152, + "loss": 0.8963, + "step": 8246 + }, + { + "epoch": 0.21175983476724425, + "grad_norm": 0.84765625, + "learning_rate": 0.00019093966657290635, + "loss": 1.1481, + "step": 8247 + }, + { + "epoch": 0.21178551196316606, + "grad_norm": 0.86328125, + "learning_rate": 0.0001909378096872067, + "loss": 1.0306, + "step": 8248 + }, + { + "epoch": 0.2118111891590879, + "grad_norm": 0.83984375, + "learning_rate": 0.00019093595262027616, + "loss": 1.1693, + "step": 8249 + }, + { + "epoch": 0.2118368663550097, + "grad_norm": 0.78125, + "learning_rate": 0.0001909340953721185, + "loss": 1.1214, + "step": 8250 + }, + { + "epoch": 0.2118625435509315, + "grad_norm": 0.8125, + "learning_rate": 0.0001909322379427374, + "loss": 0.9656, + "step": 8251 + }, + { + "epoch": 0.21188822074685335, + "grad_norm": 0.80859375, + "learning_rate": 0.0001909303803321366, + "loss": 0.982, + "step": 8252 + }, + { + "epoch": 0.21191389794277515, + "grad_norm": 0.78125, + "learning_rate": 0.00019092852254031973, + "loss": 1.1104, + "step": 8253 + }, + { + "epoch": 0.211939575138697, + "grad_norm": 0.7734375, + "learning_rate": 0.0001909266645672905, + "loss": 1.037, + "step": 8254 + }, + { + "epoch": 0.2119652523346188, + "grad_norm": 0.85546875, + "learning_rate": 0.0001909248064130527, + "loss": 1.085, + "step": 8255 + }, + { + "epoch": 0.2119909295305406, + "grad_norm": 0.77734375, + "learning_rate": 0.00019092294807760994, + "loss": 0.9214, + "step": 8256 + }, + { + "epoch": 0.21201660672646244, + "grad_norm": 0.828125, + "learning_rate": 0.00019092108956096597, + "loss": 1.1064, + "step": 8257 + }, + { + "epoch": 0.21204228392238425, + "grad_norm": 0.76953125, + "learning_rate": 0.00019091923086312447, + "loss": 0.9876, + "step": 8258 + }, + { + "epoch": 0.2120679611183061, + "grad_norm": 0.83203125, + "learning_rate": 0.00019091737198408918, + "loss": 1.0225, + "step": 8259 + }, + { + "epoch": 0.2120936383142279, + "grad_norm": 0.84765625, + "learning_rate": 0.00019091551292386373, + "loss": 1.151, + "step": 8260 + }, + { + "epoch": 0.2121193155101497, + "grad_norm": 0.79296875, + "learning_rate": 0.00019091365368245194, + "loss": 0.9231, + "step": 8261 + }, + { + "epoch": 0.21214499270607154, + "grad_norm": 0.82421875, + "learning_rate": 0.00019091179425985742, + "loss": 1.123, + "step": 8262 + }, + { + "epoch": 0.21217066990199335, + "grad_norm": 0.86328125, + "learning_rate": 0.00019090993465608388, + "loss": 0.9144, + "step": 8263 + }, + { + "epoch": 0.21219634709791518, + "grad_norm": 0.80859375, + "learning_rate": 0.00019090807487113506, + "loss": 0.9565, + "step": 8264 + }, + { + "epoch": 0.212222024293837, + "grad_norm": 0.76171875, + "learning_rate": 0.0001909062149050147, + "loss": 1.0345, + "step": 8265 + }, + { + "epoch": 0.2122477014897588, + "grad_norm": 0.828125, + "learning_rate": 0.00019090435475772644, + "loss": 1.029, + "step": 8266 + }, + { + "epoch": 0.21227337868568064, + "grad_norm": 0.7578125, + "learning_rate": 0.000190902494429274, + "loss": 1.1406, + "step": 8267 + }, + { + "epoch": 0.21229905588160244, + "grad_norm": 0.859375, + "learning_rate": 0.0001909006339196611, + "loss": 1.0519, + "step": 8268 + }, + { + "epoch": 0.21232473307752428, + "grad_norm": 0.828125, + "learning_rate": 0.00019089877322889148, + "loss": 1.039, + "step": 8269 + }, + { + "epoch": 0.2123504102734461, + "grad_norm": 0.87109375, + "learning_rate": 0.00019089691235696878, + "loss": 1.1068, + "step": 8270 + }, + { + "epoch": 0.2123760874693679, + "grad_norm": 0.8515625, + "learning_rate": 0.00019089505130389677, + "loss": 1.1281, + "step": 8271 + }, + { + "epoch": 0.21240176466528973, + "grad_norm": 0.80078125, + "learning_rate": 0.00019089319006967913, + "loss": 1.0747, + "step": 8272 + }, + { + "epoch": 0.21242744186121154, + "grad_norm": 0.78515625, + "learning_rate": 0.00019089132865431955, + "loss": 1.1592, + "step": 8273 + }, + { + "epoch": 0.21245311905713338, + "grad_norm": 0.87890625, + "learning_rate": 0.0001908894670578218, + "loss": 0.9923, + "step": 8274 + }, + { + "epoch": 0.21247879625305519, + "grad_norm": 0.78125, + "learning_rate": 0.0001908876052801895, + "loss": 0.9274, + "step": 8275 + }, + { + "epoch": 0.212504473448977, + "grad_norm": 0.91796875, + "learning_rate": 0.00019088574332142645, + "loss": 1.1325, + "step": 8276 + }, + { + "epoch": 0.21253015064489883, + "grad_norm": 0.84375, + "learning_rate": 0.00019088388118153633, + "loss": 0.9671, + "step": 8277 + }, + { + "epoch": 0.21255582784082064, + "grad_norm": 0.85546875, + "learning_rate": 0.00019088201886052284, + "loss": 0.9946, + "step": 8278 + }, + { + "epoch": 0.21258150503674247, + "grad_norm": 0.8515625, + "learning_rate": 0.00019088015635838966, + "loss": 1.1555, + "step": 8279 + }, + { + "epoch": 0.21260718223266428, + "grad_norm": 0.8359375, + "learning_rate": 0.00019087829367514059, + "loss": 1.0693, + "step": 8280 + }, + { + "epoch": 0.2126328594285861, + "grad_norm": 0.83203125, + "learning_rate": 0.00019087643081077923, + "loss": 1.1039, + "step": 8281 + }, + { + "epoch": 0.21265853662450793, + "grad_norm": 0.765625, + "learning_rate": 0.00019087456776530938, + "loss": 0.8985, + "step": 8282 + }, + { + "epoch": 0.21268421382042973, + "grad_norm": 0.82421875, + "learning_rate": 0.00019087270453873475, + "loss": 1.0391, + "step": 8283 + }, + { + "epoch": 0.21270989101635157, + "grad_norm": 0.859375, + "learning_rate": 0.000190870841131059, + "loss": 0.989, + "step": 8284 + }, + { + "epoch": 0.21273556821227338, + "grad_norm": 0.78515625, + "learning_rate": 0.00019086897754228588, + "loss": 1.0441, + "step": 8285 + }, + { + "epoch": 0.2127612454081952, + "grad_norm": 0.83984375, + "learning_rate": 0.0001908671137724191, + "loss": 1.2159, + "step": 8286 + }, + { + "epoch": 0.21278692260411702, + "grad_norm": 0.77734375, + "learning_rate": 0.00019086524982146237, + "loss": 1.1071, + "step": 8287 + }, + { + "epoch": 0.21281259980003883, + "grad_norm": 0.87890625, + "learning_rate": 0.00019086338568941939, + "loss": 0.9795, + "step": 8288 + }, + { + "epoch": 0.21283827699596067, + "grad_norm": 0.796875, + "learning_rate": 0.0001908615213762939, + "loss": 1.0153, + "step": 8289 + }, + { + "epoch": 0.21286395419188248, + "grad_norm": 0.84765625, + "learning_rate": 0.0001908596568820896, + "loss": 1.0673, + "step": 8290 + }, + { + "epoch": 0.21288963138780428, + "grad_norm": 0.80078125, + "learning_rate": 0.00019085779220681025, + "loss": 1.1081, + "step": 8291 + }, + { + "epoch": 0.21291530858372612, + "grad_norm": 0.734375, + "learning_rate": 0.00019085592735045947, + "loss": 0.9946, + "step": 8292 + }, + { + "epoch": 0.21294098577964793, + "grad_norm": 0.77734375, + "learning_rate": 0.00019085406231304105, + "loss": 1.1198, + "step": 8293 + }, + { + "epoch": 0.21296666297556976, + "grad_norm": 0.87890625, + "learning_rate": 0.0001908521970945587, + "loss": 1.0479, + "step": 8294 + }, + { + "epoch": 0.21299234017149157, + "grad_norm": 0.75, + "learning_rate": 0.00019085033169501613, + "loss": 1.0345, + "step": 8295 + }, + { + "epoch": 0.21301801736741338, + "grad_norm": 0.76171875, + "learning_rate": 0.00019084846611441706, + "loss": 0.9117, + "step": 8296 + }, + { + "epoch": 0.21304369456333522, + "grad_norm": 0.94140625, + "learning_rate": 0.0001908466003527652, + "loss": 1.0584, + "step": 8297 + }, + { + "epoch": 0.21306937175925703, + "grad_norm": 0.78125, + "learning_rate": 0.00019084473441006427, + "loss": 0.949, + "step": 8298 + }, + { + "epoch": 0.21309504895517886, + "grad_norm": 0.734375, + "learning_rate": 0.000190842868286318, + "loss": 0.974, + "step": 8299 + }, + { + "epoch": 0.21312072615110067, + "grad_norm": 0.82421875, + "learning_rate": 0.00019084100198153006, + "loss": 1.0155, + "step": 8300 + }, + { + "epoch": 0.21314640334702248, + "grad_norm": 0.8828125, + "learning_rate": 0.00019083913549570425, + "loss": 1.0305, + "step": 8301 + }, + { + "epoch": 0.21317208054294431, + "grad_norm": 1.0078125, + "learning_rate": 0.00019083726882884424, + "loss": 1.0463, + "step": 8302 + }, + { + "epoch": 0.21319775773886612, + "grad_norm": 0.86328125, + "learning_rate": 0.00019083540198095374, + "loss": 1.0815, + "step": 8303 + }, + { + "epoch": 0.21322343493478796, + "grad_norm": 0.8125, + "learning_rate": 0.0001908335349520365, + "loss": 1.083, + "step": 8304 + }, + { + "epoch": 0.21324911213070977, + "grad_norm": 0.875, + "learning_rate": 0.00019083166774209624, + "loss": 1.222, + "step": 8305 + }, + { + "epoch": 0.21327478932663158, + "grad_norm": 0.84375, + "learning_rate": 0.00019082980035113665, + "loss": 1.0517, + "step": 8306 + }, + { + "epoch": 0.2133004665225534, + "grad_norm": 1.0703125, + "learning_rate": 0.00019082793277916152, + "loss": 1.0597, + "step": 8307 + }, + { + "epoch": 0.21332614371847522, + "grad_norm": 0.8359375, + "learning_rate": 0.00019082606502617447, + "loss": 0.9656, + "step": 8308 + }, + { + "epoch": 0.21335182091439706, + "grad_norm": 0.7890625, + "learning_rate": 0.0001908241970921793, + "loss": 1.075, + "step": 8309 + }, + { + "epoch": 0.21337749811031886, + "grad_norm": 0.76953125, + "learning_rate": 0.0001908223289771797, + "loss": 1.0532, + "step": 8310 + }, + { + "epoch": 0.21340317530624067, + "grad_norm": 0.83984375, + "learning_rate": 0.0001908204606811794, + "loss": 1.0726, + "step": 8311 + }, + { + "epoch": 0.2134288525021625, + "grad_norm": 0.7890625, + "learning_rate": 0.00019081859220418214, + "loss": 1.0128, + "step": 8312 + }, + { + "epoch": 0.21345452969808432, + "grad_norm": 0.8046875, + "learning_rate": 0.0001908167235461916, + "loss": 1.0255, + "step": 8313 + }, + { + "epoch": 0.21348020689400615, + "grad_norm": 0.8046875, + "learning_rate": 0.0001908148547072116, + "loss": 0.9351, + "step": 8314 + }, + { + "epoch": 0.21350588408992796, + "grad_norm": 0.83203125, + "learning_rate": 0.00019081298568724572, + "loss": 1.0285, + "step": 8315 + }, + { + "epoch": 0.21353156128584977, + "grad_norm": 0.8046875, + "learning_rate": 0.0001908111164862978, + "loss": 0.992, + "step": 8316 + }, + { + "epoch": 0.2135572384817716, + "grad_norm": 0.79296875, + "learning_rate": 0.00019080924710437153, + "loss": 1.0216, + "step": 8317 + }, + { + "epoch": 0.2135829156776934, + "grad_norm": 0.84765625, + "learning_rate": 0.0001908073775414706, + "loss": 1.0959, + "step": 8318 + }, + { + "epoch": 0.21360859287361525, + "grad_norm": 0.796875, + "learning_rate": 0.00019080550779759884, + "loss": 1.0172, + "step": 8319 + }, + { + "epoch": 0.21363427006953706, + "grad_norm": 0.84765625, + "learning_rate": 0.00019080363787275984, + "loss": 1.1929, + "step": 8320 + }, + { + "epoch": 0.21365994726545887, + "grad_norm": 0.81640625, + "learning_rate": 0.00019080176776695745, + "loss": 1.0657, + "step": 8321 + }, + { + "epoch": 0.2136856244613807, + "grad_norm": 0.796875, + "learning_rate": 0.00019079989748019526, + "loss": 1.2408, + "step": 8322 + }, + { + "epoch": 0.2137113016573025, + "grad_norm": 0.87890625, + "learning_rate": 0.00019079802701247713, + "loss": 1.1078, + "step": 8323 + }, + { + "epoch": 0.21373697885322435, + "grad_norm": 0.79296875, + "learning_rate": 0.00019079615636380672, + "loss": 0.9788, + "step": 8324 + }, + { + "epoch": 0.21376265604914615, + "grad_norm": 0.87890625, + "learning_rate": 0.00019079428553418778, + "loss": 1.2283, + "step": 8325 + }, + { + "epoch": 0.21378833324506796, + "grad_norm": 0.83203125, + "learning_rate": 0.00019079241452362404, + "loss": 1.035, + "step": 8326 + }, + { + "epoch": 0.2138140104409898, + "grad_norm": 0.75390625, + "learning_rate": 0.00019079054333211919, + "loss": 1.0985, + "step": 8327 + }, + { + "epoch": 0.2138396876369116, + "grad_norm": 0.8046875, + "learning_rate": 0.00019078867195967703, + "loss": 1.0075, + "step": 8328 + }, + { + "epoch": 0.21386536483283344, + "grad_norm": 0.7890625, + "learning_rate": 0.0001907868004063012, + "loss": 1.1101, + "step": 8329 + }, + { + "epoch": 0.21389104202875525, + "grad_norm": 0.76171875, + "learning_rate": 0.0001907849286719955, + "loss": 0.9965, + "step": 8330 + }, + { + "epoch": 0.21391671922467706, + "grad_norm": 0.77734375, + "learning_rate": 0.00019078305675676363, + "loss": 1.0692, + "step": 8331 + }, + { + "epoch": 0.2139423964205989, + "grad_norm": 0.7734375, + "learning_rate": 0.00019078118466060934, + "loss": 1.0822, + "step": 8332 + }, + { + "epoch": 0.2139680736165207, + "grad_norm": 0.734375, + "learning_rate": 0.00019077931238353633, + "loss": 1.0139, + "step": 8333 + }, + { + "epoch": 0.21399375081244254, + "grad_norm": 0.79296875, + "learning_rate": 0.00019077743992554838, + "loss": 1.0718, + "step": 8334 + }, + { + "epoch": 0.21401942800836435, + "grad_norm": 0.84765625, + "learning_rate": 0.00019077556728664918, + "loss": 1.0062, + "step": 8335 + }, + { + "epoch": 0.21404510520428616, + "grad_norm": 0.81640625, + "learning_rate": 0.00019077369446684247, + "loss": 0.9238, + "step": 8336 + }, + { + "epoch": 0.214070782400208, + "grad_norm": 0.98046875, + "learning_rate": 0.00019077182146613197, + "loss": 1.1277, + "step": 8337 + }, + { + "epoch": 0.2140964595961298, + "grad_norm": 0.85546875, + "learning_rate": 0.00019076994828452147, + "loss": 1.0023, + "step": 8338 + }, + { + "epoch": 0.21412213679205164, + "grad_norm": 0.859375, + "learning_rate": 0.0001907680749220146, + "loss": 1.1076, + "step": 8339 + }, + { + "epoch": 0.21414781398797345, + "grad_norm": 0.79296875, + "learning_rate": 0.0001907662013786152, + "loss": 1.0647, + "step": 8340 + }, + { + "epoch": 0.21417349118389525, + "grad_norm": 0.9375, + "learning_rate": 0.00019076432765432696, + "loss": 1.0079, + "step": 8341 + }, + { + "epoch": 0.2141991683798171, + "grad_norm": 1.0390625, + "learning_rate": 0.0001907624537491536, + "loss": 1.1754, + "step": 8342 + }, + { + "epoch": 0.2142248455757389, + "grad_norm": 0.75390625, + "learning_rate": 0.0001907605796630989, + "loss": 0.9353, + "step": 8343 + }, + { + "epoch": 0.21425052277166073, + "grad_norm": 0.85546875, + "learning_rate": 0.00019075870539616655, + "loss": 1.0688, + "step": 8344 + }, + { + "epoch": 0.21427619996758254, + "grad_norm": 0.828125, + "learning_rate": 0.00019075683094836026, + "loss": 1.1561, + "step": 8345 + }, + { + "epoch": 0.21430187716350435, + "grad_norm": 0.7734375, + "learning_rate": 0.00019075495631968383, + "loss": 0.9339, + "step": 8346 + }, + { + "epoch": 0.2143275543594262, + "grad_norm": 0.8046875, + "learning_rate": 0.00019075308151014097, + "loss": 0.9911, + "step": 8347 + }, + { + "epoch": 0.214353231555348, + "grad_norm": 0.8359375, + "learning_rate": 0.00019075120651973544, + "loss": 1.009, + "step": 8348 + }, + { + "epoch": 0.21437890875126983, + "grad_norm": 0.76171875, + "learning_rate": 0.00019074933134847092, + "loss": 0.9105, + "step": 8349 + }, + { + "epoch": 0.21440458594719164, + "grad_norm": 0.8203125, + "learning_rate": 0.00019074745599635117, + "loss": 1.0604, + "step": 8350 + }, + { + "epoch": 0.21443026314311345, + "grad_norm": 0.8203125, + "learning_rate": 0.00019074558046337998, + "loss": 0.8888, + "step": 8351 + }, + { + "epoch": 0.21445594033903528, + "grad_norm": 0.77734375, + "learning_rate": 0.000190743704749561, + "loss": 0.9389, + "step": 8352 + }, + { + "epoch": 0.2144816175349571, + "grad_norm": 0.8125, + "learning_rate": 0.00019074182885489804, + "loss": 1.1038, + "step": 8353 + }, + { + "epoch": 0.21450729473087893, + "grad_norm": 0.92578125, + "learning_rate": 0.00019073995277939481, + "loss": 1.1426, + "step": 8354 + }, + { + "epoch": 0.21453297192680074, + "grad_norm": 0.765625, + "learning_rate": 0.00019073807652305504, + "loss": 0.878, + "step": 8355 + }, + { + "epoch": 0.21455864912272254, + "grad_norm": 0.8125, + "learning_rate": 0.0001907362000858825, + "loss": 0.9894, + "step": 8356 + }, + { + "epoch": 0.21458432631864438, + "grad_norm": 0.79296875, + "learning_rate": 0.0001907343234678809, + "loss": 0.984, + "step": 8357 + }, + { + "epoch": 0.2146100035145662, + "grad_norm": 0.8203125, + "learning_rate": 0.00019073244666905397, + "loss": 1.0084, + "step": 8358 + }, + { + "epoch": 0.21463568071048802, + "grad_norm": 0.77734375, + "learning_rate": 0.0001907305696894055, + "loss": 0.8917, + "step": 8359 + }, + { + "epoch": 0.21466135790640983, + "grad_norm": 0.78515625, + "learning_rate": 0.00019072869252893918, + "loss": 1.0646, + "step": 8360 + }, + { + "epoch": 0.21468703510233164, + "grad_norm": 0.80859375, + "learning_rate": 0.00019072681518765878, + "loss": 1.0925, + "step": 8361 + }, + { + "epoch": 0.21471271229825348, + "grad_norm": 0.76171875, + "learning_rate": 0.00019072493766556802, + "loss": 1.0, + "step": 8362 + }, + { + "epoch": 0.21473838949417529, + "grad_norm": 1.0, + "learning_rate": 0.0001907230599626707, + "loss": 1.08, + "step": 8363 + }, + { + "epoch": 0.21476406669009712, + "grad_norm": 0.828125, + "learning_rate": 0.00019072118207897045, + "loss": 0.9908, + "step": 8364 + }, + { + "epoch": 0.21478974388601893, + "grad_norm": 0.87109375, + "learning_rate": 0.00019071930401447112, + "loss": 1.1384, + "step": 8365 + }, + { + "epoch": 0.21481542108194074, + "grad_norm": 0.90625, + "learning_rate": 0.0001907174257691764, + "loss": 1.0778, + "step": 8366 + }, + { + "epoch": 0.21484109827786257, + "grad_norm": 1.3515625, + "learning_rate": 0.00019071554734309003, + "loss": 1.0624, + "step": 8367 + }, + { + "epoch": 0.21486677547378438, + "grad_norm": 0.76171875, + "learning_rate": 0.00019071366873621578, + "loss": 0.9289, + "step": 8368 + }, + { + "epoch": 0.2148924526697062, + "grad_norm": 0.81640625, + "learning_rate": 0.0001907117899485574, + "loss": 0.8671, + "step": 8369 + }, + { + "epoch": 0.21491812986562803, + "grad_norm": 0.85546875, + "learning_rate": 0.0001907099109801186, + "loss": 1.0526, + "step": 8370 + }, + { + "epoch": 0.21494380706154984, + "grad_norm": 0.75, + "learning_rate": 0.00019070803183090317, + "loss": 0.9325, + "step": 8371 + }, + { + "epoch": 0.21496948425747167, + "grad_norm": 0.77734375, + "learning_rate": 0.0001907061525009148, + "loss": 0.9369, + "step": 8372 + }, + { + "epoch": 0.21499516145339348, + "grad_norm": 0.78125, + "learning_rate": 0.00019070427299015726, + "loss": 0.994, + "step": 8373 + }, + { + "epoch": 0.2150208386493153, + "grad_norm": 0.7890625, + "learning_rate": 0.00019070239329863428, + "loss": 1.1153, + "step": 8374 + }, + { + "epoch": 0.21504651584523712, + "grad_norm": 0.8125, + "learning_rate": 0.00019070051342634964, + "loss": 0.9584, + "step": 8375 + }, + { + "epoch": 0.21507219304115893, + "grad_norm": 0.7890625, + "learning_rate": 0.00019069863337330709, + "loss": 0.9682, + "step": 8376 + }, + { + "epoch": 0.21509787023708077, + "grad_norm": 0.8671875, + "learning_rate": 0.00019069675313951033, + "loss": 1.044, + "step": 8377 + }, + { + "epoch": 0.21512354743300258, + "grad_norm": 0.828125, + "learning_rate": 0.00019069487272496317, + "loss": 1.113, + "step": 8378 + }, + { + "epoch": 0.21514922462892438, + "grad_norm": 0.87890625, + "learning_rate": 0.00019069299212966926, + "loss": 1.0959, + "step": 8379 + }, + { + "epoch": 0.21517490182484622, + "grad_norm": 0.81640625, + "learning_rate": 0.00019069111135363245, + "loss": 0.8998, + "step": 8380 + }, + { + "epoch": 0.21520057902076803, + "grad_norm": 0.796875, + "learning_rate": 0.00019068923039685645, + "loss": 0.9413, + "step": 8381 + }, + { + "epoch": 0.21522625621668987, + "grad_norm": 0.8203125, + "learning_rate": 0.00019068734925934498, + "loss": 1.0699, + "step": 8382 + }, + { + "epoch": 0.21525193341261167, + "grad_norm": 0.80859375, + "learning_rate": 0.00019068546794110184, + "loss": 1.1936, + "step": 8383 + }, + { + "epoch": 0.21527761060853348, + "grad_norm": 0.83984375, + "learning_rate": 0.00019068358644213075, + "loss": 0.9758, + "step": 8384 + }, + { + "epoch": 0.21530328780445532, + "grad_norm": 0.78515625, + "learning_rate": 0.00019068170476243545, + "loss": 1.0209, + "step": 8385 + }, + { + "epoch": 0.21532896500037713, + "grad_norm": 0.76953125, + "learning_rate": 0.00019067982290201974, + "loss": 1.0956, + "step": 8386 + }, + { + "epoch": 0.21535464219629896, + "grad_norm": 0.78125, + "learning_rate": 0.0001906779408608873, + "loss": 0.9688, + "step": 8387 + }, + { + "epoch": 0.21538031939222077, + "grad_norm": 0.80078125, + "learning_rate": 0.0001906760586390419, + "loss": 1.1504, + "step": 8388 + }, + { + "epoch": 0.21540599658814258, + "grad_norm": 0.7265625, + "learning_rate": 0.00019067417623648734, + "loss": 1.0594, + "step": 8389 + }, + { + "epoch": 0.21543167378406441, + "grad_norm": 0.76953125, + "learning_rate": 0.00019067229365322732, + "loss": 0.9089, + "step": 8390 + }, + { + "epoch": 0.21545735097998622, + "grad_norm": 0.83984375, + "learning_rate": 0.0001906704108892656, + "loss": 1.0942, + "step": 8391 + }, + { + "epoch": 0.21548302817590806, + "grad_norm": 0.88671875, + "learning_rate": 0.00019066852794460596, + "loss": 1.2153, + "step": 8392 + }, + { + "epoch": 0.21550870537182987, + "grad_norm": 0.796875, + "learning_rate": 0.00019066664481925215, + "loss": 0.926, + "step": 8393 + }, + { + "epoch": 0.21553438256775168, + "grad_norm": 0.8125, + "learning_rate": 0.00019066476151320785, + "loss": 1.2066, + "step": 8394 + }, + { + "epoch": 0.2155600597636735, + "grad_norm": 0.8515625, + "learning_rate": 0.00019066287802647694, + "loss": 1.0359, + "step": 8395 + }, + { + "epoch": 0.21558573695959532, + "grad_norm": 0.77734375, + "learning_rate": 0.00019066099435906307, + "loss": 1.1326, + "step": 8396 + }, + { + "epoch": 0.21561141415551716, + "grad_norm": 0.89453125, + "learning_rate": 0.00019065911051097, + "loss": 1.0277, + "step": 8397 + }, + { + "epoch": 0.21563709135143896, + "grad_norm": 0.80859375, + "learning_rate": 0.00019065722648220154, + "loss": 0.9959, + "step": 8398 + }, + { + "epoch": 0.21566276854736077, + "grad_norm": 0.80859375, + "learning_rate": 0.0001906553422727614, + "loss": 1.0235, + "step": 8399 + }, + { + "epoch": 0.2156884457432826, + "grad_norm": 0.8671875, + "learning_rate": 0.0001906534578826534, + "loss": 0.9975, + "step": 8400 + }, + { + "epoch": 0.21571412293920442, + "grad_norm": 0.75390625, + "learning_rate": 0.0001906515733118812, + "loss": 0.9492, + "step": 8401 + }, + { + "epoch": 0.21573980013512625, + "grad_norm": 0.87109375, + "learning_rate": 0.0001906496885604486, + "loss": 1.0301, + "step": 8402 + }, + { + "epoch": 0.21576547733104806, + "grad_norm": 0.7421875, + "learning_rate": 0.0001906478036283594, + "loss": 1.0049, + "step": 8403 + }, + { + "epoch": 0.21579115452696987, + "grad_norm": 0.85546875, + "learning_rate": 0.00019064591851561727, + "loss": 0.921, + "step": 8404 + }, + { + "epoch": 0.2158168317228917, + "grad_norm": 0.7734375, + "learning_rate": 0.00019064403322222604, + "loss": 0.9262, + "step": 8405 + }, + { + "epoch": 0.2158425089188135, + "grad_norm": 0.859375, + "learning_rate": 0.00019064214774818945, + "loss": 1.0445, + "step": 8406 + }, + { + "epoch": 0.21586818611473535, + "grad_norm": 0.81640625, + "learning_rate": 0.00019064026209351123, + "loss": 1.0586, + "step": 8407 + }, + { + "epoch": 0.21589386331065716, + "grad_norm": 0.82421875, + "learning_rate": 0.00019063837625819515, + "loss": 1.0784, + "step": 8408 + }, + { + "epoch": 0.21591954050657897, + "grad_norm": 0.7890625, + "learning_rate": 0.00019063649024224496, + "loss": 1.0711, + "step": 8409 + }, + { + "epoch": 0.2159452177025008, + "grad_norm": 0.828125, + "learning_rate": 0.00019063460404566447, + "loss": 1.0418, + "step": 8410 + }, + { + "epoch": 0.2159708948984226, + "grad_norm": 0.75390625, + "learning_rate": 0.0001906327176684574, + "loss": 0.9929, + "step": 8411 + }, + { + "epoch": 0.21599657209434445, + "grad_norm": 0.78125, + "learning_rate": 0.00019063083111062746, + "loss": 1.1043, + "step": 8412 + }, + { + "epoch": 0.21602224929026625, + "grad_norm": 0.8203125, + "learning_rate": 0.0001906289443721785, + "loss": 1.189, + "step": 8413 + }, + { + "epoch": 0.21604792648618806, + "grad_norm": 0.7890625, + "learning_rate": 0.00019062705745311425, + "loss": 1.1143, + "step": 8414 + }, + { + "epoch": 0.2160736036821099, + "grad_norm": 0.7734375, + "learning_rate": 0.00019062517035343843, + "loss": 1.0437, + "step": 8415 + }, + { + "epoch": 0.2160992808780317, + "grad_norm": 0.84375, + "learning_rate": 0.00019062328307315485, + "loss": 1.1038, + "step": 8416 + }, + { + "epoch": 0.21612495807395354, + "grad_norm": 0.75390625, + "learning_rate": 0.00019062139561226723, + "loss": 1.0625, + "step": 8417 + }, + { + "epoch": 0.21615063526987535, + "grad_norm": 0.72265625, + "learning_rate": 0.0001906195079707794, + "loss": 0.8922, + "step": 8418 + }, + { + "epoch": 0.21617631246579716, + "grad_norm": 0.83203125, + "learning_rate": 0.00019061762014869503, + "loss": 0.8703, + "step": 8419 + }, + { + "epoch": 0.216201989661719, + "grad_norm": 0.8359375, + "learning_rate": 0.00019061573214601796, + "loss": 1.084, + "step": 8420 + }, + { + "epoch": 0.2162276668576408, + "grad_norm": 0.81640625, + "learning_rate": 0.0001906138439627519, + "loss": 1.0382, + "step": 8421 + }, + { + "epoch": 0.21625334405356264, + "grad_norm": 0.765625, + "learning_rate": 0.00019061195559890064, + "loss": 0.9166, + "step": 8422 + }, + { + "epoch": 0.21627902124948445, + "grad_norm": 0.80078125, + "learning_rate": 0.00019061006705446793, + "loss": 1.0975, + "step": 8423 + }, + { + "epoch": 0.21630469844540626, + "grad_norm": 0.88671875, + "learning_rate": 0.00019060817832945755, + "loss": 1.173, + "step": 8424 + }, + { + "epoch": 0.2163303756413281, + "grad_norm": 0.81640625, + "learning_rate": 0.00019060628942387328, + "loss": 1.1713, + "step": 8425 + }, + { + "epoch": 0.2163560528372499, + "grad_norm": 0.76953125, + "learning_rate": 0.00019060440033771882, + "loss": 1.13, + "step": 8426 + }, + { + "epoch": 0.21638173003317174, + "grad_norm": 0.85546875, + "learning_rate": 0.000190602511070998, + "loss": 1.0435, + "step": 8427 + }, + { + "epoch": 0.21640740722909355, + "grad_norm": 0.75390625, + "learning_rate": 0.00019060062162371455, + "loss": 0.9706, + "step": 8428 + }, + { + "epoch": 0.21643308442501535, + "grad_norm": 0.81640625, + "learning_rate": 0.00019059873199587225, + "loss": 1.1103, + "step": 8429 + }, + { + "epoch": 0.2164587616209372, + "grad_norm": 0.78515625, + "learning_rate": 0.00019059684218747486, + "loss": 0.993, + "step": 8430 + }, + { + "epoch": 0.216484438816859, + "grad_norm": 0.7890625, + "learning_rate": 0.00019059495219852615, + "loss": 0.9618, + "step": 8431 + }, + { + "epoch": 0.21651011601278083, + "grad_norm": 0.75, + "learning_rate": 0.00019059306202902986, + "loss": 1.1334, + "step": 8432 + }, + { + "epoch": 0.21653579320870264, + "grad_norm": 0.90625, + "learning_rate": 0.00019059117167898978, + "loss": 1.1325, + "step": 8433 + }, + { + "epoch": 0.21656147040462445, + "grad_norm": 0.79296875, + "learning_rate": 0.00019058928114840972, + "loss": 1.0129, + "step": 8434 + }, + { + "epoch": 0.2165871476005463, + "grad_norm": 0.75390625, + "learning_rate": 0.00019058739043729337, + "loss": 1.0255, + "step": 8435 + }, + { + "epoch": 0.2166128247964681, + "grad_norm": 0.84375, + "learning_rate": 0.00019058549954564455, + "loss": 1.0149, + "step": 8436 + }, + { + "epoch": 0.21663850199238993, + "grad_norm": 0.859375, + "learning_rate": 0.00019058360847346698, + "loss": 1.1038, + "step": 8437 + }, + { + "epoch": 0.21666417918831174, + "grad_norm": 0.8203125, + "learning_rate": 0.0001905817172207645, + "loss": 0.9753, + "step": 8438 + }, + { + "epoch": 0.21668985638423355, + "grad_norm": 0.828125, + "learning_rate": 0.00019057982578754085, + "loss": 0.992, + "step": 8439 + }, + { + "epoch": 0.21671553358015538, + "grad_norm": 0.8515625, + "learning_rate": 0.00019057793417379975, + "loss": 1.0559, + "step": 8440 + }, + { + "epoch": 0.2167412107760772, + "grad_norm": 0.796875, + "learning_rate": 0.00019057604237954502, + "loss": 1.0568, + "step": 8441 + }, + { + "epoch": 0.21676688797199903, + "grad_norm": 0.953125, + "learning_rate": 0.00019057415040478045, + "loss": 1.0925, + "step": 8442 + }, + { + "epoch": 0.21679256516792084, + "grad_norm": 0.82421875, + "learning_rate": 0.00019057225824950974, + "loss": 1.037, + "step": 8443 + }, + { + "epoch": 0.21681824236384264, + "grad_norm": 0.71484375, + "learning_rate": 0.00019057036591373672, + "loss": 0.9142, + "step": 8444 + }, + { + "epoch": 0.21684391955976448, + "grad_norm": 0.83984375, + "learning_rate": 0.00019056847339746512, + "loss": 1.2133, + "step": 8445 + }, + { + "epoch": 0.2168695967556863, + "grad_norm": 0.73828125, + "learning_rate": 0.00019056658070069874, + "loss": 0.8706, + "step": 8446 + }, + { + "epoch": 0.21689527395160813, + "grad_norm": 0.7734375, + "learning_rate": 0.00019056468782344138, + "loss": 0.8823, + "step": 8447 + }, + { + "epoch": 0.21692095114752993, + "grad_norm": 0.8828125, + "learning_rate": 0.00019056279476569677, + "loss": 0.9994, + "step": 8448 + }, + { + "epoch": 0.21694662834345174, + "grad_norm": 0.8359375, + "learning_rate": 0.00019056090152746866, + "loss": 1.0318, + "step": 8449 + }, + { + "epoch": 0.21697230553937358, + "grad_norm": 0.8046875, + "learning_rate": 0.00019055900810876087, + "loss": 1.0697, + "step": 8450 + }, + { + "epoch": 0.21699798273529539, + "grad_norm": 0.8515625, + "learning_rate": 0.00019055711450957717, + "loss": 1.0051, + "step": 8451 + }, + { + "epoch": 0.21702365993121722, + "grad_norm": 0.77734375, + "learning_rate": 0.0001905552207299213, + "loss": 1.037, + "step": 8452 + }, + { + "epoch": 0.21704933712713903, + "grad_norm": 0.84375, + "learning_rate": 0.00019055332676979706, + "loss": 0.9907, + "step": 8453 + }, + { + "epoch": 0.21707501432306084, + "grad_norm": 0.796875, + "learning_rate": 0.00019055143262920821, + "loss": 1.0429, + "step": 8454 + }, + { + "epoch": 0.21710069151898267, + "grad_norm": 0.74609375, + "learning_rate": 0.00019054953830815853, + "loss": 0.8957, + "step": 8455 + }, + { + "epoch": 0.21712636871490448, + "grad_norm": 0.80078125, + "learning_rate": 0.00019054764380665182, + "loss": 1.0461, + "step": 8456 + }, + { + "epoch": 0.21715204591082632, + "grad_norm": 0.85546875, + "learning_rate": 0.00019054574912469183, + "loss": 1.125, + "step": 8457 + }, + { + "epoch": 0.21717772310674813, + "grad_norm": 0.89453125, + "learning_rate": 0.00019054385426228233, + "loss": 1.0125, + "step": 8458 + }, + { + "epoch": 0.21720340030266994, + "grad_norm": 0.76953125, + "learning_rate": 0.00019054195921942713, + "loss": 0.9712, + "step": 8459 + }, + { + "epoch": 0.21722907749859177, + "grad_norm": 0.81640625, + "learning_rate": 0.00019054006399612998, + "loss": 1.1127, + "step": 8460 + }, + { + "epoch": 0.21725475469451358, + "grad_norm": 0.81640625, + "learning_rate": 0.00019053816859239463, + "loss": 1.0928, + "step": 8461 + }, + { + "epoch": 0.21728043189043542, + "grad_norm": 0.82421875, + "learning_rate": 0.0001905362730082249, + "loss": 1.0313, + "step": 8462 + }, + { + "epoch": 0.21730610908635722, + "grad_norm": 0.796875, + "learning_rate": 0.0001905343772436246, + "loss": 0.9083, + "step": 8463 + }, + { + "epoch": 0.21733178628227903, + "grad_norm": 0.9140625, + "learning_rate": 0.0001905324812985974, + "loss": 1.0126, + "step": 8464 + }, + { + "epoch": 0.21735746347820087, + "grad_norm": 0.79296875, + "learning_rate": 0.0001905305851731472, + "loss": 0.9812, + "step": 8465 + }, + { + "epoch": 0.21738314067412268, + "grad_norm": 0.87890625, + "learning_rate": 0.00019052868886727768, + "loss": 1.232, + "step": 8466 + }, + { + "epoch": 0.2174088178700445, + "grad_norm": 0.80078125, + "learning_rate": 0.00019052679238099266, + "loss": 0.9925, + "step": 8467 + }, + { + "epoch": 0.21743449506596632, + "grad_norm": 0.76953125, + "learning_rate": 0.00019052489571429594, + "loss": 0.8642, + "step": 8468 + }, + { + "epoch": 0.21746017226188813, + "grad_norm": 0.80859375, + "learning_rate": 0.00019052299886719126, + "loss": 1.0588, + "step": 8469 + }, + { + "epoch": 0.21748584945780997, + "grad_norm": 0.81640625, + "learning_rate": 0.00019052110183968247, + "loss": 0.9162, + "step": 8470 + }, + { + "epoch": 0.21751152665373177, + "grad_norm": 0.84765625, + "learning_rate": 0.00019051920463177325, + "loss": 0.8968, + "step": 8471 + }, + { + "epoch": 0.2175372038496536, + "grad_norm": 1.3984375, + "learning_rate": 0.00019051730724346745, + "loss": 1.1884, + "step": 8472 + }, + { + "epoch": 0.21756288104557542, + "grad_norm": 0.80859375, + "learning_rate": 0.00019051540967476881, + "loss": 1.2066, + "step": 8473 + }, + { + "epoch": 0.21758855824149723, + "grad_norm": 0.734375, + "learning_rate": 0.00019051351192568116, + "loss": 0.9694, + "step": 8474 + }, + { + "epoch": 0.21761423543741906, + "grad_norm": 0.7734375, + "learning_rate": 0.00019051161399620827, + "loss": 1.1368, + "step": 8475 + }, + { + "epoch": 0.21763991263334087, + "grad_norm": 0.75390625, + "learning_rate": 0.0001905097158863539, + "loss": 0.9308, + "step": 8476 + }, + { + "epoch": 0.2176655898292627, + "grad_norm": 0.796875, + "learning_rate": 0.00019050781759612182, + "loss": 1.0181, + "step": 8477 + }, + { + "epoch": 0.21769126702518451, + "grad_norm": 0.87109375, + "learning_rate": 0.00019050591912551584, + "loss": 0.8571, + "step": 8478 + }, + { + "epoch": 0.21771694422110632, + "grad_norm": 0.84765625, + "learning_rate": 0.00019050402047453977, + "loss": 1.132, + "step": 8479 + }, + { + "epoch": 0.21774262141702816, + "grad_norm": 0.9296875, + "learning_rate": 0.00019050212164319733, + "loss": 1.0015, + "step": 8480 + }, + { + "epoch": 0.21776829861294997, + "grad_norm": 0.83203125, + "learning_rate": 0.00019050022263149234, + "loss": 0.9407, + "step": 8481 + }, + { + "epoch": 0.2177939758088718, + "grad_norm": 0.8203125, + "learning_rate": 0.0001904983234394286, + "loss": 1.0263, + "step": 8482 + }, + { + "epoch": 0.2178196530047936, + "grad_norm": 0.7734375, + "learning_rate": 0.00019049642406700985, + "loss": 1.1486, + "step": 8483 + }, + { + "epoch": 0.21784533020071542, + "grad_norm": 0.8515625, + "learning_rate": 0.0001904945245142399, + "loss": 1.0772, + "step": 8484 + }, + { + "epoch": 0.21787100739663726, + "grad_norm": 0.78125, + "learning_rate": 0.00019049262478112254, + "loss": 1.0529, + "step": 8485 + }, + { + "epoch": 0.21789668459255906, + "grad_norm": 0.8046875, + "learning_rate": 0.0001904907248676616, + "loss": 1.0163, + "step": 8486 + }, + { + "epoch": 0.2179223617884809, + "grad_norm": 0.8046875, + "learning_rate": 0.00019048882477386078, + "loss": 0.8913, + "step": 8487 + }, + { + "epoch": 0.2179480389844027, + "grad_norm": 0.81640625, + "learning_rate": 0.0001904869244997239, + "loss": 1.0282, + "step": 8488 + }, + { + "epoch": 0.21797371618032452, + "grad_norm": 0.796875, + "learning_rate": 0.00019048502404525478, + "loss": 1.0365, + "step": 8489 + }, + { + "epoch": 0.21799939337624635, + "grad_norm": 0.8203125, + "learning_rate": 0.00019048312341045715, + "loss": 0.8751, + "step": 8490 + }, + { + "epoch": 0.21802507057216816, + "grad_norm": 0.8203125, + "learning_rate": 0.00019048122259533482, + "loss": 1.0291, + "step": 8491 + }, + { + "epoch": 0.21805074776809, + "grad_norm": 0.765625, + "learning_rate": 0.00019047932159989165, + "loss": 1.0298, + "step": 8492 + }, + { + "epoch": 0.2180764249640118, + "grad_norm": 0.9140625, + "learning_rate": 0.0001904774204241313, + "loss": 0.9829, + "step": 8493 + }, + { + "epoch": 0.21810210215993361, + "grad_norm": 0.85546875, + "learning_rate": 0.00019047551906805764, + "loss": 1.046, + "step": 8494 + }, + { + "epoch": 0.21812777935585545, + "grad_norm": 0.83984375, + "learning_rate": 0.00019047361753167445, + "loss": 1.1415, + "step": 8495 + }, + { + "epoch": 0.21815345655177726, + "grad_norm": 0.8203125, + "learning_rate": 0.00019047171581498552, + "loss": 1.108, + "step": 8496 + }, + { + "epoch": 0.2181791337476991, + "grad_norm": 0.82421875, + "learning_rate": 0.0001904698139179946, + "loss": 1.0876, + "step": 8497 + }, + { + "epoch": 0.2182048109436209, + "grad_norm": 0.8203125, + "learning_rate": 0.00019046791184070555, + "loss": 1.0449, + "step": 8498 + }, + { + "epoch": 0.2182304881395427, + "grad_norm": 0.8046875, + "learning_rate": 0.0001904660095831221, + "loss": 1.0745, + "step": 8499 + }, + { + "epoch": 0.21825616533546455, + "grad_norm": 0.83203125, + "learning_rate": 0.0001904641071452481, + "loss": 1.0383, + "step": 8500 + }, + { + "epoch": 0.21828184253138636, + "grad_norm": 0.73046875, + "learning_rate": 0.00019046220452708726, + "loss": 0.8797, + "step": 8501 + }, + { + "epoch": 0.2183075197273082, + "grad_norm": 0.80859375, + "learning_rate": 0.00019046030172864344, + "loss": 1.0236, + "step": 8502 + }, + { + "epoch": 0.21833319692323, + "grad_norm": 0.85546875, + "learning_rate": 0.00019045839874992044, + "loss": 1.0703, + "step": 8503 + }, + { + "epoch": 0.2183588741191518, + "grad_norm": 0.83984375, + "learning_rate": 0.00019045649559092199, + "loss": 1.0898, + "step": 8504 + }, + { + "epoch": 0.21838455131507364, + "grad_norm": 0.77734375, + "learning_rate": 0.00019045459225165189, + "loss": 0.8921, + "step": 8505 + }, + { + "epoch": 0.21841022851099545, + "grad_norm": 0.91796875, + "learning_rate": 0.00019045268873211398, + "loss": 0.9458, + "step": 8506 + }, + { + "epoch": 0.2184359057069173, + "grad_norm": 0.77734375, + "learning_rate": 0.00019045078503231207, + "loss": 1.1165, + "step": 8507 + }, + { + "epoch": 0.2184615829028391, + "grad_norm": 0.78125, + "learning_rate": 0.00019044888115224987, + "loss": 1.2112, + "step": 8508 + }, + { + "epoch": 0.2184872600987609, + "grad_norm": 0.78515625, + "learning_rate": 0.00019044697709193122, + "loss": 1.0611, + "step": 8509 + }, + { + "epoch": 0.21851293729468274, + "grad_norm": 0.84765625, + "learning_rate": 0.00019044507285135995, + "loss": 1.0463, + "step": 8510 + }, + { + "epoch": 0.21853861449060455, + "grad_norm": 0.7421875, + "learning_rate": 0.0001904431684305398, + "loss": 0.9371, + "step": 8511 + }, + { + "epoch": 0.21856429168652639, + "grad_norm": 0.84375, + "learning_rate": 0.00019044126382947459, + "loss": 1.009, + "step": 8512 + }, + { + "epoch": 0.2185899688824482, + "grad_norm": 0.80078125, + "learning_rate": 0.0001904393590481681, + "loss": 0.9287, + "step": 8513 + }, + { + "epoch": 0.21861564607837, + "grad_norm": 0.8046875, + "learning_rate": 0.00019043745408662416, + "loss": 1.1097, + "step": 8514 + }, + { + "epoch": 0.21864132327429184, + "grad_norm": 0.83203125, + "learning_rate": 0.0001904355489448465, + "loss": 0.9413, + "step": 8515 + }, + { + "epoch": 0.21866700047021365, + "grad_norm": 0.81640625, + "learning_rate": 0.00019043364362283897, + "loss": 1.1488, + "step": 8516 + }, + { + "epoch": 0.21869267766613548, + "grad_norm": 0.7578125, + "learning_rate": 0.00019043173812060537, + "loss": 0.9045, + "step": 8517 + }, + { + "epoch": 0.2187183548620573, + "grad_norm": 0.80859375, + "learning_rate": 0.0001904298324381495, + "loss": 0.9905, + "step": 8518 + }, + { + "epoch": 0.2187440320579791, + "grad_norm": 0.8671875, + "learning_rate": 0.00019042792657547513, + "loss": 0.9671, + "step": 8519 + }, + { + "epoch": 0.21876970925390093, + "grad_norm": 0.7578125, + "learning_rate": 0.00019042602053258607, + "loss": 0.9752, + "step": 8520 + }, + { + "epoch": 0.21879538644982274, + "grad_norm": 0.83203125, + "learning_rate": 0.00019042411430948613, + "loss": 1.0222, + "step": 8521 + }, + { + "epoch": 0.21882106364574458, + "grad_norm": 0.80078125, + "learning_rate": 0.00019042220790617908, + "loss": 1.0393, + "step": 8522 + }, + { + "epoch": 0.2188467408416664, + "grad_norm": 0.76953125, + "learning_rate": 0.00019042030132266873, + "loss": 1.1346, + "step": 8523 + }, + { + "epoch": 0.2188724180375882, + "grad_norm": 0.7734375, + "learning_rate": 0.00019041839455895892, + "loss": 0.8965, + "step": 8524 + }, + { + "epoch": 0.21889809523351003, + "grad_norm": 0.81640625, + "learning_rate": 0.00019041648761505337, + "loss": 1.0976, + "step": 8525 + }, + { + "epoch": 0.21892377242943184, + "grad_norm": 0.77734375, + "learning_rate": 0.00019041458049095597, + "loss": 0.8779, + "step": 8526 + }, + { + "epoch": 0.21894944962535368, + "grad_norm": 0.8203125, + "learning_rate": 0.00019041267318667044, + "loss": 0.9391, + "step": 8527 + }, + { + "epoch": 0.21897512682127548, + "grad_norm": 0.84765625, + "learning_rate": 0.00019041076570220066, + "loss": 1.1055, + "step": 8528 + }, + { + "epoch": 0.2190008040171973, + "grad_norm": 0.8125, + "learning_rate": 0.00019040885803755036, + "loss": 1.1178, + "step": 8529 + }, + { + "epoch": 0.21902648121311913, + "grad_norm": 0.81640625, + "learning_rate": 0.0001904069501927234, + "loss": 1.0836, + "step": 8530 + }, + { + "epoch": 0.21905215840904094, + "grad_norm": 0.76171875, + "learning_rate": 0.0001904050421677235, + "loss": 1.0271, + "step": 8531 + }, + { + "epoch": 0.21907783560496277, + "grad_norm": 0.7578125, + "learning_rate": 0.00019040313396255458, + "loss": 1.093, + "step": 8532 + }, + { + "epoch": 0.21910351280088458, + "grad_norm": 0.82421875, + "learning_rate": 0.00019040122557722038, + "loss": 1.0533, + "step": 8533 + }, + { + "epoch": 0.2191291899968064, + "grad_norm": 0.71875, + "learning_rate": 0.00019039931701172468, + "loss": 0.9221, + "step": 8534 + }, + { + "epoch": 0.21915486719272823, + "grad_norm": 0.7578125, + "learning_rate": 0.00019039740826607131, + "loss": 1.0102, + "step": 8535 + }, + { + "epoch": 0.21918054438865003, + "grad_norm": 0.90234375, + "learning_rate": 0.00019039549934026407, + "loss": 1.0727, + "step": 8536 + }, + { + "epoch": 0.21920622158457187, + "grad_norm": 0.83984375, + "learning_rate": 0.00019039359023430676, + "loss": 0.9837, + "step": 8537 + }, + { + "epoch": 0.21923189878049368, + "grad_norm": 0.875, + "learning_rate": 0.0001903916809482032, + "loss": 1.0306, + "step": 8538 + }, + { + "epoch": 0.2192575759764155, + "grad_norm": 0.84765625, + "learning_rate": 0.00019038977148195719, + "loss": 1.0405, + "step": 8539 + }, + { + "epoch": 0.21928325317233732, + "grad_norm": 0.86328125, + "learning_rate": 0.0001903878618355725, + "loss": 1.1126, + "step": 8540 + }, + { + "epoch": 0.21930893036825913, + "grad_norm": 0.84375, + "learning_rate": 0.00019038595200905297, + "loss": 1.0232, + "step": 8541 + }, + { + "epoch": 0.21933460756418097, + "grad_norm": 0.8828125, + "learning_rate": 0.00019038404200240244, + "loss": 0.9985, + "step": 8542 + }, + { + "epoch": 0.21936028476010277, + "grad_norm": 0.8125, + "learning_rate": 0.00019038213181562467, + "loss": 1.104, + "step": 8543 + }, + { + "epoch": 0.21938596195602458, + "grad_norm": 0.91015625, + "learning_rate": 0.00019038022144872346, + "loss": 1.0606, + "step": 8544 + }, + { + "epoch": 0.21941163915194642, + "grad_norm": 0.76953125, + "learning_rate": 0.00019037831090170263, + "loss": 0.9857, + "step": 8545 + }, + { + "epoch": 0.21943731634786823, + "grad_norm": 0.82421875, + "learning_rate": 0.00019037640017456598, + "loss": 1.0751, + "step": 8546 + }, + { + "epoch": 0.21946299354379006, + "grad_norm": 0.796875, + "learning_rate": 0.00019037448926731732, + "loss": 1.196, + "step": 8547 + }, + { + "epoch": 0.21948867073971187, + "grad_norm": 0.86328125, + "learning_rate": 0.00019037257817996052, + "loss": 0.9803, + "step": 8548 + }, + { + "epoch": 0.21951434793563368, + "grad_norm": 0.84375, + "learning_rate": 0.0001903706669124993, + "loss": 0.997, + "step": 8549 + }, + { + "epoch": 0.21954002513155552, + "grad_norm": 0.83203125, + "learning_rate": 0.00019036875546493752, + "loss": 0.9239, + "step": 8550 + }, + { + "epoch": 0.21956570232747732, + "grad_norm": 0.74609375, + "learning_rate": 0.00019036684383727894, + "loss": 1.1344, + "step": 8551 + }, + { + "epoch": 0.21959137952339916, + "grad_norm": 0.765625, + "learning_rate": 0.00019036493202952744, + "loss": 1.0595, + "step": 8552 + }, + { + "epoch": 0.21961705671932097, + "grad_norm": 0.7890625, + "learning_rate": 0.00019036302004168677, + "loss": 1.082, + "step": 8553 + }, + { + "epoch": 0.21964273391524278, + "grad_norm": 0.74609375, + "learning_rate": 0.0001903611078737608, + "loss": 0.8654, + "step": 8554 + }, + { + "epoch": 0.2196684111111646, + "grad_norm": 0.734375, + "learning_rate": 0.00019035919552575322, + "loss": 1.0626, + "step": 8555 + }, + { + "epoch": 0.21969408830708642, + "grad_norm": 0.828125, + "learning_rate": 0.000190357282997668, + "loss": 1.056, + "step": 8556 + }, + { + "epoch": 0.21971976550300826, + "grad_norm": 0.765625, + "learning_rate": 0.00019035537028950884, + "loss": 1.1107, + "step": 8557 + }, + { + "epoch": 0.21974544269893007, + "grad_norm": 0.84375, + "learning_rate": 0.00019035345740127962, + "loss": 0.9014, + "step": 8558 + }, + { + "epoch": 0.21977111989485187, + "grad_norm": 0.765625, + "learning_rate": 0.0001903515443329841, + "loss": 1.0324, + "step": 8559 + }, + { + "epoch": 0.2197967970907737, + "grad_norm": 0.7734375, + "learning_rate": 0.00019034963108462613, + "loss": 1.07, + "step": 8560 + }, + { + "epoch": 0.21982247428669552, + "grad_norm": 0.78125, + "learning_rate": 0.0001903477176562095, + "loss": 0.9595, + "step": 8561 + }, + { + "epoch": 0.21984815148261735, + "grad_norm": 0.76171875, + "learning_rate": 0.00019034580404773803, + "loss": 0.999, + "step": 8562 + }, + { + "epoch": 0.21987382867853916, + "grad_norm": 0.8359375, + "learning_rate": 0.00019034389025921554, + "loss": 1.0035, + "step": 8563 + }, + { + "epoch": 0.21989950587446097, + "grad_norm": 0.80859375, + "learning_rate": 0.0001903419762906458, + "loss": 1.0328, + "step": 8564 + }, + { + "epoch": 0.2199251830703828, + "grad_norm": 1.4765625, + "learning_rate": 0.0001903400621420327, + "loss": 0.9602, + "step": 8565 + }, + { + "epoch": 0.21995086026630462, + "grad_norm": 0.80859375, + "learning_rate": 0.00019033814781338002, + "loss": 1.1476, + "step": 8566 + }, + { + "epoch": 0.21997653746222645, + "grad_norm": 0.78515625, + "learning_rate": 0.00019033623330469155, + "loss": 0.9558, + "step": 8567 + }, + { + "epoch": 0.22000221465814826, + "grad_norm": 0.76171875, + "learning_rate": 0.00019033431861597113, + "loss": 1.0657, + "step": 8568 + }, + { + "epoch": 0.22002789185407007, + "grad_norm": 0.88671875, + "learning_rate": 0.0001903324037472226, + "loss": 1.1347, + "step": 8569 + }, + { + "epoch": 0.2200535690499919, + "grad_norm": 0.83203125, + "learning_rate": 0.0001903304886984497, + "loss": 0.9005, + "step": 8570 + }, + { + "epoch": 0.2200792462459137, + "grad_norm": 0.90234375, + "learning_rate": 0.00019032857346965634, + "loss": 1.0999, + "step": 8571 + }, + { + "epoch": 0.22010492344183552, + "grad_norm": 0.80859375, + "learning_rate": 0.00019032665806084626, + "loss": 0.9525, + "step": 8572 + }, + { + "epoch": 0.22013060063775736, + "grad_norm": 0.78125, + "learning_rate": 0.00019032474247202333, + "loss": 0.9868, + "step": 8573 + }, + { + "epoch": 0.22015627783367916, + "grad_norm": 0.76953125, + "learning_rate": 0.00019032282670319135, + "loss": 0.8776, + "step": 8574 + }, + { + "epoch": 0.220181955029601, + "grad_norm": 0.80078125, + "learning_rate": 0.00019032091075435413, + "loss": 1.0874, + "step": 8575 + }, + { + "epoch": 0.2202076322255228, + "grad_norm": 0.7890625, + "learning_rate": 0.00019031899462551547, + "loss": 1.0586, + "step": 8576 + }, + { + "epoch": 0.22023330942144462, + "grad_norm": 0.8125, + "learning_rate": 0.00019031707831667925, + "loss": 1.0443, + "step": 8577 + }, + { + "epoch": 0.22025898661736645, + "grad_norm": 0.75, + "learning_rate": 0.00019031516182784923, + "loss": 0.9686, + "step": 8578 + }, + { + "epoch": 0.22028466381328826, + "grad_norm": 0.84765625, + "learning_rate": 0.00019031324515902926, + "loss": 1.1104, + "step": 8579 + }, + { + "epoch": 0.2203103410092101, + "grad_norm": 0.80078125, + "learning_rate": 0.00019031132831022314, + "loss": 1.0438, + "step": 8580 + }, + { + "epoch": 0.2203360182051319, + "grad_norm": 0.80078125, + "learning_rate": 0.00019030941128143471, + "loss": 0.9475, + "step": 8581 + }, + { + "epoch": 0.22036169540105371, + "grad_norm": 0.82421875, + "learning_rate": 0.00019030749407266778, + "loss": 0.948, + "step": 8582 + }, + { + "epoch": 0.22038737259697555, + "grad_norm": 0.75390625, + "learning_rate": 0.00019030557668392616, + "loss": 1.058, + "step": 8583 + }, + { + "epoch": 0.22041304979289736, + "grad_norm": 0.78125, + "learning_rate": 0.00019030365911521367, + "loss": 0.9861, + "step": 8584 + }, + { + "epoch": 0.2204387269888192, + "grad_norm": 0.87890625, + "learning_rate": 0.00019030174136653417, + "loss": 1.1219, + "step": 8585 + }, + { + "epoch": 0.220464404184741, + "grad_norm": 0.84375, + "learning_rate": 0.00019029982343789147, + "loss": 1.0271, + "step": 8586 + }, + { + "epoch": 0.2204900813806628, + "grad_norm": 0.7890625, + "learning_rate": 0.00019029790532928934, + "loss": 0.9958, + "step": 8587 + }, + { + "epoch": 0.22051575857658465, + "grad_norm": 0.75, + "learning_rate": 0.0001902959870407317, + "loss": 0.9906, + "step": 8588 + }, + { + "epoch": 0.22054143577250646, + "grad_norm": 0.796875, + "learning_rate": 0.00019029406857222226, + "loss": 1.081, + "step": 8589 + }, + { + "epoch": 0.2205671129684283, + "grad_norm": 0.75, + "learning_rate": 0.00019029214992376492, + "loss": 1.0075, + "step": 8590 + }, + { + "epoch": 0.2205927901643501, + "grad_norm": 0.7890625, + "learning_rate": 0.00019029023109536346, + "loss": 1.1331, + "step": 8591 + }, + { + "epoch": 0.2206184673602719, + "grad_norm": 0.87109375, + "learning_rate": 0.00019028831208702177, + "loss": 0.991, + "step": 8592 + }, + { + "epoch": 0.22064414455619374, + "grad_norm": 0.84375, + "learning_rate": 0.00019028639289874358, + "loss": 1.0349, + "step": 8593 + }, + { + "epoch": 0.22066982175211555, + "grad_norm": 0.8359375, + "learning_rate": 0.00019028447353053277, + "loss": 0.991, + "step": 8594 + }, + { + "epoch": 0.2206954989480374, + "grad_norm": 0.796875, + "learning_rate": 0.0001902825539823932, + "loss": 1.0104, + "step": 8595 + }, + { + "epoch": 0.2207211761439592, + "grad_norm": 0.78515625, + "learning_rate": 0.0001902806342543286, + "loss": 1.1315, + "step": 8596 + }, + { + "epoch": 0.220746853339881, + "grad_norm": 0.84375, + "learning_rate": 0.00019027871434634287, + "loss": 1.0294, + "step": 8597 + }, + { + "epoch": 0.22077253053580284, + "grad_norm": 0.85546875, + "learning_rate": 0.00019027679425843985, + "loss": 1.0455, + "step": 8598 + }, + { + "epoch": 0.22079820773172465, + "grad_norm": 0.8203125, + "learning_rate": 0.00019027487399062332, + "loss": 0.9524, + "step": 8599 + }, + { + "epoch": 0.22082388492764649, + "grad_norm": 0.8203125, + "learning_rate": 0.0001902729535428971, + "loss": 0.9889, + "step": 8600 + }, + { + "epoch": 0.2208495621235683, + "grad_norm": 0.77734375, + "learning_rate": 0.00019027103291526505, + "loss": 1.0903, + "step": 8601 + }, + { + "epoch": 0.2208752393194901, + "grad_norm": 0.78515625, + "learning_rate": 0.000190269112107731, + "loss": 1.0203, + "step": 8602 + }, + { + "epoch": 0.22090091651541194, + "grad_norm": 0.828125, + "learning_rate": 0.00019026719112029875, + "loss": 1.0265, + "step": 8603 + }, + { + "epoch": 0.22092659371133375, + "grad_norm": 0.7890625, + "learning_rate": 0.00019026526995297212, + "loss": 1.0376, + "step": 8604 + }, + { + "epoch": 0.22095227090725558, + "grad_norm": 0.80859375, + "learning_rate": 0.000190263348605755, + "loss": 0.9469, + "step": 8605 + }, + { + "epoch": 0.2209779481031774, + "grad_norm": 0.76953125, + "learning_rate": 0.00019026142707865114, + "loss": 1.0969, + "step": 8606 + }, + { + "epoch": 0.2210036252990992, + "grad_norm": 0.8359375, + "learning_rate": 0.00019025950537166443, + "loss": 1.1192, + "step": 8607 + }, + { + "epoch": 0.22102930249502103, + "grad_norm": 0.82421875, + "learning_rate": 0.0001902575834847987, + "loss": 1.1607, + "step": 8608 + }, + { + "epoch": 0.22105497969094284, + "grad_norm": 0.8671875, + "learning_rate": 0.00019025566141805773, + "loss": 1.1557, + "step": 8609 + }, + { + "epoch": 0.22108065688686468, + "grad_norm": 0.7890625, + "learning_rate": 0.0001902537391714454, + "loss": 0.9404, + "step": 8610 + }, + { + "epoch": 0.2211063340827865, + "grad_norm": 0.76171875, + "learning_rate": 0.00019025181674496547, + "loss": 0.9756, + "step": 8611 + }, + { + "epoch": 0.2211320112787083, + "grad_norm": 0.80859375, + "learning_rate": 0.0001902498941386219, + "loss": 0.8952, + "step": 8612 + }, + { + "epoch": 0.22115768847463013, + "grad_norm": 0.8046875, + "learning_rate": 0.0001902479713524184, + "loss": 0.9838, + "step": 8613 + }, + { + "epoch": 0.22118336567055194, + "grad_norm": 0.8046875, + "learning_rate": 0.0001902460483863588, + "loss": 1.0404, + "step": 8614 + }, + { + "epoch": 0.22120904286647378, + "grad_norm": 0.82421875, + "learning_rate": 0.00019024412524044703, + "loss": 1.0043, + "step": 8615 + }, + { + "epoch": 0.22123472006239558, + "grad_norm": 0.703125, + "learning_rate": 0.0001902422019146869, + "loss": 0.9674, + "step": 8616 + }, + { + "epoch": 0.2212603972583174, + "grad_norm": 0.78125, + "learning_rate": 0.00019024027840908216, + "loss": 0.8838, + "step": 8617 + }, + { + "epoch": 0.22128607445423923, + "grad_norm": 0.84375, + "learning_rate": 0.0001902383547236367, + "loss": 0.8831, + "step": 8618 + }, + { + "epoch": 0.22131175165016104, + "grad_norm": 0.875, + "learning_rate": 0.00019023643085835438, + "loss": 1.0747, + "step": 8619 + }, + { + "epoch": 0.22133742884608287, + "grad_norm": 0.828125, + "learning_rate": 0.000190234506813239, + "loss": 0.973, + "step": 8620 + }, + { + "epoch": 0.22136310604200468, + "grad_norm": 0.83984375, + "learning_rate": 0.00019023258258829437, + "loss": 1.0828, + "step": 8621 + }, + { + "epoch": 0.2213887832379265, + "grad_norm": 0.80859375, + "learning_rate": 0.0001902306581835244, + "loss": 0.9904, + "step": 8622 + }, + { + "epoch": 0.22141446043384833, + "grad_norm": 0.84375, + "learning_rate": 0.00019022873359893282, + "loss": 1.0207, + "step": 8623 + }, + { + "epoch": 0.22144013762977013, + "grad_norm": 0.8125, + "learning_rate": 0.00019022680883452355, + "loss": 1.1883, + "step": 8624 + }, + { + "epoch": 0.22146581482569197, + "grad_norm": 0.77734375, + "learning_rate": 0.00019022488389030043, + "loss": 0.9653, + "step": 8625 + }, + { + "epoch": 0.22149149202161378, + "grad_norm": 0.83984375, + "learning_rate": 0.00019022295876626723, + "loss": 1.0094, + "step": 8626 + }, + { + "epoch": 0.2215171692175356, + "grad_norm": 0.78515625, + "learning_rate": 0.00019022103346242784, + "loss": 1.0695, + "step": 8627 + }, + { + "epoch": 0.22154284641345742, + "grad_norm": 0.7734375, + "learning_rate": 0.00019021910797878606, + "loss": 0.9947, + "step": 8628 + }, + { + "epoch": 0.22156852360937923, + "grad_norm": 0.75390625, + "learning_rate": 0.00019021718231534577, + "loss": 1.0903, + "step": 8629 + }, + { + "epoch": 0.22159420080530107, + "grad_norm": 0.78515625, + "learning_rate": 0.00019021525647211078, + "loss": 1.0524, + "step": 8630 + }, + { + "epoch": 0.22161987800122288, + "grad_norm": 0.78125, + "learning_rate": 0.0001902133304490849, + "loss": 1.1325, + "step": 8631 + }, + { + "epoch": 0.22164555519714468, + "grad_norm": 0.75, + "learning_rate": 0.00019021140424627206, + "loss": 1.0422, + "step": 8632 + }, + { + "epoch": 0.22167123239306652, + "grad_norm": 0.83203125, + "learning_rate": 0.000190209477863676, + "loss": 1.0898, + "step": 8633 + }, + { + "epoch": 0.22169690958898833, + "grad_norm": 0.796875, + "learning_rate": 0.00019020755130130063, + "loss": 1.0189, + "step": 8634 + }, + { + "epoch": 0.22172258678491016, + "grad_norm": 0.88671875, + "learning_rate": 0.00019020562455914974, + "loss": 1.1236, + "step": 8635 + }, + { + "epoch": 0.22174826398083197, + "grad_norm": 0.76953125, + "learning_rate": 0.00019020369763722718, + "loss": 0.9937, + "step": 8636 + }, + { + "epoch": 0.22177394117675378, + "grad_norm": 0.80078125, + "learning_rate": 0.0001902017705355368, + "loss": 1.1009, + "step": 8637 + }, + { + "epoch": 0.22179961837267562, + "grad_norm": 0.87109375, + "learning_rate": 0.00019019984325408245, + "loss": 1.0221, + "step": 8638 + }, + { + "epoch": 0.22182529556859742, + "grad_norm": 0.7265625, + "learning_rate": 0.00019019791579286794, + "loss": 0.8972, + "step": 8639 + }, + { + "epoch": 0.22185097276451926, + "grad_norm": 0.76953125, + "learning_rate": 0.00019019598815189716, + "loss": 1.0229, + "step": 8640 + }, + { + "epoch": 0.22187664996044107, + "grad_norm": 0.72265625, + "learning_rate": 0.0001901940603311739, + "loss": 0.8379, + "step": 8641 + }, + { + "epoch": 0.22190232715636288, + "grad_norm": 0.83203125, + "learning_rate": 0.00019019213233070206, + "loss": 1.1232, + "step": 8642 + }, + { + "epoch": 0.2219280043522847, + "grad_norm": 0.8125, + "learning_rate": 0.0001901902041504854, + "loss": 0.9993, + "step": 8643 + }, + { + "epoch": 0.22195368154820652, + "grad_norm": 0.7890625, + "learning_rate": 0.00019018827579052783, + "loss": 0.968, + "step": 8644 + }, + { + "epoch": 0.22197935874412836, + "grad_norm": 0.79296875, + "learning_rate": 0.00019018634725083316, + "loss": 0.9612, + "step": 8645 + }, + { + "epoch": 0.22200503594005017, + "grad_norm": 0.8359375, + "learning_rate": 0.00019018441853140526, + "loss": 1.0712, + "step": 8646 + }, + { + "epoch": 0.22203071313597197, + "grad_norm": 0.88671875, + "learning_rate": 0.00019018248963224797, + "loss": 1.1397, + "step": 8647 + }, + { + "epoch": 0.2220563903318938, + "grad_norm": 0.8515625, + "learning_rate": 0.00019018056055336512, + "loss": 1.114, + "step": 8648 + }, + { + "epoch": 0.22208206752781562, + "grad_norm": 0.796875, + "learning_rate": 0.00019017863129476057, + "loss": 1.0915, + "step": 8649 + }, + { + "epoch": 0.22210774472373745, + "grad_norm": 0.7109375, + "learning_rate": 0.0001901767018564381, + "loss": 0.9451, + "step": 8650 + }, + { + "epoch": 0.22213342191965926, + "grad_norm": 0.79296875, + "learning_rate": 0.00019017477223840168, + "loss": 1.1048, + "step": 8651 + }, + { + "epoch": 0.22215909911558107, + "grad_norm": 0.76171875, + "learning_rate": 0.00019017284244065506, + "loss": 0.8542, + "step": 8652 + }, + { + "epoch": 0.2221847763115029, + "grad_norm": 0.76171875, + "learning_rate": 0.00019017091246320211, + "loss": 1.1159, + "step": 8653 + }, + { + "epoch": 0.22221045350742472, + "grad_norm": 0.84765625, + "learning_rate": 0.00019016898230604664, + "loss": 1.0089, + "step": 8654 + }, + { + "epoch": 0.22223613070334655, + "grad_norm": 0.875, + "learning_rate": 0.00019016705196919258, + "loss": 0.966, + "step": 8655 + }, + { + "epoch": 0.22226180789926836, + "grad_norm": 0.78125, + "learning_rate": 0.00019016512145264368, + "loss": 1.2469, + "step": 8656 + }, + { + "epoch": 0.22228748509519017, + "grad_norm": 1.140625, + "learning_rate": 0.0001901631907564039, + "loss": 1.1072, + "step": 8657 + }, + { + "epoch": 0.222313162291112, + "grad_norm": 0.78515625, + "learning_rate": 0.000190161259880477, + "loss": 0.9975, + "step": 8658 + }, + { + "epoch": 0.2223388394870338, + "grad_norm": 0.79296875, + "learning_rate": 0.00019015932882486682, + "loss": 1.1319, + "step": 8659 + }, + { + "epoch": 0.22236451668295565, + "grad_norm": 0.79296875, + "learning_rate": 0.00019015739758957727, + "loss": 0.9555, + "step": 8660 + }, + { + "epoch": 0.22239019387887746, + "grad_norm": 0.80859375, + "learning_rate": 0.0001901554661746122, + "loss": 1.0052, + "step": 8661 + }, + { + "epoch": 0.22241587107479927, + "grad_norm": 0.7578125, + "learning_rate": 0.00019015353457997537, + "loss": 0.9955, + "step": 8662 + }, + { + "epoch": 0.2224415482707211, + "grad_norm": 0.80078125, + "learning_rate": 0.00019015160280567073, + "loss": 0.998, + "step": 8663 + }, + { + "epoch": 0.2224672254666429, + "grad_norm": 0.76171875, + "learning_rate": 0.00019014967085170207, + "loss": 0.962, + "step": 8664 + }, + { + "epoch": 0.22249290266256475, + "grad_norm": 0.86328125, + "learning_rate": 0.00019014773871807328, + "loss": 1.0543, + "step": 8665 + }, + { + "epoch": 0.22251857985848655, + "grad_norm": 0.78125, + "learning_rate": 0.00019014580640478815, + "loss": 0.9549, + "step": 8666 + }, + { + "epoch": 0.22254425705440836, + "grad_norm": 0.8515625, + "learning_rate": 0.0001901438739118506, + "loss": 0.912, + "step": 8667 + }, + { + "epoch": 0.2225699342503302, + "grad_norm": 0.80078125, + "learning_rate": 0.0001901419412392645, + "loss": 0.9696, + "step": 8668 + }, + { + "epoch": 0.222595611446252, + "grad_norm": 0.81640625, + "learning_rate": 0.00019014000838703357, + "loss": 1.0194, + "step": 8669 + }, + { + "epoch": 0.22262128864217384, + "grad_norm": 0.8046875, + "learning_rate": 0.00019013807535516177, + "loss": 0.9458, + "step": 8670 + }, + { + "epoch": 0.22264696583809565, + "grad_norm": 0.8359375, + "learning_rate": 0.00019013614214365295, + "loss": 1.1599, + "step": 8671 + }, + { + "epoch": 0.22267264303401746, + "grad_norm": 0.7890625, + "learning_rate": 0.00019013420875251092, + "loss": 0.892, + "step": 8672 + }, + { + "epoch": 0.2226983202299393, + "grad_norm": 0.77734375, + "learning_rate": 0.00019013227518173954, + "loss": 0.9467, + "step": 8673 + }, + { + "epoch": 0.2227239974258611, + "grad_norm": 0.8359375, + "learning_rate": 0.00019013034143134272, + "loss": 1.0087, + "step": 8674 + }, + { + "epoch": 0.22274967462178294, + "grad_norm": 0.765625, + "learning_rate": 0.00019012840750132424, + "loss": 0.9742, + "step": 8675 + }, + { + "epoch": 0.22277535181770475, + "grad_norm": 0.75, + "learning_rate": 0.000190126473391688, + "loss": 1.0762, + "step": 8676 + }, + { + "epoch": 0.22280102901362656, + "grad_norm": 0.828125, + "learning_rate": 0.00019012453910243783, + "loss": 0.9861, + "step": 8677 + }, + { + "epoch": 0.2228267062095484, + "grad_norm": 0.78125, + "learning_rate": 0.00019012260463357761, + "loss": 1.081, + "step": 8678 + }, + { + "epoch": 0.2228523834054702, + "grad_norm": 0.8046875, + "learning_rate": 0.00019012066998511115, + "loss": 1.0535, + "step": 8679 + }, + { + "epoch": 0.22287806060139204, + "grad_norm": 0.8359375, + "learning_rate": 0.00019011873515704236, + "loss": 1.0082, + "step": 8680 + }, + { + "epoch": 0.22290373779731384, + "grad_norm": 0.80078125, + "learning_rate": 0.00019011680014937508, + "loss": 1.0475, + "step": 8681 + }, + { + "epoch": 0.22292941499323565, + "grad_norm": 0.87109375, + "learning_rate": 0.00019011486496211313, + "loss": 1.0849, + "step": 8682 + }, + { + "epoch": 0.2229550921891575, + "grad_norm": 1.0546875, + "learning_rate": 0.00019011292959526043, + "loss": 1.0472, + "step": 8683 + }, + { + "epoch": 0.2229807693850793, + "grad_norm": 0.8046875, + "learning_rate": 0.00019011099404882075, + "loss": 1.0122, + "step": 8684 + }, + { + "epoch": 0.22300644658100113, + "grad_norm": 0.890625, + "learning_rate": 0.00019010905832279803, + "loss": 0.971, + "step": 8685 + }, + { + "epoch": 0.22303212377692294, + "grad_norm": 0.953125, + "learning_rate": 0.0001901071224171961, + "loss": 1.0062, + "step": 8686 + }, + { + "epoch": 0.22305780097284475, + "grad_norm": 0.83203125, + "learning_rate": 0.0001901051863320188, + "loss": 1.182, + "step": 8687 + }, + { + "epoch": 0.22308347816876659, + "grad_norm": 0.84765625, + "learning_rate": 0.00019010325006727, + "loss": 0.9731, + "step": 8688 + }, + { + "epoch": 0.2231091553646884, + "grad_norm": 0.84375, + "learning_rate": 0.0001901013136229536, + "loss": 1.0277, + "step": 8689 + }, + { + "epoch": 0.22313483256061023, + "grad_norm": 0.91796875, + "learning_rate": 0.00019009937699907337, + "loss": 1.0468, + "step": 8690 + }, + { + "epoch": 0.22316050975653204, + "grad_norm": 0.84375, + "learning_rate": 0.00019009744019563325, + "loss": 1.0626, + "step": 8691 + }, + { + "epoch": 0.22318618695245385, + "grad_norm": 0.78515625, + "learning_rate": 0.00019009550321263705, + "loss": 1.0533, + "step": 8692 + }, + { + "epoch": 0.22321186414837568, + "grad_norm": 0.8046875, + "learning_rate": 0.00019009356605008865, + "loss": 1.0326, + "step": 8693 + }, + { + "epoch": 0.2232375413442975, + "grad_norm": 0.79296875, + "learning_rate": 0.00019009162870799192, + "loss": 1.1371, + "step": 8694 + }, + { + "epoch": 0.22326321854021933, + "grad_norm": 0.8125, + "learning_rate": 0.00019008969118635075, + "loss": 1.0384, + "step": 8695 + }, + { + "epoch": 0.22328889573614114, + "grad_norm": 0.8125, + "learning_rate": 0.00019008775348516892, + "loss": 1.0173, + "step": 8696 + }, + { + "epoch": 0.22331457293206294, + "grad_norm": 0.80859375, + "learning_rate": 0.00019008581560445034, + "loss": 1.0306, + "step": 8697 + }, + { + "epoch": 0.22334025012798478, + "grad_norm": 0.79296875, + "learning_rate": 0.00019008387754419886, + "loss": 0.9187, + "step": 8698 + }, + { + "epoch": 0.2233659273239066, + "grad_norm": 0.84375, + "learning_rate": 0.00019008193930441838, + "loss": 1.0013, + "step": 8699 + }, + { + "epoch": 0.22339160451982842, + "grad_norm": 0.76953125, + "learning_rate": 0.0001900800008851127, + "loss": 0.9672, + "step": 8700 + }, + { + "epoch": 0.22341728171575023, + "grad_norm": 0.76171875, + "learning_rate": 0.0001900780622862857, + "loss": 1.0392, + "step": 8701 + }, + { + "epoch": 0.22344295891167204, + "grad_norm": 0.73828125, + "learning_rate": 0.0001900761235079413, + "loss": 1.0313, + "step": 8702 + }, + { + "epoch": 0.22346863610759388, + "grad_norm": 0.8203125, + "learning_rate": 0.0001900741845500833, + "loss": 0.9229, + "step": 8703 + }, + { + "epoch": 0.22349431330351568, + "grad_norm": 0.78515625, + "learning_rate": 0.0001900722454127156, + "loss": 1.1409, + "step": 8704 + }, + { + "epoch": 0.22351999049943752, + "grad_norm": 0.75390625, + "learning_rate": 0.00019007030609584206, + "loss": 0.8917, + "step": 8705 + }, + { + "epoch": 0.22354566769535933, + "grad_norm": 0.82421875, + "learning_rate": 0.00019006836659946652, + "loss": 1.072, + "step": 8706 + }, + { + "epoch": 0.22357134489128114, + "grad_norm": 0.7734375, + "learning_rate": 0.00019006642692359288, + "loss": 1.0891, + "step": 8707 + }, + { + "epoch": 0.22359702208720297, + "grad_norm": 0.8359375, + "learning_rate": 0.00019006448706822496, + "loss": 1.0279, + "step": 8708 + }, + { + "epoch": 0.22362269928312478, + "grad_norm": 0.9765625, + "learning_rate": 0.00019006254703336667, + "loss": 0.9834, + "step": 8709 + }, + { + "epoch": 0.22364837647904662, + "grad_norm": 0.75, + "learning_rate": 0.00019006060681902182, + "loss": 0.9832, + "step": 8710 + }, + { + "epoch": 0.22367405367496843, + "grad_norm": 0.78125, + "learning_rate": 0.00019005866642519436, + "loss": 1.1563, + "step": 8711 + }, + { + "epoch": 0.22369973087089023, + "grad_norm": 0.79296875, + "learning_rate": 0.0001900567258518881, + "loss": 1.1457, + "step": 8712 + }, + { + "epoch": 0.22372540806681207, + "grad_norm": 0.8046875, + "learning_rate": 0.00019005478509910692, + "loss": 0.9978, + "step": 8713 + }, + { + "epoch": 0.22375108526273388, + "grad_norm": 0.8515625, + "learning_rate": 0.00019005284416685468, + "loss": 1.0162, + "step": 8714 + }, + { + "epoch": 0.22377676245865571, + "grad_norm": 0.82421875, + "learning_rate": 0.00019005090305513527, + "loss": 0.9262, + "step": 8715 + }, + { + "epoch": 0.22380243965457752, + "grad_norm": 0.8984375, + "learning_rate": 0.00019004896176395253, + "loss": 1.3515, + "step": 8716 + }, + { + "epoch": 0.22382811685049933, + "grad_norm": 0.76171875, + "learning_rate": 0.00019004702029331037, + "loss": 1.0841, + "step": 8717 + }, + { + "epoch": 0.22385379404642117, + "grad_norm": 1.03125, + "learning_rate": 0.0001900450786432126, + "loss": 0.8998, + "step": 8718 + }, + { + "epoch": 0.22387947124234298, + "grad_norm": 0.8046875, + "learning_rate": 0.00019004313681366314, + "loss": 0.9639, + "step": 8719 + }, + { + "epoch": 0.2239051484382648, + "grad_norm": 0.83203125, + "learning_rate": 0.0001900411948046658, + "loss": 1.1406, + "step": 8720 + }, + { + "epoch": 0.22393082563418662, + "grad_norm": 0.828125, + "learning_rate": 0.00019003925261622454, + "loss": 1.0159, + "step": 8721 + }, + { + "epoch": 0.22395650283010843, + "grad_norm": 0.7890625, + "learning_rate": 0.0001900373102483432, + "loss": 0.9072, + "step": 8722 + }, + { + "epoch": 0.22398218002603026, + "grad_norm": 0.828125, + "learning_rate": 0.0001900353677010256, + "loss": 1.0213, + "step": 8723 + }, + { + "epoch": 0.22400785722195207, + "grad_norm": 0.81640625, + "learning_rate": 0.00019003342497427565, + "loss": 1.032, + "step": 8724 + }, + { + "epoch": 0.2240335344178739, + "grad_norm": 0.859375, + "learning_rate": 0.00019003148206809723, + "loss": 1.0202, + "step": 8725 + }, + { + "epoch": 0.22405921161379572, + "grad_norm": 0.7890625, + "learning_rate": 0.0001900295389824942, + "loss": 1.0106, + "step": 8726 + }, + { + "epoch": 0.22408488880971753, + "grad_norm": 0.78515625, + "learning_rate": 0.0001900275957174704, + "loss": 0.9194, + "step": 8727 + }, + { + "epoch": 0.22411056600563936, + "grad_norm": 0.875, + "learning_rate": 0.00019002565227302975, + "loss": 1.1077, + "step": 8728 + }, + { + "epoch": 0.22413624320156117, + "grad_norm": 0.765625, + "learning_rate": 0.00019002370864917612, + "loss": 1.033, + "step": 8729 + }, + { + "epoch": 0.224161920397483, + "grad_norm": 0.8203125, + "learning_rate": 0.00019002176484591337, + "loss": 1.1848, + "step": 8730 + }, + { + "epoch": 0.2241875975934048, + "grad_norm": 0.80078125, + "learning_rate": 0.00019001982086324534, + "loss": 0.979, + "step": 8731 + }, + { + "epoch": 0.22421327478932662, + "grad_norm": 0.73828125, + "learning_rate": 0.00019001787670117596, + "loss": 0.8966, + "step": 8732 + }, + { + "epoch": 0.22423895198524846, + "grad_norm": 0.828125, + "learning_rate": 0.00019001593235970908, + "loss": 1.0538, + "step": 8733 + }, + { + "epoch": 0.22426462918117027, + "grad_norm": 0.7578125, + "learning_rate": 0.0001900139878388486, + "loss": 0.945, + "step": 8734 + }, + { + "epoch": 0.2242903063770921, + "grad_norm": 0.828125, + "learning_rate": 0.00019001204313859836, + "loss": 1.0126, + "step": 8735 + }, + { + "epoch": 0.2243159835730139, + "grad_norm": 0.875, + "learning_rate": 0.00019001009825896222, + "loss": 0.9716, + "step": 8736 + }, + { + "epoch": 0.22434166076893572, + "grad_norm": 0.79296875, + "learning_rate": 0.0001900081531999441, + "loss": 1.0068, + "step": 8737 + }, + { + "epoch": 0.22436733796485755, + "grad_norm": 0.78125, + "learning_rate": 0.00019000620796154788, + "loss": 0.9529, + "step": 8738 + }, + { + "epoch": 0.22439301516077936, + "grad_norm": 0.8671875, + "learning_rate": 0.0001900042625437774, + "loss": 0.9138, + "step": 8739 + }, + { + "epoch": 0.2244186923567012, + "grad_norm": 0.75390625, + "learning_rate": 0.0001900023169466366, + "loss": 1.0348, + "step": 8740 + }, + { + "epoch": 0.224444369552623, + "grad_norm": 0.79296875, + "learning_rate": 0.00019000037117012926, + "loss": 1.0168, + "step": 8741 + }, + { + "epoch": 0.22447004674854482, + "grad_norm": 0.78515625, + "learning_rate": 0.00018999842521425934, + "loss": 0.9948, + "step": 8742 + }, + { + "epoch": 0.22449572394446665, + "grad_norm": 0.85546875, + "learning_rate": 0.00018999647907903065, + "loss": 1.0397, + "step": 8743 + }, + { + "epoch": 0.22452140114038846, + "grad_norm": 0.80078125, + "learning_rate": 0.00018999453276444714, + "loss": 0.9386, + "step": 8744 + }, + { + "epoch": 0.2245470783363103, + "grad_norm": 0.83203125, + "learning_rate": 0.00018999258627051265, + "loss": 1.0861, + "step": 8745 + }, + { + "epoch": 0.2245727555322321, + "grad_norm": 0.74609375, + "learning_rate": 0.00018999063959723107, + "loss": 1.044, + "step": 8746 + }, + { + "epoch": 0.2245984327281539, + "grad_norm": 0.82421875, + "learning_rate": 0.00018998869274460626, + "loss": 0.9908, + "step": 8747 + }, + { + "epoch": 0.22462410992407575, + "grad_norm": 0.84765625, + "learning_rate": 0.00018998674571264212, + "loss": 1.1096, + "step": 8748 + }, + { + "epoch": 0.22464978711999756, + "grad_norm": 0.7890625, + "learning_rate": 0.0001899847985013425, + "loss": 1.0982, + "step": 8749 + }, + { + "epoch": 0.2246754643159194, + "grad_norm": 0.8828125, + "learning_rate": 0.00018998285111071134, + "loss": 1.1614, + "step": 8750 + }, + { + "epoch": 0.2247011415118412, + "grad_norm": 0.82421875, + "learning_rate": 0.0001899809035407525, + "loss": 1.0216, + "step": 8751 + }, + { + "epoch": 0.224726818707763, + "grad_norm": 0.7578125, + "learning_rate": 0.00018997895579146978, + "loss": 1.0333, + "step": 8752 + }, + { + "epoch": 0.22475249590368485, + "grad_norm": 0.75390625, + "learning_rate": 0.00018997700786286718, + "loss": 1.1013, + "step": 8753 + }, + { + "epoch": 0.22477817309960665, + "grad_norm": 0.83984375, + "learning_rate": 0.00018997505975494852, + "loss": 1.0576, + "step": 8754 + }, + { + "epoch": 0.2248038502955285, + "grad_norm": 0.765625, + "learning_rate": 0.0001899731114677177, + "loss": 0.9128, + "step": 8755 + }, + { + "epoch": 0.2248295274914503, + "grad_norm": 0.7578125, + "learning_rate": 0.0001899711630011786, + "loss": 0.8875, + "step": 8756 + }, + { + "epoch": 0.2248552046873721, + "grad_norm": 0.71484375, + "learning_rate": 0.0001899692143553351, + "loss": 0.9186, + "step": 8757 + }, + { + "epoch": 0.22488088188329394, + "grad_norm": 0.7734375, + "learning_rate": 0.00018996726553019105, + "loss": 1.0116, + "step": 8758 + }, + { + "epoch": 0.22490655907921575, + "grad_norm": 0.796875, + "learning_rate": 0.00018996531652575038, + "loss": 1.0279, + "step": 8759 + }, + { + "epoch": 0.2249322362751376, + "grad_norm": 0.81640625, + "learning_rate": 0.00018996336734201696, + "loss": 0.9048, + "step": 8760 + }, + { + "epoch": 0.2249579134710594, + "grad_norm": 0.79296875, + "learning_rate": 0.0001899614179789947, + "loss": 1.0751, + "step": 8761 + }, + { + "epoch": 0.2249835906669812, + "grad_norm": 0.76953125, + "learning_rate": 0.00018995946843668743, + "loss": 0.9742, + "step": 8762 + }, + { + "epoch": 0.22500926786290304, + "grad_norm": 0.87109375, + "learning_rate": 0.00018995751871509907, + "loss": 1.0668, + "step": 8763 + }, + { + "epoch": 0.22503494505882485, + "grad_norm": 0.81640625, + "learning_rate": 0.00018995556881423354, + "loss": 1.0046, + "step": 8764 + }, + { + "epoch": 0.22506062225474668, + "grad_norm": 0.80859375, + "learning_rate": 0.00018995361873409464, + "loss": 1.0267, + "step": 8765 + }, + { + "epoch": 0.2250862994506685, + "grad_norm": 0.85546875, + "learning_rate": 0.00018995166847468633, + "loss": 1.1304, + "step": 8766 + }, + { + "epoch": 0.2251119766465903, + "grad_norm": 0.82421875, + "learning_rate": 0.00018994971803601245, + "loss": 0.9087, + "step": 8767 + }, + { + "epoch": 0.22513765384251214, + "grad_norm": 0.7421875, + "learning_rate": 0.0001899477674180769, + "loss": 1.0053, + "step": 8768 + }, + { + "epoch": 0.22516333103843394, + "grad_norm": 0.75390625, + "learning_rate": 0.0001899458166208836, + "loss": 1.0153, + "step": 8769 + }, + { + "epoch": 0.22518900823435578, + "grad_norm": 0.84375, + "learning_rate": 0.00018994386564443643, + "loss": 1.0753, + "step": 8770 + }, + { + "epoch": 0.2252146854302776, + "grad_norm": 0.9140625, + "learning_rate": 0.00018994191448873924, + "loss": 1.1601, + "step": 8771 + }, + { + "epoch": 0.2252403626261994, + "grad_norm": 0.7890625, + "learning_rate": 0.00018993996315379592, + "loss": 0.9589, + "step": 8772 + }, + { + "epoch": 0.22526603982212123, + "grad_norm": 0.75390625, + "learning_rate": 0.00018993801163961037, + "loss": 0.9481, + "step": 8773 + }, + { + "epoch": 0.22529171701804304, + "grad_norm": 0.82421875, + "learning_rate": 0.00018993605994618653, + "loss": 1.111, + "step": 8774 + }, + { + "epoch": 0.22531739421396488, + "grad_norm": 0.796875, + "learning_rate": 0.00018993410807352824, + "loss": 0.9544, + "step": 8775 + }, + { + "epoch": 0.22534307140988669, + "grad_norm": 0.796875, + "learning_rate": 0.00018993215602163937, + "loss": 1.1313, + "step": 8776 + }, + { + "epoch": 0.2253687486058085, + "grad_norm": 0.8203125, + "learning_rate": 0.00018993020379052384, + "loss": 1.0992, + "step": 8777 + }, + { + "epoch": 0.22539442580173033, + "grad_norm": 0.80078125, + "learning_rate": 0.00018992825138018556, + "loss": 1.0888, + "step": 8778 + }, + { + "epoch": 0.22542010299765214, + "grad_norm": 0.8125, + "learning_rate": 0.00018992629879062837, + "loss": 1.0451, + "step": 8779 + }, + { + "epoch": 0.22544578019357395, + "grad_norm": 0.80078125, + "learning_rate": 0.0001899243460218562, + "loss": 1.0566, + "step": 8780 + }, + { + "epoch": 0.22547145738949578, + "grad_norm": 0.76953125, + "learning_rate": 0.00018992239307387293, + "loss": 1.0432, + "step": 8781 + }, + { + "epoch": 0.2254971345854176, + "grad_norm": 0.83984375, + "learning_rate": 0.00018992043994668246, + "loss": 1.0575, + "step": 8782 + }, + { + "epoch": 0.22552281178133943, + "grad_norm": 0.8046875, + "learning_rate": 0.0001899184866402887, + "loss": 1.0322, + "step": 8783 + }, + { + "epoch": 0.22554848897726124, + "grad_norm": 0.75390625, + "learning_rate": 0.00018991653315469548, + "loss": 0.958, + "step": 8784 + }, + { + "epoch": 0.22557416617318304, + "grad_norm": 0.73828125, + "learning_rate": 0.00018991457948990673, + "loss": 0.9287, + "step": 8785 + }, + { + "epoch": 0.22559984336910488, + "grad_norm": 0.7890625, + "learning_rate": 0.00018991262564592635, + "loss": 1.0076, + "step": 8786 + }, + { + "epoch": 0.2256255205650267, + "grad_norm": 0.78125, + "learning_rate": 0.00018991067162275825, + "loss": 1.2062, + "step": 8787 + }, + { + "epoch": 0.22565119776094852, + "grad_norm": 0.78125, + "learning_rate": 0.0001899087174204063, + "loss": 1.04, + "step": 8788 + }, + { + "epoch": 0.22567687495687033, + "grad_norm": 0.8125, + "learning_rate": 0.00018990676303887438, + "loss": 1.112, + "step": 8789 + }, + { + "epoch": 0.22570255215279214, + "grad_norm": 0.8203125, + "learning_rate": 0.0001899048084781664, + "loss": 1.0638, + "step": 8790 + }, + { + "epoch": 0.22572822934871398, + "grad_norm": 0.75, + "learning_rate": 0.00018990285373828627, + "loss": 0.9877, + "step": 8791 + }, + { + "epoch": 0.22575390654463579, + "grad_norm": 0.73828125, + "learning_rate": 0.00018990089881923788, + "loss": 1.023, + "step": 8792 + }, + { + "epoch": 0.22577958374055762, + "grad_norm": 0.8125, + "learning_rate": 0.0001898989437210251, + "loss": 1.0864, + "step": 8793 + }, + { + "epoch": 0.22580526093647943, + "grad_norm": 0.84375, + "learning_rate": 0.00018989698844365188, + "loss": 0.8876, + "step": 8794 + }, + { + "epoch": 0.22583093813240124, + "grad_norm": 0.81640625, + "learning_rate": 0.00018989503298712202, + "loss": 1.1076, + "step": 8795 + }, + { + "epoch": 0.22585661532832307, + "grad_norm": 0.76171875, + "learning_rate": 0.00018989307735143952, + "loss": 0.9807, + "step": 8796 + }, + { + "epoch": 0.22588229252424488, + "grad_norm": 0.8828125, + "learning_rate": 0.00018989112153660825, + "loss": 1.025, + "step": 8797 + }, + { + "epoch": 0.22590796972016672, + "grad_norm": 0.8359375, + "learning_rate": 0.00018988916554263206, + "loss": 1.1281, + "step": 8798 + }, + { + "epoch": 0.22593364691608853, + "grad_norm": 0.828125, + "learning_rate": 0.0001898872093695149, + "loss": 1.1343, + "step": 8799 + }, + { + "epoch": 0.22595932411201033, + "grad_norm": 0.91796875, + "learning_rate": 0.00018988525301726067, + "loss": 0.9511, + "step": 8800 + }, + { + "epoch": 0.22598500130793217, + "grad_norm": 0.7578125, + "learning_rate": 0.00018988329648587324, + "loss": 1.2331, + "step": 8801 + }, + { + "epoch": 0.22601067850385398, + "grad_norm": 0.7890625, + "learning_rate": 0.0001898813397753565, + "loss": 0.9939, + "step": 8802 + }, + { + "epoch": 0.22603635569977581, + "grad_norm": 0.828125, + "learning_rate": 0.00018987938288571437, + "loss": 1.0031, + "step": 8803 + }, + { + "epoch": 0.22606203289569762, + "grad_norm": 0.8515625, + "learning_rate": 0.0001898774258169508, + "loss": 0.9794, + "step": 8804 + }, + { + "epoch": 0.22608771009161943, + "grad_norm": 0.7734375, + "learning_rate": 0.00018987546856906958, + "loss": 1.0883, + "step": 8805 + }, + { + "epoch": 0.22611338728754127, + "grad_norm": 0.85546875, + "learning_rate": 0.00018987351114207467, + "loss": 0.9953, + "step": 8806 + }, + { + "epoch": 0.22613906448346308, + "grad_norm": 0.90234375, + "learning_rate": 0.00018987155353597, + "loss": 1.0042, + "step": 8807 + }, + { + "epoch": 0.2261647416793849, + "grad_norm": 0.80078125, + "learning_rate": 0.00018986959575075944, + "loss": 1.0444, + "step": 8808 + }, + { + "epoch": 0.22619041887530672, + "grad_norm": 0.8671875, + "learning_rate": 0.00018986763778644687, + "loss": 1.0826, + "step": 8809 + }, + { + "epoch": 0.22621609607122853, + "grad_norm": 0.7734375, + "learning_rate": 0.00018986567964303627, + "loss": 0.9354, + "step": 8810 + }, + { + "epoch": 0.22624177326715036, + "grad_norm": 0.8203125, + "learning_rate": 0.00018986372132053143, + "loss": 1.12, + "step": 8811 + }, + { + "epoch": 0.22626745046307217, + "grad_norm": 0.80078125, + "learning_rate": 0.00018986176281893632, + "loss": 0.9835, + "step": 8812 + }, + { + "epoch": 0.226293127658994, + "grad_norm": 0.8203125, + "learning_rate": 0.00018985980413825483, + "loss": 1.1239, + "step": 8813 + }, + { + "epoch": 0.22631880485491582, + "grad_norm": 0.72265625, + "learning_rate": 0.00018985784527849087, + "loss": 0.884, + "step": 8814 + }, + { + "epoch": 0.22634448205083763, + "grad_norm": 0.79296875, + "learning_rate": 0.00018985588623964835, + "loss": 1.114, + "step": 8815 + }, + { + "epoch": 0.22637015924675946, + "grad_norm": 0.81640625, + "learning_rate": 0.0001898539270217312, + "loss": 1.049, + "step": 8816 + }, + { + "epoch": 0.22639583644268127, + "grad_norm": 0.78515625, + "learning_rate": 0.00018985196762474327, + "loss": 1.09, + "step": 8817 + }, + { + "epoch": 0.2264215136386031, + "grad_norm": 0.78125, + "learning_rate": 0.00018985000804868846, + "loss": 0.8892, + "step": 8818 + }, + { + "epoch": 0.22644719083452491, + "grad_norm": 0.77734375, + "learning_rate": 0.0001898480482935707, + "loss": 1.0261, + "step": 8819 + }, + { + "epoch": 0.22647286803044672, + "grad_norm": 0.84375, + "learning_rate": 0.0001898460883593939, + "loss": 1.0447, + "step": 8820 + }, + { + "epoch": 0.22649854522636856, + "grad_norm": 0.81640625, + "learning_rate": 0.00018984412824616197, + "loss": 1.1194, + "step": 8821 + }, + { + "epoch": 0.22652422242229037, + "grad_norm": 0.7734375, + "learning_rate": 0.00018984216795387882, + "loss": 1.0712, + "step": 8822 + }, + { + "epoch": 0.2265498996182122, + "grad_norm": 0.83203125, + "learning_rate": 0.00018984020748254833, + "loss": 0.9954, + "step": 8823 + }, + { + "epoch": 0.226575576814134, + "grad_norm": 0.8203125, + "learning_rate": 0.00018983824683217442, + "loss": 1.0679, + "step": 8824 + }, + { + "epoch": 0.22660125401005582, + "grad_norm": 0.78125, + "learning_rate": 0.000189836286002761, + "loss": 1.0742, + "step": 8825 + }, + { + "epoch": 0.22662693120597766, + "grad_norm": 0.7890625, + "learning_rate": 0.00018983432499431198, + "loss": 1.0728, + "step": 8826 + }, + { + "epoch": 0.22665260840189946, + "grad_norm": 0.8125, + "learning_rate": 0.00018983236380683125, + "loss": 1.0125, + "step": 8827 + }, + { + "epoch": 0.2266782855978213, + "grad_norm": 0.8515625, + "learning_rate": 0.00018983040244032276, + "loss": 0.9505, + "step": 8828 + }, + { + "epoch": 0.2267039627937431, + "grad_norm": 0.8046875, + "learning_rate": 0.0001898284408947904, + "loss": 0.9066, + "step": 8829 + }, + { + "epoch": 0.22672963998966492, + "grad_norm": 0.8046875, + "learning_rate": 0.00018982647917023803, + "loss": 1.009, + "step": 8830 + }, + { + "epoch": 0.22675531718558675, + "grad_norm": 0.90234375, + "learning_rate": 0.00018982451726666963, + "loss": 0.9769, + "step": 8831 + }, + { + "epoch": 0.22678099438150856, + "grad_norm": 0.8984375, + "learning_rate": 0.00018982255518408908, + "loss": 0.9791, + "step": 8832 + }, + { + "epoch": 0.2268066715774304, + "grad_norm": 0.7421875, + "learning_rate": 0.00018982059292250026, + "loss": 0.9761, + "step": 8833 + }, + { + "epoch": 0.2268323487733522, + "grad_norm": 0.77734375, + "learning_rate": 0.00018981863048190712, + "loss": 1.0265, + "step": 8834 + }, + { + "epoch": 0.226858025969274, + "grad_norm": 0.8046875, + "learning_rate": 0.0001898166678623136, + "loss": 1.121, + "step": 8835 + }, + { + "epoch": 0.22688370316519585, + "grad_norm": 0.953125, + "learning_rate": 0.00018981470506372356, + "loss": 1.009, + "step": 8836 + }, + { + "epoch": 0.22690938036111766, + "grad_norm": 0.83984375, + "learning_rate": 0.00018981274208614091, + "loss": 0.9643, + "step": 8837 + }, + { + "epoch": 0.2269350575570395, + "grad_norm": 0.8203125, + "learning_rate": 0.0001898107789295696, + "loss": 1.1627, + "step": 8838 + }, + { + "epoch": 0.2269607347529613, + "grad_norm": 0.7734375, + "learning_rate": 0.00018980881559401352, + "loss": 1.0139, + "step": 8839 + }, + { + "epoch": 0.2269864119488831, + "grad_norm": 0.80078125, + "learning_rate": 0.00018980685207947655, + "loss": 0.9145, + "step": 8840 + }, + { + "epoch": 0.22701208914480495, + "grad_norm": 0.85546875, + "learning_rate": 0.00018980488838596265, + "loss": 1.1105, + "step": 8841 + }, + { + "epoch": 0.22703776634072675, + "grad_norm": 0.79296875, + "learning_rate": 0.00018980292451347571, + "loss": 1.0992, + "step": 8842 + }, + { + "epoch": 0.2270634435366486, + "grad_norm": 0.859375, + "learning_rate": 0.00018980096046201968, + "loss": 0.973, + "step": 8843 + }, + { + "epoch": 0.2270891207325704, + "grad_norm": 0.82421875, + "learning_rate": 0.00018979899623159844, + "loss": 0.9832, + "step": 8844 + }, + { + "epoch": 0.2271147979284922, + "grad_norm": 0.73046875, + "learning_rate": 0.0001897970318222159, + "loss": 0.954, + "step": 8845 + }, + { + "epoch": 0.22714047512441404, + "grad_norm": 0.77734375, + "learning_rate": 0.000189795067233876, + "loss": 1.1722, + "step": 8846 + }, + { + "epoch": 0.22716615232033585, + "grad_norm": 0.76171875, + "learning_rate": 0.00018979310246658264, + "loss": 1.0561, + "step": 8847 + }, + { + "epoch": 0.2271918295162577, + "grad_norm": 0.86328125, + "learning_rate": 0.00018979113752033973, + "loss": 1.1306, + "step": 8848 + }, + { + "epoch": 0.2272175067121795, + "grad_norm": 0.81640625, + "learning_rate": 0.0001897891723951512, + "loss": 0.9903, + "step": 8849 + }, + { + "epoch": 0.2272431839081013, + "grad_norm": 0.76953125, + "learning_rate": 0.00018978720709102097, + "loss": 0.9801, + "step": 8850 + }, + { + "epoch": 0.22726886110402314, + "grad_norm": 0.8359375, + "learning_rate": 0.00018978524160795292, + "loss": 0.9881, + "step": 8851 + }, + { + "epoch": 0.22729453829994495, + "grad_norm": 0.8125, + "learning_rate": 0.00018978327594595102, + "loss": 1.0035, + "step": 8852 + }, + { + "epoch": 0.22732021549586678, + "grad_norm": 0.78125, + "learning_rate": 0.00018978131010501917, + "loss": 0.9113, + "step": 8853 + }, + { + "epoch": 0.2273458926917886, + "grad_norm": 0.80078125, + "learning_rate": 0.00018977934408516122, + "loss": 1.0313, + "step": 8854 + }, + { + "epoch": 0.2273715698877104, + "grad_norm": 0.89453125, + "learning_rate": 0.00018977737788638123, + "loss": 1.0299, + "step": 8855 + }, + { + "epoch": 0.22739724708363224, + "grad_norm": 0.75, + "learning_rate": 0.00018977541150868297, + "loss": 0.8394, + "step": 8856 + }, + { + "epoch": 0.22742292427955405, + "grad_norm": 0.76171875, + "learning_rate": 0.00018977344495207046, + "loss": 0.9794, + "step": 8857 + }, + { + "epoch": 0.22744860147547588, + "grad_norm": 0.72265625, + "learning_rate": 0.00018977147821654756, + "loss": 0.9581, + "step": 8858 + }, + { + "epoch": 0.2274742786713977, + "grad_norm": 0.78125, + "learning_rate": 0.0001897695113021182, + "loss": 1.0183, + "step": 8859 + }, + { + "epoch": 0.2274999558673195, + "grad_norm": 0.82421875, + "learning_rate": 0.00018976754420878633, + "loss": 1.051, + "step": 8860 + }, + { + "epoch": 0.22752563306324133, + "grad_norm": 0.78125, + "learning_rate": 0.00018976557693655587, + "loss": 0.9713, + "step": 8861 + }, + { + "epoch": 0.22755131025916314, + "grad_norm": 0.83203125, + "learning_rate": 0.0001897636094854307, + "loss": 0.9305, + "step": 8862 + }, + { + "epoch": 0.22757698745508498, + "grad_norm": 0.83203125, + "learning_rate": 0.0001897616418554148, + "loss": 1.0544, + "step": 8863 + }, + { + "epoch": 0.2276026646510068, + "grad_norm": 0.98828125, + "learning_rate": 0.000189759674046512, + "loss": 1.1265, + "step": 8864 + }, + { + "epoch": 0.2276283418469286, + "grad_norm": 0.81640625, + "learning_rate": 0.00018975770605872633, + "loss": 1.0623, + "step": 8865 + }, + { + "epoch": 0.22765401904285043, + "grad_norm": 0.79296875, + "learning_rate": 0.00018975573789206164, + "loss": 1.0524, + "step": 8866 + }, + { + "epoch": 0.22767969623877224, + "grad_norm": 0.82421875, + "learning_rate": 0.00018975376954652185, + "loss": 0.9223, + "step": 8867 + }, + { + "epoch": 0.22770537343469407, + "grad_norm": 0.8359375, + "learning_rate": 0.00018975180102211094, + "loss": 1.092, + "step": 8868 + }, + { + "epoch": 0.22773105063061588, + "grad_norm": 0.80078125, + "learning_rate": 0.0001897498323188328, + "loss": 0.9844, + "step": 8869 + }, + { + "epoch": 0.2277567278265377, + "grad_norm": 0.765625, + "learning_rate": 0.00018974786343669132, + "loss": 0.9109, + "step": 8870 + }, + { + "epoch": 0.22778240502245953, + "grad_norm": 0.80078125, + "learning_rate": 0.00018974589437569046, + "loss": 0.9823, + "step": 8871 + }, + { + "epoch": 0.22780808221838134, + "grad_norm": 0.8203125, + "learning_rate": 0.00018974392513583415, + "loss": 1.1015, + "step": 8872 + }, + { + "epoch": 0.22783375941430317, + "grad_norm": 0.80859375, + "learning_rate": 0.0001897419557171263, + "loss": 1.0732, + "step": 8873 + }, + { + "epoch": 0.22785943661022498, + "grad_norm": 0.8046875, + "learning_rate": 0.00018973998611957087, + "loss": 0.9321, + "step": 8874 + }, + { + "epoch": 0.2278851138061468, + "grad_norm": 0.79296875, + "learning_rate": 0.00018973801634317172, + "loss": 0.9741, + "step": 8875 + }, + { + "epoch": 0.22791079100206862, + "grad_norm": 0.77734375, + "learning_rate": 0.0001897360463879328, + "loss": 0.9637, + "step": 8876 + }, + { + "epoch": 0.22793646819799043, + "grad_norm": 0.8203125, + "learning_rate": 0.00018973407625385807, + "loss": 0.9487, + "step": 8877 + }, + { + "epoch": 0.22796214539391227, + "grad_norm": 0.7734375, + "learning_rate": 0.0001897321059409514, + "loss": 0.8689, + "step": 8878 + }, + { + "epoch": 0.22798782258983408, + "grad_norm": 0.8671875, + "learning_rate": 0.00018973013544921677, + "loss": 0.9884, + "step": 8879 + }, + { + "epoch": 0.22801349978575589, + "grad_norm": 0.828125, + "learning_rate": 0.0001897281647786581, + "loss": 1.0183, + "step": 8880 + }, + { + "epoch": 0.22803917698167772, + "grad_norm": 0.73046875, + "learning_rate": 0.00018972619392927927, + "loss": 0.9755, + "step": 8881 + }, + { + "epoch": 0.22806485417759953, + "grad_norm": 0.828125, + "learning_rate": 0.00018972422290108426, + "loss": 1.0097, + "step": 8882 + }, + { + "epoch": 0.22809053137352137, + "grad_norm": 0.86328125, + "learning_rate": 0.000189722251694077, + "loss": 1.1014, + "step": 8883 + }, + { + "epoch": 0.22811620856944317, + "grad_norm": 0.85546875, + "learning_rate": 0.00018972028030826138, + "loss": 1.1564, + "step": 8884 + }, + { + "epoch": 0.22814188576536498, + "grad_norm": 0.82421875, + "learning_rate": 0.00018971830874364135, + "loss": 1.085, + "step": 8885 + }, + { + "epoch": 0.22816756296128682, + "grad_norm": 0.86328125, + "learning_rate": 0.0001897163370002208, + "loss": 1.0596, + "step": 8886 + }, + { + "epoch": 0.22819324015720863, + "grad_norm": 0.83984375, + "learning_rate": 0.00018971436507800372, + "loss": 1.0751, + "step": 8887 + }, + { + "epoch": 0.22821891735313046, + "grad_norm": 0.82421875, + "learning_rate": 0.00018971239297699405, + "loss": 0.8939, + "step": 8888 + }, + { + "epoch": 0.22824459454905227, + "grad_norm": 0.76953125, + "learning_rate": 0.0001897104206971956, + "loss": 0.904, + "step": 8889 + }, + { + "epoch": 0.22827027174497408, + "grad_norm": 0.79296875, + "learning_rate": 0.00018970844823861245, + "loss": 1.0657, + "step": 8890 + }, + { + "epoch": 0.22829594894089592, + "grad_norm": 0.80078125, + "learning_rate": 0.00018970647560124845, + "loss": 1.0232, + "step": 8891 + }, + { + "epoch": 0.22832162613681772, + "grad_norm": 0.828125, + "learning_rate": 0.00018970450278510753, + "loss": 1.0342, + "step": 8892 + }, + { + "epoch": 0.22834730333273956, + "grad_norm": 0.8046875, + "learning_rate": 0.00018970252979019368, + "loss": 1.1209, + "step": 8893 + }, + { + "epoch": 0.22837298052866137, + "grad_norm": 0.80859375, + "learning_rate": 0.00018970055661651073, + "loss": 0.9712, + "step": 8894 + }, + { + "epoch": 0.22839865772458318, + "grad_norm": 0.828125, + "learning_rate": 0.00018969858326406272, + "loss": 1.0546, + "step": 8895 + }, + { + "epoch": 0.228424334920505, + "grad_norm": 0.80859375, + "learning_rate": 0.00018969660973285356, + "loss": 1.1077, + "step": 8896 + }, + { + "epoch": 0.22845001211642682, + "grad_norm": 0.83203125, + "learning_rate": 0.0001896946360228871, + "loss": 0.932, + "step": 8897 + }, + { + "epoch": 0.22847568931234866, + "grad_norm": 0.76171875, + "learning_rate": 0.00018969266213416733, + "loss": 0.9316, + "step": 8898 + }, + { + "epoch": 0.22850136650827046, + "grad_norm": 0.82421875, + "learning_rate": 0.00018969068806669824, + "loss": 1.0022, + "step": 8899 + }, + { + "epoch": 0.22852704370419227, + "grad_norm": 0.85546875, + "learning_rate": 0.0001896887138204837, + "loss": 1.039, + "step": 8900 + }, + { + "epoch": 0.2285527209001141, + "grad_norm": 0.73828125, + "learning_rate": 0.00018968673939552763, + "loss": 0.9729, + "step": 8901 + }, + { + "epoch": 0.22857839809603592, + "grad_norm": 0.8828125, + "learning_rate": 0.000189684764791834, + "loss": 1.0312, + "step": 8902 + }, + { + "epoch": 0.22860407529195775, + "grad_norm": 0.75, + "learning_rate": 0.00018968279000940673, + "loss": 0.918, + "step": 8903 + }, + { + "epoch": 0.22862975248787956, + "grad_norm": 0.796875, + "learning_rate": 0.00018968081504824977, + "loss": 1.1112, + "step": 8904 + }, + { + "epoch": 0.22865542968380137, + "grad_norm": 0.83984375, + "learning_rate": 0.00018967883990836701, + "loss": 0.8819, + "step": 8905 + }, + { + "epoch": 0.2286811068797232, + "grad_norm": 0.8046875, + "learning_rate": 0.00018967686458976247, + "loss": 0.844, + "step": 8906 + }, + { + "epoch": 0.22870678407564501, + "grad_norm": 0.828125, + "learning_rate": 0.00018967488909244002, + "loss": 0.9621, + "step": 8907 + }, + { + "epoch": 0.22873246127156685, + "grad_norm": 0.82421875, + "learning_rate": 0.0001896729134164036, + "loss": 1.0605, + "step": 8908 + }, + { + "epoch": 0.22875813846748866, + "grad_norm": 0.76953125, + "learning_rate": 0.0001896709375616572, + "loss": 1.044, + "step": 8909 + }, + { + "epoch": 0.22878381566341047, + "grad_norm": 0.84765625, + "learning_rate": 0.0001896689615282047, + "loss": 0.9853, + "step": 8910 + }, + { + "epoch": 0.2288094928593323, + "grad_norm": 0.90234375, + "learning_rate": 0.00018966698531605006, + "loss": 1.1068, + "step": 8911 + }, + { + "epoch": 0.2288351700552541, + "grad_norm": 0.7734375, + "learning_rate": 0.00018966500892519722, + "loss": 1.0383, + "step": 8912 + }, + { + "epoch": 0.22886084725117595, + "grad_norm": 0.8359375, + "learning_rate": 0.00018966303235565013, + "loss": 0.9685, + "step": 8913 + }, + { + "epoch": 0.22888652444709776, + "grad_norm": 0.74609375, + "learning_rate": 0.0001896610556074127, + "loss": 0.8843, + "step": 8914 + }, + { + "epoch": 0.22891220164301956, + "grad_norm": 0.80078125, + "learning_rate": 0.0001896590786804889, + "loss": 0.9646, + "step": 8915 + }, + { + "epoch": 0.2289378788389414, + "grad_norm": 0.8203125, + "learning_rate": 0.00018965710157488262, + "loss": 1.0226, + "step": 8916 + }, + { + "epoch": 0.2289635560348632, + "grad_norm": 0.82421875, + "learning_rate": 0.00018965512429059786, + "loss": 1.1429, + "step": 8917 + }, + { + "epoch": 0.22898923323078504, + "grad_norm": 0.7109375, + "learning_rate": 0.00018965314682763856, + "loss": 0.9742, + "step": 8918 + }, + { + "epoch": 0.22901491042670685, + "grad_norm": 0.80078125, + "learning_rate": 0.0001896511691860086, + "loss": 1.0595, + "step": 8919 + }, + { + "epoch": 0.22904058762262866, + "grad_norm": 0.77734375, + "learning_rate": 0.00018964919136571195, + "loss": 1.0131, + "step": 8920 + }, + { + "epoch": 0.2290662648185505, + "grad_norm": 0.7578125, + "learning_rate": 0.0001896472133667526, + "loss": 1.1305, + "step": 8921 + }, + { + "epoch": 0.2290919420144723, + "grad_norm": 0.85546875, + "learning_rate": 0.00018964523518913442, + "loss": 1.0942, + "step": 8922 + }, + { + "epoch": 0.22911761921039414, + "grad_norm": 0.8203125, + "learning_rate": 0.0001896432568328614, + "loss": 1.0395, + "step": 8923 + }, + { + "epoch": 0.22914329640631595, + "grad_norm": 0.85546875, + "learning_rate": 0.00018964127829793745, + "loss": 1.1178, + "step": 8924 + }, + { + "epoch": 0.22916897360223776, + "grad_norm": 0.8046875, + "learning_rate": 0.00018963929958436656, + "loss": 0.9845, + "step": 8925 + }, + { + "epoch": 0.2291946507981596, + "grad_norm": 0.94921875, + "learning_rate": 0.00018963732069215264, + "loss": 1.1037, + "step": 8926 + }, + { + "epoch": 0.2292203279940814, + "grad_norm": 0.8046875, + "learning_rate": 0.00018963534162129962, + "loss": 1.0408, + "step": 8927 + }, + { + "epoch": 0.22924600519000324, + "grad_norm": 0.8515625, + "learning_rate": 0.00018963336237181144, + "loss": 1.2059, + "step": 8928 + }, + { + "epoch": 0.22927168238592505, + "grad_norm": 0.8359375, + "learning_rate": 0.00018963138294369208, + "loss": 1.0435, + "step": 8929 + }, + { + "epoch": 0.22929735958184685, + "grad_norm": 0.90234375, + "learning_rate": 0.00018962940333694546, + "loss": 0.9674, + "step": 8930 + }, + { + "epoch": 0.2293230367777687, + "grad_norm": 0.77734375, + "learning_rate": 0.00018962742355157556, + "loss": 0.9464, + "step": 8931 + }, + { + "epoch": 0.2293487139736905, + "grad_norm": 0.7734375, + "learning_rate": 0.0001896254435875863, + "loss": 0.938, + "step": 8932 + }, + { + "epoch": 0.22937439116961233, + "grad_norm": 0.78125, + "learning_rate": 0.00018962346344498162, + "loss": 0.912, + "step": 8933 + }, + { + "epoch": 0.22940006836553414, + "grad_norm": 0.8046875, + "learning_rate": 0.00018962148312376547, + "loss": 0.952, + "step": 8934 + }, + { + "epoch": 0.22942574556145595, + "grad_norm": 0.84765625, + "learning_rate": 0.00018961950262394182, + "loss": 0.9873, + "step": 8935 + }, + { + "epoch": 0.2294514227573778, + "grad_norm": 0.734375, + "learning_rate": 0.00018961752194551456, + "loss": 0.8639, + "step": 8936 + }, + { + "epoch": 0.2294770999532996, + "grad_norm": 0.859375, + "learning_rate": 0.0001896155410884877, + "loss": 1.0134, + "step": 8937 + }, + { + "epoch": 0.22950277714922143, + "grad_norm": 0.85546875, + "learning_rate": 0.00018961356005286516, + "loss": 1.0244, + "step": 8938 + }, + { + "epoch": 0.22952845434514324, + "grad_norm": 0.80859375, + "learning_rate": 0.00018961157883865088, + "loss": 1.0612, + "step": 8939 + }, + { + "epoch": 0.22955413154106505, + "grad_norm": 0.8359375, + "learning_rate": 0.0001896095974458488, + "loss": 1.0665, + "step": 8940 + }, + { + "epoch": 0.22957980873698688, + "grad_norm": 0.8125, + "learning_rate": 0.00018960761587446295, + "loss": 1.0216, + "step": 8941 + }, + { + "epoch": 0.2296054859329087, + "grad_norm": 0.8125, + "learning_rate": 0.00018960563412449716, + "loss": 1.0076, + "step": 8942 + }, + { + "epoch": 0.22963116312883053, + "grad_norm": 0.828125, + "learning_rate": 0.00018960365219595544, + "loss": 0.8687, + "step": 8943 + }, + { + "epoch": 0.22965684032475234, + "grad_norm": 0.7734375, + "learning_rate": 0.00018960167008884174, + "loss": 1.0093, + "step": 8944 + }, + { + "epoch": 0.22968251752067415, + "grad_norm": 0.78125, + "learning_rate": 0.00018959968780316003, + "loss": 0.9663, + "step": 8945 + }, + { + "epoch": 0.22970819471659598, + "grad_norm": 0.8046875, + "learning_rate": 0.00018959770533891418, + "loss": 1.0005, + "step": 8946 + }, + { + "epoch": 0.2297338719125178, + "grad_norm": 0.89453125, + "learning_rate": 0.00018959572269610826, + "loss": 1.0422, + "step": 8947 + }, + { + "epoch": 0.22975954910843963, + "grad_norm": 0.7734375, + "learning_rate": 0.00018959373987474613, + "loss": 0.9981, + "step": 8948 + }, + { + "epoch": 0.22978522630436143, + "grad_norm": 0.76171875, + "learning_rate": 0.00018959175687483176, + "loss": 1.0827, + "step": 8949 + }, + { + "epoch": 0.22981090350028324, + "grad_norm": 0.7734375, + "learning_rate": 0.00018958977369636911, + "loss": 0.9647, + "step": 8950 + }, + { + "epoch": 0.22983658069620508, + "grad_norm": 0.80078125, + "learning_rate": 0.00018958779033936214, + "loss": 1.0662, + "step": 8951 + }, + { + "epoch": 0.2298622578921269, + "grad_norm": 0.76953125, + "learning_rate": 0.00018958580680381482, + "loss": 0.9272, + "step": 8952 + }, + { + "epoch": 0.22988793508804872, + "grad_norm": 0.80859375, + "learning_rate": 0.00018958382308973106, + "loss": 1.0084, + "step": 8953 + }, + { + "epoch": 0.22991361228397053, + "grad_norm": 0.80859375, + "learning_rate": 0.00018958183919711482, + "loss": 1.06, + "step": 8954 + }, + { + "epoch": 0.22993928947989234, + "grad_norm": 0.81640625, + "learning_rate": 0.00018957985512597008, + "loss": 1.138, + "step": 8955 + }, + { + "epoch": 0.22996496667581418, + "grad_norm": 0.796875, + "learning_rate": 0.00018957787087630078, + "loss": 1.0641, + "step": 8956 + }, + { + "epoch": 0.22999064387173598, + "grad_norm": 0.79296875, + "learning_rate": 0.0001895758864481109, + "loss": 0.8776, + "step": 8957 + }, + { + "epoch": 0.23001632106765782, + "grad_norm": 0.8125, + "learning_rate": 0.0001895739018414043, + "loss": 1.0399, + "step": 8958 + }, + { + "epoch": 0.23004199826357963, + "grad_norm": 0.796875, + "learning_rate": 0.00018957191705618506, + "loss": 0.9733, + "step": 8959 + }, + { + "epoch": 0.23006767545950144, + "grad_norm": 0.76953125, + "learning_rate": 0.0001895699320924571, + "loss": 1.0985, + "step": 8960 + }, + { + "epoch": 0.23009335265542327, + "grad_norm": 0.85546875, + "learning_rate": 0.0001895679469502243, + "loss": 0.9247, + "step": 8961 + }, + { + "epoch": 0.23011902985134508, + "grad_norm": 0.796875, + "learning_rate": 0.0001895659616294907, + "loss": 0.988, + "step": 8962 + }, + { + "epoch": 0.23014470704726692, + "grad_norm": 0.8046875, + "learning_rate": 0.00018956397613026021, + "loss": 1.1132, + "step": 8963 + }, + { + "epoch": 0.23017038424318872, + "grad_norm": 0.81640625, + "learning_rate": 0.00018956199045253684, + "loss": 1.1461, + "step": 8964 + }, + { + "epoch": 0.23019606143911053, + "grad_norm": 0.7578125, + "learning_rate": 0.00018956000459632448, + "loss": 1.0227, + "step": 8965 + }, + { + "epoch": 0.23022173863503237, + "grad_norm": 0.796875, + "learning_rate": 0.00018955801856162716, + "loss": 1.0508, + "step": 8966 + }, + { + "epoch": 0.23024741583095418, + "grad_norm": 0.84765625, + "learning_rate": 0.00018955603234844877, + "loss": 1.0149, + "step": 8967 + }, + { + "epoch": 0.230273093026876, + "grad_norm": 0.7890625, + "learning_rate": 0.00018955404595679328, + "loss": 0.9552, + "step": 8968 + }, + { + "epoch": 0.23029877022279782, + "grad_norm": 0.80078125, + "learning_rate": 0.0001895520593866647, + "loss": 0.9952, + "step": 8969 + }, + { + "epoch": 0.23032444741871963, + "grad_norm": 0.78515625, + "learning_rate": 0.00018955007263806695, + "loss": 0.875, + "step": 8970 + }, + { + "epoch": 0.23035012461464147, + "grad_norm": 0.8359375, + "learning_rate": 0.00018954808571100396, + "loss": 1.0563, + "step": 8971 + }, + { + "epoch": 0.23037580181056327, + "grad_norm": 0.7890625, + "learning_rate": 0.00018954609860547978, + "loss": 0.9924, + "step": 8972 + }, + { + "epoch": 0.2304014790064851, + "grad_norm": 0.77734375, + "learning_rate": 0.00018954411132149828, + "loss": 0.9592, + "step": 8973 + }, + { + "epoch": 0.23042715620240692, + "grad_norm": 0.80859375, + "learning_rate": 0.00018954212385906348, + "loss": 1.0597, + "step": 8974 + }, + { + "epoch": 0.23045283339832873, + "grad_norm": 0.81640625, + "learning_rate": 0.00018954013621817926, + "loss": 1.0313, + "step": 8975 + }, + { + "epoch": 0.23047851059425056, + "grad_norm": 0.7734375, + "learning_rate": 0.0001895381483988497, + "loss": 0.8651, + "step": 8976 + }, + { + "epoch": 0.23050418779017237, + "grad_norm": 0.82421875, + "learning_rate": 0.00018953616040107867, + "loss": 1.009, + "step": 8977 + }, + { + "epoch": 0.2305298649860942, + "grad_norm": 0.7421875, + "learning_rate": 0.00018953417222487015, + "loss": 0.8906, + "step": 8978 + }, + { + "epoch": 0.23055554218201602, + "grad_norm": 0.796875, + "learning_rate": 0.0001895321838702281, + "loss": 1.0251, + "step": 8979 + }, + { + "epoch": 0.23058121937793782, + "grad_norm": 0.8125, + "learning_rate": 0.00018953019533715653, + "loss": 0.9143, + "step": 8980 + }, + { + "epoch": 0.23060689657385966, + "grad_norm": 0.80859375, + "learning_rate": 0.00018952820662565935, + "loss": 1.1185, + "step": 8981 + }, + { + "epoch": 0.23063257376978147, + "grad_norm": 0.7109375, + "learning_rate": 0.00018952621773574054, + "loss": 0.9437, + "step": 8982 + }, + { + "epoch": 0.2306582509657033, + "grad_norm": 0.8984375, + "learning_rate": 0.00018952422866740407, + "loss": 0.8499, + "step": 8983 + }, + { + "epoch": 0.2306839281616251, + "grad_norm": 0.765625, + "learning_rate": 0.0001895222394206539, + "loss": 0.9095, + "step": 8984 + }, + { + "epoch": 0.23070960535754692, + "grad_norm": 0.796875, + "learning_rate": 0.000189520249995494, + "loss": 0.9245, + "step": 8985 + }, + { + "epoch": 0.23073528255346876, + "grad_norm": 0.79296875, + "learning_rate": 0.00018951826039192833, + "loss": 1.0767, + "step": 8986 + }, + { + "epoch": 0.23076095974939057, + "grad_norm": 0.890625, + "learning_rate": 0.00018951627060996085, + "loss": 1.0851, + "step": 8987 + }, + { + "epoch": 0.23078663694531237, + "grad_norm": 0.81640625, + "learning_rate": 0.00018951428064959552, + "loss": 1.0263, + "step": 8988 + }, + { + "epoch": 0.2308123141412342, + "grad_norm": 0.8203125, + "learning_rate": 0.00018951229051083633, + "loss": 1.0942, + "step": 8989 + }, + { + "epoch": 0.23083799133715602, + "grad_norm": 0.84765625, + "learning_rate": 0.00018951030019368718, + "loss": 1.1074, + "step": 8990 + }, + { + "epoch": 0.23086366853307785, + "grad_norm": 0.8046875, + "learning_rate": 0.00018950830969815218, + "loss": 1.1021, + "step": 8991 + }, + { + "epoch": 0.23088934572899966, + "grad_norm": 0.91796875, + "learning_rate": 0.00018950631902423513, + "loss": 0.9994, + "step": 8992 + }, + { + "epoch": 0.23091502292492147, + "grad_norm": 0.81640625, + "learning_rate": 0.00018950432817194007, + "loss": 0.9022, + "step": 8993 + }, + { + "epoch": 0.2309407001208433, + "grad_norm": 0.8671875, + "learning_rate": 0.00018950233714127099, + "loss": 0.9801, + "step": 8994 + }, + { + "epoch": 0.23096637731676511, + "grad_norm": 0.76171875, + "learning_rate": 0.00018950034593223186, + "loss": 0.9094, + "step": 8995 + }, + { + "epoch": 0.23099205451268695, + "grad_norm": 0.8125, + "learning_rate": 0.0001894983545448266, + "loss": 1.0292, + "step": 8996 + }, + { + "epoch": 0.23101773170860876, + "grad_norm": 0.83984375, + "learning_rate": 0.0001894963629790592, + "loss": 1.0691, + "step": 8997 + }, + { + "epoch": 0.23104340890453057, + "grad_norm": 0.78125, + "learning_rate": 0.00018949437123493362, + "loss": 1.1004, + "step": 8998 + }, + { + "epoch": 0.2310690861004524, + "grad_norm": 0.76953125, + "learning_rate": 0.00018949237931245388, + "loss": 0.9116, + "step": 8999 + }, + { + "epoch": 0.2310947632963742, + "grad_norm": 0.81640625, + "learning_rate": 0.00018949038721162385, + "loss": 1.037, + "step": 9000 + }, + { + "epoch": 0.2310947632963742, + "eval_loss": 1.0149517059326172, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 405.9124, + "eval_samples_per_second": 24.636, + "eval_steps_per_second": 0.771, + "step": 9000 + }, + { + "epoch": 0.23112044049229605, + "grad_norm": 0.8359375, + "learning_rate": 0.00018948839493244764, + "loss": 1.0099, + "step": 9001 + }, + { + "epoch": 0.23114611768821786, + "grad_norm": 0.78515625, + "learning_rate": 0.0001894864024749291, + "loss": 1.1245, + "step": 9002 + }, + { + "epoch": 0.23117179488413966, + "grad_norm": 0.85546875, + "learning_rate": 0.00018948440983907226, + "loss": 0.9419, + "step": 9003 + }, + { + "epoch": 0.2311974720800615, + "grad_norm": 0.84375, + "learning_rate": 0.00018948241702488106, + "loss": 1.0653, + "step": 9004 + }, + { + "epoch": 0.2312231492759833, + "grad_norm": 0.78515625, + "learning_rate": 0.00018948042403235947, + "loss": 0.9153, + "step": 9005 + }, + { + "epoch": 0.23124882647190514, + "grad_norm": 0.859375, + "learning_rate": 0.0001894784308615115, + "loss": 1.1629, + "step": 9006 + }, + { + "epoch": 0.23127450366782695, + "grad_norm": 0.7421875, + "learning_rate": 0.0001894764375123411, + "loss": 0.9075, + "step": 9007 + }, + { + "epoch": 0.23130018086374876, + "grad_norm": 0.79296875, + "learning_rate": 0.00018947444398485224, + "loss": 0.9354, + "step": 9008 + }, + { + "epoch": 0.2313258580596706, + "grad_norm": 0.8125, + "learning_rate": 0.0001894724502790489, + "loss": 1.0043, + "step": 9009 + }, + { + "epoch": 0.2313515352555924, + "grad_norm": 0.796875, + "learning_rate": 0.00018947045639493504, + "loss": 1.0562, + "step": 9010 + }, + { + "epoch": 0.23137721245151424, + "grad_norm": 0.7578125, + "learning_rate": 0.00018946846233251465, + "loss": 0.8771, + "step": 9011 + }, + { + "epoch": 0.23140288964743605, + "grad_norm": 0.8203125, + "learning_rate": 0.0001894664680917917, + "loss": 0.9416, + "step": 9012 + }, + { + "epoch": 0.23142856684335786, + "grad_norm": 1.1171875, + "learning_rate": 0.00018946447367277017, + "loss": 0.9902, + "step": 9013 + }, + { + "epoch": 0.2314542440392797, + "grad_norm": 0.828125, + "learning_rate": 0.000189462479075454, + "loss": 0.9375, + "step": 9014 + }, + { + "epoch": 0.2314799212352015, + "grad_norm": 0.8359375, + "learning_rate": 0.00018946048429984718, + "loss": 1.1646, + "step": 9015 + }, + { + "epoch": 0.23150559843112334, + "grad_norm": 0.8125, + "learning_rate": 0.00018945848934595372, + "loss": 1.0698, + "step": 9016 + }, + { + "epoch": 0.23153127562704515, + "grad_norm": 0.75390625, + "learning_rate": 0.00018945649421377755, + "loss": 1.0787, + "step": 9017 + }, + { + "epoch": 0.23155695282296695, + "grad_norm": 0.8359375, + "learning_rate": 0.00018945449890332269, + "loss": 1.0214, + "step": 9018 + }, + { + "epoch": 0.2315826300188888, + "grad_norm": 0.78125, + "learning_rate": 0.00018945250341459308, + "loss": 0.9217, + "step": 9019 + }, + { + "epoch": 0.2316083072148106, + "grad_norm": 0.7890625, + "learning_rate": 0.00018945050774759272, + "loss": 1.1315, + "step": 9020 + }, + { + "epoch": 0.23163398441073244, + "grad_norm": 0.8046875, + "learning_rate": 0.0001894485119023256, + "loss": 1.0839, + "step": 9021 + }, + { + "epoch": 0.23165966160665424, + "grad_norm": 0.80859375, + "learning_rate": 0.00018944651587879564, + "loss": 0.8271, + "step": 9022 + }, + { + "epoch": 0.23168533880257605, + "grad_norm": 0.80078125, + "learning_rate": 0.00018944451967700684, + "loss": 1.0247, + "step": 9023 + }, + { + "epoch": 0.2317110159984979, + "grad_norm": 0.765625, + "learning_rate": 0.00018944252329696325, + "loss": 0.9297, + "step": 9024 + }, + { + "epoch": 0.2317366931944197, + "grad_norm": 0.85546875, + "learning_rate": 0.00018944052673866874, + "loss": 1.098, + "step": 9025 + }, + { + "epoch": 0.23176237039034153, + "grad_norm": 0.85546875, + "learning_rate": 0.00018943853000212733, + "loss": 1.1096, + "step": 9026 + }, + { + "epoch": 0.23178804758626334, + "grad_norm": 0.78125, + "learning_rate": 0.00018943653308734306, + "loss": 0.9113, + "step": 9027 + }, + { + "epoch": 0.23181372478218515, + "grad_norm": 0.8984375, + "learning_rate": 0.00018943453599431983, + "loss": 1.0384, + "step": 9028 + }, + { + "epoch": 0.23183940197810698, + "grad_norm": 0.80859375, + "learning_rate": 0.00018943253872306167, + "loss": 1.0789, + "step": 9029 + }, + { + "epoch": 0.2318650791740288, + "grad_norm": 0.8359375, + "learning_rate": 0.0001894305412735725, + "loss": 0.9516, + "step": 9030 + }, + { + "epoch": 0.23189075636995063, + "grad_norm": 0.7109375, + "learning_rate": 0.00018942854364585636, + "loss": 0.946, + "step": 9031 + }, + { + "epoch": 0.23191643356587244, + "grad_norm": 0.7734375, + "learning_rate": 0.00018942654583991724, + "loss": 1.0569, + "step": 9032 + }, + { + "epoch": 0.23194211076179425, + "grad_norm": 0.78125, + "learning_rate": 0.00018942454785575902, + "loss": 1.015, + "step": 9033 + }, + { + "epoch": 0.23196778795771608, + "grad_norm": 0.80859375, + "learning_rate": 0.00018942254969338582, + "loss": 1.0354, + "step": 9034 + }, + { + "epoch": 0.2319934651536379, + "grad_norm": 0.85546875, + "learning_rate": 0.00018942055135280153, + "loss": 0.9575, + "step": 9035 + }, + { + "epoch": 0.23201914234955973, + "grad_norm": 0.83203125, + "learning_rate": 0.0001894185528340102, + "loss": 1.0972, + "step": 9036 + }, + { + "epoch": 0.23204481954548153, + "grad_norm": 0.83984375, + "learning_rate": 0.00018941655413701568, + "loss": 1.1318, + "step": 9037 + }, + { + "epoch": 0.23207049674140334, + "grad_norm": 0.890625, + "learning_rate": 0.0001894145552618221, + "loss": 1.1794, + "step": 9038 + }, + { + "epoch": 0.23209617393732518, + "grad_norm": 0.83984375, + "learning_rate": 0.0001894125562084334, + "loss": 0.9542, + "step": 9039 + }, + { + "epoch": 0.232121851133247, + "grad_norm": 0.78515625, + "learning_rate": 0.00018941055697685352, + "loss": 1.0381, + "step": 9040 + }, + { + "epoch": 0.23214752832916882, + "grad_norm": 0.79296875, + "learning_rate": 0.0001894085575670865, + "loss": 1.0017, + "step": 9041 + }, + { + "epoch": 0.23217320552509063, + "grad_norm": 0.78125, + "learning_rate": 0.0001894065579791363, + "loss": 1.0282, + "step": 9042 + }, + { + "epoch": 0.23219888272101244, + "grad_norm": 0.76171875, + "learning_rate": 0.00018940455821300688, + "loss": 1.0466, + "step": 9043 + }, + { + "epoch": 0.23222455991693428, + "grad_norm": 0.83984375, + "learning_rate": 0.0001894025582687023, + "loss": 1.1177, + "step": 9044 + }, + { + "epoch": 0.23225023711285608, + "grad_norm": 0.83984375, + "learning_rate": 0.00018940055814622647, + "loss": 1.1699, + "step": 9045 + }, + { + "epoch": 0.23227591430877792, + "grad_norm": 0.78125, + "learning_rate": 0.00018939855784558338, + "loss": 0.8717, + "step": 9046 + }, + { + "epoch": 0.23230159150469973, + "grad_norm": 0.80859375, + "learning_rate": 0.0001893965573667771, + "loss": 1.1165, + "step": 9047 + }, + { + "epoch": 0.23232726870062154, + "grad_norm": 0.765625, + "learning_rate": 0.00018939455670981148, + "loss": 0.9852, + "step": 9048 + }, + { + "epoch": 0.23235294589654337, + "grad_norm": 0.77734375, + "learning_rate": 0.00018939255587469066, + "loss": 0.9693, + "step": 9049 + }, + { + "epoch": 0.23237862309246518, + "grad_norm": 0.80859375, + "learning_rate": 0.0001893905548614185, + "loss": 1.2353, + "step": 9050 + }, + { + "epoch": 0.23240430028838702, + "grad_norm": 0.78515625, + "learning_rate": 0.00018938855366999903, + "loss": 0.9246, + "step": 9051 + }, + { + "epoch": 0.23242997748430883, + "grad_norm": 0.93359375, + "learning_rate": 0.00018938655230043628, + "loss": 1.179, + "step": 9052 + }, + { + "epoch": 0.23245565468023063, + "grad_norm": 0.82421875, + "learning_rate": 0.00018938455075273418, + "loss": 1.0962, + "step": 9053 + }, + { + "epoch": 0.23248133187615247, + "grad_norm": 0.8359375, + "learning_rate": 0.0001893825490268968, + "loss": 0.9298, + "step": 9054 + }, + { + "epoch": 0.23250700907207428, + "grad_norm": 0.73046875, + "learning_rate": 0.000189380547122928, + "loss": 0.8799, + "step": 9055 + }, + { + "epoch": 0.2325326862679961, + "grad_norm": 0.83203125, + "learning_rate": 0.00018937854504083186, + "loss": 1.0053, + "step": 9056 + }, + { + "epoch": 0.23255836346391792, + "grad_norm": 0.78515625, + "learning_rate": 0.00018937654278061236, + "loss": 1.0048, + "step": 9057 + }, + { + "epoch": 0.23258404065983973, + "grad_norm": 0.8125, + "learning_rate": 0.00018937454034227352, + "loss": 0.9679, + "step": 9058 + }, + { + "epoch": 0.23260971785576157, + "grad_norm": 0.8125, + "learning_rate": 0.00018937253772581926, + "loss": 0.9433, + "step": 9059 + }, + { + "epoch": 0.23263539505168337, + "grad_norm": 1.171875, + "learning_rate": 0.0001893705349312536, + "loss": 1.0325, + "step": 9060 + }, + { + "epoch": 0.2326610722476052, + "grad_norm": 0.765625, + "learning_rate": 0.00018936853195858055, + "loss": 0.9886, + "step": 9061 + }, + { + "epoch": 0.23268674944352702, + "grad_norm": 0.9453125, + "learning_rate": 0.0001893665288078041, + "loss": 1.0582, + "step": 9062 + }, + { + "epoch": 0.23271242663944883, + "grad_norm": 0.81640625, + "learning_rate": 0.0001893645254789282, + "loss": 1.0669, + "step": 9063 + }, + { + "epoch": 0.23273810383537066, + "grad_norm": 0.83984375, + "learning_rate": 0.0001893625219719569, + "loss": 1.2453, + "step": 9064 + }, + { + "epoch": 0.23276378103129247, + "grad_norm": 0.74609375, + "learning_rate": 0.00018936051828689413, + "loss": 0.8969, + "step": 9065 + }, + { + "epoch": 0.2327894582272143, + "grad_norm": 0.82421875, + "learning_rate": 0.00018935851442374398, + "loss": 1.0689, + "step": 9066 + }, + { + "epoch": 0.23281513542313612, + "grad_norm": 0.87890625, + "learning_rate": 0.00018935651038251035, + "loss": 1.0654, + "step": 9067 + }, + { + "epoch": 0.23284081261905792, + "grad_norm": 0.82421875, + "learning_rate": 0.00018935450616319724, + "loss": 0.9979, + "step": 9068 + }, + { + "epoch": 0.23286648981497976, + "grad_norm": 0.83984375, + "learning_rate": 0.0001893525017658087, + "loss": 1.1512, + "step": 9069 + }, + { + "epoch": 0.23289216701090157, + "grad_norm": 0.81640625, + "learning_rate": 0.0001893504971903487, + "loss": 1.0596, + "step": 9070 + }, + { + "epoch": 0.2329178442068234, + "grad_norm": 0.8359375, + "learning_rate": 0.00018934849243682124, + "loss": 0.983, + "step": 9071 + }, + { + "epoch": 0.2329435214027452, + "grad_norm": 0.84765625, + "learning_rate": 0.00018934648750523026, + "loss": 0.927, + "step": 9072 + }, + { + "epoch": 0.23296919859866702, + "grad_norm": 0.80859375, + "learning_rate": 0.00018934448239557982, + "loss": 1.0241, + "step": 9073 + }, + { + "epoch": 0.23299487579458886, + "grad_norm": 0.828125, + "learning_rate": 0.0001893424771078739, + "loss": 1.0281, + "step": 9074 + }, + { + "epoch": 0.23302055299051067, + "grad_norm": 0.77734375, + "learning_rate": 0.0001893404716421165, + "loss": 1.0068, + "step": 9075 + }, + { + "epoch": 0.2330462301864325, + "grad_norm": 0.71875, + "learning_rate": 0.0001893384659983116, + "loss": 1.0406, + "step": 9076 + }, + { + "epoch": 0.2330719073823543, + "grad_norm": 0.78515625, + "learning_rate": 0.00018933646017646322, + "loss": 1.1493, + "step": 9077 + }, + { + "epoch": 0.23309758457827612, + "grad_norm": 0.77734375, + "learning_rate": 0.00018933445417657535, + "loss": 1.0388, + "step": 9078 + }, + { + "epoch": 0.23312326177419795, + "grad_norm": 0.77734375, + "learning_rate": 0.00018933244799865194, + "loss": 1.1088, + "step": 9079 + }, + { + "epoch": 0.23314893897011976, + "grad_norm": 0.76953125, + "learning_rate": 0.00018933044164269708, + "loss": 0.8641, + "step": 9080 + }, + { + "epoch": 0.2331746161660416, + "grad_norm": 0.86328125, + "learning_rate": 0.00018932843510871468, + "loss": 0.9911, + "step": 9081 + }, + { + "epoch": 0.2332002933619634, + "grad_norm": 0.9765625, + "learning_rate": 0.0001893264283967088, + "loss": 1.061, + "step": 9082 + }, + { + "epoch": 0.23322597055788521, + "grad_norm": 0.8203125, + "learning_rate": 0.00018932442150668344, + "loss": 1.009, + "step": 9083 + }, + { + "epoch": 0.23325164775380705, + "grad_norm": 0.82421875, + "learning_rate": 0.00018932241443864257, + "loss": 1.0573, + "step": 9084 + }, + { + "epoch": 0.23327732494972886, + "grad_norm": 0.83984375, + "learning_rate": 0.00018932040719259015, + "loss": 0.979, + "step": 9085 + }, + { + "epoch": 0.2333030021456507, + "grad_norm": 0.765625, + "learning_rate": 0.00018931839976853024, + "loss": 1.0322, + "step": 9086 + }, + { + "epoch": 0.2333286793415725, + "grad_norm": 0.7734375, + "learning_rate": 0.00018931639216646686, + "loss": 0.8317, + "step": 9087 + }, + { + "epoch": 0.2333543565374943, + "grad_norm": 0.80078125, + "learning_rate": 0.00018931438438640394, + "loss": 0.9786, + "step": 9088 + }, + { + "epoch": 0.23338003373341615, + "grad_norm": 0.8203125, + "learning_rate": 0.00018931237642834555, + "loss": 1.0388, + "step": 9089 + }, + { + "epoch": 0.23340571092933796, + "grad_norm": 0.78515625, + "learning_rate": 0.00018931036829229566, + "loss": 0.9991, + "step": 9090 + }, + { + "epoch": 0.2334313881252598, + "grad_norm": 0.83984375, + "learning_rate": 0.00018930835997825826, + "loss": 1.0794, + "step": 9091 + }, + { + "epoch": 0.2334570653211816, + "grad_norm": 0.81640625, + "learning_rate": 0.0001893063514862374, + "loss": 1.0507, + "step": 9092 + }, + { + "epoch": 0.2334827425171034, + "grad_norm": 0.90234375, + "learning_rate": 0.000189304342816237, + "loss": 1.0355, + "step": 9093 + }, + { + "epoch": 0.23350841971302524, + "grad_norm": 0.80078125, + "learning_rate": 0.00018930233396826114, + "loss": 1.0072, + "step": 9094 + }, + { + "epoch": 0.23353409690894705, + "grad_norm": 0.91796875, + "learning_rate": 0.00018930032494231375, + "loss": 1.0465, + "step": 9095 + }, + { + "epoch": 0.2335597741048689, + "grad_norm": 0.7578125, + "learning_rate": 0.00018929831573839893, + "loss": 1.0014, + "step": 9096 + }, + { + "epoch": 0.2335854513007907, + "grad_norm": 0.8984375, + "learning_rate": 0.0001892963063565206, + "loss": 1.1314, + "step": 9097 + }, + { + "epoch": 0.2336111284967125, + "grad_norm": 0.75390625, + "learning_rate": 0.00018929429679668283, + "loss": 0.9466, + "step": 9098 + }, + { + "epoch": 0.23363680569263434, + "grad_norm": 0.79296875, + "learning_rate": 0.00018929228705888957, + "loss": 1.0137, + "step": 9099 + }, + { + "epoch": 0.23366248288855615, + "grad_norm": 0.8203125, + "learning_rate": 0.00018929027714314486, + "loss": 1.1218, + "step": 9100 + }, + { + "epoch": 0.23368816008447799, + "grad_norm": 0.7265625, + "learning_rate": 0.0001892882670494527, + "loss": 1.0059, + "step": 9101 + }, + { + "epoch": 0.2337138372803998, + "grad_norm": 0.8125, + "learning_rate": 0.00018928625677781707, + "loss": 1.1151, + "step": 9102 + }, + { + "epoch": 0.2337395144763216, + "grad_norm": 0.79296875, + "learning_rate": 0.000189284246328242, + "loss": 1.0842, + "step": 9103 + }, + { + "epoch": 0.23376519167224344, + "grad_norm": 0.796875, + "learning_rate": 0.00018928223570073148, + "loss": 0.9083, + "step": 9104 + }, + { + "epoch": 0.23379086886816525, + "grad_norm": 0.8046875, + "learning_rate": 0.00018928022489528957, + "loss": 1.0245, + "step": 9105 + }, + { + "epoch": 0.23381654606408708, + "grad_norm": 0.90625, + "learning_rate": 0.00018927821391192019, + "loss": 1.0765, + "step": 9106 + }, + { + "epoch": 0.2338422232600089, + "grad_norm": 0.8515625, + "learning_rate": 0.00018927620275062743, + "loss": 1.0262, + "step": 9107 + }, + { + "epoch": 0.2338679004559307, + "grad_norm": 0.796875, + "learning_rate": 0.00018927419141141525, + "loss": 1.0021, + "step": 9108 + }, + { + "epoch": 0.23389357765185254, + "grad_norm": 0.8046875, + "learning_rate": 0.00018927217989428766, + "loss": 1.0392, + "step": 9109 + }, + { + "epoch": 0.23391925484777434, + "grad_norm": 0.86328125, + "learning_rate": 0.00018927016819924867, + "loss": 1.2482, + "step": 9110 + }, + { + "epoch": 0.23394493204369618, + "grad_norm": 0.74609375, + "learning_rate": 0.00018926815632630233, + "loss": 0.9335, + "step": 9111 + }, + { + "epoch": 0.233970609239618, + "grad_norm": 0.8515625, + "learning_rate": 0.0001892661442754526, + "loss": 1.0035, + "step": 9112 + }, + { + "epoch": 0.2339962864355398, + "grad_norm": 0.82421875, + "learning_rate": 0.0001892641320467035, + "loss": 1.0231, + "step": 9113 + }, + { + "epoch": 0.23402196363146163, + "grad_norm": 0.80859375, + "learning_rate": 0.00018926211964005905, + "loss": 0.9158, + "step": 9114 + }, + { + "epoch": 0.23404764082738344, + "grad_norm": 0.76953125, + "learning_rate": 0.0001892601070555233, + "loss": 0.9684, + "step": 9115 + }, + { + "epoch": 0.23407331802330528, + "grad_norm": 0.7890625, + "learning_rate": 0.00018925809429310017, + "loss": 1.0638, + "step": 9116 + }, + { + "epoch": 0.23409899521922709, + "grad_norm": 0.80859375, + "learning_rate": 0.00018925608135279376, + "loss": 1.1478, + "step": 9117 + }, + { + "epoch": 0.2341246724151489, + "grad_norm": 0.76953125, + "learning_rate": 0.00018925406823460801, + "loss": 0.951, + "step": 9118 + }, + { + "epoch": 0.23415034961107073, + "grad_norm": 0.75390625, + "learning_rate": 0.00018925205493854698, + "loss": 1.1159, + "step": 9119 + }, + { + "epoch": 0.23417602680699254, + "grad_norm": 0.83203125, + "learning_rate": 0.00018925004146461464, + "loss": 1.1091, + "step": 9120 + }, + { + "epoch": 0.23420170400291437, + "grad_norm": 0.84375, + "learning_rate": 0.00018924802781281508, + "loss": 1.0106, + "step": 9121 + }, + { + "epoch": 0.23422738119883618, + "grad_norm": 0.85546875, + "learning_rate": 0.00018924601398315224, + "loss": 1.0234, + "step": 9122 + }, + { + "epoch": 0.234253058394758, + "grad_norm": 0.828125, + "learning_rate": 0.00018924399997563015, + "loss": 1.1076, + "step": 9123 + }, + { + "epoch": 0.23427873559067983, + "grad_norm": 0.8125, + "learning_rate": 0.0001892419857902528, + "loss": 0.946, + "step": 9124 + }, + { + "epoch": 0.23430441278660163, + "grad_norm": 0.73828125, + "learning_rate": 0.00018923997142702425, + "loss": 0.9599, + "step": 9125 + }, + { + "epoch": 0.23433008998252347, + "grad_norm": 0.82421875, + "learning_rate": 0.00018923795688594852, + "loss": 1.0196, + "step": 9126 + }, + { + "epoch": 0.23435576717844528, + "grad_norm": 0.75, + "learning_rate": 0.0001892359421670296, + "loss": 0.9724, + "step": 9127 + }, + { + "epoch": 0.2343814443743671, + "grad_norm": 0.7421875, + "learning_rate": 0.0001892339272702715, + "loss": 0.9547, + "step": 9128 + }, + { + "epoch": 0.23440712157028892, + "grad_norm": 0.78515625, + "learning_rate": 0.00018923191219567825, + "loss": 1.2649, + "step": 9129 + }, + { + "epoch": 0.23443279876621073, + "grad_norm": 0.7734375, + "learning_rate": 0.00018922989694325384, + "loss": 1.0187, + "step": 9130 + }, + { + "epoch": 0.23445847596213257, + "grad_norm": 0.921875, + "learning_rate": 0.00018922788151300233, + "loss": 1.0711, + "step": 9131 + }, + { + "epoch": 0.23448415315805438, + "grad_norm": 0.80078125, + "learning_rate": 0.00018922586590492768, + "loss": 1.0461, + "step": 9132 + }, + { + "epoch": 0.23450983035397618, + "grad_norm": 0.76953125, + "learning_rate": 0.00018922385011903395, + "loss": 1.0007, + "step": 9133 + }, + { + "epoch": 0.23453550754989802, + "grad_norm": 0.8203125, + "learning_rate": 0.00018922183415532519, + "loss": 1.1344, + "step": 9134 + }, + { + "epoch": 0.23456118474581983, + "grad_norm": 0.796875, + "learning_rate": 0.0001892198180138053, + "loss": 1.0929, + "step": 9135 + }, + { + "epoch": 0.23458686194174166, + "grad_norm": 0.75390625, + "learning_rate": 0.00018921780169447842, + "loss": 0.874, + "step": 9136 + }, + { + "epoch": 0.23461253913766347, + "grad_norm": 0.79296875, + "learning_rate": 0.00018921578519734853, + "loss": 1.0386, + "step": 9137 + }, + { + "epoch": 0.23463821633358528, + "grad_norm": 0.8046875, + "learning_rate": 0.00018921376852241962, + "loss": 1.0118, + "step": 9138 + }, + { + "epoch": 0.23466389352950712, + "grad_norm": 0.859375, + "learning_rate": 0.00018921175166969573, + "loss": 0.9452, + "step": 9139 + }, + { + "epoch": 0.23468957072542893, + "grad_norm": 0.765625, + "learning_rate": 0.00018920973463918087, + "loss": 0.9683, + "step": 9140 + }, + { + "epoch": 0.23471524792135076, + "grad_norm": 0.8046875, + "learning_rate": 0.00018920771743087907, + "loss": 0.9716, + "step": 9141 + }, + { + "epoch": 0.23474092511727257, + "grad_norm": 0.796875, + "learning_rate": 0.00018920570004479434, + "loss": 1.0921, + "step": 9142 + }, + { + "epoch": 0.23476660231319438, + "grad_norm": 0.8046875, + "learning_rate": 0.00018920368248093072, + "loss": 0.9545, + "step": 9143 + }, + { + "epoch": 0.23479227950911621, + "grad_norm": 0.78515625, + "learning_rate": 0.0001892016647392922, + "loss": 0.9576, + "step": 9144 + }, + { + "epoch": 0.23481795670503802, + "grad_norm": 0.77734375, + "learning_rate": 0.00018919964681988284, + "loss": 1.0393, + "step": 9145 + }, + { + "epoch": 0.23484363390095986, + "grad_norm": 0.8828125, + "learning_rate": 0.00018919762872270665, + "loss": 1.0601, + "step": 9146 + }, + { + "epoch": 0.23486931109688167, + "grad_norm": 0.81640625, + "learning_rate": 0.00018919561044776763, + "loss": 0.9797, + "step": 9147 + }, + { + "epoch": 0.23489498829280347, + "grad_norm": 0.80859375, + "learning_rate": 0.00018919359199506982, + "loss": 0.974, + "step": 9148 + }, + { + "epoch": 0.2349206654887253, + "grad_norm": 0.6796875, + "learning_rate": 0.00018919157336461724, + "loss": 0.9121, + "step": 9149 + }, + { + "epoch": 0.23494634268464712, + "grad_norm": 0.83984375, + "learning_rate": 0.0001891895545564139, + "loss": 1.1279, + "step": 9150 + }, + { + "epoch": 0.23497201988056896, + "grad_norm": 0.8203125, + "learning_rate": 0.00018918753557046385, + "loss": 1.0638, + "step": 9151 + }, + { + "epoch": 0.23499769707649076, + "grad_norm": 0.7734375, + "learning_rate": 0.00018918551640677108, + "loss": 1.0835, + "step": 9152 + }, + { + "epoch": 0.23502337427241257, + "grad_norm": 0.7578125, + "learning_rate": 0.00018918349706533962, + "loss": 0.9638, + "step": 9153 + }, + { + "epoch": 0.2350490514683344, + "grad_norm": 0.78515625, + "learning_rate": 0.00018918147754617355, + "loss": 1.0209, + "step": 9154 + }, + { + "epoch": 0.23507472866425622, + "grad_norm": 0.8125, + "learning_rate": 0.0001891794578492768, + "loss": 0.9687, + "step": 9155 + }, + { + "epoch": 0.23510040586017805, + "grad_norm": 0.8046875, + "learning_rate": 0.0001891774379746535, + "loss": 0.9238, + "step": 9156 + }, + { + "epoch": 0.23512608305609986, + "grad_norm": 0.80859375, + "learning_rate": 0.00018917541792230758, + "loss": 0.9434, + "step": 9157 + }, + { + "epoch": 0.23515176025202167, + "grad_norm": 0.80078125, + "learning_rate": 0.00018917339769224312, + "loss": 1.0097, + "step": 9158 + }, + { + "epoch": 0.2351774374479435, + "grad_norm": 0.8046875, + "learning_rate": 0.00018917137728446414, + "loss": 0.9705, + "step": 9159 + }, + { + "epoch": 0.2352031146438653, + "grad_norm": 0.75390625, + "learning_rate": 0.00018916935669897466, + "loss": 0.8756, + "step": 9160 + }, + { + "epoch": 0.23522879183978715, + "grad_norm": 0.8046875, + "learning_rate": 0.0001891673359357787, + "loss": 1.0096, + "step": 9161 + }, + { + "epoch": 0.23525446903570896, + "grad_norm": 0.77734375, + "learning_rate": 0.00018916531499488029, + "loss": 0.899, + "step": 9162 + }, + { + "epoch": 0.23528014623163077, + "grad_norm": 0.78125, + "learning_rate": 0.00018916329387628347, + "loss": 0.9305, + "step": 9163 + }, + { + "epoch": 0.2353058234275526, + "grad_norm": 0.8828125, + "learning_rate": 0.00018916127257999224, + "loss": 1.0492, + "step": 9164 + }, + { + "epoch": 0.2353315006234744, + "grad_norm": 0.81640625, + "learning_rate": 0.00018915925110601066, + "loss": 1.0459, + "step": 9165 + }, + { + "epoch": 0.23535717781939625, + "grad_norm": 0.87109375, + "learning_rate": 0.00018915722945434275, + "loss": 1.0032, + "step": 9166 + }, + { + "epoch": 0.23538285501531805, + "grad_norm": 0.85546875, + "learning_rate": 0.00018915520762499254, + "loss": 1.0531, + "step": 9167 + }, + { + "epoch": 0.23540853221123986, + "grad_norm": 0.97265625, + "learning_rate": 0.00018915318561796404, + "loss": 1.0287, + "step": 9168 + }, + { + "epoch": 0.2354342094071617, + "grad_norm": 0.6953125, + "learning_rate": 0.00018915116343326131, + "loss": 1.0742, + "step": 9169 + }, + { + "epoch": 0.2354598866030835, + "grad_norm": 0.82421875, + "learning_rate": 0.00018914914107088837, + "loss": 1.0659, + "step": 9170 + }, + { + "epoch": 0.23548556379900534, + "grad_norm": 0.84375, + "learning_rate": 0.00018914711853084922, + "loss": 1.1855, + "step": 9171 + }, + { + "epoch": 0.23551124099492715, + "grad_norm": 0.78515625, + "learning_rate": 0.00018914509581314794, + "loss": 1.0763, + "step": 9172 + }, + { + "epoch": 0.23553691819084896, + "grad_norm": 0.8125, + "learning_rate": 0.0001891430729177885, + "loss": 0.9278, + "step": 9173 + }, + { + "epoch": 0.2355625953867708, + "grad_norm": 0.8046875, + "learning_rate": 0.00018914104984477502, + "loss": 1.069, + "step": 9174 + }, + { + "epoch": 0.2355882725826926, + "grad_norm": 0.8046875, + "learning_rate": 0.00018913902659411144, + "loss": 0.9993, + "step": 9175 + }, + { + "epoch": 0.23561394977861444, + "grad_norm": 0.80078125, + "learning_rate": 0.00018913700316580182, + "loss": 1.1086, + "step": 9176 + }, + { + "epoch": 0.23563962697453625, + "grad_norm": 0.8203125, + "learning_rate": 0.00018913497955985026, + "loss": 0.9348, + "step": 9177 + }, + { + "epoch": 0.23566530417045806, + "grad_norm": 0.734375, + "learning_rate": 0.00018913295577626069, + "loss": 0.9989, + "step": 9178 + }, + { + "epoch": 0.2356909813663799, + "grad_norm": 0.75390625, + "learning_rate": 0.0001891309318150372, + "loss": 1.0335, + "step": 9179 + }, + { + "epoch": 0.2357166585623017, + "grad_norm": 0.8203125, + "learning_rate": 0.00018912890767618383, + "loss": 0.9858, + "step": 9180 + }, + { + "epoch": 0.23574233575822354, + "grad_norm": 0.8203125, + "learning_rate": 0.00018912688335970458, + "loss": 1.0295, + "step": 9181 + }, + { + "epoch": 0.23576801295414535, + "grad_norm": 0.81640625, + "learning_rate": 0.00018912485886560352, + "loss": 1.0805, + "step": 9182 + }, + { + "epoch": 0.23579369015006715, + "grad_norm": 0.8671875, + "learning_rate": 0.00018912283419388466, + "loss": 0.9137, + "step": 9183 + }, + { + "epoch": 0.235819367345989, + "grad_norm": 0.82421875, + "learning_rate": 0.00018912080934455202, + "loss": 1.0166, + "step": 9184 + }, + { + "epoch": 0.2358450445419108, + "grad_norm": 0.87109375, + "learning_rate": 0.0001891187843176097, + "loss": 1.1795, + "step": 9185 + }, + { + "epoch": 0.23587072173783263, + "grad_norm": 0.86328125, + "learning_rate": 0.00018911675911306166, + "loss": 1.1268, + "step": 9186 + }, + { + "epoch": 0.23589639893375444, + "grad_norm": 0.7578125, + "learning_rate": 0.00018911473373091198, + "loss": 0.9681, + "step": 9187 + }, + { + "epoch": 0.23592207612967625, + "grad_norm": 0.76953125, + "learning_rate": 0.00018911270817116468, + "loss": 1.0556, + "step": 9188 + }, + { + "epoch": 0.2359477533255981, + "grad_norm": 0.76953125, + "learning_rate": 0.00018911068243382382, + "loss": 0.9688, + "step": 9189 + }, + { + "epoch": 0.2359734305215199, + "grad_norm": 0.77734375, + "learning_rate": 0.0001891086565188934, + "loss": 1.056, + "step": 9190 + }, + { + "epoch": 0.2359991077174417, + "grad_norm": 0.8125, + "learning_rate": 0.00018910663042637747, + "loss": 0.8901, + "step": 9191 + }, + { + "epoch": 0.23602478491336354, + "grad_norm": 0.83203125, + "learning_rate": 0.0001891046041562801, + "loss": 0.9718, + "step": 9192 + }, + { + "epoch": 0.23605046210928535, + "grad_norm": 0.97265625, + "learning_rate": 0.00018910257770860528, + "loss": 1.2387, + "step": 9193 + }, + { + "epoch": 0.23607613930520718, + "grad_norm": 0.81640625, + "learning_rate": 0.0001891005510833571, + "loss": 1.1164, + "step": 9194 + }, + { + "epoch": 0.236101816501129, + "grad_norm": 0.87109375, + "learning_rate": 0.00018909852428053954, + "loss": 1.0376, + "step": 9195 + }, + { + "epoch": 0.2361274936970508, + "grad_norm": 0.8125, + "learning_rate": 0.00018909649730015668, + "loss": 0.9022, + "step": 9196 + }, + { + "epoch": 0.23615317089297264, + "grad_norm": 0.74609375, + "learning_rate": 0.00018909447014221254, + "loss": 0.8181, + "step": 9197 + }, + { + "epoch": 0.23617884808889444, + "grad_norm": 0.76171875, + "learning_rate": 0.00018909244280671116, + "loss": 1.126, + "step": 9198 + }, + { + "epoch": 0.23620452528481628, + "grad_norm": 0.765625, + "learning_rate": 0.0001890904152936566, + "loss": 1.0953, + "step": 9199 + }, + { + "epoch": 0.2362302024807381, + "grad_norm": 0.796875, + "learning_rate": 0.00018908838760305291, + "loss": 1.0254, + "step": 9200 + }, + { + "epoch": 0.2362558796766599, + "grad_norm": 0.8359375, + "learning_rate": 0.0001890863597349041, + "loss": 1.0669, + "step": 9201 + }, + { + "epoch": 0.23628155687258173, + "grad_norm": 0.890625, + "learning_rate": 0.00018908433168921422, + "loss": 1.0437, + "step": 9202 + }, + { + "epoch": 0.23630723406850354, + "grad_norm": 0.79296875, + "learning_rate": 0.00018908230346598731, + "loss": 0.9955, + "step": 9203 + }, + { + "epoch": 0.23633291126442538, + "grad_norm": 0.83984375, + "learning_rate": 0.00018908027506522743, + "loss": 0.9885, + "step": 9204 + }, + { + "epoch": 0.23635858846034719, + "grad_norm": 0.90234375, + "learning_rate": 0.0001890782464869386, + "loss": 1.0459, + "step": 9205 + }, + { + "epoch": 0.236384265656269, + "grad_norm": 0.80859375, + "learning_rate": 0.00018907621773112484, + "loss": 0.8491, + "step": 9206 + }, + { + "epoch": 0.23640994285219083, + "grad_norm": 0.7578125, + "learning_rate": 0.00018907418879779027, + "loss": 1.108, + "step": 9207 + }, + { + "epoch": 0.23643562004811264, + "grad_norm": 0.71875, + "learning_rate": 0.00018907215968693887, + "loss": 0.9688, + "step": 9208 + }, + { + "epoch": 0.23646129724403447, + "grad_norm": 0.8046875, + "learning_rate": 0.0001890701303985747, + "loss": 1.1343, + "step": 9209 + }, + { + "epoch": 0.23648697443995628, + "grad_norm": 0.85546875, + "learning_rate": 0.0001890681009327018, + "loss": 1.1018, + "step": 9210 + }, + { + "epoch": 0.2365126516358781, + "grad_norm": 0.87890625, + "learning_rate": 0.00018906607128932424, + "loss": 1.0339, + "step": 9211 + }, + { + "epoch": 0.23653832883179993, + "grad_norm": 0.79296875, + "learning_rate": 0.00018906404146844605, + "loss": 0.9986, + "step": 9212 + }, + { + "epoch": 0.23656400602772173, + "grad_norm": 0.8828125, + "learning_rate": 0.00018906201147007124, + "loss": 0.9528, + "step": 9213 + }, + { + "epoch": 0.23658968322364357, + "grad_norm": 0.74609375, + "learning_rate": 0.0001890599812942039, + "loss": 0.9925, + "step": 9214 + }, + { + "epoch": 0.23661536041956538, + "grad_norm": 0.76953125, + "learning_rate": 0.00018905795094084805, + "loss": 1.0622, + "step": 9215 + }, + { + "epoch": 0.2366410376154872, + "grad_norm": 0.79296875, + "learning_rate": 0.0001890559204100078, + "loss": 1.0652, + "step": 9216 + }, + { + "epoch": 0.23666671481140902, + "grad_norm": 0.94140625, + "learning_rate": 0.0001890538897016871, + "loss": 0.9818, + "step": 9217 + }, + { + "epoch": 0.23669239200733083, + "grad_norm": 0.83984375, + "learning_rate": 0.00018905185881589004, + "loss": 0.958, + "step": 9218 + }, + { + "epoch": 0.23671806920325267, + "grad_norm": 0.8359375, + "learning_rate": 0.00018904982775262065, + "loss": 0.9239, + "step": 9219 + }, + { + "epoch": 0.23674374639917448, + "grad_norm": 0.8828125, + "learning_rate": 0.00018904779651188306, + "loss": 0.9679, + "step": 9220 + }, + { + "epoch": 0.23676942359509628, + "grad_norm": 0.7890625, + "learning_rate": 0.00018904576509368122, + "loss": 1.0496, + "step": 9221 + }, + { + "epoch": 0.23679510079101812, + "grad_norm": 0.796875, + "learning_rate": 0.0001890437334980192, + "loss": 0.9188, + "step": 9222 + }, + { + "epoch": 0.23682077798693993, + "grad_norm": 0.75390625, + "learning_rate": 0.00018904170172490107, + "loss": 0.9507, + "step": 9223 + }, + { + "epoch": 0.23684645518286176, + "grad_norm": 0.78515625, + "learning_rate": 0.00018903966977433086, + "loss": 1.2657, + "step": 9224 + }, + { + "epoch": 0.23687213237878357, + "grad_norm": 0.83984375, + "learning_rate": 0.00018903763764631265, + "loss": 1.1291, + "step": 9225 + }, + { + "epoch": 0.23689780957470538, + "grad_norm": 0.7578125, + "learning_rate": 0.0001890356053408505, + "loss": 1.0356, + "step": 9226 + }, + { + "epoch": 0.23692348677062722, + "grad_norm": 0.79296875, + "learning_rate": 0.00018903357285794838, + "loss": 0.9167, + "step": 9227 + }, + { + "epoch": 0.23694916396654903, + "grad_norm": 0.80859375, + "learning_rate": 0.00018903154019761043, + "loss": 1.0036, + "step": 9228 + }, + { + "epoch": 0.23697484116247086, + "grad_norm": 0.76953125, + "learning_rate": 0.00018902950735984062, + "loss": 1.2285, + "step": 9229 + }, + { + "epoch": 0.23700051835839267, + "grad_norm": 0.78125, + "learning_rate": 0.00018902747434464308, + "loss": 1.0574, + "step": 9230 + }, + { + "epoch": 0.23702619555431448, + "grad_norm": 0.90234375, + "learning_rate": 0.00018902544115202181, + "loss": 1.1563, + "step": 9231 + }, + { + "epoch": 0.23705187275023631, + "grad_norm": 0.83203125, + "learning_rate": 0.0001890234077819809, + "loss": 0.9763, + "step": 9232 + }, + { + "epoch": 0.23707754994615812, + "grad_norm": 0.8203125, + "learning_rate": 0.00018902137423452433, + "loss": 1.0048, + "step": 9233 + }, + { + "epoch": 0.23710322714207996, + "grad_norm": 0.8828125, + "learning_rate": 0.00018901934050965624, + "loss": 1.0788, + "step": 9234 + }, + { + "epoch": 0.23712890433800177, + "grad_norm": 0.81640625, + "learning_rate": 0.00018901730660738063, + "loss": 0.9987, + "step": 9235 + }, + { + "epoch": 0.23715458153392358, + "grad_norm": 0.80078125, + "learning_rate": 0.00018901527252770158, + "loss": 1.0096, + "step": 9236 + }, + { + "epoch": 0.2371802587298454, + "grad_norm": 0.75, + "learning_rate": 0.00018901323827062315, + "loss": 1.0109, + "step": 9237 + }, + { + "epoch": 0.23720593592576722, + "grad_norm": 0.8828125, + "learning_rate": 0.00018901120383614935, + "loss": 0.9849, + "step": 9238 + }, + { + "epoch": 0.23723161312168906, + "grad_norm": 0.8515625, + "learning_rate": 0.00018900916922428426, + "loss": 1.1135, + "step": 9239 + }, + { + "epoch": 0.23725729031761086, + "grad_norm": 0.8359375, + "learning_rate": 0.00018900713443503194, + "loss": 1.0277, + "step": 9240 + }, + { + "epoch": 0.23728296751353267, + "grad_norm": 0.765625, + "learning_rate": 0.00018900509946839648, + "loss": 1.0167, + "step": 9241 + }, + { + "epoch": 0.2373086447094545, + "grad_norm": 0.87109375, + "learning_rate": 0.00018900306432438185, + "loss": 0.8845, + "step": 9242 + }, + { + "epoch": 0.23733432190537632, + "grad_norm": 0.7734375, + "learning_rate": 0.00018900102900299215, + "loss": 0.9847, + "step": 9243 + }, + { + "epoch": 0.23735999910129815, + "grad_norm": 0.78515625, + "learning_rate": 0.00018899899350423147, + "loss": 0.9815, + "step": 9244 + }, + { + "epoch": 0.23738567629721996, + "grad_norm": 0.7578125, + "learning_rate": 0.0001889969578281038, + "loss": 0.9686, + "step": 9245 + }, + { + "epoch": 0.23741135349314177, + "grad_norm": 0.8125, + "learning_rate": 0.00018899492197461327, + "loss": 1.1122, + "step": 9246 + }, + { + "epoch": 0.2374370306890636, + "grad_norm": 0.8359375, + "learning_rate": 0.00018899288594376385, + "loss": 1.0646, + "step": 9247 + }, + { + "epoch": 0.2374627078849854, + "grad_norm": 0.85546875, + "learning_rate": 0.00018899084973555969, + "loss": 1.1195, + "step": 9248 + }, + { + "epoch": 0.23748838508090725, + "grad_norm": 0.8125, + "learning_rate": 0.00018898881335000477, + "loss": 1.0596, + "step": 9249 + }, + { + "epoch": 0.23751406227682906, + "grad_norm": 0.83203125, + "learning_rate": 0.0001889867767871032, + "loss": 1.2657, + "step": 9250 + }, + { + "epoch": 0.23753973947275087, + "grad_norm": 0.8828125, + "learning_rate": 0.00018898474004685903, + "loss": 1.124, + "step": 9251 + }, + { + "epoch": 0.2375654166686727, + "grad_norm": 0.77734375, + "learning_rate": 0.0001889827031292763, + "loss": 1.0377, + "step": 9252 + }, + { + "epoch": 0.2375910938645945, + "grad_norm": 0.78125, + "learning_rate": 0.00018898066603435912, + "loss": 1.1365, + "step": 9253 + }, + { + "epoch": 0.23761677106051635, + "grad_norm": 0.78515625, + "learning_rate": 0.00018897862876211146, + "loss": 1.0661, + "step": 9254 + }, + { + "epoch": 0.23764244825643815, + "grad_norm": 0.84765625, + "learning_rate": 0.00018897659131253747, + "loss": 1.1316, + "step": 9255 + }, + { + "epoch": 0.23766812545235996, + "grad_norm": 0.7890625, + "learning_rate": 0.00018897455368564116, + "loss": 0.9907, + "step": 9256 + }, + { + "epoch": 0.2376938026482818, + "grad_norm": 0.765625, + "learning_rate": 0.00018897251588142656, + "loss": 1.0617, + "step": 9257 + }, + { + "epoch": 0.2377194798442036, + "grad_norm": 0.79296875, + "learning_rate": 0.00018897047789989783, + "loss": 1.0994, + "step": 9258 + }, + { + "epoch": 0.23774515704012544, + "grad_norm": 0.8125, + "learning_rate": 0.00018896843974105894, + "loss": 1.0055, + "step": 9259 + }, + { + "epoch": 0.23777083423604725, + "grad_norm": 0.74609375, + "learning_rate": 0.00018896640140491397, + "loss": 0.9713, + "step": 9260 + }, + { + "epoch": 0.23779651143196906, + "grad_norm": 0.76171875, + "learning_rate": 0.00018896436289146703, + "loss": 0.9954, + "step": 9261 + }, + { + "epoch": 0.2378221886278909, + "grad_norm": 0.8359375, + "learning_rate": 0.00018896232420072213, + "loss": 0.9931, + "step": 9262 + }, + { + "epoch": 0.2378478658238127, + "grad_norm": 0.81640625, + "learning_rate": 0.00018896028533268338, + "loss": 1.093, + "step": 9263 + }, + { + "epoch": 0.23787354301973454, + "grad_norm": 0.84765625, + "learning_rate": 0.0001889582462873548, + "loss": 0.9002, + "step": 9264 + }, + { + "epoch": 0.23789922021565635, + "grad_norm": 0.8203125, + "learning_rate": 0.00018895620706474046, + "loss": 0.9818, + "step": 9265 + }, + { + "epoch": 0.23792489741157816, + "grad_norm": 1.125, + "learning_rate": 0.00018895416766484446, + "loss": 1.0889, + "step": 9266 + }, + { + "epoch": 0.2379505746075, + "grad_norm": 0.85546875, + "learning_rate": 0.00018895212808767083, + "loss": 1.1292, + "step": 9267 + }, + { + "epoch": 0.2379762518034218, + "grad_norm": 0.796875, + "learning_rate": 0.00018895008833322364, + "loss": 0.8482, + "step": 9268 + }, + { + "epoch": 0.23800192899934364, + "grad_norm": 0.82421875, + "learning_rate": 0.00018894804840150699, + "loss": 0.9185, + "step": 9269 + }, + { + "epoch": 0.23802760619526545, + "grad_norm": 0.74609375, + "learning_rate": 0.00018894600829252487, + "loss": 0.8279, + "step": 9270 + }, + { + "epoch": 0.23805328339118725, + "grad_norm": 0.80859375, + "learning_rate": 0.0001889439680062814, + "loss": 0.9387, + "step": 9271 + }, + { + "epoch": 0.2380789605871091, + "grad_norm": 0.8046875, + "learning_rate": 0.00018894192754278065, + "loss": 1.1608, + "step": 9272 + }, + { + "epoch": 0.2381046377830309, + "grad_norm": 0.8359375, + "learning_rate": 0.00018893988690202665, + "loss": 1.1047, + "step": 9273 + }, + { + "epoch": 0.23813031497895273, + "grad_norm": 0.7578125, + "learning_rate": 0.0001889378460840235, + "loss": 0.9995, + "step": 9274 + }, + { + "epoch": 0.23815599217487454, + "grad_norm": 0.76171875, + "learning_rate": 0.00018893580508877526, + "loss": 1.0332, + "step": 9275 + }, + { + "epoch": 0.23818166937079635, + "grad_norm": 0.703125, + "learning_rate": 0.00018893376391628598, + "loss": 0.963, + "step": 9276 + }, + { + "epoch": 0.2382073465667182, + "grad_norm": 0.8203125, + "learning_rate": 0.00018893172256655976, + "loss": 0.9475, + "step": 9277 + }, + { + "epoch": 0.23823302376264, + "grad_norm": 0.78515625, + "learning_rate": 0.00018892968103960067, + "loss": 0.9831, + "step": 9278 + }, + { + "epoch": 0.23825870095856183, + "grad_norm": 0.73828125, + "learning_rate": 0.00018892763933541274, + "loss": 0.8885, + "step": 9279 + }, + { + "epoch": 0.23828437815448364, + "grad_norm": 0.78515625, + "learning_rate": 0.00018892559745400004, + "loss": 0.9296, + "step": 9280 + }, + { + "epoch": 0.23831005535040545, + "grad_norm": 0.796875, + "learning_rate": 0.00018892355539536665, + "loss": 1.0263, + "step": 9281 + }, + { + "epoch": 0.23833573254632728, + "grad_norm": 0.76953125, + "learning_rate": 0.00018892151315951666, + "loss": 0.9934, + "step": 9282 + }, + { + "epoch": 0.2383614097422491, + "grad_norm": 0.8046875, + "learning_rate": 0.00018891947074645414, + "loss": 1.1325, + "step": 9283 + }, + { + "epoch": 0.23838708693817093, + "grad_norm": 0.84375, + "learning_rate": 0.00018891742815618315, + "loss": 1.1128, + "step": 9284 + }, + { + "epoch": 0.23841276413409274, + "grad_norm": 0.890625, + "learning_rate": 0.00018891538538870774, + "loss": 0.9993, + "step": 9285 + }, + { + "epoch": 0.23843844133001454, + "grad_norm": 0.81640625, + "learning_rate": 0.000188913342444032, + "loss": 1.0806, + "step": 9286 + }, + { + "epoch": 0.23846411852593638, + "grad_norm": 0.73828125, + "learning_rate": 0.00018891129932216002, + "loss": 1.0098, + "step": 9287 + }, + { + "epoch": 0.2384897957218582, + "grad_norm": 0.8203125, + "learning_rate": 0.00018890925602309583, + "loss": 0.9958, + "step": 9288 + }, + { + "epoch": 0.23851547291778002, + "grad_norm": 0.79296875, + "learning_rate": 0.00018890721254684353, + "loss": 0.9729, + "step": 9289 + }, + { + "epoch": 0.23854115011370183, + "grad_norm": 0.74609375, + "learning_rate": 0.0001889051688934072, + "loss": 1.1494, + "step": 9290 + }, + { + "epoch": 0.23856682730962364, + "grad_norm": 0.69921875, + "learning_rate": 0.00018890312506279086, + "loss": 0.8386, + "step": 9291 + }, + { + "epoch": 0.23859250450554548, + "grad_norm": 0.72265625, + "learning_rate": 0.00018890108105499865, + "loss": 0.8825, + "step": 9292 + }, + { + "epoch": 0.23861818170146729, + "grad_norm": 0.7734375, + "learning_rate": 0.0001888990368700346, + "loss": 1.0286, + "step": 9293 + }, + { + "epoch": 0.23864385889738912, + "grad_norm": 0.81640625, + "learning_rate": 0.00018889699250790282, + "loss": 1.0807, + "step": 9294 + }, + { + "epoch": 0.23866953609331093, + "grad_norm": 0.734375, + "learning_rate": 0.00018889494796860736, + "loss": 0.9537, + "step": 9295 + }, + { + "epoch": 0.23869521328923274, + "grad_norm": 0.7578125, + "learning_rate": 0.0001888929032521523, + "loss": 0.9846, + "step": 9296 + }, + { + "epoch": 0.23872089048515457, + "grad_norm": 1.0703125, + "learning_rate": 0.00018889085835854173, + "loss": 0.9439, + "step": 9297 + }, + { + "epoch": 0.23874656768107638, + "grad_norm": 0.84375, + "learning_rate": 0.00018888881328777966, + "loss": 0.8993, + "step": 9298 + }, + { + "epoch": 0.23877224487699822, + "grad_norm": 0.765625, + "learning_rate": 0.00018888676803987024, + "loss": 1.0136, + "step": 9299 + }, + { + "epoch": 0.23879792207292003, + "grad_norm": 0.83984375, + "learning_rate": 0.00018888472261481754, + "loss": 1.0176, + "step": 9300 + }, + { + "epoch": 0.23882359926884184, + "grad_norm": 0.828125, + "learning_rate": 0.0001888826770126256, + "loss": 1.1274, + "step": 9301 + }, + { + "epoch": 0.23884927646476367, + "grad_norm": 0.875, + "learning_rate": 0.0001888806312332985, + "loss": 0.9408, + "step": 9302 + }, + { + "epoch": 0.23887495366068548, + "grad_norm": 0.7734375, + "learning_rate": 0.00018887858527684036, + "loss": 0.9448, + "step": 9303 + }, + { + "epoch": 0.23890063085660732, + "grad_norm": 0.84765625, + "learning_rate": 0.0001888765391432552, + "loss": 0.9962, + "step": 9304 + }, + { + "epoch": 0.23892630805252912, + "grad_norm": 0.8046875, + "learning_rate": 0.00018887449283254713, + "loss": 1.0236, + "step": 9305 + }, + { + "epoch": 0.23895198524845093, + "grad_norm": 0.83984375, + "learning_rate": 0.00018887244634472022, + "loss": 0.9908, + "step": 9306 + }, + { + "epoch": 0.23897766244437277, + "grad_norm": 0.8203125, + "learning_rate": 0.00018887039967977858, + "loss": 1.2155, + "step": 9307 + }, + { + "epoch": 0.23900333964029458, + "grad_norm": 0.80859375, + "learning_rate": 0.00018886835283772623, + "loss": 1.0643, + "step": 9308 + }, + { + "epoch": 0.2390290168362164, + "grad_norm": 0.92578125, + "learning_rate": 0.0001888663058185673, + "loss": 1.1546, + "step": 9309 + }, + { + "epoch": 0.23905469403213822, + "grad_norm": 0.828125, + "learning_rate": 0.00018886425862230585, + "loss": 1.0748, + "step": 9310 + }, + { + "epoch": 0.23908037122806003, + "grad_norm": 0.8515625, + "learning_rate": 0.00018886221124894594, + "loss": 1.102, + "step": 9311 + }, + { + "epoch": 0.23910604842398187, + "grad_norm": 0.765625, + "learning_rate": 0.0001888601636984917, + "loss": 0.95, + "step": 9312 + }, + { + "epoch": 0.23913172561990367, + "grad_norm": 0.74609375, + "learning_rate": 0.00018885811597094717, + "loss": 0.9048, + "step": 9313 + }, + { + "epoch": 0.2391574028158255, + "grad_norm": 0.91015625, + "learning_rate": 0.0001888560680663164, + "loss": 0.9919, + "step": 9314 + }, + { + "epoch": 0.23918308001174732, + "grad_norm": 0.75, + "learning_rate": 0.00018885401998460356, + "loss": 0.9833, + "step": 9315 + }, + { + "epoch": 0.23920875720766913, + "grad_norm": 0.84375, + "learning_rate": 0.0001888519717258127, + "loss": 1.0283, + "step": 9316 + }, + { + "epoch": 0.23923443440359096, + "grad_norm": 0.7578125, + "learning_rate": 0.00018884992328994783, + "loss": 0.8027, + "step": 9317 + }, + { + "epoch": 0.23926011159951277, + "grad_norm": 0.87890625, + "learning_rate": 0.00018884787467701313, + "loss": 1.0923, + "step": 9318 + }, + { + "epoch": 0.2392857887954346, + "grad_norm": 0.7265625, + "learning_rate": 0.00018884582588701263, + "loss": 0.9792, + "step": 9319 + }, + { + "epoch": 0.23931146599135641, + "grad_norm": 0.8046875, + "learning_rate": 0.00018884377691995042, + "loss": 1.0446, + "step": 9320 + }, + { + "epoch": 0.23933714318727822, + "grad_norm": 0.86328125, + "learning_rate": 0.0001888417277758306, + "loss": 1.0918, + "step": 9321 + }, + { + "epoch": 0.23936282038320006, + "grad_norm": 0.74609375, + "learning_rate": 0.00018883967845465725, + "loss": 0.8376, + "step": 9322 + }, + { + "epoch": 0.23938849757912187, + "grad_norm": 0.7734375, + "learning_rate": 0.00018883762895643443, + "loss": 1.1056, + "step": 9323 + }, + { + "epoch": 0.2394141747750437, + "grad_norm": 0.84765625, + "learning_rate": 0.0001888355792811662, + "loss": 0.9954, + "step": 9324 + }, + { + "epoch": 0.2394398519709655, + "grad_norm": 0.796875, + "learning_rate": 0.00018883352942885675, + "loss": 0.907, + "step": 9325 + }, + { + "epoch": 0.23946552916688732, + "grad_norm": 0.79296875, + "learning_rate": 0.00018883147939951006, + "loss": 1.009, + "step": 9326 + }, + { + "epoch": 0.23949120636280916, + "grad_norm": 0.73828125, + "learning_rate": 0.00018882942919313029, + "loss": 0.941, + "step": 9327 + }, + { + "epoch": 0.23951688355873096, + "grad_norm": 0.82421875, + "learning_rate": 0.00018882737880972146, + "loss": 1.0045, + "step": 9328 + }, + { + "epoch": 0.2395425607546528, + "grad_norm": 0.87890625, + "learning_rate": 0.0001888253282492877, + "loss": 0.9139, + "step": 9329 + }, + { + "epoch": 0.2395682379505746, + "grad_norm": 0.8046875, + "learning_rate": 0.00018882327751183308, + "loss": 0.9847, + "step": 9330 + }, + { + "epoch": 0.23959391514649642, + "grad_norm": 0.84765625, + "learning_rate": 0.00018882122659736168, + "loss": 1.1287, + "step": 9331 + }, + { + "epoch": 0.23961959234241825, + "grad_norm": 0.77734375, + "learning_rate": 0.0001888191755058776, + "loss": 0.9854, + "step": 9332 + }, + { + "epoch": 0.23964526953834006, + "grad_norm": 0.78125, + "learning_rate": 0.00018881712423738498, + "loss": 0.767, + "step": 9333 + }, + { + "epoch": 0.2396709467342619, + "grad_norm": 0.77734375, + "learning_rate": 0.00018881507279188778, + "loss": 1.0056, + "step": 9334 + }, + { + "epoch": 0.2396966239301837, + "grad_norm": 0.8359375, + "learning_rate": 0.0001888130211693902, + "loss": 1.0078, + "step": 9335 + }, + { + "epoch": 0.2397223011261055, + "grad_norm": 0.81640625, + "learning_rate": 0.00018881096936989631, + "loss": 1.0332, + "step": 9336 + }, + { + "epoch": 0.23974797832202735, + "grad_norm": 0.80078125, + "learning_rate": 0.00018880891739341012, + "loss": 1.0761, + "step": 9337 + }, + { + "epoch": 0.23977365551794916, + "grad_norm": 0.74609375, + "learning_rate": 0.00018880686523993585, + "loss": 0.8544, + "step": 9338 + }, + { + "epoch": 0.239799332713871, + "grad_norm": 0.8984375, + "learning_rate": 0.0001888048129094775, + "loss": 1.0648, + "step": 9339 + }, + { + "epoch": 0.2398250099097928, + "grad_norm": 0.79296875, + "learning_rate": 0.00018880276040203915, + "loss": 1.034, + "step": 9340 + }, + { + "epoch": 0.2398506871057146, + "grad_norm": 0.77734375, + "learning_rate": 0.00018880070771762494, + "loss": 1.0626, + "step": 9341 + }, + { + "epoch": 0.23987636430163645, + "grad_norm": 0.8203125, + "learning_rate": 0.00018879865485623893, + "loss": 1.0955, + "step": 9342 + }, + { + "epoch": 0.23990204149755825, + "grad_norm": 0.8046875, + "learning_rate": 0.00018879660181788524, + "loss": 0.9552, + "step": 9343 + }, + { + "epoch": 0.2399277186934801, + "grad_norm": 0.80859375, + "learning_rate": 0.0001887945486025679, + "loss": 1.1059, + "step": 9344 + }, + { + "epoch": 0.2399533958894019, + "grad_norm": 0.79296875, + "learning_rate": 0.00018879249521029109, + "loss": 0.9832, + "step": 9345 + }, + { + "epoch": 0.2399790730853237, + "grad_norm": 0.83203125, + "learning_rate": 0.00018879044164105886, + "loss": 0.9698, + "step": 9346 + }, + { + "epoch": 0.24000475028124554, + "grad_norm": 0.8359375, + "learning_rate": 0.0001887883878948753, + "loss": 1.014, + "step": 9347 + }, + { + "epoch": 0.24003042747716735, + "grad_norm": 1.0078125, + "learning_rate": 0.00018878633397174447, + "loss": 0.9985, + "step": 9348 + }, + { + "epoch": 0.2400561046730892, + "grad_norm": 0.8515625, + "learning_rate": 0.00018878427987167052, + "loss": 1.0954, + "step": 9349 + }, + { + "epoch": 0.240081781869011, + "grad_norm": 0.7734375, + "learning_rate": 0.0001887822255946575, + "loss": 1.1052, + "step": 9350 + }, + { + "epoch": 0.2401074590649328, + "grad_norm": 0.828125, + "learning_rate": 0.00018878017114070956, + "loss": 1.1713, + "step": 9351 + }, + { + "epoch": 0.24013313626085464, + "grad_norm": 0.8515625, + "learning_rate": 0.00018877811650983071, + "loss": 1.0602, + "step": 9352 + }, + { + "epoch": 0.24015881345677645, + "grad_norm": 0.75, + "learning_rate": 0.00018877606170202515, + "loss": 1.0109, + "step": 9353 + }, + { + "epoch": 0.24018449065269828, + "grad_norm": 0.75390625, + "learning_rate": 0.0001887740067172969, + "loss": 0.988, + "step": 9354 + }, + { + "epoch": 0.2402101678486201, + "grad_norm": 0.73046875, + "learning_rate": 0.00018877195155565004, + "loss": 0.8806, + "step": 9355 + }, + { + "epoch": 0.2402358450445419, + "grad_norm": 0.7734375, + "learning_rate": 0.00018876989621708875, + "loss": 1.0338, + "step": 9356 + }, + { + "epoch": 0.24026152224046374, + "grad_norm": 0.78515625, + "learning_rate": 0.00018876784070161705, + "loss": 1.002, + "step": 9357 + }, + { + "epoch": 0.24028719943638555, + "grad_norm": 0.71484375, + "learning_rate": 0.00018876578500923904, + "loss": 0.8867, + "step": 9358 + }, + { + "epoch": 0.24031287663230738, + "grad_norm": 0.796875, + "learning_rate": 0.00018876372913995883, + "loss": 0.886, + "step": 9359 + }, + { + "epoch": 0.2403385538282292, + "grad_norm": 0.80078125, + "learning_rate": 0.0001887616730937806, + "loss": 0.9484, + "step": 9360 + }, + { + "epoch": 0.240364231024151, + "grad_norm": 0.7890625, + "learning_rate": 0.00018875961687070828, + "loss": 1.0633, + "step": 9361 + }, + { + "epoch": 0.24038990822007283, + "grad_norm": 0.82421875, + "learning_rate": 0.00018875756047074613, + "loss": 1.0645, + "step": 9362 + }, + { + "epoch": 0.24041558541599464, + "grad_norm": 0.8203125, + "learning_rate": 0.00018875550389389816, + "loss": 1.1507, + "step": 9363 + }, + { + "epoch": 0.24044126261191648, + "grad_norm": 0.73046875, + "learning_rate": 0.0001887534471401685, + "loss": 1.028, + "step": 9364 + }, + { + "epoch": 0.2404669398078383, + "grad_norm": 0.8125, + "learning_rate": 0.00018875139020956122, + "loss": 0.9308, + "step": 9365 + }, + { + "epoch": 0.2404926170037601, + "grad_norm": 0.765625, + "learning_rate": 0.00018874933310208042, + "loss": 1.0192, + "step": 9366 + }, + { + "epoch": 0.24051829419968193, + "grad_norm": 0.828125, + "learning_rate": 0.00018874727581773024, + "loss": 1.0242, + "step": 9367 + }, + { + "epoch": 0.24054397139560374, + "grad_norm": 1.1328125, + "learning_rate": 0.00018874521835651476, + "loss": 1.0091, + "step": 9368 + }, + { + "epoch": 0.24056964859152558, + "grad_norm": 0.8046875, + "learning_rate": 0.00018874316071843805, + "loss": 1.1372, + "step": 9369 + }, + { + "epoch": 0.24059532578744738, + "grad_norm": 0.82421875, + "learning_rate": 0.00018874110290350427, + "loss": 0.9856, + "step": 9370 + }, + { + "epoch": 0.2406210029833692, + "grad_norm": 0.76171875, + "learning_rate": 0.00018873904491171746, + "loss": 1.0139, + "step": 9371 + }, + { + "epoch": 0.24064668017929103, + "grad_norm": 0.75390625, + "learning_rate": 0.00018873698674308175, + "loss": 0.9616, + "step": 9372 + }, + { + "epoch": 0.24067235737521284, + "grad_norm": 0.84375, + "learning_rate": 0.00018873492839760125, + "loss": 1.1068, + "step": 9373 + }, + { + "epoch": 0.24069803457113467, + "grad_norm": 0.8125, + "learning_rate": 0.00018873286987528006, + "loss": 0.9288, + "step": 9374 + }, + { + "epoch": 0.24072371176705648, + "grad_norm": 0.76953125, + "learning_rate": 0.00018873081117612225, + "loss": 0.9414, + "step": 9375 + }, + { + "epoch": 0.2407493889629783, + "grad_norm": 0.77734375, + "learning_rate": 0.00018872875230013198, + "loss": 1.046, + "step": 9376 + }, + { + "epoch": 0.24077506615890013, + "grad_norm": 0.8203125, + "learning_rate": 0.00018872669324731332, + "loss": 0.9854, + "step": 9377 + }, + { + "epoch": 0.24080074335482193, + "grad_norm": 0.80859375, + "learning_rate": 0.00018872463401767035, + "loss": 1.1558, + "step": 9378 + }, + { + "epoch": 0.24082642055074377, + "grad_norm": 0.79296875, + "learning_rate": 0.00018872257461120725, + "loss": 1.0547, + "step": 9379 + }, + { + "epoch": 0.24085209774666558, + "grad_norm": 0.83203125, + "learning_rate": 0.000188720515027928, + "loss": 0.9967, + "step": 9380 + }, + { + "epoch": 0.24087777494258739, + "grad_norm": 0.7421875, + "learning_rate": 0.00018871845526783685, + "loss": 1.1109, + "step": 9381 + }, + { + "epoch": 0.24090345213850922, + "grad_norm": 0.78125, + "learning_rate": 0.00018871639533093779, + "loss": 0.968, + "step": 9382 + }, + { + "epoch": 0.24092912933443103, + "grad_norm": 0.83203125, + "learning_rate": 0.00018871433521723495, + "loss": 1.0706, + "step": 9383 + }, + { + "epoch": 0.24095480653035287, + "grad_norm": 0.7734375, + "learning_rate": 0.00018871227492673248, + "loss": 1.0569, + "step": 9384 + }, + { + "epoch": 0.24098048372627467, + "grad_norm": 0.79296875, + "learning_rate": 0.00018871021445943445, + "loss": 0.962, + "step": 9385 + }, + { + "epoch": 0.24100616092219648, + "grad_norm": 0.77734375, + "learning_rate": 0.000188708153815345, + "loss": 1.0213, + "step": 9386 + }, + { + "epoch": 0.24103183811811832, + "grad_norm": 0.88671875, + "learning_rate": 0.0001887060929944682, + "loss": 1.1318, + "step": 9387 + }, + { + "epoch": 0.24105751531404013, + "grad_norm": 0.76953125, + "learning_rate": 0.00018870403199680819, + "loss": 1.0379, + "step": 9388 + }, + { + "epoch": 0.24108319250996196, + "grad_norm": 0.79296875, + "learning_rate": 0.000188701970822369, + "loss": 1.094, + "step": 9389 + }, + { + "epoch": 0.24110886970588377, + "grad_norm": 0.7265625, + "learning_rate": 0.00018869990947115484, + "loss": 0.8083, + "step": 9390 + }, + { + "epoch": 0.24113454690180558, + "grad_norm": 0.76953125, + "learning_rate": 0.00018869784794316977, + "loss": 0.9981, + "step": 9391 + }, + { + "epoch": 0.24116022409772742, + "grad_norm": 0.859375, + "learning_rate": 0.00018869578623841786, + "loss": 1.0057, + "step": 9392 + }, + { + "epoch": 0.24118590129364922, + "grad_norm": 0.79296875, + "learning_rate": 0.00018869372435690332, + "loss": 1.0297, + "step": 9393 + }, + { + "epoch": 0.24121157848957106, + "grad_norm": 0.81640625, + "learning_rate": 0.00018869166229863016, + "loss": 0.9286, + "step": 9394 + }, + { + "epoch": 0.24123725568549287, + "grad_norm": 0.8046875, + "learning_rate": 0.00018868960006360254, + "loss": 0.9493, + "step": 9395 + }, + { + "epoch": 0.24126293288141468, + "grad_norm": 0.80859375, + "learning_rate": 0.00018868753765182456, + "loss": 0.9392, + "step": 9396 + }, + { + "epoch": 0.2412886100773365, + "grad_norm": 0.83984375, + "learning_rate": 0.0001886854750633003, + "loss": 0.9648, + "step": 9397 + }, + { + "epoch": 0.24131428727325832, + "grad_norm": 0.78515625, + "learning_rate": 0.0001886834122980339, + "loss": 1.0127, + "step": 9398 + }, + { + "epoch": 0.24133996446918013, + "grad_norm": 0.8203125, + "learning_rate": 0.00018868134935602954, + "loss": 1.0169, + "step": 9399 + }, + { + "epoch": 0.24136564166510197, + "grad_norm": 0.7734375, + "learning_rate": 0.00018867928623729122, + "loss": 0.9175, + "step": 9400 + }, + { + "epoch": 0.24139131886102377, + "grad_norm": 0.80078125, + "learning_rate": 0.0001886772229418231, + "loss": 0.9377, + "step": 9401 + }, + { + "epoch": 0.2414169960569456, + "grad_norm": 0.7890625, + "learning_rate": 0.00018867515946962925, + "loss": 1.0145, + "step": 9402 + }, + { + "epoch": 0.24144267325286742, + "grad_norm": 0.72265625, + "learning_rate": 0.00018867309582071385, + "loss": 0.9796, + "step": 9403 + }, + { + "epoch": 0.24146835044878923, + "grad_norm": 0.828125, + "learning_rate": 0.00018867103199508097, + "loss": 1.0189, + "step": 9404 + }, + { + "epoch": 0.24149402764471106, + "grad_norm": 0.81640625, + "learning_rate": 0.00018866896799273473, + "loss": 0.9762, + "step": 9405 + }, + { + "epoch": 0.24151970484063287, + "grad_norm": 0.7734375, + "learning_rate": 0.00018866690381367924, + "loss": 1.0739, + "step": 9406 + }, + { + "epoch": 0.2415453820365547, + "grad_norm": 0.83984375, + "learning_rate": 0.00018866483945791863, + "loss": 1.0519, + "step": 9407 + }, + { + "epoch": 0.24157105923247651, + "grad_norm": 0.84765625, + "learning_rate": 0.00018866277492545698, + "loss": 1.053, + "step": 9408 + }, + { + "epoch": 0.24159673642839832, + "grad_norm": 0.84765625, + "learning_rate": 0.00018866071021629848, + "loss": 1.0873, + "step": 9409 + }, + { + "epoch": 0.24162241362432016, + "grad_norm": 0.76953125, + "learning_rate": 0.00018865864533044717, + "loss": 0.9571, + "step": 9410 + }, + { + "epoch": 0.24164809082024197, + "grad_norm": 0.73046875, + "learning_rate": 0.00018865658026790717, + "loss": 0.942, + "step": 9411 + }, + { + "epoch": 0.2416737680161638, + "grad_norm": 0.78515625, + "learning_rate": 0.00018865451502868264, + "loss": 1.0322, + "step": 9412 + }, + { + "epoch": 0.2416994452120856, + "grad_norm": 0.77734375, + "learning_rate": 0.00018865244961277763, + "loss": 1.0052, + "step": 9413 + }, + { + "epoch": 0.24172512240800742, + "grad_norm": 0.80078125, + "learning_rate": 0.00018865038402019634, + "loss": 1.1064, + "step": 9414 + }, + { + "epoch": 0.24175079960392926, + "grad_norm": 0.84375, + "learning_rate": 0.00018864831825094282, + "loss": 1.1289, + "step": 9415 + }, + { + "epoch": 0.24177647679985106, + "grad_norm": 0.8125, + "learning_rate": 0.00018864625230502122, + "loss": 0.9675, + "step": 9416 + }, + { + "epoch": 0.2418021539957729, + "grad_norm": 0.765625, + "learning_rate": 0.00018864418618243564, + "loss": 0.872, + "step": 9417 + }, + { + "epoch": 0.2418278311916947, + "grad_norm": 0.80078125, + "learning_rate": 0.0001886421198831902, + "loss": 1.1273, + "step": 9418 + }, + { + "epoch": 0.24185350838761652, + "grad_norm": 0.77734375, + "learning_rate": 0.00018864005340728902, + "loss": 0.7797, + "step": 9419 + }, + { + "epoch": 0.24187918558353835, + "grad_norm": 0.7421875, + "learning_rate": 0.00018863798675473623, + "loss": 1.0361, + "step": 9420 + }, + { + "epoch": 0.24190486277946016, + "grad_norm": 0.87109375, + "learning_rate": 0.00018863591992553596, + "loss": 0.9376, + "step": 9421 + }, + { + "epoch": 0.241930539975382, + "grad_norm": 0.75, + "learning_rate": 0.00018863385291969227, + "loss": 1.0906, + "step": 9422 + }, + { + "epoch": 0.2419562171713038, + "grad_norm": 0.8984375, + "learning_rate": 0.00018863178573720935, + "loss": 1.0808, + "step": 9423 + }, + { + "epoch": 0.24198189436722561, + "grad_norm": 0.80859375, + "learning_rate": 0.00018862971837809125, + "loss": 0.9898, + "step": 9424 + }, + { + "epoch": 0.24200757156314745, + "grad_norm": 0.8046875, + "learning_rate": 0.00018862765084234212, + "loss": 0.9491, + "step": 9425 + }, + { + "epoch": 0.24203324875906926, + "grad_norm": 0.87890625, + "learning_rate": 0.00018862558312996615, + "loss": 1.1715, + "step": 9426 + }, + { + "epoch": 0.2420589259549911, + "grad_norm": 0.87890625, + "learning_rate": 0.00018862351524096736, + "loss": 1.0584, + "step": 9427 + }, + { + "epoch": 0.2420846031509129, + "grad_norm": 0.82421875, + "learning_rate": 0.00018862144717534987, + "loss": 1.0904, + "step": 9428 + }, + { + "epoch": 0.2421102803468347, + "grad_norm": 0.84375, + "learning_rate": 0.0001886193789331179, + "loss": 0.9628, + "step": 9429 + }, + { + "epoch": 0.24213595754275655, + "grad_norm": 1.109375, + "learning_rate": 0.00018861731051427548, + "loss": 0.9215, + "step": 9430 + }, + { + "epoch": 0.24216163473867836, + "grad_norm": 0.83203125, + "learning_rate": 0.00018861524191882682, + "loss": 1.024, + "step": 9431 + }, + { + "epoch": 0.2421873119346002, + "grad_norm": 0.8125, + "learning_rate": 0.00018861317314677593, + "loss": 0.9496, + "step": 9432 + }, + { + "epoch": 0.242212989130522, + "grad_norm": 0.80078125, + "learning_rate": 0.00018861110419812703, + "loss": 0.9286, + "step": 9433 + }, + { + "epoch": 0.2422386663264438, + "grad_norm": 0.828125, + "learning_rate": 0.00018860903507288417, + "loss": 1.0885, + "step": 9434 + }, + { + "epoch": 0.24226434352236564, + "grad_norm": 0.7578125, + "learning_rate": 0.0001886069657710515, + "loss": 1.1182, + "step": 9435 + }, + { + "epoch": 0.24229002071828745, + "grad_norm": 0.890625, + "learning_rate": 0.00018860489629263317, + "loss": 1.0342, + "step": 9436 + }, + { + "epoch": 0.2423156979142093, + "grad_norm": 0.81640625, + "learning_rate": 0.00018860282663763333, + "loss": 1.0089, + "step": 9437 + }, + { + "epoch": 0.2423413751101311, + "grad_norm": 0.81640625, + "learning_rate": 0.00018860075680605598, + "loss": 0.9159, + "step": 9438 + }, + { + "epoch": 0.2423670523060529, + "grad_norm": 0.8828125, + "learning_rate": 0.00018859868679790536, + "loss": 1.1098, + "step": 9439 + }, + { + "epoch": 0.24239272950197474, + "grad_norm": 0.796875, + "learning_rate": 0.00018859661661318558, + "loss": 0.9295, + "step": 9440 + }, + { + "epoch": 0.24241840669789655, + "grad_norm": 0.8125, + "learning_rate": 0.00018859454625190072, + "loss": 1.0449, + "step": 9441 + }, + { + "epoch": 0.24244408389381839, + "grad_norm": 0.77734375, + "learning_rate": 0.0001885924757140549, + "loss": 1.0559, + "step": 9442 + }, + { + "epoch": 0.2424697610897402, + "grad_norm": 0.76171875, + "learning_rate": 0.00018859040499965234, + "loss": 0.9882, + "step": 9443 + }, + { + "epoch": 0.242495438285662, + "grad_norm": 0.7890625, + "learning_rate": 0.0001885883341086971, + "loss": 0.9447, + "step": 9444 + }, + { + "epoch": 0.24252111548158384, + "grad_norm": 0.78125, + "learning_rate": 0.0001885862630411933, + "loss": 1.0498, + "step": 9445 + }, + { + "epoch": 0.24254679267750565, + "grad_norm": 0.92578125, + "learning_rate": 0.00018858419179714506, + "loss": 0.9532, + "step": 9446 + }, + { + "epoch": 0.24257246987342748, + "grad_norm": 1.0234375, + "learning_rate": 0.00018858212037655653, + "loss": 0.8756, + "step": 9447 + }, + { + "epoch": 0.2425981470693493, + "grad_norm": 0.7734375, + "learning_rate": 0.00018858004877943184, + "loss": 1.0315, + "step": 9448 + }, + { + "epoch": 0.2426238242652711, + "grad_norm": 0.73828125, + "learning_rate": 0.00018857797700577513, + "loss": 1.0465, + "step": 9449 + }, + { + "epoch": 0.24264950146119293, + "grad_norm": 0.8203125, + "learning_rate": 0.00018857590505559052, + "loss": 1.1995, + "step": 9450 + }, + { + "epoch": 0.24267517865711474, + "grad_norm": 0.81640625, + "learning_rate": 0.0001885738329288821, + "loss": 1.005, + "step": 9451 + }, + { + "epoch": 0.24270085585303658, + "grad_norm": 0.73046875, + "learning_rate": 0.00018857176062565405, + "loss": 0.9126, + "step": 9452 + }, + { + "epoch": 0.2427265330489584, + "grad_norm": 0.8359375, + "learning_rate": 0.00018856968814591048, + "loss": 0.9564, + "step": 9453 + }, + { + "epoch": 0.2427522102448802, + "grad_norm": 0.8515625, + "learning_rate": 0.00018856761548965552, + "loss": 1.0906, + "step": 9454 + }, + { + "epoch": 0.24277788744080203, + "grad_norm": 0.765625, + "learning_rate": 0.0001885655426568933, + "loss": 1.0288, + "step": 9455 + }, + { + "epoch": 0.24280356463672384, + "grad_norm": 0.859375, + "learning_rate": 0.00018856346964762796, + "loss": 1.0326, + "step": 9456 + }, + { + "epoch": 0.24282924183264568, + "grad_norm": 0.86328125, + "learning_rate": 0.00018856139646186365, + "loss": 1.2075, + "step": 9457 + }, + { + "epoch": 0.24285491902856748, + "grad_norm": 1.84375, + "learning_rate": 0.00018855932309960443, + "loss": 1.067, + "step": 9458 + }, + { + "epoch": 0.2428805962244893, + "grad_norm": 0.79296875, + "learning_rate": 0.0001885572495608545, + "loss": 1.0281, + "step": 9459 + }, + { + "epoch": 0.24290627342041113, + "grad_norm": 0.796875, + "learning_rate": 0.00018855517584561796, + "loss": 1.0084, + "step": 9460 + }, + { + "epoch": 0.24293195061633294, + "grad_norm": 0.75, + "learning_rate": 0.00018855310195389896, + "loss": 0.9891, + "step": 9461 + }, + { + "epoch": 0.24295762781225477, + "grad_norm": 0.7890625, + "learning_rate": 0.00018855102788570163, + "loss": 0.9789, + "step": 9462 + }, + { + "epoch": 0.24298330500817658, + "grad_norm": 0.80859375, + "learning_rate": 0.0001885489536410301, + "loss": 1.178, + "step": 9463 + }, + { + "epoch": 0.2430089822040984, + "grad_norm": 0.77734375, + "learning_rate": 0.0001885468792198885, + "loss": 0.9286, + "step": 9464 + }, + { + "epoch": 0.24303465940002023, + "grad_norm": 0.79296875, + "learning_rate": 0.00018854480462228098, + "loss": 0.9269, + "step": 9465 + }, + { + "epoch": 0.24306033659594203, + "grad_norm": 0.7421875, + "learning_rate": 0.00018854272984821164, + "loss": 0.9741, + "step": 9466 + }, + { + "epoch": 0.24308601379186387, + "grad_norm": 0.7421875, + "learning_rate": 0.00018854065489768467, + "loss": 0.9835, + "step": 9467 + }, + { + "epoch": 0.24311169098778568, + "grad_norm": 0.7734375, + "learning_rate": 0.00018853857977070416, + "loss": 0.9782, + "step": 9468 + }, + { + "epoch": 0.2431373681837075, + "grad_norm": 0.80859375, + "learning_rate": 0.00018853650446727424, + "loss": 0.9207, + "step": 9469 + }, + { + "epoch": 0.24316304537962932, + "grad_norm": 0.78125, + "learning_rate": 0.0001885344289873991, + "loss": 1.0168, + "step": 9470 + }, + { + "epoch": 0.24318872257555113, + "grad_norm": 0.765625, + "learning_rate": 0.0001885323533310828, + "loss": 1.0944, + "step": 9471 + }, + { + "epoch": 0.24321439977147297, + "grad_norm": 0.87109375, + "learning_rate": 0.00018853027749832953, + "loss": 1.0695, + "step": 9472 + }, + { + "epoch": 0.24324007696739477, + "grad_norm": 0.890625, + "learning_rate": 0.0001885282014891434, + "loss": 1.0614, + "step": 9473 + }, + { + "epoch": 0.24326575416331658, + "grad_norm": 0.78125, + "learning_rate": 0.0001885261253035286, + "loss": 0.8604, + "step": 9474 + }, + { + "epoch": 0.24329143135923842, + "grad_norm": 0.8359375, + "learning_rate": 0.0001885240489414892, + "loss": 1.0347, + "step": 9475 + }, + { + "epoch": 0.24331710855516023, + "grad_norm": 1.109375, + "learning_rate": 0.0001885219724030294, + "loss": 0.9902, + "step": 9476 + }, + { + "epoch": 0.24334278575108206, + "grad_norm": 0.87890625, + "learning_rate": 0.00018851989568815326, + "loss": 1.0674, + "step": 9477 + }, + { + "epoch": 0.24336846294700387, + "grad_norm": 0.828125, + "learning_rate": 0.000188517818796865, + "loss": 0.9055, + "step": 9478 + }, + { + "epoch": 0.24339414014292568, + "grad_norm": 0.8203125, + "learning_rate": 0.0001885157417291687, + "loss": 0.922, + "step": 9479 + }, + { + "epoch": 0.24341981733884752, + "grad_norm": 0.7890625, + "learning_rate": 0.00018851366448506852, + "loss": 0.9334, + "step": 9480 + }, + { + "epoch": 0.24344549453476932, + "grad_norm": 0.80859375, + "learning_rate": 0.00018851158706456863, + "loss": 1.0811, + "step": 9481 + }, + { + "epoch": 0.24347117173069116, + "grad_norm": 0.82421875, + "learning_rate": 0.0001885095094676731, + "loss": 1.1144, + "step": 9482 + }, + { + "epoch": 0.24349684892661297, + "grad_norm": 0.77734375, + "learning_rate": 0.00018850743169438613, + "loss": 0.8864, + "step": 9483 + }, + { + "epoch": 0.24352252612253478, + "grad_norm": 0.74609375, + "learning_rate": 0.00018850535374471185, + "loss": 1.0016, + "step": 9484 + }, + { + "epoch": 0.2435482033184566, + "grad_norm": 0.8359375, + "learning_rate": 0.00018850327561865439, + "loss": 1.0979, + "step": 9485 + }, + { + "epoch": 0.24357388051437842, + "grad_norm": 0.796875, + "learning_rate": 0.0001885011973162179, + "loss": 1.0776, + "step": 9486 + }, + { + "epoch": 0.24359955771030026, + "grad_norm": 0.796875, + "learning_rate": 0.0001884991188374065, + "loss": 1.0036, + "step": 9487 + }, + { + "epoch": 0.24362523490622207, + "grad_norm": 0.77734375, + "learning_rate": 0.00018849704018222442, + "loss": 1.0467, + "step": 9488 + }, + { + "epoch": 0.24365091210214387, + "grad_norm": 0.8125, + "learning_rate": 0.00018849496135067565, + "loss": 0.9322, + "step": 9489 + }, + { + "epoch": 0.2436765892980657, + "grad_norm": 1.4296875, + "learning_rate": 0.00018849288234276446, + "loss": 1.1119, + "step": 9490 + }, + { + "epoch": 0.24370226649398752, + "grad_norm": 0.8359375, + "learning_rate": 0.00018849080315849493, + "loss": 1.0513, + "step": 9491 + }, + { + "epoch": 0.24372794368990935, + "grad_norm": 0.79296875, + "learning_rate": 0.00018848872379787125, + "loss": 0.9559, + "step": 9492 + }, + { + "epoch": 0.24375362088583116, + "grad_norm": 0.82421875, + "learning_rate": 0.0001884866442608975, + "loss": 1.0339, + "step": 9493 + }, + { + "epoch": 0.24377929808175297, + "grad_norm": 0.76171875, + "learning_rate": 0.00018848456454757787, + "loss": 0.9733, + "step": 9494 + }, + { + "epoch": 0.2438049752776748, + "grad_norm": 0.76171875, + "learning_rate": 0.00018848248465791652, + "loss": 0.9161, + "step": 9495 + }, + { + "epoch": 0.24383065247359662, + "grad_norm": 0.796875, + "learning_rate": 0.00018848040459191754, + "loss": 1.0153, + "step": 9496 + }, + { + "epoch": 0.24385632966951845, + "grad_norm": 0.796875, + "learning_rate": 0.00018847832434958512, + "loss": 0.9377, + "step": 9497 + }, + { + "epoch": 0.24388200686544026, + "grad_norm": 0.75390625, + "learning_rate": 0.00018847624393092337, + "loss": 1.1293, + "step": 9498 + }, + { + "epoch": 0.24390768406136207, + "grad_norm": 0.76953125, + "learning_rate": 0.00018847416333593652, + "loss": 1.0235, + "step": 9499 + }, + { + "epoch": 0.2439333612572839, + "grad_norm": 0.75390625, + "learning_rate": 0.00018847208256462858, + "loss": 1.0057, + "step": 9500 + }, + { + "epoch": 0.2439590384532057, + "grad_norm": 0.7734375, + "learning_rate": 0.0001884700016170038, + "loss": 0.9671, + "step": 9501 + }, + { + "epoch": 0.24398471564912755, + "grad_norm": 0.71484375, + "learning_rate": 0.0001884679204930663, + "loss": 0.9788, + "step": 9502 + }, + { + "epoch": 0.24401039284504936, + "grad_norm": 0.78515625, + "learning_rate": 0.00018846583919282023, + "loss": 1.0628, + "step": 9503 + }, + { + "epoch": 0.24403607004097116, + "grad_norm": 0.828125, + "learning_rate": 0.00018846375771626973, + "loss": 0.9403, + "step": 9504 + }, + { + "epoch": 0.244061747236893, + "grad_norm": 0.73828125, + "learning_rate": 0.00018846167606341896, + "loss": 0.832, + "step": 9505 + }, + { + "epoch": 0.2440874244328148, + "grad_norm": 0.9140625, + "learning_rate": 0.00018845959423427204, + "loss": 1.0879, + "step": 9506 + }, + { + "epoch": 0.24411310162873665, + "grad_norm": 0.78125, + "learning_rate": 0.00018845751222883316, + "loss": 0.9605, + "step": 9507 + }, + { + "epoch": 0.24413877882465845, + "grad_norm": 0.76171875, + "learning_rate": 0.00018845543004710646, + "loss": 0.9764, + "step": 9508 + }, + { + "epoch": 0.24416445602058026, + "grad_norm": 0.80078125, + "learning_rate": 0.00018845334768909604, + "loss": 1.0146, + "step": 9509 + }, + { + "epoch": 0.2441901332165021, + "grad_norm": 0.828125, + "learning_rate": 0.0001884512651548061, + "loss": 0.8625, + "step": 9510 + }, + { + "epoch": 0.2442158104124239, + "grad_norm": 0.85546875, + "learning_rate": 0.0001884491824442408, + "loss": 0.9349, + "step": 9511 + }, + { + "epoch": 0.24424148760834574, + "grad_norm": 0.765625, + "learning_rate": 0.00018844709955740424, + "loss": 1.0639, + "step": 9512 + }, + { + "epoch": 0.24426716480426755, + "grad_norm": 0.78515625, + "learning_rate": 0.0001884450164943006, + "loss": 1.0552, + "step": 9513 + }, + { + "epoch": 0.24429284200018936, + "grad_norm": 0.74609375, + "learning_rate": 0.00018844293325493406, + "loss": 1.034, + "step": 9514 + }, + { + "epoch": 0.2443185191961112, + "grad_norm": 0.78515625, + "learning_rate": 0.00018844084983930874, + "loss": 0.9436, + "step": 9515 + }, + { + "epoch": 0.244344196392033, + "grad_norm": 0.78515625, + "learning_rate": 0.0001884387662474288, + "loss": 0.9627, + "step": 9516 + }, + { + "epoch": 0.24436987358795484, + "grad_norm": 0.87109375, + "learning_rate": 0.00018843668247929836, + "loss": 1.0758, + "step": 9517 + }, + { + "epoch": 0.24439555078387665, + "grad_norm": 0.75390625, + "learning_rate": 0.0001884345985349216, + "loss": 0.9444, + "step": 9518 + }, + { + "epoch": 0.24442122797979846, + "grad_norm": 0.796875, + "learning_rate": 0.0001884325144143027, + "loss": 0.9182, + "step": 9519 + }, + { + "epoch": 0.2444469051757203, + "grad_norm": 0.79296875, + "learning_rate": 0.00018843043011744577, + "loss": 0.8372, + "step": 9520 + }, + { + "epoch": 0.2444725823716421, + "grad_norm": 0.80859375, + "learning_rate": 0.00018842834564435497, + "loss": 0.9739, + "step": 9521 + }, + { + "epoch": 0.24449825956756394, + "grad_norm": 0.73828125, + "learning_rate": 0.00018842626099503448, + "loss": 1.0808, + "step": 9522 + }, + { + "epoch": 0.24452393676348574, + "grad_norm": 0.87890625, + "learning_rate": 0.00018842417616948845, + "loss": 0.9563, + "step": 9523 + }, + { + "epoch": 0.24454961395940755, + "grad_norm": 0.8515625, + "learning_rate": 0.000188422091167721, + "loss": 1.057, + "step": 9524 + }, + { + "epoch": 0.2445752911553294, + "grad_norm": 0.734375, + "learning_rate": 0.0001884200059897363, + "loss": 1.0187, + "step": 9525 + }, + { + "epoch": 0.2446009683512512, + "grad_norm": 0.8359375, + "learning_rate": 0.00018841792063553853, + "loss": 0.9903, + "step": 9526 + }, + { + "epoch": 0.24462664554717303, + "grad_norm": 0.87890625, + "learning_rate": 0.00018841583510513185, + "loss": 1.0322, + "step": 9527 + }, + { + "epoch": 0.24465232274309484, + "grad_norm": 0.7421875, + "learning_rate": 0.00018841374939852038, + "loss": 0.984, + "step": 9528 + }, + { + "epoch": 0.24467799993901665, + "grad_norm": 0.80859375, + "learning_rate": 0.00018841166351570831, + "loss": 1.0204, + "step": 9529 + }, + { + "epoch": 0.24470367713493849, + "grad_norm": 0.85546875, + "learning_rate": 0.00018840957745669976, + "loss": 0.955, + "step": 9530 + }, + { + "epoch": 0.2447293543308603, + "grad_norm": 0.8828125, + "learning_rate": 0.00018840749122149892, + "loss": 0.9338, + "step": 9531 + }, + { + "epoch": 0.24475503152678213, + "grad_norm": 0.77734375, + "learning_rate": 0.00018840540481010994, + "loss": 1.0459, + "step": 9532 + }, + { + "epoch": 0.24478070872270394, + "grad_norm": 0.81640625, + "learning_rate": 0.00018840331822253696, + "loss": 0.9875, + "step": 9533 + }, + { + "epoch": 0.24480638591862575, + "grad_norm": 0.74609375, + "learning_rate": 0.00018840123145878415, + "loss": 0.8806, + "step": 9534 + }, + { + "epoch": 0.24483206311454758, + "grad_norm": 0.74609375, + "learning_rate": 0.0001883991445188557, + "loss": 0.8145, + "step": 9535 + }, + { + "epoch": 0.2448577403104694, + "grad_norm": 0.81640625, + "learning_rate": 0.0001883970574027557, + "loss": 1.0263, + "step": 9536 + }, + { + "epoch": 0.24488341750639123, + "grad_norm": 0.796875, + "learning_rate": 0.0001883949701104884, + "loss": 1.0082, + "step": 9537 + }, + { + "epoch": 0.24490909470231303, + "grad_norm": 0.796875, + "learning_rate": 0.00018839288264205787, + "loss": 0.9658, + "step": 9538 + }, + { + "epoch": 0.24493477189823484, + "grad_norm": 0.83203125, + "learning_rate": 0.00018839079499746828, + "loss": 1.0597, + "step": 9539 + }, + { + "epoch": 0.24496044909415668, + "grad_norm": 0.81640625, + "learning_rate": 0.0001883887071767239, + "loss": 1.0734, + "step": 9540 + }, + { + "epoch": 0.2449861262900785, + "grad_norm": 0.72265625, + "learning_rate": 0.00018838661917982874, + "loss": 0.9398, + "step": 9541 + }, + { + "epoch": 0.24501180348600032, + "grad_norm": 0.734375, + "learning_rate": 0.00018838453100678706, + "loss": 1.1433, + "step": 9542 + }, + { + "epoch": 0.24503748068192213, + "grad_norm": 0.82421875, + "learning_rate": 0.000188382442657603, + "loss": 1.0244, + "step": 9543 + }, + { + "epoch": 0.24506315787784394, + "grad_norm": 0.8671875, + "learning_rate": 0.0001883803541322807, + "loss": 1.0399, + "step": 9544 + }, + { + "epoch": 0.24508883507376578, + "grad_norm": 0.8125, + "learning_rate": 0.00018837826543082432, + "loss": 1.0493, + "step": 9545 + }, + { + "epoch": 0.24511451226968758, + "grad_norm": 0.8984375, + "learning_rate": 0.0001883761765532381, + "loss": 1.0917, + "step": 9546 + }, + { + "epoch": 0.24514018946560942, + "grad_norm": 0.8359375, + "learning_rate": 0.0001883740874995261, + "loss": 1.0337, + "step": 9547 + }, + { + "epoch": 0.24516586666153123, + "grad_norm": 0.78125, + "learning_rate": 0.00018837199826969255, + "loss": 1.0357, + "step": 9548 + }, + { + "epoch": 0.24519154385745304, + "grad_norm": 0.8359375, + "learning_rate": 0.00018836990886374156, + "loss": 0.9993, + "step": 9549 + }, + { + "epoch": 0.24521722105337487, + "grad_norm": 0.734375, + "learning_rate": 0.00018836781928167735, + "loss": 1.0003, + "step": 9550 + }, + { + "epoch": 0.24524289824929668, + "grad_norm": 0.8046875, + "learning_rate": 0.00018836572952350404, + "loss": 0.9826, + "step": 9551 + }, + { + "epoch": 0.24526857544521852, + "grad_norm": 0.81640625, + "learning_rate": 0.00018836363958922583, + "loss": 1.0169, + "step": 9552 + }, + { + "epoch": 0.24529425264114033, + "grad_norm": 0.828125, + "learning_rate": 0.00018836154947884684, + "loss": 1.0757, + "step": 9553 + }, + { + "epoch": 0.24531992983706213, + "grad_norm": 0.828125, + "learning_rate": 0.00018835945919237128, + "loss": 0.9474, + "step": 9554 + }, + { + "epoch": 0.24534560703298397, + "grad_norm": 0.796875, + "learning_rate": 0.0001883573687298033, + "loss": 1.0411, + "step": 9555 + }, + { + "epoch": 0.24537128422890578, + "grad_norm": 0.796875, + "learning_rate": 0.00018835527809114708, + "loss": 0.9385, + "step": 9556 + }, + { + "epoch": 0.24539696142482761, + "grad_norm": 0.796875, + "learning_rate": 0.00018835318727640678, + "loss": 0.8448, + "step": 9557 + }, + { + "epoch": 0.24542263862074942, + "grad_norm": 0.78125, + "learning_rate": 0.0001883510962855865, + "loss": 0.8638, + "step": 9558 + }, + { + "epoch": 0.24544831581667123, + "grad_norm": 0.83984375, + "learning_rate": 0.0001883490051186905, + "loss": 0.891, + "step": 9559 + }, + { + "epoch": 0.24547399301259307, + "grad_norm": 0.87109375, + "learning_rate": 0.00018834691377572296, + "loss": 0.9211, + "step": 9560 + }, + { + "epoch": 0.24549967020851488, + "grad_norm": 0.828125, + "learning_rate": 0.00018834482225668793, + "loss": 1.132, + "step": 9561 + }, + { + "epoch": 0.2455253474044367, + "grad_norm": 0.7890625, + "learning_rate": 0.0001883427305615897, + "loss": 0.9284, + "step": 9562 + }, + { + "epoch": 0.24555102460035852, + "grad_norm": 0.82421875, + "learning_rate": 0.00018834063869043236, + "loss": 1.0216, + "step": 9563 + }, + { + "epoch": 0.24557670179628033, + "grad_norm": 0.8203125, + "learning_rate": 0.0001883385466432201, + "loss": 1.0679, + "step": 9564 + }, + { + "epoch": 0.24560237899220216, + "grad_norm": 0.859375, + "learning_rate": 0.00018833645441995712, + "loss": 0.9697, + "step": 9565 + }, + { + "epoch": 0.24562805618812397, + "grad_norm": 0.8828125, + "learning_rate": 0.00018833436202064755, + "loss": 1.05, + "step": 9566 + }, + { + "epoch": 0.2456537333840458, + "grad_norm": 0.796875, + "learning_rate": 0.0001883322694452956, + "loss": 1.0062, + "step": 9567 + }, + { + "epoch": 0.24567941057996762, + "grad_norm": 0.8125, + "learning_rate": 0.0001883301766939054, + "loss": 0.8825, + "step": 9568 + }, + { + "epoch": 0.24570508777588942, + "grad_norm": 0.859375, + "learning_rate": 0.00018832808376648111, + "loss": 0.9553, + "step": 9569 + }, + { + "epoch": 0.24573076497181126, + "grad_norm": 0.78515625, + "learning_rate": 0.00018832599066302694, + "loss": 1.0198, + "step": 9570 + }, + { + "epoch": 0.24575644216773307, + "grad_norm": 0.80859375, + "learning_rate": 0.00018832389738354706, + "loss": 1.0995, + "step": 9571 + }, + { + "epoch": 0.2457821193636549, + "grad_norm": 0.80078125, + "learning_rate": 0.00018832180392804564, + "loss": 0.9737, + "step": 9572 + }, + { + "epoch": 0.2458077965595767, + "grad_norm": 0.8046875, + "learning_rate": 0.00018831971029652683, + "loss": 0.9866, + "step": 9573 + }, + { + "epoch": 0.24583347375549852, + "grad_norm": 0.77734375, + "learning_rate": 0.00018831761648899483, + "loss": 1.0258, + "step": 9574 + }, + { + "epoch": 0.24585915095142036, + "grad_norm": 0.71875, + "learning_rate": 0.00018831552250545377, + "loss": 0.8848, + "step": 9575 + }, + { + "epoch": 0.24588482814734217, + "grad_norm": 0.84375, + "learning_rate": 0.00018831342834590785, + "loss": 0.9427, + "step": 9576 + }, + { + "epoch": 0.245910505343264, + "grad_norm": 0.8671875, + "learning_rate": 0.0001883113340103613, + "loss": 1.0799, + "step": 9577 + }, + { + "epoch": 0.2459361825391858, + "grad_norm": 0.828125, + "learning_rate": 0.0001883092394988182, + "loss": 1.0394, + "step": 9578 + }, + { + "epoch": 0.24596185973510762, + "grad_norm": 0.8125, + "learning_rate": 0.00018830714481128275, + "loss": 1.1274, + "step": 9579 + }, + { + "epoch": 0.24598753693102945, + "grad_norm": 0.78515625, + "learning_rate": 0.00018830504994775914, + "loss": 1.0121, + "step": 9580 + }, + { + "epoch": 0.24601321412695126, + "grad_norm": 0.7890625, + "learning_rate": 0.00018830295490825157, + "loss": 0.9697, + "step": 9581 + }, + { + "epoch": 0.2460388913228731, + "grad_norm": 0.83984375, + "learning_rate": 0.00018830085969276416, + "loss": 1.0831, + "step": 9582 + }, + { + "epoch": 0.2460645685187949, + "grad_norm": 0.78515625, + "learning_rate": 0.00018829876430130114, + "loss": 1.0535, + "step": 9583 + }, + { + "epoch": 0.24609024571471672, + "grad_norm": 1.0, + "learning_rate": 0.0001882966687338666, + "loss": 0.9398, + "step": 9584 + }, + { + "epoch": 0.24611592291063855, + "grad_norm": 0.80078125, + "learning_rate": 0.00018829457299046485, + "loss": 0.8942, + "step": 9585 + }, + { + "epoch": 0.24614160010656036, + "grad_norm": 0.7421875, + "learning_rate": 0.00018829247707109997, + "loss": 1.1011, + "step": 9586 + }, + { + "epoch": 0.2461672773024822, + "grad_norm": 0.7734375, + "learning_rate": 0.00018829038097577613, + "loss": 1.0637, + "step": 9587 + }, + { + "epoch": 0.246192954498404, + "grad_norm": 0.84765625, + "learning_rate": 0.00018828828470449755, + "loss": 0.9941, + "step": 9588 + }, + { + "epoch": 0.2462186316943258, + "grad_norm": 0.85546875, + "learning_rate": 0.0001882861882572684, + "loss": 1.0607, + "step": 9589 + }, + { + "epoch": 0.24624430889024765, + "grad_norm": 0.8203125, + "learning_rate": 0.00018828409163409285, + "loss": 1.0413, + "step": 9590 + }, + { + "epoch": 0.24626998608616946, + "grad_norm": 0.828125, + "learning_rate": 0.00018828199483497508, + "loss": 1.1257, + "step": 9591 + }, + { + "epoch": 0.2462956632820913, + "grad_norm": 0.7734375, + "learning_rate": 0.00018827989785991927, + "loss": 0.9085, + "step": 9592 + }, + { + "epoch": 0.2463213404780131, + "grad_norm": 0.921875, + "learning_rate": 0.0001882778007089296, + "loss": 1.0211, + "step": 9593 + }, + { + "epoch": 0.2463470176739349, + "grad_norm": 0.80078125, + "learning_rate": 0.00018827570338201022, + "loss": 0.9855, + "step": 9594 + }, + { + "epoch": 0.24637269486985675, + "grad_norm": 0.8515625, + "learning_rate": 0.00018827360587916537, + "loss": 1.0326, + "step": 9595 + }, + { + "epoch": 0.24639837206577855, + "grad_norm": 0.7578125, + "learning_rate": 0.0001882715082003992, + "loss": 0.9473, + "step": 9596 + }, + { + "epoch": 0.2464240492617004, + "grad_norm": 0.75, + "learning_rate": 0.00018826941034571584, + "loss": 0.9457, + "step": 9597 + }, + { + "epoch": 0.2464497264576222, + "grad_norm": 0.7734375, + "learning_rate": 0.00018826731231511956, + "loss": 0.8876, + "step": 9598 + }, + { + "epoch": 0.246475403653544, + "grad_norm": 0.7578125, + "learning_rate": 0.00018826521410861448, + "loss": 0.9988, + "step": 9599 + }, + { + "epoch": 0.24650108084946584, + "grad_norm": 0.84375, + "learning_rate": 0.00018826311572620483, + "loss": 1.1444, + "step": 9600 + }, + { + "epoch": 0.24652675804538765, + "grad_norm": 0.88671875, + "learning_rate": 0.00018826101716789474, + "loss": 1.07, + "step": 9601 + }, + { + "epoch": 0.2465524352413095, + "grad_norm": 0.9609375, + "learning_rate": 0.00018825891843368843, + "loss": 0.954, + "step": 9602 + }, + { + "epoch": 0.2465781124372313, + "grad_norm": 0.953125, + "learning_rate": 0.00018825681952359006, + "loss": 0.9842, + "step": 9603 + }, + { + "epoch": 0.2466037896331531, + "grad_norm": 0.8359375, + "learning_rate": 0.0001882547204376038, + "loss": 1.0364, + "step": 9604 + }, + { + "epoch": 0.24662946682907494, + "grad_norm": 0.87890625, + "learning_rate": 0.00018825262117573387, + "loss": 1.0964, + "step": 9605 + }, + { + "epoch": 0.24665514402499675, + "grad_norm": 0.796875, + "learning_rate": 0.00018825052173798446, + "loss": 1.1558, + "step": 9606 + }, + { + "epoch": 0.24668082122091856, + "grad_norm": 0.7734375, + "learning_rate": 0.0001882484221243597, + "loss": 0.9461, + "step": 9607 + }, + { + "epoch": 0.2467064984168404, + "grad_norm": 0.8125, + "learning_rate": 0.00018824632233486384, + "loss": 1.0786, + "step": 9608 + }, + { + "epoch": 0.2467321756127622, + "grad_norm": 0.796875, + "learning_rate": 0.00018824422236950102, + "loss": 1.1279, + "step": 9609 + }, + { + "epoch": 0.24675785280868404, + "grad_norm": 0.8359375, + "learning_rate": 0.0001882421222282754, + "loss": 1.0431, + "step": 9610 + }, + { + "epoch": 0.24678353000460584, + "grad_norm": 0.7109375, + "learning_rate": 0.00018824002191119123, + "loss": 0.8601, + "step": 9611 + }, + { + "epoch": 0.24680920720052765, + "grad_norm": 0.78515625, + "learning_rate": 0.00018823792141825266, + "loss": 0.9481, + "step": 9612 + }, + { + "epoch": 0.2468348843964495, + "grad_norm": 0.82421875, + "learning_rate": 0.00018823582074946392, + "loss": 1.0162, + "step": 9613 + }, + { + "epoch": 0.2468605615923713, + "grad_norm": 0.796875, + "learning_rate": 0.0001882337199048291, + "loss": 1.0805, + "step": 9614 + }, + { + "epoch": 0.24688623878829313, + "grad_norm": 0.828125, + "learning_rate": 0.00018823161888435248, + "loss": 1.0955, + "step": 9615 + }, + { + "epoch": 0.24691191598421494, + "grad_norm": 0.83203125, + "learning_rate": 0.00018822951768803823, + "loss": 1.0461, + "step": 9616 + }, + { + "epoch": 0.24693759318013675, + "grad_norm": 0.8671875, + "learning_rate": 0.0001882274163158905, + "loss": 0.9451, + "step": 9617 + }, + { + "epoch": 0.24696327037605859, + "grad_norm": 0.8125, + "learning_rate": 0.00018822531476791348, + "loss": 0.9878, + "step": 9618 + }, + { + "epoch": 0.2469889475719804, + "grad_norm": 0.78515625, + "learning_rate": 0.0001882232130441114, + "loss": 0.8506, + "step": 9619 + }, + { + "epoch": 0.24701462476790223, + "grad_norm": 0.83984375, + "learning_rate": 0.00018822111114448844, + "loss": 0.9549, + "step": 9620 + }, + { + "epoch": 0.24704030196382404, + "grad_norm": 0.80859375, + "learning_rate": 0.00018821900906904873, + "loss": 0.939, + "step": 9621 + }, + { + "epoch": 0.24706597915974585, + "grad_norm": 0.8046875, + "learning_rate": 0.00018821690681779653, + "loss": 1.0325, + "step": 9622 + }, + { + "epoch": 0.24709165635566768, + "grad_norm": 0.875, + "learning_rate": 0.000188214804390736, + "loss": 1.0175, + "step": 9623 + }, + { + "epoch": 0.2471173335515895, + "grad_norm": 0.87109375, + "learning_rate": 0.00018821270178787133, + "loss": 1.1274, + "step": 9624 + }, + { + "epoch": 0.24714301074751133, + "grad_norm": 0.9375, + "learning_rate": 0.00018821059900920673, + "loss": 0.9652, + "step": 9625 + }, + { + "epoch": 0.24716868794343314, + "grad_norm": 0.76171875, + "learning_rate": 0.00018820849605474636, + "loss": 1.0885, + "step": 9626 + }, + { + "epoch": 0.24719436513935494, + "grad_norm": 0.8359375, + "learning_rate": 0.00018820639292449443, + "loss": 1.1086, + "step": 9627 + }, + { + "epoch": 0.24722004233527678, + "grad_norm": 0.8046875, + "learning_rate": 0.00018820428961845514, + "loss": 0.9653, + "step": 9628 + }, + { + "epoch": 0.2472457195311986, + "grad_norm": 0.78515625, + "learning_rate": 0.00018820218613663264, + "loss": 0.9493, + "step": 9629 + }, + { + "epoch": 0.24727139672712042, + "grad_norm": 0.8046875, + "learning_rate": 0.0001882000824790312, + "loss": 1.0332, + "step": 9630 + }, + { + "epoch": 0.24729707392304223, + "grad_norm": 0.7578125, + "learning_rate": 0.00018819797864565489, + "loss": 0.9475, + "step": 9631 + }, + { + "epoch": 0.24732275111896404, + "grad_norm": 0.7734375, + "learning_rate": 0.00018819587463650802, + "loss": 0.947, + "step": 9632 + }, + { + "epoch": 0.24734842831488588, + "grad_norm": 0.875, + "learning_rate": 0.0001881937704515947, + "loss": 0.9714, + "step": 9633 + }, + { + "epoch": 0.24737410551080768, + "grad_norm": 0.8046875, + "learning_rate": 0.00018819166609091923, + "loss": 1.0701, + "step": 9634 + }, + { + "epoch": 0.24739978270672952, + "grad_norm": 0.7421875, + "learning_rate": 0.00018818956155448568, + "loss": 0.8772, + "step": 9635 + }, + { + "epoch": 0.24742545990265133, + "grad_norm": 0.7578125, + "learning_rate": 0.00018818745684229832, + "loss": 0.9685, + "step": 9636 + }, + { + "epoch": 0.24745113709857314, + "grad_norm": 0.734375, + "learning_rate": 0.0001881853519543613, + "loss": 0.9557, + "step": 9637 + }, + { + "epoch": 0.24747681429449497, + "grad_norm": 0.8203125, + "learning_rate": 0.00018818324689067888, + "loss": 1.1557, + "step": 9638 + }, + { + "epoch": 0.24750249149041678, + "grad_norm": 0.84375, + "learning_rate": 0.0001881811416512552, + "loss": 1.122, + "step": 9639 + }, + { + "epoch": 0.24752816868633862, + "grad_norm": 0.8203125, + "learning_rate": 0.00018817903623609446, + "loss": 1.0058, + "step": 9640 + }, + { + "epoch": 0.24755384588226043, + "grad_norm": 0.796875, + "learning_rate": 0.00018817693064520087, + "loss": 1.0096, + "step": 9641 + }, + { + "epoch": 0.24757952307818223, + "grad_norm": 0.73828125, + "learning_rate": 0.00018817482487857863, + "loss": 0.8591, + "step": 9642 + }, + { + "epoch": 0.24760520027410407, + "grad_norm": 0.81640625, + "learning_rate": 0.00018817271893623193, + "loss": 0.9895, + "step": 9643 + }, + { + "epoch": 0.24763087747002588, + "grad_norm": 0.80078125, + "learning_rate": 0.00018817061281816496, + "loss": 1.0174, + "step": 9644 + }, + { + "epoch": 0.24765655466594771, + "grad_norm": 0.80859375, + "learning_rate": 0.0001881685065243819, + "loss": 1.0823, + "step": 9645 + }, + { + "epoch": 0.24768223186186952, + "grad_norm": 0.796875, + "learning_rate": 0.000188166400054887, + "loss": 0.9735, + "step": 9646 + }, + { + "epoch": 0.24770790905779133, + "grad_norm": 0.77734375, + "learning_rate": 0.00018816429340968443, + "loss": 0.9562, + "step": 9647 + }, + { + "epoch": 0.24773358625371317, + "grad_norm": 0.75, + "learning_rate": 0.00018816218658877838, + "loss": 1.054, + "step": 9648 + }, + { + "epoch": 0.24775926344963498, + "grad_norm": 0.84375, + "learning_rate": 0.00018816007959217306, + "loss": 0.9954, + "step": 9649 + }, + { + "epoch": 0.2477849406455568, + "grad_norm": 0.87109375, + "learning_rate": 0.00018815797241987265, + "loss": 1.049, + "step": 9650 + }, + { + "epoch": 0.24781061784147862, + "grad_norm": 0.76171875, + "learning_rate": 0.00018815586507188136, + "loss": 0.9167, + "step": 9651 + }, + { + "epoch": 0.24783629503740043, + "grad_norm": 0.7578125, + "learning_rate": 0.00018815375754820343, + "loss": 0.8948, + "step": 9652 + }, + { + "epoch": 0.24786197223332226, + "grad_norm": 0.7734375, + "learning_rate": 0.00018815164984884299, + "loss": 0.9098, + "step": 9653 + }, + { + "epoch": 0.24788764942924407, + "grad_norm": 0.74609375, + "learning_rate": 0.00018814954197380427, + "loss": 1.091, + "step": 9654 + }, + { + "epoch": 0.2479133266251659, + "grad_norm": 0.81640625, + "learning_rate": 0.00018814743392309152, + "loss": 1.0604, + "step": 9655 + }, + { + "epoch": 0.24793900382108772, + "grad_norm": 0.80078125, + "learning_rate": 0.00018814532569670886, + "loss": 1.0216, + "step": 9656 + }, + { + "epoch": 0.24796468101700953, + "grad_norm": 0.90625, + "learning_rate": 0.00018814321729466053, + "loss": 1.0278, + "step": 9657 + }, + { + "epoch": 0.24799035821293136, + "grad_norm": 0.96875, + "learning_rate": 0.0001881411087169507, + "loss": 0.9357, + "step": 9658 + }, + { + "epoch": 0.24801603540885317, + "grad_norm": 0.85546875, + "learning_rate": 0.0001881389999635837, + "loss": 1.0753, + "step": 9659 + }, + { + "epoch": 0.248041712604775, + "grad_norm": 0.74609375, + "learning_rate": 0.00018813689103456356, + "loss": 0.8616, + "step": 9660 + }, + { + "epoch": 0.2480673898006968, + "grad_norm": 0.78125, + "learning_rate": 0.00018813478192989454, + "loss": 0.9921, + "step": 9661 + }, + { + "epoch": 0.24809306699661862, + "grad_norm": 0.828125, + "learning_rate": 0.0001881326726495809, + "loss": 0.9819, + "step": 9662 + }, + { + "epoch": 0.24811874419254046, + "grad_norm": 0.81640625, + "learning_rate": 0.0001881305631936268, + "loss": 1.0172, + "step": 9663 + }, + { + "epoch": 0.24814442138846227, + "grad_norm": 0.80859375, + "learning_rate": 0.00018812845356203645, + "loss": 0.9634, + "step": 9664 + }, + { + "epoch": 0.2481700985843841, + "grad_norm": 0.75390625, + "learning_rate": 0.00018812634375481405, + "loss": 1.1094, + "step": 9665 + }, + { + "epoch": 0.2481957757803059, + "grad_norm": 0.80859375, + "learning_rate": 0.0001881242337719638, + "loss": 1.0173, + "step": 9666 + }, + { + "epoch": 0.24822145297622772, + "grad_norm": 0.75, + "learning_rate": 0.0001881221236134899, + "loss": 1.0328, + "step": 9667 + }, + { + "epoch": 0.24824713017214955, + "grad_norm": 0.82421875, + "learning_rate": 0.00018812001327939658, + "loss": 0.9158, + "step": 9668 + }, + { + "epoch": 0.24827280736807136, + "grad_norm": 0.80859375, + "learning_rate": 0.00018811790276968807, + "loss": 1.0145, + "step": 9669 + }, + { + "epoch": 0.2482984845639932, + "grad_norm": 0.8203125, + "learning_rate": 0.00018811579208436848, + "loss": 1.1827, + "step": 9670 + }, + { + "epoch": 0.248324161759915, + "grad_norm": 0.75, + "learning_rate": 0.0001881136812234421, + "loss": 0.9668, + "step": 9671 + }, + { + "epoch": 0.24834983895583682, + "grad_norm": 0.76953125, + "learning_rate": 0.00018811157018691312, + "loss": 0.9346, + "step": 9672 + }, + { + "epoch": 0.24837551615175865, + "grad_norm": 0.7734375, + "learning_rate": 0.00018810945897478575, + "loss": 1.036, + "step": 9673 + }, + { + "epoch": 0.24840119334768046, + "grad_norm": 0.859375, + "learning_rate": 0.00018810734758706413, + "loss": 0.931, + "step": 9674 + }, + { + "epoch": 0.2484268705436023, + "grad_norm": 0.80859375, + "learning_rate": 0.00018810523602375261, + "loss": 0.8756, + "step": 9675 + }, + { + "epoch": 0.2484525477395241, + "grad_norm": 0.765625, + "learning_rate": 0.00018810312428485526, + "loss": 1.1018, + "step": 9676 + }, + { + "epoch": 0.2484782249354459, + "grad_norm": 0.8125, + "learning_rate": 0.00018810101237037635, + "loss": 1.0338, + "step": 9677 + }, + { + "epoch": 0.24850390213136775, + "grad_norm": 0.84375, + "learning_rate": 0.0001880989002803201, + "loss": 1.0081, + "step": 9678 + }, + { + "epoch": 0.24852957932728956, + "grad_norm": 0.8828125, + "learning_rate": 0.00018809678801469067, + "loss": 1.0408, + "step": 9679 + }, + { + "epoch": 0.2485552565232114, + "grad_norm": 0.89453125, + "learning_rate": 0.0001880946755734923, + "loss": 1.1428, + "step": 9680 + }, + { + "epoch": 0.2485809337191332, + "grad_norm": 0.84375, + "learning_rate": 0.00018809256295672922, + "loss": 1.1286, + "step": 9681 + }, + { + "epoch": 0.248606610915055, + "grad_norm": 0.89453125, + "learning_rate": 0.00018809045016440559, + "loss": 1.1423, + "step": 9682 + }, + { + "epoch": 0.24863228811097685, + "grad_norm": 0.80078125, + "learning_rate": 0.00018808833719652569, + "loss": 0.9347, + "step": 9683 + }, + { + "epoch": 0.24865796530689865, + "grad_norm": 0.859375, + "learning_rate": 0.00018808622405309367, + "loss": 1.1336, + "step": 9684 + }, + { + "epoch": 0.2486836425028205, + "grad_norm": 0.8203125, + "learning_rate": 0.00018808411073411376, + "loss": 0.9477, + "step": 9685 + }, + { + "epoch": 0.2487093196987423, + "grad_norm": 0.89453125, + "learning_rate": 0.00018808199723959016, + "loss": 1.1103, + "step": 9686 + }, + { + "epoch": 0.2487349968946641, + "grad_norm": 0.79296875, + "learning_rate": 0.00018807988356952713, + "loss": 1.0947, + "step": 9687 + }, + { + "epoch": 0.24876067409058594, + "grad_norm": 0.75390625, + "learning_rate": 0.00018807776972392884, + "loss": 0.9712, + "step": 9688 + }, + { + "epoch": 0.24878635128650775, + "grad_norm": 0.7890625, + "learning_rate": 0.0001880756557027995, + "loss": 0.9829, + "step": 9689 + }, + { + "epoch": 0.2488120284824296, + "grad_norm": 0.8046875, + "learning_rate": 0.0001880735415061433, + "loss": 1.0289, + "step": 9690 + }, + { + "epoch": 0.2488377056783514, + "grad_norm": 0.828125, + "learning_rate": 0.00018807142713396453, + "loss": 1.2071, + "step": 9691 + }, + { + "epoch": 0.2488633828742732, + "grad_norm": 0.828125, + "learning_rate": 0.00018806931258626735, + "loss": 1.0226, + "step": 9692 + }, + { + "epoch": 0.24888906007019504, + "grad_norm": 0.74609375, + "learning_rate": 0.000188067197863056, + "loss": 1.0458, + "step": 9693 + }, + { + "epoch": 0.24891473726611685, + "grad_norm": 0.6875, + "learning_rate": 0.00018806508296433468, + "loss": 0.9462, + "step": 9694 + }, + { + "epoch": 0.24894041446203868, + "grad_norm": 0.7734375, + "learning_rate": 0.00018806296789010757, + "loss": 0.9249, + "step": 9695 + }, + { + "epoch": 0.2489660916579605, + "grad_norm": 0.7265625, + "learning_rate": 0.00018806085264037897, + "loss": 0.9201, + "step": 9696 + }, + { + "epoch": 0.2489917688538823, + "grad_norm": 0.83984375, + "learning_rate": 0.000188058737215153, + "loss": 1.0811, + "step": 9697 + }, + { + "epoch": 0.24901744604980414, + "grad_norm": 0.75, + "learning_rate": 0.00018805662161443395, + "loss": 1.0286, + "step": 9698 + }, + { + "epoch": 0.24904312324572594, + "grad_norm": 0.7734375, + "learning_rate": 0.000188054505838226, + "loss": 0.9939, + "step": 9699 + }, + { + "epoch": 0.24906880044164778, + "grad_norm": 0.86328125, + "learning_rate": 0.00018805238988653334, + "loss": 1.0701, + "step": 9700 + }, + { + "epoch": 0.2490944776375696, + "grad_norm": 0.828125, + "learning_rate": 0.00018805027375936025, + "loss": 0.8844, + "step": 9701 + }, + { + "epoch": 0.2491201548334914, + "grad_norm": 0.77734375, + "learning_rate": 0.00018804815745671096, + "loss": 0.9426, + "step": 9702 + }, + { + "epoch": 0.24914583202941323, + "grad_norm": 0.7578125, + "learning_rate": 0.0001880460409785896, + "loss": 0.9583, + "step": 9703 + }, + { + "epoch": 0.24917150922533504, + "grad_norm": 0.84375, + "learning_rate": 0.00018804392432500043, + "loss": 1.0701, + "step": 9704 + }, + { + "epoch": 0.24919718642125688, + "grad_norm": 0.7734375, + "learning_rate": 0.00018804180749594767, + "loss": 1.0606, + "step": 9705 + }, + { + "epoch": 0.24922286361717869, + "grad_norm": 0.82421875, + "learning_rate": 0.00018803969049143557, + "loss": 1.0024, + "step": 9706 + }, + { + "epoch": 0.2492485408131005, + "grad_norm": 0.80078125, + "learning_rate": 0.00018803757331146826, + "loss": 1.1159, + "step": 9707 + }, + { + "epoch": 0.24927421800902233, + "grad_norm": 0.75, + "learning_rate": 0.00018803545595605007, + "loss": 0.8478, + "step": 9708 + }, + { + "epoch": 0.24929989520494414, + "grad_norm": 0.8984375, + "learning_rate": 0.00018803333842518517, + "loss": 0.9984, + "step": 9709 + }, + { + "epoch": 0.24932557240086597, + "grad_norm": 0.87890625, + "learning_rate": 0.0001880312207188778, + "loss": 0.9986, + "step": 9710 + }, + { + "epoch": 0.24935124959678778, + "grad_norm": 0.828125, + "learning_rate": 0.0001880291028371321, + "loss": 1.1408, + "step": 9711 + }, + { + "epoch": 0.2493769267927096, + "grad_norm": 0.765625, + "learning_rate": 0.00018802698477995237, + "loss": 1.0782, + "step": 9712 + }, + { + "epoch": 0.24940260398863143, + "grad_norm": 0.7421875, + "learning_rate": 0.00018802486654734282, + "loss": 1.058, + "step": 9713 + }, + { + "epoch": 0.24942828118455324, + "grad_norm": 0.8359375, + "learning_rate": 0.00018802274813930765, + "loss": 0.9473, + "step": 9714 + }, + { + "epoch": 0.24945395838047507, + "grad_norm": 0.953125, + "learning_rate": 0.0001880206295558511, + "loss": 1.0361, + "step": 9715 + }, + { + "epoch": 0.24947963557639688, + "grad_norm": 0.7578125, + "learning_rate": 0.0001880185107969774, + "loss": 0.9757, + "step": 9716 + }, + { + "epoch": 0.2495053127723187, + "grad_norm": 0.73828125, + "learning_rate": 0.00018801639186269073, + "loss": 0.9407, + "step": 9717 + }, + { + "epoch": 0.24953098996824052, + "grad_norm": 0.90234375, + "learning_rate": 0.00018801427275299537, + "loss": 1.2329, + "step": 9718 + }, + { + "epoch": 0.24955666716416233, + "grad_norm": 0.8046875, + "learning_rate": 0.0001880121534678955, + "loss": 0.9391, + "step": 9719 + }, + { + "epoch": 0.24958234436008417, + "grad_norm": 0.734375, + "learning_rate": 0.00018801003400739535, + "loss": 1.0891, + "step": 9720 + }, + { + "epoch": 0.24960802155600598, + "grad_norm": 0.8515625, + "learning_rate": 0.0001880079143714992, + "loss": 0.8781, + "step": 9721 + }, + { + "epoch": 0.24963369875192779, + "grad_norm": 0.796875, + "learning_rate": 0.00018800579456021118, + "loss": 1.1202, + "step": 9722 + }, + { + "epoch": 0.24965937594784962, + "grad_norm": 0.80078125, + "learning_rate": 0.00018800367457353555, + "loss": 0.9993, + "step": 9723 + }, + { + "epoch": 0.24968505314377143, + "grad_norm": 0.76171875, + "learning_rate": 0.00018800155441147656, + "loss": 1.0889, + "step": 9724 + }, + { + "epoch": 0.24971073033969327, + "grad_norm": 0.80078125, + "learning_rate": 0.00018799943407403843, + "loss": 0.9625, + "step": 9725 + }, + { + "epoch": 0.24973640753561507, + "grad_norm": 0.82421875, + "learning_rate": 0.00018799731356122537, + "loss": 1.135, + "step": 9726 + }, + { + "epoch": 0.24976208473153688, + "grad_norm": 0.796875, + "learning_rate": 0.00018799519287304164, + "loss": 1.1588, + "step": 9727 + }, + { + "epoch": 0.24978776192745872, + "grad_norm": 0.8671875, + "learning_rate": 0.0001879930720094914, + "loss": 1.087, + "step": 9728 + }, + { + "epoch": 0.24981343912338053, + "grad_norm": 0.7890625, + "learning_rate": 0.00018799095097057892, + "loss": 1.0623, + "step": 9729 + }, + { + "epoch": 0.24983911631930236, + "grad_norm": 0.859375, + "learning_rate": 0.00018798882975630844, + "loss": 1.1, + "step": 9730 + }, + { + "epoch": 0.24986479351522417, + "grad_norm": 0.76171875, + "learning_rate": 0.00018798670836668417, + "loss": 0.9448, + "step": 9731 + }, + { + "epoch": 0.24989047071114598, + "grad_norm": 0.7890625, + "learning_rate": 0.00018798458680171034, + "loss": 1.0154, + "step": 9732 + }, + { + "epoch": 0.24991614790706781, + "grad_norm": 0.82421875, + "learning_rate": 0.00018798246506139115, + "loss": 1.0345, + "step": 9733 + }, + { + "epoch": 0.24994182510298962, + "grad_norm": 0.8359375, + "learning_rate": 0.0001879803431457309, + "loss": 0.9713, + "step": 9734 + }, + { + "epoch": 0.24996750229891146, + "grad_norm": 0.81640625, + "learning_rate": 0.00018797822105473373, + "loss": 1.031, + "step": 9735 + }, + { + "epoch": 0.24999317949483327, + "grad_norm": 0.7734375, + "learning_rate": 0.0001879760987884039, + "loss": 1.1076, + "step": 9736 + }, + { + "epoch": 0.2500188566907551, + "grad_norm": 0.8203125, + "learning_rate": 0.00018797397634674572, + "loss": 1.0293, + "step": 9737 + }, + { + "epoch": 0.2500445338866769, + "grad_norm": 0.8125, + "learning_rate": 0.0001879718537297633, + "loss": 0.9958, + "step": 9738 + }, + { + "epoch": 0.25007021108259875, + "grad_norm": 0.7734375, + "learning_rate": 0.00018796973093746094, + "loss": 0.956, + "step": 9739 + }, + { + "epoch": 0.25009588827852053, + "grad_norm": 0.75390625, + "learning_rate": 0.00018796760796984286, + "loss": 1.0231, + "step": 9740 + }, + { + "epoch": 0.25012156547444236, + "grad_norm": 0.796875, + "learning_rate": 0.00018796548482691327, + "loss": 1.2034, + "step": 9741 + }, + { + "epoch": 0.2501472426703642, + "grad_norm": 0.78125, + "learning_rate": 0.0001879633615086764, + "loss": 0.9597, + "step": 9742 + }, + { + "epoch": 0.250172919866286, + "grad_norm": 0.9296875, + "learning_rate": 0.00018796123801513652, + "loss": 0.9346, + "step": 9743 + }, + { + "epoch": 0.2501985970622078, + "grad_norm": 0.7890625, + "learning_rate": 0.00018795911434629786, + "loss": 0.8685, + "step": 9744 + }, + { + "epoch": 0.25022427425812965, + "grad_norm": 0.75390625, + "learning_rate": 0.00018795699050216458, + "loss": 0.8863, + "step": 9745 + }, + { + "epoch": 0.25024995145405143, + "grad_norm": 0.79296875, + "learning_rate": 0.000187954866482741, + "loss": 1.0397, + "step": 9746 + }, + { + "epoch": 0.25027562864997327, + "grad_norm": 0.82421875, + "learning_rate": 0.00018795274228803125, + "loss": 0.9008, + "step": 9747 + }, + { + "epoch": 0.2503013058458951, + "grad_norm": 0.87109375, + "learning_rate": 0.0001879506179180397, + "loss": 0.991, + "step": 9748 + }, + { + "epoch": 0.25032698304181694, + "grad_norm": 0.83203125, + "learning_rate": 0.0001879484933727705, + "loss": 1.2055, + "step": 9749 + }, + { + "epoch": 0.2503526602377387, + "grad_norm": 0.84765625, + "learning_rate": 0.0001879463686522279, + "loss": 0.9763, + "step": 9750 + }, + { + "epoch": 0.25037833743366056, + "grad_norm": 0.82421875, + "learning_rate": 0.0001879442437564161, + "loss": 0.9438, + "step": 9751 + }, + { + "epoch": 0.2504040146295824, + "grad_norm": 0.7890625, + "learning_rate": 0.00018794211868533939, + "loss": 1.0117, + "step": 9752 + }, + { + "epoch": 0.2504296918255042, + "grad_norm": 0.7578125, + "learning_rate": 0.000187939993439002, + "loss": 1.0377, + "step": 9753 + }, + { + "epoch": 0.250455369021426, + "grad_norm": 0.75, + "learning_rate": 0.00018793786801740811, + "loss": 0.8117, + "step": 9754 + }, + { + "epoch": 0.25048104621734785, + "grad_norm": 0.8203125, + "learning_rate": 0.000187935742420562, + "loss": 1.1357, + "step": 9755 + }, + { + "epoch": 0.2505067234132696, + "grad_norm": 0.83984375, + "learning_rate": 0.0001879336166484679, + "loss": 1.013, + "step": 9756 + }, + { + "epoch": 0.25053240060919146, + "grad_norm": 0.84375, + "learning_rate": 0.00018793149070113008, + "loss": 0.9918, + "step": 9757 + }, + { + "epoch": 0.2505580778051133, + "grad_norm": 0.78515625, + "learning_rate": 0.00018792936457855271, + "loss": 0.8833, + "step": 9758 + }, + { + "epoch": 0.25058375500103514, + "grad_norm": 0.73828125, + "learning_rate": 0.00018792723828074003, + "loss": 0.987, + "step": 9759 + }, + { + "epoch": 0.2506094321969569, + "grad_norm": 0.82421875, + "learning_rate": 0.00018792511180769635, + "loss": 1.0255, + "step": 9760 + }, + { + "epoch": 0.25063510939287875, + "grad_norm": 0.8125, + "learning_rate": 0.00018792298515942586, + "loss": 0.9939, + "step": 9761 + }, + { + "epoch": 0.2506607865888006, + "grad_norm": 0.734375, + "learning_rate": 0.00018792085833593277, + "loss": 1.0954, + "step": 9762 + }, + { + "epoch": 0.25068646378472237, + "grad_norm": 0.8515625, + "learning_rate": 0.00018791873133722139, + "loss": 1.063, + "step": 9763 + }, + { + "epoch": 0.2507121409806442, + "grad_norm": 0.82421875, + "learning_rate": 0.0001879166041632959, + "loss": 1.079, + "step": 9764 + }, + { + "epoch": 0.25073781817656604, + "grad_norm": 0.75390625, + "learning_rate": 0.00018791447681416057, + "loss": 0.9969, + "step": 9765 + }, + { + "epoch": 0.2507634953724878, + "grad_norm": 0.9296875, + "learning_rate": 0.00018791234928981966, + "loss": 1.0533, + "step": 9766 + }, + { + "epoch": 0.25078917256840966, + "grad_norm": 0.7890625, + "learning_rate": 0.0001879102215902773, + "loss": 1.1205, + "step": 9767 + }, + { + "epoch": 0.2508148497643315, + "grad_norm": 0.80078125, + "learning_rate": 0.00018790809371553788, + "loss": 0.9953, + "step": 9768 + }, + { + "epoch": 0.25084052696025333, + "grad_norm": 0.80859375, + "learning_rate": 0.00018790596566560551, + "loss": 0.9519, + "step": 9769 + }, + { + "epoch": 0.2508662041561751, + "grad_norm": 0.85546875, + "learning_rate": 0.00018790383744048453, + "loss": 1.0314, + "step": 9770 + }, + { + "epoch": 0.25089188135209695, + "grad_norm": 1.03125, + "learning_rate": 0.00018790170904017917, + "loss": 0.8834, + "step": 9771 + }, + { + "epoch": 0.2509175585480188, + "grad_norm": 0.84375, + "learning_rate": 0.00018789958046469362, + "loss": 1.1177, + "step": 9772 + }, + { + "epoch": 0.25094323574394056, + "grad_norm": 0.80078125, + "learning_rate": 0.0001878974517140321, + "loss": 1.0656, + "step": 9773 + }, + { + "epoch": 0.2509689129398624, + "grad_norm": 0.7890625, + "learning_rate": 0.00018789532278819894, + "loss": 1.0262, + "step": 9774 + }, + { + "epoch": 0.25099459013578423, + "grad_norm": 0.87890625, + "learning_rate": 0.00018789319368719834, + "loss": 1.0092, + "step": 9775 + }, + { + "epoch": 0.251020267331706, + "grad_norm": 0.78515625, + "learning_rate": 0.00018789106441103454, + "loss": 0.9744, + "step": 9776 + }, + { + "epoch": 0.25104594452762785, + "grad_norm": 0.77734375, + "learning_rate": 0.0001878889349597118, + "loss": 0.9474, + "step": 9777 + }, + { + "epoch": 0.2510716217235497, + "grad_norm": 0.83984375, + "learning_rate": 0.00018788680533323433, + "loss": 1.0383, + "step": 9778 + }, + { + "epoch": 0.2510972989194715, + "grad_norm": 0.8515625, + "learning_rate": 0.0001878846755316064, + "loss": 1.0744, + "step": 9779 + }, + { + "epoch": 0.2511229761153933, + "grad_norm": 0.8359375, + "learning_rate": 0.00018788254555483227, + "loss": 1.1897, + "step": 9780 + }, + { + "epoch": 0.25114865331131514, + "grad_norm": 0.84375, + "learning_rate": 0.00018788041540291614, + "loss": 1.1429, + "step": 9781 + }, + { + "epoch": 0.251174330507237, + "grad_norm": 0.85546875, + "learning_rate": 0.0001878782850758623, + "loss": 1.0292, + "step": 9782 + }, + { + "epoch": 0.25120000770315876, + "grad_norm": 0.78515625, + "learning_rate": 0.00018787615457367495, + "loss": 0.877, + "step": 9783 + }, + { + "epoch": 0.2512256848990806, + "grad_norm": 0.78515625, + "learning_rate": 0.00018787402389635837, + "loss": 1.0546, + "step": 9784 + }, + { + "epoch": 0.25125136209500243, + "grad_norm": 0.734375, + "learning_rate": 0.0001878718930439168, + "loss": 0.958, + "step": 9785 + }, + { + "epoch": 0.2512770392909242, + "grad_norm": 0.8125, + "learning_rate": 0.00018786976201635452, + "loss": 1.1842, + "step": 9786 + }, + { + "epoch": 0.25130271648684605, + "grad_norm": 0.8359375, + "learning_rate": 0.00018786763081367572, + "loss": 1.1295, + "step": 9787 + }, + { + "epoch": 0.2513283936827679, + "grad_norm": 0.8359375, + "learning_rate": 0.00018786549943588466, + "loss": 1.0747, + "step": 9788 + }, + { + "epoch": 0.2513540708786897, + "grad_norm": 0.8203125, + "learning_rate": 0.00018786336788298558, + "loss": 0.9296, + "step": 9789 + }, + { + "epoch": 0.2513797480746115, + "grad_norm": 0.78125, + "learning_rate": 0.00018786123615498277, + "loss": 1.0313, + "step": 9790 + }, + { + "epoch": 0.25140542527053333, + "grad_norm": 0.828125, + "learning_rate": 0.00018785910425188045, + "loss": 0.9914, + "step": 9791 + }, + { + "epoch": 0.25143110246645517, + "grad_norm": 0.76171875, + "learning_rate": 0.0001878569721736829, + "loss": 0.9671, + "step": 9792 + }, + { + "epoch": 0.25145677966237695, + "grad_norm": 2.359375, + "learning_rate": 0.00018785483992039427, + "loss": 0.9881, + "step": 9793 + }, + { + "epoch": 0.2514824568582988, + "grad_norm": 0.8515625, + "learning_rate": 0.00018785270749201893, + "loss": 0.975, + "step": 9794 + }, + { + "epoch": 0.2515081340542206, + "grad_norm": 0.8671875, + "learning_rate": 0.00018785057488856106, + "loss": 1.1925, + "step": 9795 + }, + { + "epoch": 0.2515338112501424, + "grad_norm": 0.8203125, + "learning_rate": 0.00018784844211002492, + "loss": 1.0462, + "step": 9796 + }, + { + "epoch": 0.25155948844606424, + "grad_norm": 0.79296875, + "learning_rate": 0.0001878463091564148, + "loss": 0.9663, + "step": 9797 + }, + { + "epoch": 0.2515851656419861, + "grad_norm": 0.8359375, + "learning_rate": 0.00018784417602773492, + "loss": 0.9284, + "step": 9798 + }, + { + "epoch": 0.2516108428379079, + "grad_norm": 0.75, + "learning_rate": 0.00018784204272398952, + "loss": 1.0372, + "step": 9799 + }, + { + "epoch": 0.2516365200338297, + "grad_norm": 0.80078125, + "learning_rate": 0.00018783990924518284, + "loss": 0.9461, + "step": 9800 + }, + { + "epoch": 0.2516621972297515, + "grad_norm": 0.78515625, + "learning_rate": 0.0001878377755913192, + "loss": 0.991, + "step": 9801 + }, + { + "epoch": 0.25168787442567336, + "grad_norm": 0.82421875, + "learning_rate": 0.00018783564176240276, + "loss": 0.9688, + "step": 9802 + }, + { + "epoch": 0.25171355162159514, + "grad_norm": 0.796875, + "learning_rate": 0.00018783350775843788, + "loss": 1.0847, + "step": 9803 + }, + { + "epoch": 0.251739228817517, + "grad_norm": 0.77734375, + "learning_rate": 0.00018783137357942872, + "loss": 1.1411, + "step": 9804 + }, + { + "epoch": 0.2517649060134388, + "grad_norm": 0.78125, + "learning_rate": 0.00018782923922537953, + "loss": 1.1024, + "step": 9805 + }, + { + "epoch": 0.2517905832093606, + "grad_norm": 0.77734375, + "learning_rate": 0.00018782710469629466, + "loss": 0.9315, + "step": 9806 + }, + { + "epoch": 0.25181626040528243, + "grad_norm": 0.765625, + "learning_rate": 0.00018782496999217828, + "loss": 0.9342, + "step": 9807 + }, + { + "epoch": 0.25184193760120427, + "grad_norm": 0.76171875, + "learning_rate": 0.00018782283511303467, + "loss": 0.9844, + "step": 9808 + }, + { + "epoch": 0.2518676147971261, + "grad_norm": 0.75390625, + "learning_rate": 0.00018782070005886808, + "loss": 1.0318, + "step": 9809 + }, + { + "epoch": 0.2518932919930479, + "grad_norm": 0.83984375, + "learning_rate": 0.00018781856482968275, + "loss": 0.9367, + "step": 9810 + }, + { + "epoch": 0.2519189691889697, + "grad_norm": 0.82421875, + "learning_rate": 0.00018781642942548299, + "loss": 1.0682, + "step": 9811 + }, + { + "epoch": 0.25194464638489156, + "grad_norm": 0.765625, + "learning_rate": 0.00018781429384627299, + "loss": 0.8754, + "step": 9812 + }, + { + "epoch": 0.25197032358081334, + "grad_norm": 0.73828125, + "learning_rate": 0.00018781215809205706, + "loss": 0.9933, + "step": 9813 + }, + { + "epoch": 0.2519960007767352, + "grad_norm": 0.89453125, + "learning_rate": 0.0001878100221628394, + "loss": 1.0704, + "step": 9814 + }, + { + "epoch": 0.252021677972657, + "grad_norm": 1.0390625, + "learning_rate": 0.00018780788605862433, + "loss": 0.9193, + "step": 9815 + }, + { + "epoch": 0.2520473551685788, + "grad_norm": 0.89453125, + "learning_rate": 0.00018780574977941603, + "loss": 1.0412, + "step": 9816 + }, + { + "epoch": 0.2520730323645006, + "grad_norm": 0.83203125, + "learning_rate": 0.00018780361332521882, + "loss": 0.9685, + "step": 9817 + }, + { + "epoch": 0.25209870956042246, + "grad_norm": 0.74609375, + "learning_rate": 0.00018780147669603697, + "loss": 1.0938, + "step": 9818 + }, + { + "epoch": 0.2521243867563443, + "grad_norm": 0.78125, + "learning_rate": 0.00018779933989187468, + "loss": 0.974, + "step": 9819 + }, + { + "epoch": 0.2521500639522661, + "grad_norm": 0.83984375, + "learning_rate": 0.00018779720291273625, + "loss": 0.9517, + "step": 9820 + }, + { + "epoch": 0.2521757411481879, + "grad_norm": 0.828125, + "learning_rate": 0.0001877950657586259, + "loss": 1.0417, + "step": 9821 + }, + { + "epoch": 0.25220141834410975, + "grad_norm": 0.84765625, + "learning_rate": 0.00018779292842954797, + "loss": 1.0586, + "step": 9822 + }, + { + "epoch": 0.25222709554003153, + "grad_norm": 0.87890625, + "learning_rate": 0.0001877907909255066, + "loss": 1.1472, + "step": 9823 + }, + { + "epoch": 0.25225277273595337, + "grad_norm": 0.7734375, + "learning_rate": 0.00018778865324650616, + "loss": 0.8691, + "step": 9824 + }, + { + "epoch": 0.2522784499318752, + "grad_norm": 0.82421875, + "learning_rate": 0.00018778651539255083, + "loss": 1.0702, + "step": 9825 + }, + { + "epoch": 0.252304127127797, + "grad_norm": 0.81640625, + "learning_rate": 0.00018778437736364492, + "loss": 1.0675, + "step": 9826 + }, + { + "epoch": 0.2523298043237188, + "grad_norm": 0.83984375, + "learning_rate": 0.00018778223915979266, + "loss": 1.1334, + "step": 9827 + }, + { + "epoch": 0.25235548151964066, + "grad_norm": 0.78125, + "learning_rate": 0.00018778010078099837, + "loss": 1.1458, + "step": 9828 + }, + { + "epoch": 0.2523811587155625, + "grad_norm": 0.80859375, + "learning_rate": 0.00018777796222726624, + "loss": 1.0475, + "step": 9829 + }, + { + "epoch": 0.2524068359114843, + "grad_norm": 0.69140625, + "learning_rate": 0.00018777582349860055, + "loss": 0.8863, + "step": 9830 + }, + { + "epoch": 0.2524325131074061, + "grad_norm": 0.76953125, + "learning_rate": 0.0001877736845950056, + "loss": 0.8817, + "step": 9831 + }, + { + "epoch": 0.25245819030332795, + "grad_norm": 0.890625, + "learning_rate": 0.0001877715455164856, + "loss": 1.0075, + "step": 9832 + }, + { + "epoch": 0.2524838674992497, + "grad_norm": 0.79296875, + "learning_rate": 0.00018776940626304484, + "loss": 0.9234, + "step": 9833 + }, + { + "epoch": 0.25250954469517156, + "grad_norm": 0.90234375, + "learning_rate": 0.00018776726683468757, + "loss": 0.8776, + "step": 9834 + }, + { + "epoch": 0.2525352218910934, + "grad_norm": 0.7890625, + "learning_rate": 0.0001877651272314181, + "loss": 1.0011, + "step": 9835 + }, + { + "epoch": 0.2525608990870152, + "grad_norm": 0.7421875, + "learning_rate": 0.00018776298745324063, + "loss": 1.0996, + "step": 9836 + }, + { + "epoch": 0.252586576282937, + "grad_norm": 0.76171875, + "learning_rate": 0.00018776084750015947, + "loss": 0.9409, + "step": 9837 + }, + { + "epoch": 0.25261225347885885, + "grad_norm": 0.734375, + "learning_rate": 0.0001877587073721789, + "loss": 0.98, + "step": 9838 + }, + { + "epoch": 0.2526379306747807, + "grad_norm": 0.87890625, + "learning_rate": 0.00018775656706930308, + "loss": 1.0237, + "step": 9839 + }, + { + "epoch": 0.25266360787070247, + "grad_norm": 0.80078125, + "learning_rate": 0.0001877544265915364, + "loss": 1.0552, + "step": 9840 + }, + { + "epoch": 0.2526892850666243, + "grad_norm": 0.73828125, + "learning_rate": 0.00018775228593888305, + "loss": 0.915, + "step": 9841 + }, + { + "epoch": 0.25271496226254614, + "grad_norm": 0.73828125, + "learning_rate": 0.00018775014511134735, + "loss": 0.9096, + "step": 9842 + }, + { + "epoch": 0.2527406394584679, + "grad_norm": 0.84375, + "learning_rate": 0.0001877480041089335, + "loss": 1.0984, + "step": 9843 + }, + { + "epoch": 0.25276631665438976, + "grad_norm": 0.85546875, + "learning_rate": 0.00018774586293164586, + "loss": 0.9835, + "step": 9844 + }, + { + "epoch": 0.2527919938503116, + "grad_norm": 0.828125, + "learning_rate": 0.00018774372157948855, + "loss": 1.1745, + "step": 9845 + }, + { + "epoch": 0.25281767104623337, + "grad_norm": 0.89453125, + "learning_rate": 0.000187741580052466, + "loss": 1.0247, + "step": 9846 + }, + { + "epoch": 0.2528433482421552, + "grad_norm": 0.8671875, + "learning_rate": 0.0001877394383505824, + "loss": 0.9888, + "step": 9847 + }, + { + "epoch": 0.25286902543807704, + "grad_norm": 0.7265625, + "learning_rate": 0.00018773729647384203, + "loss": 0.9757, + "step": 9848 + }, + { + "epoch": 0.2528947026339989, + "grad_norm": 0.81640625, + "learning_rate": 0.00018773515442224912, + "loss": 0.9769, + "step": 9849 + }, + { + "epoch": 0.25292037982992066, + "grad_norm": 0.71484375, + "learning_rate": 0.000187733012195808, + "loss": 0.9589, + "step": 9850 + }, + { + "epoch": 0.2529460570258425, + "grad_norm": 0.77734375, + "learning_rate": 0.0001877308697945229, + "loss": 1.005, + "step": 9851 + }, + { + "epoch": 0.25297173422176433, + "grad_norm": 0.74609375, + "learning_rate": 0.0001877287272183981, + "loss": 0.9539, + "step": 9852 + }, + { + "epoch": 0.2529974114176861, + "grad_norm": 0.73828125, + "learning_rate": 0.00018772658446743787, + "loss": 0.8123, + "step": 9853 + }, + { + "epoch": 0.25302308861360795, + "grad_norm": 0.8203125, + "learning_rate": 0.00018772444154164651, + "loss": 0.935, + "step": 9854 + }, + { + "epoch": 0.2530487658095298, + "grad_norm": 0.8984375, + "learning_rate": 0.0001877222984410282, + "loss": 1.1523, + "step": 9855 + }, + { + "epoch": 0.25307444300545157, + "grad_norm": 0.78125, + "learning_rate": 0.00018772015516558735, + "loss": 0.9523, + "step": 9856 + }, + { + "epoch": 0.2531001202013734, + "grad_norm": 0.8125, + "learning_rate": 0.0001877180117153281, + "loss": 1.0391, + "step": 9857 + }, + { + "epoch": 0.25312579739729524, + "grad_norm": 0.77734375, + "learning_rate": 0.0001877158680902548, + "loss": 0.9957, + "step": 9858 + }, + { + "epoch": 0.2531514745932171, + "grad_norm": 0.77734375, + "learning_rate": 0.00018771372429037165, + "loss": 1.0054, + "step": 9859 + }, + { + "epoch": 0.25317715178913885, + "grad_norm": 0.859375, + "learning_rate": 0.00018771158031568303, + "loss": 0.9878, + "step": 9860 + }, + { + "epoch": 0.2532028289850607, + "grad_norm": 0.72265625, + "learning_rate": 0.00018770943616619313, + "loss": 0.8368, + "step": 9861 + }, + { + "epoch": 0.2532285061809825, + "grad_norm": 0.734375, + "learning_rate": 0.00018770729184190626, + "loss": 0.9643, + "step": 9862 + }, + { + "epoch": 0.2532541833769043, + "grad_norm": 0.80078125, + "learning_rate": 0.00018770514734282667, + "loss": 1.0328, + "step": 9863 + }, + { + "epoch": 0.25327986057282614, + "grad_norm": 0.96875, + "learning_rate": 0.00018770300266895864, + "loss": 0.9043, + "step": 9864 + }, + { + "epoch": 0.253305537768748, + "grad_norm": 0.83984375, + "learning_rate": 0.00018770085782030647, + "loss": 0.9353, + "step": 9865 + }, + { + "epoch": 0.25333121496466976, + "grad_norm": 0.78515625, + "learning_rate": 0.0001876987127968744, + "loss": 0.9206, + "step": 9866 + }, + { + "epoch": 0.2533568921605916, + "grad_norm": 0.79296875, + "learning_rate": 0.0001876965675986667, + "loss": 0.9717, + "step": 9867 + }, + { + "epoch": 0.25338256935651343, + "grad_norm": 0.765625, + "learning_rate": 0.00018769442222568765, + "loss": 0.9254, + "step": 9868 + }, + { + "epoch": 0.25340824655243527, + "grad_norm": 0.8359375, + "learning_rate": 0.00018769227667794156, + "loss": 0.9864, + "step": 9869 + }, + { + "epoch": 0.25343392374835705, + "grad_norm": 0.83984375, + "learning_rate": 0.00018769013095543268, + "loss": 1.0602, + "step": 9870 + }, + { + "epoch": 0.2534596009442789, + "grad_norm": 0.78515625, + "learning_rate": 0.00018768798505816532, + "loss": 1.0241, + "step": 9871 + }, + { + "epoch": 0.2534852781402007, + "grad_norm": 0.83984375, + "learning_rate": 0.0001876858389861437, + "loss": 0.9009, + "step": 9872 + }, + { + "epoch": 0.2535109553361225, + "grad_norm": 0.9609375, + "learning_rate": 0.00018768369273937212, + "loss": 0.9365, + "step": 9873 + }, + { + "epoch": 0.25353663253204434, + "grad_norm": 0.79296875, + "learning_rate": 0.00018768154631785487, + "loss": 1.0717, + "step": 9874 + }, + { + "epoch": 0.2535623097279662, + "grad_norm": 0.76171875, + "learning_rate": 0.0001876793997215962, + "loss": 1.1647, + "step": 9875 + }, + { + "epoch": 0.25358798692388795, + "grad_norm": 0.78515625, + "learning_rate": 0.00018767725295060043, + "loss": 1.0488, + "step": 9876 + }, + { + "epoch": 0.2536136641198098, + "grad_norm": 0.82421875, + "learning_rate": 0.0001876751060048718, + "loss": 1.0268, + "step": 9877 + }, + { + "epoch": 0.2536393413157316, + "grad_norm": 0.8125, + "learning_rate": 0.0001876729588844146, + "loss": 1.0173, + "step": 9878 + }, + { + "epoch": 0.25366501851165346, + "grad_norm": 0.7421875, + "learning_rate": 0.00018767081158923315, + "loss": 0.969, + "step": 9879 + }, + { + "epoch": 0.25369069570757524, + "grad_norm": 0.77734375, + "learning_rate": 0.00018766866411933165, + "loss": 0.8093, + "step": 9880 + }, + { + "epoch": 0.2537163729034971, + "grad_norm": 0.8203125, + "learning_rate": 0.00018766651647471444, + "loss": 1.1007, + "step": 9881 + }, + { + "epoch": 0.2537420500994189, + "grad_norm": 0.82421875, + "learning_rate": 0.0001876643686553858, + "loss": 0.9886, + "step": 9882 + }, + { + "epoch": 0.2537677272953407, + "grad_norm": 0.796875, + "learning_rate": 0.00018766222066134996, + "loss": 0.915, + "step": 9883 + }, + { + "epoch": 0.25379340449126253, + "grad_norm": 0.74609375, + "learning_rate": 0.00018766007249261123, + "loss": 0.8681, + "step": 9884 + }, + { + "epoch": 0.25381908168718437, + "grad_norm": 0.79296875, + "learning_rate": 0.0001876579241491739, + "loss": 0.9416, + "step": 9885 + }, + { + "epoch": 0.25384475888310615, + "grad_norm": 0.77734375, + "learning_rate": 0.00018765577563104225, + "loss": 0.9391, + "step": 9886 + }, + { + "epoch": 0.253870436079028, + "grad_norm": 0.78515625, + "learning_rate": 0.00018765362693822058, + "loss": 1.0944, + "step": 9887 + }, + { + "epoch": 0.2538961132749498, + "grad_norm": 0.80859375, + "learning_rate": 0.0001876514780707131, + "loss": 1.0772, + "step": 9888 + }, + { + "epoch": 0.25392179047087166, + "grad_norm": 0.75, + "learning_rate": 0.00018764932902852417, + "loss": 0.9547, + "step": 9889 + }, + { + "epoch": 0.25394746766679344, + "grad_norm": 0.81640625, + "learning_rate": 0.00018764717981165804, + "loss": 1.0782, + "step": 9890 + }, + { + "epoch": 0.25397314486271527, + "grad_norm": 0.94140625, + "learning_rate": 0.00018764503042011902, + "loss": 1.1597, + "step": 9891 + }, + { + "epoch": 0.2539988220586371, + "grad_norm": 0.7890625, + "learning_rate": 0.00018764288085391134, + "loss": 0.8958, + "step": 9892 + }, + { + "epoch": 0.2540244992545589, + "grad_norm": 0.82421875, + "learning_rate": 0.00018764073111303932, + "loss": 1.0548, + "step": 9893 + }, + { + "epoch": 0.2540501764504807, + "grad_norm": 0.78125, + "learning_rate": 0.00018763858119750726, + "loss": 0.9301, + "step": 9894 + }, + { + "epoch": 0.25407585364640256, + "grad_norm": 1.578125, + "learning_rate": 0.00018763643110731942, + "loss": 0.9808, + "step": 9895 + }, + { + "epoch": 0.25410153084232434, + "grad_norm": 0.828125, + "learning_rate": 0.00018763428084248008, + "loss": 0.8621, + "step": 9896 + }, + { + "epoch": 0.2541272080382462, + "grad_norm": 0.78515625, + "learning_rate": 0.0001876321304029935, + "loss": 1.0712, + "step": 9897 + }, + { + "epoch": 0.254152885234168, + "grad_norm": 0.7890625, + "learning_rate": 0.00018762997978886406, + "loss": 0.8851, + "step": 9898 + }, + { + "epoch": 0.25417856243008985, + "grad_norm": 0.83203125, + "learning_rate": 0.00018762782900009594, + "loss": 1.006, + "step": 9899 + }, + { + "epoch": 0.25420423962601163, + "grad_norm": 0.80859375, + "learning_rate": 0.0001876256780366935, + "loss": 1.2321, + "step": 9900 + }, + { + "epoch": 0.25422991682193347, + "grad_norm": 0.87109375, + "learning_rate": 0.00018762352689866098, + "loss": 0.9579, + "step": 9901 + }, + { + "epoch": 0.2542555940178553, + "grad_norm": 0.95703125, + "learning_rate": 0.0001876213755860027, + "loss": 1.0082, + "step": 9902 + }, + { + "epoch": 0.2542812712137771, + "grad_norm": 0.8203125, + "learning_rate": 0.00018761922409872296, + "loss": 0.9515, + "step": 9903 + }, + { + "epoch": 0.2543069484096989, + "grad_norm": 0.81640625, + "learning_rate": 0.000187617072436826, + "loss": 1.1683, + "step": 9904 + }, + { + "epoch": 0.25433262560562075, + "grad_norm": 0.828125, + "learning_rate": 0.0001876149206003161, + "loss": 0.9052, + "step": 9905 + }, + { + "epoch": 0.25435830280154254, + "grad_norm": 0.8203125, + "learning_rate": 0.00018761276858919758, + "loss": 0.9515, + "step": 9906 + }, + { + "epoch": 0.25438397999746437, + "grad_norm": 0.84375, + "learning_rate": 0.00018761061640347476, + "loss": 0.9692, + "step": 9907 + }, + { + "epoch": 0.2544096571933862, + "grad_norm": 0.85546875, + "learning_rate": 0.00018760846404315186, + "loss": 1.1316, + "step": 9908 + }, + { + "epoch": 0.25443533438930804, + "grad_norm": 0.79296875, + "learning_rate": 0.00018760631150823324, + "loss": 1.1624, + "step": 9909 + }, + { + "epoch": 0.2544610115852298, + "grad_norm": 0.75390625, + "learning_rate": 0.00018760415879872313, + "loss": 0.9006, + "step": 9910 + }, + { + "epoch": 0.25448668878115166, + "grad_norm": 0.7890625, + "learning_rate": 0.00018760200591462587, + "loss": 0.9682, + "step": 9911 + }, + { + "epoch": 0.2545123659770735, + "grad_norm": 0.8203125, + "learning_rate": 0.0001875998528559457, + "loss": 0.9904, + "step": 9912 + }, + { + "epoch": 0.2545380431729953, + "grad_norm": 0.84765625, + "learning_rate": 0.00018759769962268695, + "loss": 0.8222, + "step": 9913 + }, + { + "epoch": 0.2545637203689171, + "grad_norm": 0.86328125, + "learning_rate": 0.0001875955462148539, + "loss": 1.1323, + "step": 9914 + }, + { + "epoch": 0.25458939756483895, + "grad_norm": 0.78125, + "learning_rate": 0.0001875933926324508, + "loss": 1.0742, + "step": 9915 + }, + { + "epoch": 0.25461507476076073, + "grad_norm": 0.7734375, + "learning_rate": 0.00018759123887548203, + "loss": 0.901, + "step": 9916 + }, + { + "epoch": 0.25464075195668257, + "grad_norm": 0.79296875, + "learning_rate": 0.0001875890849439518, + "loss": 1.1151, + "step": 9917 + }, + { + "epoch": 0.2546664291526044, + "grad_norm": 0.84375, + "learning_rate": 0.00018758693083786447, + "loss": 1.0698, + "step": 9918 + }, + { + "epoch": 0.2546921063485262, + "grad_norm": 0.8125, + "learning_rate": 0.00018758477655722426, + "loss": 0.9888, + "step": 9919 + }, + { + "epoch": 0.254717783544448, + "grad_norm": 0.703125, + "learning_rate": 0.00018758262210203552, + "loss": 0.9672, + "step": 9920 + }, + { + "epoch": 0.25474346074036985, + "grad_norm": 0.73828125, + "learning_rate": 0.0001875804674723025, + "loss": 0.872, + "step": 9921 + }, + { + "epoch": 0.2547691379362917, + "grad_norm": 0.859375, + "learning_rate": 0.00018757831266802958, + "loss": 1.0375, + "step": 9922 + }, + { + "epoch": 0.25479481513221347, + "grad_norm": 0.77734375, + "learning_rate": 0.00018757615768922096, + "loss": 0.9272, + "step": 9923 + }, + { + "epoch": 0.2548204923281353, + "grad_norm": 0.76171875, + "learning_rate": 0.00018757400253588096, + "loss": 0.9663, + "step": 9924 + }, + { + "epoch": 0.25484616952405714, + "grad_norm": 0.7890625, + "learning_rate": 0.0001875718472080139, + "loss": 0.9355, + "step": 9925 + }, + { + "epoch": 0.2548718467199789, + "grad_norm": 0.81640625, + "learning_rate": 0.00018756969170562405, + "loss": 0.9407, + "step": 9926 + }, + { + "epoch": 0.25489752391590076, + "grad_norm": 0.84765625, + "learning_rate": 0.00018756753602871572, + "loss": 1.0173, + "step": 9927 + }, + { + "epoch": 0.2549232011118226, + "grad_norm": 0.8203125, + "learning_rate": 0.00018756538017729318, + "loss": 0.9812, + "step": 9928 + }, + { + "epoch": 0.2549488783077444, + "grad_norm": 0.7734375, + "learning_rate": 0.00018756322415136077, + "loss": 1.0395, + "step": 9929 + }, + { + "epoch": 0.2549745555036662, + "grad_norm": 0.8125, + "learning_rate": 0.00018756106795092276, + "loss": 1.0441, + "step": 9930 + }, + { + "epoch": 0.25500023269958805, + "grad_norm": 0.78125, + "learning_rate": 0.00018755891157598345, + "loss": 0.9681, + "step": 9931 + }, + { + "epoch": 0.2550259098955099, + "grad_norm": 0.84765625, + "learning_rate": 0.00018755675502654713, + "loss": 0.9811, + "step": 9932 + }, + { + "epoch": 0.25505158709143166, + "grad_norm": 0.84765625, + "learning_rate": 0.0001875545983026181, + "loss": 1.0872, + "step": 9933 + }, + { + "epoch": 0.2550772642873535, + "grad_norm": 0.76171875, + "learning_rate": 0.00018755244140420066, + "loss": 0.9785, + "step": 9934 + }, + { + "epoch": 0.25510294148327534, + "grad_norm": 0.80859375, + "learning_rate": 0.00018755028433129915, + "loss": 0.9667, + "step": 9935 + }, + { + "epoch": 0.2551286186791971, + "grad_norm": 0.796875, + "learning_rate": 0.0001875481270839178, + "loss": 0.9659, + "step": 9936 + }, + { + "epoch": 0.25515429587511895, + "grad_norm": 0.80078125, + "learning_rate": 0.00018754596966206094, + "loss": 1.0311, + "step": 9937 + }, + { + "epoch": 0.2551799730710408, + "grad_norm": 0.796875, + "learning_rate": 0.0001875438120657329, + "loss": 0.8992, + "step": 9938 + }, + { + "epoch": 0.25520565026696257, + "grad_norm": 0.89453125, + "learning_rate": 0.00018754165429493792, + "loss": 0.9216, + "step": 9939 + }, + { + "epoch": 0.2552313274628844, + "grad_norm": 0.82421875, + "learning_rate": 0.00018753949634968035, + "loss": 1.0069, + "step": 9940 + }, + { + "epoch": 0.25525700465880624, + "grad_norm": 0.80859375, + "learning_rate": 0.00018753733822996445, + "loss": 1.0395, + "step": 9941 + }, + { + "epoch": 0.2552826818547281, + "grad_norm": 0.765625, + "learning_rate": 0.00018753517993579458, + "loss": 1.0566, + "step": 9942 + }, + { + "epoch": 0.25530835905064986, + "grad_norm": 0.8828125, + "learning_rate": 0.00018753302146717497, + "loss": 0.9274, + "step": 9943 + }, + { + "epoch": 0.2553340362465717, + "grad_norm": 0.828125, + "learning_rate": 0.00018753086282410997, + "loss": 1.0123, + "step": 9944 + }, + { + "epoch": 0.25535971344249353, + "grad_norm": 0.70703125, + "learning_rate": 0.00018752870400660387, + "loss": 1.0241, + "step": 9945 + }, + { + "epoch": 0.2553853906384153, + "grad_norm": 0.75390625, + "learning_rate": 0.00018752654501466096, + "loss": 0.8293, + "step": 9946 + }, + { + "epoch": 0.25541106783433715, + "grad_norm": 0.72265625, + "learning_rate": 0.00018752438584828553, + "loss": 0.9529, + "step": 9947 + }, + { + "epoch": 0.255436745030259, + "grad_norm": 0.890625, + "learning_rate": 0.00018752222650748196, + "loss": 1.053, + "step": 9948 + }, + { + "epoch": 0.25546242222618076, + "grad_norm": 1.1796875, + "learning_rate": 0.00018752006699225446, + "loss": 0.9674, + "step": 9949 + }, + { + "epoch": 0.2554880994221026, + "grad_norm": 0.83203125, + "learning_rate": 0.00018751790730260737, + "loss": 0.9297, + "step": 9950 + }, + { + "epoch": 0.25551377661802444, + "grad_norm": 0.82421875, + "learning_rate": 0.000187515747438545, + "loss": 1.0508, + "step": 9951 + }, + { + "epoch": 0.25553945381394627, + "grad_norm": 0.828125, + "learning_rate": 0.00018751358740007166, + "loss": 0.9927, + "step": 9952 + }, + { + "epoch": 0.25556513100986805, + "grad_norm": 0.80078125, + "learning_rate": 0.00018751142718719164, + "loss": 1.0065, + "step": 9953 + }, + { + "epoch": 0.2555908082057899, + "grad_norm": 0.7890625, + "learning_rate": 0.00018750926679990924, + "loss": 0.7768, + "step": 9954 + }, + { + "epoch": 0.2556164854017117, + "grad_norm": 0.75, + "learning_rate": 0.0001875071062382288, + "loss": 0.8534, + "step": 9955 + }, + { + "epoch": 0.2556421625976335, + "grad_norm": 0.89453125, + "learning_rate": 0.00018750494550215458, + "loss": 1.0102, + "step": 9956 + }, + { + "epoch": 0.25566783979355534, + "grad_norm": 0.71875, + "learning_rate": 0.00018750278459169091, + "loss": 1.0866, + "step": 9957 + }, + { + "epoch": 0.2556935169894772, + "grad_norm": 0.828125, + "learning_rate": 0.0001875006235068421, + "loss": 0.9537, + "step": 9958 + }, + { + "epoch": 0.25571919418539896, + "grad_norm": 0.8046875, + "learning_rate": 0.00018749846224761245, + "loss": 1.1054, + "step": 9959 + }, + { + "epoch": 0.2557448713813208, + "grad_norm": 0.78125, + "learning_rate": 0.00018749630081400627, + "loss": 0.924, + "step": 9960 + }, + { + "epoch": 0.25577054857724263, + "grad_norm": 0.81640625, + "learning_rate": 0.00018749413920602788, + "loss": 1.0395, + "step": 9961 + }, + { + "epoch": 0.25579622577316447, + "grad_norm": 0.83984375, + "learning_rate": 0.00018749197742368154, + "loss": 1.155, + "step": 9962 + }, + { + "epoch": 0.25582190296908625, + "grad_norm": 0.77734375, + "learning_rate": 0.0001874898154669716, + "loss": 1.0329, + "step": 9963 + }, + { + "epoch": 0.2558475801650081, + "grad_norm": 0.7734375, + "learning_rate": 0.0001874876533359024, + "loss": 0.9537, + "step": 9964 + }, + { + "epoch": 0.2558732573609299, + "grad_norm": 0.76953125, + "learning_rate": 0.0001874854910304782, + "loss": 1.0032, + "step": 9965 + }, + { + "epoch": 0.2558989345568517, + "grad_norm": 0.8515625, + "learning_rate": 0.0001874833285507033, + "loss": 1.003, + "step": 9966 + }, + { + "epoch": 0.25592461175277353, + "grad_norm": 0.9296875, + "learning_rate": 0.000187481165896582, + "loss": 0.9786, + "step": 9967 + }, + { + "epoch": 0.25595028894869537, + "grad_norm": 0.82421875, + "learning_rate": 0.0001874790030681187, + "loss": 0.8942, + "step": 9968 + }, + { + "epoch": 0.25597596614461715, + "grad_norm": 0.74609375, + "learning_rate": 0.0001874768400653176, + "loss": 1.0066, + "step": 9969 + }, + { + "epoch": 0.256001643340539, + "grad_norm": 0.7890625, + "learning_rate": 0.00018747467688818307, + "loss": 1.0365, + "step": 9970 + }, + { + "epoch": 0.2560273205364608, + "grad_norm": 0.7890625, + "learning_rate": 0.00018747251353671944, + "loss": 0.956, + "step": 9971 + }, + { + "epoch": 0.25605299773238266, + "grad_norm": 0.76953125, + "learning_rate": 0.00018747035001093096, + "loss": 1.0066, + "step": 9972 + }, + { + "epoch": 0.25607867492830444, + "grad_norm": 0.8125, + "learning_rate": 0.00018746818631082198, + "loss": 1.0356, + "step": 9973 + }, + { + "epoch": 0.2561043521242263, + "grad_norm": 0.78125, + "learning_rate": 0.00018746602243639683, + "loss": 0.927, + "step": 9974 + }, + { + "epoch": 0.2561300293201481, + "grad_norm": 0.76171875, + "learning_rate": 0.00018746385838765977, + "loss": 1.047, + "step": 9975 + }, + { + "epoch": 0.2561557065160699, + "grad_norm": 0.79296875, + "learning_rate": 0.00018746169416461515, + "loss": 1.116, + "step": 9976 + }, + { + "epoch": 0.25618138371199173, + "grad_norm": 0.81640625, + "learning_rate": 0.00018745952976726725, + "loss": 1.0588, + "step": 9977 + }, + { + "epoch": 0.25620706090791356, + "grad_norm": 0.7578125, + "learning_rate": 0.00018745736519562044, + "loss": 0.8431, + "step": 9978 + }, + { + "epoch": 0.25623273810383534, + "grad_norm": 0.76171875, + "learning_rate": 0.000187455200449679, + "loss": 1.0406, + "step": 9979 + }, + { + "epoch": 0.2562584152997572, + "grad_norm": 0.8125, + "learning_rate": 0.00018745303552944722, + "loss": 0.9517, + "step": 9980 + }, + { + "epoch": 0.256284092495679, + "grad_norm": 0.80859375, + "learning_rate": 0.00018745087043492946, + "loss": 0.9605, + "step": 9981 + }, + { + "epoch": 0.25630976969160085, + "grad_norm": 0.8203125, + "learning_rate": 0.00018744870516613, + "loss": 1.0182, + "step": 9982 + }, + { + "epoch": 0.25633544688752263, + "grad_norm": 0.81640625, + "learning_rate": 0.0001874465397230532, + "loss": 1.0156, + "step": 9983 + }, + { + "epoch": 0.25636112408344447, + "grad_norm": 0.7890625, + "learning_rate": 0.0001874443741057033, + "loss": 1.0109, + "step": 9984 + }, + { + "epoch": 0.2563868012793663, + "grad_norm": 0.8203125, + "learning_rate": 0.0001874422083140847, + "loss": 1.0115, + "step": 9985 + }, + { + "epoch": 0.2564124784752881, + "grad_norm": 0.796875, + "learning_rate": 0.00018744004234820167, + "loss": 0.987, + "step": 9986 + }, + { + "epoch": 0.2564381556712099, + "grad_norm": 0.7578125, + "learning_rate": 0.00018743787620805852, + "loss": 1.002, + "step": 9987 + }, + { + "epoch": 0.25646383286713176, + "grad_norm": 0.765625, + "learning_rate": 0.0001874357098936596, + "loss": 0.9971, + "step": 9988 + }, + { + "epoch": 0.25648951006305354, + "grad_norm": 0.76171875, + "learning_rate": 0.00018743354340500917, + "loss": 0.8936, + "step": 9989 + }, + { + "epoch": 0.2565151872589754, + "grad_norm": 0.75, + "learning_rate": 0.00018743137674211163, + "loss": 1.0586, + "step": 9990 + }, + { + "epoch": 0.2565408644548972, + "grad_norm": 0.7734375, + "learning_rate": 0.00018742920990497125, + "loss": 1.059, + "step": 9991 + }, + { + "epoch": 0.25656654165081905, + "grad_norm": 0.79296875, + "learning_rate": 0.00018742704289359234, + "loss": 0.9939, + "step": 9992 + }, + { + "epoch": 0.2565922188467408, + "grad_norm": 1.0625, + "learning_rate": 0.00018742487570797922, + "loss": 0.9403, + "step": 9993 + }, + { + "epoch": 0.25661789604266266, + "grad_norm": 0.7734375, + "learning_rate": 0.00018742270834813622, + "loss": 0.9578, + "step": 9994 + }, + { + "epoch": 0.2566435732385845, + "grad_norm": 0.796875, + "learning_rate": 0.00018742054081406767, + "loss": 1.1064, + "step": 9995 + }, + { + "epoch": 0.2566692504345063, + "grad_norm": 0.78515625, + "learning_rate": 0.00018741837310577787, + "loss": 1.0137, + "step": 9996 + }, + { + "epoch": 0.2566949276304281, + "grad_norm": 0.76171875, + "learning_rate": 0.00018741620522327117, + "loss": 0.9761, + "step": 9997 + }, + { + "epoch": 0.25672060482634995, + "grad_norm": 0.984375, + "learning_rate": 0.00018741403716655185, + "loss": 1.0893, + "step": 9998 + }, + { + "epoch": 0.25674628202227173, + "grad_norm": 0.79296875, + "learning_rate": 0.00018741186893562426, + "loss": 0.9156, + "step": 9999 + }, + { + "epoch": 0.25677195921819357, + "grad_norm": 0.81640625, + "learning_rate": 0.00018740970053049273, + "loss": 1.0679, + "step": 10000 + }, + { + "epoch": 0.25677195921819357, + "eval_loss": 0.9963493943214417, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 404.1465, + "eval_samples_per_second": 24.744, + "eval_steps_per_second": 0.774, + "step": 10000 + }, + { + "epoch": 0.2567976364141154, + "grad_norm": 0.765625, + "learning_rate": 0.0001874075319511615, + "loss": 0.9687, + "step": 10001 + }, + { + "epoch": 0.25682331361003724, + "grad_norm": 0.8046875, + "learning_rate": 0.000187405363197635, + "loss": 0.9475, + "step": 10002 + }, + { + "epoch": 0.256848990805959, + "grad_norm": 0.8671875, + "learning_rate": 0.0001874031942699175, + "loss": 1.1426, + "step": 10003 + }, + { + "epoch": 0.25687466800188086, + "grad_norm": 0.7890625, + "learning_rate": 0.00018740102516801333, + "loss": 1.0333, + "step": 10004 + }, + { + "epoch": 0.2569003451978027, + "grad_norm": 0.81640625, + "learning_rate": 0.0001873988558919268, + "loss": 0.942, + "step": 10005 + }, + { + "epoch": 0.2569260223937245, + "grad_norm": 0.80859375, + "learning_rate": 0.00018739668644166227, + "loss": 0.9549, + "step": 10006 + }, + { + "epoch": 0.2569516995896463, + "grad_norm": 0.8203125, + "learning_rate": 0.00018739451681722401, + "loss": 0.8684, + "step": 10007 + }, + { + "epoch": 0.25697737678556815, + "grad_norm": 0.77734375, + "learning_rate": 0.00018739234701861638, + "loss": 1.0622, + "step": 10008 + }, + { + "epoch": 0.2570030539814899, + "grad_norm": 0.81640625, + "learning_rate": 0.00018739017704584373, + "loss": 1.0993, + "step": 10009 + }, + { + "epoch": 0.25702873117741176, + "grad_norm": 0.77734375, + "learning_rate": 0.00018738800689891032, + "loss": 1.1425, + "step": 10010 + }, + { + "epoch": 0.2570544083733336, + "grad_norm": 0.79296875, + "learning_rate": 0.00018738583657782048, + "loss": 0.9757, + "step": 10011 + }, + { + "epoch": 0.25708008556925543, + "grad_norm": 1.46875, + "learning_rate": 0.0001873836660825786, + "loss": 0.8994, + "step": 10012 + }, + { + "epoch": 0.2571057627651772, + "grad_norm": 0.76171875, + "learning_rate": 0.00018738149541318896, + "loss": 0.9585, + "step": 10013 + }, + { + "epoch": 0.25713143996109905, + "grad_norm": 0.81640625, + "learning_rate": 0.0001873793245696559, + "loss": 1.109, + "step": 10014 + }, + { + "epoch": 0.2571571171570209, + "grad_norm": 0.828125, + "learning_rate": 0.00018737715355198373, + "loss": 0.8697, + "step": 10015 + }, + { + "epoch": 0.25718279435294267, + "grad_norm": 0.7421875, + "learning_rate": 0.0001873749823601768, + "loss": 1.0541, + "step": 10016 + }, + { + "epoch": 0.2572084715488645, + "grad_norm": 0.82421875, + "learning_rate": 0.0001873728109942394, + "loss": 1.1324, + "step": 10017 + }, + { + "epoch": 0.25723414874478634, + "grad_norm": 0.78515625, + "learning_rate": 0.00018737063945417588, + "loss": 0.9837, + "step": 10018 + }, + { + "epoch": 0.2572598259407081, + "grad_norm": 0.828125, + "learning_rate": 0.0001873684677399906, + "loss": 1.2442, + "step": 10019 + }, + { + "epoch": 0.25728550313662996, + "grad_norm": 0.90234375, + "learning_rate": 0.0001873662958516878, + "loss": 0.9612, + "step": 10020 + }, + { + "epoch": 0.2573111803325518, + "grad_norm": 0.74609375, + "learning_rate": 0.00018736412378927191, + "loss": 1.0175, + "step": 10021 + }, + { + "epoch": 0.25733685752847363, + "grad_norm": 0.90234375, + "learning_rate": 0.0001873619515527472, + "loss": 1.1272, + "step": 10022 + }, + { + "epoch": 0.2573625347243954, + "grad_norm": 0.7734375, + "learning_rate": 0.000187359779142118, + "loss": 0.9439, + "step": 10023 + }, + { + "epoch": 0.25738821192031724, + "grad_norm": 0.83984375, + "learning_rate": 0.00018735760655738868, + "loss": 1.1254, + "step": 10024 + }, + { + "epoch": 0.2574138891162391, + "grad_norm": 0.8203125, + "learning_rate": 0.0001873554337985635, + "loss": 0.9528, + "step": 10025 + }, + { + "epoch": 0.25743956631216086, + "grad_norm": 0.75390625, + "learning_rate": 0.00018735326086564687, + "loss": 0.982, + "step": 10026 + }, + { + "epoch": 0.2574652435080827, + "grad_norm": 0.79296875, + "learning_rate": 0.00018735108775864306, + "loss": 1.0036, + "step": 10027 + }, + { + "epoch": 0.25749092070400453, + "grad_norm": 0.7890625, + "learning_rate": 0.00018734891447755642, + "loss": 1.0245, + "step": 10028 + }, + { + "epoch": 0.2575165978999263, + "grad_norm": 0.8515625, + "learning_rate": 0.00018734674102239128, + "loss": 1.0054, + "step": 10029 + }, + { + "epoch": 0.25754227509584815, + "grad_norm": 0.82421875, + "learning_rate": 0.000187344567393152, + "loss": 1.2033, + "step": 10030 + }, + { + "epoch": 0.25756795229177, + "grad_norm": 0.734375, + "learning_rate": 0.00018734239358984288, + "loss": 1.0215, + "step": 10031 + }, + { + "epoch": 0.2575936294876918, + "grad_norm": 0.875, + "learning_rate": 0.00018734021961246824, + "loss": 0.9564, + "step": 10032 + }, + { + "epoch": 0.2576193066836136, + "grad_norm": 0.796875, + "learning_rate": 0.00018733804546103245, + "loss": 0.8603, + "step": 10033 + }, + { + "epoch": 0.25764498387953544, + "grad_norm": 0.80078125, + "learning_rate": 0.00018733587113553982, + "loss": 0.9886, + "step": 10034 + }, + { + "epoch": 0.2576706610754573, + "grad_norm": 0.796875, + "learning_rate": 0.0001873336966359947, + "loss": 0.9741, + "step": 10035 + }, + { + "epoch": 0.25769633827137906, + "grad_norm": 0.78515625, + "learning_rate": 0.00018733152196240141, + "loss": 0.996, + "step": 10036 + }, + { + "epoch": 0.2577220154673009, + "grad_norm": 0.82421875, + "learning_rate": 0.00018732934711476425, + "loss": 1.1496, + "step": 10037 + }, + { + "epoch": 0.2577476926632227, + "grad_norm": 0.90625, + "learning_rate": 0.00018732717209308763, + "loss": 0.9892, + "step": 10038 + }, + { + "epoch": 0.2577733698591445, + "grad_norm": 0.73046875, + "learning_rate": 0.00018732499689737582, + "loss": 0.8424, + "step": 10039 + }, + { + "epoch": 0.25779904705506634, + "grad_norm": 0.8125, + "learning_rate": 0.0001873228215276332, + "loss": 0.9869, + "step": 10040 + }, + { + "epoch": 0.2578247242509882, + "grad_norm": 0.7734375, + "learning_rate": 0.00018732064598386405, + "loss": 0.909, + "step": 10041 + }, + { + "epoch": 0.25785040144691, + "grad_norm": 0.84765625, + "learning_rate": 0.00018731847026607276, + "loss": 1.1438, + "step": 10042 + }, + { + "epoch": 0.2578760786428318, + "grad_norm": 0.83203125, + "learning_rate": 0.00018731629437426366, + "loss": 1.0595, + "step": 10043 + }, + { + "epoch": 0.25790175583875363, + "grad_norm": 0.8046875, + "learning_rate": 0.00018731411830844106, + "loss": 1.0015, + "step": 10044 + }, + { + "epoch": 0.25792743303467547, + "grad_norm": 0.78515625, + "learning_rate": 0.0001873119420686093, + "loss": 0.9198, + "step": 10045 + }, + { + "epoch": 0.25795311023059725, + "grad_norm": 0.7734375, + "learning_rate": 0.00018730976565477274, + "loss": 1.0633, + "step": 10046 + }, + { + "epoch": 0.2579787874265191, + "grad_norm": 0.7734375, + "learning_rate": 0.00018730758906693572, + "loss": 0.9562, + "step": 10047 + }, + { + "epoch": 0.2580044646224409, + "grad_norm": 0.7421875, + "learning_rate": 0.00018730541230510252, + "loss": 1.0014, + "step": 10048 + }, + { + "epoch": 0.2580301418183627, + "grad_norm": 0.890625, + "learning_rate": 0.00018730323536927755, + "loss": 1.1518, + "step": 10049 + }, + { + "epoch": 0.25805581901428454, + "grad_norm": 0.77734375, + "learning_rate": 0.0001873010582594651, + "loss": 0.9861, + "step": 10050 + }, + { + "epoch": 0.2580814962102064, + "grad_norm": 0.82421875, + "learning_rate": 0.00018729888097566955, + "loss": 1.0085, + "step": 10051 + }, + { + "epoch": 0.2581071734061282, + "grad_norm": 0.7890625, + "learning_rate": 0.0001872967035178952, + "loss": 0.9307, + "step": 10052 + }, + { + "epoch": 0.25813285060205, + "grad_norm": 0.87109375, + "learning_rate": 0.0001872945258861464, + "loss": 1.0001, + "step": 10053 + }, + { + "epoch": 0.2581585277979718, + "grad_norm": 0.73828125, + "learning_rate": 0.00018729234808042747, + "loss": 1.0081, + "step": 10054 + }, + { + "epoch": 0.25818420499389366, + "grad_norm": 0.83203125, + "learning_rate": 0.00018729017010074282, + "loss": 1.0503, + "step": 10055 + }, + { + "epoch": 0.25820988218981544, + "grad_norm": 0.7734375, + "learning_rate": 0.00018728799194709674, + "loss": 1.1255, + "step": 10056 + }, + { + "epoch": 0.2582355593857373, + "grad_norm": 0.81640625, + "learning_rate": 0.0001872858136194936, + "loss": 1.1203, + "step": 10057 + }, + { + "epoch": 0.2582612365816591, + "grad_norm": 0.82421875, + "learning_rate": 0.00018728363511793768, + "loss": 1.0681, + "step": 10058 + }, + { + "epoch": 0.2582869137775809, + "grad_norm": 0.76953125, + "learning_rate": 0.00018728145644243335, + "loss": 0.9302, + "step": 10059 + }, + { + "epoch": 0.25831259097350273, + "grad_norm": 0.78515625, + "learning_rate": 0.00018727927759298497, + "loss": 0.967, + "step": 10060 + }, + { + "epoch": 0.25833826816942457, + "grad_norm": 0.83984375, + "learning_rate": 0.0001872770985695969, + "loss": 0.9148, + "step": 10061 + }, + { + "epoch": 0.2583639453653464, + "grad_norm": 0.75, + "learning_rate": 0.0001872749193722734, + "loss": 0.9673, + "step": 10062 + }, + { + "epoch": 0.2583896225612682, + "grad_norm": 0.8046875, + "learning_rate": 0.00018727274000101892, + "loss": 1.0621, + "step": 10063 + }, + { + "epoch": 0.25841529975719, + "grad_norm": 0.81640625, + "learning_rate": 0.00018727056045583776, + "loss": 1.0697, + "step": 10064 + }, + { + "epoch": 0.25844097695311186, + "grad_norm": 1.0234375, + "learning_rate": 0.00018726838073673422, + "loss": 1.1381, + "step": 10065 + }, + { + "epoch": 0.25846665414903364, + "grad_norm": 0.77734375, + "learning_rate": 0.0001872662008437127, + "loss": 0.9198, + "step": 10066 + }, + { + "epoch": 0.2584923313449555, + "grad_norm": 0.83203125, + "learning_rate": 0.00018726402077677754, + "loss": 1.0847, + "step": 10067 + }, + { + "epoch": 0.2585180085408773, + "grad_norm": 0.71875, + "learning_rate": 0.00018726184053593302, + "loss": 0.9037, + "step": 10068 + }, + { + "epoch": 0.2585436857367991, + "grad_norm": 0.84765625, + "learning_rate": 0.00018725966012118358, + "loss": 1.069, + "step": 10069 + }, + { + "epoch": 0.2585693629327209, + "grad_norm": 0.80859375, + "learning_rate": 0.00018725747953253353, + "loss": 0.9799, + "step": 10070 + }, + { + "epoch": 0.25859504012864276, + "grad_norm": 0.7734375, + "learning_rate": 0.00018725529876998715, + "loss": 0.9867, + "step": 10071 + }, + { + "epoch": 0.2586207173245646, + "grad_norm": 0.78125, + "learning_rate": 0.0001872531178335489, + "loss": 1.0316, + "step": 10072 + }, + { + "epoch": 0.2586463945204864, + "grad_norm": 0.77734375, + "learning_rate": 0.00018725093672322302, + "loss": 0.8281, + "step": 10073 + }, + { + "epoch": 0.2586720717164082, + "grad_norm": 0.78515625, + "learning_rate": 0.00018724875543901394, + "loss": 1.0008, + "step": 10074 + }, + { + "epoch": 0.25869774891233005, + "grad_norm": 0.74609375, + "learning_rate": 0.00018724657398092593, + "loss": 0.946, + "step": 10075 + }, + { + "epoch": 0.25872342610825183, + "grad_norm": 0.75, + "learning_rate": 0.00018724439234896342, + "loss": 1.0707, + "step": 10076 + }, + { + "epoch": 0.25874910330417367, + "grad_norm": 0.87109375, + "learning_rate": 0.00018724221054313068, + "loss": 0.894, + "step": 10077 + }, + { + "epoch": 0.2587747805000955, + "grad_norm": 0.828125, + "learning_rate": 0.0001872400285634321, + "loss": 1.1928, + "step": 10078 + }, + { + "epoch": 0.2588004576960173, + "grad_norm": 0.78125, + "learning_rate": 0.00018723784640987208, + "loss": 0.9223, + "step": 10079 + }, + { + "epoch": 0.2588261348919391, + "grad_norm": 0.8125, + "learning_rate": 0.00018723566408245486, + "loss": 1.0471, + "step": 10080 + }, + { + "epoch": 0.25885181208786096, + "grad_norm": 0.8125, + "learning_rate": 0.00018723348158118484, + "loss": 1.0956, + "step": 10081 + }, + { + "epoch": 0.2588774892837828, + "grad_norm": 0.79296875, + "learning_rate": 0.0001872312989060664, + "loss": 1.0814, + "step": 10082 + }, + { + "epoch": 0.25890316647970457, + "grad_norm": 0.79296875, + "learning_rate": 0.00018722911605710386, + "loss": 0.8467, + "step": 10083 + }, + { + "epoch": 0.2589288436756264, + "grad_norm": 0.7890625, + "learning_rate": 0.00018722693303430154, + "loss": 0.9253, + "step": 10084 + }, + { + "epoch": 0.25895452087154824, + "grad_norm": 2.109375, + "learning_rate": 0.00018722474983766383, + "loss": 1.1037, + "step": 10085 + }, + { + "epoch": 0.25898019806747, + "grad_norm": 0.8515625, + "learning_rate": 0.00018722256646719508, + "loss": 1.0695, + "step": 10086 + }, + { + "epoch": 0.25900587526339186, + "grad_norm": 0.75390625, + "learning_rate": 0.00018722038292289962, + "loss": 1.0286, + "step": 10087 + }, + { + "epoch": 0.2590315524593137, + "grad_norm": 0.78125, + "learning_rate": 0.00018721819920478183, + "loss": 1.0373, + "step": 10088 + }, + { + "epoch": 0.2590572296552355, + "grad_norm": 0.796875, + "learning_rate": 0.00018721601531284605, + "loss": 1.0749, + "step": 10089 + }, + { + "epoch": 0.2590829068511573, + "grad_norm": 0.83984375, + "learning_rate": 0.0001872138312470966, + "loss": 1.0037, + "step": 10090 + }, + { + "epoch": 0.25910858404707915, + "grad_norm": 0.76171875, + "learning_rate": 0.0001872116470075379, + "loss": 1.0117, + "step": 10091 + }, + { + "epoch": 0.259134261243001, + "grad_norm": 0.74609375, + "learning_rate": 0.00018720946259417426, + "loss": 0.9216, + "step": 10092 + }, + { + "epoch": 0.25915993843892277, + "grad_norm": 0.86328125, + "learning_rate": 0.00018720727800701004, + "loss": 0.9313, + "step": 10093 + }, + { + "epoch": 0.2591856156348446, + "grad_norm": 0.9375, + "learning_rate": 0.00018720509324604957, + "loss": 0.9199, + "step": 10094 + }, + { + "epoch": 0.25921129283076644, + "grad_norm": 0.78515625, + "learning_rate": 0.00018720290831129724, + "loss": 0.9363, + "step": 10095 + }, + { + "epoch": 0.2592369700266882, + "grad_norm": 0.7734375, + "learning_rate": 0.00018720072320275742, + "loss": 0.9229, + "step": 10096 + }, + { + "epoch": 0.25926264722261005, + "grad_norm": 0.8515625, + "learning_rate": 0.0001871985379204344, + "loss": 0.9943, + "step": 10097 + }, + { + "epoch": 0.2592883244185319, + "grad_norm": 0.86328125, + "learning_rate": 0.00018719635246433258, + "loss": 1.1223, + "step": 10098 + }, + { + "epoch": 0.25931400161445367, + "grad_norm": 1.5546875, + "learning_rate": 0.0001871941668344563, + "loss": 1.1309, + "step": 10099 + }, + { + "epoch": 0.2593396788103755, + "grad_norm": 0.8046875, + "learning_rate": 0.0001871919810308099, + "loss": 1.0272, + "step": 10100 + }, + { + "epoch": 0.25936535600629734, + "grad_norm": 0.84765625, + "learning_rate": 0.0001871897950533978, + "loss": 0.9455, + "step": 10101 + }, + { + "epoch": 0.2593910332022192, + "grad_norm": 0.84765625, + "learning_rate": 0.00018718760890222428, + "loss": 1.0813, + "step": 10102 + }, + { + "epoch": 0.25941671039814096, + "grad_norm": 0.7734375, + "learning_rate": 0.00018718542257729377, + "loss": 0.86, + "step": 10103 + }, + { + "epoch": 0.2594423875940628, + "grad_norm": 0.79296875, + "learning_rate": 0.00018718323607861054, + "loss": 0.9876, + "step": 10104 + }, + { + "epoch": 0.25946806478998463, + "grad_norm": 0.88671875, + "learning_rate": 0.000187181049406179, + "loss": 0.9263, + "step": 10105 + }, + { + "epoch": 0.2594937419859064, + "grad_norm": 0.8125, + "learning_rate": 0.00018717886256000357, + "loss": 1.0546, + "step": 10106 + }, + { + "epoch": 0.25951941918182825, + "grad_norm": 0.82421875, + "learning_rate": 0.0001871766755400885, + "loss": 0.9972, + "step": 10107 + }, + { + "epoch": 0.2595450963777501, + "grad_norm": 0.8515625, + "learning_rate": 0.00018717448834643818, + "loss": 1.037, + "step": 10108 + }, + { + "epoch": 0.25957077357367186, + "grad_norm": 0.76953125, + "learning_rate": 0.00018717230097905702, + "loss": 1.096, + "step": 10109 + }, + { + "epoch": 0.2595964507695937, + "grad_norm": 0.85546875, + "learning_rate": 0.00018717011343794929, + "loss": 0.9268, + "step": 10110 + }, + { + "epoch": 0.25962212796551554, + "grad_norm": 0.76171875, + "learning_rate": 0.00018716792572311938, + "loss": 1.0102, + "step": 10111 + }, + { + "epoch": 0.2596478051614374, + "grad_norm": 0.8671875, + "learning_rate": 0.0001871657378345717, + "loss": 1.1562, + "step": 10112 + }, + { + "epoch": 0.25967348235735915, + "grad_norm": 0.7578125, + "learning_rate": 0.00018716354977231059, + "loss": 0.8777, + "step": 10113 + }, + { + "epoch": 0.259699159553281, + "grad_norm": 0.87109375, + "learning_rate": 0.0001871613615363404, + "loss": 1.0559, + "step": 10114 + }, + { + "epoch": 0.2597248367492028, + "grad_norm": 0.77734375, + "learning_rate": 0.00018715917312666546, + "loss": 0.9847, + "step": 10115 + }, + { + "epoch": 0.2597505139451246, + "grad_norm": 0.8125, + "learning_rate": 0.0001871569845432902, + "loss": 0.9835, + "step": 10116 + }, + { + "epoch": 0.25977619114104644, + "grad_norm": 0.796875, + "learning_rate": 0.0001871547957862189, + "loss": 1.049, + "step": 10117 + }, + { + "epoch": 0.2598018683369683, + "grad_norm": 0.921875, + "learning_rate": 0.000187152606855456, + "loss": 1.1085, + "step": 10118 + }, + { + "epoch": 0.25982754553289006, + "grad_norm": 0.82421875, + "learning_rate": 0.00018715041775100582, + "loss": 1.0696, + "step": 10119 + }, + { + "epoch": 0.2598532227288119, + "grad_norm": 0.79296875, + "learning_rate": 0.00018714822847287272, + "loss": 0.9334, + "step": 10120 + }, + { + "epoch": 0.25987889992473373, + "grad_norm": 0.7578125, + "learning_rate": 0.00018714603902106105, + "loss": 1.0518, + "step": 10121 + }, + { + "epoch": 0.2599045771206555, + "grad_norm": 0.84765625, + "learning_rate": 0.00018714384939557524, + "loss": 1.0415, + "step": 10122 + }, + { + "epoch": 0.25993025431657735, + "grad_norm": 0.76953125, + "learning_rate": 0.00018714165959641957, + "loss": 1.0026, + "step": 10123 + }, + { + "epoch": 0.2599559315124992, + "grad_norm": 0.8125, + "learning_rate": 0.00018713946962359848, + "loss": 1.0183, + "step": 10124 + }, + { + "epoch": 0.259981608708421, + "grad_norm": 0.8671875, + "learning_rate": 0.0001871372794771163, + "loss": 0.9128, + "step": 10125 + }, + { + "epoch": 0.2600072859043428, + "grad_norm": 0.7265625, + "learning_rate": 0.00018713508915697738, + "loss": 0.9514, + "step": 10126 + }, + { + "epoch": 0.26003296310026464, + "grad_norm": 0.796875, + "learning_rate": 0.00018713289866318609, + "loss": 1.0121, + "step": 10127 + }, + { + "epoch": 0.26005864029618647, + "grad_norm": 0.79296875, + "learning_rate": 0.00018713070799574678, + "loss": 0.9471, + "step": 10128 + }, + { + "epoch": 0.26008431749210825, + "grad_norm": 0.7890625, + "learning_rate": 0.00018712851715466387, + "loss": 1.0331, + "step": 10129 + }, + { + "epoch": 0.2601099946880301, + "grad_norm": 0.90625, + "learning_rate": 0.0001871263261399417, + "loss": 1.007, + "step": 10130 + }, + { + "epoch": 0.2601356718839519, + "grad_norm": 0.79296875, + "learning_rate": 0.00018712413495158462, + "loss": 1.1366, + "step": 10131 + }, + { + "epoch": 0.2601613490798737, + "grad_norm": 0.8125, + "learning_rate": 0.00018712194358959703, + "loss": 0.9593, + "step": 10132 + }, + { + "epoch": 0.26018702627579554, + "grad_norm": 0.796875, + "learning_rate": 0.00018711975205398327, + "loss": 0.9765, + "step": 10133 + }, + { + "epoch": 0.2602127034717174, + "grad_norm": 0.7578125, + "learning_rate": 0.0001871175603447477, + "loss": 0.9931, + "step": 10134 + }, + { + "epoch": 0.2602383806676392, + "grad_norm": 0.75, + "learning_rate": 0.0001871153684618947, + "loss": 0.8979, + "step": 10135 + }, + { + "epoch": 0.260264057863561, + "grad_norm": 0.796875, + "learning_rate": 0.00018711317640542866, + "loss": 1.0543, + "step": 10136 + }, + { + "epoch": 0.26028973505948283, + "grad_norm": 0.7421875, + "learning_rate": 0.0001871109841753539, + "loss": 0.9954, + "step": 10137 + }, + { + "epoch": 0.26031541225540467, + "grad_norm": 0.7734375, + "learning_rate": 0.00018710879177167482, + "loss": 1.034, + "step": 10138 + }, + { + "epoch": 0.26034108945132645, + "grad_norm": 0.83203125, + "learning_rate": 0.0001871065991943958, + "loss": 1.0288, + "step": 10139 + }, + { + "epoch": 0.2603667666472483, + "grad_norm": 0.78125, + "learning_rate": 0.00018710440644352118, + "loss": 1.0533, + "step": 10140 + }, + { + "epoch": 0.2603924438431701, + "grad_norm": 0.76171875, + "learning_rate": 0.00018710221351905538, + "loss": 0.9398, + "step": 10141 + }, + { + "epoch": 0.2604181210390919, + "grad_norm": 0.796875, + "learning_rate": 0.00018710002042100272, + "loss": 0.9975, + "step": 10142 + }, + { + "epoch": 0.26044379823501373, + "grad_norm": 0.734375, + "learning_rate": 0.00018709782714936756, + "loss": 0.8484, + "step": 10143 + }, + { + "epoch": 0.26046947543093557, + "grad_norm": 0.78515625, + "learning_rate": 0.00018709563370415433, + "loss": 1.0459, + "step": 10144 + }, + { + "epoch": 0.2604951526268574, + "grad_norm": 0.77734375, + "learning_rate": 0.00018709344008536736, + "loss": 1.0838, + "step": 10145 + }, + { + "epoch": 0.2605208298227792, + "grad_norm": 0.828125, + "learning_rate": 0.00018709124629301104, + "loss": 1.0396, + "step": 10146 + }, + { + "epoch": 0.260546507018701, + "grad_norm": 0.80078125, + "learning_rate": 0.00018708905232708972, + "loss": 1.0194, + "step": 10147 + }, + { + "epoch": 0.26057218421462286, + "grad_norm": 0.78515625, + "learning_rate": 0.00018708685818760777, + "loss": 0.9622, + "step": 10148 + }, + { + "epoch": 0.26059786141054464, + "grad_norm": 0.8671875, + "learning_rate": 0.00018708466387456962, + "loss": 1.165, + "step": 10149 + }, + { + "epoch": 0.2606235386064665, + "grad_norm": 0.83984375, + "learning_rate": 0.00018708246938797958, + "loss": 0.9968, + "step": 10150 + }, + { + "epoch": 0.2606492158023883, + "grad_norm": 0.76171875, + "learning_rate": 0.00018708027472784204, + "loss": 0.8674, + "step": 10151 + }, + { + "epoch": 0.2606748929983101, + "grad_norm": 0.95703125, + "learning_rate": 0.00018707807989416138, + "loss": 1.0623, + "step": 10152 + }, + { + "epoch": 0.26070057019423193, + "grad_norm": 0.921875, + "learning_rate": 0.00018707588488694197, + "loss": 1.0361, + "step": 10153 + }, + { + "epoch": 0.26072624739015376, + "grad_norm": 0.7890625, + "learning_rate": 0.0001870736897061882, + "loss": 0.9858, + "step": 10154 + }, + { + "epoch": 0.2607519245860756, + "grad_norm": 0.84765625, + "learning_rate": 0.00018707149435190444, + "loss": 1.1347, + "step": 10155 + }, + { + "epoch": 0.2607776017819974, + "grad_norm": 0.83984375, + "learning_rate": 0.00018706929882409502, + "loss": 1.0776, + "step": 10156 + }, + { + "epoch": 0.2608032789779192, + "grad_norm": 0.9140625, + "learning_rate": 0.00018706710312276439, + "loss": 0.9518, + "step": 10157 + }, + { + "epoch": 0.26082895617384105, + "grad_norm": 1.1484375, + "learning_rate": 0.00018706490724791684, + "loss": 1.0613, + "step": 10158 + }, + { + "epoch": 0.26085463336976283, + "grad_norm": 0.76953125, + "learning_rate": 0.00018706271119955684, + "loss": 0.9301, + "step": 10159 + }, + { + "epoch": 0.26088031056568467, + "grad_norm": 0.78125, + "learning_rate": 0.0001870605149776887, + "loss": 0.986, + "step": 10160 + }, + { + "epoch": 0.2609059877616065, + "grad_norm": 0.7890625, + "learning_rate": 0.00018705831858231684, + "loss": 0.9258, + "step": 10161 + }, + { + "epoch": 0.2609316649575283, + "grad_norm": 0.796875, + "learning_rate": 0.0001870561220134456, + "loss": 1.0179, + "step": 10162 + }, + { + "epoch": 0.2609573421534501, + "grad_norm": 0.71484375, + "learning_rate": 0.00018705392527107936, + "loss": 0.8665, + "step": 10163 + }, + { + "epoch": 0.26098301934937196, + "grad_norm": 0.82421875, + "learning_rate": 0.00018705172835522252, + "loss": 0.925, + "step": 10164 + }, + { + "epoch": 0.2610086965452938, + "grad_norm": 0.7578125, + "learning_rate": 0.00018704953126587946, + "loss": 0.8884, + "step": 10165 + }, + { + "epoch": 0.2610343737412156, + "grad_norm": 0.78515625, + "learning_rate": 0.00018704733400305452, + "loss": 0.9949, + "step": 10166 + }, + { + "epoch": 0.2610600509371374, + "grad_norm": 0.80859375, + "learning_rate": 0.00018704513656675212, + "loss": 0.9564, + "step": 10167 + }, + { + "epoch": 0.26108572813305925, + "grad_norm": 0.86328125, + "learning_rate": 0.00018704293895697665, + "loss": 0.9157, + "step": 10168 + }, + { + "epoch": 0.261111405328981, + "grad_norm": 0.86328125, + "learning_rate": 0.00018704074117373244, + "loss": 1.0375, + "step": 10169 + }, + { + "epoch": 0.26113708252490286, + "grad_norm": 0.84375, + "learning_rate": 0.0001870385432170239, + "loss": 1.1432, + "step": 10170 + }, + { + "epoch": 0.2611627597208247, + "grad_norm": 0.78515625, + "learning_rate": 0.0001870363450868554, + "loss": 0.8966, + "step": 10171 + }, + { + "epoch": 0.2611884369167465, + "grad_norm": 0.8203125, + "learning_rate": 0.00018703414678323133, + "loss": 0.9213, + "step": 10172 + }, + { + "epoch": 0.2612141141126683, + "grad_norm": 0.8671875, + "learning_rate": 0.0001870319483061561, + "loss": 1.0508, + "step": 10173 + }, + { + "epoch": 0.26123979130859015, + "grad_norm": 0.8359375, + "learning_rate": 0.000187029749655634, + "loss": 1.2016, + "step": 10174 + }, + { + "epoch": 0.261265468504512, + "grad_norm": 0.8359375, + "learning_rate": 0.00018702755083166949, + "loss": 1.0527, + "step": 10175 + }, + { + "epoch": 0.26129114570043377, + "grad_norm": 0.8515625, + "learning_rate": 0.00018702535183426697, + "loss": 0.9928, + "step": 10176 + }, + { + "epoch": 0.2613168228963556, + "grad_norm": 0.78125, + "learning_rate": 0.00018702315266343073, + "loss": 0.9801, + "step": 10177 + }, + { + "epoch": 0.26134250009227744, + "grad_norm": 0.78515625, + "learning_rate": 0.00018702095331916524, + "loss": 1.0103, + "step": 10178 + }, + { + "epoch": 0.2613681772881992, + "grad_norm": 0.78125, + "learning_rate": 0.00018701875380147486, + "loss": 1.0281, + "step": 10179 + }, + { + "epoch": 0.26139385448412106, + "grad_norm": 0.75390625, + "learning_rate": 0.00018701655411036393, + "loss": 0.9062, + "step": 10180 + }, + { + "epoch": 0.2614195316800429, + "grad_norm": 0.828125, + "learning_rate": 0.0001870143542458369, + "loss": 0.9636, + "step": 10181 + }, + { + "epoch": 0.2614452088759647, + "grad_norm": 0.76953125, + "learning_rate": 0.0001870121542078981, + "loss": 1.1048, + "step": 10182 + }, + { + "epoch": 0.2614708860718865, + "grad_norm": 0.86328125, + "learning_rate": 0.00018700995399655192, + "loss": 0.858, + "step": 10183 + }, + { + "epoch": 0.26149656326780835, + "grad_norm": 0.81640625, + "learning_rate": 0.0001870077536118028, + "loss": 0.9771, + "step": 10184 + }, + { + "epoch": 0.2615222404637302, + "grad_norm": 0.75390625, + "learning_rate": 0.00018700555305365508, + "loss": 0.9751, + "step": 10185 + }, + { + "epoch": 0.26154791765965196, + "grad_norm": 0.8984375, + "learning_rate": 0.00018700335232211315, + "loss": 0.9652, + "step": 10186 + }, + { + "epoch": 0.2615735948555738, + "grad_norm": 0.8046875, + "learning_rate": 0.00018700115141718137, + "loss": 0.915, + "step": 10187 + }, + { + "epoch": 0.26159927205149563, + "grad_norm": 0.81640625, + "learning_rate": 0.00018699895033886418, + "loss": 0.9169, + "step": 10188 + }, + { + "epoch": 0.2616249492474174, + "grad_norm": 0.7890625, + "learning_rate": 0.00018699674908716592, + "loss": 1.059, + "step": 10189 + }, + { + "epoch": 0.26165062644333925, + "grad_norm": 0.8828125, + "learning_rate": 0.00018699454766209103, + "loss": 1.096, + "step": 10190 + }, + { + "epoch": 0.2616763036392611, + "grad_norm": 0.80859375, + "learning_rate": 0.00018699234606364383, + "loss": 1.0521, + "step": 10191 + }, + { + "epoch": 0.26170198083518287, + "grad_norm": 0.80859375, + "learning_rate": 0.0001869901442918288, + "loss": 1.0881, + "step": 10192 + }, + { + "epoch": 0.2617276580311047, + "grad_norm": 0.73828125, + "learning_rate": 0.0001869879423466502, + "loss": 0.9756, + "step": 10193 + }, + { + "epoch": 0.26175333522702654, + "grad_norm": 0.796875, + "learning_rate": 0.00018698574022811253, + "loss": 0.9715, + "step": 10194 + }, + { + "epoch": 0.2617790124229484, + "grad_norm": 0.76953125, + "learning_rate": 0.00018698353793622014, + "loss": 0.9934, + "step": 10195 + }, + { + "epoch": 0.26180468961887016, + "grad_norm": 0.77734375, + "learning_rate": 0.00018698133547097738, + "loss": 1.1265, + "step": 10196 + }, + { + "epoch": 0.261830366814792, + "grad_norm": 0.7578125, + "learning_rate": 0.0001869791328323887, + "loss": 0.9302, + "step": 10197 + }, + { + "epoch": 0.26185604401071383, + "grad_norm": 0.81640625, + "learning_rate": 0.00018697693002045845, + "loss": 0.9759, + "step": 10198 + }, + { + "epoch": 0.2618817212066356, + "grad_norm": 0.796875, + "learning_rate": 0.00018697472703519105, + "loss": 1.1166, + "step": 10199 + }, + { + "epoch": 0.26190739840255745, + "grad_norm": 0.82421875, + "learning_rate": 0.00018697252387659088, + "loss": 0.9506, + "step": 10200 + }, + { + "epoch": 0.2619330755984793, + "grad_norm": 0.82421875, + "learning_rate": 0.00018697032054466233, + "loss": 0.9942, + "step": 10201 + }, + { + "epoch": 0.26195875279440106, + "grad_norm": 0.8125, + "learning_rate": 0.00018696811703940977, + "loss": 1.0939, + "step": 10202 + }, + { + "epoch": 0.2619844299903229, + "grad_norm": 0.8203125, + "learning_rate": 0.00018696591336083763, + "loss": 0.9215, + "step": 10203 + }, + { + "epoch": 0.26201010718624473, + "grad_norm": 0.79296875, + "learning_rate": 0.00018696370950895027, + "loss": 0.9768, + "step": 10204 + }, + { + "epoch": 0.26203578438216657, + "grad_norm": 0.76953125, + "learning_rate": 0.00018696150548375208, + "loss": 0.9092, + "step": 10205 + }, + { + "epoch": 0.26206146157808835, + "grad_norm": 0.78125, + "learning_rate": 0.00018695930128524748, + "loss": 0.7593, + "step": 10206 + }, + { + "epoch": 0.2620871387740102, + "grad_norm": 0.77734375, + "learning_rate": 0.00018695709691344083, + "loss": 0.9732, + "step": 10207 + }, + { + "epoch": 0.262112815969932, + "grad_norm": 0.8125, + "learning_rate": 0.00018695489236833653, + "loss": 0.8607, + "step": 10208 + }, + { + "epoch": 0.2621384931658538, + "grad_norm": 0.796875, + "learning_rate": 0.00018695268764993902, + "loss": 0.8646, + "step": 10209 + }, + { + "epoch": 0.26216417036177564, + "grad_norm": 0.90625, + "learning_rate": 0.00018695048275825263, + "loss": 1.0494, + "step": 10210 + }, + { + "epoch": 0.2621898475576975, + "grad_norm": 0.7734375, + "learning_rate": 0.0001869482776932818, + "loss": 0.9864, + "step": 10211 + }, + { + "epoch": 0.26221552475361926, + "grad_norm": 0.7890625, + "learning_rate": 0.00018694607245503092, + "loss": 1.1509, + "step": 10212 + }, + { + "epoch": 0.2622412019495411, + "grad_norm": 0.81640625, + "learning_rate": 0.00018694386704350434, + "loss": 1.0792, + "step": 10213 + }, + { + "epoch": 0.26226687914546293, + "grad_norm": 0.87109375, + "learning_rate": 0.0001869416614587065, + "loss": 1.0878, + "step": 10214 + }, + { + "epoch": 0.26229255634138476, + "grad_norm": 0.74609375, + "learning_rate": 0.00018693945570064178, + "loss": 1.0246, + "step": 10215 + }, + { + "epoch": 0.26231823353730654, + "grad_norm": 0.8203125, + "learning_rate": 0.00018693724976931458, + "loss": 0.9731, + "step": 10216 + }, + { + "epoch": 0.2623439107332284, + "grad_norm": 0.88671875, + "learning_rate": 0.0001869350436647293, + "loss": 0.8709, + "step": 10217 + }, + { + "epoch": 0.2623695879291502, + "grad_norm": 0.73046875, + "learning_rate": 0.0001869328373868903, + "loss": 0.9736, + "step": 10218 + }, + { + "epoch": 0.262395265125072, + "grad_norm": 0.84375, + "learning_rate": 0.00018693063093580206, + "loss": 0.9708, + "step": 10219 + }, + { + "epoch": 0.26242094232099383, + "grad_norm": 0.76953125, + "learning_rate": 0.00018692842431146888, + "loss": 0.8672, + "step": 10220 + }, + { + "epoch": 0.26244661951691567, + "grad_norm": 0.8828125, + "learning_rate": 0.00018692621751389523, + "loss": 1.0683, + "step": 10221 + }, + { + "epoch": 0.26247229671283745, + "grad_norm": 0.84765625, + "learning_rate": 0.00018692401054308546, + "loss": 1.0864, + "step": 10222 + }, + { + "epoch": 0.2624979739087593, + "grad_norm": 0.8046875, + "learning_rate": 0.00018692180339904398, + "loss": 1.0739, + "step": 10223 + }, + { + "epoch": 0.2625236511046811, + "grad_norm": 0.75, + "learning_rate": 0.00018691959608177524, + "loss": 1.0256, + "step": 10224 + }, + { + "epoch": 0.26254932830060296, + "grad_norm": 0.76171875, + "learning_rate": 0.00018691738859128357, + "loss": 1.0748, + "step": 10225 + }, + { + "epoch": 0.26257500549652474, + "grad_norm": 0.73046875, + "learning_rate": 0.00018691518092757341, + "loss": 0.9652, + "step": 10226 + }, + { + "epoch": 0.2626006826924466, + "grad_norm": 0.81640625, + "learning_rate": 0.0001869129730906491, + "loss": 0.9765, + "step": 10227 + }, + { + "epoch": 0.2626263598883684, + "grad_norm": 0.828125, + "learning_rate": 0.00018691076508051512, + "loss": 0.9536, + "step": 10228 + }, + { + "epoch": 0.2626520370842902, + "grad_norm": 0.796875, + "learning_rate": 0.00018690855689717587, + "loss": 1.0733, + "step": 10229 + }, + { + "epoch": 0.262677714280212, + "grad_norm": 0.80078125, + "learning_rate": 0.00018690634854063567, + "loss": 1.0056, + "step": 10230 + }, + { + "epoch": 0.26270339147613386, + "grad_norm": 0.78515625, + "learning_rate": 0.00018690414001089898, + "loss": 1.0134, + "step": 10231 + }, + { + "epoch": 0.26272906867205564, + "grad_norm": 0.75390625, + "learning_rate": 0.00018690193130797017, + "loss": 0.9915, + "step": 10232 + }, + { + "epoch": 0.2627547458679775, + "grad_norm": 0.875, + "learning_rate": 0.0001868997224318537, + "loss": 1.085, + "step": 10233 + }, + { + "epoch": 0.2627804230638993, + "grad_norm": 0.8203125, + "learning_rate": 0.0001868975133825539, + "loss": 0.9727, + "step": 10234 + }, + { + "epoch": 0.26280610025982115, + "grad_norm": 0.7578125, + "learning_rate": 0.0001868953041600752, + "loss": 0.9247, + "step": 10235 + }, + { + "epoch": 0.26283177745574293, + "grad_norm": 1.0, + "learning_rate": 0.00018689309476442202, + "loss": 0.9167, + "step": 10236 + }, + { + "epoch": 0.26285745465166477, + "grad_norm": 0.83984375, + "learning_rate": 0.00018689088519559878, + "loss": 0.9808, + "step": 10237 + }, + { + "epoch": 0.2628831318475866, + "grad_norm": 0.796875, + "learning_rate": 0.00018688867545360984, + "loss": 0.938, + "step": 10238 + }, + { + "epoch": 0.2629088090435084, + "grad_norm": 0.78125, + "learning_rate": 0.00018688646553845957, + "loss": 0.9928, + "step": 10239 + }, + { + "epoch": 0.2629344862394302, + "grad_norm": 0.77734375, + "learning_rate": 0.0001868842554501525, + "loss": 1.19, + "step": 10240 + }, + { + "epoch": 0.26296016343535206, + "grad_norm": 0.76171875, + "learning_rate": 0.00018688204518869293, + "loss": 0.9418, + "step": 10241 + }, + { + "epoch": 0.26298584063127384, + "grad_norm": 0.73828125, + "learning_rate": 0.00018687983475408526, + "loss": 0.8492, + "step": 10242 + }, + { + "epoch": 0.2630115178271957, + "grad_norm": 0.84375, + "learning_rate": 0.00018687762414633396, + "loss": 1.0139, + "step": 10243 + }, + { + "epoch": 0.2630371950231175, + "grad_norm": 0.734375, + "learning_rate": 0.00018687541336544339, + "loss": 0.9225, + "step": 10244 + }, + { + "epoch": 0.26306287221903935, + "grad_norm": 0.76953125, + "learning_rate": 0.00018687320241141797, + "loss": 1.0887, + "step": 10245 + }, + { + "epoch": 0.2630885494149611, + "grad_norm": 0.7734375, + "learning_rate": 0.0001868709912842621, + "loss": 0.8794, + "step": 10246 + }, + { + "epoch": 0.26311422661088296, + "grad_norm": 0.77734375, + "learning_rate": 0.0001868687799839802, + "loss": 1.0378, + "step": 10247 + }, + { + "epoch": 0.2631399038068048, + "grad_norm": 0.75, + "learning_rate": 0.00018686656851057664, + "loss": 0.9079, + "step": 10248 + }, + { + "epoch": 0.2631655810027266, + "grad_norm": 0.82421875, + "learning_rate": 0.00018686435686405588, + "loss": 0.9394, + "step": 10249 + }, + { + "epoch": 0.2631912581986484, + "grad_norm": 0.8828125, + "learning_rate": 0.0001868621450444223, + "loss": 1.0738, + "step": 10250 + }, + { + "epoch": 0.26321693539457025, + "grad_norm": 0.8125, + "learning_rate": 0.00018685993305168032, + "loss": 0.8439, + "step": 10251 + }, + { + "epoch": 0.26324261259049203, + "grad_norm": 0.74609375, + "learning_rate": 0.00018685772088583433, + "loss": 0.9926, + "step": 10252 + }, + { + "epoch": 0.26326828978641387, + "grad_norm": 0.82421875, + "learning_rate": 0.00018685550854688875, + "loss": 1.0865, + "step": 10253 + }, + { + "epoch": 0.2632939669823357, + "grad_norm": 0.7734375, + "learning_rate": 0.000186853296034848, + "loss": 1.0572, + "step": 10254 + }, + { + "epoch": 0.26331964417825754, + "grad_norm": 0.76953125, + "learning_rate": 0.00018685108334971646, + "loss": 1.0213, + "step": 10255 + }, + { + "epoch": 0.2633453213741793, + "grad_norm": 0.76171875, + "learning_rate": 0.00018684887049149858, + "loss": 0.9355, + "step": 10256 + }, + { + "epoch": 0.26337099857010116, + "grad_norm": 0.734375, + "learning_rate": 0.0001868466574601987, + "loss": 0.8608, + "step": 10257 + }, + { + "epoch": 0.263396675766023, + "grad_norm": 0.80859375, + "learning_rate": 0.00018684444425582131, + "loss": 1.0487, + "step": 10258 + }, + { + "epoch": 0.2634223529619448, + "grad_norm": 0.7734375, + "learning_rate": 0.00018684223087837078, + "loss": 1.0061, + "step": 10259 + }, + { + "epoch": 0.2634480301578666, + "grad_norm": 0.76171875, + "learning_rate": 0.00018684001732785155, + "loss": 1.0844, + "step": 10260 + }, + { + "epoch": 0.26347370735378844, + "grad_norm": 0.859375, + "learning_rate": 0.00018683780360426798, + "loss": 1.1088, + "step": 10261 + }, + { + "epoch": 0.2634993845497102, + "grad_norm": 0.84765625, + "learning_rate": 0.00018683558970762452, + "loss": 1.0599, + "step": 10262 + }, + { + "epoch": 0.26352506174563206, + "grad_norm": 0.7734375, + "learning_rate": 0.00018683337563792557, + "loss": 0.9313, + "step": 10263 + }, + { + "epoch": 0.2635507389415539, + "grad_norm": 0.828125, + "learning_rate": 0.00018683116139517556, + "loss": 0.9678, + "step": 10264 + }, + { + "epoch": 0.26357641613747573, + "grad_norm": 0.80078125, + "learning_rate": 0.00018682894697937886, + "loss": 0.9422, + "step": 10265 + }, + { + "epoch": 0.2636020933333975, + "grad_norm": 0.7734375, + "learning_rate": 0.00018682673239053994, + "loss": 0.9508, + "step": 10266 + }, + { + "epoch": 0.26362777052931935, + "grad_norm": 0.7578125, + "learning_rate": 0.00018682451762866314, + "loss": 0.9511, + "step": 10267 + }, + { + "epoch": 0.2636534477252412, + "grad_norm": 0.77734375, + "learning_rate": 0.000186822302693753, + "loss": 0.954, + "step": 10268 + }, + { + "epoch": 0.26367912492116297, + "grad_norm": 0.76953125, + "learning_rate": 0.0001868200875858138, + "loss": 0.9721, + "step": 10269 + }, + { + "epoch": 0.2637048021170848, + "grad_norm": 0.86328125, + "learning_rate": 0.00018681787230485003, + "loss": 1.2228, + "step": 10270 + }, + { + "epoch": 0.26373047931300664, + "grad_norm": 0.83984375, + "learning_rate": 0.00018681565685086605, + "loss": 1.1907, + "step": 10271 + }, + { + "epoch": 0.2637561565089284, + "grad_norm": 0.765625, + "learning_rate": 0.00018681344122386634, + "loss": 1.0486, + "step": 10272 + }, + { + "epoch": 0.26378183370485025, + "grad_norm": 0.78515625, + "learning_rate": 0.00018681122542385525, + "loss": 1.0143, + "step": 10273 + }, + { + "epoch": 0.2638075109007721, + "grad_norm": 0.875, + "learning_rate": 0.00018680900945083727, + "loss": 1.0936, + "step": 10274 + }, + { + "epoch": 0.2638331880966939, + "grad_norm": 0.73828125, + "learning_rate": 0.00018680679330481675, + "loss": 0.9684, + "step": 10275 + }, + { + "epoch": 0.2638588652926157, + "grad_norm": 0.73828125, + "learning_rate": 0.0001868045769857981, + "loss": 1.0206, + "step": 10276 + }, + { + "epoch": 0.26388454248853754, + "grad_norm": 0.8515625, + "learning_rate": 0.00018680236049378583, + "loss": 1.1426, + "step": 10277 + }, + { + "epoch": 0.2639102196844594, + "grad_norm": 0.9140625, + "learning_rate": 0.00018680014382878426, + "loss": 0.9942, + "step": 10278 + }, + { + "epoch": 0.26393589688038116, + "grad_norm": 0.76953125, + "learning_rate": 0.0001867979269907979, + "loss": 0.9493, + "step": 10279 + }, + { + "epoch": 0.263961574076303, + "grad_norm": 0.84765625, + "learning_rate": 0.00018679570997983102, + "loss": 0.935, + "step": 10280 + }, + { + "epoch": 0.26398725127222483, + "grad_norm": 0.95703125, + "learning_rate": 0.0001867934927958882, + "loss": 0.9184, + "step": 10281 + }, + { + "epoch": 0.2640129284681466, + "grad_norm": 0.8046875, + "learning_rate": 0.00018679127543897377, + "loss": 1.0576, + "step": 10282 + }, + { + "epoch": 0.26403860566406845, + "grad_norm": 0.828125, + "learning_rate": 0.00018678905790909216, + "loss": 0.9874, + "step": 10283 + }, + { + "epoch": 0.2640642828599903, + "grad_norm": 0.77734375, + "learning_rate": 0.00018678684020624782, + "loss": 0.9272, + "step": 10284 + }, + { + "epoch": 0.2640899600559121, + "grad_norm": 0.77734375, + "learning_rate": 0.0001867846223304451, + "loss": 0.989, + "step": 10285 + }, + { + "epoch": 0.2641156372518339, + "grad_norm": 0.75, + "learning_rate": 0.0001867824042816885, + "loss": 0.9092, + "step": 10286 + }, + { + "epoch": 0.26414131444775574, + "grad_norm": 0.87109375, + "learning_rate": 0.0001867801860599824, + "loss": 1.0112, + "step": 10287 + }, + { + "epoch": 0.2641669916436776, + "grad_norm": 0.87890625, + "learning_rate": 0.00018677796766533125, + "loss": 0.9677, + "step": 10288 + }, + { + "epoch": 0.26419266883959935, + "grad_norm": 0.84375, + "learning_rate": 0.00018677574909773943, + "loss": 1.1486, + "step": 10289 + }, + { + "epoch": 0.2642183460355212, + "grad_norm": 0.81640625, + "learning_rate": 0.00018677353035721138, + "loss": 1.0782, + "step": 10290 + }, + { + "epoch": 0.264244023231443, + "grad_norm": 0.859375, + "learning_rate": 0.00018677131144375153, + "loss": 1.0181, + "step": 10291 + }, + { + "epoch": 0.2642697004273648, + "grad_norm": 0.8828125, + "learning_rate": 0.00018676909235736426, + "loss": 0.9491, + "step": 10292 + }, + { + "epoch": 0.26429537762328664, + "grad_norm": 0.78125, + "learning_rate": 0.00018676687309805408, + "loss": 0.9909, + "step": 10293 + }, + { + "epoch": 0.2643210548192085, + "grad_norm": 0.85546875, + "learning_rate": 0.00018676465366582531, + "loss": 1.0187, + "step": 10294 + }, + { + "epoch": 0.2643467320151303, + "grad_norm": 0.8125, + "learning_rate": 0.00018676243406068246, + "loss": 0.9903, + "step": 10295 + }, + { + "epoch": 0.2643724092110521, + "grad_norm": 0.8984375, + "learning_rate": 0.0001867602142826299, + "loss": 1.0373, + "step": 10296 + }, + { + "epoch": 0.26439808640697393, + "grad_norm": 0.8125, + "learning_rate": 0.00018675799433167208, + "loss": 0.9514, + "step": 10297 + }, + { + "epoch": 0.26442376360289577, + "grad_norm": 0.7421875, + "learning_rate": 0.0001867557742078134, + "loss": 1.0147, + "step": 10298 + }, + { + "epoch": 0.26444944079881755, + "grad_norm": 0.7890625, + "learning_rate": 0.0001867535539110583, + "loss": 0.9401, + "step": 10299 + }, + { + "epoch": 0.2644751179947394, + "grad_norm": 0.85546875, + "learning_rate": 0.00018675133344141123, + "loss": 0.8865, + "step": 10300 + }, + { + "epoch": 0.2645007951906612, + "grad_norm": 0.76171875, + "learning_rate": 0.00018674911279887657, + "loss": 0.952, + "step": 10301 + }, + { + "epoch": 0.264526472386583, + "grad_norm": 0.84765625, + "learning_rate": 0.00018674689198345875, + "loss": 0.9484, + "step": 10302 + }, + { + "epoch": 0.26455214958250484, + "grad_norm": 0.77734375, + "learning_rate": 0.0001867446709951622, + "loss": 0.9358, + "step": 10303 + }, + { + "epoch": 0.2645778267784267, + "grad_norm": 0.83984375, + "learning_rate": 0.0001867424498339914, + "loss": 0.8557, + "step": 10304 + }, + { + "epoch": 0.2646035039743485, + "grad_norm": 0.8828125, + "learning_rate": 0.0001867402284999507, + "loss": 1.2047, + "step": 10305 + }, + { + "epoch": 0.2646291811702703, + "grad_norm": 0.8125, + "learning_rate": 0.00018673800699304458, + "loss": 1.0, + "step": 10306 + }, + { + "epoch": 0.2646548583661921, + "grad_norm": 1.015625, + "learning_rate": 0.00018673578531327745, + "loss": 1.0594, + "step": 10307 + }, + { + "epoch": 0.26468053556211396, + "grad_norm": 0.8203125, + "learning_rate": 0.00018673356346065372, + "loss": 1.1531, + "step": 10308 + }, + { + "epoch": 0.26470621275803574, + "grad_norm": 0.8125, + "learning_rate": 0.00018673134143517783, + "loss": 0.9904, + "step": 10309 + }, + { + "epoch": 0.2647318899539576, + "grad_norm": 0.7734375, + "learning_rate": 0.00018672911923685423, + "loss": 1.0375, + "step": 10310 + }, + { + "epoch": 0.2647575671498794, + "grad_norm": 0.8046875, + "learning_rate": 0.0001867268968656873, + "loss": 0.9171, + "step": 10311 + }, + { + "epoch": 0.2647832443458012, + "grad_norm": 0.73828125, + "learning_rate": 0.0001867246743216815, + "loss": 0.9854, + "step": 10312 + }, + { + "epoch": 0.26480892154172303, + "grad_norm": 0.7734375, + "learning_rate": 0.00018672245160484128, + "loss": 0.9527, + "step": 10313 + }, + { + "epoch": 0.26483459873764487, + "grad_norm": 0.84765625, + "learning_rate": 0.00018672022871517104, + "loss": 0.9315, + "step": 10314 + }, + { + "epoch": 0.2648602759335667, + "grad_norm": 0.8125, + "learning_rate": 0.0001867180056526752, + "loss": 1.0041, + "step": 10315 + }, + { + "epoch": 0.2648859531294885, + "grad_norm": 0.78515625, + "learning_rate": 0.00018671578241735826, + "loss": 1.0672, + "step": 10316 + }, + { + "epoch": 0.2649116303254103, + "grad_norm": 0.76953125, + "learning_rate": 0.00018671355900922457, + "loss": 0.9666, + "step": 10317 + }, + { + "epoch": 0.26493730752133215, + "grad_norm": 0.8046875, + "learning_rate": 0.00018671133542827857, + "loss": 0.8516, + "step": 10318 + }, + { + "epoch": 0.26496298471725394, + "grad_norm": 0.8203125, + "learning_rate": 0.0001867091116745247, + "loss": 1.02, + "step": 10319 + }, + { + "epoch": 0.26498866191317577, + "grad_norm": 0.7734375, + "learning_rate": 0.00018670688774796745, + "loss": 0.8623, + "step": 10320 + }, + { + "epoch": 0.2650143391090976, + "grad_norm": 0.8203125, + "learning_rate": 0.00018670466364861116, + "loss": 0.9492, + "step": 10321 + }, + { + "epoch": 0.2650400163050194, + "grad_norm": 0.81640625, + "learning_rate": 0.00018670243937646033, + "loss": 0.9733, + "step": 10322 + }, + { + "epoch": 0.2650656935009412, + "grad_norm": 0.984375, + "learning_rate": 0.0001867002149315194, + "loss": 1.0687, + "step": 10323 + }, + { + "epoch": 0.26509137069686306, + "grad_norm": 0.734375, + "learning_rate": 0.00018669799031379274, + "loss": 0.8807, + "step": 10324 + }, + { + "epoch": 0.2651170478927849, + "grad_norm": 0.8046875, + "learning_rate": 0.00018669576552328483, + "loss": 0.9453, + "step": 10325 + }, + { + "epoch": 0.2651427250887067, + "grad_norm": 0.84765625, + "learning_rate": 0.00018669354056000006, + "loss": 0.877, + "step": 10326 + }, + { + "epoch": 0.2651684022846285, + "grad_norm": 0.828125, + "learning_rate": 0.0001866913154239429, + "loss": 0.871, + "step": 10327 + }, + { + "epoch": 0.26519407948055035, + "grad_norm": 0.8359375, + "learning_rate": 0.00018668909011511782, + "loss": 1.0253, + "step": 10328 + }, + { + "epoch": 0.26521975667647213, + "grad_norm": 0.9140625, + "learning_rate": 0.00018668686463352917, + "loss": 0.9879, + "step": 10329 + }, + { + "epoch": 0.26524543387239397, + "grad_norm": 0.7734375, + "learning_rate": 0.00018668463897918147, + "loss": 1.0328, + "step": 10330 + }, + { + "epoch": 0.2652711110683158, + "grad_norm": 0.81640625, + "learning_rate": 0.0001866824131520791, + "loss": 1.0939, + "step": 10331 + }, + { + "epoch": 0.2652967882642376, + "grad_norm": 0.76171875, + "learning_rate": 0.0001866801871522265, + "loss": 0.9969, + "step": 10332 + }, + { + "epoch": 0.2653224654601594, + "grad_norm": 0.70703125, + "learning_rate": 0.0001866779609796281, + "loss": 0.8569, + "step": 10333 + }, + { + "epoch": 0.26534814265608125, + "grad_norm": 0.78515625, + "learning_rate": 0.00018667573463428838, + "loss": 0.9193, + "step": 10334 + }, + { + "epoch": 0.26537381985200303, + "grad_norm": 0.80859375, + "learning_rate": 0.00018667350811621172, + "loss": 0.916, + "step": 10335 + }, + { + "epoch": 0.26539949704792487, + "grad_norm": 0.76171875, + "learning_rate": 0.0001866712814254026, + "loss": 0.9452, + "step": 10336 + }, + { + "epoch": 0.2654251742438467, + "grad_norm": 0.7421875, + "learning_rate": 0.0001866690545618655, + "loss": 0.999, + "step": 10337 + }, + { + "epoch": 0.26545085143976854, + "grad_norm": 0.73828125, + "learning_rate": 0.00018666682752560475, + "loss": 0.8813, + "step": 10338 + }, + { + "epoch": 0.2654765286356903, + "grad_norm": 0.79296875, + "learning_rate": 0.00018666460031662485, + "loss": 1.1299, + "step": 10339 + }, + { + "epoch": 0.26550220583161216, + "grad_norm": 0.703125, + "learning_rate": 0.00018666237293493023, + "loss": 0.9443, + "step": 10340 + }, + { + "epoch": 0.265527883027534, + "grad_norm": 0.828125, + "learning_rate": 0.00018666014538052533, + "loss": 1.0757, + "step": 10341 + }, + { + "epoch": 0.2655535602234558, + "grad_norm": 0.828125, + "learning_rate": 0.00018665791765341459, + "loss": 1.0687, + "step": 10342 + }, + { + "epoch": 0.2655792374193776, + "grad_norm": 0.828125, + "learning_rate": 0.00018665568975360244, + "loss": 1.1108, + "step": 10343 + }, + { + "epoch": 0.26560491461529945, + "grad_norm": 1.140625, + "learning_rate": 0.00018665346168109332, + "loss": 0.8763, + "step": 10344 + }, + { + "epoch": 0.26563059181122123, + "grad_norm": 0.7734375, + "learning_rate": 0.0001866512334358917, + "loss": 1.1192, + "step": 10345 + }, + { + "epoch": 0.26565626900714306, + "grad_norm": 0.77734375, + "learning_rate": 0.00018664900501800199, + "loss": 1.0315, + "step": 10346 + }, + { + "epoch": 0.2656819462030649, + "grad_norm": 0.8515625, + "learning_rate": 0.00018664677642742865, + "loss": 0.9702, + "step": 10347 + }, + { + "epoch": 0.26570762339898674, + "grad_norm": 0.75390625, + "learning_rate": 0.0001866445476641761, + "loss": 0.968, + "step": 10348 + }, + { + "epoch": 0.2657333005949085, + "grad_norm": 0.82421875, + "learning_rate": 0.0001866423187282488, + "loss": 0.8975, + "step": 10349 + }, + { + "epoch": 0.26575897779083035, + "grad_norm": 0.796875, + "learning_rate": 0.00018664008961965118, + "loss": 0.8028, + "step": 10350 + }, + { + "epoch": 0.2657846549867522, + "grad_norm": 0.71875, + "learning_rate": 0.00018663786033838769, + "loss": 0.9954, + "step": 10351 + }, + { + "epoch": 0.26581033218267397, + "grad_norm": 0.81640625, + "learning_rate": 0.00018663563088446274, + "loss": 0.9059, + "step": 10352 + }, + { + "epoch": 0.2658360093785958, + "grad_norm": 0.875, + "learning_rate": 0.00018663340125788086, + "loss": 0.9697, + "step": 10353 + }, + { + "epoch": 0.26586168657451764, + "grad_norm": 0.7890625, + "learning_rate": 0.0001866311714586464, + "loss": 1.2038, + "step": 10354 + }, + { + "epoch": 0.2658873637704394, + "grad_norm": 0.796875, + "learning_rate": 0.00018662894148676384, + "loss": 0.9021, + "step": 10355 + }, + { + "epoch": 0.26591304096636126, + "grad_norm": 0.8515625, + "learning_rate": 0.00018662671134223765, + "loss": 1.3137, + "step": 10356 + }, + { + "epoch": 0.2659387181622831, + "grad_norm": 0.75, + "learning_rate": 0.0001866244810250722, + "loss": 1.0229, + "step": 10357 + }, + { + "epoch": 0.26596439535820493, + "grad_norm": 0.7578125, + "learning_rate": 0.00018662225053527203, + "loss": 1.1868, + "step": 10358 + }, + { + "epoch": 0.2659900725541267, + "grad_norm": 0.84765625, + "learning_rate": 0.00018662001987284152, + "loss": 0.9539, + "step": 10359 + }, + { + "epoch": 0.26601574975004855, + "grad_norm": 0.7890625, + "learning_rate": 0.00018661778903778515, + "loss": 1.1288, + "step": 10360 + }, + { + "epoch": 0.2660414269459704, + "grad_norm": 0.80078125, + "learning_rate": 0.00018661555803010734, + "loss": 1.0637, + "step": 10361 + }, + { + "epoch": 0.26606710414189216, + "grad_norm": 0.8203125, + "learning_rate": 0.00018661332684981257, + "loss": 1.0097, + "step": 10362 + }, + { + "epoch": 0.266092781337814, + "grad_norm": 0.859375, + "learning_rate": 0.00018661109549690524, + "loss": 0.8789, + "step": 10363 + }, + { + "epoch": 0.26611845853373584, + "grad_norm": 0.76953125, + "learning_rate": 0.0001866088639713898, + "loss": 0.8834, + "step": 10364 + }, + { + "epoch": 0.2661441357296576, + "grad_norm": 0.83984375, + "learning_rate": 0.00018660663227327074, + "loss": 0.9465, + "step": 10365 + }, + { + "epoch": 0.26616981292557945, + "grad_norm": 0.7265625, + "learning_rate": 0.0001866044004025525, + "loss": 0.9468, + "step": 10366 + }, + { + "epoch": 0.2661954901215013, + "grad_norm": 0.77734375, + "learning_rate": 0.00018660216835923946, + "loss": 0.987, + "step": 10367 + }, + { + "epoch": 0.2662211673174231, + "grad_norm": 0.765625, + "learning_rate": 0.00018659993614333618, + "loss": 0.8926, + "step": 10368 + }, + { + "epoch": 0.2662468445133449, + "grad_norm": 0.8203125, + "learning_rate": 0.00018659770375484703, + "loss": 1.019, + "step": 10369 + }, + { + "epoch": 0.26627252170926674, + "grad_norm": 0.75, + "learning_rate": 0.00018659547119377647, + "loss": 0.9418, + "step": 10370 + }, + { + "epoch": 0.2662981989051886, + "grad_norm": 0.70703125, + "learning_rate": 0.00018659323846012898, + "loss": 1.0161, + "step": 10371 + }, + { + "epoch": 0.26632387610111036, + "grad_norm": 0.84375, + "learning_rate": 0.00018659100555390895, + "loss": 0.9742, + "step": 10372 + }, + { + "epoch": 0.2663495532970322, + "grad_norm": 0.8515625, + "learning_rate": 0.0001865887724751209, + "loss": 1.0545, + "step": 10373 + }, + { + "epoch": 0.26637523049295403, + "grad_norm": 0.79296875, + "learning_rate": 0.00018658653922376927, + "loss": 1.0493, + "step": 10374 + }, + { + "epoch": 0.2664009076888758, + "grad_norm": 0.8046875, + "learning_rate": 0.00018658430579985842, + "loss": 0.8235, + "step": 10375 + }, + { + "epoch": 0.26642658488479765, + "grad_norm": 0.74609375, + "learning_rate": 0.0001865820722033929, + "loss": 1.0592, + "step": 10376 + }, + { + "epoch": 0.2664522620807195, + "grad_norm": 0.76953125, + "learning_rate": 0.00018657983843437713, + "loss": 0.9991, + "step": 10377 + }, + { + "epoch": 0.2664779392766413, + "grad_norm": 0.8203125, + "learning_rate": 0.00018657760449281558, + "loss": 0.9085, + "step": 10378 + }, + { + "epoch": 0.2665036164725631, + "grad_norm": 0.828125, + "learning_rate": 0.0001865753703787127, + "loss": 0.8819, + "step": 10379 + }, + { + "epoch": 0.26652929366848493, + "grad_norm": 0.734375, + "learning_rate": 0.00018657313609207288, + "loss": 0.9924, + "step": 10380 + }, + { + "epoch": 0.26655497086440677, + "grad_norm": 0.84375, + "learning_rate": 0.00018657090163290066, + "loss": 1.063, + "step": 10381 + }, + { + "epoch": 0.26658064806032855, + "grad_norm": 0.79296875, + "learning_rate": 0.00018656866700120044, + "loss": 1.1016, + "step": 10382 + }, + { + "epoch": 0.2666063252562504, + "grad_norm": 0.80078125, + "learning_rate": 0.00018656643219697668, + "loss": 1.041, + "step": 10383 + }, + { + "epoch": 0.2666320024521722, + "grad_norm": 0.8125, + "learning_rate": 0.00018656419722023384, + "loss": 0.94, + "step": 10384 + }, + { + "epoch": 0.266657679648094, + "grad_norm": 0.81640625, + "learning_rate": 0.00018656196207097637, + "loss": 0.9422, + "step": 10385 + }, + { + "epoch": 0.26668335684401584, + "grad_norm": 0.87109375, + "learning_rate": 0.00018655972674920873, + "loss": 1.0046, + "step": 10386 + }, + { + "epoch": 0.2667090340399377, + "grad_norm": 0.796875, + "learning_rate": 0.00018655749125493537, + "loss": 0.9362, + "step": 10387 + }, + { + "epoch": 0.2667347112358595, + "grad_norm": 0.76953125, + "learning_rate": 0.00018655525558816075, + "loss": 0.8668, + "step": 10388 + }, + { + "epoch": 0.2667603884317813, + "grad_norm": 0.76171875, + "learning_rate": 0.00018655301974888935, + "loss": 0.9387, + "step": 10389 + }, + { + "epoch": 0.26678606562770313, + "grad_norm": 0.8515625, + "learning_rate": 0.00018655078373712558, + "loss": 0.9882, + "step": 10390 + }, + { + "epoch": 0.26681174282362496, + "grad_norm": 0.91015625, + "learning_rate": 0.00018654854755287392, + "loss": 0.9565, + "step": 10391 + }, + { + "epoch": 0.26683742001954674, + "grad_norm": 0.8671875, + "learning_rate": 0.00018654631119613882, + "loss": 1.0274, + "step": 10392 + }, + { + "epoch": 0.2668630972154686, + "grad_norm": 0.79296875, + "learning_rate": 0.00018654407466692474, + "loss": 0.8801, + "step": 10393 + }, + { + "epoch": 0.2668887744113904, + "grad_norm": 0.73046875, + "learning_rate": 0.00018654183796523613, + "loss": 0.9118, + "step": 10394 + }, + { + "epoch": 0.2669144516073122, + "grad_norm": 0.83203125, + "learning_rate": 0.00018653960109107746, + "loss": 1.1671, + "step": 10395 + }, + { + "epoch": 0.26694012880323403, + "grad_norm": 0.796875, + "learning_rate": 0.00018653736404445316, + "loss": 1.0132, + "step": 10396 + }, + { + "epoch": 0.26696580599915587, + "grad_norm": 0.80859375, + "learning_rate": 0.00018653512682536777, + "loss": 1.0172, + "step": 10397 + }, + { + "epoch": 0.2669914831950777, + "grad_norm": 0.84765625, + "learning_rate": 0.00018653288943382563, + "loss": 1.0819, + "step": 10398 + }, + { + "epoch": 0.2670171603909995, + "grad_norm": 0.7890625, + "learning_rate": 0.00018653065186983127, + "loss": 0.9197, + "step": 10399 + }, + { + "epoch": 0.2670428375869213, + "grad_norm": 0.8984375, + "learning_rate": 0.00018652841413338916, + "loss": 0.9104, + "step": 10400 + }, + { + "epoch": 0.26706851478284316, + "grad_norm": 0.78515625, + "learning_rate": 0.00018652617622450372, + "loss": 0.8631, + "step": 10401 + }, + { + "epoch": 0.26709419197876494, + "grad_norm": 0.86328125, + "learning_rate": 0.00018652393814317942, + "loss": 1.0322, + "step": 10402 + }, + { + "epoch": 0.2671198691746868, + "grad_norm": 0.79296875, + "learning_rate": 0.00018652169988942074, + "loss": 1.0739, + "step": 10403 + }, + { + "epoch": 0.2671455463706086, + "grad_norm": 0.80078125, + "learning_rate": 0.00018651946146323214, + "loss": 1.0206, + "step": 10404 + }, + { + "epoch": 0.2671712235665304, + "grad_norm": 0.81640625, + "learning_rate": 0.00018651722286461804, + "loss": 0.833, + "step": 10405 + }, + { + "epoch": 0.2671969007624522, + "grad_norm": 0.83203125, + "learning_rate": 0.00018651498409358295, + "loss": 1.1824, + "step": 10406 + }, + { + "epoch": 0.26722257795837406, + "grad_norm": 0.76171875, + "learning_rate": 0.0001865127451501313, + "loss": 1.0499, + "step": 10407 + }, + { + "epoch": 0.2672482551542959, + "grad_norm": 0.80859375, + "learning_rate": 0.00018651050603426755, + "loss": 1.0207, + "step": 10408 + }, + { + "epoch": 0.2672739323502177, + "grad_norm": 0.765625, + "learning_rate": 0.00018650826674599622, + "loss": 0.8732, + "step": 10409 + }, + { + "epoch": 0.2672996095461395, + "grad_norm": 0.78515625, + "learning_rate": 0.0001865060272853217, + "loss": 1.0025, + "step": 10410 + }, + { + "epoch": 0.26732528674206135, + "grad_norm": 0.7421875, + "learning_rate": 0.00018650378765224846, + "loss": 0.9546, + "step": 10411 + }, + { + "epoch": 0.26735096393798313, + "grad_norm": 0.76953125, + "learning_rate": 0.000186501547846781, + "loss": 0.9854, + "step": 10412 + }, + { + "epoch": 0.26737664113390497, + "grad_norm": 0.8046875, + "learning_rate": 0.0001864993078689238, + "loss": 1.1292, + "step": 10413 + }, + { + "epoch": 0.2674023183298268, + "grad_norm": 0.9375, + "learning_rate": 0.00018649706771868126, + "loss": 0.949, + "step": 10414 + }, + { + "epoch": 0.2674279955257486, + "grad_norm": 0.86328125, + "learning_rate": 0.0001864948273960579, + "loss": 0.9129, + "step": 10415 + }, + { + "epoch": 0.2674536727216704, + "grad_norm": 0.859375, + "learning_rate": 0.00018649258690105815, + "loss": 0.9851, + "step": 10416 + }, + { + "epoch": 0.26747934991759226, + "grad_norm": 0.77734375, + "learning_rate": 0.00018649034623368648, + "loss": 1.0269, + "step": 10417 + }, + { + "epoch": 0.2675050271135141, + "grad_norm": 0.78515625, + "learning_rate": 0.00018648810539394737, + "loss": 0.9897, + "step": 10418 + }, + { + "epoch": 0.2675307043094359, + "grad_norm": 0.81640625, + "learning_rate": 0.00018648586438184528, + "loss": 1.0244, + "step": 10419 + }, + { + "epoch": 0.2675563815053577, + "grad_norm": 0.80078125, + "learning_rate": 0.00018648362319738468, + "loss": 0.8935, + "step": 10420 + }, + { + "epoch": 0.26758205870127955, + "grad_norm": 0.81640625, + "learning_rate": 0.00018648138184057, + "loss": 1.0598, + "step": 10421 + }, + { + "epoch": 0.2676077358972013, + "grad_norm": 0.7421875, + "learning_rate": 0.00018647914031140575, + "loss": 0.8743, + "step": 10422 + }, + { + "epoch": 0.26763341309312316, + "grad_norm": 0.8359375, + "learning_rate": 0.0001864768986098964, + "loss": 0.9193, + "step": 10423 + }, + { + "epoch": 0.267659090289045, + "grad_norm": 0.78125, + "learning_rate": 0.00018647465673604644, + "loss": 0.9894, + "step": 10424 + }, + { + "epoch": 0.2676847674849668, + "grad_norm": 0.8671875, + "learning_rate": 0.00018647241468986025, + "loss": 1.0675, + "step": 10425 + }, + { + "epoch": 0.2677104446808886, + "grad_norm": 0.7734375, + "learning_rate": 0.00018647017247134235, + "loss": 0.9448, + "step": 10426 + }, + { + "epoch": 0.26773612187681045, + "grad_norm": 0.78515625, + "learning_rate": 0.0001864679300804972, + "loss": 0.8631, + "step": 10427 + }, + { + "epoch": 0.2677617990727323, + "grad_norm": 0.859375, + "learning_rate": 0.0001864656875173293, + "loss": 0.9418, + "step": 10428 + }, + { + "epoch": 0.26778747626865407, + "grad_norm": 0.83984375, + "learning_rate": 0.0001864634447818431, + "loss": 1.1961, + "step": 10429 + }, + { + "epoch": 0.2678131534645759, + "grad_norm": 0.8203125, + "learning_rate": 0.00018646120187404302, + "loss": 0.9346, + "step": 10430 + }, + { + "epoch": 0.26783883066049774, + "grad_norm": 0.82421875, + "learning_rate": 0.00018645895879393365, + "loss": 1.0646, + "step": 10431 + }, + { + "epoch": 0.2678645078564195, + "grad_norm": 0.81640625, + "learning_rate": 0.00018645671554151934, + "loss": 0.9869, + "step": 10432 + }, + { + "epoch": 0.26789018505234136, + "grad_norm": 0.796875, + "learning_rate": 0.00018645447211680457, + "loss": 0.8882, + "step": 10433 + }, + { + "epoch": 0.2679158622482632, + "grad_norm": 0.765625, + "learning_rate": 0.0001864522285197939, + "loss": 0.9501, + "step": 10434 + }, + { + "epoch": 0.267941539444185, + "grad_norm": 1.0, + "learning_rate": 0.00018644998475049173, + "loss": 1.1064, + "step": 10435 + }, + { + "epoch": 0.2679672166401068, + "grad_norm": 0.859375, + "learning_rate": 0.00018644774080890254, + "loss": 1.0861, + "step": 10436 + }, + { + "epoch": 0.26799289383602865, + "grad_norm": 0.8515625, + "learning_rate": 0.00018644549669503082, + "loss": 0.9737, + "step": 10437 + }, + { + "epoch": 0.2680185710319505, + "grad_norm": 0.828125, + "learning_rate": 0.00018644325240888104, + "loss": 1.0487, + "step": 10438 + }, + { + "epoch": 0.26804424822787226, + "grad_norm": 0.73828125, + "learning_rate": 0.00018644100795045767, + "loss": 0.8533, + "step": 10439 + }, + { + "epoch": 0.2680699254237941, + "grad_norm": 0.984375, + "learning_rate": 0.00018643876331976515, + "loss": 0.9641, + "step": 10440 + }, + { + "epoch": 0.26809560261971593, + "grad_norm": 0.765625, + "learning_rate": 0.00018643651851680802, + "loss": 1.0191, + "step": 10441 + }, + { + "epoch": 0.2681212798156377, + "grad_norm": 0.78125, + "learning_rate": 0.0001864342735415907, + "loss": 1.0792, + "step": 10442 + }, + { + "epoch": 0.26814695701155955, + "grad_norm": 0.76171875, + "learning_rate": 0.00018643202839411767, + "loss": 0.973, + "step": 10443 + }, + { + "epoch": 0.2681726342074814, + "grad_norm": 0.75390625, + "learning_rate": 0.00018642978307439341, + "loss": 0.959, + "step": 10444 + }, + { + "epoch": 0.26819831140340317, + "grad_norm": 0.8359375, + "learning_rate": 0.00018642753758242245, + "loss": 1.0231, + "step": 10445 + }, + { + "epoch": 0.268223988599325, + "grad_norm": 0.7109375, + "learning_rate": 0.00018642529191820913, + "loss": 0.9892, + "step": 10446 + }, + { + "epoch": 0.26824966579524684, + "grad_norm": 0.8125, + "learning_rate": 0.00018642304608175808, + "loss": 0.8974, + "step": 10447 + }, + { + "epoch": 0.2682753429911687, + "grad_norm": 0.77734375, + "learning_rate": 0.0001864208000730737, + "loss": 1.1228, + "step": 10448 + }, + { + "epoch": 0.26830102018709046, + "grad_norm": 0.8359375, + "learning_rate": 0.00018641855389216045, + "loss": 1.0225, + "step": 10449 + }, + { + "epoch": 0.2683266973830123, + "grad_norm": 0.7890625, + "learning_rate": 0.00018641630753902285, + "loss": 1.0646, + "step": 10450 + }, + { + "epoch": 0.2683523745789341, + "grad_norm": 0.859375, + "learning_rate": 0.00018641406101366532, + "loss": 0.9531, + "step": 10451 + }, + { + "epoch": 0.2683780517748559, + "grad_norm": 0.82421875, + "learning_rate": 0.0001864118143160924, + "loss": 1.0988, + "step": 10452 + }, + { + "epoch": 0.26840372897077774, + "grad_norm": 0.80078125, + "learning_rate": 0.00018640956744630854, + "loss": 0.9388, + "step": 10453 + }, + { + "epoch": 0.2684294061666996, + "grad_norm": 0.76953125, + "learning_rate": 0.00018640732040431818, + "loss": 0.9322, + "step": 10454 + }, + { + "epoch": 0.26845508336262136, + "grad_norm": 0.84765625, + "learning_rate": 0.0001864050731901259, + "loss": 1.1541, + "step": 10455 + }, + { + "epoch": 0.2684807605585432, + "grad_norm": 0.796875, + "learning_rate": 0.00018640282580373606, + "loss": 0.9139, + "step": 10456 + }, + { + "epoch": 0.26850643775446503, + "grad_norm": 1.140625, + "learning_rate": 0.0001864005782451532, + "loss": 1.0662, + "step": 10457 + }, + { + "epoch": 0.26853211495038687, + "grad_norm": 0.80078125, + "learning_rate": 0.00018639833051438181, + "loss": 0.9835, + "step": 10458 + }, + { + "epoch": 0.26855779214630865, + "grad_norm": 0.8828125, + "learning_rate": 0.00018639608261142635, + "loss": 0.9334, + "step": 10459 + }, + { + "epoch": 0.2685834693422305, + "grad_norm": 0.734375, + "learning_rate": 0.00018639383453629128, + "loss": 1.105, + "step": 10460 + }, + { + "epoch": 0.2686091465381523, + "grad_norm": 0.77734375, + "learning_rate": 0.00018639158628898112, + "loss": 0.9117, + "step": 10461 + }, + { + "epoch": 0.2686348237340741, + "grad_norm": 0.75, + "learning_rate": 0.00018638933786950036, + "loss": 0.8424, + "step": 10462 + }, + { + "epoch": 0.26866050092999594, + "grad_norm": 0.859375, + "learning_rate": 0.0001863870892778534, + "loss": 1.0764, + "step": 10463 + }, + { + "epoch": 0.2686861781259178, + "grad_norm": 1.0703125, + "learning_rate": 0.00018638484051404478, + "loss": 0.9123, + "step": 10464 + }, + { + "epoch": 0.26871185532183955, + "grad_norm": 0.859375, + "learning_rate": 0.00018638259157807902, + "loss": 0.986, + "step": 10465 + }, + { + "epoch": 0.2687375325177614, + "grad_norm": 0.8203125, + "learning_rate": 0.00018638034246996053, + "loss": 0.9494, + "step": 10466 + }, + { + "epoch": 0.2687632097136832, + "grad_norm": 0.8125, + "learning_rate": 0.00018637809318969382, + "loss": 1.0561, + "step": 10467 + }, + { + "epoch": 0.26878888690960506, + "grad_norm": 0.984375, + "learning_rate": 0.00018637584373728337, + "loss": 1.0513, + "step": 10468 + }, + { + "epoch": 0.26881456410552684, + "grad_norm": 0.88671875, + "learning_rate": 0.00018637359411273369, + "loss": 1.2148, + "step": 10469 + }, + { + "epoch": 0.2688402413014487, + "grad_norm": 0.8203125, + "learning_rate": 0.00018637134431604918, + "loss": 0.9322, + "step": 10470 + }, + { + "epoch": 0.2688659184973705, + "grad_norm": 0.83984375, + "learning_rate": 0.00018636909434723443, + "loss": 1.0677, + "step": 10471 + }, + { + "epoch": 0.2688915956932923, + "grad_norm": 0.8515625, + "learning_rate": 0.00018636684420629388, + "loss": 1.0475, + "step": 10472 + }, + { + "epoch": 0.26891727288921413, + "grad_norm": 0.78515625, + "learning_rate": 0.000186364593893232, + "loss": 0.8947, + "step": 10473 + }, + { + "epoch": 0.26894295008513597, + "grad_norm": 0.76171875, + "learning_rate": 0.0001863623434080533, + "loss": 0.9005, + "step": 10474 + }, + { + "epoch": 0.26896862728105775, + "grad_norm": 0.828125, + "learning_rate": 0.00018636009275076223, + "loss": 0.962, + "step": 10475 + }, + { + "epoch": 0.2689943044769796, + "grad_norm": 0.87109375, + "learning_rate": 0.00018635784192136328, + "loss": 1.0, + "step": 10476 + }, + { + "epoch": 0.2690199816729014, + "grad_norm": 0.8671875, + "learning_rate": 0.00018635559091986098, + "loss": 1.0113, + "step": 10477 + }, + { + "epoch": 0.26904565886882326, + "grad_norm": 0.7578125, + "learning_rate": 0.0001863533397462598, + "loss": 1.0137, + "step": 10478 + }, + { + "epoch": 0.26907133606474504, + "grad_norm": 0.76953125, + "learning_rate": 0.00018635108840056418, + "loss": 1.0323, + "step": 10479 + }, + { + "epoch": 0.2690970132606669, + "grad_norm": 0.8828125, + "learning_rate": 0.00018634883688277865, + "loss": 1.0002, + "step": 10480 + }, + { + "epoch": 0.2691226904565887, + "grad_norm": 0.7578125, + "learning_rate": 0.0001863465851929077, + "loss": 0.9538, + "step": 10481 + }, + { + "epoch": 0.2691483676525105, + "grad_norm": 0.765625, + "learning_rate": 0.0001863443333309558, + "loss": 0.9575, + "step": 10482 + }, + { + "epoch": 0.2691740448484323, + "grad_norm": 0.8671875, + "learning_rate": 0.00018634208129692743, + "loss": 1.0415, + "step": 10483 + }, + { + "epoch": 0.26919972204435416, + "grad_norm": 0.8671875, + "learning_rate": 0.00018633982909082713, + "loss": 1.0429, + "step": 10484 + }, + { + "epoch": 0.26922539924027594, + "grad_norm": 0.859375, + "learning_rate": 0.0001863375767126593, + "loss": 1.0381, + "step": 10485 + }, + { + "epoch": 0.2692510764361978, + "grad_norm": 0.8125, + "learning_rate": 0.00018633532416242852, + "loss": 1.0513, + "step": 10486 + }, + { + "epoch": 0.2692767536321196, + "grad_norm": 0.8359375, + "learning_rate": 0.00018633307144013924, + "loss": 1.042, + "step": 10487 + }, + { + "epoch": 0.26930243082804145, + "grad_norm": 0.8125, + "learning_rate": 0.0001863308185457959, + "loss": 1.001, + "step": 10488 + }, + { + "epoch": 0.26932810802396323, + "grad_norm": 0.87109375, + "learning_rate": 0.00018632856547940306, + "loss": 0.9728, + "step": 10489 + }, + { + "epoch": 0.26935378521988507, + "grad_norm": 0.796875, + "learning_rate": 0.00018632631224096518, + "loss": 0.9356, + "step": 10490 + }, + { + "epoch": 0.2693794624158069, + "grad_norm": 0.80078125, + "learning_rate": 0.0001863240588304868, + "loss": 0.985, + "step": 10491 + }, + { + "epoch": 0.2694051396117287, + "grad_norm": 0.765625, + "learning_rate": 0.0001863218052479723, + "loss": 1.0473, + "step": 10492 + }, + { + "epoch": 0.2694308168076505, + "grad_norm": 0.87109375, + "learning_rate": 0.00018631955149342631, + "loss": 1.1029, + "step": 10493 + }, + { + "epoch": 0.26945649400357236, + "grad_norm": 0.80859375, + "learning_rate": 0.00018631729756685318, + "loss": 1.0651, + "step": 10494 + }, + { + "epoch": 0.26948217119949414, + "grad_norm": 0.7734375, + "learning_rate": 0.00018631504346825755, + "loss": 1.1154, + "step": 10495 + }, + { + "epoch": 0.26950784839541597, + "grad_norm": 0.81640625, + "learning_rate": 0.00018631278919764376, + "loss": 1.0002, + "step": 10496 + }, + { + "epoch": 0.2695335255913378, + "grad_norm": 0.80078125, + "learning_rate": 0.00018631053475501644, + "loss": 1.0674, + "step": 10497 + }, + { + "epoch": 0.26955920278725964, + "grad_norm": 0.76953125, + "learning_rate": 0.00018630828014038, + "loss": 1.0004, + "step": 10498 + }, + { + "epoch": 0.2695848799831814, + "grad_norm": 0.96484375, + "learning_rate": 0.00018630602535373893, + "loss": 1.0548, + "step": 10499 + }, + { + "epoch": 0.26961055717910326, + "grad_norm": 0.79296875, + "learning_rate": 0.00018630377039509775, + "loss": 1.0529, + "step": 10500 + }, + { + "epoch": 0.2696362343750251, + "grad_norm": 0.765625, + "learning_rate": 0.00018630151526446097, + "loss": 0.9076, + "step": 10501 + }, + { + "epoch": 0.2696619115709469, + "grad_norm": 0.8125, + "learning_rate": 0.00018629925996183308, + "loss": 1.1129, + "step": 10502 + }, + { + "epoch": 0.2696875887668687, + "grad_norm": 0.7421875, + "learning_rate": 0.00018629700448721855, + "loss": 1.0236, + "step": 10503 + }, + { + "epoch": 0.26971326596279055, + "grad_norm": 0.83984375, + "learning_rate": 0.00018629474884062188, + "loss": 0.9502, + "step": 10504 + }, + { + "epoch": 0.26973894315871233, + "grad_norm": 0.7734375, + "learning_rate": 0.00018629249302204754, + "loss": 0.9683, + "step": 10505 + }, + { + "epoch": 0.26976462035463417, + "grad_norm": 0.84765625, + "learning_rate": 0.0001862902370315001, + "loss": 0.8711, + "step": 10506 + }, + { + "epoch": 0.269790297550556, + "grad_norm": 0.875, + "learning_rate": 0.000186287980868984, + "loss": 1.0186, + "step": 10507 + }, + { + "epoch": 0.26981597474647784, + "grad_norm": 0.8984375, + "learning_rate": 0.00018628572453450372, + "loss": 1.1041, + "step": 10508 + }, + { + "epoch": 0.2698416519423996, + "grad_norm": 0.7890625, + "learning_rate": 0.00018628346802806382, + "loss": 0.9372, + "step": 10509 + }, + { + "epoch": 0.26986732913832145, + "grad_norm": 0.91796875, + "learning_rate": 0.00018628121134966875, + "loss": 0.9755, + "step": 10510 + }, + { + "epoch": 0.2698930063342433, + "grad_norm": 0.875, + "learning_rate": 0.00018627895449932304, + "loss": 1.0136, + "step": 10511 + }, + { + "epoch": 0.26991868353016507, + "grad_norm": 0.8515625, + "learning_rate": 0.00018627669747703113, + "loss": 0.9035, + "step": 10512 + }, + { + "epoch": 0.2699443607260869, + "grad_norm": 0.78125, + "learning_rate": 0.00018627444028279758, + "loss": 0.9336, + "step": 10513 + }, + { + "epoch": 0.26997003792200874, + "grad_norm": 0.90234375, + "learning_rate": 0.00018627218291662688, + "loss": 1.0172, + "step": 10514 + }, + { + "epoch": 0.2699957151179305, + "grad_norm": 0.86328125, + "learning_rate": 0.00018626992537852348, + "loss": 1.076, + "step": 10515 + }, + { + "epoch": 0.27002139231385236, + "grad_norm": 0.890625, + "learning_rate": 0.00018626766766849192, + "loss": 1.0563, + "step": 10516 + }, + { + "epoch": 0.2700470695097742, + "grad_norm": 0.80078125, + "learning_rate": 0.00018626540978653667, + "loss": 0.9583, + "step": 10517 + }, + { + "epoch": 0.27007274670569603, + "grad_norm": 0.78125, + "learning_rate": 0.00018626315173266228, + "loss": 1.0342, + "step": 10518 + }, + { + "epoch": 0.2700984239016178, + "grad_norm": 1.046875, + "learning_rate": 0.00018626089350687322, + "loss": 0.9605, + "step": 10519 + }, + { + "epoch": 0.27012410109753965, + "grad_norm": 0.80078125, + "learning_rate": 0.00018625863510917398, + "loss": 1.1222, + "step": 10520 + }, + { + "epoch": 0.2701497782934615, + "grad_norm": 0.7734375, + "learning_rate": 0.00018625637653956907, + "loss": 1.0528, + "step": 10521 + }, + { + "epoch": 0.27017545548938326, + "grad_norm": 0.73828125, + "learning_rate": 0.000186254117798063, + "loss": 0.9442, + "step": 10522 + }, + { + "epoch": 0.2702011326853051, + "grad_norm": 0.78125, + "learning_rate": 0.00018625185888466027, + "loss": 0.9203, + "step": 10523 + }, + { + "epoch": 0.27022680988122694, + "grad_norm": 0.86328125, + "learning_rate": 0.00018624959979936538, + "loss": 0.985, + "step": 10524 + }, + { + "epoch": 0.2702524870771487, + "grad_norm": 0.84765625, + "learning_rate": 0.0001862473405421828, + "loss": 0.9889, + "step": 10525 + }, + { + "epoch": 0.27027816427307055, + "grad_norm": 0.875, + "learning_rate": 0.0001862450811131171, + "loss": 1.1263, + "step": 10526 + }, + { + "epoch": 0.2703038414689924, + "grad_norm": 0.7265625, + "learning_rate": 0.00018624282151217272, + "loss": 1.0249, + "step": 10527 + }, + { + "epoch": 0.2703295186649142, + "grad_norm": 0.7578125, + "learning_rate": 0.0001862405617393542, + "loss": 1.0828, + "step": 10528 + }, + { + "epoch": 0.270355195860836, + "grad_norm": 0.890625, + "learning_rate": 0.00018623830179466603, + "loss": 1.0246, + "step": 10529 + }, + { + "epoch": 0.27038087305675784, + "grad_norm": 0.83203125, + "learning_rate": 0.00018623604167811267, + "loss": 0.936, + "step": 10530 + }, + { + "epoch": 0.2704065502526797, + "grad_norm": 0.8203125, + "learning_rate": 0.00018623378138969874, + "loss": 0.8799, + "step": 10531 + }, + { + "epoch": 0.27043222744860146, + "grad_norm": 0.83984375, + "learning_rate": 0.0001862315209294286, + "loss": 0.9938, + "step": 10532 + }, + { + "epoch": 0.2704579046445233, + "grad_norm": 0.8671875, + "learning_rate": 0.0001862292602973069, + "loss": 1.0843, + "step": 10533 + }, + { + "epoch": 0.27048358184044513, + "grad_norm": 0.81640625, + "learning_rate": 0.000186226999493338, + "loss": 0.9494, + "step": 10534 + }, + { + "epoch": 0.2705092590363669, + "grad_norm": 0.87890625, + "learning_rate": 0.00018622473851752656, + "loss": 1.0061, + "step": 10535 + }, + { + "epoch": 0.27053493623228875, + "grad_norm": 0.83203125, + "learning_rate": 0.00018622247736987694, + "loss": 0.9579, + "step": 10536 + }, + { + "epoch": 0.2705606134282106, + "grad_norm": 0.97265625, + "learning_rate": 0.00018622021605039374, + "loss": 0.9403, + "step": 10537 + }, + { + "epoch": 0.27058629062413236, + "grad_norm": 0.74609375, + "learning_rate": 0.00018621795455908143, + "loss": 0.9371, + "step": 10538 + }, + { + "epoch": 0.2706119678200542, + "grad_norm": 0.75390625, + "learning_rate": 0.00018621569289594452, + "loss": 0.9616, + "step": 10539 + }, + { + "epoch": 0.27063764501597604, + "grad_norm": 0.78125, + "learning_rate": 0.00018621343106098754, + "loss": 1.0926, + "step": 10540 + }, + { + "epoch": 0.27066332221189787, + "grad_norm": 0.80859375, + "learning_rate": 0.00018621116905421497, + "loss": 0.9467, + "step": 10541 + }, + { + "epoch": 0.27068899940781965, + "grad_norm": 0.765625, + "learning_rate": 0.00018620890687563133, + "loss": 0.9836, + "step": 10542 + }, + { + "epoch": 0.2707146766037415, + "grad_norm": 0.89453125, + "learning_rate": 0.0001862066445252411, + "loss": 0.9289, + "step": 10543 + }, + { + "epoch": 0.2707403537996633, + "grad_norm": 0.8515625, + "learning_rate": 0.00018620438200304883, + "loss": 0.9259, + "step": 10544 + }, + { + "epoch": 0.2707660309955851, + "grad_norm": 0.87109375, + "learning_rate": 0.00018620211930905903, + "loss": 0.9719, + "step": 10545 + }, + { + "epoch": 0.27079170819150694, + "grad_norm": 0.859375, + "learning_rate": 0.00018619985644327617, + "loss": 1.1333, + "step": 10546 + }, + { + "epoch": 0.2708173853874288, + "grad_norm": 0.77734375, + "learning_rate": 0.0001861975934057048, + "loss": 0.8516, + "step": 10547 + }, + { + "epoch": 0.27084306258335056, + "grad_norm": 0.76953125, + "learning_rate": 0.0001861953301963494, + "loss": 0.8866, + "step": 10548 + }, + { + "epoch": 0.2708687397792724, + "grad_norm": 0.7578125, + "learning_rate": 0.00018619306681521452, + "loss": 1.069, + "step": 10549 + }, + { + "epoch": 0.27089441697519423, + "grad_norm": 0.76953125, + "learning_rate": 0.00018619080326230463, + "loss": 0.912, + "step": 10550 + }, + { + "epoch": 0.27092009417111607, + "grad_norm": 0.93359375, + "learning_rate": 0.00018618853953762424, + "loss": 1.0061, + "step": 10551 + }, + { + "epoch": 0.27094577136703785, + "grad_norm": 0.82421875, + "learning_rate": 0.00018618627564117787, + "loss": 1.035, + "step": 10552 + }, + { + "epoch": 0.2709714485629597, + "grad_norm": 0.8359375, + "learning_rate": 0.00018618401157297006, + "loss": 1.0552, + "step": 10553 + }, + { + "epoch": 0.2709971257588815, + "grad_norm": 0.8359375, + "learning_rate": 0.0001861817473330053, + "loss": 1.0874, + "step": 10554 + }, + { + "epoch": 0.2710228029548033, + "grad_norm": 0.734375, + "learning_rate": 0.00018617948292128808, + "loss": 0.9146, + "step": 10555 + }, + { + "epoch": 0.27104848015072514, + "grad_norm": 0.84765625, + "learning_rate": 0.000186177218337823, + "loss": 0.9943, + "step": 10556 + }, + { + "epoch": 0.27107415734664697, + "grad_norm": 0.85546875, + "learning_rate": 0.0001861749535826144, + "loss": 0.9925, + "step": 10557 + }, + { + "epoch": 0.27109983454256875, + "grad_norm": 0.80859375, + "learning_rate": 0.00018617268865566696, + "loss": 1.1009, + "step": 10558 + }, + { + "epoch": 0.2711255117384906, + "grad_norm": 0.7734375, + "learning_rate": 0.00018617042355698515, + "loss": 0.8938, + "step": 10559 + }, + { + "epoch": 0.2711511889344124, + "grad_norm": 0.84765625, + "learning_rate": 0.00018616815828657345, + "loss": 1.1855, + "step": 10560 + }, + { + "epoch": 0.27117686613033426, + "grad_norm": 0.87890625, + "learning_rate": 0.0001861658928444364, + "loss": 1.0285, + "step": 10561 + }, + { + "epoch": 0.27120254332625604, + "grad_norm": 0.8046875, + "learning_rate": 0.0001861636272305785, + "loss": 0.9566, + "step": 10562 + }, + { + "epoch": 0.2712282205221779, + "grad_norm": 0.83203125, + "learning_rate": 0.00018616136144500428, + "loss": 1.0528, + "step": 10563 + }, + { + "epoch": 0.2712538977180997, + "grad_norm": 0.8203125, + "learning_rate": 0.00018615909548771827, + "loss": 1.0271, + "step": 10564 + }, + { + "epoch": 0.2712795749140215, + "grad_norm": 0.73046875, + "learning_rate": 0.00018615682935872494, + "loss": 0.954, + "step": 10565 + }, + { + "epoch": 0.27130525210994333, + "grad_norm": 0.8984375, + "learning_rate": 0.00018615456305802884, + "loss": 1.02, + "step": 10566 + }, + { + "epoch": 0.27133092930586517, + "grad_norm": 0.80078125, + "learning_rate": 0.00018615229658563447, + "loss": 0.865, + "step": 10567 + }, + { + "epoch": 0.27135660650178695, + "grad_norm": 0.80078125, + "learning_rate": 0.00018615002994154637, + "loss": 0.9366, + "step": 10568 + }, + { + "epoch": 0.2713822836977088, + "grad_norm": 0.86328125, + "learning_rate": 0.00018614776312576902, + "loss": 1.0229, + "step": 10569 + }, + { + "epoch": 0.2714079608936306, + "grad_norm": 0.7578125, + "learning_rate": 0.00018614549613830698, + "loss": 0.9426, + "step": 10570 + }, + { + "epoch": 0.27143363808955245, + "grad_norm": 0.86328125, + "learning_rate": 0.00018614322897916475, + "loss": 1.0366, + "step": 10571 + }, + { + "epoch": 0.27145931528547423, + "grad_norm": 0.796875, + "learning_rate": 0.0001861409616483468, + "loss": 0.9903, + "step": 10572 + }, + { + "epoch": 0.27148499248139607, + "grad_norm": 0.875, + "learning_rate": 0.00018613869414585774, + "loss": 1.0814, + "step": 10573 + }, + { + "epoch": 0.2715106696773179, + "grad_norm": 0.79296875, + "learning_rate": 0.00018613642647170205, + "loss": 1.1841, + "step": 10574 + }, + { + "epoch": 0.2715363468732397, + "grad_norm": 0.8203125, + "learning_rate": 0.00018613415862588423, + "loss": 0.9165, + "step": 10575 + }, + { + "epoch": 0.2715620240691615, + "grad_norm": 0.77734375, + "learning_rate": 0.00018613189060840882, + "loss": 1.015, + "step": 10576 + }, + { + "epoch": 0.27158770126508336, + "grad_norm": 0.9609375, + "learning_rate": 0.00018612962241928028, + "loss": 0.9397, + "step": 10577 + }, + { + "epoch": 0.27161337846100514, + "grad_norm": 1.03125, + "learning_rate": 0.00018612735405850326, + "loss": 1.0377, + "step": 10578 + }, + { + "epoch": 0.271639055656927, + "grad_norm": 0.81640625, + "learning_rate": 0.00018612508552608213, + "loss": 0.9611, + "step": 10579 + }, + { + "epoch": 0.2716647328528488, + "grad_norm": 0.8046875, + "learning_rate": 0.00018612281682202155, + "loss": 1.0037, + "step": 10580 + }, + { + "epoch": 0.27169041004877065, + "grad_norm": 0.81640625, + "learning_rate": 0.00018612054794632594, + "loss": 0.8976, + "step": 10581 + }, + { + "epoch": 0.27171608724469243, + "grad_norm": 0.7109375, + "learning_rate": 0.00018611827889899985, + "loss": 0.9708, + "step": 10582 + }, + { + "epoch": 0.27174176444061426, + "grad_norm": 0.7578125, + "learning_rate": 0.00018611600968004784, + "loss": 0.9257, + "step": 10583 + }, + { + "epoch": 0.2717674416365361, + "grad_norm": 0.82421875, + "learning_rate": 0.0001861137402894744, + "loss": 1.063, + "step": 10584 + }, + { + "epoch": 0.2717931188324579, + "grad_norm": 0.7890625, + "learning_rate": 0.000186111470727284, + "loss": 0.9969, + "step": 10585 + }, + { + "epoch": 0.2718187960283797, + "grad_norm": 0.8671875, + "learning_rate": 0.00018610920099348127, + "loss": 1.0325, + "step": 10586 + }, + { + "epoch": 0.27184447322430155, + "grad_norm": 0.75390625, + "learning_rate": 0.00018610693108807067, + "loss": 0.9848, + "step": 10587 + }, + { + "epoch": 0.27187015042022333, + "grad_norm": 0.828125, + "learning_rate": 0.00018610466101105674, + "loss": 0.9896, + "step": 10588 + }, + { + "epoch": 0.27189582761614517, + "grad_norm": 0.7734375, + "learning_rate": 0.000186102390762444, + "loss": 1.0338, + "step": 10589 + }, + { + "epoch": 0.271921504812067, + "grad_norm": 1.3046875, + "learning_rate": 0.00018610012034223696, + "loss": 0.9677, + "step": 10590 + }, + { + "epoch": 0.27194718200798884, + "grad_norm": 0.73828125, + "learning_rate": 0.00018609784975044016, + "loss": 0.9004, + "step": 10591 + }, + { + "epoch": 0.2719728592039106, + "grad_norm": 0.84375, + "learning_rate": 0.0001860955789870581, + "loss": 1.025, + "step": 10592 + }, + { + "epoch": 0.27199853639983246, + "grad_norm": 0.80078125, + "learning_rate": 0.00018609330805209538, + "loss": 0.9913, + "step": 10593 + }, + { + "epoch": 0.2720242135957543, + "grad_norm": 0.8515625, + "learning_rate": 0.00018609103694555644, + "loss": 1.0029, + "step": 10594 + }, + { + "epoch": 0.2720498907916761, + "grad_norm": 0.7890625, + "learning_rate": 0.00018608876566744584, + "loss": 0.9708, + "step": 10595 + }, + { + "epoch": 0.2720755679875979, + "grad_norm": 0.9765625, + "learning_rate": 0.0001860864942177681, + "loss": 0.9865, + "step": 10596 + }, + { + "epoch": 0.27210124518351975, + "grad_norm": 0.8359375, + "learning_rate": 0.00018608422259652776, + "loss": 1.0031, + "step": 10597 + }, + { + "epoch": 0.2721269223794415, + "grad_norm": 0.72265625, + "learning_rate": 0.00018608195080372934, + "loss": 1.0041, + "step": 10598 + }, + { + "epoch": 0.27215259957536336, + "grad_norm": 0.7265625, + "learning_rate": 0.00018607967883937742, + "loss": 1.1264, + "step": 10599 + }, + { + "epoch": 0.2721782767712852, + "grad_norm": 0.8359375, + "learning_rate": 0.00018607740670347639, + "loss": 1.0963, + "step": 10600 + }, + { + "epoch": 0.27220395396720704, + "grad_norm": 0.77734375, + "learning_rate": 0.0001860751343960309, + "loss": 0.949, + "step": 10601 + }, + { + "epoch": 0.2722296311631288, + "grad_norm": 0.796875, + "learning_rate": 0.00018607286191704546, + "loss": 0.9181, + "step": 10602 + }, + { + "epoch": 0.27225530835905065, + "grad_norm": 0.8359375, + "learning_rate": 0.00018607058926652456, + "loss": 1.1231, + "step": 10603 + }, + { + "epoch": 0.2722809855549725, + "grad_norm": 0.87890625, + "learning_rate": 0.00018606831644447277, + "loss": 0.9262, + "step": 10604 + }, + { + "epoch": 0.27230666275089427, + "grad_norm": 0.79296875, + "learning_rate": 0.00018606604345089455, + "loss": 1.0687, + "step": 10605 + }, + { + "epoch": 0.2723323399468161, + "grad_norm": 0.79296875, + "learning_rate": 0.00018606377028579452, + "loss": 1.0452, + "step": 10606 + }, + { + "epoch": 0.27235801714273794, + "grad_norm": 0.81640625, + "learning_rate": 0.00018606149694917718, + "loss": 0.8843, + "step": 10607 + }, + { + "epoch": 0.2723836943386597, + "grad_norm": 0.91015625, + "learning_rate": 0.00018605922344104703, + "loss": 1.1044, + "step": 10608 + }, + { + "epoch": 0.27240937153458156, + "grad_norm": 0.7578125, + "learning_rate": 0.00018605694976140863, + "loss": 0.8817, + "step": 10609 + }, + { + "epoch": 0.2724350487305034, + "grad_norm": 0.82421875, + "learning_rate": 0.0001860546759102665, + "loss": 1.0073, + "step": 10610 + }, + { + "epoch": 0.27246072592642523, + "grad_norm": 0.7578125, + "learning_rate": 0.00018605240188762515, + "loss": 1.0127, + "step": 10611 + }, + { + "epoch": 0.272486403122347, + "grad_norm": 0.70703125, + "learning_rate": 0.00018605012769348916, + "loss": 0.924, + "step": 10612 + }, + { + "epoch": 0.27251208031826885, + "grad_norm": 0.80859375, + "learning_rate": 0.00018604785332786307, + "loss": 0.8813, + "step": 10613 + }, + { + "epoch": 0.2725377575141907, + "grad_norm": 0.84765625, + "learning_rate": 0.00018604557879075135, + "loss": 0.8848, + "step": 10614 + }, + { + "epoch": 0.27256343471011246, + "grad_norm": 0.83203125, + "learning_rate": 0.00018604330408215856, + "loss": 1.1731, + "step": 10615 + }, + { + "epoch": 0.2725891119060343, + "grad_norm": 0.8203125, + "learning_rate": 0.00018604102920208922, + "loss": 1.1229, + "step": 10616 + }, + { + "epoch": 0.27261478910195613, + "grad_norm": 0.75, + "learning_rate": 0.0001860387541505479, + "loss": 0.8663, + "step": 10617 + }, + { + "epoch": 0.2726404662978779, + "grad_norm": 0.7890625, + "learning_rate": 0.00018603647892753914, + "loss": 1.0195, + "step": 10618 + }, + { + "epoch": 0.27266614349379975, + "grad_norm": 0.87109375, + "learning_rate": 0.00018603420353306742, + "loss": 1.052, + "step": 10619 + }, + { + "epoch": 0.2726918206897216, + "grad_norm": 0.859375, + "learning_rate": 0.00018603192796713732, + "loss": 1.0329, + "step": 10620 + }, + { + "epoch": 0.2727174978856434, + "grad_norm": 0.79296875, + "learning_rate": 0.00018602965222975332, + "loss": 0.9322, + "step": 10621 + }, + { + "epoch": 0.2727431750815652, + "grad_norm": 0.80859375, + "learning_rate": 0.00018602737632092002, + "loss": 0.8767, + "step": 10622 + }, + { + "epoch": 0.27276885227748704, + "grad_norm": 0.7890625, + "learning_rate": 0.00018602510024064194, + "loss": 0.9225, + "step": 10623 + }, + { + "epoch": 0.2727945294734089, + "grad_norm": 0.9375, + "learning_rate": 0.00018602282398892358, + "loss": 1.0584, + "step": 10624 + }, + { + "epoch": 0.27282020666933066, + "grad_norm": 0.75, + "learning_rate": 0.00018602054756576954, + "loss": 0.9465, + "step": 10625 + }, + { + "epoch": 0.2728458838652525, + "grad_norm": 0.7578125, + "learning_rate": 0.00018601827097118433, + "loss": 1.0084, + "step": 10626 + }, + { + "epoch": 0.27287156106117433, + "grad_norm": 0.80078125, + "learning_rate": 0.0001860159942051724, + "loss": 0.9536, + "step": 10627 + }, + { + "epoch": 0.2728972382570961, + "grad_norm": 0.8671875, + "learning_rate": 0.00018601371726773843, + "loss": 1.041, + "step": 10628 + }, + { + "epoch": 0.27292291545301794, + "grad_norm": 0.83203125, + "learning_rate": 0.00018601144015888688, + "loss": 1.1647, + "step": 10629 + }, + { + "epoch": 0.2729485926489398, + "grad_norm": 0.75390625, + "learning_rate": 0.00018600916287862227, + "loss": 1.0395, + "step": 10630 + }, + { + "epoch": 0.2729742698448616, + "grad_norm": 0.8046875, + "learning_rate": 0.00018600688542694916, + "loss": 0.9122, + "step": 10631 + }, + { + "epoch": 0.2729999470407834, + "grad_norm": 0.8359375, + "learning_rate": 0.00018600460780387215, + "loss": 1.0317, + "step": 10632 + }, + { + "epoch": 0.27302562423670523, + "grad_norm": 0.82421875, + "learning_rate": 0.00018600233000939568, + "loss": 0.9856, + "step": 10633 + }, + { + "epoch": 0.27305130143262707, + "grad_norm": 0.87109375, + "learning_rate": 0.00018600005204352438, + "loss": 0.9879, + "step": 10634 + }, + { + "epoch": 0.27307697862854885, + "grad_norm": 0.76171875, + "learning_rate": 0.0001859977739062627, + "loss": 1.1017, + "step": 10635 + }, + { + "epoch": 0.2731026558244707, + "grad_norm": 0.796875, + "learning_rate": 0.00018599549559761527, + "loss": 0.8845, + "step": 10636 + }, + { + "epoch": 0.2731283330203925, + "grad_norm": 0.84375, + "learning_rate": 0.00018599321711758652, + "loss": 0.9312, + "step": 10637 + }, + { + "epoch": 0.2731540102163143, + "grad_norm": 0.75, + "learning_rate": 0.0001859909384661811, + "loss": 1.0307, + "step": 10638 + }, + { + "epoch": 0.27317968741223614, + "grad_norm": 0.796875, + "learning_rate": 0.0001859886596434035, + "loss": 0.9418, + "step": 10639 + }, + { + "epoch": 0.273205364608158, + "grad_norm": 0.80078125, + "learning_rate": 0.00018598638064925828, + "loss": 1.0632, + "step": 10640 + }, + { + "epoch": 0.2732310418040798, + "grad_norm": 0.8984375, + "learning_rate": 0.00018598410148374997, + "loss": 1.0369, + "step": 10641 + }, + { + "epoch": 0.2732567190000016, + "grad_norm": 0.8125, + "learning_rate": 0.0001859818221468831, + "loss": 0.9858, + "step": 10642 + }, + { + "epoch": 0.2732823961959234, + "grad_norm": 0.77734375, + "learning_rate": 0.0001859795426386622, + "loss": 0.9313, + "step": 10643 + }, + { + "epoch": 0.27330807339184526, + "grad_norm": 0.76953125, + "learning_rate": 0.00018597726295909188, + "loss": 0.8792, + "step": 10644 + }, + { + "epoch": 0.27333375058776704, + "grad_norm": 0.89453125, + "learning_rate": 0.00018597498310817662, + "loss": 0.9844, + "step": 10645 + }, + { + "epoch": 0.2733594277836889, + "grad_norm": 0.7890625, + "learning_rate": 0.000185972703085921, + "loss": 0.9797, + "step": 10646 + }, + { + "epoch": 0.2733851049796107, + "grad_norm": 0.7734375, + "learning_rate": 0.00018597042289232954, + "loss": 0.8343, + "step": 10647 + }, + { + "epoch": 0.2734107821755325, + "grad_norm": 0.76953125, + "learning_rate": 0.00018596814252740677, + "loss": 0.9364, + "step": 10648 + }, + { + "epoch": 0.27343645937145433, + "grad_norm": 0.765625, + "learning_rate": 0.00018596586199115728, + "loss": 0.9786, + "step": 10649 + }, + { + "epoch": 0.27346213656737617, + "grad_norm": 0.79296875, + "learning_rate": 0.0001859635812835856, + "loss": 1.0024, + "step": 10650 + }, + { + "epoch": 0.273487813763298, + "grad_norm": 1.109375, + "learning_rate": 0.00018596130040469627, + "loss": 0.9724, + "step": 10651 + }, + { + "epoch": 0.2735134909592198, + "grad_norm": 0.80859375, + "learning_rate": 0.00018595901935449384, + "loss": 1.1362, + "step": 10652 + }, + { + "epoch": 0.2735391681551416, + "grad_norm": 0.8203125, + "learning_rate": 0.0001859567381329828, + "loss": 0.9428, + "step": 10653 + }, + { + "epoch": 0.27356484535106346, + "grad_norm": 0.765625, + "learning_rate": 0.00018595445674016777, + "loss": 0.9739, + "step": 10654 + }, + { + "epoch": 0.27359052254698524, + "grad_norm": 0.78125, + "learning_rate": 0.0001859521751760533, + "loss": 0.9689, + "step": 10655 + }, + { + "epoch": 0.2736161997429071, + "grad_norm": 0.88671875, + "learning_rate": 0.00018594989344064387, + "loss": 1.085, + "step": 10656 + }, + { + "epoch": 0.2736418769388289, + "grad_norm": 0.953125, + "learning_rate": 0.00018594761153394412, + "loss": 0.9934, + "step": 10657 + }, + { + "epoch": 0.2736675541347507, + "grad_norm": 0.83984375, + "learning_rate": 0.0001859453294559585, + "loss": 0.9974, + "step": 10658 + }, + { + "epoch": 0.2736932313306725, + "grad_norm": 0.828125, + "learning_rate": 0.0001859430472066916, + "loss": 1.1187, + "step": 10659 + }, + { + "epoch": 0.27371890852659436, + "grad_norm": 0.83984375, + "learning_rate": 0.00018594076478614796, + "loss": 0.9918, + "step": 10660 + }, + { + "epoch": 0.2737445857225162, + "grad_norm": 0.82421875, + "learning_rate": 0.00018593848219433217, + "loss": 1.0928, + "step": 10661 + }, + { + "epoch": 0.273770262918438, + "grad_norm": 0.875, + "learning_rate": 0.00018593619943124874, + "loss": 0.9062, + "step": 10662 + }, + { + "epoch": 0.2737959401143598, + "grad_norm": 0.88671875, + "learning_rate": 0.00018593391649690224, + "loss": 1.0032, + "step": 10663 + }, + { + "epoch": 0.27382161731028165, + "grad_norm": 0.796875, + "learning_rate": 0.00018593163339129717, + "loss": 0.9991, + "step": 10664 + }, + { + "epoch": 0.27384729450620343, + "grad_norm": 0.78515625, + "learning_rate": 0.00018592935011443816, + "loss": 0.9397, + "step": 10665 + }, + { + "epoch": 0.27387297170212527, + "grad_norm": 0.97265625, + "learning_rate": 0.00018592706666632967, + "loss": 1.115, + "step": 10666 + }, + { + "epoch": 0.2738986488980471, + "grad_norm": 0.7890625, + "learning_rate": 0.00018592478304697632, + "loss": 0.95, + "step": 10667 + }, + { + "epoch": 0.2739243260939689, + "grad_norm": 0.83984375, + "learning_rate": 0.00018592249925638264, + "loss": 0.9511, + "step": 10668 + }, + { + "epoch": 0.2739500032898907, + "grad_norm": 0.85546875, + "learning_rate": 0.00018592021529455322, + "loss": 0.932, + "step": 10669 + }, + { + "epoch": 0.27397568048581256, + "grad_norm": 0.875, + "learning_rate": 0.00018591793116149252, + "loss": 1.1406, + "step": 10670 + }, + { + "epoch": 0.2740013576817344, + "grad_norm": 0.765625, + "learning_rate": 0.00018591564685720515, + "loss": 0.9678, + "step": 10671 + }, + { + "epoch": 0.2740270348776562, + "grad_norm": 0.859375, + "learning_rate": 0.00018591336238169565, + "loss": 1.1147, + "step": 10672 + }, + { + "epoch": 0.274052712073578, + "grad_norm": 1.046875, + "learning_rate": 0.0001859110777349686, + "loss": 1.0193, + "step": 10673 + }, + { + "epoch": 0.27407838926949984, + "grad_norm": 0.73046875, + "learning_rate": 0.00018590879291702854, + "loss": 0.8972, + "step": 10674 + }, + { + "epoch": 0.2741040664654216, + "grad_norm": 0.83984375, + "learning_rate": 0.00018590650792788, + "loss": 1.0956, + "step": 10675 + }, + { + "epoch": 0.27412974366134346, + "grad_norm": 0.78125, + "learning_rate": 0.00018590422276752752, + "loss": 0.9719, + "step": 10676 + }, + { + "epoch": 0.2741554208572653, + "grad_norm": 0.953125, + "learning_rate": 0.00018590193743597575, + "loss": 0.9537, + "step": 10677 + }, + { + "epoch": 0.2741810980531871, + "grad_norm": 0.7578125, + "learning_rate": 0.00018589965193322914, + "loss": 1.0937, + "step": 10678 + }, + { + "epoch": 0.2742067752491089, + "grad_norm": 0.8359375, + "learning_rate": 0.00018589736625929226, + "loss": 0.9696, + "step": 10679 + }, + { + "epoch": 0.27423245244503075, + "grad_norm": 0.80859375, + "learning_rate": 0.00018589508041416973, + "loss": 1.0213, + "step": 10680 + }, + { + "epoch": 0.2742581296409526, + "grad_norm": 0.78125, + "learning_rate": 0.00018589279439786607, + "loss": 0.956, + "step": 10681 + }, + { + "epoch": 0.27428380683687437, + "grad_norm": 0.8515625, + "learning_rate": 0.0001858905082103858, + "loss": 0.9559, + "step": 10682 + }, + { + "epoch": 0.2743094840327962, + "grad_norm": 0.8671875, + "learning_rate": 0.00018588822185173348, + "loss": 0.8642, + "step": 10683 + }, + { + "epoch": 0.27433516122871804, + "grad_norm": 0.9296875, + "learning_rate": 0.00018588593532191375, + "loss": 1.0092, + "step": 10684 + }, + { + "epoch": 0.2743608384246398, + "grad_norm": 0.734375, + "learning_rate": 0.00018588364862093108, + "loss": 0.9441, + "step": 10685 + }, + { + "epoch": 0.27438651562056166, + "grad_norm": 0.80078125, + "learning_rate": 0.00018588136174879004, + "loss": 1.0249, + "step": 10686 + }, + { + "epoch": 0.2744121928164835, + "grad_norm": 0.81640625, + "learning_rate": 0.00018587907470549525, + "loss": 0.956, + "step": 10687 + }, + { + "epoch": 0.27443787001240527, + "grad_norm": 0.82421875, + "learning_rate": 0.0001858767874910512, + "loss": 0.9365, + "step": 10688 + }, + { + "epoch": 0.2744635472083271, + "grad_norm": 1.109375, + "learning_rate": 0.00018587450010546246, + "loss": 0.9945, + "step": 10689 + }, + { + "epoch": 0.27448922440424894, + "grad_norm": 0.98828125, + "learning_rate": 0.0001858722125487336, + "loss": 1.0871, + "step": 10690 + }, + { + "epoch": 0.2745149016001708, + "grad_norm": 0.80078125, + "learning_rate": 0.00018586992482086918, + "loss": 0.9897, + "step": 10691 + }, + { + "epoch": 0.27454057879609256, + "grad_norm": 0.82421875, + "learning_rate": 0.00018586763692187374, + "loss": 1.0308, + "step": 10692 + }, + { + "epoch": 0.2745662559920144, + "grad_norm": 0.8515625, + "learning_rate": 0.00018586534885175189, + "loss": 1.2169, + "step": 10693 + }, + { + "epoch": 0.27459193318793623, + "grad_norm": 0.859375, + "learning_rate": 0.00018586306061050813, + "loss": 0.9477, + "step": 10694 + }, + { + "epoch": 0.274617610383858, + "grad_norm": 0.8203125, + "learning_rate": 0.00018586077219814707, + "loss": 0.9652, + "step": 10695 + }, + { + "epoch": 0.27464328757977985, + "grad_norm": 0.83203125, + "learning_rate": 0.00018585848361467322, + "loss": 1.0439, + "step": 10696 + }, + { + "epoch": 0.2746689647757017, + "grad_norm": 0.8515625, + "learning_rate": 0.00018585619486009117, + "loss": 1.007, + "step": 10697 + }, + { + "epoch": 0.27469464197162347, + "grad_norm": 0.83203125, + "learning_rate": 0.0001858539059344055, + "loss": 0.965, + "step": 10698 + }, + { + "epoch": 0.2747203191675453, + "grad_norm": 0.85546875, + "learning_rate": 0.00018585161683762076, + "loss": 0.9832, + "step": 10699 + }, + { + "epoch": 0.27474599636346714, + "grad_norm": 0.8359375, + "learning_rate": 0.00018584932756974146, + "loss": 1.0119, + "step": 10700 + }, + { + "epoch": 0.274771673559389, + "grad_norm": 0.8203125, + "learning_rate": 0.00018584703813077225, + "loss": 1.1297, + "step": 10701 + }, + { + "epoch": 0.27479735075531075, + "grad_norm": 0.7890625, + "learning_rate": 0.0001858447485207176, + "loss": 0.8678, + "step": 10702 + }, + { + "epoch": 0.2748230279512326, + "grad_norm": 0.88671875, + "learning_rate": 0.00018584245873958216, + "loss": 1.0569, + "step": 10703 + }, + { + "epoch": 0.2748487051471544, + "grad_norm": 0.9296875, + "learning_rate": 0.00018584016878737042, + "loss": 1.0398, + "step": 10704 + }, + { + "epoch": 0.2748743823430762, + "grad_norm": 0.8515625, + "learning_rate": 0.000185837878664087, + "loss": 0.8975, + "step": 10705 + }, + { + "epoch": 0.27490005953899804, + "grad_norm": 0.8046875, + "learning_rate": 0.00018583558836973645, + "loss": 0.9091, + "step": 10706 + }, + { + "epoch": 0.2749257367349199, + "grad_norm": 0.8984375, + "learning_rate": 0.0001858332979043233, + "loss": 0.8707, + "step": 10707 + }, + { + "epoch": 0.27495141393084166, + "grad_norm": 0.71875, + "learning_rate": 0.00018583100726785216, + "loss": 0.8482, + "step": 10708 + }, + { + "epoch": 0.2749770911267635, + "grad_norm": 0.80078125, + "learning_rate": 0.00018582871646032759, + "loss": 1.0397, + "step": 10709 + }, + { + "epoch": 0.27500276832268533, + "grad_norm": 0.734375, + "learning_rate": 0.0001858264254817541, + "loss": 1.0126, + "step": 10710 + }, + { + "epoch": 0.27502844551860717, + "grad_norm": 0.93359375, + "learning_rate": 0.00018582413433213633, + "loss": 0.9044, + "step": 10711 + }, + { + "epoch": 0.27505412271452895, + "grad_norm": 0.82421875, + "learning_rate": 0.0001858218430114788, + "loss": 0.9375, + "step": 10712 + }, + { + "epoch": 0.2750797999104508, + "grad_norm": 0.88671875, + "learning_rate": 0.00018581955151978608, + "loss": 0.9644, + "step": 10713 + }, + { + "epoch": 0.2751054771063726, + "grad_norm": 0.8515625, + "learning_rate": 0.00018581725985706275, + "loss": 0.943, + "step": 10714 + }, + { + "epoch": 0.2751311543022944, + "grad_norm": 0.7734375, + "learning_rate": 0.00018581496802331338, + "loss": 1.0584, + "step": 10715 + }, + { + "epoch": 0.27515683149821624, + "grad_norm": 0.84375, + "learning_rate": 0.00018581267601854254, + "loss": 1.0293, + "step": 10716 + }, + { + "epoch": 0.2751825086941381, + "grad_norm": 0.8125, + "learning_rate": 0.00018581038384275477, + "loss": 1.0487, + "step": 10717 + }, + { + "epoch": 0.27520818589005985, + "grad_norm": 0.8828125, + "learning_rate": 0.00018580809149595467, + "loss": 0.9949, + "step": 10718 + }, + { + "epoch": 0.2752338630859817, + "grad_norm": 0.80859375, + "learning_rate": 0.00018580579897814676, + "loss": 1.0006, + "step": 10719 + }, + { + "epoch": 0.2752595402819035, + "grad_norm": 0.8203125, + "learning_rate": 0.0001858035062893357, + "loss": 1.0099, + "step": 10720 + }, + { + "epoch": 0.27528521747782536, + "grad_norm": 0.73046875, + "learning_rate": 0.00018580121342952596, + "loss": 0.8932, + "step": 10721 + }, + { + "epoch": 0.27531089467374714, + "grad_norm": 0.74609375, + "learning_rate": 0.00018579892039872217, + "loss": 0.9063, + "step": 10722 + }, + { + "epoch": 0.275336571869669, + "grad_norm": 0.78515625, + "learning_rate": 0.00018579662719692887, + "loss": 0.9672, + "step": 10723 + }, + { + "epoch": 0.2753622490655908, + "grad_norm": 0.8125, + "learning_rate": 0.00018579433382415064, + "loss": 0.9411, + "step": 10724 + }, + { + "epoch": 0.2753879262615126, + "grad_norm": 0.86328125, + "learning_rate": 0.00018579204028039209, + "loss": 0.9837, + "step": 10725 + }, + { + "epoch": 0.27541360345743443, + "grad_norm": 0.8515625, + "learning_rate": 0.00018578974656565772, + "loss": 1.0687, + "step": 10726 + }, + { + "epoch": 0.27543928065335627, + "grad_norm": 0.81640625, + "learning_rate": 0.00018578745267995212, + "loss": 1.0017, + "step": 10727 + }, + { + "epoch": 0.27546495784927805, + "grad_norm": 0.8125, + "learning_rate": 0.0001857851586232799, + "loss": 1.0917, + "step": 10728 + }, + { + "epoch": 0.2754906350451999, + "grad_norm": 0.83984375, + "learning_rate": 0.00018578286439564561, + "loss": 0.8999, + "step": 10729 + }, + { + "epoch": 0.2755163122411217, + "grad_norm": 0.84375, + "learning_rate": 0.0001857805699970538, + "loss": 1.1274, + "step": 10730 + }, + { + "epoch": 0.27554198943704356, + "grad_norm": 1.0859375, + "learning_rate": 0.00018577827542750908, + "loss": 0.9754, + "step": 10731 + }, + { + "epoch": 0.27556766663296534, + "grad_norm": 0.96484375, + "learning_rate": 0.000185775980687016, + "loss": 1.113, + "step": 10732 + }, + { + "epoch": 0.27559334382888717, + "grad_norm": 0.78125, + "learning_rate": 0.00018577368577557914, + "loss": 1.0135, + "step": 10733 + }, + { + "epoch": 0.275619021024809, + "grad_norm": 0.828125, + "learning_rate": 0.0001857713906932031, + "loss": 0.9243, + "step": 10734 + }, + { + "epoch": 0.2756446982207308, + "grad_norm": 0.71875, + "learning_rate": 0.0001857690954398924, + "loss": 1.0522, + "step": 10735 + }, + { + "epoch": 0.2756703754166526, + "grad_norm": 0.76171875, + "learning_rate": 0.00018576680001565162, + "loss": 0.8699, + "step": 10736 + }, + { + "epoch": 0.27569605261257446, + "grad_norm": 0.78125, + "learning_rate": 0.00018576450442048538, + "loss": 0.926, + "step": 10737 + }, + { + "epoch": 0.27572172980849624, + "grad_norm": 0.875, + "learning_rate": 0.00018576220865439822, + "loss": 0.9096, + "step": 10738 + }, + { + "epoch": 0.2757474070044181, + "grad_norm": 0.78125, + "learning_rate": 0.0001857599127173947, + "loss": 1.0549, + "step": 10739 + }, + { + "epoch": 0.2757730842003399, + "grad_norm": 0.82421875, + "learning_rate": 0.00018575761660947948, + "loss": 0.9338, + "step": 10740 + }, + { + "epoch": 0.2757987613962617, + "grad_norm": 1.1171875, + "learning_rate": 0.00018575532033065703, + "loss": 1.0175, + "step": 10741 + }, + { + "epoch": 0.27582443859218353, + "grad_norm": 0.828125, + "learning_rate": 0.00018575302388093202, + "loss": 0.9873, + "step": 10742 + }, + { + "epoch": 0.27585011578810537, + "grad_norm": 0.82421875, + "learning_rate": 0.0001857507272603089, + "loss": 1.0589, + "step": 10743 + }, + { + "epoch": 0.2758757929840272, + "grad_norm": 0.87109375, + "learning_rate": 0.00018574843046879238, + "loss": 0.973, + "step": 10744 + }, + { + "epoch": 0.275901470179949, + "grad_norm": 0.79296875, + "learning_rate": 0.000185746133506387, + "loss": 0.9433, + "step": 10745 + }, + { + "epoch": 0.2759271473758708, + "grad_norm": 0.73828125, + "learning_rate": 0.00018574383637309729, + "loss": 0.9033, + "step": 10746 + }, + { + "epoch": 0.27595282457179265, + "grad_norm": 0.87109375, + "learning_rate": 0.00018574153906892788, + "loss": 1.0878, + "step": 10747 + }, + { + "epoch": 0.27597850176771443, + "grad_norm": 0.87109375, + "learning_rate": 0.0001857392415938833, + "loss": 1.0594, + "step": 10748 + }, + { + "epoch": 0.27600417896363627, + "grad_norm": 0.7578125, + "learning_rate": 0.0001857369439479682, + "loss": 0.9807, + "step": 10749 + }, + { + "epoch": 0.2760298561595581, + "grad_norm": 0.796875, + "learning_rate": 0.00018573464613118708, + "loss": 1.0027, + "step": 10750 + }, + { + "epoch": 0.2760555333554799, + "grad_norm": 0.84375, + "learning_rate": 0.00018573234814354454, + "loss": 1.0376, + "step": 10751 + }, + { + "epoch": 0.2760812105514017, + "grad_norm": 0.8203125, + "learning_rate": 0.0001857300499850452, + "loss": 1.0475, + "step": 10752 + }, + { + "epoch": 0.27610688774732356, + "grad_norm": 0.875, + "learning_rate": 0.00018572775165569364, + "loss": 1.107, + "step": 10753 + }, + { + "epoch": 0.2761325649432454, + "grad_norm": 0.80859375, + "learning_rate": 0.00018572545315549433, + "loss": 1.0404, + "step": 10754 + }, + { + "epoch": 0.2761582421391672, + "grad_norm": 0.7890625, + "learning_rate": 0.000185723154484452, + "loss": 0.9427, + "step": 10755 + }, + { + "epoch": 0.276183919335089, + "grad_norm": 0.84375, + "learning_rate": 0.00018572085564257118, + "loss": 0.9711, + "step": 10756 + }, + { + "epoch": 0.27620959653101085, + "grad_norm": 0.8671875, + "learning_rate": 0.0001857185566298564, + "loss": 1.1009, + "step": 10757 + }, + { + "epoch": 0.27623527372693263, + "grad_norm": 0.875, + "learning_rate": 0.0001857162574463123, + "loss": 1.091, + "step": 10758 + }, + { + "epoch": 0.27626095092285446, + "grad_norm": 0.75390625, + "learning_rate": 0.00018571395809194346, + "loss": 0.9804, + "step": 10759 + }, + { + "epoch": 0.2762866281187763, + "grad_norm": 0.859375, + "learning_rate": 0.00018571165856675438, + "loss": 0.905, + "step": 10760 + }, + { + "epoch": 0.2763123053146981, + "grad_norm": 0.8359375, + "learning_rate": 0.0001857093588707498, + "loss": 0.9262, + "step": 10761 + }, + { + "epoch": 0.2763379825106199, + "grad_norm": 0.8828125, + "learning_rate": 0.00018570705900393413, + "loss": 1.1186, + "step": 10762 + }, + { + "epoch": 0.27636365970654175, + "grad_norm": 0.765625, + "learning_rate": 0.00018570475896631205, + "loss": 1.1145, + "step": 10763 + }, + { + "epoch": 0.2763893369024636, + "grad_norm": 0.81640625, + "learning_rate": 0.00018570245875788814, + "loss": 0.9152, + "step": 10764 + }, + { + "epoch": 0.27641501409838537, + "grad_norm": 0.859375, + "learning_rate": 0.00018570015837866697, + "loss": 1.0191, + "step": 10765 + }, + { + "epoch": 0.2764406912943072, + "grad_norm": 0.83984375, + "learning_rate": 0.00018569785782865313, + "loss": 0.9968, + "step": 10766 + }, + { + "epoch": 0.27646636849022904, + "grad_norm": 0.91015625, + "learning_rate": 0.0001856955571078512, + "loss": 1.054, + "step": 10767 + }, + { + "epoch": 0.2764920456861508, + "grad_norm": 2.75, + "learning_rate": 0.00018569325621626575, + "loss": 0.9875, + "step": 10768 + }, + { + "epoch": 0.27651772288207266, + "grad_norm": 0.8359375, + "learning_rate": 0.0001856909551539014, + "loss": 1.0103, + "step": 10769 + }, + { + "epoch": 0.2765434000779945, + "grad_norm": 0.81640625, + "learning_rate": 0.0001856886539207627, + "loss": 1.0049, + "step": 10770 + }, + { + "epoch": 0.2765690772739163, + "grad_norm": 0.87109375, + "learning_rate": 0.00018568635251685426, + "loss": 1.1602, + "step": 10771 + }, + { + "epoch": 0.2765947544698381, + "grad_norm": 0.859375, + "learning_rate": 0.00018568405094218066, + "loss": 0.9811, + "step": 10772 + }, + { + "epoch": 0.27662043166575995, + "grad_norm": 0.828125, + "learning_rate": 0.00018568174919674648, + "loss": 1.0764, + "step": 10773 + }, + { + "epoch": 0.2766461088616818, + "grad_norm": 0.8671875, + "learning_rate": 0.00018567944728055632, + "loss": 1.0228, + "step": 10774 + }, + { + "epoch": 0.27667178605760356, + "grad_norm": 0.75390625, + "learning_rate": 0.00018567714519361476, + "loss": 0.8988, + "step": 10775 + }, + { + "epoch": 0.2766974632535254, + "grad_norm": 0.77734375, + "learning_rate": 0.0001856748429359264, + "loss": 0.9723, + "step": 10776 + }, + { + "epoch": 0.27672314044944724, + "grad_norm": 0.796875, + "learning_rate": 0.0001856725405074958, + "loss": 0.9138, + "step": 10777 + }, + { + "epoch": 0.276748817645369, + "grad_norm": 0.78125, + "learning_rate": 0.0001856702379083276, + "loss": 1.0968, + "step": 10778 + }, + { + "epoch": 0.27677449484129085, + "grad_norm": 0.79296875, + "learning_rate": 0.00018566793513842629, + "loss": 1.0268, + "step": 10779 + }, + { + "epoch": 0.2768001720372127, + "grad_norm": 0.76953125, + "learning_rate": 0.0001856656321977966, + "loss": 0.903, + "step": 10780 + }, + { + "epoch": 0.27682584923313447, + "grad_norm": 0.83984375, + "learning_rate": 0.00018566332908644297, + "loss": 0.9784, + "step": 10781 + }, + { + "epoch": 0.2768515264290563, + "grad_norm": 0.8984375, + "learning_rate": 0.0001856610258043701, + "loss": 1.0529, + "step": 10782 + }, + { + "epoch": 0.27687720362497814, + "grad_norm": 0.8828125, + "learning_rate": 0.00018565872235158252, + "loss": 1.0095, + "step": 10783 + }, + { + "epoch": 0.2769028808209, + "grad_norm": 0.7890625, + "learning_rate": 0.0001856564187280849, + "loss": 0.9797, + "step": 10784 + }, + { + "epoch": 0.27692855801682176, + "grad_norm": 0.7734375, + "learning_rate": 0.00018565411493388172, + "loss": 1.0455, + "step": 10785 + }, + { + "epoch": 0.2769542352127436, + "grad_norm": 0.8203125, + "learning_rate": 0.00018565181096897763, + "loss": 0.9673, + "step": 10786 + }, + { + "epoch": 0.27697991240866543, + "grad_norm": 1.1796875, + "learning_rate": 0.00018564950683337724, + "loss": 1.0129, + "step": 10787 + }, + { + "epoch": 0.2770055896045872, + "grad_norm": 0.765625, + "learning_rate": 0.00018564720252708513, + "loss": 0.7906, + "step": 10788 + }, + { + "epoch": 0.27703126680050905, + "grad_norm": 0.83984375, + "learning_rate": 0.00018564489805010585, + "loss": 1.0294, + "step": 10789 + }, + { + "epoch": 0.2770569439964309, + "grad_norm": 0.81640625, + "learning_rate": 0.00018564259340244404, + "loss": 0.917, + "step": 10790 + }, + { + "epoch": 0.27708262119235266, + "grad_norm": 0.88671875, + "learning_rate": 0.00018564028858410428, + "loss": 1.0261, + "step": 10791 + }, + { + "epoch": 0.2771082983882745, + "grad_norm": 0.765625, + "learning_rate": 0.00018563798359509114, + "loss": 0.8932, + "step": 10792 + }, + { + "epoch": 0.27713397558419633, + "grad_norm": 0.85546875, + "learning_rate": 0.00018563567843540925, + "loss": 1.0075, + "step": 10793 + }, + { + "epoch": 0.27715965278011817, + "grad_norm": 0.8515625, + "learning_rate": 0.0001856333731050632, + "loss": 0.9166, + "step": 10794 + }, + { + "epoch": 0.27718532997603995, + "grad_norm": 1.7890625, + "learning_rate": 0.00018563106760405756, + "loss": 0.8886, + "step": 10795 + }, + { + "epoch": 0.2772110071719618, + "grad_norm": 0.86328125, + "learning_rate": 0.00018562876193239694, + "loss": 1.0612, + "step": 10796 + }, + { + "epoch": 0.2772366843678836, + "grad_norm": 0.8671875, + "learning_rate": 0.00018562645609008593, + "loss": 1.0051, + "step": 10797 + }, + { + "epoch": 0.2772623615638054, + "grad_norm": 1.515625, + "learning_rate": 0.00018562415007712915, + "loss": 1.0929, + "step": 10798 + }, + { + "epoch": 0.27728803875972724, + "grad_norm": 0.77734375, + "learning_rate": 0.0001856218438935311, + "loss": 0.9516, + "step": 10799 + }, + { + "epoch": 0.2773137159556491, + "grad_norm": 0.82421875, + "learning_rate": 0.00018561953753929653, + "loss": 1.0034, + "step": 10800 + }, + { + "epoch": 0.27733939315157086, + "grad_norm": 0.8125, + "learning_rate": 0.00018561723101442992, + "loss": 0.9132, + "step": 10801 + }, + { + "epoch": 0.2773650703474927, + "grad_norm": 0.81640625, + "learning_rate": 0.00018561492431893591, + "loss": 0.9725, + "step": 10802 + }, + { + "epoch": 0.27739074754341453, + "grad_norm": 0.75390625, + "learning_rate": 0.00018561261745281908, + "loss": 1.0254, + "step": 10803 + }, + { + "epoch": 0.27741642473933636, + "grad_norm": 0.8125, + "learning_rate": 0.00018561031041608407, + "loss": 0.9094, + "step": 10804 + }, + { + "epoch": 0.27744210193525815, + "grad_norm": 1.0, + "learning_rate": 0.00018560800320873542, + "loss": 0.8998, + "step": 10805 + }, + { + "epoch": 0.27746777913118, + "grad_norm": 0.74609375, + "learning_rate": 0.00018560569583077775, + "loss": 0.9021, + "step": 10806 + }, + { + "epoch": 0.2774934563271018, + "grad_norm": 0.83984375, + "learning_rate": 0.00018560338828221566, + "loss": 0.9601, + "step": 10807 + }, + { + "epoch": 0.2775191335230236, + "grad_norm": 0.859375, + "learning_rate": 0.00018560108056305373, + "loss": 1.1385, + "step": 10808 + }, + { + "epoch": 0.27754481071894543, + "grad_norm": 0.78515625, + "learning_rate": 0.0001855987726732966, + "loss": 0.8977, + "step": 10809 + }, + { + "epoch": 0.27757048791486727, + "grad_norm": 0.71875, + "learning_rate": 0.0001855964646129489, + "loss": 0.9267, + "step": 10810 + }, + { + "epoch": 0.27759616511078905, + "grad_norm": 0.7734375, + "learning_rate": 0.00018559415638201508, + "loss": 0.9577, + "step": 10811 + }, + { + "epoch": 0.2776218423067109, + "grad_norm": 0.84375, + "learning_rate": 0.0001855918479804999, + "loss": 1.0114, + "step": 10812 + }, + { + "epoch": 0.2776475195026327, + "grad_norm": 0.765625, + "learning_rate": 0.0001855895394084079, + "loss": 0.9694, + "step": 10813 + }, + { + "epoch": 0.27767319669855456, + "grad_norm": 0.85546875, + "learning_rate": 0.00018558723066574366, + "loss": 1.049, + "step": 10814 + }, + { + "epoch": 0.27769887389447634, + "grad_norm": 0.7421875, + "learning_rate": 0.0001855849217525118, + "loss": 0.8911, + "step": 10815 + }, + { + "epoch": 0.2777245510903982, + "grad_norm": 0.828125, + "learning_rate": 0.00018558261266871692, + "loss": 0.9811, + "step": 10816 + }, + { + "epoch": 0.27775022828632, + "grad_norm": 0.88671875, + "learning_rate": 0.00018558030341436364, + "loss": 0.8929, + "step": 10817 + }, + { + "epoch": 0.2777759054822418, + "grad_norm": 0.83984375, + "learning_rate": 0.00018557799398945653, + "loss": 1.0016, + "step": 10818 + }, + { + "epoch": 0.2778015826781636, + "grad_norm": 0.78515625, + "learning_rate": 0.0001855756843940002, + "loss": 0.9342, + "step": 10819 + }, + { + "epoch": 0.27782725987408546, + "grad_norm": 0.82421875, + "learning_rate": 0.0001855733746279993, + "loss": 1.0364, + "step": 10820 + }, + { + "epoch": 0.27785293707000724, + "grad_norm": 0.84375, + "learning_rate": 0.00018557106469145834, + "loss": 0.9854, + "step": 10821 + }, + { + "epoch": 0.2778786142659291, + "grad_norm": 0.87109375, + "learning_rate": 0.000185568754584382, + "loss": 1.0178, + "step": 10822 + }, + { + "epoch": 0.2779042914618509, + "grad_norm": 0.74609375, + "learning_rate": 0.00018556644430677488, + "loss": 0.9167, + "step": 10823 + }, + { + "epoch": 0.27792996865777275, + "grad_norm": 0.94140625, + "learning_rate": 0.00018556413385864155, + "loss": 0.8885, + "step": 10824 + }, + { + "epoch": 0.27795564585369453, + "grad_norm": 0.84765625, + "learning_rate": 0.00018556182323998662, + "loss": 1.0538, + "step": 10825 + }, + { + "epoch": 0.27798132304961637, + "grad_norm": 0.796875, + "learning_rate": 0.00018555951245081476, + "loss": 0.9841, + "step": 10826 + }, + { + "epoch": 0.2780070002455382, + "grad_norm": 0.7734375, + "learning_rate": 0.00018555720149113046, + "loss": 0.9498, + "step": 10827 + }, + { + "epoch": 0.27803267744146, + "grad_norm": 0.81640625, + "learning_rate": 0.0001855548903609384, + "loss": 1.0964, + "step": 10828 + }, + { + "epoch": 0.2780583546373818, + "grad_norm": 0.74609375, + "learning_rate": 0.00018555257906024316, + "loss": 0.9621, + "step": 10829 + }, + { + "epoch": 0.27808403183330366, + "grad_norm": 0.72265625, + "learning_rate": 0.0001855502675890494, + "loss": 1.0261, + "step": 10830 + }, + { + "epoch": 0.27810970902922544, + "grad_norm": 0.7734375, + "learning_rate": 0.00018554795594736165, + "loss": 0.9865, + "step": 10831 + }, + { + "epoch": 0.2781353862251473, + "grad_norm": 0.7734375, + "learning_rate": 0.00018554564413518455, + "loss": 1.0326, + "step": 10832 + }, + { + "epoch": 0.2781610634210691, + "grad_norm": 0.87890625, + "learning_rate": 0.00018554333215252273, + "loss": 0.9759, + "step": 10833 + }, + { + "epoch": 0.27818674061699095, + "grad_norm": 0.87109375, + "learning_rate": 0.00018554101999938076, + "loss": 0.8866, + "step": 10834 + }, + { + "epoch": 0.2782124178129127, + "grad_norm": 1.0703125, + "learning_rate": 0.00018553870767576327, + "loss": 1.1801, + "step": 10835 + }, + { + "epoch": 0.27823809500883456, + "grad_norm": 0.734375, + "learning_rate": 0.00018553639518167488, + "loss": 0.9094, + "step": 10836 + }, + { + "epoch": 0.2782637722047564, + "grad_norm": 1.2109375, + "learning_rate": 0.00018553408251712015, + "loss": 0.8422, + "step": 10837 + }, + { + "epoch": 0.2782894494006782, + "grad_norm": 0.78515625, + "learning_rate": 0.00018553176968210373, + "loss": 1.1181, + "step": 10838 + }, + { + "epoch": 0.2783151265966, + "grad_norm": 0.77734375, + "learning_rate": 0.00018552945667663022, + "loss": 0.9033, + "step": 10839 + }, + { + "epoch": 0.27834080379252185, + "grad_norm": 0.74609375, + "learning_rate": 0.0001855271435007042, + "loss": 0.96, + "step": 10840 + }, + { + "epoch": 0.27836648098844363, + "grad_norm": 0.78515625, + "learning_rate": 0.00018552483015433035, + "loss": 0.9149, + "step": 10841 + }, + { + "epoch": 0.27839215818436547, + "grad_norm": 0.79296875, + "learning_rate": 0.00018552251663751324, + "loss": 1.0402, + "step": 10842 + }, + { + "epoch": 0.2784178353802873, + "grad_norm": 0.8046875, + "learning_rate": 0.00018552020295025746, + "loss": 0.9231, + "step": 10843 + }, + { + "epoch": 0.27844351257620914, + "grad_norm": 0.91015625, + "learning_rate": 0.00018551788909256764, + "loss": 0.9863, + "step": 10844 + }, + { + "epoch": 0.2784691897721309, + "grad_norm": 0.8125, + "learning_rate": 0.0001855155750644484, + "loss": 1.0443, + "step": 10845 + }, + { + "epoch": 0.27849486696805276, + "grad_norm": 0.8203125, + "learning_rate": 0.00018551326086590434, + "loss": 1.0536, + "step": 10846 + }, + { + "epoch": 0.2785205441639746, + "grad_norm": 0.75390625, + "learning_rate": 0.0001855109464969401, + "loss": 1.0906, + "step": 10847 + }, + { + "epoch": 0.2785462213598964, + "grad_norm": 0.82421875, + "learning_rate": 0.00018550863195756023, + "loss": 1.1944, + "step": 10848 + }, + { + "epoch": 0.2785718985558182, + "grad_norm": 1.3671875, + "learning_rate": 0.0001855063172477694, + "loss": 1.0273, + "step": 10849 + }, + { + "epoch": 0.27859757575174005, + "grad_norm": 0.8125, + "learning_rate": 0.00018550400236757222, + "loss": 0.9871, + "step": 10850 + }, + { + "epoch": 0.2786232529476618, + "grad_norm": 0.74609375, + "learning_rate": 0.00018550168731697327, + "loss": 0.9767, + "step": 10851 + }, + { + "epoch": 0.27864893014358366, + "grad_norm": 0.8984375, + "learning_rate": 0.00018549937209597715, + "loss": 0.9834, + "step": 10852 + }, + { + "epoch": 0.2786746073395055, + "grad_norm": 0.8515625, + "learning_rate": 0.00018549705670458852, + "loss": 0.8732, + "step": 10853 + }, + { + "epoch": 0.27870028453542733, + "grad_norm": 0.77734375, + "learning_rate": 0.000185494741142812, + "loss": 0.8825, + "step": 10854 + }, + { + "epoch": 0.2787259617313491, + "grad_norm": 0.81640625, + "learning_rate": 0.00018549242541065217, + "loss": 0.9189, + "step": 10855 + }, + { + "epoch": 0.27875163892727095, + "grad_norm": 0.80078125, + "learning_rate": 0.00018549010950811366, + "loss": 0.8352, + "step": 10856 + }, + { + "epoch": 0.2787773161231928, + "grad_norm": 0.76953125, + "learning_rate": 0.0001854877934352011, + "loss": 1.0117, + "step": 10857 + }, + { + "epoch": 0.27880299331911457, + "grad_norm": 0.76171875, + "learning_rate": 0.00018548547719191905, + "loss": 1.0125, + "step": 10858 + }, + { + "epoch": 0.2788286705150364, + "grad_norm": 0.79296875, + "learning_rate": 0.00018548316077827222, + "loss": 0.8874, + "step": 10859 + }, + { + "epoch": 0.27885434771095824, + "grad_norm": 0.82421875, + "learning_rate": 0.00018548084419426513, + "loss": 1.0999, + "step": 10860 + }, + { + "epoch": 0.27888002490688, + "grad_norm": 0.83203125, + "learning_rate": 0.00018547852743990245, + "loss": 0.854, + "step": 10861 + }, + { + "epoch": 0.27890570210280186, + "grad_norm": 0.765625, + "learning_rate": 0.00018547621051518882, + "loss": 0.9697, + "step": 10862 + }, + { + "epoch": 0.2789313792987237, + "grad_norm": 0.75, + "learning_rate": 0.00018547389342012878, + "loss": 1.1409, + "step": 10863 + }, + { + "epoch": 0.27895705649464553, + "grad_norm": 0.79296875, + "learning_rate": 0.000185471576154727, + "loss": 0.9644, + "step": 10864 + }, + { + "epoch": 0.2789827336905673, + "grad_norm": 0.72265625, + "learning_rate": 0.00018546925871898808, + "loss": 0.9183, + "step": 10865 + }, + { + "epoch": 0.27900841088648914, + "grad_norm": 0.82421875, + "learning_rate": 0.00018546694111291667, + "loss": 0.9674, + "step": 10866 + }, + { + "epoch": 0.279034088082411, + "grad_norm": 0.79296875, + "learning_rate": 0.00018546462333651737, + "loss": 0.971, + "step": 10867 + }, + { + "epoch": 0.27905976527833276, + "grad_norm": 0.796875, + "learning_rate": 0.00018546230538979476, + "loss": 1.0058, + "step": 10868 + }, + { + "epoch": 0.2790854424742546, + "grad_norm": 0.82421875, + "learning_rate": 0.00018545998727275353, + "loss": 0.9668, + "step": 10869 + }, + { + "epoch": 0.27911111967017643, + "grad_norm": 0.83203125, + "learning_rate": 0.00018545766898539826, + "loss": 0.9885, + "step": 10870 + }, + { + "epoch": 0.2791367968660982, + "grad_norm": 0.85546875, + "learning_rate": 0.00018545535052773358, + "loss": 0.9227, + "step": 10871 + }, + { + "epoch": 0.27916247406202005, + "grad_norm": 0.75390625, + "learning_rate": 0.00018545303189976408, + "loss": 0.9184, + "step": 10872 + }, + { + "epoch": 0.2791881512579419, + "grad_norm": 0.8125, + "learning_rate": 0.00018545071310149442, + "loss": 1.0235, + "step": 10873 + }, + { + "epoch": 0.2792138284538637, + "grad_norm": 0.80078125, + "learning_rate": 0.0001854483941329292, + "loss": 0.9454, + "step": 10874 + }, + { + "epoch": 0.2792395056497855, + "grad_norm": 0.8125, + "learning_rate": 0.00018544607499407304, + "loss": 1.1094, + "step": 10875 + }, + { + "epoch": 0.27926518284570734, + "grad_norm": 0.89453125, + "learning_rate": 0.00018544375568493063, + "loss": 0.924, + "step": 10876 + }, + { + "epoch": 0.2792908600416292, + "grad_norm": 0.76171875, + "learning_rate": 0.00018544143620550648, + "loss": 1.0384, + "step": 10877 + }, + { + "epoch": 0.27931653723755095, + "grad_norm": 0.80078125, + "learning_rate": 0.00018543911655580526, + "loss": 0.7778, + "step": 10878 + }, + { + "epoch": 0.2793422144334728, + "grad_norm": 0.74609375, + "learning_rate": 0.0001854367967358316, + "loss": 0.9885, + "step": 10879 + }, + { + "epoch": 0.2793678916293946, + "grad_norm": 0.7265625, + "learning_rate": 0.00018543447674559016, + "loss": 0.9155, + "step": 10880 + }, + { + "epoch": 0.2793935688253164, + "grad_norm": 0.796875, + "learning_rate": 0.00018543215658508547, + "loss": 0.8482, + "step": 10881 + }, + { + "epoch": 0.27941924602123824, + "grad_norm": 0.84765625, + "learning_rate": 0.00018542983625432228, + "loss": 1.0502, + "step": 10882 + }, + { + "epoch": 0.2794449232171601, + "grad_norm": 0.796875, + "learning_rate": 0.0001854275157533051, + "loss": 1.0102, + "step": 10883 + }, + { + "epoch": 0.2794706004130819, + "grad_norm": 0.74609375, + "learning_rate": 0.0001854251950820386, + "loss": 0.8012, + "step": 10884 + }, + { + "epoch": 0.2794962776090037, + "grad_norm": 0.80078125, + "learning_rate": 0.0001854228742405274, + "loss": 0.9358, + "step": 10885 + }, + { + "epoch": 0.27952195480492553, + "grad_norm": 0.8203125, + "learning_rate": 0.00018542055322877613, + "loss": 0.9791, + "step": 10886 + }, + { + "epoch": 0.27954763200084737, + "grad_norm": 0.83984375, + "learning_rate": 0.0001854182320467894, + "loss": 1.0231, + "step": 10887 + }, + { + "epoch": 0.27957330919676915, + "grad_norm": 0.796875, + "learning_rate": 0.00018541591069457186, + "loss": 0.9264, + "step": 10888 + }, + { + "epoch": 0.279598986392691, + "grad_norm": 0.78515625, + "learning_rate": 0.00018541358917212815, + "loss": 0.9107, + "step": 10889 + }, + { + "epoch": 0.2796246635886128, + "grad_norm": 0.7890625, + "learning_rate": 0.00018541126747946286, + "loss": 0.9462, + "step": 10890 + }, + { + "epoch": 0.2796503407845346, + "grad_norm": 0.76171875, + "learning_rate": 0.00018540894561658061, + "loss": 0.8338, + "step": 10891 + }, + { + "epoch": 0.27967601798045644, + "grad_norm": 0.87109375, + "learning_rate": 0.00018540662358348606, + "loss": 1.1243, + "step": 10892 + }, + { + "epoch": 0.2797016951763783, + "grad_norm": 0.7734375, + "learning_rate": 0.00018540430138018382, + "loss": 0.9127, + "step": 10893 + }, + { + "epoch": 0.2797273723723001, + "grad_norm": 0.87890625, + "learning_rate": 0.0001854019790066785, + "loss": 1.0809, + "step": 10894 + }, + { + "epoch": 0.2797530495682219, + "grad_norm": 0.7890625, + "learning_rate": 0.0001853996564629748, + "loss": 0.8076, + "step": 10895 + }, + { + "epoch": 0.2797787267641437, + "grad_norm": 0.7890625, + "learning_rate": 0.00018539733374907727, + "loss": 0.9725, + "step": 10896 + }, + { + "epoch": 0.27980440396006556, + "grad_norm": 0.76171875, + "learning_rate": 0.00018539501086499056, + "loss": 0.9883, + "step": 10897 + }, + { + "epoch": 0.27983008115598734, + "grad_norm": 0.80859375, + "learning_rate": 0.0001853926878107193, + "loss": 0.9574, + "step": 10898 + }, + { + "epoch": 0.2798557583519092, + "grad_norm": 0.78125, + "learning_rate": 0.00018539036458626817, + "loss": 0.9473, + "step": 10899 + }, + { + "epoch": 0.279881435547831, + "grad_norm": 0.87109375, + "learning_rate": 0.0001853880411916417, + "loss": 0.9216, + "step": 10900 + }, + { + "epoch": 0.2799071127437528, + "grad_norm": 0.78515625, + "learning_rate": 0.00018538571762684463, + "loss": 0.8761, + "step": 10901 + }, + { + "epoch": 0.27993278993967463, + "grad_norm": 0.85546875, + "learning_rate": 0.0001853833938918815, + "loss": 1.0367, + "step": 10902 + }, + { + "epoch": 0.27995846713559647, + "grad_norm": 0.703125, + "learning_rate": 0.000185381069986757, + "loss": 0.9185, + "step": 10903 + }, + { + "epoch": 0.2799841443315183, + "grad_norm": 0.87890625, + "learning_rate": 0.00018537874591147575, + "loss": 1.0027, + "step": 10904 + }, + { + "epoch": 0.2800098215274401, + "grad_norm": 1.2109375, + "learning_rate": 0.00018537642166604234, + "loss": 1.1006, + "step": 10905 + }, + { + "epoch": 0.2800354987233619, + "grad_norm": 0.77734375, + "learning_rate": 0.00018537409725046146, + "loss": 1.0527, + "step": 10906 + }, + { + "epoch": 0.28006117591928376, + "grad_norm": 0.8515625, + "learning_rate": 0.0001853717726647377, + "loss": 1.0475, + "step": 10907 + }, + { + "epoch": 0.28008685311520554, + "grad_norm": 0.796875, + "learning_rate": 0.0001853694479088757, + "loss": 0.9604, + "step": 10908 + }, + { + "epoch": 0.2801125303111274, + "grad_norm": 0.828125, + "learning_rate": 0.00018536712298288012, + "loss": 0.9859, + "step": 10909 + }, + { + "epoch": 0.2801382075070492, + "grad_norm": 0.8515625, + "learning_rate": 0.0001853647978867556, + "loss": 0.9703, + "step": 10910 + }, + { + "epoch": 0.280163884702971, + "grad_norm": 0.83203125, + "learning_rate": 0.0001853624726205067, + "loss": 0.9791, + "step": 10911 + }, + { + "epoch": 0.2801895618988928, + "grad_norm": 0.8203125, + "learning_rate": 0.00018536014718413814, + "loss": 0.8956, + "step": 10912 + }, + { + "epoch": 0.28021523909481466, + "grad_norm": 0.8828125, + "learning_rate": 0.0001853578215776545, + "loss": 0.9946, + "step": 10913 + }, + { + "epoch": 0.2802409162907365, + "grad_norm": 0.80859375, + "learning_rate": 0.00018535549580106042, + "loss": 0.9409, + "step": 10914 + }, + { + "epoch": 0.2802665934866583, + "grad_norm": 0.734375, + "learning_rate": 0.00018535316985436056, + "loss": 0.905, + "step": 10915 + }, + { + "epoch": 0.2802922706825801, + "grad_norm": 0.6953125, + "learning_rate": 0.00018535084373755953, + "loss": 0.9787, + "step": 10916 + }, + { + "epoch": 0.28031794787850195, + "grad_norm": 0.79296875, + "learning_rate": 0.000185348517450662, + "loss": 0.9368, + "step": 10917 + }, + { + "epoch": 0.28034362507442373, + "grad_norm": 0.80078125, + "learning_rate": 0.00018534619099367256, + "loss": 0.9839, + "step": 10918 + }, + { + "epoch": 0.28036930227034557, + "grad_norm": 0.7421875, + "learning_rate": 0.0001853438643665959, + "loss": 0.8143, + "step": 10919 + }, + { + "epoch": 0.2803949794662674, + "grad_norm": 0.76953125, + "learning_rate": 0.00018534153756943658, + "loss": 1.0033, + "step": 10920 + }, + { + "epoch": 0.2804206566621892, + "grad_norm": 0.76171875, + "learning_rate": 0.00018533921060219933, + "loss": 0.9602, + "step": 10921 + }, + { + "epoch": 0.280446333858111, + "grad_norm": 0.9140625, + "learning_rate": 0.00018533688346488873, + "loss": 0.9138, + "step": 10922 + }, + { + "epoch": 0.28047201105403285, + "grad_norm": 0.81640625, + "learning_rate": 0.00018533455615750942, + "loss": 0.9169, + "step": 10923 + }, + { + "epoch": 0.2804976882499547, + "grad_norm": 0.78515625, + "learning_rate": 0.00018533222868006606, + "loss": 0.9498, + "step": 10924 + }, + { + "epoch": 0.28052336544587647, + "grad_norm": 0.8125, + "learning_rate": 0.00018532990103256326, + "loss": 0.8365, + "step": 10925 + }, + { + "epoch": 0.2805490426417983, + "grad_norm": 0.8046875, + "learning_rate": 0.0001853275732150057, + "loss": 1.0214, + "step": 10926 + }, + { + "epoch": 0.28057471983772014, + "grad_norm": 0.80078125, + "learning_rate": 0.00018532524522739798, + "loss": 1.0398, + "step": 10927 + }, + { + "epoch": 0.2806003970336419, + "grad_norm": 0.8671875, + "learning_rate": 0.00018532291706974477, + "loss": 1.124, + "step": 10928 + }, + { + "epoch": 0.28062607422956376, + "grad_norm": 0.75, + "learning_rate": 0.00018532058874205066, + "loss": 0.9768, + "step": 10929 + }, + { + "epoch": 0.2806517514254856, + "grad_norm": 0.8125, + "learning_rate": 0.00018531826024432036, + "loss": 1.0735, + "step": 10930 + }, + { + "epoch": 0.2806774286214074, + "grad_norm": 0.75, + "learning_rate": 0.00018531593157655843, + "loss": 0.9077, + "step": 10931 + }, + { + "epoch": 0.2807031058173292, + "grad_norm": 0.9765625, + "learning_rate": 0.00018531360273876959, + "loss": 0.9717, + "step": 10932 + }, + { + "epoch": 0.28072878301325105, + "grad_norm": 0.828125, + "learning_rate": 0.00018531127373095843, + "loss": 1.0236, + "step": 10933 + }, + { + "epoch": 0.2807544602091729, + "grad_norm": 0.8046875, + "learning_rate": 0.0001853089445531296, + "loss": 0.9572, + "step": 10934 + }, + { + "epoch": 0.28078013740509467, + "grad_norm": 0.88671875, + "learning_rate": 0.0001853066152052878, + "loss": 1.1016, + "step": 10935 + }, + { + "epoch": 0.2808058146010165, + "grad_norm": 0.72265625, + "learning_rate": 0.0001853042856874376, + "loss": 0.8618, + "step": 10936 + }, + { + "epoch": 0.28083149179693834, + "grad_norm": 0.83984375, + "learning_rate": 0.0001853019559995836, + "loss": 1.0328, + "step": 10937 + }, + { + "epoch": 0.2808571689928601, + "grad_norm": 0.88671875, + "learning_rate": 0.00018529962614173058, + "loss": 0.9229, + "step": 10938 + }, + { + "epoch": 0.28088284618878195, + "grad_norm": 0.80859375, + "learning_rate": 0.00018529729611388307, + "loss": 1.0235, + "step": 10939 + }, + { + "epoch": 0.2809085233847038, + "grad_norm": 0.83203125, + "learning_rate": 0.0001852949659160458, + "loss": 0.9333, + "step": 10940 + }, + { + "epoch": 0.28093420058062557, + "grad_norm": 1.046875, + "learning_rate": 0.0001852926355482233, + "loss": 0.9072, + "step": 10941 + }, + { + "epoch": 0.2809598777765474, + "grad_norm": 1.1796875, + "learning_rate": 0.00018529030501042036, + "loss": 1.0616, + "step": 10942 + }, + { + "epoch": 0.28098555497246924, + "grad_norm": 0.76171875, + "learning_rate": 0.00018528797430264148, + "loss": 0.9377, + "step": 10943 + }, + { + "epoch": 0.2810112321683911, + "grad_norm": 0.80859375, + "learning_rate": 0.0001852856434248914, + "loss": 1.0318, + "step": 10944 + }, + { + "epoch": 0.28103690936431286, + "grad_norm": 0.7578125, + "learning_rate": 0.00018528331237717473, + "loss": 0.8276, + "step": 10945 + }, + { + "epoch": 0.2810625865602347, + "grad_norm": 0.7734375, + "learning_rate": 0.0001852809811594961, + "loss": 0.9135, + "step": 10946 + }, + { + "epoch": 0.28108826375615653, + "grad_norm": 0.796875, + "learning_rate": 0.00018527864977186024, + "loss": 1.0481, + "step": 10947 + }, + { + "epoch": 0.2811139409520783, + "grad_norm": 0.85546875, + "learning_rate": 0.0001852763182142717, + "loss": 0.966, + "step": 10948 + }, + { + "epoch": 0.28113961814800015, + "grad_norm": 0.7421875, + "learning_rate": 0.00018527398648673514, + "loss": 0.9392, + "step": 10949 + }, + { + "epoch": 0.281165295343922, + "grad_norm": 0.76171875, + "learning_rate": 0.00018527165458925526, + "loss": 0.8943, + "step": 10950 + }, + { + "epoch": 0.28119097253984376, + "grad_norm": 0.88671875, + "learning_rate": 0.00018526932252183666, + "loss": 1.1684, + "step": 10951 + }, + { + "epoch": 0.2812166497357656, + "grad_norm": 0.76953125, + "learning_rate": 0.000185266990284484, + "loss": 0.9568, + "step": 10952 + }, + { + "epoch": 0.28124232693168744, + "grad_norm": 0.94140625, + "learning_rate": 0.00018526465787720194, + "loss": 1.0675, + "step": 10953 + }, + { + "epoch": 0.2812680041276092, + "grad_norm": 0.80078125, + "learning_rate": 0.00018526232529999511, + "loss": 0.8694, + "step": 10954 + }, + { + "epoch": 0.28129368132353105, + "grad_norm": 0.80078125, + "learning_rate": 0.00018525999255286817, + "loss": 1.0753, + "step": 10955 + }, + { + "epoch": 0.2813193585194529, + "grad_norm": 0.8515625, + "learning_rate": 0.00018525765963582576, + "loss": 1.0978, + "step": 10956 + }, + { + "epoch": 0.2813450357153747, + "grad_norm": 0.8203125, + "learning_rate": 0.00018525532654887255, + "loss": 0.9819, + "step": 10957 + }, + { + "epoch": 0.2813707129112965, + "grad_norm": 0.8515625, + "learning_rate": 0.00018525299329201316, + "loss": 1.0062, + "step": 10958 + }, + { + "epoch": 0.28139639010721834, + "grad_norm": 0.74609375, + "learning_rate": 0.0001852506598652523, + "loss": 0.8367, + "step": 10959 + }, + { + "epoch": 0.2814220673031402, + "grad_norm": 0.84765625, + "learning_rate": 0.0001852483262685945, + "loss": 1.028, + "step": 10960 + }, + { + "epoch": 0.28144774449906196, + "grad_norm": 0.7890625, + "learning_rate": 0.00018524599250204453, + "loss": 1.0111, + "step": 10961 + }, + { + "epoch": 0.2814734216949838, + "grad_norm": 0.87890625, + "learning_rate": 0.000185243658565607, + "loss": 1.0497, + "step": 10962 + }, + { + "epoch": 0.28149909889090563, + "grad_norm": 0.83203125, + "learning_rate": 0.00018524132445928656, + "loss": 1.001, + "step": 10963 + }, + { + "epoch": 0.2815247760868274, + "grad_norm": 0.79296875, + "learning_rate": 0.00018523899018308784, + "loss": 1.1348, + "step": 10964 + }, + { + "epoch": 0.28155045328274925, + "grad_norm": 0.77734375, + "learning_rate": 0.0001852366557370155, + "loss": 0.9298, + "step": 10965 + }, + { + "epoch": 0.2815761304786711, + "grad_norm": 0.88671875, + "learning_rate": 0.00018523432112107423, + "loss": 1.0757, + "step": 10966 + }, + { + "epoch": 0.2816018076745929, + "grad_norm": 0.79296875, + "learning_rate": 0.00018523198633526867, + "loss": 0.9636, + "step": 10967 + }, + { + "epoch": 0.2816274848705147, + "grad_norm": 0.83984375, + "learning_rate": 0.00018522965137960344, + "loss": 1.0647, + "step": 10968 + }, + { + "epoch": 0.28165316206643654, + "grad_norm": 0.765625, + "learning_rate": 0.00018522731625408322, + "loss": 0.8982, + "step": 10969 + }, + { + "epoch": 0.28167883926235837, + "grad_norm": 1.0703125, + "learning_rate": 0.00018522498095871264, + "loss": 0.9929, + "step": 10970 + }, + { + "epoch": 0.28170451645828015, + "grad_norm": 0.99609375, + "learning_rate": 0.0001852226454934964, + "loss": 1.0305, + "step": 10971 + }, + { + "epoch": 0.281730193654202, + "grad_norm": 0.79296875, + "learning_rate": 0.0001852203098584391, + "loss": 1.0324, + "step": 10972 + }, + { + "epoch": 0.2817558708501238, + "grad_norm": 0.796875, + "learning_rate": 0.00018521797405354545, + "loss": 0.9399, + "step": 10973 + }, + { + "epoch": 0.2817815480460456, + "grad_norm": 0.78125, + "learning_rate": 0.00018521563807882005, + "loss": 0.9737, + "step": 10974 + }, + { + "epoch": 0.28180722524196744, + "grad_norm": 0.8046875, + "learning_rate": 0.0001852133019342676, + "loss": 1.034, + "step": 10975 + }, + { + "epoch": 0.2818329024378893, + "grad_norm": 0.77734375, + "learning_rate": 0.00018521096561989272, + "loss": 0.9069, + "step": 10976 + }, + { + "epoch": 0.2818585796338111, + "grad_norm": 0.984375, + "learning_rate": 0.00018520862913570007, + "loss": 1.006, + "step": 10977 + }, + { + "epoch": 0.2818842568297329, + "grad_norm": 0.84765625, + "learning_rate": 0.00018520629248169434, + "loss": 0.9815, + "step": 10978 + }, + { + "epoch": 0.28190993402565473, + "grad_norm": 0.76953125, + "learning_rate": 0.0001852039556578802, + "loss": 0.954, + "step": 10979 + }, + { + "epoch": 0.28193561122157657, + "grad_norm": 0.84765625, + "learning_rate": 0.0001852016186642622, + "loss": 0.9363, + "step": 10980 + }, + { + "epoch": 0.28196128841749835, + "grad_norm": 0.8125, + "learning_rate": 0.0001851992815008451, + "loss": 0.9475, + "step": 10981 + }, + { + "epoch": 0.2819869656134202, + "grad_norm": 0.87109375, + "learning_rate": 0.00018519694416763353, + "loss": 1.1088, + "step": 10982 + }, + { + "epoch": 0.282012642809342, + "grad_norm": 0.79296875, + "learning_rate": 0.00018519460666463217, + "loss": 1.1135, + "step": 10983 + }, + { + "epoch": 0.2820383200052638, + "grad_norm": 0.99609375, + "learning_rate": 0.00018519226899184563, + "loss": 0.9965, + "step": 10984 + }, + { + "epoch": 0.28206399720118563, + "grad_norm": 0.83203125, + "learning_rate": 0.00018518993114927858, + "loss": 0.8855, + "step": 10985 + }, + { + "epoch": 0.28208967439710747, + "grad_norm": 0.73828125, + "learning_rate": 0.0001851875931369357, + "loss": 0.9784, + "step": 10986 + }, + { + "epoch": 0.2821153515930293, + "grad_norm": 0.78515625, + "learning_rate": 0.0001851852549548217, + "loss": 0.9724, + "step": 10987 + }, + { + "epoch": 0.2821410287889511, + "grad_norm": 0.8515625, + "learning_rate": 0.0001851829166029411, + "loss": 0.879, + "step": 10988 + }, + { + "epoch": 0.2821667059848729, + "grad_norm": 0.74609375, + "learning_rate": 0.0001851805780812987, + "loss": 0.8893, + "step": 10989 + }, + { + "epoch": 0.28219238318079476, + "grad_norm": 0.73828125, + "learning_rate": 0.00018517823938989905, + "loss": 0.9413, + "step": 10990 + }, + { + "epoch": 0.28221806037671654, + "grad_norm": 0.703125, + "learning_rate": 0.0001851759005287469, + "loss": 0.8968, + "step": 10991 + }, + { + "epoch": 0.2822437375726384, + "grad_norm": 0.796875, + "learning_rate": 0.00018517356149784687, + "loss": 0.9355, + "step": 10992 + }, + { + "epoch": 0.2822694147685602, + "grad_norm": 0.8671875, + "learning_rate": 0.0001851712222972036, + "loss": 0.8836, + "step": 10993 + }, + { + "epoch": 0.282295091964482, + "grad_norm": 1.0078125, + "learning_rate": 0.00018516888292682178, + "loss": 1.066, + "step": 10994 + }, + { + "epoch": 0.28232076916040383, + "grad_norm": 0.7890625, + "learning_rate": 0.0001851665433867061, + "loss": 0.8868, + "step": 10995 + }, + { + "epoch": 0.28234644635632566, + "grad_norm": 0.796875, + "learning_rate": 0.0001851642036768612, + "loss": 1.0151, + "step": 10996 + }, + { + "epoch": 0.2823721235522475, + "grad_norm": 0.73046875, + "learning_rate": 0.0001851618637972917, + "loss": 1.0137, + "step": 10997 + }, + { + "epoch": 0.2823978007481693, + "grad_norm": 0.7265625, + "learning_rate": 0.0001851595237480023, + "loss": 0.9635, + "step": 10998 + }, + { + "epoch": 0.2824234779440911, + "grad_norm": 0.73828125, + "learning_rate": 0.00018515718352899772, + "loss": 1.0394, + "step": 10999 + }, + { + "epoch": 0.28244915514001295, + "grad_norm": 0.8203125, + "learning_rate": 0.00018515484314028248, + "loss": 0.9893, + "step": 11000 + }, + { + "epoch": 0.28244915514001295, + "eval_loss": 0.9871212840080261, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 408.083, + "eval_samples_per_second": 24.505, + "eval_steps_per_second": 0.767, + "step": 11000 + }, + { + "epoch": 0.28247483233593473, + "grad_norm": 0.82421875, + "learning_rate": 0.0001851525025818614, + "loss": 1.0285, + "step": 11001 + }, + { + "epoch": 0.28250050953185657, + "grad_norm": 0.78125, + "learning_rate": 0.00018515016185373902, + "loss": 0.888, + "step": 11002 + }, + { + "epoch": 0.2825261867277784, + "grad_norm": 0.71875, + "learning_rate": 0.0001851478209559201, + "loss": 0.8644, + "step": 11003 + }, + { + "epoch": 0.2825518639237002, + "grad_norm": 0.7890625, + "learning_rate": 0.00018514547988840923, + "loss": 1.0262, + "step": 11004 + }, + { + "epoch": 0.282577541119622, + "grad_norm": 0.8046875, + "learning_rate": 0.00018514313865121114, + "loss": 1.0273, + "step": 11005 + }, + { + "epoch": 0.28260321831554386, + "grad_norm": 0.8125, + "learning_rate": 0.00018514079724433045, + "loss": 1.0615, + "step": 11006 + }, + { + "epoch": 0.2826288955114657, + "grad_norm": 0.7890625, + "learning_rate": 0.00018513845566777183, + "loss": 0.9902, + "step": 11007 + }, + { + "epoch": 0.2826545727073875, + "grad_norm": 0.80859375, + "learning_rate": 0.00018513611392153998, + "loss": 0.9625, + "step": 11008 + }, + { + "epoch": 0.2826802499033093, + "grad_norm": 0.90234375, + "learning_rate": 0.00018513377200563954, + "loss": 1.0393, + "step": 11009 + }, + { + "epoch": 0.28270592709923115, + "grad_norm": 0.83203125, + "learning_rate": 0.00018513142992007518, + "loss": 1.0707, + "step": 11010 + }, + { + "epoch": 0.2827316042951529, + "grad_norm": 0.80078125, + "learning_rate": 0.00018512908766485158, + "loss": 0.9859, + "step": 11011 + }, + { + "epoch": 0.28275728149107476, + "grad_norm": 0.7734375, + "learning_rate": 0.00018512674523997339, + "loss": 1.078, + "step": 11012 + }, + { + "epoch": 0.2827829586869966, + "grad_norm": 0.78515625, + "learning_rate": 0.00018512440264544528, + "loss": 1.0955, + "step": 11013 + }, + { + "epoch": 0.2828086358829184, + "grad_norm": 0.71875, + "learning_rate": 0.00018512205988127192, + "loss": 0.8632, + "step": 11014 + }, + { + "epoch": 0.2828343130788402, + "grad_norm": 0.7578125, + "learning_rate": 0.00018511971694745797, + "loss": 0.9825, + "step": 11015 + }, + { + "epoch": 0.28285999027476205, + "grad_norm": 0.77734375, + "learning_rate": 0.00018511737384400813, + "loss": 0.8986, + "step": 11016 + }, + { + "epoch": 0.2828856674706839, + "grad_norm": 0.74609375, + "learning_rate": 0.0001851150305709271, + "loss": 1.0257, + "step": 11017 + }, + { + "epoch": 0.28291134466660567, + "grad_norm": 0.8046875, + "learning_rate": 0.00018511268712821942, + "loss": 0.9358, + "step": 11018 + }, + { + "epoch": 0.2829370218625275, + "grad_norm": 0.71484375, + "learning_rate": 0.00018511034351588988, + "loss": 1.0402, + "step": 11019 + }, + { + "epoch": 0.28296269905844934, + "grad_norm": 0.77734375, + "learning_rate": 0.00018510799973394315, + "loss": 1.1053, + "step": 11020 + }, + { + "epoch": 0.2829883762543711, + "grad_norm": 0.78125, + "learning_rate": 0.00018510565578238382, + "loss": 1.0536, + "step": 11021 + }, + { + "epoch": 0.28301405345029296, + "grad_norm": 0.75, + "learning_rate": 0.00018510331166121663, + "loss": 0.9573, + "step": 11022 + }, + { + "epoch": 0.2830397306462148, + "grad_norm": 0.8828125, + "learning_rate": 0.0001851009673704462, + "loss": 1.0108, + "step": 11023 + }, + { + "epoch": 0.2830654078421366, + "grad_norm": 0.79296875, + "learning_rate": 0.00018509862291007725, + "loss": 0.9993, + "step": 11024 + }, + { + "epoch": 0.2830910850380584, + "grad_norm": 0.7578125, + "learning_rate": 0.0001850962782801144, + "loss": 0.9745, + "step": 11025 + }, + { + "epoch": 0.28311676223398025, + "grad_norm": 0.80078125, + "learning_rate": 0.00018509393348056238, + "loss": 0.9195, + "step": 11026 + }, + { + "epoch": 0.2831424394299021, + "grad_norm": 0.80078125, + "learning_rate": 0.00018509158851142584, + "loss": 0.9071, + "step": 11027 + }, + { + "epoch": 0.28316811662582386, + "grad_norm": 0.80859375, + "learning_rate": 0.00018508924337270945, + "loss": 0.976, + "step": 11028 + }, + { + "epoch": 0.2831937938217457, + "grad_norm": 0.78515625, + "learning_rate": 0.00018508689806441789, + "loss": 1.0701, + "step": 11029 + }, + { + "epoch": 0.28321947101766753, + "grad_norm": 0.8046875, + "learning_rate": 0.00018508455258655582, + "loss": 0.8828, + "step": 11030 + }, + { + "epoch": 0.2832451482135893, + "grad_norm": 1.21875, + "learning_rate": 0.0001850822069391279, + "loss": 1.0164, + "step": 11031 + }, + { + "epoch": 0.28327082540951115, + "grad_norm": 0.78515625, + "learning_rate": 0.00018507986112213884, + "loss": 0.9872, + "step": 11032 + }, + { + "epoch": 0.283296502605433, + "grad_norm": 0.796875, + "learning_rate": 0.00018507751513559332, + "loss": 0.9029, + "step": 11033 + }, + { + "epoch": 0.28332217980135477, + "grad_norm": 0.8515625, + "learning_rate": 0.000185075168979496, + "loss": 0.957, + "step": 11034 + }, + { + "epoch": 0.2833478569972766, + "grad_norm": 0.84375, + "learning_rate": 0.00018507282265385153, + "loss": 0.9484, + "step": 11035 + }, + { + "epoch": 0.28337353419319844, + "grad_norm": 0.7421875, + "learning_rate": 0.00018507047615866462, + "loss": 0.932, + "step": 11036 + }, + { + "epoch": 0.2833992113891203, + "grad_norm": 0.83203125, + "learning_rate": 0.00018506812949393994, + "loss": 0.9728, + "step": 11037 + }, + { + "epoch": 0.28342488858504206, + "grad_norm": 0.83984375, + "learning_rate": 0.00018506578265968214, + "loss": 0.9813, + "step": 11038 + }, + { + "epoch": 0.2834505657809639, + "grad_norm": 0.65625, + "learning_rate": 0.00018506343565589595, + "loss": 0.8701, + "step": 11039 + }, + { + "epoch": 0.28347624297688573, + "grad_norm": 0.9609375, + "learning_rate": 0.00018506108848258599, + "loss": 1.0239, + "step": 11040 + }, + { + "epoch": 0.2835019201728075, + "grad_norm": 0.8046875, + "learning_rate": 0.00018505874113975698, + "loss": 0.9645, + "step": 11041 + }, + { + "epoch": 0.28352759736872934, + "grad_norm": 0.91796875, + "learning_rate": 0.00018505639362741358, + "loss": 0.9968, + "step": 11042 + }, + { + "epoch": 0.2835532745646512, + "grad_norm": 0.7578125, + "learning_rate": 0.0001850540459455605, + "loss": 0.9187, + "step": 11043 + }, + { + "epoch": 0.28357895176057296, + "grad_norm": 0.7890625, + "learning_rate": 0.00018505169809420233, + "loss": 0.9163, + "step": 11044 + }, + { + "epoch": 0.2836046289564948, + "grad_norm": 0.75390625, + "learning_rate": 0.00018504935007334383, + "loss": 0.9143, + "step": 11045 + }, + { + "epoch": 0.28363030615241663, + "grad_norm": 0.8984375, + "learning_rate": 0.00018504700188298967, + "loss": 0.9153, + "step": 11046 + }, + { + "epoch": 0.28365598334833847, + "grad_norm": 0.76953125, + "learning_rate": 0.0001850446535231445, + "loss": 0.9854, + "step": 11047 + }, + { + "epoch": 0.28368166054426025, + "grad_norm": 0.76953125, + "learning_rate": 0.00018504230499381304, + "loss": 0.8207, + "step": 11048 + }, + { + "epoch": 0.2837073377401821, + "grad_norm": 0.81640625, + "learning_rate": 0.00018503995629499994, + "loss": 1.1063, + "step": 11049 + }, + { + "epoch": 0.2837330149361039, + "grad_norm": 0.75390625, + "learning_rate": 0.00018503760742670988, + "loss": 0.8579, + "step": 11050 + }, + { + "epoch": 0.2837586921320257, + "grad_norm": 0.78515625, + "learning_rate": 0.00018503525838894756, + "loss": 0.8897, + "step": 11051 + }, + { + "epoch": 0.28378436932794754, + "grad_norm": 0.89453125, + "learning_rate": 0.00018503290918171766, + "loss": 0.9122, + "step": 11052 + }, + { + "epoch": 0.2838100465238694, + "grad_norm": 0.74609375, + "learning_rate": 0.00018503055980502483, + "loss": 0.8441, + "step": 11053 + }, + { + "epoch": 0.28383572371979116, + "grad_norm": 0.8828125, + "learning_rate": 0.0001850282102588738, + "loss": 1.0557, + "step": 11054 + }, + { + "epoch": 0.283861400915713, + "grad_norm": 0.88671875, + "learning_rate": 0.00018502586054326917, + "loss": 1.015, + "step": 11055 + }, + { + "epoch": 0.2838870781116348, + "grad_norm": 0.74609375, + "learning_rate": 0.00018502351065821575, + "loss": 1.0172, + "step": 11056 + }, + { + "epoch": 0.28391275530755666, + "grad_norm": 0.7734375, + "learning_rate": 0.00018502116060371813, + "loss": 0.8735, + "step": 11057 + }, + { + "epoch": 0.28393843250347844, + "grad_norm": 0.8671875, + "learning_rate": 0.000185018810379781, + "loss": 1.0216, + "step": 11058 + }, + { + "epoch": 0.2839641096994003, + "grad_norm": 0.8359375, + "learning_rate": 0.00018501645998640908, + "loss": 0.938, + "step": 11059 + }, + { + "epoch": 0.2839897868953221, + "grad_norm": 0.82421875, + "learning_rate": 0.00018501410942360703, + "loss": 1.0765, + "step": 11060 + }, + { + "epoch": 0.2840154640912439, + "grad_norm": 0.8203125, + "learning_rate": 0.00018501175869137953, + "loss": 1.0425, + "step": 11061 + }, + { + "epoch": 0.28404114128716573, + "grad_norm": 0.9140625, + "learning_rate": 0.0001850094077897313, + "loss": 0.9423, + "step": 11062 + }, + { + "epoch": 0.28406681848308757, + "grad_norm": 0.7265625, + "learning_rate": 0.00018500705671866695, + "loss": 0.8829, + "step": 11063 + }, + { + "epoch": 0.28409249567900935, + "grad_norm": 0.83984375, + "learning_rate": 0.00018500470547819125, + "loss": 1.011, + "step": 11064 + }, + { + "epoch": 0.2841181728749312, + "grad_norm": 0.8359375, + "learning_rate": 0.00018500235406830885, + "loss": 0.9631, + "step": 11065 + }, + { + "epoch": 0.284143850070853, + "grad_norm": 0.8125, + "learning_rate": 0.00018500000248902442, + "loss": 0.8873, + "step": 11066 + }, + { + "epoch": 0.28416952726677486, + "grad_norm": 0.7578125, + "learning_rate": 0.00018499765074034267, + "loss": 0.9266, + "step": 11067 + }, + { + "epoch": 0.28419520446269664, + "grad_norm": 0.73828125, + "learning_rate": 0.00018499529882226828, + "loss": 0.8926, + "step": 11068 + }, + { + "epoch": 0.2842208816586185, + "grad_norm": 0.7890625, + "learning_rate": 0.00018499294673480598, + "loss": 1.1132, + "step": 11069 + }, + { + "epoch": 0.2842465588545403, + "grad_norm": 0.76953125, + "learning_rate": 0.00018499059447796038, + "loss": 0.9794, + "step": 11070 + }, + { + "epoch": 0.2842722360504621, + "grad_norm": 0.77734375, + "learning_rate": 0.00018498824205173618, + "loss": 1.0286, + "step": 11071 + }, + { + "epoch": 0.2842979132463839, + "grad_norm": 0.765625, + "learning_rate": 0.00018498588945613811, + "loss": 0.9564, + "step": 11072 + }, + { + "epoch": 0.28432359044230576, + "grad_norm": 0.7734375, + "learning_rate": 0.00018498353669117084, + "loss": 1.0071, + "step": 11073 + }, + { + "epoch": 0.28434926763822754, + "grad_norm": 0.7734375, + "learning_rate": 0.00018498118375683906, + "loss": 0.8916, + "step": 11074 + }, + { + "epoch": 0.2843749448341494, + "grad_norm": 0.796875, + "learning_rate": 0.00018497883065314745, + "loss": 0.9372, + "step": 11075 + }, + { + "epoch": 0.2844006220300712, + "grad_norm": 0.83203125, + "learning_rate": 0.00018497647738010073, + "loss": 1.0056, + "step": 11076 + }, + { + "epoch": 0.28442629922599305, + "grad_norm": 0.82421875, + "learning_rate": 0.00018497412393770355, + "loss": 1.0038, + "step": 11077 + }, + { + "epoch": 0.28445197642191483, + "grad_norm": 0.82421875, + "learning_rate": 0.00018497177032596057, + "loss": 1.032, + "step": 11078 + }, + { + "epoch": 0.28447765361783667, + "grad_norm": 0.78125, + "learning_rate": 0.00018496941654487658, + "loss": 1.0595, + "step": 11079 + }, + { + "epoch": 0.2845033308137585, + "grad_norm": 0.78125, + "learning_rate": 0.00018496706259445622, + "loss": 1.0004, + "step": 11080 + }, + { + "epoch": 0.2845290080096803, + "grad_norm": 0.73046875, + "learning_rate": 0.00018496470847470417, + "loss": 1.0502, + "step": 11081 + }, + { + "epoch": 0.2845546852056021, + "grad_norm": 0.8203125, + "learning_rate": 0.00018496235418562513, + "loss": 0.9811, + "step": 11082 + }, + { + "epoch": 0.28458036240152396, + "grad_norm": 0.78515625, + "learning_rate": 0.0001849599997272238, + "loss": 0.8694, + "step": 11083 + }, + { + "epoch": 0.28460603959744574, + "grad_norm": 0.71484375, + "learning_rate": 0.00018495764509950485, + "loss": 0.8764, + "step": 11084 + }, + { + "epoch": 0.2846317167933676, + "grad_norm": 0.80859375, + "learning_rate": 0.00018495529030247297, + "loss": 0.9594, + "step": 11085 + }, + { + "epoch": 0.2846573939892894, + "grad_norm": 0.8515625, + "learning_rate": 0.00018495293533613293, + "loss": 0.9906, + "step": 11086 + }, + { + "epoch": 0.28468307118521124, + "grad_norm": 0.8359375, + "learning_rate": 0.00018495058020048932, + "loss": 1.1128, + "step": 11087 + }, + { + "epoch": 0.284708748381133, + "grad_norm": 0.75390625, + "learning_rate": 0.0001849482248955469, + "loss": 1.0004, + "step": 11088 + }, + { + "epoch": 0.28473442557705486, + "grad_norm": 0.796875, + "learning_rate": 0.0001849458694213103, + "loss": 1.1046, + "step": 11089 + }, + { + "epoch": 0.2847601027729767, + "grad_norm": 0.80859375, + "learning_rate": 0.00018494351377778428, + "loss": 1.0283, + "step": 11090 + }, + { + "epoch": 0.2847857799688985, + "grad_norm": 0.8984375, + "learning_rate": 0.00018494115796497353, + "loss": 1.1147, + "step": 11091 + }, + { + "epoch": 0.2848114571648203, + "grad_norm": 0.74609375, + "learning_rate": 0.0001849388019828827, + "loss": 0.8648, + "step": 11092 + }, + { + "epoch": 0.28483713436074215, + "grad_norm": 0.8203125, + "learning_rate": 0.00018493644583151653, + "loss": 1.1409, + "step": 11093 + }, + { + "epoch": 0.28486281155666393, + "grad_norm": 0.8125, + "learning_rate": 0.00018493408951087968, + "loss": 0.9487, + "step": 11094 + }, + { + "epoch": 0.28488848875258577, + "grad_norm": 0.71484375, + "learning_rate": 0.00018493173302097687, + "loss": 0.8628, + "step": 11095 + }, + { + "epoch": 0.2849141659485076, + "grad_norm": 0.75390625, + "learning_rate": 0.00018492937636181277, + "loss": 1.0118, + "step": 11096 + }, + { + "epoch": 0.28493984314442944, + "grad_norm": 0.7265625, + "learning_rate": 0.00018492701953339212, + "loss": 0.9366, + "step": 11097 + }, + { + "epoch": 0.2849655203403512, + "grad_norm": 0.8046875, + "learning_rate": 0.00018492466253571956, + "loss": 0.9999, + "step": 11098 + }, + { + "epoch": 0.28499119753627306, + "grad_norm": 0.76953125, + "learning_rate": 0.00018492230536879984, + "loss": 0.9218, + "step": 11099 + }, + { + "epoch": 0.2850168747321949, + "grad_norm": 0.78515625, + "learning_rate": 0.00018491994803263763, + "loss": 0.9106, + "step": 11100 + }, + { + "epoch": 0.28504255192811667, + "grad_norm": 0.75, + "learning_rate": 0.00018491759052723764, + "loss": 0.9124, + "step": 11101 + }, + { + "epoch": 0.2850682291240385, + "grad_norm": 0.80078125, + "learning_rate": 0.00018491523285260456, + "loss": 0.9439, + "step": 11102 + }, + { + "epoch": 0.28509390631996034, + "grad_norm": 0.78515625, + "learning_rate": 0.00018491287500874308, + "loss": 1.0362, + "step": 11103 + }, + { + "epoch": 0.2851195835158821, + "grad_norm": 0.8359375, + "learning_rate": 0.00018491051699565794, + "loss": 0.8797, + "step": 11104 + }, + { + "epoch": 0.28514526071180396, + "grad_norm": 0.89453125, + "learning_rate": 0.00018490815881335378, + "loss": 0.915, + "step": 11105 + }, + { + "epoch": 0.2851709379077258, + "grad_norm": 0.8359375, + "learning_rate": 0.00018490580046183534, + "loss": 0.9279, + "step": 11106 + }, + { + "epoch": 0.28519661510364763, + "grad_norm": 0.8203125, + "learning_rate": 0.00018490344194110731, + "loss": 0.9949, + "step": 11107 + }, + { + "epoch": 0.2852222922995694, + "grad_norm": 0.8828125, + "learning_rate": 0.00018490108325117438, + "loss": 0.9633, + "step": 11108 + }, + { + "epoch": 0.28524796949549125, + "grad_norm": 0.796875, + "learning_rate": 0.0001848987243920413, + "loss": 1.0671, + "step": 11109 + }, + { + "epoch": 0.2852736466914131, + "grad_norm": 0.78515625, + "learning_rate": 0.00018489636536371269, + "loss": 1.0018, + "step": 11110 + }, + { + "epoch": 0.28529932388733487, + "grad_norm": 0.75390625, + "learning_rate": 0.0001848940061661933, + "loss": 1.0117, + "step": 11111 + }, + { + "epoch": 0.2853250010832567, + "grad_norm": 0.72265625, + "learning_rate": 0.00018489164679948782, + "loss": 1.0017, + "step": 11112 + }, + { + "epoch": 0.28535067827917854, + "grad_norm": 0.80078125, + "learning_rate": 0.00018488928726360097, + "loss": 0.9443, + "step": 11113 + }, + { + "epoch": 0.2853763554751003, + "grad_norm": 0.734375, + "learning_rate": 0.00018488692755853743, + "loss": 0.8385, + "step": 11114 + }, + { + "epoch": 0.28540203267102215, + "grad_norm": 0.84375, + "learning_rate": 0.00018488456768430192, + "loss": 1.0293, + "step": 11115 + }, + { + "epoch": 0.285427709866944, + "grad_norm": 0.83984375, + "learning_rate": 0.00018488220764089914, + "loss": 0.9184, + "step": 11116 + }, + { + "epoch": 0.2854533870628658, + "grad_norm": 0.765625, + "learning_rate": 0.00018487984742833374, + "loss": 0.9589, + "step": 11117 + }, + { + "epoch": 0.2854790642587876, + "grad_norm": 0.7421875, + "learning_rate": 0.00018487748704661052, + "loss": 0.9064, + "step": 11118 + }, + { + "epoch": 0.28550474145470944, + "grad_norm": 0.796875, + "learning_rate": 0.00018487512649573414, + "loss": 0.9461, + "step": 11119 + }, + { + "epoch": 0.2855304186506313, + "grad_norm": 1.2734375, + "learning_rate": 0.00018487276577570927, + "loss": 0.9069, + "step": 11120 + }, + { + "epoch": 0.28555609584655306, + "grad_norm": 0.81640625, + "learning_rate": 0.00018487040488654067, + "loss": 0.9591, + "step": 11121 + }, + { + "epoch": 0.2855817730424749, + "grad_norm": 0.859375, + "learning_rate": 0.000184868043828233, + "loss": 1.1077, + "step": 11122 + }, + { + "epoch": 0.28560745023839673, + "grad_norm": 0.78515625, + "learning_rate": 0.00018486568260079098, + "loss": 0.9468, + "step": 11123 + }, + { + "epoch": 0.2856331274343185, + "grad_norm": 0.8046875, + "learning_rate": 0.00018486332120421932, + "loss": 1.0002, + "step": 11124 + }, + { + "epoch": 0.28565880463024035, + "grad_norm": 0.66796875, + "learning_rate": 0.00018486095963852274, + "loss": 0.7307, + "step": 11125 + }, + { + "epoch": 0.2856844818261622, + "grad_norm": 0.81640625, + "learning_rate": 0.00018485859790370594, + "loss": 1.0291, + "step": 11126 + }, + { + "epoch": 0.285710159022084, + "grad_norm": 0.7109375, + "learning_rate": 0.0001848562359997736, + "loss": 0.9707, + "step": 11127 + }, + { + "epoch": 0.2857358362180058, + "grad_norm": 0.9296875, + "learning_rate": 0.00018485387392673045, + "loss": 0.9705, + "step": 11128 + }, + { + "epoch": 0.28576151341392764, + "grad_norm": 0.77734375, + "learning_rate": 0.00018485151168458122, + "loss": 1.0663, + "step": 11129 + }, + { + "epoch": 0.2857871906098495, + "grad_norm": 0.703125, + "learning_rate": 0.00018484914927333054, + "loss": 0.9632, + "step": 11130 + }, + { + "epoch": 0.28581286780577125, + "grad_norm": 0.80078125, + "learning_rate": 0.00018484678669298323, + "loss": 0.9634, + "step": 11131 + }, + { + "epoch": 0.2858385450016931, + "grad_norm": 0.7578125, + "learning_rate": 0.00018484442394354388, + "loss": 0.9218, + "step": 11132 + }, + { + "epoch": 0.2858642221976149, + "grad_norm": 0.83984375, + "learning_rate": 0.00018484206102501732, + "loss": 1.0834, + "step": 11133 + }, + { + "epoch": 0.2858898993935367, + "grad_norm": 0.765625, + "learning_rate": 0.00018483969793740815, + "loss": 0.9392, + "step": 11134 + }, + { + "epoch": 0.28591557658945854, + "grad_norm": 2.234375, + "learning_rate": 0.00018483733468072116, + "loss": 1.002, + "step": 11135 + }, + { + "epoch": 0.2859412537853804, + "grad_norm": 0.84375, + "learning_rate": 0.000184834971254961, + "loss": 0.9483, + "step": 11136 + }, + { + "epoch": 0.2859669309813022, + "grad_norm": 0.875, + "learning_rate": 0.0001848326076601324, + "loss": 1.1301, + "step": 11137 + }, + { + "epoch": 0.285992608177224, + "grad_norm": 0.78515625, + "learning_rate": 0.0001848302438962401, + "loss": 0.9657, + "step": 11138 + }, + { + "epoch": 0.28601828537314583, + "grad_norm": 0.85546875, + "learning_rate": 0.00018482787996328877, + "loss": 0.9577, + "step": 11139 + }, + { + "epoch": 0.28604396256906767, + "grad_norm": 0.8671875, + "learning_rate": 0.0001848255158612831, + "loss": 1.1667, + "step": 11140 + }, + { + "epoch": 0.28606963976498945, + "grad_norm": 0.8125, + "learning_rate": 0.0001848231515902279, + "loss": 0.9775, + "step": 11141 + }, + { + "epoch": 0.2860953169609113, + "grad_norm": 0.73828125, + "learning_rate": 0.00018482078715012778, + "loss": 0.9878, + "step": 11142 + }, + { + "epoch": 0.2861209941568331, + "grad_norm": 0.77734375, + "learning_rate": 0.0001848184225409875, + "loss": 0.937, + "step": 11143 + }, + { + "epoch": 0.2861466713527549, + "grad_norm": 0.86328125, + "learning_rate": 0.00018481605776281181, + "loss": 1.0771, + "step": 11144 + }, + { + "epoch": 0.28617234854867674, + "grad_norm": 0.8359375, + "learning_rate": 0.00018481369281560535, + "loss": 1.0191, + "step": 11145 + }, + { + "epoch": 0.28619802574459857, + "grad_norm": 0.74609375, + "learning_rate": 0.00018481132769937285, + "loss": 1.0498, + "step": 11146 + }, + { + "epoch": 0.2862237029405204, + "grad_norm": 0.81640625, + "learning_rate": 0.00018480896241411903, + "loss": 1.0238, + "step": 11147 + }, + { + "epoch": 0.2862493801364422, + "grad_norm": 0.84375, + "learning_rate": 0.00018480659695984864, + "loss": 1.1779, + "step": 11148 + }, + { + "epoch": 0.286275057332364, + "grad_norm": 0.81640625, + "learning_rate": 0.00018480423133656633, + "loss": 0.9537, + "step": 11149 + }, + { + "epoch": 0.28630073452828586, + "grad_norm": 0.70703125, + "learning_rate": 0.00018480186554427688, + "loss": 1.0086, + "step": 11150 + }, + { + "epoch": 0.28632641172420764, + "grad_norm": 0.8125, + "learning_rate": 0.00018479949958298493, + "loss": 0.9998, + "step": 11151 + }, + { + "epoch": 0.2863520889201295, + "grad_norm": 0.73828125, + "learning_rate": 0.00018479713345269526, + "loss": 0.9785, + "step": 11152 + }, + { + "epoch": 0.2863777661160513, + "grad_norm": 0.765625, + "learning_rate": 0.00018479476715341258, + "loss": 1.0665, + "step": 11153 + }, + { + "epoch": 0.2864034433119731, + "grad_norm": 0.79296875, + "learning_rate": 0.00018479240068514156, + "loss": 0.9659, + "step": 11154 + }, + { + "epoch": 0.28642912050789493, + "grad_norm": 0.84375, + "learning_rate": 0.00018479003404788698, + "loss": 0.9756, + "step": 11155 + }, + { + "epoch": 0.28645479770381677, + "grad_norm": 0.82421875, + "learning_rate": 0.00018478766724165348, + "loss": 1.0527, + "step": 11156 + }, + { + "epoch": 0.28648047489973855, + "grad_norm": 0.828125, + "learning_rate": 0.00018478530026644582, + "loss": 1.0886, + "step": 11157 + }, + { + "epoch": 0.2865061520956604, + "grad_norm": 0.81640625, + "learning_rate": 0.00018478293312226875, + "loss": 0.9172, + "step": 11158 + }, + { + "epoch": 0.2865318292915822, + "grad_norm": 0.8046875, + "learning_rate": 0.00018478056580912693, + "loss": 1.0664, + "step": 11159 + }, + { + "epoch": 0.28655750648750405, + "grad_norm": 0.79296875, + "learning_rate": 0.0001847781983270251, + "loss": 0.8942, + "step": 11160 + }, + { + "epoch": 0.28658318368342584, + "grad_norm": 0.80078125, + "learning_rate": 0.00018477583067596798, + "loss": 1.0095, + "step": 11161 + }, + { + "epoch": 0.28660886087934767, + "grad_norm": 0.796875, + "learning_rate": 0.00018477346285596032, + "loss": 1.1013, + "step": 11162 + }, + { + "epoch": 0.2866345380752695, + "grad_norm": 0.859375, + "learning_rate": 0.00018477109486700677, + "loss": 1.0942, + "step": 11163 + }, + { + "epoch": 0.2866602152711913, + "grad_norm": 0.80859375, + "learning_rate": 0.0001847687267091121, + "loss": 0.9627, + "step": 11164 + }, + { + "epoch": 0.2866858924671131, + "grad_norm": 0.75, + "learning_rate": 0.000184766358382281, + "loss": 0.8965, + "step": 11165 + }, + { + "epoch": 0.28671156966303496, + "grad_norm": 0.7890625, + "learning_rate": 0.0001847639898865182, + "loss": 1.0062, + "step": 11166 + }, + { + "epoch": 0.28673724685895674, + "grad_norm": 0.76171875, + "learning_rate": 0.00018476162122182843, + "loss": 0.9509, + "step": 11167 + }, + { + "epoch": 0.2867629240548786, + "grad_norm": 0.76171875, + "learning_rate": 0.00018475925238821642, + "loss": 0.9004, + "step": 11168 + }, + { + "epoch": 0.2867886012508004, + "grad_norm": 0.828125, + "learning_rate": 0.00018475688338568687, + "loss": 0.9022, + "step": 11169 + }, + { + "epoch": 0.28681427844672225, + "grad_norm": 0.8046875, + "learning_rate": 0.00018475451421424448, + "loss": 1.0231, + "step": 11170 + }, + { + "epoch": 0.28683995564264403, + "grad_norm": 0.78125, + "learning_rate": 0.00018475214487389402, + "loss": 0.8722, + "step": 11171 + }, + { + "epoch": 0.28686563283856586, + "grad_norm": 0.75, + "learning_rate": 0.0001847497753646402, + "loss": 0.9206, + "step": 11172 + }, + { + "epoch": 0.2868913100344877, + "grad_norm": 0.734375, + "learning_rate": 0.00018474740568648774, + "loss": 0.9223, + "step": 11173 + }, + { + "epoch": 0.2869169872304095, + "grad_norm": 0.80859375, + "learning_rate": 0.00018474503583944133, + "loss": 1.0397, + "step": 11174 + }, + { + "epoch": 0.2869426644263313, + "grad_norm": 0.859375, + "learning_rate": 0.00018474266582350573, + "loss": 1.0169, + "step": 11175 + }, + { + "epoch": 0.28696834162225315, + "grad_norm": 0.8125, + "learning_rate": 0.00018474029563868567, + "loss": 1.0215, + "step": 11176 + }, + { + "epoch": 0.28699401881817493, + "grad_norm": 0.875, + "learning_rate": 0.00018473792528498584, + "loss": 1.0023, + "step": 11177 + }, + { + "epoch": 0.28701969601409677, + "grad_norm": 0.86328125, + "learning_rate": 0.00018473555476241098, + "loss": 0.9694, + "step": 11178 + }, + { + "epoch": 0.2870453732100186, + "grad_norm": 0.80078125, + "learning_rate": 0.0001847331840709658, + "loss": 1.0422, + "step": 11179 + }, + { + "epoch": 0.28707105040594044, + "grad_norm": 0.828125, + "learning_rate": 0.00018473081321065507, + "loss": 1.0229, + "step": 11180 + }, + { + "epoch": 0.2870967276018622, + "grad_norm": 0.81640625, + "learning_rate": 0.00018472844218148346, + "loss": 1.0093, + "step": 11181 + }, + { + "epoch": 0.28712240479778406, + "grad_norm": 0.8359375, + "learning_rate": 0.0001847260709834557, + "loss": 0.8936, + "step": 11182 + }, + { + "epoch": 0.2871480819937059, + "grad_norm": 0.85546875, + "learning_rate": 0.00018472369961657656, + "loss": 0.9329, + "step": 11183 + }, + { + "epoch": 0.2871737591896277, + "grad_norm": 0.76953125, + "learning_rate": 0.00018472132808085073, + "loss": 0.9423, + "step": 11184 + }, + { + "epoch": 0.2871994363855495, + "grad_norm": 0.77734375, + "learning_rate": 0.00018471895637628297, + "loss": 0.9182, + "step": 11185 + }, + { + "epoch": 0.28722511358147135, + "grad_norm": 0.87890625, + "learning_rate": 0.00018471658450287795, + "loss": 1.0883, + "step": 11186 + }, + { + "epoch": 0.28725079077739313, + "grad_norm": 0.7578125, + "learning_rate": 0.00018471421246064044, + "loss": 1.067, + "step": 11187 + }, + { + "epoch": 0.28727646797331496, + "grad_norm": 0.86328125, + "learning_rate": 0.0001847118402495752, + "loss": 1.0082, + "step": 11188 + }, + { + "epoch": 0.2873021451692368, + "grad_norm": 0.88671875, + "learning_rate": 0.00018470946786968684, + "loss": 0.9668, + "step": 11189 + }, + { + "epoch": 0.28732782236515864, + "grad_norm": 0.8203125, + "learning_rate": 0.0001847070953209802, + "loss": 0.8911, + "step": 11190 + }, + { + "epoch": 0.2873534995610804, + "grad_norm": 0.78515625, + "learning_rate": 0.00018470472260345998, + "loss": 0.9682, + "step": 11191 + }, + { + "epoch": 0.28737917675700225, + "grad_norm": 0.79296875, + "learning_rate": 0.00018470234971713087, + "loss": 1.0793, + "step": 11192 + }, + { + "epoch": 0.2874048539529241, + "grad_norm": 0.81640625, + "learning_rate": 0.00018469997666199764, + "loss": 0.939, + "step": 11193 + }, + { + "epoch": 0.28743053114884587, + "grad_norm": 0.84375, + "learning_rate": 0.000184697603438065, + "loss": 0.9661, + "step": 11194 + }, + { + "epoch": 0.2874562083447677, + "grad_norm": 0.8046875, + "learning_rate": 0.00018469523004533773, + "loss": 1.0392, + "step": 11195 + }, + { + "epoch": 0.28748188554068954, + "grad_norm": 0.76171875, + "learning_rate": 0.0001846928564838205, + "loss": 1.0565, + "step": 11196 + }, + { + "epoch": 0.2875075627366113, + "grad_norm": 0.78125, + "learning_rate": 0.00018469048275351804, + "loss": 1.0059, + "step": 11197 + }, + { + "epoch": 0.28753323993253316, + "grad_norm": 0.7734375, + "learning_rate": 0.0001846881088544351, + "loss": 0.8981, + "step": 11198 + }, + { + "epoch": 0.287558917128455, + "grad_norm": 0.74609375, + "learning_rate": 0.0001846857347865764, + "loss": 0.893, + "step": 11199 + }, + { + "epoch": 0.28758459432437683, + "grad_norm": 0.76953125, + "learning_rate": 0.0001846833605499467, + "loss": 0.9801, + "step": 11200 + }, + { + "epoch": 0.2876102715202986, + "grad_norm": 0.81640625, + "learning_rate": 0.0001846809861445507, + "loss": 0.977, + "step": 11201 + }, + { + "epoch": 0.28763594871622045, + "grad_norm": 0.80078125, + "learning_rate": 0.00018467861157039312, + "loss": 0.8504, + "step": 11202 + }, + { + "epoch": 0.2876616259121423, + "grad_norm": 0.765625, + "learning_rate": 0.00018467623682747873, + "loss": 0.963, + "step": 11203 + }, + { + "epoch": 0.28768730310806406, + "grad_norm": 0.73828125, + "learning_rate": 0.0001846738619158123, + "loss": 0.8752, + "step": 11204 + }, + { + "epoch": 0.2877129803039859, + "grad_norm": 0.87109375, + "learning_rate": 0.00018467148683539846, + "loss": 1.0267, + "step": 11205 + }, + { + "epoch": 0.28773865749990774, + "grad_norm": 0.79296875, + "learning_rate": 0.000184669111586242, + "loss": 1.0125, + "step": 11206 + }, + { + "epoch": 0.2877643346958295, + "grad_norm": 0.8046875, + "learning_rate": 0.00018466673616834766, + "loss": 0.9137, + "step": 11207 + }, + { + "epoch": 0.28779001189175135, + "grad_norm": 0.83984375, + "learning_rate": 0.00018466436058172016, + "loss": 1.101, + "step": 11208 + }, + { + "epoch": 0.2878156890876732, + "grad_norm": 0.78515625, + "learning_rate": 0.00018466198482636423, + "loss": 1.0279, + "step": 11209 + }, + { + "epoch": 0.287841366283595, + "grad_norm": 0.71484375, + "learning_rate": 0.00018465960890228463, + "loss": 0.8754, + "step": 11210 + }, + { + "epoch": 0.2878670434795168, + "grad_norm": 0.875, + "learning_rate": 0.00018465723280948606, + "loss": 1.1336, + "step": 11211 + }, + { + "epoch": 0.28789272067543864, + "grad_norm": 0.8125, + "learning_rate": 0.00018465485654797328, + "loss": 1.099, + "step": 11212 + }, + { + "epoch": 0.2879183978713605, + "grad_norm": 0.78515625, + "learning_rate": 0.000184652480117751, + "loss": 0.8823, + "step": 11213 + }, + { + "epoch": 0.28794407506728226, + "grad_norm": 0.76171875, + "learning_rate": 0.00018465010351882398, + "loss": 0.8842, + "step": 11214 + }, + { + "epoch": 0.2879697522632041, + "grad_norm": 0.71484375, + "learning_rate": 0.00018464772675119698, + "loss": 0.9244, + "step": 11215 + }, + { + "epoch": 0.28799542945912593, + "grad_norm": 1.0, + "learning_rate": 0.00018464534981487468, + "loss": 1.038, + "step": 11216 + }, + { + "epoch": 0.2880211066550477, + "grad_norm": 0.828125, + "learning_rate": 0.00018464297270986185, + "loss": 0.9274, + "step": 11217 + }, + { + "epoch": 0.28804678385096955, + "grad_norm": 0.8515625, + "learning_rate": 0.0001846405954361632, + "loss": 1.0016, + "step": 11218 + }, + { + "epoch": 0.2880724610468914, + "grad_norm": 0.83203125, + "learning_rate": 0.00018463821799378352, + "loss": 1.1263, + "step": 11219 + }, + { + "epoch": 0.2880981382428132, + "grad_norm": 0.8046875, + "learning_rate": 0.0001846358403827275, + "loss": 0.9781, + "step": 11220 + }, + { + "epoch": 0.288123815438735, + "grad_norm": 0.87890625, + "learning_rate": 0.00018463346260299993, + "loss": 1.038, + "step": 11221 + }, + { + "epoch": 0.28814949263465683, + "grad_norm": 0.79296875, + "learning_rate": 0.00018463108465460547, + "loss": 0.9479, + "step": 11222 + }, + { + "epoch": 0.28817516983057867, + "grad_norm": 0.828125, + "learning_rate": 0.00018462870653754892, + "loss": 1.0517, + "step": 11223 + }, + { + "epoch": 0.28820084702650045, + "grad_norm": 0.75390625, + "learning_rate": 0.000184626328251835, + "loss": 1.0445, + "step": 11224 + }, + { + "epoch": 0.2882265242224223, + "grad_norm": 0.765625, + "learning_rate": 0.00018462394979746846, + "loss": 1.0725, + "step": 11225 + }, + { + "epoch": 0.2882522014183441, + "grad_norm": 0.73828125, + "learning_rate": 0.00018462157117445403, + "loss": 1.0321, + "step": 11226 + }, + { + "epoch": 0.2882778786142659, + "grad_norm": 0.79296875, + "learning_rate": 0.00018461919238279642, + "loss": 1.0368, + "step": 11227 + }, + { + "epoch": 0.28830355581018774, + "grad_norm": 0.828125, + "learning_rate": 0.00018461681342250044, + "loss": 1.0727, + "step": 11228 + }, + { + "epoch": 0.2883292330061096, + "grad_norm": 0.765625, + "learning_rate": 0.00018461443429357077, + "loss": 0.8902, + "step": 11229 + }, + { + "epoch": 0.2883549102020314, + "grad_norm": 0.828125, + "learning_rate": 0.0001846120549960122, + "loss": 1.0177, + "step": 11230 + }, + { + "epoch": 0.2883805873979532, + "grad_norm": 0.80859375, + "learning_rate": 0.00018460967552982944, + "loss": 0.896, + "step": 11231 + }, + { + "epoch": 0.28840626459387503, + "grad_norm": 0.84375, + "learning_rate": 0.00018460729589502722, + "loss": 1.0843, + "step": 11232 + }, + { + "epoch": 0.28843194178979686, + "grad_norm": 0.7890625, + "learning_rate": 0.00018460491609161034, + "loss": 0.9703, + "step": 11233 + }, + { + "epoch": 0.28845761898571864, + "grad_norm": 0.76953125, + "learning_rate": 0.00018460253611958348, + "loss": 1.0202, + "step": 11234 + }, + { + "epoch": 0.2884832961816405, + "grad_norm": 0.80859375, + "learning_rate": 0.00018460015597895142, + "loss": 0.9104, + "step": 11235 + }, + { + "epoch": 0.2885089733775623, + "grad_norm": 0.8125, + "learning_rate": 0.00018459777566971885, + "loss": 0.8703, + "step": 11236 + }, + { + "epoch": 0.2885346505734841, + "grad_norm": 0.734375, + "learning_rate": 0.0001845953951918906, + "loss": 0.9433, + "step": 11237 + }, + { + "epoch": 0.28856032776940593, + "grad_norm": 0.734375, + "learning_rate": 0.00018459301454547137, + "loss": 0.988, + "step": 11238 + }, + { + "epoch": 0.28858600496532777, + "grad_norm": 0.6796875, + "learning_rate": 0.00018459063373046588, + "loss": 0.9159, + "step": 11239 + }, + { + "epoch": 0.2886116821612496, + "grad_norm": 0.81640625, + "learning_rate": 0.0001845882527468789, + "loss": 0.9833, + "step": 11240 + }, + { + "epoch": 0.2886373593571714, + "grad_norm": 0.7890625, + "learning_rate": 0.00018458587159471518, + "loss": 1.1052, + "step": 11241 + }, + { + "epoch": 0.2886630365530932, + "grad_norm": 0.74609375, + "learning_rate": 0.00018458349027397945, + "loss": 0.9298, + "step": 11242 + }, + { + "epoch": 0.28868871374901506, + "grad_norm": 0.796875, + "learning_rate": 0.00018458110878467648, + "loss": 0.9327, + "step": 11243 + }, + { + "epoch": 0.28871439094493684, + "grad_norm": 0.8671875, + "learning_rate": 0.00018457872712681099, + "loss": 1.1552, + "step": 11244 + }, + { + "epoch": 0.2887400681408587, + "grad_norm": 0.82421875, + "learning_rate": 0.00018457634530038775, + "loss": 1.0226, + "step": 11245 + }, + { + "epoch": 0.2887657453367805, + "grad_norm": 0.8671875, + "learning_rate": 0.00018457396330541148, + "loss": 1.1991, + "step": 11246 + }, + { + "epoch": 0.2887914225327023, + "grad_norm": 0.86328125, + "learning_rate": 0.00018457158114188694, + "loss": 1.043, + "step": 11247 + }, + { + "epoch": 0.2888170997286241, + "grad_norm": 0.81640625, + "learning_rate": 0.00018456919880981886, + "loss": 0.9525, + "step": 11248 + }, + { + "epoch": 0.28884277692454596, + "grad_norm": 0.75, + "learning_rate": 0.00018456681630921206, + "loss": 0.935, + "step": 11249 + }, + { + "epoch": 0.2888684541204678, + "grad_norm": 0.7578125, + "learning_rate": 0.0001845644336400712, + "loss": 1.0232, + "step": 11250 + }, + { + "epoch": 0.2888941313163896, + "grad_norm": 0.7734375, + "learning_rate": 0.00018456205080240106, + "loss": 0.996, + "step": 11251 + }, + { + "epoch": 0.2889198085123114, + "grad_norm": 0.78125, + "learning_rate": 0.0001845596677962064, + "loss": 0.9533, + "step": 11252 + }, + { + "epoch": 0.28894548570823325, + "grad_norm": 0.8046875, + "learning_rate": 0.00018455728462149194, + "loss": 0.9104, + "step": 11253 + }, + { + "epoch": 0.28897116290415503, + "grad_norm": 0.75, + "learning_rate": 0.00018455490127826247, + "loss": 0.8464, + "step": 11254 + }, + { + "epoch": 0.28899684010007687, + "grad_norm": 0.89453125, + "learning_rate": 0.0001845525177665227, + "loss": 1.1301, + "step": 11255 + }, + { + "epoch": 0.2890225172959987, + "grad_norm": 0.796875, + "learning_rate": 0.00018455013408627741, + "loss": 0.9698, + "step": 11256 + }, + { + "epoch": 0.2890481944919205, + "grad_norm": 0.859375, + "learning_rate": 0.00018454775023753138, + "loss": 0.9686, + "step": 11257 + }, + { + "epoch": 0.2890738716878423, + "grad_norm": 0.75390625, + "learning_rate": 0.00018454536622028927, + "loss": 0.9741, + "step": 11258 + }, + { + "epoch": 0.28909954888376416, + "grad_norm": 0.78125, + "learning_rate": 0.00018454298203455588, + "loss": 0.9294, + "step": 11259 + }, + { + "epoch": 0.289125226079686, + "grad_norm": 0.7578125, + "learning_rate": 0.00018454059768033601, + "loss": 0.9001, + "step": 11260 + }, + { + "epoch": 0.2891509032756078, + "grad_norm": 0.80859375, + "learning_rate": 0.00018453821315763433, + "loss": 0.9725, + "step": 11261 + }, + { + "epoch": 0.2891765804715296, + "grad_norm": 0.73828125, + "learning_rate": 0.00018453582846645564, + "loss": 0.9142, + "step": 11262 + }, + { + "epoch": 0.28920225766745145, + "grad_norm": 0.76953125, + "learning_rate": 0.00018453344360680468, + "loss": 1.0668, + "step": 11263 + }, + { + "epoch": 0.2892279348633732, + "grad_norm": 0.84375, + "learning_rate": 0.00018453105857868619, + "loss": 0.9822, + "step": 11264 + }, + { + "epoch": 0.28925361205929506, + "grad_norm": 0.78125, + "learning_rate": 0.00018452867338210495, + "loss": 0.9623, + "step": 11265 + }, + { + "epoch": 0.2892792892552169, + "grad_norm": 0.8046875, + "learning_rate": 0.0001845262880170657, + "loss": 1.076, + "step": 11266 + }, + { + "epoch": 0.2893049664511387, + "grad_norm": 0.74609375, + "learning_rate": 0.00018452390248357317, + "loss": 1.0164, + "step": 11267 + }, + { + "epoch": 0.2893306436470605, + "grad_norm": 0.76953125, + "learning_rate": 0.00018452151678163215, + "loss": 1.0045, + "step": 11268 + }, + { + "epoch": 0.28935632084298235, + "grad_norm": 0.828125, + "learning_rate": 0.00018451913091124738, + "loss": 1.1191, + "step": 11269 + }, + { + "epoch": 0.2893819980389042, + "grad_norm": 0.81640625, + "learning_rate": 0.00018451674487242364, + "loss": 1.0075, + "step": 11270 + }, + { + "epoch": 0.28940767523482597, + "grad_norm": 0.85546875, + "learning_rate": 0.00018451435866516564, + "loss": 0.8958, + "step": 11271 + }, + { + "epoch": 0.2894333524307478, + "grad_norm": 0.78125, + "learning_rate": 0.00018451197228947815, + "loss": 0.9031, + "step": 11272 + }, + { + "epoch": 0.28945902962666964, + "grad_norm": 0.8046875, + "learning_rate": 0.00018450958574536594, + "loss": 0.8772, + "step": 11273 + }, + { + "epoch": 0.2894847068225914, + "grad_norm": 0.8125, + "learning_rate": 0.00018450719903283377, + "loss": 0.8156, + "step": 11274 + }, + { + "epoch": 0.28951038401851326, + "grad_norm": 0.85546875, + "learning_rate": 0.00018450481215188637, + "loss": 1.0085, + "step": 11275 + }, + { + "epoch": 0.2895360612144351, + "grad_norm": 0.7421875, + "learning_rate": 0.0001845024251025285, + "loss": 1.033, + "step": 11276 + }, + { + "epoch": 0.2895617384103569, + "grad_norm": 0.7890625, + "learning_rate": 0.00018450003788476497, + "loss": 0.9354, + "step": 11277 + }, + { + "epoch": 0.2895874156062787, + "grad_norm": 0.80078125, + "learning_rate": 0.00018449765049860048, + "loss": 1.0098, + "step": 11278 + }, + { + "epoch": 0.28961309280220054, + "grad_norm": 0.75390625, + "learning_rate": 0.0001844952629440398, + "loss": 0.8291, + "step": 11279 + }, + { + "epoch": 0.2896387699981224, + "grad_norm": 0.77734375, + "learning_rate": 0.0001844928752210877, + "loss": 0.9777, + "step": 11280 + }, + { + "epoch": 0.28966444719404416, + "grad_norm": 0.7734375, + "learning_rate": 0.00018449048732974892, + "loss": 0.8723, + "step": 11281 + }, + { + "epoch": 0.289690124389966, + "grad_norm": 0.76171875, + "learning_rate": 0.00018448809927002825, + "loss": 0.8504, + "step": 11282 + }, + { + "epoch": 0.28971580158588783, + "grad_norm": 1.390625, + "learning_rate": 0.00018448571104193043, + "loss": 0.9256, + "step": 11283 + }, + { + "epoch": 0.2897414787818096, + "grad_norm": 0.83203125, + "learning_rate": 0.0001844833226454602, + "loss": 1.0378, + "step": 11284 + }, + { + "epoch": 0.28976715597773145, + "grad_norm": 0.875, + "learning_rate": 0.00018448093408062233, + "loss": 1.0179, + "step": 11285 + }, + { + "epoch": 0.2897928331736533, + "grad_norm": 0.765625, + "learning_rate": 0.00018447854534742164, + "loss": 0.9473, + "step": 11286 + }, + { + "epoch": 0.28981851036957507, + "grad_norm": 0.80078125, + "learning_rate": 0.0001844761564458628, + "loss": 0.937, + "step": 11287 + }, + { + "epoch": 0.2898441875654969, + "grad_norm": 0.75390625, + "learning_rate": 0.00018447376737595062, + "loss": 0.9573, + "step": 11288 + }, + { + "epoch": 0.28986986476141874, + "grad_norm": 0.73828125, + "learning_rate": 0.00018447137813768985, + "loss": 0.9965, + "step": 11289 + }, + { + "epoch": 0.2898955419573406, + "grad_norm": 0.7578125, + "learning_rate": 0.00018446898873108526, + "loss": 0.9032, + "step": 11290 + }, + { + "epoch": 0.28992121915326236, + "grad_norm": 0.80859375, + "learning_rate": 0.0001844665991561416, + "loss": 0.936, + "step": 11291 + }, + { + "epoch": 0.2899468963491842, + "grad_norm": 1.640625, + "learning_rate": 0.00018446420941286364, + "loss": 1.0897, + "step": 11292 + }, + { + "epoch": 0.289972573545106, + "grad_norm": 0.80078125, + "learning_rate": 0.00018446181950125613, + "loss": 0.8121, + "step": 11293 + }, + { + "epoch": 0.2899982507410278, + "grad_norm": 0.76953125, + "learning_rate": 0.00018445942942132385, + "loss": 1.057, + "step": 11294 + }, + { + "epoch": 0.29002392793694964, + "grad_norm": 0.765625, + "learning_rate": 0.00018445703917307156, + "loss": 0.8838, + "step": 11295 + }, + { + "epoch": 0.2900496051328715, + "grad_norm": 0.7578125, + "learning_rate": 0.00018445464875650402, + "loss": 1.0663, + "step": 11296 + }, + { + "epoch": 0.29007528232879326, + "grad_norm": 0.734375, + "learning_rate": 0.000184452258171626, + "loss": 0.9988, + "step": 11297 + }, + { + "epoch": 0.2901009595247151, + "grad_norm": 0.7890625, + "learning_rate": 0.00018444986741844225, + "loss": 1.0065, + "step": 11298 + }, + { + "epoch": 0.29012663672063693, + "grad_norm": 0.7734375, + "learning_rate": 0.00018444747649695752, + "loss": 0.9816, + "step": 11299 + }, + { + "epoch": 0.29015231391655877, + "grad_norm": 0.8828125, + "learning_rate": 0.00018444508540717665, + "loss": 1.098, + "step": 11300 + }, + { + "epoch": 0.29017799111248055, + "grad_norm": 0.86328125, + "learning_rate": 0.0001844426941491043, + "loss": 0.9151, + "step": 11301 + }, + { + "epoch": 0.2902036683084024, + "grad_norm": 0.86328125, + "learning_rate": 0.0001844403027227453, + "loss": 1.0008, + "step": 11302 + }, + { + "epoch": 0.2902293455043242, + "grad_norm": 0.76953125, + "learning_rate": 0.00018443791112810444, + "loss": 0.9516, + "step": 11303 + }, + { + "epoch": 0.290255022700246, + "grad_norm": 0.81640625, + "learning_rate": 0.00018443551936518643, + "loss": 1.1905, + "step": 11304 + }, + { + "epoch": 0.29028069989616784, + "grad_norm": 0.84765625, + "learning_rate": 0.00018443312743399604, + "loss": 1.0079, + "step": 11305 + }, + { + "epoch": 0.2903063770920897, + "grad_norm": 0.8359375, + "learning_rate": 0.00018443073533453807, + "loss": 1.0805, + "step": 11306 + }, + { + "epoch": 0.29033205428801145, + "grad_norm": 0.78125, + "learning_rate": 0.00018442834306681727, + "loss": 1.0564, + "step": 11307 + }, + { + "epoch": 0.2903577314839333, + "grad_norm": 0.734375, + "learning_rate": 0.0001844259506308384, + "loss": 0.9729, + "step": 11308 + }, + { + "epoch": 0.2903834086798551, + "grad_norm": 0.8515625, + "learning_rate": 0.00018442355802660623, + "loss": 1.0642, + "step": 11309 + }, + { + "epoch": 0.29040908587577696, + "grad_norm": 0.78515625, + "learning_rate": 0.00018442116525412553, + "loss": 0.9846, + "step": 11310 + }, + { + "epoch": 0.29043476307169874, + "grad_norm": 0.90234375, + "learning_rate": 0.00018441877231340113, + "loss": 0.8901, + "step": 11311 + }, + { + "epoch": 0.2904604402676206, + "grad_norm": 0.703125, + "learning_rate": 0.00018441637920443767, + "loss": 1.0249, + "step": 11312 + }, + { + "epoch": 0.2904861174635424, + "grad_norm": 0.80859375, + "learning_rate": 0.00018441398592724005, + "loss": 1.1697, + "step": 11313 + }, + { + "epoch": 0.2905117946594642, + "grad_norm": 1.421875, + "learning_rate": 0.00018441159248181295, + "loss": 1.1086, + "step": 11314 + }, + { + "epoch": 0.29053747185538603, + "grad_norm": 0.8359375, + "learning_rate": 0.00018440919886816116, + "loss": 0.9635, + "step": 11315 + }, + { + "epoch": 0.29056314905130787, + "grad_norm": 0.74609375, + "learning_rate": 0.0001844068050862895, + "loss": 0.9906, + "step": 11316 + }, + { + "epoch": 0.29058882624722965, + "grad_norm": 0.828125, + "learning_rate": 0.00018440441113620266, + "loss": 0.9877, + "step": 11317 + }, + { + "epoch": 0.2906145034431515, + "grad_norm": 0.88671875, + "learning_rate": 0.0001844020170179055, + "loss": 0.7911, + "step": 11318 + }, + { + "epoch": 0.2906401806390733, + "grad_norm": 0.78515625, + "learning_rate": 0.0001843996227314027, + "loss": 1.0059, + "step": 11319 + }, + { + "epoch": 0.29066585783499516, + "grad_norm": 0.7578125, + "learning_rate": 0.00018439722827669907, + "loss": 0.9644, + "step": 11320 + }, + { + "epoch": 0.29069153503091694, + "grad_norm": 0.76171875, + "learning_rate": 0.00018439483365379942, + "loss": 0.798, + "step": 11321 + }, + { + "epoch": 0.2907172122268388, + "grad_norm": 0.78515625, + "learning_rate": 0.00018439243886270848, + "loss": 1.0024, + "step": 11322 + }, + { + "epoch": 0.2907428894227606, + "grad_norm": 0.85546875, + "learning_rate": 0.00018439004390343104, + "loss": 1.0498, + "step": 11323 + }, + { + "epoch": 0.2907685666186824, + "grad_norm": 0.796875, + "learning_rate": 0.00018438764877597185, + "loss": 1.1122, + "step": 11324 + }, + { + "epoch": 0.2907942438146042, + "grad_norm": 0.8828125, + "learning_rate": 0.0001843852534803357, + "loss": 1.082, + "step": 11325 + }, + { + "epoch": 0.29081992101052606, + "grad_norm": 0.85546875, + "learning_rate": 0.0001843828580165274, + "loss": 1.0537, + "step": 11326 + }, + { + "epoch": 0.29084559820644784, + "grad_norm": 0.84375, + "learning_rate": 0.00018438046238455162, + "loss": 0.9958, + "step": 11327 + }, + { + "epoch": 0.2908712754023697, + "grad_norm": 0.7890625, + "learning_rate": 0.00018437806658441326, + "loss": 0.9549, + "step": 11328 + }, + { + "epoch": 0.2908969525982915, + "grad_norm": 0.84375, + "learning_rate": 0.000184375670616117, + "loss": 0.9134, + "step": 11329 + }, + { + "epoch": 0.29092262979421335, + "grad_norm": 0.82421875, + "learning_rate": 0.00018437327447966766, + "loss": 1.1241, + "step": 11330 + }, + { + "epoch": 0.29094830699013513, + "grad_norm": 0.78515625, + "learning_rate": 0.00018437087817507001, + "loss": 0.8985, + "step": 11331 + }, + { + "epoch": 0.29097398418605697, + "grad_norm": 0.80859375, + "learning_rate": 0.00018436848170232882, + "loss": 0.9616, + "step": 11332 + }, + { + "epoch": 0.2909996613819788, + "grad_norm": 1.78125, + "learning_rate": 0.00018436608506144887, + "loss": 0.9506, + "step": 11333 + }, + { + "epoch": 0.2910253385779006, + "grad_norm": 0.75, + "learning_rate": 0.0001843636882524349, + "loss": 0.9526, + "step": 11334 + }, + { + "epoch": 0.2910510157738224, + "grad_norm": 0.77734375, + "learning_rate": 0.00018436129127529175, + "loss": 0.9093, + "step": 11335 + }, + { + "epoch": 0.29107669296974426, + "grad_norm": 0.7578125, + "learning_rate": 0.0001843588941300242, + "loss": 0.9977, + "step": 11336 + }, + { + "epoch": 0.29110237016566604, + "grad_norm": 0.77734375, + "learning_rate": 0.00018435649681663692, + "loss": 1.0523, + "step": 11337 + }, + { + "epoch": 0.29112804736158787, + "grad_norm": 0.76171875, + "learning_rate": 0.0001843540993351348, + "loss": 0.8869, + "step": 11338 + }, + { + "epoch": 0.2911537245575097, + "grad_norm": 0.859375, + "learning_rate": 0.00018435170168552262, + "loss": 0.9154, + "step": 11339 + }, + { + "epoch": 0.29117940175343154, + "grad_norm": 0.7421875, + "learning_rate": 0.00018434930386780504, + "loss": 1.0212, + "step": 11340 + }, + { + "epoch": 0.2912050789493533, + "grad_norm": 0.8046875, + "learning_rate": 0.00018434690588198696, + "loss": 1.0545, + "step": 11341 + }, + { + "epoch": 0.29123075614527516, + "grad_norm": 0.79296875, + "learning_rate": 0.00018434450772807312, + "loss": 0.7989, + "step": 11342 + }, + { + "epoch": 0.291256433341197, + "grad_norm": 0.82421875, + "learning_rate": 0.00018434210940606826, + "loss": 1.2781, + "step": 11343 + }, + { + "epoch": 0.2912821105371188, + "grad_norm": 0.83984375, + "learning_rate": 0.00018433971091597723, + "loss": 0.8917, + "step": 11344 + }, + { + "epoch": 0.2913077877330406, + "grad_norm": 0.8125, + "learning_rate": 0.00018433731225780476, + "loss": 1.1552, + "step": 11345 + }, + { + "epoch": 0.29133346492896245, + "grad_norm": 0.72265625, + "learning_rate": 0.00018433491343155565, + "loss": 0.8915, + "step": 11346 + }, + { + "epoch": 0.29135914212488423, + "grad_norm": 0.81640625, + "learning_rate": 0.00018433251443723467, + "loss": 0.8414, + "step": 11347 + }, + { + "epoch": 0.29138481932080607, + "grad_norm": 0.7421875, + "learning_rate": 0.00018433011527484663, + "loss": 0.9701, + "step": 11348 + }, + { + "epoch": 0.2914104965167279, + "grad_norm": 0.75, + "learning_rate": 0.00018432771594439627, + "loss": 0.9378, + "step": 11349 + }, + { + "epoch": 0.29143617371264974, + "grad_norm": 0.765625, + "learning_rate": 0.00018432531644588837, + "loss": 1.0319, + "step": 11350 + }, + { + "epoch": 0.2914618509085715, + "grad_norm": 0.77734375, + "learning_rate": 0.00018432291677932775, + "loss": 1.0049, + "step": 11351 + }, + { + "epoch": 0.29148752810449335, + "grad_norm": 0.734375, + "learning_rate": 0.00018432051694471918, + "loss": 1.0344, + "step": 11352 + }, + { + "epoch": 0.2915132053004152, + "grad_norm": 0.77734375, + "learning_rate": 0.00018431811694206743, + "loss": 1.0672, + "step": 11353 + }, + { + "epoch": 0.29153888249633697, + "grad_norm": 0.76171875, + "learning_rate": 0.00018431571677137728, + "loss": 0.966, + "step": 11354 + }, + { + "epoch": 0.2915645596922588, + "grad_norm": 0.81640625, + "learning_rate": 0.00018431331643265354, + "loss": 1.0224, + "step": 11355 + }, + { + "epoch": 0.29159023688818064, + "grad_norm": 0.76953125, + "learning_rate": 0.00018431091592590096, + "loss": 0.9031, + "step": 11356 + }, + { + "epoch": 0.2916159140841024, + "grad_norm": 0.796875, + "learning_rate": 0.00018430851525112434, + "loss": 1.0666, + "step": 11357 + }, + { + "epoch": 0.29164159128002426, + "grad_norm": 0.79296875, + "learning_rate": 0.00018430611440832845, + "loss": 0.9093, + "step": 11358 + }, + { + "epoch": 0.2916672684759461, + "grad_norm": 0.7578125, + "learning_rate": 0.00018430371339751814, + "loss": 0.9648, + "step": 11359 + }, + { + "epoch": 0.2916929456718679, + "grad_norm": 0.78125, + "learning_rate": 0.0001843013122186981, + "loss": 1.0587, + "step": 11360 + }, + { + "epoch": 0.2917186228677897, + "grad_norm": 0.828125, + "learning_rate": 0.00018429891087187316, + "loss": 0.9436, + "step": 11361 + }, + { + "epoch": 0.29174430006371155, + "grad_norm": 0.7265625, + "learning_rate": 0.00018429650935704813, + "loss": 0.9182, + "step": 11362 + }, + { + "epoch": 0.2917699772596334, + "grad_norm": 0.76953125, + "learning_rate": 0.00018429410767422776, + "loss": 0.968, + "step": 11363 + }, + { + "epoch": 0.29179565445555516, + "grad_norm": 0.84375, + "learning_rate": 0.00018429170582341686, + "loss": 0.8433, + "step": 11364 + }, + { + "epoch": 0.291821331651477, + "grad_norm": 0.8984375, + "learning_rate": 0.00018428930380462018, + "loss": 0.9727, + "step": 11365 + }, + { + "epoch": 0.29184700884739884, + "grad_norm": 0.828125, + "learning_rate": 0.00018428690161784256, + "loss": 1.016, + "step": 11366 + }, + { + "epoch": 0.2918726860433206, + "grad_norm": 0.81640625, + "learning_rate": 0.00018428449926308874, + "loss": 0.9594, + "step": 11367 + }, + { + "epoch": 0.29189836323924245, + "grad_norm": 0.828125, + "learning_rate": 0.00018428209674036354, + "loss": 1.0356, + "step": 11368 + }, + { + "epoch": 0.2919240404351643, + "grad_norm": 0.6953125, + "learning_rate": 0.00018427969404967174, + "loss": 0.9281, + "step": 11369 + }, + { + "epoch": 0.29194971763108607, + "grad_norm": 0.75, + "learning_rate": 0.00018427729119101807, + "loss": 1.0192, + "step": 11370 + }, + { + "epoch": 0.2919753948270079, + "grad_norm": 0.8046875, + "learning_rate": 0.00018427488816440742, + "loss": 1.0622, + "step": 11371 + }, + { + "epoch": 0.29200107202292974, + "grad_norm": 0.83984375, + "learning_rate": 0.00018427248496984453, + "loss": 0.904, + "step": 11372 + }, + { + "epoch": 0.2920267492188516, + "grad_norm": 0.7734375, + "learning_rate": 0.00018427008160733417, + "loss": 0.8777, + "step": 11373 + }, + { + "epoch": 0.29205242641477336, + "grad_norm": 0.78125, + "learning_rate": 0.00018426767807688116, + "loss": 1.0688, + "step": 11374 + }, + { + "epoch": 0.2920781036106952, + "grad_norm": 0.78515625, + "learning_rate": 0.00018426527437849028, + "loss": 1.0021, + "step": 11375 + }, + { + "epoch": 0.29210378080661703, + "grad_norm": 0.80859375, + "learning_rate": 0.00018426287051216632, + "loss": 0.8625, + "step": 11376 + }, + { + "epoch": 0.2921294580025388, + "grad_norm": 0.77734375, + "learning_rate": 0.00018426046647791405, + "loss": 0.9802, + "step": 11377 + }, + { + "epoch": 0.29215513519846065, + "grad_norm": 0.84375, + "learning_rate": 0.00018425806227573833, + "loss": 0.9755, + "step": 11378 + }, + { + "epoch": 0.2921808123943825, + "grad_norm": 0.80078125, + "learning_rate": 0.00018425565790564384, + "loss": 0.9305, + "step": 11379 + }, + { + "epoch": 0.29220648959030426, + "grad_norm": 0.80859375, + "learning_rate": 0.0001842532533676355, + "loss": 0.8743, + "step": 11380 + }, + { + "epoch": 0.2922321667862261, + "grad_norm": 0.7578125, + "learning_rate": 0.000184250848661718, + "loss": 1.0871, + "step": 11381 + }, + { + "epoch": 0.29225784398214794, + "grad_norm": 0.79296875, + "learning_rate": 0.00018424844378789616, + "loss": 0.8893, + "step": 11382 + }, + { + "epoch": 0.29228352117806977, + "grad_norm": 0.80078125, + "learning_rate": 0.0001842460387461748, + "loss": 0.9909, + "step": 11383 + }, + { + "epoch": 0.29230919837399155, + "grad_norm": 0.72265625, + "learning_rate": 0.0001842436335365587, + "loss": 0.8441, + "step": 11384 + }, + { + "epoch": 0.2923348755699134, + "grad_norm": 0.8125, + "learning_rate": 0.00018424122815905263, + "loss": 0.9816, + "step": 11385 + }, + { + "epoch": 0.2923605527658352, + "grad_norm": 0.86328125, + "learning_rate": 0.0001842388226136614, + "loss": 1.0494, + "step": 11386 + }, + { + "epoch": 0.292386229961757, + "grad_norm": 0.94921875, + "learning_rate": 0.0001842364169003898, + "loss": 1.0824, + "step": 11387 + }, + { + "epoch": 0.29241190715767884, + "grad_norm": 0.859375, + "learning_rate": 0.00018423401101924267, + "loss": 1.0551, + "step": 11388 + }, + { + "epoch": 0.2924375843536007, + "grad_norm": 1.03125, + "learning_rate": 0.00018423160497022475, + "loss": 0.995, + "step": 11389 + }, + { + "epoch": 0.29246326154952246, + "grad_norm": 0.78515625, + "learning_rate": 0.00018422919875334083, + "loss": 1.0529, + "step": 11390 + }, + { + "epoch": 0.2924889387454443, + "grad_norm": 0.84765625, + "learning_rate": 0.0001842267923685957, + "loss": 1.1055, + "step": 11391 + }, + { + "epoch": 0.29251461594136613, + "grad_norm": 0.80078125, + "learning_rate": 0.00018422438581599425, + "loss": 0.9276, + "step": 11392 + }, + { + "epoch": 0.29254029313728797, + "grad_norm": 0.75390625, + "learning_rate": 0.00018422197909554114, + "loss": 0.9277, + "step": 11393 + }, + { + "epoch": 0.29256597033320975, + "grad_norm": 0.79296875, + "learning_rate": 0.00018421957220724129, + "loss": 0.7632, + "step": 11394 + }, + { + "epoch": 0.2925916475291316, + "grad_norm": 0.78515625, + "learning_rate": 0.00018421716515109942, + "loss": 0.9495, + "step": 11395 + }, + { + "epoch": 0.2926173247250534, + "grad_norm": 0.80859375, + "learning_rate": 0.00018421475792712032, + "loss": 0.8584, + "step": 11396 + }, + { + "epoch": 0.2926430019209752, + "grad_norm": 0.859375, + "learning_rate": 0.00018421235053530885, + "loss": 0.9168, + "step": 11397 + }, + { + "epoch": 0.29266867911689703, + "grad_norm": 0.79296875, + "learning_rate": 0.00018420994297566976, + "loss": 0.928, + "step": 11398 + }, + { + "epoch": 0.29269435631281887, + "grad_norm": 0.84765625, + "learning_rate": 0.00018420753524820785, + "loss": 0.8907, + "step": 11399 + }, + { + "epoch": 0.29272003350874065, + "grad_norm": 0.78515625, + "learning_rate": 0.00018420512735292796, + "loss": 0.8723, + "step": 11400 + }, + { + "epoch": 0.2927457107046625, + "grad_norm": 0.8515625, + "learning_rate": 0.00018420271928983484, + "loss": 0.8784, + "step": 11401 + }, + { + "epoch": 0.2927713879005843, + "grad_norm": 0.8046875, + "learning_rate": 0.00018420031105893328, + "loss": 1.006, + "step": 11402 + }, + { + "epoch": 0.29279706509650616, + "grad_norm": 0.75390625, + "learning_rate": 0.00018419790266022815, + "loss": 0.9155, + "step": 11403 + }, + { + "epoch": 0.29282274229242794, + "grad_norm": 0.79296875, + "learning_rate": 0.0001841954940937242, + "loss": 1.1546, + "step": 11404 + }, + { + "epoch": 0.2928484194883498, + "grad_norm": 1.0859375, + "learning_rate": 0.0001841930853594262, + "loss": 1.1254, + "step": 11405 + }, + { + "epoch": 0.2928740966842716, + "grad_norm": 0.84375, + "learning_rate": 0.000184190676457339, + "loss": 1.0282, + "step": 11406 + }, + { + "epoch": 0.2928997738801934, + "grad_norm": 0.875, + "learning_rate": 0.0001841882673874674, + "loss": 1.0496, + "step": 11407 + }, + { + "epoch": 0.29292545107611523, + "grad_norm": 0.8515625, + "learning_rate": 0.0001841858581498162, + "loss": 0.8594, + "step": 11408 + }, + { + "epoch": 0.29295112827203706, + "grad_norm": 0.7890625, + "learning_rate": 0.00018418344874439015, + "loss": 0.9611, + "step": 11409 + }, + { + "epoch": 0.29297680546795885, + "grad_norm": 0.7890625, + "learning_rate": 0.00018418103917119412, + "loss": 0.966, + "step": 11410 + }, + { + "epoch": 0.2930024826638807, + "grad_norm": 0.828125, + "learning_rate": 0.00018417862943023288, + "loss": 1.0956, + "step": 11411 + }, + { + "epoch": 0.2930281598598025, + "grad_norm": 0.83203125, + "learning_rate": 0.00018417621952151122, + "loss": 0.9774, + "step": 11412 + }, + { + "epoch": 0.29305383705572435, + "grad_norm": 1.84375, + "learning_rate": 0.00018417380944503396, + "loss": 0.985, + "step": 11413 + }, + { + "epoch": 0.29307951425164613, + "grad_norm": 0.7421875, + "learning_rate": 0.00018417139920080594, + "loss": 0.8535, + "step": 11414 + }, + { + "epoch": 0.29310519144756797, + "grad_norm": 0.74609375, + "learning_rate": 0.00018416898878883186, + "loss": 0.9422, + "step": 11415 + }, + { + "epoch": 0.2931308686434898, + "grad_norm": 0.8125, + "learning_rate": 0.0001841665782091166, + "loss": 0.911, + "step": 11416 + }, + { + "epoch": 0.2931565458394116, + "grad_norm": 0.72265625, + "learning_rate": 0.00018416416746166499, + "loss": 0.9845, + "step": 11417 + }, + { + "epoch": 0.2931822230353334, + "grad_norm": 0.80859375, + "learning_rate": 0.0001841617565464818, + "loss": 0.8806, + "step": 11418 + }, + { + "epoch": 0.29320790023125526, + "grad_norm": 0.76171875, + "learning_rate": 0.00018415934546357182, + "loss": 0.7721, + "step": 11419 + }, + { + "epoch": 0.29323357742717704, + "grad_norm": 0.76953125, + "learning_rate": 0.00018415693421293985, + "loss": 0.8277, + "step": 11420 + }, + { + "epoch": 0.2932592546230989, + "grad_norm": 0.8125, + "learning_rate": 0.0001841545227945907, + "loss": 1.1653, + "step": 11421 + }, + { + "epoch": 0.2932849318190207, + "grad_norm": 0.84765625, + "learning_rate": 0.0001841521112085292, + "loss": 1.1003, + "step": 11422 + }, + { + "epoch": 0.29331060901494255, + "grad_norm": 0.73828125, + "learning_rate": 0.00018414969945476016, + "loss": 0.9563, + "step": 11423 + }, + { + "epoch": 0.2933362862108643, + "grad_norm": 0.82421875, + "learning_rate": 0.00018414728753328837, + "loss": 0.9934, + "step": 11424 + }, + { + "epoch": 0.29336196340678616, + "grad_norm": 0.76171875, + "learning_rate": 0.00018414487544411864, + "loss": 0.9485, + "step": 11425 + }, + { + "epoch": 0.293387640602708, + "grad_norm": 0.7890625, + "learning_rate": 0.00018414246318725573, + "loss": 1.045, + "step": 11426 + }, + { + "epoch": 0.2934133177986298, + "grad_norm": 0.734375, + "learning_rate": 0.00018414005076270454, + "loss": 0.9718, + "step": 11427 + }, + { + "epoch": 0.2934389949945516, + "grad_norm": 0.77734375, + "learning_rate": 0.00018413763817046983, + "loss": 1.0579, + "step": 11428 + }, + { + "epoch": 0.29346467219047345, + "grad_norm": 0.87109375, + "learning_rate": 0.0001841352254105564, + "loss": 1.0677, + "step": 11429 + }, + { + "epoch": 0.29349034938639523, + "grad_norm": 0.8125, + "learning_rate": 0.00018413281248296902, + "loss": 0.9983, + "step": 11430 + }, + { + "epoch": 0.29351602658231707, + "grad_norm": 0.78125, + "learning_rate": 0.00018413039938771259, + "loss": 0.9824, + "step": 11431 + }, + { + "epoch": 0.2935417037782389, + "grad_norm": 0.8828125, + "learning_rate": 0.00018412798612479187, + "loss": 0.9844, + "step": 11432 + }, + { + "epoch": 0.29356738097416074, + "grad_norm": 0.71484375, + "learning_rate": 0.00018412557269421168, + "loss": 0.9529, + "step": 11433 + }, + { + "epoch": 0.2935930581700825, + "grad_norm": 0.734375, + "learning_rate": 0.0001841231590959768, + "loss": 1.0215, + "step": 11434 + }, + { + "epoch": 0.29361873536600436, + "grad_norm": 0.76953125, + "learning_rate": 0.00018412074533009207, + "loss": 0.9084, + "step": 11435 + }, + { + "epoch": 0.2936444125619262, + "grad_norm": 0.8359375, + "learning_rate": 0.0001841183313965623, + "loss": 0.9463, + "step": 11436 + }, + { + "epoch": 0.293670089757848, + "grad_norm": 0.76953125, + "learning_rate": 0.00018411591729539228, + "loss": 1.0163, + "step": 11437 + }, + { + "epoch": 0.2936957669537698, + "grad_norm": 0.78515625, + "learning_rate": 0.00018411350302658685, + "loss": 1.0472, + "step": 11438 + }, + { + "epoch": 0.29372144414969165, + "grad_norm": 0.8046875, + "learning_rate": 0.0001841110885901508, + "loss": 0.9479, + "step": 11439 + }, + { + "epoch": 0.2937471213456134, + "grad_norm": 0.71484375, + "learning_rate": 0.00018410867398608896, + "loss": 1.0636, + "step": 11440 + }, + { + "epoch": 0.29377279854153526, + "grad_norm": 0.75, + "learning_rate": 0.0001841062592144061, + "loss": 0.8938, + "step": 11441 + }, + { + "epoch": 0.2937984757374571, + "grad_norm": 0.8515625, + "learning_rate": 0.0001841038442751071, + "loss": 0.8803, + "step": 11442 + }, + { + "epoch": 0.29382415293337893, + "grad_norm": 0.8046875, + "learning_rate": 0.0001841014291681967, + "loss": 1.1206, + "step": 11443 + }, + { + "epoch": 0.2938498301293007, + "grad_norm": 0.84765625, + "learning_rate": 0.00018409901389367979, + "loss": 0.9501, + "step": 11444 + }, + { + "epoch": 0.29387550732522255, + "grad_norm": 0.8046875, + "learning_rate": 0.00018409659845156111, + "loss": 1.0067, + "step": 11445 + }, + { + "epoch": 0.2939011845211444, + "grad_norm": 0.75, + "learning_rate": 0.00018409418284184552, + "loss": 0.8595, + "step": 11446 + }, + { + "epoch": 0.29392686171706617, + "grad_norm": 0.73828125, + "learning_rate": 0.0001840917670645378, + "loss": 1.1174, + "step": 11447 + }, + { + "epoch": 0.293952538912988, + "grad_norm": 0.8203125, + "learning_rate": 0.0001840893511196428, + "loss": 0.8523, + "step": 11448 + }, + { + "epoch": 0.29397821610890984, + "grad_norm": 0.76953125, + "learning_rate": 0.0001840869350071653, + "loss": 1.0248, + "step": 11449 + }, + { + "epoch": 0.2940038933048316, + "grad_norm": 0.7734375, + "learning_rate": 0.00018408451872711017, + "loss": 0.8944, + "step": 11450 + }, + { + "epoch": 0.29402957050075346, + "grad_norm": 0.77734375, + "learning_rate": 0.00018408210227948218, + "loss": 0.9946, + "step": 11451 + }, + { + "epoch": 0.2940552476966753, + "grad_norm": 0.87890625, + "learning_rate": 0.00018407968566428612, + "loss": 0.9611, + "step": 11452 + }, + { + "epoch": 0.29408092489259713, + "grad_norm": 0.81640625, + "learning_rate": 0.00018407726888152687, + "loss": 1.1175, + "step": 11453 + }, + { + "epoch": 0.2941066020885189, + "grad_norm": 0.7734375, + "learning_rate": 0.00018407485193120923, + "loss": 0.8521, + "step": 11454 + }, + { + "epoch": 0.29413227928444075, + "grad_norm": 0.828125, + "learning_rate": 0.00018407243481333797, + "loss": 1.1155, + "step": 11455 + }, + { + "epoch": 0.2941579564803626, + "grad_norm": 0.86328125, + "learning_rate": 0.00018407001752791796, + "loss": 1.0627, + "step": 11456 + }, + { + "epoch": 0.29418363367628436, + "grad_norm": 0.8125, + "learning_rate": 0.00018406760007495403, + "loss": 1.0078, + "step": 11457 + }, + { + "epoch": 0.2942093108722062, + "grad_norm": 0.83984375, + "learning_rate": 0.00018406518245445093, + "loss": 0.993, + "step": 11458 + }, + { + "epoch": 0.29423498806812803, + "grad_norm": 0.75390625, + "learning_rate": 0.00018406276466641352, + "loss": 0.9414, + "step": 11459 + }, + { + "epoch": 0.2942606652640498, + "grad_norm": 0.77734375, + "learning_rate": 0.0001840603467108466, + "loss": 0.8992, + "step": 11460 + }, + { + "epoch": 0.29428634245997165, + "grad_norm": 0.8515625, + "learning_rate": 0.00018405792858775503, + "loss": 0.9971, + "step": 11461 + }, + { + "epoch": 0.2943120196558935, + "grad_norm": 0.71484375, + "learning_rate": 0.0001840555102971436, + "loss": 0.9222, + "step": 11462 + }, + { + "epoch": 0.2943376968518153, + "grad_norm": 0.76953125, + "learning_rate": 0.0001840530918390171, + "loss": 1.049, + "step": 11463 + }, + { + "epoch": 0.2943633740477371, + "grad_norm": 0.7578125, + "learning_rate": 0.00018405067321338042, + "loss": 0.9865, + "step": 11464 + }, + { + "epoch": 0.29438905124365894, + "grad_norm": 0.80078125, + "learning_rate": 0.0001840482544202383, + "loss": 1.0208, + "step": 11465 + }, + { + "epoch": 0.2944147284395808, + "grad_norm": 0.8203125, + "learning_rate": 0.0001840458354595956, + "loss": 1.0743, + "step": 11466 + }, + { + "epoch": 0.29444040563550256, + "grad_norm": 0.78125, + "learning_rate": 0.00018404341633145718, + "loss": 0.8708, + "step": 11467 + }, + { + "epoch": 0.2944660828314244, + "grad_norm": 0.76953125, + "learning_rate": 0.00018404099703582782, + "loss": 0.8263, + "step": 11468 + }, + { + "epoch": 0.2944917600273462, + "grad_norm": 0.7734375, + "learning_rate": 0.00018403857757271232, + "loss": 1.0023, + "step": 11469 + }, + { + "epoch": 0.294517437223268, + "grad_norm": 0.87890625, + "learning_rate": 0.00018403615794211552, + "loss": 1.0125, + "step": 11470 + }, + { + "epoch": 0.29454311441918984, + "grad_norm": 0.80859375, + "learning_rate": 0.00018403373814404229, + "loss": 0.9409, + "step": 11471 + }, + { + "epoch": 0.2945687916151117, + "grad_norm": 0.76953125, + "learning_rate": 0.00018403131817849738, + "loss": 0.993, + "step": 11472 + }, + { + "epoch": 0.2945944688110335, + "grad_norm": 0.8671875, + "learning_rate": 0.00018402889804548567, + "loss": 0.8502, + "step": 11473 + }, + { + "epoch": 0.2946201460069553, + "grad_norm": 0.6953125, + "learning_rate": 0.00018402647774501193, + "loss": 0.9053, + "step": 11474 + }, + { + "epoch": 0.29464582320287713, + "grad_norm": 0.79296875, + "learning_rate": 0.000184024057277081, + "loss": 0.9716, + "step": 11475 + }, + { + "epoch": 0.29467150039879897, + "grad_norm": 0.74609375, + "learning_rate": 0.00018402163664169773, + "loss": 0.9178, + "step": 11476 + }, + { + "epoch": 0.29469717759472075, + "grad_norm": 0.78515625, + "learning_rate": 0.00018401921583886694, + "loss": 1.045, + "step": 11477 + }, + { + "epoch": 0.2947228547906426, + "grad_norm": 0.75390625, + "learning_rate": 0.0001840167948685934, + "loss": 1.0338, + "step": 11478 + }, + { + "epoch": 0.2947485319865644, + "grad_norm": 0.76171875, + "learning_rate": 0.00018401437373088202, + "loss": 0.9363, + "step": 11479 + }, + { + "epoch": 0.2947742091824862, + "grad_norm": 0.79296875, + "learning_rate": 0.0001840119524257376, + "loss": 0.9827, + "step": 11480 + }, + { + "epoch": 0.29479988637840804, + "grad_norm": 0.82421875, + "learning_rate": 0.00018400953095316488, + "loss": 0.9309, + "step": 11481 + }, + { + "epoch": 0.2948255635743299, + "grad_norm": 0.81640625, + "learning_rate": 0.0001840071093131688, + "loss": 0.9357, + "step": 11482 + }, + { + "epoch": 0.2948512407702517, + "grad_norm": 0.78515625, + "learning_rate": 0.00018400468750575413, + "loss": 1.0521, + "step": 11483 + }, + { + "epoch": 0.2948769179661735, + "grad_norm": 0.7890625, + "learning_rate": 0.0001840022655309257, + "loss": 0.835, + "step": 11484 + }, + { + "epoch": 0.2949025951620953, + "grad_norm": 0.828125, + "learning_rate": 0.00018399984338868835, + "loss": 1.043, + "step": 11485 + }, + { + "epoch": 0.29492827235801716, + "grad_norm": 0.7421875, + "learning_rate": 0.00018399742107904688, + "loss": 0.9569, + "step": 11486 + }, + { + "epoch": 0.29495394955393894, + "grad_norm": 0.8203125, + "learning_rate": 0.00018399499860200614, + "loss": 1.0169, + "step": 11487 + }, + { + "epoch": 0.2949796267498608, + "grad_norm": 0.8515625, + "learning_rate": 0.000183992575957571, + "loss": 1.1103, + "step": 11488 + }, + { + "epoch": 0.2950053039457826, + "grad_norm": 0.79296875, + "learning_rate": 0.0001839901531457462, + "loss": 0.9251, + "step": 11489 + }, + { + "epoch": 0.2950309811417044, + "grad_norm": 0.76171875, + "learning_rate": 0.00018398773016653662, + "loss": 0.9822, + "step": 11490 + }, + { + "epoch": 0.29505665833762623, + "grad_norm": 0.734375, + "learning_rate": 0.0001839853070199471, + "loss": 0.8873, + "step": 11491 + }, + { + "epoch": 0.29508233553354807, + "grad_norm": 0.7890625, + "learning_rate": 0.00018398288370598243, + "loss": 0.9142, + "step": 11492 + }, + { + "epoch": 0.2951080127294699, + "grad_norm": 0.77734375, + "learning_rate": 0.00018398046022464746, + "loss": 1.0327, + "step": 11493 + }, + { + "epoch": 0.2951336899253917, + "grad_norm": 0.8671875, + "learning_rate": 0.000183978036575947, + "loss": 1.0641, + "step": 11494 + }, + { + "epoch": 0.2951593671213135, + "grad_norm": 0.76953125, + "learning_rate": 0.00018397561275988595, + "loss": 0.9094, + "step": 11495 + }, + { + "epoch": 0.29518504431723536, + "grad_norm": 0.78125, + "learning_rate": 0.00018397318877646902, + "loss": 1.013, + "step": 11496 + }, + { + "epoch": 0.29521072151315714, + "grad_norm": 0.80078125, + "learning_rate": 0.00018397076462570118, + "loss": 0.9031, + "step": 11497 + }, + { + "epoch": 0.295236398709079, + "grad_norm": 0.86328125, + "learning_rate": 0.00018396834030758713, + "loss": 0.9968, + "step": 11498 + }, + { + "epoch": 0.2952620759050008, + "grad_norm": 0.78125, + "learning_rate": 0.0001839659158221318, + "loss": 0.9748, + "step": 11499 + }, + { + "epoch": 0.2952877531009226, + "grad_norm": 0.8203125, + "learning_rate": 0.00018396349116934, + "loss": 0.9094, + "step": 11500 + }, + { + "epoch": 0.2953134302968444, + "grad_norm": 0.796875, + "learning_rate": 0.00018396106634921652, + "loss": 0.985, + "step": 11501 + }, + { + "epoch": 0.29533910749276626, + "grad_norm": 0.8125, + "learning_rate": 0.0001839586413617662, + "loss": 0.8599, + "step": 11502 + }, + { + "epoch": 0.2953647846886881, + "grad_norm": 0.796875, + "learning_rate": 0.00018395621620699392, + "loss": 0.9354, + "step": 11503 + }, + { + "epoch": 0.2953904618846099, + "grad_norm": 0.74609375, + "learning_rate": 0.0001839537908849045, + "loss": 1.121, + "step": 11504 + }, + { + "epoch": 0.2954161390805317, + "grad_norm": 0.83203125, + "learning_rate": 0.00018395136539550272, + "loss": 0.909, + "step": 11505 + }, + { + "epoch": 0.29544181627645355, + "grad_norm": 0.8203125, + "learning_rate": 0.00018394893973879348, + "loss": 0.9524, + "step": 11506 + }, + { + "epoch": 0.29546749347237533, + "grad_norm": 0.78125, + "learning_rate": 0.00018394651391478157, + "loss": 1.1266, + "step": 11507 + }, + { + "epoch": 0.29549317066829717, + "grad_norm": 0.87890625, + "learning_rate": 0.00018394408792347187, + "loss": 0.946, + "step": 11508 + }, + { + "epoch": 0.295518847864219, + "grad_norm": 0.796875, + "learning_rate": 0.00018394166176486913, + "loss": 1.0013, + "step": 11509 + }, + { + "epoch": 0.2955445250601408, + "grad_norm": 0.88671875, + "learning_rate": 0.00018393923543897828, + "loss": 1.0908, + "step": 11510 + }, + { + "epoch": 0.2955702022560626, + "grad_norm": 0.7578125, + "learning_rate": 0.0001839368089458041, + "loss": 0.8564, + "step": 11511 + }, + { + "epoch": 0.29559587945198446, + "grad_norm": 0.74609375, + "learning_rate": 0.00018393438228535148, + "loss": 0.9957, + "step": 11512 + }, + { + "epoch": 0.2956215566479063, + "grad_norm": 1.015625, + "learning_rate": 0.00018393195545762518, + "loss": 1.12, + "step": 11513 + }, + { + "epoch": 0.2956472338438281, + "grad_norm": 0.80078125, + "learning_rate": 0.00018392952846263007, + "loss": 0.9274, + "step": 11514 + }, + { + "epoch": 0.2956729110397499, + "grad_norm": 0.77734375, + "learning_rate": 0.000183927101300371, + "loss": 0.9615, + "step": 11515 + }, + { + "epoch": 0.29569858823567174, + "grad_norm": 0.85546875, + "learning_rate": 0.0001839246739708528, + "loss": 0.9407, + "step": 11516 + }, + { + "epoch": 0.2957242654315935, + "grad_norm": 0.78515625, + "learning_rate": 0.00018392224647408031, + "loss": 0.9327, + "step": 11517 + }, + { + "epoch": 0.29574994262751536, + "grad_norm": 0.81640625, + "learning_rate": 0.00018391981881005838, + "loss": 0.9535, + "step": 11518 + }, + { + "epoch": 0.2957756198234372, + "grad_norm": 0.74609375, + "learning_rate": 0.00018391739097879182, + "loss": 1.0808, + "step": 11519 + }, + { + "epoch": 0.295801297019359, + "grad_norm": 0.796875, + "learning_rate": 0.00018391496298028547, + "loss": 0.9005, + "step": 11520 + }, + { + "epoch": 0.2958269742152808, + "grad_norm": 0.75, + "learning_rate": 0.00018391253481454416, + "loss": 1.011, + "step": 11521 + }, + { + "epoch": 0.29585265141120265, + "grad_norm": 0.80859375, + "learning_rate": 0.0001839101064815728, + "loss": 0.9295, + "step": 11522 + }, + { + "epoch": 0.2958783286071245, + "grad_norm": 0.8515625, + "learning_rate": 0.00018390767798137612, + "loss": 0.9477, + "step": 11523 + }, + { + "epoch": 0.29590400580304627, + "grad_norm": 0.87109375, + "learning_rate": 0.00018390524931395904, + "loss": 0.9868, + "step": 11524 + }, + { + "epoch": 0.2959296829989681, + "grad_norm": 0.8203125, + "learning_rate": 0.00018390282047932635, + "loss": 0.9326, + "step": 11525 + }, + { + "epoch": 0.29595536019488994, + "grad_norm": 0.8046875, + "learning_rate": 0.00018390039147748297, + "loss": 1.2168, + "step": 11526 + }, + { + "epoch": 0.2959810373908117, + "grad_norm": 0.828125, + "learning_rate": 0.00018389796230843368, + "loss": 0.971, + "step": 11527 + }, + { + "epoch": 0.29600671458673355, + "grad_norm": 0.77734375, + "learning_rate": 0.00018389553297218331, + "loss": 0.8444, + "step": 11528 + }, + { + "epoch": 0.2960323917826554, + "grad_norm": 0.828125, + "learning_rate": 0.00018389310346873675, + "loss": 1.111, + "step": 11529 + }, + { + "epoch": 0.29605806897857717, + "grad_norm": 0.765625, + "learning_rate": 0.00018389067379809878, + "loss": 0.8959, + "step": 11530 + }, + { + "epoch": 0.296083746174499, + "grad_norm": 0.83203125, + "learning_rate": 0.00018388824396027426, + "loss": 0.9543, + "step": 11531 + }, + { + "epoch": 0.29610942337042084, + "grad_norm": 0.796875, + "learning_rate": 0.0001838858139552681, + "loss": 1.1384, + "step": 11532 + }, + { + "epoch": 0.2961351005663427, + "grad_norm": 0.74609375, + "learning_rate": 0.00018388338378308504, + "loss": 1.1333, + "step": 11533 + }, + { + "epoch": 0.29616077776226446, + "grad_norm": 0.796875, + "learning_rate": 0.00018388095344373, + "loss": 0.916, + "step": 11534 + }, + { + "epoch": 0.2961864549581863, + "grad_norm": 0.79296875, + "learning_rate": 0.0001838785229372078, + "loss": 0.9762, + "step": 11535 + }, + { + "epoch": 0.29621213215410813, + "grad_norm": 0.84375, + "learning_rate": 0.0001838760922635233, + "loss": 1.1101, + "step": 11536 + }, + { + "epoch": 0.2962378093500299, + "grad_norm": 0.76953125, + "learning_rate": 0.00018387366142268128, + "loss": 1.0287, + "step": 11537 + }, + { + "epoch": 0.29626348654595175, + "grad_norm": 0.7734375, + "learning_rate": 0.00018387123041468663, + "loss": 0.9294, + "step": 11538 + }, + { + "epoch": 0.2962891637418736, + "grad_norm": 0.890625, + "learning_rate": 0.0001838687992395442, + "loss": 0.949, + "step": 11539 + }, + { + "epoch": 0.29631484093779537, + "grad_norm": 0.83984375, + "learning_rate": 0.00018386636789725886, + "loss": 1.0221, + "step": 11540 + }, + { + "epoch": 0.2963405181337172, + "grad_norm": 0.78125, + "learning_rate": 0.0001838639363878354, + "loss": 1.0361, + "step": 11541 + }, + { + "epoch": 0.29636619532963904, + "grad_norm": 0.8046875, + "learning_rate": 0.0001838615047112787, + "loss": 0.9436, + "step": 11542 + }, + { + "epoch": 0.2963918725255609, + "grad_norm": 0.828125, + "learning_rate": 0.00018385907286759358, + "loss": 1.1166, + "step": 11543 + }, + { + "epoch": 0.29641754972148265, + "grad_norm": 0.7421875, + "learning_rate": 0.0001838566408567849, + "loss": 0.857, + "step": 11544 + }, + { + "epoch": 0.2964432269174045, + "grad_norm": 0.76953125, + "learning_rate": 0.00018385420867885754, + "loss": 0.8896, + "step": 11545 + }, + { + "epoch": 0.2964689041133263, + "grad_norm": 0.83203125, + "learning_rate": 0.00018385177633381628, + "loss": 1.0396, + "step": 11546 + }, + { + "epoch": 0.2964945813092481, + "grad_norm": 0.796875, + "learning_rate": 0.00018384934382166605, + "loss": 1.155, + "step": 11547 + }, + { + "epoch": 0.29652025850516994, + "grad_norm": 0.7734375, + "learning_rate": 0.00018384691114241162, + "loss": 1.2265, + "step": 11548 + }, + { + "epoch": 0.2965459357010918, + "grad_norm": 0.7890625, + "learning_rate": 0.00018384447829605785, + "loss": 0.9301, + "step": 11549 + }, + { + "epoch": 0.29657161289701356, + "grad_norm": 0.8203125, + "learning_rate": 0.00018384204528260965, + "loss": 0.9381, + "step": 11550 + }, + { + "epoch": 0.2965972900929354, + "grad_norm": 0.74609375, + "learning_rate": 0.00018383961210207181, + "loss": 0.9761, + "step": 11551 + }, + { + "epoch": 0.29662296728885723, + "grad_norm": 0.76953125, + "learning_rate": 0.00018383717875444919, + "loss": 1.0481, + "step": 11552 + }, + { + "epoch": 0.29664864448477907, + "grad_norm": 0.7421875, + "learning_rate": 0.0001838347452397467, + "loss": 1.0636, + "step": 11553 + }, + { + "epoch": 0.29667432168070085, + "grad_norm": 0.81640625, + "learning_rate": 0.00018383231155796905, + "loss": 0.9948, + "step": 11554 + }, + { + "epoch": 0.2966999988766227, + "grad_norm": 0.765625, + "learning_rate": 0.00018382987770912122, + "loss": 1.1447, + "step": 11555 + }, + { + "epoch": 0.2967256760725445, + "grad_norm": 0.70703125, + "learning_rate": 0.000183827443693208, + "loss": 0.9859, + "step": 11556 + }, + { + "epoch": 0.2967513532684663, + "grad_norm": 0.87890625, + "learning_rate": 0.0001838250095102343, + "loss": 1.0599, + "step": 11557 + }, + { + "epoch": 0.29677703046438814, + "grad_norm": 0.90234375, + "learning_rate": 0.00018382257516020488, + "loss": 0.931, + "step": 11558 + }, + { + "epoch": 0.29680270766031, + "grad_norm": 0.7734375, + "learning_rate": 0.00018382014064312465, + "loss": 0.977, + "step": 11559 + }, + { + "epoch": 0.29682838485623175, + "grad_norm": 0.78515625, + "learning_rate": 0.00018381770595899845, + "loss": 0.9793, + "step": 11560 + }, + { + "epoch": 0.2968540620521536, + "grad_norm": 1.1328125, + "learning_rate": 0.00018381527110783113, + "loss": 1.0813, + "step": 11561 + }, + { + "epoch": 0.2968797392480754, + "grad_norm": 0.8359375, + "learning_rate": 0.00018381283608962757, + "loss": 1.1208, + "step": 11562 + }, + { + "epoch": 0.29690541644399726, + "grad_norm": 0.79296875, + "learning_rate": 0.00018381040090439257, + "loss": 1.0135, + "step": 11563 + }, + { + "epoch": 0.29693109363991904, + "grad_norm": 0.76171875, + "learning_rate": 0.00018380796555213103, + "loss": 0.9882, + "step": 11564 + }, + { + "epoch": 0.2969567708358409, + "grad_norm": 0.7421875, + "learning_rate": 0.00018380553003284776, + "loss": 0.9078, + "step": 11565 + }, + { + "epoch": 0.2969824480317627, + "grad_norm": 0.83984375, + "learning_rate": 0.00018380309434654768, + "loss": 0.9494, + "step": 11566 + }, + { + "epoch": 0.2970081252276845, + "grad_norm": 0.81640625, + "learning_rate": 0.00018380065849323557, + "loss": 1.0026, + "step": 11567 + }, + { + "epoch": 0.29703380242360633, + "grad_norm": 0.8671875, + "learning_rate": 0.00018379822247291633, + "loss": 0.935, + "step": 11568 + }, + { + "epoch": 0.29705947961952817, + "grad_norm": 0.8046875, + "learning_rate": 0.0001837957862855948, + "loss": 0.9962, + "step": 11569 + }, + { + "epoch": 0.29708515681544995, + "grad_norm": 0.8125, + "learning_rate": 0.00018379334993127584, + "loss": 0.8796, + "step": 11570 + }, + { + "epoch": 0.2971108340113718, + "grad_norm": 0.87109375, + "learning_rate": 0.00018379091340996428, + "loss": 1.0827, + "step": 11571 + }, + { + "epoch": 0.2971365112072936, + "grad_norm": 0.80859375, + "learning_rate": 0.00018378847672166507, + "loss": 1.1146, + "step": 11572 + }, + { + "epoch": 0.2971621884032154, + "grad_norm": 0.765625, + "learning_rate": 0.00018378603986638292, + "loss": 0.9379, + "step": 11573 + }, + { + "epoch": 0.29718786559913724, + "grad_norm": 0.84375, + "learning_rate": 0.00018378360284412276, + "loss": 1.0237, + "step": 11574 + }, + { + "epoch": 0.29721354279505907, + "grad_norm": 0.75, + "learning_rate": 0.00018378116565488949, + "loss": 0.9589, + "step": 11575 + }, + { + "epoch": 0.2972392199909809, + "grad_norm": 0.85546875, + "learning_rate": 0.0001837787282986879, + "loss": 1.0128, + "step": 11576 + }, + { + "epoch": 0.2972648971869027, + "grad_norm": 0.75, + "learning_rate": 0.0001837762907755229, + "loss": 1.0325, + "step": 11577 + }, + { + "epoch": 0.2972905743828245, + "grad_norm": 0.80078125, + "learning_rate": 0.00018377385308539928, + "loss": 0.9953, + "step": 11578 + }, + { + "epoch": 0.29731625157874636, + "grad_norm": 0.83984375, + "learning_rate": 0.00018377141522832194, + "loss": 1.1254, + "step": 11579 + }, + { + "epoch": 0.29734192877466814, + "grad_norm": 0.83984375, + "learning_rate": 0.0001837689772042958, + "loss": 0.9235, + "step": 11580 + }, + { + "epoch": 0.29736760597059, + "grad_norm": 0.79296875, + "learning_rate": 0.0001837665390133256, + "loss": 1.0056, + "step": 11581 + }, + { + "epoch": 0.2973932831665118, + "grad_norm": 0.765625, + "learning_rate": 0.00018376410065541624, + "loss": 0.9193, + "step": 11582 + }, + { + "epoch": 0.2974189603624336, + "grad_norm": 0.80859375, + "learning_rate": 0.00018376166213057262, + "loss": 0.9173, + "step": 11583 + }, + { + "epoch": 0.29744463755835543, + "grad_norm": 0.7265625, + "learning_rate": 0.00018375922343879955, + "loss": 0.9521, + "step": 11584 + }, + { + "epoch": 0.29747031475427727, + "grad_norm": 0.8984375, + "learning_rate": 0.00018375678458010196, + "loss": 1.0125, + "step": 11585 + }, + { + "epoch": 0.2974959919501991, + "grad_norm": 0.84375, + "learning_rate": 0.00018375434555448463, + "loss": 0.9734, + "step": 11586 + }, + { + "epoch": 0.2975216691461209, + "grad_norm": 0.796875, + "learning_rate": 0.00018375190636195246, + "loss": 0.8759, + "step": 11587 + }, + { + "epoch": 0.2975473463420427, + "grad_norm": 0.74609375, + "learning_rate": 0.0001837494670025103, + "loss": 0.8794, + "step": 11588 + }, + { + "epoch": 0.29757302353796455, + "grad_norm": 0.77734375, + "learning_rate": 0.00018374702747616303, + "loss": 0.8926, + "step": 11589 + }, + { + "epoch": 0.29759870073388633, + "grad_norm": 0.76953125, + "learning_rate": 0.0001837445877829155, + "loss": 0.97, + "step": 11590 + }, + { + "epoch": 0.29762437792980817, + "grad_norm": 0.7578125, + "learning_rate": 0.00018374214792277256, + "loss": 0.8561, + "step": 11591 + }, + { + "epoch": 0.29765005512573, + "grad_norm": 0.9296875, + "learning_rate": 0.00018373970789573908, + "loss": 1.0415, + "step": 11592 + }, + { + "epoch": 0.2976757323216518, + "grad_norm": 0.72265625, + "learning_rate": 0.00018373726770181995, + "loss": 0.9701, + "step": 11593 + }, + { + "epoch": 0.2977014095175736, + "grad_norm": 0.79296875, + "learning_rate": 0.00018373482734101998, + "loss": 0.9394, + "step": 11594 + }, + { + "epoch": 0.29772708671349546, + "grad_norm": 0.7578125, + "learning_rate": 0.0001837323868133441, + "loss": 0.9376, + "step": 11595 + }, + { + "epoch": 0.2977527639094173, + "grad_norm": 0.79296875, + "learning_rate": 0.0001837299461187971, + "loss": 0.9748, + "step": 11596 + }, + { + "epoch": 0.2977784411053391, + "grad_norm": 0.84375, + "learning_rate": 0.0001837275052573839, + "loss": 1.0762, + "step": 11597 + }, + { + "epoch": 0.2978041183012609, + "grad_norm": 1.2421875, + "learning_rate": 0.00018372506422910934, + "loss": 0.8556, + "step": 11598 + }, + { + "epoch": 0.29782979549718275, + "grad_norm": 0.78515625, + "learning_rate": 0.0001837226230339783, + "loss": 1.0124, + "step": 11599 + }, + { + "epoch": 0.29785547269310453, + "grad_norm": 0.76953125, + "learning_rate": 0.0001837201816719956, + "loss": 1.1013, + "step": 11600 + }, + { + "epoch": 0.29788114988902636, + "grad_norm": 0.8203125, + "learning_rate": 0.00018371774014316617, + "loss": 0.9788, + "step": 11601 + }, + { + "epoch": 0.2979068270849482, + "grad_norm": 0.7890625, + "learning_rate": 0.00018371529844749484, + "loss": 1.0311, + "step": 11602 + }, + { + "epoch": 0.29793250428087, + "grad_norm": 0.78125, + "learning_rate": 0.0001837128565849865, + "loss": 0.846, + "step": 11603 + }, + { + "epoch": 0.2979581814767918, + "grad_norm": 0.74609375, + "learning_rate": 0.00018371041455564598, + "loss": 0.9806, + "step": 11604 + }, + { + "epoch": 0.29798385867271365, + "grad_norm": 0.77734375, + "learning_rate": 0.00018370797235947817, + "loss": 1.0803, + "step": 11605 + }, + { + "epoch": 0.2980095358686355, + "grad_norm": 0.7578125, + "learning_rate": 0.0001837055299964879, + "loss": 0.8467, + "step": 11606 + }, + { + "epoch": 0.29803521306455727, + "grad_norm": 0.8203125, + "learning_rate": 0.00018370308746668012, + "loss": 0.9341, + "step": 11607 + }, + { + "epoch": 0.2980608902604791, + "grad_norm": 0.8359375, + "learning_rate": 0.00018370064477005963, + "loss": 1.0787, + "step": 11608 + }, + { + "epoch": 0.29808656745640094, + "grad_norm": 0.80078125, + "learning_rate": 0.00018369820190663132, + "loss": 0.9125, + "step": 11609 + }, + { + "epoch": 0.2981122446523227, + "grad_norm": 1.3359375, + "learning_rate": 0.00018369575887640003, + "loss": 0.8373, + "step": 11610 + }, + { + "epoch": 0.29813792184824456, + "grad_norm": 0.7421875, + "learning_rate": 0.00018369331567937068, + "loss": 0.9493, + "step": 11611 + }, + { + "epoch": 0.2981635990441664, + "grad_norm": 0.8125, + "learning_rate": 0.0001836908723155481, + "loss": 0.9617, + "step": 11612 + }, + { + "epoch": 0.2981892762400882, + "grad_norm": 0.8515625, + "learning_rate": 0.00018368842878493718, + "loss": 1.1042, + "step": 11613 + }, + { + "epoch": 0.29821495343601, + "grad_norm": 0.7578125, + "learning_rate": 0.00018368598508754276, + "loss": 0.9706, + "step": 11614 + }, + { + "epoch": 0.29824063063193185, + "grad_norm": 0.74609375, + "learning_rate": 0.00018368354122336975, + "loss": 0.9368, + "step": 11615 + }, + { + "epoch": 0.2982663078278537, + "grad_norm": 0.9296875, + "learning_rate": 0.000183681097192423, + "loss": 0.9489, + "step": 11616 + }, + { + "epoch": 0.29829198502377546, + "grad_norm": 0.7734375, + "learning_rate": 0.00018367865299470738, + "loss": 1.0142, + "step": 11617 + }, + { + "epoch": 0.2983176622196973, + "grad_norm": 0.76953125, + "learning_rate": 0.00018367620863022775, + "loss": 0.9975, + "step": 11618 + }, + { + "epoch": 0.29834333941561914, + "grad_norm": 0.83984375, + "learning_rate": 0.000183673764098989, + "loss": 1.2263, + "step": 11619 + }, + { + "epoch": 0.2983690166115409, + "grad_norm": 1.1953125, + "learning_rate": 0.00018367131940099602, + "loss": 1.1646, + "step": 11620 + }, + { + "epoch": 0.29839469380746275, + "grad_norm": 0.7734375, + "learning_rate": 0.00018366887453625364, + "loss": 0.8615, + "step": 11621 + }, + { + "epoch": 0.2984203710033846, + "grad_norm": 0.74609375, + "learning_rate": 0.00018366642950476677, + "loss": 0.9716, + "step": 11622 + }, + { + "epoch": 0.29844604819930637, + "grad_norm": 2.421875, + "learning_rate": 0.00018366398430654023, + "loss": 1.0876, + "step": 11623 + }, + { + "epoch": 0.2984717253952282, + "grad_norm": 0.75390625, + "learning_rate": 0.00018366153894157895, + "loss": 0.8748, + "step": 11624 + }, + { + "epoch": 0.29849740259115004, + "grad_norm": 0.7578125, + "learning_rate": 0.00018365909340988776, + "loss": 0.8827, + "step": 11625 + }, + { + "epoch": 0.2985230797870719, + "grad_norm": 0.73828125, + "learning_rate": 0.00018365664771147157, + "loss": 0.9658, + "step": 11626 + }, + { + "epoch": 0.29854875698299366, + "grad_norm": 0.7734375, + "learning_rate": 0.00018365420184633522, + "loss": 0.9779, + "step": 11627 + }, + { + "epoch": 0.2985744341789155, + "grad_norm": 0.80078125, + "learning_rate": 0.00018365175581448363, + "loss": 1.1185, + "step": 11628 + }, + { + "epoch": 0.29860011137483733, + "grad_norm": 0.8203125, + "learning_rate": 0.0001836493096159216, + "loss": 0.9625, + "step": 11629 + }, + { + "epoch": 0.2986257885707591, + "grad_norm": 0.78515625, + "learning_rate": 0.0001836468632506541, + "loss": 1.0174, + "step": 11630 + }, + { + "epoch": 0.29865146576668095, + "grad_norm": 0.81640625, + "learning_rate": 0.00018364441671868593, + "loss": 1.0119, + "step": 11631 + }, + { + "epoch": 0.2986771429626028, + "grad_norm": 0.8671875, + "learning_rate": 0.00018364197002002203, + "loss": 0.9536, + "step": 11632 + }, + { + "epoch": 0.29870282015852456, + "grad_norm": 0.78125, + "learning_rate": 0.00018363952315466722, + "loss": 0.9077, + "step": 11633 + }, + { + "epoch": 0.2987284973544464, + "grad_norm": 0.890625, + "learning_rate": 0.00018363707612262638, + "loss": 1.0784, + "step": 11634 + }, + { + "epoch": 0.29875417455036823, + "grad_norm": 0.7734375, + "learning_rate": 0.00018363462892390442, + "loss": 1.0074, + "step": 11635 + }, + { + "epoch": 0.29877985174629007, + "grad_norm": 0.73828125, + "learning_rate": 0.00018363218155850616, + "loss": 0.9651, + "step": 11636 + }, + { + "epoch": 0.29880552894221185, + "grad_norm": 0.77734375, + "learning_rate": 0.00018362973402643657, + "loss": 0.994, + "step": 11637 + }, + { + "epoch": 0.2988312061381337, + "grad_norm": 0.7734375, + "learning_rate": 0.00018362728632770046, + "loss": 0.9663, + "step": 11638 + }, + { + "epoch": 0.2988568833340555, + "grad_norm": 0.859375, + "learning_rate": 0.00018362483846230272, + "loss": 1.0846, + "step": 11639 + }, + { + "epoch": 0.2988825605299773, + "grad_norm": 0.71875, + "learning_rate": 0.0001836223904302482, + "loss": 0.8477, + "step": 11640 + }, + { + "epoch": 0.29890823772589914, + "grad_norm": 0.82421875, + "learning_rate": 0.00018361994223154183, + "loss": 0.9916, + "step": 11641 + }, + { + "epoch": 0.298933914921821, + "grad_norm": 0.74609375, + "learning_rate": 0.0001836174938661885, + "loss": 0.7998, + "step": 11642 + }, + { + "epoch": 0.29895959211774276, + "grad_norm": 0.859375, + "learning_rate": 0.00018361504533419303, + "loss": 1.0091, + "step": 11643 + }, + { + "epoch": 0.2989852693136646, + "grad_norm": 0.875, + "learning_rate": 0.0001836125966355603, + "loss": 0.9992, + "step": 11644 + }, + { + "epoch": 0.29901094650958643, + "grad_norm": 0.7890625, + "learning_rate": 0.00018361014777029525, + "loss": 0.9682, + "step": 11645 + }, + { + "epoch": 0.29903662370550826, + "grad_norm": 0.82421875, + "learning_rate": 0.00018360769873840274, + "loss": 1.0571, + "step": 11646 + }, + { + "epoch": 0.29906230090143004, + "grad_norm": 0.81640625, + "learning_rate": 0.00018360524953988763, + "loss": 0.9637, + "step": 11647 + }, + { + "epoch": 0.2990879780973519, + "grad_norm": 0.8203125, + "learning_rate": 0.00018360280017475477, + "loss": 0.9125, + "step": 11648 + }, + { + "epoch": 0.2991136552932737, + "grad_norm": 0.7578125, + "learning_rate": 0.00018360035064300914, + "loss": 0.9777, + "step": 11649 + }, + { + "epoch": 0.2991393324891955, + "grad_norm": 0.82421875, + "learning_rate": 0.00018359790094465552, + "loss": 0.9857, + "step": 11650 + }, + { + "epoch": 0.29916500968511733, + "grad_norm": 0.75390625, + "learning_rate": 0.00018359545107969884, + "loss": 0.9045, + "step": 11651 + }, + { + "epoch": 0.29919068688103917, + "grad_norm": 0.859375, + "learning_rate": 0.000183593001048144, + "loss": 1.0038, + "step": 11652 + }, + { + "epoch": 0.29921636407696095, + "grad_norm": 0.734375, + "learning_rate": 0.00018359055084999585, + "loss": 0.9494, + "step": 11653 + }, + { + "epoch": 0.2992420412728828, + "grad_norm": 0.765625, + "learning_rate": 0.00018358810048525926, + "loss": 0.925, + "step": 11654 + }, + { + "epoch": 0.2992677184688046, + "grad_norm": 0.73046875, + "learning_rate": 0.0001835856499539392, + "loss": 0.9614, + "step": 11655 + }, + { + "epoch": 0.29929339566472646, + "grad_norm": 0.7734375, + "learning_rate": 0.00018358319925604043, + "loss": 0.9541, + "step": 11656 + }, + { + "epoch": 0.29931907286064824, + "grad_norm": 0.8125, + "learning_rate": 0.0001835807483915679, + "loss": 0.9831, + "step": 11657 + }, + { + "epoch": 0.2993447500565701, + "grad_norm": 0.77734375, + "learning_rate": 0.0001835782973605265, + "loss": 0.9873, + "step": 11658 + }, + { + "epoch": 0.2993704272524919, + "grad_norm": 0.81640625, + "learning_rate": 0.00018357584616292112, + "loss": 1.0099, + "step": 11659 + }, + { + "epoch": 0.2993961044484137, + "grad_norm": 0.76171875, + "learning_rate": 0.0001835733947987566, + "loss": 0.9836, + "step": 11660 + }, + { + "epoch": 0.2994217816443355, + "grad_norm": 0.84375, + "learning_rate": 0.00018357094326803788, + "loss": 0.9848, + "step": 11661 + }, + { + "epoch": 0.29944745884025736, + "grad_norm": 0.84375, + "learning_rate": 0.0001835684915707698, + "loss": 1.0486, + "step": 11662 + }, + { + "epoch": 0.29947313603617914, + "grad_norm": 0.79296875, + "learning_rate": 0.00018356603970695726, + "loss": 0.9914, + "step": 11663 + }, + { + "epoch": 0.299498813232101, + "grad_norm": 0.78125, + "learning_rate": 0.00018356358767660516, + "loss": 1.0261, + "step": 11664 + }, + { + "epoch": 0.2995244904280228, + "grad_norm": 0.8125, + "learning_rate": 0.00018356113547971837, + "loss": 0.9514, + "step": 11665 + }, + { + "epoch": 0.29955016762394465, + "grad_norm": 0.8515625, + "learning_rate": 0.00018355868311630179, + "loss": 0.9151, + "step": 11666 + }, + { + "epoch": 0.29957584481986643, + "grad_norm": 0.80859375, + "learning_rate": 0.00018355623058636028, + "loss": 1.0081, + "step": 11667 + }, + { + "epoch": 0.29960152201578827, + "grad_norm": 0.7109375, + "learning_rate": 0.0001835537778898988, + "loss": 0.9806, + "step": 11668 + }, + { + "epoch": 0.2996271992117101, + "grad_norm": 0.77734375, + "learning_rate": 0.00018355132502692216, + "loss": 1.1211, + "step": 11669 + }, + { + "epoch": 0.2996528764076319, + "grad_norm": 0.7421875, + "learning_rate": 0.00018354887199743526, + "loss": 0.9631, + "step": 11670 + }, + { + "epoch": 0.2996785536035537, + "grad_norm": 0.7734375, + "learning_rate": 0.00018354641880144302, + "loss": 0.819, + "step": 11671 + }, + { + "epoch": 0.29970423079947556, + "grad_norm": 0.80078125, + "learning_rate": 0.00018354396543895034, + "loss": 1.0124, + "step": 11672 + }, + { + "epoch": 0.29972990799539734, + "grad_norm": 0.7890625, + "learning_rate": 0.00018354151190996208, + "loss": 0.9398, + "step": 11673 + }, + { + "epoch": 0.2997555851913192, + "grad_norm": 0.796875, + "learning_rate": 0.0001835390582144831, + "loss": 0.9555, + "step": 11674 + }, + { + "epoch": 0.299781262387241, + "grad_norm": 0.75, + "learning_rate": 0.00018353660435251832, + "loss": 1.069, + "step": 11675 + }, + { + "epoch": 0.29980693958316285, + "grad_norm": 0.83984375, + "learning_rate": 0.00018353415032407267, + "loss": 1.0562, + "step": 11676 + }, + { + "epoch": 0.2998326167790846, + "grad_norm": 0.7578125, + "learning_rate": 0.00018353169612915097, + "loss": 1.086, + "step": 11677 + }, + { + "epoch": 0.29985829397500646, + "grad_norm": 0.90234375, + "learning_rate": 0.00018352924176775817, + "loss": 1.0325, + "step": 11678 + }, + { + "epoch": 0.2998839711709283, + "grad_norm": 0.71484375, + "learning_rate": 0.0001835267872398991, + "loss": 0.8851, + "step": 11679 + }, + { + "epoch": 0.2999096483668501, + "grad_norm": 0.91796875, + "learning_rate": 0.00018352433254557873, + "loss": 1.1896, + "step": 11680 + }, + { + "epoch": 0.2999353255627719, + "grad_norm": 1.1484375, + "learning_rate": 0.00018352187768480192, + "loss": 1.0214, + "step": 11681 + }, + { + "epoch": 0.29996100275869375, + "grad_norm": 0.87109375, + "learning_rate": 0.0001835194226575735, + "loss": 1.0385, + "step": 11682 + }, + { + "epoch": 0.29998667995461553, + "grad_norm": 0.8359375, + "learning_rate": 0.00018351696746389843, + "loss": 0.8603, + "step": 11683 + }, + { + "epoch": 0.30001235715053737, + "grad_norm": 0.81640625, + "learning_rate": 0.0001835145121037816, + "loss": 1.162, + "step": 11684 + }, + { + "epoch": 0.3000380343464592, + "grad_norm": 1.359375, + "learning_rate": 0.0001835120565772279, + "loss": 0.9456, + "step": 11685 + }, + { + "epoch": 0.30006371154238104, + "grad_norm": 0.73828125, + "learning_rate": 0.0001835096008842422, + "loss": 0.9254, + "step": 11686 + }, + { + "epoch": 0.3000893887383028, + "grad_norm": 0.82421875, + "learning_rate": 0.0001835071450248294, + "loss": 0.9971, + "step": 11687 + }, + { + "epoch": 0.30011506593422466, + "grad_norm": 0.82421875, + "learning_rate": 0.0001835046889989944, + "loss": 0.9548, + "step": 11688 + }, + { + "epoch": 0.3001407431301465, + "grad_norm": 0.71875, + "learning_rate": 0.00018350223280674214, + "loss": 0.9213, + "step": 11689 + }, + { + "epoch": 0.3001664203260683, + "grad_norm": 0.79296875, + "learning_rate": 0.00018349977644807746, + "loss": 1.0152, + "step": 11690 + }, + { + "epoch": 0.3001920975219901, + "grad_norm": 0.79296875, + "learning_rate": 0.00018349731992300524, + "loss": 1.0831, + "step": 11691 + }, + { + "epoch": 0.30021777471791194, + "grad_norm": 0.81640625, + "learning_rate": 0.00018349486323153042, + "loss": 1.1329, + "step": 11692 + }, + { + "epoch": 0.3002434519138337, + "grad_norm": 0.84765625, + "learning_rate": 0.00018349240637365787, + "loss": 0.9776, + "step": 11693 + }, + { + "epoch": 0.30026912910975556, + "grad_norm": 0.79296875, + "learning_rate": 0.00018348994934939253, + "loss": 1.0231, + "step": 11694 + }, + { + "epoch": 0.3002948063056774, + "grad_norm": 0.78515625, + "learning_rate": 0.00018348749215873925, + "loss": 0.9876, + "step": 11695 + }, + { + "epoch": 0.30032048350159923, + "grad_norm": 0.77734375, + "learning_rate": 0.00018348503480170288, + "loss": 0.8073, + "step": 11696 + }, + { + "epoch": 0.300346160697521, + "grad_norm": 0.7734375, + "learning_rate": 0.00018348257727828846, + "loss": 0.9054, + "step": 11697 + }, + { + "epoch": 0.30037183789344285, + "grad_norm": 0.83203125, + "learning_rate": 0.00018348011958850075, + "loss": 1.0607, + "step": 11698 + }, + { + "epoch": 0.3003975150893647, + "grad_norm": 0.78125, + "learning_rate": 0.00018347766173234473, + "loss": 0.9947, + "step": 11699 + }, + { + "epoch": 0.30042319228528647, + "grad_norm": 0.7890625, + "learning_rate": 0.00018347520370982524, + "loss": 0.8843, + "step": 11700 + }, + { + "epoch": 0.3004488694812083, + "grad_norm": 0.7578125, + "learning_rate": 0.00018347274552094723, + "loss": 0.9226, + "step": 11701 + }, + { + "epoch": 0.30047454667713014, + "grad_norm": 0.73828125, + "learning_rate": 0.00018347028716571558, + "loss": 0.9238, + "step": 11702 + }, + { + "epoch": 0.3005002238730519, + "grad_norm": 0.875, + "learning_rate": 0.0001834678286441352, + "loss": 0.9652, + "step": 11703 + }, + { + "epoch": 0.30052590106897376, + "grad_norm": 0.79296875, + "learning_rate": 0.00018346536995621096, + "loss": 1.0003, + "step": 11704 + }, + { + "epoch": 0.3005515782648956, + "grad_norm": 0.91796875, + "learning_rate": 0.00018346291110194778, + "loss": 1.019, + "step": 11705 + }, + { + "epoch": 0.3005772554608174, + "grad_norm": 0.77734375, + "learning_rate": 0.00018346045208135055, + "loss": 0.964, + "step": 11706 + }, + { + "epoch": 0.3006029326567392, + "grad_norm": 0.8515625, + "learning_rate": 0.0001834579928944242, + "loss": 0.973, + "step": 11707 + }, + { + "epoch": 0.30062860985266104, + "grad_norm": 0.86328125, + "learning_rate": 0.00018345553354117357, + "loss": 1.0628, + "step": 11708 + }, + { + "epoch": 0.3006542870485829, + "grad_norm": 0.81640625, + "learning_rate": 0.00018345307402160366, + "loss": 1.0199, + "step": 11709 + }, + { + "epoch": 0.30067996424450466, + "grad_norm": 0.84765625, + "learning_rate": 0.00018345061433571927, + "loss": 1.0162, + "step": 11710 + }, + { + "epoch": 0.3007056414404265, + "grad_norm": 0.76953125, + "learning_rate": 0.00018344815448352535, + "loss": 0.8889, + "step": 11711 + }, + { + "epoch": 0.30073131863634833, + "grad_norm": 0.8046875, + "learning_rate": 0.00018344569446502676, + "loss": 0.9609, + "step": 11712 + }, + { + "epoch": 0.3007569958322701, + "grad_norm": 0.83203125, + "learning_rate": 0.0001834432342802285, + "loss": 1.0059, + "step": 11713 + }, + { + "epoch": 0.30078267302819195, + "grad_norm": 0.77734375, + "learning_rate": 0.0001834407739291354, + "loss": 0.8821, + "step": 11714 + }, + { + "epoch": 0.3008083502241138, + "grad_norm": 0.796875, + "learning_rate": 0.00018343831341175235, + "loss": 1.0854, + "step": 11715 + }, + { + "epoch": 0.3008340274200356, + "grad_norm": 0.6796875, + "learning_rate": 0.0001834358527280843, + "loss": 0.8276, + "step": 11716 + }, + { + "epoch": 0.3008597046159574, + "grad_norm": 0.83984375, + "learning_rate": 0.0001834333918781361, + "loss": 0.9346, + "step": 11717 + }, + { + "epoch": 0.30088538181187924, + "grad_norm": 0.95703125, + "learning_rate": 0.00018343093086191272, + "loss": 1.0549, + "step": 11718 + }, + { + "epoch": 0.3009110590078011, + "grad_norm": 0.78515625, + "learning_rate": 0.00018342846967941902, + "loss": 0.8926, + "step": 11719 + }, + { + "epoch": 0.30093673620372285, + "grad_norm": 0.74609375, + "learning_rate": 0.0001834260083306599, + "loss": 0.9773, + "step": 11720 + }, + { + "epoch": 0.3009624133996447, + "grad_norm": 0.76953125, + "learning_rate": 0.00018342354681564032, + "loss": 0.9126, + "step": 11721 + }, + { + "epoch": 0.3009880905955665, + "grad_norm": 0.74609375, + "learning_rate": 0.00018342108513436512, + "loss": 1.2087, + "step": 11722 + }, + { + "epoch": 0.3010137677914883, + "grad_norm": 0.73046875, + "learning_rate": 0.0001834186232868392, + "loss": 0.8365, + "step": 11723 + }, + { + "epoch": 0.30103944498741014, + "grad_norm": 0.828125, + "learning_rate": 0.00018341616127306757, + "loss": 0.9545, + "step": 11724 + }, + { + "epoch": 0.301065122183332, + "grad_norm": 0.8125, + "learning_rate": 0.000183413699093055, + "loss": 0.8402, + "step": 11725 + }, + { + "epoch": 0.3010907993792538, + "grad_norm": 0.78125, + "learning_rate": 0.0001834112367468065, + "loss": 0.9567, + "step": 11726 + }, + { + "epoch": 0.3011164765751756, + "grad_norm": 0.78515625, + "learning_rate": 0.00018340877423432694, + "loss": 0.9616, + "step": 11727 + }, + { + "epoch": 0.30114215377109743, + "grad_norm": 0.91015625, + "learning_rate": 0.00018340631155562123, + "loss": 1.0675, + "step": 11728 + }, + { + "epoch": 0.30116783096701927, + "grad_norm": 0.74609375, + "learning_rate": 0.00018340384871069426, + "loss": 0.8263, + "step": 11729 + }, + { + "epoch": 0.30119350816294105, + "grad_norm": 0.8125, + "learning_rate": 0.00018340138569955094, + "loss": 0.9429, + "step": 11730 + }, + { + "epoch": 0.3012191853588629, + "grad_norm": 0.87890625, + "learning_rate": 0.00018339892252219623, + "loss": 0.9847, + "step": 11731 + }, + { + "epoch": 0.3012448625547847, + "grad_norm": 0.765625, + "learning_rate": 0.00018339645917863498, + "loss": 0.9725, + "step": 11732 + }, + { + "epoch": 0.3012705397507065, + "grad_norm": 0.8984375, + "learning_rate": 0.00018339399566887212, + "loss": 0.9653, + "step": 11733 + }, + { + "epoch": 0.30129621694662834, + "grad_norm": 0.82421875, + "learning_rate": 0.00018339153199291257, + "loss": 1.0212, + "step": 11734 + }, + { + "epoch": 0.3013218941425502, + "grad_norm": 0.8515625, + "learning_rate": 0.00018338906815076121, + "loss": 1.1234, + "step": 11735 + }, + { + "epoch": 0.301347571338472, + "grad_norm": 0.859375, + "learning_rate": 0.00018338660414242297, + "loss": 1.0047, + "step": 11736 + }, + { + "epoch": 0.3013732485343938, + "grad_norm": 0.84765625, + "learning_rate": 0.00018338413996790278, + "loss": 0.9579, + "step": 11737 + }, + { + "epoch": 0.3013989257303156, + "grad_norm": 0.87890625, + "learning_rate": 0.0001833816756272055, + "loss": 0.9162, + "step": 11738 + }, + { + "epoch": 0.30142460292623746, + "grad_norm": 0.76953125, + "learning_rate": 0.00018337921112033612, + "loss": 0.8702, + "step": 11739 + }, + { + "epoch": 0.30145028012215924, + "grad_norm": 0.84375, + "learning_rate": 0.00018337674644729947, + "loss": 0.8944, + "step": 11740 + }, + { + "epoch": 0.3014759573180811, + "grad_norm": 0.8125, + "learning_rate": 0.0001833742816081005, + "loss": 0.987, + "step": 11741 + }, + { + "epoch": 0.3015016345140029, + "grad_norm": 0.859375, + "learning_rate": 0.00018337181660274413, + "loss": 1.0691, + "step": 11742 + }, + { + "epoch": 0.3015273117099247, + "grad_norm": 0.875, + "learning_rate": 0.00018336935143123523, + "loss": 0.9435, + "step": 11743 + }, + { + "epoch": 0.30155298890584653, + "grad_norm": 0.83203125, + "learning_rate": 0.00018336688609357877, + "loss": 1.0286, + "step": 11744 + }, + { + "epoch": 0.30157866610176837, + "grad_norm": 0.8046875, + "learning_rate": 0.00018336442058977963, + "loss": 1.0659, + "step": 11745 + }, + { + "epoch": 0.3016043432976902, + "grad_norm": 0.73828125, + "learning_rate": 0.00018336195491984274, + "loss": 0.8693, + "step": 11746 + }, + { + "epoch": 0.301630020493612, + "grad_norm": 0.80859375, + "learning_rate": 0.00018335948908377298, + "loss": 0.9025, + "step": 11747 + }, + { + "epoch": 0.3016556976895338, + "grad_norm": 0.76953125, + "learning_rate": 0.00018335702308157532, + "loss": 0.8698, + "step": 11748 + }, + { + "epoch": 0.30168137488545566, + "grad_norm": 0.7890625, + "learning_rate": 0.0001833545569132546, + "loss": 0.9209, + "step": 11749 + }, + { + "epoch": 0.30170705208137744, + "grad_norm": 0.8125, + "learning_rate": 0.0001833520905788158, + "loss": 1.0635, + "step": 11750 + }, + { + "epoch": 0.30173272927729927, + "grad_norm": 0.8046875, + "learning_rate": 0.0001833496240782638, + "loss": 1.1011, + "step": 11751 + }, + { + "epoch": 0.3017584064732211, + "grad_norm": 0.74609375, + "learning_rate": 0.00018334715741160357, + "loss": 1.0648, + "step": 11752 + }, + { + "epoch": 0.3017840836691429, + "grad_norm": 0.7421875, + "learning_rate": 0.00018334469057883996, + "loss": 1.0954, + "step": 11753 + }, + { + "epoch": 0.3018097608650647, + "grad_norm": 0.82421875, + "learning_rate": 0.0001833422235799779, + "loss": 1.0132, + "step": 11754 + }, + { + "epoch": 0.30183543806098656, + "grad_norm": 0.828125, + "learning_rate": 0.00018333975641502233, + "loss": 0.9442, + "step": 11755 + }, + { + "epoch": 0.3018611152569084, + "grad_norm": 0.7421875, + "learning_rate": 0.00018333728908397813, + "loss": 0.8847, + "step": 11756 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.76171875, + "learning_rate": 0.00018333482158685026, + "loss": 0.9673, + "step": 11757 + }, + { + "epoch": 0.301912469648752, + "grad_norm": 0.7578125, + "learning_rate": 0.00018333235392364365, + "loss": 1.0801, + "step": 11758 + }, + { + "epoch": 0.30193814684467385, + "grad_norm": 0.765625, + "learning_rate": 0.00018332988609436312, + "loss": 1.087, + "step": 11759 + }, + { + "epoch": 0.30196382404059563, + "grad_norm": 0.703125, + "learning_rate": 0.0001833274180990137, + "loss": 0.9381, + "step": 11760 + }, + { + "epoch": 0.30198950123651747, + "grad_norm": 0.75390625, + "learning_rate": 0.00018332494993760024, + "loss": 1.071, + "step": 11761 + }, + { + "epoch": 0.3020151784324393, + "grad_norm": 0.86328125, + "learning_rate": 0.0001833224816101277, + "loss": 0.9709, + "step": 11762 + }, + { + "epoch": 0.3020408556283611, + "grad_norm": 0.85546875, + "learning_rate": 0.00018332001311660099, + "loss": 0.9685, + "step": 11763 + }, + { + "epoch": 0.3020665328242829, + "grad_norm": 0.84765625, + "learning_rate": 0.000183317544457025, + "loss": 1.0224, + "step": 11764 + }, + { + "epoch": 0.30209221002020475, + "grad_norm": 0.76953125, + "learning_rate": 0.00018331507563140465, + "loss": 1.0175, + "step": 11765 + }, + { + "epoch": 0.3021178872161266, + "grad_norm": 0.765625, + "learning_rate": 0.00018331260663974493, + "loss": 1.0209, + "step": 11766 + }, + { + "epoch": 0.30214356441204837, + "grad_norm": 0.8046875, + "learning_rate": 0.00018331013748205068, + "loss": 1.129, + "step": 11767 + }, + { + "epoch": 0.3021692416079702, + "grad_norm": 0.7578125, + "learning_rate": 0.00018330766815832683, + "loss": 1.0055, + "step": 11768 + }, + { + "epoch": 0.30219491880389204, + "grad_norm": 0.80859375, + "learning_rate": 0.00018330519866857837, + "loss": 0.9979, + "step": 11769 + }, + { + "epoch": 0.3022205959998138, + "grad_norm": 0.73828125, + "learning_rate": 0.00018330272901281015, + "loss": 0.8759, + "step": 11770 + }, + { + "epoch": 0.30224627319573566, + "grad_norm": 0.88671875, + "learning_rate": 0.0001833002591910271, + "loss": 0.9883, + "step": 11771 + }, + { + "epoch": 0.3022719503916575, + "grad_norm": 0.765625, + "learning_rate": 0.00018329778920323417, + "loss": 0.9944, + "step": 11772 + }, + { + "epoch": 0.3022976275875793, + "grad_norm": 0.73046875, + "learning_rate": 0.0001832953190494363, + "loss": 0.8312, + "step": 11773 + }, + { + "epoch": 0.3023233047835011, + "grad_norm": 0.84375, + "learning_rate": 0.00018329284872963835, + "loss": 0.9555, + "step": 11774 + }, + { + "epoch": 0.30234898197942295, + "grad_norm": 0.859375, + "learning_rate": 0.00018329037824384528, + "loss": 1.1042, + "step": 11775 + }, + { + "epoch": 0.30237465917534473, + "grad_norm": 0.80859375, + "learning_rate": 0.000183287907592062, + "loss": 0.935, + "step": 11776 + }, + { + "epoch": 0.30240033637126656, + "grad_norm": 0.828125, + "learning_rate": 0.00018328543677429345, + "loss": 0.9579, + "step": 11777 + }, + { + "epoch": 0.3024260135671884, + "grad_norm": 0.7578125, + "learning_rate": 0.00018328296579054457, + "loss": 0.9437, + "step": 11778 + }, + { + "epoch": 0.30245169076311024, + "grad_norm": 0.765625, + "learning_rate": 0.00018328049464082024, + "loss": 1.019, + "step": 11779 + }, + { + "epoch": 0.302477367959032, + "grad_norm": 0.75390625, + "learning_rate": 0.0001832780233251254, + "loss": 1.0488, + "step": 11780 + }, + { + "epoch": 0.30250304515495385, + "grad_norm": 0.77734375, + "learning_rate": 0.00018327555184346502, + "loss": 0.9357, + "step": 11781 + }, + { + "epoch": 0.3025287223508757, + "grad_norm": 0.7578125, + "learning_rate": 0.00018327308019584396, + "loss": 0.8887, + "step": 11782 + }, + { + "epoch": 0.30255439954679747, + "grad_norm": 0.734375, + "learning_rate": 0.00018327060838226717, + "loss": 1.0058, + "step": 11783 + }, + { + "epoch": 0.3025800767427193, + "grad_norm": 0.77734375, + "learning_rate": 0.00018326813640273958, + "loss": 0.9176, + "step": 11784 + }, + { + "epoch": 0.30260575393864114, + "grad_norm": 0.83984375, + "learning_rate": 0.00018326566425726614, + "loss": 0.9502, + "step": 11785 + }, + { + "epoch": 0.3026314311345629, + "grad_norm": 0.84765625, + "learning_rate": 0.00018326319194585173, + "loss": 1.1433, + "step": 11786 + }, + { + "epoch": 0.30265710833048476, + "grad_norm": 0.796875, + "learning_rate": 0.0001832607194685013, + "loss": 1.0174, + "step": 11787 + }, + { + "epoch": 0.3026827855264066, + "grad_norm": 0.8125, + "learning_rate": 0.00018325824682521977, + "loss": 1.0501, + "step": 11788 + }, + { + "epoch": 0.30270846272232843, + "grad_norm": 0.71875, + "learning_rate": 0.00018325577401601207, + "loss": 0.8993, + "step": 11789 + }, + { + "epoch": 0.3027341399182502, + "grad_norm": 0.984375, + "learning_rate": 0.0001832533010408832, + "loss": 1.0414, + "step": 11790 + }, + { + "epoch": 0.30275981711417205, + "grad_norm": 0.78125, + "learning_rate": 0.00018325082789983794, + "loss": 1.0758, + "step": 11791 + }, + { + "epoch": 0.3027854943100939, + "grad_norm": 0.73828125, + "learning_rate": 0.00018324835459288132, + "loss": 1.0742, + "step": 11792 + }, + { + "epoch": 0.30281117150601566, + "grad_norm": 0.8125, + "learning_rate": 0.00018324588112001827, + "loss": 1.1323, + "step": 11793 + }, + { + "epoch": 0.3028368487019375, + "grad_norm": 0.75, + "learning_rate": 0.0001832434074812537, + "loss": 0.9484, + "step": 11794 + }, + { + "epoch": 0.30286252589785934, + "grad_norm": 0.828125, + "learning_rate": 0.00018324093367659252, + "loss": 1.059, + "step": 11795 + }, + { + "epoch": 0.3028882030937811, + "grad_norm": 0.8046875, + "learning_rate": 0.00018323845970603968, + "loss": 1.0031, + "step": 11796 + }, + { + "epoch": 0.30291388028970295, + "grad_norm": 0.76171875, + "learning_rate": 0.00018323598556960012, + "loss": 0.9778, + "step": 11797 + }, + { + "epoch": 0.3029395574856248, + "grad_norm": 0.8359375, + "learning_rate": 0.00018323351126727872, + "loss": 0.9474, + "step": 11798 + }, + { + "epoch": 0.3029652346815466, + "grad_norm": 0.7890625, + "learning_rate": 0.0001832310367990805, + "loss": 0.9651, + "step": 11799 + }, + { + "epoch": 0.3029909118774684, + "grad_norm": 0.796875, + "learning_rate": 0.00018322856216501033, + "loss": 0.9506, + "step": 11800 + }, + { + "epoch": 0.30301658907339024, + "grad_norm": 0.7265625, + "learning_rate": 0.00018322608736507314, + "loss": 1.0126, + "step": 11801 + }, + { + "epoch": 0.3030422662693121, + "grad_norm": 0.75, + "learning_rate": 0.0001832236123992739, + "loss": 0.8821, + "step": 11802 + }, + { + "epoch": 0.30306794346523386, + "grad_norm": 0.7734375, + "learning_rate": 0.0001832211372676175, + "loss": 0.9532, + "step": 11803 + }, + { + "epoch": 0.3030936206611557, + "grad_norm": 0.74609375, + "learning_rate": 0.00018321866197010888, + "loss": 0.9079, + "step": 11804 + }, + { + "epoch": 0.30311929785707753, + "grad_norm": 0.84765625, + "learning_rate": 0.00018321618650675298, + "loss": 0.9228, + "step": 11805 + }, + { + "epoch": 0.3031449750529993, + "grad_norm": 0.8359375, + "learning_rate": 0.00018321371087755476, + "loss": 0.9987, + "step": 11806 + }, + { + "epoch": 0.30317065224892115, + "grad_norm": 0.7890625, + "learning_rate": 0.00018321123508251912, + "loss": 0.8689, + "step": 11807 + }, + { + "epoch": 0.303196329444843, + "grad_norm": 0.71875, + "learning_rate": 0.000183208759121651, + "loss": 0.9147, + "step": 11808 + }, + { + "epoch": 0.3032220066407648, + "grad_norm": 0.7734375, + "learning_rate": 0.00018320628299495537, + "loss": 0.9295, + "step": 11809 + }, + { + "epoch": 0.3032476838366866, + "grad_norm": 0.73828125, + "learning_rate": 0.0001832038067024371, + "loss": 0.9125, + "step": 11810 + }, + { + "epoch": 0.30327336103260844, + "grad_norm": 0.8203125, + "learning_rate": 0.00018320133024410119, + "loss": 0.9995, + "step": 11811 + }, + { + "epoch": 0.30329903822853027, + "grad_norm": 0.76953125, + "learning_rate": 0.00018319885361995253, + "loss": 0.8452, + "step": 11812 + }, + { + "epoch": 0.30332471542445205, + "grad_norm": 0.75, + "learning_rate": 0.00018319637682999605, + "loss": 1.0538, + "step": 11813 + }, + { + "epoch": 0.3033503926203739, + "grad_norm": 0.74609375, + "learning_rate": 0.0001831938998742367, + "loss": 0.9732, + "step": 11814 + }, + { + "epoch": 0.3033760698162957, + "grad_norm": 0.78515625, + "learning_rate": 0.00018319142275267946, + "loss": 0.9114, + "step": 11815 + }, + { + "epoch": 0.3034017470122175, + "grad_norm": 0.828125, + "learning_rate": 0.00018318894546532922, + "loss": 0.9628, + "step": 11816 + }, + { + "epoch": 0.30342742420813934, + "grad_norm": 0.85546875, + "learning_rate": 0.0001831864680121909, + "loss": 0.9615, + "step": 11817 + }, + { + "epoch": 0.3034531014040612, + "grad_norm": 0.77734375, + "learning_rate": 0.0001831839903932695, + "loss": 1.0023, + "step": 11818 + }, + { + "epoch": 0.303478778599983, + "grad_norm": 0.7578125, + "learning_rate": 0.0001831815126085699, + "loss": 0.8964, + "step": 11819 + }, + { + "epoch": 0.3035044557959048, + "grad_norm": 0.796875, + "learning_rate": 0.00018317903465809706, + "loss": 1.009, + "step": 11820 + }, + { + "epoch": 0.30353013299182663, + "grad_norm": 0.8203125, + "learning_rate": 0.00018317655654185593, + "loss": 0.9515, + "step": 11821 + }, + { + "epoch": 0.30355581018774846, + "grad_norm": 0.796875, + "learning_rate": 0.00018317407825985143, + "loss": 1.1339, + "step": 11822 + }, + { + "epoch": 0.30358148738367025, + "grad_norm": 0.90234375, + "learning_rate": 0.00018317159981208852, + "loss": 0.9798, + "step": 11823 + }, + { + "epoch": 0.3036071645795921, + "grad_norm": 0.76953125, + "learning_rate": 0.00018316912119857207, + "loss": 1.1664, + "step": 11824 + }, + { + "epoch": 0.3036328417755139, + "grad_norm": 0.73046875, + "learning_rate": 0.0001831666424193071, + "loss": 0.9489, + "step": 11825 + }, + { + "epoch": 0.3036585189714357, + "grad_norm": 0.828125, + "learning_rate": 0.00018316416347429856, + "loss": 0.9669, + "step": 11826 + }, + { + "epoch": 0.30368419616735753, + "grad_norm": 0.76171875, + "learning_rate": 0.00018316168436355133, + "loss": 0.9745, + "step": 11827 + }, + { + "epoch": 0.30370987336327937, + "grad_norm": 0.7734375, + "learning_rate": 0.00018315920508707036, + "loss": 0.9273, + "step": 11828 + }, + { + "epoch": 0.3037355505592012, + "grad_norm": 0.8046875, + "learning_rate": 0.00018315672564486063, + "loss": 1.1092, + "step": 11829 + }, + { + "epoch": 0.303761227755123, + "grad_norm": 0.78125, + "learning_rate": 0.00018315424603692707, + "loss": 0.9216, + "step": 11830 + }, + { + "epoch": 0.3037869049510448, + "grad_norm": 0.7578125, + "learning_rate": 0.00018315176626327456, + "loss": 0.8681, + "step": 11831 + }, + { + "epoch": 0.30381258214696666, + "grad_norm": 0.8203125, + "learning_rate": 0.0001831492863239081, + "loss": 1.0044, + "step": 11832 + }, + { + "epoch": 0.30383825934288844, + "grad_norm": 0.75, + "learning_rate": 0.00018314680621883266, + "loss": 0.9999, + "step": 11833 + }, + { + "epoch": 0.3038639365388103, + "grad_norm": 0.796875, + "learning_rate": 0.0001831443259480531, + "loss": 0.9118, + "step": 11834 + }, + { + "epoch": 0.3038896137347321, + "grad_norm": 0.83984375, + "learning_rate": 0.00018314184551157444, + "loss": 0.9031, + "step": 11835 + }, + { + "epoch": 0.3039152909306539, + "grad_norm": 0.86328125, + "learning_rate": 0.0001831393649094016, + "loss": 0.9178, + "step": 11836 + }, + { + "epoch": 0.30394096812657573, + "grad_norm": 0.7890625, + "learning_rate": 0.0001831368841415395, + "loss": 0.9772, + "step": 11837 + }, + { + "epoch": 0.30396664532249756, + "grad_norm": 0.8046875, + "learning_rate": 0.00018313440320799312, + "loss": 1.1098, + "step": 11838 + }, + { + "epoch": 0.3039923225184194, + "grad_norm": 0.7890625, + "learning_rate": 0.00018313192210876736, + "loss": 0.9954, + "step": 11839 + }, + { + "epoch": 0.3040179997143412, + "grad_norm": 0.76171875, + "learning_rate": 0.0001831294408438672, + "loss": 1.0525, + "step": 11840 + }, + { + "epoch": 0.304043676910263, + "grad_norm": 0.76953125, + "learning_rate": 0.00018312695941329757, + "loss": 1.0289, + "step": 11841 + }, + { + "epoch": 0.30406935410618485, + "grad_norm": 0.765625, + "learning_rate": 0.0001831244778170634, + "loss": 0.9674, + "step": 11842 + }, + { + "epoch": 0.30409503130210663, + "grad_norm": 0.81640625, + "learning_rate": 0.0001831219960551697, + "loss": 0.9238, + "step": 11843 + }, + { + "epoch": 0.30412070849802847, + "grad_norm": 0.72265625, + "learning_rate": 0.00018311951412762131, + "loss": 0.9004, + "step": 11844 + }, + { + "epoch": 0.3041463856939503, + "grad_norm": 0.8046875, + "learning_rate": 0.00018311703203442327, + "loss": 1.0331, + "step": 11845 + }, + { + "epoch": 0.3041720628898721, + "grad_norm": 0.95703125, + "learning_rate": 0.00018311454977558048, + "loss": 1.0115, + "step": 11846 + }, + { + "epoch": 0.3041977400857939, + "grad_norm": 0.7734375, + "learning_rate": 0.0001831120673510979, + "loss": 0.9437, + "step": 11847 + }, + { + "epoch": 0.30422341728171576, + "grad_norm": 0.73828125, + "learning_rate": 0.0001831095847609805, + "loss": 0.9034, + "step": 11848 + }, + { + "epoch": 0.3042490944776376, + "grad_norm": 0.76171875, + "learning_rate": 0.0001831071020052332, + "loss": 0.9672, + "step": 11849 + }, + { + "epoch": 0.3042747716735594, + "grad_norm": 0.83203125, + "learning_rate": 0.00018310461908386096, + "loss": 0.9537, + "step": 11850 + }, + { + "epoch": 0.3043004488694812, + "grad_norm": 0.80859375, + "learning_rate": 0.0001831021359968687, + "loss": 1.0515, + "step": 11851 + }, + { + "epoch": 0.30432612606540305, + "grad_norm": 0.77734375, + "learning_rate": 0.0001830996527442614, + "loss": 1.063, + "step": 11852 + }, + { + "epoch": 0.3043518032613248, + "grad_norm": 0.7578125, + "learning_rate": 0.00018309716932604398, + "loss": 0.8983, + "step": 11853 + }, + { + "epoch": 0.30437748045724666, + "grad_norm": 0.83203125, + "learning_rate": 0.0001830946857422214, + "loss": 1.0277, + "step": 11854 + }, + { + "epoch": 0.3044031576531685, + "grad_norm": 0.8203125, + "learning_rate": 0.00018309220199279867, + "loss": 0.9037, + "step": 11855 + }, + { + "epoch": 0.3044288348490903, + "grad_norm": 0.75, + "learning_rate": 0.00018308971807778064, + "loss": 0.9334, + "step": 11856 + }, + { + "epoch": 0.3044545120450121, + "grad_norm": 0.83984375, + "learning_rate": 0.00018308723399717232, + "loss": 0.9786, + "step": 11857 + }, + { + "epoch": 0.30448018924093395, + "grad_norm": 0.89453125, + "learning_rate": 0.00018308474975097867, + "loss": 0.9071, + "step": 11858 + }, + { + "epoch": 0.3045058664368558, + "grad_norm": 0.88671875, + "learning_rate": 0.0001830822653392046, + "loss": 0.9907, + "step": 11859 + }, + { + "epoch": 0.30453154363277757, + "grad_norm": 0.75390625, + "learning_rate": 0.00018307978076185507, + "loss": 0.9452, + "step": 11860 + }, + { + "epoch": 0.3045572208286994, + "grad_norm": 0.828125, + "learning_rate": 0.00018307729601893503, + "loss": 1.0427, + "step": 11861 + }, + { + "epoch": 0.30458289802462124, + "grad_norm": 0.77734375, + "learning_rate": 0.00018307481111044948, + "loss": 0.9363, + "step": 11862 + }, + { + "epoch": 0.304608575220543, + "grad_norm": 1.1015625, + "learning_rate": 0.00018307232603640333, + "loss": 0.8854, + "step": 11863 + }, + { + "epoch": 0.30463425241646486, + "grad_norm": 0.94921875, + "learning_rate": 0.00018306984079680155, + "loss": 1.0367, + "step": 11864 + }, + { + "epoch": 0.3046599296123867, + "grad_norm": 0.74609375, + "learning_rate": 0.00018306735539164905, + "loss": 0.8718, + "step": 11865 + }, + { + "epoch": 0.3046856068083085, + "grad_norm": 0.765625, + "learning_rate": 0.00018306486982095082, + "loss": 1.0187, + "step": 11866 + }, + { + "epoch": 0.3047112840042303, + "grad_norm": 0.7734375, + "learning_rate": 0.0001830623840847118, + "loss": 0.8255, + "step": 11867 + }, + { + "epoch": 0.30473696120015215, + "grad_norm": 0.8359375, + "learning_rate": 0.00018305989818293697, + "loss": 0.9673, + "step": 11868 + }, + { + "epoch": 0.304762638396074, + "grad_norm": 0.76171875, + "learning_rate": 0.00018305741211563126, + "loss": 0.8804, + "step": 11869 + }, + { + "epoch": 0.30478831559199576, + "grad_norm": 0.8359375, + "learning_rate": 0.00018305492588279964, + "loss": 0.8439, + "step": 11870 + }, + { + "epoch": 0.3048139927879176, + "grad_norm": 0.7734375, + "learning_rate": 0.00018305243948444703, + "loss": 0.9691, + "step": 11871 + }, + { + "epoch": 0.30483966998383943, + "grad_norm": 0.765625, + "learning_rate": 0.00018304995292057843, + "loss": 0.9341, + "step": 11872 + }, + { + "epoch": 0.3048653471797612, + "grad_norm": 0.85546875, + "learning_rate": 0.00018304746619119877, + "loss": 0.9926, + "step": 11873 + }, + { + "epoch": 0.30489102437568305, + "grad_norm": 0.765625, + "learning_rate": 0.000183044979296313, + "loss": 0.9843, + "step": 11874 + }, + { + "epoch": 0.3049167015716049, + "grad_norm": 0.75, + "learning_rate": 0.0001830424922359261, + "loss": 1.0254, + "step": 11875 + }, + { + "epoch": 0.30494237876752667, + "grad_norm": 0.84375, + "learning_rate": 0.000183040005010043, + "loss": 0.8959, + "step": 11876 + }, + { + "epoch": 0.3049680559634485, + "grad_norm": 0.80078125, + "learning_rate": 0.0001830375176186687, + "loss": 0.9898, + "step": 11877 + }, + { + "epoch": 0.30499373315937034, + "grad_norm": 0.86328125, + "learning_rate": 0.00018303503006180814, + "loss": 1.022, + "step": 11878 + }, + { + "epoch": 0.3050194103552922, + "grad_norm": 0.8203125, + "learning_rate": 0.00018303254233946622, + "loss": 0.9584, + "step": 11879 + }, + { + "epoch": 0.30504508755121396, + "grad_norm": 0.8359375, + "learning_rate": 0.00018303005445164796, + "loss": 1.053, + "step": 11880 + }, + { + "epoch": 0.3050707647471358, + "grad_norm": 0.765625, + "learning_rate": 0.0001830275663983583, + "loss": 0.8807, + "step": 11881 + }, + { + "epoch": 0.30509644194305763, + "grad_norm": 0.84375, + "learning_rate": 0.0001830250781796022, + "loss": 1.0439, + "step": 11882 + }, + { + "epoch": 0.3051221191389794, + "grad_norm": 0.79296875, + "learning_rate": 0.00018302258979538462, + "loss": 0.8969, + "step": 11883 + }, + { + "epoch": 0.30514779633490124, + "grad_norm": 0.80078125, + "learning_rate": 0.0001830201012457105, + "loss": 1.1185, + "step": 11884 + }, + { + "epoch": 0.3051734735308231, + "grad_norm": 0.73828125, + "learning_rate": 0.00018301761253058487, + "loss": 0.9826, + "step": 11885 + }, + { + "epoch": 0.30519915072674486, + "grad_norm": 0.78125, + "learning_rate": 0.0001830151236500126, + "loss": 0.8991, + "step": 11886 + }, + { + "epoch": 0.3052248279226667, + "grad_norm": 0.7734375, + "learning_rate": 0.0001830126346039987, + "loss": 0.817, + "step": 11887 + }, + { + "epoch": 0.30525050511858853, + "grad_norm": 0.75390625, + "learning_rate": 0.00018301014539254807, + "loss": 0.9001, + "step": 11888 + }, + { + "epoch": 0.30527618231451037, + "grad_norm": 0.796875, + "learning_rate": 0.00018300765601566579, + "loss": 0.9878, + "step": 11889 + }, + { + "epoch": 0.30530185951043215, + "grad_norm": 0.75, + "learning_rate": 0.0001830051664733567, + "loss": 1.0264, + "step": 11890 + }, + { + "epoch": 0.305327536706354, + "grad_norm": 0.83203125, + "learning_rate": 0.00018300267676562583, + "loss": 1.0467, + "step": 11891 + }, + { + "epoch": 0.3053532139022758, + "grad_norm": 0.79296875, + "learning_rate": 0.00018300018689247814, + "loss": 0.9943, + "step": 11892 + }, + { + "epoch": 0.3053788910981976, + "grad_norm": 0.78125, + "learning_rate": 0.00018299769685391855, + "loss": 0.9978, + "step": 11893 + }, + { + "epoch": 0.30540456829411944, + "grad_norm": 1.0859375, + "learning_rate": 0.00018299520664995207, + "loss": 0.863, + "step": 11894 + }, + { + "epoch": 0.3054302454900413, + "grad_norm": 0.8203125, + "learning_rate": 0.0001829927162805836, + "loss": 1.081, + "step": 11895 + }, + { + "epoch": 0.30545592268596305, + "grad_norm": 0.74609375, + "learning_rate": 0.00018299022574581818, + "loss": 0.9056, + "step": 11896 + }, + { + "epoch": 0.3054815998818849, + "grad_norm": 0.75390625, + "learning_rate": 0.00018298773504566072, + "loss": 1.0824, + "step": 11897 + }, + { + "epoch": 0.3055072770778067, + "grad_norm": 0.87890625, + "learning_rate": 0.0001829852441801162, + "loss": 0.959, + "step": 11898 + }, + { + "epoch": 0.30553295427372856, + "grad_norm": 1.0390625, + "learning_rate": 0.00018298275314918957, + "loss": 0.867, + "step": 11899 + }, + { + "epoch": 0.30555863146965034, + "grad_norm": 0.765625, + "learning_rate": 0.00018298026195288586, + "loss": 1.1356, + "step": 11900 + }, + { + "epoch": 0.3055843086655722, + "grad_norm": 0.796875, + "learning_rate": 0.00018297777059120992, + "loss": 0.8791, + "step": 11901 + }, + { + "epoch": 0.305609985861494, + "grad_norm": 0.77734375, + "learning_rate": 0.00018297527906416684, + "loss": 1.0789, + "step": 11902 + }, + { + "epoch": 0.3056356630574158, + "grad_norm": 0.76171875, + "learning_rate": 0.00018297278737176148, + "loss": 0.9837, + "step": 11903 + }, + { + "epoch": 0.30566134025333763, + "grad_norm": 0.78125, + "learning_rate": 0.00018297029551399887, + "loss": 0.9305, + "step": 11904 + }, + { + "epoch": 0.30568701744925947, + "grad_norm": 0.7265625, + "learning_rate": 0.00018296780349088393, + "loss": 0.9391, + "step": 11905 + }, + { + "epoch": 0.30571269464518125, + "grad_norm": 0.76171875, + "learning_rate": 0.00018296531130242166, + "loss": 0.9484, + "step": 11906 + }, + { + "epoch": 0.3057383718411031, + "grad_norm": 0.74609375, + "learning_rate": 0.00018296281894861704, + "loss": 0.8983, + "step": 11907 + }, + { + "epoch": 0.3057640490370249, + "grad_norm": 0.75, + "learning_rate": 0.00018296032642947503, + "loss": 0.9185, + "step": 11908 + }, + { + "epoch": 0.30578972623294676, + "grad_norm": 0.72265625, + "learning_rate": 0.00018295783374500056, + "loss": 0.9314, + "step": 11909 + }, + { + "epoch": 0.30581540342886854, + "grad_norm": 0.84375, + "learning_rate": 0.0001829553408951986, + "loss": 0.9961, + "step": 11910 + }, + { + "epoch": 0.3058410806247904, + "grad_norm": 0.80859375, + "learning_rate": 0.00018295284788007415, + "loss": 1.0552, + "step": 11911 + }, + { + "epoch": 0.3058667578207122, + "grad_norm": 0.73046875, + "learning_rate": 0.0001829503546996322, + "loss": 0.7603, + "step": 11912 + }, + { + "epoch": 0.305892435016634, + "grad_norm": 0.7421875, + "learning_rate": 0.00018294786135387764, + "loss": 0.9745, + "step": 11913 + }, + { + "epoch": 0.3059181122125558, + "grad_norm": 0.7734375, + "learning_rate": 0.0001829453678428155, + "loss": 0.9499, + "step": 11914 + }, + { + "epoch": 0.30594378940847766, + "grad_norm": 0.76171875, + "learning_rate": 0.00018294287416645074, + "loss": 0.9635, + "step": 11915 + }, + { + "epoch": 0.30596946660439944, + "grad_norm": 0.80078125, + "learning_rate": 0.00018294038032478833, + "loss": 0.9752, + "step": 11916 + }, + { + "epoch": 0.3059951438003213, + "grad_norm": 0.7578125, + "learning_rate": 0.00018293788631783321, + "loss": 0.9849, + "step": 11917 + }, + { + "epoch": 0.3060208209962431, + "grad_norm": 0.76953125, + "learning_rate": 0.00018293539214559042, + "loss": 0.8835, + "step": 11918 + }, + { + "epoch": 0.30604649819216495, + "grad_norm": 0.86328125, + "learning_rate": 0.00018293289780806483, + "loss": 1.1128, + "step": 11919 + }, + { + "epoch": 0.30607217538808673, + "grad_norm": 0.796875, + "learning_rate": 0.0001829304033052615, + "loss": 0.867, + "step": 11920 + }, + { + "epoch": 0.30609785258400857, + "grad_norm": 0.78125, + "learning_rate": 0.0001829279086371854, + "loss": 0.9767, + "step": 11921 + }, + { + "epoch": 0.3061235297799304, + "grad_norm": 0.78125, + "learning_rate": 0.0001829254138038414, + "loss": 0.921, + "step": 11922 + }, + { + "epoch": 0.3061492069758522, + "grad_norm": 0.82421875, + "learning_rate": 0.00018292291880523458, + "loss": 0.9879, + "step": 11923 + }, + { + "epoch": 0.306174884171774, + "grad_norm": 0.78515625, + "learning_rate": 0.00018292042364136989, + "loss": 0.9926, + "step": 11924 + }, + { + "epoch": 0.30620056136769586, + "grad_norm": 0.75390625, + "learning_rate": 0.00018291792831225223, + "loss": 0.9165, + "step": 11925 + }, + { + "epoch": 0.30622623856361764, + "grad_norm": 0.7421875, + "learning_rate": 0.00018291543281788667, + "loss": 0.9701, + "step": 11926 + }, + { + "epoch": 0.3062519157595395, + "grad_norm": 0.796875, + "learning_rate": 0.00018291293715827812, + "loss": 1.1197, + "step": 11927 + }, + { + "epoch": 0.3062775929554613, + "grad_norm": 0.78125, + "learning_rate": 0.00018291044133343157, + "loss": 1.0089, + "step": 11928 + }, + { + "epoch": 0.30630327015138314, + "grad_norm": 0.765625, + "learning_rate": 0.00018290794534335202, + "loss": 1.0225, + "step": 11929 + }, + { + "epoch": 0.3063289473473049, + "grad_norm": 0.78125, + "learning_rate": 0.00018290544918804442, + "loss": 1.068, + "step": 11930 + }, + { + "epoch": 0.30635462454322676, + "grad_norm": 0.85546875, + "learning_rate": 0.00018290295286751377, + "loss": 0.9832, + "step": 11931 + }, + { + "epoch": 0.3063803017391486, + "grad_norm": 0.828125, + "learning_rate": 0.00018290045638176498, + "loss": 0.999, + "step": 11932 + }, + { + "epoch": 0.3064059789350704, + "grad_norm": 0.7734375, + "learning_rate": 0.0001828979597308031, + "loss": 0.9097, + "step": 11933 + }, + { + "epoch": 0.3064316561309922, + "grad_norm": 0.88671875, + "learning_rate": 0.00018289546291463306, + "loss": 1.0594, + "step": 11934 + }, + { + "epoch": 0.30645733332691405, + "grad_norm": 0.83203125, + "learning_rate": 0.00018289296593325987, + "loss": 1.1632, + "step": 11935 + }, + { + "epoch": 0.30648301052283583, + "grad_norm": 0.828125, + "learning_rate": 0.00018289046878668846, + "loss": 0.9678, + "step": 11936 + }, + { + "epoch": 0.30650868771875767, + "grad_norm": 0.7265625, + "learning_rate": 0.00018288797147492387, + "loss": 0.9348, + "step": 11937 + }, + { + "epoch": 0.3065343649146795, + "grad_norm": 0.765625, + "learning_rate": 0.000182885473997971, + "loss": 0.9683, + "step": 11938 + }, + { + "epoch": 0.30656004211060134, + "grad_norm": 0.76171875, + "learning_rate": 0.0001828829763558349, + "loss": 1.016, + "step": 11939 + }, + { + "epoch": 0.3065857193065231, + "grad_norm": 0.7890625, + "learning_rate": 0.00018288047854852048, + "loss": 0.9951, + "step": 11940 + }, + { + "epoch": 0.30661139650244496, + "grad_norm": 1.359375, + "learning_rate": 0.00018287798057603277, + "loss": 0.9221, + "step": 11941 + }, + { + "epoch": 0.3066370736983668, + "grad_norm": 0.73828125, + "learning_rate": 0.00018287548243837672, + "loss": 0.8753, + "step": 11942 + }, + { + "epoch": 0.30666275089428857, + "grad_norm": 0.8125, + "learning_rate": 0.00018287298413555733, + "loss": 1.0729, + "step": 11943 + }, + { + "epoch": 0.3066884280902104, + "grad_norm": 0.7421875, + "learning_rate": 0.00018287048566757957, + "loss": 0.8867, + "step": 11944 + }, + { + "epoch": 0.30671410528613224, + "grad_norm": 0.7578125, + "learning_rate": 0.00018286798703444842, + "loss": 1.1212, + "step": 11945 + }, + { + "epoch": 0.306739782482054, + "grad_norm": 0.8203125, + "learning_rate": 0.00018286548823616885, + "loss": 0.9798, + "step": 11946 + }, + { + "epoch": 0.30676545967797586, + "grad_norm": 0.7890625, + "learning_rate": 0.00018286298927274586, + "loss": 1.0061, + "step": 11947 + }, + { + "epoch": 0.3067911368738977, + "grad_norm": 0.8203125, + "learning_rate": 0.00018286049014418443, + "loss": 1.1633, + "step": 11948 + }, + { + "epoch": 0.30681681406981953, + "grad_norm": 0.7890625, + "learning_rate": 0.0001828579908504895, + "loss": 0.939, + "step": 11949 + }, + { + "epoch": 0.3068424912657413, + "grad_norm": 0.828125, + "learning_rate": 0.00018285549139166607, + "loss": 1.0161, + "step": 11950 + }, + { + "epoch": 0.30686816846166315, + "grad_norm": 0.8359375, + "learning_rate": 0.00018285299176771917, + "loss": 0.983, + "step": 11951 + }, + { + "epoch": 0.306893845657585, + "grad_norm": 0.828125, + "learning_rate": 0.0001828504919786537, + "loss": 0.9429, + "step": 11952 + }, + { + "epoch": 0.30691952285350677, + "grad_norm": 0.7890625, + "learning_rate": 0.0001828479920244747, + "loss": 0.9597, + "step": 11953 + }, + { + "epoch": 0.3069452000494286, + "grad_norm": 0.74609375, + "learning_rate": 0.00018284549190518713, + "loss": 0.8462, + "step": 11954 + }, + { + "epoch": 0.30697087724535044, + "grad_norm": 0.72265625, + "learning_rate": 0.000182842991620796, + "loss": 0.8292, + "step": 11955 + }, + { + "epoch": 0.3069965544412722, + "grad_norm": 0.71875, + "learning_rate": 0.00018284049117130624, + "loss": 0.8153, + "step": 11956 + }, + { + "epoch": 0.30702223163719405, + "grad_norm": 0.796875, + "learning_rate": 0.0001828379905567229, + "loss": 1.0011, + "step": 11957 + }, + { + "epoch": 0.3070479088331159, + "grad_norm": 0.79296875, + "learning_rate": 0.0001828354897770509, + "loss": 1.0402, + "step": 11958 + }, + { + "epoch": 0.3070735860290377, + "grad_norm": 0.8828125, + "learning_rate": 0.00018283298883229528, + "loss": 1.0908, + "step": 11959 + }, + { + "epoch": 0.3070992632249595, + "grad_norm": 0.8359375, + "learning_rate": 0.00018283048772246095, + "loss": 0.9435, + "step": 11960 + }, + { + "epoch": 0.30712494042088134, + "grad_norm": 0.765625, + "learning_rate": 0.00018282798644755297, + "loss": 0.9105, + "step": 11961 + }, + { + "epoch": 0.3071506176168032, + "grad_norm": 0.83203125, + "learning_rate": 0.00018282548500757627, + "loss": 0.8857, + "step": 11962 + }, + { + "epoch": 0.30717629481272496, + "grad_norm": 0.7734375, + "learning_rate": 0.0001828229834025359, + "loss": 0.9275, + "step": 11963 + }, + { + "epoch": 0.3072019720086468, + "grad_norm": 0.80078125, + "learning_rate": 0.00018282048163243677, + "loss": 1.0631, + "step": 11964 + }, + { + "epoch": 0.30722764920456863, + "grad_norm": 0.78125, + "learning_rate": 0.0001828179796972839, + "loss": 1.0926, + "step": 11965 + }, + { + "epoch": 0.3072533264004904, + "grad_norm": 0.7265625, + "learning_rate": 0.00018281547759708232, + "loss": 0.8159, + "step": 11966 + }, + { + "epoch": 0.30727900359641225, + "grad_norm": 0.80078125, + "learning_rate": 0.0001828129753318369, + "loss": 0.9184, + "step": 11967 + }, + { + "epoch": 0.3073046807923341, + "grad_norm": 0.73828125, + "learning_rate": 0.0001828104729015528, + "loss": 0.9448, + "step": 11968 + }, + { + "epoch": 0.3073303579882559, + "grad_norm": 0.7890625, + "learning_rate": 0.00018280797030623483, + "loss": 0.972, + "step": 11969 + }, + { + "epoch": 0.3073560351841777, + "grad_norm": 0.81640625, + "learning_rate": 0.00018280546754588807, + "loss": 1.0824, + "step": 11970 + }, + { + "epoch": 0.30738171238009954, + "grad_norm": 0.80078125, + "learning_rate": 0.00018280296462051753, + "loss": 0.9128, + "step": 11971 + }, + { + "epoch": 0.3074073895760214, + "grad_norm": 0.71875, + "learning_rate": 0.0001828004615301281, + "loss": 1.0309, + "step": 11972 + }, + { + "epoch": 0.30743306677194315, + "grad_norm": 0.8671875, + "learning_rate": 0.00018279795827472488, + "loss": 0.9461, + "step": 11973 + }, + { + "epoch": 0.307458743967865, + "grad_norm": 0.83984375, + "learning_rate": 0.0001827954548543128, + "loss": 1.0759, + "step": 11974 + }, + { + "epoch": 0.3074844211637868, + "grad_norm": 0.79296875, + "learning_rate": 0.00018279295126889683, + "loss": 0.8218, + "step": 11975 + }, + { + "epoch": 0.3075100983597086, + "grad_norm": 0.796875, + "learning_rate": 0.00018279044751848202, + "loss": 0.9844, + "step": 11976 + }, + { + "epoch": 0.30753577555563044, + "grad_norm": 0.77734375, + "learning_rate": 0.00018278794360307332, + "loss": 0.923, + "step": 11977 + }, + { + "epoch": 0.3075614527515523, + "grad_norm": 0.85546875, + "learning_rate": 0.0001827854395226757, + "loss": 1.2269, + "step": 11978 + }, + { + "epoch": 0.3075871299474741, + "grad_norm": 0.91796875, + "learning_rate": 0.0001827829352772942, + "loss": 1.0129, + "step": 11979 + }, + { + "epoch": 0.3076128071433959, + "grad_norm": 0.75, + "learning_rate": 0.00018278043086693383, + "loss": 0.9577, + "step": 11980 + }, + { + "epoch": 0.30763848433931773, + "grad_norm": 0.86328125, + "learning_rate": 0.00018277792629159946, + "loss": 1.089, + "step": 11981 + }, + { + "epoch": 0.30766416153523957, + "grad_norm": 0.7734375, + "learning_rate": 0.0001827754215512962, + "loss": 0.992, + "step": 11982 + }, + { + "epoch": 0.30768983873116135, + "grad_norm": 0.78125, + "learning_rate": 0.00018277291664602898, + "loss": 0.9475, + "step": 11983 + }, + { + "epoch": 0.3077155159270832, + "grad_norm": 0.86328125, + "learning_rate": 0.00018277041157580283, + "loss": 0.8901, + "step": 11984 + }, + { + "epoch": 0.307741193123005, + "grad_norm": 0.85546875, + "learning_rate": 0.00018276790634062272, + "loss": 1.0946, + "step": 11985 + }, + { + "epoch": 0.3077668703189268, + "grad_norm": 0.71875, + "learning_rate": 0.00018276540094049366, + "loss": 0.9024, + "step": 11986 + }, + { + "epoch": 0.30779254751484864, + "grad_norm": 0.796875, + "learning_rate": 0.0001827628953754206, + "loss": 0.7888, + "step": 11987 + }, + { + "epoch": 0.30781822471077047, + "grad_norm": 0.8125, + "learning_rate": 0.00018276038964540862, + "loss": 1.0837, + "step": 11988 + }, + { + "epoch": 0.30784390190669225, + "grad_norm": 0.83203125, + "learning_rate": 0.00018275788375046262, + "loss": 0.9044, + "step": 11989 + }, + { + "epoch": 0.3078695791026141, + "grad_norm": 0.84375, + "learning_rate": 0.00018275537769058765, + "loss": 0.9651, + "step": 11990 + }, + { + "epoch": 0.3078952562985359, + "grad_norm": 0.97265625, + "learning_rate": 0.00018275287146578867, + "loss": 1.1345, + "step": 11991 + }, + { + "epoch": 0.30792093349445776, + "grad_norm": 0.91015625, + "learning_rate": 0.00018275036507607072, + "loss": 1.0988, + "step": 11992 + }, + { + "epoch": 0.30794661069037954, + "grad_norm": 0.76953125, + "learning_rate": 0.00018274785852143873, + "loss": 1.0357, + "step": 11993 + }, + { + "epoch": 0.3079722878863014, + "grad_norm": 0.7734375, + "learning_rate": 0.00018274535180189778, + "loss": 0.984, + "step": 11994 + }, + { + "epoch": 0.3079979650822232, + "grad_norm": 0.78515625, + "learning_rate": 0.0001827428449174528, + "loss": 0.971, + "step": 11995 + }, + { + "epoch": 0.308023642278145, + "grad_norm": 0.71875, + "learning_rate": 0.0001827403378681088, + "loss": 0.894, + "step": 11996 + }, + { + "epoch": 0.30804931947406683, + "grad_norm": 0.84765625, + "learning_rate": 0.00018273783065387078, + "loss": 1.0379, + "step": 11997 + }, + { + "epoch": 0.30807499666998867, + "grad_norm": 0.76171875, + "learning_rate": 0.0001827353232747437, + "loss": 0.9402, + "step": 11998 + }, + { + "epoch": 0.30810067386591045, + "grad_norm": 0.734375, + "learning_rate": 0.00018273281573073265, + "loss": 0.8644, + "step": 11999 + }, + { + "epoch": 0.3081263510618323, + "grad_norm": 0.79296875, + "learning_rate": 0.00018273030802184255, + "loss": 1.0765, + "step": 12000 + }, + { + "epoch": 0.3081263510618323, + "eval_loss": 0.965718150138855, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 406.4014, + "eval_samples_per_second": 24.606, + "eval_steps_per_second": 0.77, + "step": 12000 + }, + { + "epoch": 0.3081520282577541, + "grad_norm": 0.78125, + "learning_rate": 0.00018272780014807842, + "loss": 0.9097, + "step": 12001 + }, + { + "epoch": 0.30817770545367595, + "grad_norm": 0.8125, + "learning_rate": 0.00018272529210944526, + "loss": 0.9935, + "step": 12002 + }, + { + "epoch": 0.30820338264959773, + "grad_norm": 0.8125, + "learning_rate": 0.00018272278390594804, + "loss": 1.0013, + "step": 12003 + }, + { + "epoch": 0.30822905984551957, + "grad_norm": 0.8125, + "learning_rate": 0.0001827202755375918, + "loss": 0.8675, + "step": 12004 + }, + { + "epoch": 0.3082547370414414, + "grad_norm": 0.80859375, + "learning_rate": 0.00018271776700438155, + "loss": 0.9676, + "step": 12005 + }, + { + "epoch": 0.3082804142373632, + "grad_norm": 0.8046875, + "learning_rate": 0.00018271525830632223, + "loss": 1.1404, + "step": 12006 + }, + { + "epoch": 0.308306091433285, + "grad_norm": 0.80078125, + "learning_rate": 0.00018271274944341889, + "loss": 0.9758, + "step": 12007 + }, + { + "epoch": 0.30833176862920686, + "grad_norm": 0.78125, + "learning_rate": 0.0001827102404156765, + "loss": 0.8879, + "step": 12008 + }, + { + "epoch": 0.30835744582512864, + "grad_norm": 0.73828125, + "learning_rate": 0.00018270773122310008, + "loss": 0.9727, + "step": 12009 + }, + { + "epoch": 0.3083831230210505, + "grad_norm": 0.81640625, + "learning_rate": 0.00018270522186569462, + "loss": 1.0161, + "step": 12010 + }, + { + "epoch": 0.3084088002169723, + "grad_norm": 0.734375, + "learning_rate": 0.0001827027123434651, + "loss": 1.0104, + "step": 12011 + }, + { + "epoch": 0.30843447741289415, + "grad_norm": 0.8515625, + "learning_rate": 0.00018270020265641657, + "loss": 1.0529, + "step": 12012 + }, + { + "epoch": 0.30846015460881593, + "grad_norm": 0.84375, + "learning_rate": 0.00018269769280455402, + "loss": 0.9394, + "step": 12013 + }, + { + "epoch": 0.30848583180473776, + "grad_norm": 0.83203125, + "learning_rate": 0.0001826951827878824, + "loss": 1.0515, + "step": 12014 + }, + { + "epoch": 0.3085115090006596, + "grad_norm": 0.91015625, + "learning_rate": 0.0001826926726064068, + "loss": 1.0673, + "step": 12015 + }, + { + "epoch": 0.3085371861965814, + "grad_norm": 0.80078125, + "learning_rate": 0.00018269016226013214, + "loss": 1.0162, + "step": 12016 + }, + { + "epoch": 0.3085628633925032, + "grad_norm": 0.78125, + "learning_rate": 0.00018268765174906344, + "loss": 0.8036, + "step": 12017 + }, + { + "epoch": 0.30858854058842505, + "grad_norm": 0.765625, + "learning_rate": 0.00018268514107320575, + "loss": 0.9837, + "step": 12018 + }, + { + "epoch": 0.30861421778434683, + "grad_norm": 0.77734375, + "learning_rate": 0.00018268263023256404, + "loss": 1.0278, + "step": 12019 + }, + { + "epoch": 0.30863989498026867, + "grad_norm": 0.74609375, + "learning_rate": 0.0001826801192271433, + "loss": 0.996, + "step": 12020 + }, + { + "epoch": 0.3086655721761905, + "grad_norm": 0.78125, + "learning_rate": 0.00018267760805694856, + "loss": 0.9116, + "step": 12021 + }, + { + "epoch": 0.30869124937211234, + "grad_norm": 0.85546875, + "learning_rate": 0.0001826750967219848, + "loss": 1.1137, + "step": 12022 + }, + { + "epoch": 0.3087169265680341, + "grad_norm": 0.8359375, + "learning_rate": 0.00018267258522225706, + "loss": 1.0891, + "step": 12023 + }, + { + "epoch": 0.30874260376395596, + "grad_norm": 0.796875, + "learning_rate": 0.00018267007355777035, + "loss": 1.0294, + "step": 12024 + }, + { + "epoch": 0.3087682809598778, + "grad_norm": 0.76953125, + "learning_rate": 0.00018266756172852962, + "loss": 0.9526, + "step": 12025 + }, + { + "epoch": 0.3087939581557996, + "grad_norm": 0.8359375, + "learning_rate": 0.0001826650497345399, + "loss": 1.0288, + "step": 12026 + }, + { + "epoch": 0.3088196353517214, + "grad_norm": 0.9296875, + "learning_rate": 0.0001826625375758062, + "loss": 1.0702, + "step": 12027 + }, + { + "epoch": 0.30884531254764325, + "grad_norm": 0.81640625, + "learning_rate": 0.00018266002525233355, + "loss": 1.0629, + "step": 12028 + }, + { + "epoch": 0.308870989743565, + "grad_norm": 0.7890625, + "learning_rate": 0.00018265751276412692, + "loss": 0.9769, + "step": 12029 + }, + { + "epoch": 0.30889666693948686, + "grad_norm": 0.78515625, + "learning_rate": 0.00018265500011119135, + "loss": 0.9716, + "step": 12030 + }, + { + "epoch": 0.3089223441354087, + "grad_norm": 0.890625, + "learning_rate": 0.0001826524872935318, + "loss": 0.9789, + "step": 12031 + }, + { + "epoch": 0.30894802133133054, + "grad_norm": 0.89453125, + "learning_rate": 0.00018264997431115333, + "loss": 0.973, + "step": 12032 + }, + { + "epoch": 0.3089736985272523, + "grad_norm": 0.70703125, + "learning_rate": 0.00018264746116406096, + "loss": 1.1209, + "step": 12033 + }, + { + "epoch": 0.30899937572317415, + "grad_norm": 0.82421875, + "learning_rate": 0.0001826449478522596, + "loss": 0.9482, + "step": 12034 + }, + { + "epoch": 0.309025052919096, + "grad_norm": 0.76171875, + "learning_rate": 0.00018264243437575436, + "loss": 0.9179, + "step": 12035 + }, + { + "epoch": 0.30905073011501777, + "grad_norm": 0.74609375, + "learning_rate": 0.00018263992073455022, + "loss": 0.8003, + "step": 12036 + }, + { + "epoch": 0.3090764073109396, + "grad_norm": 0.7734375, + "learning_rate": 0.00018263740692865219, + "loss": 0.9824, + "step": 12037 + }, + { + "epoch": 0.30910208450686144, + "grad_norm": 0.70703125, + "learning_rate": 0.00018263489295806525, + "loss": 0.9863, + "step": 12038 + }, + { + "epoch": 0.3091277617027832, + "grad_norm": 0.77734375, + "learning_rate": 0.00018263237882279443, + "loss": 1.0703, + "step": 12039 + }, + { + "epoch": 0.30915343889870506, + "grad_norm": 0.71484375, + "learning_rate": 0.00018262986452284477, + "loss": 0.8912, + "step": 12040 + }, + { + "epoch": 0.3091791160946269, + "grad_norm": 0.734375, + "learning_rate": 0.00018262735005822122, + "loss": 0.9273, + "step": 12041 + }, + { + "epoch": 0.30920479329054873, + "grad_norm": 0.80859375, + "learning_rate": 0.00018262483542892883, + "loss": 1.0086, + "step": 12042 + }, + { + "epoch": 0.3092304704864705, + "grad_norm": 0.73046875, + "learning_rate": 0.00018262232063497263, + "loss": 0.9474, + "step": 12043 + }, + { + "epoch": 0.30925614768239235, + "grad_norm": 0.8671875, + "learning_rate": 0.0001826198056763576, + "loss": 0.9794, + "step": 12044 + }, + { + "epoch": 0.3092818248783142, + "grad_norm": 0.8046875, + "learning_rate": 0.00018261729055308877, + "loss": 0.9602, + "step": 12045 + }, + { + "epoch": 0.30930750207423596, + "grad_norm": 0.734375, + "learning_rate": 0.00018261477526517112, + "loss": 0.9384, + "step": 12046 + }, + { + "epoch": 0.3093331792701578, + "grad_norm": 0.703125, + "learning_rate": 0.0001826122598126097, + "loss": 0.9368, + "step": 12047 + }, + { + "epoch": 0.30935885646607963, + "grad_norm": 0.78125, + "learning_rate": 0.0001826097441954095, + "loss": 1.1225, + "step": 12048 + }, + { + "epoch": 0.3093845336620014, + "grad_norm": 0.7734375, + "learning_rate": 0.00018260722841357554, + "loss": 1.0347, + "step": 12049 + }, + { + "epoch": 0.30941021085792325, + "grad_norm": 0.73828125, + "learning_rate": 0.00018260471246711284, + "loss": 1.0169, + "step": 12050 + }, + { + "epoch": 0.3094358880538451, + "grad_norm": 0.7578125, + "learning_rate": 0.0001826021963560264, + "loss": 0.9526, + "step": 12051 + }, + { + "epoch": 0.3094615652497669, + "grad_norm": 0.73828125, + "learning_rate": 0.00018259968008032127, + "loss": 0.9707, + "step": 12052 + }, + { + "epoch": 0.3094872424456887, + "grad_norm": 0.7578125, + "learning_rate": 0.00018259716364000243, + "loss": 0.9247, + "step": 12053 + }, + { + "epoch": 0.30951291964161054, + "grad_norm": 0.78515625, + "learning_rate": 0.00018259464703507489, + "loss": 0.8479, + "step": 12054 + }, + { + "epoch": 0.3095385968375324, + "grad_norm": 0.86328125, + "learning_rate": 0.0001825921302655437, + "loss": 1.0172, + "step": 12055 + }, + { + "epoch": 0.30956427403345416, + "grad_norm": 0.8359375, + "learning_rate": 0.00018258961333141382, + "loss": 1.0378, + "step": 12056 + }, + { + "epoch": 0.309589951229376, + "grad_norm": 0.7578125, + "learning_rate": 0.00018258709623269033, + "loss": 0.8844, + "step": 12057 + }, + { + "epoch": 0.30961562842529783, + "grad_norm": 0.73046875, + "learning_rate": 0.00018258457896937823, + "loss": 0.835, + "step": 12058 + }, + { + "epoch": 0.3096413056212196, + "grad_norm": 0.78515625, + "learning_rate": 0.00018258206154148248, + "loss": 0.9253, + "step": 12059 + }, + { + "epoch": 0.30966698281714145, + "grad_norm": 0.83984375, + "learning_rate": 0.00018257954394900817, + "loss": 0.967, + "step": 12060 + }, + { + "epoch": 0.3096926600130633, + "grad_norm": 0.7890625, + "learning_rate": 0.0001825770261919603, + "loss": 0.9678, + "step": 12061 + }, + { + "epoch": 0.3097183372089851, + "grad_norm": 0.75390625, + "learning_rate": 0.00018257450827034383, + "loss": 1.047, + "step": 12062 + }, + { + "epoch": 0.3097440144049069, + "grad_norm": 0.8203125, + "learning_rate": 0.00018257199018416385, + "loss": 1.0511, + "step": 12063 + }, + { + "epoch": 0.30976969160082873, + "grad_norm": 0.76953125, + "learning_rate": 0.00018256947193342536, + "loss": 1.0022, + "step": 12064 + }, + { + "epoch": 0.30979536879675057, + "grad_norm": 0.76171875, + "learning_rate": 0.00018256695351813335, + "loss": 0.9936, + "step": 12065 + }, + { + "epoch": 0.30982104599267235, + "grad_norm": 0.81640625, + "learning_rate": 0.0001825644349382929, + "loss": 0.937, + "step": 12066 + }, + { + "epoch": 0.3098467231885942, + "grad_norm": 0.80078125, + "learning_rate": 0.00018256191619390894, + "loss": 0.8992, + "step": 12067 + }, + { + "epoch": 0.309872400384516, + "grad_norm": 0.80859375, + "learning_rate": 0.00018255939728498656, + "loss": 1.0464, + "step": 12068 + }, + { + "epoch": 0.3098980775804378, + "grad_norm": 0.78515625, + "learning_rate": 0.0001825568782115308, + "loss": 0.8198, + "step": 12069 + }, + { + "epoch": 0.30992375477635964, + "grad_norm": 0.671875, + "learning_rate": 0.00018255435897354658, + "loss": 0.9437, + "step": 12070 + }, + { + "epoch": 0.3099494319722815, + "grad_norm": 0.8359375, + "learning_rate": 0.000182551839571039, + "loss": 0.9161, + "step": 12071 + }, + { + "epoch": 0.3099751091682033, + "grad_norm": 0.890625, + "learning_rate": 0.00018254932000401306, + "loss": 0.9341, + "step": 12072 + }, + { + "epoch": 0.3100007863641251, + "grad_norm": 0.73046875, + "learning_rate": 0.00018254680027247376, + "loss": 0.9707, + "step": 12073 + }, + { + "epoch": 0.3100264635600469, + "grad_norm": 0.8828125, + "learning_rate": 0.00018254428037642618, + "loss": 0.8617, + "step": 12074 + }, + { + "epoch": 0.31005214075596876, + "grad_norm": 0.80078125, + "learning_rate": 0.00018254176031587532, + "loss": 0.934, + "step": 12075 + }, + { + "epoch": 0.31007781795189054, + "grad_norm": 0.86328125, + "learning_rate": 0.00018253924009082613, + "loss": 1.0938, + "step": 12076 + }, + { + "epoch": 0.3101034951478124, + "grad_norm": 0.77734375, + "learning_rate": 0.00018253671970128372, + "loss": 0.8914, + "step": 12077 + }, + { + "epoch": 0.3101291723437342, + "grad_norm": 0.75, + "learning_rate": 0.0001825341991472531, + "loss": 0.9532, + "step": 12078 + }, + { + "epoch": 0.310154849539656, + "grad_norm": 0.82421875, + "learning_rate": 0.00018253167842873926, + "loss": 0.9065, + "step": 12079 + }, + { + "epoch": 0.31018052673557783, + "grad_norm": 0.8046875, + "learning_rate": 0.00018252915754574724, + "loss": 0.936, + "step": 12080 + }, + { + "epoch": 0.31020620393149967, + "grad_norm": 0.76171875, + "learning_rate": 0.00018252663649828205, + "loss": 0.9222, + "step": 12081 + }, + { + "epoch": 0.3102318811274215, + "grad_norm": 0.79296875, + "learning_rate": 0.00018252411528634873, + "loss": 0.9243, + "step": 12082 + }, + { + "epoch": 0.3102575583233433, + "grad_norm": 0.77734375, + "learning_rate": 0.00018252159390995233, + "loss": 0.8652, + "step": 12083 + }, + { + "epoch": 0.3102832355192651, + "grad_norm": 0.87109375, + "learning_rate": 0.00018251907236909785, + "loss": 1.0303, + "step": 12084 + }, + { + "epoch": 0.31030891271518696, + "grad_norm": 0.77734375, + "learning_rate": 0.0001825165506637903, + "loss": 0.9052, + "step": 12085 + }, + { + "epoch": 0.31033458991110874, + "grad_norm": 0.78515625, + "learning_rate": 0.0001825140287940347, + "loss": 0.9447, + "step": 12086 + }, + { + "epoch": 0.3103602671070306, + "grad_norm": 0.80859375, + "learning_rate": 0.0001825115067598361, + "loss": 1.0254, + "step": 12087 + }, + { + "epoch": 0.3103859443029524, + "grad_norm": 0.85546875, + "learning_rate": 0.00018250898456119953, + "loss": 0.9758, + "step": 12088 + }, + { + "epoch": 0.3104116214988742, + "grad_norm": 0.9375, + "learning_rate": 0.00018250646219813, + "loss": 1.0824, + "step": 12089 + }, + { + "epoch": 0.310437298694796, + "grad_norm": 0.76171875, + "learning_rate": 0.00018250393967063255, + "loss": 0.951, + "step": 12090 + }, + { + "epoch": 0.31046297589071786, + "grad_norm": 0.98046875, + "learning_rate": 0.0001825014169787122, + "loss": 0.9895, + "step": 12091 + }, + { + "epoch": 0.3104886530866397, + "grad_norm": 0.75390625, + "learning_rate": 0.00018249889412237397, + "loss": 0.9194, + "step": 12092 + }, + { + "epoch": 0.3105143302825615, + "grad_norm": 0.796875, + "learning_rate": 0.0001824963711016229, + "loss": 0.8693, + "step": 12093 + }, + { + "epoch": 0.3105400074784833, + "grad_norm": 0.8203125, + "learning_rate": 0.00018249384791646403, + "loss": 0.8899, + "step": 12094 + }, + { + "epoch": 0.31056568467440515, + "grad_norm": 0.78125, + "learning_rate": 0.00018249132456690234, + "loss": 0.8988, + "step": 12095 + }, + { + "epoch": 0.31059136187032693, + "grad_norm": 0.7890625, + "learning_rate": 0.00018248880105294294, + "loss": 0.9786, + "step": 12096 + }, + { + "epoch": 0.31061703906624877, + "grad_norm": 0.828125, + "learning_rate": 0.00018248627737459076, + "loss": 1.0293, + "step": 12097 + }, + { + "epoch": 0.3106427162621706, + "grad_norm": 0.8046875, + "learning_rate": 0.0001824837535318509, + "loss": 1.0025, + "step": 12098 + }, + { + "epoch": 0.3106683934580924, + "grad_norm": 0.8125, + "learning_rate": 0.00018248122952472837, + "loss": 0.9071, + "step": 12099 + }, + { + "epoch": 0.3106940706540142, + "grad_norm": 0.828125, + "learning_rate": 0.0001824787053532282, + "loss": 0.9689, + "step": 12100 + }, + { + "epoch": 0.31071974784993606, + "grad_norm": 0.81640625, + "learning_rate": 0.00018247618101735542, + "loss": 1.0896, + "step": 12101 + }, + { + "epoch": 0.3107454250458579, + "grad_norm": 0.7734375, + "learning_rate": 0.00018247365651711509, + "loss": 1.0313, + "step": 12102 + }, + { + "epoch": 0.3107711022417797, + "grad_norm": 0.828125, + "learning_rate": 0.00018247113185251217, + "loss": 0.9524, + "step": 12103 + }, + { + "epoch": 0.3107967794377015, + "grad_norm": 0.78515625, + "learning_rate": 0.00018246860702355176, + "loss": 0.8971, + "step": 12104 + }, + { + "epoch": 0.31082245663362335, + "grad_norm": 0.7578125, + "learning_rate": 0.00018246608203023885, + "loss": 1.1838, + "step": 12105 + }, + { + "epoch": 0.3108481338295451, + "grad_norm": 0.76953125, + "learning_rate": 0.0001824635568725785, + "loss": 0.9258, + "step": 12106 + }, + { + "epoch": 0.31087381102546696, + "grad_norm": 0.81640625, + "learning_rate": 0.0001824610315505757, + "loss": 0.8753, + "step": 12107 + }, + { + "epoch": 0.3108994882213888, + "grad_norm": 0.78515625, + "learning_rate": 0.00018245850606423555, + "loss": 0.9421, + "step": 12108 + }, + { + "epoch": 0.3109251654173106, + "grad_norm": 0.7109375, + "learning_rate": 0.00018245598041356304, + "loss": 0.8475, + "step": 12109 + }, + { + "epoch": 0.3109508426132324, + "grad_norm": 0.7734375, + "learning_rate": 0.00018245345459856318, + "loss": 1.0493, + "step": 12110 + }, + { + "epoch": 0.31097651980915425, + "grad_norm": 0.86328125, + "learning_rate": 0.00018245092861924106, + "loss": 0.992, + "step": 12111 + }, + { + "epoch": 0.3110021970050761, + "grad_norm": 0.7734375, + "learning_rate": 0.0001824484024756017, + "loss": 0.9805, + "step": 12112 + }, + { + "epoch": 0.31102787420099787, + "grad_norm": 0.7421875, + "learning_rate": 0.0001824458761676501, + "loss": 0.9733, + "step": 12113 + }, + { + "epoch": 0.3110535513969197, + "grad_norm": 0.8515625, + "learning_rate": 0.00018244334969539128, + "loss": 1.1108, + "step": 12114 + }, + { + "epoch": 0.31107922859284154, + "grad_norm": 0.8125, + "learning_rate": 0.00018244082305883037, + "loss": 0.845, + "step": 12115 + }, + { + "epoch": 0.3111049057887633, + "grad_norm": 0.80859375, + "learning_rate": 0.00018243829625797233, + "loss": 1.0216, + "step": 12116 + }, + { + "epoch": 0.31113058298468516, + "grad_norm": 0.71484375, + "learning_rate": 0.0001824357692928222, + "loss": 0.918, + "step": 12117 + }, + { + "epoch": 0.311156260180607, + "grad_norm": 0.8203125, + "learning_rate": 0.00018243324216338504, + "loss": 1.1072, + "step": 12118 + }, + { + "epoch": 0.31118193737652877, + "grad_norm": 0.765625, + "learning_rate": 0.00018243071486966588, + "loss": 0.9629, + "step": 12119 + }, + { + "epoch": 0.3112076145724506, + "grad_norm": 0.90234375, + "learning_rate": 0.00018242818741166973, + "loss": 0.993, + "step": 12120 + }, + { + "epoch": 0.31123329176837244, + "grad_norm": 0.8125, + "learning_rate": 0.00018242565978940166, + "loss": 1.0723, + "step": 12121 + }, + { + "epoch": 0.3112589689642943, + "grad_norm": 0.68359375, + "learning_rate": 0.0001824231320028667, + "loss": 1.0388, + "step": 12122 + }, + { + "epoch": 0.31128464616021606, + "grad_norm": 0.75390625, + "learning_rate": 0.00018242060405206987, + "loss": 0.9182, + "step": 12123 + }, + { + "epoch": 0.3113103233561379, + "grad_norm": 0.78125, + "learning_rate": 0.00018241807593701625, + "loss": 0.9754, + "step": 12124 + }, + { + "epoch": 0.31133600055205973, + "grad_norm": 0.765625, + "learning_rate": 0.00018241554765771083, + "loss": 0.9086, + "step": 12125 + }, + { + "epoch": 0.3113616777479815, + "grad_norm": 0.8046875, + "learning_rate": 0.00018241301921415864, + "loss": 1.1057, + "step": 12126 + }, + { + "epoch": 0.31138735494390335, + "grad_norm": 0.80078125, + "learning_rate": 0.0001824104906063648, + "loss": 1.0598, + "step": 12127 + }, + { + "epoch": 0.3114130321398252, + "grad_norm": 0.71875, + "learning_rate": 0.00018240796183433426, + "loss": 0.9365, + "step": 12128 + }, + { + "epoch": 0.31143870933574697, + "grad_norm": 0.87890625, + "learning_rate": 0.00018240543289807214, + "loss": 0.9694, + "step": 12129 + }, + { + "epoch": 0.3114643865316688, + "grad_norm": 1.0234375, + "learning_rate": 0.00018240290379758337, + "loss": 1.0289, + "step": 12130 + }, + { + "epoch": 0.31149006372759064, + "grad_norm": 0.81640625, + "learning_rate": 0.0001824003745328731, + "loss": 0.9019, + "step": 12131 + }, + { + "epoch": 0.3115157409235125, + "grad_norm": 0.85546875, + "learning_rate": 0.00018239784510394632, + "loss": 0.9254, + "step": 12132 + }, + { + "epoch": 0.31154141811943425, + "grad_norm": 0.88671875, + "learning_rate": 0.00018239531551080806, + "loss": 1.0854, + "step": 12133 + }, + { + "epoch": 0.3115670953153561, + "grad_norm": 0.73046875, + "learning_rate": 0.00018239278575346338, + "loss": 0.8805, + "step": 12134 + }, + { + "epoch": 0.3115927725112779, + "grad_norm": 0.76953125, + "learning_rate": 0.00018239025583191736, + "loss": 0.8384, + "step": 12135 + }, + { + "epoch": 0.3116184497071997, + "grad_norm": 0.80078125, + "learning_rate": 0.00018238772574617496, + "loss": 0.9305, + "step": 12136 + }, + { + "epoch": 0.31164412690312154, + "grad_norm": 0.796875, + "learning_rate": 0.00018238519549624126, + "loss": 1.1114, + "step": 12137 + }, + { + "epoch": 0.3116698040990434, + "grad_norm": 0.765625, + "learning_rate": 0.00018238266508212135, + "loss": 0.9209, + "step": 12138 + }, + { + "epoch": 0.31169548129496516, + "grad_norm": 0.7734375, + "learning_rate": 0.00018238013450382022, + "loss": 0.9306, + "step": 12139 + }, + { + "epoch": 0.311721158490887, + "grad_norm": 0.8203125, + "learning_rate": 0.0001823776037613429, + "loss": 0.9909, + "step": 12140 + }, + { + "epoch": 0.31174683568680883, + "grad_norm": 0.78515625, + "learning_rate": 0.00018237507285469447, + "loss": 0.9426, + "step": 12141 + }, + { + "epoch": 0.31177251288273067, + "grad_norm": 0.76171875, + "learning_rate": 0.00018237254178387995, + "loss": 0.9468, + "step": 12142 + }, + { + "epoch": 0.31179819007865245, + "grad_norm": 0.78125, + "learning_rate": 0.0001823700105489044, + "loss": 0.9863, + "step": 12143 + }, + { + "epoch": 0.3118238672745743, + "grad_norm": 0.72265625, + "learning_rate": 0.00018236747914977287, + "loss": 0.9653, + "step": 12144 + }, + { + "epoch": 0.3118495444704961, + "grad_norm": 0.8359375, + "learning_rate": 0.00018236494758649038, + "loss": 0.8677, + "step": 12145 + }, + { + "epoch": 0.3118752216664179, + "grad_norm": 0.78125, + "learning_rate": 0.000182362415859062, + "loss": 0.9511, + "step": 12146 + }, + { + "epoch": 0.31190089886233974, + "grad_norm": 0.7734375, + "learning_rate": 0.00018235988396749276, + "loss": 1.1509, + "step": 12147 + }, + { + "epoch": 0.3119265760582616, + "grad_norm": 0.796875, + "learning_rate": 0.0001823573519117877, + "loss": 0.8909, + "step": 12148 + }, + { + "epoch": 0.31195225325418335, + "grad_norm": 0.75, + "learning_rate": 0.00018235481969195192, + "loss": 0.9506, + "step": 12149 + }, + { + "epoch": 0.3119779304501052, + "grad_norm": 0.765625, + "learning_rate": 0.00018235228730799038, + "loss": 1.0123, + "step": 12150 + }, + { + "epoch": 0.312003607646027, + "grad_norm": 0.765625, + "learning_rate": 0.0001823497547599082, + "loss": 0.9332, + "step": 12151 + }, + { + "epoch": 0.31202928484194886, + "grad_norm": 0.87890625, + "learning_rate": 0.00018234722204771035, + "loss": 0.9425, + "step": 12152 + }, + { + "epoch": 0.31205496203787064, + "grad_norm": 0.83203125, + "learning_rate": 0.00018234468917140197, + "loss": 1.009, + "step": 12153 + }, + { + "epoch": 0.3120806392337925, + "grad_norm": 0.828125, + "learning_rate": 0.00018234215613098804, + "loss": 1.0361, + "step": 12154 + }, + { + "epoch": 0.3121063164297143, + "grad_norm": 0.74609375, + "learning_rate": 0.00018233962292647365, + "loss": 0.9243, + "step": 12155 + }, + { + "epoch": 0.3121319936256361, + "grad_norm": 0.8125, + "learning_rate": 0.00018233708955786378, + "loss": 0.9745, + "step": 12156 + }, + { + "epoch": 0.31215767082155793, + "grad_norm": 0.8203125, + "learning_rate": 0.00018233455602516358, + "loss": 1.0366, + "step": 12157 + }, + { + "epoch": 0.31218334801747977, + "grad_norm": 0.79296875, + "learning_rate": 0.00018233202232837803, + "loss": 1.1243, + "step": 12158 + }, + { + "epoch": 0.31220902521340155, + "grad_norm": 0.72265625, + "learning_rate": 0.00018232948846751222, + "loss": 0.9431, + "step": 12159 + }, + { + "epoch": 0.3122347024093234, + "grad_norm": 0.79296875, + "learning_rate": 0.00018232695444257116, + "loss": 1.0736, + "step": 12160 + }, + { + "epoch": 0.3122603796052452, + "grad_norm": 0.78515625, + "learning_rate": 0.00018232442025355988, + "loss": 0.8554, + "step": 12161 + }, + { + "epoch": 0.31228605680116706, + "grad_norm": 0.7578125, + "learning_rate": 0.00018232188590048352, + "loss": 0.8311, + "step": 12162 + }, + { + "epoch": 0.31231173399708884, + "grad_norm": 0.796875, + "learning_rate": 0.00018231935138334705, + "loss": 0.9203, + "step": 12163 + }, + { + "epoch": 0.31233741119301067, + "grad_norm": 0.78515625, + "learning_rate": 0.00018231681670215555, + "loss": 1.0548, + "step": 12164 + }, + { + "epoch": 0.3123630883889325, + "grad_norm": 0.82421875, + "learning_rate": 0.00018231428185691407, + "loss": 0.9982, + "step": 12165 + }, + { + "epoch": 0.3123887655848543, + "grad_norm": 0.8203125, + "learning_rate": 0.00018231174684762768, + "loss": 0.9526, + "step": 12166 + }, + { + "epoch": 0.3124144427807761, + "grad_norm": 0.76953125, + "learning_rate": 0.0001823092116743014, + "loss": 0.8507, + "step": 12167 + }, + { + "epoch": 0.31244011997669796, + "grad_norm": 0.7265625, + "learning_rate": 0.00018230667633694028, + "loss": 0.9585, + "step": 12168 + }, + { + "epoch": 0.31246579717261974, + "grad_norm": 0.765625, + "learning_rate": 0.0001823041408355494, + "loss": 0.921, + "step": 12169 + }, + { + "epoch": 0.3124914743685416, + "grad_norm": 0.9375, + "learning_rate": 0.00018230160517013381, + "loss": 1.0677, + "step": 12170 + }, + { + "epoch": 0.3125171515644634, + "grad_norm": 0.7578125, + "learning_rate": 0.00018229906934069854, + "loss": 1.1444, + "step": 12171 + }, + { + "epoch": 0.31254282876038525, + "grad_norm": 0.734375, + "learning_rate": 0.0001822965333472487, + "loss": 0.8736, + "step": 12172 + }, + { + "epoch": 0.31256850595630703, + "grad_norm": 0.8203125, + "learning_rate": 0.00018229399718978928, + "loss": 1.0723, + "step": 12173 + }, + { + "epoch": 0.31259418315222887, + "grad_norm": 0.83984375, + "learning_rate": 0.00018229146086832533, + "loss": 0.853, + "step": 12174 + }, + { + "epoch": 0.3126198603481507, + "grad_norm": 0.765625, + "learning_rate": 0.00018228892438286196, + "loss": 0.9074, + "step": 12175 + }, + { + "epoch": 0.3126455375440725, + "grad_norm": 0.765625, + "learning_rate": 0.00018228638773340418, + "loss": 0.9747, + "step": 12176 + }, + { + "epoch": 0.3126712147399943, + "grad_norm": 0.7734375, + "learning_rate": 0.00018228385091995707, + "loss": 0.8471, + "step": 12177 + }, + { + "epoch": 0.31269689193591615, + "grad_norm": 0.8046875, + "learning_rate": 0.0001822813139425257, + "loss": 1.0274, + "step": 12178 + }, + { + "epoch": 0.31272256913183794, + "grad_norm": 0.87890625, + "learning_rate": 0.00018227877680111507, + "loss": 1.1331, + "step": 12179 + }, + { + "epoch": 0.31274824632775977, + "grad_norm": 0.8046875, + "learning_rate": 0.00018227623949573028, + "loss": 1.1268, + "step": 12180 + }, + { + "epoch": 0.3127739235236816, + "grad_norm": 0.80859375, + "learning_rate": 0.00018227370202637637, + "loss": 0.8764, + "step": 12181 + }, + { + "epoch": 0.31279960071960344, + "grad_norm": 0.81640625, + "learning_rate": 0.00018227116439305843, + "loss": 0.923, + "step": 12182 + }, + { + "epoch": 0.3128252779155252, + "grad_norm": 0.76171875, + "learning_rate": 0.00018226862659578144, + "loss": 1.046, + "step": 12183 + }, + { + "epoch": 0.31285095511144706, + "grad_norm": 0.7578125, + "learning_rate": 0.00018226608863455057, + "loss": 0.9958, + "step": 12184 + }, + { + "epoch": 0.3128766323073689, + "grad_norm": 0.7890625, + "learning_rate": 0.00018226355050937078, + "loss": 0.983, + "step": 12185 + }, + { + "epoch": 0.3129023095032907, + "grad_norm": 1.015625, + "learning_rate": 0.0001822610122202472, + "loss": 1.0227, + "step": 12186 + }, + { + "epoch": 0.3129279866992125, + "grad_norm": 0.8125, + "learning_rate": 0.0001822584737671848, + "loss": 0.8945, + "step": 12187 + }, + { + "epoch": 0.31295366389513435, + "grad_norm": 0.74609375, + "learning_rate": 0.00018225593515018874, + "loss": 0.9278, + "step": 12188 + }, + { + "epoch": 0.31297934109105613, + "grad_norm": 0.7421875, + "learning_rate": 0.000182253396369264, + "loss": 1.0192, + "step": 12189 + }, + { + "epoch": 0.31300501828697797, + "grad_norm": 0.8828125, + "learning_rate": 0.0001822508574244157, + "loss": 0.9954, + "step": 12190 + }, + { + "epoch": 0.3130306954828998, + "grad_norm": 0.8515625, + "learning_rate": 0.00018224831831564883, + "loss": 0.9782, + "step": 12191 + }, + { + "epoch": 0.3130563726788216, + "grad_norm": 0.8203125, + "learning_rate": 0.00018224577904296852, + "loss": 1.0279, + "step": 12192 + }, + { + "epoch": 0.3130820498747434, + "grad_norm": 0.8203125, + "learning_rate": 0.00018224323960637978, + "loss": 1.069, + "step": 12193 + }, + { + "epoch": 0.31310772707066525, + "grad_norm": 0.7734375, + "learning_rate": 0.00018224070000588772, + "loss": 1.01, + "step": 12194 + }, + { + "epoch": 0.3131334042665871, + "grad_norm": 0.859375, + "learning_rate": 0.00018223816024149736, + "loss": 0.9431, + "step": 12195 + }, + { + "epoch": 0.31315908146250887, + "grad_norm": 0.82421875, + "learning_rate": 0.00018223562031321376, + "loss": 0.9871, + "step": 12196 + }, + { + "epoch": 0.3131847586584307, + "grad_norm": 0.7890625, + "learning_rate": 0.00018223308022104204, + "loss": 1.0481, + "step": 12197 + }, + { + "epoch": 0.31321043585435254, + "grad_norm": 0.78515625, + "learning_rate": 0.0001822305399649872, + "loss": 0.8383, + "step": 12198 + }, + { + "epoch": 0.3132361130502743, + "grad_norm": 0.765625, + "learning_rate": 0.00018222799954505429, + "loss": 1.0115, + "step": 12199 + }, + { + "epoch": 0.31326179024619616, + "grad_norm": 0.8125, + "learning_rate": 0.00018222545896124843, + "loss": 1.0219, + "step": 12200 + }, + { + "epoch": 0.313287467442118, + "grad_norm": 0.7578125, + "learning_rate": 0.00018222291821357468, + "loss": 0.9864, + "step": 12201 + }, + { + "epoch": 0.3133131446380398, + "grad_norm": 0.78125, + "learning_rate": 0.00018222037730203803, + "loss": 1.1614, + "step": 12202 + }, + { + "epoch": 0.3133388218339616, + "grad_norm": 0.7890625, + "learning_rate": 0.0001822178362266436, + "loss": 1.0082, + "step": 12203 + }, + { + "epoch": 0.31336449902988345, + "grad_norm": 0.7265625, + "learning_rate": 0.0001822152949873965, + "loss": 0.8335, + "step": 12204 + }, + { + "epoch": 0.3133901762258053, + "grad_norm": 0.73828125, + "learning_rate": 0.00018221275358430171, + "loss": 0.9004, + "step": 12205 + }, + { + "epoch": 0.31341585342172706, + "grad_norm": 0.8125, + "learning_rate": 0.00018221021201736434, + "loss": 0.9362, + "step": 12206 + }, + { + "epoch": 0.3134415306176489, + "grad_norm": 0.75, + "learning_rate": 0.00018220767028658946, + "loss": 0.9772, + "step": 12207 + }, + { + "epoch": 0.31346720781357074, + "grad_norm": 0.80078125, + "learning_rate": 0.00018220512839198206, + "loss": 0.9089, + "step": 12208 + }, + { + "epoch": 0.3134928850094925, + "grad_norm": 0.83984375, + "learning_rate": 0.0001822025863335473, + "loss": 1.0695, + "step": 12209 + }, + { + "epoch": 0.31351856220541435, + "grad_norm": 0.77734375, + "learning_rate": 0.00018220004411129023, + "loss": 1.0307, + "step": 12210 + }, + { + "epoch": 0.3135442394013362, + "grad_norm": 0.80859375, + "learning_rate": 0.00018219750172521586, + "loss": 1.0801, + "step": 12211 + }, + { + "epoch": 0.31356991659725797, + "grad_norm": 0.79296875, + "learning_rate": 0.00018219495917532933, + "loss": 0.9482, + "step": 12212 + }, + { + "epoch": 0.3135955937931798, + "grad_norm": 0.78515625, + "learning_rate": 0.00018219241646163562, + "loss": 1.0715, + "step": 12213 + }, + { + "epoch": 0.31362127098910164, + "grad_norm": 0.79296875, + "learning_rate": 0.0001821898735841399, + "loss": 0.9967, + "step": 12214 + }, + { + "epoch": 0.3136469481850235, + "grad_norm": 0.8125, + "learning_rate": 0.00018218733054284719, + "loss": 1.0462, + "step": 12215 + }, + { + "epoch": 0.31367262538094526, + "grad_norm": 0.765625, + "learning_rate": 0.0001821847873377625, + "loss": 1.0955, + "step": 12216 + }, + { + "epoch": 0.3136983025768671, + "grad_norm": 0.7265625, + "learning_rate": 0.00018218224396889099, + "loss": 0.9595, + "step": 12217 + }, + { + "epoch": 0.31372397977278893, + "grad_norm": 0.76953125, + "learning_rate": 0.0001821797004362377, + "loss": 0.9791, + "step": 12218 + }, + { + "epoch": 0.3137496569687107, + "grad_norm": 0.83984375, + "learning_rate": 0.00018217715673980766, + "loss": 1.0297, + "step": 12219 + }, + { + "epoch": 0.31377533416463255, + "grad_norm": 0.73046875, + "learning_rate": 0.000182174612879606, + "loss": 0.799, + "step": 12220 + }, + { + "epoch": 0.3138010113605544, + "grad_norm": 0.765625, + "learning_rate": 0.00018217206885563775, + "loss": 1.0541, + "step": 12221 + }, + { + "epoch": 0.31382668855647616, + "grad_norm": 0.76171875, + "learning_rate": 0.00018216952466790798, + "loss": 1.1542, + "step": 12222 + }, + { + "epoch": 0.313852365752398, + "grad_norm": 0.78515625, + "learning_rate": 0.00018216698031642178, + "loss": 0.9917, + "step": 12223 + }, + { + "epoch": 0.31387804294831984, + "grad_norm": 0.78515625, + "learning_rate": 0.00018216443580118422, + "loss": 1.0398, + "step": 12224 + }, + { + "epoch": 0.31390372014424167, + "grad_norm": 0.7890625, + "learning_rate": 0.00018216189112220033, + "loss": 1.0802, + "step": 12225 + }, + { + "epoch": 0.31392939734016345, + "grad_norm": 0.95703125, + "learning_rate": 0.00018215934627947522, + "loss": 0.9887, + "step": 12226 + }, + { + "epoch": 0.3139550745360853, + "grad_norm": 0.7421875, + "learning_rate": 0.00018215680127301397, + "loss": 0.877, + "step": 12227 + }, + { + "epoch": 0.3139807517320071, + "grad_norm": 0.77734375, + "learning_rate": 0.0001821542561028216, + "loss": 1.0153, + "step": 12228 + }, + { + "epoch": 0.3140064289279289, + "grad_norm": 0.76953125, + "learning_rate": 0.00018215171076890326, + "loss": 0.8498, + "step": 12229 + }, + { + "epoch": 0.31403210612385074, + "grad_norm": 0.78125, + "learning_rate": 0.00018214916527126398, + "loss": 1.0564, + "step": 12230 + }, + { + "epoch": 0.3140577833197726, + "grad_norm": 0.7734375, + "learning_rate": 0.00018214661960990882, + "loss": 0.8526, + "step": 12231 + }, + { + "epoch": 0.31408346051569436, + "grad_norm": 0.75390625, + "learning_rate": 0.00018214407378484287, + "loss": 0.9502, + "step": 12232 + }, + { + "epoch": 0.3141091377116162, + "grad_norm": 0.765625, + "learning_rate": 0.00018214152779607117, + "loss": 1.0114, + "step": 12233 + }, + { + "epoch": 0.31413481490753803, + "grad_norm": 0.7265625, + "learning_rate": 0.0001821389816435989, + "loss": 0.9547, + "step": 12234 + }, + { + "epoch": 0.31416049210345987, + "grad_norm": 0.828125, + "learning_rate": 0.00018213643532743098, + "loss": 0.9257, + "step": 12235 + }, + { + "epoch": 0.31418616929938165, + "grad_norm": 0.73828125, + "learning_rate": 0.00018213388884757262, + "loss": 0.9311, + "step": 12236 + }, + { + "epoch": 0.3142118464953035, + "grad_norm": 0.79296875, + "learning_rate": 0.00018213134220402882, + "loss": 0.8928, + "step": 12237 + }, + { + "epoch": 0.3142375236912253, + "grad_norm": 0.76953125, + "learning_rate": 0.00018212879539680467, + "loss": 0.9278, + "step": 12238 + }, + { + "epoch": 0.3142632008871471, + "grad_norm": 0.6875, + "learning_rate": 0.00018212624842590524, + "loss": 0.8154, + "step": 12239 + }, + { + "epoch": 0.31428887808306893, + "grad_norm": 0.81640625, + "learning_rate": 0.00018212370129133562, + "loss": 1.0247, + "step": 12240 + }, + { + "epoch": 0.31431455527899077, + "grad_norm": 0.76953125, + "learning_rate": 0.0001821211539931009, + "loss": 1.0077, + "step": 12241 + }, + { + "epoch": 0.31434023247491255, + "grad_norm": 0.8046875, + "learning_rate": 0.00018211860653120614, + "loss": 0.8411, + "step": 12242 + }, + { + "epoch": 0.3143659096708344, + "grad_norm": 0.78125, + "learning_rate": 0.00018211605890565638, + "loss": 0.8666, + "step": 12243 + }, + { + "epoch": 0.3143915868667562, + "grad_norm": 0.81640625, + "learning_rate": 0.00018211351111645677, + "loss": 0.8833, + "step": 12244 + }, + { + "epoch": 0.31441726406267806, + "grad_norm": 0.75, + "learning_rate": 0.00018211096316361235, + "loss": 1.0746, + "step": 12245 + }, + { + "epoch": 0.31444294125859984, + "grad_norm": 0.7578125, + "learning_rate": 0.00018210841504712818, + "loss": 1.0969, + "step": 12246 + }, + { + "epoch": 0.3144686184545217, + "grad_norm": 0.8046875, + "learning_rate": 0.00018210586676700933, + "loss": 0.893, + "step": 12247 + }, + { + "epoch": 0.3144942956504435, + "grad_norm": 0.76953125, + "learning_rate": 0.00018210331832326093, + "loss": 0.9526, + "step": 12248 + }, + { + "epoch": 0.3145199728463653, + "grad_norm": 0.74609375, + "learning_rate": 0.00018210076971588804, + "loss": 0.9127, + "step": 12249 + }, + { + "epoch": 0.31454565004228713, + "grad_norm": 0.72265625, + "learning_rate": 0.00018209822094489574, + "loss": 1.0217, + "step": 12250 + }, + { + "epoch": 0.31457132723820896, + "grad_norm": 0.76953125, + "learning_rate": 0.00018209567201028906, + "loss": 0.9236, + "step": 12251 + }, + { + "epoch": 0.31459700443413074, + "grad_norm": 0.7890625, + "learning_rate": 0.00018209312291207316, + "loss": 0.9688, + "step": 12252 + }, + { + "epoch": 0.3146226816300526, + "grad_norm": 0.86328125, + "learning_rate": 0.00018209057365025307, + "loss": 0.9446, + "step": 12253 + }, + { + "epoch": 0.3146483588259744, + "grad_norm": 0.8203125, + "learning_rate": 0.00018208802422483387, + "loss": 0.9626, + "step": 12254 + }, + { + "epoch": 0.31467403602189625, + "grad_norm": 0.71484375, + "learning_rate": 0.00018208547463582065, + "loss": 0.907, + "step": 12255 + }, + { + "epoch": 0.31469971321781803, + "grad_norm": 0.74609375, + "learning_rate": 0.00018208292488321854, + "loss": 0.8552, + "step": 12256 + }, + { + "epoch": 0.31472539041373987, + "grad_norm": 0.82421875, + "learning_rate": 0.0001820803749670325, + "loss": 0.9587, + "step": 12257 + }, + { + "epoch": 0.3147510676096617, + "grad_norm": 0.8203125, + "learning_rate": 0.00018207782488726776, + "loss": 0.9014, + "step": 12258 + }, + { + "epoch": 0.3147767448055835, + "grad_norm": 0.79296875, + "learning_rate": 0.00018207527464392928, + "loss": 0.8091, + "step": 12259 + }, + { + "epoch": 0.3148024220015053, + "grad_norm": 0.71875, + "learning_rate": 0.00018207272423702224, + "loss": 0.9089, + "step": 12260 + }, + { + "epoch": 0.31482809919742716, + "grad_norm": 0.8203125, + "learning_rate": 0.00018207017366655162, + "loss": 0.8997, + "step": 12261 + }, + { + "epoch": 0.31485377639334894, + "grad_norm": 0.7890625, + "learning_rate": 0.0001820676229325226, + "loss": 1.0557, + "step": 12262 + }, + { + "epoch": 0.3148794535892708, + "grad_norm": 0.7578125, + "learning_rate": 0.00018206507203494021, + "loss": 1.0399, + "step": 12263 + }, + { + "epoch": 0.3149051307851926, + "grad_norm": 0.80078125, + "learning_rate": 0.0001820625209738095, + "loss": 0.8475, + "step": 12264 + }, + { + "epoch": 0.31493080798111445, + "grad_norm": 0.83984375, + "learning_rate": 0.00018205996974913563, + "loss": 0.9392, + "step": 12265 + }, + { + "epoch": 0.3149564851770362, + "grad_norm": 0.74609375, + "learning_rate": 0.00018205741836092369, + "loss": 0.8414, + "step": 12266 + }, + { + "epoch": 0.31498216237295806, + "grad_norm": 0.75, + "learning_rate": 0.0001820548668091787, + "loss": 0.8227, + "step": 12267 + }, + { + "epoch": 0.3150078395688799, + "grad_norm": 0.77734375, + "learning_rate": 0.00018205231509390576, + "loss": 0.8586, + "step": 12268 + }, + { + "epoch": 0.3150335167648017, + "grad_norm": 0.78125, + "learning_rate": 0.00018204976321510997, + "loss": 0.8955, + "step": 12269 + }, + { + "epoch": 0.3150591939607235, + "grad_norm": 0.78515625, + "learning_rate": 0.00018204721117279644, + "loss": 1.0248, + "step": 12270 + }, + { + "epoch": 0.31508487115664535, + "grad_norm": 0.7578125, + "learning_rate": 0.0001820446589669702, + "loss": 1.0452, + "step": 12271 + }, + { + "epoch": 0.31511054835256713, + "grad_norm": 0.7890625, + "learning_rate": 0.00018204210659763636, + "loss": 0.8295, + "step": 12272 + }, + { + "epoch": 0.31513622554848897, + "grad_norm": 0.77734375, + "learning_rate": 0.00018203955406480003, + "loss": 1.0275, + "step": 12273 + }, + { + "epoch": 0.3151619027444108, + "grad_norm": 0.8359375, + "learning_rate": 0.00018203700136846629, + "loss": 1.1168, + "step": 12274 + }, + { + "epoch": 0.31518757994033264, + "grad_norm": 0.81640625, + "learning_rate": 0.00018203444850864017, + "loss": 0.9204, + "step": 12275 + }, + { + "epoch": 0.3152132571362544, + "grad_norm": 0.84765625, + "learning_rate": 0.00018203189548532687, + "loss": 0.9057, + "step": 12276 + }, + { + "epoch": 0.31523893433217626, + "grad_norm": 0.765625, + "learning_rate": 0.00018202934229853137, + "loss": 0.9068, + "step": 12277 + }, + { + "epoch": 0.3152646115280981, + "grad_norm": 0.73828125, + "learning_rate": 0.0001820267889482588, + "loss": 0.9313, + "step": 12278 + }, + { + "epoch": 0.3152902887240199, + "grad_norm": 0.76171875, + "learning_rate": 0.00018202423543451425, + "loss": 0.9041, + "step": 12279 + }, + { + "epoch": 0.3153159659199417, + "grad_norm": 0.80859375, + "learning_rate": 0.00018202168175730283, + "loss": 1.1188, + "step": 12280 + }, + { + "epoch": 0.31534164311586355, + "grad_norm": 0.796875, + "learning_rate": 0.00018201912791662958, + "loss": 0.9679, + "step": 12281 + }, + { + "epoch": 0.3153673203117853, + "grad_norm": 0.796875, + "learning_rate": 0.00018201657391249965, + "loss": 0.8754, + "step": 12282 + }, + { + "epoch": 0.31539299750770716, + "grad_norm": 0.75, + "learning_rate": 0.00018201401974491809, + "loss": 0.9817, + "step": 12283 + }, + { + "epoch": 0.315418674703629, + "grad_norm": 0.6875, + "learning_rate": 0.00018201146541388998, + "loss": 0.8669, + "step": 12284 + }, + { + "epoch": 0.31544435189955083, + "grad_norm": 0.76171875, + "learning_rate": 0.00018200891091942043, + "loss": 0.9177, + "step": 12285 + }, + { + "epoch": 0.3154700290954726, + "grad_norm": 0.73828125, + "learning_rate": 0.00018200635626151453, + "loss": 0.9731, + "step": 12286 + }, + { + "epoch": 0.31549570629139445, + "grad_norm": 0.79296875, + "learning_rate": 0.00018200380144017735, + "loss": 1.0032, + "step": 12287 + }, + { + "epoch": 0.3155213834873163, + "grad_norm": 0.8828125, + "learning_rate": 0.00018200124645541404, + "loss": 1.0345, + "step": 12288 + }, + { + "epoch": 0.31554706068323807, + "grad_norm": 0.8671875, + "learning_rate": 0.00018199869130722962, + "loss": 0.8734, + "step": 12289 + }, + { + "epoch": 0.3155727378791599, + "grad_norm": 0.7734375, + "learning_rate": 0.00018199613599562922, + "loss": 1.0474, + "step": 12290 + }, + { + "epoch": 0.31559841507508174, + "grad_norm": 0.88671875, + "learning_rate": 0.00018199358052061794, + "loss": 1.0754, + "step": 12291 + }, + { + "epoch": 0.3156240922710035, + "grad_norm": 0.765625, + "learning_rate": 0.00018199102488220085, + "loss": 1.1245, + "step": 12292 + }, + { + "epoch": 0.31564976946692536, + "grad_norm": 0.72265625, + "learning_rate": 0.00018198846908038308, + "loss": 1.0278, + "step": 12293 + }, + { + "epoch": 0.3156754466628472, + "grad_norm": 0.76953125, + "learning_rate": 0.00018198591311516967, + "loss": 0.8806, + "step": 12294 + }, + { + "epoch": 0.31570112385876903, + "grad_norm": 0.81640625, + "learning_rate": 0.00018198335698656575, + "loss": 1.0516, + "step": 12295 + }, + { + "epoch": 0.3157268010546908, + "grad_norm": 1.109375, + "learning_rate": 0.0001819808006945764, + "loss": 0.8994, + "step": 12296 + }, + { + "epoch": 0.31575247825061264, + "grad_norm": 0.7890625, + "learning_rate": 0.00018197824423920672, + "loss": 0.993, + "step": 12297 + }, + { + "epoch": 0.3157781554465345, + "grad_norm": 0.81640625, + "learning_rate": 0.0001819756876204618, + "loss": 1.0958, + "step": 12298 + }, + { + "epoch": 0.31580383264245626, + "grad_norm": 0.7578125, + "learning_rate": 0.00018197313083834676, + "loss": 0.9321, + "step": 12299 + }, + { + "epoch": 0.3158295098383781, + "grad_norm": 0.7578125, + "learning_rate": 0.00018197057389286668, + "loss": 0.9099, + "step": 12300 + }, + { + "epoch": 0.31585518703429993, + "grad_norm": 0.734375, + "learning_rate": 0.0001819680167840266, + "loss": 0.9931, + "step": 12301 + }, + { + "epoch": 0.3158808642302217, + "grad_norm": 0.85546875, + "learning_rate": 0.00018196545951183174, + "loss": 1.1365, + "step": 12302 + }, + { + "epoch": 0.31590654142614355, + "grad_norm": 0.80078125, + "learning_rate": 0.00018196290207628704, + "loss": 0.9841, + "step": 12303 + }, + { + "epoch": 0.3159322186220654, + "grad_norm": 0.76171875, + "learning_rate": 0.00018196034447739772, + "loss": 0.9353, + "step": 12304 + }, + { + "epoch": 0.3159578958179872, + "grad_norm": 0.78125, + "learning_rate": 0.00018195778671516885, + "loss": 1.0199, + "step": 12305 + }, + { + "epoch": 0.315983573013909, + "grad_norm": 0.8359375, + "learning_rate": 0.0001819552287896055, + "loss": 0.9614, + "step": 12306 + }, + { + "epoch": 0.31600925020983084, + "grad_norm": 0.8046875, + "learning_rate": 0.0001819526707007128, + "loss": 0.9273, + "step": 12307 + }, + { + "epoch": 0.3160349274057527, + "grad_norm": 1.1484375, + "learning_rate": 0.00018195011244849583, + "loss": 0.9345, + "step": 12308 + }, + { + "epoch": 0.31606060460167446, + "grad_norm": 0.82421875, + "learning_rate": 0.00018194755403295963, + "loss": 0.9264, + "step": 12309 + }, + { + "epoch": 0.3160862817975963, + "grad_norm": 0.69921875, + "learning_rate": 0.00018194499545410943, + "loss": 0.9707, + "step": 12310 + }, + { + "epoch": 0.3161119589935181, + "grad_norm": 0.73046875, + "learning_rate": 0.00018194243671195023, + "loss": 0.8922, + "step": 12311 + }, + { + "epoch": 0.3161376361894399, + "grad_norm": 0.80078125, + "learning_rate": 0.00018193987780648713, + "loss": 0.8555, + "step": 12312 + }, + { + "epoch": 0.31616331338536174, + "grad_norm": 0.80078125, + "learning_rate": 0.0001819373187377253, + "loss": 1.0531, + "step": 12313 + }, + { + "epoch": 0.3161889905812836, + "grad_norm": 0.71484375, + "learning_rate": 0.00018193475950566975, + "loss": 0.8944, + "step": 12314 + }, + { + "epoch": 0.3162146677772054, + "grad_norm": 0.75, + "learning_rate": 0.00018193220011032565, + "loss": 0.8899, + "step": 12315 + }, + { + "epoch": 0.3162403449731272, + "grad_norm": 0.8515625, + "learning_rate": 0.0001819296405516981, + "loss": 1.0279, + "step": 12316 + }, + { + "epoch": 0.31626602216904903, + "grad_norm": 0.78125, + "learning_rate": 0.00018192708082979213, + "loss": 0.9363, + "step": 12317 + }, + { + "epoch": 0.31629169936497087, + "grad_norm": 0.8125, + "learning_rate": 0.00018192452094461292, + "loss": 0.923, + "step": 12318 + }, + { + "epoch": 0.31631737656089265, + "grad_norm": 0.77734375, + "learning_rate": 0.00018192196089616554, + "loss": 0.9713, + "step": 12319 + }, + { + "epoch": 0.3163430537568145, + "grad_norm": 0.75390625, + "learning_rate": 0.00018191940068445507, + "loss": 0.8543, + "step": 12320 + }, + { + "epoch": 0.3163687309527363, + "grad_norm": 0.69921875, + "learning_rate": 0.00018191684030948664, + "loss": 0.9248, + "step": 12321 + }, + { + "epoch": 0.3163944081486581, + "grad_norm": 1.0234375, + "learning_rate": 0.00018191427977126535, + "loss": 0.9701, + "step": 12322 + }, + { + "epoch": 0.31642008534457994, + "grad_norm": 0.734375, + "learning_rate": 0.00018191171906979631, + "loss": 0.9492, + "step": 12323 + }, + { + "epoch": 0.3164457625405018, + "grad_norm": 0.85546875, + "learning_rate": 0.0001819091582050846, + "loss": 1.0724, + "step": 12324 + }, + { + "epoch": 0.3164714397364236, + "grad_norm": 0.7109375, + "learning_rate": 0.00018190659717713534, + "loss": 0.7762, + "step": 12325 + }, + { + "epoch": 0.3164971169323454, + "grad_norm": 0.7265625, + "learning_rate": 0.00018190403598595362, + "loss": 0.9104, + "step": 12326 + }, + { + "epoch": 0.3165227941282672, + "grad_norm": 0.7578125, + "learning_rate": 0.00018190147463154456, + "loss": 1.0211, + "step": 12327 + }, + { + "epoch": 0.31654847132418906, + "grad_norm": 0.80078125, + "learning_rate": 0.00018189891311391328, + "loss": 0.9393, + "step": 12328 + }, + { + "epoch": 0.31657414852011084, + "grad_norm": 0.78125, + "learning_rate": 0.00018189635143306485, + "loss": 0.9851, + "step": 12329 + }, + { + "epoch": 0.3165998257160327, + "grad_norm": 0.7734375, + "learning_rate": 0.0001818937895890044, + "loss": 0.9479, + "step": 12330 + }, + { + "epoch": 0.3166255029119545, + "grad_norm": 0.78125, + "learning_rate": 0.000181891227581737, + "loss": 0.9601, + "step": 12331 + }, + { + "epoch": 0.3166511801078763, + "grad_norm": 0.7734375, + "learning_rate": 0.0001818886654112678, + "loss": 0.9624, + "step": 12332 + }, + { + "epoch": 0.31667685730379813, + "grad_norm": 0.76953125, + "learning_rate": 0.00018188610307760187, + "loss": 0.93, + "step": 12333 + }, + { + "epoch": 0.31670253449971997, + "grad_norm": 0.84375, + "learning_rate": 0.00018188354058074437, + "loss": 0.8896, + "step": 12334 + }, + { + "epoch": 0.3167282116956418, + "grad_norm": 0.7265625, + "learning_rate": 0.00018188097792070032, + "loss": 0.9073, + "step": 12335 + }, + { + "epoch": 0.3167538888915636, + "grad_norm": 0.75, + "learning_rate": 0.00018187841509747493, + "loss": 0.9401, + "step": 12336 + }, + { + "epoch": 0.3167795660874854, + "grad_norm": 0.765625, + "learning_rate": 0.0001818758521110732, + "loss": 1.0074, + "step": 12337 + }, + { + "epoch": 0.31680524328340726, + "grad_norm": 0.78125, + "learning_rate": 0.00018187328896150033, + "loss": 1.0124, + "step": 12338 + }, + { + "epoch": 0.31683092047932904, + "grad_norm": 0.76953125, + "learning_rate": 0.00018187072564876137, + "loss": 0.8477, + "step": 12339 + }, + { + "epoch": 0.3168565976752509, + "grad_norm": 0.765625, + "learning_rate": 0.00018186816217286147, + "loss": 1.0134, + "step": 12340 + }, + { + "epoch": 0.3168822748711727, + "grad_norm": 0.81640625, + "learning_rate": 0.0001818655985338057, + "loss": 0.8935, + "step": 12341 + }, + { + "epoch": 0.3169079520670945, + "grad_norm": 0.7265625, + "learning_rate": 0.0001818630347315992, + "loss": 0.9683, + "step": 12342 + }, + { + "epoch": 0.3169336292630163, + "grad_norm": 0.78125, + "learning_rate": 0.00018186047076624705, + "loss": 0.937, + "step": 12343 + }, + { + "epoch": 0.31695930645893816, + "grad_norm": 0.73046875, + "learning_rate": 0.0001818579066377544, + "loss": 0.9263, + "step": 12344 + }, + { + "epoch": 0.31698498365486, + "grad_norm": 0.796875, + "learning_rate": 0.0001818553423461263, + "loss": 1.0076, + "step": 12345 + }, + { + "epoch": 0.3170106608507818, + "grad_norm": 0.82421875, + "learning_rate": 0.00018185277789136794, + "loss": 0.9484, + "step": 12346 + }, + { + "epoch": 0.3170363380467036, + "grad_norm": 0.765625, + "learning_rate": 0.00018185021327348438, + "loss": 0.9088, + "step": 12347 + }, + { + "epoch": 0.31706201524262545, + "grad_norm": 0.828125, + "learning_rate": 0.0001818476484924807, + "loss": 0.8781, + "step": 12348 + }, + { + "epoch": 0.31708769243854723, + "grad_norm": 0.92578125, + "learning_rate": 0.00018184508354836208, + "loss": 0.8225, + "step": 12349 + }, + { + "epoch": 0.31711336963446907, + "grad_norm": 0.78515625, + "learning_rate": 0.0001818425184411336, + "loss": 1.0035, + "step": 12350 + }, + { + "epoch": 0.3171390468303909, + "grad_norm": 0.8515625, + "learning_rate": 0.0001818399531708004, + "loss": 0.9629, + "step": 12351 + }, + { + "epoch": 0.3171647240263127, + "grad_norm": 2.390625, + "learning_rate": 0.0001818373877373675, + "loss": 0.9593, + "step": 12352 + }, + { + "epoch": 0.3171904012222345, + "grad_norm": 1.0390625, + "learning_rate": 0.00018183482214084013, + "loss": 0.9026, + "step": 12353 + }, + { + "epoch": 0.31721607841815636, + "grad_norm": 0.87890625, + "learning_rate": 0.00018183225638122332, + "loss": 1.0439, + "step": 12354 + }, + { + "epoch": 0.3172417556140782, + "grad_norm": 0.73046875, + "learning_rate": 0.00018182969045852225, + "loss": 1.0583, + "step": 12355 + }, + { + "epoch": 0.31726743280999997, + "grad_norm": 0.73828125, + "learning_rate": 0.000181827124372742, + "loss": 0.9467, + "step": 12356 + }, + { + "epoch": 0.3172931100059218, + "grad_norm": 0.76953125, + "learning_rate": 0.00018182455812388763, + "loss": 1.0018, + "step": 12357 + }, + { + "epoch": 0.31731878720184364, + "grad_norm": 0.8046875, + "learning_rate": 0.00018182199171196435, + "loss": 0.8256, + "step": 12358 + }, + { + "epoch": 0.3173444643977654, + "grad_norm": 0.8203125, + "learning_rate": 0.00018181942513697722, + "loss": 0.9589, + "step": 12359 + }, + { + "epoch": 0.31737014159368726, + "grad_norm": 0.76953125, + "learning_rate": 0.00018181685839893137, + "loss": 1.0566, + "step": 12360 + }, + { + "epoch": 0.3173958187896091, + "grad_norm": 0.78515625, + "learning_rate": 0.0001818142914978319, + "loss": 0.9494, + "step": 12361 + }, + { + "epoch": 0.3174214959855309, + "grad_norm": 0.73828125, + "learning_rate": 0.00018181172443368395, + "loss": 0.9694, + "step": 12362 + }, + { + "epoch": 0.3174471731814527, + "grad_norm": 0.796875, + "learning_rate": 0.0001818091572064926, + "loss": 0.8628, + "step": 12363 + }, + { + "epoch": 0.31747285037737455, + "grad_norm": 0.796875, + "learning_rate": 0.000181806589816263, + "loss": 0.9349, + "step": 12364 + }, + { + "epoch": 0.3174985275732964, + "grad_norm": 0.8203125, + "learning_rate": 0.0001818040222630003, + "loss": 0.9947, + "step": 12365 + }, + { + "epoch": 0.31752420476921817, + "grad_norm": 0.76171875, + "learning_rate": 0.00018180145454670953, + "loss": 0.8955, + "step": 12366 + }, + { + "epoch": 0.31754988196514, + "grad_norm": 0.85546875, + "learning_rate": 0.00018179888666739586, + "loss": 1.0467, + "step": 12367 + }, + { + "epoch": 0.31757555916106184, + "grad_norm": 0.76171875, + "learning_rate": 0.00018179631862506442, + "loss": 0.9769, + "step": 12368 + }, + { + "epoch": 0.3176012363569836, + "grad_norm": 1.515625, + "learning_rate": 0.00018179375041972023, + "loss": 1.0648, + "step": 12369 + }, + { + "epoch": 0.31762691355290545, + "grad_norm": 0.73828125, + "learning_rate": 0.00018179118205136855, + "loss": 0.8976, + "step": 12370 + }, + { + "epoch": 0.3176525907488273, + "grad_norm": 0.75, + "learning_rate": 0.00018178861352001442, + "loss": 1.0216, + "step": 12371 + }, + { + "epoch": 0.31767826794474907, + "grad_norm": 0.7421875, + "learning_rate": 0.00018178604482566298, + "loss": 1.0228, + "step": 12372 + }, + { + "epoch": 0.3177039451406709, + "grad_norm": 0.84765625, + "learning_rate": 0.00018178347596831934, + "loss": 1.0365, + "step": 12373 + }, + { + "epoch": 0.31772962233659274, + "grad_norm": 0.88671875, + "learning_rate": 0.0001817809069479886, + "loss": 0.9978, + "step": 12374 + }, + { + "epoch": 0.3177552995325146, + "grad_norm": 0.78125, + "learning_rate": 0.0001817783377646759, + "loss": 0.8859, + "step": 12375 + }, + { + "epoch": 0.31778097672843636, + "grad_norm": 0.78515625, + "learning_rate": 0.00018177576841838636, + "loss": 0.986, + "step": 12376 + }, + { + "epoch": 0.3178066539243582, + "grad_norm": 0.75390625, + "learning_rate": 0.0001817731989091251, + "loss": 0.8933, + "step": 12377 + }, + { + "epoch": 0.31783233112028003, + "grad_norm": 0.8046875, + "learning_rate": 0.00018177062923689725, + "loss": 0.8597, + "step": 12378 + }, + { + "epoch": 0.3178580083162018, + "grad_norm": 0.69140625, + "learning_rate": 0.00018176805940170793, + "loss": 0.8924, + "step": 12379 + }, + { + "epoch": 0.31788368551212365, + "grad_norm": 1.0390625, + "learning_rate": 0.00018176548940356226, + "loss": 1.0621, + "step": 12380 + }, + { + "epoch": 0.3179093627080455, + "grad_norm": 0.7890625, + "learning_rate": 0.00018176291924246532, + "loss": 0.8658, + "step": 12381 + }, + { + "epoch": 0.31793503990396726, + "grad_norm": 0.75, + "learning_rate": 0.0001817603489184223, + "loss": 0.839, + "step": 12382 + }, + { + "epoch": 0.3179607170998891, + "grad_norm": 0.76953125, + "learning_rate": 0.0001817577784314383, + "loss": 1.0689, + "step": 12383 + }, + { + "epoch": 0.31798639429581094, + "grad_norm": 0.92578125, + "learning_rate": 0.0001817552077815184, + "loss": 0.997, + "step": 12384 + }, + { + "epoch": 0.3180120714917328, + "grad_norm": 0.83203125, + "learning_rate": 0.00018175263696866778, + "loss": 0.9055, + "step": 12385 + }, + { + "epoch": 0.31803774868765455, + "grad_norm": 0.71484375, + "learning_rate": 0.00018175006599289152, + "loss": 0.9134, + "step": 12386 + }, + { + "epoch": 0.3180634258835764, + "grad_norm": 0.84375, + "learning_rate": 0.00018174749485419476, + "loss": 1.0294, + "step": 12387 + }, + { + "epoch": 0.3180891030794982, + "grad_norm": 0.77734375, + "learning_rate": 0.00018174492355258262, + "loss": 0.9101, + "step": 12388 + }, + { + "epoch": 0.31811478027542, + "grad_norm": 0.78125, + "learning_rate": 0.00018174235208806024, + "loss": 0.8014, + "step": 12389 + }, + { + "epoch": 0.31814045747134184, + "grad_norm": 1.125, + "learning_rate": 0.00018173978046063277, + "loss": 1.1605, + "step": 12390 + }, + { + "epoch": 0.3181661346672637, + "grad_norm": 0.7890625, + "learning_rate": 0.00018173720867030525, + "loss": 0.9692, + "step": 12391 + }, + { + "epoch": 0.31819181186318546, + "grad_norm": 0.84765625, + "learning_rate": 0.0001817346367170829, + "loss": 1.0285, + "step": 12392 + }, + { + "epoch": 0.3182174890591073, + "grad_norm": 0.703125, + "learning_rate": 0.00018173206460097077, + "loss": 0.7621, + "step": 12393 + }, + { + "epoch": 0.31824316625502913, + "grad_norm": 0.8046875, + "learning_rate": 0.00018172949232197404, + "loss": 0.9969, + "step": 12394 + }, + { + "epoch": 0.3182688434509509, + "grad_norm": 0.82421875, + "learning_rate": 0.0001817269198800978, + "loss": 0.9829, + "step": 12395 + }, + { + "epoch": 0.31829452064687275, + "grad_norm": 0.7578125, + "learning_rate": 0.0001817243472753472, + "loss": 0.892, + "step": 12396 + }, + { + "epoch": 0.3183201978427946, + "grad_norm": 0.77734375, + "learning_rate": 0.00018172177450772735, + "loss": 1.0831, + "step": 12397 + }, + { + "epoch": 0.3183458750387164, + "grad_norm": 0.74609375, + "learning_rate": 0.0001817192015772434, + "loss": 0.9352, + "step": 12398 + }, + { + "epoch": 0.3183715522346382, + "grad_norm": 0.7578125, + "learning_rate": 0.00018171662848390045, + "loss": 0.8059, + "step": 12399 + }, + { + "epoch": 0.31839722943056004, + "grad_norm": 0.859375, + "learning_rate": 0.00018171405522770365, + "loss": 0.8675, + "step": 12400 + }, + { + "epoch": 0.31842290662648187, + "grad_norm": 0.83984375, + "learning_rate": 0.0001817114818086581, + "loss": 0.9671, + "step": 12401 + }, + { + "epoch": 0.31844858382240365, + "grad_norm": 0.74609375, + "learning_rate": 0.00018170890822676898, + "loss": 0.9682, + "step": 12402 + }, + { + "epoch": 0.3184742610183255, + "grad_norm": 0.78125, + "learning_rate": 0.0001817063344820414, + "loss": 0.8738, + "step": 12403 + }, + { + "epoch": 0.3184999382142473, + "grad_norm": 0.82421875, + "learning_rate": 0.00018170376057448042, + "loss": 0.9658, + "step": 12404 + }, + { + "epoch": 0.3185256154101691, + "grad_norm": 0.77734375, + "learning_rate": 0.00018170118650409124, + "loss": 1.011, + "step": 12405 + }, + { + "epoch": 0.31855129260609094, + "grad_norm": 0.74609375, + "learning_rate": 0.00018169861227087898, + "loss": 0.9209, + "step": 12406 + }, + { + "epoch": 0.3185769698020128, + "grad_norm": 0.7734375, + "learning_rate": 0.00018169603787484875, + "loss": 0.9891, + "step": 12407 + }, + { + "epoch": 0.3186026469979346, + "grad_norm": 0.7734375, + "learning_rate": 0.00018169346331600573, + "loss": 1.083, + "step": 12408 + }, + { + "epoch": 0.3186283241938564, + "grad_norm": 0.8046875, + "learning_rate": 0.000181690888594355, + "loss": 1.0344, + "step": 12409 + }, + { + "epoch": 0.31865400138977823, + "grad_norm": 0.69921875, + "learning_rate": 0.00018168831370990172, + "loss": 0.92, + "step": 12410 + }, + { + "epoch": 0.31867967858570007, + "grad_norm": 0.8046875, + "learning_rate": 0.000181685738662651, + "loss": 0.9204, + "step": 12411 + }, + { + "epoch": 0.31870535578162185, + "grad_norm": 0.76171875, + "learning_rate": 0.00018168316345260798, + "loss": 0.9036, + "step": 12412 + }, + { + "epoch": 0.3187310329775437, + "grad_norm": 0.79296875, + "learning_rate": 0.00018168058807977778, + "loss": 1.0674, + "step": 12413 + }, + { + "epoch": 0.3187567101734655, + "grad_norm": 0.7734375, + "learning_rate": 0.00018167801254416558, + "loss": 0.9946, + "step": 12414 + }, + { + "epoch": 0.3187823873693873, + "grad_norm": 0.796875, + "learning_rate": 0.00018167543684577647, + "loss": 1.0751, + "step": 12415 + }, + { + "epoch": 0.31880806456530913, + "grad_norm": 0.73828125, + "learning_rate": 0.00018167286098461557, + "loss": 0.9598, + "step": 12416 + }, + { + "epoch": 0.31883374176123097, + "grad_norm": 0.8046875, + "learning_rate": 0.00018167028496068806, + "loss": 0.908, + "step": 12417 + }, + { + "epoch": 0.3188594189571528, + "grad_norm": 0.8046875, + "learning_rate": 0.00018166770877399904, + "loss": 0.9959, + "step": 12418 + }, + { + "epoch": 0.3188850961530746, + "grad_norm": 0.7265625, + "learning_rate": 0.00018166513242455365, + "loss": 0.9434, + "step": 12419 + }, + { + "epoch": 0.3189107733489964, + "grad_norm": 0.87890625, + "learning_rate": 0.00018166255591235702, + "loss": 0.9644, + "step": 12420 + }, + { + "epoch": 0.31893645054491826, + "grad_norm": 0.79296875, + "learning_rate": 0.0001816599792374143, + "loss": 1.0388, + "step": 12421 + }, + { + "epoch": 0.31896212774084004, + "grad_norm": 0.7578125, + "learning_rate": 0.00018165740239973061, + "loss": 0.8118, + "step": 12422 + }, + { + "epoch": 0.3189878049367619, + "grad_norm": 0.74609375, + "learning_rate": 0.0001816548253993111, + "loss": 1.0463, + "step": 12423 + }, + { + "epoch": 0.3190134821326837, + "grad_norm": 0.7578125, + "learning_rate": 0.0001816522482361609, + "loss": 0.7914, + "step": 12424 + }, + { + "epoch": 0.3190391593286055, + "grad_norm": 0.83203125, + "learning_rate": 0.00018164967091028515, + "loss": 0.8936, + "step": 12425 + }, + { + "epoch": 0.31906483652452733, + "grad_norm": 0.77734375, + "learning_rate": 0.00018164709342168898, + "loss": 0.8595, + "step": 12426 + }, + { + "epoch": 0.31909051372044916, + "grad_norm": 0.7578125, + "learning_rate": 0.00018164451577037754, + "loss": 1.0444, + "step": 12427 + }, + { + "epoch": 0.319116190916371, + "grad_norm": 0.828125, + "learning_rate": 0.0001816419379563559, + "loss": 1.1864, + "step": 12428 + }, + { + "epoch": 0.3191418681122928, + "grad_norm": 0.78515625, + "learning_rate": 0.00018163935997962934, + "loss": 0.846, + "step": 12429 + }, + { + "epoch": 0.3191675453082146, + "grad_norm": 1.0390625, + "learning_rate": 0.00018163678184020284, + "loss": 0.907, + "step": 12430 + }, + { + "epoch": 0.31919322250413645, + "grad_norm": 0.734375, + "learning_rate": 0.00018163420353808161, + "loss": 0.8933, + "step": 12431 + }, + { + "epoch": 0.31921889970005823, + "grad_norm": 0.73046875, + "learning_rate": 0.00018163162507327084, + "loss": 0.9965, + "step": 12432 + }, + { + "epoch": 0.31924457689598007, + "grad_norm": 0.78515625, + "learning_rate": 0.00018162904644577558, + "loss": 1.0662, + "step": 12433 + }, + { + "epoch": 0.3192702540919019, + "grad_norm": 0.89453125, + "learning_rate": 0.00018162646765560098, + "loss": 0.983, + "step": 12434 + }, + { + "epoch": 0.3192959312878237, + "grad_norm": 0.76953125, + "learning_rate": 0.00018162388870275224, + "loss": 0.8563, + "step": 12435 + }, + { + "epoch": 0.3193216084837455, + "grad_norm": 0.86328125, + "learning_rate": 0.00018162130958723446, + "loss": 0.9875, + "step": 12436 + }, + { + "epoch": 0.31934728567966736, + "grad_norm": 0.7734375, + "learning_rate": 0.00018161873030905277, + "loss": 1.0514, + "step": 12437 + }, + { + "epoch": 0.3193729628755892, + "grad_norm": 0.77734375, + "learning_rate": 0.00018161615086821232, + "loss": 0.9615, + "step": 12438 + }, + { + "epoch": 0.319398640071511, + "grad_norm": 0.86328125, + "learning_rate": 0.00018161357126471827, + "loss": 0.923, + "step": 12439 + }, + { + "epoch": 0.3194243172674328, + "grad_norm": 0.83984375, + "learning_rate": 0.00018161099149857575, + "loss": 0.9569, + "step": 12440 + }, + { + "epoch": 0.31944999446335465, + "grad_norm": 0.80078125, + "learning_rate": 0.00018160841156978987, + "loss": 1.0023, + "step": 12441 + }, + { + "epoch": 0.31947567165927643, + "grad_norm": 0.76171875, + "learning_rate": 0.0001816058314783658, + "loss": 0.8758, + "step": 12442 + }, + { + "epoch": 0.31950134885519826, + "grad_norm": 0.7578125, + "learning_rate": 0.0001816032512243087, + "loss": 1.0105, + "step": 12443 + }, + { + "epoch": 0.3195270260511201, + "grad_norm": 0.7109375, + "learning_rate": 0.00018160067080762366, + "loss": 0.9821, + "step": 12444 + }, + { + "epoch": 0.3195527032470419, + "grad_norm": 0.83984375, + "learning_rate": 0.00018159809022831589, + "loss": 1.0348, + "step": 12445 + }, + { + "epoch": 0.3195783804429637, + "grad_norm": 0.734375, + "learning_rate": 0.0001815955094863905, + "loss": 1.0157, + "step": 12446 + }, + { + "epoch": 0.31960405763888555, + "grad_norm": 0.80078125, + "learning_rate": 0.00018159292858185258, + "loss": 0.8892, + "step": 12447 + }, + { + "epoch": 0.3196297348348074, + "grad_norm": 0.765625, + "learning_rate": 0.0001815903475147074, + "loss": 0.8917, + "step": 12448 + }, + { + "epoch": 0.31965541203072917, + "grad_norm": 0.77734375, + "learning_rate": 0.00018158776628495995, + "loss": 1.0006, + "step": 12449 + }, + { + "epoch": 0.319681089226651, + "grad_norm": 0.765625, + "learning_rate": 0.00018158518489261546, + "loss": 1.0174, + "step": 12450 + }, + { + "epoch": 0.31970676642257284, + "grad_norm": 0.76953125, + "learning_rate": 0.00018158260333767912, + "loss": 0.9286, + "step": 12451 + }, + { + "epoch": 0.3197324436184946, + "grad_norm": 0.8359375, + "learning_rate": 0.00018158002162015598, + "loss": 0.888, + "step": 12452 + }, + { + "epoch": 0.31975812081441646, + "grad_norm": 0.8125, + "learning_rate": 0.00018157743974005124, + "loss": 1.0473, + "step": 12453 + }, + { + "epoch": 0.3197837980103383, + "grad_norm": 0.75390625, + "learning_rate": 0.00018157485769737002, + "loss": 0.9995, + "step": 12454 + }, + { + "epoch": 0.3198094752062601, + "grad_norm": 0.8046875, + "learning_rate": 0.0001815722754921175, + "loss": 0.9662, + "step": 12455 + }, + { + "epoch": 0.3198351524021819, + "grad_norm": 0.765625, + "learning_rate": 0.00018156969312429876, + "loss": 0.98, + "step": 12456 + }, + { + "epoch": 0.31986082959810375, + "grad_norm": 0.73046875, + "learning_rate": 0.00018156711059391905, + "loss": 1.0004, + "step": 12457 + }, + { + "epoch": 0.3198865067940256, + "grad_norm": 0.80859375, + "learning_rate": 0.00018156452790098342, + "loss": 1.0664, + "step": 12458 + }, + { + "epoch": 0.31991218398994736, + "grad_norm": 0.734375, + "learning_rate": 0.00018156194504549705, + "loss": 0.8832, + "step": 12459 + }, + { + "epoch": 0.3199378611858692, + "grad_norm": 0.78125, + "learning_rate": 0.0001815593620274651, + "loss": 0.8671, + "step": 12460 + }, + { + "epoch": 0.31996353838179104, + "grad_norm": 0.859375, + "learning_rate": 0.00018155677884689272, + "loss": 1.0121, + "step": 12461 + }, + { + "epoch": 0.3199892155777128, + "grad_norm": 0.7578125, + "learning_rate": 0.00018155419550378504, + "loss": 0.9822, + "step": 12462 + }, + { + "epoch": 0.32001489277363465, + "grad_norm": 0.79296875, + "learning_rate": 0.0001815516119981472, + "loss": 0.9441, + "step": 12463 + }, + { + "epoch": 0.3200405699695565, + "grad_norm": 0.80859375, + "learning_rate": 0.00018154902832998436, + "loss": 1.0079, + "step": 12464 + }, + { + "epoch": 0.32006624716547827, + "grad_norm": 0.74609375, + "learning_rate": 0.0001815464444993017, + "loss": 0.996, + "step": 12465 + }, + { + "epoch": 0.3200919243614001, + "grad_norm": 0.8515625, + "learning_rate": 0.00018154386050610433, + "loss": 0.8103, + "step": 12466 + }, + { + "epoch": 0.32011760155732194, + "grad_norm": 0.89453125, + "learning_rate": 0.00018154127635039742, + "loss": 1.1548, + "step": 12467 + }, + { + "epoch": 0.3201432787532438, + "grad_norm": 0.8046875, + "learning_rate": 0.00018153869203218608, + "loss": 0.9151, + "step": 12468 + }, + { + "epoch": 0.32016895594916556, + "grad_norm": 0.8515625, + "learning_rate": 0.00018153610755147553, + "loss": 1.0336, + "step": 12469 + }, + { + "epoch": 0.3201946331450874, + "grad_norm": 0.76171875, + "learning_rate": 0.00018153352290827085, + "loss": 0.9681, + "step": 12470 + }, + { + "epoch": 0.32022031034100923, + "grad_norm": 0.83984375, + "learning_rate": 0.00018153093810257723, + "loss": 1.0369, + "step": 12471 + }, + { + "epoch": 0.320245987536931, + "grad_norm": 0.7734375, + "learning_rate": 0.00018152835313439987, + "loss": 1.0762, + "step": 12472 + }, + { + "epoch": 0.32027166473285285, + "grad_norm": 0.71875, + "learning_rate": 0.00018152576800374378, + "loss": 0.9926, + "step": 12473 + }, + { + "epoch": 0.3202973419287747, + "grad_norm": 0.72265625, + "learning_rate": 0.00018152318271061425, + "loss": 0.9451, + "step": 12474 + }, + { + "epoch": 0.32032301912469646, + "grad_norm": 0.734375, + "learning_rate": 0.00018152059725501637, + "loss": 1.0864, + "step": 12475 + }, + { + "epoch": 0.3203486963206183, + "grad_norm": 0.76953125, + "learning_rate": 0.00018151801163695532, + "loss": 0.8794, + "step": 12476 + }, + { + "epoch": 0.32037437351654013, + "grad_norm": 0.7890625, + "learning_rate": 0.00018151542585643623, + "loss": 0.851, + "step": 12477 + }, + { + "epoch": 0.32040005071246197, + "grad_norm": 0.7421875, + "learning_rate": 0.00018151283991346424, + "loss": 0.868, + "step": 12478 + }, + { + "epoch": 0.32042572790838375, + "grad_norm": 0.76171875, + "learning_rate": 0.00018151025380804453, + "loss": 0.9065, + "step": 12479 + }, + { + "epoch": 0.3204514051043056, + "grad_norm": 0.7734375, + "learning_rate": 0.00018150766754018228, + "loss": 0.9447, + "step": 12480 + }, + { + "epoch": 0.3204770823002274, + "grad_norm": 0.77734375, + "learning_rate": 0.00018150508110988255, + "loss": 0.8477, + "step": 12481 + }, + { + "epoch": 0.3205027594961492, + "grad_norm": 0.76953125, + "learning_rate": 0.0001815024945171506, + "loss": 1.0083, + "step": 12482 + }, + { + "epoch": 0.32052843669207104, + "grad_norm": 0.9453125, + "learning_rate": 0.00018149990776199153, + "loss": 0.8665, + "step": 12483 + }, + { + "epoch": 0.3205541138879929, + "grad_norm": 0.73046875, + "learning_rate": 0.0001814973208444105, + "loss": 0.9048, + "step": 12484 + }, + { + "epoch": 0.32057979108391466, + "grad_norm": 0.734375, + "learning_rate": 0.00018149473376441268, + "loss": 0.9066, + "step": 12485 + }, + { + "epoch": 0.3206054682798365, + "grad_norm": 0.8046875, + "learning_rate": 0.0001814921465220032, + "loss": 0.9139, + "step": 12486 + }, + { + "epoch": 0.32063114547575833, + "grad_norm": 0.828125, + "learning_rate": 0.00018148955911718726, + "loss": 0.959, + "step": 12487 + }, + { + "epoch": 0.32065682267168016, + "grad_norm": 0.8828125, + "learning_rate": 0.00018148697154997, + "loss": 0.869, + "step": 12488 + }, + { + "epoch": 0.32068249986760194, + "grad_norm": 0.796875, + "learning_rate": 0.0001814843838203565, + "loss": 0.8523, + "step": 12489 + }, + { + "epoch": 0.3207081770635238, + "grad_norm": 0.8828125, + "learning_rate": 0.00018148179592835203, + "loss": 1.1048, + "step": 12490 + }, + { + "epoch": 0.3207338542594456, + "grad_norm": 0.83203125, + "learning_rate": 0.00018147920787396168, + "loss": 0.9259, + "step": 12491 + }, + { + "epoch": 0.3207595314553674, + "grad_norm": 0.78125, + "learning_rate": 0.00018147661965719068, + "loss": 1.0471, + "step": 12492 + }, + { + "epoch": 0.32078520865128923, + "grad_norm": 0.8125, + "learning_rate": 0.0001814740312780441, + "loss": 0.9961, + "step": 12493 + }, + { + "epoch": 0.32081088584721107, + "grad_norm": 0.73828125, + "learning_rate": 0.00018147144273652713, + "loss": 0.8476, + "step": 12494 + }, + { + "epoch": 0.32083656304313285, + "grad_norm": 0.78125, + "learning_rate": 0.00018146885403264493, + "loss": 0.9885, + "step": 12495 + }, + { + "epoch": 0.3208622402390547, + "grad_norm": 0.84765625, + "learning_rate": 0.0001814662651664027, + "loss": 0.9756, + "step": 12496 + }, + { + "epoch": 0.3208879174349765, + "grad_norm": 0.79296875, + "learning_rate": 0.00018146367613780553, + "loss": 1.0221, + "step": 12497 + }, + { + "epoch": 0.32091359463089836, + "grad_norm": 0.828125, + "learning_rate": 0.00018146108694685862, + "loss": 1.0173, + "step": 12498 + }, + { + "epoch": 0.32093927182682014, + "grad_norm": 0.82421875, + "learning_rate": 0.0001814584975935671, + "loss": 0.8554, + "step": 12499 + }, + { + "epoch": 0.320964949022742, + "grad_norm": 0.7734375, + "learning_rate": 0.00018145590807793619, + "loss": 0.866, + "step": 12500 + }, + { + "epoch": 0.3209906262186638, + "grad_norm": 0.78125, + "learning_rate": 0.00018145331839997098, + "loss": 0.9605, + "step": 12501 + }, + { + "epoch": 0.3210163034145856, + "grad_norm": 0.75390625, + "learning_rate": 0.0001814507285596767, + "loss": 0.8885, + "step": 12502 + }, + { + "epoch": 0.3210419806105074, + "grad_norm": 0.80859375, + "learning_rate": 0.00018144813855705845, + "loss": 1.0353, + "step": 12503 + }, + { + "epoch": 0.32106765780642926, + "grad_norm": 0.75390625, + "learning_rate": 0.0001814455483921214, + "loss": 0.8513, + "step": 12504 + }, + { + "epoch": 0.32109333500235104, + "grad_norm": 0.80078125, + "learning_rate": 0.00018144295806487074, + "loss": 1.0473, + "step": 12505 + }, + { + "epoch": 0.3211190121982729, + "grad_norm": 0.765625, + "learning_rate": 0.00018144036757531166, + "loss": 0.9133, + "step": 12506 + }, + { + "epoch": 0.3211446893941947, + "grad_norm": 0.7734375, + "learning_rate": 0.00018143777692344924, + "loss": 1.1101, + "step": 12507 + }, + { + "epoch": 0.32117036659011655, + "grad_norm": 0.76953125, + "learning_rate": 0.0001814351861092887, + "loss": 1.0163, + "step": 12508 + }, + { + "epoch": 0.32119604378603833, + "grad_norm": 0.7578125, + "learning_rate": 0.0001814325951328352, + "loss": 0.8843, + "step": 12509 + }, + { + "epoch": 0.32122172098196017, + "grad_norm": 0.75390625, + "learning_rate": 0.00018143000399409388, + "loss": 0.9628, + "step": 12510 + }, + { + "epoch": 0.321247398177882, + "grad_norm": 0.7734375, + "learning_rate": 0.00018142741269306994, + "loss": 0.9133, + "step": 12511 + }, + { + "epoch": 0.3212730753738038, + "grad_norm": 0.80078125, + "learning_rate": 0.0001814248212297685, + "loss": 0.8098, + "step": 12512 + }, + { + "epoch": 0.3212987525697256, + "grad_norm": 0.76953125, + "learning_rate": 0.00018142222960419475, + "loss": 0.9011, + "step": 12513 + }, + { + "epoch": 0.32132442976564746, + "grad_norm": 0.7890625, + "learning_rate": 0.00018141963781635387, + "loss": 0.9587, + "step": 12514 + }, + { + "epoch": 0.32135010696156924, + "grad_norm": 0.75390625, + "learning_rate": 0.00018141704586625097, + "loss": 0.8725, + "step": 12515 + }, + { + "epoch": 0.3213757841574911, + "grad_norm": 0.80078125, + "learning_rate": 0.00018141445375389128, + "loss": 0.878, + "step": 12516 + }, + { + "epoch": 0.3214014613534129, + "grad_norm": 0.80078125, + "learning_rate": 0.00018141186147927995, + "loss": 0.9245, + "step": 12517 + }, + { + "epoch": 0.32142713854933475, + "grad_norm": 0.8515625, + "learning_rate": 0.0001814092690424221, + "loss": 0.9871, + "step": 12518 + }, + { + "epoch": 0.3214528157452565, + "grad_norm": 0.8046875, + "learning_rate": 0.00018140667644332294, + "loss": 0.866, + "step": 12519 + }, + { + "epoch": 0.32147849294117836, + "grad_norm": 0.8125, + "learning_rate": 0.00018140408368198764, + "loss": 0.9791, + "step": 12520 + }, + { + "epoch": 0.3215041701371002, + "grad_norm": 0.765625, + "learning_rate": 0.00018140149075842135, + "loss": 0.9033, + "step": 12521 + }, + { + "epoch": 0.321529847333022, + "grad_norm": 0.76171875, + "learning_rate": 0.00018139889767262922, + "loss": 1.0346, + "step": 12522 + }, + { + "epoch": 0.3215555245289438, + "grad_norm": 0.796875, + "learning_rate": 0.0001813963044246165, + "loss": 1.0945, + "step": 12523 + }, + { + "epoch": 0.32158120172486565, + "grad_norm": 0.7421875, + "learning_rate": 0.00018139371101438822, + "loss": 1.0326, + "step": 12524 + }, + { + "epoch": 0.32160687892078743, + "grad_norm": 0.796875, + "learning_rate": 0.00018139111744194968, + "loss": 0.8779, + "step": 12525 + }, + { + "epoch": 0.32163255611670927, + "grad_norm": 0.765625, + "learning_rate": 0.00018138852370730597, + "loss": 0.9855, + "step": 12526 + }, + { + "epoch": 0.3216582333126311, + "grad_norm": 0.7421875, + "learning_rate": 0.00018138592981046227, + "loss": 0.8691, + "step": 12527 + }, + { + "epoch": 0.32168391050855294, + "grad_norm": 0.86328125, + "learning_rate": 0.0001813833357514238, + "loss": 1.1579, + "step": 12528 + }, + { + "epoch": 0.3217095877044747, + "grad_norm": 0.77734375, + "learning_rate": 0.00018138074153019567, + "loss": 1.0426, + "step": 12529 + }, + { + "epoch": 0.32173526490039656, + "grad_norm": 0.765625, + "learning_rate": 0.00018137814714678306, + "loss": 0.9052, + "step": 12530 + }, + { + "epoch": 0.3217609420963184, + "grad_norm": 0.82421875, + "learning_rate": 0.00018137555260119118, + "loss": 0.8618, + "step": 12531 + }, + { + "epoch": 0.3217866192922402, + "grad_norm": 0.80078125, + "learning_rate": 0.00018137295789342518, + "loss": 0.9234, + "step": 12532 + }, + { + "epoch": 0.321812296488162, + "grad_norm": 0.76953125, + "learning_rate": 0.00018137036302349024, + "loss": 0.8504, + "step": 12533 + }, + { + "epoch": 0.32183797368408384, + "grad_norm": 0.75390625, + "learning_rate": 0.00018136776799139147, + "loss": 1.0938, + "step": 12534 + }, + { + "epoch": 0.3218636508800056, + "grad_norm": 0.76953125, + "learning_rate": 0.00018136517279713411, + "loss": 0.9881, + "step": 12535 + }, + { + "epoch": 0.32188932807592746, + "grad_norm": 0.83203125, + "learning_rate": 0.00018136257744072332, + "loss": 0.9786, + "step": 12536 + }, + { + "epoch": 0.3219150052718493, + "grad_norm": 0.7578125, + "learning_rate": 0.00018135998192216426, + "loss": 0.931, + "step": 12537 + }, + { + "epoch": 0.32194068246777113, + "grad_norm": 0.6796875, + "learning_rate": 0.0001813573862414621, + "loss": 0.8667, + "step": 12538 + }, + { + "epoch": 0.3219663596636929, + "grad_norm": 0.78125, + "learning_rate": 0.00018135479039862199, + "loss": 1.019, + "step": 12539 + }, + { + "epoch": 0.32199203685961475, + "grad_norm": 0.765625, + "learning_rate": 0.00018135219439364918, + "loss": 1.0066, + "step": 12540 + }, + { + "epoch": 0.3220177140555366, + "grad_norm": 0.79296875, + "learning_rate": 0.00018134959822654876, + "loss": 0.8719, + "step": 12541 + }, + { + "epoch": 0.32204339125145837, + "grad_norm": 1.046875, + "learning_rate": 0.00018134700189732596, + "loss": 0.922, + "step": 12542 + }, + { + "epoch": 0.3220690684473802, + "grad_norm": 0.80078125, + "learning_rate": 0.00018134440540598589, + "loss": 1.038, + "step": 12543 + }, + { + "epoch": 0.32209474564330204, + "grad_norm": 0.75390625, + "learning_rate": 0.0001813418087525338, + "loss": 0.9761, + "step": 12544 + }, + { + "epoch": 0.3221204228392238, + "grad_norm": 0.91015625, + "learning_rate": 0.00018133921193697484, + "loss": 1.0221, + "step": 12545 + }, + { + "epoch": 0.32214610003514565, + "grad_norm": 0.72265625, + "learning_rate": 0.00018133661495931417, + "loss": 1.2125, + "step": 12546 + }, + { + "epoch": 0.3221717772310675, + "grad_norm": 0.79296875, + "learning_rate": 0.00018133401781955698, + "loss": 0.9183, + "step": 12547 + }, + { + "epoch": 0.3221974544269893, + "grad_norm": 0.72265625, + "learning_rate": 0.00018133142051770843, + "loss": 1.0074, + "step": 12548 + }, + { + "epoch": 0.3222231316229111, + "grad_norm": 0.79296875, + "learning_rate": 0.00018132882305377373, + "loss": 0.9911, + "step": 12549 + }, + { + "epoch": 0.32224880881883294, + "grad_norm": 0.79296875, + "learning_rate": 0.00018132622542775802, + "loss": 1.0606, + "step": 12550 + }, + { + "epoch": 0.3222744860147548, + "grad_norm": 0.859375, + "learning_rate": 0.00018132362763966645, + "loss": 1.0381, + "step": 12551 + }, + { + "epoch": 0.32230016321067656, + "grad_norm": 0.79296875, + "learning_rate": 0.0001813210296895043, + "loss": 0.9891, + "step": 12552 + }, + { + "epoch": 0.3223258404065984, + "grad_norm": 0.84375, + "learning_rate": 0.00018131843157727663, + "loss": 1.1048, + "step": 12553 + }, + { + "epoch": 0.32235151760252023, + "grad_norm": 0.78125, + "learning_rate": 0.00018131583330298869, + "loss": 0.9142, + "step": 12554 + }, + { + "epoch": 0.322377194798442, + "grad_norm": 0.8046875, + "learning_rate": 0.00018131323486664567, + "loss": 1.0701, + "step": 12555 + }, + { + "epoch": 0.32240287199436385, + "grad_norm": 0.7578125, + "learning_rate": 0.0001813106362682527, + "loss": 0.9146, + "step": 12556 + }, + { + "epoch": 0.3224285491902857, + "grad_norm": 0.8515625, + "learning_rate": 0.00018130803750781492, + "loss": 1.0512, + "step": 12557 + }, + { + "epoch": 0.3224542263862075, + "grad_norm": 0.78515625, + "learning_rate": 0.00018130543858533763, + "loss": 0.9729, + "step": 12558 + }, + { + "epoch": 0.3224799035821293, + "grad_norm": 0.7421875, + "learning_rate": 0.00018130283950082594, + "loss": 0.8668, + "step": 12559 + }, + { + "epoch": 0.32250558077805114, + "grad_norm": 0.72265625, + "learning_rate": 0.00018130024025428502, + "loss": 0.913, + "step": 12560 + }, + { + "epoch": 0.322531257973973, + "grad_norm": 0.76953125, + "learning_rate": 0.00018129764084572005, + "loss": 0.9422, + "step": 12561 + }, + { + "epoch": 0.32255693516989475, + "grad_norm": 0.7421875, + "learning_rate": 0.00018129504127513627, + "loss": 0.9638, + "step": 12562 + }, + { + "epoch": 0.3225826123658166, + "grad_norm": 0.8046875, + "learning_rate": 0.00018129244154253878, + "loss": 0.9924, + "step": 12563 + }, + { + "epoch": 0.3226082895617384, + "grad_norm": 0.73046875, + "learning_rate": 0.0001812898416479328, + "loss": 1.0693, + "step": 12564 + }, + { + "epoch": 0.3226339667576602, + "grad_norm": 0.765625, + "learning_rate": 0.0001812872415913235, + "loss": 1.0905, + "step": 12565 + }, + { + "epoch": 0.32265964395358204, + "grad_norm": 0.9296875, + "learning_rate": 0.0001812846413727161, + "loss": 1.0741, + "step": 12566 + }, + { + "epoch": 0.3226853211495039, + "grad_norm": 0.76953125, + "learning_rate": 0.00018128204099211574, + "loss": 1.0269, + "step": 12567 + }, + { + "epoch": 0.3227109983454257, + "grad_norm": 0.8046875, + "learning_rate": 0.00018127944044952764, + "loss": 0.9105, + "step": 12568 + }, + { + "epoch": 0.3227366755413475, + "grad_norm": 0.83984375, + "learning_rate": 0.0001812768397449569, + "loss": 1.0097, + "step": 12569 + }, + { + "epoch": 0.32276235273726933, + "grad_norm": 0.7734375, + "learning_rate": 0.00018127423887840877, + "loss": 0.9531, + "step": 12570 + }, + { + "epoch": 0.32278802993319117, + "grad_norm": 0.8125, + "learning_rate": 0.00018127163784988846, + "loss": 0.9742, + "step": 12571 + }, + { + "epoch": 0.32281370712911295, + "grad_norm": 0.83984375, + "learning_rate": 0.00018126903665940112, + "loss": 0.9304, + "step": 12572 + }, + { + "epoch": 0.3228393843250348, + "grad_norm": 0.81640625, + "learning_rate": 0.00018126643530695188, + "loss": 0.8961, + "step": 12573 + }, + { + "epoch": 0.3228650615209566, + "grad_norm": 0.7890625, + "learning_rate": 0.00018126383379254602, + "loss": 0.838, + "step": 12574 + }, + { + "epoch": 0.3228907387168784, + "grad_norm": 0.78125, + "learning_rate": 0.00018126123211618865, + "loss": 1.0739, + "step": 12575 + }, + { + "epoch": 0.32291641591280024, + "grad_norm": 0.8046875, + "learning_rate": 0.000181258630277885, + "loss": 0.8599, + "step": 12576 + }, + { + "epoch": 0.3229420931087221, + "grad_norm": 0.796875, + "learning_rate": 0.00018125602827764027, + "loss": 0.9392, + "step": 12577 + }, + { + "epoch": 0.3229677703046439, + "grad_norm": 0.703125, + "learning_rate": 0.00018125342611545958, + "loss": 0.9237, + "step": 12578 + }, + { + "epoch": 0.3229934475005657, + "grad_norm": 0.81640625, + "learning_rate": 0.00018125082379134814, + "loss": 0.976, + "step": 12579 + }, + { + "epoch": 0.3230191246964875, + "grad_norm": 0.80859375, + "learning_rate": 0.0001812482213053112, + "loss": 0.9815, + "step": 12580 + }, + { + "epoch": 0.32304480189240936, + "grad_norm": 0.76171875, + "learning_rate": 0.0001812456186573538, + "loss": 0.9713, + "step": 12581 + }, + { + "epoch": 0.32307047908833114, + "grad_norm": 0.875, + "learning_rate": 0.0001812430158474813, + "loss": 1.0223, + "step": 12582 + }, + { + "epoch": 0.323096156284253, + "grad_norm": 0.8046875, + "learning_rate": 0.00018124041287569876, + "loss": 0.8796, + "step": 12583 + }, + { + "epoch": 0.3231218334801748, + "grad_norm": 0.87109375, + "learning_rate": 0.00018123780974201148, + "loss": 0.9864, + "step": 12584 + }, + { + "epoch": 0.3231475106760966, + "grad_norm": 0.82421875, + "learning_rate": 0.00018123520644642454, + "loss": 0.9916, + "step": 12585 + }, + { + "epoch": 0.32317318787201843, + "grad_norm": 0.7734375, + "learning_rate": 0.00018123260298894318, + "loss": 1.0897, + "step": 12586 + }, + { + "epoch": 0.32319886506794027, + "grad_norm": 0.81640625, + "learning_rate": 0.00018122999936957257, + "loss": 0.9652, + "step": 12587 + }, + { + "epoch": 0.3232245422638621, + "grad_norm": 0.74609375, + "learning_rate": 0.0001812273955883179, + "loss": 0.9152, + "step": 12588 + }, + { + "epoch": 0.3232502194597839, + "grad_norm": 0.77734375, + "learning_rate": 0.0001812247916451844, + "loss": 0.9707, + "step": 12589 + }, + { + "epoch": 0.3232758966557057, + "grad_norm": 0.75, + "learning_rate": 0.00018122218754017722, + "loss": 0.9644, + "step": 12590 + }, + { + "epoch": 0.32330157385162756, + "grad_norm": 1.0859375, + "learning_rate": 0.00018121958327330155, + "loss": 1.0093, + "step": 12591 + }, + { + "epoch": 0.32332725104754934, + "grad_norm": 0.734375, + "learning_rate": 0.00018121697884456256, + "loss": 1.0613, + "step": 12592 + }, + { + "epoch": 0.32335292824347117, + "grad_norm": 0.80078125, + "learning_rate": 0.0001812143742539655, + "loss": 1.0148, + "step": 12593 + }, + { + "epoch": 0.323378605439393, + "grad_norm": 0.8671875, + "learning_rate": 0.00018121176950151554, + "loss": 0.9514, + "step": 12594 + }, + { + "epoch": 0.3234042826353148, + "grad_norm": 0.88671875, + "learning_rate": 0.00018120916458721782, + "loss": 0.9324, + "step": 12595 + }, + { + "epoch": 0.3234299598312366, + "grad_norm": 0.77734375, + "learning_rate": 0.00018120655951107758, + "loss": 1.047, + "step": 12596 + }, + { + "epoch": 0.32345563702715846, + "grad_norm": 0.76953125, + "learning_rate": 0.00018120395427310003, + "loss": 0.9716, + "step": 12597 + }, + { + "epoch": 0.3234813142230803, + "grad_norm": 0.76953125, + "learning_rate": 0.0001812013488732903, + "loss": 0.9572, + "step": 12598 + }, + { + "epoch": 0.3235069914190021, + "grad_norm": 0.859375, + "learning_rate": 0.00018119874331165364, + "loss": 1.0575, + "step": 12599 + }, + { + "epoch": 0.3235326686149239, + "grad_norm": 0.7734375, + "learning_rate": 0.00018119613758819524, + "loss": 1.0829, + "step": 12600 + }, + { + "epoch": 0.32355834581084575, + "grad_norm": 0.82421875, + "learning_rate": 0.00018119353170292023, + "loss": 1.0157, + "step": 12601 + }, + { + "epoch": 0.32358402300676753, + "grad_norm": 1.0625, + "learning_rate": 0.00018119092565583388, + "loss": 0.887, + "step": 12602 + }, + { + "epoch": 0.32360970020268937, + "grad_norm": 0.75390625, + "learning_rate": 0.0001811883194469413, + "loss": 1.0443, + "step": 12603 + }, + { + "epoch": 0.3236353773986112, + "grad_norm": 0.76953125, + "learning_rate": 0.00018118571307624776, + "loss": 0.8838, + "step": 12604 + }, + { + "epoch": 0.323661054594533, + "grad_norm": 0.83984375, + "learning_rate": 0.00018118310654375845, + "loss": 0.8852, + "step": 12605 + }, + { + "epoch": 0.3236867317904548, + "grad_norm": 0.78515625, + "learning_rate": 0.0001811804998494785, + "loss": 0.8915, + "step": 12606 + }, + { + "epoch": 0.32371240898637665, + "grad_norm": 0.78125, + "learning_rate": 0.0001811778929934132, + "loss": 0.9872, + "step": 12607 + }, + { + "epoch": 0.32373808618229843, + "grad_norm": 0.78125, + "learning_rate": 0.00018117528597556763, + "loss": 0.9884, + "step": 12608 + }, + { + "epoch": 0.32376376337822027, + "grad_norm": 0.7265625, + "learning_rate": 0.0001811726787959471, + "loss": 0.9279, + "step": 12609 + }, + { + "epoch": 0.3237894405741421, + "grad_norm": 0.78515625, + "learning_rate": 0.00018117007145455674, + "loss": 0.8906, + "step": 12610 + }, + { + "epoch": 0.32381511777006394, + "grad_norm": 0.765625, + "learning_rate": 0.00018116746395140175, + "loss": 1.0555, + "step": 12611 + }, + { + "epoch": 0.3238407949659857, + "grad_norm": 0.78125, + "learning_rate": 0.00018116485628648734, + "loss": 0.9306, + "step": 12612 + }, + { + "epoch": 0.32386647216190756, + "grad_norm": 0.734375, + "learning_rate": 0.00018116224845981868, + "loss": 0.808, + "step": 12613 + }, + { + "epoch": 0.3238921493578294, + "grad_norm": 0.7734375, + "learning_rate": 0.000181159640471401, + "loss": 0.9285, + "step": 12614 + }, + { + "epoch": 0.3239178265537512, + "grad_norm": 0.796875, + "learning_rate": 0.00018115703232123953, + "loss": 1.0153, + "step": 12615 + }, + { + "epoch": 0.323943503749673, + "grad_norm": 0.75390625, + "learning_rate": 0.00018115442400933938, + "loss": 0.9498, + "step": 12616 + }, + { + "epoch": 0.32396918094559485, + "grad_norm": 0.75390625, + "learning_rate": 0.0001811518155357058, + "loss": 1.0666, + "step": 12617 + }, + { + "epoch": 0.32399485814151663, + "grad_norm": 0.82421875, + "learning_rate": 0.000181149206900344, + "loss": 0.9919, + "step": 12618 + }, + { + "epoch": 0.32402053533743846, + "grad_norm": 0.765625, + "learning_rate": 0.00018114659810325914, + "loss": 0.8968, + "step": 12619 + }, + { + "epoch": 0.3240462125333603, + "grad_norm": 0.69921875, + "learning_rate": 0.00018114398914445646, + "loss": 0.7554, + "step": 12620 + }, + { + "epoch": 0.32407188972928214, + "grad_norm": 0.71484375, + "learning_rate": 0.00018114138002394111, + "loss": 0.8864, + "step": 12621 + }, + { + "epoch": 0.3240975669252039, + "grad_norm": 0.8359375, + "learning_rate": 0.00018113877074171834, + "loss": 1.1279, + "step": 12622 + }, + { + "epoch": 0.32412324412112575, + "grad_norm": 0.77734375, + "learning_rate": 0.00018113616129779335, + "loss": 0.956, + "step": 12623 + }, + { + "epoch": 0.3241489213170476, + "grad_norm": 0.78515625, + "learning_rate": 0.0001811335516921713, + "loss": 0.9573, + "step": 12624 + }, + { + "epoch": 0.32417459851296937, + "grad_norm": 0.75, + "learning_rate": 0.00018113094192485742, + "loss": 0.9531, + "step": 12625 + }, + { + "epoch": 0.3242002757088912, + "grad_norm": 0.8359375, + "learning_rate": 0.00018112833199585685, + "loss": 0.9657, + "step": 12626 + }, + { + "epoch": 0.32422595290481304, + "grad_norm": 0.87109375, + "learning_rate": 0.0001811257219051749, + "loss": 1.0225, + "step": 12627 + }, + { + "epoch": 0.3242516301007348, + "grad_norm": 0.81640625, + "learning_rate": 0.0001811231116528167, + "loss": 0.9186, + "step": 12628 + }, + { + "epoch": 0.32427730729665666, + "grad_norm": 0.78125, + "learning_rate": 0.00018112050123878746, + "loss": 0.9425, + "step": 12629 + }, + { + "epoch": 0.3243029844925785, + "grad_norm": 0.81640625, + "learning_rate": 0.00018111789066309239, + "loss": 0.9703, + "step": 12630 + }, + { + "epoch": 0.32432866168850033, + "grad_norm": 0.79296875, + "learning_rate": 0.0001811152799257367, + "loss": 0.9527, + "step": 12631 + }, + { + "epoch": 0.3243543388844221, + "grad_norm": 0.765625, + "learning_rate": 0.00018111266902672558, + "loss": 1.0986, + "step": 12632 + }, + { + "epoch": 0.32438001608034395, + "grad_norm": 0.75, + "learning_rate": 0.00018111005796606425, + "loss": 0.8384, + "step": 12633 + }, + { + "epoch": 0.3244056932762658, + "grad_norm": 0.77734375, + "learning_rate": 0.00018110744674375786, + "loss": 0.9696, + "step": 12634 + }, + { + "epoch": 0.32443137047218756, + "grad_norm": 0.7421875, + "learning_rate": 0.00018110483535981167, + "loss": 0.9133, + "step": 12635 + }, + { + "epoch": 0.3244570476681094, + "grad_norm": 0.75390625, + "learning_rate": 0.0001811022238142309, + "loss": 0.9646, + "step": 12636 + }, + { + "epoch": 0.32448272486403124, + "grad_norm": 0.78515625, + "learning_rate": 0.0001810996121070207, + "loss": 0.9031, + "step": 12637 + }, + { + "epoch": 0.324508402059953, + "grad_norm": 0.75, + "learning_rate": 0.0001810970002381863, + "loss": 0.9059, + "step": 12638 + }, + { + "epoch": 0.32453407925587485, + "grad_norm": 0.72265625, + "learning_rate": 0.0001810943882077329, + "loss": 0.9588, + "step": 12639 + }, + { + "epoch": 0.3245597564517967, + "grad_norm": 0.7578125, + "learning_rate": 0.0001810917760156657, + "loss": 0.7562, + "step": 12640 + }, + { + "epoch": 0.3245854336477185, + "grad_norm": 0.76953125, + "learning_rate": 0.00018108916366198994, + "loss": 1.0568, + "step": 12641 + }, + { + "epoch": 0.3246111108436403, + "grad_norm": 0.73828125, + "learning_rate": 0.0001810865511467108, + "loss": 0.8316, + "step": 12642 + }, + { + "epoch": 0.32463678803956214, + "grad_norm": 1.203125, + "learning_rate": 0.00018108393846983346, + "loss": 1.1201, + "step": 12643 + }, + { + "epoch": 0.324662465235484, + "grad_norm": 0.796875, + "learning_rate": 0.0001810813256313632, + "loss": 0.9812, + "step": 12644 + }, + { + "epoch": 0.32468814243140576, + "grad_norm": 0.80859375, + "learning_rate": 0.00018107871263130517, + "loss": 0.9394, + "step": 12645 + }, + { + "epoch": 0.3247138196273276, + "grad_norm": 0.828125, + "learning_rate": 0.00018107609946966456, + "loss": 0.956, + "step": 12646 + }, + { + "epoch": 0.32473949682324943, + "grad_norm": 0.96484375, + "learning_rate": 0.0001810734861464466, + "loss": 1.0109, + "step": 12647 + }, + { + "epoch": 0.3247651740191712, + "grad_norm": 0.80078125, + "learning_rate": 0.00018107087266165656, + "loss": 0.8562, + "step": 12648 + }, + { + "epoch": 0.32479085121509305, + "grad_norm": 0.80078125, + "learning_rate": 0.00018106825901529955, + "loss": 0.788, + "step": 12649 + }, + { + "epoch": 0.3248165284110149, + "grad_norm": 0.80078125, + "learning_rate": 0.00018106564520738085, + "loss": 0.8778, + "step": 12650 + }, + { + "epoch": 0.3248422056069367, + "grad_norm": 0.78515625, + "learning_rate": 0.00018106303123790562, + "loss": 1.0918, + "step": 12651 + }, + { + "epoch": 0.3248678828028585, + "grad_norm": 0.7578125, + "learning_rate": 0.0001810604171068791, + "loss": 0.8627, + "step": 12652 + }, + { + "epoch": 0.32489355999878033, + "grad_norm": 0.90625, + "learning_rate": 0.0001810578028143065, + "loss": 0.9579, + "step": 12653 + }, + { + "epoch": 0.32491923719470217, + "grad_norm": 0.734375, + "learning_rate": 0.000181055188360193, + "loss": 0.8299, + "step": 12654 + }, + { + "epoch": 0.32494491439062395, + "grad_norm": 0.84765625, + "learning_rate": 0.00018105257374454383, + "loss": 1.0286, + "step": 12655 + }, + { + "epoch": 0.3249705915865458, + "grad_norm": 0.7578125, + "learning_rate": 0.0001810499589673642, + "loss": 0.9777, + "step": 12656 + }, + { + "epoch": 0.3249962687824676, + "grad_norm": 0.75390625, + "learning_rate": 0.00018104734402865934, + "loss": 0.8616, + "step": 12657 + }, + { + "epoch": 0.3250219459783894, + "grad_norm": 0.81640625, + "learning_rate": 0.00018104472892843448, + "loss": 0.9243, + "step": 12658 + }, + { + "epoch": 0.32504762317431124, + "grad_norm": 0.80078125, + "learning_rate": 0.00018104211366669476, + "loss": 0.8563, + "step": 12659 + }, + { + "epoch": 0.3250733003702331, + "grad_norm": 0.80859375, + "learning_rate": 0.0001810394982434454, + "loss": 0.964, + "step": 12660 + }, + { + "epoch": 0.3250989775661549, + "grad_norm": 0.82421875, + "learning_rate": 0.00018103688265869167, + "loss": 0.8097, + "step": 12661 + }, + { + "epoch": 0.3251246547620767, + "grad_norm": 0.921875, + "learning_rate": 0.00018103426691243874, + "loss": 0.958, + "step": 12662 + }, + { + "epoch": 0.32515033195799853, + "grad_norm": 0.79296875, + "learning_rate": 0.00018103165100469187, + "loss": 0.9938, + "step": 12663 + }, + { + "epoch": 0.32517600915392036, + "grad_norm": 0.7734375, + "learning_rate": 0.0001810290349354562, + "loss": 1.108, + "step": 12664 + }, + { + "epoch": 0.32520168634984215, + "grad_norm": 0.80078125, + "learning_rate": 0.000181026418704737, + "loss": 0.9136, + "step": 12665 + }, + { + "epoch": 0.325227363545764, + "grad_norm": 0.82421875, + "learning_rate": 0.00018102380231253945, + "loss": 0.9276, + "step": 12666 + }, + { + "epoch": 0.3252530407416858, + "grad_norm": 0.80078125, + "learning_rate": 0.0001810211857588688, + "loss": 1.057, + "step": 12667 + }, + { + "epoch": 0.3252787179376076, + "grad_norm": 0.828125, + "learning_rate": 0.00018101856904373023, + "loss": 0.8727, + "step": 12668 + }, + { + "epoch": 0.32530439513352943, + "grad_norm": 0.76171875, + "learning_rate": 0.000181015952167129, + "loss": 0.921, + "step": 12669 + }, + { + "epoch": 0.32533007232945127, + "grad_norm": 0.79296875, + "learning_rate": 0.00018101333512907027, + "loss": 0.8987, + "step": 12670 + }, + { + "epoch": 0.3253557495253731, + "grad_norm": 0.75390625, + "learning_rate": 0.00018101071792955928, + "loss": 0.9514, + "step": 12671 + }, + { + "epoch": 0.3253814267212949, + "grad_norm": 0.87109375, + "learning_rate": 0.00018100810056860124, + "loss": 0.9838, + "step": 12672 + }, + { + "epoch": 0.3254071039172167, + "grad_norm": 0.7734375, + "learning_rate": 0.0001810054830462014, + "loss": 0.9563, + "step": 12673 + }, + { + "epoch": 0.32543278111313856, + "grad_norm": 0.73046875, + "learning_rate": 0.00018100286536236495, + "loss": 1.0207, + "step": 12674 + }, + { + "epoch": 0.32545845830906034, + "grad_norm": 0.84765625, + "learning_rate": 0.00018100024751709708, + "loss": 0.9609, + "step": 12675 + }, + { + "epoch": 0.3254841355049822, + "grad_norm": 0.84375, + "learning_rate": 0.00018099762951040305, + "loss": 0.998, + "step": 12676 + }, + { + "epoch": 0.325509812700904, + "grad_norm": 0.80078125, + "learning_rate": 0.00018099501134228804, + "loss": 0.9386, + "step": 12677 + }, + { + "epoch": 0.3255354898968258, + "grad_norm": 0.79296875, + "learning_rate": 0.0001809923930127573, + "loss": 0.824, + "step": 12678 + }, + { + "epoch": 0.3255611670927476, + "grad_norm": 0.734375, + "learning_rate": 0.00018098977452181604, + "loss": 0.9477, + "step": 12679 + }, + { + "epoch": 0.32558684428866946, + "grad_norm": 0.78515625, + "learning_rate": 0.00018098715586946948, + "loss": 0.9436, + "step": 12680 + }, + { + "epoch": 0.3256125214845913, + "grad_norm": 1.8359375, + "learning_rate": 0.00018098453705572284, + "loss": 1.007, + "step": 12681 + }, + { + "epoch": 0.3256381986805131, + "grad_norm": 0.7734375, + "learning_rate": 0.00018098191808058132, + "loss": 0.9817, + "step": 12682 + }, + { + "epoch": 0.3256638758764349, + "grad_norm": 0.6875, + "learning_rate": 0.00018097929894405015, + "loss": 0.8028, + "step": 12683 + }, + { + "epoch": 0.32568955307235675, + "grad_norm": 0.7265625, + "learning_rate": 0.00018097667964613457, + "loss": 0.8795, + "step": 12684 + }, + { + "epoch": 0.32571523026827853, + "grad_norm": 0.6875, + "learning_rate": 0.00018097406018683976, + "loss": 0.9697, + "step": 12685 + }, + { + "epoch": 0.32574090746420037, + "grad_norm": 0.75, + "learning_rate": 0.00018097144056617096, + "loss": 0.7804, + "step": 12686 + }, + { + "epoch": 0.3257665846601222, + "grad_norm": 0.74609375, + "learning_rate": 0.00018096882078413341, + "loss": 1.0162, + "step": 12687 + }, + { + "epoch": 0.325792261856044, + "grad_norm": 0.76953125, + "learning_rate": 0.00018096620084073233, + "loss": 0.9195, + "step": 12688 + }, + { + "epoch": 0.3258179390519658, + "grad_norm": 0.76953125, + "learning_rate": 0.0001809635807359729, + "loss": 1.0547, + "step": 12689 + }, + { + "epoch": 0.32584361624788766, + "grad_norm": 0.7890625, + "learning_rate": 0.00018096096046986039, + "loss": 0.8387, + "step": 12690 + }, + { + "epoch": 0.3258692934438095, + "grad_norm": 0.7578125, + "learning_rate": 0.00018095834004239994, + "loss": 0.7836, + "step": 12691 + }, + { + "epoch": 0.3258949706397313, + "grad_norm": 0.83203125, + "learning_rate": 0.00018095571945359688, + "loss": 0.9405, + "step": 12692 + }, + { + "epoch": 0.3259206478356531, + "grad_norm": 0.74609375, + "learning_rate": 0.00018095309870345638, + "loss": 0.9304, + "step": 12693 + }, + { + "epoch": 0.32594632503157495, + "grad_norm": 0.81640625, + "learning_rate": 0.00018095047779198366, + "loss": 0.818, + "step": 12694 + }, + { + "epoch": 0.3259720022274967, + "grad_norm": 0.70703125, + "learning_rate": 0.00018094785671918397, + "loss": 0.8487, + "step": 12695 + }, + { + "epoch": 0.32599767942341856, + "grad_norm": 2.015625, + "learning_rate": 0.00018094523548506248, + "loss": 0.9873, + "step": 12696 + }, + { + "epoch": 0.3260233566193404, + "grad_norm": 0.8359375, + "learning_rate": 0.00018094261408962446, + "loss": 0.9606, + "step": 12697 + }, + { + "epoch": 0.3260490338152622, + "grad_norm": 0.8046875, + "learning_rate": 0.00018093999253287514, + "loss": 1.0147, + "step": 12698 + }, + { + "epoch": 0.326074711011184, + "grad_norm": 0.80078125, + "learning_rate": 0.0001809373708148197, + "loss": 1.0082, + "step": 12699 + }, + { + "epoch": 0.32610038820710585, + "grad_norm": 0.875, + "learning_rate": 0.0001809347489354634, + "loss": 0.9974, + "step": 12700 + }, + { + "epoch": 0.3261260654030277, + "grad_norm": 0.75390625, + "learning_rate": 0.00018093212689481147, + "loss": 0.9199, + "step": 12701 + }, + { + "epoch": 0.32615174259894947, + "grad_norm": 0.79296875, + "learning_rate": 0.0001809295046928691, + "loss": 1.0862, + "step": 12702 + }, + { + "epoch": 0.3261774197948713, + "grad_norm": 0.8046875, + "learning_rate": 0.00018092688232964154, + "loss": 0.9058, + "step": 12703 + }, + { + "epoch": 0.32620309699079314, + "grad_norm": 0.80859375, + "learning_rate": 0.00018092425980513404, + "loss": 0.9449, + "step": 12704 + }, + { + "epoch": 0.3262287741867149, + "grad_norm": 0.77734375, + "learning_rate": 0.00018092163711935178, + "loss": 0.9295, + "step": 12705 + }, + { + "epoch": 0.32625445138263676, + "grad_norm": 0.8125, + "learning_rate": 0.0001809190142723, + "loss": 0.9063, + "step": 12706 + }, + { + "epoch": 0.3262801285785586, + "grad_norm": 0.87109375, + "learning_rate": 0.00018091639126398395, + "loss": 1.0217, + "step": 12707 + }, + { + "epoch": 0.3263058057744804, + "grad_norm": 0.76953125, + "learning_rate": 0.00018091376809440882, + "loss": 0.7963, + "step": 12708 + }, + { + "epoch": 0.3263314829704022, + "grad_norm": 0.76953125, + "learning_rate": 0.00018091114476357988, + "loss": 1.0225, + "step": 12709 + }, + { + "epoch": 0.32635716016632405, + "grad_norm": 0.6953125, + "learning_rate": 0.00018090852127150233, + "loss": 0.8545, + "step": 12710 + }, + { + "epoch": 0.3263828373622459, + "grad_norm": 0.84375, + "learning_rate": 0.0001809058976181814, + "loss": 1.0154, + "step": 12711 + }, + { + "epoch": 0.32640851455816766, + "grad_norm": 0.73828125, + "learning_rate": 0.00018090327380362233, + "loss": 0.9188, + "step": 12712 + }, + { + "epoch": 0.3264341917540895, + "grad_norm": 0.76953125, + "learning_rate": 0.00018090064982783033, + "loss": 0.9831, + "step": 12713 + }, + { + "epoch": 0.32645986895001133, + "grad_norm": 0.80078125, + "learning_rate": 0.00018089802569081067, + "loss": 1.0179, + "step": 12714 + }, + { + "epoch": 0.3264855461459331, + "grad_norm": 0.796875, + "learning_rate": 0.00018089540139256852, + "loss": 0.9417, + "step": 12715 + }, + { + "epoch": 0.32651122334185495, + "grad_norm": 0.8203125, + "learning_rate": 0.00018089277693310914, + "loss": 0.9357, + "step": 12716 + }, + { + "epoch": 0.3265369005377768, + "grad_norm": 0.8515625, + "learning_rate": 0.00018089015231243782, + "loss": 0.9379, + "step": 12717 + }, + { + "epoch": 0.32656257773369857, + "grad_norm": 0.7734375, + "learning_rate": 0.0001808875275305597, + "loss": 1.0788, + "step": 12718 + }, + { + "epoch": 0.3265882549296204, + "grad_norm": 0.83203125, + "learning_rate": 0.00018088490258748004, + "loss": 0.953, + "step": 12719 + }, + { + "epoch": 0.32661393212554224, + "grad_norm": 0.78125, + "learning_rate": 0.00018088227748320405, + "loss": 0.9301, + "step": 12720 + }, + { + "epoch": 0.3266396093214641, + "grad_norm": 0.8203125, + "learning_rate": 0.00018087965221773704, + "loss": 1.0647, + "step": 12721 + }, + { + "epoch": 0.32666528651738586, + "grad_norm": 0.83203125, + "learning_rate": 0.00018087702679108416, + "loss": 1.083, + "step": 12722 + }, + { + "epoch": 0.3266909637133077, + "grad_norm": 0.76171875, + "learning_rate": 0.00018087440120325067, + "loss": 0.9504, + "step": 12723 + }, + { + "epoch": 0.3267166409092295, + "grad_norm": 0.80859375, + "learning_rate": 0.00018087177545424182, + "loss": 0.9263, + "step": 12724 + }, + { + "epoch": 0.3267423181051513, + "grad_norm": 0.828125, + "learning_rate": 0.00018086914954406277, + "loss": 0.9106, + "step": 12725 + }, + { + "epoch": 0.32676799530107314, + "grad_norm": 0.765625, + "learning_rate": 0.00018086652347271888, + "loss": 0.9892, + "step": 12726 + }, + { + "epoch": 0.326793672496995, + "grad_norm": 0.8125, + "learning_rate": 0.00018086389724021528, + "loss": 0.9124, + "step": 12727 + }, + { + "epoch": 0.32681934969291676, + "grad_norm": 0.8125, + "learning_rate": 0.00018086127084655725, + "loss": 1.1807, + "step": 12728 + }, + { + "epoch": 0.3268450268888386, + "grad_norm": 0.7734375, + "learning_rate": 0.00018085864429175, + "loss": 0.8315, + "step": 12729 + }, + { + "epoch": 0.32687070408476043, + "grad_norm": 0.69921875, + "learning_rate": 0.00018085601757579877, + "loss": 0.762, + "step": 12730 + }, + { + "epoch": 0.32689638128068227, + "grad_norm": 0.7421875, + "learning_rate": 0.00018085339069870882, + "loss": 0.8898, + "step": 12731 + }, + { + "epoch": 0.32692205847660405, + "grad_norm": 0.8125, + "learning_rate": 0.00018085076366048536, + "loss": 0.9832, + "step": 12732 + }, + { + "epoch": 0.3269477356725259, + "grad_norm": 0.7890625, + "learning_rate": 0.00018084813646113363, + "loss": 1.0589, + "step": 12733 + }, + { + "epoch": 0.3269734128684477, + "grad_norm": 0.84765625, + "learning_rate": 0.00018084550910065885, + "loss": 1.1149, + "step": 12734 + }, + { + "epoch": 0.3269990900643695, + "grad_norm": 0.78125, + "learning_rate": 0.00018084288157906627, + "loss": 0.9276, + "step": 12735 + }, + { + "epoch": 0.32702476726029134, + "grad_norm": 0.83203125, + "learning_rate": 0.00018084025389636117, + "loss": 0.9327, + "step": 12736 + }, + { + "epoch": 0.3270504444562132, + "grad_norm": 0.88671875, + "learning_rate": 0.00018083762605254871, + "loss": 1.0418, + "step": 12737 + }, + { + "epoch": 0.32707612165213495, + "grad_norm": 0.79296875, + "learning_rate": 0.0001808349980476342, + "loss": 0.9542, + "step": 12738 + }, + { + "epoch": 0.3271017988480568, + "grad_norm": 0.79296875, + "learning_rate": 0.00018083236988162283, + "loss": 0.8016, + "step": 12739 + }, + { + "epoch": 0.3271274760439786, + "grad_norm": 0.75, + "learning_rate": 0.00018082974155451983, + "loss": 0.9234, + "step": 12740 + }, + { + "epoch": 0.32715315323990046, + "grad_norm": 0.7578125, + "learning_rate": 0.00018082711306633044, + "loss": 0.8937, + "step": 12741 + }, + { + "epoch": 0.32717883043582224, + "grad_norm": 0.859375, + "learning_rate": 0.00018082448441705993, + "loss": 1.0535, + "step": 12742 + }, + { + "epoch": 0.3272045076317441, + "grad_norm": 0.78125, + "learning_rate": 0.00018082185560671355, + "loss": 1.0716, + "step": 12743 + }, + { + "epoch": 0.3272301848276659, + "grad_norm": 0.78515625, + "learning_rate": 0.0001808192266352965, + "loss": 1.0196, + "step": 12744 + }, + { + "epoch": 0.3272558620235877, + "grad_norm": 0.78125, + "learning_rate": 0.000180816597502814, + "loss": 0.9618, + "step": 12745 + }, + { + "epoch": 0.32728153921950953, + "grad_norm": 1.1328125, + "learning_rate": 0.00018081396820927137, + "loss": 1.0349, + "step": 12746 + }, + { + "epoch": 0.32730721641543137, + "grad_norm": 0.875, + "learning_rate": 0.00018081133875467374, + "loss": 0.9566, + "step": 12747 + }, + { + "epoch": 0.32733289361135315, + "grad_norm": 0.79296875, + "learning_rate": 0.00018080870913902647, + "loss": 0.8339, + "step": 12748 + }, + { + "epoch": 0.327358570807275, + "grad_norm": 0.828125, + "learning_rate": 0.00018080607936233473, + "loss": 0.8883, + "step": 12749 + }, + { + "epoch": 0.3273842480031968, + "grad_norm": 0.76171875, + "learning_rate": 0.00018080344942460376, + "loss": 0.9592, + "step": 12750 + }, + { + "epoch": 0.32740992519911866, + "grad_norm": 0.734375, + "learning_rate": 0.0001808008193258388, + "loss": 1.0075, + "step": 12751 + }, + { + "epoch": 0.32743560239504044, + "grad_norm": 0.78515625, + "learning_rate": 0.00018079818906604515, + "loss": 1.0462, + "step": 12752 + }, + { + "epoch": 0.3274612795909623, + "grad_norm": 0.7890625, + "learning_rate": 0.00018079555864522796, + "loss": 0.9515, + "step": 12753 + }, + { + "epoch": 0.3274869567868841, + "grad_norm": 0.78125, + "learning_rate": 0.00018079292806339256, + "loss": 1.1044, + "step": 12754 + }, + { + "epoch": 0.3275126339828059, + "grad_norm": 0.74609375, + "learning_rate": 0.00018079029732054414, + "loss": 0.946, + "step": 12755 + }, + { + "epoch": 0.3275383111787277, + "grad_norm": 0.7421875, + "learning_rate": 0.00018078766641668794, + "loss": 0.9138, + "step": 12756 + }, + { + "epoch": 0.32756398837464956, + "grad_norm": 0.80078125, + "learning_rate": 0.00018078503535182922, + "loss": 0.9525, + "step": 12757 + }, + { + "epoch": 0.32758966557057134, + "grad_norm": 0.9296875, + "learning_rate": 0.00018078240412597324, + "loss": 0.8831, + "step": 12758 + }, + { + "epoch": 0.3276153427664932, + "grad_norm": 0.7734375, + "learning_rate": 0.0001807797727391252, + "loss": 0.966, + "step": 12759 + }, + { + "epoch": 0.327641019962415, + "grad_norm": 0.78125, + "learning_rate": 0.0001807771411912904, + "loss": 0.8258, + "step": 12760 + }, + { + "epoch": 0.32766669715833685, + "grad_norm": 0.73046875, + "learning_rate": 0.00018077450948247403, + "loss": 0.9014, + "step": 12761 + }, + { + "epoch": 0.32769237435425863, + "grad_norm": 0.92578125, + "learning_rate": 0.00018077187761268137, + "loss": 0.972, + "step": 12762 + }, + { + "epoch": 0.32771805155018047, + "grad_norm": 0.7421875, + "learning_rate": 0.00018076924558191765, + "loss": 0.8525, + "step": 12763 + }, + { + "epoch": 0.3277437287461023, + "grad_norm": 0.80078125, + "learning_rate": 0.00018076661339018813, + "loss": 1.0371, + "step": 12764 + }, + { + "epoch": 0.3277694059420241, + "grad_norm": 0.75, + "learning_rate": 0.000180763981037498, + "loss": 0.9373, + "step": 12765 + }, + { + "epoch": 0.3277950831379459, + "grad_norm": 0.72265625, + "learning_rate": 0.0001807613485238526, + "loss": 0.859, + "step": 12766 + }, + { + "epoch": 0.32782076033386776, + "grad_norm": 0.7734375, + "learning_rate": 0.00018075871584925714, + "loss": 0.9207, + "step": 12767 + }, + { + "epoch": 0.32784643752978954, + "grad_norm": 0.671875, + "learning_rate": 0.0001807560830137168, + "loss": 1.017, + "step": 12768 + }, + { + "epoch": 0.32787211472571137, + "grad_norm": 0.953125, + "learning_rate": 0.00018075345001723693, + "loss": 0.9671, + "step": 12769 + }, + { + "epoch": 0.3278977919216332, + "grad_norm": 0.7890625, + "learning_rate": 0.0001807508168598227, + "loss": 0.9018, + "step": 12770 + }, + { + "epoch": 0.32792346911755504, + "grad_norm": 0.8671875, + "learning_rate": 0.0001807481835414794, + "loss": 1.1193, + "step": 12771 + }, + { + "epoch": 0.3279491463134768, + "grad_norm": 1.15625, + "learning_rate": 0.00018074555006221226, + "loss": 0.9446, + "step": 12772 + }, + { + "epoch": 0.32797482350939866, + "grad_norm": 0.7578125, + "learning_rate": 0.00018074291642202655, + "loss": 0.8863, + "step": 12773 + }, + { + "epoch": 0.3280005007053205, + "grad_norm": 0.77734375, + "learning_rate": 0.00018074028262092746, + "loss": 1.0272, + "step": 12774 + }, + { + "epoch": 0.3280261779012423, + "grad_norm": 0.796875, + "learning_rate": 0.00018073764865892032, + "loss": 0.9058, + "step": 12775 + }, + { + "epoch": 0.3280518550971641, + "grad_norm": 0.765625, + "learning_rate": 0.00018073501453601032, + "loss": 0.9159, + "step": 12776 + }, + { + "epoch": 0.32807753229308595, + "grad_norm": 0.7421875, + "learning_rate": 0.00018073238025220274, + "loss": 1.0434, + "step": 12777 + }, + { + "epoch": 0.32810320948900773, + "grad_norm": 0.79296875, + "learning_rate": 0.0001807297458075028, + "loss": 0.8576, + "step": 12778 + }, + { + "epoch": 0.32812888668492957, + "grad_norm": 0.79296875, + "learning_rate": 0.00018072711120191578, + "loss": 1.021, + "step": 12779 + }, + { + "epoch": 0.3281545638808514, + "grad_norm": 0.88671875, + "learning_rate": 0.0001807244764354469, + "loss": 0.9972, + "step": 12780 + }, + { + "epoch": 0.32818024107677324, + "grad_norm": 0.7421875, + "learning_rate": 0.00018072184150810146, + "loss": 0.8238, + "step": 12781 + }, + { + "epoch": 0.328205918272695, + "grad_norm": 0.72265625, + "learning_rate": 0.00018071920641988466, + "loss": 0.9436, + "step": 12782 + }, + { + "epoch": 0.32823159546861685, + "grad_norm": 0.8203125, + "learning_rate": 0.00018071657117080177, + "loss": 0.8918, + "step": 12783 + }, + { + "epoch": 0.3282572726645387, + "grad_norm": 0.84765625, + "learning_rate": 0.00018071393576085809, + "loss": 1.0134, + "step": 12784 + }, + { + "epoch": 0.32828294986046047, + "grad_norm": 0.78125, + "learning_rate": 0.00018071130019005878, + "loss": 0.9873, + "step": 12785 + }, + { + "epoch": 0.3283086270563823, + "grad_norm": 0.81640625, + "learning_rate": 0.00018070866445840916, + "loss": 1.0069, + "step": 12786 + }, + { + "epoch": 0.32833430425230414, + "grad_norm": 0.80859375, + "learning_rate": 0.00018070602856591446, + "loss": 0.9499, + "step": 12787 + }, + { + "epoch": 0.3283599814482259, + "grad_norm": 0.84375, + "learning_rate": 0.00018070339251257991, + "loss": 1.0834, + "step": 12788 + }, + { + "epoch": 0.32838565864414776, + "grad_norm": 0.8125, + "learning_rate": 0.00018070075629841083, + "loss": 0.9627, + "step": 12789 + }, + { + "epoch": 0.3284113358400696, + "grad_norm": 0.76171875, + "learning_rate": 0.0001806981199234124, + "loss": 0.9718, + "step": 12790 + }, + { + "epoch": 0.32843701303599143, + "grad_norm": 0.72265625, + "learning_rate": 0.00018069548338758992, + "loss": 1.0076, + "step": 12791 + }, + { + "epoch": 0.3284626902319132, + "grad_norm": 0.7421875, + "learning_rate": 0.00018069284669094865, + "loss": 0.8841, + "step": 12792 + }, + { + "epoch": 0.32848836742783505, + "grad_norm": 0.73828125, + "learning_rate": 0.0001806902098334938, + "loss": 0.985, + "step": 12793 + }, + { + "epoch": 0.3285140446237569, + "grad_norm": 0.76953125, + "learning_rate": 0.00018068757281523065, + "loss": 0.8211, + "step": 12794 + }, + { + "epoch": 0.32853972181967867, + "grad_norm": 0.76953125, + "learning_rate": 0.00018068493563616443, + "loss": 0.9049, + "step": 12795 + }, + { + "epoch": 0.3285653990156005, + "grad_norm": 0.7890625, + "learning_rate": 0.0001806822982963005, + "loss": 1.0555, + "step": 12796 + }, + { + "epoch": 0.32859107621152234, + "grad_norm": 0.7890625, + "learning_rate": 0.00018067966079564395, + "loss": 1.073, + "step": 12797 + }, + { + "epoch": 0.3286167534074441, + "grad_norm": 0.81640625, + "learning_rate": 0.0001806770231342002, + "loss": 0.8676, + "step": 12798 + }, + { + "epoch": 0.32864243060336595, + "grad_norm": 0.79296875, + "learning_rate": 0.00018067438531197438, + "loss": 1.0547, + "step": 12799 + }, + { + "epoch": 0.3286681077992878, + "grad_norm": 0.7578125, + "learning_rate": 0.0001806717473289718, + "loss": 0.9567, + "step": 12800 + }, + { + "epoch": 0.3286937849952096, + "grad_norm": 0.8203125, + "learning_rate": 0.0001806691091851977, + "loss": 0.8487, + "step": 12801 + }, + { + "epoch": 0.3287194621911314, + "grad_norm": 0.88671875, + "learning_rate": 0.0001806664708806574, + "loss": 0.874, + "step": 12802 + }, + { + "epoch": 0.32874513938705324, + "grad_norm": 0.81640625, + "learning_rate": 0.0001806638324153561, + "loss": 0.9876, + "step": 12803 + }, + { + "epoch": 0.3287708165829751, + "grad_norm": 0.796875, + "learning_rate": 0.00018066119378929904, + "loss": 1.0011, + "step": 12804 + }, + { + "epoch": 0.32879649377889686, + "grad_norm": 0.76953125, + "learning_rate": 0.00018065855500249155, + "loss": 0.9939, + "step": 12805 + }, + { + "epoch": 0.3288221709748187, + "grad_norm": 0.81640625, + "learning_rate": 0.00018065591605493882, + "loss": 0.8654, + "step": 12806 + }, + { + "epoch": 0.32884784817074053, + "grad_norm": 0.72265625, + "learning_rate": 0.0001806532769466461, + "loss": 0.9053, + "step": 12807 + }, + { + "epoch": 0.3288735253666623, + "grad_norm": 0.83203125, + "learning_rate": 0.00018065063767761874, + "loss": 0.8516, + "step": 12808 + }, + { + "epoch": 0.32889920256258415, + "grad_norm": 0.765625, + "learning_rate": 0.00018064799824786193, + "loss": 0.9895, + "step": 12809 + }, + { + "epoch": 0.328924879758506, + "grad_norm": 0.79296875, + "learning_rate": 0.00018064535865738092, + "loss": 0.8452, + "step": 12810 + }, + { + "epoch": 0.32895055695442776, + "grad_norm": 0.78515625, + "learning_rate": 0.00018064271890618101, + "loss": 1.1091, + "step": 12811 + }, + { + "epoch": 0.3289762341503496, + "grad_norm": 0.7734375, + "learning_rate": 0.00018064007899426746, + "loss": 0.8858, + "step": 12812 + }, + { + "epoch": 0.32900191134627144, + "grad_norm": 0.8515625, + "learning_rate": 0.00018063743892164551, + "loss": 0.9287, + "step": 12813 + }, + { + "epoch": 0.32902758854219327, + "grad_norm": 0.8203125, + "learning_rate": 0.00018063479868832041, + "loss": 1.0262, + "step": 12814 + }, + { + "epoch": 0.32905326573811505, + "grad_norm": 0.71875, + "learning_rate": 0.00018063215829429744, + "loss": 0.9103, + "step": 12815 + }, + { + "epoch": 0.3290789429340369, + "grad_norm": 0.7578125, + "learning_rate": 0.0001806295177395819, + "loss": 0.938, + "step": 12816 + }, + { + "epoch": 0.3291046201299587, + "grad_norm": 0.81640625, + "learning_rate": 0.00018062687702417898, + "loss": 1.0128, + "step": 12817 + }, + { + "epoch": 0.3291302973258805, + "grad_norm": 0.828125, + "learning_rate": 0.000180624236148094, + "loss": 0.9539, + "step": 12818 + }, + { + "epoch": 0.32915597452180234, + "grad_norm": 0.77734375, + "learning_rate": 0.00018062159511133215, + "loss": 0.9324, + "step": 12819 + }, + { + "epoch": 0.3291816517177242, + "grad_norm": 0.796875, + "learning_rate": 0.0001806189539138988, + "loss": 0.8742, + "step": 12820 + }, + { + "epoch": 0.32920732891364596, + "grad_norm": 0.73828125, + "learning_rate": 0.00018061631255579915, + "loss": 1.067, + "step": 12821 + }, + { + "epoch": 0.3292330061095678, + "grad_norm": 0.765625, + "learning_rate": 0.00018061367103703842, + "loss": 0.9315, + "step": 12822 + }, + { + "epoch": 0.32925868330548963, + "grad_norm": 0.77734375, + "learning_rate": 0.00018061102935762197, + "loss": 0.9298, + "step": 12823 + }, + { + "epoch": 0.32928436050141147, + "grad_norm": 0.76171875, + "learning_rate": 0.00018060838751755504, + "loss": 0.9667, + "step": 12824 + }, + { + "epoch": 0.32931003769733325, + "grad_norm": 0.6953125, + "learning_rate": 0.00018060574551684284, + "loss": 0.8251, + "step": 12825 + }, + { + "epoch": 0.3293357148932551, + "grad_norm": 0.73046875, + "learning_rate": 0.0001806031033554907, + "loss": 0.8515, + "step": 12826 + }, + { + "epoch": 0.3293613920891769, + "grad_norm": 0.828125, + "learning_rate": 0.0001806004610335038, + "loss": 1.0388, + "step": 12827 + }, + { + "epoch": 0.3293870692850987, + "grad_norm": 0.7890625, + "learning_rate": 0.0001805978185508875, + "loss": 1.0113, + "step": 12828 + }, + { + "epoch": 0.32941274648102054, + "grad_norm": 0.796875, + "learning_rate": 0.00018059517590764704, + "loss": 0.8493, + "step": 12829 + }, + { + "epoch": 0.32943842367694237, + "grad_norm": 0.828125, + "learning_rate": 0.00018059253310378766, + "loss": 0.9687, + "step": 12830 + }, + { + "epoch": 0.32946410087286415, + "grad_norm": 0.765625, + "learning_rate": 0.00018058989013931465, + "loss": 0.9695, + "step": 12831 + }, + { + "epoch": 0.329489778068786, + "grad_norm": 0.8359375, + "learning_rate": 0.00018058724701423324, + "loss": 0.9495, + "step": 12832 + }, + { + "epoch": 0.3295154552647078, + "grad_norm": 0.80859375, + "learning_rate": 0.00018058460372854875, + "loss": 0.9843, + "step": 12833 + }, + { + "epoch": 0.32954113246062966, + "grad_norm": 0.7421875, + "learning_rate": 0.0001805819602822664, + "loss": 0.9017, + "step": 12834 + }, + { + "epoch": 0.32956680965655144, + "grad_norm": 0.80078125, + "learning_rate": 0.00018057931667539152, + "loss": 0.8391, + "step": 12835 + }, + { + "epoch": 0.3295924868524733, + "grad_norm": 0.765625, + "learning_rate": 0.00018057667290792933, + "loss": 0.9203, + "step": 12836 + }, + { + "epoch": 0.3296181640483951, + "grad_norm": 0.75390625, + "learning_rate": 0.00018057402897988508, + "loss": 0.8631, + "step": 12837 + }, + { + "epoch": 0.3296438412443169, + "grad_norm": 1.203125, + "learning_rate": 0.0001805713848912641, + "loss": 0.9798, + "step": 12838 + }, + { + "epoch": 0.32966951844023873, + "grad_norm": 0.76171875, + "learning_rate": 0.0001805687406420716, + "loss": 0.8911, + "step": 12839 + }, + { + "epoch": 0.32969519563616057, + "grad_norm": 0.8046875, + "learning_rate": 0.00018056609623231292, + "loss": 0.9477, + "step": 12840 + }, + { + "epoch": 0.32972087283208235, + "grad_norm": 0.76953125, + "learning_rate": 0.00018056345166199325, + "loss": 0.9791, + "step": 12841 + }, + { + "epoch": 0.3297465500280042, + "grad_norm": 0.75, + "learning_rate": 0.0001805608069311179, + "loss": 0.9563, + "step": 12842 + }, + { + "epoch": 0.329772227223926, + "grad_norm": 1.078125, + "learning_rate": 0.00018055816203969214, + "loss": 0.9992, + "step": 12843 + }, + { + "epoch": 0.32979790441984785, + "grad_norm": 0.76171875, + "learning_rate": 0.00018055551698772126, + "loss": 0.7862, + "step": 12844 + }, + { + "epoch": 0.32982358161576963, + "grad_norm": 0.8046875, + "learning_rate": 0.0001805528717752105, + "loss": 0.8597, + "step": 12845 + }, + { + "epoch": 0.32984925881169147, + "grad_norm": 0.7421875, + "learning_rate": 0.00018055022640216513, + "loss": 0.9187, + "step": 12846 + }, + { + "epoch": 0.3298749360076133, + "grad_norm": 0.765625, + "learning_rate": 0.00018054758086859048, + "loss": 1.0543, + "step": 12847 + }, + { + "epoch": 0.3299006132035351, + "grad_norm": 0.80859375, + "learning_rate": 0.00018054493517449173, + "loss": 0.9385, + "step": 12848 + }, + { + "epoch": 0.3299262903994569, + "grad_norm": 0.7109375, + "learning_rate": 0.00018054228931987422, + "loss": 0.9756, + "step": 12849 + }, + { + "epoch": 0.32995196759537876, + "grad_norm": 0.79296875, + "learning_rate": 0.0001805396433047432, + "loss": 0.9799, + "step": 12850 + }, + { + "epoch": 0.32997764479130054, + "grad_norm": 0.8359375, + "learning_rate": 0.00018053699712910394, + "loss": 1.0141, + "step": 12851 + }, + { + "epoch": 0.3300033219872224, + "grad_norm": 0.8125, + "learning_rate": 0.00018053435079296173, + "loss": 0.8778, + "step": 12852 + }, + { + "epoch": 0.3300289991831442, + "grad_norm": 0.77734375, + "learning_rate": 0.00018053170429632183, + "loss": 0.9946, + "step": 12853 + }, + { + "epoch": 0.33005467637906605, + "grad_norm": 0.75, + "learning_rate": 0.0001805290576391895, + "loss": 0.8352, + "step": 12854 + }, + { + "epoch": 0.33008035357498783, + "grad_norm": 0.8046875, + "learning_rate": 0.00018052641082157007, + "loss": 0.964, + "step": 12855 + }, + { + "epoch": 0.33010603077090966, + "grad_norm": 0.72265625, + "learning_rate": 0.00018052376384346873, + "loss": 0.9709, + "step": 12856 + }, + { + "epoch": 0.3301317079668315, + "grad_norm": 0.72265625, + "learning_rate": 0.00018052111670489084, + "loss": 0.9481, + "step": 12857 + }, + { + "epoch": 0.3301573851627533, + "grad_norm": 0.7890625, + "learning_rate": 0.00018051846940584163, + "loss": 0.9272, + "step": 12858 + }, + { + "epoch": 0.3301830623586751, + "grad_norm": 0.83203125, + "learning_rate": 0.00018051582194632637, + "loss": 1.0283, + "step": 12859 + }, + { + "epoch": 0.33020873955459695, + "grad_norm": 0.9765625, + "learning_rate": 0.00018051317432635035, + "loss": 0.9894, + "step": 12860 + }, + { + "epoch": 0.33023441675051873, + "grad_norm": 0.703125, + "learning_rate": 0.00018051052654591886, + "loss": 0.9043, + "step": 12861 + }, + { + "epoch": 0.33026009394644057, + "grad_norm": 0.82421875, + "learning_rate": 0.00018050787860503714, + "loss": 0.9741, + "step": 12862 + }, + { + "epoch": 0.3302857711423624, + "grad_norm": 0.78125, + "learning_rate": 0.00018050523050371054, + "loss": 1.045, + "step": 12863 + }, + { + "epoch": 0.33031144833828424, + "grad_norm": 0.765625, + "learning_rate": 0.00018050258224194425, + "loss": 0.9082, + "step": 12864 + }, + { + "epoch": 0.330337125534206, + "grad_norm": 0.796875, + "learning_rate": 0.0001804999338197436, + "loss": 0.9835, + "step": 12865 + }, + { + "epoch": 0.33036280273012786, + "grad_norm": 0.76953125, + "learning_rate": 0.00018049728523711383, + "loss": 0.918, + "step": 12866 + }, + { + "epoch": 0.3303884799260497, + "grad_norm": 0.84375, + "learning_rate": 0.00018049463649406024, + "loss": 0.9773, + "step": 12867 + }, + { + "epoch": 0.3304141571219715, + "grad_norm": 0.81640625, + "learning_rate": 0.00018049198759058813, + "loss": 1.0718, + "step": 12868 + }, + { + "epoch": 0.3304398343178933, + "grad_norm": 0.73828125, + "learning_rate": 0.00018048933852670275, + "loss": 0.9333, + "step": 12869 + }, + { + "epoch": 0.33046551151381515, + "grad_norm": 0.82421875, + "learning_rate": 0.0001804866893024094, + "loss": 0.9866, + "step": 12870 + }, + { + "epoch": 0.3304911887097369, + "grad_norm": 0.796875, + "learning_rate": 0.00018048403991771336, + "loss": 0.8461, + "step": 12871 + }, + { + "epoch": 0.33051686590565876, + "grad_norm": 0.703125, + "learning_rate": 0.00018048139037261986, + "loss": 0.8639, + "step": 12872 + }, + { + "epoch": 0.3305425431015806, + "grad_norm": 0.82421875, + "learning_rate": 0.00018047874066713426, + "loss": 1.0596, + "step": 12873 + }, + { + "epoch": 0.33056822029750244, + "grad_norm": 0.7890625, + "learning_rate": 0.00018047609080126174, + "loss": 1.0445, + "step": 12874 + }, + { + "epoch": 0.3305938974934242, + "grad_norm": 0.74609375, + "learning_rate": 0.00018047344077500773, + "loss": 1.024, + "step": 12875 + }, + { + "epoch": 0.33061957468934605, + "grad_norm": 0.796875, + "learning_rate": 0.00018047079058837736, + "loss": 1.1884, + "step": 12876 + }, + { + "epoch": 0.3306452518852679, + "grad_norm": 0.8671875, + "learning_rate": 0.00018046814024137597, + "loss": 0.9238, + "step": 12877 + }, + { + "epoch": 0.33067092908118967, + "grad_norm": 0.8828125, + "learning_rate": 0.00018046548973400887, + "loss": 0.9527, + "step": 12878 + }, + { + "epoch": 0.3306966062771115, + "grad_norm": 0.82421875, + "learning_rate": 0.0001804628390662813, + "loss": 0.8932, + "step": 12879 + }, + { + "epoch": 0.33072228347303334, + "grad_norm": 0.9296875, + "learning_rate": 0.00018046018823819856, + "loss": 1.0478, + "step": 12880 + }, + { + "epoch": 0.3307479606689551, + "grad_norm": 0.7421875, + "learning_rate": 0.00018045753724976594, + "loss": 0.8936, + "step": 12881 + }, + { + "epoch": 0.33077363786487696, + "grad_norm": 0.7421875, + "learning_rate": 0.00018045488610098873, + "loss": 0.8915, + "step": 12882 + }, + { + "epoch": 0.3307993150607988, + "grad_norm": 0.7578125, + "learning_rate": 0.0001804522347918722, + "loss": 0.9289, + "step": 12883 + }, + { + "epoch": 0.33082499225672063, + "grad_norm": 0.75390625, + "learning_rate": 0.00018044958332242162, + "loss": 0.9123, + "step": 12884 + }, + { + "epoch": 0.3308506694526424, + "grad_norm": 0.78515625, + "learning_rate": 0.00018044693169264227, + "loss": 0.9693, + "step": 12885 + }, + { + "epoch": 0.33087634664856425, + "grad_norm": 0.796875, + "learning_rate": 0.00018044427990253947, + "loss": 0.9348, + "step": 12886 + }, + { + "epoch": 0.3309020238444861, + "grad_norm": 0.87109375, + "learning_rate": 0.0001804416279521185, + "loss": 1.018, + "step": 12887 + }, + { + "epoch": 0.33092770104040786, + "grad_norm": 0.8671875, + "learning_rate": 0.0001804389758413846, + "loss": 0.8556, + "step": 12888 + }, + { + "epoch": 0.3309533782363297, + "grad_norm": 0.7265625, + "learning_rate": 0.0001804363235703431, + "loss": 0.9841, + "step": 12889 + }, + { + "epoch": 0.33097905543225153, + "grad_norm": 0.70703125, + "learning_rate": 0.0001804336711389993, + "loss": 0.9977, + "step": 12890 + }, + { + "epoch": 0.3310047326281733, + "grad_norm": 0.7734375, + "learning_rate": 0.0001804310185473584, + "loss": 0.977, + "step": 12891 + }, + { + "epoch": 0.33103040982409515, + "grad_norm": 0.74609375, + "learning_rate": 0.00018042836579542582, + "loss": 1.0763, + "step": 12892 + }, + { + "epoch": 0.331056087020017, + "grad_norm": 0.765625, + "learning_rate": 0.0001804257128832067, + "loss": 1.0433, + "step": 12893 + }, + { + "epoch": 0.3310817642159388, + "grad_norm": 0.87109375, + "learning_rate": 0.00018042305981070644, + "loss": 0.9457, + "step": 12894 + }, + { + "epoch": 0.3311074414118606, + "grad_norm": 0.875, + "learning_rate": 0.0001804204065779303, + "loss": 0.9858, + "step": 12895 + }, + { + "epoch": 0.33113311860778244, + "grad_norm": 0.7265625, + "learning_rate": 0.0001804177531848835, + "loss": 1.0527, + "step": 12896 + }, + { + "epoch": 0.3311587958037043, + "grad_norm": 0.77734375, + "learning_rate": 0.00018041509963157143, + "loss": 1.0522, + "step": 12897 + }, + { + "epoch": 0.33118447299962606, + "grad_norm": 0.77734375, + "learning_rate": 0.0001804124459179993, + "loss": 0.9631, + "step": 12898 + }, + { + "epoch": 0.3312101501955479, + "grad_norm": 0.80859375, + "learning_rate": 0.00018040979204417246, + "loss": 0.9651, + "step": 12899 + }, + { + "epoch": 0.33123582739146973, + "grad_norm": 0.7578125, + "learning_rate": 0.00018040713801009613, + "loss": 0.8855, + "step": 12900 + }, + { + "epoch": 0.3312615045873915, + "grad_norm": 0.765625, + "learning_rate": 0.0001804044838157757, + "loss": 1.0141, + "step": 12901 + }, + { + "epoch": 0.33128718178331334, + "grad_norm": 0.8828125, + "learning_rate": 0.00018040182946121633, + "loss": 0.9616, + "step": 12902 + }, + { + "epoch": 0.3313128589792352, + "grad_norm": 0.90625, + "learning_rate": 0.00018039917494642343, + "loss": 1.0688, + "step": 12903 + }, + { + "epoch": 0.331338536175157, + "grad_norm": 0.78515625, + "learning_rate": 0.0001803965202714022, + "loss": 1.0518, + "step": 12904 + }, + { + "epoch": 0.3313642133710788, + "grad_norm": 0.82421875, + "learning_rate": 0.00018039386543615798, + "loss": 0.8954, + "step": 12905 + }, + { + "epoch": 0.33138989056700063, + "grad_norm": 0.80078125, + "learning_rate": 0.00018039121044069603, + "loss": 0.9781, + "step": 12906 + }, + { + "epoch": 0.33141556776292247, + "grad_norm": 0.78515625, + "learning_rate": 0.00018038855528502168, + "loss": 0.9656, + "step": 12907 + }, + { + "epoch": 0.33144124495884425, + "grad_norm": 0.89453125, + "learning_rate": 0.0001803858999691402, + "loss": 1.008, + "step": 12908 + }, + { + "epoch": 0.3314669221547661, + "grad_norm": 0.80078125, + "learning_rate": 0.00018038324449305685, + "loss": 0.9469, + "step": 12909 + }, + { + "epoch": 0.3314925993506879, + "grad_norm": 0.8125, + "learning_rate": 0.00018038058885677698, + "loss": 1.0493, + "step": 12910 + }, + { + "epoch": 0.3315182765466097, + "grad_norm": 0.82421875, + "learning_rate": 0.00018037793306030587, + "loss": 1.0864, + "step": 12911 + }, + { + "epoch": 0.33154395374253154, + "grad_norm": 0.77734375, + "learning_rate": 0.0001803752771036488, + "loss": 0.8968, + "step": 12912 + }, + { + "epoch": 0.3315696309384534, + "grad_norm": 0.7265625, + "learning_rate": 0.00018037262098681103, + "loss": 0.943, + "step": 12913 + }, + { + "epoch": 0.3315953081343752, + "grad_norm": 0.796875, + "learning_rate": 0.00018036996470979793, + "loss": 1.0143, + "step": 12914 + }, + { + "epoch": 0.331620985330297, + "grad_norm": 0.7578125, + "learning_rate": 0.00018036730827261473, + "loss": 1.0185, + "step": 12915 + }, + { + "epoch": 0.3316466625262188, + "grad_norm": 0.765625, + "learning_rate": 0.00018036465167526675, + "loss": 0.9639, + "step": 12916 + }, + { + "epoch": 0.33167233972214066, + "grad_norm": 0.8203125, + "learning_rate": 0.00018036199491775924, + "loss": 1.0477, + "step": 12917 + }, + { + "epoch": 0.33169801691806244, + "grad_norm": 0.76171875, + "learning_rate": 0.00018035933800009758, + "loss": 0.8591, + "step": 12918 + }, + { + "epoch": 0.3317236941139843, + "grad_norm": 0.72265625, + "learning_rate": 0.000180356680922287, + "loss": 1.0694, + "step": 12919 + }, + { + "epoch": 0.3317493713099061, + "grad_norm": 0.703125, + "learning_rate": 0.00018035402368433282, + "loss": 0.939, + "step": 12920 + }, + { + "epoch": 0.3317750485058279, + "grad_norm": 0.84765625, + "learning_rate": 0.0001803513662862403, + "loss": 0.9299, + "step": 12921 + }, + { + "epoch": 0.33180072570174973, + "grad_norm": 0.80078125, + "learning_rate": 0.00018034870872801482, + "loss": 1.0114, + "step": 12922 + }, + { + "epoch": 0.33182640289767157, + "grad_norm": 0.7734375, + "learning_rate": 0.0001803460510096616, + "loss": 0.8872, + "step": 12923 + }, + { + "epoch": 0.3318520800935934, + "grad_norm": 0.76171875, + "learning_rate": 0.00018034339313118593, + "loss": 0.8986, + "step": 12924 + }, + { + "epoch": 0.3318777572895152, + "grad_norm": 0.734375, + "learning_rate": 0.00018034073509259316, + "loss": 0.8063, + "step": 12925 + }, + { + "epoch": 0.331903434485437, + "grad_norm": 0.71875, + "learning_rate": 0.00018033807689388855, + "loss": 0.9697, + "step": 12926 + }, + { + "epoch": 0.33192911168135886, + "grad_norm": 0.79296875, + "learning_rate": 0.00018033541853507742, + "loss": 0.8981, + "step": 12927 + }, + { + "epoch": 0.33195478887728064, + "grad_norm": 0.86328125, + "learning_rate": 0.00018033276001616505, + "loss": 1.1516, + "step": 12928 + }, + { + "epoch": 0.3319804660732025, + "grad_norm": 0.79296875, + "learning_rate": 0.00018033010133715674, + "loss": 0.9523, + "step": 12929 + }, + { + "epoch": 0.3320061432691243, + "grad_norm": 0.78515625, + "learning_rate": 0.00018032744249805782, + "loss": 1.0406, + "step": 12930 + }, + { + "epoch": 0.3320318204650461, + "grad_norm": 0.703125, + "learning_rate": 0.00018032478349887354, + "loss": 0.8577, + "step": 12931 + }, + { + "epoch": 0.3320574976609679, + "grad_norm": 0.76171875, + "learning_rate": 0.00018032212433960923, + "loss": 0.8105, + "step": 12932 + }, + { + "epoch": 0.33208317485688976, + "grad_norm": 0.71875, + "learning_rate": 0.00018031946502027017, + "loss": 0.8784, + "step": 12933 + }, + { + "epoch": 0.3321088520528116, + "grad_norm": 0.80859375, + "learning_rate": 0.0001803168055408617, + "loss": 0.9564, + "step": 12934 + }, + { + "epoch": 0.3321345292487334, + "grad_norm": 0.71875, + "learning_rate": 0.00018031414590138906, + "loss": 0.9156, + "step": 12935 + }, + { + "epoch": 0.3321602064446552, + "grad_norm": 0.8515625, + "learning_rate": 0.0001803114861018576, + "loss": 0.965, + "step": 12936 + }, + { + "epoch": 0.33218588364057705, + "grad_norm": 0.90625, + "learning_rate": 0.0001803088261422726, + "loss": 0.9416, + "step": 12937 + }, + { + "epoch": 0.33221156083649883, + "grad_norm": 0.7578125, + "learning_rate": 0.00018030616602263938, + "loss": 0.9361, + "step": 12938 + }, + { + "epoch": 0.33223723803242067, + "grad_norm": 0.765625, + "learning_rate": 0.00018030350574296317, + "loss": 0.8356, + "step": 12939 + }, + { + "epoch": 0.3322629152283425, + "grad_norm": 0.80859375, + "learning_rate": 0.0001803008453032494, + "loss": 1.1058, + "step": 12940 + }, + { + "epoch": 0.3322885924242643, + "grad_norm": 0.765625, + "learning_rate": 0.00018029818470350324, + "loss": 0.9061, + "step": 12941 + }, + { + "epoch": 0.3323142696201861, + "grad_norm": 0.74609375, + "learning_rate": 0.00018029552394373006, + "loss": 0.9027, + "step": 12942 + }, + { + "epoch": 0.33233994681610796, + "grad_norm": 0.765625, + "learning_rate": 0.0001802928630239352, + "loss": 0.8949, + "step": 12943 + }, + { + "epoch": 0.3323656240120298, + "grad_norm": 0.75390625, + "learning_rate": 0.00018029020194412388, + "loss": 0.8574, + "step": 12944 + }, + { + "epoch": 0.3323913012079516, + "grad_norm": 0.79296875, + "learning_rate": 0.00018028754070430143, + "loss": 0.919, + "step": 12945 + }, + { + "epoch": 0.3324169784038734, + "grad_norm": 0.796875, + "learning_rate": 0.00018028487930447318, + "loss": 0.954, + "step": 12946 + }, + { + "epoch": 0.33244265559979524, + "grad_norm": 0.78515625, + "learning_rate": 0.00018028221774464442, + "loss": 0.9, + "step": 12947 + }, + { + "epoch": 0.332468332795717, + "grad_norm": 0.9453125, + "learning_rate": 0.00018027955602482044, + "loss": 0.9087, + "step": 12948 + }, + { + "epoch": 0.33249400999163886, + "grad_norm": 0.734375, + "learning_rate": 0.00018027689414500656, + "loss": 0.8617, + "step": 12949 + }, + { + "epoch": 0.3325196871875607, + "grad_norm": 0.8125, + "learning_rate": 0.0001802742321052081, + "loss": 1.01, + "step": 12950 + }, + { + "epoch": 0.3325453643834825, + "grad_norm": 0.75390625, + "learning_rate": 0.00018027156990543032, + "loss": 1.0844, + "step": 12951 + }, + { + "epoch": 0.3325710415794043, + "grad_norm": 0.7578125, + "learning_rate": 0.00018026890754567858, + "loss": 0.8916, + "step": 12952 + }, + { + "epoch": 0.33259671877532615, + "grad_norm": 0.8046875, + "learning_rate": 0.00018026624502595815, + "loss": 0.9013, + "step": 12953 + }, + { + "epoch": 0.332622395971248, + "grad_norm": 0.76953125, + "learning_rate": 0.00018026358234627432, + "loss": 0.806, + "step": 12954 + }, + { + "epoch": 0.33264807316716977, + "grad_norm": 0.75390625, + "learning_rate": 0.00018026091950663244, + "loss": 0.9285, + "step": 12955 + }, + { + "epoch": 0.3326737503630916, + "grad_norm": 0.734375, + "learning_rate": 0.00018025825650703779, + "loss": 0.8945, + "step": 12956 + }, + { + "epoch": 0.33269942755901344, + "grad_norm": 0.84765625, + "learning_rate": 0.0001802555933474957, + "loss": 1.1471, + "step": 12957 + }, + { + "epoch": 0.3327251047549352, + "grad_norm": 0.81640625, + "learning_rate": 0.00018025293002801144, + "loss": 0.9435, + "step": 12958 + }, + { + "epoch": 0.33275078195085706, + "grad_norm": 0.80078125, + "learning_rate": 0.00018025026654859035, + "loss": 0.9159, + "step": 12959 + }, + { + "epoch": 0.3327764591467789, + "grad_norm": 0.79296875, + "learning_rate": 0.00018024760290923775, + "loss": 0.8538, + "step": 12960 + }, + { + "epoch": 0.33280213634270067, + "grad_norm": 0.8203125, + "learning_rate": 0.0001802449391099589, + "loss": 0.9862, + "step": 12961 + }, + { + "epoch": 0.3328278135386225, + "grad_norm": 0.77734375, + "learning_rate": 0.00018024227515075914, + "loss": 0.906, + "step": 12962 + }, + { + "epoch": 0.33285349073454434, + "grad_norm": 0.859375, + "learning_rate": 0.00018023961103164377, + "loss": 1.0987, + "step": 12963 + }, + { + "epoch": 0.3328791679304662, + "grad_norm": 0.78515625, + "learning_rate": 0.0001802369467526181, + "loss": 0.908, + "step": 12964 + }, + { + "epoch": 0.33290484512638796, + "grad_norm": 0.80078125, + "learning_rate": 0.00018023428231368744, + "loss": 0.9407, + "step": 12965 + }, + { + "epoch": 0.3329305223223098, + "grad_norm": 0.78125, + "learning_rate": 0.00018023161771485712, + "loss": 0.9874, + "step": 12966 + }, + { + "epoch": 0.33295619951823163, + "grad_norm": 0.76953125, + "learning_rate": 0.00018022895295613243, + "loss": 0.9808, + "step": 12967 + }, + { + "epoch": 0.3329818767141534, + "grad_norm": 1.078125, + "learning_rate": 0.0001802262880375187, + "loss": 0.9393, + "step": 12968 + }, + { + "epoch": 0.33300755391007525, + "grad_norm": 0.71484375, + "learning_rate": 0.00018022362295902115, + "loss": 1.0871, + "step": 12969 + }, + { + "epoch": 0.3330332311059971, + "grad_norm": 0.796875, + "learning_rate": 0.00018022095772064525, + "loss": 0.8488, + "step": 12970 + }, + { + "epoch": 0.33305890830191887, + "grad_norm": 0.78515625, + "learning_rate": 0.0001802182923223962, + "loss": 0.9408, + "step": 12971 + }, + { + "epoch": 0.3330845854978407, + "grad_norm": 0.82421875, + "learning_rate": 0.00018021562676427932, + "loss": 1.1715, + "step": 12972 + }, + { + "epoch": 0.33311026269376254, + "grad_norm": 0.7890625, + "learning_rate": 0.00018021296104629996, + "loss": 0.9488, + "step": 12973 + }, + { + "epoch": 0.3331359398896844, + "grad_norm": 0.7578125, + "learning_rate": 0.0001802102951684634, + "loss": 0.9008, + "step": 12974 + }, + { + "epoch": 0.33316161708560615, + "grad_norm": 0.8125, + "learning_rate": 0.000180207629130775, + "loss": 0.8846, + "step": 12975 + }, + { + "epoch": 0.333187294281528, + "grad_norm": 0.8125, + "learning_rate": 0.00018020496293324003, + "loss": 1.1269, + "step": 12976 + }, + { + "epoch": 0.3332129714774498, + "grad_norm": 0.80078125, + "learning_rate": 0.0001802022965758638, + "loss": 1.0546, + "step": 12977 + }, + { + "epoch": 0.3332386486733716, + "grad_norm": 1.1796875, + "learning_rate": 0.00018019963005865163, + "loss": 0.9383, + "step": 12978 + }, + { + "epoch": 0.33326432586929344, + "grad_norm": 0.8125, + "learning_rate": 0.00018019696338160884, + "loss": 0.9789, + "step": 12979 + }, + { + "epoch": 0.3332900030652153, + "grad_norm": 0.85546875, + "learning_rate": 0.00018019429654474078, + "loss": 0.9578, + "step": 12980 + }, + { + "epoch": 0.33331568026113706, + "grad_norm": 0.80078125, + "learning_rate": 0.0001801916295480527, + "loss": 1.0835, + "step": 12981 + }, + { + "epoch": 0.3333413574570589, + "grad_norm": 0.75390625, + "learning_rate": 0.00018018896239154993, + "loss": 0.9389, + "step": 12982 + }, + { + "epoch": 0.33336703465298073, + "grad_norm": 0.78125, + "learning_rate": 0.00018018629507523786, + "loss": 0.9366, + "step": 12983 + }, + { + "epoch": 0.33339271184890257, + "grad_norm": 0.72265625, + "learning_rate": 0.00018018362759912173, + "loss": 0.8397, + "step": 12984 + }, + { + "epoch": 0.33341838904482435, + "grad_norm": 0.72265625, + "learning_rate": 0.00018018095996320685, + "loss": 0.9932, + "step": 12985 + }, + { + "epoch": 0.3334440662407462, + "grad_norm": 0.73046875, + "learning_rate": 0.00018017829216749855, + "loss": 1.074, + "step": 12986 + }, + { + "epoch": 0.333469743436668, + "grad_norm": 0.8359375, + "learning_rate": 0.00018017562421200218, + "loss": 0.969, + "step": 12987 + }, + { + "epoch": 0.3334954206325898, + "grad_norm": 0.79296875, + "learning_rate": 0.00018017295609672304, + "loss": 1.1326, + "step": 12988 + }, + { + "epoch": 0.33352109782851164, + "grad_norm": 0.78515625, + "learning_rate": 0.0001801702878216664, + "loss": 1.0891, + "step": 12989 + }, + { + "epoch": 0.3335467750244335, + "grad_norm": 0.7578125, + "learning_rate": 0.00018016761938683765, + "loss": 1.0114, + "step": 12990 + }, + { + "epoch": 0.33357245222035525, + "grad_norm": 0.87890625, + "learning_rate": 0.0001801649507922421, + "loss": 0.8692, + "step": 12991 + }, + { + "epoch": 0.3335981294162771, + "grad_norm": 0.84375, + "learning_rate": 0.00018016228203788498, + "loss": 0.9824, + "step": 12992 + }, + { + "epoch": 0.3336238066121989, + "grad_norm": 0.76953125, + "learning_rate": 0.00018015961312377172, + "loss": 0.9267, + "step": 12993 + }, + { + "epoch": 0.33364948380812076, + "grad_norm": 0.81640625, + "learning_rate": 0.00018015694404990758, + "loss": 0.986, + "step": 12994 + }, + { + "epoch": 0.33367516100404254, + "grad_norm": 0.82421875, + "learning_rate": 0.0001801542748162979, + "loss": 0.8399, + "step": 12995 + }, + { + "epoch": 0.3337008381999644, + "grad_norm": 0.75390625, + "learning_rate": 0.000180151605422948, + "loss": 0.865, + "step": 12996 + }, + { + "epoch": 0.3337265153958862, + "grad_norm": 0.85546875, + "learning_rate": 0.00018014893586986314, + "loss": 1.1116, + "step": 12997 + }, + { + "epoch": 0.333752192591808, + "grad_norm": 0.83203125, + "learning_rate": 0.00018014626615704872, + "loss": 0.8933, + "step": 12998 + }, + { + "epoch": 0.33377786978772983, + "grad_norm": 0.78515625, + "learning_rate": 0.00018014359628451006, + "loss": 0.8962, + "step": 12999 + }, + { + "epoch": 0.33380354698365167, + "grad_norm": 0.84765625, + "learning_rate": 0.00018014092625225244, + "loss": 0.9574, + "step": 13000 + }, + { + "epoch": 0.33380354698365167, + "eval_loss": 0.9538285136222839, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 406.1525, + "eval_samples_per_second": 24.621, + "eval_steps_per_second": 0.771, + "step": 13000 + }, + { + "epoch": 0.33382922417957345, + "grad_norm": 0.80078125, + "learning_rate": 0.00018013825606028118, + "loss": 1.0813, + "step": 13001 + }, + { + "epoch": 0.3338549013754953, + "grad_norm": 0.77734375, + "learning_rate": 0.0001801355857086016, + "loss": 0.8755, + "step": 13002 + }, + { + "epoch": 0.3338805785714171, + "grad_norm": 0.75, + "learning_rate": 0.00018013291519721904, + "loss": 0.9212, + "step": 13003 + }, + { + "epoch": 0.33390625576733896, + "grad_norm": 0.87109375, + "learning_rate": 0.00018013024452613884, + "loss": 1.149, + "step": 13004 + }, + { + "epoch": 0.33393193296326074, + "grad_norm": 0.6953125, + "learning_rate": 0.00018012757369536628, + "loss": 0.8671, + "step": 13005 + }, + { + "epoch": 0.33395761015918257, + "grad_norm": 0.8203125, + "learning_rate": 0.0001801249027049067, + "loss": 0.9997, + "step": 13006 + }, + { + "epoch": 0.3339832873551044, + "grad_norm": 0.74609375, + "learning_rate": 0.00018012223155476544, + "loss": 0.959, + "step": 13007 + }, + { + "epoch": 0.3340089645510262, + "grad_norm": 0.7890625, + "learning_rate": 0.00018011956024494783, + "loss": 1.0507, + "step": 13008 + }, + { + "epoch": 0.334034641746948, + "grad_norm": 0.9375, + "learning_rate": 0.00018011688877545915, + "loss": 0.9584, + "step": 13009 + }, + { + "epoch": 0.33406031894286986, + "grad_norm": 0.77734375, + "learning_rate": 0.00018011421714630475, + "loss": 0.8992, + "step": 13010 + }, + { + "epoch": 0.33408599613879164, + "grad_norm": 0.8125, + "learning_rate": 0.00018011154535748993, + "loss": 1.027, + "step": 13011 + }, + { + "epoch": 0.3341116733347135, + "grad_norm": 0.78125, + "learning_rate": 0.00018010887340902008, + "loss": 0.8496, + "step": 13012 + }, + { + "epoch": 0.3341373505306353, + "grad_norm": 0.7578125, + "learning_rate": 0.00018010620130090045, + "loss": 0.9195, + "step": 13013 + }, + { + "epoch": 0.3341630277265571, + "grad_norm": 0.83984375, + "learning_rate": 0.0001801035290331364, + "loss": 0.9283, + "step": 13014 + }, + { + "epoch": 0.33418870492247893, + "grad_norm": 0.74609375, + "learning_rate": 0.00018010085660573325, + "loss": 0.9047, + "step": 13015 + }, + { + "epoch": 0.33421438211840077, + "grad_norm": 0.80859375, + "learning_rate": 0.00018009818401869637, + "loss": 0.9213, + "step": 13016 + }, + { + "epoch": 0.3342400593143226, + "grad_norm": 0.68359375, + "learning_rate": 0.00018009551127203102, + "loss": 0.8357, + "step": 13017 + }, + { + "epoch": 0.3342657365102444, + "grad_norm": 1.6171875, + "learning_rate": 0.00018009283836574252, + "loss": 1.1239, + "step": 13018 + }, + { + "epoch": 0.3342914137061662, + "grad_norm": 0.81640625, + "learning_rate": 0.00018009016529983626, + "loss": 1.1022, + "step": 13019 + }, + { + "epoch": 0.33431709090208805, + "grad_norm": 0.7109375, + "learning_rate": 0.00018008749207431752, + "loss": 0.8568, + "step": 13020 + }, + { + "epoch": 0.33434276809800983, + "grad_norm": 0.8984375, + "learning_rate": 0.00018008481868919164, + "loss": 0.9794, + "step": 13021 + }, + { + "epoch": 0.33436844529393167, + "grad_norm": 0.75, + "learning_rate": 0.00018008214514446394, + "loss": 0.9704, + "step": 13022 + }, + { + "epoch": 0.3343941224898535, + "grad_norm": 0.80078125, + "learning_rate": 0.00018007947144013978, + "loss": 0.9471, + "step": 13023 + }, + { + "epoch": 0.3344197996857753, + "grad_norm": 0.84375, + "learning_rate": 0.00018007679757622448, + "loss": 0.9733, + "step": 13024 + }, + { + "epoch": 0.3344454768816971, + "grad_norm": 0.828125, + "learning_rate": 0.00018007412355272335, + "loss": 1.0404, + "step": 13025 + }, + { + "epoch": 0.33447115407761896, + "grad_norm": 0.71484375, + "learning_rate": 0.00018007144936964172, + "loss": 0.85, + "step": 13026 + }, + { + "epoch": 0.3344968312735408, + "grad_norm": 0.859375, + "learning_rate": 0.00018006877502698492, + "loss": 1.1166, + "step": 13027 + }, + { + "epoch": 0.3345225084694626, + "grad_norm": 0.7265625, + "learning_rate": 0.00018006610052475827, + "loss": 0.8706, + "step": 13028 + }, + { + "epoch": 0.3345481856653844, + "grad_norm": 0.78515625, + "learning_rate": 0.00018006342586296715, + "loss": 0.9175, + "step": 13029 + }, + { + "epoch": 0.33457386286130625, + "grad_norm": 0.7578125, + "learning_rate": 0.00018006075104161684, + "loss": 0.9589, + "step": 13030 + }, + { + "epoch": 0.33459954005722803, + "grad_norm": 0.89453125, + "learning_rate": 0.00018005807606071266, + "loss": 1.1267, + "step": 13031 + }, + { + "epoch": 0.33462521725314986, + "grad_norm": 0.80859375, + "learning_rate": 0.00018005540092025997, + "loss": 0.9801, + "step": 13032 + }, + { + "epoch": 0.3346508944490717, + "grad_norm": 0.7890625, + "learning_rate": 0.00018005272562026411, + "loss": 0.9639, + "step": 13033 + }, + { + "epoch": 0.3346765716449935, + "grad_norm": 0.74609375, + "learning_rate": 0.00018005005016073043, + "loss": 0.8939, + "step": 13034 + }, + { + "epoch": 0.3347022488409153, + "grad_norm": 0.70703125, + "learning_rate": 0.00018004737454166417, + "loss": 0.9497, + "step": 13035 + }, + { + "epoch": 0.33472792603683715, + "grad_norm": 0.6796875, + "learning_rate": 0.00018004469876307076, + "loss": 0.811, + "step": 13036 + }, + { + "epoch": 0.334753603232759, + "grad_norm": 0.8203125, + "learning_rate": 0.0001800420228249555, + "loss": 0.9072, + "step": 13037 + }, + { + "epoch": 0.33477928042868077, + "grad_norm": 0.7578125, + "learning_rate": 0.0001800393467273237, + "loss": 0.8263, + "step": 13038 + }, + { + "epoch": 0.3348049576246026, + "grad_norm": 0.75, + "learning_rate": 0.0001800366704701807, + "loss": 0.8951, + "step": 13039 + }, + { + "epoch": 0.33483063482052444, + "grad_norm": 0.8125, + "learning_rate": 0.00018003399405353186, + "loss": 0.9145, + "step": 13040 + }, + { + "epoch": 0.3348563120164462, + "grad_norm": 0.79296875, + "learning_rate": 0.00018003131747738253, + "loss": 1.0257, + "step": 13041 + }, + { + "epoch": 0.33488198921236806, + "grad_norm": 0.75390625, + "learning_rate": 0.00018002864074173798, + "loss": 0.8592, + "step": 13042 + }, + { + "epoch": 0.3349076664082899, + "grad_norm": 0.7890625, + "learning_rate": 0.0001800259638466036, + "loss": 0.9558, + "step": 13043 + }, + { + "epoch": 0.3349333436042117, + "grad_norm": 0.78515625, + "learning_rate": 0.00018002328679198469, + "loss": 0.9688, + "step": 13044 + }, + { + "epoch": 0.3349590208001335, + "grad_norm": 0.70703125, + "learning_rate": 0.0001800206095778866, + "loss": 0.9085, + "step": 13045 + }, + { + "epoch": 0.33498469799605535, + "grad_norm": 0.77734375, + "learning_rate": 0.00018001793220431464, + "loss": 0.9829, + "step": 13046 + }, + { + "epoch": 0.3350103751919772, + "grad_norm": 0.7734375, + "learning_rate": 0.0001800152546712742, + "loss": 0.9779, + "step": 13047 + }, + { + "epoch": 0.33503605238789896, + "grad_norm": 0.84375, + "learning_rate": 0.00018001257697877057, + "loss": 0.9425, + "step": 13048 + }, + { + "epoch": 0.3350617295838208, + "grad_norm": 0.79296875, + "learning_rate": 0.0001800098991268091, + "loss": 0.8844, + "step": 13049 + }, + { + "epoch": 0.33508740677974264, + "grad_norm": 0.76953125, + "learning_rate": 0.00018000722111539513, + "loss": 0.9482, + "step": 13050 + }, + { + "epoch": 0.3351130839756644, + "grad_norm": 0.76171875, + "learning_rate": 0.00018000454294453402, + "loss": 1.0006, + "step": 13051 + }, + { + "epoch": 0.33513876117158625, + "grad_norm": 0.78515625, + "learning_rate": 0.00018000186461423104, + "loss": 0.8924, + "step": 13052 + }, + { + "epoch": 0.3351644383675081, + "grad_norm": 0.84765625, + "learning_rate": 0.0001799991861244916, + "loss": 0.8967, + "step": 13053 + }, + { + "epoch": 0.33519011556342987, + "grad_norm": 0.89453125, + "learning_rate": 0.00017999650747532102, + "loss": 1.1166, + "step": 13054 + }, + { + "epoch": 0.3352157927593517, + "grad_norm": 0.76171875, + "learning_rate": 0.0001799938286667246, + "loss": 0.8154, + "step": 13055 + }, + { + "epoch": 0.33524146995527354, + "grad_norm": 0.78515625, + "learning_rate": 0.00017999114969870773, + "loss": 1.0094, + "step": 13056 + }, + { + "epoch": 0.3352671471511954, + "grad_norm": 0.77734375, + "learning_rate": 0.00017998847057127567, + "loss": 1.0053, + "step": 13057 + }, + { + "epoch": 0.33529282434711716, + "grad_norm": 0.734375, + "learning_rate": 0.00017998579128443386, + "loss": 0.8663, + "step": 13058 + }, + { + "epoch": 0.335318501543039, + "grad_norm": 0.80078125, + "learning_rate": 0.0001799831118381876, + "loss": 0.9263, + "step": 13059 + }, + { + "epoch": 0.33534417873896083, + "grad_norm": 0.8515625, + "learning_rate": 0.0001799804322325422, + "loss": 0.9149, + "step": 13060 + }, + { + "epoch": 0.3353698559348826, + "grad_norm": 0.84375, + "learning_rate": 0.00017997775246750305, + "loss": 1.0704, + "step": 13061 + }, + { + "epoch": 0.33539553313080445, + "grad_norm": 0.74609375, + "learning_rate": 0.0001799750725430754, + "loss": 0.8545, + "step": 13062 + }, + { + "epoch": 0.3354212103267263, + "grad_norm": 0.7890625, + "learning_rate": 0.00017997239245926472, + "loss": 0.8911, + "step": 13063 + }, + { + "epoch": 0.33544688752264806, + "grad_norm": 0.75390625, + "learning_rate": 0.00017996971221607625, + "loss": 0.8183, + "step": 13064 + }, + { + "epoch": 0.3354725647185699, + "grad_norm": 0.859375, + "learning_rate": 0.0001799670318135154, + "loss": 0.9254, + "step": 13065 + }, + { + "epoch": 0.33549824191449173, + "grad_norm": 0.69140625, + "learning_rate": 0.00017996435125158743, + "loss": 0.9008, + "step": 13066 + }, + { + "epoch": 0.33552391911041357, + "grad_norm": 0.83203125, + "learning_rate": 0.00017996167053029778, + "loss": 1.0188, + "step": 13067 + }, + { + "epoch": 0.33554959630633535, + "grad_norm": 0.765625, + "learning_rate": 0.00017995898964965173, + "loss": 1.0979, + "step": 13068 + }, + { + "epoch": 0.3355752735022572, + "grad_norm": 0.8046875, + "learning_rate": 0.00017995630860965462, + "loss": 0.8623, + "step": 13069 + }, + { + "epoch": 0.335600950698179, + "grad_norm": 0.73828125, + "learning_rate": 0.00017995362741031183, + "loss": 1.0317, + "step": 13070 + }, + { + "epoch": 0.3356266278941008, + "grad_norm": 0.76953125, + "learning_rate": 0.00017995094605162865, + "loss": 0.8925, + "step": 13071 + }, + { + "epoch": 0.33565230509002264, + "grad_norm": 0.73828125, + "learning_rate": 0.0001799482645336105, + "loss": 0.9949, + "step": 13072 + }, + { + "epoch": 0.3356779822859445, + "grad_norm": 0.7734375, + "learning_rate": 0.00017994558285626267, + "loss": 0.8156, + "step": 13073 + }, + { + "epoch": 0.33570365948186626, + "grad_norm": 0.80859375, + "learning_rate": 0.00017994290101959047, + "loss": 0.9942, + "step": 13074 + }, + { + "epoch": 0.3357293366777881, + "grad_norm": 1.203125, + "learning_rate": 0.00017994021902359935, + "loss": 0.9612, + "step": 13075 + }, + { + "epoch": 0.33575501387370993, + "grad_norm": 0.80078125, + "learning_rate": 0.00017993753686829455, + "loss": 1.0265, + "step": 13076 + }, + { + "epoch": 0.33578069106963176, + "grad_norm": 0.8046875, + "learning_rate": 0.00017993485455368148, + "loss": 0.8836, + "step": 13077 + }, + { + "epoch": 0.33580636826555355, + "grad_norm": 0.72265625, + "learning_rate": 0.00017993217207976547, + "loss": 0.845, + "step": 13078 + }, + { + "epoch": 0.3358320454614754, + "grad_norm": 0.75390625, + "learning_rate": 0.00017992948944655187, + "loss": 1.0403, + "step": 13079 + }, + { + "epoch": 0.3358577226573972, + "grad_norm": 0.79296875, + "learning_rate": 0.000179926806654046, + "loss": 0.9301, + "step": 13080 + }, + { + "epoch": 0.335883399853319, + "grad_norm": 0.91015625, + "learning_rate": 0.00017992412370225323, + "loss": 0.951, + "step": 13081 + }, + { + "epoch": 0.33590907704924083, + "grad_norm": 0.796875, + "learning_rate": 0.00017992144059117887, + "loss": 0.9425, + "step": 13082 + }, + { + "epoch": 0.33593475424516267, + "grad_norm": 0.76171875, + "learning_rate": 0.00017991875732082836, + "loss": 0.894, + "step": 13083 + }, + { + "epoch": 0.33596043144108445, + "grad_norm": 0.81640625, + "learning_rate": 0.00017991607389120692, + "loss": 0.7868, + "step": 13084 + }, + { + "epoch": 0.3359861086370063, + "grad_norm": 0.828125, + "learning_rate": 0.00017991339030232003, + "loss": 1.1477, + "step": 13085 + }, + { + "epoch": 0.3360117858329281, + "grad_norm": 0.8359375, + "learning_rate": 0.00017991070655417292, + "loss": 1.0056, + "step": 13086 + }, + { + "epoch": 0.33603746302884996, + "grad_norm": 0.85546875, + "learning_rate": 0.00017990802264677101, + "loss": 0.9566, + "step": 13087 + }, + { + "epoch": 0.33606314022477174, + "grad_norm": 0.76171875, + "learning_rate": 0.00017990533858011962, + "loss": 0.9636, + "step": 13088 + }, + { + "epoch": 0.3360888174206936, + "grad_norm": 0.70703125, + "learning_rate": 0.0001799026543542241, + "loss": 0.7976, + "step": 13089 + }, + { + "epoch": 0.3361144946166154, + "grad_norm": 0.80859375, + "learning_rate": 0.00017989996996908982, + "loss": 1.0743, + "step": 13090 + }, + { + "epoch": 0.3361401718125372, + "grad_norm": 0.82421875, + "learning_rate": 0.0001798972854247221, + "loss": 0.8489, + "step": 13091 + }, + { + "epoch": 0.33616584900845903, + "grad_norm": 0.75, + "learning_rate": 0.00017989460072112635, + "loss": 0.8433, + "step": 13092 + }, + { + "epoch": 0.33619152620438086, + "grad_norm": 0.76953125, + "learning_rate": 0.00017989191585830784, + "loss": 0.9993, + "step": 13093 + }, + { + "epoch": 0.33621720340030264, + "grad_norm": 0.8203125, + "learning_rate": 0.00017988923083627197, + "loss": 1.0333, + "step": 13094 + }, + { + "epoch": 0.3362428805962245, + "grad_norm": 0.8046875, + "learning_rate": 0.0001798865456550241, + "loss": 0.9216, + "step": 13095 + }, + { + "epoch": 0.3362685577921463, + "grad_norm": 0.8125, + "learning_rate": 0.00017988386031456951, + "loss": 0.8747, + "step": 13096 + }, + { + "epoch": 0.33629423498806815, + "grad_norm": 0.78125, + "learning_rate": 0.00017988117481491366, + "loss": 0.9312, + "step": 13097 + }, + { + "epoch": 0.33631991218398993, + "grad_norm": 0.8515625, + "learning_rate": 0.00017987848915606184, + "loss": 0.932, + "step": 13098 + }, + { + "epoch": 0.33634558937991177, + "grad_norm": 0.8359375, + "learning_rate": 0.00017987580333801936, + "loss": 1.0668, + "step": 13099 + }, + { + "epoch": 0.3363712665758336, + "grad_norm": 0.875, + "learning_rate": 0.00017987311736079165, + "loss": 1.0207, + "step": 13100 + }, + { + "epoch": 0.3363969437717554, + "grad_norm": 0.7890625, + "learning_rate": 0.00017987043122438402, + "loss": 0.9547, + "step": 13101 + }, + { + "epoch": 0.3364226209676772, + "grad_norm": 0.765625, + "learning_rate": 0.00017986774492880186, + "loss": 0.9956, + "step": 13102 + }, + { + "epoch": 0.33644829816359906, + "grad_norm": 0.6796875, + "learning_rate": 0.00017986505847405047, + "loss": 0.8327, + "step": 13103 + }, + { + "epoch": 0.33647397535952084, + "grad_norm": 0.84765625, + "learning_rate": 0.00017986237186013525, + "loss": 0.8606, + "step": 13104 + }, + { + "epoch": 0.3364996525554427, + "grad_norm": 0.7578125, + "learning_rate": 0.00017985968508706153, + "loss": 0.9253, + "step": 13105 + }, + { + "epoch": 0.3365253297513645, + "grad_norm": 0.734375, + "learning_rate": 0.00017985699815483466, + "loss": 0.9189, + "step": 13106 + }, + { + "epoch": 0.33655100694728635, + "grad_norm": 0.9140625, + "learning_rate": 0.00017985431106346, + "loss": 1.0127, + "step": 13107 + }, + { + "epoch": 0.3365766841432081, + "grad_norm": 0.6953125, + "learning_rate": 0.00017985162381294295, + "loss": 0.9525, + "step": 13108 + }, + { + "epoch": 0.33660236133912996, + "grad_norm": 0.7890625, + "learning_rate": 0.0001798489364032888, + "loss": 0.8867, + "step": 13109 + }, + { + "epoch": 0.3366280385350518, + "grad_norm": 0.859375, + "learning_rate": 0.00017984624883450291, + "loss": 0.9354, + "step": 13110 + }, + { + "epoch": 0.3366537157309736, + "grad_norm": 0.828125, + "learning_rate": 0.0001798435611065907, + "loss": 0.9391, + "step": 13111 + }, + { + "epoch": 0.3366793929268954, + "grad_norm": 0.765625, + "learning_rate": 0.00017984087321955747, + "loss": 0.8229, + "step": 13112 + }, + { + "epoch": 0.33670507012281725, + "grad_norm": 0.84765625, + "learning_rate": 0.0001798381851734086, + "loss": 0.966, + "step": 13113 + }, + { + "epoch": 0.33673074731873903, + "grad_norm": 0.8359375, + "learning_rate": 0.0001798354969681494, + "loss": 0.9034, + "step": 13114 + }, + { + "epoch": 0.33675642451466087, + "grad_norm": 0.796875, + "learning_rate": 0.0001798328086037853, + "loss": 0.8913, + "step": 13115 + }, + { + "epoch": 0.3367821017105827, + "grad_norm": 0.8359375, + "learning_rate": 0.00017983012008032163, + "loss": 1.0984, + "step": 13116 + }, + { + "epoch": 0.33680777890650454, + "grad_norm": 0.77734375, + "learning_rate": 0.00017982743139776376, + "loss": 0.8951, + "step": 13117 + }, + { + "epoch": 0.3368334561024263, + "grad_norm": 0.88671875, + "learning_rate": 0.000179824742556117, + "loss": 0.9311, + "step": 13118 + }, + { + "epoch": 0.33685913329834816, + "grad_norm": 0.7578125, + "learning_rate": 0.00017982205355538672, + "loss": 1.1529, + "step": 13119 + }, + { + "epoch": 0.33688481049427, + "grad_norm": 0.80859375, + "learning_rate": 0.0001798193643955783, + "loss": 1.0368, + "step": 13120 + }, + { + "epoch": 0.3369104876901918, + "grad_norm": 0.796875, + "learning_rate": 0.00017981667507669714, + "loss": 0.8744, + "step": 13121 + }, + { + "epoch": 0.3369361648861136, + "grad_norm": 0.80859375, + "learning_rate": 0.00017981398559874852, + "loss": 0.9686, + "step": 13122 + }, + { + "epoch": 0.33696184208203545, + "grad_norm": 0.77734375, + "learning_rate": 0.00017981129596173788, + "loss": 0.9487, + "step": 13123 + }, + { + "epoch": 0.3369875192779572, + "grad_norm": 0.8125, + "learning_rate": 0.00017980860616567047, + "loss": 1.0068, + "step": 13124 + }, + { + "epoch": 0.33701319647387906, + "grad_norm": 0.796875, + "learning_rate": 0.0001798059162105518, + "loss": 1.0215, + "step": 13125 + }, + { + "epoch": 0.3370388736698009, + "grad_norm": 0.75390625, + "learning_rate": 0.00017980322609638705, + "loss": 0.8463, + "step": 13126 + }, + { + "epoch": 0.33706455086572273, + "grad_norm": 0.8984375, + "learning_rate": 0.00017980053582318176, + "loss": 1.0398, + "step": 13127 + }, + { + "epoch": 0.3370902280616445, + "grad_norm": 0.76953125, + "learning_rate": 0.00017979784539094116, + "loss": 0.9117, + "step": 13128 + }, + { + "epoch": 0.33711590525756635, + "grad_norm": 0.7890625, + "learning_rate": 0.00017979515479967068, + "loss": 0.9459, + "step": 13129 + }, + { + "epoch": 0.3371415824534882, + "grad_norm": 0.796875, + "learning_rate": 0.00017979246404937566, + "loss": 0.9583, + "step": 13130 + }, + { + "epoch": 0.33716725964940997, + "grad_norm": 0.74609375, + "learning_rate": 0.00017978977314006146, + "loss": 1.0161, + "step": 13131 + }, + { + "epoch": 0.3371929368453318, + "grad_norm": 0.76953125, + "learning_rate": 0.00017978708207173346, + "loss": 0.8056, + "step": 13132 + }, + { + "epoch": 0.33721861404125364, + "grad_norm": 0.73828125, + "learning_rate": 0.00017978439084439702, + "loss": 0.9155, + "step": 13133 + }, + { + "epoch": 0.3372442912371754, + "grad_norm": 0.74609375, + "learning_rate": 0.0001797816994580575, + "loss": 1.0411, + "step": 13134 + }, + { + "epoch": 0.33726996843309726, + "grad_norm": 0.796875, + "learning_rate": 0.00017977900791272026, + "loss": 0.9163, + "step": 13135 + }, + { + "epoch": 0.3372956456290191, + "grad_norm": 0.75, + "learning_rate": 0.00017977631620839064, + "loss": 0.8873, + "step": 13136 + }, + { + "epoch": 0.33732132282494093, + "grad_norm": 0.80859375, + "learning_rate": 0.00017977362434507406, + "loss": 0.893, + "step": 13137 + }, + { + "epoch": 0.3373470000208627, + "grad_norm": 0.7890625, + "learning_rate": 0.0001797709323227758, + "loss": 0.9229, + "step": 13138 + }, + { + "epoch": 0.33737267721678454, + "grad_norm": 0.76953125, + "learning_rate": 0.00017976824014150132, + "loss": 1.0475, + "step": 13139 + }, + { + "epoch": 0.3373983544127064, + "grad_norm": 0.8203125, + "learning_rate": 0.00017976554780125593, + "loss": 0.9944, + "step": 13140 + }, + { + "epoch": 0.33742403160862816, + "grad_norm": 0.69921875, + "learning_rate": 0.00017976285530204504, + "loss": 0.9742, + "step": 13141 + }, + { + "epoch": 0.33744970880455, + "grad_norm": 0.76953125, + "learning_rate": 0.00017976016264387395, + "loss": 0.8894, + "step": 13142 + }, + { + "epoch": 0.33747538600047183, + "grad_norm": 0.78125, + "learning_rate": 0.00017975746982674807, + "loss": 1.0392, + "step": 13143 + }, + { + "epoch": 0.3375010631963936, + "grad_norm": 0.765625, + "learning_rate": 0.00017975477685067275, + "loss": 0.9219, + "step": 13144 + }, + { + "epoch": 0.33752674039231545, + "grad_norm": 0.79296875, + "learning_rate": 0.00017975208371565334, + "loss": 0.9521, + "step": 13145 + }, + { + "epoch": 0.3375524175882373, + "grad_norm": 1.625, + "learning_rate": 0.00017974939042169526, + "loss": 0.8935, + "step": 13146 + }, + { + "epoch": 0.3375780947841591, + "grad_norm": 0.70703125, + "learning_rate": 0.00017974669696880387, + "loss": 0.8534, + "step": 13147 + }, + { + "epoch": 0.3376037719800809, + "grad_norm": 0.79296875, + "learning_rate": 0.0001797440033569845, + "loss": 0.9448, + "step": 13148 + }, + { + "epoch": 0.33762944917600274, + "grad_norm": 0.8125, + "learning_rate": 0.00017974130958624255, + "loss": 0.9594, + "step": 13149 + }, + { + "epoch": 0.3376551263719246, + "grad_norm": 0.80078125, + "learning_rate": 0.0001797386156565833, + "loss": 0.8744, + "step": 13150 + }, + { + "epoch": 0.33768080356784635, + "grad_norm": 0.80078125, + "learning_rate": 0.00017973592156801226, + "loss": 1.0331, + "step": 13151 + }, + { + "epoch": 0.3377064807637682, + "grad_norm": 0.80859375, + "learning_rate": 0.00017973322732053472, + "loss": 0.9799, + "step": 13152 + }, + { + "epoch": 0.33773215795969, + "grad_norm": 0.7734375, + "learning_rate": 0.00017973053291415604, + "loss": 1.0022, + "step": 13153 + }, + { + "epoch": 0.3377578351556118, + "grad_norm": 0.73828125, + "learning_rate": 0.00017972783834888166, + "loss": 0.85, + "step": 13154 + }, + { + "epoch": 0.33778351235153364, + "grad_norm": 0.83203125, + "learning_rate": 0.00017972514362471685, + "loss": 1.0297, + "step": 13155 + }, + { + "epoch": 0.3378091895474555, + "grad_norm": 3.15625, + "learning_rate": 0.00017972244874166703, + "loss": 0.8752, + "step": 13156 + }, + { + "epoch": 0.3378348667433773, + "grad_norm": 0.8828125, + "learning_rate": 0.0001797197536997376, + "loss": 0.9812, + "step": 13157 + }, + { + "epoch": 0.3378605439392991, + "grad_norm": 0.81640625, + "learning_rate": 0.0001797170584989339, + "loss": 1.0479, + "step": 13158 + }, + { + "epoch": 0.33788622113522093, + "grad_norm": 0.74609375, + "learning_rate": 0.00017971436313926126, + "loss": 0.921, + "step": 13159 + }, + { + "epoch": 0.33791189833114277, + "grad_norm": 0.68359375, + "learning_rate": 0.00017971166762072516, + "loss": 0.9033, + "step": 13160 + }, + { + "epoch": 0.33793757552706455, + "grad_norm": 0.7109375, + "learning_rate": 0.00017970897194333088, + "loss": 0.9592, + "step": 13161 + }, + { + "epoch": 0.3379632527229864, + "grad_norm": 0.83984375, + "learning_rate": 0.0001797062761070838, + "loss": 1.0577, + "step": 13162 + }, + { + "epoch": 0.3379889299189082, + "grad_norm": 0.8125, + "learning_rate": 0.0001797035801119893, + "loss": 0.9412, + "step": 13163 + }, + { + "epoch": 0.33801460711483, + "grad_norm": 0.8203125, + "learning_rate": 0.0001797008839580528, + "loss": 1.0156, + "step": 13164 + }, + { + "epoch": 0.33804028431075184, + "grad_norm": 0.8203125, + "learning_rate": 0.00017969818764527963, + "loss": 0.9289, + "step": 13165 + }, + { + "epoch": 0.3380659615066737, + "grad_norm": 0.7734375, + "learning_rate": 0.00017969549117367517, + "loss": 0.9157, + "step": 13166 + }, + { + "epoch": 0.3380916387025955, + "grad_norm": 0.765625, + "learning_rate": 0.0001796927945432448, + "loss": 0.8917, + "step": 13167 + }, + { + "epoch": 0.3381173158985173, + "grad_norm": 0.78515625, + "learning_rate": 0.00017969009775399387, + "loss": 0.9646, + "step": 13168 + }, + { + "epoch": 0.3381429930944391, + "grad_norm": 0.74609375, + "learning_rate": 0.0001796874008059278, + "loss": 0.9263, + "step": 13169 + }, + { + "epoch": 0.33816867029036096, + "grad_norm": 0.75390625, + "learning_rate": 0.0001796847036990519, + "loss": 0.983, + "step": 13170 + }, + { + "epoch": 0.33819434748628274, + "grad_norm": 0.8046875, + "learning_rate": 0.0001796820064333716, + "loss": 0.9482, + "step": 13171 + }, + { + "epoch": 0.3382200246822046, + "grad_norm": 0.8203125, + "learning_rate": 0.00017967930900889228, + "loss": 1.0246, + "step": 13172 + }, + { + "epoch": 0.3382457018781264, + "grad_norm": 0.71484375, + "learning_rate": 0.00017967661142561926, + "loss": 0.8502, + "step": 13173 + }, + { + "epoch": 0.3382713790740482, + "grad_norm": 0.8125, + "learning_rate": 0.000179673913683558, + "loss": 0.9699, + "step": 13174 + }, + { + "epoch": 0.33829705626997003, + "grad_norm": 0.74609375, + "learning_rate": 0.00017967121578271378, + "loss": 0.8457, + "step": 13175 + }, + { + "epoch": 0.33832273346589187, + "grad_norm": 0.77734375, + "learning_rate": 0.00017966851772309203, + "loss": 0.8242, + "step": 13176 + }, + { + "epoch": 0.3383484106618137, + "grad_norm": 0.90234375, + "learning_rate": 0.00017966581950469815, + "loss": 0.9658, + "step": 13177 + }, + { + "epoch": 0.3383740878577355, + "grad_norm": 0.703125, + "learning_rate": 0.00017966312112753746, + "loss": 0.9254, + "step": 13178 + }, + { + "epoch": 0.3383997650536573, + "grad_norm": 0.73828125, + "learning_rate": 0.00017966042259161537, + "loss": 0.9542, + "step": 13179 + }, + { + "epoch": 0.33842544224957916, + "grad_norm": 0.796875, + "learning_rate": 0.00017965772389693724, + "loss": 1.0231, + "step": 13180 + }, + { + "epoch": 0.33845111944550094, + "grad_norm": 0.796875, + "learning_rate": 0.0001796550250435085, + "loss": 0.9634, + "step": 13181 + }, + { + "epoch": 0.3384767966414228, + "grad_norm": 0.95703125, + "learning_rate": 0.00017965232603133448, + "loss": 0.8783, + "step": 13182 + }, + { + "epoch": 0.3385024738373446, + "grad_norm": 0.75, + "learning_rate": 0.00017964962686042055, + "loss": 0.9072, + "step": 13183 + }, + { + "epoch": 0.3385281510332664, + "grad_norm": 0.78125, + "learning_rate": 0.00017964692753077213, + "loss": 0.9541, + "step": 13184 + }, + { + "epoch": 0.3385538282291882, + "grad_norm": 0.73046875, + "learning_rate": 0.00017964422804239455, + "loss": 0.9982, + "step": 13185 + }, + { + "epoch": 0.33857950542511006, + "grad_norm": 0.84765625, + "learning_rate": 0.00017964152839529326, + "loss": 1.0547, + "step": 13186 + }, + { + "epoch": 0.3386051826210319, + "grad_norm": 0.81640625, + "learning_rate": 0.00017963882858947354, + "loss": 0.9893, + "step": 13187 + }, + { + "epoch": 0.3386308598169537, + "grad_norm": 0.75, + "learning_rate": 0.00017963612862494088, + "loss": 0.8841, + "step": 13188 + }, + { + "epoch": 0.3386565370128755, + "grad_norm": 0.7109375, + "learning_rate": 0.00017963342850170057, + "loss": 0.9327, + "step": 13189 + }, + { + "epoch": 0.33868221420879735, + "grad_norm": 0.7578125, + "learning_rate": 0.00017963072821975805, + "loss": 0.9299, + "step": 13190 + }, + { + "epoch": 0.33870789140471913, + "grad_norm": 0.7890625, + "learning_rate": 0.0001796280277791187, + "loss": 0.8097, + "step": 13191 + }, + { + "epoch": 0.33873356860064097, + "grad_norm": 0.78515625, + "learning_rate": 0.00017962532717978784, + "loss": 0.9292, + "step": 13192 + }, + { + "epoch": 0.3387592457965628, + "grad_norm": 0.73046875, + "learning_rate": 0.00017962262642177092, + "loss": 0.9756, + "step": 13193 + }, + { + "epoch": 0.3387849229924846, + "grad_norm": 0.83203125, + "learning_rate": 0.00017961992550507333, + "loss": 0.9156, + "step": 13194 + }, + { + "epoch": 0.3388106001884064, + "grad_norm": 0.78125, + "learning_rate": 0.00017961722442970035, + "loss": 0.8912, + "step": 13195 + }, + { + "epoch": 0.33883627738432825, + "grad_norm": 0.828125, + "learning_rate": 0.00017961452319565748, + "loss": 0.9592, + "step": 13196 + }, + { + "epoch": 0.3388619545802501, + "grad_norm": 0.7265625, + "learning_rate": 0.00017961182180295006, + "loss": 0.9837, + "step": 13197 + }, + { + "epoch": 0.33888763177617187, + "grad_norm": 0.83984375, + "learning_rate": 0.00017960912025158344, + "loss": 0.8598, + "step": 13198 + }, + { + "epoch": 0.3389133089720937, + "grad_norm": 0.73046875, + "learning_rate": 0.00017960641854156305, + "loss": 0.9931, + "step": 13199 + }, + { + "epoch": 0.33893898616801554, + "grad_norm": 0.76953125, + "learning_rate": 0.00017960371667289428, + "loss": 0.9301, + "step": 13200 + }, + { + "epoch": 0.3389646633639373, + "grad_norm": 0.78515625, + "learning_rate": 0.00017960101464558248, + "loss": 0.9625, + "step": 13201 + }, + { + "epoch": 0.33899034055985916, + "grad_norm": 0.84375, + "learning_rate": 0.00017959831245963303, + "loss": 0.974, + "step": 13202 + }, + { + "epoch": 0.339016017755781, + "grad_norm": 0.83984375, + "learning_rate": 0.00017959561011505134, + "loss": 0.9608, + "step": 13203 + }, + { + "epoch": 0.3390416949517028, + "grad_norm": 0.76953125, + "learning_rate": 0.0001795929076118428, + "loss": 1.0457, + "step": 13204 + }, + { + "epoch": 0.3390673721476246, + "grad_norm": 0.8203125, + "learning_rate": 0.0001795902049500128, + "loss": 0.9725, + "step": 13205 + }, + { + "epoch": 0.33909304934354645, + "grad_norm": 0.69921875, + "learning_rate": 0.00017958750212956667, + "loss": 0.7369, + "step": 13206 + }, + { + "epoch": 0.3391187265394683, + "grad_norm": 0.87890625, + "learning_rate": 0.00017958479915050987, + "loss": 0.9912, + "step": 13207 + }, + { + "epoch": 0.33914440373539007, + "grad_norm": 0.76171875, + "learning_rate": 0.00017958209601284773, + "loss": 0.9455, + "step": 13208 + }, + { + "epoch": 0.3391700809313119, + "grad_norm": 0.796875, + "learning_rate": 0.00017957939271658567, + "loss": 0.8513, + "step": 13209 + }, + { + "epoch": 0.33919575812723374, + "grad_norm": 0.79296875, + "learning_rate": 0.00017957668926172908, + "loss": 1.0624, + "step": 13210 + }, + { + "epoch": 0.3392214353231555, + "grad_norm": 0.79296875, + "learning_rate": 0.00017957398564828333, + "loss": 1.0647, + "step": 13211 + }, + { + "epoch": 0.33924711251907735, + "grad_norm": 0.82421875, + "learning_rate": 0.00017957128187625382, + "loss": 0.838, + "step": 13212 + }, + { + "epoch": 0.3392727897149992, + "grad_norm": 0.80859375, + "learning_rate": 0.00017956857794564592, + "loss": 1.1776, + "step": 13213 + }, + { + "epoch": 0.33929846691092097, + "grad_norm": 0.7109375, + "learning_rate": 0.00017956587385646505, + "loss": 0.8762, + "step": 13214 + }, + { + "epoch": 0.3393241441068428, + "grad_norm": 0.8671875, + "learning_rate": 0.00017956316960871655, + "loss": 0.971, + "step": 13215 + }, + { + "epoch": 0.33934982130276464, + "grad_norm": 0.796875, + "learning_rate": 0.00017956046520240586, + "loss": 0.9927, + "step": 13216 + }, + { + "epoch": 0.3393754984986865, + "grad_norm": 0.74609375, + "learning_rate": 0.00017955776063753836, + "loss": 0.9442, + "step": 13217 + }, + { + "epoch": 0.33940117569460826, + "grad_norm": 0.78515625, + "learning_rate": 0.00017955505591411942, + "loss": 0.8055, + "step": 13218 + }, + { + "epoch": 0.3394268528905301, + "grad_norm": 0.76171875, + "learning_rate": 0.0001795523510321544, + "loss": 0.9873, + "step": 13219 + }, + { + "epoch": 0.33945253008645193, + "grad_norm": 0.8828125, + "learning_rate": 0.00017954964599164878, + "loss": 1.0275, + "step": 13220 + }, + { + "epoch": 0.3394782072823737, + "grad_norm": 0.765625, + "learning_rate": 0.0001795469407926079, + "loss": 0.9109, + "step": 13221 + }, + { + "epoch": 0.33950388447829555, + "grad_norm": 0.82421875, + "learning_rate": 0.00017954423543503712, + "loss": 0.9841, + "step": 13222 + }, + { + "epoch": 0.3395295616742174, + "grad_norm": 0.8125, + "learning_rate": 0.00017954152991894192, + "loss": 0.9619, + "step": 13223 + }, + { + "epoch": 0.33955523887013916, + "grad_norm": 0.73828125, + "learning_rate": 0.00017953882424432758, + "loss": 0.9006, + "step": 13224 + }, + { + "epoch": 0.339580916066061, + "grad_norm": 0.83984375, + "learning_rate": 0.00017953611841119957, + "loss": 0.9586, + "step": 13225 + }, + { + "epoch": 0.33960659326198284, + "grad_norm": 0.7734375, + "learning_rate": 0.00017953341241956324, + "loss": 0.8588, + "step": 13226 + }, + { + "epoch": 0.3396322704579046, + "grad_norm": 0.78515625, + "learning_rate": 0.00017953070626942403, + "loss": 1.0291, + "step": 13227 + }, + { + "epoch": 0.33965794765382645, + "grad_norm": 0.82421875, + "learning_rate": 0.00017952799996078731, + "loss": 1.0466, + "step": 13228 + }, + { + "epoch": 0.3396836248497483, + "grad_norm": 0.87109375, + "learning_rate": 0.00017952529349365848, + "loss": 0.9898, + "step": 13229 + }, + { + "epoch": 0.3397093020456701, + "grad_norm": 0.80859375, + "learning_rate": 0.0001795225868680429, + "loss": 0.9742, + "step": 13230 + }, + { + "epoch": 0.3397349792415919, + "grad_norm": 0.83984375, + "learning_rate": 0.000179519880083946, + "loss": 1.0218, + "step": 13231 + }, + { + "epoch": 0.33976065643751374, + "grad_norm": 0.75390625, + "learning_rate": 0.00017951717314137314, + "loss": 0.9234, + "step": 13232 + }, + { + "epoch": 0.3397863336334356, + "grad_norm": 0.79296875, + "learning_rate": 0.00017951446604032973, + "loss": 0.9297, + "step": 13233 + }, + { + "epoch": 0.33981201082935736, + "grad_norm": 0.80078125, + "learning_rate": 0.00017951175878082117, + "loss": 0.8398, + "step": 13234 + }, + { + "epoch": 0.3398376880252792, + "grad_norm": 0.76953125, + "learning_rate": 0.0001795090513628529, + "loss": 1.026, + "step": 13235 + }, + { + "epoch": 0.33986336522120103, + "grad_norm": 0.78125, + "learning_rate": 0.00017950634378643024, + "loss": 0.8357, + "step": 13236 + }, + { + "epoch": 0.3398890424171228, + "grad_norm": 0.65625, + "learning_rate": 0.00017950363605155864, + "loss": 0.7199, + "step": 13237 + }, + { + "epoch": 0.33991471961304465, + "grad_norm": 0.87109375, + "learning_rate": 0.00017950092815824346, + "loss": 1.0179, + "step": 13238 + }, + { + "epoch": 0.3399403968089665, + "grad_norm": 0.7734375, + "learning_rate": 0.0001794982201064901, + "loss": 0.9175, + "step": 13239 + }, + { + "epoch": 0.3399660740048883, + "grad_norm": 0.703125, + "learning_rate": 0.000179495511896304, + "loss": 0.8889, + "step": 13240 + }, + { + "epoch": 0.3399917512008101, + "grad_norm": 0.765625, + "learning_rate": 0.00017949280352769052, + "loss": 0.8379, + "step": 13241 + }, + { + "epoch": 0.34001742839673194, + "grad_norm": 0.796875, + "learning_rate": 0.00017949009500065502, + "loss": 0.8921, + "step": 13242 + }, + { + "epoch": 0.34004310559265377, + "grad_norm": 0.8359375, + "learning_rate": 0.00017948738631520298, + "loss": 1.1115, + "step": 13243 + }, + { + "epoch": 0.34006878278857555, + "grad_norm": 0.75, + "learning_rate": 0.00017948467747133976, + "loss": 0.9174, + "step": 13244 + }, + { + "epoch": 0.3400944599844974, + "grad_norm": 0.78515625, + "learning_rate": 0.00017948196846907075, + "loss": 0.856, + "step": 13245 + }, + { + "epoch": 0.3401201371804192, + "grad_norm": 0.8125, + "learning_rate": 0.00017947925930840138, + "loss": 1.124, + "step": 13246 + }, + { + "epoch": 0.340145814376341, + "grad_norm": 0.80078125, + "learning_rate": 0.00017947654998933698, + "loss": 0.8883, + "step": 13247 + }, + { + "epoch": 0.34017149157226284, + "grad_norm": 0.7109375, + "learning_rate": 0.000179473840511883, + "loss": 0.8701, + "step": 13248 + }, + { + "epoch": 0.3401971687681847, + "grad_norm": 0.75, + "learning_rate": 0.00017947113087604487, + "loss": 0.9367, + "step": 13249 + }, + { + "epoch": 0.3402228459641065, + "grad_norm": 0.890625, + "learning_rate": 0.00017946842108182792, + "loss": 0.7404, + "step": 13250 + }, + { + "epoch": 0.3402485231600283, + "grad_norm": 0.734375, + "learning_rate": 0.00017946571112923763, + "loss": 0.8195, + "step": 13251 + }, + { + "epoch": 0.34027420035595013, + "grad_norm": 0.7890625, + "learning_rate": 0.0001794630010182793, + "loss": 0.8934, + "step": 13252 + }, + { + "epoch": 0.34029987755187197, + "grad_norm": 0.78515625, + "learning_rate": 0.00017946029074895843, + "loss": 0.9619, + "step": 13253 + }, + { + "epoch": 0.34032555474779375, + "grad_norm": 0.78515625, + "learning_rate": 0.00017945758032128034, + "loss": 0.9607, + "step": 13254 + }, + { + "epoch": 0.3403512319437156, + "grad_norm": 0.81640625, + "learning_rate": 0.0001794548697352505, + "loss": 1.0778, + "step": 13255 + }, + { + "epoch": 0.3403769091396374, + "grad_norm": 0.7890625, + "learning_rate": 0.0001794521589908743, + "loss": 0.9531, + "step": 13256 + }, + { + "epoch": 0.3404025863355592, + "grad_norm": 0.77734375, + "learning_rate": 0.00017944944808815706, + "loss": 0.8809, + "step": 13257 + }, + { + "epoch": 0.34042826353148103, + "grad_norm": 0.74609375, + "learning_rate": 0.0001794467370271043, + "loss": 1.0034, + "step": 13258 + }, + { + "epoch": 0.34045394072740287, + "grad_norm": 0.78125, + "learning_rate": 0.00017944402580772134, + "loss": 0.9195, + "step": 13259 + }, + { + "epoch": 0.3404796179233247, + "grad_norm": 0.77734375, + "learning_rate": 0.00017944131443001365, + "loss": 0.8295, + "step": 13260 + }, + { + "epoch": 0.3405052951192465, + "grad_norm": 0.80859375, + "learning_rate": 0.00017943860289398655, + "loss": 1.0332, + "step": 13261 + }, + { + "epoch": 0.3405309723151683, + "grad_norm": 0.77734375, + "learning_rate": 0.00017943589119964553, + "loss": 0.915, + "step": 13262 + }, + { + "epoch": 0.34055664951109016, + "grad_norm": 0.7265625, + "learning_rate": 0.00017943317934699591, + "loss": 0.9225, + "step": 13263 + }, + { + "epoch": 0.34058232670701194, + "grad_norm": 0.79296875, + "learning_rate": 0.0001794304673360432, + "loss": 1.0852, + "step": 13264 + }, + { + "epoch": 0.3406080039029338, + "grad_norm": 0.7734375, + "learning_rate": 0.0001794277551667927, + "loss": 0.9581, + "step": 13265 + }, + { + "epoch": 0.3406336810988556, + "grad_norm": 0.76953125, + "learning_rate": 0.00017942504283924987, + "loss": 1.0134, + "step": 13266 + }, + { + "epoch": 0.3406593582947774, + "grad_norm": 0.73828125, + "learning_rate": 0.00017942233035342012, + "loss": 0.8674, + "step": 13267 + }, + { + "epoch": 0.34068503549069923, + "grad_norm": 0.81640625, + "learning_rate": 0.0001794196177093088, + "loss": 1.1068, + "step": 13268 + }, + { + "epoch": 0.34071071268662106, + "grad_norm": 0.8515625, + "learning_rate": 0.00017941690490692137, + "loss": 0.969, + "step": 13269 + }, + { + "epoch": 0.3407363898825429, + "grad_norm": 0.7890625, + "learning_rate": 0.00017941419194626322, + "loss": 0.9206, + "step": 13270 + }, + { + "epoch": 0.3407620670784647, + "grad_norm": 0.78125, + "learning_rate": 0.00017941147882733976, + "loss": 0.8841, + "step": 13271 + }, + { + "epoch": 0.3407877442743865, + "grad_norm": 0.8359375, + "learning_rate": 0.00017940876555015643, + "loss": 0.9464, + "step": 13272 + }, + { + "epoch": 0.34081342147030835, + "grad_norm": 0.734375, + "learning_rate": 0.00017940605211471858, + "loss": 0.8265, + "step": 13273 + }, + { + "epoch": 0.34083909866623013, + "grad_norm": 0.83203125, + "learning_rate": 0.00017940333852103164, + "loss": 0.9456, + "step": 13274 + }, + { + "epoch": 0.34086477586215197, + "grad_norm": 0.84375, + "learning_rate": 0.000179400624769101, + "loss": 1.0381, + "step": 13275 + }, + { + "epoch": 0.3408904530580738, + "grad_norm": 0.8203125, + "learning_rate": 0.0001793979108589321, + "loss": 1.0175, + "step": 13276 + }, + { + "epoch": 0.3409161302539956, + "grad_norm": 0.78515625, + "learning_rate": 0.00017939519679053035, + "loss": 0.8963, + "step": 13277 + }, + { + "epoch": 0.3409418074499174, + "grad_norm": 0.7890625, + "learning_rate": 0.00017939248256390114, + "loss": 0.9466, + "step": 13278 + }, + { + "epoch": 0.34096748464583926, + "grad_norm": 0.75390625, + "learning_rate": 0.00017938976817904987, + "loss": 0.9859, + "step": 13279 + }, + { + "epoch": 0.3409931618417611, + "grad_norm": 0.8125, + "learning_rate": 0.00017938705363598198, + "loss": 0.9058, + "step": 13280 + }, + { + "epoch": 0.3410188390376829, + "grad_norm": 0.734375, + "learning_rate": 0.00017938433893470283, + "loss": 0.7748, + "step": 13281 + }, + { + "epoch": 0.3410445162336047, + "grad_norm": 0.765625, + "learning_rate": 0.0001793816240752179, + "loss": 0.9406, + "step": 13282 + }, + { + "epoch": 0.34107019342952655, + "grad_norm": 0.76953125, + "learning_rate": 0.00017937890905753257, + "loss": 0.9565, + "step": 13283 + }, + { + "epoch": 0.3410958706254483, + "grad_norm": 0.8359375, + "learning_rate": 0.0001793761938816522, + "loss": 0.9635, + "step": 13284 + }, + { + "epoch": 0.34112154782137016, + "grad_norm": 0.7421875, + "learning_rate": 0.0001793734785475823, + "loss": 0.9239, + "step": 13285 + }, + { + "epoch": 0.341147225017292, + "grad_norm": 0.80859375, + "learning_rate": 0.0001793707630553282, + "loss": 0.9225, + "step": 13286 + }, + { + "epoch": 0.3411729022132138, + "grad_norm": 0.765625, + "learning_rate": 0.0001793680474048953, + "loss": 0.9612, + "step": 13287 + }, + { + "epoch": 0.3411985794091356, + "grad_norm": 0.765625, + "learning_rate": 0.0001793653315962891, + "loss": 0.9209, + "step": 13288 + }, + { + "epoch": 0.34122425660505745, + "grad_norm": 0.83203125, + "learning_rate": 0.00017936261562951494, + "loss": 0.9454, + "step": 13289 + }, + { + "epoch": 0.3412499338009793, + "grad_norm": 0.77734375, + "learning_rate": 0.0001793598995045783, + "loss": 0.9729, + "step": 13290 + }, + { + "epoch": 0.34127561099690107, + "grad_norm": 0.796875, + "learning_rate": 0.0001793571832214845, + "loss": 0.8928, + "step": 13291 + }, + { + "epoch": 0.3413012881928229, + "grad_norm": 0.77734375, + "learning_rate": 0.000179354466780239, + "loss": 0.9006, + "step": 13292 + }, + { + "epoch": 0.34132696538874474, + "grad_norm": 0.828125, + "learning_rate": 0.00017935175018084728, + "loss": 0.9591, + "step": 13293 + }, + { + "epoch": 0.3413526425846665, + "grad_norm": 0.7890625, + "learning_rate": 0.0001793490334233146, + "loss": 0.9654, + "step": 13294 + }, + { + "epoch": 0.34137831978058836, + "grad_norm": 0.80078125, + "learning_rate": 0.00017934631650764652, + "loss": 0.9319, + "step": 13295 + }, + { + "epoch": 0.3414039969765102, + "grad_norm": 0.76171875, + "learning_rate": 0.0001793435994338484, + "loss": 0.9642, + "step": 13296 + }, + { + "epoch": 0.341429674172432, + "grad_norm": 0.7265625, + "learning_rate": 0.0001793408822019256, + "loss": 0.9703, + "step": 13297 + }, + { + "epoch": 0.3414553513683538, + "grad_norm": 0.78515625, + "learning_rate": 0.00017933816481188365, + "loss": 1.0764, + "step": 13298 + }, + { + "epoch": 0.34148102856427565, + "grad_norm": 0.83984375, + "learning_rate": 0.0001793354472637279, + "loss": 0.8667, + "step": 13299 + }, + { + "epoch": 0.3415067057601975, + "grad_norm": 0.83203125, + "learning_rate": 0.00017933272955746374, + "loss": 0.9874, + "step": 13300 + }, + { + "epoch": 0.34153238295611926, + "grad_norm": 0.734375, + "learning_rate": 0.0001793300116930966, + "loss": 0.9729, + "step": 13301 + }, + { + "epoch": 0.3415580601520411, + "grad_norm": 0.76953125, + "learning_rate": 0.00017932729367063196, + "loss": 0.9924, + "step": 13302 + }, + { + "epoch": 0.34158373734796293, + "grad_norm": 1.1640625, + "learning_rate": 0.0001793245754900752, + "loss": 1.0499, + "step": 13303 + }, + { + "epoch": 0.3416094145438847, + "grad_norm": 0.7734375, + "learning_rate": 0.00017932185715143167, + "loss": 1.0002, + "step": 13304 + }, + { + "epoch": 0.34163509173980655, + "grad_norm": 0.78125, + "learning_rate": 0.00017931913865470688, + "loss": 0.866, + "step": 13305 + }, + { + "epoch": 0.3416607689357284, + "grad_norm": 0.8515625, + "learning_rate": 0.0001793164199999062, + "loss": 1.0772, + "step": 13306 + }, + { + "epoch": 0.34168644613165017, + "grad_norm": 0.88671875, + "learning_rate": 0.00017931370118703507, + "loss": 1.0776, + "step": 13307 + }, + { + "epoch": 0.341712123327572, + "grad_norm": 0.765625, + "learning_rate": 0.0001793109822160989, + "loss": 0.9758, + "step": 13308 + }, + { + "epoch": 0.34173780052349384, + "grad_norm": 0.76171875, + "learning_rate": 0.00017930826308710306, + "loss": 0.8847, + "step": 13309 + }, + { + "epoch": 0.3417634777194157, + "grad_norm": 0.8125, + "learning_rate": 0.00017930554380005308, + "loss": 1.038, + "step": 13310 + }, + { + "epoch": 0.34178915491533746, + "grad_norm": 0.76953125, + "learning_rate": 0.00017930282435495428, + "loss": 0.9145, + "step": 13311 + }, + { + "epoch": 0.3418148321112593, + "grad_norm": 0.7421875, + "learning_rate": 0.00017930010475181212, + "loss": 0.9629, + "step": 13312 + }, + { + "epoch": 0.34184050930718113, + "grad_norm": 0.7890625, + "learning_rate": 0.000179297384990632, + "loss": 1.0549, + "step": 13313 + }, + { + "epoch": 0.3418661865031029, + "grad_norm": 0.76171875, + "learning_rate": 0.00017929466507141939, + "loss": 0.8829, + "step": 13314 + }, + { + "epoch": 0.34189186369902475, + "grad_norm": 0.734375, + "learning_rate": 0.00017929194499417966, + "loss": 1.0423, + "step": 13315 + }, + { + "epoch": 0.3419175408949466, + "grad_norm": 0.84375, + "learning_rate": 0.00017928922475891822, + "loss": 0.9074, + "step": 13316 + }, + { + "epoch": 0.34194321809086836, + "grad_norm": 0.80078125, + "learning_rate": 0.00017928650436564055, + "loss": 0.98, + "step": 13317 + }, + { + "epoch": 0.3419688952867902, + "grad_norm": 0.8046875, + "learning_rate": 0.00017928378381435202, + "loss": 1.0122, + "step": 13318 + }, + { + "epoch": 0.34199457248271203, + "grad_norm": 0.859375, + "learning_rate": 0.0001792810631050581, + "loss": 0.9985, + "step": 13319 + }, + { + "epoch": 0.34202024967863387, + "grad_norm": 0.8203125, + "learning_rate": 0.00017927834223776414, + "loss": 0.9244, + "step": 13320 + }, + { + "epoch": 0.34204592687455565, + "grad_norm": 0.8125, + "learning_rate": 0.00017927562121247562, + "loss": 0.9094, + "step": 13321 + }, + { + "epoch": 0.3420716040704775, + "grad_norm": 0.8515625, + "learning_rate": 0.00017927290002919794, + "loss": 0.9828, + "step": 13322 + }, + { + "epoch": 0.3420972812663993, + "grad_norm": 1.3203125, + "learning_rate": 0.00017927017868793655, + "loss": 0.9806, + "step": 13323 + }, + { + "epoch": 0.3421229584623211, + "grad_norm": 0.8046875, + "learning_rate": 0.00017926745718869686, + "loss": 0.8851, + "step": 13324 + }, + { + "epoch": 0.34214863565824294, + "grad_norm": 0.80078125, + "learning_rate": 0.00017926473553148427, + "loss": 0.8751, + "step": 13325 + }, + { + "epoch": 0.3421743128541648, + "grad_norm": 0.73046875, + "learning_rate": 0.00017926201371630423, + "loss": 0.9391, + "step": 13326 + }, + { + "epoch": 0.34219999005008656, + "grad_norm": 0.78125, + "learning_rate": 0.00017925929174316215, + "loss": 0.9869, + "step": 13327 + }, + { + "epoch": 0.3422256672460084, + "grad_norm": 0.7890625, + "learning_rate": 0.00017925656961206347, + "loss": 0.9086, + "step": 13328 + }, + { + "epoch": 0.3422513444419302, + "grad_norm": 1.4296875, + "learning_rate": 0.0001792538473230136, + "loss": 0.9397, + "step": 13329 + }, + { + "epoch": 0.34227702163785206, + "grad_norm": 0.78125, + "learning_rate": 0.00017925112487601798, + "loss": 1.0154, + "step": 13330 + }, + { + "epoch": 0.34230269883377384, + "grad_norm": 0.7578125, + "learning_rate": 0.00017924840227108202, + "loss": 0.9615, + "step": 13331 + }, + { + "epoch": 0.3423283760296957, + "grad_norm": 0.79296875, + "learning_rate": 0.00017924567950821114, + "loss": 0.9496, + "step": 13332 + }, + { + "epoch": 0.3423540532256175, + "grad_norm": 0.828125, + "learning_rate": 0.0001792429565874108, + "loss": 1.0068, + "step": 13333 + }, + { + "epoch": 0.3423797304215393, + "grad_norm": 0.734375, + "learning_rate": 0.00017924023350868642, + "loss": 1.0587, + "step": 13334 + }, + { + "epoch": 0.34240540761746113, + "grad_norm": 0.86328125, + "learning_rate": 0.00017923751027204337, + "loss": 1.0255, + "step": 13335 + }, + { + "epoch": 0.34243108481338297, + "grad_norm": 0.77734375, + "learning_rate": 0.00017923478687748714, + "loss": 0.9345, + "step": 13336 + }, + { + "epoch": 0.34245676200930475, + "grad_norm": 0.83203125, + "learning_rate": 0.00017923206332502313, + "loss": 0.9548, + "step": 13337 + }, + { + "epoch": 0.3424824392052266, + "grad_norm": 0.79296875, + "learning_rate": 0.0001792293396146568, + "loss": 1.1674, + "step": 13338 + }, + { + "epoch": 0.3425081164011484, + "grad_norm": 0.83984375, + "learning_rate": 0.0001792266157463935, + "loss": 0.9316, + "step": 13339 + }, + { + "epoch": 0.34253379359707026, + "grad_norm": 0.7734375, + "learning_rate": 0.00017922389172023878, + "loss": 0.8505, + "step": 13340 + }, + { + "epoch": 0.34255947079299204, + "grad_norm": 0.91015625, + "learning_rate": 0.00017922116753619797, + "loss": 0.9315, + "step": 13341 + }, + { + "epoch": 0.3425851479889139, + "grad_norm": 0.73046875, + "learning_rate": 0.0001792184431942765, + "loss": 0.9612, + "step": 13342 + }, + { + "epoch": 0.3426108251848357, + "grad_norm": 0.79296875, + "learning_rate": 0.00017921571869447986, + "loss": 1.0553, + "step": 13343 + }, + { + "epoch": 0.3426365023807575, + "grad_norm": 0.7421875, + "learning_rate": 0.0001792129940368134, + "loss": 0.8913, + "step": 13344 + }, + { + "epoch": 0.3426621795766793, + "grad_norm": 0.73046875, + "learning_rate": 0.00017921026922128268, + "loss": 0.8794, + "step": 13345 + }, + { + "epoch": 0.34268785677260116, + "grad_norm": 0.75390625, + "learning_rate": 0.000179207544247893, + "loss": 0.9167, + "step": 13346 + }, + { + "epoch": 0.34271353396852294, + "grad_norm": 0.84765625, + "learning_rate": 0.00017920481911664986, + "loss": 1.054, + "step": 13347 + }, + { + "epoch": 0.3427392111644448, + "grad_norm": 0.74609375, + "learning_rate": 0.00017920209382755865, + "loss": 0.9958, + "step": 13348 + }, + { + "epoch": 0.3427648883603666, + "grad_norm": 0.796875, + "learning_rate": 0.00017919936838062484, + "loss": 0.9847, + "step": 13349 + }, + { + "epoch": 0.34279056555628845, + "grad_norm": 0.83203125, + "learning_rate": 0.0001791966427758538, + "loss": 0.8879, + "step": 13350 + }, + { + "epoch": 0.34281624275221023, + "grad_norm": 0.86328125, + "learning_rate": 0.00017919391701325105, + "loss": 0.8982, + "step": 13351 + }, + { + "epoch": 0.34284191994813207, + "grad_norm": 0.796875, + "learning_rate": 0.00017919119109282197, + "loss": 0.9272, + "step": 13352 + }, + { + "epoch": 0.3428675971440539, + "grad_norm": 0.83203125, + "learning_rate": 0.00017918846501457198, + "loss": 0.9715, + "step": 13353 + }, + { + "epoch": 0.3428932743399757, + "grad_norm": 0.7890625, + "learning_rate": 0.00017918573877850655, + "loss": 0.8623, + "step": 13354 + }, + { + "epoch": 0.3429189515358975, + "grad_norm": 0.90625, + "learning_rate": 0.0001791830123846311, + "loss": 0.9935, + "step": 13355 + }, + { + "epoch": 0.34294462873181936, + "grad_norm": 0.83203125, + "learning_rate": 0.00017918028583295103, + "loss": 1.0934, + "step": 13356 + }, + { + "epoch": 0.34297030592774114, + "grad_norm": 0.7890625, + "learning_rate": 0.00017917755912347181, + "loss": 0.9058, + "step": 13357 + }, + { + "epoch": 0.342995983123663, + "grad_norm": 0.76953125, + "learning_rate": 0.0001791748322561989, + "loss": 0.828, + "step": 13358 + }, + { + "epoch": 0.3430216603195848, + "grad_norm": 0.75, + "learning_rate": 0.00017917210523113767, + "loss": 0.9472, + "step": 13359 + }, + { + "epoch": 0.34304733751550665, + "grad_norm": 0.81640625, + "learning_rate": 0.00017916937804829362, + "loss": 1.2961, + "step": 13360 + }, + { + "epoch": 0.3430730147114284, + "grad_norm": 0.80859375, + "learning_rate": 0.00017916665070767212, + "loss": 1.0195, + "step": 13361 + }, + { + "epoch": 0.34309869190735026, + "grad_norm": 0.7890625, + "learning_rate": 0.00017916392320927864, + "loss": 0.7688, + "step": 13362 + }, + { + "epoch": 0.3431243691032721, + "grad_norm": 0.7421875, + "learning_rate": 0.0001791611955531186, + "loss": 0.9165, + "step": 13363 + }, + { + "epoch": 0.3431500462991939, + "grad_norm": 0.78125, + "learning_rate": 0.00017915846773919748, + "loss": 0.9076, + "step": 13364 + }, + { + "epoch": 0.3431757234951157, + "grad_norm": 0.734375, + "learning_rate": 0.00017915573976752064, + "loss": 0.8736, + "step": 13365 + }, + { + "epoch": 0.34320140069103755, + "grad_norm": 0.76953125, + "learning_rate": 0.0001791530116380936, + "loss": 0.9082, + "step": 13366 + }, + { + "epoch": 0.34322707788695933, + "grad_norm": 0.70703125, + "learning_rate": 0.00017915028335092174, + "loss": 0.8946, + "step": 13367 + }, + { + "epoch": 0.34325275508288117, + "grad_norm": 0.78125, + "learning_rate": 0.00017914755490601052, + "loss": 0.9438, + "step": 13368 + }, + { + "epoch": 0.343278432278803, + "grad_norm": 0.74609375, + "learning_rate": 0.0001791448263033654, + "loss": 0.8897, + "step": 13369 + }, + { + "epoch": 0.34330410947472484, + "grad_norm": 0.75, + "learning_rate": 0.00017914209754299175, + "loss": 0.9444, + "step": 13370 + }, + { + "epoch": 0.3433297866706466, + "grad_norm": 0.8515625, + "learning_rate": 0.00017913936862489505, + "loss": 0.9917, + "step": 13371 + }, + { + "epoch": 0.34335546386656846, + "grad_norm": 0.79296875, + "learning_rate": 0.00017913663954908073, + "loss": 0.93, + "step": 13372 + }, + { + "epoch": 0.3433811410624903, + "grad_norm": 0.78125, + "learning_rate": 0.00017913391031555427, + "loss": 1.0055, + "step": 13373 + }, + { + "epoch": 0.34340681825841207, + "grad_norm": 0.8203125, + "learning_rate": 0.00017913118092432106, + "loss": 0.9582, + "step": 13374 + }, + { + "epoch": 0.3434324954543339, + "grad_norm": 0.859375, + "learning_rate": 0.00017912845137538655, + "loss": 0.8889, + "step": 13375 + }, + { + "epoch": 0.34345817265025574, + "grad_norm": 0.8125, + "learning_rate": 0.0001791257216687562, + "loss": 1.0015, + "step": 13376 + }, + { + "epoch": 0.3434838498461775, + "grad_norm": 0.796875, + "learning_rate": 0.00017912299180443538, + "loss": 0.9096, + "step": 13377 + }, + { + "epoch": 0.34350952704209936, + "grad_norm": 0.75, + "learning_rate": 0.00017912026178242964, + "loss": 0.9159, + "step": 13378 + }, + { + "epoch": 0.3435352042380212, + "grad_norm": 0.73828125, + "learning_rate": 0.00017911753160274437, + "loss": 1.0148, + "step": 13379 + }, + { + "epoch": 0.34356088143394303, + "grad_norm": 0.77734375, + "learning_rate": 0.00017911480126538498, + "loss": 0.9479, + "step": 13380 + }, + { + "epoch": 0.3435865586298648, + "grad_norm": 0.71875, + "learning_rate": 0.00017911207077035693, + "loss": 0.9679, + "step": 13381 + }, + { + "epoch": 0.34361223582578665, + "grad_norm": 0.8671875, + "learning_rate": 0.0001791093401176657, + "loss": 0.9288, + "step": 13382 + }, + { + "epoch": 0.3436379130217085, + "grad_norm": 0.7890625, + "learning_rate": 0.00017910660930731668, + "loss": 0.9694, + "step": 13383 + }, + { + "epoch": 0.34366359021763027, + "grad_norm": 0.76171875, + "learning_rate": 0.00017910387833931535, + "loss": 1.004, + "step": 13384 + }, + { + "epoch": 0.3436892674135521, + "grad_norm": 0.83203125, + "learning_rate": 0.0001791011472136671, + "loss": 0.8716, + "step": 13385 + }, + { + "epoch": 0.34371494460947394, + "grad_norm": 0.765625, + "learning_rate": 0.00017909841593037745, + "loss": 0.8979, + "step": 13386 + }, + { + "epoch": 0.3437406218053957, + "grad_norm": 0.83203125, + "learning_rate": 0.00017909568448945176, + "loss": 0.9812, + "step": 13387 + }, + { + "epoch": 0.34376629900131755, + "grad_norm": 0.7265625, + "learning_rate": 0.00017909295289089556, + "loss": 0.8899, + "step": 13388 + }, + { + "epoch": 0.3437919761972394, + "grad_norm": 0.7890625, + "learning_rate": 0.0001790902211347142, + "loss": 0.9464, + "step": 13389 + }, + { + "epoch": 0.3438176533931612, + "grad_norm": 0.796875, + "learning_rate": 0.0001790874892209132, + "loss": 0.8766, + "step": 13390 + }, + { + "epoch": 0.343843330589083, + "grad_norm": 0.828125, + "learning_rate": 0.00017908475714949803, + "loss": 0.9128, + "step": 13391 + }, + { + "epoch": 0.34386900778500484, + "grad_norm": 0.77734375, + "learning_rate": 0.000179082024920474, + "loss": 0.9775, + "step": 13392 + }, + { + "epoch": 0.3438946849809267, + "grad_norm": 1.1796875, + "learning_rate": 0.00017907929253384669, + "loss": 1.027, + "step": 13393 + }, + { + "epoch": 0.34392036217684846, + "grad_norm": 0.75390625, + "learning_rate": 0.00017907655998962145, + "loss": 0.8775, + "step": 13394 + }, + { + "epoch": 0.3439460393727703, + "grad_norm": 0.87109375, + "learning_rate": 0.00017907382728780383, + "loss": 1.042, + "step": 13395 + }, + { + "epoch": 0.34397171656869213, + "grad_norm": 0.78515625, + "learning_rate": 0.00017907109442839912, + "loss": 1.0339, + "step": 13396 + }, + { + "epoch": 0.3439973937646139, + "grad_norm": 0.80859375, + "learning_rate": 0.00017906836141141295, + "loss": 0.9112, + "step": 13397 + }, + { + "epoch": 0.34402307096053575, + "grad_norm": 0.81640625, + "learning_rate": 0.00017906562823685065, + "loss": 0.9354, + "step": 13398 + }, + { + "epoch": 0.3440487481564576, + "grad_norm": 0.80859375, + "learning_rate": 0.00017906289490471767, + "loss": 0.9236, + "step": 13399 + }, + { + "epoch": 0.3440744253523794, + "grad_norm": 0.796875, + "learning_rate": 0.00017906016141501954, + "loss": 0.8704, + "step": 13400 + }, + { + "epoch": 0.3441001025483012, + "grad_norm": 0.78125, + "learning_rate": 0.0001790574277677616, + "loss": 0.8469, + "step": 13401 + }, + { + "epoch": 0.34412577974422304, + "grad_norm": 0.80078125, + "learning_rate": 0.00017905469396294933, + "loss": 0.9255, + "step": 13402 + }, + { + "epoch": 0.3441514569401449, + "grad_norm": 0.83203125, + "learning_rate": 0.00017905196000058825, + "loss": 0.9853, + "step": 13403 + }, + { + "epoch": 0.34417713413606665, + "grad_norm": 0.81640625, + "learning_rate": 0.00017904922588068373, + "loss": 1.059, + "step": 13404 + }, + { + "epoch": 0.3442028113319885, + "grad_norm": 0.79296875, + "learning_rate": 0.00017904649160324125, + "loss": 0.9298, + "step": 13405 + }, + { + "epoch": 0.3442284885279103, + "grad_norm": 0.7265625, + "learning_rate": 0.00017904375716826625, + "loss": 0.8829, + "step": 13406 + }, + { + "epoch": 0.3442541657238321, + "grad_norm": 0.765625, + "learning_rate": 0.00017904102257576416, + "loss": 1.031, + "step": 13407 + }, + { + "epoch": 0.34427984291975394, + "grad_norm": 0.78125, + "learning_rate": 0.00017903828782574047, + "loss": 0.907, + "step": 13408 + }, + { + "epoch": 0.3443055201156758, + "grad_norm": 0.78515625, + "learning_rate": 0.00017903555291820061, + "loss": 0.9884, + "step": 13409 + }, + { + "epoch": 0.3443311973115976, + "grad_norm": 0.72265625, + "learning_rate": 0.00017903281785315006, + "loss": 0.9425, + "step": 13410 + }, + { + "epoch": 0.3443568745075194, + "grad_norm": 0.7734375, + "learning_rate": 0.00017903008263059423, + "loss": 0.9168, + "step": 13411 + }, + { + "epoch": 0.34438255170344123, + "grad_norm": 0.7421875, + "learning_rate": 0.00017902734725053855, + "loss": 0.9846, + "step": 13412 + }, + { + "epoch": 0.34440822889936307, + "grad_norm": 0.77734375, + "learning_rate": 0.00017902461171298852, + "loss": 0.9582, + "step": 13413 + }, + { + "epoch": 0.34443390609528485, + "grad_norm": 0.7890625, + "learning_rate": 0.0001790218760179496, + "loss": 1.1088, + "step": 13414 + }, + { + "epoch": 0.3444595832912067, + "grad_norm": 0.76953125, + "learning_rate": 0.0001790191401654272, + "loss": 0.9342, + "step": 13415 + }, + { + "epoch": 0.3444852604871285, + "grad_norm": 0.7734375, + "learning_rate": 0.00017901640415542679, + "loss": 0.9686, + "step": 13416 + }, + { + "epoch": 0.3445109376830503, + "grad_norm": 0.79296875, + "learning_rate": 0.00017901366798795385, + "loss": 0.9346, + "step": 13417 + }, + { + "epoch": 0.34453661487897214, + "grad_norm": 0.8359375, + "learning_rate": 0.00017901093166301378, + "loss": 1.0299, + "step": 13418 + }, + { + "epoch": 0.34456229207489397, + "grad_norm": 0.80078125, + "learning_rate": 0.00017900819518061205, + "loss": 1.0008, + "step": 13419 + }, + { + "epoch": 0.3445879692708158, + "grad_norm": 0.859375, + "learning_rate": 0.00017900545854075418, + "loss": 1.0227, + "step": 13420 + }, + { + "epoch": 0.3446136464667376, + "grad_norm": 0.8359375, + "learning_rate": 0.0001790027217434455, + "loss": 0.8132, + "step": 13421 + }, + { + "epoch": 0.3446393236626594, + "grad_norm": 0.8203125, + "learning_rate": 0.00017899998478869156, + "loss": 0.9181, + "step": 13422 + }, + { + "epoch": 0.34466500085858126, + "grad_norm": 0.7578125, + "learning_rate": 0.0001789972476764978, + "loss": 0.965, + "step": 13423 + }, + { + "epoch": 0.34469067805450304, + "grad_norm": 0.77734375, + "learning_rate": 0.00017899451040686963, + "loss": 0.9511, + "step": 13424 + }, + { + "epoch": 0.3447163552504249, + "grad_norm": 0.76171875, + "learning_rate": 0.00017899177297981254, + "loss": 1.0453, + "step": 13425 + }, + { + "epoch": 0.3447420324463467, + "grad_norm": 0.89453125, + "learning_rate": 0.00017898903539533201, + "loss": 1.0748, + "step": 13426 + }, + { + "epoch": 0.3447677096422685, + "grad_norm": 0.76171875, + "learning_rate": 0.00017898629765343346, + "loss": 0.9145, + "step": 13427 + }, + { + "epoch": 0.34479338683819033, + "grad_norm": 0.7421875, + "learning_rate": 0.00017898355975412233, + "loss": 0.8402, + "step": 13428 + }, + { + "epoch": 0.34481906403411217, + "grad_norm": 0.79296875, + "learning_rate": 0.00017898082169740413, + "loss": 1.0568, + "step": 13429 + }, + { + "epoch": 0.34484474123003395, + "grad_norm": 0.76953125, + "learning_rate": 0.0001789780834832843, + "loss": 0.9052, + "step": 13430 + }, + { + "epoch": 0.3448704184259558, + "grad_norm": 0.83984375, + "learning_rate": 0.00017897534511176824, + "loss": 0.883, + "step": 13431 + }, + { + "epoch": 0.3448960956218776, + "grad_norm": 0.7734375, + "learning_rate": 0.00017897260658286145, + "loss": 1.0074, + "step": 13432 + }, + { + "epoch": 0.34492177281779945, + "grad_norm": 0.8125, + "learning_rate": 0.0001789698678965694, + "loss": 1.0415, + "step": 13433 + }, + { + "epoch": 0.34494745001372124, + "grad_norm": 0.83984375, + "learning_rate": 0.00017896712905289756, + "loss": 1.0994, + "step": 13434 + }, + { + "epoch": 0.34497312720964307, + "grad_norm": 0.734375, + "learning_rate": 0.00017896439005185135, + "loss": 0.9286, + "step": 13435 + }, + { + "epoch": 0.3449988044055649, + "grad_norm": 0.76171875, + "learning_rate": 0.00017896165089343623, + "loss": 1.0236, + "step": 13436 + }, + { + "epoch": 0.3450244816014867, + "grad_norm": 0.80859375, + "learning_rate": 0.0001789589115776577, + "loss": 0.9356, + "step": 13437 + }, + { + "epoch": 0.3450501587974085, + "grad_norm": 0.80859375, + "learning_rate": 0.00017895617210452117, + "loss": 0.9738, + "step": 13438 + }, + { + "epoch": 0.34507583599333036, + "grad_norm": 0.77734375, + "learning_rate": 0.00017895343247403212, + "loss": 0.9499, + "step": 13439 + }, + { + "epoch": 0.34510151318925214, + "grad_norm": 0.7109375, + "learning_rate": 0.00017895069268619604, + "loss": 0.8102, + "step": 13440 + }, + { + "epoch": 0.345127190385174, + "grad_norm": 0.83984375, + "learning_rate": 0.00017894795274101831, + "loss": 0.836, + "step": 13441 + }, + { + "epoch": 0.3451528675810958, + "grad_norm": 0.7109375, + "learning_rate": 0.00017894521263850448, + "loss": 0.9092, + "step": 13442 + }, + { + "epoch": 0.34517854477701765, + "grad_norm": 0.77734375, + "learning_rate": 0.00017894247237865995, + "loss": 1.0651, + "step": 13443 + }, + { + "epoch": 0.34520422197293943, + "grad_norm": 0.88671875, + "learning_rate": 0.00017893973196149023, + "loss": 0.981, + "step": 13444 + }, + { + "epoch": 0.34522989916886127, + "grad_norm": 0.7890625, + "learning_rate": 0.00017893699138700074, + "loss": 1.0069, + "step": 13445 + }, + { + "epoch": 0.3452555763647831, + "grad_norm": 0.828125, + "learning_rate": 0.00017893425065519696, + "loss": 0.9374, + "step": 13446 + }, + { + "epoch": 0.3452812535607049, + "grad_norm": 0.734375, + "learning_rate": 0.00017893150976608434, + "loss": 0.9614, + "step": 13447 + }, + { + "epoch": 0.3453069307566267, + "grad_norm": 0.7890625, + "learning_rate": 0.00017892876871966834, + "loss": 0.8965, + "step": 13448 + }, + { + "epoch": 0.34533260795254855, + "grad_norm": 0.73046875, + "learning_rate": 0.00017892602751595445, + "loss": 0.9184, + "step": 13449 + }, + { + "epoch": 0.34535828514847033, + "grad_norm": 0.8046875, + "learning_rate": 0.00017892328615494812, + "loss": 0.8813, + "step": 13450 + }, + { + "epoch": 0.34538396234439217, + "grad_norm": 0.765625, + "learning_rate": 0.00017892054463665481, + "loss": 1.0506, + "step": 13451 + }, + { + "epoch": 0.345409639540314, + "grad_norm": 0.8125, + "learning_rate": 0.00017891780296107999, + "loss": 0.9578, + "step": 13452 + }, + { + "epoch": 0.34543531673623584, + "grad_norm": 0.75, + "learning_rate": 0.0001789150611282291, + "loss": 0.8148, + "step": 13453 + }, + { + "epoch": 0.3454609939321576, + "grad_norm": 0.80078125, + "learning_rate": 0.00017891231913810763, + "loss": 0.9281, + "step": 13454 + }, + { + "epoch": 0.34548667112807946, + "grad_norm": 0.7890625, + "learning_rate": 0.00017890957699072103, + "loss": 1.003, + "step": 13455 + }, + { + "epoch": 0.3455123483240013, + "grad_norm": 0.8359375, + "learning_rate": 0.00017890683468607479, + "loss": 1.0482, + "step": 13456 + }, + { + "epoch": 0.3455380255199231, + "grad_norm": 0.734375, + "learning_rate": 0.00017890409222417434, + "loss": 0.9365, + "step": 13457 + }, + { + "epoch": 0.3455637027158449, + "grad_norm": 0.796875, + "learning_rate": 0.00017890134960502516, + "loss": 0.93, + "step": 13458 + }, + { + "epoch": 0.34558937991176675, + "grad_norm": 0.71875, + "learning_rate": 0.00017889860682863273, + "loss": 0.9266, + "step": 13459 + }, + { + "epoch": 0.34561505710768853, + "grad_norm": 0.87890625, + "learning_rate": 0.0001788958638950025, + "loss": 0.8644, + "step": 13460 + }, + { + "epoch": 0.34564073430361036, + "grad_norm": 0.83203125, + "learning_rate": 0.00017889312080413992, + "loss": 1.0244, + "step": 13461 + }, + { + "epoch": 0.3456664114995322, + "grad_norm": 0.7734375, + "learning_rate": 0.0001788903775560505, + "loss": 0.9267, + "step": 13462 + }, + { + "epoch": 0.34569208869545404, + "grad_norm": 0.8359375, + "learning_rate": 0.00017888763415073968, + "loss": 1.0224, + "step": 13463 + }, + { + "epoch": 0.3457177658913758, + "grad_norm": 0.75, + "learning_rate": 0.0001788848905882129, + "loss": 1.0236, + "step": 13464 + }, + { + "epoch": 0.34574344308729765, + "grad_norm": 0.80859375, + "learning_rate": 0.0001788821468684757, + "loss": 1.024, + "step": 13465 + }, + { + "epoch": 0.3457691202832195, + "grad_norm": 0.734375, + "learning_rate": 0.0001788794029915335, + "loss": 0.9796, + "step": 13466 + }, + { + "epoch": 0.34579479747914127, + "grad_norm": 0.74609375, + "learning_rate": 0.00017887665895739176, + "loss": 1.0301, + "step": 13467 + }, + { + "epoch": 0.3458204746750631, + "grad_norm": 0.88671875, + "learning_rate": 0.00017887391476605595, + "loss": 1.0329, + "step": 13468 + }, + { + "epoch": 0.34584615187098494, + "grad_norm": 0.70703125, + "learning_rate": 0.00017887117041753156, + "loss": 1.0913, + "step": 13469 + }, + { + "epoch": 0.3458718290669067, + "grad_norm": 0.80078125, + "learning_rate": 0.0001788684259118241, + "loss": 0.8998, + "step": 13470 + }, + { + "epoch": 0.34589750626282856, + "grad_norm": 0.75, + "learning_rate": 0.00017886568124893894, + "loss": 0.951, + "step": 13471 + }, + { + "epoch": 0.3459231834587504, + "grad_norm": 1.421875, + "learning_rate": 0.00017886293642888163, + "loss": 0.9681, + "step": 13472 + }, + { + "epoch": 0.34594886065467223, + "grad_norm": 0.734375, + "learning_rate": 0.00017886019145165757, + "loss": 0.8498, + "step": 13473 + }, + { + "epoch": 0.345974537850594, + "grad_norm": 0.80859375, + "learning_rate": 0.0001788574463172723, + "loss": 1.0282, + "step": 13474 + }, + { + "epoch": 0.34600021504651585, + "grad_norm": 0.75, + "learning_rate": 0.00017885470102573127, + "loss": 0.8834, + "step": 13475 + }, + { + "epoch": 0.3460258922424377, + "grad_norm": 0.72265625, + "learning_rate": 0.00017885195557703997, + "loss": 0.9197, + "step": 13476 + }, + { + "epoch": 0.34605156943835946, + "grad_norm": 0.7890625, + "learning_rate": 0.00017884920997120378, + "loss": 0.8915, + "step": 13477 + }, + { + "epoch": 0.3460772466342813, + "grad_norm": 0.75390625, + "learning_rate": 0.00017884646420822828, + "loss": 0.891, + "step": 13478 + }, + { + "epoch": 0.34610292383020314, + "grad_norm": 0.796875, + "learning_rate": 0.00017884371828811888, + "loss": 0.955, + "step": 13479 + }, + { + "epoch": 0.3461286010261249, + "grad_norm": 0.828125, + "learning_rate": 0.00017884097221088107, + "loss": 1.0187, + "step": 13480 + }, + { + "epoch": 0.34615427822204675, + "grad_norm": 0.78125, + "learning_rate": 0.00017883822597652033, + "loss": 0.9277, + "step": 13481 + }, + { + "epoch": 0.3461799554179686, + "grad_norm": 0.82421875, + "learning_rate": 0.0001788354795850421, + "loss": 0.9809, + "step": 13482 + }, + { + "epoch": 0.3462056326138904, + "grad_norm": 0.78125, + "learning_rate": 0.00017883273303645196, + "loss": 0.9878, + "step": 13483 + }, + { + "epoch": 0.3462313098098122, + "grad_norm": 0.9296875, + "learning_rate": 0.00017882998633075525, + "loss": 0.968, + "step": 13484 + }, + { + "epoch": 0.34625698700573404, + "grad_norm": 0.73046875, + "learning_rate": 0.0001788272394679575, + "loss": 0.8905, + "step": 13485 + }, + { + "epoch": 0.3462826642016559, + "grad_norm": 0.7734375, + "learning_rate": 0.00017882449244806416, + "loss": 0.9705, + "step": 13486 + }, + { + "epoch": 0.34630834139757766, + "grad_norm": 0.84375, + "learning_rate": 0.00017882174527108076, + "loss": 0.8578, + "step": 13487 + }, + { + "epoch": 0.3463340185934995, + "grad_norm": 0.87890625, + "learning_rate": 0.00017881899793701272, + "loss": 0.9918, + "step": 13488 + }, + { + "epoch": 0.34635969578942133, + "grad_norm": 0.83984375, + "learning_rate": 0.00017881625044586556, + "loss": 0.9505, + "step": 13489 + }, + { + "epoch": 0.3463853729853431, + "grad_norm": 0.7578125, + "learning_rate": 0.00017881350279764472, + "loss": 1.0871, + "step": 13490 + }, + { + "epoch": 0.34641105018126495, + "grad_norm": 0.75, + "learning_rate": 0.0001788107549923557, + "loss": 0.9481, + "step": 13491 + }, + { + "epoch": 0.3464367273771868, + "grad_norm": 0.76171875, + "learning_rate": 0.00017880800703000397, + "loss": 0.924, + "step": 13492 + }, + { + "epoch": 0.3464624045731086, + "grad_norm": 0.7734375, + "learning_rate": 0.00017880525891059496, + "loss": 1.0177, + "step": 13493 + }, + { + "epoch": 0.3464880817690304, + "grad_norm": 0.7734375, + "learning_rate": 0.00017880251063413422, + "loss": 1.0025, + "step": 13494 + }, + { + "epoch": 0.34651375896495223, + "grad_norm": 0.69140625, + "learning_rate": 0.00017879976220062718, + "loss": 0.9239, + "step": 13495 + }, + { + "epoch": 0.34653943616087407, + "grad_norm": 0.796875, + "learning_rate": 0.00017879701361007933, + "loss": 0.9655, + "step": 13496 + }, + { + "epoch": 0.34656511335679585, + "grad_norm": 0.7734375, + "learning_rate": 0.00017879426486249616, + "loss": 0.9491, + "step": 13497 + }, + { + "epoch": 0.3465907905527177, + "grad_norm": 0.80078125, + "learning_rate": 0.00017879151595788314, + "loss": 0.9931, + "step": 13498 + }, + { + "epoch": 0.3466164677486395, + "grad_norm": 0.77734375, + "learning_rate": 0.00017878876689624576, + "loss": 1.0549, + "step": 13499 + }, + { + "epoch": 0.3466421449445613, + "grad_norm": 0.84375, + "learning_rate": 0.00017878601767758946, + "loss": 1.0178, + "step": 13500 + }, + { + "epoch": 0.34666782214048314, + "grad_norm": 0.90625, + "learning_rate": 0.00017878326830191973, + "loss": 0.9427, + "step": 13501 + }, + { + "epoch": 0.346693499336405, + "grad_norm": 0.79296875, + "learning_rate": 0.0001787805187692421, + "loss": 0.8724, + "step": 13502 + }, + { + "epoch": 0.3467191765323268, + "grad_norm": 1.0078125, + "learning_rate": 0.000178777769079562, + "loss": 0.994, + "step": 13503 + }, + { + "epoch": 0.3467448537282486, + "grad_norm": 0.80078125, + "learning_rate": 0.0001787750192328849, + "loss": 0.9916, + "step": 13504 + }, + { + "epoch": 0.34677053092417043, + "grad_norm": 0.703125, + "learning_rate": 0.00017877226922921632, + "loss": 0.8995, + "step": 13505 + }, + { + "epoch": 0.34679620812009226, + "grad_norm": 0.7265625, + "learning_rate": 0.00017876951906856172, + "loss": 0.7836, + "step": 13506 + }, + { + "epoch": 0.34682188531601404, + "grad_norm": 0.7578125, + "learning_rate": 0.0001787667687509266, + "loss": 0.9563, + "step": 13507 + }, + { + "epoch": 0.3468475625119359, + "grad_norm": 0.70703125, + "learning_rate": 0.0001787640182763164, + "loss": 0.8211, + "step": 13508 + }, + { + "epoch": 0.3468732397078577, + "grad_norm": 0.73828125, + "learning_rate": 0.00017876126764473663, + "loss": 0.8458, + "step": 13509 + }, + { + "epoch": 0.3468989169037795, + "grad_norm": 0.87109375, + "learning_rate": 0.00017875851685619277, + "loss": 1.1071, + "step": 13510 + }, + { + "epoch": 0.34692459409970133, + "grad_norm": 0.8125, + "learning_rate": 0.0001787557659106903, + "loss": 1.086, + "step": 13511 + }, + { + "epoch": 0.34695027129562317, + "grad_norm": 0.8125, + "learning_rate": 0.00017875301480823472, + "loss": 0.884, + "step": 13512 + }, + { + "epoch": 0.346975948491545, + "grad_norm": 0.78125, + "learning_rate": 0.00017875026354883148, + "loss": 0.8751, + "step": 13513 + }, + { + "epoch": 0.3470016256874668, + "grad_norm": 0.7890625, + "learning_rate": 0.0001787475121324861, + "loss": 0.9397, + "step": 13514 + }, + { + "epoch": 0.3470273028833886, + "grad_norm": 0.75390625, + "learning_rate": 0.000178744760559204, + "loss": 0.8703, + "step": 13515 + }, + { + "epoch": 0.34705298007931046, + "grad_norm": 0.76953125, + "learning_rate": 0.00017874200882899075, + "loss": 1.0686, + "step": 13516 + }, + { + "epoch": 0.34707865727523224, + "grad_norm": 0.84765625, + "learning_rate": 0.00017873925694185174, + "loss": 1.0141, + "step": 13517 + }, + { + "epoch": 0.3471043344711541, + "grad_norm": 0.84765625, + "learning_rate": 0.00017873650489779252, + "loss": 0.8758, + "step": 13518 + }, + { + "epoch": 0.3471300116670759, + "grad_norm": 0.7578125, + "learning_rate": 0.0001787337526968186, + "loss": 0.8759, + "step": 13519 + }, + { + "epoch": 0.3471556888629977, + "grad_norm": 0.7578125, + "learning_rate": 0.0001787310003389354, + "loss": 0.9346, + "step": 13520 + }, + { + "epoch": 0.3471813660589195, + "grad_norm": 0.7734375, + "learning_rate": 0.0001787282478241484, + "loss": 0.8897, + "step": 13521 + }, + { + "epoch": 0.34720704325484136, + "grad_norm": 0.76953125, + "learning_rate": 0.00017872549515246313, + "loss": 0.9496, + "step": 13522 + }, + { + "epoch": 0.3472327204507632, + "grad_norm": 0.9375, + "learning_rate": 0.00017872274232388506, + "loss": 0.8508, + "step": 13523 + }, + { + "epoch": 0.347258397646685, + "grad_norm": 0.8515625, + "learning_rate": 0.00017871998933841968, + "loss": 1.0557, + "step": 13524 + }, + { + "epoch": 0.3472840748426068, + "grad_norm": 0.796875, + "learning_rate": 0.00017871723619607247, + "loss": 0.9951, + "step": 13525 + }, + { + "epoch": 0.34730975203852865, + "grad_norm": 0.71875, + "learning_rate": 0.0001787144828968489, + "loss": 0.7841, + "step": 13526 + }, + { + "epoch": 0.34733542923445043, + "grad_norm": 0.8515625, + "learning_rate": 0.0001787117294407545, + "loss": 1.1217, + "step": 13527 + }, + { + "epoch": 0.34736110643037227, + "grad_norm": 0.87109375, + "learning_rate": 0.00017870897582779476, + "loss": 0.9658, + "step": 13528 + }, + { + "epoch": 0.3473867836262941, + "grad_norm": 0.82421875, + "learning_rate": 0.00017870622205797508, + "loss": 1.0812, + "step": 13529 + }, + { + "epoch": 0.3474124608222159, + "grad_norm": 0.7890625, + "learning_rate": 0.00017870346813130103, + "loss": 0.9009, + "step": 13530 + }, + { + "epoch": 0.3474381380181377, + "grad_norm": 0.80859375, + "learning_rate": 0.0001787007140477781, + "loss": 1.0017, + "step": 13531 + }, + { + "epoch": 0.34746381521405956, + "grad_norm": 0.75390625, + "learning_rate": 0.00017869795980741176, + "loss": 0.8761, + "step": 13532 + }, + { + "epoch": 0.3474894924099814, + "grad_norm": 0.76171875, + "learning_rate": 0.00017869520541020746, + "loss": 0.8894, + "step": 13533 + }, + { + "epoch": 0.3475151696059032, + "grad_norm": 0.76953125, + "learning_rate": 0.00017869245085617075, + "loss": 0.8688, + "step": 13534 + }, + { + "epoch": 0.347540846801825, + "grad_norm": 0.83203125, + "learning_rate": 0.00017868969614530706, + "loss": 0.8637, + "step": 13535 + }, + { + "epoch": 0.34756652399774685, + "grad_norm": 0.8046875, + "learning_rate": 0.00017868694127762193, + "loss": 1.0301, + "step": 13536 + }, + { + "epoch": 0.3475922011936686, + "grad_norm": 0.765625, + "learning_rate": 0.00017868418625312086, + "loss": 0.9774, + "step": 13537 + }, + { + "epoch": 0.34761787838959046, + "grad_norm": 0.73828125, + "learning_rate": 0.00017868143107180928, + "loss": 0.773, + "step": 13538 + }, + { + "epoch": 0.3476435555855123, + "grad_norm": 0.71875, + "learning_rate": 0.00017867867573369273, + "loss": 0.8848, + "step": 13539 + }, + { + "epoch": 0.3476692327814341, + "grad_norm": 0.75390625, + "learning_rate": 0.0001786759202387767, + "loss": 1.0174, + "step": 13540 + }, + { + "epoch": 0.3476949099773559, + "grad_norm": 0.8125, + "learning_rate": 0.00017867316458706665, + "loss": 0.8881, + "step": 13541 + }, + { + "epoch": 0.34772058717327775, + "grad_norm": 0.7421875, + "learning_rate": 0.0001786704087785681, + "loss": 0.8998, + "step": 13542 + }, + { + "epoch": 0.3477462643691996, + "grad_norm": 0.80859375, + "learning_rate": 0.0001786676528132865, + "loss": 1.0459, + "step": 13543 + }, + { + "epoch": 0.34777194156512137, + "grad_norm": 0.8203125, + "learning_rate": 0.0001786648966912274, + "loss": 0.9002, + "step": 13544 + }, + { + "epoch": 0.3477976187610432, + "grad_norm": 0.73828125, + "learning_rate": 0.0001786621404123963, + "loss": 0.883, + "step": 13545 + }, + { + "epoch": 0.34782329595696504, + "grad_norm": 0.77734375, + "learning_rate": 0.0001786593839767986, + "loss": 0.9148, + "step": 13546 + }, + { + "epoch": 0.3478489731528868, + "grad_norm": 0.7109375, + "learning_rate": 0.00017865662738443988, + "loss": 0.9204, + "step": 13547 + }, + { + "epoch": 0.34787465034880866, + "grad_norm": 0.7421875, + "learning_rate": 0.0001786538706353256, + "loss": 0.8534, + "step": 13548 + }, + { + "epoch": 0.3479003275447305, + "grad_norm": 0.703125, + "learning_rate": 0.00017865111372946125, + "loss": 0.8892, + "step": 13549 + }, + { + "epoch": 0.3479260047406523, + "grad_norm": 0.71875, + "learning_rate": 0.00017864835666685239, + "loss": 0.9092, + "step": 13550 + }, + { + "epoch": 0.3479516819365741, + "grad_norm": 0.796875, + "learning_rate": 0.0001786455994475044, + "loss": 0.9577, + "step": 13551 + }, + { + "epoch": 0.34797735913249594, + "grad_norm": 0.83203125, + "learning_rate": 0.00017864284207142286, + "loss": 0.9671, + "step": 13552 + }, + { + "epoch": 0.3480030363284178, + "grad_norm": 0.765625, + "learning_rate": 0.00017864008453861324, + "loss": 0.8302, + "step": 13553 + }, + { + "epoch": 0.34802871352433956, + "grad_norm": 0.69140625, + "learning_rate": 0.00017863732684908102, + "loss": 0.876, + "step": 13554 + }, + { + "epoch": 0.3480543907202614, + "grad_norm": 0.78125, + "learning_rate": 0.00017863456900283172, + "loss": 0.8251, + "step": 13555 + }, + { + "epoch": 0.34808006791618323, + "grad_norm": 0.81640625, + "learning_rate": 0.0001786318109998708, + "loss": 0.9878, + "step": 13556 + }, + { + "epoch": 0.348105745112105, + "grad_norm": 0.80859375, + "learning_rate": 0.00017862905284020384, + "loss": 0.8172, + "step": 13557 + }, + { + "epoch": 0.34813142230802685, + "grad_norm": 0.80859375, + "learning_rate": 0.00017862629452383626, + "loss": 0.9319, + "step": 13558 + }, + { + "epoch": 0.3481570995039487, + "grad_norm": 0.75390625, + "learning_rate": 0.00017862353605077355, + "loss": 0.9017, + "step": 13559 + }, + { + "epoch": 0.34818277669987047, + "grad_norm": 0.78125, + "learning_rate": 0.00017862077742102127, + "loss": 0.9873, + "step": 13560 + }, + { + "epoch": 0.3482084538957923, + "grad_norm": 0.765625, + "learning_rate": 0.00017861801863458486, + "loss": 1.0032, + "step": 13561 + }, + { + "epoch": 0.34823413109171414, + "grad_norm": 0.73828125, + "learning_rate": 0.00017861525969146985, + "loss": 0.8686, + "step": 13562 + }, + { + "epoch": 0.348259808287636, + "grad_norm": 0.73828125, + "learning_rate": 0.00017861250059168172, + "loss": 0.9015, + "step": 13563 + }, + { + "epoch": 0.34828548548355776, + "grad_norm": 0.75, + "learning_rate": 0.00017860974133522596, + "loss": 0.8778, + "step": 13564 + }, + { + "epoch": 0.3483111626794796, + "grad_norm": 0.73046875, + "learning_rate": 0.00017860698192210812, + "loss": 1.0635, + "step": 13565 + }, + { + "epoch": 0.3483368398754014, + "grad_norm": 0.7890625, + "learning_rate": 0.00017860422235233366, + "loss": 1.0633, + "step": 13566 + }, + { + "epoch": 0.3483625170713232, + "grad_norm": 0.74609375, + "learning_rate": 0.00017860146262590806, + "loss": 0.9275, + "step": 13567 + }, + { + "epoch": 0.34838819426724504, + "grad_norm": 0.8359375, + "learning_rate": 0.00017859870274283685, + "loss": 1.0034, + "step": 13568 + }, + { + "epoch": 0.3484138714631669, + "grad_norm": 0.875, + "learning_rate": 0.00017859594270312556, + "loss": 1.0497, + "step": 13569 + }, + { + "epoch": 0.34843954865908866, + "grad_norm": 0.8046875, + "learning_rate": 0.00017859318250677963, + "loss": 0.9246, + "step": 13570 + }, + { + "epoch": 0.3484652258550105, + "grad_norm": 0.73828125, + "learning_rate": 0.00017859042215380457, + "loss": 0.7934, + "step": 13571 + }, + { + "epoch": 0.34849090305093233, + "grad_norm": 0.734375, + "learning_rate": 0.00017858766164420594, + "loss": 0.8872, + "step": 13572 + }, + { + "epoch": 0.34851658024685417, + "grad_norm": 0.83984375, + "learning_rate": 0.00017858490097798918, + "loss": 0.951, + "step": 13573 + }, + { + "epoch": 0.34854225744277595, + "grad_norm": 0.78125, + "learning_rate": 0.00017858214015515982, + "loss": 0.9009, + "step": 13574 + }, + { + "epoch": 0.3485679346386978, + "grad_norm": 0.83984375, + "learning_rate": 0.00017857937917572332, + "loss": 1.1248, + "step": 13575 + }, + { + "epoch": 0.3485936118346196, + "grad_norm": 0.77734375, + "learning_rate": 0.00017857661803968525, + "loss": 0.9738, + "step": 13576 + }, + { + "epoch": 0.3486192890305414, + "grad_norm": 0.78515625, + "learning_rate": 0.00017857385674705106, + "loss": 0.9211, + "step": 13577 + }, + { + "epoch": 0.34864496622646324, + "grad_norm": 0.8046875, + "learning_rate": 0.00017857109529782628, + "loss": 0.7846, + "step": 13578 + }, + { + "epoch": 0.3486706434223851, + "grad_norm": 0.765625, + "learning_rate": 0.0001785683336920164, + "loss": 0.9088, + "step": 13579 + }, + { + "epoch": 0.34869632061830685, + "grad_norm": 0.73828125, + "learning_rate": 0.00017856557192962692, + "loss": 0.8554, + "step": 13580 + }, + { + "epoch": 0.3487219978142287, + "grad_norm": 1.2109375, + "learning_rate": 0.00017856281001066336, + "loss": 0.9771, + "step": 13581 + }, + { + "epoch": 0.3487476750101505, + "grad_norm": 0.79296875, + "learning_rate": 0.0001785600479351312, + "loss": 0.9429, + "step": 13582 + }, + { + "epoch": 0.34877335220607236, + "grad_norm": 0.87890625, + "learning_rate": 0.00017855728570303597, + "loss": 0.9268, + "step": 13583 + }, + { + "epoch": 0.34879902940199414, + "grad_norm": 0.70703125, + "learning_rate": 0.00017855452331438319, + "loss": 0.9504, + "step": 13584 + }, + { + "epoch": 0.348824706597916, + "grad_norm": 1.7890625, + "learning_rate": 0.00017855176076917833, + "loss": 0.9601, + "step": 13585 + }, + { + "epoch": 0.3488503837938378, + "grad_norm": 0.796875, + "learning_rate": 0.0001785489980674269, + "loss": 0.9272, + "step": 13586 + }, + { + "epoch": 0.3488760609897596, + "grad_norm": 0.8046875, + "learning_rate": 0.00017854623520913442, + "loss": 1.0426, + "step": 13587 + }, + { + "epoch": 0.34890173818568143, + "grad_norm": 0.76171875, + "learning_rate": 0.00017854347219430636, + "loss": 0.8764, + "step": 13588 + }, + { + "epoch": 0.34892741538160327, + "grad_norm": 0.71484375, + "learning_rate": 0.00017854070902294827, + "loss": 0.8282, + "step": 13589 + }, + { + "epoch": 0.34895309257752505, + "grad_norm": 0.76171875, + "learning_rate": 0.00017853794569506568, + "loss": 1.0456, + "step": 13590 + }, + { + "epoch": 0.3489787697734469, + "grad_norm": 0.81640625, + "learning_rate": 0.00017853518221066402, + "loss": 0.9153, + "step": 13591 + }, + { + "epoch": 0.3490044469693687, + "grad_norm": 0.703125, + "learning_rate": 0.00017853241856974884, + "loss": 0.8428, + "step": 13592 + }, + { + "epoch": 0.34903012416529056, + "grad_norm": 0.78125, + "learning_rate": 0.00017852965477232563, + "loss": 0.9479, + "step": 13593 + }, + { + "epoch": 0.34905580136121234, + "grad_norm": 0.796875, + "learning_rate": 0.00017852689081839993, + "loss": 1.0959, + "step": 13594 + }, + { + "epoch": 0.3490814785571342, + "grad_norm": 0.7734375, + "learning_rate": 0.00017852412670797723, + "loss": 0.9783, + "step": 13595 + }, + { + "epoch": 0.349107155753056, + "grad_norm": 0.71484375, + "learning_rate": 0.000178521362441063, + "loss": 0.8313, + "step": 13596 + }, + { + "epoch": 0.3491328329489778, + "grad_norm": 0.80078125, + "learning_rate": 0.00017851859801766286, + "loss": 0.9023, + "step": 13597 + }, + { + "epoch": 0.3491585101448996, + "grad_norm": 0.74609375, + "learning_rate": 0.0001785158334377822, + "loss": 0.9255, + "step": 13598 + }, + { + "epoch": 0.34918418734082146, + "grad_norm": 0.81640625, + "learning_rate": 0.0001785130687014266, + "loss": 1.0325, + "step": 13599 + }, + { + "epoch": 0.34920986453674324, + "grad_norm": 0.73828125, + "learning_rate": 0.0001785103038086015, + "loss": 0.9118, + "step": 13600 + }, + { + "epoch": 0.3492355417326651, + "grad_norm": 0.86328125, + "learning_rate": 0.00017850753875931253, + "loss": 1.1273, + "step": 13601 + }, + { + "epoch": 0.3492612189285869, + "grad_norm": 0.75390625, + "learning_rate": 0.00017850477355356507, + "loss": 0.9739, + "step": 13602 + }, + { + "epoch": 0.34928689612450875, + "grad_norm": 0.8515625, + "learning_rate": 0.0001785020081913647, + "loss": 0.9585, + "step": 13603 + }, + { + "epoch": 0.34931257332043053, + "grad_norm": 0.85546875, + "learning_rate": 0.00017849924267271692, + "loss": 1.1458, + "step": 13604 + }, + { + "epoch": 0.34933825051635237, + "grad_norm": 0.79296875, + "learning_rate": 0.00017849647699762724, + "loss": 0.9106, + "step": 13605 + }, + { + "epoch": 0.3493639277122742, + "grad_norm": 0.79296875, + "learning_rate": 0.00017849371116610116, + "loss": 1.0387, + "step": 13606 + }, + { + "epoch": 0.349389604908196, + "grad_norm": 0.80859375, + "learning_rate": 0.00017849094517814423, + "loss": 0.9026, + "step": 13607 + }, + { + "epoch": 0.3494152821041178, + "grad_norm": 0.84765625, + "learning_rate": 0.0001784881790337619, + "loss": 0.9777, + "step": 13608 + }, + { + "epoch": 0.34944095930003966, + "grad_norm": 0.83203125, + "learning_rate": 0.00017848541273295974, + "loss": 0.8993, + "step": 13609 + }, + { + "epoch": 0.34946663649596144, + "grad_norm": 0.8125, + "learning_rate": 0.00017848264627574328, + "loss": 0.9903, + "step": 13610 + }, + { + "epoch": 0.34949231369188327, + "grad_norm": 0.83984375, + "learning_rate": 0.0001784798796621179, + "loss": 1.0058, + "step": 13611 + }, + { + "epoch": 0.3495179908878051, + "grad_norm": 1.1640625, + "learning_rate": 0.0001784771128920893, + "loss": 0.9045, + "step": 13612 + }, + { + "epoch": 0.34954366808372694, + "grad_norm": 0.76953125, + "learning_rate": 0.00017847434596566286, + "loss": 0.8363, + "step": 13613 + }, + { + "epoch": 0.3495693452796487, + "grad_norm": 0.76171875, + "learning_rate": 0.00017847157888284417, + "loss": 0.9183, + "step": 13614 + }, + { + "epoch": 0.34959502247557056, + "grad_norm": 0.8828125, + "learning_rate": 0.00017846881164363868, + "loss": 0.8645, + "step": 13615 + }, + { + "epoch": 0.3496206996714924, + "grad_norm": 0.76171875, + "learning_rate": 0.00017846604424805195, + "loss": 0.9718, + "step": 13616 + }, + { + "epoch": 0.3496463768674142, + "grad_norm": 0.75, + "learning_rate": 0.00017846327669608948, + "loss": 0.9146, + "step": 13617 + }, + { + "epoch": 0.349672054063336, + "grad_norm": 0.8828125, + "learning_rate": 0.00017846050898775678, + "loss": 1.0469, + "step": 13618 + }, + { + "epoch": 0.34969773125925785, + "grad_norm": 0.703125, + "learning_rate": 0.00017845774112305938, + "loss": 1.0776, + "step": 13619 + }, + { + "epoch": 0.34972340845517963, + "grad_norm": 0.78515625, + "learning_rate": 0.0001784549731020028, + "loss": 0.9585, + "step": 13620 + }, + { + "epoch": 0.34974908565110147, + "grad_norm": 0.80078125, + "learning_rate": 0.00017845220492459252, + "loss": 0.9077, + "step": 13621 + }, + { + "epoch": 0.3497747628470233, + "grad_norm": 0.94921875, + "learning_rate": 0.00017844943659083408, + "loss": 1.0945, + "step": 13622 + }, + { + "epoch": 0.34980044004294514, + "grad_norm": 0.76953125, + "learning_rate": 0.00017844666810073304, + "loss": 0.9258, + "step": 13623 + }, + { + "epoch": 0.3498261172388669, + "grad_norm": 0.796875, + "learning_rate": 0.00017844389945429482, + "loss": 0.9639, + "step": 13624 + }, + { + "epoch": 0.34985179443478875, + "grad_norm": 0.86328125, + "learning_rate": 0.00017844113065152505, + "loss": 0.8642, + "step": 13625 + }, + { + "epoch": 0.3498774716307106, + "grad_norm": 0.7890625, + "learning_rate": 0.00017843836169242913, + "loss": 0.9101, + "step": 13626 + }, + { + "epoch": 0.34990314882663237, + "grad_norm": 0.71875, + "learning_rate": 0.0001784355925770127, + "loss": 0.838, + "step": 13627 + }, + { + "epoch": 0.3499288260225542, + "grad_norm": 0.734375, + "learning_rate": 0.0001784328233052812, + "loss": 0.8302, + "step": 13628 + }, + { + "epoch": 0.34995450321847604, + "grad_norm": 0.77734375, + "learning_rate": 0.00017843005387724014, + "loss": 0.9066, + "step": 13629 + }, + { + "epoch": 0.3499801804143978, + "grad_norm": 0.76953125, + "learning_rate": 0.0001784272842928951, + "loss": 0.8534, + "step": 13630 + }, + { + "epoch": 0.35000585761031966, + "grad_norm": 0.73046875, + "learning_rate": 0.00017842451455225158, + "loss": 0.9178, + "step": 13631 + }, + { + "epoch": 0.3500315348062415, + "grad_norm": 0.734375, + "learning_rate": 0.00017842174465531504, + "loss": 0.8506, + "step": 13632 + }, + { + "epoch": 0.3500572120021633, + "grad_norm": 0.8125, + "learning_rate": 0.0001784189746020911, + "loss": 0.924, + "step": 13633 + }, + { + "epoch": 0.3500828891980851, + "grad_norm": 0.78515625, + "learning_rate": 0.00017841620439258517, + "loss": 0.8488, + "step": 13634 + }, + { + "epoch": 0.35010856639400695, + "grad_norm": 0.74609375, + "learning_rate": 0.00017841343402680285, + "loss": 0.9962, + "step": 13635 + }, + { + "epoch": 0.3501342435899288, + "grad_norm": 0.7890625, + "learning_rate": 0.00017841066350474965, + "loss": 0.9562, + "step": 13636 + }, + { + "epoch": 0.35015992078585056, + "grad_norm": 0.7890625, + "learning_rate": 0.0001784078928264311, + "loss": 0.9469, + "step": 13637 + }, + { + "epoch": 0.3501855979817724, + "grad_norm": 0.7578125, + "learning_rate": 0.00017840512199185265, + "loss": 0.7553, + "step": 13638 + }, + { + "epoch": 0.35021127517769424, + "grad_norm": 0.76171875, + "learning_rate": 0.0001784023510010199, + "loss": 0.9415, + "step": 13639 + }, + { + "epoch": 0.350236952373616, + "grad_norm": 0.7578125, + "learning_rate": 0.00017839957985393837, + "loss": 0.9639, + "step": 13640 + }, + { + "epoch": 0.35026262956953785, + "grad_norm": 0.8671875, + "learning_rate": 0.00017839680855061352, + "loss": 0.9091, + "step": 13641 + }, + { + "epoch": 0.3502883067654597, + "grad_norm": 0.82421875, + "learning_rate": 0.00017839403709105095, + "loss": 0.9786, + "step": 13642 + }, + { + "epoch": 0.35031398396138147, + "grad_norm": 0.76953125, + "learning_rate": 0.0001783912654752561, + "loss": 0.9418, + "step": 13643 + }, + { + "epoch": 0.3503396611573033, + "grad_norm": 0.76171875, + "learning_rate": 0.00017838849370323457, + "loss": 1.1165, + "step": 13644 + }, + { + "epoch": 0.35036533835322514, + "grad_norm": 0.796875, + "learning_rate": 0.00017838572177499183, + "loss": 1.0498, + "step": 13645 + }, + { + "epoch": 0.350391015549147, + "grad_norm": 0.796875, + "learning_rate": 0.00017838294969053348, + "loss": 1.0133, + "step": 13646 + }, + { + "epoch": 0.35041669274506876, + "grad_norm": 0.76953125, + "learning_rate": 0.00017838017744986492, + "loss": 0.881, + "step": 13647 + }, + { + "epoch": 0.3504423699409906, + "grad_norm": 0.94140625, + "learning_rate": 0.0001783774050529918, + "loss": 0.9335, + "step": 13648 + }, + { + "epoch": 0.35046804713691243, + "grad_norm": 0.78125, + "learning_rate": 0.0001783746324999196, + "loss": 0.9598, + "step": 13649 + }, + { + "epoch": 0.3504937243328342, + "grad_norm": 0.76171875, + "learning_rate": 0.00017837185979065377, + "loss": 0.9234, + "step": 13650 + }, + { + "epoch": 0.35051940152875605, + "grad_norm": 0.82421875, + "learning_rate": 0.00017836908692519995, + "loss": 1.0719, + "step": 13651 + }, + { + "epoch": 0.3505450787246779, + "grad_norm": 0.7578125, + "learning_rate": 0.00017836631390356363, + "loss": 1.0634, + "step": 13652 + }, + { + "epoch": 0.35057075592059966, + "grad_norm": 1.1015625, + "learning_rate": 0.0001783635407257503, + "loss": 0.9294, + "step": 13653 + }, + { + "epoch": 0.3505964331165215, + "grad_norm": 0.8828125, + "learning_rate": 0.0001783607673917655, + "loss": 0.9547, + "step": 13654 + }, + { + "epoch": 0.35062211031244334, + "grad_norm": 0.87109375, + "learning_rate": 0.00017835799390161482, + "loss": 0.8542, + "step": 13655 + }, + { + "epoch": 0.35064778750836517, + "grad_norm": 0.87109375, + "learning_rate": 0.00017835522025530368, + "loss": 1.0357, + "step": 13656 + }, + { + "epoch": 0.35067346470428695, + "grad_norm": 0.87109375, + "learning_rate": 0.0001783524464528377, + "loss": 0.9838, + "step": 13657 + }, + { + "epoch": 0.3506991419002088, + "grad_norm": 0.7578125, + "learning_rate": 0.00017834967249422237, + "loss": 0.7185, + "step": 13658 + }, + { + "epoch": 0.3507248190961306, + "grad_norm": 0.80078125, + "learning_rate": 0.00017834689837946318, + "loss": 0.9844, + "step": 13659 + }, + { + "epoch": 0.3507504962920524, + "grad_norm": 0.78125, + "learning_rate": 0.00017834412410856575, + "loss": 0.9854, + "step": 13660 + }, + { + "epoch": 0.35077617348797424, + "grad_norm": 0.8046875, + "learning_rate": 0.0001783413496815355, + "loss": 0.9783, + "step": 13661 + }, + { + "epoch": 0.3508018506838961, + "grad_norm": 0.7578125, + "learning_rate": 0.00017833857509837807, + "loss": 0.8748, + "step": 13662 + }, + { + "epoch": 0.35082752787981786, + "grad_norm": 0.6953125, + "learning_rate": 0.0001783358003590989, + "loss": 0.8529, + "step": 13663 + }, + { + "epoch": 0.3508532050757397, + "grad_norm": 0.796875, + "learning_rate": 0.00017833302546370358, + "loss": 0.8536, + "step": 13664 + }, + { + "epoch": 0.35087888227166153, + "grad_norm": 0.78515625, + "learning_rate": 0.00017833025041219763, + "loss": 0.89, + "step": 13665 + }, + { + "epoch": 0.35090455946758337, + "grad_norm": 0.7578125, + "learning_rate": 0.00017832747520458655, + "loss": 0.9548, + "step": 13666 + }, + { + "epoch": 0.35093023666350515, + "grad_norm": 0.8515625, + "learning_rate": 0.0001783246998408759, + "loss": 0.8511, + "step": 13667 + }, + { + "epoch": 0.350955913859427, + "grad_norm": 0.8046875, + "learning_rate": 0.00017832192432107116, + "loss": 1.0882, + "step": 13668 + }, + { + "epoch": 0.3509815910553488, + "grad_norm": 0.765625, + "learning_rate": 0.00017831914864517792, + "loss": 1.0154, + "step": 13669 + }, + { + "epoch": 0.3510072682512706, + "grad_norm": 0.8203125, + "learning_rate": 0.0001783163728132017, + "loss": 0.8899, + "step": 13670 + }, + { + "epoch": 0.35103294544719243, + "grad_norm": 0.76953125, + "learning_rate": 0.00017831359682514802, + "loss": 0.9496, + "step": 13671 + }, + { + "epoch": 0.35105862264311427, + "grad_norm": 0.84765625, + "learning_rate": 0.0001783108206810224, + "loss": 0.9528, + "step": 13672 + }, + { + "epoch": 0.35108429983903605, + "grad_norm": 1.0546875, + "learning_rate": 0.0001783080443808304, + "loss": 0.868, + "step": 13673 + }, + { + "epoch": 0.3511099770349579, + "grad_norm": 0.82421875, + "learning_rate": 0.00017830526792457754, + "loss": 0.9184, + "step": 13674 + }, + { + "epoch": 0.3511356542308797, + "grad_norm": 0.80859375, + "learning_rate": 0.00017830249131226938, + "loss": 0.969, + "step": 13675 + }, + { + "epoch": 0.35116133142680156, + "grad_norm": 0.72265625, + "learning_rate": 0.00017829971454391138, + "loss": 0.9642, + "step": 13676 + }, + { + "epoch": 0.35118700862272334, + "grad_norm": 0.69921875, + "learning_rate": 0.00017829693761950918, + "loss": 0.9378, + "step": 13677 + }, + { + "epoch": 0.3512126858186452, + "grad_norm": 0.81640625, + "learning_rate": 0.0001782941605390682, + "loss": 1.0306, + "step": 13678 + }, + { + "epoch": 0.351238363014567, + "grad_norm": 0.76171875, + "learning_rate": 0.00017829138330259406, + "loss": 0.8815, + "step": 13679 + }, + { + "epoch": 0.3512640402104888, + "grad_norm": 0.8828125, + "learning_rate": 0.00017828860591009225, + "loss": 0.9361, + "step": 13680 + }, + { + "epoch": 0.35128971740641063, + "grad_norm": 0.82421875, + "learning_rate": 0.0001782858283615683, + "loss": 1.0651, + "step": 13681 + }, + { + "epoch": 0.35131539460233246, + "grad_norm": 0.796875, + "learning_rate": 0.0001782830506570278, + "loss": 0.9259, + "step": 13682 + }, + { + "epoch": 0.35134107179825425, + "grad_norm": 0.76171875, + "learning_rate": 0.00017828027279647625, + "loss": 0.8274, + "step": 13683 + }, + { + "epoch": 0.3513667489941761, + "grad_norm": 0.7578125, + "learning_rate": 0.00017827749477991917, + "loss": 0.9865, + "step": 13684 + }, + { + "epoch": 0.3513924261900979, + "grad_norm": 0.7265625, + "learning_rate": 0.00017827471660736212, + "loss": 0.9585, + "step": 13685 + }, + { + "epoch": 0.35141810338601975, + "grad_norm": 0.7734375, + "learning_rate": 0.00017827193827881063, + "loss": 0.9185, + "step": 13686 + }, + { + "epoch": 0.35144378058194153, + "grad_norm": 0.796875, + "learning_rate": 0.00017826915979427026, + "loss": 0.8461, + "step": 13687 + }, + { + "epoch": 0.35146945777786337, + "grad_norm": 0.79296875, + "learning_rate": 0.0001782663811537465, + "loss": 0.8148, + "step": 13688 + }, + { + "epoch": 0.3514951349737852, + "grad_norm": 0.796875, + "learning_rate": 0.00017826360235724493, + "loss": 1.0486, + "step": 13689 + }, + { + "epoch": 0.351520812169707, + "grad_norm": 0.7734375, + "learning_rate": 0.00017826082340477106, + "loss": 0.9516, + "step": 13690 + }, + { + "epoch": 0.3515464893656288, + "grad_norm": 0.73828125, + "learning_rate": 0.00017825804429633042, + "loss": 0.8803, + "step": 13691 + }, + { + "epoch": 0.35157216656155066, + "grad_norm": 0.796875, + "learning_rate": 0.0001782552650319286, + "loss": 0.9405, + "step": 13692 + }, + { + "epoch": 0.35159784375747244, + "grad_norm": 0.7734375, + "learning_rate": 0.00017825248561157108, + "loss": 0.8584, + "step": 13693 + }, + { + "epoch": 0.3516235209533943, + "grad_norm": 0.8671875, + "learning_rate": 0.00017824970603526344, + "loss": 0.8786, + "step": 13694 + }, + { + "epoch": 0.3516491981493161, + "grad_norm": 0.78515625, + "learning_rate": 0.0001782469263030112, + "loss": 0.8934, + "step": 13695 + }, + { + "epoch": 0.35167487534523795, + "grad_norm": 0.85546875, + "learning_rate": 0.0001782441464148199, + "loss": 0.999, + "step": 13696 + }, + { + "epoch": 0.35170055254115973, + "grad_norm": 0.72265625, + "learning_rate": 0.00017824136637069508, + "loss": 0.8727, + "step": 13697 + }, + { + "epoch": 0.35172622973708156, + "grad_norm": 0.8359375, + "learning_rate": 0.0001782385861706423, + "loss": 0.9632, + "step": 13698 + }, + { + "epoch": 0.3517519069330034, + "grad_norm": 0.81640625, + "learning_rate": 0.00017823580581466706, + "loss": 0.9907, + "step": 13699 + }, + { + "epoch": 0.3517775841289252, + "grad_norm": 0.73828125, + "learning_rate": 0.00017823302530277496, + "loss": 1.0201, + "step": 13700 + }, + { + "epoch": 0.351803261324847, + "grad_norm": 0.8203125, + "learning_rate": 0.00017823024463497147, + "loss": 0.8519, + "step": 13701 + }, + { + "epoch": 0.35182893852076885, + "grad_norm": 0.7421875, + "learning_rate": 0.0001782274638112622, + "loss": 0.9342, + "step": 13702 + }, + { + "epoch": 0.35185461571669063, + "grad_norm": 0.765625, + "learning_rate": 0.00017822468283165262, + "loss": 0.9625, + "step": 13703 + }, + { + "epoch": 0.35188029291261247, + "grad_norm": 0.77734375, + "learning_rate": 0.00017822190169614836, + "loss": 0.9203, + "step": 13704 + }, + { + "epoch": 0.3519059701085343, + "grad_norm": 0.828125, + "learning_rate": 0.0001782191204047549, + "loss": 1.0705, + "step": 13705 + }, + { + "epoch": 0.35193164730445614, + "grad_norm": 0.7578125, + "learning_rate": 0.0001782163389574778, + "loss": 0.9899, + "step": 13706 + }, + { + "epoch": 0.3519573245003779, + "grad_norm": 0.80859375, + "learning_rate": 0.00017821355735432262, + "loss": 0.9523, + "step": 13707 + }, + { + "epoch": 0.35198300169629976, + "grad_norm": 0.8046875, + "learning_rate": 0.00017821077559529482, + "loss": 0.8863, + "step": 13708 + }, + { + "epoch": 0.3520086788922216, + "grad_norm": 0.83984375, + "learning_rate": 0.00017820799368040006, + "loss": 0.9517, + "step": 13709 + }, + { + "epoch": 0.3520343560881434, + "grad_norm": 0.77734375, + "learning_rate": 0.00017820521160964386, + "loss": 1.0329, + "step": 13710 + }, + { + "epoch": 0.3520600332840652, + "grad_norm": 0.859375, + "learning_rate": 0.00017820242938303168, + "loss": 1.147, + "step": 13711 + }, + { + "epoch": 0.35208571047998705, + "grad_norm": 0.78515625, + "learning_rate": 0.00017819964700056912, + "loss": 1.0501, + "step": 13712 + }, + { + "epoch": 0.3521113876759088, + "grad_norm": 0.72265625, + "learning_rate": 0.00017819686446226177, + "loss": 0.8967, + "step": 13713 + }, + { + "epoch": 0.35213706487183066, + "grad_norm": 0.7578125, + "learning_rate": 0.00017819408176811513, + "loss": 0.8558, + "step": 13714 + }, + { + "epoch": 0.3521627420677525, + "grad_norm": 0.81640625, + "learning_rate": 0.00017819129891813473, + "loss": 0.8899, + "step": 13715 + }, + { + "epoch": 0.35218841926367433, + "grad_norm": 0.8359375, + "learning_rate": 0.00017818851591232612, + "loss": 0.9552, + "step": 13716 + }, + { + "epoch": 0.3522140964595961, + "grad_norm": 0.74609375, + "learning_rate": 0.00017818573275069487, + "loss": 0.8645, + "step": 13717 + }, + { + "epoch": 0.35223977365551795, + "grad_norm": 0.796875, + "learning_rate": 0.00017818294943324655, + "loss": 0.9818, + "step": 13718 + }, + { + "epoch": 0.3522654508514398, + "grad_norm": 0.79296875, + "learning_rate": 0.00017818016595998664, + "loss": 1.0665, + "step": 13719 + }, + { + "epoch": 0.35229112804736157, + "grad_norm": 0.80078125, + "learning_rate": 0.00017817738233092073, + "loss": 0.9655, + "step": 13720 + }, + { + "epoch": 0.3523168052432834, + "grad_norm": 0.83203125, + "learning_rate": 0.00017817459854605435, + "loss": 0.9638, + "step": 13721 + }, + { + "epoch": 0.35234248243920524, + "grad_norm": 0.71875, + "learning_rate": 0.00017817181460539307, + "loss": 0.6858, + "step": 13722 + }, + { + "epoch": 0.352368159635127, + "grad_norm": 0.8046875, + "learning_rate": 0.00017816903050894243, + "loss": 0.9955, + "step": 13723 + }, + { + "epoch": 0.35239383683104886, + "grad_norm": 0.7578125, + "learning_rate": 0.00017816624625670795, + "loss": 0.8776, + "step": 13724 + }, + { + "epoch": 0.3524195140269707, + "grad_norm": 0.7578125, + "learning_rate": 0.0001781634618486952, + "loss": 1.1216, + "step": 13725 + }, + { + "epoch": 0.35244519122289253, + "grad_norm": 0.83984375, + "learning_rate": 0.00017816067728490978, + "loss": 0.9508, + "step": 13726 + }, + { + "epoch": 0.3524708684188143, + "grad_norm": 0.77734375, + "learning_rate": 0.00017815789256535712, + "loss": 0.8835, + "step": 13727 + }, + { + "epoch": 0.35249654561473615, + "grad_norm": 0.765625, + "learning_rate": 0.00017815510769004288, + "loss": 0.9126, + "step": 13728 + }, + { + "epoch": 0.352522222810658, + "grad_norm": 0.85546875, + "learning_rate": 0.00017815232265897256, + "loss": 0.897, + "step": 13729 + }, + { + "epoch": 0.35254790000657976, + "grad_norm": 0.76953125, + "learning_rate": 0.00017814953747215175, + "loss": 1.0577, + "step": 13730 + }, + { + "epoch": 0.3525735772025016, + "grad_norm": 0.84375, + "learning_rate": 0.00017814675212958595, + "loss": 0.9059, + "step": 13731 + }, + { + "epoch": 0.35259925439842343, + "grad_norm": 0.7734375, + "learning_rate": 0.00017814396663128073, + "loss": 1.0041, + "step": 13732 + }, + { + "epoch": 0.3526249315943452, + "grad_norm": 0.79296875, + "learning_rate": 0.00017814118097724164, + "loss": 0.9428, + "step": 13733 + }, + { + "epoch": 0.35265060879026705, + "grad_norm": 0.74609375, + "learning_rate": 0.00017813839516747423, + "loss": 0.8404, + "step": 13734 + }, + { + "epoch": 0.3526762859861889, + "grad_norm": 0.77734375, + "learning_rate": 0.00017813560920198405, + "loss": 0.9007, + "step": 13735 + }, + { + "epoch": 0.3527019631821107, + "grad_norm": 0.83203125, + "learning_rate": 0.0001781328230807767, + "loss": 0.8389, + "step": 13736 + }, + { + "epoch": 0.3527276403780325, + "grad_norm": 0.7578125, + "learning_rate": 0.00017813003680385765, + "loss": 0.8494, + "step": 13737 + }, + { + "epoch": 0.35275331757395434, + "grad_norm": 0.796875, + "learning_rate": 0.00017812725037123252, + "loss": 0.9566, + "step": 13738 + }, + { + "epoch": 0.3527789947698762, + "grad_norm": 0.78125, + "learning_rate": 0.00017812446378290684, + "loss": 0.8902, + "step": 13739 + }, + { + "epoch": 0.35280467196579796, + "grad_norm": 0.8046875, + "learning_rate": 0.00017812167703888613, + "loss": 0.9465, + "step": 13740 + }, + { + "epoch": 0.3528303491617198, + "grad_norm": 0.78125, + "learning_rate": 0.00017811889013917597, + "loss": 0.8316, + "step": 13741 + }, + { + "epoch": 0.35285602635764163, + "grad_norm": 0.8359375, + "learning_rate": 0.00017811610308378196, + "loss": 0.8979, + "step": 13742 + }, + { + "epoch": 0.3528817035535634, + "grad_norm": 1.3828125, + "learning_rate": 0.00017811331587270961, + "loss": 0.9358, + "step": 13743 + }, + { + "epoch": 0.35290738074948524, + "grad_norm": 0.8671875, + "learning_rate": 0.00017811052850596446, + "loss": 0.9403, + "step": 13744 + }, + { + "epoch": 0.3529330579454071, + "grad_norm": 0.79296875, + "learning_rate": 0.00017810774098355204, + "loss": 0.8876, + "step": 13745 + }, + { + "epoch": 0.3529587351413289, + "grad_norm": 0.796875, + "learning_rate": 0.00017810495330547802, + "loss": 0.9706, + "step": 13746 + }, + { + "epoch": 0.3529844123372507, + "grad_norm": 0.76171875, + "learning_rate": 0.00017810216547174785, + "loss": 0.8491, + "step": 13747 + }, + { + "epoch": 0.35301008953317253, + "grad_norm": 0.7890625, + "learning_rate": 0.00017809937748236712, + "loss": 1.0951, + "step": 13748 + }, + { + "epoch": 0.35303576672909437, + "grad_norm": 0.7578125, + "learning_rate": 0.0001780965893373414, + "loss": 0.893, + "step": 13749 + }, + { + "epoch": 0.35306144392501615, + "grad_norm": 0.72265625, + "learning_rate": 0.00017809380103667622, + "loss": 0.942, + "step": 13750 + }, + { + "epoch": 0.353087121120938, + "grad_norm": 0.828125, + "learning_rate": 0.00017809101258037716, + "loss": 0.8984, + "step": 13751 + }, + { + "epoch": 0.3531127983168598, + "grad_norm": 0.77734375, + "learning_rate": 0.00017808822396844974, + "loss": 0.8435, + "step": 13752 + }, + { + "epoch": 0.3531384755127816, + "grad_norm": 0.8203125, + "learning_rate": 0.00017808543520089954, + "loss": 1.0056, + "step": 13753 + }, + { + "epoch": 0.35316415270870344, + "grad_norm": 0.7421875, + "learning_rate": 0.00017808264627773216, + "loss": 0.9231, + "step": 13754 + }, + { + "epoch": 0.3531898299046253, + "grad_norm": 0.79296875, + "learning_rate": 0.00017807985719895308, + "loss": 0.9964, + "step": 13755 + }, + { + "epoch": 0.3532155071005471, + "grad_norm": 0.86328125, + "learning_rate": 0.0001780770679645679, + "loss": 0.9715, + "step": 13756 + }, + { + "epoch": 0.3532411842964689, + "grad_norm": 0.734375, + "learning_rate": 0.00017807427857458217, + "loss": 0.8539, + "step": 13757 + }, + { + "epoch": 0.3532668614923907, + "grad_norm": 0.83203125, + "learning_rate": 0.00017807148902900153, + "loss": 1.0423, + "step": 13758 + }, + { + "epoch": 0.35329253868831256, + "grad_norm": 0.78515625, + "learning_rate": 0.0001780686993278314, + "loss": 1.0158, + "step": 13759 + }, + { + "epoch": 0.35331821588423434, + "grad_norm": 0.75390625, + "learning_rate": 0.0001780659094710774, + "loss": 0.8407, + "step": 13760 + }, + { + "epoch": 0.3533438930801562, + "grad_norm": 0.8359375, + "learning_rate": 0.00017806311945874512, + "loss": 1.1043, + "step": 13761 + }, + { + "epoch": 0.353369570276078, + "grad_norm": 0.8671875, + "learning_rate": 0.00017806032929084008, + "loss": 0.9338, + "step": 13762 + }, + { + "epoch": 0.3533952474719998, + "grad_norm": 0.828125, + "learning_rate": 0.00017805753896736785, + "loss": 0.8795, + "step": 13763 + }, + { + "epoch": 0.35342092466792163, + "grad_norm": 0.890625, + "learning_rate": 0.000178054748488334, + "loss": 1.1235, + "step": 13764 + }, + { + "epoch": 0.35344660186384347, + "grad_norm": 0.78125, + "learning_rate": 0.0001780519578537441, + "loss": 0.9479, + "step": 13765 + }, + { + "epoch": 0.3534722790597653, + "grad_norm": 0.75390625, + "learning_rate": 0.0001780491670636037, + "loss": 1.0455, + "step": 13766 + }, + { + "epoch": 0.3534979562556871, + "grad_norm": 0.84765625, + "learning_rate": 0.00017804637611791834, + "loss": 1.1506, + "step": 13767 + }, + { + "epoch": 0.3535236334516089, + "grad_norm": 0.80859375, + "learning_rate": 0.00017804358501669363, + "loss": 1.024, + "step": 13768 + }, + { + "epoch": 0.35354931064753076, + "grad_norm": 0.8046875, + "learning_rate": 0.00017804079375993506, + "loss": 0.9711, + "step": 13769 + }, + { + "epoch": 0.35357498784345254, + "grad_norm": 0.8515625, + "learning_rate": 0.00017803800234764827, + "loss": 0.9614, + "step": 13770 + }, + { + "epoch": 0.3536006650393744, + "grad_norm": 0.8046875, + "learning_rate": 0.00017803521077983879, + "loss": 1.0131, + "step": 13771 + }, + { + "epoch": 0.3536263422352962, + "grad_norm": 0.74609375, + "learning_rate": 0.00017803241905651218, + "loss": 0.9351, + "step": 13772 + }, + { + "epoch": 0.353652019431218, + "grad_norm": 0.8203125, + "learning_rate": 0.00017802962717767398, + "loss": 0.9604, + "step": 13773 + }, + { + "epoch": 0.3536776966271398, + "grad_norm": 0.77734375, + "learning_rate": 0.00017802683514332984, + "loss": 1.0755, + "step": 13774 + }, + { + "epoch": 0.35370337382306166, + "grad_norm": 0.7265625, + "learning_rate": 0.00017802404295348524, + "loss": 0.9373, + "step": 13775 + }, + { + "epoch": 0.3537290510189835, + "grad_norm": 0.8203125, + "learning_rate": 0.00017802125060814575, + "loss": 1.0202, + "step": 13776 + }, + { + "epoch": 0.3537547282149053, + "grad_norm": 0.7109375, + "learning_rate": 0.00017801845810731698, + "loss": 1.0344, + "step": 13777 + }, + { + "epoch": 0.3537804054108271, + "grad_norm": 0.75390625, + "learning_rate": 0.00017801566545100442, + "loss": 0.9337, + "step": 13778 + }, + { + "epoch": 0.35380608260674895, + "grad_norm": 0.80078125, + "learning_rate": 0.00017801287263921375, + "loss": 0.828, + "step": 13779 + }, + { + "epoch": 0.35383175980267073, + "grad_norm": 0.765625, + "learning_rate": 0.00017801007967195045, + "loss": 0.8306, + "step": 13780 + }, + { + "epoch": 0.35385743699859257, + "grad_norm": 0.80859375, + "learning_rate": 0.00017800728654922008, + "loss": 0.9575, + "step": 13781 + }, + { + "epoch": 0.3538831141945144, + "grad_norm": 0.7578125, + "learning_rate": 0.00017800449327102827, + "loss": 0.9021, + "step": 13782 + }, + { + "epoch": 0.3539087913904362, + "grad_norm": 0.73828125, + "learning_rate": 0.00017800169983738054, + "loss": 0.8365, + "step": 13783 + }, + { + "epoch": 0.353934468586358, + "grad_norm": 0.8984375, + "learning_rate": 0.00017799890624828248, + "loss": 1.0808, + "step": 13784 + }, + { + "epoch": 0.35396014578227986, + "grad_norm": 0.859375, + "learning_rate": 0.00017799611250373962, + "loss": 0.8597, + "step": 13785 + }, + { + "epoch": 0.3539858229782017, + "grad_norm": 0.86328125, + "learning_rate": 0.00017799331860375757, + "loss": 1.0693, + "step": 13786 + }, + { + "epoch": 0.3540115001741235, + "grad_norm": 0.77734375, + "learning_rate": 0.00017799052454834188, + "loss": 0.8721, + "step": 13787 + }, + { + "epoch": 0.3540371773700453, + "grad_norm": 0.765625, + "learning_rate": 0.0001779877303374981, + "loss": 0.8556, + "step": 13788 + }, + { + "epoch": 0.35406285456596714, + "grad_norm": 0.84765625, + "learning_rate": 0.00017798493597123184, + "loss": 0.9165, + "step": 13789 + }, + { + "epoch": 0.3540885317618889, + "grad_norm": 0.828125, + "learning_rate": 0.00017798214144954867, + "loss": 0.8659, + "step": 13790 + }, + { + "epoch": 0.35411420895781076, + "grad_norm": 0.765625, + "learning_rate": 0.00017797934677245408, + "loss": 0.9696, + "step": 13791 + }, + { + "epoch": 0.3541398861537326, + "grad_norm": 0.71875, + "learning_rate": 0.00017797655193995372, + "loss": 0.853, + "step": 13792 + }, + { + "epoch": 0.3541655633496544, + "grad_norm": 0.8125, + "learning_rate": 0.00017797375695205317, + "loss": 0.8544, + "step": 13793 + }, + { + "epoch": 0.3541912405455762, + "grad_norm": 0.82421875, + "learning_rate": 0.00017797096180875791, + "loss": 0.9273, + "step": 13794 + }, + { + "epoch": 0.35421691774149805, + "grad_norm": 0.703125, + "learning_rate": 0.0001779681665100736, + "loss": 1.0142, + "step": 13795 + }, + { + "epoch": 0.3542425949374199, + "grad_norm": 0.83984375, + "learning_rate": 0.00017796537105600578, + "loss": 0.961, + "step": 13796 + }, + { + "epoch": 0.35426827213334167, + "grad_norm": 0.83203125, + "learning_rate": 0.00017796257544656, + "loss": 1.0477, + "step": 13797 + }, + { + "epoch": 0.3542939493292635, + "grad_norm": 0.75, + "learning_rate": 0.00017795977968174185, + "loss": 0.9675, + "step": 13798 + }, + { + "epoch": 0.35431962652518534, + "grad_norm": 0.86328125, + "learning_rate": 0.0001779569837615569, + "loss": 1.0844, + "step": 13799 + }, + { + "epoch": 0.3543453037211071, + "grad_norm": 0.84765625, + "learning_rate": 0.00017795418768601072, + "loss": 0.9514, + "step": 13800 + }, + { + "epoch": 0.35437098091702895, + "grad_norm": 0.80859375, + "learning_rate": 0.00017795139145510892, + "loss": 0.9306, + "step": 13801 + }, + { + "epoch": 0.3543966581129508, + "grad_norm": 0.76953125, + "learning_rate": 0.000177948595068857, + "loss": 0.9437, + "step": 13802 + }, + { + "epoch": 0.35442233530887257, + "grad_norm": 0.75, + "learning_rate": 0.00017794579852726058, + "loss": 0.9191, + "step": 13803 + }, + { + "epoch": 0.3544480125047944, + "grad_norm": 0.828125, + "learning_rate": 0.00017794300183032524, + "loss": 0.9923, + "step": 13804 + }, + { + "epoch": 0.35447368970071624, + "grad_norm": 0.72265625, + "learning_rate": 0.00017794020497805652, + "loss": 0.9319, + "step": 13805 + }, + { + "epoch": 0.3544993668966381, + "grad_norm": 0.78515625, + "learning_rate": 0.00017793740797046001, + "loss": 0.9431, + "step": 13806 + }, + { + "epoch": 0.35452504409255986, + "grad_norm": 0.88671875, + "learning_rate": 0.0001779346108075413, + "loss": 0.9515, + "step": 13807 + }, + { + "epoch": 0.3545507212884817, + "grad_norm": 0.71875, + "learning_rate": 0.00017793181348930593, + "loss": 0.9512, + "step": 13808 + }, + { + "epoch": 0.35457639848440353, + "grad_norm": 0.73828125, + "learning_rate": 0.00017792901601575953, + "loss": 0.902, + "step": 13809 + }, + { + "epoch": 0.3546020756803253, + "grad_norm": 0.80078125, + "learning_rate": 0.00017792621838690761, + "loss": 0.9641, + "step": 13810 + }, + { + "epoch": 0.35462775287624715, + "grad_norm": 0.84765625, + "learning_rate": 0.0001779234206027558, + "loss": 0.9379, + "step": 13811 + }, + { + "epoch": 0.354653430072169, + "grad_norm": 0.72265625, + "learning_rate": 0.00017792062266330962, + "loss": 0.8276, + "step": 13812 + }, + { + "epoch": 0.35467910726809077, + "grad_norm": 0.8515625, + "learning_rate": 0.0001779178245685747, + "loss": 1.0115, + "step": 13813 + }, + { + "epoch": 0.3547047844640126, + "grad_norm": 0.765625, + "learning_rate": 0.00017791502631855658, + "loss": 1.0371, + "step": 13814 + }, + { + "epoch": 0.35473046165993444, + "grad_norm": 0.7890625, + "learning_rate": 0.00017791222791326087, + "loss": 0.9267, + "step": 13815 + }, + { + "epoch": 0.3547561388558563, + "grad_norm": 0.7734375, + "learning_rate": 0.00017790942935269312, + "loss": 0.9353, + "step": 13816 + }, + { + "epoch": 0.35478181605177805, + "grad_norm": 0.78125, + "learning_rate": 0.0001779066306368589, + "loss": 0.874, + "step": 13817 + }, + { + "epoch": 0.3548074932476999, + "grad_norm": 0.76953125, + "learning_rate": 0.00017790383176576383, + "loss": 0.8756, + "step": 13818 + }, + { + "epoch": 0.3548331704436217, + "grad_norm": 0.73828125, + "learning_rate": 0.00017790103273941345, + "loss": 0.9983, + "step": 13819 + }, + { + "epoch": 0.3548588476395435, + "grad_norm": 0.75390625, + "learning_rate": 0.00017789823355781334, + "loss": 0.977, + "step": 13820 + }, + { + "epoch": 0.35488452483546534, + "grad_norm": 0.82421875, + "learning_rate": 0.0001778954342209691, + "loss": 0.9782, + "step": 13821 + }, + { + "epoch": 0.3549102020313872, + "grad_norm": 0.84375, + "learning_rate": 0.00017789263472888628, + "loss": 1.1334, + "step": 13822 + }, + { + "epoch": 0.35493587922730896, + "grad_norm": 0.8359375, + "learning_rate": 0.0001778898350815705, + "loss": 1.0268, + "step": 13823 + }, + { + "epoch": 0.3549615564232308, + "grad_norm": 0.79296875, + "learning_rate": 0.00017788703527902728, + "loss": 0.8679, + "step": 13824 + }, + { + "epoch": 0.35498723361915263, + "grad_norm": 0.76953125, + "learning_rate": 0.00017788423532126224, + "loss": 0.8789, + "step": 13825 + }, + { + "epoch": 0.35501291081507447, + "grad_norm": 0.7265625, + "learning_rate": 0.00017788143520828097, + "loss": 0.8545, + "step": 13826 + }, + { + "epoch": 0.35503858801099625, + "grad_norm": 0.83203125, + "learning_rate": 0.00017787863494008902, + "loss": 0.9348, + "step": 13827 + }, + { + "epoch": 0.3550642652069181, + "grad_norm": 0.86328125, + "learning_rate": 0.000177875834516692, + "loss": 1.1236, + "step": 13828 + }, + { + "epoch": 0.3550899424028399, + "grad_norm": 0.796875, + "learning_rate": 0.0001778730339380955, + "loss": 0.9601, + "step": 13829 + }, + { + "epoch": 0.3551156195987617, + "grad_norm": 0.79296875, + "learning_rate": 0.00017787023320430502, + "loss": 0.8766, + "step": 13830 + }, + { + "epoch": 0.35514129679468354, + "grad_norm": 0.7890625, + "learning_rate": 0.00017786743231532626, + "loss": 1.0282, + "step": 13831 + }, + { + "epoch": 0.3551669739906054, + "grad_norm": 0.80859375, + "learning_rate": 0.00017786463127116467, + "loss": 0.8998, + "step": 13832 + }, + { + "epoch": 0.35519265118652715, + "grad_norm": 0.8203125, + "learning_rate": 0.00017786183007182594, + "loss": 0.968, + "step": 13833 + }, + { + "epoch": 0.355218328382449, + "grad_norm": 0.73046875, + "learning_rate": 0.00017785902871731563, + "loss": 0.8629, + "step": 13834 + }, + { + "epoch": 0.3552440055783708, + "grad_norm": 0.74609375, + "learning_rate": 0.0001778562272076393, + "loss": 0.8201, + "step": 13835 + }, + { + "epoch": 0.35526968277429266, + "grad_norm": 0.81640625, + "learning_rate": 0.00017785342554280253, + "loss": 1.1522, + "step": 13836 + }, + { + "epoch": 0.35529535997021444, + "grad_norm": 0.7890625, + "learning_rate": 0.00017785062372281093, + "loss": 0.9903, + "step": 13837 + }, + { + "epoch": 0.3553210371661363, + "grad_norm": 0.8359375, + "learning_rate": 0.00017784782174767004, + "loss": 0.9475, + "step": 13838 + }, + { + "epoch": 0.3553467143620581, + "grad_norm": 0.84375, + "learning_rate": 0.00017784501961738553, + "loss": 0.9633, + "step": 13839 + }, + { + "epoch": 0.3553723915579799, + "grad_norm": 0.875, + "learning_rate": 0.0001778422173319629, + "loss": 1.1138, + "step": 13840 + }, + { + "epoch": 0.35539806875390173, + "grad_norm": 0.890625, + "learning_rate": 0.00017783941489140774, + "loss": 1.0189, + "step": 13841 + }, + { + "epoch": 0.35542374594982357, + "grad_norm": 0.84765625, + "learning_rate": 0.00017783661229572566, + "loss": 1.0346, + "step": 13842 + }, + { + "epoch": 0.35544942314574535, + "grad_norm": 0.84375, + "learning_rate": 0.00017783380954492227, + "loss": 1.1053, + "step": 13843 + }, + { + "epoch": 0.3554751003416672, + "grad_norm": 0.75390625, + "learning_rate": 0.0001778310066390031, + "loss": 0.7761, + "step": 13844 + }, + { + "epoch": 0.355500777537589, + "grad_norm": 0.79296875, + "learning_rate": 0.00017782820357797378, + "loss": 1.0864, + "step": 13845 + }, + { + "epoch": 0.3555264547335108, + "grad_norm": 0.78125, + "learning_rate": 0.0001778254003618399, + "loss": 0.9884, + "step": 13846 + }, + { + "epoch": 0.35555213192943264, + "grad_norm": 1.03125, + "learning_rate": 0.000177822596990607, + "loss": 1.026, + "step": 13847 + }, + { + "epoch": 0.35557780912535447, + "grad_norm": 0.8125, + "learning_rate": 0.0001778197934642807, + "loss": 0.9087, + "step": 13848 + }, + { + "epoch": 0.3556034863212763, + "grad_norm": 0.83984375, + "learning_rate": 0.00017781698978286658, + "loss": 1.0304, + "step": 13849 + }, + { + "epoch": 0.3556291635171981, + "grad_norm": 0.7734375, + "learning_rate": 0.00017781418594637021, + "loss": 0.9998, + "step": 13850 + }, + { + "epoch": 0.3556548407131199, + "grad_norm": 0.80078125, + "learning_rate": 0.00017781138195479724, + "loss": 0.9267, + "step": 13851 + }, + { + "epoch": 0.35568051790904176, + "grad_norm": 0.78125, + "learning_rate": 0.0001778085778081532, + "loss": 0.8858, + "step": 13852 + }, + { + "epoch": 0.35570619510496354, + "grad_norm": 0.76171875, + "learning_rate": 0.00017780577350644364, + "loss": 1.0219, + "step": 13853 + }, + { + "epoch": 0.3557318723008854, + "grad_norm": 0.76953125, + "learning_rate": 0.00017780296904967426, + "loss": 0.7934, + "step": 13854 + }, + { + "epoch": 0.3557575494968072, + "grad_norm": 0.69921875, + "learning_rate": 0.00017780016443785058, + "loss": 0.8806, + "step": 13855 + }, + { + "epoch": 0.355783226692729, + "grad_norm": 0.81640625, + "learning_rate": 0.0001777973596709782, + "loss": 0.8177, + "step": 13856 + }, + { + "epoch": 0.35580890388865083, + "grad_norm": 0.84375, + "learning_rate": 0.0001777945547490627, + "loss": 0.9253, + "step": 13857 + }, + { + "epoch": 0.35583458108457267, + "grad_norm": 0.80859375, + "learning_rate": 0.0001777917496721097, + "loss": 0.9419, + "step": 13858 + }, + { + "epoch": 0.3558602582804945, + "grad_norm": 0.68359375, + "learning_rate": 0.00017778894444012475, + "loss": 0.8582, + "step": 13859 + }, + { + "epoch": 0.3558859354764163, + "grad_norm": 0.7265625, + "learning_rate": 0.0001777861390531135, + "loss": 1.0751, + "step": 13860 + }, + { + "epoch": 0.3559116126723381, + "grad_norm": 0.86328125, + "learning_rate": 0.00017778333351108142, + "loss": 0.9803, + "step": 13861 + }, + { + "epoch": 0.35593728986825995, + "grad_norm": 0.83984375, + "learning_rate": 0.00017778052781403423, + "loss": 0.9613, + "step": 13862 + }, + { + "epoch": 0.35596296706418173, + "grad_norm": 0.8203125, + "learning_rate": 0.00017777772196197748, + "loss": 0.805, + "step": 13863 + }, + { + "epoch": 0.35598864426010357, + "grad_norm": 0.87890625, + "learning_rate": 0.00017777491595491674, + "loss": 0.976, + "step": 13864 + }, + { + "epoch": 0.3560143214560254, + "grad_norm": 0.703125, + "learning_rate": 0.00017777210979285764, + "loss": 0.8854, + "step": 13865 + }, + { + "epoch": 0.3560399986519472, + "grad_norm": 0.83203125, + "learning_rate": 0.00017776930347580572, + "loss": 0.9285, + "step": 13866 + }, + { + "epoch": 0.356065675847869, + "grad_norm": 0.8046875, + "learning_rate": 0.00017776649700376664, + "loss": 0.94, + "step": 13867 + }, + { + "epoch": 0.35609135304379086, + "grad_norm": 0.76953125, + "learning_rate": 0.00017776369037674595, + "loss": 0.7742, + "step": 13868 + }, + { + "epoch": 0.3561170302397127, + "grad_norm": 0.76171875, + "learning_rate": 0.0001777608835947492, + "loss": 0.9345, + "step": 13869 + }, + { + "epoch": 0.3561427074356345, + "grad_norm": 0.75390625, + "learning_rate": 0.0001777580766577821, + "loss": 0.9283, + "step": 13870 + }, + { + "epoch": 0.3561683846315563, + "grad_norm": 0.8125, + "learning_rate": 0.00017775526956585012, + "loss": 0.8859, + "step": 13871 + }, + { + "epoch": 0.35619406182747815, + "grad_norm": 1.09375, + "learning_rate": 0.00017775246231895896, + "loss": 0.9769, + "step": 13872 + }, + { + "epoch": 0.35621973902339993, + "grad_norm": 0.8125, + "learning_rate": 0.00017774965491711413, + "loss": 0.8587, + "step": 13873 + }, + { + "epoch": 0.35624541621932176, + "grad_norm": 0.77734375, + "learning_rate": 0.00017774684736032126, + "loss": 0.9903, + "step": 13874 + }, + { + "epoch": 0.3562710934152436, + "grad_norm": 0.796875, + "learning_rate": 0.00017774403964858598, + "loss": 0.8921, + "step": 13875 + }, + { + "epoch": 0.3562967706111654, + "grad_norm": 0.7734375, + "learning_rate": 0.00017774123178191382, + "loss": 0.995, + "step": 13876 + }, + { + "epoch": 0.3563224478070872, + "grad_norm": 0.765625, + "learning_rate": 0.0001777384237603104, + "loss": 1.0105, + "step": 13877 + }, + { + "epoch": 0.35634812500300905, + "grad_norm": 0.8203125, + "learning_rate": 0.00017773561558378137, + "loss": 0.9028, + "step": 13878 + }, + { + "epoch": 0.3563738021989309, + "grad_norm": 0.80078125, + "learning_rate": 0.00017773280725233223, + "loss": 0.8881, + "step": 13879 + }, + { + "epoch": 0.35639947939485267, + "grad_norm": 0.8359375, + "learning_rate": 0.00017772999876596864, + "loss": 1.0374, + "step": 13880 + }, + { + "epoch": 0.3564251565907745, + "grad_norm": 0.82421875, + "learning_rate": 0.0001777271901246962, + "loss": 1.0844, + "step": 13881 + }, + { + "epoch": 0.35645083378669634, + "grad_norm": 0.8515625, + "learning_rate": 0.0001777243813285205, + "loss": 0.9283, + "step": 13882 + }, + { + "epoch": 0.3564765109826181, + "grad_norm": 0.78515625, + "learning_rate": 0.0001777215723774471, + "loss": 0.9656, + "step": 13883 + }, + { + "epoch": 0.35650218817853996, + "grad_norm": 1.2578125, + "learning_rate": 0.00017771876327148166, + "loss": 0.981, + "step": 13884 + }, + { + "epoch": 0.3565278653744618, + "grad_norm": 0.7265625, + "learning_rate": 0.0001777159540106297, + "loss": 0.8223, + "step": 13885 + }, + { + "epoch": 0.3565535425703836, + "grad_norm": 0.859375, + "learning_rate": 0.00017771314459489687, + "loss": 1.0344, + "step": 13886 + }, + { + "epoch": 0.3565792197663054, + "grad_norm": 0.7734375, + "learning_rate": 0.0001777103350242888, + "loss": 0.8739, + "step": 13887 + }, + { + "epoch": 0.35660489696222725, + "grad_norm": 0.8671875, + "learning_rate": 0.00017770752529881105, + "loss": 0.8899, + "step": 13888 + }, + { + "epoch": 0.3566305741581491, + "grad_norm": 1.1640625, + "learning_rate": 0.00017770471541846917, + "loss": 0.9013, + "step": 13889 + }, + { + "epoch": 0.35665625135407086, + "grad_norm": 0.7890625, + "learning_rate": 0.00017770190538326887, + "loss": 0.9609, + "step": 13890 + }, + { + "epoch": 0.3566819285499927, + "grad_norm": 0.9140625, + "learning_rate": 0.00017769909519321566, + "loss": 1.0836, + "step": 13891 + }, + { + "epoch": 0.35670760574591454, + "grad_norm": 0.81640625, + "learning_rate": 0.00017769628484831518, + "loss": 0.9468, + "step": 13892 + }, + { + "epoch": 0.3567332829418363, + "grad_norm": 0.77734375, + "learning_rate": 0.00017769347434857302, + "loss": 0.8576, + "step": 13893 + }, + { + "epoch": 0.35675896013775815, + "grad_norm": 0.80859375, + "learning_rate": 0.0001776906636939948, + "loss": 1.0476, + "step": 13894 + }, + { + "epoch": 0.35678463733368, + "grad_norm": 0.796875, + "learning_rate": 0.00017768785288458606, + "loss": 0.9533, + "step": 13895 + }, + { + "epoch": 0.35681031452960177, + "grad_norm": 0.76171875, + "learning_rate": 0.0001776850419203525, + "loss": 0.9002, + "step": 13896 + }, + { + "epoch": 0.3568359917255236, + "grad_norm": 0.76953125, + "learning_rate": 0.00017768223080129962, + "loss": 0.8632, + "step": 13897 + }, + { + "epoch": 0.35686166892144544, + "grad_norm": 1.0390625, + "learning_rate": 0.00017767941952743312, + "loss": 0.8909, + "step": 13898 + }, + { + "epoch": 0.3568873461173673, + "grad_norm": 0.84765625, + "learning_rate": 0.00017767660809875853, + "loss": 0.8952, + "step": 13899 + }, + { + "epoch": 0.35691302331328906, + "grad_norm": 1.0078125, + "learning_rate": 0.00017767379651528146, + "loss": 0.9662, + "step": 13900 + }, + { + "epoch": 0.3569387005092109, + "grad_norm": 1.0390625, + "learning_rate": 0.00017767098477700755, + "loss": 0.9991, + "step": 13901 + }, + { + "epoch": 0.35696437770513273, + "grad_norm": 1.1015625, + "learning_rate": 0.00017766817288394237, + "loss": 0.8607, + "step": 13902 + }, + { + "epoch": 0.3569900549010545, + "grad_norm": 0.76171875, + "learning_rate": 0.00017766536083609156, + "loss": 0.8252, + "step": 13903 + }, + { + "epoch": 0.35701573209697635, + "grad_norm": 0.80078125, + "learning_rate": 0.0001776625486334607, + "loss": 1.0095, + "step": 13904 + }, + { + "epoch": 0.3570414092928982, + "grad_norm": 0.78515625, + "learning_rate": 0.00017765973627605536, + "loss": 0.9418, + "step": 13905 + }, + { + "epoch": 0.35706708648881996, + "grad_norm": 0.82421875, + "learning_rate": 0.00017765692376388122, + "loss": 1.0198, + "step": 13906 + }, + { + "epoch": 0.3570927636847418, + "grad_norm": 0.70703125, + "learning_rate": 0.0001776541110969438, + "loss": 0.9803, + "step": 13907 + }, + { + "epoch": 0.35711844088066363, + "grad_norm": 0.76171875, + "learning_rate": 0.0001776512982752488, + "loss": 0.9965, + "step": 13908 + }, + { + "epoch": 0.35714411807658547, + "grad_norm": 0.796875, + "learning_rate": 0.00017764848529880177, + "loss": 1.0259, + "step": 13909 + }, + { + "epoch": 0.35716979527250725, + "grad_norm": 0.91796875, + "learning_rate": 0.00017764567216760831, + "loss": 0.9504, + "step": 13910 + }, + { + "epoch": 0.3571954724684291, + "grad_norm": 0.79296875, + "learning_rate": 0.00017764285888167402, + "loss": 0.8087, + "step": 13911 + }, + { + "epoch": 0.3572211496643509, + "grad_norm": 0.94140625, + "learning_rate": 0.00017764004544100454, + "loss": 1.0733, + "step": 13912 + }, + { + "epoch": 0.3572468268602727, + "grad_norm": 1.46875, + "learning_rate": 0.00017763723184560548, + "loss": 1.0632, + "step": 13913 + }, + { + "epoch": 0.35727250405619454, + "grad_norm": 0.76953125, + "learning_rate": 0.0001776344180954824, + "loss": 0.896, + "step": 13914 + }, + { + "epoch": 0.3572981812521164, + "grad_norm": 0.84375, + "learning_rate": 0.00017763160419064097, + "loss": 0.9717, + "step": 13915 + }, + { + "epoch": 0.35732385844803816, + "grad_norm": 0.7578125, + "learning_rate": 0.00017762879013108674, + "loss": 1.0469, + "step": 13916 + }, + { + "epoch": 0.35734953564396, + "grad_norm": 0.81640625, + "learning_rate": 0.00017762597591682537, + "loss": 0.878, + "step": 13917 + }, + { + "epoch": 0.35737521283988183, + "grad_norm": 0.86328125, + "learning_rate": 0.00017762316154786243, + "loss": 1.0468, + "step": 13918 + }, + { + "epoch": 0.35740089003580366, + "grad_norm": 0.94140625, + "learning_rate": 0.00017762034702420353, + "loss": 1.0165, + "step": 13919 + }, + { + "epoch": 0.35742656723172544, + "grad_norm": 0.81640625, + "learning_rate": 0.0001776175323458543, + "loss": 0.8842, + "step": 13920 + }, + { + "epoch": 0.3574522444276473, + "grad_norm": 0.7734375, + "learning_rate": 0.00017761471751282033, + "loss": 0.9769, + "step": 13921 + }, + { + "epoch": 0.3574779216235691, + "grad_norm": 0.82421875, + "learning_rate": 0.00017761190252510724, + "loss": 1.0256, + "step": 13922 + }, + { + "epoch": 0.3575035988194909, + "grad_norm": 0.79296875, + "learning_rate": 0.00017760908738272065, + "loss": 0.9258, + "step": 13923 + }, + { + "epoch": 0.35752927601541273, + "grad_norm": 0.8046875, + "learning_rate": 0.00017760627208566617, + "loss": 1.0937, + "step": 13924 + }, + { + "epoch": 0.35755495321133457, + "grad_norm": 0.8984375, + "learning_rate": 0.00017760345663394938, + "loss": 0.8857, + "step": 13925 + }, + { + "epoch": 0.35758063040725635, + "grad_norm": 0.8515625, + "learning_rate": 0.0001776006410275759, + "loss": 0.9947, + "step": 13926 + }, + { + "epoch": 0.3576063076031782, + "grad_norm": 0.82421875, + "learning_rate": 0.00017759782526655135, + "loss": 0.9427, + "step": 13927 + }, + { + "epoch": 0.3576319847991, + "grad_norm": 0.77734375, + "learning_rate": 0.00017759500935088135, + "loss": 1.0549, + "step": 13928 + }, + { + "epoch": 0.35765766199502186, + "grad_norm": 0.86328125, + "learning_rate": 0.00017759219328057152, + "loss": 0.9992, + "step": 13929 + }, + { + "epoch": 0.35768333919094364, + "grad_norm": 0.8203125, + "learning_rate": 0.00017758937705562747, + "loss": 0.923, + "step": 13930 + }, + { + "epoch": 0.3577090163868655, + "grad_norm": 0.73828125, + "learning_rate": 0.00017758656067605477, + "loss": 0.9347, + "step": 13931 + }, + { + "epoch": 0.3577346935827873, + "grad_norm": 0.78515625, + "learning_rate": 0.00017758374414185906, + "loss": 0.9571, + "step": 13932 + }, + { + "epoch": 0.3577603707787091, + "grad_norm": 0.94921875, + "learning_rate": 0.000177580927453046, + "loss": 1.0714, + "step": 13933 + }, + { + "epoch": 0.3577860479746309, + "grad_norm": 0.73046875, + "learning_rate": 0.00017757811060962114, + "loss": 0.9337, + "step": 13934 + }, + { + "epoch": 0.35781172517055276, + "grad_norm": 0.875, + "learning_rate": 0.00017757529361159006, + "loss": 0.846, + "step": 13935 + }, + { + "epoch": 0.35783740236647454, + "grad_norm": 0.796875, + "learning_rate": 0.0001775724764589585, + "loss": 0.9458, + "step": 13936 + }, + { + "epoch": 0.3578630795623964, + "grad_norm": 0.74609375, + "learning_rate": 0.00017756965915173196, + "loss": 0.9019, + "step": 13937 + }, + { + "epoch": 0.3578887567583182, + "grad_norm": 0.734375, + "learning_rate": 0.0001775668416899161, + "loss": 0.9384, + "step": 13938 + }, + { + "epoch": 0.35791443395424005, + "grad_norm": 0.78125, + "learning_rate": 0.00017756402407351655, + "loss": 0.9582, + "step": 13939 + }, + { + "epoch": 0.35794011115016183, + "grad_norm": 1.3359375, + "learning_rate": 0.0001775612063025389, + "loss": 0.8645, + "step": 13940 + }, + { + "epoch": 0.35796578834608367, + "grad_norm": 0.77734375, + "learning_rate": 0.00017755838837698877, + "loss": 1.013, + "step": 13941 + }, + { + "epoch": 0.3579914655420055, + "grad_norm": 0.81640625, + "learning_rate": 0.00017755557029687177, + "loss": 0.8721, + "step": 13942 + }, + { + "epoch": 0.3580171427379273, + "grad_norm": 0.828125, + "learning_rate": 0.00017755275206219354, + "loss": 1.0379, + "step": 13943 + }, + { + "epoch": 0.3580428199338491, + "grad_norm": 0.80859375, + "learning_rate": 0.00017754993367295966, + "loss": 1.1322, + "step": 13944 + }, + { + "epoch": 0.35806849712977096, + "grad_norm": 0.765625, + "learning_rate": 0.0001775471151291758, + "loss": 0.9269, + "step": 13945 + }, + { + "epoch": 0.35809417432569274, + "grad_norm": 0.81640625, + "learning_rate": 0.0001775442964308475, + "loss": 0.8503, + "step": 13946 + }, + { + "epoch": 0.3581198515216146, + "grad_norm": 0.77734375, + "learning_rate": 0.00017754147757798044, + "loss": 1.1703, + "step": 13947 + }, + { + "epoch": 0.3581455287175364, + "grad_norm": 0.8203125, + "learning_rate": 0.00017753865857058024, + "loss": 0.9669, + "step": 13948 + }, + { + "epoch": 0.35817120591345825, + "grad_norm": 0.8046875, + "learning_rate": 0.00017753583940865246, + "loss": 1.0226, + "step": 13949 + }, + { + "epoch": 0.35819688310938, + "grad_norm": 0.76953125, + "learning_rate": 0.00017753302009220279, + "loss": 0.9051, + "step": 13950 + }, + { + "epoch": 0.35822256030530186, + "grad_norm": 0.80078125, + "learning_rate": 0.0001775302006212368, + "loss": 0.9339, + "step": 13951 + }, + { + "epoch": 0.3582482375012237, + "grad_norm": 0.83203125, + "learning_rate": 0.0001775273809957601, + "loss": 1.1391, + "step": 13952 + }, + { + "epoch": 0.3582739146971455, + "grad_norm": 0.7890625, + "learning_rate": 0.00017752456121577836, + "loss": 1.068, + "step": 13953 + }, + { + "epoch": 0.3582995918930673, + "grad_norm": 0.7734375, + "learning_rate": 0.0001775217412812972, + "loss": 0.8812, + "step": 13954 + }, + { + "epoch": 0.35832526908898915, + "grad_norm": 0.80859375, + "learning_rate": 0.00017751892119232215, + "loss": 1.0232, + "step": 13955 + }, + { + "epoch": 0.35835094628491093, + "grad_norm": 0.78125, + "learning_rate": 0.00017751610094885893, + "loss": 0.9354, + "step": 13956 + }, + { + "epoch": 0.35837662348083277, + "grad_norm": 0.7734375, + "learning_rate": 0.00017751328055091311, + "loss": 0.8501, + "step": 13957 + }, + { + "epoch": 0.3584023006767546, + "grad_norm": 0.78515625, + "learning_rate": 0.00017751045999849033, + "loss": 0.9219, + "step": 13958 + }, + { + "epoch": 0.35842797787267644, + "grad_norm": 0.82421875, + "learning_rate": 0.00017750763929159623, + "loss": 0.9642, + "step": 13959 + }, + { + "epoch": 0.3584536550685982, + "grad_norm": 0.77734375, + "learning_rate": 0.00017750481843023636, + "loss": 1.0441, + "step": 13960 + }, + { + "epoch": 0.35847933226452006, + "grad_norm": 0.8515625, + "learning_rate": 0.0001775019974144164, + "loss": 0.954, + "step": 13961 + }, + { + "epoch": 0.3585050094604419, + "grad_norm": 0.80859375, + "learning_rate": 0.000177499176244142, + "loss": 1.0355, + "step": 13962 + }, + { + "epoch": 0.3585306866563637, + "grad_norm": 0.79296875, + "learning_rate": 0.0001774963549194187, + "loss": 0.9078, + "step": 13963 + }, + { + "epoch": 0.3585563638522855, + "grad_norm": 0.74609375, + "learning_rate": 0.00017749353344025218, + "loss": 0.9549, + "step": 13964 + }, + { + "epoch": 0.35858204104820735, + "grad_norm": 0.859375, + "learning_rate": 0.00017749071180664803, + "loss": 1.0008, + "step": 13965 + }, + { + "epoch": 0.3586077182441291, + "grad_norm": 0.77734375, + "learning_rate": 0.00017748789001861191, + "loss": 1.0592, + "step": 13966 + }, + { + "epoch": 0.35863339544005096, + "grad_norm": 0.8515625, + "learning_rate": 0.0001774850680761494, + "loss": 0.8687, + "step": 13967 + }, + { + "epoch": 0.3586590726359728, + "grad_norm": 0.78515625, + "learning_rate": 0.00017748224597926616, + "loss": 0.8485, + "step": 13968 + }, + { + "epoch": 0.35868474983189463, + "grad_norm": 0.73046875, + "learning_rate": 0.00017747942372796783, + "loss": 0.8593, + "step": 13969 + }, + { + "epoch": 0.3587104270278164, + "grad_norm": 0.7890625, + "learning_rate": 0.00017747660132225998, + "loss": 0.954, + "step": 13970 + }, + { + "epoch": 0.35873610422373825, + "grad_norm": 0.7734375, + "learning_rate": 0.0001774737787621483, + "loss": 0.8758, + "step": 13971 + }, + { + "epoch": 0.3587617814196601, + "grad_norm": 0.82421875, + "learning_rate": 0.00017747095604763832, + "loss": 0.989, + "step": 13972 + }, + { + "epoch": 0.35878745861558187, + "grad_norm": 0.84375, + "learning_rate": 0.00017746813317873574, + "loss": 0.9623, + "step": 13973 + }, + { + "epoch": 0.3588131358115037, + "grad_norm": 0.9453125, + "learning_rate": 0.0001774653101554462, + "loss": 0.9726, + "step": 13974 + }, + { + "epoch": 0.35883881300742554, + "grad_norm": 0.79296875, + "learning_rate": 0.00017746248697777525, + "loss": 0.8588, + "step": 13975 + }, + { + "epoch": 0.3588644902033473, + "grad_norm": 1.1875, + "learning_rate": 0.00017745966364572858, + "loss": 0.94, + "step": 13976 + }, + { + "epoch": 0.35889016739926916, + "grad_norm": 0.859375, + "learning_rate": 0.00017745684015931182, + "loss": 0.8749, + "step": 13977 + }, + { + "epoch": 0.358915844595191, + "grad_norm": 0.77734375, + "learning_rate": 0.00017745401651853056, + "loss": 0.9478, + "step": 13978 + }, + { + "epoch": 0.3589415217911128, + "grad_norm": 0.82421875, + "learning_rate": 0.00017745119272339043, + "loss": 0.9092, + "step": 13979 + }, + { + "epoch": 0.3589671989870346, + "grad_norm": 0.83203125, + "learning_rate": 0.00017744836877389708, + "loss": 0.787, + "step": 13980 + }, + { + "epoch": 0.35899287618295644, + "grad_norm": 0.80078125, + "learning_rate": 0.0001774455446700561, + "loss": 0.8209, + "step": 13981 + }, + { + "epoch": 0.3590185533788783, + "grad_norm": 0.83984375, + "learning_rate": 0.00017744272041187315, + "loss": 0.9978, + "step": 13982 + }, + { + "epoch": 0.35904423057480006, + "grad_norm": 0.71484375, + "learning_rate": 0.0001774398959993539, + "loss": 0.8733, + "step": 13983 + }, + { + "epoch": 0.3590699077707219, + "grad_norm": 0.7734375, + "learning_rate": 0.0001774370714325039, + "loss": 0.9339, + "step": 13984 + }, + { + "epoch": 0.35909558496664373, + "grad_norm": 0.734375, + "learning_rate": 0.0001774342467113288, + "loss": 1.0327, + "step": 13985 + }, + { + "epoch": 0.3591212621625655, + "grad_norm": 0.79296875, + "learning_rate": 0.00017743142183583425, + "loss": 0.8941, + "step": 13986 + }, + { + "epoch": 0.35914693935848735, + "grad_norm": 0.7890625, + "learning_rate": 0.0001774285968060259, + "loss": 0.9673, + "step": 13987 + }, + { + "epoch": 0.3591726165544092, + "grad_norm": 0.76171875, + "learning_rate": 0.0001774257716219093, + "loss": 0.9062, + "step": 13988 + }, + { + "epoch": 0.359198293750331, + "grad_norm": 0.8046875, + "learning_rate": 0.00017742294628349017, + "loss": 0.8092, + "step": 13989 + }, + { + "epoch": 0.3592239709462528, + "grad_norm": 0.78515625, + "learning_rate": 0.0001774201207907741, + "loss": 0.9641, + "step": 13990 + }, + { + "epoch": 0.35924964814217464, + "grad_norm": 0.8203125, + "learning_rate": 0.0001774172951437667, + "loss": 0.9328, + "step": 13991 + }, + { + "epoch": 0.3592753253380965, + "grad_norm": 0.88671875, + "learning_rate": 0.00017741446934247364, + "loss": 0.9976, + "step": 13992 + }, + { + "epoch": 0.35930100253401825, + "grad_norm": 0.8046875, + "learning_rate": 0.00017741164338690052, + "loss": 1.0381, + "step": 13993 + }, + { + "epoch": 0.3593266797299401, + "grad_norm": 0.765625, + "learning_rate": 0.000177408817277053, + "loss": 0.9131, + "step": 13994 + }, + { + "epoch": 0.3593523569258619, + "grad_norm": 0.828125, + "learning_rate": 0.0001774059910129367, + "loss": 0.8908, + "step": 13995 + }, + { + "epoch": 0.3593780341217837, + "grad_norm": 0.75390625, + "learning_rate": 0.00017740316459455727, + "loss": 0.9489, + "step": 13996 + }, + { + "epoch": 0.35940371131770554, + "grad_norm": 0.7734375, + "learning_rate": 0.0001774003380219203, + "loss": 0.9511, + "step": 13997 + }, + { + "epoch": 0.3594293885136274, + "grad_norm": 0.7421875, + "learning_rate": 0.00017739751129503142, + "loss": 0.9825, + "step": 13998 + }, + { + "epoch": 0.3594550657095492, + "grad_norm": 0.79296875, + "learning_rate": 0.00017739468441389634, + "loss": 0.9537, + "step": 13999 + }, + { + "epoch": 0.359480742905471, + "grad_norm": 0.765625, + "learning_rate": 0.00017739185737852063, + "loss": 0.9151, + "step": 14000 + }, + { + "epoch": 0.359480742905471, + "eval_loss": 0.9459433555603027, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 409.3412, + "eval_samples_per_second": 24.429, + "eval_steps_per_second": 0.765, + "step": 14000 + }, + { + "epoch": 0.35950642010139283, + "grad_norm": 0.79296875, + "learning_rate": 0.00017738903018890993, + "loss": 0.9163, + "step": 14001 + }, + { + "epoch": 0.35953209729731467, + "grad_norm": 0.75390625, + "learning_rate": 0.0001773862028450699, + "loss": 1.0208, + "step": 14002 + }, + { + "epoch": 0.35955777449323645, + "grad_norm": 0.734375, + "learning_rate": 0.00017738337534700614, + "loss": 1.0114, + "step": 14003 + }, + { + "epoch": 0.3595834516891583, + "grad_norm": 0.7109375, + "learning_rate": 0.0001773805476947243, + "loss": 0.83, + "step": 14004 + }, + { + "epoch": 0.3596091288850801, + "grad_norm": 0.77734375, + "learning_rate": 0.00017737771988823005, + "loss": 0.9798, + "step": 14005 + }, + { + "epoch": 0.3596348060810019, + "grad_norm": 0.7578125, + "learning_rate": 0.00017737489192752896, + "loss": 0.908, + "step": 14006 + }, + { + "epoch": 0.35966048327692374, + "grad_norm": 0.76171875, + "learning_rate": 0.0001773720638126267, + "loss": 0.9574, + "step": 14007 + }, + { + "epoch": 0.3596861604728456, + "grad_norm": 0.77734375, + "learning_rate": 0.00017736923554352894, + "loss": 0.8844, + "step": 14008 + }, + { + "epoch": 0.3597118376687674, + "grad_norm": 0.71875, + "learning_rate": 0.00017736640712024126, + "loss": 0.9067, + "step": 14009 + }, + { + "epoch": 0.3597375148646892, + "grad_norm": 0.734375, + "learning_rate": 0.00017736357854276932, + "loss": 0.8635, + "step": 14010 + }, + { + "epoch": 0.359763192060611, + "grad_norm": 0.76171875, + "learning_rate": 0.00017736074981111873, + "loss": 0.8563, + "step": 14011 + }, + { + "epoch": 0.35978886925653286, + "grad_norm": 0.76953125, + "learning_rate": 0.0001773579209252952, + "loss": 0.9579, + "step": 14012 + }, + { + "epoch": 0.35981454645245464, + "grad_norm": 1.03125, + "learning_rate": 0.0001773550918853043, + "loss": 1.0, + "step": 14013 + }, + { + "epoch": 0.3598402236483765, + "grad_norm": 0.77734375, + "learning_rate": 0.0001773522626911517, + "loss": 0.8497, + "step": 14014 + }, + { + "epoch": 0.3598659008442983, + "grad_norm": 0.734375, + "learning_rate": 0.000177349433342843, + "loss": 0.9691, + "step": 14015 + }, + { + "epoch": 0.3598915780402201, + "grad_norm": 0.77734375, + "learning_rate": 0.00017734660384038388, + "loss": 0.8864, + "step": 14016 + }, + { + "epoch": 0.35991725523614193, + "grad_norm": 0.86328125, + "learning_rate": 0.00017734377418378, + "loss": 0.9752, + "step": 14017 + }, + { + "epoch": 0.35994293243206377, + "grad_norm": 0.78515625, + "learning_rate": 0.0001773409443730369, + "loss": 0.9233, + "step": 14018 + }, + { + "epoch": 0.3599686096279856, + "grad_norm": 0.8125, + "learning_rate": 0.00017733811440816033, + "loss": 0.8993, + "step": 14019 + }, + { + "epoch": 0.3599942868239074, + "grad_norm": 0.87109375, + "learning_rate": 0.0001773352842891559, + "loss": 0.9025, + "step": 14020 + }, + { + "epoch": 0.3600199640198292, + "grad_norm": 0.71484375, + "learning_rate": 0.0001773324540160292, + "loss": 0.9005, + "step": 14021 + }, + { + "epoch": 0.36004564121575106, + "grad_norm": 0.7734375, + "learning_rate": 0.00017732962358878588, + "loss": 0.9378, + "step": 14022 + }, + { + "epoch": 0.36007131841167284, + "grad_norm": 0.8671875, + "learning_rate": 0.00017732679300743164, + "loss": 0.9072, + "step": 14023 + }, + { + "epoch": 0.36009699560759467, + "grad_norm": 0.84375, + "learning_rate": 0.0001773239622719721, + "loss": 1.0257, + "step": 14024 + }, + { + "epoch": 0.3601226728035165, + "grad_norm": 0.78125, + "learning_rate": 0.00017732113138241288, + "loss": 1.0487, + "step": 14025 + }, + { + "epoch": 0.3601483499994383, + "grad_norm": 0.77734375, + "learning_rate": 0.0001773183003387596, + "loss": 1.0028, + "step": 14026 + }, + { + "epoch": 0.3601740271953601, + "grad_norm": 0.7734375, + "learning_rate": 0.00017731546914101796, + "loss": 0.9562, + "step": 14027 + }, + { + "epoch": 0.36019970439128196, + "grad_norm": 0.765625, + "learning_rate": 0.00017731263778919355, + "loss": 0.915, + "step": 14028 + }, + { + "epoch": 0.3602253815872038, + "grad_norm": 0.890625, + "learning_rate": 0.00017730980628329206, + "loss": 0.9521, + "step": 14029 + }, + { + "epoch": 0.3602510587831256, + "grad_norm": 0.77734375, + "learning_rate": 0.0001773069746233191, + "loss": 0.8413, + "step": 14030 + }, + { + "epoch": 0.3602767359790474, + "grad_norm": 0.76953125, + "learning_rate": 0.0001773041428092803, + "loss": 0.8845, + "step": 14031 + }, + { + "epoch": 0.36030241317496925, + "grad_norm": 0.765625, + "learning_rate": 0.00017730131084118137, + "loss": 1.1272, + "step": 14032 + }, + { + "epoch": 0.36032809037089103, + "grad_norm": 0.76171875, + "learning_rate": 0.0001772984787190279, + "loss": 0.9525, + "step": 14033 + }, + { + "epoch": 0.36035376756681287, + "grad_norm": 0.7109375, + "learning_rate": 0.00017729564644282551, + "loss": 0.9633, + "step": 14034 + }, + { + "epoch": 0.3603794447627347, + "grad_norm": 1.4375, + "learning_rate": 0.00017729281401257993, + "loss": 0.8359, + "step": 14035 + }, + { + "epoch": 0.3604051219586565, + "grad_norm": 0.859375, + "learning_rate": 0.00017728998142829672, + "loss": 0.8682, + "step": 14036 + }, + { + "epoch": 0.3604307991545783, + "grad_norm": 0.78125, + "learning_rate": 0.00017728714868998156, + "loss": 0.8134, + "step": 14037 + }, + { + "epoch": 0.36045647635050015, + "grad_norm": 0.83984375, + "learning_rate": 0.0001772843157976401, + "loss": 0.9076, + "step": 14038 + }, + { + "epoch": 0.360482153546422, + "grad_norm": 0.74609375, + "learning_rate": 0.00017728148275127797, + "loss": 0.8576, + "step": 14039 + }, + { + "epoch": 0.36050783074234377, + "grad_norm": 0.8046875, + "learning_rate": 0.00017727864955090087, + "loss": 0.8534, + "step": 14040 + }, + { + "epoch": 0.3605335079382656, + "grad_norm": 0.8203125, + "learning_rate": 0.00017727581619651433, + "loss": 1.0013, + "step": 14041 + }, + { + "epoch": 0.36055918513418744, + "grad_norm": 0.84375, + "learning_rate": 0.00017727298268812414, + "loss": 1.1051, + "step": 14042 + }, + { + "epoch": 0.3605848623301092, + "grad_norm": 0.75, + "learning_rate": 0.00017727014902573583, + "loss": 0.8294, + "step": 14043 + }, + { + "epoch": 0.36061053952603106, + "grad_norm": 0.765625, + "learning_rate": 0.00017726731520935509, + "loss": 0.9229, + "step": 14044 + }, + { + "epoch": 0.3606362167219529, + "grad_norm": 0.94921875, + "learning_rate": 0.0001772644812389876, + "loss": 0.9038, + "step": 14045 + }, + { + "epoch": 0.3606618939178747, + "grad_norm": 0.734375, + "learning_rate": 0.00017726164711463896, + "loss": 0.8916, + "step": 14046 + }, + { + "epoch": 0.3606875711137965, + "grad_norm": 0.8828125, + "learning_rate": 0.00017725881283631482, + "loss": 1.0838, + "step": 14047 + }, + { + "epoch": 0.36071324830971835, + "grad_norm": 0.7578125, + "learning_rate": 0.00017725597840402086, + "loss": 0.9547, + "step": 14048 + }, + { + "epoch": 0.36073892550564013, + "grad_norm": 0.83203125, + "learning_rate": 0.00017725314381776273, + "loss": 0.9925, + "step": 14049 + }, + { + "epoch": 0.36076460270156196, + "grad_norm": 0.77734375, + "learning_rate": 0.00017725030907754605, + "loss": 1.0079, + "step": 14050 + }, + { + "epoch": 0.3607902798974838, + "grad_norm": 0.80078125, + "learning_rate": 0.00017724747418337646, + "loss": 0.9655, + "step": 14051 + }, + { + "epoch": 0.36081595709340564, + "grad_norm": 0.78125, + "learning_rate": 0.00017724463913525965, + "loss": 1.0549, + "step": 14052 + }, + { + "epoch": 0.3608416342893274, + "grad_norm": 0.78125, + "learning_rate": 0.00017724180393320128, + "loss": 0.9465, + "step": 14053 + }, + { + "epoch": 0.36086731148524925, + "grad_norm": 0.78515625, + "learning_rate": 0.00017723896857720694, + "loss": 0.9027, + "step": 14054 + }, + { + "epoch": 0.3608929886811711, + "grad_norm": 0.82421875, + "learning_rate": 0.0001772361330672823, + "loss": 0.9974, + "step": 14055 + }, + { + "epoch": 0.36091866587709287, + "grad_norm": 0.7890625, + "learning_rate": 0.00017723329740343307, + "loss": 0.838, + "step": 14056 + }, + { + "epoch": 0.3609443430730147, + "grad_norm": 0.76953125, + "learning_rate": 0.0001772304615856648, + "loss": 0.9908, + "step": 14057 + }, + { + "epoch": 0.36097002026893654, + "grad_norm": 0.78515625, + "learning_rate": 0.00017722762561398324, + "loss": 1.0097, + "step": 14058 + }, + { + "epoch": 0.3609956974648583, + "grad_norm": 0.78125, + "learning_rate": 0.00017722478948839398, + "loss": 1.0091, + "step": 14059 + }, + { + "epoch": 0.36102137466078016, + "grad_norm": 0.83203125, + "learning_rate": 0.00017722195320890268, + "loss": 0.9956, + "step": 14060 + }, + { + "epoch": 0.361047051856702, + "grad_norm": 0.765625, + "learning_rate": 0.000177219116775515, + "loss": 0.9058, + "step": 14061 + }, + { + "epoch": 0.36107272905262383, + "grad_norm": 0.7734375, + "learning_rate": 0.00017721628018823663, + "loss": 0.8671, + "step": 14062 + }, + { + "epoch": 0.3610984062485456, + "grad_norm": 0.8203125, + "learning_rate": 0.00017721344344707318, + "loss": 0.8776, + "step": 14063 + }, + { + "epoch": 0.36112408344446745, + "grad_norm": 0.8203125, + "learning_rate": 0.0001772106065520303, + "loss": 0.9243, + "step": 14064 + }, + { + "epoch": 0.3611497606403893, + "grad_norm": 0.8515625, + "learning_rate": 0.00017720776950311365, + "loss": 0.8454, + "step": 14065 + }, + { + "epoch": 0.36117543783631106, + "grad_norm": 0.81640625, + "learning_rate": 0.0001772049323003289, + "loss": 0.8984, + "step": 14066 + }, + { + "epoch": 0.3612011150322329, + "grad_norm": 0.79296875, + "learning_rate": 0.00017720209494368166, + "loss": 0.9665, + "step": 14067 + }, + { + "epoch": 0.36122679222815474, + "grad_norm": 0.890625, + "learning_rate": 0.0001771992574331777, + "loss": 1.0369, + "step": 14068 + }, + { + "epoch": 0.3612524694240765, + "grad_norm": 0.8828125, + "learning_rate": 0.0001771964197688225, + "loss": 0.9666, + "step": 14069 + }, + { + "epoch": 0.36127814661999835, + "grad_norm": 0.7734375, + "learning_rate": 0.00017719358195062186, + "loss": 1.0288, + "step": 14070 + }, + { + "epoch": 0.3613038238159202, + "grad_norm": 0.8359375, + "learning_rate": 0.00017719074397858138, + "loss": 0.8099, + "step": 14071 + }, + { + "epoch": 0.361329501011842, + "grad_norm": 0.765625, + "learning_rate": 0.0001771879058527067, + "loss": 0.9806, + "step": 14072 + }, + { + "epoch": 0.3613551782077638, + "grad_norm": 0.78125, + "learning_rate": 0.00017718506757300354, + "loss": 0.9383, + "step": 14073 + }, + { + "epoch": 0.36138085540368564, + "grad_norm": 0.85546875, + "learning_rate": 0.00017718222913947745, + "loss": 0.9691, + "step": 14074 + }, + { + "epoch": 0.3614065325996075, + "grad_norm": 1.703125, + "learning_rate": 0.0001771793905521342, + "loss": 0.9716, + "step": 14075 + }, + { + "epoch": 0.36143220979552926, + "grad_norm": 0.84765625, + "learning_rate": 0.00017717655181097937, + "loss": 0.9762, + "step": 14076 + }, + { + "epoch": 0.3614578869914511, + "grad_norm": 0.71875, + "learning_rate": 0.00017717371291601865, + "loss": 0.9025, + "step": 14077 + }, + { + "epoch": 0.36148356418737293, + "grad_norm": 0.87109375, + "learning_rate": 0.0001771708738672577, + "loss": 0.9967, + "step": 14078 + }, + { + "epoch": 0.3615092413832947, + "grad_norm": 0.80078125, + "learning_rate": 0.00017716803466470216, + "loss": 0.9494, + "step": 14079 + }, + { + "epoch": 0.36153491857921655, + "grad_norm": 1.15625, + "learning_rate": 0.0001771651953083577, + "loss": 1.0672, + "step": 14080 + }, + { + "epoch": 0.3615605957751384, + "grad_norm": 0.83984375, + "learning_rate": 0.00017716235579822999, + "loss": 0.9505, + "step": 14081 + }, + { + "epoch": 0.3615862729710602, + "grad_norm": 0.73828125, + "learning_rate": 0.00017715951613432465, + "loss": 0.8287, + "step": 14082 + }, + { + "epoch": 0.361611950166982, + "grad_norm": 0.73046875, + "learning_rate": 0.00017715667631664736, + "loss": 0.8388, + "step": 14083 + }, + { + "epoch": 0.36163762736290384, + "grad_norm": 0.76171875, + "learning_rate": 0.0001771538363452038, + "loss": 0.9947, + "step": 14084 + }, + { + "epoch": 0.36166330455882567, + "grad_norm": 0.78125, + "learning_rate": 0.0001771509962199996, + "loss": 0.9133, + "step": 14085 + }, + { + "epoch": 0.36168898175474745, + "grad_norm": 0.78125, + "learning_rate": 0.00017714815594104045, + "loss": 1.0659, + "step": 14086 + }, + { + "epoch": 0.3617146589506693, + "grad_norm": 0.76953125, + "learning_rate": 0.000177145315508332, + "loss": 0.9455, + "step": 14087 + }, + { + "epoch": 0.3617403361465911, + "grad_norm": 0.74609375, + "learning_rate": 0.00017714247492187988, + "loss": 0.8378, + "step": 14088 + }, + { + "epoch": 0.3617660133425129, + "grad_norm": 1.015625, + "learning_rate": 0.00017713963418168981, + "loss": 0.9897, + "step": 14089 + }, + { + "epoch": 0.36179169053843474, + "grad_norm": 0.76953125, + "learning_rate": 0.0001771367932877674, + "loss": 1.0585, + "step": 14090 + }, + { + "epoch": 0.3618173677343566, + "grad_norm": 0.85546875, + "learning_rate": 0.0001771339522401183, + "loss": 0.8989, + "step": 14091 + }, + { + "epoch": 0.3618430449302784, + "grad_norm": 0.79296875, + "learning_rate": 0.00017713111103874823, + "loss": 0.9452, + "step": 14092 + }, + { + "epoch": 0.3618687221262002, + "grad_norm": 0.82421875, + "learning_rate": 0.00017712826968366283, + "loss": 1.0097, + "step": 14093 + }, + { + "epoch": 0.36189439932212203, + "grad_norm": 0.76953125, + "learning_rate": 0.00017712542817486775, + "loss": 0.838, + "step": 14094 + }, + { + "epoch": 0.36192007651804387, + "grad_norm": 0.75, + "learning_rate": 0.00017712258651236864, + "loss": 0.9729, + "step": 14095 + }, + { + "epoch": 0.36194575371396565, + "grad_norm": 0.71484375, + "learning_rate": 0.00017711974469617118, + "loss": 0.8254, + "step": 14096 + }, + { + "epoch": 0.3619714309098875, + "grad_norm": 0.8125, + "learning_rate": 0.00017711690272628107, + "loss": 0.9258, + "step": 14097 + }, + { + "epoch": 0.3619971081058093, + "grad_norm": 0.78515625, + "learning_rate": 0.00017711406060270392, + "loss": 0.9517, + "step": 14098 + }, + { + "epoch": 0.3620227853017311, + "grad_norm": 0.859375, + "learning_rate": 0.0001771112183254454, + "loss": 1.044, + "step": 14099 + }, + { + "epoch": 0.36204846249765293, + "grad_norm": 0.80859375, + "learning_rate": 0.0001771083758945112, + "loss": 0.8803, + "step": 14100 + }, + { + "epoch": 0.36207413969357477, + "grad_norm": 0.8046875, + "learning_rate": 0.00017710553330990697, + "loss": 1.0589, + "step": 14101 + }, + { + "epoch": 0.3620998168894966, + "grad_norm": 0.80078125, + "learning_rate": 0.00017710269057163837, + "loss": 1.0051, + "step": 14102 + }, + { + "epoch": 0.3621254940854184, + "grad_norm": 0.734375, + "learning_rate": 0.00017709984767971107, + "loss": 0.7988, + "step": 14103 + }, + { + "epoch": 0.3621511712813402, + "grad_norm": 0.80078125, + "learning_rate": 0.00017709700463413075, + "loss": 0.8891, + "step": 14104 + }, + { + "epoch": 0.36217684847726206, + "grad_norm": 0.890625, + "learning_rate": 0.00017709416143490308, + "loss": 0.9426, + "step": 14105 + }, + { + "epoch": 0.36220252567318384, + "grad_norm": 0.73828125, + "learning_rate": 0.00017709131808203368, + "loss": 0.8709, + "step": 14106 + }, + { + "epoch": 0.3622282028691057, + "grad_norm": 0.80859375, + "learning_rate": 0.00017708847457552826, + "loss": 0.9408, + "step": 14107 + }, + { + "epoch": 0.3622538800650275, + "grad_norm": 0.9609375, + "learning_rate": 0.00017708563091539247, + "loss": 0.9419, + "step": 14108 + }, + { + "epoch": 0.3622795572609493, + "grad_norm": 0.68359375, + "learning_rate": 0.00017708278710163198, + "loss": 0.9887, + "step": 14109 + }, + { + "epoch": 0.36230523445687113, + "grad_norm": 0.77734375, + "learning_rate": 0.00017707994313425246, + "loss": 0.9043, + "step": 14110 + }, + { + "epoch": 0.36233091165279296, + "grad_norm": 0.87890625, + "learning_rate": 0.00017707709901325957, + "loss": 1.0448, + "step": 14111 + }, + { + "epoch": 0.3623565888487148, + "grad_norm": 0.90234375, + "learning_rate": 0.000177074254738659, + "loss": 1.0319, + "step": 14112 + }, + { + "epoch": 0.3623822660446366, + "grad_norm": 0.8515625, + "learning_rate": 0.0001770714103104564, + "loss": 0.975, + "step": 14113 + }, + { + "epoch": 0.3624079432405584, + "grad_norm": 0.76953125, + "learning_rate": 0.00017706856572865745, + "loss": 0.8757, + "step": 14114 + }, + { + "epoch": 0.36243362043648025, + "grad_norm": 0.84375, + "learning_rate": 0.00017706572099326776, + "loss": 0.9983, + "step": 14115 + }, + { + "epoch": 0.36245929763240203, + "grad_norm": 0.80078125, + "learning_rate": 0.00017706287610429308, + "loss": 0.9232, + "step": 14116 + }, + { + "epoch": 0.36248497482832387, + "grad_norm": 0.7734375, + "learning_rate": 0.00017706003106173905, + "loss": 0.9351, + "step": 14117 + }, + { + "epoch": 0.3625106520242457, + "grad_norm": 0.8515625, + "learning_rate": 0.00017705718586561136, + "loss": 1.1339, + "step": 14118 + }, + { + "epoch": 0.3625363292201675, + "grad_norm": 0.796875, + "learning_rate": 0.00017705434051591565, + "loss": 0.9169, + "step": 14119 + }, + { + "epoch": 0.3625620064160893, + "grad_norm": 0.875, + "learning_rate": 0.00017705149501265756, + "loss": 0.9218, + "step": 14120 + }, + { + "epoch": 0.36258768361201116, + "grad_norm": 0.75390625, + "learning_rate": 0.00017704864935584285, + "loss": 0.91, + "step": 14121 + }, + { + "epoch": 0.362613360807933, + "grad_norm": 0.6953125, + "learning_rate": 0.00017704580354547714, + "loss": 0.9432, + "step": 14122 + }, + { + "epoch": 0.3626390380038548, + "grad_norm": 0.87109375, + "learning_rate": 0.00017704295758156607, + "loss": 0.9797, + "step": 14123 + }, + { + "epoch": 0.3626647151997766, + "grad_norm": 0.90234375, + "learning_rate": 0.00017704011146411536, + "loss": 1.0512, + "step": 14124 + }, + { + "epoch": 0.36269039239569845, + "grad_norm": 0.81640625, + "learning_rate": 0.00017703726519313067, + "loss": 0.8993, + "step": 14125 + }, + { + "epoch": 0.3627160695916202, + "grad_norm": 0.8984375, + "learning_rate": 0.00017703441876861767, + "loss": 0.9757, + "step": 14126 + }, + { + "epoch": 0.36274174678754206, + "grad_norm": 0.8359375, + "learning_rate": 0.00017703157219058202, + "loss": 1.0093, + "step": 14127 + }, + { + "epoch": 0.3627674239834639, + "grad_norm": 0.90625, + "learning_rate": 0.0001770287254590294, + "loss": 1.0305, + "step": 14128 + }, + { + "epoch": 0.3627931011793857, + "grad_norm": 0.74609375, + "learning_rate": 0.0001770258785739655, + "loss": 0.8889, + "step": 14129 + }, + { + "epoch": 0.3628187783753075, + "grad_norm": 0.8671875, + "learning_rate": 0.000177023031535396, + "loss": 0.9824, + "step": 14130 + }, + { + "epoch": 0.36284445557122935, + "grad_norm": 0.7421875, + "learning_rate": 0.00017702018434332653, + "loss": 0.8878, + "step": 14131 + }, + { + "epoch": 0.3628701327671512, + "grad_norm": 0.77734375, + "learning_rate": 0.0001770173369977628, + "loss": 1.0531, + "step": 14132 + }, + { + "epoch": 0.36289580996307297, + "grad_norm": 0.76953125, + "learning_rate": 0.00017701448949871047, + "loss": 0.938, + "step": 14133 + }, + { + "epoch": 0.3629214871589948, + "grad_norm": 0.7734375, + "learning_rate": 0.00017701164184617522, + "loss": 0.915, + "step": 14134 + }, + { + "epoch": 0.36294716435491664, + "grad_norm": 0.76171875, + "learning_rate": 0.0001770087940401627, + "loss": 0.9608, + "step": 14135 + }, + { + "epoch": 0.3629728415508384, + "grad_norm": 0.72265625, + "learning_rate": 0.00017700594608067863, + "loss": 0.8784, + "step": 14136 + }, + { + "epoch": 0.36299851874676026, + "grad_norm": 0.77734375, + "learning_rate": 0.0001770030979677287, + "loss": 0.9372, + "step": 14137 + }, + { + "epoch": 0.3630241959426821, + "grad_norm": 0.734375, + "learning_rate": 0.00017700024970131853, + "loss": 0.9102, + "step": 14138 + }, + { + "epoch": 0.3630498731386039, + "grad_norm": 0.84375, + "learning_rate": 0.00017699740128145378, + "loss": 1.0429, + "step": 14139 + }, + { + "epoch": 0.3630755503345257, + "grad_norm": 0.75390625, + "learning_rate": 0.0001769945527081402, + "loss": 1.0423, + "step": 14140 + }, + { + "epoch": 0.36310122753044755, + "grad_norm": 0.78125, + "learning_rate": 0.00017699170398138345, + "loss": 0.8615, + "step": 14141 + }, + { + "epoch": 0.3631269047263694, + "grad_norm": 0.74609375, + "learning_rate": 0.00017698885510118918, + "loss": 0.8292, + "step": 14142 + }, + { + "epoch": 0.36315258192229116, + "grad_norm": 0.85546875, + "learning_rate": 0.00017698600606756303, + "loss": 0.9612, + "step": 14143 + }, + { + "epoch": 0.363178259118213, + "grad_norm": 0.80078125, + "learning_rate": 0.00017698315688051075, + "loss": 1.079, + "step": 14144 + }, + { + "epoch": 0.36320393631413483, + "grad_norm": 0.75390625, + "learning_rate": 0.000176980307540038, + "loss": 0.8618, + "step": 14145 + }, + { + "epoch": 0.3632296135100566, + "grad_norm": 0.765625, + "learning_rate": 0.00017697745804615044, + "loss": 0.9742, + "step": 14146 + }, + { + "epoch": 0.36325529070597845, + "grad_norm": 0.78125, + "learning_rate": 0.00017697460839885377, + "loss": 0.9154, + "step": 14147 + }, + { + "epoch": 0.3632809679019003, + "grad_norm": 0.7265625, + "learning_rate": 0.00017697175859815366, + "loss": 0.9471, + "step": 14148 + }, + { + "epoch": 0.36330664509782207, + "grad_norm": 0.7421875, + "learning_rate": 0.0001769689086440558, + "loss": 0.9373, + "step": 14149 + }, + { + "epoch": 0.3633323222937439, + "grad_norm": 0.78125, + "learning_rate": 0.00017696605853656584, + "loss": 0.9676, + "step": 14150 + }, + { + "epoch": 0.36335799948966574, + "grad_norm": 0.7890625, + "learning_rate": 0.00017696320827568945, + "loss": 0.8769, + "step": 14151 + }, + { + "epoch": 0.3633836766855876, + "grad_norm": 0.79296875, + "learning_rate": 0.00017696035786143237, + "loss": 0.8858, + "step": 14152 + }, + { + "epoch": 0.36340935388150936, + "grad_norm": 0.76953125, + "learning_rate": 0.00017695750729380025, + "loss": 0.8993, + "step": 14153 + }, + { + "epoch": 0.3634350310774312, + "grad_norm": 0.921875, + "learning_rate": 0.00017695465657279876, + "loss": 0.8658, + "step": 14154 + }, + { + "epoch": 0.36346070827335303, + "grad_norm": 1.8203125, + "learning_rate": 0.00017695180569843362, + "loss": 0.9158, + "step": 14155 + }, + { + "epoch": 0.3634863854692748, + "grad_norm": 0.7890625, + "learning_rate": 0.00017694895467071042, + "loss": 0.9247, + "step": 14156 + }, + { + "epoch": 0.36351206266519664, + "grad_norm": 0.8125, + "learning_rate": 0.00017694610348963498, + "loss": 1.0802, + "step": 14157 + }, + { + "epoch": 0.3635377398611185, + "grad_norm": 0.76953125, + "learning_rate": 0.00017694325215521282, + "loss": 0.933, + "step": 14158 + }, + { + "epoch": 0.36356341705704026, + "grad_norm": 0.83984375, + "learning_rate": 0.00017694040066744976, + "loss": 1.0461, + "step": 14159 + }, + { + "epoch": 0.3635890942529621, + "grad_norm": 0.765625, + "learning_rate": 0.00017693754902635146, + "loss": 0.9095, + "step": 14160 + }, + { + "epoch": 0.36361477144888393, + "grad_norm": 0.76171875, + "learning_rate": 0.00017693469723192352, + "loss": 0.7521, + "step": 14161 + }, + { + "epoch": 0.36364044864480577, + "grad_norm": 0.78125, + "learning_rate": 0.00017693184528417172, + "loss": 0.9589, + "step": 14162 + }, + { + "epoch": 0.36366612584072755, + "grad_norm": 0.8359375, + "learning_rate": 0.00017692899318310165, + "loss": 0.9208, + "step": 14163 + }, + { + "epoch": 0.3636918030366494, + "grad_norm": 0.81640625, + "learning_rate": 0.0001769261409287191, + "loss": 1.1004, + "step": 14164 + }, + { + "epoch": 0.3637174802325712, + "grad_norm": 0.76171875, + "learning_rate": 0.00017692328852102969, + "loss": 1.0019, + "step": 14165 + }, + { + "epoch": 0.363743157428493, + "grad_norm": 0.77734375, + "learning_rate": 0.0001769204359600391, + "loss": 0.8224, + "step": 14166 + }, + { + "epoch": 0.36376883462441484, + "grad_norm": 0.77734375, + "learning_rate": 0.00017691758324575303, + "loss": 0.9254, + "step": 14167 + }, + { + "epoch": 0.3637945118203367, + "grad_norm": 0.7890625, + "learning_rate": 0.00017691473037817718, + "loss": 1.0192, + "step": 14168 + }, + { + "epoch": 0.36382018901625846, + "grad_norm": 0.76953125, + "learning_rate": 0.00017691187735731717, + "loss": 0.8685, + "step": 14169 + }, + { + "epoch": 0.3638458662121803, + "grad_norm": 0.83984375, + "learning_rate": 0.00017690902418317878, + "loss": 1.0751, + "step": 14170 + }, + { + "epoch": 0.3638715434081021, + "grad_norm": 0.8125, + "learning_rate": 0.00017690617085576763, + "loss": 0.9745, + "step": 14171 + }, + { + "epoch": 0.36389722060402396, + "grad_norm": 0.8203125, + "learning_rate": 0.00017690331737508946, + "loss": 0.8817, + "step": 14172 + }, + { + "epoch": 0.36392289779994574, + "grad_norm": 0.7265625, + "learning_rate": 0.0001769004637411499, + "loss": 0.739, + "step": 14173 + }, + { + "epoch": 0.3639485749958676, + "grad_norm": 0.78515625, + "learning_rate": 0.0001768976099539547, + "loss": 0.8422, + "step": 14174 + }, + { + "epoch": 0.3639742521917894, + "grad_norm": 0.76171875, + "learning_rate": 0.00017689475601350948, + "loss": 1.0282, + "step": 14175 + }, + { + "epoch": 0.3639999293877112, + "grad_norm": 0.78515625, + "learning_rate": 0.00017689190191981993, + "loss": 0.9587, + "step": 14176 + }, + { + "epoch": 0.36402560658363303, + "grad_norm": 0.77734375, + "learning_rate": 0.0001768890476728918, + "loss": 0.966, + "step": 14177 + }, + { + "epoch": 0.36405128377955487, + "grad_norm": 0.73046875, + "learning_rate": 0.00017688619327273073, + "loss": 0.9384, + "step": 14178 + }, + { + "epoch": 0.36407696097547665, + "grad_norm": 0.78125, + "learning_rate": 0.00017688333871934242, + "loss": 1.0667, + "step": 14179 + }, + { + "epoch": 0.3641026381713985, + "grad_norm": 0.8203125, + "learning_rate": 0.00017688048401273256, + "loss": 0.9917, + "step": 14180 + }, + { + "epoch": 0.3641283153673203, + "grad_norm": 0.8046875, + "learning_rate": 0.00017687762915290684, + "loss": 0.9115, + "step": 14181 + }, + { + "epoch": 0.36415399256324216, + "grad_norm": 0.80859375, + "learning_rate": 0.00017687477413987098, + "loss": 0.9508, + "step": 14182 + }, + { + "epoch": 0.36417966975916394, + "grad_norm": 0.84765625, + "learning_rate": 0.0001768719189736306, + "loss": 0.9091, + "step": 14183 + }, + { + "epoch": 0.3642053469550858, + "grad_norm": 0.77734375, + "learning_rate": 0.00017686906365419145, + "loss": 0.9753, + "step": 14184 + }, + { + "epoch": 0.3642310241510076, + "grad_norm": 0.86328125, + "learning_rate": 0.0001768662081815592, + "loss": 1.001, + "step": 14185 + }, + { + "epoch": 0.3642567013469294, + "grad_norm": 0.92578125, + "learning_rate": 0.00017686335255573954, + "loss": 0.8535, + "step": 14186 + }, + { + "epoch": 0.3642823785428512, + "grad_norm": 0.79296875, + "learning_rate": 0.00017686049677673814, + "loss": 1.0594, + "step": 14187 + }, + { + "epoch": 0.36430805573877306, + "grad_norm": 0.796875, + "learning_rate": 0.0001768576408445607, + "loss": 0.9283, + "step": 14188 + }, + { + "epoch": 0.36433373293469484, + "grad_norm": 0.7421875, + "learning_rate": 0.000176854784759213, + "loss": 1.0615, + "step": 14189 + }, + { + "epoch": 0.3643594101306167, + "grad_norm": 0.90625, + "learning_rate": 0.0001768519285207006, + "loss": 0.8083, + "step": 14190 + }, + { + "epoch": 0.3643850873265385, + "grad_norm": 0.79296875, + "learning_rate": 0.00017684907212902923, + "loss": 0.9834, + "step": 14191 + }, + { + "epoch": 0.36441076452246035, + "grad_norm": 0.7578125, + "learning_rate": 0.00017684621558420466, + "loss": 0.827, + "step": 14192 + }, + { + "epoch": 0.36443644171838213, + "grad_norm": 0.75390625, + "learning_rate": 0.00017684335888623246, + "loss": 0.9646, + "step": 14193 + }, + { + "epoch": 0.36446211891430397, + "grad_norm": 0.7421875, + "learning_rate": 0.00017684050203511844, + "loss": 0.9477, + "step": 14194 + }, + { + "epoch": 0.3644877961102258, + "grad_norm": 0.7890625, + "learning_rate": 0.0001768376450308682, + "loss": 0.9106, + "step": 14195 + }, + { + "epoch": 0.3645134733061476, + "grad_norm": 0.78125, + "learning_rate": 0.00017683478787348751, + "loss": 0.8683, + "step": 14196 + }, + { + "epoch": 0.3645391505020694, + "grad_norm": 1.71875, + "learning_rate": 0.00017683193056298202, + "loss": 1.0076, + "step": 14197 + }, + { + "epoch": 0.36456482769799126, + "grad_norm": 0.74609375, + "learning_rate": 0.00017682907309935742, + "loss": 0.9042, + "step": 14198 + }, + { + "epoch": 0.36459050489391304, + "grad_norm": 0.73046875, + "learning_rate": 0.00017682621548261942, + "loss": 0.8774, + "step": 14199 + }, + { + "epoch": 0.3646161820898349, + "grad_norm": 0.76171875, + "learning_rate": 0.0001768233577127737, + "loss": 0.9078, + "step": 14200 + }, + { + "epoch": 0.3646418592857567, + "grad_norm": 0.76953125, + "learning_rate": 0.000176820499789826, + "loss": 0.7945, + "step": 14201 + }, + { + "epoch": 0.36466753648167854, + "grad_norm": 0.83203125, + "learning_rate": 0.00017681764171378198, + "loss": 0.8968, + "step": 14202 + }, + { + "epoch": 0.3646932136776003, + "grad_norm": 0.69921875, + "learning_rate": 0.00017681478348464732, + "loss": 0.9603, + "step": 14203 + }, + { + "epoch": 0.36471889087352216, + "grad_norm": 0.7265625, + "learning_rate": 0.00017681192510242774, + "loss": 0.8191, + "step": 14204 + }, + { + "epoch": 0.364744568069444, + "grad_norm": 0.74609375, + "learning_rate": 0.00017680906656712892, + "loss": 0.9097, + "step": 14205 + }, + { + "epoch": 0.3647702452653658, + "grad_norm": 0.8203125, + "learning_rate": 0.00017680620787875658, + "loss": 0.9087, + "step": 14206 + }, + { + "epoch": 0.3647959224612876, + "grad_norm": 0.78125, + "learning_rate": 0.0001768033490373164, + "loss": 0.9852, + "step": 14207 + }, + { + "epoch": 0.36482159965720945, + "grad_norm": 0.796875, + "learning_rate": 0.0001768004900428141, + "loss": 1.0302, + "step": 14208 + }, + { + "epoch": 0.36484727685313123, + "grad_norm": 0.7734375, + "learning_rate": 0.00017679763089525533, + "loss": 1.1233, + "step": 14209 + }, + { + "epoch": 0.36487295404905307, + "grad_norm": 1.0234375, + "learning_rate": 0.00017679477159464587, + "loss": 0.8304, + "step": 14210 + }, + { + "epoch": 0.3648986312449749, + "grad_norm": 0.83203125, + "learning_rate": 0.0001767919121409913, + "loss": 0.9269, + "step": 14211 + }, + { + "epoch": 0.36492430844089674, + "grad_norm": 0.80859375, + "learning_rate": 0.00017678905253429742, + "loss": 0.9194, + "step": 14212 + }, + { + "epoch": 0.3649499856368185, + "grad_norm": 0.80078125, + "learning_rate": 0.0001767861927745699, + "loss": 0.9385, + "step": 14213 + }, + { + "epoch": 0.36497566283274036, + "grad_norm": 0.76953125, + "learning_rate": 0.00017678333286181444, + "loss": 1.1065, + "step": 14214 + }, + { + "epoch": 0.3650013400286622, + "grad_norm": 0.8828125, + "learning_rate": 0.00017678047279603672, + "loss": 0.9982, + "step": 14215 + }, + { + "epoch": 0.36502701722458397, + "grad_norm": 0.84765625, + "learning_rate": 0.00017677761257724246, + "loss": 1.0443, + "step": 14216 + }, + { + "epoch": 0.3650526944205058, + "grad_norm": 0.80078125, + "learning_rate": 0.00017677475220543733, + "loss": 1.073, + "step": 14217 + }, + { + "epoch": 0.36507837161642764, + "grad_norm": 0.8125, + "learning_rate": 0.0001767718916806271, + "loss": 1.051, + "step": 14218 + }, + { + "epoch": 0.3651040488123494, + "grad_norm": 0.828125, + "learning_rate": 0.00017676903100281738, + "loss": 1.0226, + "step": 14219 + }, + { + "epoch": 0.36512972600827126, + "grad_norm": 0.7890625, + "learning_rate": 0.00017676617017201393, + "loss": 0.8816, + "step": 14220 + }, + { + "epoch": 0.3651554032041931, + "grad_norm": 0.8125, + "learning_rate": 0.00017676330918822244, + "loss": 0.8773, + "step": 14221 + }, + { + "epoch": 0.36518108040011493, + "grad_norm": 0.73828125, + "learning_rate": 0.0001767604480514486, + "loss": 0.7833, + "step": 14222 + }, + { + "epoch": 0.3652067575960367, + "grad_norm": 0.828125, + "learning_rate": 0.00017675758676169814, + "loss": 0.8383, + "step": 14223 + }, + { + "epoch": 0.36523243479195855, + "grad_norm": 0.8125, + "learning_rate": 0.00017675472531897675, + "loss": 0.9977, + "step": 14224 + }, + { + "epoch": 0.3652581119878804, + "grad_norm": 1.25, + "learning_rate": 0.00017675186372329008, + "loss": 0.9722, + "step": 14225 + }, + { + "epoch": 0.36528378918380217, + "grad_norm": 0.71875, + "learning_rate": 0.00017674900197464392, + "loss": 0.877, + "step": 14226 + }, + { + "epoch": 0.365309466379724, + "grad_norm": 0.74609375, + "learning_rate": 0.0001767461400730439, + "loss": 1.0334, + "step": 14227 + }, + { + "epoch": 0.36533514357564584, + "grad_norm": 0.7890625, + "learning_rate": 0.0001767432780184958, + "loss": 0.9004, + "step": 14228 + }, + { + "epoch": 0.3653608207715676, + "grad_norm": 0.8125, + "learning_rate": 0.00017674041581100525, + "loss": 1.0623, + "step": 14229 + }, + { + "epoch": 0.36538649796748945, + "grad_norm": 0.8671875, + "learning_rate": 0.00017673755345057798, + "loss": 1.0352, + "step": 14230 + }, + { + "epoch": 0.3654121751634113, + "grad_norm": 0.765625, + "learning_rate": 0.00017673469093721972, + "loss": 1.0448, + "step": 14231 + }, + { + "epoch": 0.3654378523593331, + "grad_norm": 0.734375, + "learning_rate": 0.00017673182827093612, + "loss": 0.9568, + "step": 14232 + }, + { + "epoch": 0.3654635295552549, + "grad_norm": 0.859375, + "learning_rate": 0.00017672896545173293, + "loss": 1.0106, + "step": 14233 + }, + { + "epoch": 0.36548920675117674, + "grad_norm": 0.75390625, + "learning_rate": 0.00017672610247961585, + "loss": 0.8824, + "step": 14234 + }, + { + "epoch": 0.3655148839470986, + "grad_norm": 0.890625, + "learning_rate": 0.00017672323935459056, + "loss": 1.2539, + "step": 14235 + }, + { + "epoch": 0.36554056114302036, + "grad_norm": 0.76953125, + "learning_rate": 0.00017672037607666284, + "loss": 0.8709, + "step": 14236 + }, + { + "epoch": 0.3655662383389422, + "grad_norm": 0.796875, + "learning_rate": 0.00017671751264583828, + "loss": 0.9874, + "step": 14237 + }, + { + "epoch": 0.36559191553486403, + "grad_norm": 0.8359375, + "learning_rate": 0.00017671464906212267, + "loss": 1.0169, + "step": 14238 + }, + { + "epoch": 0.3656175927307858, + "grad_norm": 0.7734375, + "learning_rate": 0.00017671178532552168, + "loss": 0.9022, + "step": 14239 + }, + { + "epoch": 0.36564326992670765, + "grad_norm": 0.78515625, + "learning_rate": 0.00017670892143604102, + "loss": 0.9885, + "step": 14240 + }, + { + "epoch": 0.3656689471226295, + "grad_norm": 0.8046875, + "learning_rate": 0.00017670605739368645, + "loss": 0.9845, + "step": 14241 + }, + { + "epoch": 0.3656946243185513, + "grad_norm": 0.79296875, + "learning_rate": 0.00017670319319846363, + "loss": 0.8437, + "step": 14242 + }, + { + "epoch": 0.3657203015144731, + "grad_norm": 0.765625, + "learning_rate": 0.00017670032885037826, + "loss": 1.0139, + "step": 14243 + }, + { + "epoch": 0.36574597871039494, + "grad_norm": 0.8828125, + "learning_rate": 0.00017669746434943606, + "loss": 1.2101, + "step": 14244 + }, + { + "epoch": 0.3657716559063168, + "grad_norm": 0.828125, + "learning_rate": 0.00017669459969564275, + "loss": 0.8903, + "step": 14245 + }, + { + "epoch": 0.36579733310223855, + "grad_norm": 0.79296875, + "learning_rate": 0.00017669173488900404, + "loss": 0.9487, + "step": 14246 + }, + { + "epoch": 0.3658230102981604, + "grad_norm": 0.8359375, + "learning_rate": 0.0001766888699295256, + "loss": 0.9652, + "step": 14247 + }, + { + "epoch": 0.3658486874940822, + "grad_norm": 0.78125, + "learning_rate": 0.00017668600481721318, + "loss": 0.8184, + "step": 14248 + }, + { + "epoch": 0.365874364690004, + "grad_norm": 0.80859375, + "learning_rate": 0.0001766831395520725, + "loss": 0.8899, + "step": 14249 + }, + { + "epoch": 0.36590004188592584, + "grad_norm": 0.87890625, + "learning_rate": 0.00017668027413410923, + "loss": 0.8339, + "step": 14250 + }, + { + "epoch": 0.3659257190818477, + "grad_norm": 0.95703125, + "learning_rate": 0.0001766774085633291, + "loss": 1.0452, + "step": 14251 + }, + { + "epoch": 0.36595139627776946, + "grad_norm": 0.7890625, + "learning_rate": 0.00017667454283973782, + "loss": 0.9108, + "step": 14252 + }, + { + "epoch": 0.3659770734736913, + "grad_norm": 0.85546875, + "learning_rate": 0.00017667167696334111, + "loss": 0.9677, + "step": 14253 + }, + { + "epoch": 0.36600275066961313, + "grad_norm": 0.9765625, + "learning_rate": 0.0001766688109341447, + "loss": 1.0155, + "step": 14254 + }, + { + "epoch": 0.36602842786553497, + "grad_norm": 0.84765625, + "learning_rate": 0.00017666594475215422, + "loss": 1.0309, + "step": 14255 + }, + { + "epoch": 0.36605410506145675, + "grad_norm": 0.7421875, + "learning_rate": 0.00017666307841737546, + "loss": 0.9189, + "step": 14256 + }, + { + "epoch": 0.3660797822573786, + "grad_norm": 0.95703125, + "learning_rate": 0.00017666021192981412, + "loss": 1.0634, + "step": 14257 + }, + { + "epoch": 0.3661054594533004, + "grad_norm": 0.80078125, + "learning_rate": 0.00017665734528947587, + "loss": 0.8141, + "step": 14258 + }, + { + "epoch": 0.3661311366492222, + "grad_norm": 0.75390625, + "learning_rate": 0.00017665447849636652, + "loss": 0.8618, + "step": 14259 + }, + { + "epoch": 0.36615681384514404, + "grad_norm": 0.9453125, + "learning_rate": 0.00017665161155049167, + "loss": 0.9281, + "step": 14260 + }, + { + "epoch": 0.36618249104106587, + "grad_norm": 0.8046875, + "learning_rate": 0.00017664874445185707, + "loss": 0.9352, + "step": 14261 + }, + { + "epoch": 0.36620816823698765, + "grad_norm": 0.8359375, + "learning_rate": 0.00017664587720046847, + "loss": 0.8873, + "step": 14262 + }, + { + "epoch": 0.3662338454329095, + "grad_norm": 0.83203125, + "learning_rate": 0.0001766430097963316, + "loss": 0.9374, + "step": 14263 + }, + { + "epoch": 0.3662595226288313, + "grad_norm": 0.81640625, + "learning_rate": 0.00017664014223945206, + "loss": 1.0237, + "step": 14264 + }, + { + "epoch": 0.36628519982475316, + "grad_norm": 0.796875, + "learning_rate": 0.00017663727452983568, + "loss": 0.9837, + "step": 14265 + }, + { + "epoch": 0.36631087702067494, + "grad_norm": 0.921875, + "learning_rate": 0.00017663440666748813, + "loss": 0.9662, + "step": 14266 + }, + { + "epoch": 0.3663365542165968, + "grad_norm": 0.76171875, + "learning_rate": 0.00017663153865241514, + "loss": 0.8744, + "step": 14267 + }, + { + "epoch": 0.3663622314125186, + "grad_norm": 0.81640625, + "learning_rate": 0.0001766286704846224, + "loss": 0.926, + "step": 14268 + }, + { + "epoch": 0.3663879086084404, + "grad_norm": 0.80078125, + "learning_rate": 0.00017662580216411565, + "loss": 0.9943, + "step": 14269 + }, + { + "epoch": 0.36641358580436223, + "grad_norm": 0.8125, + "learning_rate": 0.00017662293369090056, + "loss": 0.9113, + "step": 14270 + }, + { + "epoch": 0.36643926300028407, + "grad_norm": 0.79296875, + "learning_rate": 0.00017662006506498292, + "loss": 0.9339, + "step": 14271 + }, + { + "epoch": 0.36646494019620585, + "grad_norm": 0.7578125, + "learning_rate": 0.00017661719628636843, + "loss": 0.8533, + "step": 14272 + }, + { + "epoch": 0.3664906173921277, + "grad_norm": 0.7890625, + "learning_rate": 0.00017661432735506275, + "loss": 0.9041, + "step": 14273 + }, + { + "epoch": 0.3665162945880495, + "grad_norm": 0.8046875, + "learning_rate": 0.00017661145827107166, + "loss": 0.9434, + "step": 14274 + }, + { + "epoch": 0.36654197178397135, + "grad_norm": 0.71484375, + "learning_rate": 0.00017660858903440085, + "loss": 1.0197, + "step": 14275 + }, + { + "epoch": 0.36656764897989313, + "grad_norm": 0.70703125, + "learning_rate": 0.00017660571964505604, + "loss": 0.9098, + "step": 14276 + }, + { + "epoch": 0.36659332617581497, + "grad_norm": 0.7890625, + "learning_rate": 0.00017660285010304293, + "loss": 0.9224, + "step": 14277 + }, + { + "epoch": 0.3666190033717368, + "grad_norm": 0.88671875, + "learning_rate": 0.0001765999804083673, + "loss": 1.2015, + "step": 14278 + }, + { + "epoch": 0.3666446805676586, + "grad_norm": 0.79296875, + "learning_rate": 0.00017659711056103482, + "loss": 0.9534, + "step": 14279 + }, + { + "epoch": 0.3666703577635804, + "grad_norm": 0.8125, + "learning_rate": 0.0001765942405610512, + "loss": 0.8997, + "step": 14280 + }, + { + "epoch": 0.36669603495950226, + "grad_norm": 0.734375, + "learning_rate": 0.00017659137040842215, + "loss": 0.7286, + "step": 14281 + }, + { + "epoch": 0.36672171215542404, + "grad_norm": 0.91015625, + "learning_rate": 0.00017658850010315345, + "loss": 0.8983, + "step": 14282 + }, + { + "epoch": 0.3667473893513459, + "grad_norm": 0.8203125, + "learning_rate": 0.0001765856296452508, + "loss": 0.9626, + "step": 14283 + }, + { + "epoch": 0.3667730665472677, + "grad_norm": 0.82421875, + "learning_rate": 0.0001765827590347199, + "loss": 1.0009, + "step": 14284 + }, + { + "epoch": 0.36679874374318955, + "grad_norm": 0.921875, + "learning_rate": 0.0001765798882715665, + "loss": 1.0365, + "step": 14285 + }, + { + "epoch": 0.36682442093911133, + "grad_norm": 0.89453125, + "learning_rate": 0.00017657701735579624, + "loss": 1.0015, + "step": 14286 + }, + { + "epoch": 0.36685009813503316, + "grad_norm": 0.78515625, + "learning_rate": 0.00017657414628741494, + "loss": 0.9999, + "step": 14287 + }, + { + "epoch": 0.366875775330955, + "grad_norm": 0.76953125, + "learning_rate": 0.00017657127506642828, + "loss": 0.8067, + "step": 14288 + }, + { + "epoch": 0.3669014525268768, + "grad_norm": 0.7578125, + "learning_rate": 0.00017656840369284198, + "loss": 0.9048, + "step": 14289 + }, + { + "epoch": 0.3669271297227986, + "grad_norm": 0.82421875, + "learning_rate": 0.00017656553216666176, + "loss": 1.1577, + "step": 14290 + }, + { + "epoch": 0.36695280691872045, + "grad_norm": 0.78515625, + "learning_rate": 0.0001765626604878934, + "loss": 1.1062, + "step": 14291 + }, + { + "epoch": 0.36697848411464223, + "grad_norm": 0.81640625, + "learning_rate": 0.0001765597886565425, + "loss": 0.9703, + "step": 14292 + }, + { + "epoch": 0.36700416131056407, + "grad_norm": 0.7421875, + "learning_rate": 0.00017655691667261493, + "loss": 0.9254, + "step": 14293 + }, + { + "epoch": 0.3670298385064859, + "grad_norm": 0.74609375, + "learning_rate": 0.00017655404453611627, + "loss": 0.9, + "step": 14294 + }, + { + "epoch": 0.36705551570240774, + "grad_norm": 0.7734375, + "learning_rate": 0.00017655117224705237, + "loss": 0.9384, + "step": 14295 + }, + { + "epoch": 0.3670811928983295, + "grad_norm": 0.7734375, + "learning_rate": 0.00017654829980542886, + "loss": 0.9995, + "step": 14296 + }, + { + "epoch": 0.36710687009425136, + "grad_norm": 0.80078125, + "learning_rate": 0.00017654542721125152, + "loss": 0.9616, + "step": 14297 + }, + { + "epoch": 0.3671325472901732, + "grad_norm": 0.75, + "learning_rate": 0.00017654255446452607, + "loss": 0.9405, + "step": 14298 + }, + { + "epoch": 0.367158224486095, + "grad_norm": 0.90234375, + "learning_rate": 0.0001765396815652582, + "loss": 0.9951, + "step": 14299 + }, + { + "epoch": 0.3671839016820168, + "grad_norm": 0.79296875, + "learning_rate": 0.00017653680851345365, + "loss": 1.0143, + "step": 14300 + }, + { + "epoch": 0.36720957887793865, + "grad_norm": 0.8125, + "learning_rate": 0.00017653393530911817, + "loss": 1.1079, + "step": 14301 + }, + { + "epoch": 0.3672352560738604, + "grad_norm": 0.8125, + "learning_rate": 0.0001765310619522575, + "loss": 0.9324, + "step": 14302 + }, + { + "epoch": 0.36726093326978226, + "grad_norm": 0.7890625, + "learning_rate": 0.00017652818844287728, + "loss": 0.7971, + "step": 14303 + }, + { + "epoch": 0.3672866104657041, + "grad_norm": 0.84375, + "learning_rate": 0.00017652531478098334, + "loss": 0.933, + "step": 14304 + }, + { + "epoch": 0.36731228766162594, + "grad_norm": 0.73828125, + "learning_rate": 0.00017652244096658134, + "loss": 0.9996, + "step": 14305 + }, + { + "epoch": 0.3673379648575477, + "grad_norm": 0.78125, + "learning_rate": 0.000176519566999677, + "loss": 1.0884, + "step": 14306 + }, + { + "epoch": 0.36736364205346955, + "grad_norm": 0.77734375, + "learning_rate": 0.00017651669288027612, + "loss": 0.907, + "step": 14307 + }, + { + "epoch": 0.3673893192493914, + "grad_norm": 0.78125, + "learning_rate": 0.0001765138186083844, + "loss": 0.839, + "step": 14308 + }, + { + "epoch": 0.36741499644531317, + "grad_norm": 0.859375, + "learning_rate": 0.0001765109441840075, + "loss": 1.0156, + "step": 14309 + }, + { + "epoch": 0.367440673641235, + "grad_norm": 0.7734375, + "learning_rate": 0.00017650806960715123, + "loss": 1.1172, + "step": 14310 + }, + { + "epoch": 0.36746635083715684, + "grad_norm": 0.77734375, + "learning_rate": 0.00017650519487782127, + "loss": 0.8306, + "step": 14311 + }, + { + "epoch": 0.3674920280330786, + "grad_norm": 0.703125, + "learning_rate": 0.00017650231999602337, + "loss": 0.9589, + "step": 14312 + }, + { + "epoch": 0.36751770522900046, + "grad_norm": 0.7890625, + "learning_rate": 0.00017649944496176325, + "loss": 0.9911, + "step": 14313 + }, + { + "epoch": 0.3675433824249223, + "grad_norm": 0.734375, + "learning_rate": 0.00017649656977504664, + "loss": 0.957, + "step": 14314 + }, + { + "epoch": 0.36756905962084413, + "grad_norm": 0.7890625, + "learning_rate": 0.0001764936944358793, + "loss": 0.982, + "step": 14315 + }, + { + "epoch": 0.3675947368167659, + "grad_norm": 0.73046875, + "learning_rate": 0.00017649081894426693, + "loss": 0.9154, + "step": 14316 + }, + { + "epoch": 0.36762041401268775, + "grad_norm": 0.85546875, + "learning_rate": 0.00017648794330021528, + "loss": 0.9663, + "step": 14317 + }, + { + "epoch": 0.3676460912086096, + "grad_norm": 0.7734375, + "learning_rate": 0.00017648506750373005, + "loss": 0.8589, + "step": 14318 + }, + { + "epoch": 0.36767176840453136, + "grad_norm": 0.8046875, + "learning_rate": 0.000176482191554817, + "loss": 1.0893, + "step": 14319 + }, + { + "epoch": 0.3676974456004532, + "grad_norm": 0.79296875, + "learning_rate": 0.00017647931545348183, + "loss": 1.0233, + "step": 14320 + }, + { + "epoch": 0.36772312279637503, + "grad_norm": 0.76953125, + "learning_rate": 0.0001764764391997303, + "loss": 0.9419, + "step": 14321 + }, + { + "epoch": 0.3677487999922968, + "grad_norm": 0.79296875, + "learning_rate": 0.00017647356279356817, + "loss": 1.1068, + "step": 14322 + }, + { + "epoch": 0.36777447718821865, + "grad_norm": 0.81640625, + "learning_rate": 0.00017647068623500108, + "loss": 0.9216, + "step": 14323 + }, + { + "epoch": 0.3678001543841405, + "grad_norm": 0.75390625, + "learning_rate": 0.00017646780952403489, + "loss": 0.9476, + "step": 14324 + }, + { + "epoch": 0.3678258315800623, + "grad_norm": 0.8046875, + "learning_rate": 0.0001764649326606752, + "loss": 1.0043, + "step": 14325 + }, + { + "epoch": 0.3678515087759841, + "grad_norm": 0.703125, + "learning_rate": 0.00017646205564492783, + "loss": 1.0803, + "step": 14326 + }, + { + "epoch": 0.36787718597190594, + "grad_norm": 0.7890625, + "learning_rate": 0.00017645917847679848, + "loss": 1.094, + "step": 14327 + }, + { + "epoch": 0.3679028631678278, + "grad_norm": 0.71875, + "learning_rate": 0.00017645630115629293, + "loss": 0.9982, + "step": 14328 + }, + { + "epoch": 0.36792854036374956, + "grad_norm": 0.8515625, + "learning_rate": 0.00017645342368341683, + "loss": 1.0601, + "step": 14329 + }, + { + "epoch": 0.3679542175596714, + "grad_norm": 0.80859375, + "learning_rate": 0.00017645054605817598, + "loss": 0.8375, + "step": 14330 + }, + { + "epoch": 0.36797989475559323, + "grad_norm": 0.77734375, + "learning_rate": 0.0001764476682805761, + "loss": 0.8891, + "step": 14331 + }, + { + "epoch": 0.368005571951515, + "grad_norm": 0.81640625, + "learning_rate": 0.00017644479035062294, + "loss": 0.8812, + "step": 14332 + }, + { + "epoch": 0.36803124914743685, + "grad_norm": 0.77734375, + "learning_rate": 0.00017644191226832222, + "loss": 0.8382, + "step": 14333 + }, + { + "epoch": 0.3680569263433587, + "grad_norm": 0.76171875, + "learning_rate": 0.00017643903403367962, + "loss": 0.8291, + "step": 14334 + }, + { + "epoch": 0.3680826035392805, + "grad_norm": 1.078125, + "learning_rate": 0.000176436155646701, + "loss": 0.981, + "step": 14335 + }, + { + "epoch": 0.3681082807352023, + "grad_norm": 0.921875, + "learning_rate": 0.00017643327710739197, + "loss": 0.9634, + "step": 14336 + }, + { + "epoch": 0.36813395793112413, + "grad_norm": 1.0234375, + "learning_rate": 0.00017643039841575836, + "loss": 0.9114, + "step": 14337 + }, + { + "epoch": 0.36815963512704597, + "grad_norm": 0.74609375, + "learning_rate": 0.00017642751957180582, + "loss": 0.9452, + "step": 14338 + }, + { + "epoch": 0.36818531232296775, + "grad_norm": 0.8125, + "learning_rate": 0.00017642464057554018, + "loss": 0.9142, + "step": 14339 + }, + { + "epoch": 0.3682109895188896, + "grad_norm": 0.87109375, + "learning_rate": 0.00017642176142696712, + "loss": 0.9927, + "step": 14340 + }, + { + "epoch": 0.3682366667148114, + "grad_norm": 0.8515625, + "learning_rate": 0.0001764188821260924, + "loss": 0.9872, + "step": 14341 + }, + { + "epoch": 0.3682623439107332, + "grad_norm": 0.78515625, + "learning_rate": 0.00017641600267292177, + "loss": 1.0015, + "step": 14342 + }, + { + "epoch": 0.36828802110665504, + "grad_norm": 0.8203125, + "learning_rate": 0.00017641312306746092, + "loss": 0.9817, + "step": 14343 + }, + { + "epoch": 0.3683136983025769, + "grad_norm": 0.73828125, + "learning_rate": 0.00017641024330971564, + "loss": 0.9723, + "step": 14344 + }, + { + "epoch": 0.3683393754984987, + "grad_norm": 0.9765625, + "learning_rate": 0.0001764073633996916, + "loss": 0.8979, + "step": 14345 + }, + { + "epoch": 0.3683650526944205, + "grad_norm": 0.77734375, + "learning_rate": 0.00017640448333739464, + "loss": 0.8522, + "step": 14346 + }, + { + "epoch": 0.3683907298903423, + "grad_norm": 0.87109375, + "learning_rate": 0.00017640160312283043, + "loss": 0.9516, + "step": 14347 + }, + { + "epoch": 0.36841640708626416, + "grad_norm": 0.77734375, + "learning_rate": 0.0001763987227560047, + "loss": 1.0355, + "step": 14348 + }, + { + "epoch": 0.36844208428218594, + "grad_norm": 0.8515625, + "learning_rate": 0.00017639584223692328, + "loss": 0.806, + "step": 14349 + }, + { + "epoch": 0.3684677614781078, + "grad_norm": 0.80078125, + "learning_rate": 0.0001763929615655918, + "loss": 1.0177, + "step": 14350 + }, + { + "epoch": 0.3684934386740296, + "grad_norm": 0.76171875, + "learning_rate": 0.00017639008074201603, + "loss": 0.8533, + "step": 14351 + }, + { + "epoch": 0.3685191158699514, + "grad_norm": 0.8203125, + "learning_rate": 0.00017638719976620175, + "loss": 0.9067, + "step": 14352 + }, + { + "epoch": 0.36854479306587323, + "grad_norm": 0.76953125, + "learning_rate": 0.0001763843186381547, + "loss": 0.9108, + "step": 14353 + }, + { + "epoch": 0.36857047026179507, + "grad_norm": 0.83203125, + "learning_rate": 0.00017638143735788057, + "loss": 0.9356, + "step": 14354 + }, + { + "epoch": 0.3685961474577169, + "grad_norm": 0.89453125, + "learning_rate": 0.00017637855592538516, + "loss": 0.9796, + "step": 14355 + }, + { + "epoch": 0.3686218246536387, + "grad_norm": 0.74609375, + "learning_rate": 0.0001763756743406742, + "loss": 1.0285, + "step": 14356 + }, + { + "epoch": 0.3686475018495605, + "grad_norm": 0.7421875, + "learning_rate": 0.00017637279260375338, + "loss": 0.9218, + "step": 14357 + }, + { + "epoch": 0.36867317904548236, + "grad_norm": 0.78125, + "learning_rate": 0.00017636991071462853, + "loss": 0.9866, + "step": 14358 + }, + { + "epoch": 0.36869885624140414, + "grad_norm": 0.80078125, + "learning_rate": 0.0001763670286733053, + "loss": 0.9773, + "step": 14359 + }, + { + "epoch": 0.368724533437326, + "grad_norm": 0.76953125, + "learning_rate": 0.00017636414647978951, + "loss": 0.9556, + "step": 14360 + }, + { + "epoch": 0.3687502106332478, + "grad_norm": 0.75390625, + "learning_rate": 0.00017636126413408688, + "loss": 0.9771, + "step": 14361 + }, + { + "epoch": 0.3687758878291696, + "grad_norm": 0.6875, + "learning_rate": 0.0001763583816362031, + "loss": 0.7646, + "step": 14362 + }, + { + "epoch": 0.3688015650250914, + "grad_norm": 0.86328125, + "learning_rate": 0.00017635549898614402, + "loss": 1.122, + "step": 14363 + }, + { + "epoch": 0.36882724222101326, + "grad_norm": 0.734375, + "learning_rate": 0.0001763526161839153, + "loss": 0.8907, + "step": 14364 + }, + { + "epoch": 0.3688529194169351, + "grad_norm": 0.7734375, + "learning_rate": 0.0001763497332295227, + "loss": 0.828, + "step": 14365 + }, + { + "epoch": 0.3688785966128569, + "grad_norm": 0.71484375, + "learning_rate": 0.000176346850122972, + "loss": 0.9623, + "step": 14366 + }, + { + "epoch": 0.3689042738087787, + "grad_norm": 0.86328125, + "learning_rate": 0.00017634396686426893, + "loss": 0.9548, + "step": 14367 + }, + { + "epoch": 0.36892995100470055, + "grad_norm": 0.7890625, + "learning_rate": 0.0001763410834534192, + "loss": 0.9793, + "step": 14368 + }, + { + "epoch": 0.36895562820062233, + "grad_norm": 0.80078125, + "learning_rate": 0.0001763381998904286, + "loss": 0.8809, + "step": 14369 + }, + { + "epoch": 0.36898130539654417, + "grad_norm": 0.7578125, + "learning_rate": 0.00017633531617530288, + "loss": 0.9879, + "step": 14370 + }, + { + "epoch": 0.369006982592466, + "grad_norm": 0.7265625, + "learning_rate": 0.00017633243230804775, + "loss": 0.953, + "step": 14371 + }, + { + "epoch": 0.3690326597883878, + "grad_norm": 0.6953125, + "learning_rate": 0.000176329548288669, + "loss": 0.8698, + "step": 14372 + }, + { + "epoch": 0.3690583369843096, + "grad_norm": 0.7265625, + "learning_rate": 0.0001763266641171723, + "loss": 0.8464, + "step": 14373 + }, + { + "epoch": 0.36908401418023146, + "grad_norm": 0.79296875, + "learning_rate": 0.00017632377979356353, + "loss": 0.9187, + "step": 14374 + }, + { + "epoch": 0.3691096913761533, + "grad_norm": 0.9140625, + "learning_rate": 0.00017632089531784833, + "loss": 0.8844, + "step": 14375 + }, + { + "epoch": 0.3691353685720751, + "grad_norm": 0.78125, + "learning_rate": 0.00017631801069003245, + "loss": 0.9646, + "step": 14376 + }, + { + "epoch": 0.3691610457679969, + "grad_norm": 0.8359375, + "learning_rate": 0.0001763151259101217, + "loss": 0.8202, + "step": 14377 + }, + { + "epoch": 0.36918672296391875, + "grad_norm": 0.71875, + "learning_rate": 0.0001763122409781218, + "loss": 0.952, + "step": 14378 + }, + { + "epoch": 0.3692124001598405, + "grad_norm": 0.7265625, + "learning_rate": 0.00017630935589403848, + "loss": 0.902, + "step": 14379 + }, + { + "epoch": 0.36923807735576236, + "grad_norm": 0.75390625, + "learning_rate": 0.00017630647065787751, + "loss": 0.8937, + "step": 14380 + }, + { + "epoch": 0.3692637545516842, + "grad_norm": 0.74609375, + "learning_rate": 0.00017630358526964462, + "loss": 1.0586, + "step": 14381 + }, + { + "epoch": 0.369289431747606, + "grad_norm": 0.7265625, + "learning_rate": 0.00017630069972934562, + "loss": 0.8163, + "step": 14382 + }, + { + "epoch": 0.3693151089435278, + "grad_norm": 0.84375, + "learning_rate": 0.00017629781403698618, + "loss": 1.1777, + "step": 14383 + }, + { + "epoch": 0.36934078613944965, + "grad_norm": 0.80078125, + "learning_rate": 0.00017629492819257214, + "loss": 1.0092, + "step": 14384 + }, + { + "epoch": 0.3693664633353715, + "grad_norm": 0.78125, + "learning_rate": 0.00017629204219610914, + "loss": 0.9737, + "step": 14385 + }, + { + "epoch": 0.36939214053129327, + "grad_norm": 0.796875, + "learning_rate": 0.000176289156047603, + "loss": 0.9664, + "step": 14386 + }, + { + "epoch": 0.3694178177272151, + "grad_norm": 0.75, + "learning_rate": 0.0001762862697470595, + "loss": 0.872, + "step": 14387 + }, + { + "epoch": 0.36944349492313694, + "grad_norm": 0.72265625, + "learning_rate": 0.00017628338329448432, + "loss": 0.8252, + "step": 14388 + }, + { + "epoch": 0.3694691721190587, + "grad_norm": 0.8203125, + "learning_rate": 0.00017628049668988327, + "loss": 1.0958, + "step": 14389 + }, + { + "epoch": 0.36949484931498056, + "grad_norm": 0.8984375, + "learning_rate": 0.00017627760993326207, + "loss": 0.9155, + "step": 14390 + }, + { + "epoch": 0.3695205265109024, + "grad_norm": 0.73046875, + "learning_rate": 0.0001762747230246265, + "loss": 0.8889, + "step": 14391 + }, + { + "epoch": 0.3695462037068242, + "grad_norm": 0.8125, + "learning_rate": 0.00017627183596398226, + "loss": 0.885, + "step": 14392 + }, + { + "epoch": 0.369571880902746, + "grad_norm": 0.796875, + "learning_rate": 0.00017626894875133519, + "loss": 0.9399, + "step": 14393 + }, + { + "epoch": 0.36959755809866784, + "grad_norm": 0.875, + "learning_rate": 0.00017626606138669096, + "loss": 0.8987, + "step": 14394 + }, + { + "epoch": 0.3696232352945897, + "grad_norm": 0.859375, + "learning_rate": 0.00017626317387005536, + "loss": 0.9851, + "step": 14395 + }, + { + "epoch": 0.36964891249051146, + "grad_norm": 0.81640625, + "learning_rate": 0.00017626028620143416, + "loss": 0.9636, + "step": 14396 + }, + { + "epoch": 0.3696745896864333, + "grad_norm": 0.796875, + "learning_rate": 0.00017625739838083307, + "loss": 0.9306, + "step": 14397 + }, + { + "epoch": 0.36970026688235513, + "grad_norm": 0.7734375, + "learning_rate": 0.00017625451040825788, + "loss": 0.9278, + "step": 14398 + }, + { + "epoch": 0.3697259440782769, + "grad_norm": 0.88671875, + "learning_rate": 0.00017625162228371438, + "loss": 0.9238, + "step": 14399 + }, + { + "epoch": 0.36975162127419875, + "grad_norm": 0.828125, + "learning_rate": 0.00017624873400720823, + "loss": 1.012, + "step": 14400 + }, + { + "epoch": 0.3697772984701206, + "grad_norm": 0.7890625, + "learning_rate": 0.0001762458455787453, + "loss": 0.8758, + "step": 14401 + }, + { + "epoch": 0.36980297566604237, + "grad_norm": 0.7578125, + "learning_rate": 0.00017624295699833123, + "loss": 0.7411, + "step": 14402 + }, + { + "epoch": 0.3698286528619642, + "grad_norm": 0.7265625, + "learning_rate": 0.00017624006826597185, + "loss": 0.8871, + "step": 14403 + }, + { + "epoch": 0.36985433005788604, + "grad_norm": 0.734375, + "learning_rate": 0.00017623717938167291, + "loss": 0.8678, + "step": 14404 + }, + { + "epoch": 0.3698800072538079, + "grad_norm": 0.75390625, + "learning_rate": 0.00017623429034544015, + "loss": 0.9975, + "step": 14405 + }, + { + "epoch": 0.36990568444972965, + "grad_norm": 0.78125, + "learning_rate": 0.00017623140115727935, + "loss": 1.0283, + "step": 14406 + }, + { + "epoch": 0.3699313616456515, + "grad_norm": 0.765625, + "learning_rate": 0.00017622851181719624, + "loss": 1.0192, + "step": 14407 + }, + { + "epoch": 0.3699570388415733, + "grad_norm": 0.90625, + "learning_rate": 0.0001762256223251966, + "loss": 0.9437, + "step": 14408 + }, + { + "epoch": 0.3699827160374951, + "grad_norm": 0.74609375, + "learning_rate": 0.00017622273268128616, + "loss": 0.9878, + "step": 14409 + }, + { + "epoch": 0.37000839323341694, + "grad_norm": 0.8359375, + "learning_rate": 0.0001762198428854707, + "loss": 0.97, + "step": 14410 + }, + { + "epoch": 0.3700340704293388, + "grad_norm": 0.75390625, + "learning_rate": 0.00017621695293775603, + "loss": 1.0532, + "step": 14411 + }, + { + "epoch": 0.37005974762526056, + "grad_norm": 0.89453125, + "learning_rate": 0.0001762140628381478, + "loss": 0.9225, + "step": 14412 + }, + { + "epoch": 0.3700854248211824, + "grad_norm": 0.70703125, + "learning_rate": 0.00017621117258665185, + "loss": 0.8537, + "step": 14413 + }, + { + "epoch": 0.37011110201710423, + "grad_norm": 0.78125, + "learning_rate": 0.00017620828218327392, + "loss": 0.972, + "step": 14414 + }, + { + "epoch": 0.37013677921302607, + "grad_norm": 0.7578125, + "learning_rate": 0.00017620539162801975, + "loss": 0.8915, + "step": 14415 + }, + { + "epoch": 0.37016245640894785, + "grad_norm": 0.78515625, + "learning_rate": 0.00017620250092089515, + "loss": 1.0576, + "step": 14416 + }, + { + "epoch": 0.3701881336048697, + "grad_norm": 0.796875, + "learning_rate": 0.0001761996100619058, + "loss": 0.9085, + "step": 14417 + }, + { + "epoch": 0.3702138108007915, + "grad_norm": 1.0625, + "learning_rate": 0.00017619671905105754, + "loss": 1.0028, + "step": 14418 + }, + { + "epoch": 0.3702394879967133, + "grad_norm": 0.79296875, + "learning_rate": 0.0001761938278883561, + "loss": 0.9303, + "step": 14419 + }, + { + "epoch": 0.37026516519263514, + "grad_norm": 0.80859375, + "learning_rate": 0.00017619093657380726, + "loss": 0.9295, + "step": 14420 + }, + { + "epoch": 0.370290842388557, + "grad_norm": 0.80078125, + "learning_rate": 0.00017618804510741674, + "loss": 0.9699, + "step": 14421 + }, + { + "epoch": 0.37031651958447875, + "grad_norm": 0.78515625, + "learning_rate": 0.00017618515348919033, + "loss": 0.9538, + "step": 14422 + }, + { + "epoch": 0.3703421967804006, + "grad_norm": 0.78125, + "learning_rate": 0.00017618226171913382, + "loss": 1.0234, + "step": 14423 + }, + { + "epoch": 0.3703678739763224, + "grad_norm": 0.73828125, + "learning_rate": 0.0001761793697972529, + "loss": 0.9571, + "step": 14424 + }, + { + "epoch": 0.37039355117224426, + "grad_norm": 0.78515625, + "learning_rate": 0.0001761764777235534, + "loss": 0.8988, + "step": 14425 + }, + { + "epoch": 0.37041922836816604, + "grad_norm": 0.80078125, + "learning_rate": 0.00017617358549804106, + "loss": 1.0009, + "step": 14426 + }, + { + "epoch": 0.3704449055640879, + "grad_norm": 0.6953125, + "learning_rate": 0.00017617069312072164, + "loss": 0.872, + "step": 14427 + }, + { + "epoch": 0.3704705827600097, + "grad_norm": 0.77734375, + "learning_rate": 0.00017616780059160093, + "loss": 0.8049, + "step": 14428 + }, + { + "epoch": 0.3704962599559315, + "grad_norm": 0.69921875, + "learning_rate": 0.00017616490791068465, + "loss": 0.8137, + "step": 14429 + }, + { + "epoch": 0.37052193715185333, + "grad_norm": 0.75, + "learning_rate": 0.0001761620150779786, + "loss": 0.7835, + "step": 14430 + }, + { + "epoch": 0.37054761434777517, + "grad_norm": 0.7265625, + "learning_rate": 0.00017615912209348854, + "loss": 0.9464, + "step": 14431 + }, + { + "epoch": 0.37057329154369695, + "grad_norm": 0.796875, + "learning_rate": 0.0001761562289572202, + "loss": 1.0645, + "step": 14432 + }, + { + "epoch": 0.3705989687396188, + "grad_norm": 0.85546875, + "learning_rate": 0.0001761533356691794, + "loss": 1.0413, + "step": 14433 + }, + { + "epoch": 0.3706246459355406, + "grad_norm": 1.0390625, + "learning_rate": 0.00017615044222937188, + "loss": 0.9774, + "step": 14434 + }, + { + "epoch": 0.37065032313146246, + "grad_norm": 0.8515625, + "learning_rate": 0.00017614754863780338, + "loss": 1.0408, + "step": 14435 + }, + { + "epoch": 0.37067600032738424, + "grad_norm": 0.7890625, + "learning_rate": 0.00017614465489447975, + "loss": 0.9474, + "step": 14436 + }, + { + "epoch": 0.3707016775233061, + "grad_norm": 0.74609375, + "learning_rate": 0.00017614176099940666, + "loss": 0.8691, + "step": 14437 + }, + { + "epoch": 0.3707273547192279, + "grad_norm": 0.73828125, + "learning_rate": 0.00017613886695258993, + "loss": 0.9597, + "step": 14438 + }, + { + "epoch": 0.3707530319151497, + "grad_norm": 0.69140625, + "learning_rate": 0.00017613597275403532, + "loss": 1.0309, + "step": 14439 + }, + { + "epoch": 0.3707787091110715, + "grad_norm": 0.796875, + "learning_rate": 0.00017613307840374858, + "loss": 1.072, + "step": 14440 + }, + { + "epoch": 0.37080438630699336, + "grad_norm": 0.85546875, + "learning_rate": 0.00017613018390173548, + "loss": 1.0642, + "step": 14441 + }, + { + "epoch": 0.37083006350291514, + "grad_norm": 0.7578125, + "learning_rate": 0.00017612728924800181, + "loss": 0.9229, + "step": 14442 + }, + { + "epoch": 0.370855740698837, + "grad_norm": 0.96875, + "learning_rate": 0.00017612439444255332, + "loss": 0.8433, + "step": 14443 + }, + { + "epoch": 0.3708814178947588, + "grad_norm": 0.7578125, + "learning_rate": 0.00017612149948539584, + "loss": 0.8005, + "step": 14444 + }, + { + "epoch": 0.37090709509068065, + "grad_norm": 0.7890625, + "learning_rate": 0.00017611860437653505, + "loss": 1.1017, + "step": 14445 + }, + { + "epoch": 0.37093277228660243, + "grad_norm": 0.796875, + "learning_rate": 0.00017611570911597676, + "loss": 1.0396, + "step": 14446 + }, + { + "epoch": 0.37095844948252427, + "grad_norm": 0.828125, + "learning_rate": 0.0001761128137037267, + "loss": 0.9916, + "step": 14447 + }, + { + "epoch": 0.3709841266784461, + "grad_norm": 0.91015625, + "learning_rate": 0.00017610991813979073, + "loss": 0.9611, + "step": 14448 + }, + { + "epoch": 0.3710098038743679, + "grad_norm": 1.109375, + "learning_rate": 0.00017610702242417455, + "loss": 0.9791, + "step": 14449 + }, + { + "epoch": 0.3710354810702897, + "grad_norm": 0.6640625, + "learning_rate": 0.00017610412655688396, + "loss": 0.7913, + "step": 14450 + }, + { + "epoch": 0.37106115826621155, + "grad_norm": 0.75, + "learning_rate": 0.0001761012305379247, + "loss": 0.9505, + "step": 14451 + }, + { + "epoch": 0.37108683546213334, + "grad_norm": 0.84375, + "learning_rate": 0.00017609833436730258, + "loss": 0.9063, + "step": 14452 + }, + { + "epoch": 0.37111251265805517, + "grad_norm": 0.83203125, + "learning_rate": 0.00017609543804502335, + "loss": 0.9291, + "step": 14453 + }, + { + "epoch": 0.371138189853977, + "grad_norm": 0.7578125, + "learning_rate": 0.00017609254157109277, + "loss": 0.7543, + "step": 14454 + }, + { + "epoch": 0.37116386704989884, + "grad_norm": 0.8359375, + "learning_rate": 0.00017608964494551663, + "loss": 0.9545, + "step": 14455 + }, + { + "epoch": 0.3711895442458206, + "grad_norm": 0.83203125, + "learning_rate": 0.00017608674816830074, + "loss": 0.9625, + "step": 14456 + }, + { + "epoch": 0.37121522144174246, + "grad_norm": 1.75, + "learning_rate": 0.00017608385123945078, + "loss": 1.0238, + "step": 14457 + }, + { + "epoch": 0.3712408986376643, + "grad_norm": 0.84765625, + "learning_rate": 0.0001760809541589726, + "loss": 1.0792, + "step": 14458 + }, + { + "epoch": 0.3712665758335861, + "grad_norm": 0.71875, + "learning_rate": 0.000176078056926872, + "loss": 0.934, + "step": 14459 + }, + { + "epoch": 0.3712922530295079, + "grad_norm": 0.75390625, + "learning_rate": 0.00017607515954315463, + "loss": 0.9848, + "step": 14460 + }, + { + "epoch": 0.37131793022542975, + "grad_norm": 0.7734375, + "learning_rate": 0.00017607226200782637, + "loss": 0.9805, + "step": 14461 + }, + { + "epoch": 0.37134360742135153, + "grad_norm": 0.78125, + "learning_rate": 0.00017606936432089295, + "loss": 1.0014, + "step": 14462 + }, + { + "epoch": 0.37136928461727337, + "grad_norm": 0.76171875, + "learning_rate": 0.00017606646648236018, + "loss": 0.9328, + "step": 14463 + }, + { + "epoch": 0.3713949618131952, + "grad_norm": 0.7890625, + "learning_rate": 0.00017606356849223377, + "loss": 0.9039, + "step": 14464 + }, + { + "epoch": 0.371420639009117, + "grad_norm": 0.80078125, + "learning_rate": 0.00017606067035051958, + "loss": 0.9846, + "step": 14465 + }, + { + "epoch": 0.3714463162050388, + "grad_norm": 0.78515625, + "learning_rate": 0.00017605777205722335, + "loss": 0.8346, + "step": 14466 + }, + { + "epoch": 0.37147199340096065, + "grad_norm": 0.734375, + "learning_rate": 0.00017605487361235081, + "loss": 0.9544, + "step": 14467 + }, + { + "epoch": 0.3714976705968825, + "grad_norm": 0.8203125, + "learning_rate": 0.0001760519750159078, + "loss": 0.9836, + "step": 14468 + }, + { + "epoch": 0.37152334779280427, + "grad_norm": 0.76171875, + "learning_rate": 0.0001760490762679001, + "loss": 1.0329, + "step": 14469 + }, + { + "epoch": 0.3715490249887261, + "grad_norm": 0.86328125, + "learning_rate": 0.00017604617736833341, + "loss": 0.8519, + "step": 14470 + }, + { + "epoch": 0.37157470218464794, + "grad_norm": 0.88671875, + "learning_rate": 0.0001760432783172136, + "loss": 1.0054, + "step": 14471 + }, + { + "epoch": 0.3716003793805697, + "grad_norm": 0.8046875, + "learning_rate": 0.0001760403791145464, + "loss": 0.8262, + "step": 14472 + }, + { + "epoch": 0.37162605657649156, + "grad_norm": 0.89453125, + "learning_rate": 0.00017603747976033759, + "loss": 1.051, + "step": 14473 + }, + { + "epoch": 0.3716517337724134, + "grad_norm": 0.7578125, + "learning_rate": 0.00017603458025459295, + "loss": 1.0609, + "step": 14474 + }, + { + "epoch": 0.3716774109683352, + "grad_norm": 0.75390625, + "learning_rate": 0.00017603168059731822, + "loss": 0.9344, + "step": 14475 + }, + { + "epoch": 0.371703088164257, + "grad_norm": 0.7890625, + "learning_rate": 0.00017602878078851926, + "loss": 0.9633, + "step": 14476 + }, + { + "epoch": 0.37172876536017885, + "grad_norm": 0.765625, + "learning_rate": 0.0001760258808282018, + "loss": 0.8379, + "step": 14477 + }, + { + "epoch": 0.3717544425561007, + "grad_norm": 0.78125, + "learning_rate": 0.00017602298071637163, + "loss": 0.9757, + "step": 14478 + }, + { + "epoch": 0.37178011975202246, + "grad_norm": 0.78125, + "learning_rate": 0.00017602008045303452, + "loss": 1.0356, + "step": 14479 + }, + { + "epoch": 0.3718057969479443, + "grad_norm": 0.828125, + "learning_rate": 0.00017601718003819628, + "loss": 0.9939, + "step": 14480 + }, + { + "epoch": 0.37183147414386614, + "grad_norm": 0.8125, + "learning_rate": 0.00017601427947186262, + "loss": 0.9231, + "step": 14481 + }, + { + "epoch": 0.3718571513397879, + "grad_norm": 0.7578125, + "learning_rate": 0.0001760113787540394, + "loss": 0.8447, + "step": 14482 + }, + { + "epoch": 0.37188282853570975, + "grad_norm": 0.7265625, + "learning_rate": 0.00017600847788473237, + "loss": 0.8695, + "step": 14483 + }, + { + "epoch": 0.3719085057316316, + "grad_norm": 0.7421875, + "learning_rate": 0.0001760055768639473, + "loss": 0.8657, + "step": 14484 + }, + { + "epoch": 0.37193418292755337, + "grad_norm": 0.74609375, + "learning_rate": 0.00017600267569169, + "loss": 0.9491, + "step": 14485 + }, + { + "epoch": 0.3719598601234752, + "grad_norm": 0.77734375, + "learning_rate": 0.0001759997743679662, + "loss": 0.9152, + "step": 14486 + }, + { + "epoch": 0.37198553731939704, + "grad_norm": 0.8671875, + "learning_rate": 0.0001759968728927817, + "loss": 0.9179, + "step": 14487 + }, + { + "epoch": 0.3720112145153189, + "grad_norm": 0.859375, + "learning_rate": 0.00017599397126614235, + "loss": 0.9808, + "step": 14488 + }, + { + "epoch": 0.37203689171124066, + "grad_norm": 0.7734375, + "learning_rate": 0.00017599106948805384, + "loss": 0.9309, + "step": 14489 + }, + { + "epoch": 0.3720625689071625, + "grad_norm": 0.796875, + "learning_rate": 0.000175988167558522, + "loss": 0.8506, + "step": 14490 + }, + { + "epoch": 0.37208824610308433, + "grad_norm": 0.859375, + "learning_rate": 0.00017598526547755262, + "loss": 1.0778, + "step": 14491 + }, + { + "epoch": 0.3721139232990061, + "grad_norm": 0.80859375, + "learning_rate": 0.00017598236324515147, + "loss": 0.8901, + "step": 14492 + }, + { + "epoch": 0.37213960049492795, + "grad_norm": 0.76953125, + "learning_rate": 0.00017597946086132433, + "loss": 1.0159, + "step": 14493 + }, + { + "epoch": 0.3721652776908498, + "grad_norm": 0.8515625, + "learning_rate": 0.000175976558326077, + "loss": 1.0423, + "step": 14494 + }, + { + "epoch": 0.37219095488677156, + "grad_norm": 0.8828125, + "learning_rate": 0.0001759736556394152, + "loss": 0.9136, + "step": 14495 + }, + { + "epoch": 0.3722166320826934, + "grad_norm": 0.7734375, + "learning_rate": 0.00017597075280134482, + "loss": 1.105, + "step": 14496 + }, + { + "epoch": 0.37224230927861524, + "grad_norm": 0.79296875, + "learning_rate": 0.00017596784981187158, + "loss": 0.9098, + "step": 14497 + }, + { + "epoch": 0.37226798647453707, + "grad_norm": 0.7734375, + "learning_rate": 0.00017596494667100127, + "loss": 0.8854, + "step": 14498 + }, + { + "epoch": 0.37229366367045885, + "grad_norm": 0.7578125, + "learning_rate": 0.0001759620433787397, + "loss": 0.8572, + "step": 14499 + }, + { + "epoch": 0.3723193408663807, + "grad_norm": 1.0625, + "learning_rate": 0.00017595913993509262, + "loss": 0.8722, + "step": 14500 + }, + { + "epoch": 0.3723450180623025, + "grad_norm": 1.0390625, + "learning_rate": 0.00017595623634006585, + "loss": 0.7428, + "step": 14501 + }, + { + "epoch": 0.3723706952582243, + "grad_norm": 0.87109375, + "learning_rate": 0.00017595333259366513, + "loss": 1.0075, + "step": 14502 + }, + { + "epoch": 0.37239637245414614, + "grad_norm": 0.82421875, + "learning_rate": 0.0001759504286958963, + "loss": 0.803, + "step": 14503 + }, + { + "epoch": 0.372422049650068, + "grad_norm": 0.7109375, + "learning_rate": 0.00017594752464676513, + "loss": 0.8805, + "step": 14504 + }, + { + "epoch": 0.37244772684598976, + "grad_norm": 0.81640625, + "learning_rate": 0.00017594462044627743, + "loss": 0.9565, + "step": 14505 + }, + { + "epoch": 0.3724734040419116, + "grad_norm": 0.87890625, + "learning_rate": 0.00017594171609443892, + "loss": 0.9223, + "step": 14506 + }, + { + "epoch": 0.37249908123783343, + "grad_norm": 0.80859375, + "learning_rate": 0.00017593881159125543, + "loss": 0.937, + "step": 14507 + }, + { + "epoch": 0.37252475843375527, + "grad_norm": 0.859375, + "learning_rate": 0.00017593590693673277, + "loss": 1.0111, + "step": 14508 + }, + { + "epoch": 0.37255043562967705, + "grad_norm": 0.8828125, + "learning_rate": 0.00017593300213087672, + "loss": 1.0575, + "step": 14509 + }, + { + "epoch": 0.3725761128255989, + "grad_norm": 0.84375, + "learning_rate": 0.00017593009717369303, + "loss": 0.953, + "step": 14510 + }, + { + "epoch": 0.3726017900215207, + "grad_norm": 0.921875, + "learning_rate": 0.0001759271920651875, + "loss": 0.8317, + "step": 14511 + }, + { + "epoch": 0.3726274672174425, + "grad_norm": 0.81640625, + "learning_rate": 0.00017592428680536596, + "loss": 0.8884, + "step": 14512 + }, + { + "epoch": 0.37265314441336433, + "grad_norm": 0.79296875, + "learning_rate": 0.00017592138139423417, + "loss": 0.9824, + "step": 14513 + }, + { + "epoch": 0.37267882160928617, + "grad_norm": 0.80078125, + "learning_rate": 0.00017591847583179793, + "loss": 0.8647, + "step": 14514 + }, + { + "epoch": 0.37270449880520795, + "grad_norm": 0.88671875, + "learning_rate": 0.00017591557011806301, + "loss": 1.0249, + "step": 14515 + }, + { + "epoch": 0.3727301760011298, + "grad_norm": 0.7578125, + "learning_rate": 0.00017591266425303522, + "loss": 0.828, + "step": 14516 + }, + { + "epoch": 0.3727558531970516, + "grad_norm": 0.8046875, + "learning_rate": 0.00017590975823672034, + "loss": 0.9336, + "step": 14517 + }, + { + "epoch": 0.37278153039297346, + "grad_norm": 0.734375, + "learning_rate": 0.0001759068520691242, + "loss": 0.8566, + "step": 14518 + }, + { + "epoch": 0.37280720758889524, + "grad_norm": 0.765625, + "learning_rate": 0.00017590394575025254, + "loss": 0.9503, + "step": 14519 + }, + { + "epoch": 0.3728328847848171, + "grad_norm": 0.8203125, + "learning_rate": 0.00017590103928011115, + "loss": 0.9634, + "step": 14520 + }, + { + "epoch": 0.3728585619807389, + "grad_norm": 0.73828125, + "learning_rate": 0.0001758981326587059, + "loss": 0.9344, + "step": 14521 + }, + { + "epoch": 0.3728842391766607, + "grad_norm": 0.8359375, + "learning_rate": 0.0001758952258860425, + "loss": 0.9503, + "step": 14522 + }, + { + "epoch": 0.37290991637258253, + "grad_norm": 0.80859375, + "learning_rate": 0.00017589231896212673, + "loss": 0.8521, + "step": 14523 + }, + { + "epoch": 0.37293559356850436, + "grad_norm": 0.765625, + "learning_rate": 0.00017588941188696449, + "loss": 0.9412, + "step": 14524 + }, + { + "epoch": 0.37296127076442614, + "grad_norm": 0.80859375, + "learning_rate": 0.00017588650466056146, + "loss": 0.9163, + "step": 14525 + }, + { + "epoch": 0.372986947960348, + "grad_norm": 0.80859375, + "learning_rate": 0.0001758835972829235, + "loss": 0.9743, + "step": 14526 + }, + { + "epoch": 0.3730126251562698, + "grad_norm": 0.75, + "learning_rate": 0.0001758806897540564, + "loss": 0.9038, + "step": 14527 + }, + { + "epoch": 0.37303830235219165, + "grad_norm": 0.7734375, + "learning_rate": 0.00017587778207396593, + "loss": 0.8272, + "step": 14528 + }, + { + "epoch": 0.37306397954811343, + "grad_norm": 0.76953125, + "learning_rate": 0.00017587487424265788, + "loss": 0.8794, + "step": 14529 + }, + { + "epoch": 0.37308965674403527, + "grad_norm": 0.796875, + "learning_rate": 0.00017587196626013806, + "loss": 0.8864, + "step": 14530 + }, + { + "epoch": 0.3731153339399571, + "grad_norm": 0.83203125, + "learning_rate": 0.00017586905812641229, + "loss": 1.1193, + "step": 14531 + }, + { + "epoch": 0.3731410111358789, + "grad_norm": 0.73828125, + "learning_rate": 0.0001758661498414863, + "loss": 0.9515, + "step": 14532 + }, + { + "epoch": 0.3731666883318007, + "grad_norm": 0.76171875, + "learning_rate": 0.00017586324140536595, + "loss": 0.8446, + "step": 14533 + }, + { + "epoch": 0.37319236552772256, + "grad_norm": 0.74609375, + "learning_rate": 0.00017586033281805704, + "loss": 0.9095, + "step": 14534 + }, + { + "epoch": 0.37321804272364434, + "grad_norm": 0.8515625, + "learning_rate": 0.00017585742407956527, + "loss": 0.9287, + "step": 14535 + }, + { + "epoch": 0.3732437199195662, + "grad_norm": 0.72265625, + "learning_rate": 0.00017585451518989658, + "loss": 0.792, + "step": 14536 + }, + { + "epoch": 0.373269397115488, + "grad_norm": 0.8125, + "learning_rate": 0.00017585160614905663, + "loss": 1.018, + "step": 14537 + }, + { + "epoch": 0.37329507431140985, + "grad_norm": 0.796875, + "learning_rate": 0.00017584869695705133, + "loss": 1.0025, + "step": 14538 + }, + { + "epoch": 0.3733207515073316, + "grad_norm": 0.75390625, + "learning_rate": 0.0001758457876138864, + "loss": 0.8074, + "step": 14539 + }, + { + "epoch": 0.37334642870325346, + "grad_norm": 0.80078125, + "learning_rate": 0.00017584287811956765, + "loss": 0.9393, + "step": 14540 + }, + { + "epoch": 0.3733721058991753, + "grad_norm": 0.7578125, + "learning_rate": 0.00017583996847410095, + "loss": 0.9747, + "step": 14541 + }, + { + "epoch": 0.3733977830950971, + "grad_norm": 0.84375, + "learning_rate": 0.00017583705867749201, + "loss": 1.0153, + "step": 14542 + }, + { + "epoch": 0.3734234602910189, + "grad_norm": 0.76953125, + "learning_rate": 0.00017583414872974666, + "loss": 0.9261, + "step": 14543 + }, + { + "epoch": 0.37344913748694075, + "grad_norm": 0.7890625, + "learning_rate": 0.0001758312386308707, + "loss": 0.8638, + "step": 14544 + }, + { + "epoch": 0.37347481468286253, + "grad_norm": 0.8125, + "learning_rate": 0.00017582832838086994, + "loss": 0.9846, + "step": 14545 + }, + { + "epoch": 0.37350049187878437, + "grad_norm": 0.85546875, + "learning_rate": 0.00017582541797975018, + "loss": 1.0032, + "step": 14546 + }, + { + "epoch": 0.3735261690747062, + "grad_norm": 0.84765625, + "learning_rate": 0.00017582250742751717, + "loss": 1.0185, + "step": 14547 + }, + { + "epoch": 0.37355184627062804, + "grad_norm": 0.7890625, + "learning_rate": 0.00017581959672417682, + "loss": 0.9404, + "step": 14548 + }, + { + "epoch": 0.3735775234665498, + "grad_norm": 0.81640625, + "learning_rate": 0.0001758166858697348, + "loss": 0.9301, + "step": 14549 + }, + { + "epoch": 0.37360320066247166, + "grad_norm": 0.7890625, + "learning_rate": 0.000175813774864197, + "loss": 1.0194, + "step": 14550 + }, + { + "epoch": 0.3736288778583935, + "grad_norm": 0.76953125, + "learning_rate": 0.00017581086370756918, + "loss": 0.9356, + "step": 14551 + }, + { + "epoch": 0.3736545550543153, + "grad_norm": 0.765625, + "learning_rate": 0.0001758079523998572, + "loss": 0.9195, + "step": 14552 + }, + { + "epoch": 0.3736802322502371, + "grad_norm": 0.7578125, + "learning_rate": 0.00017580504094106677, + "loss": 0.8326, + "step": 14553 + }, + { + "epoch": 0.37370590944615895, + "grad_norm": 0.78515625, + "learning_rate": 0.00017580212933120378, + "loss": 0.9969, + "step": 14554 + }, + { + "epoch": 0.3737315866420807, + "grad_norm": 0.73828125, + "learning_rate": 0.00017579921757027395, + "loss": 0.8376, + "step": 14555 + }, + { + "epoch": 0.37375726383800256, + "grad_norm": 0.75390625, + "learning_rate": 0.00017579630565828314, + "loss": 0.92, + "step": 14556 + }, + { + "epoch": 0.3737829410339244, + "grad_norm": 0.79296875, + "learning_rate": 0.00017579339359523717, + "loss": 0.8414, + "step": 14557 + }, + { + "epoch": 0.37380861822984623, + "grad_norm": 0.76171875, + "learning_rate": 0.00017579048138114179, + "loss": 0.9976, + "step": 14558 + }, + { + "epoch": 0.373834295425768, + "grad_norm": 0.7265625, + "learning_rate": 0.00017578756901600283, + "loss": 0.926, + "step": 14559 + }, + { + "epoch": 0.37385997262168985, + "grad_norm": 0.83203125, + "learning_rate": 0.00017578465649982608, + "loss": 0.9352, + "step": 14560 + }, + { + "epoch": 0.3738856498176117, + "grad_norm": 0.7890625, + "learning_rate": 0.0001757817438326174, + "loss": 0.8358, + "step": 14561 + }, + { + "epoch": 0.37391132701353347, + "grad_norm": 0.74609375, + "learning_rate": 0.0001757788310143825, + "loss": 0.8325, + "step": 14562 + }, + { + "epoch": 0.3739370042094553, + "grad_norm": 0.8515625, + "learning_rate": 0.00017577591804512723, + "loss": 0.9295, + "step": 14563 + }, + { + "epoch": 0.37396268140537714, + "grad_norm": 0.76953125, + "learning_rate": 0.00017577300492485742, + "loss": 0.9249, + "step": 14564 + }, + { + "epoch": 0.3739883586012989, + "grad_norm": 0.96875, + "learning_rate": 0.00017577009165357886, + "loss": 1.0598, + "step": 14565 + }, + { + "epoch": 0.37401403579722076, + "grad_norm": 0.76171875, + "learning_rate": 0.00017576717823129733, + "loss": 0.9506, + "step": 14566 + }, + { + "epoch": 0.3740397129931426, + "grad_norm": 0.72265625, + "learning_rate": 0.00017576426465801867, + "loss": 0.8049, + "step": 14567 + }, + { + "epoch": 0.37406539018906443, + "grad_norm": 0.80859375, + "learning_rate": 0.0001757613509337487, + "loss": 0.9931, + "step": 14568 + }, + { + "epoch": 0.3740910673849862, + "grad_norm": 0.80078125, + "learning_rate": 0.00017575843705849316, + "loss": 0.9946, + "step": 14569 + }, + { + "epoch": 0.37411674458090804, + "grad_norm": 0.91015625, + "learning_rate": 0.00017575552303225793, + "loss": 0.9152, + "step": 14570 + }, + { + "epoch": 0.3741424217768299, + "grad_norm": 0.765625, + "learning_rate": 0.00017575260885504878, + "loss": 0.9342, + "step": 14571 + }, + { + "epoch": 0.37416809897275166, + "grad_norm": 0.7734375, + "learning_rate": 0.0001757496945268715, + "loss": 0.8116, + "step": 14572 + }, + { + "epoch": 0.3741937761686735, + "grad_norm": 0.76171875, + "learning_rate": 0.00017574678004773193, + "loss": 0.8957, + "step": 14573 + }, + { + "epoch": 0.37421945336459533, + "grad_norm": 0.78125, + "learning_rate": 0.0001757438654176359, + "loss": 0.9252, + "step": 14574 + }, + { + "epoch": 0.3742451305605171, + "grad_norm": 0.7578125, + "learning_rate": 0.00017574095063658916, + "loss": 0.83, + "step": 14575 + }, + { + "epoch": 0.37427080775643895, + "grad_norm": 0.828125, + "learning_rate": 0.00017573803570459755, + "loss": 0.9309, + "step": 14576 + }, + { + "epoch": 0.3742964849523608, + "grad_norm": 0.83984375, + "learning_rate": 0.00017573512062166687, + "loss": 0.9791, + "step": 14577 + }, + { + "epoch": 0.3743221621482826, + "grad_norm": 0.81640625, + "learning_rate": 0.00017573220538780296, + "loss": 1.0232, + "step": 14578 + }, + { + "epoch": 0.3743478393442044, + "grad_norm": 0.76171875, + "learning_rate": 0.0001757292900030116, + "loss": 1.0681, + "step": 14579 + }, + { + "epoch": 0.37437351654012624, + "grad_norm": 0.8359375, + "learning_rate": 0.00017572637446729855, + "loss": 1.0533, + "step": 14580 + }, + { + "epoch": 0.3743991937360481, + "grad_norm": 0.91015625, + "learning_rate": 0.00017572345878066976, + "loss": 0.9955, + "step": 14581 + }, + { + "epoch": 0.37442487093196986, + "grad_norm": 0.8046875, + "learning_rate": 0.00017572054294313092, + "loss": 0.8782, + "step": 14582 + }, + { + "epoch": 0.3744505481278917, + "grad_norm": 0.78125, + "learning_rate": 0.00017571762695468786, + "loss": 0.9249, + "step": 14583 + }, + { + "epoch": 0.3744762253238135, + "grad_norm": 0.796875, + "learning_rate": 0.00017571471081534643, + "loss": 1.1143, + "step": 14584 + }, + { + "epoch": 0.3745019025197353, + "grad_norm": 0.77734375, + "learning_rate": 0.00017571179452511242, + "loss": 0.8423, + "step": 14585 + }, + { + "epoch": 0.37452757971565714, + "grad_norm": 0.8203125, + "learning_rate": 0.00017570887808399165, + "loss": 0.8236, + "step": 14586 + }, + { + "epoch": 0.374553256911579, + "grad_norm": 0.7578125, + "learning_rate": 0.0001757059614919899, + "loss": 0.9033, + "step": 14587 + }, + { + "epoch": 0.3745789341075008, + "grad_norm": 0.734375, + "learning_rate": 0.000175703044749113, + "loss": 0.8228, + "step": 14588 + }, + { + "epoch": 0.3746046113034226, + "grad_norm": 0.75, + "learning_rate": 0.0001757001278553668, + "loss": 0.9238, + "step": 14589 + }, + { + "epoch": 0.37463028849934443, + "grad_norm": 0.82421875, + "learning_rate": 0.00017569721081075708, + "loss": 0.8553, + "step": 14590 + }, + { + "epoch": 0.37465596569526627, + "grad_norm": 0.765625, + "learning_rate": 0.00017569429361528964, + "loss": 0.986, + "step": 14591 + }, + { + "epoch": 0.37468164289118805, + "grad_norm": 0.74609375, + "learning_rate": 0.0001756913762689703, + "loss": 0.7436, + "step": 14592 + }, + { + "epoch": 0.3747073200871099, + "grad_norm": 0.76171875, + "learning_rate": 0.00017568845877180493, + "loss": 0.8933, + "step": 14593 + }, + { + "epoch": 0.3747329972830317, + "grad_norm": 0.87109375, + "learning_rate": 0.00017568554112379928, + "loss": 0.9971, + "step": 14594 + }, + { + "epoch": 0.3747586744789535, + "grad_norm": 0.8359375, + "learning_rate": 0.00017568262332495914, + "loss": 0.9534, + "step": 14595 + }, + { + "epoch": 0.37478435167487534, + "grad_norm": 0.81640625, + "learning_rate": 0.00017567970537529041, + "loss": 0.9479, + "step": 14596 + }, + { + "epoch": 0.3748100288707972, + "grad_norm": 0.8515625, + "learning_rate": 0.00017567678727479886, + "loss": 0.9261, + "step": 14597 + }, + { + "epoch": 0.374835706066719, + "grad_norm": 0.82421875, + "learning_rate": 0.00017567386902349032, + "loss": 0.869, + "step": 14598 + }, + { + "epoch": 0.3748613832626408, + "grad_norm": 0.81640625, + "learning_rate": 0.00017567095062137058, + "loss": 0.9244, + "step": 14599 + }, + { + "epoch": 0.3748870604585626, + "grad_norm": 0.86328125, + "learning_rate": 0.00017566803206844547, + "loss": 1.0038, + "step": 14600 + }, + { + "epoch": 0.37491273765448446, + "grad_norm": 0.875, + "learning_rate": 0.00017566511336472083, + "loss": 0.9027, + "step": 14601 + }, + { + "epoch": 0.37493841485040624, + "grad_norm": 0.76171875, + "learning_rate": 0.00017566219451020242, + "loss": 0.943, + "step": 14602 + }, + { + "epoch": 0.3749640920463281, + "grad_norm": 0.91796875, + "learning_rate": 0.0001756592755048961, + "loss": 0.849, + "step": 14603 + }, + { + "epoch": 0.3749897692422499, + "grad_norm": 0.80078125, + "learning_rate": 0.0001756563563488077, + "loss": 0.9623, + "step": 14604 + }, + { + "epoch": 0.3750154464381717, + "grad_norm": 0.78125, + "learning_rate": 0.00017565343704194298, + "loss": 0.9551, + "step": 14605 + }, + { + "epoch": 0.37504112363409353, + "grad_norm": 0.765625, + "learning_rate": 0.0001756505175843078, + "loss": 0.8947, + "step": 14606 + }, + { + "epoch": 0.37506680083001537, + "grad_norm": 0.8125, + "learning_rate": 0.000175647597975908, + "loss": 0.8494, + "step": 14607 + }, + { + "epoch": 0.3750924780259372, + "grad_norm": 0.86328125, + "learning_rate": 0.00017564467821674934, + "loss": 0.9143, + "step": 14608 + }, + { + "epoch": 0.375118155221859, + "grad_norm": 0.81640625, + "learning_rate": 0.00017564175830683769, + "loss": 0.8621, + "step": 14609 + }, + { + "epoch": 0.3751438324177808, + "grad_norm": 0.8671875, + "learning_rate": 0.00017563883824617884, + "loss": 0.9283, + "step": 14610 + }, + { + "epoch": 0.37516950961370266, + "grad_norm": 0.8515625, + "learning_rate": 0.00017563591803477864, + "loss": 1.1314, + "step": 14611 + }, + { + "epoch": 0.37519518680962444, + "grad_norm": 0.74609375, + "learning_rate": 0.00017563299767264282, + "loss": 0.9081, + "step": 14612 + }, + { + "epoch": 0.3752208640055463, + "grad_norm": 0.8125, + "learning_rate": 0.00017563007715977732, + "loss": 1.0299, + "step": 14613 + }, + { + "epoch": 0.3752465412014681, + "grad_norm": 0.73828125, + "learning_rate": 0.00017562715649618792, + "loss": 0.9556, + "step": 14614 + }, + { + "epoch": 0.3752722183973899, + "grad_norm": 0.7578125, + "learning_rate": 0.00017562423568188044, + "loss": 0.7346, + "step": 14615 + }, + { + "epoch": 0.3752978955933117, + "grad_norm": 0.8046875, + "learning_rate": 0.00017562131471686064, + "loss": 0.9513, + "step": 14616 + }, + { + "epoch": 0.37532357278923356, + "grad_norm": 0.81640625, + "learning_rate": 0.0001756183936011344, + "loss": 0.8951, + "step": 14617 + }, + { + "epoch": 0.3753492499851554, + "grad_norm": 0.78125, + "learning_rate": 0.00017561547233470754, + "loss": 0.8167, + "step": 14618 + }, + { + "epoch": 0.3753749271810772, + "grad_norm": 0.76953125, + "learning_rate": 0.0001756125509175859, + "loss": 0.948, + "step": 14619 + }, + { + "epoch": 0.375400604376999, + "grad_norm": 0.76953125, + "learning_rate": 0.00017560962934977522, + "loss": 0.8018, + "step": 14620 + }, + { + "epoch": 0.37542628157292085, + "grad_norm": 0.83984375, + "learning_rate": 0.00017560670763128142, + "loss": 1.1501, + "step": 14621 + }, + { + "epoch": 0.37545195876884263, + "grad_norm": 0.734375, + "learning_rate": 0.00017560378576211028, + "loss": 0.9225, + "step": 14622 + }, + { + "epoch": 0.37547763596476447, + "grad_norm": 0.828125, + "learning_rate": 0.0001756008637422676, + "loss": 1.1052, + "step": 14623 + }, + { + "epoch": 0.3755033131606863, + "grad_norm": 0.796875, + "learning_rate": 0.00017559794157175925, + "loss": 0.8208, + "step": 14624 + }, + { + "epoch": 0.3755289903566081, + "grad_norm": 0.87890625, + "learning_rate": 0.00017559501925059102, + "loss": 0.9191, + "step": 14625 + }, + { + "epoch": 0.3755546675525299, + "grad_norm": 0.80078125, + "learning_rate": 0.00017559209677876875, + "loss": 0.8848, + "step": 14626 + }, + { + "epoch": 0.37558034474845176, + "grad_norm": 0.734375, + "learning_rate": 0.00017558917415629827, + "loss": 0.8545, + "step": 14627 + }, + { + "epoch": 0.3756060219443736, + "grad_norm": 0.83203125, + "learning_rate": 0.00017558625138318538, + "loss": 0.9378, + "step": 14628 + }, + { + "epoch": 0.37563169914029537, + "grad_norm": 0.76171875, + "learning_rate": 0.00017558332845943594, + "loss": 0.8325, + "step": 14629 + }, + { + "epoch": 0.3756573763362172, + "grad_norm": 0.93359375, + "learning_rate": 0.00017558040538505573, + "loss": 0.9851, + "step": 14630 + }, + { + "epoch": 0.37568305353213904, + "grad_norm": 0.8125, + "learning_rate": 0.0001755774821600506, + "loss": 1.0294, + "step": 14631 + }, + { + "epoch": 0.3757087307280608, + "grad_norm": 0.8125, + "learning_rate": 0.00017557455878442637, + "loss": 0.9543, + "step": 14632 + }, + { + "epoch": 0.37573440792398266, + "grad_norm": 1.5, + "learning_rate": 0.0001755716352581889, + "loss": 0.9348, + "step": 14633 + }, + { + "epoch": 0.3757600851199045, + "grad_norm": 0.74609375, + "learning_rate": 0.00017556871158134395, + "loss": 0.9393, + "step": 14634 + }, + { + "epoch": 0.3757857623158263, + "grad_norm": 0.703125, + "learning_rate": 0.0001755657877538974, + "loss": 0.9469, + "step": 14635 + }, + { + "epoch": 0.3758114395117481, + "grad_norm": 0.8515625, + "learning_rate": 0.00017556286377585508, + "loss": 0.9071, + "step": 14636 + }, + { + "epoch": 0.37583711670766995, + "grad_norm": 0.8046875, + "learning_rate": 0.00017555993964722276, + "loss": 0.9729, + "step": 14637 + }, + { + "epoch": 0.3758627939035918, + "grad_norm": 0.7890625, + "learning_rate": 0.00017555701536800636, + "loss": 0.9638, + "step": 14638 + }, + { + "epoch": 0.37588847109951357, + "grad_norm": 0.765625, + "learning_rate": 0.0001755540909382116, + "loss": 1.009, + "step": 14639 + }, + { + "epoch": 0.3759141482954354, + "grad_norm": 0.80078125, + "learning_rate": 0.00017555116635784436, + "loss": 0.9046, + "step": 14640 + }, + { + "epoch": 0.37593982549135724, + "grad_norm": 0.8125, + "learning_rate": 0.00017554824162691052, + "loss": 0.9643, + "step": 14641 + }, + { + "epoch": 0.375965502687279, + "grad_norm": 0.8125, + "learning_rate": 0.00017554531674541583, + "loss": 0.8482, + "step": 14642 + }, + { + "epoch": 0.37599117988320085, + "grad_norm": 0.78515625, + "learning_rate": 0.00017554239171336614, + "loss": 1.0691, + "step": 14643 + }, + { + "epoch": 0.3760168570791227, + "grad_norm": 0.79296875, + "learning_rate": 0.0001755394665307673, + "loss": 1.0004, + "step": 14644 + }, + { + "epoch": 0.37604253427504447, + "grad_norm": 0.79296875, + "learning_rate": 0.00017553654119762513, + "loss": 0.915, + "step": 14645 + }, + { + "epoch": 0.3760682114709663, + "grad_norm": 0.85546875, + "learning_rate": 0.00017553361571394542, + "loss": 1.0396, + "step": 14646 + }, + { + "epoch": 0.37609388866688814, + "grad_norm": 0.82421875, + "learning_rate": 0.00017553069007973407, + "loss": 0.9429, + "step": 14647 + }, + { + "epoch": 0.37611956586281, + "grad_norm": 0.9140625, + "learning_rate": 0.0001755277642949969, + "loss": 1.0042, + "step": 14648 + }, + { + "epoch": 0.37614524305873176, + "grad_norm": 0.796875, + "learning_rate": 0.00017552483835973968, + "loss": 0.8815, + "step": 14649 + }, + { + "epoch": 0.3761709202546536, + "grad_norm": 0.796875, + "learning_rate": 0.00017552191227396827, + "loss": 0.9707, + "step": 14650 + }, + { + "epoch": 0.37619659745057543, + "grad_norm": 0.76171875, + "learning_rate": 0.00017551898603768856, + "loss": 0.9003, + "step": 14651 + }, + { + "epoch": 0.3762222746464972, + "grad_norm": 0.84375, + "learning_rate": 0.00017551605965090628, + "loss": 0.95, + "step": 14652 + }, + { + "epoch": 0.37624795184241905, + "grad_norm": 0.765625, + "learning_rate": 0.00017551313311362735, + "loss": 0.9343, + "step": 14653 + }, + { + "epoch": 0.3762736290383409, + "grad_norm": 0.765625, + "learning_rate": 0.00017551020642585753, + "loss": 0.8741, + "step": 14654 + }, + { + "epoch": 0.37629930623426266, + "grad_norm": 0.8046875, + "learning_rate": 0.00017550727958760273, + "loss": 0.8684, + "step": 14655 + }, + { + "epoch": 0.3763249834301845, + "grad_norm": 0.8984375, + "learning_rate": 0.00017550435259886871, + "loss": 0.9518, + "step": 14656 + }, + { + "epoch": 0.37635066062610634, + "grad_norm": 0.859375, + "learning_rate": 0.00017550142545966137, + "loss": 0.8697, + "step": 14657 + }, + { + "epoch": 0.3763763378220282, + "grad_norm": 0.7578125, + "learning_rate": 0.0001754984981699865, + "loss": 0.8108, + "step": 14658 + }, + { + "epoch": 0.37640201501794995, + "grad_norm": 0.86328125, + "learning_rate": 0.00017549557072984992, + "loss": 1.0602, + "step": 14659 + }, + { + "epoch": 0.3764276922138718, + "grad_norm": 0.79296875, + "learning_rate": 0.0001754926431392575, + "loss": 0.9549, + "step": 14660 + }, + { + "epoch": 0.3764533694097936, + "grad_norm": 0.77734375, + "learning_rate": 0.00017548971539821506, + "loss": 0.8874, + "step": 14661 + }, + { + "epoch": 0.3764790466057154, + "grad_norm": 0.828125, + "learning_rate": 0.00017548678750672845, + "loss": 0.8876, + "step": 14662 + }, + { + "epoch": 0.37650472380163724, + "grad_norm": 0.79296875, + "learning_rate": 0.00017548385946480349, + "loss": 0.8751, + "step": 14663 + }, + { + "epoch": 0.3765304009975591, + "grad_norm": 0.8984375, + "learning_rate": 0.000175480931272446, + "loss": 0.9737, + "step": 14664 + }, + { + "epoch": 0.37655607819348086, + "grad_norm": 0.828125, + "learning_rate": 0.00017547800292966185, + "loss": 1.0258, + "step": 14665 + }, + { + "epoch": 0.3765817553894027, + "grad_norm": 0.85546875, + "learning_rate": 0.00017547507443645683, + "loss": 1.0154, + "step": 14666 + }, + { + "epoch": 0.37660743258532453, + "grad_norm": 0.8828125, + "learning_rate": 0.00017547214579283682, + "loss": 0.8387, + "step": 14667 + }, + { + "epoch": 0.3766331097812463, + "grad_norm": 0.89453125, + "learning_rate": 0.00017546921699880763, + "loss": 0.9756, + "step": 14668 + }, + { + "epoch": 0.37665878697716815, + "grad_norm": 0.78125, + "learning_rate": 0.00017546628805437513, + "loss": 0.8218, + "step": 14669 + }, + { + "epoch": 0.37668446417309, + "grad_norm": 0.74609375, + "learning_rate": 0.00017546335895954514, + "loss": 0.8971, + "step": 14670 + }, + { + "epoch": 0.3767101413690118, + "grad_norm": 0.83203125, + "learning_rate": 0.00017546042971432345, + "loss": 0.9464, + "step": 14671 + }, + { + "epoch": 0.3767358185649336, + "grad_norm": 0.87109375, + "learning_rate": 0.00017545750031871594, + "loss": 1.1253, + "step": 14672 + }, + { + "epoch": 0.37676149576085544, + "grad_norm": 0.81640625, + "learning_rate": 0.00017545457077272852, + "loss": 0.9614, + "step": 14673 + }, + { + "epoch": 0.37678717295677727, + "grad_norm": 0.7421875, + "learning_rate": 0.0001754516410763669, + "loss": 0.9934, + "step": 14674 + }, + { + "epoch": 0.37681285015269905, + "grad_norm": 0.75390625, + "learning_rate": 0.00017544871122963698, + "loss": 0.9021, + "step": 14675 + }, + { + "epoch": 0.3768385273486209, + "grad_norm": 0.84375, + "learning_rate": 0.0001754457812325446, + "loss": 1.1251, + "step": 14676 + }, + { + "epoch": 0.3768642045445427, + "grad_norm": 0.7734375, + "learning_rate": 0.00017544285108509557, + "loss": 0.8456, + "step": 14677 + }, + { + "epoch": 0.3768898817404645, + "grad_norm": 0.76953125, + "learning_rate": 0.00017543992078729577, + "loss": 0.933, + "step": 14678 + }, + { + "epoch": 0.37691555893638634, + "grad_norm": 0.8671875, + "learning_rate": 0.00017543699033915105, + "loss": 0.9079, + "step": 14679 + }, + { + "epoch": 0.3769412361323082, + "grad_norm": 0.859375, + "learning_rate": 0.00017543405974066717, + "loss": 0.8798, + "step": 14680 + }, + { + "epoch": 0.37696691332823, + "grad_norm": 0.84765625, + "learning_rate": 0.0001754311289918501, + "loss": 0.9778, + "step": 14681 + }, + { + "epoch": 0.3769925905241518, + "grad_norm": 1.4296875, + "learning_rate": 0.00017542819809270553, + "loss": 0.8042, + "step": 14682 + }, + { + "epoch": 0.37701826772007363, + "grad_norm": 0.98828125, + "learning_rate": 0.00017542526704323938, + "loss": 0.8318, + "step": 14683 + }, + { + "epoch": 0.37704394491599547, + "grad_norm": 0.78515625, + "learning_rate": 0.0001754223358434575, + "loss": 1.0114, + "step": 14684 + }, + { + "epoch": 0.37706962211191725, + "grad_norm": 0.765625, + "learning_rate": 0.00017541940449336573, + "loss": 1.0857, + "step": 14685 + }, + { + "epoch": 0.3770952993078391, + "grad_norm": 0.796875, + "learning_rate": 0.00017541647299296992, + "loss": 1.0234, + "step": 14686 + }, + { + "epoch": 0.3771209765037609, + "grad_norm": 0.76953125, + "learning_rate": 0.00017541354134227585, + "loss": 0.916, + "step": 14687 + }, + { + "epoch": 0.3771466536996827, + "grad_norm": 0.76171875, + "learning_rate": 0.00017541060954128943, + "loss": 0.8018, + "step": 14688 + }, + { + "epoch": 0.37717233089560454, + "grad_norm": 0.91796875, + "learning_rate": 0.00017540767759001647, + "loss": 1.0192, + "step": 14689 + }, + { + "epoch": 0.37719800809152637, + "grad_norm": 0.79296875, + "learning_rate": 0.0001754047454884628, + "loss": 0.967, + "step": 14690 + }, + { + "epoch": 0.3772236852874482, + "grad_norm": 0.8125, + "learning_rate": 0.00017540181323663428, + "loss": 0.987, + "step": 14691 + }, + { + "epoch": 0.37724936248337, + "grad_norm": 0.7421875, + "learning_rate": 0.0001753988808345368, + "loss": 0.9081, + "step": 14692 + }, + { + "epoch": 0.3772750396792918, + "grad_norm": 0.8125, + "learning_rate": 0.00017539594828217613, + "loss": 0.9851, + "step": 14693 + }, + { + "epoch": 0.37730071687521366, + "grad_norm": 0.76171875, + "learning_rate": 0.00017539301557955816, + "loss": 0.8746, + "step": 14694 + }, + { + "epoch": 0.37732639407113544, + "grad_norm": 0.85546875, + "learning_rate": 0.0001753900827266887, + "loss": 0.9336, + "step": 14695 + }, + { + "epoch": 0.3773520712670573, + "grad_norm": 0.82421875, + "learning_rate": 0.00017538714972357365, + "loss": 1.0326, + "step": 14696 + }, + { + "epoch": 0.3773777484629791, + "grad_norm": 0.8046875, + "learning_rate": 0.00017538421657021876, + "loss": 0.9308, + "step": 14697 + }, + { + "epoch": 0.3774034256589009, + "grad_norm": 0.69140625, + "learning_rate": 0.00017538128326663, + "loss": 0.9168, + "step": 14698 + }, + { + "epoch": 0.37742910285482273, + "grad_norm": 0.7890625, + "learning_rate": 0.00017537834981281312, + "loss": 0.9833, + "step": 14699 + }, + { + "epoch": 0.37745478005074456, + "grad_norm": 0.8125, + "learning_rate": 0.000175375416208774, + "loss": 1.0504, + "step": 14700 + }, + { + "epoch": 0.3774804572466664, + "grad_norm": 0.8046875, + "learning_rate": 0.0001753724824545185, + "loss": 0.8862, + "step": 14701 + }, + { + "epoch": 0.3775061344425882, + "grad_norm": 1.2890625, + "learning_rate": 0.00017536954855005242, + "loss": 0.8824, + "step": 14702 + }, + { + "epoch": 0.37753181163851, + "grad_norm": 0.73046875, + "learning_rate": 0.00017536661449538168, + "loss": 0.8523, + "step": 14703 + }, + { + "epoch": 0.37755748883443185, + "grad_norm": 0.78515625, + "learning_rate": 0.00017536368029051204, + "loss": 0.9579, + "step": 14704 + }, + { + "epoch": 0.37758316603035363, + "grad_norm": 0.8203125, + "learning_rate": 0.00017536074593544942, + "loss": 0.8934, + "step": 14705 + }, + { + "epoch": 0.37760884322627547, + "grad_norm": 0.8046875, + "learning_rate": 0.00017535781143019965, + "loss": 0.9648, + "step": 14706 + }, + { + "epoch": 0.3776345204221973, + "grad_norm": 0.77734375, + "learning_rate": 0.00017535487677476853, + "loss": 1.03, + "step": 14707 + }, + { + "epoch": 0.3776601976181191, + "grad_norm": 0.72265625, + "learning_rate": 0.00017535194196916198, + "loss": 0.9264, + "step": 14708 + }, + { + "epoch": 0.3776858748140409, + "grad_norm": 0.74609375, + "learning_rate": 0.00017534900701338582, + "loss": 0.9645, + "step": 14709 + }, + { + "epoch": 0.37771155200996276, + "grad_norm": 0.71484375, + "learning_rate": 0.00017534607190744587, + "loss": 0.7988, + "step": 14710 + }, + { + "epoch": 0.3777372292058846, + "grad_norm": 0.8125, + "learning_rate": 0.00017534313665134804, + "loss": 1.0542, + "step": 14711 + }, + { + "epoch": 0.3777629064018064, + "grad_norm": 0.77734375, + "learning_rate": 0.0001753402012450981, + "loss": 0.9026, + "step": 14712 + }, + { + "epoch": 0.3777885835977282, + "grad_norm": 0.875, + "learning_rate": 0.00017533726568870196, + "loss": 1.0038, + "step": 14713 + }, + { + "epoch": 0.37781426079365005, + "grad_norm": 0.86328125, + "learning_rate": 0.00017533432998216548, + "loss": 0.9133, + "step": 14714 + }, + { + "epoch": 0.37783993798957183, + "grad_norm": 0.73046875, + "learning_rate": 0.00017533139412549445, + "loss": 0.7561, + "step": 14715 + }, + { + "epoch": 0.37786561518549366, + "grad_norm": 0.77734375, + "learning_rate": 0.0001753284581186948, + "loss": 0.8254, + "step": 14716 + }, + { + "epoch": 0.3778912923814155, + "grad_norm": 0.72265625, + "learning_rate": 0.0001753255219617723, + "loss": 1.0098, + "step": 14717 + }, + { + "epoch": 0.3779169695773373, + "grad_norm": 0.83203125, + "learning_rate": 0.00017532258565473283, + "loss": 0.887, + "step": 14718 + }, + { + "epoch": 0.3779426467732591, + "grad_norm": 0.875, + "learning_rate": 0.00017531964919758228, + "loss": 0.9478, + "step": 14719 + }, + { + "epoch": 0.37796832396918095, + "grad_norm": 0.79296875, + "learning_rate": 0.00017531671259032647, + "loss": 0.873, + "step": 14720 + }, + { + "epoch": 0.3779940011651028, + "grad_norm": 0.8359375, + "learning_rate": 0.0001753137758329712, + "loss": 0.9947, + "step": 14721 + }, + { + "epoch": 0.37801967836102457, + "grad_norm": 0.8828125, + "learning_rate": 0.00017531083892552248, + "loss": 0.9364, + "step": 14722 + }, + { + "epoch": 0.3780453555569464, + "grad_norm": 0.7734375, + "learning_rate": 0.000175307901867986, + "loss": 0.9244, + "step": 14723 + }, + { + "epoch": 0.37807103275286824, + "grad_norm": 0.71875, + "learning_rate": 0.0001753049646603677, + "loss": 0.7935, + "step": 14724 + }, + { + "epoch": 0.37809670994879, + "grad_norm": 1.1015625, + "learning_rate": 0.00017530202730267334, + "loss": 0.9985, + "step": 14725 + }, + { + "epoch": 0.37812238714471186, + "grad_norm": 0.70703125, + "learning_rate": 0.0001752990897949089, + "loss": 0.8866, + "step": 14726 + }, + { + "epoch": 0.3781480643406337, + "grad_norm": 0.796875, + "learning_rate": 0.00017529615213708017, + "loss": 0.9411, + "step": 14727 + }, + { + "epoch": 0.3781737415365555, + "grad_norm": 0.71875, + "learning_rate": 0.000175293214329193, + "loss": 0.8704, + "step": 14728 + }, + { + "epoch": 0.3781994187324773, + "grad_norm": 0.75, + "learning_rate": 0.00017529027637125326, + "loss": 0.8197, + "step": 14729 + }, + { + "epoch": 0.37822509592839915, + "grad_norm": 0.8125, + "learning_rate": 0.0001752873382632668, + "loss": 0.8869, + "step": 14730 + }, + { + "epoch": 0.378250773124321, + "grad_norm": 0.796875, + "learning_rate": 0.00017528440000523946, + "loss": 0.9356, + "step": 14731 + }, + { + "epoch": 0.37827645032024276, + "grad_norm": 0.828125, + "learning_rate": 0.00017528146159717713, + "loss": 1.032, + "step": 14732 + }, + { + "epoch": 0.3783021275161646, + "grad_norm": 0.7890625, + "learning_rate": 0.00017527852303908566, + "loss": 0.8229, + "step": 14733 + }, + { + "epoch": 0.37832780471208644, + "grad_norm": 0.734375, + "learning_rate": 0.0001752755843309709, + "loss": 0.8148, + "step": 14734 + }, + { + "epoch": 0.3783534819080082, + "grad_norm": 0.76953125, + "learning_rate": 0.00017527264547283867, + "loss": 0.8065, + "step": 14735 + }, + { + "epoch": 0.37837915910393005, + "grad_norm": 0.8515625, + "learning_rate": 0.00017526970646469485, + "loss": 0.9271, + "step": 14736 + }, + { + "epoch": 0.3784048362998519, + "grad_norm": 0.7578125, + "learning_rate": 0.0001752667673065453, + "loss": 0.7944, + "step": 14737 + }, + { + "epoch": 0.37843051349577367, + "grad_norm": 0.76953125, + "learning_rate": 0.0001752638279983959, + "loss": 0.907, + "step": 14738 + }, + { + "epoch": 0.3784561906916955, + "grad_norm": 0.76171875, + "learning_rate": 0.00017526088854025252, + "loss": 1.0189, + "step": 14739 + }, + { + "epoch": 0.37848186788761734, + "grad_norm": 0.8203125, + "learning_rate": 0.00017525794893212093, + "loss": 0.9251, + "step": 14740 + }, + { + "epoch": 0.3785075450835392, + "grad_norm": 0.7734375, + "learning_rate": 0.00017525500917400707, + "loss": 0.8727, + "step": 14741 + }, + { + "epoch": 0.37853322227946096, + "grad_norm": 0.8671875, + "learning_rate": 0.0001752520692659168, + "loss": 1.0066, + "step": 14742 + }, + { + "epoch": 0.3785588994753828, + "grad_norm": 0.7265625, + "learning_rate": 0.0001752491292078559, + "loss": 0.935, + "step": 14743 + }, + { + "epoch": 0.37858457667130463, + "grad_norm": 0.83984375, + "learning_rate": 0.00017524618899983035, + "loss": 0.8262, + "step": 14744 + }, + { + "epoch": 0.3786102538672264, + "grad_norm": 0.765625, + "learning_rate": 0.00017524324864184588, + "loss": 0.8863, + "step": 14745 + }, + { + "epoch": 0.37863593106314825, + "grad_norm": 0.8203125, + "learning_rate": 0.00017524030813390845, + "loss": 1.0564, + "step": 14746 + }, + { + "epoch": 0.3786616082590701, + "grad_norm": 0.765625, + "learning_rate": 0.00017523736747602388, + "loss": 0.8915, + "step": 14747 + }, + { + "epoch": 0.37868728545499186, + "grad_norm": 0.765625, + "learning_rate": 0.000175234426668198, + "loss": 0.9054, + "step": 14748 + }, + { + "epoch": 0.3787129626509137, + "grad_norm": 0.80078125, + "learning_rate": 0.00017523148571043672, + "loss": 0.9427, + "step": 14749 + }, + { + "epoch": 0.37873863984683553, + "grad_norm": 0.8671875, + "learning_rate": 0.0001752285446027459, + "loss": 0.9563, + "step": 14750 + }, + { + "epoch": 0.37876431704275737, + "grad_norm": 0.72265625, + "learning_rate": 0.0001752256033451314, + "loss": 1.0691, + "step": 14751 + }, + { + "epoch": 0.37878999423867915, + "grad_norm": 0.89453125, + "learning_rate": 0.00017522266193759905, + "loss": 1.0541, + "step": 14752 + }, + { + "epoch": 0.378815671434601, + "grad_norm": 0.76171875, + "learning_rate": 0.0001752197203801547, + "loss": 0.9414, + "step": 14753 + }, + { + "epoch": 0.3788413486305228, + "grad_norm": 0.74609375, + "learning_rate": 0.00017521677867280428, + "loss": 1.0414, + "step": 14754 + }, + { + "epoch": 0.3788670258264446, + "grad_norm": 0.87890625, + "learning_rate": 0.00017521383681555358, + "loss": 0.8524, + "step": 14755 + }, + { + "epoch": 0.37889270302236644, + "grad_norm": 0.83984375, + "learning_rate": 0.00017521089480840852, + "loss": 0.876, + "step": 14756 + }, + { + "epoch": 0.3789183802182883, + "grad_norm": 1.8984375, + "learning_rate": 0.00017520795265137495, + "loss": 0.9616, + "step": 14757 + }, + { + "epoch": 0.37894405741421006, + "grad_norm": 0.7109375, + "learning_rate": 0.00017520501034445872, + "loss": 0.8175, + "step": 14758 + }, + { + "epoch": 0.3789697346101319, + "grad_norm": 0.80859375, + "learning_rate": 0.00017520206788766568, + "loss": 1.0651, + "step": 14759 + }, + { + "epoch": 0.37899541180605373, + "grad_norm": 1.0703125, + "learning_rate": 0.00017519912528100173, + "loss": 1.0195, + "step": 14760 + }, + { + "epoch": 0.37902108900197556, + "grad_norm": 0.83203125, + "learning_rate": 0.0001751961825244727, + "loss": 0.9975, + "step": 14761 + }, + { + "epoch": 0.37904676619789734, + "grad_norm": 0.7890625, + "learning_rate": 0.00017519323961808446, + "loss": 0.9619, + "step": 14762 + }, + { + "epoch": 0.3790724433938192, + "grad_norm": 0.765625, + "learning_rate": 0.0001751902965618429, + "loss": 0.8135, + "step": 14763 + }, + { + "epoch": 0.379098120589741, + "grad_norm": 0.796875, + "learning_rate": 0.00017518735335575388, + "loss": 1.046, + "step": 14764 + }, + { + "epoch": 0.3791237977856628, + "grad_norm": 0.84375, + "learning_rate": 0.00017518440999982326, + "loss": 0.9375, + "step": 14765 + }, + { + "epoch": 0.37914947498158463, + "grad_norm": 0.83984375, + "learning_rate": 0.0001751814664940569, + "loss": 1.1232, + "step": 14766 + }, + { + "epoch": 0.37917515217750647, + "grad_norm": 0.72265625, + "learning_rate": 0.00017517852283846069, + "loss": 0.847, + "step": 14767 + }, + { + "epoch": 0.37920082937342825, + "grad_norm": 0.828125, + "learning_rate": 0.00017517557903304042, + "loss": 0.9551, + "step": 14768 + }, + { + "epoch": 0.3792265065693501, + "grad_norm": 0.83203125, + "learning_rate": 0.00017517263507780203, + "loss": 0.7958, + "step": 14769 + }, + { + "epoch": 0.3792521837652719, + "grad_norm": 0.8984375, + "learning_rate": 0.00017516969097275138, + "loss": 0.9228, + "step": 14770 + }, + { + "epoch": 0.37927786096119376, + "grad_norm": 0.78515625, + "learning_rate": 0.00017516674671789431, + "loss": 1.033, + "step": 14771 + }, + { + "epoch": 0.37930353815711554, + "grad_norm": 0.74609375, + "learning_rate": 0.00017516380231323675, + "loss": 0.9216, + "step": 14772 + }, + { + "epoch": 0.3793292153530374, + "grad_norm": 0.78515625, + "learning_rate": 0.0001751608577587845, + "loss": 0.9283, + "step": 14773 + }, + { + "epoch": 0.3793548925489592, + "grad_norm": 0.79296875, + "learning_rate": 0.0001751579130545434, + "loss": 0.8541, + "step": 14774 + }, + { + "epoch": 0.379380569744881, + "grad_norm": 0.77734375, + "learning_rate": 0.00017515496820051942, + "loss": 0.9645, + "step": 14775 + }, + { + "epoch": 0.3794062469408028, + "grad_norm": 0.85546875, + "learning_rate": 0.00017515202319671834, + "loss": 0.8887, + "step": 14776 + }, + { + "epoch": 0.37943192413672466, + "grad_norm": 0.82421875, + "learning_rate": 0.0001751490780431461, + "loss": 1.0078, + "step": 14777 + }, + { + "epoch": 0.37945760133264644, + "grad_norm": 0.9765625, + "learning_rate": 0.00017514613273980853, + "loss": 0.9579, + "step": 14778 + }, + { + "epoch": 0.3794832785285683, + "grad_norm": 0.7890625, + "learning_rate": 0.00017514318728671148, + "loss": 0.9254, + "step": 14779 + }, + { + "epoch": 0.3795089557244901, + "grad_norm": 0.77734375, + "learning_rate": 0.00017514024168386087, + "loss": 0.9191, + "step": 14780 + }, + { + "epoch": 0.37953463292041195, + "grad_norm": 0.83984375, + "learning_rate": 0.00017513729593126253, + "loss": 0.7902, + "step": 14781 + }, + { + "epoch": 0.37956031011633373, + "grad_norm": 0.796875, + "learning_rate": 0.00017513435002892237, + "loss": 0.9475, + "step": 14782 + }, + { + "epoch": 0.37958598731225557, + "grad_norm": 0.828125, + "learning_rate": 0.00017513140397684625, + "loss": 0.916, + "step": 14783 + }, + { + "epoch": 0.3796116645081774, + "grad_norm": 0.87890625, + "learning_rate": 0.00017512845777504, + "loss": 0.9892, + "step": 14784 + }, + { + "epoch": 0.3796373417040992, + "grad_norm": 0.890625, + "learning_rate": 0.0001751255114235095, + "loss": 1.1218, + "step": 14785 + }, + { + "epoch": 0.379663018900021, + "grad_norm": 0.7734375, + "learning_rate": 0.00017512256492226066, + "loss": 0.9506, + "step": 14786 + }, + { + "epoch": 0.37968869609594286, + "grad_norm": 0.8828125, + "learning_rate": 0.00017511961827129934, + "loss": 0.9078, + "step": 14787 + }, + { + "epoch": 0.37971437329186464, + "grad_norm": 0.80078125, + "learning_rate": 0.0001751166714706314, + "loss": 0.986, + "step": 14788 + }, + { + "epoch": 0.3797400504877865, + "grad_norm": 0.80859375, + "learning_rate": 0.00017511372452026276, + "loss": 0.8415, + "step": 14789 + }, + { + "epoch": 0.3797657276837083, + "grad_norm": 0.80859375, + "learning_rate": 0.0001751107774201992, + "loss": 1.1193, + "step": 14790 + }, + { + "epoch": 0.37979140487963015, + "grad_norm": 0.75390625, + "learning_rate": 0.00017510783017044666, + "loss": 0.9365, + "step": 14791 + }, + { + "epoch": 0.3798170820755519, + "grad_norm": 0.75, + "learning_rate": 0.000175104882771011, + "loss": 1.1275, + "step": 14792 + }, + { + "epoch": 0.37984275927147376, + "grad_norm": 0.80078125, + "learning_rate": 0.0001751019352218981, + "loss": 0.9406, + "step": 14793 + }, + { + "epoch": 0.3798684364673956, + "grad_norm": 0.83203125, + "learning_rate": 0.00017509898752311382, + "loss": 0.8942, + "step": 14794 + }, + { + "epoch": 0.3798941136633174, + "grad_norm": 0.77734375, + "learning_rate": 0.00017509603967466405, + "loss": 0.9572, + "step": 14795 + }, + { + "epoch": 0.3799197908592392, + "grad_norm": 0.80078125, + "learning_rate": 0.00017509309167655464, + "loss": 1.0164, + "step": 14796 + }, + { + "epoch": 0.37994546805516105, + "grad_norm": 0.7734375, + "learning_rate": 0.0001750901435287915, + "loss": 0.9421, + "step": 14797 + }, + { + "epoch": 0.37997114525108283, + "grad_norm": 0.8359375, + "learning_rate": 0.00017508719523138047, + "loss": 1.1053, + "step": 14798 + }, + { + "epoch": 0.37999682244700467, + "grad_norm": 0.8046875, + "learning_rate": 0.00017508424678432745, + "loss": 0.9112, + "step": 14799 + }, + { + "epoch": 0.3800224996429265, + "grad_norm": 0.98046875, + "learning_rate": 0.0001750812981876383, + "loss": 0.9307, + "step": 14800 + }, + { + "epoch": 0.38004817683884834, + "grad_norm": 0.78125, + "learning_rate": 0.0001750783494413189, + "loss": 0.9976, + "step": 14801 + }, + { + "epoch": 0.3800738540347701, + "grad_norm": 0.76171875, + "learning_rate": 0.00017507540054537515, + "loss": 0.9671, + "step": 14802 + }, + { + "epoch": 0.38009953123069196, + "grad_norm": 0.7421875, + "learning_rate": 0.0001750724514998129, + "loss": 0.8548, + "step": 14803 + }, + { + "epoch": 0.3801252084266138, + "grad_norm": 0.8515625, + "learning_rate": 0.00017506950230463804, + "loss": 0.949, + "step": 14804 + }, + { + "epoch": 0.3801508856225356, + "grad_norm": 0.82421875, + "learning_rate": 0.00017506655295985643, + "loss": 0.7889, + "step": 14805 + }, + { + "epoch": 0.3801765628184574, + "grad_norm": 0.78515625, + "learning_rate": 0.00017506360346547398, + "loss": 1.0847, + "step": 14806 + }, + { + "epoch": 0.38020224001437924, + "grad_norm": 0.7421875, + "learning_rate": 0.00017506065382149656, + "loss": 0.8417, + "step": 14807 + }, + { + "epoch": 0.380227917210301, + "grad_norm": 0.77734375, + "learning_rate": 0.00017505770402793, + "loss": 0.9908, + "step": 14808 + }, + { + "epoch": 0.38025359440622286, + "grad_norm": 0.80078125, + "learning_rate": 0.00017505475408478022, + "loss": 0.9381, + "step": 14809 + }, + { + "epoch": 0.3802792716021447, + "grad_norm": 0.79296875, + "learning_rate": 0.0001750518039920531, + "loss": 1.0112, + "step": 14810 + }, + { + "epoch": 0.38030494879806653, + "grad_norm": 0.7578125, + "learning_rate": 0.00017504885374975454, + "loss": 0.9279, + "step": 14811 + }, + { + "epoch": 0.3803306259939883, + "grad_norm": 0.734375, + "learning_rate": 0.00017504590335789037, + "loss": 0.9347, + "step": 14812 + }, + { + "epoch": 0.38035630318991015, + "grad_norm": 0.7890625, + "learning_rate": 0.0001750429528164665, + "loss": 0.8843, + "step": 14813 + }, + { + "epoch": 0.380381980385832, + "grad_norm": 0.8046875, + "learning_rate": 0.0001750400021254888, + "loss": 0.9442, + "step": 14814 + }, + { + "epoch": 0.38040765758175377, + "grad_norm": 0.8359375, + "learning_rate": 0.00017503705128496316, + "loss": 1.0364, + "step": 14815 + }, + { + "epoch": 0.3804333347776756, + "grad_norm": 0.76953125, + "learning_rate": 0.00017503410029489545, + "loss": 0.8998, + "step": 14816 + }, + { + "epoch": 0.38045901197359744, + "grad_norm": 0.7734375, + "learning_rate": 0.00017503114915529155, + "loss": 0.8501, + "step": 14817 + }, + { + "epoch": 0.3804846891695192, + "grad_norm": 0.7890625, + "learning_rate": 0.00017502819786615736, + "loss": 0.9444, + "step": 14818 + }, + { + "epoch": 0.38051036636544106, + "grad_norm": 0.75390625, + "learning_rate": 0.00017502524642749875, + "loss": 0.9057, + "step": 14819 + }, + { + "epoch": 0.3805360435613629, + "grad_norm": 0.84765625, + "learning_rate": 0.0001750222948393216, + "loss": 1.0651, + "step": 14820 + }, + { + "epoch": 0.3805617207572847, + "grad_norm": 0.7890625, + "learning_rate": 0.00017501934310163177, + "loss": 0.946, + "step": 14821 + }, + { + "epoch": 0.3805873979532065, + "grad_norm": 0.984375, + "learning_rate": 0.00017501639121443518, + "loss": 1.0203, + "step": 14822 + }, + { + "epoch": 0.38061307514912834, + "grad_norm": 0.73828125, + "learning_rate": 0.00017501343917773773, + "loss": 0.844, + "step": 14823 + }, + { + "epoch": 0.3806387523450502, + "grad_norm": 0.78125, + "learning_rate": 0.00017501048699154523, + "loss": 0.867, + "step": 14824 + }, + { + "epoch": 0.38066442954097196, + "grad_norm": 0.78125, + "learning_rate": 0.0001750075346558636, + "loss": 0.9198, + "step": 14825 + }, + { + "epoch": 0.3806901067368938, + "grad_norm": 0.82421875, + "learning_rate": 0.00017500458217069875, + "loss": 0.8909, + "step": 14826 + }, + { + "epoch": 0.38071578393281563, + "grad_norm": 0.78515625, + "learning_rate": 0.00017500162953605654, + "loss": 0.8374, + "step": 14827 + }, + { + "epoch": 0.3807414611287374, + "grad_norm": 0.72265625, + "learning_rate": 0.0001749986767519429, + "loss": 0.7214, + "step": 14828 + }, + { + "epoch": 0.38076713832465925, + "grad_norm": 0.8671875, + "learning_rate": 0.00017499572381836356, + "loss": 1.0294, + "step": 14829 + }, + { + "epoch": 0.3807928155205811, + "grad_norm": 0.84765625, + "learning_rate": 0.0001749927707353246, + "loss": 0.9457, + "step": 14830 + }, + { + "epoch": 0.3808184927165029, + "grad_norm": 0.90234375, + "learning_rate": 0.00017498981750283183, + "loss": 0.8037, + "step": 14831 + }, + { + "epoch": 0.3808441699124247, + "grad_norm": 0.734375, + "learning_rate": 0.0001749868641208911, + "loss": 0.9581, + "step": 14832 + }, + { + "epoch": 0.38086984710834654, + "grad_norm": 0.70703125, + "learning_rate": 0.0001749839105895083, + "loss": 0.9154, + "step": 14833 + }, + { + "epoch": 0.3808955243042684, + "grad_norm": 0.7421875, + "learning_rate": 0.00017498095690868942, + "loss": 0.9403, + "step": 14834 + }, + { + "epoch": 0.38092120150019015, + "grad_norm": 0.74609375, + "learning_rate": 0.00017497800307844018, + "loss": 0.9349, + "step": 14835 + }, + { + "epoch": 0.380946878696112, + "grad_norm": 0.74609375, + "learning_rate": 0.0001749750490987666, + "loss": 0.9692, + "step": 14836 + }, + { + "epoch": 0.3809725558920338, + "grad_norm": 0.8203125, + "learning_rate": 0.00017497209496967448, + "loss": 0.899, + "step": 14837 + }, + { + "epoch": 0.3809982330879556, + "grad_norm": 1.140625, + "learning_rate": 0.0001749691406911698, + "loss": 0.9712, + "step": 14838 + }, + { + "epoch": 0.38102391028387744, + "grad_norm": 0.80078125, + "learning_rate": 0.00017496618626325835, + "loss": 0.9254, + "step": 14839 + }, + { + "epoch": 0.3810495874797993, + "grad_norm": 0.8203125, + "learning_rate": 0.0001749632316859461, + "loss": 0.9793, + "step": 14840 + }, + { + "epoch": 0.3810752646757211, + "grad_norm": 0.80078125, + "learning_rate": 0.0001749602769592389, + "loss": 0.9698, + "step": 14841 + }, + { + "epoch": 0.3811009418716429, + "grad_norm": 0.734375, + "learning_rate": 0.0001749573220831426, + "loss": 0.85, + "step": 14842 + }, + { + "epoch": 0.38112661906756473, + "grad_norm": 0.82421875, + "learning_rate": 0.00017495436705766313, + "loss": 1.0802, + "step": 14843 + }, + { + "epoch": 0.38115229626348657, + "grad_norm": 0.8515625, + "learning_rate": 0.00017495141188280638, + "loss": 0.9133, + "step": 14844 + }, + { + "epoch": 0.38117797345940835, + "grad_norm": 0.765625, + "learning_rate": 0.0001749484565585783, + "loss": 1.0647, + "step": 14845 + }, + { + "epoch": 0.3812036506553302, + "grad_norm": 0.76953125, + "learning_rate": 0.00017494550108498462, + "loss": 1.0045, + "step": 14846 + }, + { + "epoch": 0.381229327851252, + "grad_norm": 0.86328125, + "learning_rate": 0.0001749425454620314, + "loss": 1.1342, + "step": 14847 + }, + { + "epoch": 0.3812550050471738, + "grad_norm": 0.6953125, + "learning_rate": 0.00017493958968972445, + "loss": 0.736, + "step": 14848 + }, + { + "epoch": 0.38128068224309564, + "grad_norm": 0.80078125, + "learning_rate": 0.00017493663376806962, + "loss": 0.9408, + "step": 14849 + }, + { + "epoch": 0.3813063594390175, + "grad_norm": 0.86328125, + "learning_rate": 0.00017493367769707288, + "loss": 0.9125, + "step": 14850 + }, + { + "epoch": 0.3813320366349393, + "grad_norm": 0.77734375, + "learning_rate": 0.00017493072147674007, + "loss": 1.0625, + "step": 14851 + }, + { + "epoch": 0.3813577138308611, + "grad_norm": 0.7421875, + "learning_rate": 0.00017492776510707713, + "loss": 0.8023, + "step": 14852 + }, + { + "epoch": 0.3813833910267829, + "grad_norm": 0.80078125, + "learning_rate": 0.0001749248085880899, + "loss": 0.9364, + "step": 14853 + }, + { + "epoch": 0.38140906822270476, + "grad_norm": 0.7578125, + "learning_rate": 0.00017492185191978428, + "loss": 0.8539, + "step": 14854 + }, + { + "epoch": 0.38143474541862654, + "grad_norm": 0.7890625, + "learning_rate": 0.00017491889510216622, + "loss": 0.9575, + "step": 14855 + }, + { + "epoch": 0.3814604226145484, + "grad_norm": 0.828125, + "learning_rate": 0.00017491593813524154, + "loss": 0.8829, + "step": 14856 + }, + { + "epoch": 0.3814860998104702, + "grad_norm": 0.77734375, + "learning_rate": 0.00017491298101901615, + "loss": 0.7491, + "step": 14857 + }, + { + "epoch": 0.381511777006392, + "grad_norm": 0.81640625, + "learning_rate": 0.00017491002375349601, + "loss": 1.0953, + "step": 14858 + }, + { + "epoch": 0.38153745420231383, + "grad_norm": 0.7265625, + "learning_rate": 0.00017490706633868693, + "loss": 0.9374, + "step": 14859 + }, + { + "epoch": 0.38156313139823567, + "grad_norm": 0.77734375, + "learning_rate": 0.0001749041087745948, + "loss": 0.8296, + "step": 14860 + }, + { + "epoch": 0.3815888085941575, + "grad_norm": 0.75, + "learning_rate": 0.0001749011510612256, + "loss": 0.9425, + "step": 14861 + }, + { + "epoch": 0.3816144857900793, + "grad_norm": 0.73828125, + "learning_rate": 0.00017489819319858513, + "loss": 0.9379, + "step": 14862 + }, + { + "epoch": 0.3816401629860011, + "grad_norm": 1.2734375, + "learning_rate": 0.00017489523518667937, + "loss": 0.9191, + "step": 14863 + }, + { + "epoch": 0.38166584018192296, + "grad_norm": 0.7734375, + "learning_rate": 0.00017489227702551413, + "loss": 0.8012, + "step": 14864 + }, + { + "epoch": 0.38169151737784474, + "grad_norm": 0.7734375, + "learning_rate": 0.00017488931871509534, + "loss": 0.8978, + "step": 14865 + }, + { + "epoch": 0.38171719457376657, + "grad_norm": 0.91015625, + "learning_rate": 0.00017488636025542891, + "loss": 1.0197, + "step": 14866 + }, + { + "epoch": 0.3817428717696884, + "grad_norm": 0.87890625, + "learning_rate": 0.00017488340164652078, + "loss": 0.8448, + "step": 14867 + }, + { + "epoch": 0.3817685489656102, + "grad_norm": 0.8359375, + "learning_rate": 0.00017488044288837675, + "loss": 1.0791, + "step": 14868 + }, + { + "epoch": 0.381794226161532, + "grad_norm": 0.80859375, + "learning_rate": 0.00017487748398100278, + "loss": 1.0094, + "step": 14869 + }, + { + "epoch": 0.38181990335745386, + "grad_norm": 0.859375, + "learning_rate": 0.00017487452492440472, + "loss": 1.0171, + "step": 14870 + }, + { + "epoch": 0.38184558055337564, + "grad_norm": 0.84765625, + "learning_rate": 0.0001748715657185885, + "loss": 0.9714, + "step": 14871 + }, + { + "epoch": 0.3818712577492975, + "grad_norm": 0.76171875, + "learning_rate": 0.00017486860636356004, + "loss": 0.8072, + "step": 14872 + }, + { + "epoch": 0.3818969349452193, + "grad_norm": 0.68359375, + "learning_rate": 0.00017486564685932518, + "loss": 0.9035, + "step": 14873 + }, + { + "epoch": 0.38192261214114115, + "grad_norm": 0.8125, + "learning_rate": 0.00017486268720588987, + "loss": 0.9842, + "step": 14874 + }, + { + "epoch": 0.38194828933706293, + "grad_norm": 0.91015625, + "learning_rate": 0.00017485972740326, + "loss": 0.9011, + "step": 14875 + }, + { + "epoch": 0.38197396653298477, + "grad_norm": 0.82421875, + "learning_rate": 0.0001748567674514414, + "loss": 1.0384, + "step": 14876 + }, + { + "epoch": 0.3819996437289066, + "grad_norm": 0.78125, + "learning_rate": 0.00017485380735044007, + "loss": 0.835, + "step": 14877 + }, + { + "epoch": 0.3820253209248284, + "grad_norm": 0.83203125, + "learning_rate": 0.00017485084710026184, + "loss": 0.9816, + "step": 14878 + }, + { + "epoch": 0.3820509981207502, + "grad_norm": 0.796875, + "learning_rate": 0.00017484788670091267, + "loss": 0.8621, + "step": 14879 + }, + { + "epoch": 0.38207667531667205, + "grad_norm": 0.796875, + "learning_rate": 0.0001748449261523984, + "loss": 0.9869, + "step": 14880 + }, + { + "epoch": 0.38210235251259383, + "grad_norm": 0.86328125, + "learning_rate": 0.00017484196545472494, + "loss": 0.8925, + "step": 14881 + }, + { + "epoch": 0.38212802970851567, + "grad_norm": 0.79296875, + "learning_rate": 0.00017483900460789822, + "loss": 0.8812, + "step": 14882 + }, + { + "epoch": 0.3821537069044375, + "grad_norm": 0.7421875, + "learning_rate": 0.00017483604361192412, + "loss": 0.8509, + "step": 14883 + }, + { + "epoch": 0.38217938410035934, + "grad_norm": 2.671875, + "learning_rate": 0.00017483308246680855, + "loss": 0.9273, + "step": 14884 + }, + { + "epoch": 0.3822050612962811, + "grad_norm": 0.7890625, + "learning_rate": 0.0001748301211725574, + "loss": 0.8523, + "step": 14885 + }, + { + "epoch": 0.38223073849220296, + "grad_norm": 0.93359375, + "learning_rate": 0.0001748271597291766, + "loss": 1.0873, + "step": 14886 + }, + { + "epoch": 0.3822564156881248, + "grad_norm": 0.828125, + "learning_rate": 0.00017482419813667196, + "loss": 1.0245, + "step": 14887 + }, + { + "epoch": 0.3822820928840466, + "grad_norm": 0.76171875, + "learning_rate": 0.00017482123639504953, + "loss": 0.9788, + "step": 14888 + }, + { + "epoch": 0.3823077700799684, + "grad_norm": 0.7578125, + "learning_rate": 0.0001748182745043151, + "loss": 0.8743, + "step": 14889 + }, + { + "epoch": 0.38233344727589025, + "grad_norm": 0.8046875, + "learning_rate": 0.0001748153124644746, + "loss": 0.8426, + "step": 14890 + }, + { + "epoch": 0.38235912447181203, + "grad_norm": 0.73046875, + "learning_rate": 0.00017481235027553397, + "loss": 0.9836, + "step": 14891 + }, + { + "epoch": 0.38238480166773386, + "grad_norm": 0.77734375, + "learning_rate": 0.00017480938793749905, + "loss": 1.0382, + "step": 14892 + }, + { + "epoch": 0.3824104788636557, + "grad_norm": 0.76953125, + "learning_rate": 0.0001748064254503758, + "loss": 0.9119, + "step": 14893 + }, + { + "epoch": 0.38243615605957754, + "grad_norm": 0.76953125, + "learning_rate": 0.0001748034628141701, + "loss": 0.9924, + "step": 14894 + }, + { + "epoch": 0.3824618332554993, + "grad_norm": 0.78125, + "learning_rate": 0.00017480050002888784, + "loss": 1.0503, + "step": 14895 + }, + { + "epoch": 0.38248751045142115, + "grad_norm": 0.78515625, + "learning_rate": 0.00017479753709453494, + "loss": 0.7456, + "step": 14896 + }, + { + "epoch": 0.382513187647343, + "grad_norm": 0.796875, + "learning_rate": 0.00017479457401111735, + "loss": 0.8982, + "step": 14897 + }, + { + "epoch": 0.38253886484326477, + "grad_norm": 0.875, + "learning_rate": 0.00017479161077864087, + "loss": 0.9297, + "step": 14898 + }, + { + "epoch": 0.3825645420391866, + "grad_norm": 0.7578125, + "learning_rate": 0.0001747886473971115, + "loss": 0.8997, + "step": 14899 + }, + { + "epoch": 0.38259021923510844, + "grad_norm": 0.84765625, + "learning_rate": 0.0001747856838665351, + "loss": 0.9157, + "step": 14900 + }, + { + "epoch": 0.3826158964310302, + "grad_norm": 0.77734375, + "learning_rate": 0.00017478272018691757, + "loss": 0.8334, + "step": 14901 + }, + { + "epoch": 0.38264157362695206, + "grad_norm": 0.80078125, + "learning_rate": 0.00017477975635826488, + "loss": 0.9387, + "step": 14902 + }, + { + "epoch": 0.3826672508228739, + "grad_norm": 0.78125, + "learning_rate": 0.00017477679238058286, + "loss": 0.8856, + "step": 14903 + }, + { + "epoch": 0.38269292801879573, + "grad_norm": 0.82421875, + "learning_rate": 0.00017477382825387746, + "loss": 0.895, + "step": 14904 + }, + { + "epoch": 0.3827186052147175, + "grad_norm": 0.75, + "learning_rate": 0.00017477086397815454, + "loss": 1.0337, + "step": 14905 + }, + { + "epoch": 0.38274428241063935, + "grad_norm": 0.8046875, + "learning_rate": 0.00017476789955342005, + "loss": 1.0093, + "step": 14906 + }, + { + "epoch": 0.3827699596065612, + "grad_norm": 0.7421875, + "learning_rate": 0.0001747649349796799, + "loss": 1.0566, + "step": 14907 + }, + { + "epoch": 0.38279563680248296, + "grad_norm": 0.77734375, + "learning_rate": 0.00017476197025694003, + "loss": 0.7714, + "step": 14908 + }, + { + "epoch": 0.3828213139984048, + "grad_norm": 0.8359375, + "learning_rate": 0.00017475900538520628, + "loss": 1.0442, + "step": 14909 + }, + { + "epoch": 0.38284699119432664, + "grad_norm": 0.8125, + "learning_rate": 0.00017475604036448457, + "loss": 0.8771, + "step": 14910 + }, + { + "epoch": 0.3828726683902484, + "grad_norm": 0.8671875, + "learning_rate": 0.00017475307519478083, + "loss": 1.209, + "step": 14911 + }, + { + "epoch": 0.38289834558617025, + "grad_norm": 0.74609375, + "learning_rate": 0.00017475010987610097, + "loss": 1.0169, + "step": 14912 + }, + { + "epoch": 0.3829240227820921, + "grad_norm": 0.875, + "learning_rate": 0.00017474714440845088, + "loss": 0.9369, + "step": 14913 + }, + { + "epoch": 0.3829496999780139, + "grad_norm": 0.77734375, + "learning_rate": 0.00017474417879183649, + "loss": 0.9105, + "step": 14914 + }, + { + "epoch": 0.3829753771739357, + "grad_norm": 0.7578125, + "learning_rate": 0.0001747412130262637, + "loss": 0.98, + "step": 14915 + }, + { + "epoch": 0.38300105436985754, + "grad_norm": 0.75390625, + "learning_rate": 0.00017473824711173843, + "loss": 0.8227, + "step": 14916 + }, + { + "epoch": 0.3830267315657794, + "grad_norm": 0.7109375, + "learning_rate": 0.00017473528104826658, + "loss": 0.8121, + "step": 14917 + }, + { + "epoch": 0.38305240876170116, + "grad_norm": 0.85546875, + "learning_rate": 0.0001747323148358541, + "loss": 0.9062, + "step": 14918 + }, + { + "epoch": 0.383078085957623, + "grad_norm": 0.83203125, + "learning_rate": 0.0001747293484745068, + "loss": 0.9027, + "step": 14919 + }, + { + "epoch": 0.38310376315354483, + "grad_norm": 0.921875, + "learning_rate": 0.00017472638196423072, + "loss": 0.9024, + "step": 14920 + }, + { + "epoch": 0.3831294403494666, + "grad_norm": 0.77734375, + "learning_rate": 0.00017472341530503166, + "loss": 1.0472, + "step": 14921 + }, + { + "epoch": 0.38315511754538845, + "grad_norm": 0.81640625, + "learning_rate": 0.0001747204484969156, + "loss": 0.9354, + "step": 14922 + }, + { + "epoch": 0.3831807947413103, + "grad_norm": 0.9453125, + "learning_rate": 0.00017471748153988846, + "loss": 0.8977, + "step": 14923 + }, + { + "epoch": 0.3832064719372321, + "grad_norm": 0.77734375, + "learning_rate": 0.00017471451443395612, + "loss": 0.9194, + "step": 14924 + }, + { + "epoch": 0.3832321491331539, + "grad_norm": 0.765625, + "learning_rate": 0.00017471154717912447, + "loss": 0.8991, + "step": 14925 + }, + { + "epoch": 0.38325782632907573, + "grad_norm": 0.7578125, + "learning_rate": 0.00017470857977539952, + "loss": 0.9591, + "step": 14926 + }, + { + "epoch": 0.38328350352499757, + "grad_norm": 0.78515625, + "learning_rate": 0.00017470561222278705, + "loss": 0.9703, + "step": 14927 + }, + { + "epoch": 0.38330918072091935, + "grad_norm": 0.85546875, + "learning_rate": 0.00017470264452129308, + "loss": 0.921, + "step": 14928 + }, + { + "epoch": 0.3833348579168412, + "grad_norm": 1.0703125, + "learning_rate": 0.00017469967667092345, + "loss": 0.9942, + "step": 14929 + }, + { + "epoch": 0.383360535112763, + "grad_norm": 0.86328125, + "learning_rate": 0.00017469670867168417, + "loss": 0.9638, + "step": 14930 + }, + { + "epoch": 0.3833862123086848, + "grad_norm": 0.7421875, + "learning_rate": 0.00017469374052358108, + "loss": 0.9197, + "step": 14931 + }, + { + "epoch": 0.38341188950460664, + "grad_norm": 0.71875, + "learning_rate": 0.0001746907722266201, + "loss": 0.9833, + "step": 14932 + }, + { + "epoch": 0.3834375667005285, + "grad_norm": 0.8046875, + "learning_rate": 0.00017468780378080716, + "loss": 0.9549, + "step": 14933 + }, + { + "epoch": 0.3834632438964503, + "grad_norm": 0.7421875, + "learning_rate": 0.00017468483518614814, + "loss": 0.9197, + "step": 14934 + }, + { + "epoch": 0.3834889210923721, + "grad_norm": 0.76953125, + "learning_rate": 0.000174681866442649, + "loss": 0.8714, + "step": 14935 + }, + { + "epoch": 0.38351459828829393, + "grad_norm": 0.75, + "learning_rate": 0.0001746788975503157, + "loss": 0.9667, + "step": 14936 + }, + { + "epoch": 0.38354027548421576, + "grad_norm": 0.68359375, + "learning_rate": 0.00017467592850915406, + "loss": 0.8203, + "step": 14937 + }, + { + "epoch": 0.38356595268013755, + "grad_norm": 0.7890625, + "learning_rate": 0.00017467295931917004, + "loss": 0.8544, + "step": 14938 + }, + { + "epoch": 0.3835916298760594, + "grad_norm": 0.85546875, + "learning_rate": 0.00017466998998036956, + "loss": 0.8821, + "step": 14939 + }, + { + "epoch": 0.3836173070719812, + "grad_norm": 0.796875, + "learning_rate": 0.00017466702049275857, + "loss": 0.8857, + "step": 14940 + }, + { + "epoch": 0.383642984267903, + "grad_norm": 0.8203125, + "learning_rate": 0.00017466405085634292, + "loss": 0.9481, + "step": 14941 + }, + { + "epoch": 0.38366866146382483, + "grad_norm": 0.71875, + "learning_rate": 0.00017466108107112857, + "loss": 0.9484, + "step": 14942 + }, + { + "epoch": 0.38369433865974667, + "grad_norm": 0.83203125, + "learning_rate": 0.0001746581111371214, + "loss": 0.9017, + "step": 14943 + }, + { + "epoch": 0.3837200158556685, + "grad_norm": 0.80078125, + "learning_rate": 0.0001746551410543274, + "loss": 0.8456, + "step": 14944 + }, + { + "epoch": 0.3837456930515903, + "grad_norm": 0.83984375, + "learning_rate": 0.00017465217082275242, + "loss": 1.0236, + "step": 14945 + }, + { + "epoch": 0.3837713702475121, + "grad_norm": 0.828125, + "learning_rate": 0.00017464920044240242, + "loss": 1.0542, + "step": 14946 + }, + { + "epoch": 0.38379704744343396, + "grad_norm": 0.81640625, + "learning_rate": 0.0001746462299132833, + "loss": 0.9662, + "step": 14947 + }, + { + "epoch": 0.38382272463935574, + "grad_norm": 0.80859375, + "learning_rate": 0.000174643259235401, + "loss": 0.8384, + "step": 14948 + }, + { + "epoch": 0.3838484018352776, + "grad_norm": 0.8203125, + "learning_rate": 0.00017464028840876144, + "loss": 0.9871, + "step": 14949 + }, + { + "epoch": 0.3838740790311994, + "grad_norm": 0.76171875, + "learning_rate": 0.0001746373174333705, + "loss": 0.9737, + "step": 14950 + }, + { + "epoch": 0.3838997562271212, + "grad_norm": 0.8359375, + "learning_rate": 0.00017463434630923412, + "loss": 1.0791, + "step": 14951 + }, + { + "epoch": 0.383925433423043, + "grad_norm": 0.78125, + "learning_rate": 0.00017463137503635824, + "loss": 0.9457, + "step": 14952 + }, + { + "epoch": 0.38395111061896486, + "grad_norm": 0.76953125, + "learning_rate": 0.0001746284036147488, + "loss": 0.8467, + "step": 14953 + }, + { + "epoch": 0.3839767878148867, + "grad_norm": 0.7578125, + "learning_rate": 0.00017462543204441164, + "loss": 0.8898, + "step": 14954 + }, + { + "epoch": 0.3840024650108085, + "grad_norm": 0.734375, + "learning_rate": 0.00017462246032535277, + "loss": 0.9067, + "step": 14955 + }, + { + "epoch": 0.3840281422067303, + "grad_norm": 0.84375, + "learning_rate": 0.00017461948845757809, + "loss": 0.9075, + "step": 14956 + }, + { + "epoch": 0.38405381940265215, + "grad_norm": 0.74609375, + "learning_rate": 0.00017461651644109347, + "loss": 0.9335, + "step": 14957 + }, + { + "epoch": 0.38407949659857393, + "grad_norm": 0.734375, + "learning_rate": 0.0001746135442759049, + "loss": 0.8426, + "step": 14958 + }, + { + "epoch": 0.38410517379449577, + "grad_norm": 0.7578125, + "learning_rate": 0.00017461057196201826, + "loss": 1.0054, + "step": 14959 + }, + { + "epoch": 0.3841308509904176, + "grad_norm": 0.80859375, + "learning_rate": 0.0001746075994994395, + "loss": 0.9298, + "step": 14960 + }, + { + "epoch": 0.3841565281863394, + "grad_norm": 0.8046875, + "learning_rate": 0.0001746046268881745, + "loss": 0.8936, + "step": 14961 + }, + { + "epoch": 0.3841822053822612, + "grad_norm": 0.7265625, + "learning_rate": 0.00017460165412822926, + "loss": 0.9468, + "step": 14962 + }, + { + "epoch": 0.38420788257818306, + "grad_norm": 0.8125, + "learning_rate": 0.00017459868121960965, + "loss": 0.8773, + "step": 14963 + }, + { + "epoch": 0.3842335597741049, + "grad_norm": 0.75, + "learning_rate": 0.0001745957081623216, + "loss": 1.0215, + "step": 14964 + }, + { + "epoch": 0.3842592369700267, + "grad_norm": 0.7578125, + "learning_rate": 0.00017459273495637104, + "loss": 1.0255, + "step": 14965 + }, + { + "epoch": 0.3842849141659485, + "grad_norm": 0.71875, + "learning_rate": 0.00017458976160176395, + "loss": 0.866, + "step": 14966 + }, + { + "epoch": 0.38431059136187035, + "grad_norm": 0.7734375, + "learning_rate": 0.00017458678809850614, + "loss": 0.8721, + "step": 14967 + }, + { + "epoch": 0.3843362685577921, + "grad_norm": 0.83984375, + "learning_rate": 0.0001745838144466036, + "loss": 0.9493, + "step": 14968 + }, + { + "epoch": 0.38436194575371396, + "grad_norm": 0.76953125, + "learning_rate": 0.00017458084064606228, + "loss": 0.9205, + "step": 14969 + }, + { + "epoch": 0.3843876229496358, + "grad_norm": 0.7734375, + "learning_rate": 0.00017457786669688806, + "loss": 0.8886, + "step": 14970 + }, + { + "epoch": 0.3844133001455576, + "grad_norm": 0.79296875, + "learning_rate": 0.00017457489259908691, + "loss": 0.9486, + "step": 14971 + }, + { + "epoch": 0.3844389773414794, + "grad_norm": 0.80859375, + "learning_rate": 0.00017457191835266477, + "loss": 0.8954, + "step": 14972 + }, + { + "epoch": 0.38446465453740125, + "grad_norm": 0.765625, + "learning_rate": 0.00017456894395762745, + "loss": 1.0517, + "step": 14973 + }, + { + "epoch": 0.3844903317333231, + "grad_norm": 0.78125, + "learning_rate": 0.000174565969413981, + "loss": 0.9068, + "step": 14974 + }, + { + "epoch": 0.38451600892924487, + "grad_norm": 0.85546875, + "learning_rate": 0.00017456299472173132, + "loss": 0.8662, + "step": 14975 + }, + { + "epoch": 0.3845416861251667, + "grad_norm": 0.796875, + "learning_rate": 0.00017456001988088433, + "loss": 0.8635, + "step": 14976 + }, + { + "epoch": 0.38456736332108854, + "grad_norm": 0.78515625, + "learning_rate": 0.00017455704489144594, + "loss": 0.959, + "step": 14977 + }, + { + "epoch": 0.3845930405170103, + "grad_norm": 0.7734375, + "learning_rate": 0.00017455406975342208, + "loss": 0.9209, + "step": 14978 + }, + { + "epoch": 0.38461871771293216, + "grad_norm": 0.83984375, + "learning_rate": 0.00017455109446681873, + "loss": 0.8475, + "step": 14979 + }, + { + "epoch": 0.384644394908854, + "grad_norm": 0.765625, + "learning_rate": 0.00017454811903164176, + "loss": 0.9711, + "step": 14980 + }, + { + "epoch": 0.3846700721047758, + "grad_norm": 0.80078125, + "learning_rate": 0.00017454514344789713, + "loss": 0.9029, + "step": 14981 + }, + { + "epoch": 0.3846957493006976, + "grad_norm": 0.8203125, + "learning_rate": 0.00017454216771559073, + "loss": 1.1088, + "step": 14982 + }, + { + "epoch": 0.38472142649661945, + "grad_norm": 0.7265625, + "learning_rate": 0.00017453919183472855, + "loss": 1.0394, + "step": 14983 + }, + { + "epoch": 0.3847471036925413, + "grad_norm": 0.86328125, + "learning_rate": 0.0001745362158053165, + "loss": 0.8825, + "step": 14984 + }, + { + "epoch": 0.38477278088846306, + "grad_norm": 0.73828125, + "learning_rate": 0.0001745332396273605, + "loss": 1.1211, + "step": 14985 + }, + { + "epoch": 0.3847984580843849, + "grad_norm": 0.73046875, + "learning_rate": 0.00017453026330086649, + "loss": 0.9627, + "step": 14986 + }, + { + "epoch": 0.38482413528030673, + "grad_norm": 0.8359375, + "learning_rate": 0.00017452728682584037, + "loss": 1.0433, + "step": 14987 + }, + { + "epoch": 0.3848498124762285, + "grad_norm": 0.734375, + "learning_rate": 0.00017452431020228816, + "loss": 0.9376, + "step": 14988 + }, + { + "epoch": 0.38487548967215035, + "grad_norm": 0.84375, + "learning_rate": 0.0001745213334302157, + "loss": 0.8894, + "step": 14989 + }, + { + "epoch": 0.3849011668680722, + "grad_norm": 0.69921875, + "learning_rate": 0.00017451835650962895, + "loss": 0.8638, + "step": 14990 + }, + { + "epoch": 0.38492684406399397, + "grad_norm": 0.7265625, + "learning_rate": 0.00017451537944053382, + "loss": 0.8519, + "step": 14991 + }, + { + "epoch": 0.3849525212599158, + "grad_norm": 0.78125, + "learning_rate": 0.0001745124022229363, + "loss": 1.0304, + "step": 14992 + }, + { + "epoch": 0.38497819845583764, + "grad_norm": 0.85546875, + "learning_rate": 0.00017450942485684227, + "loss": 1.0858, + "step": 14993 + }, + { + "epoch": 0.3850038756517595, + "grad_norm": 0.7734375, + "learning_rate": 0.0001745064473422577, + "loss": 0.9716, + "step": 14994 + }, + { + "epoch": 0.38502955284768126, + "grad_norm": 0.75390625, + "learning_rate": 0.0001745034696791885, + "loss": 0.8473, + "step": 14995 + }, + { + "epoch": 0.3850552300436031, + "grad_norm": 0.7421875, + "learning_rate": 0.00017450049186764062, + "loss": 0.949, + "step": 14996 + }, + { + "epoch": 0.3850809072395249, + "grad_norm": 0.8984375, + "learning_rate": 0.00017449751390762, + "loss": 0.8405, + "step": 14997 + }, + { + "epoch": 0.3851065844354467, + "grad_norm": 0.85546875, + "learning_rate": 0.0001744945357991326, + "loss": 1.0058, + "step": 14998 + }, + { + "epoch": 0.38513226163136854, + "grad_norm": 0.734375, + "learning_rate": 0.00017449155754218428, + "loss": 0.9201, + "step": 14999 + }, + { + "epoch": 0.3851579388272904, + "grad_norm": 0.76953125, + "learning_rate": 0.000174488579136781, + "loss": 0.9798, + "step": 15000 + }, + { + "epoch": 0.3851579388272904, + "eval_loss": 0.9327870011329651, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 405.5102, + "eval_samples_per_second": 24.66, + "eval_steps_per_second": 0.772, + "step": 15000 + }, + { + "epoch": 0.38518361602321216, + "grad_norm": 0.765625, + "learning_rate": 0.00017448560058292873, + "loss": 1.0399, + "step": 15001 + }, + { + "epoch": 0.385209293219134, + "grad_norm": 0.8671875, + "learning_rate": 0.0001744826218806334, + "loss": 0.9164, + "step": 15002 + }, + { + "epoch": 0.38523497041505583, + "grad_norm": 0.703125, + "learning_rate": 0.0001744796430299009, + "loss": 0.9123, + "step": 15003 + }, + { + "epoch": 0.38526064761097767, + "grad_norm": 0.828125, + "learning_rate": 0.0001744766640307372, + "loss": 0.8372, + "step": 15004 + }, + { + "epoch": 0.38528632480689945, + "grad_norm": 0.8515625, + "learning_rate": 0.00017447368488314826, + "loss": 0.9548, + "step": 15005 + }, + { + "epoch": 0.3853120020028213, + "grad_norm": 0.83984375, + "learning_rate": 0.00017447070558714, + "loss": 1.0119, + "step": 15006 + }, + { + "epoch": 0.3853376791987431, + "grad_norm": 0.73046875, + "learning_rate": 0.00017446772614271834, + "loss": 0.7284, + "step": 15007 + }, + { + "epoch": 0.3853633563946649, + "grad_norm": 0.75, + "learning_rate": 0.0001744647465498892, + "loss": 0.8975, + "step": 15008 + }, + { + "epoch": 0.38538903359058674, + "grad_norm": 0.796875, + "learning_rate": 0.00017446176680865858, + "loss": 0.8627, + "step": 15009 + }, + { + "epoch": 0.3854147107865086, + "grad_norm": 0.82421875, + "learning_rate": 0.00017445878691903238, + "loss": 0.9364, + "step": 15010 + }, + { + "epoch": 0.38544038798243035, + "grad_norm": 0.98046875, + "learning_rate": 0.00017445580688101652, + "loss": 0.7977, + "step": 15011 + }, + { + "epoch": 0.3854660651783522, + "grad_norm": 0.7578125, + "learning_rate": 0.000174452826694617, + "loss": 0.8968, + "step": 15012 + }, + { + "epoch": 0.385491742374274, + "grad_norm": 0.80859375, + "learning_rate": 0.00017444984635983968, + "loss": 0.8883, + "step": 15013 + }, + { + "epoch": 0.38551741957019586, + "grad_norm": 0.7265625, + "learning_rate": 0.00017444686587669056, + "loss": 0.8507, + "step": 15014 + }, + { + "epoch": 0.38554309676611764, + "grad_norm": 0.79296875, + "learning_rate": 0.00017444388524517557, + "loss": 0.894, + "step": 15015 + }, + { + "epoch": 0.3855687739620395, + "grad_norm": 0.78125, + "learning_rate": 0.00017444090446530063, + "loss": 0.8655, + "step": 15016 + }, + { + "epoch": 0.3855944511579613, + "grad_norm": 0.7578125, + "learning_rate": 0.0001744379235370717, + "loss": 0.9672, + "step": 15017 + }, + { + "epoch": 0.3856201283538831, + "grad_norm": 0.828125, + "learning_rate": 0.00017443494246049471, + "loss": 0.9177, + "step": 15018 + }, + { + "epoch": 0.38564580554980493, + "grad_norm": 0.82421875, + "learning_rate": 0.0001744319612355756, + "loss": 1.071, + "step": 15019 + }, + { + "epoch": 0.38567148274572677, + "grad_norm": 0.85546875, + "learning_rate": 0.00017442897986232032, + "loss": 0.9983, + "step": 15020 + }, + { + "epoch": 0.38569715994164855, + "grad_norm": 0.71875, + "learning_rate": 0.0001744259983407348, + "loss": 0.9532, + "step": 15021 + }, + { + "epoch": 0.3857228371375704, + "grad_norm": 0.82421875, + "learning_rate": 0.00017442301667082498, + "loss": 1.0113, + "step": 15022 + }, + { + "epoch": 0.3857485143334922, + "grad_norm": 0.7890625, + "learning_rate": 0.00017442003485259683, + "loss": 0.9259, + "step": 15023 + }, + { + "epoch": 0.38577419152941406, + "grad_norm": 0.7890625, + "learning_rate": 0.00017441705288605627, + "loss": 1.0487, + "step": 15024 + }, + { + "epoch": 0.38579986872533584, + "grad_norm": 0.73046875, + "learning_rate": 0.00017441407077120925, + "loss": 0.8414, + "step": 15025 + }, + { + "epoch": 0.3858255459212577, + "grad_norm": 0.80078125, + "learning_rate": 0.0001744110885080617, + "loss": 1.0196, + "step": 15026 + }, + { + "epoch": 0.3858512231171795, + "grad_norm": 0.72265625, + "learning_rate": 0.00017440810609661958, + "loss": 0.9014, + "step": 15027 + }, + { + "epoch": 0.3858769003131013, + "grad_norm": 0.76953125, + "learning_rate": 0.00017440512353688882, + "loss": 0.8484, + "step": 15028 + }, + { + "epoch": 0.3859025775090231, + "grad_norm": 0.73046875, + "learning_rate": 0.00017440214082887535, + "loss": 0.9361, + "step": 15029 + }, + { + "epoch": 0.38592825470494496, + "grad_norm": 0.81640625, + "learning_rate": 0.00017439915797258515, + "loss": 0.9117, + "step": 15030 + }, + { + "epoch": 0.38595393190086674, + "grad_norm": 0.828125, + "learning_rate": 0.00017439617496802417, + "loss": 0.9058, + "step": 15031 + }, + { + "epoch": 0.3859796090967886, + "grad_norm": 0.78125, + "learning_rate": 0.00017439319181519828, + "loss": 1.0102, + "step": 15032 + }, + { + "epoch": 0.3860052862927104, + "grad_norm": 0.8515625, + "learning_rate": 0.00017439020851411354, + "loss": 0.8788, + "step": 15033 + }, + { + "epoch": 0.38603096348863225, + "grad_norm": 0.84375, + "learning_rate": 0.00017438722506477579, + "loss": 1.0011, + "step": 15034 + }, + { + "epoch": 0.38605664068455403, + "grad_norm": 0.88671875, + "learning_rate": 0.00017438424146719103, + "loss": 1.0355, + "step": 15035 + }, + { + "epoch": 0.38608231788047587, + "grad_norm": 0.7578125, + "learning_rate": 0.00017438125772136523, + "loss": 0.9049, + "step": 15036 + }, + { + "epoch": 0.3861079950763977, + "grad_norm": 0.80078125, + "learning_rate": 0.00017437827382730425, + "loss": 0.9493, + "step": 15037 + }, + { + "epoch": 0.3861336722723195, + "grad_norm": 0.81640625, + "learning_rate": 0.00017437528978501412, + "loss": 1.1483, + "step": 15038 + }, + { + "epoch": 0.3861593494682413, + "grad_norm": 0.78125, + "learning_rate": 0.00017437230559450074, + "loss": 0.9302, + "step": 15039 + }, + { + "epoch": 0.38618502666416316, + "grad_norm": 0.75390625, + "learning_rate": 0.00017436932125577007, + "loss": 0.9327, + "step": 15040 + }, + { + "epoch": 0.38621070386008494, + "grad_norm": 0.73828125, + "learning_rate": 0.0001743663367688281, + "loss": 0.9586, + "step": 15041 + }, + { + "epoch": 0.3862363810560068, + "grad_norm": 0.76171875, + "learning_rate": 0.00017436335213368072, + "loss": 0.9539, + "step": 15042 + }, + { + "epoch": 0.3862620582519286, + "grad_norm": 0.8125, + "learning_rate": 0.00017436036735033389, + "loss": 1.0519, + "step": 15043 + }, + { + "epoch": 0.38628773544785044, + "grad_norm": 0.75, + "learning_rate": 0.00017435738241879353, + "loss": 1.0513, + "step": 15044 + }, + { + "epoch": 0.3863134126437722, + "grad_norm": 0.73828125, + "learning_rate": 0.00017435439733906567, + "loss": 0.8765, + "step": 15045 + }, + { + "epoch": 0.38633908983969406, + "grad_norm": 0.84765625, + "learning_rate": 0.0001743514121111562, + "loss": 1.0309, + "step": 15046 + }, + { + "epoch": 0.3863647670356159, + "grad_norm": 0.77734375, + "learning_rate": 0.00017434842673507106, + "loss": 1.0399, + "step": 15047 + }, + { + "epoch": 0.3863904442315377, + "grad_norm": 0.7890625, + "learning_rate": 0.00017434544121081624, + "loss": 0.9019, + "step": 15048 + }, + { + "epoch": 0.3864161214274595, + "grad_norm": 0.8046875, + "learning_rate": 0.00017434245553839769, + "loss": 0.8134, + "step": 15049 + }, + { + "epoch": 0.38644179862338135, + "grad_norm": 0.80078125, + "learning_rate": 0.00017433946971782128, + "loss": 1.0141, + "step": 15050 + }, + { + "epoch": 0.38646747581930313, + "grad_norm": 0.7421875, + "learning_rate": 0.00017433648374909312, + "loss": 1.0336, + "step": 15051 + }, + { + "epoch": 0.38649315301522497, + "grad_norm": 0.80859375, + "learning_rate": 0.00017433349763221897, + "loss": 0.8711, + "step": 15052 + }, + { + "epoch": 0.3865188302111468, + "grad_norm": 0.95703125, + "learning_rate": 0.0001743305113672049, + "loss": 0.9413, + "step": 15053 + }, + { + "epoch": 0.38654450740706864, + "grad_norm": 0.8203125, + "learning_rate": 0.00017432752495405687, + "loss": 0.9281, + "step": 15054 + }, + { + "epoch": 0.3865701846029904, + "grad_norm": 0.75, + "learning_rate": 0.00017432453839278078, + "loss": 0.7597, + "step": 15055 + }, + { + "epoch": 0.38659586179891225, + "grad_norm": 0.78125, + "learning_rate": 0.00017432155168338258, + "loss": 0.8962, + "step": 15056 + }, + { + "epoch": 0.3866215389948341, + "grad_norm": 0.7578125, + "learning_rate": 0.00017431856482586825, + "loss": 0.9426, + "step": 15057 + }, + { + "epoch": 0.38664721619075587, + "grad_norm": 0.7890625, + "learning_rate": 0.00017431557782024375, + "loss": 0.9328, + "step": 15058 + }, + { + "epoch": 0.3866728933866777, + "grad_norm": 0.8046875, + "learning_rate": 0.000174312590666515, + "loss": 0.9709, + "step": 15059 + }, + { + "epoch": 0.38669857058259954, + "grad_norm": 0.7578125, + "learning_rate": 0.00017430960336468795, + "loss": 1.008, + "step": 15060 + }, + { + "epoch": 0.3867242477785213, + "grad_norm": 0.76953125, + "learning_rate": 0.00017430661591476862, + "loss": 0.9614, + "step": 15061 + }, + { + "epoch": 0.38674992497444316, + "grad_norm": 0.8125, + "learning_rate": 0.00017430362831676288, + "loss": 0.8424, + "step": 15062 + }, + { + "epoch": 0.386775602170365, + "grad_norm": 0.828125, + "learning_rate": 0.00017430064057067672, + "loss": 1.0196, + "step": 15063 + }, + { + "epoch": 0.38680127936628683, + "grad_norm": 0.796875, + "learning_rate": 0.00017429765267651612, + "loss": 0.8839, + "step": 15064 + }, + { + "epoch": 0.3868269565622086, + "grad_norm": 0.78515625, + "learning_rate": 0.00017429466463428697, + "loss": 0.8208, + "step": 15065 + }, + { + "epoch": 0.38685263375813045, + "grad_norm": 0.75, + "learning_rate": 0.00017429167644399533, + "loss": 0.8982, + "step": 15066 + }, + { + "epoch": 0.3868783109540523, + "grad_norm": 0.80078125, + "learning_rate": 0.00017428868810564704, + "loss": 0.8308, + "step": 15067 + }, + { + "epoch": 0.38690398814997407, + "grad_norm": 0.79296875, + "learning_rate": 0.00017428569961924813, + "loss": 0.8086, + "step": 15068 + }, + { + "epoch": 0.3869296653458959, + "grad_norm": 0.85546875, + "learning_rate": 0.0001742827109848045, + "loss": 0.9043, + "step": 15069 + }, + { + "epoch": 0.38695534254181774, + "grad_norm": 0.80078125, + "learning_rate": 0.00017427972220232217, + "loss": 0.8983, + "step": 15070 + }, + { + "epoch": 0.3869810197377395, + "grad_norm": 0.8671875, + "learning_rate": 0.00017427673327180703, + "loss": 0.9857, + "step": 15071 + }, + { + "epoch": 0.38700669693366135, + "grad_norm": 0.73828125, + "learning_rate": 0.0001742737441932651, + "loss": 0.8379, + "step": 15072 + }, + { + "epoch": 0.3870323741295832, + "grad_norm": 0.875, + "learning_rate": 0.0001742707549667023, + "loss": 0.9103, + "step": 15073 + }, + { + "epoch": 0.387058051325505, + "grad_norm": 0.76953125, + "learning_rate": 0.00017426776559212459, + "loss": 0.8606, + "step": 15074 + }, + { + "epoch": 0.3870837285214268, + "grad_norm": 0.796875, + "learning_rate": 0.0001742647760695379, + "loss": 1.0005, + "step": 15075 + }, + { + "epoch": 0.38710940571734864, + "grad_norm": 0.76953125, + "learning_rate": 0.00017426178639894826, + "loss": 0.9039, + "step": 15076 + }, + { + "epoch": 0.3871350829132705, + "grad_norm": 0.921875, + "learning_rate": 0.00017425879658036157, + "loss": 0.8115, + "step": 15077 + }, + { + "epoch": 0.38716076010919226, + "grad_norm": 0.78125, + "learning_rate": 0.00017425580661378383, + "loss": 0.9949, + "step": 15078 + }, + { + "epoch": 0.3871864373051141, + "grad_norm": 0.8203125, + "learning_rate": 0.00017425281649922095, + "loss": 1.0703, + "step": 15079 + }, + { + "epoch": 0.38721211450103593, + "grad_norm": 0.765625, + "learning_rate": 0.00017424982623667892, + "loss": 1.0277, + "step": 15080 + }, + { + "epoch": 0.3872377916969577, + "grad_norm": 0.8828125, + "learning_rate": 0.0001742468358261637, + "loss": 0.9907, + "step": 15081 + }, + { + "epoch": 0.38726346889287955, + "grad_norm": 0.77734375, + "learning_rate": 0.00017424384526768124, + "loss": 0.8711, + "step": 15082 + }, + { + "epoch": 0.3872891460888014, + "grad_norm": 0.8125, + "learning_rate": 0.0001742408545612375, + "loss": 1.0146, + "step": 15083 + }, + { + "epoch": 0.38731482328472316, + "grad_norm": 0.8203125, + "learning_rate": 0.00017423786370683844, + "loss": 0.9101, + "step": 15084 + }, + { + "epoch": 0.387340500480645, + "grad_norm": 0.7734375, + "learning_rate": 0.00017423487270449003, + "loss": 1.0384, + "step": 15085 + }, + { + "epoch": 0.38736617767656684, + "grad_norm": 0.78515625, + "learning_rate": 0.00017423188155419822, + "loss": 0.8784, + "step": 15086 + }, + { + "epoch": 0.3873918548724887, + "grad_norm": 0.79296875, + "learning_rate": 0.00017422889025596895, + "loss": 0.9498, + "step": 15087 + }, + { + "epoch": 0.38741753206841045, + "grad_norm": 0.8125, + "learning_rate": 0.00017422589880980826, + "loss": 0.8192, + "step": 15088 + }, + { + "epoch": 0.3874432092643323, + "grad_norm": 0.78125, + "learning_rate": 0.00017422290721572203, + "loss": 0.8998, + "step": 15089 + }, + { + "epoch": 0.3874688864602541, + "grad_norm": 0.7890625, + "learning_rate": 0.00017421991547371626, + "loss": 0.9406, + "step": 15090 + }, + { + "epoch": 0.3874945636561759, + "grad_norm": 0.80078125, + "learning_rate": 0.00017421692358379688, + "loss": 0.9987, + "step": 15091 + }, + { + "epoch": 0.38752024085209774, + "grad_norm": 0.73828125, + "learning_rate": 0.00017421393154596989, + "loss": 0.9475, + "step": 15092 + }, + { + "epoch": 0.3875459180480196, + "grad_norm": 0.72265625, + "learning_rate": 0.00017421093936024123, + "loss": 0.9069, + "step": 15093 + }, + { + "epoch": 0.38757159524394136, + "grad_norm": 0.83203125, + "learning_rate": 0.00017420794702661688, + "loss": 0.8508, + "step": 15094 + }, + { + "epoch": 0.3875972724398632, + "grad_norm": 0.78515625, + "learning_rate": 0.0001742049545451028, + "loss": 0.9122, + "step": 15095 + }, + { + "epoch": 0.38762294963578503, + "grad_norm": 0.80078125, + "learning_rate": 0.00017420196191570496, + "loss": 1.0585, + "step": 15096 + }, + { + "epoch": 0.38764862683170687, + "grad_norm": 0.76953125, + "learning_rate": 0.00017419896913842929, + "loss": 0.9546, + "step": 15097 + }, + { + "epoch": 0.38767430402762865, + "grad_norm": 0.7265625, + "learning_rate": 0.0001741959762132818, + "loss": 0.7929, + "step": 15098 + }, + { + "epoch": 0.3876999812235505, + "grad_norm": 0.80078125, + "learning_rate": 0.0001741929831402684, + "loss": 0.9431, + "step": 15099 + }, + { + "epoch": 0.3877256584194723, + "grad_norm": 0.80078125, + "learning_rate": 0.0001741899899193951, + "loss": 0.9098, + "step": 15100 + }, + { + "epoch": 0.3877513356153941, + "grad_norm": 0.75, + "learning_rate": 0.00017418699655066786, + "loss": 1.0199, + "step": 15101 + }, + { + "epoch": 0.38777701281131594, + "grad_norm": 0.76953125, + "learning_rate": 0.00017418400303409261, + "loss": 0.9089, + "step": 15102 + }, + { + "epoch": 0.38780269000723777, + "grad_norm": 0.7734375, + "learning_rate": 0.00017418100936967537, + "loss": 0.8117, + "step": 15103 + }, + { + "epoch": 0.38782836720315955, + "grad_norm": 0.76171875, + "learning_rate": 0.00017417801555742206, + "loss": 0.9504, + "step": 15104 + }, + { + "epoch": 0.3878540443990814, + "grad_norm": 0.7578125, + "learning_rate": 0.00017417502159733868, + "loss": 1.0666, + "step": 15105 + }, + { + "epoch": 0.3878797215950032, + "grad_norm": 0.76953125, + "learning_rate": 0.00017417202748943117, + "loss": 0.9385, + "step": 15106 + }, + { + "epoch": 0.38790539879092506, + "grad_norm": 0.7578125, + "learning_rate": 0.00017416903323370556, + "loss": 0.8222, + "step": 15107 + }, + { + "epoch": 0.38793107598684684, + "grad_norm": 0.8828125, + "learning_rate": 0.00017416603883016772, + "loss": 0.8956, + "step": 15108 + }, + { + "epoch": 0.3879567531827687, + "grad_norm": 0.85546875, + "learning_rate": 0.00017416304427882366, + "loss": 0.9997, + "step": 15109 + }, + { + "epoch": 0.3879824303786905, + "grad_norm": 0.84765625, + "learning_rate": 0.00017416004957967936, + "loss": 0.9037, + "step": 15110 + }, + { + "epoch": 0.3880081075746123, + "grad_norm": 2.484375, + "learning_rate": 0.00017415705473274078, + "loss": 0.9879, + "step": 15111 + }, + { + "epoch": 0.38803378477053413, + "grad_norm": 0.828125, + "learning_rate": 0.0001741540597380139, + "loss": 1.1019, + "step": 15112 + }, + { + "epoch": 0.38805946196645597, + "grad_norm": 0.8203125, + "learning_rate": 0.00017415106459550464, + "loss": 0.9211, + "step": 15113 + }, + { + "epoch": 0.38808513916237775, + "grad_norm": 0.78125, + "learning_rate": 0.00017414806930521906, + "loss": 0.9186, + "step": 15114 + }, + { + "epoch": 0.3881108163582996, + "grad_norm": 0.77734375, + "learning_rate": 0.00017414507386716302, + "loss": 0.7519, + "step": 15115 + }, + { + "epoch": 0.3881364935542214, + "grad_norm": 0.73046875, + "learning_rate": 0.00017414207828134258, + "loss": 0.9007, + "step": 15116 + }, + { + "epoch": 0.38816217075014325, + "grad_norm": 0.8046875, + "learning_rate": 0.00017413908254776365, + "loss": 0.8904, + "step": 15117 + }, + { + "epoch": 0.38818784794606503, + "grad_norm": 0.7109375, + "learning_rate": 0.00017413608666643225, + "loss": 0.8093, + "step": 15118 + }, + { + "epoch": 0.38821352514198687, + "grad_norm": 0.72265625, + "learning_rate": 0.00017413309063735435, + "loss": 0.9689, + "step": 15119 + }, + { + "epoch": 0.3882392023379087, + "grad_norm": 0.90234375, + "learning_rate": 0.00017413009446053584, + "loss": 1.0029, + "step": 15120 + }, + { + "epoch": 0.3882648795338305, + "grad_norm": 0.73828125, + "learning_rate": 0.0001741270981359828, + "loss": 0.9656, + "step": 15121 + }, + { + "epoch": 0.3882905567297523, + "grad_norm": 0.71484375, + "learning_rate": 0.0001741241016637011, + "loss": 0.8054, + "step": 15122 + }, + { + "epoch": 0.38831623392567416, + "grad_norm": 0.84375, + "learning_rate": 0.00017412110504369679, + "loss": 0.9218, + "step": 15123 + }, + { + "epoch": 0.38834191112159594, + "grad_norm": 0.77734375, + "learning_rate": 0.00017411810827597582, + "loss": 0.8428, + "step": 15124 + }, + { + "epoch": 0.3883675883175178, + "grad_norm": 0.79296875, + "learning_rate": 0.00017411511136054415, + "loss": 0.8628, + "step": 15125 + }, + { + "epoch": 0.3883932655134396, + "grad_norm": 0.80078125, + "learning_rate": 0.00017411211429740772, + "loss": 0.923, + "step": 15126 + }, + { + "epoch": 0.38841894270936145, + "grad_norm": 0.81640625, + "learning_rate": 0.0001741091170865726, + "loss": 0.9704, + "step": 15127 + }, + { + "epoch": 0.38844461990528323, + "grad_norm": 0.8125, + "learning_rate": 0.00017410611972804466, + "loss": 0.9427, + "step": 15128 + }, + { + "epoch": 0.38847029710120506, + "grad_norm": 0.796875, + "learning_rate": 0.00017410312222182993, + "loss": 0.9131, + "step": 15129 + }, + { + "epoch": 0.3884959742971269, + "grad_norm": 0.8046875, + "learning_rate": 0.0001741001245679344, + "loss": 1.0033, + "step": 15130 + }, + { + "epoch": 0.3885216514930487, + "grad_norm": 0.73046875, + "learning_rate": 0.000174097126766364, + "loss": 0.8669, + "step": 15131 + }, + { + "epoch": 0.3885473286889705, + "grad_norm": 0.7890625, + "learning_rate": 0.00017409412881712468, + "loss": 0.8388, + "step": 15132 + }, + { + "epoch": 0.38857300588489235, + "grad_norm": 0.8125, + "learning_rate": 0.00017409113072022249, + "loss": 1.0838, + "step": 15133 + }, + { + "epoch": 0.38859868308081413, + "grad_norm": 0.75, + "learning_rate": 0.00017408813247566336, + "loss": 0.9618, + "step": 15134 + }, + { + "epoch": 0.38862436027673597, + "grad_norm": 0.76171875, + "learning_rate": 0.00017408513408345328, + "loss": 0.9314, + "step": 15135 + }, + { + "epoch": 0.3886500374726578, + "grad_norm": 0.78515625, + "learning_rate": 0.0001740821355435982, + "loss": 0.9084, + "step": 15136 + }, + { + "epoch": 0.38867571466857964, + "grad_norm": 0.75390625, + "learning_rate": 0.00017407913685610417, + "loss": 0.9661, + "step": 15137 + }, + { + "epoch": 0.3887013918645014, + "grad_norm": 0.80078125, + "learning_rate": 0.00017407613802097703, + "loss": 0.8587, + "step": 15138 + }, + { + "epoch": 0.38872706906042326, + "grad_norm": 0.81640625, + "learning_rate": 0.0001740731390382229, + "loss": 0.9807, + "step": 15139 + }, + { + "epoch": 0.3887527462563451, + "grad_norm": 0.74609375, + "learning_rate": 0.00017407013990784767, + "loss": 0.8853, + "step": 15140 + }, + { + "epoch": 0.3887784234522669, + "grad_norm": 0.73046875, + "learning_rate": 0.00017406714062985734, + "loss": 0.9545, + "step": 15141 + }, + { + "epoch": 0.3888041006481887, + "grad_norm": 0.79296875, + "learning_rate": 0.0001740641412042579, + "loss": 0.8895, + "step": 15142 + }, + { + "epoch": 0.38882977784411055, + "grad_norm": 0.93359375, + "learning_rate": 0.0001740611416310553, + "loss": 0.9531, + "step": 15143 + }, + { + "epoch": 0.3888554550400323, + "grad_norm": 0.76171875, + "learning_rate": 0.00017405814191025557, + "loss": 0.9858, + "step": 15144 + }, + { + "epoch": 0.38888113223595416, + "grad_norm": 0.82421875, + "learning_rate": 0.00017405514204186463, + "loss": 0.7999, + "step": 15145 + }, + { + "epoch": 0.388906809431876, + "grad_norm": 0.8203125, + "learning_rate": 0.00017405214202588848, + "loss": 1.0304, + "step": 15146 + }, + { + "epoch": 0.38893248662779784, + "grad_norm": 0.76171875, + "learning_rate": 0.00017404914186233312, + "loss": 1.0034, + "step": 15147 + }, + { + "epoch": 0.3889581638237196, + "grad_norm": 0.73046875, + "learning_rate": 0.00017404614155120445, + "loss": 0.7834, + "step": 15148 + }, + { + "epoch": 0.38898384101964145, + "grad_norm": 0.8046875, + "learning_rate": 0.00017404314109250858, + "loss": 0.9826, + "step": 15149 + }, + { + "epoch": 0.3890095182155633, + "grad_norm": 0.79296875, + "learning_rate": 0.0001740401404862514, + "loss": 0.8657, + "step": 15150 + }, + { + "epoch": 0.38903519541148507, + "grad_norm": 0.75390625, + "learning_rate": 0.0001740371397324389, + "loss": 0.9935, + "step": 15151 + }, + { + "epoch": 0.3890608726074069, + "grad_norm": 0.8125, + "learning_rate": 0.00017403413883107705, + "loss": 0.8333, + "step": 15152 + }, + { + "epoch": 0.38908654980332874, + "grad_norm": 0.7578125, + "learning_rate": 0.00017403113778217185, + "loss": 0.9327, + "step": 15153 + }, + { + "epoch": 0.3891122269992505, + "grad_norm": 0.72265625, + "learning_rate": 0.0001740281365857293, + "loss": 0.9516, + "step": 15154 + }, + { + "epoch": 0.38913790419517236, + "grad_norm": 0.7578125, + "learning_rate": 0.00017402513524175536, + "loss": 0.9103, + "step": 15155 + }, + { + "epoch": 0.3891635813910942, + "grad_norm": 0.73828125, + "learning_rate": 0.000174022133750256, + "loss": 0.8842, + "step": 15156 + }, + { + "epoch": 0.38918925858701603, + "grad_norm": 0.75390625, + "learning_rate": 0.00017401913211123724, + "loss": 0.921, + "step": 15157 + }, + { + "epoch": 0.3892149357829378, + "grad_norm": 0.82421875, + "learning_rate": 0.00017401613032470503, + "loss": 0.8491, + "step": 15158 + }, + { + "epoch": 0.38924061297885965, + "grad_norm": 0.85546875, + "learning_rate": 0.00017401312839066533, + "loss": 0.8516, + "step": 15159 + }, + { + "epoch": 0.3892662901747815, + "grad_norm": 0.7890625, + "learning_rate": 0.00017401012630912415, + "loss": 0.969, + "step": 15160 + }, + { + "epoch": 0.38929196737070326, + "grad_norm": 0.72265625, + "learning_rate": 0.0001740071240800875, + "loss": 0.919, + "step": 15161 + }, + { + "epoch": 0.3893176445666251, + "grad_norm": 0.74609375, + "learning_rate": 0.00017400412170356133, + "loss": 0.9392, + "step": 15162 + }, + { + "epoch": 0.38934332176254693, + "grad_norm": 0.79296875, + "learning_rate": 0.00017400111917955163, + "loss": 0.8458, + "step": 15163 + }, + { + "epoch": 0.3893689989584687, + "grad_norm": 0.78515625, + "learning_rate": 0.00017399811650806437, + "loss": 1.0023, + "step": 15164 + }, + { + "epoch": 0.38939467615439055, + "grad_norm": 0.84375, + "learning_rate": 0.00017399511368910555, + "loss": 0.9828, + "step": 15165 + }, + { + "epoch": 0.3894203533503124, + "grad_norm": 0.7734375, + "learning_rate": 0.00017399211072268117, + "loss": 0.9245, + "step": 15166 + }, + { + "epoch": 0.3894460305462342, + "grad_norm": 0.80859375, + "learning_rate": 0.0001739891076087972, + "loss": 1.0123, + "step": 15167 + }, + { + "epoch": 0.389471707742156, + "grad_norm": 0.79296875, + "learning_rate": 0.0001739861043474596, + "loss": 0.8944, + "step": 15168 + }, + { + "epoch": 0.38949738493807784, + "grad_norm": 0.7890625, + "learning_rate": 0.0001739831009386744, + "loss": 0.9672, + "step": 15169 + }, + { + "epoch": 0.3895230621339997, + "grad_norm": 0.79296875, + "learning_rate": 0.00017398009738244755, + "loss": 0.915, + "step": 15170 + }, + { + "epoch": 0.38954873932992146, + "grad_norm": 0.7578125, + "learning_rate": 0.00017397709367878504, + "loss": 0.8778, + "step": 15171 + }, + { + "epoch": 0.3895744165258433, + "grad_norm": 0.79296875, + "learning_rate": 0.00017397408982769286, + "loss": 0.9377, + "step": 15172 + }, + { + "epoch": 0.38960009372176513, + "grad_norm": 0.76171875, + "learning_rate": 0.00017397108582917702, + "loss": 0.9313, + "step": 15173 + }, + { + "epoch": 0.3896257709176869, + "grad_norm": 0.7578125, + "learning_rate": 0.0001739680816832435, + "loss": 0.8224, + "step": 15174 + }, + { + "epoch": 0.38965144811360874, + "grad_norm": 0.74609375, + "learning_rate": 0.00017396507738989825, + "loss": 0.8969, + "step": 15175 + }, + { + "epoch": 0.3896771253095306, + "grad_norm": 0.7890625, + "learning_rate": 0.00017396207294914726, + "loss": 0.9055, + "step": 15176 + }, + { + "epoch": 0.3897028025054524, + "grad_norm": 0.76171875, + "learning_rate": 0.00017395906836099658, + "loss": 0.9129, + "step": 15177 + }, + { + "epoch": 0.3897284797013742, + "grad_norm": 0.78125, + "learning_rate": 0.00017395606362545214, + "loss": 0.9526, + "step": 15178 + }, + { + "epoch": 0.38975415689729603, + "grad_norm": 0.73046875, + "learning_rate": 0.00017395305874251996, + "loss": 0.9316, + "step": 15179 + }, + { + "epoch": 0.38977983409321787, + "grad_norm": 0.83984375, + "learning_rate": 0.000173950053712206, + "loss": 0.9362, + "step": 15180 + }, + { + "epoch": 0.38980551128913965, + "grad_norm": 0.82421875, + "learning_rate": 0.00017394704853451626, + "loss": 0.9734, + "step": 15181 + }, + { + "epoch": 0.3898311884850615, + "grad_norm": 0.84765625, + "learning_rate": 0.00017394404320945675, + "loss": 1.0764, + "step": 15182 + }, + { + "epoch": 0.3898568656809833, + "grad_norm": 0.8046875, + "learning_rate": 0.0001739410377370334, + "loss": 0.9822, + "step": 15183 + }, + { + "epoch": 0.3898825428769051, + "grad_norm": 1.1328125, + "learning_rate": 0.0001739380321172523, + "loss": 0.9903, + "step": 15184 + }, + { + "epoch": 0.38990822007282694, + "grad_norm": 0.73828125, + "learning_rate": 0.00017393502635011933, + "loss": 0.7887, + "step": 15185 + }, + { + "epoch": 0.3899338972687488, + "grad_norm": 0.7578125, + "learning_rate": 0.00017393202043564058, + "loss": 0.8946, + "step": 15186 + }, + { + "epoch": 0.3899595744646706, + "grad_norm": 0.74609375, + "learning_rate": 0.00017392901437382195, + "loss": 0.8775, + "step": 15187 + }, + { + "epoch": 0.3899852516605924, + "grad_norm": 0.86328125, + "learning_rate": 0.0001739260081646695, + "loss": 0.9544, + "step": 15188 + }, + { + "epoch": 0.3900109288565142, + "grad_norm": 0.796875, + "learning_rate": 0.00017392300180818916, + "loss": 1.0175, + "step": 15189 + }, + { + "epoch": 0.39003660605243606, + "grad_norm": 0.83203125, + "learning_rate": 0.00017391999530438698, + "loss": 0.8363, + "step": 15190 + }, + { + "epoch": 0.39006228324835784, + "grad_norm": 0.859375, + "learning_rate": 0.0001739169886532689, + "loss": 1.0147, + "step": 15191 + }, + { + "epoch": 0.3900879604442797, + "grad_norm": 0.7734375, + "learning_rate": 0.000173913981854841, + "loss": 0.9146, + "step": 15192 + }, + { + "epoch": 0.3901136376402015, + "grad_norm": 0.7421875, + "learning_rate": 0.00017391097490910914, + "loss": 0.981, + "step": 15193 + }, + { + "epoch": 0.3901393148361233, + "grad_norm": 0.91796875, + "learning_rate": 0.00017390796781607942, + "loss": 1.0626, + "step": 15194 + }, + { + "epoch": 0.39016499203204513, + "grad_norm": 0.76171875, + "learning_rate": 0.0001739049605757578, + "loss": 0.8137, + "step": 15195 + }, + { + "epoch": 0.39019066922796697, + "grad_norm": 0.8046875, + "learning_rate": 0.00017390195318815025, + "loss": 0.9272, + "step": 15196 + }, + { + "epoch": 0.3902163464238888, + "grad_norm": 0.94140625, + "learning_rate": 0.0001738989456532628, + "loss": 0.8402, + "step": 15197 + }, + { + "epoch": 0.3902420236198106, + "grad_norm": 0.765625, + "learning_rate": 0.00017389593797110142, + "loss": 0.9618, + "step": 15198 + }, + { + "epoch": 0.3902677008157324, + "grad_norm": 0.7734375, + "learning_rate": 0.0001738929301416721, + "loss": 0.9261, + "step": 15199 + }, + { + "epoch": 0.39029337801165426, + "grad_norm": 0.86328125, + "learning_rate": 0.00017388992216498087, + "loss": 0.9374, + "step": 15200 + }, + { + "epoch": 0.39031905520757604, + "grad_norm": 0.7734375, + "learning_rate": 0.00017388691404103367, + "loss": 0.9954, + "step": 15201 + }, + { + "epoch": 0.3903447324034979, + "grad_norm": 0.7734375, + "learning_rate": 0.00017388390576983652, + "loss": 0.8591, + "step": 15202 + }, + { + "epoch": 0.3903704095994197, + "grad_norm": 0.828125, + "learning_rate": 0.00017388089735139546, + "loss": 0.9817, + "step": 15203 + }, + { + "epoch": 0.3903960867953415, + "grad_norm": 0.71875, + "learning_rate": 0.00017387788878571643, + "loss": 0.8671, + "step": 15204 + }, + { + "epoch": 0.3904217639912633, + "grad_norm": 0.8359375, + "learning_rate": 0.00017387488007280543, + "loss": 0.8828, + "step": 15205 + }, + { + "epoch": 0.39044744118718516, + "grad_norm": 0.76171875, + "learning_rate": 0.00017387187121266847, + "loss": 0.9447, + "step": 15206 + }, + { + "epoch": 0.390473118383107, + "grad_norm": 0.76953125, + "learning_rate": 0.00017386886220531155, + "loss": 0.868, + "step": 15207 + }, + { + "epoch": 0.3904987955790288, + "grad_norm": 0.84375, + "learning_rate": 0.00017386585305074066, + "loss": 1.0163, + "step": 15208 + }, + { + "epoch": 0.3905244727749506, + "grad_norm": 0.796875, + "learning_rate": 0.00017386284374896178, + "loss": 1.0192, + "step": 15209 + }, + { + "epoch": 0.39055014997087245, + "grad_norm": 0.80859375, + "learning_rate": 0.00017385983429998097, + "loss": 0.9303, + "step": 15210 + }, + { + "epoch": 0.39057582716679423, + "grad_norm": 0.84375, + "learning_rate": 0.00017385682470380412, + "loss": 0.9945, + "step": 15211 + }, + { + "epoch": 0.39060150436271607, + "grad_norm": 0.8046875, + "learning_rate": 0.00017385381496043731, + "loss": 0.8199, + "step": 15212 + }, + { + "epoch": 0.3906271815586379, + "grad_norm": 0.7890625, + "learning_rate": 0.00017385080506988654, + "loss": 0.8923, + "step": 15213 + }, + { + "epoch": 0.3906528587545597, + "grad_norm": 0.80859375, + "learning_rate": 0.00017384779503215776, + "loss": 0.9179, + "step": 15214 + }, + { + "epoch": 0.3906785359504815, + "grad_norm": 0.76953125, + "learning_rate": 0.00017384478484725702, + "loss": 0.8975, + "step": 15215 + }, + { + "epoch": 0.39070421314640336, + "grad_norm": 0.8515625, + "learning_rate": 0.00017384177451519028, + "loss": 0.9402, + "step": 15216 + }, + { + "epoch": 0.3907298903423252, + "grad_norm": 0.84765625, + "learning_rate": 0.00017383876403596356, + "loss": 0.9549, + "step": 15217 + }, + { + "epoch": 0.390755567538247, + "grad_norm": 0.76953125, + "learning_rate": 0.00017383575340958285, + "loss": 0.8228, + "step": 15218 + }, + { + "epoch": 0.3907812447341688, + "grad_norm": 0.7578125, + "learning_rate": 0.00017383274263605415, + "loss": 0.8039, + "step": 15219 + }, + { + "epoch": 0.39080692193009064, + "grad_norm": 0.8359375, + "learning_rate": 0.00017382973171538345, + "loss": 0.8121, + "step": 15220 + }, + { + "epoch": 0.3908325991260124, + "grad_norm": 0.79296875, + "learning_rate": 0.0001738267206475768, + "loss": 0.9503, + "step": 15221 + }, + { + "epoch": 0.39085827632193426, + "grad_norm": 0.76953125, + "learning_rate": 0.00017382370943264014, + "loss": 1.0196, + "step": 15222 + }, + { + "epoch": 0.3908839535178561, + "grad_norm": 0.8046875, + "learning_rate": 0.0001738206980705795, + "loss": 1.004, + "step": 15223 + }, + { + "epoch": 0.3909096307137779, + "grad_norm": 0.79296875, + "learning_rate": 0.00017381768656140088, + "loss": 1.0608, + "step": 15224 + }, + { + "epoch": 0.3909353079096997, + "grad_norm": 0.796875, + "learning_rate": 0.00017381467490511026, + "loss": 1.0923, + "step": 15225 + }, + { + "epoch": 0.39096098510562155, + "grad_norm": 0.8203125, + "learning_rate": 0.00017381166310171365, + "loss": 0.9416, + "step": 15226 + }, + { + "epoch": 0.3909866623015434, + "grad_norm": 0.7734375, + "learning_rate": 0.0001738086511512171, + "loss": 1.0679, + "step": 15227 + }, + { + "epoch": 0.39101233949746517, + "grad_norm": 0.890625, + "learning_rate": 0.00017380563905362658, + "loss": 1.0428, + "step": 15228 + }, + { + "epoch": 0.391038016693387, + "grad_norm": 0.79296875, + "learning_rate": 0.0001738026268089481, + "loss": 0.9148, + "step": 15229 + }, + { + "epoch": 0.39106369388930884, + "grad_norm": 1.5390625, + "learning_rate": 0.0001737996144171876, + "loss": 0.8254, + "step": 15230 + }, + { + "epoch": 0.3910893710852306, + "grad_norm": 0.75, + "learning_rate": 0.00017379660187835113, + "loss": 0.9153, + "step": 15231 + }, + { + "epoch": 0.39111504828115246, + "grad_norm": 0.75390625, + "learning_rate": 0.00017379358919244475, + "loss": 0.9538, + "step": 15232 + }, + { + "epoch": 0.3911407254770743, + "grad_norm": 0.76953125, + "learning_rate": 0.0001737905763594744, + "loss": 0.9329, + "step": 15233 + }, + { + "epoch": 0.39116640267299607, + "grad_norm": 0.80078125, + "learning_rate": 0.00017378756337944608, + "loss": 0.9915, + "step": 15234 + }, + { + "epoch": 0.3911920798689179, + "grad_norm": 0.77734375, + "learning_rate": 0.00017378455025236582, + "loss": 1.0417, + "step": 15235 + }, + { + "epoch": 0.39121775706483974, + "grad_norm": 0.78125, + "learning_rate": 0.0001737815369782396, + "loss": 0.97, + "step": 15236 + }, + { + "epoch": 0.3912434342607616, + "grad_norm": 0.84765625, + "learning_rate": 0.00017377852355707346, + "loss": 0.9801, + "step": 15237 + }, + { + "epoch": 0.39126911145668336, + "grad_norm": 0.70703125, + "learning_rate": 0.00017377550998887338, + "loss": 0.842, + "step": 15238 + }, + { + "epoch": 0.3912947886526052, + "grad_norm": 0.73828125, + "learning_rate": 0.00017377249627364538, + "loss": 0.9854, + "step": 15239 + }, + { + "epoch": 0.39132046584852703, + "grad_norm": 0.8203125, + "learning_rate": 0.00017376948241139544, + "loss": 0.8873, + "step": 15240 + }, + { + "epoch": 0.3913461430444488, + "grad_norm": 0.828125, + "learning_rate": 0.0001737664684021296, + "loss": 0.9575, + "step": 15241 + }, + { + "epoch": 0.39137182024037065, + "grad_norm": 0.890625, + "learning_rate": 0.00017376345424585386, + "loss": 1.0105, + "step": 15242 + }, + { + "epoch": 0.3913974974362925, + "grad_norm": 0.76171875, + "learning_rate": 0.0001737604399425742, + "loss": 1.0613, + "step": 15243 + }, + { + "epoch": 0.39142317463221427, + "grad_norm": 0.73046875, + "learning_rate": 0.00017375742549229667, + "loss": 0.9726, + "step": 15244 + }, + { + "epoch": 0.3914488518281361, + "grad_norm": 0.8203125, + "learning_rate": 0.0001737544108950272, + "loss": 0.9974, + "step": 15245 + }, + { + "epoch": 0.39147452902405794, + "grad_norm": 0.8046875, + "learning_rate": 0.00017375139615077192, + "loss": 1.0287, + "step": 15246 + }, + { + "epoch": 0.3915002062199798, + "grad_norm": 0.9375, + "learning_rate": 0.0001737483812595367, + "loss": 1.0383, + "step": 15247 + }, + { + "epoch": 0.39152588341590155, + "grad_norm": 0.84765625, + "learning_rate": 0.00017374536622132767, + "loss": 0.9709, + "step": 15248 + }, + { + "epoch": 0.3915515606118234, + "grad_norm": 0.7578125, + "learning_rate": 0.0001737423510361508, + "loss": 0.8721, + "step": 15249 + }, + { + "epoch": 0.3915772378077452, + "grad_norm": 0.8046875, + "learning_rate": 0.00017373933570401205, + "loss": 0.7895, + "step": 15250 + }, + { + "epoch": 0.391602915003667, + "grad_norm": 0.73046875, + "learning_rate": 0.00017373632022491746, + "loss": 0.7987, + "step": 15251 + }, + { + "epoch": 0.39162859219958884, + "grad_norm": 0.8359375, + "learning_rate": 0.00017373330459887306, + "loss": 0.9449, + "step": 15252 + }, + { + "epoch": 0.3916542693955107, + "grad_norm": 0.82421875, + "learning_rate": 0.00017373028882588484, + "loss": 0.7451, + "step": 15253 + }, + { + "epoch": 0.39167994659143246, + "grad_norm": 0.74609375, + "learning_rate": 0.00017372727290595883, + "loss": 0.9336, + "step": 15254 + }, + { + "epoch": 0.3917056237873543, + "grad_norm": 0.75390625, + "learning_rate": 0.000173724256839101, + "loss": 0.8958, + "step": 15255 + }, + { + "epoch": 0.39173130098327613, + "grad_norm": 0.75, + "learning_rate": 0.00017372124062531741, + "loss": 0.7785, + "step": 15256 + }, + { + "epoch": 0.39175697817919797, + "grad_norm": 0.8203125, + "learning_rate": 0.00017371822426461405, + "loss": 1.0193, + "step": 15257 + }, + { + "epoch": 0.39178265537511975, + "grad_norm": 0.78515625, + "learning_rate": 0.00017371520775699687, + "loss": 0.9253, + "step": 15258 + }, + { + "epoch": 0.3918083325710416, + "grad_norm": 0.73046875, + "learning_rate": 0.000173712191102472, + "loss": 0.8788, + "step": 15259 + }, + { + "epoch": 0.3918340097669634, + "grad_norm": 0.77734375, + "learning_rate": 0.00017370917430104536, + "loss": 0.8397, + "step": 15260 + }, + { + "epoch": 0.3918596869628852, + "grad_norm": 0.75, + "learning_rate": 0.000173706157352723, + "loss": 0.8646, + "step": 15261 + }, + { + "epoch": 0.39188536415880704, + "grad_norm": 0.765625, + "learning_rate": 0.00017370314025751094, + "loss": 0.9662, + "step": 15262 + }, + { + "epoch": 0.3919110413547289, + "grad_norm": 0.79296875, + "learning_rate": 0.00017370012301541518, + "loss": 0.9493, + "step": 15263 + }, + { + "epoch": 0.39193671855065065, + "grad_norm": 0.78125, + "learning_rate": 0.00017369710562644173, + "loss": 1.0136, + "step": 15264 + }, + { + "epoch": 0.3919623957465725, + "grad_norm": 0.79296875, + "learning_rate": 0.0001736940880905966, + "loss": 0.8061, + "step": 15265 + }, + { + "epoch": 0.3919880729424943, + "grad_norm": 0.765625, + "learning_rate": 0.00017369107040788578, + "loss": 0.8717, + "step": 15266 + }, + { + "epoch": 0.39201375013841616, + "grad_norm": 0.859375, + "learning_rate": 0.00017368805257831537, + "loss": 0.9444, + "step": 15267 + }, + { + "epoch": 0.39203942733433794, + "grad_norm": 0.85546875, + "learning_rate": 0.0001736850346018913, + "loss": 1.0261, + "step": 15268 + }, + { + "epoch": 0.3920651045302598, + "grad_norm": 0.7578125, + "learning_rate": 0.0001736820164786196, + "loss": 1.0334, + "step": 15269 + }, + { + "epoch": 0.3920907817261816, + "grad_norm": 0.796875, + "learning_rate": 0.0001736789982085063, + "loss": 0.9734, + "step": 15270 + }, + { + "epoch": 0.3921164589221034, + "grad_norm": 0.77734375, + "learning_rate": 0.00017367597979155744, + "loss": 0.906, + "step": 15271 + }, + { + "epoch": 0.39214213611802523, + "grad_norm": 0.7578125, + "learning_rate": 0.00017367296122777897, + "loss": 0.982, + "step": 15272 + }, + { + "epoch": 0.39216781331394707, + "grad_norm": 0.78125, + "learning_rate": 0.00017366994251717696, + "loss": 0.9598, + "step": 15273 + }, + { + "epoch": 0.39219349050986885, + "grad_norm": 0.82421875, + "learning_rate": 0.0001736669236597574, + "loss": 0.9071, + "step": 15274 + }, + { + "epoch": 0.3922191677057907, + "grad_norm": 0.78125, + "learning_rate": 0.00017366390465552636, + "loss": 1.0281, + "step": 15275 + }, + { + "epoch": 0.3922448449017125, + "grad_norm": 0.8203125, + "learning_rate": 0.00017366088550448976, + "loss": 0.9156, + "step": 15276 + }, + { + "epoch": 0.39227052209763436, + "grad_norm": 0.7578125, + "learning_rate": 0.00017365786620665368, + "loss": 0.9225, + "step": 15277 + }, + { + "epoch": 0.39229619929355614, + "grad_norm": 0.82421875, + "learning_rate": 0.00017365484676202413, + "loss": 0.961, + "step": 15278 + }, + { + "epoch": 0.39232187648947797, + "grad_norm": 0.7734375, + "learning_rate": 0.00017365182717060714, + "loss": 1.0233, + "step": 15279 + }, + { + "epoch": 0.3923475536853998, + "grad_norm": 0.8203125, + "learning_rate": 0.0001736488074324087, + "loss": 0.9181, + "step": 15280 + }, + { + "epoch": 0.3923732308813216, + "grad_norm": 0.7890625, + "learning_rate": 0.00017364578754743484, + "loss": 0.8418, + "step": 15281 + }, + { + "epoch": 0.3923989080772434, + "grad_norm": 0.8515625, + "learning_rate": 0.00017364276751569155, + "loss": 0.9334, + "step": 15282 + }, + { + "epoch": 0.39242458527316526, + "grad_norm": 0.75390625, + "learning_rate": 0.0001736397473371849, + "loss": 1.0212, + "step": 15283 + }, + { + "epoch": 0.39245026246908704, + "grad_norm": 0.80078125, + "learning_rate": 0.0001736367270119209, + "loss": 0.86, + "step": 15284 + }, + { + "epoch": 0.3924759396650089, + "grad_norm": 0.8515625, + "learning_rate": 0.00017363370653990554, + "loss": 0.9779, + "step": 15285 + }, + { + "epoch": 0.3925016168609307, + "grad_norm": 0.73828125, + "learning_rate": 0.00017363068592114484, + "loss": 0.9243, + "step": 15286 + }, + { + "epoch": 0.3925272940568525, + "grad_norm": 0.96484375, + "learning_rate": 0.00017362766515564485, + "loss": 1.1261, + "step": 15287 + }, + { + "epoch": 0.39255297125277433, + "grad_norm": 0.74609375, + "learning_rate": 0.0001736246442434116, + "loss": 0.9727, + "step": 15288 + }, + { + "epoch": 0.39257864844869617, + "grad_norm": 0.83203125, + "learning_rate": 0.00017362162318445106, + "loss": 1.1028, + "step": 15289 + }, + { + "epoch": 0.392604325644618, + "grad_norm": 0.828125, + "learning_rate": 0.00017361860197876924, + "loss": 1.0314, + "step": 15290 + }, + { + "epoch": 0.3926300028405398, + "grad_norm": 0.84765625, + "learning_rate": 0.00017361558062637224, + "loss": 0.9701, + "step": 15291 + }, + { + "epoch": 0.3926556800364616, + "grad_norm": 0.7578125, + "learning_rate": 0.00017361255912726602, + "loss": 0.8897, + "step": 15292 + }, + { + "epoch": 0.39268135723238345, + "grad_norm": 0.87109375, + "learning_rate": 0.00017360953748145664, + "loss": 0.9164, + "step": 15293 + }, + { + "epoch": 0.39270703442830523, + "grad_norm": 0.8203125, + "learning_rate": 0.00017360651568895004, + "loss": 0.9344, + "step": 15294 + }, + { + "epoch": 0.39273271162422707, + "grad_norm": 0.8125, + "learning_rate": 0.00017360349374975239, + "loss": 1.0234, + "step": 15295 + }, + { + "epoch": 0.3927583888201489, + "grad_norm": 0.8046875, + "learning_rate": 0.0001736004716638696, + "loss": 1.0054, + "step": 15296 + }, + { + "epoch": 0.3927840660160707, + "grad_norm": 0.796875, + "learning_rate": 0.00017359744943130768, + "loss": 0.8879, + "step": 15297 + }, + { + "epoch": 0.3928097432119925, + "grad_norm": 0.86328125, + "learning_rate": 0.00017359442705207272, + "loss": 0.8993, + "step": 15298 + }, + { + "epoch": 0.39283542040791436, + "grad_norm": 0.76171875, + "learning_rate": 0.0001735914045261707, + "loss": 0.8935, + "step": 15299 + }, + { + "epoch": 0.3928610976038362, + "grad_norm": 0.79296875, + "learning_rate": 0.00017358838185360767, + "loss": 0.9205, + "step": 15300 + }, + { + "epoch": 0.392886774799758, + "grad_norm": 0.80078125, + "learning_rate": 0.00017358535903438965, + "loss": 0.9765, + "step": 15301 + }, + { + "epoch": 0.3929124519956798, + "grad_norm": 0.80859375, + "learning_rate": 0.00017358233606852265, + "loss": 0.8741, + "step": 15302 + }, + { + "epoch": 0.39293812919160165, + "grad_norm": 0.78515625, + "learning_rate": 0.00017357931295601268, + "loss": 0.8793, + "step": 15303 + }, + { + "epoch": 0.39296380638752343, + "grad_norm": 0.96875, + "learning_rate": 0.0001735762896968658, + "loss": 0.8833, + "step": 15304 + }, + { + "epoch": 0.39298948358344526, + "grad_norm": 0.7890625, + "learning_rate": 0.00017357326629108803, + "loss": 0.8647, + "step": 15305 + }, + { + "epoch": 0.3930151607793671, + "grad_norm": 0.796875, + "learning_rate": 0.0001735702427386854, + "loss": 0.9772, + "step": 15306 + }, + { + "epoch": 0.3930408379752889, + "grad_norm": 0.84375, + "learning_rate": 0.0001735672190396639, + "loss": 1.1083, + "step": 15307 + }, + { + "epoch": 0.3930665151712107, + "grad_norm": 0.734375, + "learning_rate": 0.00017356419519402958, + "loss": 0.9389, + "step": 15308 + }, + { + "epoch": 0.39309219236713255, + "grad_norm": 0.91796875, + "learning_rate": 0.00017356117120178842, + "loss": 1.1802, + "step": 15309 + }, + { + "epoch": 0.3931178695630544, + "grad_norm": 0.70703125, + "learning_rate": 0.00017355814706294654, + "loss": 0.9447, + "step": 15310 + }, + { + "epoch": 0.39314354675897617, + "grad_norm": 0.86328125, + "learning_rate": 0.00017355512277750993, + "loss": 0.9544, + "step": 15311 + }, + { + "epoch": 0.393169223954898, + "grad_norm": 0.8125, + "learning_rate": 0.00017355209834548458, + "loss": 0.93, + "step": 15312 + }, + { + "epoch": 0.39319490115081984, + "grad_norm": 0.80078125, + "learning_rate": 0.00017354907376687653, + "loss": 0.9422, + "step": 15313 + }, + { + "epoch": 0.3932205783467416, + "grad_norm": 0.765625, + "learning_rate": 0.00017354604904169186, + "loss": 0.8936, + "step": 15314 + }, + { + "epoch": 0.39324625554266346, + "grad_norm": 0.77734375, + "learning_rate": 0.00017354302416993655, + "loss": 0.8661, + "step": 15315 + }, + { + "epoch": 0.3932719327385853, + "grad_norm": 0.82421875, + "learning_rate": 0.0001735399991516166, + "loss": 0.8593, + "step": 15316 + }, + { + "epoch": 0.3932976099345071, + "grad_norm": 0.8671875, + "learning_rate": 0.0001735369739867381, + "loss": 0.8268, + "step": 15317 + }, + { + "epoch": 0.3933232871304289, + "grad_norm": 0.828125, + "learning_rate": 0.00017353394867530706, + "loss": 1.0314, + "step": 15318 + }, + { + "epoch": 0.39334896432635075, + "grad_norm": 0.7578125, + "learning_rate": 0.0001735309232173295, + "loss": 0.9665, + "step": 15319 + }, + { + "epoch": 0.3933746415222726, + "grad_norm": 0.77734375, + "learning_rate": 0.0001735278976128114, + "loss": 0.9209, + "step": 15320 + }, + { + "epoch": 0.39340031871819436, + "grad_norm": 0.79296875, + "learning_rate": 0.0001735248718617589, + "loss": 0.8795, + "step": 15321 + }, + { + "epoch": 0.3934259959141162, + "grad_norm": 0.6875, + "learning_rate": 0.00017352184596417796, + "loss": 0.7857, + "step": 15322 + }, + { + "epoch": 0.39345167311003804, + "grad_norm": 0.828125, + "learning_rate": 0.00017351881992007463, + "loss": 0.8091, + "step": 15323 + }, + { + "epoch": 0.3934773503059598, + "grad_norm": 0.7265625, + "learning_rate": 0.00017351579372945492, + "loss": 0.8974, + "step": 15324 + }, + { + "epoch": 0.39350302750188165, + "grad_norm": 0.79296875, + "learning_rate": 0.00017351276739232486, + "loss": 1.0882, + "step": 15325 + }, + { + "epoch": 0.3935287046978035, + "grad_norm": 0.796875, + "learning_rate": 0.0001735097409086905, + "loss": 0.8337, + "step": 15326 + }, + { + "epoch": 0.39355438189372527, + "grad_norm": 0.7421875, + "learning_rate": 0.00017350671427855788, + "loss": 0.8579, + "step": 15327 + }, + { + "epoch": 0.3935800590896471, + "grad_norm": 0.7734375, + "learning_rate": 0.00017350368750193305, + "loss": 0.9024, + "step": 15328 + }, + { + "epoch": 0.39360573628556894, + "grad_norm": 0.78515625, + "learning_rate": 0.00017350066057882194, + "loss": 0.9549, + "step": 15329 + }, + { + "epoch": 0.3936314134814908, + "grad_norm": 0.734375, + "learning_rate": 0.0001734976335092307, + "loss": 0.9421, + "step": 15330 + }, + { + "epoch": 0.39365709067741256, + "grad_norm": 0.78515625, + "learning_rate": 0.0001734946062931653, + "loss": 0.9498, + "step": 15331 + }, + { + "epoch": 0.3936827678733344, + "grad_norm": 0.7734375, + "learning_rate": 0.00017349157893063182, + "loss": 1.0609, + "step": 15332 + }, + { + "epoch": 0.39370844506925623, + "grad_norm": 0.69921875, + "learning_rate": 0.00017348855142163623, + "loss": 0.8818, + "step": 15333 + }, + { + "epoch": 0.393734122265178, + "grad_norm": 0.79296875, + "learning_rate": 0.00017348552376618458, + "loss": 0.8958, + "step": 15334 + }, + { + "epoch": 0.39375979946109985, + "grad_norm": 0.7421875, + "learning_rate": 0.00017348249596428294, + "loss": 0.8593, + "step": 15335 + }, + { + "epoch": 0.3937854766570217, + "grad_norm": 0.81640625, + "learning_rate": 0.00017347946801593732, + "loss": 1.0025, + "step": 15336 + }, + { + "epoch": 0.39381115385294346, + "grad_norm": 0.78515625, + "learning_rate": 0.00017347643992115378, + "loss": 0.9597, + "step": 15337 + }, + { + "epoch": 0.3938368310488653, + "grad_norm": 0.69921875, + "learning_rate": 0.0001734734116799383, + "loss": 0.884, + "step": 15338 + }, + { + "epoch": 0.39386250824478714, + "grad_norm": 0.75390625, + "learning_rate": 0.00017347038329229694, + "loss": 0.7914, + "step": 15339 + }, + { + "epoch": 0.39388818544070897, + "grad_norm": 0.94921875, + "learning_rate": 0.00017346735475823577, + "loss": 0.8141, + "step": 15340 + }, + { + "epoch": 0.39391386263663075, + "grad_norm": 0.8046875, + "learning_rate": 0.00017346432607776079, + "loss": 0.8705, + "step": 15341 + }, + { + "epoch": 0.3939395398325526, + "grad_norm": 0.86328125, + "learning_rate": 0.00017346129725087804, + "loss": 0.8766, + "step": 15342 + }, + { + "epoch": 0.3939652170284744, + "grad_norm": 0.86328125, + "learning_rate": 0.00017345826827759356, + "loss": 0.8643, + "step": 15343 + }, + { + "epoch": 0.3939908942243962, + "grad_norm": 0.71484375, + "learning_rate": 0.0001734552391579134, + "loss": 0.8621, + "step": 15344 + }, + { + "epoch": 0.39401657142031804, + "grad_norm": 0.8203125, + "learning_rate": 0.0001734522098918436, + "loss": 0.9607, + "step": 15345 + }, + { + "epoch": 0.3940422486162399, + "grad_norm": 0.7890625, + "learning_rate": 0.0001734491804793901, + "loss": 0.886, + "step": 15346 + }, + { + "epoch": 0.39406792581216166, + "grad_norm": 0.70703125, + "learning_rate": 0.00017344615092055912, + "loss": 1.114, + "step": 15347 + }, + { + "epoch": 0.3940936030080835, + "grad_norm": 0.76953125, + "learning_rate": 0.00017344312121535653, + "loss": 0.9515, + "step": 15348 + }, + { + "epoch": 0.39411928020400533, + "grad_norm": 0.765625, + "learning_rate": 0.00017344009136378844, + "loss": 0.8117, + "step": 15349 + }, + { + "epoch": 0.39414495739992716, + "grad_norm": 0.81640625, + "learning_rate": 0.0001734370613658609, + "loss": 0.9661, + "step": 15350 + }, + { + "epoch": 0.39417063459584895, + "grad_norm": 0.80859375, + "learning_rate": 0.00017343403122157993, + "loss": 0.941, + "step": 15351 + }, + { + "epoch": 0.3941963117917708, + "grad_norm": 0.8203125, + "learning_rate": 0.00017343100093095156, + "loss": 1.0006, + "step": 15352 + }, + { + "epoch": 0.3942219889876926, + "grad_norm": 0.7578125, + "learning_rate": 0.00017342797049398184, + "loss": 0.9405, + "step": 15353 + }, + { + "epoch": 0.3942476661836144, + "grad_norm": 0.7890625, + "learning_rate": 0.0001734249399106768, + "loss": 0.938, + "step": 15354 + }, + { + "epoch": 0.39427334337953623, + "grad_norm": 0.84375, + "learning_rate": 0.0001734219091810425, + "loss": 0.9277, + "step": 15355 + }, + { + "epoch": 0.39429902057545807, + "grad_norm": 0.77734375, + "learning_rate": 0.00017341887830508496, + "loss": 1.0046, + "step": 15356 + }, + { + "epoch": 0.39432469777137985, + "grad_norm": 0.77734375, + "learning_rate": 0.00017341584728281024, + "loss": 1.0095, + "step": 15357 + }, + { + "epoch": 0.3943503749673017, + "grad_norm": 0.75390625, + "learning_rate": 0.00017341281611422436, + "loss": 0.9115, + "step": 15358 + }, + { + "epoch": 0.3943760521632235, + "grad_norm": 0.77734375, + "learning_rate": 0.00017340978479933337, + "loss": 0.848, + "step": 15359 + }, + { + "epoch": 0.39440172935914536, + "grad_norm": 0.80859375, + "learning_rate": 0.0001734067533381433, + "loss": 0.9079, + "step": 15360 + }, + { + "epoch": 0.39442740655506714, + "grad_norm": 0.765625, + "learning_rate": 0.0001734037217306602, + "loss": 0.9397, + "step": 15361 + }, + { + "epoch": 0.394453083750989, + "grad_norm": 0.73828125, + "learning_rate": 0.00017340068997689012, + "loss": 0.8309, + "step": 15362 + }, + { + "epoch": 0.3944787609469108, + "grad_norm": 0.80078125, + "learning_rate": 0.0001733976580768391, + "loss": 0.9265, + "step": 15363 + }, + { + "epoch": 0.3945044381428326, + "grad_norm": 0.84765625, + "learning_rate": 0.00017339462603051318, + "loss": 0.8652, + "step": 15364 + }, + { + "epoch": 0.39453011533875443, + "grad_norm": 0.7890625, + "learning_rate": 0.00017339159383791838, + "loss": 0.926, + "step": 15365 + }, + { + "epoch": 0.39455579253467626, + "grad_norm": 0.88671875, + "learning_rate": 0.00017338856149906078, + "loss": 0.9742, + "step": 15366 + }, + { + "epoch": 0.39458146973059804, + "grad_norm": 0.8828125, + "learning_rate": 0.0001733855290139464, + "loss": 0.9877, + "step": 15367 + }, + { + "epoch": 0.3946071469265199, + "grad_norm": 0.828125, + "learning_rate": 0.00017338249638258133, + "loss": 1.0072, + "step": 15368 + }, + { + "epoch": 0.3946328241224417, + "grad_norm": 0.828125, + "learning_rate": 0.00017337946360497153, + "loss": 0.9932, + "step": 15369 + }, + { + "epoch": 0.39465850131836355, + "grad_norm": 0.8203125, + "learning_rate": 0.00017337643068112308, + "loss": 0.8426, + "step": 15370 + }, + { + "epoch": 0.39468417851428533, + "grad_norm": 0.84375, + "learning_rate": 0.00017337339761104207, + "loss": 0.9107, + "step": 15371 + }, + { + "epoch": 0.39470985571020717, + "grad_norm": 0.74609375, + "learning_rate": 0.00017337036439473447, + "loss": 1.0393, + "step": 15372 + }, + { + "epoch": 0.394735532906129, + "grad_norm": 0.84765625, + "learning_rate": 0.00017336733103220638, + "loss": 1.0784, + "step": 15373 + }, + { + "epoch": 0.3947612101020508, + "grad_norm": 0.73046875, + "learning_rate": 0.0001733642975234638, + "loss": 0.8779, + "step": 15374 + }, + { + "epoch": 0.3947868872979726, + "grad_norm": 0.7578125, + "learning_rate": 0.0001733612638685128, + "loss": 0.9413, + "step": 15375 + }, + { + "epoch": 0.39481256449389446, + "grad_norm": 0.828125, + "learning_rate": 0.00017335823006735948, + "loss": 0.9035, + "step": 15376 + }, + { + "epoch": 0.39483824168981624, + "grad_norm": 0.85546875, + "learning_rate": 0.0001733551961200098, + "loss": 0.9737, + "step": 15377 + }, + { + "epoch": 0.3948639188857381, + "grad_norm": 0.79296875, + "learning_rate": 0.00017335216202646984, + "loss": 0.8944, + "step": 15378 + }, + { + "epoch": 0.3948895960816599, + "grad_norm": 0.7421875, + "learning_rate": 0.00017334912778674565, + "loss": 0.907, + "step": 15379 + }, + { + "epoch": 0.39491527327758175, + "grad_norm": 0.828125, + "learning_rate": 0.0001733460934008433, + "loss": 0.9406, + "step": 15380 + }, + { + "epoch": 0.3949409504735035, + "grad_norm": 0.76953125, + "learning_rate": 0.00017334305886876876, + "loss": 0.9561, + "step": 15381 + }, + { + "epoch": 0.39496662766942536, + "grad_norm": 0.82421875, + "learning_rate": 0.00017334002419052815, + "loss": 0.8831, + "step": 15382 + }, + { + "epoch": 0.3949923048653472, + "grad_norm": 0.76953125, + "learning_rate": 0.0001733369893661275, + "loss": 0.8108, + "step": 15383 + }, + { + "epoch": 0.395017982061269, + "grad_norm": 0.703125, + "learning_rate": 0.00017333395439557284, + "loss": 0.9511, + "step": 15384 + }, + { + "epoch": 0.3950436592571908, + "grad_norm": 0.703125, + "learning_rate": 0.00017333091927887025, + "loss": 0.9412, + "step": 15385 + }, + { + "epoch": 0.39506933645311265, + "grad_norm": 0.765625, + "learning_rate": 0.00017332788401602578, + "loss": 0.9208, + "step": 15386 + }, + { + "epoch": 0.39509501364903443, + "grad_norm": 0.82421875, + "learning_rate": 0.00017332484860704543, + "loss": 0.9487, + "step": 15387 + }, + { + "epoch": 0.39512069084495627, + "grad_norm": 0.76953125, + "learning_rate": 0.0001733218130519353, + "loss": 0.7874, + "step": 15388 + }, + { + "epoch": 0.3951463680408781, + "grad_norm": 0.796875, + "learning_rate": 0.0001733187773507014, + "loss": 0.9492, + "step": 15389 + }, + { + "epoch": 0.39517204523679994, + "grad_norm": 0.86328125, + "learning_rate": 0.00017331574150334983, + "loss": 0.9318, + "step": 15390 + }, + { + "epoch": 0.3951977224327217, + "grad_norm": 0.9296875, + "learning_rate": 0.00017331270550988655, + "loss": 0.8921, + "step": 15391 + }, + { + "epoch": 0.39522339962864356, + "grad_norm": 0.84375, + "learning_rate": 0.00017330966937031774, + "loss": 0.9252, + "step": 15392 + }, + { + "epoch": 0.3952490768245654, + "grad_norm": 0.80859375, + "learning_rate": 0.00017330663308464934, + "loss": 1.088, + "step": 15393 + }, + { + "epoch": 0.3952747540204872, + "grad_norm": 0.8046875, + "learning_rate": 0.00017330359665288747, + "loss": 0.8674, + "step": 15394 + }, + { + "epoch": 0.395300431216409, + "grad_norm": 0.76953125, + "learning_rate": 0.00017330056007503812, + "loss": 0.9331, + "step": 15395 + }, + { + "epoch": 0.39532610841233085, + "grad_norm": 0.84375, + "learning_rate": 0.0001732975233511074, + "loss": 0.9796, + "step": 15396 + }, + { + "epoch": 0.3953517856082526, + "grad_norm": 0.78515625, + "learning_rate": 0.00017329448648110133, + "loss": 0.8537, + "step": 15397 + }, + { + "epoch": 0.39537746280417446, + "grad_norm": 0.91015625, + "learning_rate": 0.00017329144946502597, + "loss": 0.9399, + "step": 15398 + }, + { + "epoch": 0.3954031400000963, + "grad_norm": 0.8046875, + "learning_rate": 0.00017328841230288737, + "loss": 1.0021, + "step": 15399 + }, + { + "epoch": 0.39542881719601813, + "grad_norm": 0.78515625, + "learning_rate": 0.00017328537499469157, + "loss": 0.9106, + "step": 15400 + }, + { + "epoch": 0.3954544943919399, + "grad_norm": 0.84765625, + "learning_rate": 0.00017328233754044465, + "loss": 0.8953, + "step": 15401 + }, + { + "epoch": 0.39548017158786175, + "grad_norm": 0.7890625, + "learning_rate": 0.00017327929994015266, + "loss": 0.9527, + "step": 15402 + }, + { + "epoch": 0.3955058487837836, + "grad_norm": 0.80078125, + "learning_rate": 0.00017327626219382163, + "loss": 0.8742, + "step": 15403 + }, + { + "epoch": 0.39553152597970537, + "grad_norm": 0.74609375, + "learning_rate": 0.00017327322430145764, + "loss": 0.9986, + "step": 15404 + }, + { + "epoch": 0.3955572031756272, + "grad_norm": 0.80078125, + "learning_rate": 0.00017327018626306674, + "loss": 0.8626, + "step": 15405 + }, + { + "epoch": 0.39558288037154904, + "grad_norm": 0.7578125, + "learning_rate": 0.00017326714807865495, + "loss": 0.9197, + "step": 15406 + }, + { + "epoch": 0.3956085575674708, + "grad_norm": 0.78125, + "learning_rate": 0.00017326410974822837, + "loss": 0.9358, + "step": 15407 + }, + { + "epoch": 0.39563423476339266, + "grad_norm": 0.828125, + "learning_rate": 0.00017326107127179304, + "loss": 0.9719, + "step": 15408 + }, + { + "epoch": 0.3956599119593145, + "grad_norm": 0.9375, + "learning_rate": 0.000173258032649355, + "loss": 0.9495, + "step": 15409 + }, + { + "epoch": 0.39568558915523633, + "grad_norm": 0.734375, + "learning_rate": 0.00017325499388092034, + "loss": 0.8528, + "step": 15410 + }, + { + "epoch": 0.3957112663511581, + "grad_norm": 0.84765625, + "learning_rate": 0.00017325195496649507, + "loss": 0.9937, + "step": 15411 + }, + { + "epoch": 0.39573694354707994, + "grad_norm": 0.78125, + "learning_rate": 0.00017324891590608527, + "loss": 1.1134, + "step": 15412 + }, + { + "epoch": 0.3957626207430018, + "grad_norm": 0.8046875, + "learning_rate": 0.00017324587669969704, + "loss": 0.9673, + "step": 15413 + }, + { + "epoch": 0.39578829793892356, + "grad_norm": 0.78125, + "learning_rate": 0.00017324283734733635, + "loss": 0.9504, + "step": 15414 + }, + { + "epoch": 0.3958139751348454, + "grad_norm": 0.91015625, + "learning_rate": 0.0001732397978490093, + "loss": 0.8665, + "step": 15415 + }, + { + "epoch": 0.39583965233076723, + "grad_norm": 0.8203125, + "learning_rate": 0.000173236758204722, + "loss": 0.8957, + "step": 15416 + }, + { + "epoch": 0.395865329526689, + "grad_norm": 0.74609375, + "learning_rate": 0.00017323371841448037, + "loss": 1.0019, + "step": 15417 + }, + { + "epoch": 0.39589100672261085, + "grad_norm": 0.703125, + "learning_rate": 0.00017323067847829063, + "loss": 0.8154, + "step": 15418 + }, + { + "epoch": 0.3959166839185327, + "grad_norm": 0.8125, + "learning_rate": 0.00017322763839615872, + "loss": 0.9275, + "step": 15419 + }, + { + "epoch": 0.3959423611144545, + "grad_norm": 0.7734375, + "learning_rate": 0.00017322459816809075, + "loss": 0.9896, + "step": 15420 + }, + { + "epoch": 0.3959680383103763, + "grad_norm": 0.75390625, + "learning_rate": 0.00017322155779409278, + "loss": 0.8945, + "step": 15421 + }, + { + "epoch": 0.39599371550629814, + "grad_norm": 0.7890625, + "learning_rate": 0.00017321851727417084, + "loss": 0.9638, + "step": 15422 + }, + { + "epoch": 0.39601939270222, + "grad_norm": 0.8203125, + "learning_rate": 0.00017321547660833104, + "loss": 0.985, + "step": 15423 + }, + { + "epoch": 0.39604506989814175, + "grad_norm": 0.76171875, + "learning_rate": 0.0001732124357965794, + "loss": 0.9537, + "step": 15424 + }, + { + "epoch": 0.3960707470940636, + "grad_norm": 0.83984375, + "learning_rate": 0.000173209394838922, + "loss": 0.9671, + "step": 15425 + }, + { + "epoch": 0.3960964242899854, + "grad_norm": 0.76171875, + "learning_rate": 0.00017320635373536483, + "loss": 0.9214, + "step": 15426 + }, + { + "epoch": 0.3961221014859072, + "grad_norm": 0.8125, + "learning_rate": 0.00017320331248591407, + "loss": 0.9945, + "step": 15427 + }, + { + "epoch": 0.39614777868182904, + "grad_norm": 0.77734375, + "learning_rate": 0.0001732002710905757, + "loss": 0.9849, + "step": 15428 + }, + { + "epoch": 0.3961734558777509, + "grad_norm": 0.7890625, + "learning_rate": 0.0001731972295493558, + "loss": 0.8458, + "step": 15429 + }, + { + "epoch": 0.3961991330736727, + "grad_norm": 0.8203125, + "learning_rate": 0.00017319418786226044, + "loss": 0.9651, + "step": 15430 + }, + { + "epoch": 0.3962248102695945, + "grad_norm": 0.78125, + "learning_rate": 0.00017319114602929565, + "loss": 0.9911, + "step": 15431 + }, + { + "epoch": 0.39625048746551633, + "grad_norm": 0.78125, + "learning_rate": 0.00017318810405046758, + "loss": 0.9467, + "step": 15432 + }, + { + "epoch": 0.39627616466143817, + "grad_norm": 0.75, + "learning_rate": 0.00017318506192578217, + "loss": 0.8278, + "step": 15433 + }, + { + "epoch": 0.39630184185735995, + "grad_norm": 0.7734375, + "learning_rate": 0.00017318201965524555, + "loss": 1.0584, + "step": 15434 + }, + { + "epoch": 0.3963275190532818, + "grad_norm": 0.7890625, + "learning_rate": 0.0001731789772388638, + "loss": 0.9394, + "step": 15435 + }, + { + "epoch": 0.3963531962492036, + "grad_norm": 0.77734375, + "learning_rate": 0.00017317593467664294, + "loss": 0.8822, + "step": 15436 + }, + { + "epoch": 0.3963788734451254, + "grad_norm": 0.76953125, + "learning_rate": 0.00017317289196858905, + "loss": 0.9302, + "step": 15437 + }, + { + "epoch": 0.39640455064104724, + "grad_norm": 0.9453125, + "learning_rate": 0.0001731698491147082, + "loss": 0.9054, + "step": 15438 + }, + { + "epoch": 0.3964302278369691, + "grad_norm": 0.828125, + "learning_rate": 0.00017316680611500647, + "loss": 0.9275, + "step": 15439 + }, + { + "epoch": 0.3964559050328909, + "grad_norm": 0.796875, + "learning_rate": 0.00017316376296948988, + "loss": 0.963, + "step": 15440 + }, + { + "epoch": 0.3964815822288127, + "grad_norm": 0.75390625, + "learning_rate": 0.00017316071967816453, + "loss": 0.8625, + "step": 15441 + }, + { + "epoch": 0.3965072594247345, + "grad_norm": 0.78515625, + "learning_rate": 0.00017315767624103646, + "loss": 0.9181, + "step": 15442 + }, + { + "epoch": 0.39653293662065636, + "grad_norm": 0.72265625, + "learning_rate": 0.00017315463265811176, + "loss": 0.9004, + "step": 15443 + }, + { + "epoch": 0.39655861381657814, + "grad_norm": 0.93359375, + "learning_rate": 0.00017315158892939648, + "loss": 0.9516, + "step": 15444 + }, + { + "epoch": 0.3965842910125, + "grad_norm": 0.75390625, + "learning_rate": 0.0001731485450548967, + "loss": 1.0393, + "step": 15445 + }, + { + "epoch": 0.3966099682084218, + "grad_norm": 0.7421875, + "learning_rate": 0.00017314550103461848, + "loss": 0.8004, + "step": 15446 + }, + { + "epoch": 0.3966356454043436, + "grad_norm": 0.796875, + "learning_rate": 0.00017314245686856787, + "loss": 0.881, + "step": 15447 + }, + { + "epoch": 0.39666132260026543, + "grad_norm": 0.80859375, + "learning_rate": 0.00017313941255675097, + "loss": 0.9432, + "step": 15448 + }, + { + "epoch": 0.39668699979618727, + "grad_norm": 0.7734375, + "learning_rate": 0.0001731363680991738, + "loss": 0.7934, + "step": 15449 + }, + { + "epoch": 0.3967126769921091, + "grad_norm": 0.75390625, + "learning_rate": 0.00017313332349584248, + "loss": 0.8489, + "step": 15450 + }, + { + "epoch": 0.3967383541880309, + "grad_norm": 0.76171875, + "learning_rate": 0.00017313027874676305, + "loss": 0.9954, + "step": 15451 + }, + { + "epoch": 0.3967640313839527, + "grad_norm": 0.828125, + "learning_rate": 0.00017312723385194155, + "loss": 1.0055, + "step": 15452 + }, + { + "epoch": 0.39678970857987456, + "grad_norm": 0.828125, + "learning_rate": 0.00017312418881138412, + "loss": 0.9327, + "step": 15453 + }, + { + "epoch": 0.39681538577579634, + "grad_norm": 0.79296875, + "learning_rate": 0.00017312114362509674, + "loss": 0.8926, + "step": 15454 + }, + { + "epoch": 0.3968410629717182, + "grad_norm": 0.72265625, + "learning_rate": 0.00017311809829308555, + "loss": 0.8615, + "step": 15455 + }, + { + "epoch": 0.39686674016764, + "grad_norm": 0.76953125, + "learning_rate": 0.00017311505281535663, + "loss": 0.9583, + "step": 15456 + }, + { + "epoch": 0.3968924173635618, + "grad_norm": 0.77734375, + "learning_rate": 0.00017311200719191599, + "loss": 0.8809, + "step": 15457 + }, + { + "epoch": 0.3969180945594836, + "grad_norm": 0.72265625, + "learning_rate": 0.0001731089614227697, + "loss": 0.8791, + "step": 15458 + }, + { + "epoch": 0.39694377175540546, + "grad_norm": 0.76171875, + "learning_rate": 0.00017310591550792387, + "loss": 0.9154, + "step": 15459 + }, + { + "epoch": 0.3969694489513273, + "grad_norm": 0.78125, + "learning_rate": 0.00017310286944738457, + "loss": 0.9391, + "step": 15460 + }, + { + "epoch": 0.3969951261472491, + "grad_norm": 0.84765625, + "learning_rate": 0.00017309982324115782, + "loss": 0.8904, + "step": 15461 + }, + { + "epoch": 0.3970208033431709, + "grad_norm": 0.83203125, + "learning_rate": 0.00017309677688924973, + "loss": 0.8289, + "step": 15462 + }, + { + "epoch": 0.39704648053909275, + "grad_norm": 0.76171875, + "learning_rate": 0.0001730937303916664, + "loss": 0.8701, + "step": 15463 + }, + { + "epoch": 0.39707215773501453, + "grad_norm": 0.80859375, + "learning_rate": 0.00017309068374841386, + "loss": 0.9498, + "step": 15464 + }, + { + "epoch": 0.39709783493093637, + "grad_norm": 0.75390625, + "learning_rate": 0.00017308763695949816, + "loss": 0.7561, + "step": 15465 + }, + { + "epoch": 0.3971235121268582, + "grad_norm": 0.83984375, + "learning_rate": 0.00017308459002492543, + "loss": 1.0646, + "step": 15466 + }, + { + "epoch": 0.39714918932278, + "grad_norm": 0.74609375, + "learning_rate": 0.00017308154294470168, + "loss": 0.8135, + "step": 15467 + }, + { + "epoch": 0.3971748665187018, + "grad_norm": 0.79296875, + "learning_rate": 0.00017307849571883307, + "loss": 0.9899, + "step": 15468 + }, + { + "epoch": 0.39720054371462366, + "grad_norm": 0.83203125, + "learning_rate": 0.0001730754483473256, + "loss": 0.9742, + "step": 15469 + }, + { + "epoch": 0.3972262209105455, + "grad_norm": 0.734375, + "learning_rate": 0.00017307240083018534, + "loss": 0.961, + "step": 15470 + }, + { + "epoch": 0.39725189810646727, + "grad_norm": 0.82421875, + "learning_rate": 0.00017306935316741838, + "loss": 0.859, + "step": 15471 + }, + { + "epoch": 0.3972775753023891, + "grad_norm": 0.79296875, + "learning_rate": 0.00017306630535903083, + "loss": 0.9943, + "step": 15472 + }, + { + "epoch": 0.39730325249831094, + "grad_norm": 0.8125, + "learning_rate": 0.00017306325740502874, + "loss": 0.9326, + "step": 15473 + }, + { + "epoch": 0.3973289296942327, + "grad_norm": 0.7578125, + "learning_rate": 0.00017306020930541816, + "loss": 0.8322, + "step": 15474 + }, + { + "epoch": 0.39735460689015456, + "grad_norm": 0.77734375, + "learning_rate": 0.00017305716106020516, + "loss": 1.0587, + "step": 15475 + }, + { + "epoch": 0.3973802840860764, + "grad_norm": 0.8203125, + "learning_rate": 0.00017305411266939585, + "loss": 0.9718, + "step": 15476 + }, + { + "epoch": 0.3974059612819982, + "grad_norm": 0.71875, + "learning_rate": 0.00017305106413299632, + "loss": 0.8745, + "step": 15477 + }, + { + "epoch": 0.39743163847792, + "grad_norm": 0.66796875, + "learning_rate": 0.0001730480154510126, + "loss": 0.8217, + "step": 15478 + }, + { + "epoch": 0.39745731567384185, + "grad_norm": 0.703125, + "learning_rate": 0.0001730449666234508, + "loss": 0.8261, + "step": 15479 + }, + { + "epoch": 0.3974829928697637, + "grad_norm": 0.69140625, + "learning_rate": 0.00017304191765031695, + "loss": 0.8662, + "step": 15480 + }, + { + "epoch": 0.39750867006568547, + "grad_norm": 0.7734375, + "learning_rate": 0.00017303886853161716, + "loss": 0.9559, + "step": 15481 + }, + { + "epoch": 0.3975343472616073, + "grad_norm": 0.81640625, + "learning_rate": 0.0001730358192673575, + "loss": 0.903, + "step": 15482 + }, + { + "epoch": 0.39756002445752914, + "grad_norm": 0.7890625, + "learning_rate": 0.00017303276985754405, + "loss": 0.8661, + "step": 15483 + }, + { + "epoch": 0.3975857016534509, + "grad_norm": 0.94140625, + "learning_rate": 0.0001730297203021829, + "loss": 0.9463, + "step": 15484 + }, + { + "epoch": 0.39761137884937275, + "grad_norm": 0.796875, + "learning_rate": 0.0001730266706012801, + "loss": 0.9133, + "step": 15485 + }, + { + "epoch": 0.3976370560452946, + "grad_norm": 0.8125, + "learning_rate": 0.00017302362075484178, + "loss": 1.1405, + "step": 15486 + }, + { + "epoch": 0.39766273324121637, + "grad_norm": 0.8125, + "learning_rate": 0.00017302057076287394, + "loss": 0.9705, + "step": 15487 + }, + { + "epoch": 0.3976884104371382, + "grad_norm": 0.734375, + "learning_rate": 0.0001730175206253827, + "loss": 0.9154, + "step": 15488 + }, + { + "epoch": 0.39771408763306004, + "grad_norm": 0.8203125, + "learning_rate": 0.00017301447034237416, + "loss": 0.9388, + "step": 15489 + }, + { + "epoch": 0.3977397648289819, + "grad_norm": 0.7578125, + "learning_rate": 0.00017301141991385435, + "loss": 1.0102, + "step": 15490 + }, + { + "epoch": 0.39776544202490366, + "grad_norm": 0.765625, + "learning_rate": 0.0001730083693398294, + "loss": 1.0174, + "step": 15491 + }, + { + "epoch": 0.3977911192208255, + "grad_norm": 0.82421875, + "learning_rate": 0.00017300531862030533, + "loss": 0.8994, + "step": 15492 + }, + { + "epoch": 0.39781679641674733, + "grad_norm": 0.75, + "learning_rate": 0.00017300226775528833, + "loss": 0.9426, + "step": 15493 + }, + { + "epoch": 0.3978424736126691, + "grad_norm": 0.7265625, + "learning_rate": 0.00017299921674478433, + "loss": 1.1122, + "step": 15494 + }, + { + "epoch": 0.39786815080859095, + "grad_norm": 0.8515625, + "learning_rate": 0.0001729961655887995, + "loss": 0.9766, + "step": 15495 + }, + { + "epoch": 0.3978938280045128, + "grad_norm": 0.75, + "learning_rate": 0.0001729931142873399, + "loss": 0.9779, + "step": 15496 + }, + { + "epoch": 0.39791950520043456, + "grad_norm": 0.80859375, + "learning_rate": 0.00017299006284041165, + "loss": 0.9227, + "step": 15497 + }, + { + "epoch": 0.3979451823963564, + "grad_norm": 0.7109375, + "learning_rate": 0.00017298701124802076, + "loss": 0.9247, + "step": 15498 + }, + { + "epoch": 0.39797085959227824, + "grad_norm": 0.75, + "learning_rate": 0.0001729839595101734, + "loss": 0.7539, + "step": 15499 + }, + { + "epoch": 0.3979965367882, + "grad_norm": 0.796875, + "learning_rate": 0.00017298090762687553, + "loss": 0.9289, + "step": 15500 + }, + { + "epoch": 0.39802221398412185, + "grad_norm": 0.83984375, + "learning_rate": 0.00017297785559813335, + "loss": 1.0712, + "step": 15501 + }, + { + "epoch": 0.3980478911800437, + "grad_norm": 0.84375, + "learning_rate": 0.00017297480342395288, + "loss": 0.9413, + "step": 15502 + }, + { + "epoch": 0.3980735683759655, + "grad_norm": 0.734375, + "learning_rate": 0.00017297175110434022, + "loss": 0.8608, + "step": 15503 + }, + { + "epoch": 0.3980992455718873, + "grad_norm": 0.7578125, + "learning_rate": 0.0001729686986393015, + "loss": 0.889, + "step": 15504 + }, + { + "epoch": 0.39812492276780914, + "grad_norm": 0.79296875, + "learning_rate": 0.0001729656460288427, + "loss": 0.9508, + "step": 15505 + }, + { + "epoch": 0.398150599963731, + "grad_norm": 0.81640625, + "learning_rate": 0.00017296259327296998, + "loss": 1.0021, + "step": 15506 + }, + { + "epoch": 0.39817627715965276, + "grad_norm": 2.59375, + "learning_rate": 0.00017295954037168938, + "loss": 1.0458, + "step": 15507 + }, + { + "epoch": 0.3982019543555746, + "grad_norm": 0.75390625, + "learning_rate": 0.00017295648732500702, + "loss": 0.8432, + "step": 15508 + }, + { + "epoch": 0.39822763155149643, + "grad_norm": 0.76953125, + "learning_rate": 0.00017295343413292898, + "loss": 0.8505, + "step": 15509 + }, + { + "epoch": 0.3982533087474182, + "grad_norm": 0.75390625, + "learning_rate": 0.00017295038079546133, + "loss": 0.9204, + "step": 15510 + }, + { + "epoch": 0.39827898594334005, + "grad_norm": 0.890625, + "learning_rate": 0.00017294732731261013, + "loss": 1.0088, + "step": 15511 + }, + { + "epoch": 0.3983046631392619, + "grad_norm": 0.79296875, + "learning_rate": 0.00017294427368438154, + "loss": 0.8412, + "step": 15512 + }, + { + "epoch": 0.3983303403351837, + "grad_norm": 0.83203125, + "learning_rate": 0.00017294121991078158, + "loss": 0.8774, + "step": 15513 + }, + { + "epoch": 0.3983560175311055, + "grad_norm": 0.84765625, + "learning_rate": 0.00017293816599181635, + "loss": 0.9717, + "step": 15514 + }, + { + "epoch": 0.39838169472702734, + "grad_norm": 0.78515625, + "learning_rate": 0.00017293511192749197, + "loss": 0.844, + "step": 15515 + }, + { + "epoch": 0.39840737192294917, + "grad_norm": 0.8125, + "learning_rate": 0.00017293205771781449, + "loss": 0.9495, + "step": 15516 + }, + { + "epoch": 0.39843304911887095, + "grad_norm": 0.71484375, + "learning_rate": 0.00017292900336279, + "loss": 0.8809, + "step": 15517 + }, + { + "epoch": 0.3984587263147928, + "grad_norm": 0.82421875, + "learning_rate": 0.0001729259488624246, + "loss": 1.0137, + "step": 15518 + }, + { + "epoch": 0.3984844035107146, + "grad_norm": 0.671875, + "learning_rate": 0.00017292289421672437, + "loss": 0.8756, + "step": 15519 + }, + { + "epoch": 0.3985100807066364, + "grad_norm": 0.796875, + "learning_rate": 0.00017291983942569538, + "loss": 0.9557, + "step": 15520 + }, + { + "epoch": 0.39853575790255824, + "grad_norm": 0.76171875, + "learning_rate": 0.00017291678448934378, + "loss": 0.9511, + "step": 15521 + }, + { + "epoch": 0.3985614350984801, + "grad_norm": 0.765625, + "learning_rate": 0.00017291372940767555, + "loss": 0.8528, + "step": 15522 + }, + { + "epoch": 0.3985871122944019, + "grad_norm": 0.7734375, + "learning_rate": 0.00017291067418069692, + "loss": 0.9397, + "step": 15523 + }, + { + "epoch": 0.3986127894903237, + "grad_norm": 0.7578125, + "learning_rate": 0.00017290761880841383, + "loss": 0.9973, + "step": 15524 + }, + { + "epoch": 0.39863846668624553, + "grad_norm": 0.78125, + "learning_rate": 0.0001729045632908325, + "loss": 0.8279, + "step": 15525 + }, + { + "epoch": 0.39866414388216737, + "grad_norm": 0.796875, + "learning_rate": 0.00017290150762795894, + "loss": 1.0011, + "step": 15526 + }, + { + "epoch": 0.39868982107808915, + "grad_norm": 0.7890625, + "learning_rate": 0.00017289845181979924, + "loss": 1.0223, + "step": 15527 + }, + { + "epoch": 0.398715498274011, + "grad_norm": 0.79296875, + "learning_rate": 0.00017289539586635955, + "loss": 0.8763, + "step": 15528 + }, + { + "epoch": 0.3987411754699328, + "grad_norm": 0.7109375, + "learning_rate": 0.0001728923397676459, + "loss": 0.8818, + "step": 15529 + }, + { + "epoch": 0.3987668526658546, + "grad_norm": 0.79296875, + "learning_rate": 0.0001728892835236644, + "loss": 0.8919, + "step": 15530 + }, + { + "epoch": 0.39879252986177643, + "grad_norm": 0.73046875, + "learning_rate": 0.00017288622713442113, + "loss": 0.8936, + "step": 15531 + }, + { + "epoch": 0.39881820705769827, + "grad_norm": 0.8046875, + "learning_rate": 0.00017288317059992221, + "loss": 0.9125, + "step": 15532 + }, + { + "epoch": 0.3988438842536201, + "grad_norm": 0.76171875, + "learning_rate": 0.0001728801139201737, + "loss": 0.9594, + "step": 15533 + }, + { + "epoch": 0.3988695614495419, + "grad_norm": 0.84375, + "learning_rate": 0.0001728770570951817, + "loss": 0.893, + "step": 15534 + }, + { + "epoch": 0.3988952386454637, + "grad_norm": 0.7421875, + "learning_rate": 0.00017287400012495235, + "loss": 0.9709, + "step": 15535 + }, + { + "epoch": 0.39892091584138556, + "grad_norm": 0.76953125, + "learning_rate": 0.00017287094300949164, + "loss": 0.8169, + "step": 15536 + }, + { + "epoch": 0.39894659303730734, + "grad_norm": 0.74609375, + "learning_rate": 0.0001728678857488058, + "loss": 0.8448, + "step": 15537 + }, + { + "epoch": 0.3989722702332292, + "grad_norm": 0.7734375, + "learning_rate": 0.00017286482834290079, + "loss": 0.8242, + "step": 15538 + }, + { + "epoch": 0.398997947429151, + "grad_norm": 0.765625, + "learning_rate": 0.00017286177079178276, + "loss": 0.9159, + "step": 15539 + }, + { + "epoch": 0.3990236246250728, + "grad_norm": 0.72265625, + "learning_rate": 0.0001728587130954578, + "loss": 0.8982, + "step": 15540 + }, + { + "epoch": 0.39904930182099463, + "grad_norm": 0.796875, + "learning_rate": 0.00017285565525393203, + "loss": 0.953, + "step": 15541 + }, + { + "epoch": 0.39907497901691646, + "grad_norm": 0.83203125, + "learning_rate": 0.00017285259726721152, + "loss": 0.9519, + "step": 15542 + }, + { + "epoch": 0.3991006562128383, + "grad_norm": 0.80859375, + "learning_rate": 0.00017284953913530236, + "loss": 1.0494, + "step": 15543 + }, + { + "epoch": 0.3991263334087601, + "grad_norm": 0.75, + "learning_rate": 0.00017284648085821064, + "loss": 0.8712, + "step": 15544 + }, + { + "epoch": 0.3991520106046819, + "grad_norm": 0.828125, + "learning_rate": 0.00017284342243594248, + "loss": 1.132, + "step": 15545 + }, + { + "epoch": 0.39917768780060375, + "grad_norm": 0.83984375, + "learning_rate": 0.00017284036386850394, + "loss": 1.0706, + "step": 15546 + }, + { + "epoch": 0.39920336499652553, + "grad_norm": 0.875, + "learning_rate": 0.00017283730515590113, + "loss": 0.8631, + "step": 15547 + }, + { + "epoch": 0.39922904219244737, + "grad_norm": 0.87109375, + "learning_rate": 0.0001728342462981402, + "loss": 0.9123, + "step": 15548 + }, + { + "epoch": 0.3992547193883692, + "grad_norm": 0.79296875, + "learning_rate": 0.00017283118729522712, + "loss": 0.9433, + "step": 15549 + }, + { + "epoch": 0.399280396584291, + "grad_norm": 0.8125, + "learning_rate": 0.00017282812814716812, + "loss": 0.9037, + "step": 15550 + }, + { + "epoch": 0.3993060737802128, + "grad_norm": 0.8203125, + "learning_rate": 0.0001728250688539692, + "loss": 0.9286, + "step": 15551 + }, + { + "epoch": 0.39933175097613466, + "grad_norm": 0.81640625, + "learning_rate": 0.00017282200941563653, + "loss": 0.9539, + "step": 15552 + }, + { + "epoch": 0.3993574281720565, + "grad_norm": 0.7890625, + "learning_rate": 0.00017281894983217614, + "loss": 0.9277, + "step": 15553 + }, + { + "epoch": 0.3993831053679783, + "grad_norm": 0.796875, + "learning_rate": 0.0001728158901035942, + "loss": 0.9444, + "step": 15554 + }, + { + "epoch": 0.3994087825639001, + "grad_norm": 0.78125, + "learning_rate": 0.00017281283022989674, + "loss": 1.0236, + "step": 15555 + }, + { + "epoch": 0.39943445975982195, + "grad_norm": 0.83203125, + "learning_rate": 0.00017280977021108987, + "loss": 0.9899, + "step": 15556 + }, + { + "epoch": 0.3994601369557437, + "grad_norm": 0.84375, + "learning_rate": 0.00017280671004717976, + "loss": 1.0028, + "step": 15557 + }, + { + "epoch": 0.39948581415166556, + "grad_norm": 0.72265625, + "learning_rate": 0.00017280364973817244, + "loss": 0.8388, + "step": 15558 + }, + { + "epoch": 0.3995114913475874, + "grad_norm": 0.7578125, + "learning_rate": 0.00017280058928407396, + "loss": 0.8503, + "step": 15559 + }, + { + "epoch": 0.3995371685435092, + "grad_norm": 0.71875, + "learning_rate": 0.00017279752868489055, + "loss": 0.997, + "step": 15560 + }, + { + "epoch": 0.399562845739431, + "grad_norm": 0.75390625, + "learning_rate": 0.0001727944679406282, + "loss": 0.879, + "step": 15561 + }, + { + "epoch": 0.39958852293535285, + "grad_norm": 0.71875, + "learning_rate": 0.00017279140705129308, + "loss": 0.8189, + "step": 15562 + }, + { + "epoch": 0.3996142001312747, + "grad_norm": 0.8046875, + "learning_rate": 0.00017278834601689125, + "loss": 0.8866, + "step": 15563 + }, + { + "epoch": 0.39963987732719647, + "grad_norm": 0.8359375, + "learning_rate": 0.00017278528483742882, + "loss": 0.9935, + "step": 15564 + }, + { + "epoch": 0.3996655545231183, + "grad_norm": 0.77734375, + "learning_rate": 0.00017278222351291191, + "loss": 0.9749, + "step": 15565 + }, + { + "epoch": 0.39969123171904014, + "grad_norm": 0.74609375, + "learning_rate": 0.00017277916204334656, + "loss": 0.9456, + "step": 15566 + }, + { + "epoch": 0.3997169089149619, + "grad_norm": 0.75390625, + "learning_rate": 0.00017277610042873892, + "loss": 0.8173, + "step": 15567 + }, + { + "epoch": 0.39974258611088376, + "grad_norm": 0.83203125, + "learning_rate": 0.00017277303866909514, + "loss": 0.8356, + "step": 15568 + }, + { + "epoch": 0.3997682633068056, + "grad_norm": 0.77734375, + "learning_rate": 0.00017276997676442122, + "loss": 1.0936, + "step": 15569 + }, + { + "epoch": 0.3997939405027274, + "grad_norm": 0.84765625, + "learning_rate": 0.00017276691471472332, + "loss": 0.9758, + "step": 15570 + }, + { + "epoch": 0.3998196176986492, + "grad_norm": 0.83203125, + "learning_rate": 0.00017276385252000756, + "loss": 0.8231, + "step": 15571 + }, + { + "epoch": 0.39984529489457105, + "grad_norm": 0.7734375, + "learning_rate": 0.00017276079018027998, + "loss": 0.7541, + "step": 15572 + }, + { + "epoch": 0.3998709720904929, + "grad_norm": 0.72265625, + "learning_rate": 0.00017275772769554672, + "loss": 0.8869, + "step": 15573 + }, + { + "epoch": 0.39989664928641466, + "grad_norm": 0.75390625, + "learning_rate": 0.00017275466506581387, + "loss": 0.9723, + "step": 15574 + }, + { + "epoch": 0.3999223264823365, + "grad_norm": 0.79296875, + "learning_rate": 0.00017275160229108757, + "loss": 1.0267, + "step": 15575 + }, + { + "epoch": 0.39994800367825833, + "grad_norm": 0.8046875, + "learning_rate": 0.0001727485393713739, + "loss": 0.9515, + "step": 15576 + }, + { + "epoch": 0.3999736808741801, + "grad_norm": 0.7890625, + "learning_rate": 0.00017274547630667897, + "loss": 0.9767, + "step": 15577 + }, + { + "epoch": 0.39999935807010195, + "grad_norm": 0.80859375, + "learning_rate": 0.00017274241309700884, + "loss": 0.8436, + "step": 15578 + }, + { + "epoch": 0.4000250352660238, + "grad_norm": 0.8125, + "learning_rate": 0.00017273934974236968, + "loss": 0.9746, + "step": 15579 + }, + { + "epoch": 0.40005071246194557, + "grad_norm": 0.8203125, + "learning_rate": 0.00017273628624276755, + "loss": 0.8816, + "step": 15580 + }, + { + "epoch": 0.4000763896578674, + "grad_norm": 0.80859375, + "learning_rate": 0.00017273322259820856, + "loss": 0.8839, + "step": 15581 + }, + { + "epoch": 0.40010206685378924, + "grad_norm": 0.7421875, + "learning_rate": 0.00017273015880869887, + "loss": 0.9215, + "step": 15582 + }, + { + "epoch": 0.4001277440497111, + "grad_norm": 0.7421875, + "learning_rate": 0.00017272709487424452, + "loss": 0.9427, + "step": 15583 + }, + { + "epoch": 0.40015342124563286, + "grad_norm": 0.765625, + "learning_rate": 0.00017272403079485164, + "loss": 0.9004, + "step": 15584 + }, + { + "epoch": 0.4001790984415547, + "grad_norm": 0.79296875, + "learning_rate": 0.00017272096657052632, + "loss": 1.0099, + "step": 15585 + }, + { + "epoch": 0.40020477563747653, + "grad_norm": 0.82421875, + "learning_rate": 0.0001727179022012747, + "loss": 0.9528, + "step": 15586 + }, + { + "epoch": 0.4002304528333983, + "grad_norm": 0.74609375, + "learning_rate": 0.00017271483768710287, + "loss": 0.9535, + "step": 15587 + }, + { + "epoch": 0.40025613002932015, + "grad_norm": 0.7421875, + "learning_rate": 0.00017271177302801696, + "loss": 0.9586, + "step": 15588 + }, + { + "epoch": 0.400281807225242, + "grad_norm": 0.7890625, + "learning_rate": 0.000172708708224023, + "loss": 0.9507, + "step": 15589 + }, + { + "epoch": 0.40030748442116376, + "grad_norm": 0.81640625, + "learning_rate": 0.0001727056432751272, + "loss": 1.0125, + "step": 15590 + }, + { + "epoch": 0.4003331616170856, + "grad_norm": 0.8046875, + "learning_rate": 0.0001727025781813356, + "loss": 0.9503, + "step": 15591 + }, + { + "epoch": 0.40035883881300743, + "grad_norm": 0.79296875, + "learning_rate": 0.00017269951294265432, + "loss": 0.9581, + "step": 15592 + }, + { + "epoch": 0.40038451600892927, + "grad_norm": 0.73828125, + "learning_rate": 0.0001726964475590895, + "loss": 0.9921, + "step": 15593 + }, + { + "epoch": 0.40041019320485105, + "grad_norm": 0.7421875, + "learning_rate": 0.0001726933820306472, + "loss": 0.9764, + "step": 15594 + }, + { + "epoch": 0.4004358704007729, + "grad_norm": 0.80859375, + "learning_rate": 0.00017269031635733356, + "loss": 0.8834, + "step": 15595 + }, + { + "epoch": 0.4004615475966947, + "grad_norm": 0.828125, + "learning_rate": 0.0001726872505391547, + "loss": 1.0585, + "step": 15596 + }, + { + "epoch": 0.4004872247926165, + "grad_norm": 0.8125, + "learning_rate": 0.00017268418457611672, + "loss": 0.7939, + "step": 15597 + }, + { + "epoch": 0.40051290198853834, + "grad_norm": 0.796875, + "learning_rate": 0.00017268111846822573, + "loss": 0.912, + "step": 15598 + }, + { + "epoch": 0.4005385791844602, + "grad_norm": 0.87890625, + "learning_rate": 0.0001726780522154878, + "loss": 0.9537, + "step": 15599 + }, + { + "epoch": 0.40056425638038196, + "grad_norm": 0.77734375, + "learning_rate": 0.0001726749858179091, + "loss": 1.046, + "step": 15600 + }, + { + "epoch": 0.4005899335763038, + "grad_norm": 0.76953125, + "learning_rate": 0.0001726719192754957, + "loss": 0.9424, + "step": 15601 + }, + { + "epoch": 0.4006156107722256, + "grad_norm": 0.828125, + "learning_rate": 0.00017266885258825378, + "loss": 0.8608, + "step": 15602 + }, + { + "epoch": 0.40064128796814746, + "grad_norm": 0.78515625, + "learning_rate": 0.00017266578575618935, + "loss": 0.9803, + "step": 15603 + }, + { + "epoch": 0.40066696516406924, + "grad_norm": 0.7421875, + "learning_rate": 0.0001726627187793086, + "loss": 0.8221, + "step": 15604 + }, + { + "epoch": 0.4006926423599911, + "grad_norm": 0.74609375, + "learning_rate": 0.0001726596516576176, + "loss": 0.8459, + "step": 15605 + }, + { + "epoch": 0.4007183195559129, + "grad_norm": 0.75, + "learning_rate": 0.0001726565843911225, + "loss": 0.9265, + "step": 15606 + }, + { + "epoch": 0.4007439967518347, + "grad_norm": 0.7890625, + "learning_rate": 0.00017265351697982938, + "loss": 0.8486, + "step": 15607 + }, + { + "epoch": 0.40076967394775653, + "grad_norm": 0.8125, + "learning_rate": 0.00017265044942374434, + "loss": 0.9313, + "step": 15608 + }, + { + "epoch": 0.40079535114367837, + "grad_norm": 0.75390625, + "learning_rate": 0.00017264738172287353, + "loss": 0.8566, + "step": 15609 + }, + { + "epoch": 0.40082102833960015, + "grad_norm": 0.890625, + "learning_rate": 0.00017264431387722305, + "loss": 1.0683, + "step": 15610 + }, + { + "epoch": 0.400846705535522, + "grad_norm": 0.83984375, + "learning_rate": 0.00017264124588679903, + "loss": 0.9914, + "step": 15611 + }, + { + "epoch": 0.4008723827314438, + "grad_norm": 0.8203125, + "learning_rate": 0.00017263817775160756, + "loss": 0.9086, + "step": 15612 + }, + { + "epoch": 0.40089805992736566, + "grad_norm": 0.84765625, + "learning_rate": 0.00017263510947165475, + "loss": 0.9465, + "step": 15613 + }, + { + "epoch": 0.40092373712328744, + "grad_norm": 0.86328125, + "learning_rate": 0.00017263204104694675, + "loss": 0.9636, + "step": 15614 + }, + { + "epoch": 0.4009494143192093, + "grad_norm": 0.8671875, + "learning_rate": 0.00017262897247748962, + "loss": 0.919, + "step": 15615 + }, + { + "epoch": 0.4009750915151311, + "grad_norm": 0.8203125, + "learning_rate": 0.00017262590376328953, + "loss": 0.9192, + "step": 15616 + }, + { + "epoch": 0.4010007687110529, + "grad_norm": 0.77734375, + "learning_rate": 0.0001726228349043526, + "loss": 0.992, + "step": 15617 + }, + { + "epoch": 0.4010264459069747, + "grad_norm": 0.796875, + "learning_rate": 0.00017261976590068488, + "loss": 0.8589, + "step": 15618 + }, + { + "epoch": 0.40105212310289656, + "grad_norm": 0.78515625, + "learning_rate": 0.00017261669675229256, + "loss": 1.0411, + "step": 15619 + }, + { + "epoch": 0.40107780029881834, + "grad_norm": 0.84765625, + "learning_rate": 0.00017261362745918168, + "loss": 0.8318, + "step": 15620 + }, + { + "epoch": 0.4011034774947402, + "grad_norm": 0.75390625, + "learning_rate": 0.00017261055802135844, + "loss": 0.9958, + "step": 15621 + }, + { + "epoch": 0.401129154690662, + "grad_norm": 0.76953125, + "learning_rate": 0.0001726074884388289, + "loss": 1.0146, + "step": 15622 + }, + { + "epoch": 0.40115483188658385, + "grad_norm": 0.734375, + "learning_rate": 0.0001726044187115992, + "loss": 1.0026, + "step": 15623 + }, + { + "epoch": 0.40118050908250563, + "grad_norm": 0.77734375, + "learning_rate": 0.00017260134883967542, + "loss": 0.8872, + "step": 15624 + }, + { + "epoch": 0.40120618627842747, + "grad_norm": 0.96484375, + "learning_rate": 0.00017259827882306375, + "loss": 1.0725, + "step": 15625 + }, + { + "epoch": 0.4012318634743493, + "grad_norm": 0.7734375, + "learning_rate": 0.00017259520866177023, + "loss": 0.9083, + "step": 15626 + }, + { + "epoch": 0.4012575406702711, + "grad_norm": 0.78125, + "learning_rate": 0.00017259213835580105, + "loss": 0.9217, + "step": 15627 + }, + { + "epoch": 0.4012832178661929, + "grad_norm": 0.75, + "learning_rate": 0.0001725890679051623, + "loss": 0.8674, + "step": 15628 + }, + { + "epoch": 0.40130889506211476, + "grad_norm": 0.70703125, + "learning_rate": 0.00017258599730986008, + "loss": 0.9057, + "step": 15629 + }, + { + "epoch": 0.40133457225803654, + "grad_norm": 0.7890625, + "learning_rate": 0.0001725829265699005, + "loss": 0.8605, + "step": 15630 + }, + { + "epoch": 0.4013602494539584, + "grad_norm": 0.76953125, + "learning_rate": 0.00017257985568528973, + "loss": 0.8271, + "step": 15631 + }, + { + "epoch": 0.4013859266498802, + "grad_norm": 0.765625, + "learning_rate": 0.00017257678465603386, + "loss": 0.7503, + "step": 15632 + }, + { + "epoch": 0.40141160384580205, + "grad_norm": 0.69921875, + "learning_rate": 0.00017257371348213901, + "loss": 0.8277, + "step": 15633 + }, + { + "epoch": 0.4014372810417238, + "grad_norm": 0.78515625, + "learning_rate": 0.00017257064216361133, + "loss": 0.9864, + "step": 15634 + }, + { + "epoch": 0.40146295823764566, + "grad_norm": 0.80859375, + "learning_rate": 0.0001725675707004569, + "loss": 0.899, + "step": 15635 + }, + { + "epoch": 0.4014886354335675, + "grad_norm": 0.8046875, + "learning_rate": 0.00017256449909268184, + "loss": 1.0193, + "step": 15636 + }, + { + "epoch": 0.4015143126294893, + "grad_norm": 0.734375, + "learning_rate": 0.0001725614273402923, + "loss": 0.8254, + "step": 15637 + }, + { + "epoch": 0.4015399898254111, + "grad_norm": 0.74609375, + "learning_rate": 0.00017255835544329438, + "loss": 0.8946, + "step": 15638 + }, + { + "epoch": 0.40156566702133295, + "grad_norm": 0.7734375, + "learning_rate": 0.00017255528340169422, + "loss": 0.9178, + "step": 15639 + }, + { + "epoch": 0.40159134421725473, + "grad_norm": 0.70703125, + "learning_rate": 0.00017255221121549797, + "loss": 0.7947, + "step": 15640 + }, + { + "epoch": 0.40161702141317657, + "grad_norm": 0.75390625, + "learning_rate": 0.00017254913888471168, + "loss": 1.0419, + "step": 15641 + }, + { + "epoch": 0.4016426986090984, + "grad_norm": 0.78125, + "learning_rate": 0.0001725460664093415, + "loss": 0.9563, + "step": 15642 + }, + { + "epoch": 0.40166837580502024, + "grad_norm": 0.7578125, + "learning_rate": 0.00017254299378939358, + "loss": 1.0264, + "step": 15643 + }, + { + "epoch": 0.401694053000942, + "grad_norm": 0.78515625, + "learning_rate": 0.00017253992102487403, + "loss": 0.9478, + "step": 15644 + }, + { + "epoch": 0.40171973019686386, + "grad_norm": 0.7265625, + "learning_rate": 0.00017253684811578893, + "loss": 0.9504, + "step": 15645 + }, + { + "epoch": 0.4017454073927857, + "grad_norm": 0.7890625, + "learning_rate": 0.0001725337750621445, + "loss": 0.932, + "step": 15646 + }, + { + "epoch": 0.40177108458870747, + "grad_norm": 0.8125, + "learning_rate": 0.00017253070186394677, + "loss": 0.9063, + "step": 15647 + }, + { + "epoch": 0.4017967617846293, + "grad_norm": 0.78125, + "learning_rate": 0.00017252762852120192, + "loss": 0.8689, + "step": 15648 + }, + { + "epoch": 0.40182243898055114, + "grad_norm": 0.71484375, + "learning_rate": 0.00017252455503391603, + "loss": 0.8743, + "step": 15649 + }, + { + "epoch": 0.4018481161764729, + "grad_norm": 0.7578125, + "learning_rate": 0.00017252148140209532, + "loss": 0.9961, + "step": 15650 + }, + { + "epoch": 0.40187379337239476, + "grad_norm": 0.78125, + "learning_rate": 0.00017251840762574577, + "loss": 1.0169, + "step": 15651 + }, + { + "epoch": 0.4018994705683166, + "grad_norm": 0.80859375, + "learning_rate": 0.00017251533370487364, + "loss": 0.976, + "step": 15652 + }, + { + "epoch": 0.40192514776423843, + "grad_norm": 0.80078125, + "learning_rate": 0.00017251225963948498, + "loss": 0.9222, + "step": 15653 + }, + { + "epoch": 0.4019508249601602, + "grad_norm": 0.77734375, + "learning_rate": 0.00017250918542958594, + "loss": 0.9634, + "step": 15654 + }, + { + "epoch": 0.40197650215608205, + "grad_norm": 0.78125, + "learning_rate": 0.0001725061110751826, + "loss": 1.0697, + "step": 15655 + }, + { + "epoch": 0.4020021793520039, + "grad_norm": 0.76171875, + "learning_rate": 0.0001725030365762812, + "loss": 0.9513, + "step": 15656 + }, + { + "epoch": 0.40202785654792567, + "grad_norm": 0.72265625, + "learning_rate": 0.00017249996193288776, + "loss": 0.8821, + "step": 15657 + }, + { + "epoch": 0.4020535337438475, + "grad_norm": 0.80859375, + "learning_rate": 0.00017249688714500848, + "loss": 0.8172, + "step": 15658 + }, + { + "epoch": 0.40207921093976934, + "grad_norm": 3.6875, + "learning_rate": 0.0001724938122126494, + "loss": 1.0589, + "step": 15659 + }, + { + "epoch": 0.4021048881356911, + "grad_norm": 0.7890625, + "learning_rate": 0.00017249073713581674, + "loss": 0.8813, + "step": 15660 + }, + { + "epoch": 0.40213056533161295, + "grad_norm": 0.8515625, + "learning_rate": 0.00017248766191451656, + "loss": 1.0557, + "step": 15661 + }, + { + "epoch": 0.4021562425275348, + "grad_norm": 0.83984375, + "learning_rate": 0.00017248458654875502, + "loss": 1.0593, + "step": 15662 + }, + { + "epoch": 0.4021819197234566, + "grad_norm": 0.80859375, + "learning_rate": 0.00017248151103853827, + "loss": 0.8838, + "step": 15663 + }, + { + "epoch": 0.4022075969193784, + "grad_norm": 0.8125, + "learning_rate": 0.00017247843538387237, + "loss": 0.9466, + "step": 15664 + }, + { + "epoch": 0.40223327411530024, + "grad_norm": 0.7890625, + "learning_rate": 0.00017247535958476355, + "loss": 1.008, + "step": 15665 + }, + { + "epoch": 0.4022589513112221, + "grad_norm": 0.8046875, + "learning_rate": 0.0001724722836412179, + "loss": 0.9867, + "step": 15666 + }, + { + "epoch": 0.40228462850714386, + "grad_norm": 0.8125, + "learning_rate": 0.00017246920755324148, + "loss": 0.9183, + "step": 15667 + }, + { + "epoch": 0.4023103057030657, + "grad_norm": 0.76171875, + "learning_rate": 0.00017246613132084048, + "loss": 0.8761, + "step": 15668 + }, + { + "epoch": 0.40233598289898753, + "grad_norm": 0.76953125, + "learning_rate": 0.00017246305494402105, + "loss": 1.0183, + "step": 15669 + }, + { + "epoch": 0.4023616600949093, + "grad_norm": 0.7890625, + "learning_rate": 0.0001724599784227893, + "loss": 0.8875, + "step": 15670 + }, + { + "epoch": 0.40238733729083115, + "grad_norm": 0.734375, + "learning_rate": 0.00017245690175715132, + "loss": 0.9542, + "step": 15671 + }, + { + "epoch": 0.402413014486753, + "grad_norm": 0.8046875, + "learning_rate": 0.00017245382494711332, + "loss": 0.8922, + "step": 15672 + }, + { + "epoch": 0.4024386916826748, + "grad_norm": 0.7578125, + "learning_rate": 0.00017245074799268135, + "loss": 0.8997, + "step": 15673 + }, + { + "epoch": 0.4024643688785966, + "grad_norm": 0.76953125, + "learning_rate": 0.00017244767089386162, + "loss": 0.9377, + "step": 15674 + }, + { + "epoch": 0.40249004607451844, + "grad_norm": 0.71875, + "learning_rate": 0.00017244459365066024, + "loss": 0.9426, + "step": 15675 + }, + { + "epoch": 0.4025157232704403, + "grad_norm": 0.80859375, + "learning_rate": 0.0001724415162630833, + "loss": 0.839, + "step": 15676 + }, + { + "epoch": 0.40254140046636205, + "grad_norm": 0.875, + "learning_rate": 0.00017243843873113694, + "loss": 0.9584, + "step": 15677 + }, + { + "epoch": 0.4025670776622839, + "grad_norm": 0.78515625, + "learning_rate": 0.00017243536105482736, + "loss": 0.9481, + "step": 15678 + }, + { + "epoch": 0.4025927548582057, + "grad_norm": 0.77734375, + "learning_rate": 0.00017243228323416063, + "loss": 0.9629, + "step": 15679 + }, + { + "epoch": 0.4026184320541275, + "grad_norm": 0.79296875, + "learning_rate": 0.0001724292052691429, + "loss": 1.0906, + "step": 15680 + }, + { + "epoch": 0.40264410925004934, + "grad_norm": 0.80078125, + "learning_rate": 0.0001724261271597803, + "loss": 0.9414, + "step": 15681 + }, + { + "epoch": 0.4026697864459712, + "grad_norm": 0.8203125, + "learning_rate": 0.000172423048906079, + "loss": 0.9835, + "step": 15682 + }, + { + "epoch": 0.402695463641893, + "grad_norm": 0.7109375, + "learning_rate": 0.00017241997050804507, + "loss": 0.8253, + "step": 15683 + }, + { + "epoch": 0.4027211408378148, + "grad_norm": 0.796875, + "learning_rate": 0.0001724168919656847, + "loss": 0.9338, + "step": 15684 + }, + { + "epoch": 0.40274681803373663, + "grad_norm": 0.765625, + "learning_rate": 0.000172413813279004, + "loss": 0.9776, + "step": 15685 + }, + { + "epoch": 0.40277249522965847, + "grad_norm": 0.70703125, + "learning_rate": 0.00017241073444800912, + "loss": 0.9199, + "step": 15686 + }, + { + "epoch": 0.40279817242558025, + "grad_norm": 0.7734375, + "learning_rate": 0.0001724076554727062, + "loss": 0.928, + "step": 15687 + }, + { + "epoch": 0.4028238496215021, + "grad_norm": 0.71875, + "learning_rate": 0.0001724045763531013, + "loss": 0.7888, + "step": 15688 + }, + { + "epoch": 0.4028495268174239, + "grad_norm": 0.7578125, + "learning_rate": 0.00017240149708920067, + "loss": 0.9506, + "step": 15689 + }, + { + "epoch": 0.4028752040133457, + "grad_norm": 0.7890625, + "learning_rate": 0.00017239841768101042, + "loss": 1.0501, + "step": 15690 + }, + { + "epoch": 0.40290088120926754, + "grad_norm": 0.796875, + "learning_rate": 0.0001723953381285366, + "loss": 1.0337, + "step": 15691 + }, + { + "epoch": 0.4029265584051894, + "grad_norm": 0.80078125, + "learning_rate": 0.00017239225843178546, + "loss": 0.8763, + "step": 15692 + }, + { + "epoch": 0.4029522356011112, + "grad_norm": 0.81640625, + "learning_rate": 0.00017238917859076306, + "loss": 1.0357, + "step": 15693 + }, + { + "epoch": 0.402977912797033, + "grad_norm": 0.83984375, + "learning_rate": 0.00017238609860547558, + "loss": 0.9793, + "step": 15694 + }, + { + "epoch": 0.4030035899929548, + "grad_norm": 0.8203125, + "learning_rate": 0.00017238301847592914, + "loss": 1.0335, + "step": 15695 + }, + { + "epoch": 0.40302926718887666, + "grad_norm": 0.75390625, + "learning_rate": 0.00017237993820212987, + "loss": 1.0336, + "step": 15696 + }, + { + "epoch": 0.40305494438479844, + "grad_norm": 0.75, + "learning_rate": 0.00017237685778408393, + "loss": 0.9268, + "step": 15697 + }, + { + "epoch": 0.4030806215807203, + "grad_norm": 0.7734375, + "learning_rate": 0.00017237377722179747, + "loss": 0.9581, + "step": 15698 + }, + { + "epoch": 0.4031062987766421, + "grad_norm": 0.734375, + "learning_rate": 0.00017237069651527658, + "loss": 0.9076, + "step": 15699 + }, + { + "epoch": 0.4031319759725639, + "grad_norm": 0.8046875, + "learning_rate": 0.00017236761566452745, + "loss": 1.0126, + "step": 15700 + }, + { + "epoch": 0.40315765316848573, + "grad_norm": 0.7578125, + "learning_rate": 0.0001723645346695562, + "loss": 0.9321, + "step": 15701 + }, + { + "epoch": 0.40318333036440757, + "grad_norm": 0.7578125, + "learning_rate": 0.00017236145353036897, + "loss": 0.8628, + "step": 15702 + }, + { + "epoch": 0.40320900756032935, + "grad_norm": 0.7578125, + "learning_rate": 0.00017235837224697189, + "loss": 0.845, + "step": 15703 + }, + { + "epoch": 0.4032346847562512, + "grad_norm": 0.77734375, + "learning_rate": 0.00017235529081937112, + "loss": 0.9438, + "step": 15704 + }, + { + "epoch": 0.403260361952173, + "grad_norm": 0.91796875, + "learning_rate": 0.00017235220924757277, + "loss": 0.9769, + "step": 15705 + }, + { + "epoch": 0.40328603914809485, + "grad_norm": 0.80859375, + "learning_rate": 0.000172349127531583, + "loss": 0.8626, + "step": 15706 + }, + { + "epoch": 0.40331171634401664, + "grad_norm": 0.81640625, + "learning_rate": 0.000172346045671408, + "loss": 0.9231, + "step": 15707 + }, + { + "epoch": 0.40333739353993847, + "grad_norm": 0.734375, + "learning_rate": 0.0001723429636670538, + "loss": 0.9602, + "step": 15708 + }, + { + "epoch": 0.4033630707358603, + "grad_norm": 0.83984375, + "learning_rate": 0.00017233988151852664, + "loss": 0.9968, + "step": 15709 + }, + { + "epoch": 0.4033887479317821, + "grad_norm": 1.0078125, + "learning_rate": 0.00017233679922583266, + "loss": 1.0133, + "step": 15710 + }, + { + "epoch": 0.4034144251277039, + "grad_norm": 0.828125, + "learning_rate": 0.00017233371678897794, + "loss": 1.0594, + "step": 15711 + }, + { + "epoch": 0.40344010232362576, + "grad_norm": 0.82421875, + "learning_rate": 0.00017233063420796866, + "loss": 1.0415, + "step": 15712 + }, + { + "epoch": 0.40346577951954754, + "grad_norm": 0.8203125, + "learning_rate": 0.00017232755148281097, + "loss": 1.0729, + "step": 15713 + }, + { + "epoch": 0.4034914567154694, + "grad_norm": 0.77734375, + "learning_rate": 0.000172324468613511, + "loss": 0.9286, + "step": 15714 + }, + { + "epoch": 0.4035171339113912, + "grad_norm": 1.421875, + "learning_rate": 0.0001723213856000749, + "loss": 0.9029, + "step": 15715 + }, + { + "epoch": 0.40354281110731305, + "grad_norm": 0.8046875, + "learning_rate": 0.0001723183024425088, + "loss": 0.8453, + "step": 15716 + }, + { + "epoch": 0.40356848830323483, + "grad_norm": 0.83984375, + "learning_rate": 0.00017231521914081883, + "loss": 0.9712, + "step": 15717 + }, + { + "epoch": 0.40359416549915667, + "grad_norm": 0.94140625, + "learning_rate": 0.0001723121356950112, + "loss": 0.877, + "step": 15718 + }, + { + "epoch": 0.4036198426950785, + "grad_norm": 0.8125, + "learning_rate": 0.00017230905210509202, + "loss": 0.8676, + "step": 15719 + }, + { + "epoch": 0.4036455198910003, + "grad_norm": 0.796875, + "learning_rate": 0.0001723059683710674, + "loss": 0.8935, + "step": 15720 + }, + { + "epoch": 0.4036711970869221, + "grad_norm": 0.76171875, + "learning_rate": 0.00017230288449294353, + "loss": 0.8809, + "step": 15721 + }, + { + "epoch": 0.40369687428284395, + "grad_norm": 0.71875, + "learning_rate": 0.00017229980047072657, + "loss": 0.862, + "step": 15722 + }, + { + "epoch": 0.40372255147876573, + "grad_norm": 0.7578125, + "learning_rate": 0.0001722967163044226, + "loss": 1.0688, + "step": 15723 + }, + { + "epoch": 0.40374822867468757, + "grad_norm": 1.0390625, + "learning_rate": 0.00017229363199403782, + "loss": 0.9263, + "step": 15724 + }, + { + "epoch": 0.4037739058706094, + "grad_norm": 0.8359375, + "learning_rate": 0.00017229054753957837, + "loss": 0.9377, + "step": 15725 + }, + { + "epoch": 0.40379958306653124, + "grad_norm": 0.765625, + "learning_rate": 0.00017228746294105037, + "loss": 0.9131, + "step": 15726 + }, + { + "epoch": 0.403825260262453, + "grad_norm": 0.74609375, + "learning_rate": 0.00017228437819846, + "loss": 0.9571, + "step": 15727 + }, + { + "epoch": 0.40385093745837486, + "grad_norm": 0.8359375, + "learning_rate": 0.00017228129331181338, + "loss": 1.0129, + "step": 15728 + }, + { + "epoch": 0.4038766146542967, + "grad_norm": 0.79296875, + "learning_rate": 0.00017227820828111668, + "loss": 0.9027, + "step": 15729 + }, + { + "epoch": 0.4039022918502185, + "grad_norm": 0.76953125, + "learning_rate": 0.00017227512310637603, + "loss": 0.9413, + "step": 15730 + }, + { + "epoch": 0.4039279690461403, + "grad_norm": 0.7421875, + "learning_rate": 0.00017227203778759763, + "loss": 0.9821, + "step": 15731 + }, + { + "epoch": 0.40395364624206215, + "grad_norm": 0.7109375, + "learning_rate": 0.00017226895232478755, + "loss": 0.8555, + "step": 15732 + }, + { + "epoch": 0.40397932343798393, + "grad_norm": 0.7421875, + "learning_rate": 0.00017226586671795197, + "loss": 0.9981, + "step": 15733 + }, + { + "epoch": 0.40400500063390576, + "grad_norm": 0.8203125, + "learning_rate": 0.00017226278096709705, + "loss": 0.9741, + "step": 15734 + }, + { + "epoch": 0.4040306778298276, + "grad_norm": 0.76953125, + "learning_rate": 0.00017225969507222896, + "loss": 0.9261, + "step": 15735 + }, + { + "epoch": 0.40405635502574944, + "grad_norm": 1.296875, + "learning_rate": 0.0001722566090333538, + "loss": 0.8266, + "step": 15736 + }, + { + "epoch": 0.4040820322216712, + "grad_norm": 0.80859375, + "learning_rate": 0.00017225352285047776, + "loss": 0.9208, + "step": 15737 + }, + { + "epoch": 0.40410770941759305, + "grad_norm": 0.734375, + "learning_rate": 0.00017225043652360696, + "loss": 0.9068, + "step": 15738 + }, + { + "epoch": 0.4041333866135149, + "grad_norm": 0.7578125, + "learning_rate": 0.00017224735005274758, + "loss": 0.9755, + "step": 15739 + }, + { + "epoch": 0.40415906380943667, + "grad_norm": 0.74609375, + "learning_rate": 0.00017224426343790574, + "loss": 0.8908, + "step": 15740 + }, + { + "epoch": 0.4041847410053585, + "grad_norm": 0.78125, + "learning_rate": 0.0001722411766790876, + "loss": 0.905, + "step": 15741 + }, + { + "epoch": 0.40421041820128034, + "grad_norm": 0.6875, + "learning_rate": 0.00017223808977629936, + "loss": 0.8528, + "step": 15742 + }, + { + "epoch": 0.4042360953972021, + "grad_norm": 0.78515625, + "learning_rate": 0.0001722350027295471, + "loss": 0.8118, + "step": 15743 + }, + { + "epoch": 0.40426177259312396, + "grad_norm": 0.78125, + "learning_rate": 0.00017223191553883703, + "loss": 0.8353, + "step": 15744 + }, + { + "epoch": 0.4042874497890458, + "grad_norm": 0.76171875, + "learning_rate": 0.00017222882820417526, + "loss": 0.9918, + "step": 15745 + }, + { + "epoch": 0.40431312698496763, + "grad_norm": 0.76953125, + "learning_rate": 0.00017222574072556796, + "loss": 0.9588, + "step": 15746 + }, + { + "epoch": 0.4043388041808894, + "grad_norm": 0.85546875, + "learning_rate": 0.0001722226531030213, + "loss": 0.9452, + "step": 15747 + }, + { + "epoch": 0.40436448137681125, + "grad_norm": 0.77734375, + "learning_rate": 0.0001722195653365414, + "loss": 0.9191, + "step": 15748 + }, + { + "epoch": 0.4043901585727331, + "grad_norm": 0.8359375, + "learning_rate": 0.00017221647742613443, + "loss": 0.8525, + "step": 15749 + }, + { + "epoch": 0.40441583576865486, + "grad_norm": 0.8125, + "learning_rate": 0.00017221338937180653, + "loss": 0.9428, + "step": 15750 + }, + { + "epoch": 0.4044415129645767, + "grad_norm": 0.80078125, + "learning_rate": 0.0001722103011735639, + "loss": 0.9126, + "step": 15751 + }, + { + "epoch": 0.40446719016049854, + "grad_norm": 0.765625, + "learning_rate": 0.00017220721283141266, + "loss": 0.9734, + "step": 15752 + }, + { + "epoch": 0.4044928673564203, + "grad_norm": 0.73046875, + "learning_rate": 0.00017220412434535893, + "loss": 0.9365, + "step": 15753 + }, + { + "epoch": 0.40451854455234215, + "grad_norm": 0.7578125, + "learning_rate": 0.00017220103571540896, + "loss": 1.0596, + "step": 15754 + }, + { + "epoch": 0.404544221748264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001721979469415688, + "loss": 0.9646, + "step": 15755 + }, + { + "epoch": 0.4045698989441858, + "grad_norm": 0.91015625, + "learning_rate": 0.00017219485802384465, + "loss": 0.9742, + "step": 15756 + }, + { + "epoch": 0.4045955761401076, + "grad_norm": 0.80078125, + "learning_rate": 0.0001721917689622427, + "loss": 0.9418, + "step": 15757 + }, + { + "epoch": 0.40462125333602944, + "grad_norm": 0.72265625, + "learning_rate": 0.00017218867975676906, + "loss": 0.9088, + "step": 15758 + }, + { + "epoch": 0.4046469305319513, + "grad_norm": 0.82421875, + "learning_rate": 0.00017218559040742991, + "loss": 0.9715, + "step": 15759 + }, + { + "epoch": 0.40467260772787306, + "grad_norm": 0.76953125, + "learning_rate": 0.0001721825009142314, + "loss": 0.9216, + "step": 15760 + }, + { + "epoch": 0.4046982849237949, + "grad_norm": 0.78515625, + "learning_rate": 0.00017217941127717966, + "loss": 0.9894, + "step": 15761 + }, + { + "epoch": 0.40472396211971673, + "grad_norm": 0.765625, + "learning_rate": 0.00017217632149628092, + "loss": 0.9293, + "step": 15762 + }, + { + "epoch": 0.4047496393156385, + "grad_norm": 0.796875, + "learning_rate": 0.00017217323157154123, + "loss": 0.9103, + "step": 15763 + }, + { + "epoch": 0.40477531651156035, + "grad_norm": 0.76171875, + "learning_rate": 0.00017217014150296687, + "loss": 0.9318, + "step": 15764 + }, + { + "epoch": 0.4048009937074822, + "grad_norm": 0.80078125, + "learning_rate": 0.0001721670512905639, + "loss": 0.8332, + "step": 15765 + }, + { + "epoch": 0.404826670903404, + "grad_norm": 0.76171875, + "learning_rate": 0.00017216396093433853, + "loss": 0.9376, + "step": 15766 + }, + { + "epoch": 0.4048523480993258, + "grad_norm": 0.734375, + "learning_rate": 0.00017216087043429688, + "loss": 0.8988, + "step": 15767 + }, + { + "epoch": 0.40487802529524763, + "grad_norm": 0.72265625, + "learning_rate": 0.00017215777979044517, + "loss": 0.8532, + "step": 15768 + }, + { + "epoch": 0.40490370249116947, + "grad_norm": 0.78125, + "learning_rate": 0.0001721546890027895, + "loss": 0.875, + "step": 15769 + }, + { + "epoch": 0.40492937968709125, + "grad_norm": 0.75390625, + "learning_rate": 0.00017215159807133605, + "loss": 0.902, + "step": 15770 + }, + { + "epoch": 0.4049550568830131, + "grad_norm": 0.82421875, + "learning_rate": 0.00017214850699609103, + "loss": 1.0208, + "step": 15771 + }, + { + "epoch": 0.4049807340789349, + "grad_norm": 0.77734375, + "learning_rate": 0.0001721454157770605, + "loss": 0.9756, + "step": 15772 + }, + { + "epoch": 0.4050064112748567, + "grad_norm": 0.7421875, + "learning_rate": 0.0001721423244142507, + "loss": 0.9599, + "step": 15773 + }, + { + "epoch": 0.40503208847077854, + "grad_norm": 0.7578125, + "learning_rate": 0.00017213923290766773, + "loss": 1.0642, + "step": 15774 + }, + { + "epoch": 0.4050577656667004, + "grad_norm": 0.796875, + "learning_rate": 0.0001721361412573178, + "loss": 0.9136, + "step": 15775 + }, + { + "epoch": 0.4050834428626222, + "grad_norm": 0.76171875, + "learning_rate": 0.00017213304946320706, + "loss": 0.7992, + "step": 15776 + }, + { + "epoch": 0.405109120058544, + "grad_norm": 0.75390625, + "learning_rate": 0.00017212995752534165, + "loss": 0.8983, + "step": 15777 + }, + { + "epoch": 0.40513479725446583, + "grad_norm": 0.859375, + "learning_rate": 0.00017212686544372782, + "loss": 1.0778, + "step": 15778 + }, + { + "epoch": 0.40516047445038766, + "grad_norm": 0.85546875, + "learning_rate": 0.0001721237732183716, + "loss": 0.9907, + "step": 15779 + }, + { + "epoch": 0.40518615164630944, + "grad_norm": 0.8359375, + "learning_rate": 0.00017212068084927922, + "loss": 0.9907, + "step": 15780 + }, + { + "epoch": 0.4052118288422313, + "grad_norm": 0.765625, + "learning_rate": 0.00017211758833645684, + "loss": 0.9038, + "step": 15781 + }, + { + "epoch": 0.4052375060381531, + "grad_norm": 0.89453125, + "learning_rate": 0.00017211449567991065, + "loss": 0.9566, + "step": 15782 + }, + { + "epoch": 0.4052631832340749, + "grad_norm": 0.72265625, + "learning_rate": 0.00017211140287964676, + "loss": 0.8946, + "step": 15783 + }, + { + "epoch": 0.40528886042999673, + "grad_norm": 0.8046875, + "learning_rate": 0.00017210830993567134, + "loss": 0.8855, + "step": 15784 + }, + { + "epoch": 0.40531453762591857, + "grad_norm": 0.83203125, + "learning_rate": 0.00017210521684799058, + "loss": 0.8676, + "step": 15785 + }, + { + "epoch": 0.4053402148218404, + "grad_norm": 0.7890625, + "learning_rate": 0.00017210212361661067, + "loss": 0.9823, + "step": 15786 + }, + { + "epoch": 0.4053658920177622, + "grad_norm": 0.8359375, + "learning_rate": 0.0001720990302415377, + "loss": 0.967, + "step": 15787 + }, + { + "epoch": 0.405391569213684, + "grad_norm": 0.79296875, + "learning_rate": 0.00017209593672277792, + "loss": 1.0506, + "step": 15788 + }, + { + "epoch": 0.40541724640960586, + "grad_norm": 0.81640625, + "learning_rate": 0.0001720928430603374, + "loss": 0.9466, + "step": 15789 + }, + { + "epoch": 0.40544292360552764, + "grad_norm": 0.76953125, + "learning_rate": 0.00017208974925422238, + "loss": 0.9758, + "step": 15790 + }, + { + "epoch": 0.4054686008014495, + "grad_norm": 0.76171875, + "learning_rate": 0.000172086655304439, + "loss": 0.7924, + "step": 15791 + }, + { + "epoch": 0.4054942779973713, + "grad_norm": 0.73828125, + "learning_rate": 0.00017208356121099343, + "loss": 0.8854, + "step": 15792 + }, + { + "epoch": 0.4055199551932931, + "grad_norm": 0.76171875, + "learning_rate": 0.00017208046697389184, + "loss": 0.875, + "step": 15793 + }, + { + "epoch": 0.4055456323892149, + "grad_norm": 0.76953125, + "learning_rate": 0.0001720773725931404, + "loss": 1.0175, + "step": 15794 + }, + { + "epoch": 0.40557130958513676, + "grad_norm": 0.78125, + "learning_rate": 0.00017207427806874523, + "loss": 0.8875, + "step": 15795 + }, + { + "epoch": 0.4055969867810586, + "grad_norm": 0.76953125, + "learning_rate": 0.00017207118340071254, + "loss": 0.9576, + "step": 15796 + }, + { + "epoch": 0.4056226639769804, + "grad_norm": 0.84375, + "learning_rate": 0.0001720680885890485, + "loss": 0.8952, + "step": 15797 + }, + { + "epoch": 0.4056483411729022, + "grad_norm": 0.74609375, + "learning_rate": 0.00017206499363375927, + "loss": 0.8454, + "step": 15798 + }, + { + "epoch": 0.40567401836882405, + "grad_norm": 0.796875, + "learning_rate": 0.00017206189853485103, + "loss": 1.0158, + "step": 15799 + }, + { + "epoch": 0.40569969556474583, + "grad_norm": 0.75390625, + "learning_rate": 0.00017205880329232992, + "loss": 0.8899, + "step": 15800 + }, + { + "epoch": 0.40572537276066767, + "grad_norm": 0.7265625, + "learning_rate": 0.00017205570790620214, + "loss": 1.0885, + "step": 15801 + }, + { + "epoch": 0.4057510499565895, + "grad_norm": 0.75390625, + "learning_rate": 0.00017205261237647384, + "loss": 0.882, + "step": 15802 + }, + { + "epoch": 0.4057767271525113, + "grad_norm": 0.72265625, + "learning_rate": 0.0001720495167031512, + "loss": 0.8785, + "step": 15803 + }, + { + "epoch": 0.4058024043484331, + "grad_norm": 0.8359375, + "learning_rate": 0.00017204642088624036, + "loss": 0.9354, + "step": 15804 + }, + { + "epoch": 0.40582808154435496, + "grad_norm": 0.7265625, + "learning_rate": 0.0001720433249257475, + "loss": 0.8854, + "step": 15805 + }, + { + "epoch": 0.4058537587402768, + "grad_norm": 0.82421875, + "learning_rate": 0.00017204022882167882, + "loss": 1.118, + "step": 15806 + }, + { + "epoch": 0.4058794359361986, + "grad_norm": 0.7734375, + "learning_rate": 0.00017203713257404047, + "loss": 0.9112, + "step": 15807 + }, + { + "epoch": 0.4059051131321204, + "grad_norm": 0.8671875, + "learning_rate": 0.00017203403618283865, + "loss": 1.0127, + "step": 15808 + }, + { + "epoch": 0.40593079032804225, + "grad_norm": 0.75, + "learning_rate": 0.00017203093964807947, + "loss": 0.941, + "step": 15809 + }, + { + "epoch": 0.405956467523964, + "grad_norm": 0.828125, + "learning_rate": 0.00017202784296976913, + "loss": 0.9779, + "step": 15810 + }, + { + "epoch": 0.40598214471988586, + "grad_norm": 0.703125, + "learning_rate": 0.0001720247461479138, + "loss": 0.9295, + "step": 15811 + }, + { + "epoch": 0.4060078219158077, + "grad_norm": 0.7734375, + "learning_rate": 0.00017202164918251968, + "loss": 0.9041, + "step": 15812 + }, + { + "epoch": 0.4060334991117295, + "grad_norm": 0.8203125, + "learning_rate": 0.00017201855207359291, + "loss": 0.9254, + "step": 15813 + }, + { + "epoch": 0.4060591763076513, + "grad_norm": 0.74609375, + "learning_rate": 0.00017201545482113968, + "loss": 0.862, + "step": 15814 + }, + { + "epoch": 0.40608485350357315, + "grad_norm": 0.8515625, + "learning_rate": 0.00017201235742516616, + "loss": 0.9454, + "step": 15815 + }, + { + "epoch": 0.406110530699495, + "grad_norm": 0.83203125, + "learning_rate": 0.00017200925988567847, + "loss": 0.933, + "step": 15816 + }, + { + "epoch": 0.40613620789541677, + "grad_norm": 0.7890625, + "learning_rate": 0.0001720061622026829, + "loss": 0.8295, + "step": 15817 + }, + { + "epoch": 0.4061618850913386, + "grad_norm": 0.8203125, + "learning_rate": 0.0001720030643761855, + "loss": 0.8936, + "step": 15818 + }, + { + "epoch": 0.40618756228726044, + "grad_norm": 0.81640625, + "learning_rate": 0.0001719999664061925, + "loss": 0.9443, + "step": 15819 + }, + { + "epoch": 0.4062132394831822, + "grad_norm": 0.734375, + "learning_rate": 0.00017199686829271008, + "loss": 0.8709, + "step": 15820 + }, + { + "epoch": 0.40623891667910406, + "grad_norm": 0.83984375, + "learning_rate": 0.00017199377003574443, + "loss": 0.9583, + "step": 15821 + }, + { + "epoch": 0.4062645938750259, + "grad_norm": 0.77734375, + "learning_rate": 0.00017199067163530164, + "loss": 0.9069, + "step": 15822 + }, + { + "epoch": 0.4062902710709477, + "grad_norm": 0.7578125, + "learning_rate": 0.000171987573091388, + "loss": 0.9426, + "step": 15823 + }, + { + "epoch": 0.4063159482668695, + "grad_norm": 0.80078125, + "learning_rate": 0.00017198447440400962, + "loss": 1.1052, + "step": 15824 + }, + { + "epoch": 0.40634162546279134, + "grad_norm": 0.7578125, + "learning_rate": 0.00017198137557317267, + "loss": 0.9408, + "step": 15825 + }, + { + "epoch": 0.4063673026587132, + "grad_norm": 0.7421875, + "learning_rate": 0.00017197827659888332, + "loss": 0.9551, + "step": 15826 + }, + { + "epoch": 0.40639297985463496, + "grad_norm": 0.828125, + "learning_rate": 0.00017197517748114783, + "loss": 1.0029, + "step": 15827 + }, + { + "epoch": 0.4064186570505568, + "grad_norm": 0.71875, + "learning_rate": 0.00017197207821997225, + "loss": 0.9832, + "step": 15828 + }, + { + "epoch": 0.40644433424647863, + "grad_norm": 0.80859375, + "learning_rate": 0.00017196897881536285, + "loss": 0.8443, + "step": 15829 + }, + { + "epoch": 0.4064700114424004, + "grad_norm": 0.81640625, + "learning_rate": 0.00017196587926732578, + "loss": 0.967, + "step": 15830 + }, + { + "epoch": 0.40649568863832225, + "grad_norm": 0.91796875, + "learning_rate": 0.0001719627795758672, + "loss": 0.9945, + "step": 15831 + }, + { + "epoch": 0.4065213658342441, + "grad_norm": 0.7578125, + "learning_rate": 0.0001719596797409933, + "loss": 0.8873, + "step": 15832 + }, + { + "epoch": 0.40654704303016587, + "grad_norm": 0.77734375, + "learning_rate": 0.00017195657976271024, + "loss": 0.9542, + "step": 15833 + }, + { + "epoch": 0.4065727202260877, + "grad_norm": 0.69140625, + "learning_rate": 0.00017195347964102427, + "loss": 0.8506, + "step": 15834 + }, + { + "epoch": 0.40659839742200954, + "grad_norm": 0.796875, + "learning_rate": 0.00017195037937594147, + "loss": 0.9113, + "step": 15835 + }, + { + "epoch": 0.4066240746179314, + "grad_norm": 0.73046875, + "learning_rate": 0.0001719472789674681, + "loss": 0.9697, + "step": 15836 + }, + { + "epoch": 0.40664975181385316, + "grad_norm": 0.75, + "learning_rate": 0.00017194417841561027, + "loss": 0.8806, + "step": 15837 + }, + { + "epoch": 0.406675429009775, + "grad_norm": 0.78125, + "learning_rate": 0.0001719410777203742, + "loss": 0.9362, + "step": 15838 + }, + { + "epoch": 0.4067011062056968, + "grad_norm": 0.7734375, + "learning_rate": 0.00017193797688176606, + "loss": 0.8286, + "step": 15839 + }, + { + "epoch": 0.4067267834016186, + "grad_norm": 0.828125, + "learning_rate": 0.000171934875899792, + "loss": 0.981, + "step": 15840 + }, + { + "epoch": 0.40675246059754044, + "grad_norm": 0.796875, + "learning_rate": 0.0001719317747744583, + "loss": 0.8758, + "step": 15841 + }, + { + "epoch": 0.4067781377934623, + "grad_norm": 0.82421875, + "learning_rate": 0.00017192867350577102, + "loss": 1.0148, + "step": 15842 + }, + { + "epoch": 0.40680381498938406, + "grad_norm": 0.8359375, + "learning_rate": 0.00017192557209373642, + "loss": 0.8627, + "step": 15843 + }, + { + "epoch": 0.4068294921853059, + "grad_norm": 0.8359375, + "learning_rate": 0.0001719224705383606, + "loss": 1.0877, + "step": 15844 + }, + { + "epoch": 0.40685516938122773, + "grad_norm": 0.8203125, + "learning_rate": 0.00017191936883964983, + "loss": 0.8963, + "step": 15845 + }, + { + "epoch": 0.40688084657714957, + "grad_norm": 0.8046875, + "learning_rate": 0.00017191626699761026, + "loss": 0.9137, + "step": 15846 + }, + { + "epoch": 0.40690652377307135, + "grad_norm": 0.70703125, + "learning_rate": 0.00017191316501224805, + "loss": 0.9078, + "step": 15847 + }, + { + "epoch": 0.4069322009689932, + "grad_norm": 0.79296875, + "learning_rate": 0.00017191006288356942, + "loss": 0.9435, + "step": 15848 + }, + { + "epoch": 0.406957878164915, + "grad_norm": 0.98046875, + "learning_rate": 0.0001719069606115805, + "loss": 0.8673, + "step": 15849 + }, + { + "epoch": 0.4069835553608368, + "grad_norm": 0.7421875, + "learning_rate": 0.00017190385819628752, + "loss": 0.8982, + "step": 15850 + }, + { + "epoch": 0.40700923255675864, + "grad_norm": 0.83203125, + "learning_rate": 0.00017190075563769662, + "loss": 1.0634, + "step": 15851 + }, + { + "epoch": 0.4070349097526805, + "grad_norm": 0.75, + "learning_rate": 0.00017189765293581407, + "loss": 0.7958, + "step": 15852 + }, + { + "epoch": 0.40706058694860225, + "grad_norm": 0.7734375, + "learning_rate": 0.00017189455009064596, + "loss": 0.9332, + "step": 15853 + }, + { + "epoch": 0.4070862641445241, + "grad_norm": 0.80078125, + "learning_rate": 0.0001718914471021985, + "loss": 0.9049, + "step": 15854 + }, + { + "epoch": 0.4071119413404459, + "grad_norm": 0.73046875, + "learning_rate": 0.00017188834397047787, + "loss": 0.881, + "step": 15855 + }, + { + "epoch": 0.40713761853636776, + "grad_norm": 0.78125, + "learning_rate": 0.00017188524069549027, + "loss": 0.9077, + "step": 15856 + }, + { + "epoch": 0.40716329573228954, + "grad_norm": 0.6875, + "learning_rate": 0.0001718821372772419, + "loss": 0.7138, + "step": 15857 + }, + { + "epoch": 0.4071889729282114, + "grad_norm": 0.7890625, + "learning_rate": 0.0001718790337157389, + "loss": 0.9776, + "step": 15858 + }, + { + "epoch": 0.4072146501241332, + "grad_norm": 0.78515625, + "learning_rate": 0.00017187593001098748, + "loss": 0.9251, + "step": 15859 + }, + { + "epoch": 0.407240327320055, + "grad_norm": 0.859375, + "learning_rate": 0.00017187282616299384, + "loss": 0.8868, + "step": 15860 + }, + { + "epoch": 0.40726600451597683, + "grad_norm": 0.71484375, + "learning_rate": 0.00017186972217176413, + "loss": 0.8405, + "step": 15861 + }, + { + "epoch": 0.40729168171189867, + "grad_norm": 0.83984375, + "learning_rate": 0.00017186661803730456, + "loss": 0.917, + "step": 15862 + }, + { + "epoch": 0.40731735890782045, + "grad_norm": 0.86328125, + "learning_rate": 0.00017186351375962132, + "loss": 0.8573, + "step": 15863 + }, + { + "epoch": 0.4073430361037423, + "grad_norm": 0.80078125, + "learning_rate": 0.00017186040933872058, + "loss": 0.8753, + "step": 15864 + }, + { + "epoch": 0.4073687132996641, + "grad_norm": 0.8046875, + "learning_rate": 0.00017185730477460855, + "loss": 0.8991, + "step": 15865 + }, + { + "epoch": 0.40739439049558596, + "grad_norm": 0.7421875, + "learning_rate": 0.00017185420006729139, + "loss": 0.9057, + "step": 15866 + }, + { + "epoch": 0.40742006769150774, + "grad_norm": 0.7890625, + "learning_rate": 0.0001718510952167753, + "loss": 0.7857, + "step": 15867 + }, + { + "epoch": 0.4074457448874296, + "grad_norm": 0.8046875, + "learning_rate": 0.00017184799022306648, + "loss": 0.936, + "step": 15868 + }, + { + "epoch": 0.4074714220833514, + "grad_norm": 0.88671875, + "learning_rate": 0.0001718448850861711, + "loss": 0.912, + "step": 15869 + }, + { + "epoch": 0.4074970992792732, + "grad_norm": 0.7890625, + "learning_rate": 0.00017184177980609534, + "loss": 0.9067, + "step": 15870 + }, + { + "epoch": 0.407522776475195, + "grad_norm": 0.7890625, + "learning_rate": 0.00017183867438284541, + "loss": 0.7752, + "step": 15871 + }, + { + "epoch": 0.40754845367111686, + "grad_norm": 0.83203125, + "learning_rate": 0.00017183556881642748, + "loss": 0.907, + "step": 15872 + }, + { + "epoch": 0.40757413086703864, + "grad_norm": 0.80078125, + "learning_rate": 0.0001718324631068478, + "loss": 0.8925, + "step": 15873 + }, + { + "epoch": 0.4075998080629605, + "grad_norm": 0.75, + "learning_rate": 0.00017182935725411244, + "loss": 0.9471, + "step": 15874 + }, + { + "epoch": 0.4076254852588823, + "grad_norm": 0.8359375, + "learning_rate": 0.0001718262512582277, + "loss": 0.9788, + "step": 15875 + }, + { + "epoch": 0.40765116245480415, + "grad_norm": 0.76953125, + "learning_rate": 0.00017182314511919974, + "loss": 1.0045, + "step": 15876 + }, + { + "epoch": 0.40767683965072593, + "grad_norm": 0.875, + "learning_rate": 0.0001718200388370347, + "loss": 0.9555, + "step": 15877 + }, + { + "epoch": 0.40770251684664777, + "grad_norm": 0.77734375, + "learning_rate": 0.0001718169324117388, + "loss": 0.8586, + "step": 15878 + }, + { + "epoch": 0.4077281940425696, + "grad_norm": 0.69140625, + "learning_rate": 0.0001718138258433183, + "loss": 0.847, + "step": 15879 + }, + { + "epoch": 0.4077538712384914, + "grad_norm": 0.82421875, + "learning_rate": 0.0001718107191317793, + "loss": 0.9472, + "step": 15880 + }, + { + "epoch": 0.4077795484344132, + "grad_norm": 0.76171875, + "learning_rate": 0.00017180761227712804, + "loss": 0.9156, + "step": 15881 + }, + { + "epoch": 0.40780522563033506, + "grad_norm": 0.72265625, + "learning_rate": 0.00017180450527937068, + "loss": 0.8057, + "step": 15882 + }, + { + "epoch": 0.40783090282625684, + "grad_norm": 0.9765625, + "learning_rate": 0.00017180139813851344, + "loss": 1.0031, + "step": 15883 + }, + { + "epoch": 0.40785658002217867, + "grad_norm": 0.83203125, + "learning_rate": 0.00017179829085456246, + "loss": 0.8952, + "step": 15884 + }, + { + "epoch": 0.4078822572181005, + "grad_norm": 0.92578125, + "learning_rate": 0.00017179518342752404, + "loss": 0.8712, + "step": 15885 + }, + { + "epoch": 0.40790793441402234, + "grad_norm": 0.73046875, + "learning_rate": 0.00017179207585740425, + "loss": 0.8456, + "step": 15886 + }, + { + "epoch": 0.4079336116099441, + "grad_norm": 0.7734375, + "learning_rate": 0.00017178896814420936, + "loss": 0.9512, + "step": 15887 + }, + { + "epoch": 0.40795928880586596, + "grad_norm": 0.76953125, + "learning_rate": 0.00017178586028794552, + "loss": 0.891, + "step": 15888 + }, + { + "epoch": 0.4079849660017878, + "grad_norm": 0.8203125, + "learning_rate": 0.00017178275228861898, + "loss": 0.8961, + "step": 15889 + }, + { + "epoch": 0.4080106431977096, + "grad_norm": 0.765625, + "learning_rate": 0.00017177964414623586, + "loss": 1.0799, + "step": 15890 + }, + { + "epoch": 0.4080363203936314, + "grad_norm": 0.77734375, + "learning_rate": 0.00017177653586080243, + "loss": 1.0366, + "step": 15891 + }, + { + "epoch": 0.40806199758955325, + "grad_norm": 0.79296875, + "learning_rate": 0.00017177342743232481, + "loss": 0.9678, + "step": 15892 + }, + { + "epoch": 0.40808767478547503, + "grad_norm": 0.7265625, + "learning_rate": 0.00017177031886080925, + "loss": 0.8674, + "step": 15893 + }, + { + "epoch": 0.40811335198139687, + "grad_norm": 0.72265625, + "learning_rate": 0.0001717672101462619, + "loss": 0.9833, + "step": 15894 + }, + { + "epoch": 0.4081390291773187, + "grad_norm": 0.703125, + "learning_rate": 0.00017176410128868903, + "loss": 0.8281, + "step": 15895 + }, + { + "epoch": 0.40816470637324054, + "grad_norm": 0.703125, + "learning_rate": 0.0001717609922880968, + "loss": 0.8543, + "step": 15896 + }, + { + "epoch": 0.4081903835691623, + "grad_norm": 0.80078125, + "learning_rate": 0.00017175788314449135, + "loss": 0.8541, + "step": 15897 + }, + { + "epoch": 0.40821606076508415, + "grad_norm": 0.78515625, + "learning_rate": 0.00017175477385787894, + "loss": 0.9245, + "step": 15898 + }, + { + "epoch": 0.408241737961006, + "grad_norm": 0.79296875, + "learning_rate": 0.00017175166442826573, + "loss": 0.8987, + "step": 15899 + }, + { + "epoch": 0.40826741515692777, + "grad_norm": 0.8125, + "learning_rate": 0.00017174855485565794, + "loss": 0.9413, + "step": 15900 + }, + { + "epoch": 0.4082930923528496, + "grad_norm": 0.78515625, + "learning_rate": 0.00017174544514006178, + "loss": 1.0069, + "step": 15901 + }, + { + "epoch": 0.40831876954877144, + "grad_norm": 0.77734375, + "learning_rate": 0.00017174233528148341, + "loss": 1.048, + "step": 15902 + }, + { + "epoch": 0.4083444467446932, + "grad_norm": 0.7109375, + "learning_rate": 0.00017173922527992906, + "loss": 0.9029, + "step": 15903 + }, + { + "epoch": 0.40837012394061506, + "grad_norm": 0.79296875, + "learning_rate": 0.0001717361151354049, + "loss": 0.8894, + "step": 15904 + }, + { + "epoch": 0.4083958011365369, + "grad_norm": 0.81640625, + "learning_rate": 0.00017173300484791716, + "loss": 0.8021, + "step": 15905 + }, + { + "epoch": 0.4084214783324587, + "grad_norm": 0.7578125, + "learning_rate": 0.000171729894417472, + "loss": 0.9386, + "step": 15906 + }, + { + "epoch": 0.4084471555283805, + "grad_norm": 0.78515625, + "learning_rate": 0.00017172678384407567, + "loss": 0.9734, + "step": 15907 + }, + { + "epoch": 0.40847283272430235, + "grad_norm": 0.75390625, + "learning_rate": 0.0001717236731277343, + "loss": 0.877, + "step": 15908 + }, + { + "epoch": 0.4084985099202242, + "grad_norm": 0.76953125, + "learning_rate": 0.00017172056226845414, + "loss": 1.0307, + "step": 15909 + }, + { + "epoch": 0.40852418711614596, + "grad_norm": 0.78125, + "learning_rate": 0.00017171745126624138, + "loss": 0.8003, + "step": 15910 + }, + { + "epoch": 0.4085498643120678, + "grad_norm": 0.8359375, + "learning_rate": 0.00017171434012110224, + "loss": 1.0392, + "step": 15911 + }, + { + "epoch": 0.40857554150798964, + "grad_norm": 0.73828125, + "learning_rate": 0.00017171122883304286, + "loss": 0.8729, + "step": 15912 + }, + { + "epoch": 0.4086012187039114, + "grad_norm": 0.74609375, + "learning_rate": 0.0001717081174020695, + "loss": 0.9868, + "step": 15913 + }, + { + "epoch": 0.40862689589983325, + "grad_norm": 0.703125, + "learning_rate": 0.00017170500582818832, + "loss": 0.8525, + "step": 15914 + }, + { + "epoch": 0.4086525730957551, + "grad_norm": 0.765625, + "learning_rate": 0.00017170189411140562, + "loss": 0.9961, + "step": 15915 + }, + { + "epoch": 0.40867825029167687, + "grad_norm": 0.765625, + "learning_rate": 0.00017169878225172745, + "loss": 0.9094, + "step": 15916 + }, + { + "epoch": 0.4087039274875987, + "grad_norm": 0.8828125, + "learning_rate": 0.00017169567024916008, + "loss": 0.9998, + "step": 15917 + }, + { + "epoch": 0.40872960468352054, + "grad_norm": 0.7890625, + "learning_rate": 0.0001716925581037097, + "loss": 0.8888, + "step": 15918 + }, + { + "epoch": 0.4087552818794424, + "grad_norm": 0.82421875, + "learning_rate": 0.0001716894458153826, + "loss": 0.8504, + "step": 15919 + }, + { + "epoch": 0.40878095907536416, + "grad_norm": 0.77734375, + "learning_rate": 0.00017168633338418484, + "loss": 0.9166, + "step": 15920 + }, + { + "epoch": 0.408806636271286, + "grad_norm": 0.80859375, + "learning_rate": 0.00017168322081012274, + "loss": 0.8538, + "step": 15921 + }, + { + "epoch": 0.40883231346720783, + "grad_norm": 0.82421875, + "learning_rate": 0.00017168010809320242, + "loss": 1.0419, + "step": 15922 + }, + { + "epoch": 0.4088579906631296, + "grad_norm": 0.90234375, + "learning_rate": 0.00017167699523343015, + "loss": 0.8829, + "step": 15923 + }, + { + "epoch": 0.40888366785905145, + "grad_norm": 0.7890625, + "learning_rate": 0.0001716738822308121, + "loss": 0.9302, + "step": 15924 + }, + { + "epoch": 0.4089093450549733, + "grad_norm": 0.69140625, + "learning_rate": 0.00017167076908535447, + "loss": 0.939, + "step": 15925 + }, + { + "epoch": 0.40893502225089506, + "grad_norm": 0.79296875, + "learning_rate": 0.00017166765579706346, + "loss": 0.9732, + "step": 15926 + }, + { + "epoch": 0.4089606994468169, + "grad_norm": 0.72265625, + "learning_rate": 0.00017166454236594526, + "loss": 0.9364, + "step": 15927 + }, + { + "epoch": 0.40898637664273874, + "grad_norm": 0.87890625, + "learning_rate": 0.00017166142879200614, + "loss": 0.8265, + "step": 15928 + }, + { + "epoch": 0.40901205383866057, + "grad_norm": 0.796875, + "learning_rate": 0.0001716583150752523, + "loss": 0.9478, + "step": 15929 + }, + { + "epoch": 0.40903773103458235, + "grad_norm": 0.73046875, + "learning_rate": 0.00017165520121568983, + "loss": 0.877, + "step": 15930 + }, + { + "epoch": 0.4090634082305042, + "grad_norm": 0.75390625, + "learning_rate": 0.00017165208721332505, + "loss": 0.9316, + "step": 15931 + }, + { + "epoch": 0.409089085426426, + "grad_norm": 0.83203125, + "learning_rate": 0.00017164897306816415, + "loss": 0.9395, + "step": 15932 + }, + { + "epoch": 0.4091147626223478, + "grad_norm": 0.7421875, + "learning_rate": 0.0001716458587802133, + "loss": 0.8071, + "step": 15933 + }, + { + "epoch": 0.40914043981826964, + "grad_norm": 0.71484375, + "learning_rate": 0.00017164274434947876, + "loss": 0.8557, + "step": 15934 + }, + { + "epoch": 0.4091661170141915, + "grad_norm": 0.8046875, + "learning_rate": 0.00017163962977596666, + "loss": 0.8933, + "step": 15935 + }, + { + "epoch": 0.40919179421011326, + "grad_norm": 0.83203125, + "learning_rate": 0.00017163651505968327, + "loss": 0.9573, + "step": 15936 + }, + { + "epoch": 0.4092174714060351, + "grad_norm": 0.765625, + "learning_rate": 0.00017163340020063479, + "loss": 1.0228, + "step": 15937 + }, + { + "epoch": 0.40924314860195693, + "grad_norm": 0.78515625, + "learning_rate": 0.00017163028519882737, + "loss": 0.8458, + "step": 15938 + }, + { + "epoch": 0.40926882579787877, + "grad_norm": 0.75390625, + "learning_rate": 0.0001716271700542673, + "loss": 0.9461, + "step": 15939 + }, + { + "epoch": 0.40929450299380055, + "grad_norm": 0.77734375, + "learning_rate": 0.00017162405476696074, + "loss": 0.831, + "step": 15940 + }, + { + "epoch": 0.4093201801897224, + "grad_norm": 0.80859375, + "learning_rate": 0.0001716209393369139, + "loss": 0.8939, + "step": 15941 + }, + { + "epoch": 0.4093458573856442, + "grad_norm": 0.8125, + "learning_rate": 0.000171617823764133, + "loss": 0.9252, + "step": 15942 + }, + { + "epoch": 0.409371534581566, + "grad_norm": 0.83203125, + "learning_rate": 0.00017161470804862426, + "loss": 0.9608, + "step": 15943 + }, + { + "epoch": 0.40939721177748783, + "grad_norm": 0.796875, + "learning_rate": 0.00017161159219039384, + "loss": 0.9336, + "step": 15944 + }, + { + "epoch": 0.40942288897340967, + "grad_norm": 0.74609375, + "learning_rate": 0.00017160847618944802, + "loss": 0.8631, + "step": 15945 + }, + { + "epoch": 0.40944856616933145, + "grad_norm": 0.8046875, + "learning_rate": 0.00017160536004579298, + "loss": 0.8029, + "step": 15946 + }, + { + "epoch": 0.4094742433652533, + "grad_norm": 0.70703125, + "learning_rate": 0.0001716022437594349, + "loss": 0.7539, + "step": 15947 + }, + { + "epoch": 0.4094999205611751, + "grad_norm": 0.734375, + "learning_rate": 0.00017159912733038003, + "loss": 0.9335, + "step": 15948 + }, + { + "epoch": 0.40952559775709696, + "grad_norm": 0.7421875, + "learning_rate": 0.00017159601075863456, + "loss": 0.901, + "step": 15949 + }, + { + "epoch": 0.40955127495301874, + "grad_norm": 1.1171875, + "learning_rate": 0.00017159289404420472, + "loss": 0.912, + "step": 15950 + }, + { + "epoch": 0.4095769521489406, + "grad_norm": 0.8203125, + "learning_rate": 0.00017158977718709667, + "loss": 0.8474, + "step": 15951 + }, + { + "epoch": 0.4096026293448624, + "grad_norm": 0.78515625, + "learning_rate": 0.0001715866601873167, + "loss": 1.0356, + "step": 15952 + }, + { + "epoch": 0.4096283065407842, + "grad_norm": 0.75, + "learning_rate": 0.00017158354304487098, + "loss": 0.8548, + "step": 15953 + }, + { + "epoch": 0.40965398373670603, + "grad_norm": 0.80078125, + "learning_rate": 0.00017158042575976575, + "loss": 0.9181, + "step": 15954 + }, + { + "epoch": 0.40967966093262786, + "grad_norm": 0.76171875, + "learning_rate": 0.00017157730833200713, + "loss": 0.9183, + "step": 15955 + }, + { + "epoch": 0.40970533812854965, + "grad_norm": 0.796875, + "learning_rate": 0.00017157419076160144, + "loss": 1.0153, + "step": 15956 + }, + { + "epoch": 0.4097310153244715, + "grad_norm": 0.78515625, + "learning_rate": 0.00017157107304855486, + "loss": 0.7823, + "step": 15957 + }, + { + "epoch": 0.4097566925203933, + "grad_norm": 0.77734375, + "learning_rate": 0.00017156795519287357, + "loss": 0.9495, + "step": 15958 + }, + { + "epoch": 0.40978236971631515, + "grad_norm": 0.8203125, + "learning_rate": 0.00017156483719456384, + "loss": 1.0514, + "step": 15959 + }, + { + "epoch": 0.40980804691223693, + "grad_norm": 0.80859375, + "learning_rate": 0.00017156171905363183, + "loss": 0.8749, + "step": 15960 + }, + { + "epoch": 0.40983372410815877, + "grad_norm": 0.83203125, + "learning_rate": 0.00017155860077008379, + "loss": 0.9693, + "step": 15961 + }, + { + "epoch": 0.4098594013040806, + "grad_norm": 0.8125, + "learning_rate": 0.00017155548234392594, + "loss": 1.1246, + "step": 15962 + }, + { + "epoch": 0.4098850785000024, + "grad_norm": 0.8671875, + "learning_rate": 0.00017155236377516445, + "loss": 0.8993, + "step": 15963 + }, + { + "epoch": 0.4099107556959242, + "grad_norm": 0.81640625, + "learning_rate": 0.00017154924506380557, + "loss": 0.8174, + "step": 15964 + }, + { + "epoch": 0.40993643289184606, + "grad_norm": 0.75, + "learning_rate": 0.00017154612620985551, + "loss": 0.9098, + "step": 15965 + }, + { + "epoch": 0.40996211008776784, + "grad_norm": 0.78125, + "learning_rate": 0.00017154300721332048, + "loss": 0.9332, + "step": 15966 + }, + { + "epoch": 0.4099877872836897, + "grad_norm": 0.8125, + "learning_rate": 0.0001715398880742067, + "loss": 0.9743, + "step": 15967 + }, + { + "epoch": 0.4100134644796115, + "grad_norm": 0.78125, + "learning_rate": 0.00017153676879252036, + "loss": 0.8469, + "step": 15968 + }, + { + "epoch": 0.41003914167553335, + "grad_norm": 0.7734375, + "learning_rate": 0.00017153364936826773, + "loss": 0.8401, + "step": 15969 + }, + { + "epoch": 0.41006481887145513, + "grad_norm": 0.8359375, + "learning_rate": 0.00017153052980145502, + "loss": 0.9248, + "step": 15970 + }, + { + "epoch": 0.41009049606737696, + "grad_norm": 0.79296875, + "learning_rate": 0.00017152741009208837, + "loss": 0.97, + "step": 15971 + }, + { + "epoch": 0.4101161732632988, + "grad_norm": 0.8046875, + "learning_rate": 0.0001715242902401741, + "loss": 1.0195, + "step": 15972 + }, + { + "epoch": 0.4101418504592206, + "grad_norm": 0.8125, + "learning_rate": 0.00017152117024571837, + "loss": 0.872, + "step": 15973 + }, + { + "epoch": 0.4101675276551424, + "grad_norm": 0.8125, + "learning_rate": 0.0001715180501087274, + "loss": 0.9162, + "step": 15974 + }, + { + "epoch": 0.41019320485106425, + "grad_norm": 0.8125, + "learning_rate": 0.00017151492982920742, + "loss": 0.8966, + "step": 15975 + }, + { + "epoch": 0.41021888204698603, + "grad_norm": 0.73828125, + "learning_rate": 0.00017151180940716464, + "loss": 0.9068, + "step": 15976 + }, + { + "epoch": 0.41024455924290787, + "grad_norm": 0.88671875, + "learning_rate": 0.00017150868884260528, + "loss": 0.9385, + "step": 15977 + }, + { + "epoch": 0.4102702364388297, + "grad_norm": 0.78125, + "learning_rate": 0.00017150556813553557, + "loss": 0.9048, + "step": 15978 + }, + { + "epoch": 0.41029591363475154, + "grad_norm": 0.80078125, + "learning_rate": 0.0001715024472859617, + "loss": 1.048, + "step": 15979 + }, + { + "epoch": 0.4103215908306733, + "grad_norm": 0.7734375, + "learning_rate": 0.00017149932629388994, + "loss": 0.9104, + "step": 15980 + }, + { + "epoch": 0.41034726802659516, + "grad_norm": 0.78515625, + "learning_rate": 0.00017149620515932647, + "loss": 0.9683, + "step": 15981 + }, + { + "epoch": 0.410372945222517, + "grad_norm": 0.76171875, + "learning_rate": 0.00017149308388227755, + "loss": 1.1276, + "step": 15982 + }, + { + "epoch": 0.4103986224184388, + "grad_norm": 0.77734375, + "learning_rate": 0.00017148996246274932, + "loss": 0.8297, + "step": 15983 + }, + { + "epoch": 0.4104242996143606, + "grad_norm": 0.859375, + "learning_rate": 0.00017148684090074809, + "loss": 1.089, + "step": 15984 + }, + { + "epoch": 0.41044997681028245, + "grad_norm": 0.796875, + "learning_rate": 0.00017148371919628006, + "loss": 0.8109, + "step": 15985 + }, + { + "epoch": 0.4104756540062042, + "grad_norm": 0.7890625, + "learning_rate": 0.0001714805973493514, + "loss": 1.0377, + "step": 15986 + }, + { + "epoch": 0.41050133120212606, + "grad_norm": 0.83203125, + "learning_rate": 0.00017147747535996833, + "loss": 0.8798, + "step": 15987 + }, + { + "epoch": 0.4105270083980479, + "grad_norm": 0.7421875, + "learning_rate": 0.0001714743532281372, + "loss": 0.8605, + "step": 15988 + }, + { + "epoch": 0.41055268559396974, + "grad_norm": 0.73046875, + "learning_rate": 0.00017147123095386405, + "loss": 0.9379, + "step": 15989 + }, + { + "epoch": 0.4105783627898915, + "grad_norm": 0.8359375, + "learning_rate": 0.00017146810853715526, + "loss": 0.7356, + "step": 15990 + }, + { + "epoch": 0.41060403998581335, + "grad_norm": 0.8125, + "learning_rate": 0.00017146498597801695, + "loss": 1.011, + "step": 15991 + }, + { + "epoch": 0.4106297171817352, + "grad_norm": 0.70703125, + "learning_rate": 0.0001714618632764554, + "loss": 0.9412, + "step": 15992 + }, + { + "epoch": 0.41065539437765697, + "grad_norm": 0.7421875, + "learning_rate": 0.0001714587404324768, + "loss": 0.9424, + "step": 15993 + }, + { + "epoch": 0.4106810715735788, + "grad_norm": 0.796875, + "learning_rate": 0.00017145561744608739, + "loss": 1.0277, + "step": 15994 + }, + { + "epoch": 0.41070674876950064, + "grad_norm": 0.7734375, + "learning_rate": 0.00017145249431729337, + "loss": 0.9218, + "step": 15995 + }, + { + "epoch": 0.4107324259654224, + "grad_norm": 0.8046875, + "learning_rate": 0.000171449371046101, + "loss": 0.973, + "step": 15996 + }, + { + "epoch": 0.41075810316134426, + "grad_norm": 0.80078125, + "learning_rate": 0.0001714462476325165, + "loss": 0.9412, + "step": 15997 + }, + { + "epoch": 0.4107837803572661, + "grad_norm": 0.85546875, + "learning_rate": 0.00017144312407654607, + "loss": 0.8902, + "step": 15998 + }, + { + "epoch": 0.41080945755318793, + "grad_norm": 0.84765625, + "learning_rate": 0.00017144000037819593, + "loss": 0.8214, + "step": 15999 + }, + { + "epoch": 0.4108351347491097, + "grad_norm": 0.796875, + "learning_rate": 0.00017143687653747231, + "loss": 0.9131, + "step": 16000 + }, + { + "epoch": 0.4108351347491097, + "eval_loss": 0.9210696220397949, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 407.7096, + "eval_samples_per_second": 24.527, + "eval_steps_per_second": 0.768, + "step": 16000 + }, + { + "epoch": 0.41086081194503155, + "grad_norm": 0.70703125, + "learning_rate": 0.00017143375255438147, + "loss": 0.818, + "step": 16001 + }, + { + "epoch": 0.4108864891409534, + "grad_norm": 0.84375, + "learning_rate": 0.00017143062842892964, + "loss": 1.0292, + "step": 16002 + }, + { + "epoch": 0.41091216633687516, + "grad_norm": 0.8125, + "learning_rate": 0.000171427504161123, + "loss": 0.9602, + "step": 16003 + }, + { + "epoch": 0.410937843532797, + "grad_norm": 0.71875, + "learning_rate": 0.00017142437975096778, + "loss": 0.9425, + "step": 16004 + }, + { + "epoch": 0.41096352072871883, + "grad_norm": 0.7890625, + "learning_rate": 0.00017142125519847026, + "loss": 0.9418, + "step": 16005 + }, + { + "epoch": 0.4109891979246406, + "grad_norm": 0.76171875, + "learning_rate": 0.0001714181305036366, + "loss": 0.8732, + "step": 16006 + }, + { + "epoch": 0.41101487512056245, + "grad_norm": 0.85546875, + "learning_rate": 0.00017141500566647303, + "loss": 0.9561, + "step": 16007 + }, + { + "epoch": 0.4110405523164843, + "grad_norm": 0.81640625, + "learning_rate": 0.00017141188068698585, + "loss": 0.9314, + "step": 16008 + }, + { + "epoch": 0.4110662295124061, + "grad_norm": 0.78125, + "learning_rate": 0.00017140875556518123, + "loss": 0.9766, + "step": 16009 + }, + { + "epoch": 0.4110919067083279, + "grad_norm": 0.77734375, + "learning_rate": 0.00017140563030106543, + "loss": 0.9489, + "step": 16010 + }, + { + "epoch": 0.41111758390424974, + "grad_norm": 0.74609375, + "learning_rate": 0.00017140250489464464, + "loss": 0.8757, + "step": 16011 + }, + { + "epoch": 0.4111432611001716, + "grad_norm": 0.90234375, + "learning_rate": 0.00017139937934592512, + "loss": 1.016, + "step": 16012 + }, + { + "epoch": 0.41116893829609336, + "grad_norm": 0.828125, + "learning_rate": 0.00017139625365491306, + "loss": 1.0772, + "step": 16013 + }, + { + "epoch": 0.4111946154920152, + "grad_norm": 0.78515625, + "learning_rate": 0.00017139312782161472, + "loss": 0.9634, + "step": 16014 + }, + { + "epoch": 0.41122029268793703, + "grad_norm": 0.79296875, + "learning_rate": 0.00017139000184603634, + "loss": 0.8679, + "step": 16015 + }, + { + "epoch": 0.4112459698838588, + "grad_norm": 0.71875, + "learning_rate": 0.00017138687572818413, + "loss": 0.8668, + "step": 16016 + }, + { + "epoch": 0.41127164707978064, + "grad_norm": 0.8203125, + "learning_rate": 0.00017138374946806432, + "loss": 0.8706, + "step": 16017 + }, + { + "epoch": 0.4112973242757025, + "grad_norm": 0.84375, + "learning_rate": 0.00017138062306568317, + "loss": 1.0397, + "step": 16018 + }, + { + "epoch": 0.4113230014716243, + "grad_norm": 0.703125, + "learning_rate": 0.00017137749652104683, + "loss": 0.8859, + "step": 16019 + }, + { + "epoch": 0.4113486786675461, + "grad_norm": 0.74609375, + "learning_rate": 0.00017137436983416163, + "loss": 0.9473, + "step": 16020 + }, + { + "epoch": 0.41137435586346793, + "grad_norm": 0.74609375, + "learning_rate": 0.0001713712430050338, + "loss": 0.8688, + "step": 16021 + }, + { + "epoch": 0.41140003305938977, + "grad_norm": 0.8515625, + "learning_rate": 0.00017136811603366944, + "loss": 0.9158, + "step": 16022 + }, + { + "epoch": 0.41142571025531155, + "grad_norm": 0.73046875, + "learning_rate": 0.00017136498892007492, + "loss": 0.8576, + "step": 16023 + }, + { + "epoch": 0.4114513874512334, + "grad_norm": 0.7734375, + "learning_rate": 0.00017136186166425641, + "loss": 0.9401, + "step": 16024 + }, + { + "epoch": 0.4114770646471552, + "grad_norm": 0.79296875, + "learning_rate": 0.00017135873426622018, + "loss": 0.9505, + "step": 16025 + }, + { + "epoch": 0.411502741843077, + "grad_norm": 0.75, + "learning_rate": 0.00017135560672597243, + "loss": 1.0871, + "step": 16026 + }, + { + "epoch": 0.41152841903899884, + "grad_norm": 0.72265625, + "learning_rate": 0.00017135247904351937, + "loss": 0.923, + "step": 16027 + }, + { + "epoch": 0.4115540962349207, + "grad_norm": 0.83984375, + "learning_rate": 0.00017134935121886728, + "loss": 0.783, + "step": 16028 + }, + { + "epoch": 0.4115797734308425, + "grad_norm": 1.125, + "learning_rate": 0.00017134622325202237, + "loss": 0.9258, + "step": 16029 + }, + { + "epoch": 0.4116054506267643, + "grad_norm": 0.87109375, + "learning_rate": 0.0001713430951429909, + "loss": 1.0261, + "step": 16030 + }, + { + "epoch": 0.4116311278226861, + "grad_norm": 0.76953125, + "learning_rate": 0.00017133996689177908, + "loss": 0.9681, + "step": 16031 + }, + { + "epoch": 0.41165680501860796, + "grad_norm": 0.7265625, + "learning_rate": 0.00017133683849839316, + "loss": 0.7858, + "step": 16032 + }, + { + "epoch": 0.41168248221452974, + "grad_norm": 0.828125, + "learning_rate": 0.00017133370996283938, + "loss": 0.8503, + "step": 16033 + }, + { + "epoch": 0.4117081594104516, + "grad_norm": 0.86328125, + "learning_rate": 0.00017133058128512393, + "loss": 0.9446, + "step": 16034 + }, + { + "epoch": 0.4117338366063734, + "grad_norm": 0.6796875, + "learning_rate": 0.00017132745246525307, + "loss": 0.8603, + "step": 16035 + }, + { + "epoch": 0.4117595138022952, + "grad_norm": 0.7734375, + "learning_rate": 0.00017132432350323303, + "loss": 0.9833, + "step": 16036 + }, + { + "epoch": 0.41178519099821703, + "grad_norm": 0.76171875, + "learning_rate": 0.0001713211943990701, + "loss": 0.8951, + "step": 16037 + }, + { + "epoch": 0.41181086819413887, + "grad_norm": 0.81640625, + "learning_rate": 0.00017131806515277044, + "loss": 0.8921, + "step": 16038 + }, + { + "epoch": 0.4118365453900607, + "grad_norm": 0.7421875, + "learning_rate": 0.00017131493576434033, + "loss": 0.9089, + "step": 16039 + }, + { + "epoch": 0.4118622225859825, + "grad_norm": 0.75, + "learning_rate": 0.000171311806233786, + "loss": 0.8762, + "step": 16040 + }, + { + "epoch": 0.4118878997819043, + "grad_norm": 0.74609375, + "learning_rate": 0.00017130867656111367, + "loss": 0.9352, + "step": 16041 + }, + { + "epoch": 0.41191357697782616, + "grad_norm": 0.83203125, + "learning_rate": 0.0001713055467463296, + "loss": 0.9976, + "step": 16042 + }, + { + "epoch": 0.41193925417374794, + "grad_norm": 0.71875, + "learning_rate": 0.00017130241678943998, + "loss": 0.9949, + "step": 16043 + }, + { + "epoch": 0.4119649313696698, + "grad_norm": 0.78515625, + "learning_rate": 0.0001712992866904511, + "loss": 0.9122, + "step": 16044 + }, + { + "epoch": 0.4119906085655916, + "grad_norm": 0.7734375, + "learning_rate": 0.00017129615644936923, + "loss": 0.8223, + "step": 16045 + }, + { + "epoch": 0.4120162857615134, + "grad_norm": 0.71875, + "learning_rate": 0.0001712930260662005, + "loss": 0.8629, + "step": 16046 + }, + { + "epoch": 0.4120419629574352, + "grad_norm": 0.77734375, + "learning_rate": 0.00017128989554095124, + "loss": 0.905, + "step": 16047 + }, + { + "epoch": 0.41206764015335706, + "grad_norm": 0.8046875, + "learning_rate": 0.00017128676487362764, + "loss": 0.9528, + "step": 16048 + }, + { + "epoch": 0.4120933173492789, + "grad_norm": 0.7734375, + "learning_rate": 0.000171283634064236, + "loss": 1.1207, + "step": 16049 + }, + { + "epoch": 0.4121189945452007, + "grad_norm": 0.73046875, + "learning_rate": 0.00017128050311278247, + "loss": 0.9347, + "step": 16050 + }, + { + "epoch": 0.4121446717411225, + "grad_norm": 0.7734375, + "learning_rate": 0.00017127737201927336, + "loss": 0.9449, + "step": 16051 + }, + { + "epoch": 0.41217034893704435, + "grad_norm": 0.78125, + "learning_rate": 0.00017127424078371487, + "loss": 0.9691, + "step": 16052 + }, + { + "epoch": 0.41219602613296613, + "grad_norm": 0.80078125, + "learning_rate": 0.00017127110940611329, + "loss": 0.855, + "step": 16053 + }, + { + "epoch": 0.41222170332888797, + "grad_norm": 0.81640625, + "learning_rate": 0.0001712679778864748, + "loss": 0.9485, + "step": 16054 + }, + { + "epoch": 0.4122473805248098, + "grad_norm": 0.8046875, + "learning_rate": 0.00017126484622480566, + "loss": 0.9446, + "step": 16055 + }, + { + "epoch": 0.4122730577207316, + "grad_norm": 0.7578125, + "learning_rate": 0.0001712617144211121, + "loss": 0.861, + "step": 16056 + }, + { + "epoch": 0.4122987349166534, + "grad_norm": 0.796875, + "learning_rate": 0.0001712585824754004, + "loss": 0.9294, + "step": 16057 + }, + { + "epoch": 0.41232441211257526, + "grad_norm": 0.8125, + "learning_rate": 0.00017125545038767682, + "loss": 0.7998, + "step": 16058 + }, + { + "epoch": 0.4123500893084971, + "grad_norm": 0.83203125, + "learning_rate": 0.00017125231815794752, + "loss": 1.0298, + "step": 16059 + }, + { + "epoch": 0.4123757665044189, + "grad_norm": 0.75390625, + "learning_rate": 0.0001712491857862188, + "loss": 0.8253, + "step": 16060 + }, + { + "epoch": 0.4124014437003407, + "grad_norm": 0.82421875, + "learning_rate": 0.00017124605327249688, + "loss": 0.8824, + "step": 16061 + }, + { + "epoch": 0.41242712089626254, + "grad_norm": 0.81640625, + "learning_rate": 0.00017124292061678804, + "loss": 1.0339, + "step": 16062 + }, + { + "epoch": 0.4124527980921843, + "grad_norm": 0.75, + "learning_rate": 0.00017123978781909846, + "loss": 0.8808, + "step": 16063 + }, + { + "epoch": 0.41247847528810616, + "grad_norm": 0.84765625, + "learning_rate": 0.00017123665487943444, + "loss": 1.0938, + "step": 16064 + }, + { + "epoch": 0.412504152484028, + "grad_norm": 0.8125, + "learning_rate": 0.0001712335217978022, + "loss": 0.9511, + "step": 16065 + }, + { + "epoch": 0.4125298296799498, + "grad_norm": 0.84765625, + "learning_rate": 0.00017123038857420797, + "loss": 0.9694, + "step": 16066 + }, + { + "epoch": 0.4125555068758716, + "grad_norm": 0.80859375, + "learning_rate": 0.000171227255208658, + "loss": 0.9009, + "step": 16067 + }, + { + "epoch": 0.41258118407179345, + "grad_norm": 0.75390625, + "learning_rate": 0.00017122412170115858, + "loss": 1.0711, + "step": 16068 + }, + { + "epoch": 0.4126068612677153, + "grad_norm": 0.80859375, + "learning_rate": 0.0001712209880517159, + "loss": 1.1284, + "step": 16069 + }, + { + "epoch": 0.41263253846363707, + "grad_norm": 0.765625, + "learning_rate": 0.0001712178542603362, + "loss": 1.0026, + "step": 16070 + }, + { + "epoch": 0.4126582156595589, + "grad_norm": 0.80859375, + "learning_rate": 0.0001712147203270258, + "loss": 1.0843, + "step": 16071 + }, + { + "epoch": 0.41268389285548074, + "grad_norm": 0.73828125, + "learning_rate": 0.00017121158625179084, + "loss": 0.78, + "step": 16072 + }, + { + "epoch": 0.4127095700514025, + "grad_norm": 0.7578125, + "learning_rate": 0.00017120845203463767, + "loss": 0.915, + "step": 16073 + }, + { + "epoch": 0.41273524724732435, + "grad_norm": 0.734375, + "learning_rate": 0.0001712053176755725, + "loss": 0.8991, + "step": 16074 + }, + { + "epoch": 0.4127609244432462, + "grad_norm": 0.75, + "learning_rate": 0.0001712021831746015, + "loss": 0.9795, + "step": 16075 + }, + { + "epoch": 0.41278660163916797, + "grad_norm": 0.8203125, + "learning_rate": 0.00017119904853173101, + "loss": 1.0876, + "step": 16076 + }, + { + "epoch": 0.4128122788350898, + "grad_norm": 0.77734375, + "learning_rate": 0.00017119591374696727, + "loss": 0.9821, + "step": 16077 + }, + { + "epoch": 0.41283795603101164, + "grad_norm": 0.8359375, + "learning_rate": 0.00017119277882031648, + "loss": 0.9794, + "step": 16078 + }, + { + "epoch": 0.4128636332269335, + "grad_norm": 0.79296875, + "learning_rate": 0.0001711896437517849, + "loss": 0.8193, + "step": 16079 + }, + { + "epoch": 0.41288931042285526, + "grad_norm": 0.78515625, + "learning_rate": 0.00017118650854137883, + "loss": 0.8596, + "step": 16080 + }, + { + "epoch": 0.4129149876187771, + "grad_norm": 0.69140625, + "learning_rate": 0.00017118337318910445, + "loss": 0.8578, + "step": 16081 + }, + { + "epoch": 0.41294066481469893, + "grad_norm": 0.91015625, + "learning_rate": 0.00017118023769496804, + "loss": 1.0478, + "step": 16082 + }, + { + "epoch": 0.4129663420106207, + "grad_norm": 0.74609375, + "learning_rate": 0.00017117710205897586, + "loss": 0.8784, + "step": 16083 + }, + { + "epoch": 0.41299201920654255, + "grad_norm": 0.75390625, + "learning_rate": 0.00017117396628113412, + "loss": 0.9363, + "step": 16084 + }, + { + "epoch": 0.4130176964024644, + "grad_norm": 0.7578125, + "learning_rate": 0.00017117083036144912, + "loss": 1.0668, + "step": 16085 + }, + { + "epoch": 0.41304337359838617, + "grad_norm": 0.7734375, + "learning_rate": 0.0001711676942999271, + "loss": 0.9981, + "step": 16086 + }, + { + "epoch": 0.413069050794308, + "grad_norm": 0.82421875, + "learning_rate": 0.00017116455809657426, + "loss": 1.0425, + "step": 16087 + }, + { + "epoch": 0.41309472799022984, + "grad_norm": 0.82421875, + "learning_rate": 0.00017116142175139688, + "loss": 1.0704, + "step": 16088 + }, + { + "epoch": 0.4131204051861517, + "grad_norm": 0.8671875, + "learning_rate": 0.00017115828526440124, + "loss": 1.0226, + "step": 16089 + }, + { + "epoch": 0.41314608238207345, + "grad_norm": 0.8203125, + "learning_rate": 0.00017115514863559354, + "loss": 0.8999, + "step": 16090 + }, + { + "epoch": 0.4131717595779953, + "grad_norm": 0.75, + "learning_rate": 0.00017115201186498007, + "loss": 0.9384, + "step": 16091 + }, + { + "epoch": 0.4131974367739171, + "grad_norm": 0.75390625, + "learning_rate": 0.00017114887495256706, + "loss": 1.0226, + "step": 16092 + }, + { + "epoch": 0.4132231139698389, + "grad_norm": 0.7578125, + "learning_rate": 0.00017114573789836078, + "loss": 1.0186, + "step": 16093 + }, + { + "epoch": 0.41324879116576074, + "grad_norm": 0.8515625, + "learning_rate": 0.00017114260070236746, + "loss": 0.8989, + "step": 16094 + }, + { + "epoch": 0.4132744683616826, + "grad_norm": 0.83984375, + "learning_rate": 0.00017113946336459336, + "loss": 0.8497, + "step": 16095 + }, + { + "epoch": 0.41330014555760436, + "grad_norm": 0.8515625, + "learning_rate": 0.00017113632588504473, + "loss": 0.8296, + "step": 16096 + }, + { + "epoch": 0.4133258227535262, + "grad_norm": 0.8203125, + "learning_rate": 0.00017113318826372787, + "loss": 0.835, + "step": 16097 + }, + { + "epoch": 0.41335149994944803, + "grad_norm": 0.796875, + "learning_rate": 0.00017113005050064896, + "loss": 0.917, + "step": 16098 + }, + { + "epoch": 0.41337717714536987, + "grad_norm": 0.79296875, + "learning_rate": 0.00017112691259581428, + "loss": 0.8861, + "step": 16099 + }, + { + "epoch": 0.41340285434129165, + "grad_norm": 0.8046875, + "learning_rate": 0.0001711237745492301, + "loss": 0.9019, + "step": 16100 + }, + { + "epoch": 0.4134285315372135, + "grad_norm": 0.79296875, + "learning_rate": 0.00017112063636090264, + "loss": 0.9977, + "step": 16101 + }, + { + "epoch": 0.4134542087331353, + "grad_norm": 0.74609375, + "learning_rate": 0.00017111749803083823, + "loss": 0.8398, + "step": 16102 + }, + { + "epoch": 0.4134798859290571, + "grad_norm": 0.7265625, + "learning_rate": 0.00017111435955904304, + "loss": 0.9342, + "step": 16103 + }, + { + "epoch": 0.41350556312497894, + "grad_norm": 0.6953125, + "learning_rate": 0.00017111122094552332, + "loss": 0.8213, + "step": 16104 + }, + { + "epoch": 0.4135312403209008, + "grad_norm": 0.74609375, + "learning_rate": 0.00017110808219028537, + "loss": 0.8722, + "step": 16105 + }, + { + "epoch": 0.41355691751682255, + "grad_norm": 0.75390625, + "learning_rate": 0.00017110494329333547, + "loss": 1.0607, + "step": 16106 + }, + { + "epoch": 0.4135825947127444, + "grad_norm": 0.75, + "learning_rate": 0.00017110180425467984, + "loss": 0.8567, + "step": 16107 + }, + { + "epoch": 0.4136082719086662, + "grad_norm": 0.7734375, + "learning_rate": 0.0001710986650743247, + "loss": 0.9433, + "step": 16108 + }, + { + "epoch": 0.41363394910458806, + "grad_norm": 0.75, + "learning_rate": 0.0001710955257522764, + "loss": 0.9058, + "step": 16109 + }, + { + "epoch": 0.41365962630050984, + "grad_norm": 0.765625, + "learning_rate": 0.0001710923862885411, + "loss": 0.9286, + "step": 16110 + }, + { + "epoch": 0.4136853034964317, + "grad_norm": 0.73828125, + "learning_rate": 0.0001710892466831251, + "loss": 0.9396, + "step": 16111 + }, + { + "epoch": 0.4137109806923535, + "grad_norm": 0.7734375, + "learning_rate": 0.00017108610693603463, + "loss": 0.8689, + "step": 16112 + }, + { + "epoch": 0.4137366578882753, + "grad_norm": 0.75390625, + "learning_rate": 0.000171082967047276, + "loss": 0.8903, + "step": 16113 + }, + { + "epoch": 0.41376233508419713, + "grad_norm": 0.8125, + "learning_rate": 0.00017107982701685542, + "loss": 1.0135, + "step": 16114 + }, + { + "epoch": 0.41378801228011897, + "grad_norm": 0.73828125, + "learning_rate": 0.0001710766868447792, + "loss": 0.8305, + "step": 16115 + }, + { + "epoch": 0.41381368947604075, + "grad_norm": 0.80078125, + "learning_rate": 0.0001710735465310535, + "loss": 0.9568, + "step": 16116 + }, + { + "epoch": 0.4138393666719626, + "grad_norm": 0.73828125, + "learning_rate": 0.00017107040607568472, + "loss": 0.8417, + "step": 16117 + }, + { + "epoch": 0.4138650438678844, + "grad_norm": 0.74609375, + "learning_rate": 0.000171067265478679, + "loss": 0.8208, + "step": 16118 + }, + { + "epoch": 0.4138907210638062, + "grad_norm": 0.84375, + "learning_rate": 0.00017106412474004264, + "loss": 0.8605, + "step": 16119 + }, + { + "epoch": 0.41391639825972804, + "grad_norm": 0.75, + "learning_rate": 0.0001710609838597819, + "loss": 0.8392, + "step": 16120 + }, + { + "epoch": 0.41394207545564987, + "grad_norm": 0.75390625, + "learning_rate": 0.00017105784283790305, + "loss": 0.9425, + "step": 16121 + }, + { + "epoch": 0.4139677526515717, + "grad_norm": 0.796875, + "learning_rate": 0.0001710547016744123, + "loss": 0.9375, + "step": 16122 + }, + { + "epoch": 0.4139934298474935, + "grad_norm": 0.76953125, + "learning_rate": 0.000171051560369316, + "loss": 0.9275, + "step": 16123 + }, + { + "epoch": 0.4140191070434153, + "grad_norm": 0.77734375, + "learning_rate": 0.00017104841892262034, + "loss": 1.0254, + "step": 16124 + }, + { + "epoch": 0.41404478423933716, + "grad_norm": 0.71484375, + "learning_rate": 0.00017104527733433157, + "loss": 0.9007, + "step": 16125 + }, + { + "epoch": 0.41407046143525894, + "grad_norm": 0.79296875, + "learning_rate": 0.000171042135604456, + "loss": 1.0191, + "step": 16126 + }, + { + "epoch": 0.4140961386311808, + "grad_norm": 0.73046875, + "learning_rate": 0.00017103899373299987, + "loss": 0.7659, + "step": 16127 + }, + { + "epoch": 0.4141218158271026, + "grad_norm": 0.7265625, + "learning_rate": 0.00017103585171996945, + "loss": 0.9327, + "step": 16128 + }, + { + "epoch": 0.4141474930230244, + "grad_norm": 0.796875, + "learning_rate": 0.00017103270956537098, + "loss": 0.8996, + "step": 16129 + }, + { + "epoch": 0.41417317021894623, + "grad_norm": 0.80078125, + "learning_rate": 0.00017102956726921075, + "loss": 0.921, + "step": 16130 + }, + { + "epoch": 0.41419884741486807, + "grad_norm": 0.80078125, + "learning_rate": 0.000171026424831495, + "loss": 0.9544, + "step": 16131 + }, + { + "epoch": 0.4142245246107899, + "grad_norm": 0.77734375, + "learning_rate": 0.00017102328225223002, + "loss": 0.9181, + "step": 16132 + }, + { + "epoch": 0.4142502018067117, + "grad_norm": 0.79296875, + "learning_rate": 0.000171020139531422, + "loss": 0.8357, + "step": 16133 + }, + { + "epoch": 0.4142758790026335, + "grad_norm": 0.78515625, + "learning_rate": 0.00017101699666907734, + "loss": 0.9893, + "step": 16134 + }, + { + "epoch": 0.41430155619855535, + "grad_norm": 0.78515625, + "learning_rate": 0.00017101385366520216, + "loss": 0.845, + "step": 16135 + }, + { + "epoch": 0.41432723339447713, + "grad_norm": 0.74609375, + "learning_rate": 0.0001710107105198028, + "loss": 0.935, + "step": 16136 + }, + { + "epoch": 0.41435291059039897, + "grad_norm": 0.84765625, + "learning_rate": 0.0001710075672328855, + "loss": 0.9601, + "step": 16137 + }, + { + "epoch": 0.4143785877863208, + "grad_norm": 0.75, + "learning_rate": 0.00017100442380445653, + "loss": 0.8883, + "step": 16138 + }, + { + "epoch": 0.4144042649822426, + "grad_norm": 0.875, + "learning_rate": 0.00017100128023452216, + "loss": 0.9628, + "step": 16139 + }, + { + "epoch": 0.4144299421781644, + "grad_norm": 0.78515625, + "learning_rate": 0.00017099813652308868, + "loss": 1.0172, + "step": 16140 + }, + { + "epoch": 0.41445561937408626, + "grad_norm": 0.84765625, + "learning_rate": 0.0001709949926701623, + "loss": 0.8806, + "step": 16141 + }, + { + "epoch": 0.4144812965700081, + "grad_norm": 0.81640625, + "learning_rate": 0.00017099184867574934, + "loss": 0.8761, + "step": 16142 + }, + { + "epoch": 0.4145069737659299, + "grad_norm": 0.765625, + "learning_rate": 0.00017098870453985602, + "loss": 0.9843, + "step": 16143 + }, + { + "epoch": 0.4145326509618517, + "grad_norm": 0.75390625, + "learning_rate": 0.00017098556026248863, + "loss": 1.1064, + "step": 16144 + }, + { + "epoch": 0.41455832815777355, + "grad_norm": 1.1640625, + "learning_rate": 0.0001709824158436534, + "loss": 1.0375, + "step": 16145 + }, + { + "epoch": 0.41458400535369533, + "grad_norm": 0.80078125, + "learning_rate": 0.00017097927128335668, + "loss": 0.8962, + "step": 16146 + }, + { + "epoch": 0.41460968254961716, + "grad_norm": 0.796875, + "learning_rate": 0.0001709761265816046, + "loss": 0.9297, + "step": 16147 + }, + { + "epoch": 0.414635359745539, + "grad_norm": 0.71875, + "learning_rate": 0.0001709729817384036, + "loss": 0.8679, + "step": 16148 + }, + { + "epoch": 0.4146610369414608, + "grad_norm": 0.81640625, + "learning_rate": 0.00017096983675375982, + "loss": 1.0507, + "step": 16149 + }, + { + "epoch": 0.4146867141373826, + "grad_norm": 0.77734375, + "learning_rate": 0.00017096669162767957, + "loss": 0.9662, + "step": 16150 + }, + { + "epoch": 0.41471239133330445, + "grad_norm": 0.76171875, + "learning_rate": 0.0001709635463601691, + "loss": 0.8547, + "step": 16151 + }, + { + "epoch": 0.4147380685292263, + "grad_norm": 0.76953125, + "learning_rate": 0.0001709604009512347, + "loss": 0.9935, + "step": 16152 + }, + { + "epoch": 0.41476374572514807, + "grad_norm": 0.79296875, + "learning_rate": 0.00017095725540088266, + "loss": 0.8635, + "step": 16153 + }, + { + "epoch": 0.4147894229210699, + "grad_norm": 0.76953125, + "learning_rate": 0.0001709541097091192, + "loss": 0.8696, + "step": 16154 + }, + { + "epoch": 0.41481510011699174, + "grad_norm": 0.80859375, + "learning_rate": 0.0001709509638759506, + "loss": 1.1014, + "step": 16155 + }, + { + "epoch": 0.4148407773129135, + "grad_norm": 0.78125, + "learning_rate": 0.0001709478179013831, + "loss": 0.8738, + "step": 16156 + }, + { + "epoch": 0.41486645450883536, + "grad_norm": 0.75390625, + "learning_rate": 0.00017094467178542305, + "loss": 0.9092, + "step": 16157 + }, + { + "epoch": 0.4148921317047572, + "grad_norm": 0.78125, + "learning_rate": 0.0001709415255280767, + "loss": 0.9047, + "step": 16158 + }, + { + "epoch": 0.414917808900679, + "grad_norm": 0.84765625, + "learning_rate": 0.00017093837912935026, + "loss": 1.0007, + "step": 16159 + }, + { + "epoch": 0.4149434860966008, + "grad_norm": 0.76171875, + "learning_rate": 0.00017093523258925007, + "loss": 0.9561, + "step": 16160 + }, + { + "epoch": 0.41496916329252265, + "grad_norm": 0.73828125, + "learning_rate": 0.00017093208590778235, + "loss": 0.8559, + "step": 16161 + }, + { + "epoch": 0.4149948404884445, + "grad_norm": 0.79296875, + "learning_rate": 0.0001709289390849534, + "loss": 0.8784, + "step": 16162 + }, + { + "epoch": 0.41502051768436626, + "grad_norm": 0.80859375, + "learning_rate": 0.00017092579212076947, + "loss": 0.9363, + "step": 16163 + }, + { + "epoch": 0.4150461948802881, + "grad_norm": 0.7734375, + "learning_rate": 0.00017092264501523686, + "loss": 0.8059, + "step": 16164 + }, + { + "epoch": 0.41507187207620994, + "grad_norm": 0.77734375, + "learning_rate": 0.00017091949776836177, + "loss": 0.927, + "step": 16165 + }, + { + "epoch": 0.4150975492721317, + "grad_norm": 0.76953125, + "learning_rate": 0.00017091635038015058, + "loss": 0.7972, + "step": 16166 + }, + { + "epoch": 0.41512322646805355, + "grad_norm": 0.78125, + "learning_rate": 0.00017091320285060951, + "loss": 0.9207, + "step": 16167 + }, + { + "epoch": 0.4151489036639754, + "grad_norm": 0.765625, + "learning_rate": 0.0001709100551797448, + "loss": 0.955, + "step": 16168 + }, + { + "epoch": 0.41517458085989717, + "grad_norm": 0.734375, + "learning_rate": 0.00017090690736756278, + "loss": 0.8961, + "step": 16169 + }, + { + "epoch": 0.415200258055819, + "grad_norm": 0.78125, + "learning_rate": 0.00017090375941406971, + "loss": 0.9837, + "step": 16170 + }, + { + "epoch": 0.41522593525174084, + "grad_norm": 0.828125, + "learning_rate": 0.00017090061131927185, + "loss": 1.0988, + "step": 16171 + }, + { + "epoch": 0.4152516124476627, + "grad_norm": 0.76171875, + "learning_rate": 0.00017089746308317544, + "loss": 0.845, + "step": 16172 + }, + { + "epoch": 0.41527728964358446, + "grad_norm": 0.76953125, + "learning_rate": 0.00017089431470578684, + "loss": 0.9162, + "step": 16173 + }, + { + "epoch": 0.4153029668395063, + "grad_norm": 0.77734375, + "learning_rate": 0.00017089116618711225, + "loss": 0.8885, + "step": 16174 + }, + { + "epoch": 0.41532864403542813, + "grad_norm": 0.78515625, + "learning_rate": 0.00017088801752715796, + "loss": 0.8926, + "step": 16175 + }, + { + "epoch": 0.4153543212313499, + "grad_norm": 0.7734375, + "learning_rate": 0.00017088486872593025, + "loss": 0.9657, + "step": 16176 + }, + { + "epoch": 0.41537999842727175, + "grad_norm": 0.7890625, + "learning_rate": 0.00017088171978343542, + "loss": 0.8994, + "step": 16177 + }, + { + "epoch": 0.4154056756231936, + "grad_norm": 0.7265625, + "learning_rate": 0.0001708785706996797, + "loss": 1.0695, + "step": 16178 + }, + { + "epoch": 0.41543135281911536, + "grad_norm": 0.82421875, + "learning_rate": 0.00017087542147466942, + "loss": 0.9197, + "step": 16179 + }, + { + "epoch": 0.4154570300150372, + "grad_norm": 0.83984375, + "learning_rate": 0.0001708722721084108, + "loss": 0.8588, + "step": 16180 + }, + { + "epoch": 0.41548270721095903, + "grad_norm": 0.79296875, + "learning_rate": 0.00017086912260091017, + "loss": 0.8492, + "step": 16181 + }, + { + "epoch": 0.41550838440688087, + "grad_norm": 0.796875, + "learning_rate": 0.00017086597295217377, + "loss": 0.7404, + "step": 16182 + }, + { + "epoch": 0.41553406160280265, + "grad_norm": 0.76953125, + "learning_rate": 0.0001708628231622079, + "loss": 0.8539, + "step": 16183 + }, + { + "epoch": 0.4155597387987245, + "grad_norm": 0.75, + "learning_rate": 0.0001708596732310188, + "loss": 0.9323, + "step": 16184 + }, + { + "epoch": 0.4155854159946463, + "grad_norm": 0.71875, + "learning_rate": 0.00017085652315861278, + "loss": 0.9232, + "step": 16185 + }, + { + "epoch": 0.4156110931905681, + "grad_norm": 0.765625, + "learning_rate": 0.0001708533729449961, + "loss": 0.9443, + "step": 16186 + }, + { + "epoch": 0.41563677038648994, + "grad_norm": 0.76953125, + "learning_rate": 0.00017085022259017506, + "loss": 0.9253, + "step": 16187 + }, + { + "epoch": 0.4156624475824118, + "grad_norm": 0.7421875, + "learning_rate": 0.0001708470720941559, + "loss": 0.8585, + "step": 16188 + }, + { + "epoch": 0.41568812477833356, + "grad_norm": 0.7578125, + "learning_rate": 0.00017084392145694496, + "loss": 0.8634, + "step": 16189 + }, + { + "epoch": 0.4157138019742554, + "grad_norm": 0.796875, + "learning_rate": 0.0001708407706785485, + "loss": 0.8498, + "step": 16190 + }, + { + "epoch": 0.41573947917017723, + "grad_norm": 0.78515625, + "learning_rate": 0.00017083761975897276, + "loss": 0.9008, + "step": 16191 + }, + { + "epoch": 0.41576515636609906, + "grad_norm": 0.83203125, + "learning_rate": 0.00017083446869822403, + "loss": 1.0089, + "step": 16192 + }, + { + "epoch": 0.41579083356202085, + "grad_norm": 0.796875, + "learning_rate": 0.0001708313174963086, + "loss": 0.9306, + "step": 16193 + }, + { + "epoch": 0.4158165107579427, + "grad_norm": 0.83984375, + "learning_rate": 0.00017082816615323276, + "loss": 0.9531, + "step": 16194 + }, + { + "epoch": 0.4158421879538645, + "grad_norm": 0.7890625, + "learning_rate": 0.0001708250146690028, + "loss": 0.9214, + "step": 16195 + }, + { + "epoch": 0.4158678651497863, + "grad_norm": 0.796875, + "learning_rate": 0.00017082186304362497, + "loss": 0.8394, + "step": 16196 + }, + { + "epoch": 0.41589354234570813, + "grad_norm": 0.76953125, + "learning_rate": 0.0001708187112771056, + "loss": 0.9593, + "step": 16197 + }, + { + "epoch": 0.41591921954162997, + "grad_norm": 0.80078125, + "learning_rate": 0.00017081555936945087, + "loss": 0.9792, + "step": 16198 + }, + { + "epoch": 0.41594489673755175, + "grad_norm": 0.77734375, + "learning_rate": 0.00017081240732066716, + "loss": 0.9114, + "step": 16199 + }, + { + "epoch": 0.4159705739334736, + "grad_norm": 0.7890625, + "learning_rate": 0.0001708092551307607, + "loss": 0.9424, + "step": 16200 + }, + { + "epoch": 0.4159962511293954, + "grad_norm": 0.85546875, + "learning_rate": 0.00017080610279973782, + "loss": 0.8879, + "step": 16201 + }, + { + "epoch": 0.41602192832531726, + "grad_norm": 0.7265625, + "learning_rate": 0.00017080295032760476, + "loss": 0.946, + "step": 16202 + }, + { + "epoch": 0.41604760552123904, + "grad_norm": 0.69921875, + "learning_rate": 0.00017079979771436783, + "loss": 0.8442, + "step": 16203 + }, + { + "epoch": 0.4160732827171609, + "grad_norm": 0.8125, + "learning_rate": 0.00017079664496003327, + "loss": 1.1063, + "step": 16204 + }, + { + "epoch": 0.4160989599130827, + "grad_norm": 0.72265625, + "learning_rate": 0.0001707934920646074, + "loss": 0.9934, + "step": 16205 + }, + { + "epoch": 0.4161246371090045, + "grad_norm": 0.7421875, + "learning_rate": 0.00017079033902809649, + "loss": 0.896, + "step": 16206 + }, + { + "epoch": 0.4161503143049263, + "grad_norm": 0.8046875, + "learning_rate": 0.00017078718585050685, + "loss": 0.7789, + "step": 16207 + }, + { + "epoch": 0.41617599150084816, + "grad_norm": 0.82421875, + "learning_rate": 0.00017078403253184473, + "loss": 0.9124, + "step": 16208 + }, + { + "epoch": 0.41620166869676994, + "grad_norm": 0.80859375, + "learning_rate": 0.0001707808790721164, + "loss": 1.1648, + "step": 16209 + }, + { + "epoch": 0.4162273458926918, + "grad_norm": 0.828125, + "learning_rate": 0.00017077772547132822, + "loss": 0.8719, + "step": 16210 + }, + { + "epoch": 0.4162530230886136, + "grad_norm": 0.7890625, + "learning_rate": 0.0001707745717294864, + "loss": 0.8446, + "step": 16211 + }, + { + "epoch": 0.41627870028453545, + "grad_norm": 0.78515625, + "learning_rate": 0.00017077141784659728, + "loss": 0.852, + "step": 16212 + }, + { + "epoch": 0.41630437748045723, + "grad_norm": 0.7421875, + "learning_rate": 0.0001707682638226671, + "loss": 0.8275, + "step": 16213 + }, + { + "epoch": 0.41633005467637907, + "grad_norm": 0.8046875, + "learning_rate": 0.00017076510965770213, + "loss": 1.0557, + "step": 16214 + }, + { + "epoch": 0.4163557318723009, + "grad_norm": 0.77734375, + "learning_rate": 0.00017076195535170875, + "loss": 0.8325, + "step": 16215 + }, + { + "epoch": 0.4163814090682227, + "grad_norm": 0.72265625, + "learning_rate": 0.00017075880090469315, + "loss": 0.8473, + "step": 16216 + }, + { + "epoch": 0.4164070862641445, + "grad_norm": 0.75390625, + "learning_rate": 0.00017075564631666165, + "loss": 0.8985, + "step": 16217 + }, + { + "epoch": 0.41643276346006636, + "grad_norm": 0.734375, + "learning_rate": 0.00017075249158762055, + "loss": 0.8212, + "step": 16218 + }, + { + "epoch": 0.41645844065598814, + "grad_norm": 0.76171875, + "learning_rate": 0.0001707493367175761, + "loss": 0.9263, + "step": 16219 + }, + { + "epoch": 0.41648411785191, + "grad_norm": 0.80859375, + "learning_rate": 0.00017074618170653466, + "loss": 0.8231, + "step": 16220 + }, + { + "epoch": 0.4165097950478318, + "grad_norm": 0.7578125, + "learning_rate": 0.00017074302655450241, + "loss": 0.8739, + "step": 16221 + }, + { + "epoch": 0.41653547224375365, + "grad_norm": 0.8203125, + "learning_rate": 0.00017073987126148575, + "loss": 0.9998, + "step": 16222 + }, + { + "epoch": 0.4165611494396754, + "grad_norm": 0.84375, + "learning_rate": 0.0001707367158274909, + "loss": 1.061, + "step": 16223 + }, + { + "epoch": 0.41658682663559726, + "grad_norm": 0.80859375, + "learning_rate": 0.00017073356025252415, + "loss": 0.9195, + "step": 16224 + }, + { + "epoch": 0.4166125038315191, + "grad_norm": 0.81640625, + "learning_rate": 0.0001707304045365918, + "loss": 0.8843, + "step": 16225 + }, + { + "epoch": 0.4166381810274409, + "grad_norm": 0.796875, + "learning_rate": 0.00017072724867970017, + "loss": 0.8945, + "step": 16226 + }, + { + "epoch": 0.4166638582233627, + "grad_norm": 0.8046875, + "learning_rate": 0.0001707240926818555, + "loss": 0.8339, + "step": 16227 + }, + { + "epoch": 0.41668953541928455, + "grad_norm": 0.87109375, + "learning_rate": 0.00017072093654306412, + "loss": 1.1761, + "step": 16228 + }, + { + "epoch": 0.41671521261520633, + "grad_norm": 0.796875, + "learning_rate": 0.00017071778026333227, + "loss": 0.9309, + "step": 16229 + }, + { + "epoch": 0.41674088981112817, + "grad_norm": 0.7578125, + "learning_rate": 0.00017071462384266628, + "loss": 0.9242, + "step": 16230 + }, + { + "epoch": 0.41676656700705, + "grad_norm": 0.7578125, + "learning_rate": 0.00017071146728107246, + "loss": 1.0184, + "step": 16231 + }, + { + "epoch": 0.41679224420297184, + "grad_norm": 0.7578125, + "learning_rate": 0.00017070831057855706, + "loss": 0.8149, + "step": 16232 + }, + { + "epoch": 0.4168179213988936, + "grad_norm": 0.81640625, + "learning_rate": 0.00017070515373512638, + "loss": 0.8341, + "step": 16233 + }, + { + "epoch": 0.41684359859481546, + "grad_norm": 0.7265625, + "learning_rate": 0.00017070199675078672, + "loss": 0.8152, + "step": 16234 + }, + { + "epoch": 0.4168692757907373, + "grad_norm": 0.671875, + "learning_rate": 0.00017069883962554435, + "loss": 0.7701, + "step": 16235 + }, + { + "epoch": 0.4168949529866591, + "grad_norm": 0.828125, + "learning_rate": 0.0001706956823594056, + "loss": 1.0161, + "step": 16236 + }, + { + "epoch": 0.4169206301825809, + "grad_norm": 0.80078125, + "learning_rate": 0.00017069252495237675, + "loss": 0.8954, + "step": 16237 + }, + { + "epoch": 0.41694630737850275, + "grad_norm": 0.7421875, + "learning_rate": 0.00017068936740446408, + "loss": 0.9201, + "step": 16238 + }, + { + "epoch": 0.4169719845744245, + "grad_norm": 0.78515625, + "learning_rate": 0.00017068620971567383, + "loss": 0.9687, + "step": 16239 + }, + { + "epoch": 0.41699766177034636, + "grad_norm": 0.72265625, + "learning_rate": 0.00017068305188601245, + "loss": 0.8583, + "step": 16240 + }, + { + "epoch": 0.4170233389662682, + "grad_norm": 0.76171875, + "learning_rate": 0.00017067989391548606, + "loss": 0.8957, + "step": 16241 + }, + { + "epoch": 0.41704901616219003, + "grad_norm": 0.765625, + "learning_rate": 0.00017067673580410103, + "loss": 0.9886, + "step": 16242 + }, + { + "epoch": 0.4170746933581118, + "grad_norm": 0.796875, + "learning_rate": 0.00017067357755186366, + "loss": 1.0255, + "step": 16243 + }, + { + "epoch": 0.41710037055403365, + "grad_norm": 0.75390625, + "learning_rate": 0.00017067041915878024, + "loss": 0.8297, + "step": 16244 + }, + { + "epoch": 0.4171260477499555, + "grad_norm": 0.91796875, + "learning_rate": 0.00017066726062485706, + "loss": 0.8824, + "step": 16245 + }, + { + "epoch": 0.41715172494587727, + "grad_norm": 0.796875, + "learning_rate": 0.00017066410195010038, + "loss": 0.8998, + "step": 16246 + }, + { + "epoch": 0.4171774021417991, + "grad_norm": 0.74609375, + "learning_rate": 0.00017066094313451658, + "loss": 0.7964, + "step": 16247 + }, + { + "epoch": 0.41720307933772094, + "grad_norm": 0.82421875, + "learning_rate": 0.00017065778417811188, + "loss": 0.8997, + "step": 16248 + }, + { + "epoch": 0.4172287565336427, + "grad_norm": 0.765625, + "learning_rate": 0.00017065462508089257, + "loss": 0.8807, + "step": 16249 + }, + { + "epoch": 0.41725443372956456, + "grad_norm": 0.7578125, + "learning_rate": 0.00017065146584286503, + "loss": 0.8463, + "step": 16250 + }, + { + "epoch": 0.4172801109254864, + "grad_norm": 0.78515625, + "learning_rate": 0.00017064830646403543, + "loss": 0.9061, + "step": 16251 + }, + { + "epoch": 0.4173057881214082, + "grad_norm": 0.8203125, + "learning_rate": 0.0001706451469444102, + "loss": 1.0123, + "step": 16252 + }, + { + "epoch": 0.41733146531733, + "grad_norm": 0.86328125, + "learning_rate": 0.00017064198728399557, + "loss": 0.8982, + "step": 16253 + }, + { + "epoch": 0.41735714251325184, + "grad_norm": 0.7265625, + "learning_rate": 0.00017063882748279783, + "loss": 0.8576, + "step": 16254 + }, + { + "epoch": 0.4173828197091737, + "grad_norm": 0.7578125, + "learning_rate": 0.00017063566754082325, + "loss": 0.7831, + "step": 16255 + }, + { + "epoch": 0.41740849690509546, + "grad_norm": 0.80078125, + "learning_rate": 0.0001706325074580782, + "loss": 0.7933, + "step": 16256 + }, + { + "epoch": 0.4174341741010173, + "grad_norm": 0.8203125, + "learning_rate": 0.00017062934723456895, + "loss": 0.9551, + "step": 16257 + }, + { + "epoch": 0.41745985129693913, + "grad_norm": 0.74609375, + "learning_rate": 0.00017062618687030176, + "loss": 0.9196, + "step": 16258 + }, + { + "epoch": 0.4174855284928609, + "grad_norm": 0.78515625, + "learning_rate": 0.00017062302636528299, + "loss": 1.0146, + "step": 16259 + }, + { + "epoch": 0.41751120568878275, + "grad_norm": 0.7421875, + "learning_rate": 0.0001706198657195189, + "loss": 1.0688, + "step": 16260 + }, + { + "epoch": 0.4175368828847046, + "grad_norm": 0.78125, + "learning_rate": 0.0001706167049330158, + "loss": 0.9609, + "step": 16261 + }, + { + "epoch": 0.4175625600806264, + "grad_norm": 0.84375, + "learning_rate": 0.00017061354400577995, + "loss": 1.0461, + "step": 16262 + }, + { + "epoch": 0.4175882372765482, + "grad_norm": 0.8125, + "learning_rate": 0.00017061038293781773, + "loss": 0.8332, + "step": 16263 + }, + { + "epoch": 0.41761391447247004, + "grad_norm": 0.796875, + "learning_rate": 0.00017060722172913538, + "loss": 1.0865, + "step": 16264 + }, + { + "epoch": 0.4176395916683919, + "grad_norm": 0.78515625, + "learning_rate": 0.0001706040603797392, + "loss": 0.869, + "step": 16265 + }, + { + "epoch": 0.41766526886431365, + "grad_norm": 0.84375, + "learning_rate": 0.00017060089888963552, + "loss": 1.0119, + "step": 16266 + }, + { + "epoch": 0.4176909460602355, + "grad_norm": 0.81640625, + "learning_rate": 0.00017059773725883063, + "loss": 0.9383, + "step": 16267 + }, + { + "epoch": 0.4177166232561573, + "grad_norm": 0.8828125, + "learning_rate": 0.00017059457548733082, + "loss": 0.8533, + "step": 16268 + }, + { + "epoch": 0.4177423004520791, + "grad_norm": 0.88671875, + "learning_rate": 0.0001705914135751424, + "loss": 0.9971, + "step": 16269 + }, + { + "epoch": 0.41776797764800094, + "grad_norm": 0.80078125, + "learning_rate": 0.00017058825152227168, + "loss": 0.9101, + "step": 16270 + }, + { + "epoch": 0.4177936548439228, + "grad_norm": 0.78515625, + "learning_rate": 0.00017058508932872493, + "loss": 0.8646, + "step": 16271 + }, + { + "epoch": 0.4178193320398446, + "grad_norm": 0.8125, + "learning_rate": 0.00017058192699450847, + "loss": 0.887, + "step": 16272 + }, + { + "epoch": 0.4178450092357664, + "grad_norm": 0.83203125, + "learning_rate": 0.00017057876451962863, + "loss": 0.9185, + "step": 16273 + }, + { + "epoch": 0.41787068643168823, + "grad_norm": 0.7578125, + "learning_rate": 0.00017057560190409169, + "loss": 0.9742, + "step": 16274 + }, + { + "epoch": 0.41789636362761007, + "grad_norm": 0.77734375, + "learning_rate": 0.0001705724391479039, + "loss": 1.022, + "step": 16275 + }, + { + "epoch": 0.41792204082353185, + "grad_norm": 0.7578125, + "learning_rate": 0.00017056927625107166, + "loss": 0.9581, + "step": 16276 + }, + { + "epoch": 0.4179477180194537, + "grad_norm": 0.734375, + "learning_rate": 0.0001705661132136012, + "loss": 0.9773, + "step": 16277 + }, + { + "epoch": 0.4179733952153755, + "grad_norm": 0.83984375, + "learning_rate": 0.00017056295003549886, + "loss": 0.9055, + "step": 16278 + }, + { + "epoch": 0.4179990724112973, + "grad_norm": 0.65234375, + "learning_rate": 0.00017055978671677097, + "loss": 0.804, + "step": 16279 + }, + { + "epoch": 0.41802474960721914, + "grad_norm": 0.75, + "learning_rate": 0.0001705566232574238, + "loss": 0.9766, + "step": 16280 + }, + { + "epoch": 0.418050426803141, + "grad_norm": 0.9140625, + "learning_rate": 0.0001705534596574636, + "loss": 0.8525, + "step": 16281 + }, + { + "epoch": 0.4180761039990628, + "grad_norm": 0.76171875, + "learning_rate": 0.00017055029591689675, + "loss": 0.8555, + "step": 16282 + }, + { + "epoch": 0.4181017811949846, + "grad_norm": 0.7578125, + "learning_rate": 0.00017054713203572952, + "loss": 0.9648, + "step": 16283 + }, + { + "epoch": 0.4181274583909064, + "grad_norm": 0.796875, + "learning_rate": 0.00017054396801396827, + "loss": 0.7494, + "step": 16284 + }, + { + "epoch": 0.41815313558682826, + "grad_norm": 0.79296875, + "learning_rate": 0.0001705408038516192, + "loss": 0.9023, + "step": 16285 + }, + { + "epoch": 0.41817881278275004, + "grad_norm": 0.80078125, + "learning_rate": 0.00017053763954868873, + "loss": 0.8743, + "step": 16286 + }, + { + "epoch": 0.4182044899786719, + "grad_norm": 0.72265625, + "learning_rate": 0.0001705344751051831, + "loss": 0.7888, + "step": 16287 + }, + { + "epoch": 0.4182301671745937, + "grad_norm": 0.78125, + "learning_rate": 0.00017053131052110863, + "loss": 0.9143, + "step": 16288 + }, + { + "epoch": 0.4182558443705155, + "grad_norm": 0.74609375, + "learning_rate": 0.00017052814579647163, + "loss": 0.9353, + "step": 16289 + }, + { + "epoch": 0.41828152156643733, + "grad_norm": 0.765625, + "learning_rate": 0.00017052498093127842, + "loss": 0.9602, + "step": 16290 + }, + { + "epoch": 0.41830719876235917, + "grad_norm": 0.77734375, + "learning_rate": 0.00017052181592553528, + "loss": 0.9451, + "step": 16291 + }, + { + "epoch": 0.418332875958281, + "grad_norm": 0.875, + "learning_rate": 0.00017051865077924853, + "loss": 0.9725, + "step": 16292 + }, + { + "epoch": 0.4183585531542028, + "grad_norm": 0.8125, + "learning_rate": 0.00017051548549242452, + "loss": 0.9074, + "step": 16293 + }, + { + "epoch": 0.4183842303501246, + "grad_norm": 0.8046875, + "learning_rate": 0.00017051232006506944, + "loss": 1.0478, + "step": 16294 + }, + { + "epoch": 0.41840990754604646, + "grad_norm": 0.71875, + "learning_rate": 0.00017050915449718973, + "loss": 0.9295, + "step": 16295 + }, + { + "epoch": 0.41843558474196824, + "grad_norm": 0.7890625, + "learning_rate": 0.00017050598878879162, + "loss": 0.977, + "step": 16296 + }, + { + "epoch": 0.41846126193789007, + "grad_norm": 0.9375, + "learning_rate": 0.00017050282293988147, + "loss": 1.0752, + "step": 16297 + }, + { + "epoch": 0.4184869391338119, + "grad_norm": 0.765625, + "learning_rate": 0.00017049965695046553, + "loss": 1.0662, + "step": 16298 + }, + { + "epoch": 0.4185126163297337, + "grad_norm": 0.73046875, + "learning_rate": 0.00017049649082055015, + "loss": 0.9542, + "step": 16299 + }, + { + "epoch": 0.4185382935256555, + "grad_norm": 0.765625, + "learning_rate": 0.00017049332455014167, + "loss": 0.8516, + "step": 16300 + }, + { + "epoch": 0.41856397072157736, + "grad_norm": 0.6953125, + "learning_rate": 0.0001704901581392463, + "loss": 0.794, + "step": 16301 + }, + { + "epoch": 0.4185896479174992, + "grad_norm": 0.7578125, + "learning_rate": 0.00017048699158787046, + "loss": 0.8321, + "step": 16302 + }, + { + "epoch": 0.418615325113421, + "grad_norm": 0.96875, + "learning_rate": 0.0001704838248960204, + "loss": 0.8058, + "step": 16303 + }, + { + "epoch": 0.4186410023093428, + "grad_norm": 0.88671875, + "learning_rate": 0.00017048065806370244, + "loss": 1.0012, + "step": 16304 + }, + { + "epoch": 0.41866667950526465, + "grad_norm": 0.76953125, + "learning_rate": 0.00017047749109092288, + "loss": 0.9504, + "step": 16305 + }, + { + "epoch": 0.41869235670118643, + "grad_norm": 0.80078125, + "learning_rate": 0.00017047432397768807, + "loss": 0.9336, + "step": 16306 + }, + { + "epoch": 0.41871803389710827, + "grad_norm": 0.8671875, + "learning_rate": 0.0001704711567240043, + "loss": 1.0199, + "step": 16307 + }, + { + "epoch": 0.4187437110930301, + "grad_norm": 0.796875, + "learning_rate": 0.00017046798932987786, + "loss": 0.849, + "step": 16308 + }, + { + "epoch": 0.4187693882889519, + "grad_norm": 0.76171875, + "learning_rate": 0.0001704648217953151, + "loss": 0.8405, + "step": 16309 + }, + { + "epoch": 0.4187950654848737, + "grad_norm": 0.78125, + "learning_rate": 0.00017046165412032229, + "loss": 0.9322, + "step": 16310 + }, + { + "epoch": 0.41882074268079555, + "grad_norm": 0.76953125, + "learning_rate": 0.00017045848630490578, + "loss": 1.0038, + "step": 16311 + }, + { + "epoch": 0.4188464198767174, + "grad_norm": 0.7890625, + "learning_rate": 0.0001704553183490719, + "loss": 0.8577, + "step": 16312 + }, + { + "epoch": 0.41887209707263917, + "grad_norm": 0.84375, + "learning_rate": 0.0001704521502528269, + "loss": 1.0094, + "step": 16313 + }, + { + "epoch": 0.418897774268561, + "grad_norm": 0.7421875, + "learning_rate": 0.00017044898201617713, + "loss": 0.9306, + "step": 16314 + }, + { + "epoch": 0.41892345146448284, + "grad_norm": 0.82421875, + "learning_rate": 0.00017044581363912892, + "loss": 1.0088, + "step": 16315 + }, + { + "epoch": 0.4189491286604046, + "grad_norm": 0.77734375, + "learning_rate": 0.00017044264512168855, + "loss": 0.8534, + "step": 16316 + }, + { + "epoch": 0.41897480585632646, + "grad_norm": 0.765625, + "learning_rate": 0.00017043947646386238, + "loss": 1.0232, + "step": 16317 + }, + { + "epoch": 0.4190004830522483, + "grad_norm": 0.8359375, + "learning_rate": 0.00017043630766565667, + "loss": 0.8033, + "step": 16318 + }, + { + "epoch": 0.4190261602481701, + "grad_norm": 0.78515625, + "learning_rate": 0.00017043313872707776, + "loss": 0.975, + "step": 16319 + }, + { + "epoch": 0.4190518374440919, + "grad_norm": 0.75, + "learning_rate": 0.000170429969648132, + "loss": 0.9726, + "step": 16320 + }, + { + "epoch": 0.41907751464001375, + "grad_norm": 0.8671875, + "learning_rate": 0.00017042680042882565, + "loss": 1.0285, + "step": 16321 + }, + { + "epoch": 0.41910319183593553, + "grad_norm": 0.75, + "learning_rate": 0.000170423631069165, + "loss": 0.742, + "step": 16322 + }, + { + "epoch": 0.41912886903185737, + "grad_norm": 0.84375, + "learning_rate": 0.00017042046156915647, + "loss": 0.989, + "step": 16323 + }, + { + "epoch": 0.4191545462277792, + "grad_norm": 0.734375, + "learning_rate": 0.00017041729192880634, + "loss": 0.8259, + "step": 16324 + }, + { + "epoch": 0.41918022342370104, + "grad_norm": 0.75, + "learning_rate": 0.00017041412214812088, + "loss": 0.8441, + "step": 16325 + }, + { + "epoch": 0.4192059006196228, + "grad_norm": 0.7578125, + "learning_rate": 0.0001704109522271064, + "loss": 0.9571, + "step": 16326 + }, + { + "epoch": 0.41923157781554465, + "grad_norm": 0.77734375, + "learning_rate": 0.0001704077821657693, + "loss": 1.0343, + "step": 16327 + }, + { + "epoch": 0.4192572550114665, + "grad_norm": 0.8046875, + "learning_rate": 0.00017040461196411583, + "loss": 0.9717, + "step": 16328 + }, + { + "epoch": 0.41928293220738827, + "grad_norm": 0.8359375, + "learning_rate": 0.00017040144162215235, + "loss": 0.9599, + "step": 16329 + }, + { + "epoch": 0.4193086094033101, + "grad_norm": 0.75, + "learning_rate": 0.00017039827113988514, + "loss": 0.8846, + "step": 16330 + }, + { + "epoch": 0.41933428659923194, + "grad_norm": 0.73828125, + "learning_rate": 0.00017039510051732054, + "loss": 0.8195, + "step": 16331 + }, + { + "epoch": 0.4193599637951537, + "grad_norm": 0.80078125, + "learning_rate": 0.00017039192975446484, + "loss": 0.7996, + "step": 16332 + }, + { + "epoch": 0.41938564099107556, + "grad_norm": 0.7578125, + "learning_rate": 0.0001703887588513244, + "loss": 0.9156, + "step": 16333 + }, + { + "epoch": 0.4194113181869974, + "grad_norm": 0.8359375, + "learning_rate": 0.00017038558780790552, + "loss": 0.9829, + "step": 16334 + }, + { + "epoch": 0.41943699538291923, + "grad_norm": 0.6328125, + "learning_rate": 0.00017038241662421455, + "loss": 0.8682, + "step": 16335 + }, + { + "epoch": 0.419462672578841, + "grad_norm": 0.78515625, + "learning_rate": 0.00017037924530025772, + "loss": 0.9526, + "step": 16336 + }, + { + "epoch": 0.41948834977476285, + "grad_norm": 0.75390625, + "learning_rate": 0.00017037607383604147, + "loss": 0.8139, + "step": 16337 + }, + { + "epoch": 0.4195140269706847, + "grad_norm": 0.80859375, + "learning_rate": 0.00017037290223157202, + "loss": 1.0748, + "step": 16338 + }, + { + "epoch": 0.41953970416660646, + "grad_norm": 0.796875, + "learning_rate": 0.00017036973048685576, + "loss": 0.9637, + "step": 16339 + }, + { + "epoch": 0.4195653813625283, + "grad_norm": 0.71875, + "learning_rate": 0.00017036655860189898, + "loss": 0.9085, + "step": 16340 + }, + { + "epoch": 0.41959105855845014, + "grad_norm": 0.828125, + "learning_rate": 0.00017036338657670802, + "loss": 0.9306, + "step": 16341 + }, + { + "epoch": 0.4196167357543719, + "grad_norm": 0.8046875, + "learning_rate": 0.00017036021441128917, + "loss": 0.8945, + "step": 16342 + }, + { + "epoch": 0.41964241295029375, + "grad_norm": 0.82421875, + "learning_rate": 0.00017035704210564877, + "loss": 0.9583, + "step": 16343 + }, + { + "epoch": 0.4196680901462156, + "grad_norm": 0.71484375, + "learning_rate": 0.00017035386965979309, + "loss": 0.7837, + "step": 16344 + }, + { + "epoch": 0.4196937673421374, + "grad_norm": 0.73828125, + "learning_rate": 0.00017035069707372857, + "loss": 0.999, + "step": 16345 + }, + { + "epoch": 0.4197194445380592, + "grad_norm": 0.77734375, + "learning_rate": 0.00017034752434746145, + "loss": 0.8521, + "step": 16346 + }, + { + "epoch": 0.41974512173398104, + "grad_norm": 0.80078125, + "learning_rate": 0.00017034435148099804, + "loss": 0.9226, + "step": 16347 + }, + { + "epoch": 0.4197707989299029, + "grad_norm": 0.80078125, + "learning_rate": 0.00017034117847434472, + "loss": 0.9088, + "step": 16348 + }, + { + "epoch": 0.41979647612582466, + "grad_norm": 0.78515625, + "learning_rate": 0.0001703380053275078, + "loss": 0.974, + "step": 16349 + }, + { + "epoch": 0.4198221533217465, + "grad_norm": 0.71875, + "learning_rate": 0.00017033483204049356, + "loss": 0.8413, + "step": 16350 + }, + { + "epoch": 0.41984783051766833, + "grad_norm": 0.8046875, + "learning_rate": 0.00017033165861330836, + "loss": 0.9116, + "step": 16351 + }, + { + "epoch": 0.4198735077135901, + "grad_norm": 0.73046875, + "learning_rate": 0.00017032848504595853, + "loss": 0.8678, + "step": 16352 + }, + { + "epoch": 0.41989918490951195, + "grad_norm": 0.7265625, + "learning_rate": 0.0001703253113384504, + "loss": 0.7922, + "step": 16353 + }, + { + "epoch": 0.4199248621054338, + "grad_norm": 0.75390625, + "learning_rate": 0.00017032213749079022, + "loss": 0.8479, + "step": 16354 + }, + { + "epoch": 0.4199505393013556, + "grad_norm": 0.7734375, + "learning_rate": 0.0001703189635029844, + "loss": 0.8281, + "step": 16355 + }, + { + "epoch": 0.4199762164972774, + "grad_norm": 0.8046875, + "learning_rate": 0.00017031578937503922, + "loss": 0.9747, + "step": 16356 + }, + { + "epoch": 0.42000189369319924, + "grad_norm": 0.7734375, + "learning_rate": 0.00017031261510696104, + "loss": 0.9386, + "step": 16357 + }, + { + "epoch": 0.42002757088912107, + "grad_norm": 0.77734375, + "learning_rate": 0.0001703094406987562, + "loss": 0.9127, + "step": 16358 + }, + { + "epoch": 0.42005324808504285, + "grad_norm": 0.75, + "learning_rate": 0.00017030626615043094, + "loss": 0.9441, + "step": 16359 + }, + { + "epoch": 0.4200789252809647, + "grad_norm": 0.828125, + "learning_rate": 0.0001703030914619917, + "loss": 0.8393, + "step": 16360 + }, + { + "epoch": 0.4201046024768865, + "grad_norm": 0.81640625, + "learning_rate": 0.0001702999166334447, + "loss": 0.933, + "step": 16361 + }, + { + "epoch": 0.4201302796728083, + "grad_norm": 0.75390625, + "learning_rate": 0.00017029674166479632, + "loss": 1.033, + "step": 16362 + }, + { + "epoch": 0.42015595686873014, + "grad_norm": 0.90234375, + "learning_rate": 0.0001702935665560529, + "loss": 1.0426, + "step": 16363 + }, + { + "epoch": 0.420181634064652, + "grad_norm": 0.70703125, + "learning_rate": 0.00017029039130722073, + "loss": 0.8302, + "step": 16364 + }, + { + "epoch": 0.4202073112605738, + "grad_norm": 0.80859375, + "learning_rate": 0.00017028721591830622, + "loss": 0.8898, + "step": 16365 + }, + { + "epoch": 0.4202329884564956, + "grad_norm": 0.87109375, + "learning_rate": 0.00017028404038931557, + "loss": 0.9895, + "step": 16366 + }, + { + "epoch": 0.42025866565241743, + "grad_norm": 1.265625, + "learning_rate": 0.00017028086472025523, + "loss": 0.8168, + "step": 16367 + }, + { + "epoch": 0.42028434284833927, + "grad_norm": 0.703125, + "learning_rate": 0.00017027768891113144, + "loss": 0.8203, + "step": 16368 + }, + { + "epoch": 0.42031002004426105, + "grad_norm": 0.7734375, + "learning_rate": 0.00017027451296195055, + "loss": 1.0004, + "step": 16369 + }, + { + "epoch": 0.4203356972401829, + "grad_norm": 0.78515625, + "learning_rate": 0.00017027133687271893, + "loss": 0.911, + "step": 16370 + }, + { + "epoch": 0.4203613744361047, + "grad_norm": 0.734375, + "learning_rate": 0.00017026816064344286, + "loss": 0.8438, + "step": 16371 + }, + { + "epoch": 0.4203870516320265, + "grad_norm": 0.8046875, + "learning_rate": 0.00017026498427412872, + "loss": 0.8828, + "step": 16372 + }, + { + "epoch": 0.42041272882794833, + "grad_norm": 0.78125, + "learning_rate": 0.0001702618077647828, + "loss": 0.924, + "step": 16373 + }, + { + "epoch": 0.42043840602387017, + "grad_norm": 0.7421875, + "learning_rate": 0.00017025863111541146, + "loss": 0.7622, + "step": 16374 + }, + { + "epoch": 0.420464083219792, + "grad_norm": 0.8046875, + "learning_rate": 0.000170255454326021, + "loss": 0.9474, + "step": 16375 + }, + { + "epoch": 0.4204897604157138, + "grad_norm": 0.7734375, + "learning_rate": 0.00017025227739661774, + "loss": 1.0141, + "step": 16376 + }, + { + "epoch": 0.4205154376116356, + "grad_norm": 0.74609375, + "learning_rate": 0.00017024910032720808, + "loss": 0.9538, + "step": 16377 + }, + { + "epoch": 0.42054111480755746, + "grad_norm": 0.82421875, + "learning_rate": 0.00017024592311779831, + "loss": 0.9882, + "step": 16378 + }, + { + "epoch": 0.42056679200347924, + "grad_norm": 0.76171875, + "learning_rate": 0.00017024274576839474, + "loss": 0.9206, + "step": 16379 + }, + { + "epoch": 0.4205924691994011, + "grad_norm": 0.82421875, + "learning_rate": 0.00017023956827900372, + "loss": 0.9522, + "step": 16380 + }, + { + "epoch": 0.4206181463953229, + "grad_norm": 0.70703125, + "learning_rate": 0.00017023639064963158, + "loss": 0.8515, + "step": 16381 + }, + { + "epoch": 0.4206438235912447, + "grad_norm": 0.87890625, + "learning_rate": 0.00017023321288028468, + "loss": 1.0007, + "step": 16382 + }, + { + "epoch": 0.42066950078716653, + "grad_norm": 0.76953125, + "learning_rate": 0.00017023003497096935, + "loss": 0.831, + "step": 16383 + }, + { + "epoch": 0.42069517798308836, + "grad_norm": 0.72265625, + "learning_rate": 0.00017022685692169186, + "loss": 0.8373, + "step": 16384 + }, + { + "epoch": 0.4207208551790102, + "grad_norm": 0.77734375, + "learning_rate": 0.0001702236787324586, + "loss": 0.885, + "step": 16385 + }, + { + "epoch": 0.420746532374932, + "grad_norm": 0.82421875, + "learning_rate": 0.0001702205004032759, + "loss": 0.8566, + "step": 16386 + }, + { + "epoch": 0.4207722095708538, + "grad_norm": 0.8046875, + "learning_rate": 0.0001702173219341501, + "loss": 0.8579, + "step": 16387 + }, + { + "epoch": 0.42079788676677565, + "grad_norm": 0.79296875, + "learning_rate": 0.0001702141433250875, + "loss": 0.9479, + "step": 16388 + }, + { + "epoch": 0.42082356396269743, + "grad_norm": 0.890625, + "learning_rate": 0.00017021096457609444, + "loss": 0.8279, + "step": 16389 + }, + { + "epoch": 0.42084924115861927, + "grad_norm": 0.8125, + "learning_rate": 0.00017020778568717728, + "loss": 0.9708, + "step": 16390 + }, + { + "epoch": 0.4208749183545411, + "grad_norm": 0.80859375, + "learning_rate": 0.00017020460665834235, + "loss": 0.9783, + "step": 16391 + }, + { + "epoch": 0.4209005955504629, + "grad_norm": 0.80859375, + "learning_rate": 0.000170201427489596, + "loss": 0.9018, + "step": 16392 + }, + { + "epoch": 0.4209262727463847, + "grad_norm": 0.8046875, + "learning_rate": 0.00017019824818094453, + "loss": 0.9182, + "step": 16393 + }, + { + "epoch": 0.42095194994230656, + "grad_norm": 0.7734375, + "learning_rate": 0.00017019506873239428, + "loss": 0.9086, + "step": 16394 + }, + { + "epoch": 0.4209776271382284, + "grad_norm": 0.75, + "learning_rate": 0.0001701918891439516, + "loss": 0.9291, + "step": 16395 + }, + { + "epoch": 0.4210033043341502, + "grad_norm": 0.7578125, + "learning_rate": 0.00017018870941562284, + "loss": 1.1117, + "step": 16396 + }, + { + "epoch": 0.421028981530072, + "grad_norm": 0.875, + "learning_rate": 0.00017018552954741433, + "loss": 1.0541, + "step": 16397 + }, + { + "epoch": 0.42105465872599385, + "grad_norm": 0.75, + "learning_rate": 0.00017018234953933235, + "loss": 0.839, + "step": 16398 + }, + { + "epoch": 0.4210803359219156, + "grad_norm": 1.0390625, + "learning_rate": 0.0001701791693913833, + "loss": 1.0274, + "step": 16399 + }, + { + "epoch": 0.42110601311783746, + "grad_norm": 0.8359375, + "learning_rate": 0.00017017598910357354, + "loss": 1.0958, + "step": 16400 + }, + { + "epoch": 0.4211316903137593, + "grad_norm": 0.8046875, + "learning_rate": 0.0001701728086759094, + "loss": 1.0269, + "step": 16401 + }, + { + "epoch": 0.4211573675096811, + "grad_norm": 0.77734375, + "learning_rate": 0.00017016962810839715, + "loss": 0.9001, + "step": 16402 + }, + { + "epoch": 0.4211830447056029, + "grad_norm": 0.8515625, + "learning_rate": 0.00017016644740104316, + "loss": 0.9838, + "step": 16403 + }, + { + "epoch": 0.42120872190152475, + "grad_norm": 0.7578125, + "learning_rate": 0.00017016326655385378, + "loss": 1.0876, + "step": 16404 + }, + { + "epoch": 0.4212343990974466, + "grad_norm": 0.8515625, + "learning_rate": 0.00017016008556683537, + "loss": 0.8961, + "step": 16405 + }, + { + "epoch": 0.42126007629336837, + "grad_norm": 0.71484375, + "learning_rate": 0.00017015690443999423, + "loss": 0.7484, + "step": 16406 + }, + { + "epoch": 0.4212857534892902, + "grad_norm": 0.78125, + "learning_rate": 0.00017015372317333668, + "loss": 1.0077, + "step": 16407 + }, + { + "epoch": 0.42131143068521204, + "grad_norm": 0.828125, + "learning_rate": 0.00017015054176686918, + "loss": 1.0773, + "step": 16408 + }, + { + "epoch": 0.4213371078811338, + "grad_norm": 0.7421875, + "learning_rate": 0.00017014736022059792, + "loss": 0.806, + "step": 16409 + }, + { + "epoch": 0.42136278507705566, + "grad_norm": 0.78125, + "learning_rate": 0.00017014417853452934, + "loss": 0.8399, + "step": 16410 + }, + { + "epoch": 0.4213884622729775, + "grad_norm": 0.84375, + "learning_rate": 0.00017014099670866975, + "loss": 0.9112, + "step": 16411 + }, + { + "epoch": 0.4214141394688993, + "grad_norm": 0.73828125, + "learning_rate": 0.00017013781474302546, + "loss": 0.9203, + "step": 16412 + }, + { + "epoch": 0.4214398166648211, + "grad_norm": 0.76953125, + "learning_rate": 0.00017013463263760285, + "loss": 0.8776, + "step": 16413 + }, + { + "epoch": 0.42146549386074295, + "grad_norm": 0.8984375, + "learning_rate": 0.00017013145039240828, + "loss": 1.0359, + "step": 16414 + }, + { + "epoch": 0.4214911710566648, + "grad_norm": 0.8046875, + "learning_rate": 0.00017012826800744804, + "loss": 0.8343, + "step": 16415 + }, + { + "epoch": 0.42151684825258656, + "grad_norm": 0.734375, + "learning_rate": 0.0001701250854827285, + "loss": 0.8234, + "step": 16416 + }, + { + "epoch": 0.4215425254485084, + "grad_norm": 0.7734375, + "learning_rate": 0.000170121902818256, + "loss": 0.8829, + "step": 16417 + }, + { + "epoch": 0.42156820264443023, + "grad_norm": 0.8359375, + "learning_rate": 0.0001701187200140369, + "loss": 0.9559, + "step": 16418 + }, + { + "epoch": 0.421593879840352, + "grad_norm": 0.828125, + "learning_rate": 0.0001701155370700775, + "loss": 1.0021, + "step": 16419 + }, + { + "epoch": 0.42161955703627385, + "grad_norm": 0.83984375, + "learning_rate": 0.00017011235398638416, + "loss": 0.8658, + "step": 16420 + }, + { + "epoch": 0.4216452342321957, + "grad_norm": 0.71875, + "learning_rate": 0.00017010917076296325, + "loss": 0.8434, + "step": 16421 + }, + { + "epoch": 0.42167091142811747, + "grad_norm": 0.80078125, + "learning_rate": 0.0001701059873998211, + "loss": 0.9269, + "step": 16422 + }, + { + "epoch": 0.4216965886240393, + "grad_norm": 0.78515625, + "learning_rate": 0.00017010280389696404, + "loss": 1.0193, + "step": 16423 + }, + { + "epoch": 0.42172226581996114, + "grad_norm": 0.77734375, + "learning_rate": 0.00017009962025439843, + "loss": 0.9271, + "step": 16424 + }, + { + "epoch": 0.421747943015883, + "grad_norm": 1.3046875, + "learning_rate": 0.0001700964364721306, + "loss": 1.0153, + "step": 16425 + }, + { + "epoch": 0.42177362021180476, + "grad_norm": 0.72265625, + "learning_rate": 0.00017009325255016694, + "loss": 0.9513, + "step": 16426 + }, + { + "epoch": 0.4217992974077266, + "grad_norm": 0.74609375, + "learning_rate": 0.0001700900684885137, + "loss": 0.8593, + "step": 16427 + }, + { + "epoch": 0.42182497460364843, + "grad_norm": 0.85546875, + "learning_rate": 0.00017008688428717732, + "loss": 0.9151, + "step": 16428 + }, + { + "epoch": 0.4218506517995702, + "grad_norm": 0.76171875, + "learning_rate": 0.0001700836999461641, + "loss": 0.9912, + "step": 16429 + }, + { + "epoch": 0.42187632899549204, + "grad_norm": 0.83203125, + "learning_rate": 0.0001700805154654804, + "loss": 1.0193, + "step": 16430 + }, + { + "epoch": 0.4219020061914139, + "grad_norm": 0.79296875, + "learning_rate": 0.00017007733084513257, + "loss": 0.9992, + "step": 16431 + }, + { + "epoch": 0.42192768338733566, + "grad_norm": 0.83984375, + "learning_rate": 0.00017007414608512694, + "loss": 1.0165, + "step": 16432 + }, + { + "epoch": 0.4219533605832575, + "grad_norm": 0.75, + "learning_rate": 0.00017007096118546987, + "loss": 0.9825, + "step": 16433 + }, + { + "epoch": 0.42197903777917933, + "grad_norm": 0.8046875, + "learning_rate": 0.00017006777614616772, + "loss": 0.8446, + "step": 16434 + }, + { + "epoch": 0.42200471497510117, + "grad_norm": 0.71484375, + "learning_rate": 0.0001700645909672268, + "loss": 0.7806, + "step": 16435 + }, + { + "epoch": 0.42203039217102295, + "grad_norm": 0.74609375, + "learning_rate": 0.00017006140564865347, + "loss": 0.831, + "step": 16436 + }, + { + "epoch": 0.4220560693669448, + "grad_norm": 0.75, + "learning_rate": 0.00017005822019045412, + "loss": 0.9255, + "step": 16437 + }, + { + "epoch": 0.4220817465628666, + "grad_norm": 0.7265625, + "learning_rate": 0.00017005503459263506, + "loss": 0.798, + "step": 16438 + }, + { + "epoch": 0.4221074237587884, + "grad_norm": 0.85546875, + "learning_rate": 0.0001700518488552026, + "loss": 0.9074, + "step": 16439 + }, + { + "epoch": 0.42213310095471024, + "grad_norm": 0.87109375, + "learning_rate": 0.00017004866297816318, + "loss": 0.8265, + "step": 16440 + }, + { + "epoch": 0.4221587781506321, + "grad_norm": 0.7421875, + "learning_rate": 0.00017004547696152308, + "loss": 0.9809, + "step": 16441 + }, + { + "epoch": 0.42218445534655386, + "grad_norm": 0.80078125, + "learning_rate": 0.00017004229080528867, + "loss": 0.9488, + "step": 16442 + }, + { + "epoch": 0.4222101325424757, + "grad_norm": 0.78125, + "learning_rate": 0.0001700391045094663, + "loss": 0.937, + "step": 16443 + }, + { + "epoch": 0.4222358097383975, + "grad_norm": 0.8203125, + "learning_rate": 0.00017003591807406235, + "loss": 0.9099, + "step": 16444 + }, + { + "epoch": 0.42226148693431936, + "grad_norm": 0.8125, + "learning_rate": 0.00017003273149908312, + "loss": 0.888, + "step": 16445 + }, + { + "epoch": 0.42228716413024114, + "grad_norm": 0.75390625, + "learning_rate": 0.00017002954478453498, + "loss": 0.8946, + "step": 16446 + }, + { + "epoch": 0.422312841326163, + "grad_norm": 0.76953125, + "learning_rate": 0.0001700263579304243, + "loss": 0.9213, + "step": 16447 + }, + { + "epoch": 0.4223385185220848, + "grad_norm": 0.7578125, + "learning_rate": 0.0001700231709367574, + "loss": 0.8609, + "step": 16448 + }, + { + "epoch": 0.4223641957180066, + "grad_norm": 0.83984375, + "learning_rate": 0.00017001998380354063, + "loss": 1.0231, + "step": 16449 + }, + { + "epoch": 0.42238987291392843, + "grad_norm": 0.80078125, + "learning_rate": 0.0001700167965307804, + "loss": 0.8795, + "step": 16450 + }, + { + "epoch": 0.42241555010985027, + "grad_norm": 0.734375, + "learning_rate": 0.00017001360911848297, + "loss": 0.8929, + "step": 16451 + }, + { + "epoch": 0.42244122730577205, + "grad_norm": 0.76171875, + "learning_rate": 0.00017001042156665477, + "loss": 0.822, + "step": 16452 + }, + { + "epoch": 0.4224669045016939, + "grad_norm": 0.765625, + "learning_rate": 0.00017000723387530214, + "loss": 1.0431, + "step": 16453 + }, + { + "epoch": 0.4224925816976157, + "grad_norm": 0.796875, + "learning_rate": 0.0001700040460444314, + "loss": 0.9592, + "step": 16454 + }, + { + "epoch": 0.42251825889353756, + "grad_norm": 0.8125, + "learning_rate": 0.0001700008580740489, + "loss": 0.9941, + "step": 16455 + }, + { + "epoch": 0.42254393608945934, + "grad_norm": 0.828125, + "learning_rate": 0.000169997669964161, + "loss": 0.962, + "step": 16456 + }, + { + "epoch": 0.4225696132853812, + "grad_norm": 0.72265625, + "learning_rate": 0.0001699944817147741, + "loss": 0.8255, + "step": 16457 + }, + { + "epoch": 0.422595290481303, + "grad_norm": 0.7109375, + "learning_rate": 0.00016999129332589453, + "loss": 0.8286, + "step": 16458 + }, + { + "epoch": 0.4226209676772248, + "grad_norm": 0.71484375, + "learning_rate": 0.0001699881047975286, + "loss": 1.0007, + "step": 16459 + }, + { + "epoch": 0.4226466448731466, + "grad_norm": 0.74609375, + "learning_rate": 0.00016998491612968273, + "loss": 0.7736, + "step": 16460 + }, + { + "epoch": 0.42267232206906846, + "grad_norm": 0.85546875, + "learning_rate": 0.00016998172732236324, + "loss": 1.0264, + "step": 16461 + }, + { + "epoch": 0.42269799926499024, + "grad_norm": 0.69140625, + "learning_rate": 0.00016997853837557648, + "loss": 0.7714, + "step": 16462 + }, + { + "epoch": 0.4227236764609121, + "grad_norm": 0.75390625, + "learning_rate": 0.0001699753492893288, + "loss": 1.0705, + "step": 16463 + }, + { + "epoch": 0.4227493536568339, + "grad_norm": 0.7265625, + "learning_rate": 0.00016997216006362658, + "loss": 0.8652, + "step": 16464 + }, + { + "epoch": 0.42277503085275575, + "grad_norm": 0.78125, + "learning_rate": 0.00016996897069847619, + "loss": 0.9307, + "step": 16465 + }, + { + "epoch": 0.42280070804867753, + "grad_norm": 0.77734375, + "learning_rate": 0.00016996578119388394, + "loss": 0.9459, + "step": 16466 + }, + { + "epoch": 0.42282638524459937, + "grad_norm": 0.7578125, + "learning_rate": 0.0001699625915498562, + "loss": 0.9682, + "step": 16467 + }, + { + "epoch": 0.4228520624405212, + "grad_norm": 0.74609375, + "learning_rate": 0.00016995940176639935, + "loss": 0.7914, + "step": 16468 + }, + { + "epoch": 0.422877739636443, + "grad_norm": 0.79296875, + "learning_rate": 0.00016995621184351972, + "loss": 0.995, + "step": 16469 + }, + { + "epoch": 0.4229034168323648, + "grad_norm": 0.8359375, + "learning_rate": 0.0001699530217812237, + "loss": 0.8662, + "step": 16470 + }, + { + "epoch": 0.42292909402828666, + "grad_norm": 0.7421875, + "learning_rate": 0.00016994983157951762, + "loss": 0.8694, + "step": 16471 + }, + { + "epoch": 0.42295477122420844, + "grad_norm": 0.76953125, + "learning_rate": 0.00016994664123840783, + "loss": 0.7724, + "step": 16472 + }, + { + "epoch": 0.4229804484201303, + "grad_norm": 0.7265625, + "learning_rate": 0.00016994345075790071, + "loss": 0.7309, + "step": 16473 + }, + { + "epoch": 0.4230061256160521, + "grad_norm": 0.828125, + "learning_rate": 0.0001699402601380026, + "loss": 0.9572, + "step": 16474 + }, + { + "epoch": 0.42303180281197394, + "grad_norm": 0.80859375, + "learning_rate": 0.0001699370693787199, + "loss": 0.926, + "step": 16475 + }, + { + "epoch": 0.4230574800078957, + "grad_norm": 0.73828125, + "learning_rate": 0.0001699338784800589, + "loss": 0.9315, + "step": 16476 + }, + { + "epoch": 0.42308315720381756, + "grad_norm": 0.73828125, + "learning_rate": 0.00016993068744202601, + "loss": 0.7945, + "step": 16477 + }, + { + "epoch": 0.4231088343997394, + "grad_norm": 0.7421875, + "learning_rate": 0.00016992749626462762, + "loss": 0.8391, + "step": 16478 + }, + { + "epoch": 0.4231345115956612, + "grad_norm": 0.77734375, + "learning_rate": 0.00016992430494786997, + "loss": 0.9043, + "step": 16479 + }, + { + "epoch": 0.423160188791583, + "grad_norm": 0.75, + "learning_rate": 0.00016992111349175954, + "loss": 0.9161, + "step": 16480 + }, + { + "epoch": 0.42318586598750485, + "grad_norm": 0.78125, + "learning_rate": 0.00016991792189630266, + "loss": 1.0186, + "step": 16481 + }, + { + "epoch": 0.42321154318342663, + "grad_norm": 0.75390625, + "learning_rate": 0.00016991473016150563, + "loss": 1.0235, + "step": 16482 + }, + { + "epoch": 0.42323722037934847, + "grad_norm": 0.7578125, + "learning_rate": 0.0001699115382873749, + "loss": 0.9262, + "step": 16483 + }, + { + "epoch": 0.4232628975752703, + "grad_norm": 0.703125, + "learning_rate": 0.00016990834627391677, + "loss": 0.8881, + "step": 16484 + }, + { + "epoch": 0.42328857477119214, + "grad_norm": 0.72265625, + "learning_rate": 0.0001699051541211376, + "loss": 0.8849, + "step": 16485 + }, + { + "epoch": 0.4233142519671139, + "grad_norm": 0.7421875, + "learning_rate": 0.00016990196182904385, + "loss": 0.876, + "step": 16486 + }, + { + "epoch": 0.42333992916303576, + "grad_norm": 0.83203125, + "learning_rate": 0.0001698987693976417, + "loss": 1.0166, + "step": 16487 + }, + { + "epoch": 0.4233656063589576, + "grad_norm": 0.84765625, + "learning_rate": 0.00016989557682693768, + "loss": 0.8897, + "step": 16488 + }, + { + "epoch": 0.42339128355487937, + "grad_norm": 0.7578125, + "learning_rate": 0.0001698923841169381, + "loss": 0.985, + "step": 16489 + }, + { + "epoch": 0.4234169607508012, + "grad_norm": 0.7734375, + "learning_rate": 0.0001698891912676493, + "loss": 0.9812, + "step": 16490 + }, + { + "epoch": 0.42344263794672304, + "grad_norm": 0.79296875, + "learning_rate": 0.00016988599827907764, + "loss": 0.921, + "step": 16491 + }, + { + "epoch": 0.4234683151426448, + "grad_norm": 0.828125, + "learning_rate": 0.00016988280515122949, + "loss": 0.8267, + "step": 16492 + }, + { + "epoch": 0.42349399233856666, + "grad_norm": 0.8828125, + "learning_rate": 0.00016987961188411123, + "loss": 0.9553, + "step": 16493 + }, + { + "epoch": 0.4235196695344885, + "grad_norm": 0.765625, + "learning_rate": 0.0001698764184777292, + "loss": 1.0768, + "step": 16494 + }, + { + "epoch": 0.42354534673041033, + "grad_norm": 0.7578125, + "learning_rate": 0.00016987322493208982, + "loss": 0.8664, + "step": 16495 + }, + { + "epoch": 0.4235710239263321, + "grad_norm": 0.73828125, + "learning_rate": 0.0001698700312471994, + "loss": 1.0457, + "step": 16496 + }, + { + "epoch": 0.42359670112225395, + "grad_norm": 0.83203125, + "learning_rate": 0.0001698668374230643, + "loss": 0.9883, + "step": 16497 + }, + { + "epoch": 0.4236223783181758, + "grad_norm": 0.72265625, + "learning_rate": 0.00016986364345969094, + "loss": 0.8231, + "step": 16498 + }, + { + "epoch": 0.42364805551409757, + "grad_norm": 0.796875, + "learning_rate": 0.00016986044935708562, + "loss": 0.733, + "step": 16499 + }, + { + "epoch": 0.4236737327100194, + "grad_norm": 0.734375, + "learning_rate": 0.00016985725511525472, + "loss": 0.9047, + "step": 16500 + }, + { + "epoch": 0.42369940990594124, + "grad_norm": 0.72265625, + "learning_rate": 0.00016985406073420466, + "loss": 0.7388, + "step": 16501 + }, + { + "epoch": 0.423725087101863, + "grad_norm": 0.73828125, + "learning_rate": 0.00016985086621394176, + "loss": 0.85, + "step": 16502 + }, + { + "epoch": 0.42375076429778485, + "grad_norm": 0.69921875, + "learning_rate": 0.0001698476715544724, + "loss": 0.7339, + "step": 16503 + }, + { + "epoch": 0.4237764414937067, + "grad_norm": 0.82421875, + "learning_rate": 0.0001698444767558029, + "loss": 0.8512, + "step": 16504 + }, + { + "epoch": 0.4238021186896285, + "grad_norm": 0.765625, + "learning_rate": 0.0001698412818179397, + "loss": 0.8133, + "step": 16505 + }, + { + "epoch": 0.4238277958855503, + "grad_norm": 0.71484375, + "learning_rate": 0.0001698380867408891, + "loss": 0.9459, + "step": 16506 + }, + { + "epoch": 0.42385347308147214, + "grad_norm": 0.8359375, + "learning_rate": 0.0001698348915246575, + "loss": 0.9294, + "step": 16507 + }, + { + "epoch": 0.423879150277394, + "grad_norm": 0.73828125, + "learning_rate": 0.00016983169616925135, + "loss": 0.9081, + "step": 16508 + }, + { + "epoch": 0.42390482747331576, + "grad_norm": 0.79296875, + "learning_rate": 0.00016982850067467686, + "loss": 1.0489, + "step": 16509 + }, + { + "epoch": 0.4239305046692376, + "grad_norm": 0.75390625, + "learning_rate": 0.0001698253050409405, + "loss": 0.9693, + "step": 16510 + }, + { + "epoch": 0.42395618186515943, + "grad_norm": 0.7421875, + "learning_rate": 0.00016982210926804863, + "loss": 0.8291, + "step": 16511 + }, + { + "epoch": 0.4239818590610812, + "grad_norm": 0.76953125, + "learning_rate": 0.00016981891335600757, + "loss": 0.988, + "step": 16512 + }, + { + "epoch": 0.42400753625700305, + "grad_norm": 0.7109375, + "learning_rate": 0.00016981571730482372, + "loss": 0.9074, + "step": 16513 + }, + { + "epoch": 0.4240332134529249, + "grad_norm": 0.7578125, + "learning_rate": 0.00016981252111450347, + "loss": 0.9201, + "step": 16514 + }, + { + "epoch": 0.4240588906488467, + "grad_norm": 0.78125, + "learning_rate": 0.00016980932478505317, + "loss": 0.8825, + "step": 16515 + }, + { + "epoch": 0.4240845678447685, + "grad_norm": 0.8203125, + "learning_rate": 0.0001698061283164792, + "loss": 0.9709, + "step": 16516 + }, + { + "epoch": 0.42411024504069034, + "grad_norm": 0.81640625, + "learning_rate": 0.0001698029317087879, + "loss": 0.8919, + "step": 16517 + }, + { + "epoch": 0.4241359222366122, + "grad_norm": 0.78515625, + "learning_rate": 0.0001697997349619857, + "loss": 1.03, + "step": 16518 + }, + { + "epoch": 0.42416159943253395, + "grad_norm": 0.73828125, + "learning_rate": 0.00016979653807607892, + "loss": 0.8034, + "step": 16519 + }, + { + "epoch": 0.4241872766284558, + "grad_norm": 0.79296875, + "learning_rate": 0.00016979334105107392, + "loss": 0.9204, + "step": 16520 + }, + { + "epoch": 0.4242129538243776, + "grad_norm": 0.7734375, + "learning_rate": 0.0001697901438869771, + "loss": 0.9842, + "step": 16521 + }, + { + "epoch": 0.4242386310202994, + "grad_norm": 0.81640625, + "learning_rate": 0.00016978694658379486, + "loss": 0.9223, + "step": 16522 + }, + { + "epoch": 0.42426430821622124, + "grad_norm": 0.76171875, + "learning_rate": 0.00016978374914153349, + "loss": 0.9501, + "step": 16523 + }, + { + "epoch": 0.4242899854121431, + "grad_norm": 0.78515625, + "learning_rate": 0.00016978055156019944, + "loss": 0.9591, + "step": 16524 + }, + { + "epoch": 0.42431566260806486, + "grad_norm": 0.8359375, + "learning_rate": 0.00016977735383979907, + "loss": 1.1126, + "step": 16525 + }, + { + "epoch": 0.4243413398039867, + "grad_norm": 0.7265625, + "learning_rate": 0.0001697741559803387, + "loss": 0.9079, + "step": 16526 + }, + { + "epoch": 0.42436701699990853, + "grad_norm": 0.75, + "learning_rate": 0.00016977095798182478, + "loss": 0.8685, + "step": 16527 + }, + { + "epoch": 0.42439269419583037, + "grad_norm": 0.83203125, + "learning_rate": 0.0001697677598442636, + "loss": 0.8397, + "step": 16528 + }, + { + "epoch": 0.42441837139175215, + "grad_norm": 0.7421875, + "learning_rate": 0.0001697645615676616, + "loss": 0.859, + "step": 16529 + }, + { + "epoch": 0.424444048587674, + "grad_norm": 0.80078125, + "learning_rate": 0.00016976136315202515, + "loss": 0.9997, + "step": 16530 + }, + { + "epoch": 0.4244697257835958, + "grad_norm": 0.71875, + "learning_rate": 0.00016975816459736059, + "loss": 1.0516, + "step": 16531 + }, + { + "epoch": 0.4244954029795176, + "grad_norm": 0.73828125, + "learning_rate": 0.0001697549659036743, + "loss": 0.8652, + "step": 16532 + }, + { + "epoch": 0.42452108017543944, + "grad_norm": 0.7265625, + "learning_rate": 0.00016975176707097265, + "loss": 1.0186, + "step": 16533 + }, + { + "epoch": 0.42454675737136127, + "grad_norm": 0.7734375, + "learning_rate": 0.00016974856809926205, + "loss": 0.9332, + "step": 16534 + }, + { + "epoch": 0.42457243456728305, + "grad_norm": 0.7890625, + "learning_rate": 0.00016974536898854884, + "loss": 0.8746, + "step": 16535 + }, + { + "epoch": 0.4245981117632049, + "grad_norm": 0.796875, + "learning_rate": 0.00016974216973883945, + "loss": 0.9941, + "step": 16536 + }, + { + "epoch": 0.4246237889591267, + "grad_norm": 0.7109375, + "learning_rate": 0.00016973897035014017, + "loss": 0.8958, + "step": 16537 + }, + { + "epoch": 0.42464946615504856, + "grad_norm": 0.7578125, + "learning_rate": 0.00016973577082245745, + "loss": 0.8641, + "step": 16538 + }, + { + "epoch": 0.42467514335097034, + "grad_norm": 0.76171875, + "learning_rate": 0.00016973257115579762, + "loss": 0.9277, + "step": 16539 + }, + { + "epoch": 0.4247008205468922, + "grad_norm": 0.76953125, + "learning_rate": 0.00016972937135016706, + "loss": 0.9299, + "step": 16540 + }, + { + "epoch": 0.424726497742814, + "grad_norm": 0.80859375, + "learning_rate": 0.0001697261714055722, + "loss": 1.0229, + "step": 16541 + }, + { + "epoch": 0.4247521749387358, + "grad_norm": 0.82421875, + "learning_rate": 0.00016972297132201937, + "loss": 0.9137, + "step": 16542 + }, + { + "epoch": 0.42477785213465763, + "grad_norm": 0.7734375, + "learning_rate": 0.00016971977109951495, + "loss": 0.9848, + "step": 16543 + }, + { + "epoch": 0.42480352933057947, + "grad_norm": 0.765625, + "learning_rate": 0.00016971657073806527, + "loss": 0.8387, + "step": 16544 + }, + { + "epoch": 0.42482920652650125, + "grad_norm": 0.765625, + "learning_rate": 0.00016971337023767683, + "loss": 0.8411, + "step": 16545 + }, + { + "epoch": 0.4248548837224231, + "grad_norm": 0.7578125, + "learning_rate": 0.00016971016959835592, + "loss": 0.8816, + "step": 16546 + }, + { + "epoch": 0.4248805609183449, + "grad_norm": 0.91796875, + "learning_rate": 0.00016970696882010895, + "loss": 1.0007, + "step": 16547 + }, + { + "epoch": 0.42490623811426675, + "grad_norm": 0.78125, + "learning_rate": 0.00016970376790294227, + "loss": 0.936, + "step": 16548 + }, + { + "epoch": 0.42493191531018853, + "grad_norm": 0.7734375, + "learning_rate": 0.00016970056684686231, + "loss": 0.9817, + "step": 16549 + }, + { + "epoch": 0.42495759250611037, + "grad_norm": 0.7421875, + "learning_rate": 0.0001696973656518754, + "loss": 0.836, + "step": 16550 + }, + { + "epoch": 0.4249832697020322, + "grad_norm": 1.265625, + "learning_rate": 0.0001696941643179879, + "loss": 0.9467, + "step": 16551 + }, + { + "epoch": 0.425008946897954, + "grad_norm": 0.80078125, + "learning_rate": 0.00016969096284520627, + "loss": 0.9114, + "step": 16552 + }, + { + "epoch": 0.4250346240938758, + "grad_norm": 0.859375, + "learning_rate": 0.00016968776123353683, + "loss": 1.025, + "step": 16553 + }, + { + "epoch": 0.42506030128979766, + "grad_norm": 0.88671875, + "learning_rate": 0.00016968455948298595, + "loss": 0.9381, + "step": 16554 + }, + { + "epoch": 0.42508597848571944, + "grad_norm": 0.74609375, + "learning_rate": 0.00016968135759356007, + "loss": 0.9101, + "step": 16555 + }, + { + "epoch": 0.4251116556816413, + "grad_norm": 0.77734375, + "learning_rate": 0.00016967815556526557, + "loss": 0.9118, + "step": 16556 + }, + { + "epoch": 0.4251373328775631, + "grad_norm": 0.74609375, + "learning_rate": 0.00016967495339810875, + "loss": 0.8974, + "step": 16557 + }, + { + "epoch": 0.42516301007348495, + "grad_norm": 0.765625, + "learning_rate": 0.00016967175109209603, + "loss": 0.862, + "step": 16558 + }, + { + "epoch": 0.42518868726940673, + "grad_norm": 0.796875, + "learning_rate": 0.00016966854864723386, + "loss": 0.9194, + "step": 16559 + }, + { + "epoch": 0.42521436446532856, + "grad_norm": 0.7734375, + "learning_rate": 0.0001696653460635285, + "loss": 0.9047, + "step": 16560 + }, + { + "epoch": 0.4252400416612504, + "grad_norm": 0.79296875, + "learning_rate": 0.00016966214334098644, + "loss": 0.8528, + "step": 16561 + }, + { + "epoch": 0.4252657188571722, + "grad_norm": 0.93359375, + "learning_rate": 0.00016965894047961402, + "loss": 0.913, + "step": 16562 + }, + { + "epoch": 0.425291396053094, + "grad_norm": 0.91796875, + "learning_rate": 0.0001696557374794176, + "loss": 0.9978, + "step": 16563 + }, + { + "epoch": 0.42531707324901585, + "grad_norm": 0.7734375, + "learning_rate": 0.0001696525343404036, + "loss": 0.8477, + "step": 16564 + }, + { + "epoch": 0.42534275044493763, + "grad_norm": 0.74609375, + "learning_rate": 0.0001696493310625784, + "loss": 0.8822, + "step": 16565 + }, + { + "epoch": 0.42536842764085947, + "grad_norm": 0.734375, + "learning_rate": 0.00016964612764594839, + "loss": 0.9514, + "step": 16566 + }, + { + "epoch": 0.4253941048367813, + "grad_norm": 0.71875, + "learning_rate": 0.0001696429240905199, + "loss": 0.8769, + "step": 16567 + }, + { + "epoch": 0.42541978203270314, + "grad_norm": 0.828125, + "learning_rate": 0.00016963972039629938, + "loss": 0.9244, + "step": 16568 + }, + { + "epoch": 0.4254454592286249, + "grad_norm": 0.77734375, + "learning_rate": 0.00016963651656329317, + "loss": 0.9679, + "step": 16569 + }, + { + "epoch": 0.42547113642454676, + "grad_norm": 0.71484375, + "learning_rate": 0.0001696333125915077, + "loss": 0.8667, + "step": 16570 + }, + { + "epoch": 0.4254968136204686, + "grad_norm": 0.91015625, + "learning_rate": 0.00016963010848094928, + "loss": 1.0267, + "step": 16571 + }, + { + "epoch": 0.4255224908163904, + "grad_norm": 0.83984375, + "learning_rate": 0.0001696269042316244, + "loss": 1.069, + "step": 16572 + }, + { + "epoch": 0.4255481680123122, + "grad_norm": 0.734375, + "learning_rate": 0.00016962369984353933, + "loss": 0.7694, + "step": 16573 + }, + { + "epoch": 0.42557384520823405, + "grad_norm": 0.8125, + "learning_rate": 0.00016962049531670057, + "loss": 0.8445, + "step": 16574 + }, + { + "epoch": 0.42559952240415583, + "grad_norm": 0.8203125, + "learning_rate": 0.00016961729065111442, + "loss": 1.0251, + "step": 16575 + }, + { + "epoch": 0.42562519960007766, + "grad_norm": 1.1015625, + "learning_rate": 0.0001696140858467873, + "loss": 0.8972, + "step": 16576 + }, + { + "epoch": 0.4256508767959995, + "grad_norm": 0.77734375, + "learning_rate": 0.0001696108809037256, + "loss": 0.8285, + "step": 16577 + }, + { + "epoch": 0.42567655399192134, + "grad_norm": 0.83984375, + "learning_rate": 0.00016960767582193572, + "loss": 0.9471, + "step": 16578 + }, + { + "epoch": 0.4257022311878431, + "grad_norm": 0.8203125, + "learning_rate": 0.00016960447060142402, + "loss": 0.9341, + "step": 16579 + }, + { + "epoch": 0.42572790838376495, + "grad_norm": 0.7421875, + "learning_rate": 0.00016960126524219688, + "loss": 0.9742, + "step": 16580 + }, + { + "epoch": 0.4257535855796868, + "grad_norm": 0.80859375, + "learning_rate": 0.00016959805974426072, + "loss": 1.0294, + "step": 16581 + }, + { + "epoch": 0.42577926277560857, + "grad_norm": 0.796875, + "learning_rate": 0.00016959485410762194, + "loss": 0.8285, + "step": 16582 + }, + { + "epoch": 0.4258049399715304, + "grad_norm": 0.86328125, + "learning_rate": 0.00016959164833228683, + "loss": 0.8527, + "step": 16583 + }, + { + "epoch": 0.42583061716745224, + "grad_norm": 0.7734375, + "learning_rate": 0.00016958844241826194, + "loss": 0.9971, + "step": 16584 + }, + { + "epoch": 0.425856294363374, + "grad_norm": 0.74609375, + "learning_rate": 0.0001695852363655535, + "loss": 0.8203, + "step": 16585 + }, + { + "epoch": 0.42588197155929586, + "grad_norm": 0.8046875, + "learning_rate": 0.000169582030174168, + "loss": 0.9344, + "step": 16586 + }, + { + "epoch": 0.4259076487552177, + "grad_norm": 0.76953125, + "learning_rate": 0.00016957882384411178, + "loss": 0.9114, + "step": 16587 + }, + { + "epoch": 0.42593332595113953, + "grad_norm": 0.734375, + "learning_rate": 0.00016957561737539125, + "loss": 0.8691, + "step": 16588 + }, + { + "epoch": 0.4259590031470613, + "grad_norm": 0.77734375, + "learning_rate": 0.00016957241076801283, + "loss": 1.047, + "step": 16589 + }, + { + "epoch": 0.42598468034298315, + "grad_norm": 0.8046875, + "learning_rate": 0.00016956920402198288, + "loss": 1.0524, + "step": 16590 + }, + { + "epoch": 0.426010357538905, + "grad_norm": 0.734375, + "learning_rate": 0.00016956599713730778, + "loss": 0.9674, + "step": 16591 + }, + { + "epoch": 0.42603603473482676, + "grad_norm": 0.7421875, + "learning_rate": 0.0001695627901139939, + "loss": 1.0212, + "step": 16592 + }, + { + "epoch": 0.4260617119307486, + "grad_norm": 0.80078125, + "learning_rate": 0.00016955958295204767, + "loss": 0.8536, + "step": 16593 + }, + { + "epoch": 0.42608738912667043, + "grad_norm": 0.81640625, + "learning_rate": 0.0001695563756514755, + "loss": 0.9306, + "step": 16594 + }, + { + "epoch": 0.4261130663225922, + "grad_norm": 0.79296875, + "learning_rate": 0.00016955316821228375, + "loss": 0.9181, + "step": 16595 + }, + { + "epoch": 0.42613874351851405, + "grad_norm": 0.94921875, + "learning_rate": 0.00016954996063447882, + "loss": 0.9397, + "step": 16596 + }, + { + "epoch": 0.4261644207144359, + "grad_norm": 0.734375, + "learning_rate": 0.00016954675291806708, + "loss": 0.8586, + "step": 16597 + }, + { + "epoch": 0.4261900979103577, + "grad_norm": 0.7734375, + "learning_rate": 0.00016954354506305497, + "loss": 0.8688, + "step": 16598 + }, + { + "epoch": 0.4262157751062795, + "grad_norm": 0.70703125, + "learning_rate": 0.00016954033706944883, + "loss": 0.8943, + "step": 16599 + }, + { + "epoch": 0.42624145230220134, + "grad_norm": 0.8515625, + "learning_rate": 0.00016953712893725512, + "loss": 0.8771, + "step": 16600 + }, + { + "epoch": 0.4262671294981232, + "grad_norm": 0.7109375, + "learning_rate": 0.00016953392066648015, + "loss": 0.8733, + "step": 16601 + }, + { + "epoch": 0.42629280669404496, + "grad_norm": 0.73828125, + "learning_rate": 0.0001695307122571304, + "loss": 0.8868, + "step": 16602 + }, + { + "epoch": 0.4263184838899668, + "grad_norm": 0.7265625, + "learning_rate": 0.0001695275037092122, + "loss": 0.9743, + "step": 16603 + }, + { + "epoch": 0.42634416108588863, + "grad_norm": 0.83984375, + "learning_rate": 0.00016952429502273194, + "loss": 0.8911, + "step": 16604 + }, + { + "epoch": 0.4263698382818104, + "grad_norm": 0.74609375, + "learning_rate": 0.00016952108619769607, + "loss": 0.9243, + "step": 16605 + }, + { + "epoch": 0.42639551547773225, + "grad_norm": 0.796875, + "learning_rate": 0.00016951787723411098, + "loss": 0.8246, + "step": 16606 + }, + { + "epoch": 0.4264211926736541, + "grad_norm": 0.69921875, + "learning_rate": 0.00016951466813198298, + "loss": 0.9481, + "step": 16607 + }, + { + "epoch": 0.4264468698695759, + "grad_norm": 0.703125, + "learning_rate": 0.00016951145889131856, + "loss": 0.8373, + "step": 16608 + }, + { + "epoch": 0.4264725470654977, + "grad_norm": 0.78125, + "learning_rate": 0.00016950824951212406, + "loss": 0.9742, + "step": 16609 + }, + { + "epoch": 0.42649822426141953, + "grad_norm": 0.8046875, + "learning_rate": 0.00016950503999440595, + "loss": 0.8196, + "step": 16610 + }, + { + "epoch": 0.42652390145734137, + "grad_norm": 0.80859375, + "learning_rate": 0.0001695018303381705, + "loss": 0.8982, + "step": 16611 + }, + { + "epoch": 0.42654957865326315, + "grad_norm": 0.80859375, + "learning_rate": 0.00016949862054342423, + "loss": 0.8764, + "step": 16612 + }, + { + "epoch": 0.426575255849185, + "grad_norm": 0.74609375, + "learning_rate": 0.00016949541061017347, + "loss": 0.8767, + "step": 16613 + }, + { + "epoch": 0.4266009330451068, + "grad_norm": 0.87890625, + "learning_rate": 0.00016949220053842466, + "loss": 0.9711, + "step": 16614 + }, + { + "epoch": 0.4266266102410286, + "grad_norm": 0.8828125, + "learning_rate": 0.00016948899032818412, + "loss": 1.0237, + "step": 16615 + }, + { + "epoch": 0.42665228743695044, + "grad_norm": 0.7421875, + "learning_rate": 0.00016948577997945833, + "loss": 0.9131, + "step": 16616 + }, + { + "epoch": 0.4266779646328723, + "grad_norm": 0.78125, + "learning_rate": 0.00016948256949225366, + "loss": 0.9279, + "step": 16617 + }, + { + "epoch": 0.4267036418287941, + "grad_norm": 0.78515625, + "learning_rate": 0.0001694793588665765, + "loss": 0.9213, + "step": 16618 + }, + { + "epoch": 0.4267293190247159, + "grad_norm": 0.74609375, + "learning_rate": 0.00016947614810243326, + "loss": 0.9529, + "step": 16619 + }, + { + "epoch": 0.42675499622063773, + "grad_norm": 0.7890625, + "learning_rate": 0.0001694729371998303, + "loss": 0.9584, + "step": 16620 + }, + { + "epoch": 0.42678067341655956, + "grad_norm": 0.828125, + "learning_rate": 0.0001694697261587741, + "loss": 0.8829, + "step": 16621 + }, + { + "epoch": 0.42680635061248134, + "grad_norm": 0.75, + "learning_rate": 0.00016946651497927099, + "loss": 1.0193, + "step": 16622 + }, + { + "epoch": 0.4268320278084032, + "grad_norm": 0.7890625, + "learning_rate": 0.00016946330366132738, + "loss": 0.8842, + "step": 16623 + }, + { + "epoch": 0.426857705004325, + "grad_norm": 0.8203125, + "learning_rate": 0.00016946009220494972, + "loss": 0.7777, + "step": 16624 + }, + { + "epoch": 0.4268833822002468, + "grad_norm": 0.80078125, + "learning_rate": 0.00016945688061014433, + "loss": 0.9346, + "step": 16625 + }, + { + "epoch": 0.42690905939616863, + "grad_norm": 0.80078125, + "learning_rate": 0.00016945366887691766, + "loss": 0.9667, + "step": 16626 + }, + { + "epoch": 0.42693473659209047, + "grad_norm": 0.77734375, + "learning_rate": 0.0001694504570052761, + "loss": 0.937, + "step": 16627 + }, + { + "epoch": 0.4269604137880123, + "grad_norm": 0.79296875, + "learning_rate": 0.00016944724499522607, + "loss": 0.8742, + "step": 16628 + }, + { + "epoch": 0.4269860909839341, + "grad_norm": 0.83203125, + "learning_rate": 0.00016944403284677395, + "loss": 0.8671, + "step": 16629 + }, + { + "epoch": 0.4270117681798559, + "grad_norm": 0.82421875, + "learning_rate": 0.00016944082055992614, + "loss": 0.9711, + "step": 16630 + }, + { + "epoch": 0.42703744537577776, + "grad_norm": 0.765625, + "learning_rate": 0.00016943760813468902, + "loss": 0.8654, + "step": 16631 + }, + { + "epoch": 0.42706312257169954, + "grad_norm": 0.74609375, + "learning_rate": 0.00016943439557106903, + "loss": 0.7781, + "step": 16632 + }, + { + "epoch": 0.4270887997676214, + "grad_norm": 0.77734375, + "learning_rate": 0.00016943118286907258, + "loss": 1.0118, + "step": 16633 + }, + { + "epoch": 0.4271144769635432, + "grad_norm": 0.76953125, + "learning_rate": 0.00016942797002870606, + "loss": 1.1045, + "step": 16634 + }, + { + "epoch": 0.427140154159465, + "grad_norm": 0.828125, + "learning_rate": 0.00016942475704997585, + "loss": 1.0329, + "step": 16635 + }, + { + "epoch": 0.4271658313553868, + "grad_norm": 0.703125, + "learning_rate": 0.00016942154393288837, + "loss": 0.8545, + "step": 16636 + }, + { + "epoch": 0.42719150855130866, + "grad_norm": 0.80078125, + "learning_rate": 0.00016941833067745003, + "loss": 0.8873, + "step": 16637 + }, + { + "epoch": 0.4272171857472305, + "grad_norm": 0.765625, + "learning_rate": 0.00016941511728366722, + "loss": 0.8914, + "step": 16638 + }, + { + "epoch": 0.4272428629431523, + "grad_norm": 0.796875, + "learning_rate": 0.00016941190375154637, + "loss": 0.7977, + "step": 16639 + }, + { + "epoch": 0.4272685401390741, + "grad_norm": 0.8125, + "learning_rate": 0.00016940869008109385, + "loss": 1.039, + "step": 16640 + }, + { + "epoch": 0.42729421733499595, + "grad_norm": 0.82421875, + "learning_rate": 0.00016940547627231612, + "loss": 0.9252, + "step": 16641 + }, + { + "epoch": 0.42731989453091773, + "grad_norm": 0.80078125, + "learning_rate": 0.00016940226232521946, + "loss": 1.0602, + "step": 16642 + }, + { + "epoch": 0.42734557172683957, + "grad_norm": 0.83203125, + "learning_rate": 0.00016939904823981046, + "loss": 0.8344, + "step": 16643 + }, + { + "epoch": 0.4273712489227614, + "grad_norm": 0.78515625, + "learning_rate": 0.00016939583401609536, + "loss": 0.9669, + "step": 16644 + }, + { + "epoch": 0.4273969261186832, + "grad_norm": 0.80078125, + "learning_rate": 0.00016939261965408063, + "loss": 0.9059, + "step": 16645 + }, + { + "epoch": 0.427422603314605, + "grad_norm": 0.8203125, + "learning_rate": 0.0001693894051537727, + "loss": 0.9644, + "step": 16646 + }, + { + "epoch": 0.42744828051052686, + "grad_norm": 0.7421875, + "learning_rate": 0.00016938619051517795, + "loss": 0.998, + "step": 16647 + }, + { + "epoch": 0.4274739577064487, + "grad_norm": 0.7890625, + "learning_rate": 0.0001693829757383028, + "loss": 0.9615, + "step": 16648 + }, + { + "epoch": 0.4274996349023705, + "grad_norm": 0.7890625, + "learning_rate": 0.00016937976082315363, + "loss": 0.8985, + "step": 16649 + }, + { + "epoch": 0.4275253120982923, + "grad_norm": 0.81640625, + "learning_rate": 0.00016937654576973686, + "loss": 0.9979, + "step": 16650 + }, + { + "epoch": 0.42755098929421415, + "grad_norm": 0.75, + "learning_rate": 0.0001693733305780589, + "loss": 0.9622, + "step": 16651 + }, + { + "epoch": 0.4275766664901359, + "grad_norm": 0.77734375, + "learning_rate": 0.0001693701152481262, + "loss": 0.9799, + "step": 16652 + }, + { + "epoch": 0.42760234368605776, + "grad_norm": 0.70703125, + "learning_rate": 0.00016936689977994505, + "loss": 0.8166, + "step": 16653 + }, + { + "epoch": 0.4276280208819796, + "grad_norm": 0.8125, + "learning_rate": 0.000169363684173522, + "loss": 0.9749, + "step": 16654 + }, + { + "epoch": 0.4276536980779014, + "grad_norm": 0.734375, + "learning_rate": 0.0001693604684288634, + "loss": 0.8464, + "step": 16655 + }, + { + "epoch": 0.4276793752738232, + "grad_norm": 0.8359375, + "learning_rate": 0.00016935725254597559, + "loss": 0.8967, + "step": 16656 + }, + { + "epoch": 0.42770505246974505, + "grad_norm": 0.73046875, + "learning_rate": 0.0001693540365248651, + "loss": 1.0252, + "step": 16657 + }, + { + "epoch": 0.4277307296656669, + "grad_norm": 0.72265625, + "learning_rate": 0.00016935082036553825, + "loss": 0.8411, + "step": 16658 + }, + { + "epoch": 0.42775640686158867, + "grad_norm": 0.72265625, + "learning_rate": 0.00016934760406800146, + "loss": 0.8063, + "step": 16659 + }, + { + "epoch": 0.4277820840575105, + "grad_norm": 0.89453125, + "learning_rate": 0.00016934438763226118, + "loss": 0.9594, + "step": 16660 + }, + { + "epoch": 0.42780776125343234, + "grad_norm": 0.83203125, + "learning_rate": 0.00016934117105832383, + "loss": 1.0534, + "step": 16661 + }, + { + "epoch": 0.4278334384493541, + "grad_norm": 0.7265625, + "learning_rate": 0.00016933795434619575, + "loss": 0.9025, + "step": 16662 + }, + { + "epoch": 0.42785911564527596, + "grad_norm": 0.71484375, + "learning_rate": 0.0001693347374958834, + "loss": 0.8231, + "step": 16663 + }, + { + "epoch": 0.4278847928411978, + "grad_norm": 0.765625, + "learning_rate": 0.00016933152050739318, + "loss": 0.9605, + "step": 16664 + }, + { + "epoch": 0.4279104700371196, + "grad_norm": 0.84765625, + "learning_rate": 0.00016932830338073148, + "loss": 0.8966, + "step": 16665 + }, + { + "epoch": 0.4279361472330414, + "grad_norm": 0.78515625, + "learning_rate": 0.00016932508611590478, + "loss": 0.8325, + "step": 16666 + }, + { + "epoch": 0.42796182442896324, + "grad_norm": 0.78515625, + "learning_rate": 0.00016932186871291944, + "loss": 0.9062, + "step": 16667 + }, + { + "epoch": 0.4279875016248851, + "grad_norm": 0.73828125, + "learning_rate": 0.00016931865117178187, + "loss": 0.9238, + "step": 16668 + }, + { + "epoch": 0.42801317882080686, + "grad_norm": 0.78515625, + "learning_rate": 0.00016931543349249845, + "loss": 0.8736, + "step": 16669 + }, + { + "epoch": 0.4280388560167287, + "grad_norm": 0.7734375, + "learning_rate": 0.00016931221567507567, + "loss": 0.9695, + "step": 16670 + }, + { + "epoch": 0.42806453321265053, + "grad_norm": 0.73828125, + "learning_rate": 0.00016930899771951992, + "loss": 0.7732, + "step": 16671 + }, + { + "epoch": 0.4280902104085723, + "grad_norm": 0.875, + "learning_rate": 0.00016930577962583757, + "loss": 0.9426, + "step": 16672 + }, + { + "epoch": 0.42811588760449415, + "grad_norm": 0.7734375, + "learning_rate": 0.00016930256139403507, + "loss": 0.9211, + "step": 16673 + }, + { + "epoch": 0.428141564800416, + "grad_norm": 0.8125, + "learning_rate": 0.0001692993430241188, + "loss": 0.9384, + "step": 16674 + }, + { + "epoch": 0.42816724199633777, + "grad_norm": 0.75, + "learning_rate": 0.00016929612451609526, + "loss": 0.9188, + "step": 16675 + }, + { + "epoch": 0.4281929191922596, + "grad_norm": 0.74609375, + "learning_rate": 0.00016929290586997076, + "loss": 0.8294, + "step": 16676 + }, + { + "epoch": 0.42821859638818144, + "grad_norm": 0.921875, + "learning_rate": 0.00016928968708575175, + "loss": 0.9248, + "step": 16677 + }, + { + "epoch": 0.4282442735841033, + "grad_norm": 0.7578125, + "learning_rate": 0.00016928646816344466, + "loss": 0.913, + "step": 16678 + }, + { + "epoch": 0.42826995078002505, + "grad_norm": 0.80859375, + "learning_rate": 0.00016928324910305592, + "loss": 1.0417, + "step": 16679 + }, + { + "epoch": 0.4282956279759469, + "grad_norm": 0.79296875, + "learning_rate": 0.00016928002990459188, + "loss": 0.9, + "step": 16680 + }, + { + "epoch": 0.4283213051718687, + "grad_norm": 0.75, + "learning_rate": 0.00016927681056805907, + "loss": 0.9167, + "step": 16681 + }, + { + "epoch": 0.4283469823677905, + "grad_norm": 0.81640625, + "learning_rate": 0.00016927359109346376, + "loss": 0.8802, + "step": 16682 + }, + { + "epoch": 0.42837265956371234, + "grad_norm": 0.94140625, + "learning_rate": 0.00016927037148081248, + "loss": 0.9614, + "step": 16683 + }, + { + "epoch": 0.4283983367596342, + "grad_norm": 0.75, + "learning_rate": 0.0001692671517301116, + "loss": 0.7906, + "step": 16684 + }, + { + "epoch": 0.42842401395555596, + "grad_norm": 0.7734375, + "learning_rate": 0.00016926393184136753, + "loss": 0.9415, + "step": 16685 + }, + { + "epoch": 0.4284496911514778, + "grad_norm": 0.81640625, + "learning_rate": 0.00016926071181458673, + "loss": 0.8655, + "step": 16686 + }, + { + "epoch": 0.42847536834739963, + "grad_norm": 0.828125, + "learning_rate": 0.00016925749164977555, + "loss": 0.9889, + "step": 16687 + }, + { + "epoch": 0.42850104554332147, + "grad_norm": 0.7421875, + "learning_rate": 0.00016925427134694047, + "loss": 1.0363, + "step": 16688 + }, + { + "epoch": 0.42852672273924325, + "grad_norm": 0.78125, + "learning_rate": 0.0001692510509060879, + "loss": 0.8018, + "step": 16689 + }, + { + "epoch": 0.4285523999351651, + "grad_norm": 0.78515625, + "learning_rate": 0.0001692478303272242, + "loss": 1.0788, + "step": 16690 + }, + { + "epoch": 0.4285780771310869, + "grad_norm": 0.83203125, + "learning_rate": 0.00016924460961035586, + "loss": 0.9453, + "step": 16691 + }, + { + "epoch": 0.4286037543270087, + "grad_norm": 0.765625, + "learning_rate": 0.00016924138875548924, + "loss": 0.8669, + "step": 16692 + }, + { + "epoch": 0.42862943152293054, + "grad_norm": 0.75, + "learning_rate": 0.0001692381677626308, + "loss": 0.874, + "step": 16693 + }, + { + "epoch": 0.4286551087188524, + "grad_norm": 0.81640625, + "learning_rate": 0.00016923494663178694, + "loss": 0.9958, + "step": 16694 + }, + { + "epoch": 0.42868078591477415, + "grad_norm": 0.6875, + "learning_rate": 0.0001692317253629641, + "loss": 0.8699, + "step": 16695 + }, + { + "epoch": 0.428706463110696, + "grad_norm": 0.765625, + "learning_rate": 0.0001692285039561687, + "loss": 0.8698, + "step": 16696 + }, + { + "epoch": 0.4287321403066178, + "grad_norm": 0.8671875, + "learning_rate": 0.00016922528241140712, + "loss": 1.018, + "step": 16697 + }, + { + "epoch": 0.42875781750253966, + "grad_norm": 0.80078125, + "learning_rate": 0.0001692220607286858, + "loss": 0.8549, + "step": 16698 + }, + { + "epoch": 0.42878349469846144, + "grad_norm": 0.7890625, + "learning_rate": 0.00016921883890801115, + "loss": 0.9155, + "step": 16699 + }, + { + "epoch": 0.4288091718943833, + "grad_norm": 0.76953125, + "learning_rate": 0.00016921561694938965, + "loss": 0.9676, + "step": 16700 + }, + { + "epoch": 0.4288348490903051, + "grad_norm": 0.75390625, + "learning_rate": 0.00016921239485282765, + "loss": 0.8066, + "step": 16701 + }, + { + "epoch": 0.4288605262862269, + "grad_norm": 0.73046875, + "learning_rate": 0.00016920917261833163, + "loss": 0.9576, + "step": 16702 + }, + { + "epoch": 0.42888620348214873, + "grad_norm": 0.8359375, + "learning_rate": 0.00016920595024590796, + "loss": 0.9942, + "step": 16703 + }, + { + "epoch": 0.42891188067807057, + "grad_norm": 0.78125, + "learning_rate": 0.00016920272773556305, + "loss": 0.9728, + "step": 16704 + }, + { + "epoch": 0.42893755787399235, + "grad_norm": 0.80859375, + "learning_rate": 0.0001691995050873034, + "loss": 0.9985, + "step": 16705 + }, + { + "epoch": 0.4289632350699142, + "grad_norm": 0.92578125, + "learning_rate": 0.00016919628230113536, + "loss": 0.8853, + "step": 16706 + }, + { + "epoch": 0.428988912265836, + "grad_norm": 0.73828125, + "learning_rate": 0.00016919305937706543, + "loss": 0.8295, + "step": 16707 + }, + { + "epoch": 0.42901458946175786, + "grad_norm": 0.80078125, + "learning_rate": 0.00016918983631509992, + "loss": 1.0484, + "step": 16708 + }, + { + "epoch": 0.42904026665767964, + "grad_norm": 0.8203125, + "learning_rate": 0.00016918661311524536, + "loss": 0.9223, + "step": 16709 + }, + { + "epoch": 0.4290659438536015, + "grad_norm": 0.71875, + "learning_rate": 0.0001691833897775081, + "loss": 0.8496, + "step": 16710 + }, + { + "epoch": 0.4290916210495233, + "grad_norm": 0.80078125, + "learning_rate": 0.0001691801663018946, + "loss": 0.9599, + "step": 16711 + }, + { + "epoch": 0.4291172982454451, + "grad_norm": 0.6953125, + "learning_rate": 0.0001691769426884113, + "loss": 0.7035, + "step": 16712 + }, + { + "epoch": 0.4291429754413669, + "grad_norm": 0.82421875, + "learning_rate": 0.00016917371893706459, + "loss": 0.8136, + "step": 16713 + }, + { + "epoch": 0.42916865263728876, + "grad_norm": 0.75, + "learning_rate": 0.0001691704950478609, + "loss": 0.9578, + "step": 16714 + }, + { + "epoch": 0.42919432983321054, + "grad_norm": 0.76171875, + "learning_rate": 0.00016916727102080662, + "loss": 0.9796, + "step": 16715 + }, + { + "epoch": 0.4292200070291324, + "grad_norm": 0.8046875, + "learning_rate": 0.00016916404685590828, + "loss": 0.9885, + "step": 16716 + }, + { + "epoch": 0.4292456842250542, + "grad_norm": 0.7578125, + "learning_rate": 0.00016916082255317224, + "loss": 0.8479, + "step": 16717 + }, + { + "epoch": 0.42927136142097605, + "grad_norm": 0.83203125, + "learning_rate": 0.00016915759811260487, + "loss": 0.8625, + "step": 16718 + }, + { + "epoch": 0.42929703861689783, + "grad_norm": 0.7890625, + "learning_rate": 0.0001691543735342127, + "loss": 0.9286, + "step": 16719 + }, + { + "epoch": 0.42932271581281967, + "grad_norm": 0.8984375, + "learning_rate": 0.00016915114881800212, + "loss": 0.849, + "step": 16720 + }, + { + "epoch": 0.4293483930087415, + "grad_norm": 0.796875, + "learning_rate": 0.00016914792396397954, + "loss": 0.9751, + "step": 16721 + }, + { + "epoch": 0.4293740702046633, + "grad_norm": 0.81640625, + "learning_rate": 0.00016914469897215138, + "loss": 0.8649, + "step": 16722 + }, + { + "epoch": 0.4293997474005851, + "grad_norm": 0.8828125, + "learning_rate": 0.00016914147384252407, + "loss": 0.9483, + "step": 16723 + }, + { + "epoch": 0.42942542459650695, + "grad_norm": 0.74609375, + "learning_rate": 0.00016913824857510407, + "loss": 1.0217, + "step": 16724 + }, + { + "epoch": 0.42945110179242874, + "grad_norm": 0.7734375, + "learning_rate": 0.00016913502316989778, + "loss": 0.8814, + "step": 16725 + }, + { + "epoch": 0.42947677898835057, + "grad_norm": 0.765625, + "learning_rate": 0.0001691317976269116, + "loss": 0.9227, + "step": 16726 + }, + { + "epoch": 0.4295024561842724, + "grad_norm": 0.80859375, + "learning_rate": 0.00016912857194615206, + "loss": 1.007, + "step": 16727 + }, + { + "epoch": 0.42952813338019424, + "grad_norm": 0.7890625, + "learning_rate": 0.00016912534612762548, + "loss": 0.8994, + "step": 16728 + }, + { + "epoch": 0.429553810576116, + "grad_norm": 0.8984375, + "learning_rate": 0.00016912212017133833, + "loss": 0.9127, + "step": 16729 + }, + { + "epoch": 0.42957948777203786, + "grad_norm": 0.7734375, + "learning_rate": 0.000169118894077297, + "loss": 0.985, + "step": 16730 + }, + { + "epoch": 0.4296051649679597, + "grad_norm": 0.75390625, + "learning_rate": 0.00016911566784550802, + "loss": 0.7658, + "step": 16731 + }, + { + "epoch": 0.4296308421638815, + "grad_norm": 0.75, + "learning_rate": 0.00016911244147597775, + "loss": 1.0108, + "step": 16732 + }, + { + "epoch": 0.4296565193598033, + "grad_norm": 0.76953125, + "learning_rate": 0.0001691092149687126, + "loss": 0.9212, + "step": 16733 + }, + { + "epoch": 0.42968219655572515, + "grad_norm": 0.73046875, + "learning_rate": 0.00016910598832371908, + "loss": 0.8642, + "step": 16734 + }, + { + "epoch": 0.42970787375164693, + "grad_norm": 0.828125, + "learning_rate": 0.0001691027615410035, + "loss": 0.9335, + "step": 16735 + }, + { + "epoch": 0.42973355094756877, + "grad_norm": 0.75390625, + "learning_rate": 0.0001690995346205724, + "loss": 0.8955, + "step": 16736 + }, + { + "epoch": 0.4297592281434906, + "grad_norm": 0.71484375, + "learning_rate": 0.00016909630756243216, + "loss": 0.9423, + "step": 16737 + }, + { + "epoch": 0.4297849053394124, + "grad_norm": 0.93359375, + "learning_rate": 0.00016909308036658919, + "loss": 0.9223, + "step": 16738 + }, + { + "epoch": 0.4298105825353342, + "grad_norm": 0.78125, + "learning_rate": 0.00016908985303305, + "loss": 0.8785, + "step": 16739 + }, + { + "epoch": 0.42983625973125605, + "grad_norm": 0.7421875, + "learning_rate": 0.00016908662556182094, + "loss": 1.0263, + "step": 16740 + }, + { + "epoch": 0.4298619369271779, + "grad_norm": 0.7734375, + "learning_rate": 0.0001690833979529085, + "loss": 0.8513, + "step": 16741 + }, + { + "epoch": 0.42988761412309967, + "grad_norm": 1.40625, + "learning_rate": 0.00016908017020631906, + "loss": 0.9441, + "step": 16742 + }, + { + "epoch": 0.4299132913190215, + "grad_norm": 0.83203125, + "learning_rate": 0.0001690769423220591, + "loss": 0.9899, + "step": 16743 + }, + { + "epoch": 0.42993896851494334, + "grad_norm": 0.73046875, + "learning_rate": 0.00016907371430013504, + "loss": 0.7652, + "step": 16744 + }, + { + "epoch": 0.4299646457108651, + "grad_norm": 0.73046875, + "learning_rate": 0.0001690704861405533, + "loss": 0.8923, + "step": 16745 + }, + { + "epoch": 0.42999032290678696, + "grad_norm": 0.7421875, + "learning_rate": 0.0001690672578433203, + "loss": 0.8053, + "step": 16746 + }, + { + "epoch": 0.4300160001027088, + "grad_norm": 0.7890625, + "learning_rate": 0.00016906402940844253, + "loss": 1.0669, + "step": 16747 + }, + { + "epoch": 0.4300416772986306, + "grad_norm": 0.73828125, + "learning_rate": 0.00016906080083592637, + "loss": 0.7943, + "step": 16748 + }, + { + "epoch": 0.4300673544945524, + "grad_norm": 0.79296875, + "learning_rate": 0.00016905757212577828, + "loss": 0.861, + "step": 16749 + }, + { + "epoch": 0.43009303169047425, + "grad_norm": 0.76171875, + "learning_rate": 0.00016905434327800465, + "loss": 0.9103, + "step": 16750 + }, + { + "epoch": 0.4301187088863961, + "grad_norm": 0.7421875, + "learning_rate": 0.00016905111429261197, + "loss": 0.8978, + "step": 16751 + }, + { + "epoch": 0.43014438608231786, + "grad_norm": 0.83203125, + "learning_rate": 0.0001690478851696067, + "loss": 0.8962, + "step": 16752 + }, + { + "epoch": 0.4301700632782397, + "grad_norm": 0.76171875, + "learning_rate": 0.0001690446559089952, + "loss": 0.8843, + "step": 16753 + }, + { + "epoch": 0.43019574047416154, + "grad_norm": 0.87890625, + "learning_rate": 0.00016904142651078395, + "loss": 0.935, + "step": 16754 + }, + { + "epoch": 0.4302214176700833, + "grad_norm": 0.80859375, + "learning_rate": 0.00016903819697497934, + "loss": 1.0851, + "step": 16755 + }, + { + "epoch": 0.43024709486600515, + "grad_norm": 0.85546875, + "learning_rate": 0.00016903496730158785, + "loss": 0.8791, + "step": 16756 + }, + { + "epoch": 0.430272772061927, + "grad_norm": 0.796875, + "learning_rate": 0.00016903173749061593, + "loss": 0.9683, + "step": 16757 + }, + { + "epoch": 0.43029844925784877, + "grad_norm": 0.7734375, + "learning_rate": 0.00016902850754206997, + "loss": 1.0201, + "step": 16758 + }, + { + "epoch": 0.4303241264537706, + "grad_norm": 0.81640625, + "learning_rate": 0.00016902527745595643, + "loss": 0.9578, + "step": 16759 + }, + { + "epoch": 0.43034980364969244, + "grad_norm": 0.6875, + "learning_rate": 0.00016902204723228176, + "loss": 0.8486, + "step": 16760 + }, + { + "epoch": 0.4303754808456143, + "grad_norm": 0.70703125, + "learning_rate": 0.00016901881687105238, + "loss": 0.8102, + "step": 16761 + }, + { + "epoch": 0.43040115804153606, + "grad_norm": 0.72265625, + "learning_rate": 0.00016901558637227472, + "loss": 0.8282, + "step": 16762 + }, + { + "epoch": 0.4304268352374579, + "grad_norm": 0.75, + "learning_rate": 0.00016901235573595527, + "loss": 0.9272, + "step": 16763 + }, + { + "epoch": 0.43045251243337973, + "grad_norm": 0.80859375, + "learning_rate": 0.00016900912496210037, + "loss": 1.0135, + "step": 16764 + }, + { + "epoch": 0.4304781896293015, + "grad_norm": 0.7421875, + "learning_rate": 0.00016900589405071656, + "loss": 0.8399, + "step": 16765 + }, + { + "epoch": 0.43050386682522335, + "grad_norm": 0.6953125, + "learning_rate": 0.00016900266300181024, + "loss": 0.8232, + "step": 16766 + }, + { + "epoch": 0.4305295440211452, + "grad_norm": 0.88671875, + "learning_rate": 0.0001689994318153878, + "loss": 1.184, + "step": 16767 + }, + { + "epoch": 0.43055522121706696, + "grad_norm": 0.75, + "learning_rate": 0.00016899620049145577, + "loss": 0.788, + "step": 16768 + }, + { + "epoch": 0.4305808984129888, + "grad_norm": 0.8515625, + "learning_rate": 0.00016899296903002053, + "loss": 1.0246, + "step": 16769 + }, + { + "epoch": 0.43060657560891064, + "grad_norm": 0.88671875, + "learning_rate": 0.00016898973743108853, + "loss": 0.9788, + "step": 16770 + }, + { + "epoch": 0.43063225280483247, + "grad_norm": 0.8515625, + "learning_rate": 0.00016898650569466622, + "loss": 0.9243, + "step": 16771 + }, + { + "epoch": 0.43065793000075425, + "grad_norm": 0.7734375, + "learning_rate": 0.00016898327382076006, + "loss": 0.9865, + "step": 16772 + }, + { + "epoch": 0.4306836071966761, + "grad_norm": 0.77734375, + "learning_rate": 0.00016898004180937643, + "loss": 1.0195, + "step": 16773 + }, + { + "epoch": 0.4307092843925979, + "grad_norm": 0.8359375, + "learning_rate": 0.0001689768096605218, + "loss": 1.0096, + "step": 16774 + }, + { + "epoch": 0.4307349615885197, + "grad_norm": 0.8203125, + "learning_rate": 0.00016897357737420267, + "loss": 1.0771, + "step": 16775 + }, + { + "epoch": 0.43076063878444154, + "grad_norm": 0.73046875, + "learning_rate": 0.00016897034495042538, + "loss": 0.9264, + "step": 16776 + }, + { + "epoch": 0.4307863159803634, + "grad_norm": 0.83984375, + "learning_rate": 0.00016896711238919646, + "loss": 0.9389, + "step": 16777 + }, + { + "epoch": 0.43081199317628516, + "grad_norm": 0.83984375, + "learning_rate": 0.00016896387969052228, + "loss": 0.9277, + "step": 16778 + }, + { + "epoch": 0.430837670372207, + "grad_norm": 0.7578125, + "learning_rate": 0.00016896064685440932, + "loss": 0.8899, + "step": 16779 + }, + { + "epoch": 0.43086334756812883, + "grad_norm": 0.7421875, + "learning_rate": 0.00016895741388086406, + "loss": 0.7609, + "step": 16780 + }, + { + "epoch": 0.43088902476405067, + "grad_norm": 0.76171875, + "learning_rate": 0.0001689541807698929, + "loss": 0.9077, + "step": 16781 + }, + { + "epoch": 0.43091470195997245, + "grad_norm": 0.80859375, + "learning_rate": 0.00016895094752150222, + "loss": 1.0722, + "step": 16782 + }, + { + "epoch": 0.4309403791558943, + "grad_norm": 0.76171875, + "learning_rate": 0.0001689477141356986, + "loss": 0.7983, + "step": 16783 + }, + { + "epoch": 0.4309660563518161, + "grad_norm": 0.7890625, + "learning_rate": 0.00016894448061248837, + "loss": 1.0713, + "step": 16784 + }, + { + "epoch": 0.4309917335477379, + "grad_norm": 0.7890625, + "learning_rate": 0.00016894124695187804, + "loss": 0.8106, + "step": 16785 + }, + { + "epoch": 0.43101741074365973, + "grad_norm": 0.734375, + "learning_rate": 0.00016893801315387403, + "loss": 0.8527, + "step": 16786 + }, + { + "epoch": 0.43104308793958157, + "grad_norm": 0.875, + "learning_rate": 0.00016893477921848277, + "loss": 0.8715, + "step": 16787 + }, + { + "epoch": 0.43106876513550335, + "grad_norm": 0.81640625, + "learning_rate": 0.00016893154514571076, + "loss": 0.9273, + "step": 16788 + }, + { + "epoch": 0.4310944423314252, + "grad_norm": 0.82421875, + "learning_rate": 0.00016892831093556437, + "loss": 0.8805, + "step": 16789 + }, + { + "epoch": 0.431120119527347, + "grad_norm": 0.73828125, + "learning_rate": 0.0001689250765880501, + "loss": 0.9147, + "step": 16790 + }, + { + "epoch": 0.43114579672326886, + "grad_norm": 0.87109375, + "learning_rate": 0.00016892184210317437, + "loss": 1.0132, + "step": 16791 + }, + { + "epoch": 0.43117147391919064, + "grad_norm": 0.75390625, + "learning_rate": 0.00016891860748094367, + "loss": 0.9621, + "step": 16792 + }, + { + "epoch": 0.4311971511151125, + "grad_norm": 0.8046875, + "learning_rate": 0.00016891537272136438, + "loss": 0.965, + "step": 16793 + }, + { + "epoch": 0.4312228283110343, + "grad_norm": 0.828125, + "learning_rate": 0.00016891213782444295, + "loss": 0.8118, + "step": 16794 + }, + { + "epoch": 0.4312485055069561, + "grad_norm": 0.8203125, + "learning_rate": 0.00016890890279018587, + "loss": 1.0129, + "step": 16795 + }, + { + "epoch": 0.43127418270287793, + "grad_norm": 0.84375, + "learning_rate": 0.0001689056676185996, + "loss": 0.9755, + "step": 16796 + }, + { + "epoch": 0.43129985989879976, + "grad_norm": 0.83203125, + "learning_rate": 0.0001689024323096905, + "loss": 1.0287, + "step": 16797 + }, + { + "epoch": 0.43132553709472155, + "grad_norm": 0.78515625, + "learning_rate": 0.00016889919686346514, + "loss": 0.9948, + "step": 16798 + }, + { + "epoch": 0.4313512142906434, + "grad_norm": 0.81640625, + "learning_rate": 0.0001688959612799299, + "loss": 0.8888, + "step": 16799 + }, + { + "epoch": 0.4313768914865652, + "grad_norm": 0.83984375, + "learning_rate": 0.0001688927255590912, + "loss": 0.9152, + "step": 16800 + }, + { + "epoch": 0.43140256868248705, + "grad_norm": 0.71484375, + "learning_rate": 0.00016888948970095554, + "loss": 0.8876, + "step": 16801 + }, + { + "epoch": 0.43142824587840883, + "grad_norm": 0.77734375, + "learning_rate": 0.00016888625370552936, + "loss": 0.8423, + "step": 16802 + }, + { + "epoch": 0.43145392307433067, + "grad_norm": 0.73828125, + "learning_rate": 0.00016888301757281906, + "loss": 0.9112, + "step": 16803 + }, + { + "epoch": 0.4314796002702525, + "grad_norm": 0.7578125, + "learning_rate": 0.00016887978130283114, + "loss": 1.0364, + "step": 16804 + }, + { + "epoch": 0.4315052774661743, + "grad_norm": 0.82421875, + "learning_rate": 0.00016887654489557206, + "loss": 0.9901, + "step": 16805 + }, + { + "epoch": 0.4315309546620961, + "grad_norm": 0.74609375, + "learning_rate": 0.00016887330835104828, + "loss": 0.9191, + "step": 16806 + }, + { + "epoch": 0.43155663185801796, + "grad_norm": 0.8515625, + "learning_rate": 0.00016887007166926617, + "loss": 0.9597, + "step": 16807 + }, + { + "epoch": 0.43158230905393974, + "grad_norm": 0.89453125, + "learning_rate": 0.0001688668348502322, + "loss": 1.0607, + "step": 16808 + }, + { + "epoch": 0.4316079862498616, + "grad_norm": 0.90234375, + "learning_rate": 0.00016886359789395288, + "loss": 0.8777, + "step": 16809 + }, + { + "epoch": 0.4316336634457834, + "grad_norm": 0.79296875, + "learning_rate": 0.00016886036080043465, + "loss": 0.8638, + "step": 16810 + }, + { + "epoch": 0.43165934064170525, + "grad_norm": 0.7890625, + "learning_rate": 0.0001688571235696839, + "loss": 0.9785, + "step": 16811 + }, + { + "epoch": 0.431685017837627, + "grad_norm": 0.76953125, + "learning_rate": 0.00016885388620170716, + "loss": 0.9161, + "step": 16812 + }, + { + "epoch": 0.43171069503354886, + "grad_norm": 0.80859375, + "learning_rate": 0.00016885064869651083, + "loss": 0.9671, + "step": 16813 + }, + { + "epoch": 0.4317363722294707, + "grad_norm": 0.7265625, + "learning_rate": 0.00016884741105410137, + "loss": 0.8577, + "step": 16814 + }, + { + "epoch": 0.4317620494253925, + "grad_norm": 0.74609375, + "learning_rate": 0.00016884417327448525, + "loss": 0.8918, + "step": 16815 + }, + { + "epoch": 0.4317877266213143, + "grad_norm": 0.78125, + "learning_rate": 0.00016884093535766889, + "loss": 0.9099, + "step": 16816 + }, + { + "epoch": 0.43181340381723615, + "grad_norm": 0.74609375, + "learning_rate": 0.0001688376973036588, + "loss": 0.8116, + "step": 16817 + }, + { + "epoch": 0.43183908101315793, + "grad_norm": 0.77734375, + "learning_rate": 0.00016883445911246135, + "loss": 0.8083, + "step": 16818 + }, + { + "epoch": 0.43186475820907977, + "grad_norm": 0.81640625, + "learning_rate": 0.0001688312207840831, + "loss": 1.0506, + "step": 16819 + }, + { + "epoch": 0.4318904354050016, + "grad_norm": 0.828125, + "learning_rate": 0.00016882798231853042, + "loss": 0.8986, + "step": 16820 + }, + { + "epoch": 0.43191611260092344, + "grad_norm": 0.8515625, + "learning_rate": 0.00016882474371580978, + "loss": 0.8886, + "step": 16821 + }, + { + "epoch": 0.4319417897968452, + "grad_norm": 0.8125, + "learning_rate": 0.00016882150497592763, + "loss": 0.8983, + "step": 16822 + }, + { + "epoch": 0.43196746699276706, + "grad_norm": 0.76171875, + "learning_rate": 0.00016881826609889045, + "loss": 0.8809, + "step": 16823 + }, + { + "epoch": 0.4319931441886889, + "grad_norm": 0.875, + "learning_rate": 0.00016881502708470468, + "loss": 1.0035, + "step": 16824 + }, + { + "epoch": 0.4320188213846107, + "grad_norm": 0.8046875, + "learning_rate": 0.0001688117879333768, + "loss": 1.0167, + "step": 16825 + }, + { + "epoch": 0.4320444985805325, + "grad_norm": 0.76953125, + "learning_rate": 0.00016880854864491322, + "loss": 0.802, + "step": 16826 + }, + { + "epoch": 0.43207017577645435, + "grad_norm": 0.71875, + "learning_rate": 0.0001688053092193204, + "loss": 0.8124, + "step": 16827 + }, + { + "epoch": 0.4320958529723761, + "grad_norm": 0.76953125, + "learning_rate": 0.00016880206965660483, + "loss": 0.8108, + "step": 16828 + }, + { + "epoch": 0.43212153016829796, + "grad_norm": 0.75, + "learning_rate": 0.00016879882995677296, + "loss": 0.9463, + "step": 16829 + }, + { + "epoch": 0.4321472073642198, + "grad_norm": 0.7421875, + "learning_rate": 0.00016879559011983124, + "loss": 0.7386, + "step": 16830 + }, + { + "epoch": 0.43217288456014163, + "grad_norm": 0.796875, + "learning_rate": 0.0001687923501457861, + "loss": 0.8343, + "step": 16831 + }, + { + "epoch": 0.4321985617560634, + "grad_norm": 0.7734375, + "learning_rate": 0.00016878911003464405, + "loss": 0.8317, + "step": 16832 + }, + { + "epoch": 0.43222423895198525, + "grad_norm": 0.84375, + "learning_rate": 0.0001687858697864115, + "loss": 0.9566, + "step": 16833 + }, + { + "epoch": 0.4322499161479071, + "grad_norm": 0.78125, + "learning_rate": 0.00016878262940109493, + "loss": 0.9926, + "step": 16834 + }, + { + "epoch": 0.43227559334382887, + "grad_norm": 0.76171875, + "learning_rate": 0.00016877938887870078, + "loss": 0.9064, + "step": 16835 + }, + { + "epoch": 0.4323012705397507, + "grad_norm": 0.78125, + "learning_rate": 0.00016877614821923556, + "loss": 0.9173, + "step": 16836 + }, + { + "epoch": 0.43232694773567254, + "grad_norm": 0.796875, + "learning_rate": 0.0001687729074227056, + "loss": 0.8219, + "step": 16837 + }, + { + "epoch": 0.4323526249315943, + "grad_norm": 0.81640625, + "learning_rate": 0.00016876966648911754, + "loss": 0.8606, + "step": 16838 + }, + { + "epoch": 0.43237830212751616, + "grad_norm": 0.79296875, + "learning_rate": 0.00016876642541847772, + "loss": 0.8728, + "step": 16839 + }, + { + "epoch": 0.432403979323438, + "grad_norm": 0.8203125, + "learning_rate": 0.0001687631842107926, + "loss": 0.9514, + "step": 16840 + }, + { + "epoch": 0.43242965651935983, + "grad_norm": 0.82421875, + "learning_rate": 0.0001687599428660687, + "loss": 0.9601, + "step": 16841 + }, + { + "epoch": 0.4324553337152816, + "grad_norm": 0.859375, + "learning_rate": 0.00016875670138431244, + "loss": 0.9158, + "step": 16842 + }, + { + "epoch": 0.43248101091120345, + "grad_norm": 0.75390625, + "learning_rate": 0.00016875345976553026, + "loss": 0.8651, + "step": 16843 + }, + { + "epoch": 0.4325066881071253, + "grad_norm": 0.765625, + "learning_rate": 0.0001687502180097287, + "loss": 0.8528, + "step": 16844 + }, + { + "epoch": 0.43253236530304706, + "grad_norm": 0.75390625, + "learning_rate": 0.0001687469761169141, + "loss": 0.9054, + "step": 16845 + }, + { + "epoch": 0.4325580424989689, + "grad_norm": 0.8671875, + "learning_rate": 0.000168743734087093, + "loss": 0.9229, + "step": 16846 + }, + { + "epoch": 0.43258371969489073, + "grad_norm": 0.73828125, + "learning_rate": 0.00016874049192027186, + "loss": 0.8557, + "step": 16847 + }, + { + "epoch": 0.4326093968908125, + "grad_norm": 0.75390625, + "learning_rate": 0.00016873724961645714, + "loss": 0.7639, + "step": 16848 + }, + { + "epoch": 0.43263507408673435, + "grad_norm": 0.84375, + "learning_rate": 0.00016873400717565525, + "loss": 0.8733, + "step": 16849 + }, + { + "epoch": 0.4326607512826562, + "grad_norm": 0.91796875, + "learning_rate": 0.00016873076459787275, + "loss": 0.9249, + "step": 16850 + }, + { + "epoch": 0.432686428478578, + "grad_norm": 0.875, + "learning_rate": 0.000168727521883116, + "loss": 1.079, + "step": 16851 + }, + { + "epoch": 0.4327121056744998, + "grad_norm": 0.7578125, + "learning_rate": 0.00016872427903139153, + "loss": 0.9418, + "step": 16852 + }, + { + "epoch": 0.43273778287042164, + "grad_norm": 0.83203125, + "learning_rate": 0.0001687210360427058, + "loss": 0.9333, + "step": 16853 + }, + { + "epoch": 0.4327634600663435, + "grad_norm": 0.81640625, + "learning_rate": 0.0001687177929170652, + "loss": 0.8755, + "step": 16854 + }, + { + "epoch": 0.43278913726226526, + "grad_norm": 0.8125, + "learning_rate": 0.00016871454965447627, + "loss": 0.9596, + "step": 16855 + }, + { + "epoch": 0.4328148144581871, + "grad_norm": 0.875, + "learning_rate": 0.00016871130625494544, + "loss": 0.8178, + "step": 16856 + }, + { + "epoch": 0.4328404916541089, + "grad_norm": 0.7890625, + "learning_rate": 0.0001687080627184792, + "loss": 0.9119, + "step": 16857 + }, + { + "epoch": 0.4328661688500307, + "grad_norm": 0.78515625, + "learning_rate": 0.000168704819045084, + "loss": 0.9363, + "step": 16858 + }, + { + "epoch": 0.43289184604595254, + "grad_norm": 0.76171875, + "learning_rate": 0.0001687015752347663, + "loss": 0.804, + "step": 16859 + }, + { + "epoch": 0.4329175232418744, + "grad_norm": 0.875, + "learning_rate": 0.00016869833128753254, + "loss": 0.9407, + "step": 16860 + }, + { + "epoch": 0.4329432004377962, + "grad_norm": 0.78125, + "learning_rate": 0.00016869508720338921, + "loss": 0.9749, + "step": 16861 + }, + { + "epoch": 0.432968877633718, + "grad_norm": 0.83203125, + "learning_rate": 0.0001686918429823428, + "loss": 0.9518, + "step": 16862 + }, + { + "epoch": 0.43299455482963983, + "grad_norm": 0.84765625, + "learning_rate": 0.0001686885986243998, + "loss": 0.9914, + "step": 16863 + }, + { + "epoch": 0.43302023202556167, + "grad_norm": 0.76171875, + "learning_rate": 0.00016868535412956653, + "loss": 0.9619, + "step": 16864 + }, + { + "epoch": 0.43304590922148345, + "grad_norm": 0.69140625, + "learning_rate": 0.00016868210949784964, + "loss": 0.8389, + "step": 16865 + }, + { + "epoch": 0.4330715864174053, + "grad_norm": 0.8515625, + "learning_rate": 0.00016867886472925547, + "loss": 0.9243, + "step": 16866 + }, + { + "epoch": 0.4330972636133271, + "grad_norm": 1.015625, + "learning_rate": 0.0001686756198237905, + "loss": 1.064, + "step": 16867 + }, + { + "epoch": 0.4331229408092489, + "grad_norm": 0.8046875, + "learning_rate": 0.00016867237478146126, + "loss": 0.9906, + "step": 16868 + }, + { + "epoch": 0.43314861800517074, + "grad_norm": 0.78515625, + "learning_rate": 0.00016866912960227418, + "loss": 0.8224, + "step": 16869 + }, + { + "epoch": 0.4331742952010926, + "grad_norm": 0.80859375, + "learning_rate": 0.00016866588428623568, + "loss": 1.0311, + "step": 16870 + }, + { + "epoch": 0.4331999723970144, + "grad_norm": 0.765625, + "learning_rate": 0.00016866263883335233, + "loss": 0.9099, + "step": 16871 + }, + { + "epoch": 0.4332256495929362, + "grad_norm": 0.77734375, + "learning_rate": 0.00016865939324363054, + "loss": 0.8875, + "step": 16872 + }, + { + "epoch": 0.433251326788858, + "grad_norm": 0.77734375, + "learning_rate": 0.00016865614751707678, + "loss": 0.9233, + "step": 16873 + }, + { + "epoch": 0.43327700398477986, + "grad_norm": 1.0, + "learning_rate": 0.00016865290165369746, + "loss": 0.914, + "step": 16874 + }, + { + "epoch": 0.43330268118070164, + "grad_norm": 0.70703125, + "learning_rate": 0.00016864965565349918, + "loss": 0.845, + "step": 16875 + }, + { + "epoch": 0.4333283583766235, + "grad_norm": 0.828125, + "learning_rate": 0.00016864640951648832, + "loss": 0.991, + "step": 16876 + }, + { + "epoch": 0.4333540355725453, + "grad_norm": 0.80859375, + "learning_rate": 0.00016864316324267137, + "loss": 0.8934, + "step": 16877 + }, + { + "epoch": 0.4333797127684671, + "grad_norm": 0.81640625, + "learning_rate": 0.00016863991683205474, + "loss": 0.987, + "step": 16878 + }, + { + "epoch": 0.43340538996438893, + "grad_norm": 0.765625, + "learning_rate": 0.00016863667028464504, + "loss": 0.9939, + "step": 16879 + }, + { + "epoch": 0.43343106716031077, + "grad_norm": 0.73046875, + "learning_rate": 0.0001686334236004486, + "loss": 0.869, + "step": 16880 + }, + { + "epoch": 0.4334567443562326, + "grad_norm": 0.74609375, + "learning_rate": 0.00016863017677947196, + "loss": 0.8455, + "step": 16881 + }, + { + "epoch": 0.4334824215521544, + "grad_norm": 0.78125, + "learning_rate": 0.00016862692982172158, + "loss": 0.899, + "step": 16882 + }, + { + "epoch": 0.4335080987480762, + "grad_norm": 0.75, + "learning_rate": 0.00016862368272720391, + "loss": 0.8795, + "step": 16883 + }, + { + "epoch": 0.43353377594399806, + "grad_norm": 0.734375, + "learning_rate": 0.0001686204354959255, + "loss": 0.824, + "step": 16884 + }, + { + "epoch": 0.43355945313991984, + "grad_norm": 0.76953125, + "learning_rate": 0.00016861718812789271, + "loss": 0.9431, + "step": 16885 + }, + { + "epoch": 0.4335851303358417, + "grad_norm": 0.765625, + "learning_rate": 0.00016861394062311205, + "loss": 1.0174, + "step": 16886 + }, + { + "epoch": 0.4336108075317635, + "grad_norm": 0.8125, + "learning_rate": 0.00016861069298159005, + "loss": 0.9821, + "step": 16887 + }, + { + "epoch": 0.4336364847276853, + "grad_norm": 0.75390625, + "learning_rate": 0.00016860744520333308, + "loss": 0.8055, + "step": 16888 + }, + { + "epoch": 0.4336621619236071, + "grad_norm": 0.75390625, + "learning_rate": 0.0001686041972883477, + "loss": 0.9331, + "step": 16889 + }, + { + "epoch": 0.43368783911952896, + "grad_norm": 0.75390625, + "learning_rate": 0.00016860094923664036, + "loss": 0.824, + "step": 16890 + }, + { + "epoch": 0.4337135163154508, + "grad_norm": 0.78125, + "learning_rate": 0.00016859770104821754, + "loss": 0.7601, + "step": 16891 + }, + { + "epoch": 0.4337391935113726, + "grad_norm": 0.76953125, + "learning_rate": 0.00016859445272308566, + "loss": 0.8323, + "step": 16892 + }, + { + "epoch": 0.4337648707072944, + "grad_norm": 0.83203125, + "learning_rate": 0.00016859120426125122, + "loss": 0.8326, + "step": 16893 + }, + { + "epoch": 0.43379054790321625, + "grad_norm": 0.69921875, + "learning_rate": 0.00016858795566272074, + "loss": 0.7223, + "step": 16894 + }, + { + "epoch": 0.43381622509913803, + "grad_norm": 0.78515625, + "learning_rate": 0.00016858470692750067, + "loss": 0.7829, + "step": 16895 + }, + { + "epoch": 0.43384190229505987, + "grad_norm": 0.8515625, + "learning_rate": 0.00016858145805559747, + "loss": 0.9206, + "step": 16896 + }, + { + "epoch": 0.4338675794909817, + "grad_norm": 0.73046875, + "learning_rate": 0.0001685782090470176, + "loss": 0.8579, + "step": 16897 + }, + { + "epoch": 0.4338932566869035, + "grad_norm": 0.83203125, + "learning_rate": 0.00016857495990176756, + "loss": 0.9937, + "step": 16898 + }, + { + "epoch": 0.4339189338828253, + "grad_norm": 0.76171875, + "learning_rate": 0.0001685717106198538, + "loss": 0.8606, + "step": 16899 + }, + { + "epoch": 0.43394461107874716, + "grad_norm": 0.83984375, + "learning_rate": 0.00016856846120128286, + "loss": 1.0185, + "step": 16900 + }, + { + "epoch": 0.433970288274669, + "grad_norm": 0.74609375, + "learning_rate": 0.0001685652116460611, + "loss": 0.8989, + "step": 16901 + }, + { + "epoch": 0.43399596547059077, + "grad_norm": 0.796875, + "learning_rate": 0.00016856196195419513, + "loss": 0.8862, + "step": 16902 + }, + { + "epoch": 0.4340216426665126, + "grad_norm": 0.7734375, + "learning_rate": 0.00016855871212569134, + "loss": 1.0607, + "step": 16903 + }, + { + "epoch": 0.43404731986243444, + "grad_norm": 0.77734375, + "learning_rate": 0.00016855546216055625, + "loss": 0.9837, + "step": 16904 + }, + { + "epoch": 0.4340729970583562, + "grad_norm": 0.7421875, + "learning_rate": 0.0001685522120587963, + "loss": 0.8601, + "step": 16905 + }, + { + "epoch": 0.43409867425427806, + "grad_norm": 0.765625, + "learning_rate": 0.000168548961820418, + "loss": 0.8672, + "step": 16906 + }, + { + "epoch": 0.4341243514501999, + "grad_norm": 0.8046875, + "learning_rate": 0.00016854571144542778, + "loss": 1.0602, + "step": 16907 + }, + { + "epoch": 0.4341500286461217, + "grad_norm": 0.8671875, + "learning_rate": 0.00016854246093383216, + "loss": 0.8504, + "step": 16908 + }, + { + "epoch": 0.4341757058420435, + "grad_norm": 0.765625, + "learning_rate": 0.0001685392102856376, + "loss": 0.9512, + "step": 16909 + }, + { + "epoch": 0.43420138303796535, + "grad_norm": 0.8515625, + "learning_rate": 0.00016853595950085058, + "loss": 0.8887, + "step": 16910 + }, + { + "epoch": 0.4342270602338872, + "grad_norm": 0.80078125, + "learning_rate": 0.00016853270857947764, + "loss": 0.9192, + "step": 16911 + }, + { + "epoch": 0.43425273742980897, + "grad_norm": 0.78125, + "learning_rate": 0.00016852945752152514, + "loss": 1.067, + "step": 16912 + }, + { + "epoch": 0.4342784146257308, + "grad_norm": 0.7421875, + "learning_rate": 0.00016852620632699965, + "loss": 0.8415, + "step": 16913 + }, + { + "epoch": 0.43430409182165264, + "grad_norm": 0.7578125, + "learning_rate": 0.0001685229549959076, + "loss": 0.7675, + "step": 16914 + }, + { + "epoch": 0.4343297690175744, + "grad_norm": 0.828125, + "learning_rate": 0.0001685197035282555, + "loss": 0.8782, + "step": 16915 + }, + { + "epoch": 0.43435544621349625, + "grad_norm": 0.75390625, + "learning_rate": 0.00016851645192404984, + "loss": 0.8961, + "step": 16916 + }, + { + "epoch": 0.4343811234094181, + "grad_norm": 0.73828125, + "learning_rate": 0.00016851320018329707, + "loss": 0.8094, + "step": 16917 + }, + { + "epoch": 0.43440680060533987, + "grad_norm": 0.75390625, + "learning_rate": 0.00016850994830600366, + "loss": 0.8459, + "step": 16918 + }, + { + "epoch": 0.4344324778012617, + "grad_norm": 0.76171875, + "learning_rate": 0.00016850669629217611, + "loss": 1.0457, + "step": 16919 + }, + { + "epoch": 0.43445815499718354, + "grad_norm": 0.8203125, + "learning_rate": 0.00016850344414182092, + "loss": 0.8557, + "step": 16920 + }, + { + "epoch": 0.4344838321931054, + "grad_norm": 0.70703125, + "learning_rate": 0.00016850019185494456, + "loss": 0.8304, + "step": 16921 + }, + { + "epoch": 0.43450950938902716, + "grad_norm": 0.8046875, + "learning_rate": 0.0001684969394315535, + "loss": 0.8756, + "step": 16922 + }, + { + "epoch": 0.434535186584949, + "grad_norm": 0.80078125, + "learning_rate": 0.00016849368687165423, + "loss": 0.9763, + "step": 16923 + }, + { + "epoch": 0.43456086378087083, + "grad_norm": 0.70703125, + "learning_rate": 0.00016849043417525324, + "loss": 0.8833, + "step": 16924 + }, + { + "epoch": 0.4345865409767926, + "grad_norm": 0.7578125, + "learning_rate": 0.00016848718134235697, + "loss": 0.9636, + "step": 16925 + }, + { + "epoch": 0.43461221817271445, + "grad_norm": 0.83203125, + "learning_rate": 0.00016848392837297196, + "loss": 0.9609, + "step": 16926 + }, + { + "epoch": 0.4346378953686363, + "grad_norm": 0.7890625, + "learning_rate": 0.00016848067526710464, + "loss": 0.9328, + "step": 16927 + }, + { + "epoch": 0.43466357256455807, + "grad_norm": 0.7578125, + "learning_rate": 0.00016847742202476154, + "loss": 0.8368, + "step": 16928 + }, + { + "epoch": 0.4346892497604799, + "grad_norm": 0.73046875, + "learning_rate": 0.00016847416864594912, + "loss": 0.8404, + "step": 16929 + }, + { + "epoch": 0.43471492695640174, + "grad_norm": 0.87109375, + "learning_rate": 0.00016847091513067387, + "loss": 1.0537, + "step": 16930 + }, + { + "epoch": 0.4347406041523236, + "grad_norm": 0.7890625, + "learning_rate": 0.00016846766147894226, + "loss": 1.0696, + "step": 16931 + }, + { + "epoch": 0.43476628134824535, + "grad_norm": 0.7421875, + "learning_rate": 0.0001684644076907608, + "loss": 1.0723, + "step": 16932 + }, + { + "epoch": 0.4347919585441672, + "grad_norm": 0.79296875, + "learning_rate": 0.00016846115376613596, + "loss": 0.8841, + "step": 16933 + }, + { + "epoch": 0.434817635740089, + "grad_norm": 0.80859375, + "learning_rate": 0.00016845789970507423, + "loss": 0.9333, + "step": 16934 + }, + { + "epoch": 0.4348433129360108, + "grad_norm": 0.7265625, + "learning_rate": 0.00016845464550758205, + "loss": 0.814, + "step": 16935 + }, + { + "epoch": 0.43486899013193264, + "grad_norm": 0.82421875, + "learning_rate": 0.00016845139117366599, + "loss": 0.9653, + "step": 16936 + }, + { + "epoch": 0.4348946673278545, + "grad_norm": 0.734375, + "learning_rate": 0.00016844813670333246, + "loss": 0.9792, + "step": 16937 + }, + { + "epoch": 0.43492034452377626, + "grad_norm": 0.765625, + "learning_rate": 0.000168444882096588, + "loss": 0.9597, + "step": 16938 + }, + { + "epoch": 0.4349460217196981, + "grad_norm": 0.796875, + "learning_rate": 0.00016844162735343905, + "loss": 1.0356, + "step": 16939 + }, + { + "epoch": 0.43497169891561993, + "grad_norm": 0.83984375, + "learning_rate": 0.00016843837247389215, + "loss": 0.8627, + "step": 16940 + }, + { + "epoch": 0.4349973761115417, + "grad_norm": 0.8046875, + "learning_rate": 0.00016843511745795372, + "loss": 0.9067, + "step": 16941 + }, + { + "epoch": 0.43502305330746355, + "grad_norm": 0.6953125, + "learning_rate": 0.00016843186230563028, + "loss": 0.9088, + "step": 16942 + }, + { + "epoch": 0.4350487305033854, + "grad_norm": 0.7421875, + "learning_rate": 0.00016842860701692835, + "loss": 0.9159, + "step": 16943 + }, + { + "epoch": 0.4350744076993072, + "grad_norm": 0.8203125, + "learning_rate": 0.0001684253515918544, + "loss": 0.9289, + "step": 16944 + }, + { + "epoch": 0.435100084895229, + "grad_norm": 0.78125, + "learning_rate": 0.00016842209603041487, + "loss": 0.8682, + "step": 16945 + }, + { + "epoch": 0.43512576209115084, + "grad_norm": 0.765625, + "learning_rate": 0.0001684188403326163, + "loss": 0.836, + "step": 16946 + }, + { + "epoch": 0.43515143928707267, + "grad_norm": 0.7734375, + "learning_rate": 0.00016841558449846516, + "loss": 0.9743, + "step": 16947 + }, + { + "epoch": 0.43517711648299445, + "grad_norm": 0.8203125, + "learning_rate": 0.0001684123285279679, + "loss": 0.8092, + "step": 16948 + }, + { + "epoch": 0.4352027936789163, + "grad_norm": 0.79296875, + "learning_rate": 0.0001684090724211311, + "loss": 0.9933, + "step": 16949 + }, + { + "epoch": 0.4352284708748381, + "grad_norm": 0.75, + "learning_rate": 0.0001684058161779612, + "loss": 1.0388, + "step": 16950 + }, + { + "epoch": 0.4352541480707599, + "grad_norm": 0.765625, + "learning_rate": 0.00016840255979846467, + "loss": 0.9153, + "step": 16951 + }, + { + "epoch": 0.43527982526668174, + "grad_norm": 0.72265625, + "learning_rate": 0.000168399303282648, + "loss": 0.8935, + "step": 16952 + }, + { + "epoch": 0.4353055024626036, + "grad_norm": 0.75390625, + "learning_rate": 0.00016839604663051774, + "loss": 0.9953, + "step": 16953 + }, + { + "epoch": 0.4353311796585254, + "grad_norm": 0.76171875, + "learning_rate": 0.00016839278984208029, + "loss": 0.9777, + "step": 16954 + }, + { + "epoch": 0.4353568568544472, + "grad_norm": 0.7734375, + "learning_rate": 0.00016838953291734222, + "loss": 0.9672, + "step": 16955 + }, + { + "epoch": 0.43538253405036903, + "grad_norm": 0.71875, + "learning_rate": 0.00016838627585630996, + "loss": 0.932, + "step": 16956 + }, + { + "epoch": 0.43540821124629087, + "grad_norm": 0.73828125, + "learning_rate": 0.0001683830186589901, + "loss": 0.9576, + "step": 16957 + }, + { + "epoch": 0.43543388844221265, + "grad_norm": 0.8359375, + "learning_rate": 0.00016837976132538898, + "loss": 0.9509, + "step": 16958 + }, + { + "epoch": 0.4354595656381345, + "grad_norm": 0.80859375, + "learning_rate": 0.00016837650385551317, + "loss": 0.7966, + "step": 16959 + }, + { + "epoch": 0.4354852428340563, + "grad_norm": 0.83203125, + "learning_rate": 0.0001683732462493692, + "loss": 0.8598, + "step": 16960 + }, + { + "epoch": 0.4355109200299781, + "grad_norm": 0.7890625, + "learning_rate": 0.00016836998850696354, + "loss": 0.8106, + "step": 16961 + }, + { + "epoch": 0.43553659722589994, + "grad_norm": 0.75390625, + "learning_rate": 0.00016836673062830263, + "loss": 0.9791, + "step": 16962 + }, + { + "epoch": 0.43556227442182177, + "grad_norm": 0.76953125, + "learning_rate": 0.00016836347261339304, + "loss": 0.9906, + "step": 16963 + }, + { + "epoch": 0.4355879516177436, + "grad_norm": 0.76953125, + "learning_rate": 0.0001683602144622412, + "loss": 0.8569, + "step": 16964 + }, + { + "epoch": 0.4356136288136654, + "grad_norm": 0.77734375, + "learning_rate": 0.00016835695617485366, + "loss": 0.8778, + "step": 16965 + }, + { + "epoch": 0.4356393060095872, + "grad_norm": 0.71484375, + "learning_rate": 0.00016835369775123683, + "loss": 0.8155, + "step": 16966 + }, + { + "epoch": 0.43566498320550906, + "grad_norm": 0.88671875, + "learning_rate": 0.00016835043919139728, + "loss": 0.991, + "step": 16967 + }, + { + "epoch": 0.43569066040143084, + "grad_norm": 0.7578125, + "learning_rate": 0.00016834718049534147, + "loss": 1.0084, + "step": 16968 + }, + { + "epoch": 0.4357163375973527, + "grad_norm": 0.7265625, + "learning_rate": 0.0001683439216630759, + "loss": 0.9186, + "step": 16969 + }, + { + "epoch": 0.4357420147932745, + "grad_norm": 0.9375, + "learning_rate": 0.00016834066269460712, + "loss": 0.8004, + "step": 16970 + }, + { + "epoch": 0.4357676919891963, + "grad_norm": 0.75, + "learning_rate": 0.00016833740358994153, + "loss": 0.859, + "step": 16971 + }, + { + "epoch": 0.43579336918511813, + "grad_norm": 0.7578125, + "learning_rate": 0.00016833414434908565, + "loss": 0.8728, + "step": 16972 + }, + { + "epoch": 0.43581904638103997, + "grad_norm": 0.765625, + "learning_rate": 0.000168330884972046, + "loss": 0.7675, + "step": 16973 + }, + { + "epoch": 0.4358447235769618, + "grad_norm": 1.0859375, + "learning_rate": 0.0001683276254588291, + "loss": 0.9249, + "step": 16974 + }, + { + "epoch": 0.4358704007728836, + "grad_norm": 0.765625, + "learning_rate": 0.00016832436580944138, + "loss": 1.013, + "step": 16975 + }, + { + "epoch": 0.4358960779688054, + "grad_norm": 0.8203125, + "learning_rate": 0.0001683211060238894, + "loss": 0.957, + "step": 16976 + }, + { + "epoch": 0.43592175516472725, + "grad_norm": 0.83984375, + "learning_rate": 0.00016831784610217963, + "loss": 0.9492, + "step": 16977 + }, + { + "epoch": 0.43594743236064903, + "grad_norm": 0.71484375, + "learning_rate": 0.00016831458604431854, + "loss": 0.8508, + "step": 16978 + }, + { + "epoch": 0.43597310955657087, + "grad_norm": 0.73046875, + "learning_rate": 0.00016831132585031264, + "loss": 0.8345, + "step": 16979 + }, + { + "epoch": 0.4359987867524927, + "grad_norm": 0.82421875, + "learning_rate": 0.00016830806552016843, + "loss": 1.0591, + "step": 16980 + }, + { + "epoch": 0.4360244639484145, + "grad_norm": 0.7578125, + "learning_rate": 0.00016830480505389245, + "loss": 0.9604, + "step": 16981 + }, + { + "epoch": 0.4360501411443363, + "grad_norm": 0.69140625, + "learning_rate": 0.00016830154445149117, + "loss": 0.8705, + "step": 16982 + }, + { + "epoch": 0.43607581834025816, + "grad_norm": 0.73828125, + "learning_rate": 0.00016829828371297108, + "loss": 0.8612, + "step": 16983 + }, + { + "epoch": 0.43610149553618, + "grad_norm": 0.8203125, + "learning_rate": 0.00016829502283833864, + "loss": 0.8991, + "step": 16984 + }, + { + "epoch": 0.4361271727321018, + "grad_norm": 0.85546875, + "learning_rate": 0.00016829176182760045, + "loss": 0.9348, + "step": 16985 + }, + { + "epoch": 0.4361528499280236, + "grad_norm": 0.7578125, + "learning_rate": 0.0001682885006807629, + "loss": 0.7796, + "step": 16986 + }, + { + "epoch": 0.43617852712394545, + "grad_norm": 0.80078125, + "learning_rate": 0.0001682852393978325, + "loss": 1.0738, + "step": 16987 + }, + { + "epoch": 0.43620420431986723, + "grad_norm": 0.7734375, + "learning_rate": 0.00016828197797881587, + "loss": 1.0294, + "step": 16988 + }, + { + "epoch": 0.43622988151578906, + "grad_norm": 0.76953125, + "learning_rate": 0.0001682787164237194, + "loss": 0.9295, + "step": 16989 + }, + { + "epoch": 0.4362555587117109, + "grad_norm": 0.8828125, + "learning_rate": 0.0001682754547325496, + "loss": 0.8956, + "step": 16990 + }, + { + "epoch": 0.4362812359076327, + "grad_norm": 0.75390625, + "learning_rate": 0.000168272192905313, + "loss": 0.8176, + "step": 16991 + }, + { + "epoch": 0.4363069131035545, + "grad_norm": 0.8125, + "learning_rate": 0.00016826893094201606, + "loss": 0.9158, + "step": 16992 + }, + { + "epoch": 0.43633259029947635, + "grad_norm": 0.79296875, + "learning_rate": 0.00016826566884266532, + "loss": 0.8545, + "step": 16993 + }, + { + "epoch": 0.4363582674953982, + "grad_norm": 0.69921875, + "learning_rate": 0.00016826240660726727, + "loss": 0.8819, + "step": 16994 + }, + { + "epoch": 0.43638394469131997, + "grad_norm": 0.79296875, + "learning_rate": 0.00016825914423582842, + "loss": 0.8702, + "step": 16995 + }, + { + "epoch": 0.4364096218872418, + "grad_norm": 0.765625, + "learning_rate": 0.00016825588172835528, + "loss": 0.8177, + "step": 16996 + }, + { + "epoch": 0.43643529908316364, + "grad_norm": 0.7421875, + "learning_rate": 0.0001682526190848543, + "loss": 0.9477, + "step": 16997 + }, + { + "epoch": 0.4364609762790854, + "grad_norm": 0.7734375, + "learning_rate": 0.000168249356305332, + "loss": 0.9385, + "step": 16998 + }, + { + "epoch": 0.43648665347500726, + "grad_norm": 0.7890625, + "learning_rate": 0.00016824609338979496, + "loss": 0.9709, + "step": 16999 + }, + { + "epoch": 0.4365123306709291, + "grad_norm": 0.78125, + "learning_rate": 0.00016824283033824955, + "loss": 0.9649, + "step": 17000 + }, + { + "epoch": 0.4365123306709291, + "eval_loss": 0.9112228751182556, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 404.9315, + "eval_samples_per_second": 24.696, + "eval_steps_per_second": 0.773, + "step": 17000 + }, + { + "epoch": 0.4365380078668509, + "grad_norm": 0.8203125, + "learning_rate": 0.0001682395671507024, + "loss": 0.9291, + "step": 17001 + }, + { + "epoch": 0.4365636850627727, + "grad_norm": 0.81640625, + "learning_rate": 0.0001682363038271599, + "loss": 0.9989, + "step": 17002 + }, + { + "epoch": 0.43658936225869455, + "grad_norm": 0.76171875, + "learning_rate": 0.00016823304036762865, + "loss": 0.8449, + "step": 17003 + }, + { + "epoch": 0.4366150394546164, + "grad_norm": 0.78515625, + "learning_rate": 0.00016822977677211513, + "loss": 0.9396, + "step": 17004 + }, + { + "epoch": 0.43664071665053816, + "grad_norm": 0.78125, + "learning_rate": 0.0001682265130406258, + "loss": 0.8723, + "step": 17005 + }, + { + "epoch": 0.43666639384646, + "grad_norm": 0.8984375, + "learning_rate": 0.00016822324917316719, + "loss": 1.0017, + "step": 17006 + }, + { + "epoch": 0.43669207104238184, + "grad_norm": 0.828125, + "learning_rate": 0.00016821998516974583, + "loss": 0.9312, + "step": 17007 + }, + { + "epoch": 0.4367177482383036, + "grad_norm": 0.75, + "learning_rate": 0.00016821672103036817, + "loss": 0.8643, + "step": 17008 + }, + { + "epoch": 0.43674342543422545, + "grad_norm": 0.75390625, + "learning_rate": 0.00016821345675504076, + "loss": 0.8738, + "step": 17009 + }, + { + "epoch": 0.4367691026301473, + "grad_norm": 0.7578125, + "learning_rate": 0.0001682101923437701, + "loss": 0.9244, + "step": 17010 + }, + { + "epoch": 0.43679477982606907, + "grad_norm": 0.7421875, + "learning_rate": 0.00016820692779656268, + "loss": 0.8961, + "step": 17011 + }, + { + "epoch": 0.4368204570219909, + "grad_norm": 0.73828125, + "learning_rate": 0.00016820366311342503, + "loss": 0.8609, + "step": 17012 + }, + { + "epoch": 0.43684613421791274, + "grad_norm": 0.79296875, + "learning_rate": 0.00016820039829436363, + "loss": 0.9416, + "step": 17013 + }, + { + "epoch": 0.4368718114138346, + "grad_norm": 0.76171875, + "learning_rate": 0.00016819713333938497, + "loss": 0.9539, + "step": 17014 + }, + { + "epoch": 0.43689748860975636, + "grad_norm": 0.7578125, + "learning_rate": 0.0001681938682484956, + "loss": 0.9313, + "step": 17015 + }, + { + "epoch": 0.4369231658056782, + "grad_norm": 0.828125, + "learning_rate": 0.00016819060302170204, + "loss": 0.929, + "step": 17016 + }, + { + "epoch": 0.43694884300160003, + "grad_norm": 0.75, + "learning_rate": 0.00016818733765901076, + "loss": 0.8927, + "step": 17017 + }, + { + "epoch": 0.4369745201975218, + "grad_norm": 0.7734375, + "learning_rate": 0.00016818407216042823, + "loss": 1.0019, + "step": 17018 + }, + { + "epoch": 0.43700019739344365, + "grad_norm": 0.84765625, + "learning_rate": 0.00016818080652596104, + "loss": 0.9869, + "step": 17019 + }, + { + "epoch": 0.4370258745893655, + "grad_norm": 0.82421875, + "learning_rate": 0.00016817754075561565, + "loss": 0.7201, + "step": 17020 + }, + { + "epoch": 0.43705155178528726, + "grad_norm": 0.74609375, + "learning_rate": 0.0001681742748493986, + "loss": 0.8067, + "step": 17021 + }, + { + "epoch": 0.4370772289812091, + "grad_norm": 0.82421875, + "learning_rate": 0.00016817100880731637, + "loss": 1.006, + "step": 17022 + }, + { + "epoch": 0.43710290617713093, + "grad_norm": 0.74609375, + "learning_rate": 0.00016816774262937547, + "loss": 0.8939, + "step": 17023 + }, + { + "epoch": 0.43712858337305277, + "grad_norm": 0.83203125, + "learning_rate": 0.00016816447631558243, + "loss": 0.9775, + "step": 17024 + }, + { + "epoch": 0.43715426056897455, + "grad_norm": 0.78125, + "learning_rate": 0.00016816120986594372, + "loss": 0.9215, + "step": 17025 + }, + { + "epoch": 0.4371799377648964, + "grad_norm": 0.734375, + "learning_rate": 0.0001681579432804659, + "loss": 0.8514, + "step": 17026 + }, + { + "epoch": 0.4372056149608182, + "grad_norm": 0.81640625, + "learning_rate": 0.0001681546765591554, + "loss": 0.8136, + "step": 17027 + }, + { + "epoch": 0.43723129215674, + "grad_norm": 0.765625, + "learning_rate": 0.00016815140970201885, + "loss": 0.9358, + "step": 17028 + }, + { + "epoch": 0.43725696935266184, + "grad_norm": 0.85546875, + "learning_rate": 0.00016814814270906268, + "loss": 1.0639, + "step": 17029 + }, + { + "epoch": 0.4372826465485837, + "grad_norm": 0.77734375, + "learning_rate": 0.00016814487558029344, + "loss": 0.9375, + "step": 17030 + }, + { + "epoch": 0.43730832374450546, + "grad_norm": 0.85546875, + "learning_rate": 0.00016814160831571758, + "loss": 0.8739, + "step": 17031 + }, + { + "epoch": 0.4373340009404273, + "grad_norm": 0.78125, + "learning_rate": 0.00016813834091534166, + "loss": 0.867, + "step": 17032 + }, + { + "epoch": 0.43735967813634913, + "grad_norm": 0.734375, + "learning_rate": 0.00016813507337917218, + "loss": 0.8048, + "step": 17033 + }, + { + "epoch": 0.43738535533227096, + "grad_norm": 0.79296875, + "learning_rate": 0.00016813180570721567, + "loss": 0.997, + "step": 17034 + }, + { + "epoch": 0.43741103252819274, + "grad_norm": 0.88671875, + "learning_rate": 0.00016812853789947862, + "loss": 0.9823, + "step": 17035 + }, + { + "epoch": 0.4374367097241146, + "grad_norm": 0.71875, + "learning_rate": 0.00016812526995596753, + "loss": 0.8363, + "step": 17036 + }, + { + "epoch": 0.4374623869200364, + "grad_norm": 0.7578125, + "learning_rate": 0.00016812200187668895, + "loss": 1.0766, + "step": 17037 + }, + { + "epoch": 0.4374880641159582, + "grad_norm": 0.76953125, + "learning_rate": 0.0001681187336616494, + "loss": 0.8654, + "step": 17038 + }, + { + "epoch": 0.43751374131188003, + "grad_norm": 0.79296875, + "learning_rate": 0.0001681154653108553, + "loss": 0.8666, + "step": 17039 + }, + { + "epoch": 0.43753941850780187, + "grad_norm": 0.78515625, + "learning_rate": 0.00016811219682431324, + "loss": 0.9249, + "step": 17040 + }, + { + "epoch": 0.43756509570372365, + "grad_norm": 0.81640625, + "learning_rate": 0.00016810892820202976, + "loss": 0.9079, + "step": 17041 + }, + { + "epoch": 0.4375907728996455, + "grad_norm": 0.86328125, + "learning_rate": 0.00016810565944401135, + "loss": 0.9807, + "step": 17042 + }, + { + "epoch": 0.4376164500955673, + "grad_norm": 0.7578125, + "learning_rate": 0.0001681023905502645, + "loss": 0.7954, + "step": 17043 + }, + { + "epoch": 0.43764212729148916, + "grad_norm": 0.80859375, + "learning_rate": 0.0001680991215207957, + "loss": 0.9567, + "step": 17044 + }, + { + "epoch": 0.43766780448741094, + "grad_norm": 0.84375, + "learning_rate": 0.00016809585235561154, + "loss": 0.8233, + "step": 17045 + }, + { + "epoch": 0.4376934816833328, + "grad_norm": 0.80859375, + "learning_rate": 0.00016809258305471847, + "loss": 0.8878, + "step": 17046 + }, + { + "epoch": 0.4377191588792546, + "grad_norm": 0.80078125, + "learning_rate": 0.00016808931361812304, + "loss": 0.8603, + "step": 17047 + }, + { + "epoch": 0.4377448360751764, + "grad_norm": 0.76171875, + "learning_rate": 0.0001680860440458318, + "loss": 0.9475, + "step": 17048 + }, + { + "epoch": 0.4377705132710982, + "grad_norm": 0.81640625, + "learning_rate": 0.00016808277433785117, + "loss": 0.8072, + "step": 17049 + }, + { + "epoch": 0.43779619046702006, + "grad_norm": 0.77734375, + "learning_rate": 0.00016807950449418776, + "loss": 0.9829, + "step": 17050 + }, + { + "epoch": 0.43782186766294184, + "grad_norm": 0.734375, + "learning_rate": 0.00016807623451484802, + "loss": 0.8337, + "step": 17051 + }, + { + "epoch": 0.4378475448588637, + "grad_norm": 0.83984375, + "learning_rate": 0.0001680729643998385, + "loss": 1.0066, + "step": 17052 + }, + { + "epoch": 0.4378732220547855, + "grad_norm": 0.8203125, + "learning_rate": 0.00016806969414916574, + "loss": 0.9453, + "step": 17053 + }, + { + "epoch": 0.43789889925070735, + "grad_norm": 0.7265625, + "learning_rate": 0.00016806642376283615, + "loss": 0.8675, + "step": 17054 + }, + { + "epoch": 0.43792457644662913, + "grad_norm": 0.76953125, + "learning_rate": 0.00016806315324085643, + "loss": 1.021, + "step": 17055 + }, + { + "epoch": 0.43795025364255097, + "grad_norm": 0.88671875, + "learning_rate": 0.0001680598825832329, + "loss": 0.9774, + "step": 17056 + }, + { + "epoch": 0.4379759308384728, + "grad_norm": 0.8203125, + "learning_rate": 0.00016805661178997223, + "loss": 0.9454, + "step": 17057 + }, + { + "epoch": 0.4380016080343946, + "grad_norm": 0.7421875, + "learning_rate": 0.00016805334086108088, + "loss": 0.8654, + "step": 17058 + }, + { + "epoch": 0.4380272852303164, + "grad_norm": 0.765625, + "learning_rate": 0.00016805006979656533, + "loss": 0.9184, + "step": 17059 + }, + { + "epoch": 0.43805296242623826, + "grad_norm": 0.7109375, + "learning_rate": 0.00016804679859643216, + "loss": 0.8739, + "step": 17060 + }, + { + "epoch": 0.43807863962216004, + "grad_norm": 0.7734375, + "learning_rate": 0.00016804352726068787, + "loss": 0.9952, + "step": 17061 + }, + { + "epoch": 0.4381043168180819, + "grad_norm": 0.78515625, + "learning_rate": 0.000168040255789339, + "loss": 0.8442, + "step": 17062 + }, + { + "epoch": 0.4381299940140037, + "grad_norm": 0.80078125, + "learning_rate": 0.000168036984182392, + "loss": 0.9348, + "step": 17063 + }, + { + "epoch": 0.43815567120992555, + "grad_norm": 0.734375, + "learning_rate": 0.00016803371243985347, + "loss": 0.9491, + "step": 17064 + }, + { + "epoch": 0.4381813484058473, + "grad_norm": 0.76953125, + "learning_rate": 0.00016803044056172987, + "loss": 0.8486, + "step": 17065 + }, + { + "epoch": 0.43820702560176916, + "grad_norm": 0.74609375, + "learning_rate": 0.00016802716854802778, + "loss": 0.9091, + "step": 17066 + }, + { + "epoch": 0.438232702797691, + "grad_norm": 0.76171875, + "learning_rate": 0.00016802389639875367, + "loss": 0.8254, + "step": 17067 + }, + { + "epoch": 0.4382583799936128, + "grad_norm": 0.7421875, + "learning_rate": 0.00016802062411391404, + "loss": 0.9901, + "step": 17068 + }, + { + "epoch": 0.4382840571895346, + "grad_norm": 0.859375, + "learning_rate": 0.00016801735169351552, + "loss": 0.9878, + "step": 17069 + }, + { + "epoch": 0.43830973438545645, + "grad_norm": 0.828125, + "learning_rate": 0.00016801407913756452, + "loss": 1.0522, + "step": 17070 + }, + { + "epoch": 0.43833541158137823, + "grad_norm": 0.77734375, + "learning_rate": 0.00016801080644606763, + "loss": 0.8986, + "step": 17071 + }, + { + "epoch": 0.43836108877730007, + "grad_norm": 0.71484375, + "learning_rate": 0.0001680075336190313, + "loss": 0.848, + "step": 17072 + }, + { + "epoch": 0.4383867659732219, + "grad_norm": 0.734375, + "learning_rate": 0.00016800426065646215, + "loss": 0.8603, + "step": 17073 + }, + { + "epoch": 0.43841244316914374, + "grad_norm": 0.71484375, + "learning_rate": 0.00016800098755836662, + "loss": 0.8366, + "step": 17074 + }, + { + "epoch": 0.4384381203650655, + "grad_norm": 0.7265625, + "learning_rate": 0.00016799771432475128, + "loss": 0.8849, + "step": 17075 + }, + { + "epoch": 0.43846379756098736, + "grad_norm": 0.80078125, + "learning_rate": 0.00016799444095562263, + "loss": 1.0509, + "step": 17076 + }, + { + "epoch": 0.4384894747569092, + "grad_norm": 0.77734375, + "learning_rate": 0.0001679911674509872, + "loss": 0.9412, + "step": 17077 + }, + { + "epoch": 0.438515151952831, + "grad_norm": 0.78125, + "learning_rate": 0.00016798789381085153, + "loss": 0.9282, + "step": 17078 + }, + { + "epoch": 0.4385408291487528, + "grad_norm": 0.765625, + "learning_rate": 0.0001679846200352221, + "loss": 0.9472, + "step": 17079 + }, + { + "epoch": 0.43856650634467464, + "grad_norm": 0.8984375, + "learning_rate": 0.00016798134612410547, + "loss": 0.9632, + "step": 17080 + }, + { + "epoch": 0.4385921835405964, + "grad_norm": 0.8203125, + "learning_rate": 0.00016797807207750818, + "loss": 0.8444, + "step": 17081 + }, + { + "epoch": 0.43861786073651826, + "grad_norm": 0.765625, + "learning_rate": 0.00016797479789543672, + "loss": 1.0118, + "step": 17082 + }, + { + "epoch": 0.4386435379324401, + "grad_norm": 0.71484375, + "learning_rate": 0.00016797152357789764, + "loss": 0.8404, + "step": 17083 + }, + { + "epoch": 0.43866921512836193, + "grad_norm": 0.8125, + "learning_rate": 0.00016796824912489744, + "loss": 0.988, + "step": 17084 + }, + { + "epoch": 0.4386948923242837, + "grad_norm": 0.7578125, + "learning_rate": 0.00016796497453644267, + "loss": 0.8301, + "step": 17085 + }, + { + "epoch": 0.43872056952020555, + "grad_norm": 0.80859375, + "learning_rate": 0.0001679616998125398, + "loss": 0.9584, + "step": 17086 + }, + { + "epoch": 0.4387462467161274, + "grad_norm": 0.78515625, + "learning_rate": 0.00016795842495319544, + "loss": 0.9937, + "step": 17087 + }, + { + "epoch": 0.43877192391204917, + "grad_norm": 0.78125, + "learning_rate": 0.00016795514995841611, + "loss": 0.8153, + "step": 17088 + }, + { + "epoch": 0.438797601107971, + "grad_norm": 0.75390625, + "learning_rate": 0.00016795187482820825, + "loss": 0.9158, + "step": 17089 + }, + { + "epoch": 0.43882327830389284, + "grad_norm": 0.76171875, + "learning_rate": 0.00016794859956257847, + "loss": 0.9281, + "step": 17090 + }, + { + "epoch": 0.4388489554998146, + "grad_norm": 0.7578125, + "learning_rate": 0.00016794532416153325, + "loss": 0.7848, + "step": 17091 + }, + { + "epoch": 0.43887463269573646, + "grad_norm": 0.76171875, + "learning_rate": 0.00016794204862507917, + "loss": 0.8342, + "step": 17092 + }, + { + "epoch": 0.4389003098916583, + "grad_norm": 0.75390625, + "learning_rate": 0.00016793877295322269, + "loss": 0.9435, + "step": 17093 + }, + { + "epoch": 0.4389259870875801, + "grad_norm": 0.7109375, + "learning_rate": 0.00016793549714597038, + "loss": 0.8964, + "step": 17094 + }, + { + "epoch": 0.4389516642835019, + "grad_norm": 0.76171875, + "learning_rate": 0.00016793222120332877, + "loss": 0.9988, + "step": 17095 + }, + { + "epoch": 0.43897734147942374, + "grad_norm": 0.796875, + "learning_rate": 0.00016792894512530438, + "loss": 0.882, + "step": 17096 + }, + { + "epoch": 0.4390030186753456, + "grad_norm": 0.8359375, + "learning_rate": 0.0001679256689119037, + "loss": 0.9524, + "step": 17097 + }, + { + "epoch": 0.43902869587126736, + "grad_norm": 0.76953125, + "learning_rate": 0.00016792239256313332, + "loss": 0.9295, + "step": 17098 + }, + { + "epoch": 0.4390543730671892, + "grad_norm": 1.9375, + "learning_rate": 0.00016791911607899978, + "loss": 0.9072, + "step": 17099 + }, + { + "epoch": 0.43908005026311103, + "grad_norm": 0.76953125, + "learning_rate": 0.00016791583945950956, + "loss": 0.905, + "step": 17100 + }, + { + "epoch": 0.4391057274590328, + "grad_norm": 0.7578125, + "learning_rate": 0.0001679125627046692, + "loss": 0.8261, + "step": 17101 + }, + { + "epoch": 0.43913140465495465, + "grad_norm": 0.79296875, + "learning_rate": 0.00016790928581448525, + "loss": 0.8548, + "step": 17102 + }, + { + "epoch": 0.4391570818508765, + "grad_norm": 0.75390625, + "learning_rate": 0.00016790600878896421, + "loss": 0.9309, + "step": 17103 + }, + { + "epoch": 0.4391827590467983, + "grad_norm": 0.80078125, + "learning_rate": 0.00016790273162811263, + "loss": 0.9295, + "step": 17104 + }, + { + "epoch": 0.4392084362427201, + "grad_norm": 0.75, + "learning_rate": 0.00016789945433193703, + "loss": 0.8222, + "step": 17105 + }, + { + "epoch": 0.43923411343864194, + "grad_norm": 0.78515625, + "learning_rate": 0.00016789617690044398, + "loss": 0.9128, + "step": 17106 + }, + { + "epoch": 0.4392597906345638, + "grad_norm": 0.76953125, + "learning_rate": 0.00016789289933363995, + "loss": 0.8798, + "step": 17107 + }, + { + "epoch": 0.43928546783048555, + "grad_norm": 0.98046875, + "learning_rate": 0.00016788962163153152, + "loss": 1.0011, + "step": 17108 + }, + { + "epoch": 0.4393111450264074, + "grad_norm": 0.8671875, + "learning_rate": 0.0001678863437941252, + "loss": 0.8786, + "step": 17109 + }, + { + "epoch": 0.4393368222223292, + "grad_norm": 0.79296875, + "learning_rate": 0.00016788306582142754, + "loss": 0.9849, + "step": 17110 + }, + { + "epoch": 0.439362499418251, + "grad_norm": 0.796875, + "learning_rate": 0.00016787978771344505, + "loss": 0.9791, + "step": 17111 + }, + { + "epoch": 0.43938817661417284, + "grad_norm": 0.75, + "learning_rate": 0.00016787650947018426, + "loss": 0.8678, + "step": 17112 + }, + { + "epoch": 0.4394138538100947, + "grad_norm": 0.765625, + "learning_rate": 0.00016787323109165175, + "loss": 0.9783, + "step": 17113 + }, + { + "epoch": 0.4394395310060165, + "grad_norm": 0.71875, + "learning_rate": 0.00016786995257785404, + "loss": 0.8379, + "step": 17114 + }, + { + "epoch": 0.4394652082019383, + "grad_norm": 0.77734375, + "learning_rate": 0.0001678666739287976, + "loss": 0.8896, + "step": 17115 + }, + { + "epoch": 0.43949088539786013, + "grad_norm": 0.7890625, + "learning_rate": 0.00016786339514448904, + "loss": 1.0434, + "step": 17116 + }, + { + "epoch": 0.43951656259378197, + "grad_norm": 0.875, + "learning_rate": 0.00016786011622493482, + "loss": 1.0044, + "step": 17117 + }, + { + "epoch": 0.43954223978970375, + "grad_norm": 0.74609375, + "learning_rate": 0.0001678568371701416, + "loss": 0.9171, + "step": 17118 + }, + { + "epoch": 0.4395679169856256, + "grad_norm": 0.73046875, + "learning_rate": 0.00016785355798011576, + "loss": 0.7939, + "step": 17119 + }, + { + "epoch": 0.4395935941815474, + "grad_norm": 0.8046875, + "learning_rate": 0.0001678502786548639, + "loss": 0.9767, + "step": 17120 + }, + { + "epoch": 0.4396192713774692, + "grad_norm": 0.7421875, + "learning_rate": 0.00016784699919439262, + "loss": 0.8453, + "step": 17121 + }, + { + "epoch": 0.43964494857339104, + "grad_norm": 0.81640625, + "learning_rate": 0.00016784371959870838, + "loss": 0.9395, + "step": 17122 + }, + { + "epoch": 0.4396706257693129, + "grad_norm": 0.7890625, + "learning_rate": 0.00016784043986781773, + "loss": 0.9116, + "step": 17123 + }, + { + "epoch": 0.4396963029652347, + "grad_norm": 0.80859375, + "learning_rate": 0.00016783716000172717, + "loss": 0.8355, + "step": 17124 + }, + { + "epoch": 0.4397219801611565, + "grad_norm": 0.72265625, + "learning_rate": 0.0001678338800004433, + "loss": 1.0744, + "step": 17125 + }, + { + "epoch": 0.4397476573570783, + "grad_norm": 0.78515625, + "learning_rate": 0.0001678305998639727, + "loss": 0.9893, + "step": 17126 + }, + { + "epoch": 0.43977333455300016, + "grad_norm": 0.81640625, + "learning_rate": 0.00016782731959232177, + "loss": 0.9236, + "step": 17127 + }, + { + "epoch": 0.43979901174892194, + "grad_norm": 0.79296875, + "learning_rate": 0.00016782403918549714, + "loss": 0.9533, + "step": 17128 + }, + { + "epoch": 0.4398246889448438, + "grad_norm": 0.75, + "learning_rate": 0.00016782075864350533, + "loss": 0.9606, + "step": 17129 + }, + { + "epoch": 0.4398503661407656, + "grad_norm": 0.8125, + "learning_rate": 0.00016781747796635287, + "loss": 0.9511, + "step": 17130 + }, + { + "epoch": 0.4398760433366874, + "grad_norm": 0.76953125, + "learning_rate": 0.0001678141971540463, + "loss": 0.8393, + "step": 17131 + }, + { + "epoch": 0.43990172053260923, + "grad_norm": 0.8359375, + "learning_rate": 0.00016781091620659218, + "loss": 1.0341, + "step": 17132 + }, + { + "epoch": 0.43992739772853107, + "grad_norm": 3.875, + "learning_rate": 0.000167807635123997, + "loss": 0.8898, + "step": 17133 + }, + { + "epoch": 0.4399530749244529, + "grad_norm": 0.8125, + "learning_rate": 0.00016780435390626735, + "loss": 1.0084, + "step": 17134 + }, + { + "epoch": 0.4399787521203747, + "grad_norm": 0.8125, + "learning_rate": 0.00016780107255340973, + "loss": 1.0133, + "step": 17135 + }, + { + "epoch": 0.4400044293162965, + "grad_norm": 0.80078125, + "learning_rate": 0.0001677977910654307, + "loss": 0.9756, + "step": 17136 + }, + { + "epoch": 0.44003010651221836, + "grad_norm": 0.77734375, + "learning_rate": 0.0001677945094423368, + "loss": 0.9738, + "step": 17137 + }, + { + "epoch": 0.44005578370814014, + "grad_norm": 0.7578125, + "learning_rate": 0.00016779122768413457, + "loss": 0.8384, + "step": 17138 + }, + { + "epoch": 0.44008146090406197, + "grad_norm": 0.76953125, + "learning_rate": 0.00016778794579083055, + "loss": 0.8409, + "step": 17139 + }, + { + "epoch": 0.4401071380999838, + "grad_norm": 0.84765625, + "learning_rate": 0.00016778466376243128, + "loss": 0.8991, + "step": 17140 + }, + { + "epoch": 0.4401328152959056, + "grad_norm": 0.8046875, + "learning_rate": 0.0001677813815989433, + "loss": 0.9999, + "step": 17141 + }, + { + "epoch": 0.4401584924918274, + "grad_norm": 0.87109375, + "learning_rate": 0.00016777809930037312, + "loss": 0.7909, + "step": 17142 + }, + { + "epoch": 0.44018416968774926, + "grad_norm": 0.82421875, + "learning_rate": 0.00016777481686672735, + "loss": 0.9762, + "step": 17143 + }, + { + "epoch": 0.44020984688367104, + "grad_norm": 0.796875, + "learning_rate": 0.00016777153429801245, + "loss": 0.8519, + "step": 17144 + }, + { + "epoch": 0.4402355240795929, + "grad_norm": 0.83203125, + "learning_rate": 0.00016776825159423503, + "loss": 0.8966, + "step": 17145 + }, + { + "epoch": 0.4402612012755147, + "grad_norm": 0.76171875, + "learning_rate": 0.00016776496875540162, + "loss": 0.7511, + "step": 17146 + }, + { + "epoch": 0.44028687847143655, + "grad_norm": 0.76953125, + "learning_rate": 0.00016776168578151875, + "loss": 0.8987, + "step": 17147 + }, + { + "epoch": 0.44031255566735833, + "grad_norm": 0.765625, + "learning_rate": 0.00016775840267259292, + "loss": 0.8557, + "step": 17148 + }, + { + "epoch": 0.44033823286328017, + "grad_norm": 0.71484375, + "learning_rate": 0.00016775511942863073, + "loss": 0.9744, + "step": 17149 + }, + { + "epoch": 0.440363910059202, + "grad_norm": 0.6953125, + "learning_rate": 0.0001677518360496387, + "loss": 0.8747, + "step": 17150 + }, + { + "epoch": 0.4403895872551238, + "grad_norm": 0.73828125, + "learning_rate": 0.0001677485525356234, + "loss": 0.9502, + "step": 17151 + }, + { + "epoch": 0.4404152644510456, + "grad_norm": 0.79296875, + "learning_rate": 0.00016774526888659134, + "loss": 0.9227, + "step": 17152 + }, + { + "epoch": 0.44044094164696745, + "grad_norm": 0.8359375, + "learning_rate": 0.0001677419851025491, + "loss": 0.9199, + "step": 17153 + }, + { + "epoch": 0.44046661884288923, + "grad_norm": 0.7734375, + "learning_rate": 0.0001677387011835032, + "loss": 0.8384, + "step": 17154 + }, + { + "epoch": 0.44049229603881107, + "grad_norm": 0.7109375, + "learning_rate": 0.0001677354171294602, + "loss": 0.8136, + "step": 17155 + }, + { + "epoch": 0.4405179732347329, + "grad_norm": 0.68359375, + "learning_rate": 0.00016773213294042657, + "loss": 0.8244, + "step": 17156 + }, + { + "epoch": 0.44054365043065474, + "grad_norm": 0.7578125, + "learning_rate": 0.000167728848616409, + "loss": 0.8826, + "step": 17157 + }, + { + "epoch": 0.4405693276265765, + "grad_norm": 0.73828125, + "learning_rate": 0.0001677255641574139, + "loss": 0.7737, + "step": 17158 + }, + { + "epoch": 0.44059500482249836, + "grad_norm": 0.8125, + "learning_rate": 0.00016772227956344786, + "loss": 0.9255, + "step": 17159 + }, + { + "epoch": 0.4406206820184202, + "grad_norm": 0.69140625, + "learning_rate": 0.00016771899483451744, + "loss": 0.8398, + "step": 17160 + }, + { + "epoch": 0.440646359214342, + "grad_norm": 0.890625, + "learning_rate": 0.0001677157099706292, + "loss": 0.892, + "step": 17161 + }, + { + "epoch": 0.4406720364102638, + "grad_norm": 0.83203125, + "learning_rate": 0.00016771242497178967, + "loss": 0.9561, + "step": 17162 + }, + { + "epoch": 0.44069771360618565, + "grad_norm": 0.75390625, + "learning_rate": 0.00016770913983800537, + "loss": 0.8749, + "step": 17163 + }, + { + "epoch": 0.44072339080210743, + "grad_norm": 0.82421875, + "learning_rate": 0.00016770585456928289, + "loss": 0.8873, + "step": 17164 + }, + { + "epoch": 0.44074906799802926, + "grad_norm": 0.8125, + "learning_rate": 0.00016770256916562873, + "loss": 0.9188, + "step": 17165 + }, + { + "epoch": 0.4407747451939511, + "grad_norm": 0.78125, + "learning_rate": 0.0001676992836270495, + "loss": 0.961, + "step": 17166 + }, + { + "epoch": 0.44080042238987294, + "grad_norm": 0.828125, + "learning_rate": 0.0001676959979535517, + "loss": 0.8851, + "step": 17167 + }, + { + "epoch": 0.4408260995857947, + "grad_norm": 0.8125, + "learning_rate": 0.0001676927121451419, + "loss": 0.8638, + "step": 17168 + }, + { + "epoch": 0.44085177678171655, + "grad_norm": 0.76171875, + "learning_rate": 0.00016768942620182663, + "loss": 0.9167, + "step": 17169 + }, + { + "epoch": 0.4408774539776384, + "grad_norm": 0.734375, + "learning_rate": 0.00016768614012361243, + "loss": 0.8622, + "step": 17170 + }, + { + "epoch": 0.44090313117356017, + "grad_norm": 0.74609375, + "learning_rate": 0.0001676828539105059, + "loss": 0.8695, + "step": 17171 + }, + { + "epoch": 0.440928808369482, + "grad_norm": 0.78125, + "learning_rate": 0.00016767956756251354, + "loss": 0.9693, + "step": 17172 + }, + { + "epoch": 0.44095448556540384, + "grad_norm": 0.83203125, + "learning_rate": 0.00016767628107964191, + "loss": 0.8575, + "step": 17173 + }, + { + "epoch": 0.4409801627613256, + "grad_norm": 0.828125, + "learning_rate": 0.00016767299446189757, + "loss": 0.8477, + "step": 17174 + }, + { + "epoch": 0.44100583995724746, + "grad_norm": 0.80859375, + "learning_rate": 0.00016766970770928707, + "loss": 0.9768, + "step": 17175 + }, + { + "epoch": 0.4410315171531693, + "grad_norm": 0.828125, + "learning_rate": 0.00016766642082181697, + "loss": 0.8334, + "step": 17176 + }, + { + "epoch": 0.44105719434909113, + "grad_norm": 0.80078125, + "learning_rate": 0.00016766313379949375, + "loss": 0.8976, + "step": 17177 + }, + { + "epoch": 0.4410828715450129, + "grad_norm": 0.83203125, + "learning_rate": 0.00016765984664232405, + "loss": 0.9318, + "step": 17178 + }, + { + "epoch": 0.44110854874093475, + "grad_norm": 0.75, + "learning_rate": 0.0001676565593503144, + "loss": 0.8377, + "step": 17179 + }, + { + "epoch": 0.4411342259368566, + "grad_norm": 0.91796875, + "learning_rate": 0.00016765327192347135, + "loss": 0.9671, + "step": 17180 + }, + { + "epoch": 0.44115990313277836, + "grad_norm": 0.7734375, + "learning_rate": 0.0001676499843618014, + "loss": 1.0423, + "step": 17181 + }, + { + "epoch": 0.4411855803287002, + "grad_norm": 0.7265625, + "learning_rate": 0.00016764669666531113, + "loss": 0.9996, + "step": 17182 + }, + { + "epoch": 0.44121125752462204, + "grad_norm": 0.8125, + "learning_rate": 0.00016764340883400718, + "loss": 0.9728, + "step": 17183 + }, + { + "epoch": 0.4412369347205438, + "grad_norm": 0.78125, + "learning_rate": 0.00016764012086789597, + "loss": 0.9017, + "step": 17184 + }, + { + "epoch": 0.44126261191646565, + "grad_norm": 0.796875, + "learning_rate": 0.00016763683276698412, + "loss": 0.9303, + "step": 17185 + }, + { + "epoch": 0.4412882891123875, + "grad_norm": 0.75390625, + "learning_rate": 0.00016763354453127818, + "loss": 0.7796, + "step": 17186 + }, + { + "epoch": 0.4413139663083093, + "grad_norm": 0.78515625, + "learning_rate": 0.00016763025616078466, + "loss": 0.8702, + "step": 17187 + }, + { + "epoch": 0.4413396435042311, + "grad_norm": 0.94140625, + "learning_rate": 0.00016762696765551017, + "loss": 0.8639, + "step": 17188 + }, + { + "epoch": 0.44136532070015294, + "grad_norm": 0.73828125, + "learning_rate": 0.00016762367901546128, + "loss": 0.7987, + "step": 17189 + }, + { + "epoch": 0.4413909978960748, + "grad_norm": 0.71875, + "learning_rate": 0.00016762039024064443, + "loss": 0.8769, + "step": 17190 + }, + { + "epoch": 0.44141667509199656, + "grad_norm": 0.77734375, + "learning_rate": 0.00016761710133106632, + "loss": 0.8446, + "step": 17191 + }, + { + "epoch": 0.4414423522879184, + "grad_norm": 0.828125, + "learning_rate": 0.0001676138122867334, + "loss": 0.931, + "step": 17192 + }, + { + "epoch": 0.44146802948384023, + "grad_norm": 0.7890625, + "learning_rate": 0.00016761052310765225, + "loss": 0.8252, + "step": 17193 + }, + { + "epoch": 0.441493706679762, + "grad_norm": 0.69140625, + "learning_rate": 0.00016760723379382947, + "loss": 0.809, + "step": 17194 + }, + { + "epoch": 0.44151938387568385, + "grad_norm": 0.8671875, + "learning_rate": 0.00016760394434527158, + "loss": 0.9942, + "step": 17195 + }, + { + "epoch": 0.4415450610716057, + "grad_norm": 0.8828125, + "learning_rate": 0.0001676006547619851, + "loss": 0.9635, + "step": 17196 + }, + { + "epoch": 0.4415707382675275, + "grad_norm": 0.875, + "learning_rate": 0.00016759736504397664, + "loss": 0.9979, + "step": 17197 + }, + { + "epoch": 0.4415964154634493, + "grad_norm": 0.8125, + "learning_rate": 0.00016759407519125277, + "loss": 0.985, + "step": 17198 + }, + { + "epoch": 0.44162209265937113, + "grad_norm": 0.83203125, + "learning_rate": 0.00016759078520381997, + "loss": 0.9394, + "step": 17199 + }, + { + "epoch": 0.44164776985529297, + "grad_norm": 0.796875, + "learning_rate": 0.00016758749508168484, + "loss": 0.975, + "step": 17200 + }, + { + "epoch": 0.44167344705121475, + "grad_norm": 0.82421875, + "learning_rate": 0.00016758420482485396, + "loss": 0.9713, + "step": 17201 + }, + { + "epoch": 0.4416991242471366, + "grad_norm": 0.7109375, + "learning_rate": 0.00016758091443333384, + "loss": 0.8996, + "step": 17202 + }, + { + "epoch": 0.4417248014430584, + "grad_norm": 0.83203125, + "learning_rate": 0.0001675776239071311, + "loss": 0.8831, + "step": 17203 + }, + { + "epoch": 0.4417504786389802, + "grad_norm": 0.765625, + "learning_rate": 0.0001675743332462522, + "loss": 0.6599, + "step": 17204 + }, + { + "epoch": 0.44177615583490204, + "grad_norm": 0.796875, + "learning_rate": 0.0001675710424507038, + "loss": 0.9759, + "step": 17205 + }, + { + "epoch": 0.4418018330308239, + "grad_norm": 0.7578125, + "learning_rate": 0.00016756775152049243, + "loss": 0.9987, + "step": 17206 + }, + { + "epoch": 0.4418275102267457, + "grad_norm": 0.85546875, + "learning_rate": 0.00016756446045562462, + "loss": 1.0185, + "step": 17207 + }, + { + "epoch": 0.4418531874226675, + "grad_norm": 0.75390625, + "learning_rate": 0.00016756116925610695, + "loss": 1.0184, + "step": 17208 + }, + { + "epoch": 0.44187886461858933, + "grad_norm": 0.73046875, + "learning_rate": 0.00016755787792194595, + "loss": 0.8006, + "step": 17209 + }, + { + "epoch": 0.44190454181451116, + "grad_norm": 0.79296875, + "learning_rate": 0.00016755458645314818, + "loss": 1.0653, + "step": 17210 + }, + { + "epoch": 0.44193021901043295, + "grad_norm": 0.85546875, + "learning_rate": 0.00016755129484972026, + "loss": 1.0267, + "step": 17211 + }, + { + "epoch": 0.4419558962063548, + "grad_norm": 0.8046875, + "learning_rate": 0.0001675480031116687, + "loss": 0.7868, + "step": 17212 + }, + { + "epoch": 0.4419815734022766, + "grad_norm": 0.7890625, + "learning_rate": 0.00016754471123900012, + "loss": 0.7981, + "step": 17213 + }, + { + "epoch": 0.4420072505981984, + "grad_norm": 0.81640625, + "learning_rate": 0.00016754141923172098, + "loss": 1.0445, + "step": 17214 + }, + { + "epoch": 0.44203292779412023, + "grad_norm": 0.76171875, + "learning_rate": 0.00016753812708983787, + "loss": 0.8844, + "step": 17215 + }, + { + "epoch": 0.44205860499004207, + "grad_norm": 0.796875, + "learning_rate": 0.0001675348348133574, + "loss": 0.8907, + "step": 17216 + }, + { + "epoch": 0.4420842821859639, + "grad_norm": 0.8125, + "learning_rate": 0.00016753154240228612, + "loss": 0.9581, + "step": 17217 + }, + { + "epoch": 0.4421099593818857, + "grad_norm": 0.7890625, + "learning_rate": 0.00016752824985663057, + "loss": 0.8198, + "step": 17218 + }, + { + "epoch": 0.4421356365778075, + "grad_norm": 0.84375, + "learning_rate": 0.0001675249571763973, + "loss": 0.8742, + "step": 17219 + }, + { + "epoch": 0.44216131377372936, + "grad_norm": 0.7265625, + "learning_rate": 0.0001675216643615929, + "loss": 0.901, + "step": 17220 + }, + { + "epoch": 0.44218699096965114, + "grad_norm": 0.796875, + "learning_rate": 0.00016751837141222394, + "loss": 0.9135, + "step": 17221 + }, + { + "epoch": 0.442212668165573, + "grad_norm": 0.71875, + "learning_rate": 0.00016751507832829694, + "loss": 0.9164, + "step": 17222 + }, + { + "epoch": 0.4422383453614948, + "grad_norm": 0.8203125, + "learning_rate": 0.0001675117851098185, + "loss": 0.7623, + "step": 17223 + }, + { + "epoch": 0.4422640225574166, + "grad_norm": 0.80859375, + "learning_rate": 0.00016750849175679514, + "loss": 1.031, + "step": 17224 + }, + { + "epoch": 0.44228969975333843, + "grad_norm": 0.78125, + "learning_rate": 0.0001675051982692335, + "loss": 1.0335, + "step": 17225 + }, + { + "epoch": 0.44231537694926026, + "grad_norm": 0.73046875, + "learning_rate": 0.0001675019046471401, + "loss": 0.859, + "step": 17226 + }, + { + "epoch": 0.4423410541451821, + "grad_norm": 0.70703125, + "learning_rate": 0.0001674986108905215, + "loss": 0.8526, + "step": 17227 + }, + { + "epoch": 0.4423667313411039, + "grad_norm": 0.76171875, + "learning_rate": 0.00016749531699938422, + "loss": 0.8303, + "step": 17228 + }, + { + "epoch": 0.4423924085370257, + "grad_norm": 0.74609375, + "learning_rate": 0.0001674920229737349, + "loss": 0.8915, + "step": 17229 + }, + { + "epoch": 0.44241808573294755, + "grad_norm": 0.76953125, + "learning_rate": 0.00016748872881358008, + "loss": 1.0319, + "step": 17230 + }, + { + "epoch": 0.44244376292886933, + "grad_norm": 0.7890625, + "learning_rate": 0.00016748543451892628, + "loss": 0.9147, + "step": 17231 + }, + { + "epoch": 0.44246944012479117, + "grad_norm": 0.7734375, + "learning_rate": 0.00016748214008978013, + "loss": 1.0282, + "step": 17232 + }, + { + "epoch": 0.442495117320713, + "grad_norm": 0.71875, + "learning_rate": 0.00016747884552614819, + "loss": 0.8927, + "step": 17233 + }, + { + "epoch": 0.4425207945166348, + "grad_norm": 0.71484375, + "learning_rate": 0.00016747555082803698, + "loss": 0.8473, + "step": 17234 + }, + { + "epoch": 0.4425464717125566, + "grad_norm": 0.7421875, + "learning_rate": 0.00016747225599545313, + "loss": 0.8897, + "step": 17235 + }, + { + "epoch": 0.44257214890847846, + "grad_norm": 0.76953125, + "learning_rate": 0.00016746896102840311, + "loss": 0.9123, + "step": 17236 + }, + { + "epoch": 0.4425978261044003, + "grad_norm": 0.8359375, + "learning_rate": 0.0001674656659268936, + "loss": 0.9983, + "step": 17237 + }, + { + "epoch": 0.4426235033003221, + "grad_norm": 0.8046875, + "learning_rate": 0.00016746237069093108, + "loss": 0.9623, + "step": 17238 + }, + { + "epoch": 0.4426491804962439, + "grad_norm": 0.80859375, + "learning_rate": 0.00016745907532052216, + "loss": 1.025, + "step": 17239 + }, + { + "epoch": 0.44267485769216575, + "grad_norm": 0.7109375, + "learning_rate": 0.0001674557798156734, + "loss": 0.7626, + "step": 17240 + }, + { + "epoch": 0.4427005348880875, + "grad_norm": 0.76171875, + "learning_rate": 0.00016745248417639137, + "loss": 0.9369, + "step": 17241 + }, + { + "epoch": 0.44272621208400936, + "grad_norm": 0.77734375, + "learning_rate": 0.0001674491884026826, + "loss": 1.0098, + "step": 17242 + }, + { + "epoch": 0.4427518892799312, + "grad_norm": 0.7109375, + "learning_rate": 0.00016744589249455372, + "loss": 0.853, + "step": 17243 + }, + { + "epoch": 0.442777566475853, + "grad_norm": 0.78515625, + "learning_rate": 0.00016744259645201125, + "loss": 0.9535, + "step": 17244 + }, + { + "epoch": 0.4428032436717748, + "grad_norm": 0.84375, + "learning_rate": 0.00016743930027506178, + "loss": 1.0004, + "step": 17245 + }, + { + "epoch": 0.44282892086769665, + "grad_norm": 0.7890625, + "learning_rate": 0.0001674360039637119, + "loss": 0.881, + "step": 17246 + }, + { + "epoch": 0.4428545980636185, + "grad_norm": 0.74609375, + "learning_rate": 0.00016743270751796814, + "loss": 0.8982, + "step": 17247 + }, + { + "epoch": 0.44288027525954027, + "grad_norm": 0.79296875, + "learning_rate": 0.0001674294109378371, + "loss": 0.9669, + "step": 17248 + }, + { + "epoch": 0.4429059524554621, + "grad_norm": 0.859375, + "learning_rate": 0.00016742611422332528, + "loss": 0.989, + "step": 17249 + }, + { + "epoch": 0.44293162965138394, + "grad_norm": 0.81640625, + "learning_rate": 0.00016742281737443934, + "loss": 0.9385, + "step": 17250 + }, + { + "epoch": 0.4429573068473057, + "grad_norm": 0.734375, + "learning_rate": 0.00016741952039118582, + "loss": 0.8678, + "step": 17251 + }, + { + "epoch": 0.44298298404322756, + "grad_norm": 0.7109375, + "learning_rate": 0.0001674162232735713, + "loss": 0.8115, + "step": 17252 + }, + { + "epoch": 0.4430086612391494, + "grad_norm": 0.80078125, + "learning_rate": 0.0001674129260216023, + "loss": 0.9003, + "step": 17253 + }, + { + "epoch": 0.4430343384350712, + "grad_norm": 0.77734375, + "learning_rate": 0.0001674096286352855, + "loss": 0.8919, + "step": 17254 + }, + { + "epoch": 0.443060015630993, + "grad_norm": 0.77734375, + "learning_rate": 0.00016740633111462732, + "loss": 0.9955, + "step": 17255 + }, + { + "epoch": 0.44308569282691485, + "grad_norm": 0.8125, + "learning_rate": 0.00016740303345963445, + "loss": 0.9621, + "step": 17256 + }, + { + "epoch": 0.4431113700228367, + "grad_norm": 0.80078125, + "learning_rate": 0.00016739973567031342, + "loss": 0.9213, + "step": 17257 + }, + { + "epoch": 0.44313704721875846, + "grad_norm": 0.8046875, + "learning_rate": 0.00016739643774667077, + "loss": 1.0423, + "step": 17258 + }, + { + "epoch": 0.4431627244146803, + "grad_norm": 0.8046875, + "learning_rate": 0.00016739313968871315, + "loss": 0.843, + "step": 17259 + }, + { + "epoch": 0.44318840161060213, + "grad_norm": 0.734375, + "learning_rate": 0.0001673898414964471, + "loss": 0.8329, + "step": 17260 + }, + { + "epoch": 0.4432140788065239, + "grad_norm": 0.78125, + "learning_rate": 0.00016738654316987914, + "loss": 0.951, + "step": 17261 + }, + { + "epoch": 0.44323975600244575, + "grad_norm": 0.7578125, + "learning_rate": 0.0001673832447090159, + "loss": 0.7792, + "step": 17262 + }, + { + "epoch": 0.4432654331983676, + "grad_norm": 0.83984375, + "learning_rate": 0.00016737994611386393, + "loss": 1.0284, + "step": 17263 + }, + { + "epoch": 0.44329111039428937, + "grad_norm": 0.7578125, + "learning_rate": 0.00016737664738442984, + "loss": 0.8601, + "step": 17264 + }, + { + "epoch": 0.4433167875902112, + "grad_norm": 0.73046875, + "learning_rate": 0.00016737334852072016, + "loss": 0.8641, + "step": 17265 + }, + { + "epoch": 0.44334246478613304, + "grad_norm": 0.76953125, + "learning_rate": 0.00016737004952274148, + "loss": 0.9196, + "step": 17266 + }, + { + "epoch": 0.4433681419820549, + "grad_norm": 0.8515625, + "learning_rate": 0.0001673667503905004, + "loss": 1.0341, + "step": 17267 + }, + { + "epoch": 0.44339381917797666, + "grad_norm": 0.796875, + "learning_rate": 0.00016736345112400346, + "loss": 0.9795, + "step": 17268 + }, + { + "epoch": 0.4434194963738985, + "grad_norm": 0.78125, + "learning_rate": 0.00016736015172325725, + "loss": 0.9841, + "step": 17269 + }, + { + "epoch": 0.44344517356982033, + "grad_norm": 0.78515625, + "learning_rate": 0.00016735685218826835, + "loss": 0.911, + "step": 17270 + }, + { + "epoch": 0.4434708507657421, + "grad_norm": 0.796875, + "learning_rate": 0.0001673535525190433, + "loss": 0.9108, + "step": 17271 + }, + { + "epoch": 0.44349652796166394, + "grad_norm": 0.7890625, + "learning_rate": 0.00016735025271558872, + "loss": 0.858, + "step": 17272 + }, + { + "epoch": 0.4435222051575858, + "grad_norm": 0.921875, + "learning_rate": 0.00016734695277791114, + "loss": 1.0643, + "step": 17273 + }, + { + "epoch": 0.44354788235350756, + "grad_norm": 0.7578125, + "learning_rate": 0.00016734365270601722, + "loss": 0.8611, + "step": 17274 + }, + { + "epoch": 0.4435735595494294, + "grad_norm": 0.8359375, + "learning_rate": 0.00016734035249991343, + "loss": 0.9345, + "step": 17275 + }, + { + "epoch": 0.44359923674535123, + "grad_norm": 0.78515625, + "learning_rate": 0.00016733705215960646, + "loss": 0.8506, + "step": 17276 + }, + { + "epoch": 0.44362491394127307, + "grad_norm": 0.76953125, + "learning_rate": 0.00016733375168510279, + "loss": 0.9892, + "step": 17277 + }, + { + "epoch": 0.44365059113719485, + "grad_norm": 0.78515625, + "learning_rate": 0.000167330451076409, + "loss": 0.8234, + "step": 17278 + }, + { + "epoch": 0.4436762683331167, + "grad_norm": 0.8515625, + "learning_rate": 0.00016732715033353178, + "loss": 0.8191, + "step": 17279 + }, + { + "epoch": 0.4437019455290385, + "grad_norm": 0.80859375, + "learning_rate": 0.00016732384945647758, + "loss": 0.9311, + "step": 17280 + }, + { + "epoch": 0.4437276227249603, + "grad_norm": 0.80859375, + "learning_rate": 0.00016732054844525304, + "loss": 0.9857, + "step": 17281 + }, + { + "epoch": 0.44375329992088214, + "grad_norm": 0.86328125, + "learning_rate": 0.00016731724729986475, + "loss": 1.0004, + "step": 17282 + }, + { + "epoch": 0.443778977116804, + "grad_norm": 0.8125, + "learning_rate": 0.00016731394602031925, + "loss": 0.9129, + "step": 17283 + }, + { + "epoch": 0.44380465431272575, + "grad_norm": 0.88671875, + "learning_rate": 0.00016731064460662315, + "loss": 0.963, + "step": 17284 + }, + { + "epoch": 0.4438303315086476, + "grad_norm": 0.83984375, + "learning_rate": 0.00016730734305878296, + "loss": 1.0104, + "step": 17285 + }, + { + "epoch": 0.4438560087045694, + "grad_norm": 0.8671875, + "learning_rate": 0.0001673040413768054, + "loss": 0.9615, + "step": 17286 + }, + { + "epoch": 0.44388168590049126, + "grad_norm": 0.84375, + "learning_rate": 0.0001673007395606969, + "loss": 0.8953, + "step": 17287 + }, + { + "epoch": 0.44390736309641304, + "grad_norm": 0.76171875, + "learning_rate": 0.00016729743761046412, + "loss": 1.0417, + "step": 17288 + }, + { + "epoch": 0.4439330402923349, + "grad_norm": 0.765625, + "learning_rate": 0.00016729413552611365, + "loss": 0.8521, + "step": 17289 + }, + { + "epoch": 0.4439587174882567, + "grad_norm": 0.796875, + "learning_rate": 0.000167290833307652, + "loss": 1.0041, + "step": 17290 + }, + { + "epoch": 0.4439843946841785, + "grad_norm": 0.765625, + "learning_rate": 0.00016728753095508582, + "loss": 0.801, + "step": 17291 + }, + { + "epoch": 0.44401007188010033, + "grad_norm": 0.8671875, + "learning_rate": 0.00016728422846842172, + "loss": 0.9387, + "step": 17292 + }, + { + "epoch": 0.44403574907602217, + "grad_norm": 0.7734375, + "learning_rate": 0.00016728092584766618, + "loss": 0.9187, + "step": 17293 + }, + { + "epoch": 0.44406142627194395, + "grad_norm": 0.7578125, + "learning_rate": 0.0001672776230928258, + "loss": 0.9093, + "step": 17294 + }, + { + "epoch": 0.4440871034678658, + "grad_norm": 0.8671875, + "learning_rate": 0.00016727432020390722, + "loss": 0.9523, + "step": 17295 + }, + { + "epoch": 0.4441127806637876, + "grad_norm": 0.73046875, + "learning_rate": 0.00016727101718091704, + "loss": 0.9077, + "step": 17296 + }, + { + "epoch": 0.44413845785970946, + "grad_norm": 0.91796875, + "learning_rate": 0.00016726771402386176, + "loss": 1.0164, + "step": 17297 + }, + { + "epoch": 0.44416413505563124, + "grad_norm": 0.7578125, + "learning_rate": 0.000167264410732748, + "loss": 0.9156, + "step": 17298 + }, + { + "epoch": 0.4441898122515531, + "grad_norm": 0.73046875, + "learning_rate": 0.00016726110730758237, + "loss": 0.9268, + "step": 17299 + }, + { + "epoch": 0.4442154894474749, + "grad_norm": 0.75, + "learning_rate": 0.0001672578037483714, + "loss": 0.8873, + "step": 17300 + }, + { + "epoch": 0.4442411666433967, + "grad_norm": 0.7265625, + "learning_rate": 0.00016725450005512172, + "loss": 0.8434, + "step": 17301 + }, + { + "epoch": 0.4442668438393185, + "grad_norm": 0.77734375, + "learning_rate": 0.0001672511962278399, + "loss": 1.0363, + "step": 17302 + }, + { + "epoch": 0.44429252103524036, + "grad_norm": 0.79296875, + "learning_rate": 0.00016724789226653253, + "loss": 0.8893, + "step": 17303 + }, + { + "epoch": 0.44431819823116214, + "grad_norm": 0.765625, + "learning_rate": 0.00016724458817120614, + "loss": 0.9905, + "step": 17304 + }, + { + "epoch": 0.444343875427084, + "grad_norm": 0.89453125, + "learning_rate": 0.00016724128394186743, + "loss": 0.9289, + "step": 17305 + }, + { + "epoch": 0.4443695526230058, + "grad_norm": 0.765625, + "learning_rate": 0.00016723797957852286, + "loss": 0.8989, + "step": 17306 + }, + { + "epoch": 0.44439522981892765, + "grad_norm": 0.703125, + "learning_rate": 0.00016723467508117907, + "loss": 0.8999, + "step": 17307 + }, + { + "epoch": 0.44442090701484943, + "grad_norm": 0.76171875, + "learning_rate": 0.00016723137044984268, + "loss": 0.8439, + "step": 17308 + }, + { + "epoch": 0.44444658421077127, + "grad_norm": 0.76953125, + "learning_rate": 0.00016722806568452022, + "loss": 0.8375, + "step": 17309 + }, + { + "epoch": 0.4444722614066931, + "grad_norm": 0.8046875, + "learning_rate": 0.0001672247607852183, + "loss": 0.8895, + "step": 17310 + }, + { + "epoch": 0.4444979386026149, + "grad_norm": 0.7734375, + "learning_rate": 0.0001672214557519435, + "loss": 0.9395, + "step": 17311 + }, + { + "epoch": 0.4445236157985367, + "grad_norm": 0.77734375, + "learning_rate": 0.0001672181505847024, + "loss": 0.8388, + "step": 17312 + }, + { + "epoch": 0.44454929299445856, + "grad_norm": 0.7421875, + "learning_rate": 0.00016721484528350162, + "loss": 0.9817, + "step": 17313 + }, + { + "epoch": 0.44457497019038034, + "grad_norm": 0.84765625, + "learning_rate": 0.00016721153984834773, + "loss": 0.9663, + "step": 17314 + }, + { + "epoch": 0.4446006473863022, + "grad_norm": 0.76171875, + "learning_rate": 0.00016720823427924728, + "loss": 0.8267, + "step": 17315 + }, + { + "epoch": 0.444626324582224, + "grad_norm": 0.7109375, + "learning_rate": 0.0001672049285762069, + "loss": 0.9395, + "step": 17316 + }, + { + "epoch": 0.44465200177814584, + "grad_norm": 0.734375, + "learning_rate": 0.0001672016227392332, + "loss": 0.9841, + "step": 17317 + }, + { + "epoch": 0.4446776789740676, + "grad_norm": 0.7890625, + "learning_rate": 0.0001671983167683327, + "loss": 0.9045, + "step": 17318 + }, + { + "epoch": 0.44470335616998946, + "grad_norm": 0.81640625, + "learning_rate": 0.00016719501066351204, + "loss": 1.0052, + "step": 17319 + }, + { + "epoch": 0.4447290333659113, + "grad_norm": 0.7265625, + "learning_rate": 0.0001671917044247778, + "loss": 0.8776, + "step": 17320 + }, + { + "epoch": 0.4447547105618331, + "grad_norm": 0.80078125, + "learning_rate": 0.00016718839805213653, + "loss": 0.899, + "step": 17321 + }, + { + "epoch": 0.4447803877577549, + "grad_norm": 0.82421875, + "learning_rate": 0.0001671850915455949, + "loss": 1.0239, + "step": 17322 + }, + { + "epoch": 0.44480606495367675, + "grad_norm": 0.8828125, + "learning_rate": 0.00016718178490515942, + "loss": 0.9659, + "step": 17323 + }, + { + "epoch": 0.44483174214959853, + "grad_norm": 0.81640625, + "learning_rate": 0.0001671784781308367, + "loss": 0.9451, + "step": 17324 + }, + { + "epoch": 0.44485741934552037, + "grad_norm": 0.8203125, + "learning_rate": 0.0001671751712226334, + "loss": 0.9423, + "step": 17325 + }, + { + "epoch": 0.4448830965414422, + "grad_norm": 0.74609375, + "learning_rate": 0.000167171864180556, + "loss": 0.8297, + "step": 17326 + }, + { + "epoch": 0.44490877373736404, + "grad_norm": 0.75, + "learning_rate": 0.00016716855700461115, + "loss": 0.82, + "step": 17327 + }, + { + "epoch": 0.4449344509332858, + "grad_norm": 0.8125, + "learning_rate": 0.00016716524969480543, + "loss": 0.9862, + "step": 17328 + }, + { + "epoch": 0.44496012812920765, + "grad_norm": 0.72265625, + "learning_rate": 0.00016716194225114544, + "loss": 0.8635, + "step": 17329 + }, + { + "epoch": 0.4449858053251295, + "grad_norm": 0.81640625, + "learning_rate": 0.00016715863467363774, + "loss": 1.0632, + "step": 17330 + }, + { + "epoch": 0.44501148252105127, + "grad_norm": 0.7265625, + "learning_rate": 0.000167155326962289, + "loss": 0.8672, + "step": 17331 + }, + { + "epoch": 0.4450371597169731, + "grad_norm": 0.73828125, + "learning_rate": 0.00016715201911710574, + "loss": 0.9999, + "step": 17332 + }, + { + "epoch": 0.44506283691289494, + "grad_norm": 0.82421875, + "learning_rate": 0.00016714871113809456, + "loss": 0.8877, + "step": 17333 + }, + { + "epoch": 0.4450885141088167, + "grad_norm": 0.78125, + "learning_rate": 0.00016714540302526206, + "loss": 0.8613, + "step": 17334 + }, + { + "epoch": 0.44511419130473856, + "grad_norm": 0.765625, + "learning_rate": 0.00016714209477861487, + "loss": 0.9514, + "step": 17335 + }, + { + "epoch": 0.4451398685006604, + "grad_norm": 0.75390625, + "learning_rate": 0.0001671387863981595, + "loss": 1.0146, + "step": 17336 + }, + { + "epoch": 0.44516554569658223, + "grad_norm": 0.74609375, + "learning_rate": 0.00016713547788390262, + "loss": 0.9381, + "step": 17337 + }, + { + "epoch": 0.445191222892504, + "grad_norm": 0.75390625, + "learning_rate": 0.00016713216923585082, + "loss": 0.8683, + "step": 17338 + }, + { + "epoch": 0.44521690008842585, + "grad_norm": 0.7890625, + "learning_rate": 0.00016712886045401064, + "loss": 0.9629, + "step": 17339 + }, + { + "epoch": 0.4452425772843477, + "grad_norm": 0.7578125, + "learning_rate": 0.0001671255515383887, + "loss": 0.918, + "step": 17340 + }, + { + "epoch": 0.44526825448026947, + "grad_norm": 0.796875, + "learning_rate": 0.00016712224248899163, + "loss": 1.016, + "step": 17341 + }, + { + "epoch": 0.4452939316761913, + "grad_norm": 0.86328125, + "learning_rate": 0.00016711893330582597, + "loss": 1.0301, + "step": 17342 + }, + { + "epoch": 0.44531960887211314, + "grad_norm": 0.7578125, + "learning_rate": 0.00016711562398889833, + "loss": 0.9162, + "step": 17343 + }, + { + "epoch": 0.4453452860680349, + "grad_norm": 0.76953125, + "learning_rate": 0.00016711231453821532, + "loss": 0.9773, + "step": 17344 + }, + { + "epoch": 0.44537096326395675, + "grad_norm": 0.7421875, + "learning_rate": 0.00016710900495378356, + "loss": 0.8804, + "step": 17345 + }, + { + "epoch": 0.4453966404598786, + "grad_norm": 0.76953125, + "learning_rate": 0.00016710569523560958, + "loss": 0.8832, + "step": 17346 + }, + { + "epoch": 0.4454223176558004, + "grad_norm": 0.97265625, + "learning_rate": 0.00016710238538370002, + "loss": 0.9355, + "step": 17347 + }, + { + "epoch": 0.4454479948517222, + "grad_norm": 0.78125, + "learning_rate": 0.0001670990753980615, + "loss": 0.8825, + "step": 17348 + }, + { + "epoch": 0.44547367204764404, + "grad_norm": 0.7265625, + "learning_rate": 0.00016709576527870054, + "loss": 0.9392, + "step": 17349 + }, + { + "epoch": 0.4454993492435659, + "grad_norm": 0.8125, + "learning_rate": 0.0001670924550256238, + "loss": 0.813, + "step": 17350 + }, + { + "epoch": 0.44552502643948766, + "grad_norm": 0.76953125, + "learning_rate": 0.00016708914463883784, + "loss": 0.8839, + "step": 17351 + }, + { + "epoch": 0.4455507036354095, + "grad_norm": 0.828125, + "learning_rate": 0.0001670858341183493, + "loss": 1.016, + "step": 17352 + }, + { + "epoch": 0.44557638083133133, + "grad_norm": 0.76171875, + "learning_rate": 0.00016708252346416474, + "loss": 0.919, + "step": 17353 + }, + { + "epoch": 0.4456020580272531, + "grad_norm": 0.8828125, + "learning_rate": 0.00016707921267629077, + "loss": 0.9729, + "step": 17354 + }, + { + "epoch": 0.44562773522317495, + "grad_norm": 0.80078125, + "learning_rate": 0.000167075901754734, + "loss": 1.0135, + "step": 17355 + }, + { + "epoch": 0.4456534124190968, + "grad_norm": 0.76953125, + "learning_rate": 0.000167072590699501, + "loss": 0.9641, + "step": 17356 + }, + { + "epoch": 0.44567908961501856, + "grad_norm": 0.83984375, + "learning_rate": 0.00016706927951059836, + "loss": 1.0827, + "step": 17357 + }, + { + "epoch": 0.4457047668109404, + "grad_norm": 0.80859375, + "learning_rate": 0.00016706596818803274, + "loss": 0.9224, + "step": 17358 + }, + { + "epoch": 0.44573044400686224, + "grad_norm": 0.88671875, + "learning_rate": 0.00016706265673181072, + "loss": 0.943, + "step": 17359 + }, + { + "epoch": 0.4457561212027841, + "grad_norm": 0.72265625, + "learning_rate": 0.00016705934514193887, + "loss": 0.8408, + "step": 17360 + }, + { + "epoch": 0.44578179839870585, + "grad_norm": 0.76171875, + "learning_rate": 0.00016705603341842377, + "loss": 0.939, + "step": 17361 + }, + { + "epoch": 0.4458074755946277, + "grad_norm": 0.98828125, + "learning_rate": 0.0001670527215612721, + "loss": 0.9136, + "step": 17362 + }, + { + "epoch": 0.4458331527905495, + "grad_norm": 0.8046875, + "learning_rate": 0.00016704940957049038, + "loss": 0.9161, + "step": 17363 + }, + { + "epoch": 0.4458588299864713, + "grad_norm": 0.83203125, + "learning_rate": 0.00016704609744608523, + "loss": 0.7499, + "step": 17364 + }, + { + "epoch": 0.44588450718239314, + "grad_norm": 0.73828125, + "learning_rate": 0.0001670427851880633, + "loss": 0.8825, + "step": 17365 + }, + { + "epoch": 0.445910184378315, + "grad_norm": 0.76953125, + "learning_rate": 0.00016703947279643114, + "loss": 0.8898, + "step": 17366 + }, + { + "epoch": 0.44593586157423676, + "grad_norm": 0.84375, + "learning_rate": 0.00016703616027119537, + "loss": 0.8526, + "step": 17367 + }, + { + "epoch": 0.4459615387701586, + "grad_norm": 0.73046875, + "learning_rate": 0.0001670328476123626, + "loss": 0.8046, + "step": 17368 + }, + { + "epoch": 0.44598721596608043, + "grad_norm": 0.79296875, + "learning_rate": 0.00016702953481993938, + "loss": 0.9355, + "step": 17369 + }, + { + "epoch": 0.44601289316200227, + "grad_norm": 0.765625, + "learning_rate": 0.00016702622189393237, + "loss": 1.0455, + "step": 17370 + }, + { + "epoch": 0.44603857035792405, + "grad_norm": 0.82421875, + "learning_rate": 0.00016702290883434818, + "loss": 0.847, + "step": 17371 + }, + { + "epoch": 0.4460642475538459, + "grad_norm": 0.76953125, + "learning_rate": 0.00016701959564119337, + "loss": 0.89, + "step": 17372 + }, + { + "epoch": 0.4460899247497677, + "grad_norm": 0.80859375, + "learning_rate": 0.00016701628231447456, + "loss": 0.8286, + "step": 17373 + }, + { + "epoch": 0.4461156019456895, + "grad_norm": 0.8671875, + "learning_rate": 0.00016701296885419836, + "loss": 1.0059, + "step": 17374 + }, + { + "epoch": 0.44614127914161134, + "grad_norm": 0.78515625, + "learning_rate": 0.00016700965526037135, + "loss": 0.8661, + "step": 17375 + }, + { + "epoch": 0.44616695633753317, + "grad_norm": 0.78515625, + "learning_rate": 0.00016700634153300016, + "loss": 0.9797, + "step": 17376 + }, + { + "epoch": 0.44619263353345495, + "grad_norm": 0.73828125, + "learning_rate": 0.00016700302767209138, + "loss": 0.9698, + "step": 17377 + }, + { + "epoch": 0.4462183107293768, + "grad_norm": 0.75390625, + "learning_rate": 0.00016699971367765165, + "loss": 0.818, + "step": 17378 + }, + { + "epoch": 0.4462439879252986, + "grad_norm": 0.7421875, + "learning_rate": 0.00016699639954968748, + "loss": 0.8321, + "step": 17379 + }, + { + "epoch": 0.44626966512122046, + "grad_norm": 0.76171875, + "learning_rate": 0.00016699308528820558, + "loss": 0.894, + "step": 17380 + }, + { + "epoch": 0.44629534231714224, + "grad_norm": 0.734375, + "learning_rate": 0.00016698977089321247, + "loss": 0.8879, + "step": 17381 + }, + { + "epoch": 0.4463210195130641, + "grad_norm": 0.71875, + "learning_rate": 0.00016698645636471486, + "loss": 0.884, + "step": 17382 + }, + { + "epoch": 0.4463466967089859, + "grad_norm": 0.890625, + "learning_rate": 0.00016698314170271927, + "loss": 0.8765, + "step": 17383 + }, + { + "epoch": 0.4463723739049077, + "grad_norm": 0.73828125, + "learning_rate": 0.00016697982690723232, + "loss": 0.8216, + "step": 17384 + }, + { + "epoch": 0.44639805110082953, + "grad_norm": 0.72265625, + "learning_rate": 0.00016697651197826062, + "loss": 0.8121, + "step": 17385 + }, + { + "epoch": 0.44642372829675137, + "grad_norm": 0.7578125, + "learning_rate": 0.00016697319691581078, + "loss": 0.9128, + "step": 17386 + }, + { + "epoch": 0.44644940549267315, + "grad_norm": 0.75, + "learning_rate": 0.00016696988171988942, + "loss": 0.9747, + "step": 17387 + }, + { + "epoch": 0.446475082688595, + "grad_norm": 0.8984375, + "learning_rate": 0.00016696656639050316, + "loss": 0.9878, + "step": 17388 + }, + { + "epoch": 0.4465007598845168, + "grad_norm": 0.75, + "learning_rate": 0.00016696325092765854, + "loss": 0.921, + "step": 17389 + }, + { + "epoch": 0.44652643708043865, + "grad_norm": 0.91015625, + "learning_rate": 0.00016695993533136222, + "loss": 1.0408, + "step": 17390 + }, + { + "epoch": 0.44655211427636043, + "grad_norm": 0.73046875, + "learning_rate": 0.0001669566196016208, + "loss": 0.8997, + "step": 17391 + }, + { + "epoch": 0.44657779147228227, + "grad_norm": 0.7890625, + "learning_rate": 0.00016695330373844088, + "loss": 0.8817, + "step": 17392 + }, + { + "epoch": 0.4466034686682041, + "grad_norm": 0.79296875, + "learning_rate": 0.00016694998774182911, + "loss": 0.8232, + "step": 17393 + }, + { + "epoch": 0.4466291458641259, + "grad_norm": 0.8203125, + "learning_rate": 0.00016694667161179202, + "loss": 0.796, + "step": 17394 + }, + { + "epoch": 0.4466548230600477, + "grad_norm": 0.87109375, + "learning_rate": 0.00016694335534833625, + "loss": 0.9811, + "step": 17395 + }, + { + "epoch": 0.44668050025596956, + "grad_norm": 0.890625, + "learning_rate": 0.00016694003895146846, + "loss": 0.9327, + "step": 17396 + }, + { + "epoch": 0.44670617745189134, + "grad_norm": 0.71875, + "learning_rate": 0.0001669367224211952, + "loss": 0.8569, + "step": 17397 + }, + { + "epoch": 0.4467318546478132, + "grad_norm": 0.78515625, + "learning_rate": 0.0001669334057575231, + "loss": 0.911, + "step": 17398 + }, + { + "epoch": 0.446757531843735, + "grad_norm": 0.765625, + "learning_rate": 0.00016693008896045877, + "loss": 0.9059, + "step": 17399 + }, + { + "epoch": 0.44678320903965685, + "grad_norm": 0.77734375, + "learning_rate": 0.00016692677203000882, + "loss": 1.0439, + "step": 17400 + }, + { + "epoch": 0.44680888623557863, + "grad_norm": 0.828125, + "learning_rate": 0.00016692345496617988, + "loss": 1.0366, + "step": 17401 + }, + { + "epoch": 0.44683456343150046, + "grad_norm": 0.75390625, + "learning_rate": 0.0001669201377689785, + "loss": 0.9302, + "step": 17402 + }, + { + "epoch": 0.4468602406274223, + "grad_norm": 0.8203125, + "learning_rate": 0.00016691682043841133, + "loss": 1.0343, + "step": 17403 + }, + { + "epoch": 0.4468859178233441, + "grad_norm": 0.76171875, + "learning_rate": 0.000166913502974485, + "loss": 0.8937, + "step": 17404 + }, + { + "epoch": 0.4469115950192659, + "grad_norm": 0.75390625, + "learning_rate": 0.0001669101853772061, + "loss": 0.9834, + "step": 17405 + }, + { + "epoch": 0.44693727221518775, + "grad_norm": 0.76953125, + "learning_rate": 0.00016690686764658127, + "loss": 1.0659, + "step": 17406 + }, + { + "epoch": 0.44696294941110953, + "grad_norm": 0.76953125, + "learning_rate": 0.00016690354978261707, + "loss": 0.9534, + "step": 17407 + }, + { + "epoch": 0.44698862660703137, + "grad_norm": 0.80859375, + "learning_rate": 0.00016690023178532015, + "loss": 0.8759, + "step": 17408 + }, + { + "epoch": 0.4470143038029532, + "grad_norm": 0.78515625, + "learning_rate": 0.00016689691365469707, + "loss": 0.8298, + "step": 17409 + }, + { + "epoch": 0.44703998099887504, + "grad_norm": 0.83984375, + "learning_rate": 0.00016689359539075454, + "loss": 0.9914, + "step": 17410 + }, + { + "epoch": 0.4470656581947968, + "grad_norm": 0.7890625, + "learning_rate": 0.0001668902769934991, + "loss": 0.924, + "step": 17411 + }, + { + "epoch": 0.44709133539071866, + "grad_norm": 0.82421875, + "learning_rate": 0.00016688695846293734, + "loss": 1.0692, + "step": 17412 + }, + { + "epoch": 0.4471170125866405, + "grad_norm": 0.765625, + "learning_rate": 0.00016688363979907596, + "loss": 1.0369, + "step": 17413 + }, + { + "epoch": 0.4471426897825623, + "grad_norm": 0.78515625, + "learning_rate": 0.0001668803210019215, + "loss": 0.94, + "step": 17414 + }, + { + "epoch": 0.4471683669784841, + "grad_norm": 0.73828125, + "learning_rate": 0.00016687700207148064, + "loss": 0.8941, + "step": 17415 + }, + { + "epoch": 0.44719404417440595, + "grad_norm": 0.77734375, + "learning_rate": 0.00016687368300775991, + "loss": 0.8551, + "step": 17416 + }, + { + "epoch": 0.4472197213703277, + "grad_norm": 0.72265625, + "learning_rate": 0.000166870363810766, + "loss": 0.8384, + "step": 17417 + }, + { + "epoch": 0.44724539856624956, + "grad_norm": 0.73828125, + "learning_rate": 0.0001668670444805055, + "loss": 0.9556, + "step": 17418 + }, + { + "epoch": 0.4472710757621714, + "grad_norm": 0.8203125, + "learning_rate": 0.00016686372501698498, + "loss": 0.9915, + "step": 17419 + }, + { + "epoch": 0.44729675295809324, + "grad_norm": 0.7734375, + "learning_rate": 0.00016686040542021114, + "loss": 0.8888, + "step": 17420 + }, + { + "epoch": 0.447322430154015, + "grad_norm": 0.75390625, + "learning_rate": 0.00016685708569019053, + "loss": 0.9015, + "step": 17421 + }, + { + "epoch": 0.44734810734993685, + "grad_norm": 0.84375, + "learning_rate": 0.0001668537658269298, + "loss": 0.9643, + "step": 17422 + }, + { + "epoch": 0.4473737845458587, + "grad_norm": 0.8125, + "learning_rate": 0.00016685044583043555, + "loss": 0.9482, + "step": 17423 + }, + { + "epoch": 0.44739946174178047, + "grad_norm": 0.828125, + "learning_rate": 0.0001668471257007144, + "loss": 1.2293, + "step": 17424 + }, + { + "epoch": 0.4474251389377023, + "grad_norm": 0.75, + "learning_rate": 0.00016684380543777297, + "loss": 0.8321, + "step": 17425 + }, + { + "epoch": 0.44745081613362414, + "grad_norm": 0.7265625, + "learning_rate": 0.00016684048504161784, + "loss": 0.7866, + "step": 17426 + }, + { + "epoch": 0.4474764933295459, + "grad_norm": 0.76171875, + "learning_rate": 0.00016683716451225568, + "loss": 0.8977, + "step": 17427 + }, + { + "epoch": 0.44750217052546776, + "grad_norm": 0.7265625, + "learning_rate": 0.0001668338438496931, + "loss": 0.8961, + "step": 17428 + }, + { + "epoch": 0.4475278477213896, + "grad_norm": 0.77734375, + "learning_rate": 0.0001668305230539367, + "loss": 0.9631, + "step": 17429 + }, + { + "epoch": 0.44755352491731143, + "grad_norm": 0.7421875, + "learning_rate": 0.00016682720212499313, + "loss": 0.9392, + "step": 17430 + }, + { + "epoch": 0.4475792021132332, + "grad_norm": 0.7578125, + "learning_rate": 0.00016682388106286896, + "loss": 0.859, + "step": 17431 + }, + { + "epoch": 0.44760487930915505, + "grad_norm": 0.90234375, + "learning_rate": 0.00016682055986757082, + "loss": 0.9555, + "step": 17432 + }, + { + "epoch": 0.4476305565050769, + "grad_norm": 0.91015625, + "learning_rate": 0.00016681723853910536, + "loss": 0.7861, + "step": 17433 + }, + { + "epoch": 0.44765623370099866, + "grad_norm": 0.734375, + "learning_rate": 0.00016681391707747912, + "loss": 1.0315, + "step": 17434 + }, + { + "epoch": 0.4476819108969205, + "grad_norm": 0.8046875, + "learning_rate": 0.00016681059548269883, + "loss": 0.9689, + "step": 17435 + }, + { + "epoch": 0.44770758809284233, + "grad_norm": 0.73828125, + "learning_rate": 0.00016680727375477107, + "loss": 0.8198, + "step": 17436 + }, + { + "epoch": 0.4477332652887641, + "grad_norm": 0.8515625, + "learning_rate": 0.00016680395189370246, + "loss": 1.0193, + "step": 17437 + }, + { + "epoch": 0.44775894248468595, + "grad_norm": 0.72265625, + "learning_rate": 0.00016680062989949957, + "loss": 0.8962, + "step": 17438 + }, + { + "epoch": 0.4477846196806078, + "grad_norm": 0.765625, + "learning_rate": 0.00016679730777216907, + "loss": 0.8146, + "step": 17439 + }, + { + "epoch": 0.4478102968765296, + "grad_norm": 0.765625, + "learning_rate": 0.00016679398551171758, + "loss": 0.8504, + "step": 17440 + }, + { + "epoch": 0.4478359740724514, + "grad_norm": 0.77734375, + "learning_rate": 0.0001667906631181517, + "loss": 1.0039, + "step": 17441 + }, + { + "epoch": 0.44786165126837324, + "grad_norm": 0.78515625, + "learning_rate": 0.00016678734059147804, + "loss": 0.9287, + "step": 17442 + }, + { + "epoch": 0.4478873284642951, + "grad_norm": 0.765625, + "learning_rate": 0.00016678401793170328, + "loss": 0.8571, + "step": 17443 + }, + { + "epoch": 0.44791300566021686, + "grad_norm": 0.8203125, + "learning_rate": 0.000166780695138834, + "loss": 0.9256, + "step": 17444 + }, + { + "epoch": 0.4479386828561387, + "grad_norm": 0.7109375, + "learning_rate": 0.00016677737221287682, + "loss": 0.898, + "step": 17445 + }, + { + "epoch": 0.44796436005206053, + "grad_norm": 0.8203125, + "learning_rate": 0.00016677404915383837, + "loss": 0.9078, + "step": 17446 + }, + { + "epoch": 0.4479900372479823, + "grad_norm": 0.80078125, + "learning_rate": 0.00016677072596172525, + "loss": 0.8905, + "step": 17447 + }, + { + "epoch": 0.44801571444390415, + "grad_norm": 0.89453125, + "learning_rate": 0.00016676740263654413, + "loss": 0.8837, + "step": 17448 + }, + { + "epoch": 0.448041391639826, + "grad_norm": 0.76953125, + "learning_rate": 0.00016676407917830157, + "loss": 0.851, + "step": 17449 + }, + { + "epoch": 0.4480670688357478, + "grad_norm": 0.74609375, + "learning_rate": 0.00016676075558700427, + "loss": 0.8816, + "step": 17450 + }, + { + "epoch": 0.4480927460316696, + "grad_norm": 0.73828125, + "learning_rate": 0.00016675743186265882, + "loss": 0.8752, + "step": 17451 + }, + { + "epoch": 0.44811842322759143, + "grad_norm": 0.7890625, + "learning_rate": 0.00016675410800527182, + "loss": 1.0133, + "step": 17452 + }, + { + "epoch": 0.44814410042351327, + "grad_norm": 0.83203125, + "learning_rate": 0.0001667507840148499, + "loss": 1.0337, + "step": 17453 + }, + { + "epoch": 0.44816977761943505, + "grad_norm": 0.8046875, + "learning_rate": 0.00016674745989139972, + "loss": 0.9074, + "step": 17454 + }, + { + "epoch": 0.4481954548153569, + "grad_norm": 0.74609375, + "learning_rate": 0.00016674413563492788, + "loss": 1.0401, + "step": 17455 + }, + { + "epoch": 0.4482211320112787, + "grad_norm": 0.83203125, + "learning_rate": 0.00016674081124544101, + "loss": 0.8992, + "step": 17456 + }, + { + "epoch": 0.4482468092072005, + "grad_norm": 0.79296875, + "learning_rate": 0.00016673748672294575, + "loss": 0.9206, + "step": 17457 + }, + { + "epoch": 0.44827248640312234, + "grad_norm": 0.76953125, + "learning_rate": 0.00016673416206744866, + "loss": 0.8445, + "step": 17458 + }, + { + "epoch": 0.4482981635990442, + "grad_norm": 0.81640625, + "learning_rate": 0.00016673083727895642, + "loss": 0.9723, + "step": 17459 + }, + { + "epoch": 0.448323840794966, + "grad_norm": 0.78125, + "learning_rate": 0.00016672751235747568, + "loss": 0.9823, + "step": 17460 + }, + { + "epoch": 0.4483495179908878, + "grad_norm": 0.80078125, + "learning_rate": 0.000166724187303013, + "loss": 0.9295, + "step": 17461 + }, + { + "epoch": 0.4483751951868096, + "grad_norm": 0.79296875, + "learning_rate": 0.00016672086211557506, + "loss": 0.815, + "step": 17462 + }, + { + "epoch": 0.44840087238273146, + "grad_norm": 0.80078125, + "learning_rate": 0.0001667175367951685, + "loss": 0.8818, + "step": 17463 + }, + { + "epoch": 0.44842654957865324, + "grad_norm": 0.76171875, + "learning_rate": 0.00016671421134179987, + "loss": 0.9444, + "step": 17464 + }, + { + "epoch": 0.4484522267745751, + "grad_norm": 0.77734375, + "learning_rate": 0.00016671088575547586, + "loss": 1.0462, + "step": 17465 + }, + { + "epoch": 0.4484779039704969, + "grad_norm": 0.8125, + "learning_rate": 0.00016670756003620308, + "loss": 0.8652, + "step": 17466 + }, + { + "epoch": 0.4485035811664187, + "grad_norm": 0.75, + "learning_rate": 0.00016670423418398814, + "loss": 0.9186, + "step": 17467 + }, + { + "epoch": 0.44852925836234053, + "grad_norm": 0.69921875, + "learning_rate": 0.00016670090819883774, + "loss": 0.9146, + "step": 17468 + }, + { + "epoch": 0.44855493555826237, + "grad_norm": 0.765625, + "learning_rate": 0.0001666975820807584, + "loss": 0.9472, + "step": 17469 + }, + { + "epoch": 0.4485806127541842, + "grad_norm": 0.8359375, + "learning_rate": 0.00016669425582975682, + "loss": 0.9897, + "step": 17470 + }, + { + "epoch": 0.448606289950106, + "grad_norm": 0.8515625, + "learning_rate": 0.0001666909294458396, + "loss": 0.961, + "step": 17471 + }, + { + "epoch": 0.4486319671460278, + "grad_norm": 0.79296875, + "learning_rate": 0.0001666876029290134, + "loss": 1.0614, + "step": 17472 + }, + { + "epoch": 0.44865764434194966, + "grad_norm": 0.76171875, + "learning_rate": 0.00016668427627928486, + "loss": 0.857, + "step": 17473 + }, + { + "epoch": 0.44868332153787144, + "grad_norm": 0.8046875, + "learning_rate": 0.00016668094949666052, + "loss": 0.899, + "step": 17474 + }, + { + "epoch": 0.4487089987337933, + "grad_norm": 0.8828125, + "learning_rate": 0.0001666776225811471, + "loss": 0.9016, + "step": 17475 + }, + { + "epoch": 0.4487346759297151, + "grad_norm": 0.8046875, + "learning_rate": 0.0001666742955327512, + "loss": 0.9146, + "step": 17476 + }, + { + "epoch": 0.4487603531256369, + "grad_norm": 0.7890625, + "learning_rate": 0.00016667096835147946, + "loss": 0.8966, + "step": 17477 + }, + { + "epoch": 0.4487860303215587, + "grad_norm": 0.8515625, + "learning_rate": 0.0001666676410373385, + "loss": 0.9619, + "step": 17478 + }, + { + "epoch": 0.44881170751748056, + "grad_norm": 0.8359375, + "learning_rate": 0.00016666431359033492, + "loss": 0.9453, + "step": 17479 + }, + { + "epoch": 0.4488373847134024, + "grad_norm": 0.80859375, + "learning_rate": 0.00016666098601047545, + "loss": 0.9385, + "step": 17480 + }, + { + "epoch": 0.4488630619093242, + "grad_norm": 0.83984375, + "learning_rate": 0.00016665765829776657, + "loss": 1.0333, + "step": 17481 + }, + { + "epoch": 0.448888739105246, + "grad_norm": 0.70703125, + "learning_rate": 0.00016665433045221506, + "loss": 0.9421, + "step": 17482 + }, + { + "epoch": 0.44891441630116785, + "grad_norm": 0.73046875, + "learning_rate": 0.0001666510024738275, + "loss": 0.9245, + "step": 17483 + }, + { + "epoch": 0.44894009349708963, + "grad_norm": 0.78515625, + "learning_rate": 0.00016664767436261048, + "loss": 1.0749, + "step": 17484 + }, + { + "epoch": 0.44896577069301147, + "grad_norm": 0.77734375, + "learning_rate": 0.00016664434611857068, + "loss": 0.9961, + "step": 17485 + }, + { + "epoch": 0.4489914478889333, + "grad_norm": 0.7265625, + "learning_rate": 0.00016664101774171473, + "loss": 0.8454, + "step": 17486 + }, + { + "epoch": 0.4490171250848551, + "grad_norm": 0.78515625, + "learning_rate": 0.00016663768923204922, + "loss": 0.9439, + "step": 17487 + }, + { + "epoch": 0.4490428022807769, + "grad_norm": 0.82421875, + "learning_rate": 0.00016663436058958082, + "loss": 0.9502, + "step": 17488 + }, + { + "epoch": 0.44906847947669876, + "grad_norm": 0.7734375, + "learning_rate": 0.0001666310318143162, + "loss": 0.8834, + "step": 17489 + }, + { + "epoch": 0.4490941566726206, + "grad_norm": 0.71484375, + "learning_rate": 0.0001666277029062619, + "loss": 0.798, + "step": 17490 + }, + { + "epoch": 0.4491198338685424, + "grad_norm": 0.87890625, + "learning_rate": 0.00016662437386542463, + "loss": 0.8848, + "step": 17491 + }, + { + "epoch": 0.4491455110644642, + "grad_norm": 0.7734375, + "learning_rate": 0.000166621044691811, + "loss": 0.837, + "step": 17492 + }, + { + "epoch": 0.44917118826038605, + "grad_norm": 0.77734375, + "learning_rate": 0.00016661771538542763, + "loss": 0.9056, + "step": 17493 + }, + { + "epoch": 0.4491968654563078, + "grad_norm": 0.83203125, + "learning_rate": 0.00016661438594628121, + "loss": 0.836, + "step": 17494 + }, + { + "epoch": 0.44922254265222966, + "grad_norm": 0.78125, + "learning_rate": 0.0001666110563743783, + "loss": 0.8096, + "step": 17495 + }, + { + "epoch": 0.4492482198481515, + "grad_norm": 0.88671875, + "learning_rate": 0.0001666077266697256, + "loss": 1.0932, + "step": 17496 + }, + { + "epoch": 0.4492738970440733, + "grad_norm": 0.765625, + "learning_rate": 0.00016660439683232968, + "loss": 0.9332, + "step": 17497 + }, + { + "epoch": 0.4492995742399951, + "grad_norm": 0.6640625, + "learning_rate": 0.00016660106686219726, + "loss": 0.8195, + "step": 17498 + }, + { + "epoch": 0.44932525143591695, + "grad_norm": 0.77734375, + "learning_rate": 0.00016659773675933487, + "loss": 0.9367, + "step": 17499 + }, + { + "epoch": 0.4493509286318388, + "grad_norm": 0.765625, + "learning_rate": 0.00016659440652374926, + "loss": 0.835, + "step": 17500 + }, + { + "epoch": 0.44937660582776057, + "grad_norm": 0.734375, + "learning_rate": 0.00016659107615544697, + "loss": 1.0437, + "step": 17501 + }, + { + "epoch": 0.4494022830236824, + "grad_norm": 0.7421875, + "learning_rate": 0.0001665877456544347, + "loss": 0.8661, + "step": 17502 + }, + { + "epoch": 0.44942796021960424, + "grad_norm": 0.796875, + "learning_rate": 0.00016658441502071907, + "loss": 0.8886, + "step": 17503 + }, + { + "epoch": 0.449453637415526, + "grad_norm": 0.78125, + "learning_rate": 0.00016658108425430675, + "loss": 0.8452, + "step": 17504 + }, + { + "epoch": 0.44947931461144786, + "grad_norm": 0.80859375, + "learning_rate": 0.00016657775335520428, + "loss": 0.9821, + "step": 17505 + }, + { + "epoch": 0.4495049918073697, + "grad_norm": 0.83203125, + "learning_rate": 0.0001665744223234184, + "loss": 0.8507, + "step": 17506 + }, + { + "epoch": 0.44953066900329147, + "grad_norm": 0.7734375, + "learning_rate": 0.0001665710911589557, + "loss": 0.9675, + "step": 17507 + }, + { + "epoch": 0.4495563461992133, + "grad_norm": 0.828125, + "learning_rate": 0.00016656775986182283, + "loss": 0.9761, + "step": 17508 + }, + { + "epoch": 0.44958202339513514, + "grad_norm": 0.7890625, + "learning_rate": 0.00016656442843202641, + "loss": 0.8624, + "step": 17509 + }, + { + "epoch": 0.449607700591057, + "grad_norm": 0.7890625, + "learning_rate": 0.00016656109686957312, + "loss": 0.8585, + "step": 17510 + }, + { + "epoch": 0.44963337778697876, + "grad_norm": 0.77734375, + "learning_rate": 0.0001665577651744696, + "loss": 0.8462, + "step": 17511 + }, + { + "epoch": 0.4496590549829006, + "grad_norm": 0.8203125, + "learning_rate": 0.00016655443334672243, + "loss": 0.8875, + "step": 17512 + }, + { + "epoch": 0.44968473217882243, + "grad_norm": 0.78125, + "learning_rate": 0.0001665511013863383, + "loss": 0.9251, + "step": 17513 + }, + { + "epoch": 0.4497104093747442, + "grad_norm": 0.79296875, + "learning_rate": 0.0001665477692933238, + "loss": 0.8987, + "step": 17514 + }, + { + "epoch": 0.44973608657066605, + "grad_norm": 0.78125, + "learning_rate": 0.00016654443706768564, + "loss": 0.8962, + "step": 17515 + }, + { + "epoch": 0.4497617637665879, + "grad_norm": 0.78125, + "learning_rate": 0.00016654110470943042, + "loss": 0.8651, + "step": 17516 + }, + { + "epoch": 0.44978744096250967, + "grad_norm": 0.76953125, + "learning_rate": 0.0001665377722185648, + "loss": 0.8838, + "step": 17517 + }, + { + "epoch": 0.4498131181584315, + "grad_norm": 0.80078125, + "learning_rate": 0.00016653443959509543, + "loss": 0.8693, + "step": 17518 + }, + { + "epoch": 0.44983879535435334, + "grad_norm": 0.828125, + "learning_rate": 0.0001665311068390289, + "loss": 0.8701, + "step": 17519 + }, + { + "epoch": 0.4498644725502752, + "grad_norm": 0.8671875, + "learning_rate": 0.0001665277739503719, + "loss": 0.9804, + "step": 17520 + }, + { + "epoch": 0.44989014974619695, + "grad_norm": 0.7265625, + "learning_rate": 0.00016652444092913102, + "loss": 0.8672, + "step": 17521 + }, + { + "epoch": 0.4499158269421188, + "grad_norm": 0.87109375, + "learning_rate": 0.00016652110777531297, + "loss": 0.9107, + "step": 17522 + }, + { + "epoch": 0.4499415041380406, + "grad_norm": 0.79296875, + "learning_rate": 0.00016651777448892436, + "loss": 0.835, + "step": 17523 + }, + { + "epoch": 0.4499671813339624, + "grad_norm": 0.70703125, + "learning_rate": 0.00016651444106997182, + "loss": 0.9473, + "step": 17524 + }, + { + "epoch": 0.44999285852988424, + "grad_norm": 0.7265625, + "learning_rate": 0.000166511107518462, + "loss": 0.9799, + "step": 17525 + }, + { + "epoch": 0.4500185357258061, + "grad_norm": 0.7265625, + "learning_rate": 0.0001665077738344016, + "loss": 0.8873, + "step": 17526 + }, + { + "epoch": 0.45004421292172786, + "grad_norm": 0.7734375, + "learning_rate": 0.00016650444001779717, + "loss": 0.7874, + "step": 17527 + }, + { + "epoch": 0.4500698901176497, + "grad_norm": 0.75390625, + "learning_rate": 0.00016650110606865538, + "loss": 1.0092, + "step": 17528 + }, + { + "epoch": 0.45009556731357153, + "grad_norm": 0.7265625, + "learning_rate": 0.00016649777198698295, + "loss": 0.8894, + "step": 17529 + }, + { + "epoch": 0.45012124450949337, + "grad_norm": 0.79296875, + "learning_rate": 0.00016649443777278644, + "loss": 0.9421, + "step": 17530 + }, + { + "epoch": 0.45014692170541515, + "grad_norm": 0.7421875, + "learning_rate": 0.00016649110342607253, + "loss": 0.8591, + "step": 17531 + }, + { + "epoch": 0.450172598901337, + "grad_norm": 0.7890625, + "learning_rate": 0.00016648776894684784, + "loss": 0.9895, + "step": 17532 + }, + { + "epoch": 0.4501982760972588, + "grad_norm": 0.94140625, + "learning_rate": 0.00016648443433511903, + "loss": 1.0488, + "step": 17533 + }, + { + "epoch": 0.4502239532931806, + "grad_norm": 0.76171875, + "learning_rate": 0.00016648109959089275, + "loss": 0.818, + "step": 17534 + }, + { + "epoch": 0.45024963048910244, + "grad_norm": 0.8046875, + "learning_rate": 0.00016647776471417566, + "loss": 0.9581, + "step": 17535 + }, + { + "epoch": 0.4502753076850243, + "grad_norm": 0.79296875, + "learning_rate": 0.0001664744297049744, + "loss": 0.9791, + "step": 17536 + }, + { + "epoch": 0.45030098488094605, + "grad_norm": 0.76953125, + "learning_rate": 0.0001664710945632956, + "loss": 1.0532, + "step": 17537 + }, + { + "epoch": 0.4503266620768679, + "grad_norm": 0.83984375, + "learning_rate": 0.00016646775928914588, + "loss": 0.8672, + "step": 17538 + }, + { + "epoch": 0.4503523392727897, + "grad_norm": 0.90234375, + "learning_rate": 0.00016646442388253195, + "loss": 0.914, + "step": 17539 + }, + { + "epoch": 0.45037801646871156, + "grad_norm": 0.859375, + "learning_rate": 0.00016646108834346039, + "loss": 0.8909, + "step": 17540 + }, + { + "epoch": 0.45040369366463334, + "grad_norm": 0.78125, + "learning_rate": 0.00016645775267193791, + "loss": 0.7857, + "step": 17541 + }, + { + "epoch": 0.4504293708605552, + "grad_norm": 0.80859375, + "learning_rate": 0.00016645441686797118, + "loss": 0.9525, + "step": 17542 + }, + { + "epoch": 0.450455048056477, + "grad_norm": 0.73828125, + "learning_rate": 0.00016645108093156673, + "loss": 0.8472, + "step": 17543 + }, + { + "epoch": 0.4504807252523988, + "grad_norm": 0.80078125, + "learning_rate": 0.0001664477448627313, + "loss": 1.0138, + "step": 17544 + }, + { + "epoch": 0.45050640244832063, + "grad_norm": 0.8046875, + "learning_rate": 0.00016644440866147152, + "loss": 0.9353, + "step": 17545 + }, + { + "epoch": 0.45053207964424247, + "grad_norm": 0.7421875, + "learning_rate": 0.00016644107232779404, + "loss": 0.8364, + "step": 17546 + }, + { + "epoch": 0.45055775684016425, + "grad_norm": 0.8046875, + "learning_rate": 0.0001664377358617055, + "loss": 0.9201, + "step": 17547 + }, + { + "epoch": 0.4505834340360861, + "grad_norm": 0.859375, + "learning_rate": 0.00016643439926321252, + "loss": 0.8449, + "step": 17548 + }, + { + "epoch": 0.4506091112320079, + "grad_norm": 0.78125, + "learning_rate": 0.00016643106253232186, + "loss": 0.8976, + "step": 17549 + }, + { + "epoch": 0.45063478842792976, + "grad_norm": 0.765625, + "learning_rate": 0.00016642772566904005, + "loss": 0.9293, + "step": 17550 + }, + { + "epoch": 0.45066046562385154, + "grad_norm": 0.81640625, + "learning_rate": 0.00016642438867337376, + "loss": 1.0241, + "step": 17551 + }, + { + "epoch": 0.45068614281977337, + "grad_norm": 0.77734375, + "learning_rate": 0.00016642105154532968, + "loss": 0.9351, + "step": 17552 + }, + { + "epoch": 0.4507118200156952, + "grad_norm": 0.83984375, + "learning_rate": 0.00016641771428491445, + "loss": 1.0073, + "step": 17553 + }, + { + "epoch": 0.450737497211617, + "grad_norm": 0.80859375, + "learning_rate": 0.00016641437689213473, + "loss": 0.914, + "step": 17554 + }, + { + "epoch": 0.4507631744075388, + "grad_norm": 0.796875, + "learning_rate": 0.00016641103936699715, + "loss": 0.8835, + "step": 17555 + }, + { + "epoch": 0.45078885160346066, + "grad_norm": 0.81640625, + "learning_rate": 0.00016640770170950835, + "loss": 0.9406, + "step": 17556 + }, + { + "epoch": 0.45081452879938244, + "grad_norm": 0.78515625, + "learning_rate": 0.000166404363919675, + "loss": 0.8535, + "step": 17557 + }, + { + "epoch": 0.4508402059953043, + "grad_norm": 0.75390625, + "learning_rate": 0.00016640102599750376, + "loss": 0.8612, + "step": 17558 + }, + { + "epoch": 0.4508658831912261, + "grad_norm": 0.76171875, + "learning_rate": 0.00016639768794300127, + "loss": 0.8882, + "step": 17559 + }, + { + "epoch": 0.4508915603871479, + "grad_norm": 0.79296875, + "learning_rate": 0.00016639434975617418, + "loss": 0.9975, + "step": 17560 + }, + { + "epoch": 0.45091723758306973, + "grad_norm": 0.8203125, + "learning_rate": 0.00016639101143702917, + "loss": 0.8288, + "step": 17561 + }, + { + "epoch": 0.45094291477899157, + "grad_norm": 0.69140625, + "learning_rate": 0.00016638767298557286, + "loss": 0.852, + "step": 17562 + }, + { + "epoch": 0.4509685919749134, + "grad_norm": 0.74609375, + "learning_rate": 0.0001663843344018119, + "loss": 0.8584, + "step": 17563 + }, + { + "epoch": 0.4509942691708352, + "grad_norm": 0.76171875, + "learning_rate": 0.00016638099568575297, + "loss": 0.8696, + "step": 17564 + }, + { + "epoch": 0.451019946366757, + "grad_norm": 0.76953125, + "learning_rate": 0.0001663776568374027, + "loss": 0.8608, + "step": 17565 + }, + { + "epoch": 0.45104562356267885, + "grad_norm": 0.83203125, + "learning_rate": 0.0001663743178567678, + "loss": 0.8933, + "step": 17566 + }, + { + "epoch": 0.45107130075860064, + "grad_norm": 0.75390625, + "learning_rate": 0.0001663709787438548, + "loss": 1.0395, + "step": 17567 + }, + { + "epoch": 0.45109697795452247, + "grad_norm": 0.8203125, + "learning_rate": 0.0001663676394986705, + "loss": 0.9897, + "step": 17568 + }, + { + "epoch": 0.4511226551504443, + "grad_norm": 0.8359375, + "learning_rate": 0.00016636430012122148, + "loss": 0.8714, + "step": 17569 + }, + { + "epoch": 0.4511483323463661, + "grad_norm": 0.8359375, + "learning_rate": 0.00016636096061151438, + "loss": 0.9764, + "step": 17570 + }, + { + "epoch": 0.4511740095422879, + "grad_norm": 0.91015625, + "learning_rate": 0.0001663576209695559, + "loss": 1.0549, + "step": 17571 + }, + { + "epoch": 0.45119968673820976, + "grad_norm": 0.765625, + "learning_rate": 0.00016635428119535266, + "loss": 0.9548, + "step": 17572 + }, + { + "epoch": 0.4512253639341316, + "grad_norm": 0.8046875, + "learning_rate": 0.00016635094128891135, + "loss": 0.9467, + "step": 17573 + }, + { + "epoch": 0.4512510411300534, + "grad_norm": 0.80859375, + "learning_rate": 0.0001663476012502386, + "loss": 0.9165, + "step": 17574 + }, + { + "epoch": 0.4512767183259752, + "grad_norm": 0.828125, + "learning_rate": 0.00016634426107934108, + "loss": 0.9375, + "step": 17575 + }, + { + "epoch": 0.45130239552189705, + "grad_norm": 0.76953125, + "learning_rate": 0.00016634092077622543, + "loss": 0.9016, + "step": 17576 + }, + { + "epoch": 0.45132807271781883, + "grad_norm": 0.83984375, + "learning_rate": 0.00016633758034089832, + "loss": 1.0317, + "step": 17577 + }, + { + "epoch": 0.45135374991374067, + "grad_norm": 0.796875, + "learning_rate": 0.0001663342397733664, + "loss": 0.8778, + "step": 17578 + }, + { + "epoch": 0.4513794271096625, + "grad_norm": 0.796875, + "learning_rate": 0.00016633089907363636, + "loss": 1.0491, + "step": 17579 + }, + { + "epoch": 0.4514051043055843, + "grad_norm": 0.71875, + "learning_rate": 0.00016632755824171479, + "loss": 0.9528, + "step": 17580 + }, + { + "epoch": 0.4514307815015061, + "grad_norm": 0.6953125, + "learning_rate": 0.0001663242172776084, + "loss": 0.8868, + "step": 17581 + }, + { + "epoch": 0.45145645869742795, + "grad_norm": 0.71484375, + "learning_rate": 0.00016632087618132383, + "loss": 0.8317, + "step": 17582 + }, + { + "epoch": 0.4514821358933498, + "grad_norm": 0.80078125, + "learning_rate": 0.00016631753495286778, + "loss": 0.9383, + "step": 17583 + }, + { + "epoch": 0.45150781308927157, + "grad_norm": 0.84375, + "learning_rate": 0.00016631419359224684, + "loss": 1.1327, + "step": 17584 + }, + { + "epoch": 0.4515334902851934, + "grad_norm": 0.75, + "learning_rate": 0.0001663108520994677, + "loss": 0.8643, + "step": 17585 + }, + { + "epoch": 0.45155916748111524, + "grad_norm": 0.7734375, + "learning_rate": 0.00016630751047453702, + "loss": 0.9352, + "step": 17586 + }, + { + "epoch": 0.451584844677037, + "grad_norm": 0.76171875, + "learning_rate": 0.00016630416871746148, + "loss": 0.9882, + "step": 17587 + }, + { + "epoch": 0.45161052187295886, + "grad_norm": 0.7421875, + "learning_rate": 0.00016630082682824772, + "loss": 0.948, + "step": 17588 + }, + { + "epoch": 0.4516361990688807, + "grad_norm": 0.71484375, + "learning_rate": 0.00016629748480690242, + "loss": 0.8637, + "step": 17589 + }, + { + "epoch": 0.4516618762648025, + "grad_norm": 0.79296875, + "learning_rate": 0.00016629414265343216, + "loss": 1.0284, + "step": 17590 + }, + { + "epoch": 0.4516875534607243, + "grad_norm": 0.7421875, + "learning_rate": 0.00016629080036784373, + "loss": 0.9123, + "step": 17591 + }, + { + "epoch": 0.45171323065664615, + "grad_norm": 0.76171875, + "learning_rate": 0.00016628745795014367, + "loss": 0.7366, + "step": 17592 + }, + { + "epoch": 0.451738907852568, + "grad_norm": 0.80078125, + "learning_rate": 0.00016628411540033872, + "loss": 0.9038, + "step": 17593 + }, + { + "epoch": 0.45176458504848976, + "grad_norm": 0.734375, + "learning_rate": 0.00016628077271843553, + "loss": 0.9907, + "step": 17594 + }, + { + "epoch": 0.4517902622444116, + "grad_norm": 0.73046875, + "learning_rate": 0.00016627742990444073, + "loss": 0.8683, + "step": 17595 + }, + { + "epoch": 0.45181593944033344, + "grad_norm": 0.8828125, + "learning_rate": 0.000166274086958361, + "loss": 1.0556, + "step": 17596 + }, + { + "epoch": 0.4518416166362552, + "grad_norm": 0.76953125, + "learning_rate": 0.00016627074388020298, + "loss": 0.9242, + "step": 17597 + }, + { + "epoch": 0.45186729383217705, + "grad_norm": 0.8125, + "learning_rate": 0.00016626740066997338, + "loss": 0.9072, + "step": 17598 + }, + { + "epoch": 0.4518929710280989, + "grad_norm": 0.77734375, + "learning_rate": 0.00016626405732767882, + "loss": 0.8659, + "step": 17599 + }, + { + "epoch": 0.45191864822402067, + "grad_norm": 0.828125, + "learning_rate": 0.000166260713853326, + "loss": 0.9653, + "step": 17600 + }, + { + "epoch": 0.4519443254199425, + "grad_norm": 0.84375, + "learning_rate": 0.00016625737024692153, + "loss": 0.8947, + "step": 17601 + }, + { + "epoch": 0.45197000261586434, + "grad_norm": 0.828125, + "learning_rate": 0.00016625402650847216, + "loss": 0.8347, + "step": 17602 + }, + { + "epoch": 0.4519956798117862, + "grad_norm": 0.8125, + "learning_rate": 0.00016625068263798447, + "loss": 0.999, + "step": 17603 + }, + { + "epoch": 0.45202135700770796, + "grad_norm": 0.78125, + "learning_rate": 0.00016624733863546514, + "loss": 0.8652, + "step": 17604 + }, + { + "epoch": 0.4520470342036298, + "grad_norm": 0.8359375, + "learning_rate": 0.00016624399450092085, + "loss": 0.9183, + "step": 17605 + }, + { + "epoch": 0.45207271139955163, + "grad_norm": 0.76953125, + "learning_rate": 0.00016624065023435825, + "loss": 0.9203, + "step": 17606 + }, + { + "epoch": 0.4520983885954734, + "grad_norm": 0.73828125, + "learning_rate": 0.00016623730583578404, + "loss": 0.923, + "step": 17607 + }, + { + "epoch": 0.45212406579139525, + "grad_norm": 0.7734375, + "learning_rate": 0.0001662339613052049, + "loss": 0.8849, + "step": 17608 + }, + { + "epoch": 0.4521497429873171, + "grad_norm": 0.94921875, + "learning_rate": 0.00016623061664262744, + "loss": 0.8679, + "step": 17609 + }, + { + "epoch": 0.45217542018323886, + "grad_norm": 0.765625, + "learning_rate": 0.0001662272718480583, + "loss": 0.8337, + "step": 17610 + }, + { + "epoch": 0.4522010973791607, + "grad_norm": 0.7890625, + "learning_rate": 0.0001662239269215042, + "loss": 0.8895, + "step": 17611 + }, + { + "epoch": 0.45222677457508254, + "grad_norm": 0.79296875, + "learning_rate": 0.0001662205818629718, + "loss": 1.0044, + "step": 17612 + }, + { + "epoch": 0.45225245177100437, + "grad_norm": 0.8046875, + "learning_rate": 0.00016621723667246778, + "loss": 1.0143, + "step": 17613 + }, + { + "epoch": 0.45227812896692615, + "grad_norm": 0.80859375, + "learning_rate": 0.0001662138913499988, + "loss": 0.9538, + "step": 17614 + }, + { + "epoch": 0.452303806162848, + "grad_norm": 0.71875, + "learning_rate": 0.00016621054589557147, + "loss": 0.7906, + "step": 17615 + }, + { + "epoch": 0.4523294833587698, + "grad_norm": 0.8125, + "learning_rate": 0.00016620720030919252, + "loss": 0.9591, + "step": 17616 + }, + { + "epoch": 0.4523551605546916, + "grad_norm": 0.79296875, + "learning_rate": 0.00016620385459086863, + "loss": 1.0174, + "step": 17617 + }, + { + "epoch": 0.45238083775061344, + "grad_norm": 0.84375, + "learning_rate": 0.0001662005087406064, + "loss": 1.0123, + "step": 17618 + }, + { + "epoch": 0.4524065149465353, + "grad_norm": 0.87890625, + "learning_rate": 0.00016619716275841255, + "loss": 0.9325, + "step": 17619 + }, + { + "epoch": 0.45243219214245706, + "grad_norm": 0.78125, + "learning_rate": 0.00016619381664429375, + "loss": 0.9292, + "step": 17620 + }, + { + "epoch": 0.4524578693383789, + "grad_norm": 0.71875, + "learning_rate": 0.00016619047039825665, + "loss": 0.9629, + "step": 17621 + }, + { + "epoch": 0.45248354653430073, + "grad_norm": 0.8359375, + "learning_rate": 0.0001661871240203079, + "loss": 0.919, + "step": 17622 + }, + { + "epoch": 0.45250922373022257, + "grad_norm": 0.84765625, + "learning_rate": 0.00016618377751045422, + "loss": 0.9983, + "step": 17623 + }, + { + "epoch": 0.45253490092614435, + "grad_norm": 0.75390625, + "learning_rate": 0.00016618043086870222, + "loss": 0.906, + "step": 17624 + }, + { + "epoch": 0.4525605781220662, + "grad_norm": 0.79296875, + "learning_rate": 0.0001661770840950586, + "loss": 0.8829, + "step": 17625 + }, + { + "epoch": 0.452586255317988, + "grad_norm": 0.84375, + "learning_rate": 0.00016617373718953005, + "loss": 0.9954, + "step": 17626 + }, + { + "epoch": 0.4526119325139098, + "grad_norm": 0.79296875, + "learning_rate": 0.0001661703901521232, + "loss": 0.9228, + "step": 17627 + }, + { + "epoch": 0.45263760970983163, + "grad_norm": 0.83984375, + "learning_rate": 0.00016616704298284476, + "loss": 0.9233, + "step": 17628 + }, + { + "epoch": 0.45266328690575347, + "grad_norm": 0.734375, + "learning_rate": 0.00016616369568170136, + "loss": 0.9298, + "step": 17629 + }, + { + "epoch": 0.45268896410167525, + "grad_norm": 0.8046875, + "learning_rate": 0.0001661603482486997, + "loss": 0.8046, + "step": 17630 + }, + { + "epoch": 0.4527146412975971, + "grad_norm": 0.97265625, + "learning_rate": 0.00016615700068384643, + "loss": 0.8605, + "step": 17631 + }, + { + "epoch": 0.4527403184935189, + "grad_norm": 0.8359375, + "learning_rate": 0.00016615365298714823, + "loss": 0.782, + "step": 17632 + }, + { + "epoch": 0.45276599568944076, + "grad_norm": 0.734375, + "learning_rate": 0.00016615030515861179, + "loss": 0.9847, + "step": 17633 + }, + { + "epoch": 0.45279167288536254, + "grad_norm": 0.8203125, + "learning_rate": 0.00016614695719824378, + "loss": 0.8553, + "step": 17634 + }, + { + "epoch": 0.4528173500812844, + "grad_norm": 0.828125, + "learning_rate": 0.00016614360910605086, + "loss": 0.9801, + "step": 17635 + }, + { + "epoch": 0.4528430272772062, + "grad_norm": 0.78125, + "learning_rate": 0.0001661402608820397, + "loss": 1.0304, + "step": 17636 + }, + { + "epoch": 0.452868704473128, + "grad_norm": 0.76953125, + "learning_rate": 0.00016613691252621694, + "loss": 0.9315, + "step": 17637 + }, + { + "epoch": 0.45289438166904983, + "grad_norm": 0.76171875, + "learning_rate": 0.0001661335640385893, + "loss": 0.9706, + "step": 17638 + }, + { + "epoch": 0.45292005886497166, + "grad_norm": 0.8203125, + "learning_rate": 0.00016613021541916347, + "loss": 0.9203, + "step": 17639 + }, + { + "epoch": 0.45294573606089344, + "grad_norm": 0.79296875, + "learning_rate": 0.00016612686666794607, + "loss": 0.8858, + "step": 17640 + }, + { + "epoch": 0.4529714132568153, + "grad_norm": 0.80859375, + "learning_rate": 0.00016612351778494378, + "loss": 0.9791, + "step": 17641 + }, + { + "epoch": 0.4529970904527371, + "grad_norm": 0.92578125, + "learning_rate": 0.00016612016877016333, + "loss": 0.9869, + "step": 17642 + }, + { + "epoch": 0.45302276764865895, + "grad_norm": 0.796875, + "learning_rate": 0.00016611681962361137, + "loss": 0.7944, + "step": 17643 + }, + { + "epoch": 0.45304844484458073, + "grad_norm": 0.734375, + "learning_rate": 0.0001661134703452945, + "loss": 0.9157, + "step": 17644 + }, + { + "epoch": 0.45307412204050257, + "grad_norm": 0.78125, + "learning_rate": 0.0001661101209352195, + "loss": 0.987, + "step": 17645 + }, + { + "epoch": 0.4530997992364244, + "grad_norm": 0.83203125, + "learning_rate": 0.000166106771393393, + "loss": 0.9277, + "step": 17646 + }, + { + "epoch": 0.4531254764323462, + "grad_norm": 0.80078125, + "learning_rate": 0.00016610342171982168, + "loss": 0.915, + "step": 17647 + }, + { + "epoch": 0.453151153628268, + "grad_norm": 0.78515625, + "learning_rate": 0.00016610007191451219, + "loss": 0.8768, + "step": 17648 + }, + { + "epoch": 0.45317683082418986, + "grad_norm": 0.8046875, + "learning_rate": 0.0001660967219774712, + "loss": 0.9943, + "step": 17649 + }, + { + "epoch": 0.45320250802011164, + "grad_norm": 0.765625, + "learning_rate": 0.00016609337190870546, + "loss": 0.8894, + "step": 17650 + }, + { + "epoch": 0.4532281852160335, + "grad_norm": 0.7734375, + "learning_rate": 0.0001660900217082216, + "loss": 0.9849, + "step": 17651 + }, + { + "epoch": 0.4532538624119553, + "grad_norm": 0.81640625, + "learning_rate": 0.00016608667137602627, + "loss": 0.9042, + "step": 17652 + }, + { + "epoch": 0.45327953960787715, + "grad_norm": 0.7421875, + "learning_rate": 0.0001660833209121262, + "loss": 1.0515, + "step": 17653 + }, + { + "epoch": 0.4533052168037989, + "grad_norm": 0.75390625, + "learning_rate": 0.00016607997031652802, + "loss": 0.8035, + "step": 17654 + }, + { + "epoch": 0.45333089399972076, + "grad_norm": 0.76171875, + "learning_rate": 0.00016607661958923842, + "loss": 0.9419, + "step": 17655 + }, + { + "epoch": 0.4533565711956426, + "grad_norm": 0.8359375, + "learning_rate": 0.00016607326873026412, + "loss": 0.893, + "step": 17656 + }, + { + "epoch": 0.4533822483915644, + "grad_norm": 0.7109375, + "learning_rate": 0.00016606991773961173, + "loss": 0.875, + "step": 17657 + }, + { + "epoch": 0.4534079255874862, + "grad_norm": 0.80078125, + "learning_rate": 0.00016606656661728798, + "loss": 1.0044, + "step": 17658 + }, + { + "epoch": 0.45343360278340805, + "grad_norm": 0.8203125, + "learning_rate": 0.00016606321536329955, + "loss": 0.9058, + "step": 17659 + }, + { + "epoch": 0.45345927997932983, + "grad_norm": 0.75390625, + "learning_rate": 0.00016605986397765307, + "loss": 0.8361, + "step": 17660 + }, + { + "epoch": 0.45348495717525167, + "grad_norm": 0.7734375, + "learning_rate": 0.00016605651246035525, + "loss": 0.8655, + "step": 17661 + }, + { + "epoch": 0.4535106343711735, + "grad_norm": 0.796875, + "learning_rate": 0.0001660531608114128, + "loss": 0.9589, + "step": 17662 + }, + { + "epoch": 0.45353631156709534, + "grad_norm": 0.78125, + "learning_rate": 0.00016604980903083233, + "loss": 0.9275, + "step": 17663 + }, + { + "epoch": 0.4535619887630171, + "grad_norm": 0.88671875, + "learning_rate": 0.00016604645711862056, + "loss": 0.851, + "step": 17664 + }, + { + "epoch": 0.45358766595893896, + "grad_norm": 0.86328125, + "learning_rate": 0.00016604310507478416, + "loss": 0.8973, + "step": 17665 + }, + { + "epoch": 0.4536133431548608, + "grad_norm": 0.79296875, + "learning_rate": 0.00016603975289932984, + "loss": 0.9417, + "step": 17666 + }, + { + "epoch": 0.4536390203507826, + "grad_norm": 0.765625, + "learning_rate": 0.00016603640059226427, + "loss": 0.9225, + "step": 17667 + }, + { + "epoch": 0.4536646975467044, + "grad_norm": 0.8359375, + "learning_rate": 0.0001660330481535941, + "loss": 0.937, + "step": 17668 + }, + { + "epoch": 0.45369037474262625, + "grad_norm": 0.7890625, + "learning_rate": 0.00016602969558332598, + "loss": 0.7995, + "step": 17669 + }, + { + "epoch": 0.453716051938548, + "grad_norm": 0.73828125, + "learning_rate": 0.0001660263428814667, + "loss": 0.8926, + "step": 17670 + }, + { + "epoch": 0.45374172913446986, + "grad_norm": 0.80078125, + "learning_rate": 0.00016602299004802288, + "loss": 0.9453, + "step": 17671 + }, + { + "epoch": 0.4537674063303917, + "grad_norm": 0.77734375, + "learning_rate": 0.0001660196370830012, + "loss": 0.925, + "step": 17672 + }, + { + "epoch": 0.45379308352631353, + "grad_norm": 0.8125, + "learning_rate": 0.00016601628398640834, + "loss": 0.8556, + "step": 17673 + }, + { + "epoch": 0.4538187607222353, + "grad_norm": 0.7109375, + "learning_rate": 0.00016601293075825099, + "loss": 0.6541, + "step": 17674 + }, + { + "epoch": 0.45384443791815715, + "grad_norm": 0.80859375, + "learning_rate": 0.00016600957739853583, + "loss": 0.9884, + "step": 17675 + }, + { + "epoch": 0.453870115114079, + "grad_norm": 0.82421875, + "learning_rate": 0.00016600622390726955, + "loss": 0.8685, + "step": 17676 + }, + { + "epoch": 0.45389579231000077, + "grad_norm": 0.73046875, + "learning_rate": 0.00016600287028445884, + "loss": 0.7945, + "step": 17677 + }, + { + "epoch": 0.4539214695059226, + "grad_norm": 0.76953125, + "learning_rate": 0.00016599951653011033, + "loss": 0.9562, + "step": 17678 + }, + { + "epoch": 0.45394714670184444, + "grad_norm": 0.75, + "learning_rate": 0.00016599616264423078, + "loss": 0.9194, + "step": 17679 + }, + { + "epoch": 0.4539728238977662, + "grad_norm": 0.78125, + "learning_rate": 0.00016599280862682685, + "loss": 0.9491, + "step": 17680 + }, + { + "epoch": 0.45399850109368806, + "grad_norm": 0.765625, + "learning_rate": 0.00016598945447790522, + "loss": 0.8194, + "step": 17681 + }, + { + "epoch": 0.4540241782896099, + "grad_norm": 0.76171875, + "learning_rate": 0.00016598610019747252, + "loss": 0.8114, + "step": 17682 + }, + { + "epoch": 0.45404985548553173, + "grad_norm": 0.7734375, + "learning_rate": 0.0001659827457855355, + "loss": 0.8908, + "step": 17683 + }, + { + "epoch": 0.4540755326814535, + "grad_norm": 0.8125, + "learning_rate": 0.00016597939124210085, + "loss": 0.8967, + "step": 17684 + }, + { + "epoch": 0.45410120987737534, + "grad_norm": 0.796875, + "learning_rate": 0.0001659760365671752, + "loss": 0.8304, + "step": 17685 + }, + { + "epoch": 0.4541268870732972, + "grad_norm": 0.796875, + "learning_rate": 0.0001659726817607653, + "loss": 0.897, + "step": 17686 + }, + { + "epoch": 0.45415256426921896, + "grad_norm": 0.73046875, + "learning_rate": 0.00016596932682287778, + "loss": 0.8723, + "step": 17687 + }, + { + "epoch": 0.4541782414651408, + "grad_norm": 0.80078125, + "learning_rate": 0.00016596597175351938, + "loss": 0.9007, + "step": 17688 + }, + { + "epoch": 0.45420391866106263, + "grad_norm": 0.7734375, + "learning_rate": 0.00016596261655269675, + "loss": 0.7929, + "step": 17689 + }, + { + "epoch": 0.4542295958569844, + "grad_norm": 0.796875, + "learning_rate": 0.00016595926122041656, + "loss": 0.9884, + "step": 17690 + }, + { + "epoch": 0.45425527305290625, + "grad_norm": 0.78125, + "learning_rate": 0.00016595590575668555, + "loss": 0.9064, + "step": 17691 + }, + { + "epoch": 0.4542809502488281, + "grad_norm": 0.78125, + "learning_rate": 0.00016595255016151032, + "loss": 0.7991, + "step": 17692 + }, + { + "epoch": 0.4543066274447499, + "grad_norm": 0.80859375, + "learning_rate": 0.00016594919443489769, + "loss": 0.9565, + "step": 17693 + }, + { + "epoch": 0.4543323046406717, + "grad_norm": 0.78125, + "learning_rate": 0.00016594583857685425, + "loss": 0.9329, + "step": 17694 + }, + { + "epoch": 0.45435798183659354, + "grad_norm": 0.81640625, + "learning_rate": 0.00016594248258738667, + "loss": 0.8918, + "step": 17695 + }, + { + "epoch": 0.4543836590325154, + "grad_norm": 0.8125, + "learning_rate": 0.00016593912646650172, + "loss": 0.9595, + "step": 17696 + }, + { + "epoch": 0.45440933622843716, + "grad_norm": 0.73828125, + "learning_rate": 0.00016593577021420603, + "loss": 0.8795, + "step": 17697 + }, + { + "epoch": 0.454435013424359, + "grad_norm": 0.8515625, + "learning_rate": 0.00016593241383050633, + "loss": 0.901, + "step": 17698 + }, + { + "epoch": 0.4544606906202808, + "grad_norm": 0.71484375, + "learning_rate": 0.00016592905731540924, + "loss": 0.9016, + "step": 17699 + }, + { + "epoch": 0.4544863678162026, + "grad_norm": 0.76171875, + "learning_rate": 0.00016592570066892152, + "loss": 0.8478, + "step": 17700 + }, + { + "epoch": 0.45451204501212444, + "grad_norm": 0.796875, + "learning_rate": 0.00016592234389104985, + "loss": 0.8307, + "step": 17701 + }, + { + "epoch": 0.4545377222080463, + "grad_norm": 0.76953125, + "learning_rate": 0.00016591898698180087, + "loss": 1.0105, + "step": 17702 + }, + { + "epoch": 0.4545633994039681, + "grad_norm": 0.8046875, + "learning_rate": 0.00016591562994118132, + "loss": 0.9991, + "step": 17703 + }, + { + "epoch": 0.4545890765998899, + "grad_norm": 0.77734375, + "learning_rate": 0.00016591227276919787, + "loss": 0.9578, + "step": 17704 + }, + { + "epoch": 0.45461475379581173, + "grad_norm": 0.7421875, + "learning_rate": 0.00016590891546585724, + "loss": 1.1164, + "step": 17705 + }, + { + "epoch": 0.45464043099173357, + "grad_norm": 0.80078125, + "learning_rate": 0.00016590555803116608, + "loss": 0.9447, + "step": 17706 + }, + { + "epoch": 0.45466610818765535, + "grad_norm": 0.81640625, + "learning_rate": 0.00016590220046513107, + "loss": 0.956, + "step": 17707 + }, + { + "epoch": 0.4546917853835772, + "grad_norm": 0.80859375, + "learning_rate": 0.00016589884276775894, + "loss": 0.8847, + "step": 17708 + }, + { + "epoch": 0.454717462579499, + "grad_norm": 0.8203125, + "learning_rate": 0.00016589548493905638, + "loss": 0.9818, + "step": 17709 + }, + { + "epoch": 0.4547431397754208, + "grad_norm": 0.796875, + "learning_rate": 0.00016589212697903008, + "loss": 0.8524, + "step": 17710 + }, + { + "epoch": 0.45476881697134264, + "grad_norm": 0.7578125, + "learning_rate": 0.00016588876888768672, + "loss": 0.9588, + "step": 17711 + }, + { + "epoch": 0.4547944941672645, + "grad_norm": 0.765625, + "learning_rate": 0.00016588541066503296, + "loss": 0.9097, + "step": 17712 + }, + { + "epoch": 0.4548201713631863, + "grad_norm": 0.71484375, + "learning_rate": 0.00016588205231107556, + "loss": 0.9404, + "step": 17713 + }, + { + "epoch": 0.4548458485591081, + "grad_norm": 0.8125, + "learning_rate": 0.00016587869382582118, + "loss": 0.948, + "step": 17714 + }, + { + "epoch": 0.4548715257550299, + "grad_norm": 0.7578125, + "learning_rate": 0.00016587533520927652, + "loss": 0.8005, + "step": 17715 + }, + { + "epoch": 0.45489720295095176, + "grad_norm": 0.80078125, + "learning_rate": 0.00016587197646144826, + "loss": 0.9651, + "step": 17716 + }, + { + "epoch": 0.45492288014687354, + "grad_norm": 0.77734375, + "learning_rate": 0.00016586861758234307, + "loss": 0.9887, + "step": 17717 + }, + { + "epoch": 0.4549485573427954, + "grad_norm": 0.75, + "learning_rate": 0.0001658652585719677, + "loss": 0.8343, + "step": 17718 + }, + { + "epoch": 0.4549742345387172, + "grad_norm": 0.77734375, + "learning_rate": 0.00016586189943032883, + "loss": 0.9223, + "step": 17719 + }, + { + "epoch": 0.454999911734639, + "grad_norm": 0.83203125, + "learning_rate": 0.00016585854015743314, + "loss": 0.981, + "step": 17720 + }, + { + "epoch": 0.45502558893056083, + "grad_norm": 0.8125, + "learning_rate": 0.00016585518075328733, + "loss": 0.9646, + "step": 17721 + }, + { + "epoch": 0.45505126612648267, + "grad_norm": 0.8046875, + "learning_rate": 0.00016585182121789805, + "loss": 1.1557, + "step": 17722 + }, + { + "epoch": 0.4550769433224045, + "grad_norm": 0.8515625, + "learning_rate": 0.0001658484615512721, + "loss": 0.9063, + "step": 17723 + }, + { + "epoch": 0.4551026205183263, + "grad_norm": 0.78515625, + "learning_rate": 0.00016584510175341607, + "loss": 0.9715, + "step": 17724 + }, + { + "epoch": 0.4551282977142481, + "grad_norm": 0.74609375, + "learning_rate": 0.00016584174182433673, + "loss": 0.7553, + "step": 17725 + }, + { + "epoch": 0.45515397491016996, + "grad_norm": 0.8359375, + "learning_rate": 0.00016583838176404073, + "loss": 0.9823, + "step": 17726 + }, + { + "epoch": 0.45517965210609174, + "grad_norm": 0.94140625, + "learning_rate": 0.00016583502157253477, + "loss": 0.9152, + "step": 17727 + }, + { + "epoch": 0.4552053293020136, + "grad_norm": 0.78125, + "learning_rate": 0.00016583166124982558, + "loss": 1.0028, + "step": 17728 + }, + { + "epoch": 0.4552310064979354, + "grad_norm": 0.82421875, + "learning_rate": 0.00016582830079591985, + "loss": 0.8479, + "step": 17729 + }, + { + "epoch": 0.4552566836938572, + "grad_norm": 0.828125, + "learning_rate": 0.00016582494021082425, + "loss": 0.9156, + "step": 17730 + }, + { + "epoch": 0.455282360889779, + "grad_norm": 0.71484375, + "learning_rate": 0.00016582157949454547, + "loss": 0.7595, + "step": 17731 + }, + { + "epoch": 0.45530803808570086, + "grad_norm": 0.81640625, + "learning_rate": 0.00016581821864709024, + "loss": 0.7455, + "step": 17732 + }, + { + "epoch": 0.4553337152816227, + "grad_norm": 0.75, + "learning_rate": 0.00016581485766846527, + "loss": 1.0259, + "step": 17733 + }, + { + "epoch": 0.4553593924775445, + "grad_norm": 0.83203125, + "learning_rate": 0.0001658114965586772, + "loss": 1.0352, + "step": 17734 + }, + { + "epoch": 0.4553850696734663, + "grad_norm": 0.6796875, + "learning_rate": 0.00016580813531773278, + "loss": 0.946, + "step": 17735 + }, + { + "epoch": 0.45541074686938815, + "grad_norm": 0.71484375, + "learning_rate": 0.00016580477394563867, + "loss": 0.8418, + "step": 17736 + }, + { + "epoch": 0.45543642406530993, + "grad_norm": 0.71875, + "learning_rate": 0.0001658014124424016, + "loss": 0.7802, + "step": 17737 + }, + { + "epoch": 0.45546210126123177, + "grad_norm": 0.72265625, + "learning_rate": 0.00016579805080802826, + "loss": 0.892, + "step": 17738 + }, + { + "epoch": 0.4554877784571536, + "grad_norm": 0.79296875, + "learning_rate": 0.00016579468904252538, + "loss": 0.975, + "step": 17739 + }, + { + "epoch": 0.4555134556530754, + "grad_norm": 0.74609375, + "learning_rate": 0.0001657913271458996, + "loss": 0.8112, + "step": 17740 + }, + { + "epoch": 0.4555391328489972, + "grad_norm": 0.8046875, + "learning_rate": 0.00016578796511815765, + "loss": 0.935, + "step": 17741 + }, + { + "epoch": 0.45556481004491906, + "grad_norm": 0.75, + "learning_rate": 0.0001657846029593062, + "loss": 0.8378, + "step": 17742 + }, + { + "epoch": 0.4555904872408409, + "grad_norm": 0.7421875, + "learning_rate": 0.000165781240669352, + "loss": 0.864, + "step": 17743 + }, + { + "epoch": 0.45561616443676267, + "grad_norm": 0.78125, + "learning_rate": 0.00016577787824830176, + "loss": 0.8278, + "step": 17744 + }, + { + "epoch": 0.4556418416326845, + "grad_norm": 0.7734375, + "learning_rate": 0.00016577451569616209, + "loss": 0.9473, + "step": 17745 + }, + { + "epoch": 0.45566751882860634, + "grad_norm": 0.80859375, + "learning_rate": 0.00016577115301293983, + "loss": 0.9229, + "step": 17746 + }, + { + "epoch": 0.4556931960245281, + "grad_norm": 0.796875, + "learning_rate": 0.00016576779019864156, + "loss": 1.0282, + "step": 17747 + }, + { + "epoch": 0.45571887322044996, + "grad_norm": 0.89453125, + "learning_rate": 0.000165764427253274, + "loss": 0.8987, + "step": 17748 + }, + { + "epoch": 0.4557445504163718, + "grad_norm": 0.84375, + "learning_rate": 0.0001657610641768439, + "loss": 1.0006, + "step": 17749 + }, + { + "epoch": 0.4557702276122936, + "grad_norm": 0.75390625, + "learning_rate": 0.0001657577009693579, + "loss": 0.9168, + "step": 17750 + }, + { + "epoch": 0.4557959048082154, + "grad_norm": 0.6875, + "learning_rate": 0.00016575433763082282, + "loss": 0.8734, + "step": 17751 + }, + { + "epoch": 0.45582158200413725, + "grad_norm": 0.74609375, + "learning_rate": 0.00016575097416124523, + "loss": 0.8956, + "step": 17752 + }, + { + "epoch": 0.4558472592000591, + "grad_norm": 0.8125, + "learning_rate": 0.00016574761056063192, + "loss": 0.8244, + "step": 17753 + }, + { + "epoch": 0.45587293639598087, + "grad_norm": 0.78125, + "learning_rate": 0.00016574424682898954, + "loss": 0.8471, + "step": 17754 + }, + { + "epoch": 0.4558986135919027, + "grad_norm": 0.765625, + "learning_rate": 0.0001657408829663248, + "loss": 0.874, + "step": 17755 + }, + { + "epoch": 0.45592429078782454, + "grad_norm": 0.79296875, + "learning_rate": 0.00016573751897264446, + "loss": 1.1302, + "step": 17756 + }, + { + "epoch": 0.4559499679837463, + "grad_norm": 0.76171875, + "learning_rate": 0.00016573415484795516, + "loss": 0.9249, + "step": 17757 + }, + { + "epoch": 0.45597564517966815, + "grad_norm": 0.84765625, + "learning_rate": 0.0001657307905922636, + "loss": 1.1425, + "step": 17758 + }, + { + "epoch": 0.45600132237559, + "grad_norm": 0.76953125, + "learning_rate": 0.00016572742620557656, + "loss": 0.8852, + "step": 17759 + }, + { + "epoch": 0.45602699957151177, + "grad_norm": 0.84765625, + "learning_rate": 0.00016572406168790068, + "loss": 0.921, + "step": 17760 + }, + { + "epoch": 0.4560526767674336, + "grad_norm": 0.69921875, + "learning_rate": 0.00016572069703924268, + "loss": 0.8515, + "step": 17761 + }, + { + "epoch": 0.45607835396335544, + "grad_norm": 0.71875, + "learning_rate": 0.00016571733225960927, + "loss": 0.8323, + "step": 17762 + }, + { + "epoch": 0.4561040311592772, + "grad_norm": 0.7734375, + "learning_rate": 0.00016571396734900716, + "loss": 0.897, + "step": 17763 + }, + { + "epoch": 0.45612970835519906, + "grad_norm": 0.7421875, + "learning_rate": 0.00016571060230744303, + "loss": 0.9406, + "step": 17764 + }, + { + "epoch": 0.4561553855511209, + "grad_norm": 0.875, + "learning_rate": 0.00016570723713492359, + "loss": 0.889, + "step": 17765 + }, + { + "epoch": 0.45618106274704273, + "grad_norm": 0.73828125, + "learning_rate": 0.0001657038718314556, + "loss": 0.8834, + "step": 17766 + }, + { + "epoch": 0.4562067399429645, + "grad_norm": 0.73046875, + "learning_rate": 0.0001657005063970457, + "loss": 0.7988, + "step": 17767 + }, + { + "epoch": 0.45623241713888635, + "grad_norm": 0.7890625, + "learning_rate": 0.00016569714083170065, + "loss": 0.8573, + "step": 17768 + }, + { + "epoch": 0.4562580943348082, + "grad_norm": 0.8671875, + "learning_rate": 0.00016569377513542713, + "loss": 0.9912, + "step": 17769 + }, + { + "epoch": 0.45628377153072996, + "grad_norm": 0.6953125, + "learning_rate": 0.00016569040930823183, + "loss": 0.8179, + "step": 17770 + }, + { + "epoch": 0.4563094487266518, + "grad_norm": 0.73828125, + "learning_rate": 0.00016568704335012148, + "loss": 0.9266, + "step": 17771 + }, + { + "epoch": 0.45633512592257364, + "grad_norm": 0.796875, + "learning_rate": 0.00016568367726110282, + "loss": 0.9526, + "step": 17772 + }, + { + "epoch": 0.4563608031184954, + "grad_norm": 0.76953125, + "learning_rate": 0.0001656803110411825, + "loss": 0.9393, + "step": 17773 + }, + { + "epoch": 0.45638648031441725, + "grad_norm": 0.84765625, + "learning_rate": 0.00016567694469036726, + "loss": 0.8629, + "step": 17774 + }, + { + "epoch": 0.4564121575103391, + "grad_norm": 0.78515625, + "learning_rate": 0.00016567357820866375, + "loss": 0.9915, + "step": 17775 + }, + { + "epoch": 0.4564378347062609, + "grad_norm": 0.8359375, + "learning_rate": 0.00016567021159607877, + "loss": 0.8915, + "step": 17776 + }, + { + "epoch": 0.4564635119021827, + "grad_norm": 0.74609375, + "learning_rate": 0.00016566684485261904, + "loss": 0.9645, + "step": 17777 + }, + { + "epoch": 0.45648918909810454, + "grad_norm": 0.80078125, + "learning_rate": 0.00016566347797829117, + "loss": 0.7892, + "step": 17778 + }, + { + "epoch": 0.4565148662940264, + "grad_norm": 0.74609375, + "learning_rate": 0.00016566011097310192, + "loss": 0.8049, + "step": 17779 + }, + { + "epoch": 0.45654054348994816, + "grad_norm": 0.796875, + "learning_rate": 0.000165656743837058, + "loss": 0.8232, + "step": 17780 + }, + { + "epoch": 0.45656622068587, + "grad_norm": 0.76953125, + "learning_rate": 0.00016565337657016612, + "loss": 0.8457, + "step": 17781 + }, + { + "epoch": 0.45659189788179183, + "grad_norm": 0.8125, + "learning_rate": 0.00016565000917243298, + "loss": 0.8451, + "step": 17782 + }, + { + "epoch": 0.4566175750777136, + "grad_norm": 0.7421875, + "learning_rate": 0.00016564664164386535, + "loss": 0.7312, + "step": 17783 + }, + { + "epoch": 0.45664325227363545, + "grad_norm": 0.78515625, + "learning_rate": 0.00016564327398446986, + "loss": 0.8069, + "step": 17784 + }, + { + "epoch": 0.4566689294695573, + "grad_norm": 0.7734375, + "learning_rate": 0.00016563990619425324, + "loss": 0.8017, + "step": 17785 + }, + { + "epoch": 0.4566946066654791, + "grad_norm": 0.77734375, + "learning_rate": 0.00016563653827322226, + "loss": 0.8624, + "step": 17786 + }, + { + "epoch": 0.4567202838614009, + "grad_norm": 0.76171875, + "learning_rate": 0.00016563317022138353, + "loss": 0.9189, + "step": 17787 + }, + { + "epoch": 0.45674596105732274, + "grad_norm": 0.75, + "learning_rate": 0.00016562980203874386, + "loss": 0.8243, + "step": 17788 + }, + { + "epoch": 0.45677163825324457, + "grad_norm": 0.80078125, + "learning_rate": 0.00016562643372530992, + "loss": 0.8819, + "step": 17789 + }, + { + "epoch": 0.45679731544916635, + "grad_norm": 0.8046875, + "learning_rate": 0.0001656230652810884, + "loss": 0.87, + "step": 17790 + }, + { + "epoch": 0.4568229926450882, + "grad_norm": 0.8046875, + "learning_rate": 0.0001656196967060861, + "loss": 0.9404, + "step": 17791 + }, + { + "epoch": 0.45684866984101, + "grad_norm": 0.84765625, + "learning_rate": 0.0001656163280003096, + "loss": 0.8973, + "step": 17792 + }, + { + "epoch": 0.4568743470369318, + "grad_norm": 0.8203125, + "learning_rate": 0.0001656129591637657, + "loss": 0.8742, + "step": 17793 + }, + { + "epoch": 0.45690002423285364, + "grad_norm": 0.734375, + "learning_rate": 0.0001656095901964611, + "loss": 0.7816, + "step": 17794 + }, + { + "epoch": 0.4569257014287755, + "grad_norm": 0.71484375, + "learning_rate": 0.00016560622109840255, + "loss": 0.882, + "step": 17795 + }, + { + "epoch": 0.4569513786246973, + "grad_norm": 0.7890625, + "learning_rate": 0.00016560285186959672, + "loss": 0.8807, + "step": 17796 + }, + { + "epoch": 0.4569770558206191, + "grad_norm": 0.6875, + "learning_rate": 0.0001655994825100503, + "loss": 0.8162, + "step": 17797 + }, + { + "epoch": 0.45700273301654093, + "grad_norm": 0.71875, + "learning_rate": 0.00016559611301977006, + "loss": 0.9372, + "step": 17798 + }, + { + "epoch": 0.45702841021246277, + "grad_norm": 0.7890625, + "learning_rate": 0.00016559274339876268, + "loss": 0.881, + "step": 17799 + }, + { + "epoch": 0.45705408740838455, + "grad_norm": 0.76171875, + "learning_rate": 0.00016558937364703492, + "loss": 0.8379, + "step": 17800 + }, + { + "epoch": 0.4570797646043064, + "grad_norm": 0.7421875, + "learning_rate": 0.0001655860037645934, + "loss": 0.8602, + "step": 17801 + }, + { + "epoch": 0.4571054418002282, + "grad_norm": 0.73828125, + "learning_rate": 0.00016558263375144496, + "loss": 0.8115, + "step": 17802 + }, + { + "epoch": 0.45713111899615, + "grad_norm": 0.890625, + "learning_rate": 0.0001655792636075962, + "loss": 0.8122, + "step": 17803 + }, + { + "epoch": 0.45715679619207183, + "grad_norm": 0.69140625, + "learning_rate": 0.00016557589333305393, + "loss": 0.9976, + "step": 17804 + }, + { + "epoch": 0.45718247338799367, + "grad_norm": 0.76953125, + "learning_rate": 0.00016557252292782483, + "loss": 0.9123, + "step": 17805 + }, + { + "epoch": 0.4572081505839155, + "grad_norm": 0.73828125, + "learning_rate": 0.00016556915239191561, + "loss": 0.8006, + "step": 17806 + }, + { + "epoch": 0.4572338277798373, + "grad_norm": 0.73828125, + "learning_rate": 0.00016556578172533296, + "loss": 0.8327, + "step": 17807 + }, + { + "epoch": 0.4572595049757591, + "grad_norm": 0.74609375, + "learning_rate": 0.0001655624109280837, + "loss": 0.8119, + "step": 17808 + }, + { + "epoch": 0.45728518217168096, + "grad_norm": 0.74609375, + "learning_rate": 0.00016555904000017442, + "loss": 0.7957, + "step": 17809 + }, + { + "epoch": 0.45731085936760274, + "grad_norm": 0.83984375, + "learning_rate": 0.00016555566894161188, + "loss": 0.9482, + "step": 17810 + }, + { + "epoch": 0.4573365365635246, + "grad_norm": 0.73828125, + "learning_rate": 0.00016555229775240286, + "loss": 1.055, + "step": 17811 + }, + { + "epoch": 0.4573622137594464, + "grad_norm": 0.68359375, + "learning_rate": 0.00016554892643255402, + "loss": 0.8187, + "step": 17812 + }, + { + "epoch": 0.4573878909553682, + "grad_norm": 0.734375, + "learning_rate": 0.0001655455549820721, + "loss": 0.9121, + "step": 17813 + }, + { + "epoch": 0.45741356815129003, + "grad_norm": 0.765625, + "learning_rate": 0.0001655421834009638, + "loss": 0.9385, + "step": 17814 + }, + { + "epoch": 0.45743924534721186, + "grad_norm": 0.76171875, + "learning_rate": 0.00016553881168923583, + "loss": 0.8473, + "step": 17815 + }, + { + "epoch": 0.4574649225431337, + "grad_norm": 0.74609375, + "learning_rate": 0.00016553543984689496, + "loss": 0.9228, + "step": 17816 + }, + { + "epoch": 0.4574905997390555, + "grad_norm": 0.7265625, + "learning_rate": 0.0001655320678739478, + "loss": 0.9116, + "step": 17817 + }, + { + "epoch": 0.4575162769349773, + "grad_norm": 0.79296875, + "learning_rate": 0.00016552869577040125, + "loss": 0.845, + "step": 17818 + }, + { + "epoch": 0.45754195413089915, + "grad_norm": 0.84765625, + "learning_rate": 0.00016552532353626188, + "loss": 0.9203, + "step": 17819 + }, + { + "epoch": 0.45756763132682093, + "grad_norm": 0.76953125, + "learning_rate": 0.00016552195117153646, + "loss": 0.8943, + "step": 17820 + }, + { + "epoch": 0.45759330852274277, + "grad_norm": 0.7421875, + "learning_rate": 0.0001655185786762317, + "loss": 0.8085, + "step": 17821 + }, + { + "epoch": 0.4576189857186646, + "grad_norm": 0.796875, + "learning_rate": 0.00016551520605035435, + "loss": 0.9294, + "step": 17822 + }, + { + "epoch": 0.4576446629145864, + "grad_norm": 0.79296875, + "learning_rate": 0.00016551183329391114, + "loss": 0.9603, + "step": 17823 + }, + { + "epoch": 0.4576703401105082, + "grad_norm": 0.8828125, + "learning_rate": 0.0001655084604069087, + "loss": 0.8494, + "step": 17824 + }, + { + "epoch": 0.45769601730643006, + "grad_norm": 0.7578125, + "learning_rate": 0.00016550508738935387, + "loss": 1.0737, + "step": 17825 + }, + { + "epoch": 0.4577216945023519, + "grad_norm": 0.75, + "learning_rate": 0.0001655017142412533, + "loss": 0.8271, + "step": 17826 + }, + { + "epoch": 0.4577473716982737, + "grad_norm": 0.765625, + "learning_rate": 0.00016549834096261373, + "loss": 0.9228, + "step": 17827 + }, + { + "epoch": 0.4577730488941955, + "grad_norm": 0.80859375, + "learning_rate": 0.00016549496755344185, + "loss": 0.8701, + "step": 17828 + }, + { + "epoch": 0.45779872609011735, + "grad_norm": 0.81640625, + "learning_rate": 0.00016549159401374445, + "loss": 0.9038, + "step": 17829 + }, + { + "epoch": 0.4578244032860391, + "grad_norm": 0.79296875, + "learning_rate": 0.0001654882203435282, + "loss": 0.9453, + "step": 17830 + }, + { + "epoch": 0.45785008048196096, + "grad_norm": 0.75, + "learning_rate": 0.00016548484654279987, + "loss": 0.9036, + "step": 17831 + }, + { + "epoch": 0.4578757576778828, + "grad_norm": 0.76171875, + "learning_rate": 0.00016548147261156616, + "loss": 0.9599, + "step": 17832 + }, + { + "epoch": 0.4579014348738046, + "grad_norm": 0.7578125, + "learning_rate": 0.00016547809854983378, + "loss": 0.9144, + "step": 17833 + }, + { + "epoch": 0.4579271120697264, + "grad_norm": 0.74609375, + "learning_rate": 0.00016547472435760944, + "loss": 0.9466, + "step": 17834 + }, + { + "epoch": 0.45795278926564825, + "grad_norm": 0.7265625, + "learning_rate": 0.0001654713500348999, + "loss": 0.9029, + "step": 17835 + }, + { + "epoch": 0.4579784664615701, + "grad_norm": 0.78515625, + "learning_rate": 0.0001654679755817119, + "loss": 0.9025, + "step": 17836 + }, + { + "epoch": 0.45800414365749187, + "grad_norm": 0.80859375, + "learning_rate": 0.0001654646009980521, + "loss": 0.7985, + "step": 17837 + }, + { + "epoch": 0.4580298208534137, + "grad_norm": 0.83984375, + "learning_rate": 0.00016546122628392727, + "loss": 0.9154, + "step": 17838 + }, + { + "epoch": 0.45805549804933554, + "grad_norm": 0.765625, + "learning_rate": 0.00016545785143934417, + "loss": 0.9206, + "step": 17839 + }, + { + "epoch": 0.4580811752452573, + "grad_norm": 0.796875, + "learning_rate": 0.00016545447646430943, + "loss": 0.8886, + "step": 17840 + }, + { + "epoch": 0.45810685244117916, + "grad_norm": 0.76953125, + "learning_rate": 0.00016545110135882986, + "loss": 0.9246, + "step": 17841 + }, + { + "epoch": 0.458132529637101, + "grad_norm": 0.79296875, + "learning_rate": 0.00016544772612291214, + "loss": 0.9432, + "step": 17842 + }, + { + "epoch": 0.4581582068330228, + "grad_norm": 0.78125, + "learning_rate": 0.000165444350756563, + "loss": 0.9395, + "step": 17843 + }, + { + "epoch": 0.4581838840289446, + "grad_norm": 0.7734375, + "learning_rate": 0.00016544097525978923, + "loss": 0.9817, + "step": 17844 + }, + { + "epoch": 0.45820956122486645, + "grad_norm": 0.79296875, + "learning_rate": 0.0001654375996325975, + "loss": 0.8572, + "step": 17845 + }, + { + "epoch": 0.4582352384207883, + "grad_norm": 0.80078125, + "learning_rate": 0.00016543422387499455, + "loss": 0.9621, + "step": 17846 + }, + { + "epoch": 0.45826091561671006, + "grad_norm": 0.7421875, + "learning_rate": 0.00016543084798698705, + "loss": 0.9254, + "step": 17847 + }, + { + "epoch": 0.4582865928126319, + "grad_norm": 0.76953125, + "learning_rate": 0.0001654274719685818, + "loss": 0.9293, + "step": 17848 + }, + { + "epoch": 0.45831227000855373, + "grad_norm": 0.79296875, + "learning_rate": 0.00016542409581978554, + "loss": 1.0537, + "step": 17849 + }, + { + "epoch": 0.4583379472044755, + "grad_norm": 0.734375, + "learning_rate": 0.00016542071954060494, + "loss": 0.9566, + "step": 17850 + }, + { + "epoch": 0.45836362440039735, + "grad_norm": 0.75390625, + "learning_rate": 0.00016541734313104677, + "loss": 0.8668, + "step": 17851 + }, + { + "epoch": 0.4583893015963192, + "grad_norm": 0.7890625, + "learning_rate": 0.00016541396659111774, + "loss": 0.8406, + "step": 17852 + }, + { + "epoch": 0.45841497879224097, + "grad_norm": 0.8671875, + "learning_rate": 0.0001654105899208246, + "loss": 0.9677, + "step": 17853 + }, + { + "epoch": 0.4584406559881628, + "grad_norm": 0.78515625, + "learning_rate": 0.00016540721312017403, + "loss": 0.9777, + "step": 17854 + }, + { + "epoch": 0.45846633318408464, + "grad_norm": 0.8046875, + "learning_rate": 0.00016540383618917278, + "loss": 0.7919, + "step": 17855 + }, + { + "epoch": 0.4584920103800065, + "grad_norm": 0.77734375, + "learning_rate": 0.00016540045912782762, + "loss": 0.956, + "step": 17856 + }, + { + "epoch": 0.45851768757592826, + "grad_norm": 0.875, + "learning_rate": 0.00016539708193614528, + "loss": 0.9353, + "step": 17857 + }, + { + "epoch": 0.4585433647718501, + "grad_norm": 0.79296875, + "learning_rate": 0.00016539370461413243, + "loss": 0.9994, + "step": 17858 + }, + { + "epoch": 0.45856904196777193, + "grad_norm": 0.73046875, + "learning_rate": 0.00016539032716179582, + "loss": 0.8414, + "step": 17859 + }, + { + "epoch": 0.4585947191636937, + "grad_norm": 0.85546875, + "learning_rate": 0.00016538694957914224, + "loss": 1.029, + "step": 17860 + }, + { + "epoch": 0.45862039635961555, + "grad_norm": 0.859375, + "learning_rate": 0.00016538357186617834, + "loss": 0.8725, + "step": 17861 + }, + { + "epoch": 0.4586460735555374, + "grad_norm": 0.80078125, + "learning_rate": 0.0001653801940229109, + "loss": 1.0525, + "step": 17862 + }, + { + "epoch": 0.45867175075145916, + "grad_norm": 0.734375, + "learning_rate": 0.0001653768160493466, + "loss": 0.8923, + "step": 17863 + }, + { + "epoch": 0.458697427947381, + "grad_norm": 0.79296875, + "learning_rate": 0.00016537343794549228, + "loss": 0.8077, + "step": 17864 + }, + { + "epoch": 0.45872310514330283, + "grad_norm": 0.75, + "learning_rate": 0.00016537005971135456, + "loss": 0.7582, + "step": 17865 + }, + { + "epoch": 0.45874878233922467, + "grad_norm": 0.80859375, + "learning_rate": 0.00016536668134694023, + "loss": 1.0169, + "step": 17866 + }, + { + "epoch": 0.45877445953514645, + "grad_norm": 0.76171875, + "learning_rate": 0.000165363302852256, + "loss": 0.7511, + "step": 17867 + }, + { + "epoch": 0.4588001367310683, + "grad_norm": 0.73828125, + "learning_rate": 0.0001653599242273086, + "loss": 0.701, + "step": 17868 + }, + { + "epoch": 0.4588258139269901, + "grad_norm": 0.765625, + "learning_rate": 0.0001653565454721048, + "loss": 1.0287, + "step": 17869 + }, + { + "epoch": 0.4588514911229119, + "grad_norm": 0.78125, + "learning_rate": 0.00016535316658665127, + "loss": 0.8653, + "step": 17870 + }, + { + "epoch": 0.45887716831883374, + "grad_norm": 0.8203125, + "learning_rate": 0.00016534978757095484, + "loss": 1.0869, + "step": 17871 + }, + { + "epoch": 0.4589028455147556, + "grad_norm": 0.74609375, + "learning_rate": 0.00016534640842502215, + "loss": 0.8168, + "step": 17872 + }, + { + "epoch": 0.45892852271067736, + "grad_norm": 0.91796875, + "learning_rate": 0.00016534302914885998, + "loss": 1.047, + "step": 17873 + }, + { + "epoch": 0.4589541999065992, + "grad_norm": 0.78515625, + "learning_rate": 0.00016533964974247504, + "loss": 0.9826, + "step": 17874 + }, + { + "epoch": 0.45897987710252103, + "grad_norm": 0.8828125, + "learning_rate": 0.0001653362702058741, + "loss": 0.8321, + "step": 17875 + }, + { + "epoch": 0.45900555429844286, + "grad_norm": 0.75390625, + "learning_rate": 0.00016533289053906387, + "loss": 1.1268, + "step": 17876 + }, + { + "epoch": 0.45903123149436464, + "grad_norm": 0.83984375, + "learning_rate": 0.00016532951074205108, + "loss": 0.9456, + "step": 17877 + }, + { + "epoch": 0.4590569086902865, + "grad_norm": 0.890625, + "learning_rate": 0.0001653261308148425, + "loss": 0.9215, + "step": 17878 + }, + { + "epoch": 0.4590825858862083, + "grad_norm": 0.76171875, + "learning_rate": 0.00016532275075744482, + "loss": 0.9854, + "step": 17879 + }, + { + "epoch": 0.4591082630821301, + "grad_norm": 0.8515625, + "learning_rate": 0.00016531937056986479, + "loss": 0.9874, + "step": 17880 + }, + { + "epoch": 0.45913394027805193, + "grad_norm": 0.7109375, + "learning_rate": 0.00016531599025210918, + "loss": 0.804, + "step": 17881 + }, + { + "epoch": 0.45915961747397377, + "grad_norm": 0.81640625, + "learning_rate": 0.0001653126098041847, + "loss": 0.7791, + "step": 17882 + }, + { + "epoch": 0.45918529466989555, + "grad_norm": 0.8046875, + "learning_rate": 0.00016530922922609808, + "loss": 0.9407, + "step": 17883 + }, + { + "epoch": 0.4592109718658174, + "grad_norm": 0.78125, + "learning_rate": 0.00016530584851785605, + "loss": 0.7851, + "step": 17884 + }, + { + "epoch": 0.4592366490617392, + "grad_norm": 0.8203125, + "learning_rate": 0.0001653024676794654, + "loss": 0.9533, + "step": 17885 + }, + { + "epoch": 0.45926232625766106, + "grad_norm": 0.76953125, + "learning_rate": 0.0001652990867109328, + "loss": 0.9941, + "step": 17886 + }, + { + "epoch": 0.45928800345358284, + "grad_norm": 0.80078125, + "learning_rate": 0.00016529570561226505, + "loss": 0.9576, + "step": 17887 + }, + { + "epoch": 0.4593136806495047, + "grad_norm": 0.76171875, + "learning_rate": 0.00016529232438346883, + "loss": 0.8498, + "step": 17888 + }, + { + "epoch": 0.4593393578454265, + "grad_norm": 0.81640625, + "learning_rate": 0.0001652889430245509, + "loss": 0.9769, + "step": 17889 + }, + { + "epoch": 0.4593650350413483, + "grad_norm": 0.77734375, + "learning_rate": 0.000165285561535518, + "loss": 0.9531, + "step": 17890 + }, + { + "epoch": 0.4593907122372701, + "grad_norm": 0.8671875, + "learning_rate": 0.0001652821799163769, + "loss": 0.9977, + "step": 17891 + }, + { + "epoch": 0.45941638943319196, + "grad_norm": 0.91796875, + "learning_rate": 0.00016527879816713429, + "loss": 0.7913, + "step": 17892 + }, + { + "epoch": 0.45944206662911374, + "grad_norm": 0.71484375, + "learning_rate": 0.00016527541628779695, + "loss": 1.0295, + "step": 17893 + }, + { + "epoch": 0.4594677438250356, + "grad_norm": 0.71484375, + "learning_rate": 0.0001652720342783716, + "loss": 0.8985, + "step": 17894 + }, + { + "epoch": 0.4594934210209574, + "grad_norm": 0.76171875, + "learning_rate": 0.00016526865213886495, + "loss": 0.8889, + "step": 17895 + }, + { + "epoch": 0.45951909821687925, + "grad_norm": 0.7421875, + "learning_rate": 0.00016526526986928382, + "loss": 0.9138, + "step": 17896 + }, + { + "epoch": 0.45954477541280103, + "grad_norm": 0.8046875, + "learning_rate": 0.00016526188746963488, + "loss": 1.0414, + "step": 17897 + }, + { + "epoch": 0.45957045260872287, + "grad_norm": 0.7109375, + "learning_rate": 0.00016525850493992488, + "loss": 0.9058, + "step": 17898 + }, + { + "epoch": 0.4595961298046447, + "grad_norm": 0.77734375, + "learning_rate": 0.0001652551222801606, + "loss": 0.8017, + "step": 17899 + }, + { + "epoch": 0.4596218070005665, + "grad_norm": 0.74609375, + "learning_rate": 0.00016525173949034872, + "loss": 0.7997, + "step": 17900 + }, + { + "epoch": 0.4596474841964883, + "grad_norm": 0.81640625, + "learning_rate": 0.00016524835657049603, + "loss": 1.0064, + "step": 17901 + }, + { + "epoch": 0.45967316139241016, + "grad_norm": 0.74609375, + "learning_rate": 0.00016524497352060928, + "loss": 0.7831, + "step": 17902 + }, + { + "epoch": 0.45969883858833194, + "grad_norm": 0.70703125, + "learning_rate": 0.0001652415903406952, + "loss": 0.8358, + "step": 17903 + }, + { + "epoch": 0.4597245157842538, + "grad_norm": 0.828125, + "learning_rate": 0.0001652382070307605, + "loss": 0.8608, + "step": 17904 + }, + { + "epoch": 0.4597501929801756, + "grad_norm": 0.86328125, + "learning_rate": 0.00016523482359081197, + "loss": 1.036, + "step": 17905 + }, + { + "epoch": 0.45977587017609745, + "grad_norm": 0.79296875, + "learning_rate": 0.0001652314400208563, + "loss": 0.8436, + "step": 17906 + }, + { + "epoch": 0.4598015473720192, + "grad_norm": 0.8203125, + "learning_rate": 0.00016522805632090027, + "loss": 0.7545, + "step": 17907 + }, + { + "epoch": 0.45982722456794106, + "grad_norm": 0.77734375, + "learning_rate": 0.00016522467249095062, + "loss": 0.8115, + "step": 17908 + }, + { + "epoch": 0.4598529017638629, + "grad_norm": 0.75, + "learning_rate": 0.00016522128853101409, + "loss": 0.893, + "step": 17909 + }, + { + "epoch": 0.4598785789597847, + "grad_norm": 0.7421875, + "learning_rate": 0.00016521790444109743, + "loss": 0.7963, + "step": 17910 + }, + { + "epoch": 0.4599042561557065, + "grad_norm": 0.91015625, + "learning_rate": 0.00016521452022120735, + "loss": 0.9971, + "step": 17911 + }, + { + "epoch": 0.45992993335162835, + "grad_norm": 0.78125, + "learning_rate": 0.00016521113587135062, + "loss": 0.9178, + "step": 17912 + }, + { + "epoch": 0.45995561054755013, + "grad_norm": 0.7421875, + "learning_rate": 0.00016520775139153404, + "loss": 0.7967, + "step": 17913 + }, + { + "epoch": 0.45998128774347197, + "grad_norm": 0.73046875, + "learning_rate": 0.00016520436678176426, + "loss": 0.8306, + "step": 17914 + }, + { + "epoch": 0.4600069649393938, + "grad_norm": 0.76171875, + "learning_rate": 0.00016520098204204805, + "loss": 0.8974, + "step": 17915 + }, + { + "epoch": 0.46003264213531564, + "grad_norm": 0.74609375, + "learning_rate": 0.0001651975971723922, + "loss": 0.957, + "step": 17916 + }, + { + "epoch": 0.4600583193312374, + "grad_norm": 0.82421875, + "learning_rate": 0.00016519421217280343, + "loss": 0.8568, + "step": 17917 + }, + { + "epoch": 0.46008399652715926, + "grad_norm": 0.78125, + "learning_rate": 0.0001651908270432885, + "loss": 0.9001, + "step": 17918 + }, + { + "epoch": 0.4601096737230811, + "grad_norm": 0.77734375, + "learning_rate": 0.0001651874417838541, + "loss": 0.7582, + "step": 17919 + }, + { + "epoch": 0.4601353509190029, + "grad_norm": 0.78515625, + "learning_rate": 0.00016518405639450703, + "loss": 0.9293, + "step": 17920 + }, + { + "epoch": 0.4601610281149247, + "grad_norm": 0.7578125, + "learning_rate": 0.00016518067087525405, + "loss": 0.719, + "step": 17921 + }, + { + "epoch": 0.46018670531084654, + "grad_norm": 0.80078125, + "learning_rate": 0.00016517728522610185, + "loss": 0.8259, + "step": 17922 + }, + { + "epoch": 0.4602123825067683, + "grad_norm": 0.76953125, + "learning_rate": 0.0001651738994470572, + "loss": 1.0, + "step": 17923 + }, + { + "epoch": 0.46023805970269016, + "grad_norm": 0.76953125, + "learning_rate": 0.00016517051353812692, + "loss": 0.8132, + "step": 17924 + }, + { + "epoch": 0.460263736898612, + "grad_norm": 0.77734375, + "learning_rate": 0.00016516712749931764, + "loss": 0.9548, + "step": 17925 + }, + { + "epoch": 0.46028941409453383, + "grad_norm": 0.765625, + "learning_rate": 0.00016516374133063616, + "loss": 0.9214, + "step": 17926 + }, + { + "epoch": 0.4603150912904556, + "grad_norm": 0.7734375, + "learning_rate": 0.00016516035503208926, + "loss": 0.8892, + "step": 17927 + }, + { + "epoch": 0.46034076848637745, + "grad_norm": 0.87109375, + "learning_rate": 0.0001651569686036836, + "loss": 0.9734, + "step": 17928 + }, + { + "epoch": 0.4603664456822993, + "grad_norm": 0.76171875, + "learning_rate": 0.00016515358204542607, + "loss": 0.8857, + "step": 17929 + }, + { + "epoch": 0.46039212287822107, + "grad_norm": 0.71484375, + "learning_rate": 0.00016515019535732328, + "loss": 0.8531, + "step": 17930 + }, + { + "epoch": 0.4604178000741429, + "grad_norm": 0.73046875, + "learning_rate": 0.00016514680853938206, + "loss": 0.7684, + "step": 17931 + }, + { + "epoch": 0.46044347727006474, + "grad_norm": 0.75, + "learning_rate": 0.0001651434215916091, + "loss": 0.8947, + "step": 17932 + }, + { + "epoch": 0.4604691544659865, + "grad_norm": 0.78125, + "learning_rate": 0.0001651400345140112, + "loss": 1.0361, + "step": 17933 + }, + { + "epoch": 0.46049483166190835, + "grad_norm": 0.84765625, + "learning_rate": 0.00016513664730659512, + "loss": 1.1593, + "step": 17934 + }, + { + "epoch": 0.4605205088578302, + "grad_norm": 0.8046875, + "learning_rate": 0.00016513325996936757, + "loss": 0.9719, + "step": 17935 + }, + { + "epoch": 0.460546186053752, + "grad_norm": 0.74609375, + "learning_rate": 0.00016512987250233532, + "loss": 0.8425, + "step": 17936 + }, + { + "epoch": 0.4605718632496738, + "grad_norm": 0.7890625, + "learning_rate": 0.0001651264849055051, + "loss": 0.8254, + "step": 17937 + }, + { + "epoch": 0.46059754044559564, + "grad_norm": 0.81640625, + "learning_rate": 0.0001651230971788837, + "loss": 0.9373, + "step": 17938 + }, + { + "epoch": 0.4606232176415175, + "grad_norm": 0.73046875, + "learning_rate": 0.00016511970932247785, + "loss": 1.0185, + "step": 17939 + }, + { + "epoch": 0.46064889483743926, + "grad_norm": 0.7578125, + "learning_rate": 0.00016511632133629425, + "loss": 0.8814, + "step": 17940 + }, + { + "epoch": 0.4606745720333611, + "grad_norm": 0.6953125, + "learning_rate": 0.0001651129332203398, + "loss": 0.7815, + "step": 17941 + }, + { + "epoch": 0.46070024922928293, + "grad_norm": 0.71484375, + "learning_rate": 0.00016510954497462108, + "loss": 0.8708, + "step": 17942 + }, + { + "epoch": 0.4607259264252047, + "grad_norm": 0.79296875, + "learning_rate": 0.00016510615659914493, + "loss": 0.9325, + "step": 17943 + }, + { + "epoch": 0.46075160362112655, + "grad_norm": 0.7734375, + "learning_rate": 0.00016510276809391812, + "loss": 0.8821, + "step": 17944 + }, + { + "epoch": 0.4607772808170484, + "grad_norm": 0.73828125, + "learning_rate": 0.00016509937945894735, + "loss": 0.9566, + "step": 17945 + }, + { + "epoch": 0.4608029580129702, + "grad_norm": 0.80078125, + "learning_rate": 0.0001650959906942394, + "loss": 0.9602, + "step": 17946 + }, + { + "epoch": 0.460828635208892, + "grad_norm": 0.75390625, + "learning_rate": 0.000165092601799801, + "loss": 0.8612, + "step": 17947 + }, + { + "epoch": 0.46085431240481384, + "grad_norm": 0.8046875, + "learning_rate": 0.00016508921277563892, + "loss": 0.8175, + "step": 17948 + }, + { + "epoch": 0.4608799896007357, + "grad_norm": 0.76171875, + "learning_rate": 0.00016508582362175997, + "loss": 0.8625, + "step": 17949 + }, + { + "epoch": 0.46090566679665745, + "grad_norm": 0.70703125, + "learning_rate": 0.00016508243433817086, + "loss": 0.8458, + "step": 17950 + }, + { + "epoch": 0.4609313439925793, + "grad_norm": 0.7734375, + "learning_rate": 0.0001650790449248783, + "loss": 0.8174, + "step": 17951 + }, + { + "epoch": 0.4609570211885011, + "grad_norm": 0.734375, + "learning_rate": 0.00016507565538188907, + "loss": 0.7805, + "step": 17952 + }, + { + "epoch": 0.4609826983844229, + "grad_norm": 0.796875, + "learning_rate": 0.00016507226570920995, + "loss": 0.8451, + "step": 17953 + }, + { + "epoch": 0.46100837558034474, + "grad_norm": 0.76171875, + "learning_rate": 0.00016506887590684766, + "loss": 0.9116, + "step": 17954 + }, + { + "epoch": 0.4610340527762666, + "grad_norm": 0.796875, + "learning_rate": 0.00016506548597480905, + "loss": 0.7175, + "step": 17955 + }, + { + "epoch": 0.4610597299721884, + "grad_norm": 0.7578125, + "learning_rate": 0.00016506209591310076, + "loss": 0.9481, + "step": 17956 + }, + { + "epoch": 0.4610854071681102, + "grad_norm": 0.8125, + "learning_rate": 0.00016505870572172958, + "loss": 0.7908, + "step": 17957 + }, + { + "epoch": 0.46111108436403203, + "grad_norm": 0.796875, + "learning_rate": 0.00016505531540070227, + "loss": 1.0309, + "step": 17958 + }, + { + "epoch": 0.46113676155995387, + "grad_norm": 0.83203125, + "learning_rate": 0.0001650519249500256, + "loss": 0.8347, + "step": 17959 + }, + { + "epoch": 0.46116243875587565, + "grad_norm": 0.8046875, + "learning_rate": 0.00016504853436970633, + "loss": 0.8298, + "step": 17960 + }, + { + "epoch": 0.4611881159517975, + "grad_norm": 0.7578125, + "learning_rate": 0.0001650451436597512, + "loss": 0.8176, + "step": 17961 + }, + { + "epoch": 0.4612137931477193, + "grad_norm": 0.69140625, + "learning_rate": 0.00016504175282016698, + "loss": 0.7189, + "step": 17962 + }, + { + "epoch": 0.4612394703436411, + "grad_norm": 0.7890625, + "learning_rate": 0.00016503836185096044, + "loss": 0.9168, + "step": 17963 + }, + { + "epoch": 0.46126514753956294, + "grad_norm": 0.79296875, + "learning_rate": 0.0001650349707521383, + "loss": 0.7773, + "step": 17964 + }, + { + "epoch": 0.4612908247354848, + "grad_norm": 0.8359375, + "learning_rate": 0.00016503157952370734, + "loss": 1.1432, + "step": 17965 + }, + { + "epoch": 0.4613165019314066, + "grad_norm": 0.84375, + "learning_rate": 0.0001650281881656743, + "loss": 0.9784, + "step": 17966 + }, + { + "epoch": 0.4613421791273284, + "grad_norm": 0.80078125, + "learning_rate": 0.00016502479667804596, + "loss": 0.7573, + "step": 17967 + }, + { + "epoch": 0.4613678563232502, + "grad_norm": 0.73046875, + "learning_rate": 0.0001650214050608291, + "loss": 0.8594, + "step": 17968 + }, + { + "epoch": 0.46139353351917206, + "grad_norm": 0.8046875, + "learning_rate": 0.00016501801331403045, + "loss": 0.9068, + "step": 17969 + }, + { + "epoch": 0.46141921071509384, + "grad_norm": 0.765625, + "learning_rate": 0.00016501462143765676, + "loss": 0.9802, + "step": 17970 + }, + { + "epoch": 0.4614448879110157, + "grad_norm": 0.7890625, + "learning_rate": 0.00016501122943171477, + "loss": 0.873, + "step": 17971 + }, + { + "epoch": 0.4614705651069375, + "grad_norm": 0.80078125, + "learning_rate": 0.0001650078372962113, + "loss": 1.0033, + "step": 17972 + }, + { + "epoch": 0.4614962423028593, + "grad_norm": 0.79296875, + "learning_rate": 0.0001650044450311531, + "loss": 0.958, + "step": 17973 + }, + { + "epoch": 0.46152191949878113, + "grad_norm": 0.81640625, + "learning_rate": 0.0001650010526365469, + "loss": 0.9487, + "step": 17974 + }, + { + "epoch": 0.46154759669470297, + "grad_norm": 0.7578125, + "learning_rate": 0.00016499766011239945, + "loss": 0.9409, + "step": 17975 + }, + { + "epoch": 0.46157327389062475, + "grad_norm": 0.79296875, + "learning_rate": 0.00016499426745871757, + "loss": 0.9531, + "step": 17976 + }, + { + "epoch": 0.4615989510865466, + "grad_norm": 0.7890625, + "learning_rate": 0.00016499087467550794, + "loss": 0.89, + "step": 17977 + }, + { + "epoch": 0.4616246282824684, + "grad_norm": 0.76953125, + "learning_rate": 0.00016498748176277744, + "loss": 0.9588, + "step": 17978 + }, + { + "epoch": 0.46165030547839025, + "grad_norm": 0.75, + "learning_rate": 0.0001649840887205327, + "loss": 0.9199, + "step": 17979 + }, + { + "epoch": 0.46167598267431204, + "grad_norm": 0.82421875, + "learning_rate": 0.00016498069554878058, + "loss": 0.933, + "step": 17980 + }, + { + "epoch": 0.46170165987023387, + "grad_norm": 0.734375, + "learning_rate": 0.00016497730224752778, + "loss": 0.8637, + "step": 17981 + }, + { + "epoch": 0.4617273370661557, + "grad_norm": 0.8125, + "learning_rate": 0.00016497390881678106, + "loss": 1.0247, + "step": 17982 + }, + { + "epoch": 0.4617530142620775, + "grad_norm": 0.78515625, + "learning_rate": 0.00016497051525654727, + "loss": 0.8315, + "step": 17983 + }, + { + "epoch": 0.4617786914579993, + "grad_norm": 0.765625, + "learning_rate": 0.00016496712156683306, + "loss": 0.9005, + "step": 17984 + }, + { + "epoch": 0.46180436865392116, + "grad_norm": 0.8046875, + "learning_rate": 0.00016496372774764528, + "loss": 0.8285, + "step": 17985 + }, + { + "epoch": 0.46183004584984294, + "grad_norm": 0.734375, + "learning_rate": 0.00016496033379899064, + "loss": 0.847, + "step": 17986 + }, + { + "epoch": 0.4618557230457648, + "grad_norm": 0.8515625, + "learning_rate": 0.00016495693972087591, + "loss": 0.9094, + "step": 17987 + }, + { + "epoch": 0.4618814002416866, + "grad_norm": 0.77734375, + "learning_rate": 0.00016495354551330788, + "loss": 0.8302, + "step": 17988 + }, + { + "epoch": 0.46190707743760845, + "grad_norm": 0.765625, + "learning_rate": 0.00016495015117629332, + "loss": 0.8889, + "step": 17989 + }, + { + "epoch": 0.46193275463353023, + "grad_norm": 0.78125, + "learning_rate": 0.00016494675670983896, + "loss": 0.9579, + "step": 17990 + }, + { + "epoch": 0.46195843182945207, + "grad_norm": 0.8046875, + "learning_rate": 0.00016494336211395159, + "loss": 0.9271, + "step": 17991 + }, + { + "epoch": 0.4619841090253739, + "grad_norm": 0.8828125, + "learning_rate": 0.00016493996738863796, + "loss": 1.0151, + "step": 17992 + }, + { + "epoch": 0.4620097862212957, + "grad_norm": 0.70703125, + "learning_rate": 0.00016493657253390483, + "loss": 0.8145, + "step": 17993 + }, + { + "epoch": 0.4620354634172175, + "grad_norm": 0.72265625, + "learning_rate": 0.00016493317754975897, + "loss": 0.9059, + "step": 17994 + }, + { + "epoch": 0.46206114061313935, + "grad_norm": 0.7421875, + "learning_rate": 0.00016492978243620715, + "loss": 1.0877, + "step": 17995 + }, + { + "epoch": 0.46208681780906113, + "grad_norm": 0.734375, + "learning_rate": 0.00016492638719325616, + "loss": 0.8735, + "step": 17996 + }, + { + "epoch": 0.46211249500498297, + "grad_norm": 0.74609375, + "learning_rate": 0.00016492299182091274, + "loss": 0.9088, + "step": 17997 + }, + { + "epoch": 0.4621381722009048, + "grad_norm": 0.828125, + "learning_rate": 0.00016491959631918367, + "loss": 0.8969, + "step": 17998 + }, + { + "epoch": 0.46216384939682664, + "grad_norm": 0.73046875, + "learning_rate": 0.0001649162006880757, + "loss": 0.834, + "step": 17999 + }, + { + "epoch": 0.4621895265927484, + "grad_norm": 0.8359375, + "learning_rate": 0.0001649128049275956, + "loss": 0.8521, + "step": 18000 + }, + { + "epoch": 0.4621895265927484, + "eval_loss": 0.8996534943580627, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 403.3605, + "eval_samples_per_second": 24.792, + "eval_steps_per_second": 0.776, + "step": 18000 + }, + { + "epoch": 0.46221520378867026, + "grad_norm": 0.78515625, + "learning_rate": 0.00016490940903775015, + "loss": 1.0669, + "step": 18001 + }, + { + "epoch": 0.4622408809845921, + "grad_norm": 0.7265625, + "learning_rate": 0.0001649060130185461, + "loss": 0.8496, + "step": 18002 + }, + { + "epoch": 0.4622665581805139, + "grad_norm": 0.86328125, + "learning_rate": 0.00016490261686999026, + "loss": 0.9214, + "step": 18003 + }, + { + "epoch": 0.4622922353764357, + "grad_norm": 0.78125, + "learning_rate": 0.00016489922059208936, + "loss": 0.7453, + "step": 18004 + }, + { + "epoch": 0.46231791257235755, + "grad_norm": 0.7890625, + "learning_rate": 0.00016489582418485014, + "loss": 0.8727, + "step": 18005 + }, + { + "epoch": 0.46234358976827933, + "grad_norm": 0.73046875, + "learning_rate": 0.00016489242764827942, + "loss": 0.7963, + "step": 18006 + }, + { + "epoch": 0.46236926696420116, + "grad_norm": 0.73828125, + "learning_rate": 0.00016488903098238396, + "loss": 0.8651, + "step": 18007 + }, + { + "epoch": 0.462394944160123, + "grad_norm": 0.7734375, + "learning_rate": 0.0001648856341871705, + "loss": 0.8404, + "step": 18008 + }, + { + "epoch": 0.46242062135604484, + "grad_norm": 0.83984375, + "learning_rate": 0.0001648822372626459, + "loss": 0.8374, + "step": 18009 + }, + { + "epoch": 0.4624462985519666, + "grad_norm": 0.76953125, + "learning_rate": 0.0001648788402088168, + "loss": 0.9875, + "step": 18010 + }, + { + "epoch": 0.46247197574788845, + "grad_norm": 0.7734375, + "learning_rate": 0.00016487544302569006, + "loss": 0.8843, + "step": 18011 + }, + { + "epoch": 0.4624976529438103, + "grad_norm": 0.78515625, + "learning_rate": 0.0001648720457132724, + "loss": 1.0226, + "step": 18012 + }, + { + "epoch": 0.46252333013973207, + "grad_norm": 0.7265625, + "learning_rate": 0.00016486864827157063, + "loss": 0.8666, + "step": 18013 + }, + { + "epoch": 0.4625490073356539, + "grad_norm": 0.76953125, + "learning_rate": 0.00016486525070059149, + "loss": 0.9014, + "step": 18014 + }, + { + "epoch": 0.46257468453157574, + "grad_norm": 0.7265625, + "learning_rate": 0.00016486185300034175, + "loss": 0.8226, + "step": 18015 + }, + { + "epoch": 0.4626003617274975, + "grad_norm": 0.83203125, + "learning_rate": 0.00016485845517082823, + "loss": 0.8525, + "step": 18016 + }, + { + "epoch": 0.46262603892341936, + "grad_norm": 0.8125, + "learning_rate": 0.00016485505721205765, + "loss": 0.9167, + "step": 18017 + }, + { + "epoch": 0.4626517161193412, + "grad_norm": 0.83984375, + "learning_rate": 0.00016485165912403683, + "loss": 0.9242, + "step": 18018 + }, + { + "epoch": 0.46267739331526303, + "grad_norm": 0.796875, + "learning_rate": 0.00016484826090677247, + "loss": 0.9163, + "step": 18019 + }, + { + "epoch": 0.4627030705111848, + "grad_norm": 0.7421875, + "learning_rate": 0.00016484486256027137, + "loss": 0.7344, + "step": 18020 + }, + { + "epoch": 0.46272874770710665, + "grad_norm": 0.74609375, + "learning_rate": 0.00016484146408454037, + "loss": 0.9351, + "step": 18021 + }, + { + "epoch": 0.4627544249030285, + "grad_norm": 0.73046875, + "learning_rate": 0.0001648380654795862, + "loss": 0.7779, + "step": 18022 + }, + { + "epoch": 0.46278010209895026, + "grad_norm": 0.7734375, + "learning_rate": 0.00016483466674541557, + "loss": 0.8924, + "step": 18023 + }, + { + "epoch": 0.4628057792948721, + "grad_norm": 0.765625, + "learning_rate": 0.00016483126788203531, + "loss": 0.8394, + "step": 18024 + }, + { + "epoch": 0.46283145649079394, + "grad_norm": 0.8203125, + "learning_rate": 0.0001648278688894522, + "loss": 0.882, + "step": 18025 + }, + { + "epoch": 0.4628571336867157, + "grad_norm": 0.80078125, + "learning_rate": 0.00016482446976767302, + "loss": 0.9615, + "step": 18026 + }, + { + "epoch": 0.46288281088263755, + "grad_norm": 0.93359375, + "learning_rate": 0.0001648210705167045, + "loss": 0.8776, + "step": 18027 + }, + { + "epoch": 0.4629084880785594, + "grad_norm": 0.81640625, + "learning_rate": 0.00016481767113655344, + "loss": 0.9761, + "step": 18028 + }, + { + "epoch": 0.4629341652744812, + "grad_norm": 0.80859375, + "learning_rate": 0.00016481427162722665, + "loss": 0.9153, + "step": 18029 + }, + { + "epoch": 0.462959842470403, + "grad_norm": 0.76171875, + "learning_rate": 0.00016481087198873086, + "loss": 0.8487, + "step": 18030 + }, + { + "epoch": 0.46298551966632484, + "grad_norm": 0.72265625, + "learning_rate": 0.00016480747222107284, + "loss": 0.8233, + "step": 18031 + }, + { + "epoch": 0.4630111968622467, + "grad_norm": 0.78125, + "learning_rate": 0.00016480407232425938, + "loss": 0.8452, + "step": 18032 + }, + { + "epoch": 0.46303687405816846, + "grad_norm": 0.78125, + "learning_rate": 0.00016480067229829727, + "loss": 0.8862, + "step": 18033 + }, + { + "epoch": 0.4630625512540903, + "grad_norm": 0.78515625, + "learning_rate": 0.00016479727214319326, + "loss": 1.0275, + "step": 18034 + }, + { + "epoch": 0.46308822845001213, + "grad_norm": 0.8359375, + "learning_rate": 0.0001647938718589542, + "loss": 1.1197, + "step": 18035 + }, + { + "epoch": 0.4631139056459339, + "grad_norm": 0.77734375, + "learning_rate": 0.00016479047144558675, + "loss": 0.7652, + "step": 18036 + }, + { + "epoch": 0.46313958284185575, + "grad_norm": 0.85546875, + "learning_rate": 0.00016478707090309772, + "loss": 1.0769, + "step": 18037 + }, + { + "epoch": 0.4631652600377776, + "grad_norm": 0.765625, + "learning_rate": 0.00016478367023149395, + "loss": 0.965, + "step": 18038 + }, + { + "epoch": 0.4631909372336994, + "grad_norm": 0.76953125, + "learning_rate": 0.00016478026943078216, + "loss": 0.9382, + "step": 18039 + }, + { + "epoch": 0.4632166144296212, + "grad_norm": 0.796875, + "learning_rate": 0.00016477686850096916, + "loss": 0.8995, + "step": 18040 + }, + { + "epoch": 0.46324229162554303, + "grad_norm": 0.78125, + "learning_rate": 0.0001647734674420617, + "loss": 0.9453, + "step": 18041 + }, + { + "epoch": 0.46326796882146487, + "grad_norm": 0.7734375, + "learning_rate": 0.0001647700662540666, + "loss": 0.858, + "step": 18042 + }, + { + "epoch": 0.46329364601738665, + "grad_norm": 0.94140625, + "learning_rate": 0.00016476666493699056, + "loss": 0.975, + "step": 18043 + }, + { + "epoch": 0.4633193232133085, + "grad_norm": 0.75390625, + "learning_rate": 0.00016476326349084043, + "loss": 0.8761, + "step": 18044 + }, + { + "epoch": 0.4633450004092303, + "grad_norm": 0.796875, + "learning_rate": 0.000164759861915623, + "loss": 0.801, + "step": 18045 + }, + { + "epoch": 0.4633706776051521, + "grad_norm": 0.76953125, + "learning_rate": 0.00016475646021134495, + "loss": 0.8625, + "step": 18046 + }, + { + "epoch": 0.46339635480107394, + "grad_norm": 0.71875, + "learning_rate": 0.00016475305837801316, + "loss": 0.912, + "step": 18047 + }, + { + "epoch": 0.4634220319969958, + "grad_norm": 0.80078125, + "learning_rate": 0.0001647496564156344, + "loss": 0.9135, + "step": 18048 + }, + { + "epoch": 0.4634477091929176, + "grad_norm": 0.7890625, + "learning_rate": 0.00016474625432421534, + "loss": 0.7802, + "step": 18049 + }, + { + "epoch": 0.4634733863888394, + "grad_norm": 0.828125, + "learning_rate": 0.00016474285210376293, + "loss": 0.8749, + "step": 18050 + }, + { + "epoch": 0.46349906358476123, + "grad_norm": 0.8125, + "learning_rate": 0.00016473944975428382, + "loss": 0.8955, + "step": 18051 + }, + { + "epoch": 0.46352474078068306, + "grad_norm": 0.8046875, + "learning_rate": 0.00016473604727578484, + "loss": 0.863, + "step": 18052 + }, + { + "epoch": 0.46355041797660484, + "grad_norm": 0.7578125, + "learning_rate": 0.0001647326446682728, + "loss": 0.8853, + "step": 18053 + }, + { + "epoch": 0.4635760951725267, + "grad_norm": 0.78515625, + "learning_rate": 0.00016472924193175438, + "loss": 0.8593, + "step": 18054 + }, + { + "epoch": 0.4636017723684485, + "grad_norm": 0.81640625, + "learning_rate": 0.0001647258390662365, + "loss": 0.9053, + "step": 18055 + }, + { + "epoch": 0.4636274495643703, + "grad_norm": 0.73046875, + "learning_rate": 0.0001647224360717258, + "loss": 0.8982, + "step": 18056 + }, + { + "epoch": 0.46365312676029213, + "grad_norm": 0.73828125, + "learning_rate": 0.00016471903294822918, + "loss": 0.9322, + "step": 18057 + }, + { + "epoch": 0.46367880395621397, + "grad_norm": 0.73828125, + "learning_rate": 0.00016471562969575337, + "loss": 1.0279, + "step": 18058 + }, + { + "epoch": 0.4637044811521358, + "grad_norm": 0.87890625, + "learning_rate": 0.00016471222631430515, + "loss": 0.9318, + "step": 18059 + }, + { + "epoch": 0.4637301583480576, + "grad_norm": 0.765625, + "learning_rate": 0.00016470882280389127, + "loss": 0.8655, + "step": 18060 + }, + { + "epoch": 0.4637558355439794, + "grad_norm": 0.79296875, + "learning_rate": 0.0001647054191645186, + "loss": 0.8997, + "step": 18061 + }, + { + "epoch": 0.46378151273990126, + "grad_norm": 0.80078125, + "learning_rate": 0.00016470201539619386, + "loss": 0.8834, + "step": 18062 + }, + { + "epoch": 0.46380718993582304, + "grad_norm": 0.80859375, + "learning_rate": 0.00016469861149892383, + "loss": 0.9575, + "step": 18063 + }, + { + "epoch": 0.4638328671317449, + "grad_norm": 0.73046875, + "learning_rate": 0.00016469520747271535, + "loss": 0.8295, + "step": 18064 + }, + { + "epoch": 0.4638585443276667, + "grad_norm": 0.6953125, + "learning_rate": 0.00016469180331757512, + "loss": 0.8422, + "step": 18065 + }, + { + "epoch": 0.4638842215235885, + "grad_norm": 0.84375, + "learning_rate": 0.00016468839903351, + "loss": 0.9773, + "step": 18066 + }, + { + "epoch": 0.4639098987195103, + "grad_norm": 0.81640625, + "learning_rate": 0.00016468499462052674, + "loss": 0.8739, + "step": 18067 + }, + { + "epoch": 0.46393557591543216, + "grad_norm": 0.7578125, + "learning_rate": 0.00016468159007863213, + "loss": 0.7716, + "step": 18068 + }, + { + "epoch": 0.463961253111354, + "grad_norm": 0.734375, + "learning_rate": 0.00016467818540783294, + "loss": 0.9163, + "step": 18069 + }, + { + "epoch": 0.4639869303072758, + "grad_norm": 0.78125, + "learning_rate": 0.00016467478060813598, + "loss": 0.831, + "step": 18070 + }, + { + "epoch": 0.4640126075031976, + "grad_norm": 0.71484375, + "learning_rate": 0.000164671375679548, + "loss": 0.9326, + "step": 18071 + }, + { + "epoch": 0.46403828469911945, + "grad_norm": 0.78515625, + "learning_rate": 0.00016466797062207586, + "loss": 0.9521, + "step": 18072 + }, + { + "epoch": 0.46406396189504123, + "grad_norm": 0.79296875, + "learning_rate": 0.00016466456543572626, + "loss": 0.9053, + "step": 18073 + }, + { + "epoch": 0.46408963909096307, + "grad_norm": 0.76953125, + "learning_rate": 0.00016466116012050602, + "loss": 0.8601, + "step": 18074 + }, + { + "epoch": 0.4641153162868849, + "grad_norm": 0.7265625, + "learning_rate": 0.00016465775467642196, + "loss": 0.8376, + "step": 18075 + }, + { + "epoch": 0.4641409934828067, + "grad_norm": 0.78515625, + "learning_rate": 0.0001646543491034808, + "loss": 0.9507, + "step": 18076 + }, + { + "epoch": 0.4641666706787285, + "grad_norm": 0.796875, + "learning_rate": 0.0001646509434016894, + "loss": 0.8382, + "step": 18077 + }, + { + "epoch": 0.46419234787465036, + "grad_norm": 0.82421875, + "learning_rate": 0.00016464753757105445, + "loss": 0.8752, + "step": 18078 + }, + { + "epoch": 0.4642180250705722, + "grad_norm": 0.74609375, + "learning_rate": 0.00016464413161158286, + "loss": 0.8721, + "step": 18079 + }, + { + "epoch": 0.464243702266494, + "grad_norm": 0.765625, + "learning_rate": 0.00016464072552328131, + "loss": 0.8792, + "step": 18080 + }, + { + "epoch": 0.4642693794624158, + "grad_norm": 0.7578125, + "learning_rate": 0.00016463731930615666, + "loss": 0.8639, + "step": 18081 + }, + { + "epoch": 0.46429505665833765, + "grad_norm": 0.8125, + "learning_rate": 0.00016463391296021567, + "loss": 0.9279, + "step": 18082 + }, + { + "epoch": 0.4643207338542594, + "grad_norm": 0.84765625, + "learning_rate": 0.00016463050648546514, + "loss": 0.9501, + "step": 18083 + }, + { + "epoch": 0.46434641105018126, + "grad_norm": 0.77734375, + "learning_rate": 0.0001646270998819118, + "loss": 0.8808, + "step": 18084 + }, + { + "epoch": 0.4643720882461031, + "grad_norm": 0.7421875, + "learning_rate": 0.0001646236931495625, + "loss": 0.8428, + "step": 18085 + }, + { + "epoch": 0.4643977654420249, + "grad_norm": 0.79296875, + "learning_rate": 0.00016462028628842407, + "loss": 0.9196, + "step": 18086 + }, + { + "epoch": 0.4644234426379467, + "grad_norm": 0.69140625, + "learning_rate": 0.0001646168792985032, + "loss": 0.8252, + "step": 18087 + }, + { + "epoch": 0.46444911983386855, + "grad_norm": 0.76171875, + "learning_rate": 0.00016461347217980676, + "loss": 1.044, + "step": 18088 + }, + { + "epoch": 0.4644747970297904, + "grad_norm": 0.734375, + "learning_rate": 0.00016461006493234147, + "loss": 0.9556, + "step": 18089 + }, + { + "epoch": 0.46450047422571217, + "grad_norm": 0.79296875, + "learning_rate": 0.00016460665755611416, + "loss": 0.9427, + "step": 18090 + }, + { + "epoch": 0.464526151421634, + "grad_norm": 0.84375, + "learning_rate": 0.00016460325005113164, + "loss": 0.9915, + "step": 18091 + }, + { + "epoch": 0.46455182861755584, + "grad_norm": 0.734375, + "learning_rate": 0.00016459984241740068, + "loss": 0.861, + "step": 18092 + }, + { + "epoch": 0.4645775058134776, + "grad_norm": 0.71484375, + "learning_rate": 0.00016459643465492805, + "loss": 0.7977, + "step": 18093 + }, + { + "epoch": 0.46460318300939946, + "grad_norm": 0.7578125, + "learning_rate": 0.00016459302676372057, + "loss": 0.9747, + "step": 18094 + }, + { + "epoch": 0.4646288602053213, + "grad_norm": 0.7421875, + "learning_rate": 0.00016458961874378503, + "loss": 1.0541, + "step": 18095 + }, + { + "epoch": 0.4646545374012431, + "grad_norm": 0.765625, + "learning_rate": 0.0001645862105951282, + "loss": 0.8711, + "step": 18096 + }, + { + "epoch": 0.4646802145971649, + "grad_norm": 0.68359375, + "learning_rate": 0.00016458280231775689, + "loss": 0.8658, + "step": 18097 + }, + { + "epoch": 0.46470589179308674, + "grad_norm": 0.75, + "learning_rate": 0.0001645793939116779, + "loss": 0.7941, + "step": 18098 + }, + { + "epoch": 0.4647315689890086, + "grad_norm": 0.7890625, + "learning_rate": 0.000164575985376898, + "loss": 0.8914, + "step": 18099 + }, + { + "epoch": 0.46475724618493036, + "grad_norm": 0.90625, + "learning_rate": 0.000164572576713424, + "loss": 0.935, + "step": 18100 + }, + { + "epoch": 0.4647829233808522, + "grad_norm": 0.78515625, + "learning_rate": 0.00016456916792126266, + "loss": 1.0561, + "step": 18101 + }, + { + "epoch": 0.46480860057677403, + "grad_norm": 0.80078125, + "learning_rate": 0.00016456575900042082, + "loss": 0.8422, + "step": 18102 + }, + { + "epoch": 0.4648342777726958, + "grad_norm": 0.8515625, + "learning_rate": 0.00016456234995090526, + "loss": 0.9906, + "step": 18103 + }, + { + "epoch": 0.46485995496861765, + "grad_norm": 0.9375, + "learning_rate": 0.00016455894077272278, + "loss": 0.963, + "step": 18104 + }, + { + "epoch": 0.4648856321645395, + "grad_norm": 0.72265625, + "learning_rate": 0.00016455553146588013, + "loss": 0.8826, + "step": 18105 + }, + { + "epoch": 0.46491130936046127, + "grad_norm": 0.73828125, + "learning_rate": 0.00016455212203038416, + "loss": 0.7902, + "step": 18106 + }, + { + "epoch": 0.4649369865563831, + "grad_norm": 0.73046875, + "learning_rate": 0.00016454871246624164, + "loss": 0.9971, + "step": 18107 + }, + { + "epoch": 0.46496266375230494, + "grad_norm": 0.75, + "learning_rate": 0.00016454530277345937, + "loss": 0.9435, + "step": 18108 + }, + { + "epoch": 0.4649883409482268, + "grad_norm": 0.74609375, + "learning_rate": 0.00016454189295204413, + "loss": 0.8876, + "step": 18109 + }, + { + "epoch": 0.46501401814414856, + "grad_norm": 0.75, + "learning_rate": 0.00016453848300200275, + "loss": 0.9845, + "step": 18110 + }, + { + "epoch": 0.4650396953400704, + "grad_norm": 0.84765625, + "learning_rate": 0.00016453507292334198, + "loss": 0.9927, + "step": 18111 + }, + { + "epoch": 0.4650653725359922, + "grad_norm": 0.77734375, + "learning_rate": 0.00016453166271606864, + "loss": 0.84, + "step": 18112 + }, + { + "epoch": 0.465091049731914, + "grad_norm": 0.79296875, + "learning_rate": 0.00016452825238018952, + "loss": 1.0215, + "step": 18113 + }, + { + "epoch": 0.46511672692783584, + "grad_norm": 0.796875, + "learning_rate": 0.00016452484191571145, + "loss": 0.7724, + "step": 18114 + }, + { + "epoch": 0.4651424041237577, + "grad_norm": 0.79296875, + "learning_rate": 0.0001645214313226412, + "loss": 0.8763, + "step": 18115 + }, + { + "epoch": 0.46516808131967946, + "grad_norm": 0.80078125, + "learning_rate": 0.00016451802060098554, + "loss": 0.9301, + "step": 18116 + }, + { + "epoch": 0.4651937585156013, + "grad_norm": 0.76171875, + "learning_rate": 0.0001645146097507513, + "loss": 1.0008, + "step": 18117 + }, + { + "epoch": 0.46521943571152313, + "grad_norm": 0.765625, + "learning_rate": 0.00016451119877194524, + "loss": 0.8572, + "step": 18118 + }, + { + "epoch": 0.46524511290744497, + "grad_norm": 0.76171875, + "learning_rate": 0.00016450778766457423, + "loss": 0.9262, + "step": 18119 + }, + { + "epoch": 0.46527079010336675, + "grad_norm": 0.7734375, + "learning_rate": 0.000164504376428645, + "loss": 0.7941, + "step": 18120 + }, + { + "epoch": 0.4652964672992886, + "grad_norm": 0.83984375, + "learning_rate": 0.00016450096506416442, + "loss": 0.9856, + "step": 18121 + }, + { + "epoch": 0.4653221444952104, + "grad_norm": 0.77734375, + "learning_rate": 0.0001644975535711392, + "loss": 0.9265, + "step": 18122 + }, + { + "epoch": 0.4653478216911322, + "grad_norm": 0.8046875, + "learning_rate": 0.0001644941419495762, + "loss": 0.8952, + "step": 18123 + }, + { + "epoch": 0.46537349888705404, + "grad_norm": 0.734375, + "learning_rate": 0.0001644907301994822, + "loss": 0.827, + "step": 18124 + }, + { + "epoch": 0.4653991760829759, + "grad_norm": 0.78515625, + "learning_rate": 0.000164487318320864, + "loss": 0.8776, + "step": 18125 + }, + { + "epoch": 0.46542485327889765, + "grad_norm": 0.89453125, + "learning_rate": 0.00016448390631372837, + "loss": 0.9737, + "step": 18126 + }, + { + "epoch": 0.4654505304748195, + "grad_norm": 0.7578125, + "learning_rate": 0.0001644804941780822, + "loss": 0.8338, + "step": 18127 + }, + { + "epoch": 0.4654762076707413, + "grad_norm": 0.82421875, + "learning_rate": 0.00016447708191393218, + "loss": 0.9375, + "step": 18128 + }, + { + "epoch": 0.46550188486666316, + "grad_norm": 0.8203125, + "learning_rate": 0.00016447366952128517, + "loss": 0.9855, + "step": 18129 + }, + { + "epoch": 0.46552756206258494, + "grad_norm": 0.796875, + "learning_rate": 0.00016447025700014794, + "loss": 0.8919, + "step": 18130 + }, + { + "epoch": 0.4655532392585068, + "grad_norm": 0.83203125, + "learning_rate": 0.00016446684435052737, + "loss": 0.9401, + "step": 18131 + }, + { + "epoch": 0.4655789164544286, + "grad_norm": 0.734375, + "learning_rate": 0.00016446343157243014, + "loss": 0.9231, + "step": 18132 + }, + { + "epoch": 0.4656045936503504, + "grad_norm": 0.7890625, + "learning_rate": 0.00016446001866586313, + "loss": 0.8802, + "step": 18133 + }, + { + "epoch": 0.46563027084627223, + "grad_norm": 0.734375, + "learning_rate": 0.00016445660563083316, + "loss": 0.9415, + "step": 18134 + }, + { + "epoch": 0.46565594804219407, + "grad_norm": 0.81640625, + "learning_rate": 0.00016445319246734697, + "loss": 0.9305, + "step": 18135 + }, + { + "epoch": 0.46568162523811585, + "grad_norm": 0.796875, + "learning_rate": 0.00016444977917541142, + "loss": 0.8179, + "step": 18136 + }, + { + "epoch": 0.4657073024340377, + "grad_norm": 0.80078125, + "learning_rate": 0.00016444636575503325, + "loss": 0.8899, + "step": 18137 + }, + { + "epoch": 0.4657329796299595, + "grad_norm": 0.703125, + "learning_rate": 0.00016444295220621928, + "loss": 0.7807, + "step": 18138 + }, + { + "epoch": 0.46575865682588136, + "grad_norm": 0.73046875, + "learning_rate": 0.00016443953852897635, + "loss": 0.8714, + "step": 18139 + }, + { + "epoch": 0.46578433402180314, + "grad_norm": 0.73828125, + "learning_rate": 0.00016443612472331122, + "loss": 0.9347, + "step": 18140 + }, + { + "epoch": 0.465810011217725, + "grad_norm": 0.76953125, + "learning_rate": 0.00016443271078923078, + "loss": 0.9293, + "step": 18141 + }, + { + "epoch": 0.4658356884136468, + "grad_norm": 0.77734375, + "learning_rate": 0.0001644292967267417, + "loss": 0.9679, + "step": 18142 + }, + { + "epoch": 0.4658613656095686, + "grad_norm": 0.734375, + "learning_rate": 0.00016442588253585088, + "loss": 0.8686, + "step": 18143 + }, + { + "epoch": 0.4658870428054904, + "grad_norm": 0.6875, + "learning_rate": 0.00016442246821656505, + "loss": 0.7812, + "step": 18144 + }, + { + "epoch": 0.46591272000141226, + "grad_norm": 0.7734375, + "learning_rate": 0.00016441905376889113, + "loss": 0.8333, + "step": 18145 + }, + { + "epoch": 0.46593839719733404, + "grad_norm": 0.73046875, + "learning_rate": 0.0001644156391928358, + "loss": 0.9852, + "step": 18146 + }, + { + "epoch": 0.4659640743932559, + "grad_norm": 0.84375, + "learning_rate": 0.00016441222448840596, + "loss": 0.8879, + "step": 18147 + }, + { + "epoch": 0.4659897515891777, + "grad_norm": 0.76953125, + "learning_rate": 0.00016440880965560838, + "loss": 0.9062, + "step": 18148 + }, + { + "epoch": 0.46601542878509955, + "grad_norm": 0.796875, + "learning_rate": 0.0001644053946944498, + "loss": 0.9784, + "step": 18149 + }, + { + "epoch": 0.46604110598102133, + "grad_norm": 0.7734375, + "learning_rate": 0.00016440197960493713, + "loss": 0.9427, + "step": 18150 + }, + { + "epoch": 0.46606678317694317, + "grad_norm": 0.75, + "learning_rate": 0.00016439856438707712, + "loss": 0.8271, + "step": 18151 + }, + { + "epoch": 0.466092460372865, + "grad_norm": 0.73828125, + "learning_rate": 0.0001643951490408766, + "loss": 0.7981, + "step": 18152 + }, + { + "epoch": 0.4661181375687868, + "grad_norm": 0.80078125, + "learning_rate": 0.00016439173356634233, + "loss": 0.8732, + "step": 18153 + }, + { + "epoch": 0.4661438147647086, + "grad_norm": 0.71875, + "learning_rate": 0.0001643883179634812, + "loss": 0.8141, + "step": 18154 + }, + { + "epoch": 0.46616949196063046, + "grad_norm": 0.73828125, + "learning_rate": 0.00016438490223229997, + "loss": 0.8164, + "step": 18155 + }, + { + "epoch": 0.46619516915655224, + "grad_norm": 0.81640625, + "learning_rate": 0.0001643814863728054, + "loss": 0.8745, + "step": 18156 + }, + { + "epoch": 0.46622084635247407, + "grad_norm": 0.76171875, + "learning_rate": 0.00016437807038500437, + "loss": 0.7466, + "step": 18157 + }, + { + "epoch": 0.4662465235483959, + "grad_norm": 0.73828125, + "learning_rate": 0.00016437465426890366, + "loss": 0.8393, + "step": 18158 + }, + { + "epoch": 0.46627220074431774, + "grad_norm": 0.875, + "learning_rate": 0.00016437123802451006, + "loss": 0.9008, + "step": 18159 + }, + { + "epoch": 0.4662978779402395, + "grad_norm": 0.79296875, + "learning_rate": 0.00016436782165183046, + "loss": 0.8108, + "step": 18160 + }, + { + "epoch": 0.46632355513616136, + "grad_norm": 0.79296875, + "learning_rate": 0.00016436440515087155, + "loss": 0.8179, + "step": 18161 + }, + { + "epoch": 0.4663492323320832, + "grad_norm": 0.84765625, + "learning_rate": 0.0001643609885216402, + "loss": 1.0766, + "step": 18162 + }, + { + "epoch": 0.466374909528005, + "grad_norm": 0.83203125, + "learning_rate": 0.0001643575717641432, + "loss": 0.8833, + "step": 18163 + }, + { + "epoch": 0.4664005867239268, + "grad_norm": 0.80078125, + "learning_rate": 0.0001643541548783874, + "loss": 1.0352, + "step": 18164 + }, + { + "epoch": 0.46642626391984865, + "grad_norm": 0.765625, + "learning_rate": 0.00016435073786437955, + "loss": 0.9079, + "step": 18165 + }, + { + "epoch": 0.46645194111577043, + "grad_norm": 0.765625, + "learning_rate": 0.00016434732072212653, + "loss": 1.0371, + "step": 18166 + }, + { + "epoch": 0.46647761831169227, + "grad_norm": 0.74609375, + "learning_rate": 0.0001643439034516351, + "loss": 0.79, + "step": 18167 + }, + { + "epoch": 0.4665032955076141, + "grad_norm": 0.7578125, + "learning_rate": 0.00016434048605291207, + "loss": 1.0019, + "step": 18168 + }, + { + "epoch": 0.46652897270353594, + "grad_norm": 0.81640625, + "learning_rate": 0.00016433706852596428, + "loss": 0.9138, + "step": 18169 + }, + { + "epoch": 0.4665546498994577, + "grad_norm": 0.74609375, + "learning_rate": 0.00016433365087079851, + "loss": 0.8635, + "step": 18170 + }, + { + "epoch": 0.46658032709537955, + "grad_norm": 0.7265625, + "learning_rate": 0.00016433023308742162, + "loss": 0.8239, + "step": 18171 + }, + { + "epoch": 0.4666060042913014, + "grad_norm": 0.71875, + "learning_rate": 0.00016432681517584033, + "loss": 0.8674, + "step": 18172 + }, + { + "epoch": 0.46663168148722317, + "grad_norm": 0.8515625, + "learning_rate": 0.00016432339713606155, + "loss": 0.8205, + "step": 18173 + }, + { + "epoch": 0.466657358683145, + "grad_norm": 0.74609375, + "learning_rate": 0.00016431997896809204, + "loss": 0.8345, + "step": 18174 + }, + { + "epoch": 0.46668303587906684, + "grad_norm": 0.80078125, + "learning_rate": 0.00016431656067193864, + "loss": 0.9925, + "step": 18175 + }, + { + "epoch": 0.4667087130749886, + "grad_norm": 0.7578125, + "learning_rate": 0.0001643131422476081, + "loss": 0.8824, + "step": 18176 + }, + { + "epoch": 0.46673439027091046, + "grad_norm": 0.74609375, + "learning_rate": 0.0001643097236951073, + "loss": 0.9236, + "step": 18177 + }, + { + "epoch": 0.4667600674668323, + "grad_norm": 0.796875, + "learning_rate": 0.00016430630501444303, + "loss": 0.8989, + "step": 18178 + }, + { + "epoch": 0.4667857446627541, + "grad_norm": 0.72265625, + "learning_rate": 0.0001643028862056221, + "loss": 0.8555, + "step": 18179 + }, + { + "epoch": 0.4668114218586759, + "grad_norm": 0.72265625, + "learning_rate": 0.00016429946726865136, + "loss": 0.9241, + "step": 18180 + }, + { + "epoch": 0.46683709905459775, + "grad_norm": 0.77734375, + "learning_rate": 0.00016429604820353754, + "loss": 0.8066, + "step": 18181 + }, + { + "epoch": 0.4668627762505196, + "grad_norm": 0.7734375, + "learning_rate": 0.00016429262901028753, + "loss": 0.944, + "step": 18182 + }, + { + "epoch": 0.46688845344644136, + "grad_norm": 0.77734375, + "learning_rate": 0.0001642892096889081, + "loss": 0.9775, + "step": 18183 + }, + { + "epoch": 0.4669141306423632, + "grad_norm": 0.80859375, + "learning_rate": 0.0001642857902394061, + "loss": 0.9306, + "step": 18184 + }, + { + "epoch": 0.46693980783828504, + "grad_norm": 0.7109375, + "learning_rate": 0.00016428237066178831, + "loss": 0.9415, + "step": 18185 + }, + { + "epoch": 0.4669654850342068, + "grad_norm": 0.72265625, + "learning_rate": 0.00016427895095606155, + "loss": 0.8781, + "step": 18186 + }, + { + "epoch": 0.46699116223012865, + "grad_norm": 0.7578125, + "learning_rate": 0.0001642755311222327, + "loss": 1.0455, + "step": 18187 + }, + { + "epoch": 0.4670168394260505, + "grad_norm": 0.8203125, + "learning_rate": 0.0001642721111603085, + "loss": 0.9122, + "step": 18188 + }, + { + "epoch": 0.46704251662197227, + "grad_norm": 0.765625, + "learning_rate": 0.00016426869107029576, + "loss": 0.9074, + "step": 18189 + }, + { + "epoch": 0.4670681938178941, + "grad_norm": 0.7578125, + "learning_rate": 0.00016426527085220134, + "loss": 0.8557, + "step": 18190 + }, + { + "epoch": 0.46709387101381594, + "grad_norm": 0.73828125, + "learning_rate": 0.00016426185050603205, + "loss": 0.8529, + "step": 18191 + }, + { + "epoch": 0.4671195482097378, + "grad_norm": 0.80078125, + "learning_rate": 0.00016425843003179468, + "loss": 0.9157, + "step": 18192 + }, + { + "epoch": 0.46714522540565956, + "grad_norm": 0.8046875, + "learning_rate": 0.0001642550094294961, + "loss": 0.9725, + "step": 18193 + }, + { + "epoch": 0.4671709026015814, + "grad_norm": 0.7265625, + "learning_rate": 0.00016425158869914306, + "loss": 0.9036, + "step": 18194 + }, + { + "epoch": 0.46719657979750323, + "grad_norm": 0.828125, + "learning_rate": 0.00016424816784074242, + "loss": 1.0405, + "step": 18195 + }, + { + "epoch": 0.467222256993425, + "grad_norm": 0.7578125, + "learning_rate": 0.00016424474685430095, + "loss": 0.8863, + "step": 18196 + }, + { + "epoch": 0.46724793418934685, + "grad_norm": 0.8046875, + "learning_rate": 0.00016424132573982552, + "loss": 0.8502, + "step": 18197 + }, + { + "epoch": 0.4672736113852687, + "grad_norm": 0.72265625, + "learning_rate": 0.00016423790449732295, + "loss": 0.7584, + "step": 18198 + }, + { + "epoch": 0.46729928858119046, + "grad_norm": 0.73046875, + "learning_rate": 0.00016423448312680004, + "loss": 0.8713, + "step": 18199 + }, + { + "epoch": 0.4673249657771123, + "grad_norm": 0.72265625, + "learning_rate": 0.0001642310616282636, + "loss": 0.8802, + "step": 18200 + }, + { + "epoch": 0.46735064297303414, + "grad_norm": 0.77734375, + "learning_rate": 0.00016422764000172043, + "loss": 0.815, + "step": 18201 + }, + { + "epoch": 0.46737632016895597, + "grad_norm": 0.8359375, + "learning_rate": 0.00016422421824717742, + "loss": 0.841, + "step": 18202 + }, + { + "epoch": 0.46740199736487775, + "grad_norm": 0.703125, + "learning_rate": 0.0001642207963646413, + "loss": 0.8054, + "step": 18203 + }, + { + "epoch": 0.4674276745607996, + "grad_norm": 0.7890625, + "learning_rate": 0.00016421737435411897, + "loss": 0.9657, + "step": 18204 + }, + { + "epoch": 0.4674533517567214, + "grad_norm": 0.765625, + "learning_rate": 0.0001642139522156172, + "loss": 0.9076, + "step": 18205 + }, + { + "epoch": 0.4674790289526432, + "grad_norm": 0.76953125, + "learning_rate": 0.0001642105299491428, + "loss": 0.9166, + "step": 18206 + }, + { + "epoch": 0.46750470614856504, + "grad_norm": 0.828125, + "learning_rate": 0.00016420710755470265, + "loss": 0.8625, + "step": 18207 + }, + { + "epoch": 0.4675303833444869, + "grad_norm": 0.7265625, + "learning_rate": 0.00016420368503230353, + "loss": 0.8904, + "step": 18208 + }, + { + "epoch": 0.46755606054040866, + "grad_norm": 0.74609375, + "learning_rate": 0.00016420026238195225, + "loss": 0.8691, + "step": 18209 + }, + { + "epoch": 0.4675817377363305, + "grad_norm": 0.8203125, + "learning_rate": 0.00016419683960365564, + "loss": 0.9276, + "step": 18210 + }, + { + "epoch": 0.46760741493225233, + "grad_norm": 0.7421875, + "learning_rate": 0.00016419341669742058, + "loss": 0.8331, + "step": 18211 + }, + { + "epoch": 0.46763309212817417, + "grad_norm": 0.76953125, + "learning_rate": 0.00016418999366325375, + "loss": 1.005, + "step": 18212 + }, + { + "epoch": 0.46765876932409595, + "grad_norm": 0.82421875, + "learning_rate": 0.00016418657050116213, + "loss": 0.9696, + "step": 18213 + }, + { + "epoch": 0.4676844465200178, + "grad_norm": 0.7734375, + "learning_rate": 0.00016418314721115243, + "loss": 1.0013, + "step": 18214 + }, + { + "epoch": 0.4677101237159396, + "grad_norm": 0.75, + "learning_rate": 0.00016417972379323157, + "loss": 0.7605, + "step": 18215 + }, + { + "epoch": 0.4677358009118614, + "grad_norm": 0.7109375, + "learning_rate": 0.0001641763002474063, + "loss": 0.8741, + "step": 18216 + }, + { + "epoch": 0.46776147810778324, + "grad_norm": 0.90234375, + "learning_rate": 0.0001641728765736834, + "loss": 1.0394, + "step": 18217 + }, + { + "epoch": 0.46778715530370507, + "grad_norm": 0.85546875, + "learning_rate": 0.00016416945277206982, + "loss": 0.7781, + "step": 18218 + }, + { + "epoch": 0.46781283249962685, + "grad_norm": 0.8046875, + "learning_rate": 0.0001641660288425723, + "loss": 0.8631, + "step": 18219 + }, + { + "epoch": 0.4678385096955487, + "grad_norm": 0.79296875, + "learning_rate": 0.0001641626047851977, + "loss": 0.8873, + "step": 18220 + }, + { + "epoch": 0.4678641868914705, + "grad_norm": 0.765625, + "learning_rate": 0.00016415918059995278, + "loss": 0.8804, + "step": 18221 + }, + { + "epoch": 0.46788986408739236, + "grad_norm": 0.7734375, + "learning_rate": 0.00016415575628684443, + "loss": 0.8301, + "step": 18222 + }, + { + "epoch": 0.46791554128331414, + "grad_norm": 0.7734375, + "learning_rate": 0.00016415233184587945, + "loss": 0.8839, + "step": 18223 + }, + { + "epoch": 0.467941218479236, + "grad_norm": 0.78125, + "learning_rate": 0.0001641489072770647, + "loss": 0.893, + "step": 18224 + }, + { + "epoch": 0.4679668956751578, + "grad_norm": 0.80078125, + "learning_rate": 0.0001641454825804069, + "loss": 0.9018, + "step": 18225 + }, + { + "epoch": 0.4679925728710796, + "grad_norm": 0.8125, + "learning_rate": 0.00016414205775591302, + "loss": 0.9606, + "step": 18226 + }, + { + "epoch": 0.46801825006700143, + "grad_norm": 0.72265625, + "learning_rate": 0.00016413863280358978, + "loss": 0.8307, + "step": 18227 + }, + { + "epoch": 0.46804392726292326, + "grad_norm": 0.7890625, + "learning_rate": 0.00016413520772344406, + "loss": 0.9598, + "step": 18228 + }, + { + "epoch": 0.46806960445884505, + "grad_norm": 0.7734375, + "learning_rate": 0.00016413178251548262, + "loss": 0.8839, + "step": 18229 + }, + { + "epoch": 0.4680952816547669, + "grad_norm": 0.87109375, + "learning_rate": 0.00016412835717971236, + "loss": 1.0321, + "step": 18230 + }, + { + "epoch": 0.4681209588506887, + "grad_norm": 0.8203125, + "learning_rate": 0.0001641249317161401, + "loss": 0.8081, + "step": 18231 + }, + { + "epoch": 0.46814663604661055, + "grad_norm": 0.75, + "learning_rate": 0.00016412150612477258, + "loss": 0.8285, + "step": 18232 + }, + { + "epoch": 0.46817231324253233, + "grad_norm": 0.8203125, + "learning_rate": 0.00016411808040561675, + "loss": 0.9497, + "step": 18233 + }, + { + "epoch": 0.46819799043845417, + "grad_norm": 0.74609375, + "learning_rate": 0.00016411465455867935, + "loss": 0.8211, + "step": 18234 + }, + { + "epoch": 0.468223667634376, + "grad_norm": 0.83203125, + "learning_rate": 0.00016411122858396723, + "loss": 1.0052, + "step": 18235 + }, + { + "epoch": 0.4682493448302978, + "grad_norm": 0.7734375, + "learning_rate": 0.00016410780248148725, + "loss": 0.9082, + "step": 18236 + }, + { + "epoch": 0.4682750220262196, + "grad_norm": 0.7734375, + "learning_rate": 0.00016410437625124618, + "loss": 0.8063, + "step": 18237 + }, + { + "epoch": 0.46830069922214146, + "grad_norm": 0.77734375, + "learning_rate": 0.0001641009498932509, + "loss": 0.8908, + "step": 18238 + }, + { + "epoch": 0.46832637641806324, + "grad_norm": 0.80859375, + "learning_rate": 0.0001640975234075082, + "loss": 0.8169, + "step": 18239 + }, + { + "epoch": 0.4683520536139851, + "grad_norm": 0.796875, + "learning_rate": 0.00016409409679402493, + "loss": 0.8806, + "step": 18240 + }, + { + "epoch": 0.4683777308099069, + "grad_norm": 0.7578125, + "learning_rate": 0.00016409067005280793, + "loss": 0.93, + "step": 18241 + }, + { + "epoch": 0.46840340800582875, + "grad_norm": 0.84375, + "learning_rate": 0.00016408724318386399, + "loss": 0.9285, + "step": 18242 + }, + { + "epoch": 0.46842908520175053, + "grad_norm": 0.70703125, + "learning_rate": 0.0001640838161872, + "loss": 0.9835, + "step": 18243 + }, + { + "epoch": 0.46845476239767236, + "grad_norm": 0.78515625, + "learning_rate": 0.00016408038906282272, + "loss": 0.956, + "step": 18244 + }, + { + "epoch": 0.4684804395935942, + "grad_norm": 0.73828125, + "learning_rate": 0.00016407696181073905, + "loss": 0.8501, + "step": 18245 + }, + { + "epoch": 0.468506116789516, + "grad_norm": 0.8125, + "learning_rate": 0.00016407353443095575, + "loss": 0.971, + "step": 18246 + }, + { + "epoch": 0.4685317939854378, + "grad_norm": 0.765625, + "learning_rate": 0.00016407010692347967, + "loss": 0.8253, + "step": 18247 + }, + { + "epoch": 0.46855747118135965, + "grad_norm": 0.82421875, + "learning_rate": 0.0001640666792883177, + "loss": 1.0228, + "step": 18248 + }, + { + "epoch": 0.46858314837728143, + "grad_norm": 0.8125, + "learning_rate": 0.00016406325152547657, + "loss": 0.9217, + "step": 18249 + }, + { + "epoch": 0.46860882557320327, + "grad_norm": 0.8203125, + "learning_rate": 0.0001640598236349632, + "loss": 0.9825, + "step": 18250 + }, + { + "epoch": 0.4686345027691251, + "grad_norm": 0.80078125, + "learning_rate": 0.00016405639561678438, + "loss": 0.8162, + "step": 18251 + }, + { + "epoch": 0.46866017996504694, + "grad_norm": 0.7109375, + "learning_rate": 0.00016405296747094695, + "loss": 0.8445, + "step": 18252 + }, + { + "epoch": 0.4686858571609687, + "grad_norm": 0.8046875, + "learning_rate": 0.00016404953919745775, + "loss": 1.0191, + "step": 18253 + }, + { + "epoch": 0.46871153435689056, + "grad_norm": 0.8046875, + "learning_rate": 0.00016404611079632358, + "loss": 0.9024, + "step": 18254 + }, + { + "epoch": 0.4687372115528124, + "grad_norm": 0.75, + "learning_rate": 0.00016404268226755133, + "loss": 0.8574, + "step": 18255 + }, + { + "epoch": 0.4687628887487342, + "grad_norm": 0.78125, + "learning_rate": 0.00016403925361114778, + "loss": 0.9753, + "step": 18256 + }, + { + "epoch": 0.468788565944656, + "grad_norm": 0.7890625, + "learning_rate": 0.00016403582482711978, + "loss": 0.8604, + "step": 18257 + }, + { + "epoch": 0.46881424314057785, + "grad_norm": 0.8046875, + "learning_rate": 0.0001640323959154742, + "loss": 0.8164, + "step": 18258 + }, + { + "epoch": 0.4688399203364996, + "grad_norm": 0.8203125, + "learning_rate": 0.0001640289668762178, + "loss": 0.778, + "step": 18259 + }, + { + "epoch": 0.46886559753242146, + "grad_norm": 0.80078125, + "learning_rate": 0.0001640255377093575, + "loss": 0.961, + "step": 18260 + }, + { + "epoch": 0.4688912747283433, + "grad_norm": 0.73046875, + "learning_rate": 0.00016402210841490002, + "loss": 0.8869, + "step": 18261 + }, + { + "epoch": 0.46891695192426514, + "grad_norm": 0.7734375, + "learning_rate": 0.0001640186789928523, + "loss": 0.9184, + "step": 18262 + }, + { + "epoch": 0.4689426291201869, + "grad_norm": 0.8203125, + "learning_rate": 0.00016401524944322112, + "loss": 0.8196, + "step": 18263 + }, + { + "epoch": 0.46896830631610875, + "grad_norm": 0.78515625, + "learning_rate": 0.00016401181976601333, + "loss": 0.8737, + "step": 18264 + }, + { + "epoch": 0.4689939835120306, + "grad_norm": 0.8828125, + "learning_rate": 0.00016400838996123577, + "loss": 1.0503, + "step": 18265 + }, + { + "epoch": 0.46901966070795237, + "grad_norm": 0.71484375, + "learning_rate": 0.00016400496002889529, + "loss": 0.9362, + "step": 18266 + }, + { + "epoch": 0.4690453379038742, + "grad_norm": 0.8046875, + "learning_rate": 0.00016400152996899866, + "loss": 0.9589, + "step": 18267 + }, + { + "epoch": 0.46907101509979604, + "grad_norm": 0.8203125, + "learning_rate": 0.0001639980997815528, + "loss": 1.0345, + "step": 18268 + }, + { + "epoch": 0.4690966922957178, + "grad_norm": 0.78125, + "learning_rate": 0.00016399466946656453, + "loss": 1.004, + "step": 18269 + }, + { + "epoch": 0.46912236949163966, + "grad_norm": 0.828125, + "learning_rate": 0.0001639912390240406, + "loss": 0.935, + "step": 18270 + }, + { + "epoch": 0.4691480466875615, + "grad_norm": 0.8203125, + "learning_rate": 0.00016398780845398796, + "loss": 1.0321, + "step": 18271 + }, + { + "epoch": 0.46917372388348333, + "grad_norm": 0.74609375, + "learning_rate": 0.0001639843777564134, + "loss": 0.8497, + "step": 18272 + }, + { + "epoch": 0.4691994010794051, + "grad_norm": 0.7578125, + "learning_rate": 0.00016398094693132373, + "loss": 0.8701, + "step": 18273 + }, + { + "epoch": 0.46922507827532695, + "grad_norm": 0.73828125, + "learning_rate": 0.0001639775159787258, + "loss": 0.9171, + "step": 18274 + }, + { + "epoch": 0.4692507554712488, + "grad_norm": 0.86328125, + "learning_rate": 0.00016397408489862649, + "loss": 1.0069, + "step": 18275 + }, + { + "epoch": 0.46927643266717056, + "grad_norm": 0.76953125, + "learning_rate": 0.00016397065369103258, + "loss": 0.9305, + "step": 18276 + }, + { + "epoch": 0.4693021098630924, + "grad_norm": 0.76171875, + "learning_rate": 0.00016396722235595098, + "loss": 0.8711, + "step": 18277 + }, + { + "epoch": 0.46932778705901423, + "grad_norm": 0.7578125, + "learning_rate": 0.00016396379089338845, + "loss": 0.8361, + "step": 18278 + }, + { + "epoch": 0.469353464254936, + "grad_norm": 0.73828125, + "learning_rate": 0.00016396035930335188, + "loss": 0.8897, + "step": 18279 + }, + { + "epoch": 0.46937914145085785, + "grad_norm": 0.77734375, + "learning_rate": 0.0001639569275858481, + "loss": 0.8794, + "step": 18280 + }, + { + "epoch": 0.4694048186467797, + "grad_norm": 0.74609375, + "learning_rate": 0.0001639534957408839, + "loss": 0.999, + "step": 18281 + }, + { + "epoch": 0.4694304958427015, + "grad_norm": 0.75, + "learning_rate": 0.00016395006376846618, + "loss": 0.7923, + "step": 18282 + }, + { + "epoch": 0.4694561730386233, + "grad_norm": 0.734375, + "learning_rate": 0.00016394663166860175, + "loss": 0.9148, + "step": 18283 + }, + { + "epoch": 0.46948185023454514, + "grad_norm": 0.75390625, + "learning_rate": 0.0001639431994412975, + "loss": 0.9605, + "step": 18284 + }, + { + "epoch": 0.469507527430467, + "grad_norm": 0.7734375, + "learning_rate": 0.0001639397670865602, + "loss": 0.8864, + "step": 18285 + }, + { + "epoch": 0.46953320462638876, + "grad_norm": 0.87109375, + "learning_rate": 0.0001639363346043967, + "loss": 0.9418, + "step": 18286 + }, + { + "epoch": 0.4695588818223106, + "grad_norm": 0.76953125, + "learning_rate": 0.00016393290199481387, + "loss": 0.9065, + "step": 18287 + }, + { + "epoch": 0.46958455901823243, + "grad_norm": 0.71875, + "learning_rate": 0.00016392946925781855, + "loss": 0.7435, + "step": 18288 + }, + { + "epoch": 0.4696102362141542, + "grad_norm": 0.734375, + "learning_rate": 0.0001639260363934176, + "loss": 0.9289, + "step": 18289 + }, + { + "epoch": 0.46963591341007604, + "grad_norm": 0.76171875, + "learning_rate": 0.00016392260340161778, + "loss": 0.846, + "step": 18290 + }, + { + "epoch": 0.4696615906059979, + "grad_norm": 0.76171875, + "learning_rate": 0.00016391917028242602, + "loss": 1.009, + "step": 18291 + }, + { + "epoch": 0.4696872678019197, + "grad_norm": 0.79296875, + "learning_rate": 0.00016391573703584912, + "loss": 0.8968, + "step": 18292 + }, + { + "epoch": 0.4697129449978415, + "grad_norm": 0.796875, + "learning_rate": 0.00016391230366189394, + "loss": 0.8347, + "step": 18293 + }, + { + "epoch": 0.46973862219376333, + "grad_norm": 0.75390625, + "learning_rate": 0.0001639088701605673, + "loss": 0.8078, + "step": 18294 + }, + { + "epoch": 0.46976429938968517, + "grad_norm": 0.828125, + "learning_rate": 0.00016390543653187605, + "loss": 0.9267, + "step": 18295 + }, + { + "epoch": 0.46978997658560695, + "grad_norm": 0.75, + "learning_rate": 0.00016390200277582704, + "loss": 0.9449, + "step": 18296 + }, + { + "epoch": 0.4698156537815288, + "grad_norm": 0.78125, + "learning_rate": 0.0001638985688924271, + "loss": 0.9881, + "step": 18297 + }, + { + "epoch": 0.4698413309774506, + "grad_norm": 0.7578125, + "learning_rate": 0.00016389513488168314, + "loss": 0.8673, + "step": 18298 + }, + { + "epoch": 0.4698670081733724, + "grad_norm": 0.79296875, + "learning_rate": 0.0001638917007436019, + "loss": 0.9854, + "step": 18299 + }, + { + "epoch": 0.46989268536929424, + "grad_norm": 0.76953125, + "learning_rate": 0.00016388826647819026, + "loss": 0.8956, + "step": 18300 + }, + { + "epoch": 0.4699183625652161, + "grad_norm": 0.6953125, + "learning_rate": 0.0001638848320854551, + "loss": 0.906, + "step": 18301 + }, + { + "epoch": 0.4699440397611379, + "grad_norm": 0.91015625, + "learning_rate": 0.00016388139756540323, + "loss": 0.8845, + "step": 18302 + }, + { + "epoch": 0.4699697169570597, + "grad_norm": 0.7421875, + "learning_rate": 0.00016387796291804152, + "loss": 0.8985, + "step": 18303 + }, + { + "epoch": 0.4699953941529815, + "grad_norm": 0.76171875, + "learning_rate": 0.0001638745281433768, + "loss": 0.9938, + "step": 18304 + }, + { + "epoch": 0.47002107134890336, + "grad_norm": 0.80859375, + "learning_rate": 0.0001638710932414159, + "loss": 0.9424, + "step": 18305 + }, + { + "epoch": 0.47004674854482514, + "grad_norm": 0.80859375, + "learning_rate": 0.0001638676582121657, + "loss": 1.0574, + "step": 18306 + }, + { + "epoch": 0.470072425740747, + "grad_norm": 0.80078125, + "learning_rate": 0.000163864223055633, + "loss": 0.8722, + "step": 18307 + }, + { + "epoch": 0.4700981029366688, + "grad_norm": 0.7421875, + "learning_rate": 0.00016386078777182468, + "loss": 0.8121, + "step": 18308 + }, + { + "epoch": 0.4701237801325906, + "grad_norm": 0.76953125, + "learning_rate": 0.00016385735236074757, + "loss": 0.9737, + "step": 18309 + }, + { + "epoch": 0.47014945732851243, + "grad_norm": 0.828125, + "learning_rate": 0.00016385391682240854, + "loss": 0.9401, + "step": 18310 + }, + { + "epoch": 0.47017513452443427, + "grad_norm": 0.7890625, + "learning_rate": 0.0001638504811568144, + "loss": 0.9305, + "step": 18311 + }, + { + "epoch": 0.4702008117203561, + "grad_norm": 0.81640625, + "learning_rate": 0.00016384704536397205, + "loss": 0.792, + "step": 18312 + }, + { + "epoch": 0.4702264889162779, + "grad_norm": 0.79296875, + "learning_rate": 0.0001638436094438883, + "loss": 0.9786, + "step": 18313 + }, + { + "epoch": 0.4702521661121997, + "grad_norm": 0.74609375, + "learning_rate": 0.00016384017339657, + "loss": 0.9694, + "step": 18314 + }, + { + "epoch": 0.47027784330812156, + "grad_norm": 0.796875, + "learning_rate": 0.00016383673722202398, + "loss": 0.9182, + "step": 18315 + }, + { + "epoch": 0.47030352050404334, + "grad_norm": 0.8671875, + "learning_rate": 0.00016383330092025714, + "loss": 0.9574, + "step": 18316 + }, + { + "epoch": 0.4703291976999652, + "grad_norm": 0.7734375, + "learning_rate": 0.00016382986449127627, + "loss": 0.9159, + "step": 18317 + }, + { + "epoch": 0.470354874895887, + "grad_norm": 0.796875, + "learning_rate": 0.0001638264279350883, + "loss": 0.8755, + "step": 18318 + }, + { + "epoch": 0.4703805520918088, + "grad_norm": 0.76171875, + "learning_rate": 0.00016382299125169996, + "loss": 0.9326, + "step": 18319 + }, + { + "epoch": 0.4704062292877306, + "grad_norm": 0.734375, + "learning_rate": 0.0001638195544411182, + "loss": 1.0579, + "step": 18320 + }, + { + "epoch": 0.47043190648365246, + "grad_norm": 0.73828125, + "learning_rate": 0.00016381611750334983, + "loss": 0.879, + "step": 18321 + }, + { + "epoch": 0.4704575836795743, + "grad_norm": 0.7734375, + "learning_rate": 0.00016381268043840166, + "loss": 0.9924, + "step": 18322 + }, + { + "epoch": 0.4704832608754961, + "grad_norm": 0.7421875, + "learning_rate": 0.00016380924324628065, + "loss": 0.8795, + "step": 18323 + }, + { + "epoch": 0.4705089380714179, + "grad_norm": 0.734375, + "learning_rate": 0.00016380580592699354, + "loss": 0.8242, + "step": 18324 + }, + { + "epoch": 0.47053461526733975, + "grad_norm": 0.76171875, + "learning_rate": 0.00016380236848054724, + "loss": 0.88, + "step": 18325 + }, + { + "epoch": 0.47056029246326153, + "grad_norm": 0.7890625, + "learning_rate": 0.00016379893090694858, + "loss": 0.9188, + "step": 18326 + }, + { + "epoch": 0.47058596965918337, + "grad_norm": 0.765625, + "learning_rate": 0.00016379549320620437, + "loss": 0.8856, + "step": 18327 + }, + { + "epoch": 0.4706116468551052, + "grad_norm": 0.8125, + "learning_rate": 0.00016379205537832156, + "loss": 0.9191, + "step": 18328 + }, + { + "epoch": 0.470637324051027, + "grad_norm": 0.7578125, + "learning_rate": 0.00016378861742330692, + "loss": 0.8685, + "step": 18329 + }, + { + "epoch": 0.4706630012469488, + "grad_norm": 0.8046875, + "learning_rate": 0.00016378517934116733, + "loss": 0.9791, + "step": 18330 + }, + { + "epoch": 0.47068867844287066, + "grad_norm": 0.72265625, + "learning_rate": 0.00016378174113190966, + "loss": 0.7475, + "step": 18331 + }, + { + "epoch": 0.4707143556387925, + "grad_norm": 0.73828125, + "learning_rate": 0.0001637783027955407, + "loss": 0.7995, + "step": 18332 + }, + { + "epoch": 0.4707400328347143, + "grad_norm": 0.76953125, + "learning_rate": 0.0001637748643320674, + "loss": 0.9923, + "step": 18333 + }, + { + "epoch": 0.4707657100306361, + "grad_norm": 0.765625, + "learning_rate": 0.00016377142574149653, + "loss": 0.7963, + "step": 18334 + }, + { + "epoch": 0.47079138722655794, + "grad_norm": 0.78125, + "learning_rate": 0.00016376798702383494, + "loss": 0.8337, + "step": 18335 + }, + { + "epoch": 0.4708170644224797, + "grad_norm": 0.80078125, + "learning_rate": 0.00016376454817908955, + "loss": 0.8614, + "step": 18336 + }, + { + "epoch": 0.47084274161840156, + "grad_norm": 0.79296875, + "learning_rate": 0.00016376110920726715, + "loss": 0.8244, + "step": 18337 + }, + { + "epoch": 0.4708684188143234, + "grad_norm": 0.76171875, + "learning_rate": 0.00016375767010837463, + "loss": 0.8188, + "step": 18338 + }, + { + "epoch": 0.4708940960102452, + "grad_norm": 0.765625, + "learning_rate": 0.00016375423088241885, + "loss": 0.8874, + "step": 18339 + }, + { + "epoch": 0.470919773206167, + "grad_norm": 0.80859375, + "learning_rate": 0.00016375079152940663, + "loss": 0.989, + "step": 18340 + }, + { + "epoch": 0.47094545040208885, + "grad_norm": 0.7421875, + "learning_rate": 0.00016374735204934484, + "loss": 0.9742, + "step": 18341 + }, + { + "epoch": 0.4709711275980107, + "grad_norm": 0.7265625, + "learning_rate": 0.00016374391244224033, + "loss": 0.8973, + "step": 18342 + }, + { + "epoch": 0.47099680479393247, + "grad_norm": 0.7421875, + "learning_rate": 0.00016374047270809996, + "loss": 0.8426, + "step": 18343 + }, + { + "epoch": 0.4710224819898543, + "grad_norm": 0.76171875, + "learning_rate": 0.00016373703284693061, + "loss": 0.7794, + "step": 18344 + }, + { + "epoch": 0.47104815918577614, + "grad_norm": 0.77734375, + "learning_rate": 0.00016373359285873907, + "loss": 0.8141, + "step": 18345 + }, + { + "epoch": 0.4710738363816979, + "grad_norm": 0.8359375, + "learning_rate": 0.00016373015274353227, + "loss": 1.053, + "step": 18346 + }, + { + "epoch": 0.47109951357761976, + "grad_norm": 0.77734375, + "learning_rate": 0.000163726712501317, + "loss": 0.9716, + "step": 18347 + }, + { + "epoch": 0.4711251907735416, + "grad_norm": 0.78125, + "learning_rate": 0.0001637232721321002, + "loss": 0.9685, + "step": 18348 + }, + { + "epoch": 0.47115086796946337, + "grad_norm": 0.734375, + "learning_rate": 0.00016371983163588865, + "loss": 0.891, + "step": 18349 + }, + { + "epoch": 0.4711765451653852, + "grad_norm": 0.8046875, + "learning_rate": 0.00016371639101268922, + "loss": 0.8149, + "step": 18350 + }, + { + "epoch": 0.47120222236130704, + "grad_norm": 0.79296875, + "learning_rate": 0.00016371295026250876, + "loss": 0.8886, + "step": 18351 + }, + { + "epoch": 0.4712278995572289, + "grad_norm": 0.8046875, + "learning_rate": 0.00016370950938535418, + "loss": 0.815, + "step": 18352 + }, + { + "epoch": 0.47125357675315066, + "grad_norm": 0.796875, + "learning_rate": 0.0001637060683812323, + "loss": 1.0043, + "step": 18353 + }, + { + "epoch": 0.4712792539490725, + "grad_norm": 0.71875, + "learning_rate": 0.00016370262725014999, + "loss": 0.8389, + "step": 18354 + }, + { + "epoch": 0.47130493114499433, + "grad_norm": 0.6875, + "learning_rate": 0.00016369918599211408, + "loss": 0.8642, + "step": 18355 + }, + { + "epoch": 0.4713306083409161, + "grad_norm": 0.66796875, + "learning_rate": 0.00016369574460713143, + "loss": 0.9084, + "step": 18356 + }, + { + "epoch": 0.47135628553683795, + "grad_norm": 0.72265625, + "learning_rate": 0.00016369230309520893, + "loss": 0.8801, + "step": 18357 + }, + { + "epoch": 0.4713819627327598, + "grad_norm": 0.7421875, + "learning_rate": 0.00016368886145635343, + "loss": 0.9022, + "step": 18358 + }, + { + "epoch": 0.47140763992868157, + "grad_norm": 0.76171875, + "learning_rate": 0.0001636854196905718, + "loss": 0.954, + "step": 18359 + }, + { + "epoch": 0.4714333171246034, + "grad_norm": 0.80859375, + "learning_rate": 0.00016368197779787082, + "loss": 0.9002, + "step": 18360 + }, + { + "epoch": 0.47145899432052524, + "grad_norm": 0.765625, + "learning_rate": 0.00016367853577825748, + "loss": 0.9673, + "step": 18361 + }, + { + "epoch": 0.4714846715164471, + "grad_norm": 0.765625, + "learning_rate": 0.00016367509363173852, + "loss": 0.9562, + "step": 18362 + }, + { + "epoch": 0.47151034871236885, + "grad_norm": 0.7578125, + "learning_rate": 0.00016367165135832088, + "loss": 0.9532, + "step": 18363 + }, + { + "epoch": 0.4715360259082907, + "grad_norm": 0.8359375, + "learning_rate": 0.00016366820895801138, + "loss": 0.864, + "step": 18364 + }, + { + "epoch": 0.4715617031042125, + "grad_norm": 0.74609375, + "learning_rate": 0.0001636647664308169, + "loss": 0.9061, + "step": 18365 + }, + { + "epoch": 0.4715873803001343, + "grad_norm": 0.7734375, + "learning_rate": 0.00016366132377674427, + "loss": 0.9073, + "step": 18366 + }, + { + "epoch": 0.47161305749605614, + "grad_norm": 0.8203125, + "learning_rate": 0.00016365788099580037, + "loss": 0.9307, + "step": 18367 + }, + { + "epoch": 0.471638734691978, + "grad_norm": 0.9375, + "learning_rate": 0.00016365443808799207, + "loss": 0.9507, + "step": 18368 + }, + { + "epoch": 0.47166441188789976, + "grad_norm": 0.765625, + "learning_rate": 0.00016365099505332627, + "loss": 0.9623, + "step": 18369 + }, + { + "epoch": 0.4716900890838216, + "grad_norm": 0.6953125, + "learning_rate": 0.00016364755189180973, + "loss": 0.9136, + "step": 18370 + }, + { + "epoch": 0.47171576627974343, + "grad_norm": 0.7421875, + "learning_rate": 0.00016364410860344937, + "loss": 0.9554, + "step": 18371 + }, + { + "epoch": 0.47174144347566527, + "grad_norm": 0.671875, + "learning_rate": 0.0001636406651882521, + "loss": 0.7272, + "step": 18372 + }, + { + "epoch": 0.47176712067158705, + "grad_norm": 0.80078125, + "learning_rate": 0.00016363722164622468, + "loss": 0.8375, + "step": 18373 + }, + { + "epoch": 0.4717927978675089, + "grad_norm": 0.83203125, + "learning_rate": 0.00016363377797737406, + "loss": 0.8513, + "step": 18374 + }, + { + "epoch": 0.4718184750634307, + "grad_norm": 0.74609375, + "learning_rate": 0.00016363033418170702, + "loss": 0.8903, + "step": 18375 + }, + { + "epoch": 0.4718441522593525, + "grad_norm": 0.86328125, + "learning_rate": 0.00016362689025923053, + "loss": 0.8034, + "step": 18376 + }, + { + "epoch": 0.47186982945527434, + "grad_norm": 0.74609375, + "learning_rate": 0.00016362344620995137, + "loss": 0.8687, + "step": 18377 + }, + { + "epoch": 0.4718955066511962, + "grad_norm": 0.78125, + "learning_rate": 0.00016362000203387644, + "loss": 0.7982, + "step": 18378 + }, + { + "epoch": 0.47192118384711795, + "grad_norm": 0.7578125, + "learning_rate": 0.00016361655773101258, + "loss": 0.8163, + "step": 18379 + }, + { + "epoch": 0.4719468610430398, + "grad_norm": 0.828125, + "learning_rate": 0.00016361311330136667, + "loss": 0.9506, + "step": 18380 + }, + { + "epoch": 0.4719725382389616, + "grad_norm": 0.8515625, + "learning_rate": 0.00016360966874494557, + "loss": 0.9539, + "step": 18381 + }, + { + "epoch": 0.4719982154348834, + "grad_norm": 0.73046875, + "learning_rate": 0.00016360622406175613, + "loss": 0.856, + "step": 18382 + }, + { + "epoch": 0.47202389263080524, + "grad_norm": 0.734375, + "learning_rate": 0.00016360277925180525, + "loss": 0.9714, + "step": 18383 + }, + { + "epoch": 0.4720495698267271, + "grad_norm": 0.71484375, + "learning_rate": 0.0001635993343150998, + "loss": 0.8417, + "step": 18384 + }, + { + "epoch": 0.4720752470226489, + "grad_norm": 0.8125, + "learning_rate": 0.00016359588925164657, + "loss": 0.8751, + "step": 18385 + }, + { + "epoch": 0.4721009242185707, + "grad_norm": 0.7578125, + "learning_rate": 0.00016359244406145251, + "loss": 0.8515, + "step": 18386 + }, + { + "epoch": 0.47212660141449253, + "grad_norm": 0.78515625, + "learning_rate": 0.00016358899874452444, + "loss": 0.7974, + "step": 18387 + }, + { + "epoch": 0.47215227861041437, + "grad_norm": 0.81640625, + "learning_rate": 0.00016358555330086926, + "loss": 1.008, + "step": 18388 + }, + { + "epoch": 0.47217795580633615, + "grad_norm": 1.0390625, + "learning_rate": 0.0001635821077304938, + "loss": 0.8719, + "step": 18389 + }, + { + "epoch": 0.472203633002258, + "grad_norm": 0.80859375, + "learning_rate": 0.00016357866203340496, + "loss": 0.9336, + "step": 18390 + }, + { + "epoch": 0.4722293101981798, + "grad_norm": 0.8125, + "learning_rate": 0.00016357521620960957, + "loss": 0.957, + "step": 18391 + }, + { + "epoch": 0.4722549873941016, + "grad_norm": 0.76953125, + "learning_rate": 0.0001635717702591145, + "loss": 0.9285, + "step": 18392 + }, + { + "epoch": 0.47228066459002344, + "grad_norm": 0.7578125, + "learning_rate": 0.00016356832418192667, + "loss": 1.0198, + "step": 18393 + }, + { + "epoch": 0.47230634178594527, + "grad_norm": 0.703125, + "learning_rate": 0.00016356487797805288, + "loss": 0.7815, + "step": 18394 + }, + { + "epoch": 0.4723320189818671, + "grad_norm": 0.79296875, + "learning_rate": 0.00016356143164750005, + "loss": 0.9205, + "step": 18395 + }, + { + "epoch": 0.4723576961777889, + "grad_norm": 0.91796875, + "learning_rate": 0.000163557985190275, + "loss": 0.8676, + "step": 18396 + }, + { + "epoch": 0.4723833733737107, + "grad_norm": 0.73828125, + "learning_rate": 0.00016355453860638467, + "loss": 0.8533, + "step": 18397 + }, + { + "epoch": 0.47240905056963256, + "grad_norm": 0.80859375, + "learning_rate": 0.0001635510918958359, + "loss": 0.8732, + "step": 18398 + }, + { + "epoch": 0.47243472776555434, + "grad_norm": 0.8203125, + "learning_rate": 0.0001635476450586355, + "loss": 0.8159, + "step": 18399 + }, + { + "epoch": 0.4724604049614762, + "grad_norm": 0.796875, + "learning_rate": 0.00016354419809479039, + "loss": 1.0039, + "step": 18400 + }, + { + "epoch": 0.472486082157398, + "grad_norm": 0.8125, + "learning_rate": 0.00016354075100430743, + "loss": 0.8834, + "step": 18401 + }, + { + "epoch": 0.4725117593533198, + "grad_norm": 0.79296875, + "learning_rate": 0.00016353730378719348, + "loss": 0.9209, + "step": 18402 + }, + { + "epoch": 0.47253743654924163, + "grad_norm": 0.734375, + "learning_rate": 0.00016353385644345546, + "loss": 0.9488, + "step": 18403 + }, + { + "epoch": 0.47256311374516347, + "grad_norm": 0.73828125, + "learning_rate": 0.00016353040897310018, + "loss": 0.6785, + "step": 18404 + }, + { + "epoch": 0.4725887909410853, + "grad_norm": 0.8671875, + "learning_rate": 0.00016352696137613454, + "loss": 1.0356, + "step": 18405 + }, + { + "epoch": 0.4726144681370071, + "grad_norm": 0.71484375, + "learning_rate": 0.00016352351365256541, + "loss": 0.7941, + "step": 18406 + }, + { + "epoch": 0.4726401453329289, + "grad_norm": 0.79296875, + "learning_rate": 0.00016352006580239964, + "loss": 0.9747, + "step": 18407 + }, + { + "epoch": 0.47266582252885075, + "grad_norm": 0.7265625, + "learning_rate": 0.00016351661782564416, + "loss": 0.9663, + "step": 18408 + }, + { + "epoch": 0.47269149972477253, + "grad_norm": 0.75390625, + "learning_rate": 0.00016351316972230574, + "loss": 0.9698, + "step": 18409 + }, + { + "epoch": 0.47271717692069437, + "grad_norm": 0.71875, + "learning_rate": 0.00016350972149239135, + "loss": 0.8627, + "step": 18410 + }, + { + "epoch": 0.4727428541166162, + "grad_norm": 0.828125, + "learning_rate": 0.0001635062731359078, + "loss": 0.936, + "step": 18411 + }, + { + "epoch": 0.472768531312538, + "grad_norm": 0.765625, + "learning_rate": 0.000163502824652862, + "loss": 0.9552, + "step": 18412 + }, + { + "epoch": 0.4727942085084598, + "grad_norm": 0.75, + "learning_rate": 0.0001634993760432608, + "loss": 0.8697, + "step": 18413 + }, + { + "epoch": 0.47281988570438166, + "grad_norm": 0.76953125, + "learning_rate": 0.00016349592730711108, + "loss": 0.8773, + "step": 18414 + }, + { + "epoch": 0.4728455629003035, + "grad_norm": 0.75, + "learning_rate": 0.00016349247844441972, + "loss": 0.8712, + "step": 18415 + }, + { + "epoch": 0.4728712400962253, + "grad_norm": 0.80859375, + "learning_rate": 0.00016348902945519355, + "loss": 1.0684, + "step": 18416 + }, + { + "epoch": 0.4728969172921471, + "grad_norm": 0.78125, + "learning_rate": 0.0001634855803394395, + "loss": 0.9195, + "step": 18417 + }, + { + "epoch": 0.47292259448806895, + "grad_norm": 0.70703125, + "learning_rate": 0.00016348213109716443, + "loss": 0.8802, + "step": 18418 + }, + { + "epoch": 0.47294827168399073, + "grad_norm": 0.84765625, + "learning_rate": 0.00016347868172837522, + "loss": 0.9551, + "step": 18419 + }, + { + "epoch": 0.47297394887991256, + "grad_norm": 0.7890625, + "learning_rate": 0.0001634752322330787, + "loss": 1.1448, + "step": 18420 + }, + { + "epoch": 0.4729996260758344, + "grad_norm": 0.859375, + "learning_rate": 0.0001634717826112818, + "loss": 1.0474, + "step": 18421 + }, + { + "epoch": 0.4730253032717562, + "grad_norm": 0.7734375, + "learning_rate": 0.00016346833286299135, + "loss": 0.9556, + "step": 18422 + }, + { + "epoch": 0.473050980467678, + "grad_norm": 0.8046875, + "learning_rate": 0.00016346488298821428, + "loss": 0.8494, + "step": 18423 + }, + { + "epoch": 0.47307665766359985, + "grad_norm": 0.65625, + "learning_rate": 0.0001634614329869574, + "loss": 0.9006, + "step": 18424 + }, + { + "epoch": 0.4731023348595217, + "grad_norm": 0.8046875, + "learning_rate": 0.00016345798285922762, + "loss": 0.9257, + "step": 18425 + }, + { + "epoch": 0.47312801205544347, + "grad_norm": 0.765625, + "learning_rate": 0.00016345453260503183, + "loss": 0.9443, + "step": 18426 + }, + { + "epoch": 0.4731536892513653, + "grad_norm": 0.875, + "learning_rate": 0.00016345108222437686, + "loss": 0.9738, + "step": 18427 + }, + { + "epoch": 0.47317936644728714, + "grad_norm": 0.734375, + "learning_rate": 0.00016344763171726964, + "loss": 0.8081, + "step": 18428 + }, + { + "epoch": 0.4732050436432089, + "grad_norm": 0.74609375, + "learning_rate": 0.000163444181083717, + "loss": 0.9137, + "step": 18429 + }, + { + "epoch": 0.47323072083913076, + "grad_norm": 0.75390625, + "learning_rate": 0.00016344073032372586, + "loss": 0.7465, + "step": 18430 + }, + { + "epoch": 0.4732563980350526, + "grad_norm": 0.80078125, + "learning_rate": 0.00016343727943730308, + "loss": 0.9373, + "step": 18431 + }, + { + "epoch": 0.4732820752309744, + "grad_norm": 0.76171875, + "learning_rate": 0.00016343382842445552, + "loss": 0.7855, + "step": 18432 + }, + { + "epoch": 0.4733077524268962, + "grad_norm": 0.98828125, + "learning_rate": 0.00016343037728519004, + "loss": 0.8375, + "step": 18433 + }, + { + "epoch": 0.47333342962281805, + "grad_norm": 0.72265625, + "learning_rate": 0.00016342692601951358, + "loss": 0.9196, + "step": 18434 + }, + { + "epoch": 0.4733591068187399, + "grad_norm": 0.8203125, + "learning_rate": 0.00016342347462743297, + "loss": 0.7691, + "step": 18435 + }, + { + "epoch": 0.47338478401466166, + "grad_norm": 0.8515625, + "learning_rate": 0.00016342002310895516, + "loss": 0.8513, + "step": 18436 + }, + { + "epoch": 0.4734104612105835, + "grad_norm": 0.8671875, + "learning_rate": 0.0001634165714640869, + "loss": 1.0337, + "step": 18437 + }, + { + "epoch": 0.47343613840650534, + "grad_norm": 0.78125, + "learning_rate": 0.00016341311969283518, + "loss": 0.8802, + "step": 18438 + }, + { + "epoch": 0.4734618156024271, + "grad_norm": 0.73828125, + "learning_rate": 0.00016340966779520682, + "loss": 0.9143, + "step": 18439 + }, + { + "epoch": 0.47348749279834895, + "grad_norm": 0.76171875, + "learning_rate": 0.0001634062157712087, + "loss": 0.937, + "step": 18440 + }, + { + "epoch": 0.4735131699942708, + "grad_norm": 0.77734375, + "learning_rate": 0.00016340276362084775, + "loss": 1.019, + "step": 18441 + }, + { + "epoch": 0.47353884719019257, + "grad_norm": 0.734375, + "learning_rate": 0.00016339931134413081, + "loss": 0.8292, + "step": 18442 + }, + { + "epoch": 0.4735645243861144, + "grad_norm": 0.77734375, + "learning_rate": 0.00016339585894106483, + "loss": 0.8067, + "step": 18443 + }, + { + "epoch": 0.47359020158203624, + "grad_norm": 0.7421875, + "learning_rate": 0.00016339240641165655, + "loss": 0.9974, + "step": 18444 + }, + { + "epoch": 0.4736158787779581, + "grad_norm": 0.86328125, + "learning_rate": 0.00016338895375591297, + "loss": 0.9337, + "step": 18445 + }, + { + "epoch": 0.47364155597387986, + "grad_norm": 0.7734375, + "learning_rate": 0.0001633855009738409, + "loss": 0.8919, + "step": 18446 + }, + { + "epoch": 0.4736672331698017, + "grad_norm": 0.6484375, + "learning_rate": 0.00016338204806544727, + "loss": 0.7293, + "step": 18447 + }, + { + "epoch": 0.47369291036572353, + "grad_norm": 0.796875, + "learning_rate": 0.00016337859503073892, + "loss": 0.987, + "step": 18448 + }, + { + "epoch": 0.4737185875616453, + "grad_norm": 0.78515625, + "learning_rate": 0.00016337514186972284, + "loss": 0.9529, + "step": 18449 + }, + { + "epoch": 0.47374426475756715, + "grad_norm": 0.86328125, + "learning_rate": 0.00016337168858240574, + "loss": 1.0208, + "step": 18450 + }, + { + "epoch": 0.473769941953489, + "grad_norm": 0.734375, + "learning_rate": 0.00016336823516879462, + "loss": 0.8235, + "step": 18451 + }, + { + "epoch": 0.47379561914941076, + "grad_norm": 0.69921875, + "learning_rate": 0.00016336478162889634, + "loss": 0.8663, + "step": 18452 + }, + { + "epoch": 0.4738212963453326, + "grad_norm": 0.796875, + "learning_rate": 0.00016336132796271774, + "loss": 0.9704, + "step": 18453 + }, + { + "epoch": 0.47384697354125443, + "grad_norm": 0.83984375, + "learning_rate": 0.00016335787417026576, + "loss": 0.9438, + "step": 18454 + }, + { + "epoch": 0.47387265073717627, + "grad_norm": 0.73046875, + "learning_rate": 0.00016335442025154726, + "loss": 0.9069, + "step": 18455 + }, + { + "epoch": 0.47389832793309805, + "grad_norm": 0.7734375, + "learning_rate": 0.00016335096620656914, + "loss": 0.9565, + "step": 18456 + }, + { + "epoch": 0.4739240051290199, + "grad_norm": 0.8515625, + "learning_rate": 0.00016334751203533822, + "loss": 0.8939, + "step": 18457 + }, + { + "epoch": 0.4739496823249417, + "grad_norm": 0.79296875, + "learning_rate": 0.00016334405773786145, + "loss": 0.9171, + "step": 18458 + }, + { + "epoch": 0.4739753595208635, + "grad_norm": 0.84375, + "learning_rate": 0.0001633406033141457, + "loss": 0.7849, + "step": 18459 + }, + { + "epoch": 0.47400103671678534, + "grad_norm": 0.72265625, + "learning_rate": 0.00016333714876419784, + "loss": 0.801, + "step": 18460 + }, + { + "epoch": 0.4740267139127072, + "grad_norm": 0.77734375, + "learning_rate": 0.00016333369408802478, + "loss": 0.9686, + "step": 18461 + }, + { + "epoch": 0.47405239110862896, + "grad_norm": 0.80078125, + "learning_rate": 0.0001633302392856334, + "loss": 0.978, + "step": 18462 + }, + { + "epoch": 0.4740780683045508, + "grad_norm": 0.8203125, + "learning_rate": 0.00016332678435703057, + "loss": 0.939, + "step": 18463 + }, + { + "epoch": 0.47410374550047263, + "grad_norm": 0.76953125, + "learning_rate": 0.00016332332930222315, + "loss": 0.9021, + "step": 18464 + }, + { + "epoch": 0.47412942269639446, + "grad_norm": 0.7734375, + "learning_rate": 0.00016331987412121808, + "loss": 0.8232, + "step": 18465 + }, + { + "epoch": 0.47415509989231625, + "grad_norm": 0.84765625, + "learning_rate": 0.0001633164188140222, + "loss": 0.8736, + "step": 18466 + }, + { + "epoch": 0.4741807770882381, + "grad_norm": 0.78125, + "learning_rate": 0.00016331296338064244, + "loss": 0.9978, + "step": 18467 + }, + { + "epoch": 0.4742064542841599, + "grad_norm": 0.7890625, + "learning_rate": 0.00016330950782108566, + "loss": 0.9565, + "step": 18468 + }, + { + "epoch": 0.4742321314800817, + "grad_norm": 0.80859375, + "learning_rate": 0.00016330605213535875, + "loss": 0.8894, + "step": 18469 + }, + { + "epoch": 0.47425780867600353, + "grad_norm": 0.71484375, + "learning_rate": 0.0001633025963234686, + "loss": 0.8826, + "step": 18470 + }, + { + "epoch": 0.47428348587192537, + "grad_norm": 0.73046875, + "learning_rate": 0.00016329914038542203, + "loss": 0.8466, + "step": 18471 + }, + { + "epoch": 0.47430916306784715, + "grad_norm": 0.71875, + "learning_rate": 0.00016329568432122606, + "loss": 0.8033, + "step": 18472 + }, + { + "epoch": 0.474334840263769, + "grad_norm": 0.734375, + "learning_rate": 0.0001632922281308875, + "loss": 0.9705, + "step": 18473 + }, + { + "epoch": 0.4743605174596908, + "grad_norm": 0.77734375, + "learning_rate": 0.00016328877181441326, + "loss": 0.7367, + "step": 18474 + }, + { + "epoch": 0.47438619465561266, + "grad_norm": 0.73046875, + "learning_rate": 0.00016328531537181015, + "loss": 0.8756, + "step": 18475 + }, + { + "epoch": 0.47441187185153444, + "grad_norm": 0.74609375, + "learning_rate": 0.0001632818588030852, + "loss": 0.8192, + "step": 18476 + }, + { + "epoch": 0.4744375490474563, + "grad_norm": 0.80078125, + "learning_rate": 0.00016327840210824516, + "loss": 0.8382, + "step": 18477 + }, + { + "epoch": 0.4744632262433781, + "grad_norm": 0.7421875, + "learning_rate": 0.00016327494528729704, + "loss": 0.9383, + "step": 18478 + }, + { + "epoch": 0.4744889034392999, + "grad_norm": 0.7109375, + "learning_rate": 0.00016327148834024762, + "loss": 0.8311, + "step": 18479 + }, + { + "epoch": 0.4745145806352217, + "grad_norm": 0.7265625, + "learning_rate": 0.00016326803126710384, + "loss": 0.772, + "step": 18480 + }, + { + "epoch": 0.47454025783114356, + "grad_norm": 0.7734375, + "learning_rate": 0.0001632645740678726, + "loss": 0.9048, + "step": 18481 + }, + { + "epoch": 0.47456593502706534, + "grad_norm": 0.78125, + "learning_rate": 0.0001632611167425608, + "loss": 0.9444, + "step": 18482 + }, + { + "epoch": 0.4745916122229872, + "grad_norm": 0.7734375, + "learning_rate": 0.0001632576592911753, + "loss": 0.8069, + "step": 18483 + }, + { + "epoch": 0.474617289418909, + "grad_norm": 0.73046875, + "learning_rate": 0.00016325420171372297, + "loss": 0.8334, + "step": 18484 + }, + { + "epoch": 0.47464296661483085, + "grad_norm": 0.765625, + "learning_rate": 0.00016325074401021076, + "loss": 0.9481, + "step": 18485 + }, + { + "epoch": 0.47466864381075263, + "grad_norm": 0.7578125, + "learning_rate": 0.0001632472861806455, + "loss": 0.8431, + "step": 18486 + }, + { + "epoch": 0.47469432100667447, + "grad_norm": 0.7421875, + "learning_rate": 0.00016324382822503414, + "loss": 0.8478, + "step": 18487 + }, + { + "epoch": 0.4747199982025963, + "grad_norm": 0.78515625, + "learning_rate": 0.00016324037014338354, + "loss": 0.9435, + "step": 18488 + }, + { + "epoch": 0.4747456753985181, + "grad_norm": 0.7578125, + "learning_rate": 0.0001632369119357006, + "loss": 0.8575, + "step": 18489 + }, + { + "epoch": 0.4747713525944399, + "grad_norm": 0.8046875, + "learning_rate": 0.0001632334536019922, + "loss": 0.87, + "step": 18490 + }, + { + "epoch": 0.47479702979036176, + "grad_norm": 0.75390625, + "learning_rate": 0.00016322999514226526, + "loss": 0.9295, + "step": 18491 + }, + { + "epoch": 0.47482270698628354, + "grad_norm": 0.74609375, + "learning_rate": 0.0001632265365565266, + "loss": 0.8113, + "step": 18492 + }, + { + "epoch": 0.4748483841822054, + "grad_norm": 0.78125, + "learning_rate": 0.00016322307784478318, + "loss": 0.8289, + "step": 18493 + }, + { + "epoch": 0.4748740613781272, + "grad_norm": 0.8203125, + "learning_rate": 0.0001632196190070419, + "loss": 0.9958, + "step": 18494 + }, + { + "epoch": 0.47489973857404905, + "grad_norm": 0.84375, + "learning_rate": 0.00016321616004330962, + "loss": 0.8258, + "step": 18495 + }, + { + "epoch": 0.4749254157699708, + "grad_norm": 0.78515625, + "learning_rate": 0.00016321270095359322, + "loss": 1.0154, + "step": 18496 + }, + { + "epoch": 0.47495109296589266, + "grad_norm": 0.73828125, + "learning_rate": 0.00016320924173789965, + "loss": 0.8387, + "step": 18497 + }, + { + "epoch": 0.4749767701618145, + "grad_norm": 0.78515625, + "learning_rate": 0.00016320578239623575, + "loss": 0.8647, + "step": 18498 + }, + { + "epoch": 0.4750024473577363, + "grad_norm": 0.76953125, + "learning_rate": 0.00016320232292860846, + "loss": 0.9304, + "step": 18499 + }, + { + "epoch": 0.4750281245536581, + "grad_norm": 0.78125, + "learning_rate": 0.00016319886333502462, + "loss": 0.8561, + "step": 18500 + }, + { + "epoch": 0.47505380174957995, + "grad_norm": 0.8359375, + "learning_rate": 0.00016319540361549117, + "loss": 0.978, + "step": 18501 + }, + { + "epoch": 0.47507947894550173, + "grad_norm": 0.875, + "learning_rate": 0.000163191943770015, + "loss": 0.9374, + "step": 18502 + }, + { + "epoch": 0.47510515614142357, + "grad_norm": 0.7734375, + "learning_rate": 0.000163188483798603, + "loss": 0.7918, + "step": 18503 + }, + { + "epoch": 0.4751308333373454, + "grad_norm": 0.73828125, + "learning_rate": 0.00016318502370126203, + "loss": 0.8135, + "step": 18504 + }, + { + "epoch": 0.47515651053326724, + "grad_norm": 0.97265625, + "learning_rate": 0.000163181563477999, + "loss": 0.8399, + "step": 18505 + }, + { + "epoch": 0.475182187729189, + "grad_norm": 0.859375, + "learning_rate": 0.00016317810312882087, + "loss": 0.9164, + "step": 18506 + }, + { + "epoch": 0.47520786492511086, + "grad_norm": 0.80859375, + "learning_rate": 0.00016317464265373443, + "loss": 0.8633, + "step": 18507 + }, + { + "epoch": 0.4752335421210327, + "grad_norm": 0.75, + "learning_rate": 0.00016317118205274667, + "loss": 0.8482, + "step": 18508 + }, + { + "epoch": 0.4752592193169545, + "grad_norm": 0.7734375, + "learning_rate": 0.00016316772132586447, + "loss": 0.9038, + "step": 18509 + }, + { + "epoch": 0.4752848965128763, + "grad_norm": 0.8203125, + "learning_rate": 0.0001631642604730947, + "loss": 0.9236, + "step": 18510 + }, + { + "epoch": 0.47531057370879815, + "grad_norm": 0.7734375, + "learning_rate": 0.00016316079949444423, + "loss": 0.8858, + "step": 18511 + }, + { + "epoch": 0.4753362509047199, + "grad_norm": 0.73046875, + "learning_rate": 0.00016315733838992, + "loss": 0.8733, + "step": 18512 + }, + { + "epoch": 0.47536192810064176, + "grad_norm": 0.74609375, + "learning_rate": 0.00016315387715952893, + "loss": 0.8115, + "step": 18513 + }, + { + "epoch": 0.4753876052965636, + "grad_norm": 0.78515625, + "learning_rate": 0.00016315041580327786, + "loss": 0.8254, + "step": 18514 + }, + { + "epoch": 0.47541328249248543, + "grad_norm": 0.7890625, + "learning_rate": 0.00016314695432117376, + "loss": 0.8857, + "step": 18515 + }, + { + "epoch": 0.4754389596884072, + "grad_norm": 0.7421875, + "learning_rate": 0.00016314349271322342, + "loss": 0.8418, + "step": 18516 + }, + { + "epoch": 0.47546463688432905, + "grad_norm": 0.76953125, + "learning_rate": 0.00016314003097943385, + "loss": 0.8228, + "step": 18517 + }, + { + "epoch": 0.4754903140802509, + "grad_norm": 0.7890625, + "learning_rate": 0.00016313656911981186, + "loss": 0.8928, + "step": 18518 + }, + { + "epoch": 0.47551599127617267, + "grad_norm": 0.76171875, + "learning_rate": 0.00016313310713436443, + "loss": 0.8913, + "step": 18519 + }, + { + "epoch": 0.4755416684720945, + "grad_norm": 0.75390625, + "learning_rate": 0.00016312964502309839, + "loss": 1.052, + "step": 18520 + }, + { + "epoch": 0.47556734566801634, + "grad_norm": 0.76171875, + "learning_rate": 0.0001631261827860207, + "loss": 1.0812, + "step": 18521 + }, + { + "epoch": 0.4755930228639381, + "grad_norm": 0.7578125, + "learning_rate": 0.00016312272042313822, + "loss": 1.0253, + "step": 18522 + }, + { + "epoch": 0.47561870005985996, + "grad_norm": 0.7890625, + "learning_rate": 0.00016311925793445784, + "loss": 0.9223, + "step": 18523 + }, + { + "epoch": 0.4756443772557818, + "grad_norm": 0.8046875, + "learning_rate": 0.0001631157953199865, + "loss": 0.8677, + "step": 18524 + }, + { + "epoch": 0.47567005445170363, + "grad_norm": 0.8515625, + "learning_rate": 0.00016311233257973107, + "loss": 0.9738, + "step": 18525 + }, + { + "epoch": 0.4756957316476254, + "grad_norm": 0.75390625, + "learning_rate": 0.00016310886971369846, + "loss": 0.9902, + "step": 18526 + }, + { + "epoch": 0.47572140884354724, + "grad_norm": 0.8671875, + "learning_rate": 0.0001631054067218956, + "loss": 0.9586, + "step": 18527 + }, + { + "epoch": 0.4757470860394691, + "grad_norm": 0.7890625, + "learning_rate": 0.00016310194360432933, + "loss": 0.8337, + "step": 18528 + }, + { + "epoch": 0.47577276323539086, + "grad_norm": 0.73046875, + "learning_rate": 0.0001630984803610066, + "loss": 0.7956, + "step": 18529 + }, + { + "epoch": 0.4757984404313127, + "grad_norm": 0.703125, + "learning_rate": 0.0001630950169919343, + "loss": 0.943, + "step": 18530 + }, + { + "epoch": 0.47582411762723453, + "grad_norm": 0.83203125, + "learning_rate": 0.00016309155349711934, + "loss": 0.9881, + "step": 18531 + }, + { + "epoch": 0.4758497948231563, + "grad_norm": 0.796875, + "learning_rate": 0.00016308808987656857, + "loss": 0.8742, + "step": 18532 + }, + { + "epoch": 0.47587547201907815, + "grad_norm": 0.7421875, + "learning_rate": 0.000163084626130289, + "loss": 0.8494, + "step": 18533 + }, + { + "epoch": 0.475901149215, + "grad_norm": 0.7734375, + "learning_rate": 0.00016308116225828743, + "loss": 0.8144, + "step": 18534 + }, + { + "epoch": 0.4759268264109218, + "grad_norm": 0.73828125, + "learning_rate": 0.00016307769826057083, + "loss": 0.8615, + "step": 18535 + }, + { + "epoch": 0.4759525036068436, + "grad_norm": 0.796875, + "learning_rate": 0.00016307423413714604, + "loss": 0.9475, + "step": 18536 + }, + { + "epoch": 0.47597818080276544, + "grad_norm": 0.84375, + "learning_rate": 0.00016307076988802, + "loss": 0.8033, + "step": 18537 + }, + { + "epoch": 0.4760038579986873, + "grad_norm": 0.7734375, + "learning_rate": 0.00016306730551319965, + "loss": 0.9391, + "step": 18538 + }, + { + "epoch": 0.47602953519460905, + "grad_norm": 0.8125, + "learning_rate": 0.0001630638410126918, + "loss": 0.8281, + "step": 18539 + }, + { + "epoch": 0.4760552123905309, + "grad_norm": 0.91015625, + "learning_rate": 0.00016306037638650345, + "loss": 0.8626, + "step": 18540 + }, + { + "epoch": 0.4760808895864527, + "grad_norm": 0.7265625, + "learning_rate": 0.00016305691163464147, + "loss": 0.7292, + "step": 18541 + }, + { + "epoch": 0.4761065667823745, + "grad_norm": 0.71875, + "learning_rate": 0.00016305344675711275, + "loss": 0.9381, + "step": 18542 + }, + { + "epoch": 0.47613224397829634, + "grad_norm": 0.78515625, + "learning_rate": 0.0001630499817539242, + "loss": 0.8029, + "step": 18543 + }, + { + "epoch": 0.4761579211742182, + "grad_norm": 0.77734375, + "learning_rate": 0.00016304651662508274, + "loss": 0.9177, + "step": 18544 + }, + { + "epoch": 0.47618359837014, + "grad_norm": 0.7421875, + "learning_rate": 0.00016304305137059528, + "loss": 0.8167, + "step": 18545 + }, + { + "epoch": 0.4762092755660618, + "grad_norm": 0.72265625, + "learning_rate": 0.0001630395859904687, + "loss": 0.7515, + "step": 18546 + }, + { + "epoch": 0.47623495276198363, + "grad_norm": 0.75390625, + "learning_rate": 0.0001630361204847099, + "loss": 0.8949, + "step": 18547 + }, + { + "epoch": 0.47626062995790547, + "grad_norm": 0.76171875, + "learning_rate": 0.00016303265485332586, + "loss": 0.8268, + "step": 18548 + }, + { + "epoch": 0.47628630715382725, + "grad_norm": 0.71484375, + "learning_rate": 0.0001630291890963234, + "loss": 0.8881, + "step": 18549 + }, + { + "epoch": 0.4763119843497491, + "grad_norm": 0.80078125, + "learning_rate": 0.00016302572321370945, + "loss": 0.9025, + "step": 18550 + }, + { + "epoch": 0.4763376615456709, + "grad_norm": 0.7421875, + "learning_rate": 0.00016302225720549093, + "loss": 1.1236, + "step": 18551 + }, + { + "epoch": 0.4763633387415927, + "grad_norm": 0.68359375, + "learning_rate": 0.00016301879107167474, + "loss": 0.8596, + "step": 18552 + }, + { + "epoch": 0.47638901593751454, + "grad_norm": 0.75, + "learning_rate": 0.0001630153248122678, + "loss": 0.9245, + "step": 18553 + }, + { + "epoch": 0.4764146931334364, + "grad_norm": 0.79296875, + "learning_rate": 0.00016301185842727703, + "loss": 0.974, + "step": 18554 + }, + { + "epoch": 0.4764403703293582, + "grad_norm": 0.84765625, + "learning_rate": 0.0001630083919167093, + "loss": 0.8444, + "step": 18555 + }, + { + "epoch": 0.47646604752528, + "grad_norm": 0.7578125, + "learning_rate": 0.00016300492528057152, + "loss": 0.8823, + "step": 18556 + }, + { + "epoch": 0.4764917247212018, + "grad_norm": 0.80859375, + "learning_rate": 0.00016300145851887062, + "loss": 0.9793, + "step": 18557 + }, + { + "epoch": 0.47651740191712366, + "grad_norm": 0.78125, + "learning_rate": 0.00016299799163161353, + "loss": 1.0341, + "step": 18558 + }, + { + "epoch": 0.47654307911304544, + "grad_norm": 0.71875, + "learning_rate": 0.0001629945246188071, + "loss": 0.961, + "step": 18559 + }, + { + "epoch": 0.4765687563089673, + "grad_norm": 0.73828125, + "learning_rate": 0.0001629910574804583, + "loss": 0.9394, + "step": 18560 + }, + { + "epoch": 0.4765944335048891, + "grad_norm": 0.76171875, + "learning_rate": 0.000162987590216574, + "loss": 0.8476, + "step": 18561 + }, + { + "epoch": 0.4766201107008109, + "grad_norm": 0.734375, + "learning_rate": 0.00016298412282716114, + "loss": 0.9123, + "step": 18562 + }, + { + "epoch": 0.47664578789673273, + "grad_norm": 0.78515625, + "learning_rate": 0.0001629806553122266, + "loss": 1.0247, + "step": 18563 + }, + { + "epoch": 0.47667146509265457, + "grad_norm": 0.7265625, + "learning_rate": 0.0001629771876717773, + "loss": 0.8413, + "step": 18564 + }, + { + "epoch": 0.4766971422885764, + "grad_norm": 0.74609375, + "learning_rate": 0.00016297371990582016, + "loss": 0.7955, + "step": 18565 + }, + { + "epoch": 0.4767228194844982, + "grad_norm": 0.7734375, + "learning_rate": 0.00016297025201436206, + "loss": 0.9279, + "step": 18566 + }, + { + "epoch": 0.47674849668042, + "grad_norm": 0.734375, + "learning_rate": 0.00016296678399740997, + "loss": 0.8279, + "step": 18567 + }, + { + "epoch": 0.47677417387634186, + "grad_norm": 0.82421875, + "learning_rate": 0.00016296331585497072, + "loss": 0.9838, + "step": 18568 + }, + { + "epoch": 0.47679985107226364, + "grad_norm": 0.875, + "learning_rate": 0.0001629598475870513, + "loss": 0.7248, + "step": 18569 + }, + { + "epoch": 0.4768255282681855, + "grad_norm": 0.90234375, + "learning_rate": 0.0001629563791936586, + "loss": 0.9718, + "step": 18570 + }, + { + "epoch": 0.4768512054641073, + "grad_norm": 0.8203125, + "learning_rate": 0.00016295291067479952, + "loss": 0.9125, + "step": 18571 + }, + { + "epoch": 0.4768768826600291, + "grad_norm": 0.7265625, + "learning_rate": 0.00016294944203048097, + "loss": 0.6809, + "step": 18572 + }, + { + "epoch": 0.4769025598559509, + "grad_norm": 0.8203125, + "learning_rate": 0.00016294597326070988, + "loss": 0.9085, + "step": 18573 + }, + { + "epoch": 0.47692823705187276, + "grad_norm": 0.8359375, + "learning_rate": 0.00016294250436549312, + "loss": 0.8523, + "step": 18574 + }, + { + "epoch": 0.4769539142477946, + "grad_norm": 0.78515625, + "learning_rate": 0.00016293903534483767, + "loss": 0.9608, + "step": 18575 + }, + { + "epoch": 0.4769795914437164, + "grad_norm": 0.7421875, + "learning_rate": 0.00016293556619875039, + "loss": 0.7969, + "step": 18576 + }, + { + "epoch": 0.4770052686396382, + "grad_norm": 0.76171875, + "learning_rate": 0.0001629320969272382, + "loss": 0.9347, + "step": 18577 + }, + { + "epoch": 0.47703094583556005, + "grad_norm": 0.78515625, + "learning_rate": 0.00016292862753030805, + "loss": 0.9749, + "step": 18578 + }, + { + "epoch": 0.47705662303148183, + "grad_norm": 0.70703125, + "learning_rate": 0.0001629251580079668, + "loss": 0.8972, + "step": 18579 + }, + { + "epoch": 0.47708230022740367, + "grad_norm": 0.83203125, + "learning_rate": 0.00016292168836022142, + "loss": 0.9207, + "step": 18580 + }, + { + "epoch": 0.4771079774233255, + "grad_norm": 0.7890625, + "learning_rate": 0.00016291821858707882, + "loss": 0.9235, + "step": 18581 + }, + { + "epoch": 0.4771336546192473, + "grad_norm": 0.74609375, + "learning_rate": 0.00016291474868854584, + "loss": 0.8529, + "step": 18582 + }, + { + "epoch": 0.4771593318151691, + "grad_norm": 0.73828125, + "learning_rate": 0.0001629112786646295, + "loss": 0.9599, + "step": 18583 + }, + { + "epoch": 0.47718500901109095, + "grad_norm": 0.70703125, + "learning_rate": 0.00016290780851533663, + "loss": 0.9041, + "step": 18584 + }, + { + "epoch": 0.4772106862070128, + "grad_norm": 0.75390625, + "learning_rate": 0.00016290433824067417, + "loss": 1.0061, + "step": 18585 + }, + { + "epoch": 0.47723636340293457, + "grad_norm": 0.765625, + "learning_rate": 0.0001629008678406491, + "loss": 0.9777, + "step": 18586 + }, + { + "epoch": 0.4772620405988564, + "grad_norm": 0.76953125, + "learning_rate": 0.00016289739731526823, + "loss": 0.8523, + "step": 18587 + }, + { + "epoch": 0.47728771779477824, + "grad_norm": 0.6953125, + "learning_rate": 0.00016289392666453853, + "loss": 0.7866, + "step": 18588 + }, + { + "epoch": 0.4773133949907, + "grad_norm": 0.84375, + "learning_rate": 0.00016289045588846693, + "loss": 0.9737, + "step": 18589 + }, + { + "epoch": 0.47733907218662186, + "grad_norm": 0.71875, + "learning_rate": 0.00016288698498706034, + "loss": 0.9267, + "step": 18590 + }, + { + "epoch": 0.4773647493825437, + "grad_norm": 0.7265625, + "learning_rate": 0.00016288351396032567, + "loss": 0.9823, + "step": 18591 + }, + { + "epoch": 0.4773904265784655, + "grad_norm": 0.74609375, + "learning_rate": 0.00016288004280826983, + "loss": 0.8509, + "step": 18592 + }, + { + "epoch": 0.4774161037743873, + "grad_norm": 0.83203125, + "learning_rate": 0.00016287657153089973, + "loss": 0.8091, + "step": 18593 + }, + { + "epoch": 0.47744178097030915, + "grad_norm": 0.80078125, + "learning_rate": 0.00016287310012822232, + "loss": 0.9195, + "step": 18594 + }, + { + "epoch": 0.47746745816623093, + "grad_norm": 0.71484375, + "learning_rate": 0.0001628696286002445, + "loss": 0.8625, + "step": 18595 + }, + { + "epoch": 0.47749313536215277, + "grad_norm": 0.78515625, + "learning_rate": 0.0001628661569469732, + "loss": 0.949, + "step": 18596 + }, + { + "epoch": 0.4775188125580746, + "grad_norm": 0.8046875, + "learning_rate": 0.0001628626851684153, + "loss": 0.9448, + "step": 18597 + }, + { + "epoch": 0.47754448975399644, + "grad_norm": 0.84375, + "learning_rate": 0.00016285921326457777, + "loss": 0.7735, + "step": 18598 + }, + { + "epoch": 0.4775701669499182, + "grad_norm": 0.76171875, + "learning_rate": 0.00016285574123546748, + "loss": 0.9136, + "step": 18599 + }, + { + "epoch": 0.47759584414584005, + "grad_norm": 0.95703125, + "learning_rate": 0.0001628522690810914, + "loss": 0.8558, + "step": 18600 + }, + { + "epoch": 0.4776215213417619, + "grad_norm": 0.74609375, + "learning_rate": 0.0001628487968014564, + "loss": 0.8564, + "step": 18601 + }, + { + "epoch": 0.47764719853768367, + "grad_norm": 0.7734375, + "learning_rate": 0.00016284532439656945, + "loss": 0.8754, + "step": 18602 + }, + { + "epoch": 0.4776728757336055, + "grad_norm": 0.73046875, + "learning_rate": 0.00016284185186643747, + "loss": 0.809, + "step": 18603 + }, + { + "epoch": 0.47769855292952734, + "grad_norm": 0.83203125, + "learning_rate": 0.0001628383792110673, + "loss": 0.8656, + "step": 18604 + }, + { + "epoch": 0.4777242301254491, + "grad_norm": 0.75390625, + "learning_rate": 0.00016283490643046594, + "loss": 0.7907, + "step": 18605 + }, + { + "epoch": 0.47774990732137096, + "grad_norm": 0.8203125, + "learning_rate": 0.00016283143352464029, + "loss": 0.9182, + "step": 18606 + }, + { + "epoch": 0.4777755845172928, + "grad_norm": 0.828125, + "learning_rate": 0.00016282796049359727, + "loss": 0.9097, + "step": 18607 + }, + { + "epoch": 0.47780126171321463, + "grad_norm": 0.88671875, + "learning_rate": 0.00016282448733734382, + "loss": 0.8107, + "step": 18608 + }, + { + "epoch": 0.4778269389091364, + "grad_norm": 0.88671875, + "learning_rate": 0.00016282101405588678, + "loss": 1.0807, + "step": 18609 + }, + { + "epoch": 0.47785261610505825, + "grad_norm": 0.83203125, + "learning_rate": 0.00016281754064923317, + "loss": 1.0051, + "step": 18610 + }, + { + "epoch": 0.4778782933009801, + "grad_norm": 0.8046875, + "learning_rate": 0.00016281406711738988, + "loss": 0.9046, + "step": 18611 + }, + { + "epoch": 0.47790397049690186, + "grad_norm": 0.76171875, + "learning_rate": 0.00016281059346036383, + "loss": 0.8714, + "step": 18612 + }, + { + "epoch": 0.4779296476928237, + "grad_norm": 0.79296875, + "learning_rate": 0.00016280711967816194, + "loss": 1.1344, + "step": 18613 + }, + { + "epoch": 0.47795532488874554, + "grad_norm": 0.73046875, + "learning_rate": 0.00016280364577079114, + "loss": 0.7667, + "step": 18614 + }, + { + "epoch": 0.4779810020846673, + "grad_norm": 0.84375, + "learning_rate": 0.00016280017173825835, + "loss": 1.0256, + "step": 18615 + }, + { + "epoch": 0.47800667928058915, + "grad_norm": 0.81640625, + "learning_rate": 0.00016279669758057045, + "loss": 1.0381, + "step": 18616 + }, + { + "epoch": 0.478032356476511, + "grad_norm": 0.78125, + "learning_rate": 0.00016279322329773444, + "loss": 0.9157, + "step": 18617 + }, + { + "epoch": 0.4780580336724328, + "grad_norm": 0.84765625, + "learning_rate": 0.0001627897488897572, + "loss": 0.8712, + "step": 18618 + }, + { + "epoch": 0.4780837108683546, + "grad_norm": 0.765625, + "learning_rate": 0.00016278627435664563, + "loss": 0.9123, + "step": 18619 + }, + { + "epoch": 0.47810938806427644, + "grad_norm": 0.703125, + "learning_rate": 0.00016278279969840674, + "loss": 0.7441, + "step": 18620 + }, + { + "epoch": 0.4781350652601983, + "grad_norm": 0.76953125, + "learning_rate": 0.00016277932491504736, + "loss": 0.8332, + "step": 18621 + }, + { + "epoch": 0.47816074245612006, + "grad_norm": 0.7109375, + "learning_rate": 0.0001627758500065745, + "loss": 0.8291, + "step": 18622 + }, + { + "epoch": 0.4781864196520419, + "grad_norm": 0.7890625, + "learning_rate": 0.00016277237497299503, + "loss": 0.9334, + "step": 18623 + }, + { + "epoch": 0.47821209684796373, + "grad_norm": 0.8359375, + "learning_rate": 0.00016276889981431584, + "loss": 0.9174, + "step": 18624 + }, + { + "epoch": 0.4782377740438855, + "grad_norm": 0.9609375, + "learning_rate": 0.00016276542453054396, + "loss": 1.0175, + "step": 18625 + }, + { + "epoch": 0.47826345123980735, + "grad_norm": 1.171875, + "learning_rate": 0.00016276194912168623, + "loss": 0.8378, + "step": 18626 + }, + { + "epoch": 0.4782891284357292, + "grad_norm": 0.7578125, + "learning_rate": 0.0001627584735877496, + "loss": 0.9584, + "step": 18627 + }, + { + "epoch": 0.478314805631651, + "grad_norm": 0.70703125, + "learning_rate": 0.00016275499792874103, + "loss": 0.7631, + "step": 18628 + }, + { + "epoch": 0.4783404828275728, + "grad_norm": 0.71875, + "learning_rate": 0.0001627515221446674, + "loss": 0.9004, + "step": 18629 + }, + { + "epoch": 0.47836616002349464, + "grad_norm": 0.76171875, + "learning_rate": 0.00016274804623553566, + "loss": 0.7479, + "step": 18630 + }, + { + "epoch": 0.47839183721941647, + "grad_norm": 0.90234375, + "learning_rate": 0.00016274457020135273, + "loss": 0.8366, + "step": 18631 + }, + { + "epoch": 0.47841751441533825, + "grad_norm": 0.7734375, + "learning_rate": 0.0001627410940421255, + "loss": 0.9054, + "step": 18632 + }, + { + "epoch": 0.4784431916112601, + "grad_norm": 0.71484375, + "learning_rate": 0.00016273761775786096, + "loss": 0.7492, + "step": 18633 + }, + { + "epoch": 0.4784688688071819, + "grad_norm": 0.796875, + "learning_rate": 0.00016273414134856606, + "loss": 0.9531, + "step": 18634 + }, + { + "epoch": 0.4784945460031037, + "grad_norm": 0.7421875, + "learning_rate": 0.00016273066481424766, + "loss": 0.741, + "step": 18635 + }, + { + "epoch": 0.47852022319902554, + "grad_norm": 0.7421875, + "learning_rate": 0.00016272718815491267, + "loss": 0.7921, + "step": 18636 + }, + { + "epoch": 0.4785459003949474, + "grad_norm": 0.76953125, + "learning_rate": 0.0001627237113705681, + "loss": 0.8753, + "step": 18637 + }, + { + "epoch": 0.4785715775908692, + "grad_norm": 0.7421875, + "learning_rate": 0.00016272023446122082, + "loss": 0.9776, + "step": 18638 + }, + { + "epoch": 0.478597254786791, + "grad_norm": 0.78125, + "learning_rate": 0.0001627167574268778, + "loss": 1.0592, + "step": 18639 + }, + { + "epoch": 0.47862293198271283, + "grad_norm": 0.6953125, + "learning_rate": 0.00016271328026754595, + "loss": 0.9485, + "step": 18640 + }, + { + "epoch": 0.47864860917863467, + "grad_norm": 0.77734375, + "learning_rate": 0.00016270980298323218, + "loss": 0.8199, + "step": 18641 + }, + { + "epoch": 0.47867428637455645, + "grad_norm": 0.71484375, + "learning_rate": 0.00016270632557394344, + "loss": 0.8658, + "step": 18642 + }, + { + "epoch": 0.4786999635704783, + "grad_norm": 0.7578125, + "learning_rate": 0.00016270284803968665, + "loss": 0.8958, + "step": 18643 + }, + { + "epoch": 0.4787256407664001, + "grad_norm": 0.8203125, + "learning_rate": 0.00016269937038046876, + "loss": 0.963, + "step": 18644 + }, + { + "epoch": 0.4787513179623219, + "grad_norm": 0.76171875, + "learning_rate": 0.00016269589259629668, + "loss": 0.8885, + "step": 18645 + }, + { + "epoch": 0.47877699515824373, + "grad_norm": 0.75, + "learning_rate": 0.00016269241468717734, + "loss": 0.8707, + "step": 18646 + }, + { + "epoch": 0.47880267235416557, + "grad_norm": 0.765625, + "learning_rate": 0.0001626889366531177, + "loss": 0.7695, + "step": 18647 + }, + { + "epoch": 0.4788283495500874, + "grad_norm": 0.765625, + "learning_rate": 0.0001626854584941247, + "loss": 0.8594, + "step": 18648 + }, + { + "epoch": 0.4788540267460092, + "grad_norm": 0.76953125, + "learning_rate": 0.00016268198021020518, + "loss": 0.9161, + "step": 18649 + }, + { + "epoch": 0.478879703941931, + "grad_norm": 0.7734375, + "learning_rate": 0.00016267850180136617, + "loss": 0.8445, + "step": 18650 + }, + { + "epoch": 0.47890538113785286, + "grad_norm": 0.71875, + "learning_rate": 0.00016267502326761454, + "loss": 0.8508, + "step": 18651 + }, + { + "epoch": 0.47893105833377464, + "grad_norm": 0.79296875, + "learning_rate": 0.00016267154460895727, + "loss": 0.8313, + "step": 18652 + }, + { + "epoch": 0.4789567355296965, + "grad_norm": 0.79296875, + "learning_rate": 0.00016266806582540132, + "loss": 0.8525, + "step": 18653 + }, + { + "epoch": 0.4789824127256183, + "grad_norm": 0.8203125, + "learning_rate": 0.0001626645869169535, + "loss": 1.0562, + "step": 18654 + }, + { + "epoch": 0.4790080899215401, + "grad_norm": 0.73828125, + "learning_rate": 0.00016266110788362088, + "loss": 0.8399, + "step": 18655 + }, + { + "epoch": 0.47903376711746193, + "grad_norm": 0.6953125, + "learning_rate": 0.00016265762872541027, + "loss": 0.8525, + "step": 18656 + }, + { + "epoch": 0.47905944431338376, + "grad_norm": 0.7421875, + "learning_rate": 0.00016265414944232872, + "loss": 0.9678, + "step": 18657 + }, + { + "epoch": 0.4790851215093056, + "grad_norm": 0.72265625, + "learning_rate": 0.0001626506700343831, + "loss": 0.897, + "step": 18658 + }, + { + "epoch": 0.4791107987052274, + "grad_norm": 0.8515625, + "learning_rate": 0.00016264719050158033, + "loss": 1.1745, + "step": 18659 + }, + { + "epoch": 0.4791364759011492, + "grad_norm": 0.79296875, + "learning_rate": 0.0001626437108439274, + "loss": 1.0028, + "step": 18660 + }, + { + "epoch": 0.47916215309707105, + "grad_norm": 0.76171875, + "learning_rate": 0.00016264023106143117, + "loss": 0.8535, + "step": 18661 + }, + { + "epoch": 0.47918783029299283, + "grad_norm": 0.73828125, + "learning_rate": 0.00016263675115409863, + "loss": 1.0171, + "step": 18662 + }, + { + "epoch": 0.47921350748891467, + "grad_norm": 0.7734375, + "learning_rate": 0.00016263327112193672, + "loss": 0.8919, + "step": 18663 + }, + { + "epoch": 0.4792391846848365, + "grad_norm": 0.765625, + "learning_rate": 0.00016262979096495235, + "loss": 0.9373, + "step": 18664 + }, + { + "epoch": 0.4792648618807583, + "grad_norm": 0.765625, + "learning_rate": 0.00016262631068315243, + "loss": 0.9927, + "step": 18665 + }, + { + "epoch": 0.4792905390766801, + "grad_norm": 0.79296875, + "learning_rate": 0.00016262283027654399, + "loss": 0.8713, + "step": 18666 + }, + { + "epoch": 0.47931621627260196, + "grad_norm": 0.76953125, + "learning_rate": 0.0001626193497451339, + "loss": 0.8866, + "step": 18667 + }, + { + "epoch": 0.4793418934685238, + "grad_norm": 0.71484375, + "learning_rate": 0.00016261586908892907, + "loss": 0.8768, + "step": 18668 + }, + { + "epoch": 0.4793675706644456, + "grad_norm": 0.796875, + "learning_rate": 0.00016261238830793647, + "loss": 0.9823, + "step": 18669 + }, + { + "epoch": 0.4793932478603674, + "grad_norm": 0.72265625, + "learning_rate": 0.00016260890740216305, + "loss": 0.9031, + "step": 18670 + }, + { + "epoch": 0.47941892505628925, + "grad_norm": 0.77734375, + "learning_rate": 0.00016260542637161574, + "loss": 0.7811, + "step": 18671 + }, + { + "epoch": 0.479444602252211, + "grad_norm": 0.734375, + "learning_rate": 0.00016260194521630145, + "loss": 0.7752, + "step": 18672 + }, + { + "epoch": 0.47947027944813286, + "grad_norm": 0.67578125, + "learning_rate": 0.00016259846393622715, + "loss": 0.8182, + "step": 18673 + }, + { + "epoch": 0.4794959566440547, + "grad_norm": 0.69921875, + "learning_rate": 0.00016259498253139978, + "loss": 0.8714, + "step": 18674 + }, + { + "epoch": 0.4795216338399765, + "grad_norm": 0.81640625, + "learning_rate": 0.00016259150100182625, + "loss": 0.9669, + "step": 18675 + }, + { + "epoch": 0.4795473110358983, + "grad_norm": 0.85546875, + "learning_rate": 0.0001625880193475135, + "loss": 1.0252, + "step": 18676 + }, + { + "epoch": 0.47957298823182015, + "grad_norm": 0.7578125, + "learning_rate": 0.00016258453756846849, + "loss": 0.8746, + "step": 18677 + }, + { + "epoch": 0.479598665427742, + "grad_norm": 0.72265625, + "learning_rate": 0.00016258105566469813, + "loss": 0.7908, + "step": 18678 + }, + { + "epoch": 0.47962434262366377, + "grad_norm": 0.78515625, + "learning_rate": 0.00016257757363620943, + "loss": 0.8861, + "step": 18679 + }, + { + "epoch": 0.4796500198195856, + "grad_norm": 0.73828125, + "learning_rate": 0.00016257409148300926, + "loss": 0.972, + "step": 18680 + }, + { + "epoch": 0.47967569701550744, + "grad_norm": 0.83984375, + "learning_rate": 0.00016257060920510456, + "loss": 0.841, + "step": 18681 + }, + { + "epoch": 0.4797013742114292, + "grad_norm": 0.78515625, + "learning_rate": 0.00016256712680250227, + "loss": 0.9486, + "step": 18682 + }, + { + "epoch": 0.47972705140735106, + "grad_norm": 0.72265625, + "learning_rate": 0.0001625636442752094, + "loss": 0.7737, + "step": 18683 + }, + { + "epoch": 0.4797527286032729, + "grad_norm": 0.7734375, + "learning_rate": 0.0001625601616232328, + "loss": 1.0573, + "step": 18684 + }, + { + "epoch": 0.4797784057991947, + "grad_norm": 0.7734375, + "learning_rate": 0.00016255667884657946, + "loss": 0.8465, + "step": 18685 + }, + { + "epoch": 0.4798040829951165, + "grad_norm": 0.73828125, + "learning_rate": 0.00016255319594525632, + "loss": 0.7899, + "step": 18686 + }, + { + "epoch": 0.47982976019103835, + "grad_norm": 0.89453125, + "learning_rate": 0.0001625497129192703, + "loss": 0.9564, + "step": 18687 + }, + { + "epoch": 0.4798554373869602, + "grad_norm": 0.78515625, + "learning_rate": 0.00016254622976862834, + "loss": 0.9478, + "step": 18688 + }, + { + "epoch": 0.47988111458288196, + "grad_norm": 0.85546875, + "learning_rate": 0.00016254274649333742, + "loss": 0.8546, + "step": 18689 + }, + { + "epoch": 0.4799067917788038, + "grad_norm": 0.7578125, + "learning_rate": 0.00016253926309340444, + "loss": 0.8438, + "step": 18690 + }, + { + "epoch": 0.47993246897472563, + "grad_norm": 0.9921875, + "learning_rate": 0.00016253577956883638, + "loss": 1.0047, + "step": 18691 + }, + { + "epoch": 0.4799581461706474, + "grad_norm": 0.8125, + "learning_rate": 0.00016253229591964012, + "loss": 0.8965, + "step": 18692 + }, + { + "epoch": 0.47998382336656925, + "grad_norm": 0.73828125, + "learning_rate": 0.0001625288121458227, + "loss": 0.8131, + "step": 18693 + }, + { + "epoch": 0.4800095005624911, + "grad_norm": 0.82421875, + "learning_rate": 0.00016252532824739094, + "loss": 0.8814, + "step": 18694 + }, + { + "epoch": 0.48003517775841287, + "grad_norm": 0.765625, + "learning_rate": 0.00016252184422435188, + "loss": 0.9479, + "step": 18695 + }, + { + "epoch": 0.4800608549543347, + "grad_norm": 0.85546875, + "learning_rate": 0.00016251836007671243, + "loss": 0.7719, + "step": 18696 + }, + { + "epoch": 0.48008653215025654, + "grad_norm": 0.87109375, + "learning_rate": 0.00016251487580447953, + "loss": 0.9042, + "step": 18697 + }, + { + "epoch": 0.4801122093461784, + "grad_norm": 0.796875, + "learning_rate": 0.00016251139140766014, + "loss": 0.9289, + "step": 18698 + }, + { + "epoch": 0.48013788654210016, + "grad_norm": 0.81640625, + "learning_rate": 0.00016250790688626118, + "loss": 0.8177, + "step": 18699 + }, + { + "epoch": 0.480163563738022, + "grad_norm": 0.88671875, + "learning_rate": 0.00016250442224028964, + "loss": 0.9324, + "step": 18700 + }, + { + "epoch": 0.48018924093394383, + "grad_norm": 0.77734375, + "learning_rate": 0.0001625009374697524, + "loss": 0.8637, + "step": 18701 + }, + { + "epoch": 0.4802149181298656, + "grad_norm": 0.82421875, + "learning_rate": 0.00016249745257465646, + "loss": 0.926, + "step": 18702 + }, + { + "epoch": 0.48024059532578744, + "grad_norm": 0.796875, + "learning_rate": 0.00016249396755500873, + "loss": 0.9143, + "step": 18703 + }, + { + "epoch": 0.4802662725217093, + "grad_norm": 0.73828125, + "learning_rate": 0.00016249048241081615, + "loss": 0.8913, + "step": 18704 + }, + { + "epoch": 0.48029194971763106, + "grad_norm": 0.890625, + "learning_rate": 0.00016248699714208571, + "loss": 0.9123, + "step": 18705 + }, + { + "epoch": 0.4803176269135529, + "grad_norm": 0.73828125, + "learning_rate": 0.00016248351174882433, + "loss": 0.8238, + "step": 18706 + }, + { + "epoch": 0.48034330410947473, + "grad_norm": 0.85546875, + "learning_rate": 0.0001624800262310389, + "loss": 0.8919, + "step": 18707 + }, + { + "epoch": 0.48036898130539657, + "grad_norm": 0.80078125, + "learning_rate": 0.0001624765405887365, + "loss": 0.9298, + "step": 18708 + }, + { + "epoch": 0.48039465850131835, + "grad_norm": 0.82421875, + "learning_rate": 0.00016247305482192394, + "loss": 1.0049, + "step": 18709 + }, + { + "epoch": 0.4804203356972402, + "grad_norm": 0.71875, + "learning_rate": 0.00016246956893060826, + "loss": 0.9069, + "step": 18710 + }, + { + "epoch": 0.480446012893162, + "grad_norm": 0.7578125, + "learning_rate": 0.00016246608291479637, + "loss": 1.0159, + "step": 18711 + }, + { + "epoch": 0.4804716900890838, + "grad_norm": 0.796875, + "learning_rate": 0.00016246259677449518, + "loss": 0.8015, + "step": 18712 + }, + { + "epoch": 0.48049736728500564, + "grad_norm": 0.81640625, + "learning_rate": 0.00016245911050971173, + "loss": 0.9458, + "step": 18713 + }, + { + "epoch": 0.4805230444809275, + "grad_norm": 0.859375, + "learning_rate": 0.0001624556241204529, + "loss": 0.9551, + "step": 18714 + }, + { + "epoch": 0.48054872167684926, + "grad_norm": 0.7890625, + "learning_rate": 0.00016245213760672562, + "loss": 0.7728, + "step": 18715 + }, + { + "epoch": 0.4805743988727711, + "grad_norm": 1.4765625, + "learning_rate": 0.00016244865096853693, + "loss": 1.0843, + "step": 18716 + }, + { + "epoch": 0.4806000760686929, + "grad_norm": 0.765625, + "learning_rate": 0.00016244516420589367, + "loss": 0.8929, + "step": 18717 + }, + { + "epoch": 0.48062575326461476, + "grad_norm": 0.8046875, + "learning_rate": 0.00016244167731880284, + "loss": 0.8721, + "step": 18718 + }, + { + "epoch": 0.48065143046053654, + "grad_norm": 0.71875, + "learning_rate": 0.0001624381903072714, + "loss": 0.8701, + "step": 18719 + }, + { + "epoch": 0.4806771076564584, + "grad_norm": 0.76171875, + "learning_rate": 0.00016243470317130632, + "loss": 0.813, + "step": 18720 + }, + { + "epoch": 0.4807027848523802, + "grad_norm": 0.78125, + "learning_rate": 0.00016243121591091446, + "loss": 0.8316, + "step": 18721 + }, + { + "epoch": 0.480728462048302, + "grad_norm": 0.84765625, + "learning_rate": 0.00016242772852610284, + "loss": 0.876, + "step": 18722 + }, + { + "epoch": 0.48075413924422383, + "grad_norm": 0.7265625, + "learning_rate": 0.0001624242410168784, + "loss": 0.9892, + "step": 18723 + }, + { + "epoch": 0.48077981644014567, + "grad_norm": 0.7890625, + "learning_rate": 0.00016242075338324813, + "loss": 0.8975, + "step": 18724 + }, + { + "epoch": 0.48080549363606745, + "grad_norm": 0.796875, + "learning_rate": 0.00016241726562521888, + "loss": 1.0929, + "step": 18725 + }, + { + "epoch": 0.4808311708319893, + "grad_norm": 0.74609375, + "learning_rate": 0.0001624137777427977, + "loss": 0.8084, + "step": 18726 + }, + { + "epoch": 0.4808568480279111, + "grad_norm": 0.7421875, + "learning_rate": 0.00016241028973599145, + "loss": 0.9528, + "step": 18727 + }, + { + "epoch": 0.48088252522383296, + "grad_norm": 0.79296875, + "learning_rate": 0.00016240680160480717, + "loss": 0.8272, + "step": 18728 + }, + { + "epoch": 0.48090820241975474, + "grad_norm": 0.74609375, + "learning_rate": 0.00016240331334925175, + "loss": 0.7391, + "step": 18729 + }, + { + "epoch": 0.4809338796156766, + "grad_norm": 0.71484375, + "learning_rate": 0.00016239982496933216, + "loss": 0.8841, + "step": 18730 + }, + { + "epoch": 0.4809595568115984, + "grad_norm": 0.765625, + "learning_rate": 0.00016239633646505537, + "loss": 0.889, + "step": 18731 + }, + { + "epoch": 0.4809852340075202, + "grad_norm": 0.7734375, + "learning_rate": 0.00016239284783642831, + "loss": 0.9438, + "step": 18732 + }, + { + "epoch": 0.481010911203442, + "grad_norm": 0.7734375, + "learning_rate": 0.00016238935908345796, + "loss": 0.9815, + "step": 18733 + }, + { + "epoch": 0.48103658839936386, + "grad_norm": 0.75390625, + "learning_rate": 0.00016238587020615125, + "loss": 0.9039, + "step": 18734 + }, + { + "epoch": 0.48106226559528564, + "grad_norm": 0.875, + "learning_rate": 0.0001623823812045151, + "loss": 0.8793, + "step": 18735 + }, + { + "epoch": 0.4810879427912075, + "grad_norm": 0.73828125, + "learning_rate": 0.00016237889207855653, + "loss": 0.9141, + "step": 18736 + }, + { + "epoch": 0.4811136199871293, + "grad_norm": 0.890625, + "learning_rate": 0.00016237540282828245, + "loss": 0.8933, + "step": 18737 + }, + { + "epoch": 0.48113929718305115, + "grad_norm": 0.83203125, + "learning_rate": 0.00016237191345369983, + "loss": 1.0345, + "step": 18738 + }, + { + "epoch": 0.48116497437897293, + "grad_norm": 0.828125, + "learning_rate": 0.00016236842395481562, + "loss": 0.899, + "step": 18739 + }, + { + "epoch": 0.48119065157489477, + "grad_norm": 0.83984375, + "learning_rate": 0.00016236493433163676, + "loss": 1.0833, + "step": 18740 + }, + { + "epoch": 0.4812163287708166, + "grad_norm": 0.8671875, + "learning_rate": 0.00016236144458417026, + "loss": 1.0066, + "step": 18741 + }, + { + "epoch": 0.4812420059667384, + "grad_norm": 0.75, + "learning_rate": 0.000162357954712423, + "loss": 0.8869, + "step": 18742 + }, + { + "epoch": 0.4812676831626602, + "grad_norm": 0.76171875, + "learning_rate": 0.00016235446471640197, + "loss": 0.9139, + "step": 18743 + }, + { + "epoch": 0.48129336035858206, + "grad_norm": 0.87890625, + "learning_rate": 0.00016235097459611411, + "loss": 0.9836, + "step": 18744 + }, + { + "epoch": 0.48131903755450384, + "grad_norm": 0.75, + "learning_rate": 0.00016234748435156645, + "loss": 0.7787, + "step": 18745 + }, + { + "epoch": 0.4813447147504257, + "grad_norm": 0.8203125, + "learning_rate": 0.00016234399398276585, + "loss": 0.924, + "step": 18746 + }, + { + "epoch": 0.4813703919463475, + "grad_norm": 0.734375, + "learning_rate": 0.00016234050348971928, + "loss": 0.8618, + "step": 18747 + }, + { + "epoch": 0.48139606914226934, + "grad_norm": 0.73828125, + "learning_rate": 0.00016233701287243375, + "loss": 1.037, + "step": 18748 + }, + { + "epoch": 0.4814217463381911, + "grad_norm": 0.7265625, + "learning_rate": 0.00016233352213091616, + "loss": 0.8167, + "step": 18749 + }, + { + "epoch": 0.48144742353411296, + "grad_norm": 0.7578125, + "learning_rate": 0.00016233003126517353, + "loss": 0.8676, + "step": 18750 + }, + { + "epoch": 0.4814731007300348, + "grad_norm": 0.78515625, + "learning_rate": 0.00016232654027521272, + "loss": 0.8998, + "step": 18751 + }, + { + "epoch": 0.4814987779259566, + "grad_norm": 0.77734375, + "learning_rate": 0.0001623230491610408, + "loss": 0.8739, + "step": 18752 + }, + { + "epoch": 0.4815244551218784, + "grad_norm": 0.74609375, + "learning_rate": 0.00016231955792266465, + "loss": 0.9403, + "step": 18753 + }, + { + "epoch": 0.48155013231780025, + "grad_norm": 0.79296875, + "learning_rate": 0.00016231606656009123, + "loss": 0.8745, + "step": 18754 + }, + { + "epoch": 0.48157580951372203, + "grad_norm": 0.7890625, + "learning_rate": 0.00016231257507332757, + "loss": 0.8282, + "step": 18755 + }, + { + "epoch": 0.48160148670964387, + "grad_norm": 0.7890625, + "learning_rate": 0.00016230908346238053, + "loss": 0.9525, + "step": 18756 + }, + { + "epoch": 0.4816271639055657, + "grad_norm": 0.81640625, + "learning_rate": 0.00016230559172725713, + "loss": 0.9383, + "step": 18757 + }, + { + "epoch": 0.48165284110148754, + "grad_norm": 0.78515625, + "learning_rate": 0.00016230209986796429, + "loss": 0.9426, + "step": 18758 + }, + { + "epoch": 0.4816785182974093, + "grad_norm": 0.78125, + "learning_rate": 0.00016229860788450904, + "loss": 0.9126, + "step": 18759 + }, + { + "epoch": 0.48170419549333116, + "grad_norm": 0.734375, + "learning_rate": 0.00016229511577689826, + "loss": 0.8382, + "step": 18760 + }, + { + "epoch": 0.481729872689253, + "grad_norm": 0.78515625, + "learning_rate": 0.00016229162354513895, + "loss": 0.8352, + "step": 18761 + }, + { + "epoch": 0.48175554988517477, + "grad_norm": 0.71875, + "learning_rate": 0.00016228813118923804, + "loss": 0.861, + "step": 18762 + }, + { + "epoch": 0.4817812270810966, + "grad_norm": 0.75, + "learning_rate": 0.00016228463870920256, + "loss": 0.8398, + "step": 18763 + }, + { + "epoch": 0.48180690427701844, + "grad_norm": 0.76171875, + "learning_rate": 0.00016228114610503938, + "loss": 0.8099, + "step": 18764 + }, + { + "epoch": 0.4818325814729402, + "grad_norm": 0.8203125, + "learning_rate": 0.0001622776533767555, + "loss": 1.0559, + "step": 18765 + }, + { + "epoch": 0.48185825866886206, + "grad_norm": 0.80078125, + "learning_rate": 0.0001622741605243579, + "loss": 0.8951, + "step": 18766 + }, + { + "epoch": 0.4818839358647839, + "grad_norm": 0.8203125, + "learning_rate": 0.00016227066754785353, + "loss": 0.8948, + "step": 18767 + }, + { + "epoch": 0.48190961306070573, + "grad_norm": 0.765625, + "learning_rate": 0.00016226717444724935, + "loss": 0.9425, + "step": 18768 + }, + { + "epoch": 0.4819352902566275, + "grad_norm": 0.75390625, + "learning_rate": 0.0001622636812225523, + "loss": 0.9405, + "step": 18769 + }, + { + "epoch": 0.48196096745254935, + "grad_norm": 0.796875, + "learning_rate": 0.00016226018787376934, + "loss": 0.8811, + "step": 18770 + }, + { + "epoch": 0.4819866446484712, + "grad_norm": 0.7890625, + "learning_rate": 0.00016225669440090746, + "loss": 0.9523, + "step": 18771 + }, + { + "epoch": 0.48201232184439297, + "grad_norm": 0.76171875, + "learning_rate": 0.0001622532008039736, + "loss": 0.8881, + "step": 18772 + }, + { + "epoch": 0.4820379990403148, + "grad_norm": 0.8046875, + "learning_rate": 0.00016224970708297477, + "loss": 0.8237, + "step": 18773 + }, + { + "epoch": 0.48206367623623664, + "grad_norm": 0.75, + "learning_rate": 0.0001622462132379179, + "loss": 0.7616, + "step": 18774 + }, + { + "epoch": 0.4820893534321584, + "grad_norm": 0.73046875, + "learning_rate": 0.0001622427192688099, + "loss": 0.8974, + "step": 18775 + }, + { + "epoch": 0.48211503062808025, + "grad_norm": 0.7421875, + "learning_rate": 0.00016223922517565783, + "loss": 0.8209, + "step": 18776 + }, + { + "epoch": 0.4821407078240021, + "grad_norm": 0.78125, + "learning_rate": 0.00016223573095846858, + "loss": 1.0561, + "step": 18777 + }, + { + "epoch": 0.4821663850199239, + "grad_norm": 0.8125, + "learning_rate": 0.00016223223661724915, + "loss": 0.872, + "step": 18778 + }, + { + "epoch": 0.4821920622158457, + "grad_norm": 0.81640625, + "learning_rate": 0.0001622287421520065, + "loss": 1.0209, + "step": 18779 + }, + { + "epoch": 0.48221773941176754, + "grad_norm": 0.734375, + "learning_rate": 0.00016222524756274756, + "loss": 0.8857, + "step": 18780 + }, + { + "epoch": 0.4822434166076894, + "grad_norm": 0.83203125, + "learning_rate": 0.00016222175284947936, + "loss": 0.8159, + "step": 18781 + }, + { + "epoch": 0.48226909380361116, + "grad_norm": 0.76953125, + "learning_rate": 0.0001622182580122088, + "loss": 0.8489, + "step": 18782 + }, + { + "epoch": 0.482294770999533, + "grad_norm": 0.6640625, + "learning_rate": 0.00016221476305094287, + "loss": 0.8031, + "step": 18783 + }, + { + "epoch": 0.48232044819545483, + "grad_norm": 0.72265625, + "learning_rate": 0.00016221126796568856, + "loss": 0.83, + "step": 18784 + }, + { + "epoch": 0.4823461253913766, + "grad_norm": 0.734375, + "learning_rate": 0.00016220777275645284, + "loss": 0.9219, + "step": 18785 + }, + { + "epoch": 0.48237180258729845, + "grad_norm": 0.8515625, + "learning_rate": 0.00016220427742324258, + "loss": 0.9328, + "step": 18786 + }, + { + "epoch": 0.4823974797832203, + "grad_norm": 0.83984375, + "learning_rate": 0.00016220078196606483, + "loss": 1.0077, + "step": 18787 + }, + { + "epoch": 0.4824231569791421, + "grad_norm": 0.77734375, + "learning_rate": 0.00016219728638492656, + "loss": 0.8392, + "step": 18788 + }, + { + "epoch": 0.4824488341750639, + "grad_norm": 0.79296875, + "learning_rate": 0.0001621937906798347, + "loss": 0.8055, + "step": 18789 + }, + { + "epoch": 0.48247451137098574, + "grad_norm": 0.765625, + "learning_rate": 0.00016219029485079625, + "loss": 0.9155, + "step": 18790 + }, + { + "epoch": 0.4825001885669076, + "grad_norm": 0.76953125, + "learning_rate": 0.00016218679889781816, + "loss": 1.0621, + "step": 18791 + }, + { + "epoch": 0.48252586576282935, + "grad_norm": 0.8359375, + "learning_rate": 0.0001621833028209074, + "loss": 1.0242, + "step": 18792 + }, + { + "epoch": 0.4825515429587512, + "grad_norm": 0.79296875, + "learning_rate": 0.0001621798066200709, + "loss": 0.9272, + "step": 18793 + }, + { + "epoch": 0.482577220154673, + "grad_norm": 0.6953125, + "learning_rate": 0.0001621763102953157, + "loss": 0.8683, + "step": 18794 + }, + { + "epoch": 0.4826028973505948, + "grad_norm": 0.8125, + "learning_rate": 0.00016217281384664872, + "loss": 0.8884, + "step": 18795 + }, + { + "epoch": 0.48262857454651664, + "grad_norm": 0.7890625, + "learning_rate": 0.00016216931727407695, + "loss": 0.8543, + "step": 18796 + }, + { + "epoch": 0.4826542517424385, + "grad_norm": 0.7421875, + "learning_rate": 0.0001621658205776073, + "loss": 1.0132, + "step": 18797 + }, + { + "epoch": 0.48267992893836026, + "grad_norm": 0.8125, + "learning_rate": 0.00016216232375724683, + "loss": 0.9487, + "step": 18798 + }, + { + "epoch": 0.4827056061342821, + "grad_norm": 0.81640625, + "learning_rate": 0.00016215882681300245, + "loss": 0.9403, + "step": 18799 + }, + { + "epoch": 0.48273128333020393, + "grad_norm": 0.7265625, + "learning_rate": 0.00016215532974488115, + "loss": 0.89, + "step": 18800 + }, + { + "epoch": 0.48275696052612577, + "grad_norm": 0.6796875, + "learning_rate": 0.00016215183255288986, + "loss": 0.8493, + "step": 18801 + }, + { + "epoch": 0.48278263772204755, + "grad_norm": 0.76171875, + "learning_rate": 0.00016214833523703562, + "loss": 1.0209, + "step": 18802 + }, + { + "epoch": 0.4828083149179694, + "grad_norm": 0.78515625, + "learning_rate": 0.00016214483779732533, + "loss": 0.8271, + "step": 18803 + }, + { + "epoch": 0.4828339921138912, + "grad_norm": 0.76953125, + "learning_rate": 0.00016214134023376602, + "loss": 0.8131, + "step": 18804 + }, + { + "epoch": 0.482859669309813, + "grad_norm": 0.734375, + "learning_rate": 0.00016213784254636462, + "loss": 0.8276, + "step": 18805 + }, + { + "epoch": 0.48288534650573484, + "grad_norm": 0.8046875, + "learning_rate": 0.00016213434473512812, + "loss": 0.8834, + "step": 18806 + }, + { + "epoch": 0.48291102370165667, + "grad_norm": 0.7578125, + "learning_rate": 0.00016213084680006347, + "loss": 0.9076, + "step": 18807 + }, + { + "epoch": 0.48293670089757845, + "grad_norm": 0.7265625, + "learning_rate": 0.00016212734874117767, + "loss": 0.9025, + "step": 18808 + }, + { + "epoch": 0.4829623780935003, + "grad_norm": 0.8359375, + "learning_rate": 0.00016212385055847767, + "loss": 0.8187, + "step": 18809 + }, + { + "epoch": 0.4829880552894221, + "grad_norm": 0.73828125, + "learning_rate": 0.00016212035225197043, + "loss": 0.8498, + "step": 18810 + }, + { + "epoch": 0.48301373248534396, + "grad_norm": 0.88671875, + "learning_rate": 0.00016211685382166297, + "loss": 0.8596, + "step": 18811 + }, + { + "epoch": 0.48303940968126574, + "grad_norm": 0.76171875, + "learning_rate": 0.00016211335526756222, + "loss": 0.8626, + "step": 18812 + }, + { + "epoch": 0.4830650868771876, + "grad_norm": 0.79296875, + "learning_rate": 0.00016210985658967516, + "loss": 0.9513, + "step": 18813 + }, + { + "epoch": 0.4830907640731094, + "grad_norm": 0.7421875, + "learning_rate": 0.00016210635778800877, + "loss": 0.7822, + "step": 18814 + }, + { + "epoch": 0.4831164412690312, + "grad_norm": 0.76953125, + "learning_rate": 0.00016210285886257003, + "loss": 0.9141, + "step": 18815 + }, + { + "epoch": 0.48314211846495303, + "grad_norm": 0.83984375, + "learning_rate": 0.00016209935981336586, + "loss": 0.972, + "step": 18816 + }, + { + "epoch": 0.48316779566087487, + "grad_norm": 0.75390625, + "learning_rate": 0.00016209586064040333, + "loss": 0.9362, + "step": 18817 + }, + { + "epoch": 0.48319347285679665, + "grad_norm": 0.75390625, + "learning_rate": 0.00016209236134368934, + "loss": 0.9102, + "step": 18818 + }, + { + "epoch": 0.4832191500527185, + "grad_norm": 0.83203125, + "learning_rate": 0.00016208886192323085, + "loss": 1.0498, + "step": 18819 + }, + { + "epoch": 0.4832448272486403, + "grad_norm": 0.82421875, + "learning_rate": 0.0001620853623790349, + "loss": 0.9006, + "step": 18820 + }, + { + "epoch": 0.48327050444456215, + "grad_norm": 0.78125, + "learning_rate": 0.00016208186271110843, + "loss": 1.0008, + "step": 18821 + }, + { + "epoch": 0.48329618164048394, + "grad_norm": 0.8125, + "learning_rate": 0.0001620783629194584, + "loss": 0.9501, + "step": 18822 + }, + { + "epoch": 0.48332185883640577, + "grad_norm": 0.71484375, + "learning_rate": 0.00016207486300409178, + "loss": 0.8328, + "step": 18823 + }, + { + "epoch": 0.4833475360323276, + "grad_norm": 0.69140625, + "learning_rate": 0.0001620713629650156, + "loss": 0.9464, + "step": 18824 + }, + { + "epoch": 0.4833732132282494, + "grad_norm": 0.828125, + "learning_rate": 0.0001620678628022368, + "loss": 0.9951, + "step": 18825 + }, + { + "epoch": 0.4833988904241712, + "grad_norm": 0.74609375, + "learning_rate": 0.00016206436251576235, + "loss": 0.9887, + "step": 18826 + }, + { + "epoch": 0.48342456762009306, + "grad_norm": 0.84375, + "learning_rate": 0.00016206086210559923, + "loss": 1.0115, + "step": 18827 + }, + { + "epoch": 0.48345024481601484, + "grad_norm": 0.71875, + "learning_rate": 0.00016205736157175443, + "loss": 0.9834, + "step": 18828 + }, + { + "epoch": 0.4834759220119367, + "grad_norm": 0.78515625, + "learning_rate": 0.0001620538609142349, + "loss": 0.8472, + "step": 18829 + }, + { + "epoch": 0.4835015992078585, + "grad_norm": 0.75, + "learning_rate": 0.0001620503601330476, + "loss": 0.8186, + "step": 18830 + }, + { + "epoch": 0.48352727640378035, + "grad_norm": 0.78515625, + "learning_rate": 0.0001620468592281996, + "loss": 0.9501, + "step": 18831 + }, + { + "epoch": 0.48355295359970213, + "grad_norm": 0.76171875, + "learning_rate": 0.00016204335819969777, + "loss": 1.0183, + "step": 18832 + }, + { + "epoch": 0.48357863079562396, + "grad_norm": 0.76953125, + "learning_rate": 0.00016203985704754917, + "loss": 0.8354, + "step": 18833 + }, + { + "epoch": 0.4836043079915458, + "grad_norm": 0.71875, + "learning_rate": 0.00016203635577176072, + "loss": 0.8833, + "step": 18834 + }, + { + "epoch": 0.4836299851874676, + "grad_norm": 0.84375, + "learning_rate": 0.0001620328543723394, + "loss": 0.9012, + "step": 18835 + }, + { + "epoch": 0.4836556623833894, + "grad_norm": 0.7265625, + "learning_rate": 0.00016202935284929221, + "loss": 0.8775, + "step": 18836 + }, + { + "epoch": 0.48368133957931125, + "grad_norm": 0.7890625, + "learning_rate": 0.00016202585120262612, + "loss": 0.9771, + "step": 18837 + }, + { + "epoch": 0.48370701677523303, + "grad_norm": 0.765625, + "learning_rate": 0.00016202234943234816, + "loss": 0.9898, + "step": 18838 + }, + { + "epoch": 0.48373269397115487, + "grad_norm": 0.7578125, + "learning_rate": 0.00016201884753846518, + "loss": 0.9059, + "step": 18839 + }, + { + "epoch": 0.4837583711670767, + "grad_norm": 0.8125, + "learning_rate": 0.0001620153455209843, + "loss": 1.0341, + "step": 18840 + }, + { + "epoch": 0.48378404836299854, + "grad_norm": 2.875, + "learning_rate": 0.0001620118433799124, + "loss": 0.8199, + "step": 18841 + }, + { + "epoch": 0.4838097255589203, + "grad_norm": 0.8359375, + "learning_rate": 0.0001620083411152565, + "loss": 0.9497, + "step": 18842 + }, + { + "epoch": 0.48383540275484216, + "grad_norm": 0.76171875, + "learning_rate": 0.0001620048387270236, + "loss": 0.9509, + "step": 18843 + }, + { + "epoch": 0.483861079950764, + "grad_norm": 0.82421875, + "learning_rate": 0.00016200133621522067, + "loss": 0.9581, + "step": 18844 + }, + { + "epoch": 0.4838867571466858, + "grad_norm": 0.7890625, + "learning_rate": 0.00016199783357985465, + "loss": 1.0322, + "step": 18845 + }, + { + "epoch": 0.4839124343426076, + "grad_norm": 0.78515625, + "learning_rate": 0.00016199433082093257, + "loss": 0.8902, + "step": 18846 + }, + { + "epoch": 0.48393811153852945, + "grad_norm": 0.75, + "learning_rate": 0.00016199082793846137, + "loss": 0.7742, + "step": 18847 + }, + { + "epoch": 0.48396378873445123, + "grad_norm": 0.703125, + "learning_rate": 0.00016198732493244805, + "loss": 0.7989, + "step": 18848 + }, + { + "epoch": 0.48398946593037306, + "grad_norm": 0.7265625, + "learning_rate": 0.00016198382180289959, + "loss": 0.8577, + "step": 18849 + }, + { + "epoch": 0.4840151431262949, + "grad_norm": 0.77734375, + "learning_rate": 0.00016198031854982298, + "loss": 0.8654, + "step": 18850 + }, + { + "epoch": 0.48404082032221674, + "grad_norm": 0.7890625, + "learning_rate": 0.00016197681517322518, + "loss": 0.8606, + "step": 18851 + }, + { + "epoch": 0.4840664975181385, + "grad_norm": 0.734375, + "learning_rate": 0.0001619733116731132, + "loss": 0.9202, + "step": 18852 + }, + { + "epoch": 0.48409217471406035, + "grad_norm": 0.7578125, + "learning_rate": 0.00016196980804949402, + "loss": 0.8777, + "step": 18853 + }, + { + "epoch": 0.4841178519099822, + "grad_norm": 0.81640625, + "learning_rate": 0.00016196630430237457, + "loss": 0.9724, + "step": 18854 + }, + { + "epoch": 0.48414352910590397, + "grad_norm": 0.71484375, + "learning_rate": 0.00016196280043176192, + "loss": 0.9412, + "step": 18855 + }, + { + "epoch": 0.4841692063018258, + "grad_norm": 0.7421875, + "learning_rate": 0.00016195929643766298, + "loss": 0.7658, + "step": 18856 + }, + { + "epoch": 0.48419488349774764, + "grad_norm": 0.859375, + "learning_rate": 0.0001619557923200848, + "loss": 0.9083, + "step": 18857 + }, + { + "epoch": 0.4842205606936694, + "grad_norm": 0.7578125, + "learning_rate": 0.00016195228807903424, + "loss": 0.7915, + "step": 18858 + }, + { + "epoch": 0.48424623788959126, + "grad_norm": 0.7890625, + "learning_rate": 0.00016194878371451841, + "loss": 1.0727, + "step": 18859 + }, + { + "epoch": 0.4842719150855131, + "grad_norm": 1.1171875, + "learning_rate": 0.00016194527922654427, + "loss": 0.9109, + "step": 18860 + }, + { + "epoch": 0.48429759228143493, + "grad_norm": 0.859375, + "learning_rate": 0.00016194177461511874, + "loss": 0.7787, + "step": 18861 + }, + { + "epoch": 0.4843232694773567, + "grad_norm": 0.75390625, + "learning_rate": 0.00016193826988024888, + "loss": 0.8665, + "step": 18862 + }, + { + "epoch": 0.48434894667327855, + "grad_norm": 0.76953125, + "learning_rate": 0.00016193476502194163, + "loss": 0.9508, + "step": 18863 + }, + { + "epoch": 0.4843746238692004, + "grad_norm": 0.83984375, + "learning_rate": 0.000161931260040204, + "loss": 0.9347, + "step": 18864 + }, + { + "epoch": 0.48440030106512216, + "grad_norm": 0.7578125, + "learning_rate": 0.00016192775493504294, + "loss": 0.9368, + "step": 18865 + }, + { + "epoch": 0.484425978261044, + "grad_norm": 0.7109375, + "learning_rate": 0.0001619242497064655, + "loss": 0.8922, + "step": 18866 + }, + { + "epoch": 0.48445165545696584, + "grad_norm": 0.75, + "learning_rate": 0.00016192074435447858, + "loss": 0.8237, + "step": 18867 + }, + { + "epoch": 0.4844773326528876, + "grad_norm": 0.828125, + "learning_rate": 0.00016191723887908923, + "loss": 0.9911, + "step": 18868 + }, + { + "epoch": 0.48450300984880945, + "grad_norm": 0.76171875, + "learning_rate": 0.00016191373328030442, + "loss": 0.9361, + "step": 18869 + }, + { + "epoch": 0.4845286870447313, + "grad_norm": 0.7734375, + "learning_rate": 0.00016191022755813114, + "loss": 0.9967, + "step": 18870 + }, + { + "epoch": 0.4845543642406531, + "grad_norm": 0.796875, + "learning_rate": 0.00016190672171257636, + "loss": 0.8448, + "step": 18871 + }, + { + "epoch": 0.4845800414365749, + "grad_norm": 0.8046875, + "learning_rate": 0.00016190321574364704, + "loss": 0.8515, + "step": 18872 + }, + { + "epoch": 0.48460571863249674, + "grad_norm": 0.75, + "learning_rate": 0.00016189970965135023, + "loss": 0.9004, + "step": 18873 + }, + { + "epoch": 0.4846313958284186, + "grad_norm": 0.7734375, + "learning_rate": 0.00016189620343569293, + "loss": 0.9228, + "step": 18874 + }, + { + "epoch": 0.48465707302434036, + "grad_norm": 0.7265625, + "learning_rate": 0.00016189269709668202, + "loss": 0.8517, + "step": 18875 + }, + { + "epoch": 0.4846827502202622, + "grad_norm": 0.76171875, + "learning_rate": 0.0001618891906343246, + "loss": 0.8272, + "step": 18876 + }, + { + "epoch": 0.48470842741618403, + "grad_norm": 0.796875, + "learning_rate": 0.0001618856840486276, + "loss": 0.975, + "step": 18877 + }, + { + "epoch": 0.4847341046121058, + "grad_norm": 0.8203125, + "learning_rate": 0.00016188217733959802, + "loss": 0.8599, + "step": 18878 + }, + { + "epoch": 0.48475978180802765, + "grad_norm": 0.6953125, + "learning_rate": 0.00016187867050724287, + "loss": 0.7769, + "step": 18879 + }, + { + "epoch": 0.4847854590039495, + "grad_norm": 0.8046875, + "learning_rate": 0.0001618751635515691, + "loss": 0.9432, + "step": 18880 + }, + { + "epoch": 0.4848111361998713, + "grad_norm": 0.76953125, + "learning_rate": 0.00016187165647258369, + "loss": 0.9424, + "step": 18881 + }, + { + "epoch": 0.4848368133957931, + "grad_norm": 0.80078125, + "learning_rate": 0.00016186814927029368, + "loss": 0.7972, + "step": 18882 + }, + { + "epoch": 0.48486249059171493, + "grad_norm": 0.78515625, + "learning_rate": 0.00016186464194470606, + "loss": 0.9941, + "step": 18883 + }, + { + "epoch": 0.48488816778763677, + "grad_norm": 0.8203125, + "learning_rate": 0.0001618611344958278, + "loss": 0.8362, + "step": 18884 + }, + { + "epoch": 0.48491384498355855, + "grad_norm": 0.86328125, + "learning_rate": 0.00016185762692366585, + "loss": 0.9435, + "step": 18885 + }, + { + "epoch": 0.4849395221794804, + "grad_norm": 0.82421875, + "learning_rate": 0.00016185411922822724, + "loss": 0.9439, + "step": 18886 + }, + { + "epoch": 0.4849651993754022, + "grad_norm": 0.7890625, + "learning_rate": 0.000161850611409519, + "loss": 0.9115, + "step": 18887 + }, + { + "epoch": 0.484990876571324, + "grad_norm": 0.796875, + "learning_rate": 0.00016184710346754804, + "loss": 0.8569, + "step": 18888 + }, + { + "epoch": 0.48501655376724584, + "grad_norm": 0.6953125, + "learning_rate": 0.0001618435954023214, + "loss": 0.8476, + "step": 18889 + }, + { + "epoch": 0.4850422309631677, + "grad_norm": 0.80078125, + "learning_rate": 0.00016184008721384607, + "loss": 0.9976, + "step": 18890 + }, + { + "epoch": 0.4850679081590895, + "grad_norm": 0.74609375, + "learning_rate": 0.000161836578902129, + "loss": 0.855, + "step": 18891 + }, + { + "epoch": 0.4850935853550113, + "grad_norm": 0.84375, + "learning_rate": 0.00016183307046717724, + "loss": 0.8363, + "step": 18892 + }, + { + "epoch": 0.48511926255093313, + "grad_norm": 0.75390625, + "learning_rate": 0.00016182956190899776, + "loss": 1.0017, + "step": 18893 + }, + { + "epoch": 0.48514493974685496, + "grad_norm": 0.70703125, + "learning_rate": 0.00016182605322759755, + "loss": 0.7704, + "step": 18894 + }, + { + "epoch": 0.48517061694277674, + "grad_norm": 0.78125, + "learning_rate": 0.00016182254442298358, + "loss": 0.8956, + "step": 18895 + }, + { + "epoch": 0.4851962941386986, + "grad_norm": 0.8359375, + "learning_rate": 0.00016181903549516288, + "loss": 0.9022, + "step": 18896 + }, + { + "epoch": 0.4852219713346204, + "grad_norm": 0.734375, + "learning_rate": 0.00016181552644414242, + "loss": 0.9071, + "step": 18897 + }, + { + "epoch": 0.4852476485305422, + "grad_norm": 0.79296875, + "learning_rate": 0.0001618120172699292, + "loss": 0.9304, + "step": 18898 + }, + { + "epoch": 0.48527332572646403, + "grad_norm": 0.76171875, + "learning_rate": 0.00016180850797253022, + "loss": 0.9089, + "step": 18899 + }, + { + "epoch": 0.48529900292238587, + "grad_norm": 0.765625, + "learning_rate": 0.00016180499855195245, + "loss": 0.9891, + "step": 18900 + }, + { + "epoch": 0.4853246801183077, + "grad_norm": 0.76171875, + "learning_rate": 0.0001618014890082029, + "loss": 1.0368, + "step": 18901 + }, + { + "epoch": 0.4853503573142295, + "grad_norm": 0.71875, + "learning_rate": 0.00016179797934128858, + "loss": 0.931, + "step": 18902 + }, + { + "epoch": 0.4853760345101513, + "grad_norm": 0.7734375, + "learning_rate": 0.00016179446955121647, + "loss": 0.8494, + "step": 18903 + }, + { + "epoch": 0.48540171170607316, + "grad_norm": 0.80859375, + "learning_rate": 0.0001617909596379936, + "loss": 1.0426, + "step": 18904 + }, + { + "epoch": 0.48542738890199494, + "grad_norm": 0.8671875, + "learning_rate": 0.00016178744960162686, + "loss": 0.9184, + "step": 18905 + }, + { + "epoch": 0.4854530660979168, + "grad_norm": 0.76171875, + "learning_rate": 0.00016178393944212336, + "loss": 0.8114, + "step": 18906 + }, + { + "epoch": 0.4854787432938386, + "grad_norm": 0.71875, + "learning_rate": 0.00016178042915949002, + "loss": 0.8274, + "step": 18907 + }, + { + "epoch": 0.4855044204897604, + "grad_norm": 0.796875, + "learning_rate": 0.00016177691875373388, + "loss": 0.8855, + "step": 18908 + }, + { + "epoch": 0.4855300976856822, + "grad_norm": 0.88671875, + "learning_rate": 0.0001617734082248619, + "loss": 0.8909, + "step": 18909 + }, + { + "epoch": 0.48555577488160406, + "grad_norm": 0.78125, + "learning_rate": 0.00016176989757288115, + "loss": 0.9525, + "step": 18910 + }, + { + "epoch": 0.4855814520775259, + "grad_norm": 0.76171875, + "learning_rate": 0.00016176638679779853, + "loss": 0.6793, + "step": 18911 + }, + { + "epoch": 0.4856071292734477, + "grad_norm": 0.7890625, + "learning_rate": 0.00016176287589962106, + "loss": 0.8944, + "step": 18912 + }, + { + "epoch": 0.4856328064693695, + "grad_norm": 0.890625, + "learning_rate": 0.0001617593648783558, + "loss": 0.8193, + "step": 18913 + }, + { + "epoch": 0.48565848366529135, + "grad_norm": 0.828125, + "learning_rate": 0.0001617558537340097, + "loss": 0.8645, + "step": 18914 + }, + { + "epoch": 0.48568416086121313, + "grad_norm": 0.8046875, + "learning_rate": 0.00016175234246658973, + "loss": 0.9014, + "step": 18915 + }, + { + "epoch": 0.48570983805713497, + "grad_norm": 0.796875, + "learning_rate": 0.00016174883107610298, + "loss": 0.9175, + "step": 18916 + }, + { + "epoch": 0.4857355152530568, + "grad_norm": 0.80078125, + "learning_rate": 0.00016174531956255635, + "loss": 0.9745, + "step": 18917 + }, + { + "epoch": 0.4857611924489786, + "grad_norm": 0.8125, + "learning_rate": 0.00016174180792595687, + "loss": 0.8731, + "step": 18918 + }, + { + "epoch": 0.4857868696449004, + "grad_norm": 0.8984375, + "learning_rate": 0.00016173829616631156, + "loss": 0.9309, + "step": 18919 + }, + { + "epoch": 0.48581254684082226, + "grad_norm": 0.8046875, + "learning_rate": 0.00016173478428362737, + "loss": 0.9915, + "step": 18920 + }, + { + "epoch": 0.4858382240367441, + "grad_norm": 0.78515625, + "learning_rate": 0.00016173127227791137, + "loss": 0.84, + "step": 18921 + }, + { + "epoch": 0.4858639012326659, + "grad_norm": 0.73828125, + "learning_rate": 0.00016172776014917052, + "loss": 0.8434, + "step": 18922 + }, + { + "epoch": 0.4858895784285877, + "grad_norm": 0.78125, + "learning_rate": 0.0001617242478974118, + "loss": 0.8372, + "step": 18923 + }, + { + "epoch": 0.48591525562450955, + "grad_norm": 0.73828125, + "learning_rate": 0.0001617207355226422, + "loss": 0.9462, + "step": 18924 + }, + { + "epoch": 0.4859409328204313, + "grad_norm": 0.83203125, + "learning_rate": 0.00016171722302486883, + "loss": 1.0483, + "step": 18925 + }, + { + "epoch": 0.48596661001635316, + "grad_norm": 0.72265625, + "learning_rate": 0.00016171371040409852, + "loss": 0.7946, + "step": 18926 + }, + { + "epoch": 0.485992287212275, + "grad_norm": 0.8125, + "learning_rate": 0.00016171019766033842, + "loss": 0.734, + "step": 18927 + }, + { + "epoch": 0.4860179644081968, + "grad_norm": 0.7890625, + "learning_rate": 0.00016170668479359546, + "loss": 0.9586, + "step": 18928 + }, + { + "epoch": 0.4860436416041186, + "grad_norm": 0.8984375, + "learning_rate": 0.00016170317180387665, + "loss": 1.0004, + "step": 18929 + }, + { + "epoch": 0.48606931880004045, + "grad_norm": 0.80859375, + "learning_rate": 0.00016169965869118898, + "loss": 1.0196, + "step": 18930 + }, + { + "epoch": 0.4860949959959623, + "grad_norm": 0.78125, + "learning_rate": 0.00016169614545553945, + "loss": 0.8728, + "step": 18931 + }, + { + "epoch": 0.48612067319188407, + "grad_norm": 0.87109375, + "learning_rate": 0.0001616926320969351, + "loss": 0.8595, + "step": 18932 + }, + { + "epoch": 0.4861463503878059, + "grad_norm": 0.8125, + "learning_rate": 0.0001616891186153829, + "loss": 0.9376, + "step": 18933 + }, + { + "epoch": 0.48617202758372774, + "grad_norm": 0.79296875, + "learning_rate": 0.00016168560501088987, + "loss": 0.9488, + "step": 18934 + }, + { + "epoch": 0.4861977047796495, + "grad_norm": 0.84765625, + "learning_rate": 0.00016168209128346298, + "loss": 0.9679, + "step": 18935 + }, + { + "epoch": 0.48622338197557136, + "grad_norm": 0.703125, + "learning_rate": 0.00016167857743310928, + "loss": 0.8232, + "step": 18936 + }, + { + "epoch": 0.4862490591714932, + "grad_norm": 0.82421875, + "learning_rate": 0.00016167506345983573, + "loss": 0.8618, + "step": 18937 + }, + { + "epoch": 0.486274736367415, + "grad_norm": 0.89453125, + "learning_rate": 0.00016167154936364936, + "loss": 0.8839, + "step": 18938 + }, + { + "epoch": 0.4863004135633368, + "grad_norm": 0.72265625, + "learning_rate": 0.00016166803514455714, + "loss": 0.8185, + "step": 18939 + }, + { + "epoch": 0.48632609075925864, + "grad_norm": 0.72265625, + "learning_rate": 0.00016166452080256614, + "loss": 0.8693, + "step": 18940 + }, + { + "epoch": 0.4863517679551805, + "grad_norm": 0.765625, + "learning_rate": 0.0001616610063376833, + "loss": 0.8739, + "step": 18941 + }, + { + "epoch": 0.48637744515110226, + "grad_norm": 0.8203125, + "learning_rate": 0.00016165749174991564, + "loss": 1.0246, + "step": 18942 + }, + { + "epoch": 0.4864031223470241, + "grad_norm": 0.71484375, + "learning_rate": 0.00016165397703927017, + "loss": 0.8221, + "step": 18943 + }, + { + "epoch": 0.48642879954294593, + "grad_norm": 0.796875, + "learning_rate": 0.00016165046220575387, + "loss": 0.8178, + "step": 18944 + }, + { + "epoch": 0.4864544767388677, + "grad_norm": 0.8671875, + "learning_rate": 0.0001616469472493738, + "loss": 1.0122, + "step": 18945 + }, + { + "epoch": 0.48648015393478955, + "grad_norm": 0.71875, + "learning_rate": 0.00016164343217013691, + "loss": 0.8282, + "step": 18946 + }, + { + "epoch": 0.4865058311307114, + "grad_norm": 0.75390625, + "learning_rate": 0.0001616399169680502, + "loss": 0.8478, + "step": 18947 + }, + { + "epoch": 0.48653150832663317, + "grad_norm": 0.7109375, + "learning_rate": 0.00016163640164312075, + "loss": 0.8884, + "step": 18948 + }, + { + "epoch": 0.486557185522555, + "grad_norm": 0.734375, + "learning_rate": 0.00016163288619535553, + "loss": 0.9424, + "step": 18949 + }, + { + "epoch": 0.48658286271847684, + "grad_norm": 0.84375, + "learning_rate": 0.0001616293706247615, + "loss": 0.9335, + "step": 18950 + }, + { + "epoch": 0.4866085399143987, + "grad_norm": 0.73828125, + "learning_rate": 0.00016162585493134573, + "loss": 0.8965, + "step": 18951 + }, + { + "epoch": 0.48663421711032046, + "grad_norm": 0.77734375, + "learning_rate": 0.00016162233911511519, + "loss": 0.8261, + "step": 18952 + }, + { + "epoch": 0.4866598943062423, + "grad_norm": 0.796875, + "learning_rate": 0.00016161882317607687, + "loss": 0.9863, + "step": 18953 + }, + { + "epoch": 0.4866855715021641, + "grad_norm": 0.78515625, + "learning_rate": 0.0001616153071142378, + "loss": 0.9123, + "step": 18954 + }, + { + "epoch": 0.4867112486980859, + "grad_norm": 0.71484375, + "learning_rate": 0.00016161179092960502, + "loss": 0.7399, + "step": 18955 + }, + { + "epoch": 0.48673692589400774, + "grad_norm": 0.796875, + "learning_rate": 0.0001616082746221855, + "loss": 0.8195, + "step": 18956 + }, + { + "epoch": 0.4867626030899296, + "grad_norm": 0.8125, + "learning_rate": 0.00016160475819198625, + "loss": 0.8499, + "step": 18957 + }, + { + "epoch": 0.48678828028585136, + "grad_norm": 0.875, + "learning_rate": 0.00016160124163901429, + "loss": 0.9595, + "step": 18958 + }, + { + "epoch": 0.4868139574817732, + "grad_norm": 0.78125, + "learning_rate": 0.0001615977249632766, + "loss": 0.8185, + "step": 18959 + }, + { + "epoch": 0.48683963467769503, + "grad_norm": 0.75, + "learning_rate": 0.00016159420816478018, + "loss": 0.95, + "step": 18960 + }, + { + "epoch": 0.48686531187361687, + "grad_norm": 0.84375, + "learning_rate": 0.00016159069124353212, + "loss": 0.8789, + "step": 18961 + }, + { + "epoch": 0.48689098906953865, + "grad_norm": 0.76953125, + "learning_rate": 0.00016158717419953935, + "loss": 1.0124, + "step": 18962 + }, + { + "epoch": 0.4869166662654605, + "grad_norm": 0.6953125, + "learning_rate": 0.00016158365703280894, + "loss": 0.7205, + "step": 18963 + }, + { + "epoch": 0.4869423434613823, + "grad_norm": 0.71484375, + "learning_rate": 0.00016158013974334784, + "loss": 0.8781, + "step": 18964 + }, + { + "epoch": 0.4869680206573041, + "grad_norm": 0.765625, + "learning_rate": 0.00016157662233116306, + "loss": 0.9608, + "step": 18965 + }, + { + "epoch": 0.48699369785322594, + "grad_norm": 0.85546875, + "learning_rate": 0.00016157310479626165, + "loss": 0.9751, + "step": 18966 + }, + { + "epoch": 0.4870193750491478, + "grad_norm": 0.7734375, + "learning_rate": 0.00016156958713865063, + "loss": 0.8846, + "step": 18967 + }, + { + "epoch": 0.48704505224506955, + "grad_norm": 0.90625, + "learning_rate": 0.00016156606935833695, + "loss": 0.9204, + "step": 18968 + }, + { + "epoch": 0.4870707294409914, + "grad_norm": 0.75390625, + "learning_rate": 0.0001615625514553277, + "loss": 0.9022, + "step": 18969 + }, + { + "epoch": 0.4870964066369132, + "grad_norm": 0.92578125, + "learning_rate": 0.00016155903342962978, + "loss": 0.9487, + "step": 18970 + }, + { + "epoch": 0.48712208383283506, + "grad_norm": 0.71484375, + "learning_rate": 0.00016155551528125033, + "loss": 0.8033, + "step": 18971 + }, + { + "epoch": 0.48714776102875684, + "grad_norm": 0.734375, + "learning_rate": 0.00016155199701019628, + "loss": 0.8575, + "step": 18972 + }, + { + "epoch": 0.4871734382246787, + "grad_norm": 0.8125, + "learning_rate": 0.00016154847861647465, + "loss": 0.9225, + "step": 18973 + }, + { + "epoch": 0.4871991154206005, + "grad_norm": 1.9375, + "learning_rate": 0.00016154496010009246, + "loss": 0.8502, + "step": 18974 + }, + { + "epoch": 0.4872247926165223, + "grad_norm": 0.7265625, + "learning_rate": 0.00016154144146105672, + "loss": 0.9138, + "step": 18975 + }, + { + "epoch": 0.48725046981244413, + "grad_norm": 0.984375, + "learning_rate": 0.00016153792269937446, + "loss": 0.8553, + "step": 18976 + }, + { + "epoch": 0.48727614700836597, + "grad_norm": 0.79296875, + "learning_rate": 0.00016153440381505267, + "loss": 0.9385, + "step": 18977 + }, + { + "epoch": 0.48730182420428775, + "grad_norm": 0.8203125, + "learning_rate": 0.00016153088480809835, + "loss": 0.8839, + "step": 18978 + }, + { + "epoch": 0.4873275014002096, + "grad_norm": 0.796875, + "learning_rate": 0.00016152736567851857, + "loss": 1.0908, + "step": 18979 + }, + { + "epoch": 0.4873531785961314, + "grad_norm": 0.8125, + "learning_rate": 0.0001615238464263203, + "loss": 0.89, + "step": 18980 + }, + { + "epoch": 0.48737885579205326, + "grad_norm": 0.72265625, + "learning_rate": 0.00016152032705151053, + "loss": 0.9489, + "step": 18981 + }, + { + "epoch": 0.48740453298797504, + "grad_norm": 0.83984375, + "learning_rate": 0.00016151680755409635, + "loss": 0.9952, + "step": 18982 + }, + { + "epoch": 0.4874302101838969, + "grad_norm": 0.8359375, + "learning_rate": 0.00016151328793408473, + "loss": 0.9048, + "step": 18983 + }, + { + "epoch": 0.4874558873798187, + "grad_norm": 0.78515625, + "learning_rate": 0.00016150976819148265, + "loss": 0.9806, + "step": 18984 + }, + { + "epoch": 0.4874815645757405, + "grad_norm": 0.78515625, + "learning_rate": 0.00016150624832629717, + "loss": 0.8669, + "step": 18985 + }, + { + "epoch": 0.4875072417716623, + "grad_norm": 0.734375, + "learning_rate": 0.00016150272833853532, + "loss": 0.861, + "step": 18986 + }, + { + "epoch": 0.48753291896758416, + "grad_norm": 0.734375, + "learning_rate": 0.00016149920822820403, + "loss": 0.8931, + "step": 18987 + }, + { + "epoch": 0.48755859616350594, + "grad_norm": 0.7734375, + "learning_rate": 0.0001614956879953104, + "loss": 0.9611, + "step": 18988 + }, + { + "epoch": 0.4875842733594278, + "grad_norm": 0.6875, + "learning_rate": 0.00016149216763986147, + "loss": 0.7637, + "step": 18989 + }, + { + "epoch": 0.4876099505553496, + "grad_norm": 0.8046875, + "learning_rate": 0.00016148864716186413, + "loss": 0.8779, + "step": 18990 + }, + { + "epoch": 0.48763562775127145, + "grad_norm": 0.75390625, + "learning_rate": 0.00016148512656132552, + "loss": 0.8364, + "step": 18991 + }, + { + "epoch": 0.48766130494719323, + "grad_norm": 0.71875, + "learning_rate": 0.00016148160583825258, + "loss": 0.935, + "step": 18992 + }, + { + "epoch": 0.48768698214311507, + "grad_norm": 0.73828125, + "learning_rate": 0.00016147808499265235, + "loss": 0.855, + "step": 18993 + }, + { + "epoch": 0.4877126593390369, + "grad_norm": 0.7578125, + "learning_rate": 0.00016147456402453188, + "loss": 0.8538, + "step": 18994 + }, + { + "epoch": 0.4877383365349587, + "grad_norm": 0.7890625, + "learning_rate": 0.00016147104293389814, + "loss": 0.8945, + "step": 18995 + }, + { + "epoch": 0.4877640137308805, + "grad_norm": 0.765625, + "learning_rate": 0.00016146752172075814, + "loss": 1.0018, + "step": 18996 + }, + { + "epoch": 0.48778969092680236, + "grad_norm": 0.8125, + "learning_rate": 0.00016146400038511896, + "loss": 0.9479, + "step": 18997 + }, + { + "epoch": 0.48781536812272414, + "grad_norm": 0.78515625, + "learning_rate": 0.00016146047892698758, + "loss": 0.9195, + "step": 18998 + }, + { + "epoch": 0.48784104531864597, + "grad_norm": 0.765625, + "learning_rate": 0.000161456957346371, + "loss": 0.9381, + "step": 18999 + }, + { + "epoch": 0.4878667225145678, + "grad_norm": 0.8046875, + "learning_rate": 0.00016145343564327624, + "loss": 0.8493, + "step": 19000 + }, + { + "epoch": 0.4878667225145678, + "eval_loss": 0.8943595886230469, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 406.8839, + "eval_samples_per_second": 24.577, + "eval_steps_per_second": 0.769, + "step": 19000 + }, + { + "epoch": 0.48789239971048964, + "grad_norm": 0.6953125, + "learning_rate": 0.00016144991381771036, + "loss": 0.9363, + "step": 19001 + }, + { + "epoch": 0.4879180769064114, + "grad_norm": 0.734375, + "learning_rate": 0.00016144639186968033, + "loss": 0.9164, + "step": 19002 + }, + { + "epoch": 0.48794375410233326, + "grad_norm": 0.75, + "learning_rate": 0.0001614428697991932, + "loss": 1.0026, + "step": 19003 + }, + { + "epoch": 0.4879694312982551, + "grad_norm": 0.80859375, + "learning_rate": 0.000161439347606256, + "loss": 0.8822, + "step": 19004 + }, + { + "epoch": 0.4879951084941769, + "grad_norm": 0.73046875, + "learning_rate": 0.00016143582529087568, + "loss": 0.8658, + "step": 19005 + }, + { + "epoch": 0.4880207856900987, + "grad_norm": 0.80859375, + "learning_rate": 0.00016143230285305937, + "loss": 0.931, + "step": 19006 + }, + { + "epoch": 0.48804646288602055, + "grad_norm": 0.80859375, + "learning_rate": 0.00016142878029281398, + "loss": 0.8566, + "step": 19007 + }, + { + "epoch": 0.48807214008194233, + "grad_norm": 0.80859375, + "learning_rate": 0.00016142525761014663, + "loss": 0.8301, + "step": 19008 + }, + { + "epoch": 0.48809781727786417, + "grad_norm": 0.81640625, + "learning_rate": 0.00016142173480506426, + "loss": 0.8058, + "step": 19009 + }, + { + "epoch": 0.488123494473786, + "grad_norm": 0.7578125, + "learning_rate": 0.00016141821187757392, + "loss": 0.9344, + "step": 19010 + }, + { + "epoch": 0.4881491716697078, + "grad_norm": 0.83984375, + "learning_rate": 0.00016141468882768263, + "loss": 0.9833, + "step": 19011 + }, + { + "epoch": 0.4881748488656296, + "grad_norm": 0.78125, + "learning_rate": 0.00016141116565539742, + "loss": 0.9849, + "step": 19012 + }, + { + "epoch": 0.48820052606155145, + "grad_norm": 0.875, + "learning_rate": 0.0001614076423607253, + "loss": 0.8869, + "step": 19013 + }, + { + "epoch": 0.4882262032574733, + "grad_norm": 0.74609375, + "learning_rate": 0.0001614041189436733, + "loss": 0.8002, + "step": 19014 + }, + { + "epoch": 0.48825188045339507, + "grad_norm": 0.78515625, + "learning_rate": 0.00016140059540424844, + "loss": 0.924, + "step": 19015 + }, + { + "epoch": 0.4882775576493169, + "grad_norm": 0.8828125, + "learning_rate": 0.00016139707174245773, + "loss": 0.9283, + "step": 19016 + }, + { + "epoch": 0.48830323484523874, + "grad_norm": 0.89453125, + "learning_rate": 0.0001613935479583082, + "loss": 0.9046, + "step": 19017 + }, + { + "epoch": 0.4883289120411605, + "grad_norm": 0.74609375, + "learning_rate": 0.00016139002405180688, + "loss": 0.9676, + "step": 19018 + }, + { + "epoch": 0.48835458923708236, + "grad_norm": 0.703125, + "learning_rate": 0.0001613865000229608, + "loss": 0.9597, + "step": 19019 + }, + { + "epoch": 0.4883802664330042, + "grad_norm": 0.86328125, + "learning_rate": 0.00016138297587177695, + "loss": 0.9514, + "step": 19020 + }, + { + "epoch": 0.488405943628926, + "grad_norm": 0.7109375, + "learning_rate": 0.00016137945159826238, + "loss": 0.7943, + "step": 19021 + }, + { + "epoch": 0.4884316208248478, + "grad_norm": 0.8515625, + "learning_rate": 0.0001613759272024241, + "loss": 0.9851, + "step": 19022 + }, + { + "epoch": 0.48845729802076965, + "grad_norm": 0.78125, + "learning_rate": 0.00016137240268426916, + "loss": 0.7961, + "step": 19023 + }, + { + "epoch": 0.4884829752166915, + "grad_norm": 0.83984375, + "learning_rate": 0.00016136887804380455, + "loss": 0.8939, + "step": 19024 + }, + { + "epoch": 0.48850865241261326, + "grad_norm": 0.84375, + "learning_rate": 0.00016136535328103733, + "loss": 0.9056, + "step": 19025 + }, + { + "epoch": 0.4885343296085351, + "grad_norm": 0.72265625, + "learning_rate": 0.00016136182839597449, + "loss": 0.8975, + "step": 19026 + }, + { + "epoch": 0.48856000680445694, + "grad_norm": 0.75390625, + "learning_rate": 0.00016135830338862304, + "loss": 0.9481, + "step": 19027 + }, + { + "epoch": 0.4885856840003787, + "grad_norm": 0.765625, + "learning_rate": 0.0001613547782589901, + "loss": 0.804, + "step": 19028 + }, + { + "epoch": 0.48861136119630055, + "grad_norm": 0.7578125, + "learning_rate": 0.00016135125300708258, + "loss": 0.8632, + "step": 19029 + }, + { + "epoch": 0.4886370383922224, + "grad_norm": 0.76171875, + "learning_rate": 0.00016134772763290757, + "loss": 1.0169, + "step": 19030 + }, + { + "epoch": 0.48866271558814417, + "grad_norm": 0.78515625, + "learning_rate": 0.0001613442021364721, + "loss": 0.8166, + "step": 19031 + }, + { + "epoch": 0.488688392784066, + "grad_norm": 0.8046875, + "learning_rate": 0.00016134067651778314, + "loss": 0.9337, + "step": 19032 + }, + { + "epoch": 0.48871406997998784, + "grad_norm": 0.828125, + "learning_rate": 0.00016133715077684774, + "loss": 0.9033, + "step": 19033 + }, + { + "epoch": 0.4887397471759097, + "grad_norm": 0.79296875, + "learning_rate": 0.000161333624913673, + "loss": 0.8995, + "step": 19034 + }, + { + "epoch": 0.48876542437183146, + "grad_norm": 0.76171875, + "learning_rate": 0.00016133009892826584, + "loss": 0.8608, + "step": 19035 + }, + { + "epoch": 0.4887911015677533, + "grad_norm": 0.7109375, + "learning_rate": 0.00016132657282063331, + "loss": 0.9215, + "step": 19036 + }, + { + "epoch": 0.48881677876367513, + "grad_norm": 0.7265625, + "learning_rate": 0.0001613230465907825, + "loss": 0.9282, + "step": 19037 + }, + { + "epoch": 0.4888424559595969, + "grad_norm": 0.77734375, + "learning_rate": 0.00016131952023872042, + "loss": 0.8451, + "step": 19038 + }, + { + "epoch": 0.48886813315551875, + "grad_norm": 0.8046875, + "learning_rate": 0.00016131599376445404, + "loss": 0.9142, + "step": 19039 + }, + { + "epoch": 0.4888938103514406, + "grad_norm": 0.8828125, + "learning_rate": 0.00016131246716799043, + "loss": 1.0614, + "step": 19040 + }, + { + "epoch": 0.48891948754736236, + "grad_norm": 0.71484375, + "learning_rate": 0.0001613089404493366, + "loss": 0.8266, + "step": 19041 + }, + { + "epoch": 0.4889451647432842, + "grad_norm": 0.81640625, + "learning_rate": 0.0001613054136084996, + "loss": 0.9253, + "step": 19042 + }, + { + "epoch": 0.48897084193920604, + "grad_norm": 0.73828125, + "learning_rate": 0.00016130188664548648, + "loss": 0.9402, + "step": 19043 + }, + { + "epoch": 0.48899651913512787, + "grad_norm": 0.83203125, + "learning_rate": 0.00016129835956030418, + "loss": 0.8737, + "step": 19044 + }, + { + "epoch": 0.48902219633104965, + "grad_norm": 0.80078125, + "learning_rate": 0.0001612948323529598, + "loss": 0.9015, + "step": 19045 + }, + { + "epoch": 0.4890478735269715, + "grad_norm": 0.6796875, + "learning_rate": 0.00016129130502346039, + "loss": 0.8699, + "step": 19046 + }, + { + "epoch": 0.4890735507228933, + "grad_norm": 0.76171875, + "learning_rate": 0.0001612877775718129, + "loss": 0.8377, + "step": 19047 + }, + { + "epoch": 0.4890992279188151, + "grad_norm": 0.72265625, + "learning_rate": 0.00016128424999802446, + "loss": 0.8247, + "step": 19048 + }, + { + "epoch": 0.48912490511473694, + "grad_norm": 0.765625, + "learning_rate": 0.000161280722302102, + "loss": 0.8437, + "step": 19049 + }, + { + "epoch": 0.4891505823106588, + "grad_norm": 0.75390625, + "learning_rate": 0.0001612771944840526, + "loss": 0.8957, + "step": 19050 + }, + { + "epoch": 0.48917625950658056, + "grad_norm": 0.734375, + "learning_rate": 0.0001612736665438833, + "loss": 0.7136, + "step": 19051 + }, + { + "epoch": 0.4892019367025024, + "grad_norm": 0.71875, + "learning_rate": 0.0001612701384816011, + "loss": 0.9939, + "step": 19052 + }, + { + "epoch": 0.48922761389842423, + "grad_norm": 0.8046875, + "learning_rate": 0.00016126661029721307, + "loss": 1.0073, + "step": 19053 + }, + { + "epoch": 0.48925329109434607, + "grad_norm": 0.8671875, + "learning_rate": 0.00016126308199072622, + "loss": 0.9689, + "step": 19054 + }, + { + "epoch": 0.48927896829026785, + "grad_norm": 0.70703125, + "learning_rate": 0.00016125955356214759, + "loss": 0.7575, + "step": 19055 + }, + { + "epoch": 0.4893046454861897, + "grad_norm": 0.74609375, + "learning_rate": 0.00016125602501148418, + "loss": 0.8696, + "step": 19056 + }, + { + "epoch": 0.4893303226821115, + "grad_norm": 0.75, + "learning_rate": 0.00016125249633874305, + "loss": 0.8346, + "step": 19057 + }, + { + "epoch": 0.4893559998780333, + "grad_norm": 0.7734375, + "learning_rate": 0.00016124896754393122, + "loss": 0.9052, + "step": 19058 + }, + { + "epoch": 0.48938167707395513, + "grad_norm": 0.765625, + "learning_rate": 0.00016124543862705576, + "loss": 0.8285, + "step": 19059 + }, + { + "epoch": 0.48940735426987697, + "grad_norm": 0.80078125, + "learning_rate": 0.00016124190958812364, + "loss": 0.8786, + "step": 19060 + }, + { + "epoch": 0.48943303146579875, + "grad_norm": 0.81640625, + "learning_rate": 0.00016123838042714195, + "loss": 0.9528, + "step": 19061 + }, + { + "epoch": 0.4894587086617206, + "grad_norm": 0.77734375, + "learning_rate": 0.0001612348511441177, + "loss": 0.9475, + "step": 19062 + }, + { + "epoch": 0.4894843858576424, + "grad_norm": 0.8984375, + "learning_rate": 0.00016123132173905788, + "loss": 1.1434, + "step": 19063 + }, + { + "epoch": 0.48951006305356426, + "grad_norm": 0.7734375, + "learning_rate": 0.0001612277922119696, + "loss": 0.9164, + "step": 19064 + }, + { + "epoch": 0.48953574024948604, + "grad_norm": 0.75390625, + "learning_rate": 0.00016122426256285987, + "loss": 0.8073, + "step": 19065 + }, + { + "epoch": 0.4895614174454079, + "grad_norm": 0.78515625, + "learning_rate": 0.00016122073279173572, + "loss": 0.82, + "step": 19066 + }, + { + "epoch": 0.4895870946413297, + "grad_norm": 0.75390625, + "learning_rate": 0.00016121720289860413, + "loss": 0.7861, + "step": 19067 + }, + { + "epoch": 0.4896127718372515, + "grad_norm": 0.7421875, + "learning_rate": 0.00016121367288347225, + "loss": 0.883, + "step": 19068 + }, + { + "epoch": 0.48963844903317333, + "grad_norm": 0.72265625, + "learning_rate": 0.000161210142746347, + "loss": 0.8144, + "step": 19069 + }, + { + "epoch": 0.48966412622909516, + "grad_norm": 0.84375, + "learning_rate": 0.0001612066124872355, + "loss": 0.8836, + "step": 19070 + }, + { + "epoch": 0.48968980342501695, + "grad_norm": 0.92578125, + "learning_rate": 0.00016120308210614473, + "loss": 1.0358, + "step": 19071 + }, + { + "epoch": 0.4897154806209388, + "grad_norm": 0.8046875, + "learning_rate": 0.00016119955160308176, + "loss": 0.8675, + "step": 19072 + }, + { + "epoch": 0.4897411578168606, + "grad_norm": 0.7578125, + "learning_rate": 0.00016119602097805358, + "loss": 0.8926, + "step": 19073 + }, + { + "epoch": 0.48976683501278245, + "grad_norm": 0.734375, + "learning_rate": 0.0001611924902310673, + "loss": 1.0126, + "step": 19074 + }, + { + "epoch": 0.48979251220870423, + "grad_norm": 0.74609375, + "learning_rate": 0.0001611889593621299, + "loss": 0.8397, + "step": 19075 + }, + { + "epoch": 0.48981818940462607, + "grad_norm": 0.71484375, + "learning_rate": 0.00016118542837124838, + "loss": 0.8149, + "step": 19076 + }, + { + "epoch": 0.4898438666005479, + "grad_norm": 0.73046875, + "learning_rate": 0.0001611818972584299, + "loss": 0.7798, + "step": 19077 + }, + { + "epoch": 0.4898695437964697, + "grad_norm": 0.8125, + "learning_rate": 0.00016117836602368138, + "loss": 0.8722, + "step": 19078 + }, + { + "epoch": 0.4898952209923915, + "grad_norm": 0.75, + "learning_rate": 0.00016117483466700995, + "loss": 0.905, + "step": 19079 + }, + { + "epoch": 0.48992089818831336, + "grad_norm": 0.76171875, + "learning_rate": 0.00016117130318842255, + "loss": 0.785, + "step": 19080 + }, + { + "epoch": 0.48994657538423514, + "grad_norm": 0.74609375, + "learning_rate": 0.0001611677715879263, + "loss": 0.861, + "step": 19081 + }, + { + "epoch": 0.489972252580157, + "grad_norm": 0.671875, + "learning_rate": 0.0001611642398655282, + "loss": 0.821, + "step": 19082 + }, + { + "epoch": 0.4899979297760788, + "grad_norm": 0.7734375, + "learning_rate": 0.0001611607080212353, + "loss": 1.0065, + "step": 19083 + }, + { + "epoch": 0.49002360697200065, + "grad_norm": 0.77734375, + "learning_rate": 0.0001611571760550546, + "loss": 0.7901, + "step": 19084 + }, + { + "epoch": 0.4900492841679224, + "grad_norm": 0.74609375, + "learning_rate": 0.0001611536439669932, + "loss": 0.9098, + "step": 19085 + }, + { + "epoch": 0.49007496136384426, + "grad_norm": 0.78125, + "learning_rate": 0.0001611501117570581, + "loss": 0.9229, + "step": 19086 + }, + { + "epoch": 0.4901006385597661, + "grad_norm": 0.78125, + "learning_rate": 0.00016114657942525637, + "loss": 0.9275, + "step": 19087 + }, + { + "epoch": 0.4901263157556879, + "grad_norm": 0.7578125, + "learning_rate": 0.000161143046971595, + "loss": 0.8608, + "step": 19088 + }, + { + "epoch": 0.4901519929516097, + "grad_norm": 0.7421875, + "learning_rate": 0.0001611395143960811, + "loss": 0.8876, + "step": 19089 + }, + { + "epoch": 0.49017767014753155, + "grad_norm": 0.8515625, + "learning_rate": 0.00016113598169872163, + "loss": 0.8984, + "step": 19090 + }, + { + "epoch": 0.49020334734345333, + "grad_norm": 0.7578125, + "learning_rate": 0.0001611324488795237, + "loss": 0.8122, + "step": 19091 + }, + { + "epoch": 0.49022902453937517, + "grad_norm": 0.8203125, + "learning_rate": 0.00016112891593849434, + "loss": 0.9075, + "step": 19092 + }, + { + "epoch": 0.490254701735297, + "grad_norm": 0.828125, + "learning_rate": 0.0001611253828756405, + "loss": 0.9233, + "step": 19093 + }, + { + "epoch": 0.49028037893121884, + "grad_norm": 0.74609375, + "learning_rate": 0.00016112184969096938, + "loss": 0.849, + "step": 19094 + }, + { + "epoch": 0.4903060561271406, + "grad_norm": 0.8125, + "learning_rate": 0.0001611183163844879, + "loss": 0.9578, + "step": 19095 + }, + { + "epoch": 0.49033173332306246, + "grad_norm": 0.7421875, + "learning_rate": 0.00016111478295620312, + "loss": 0.8548, + "step": 19096 + }, + { + "epoch": 0.4903574105189843, + "grad_norm": 0.79296875, + "learning_rate": 0.0001611112494061221, + "loss": 0.9358, + "step": 19097 + }, + { + "epoch": 0.4903830877149061, + "grad_norm": 0.73828125, + "learning_rate": 0.00016110771573425188, + "loss": 0.7448, + "step": 19098 + }, + { + "epoch": 0.4904087649108279, + "grad_norm": 0.7734375, + "learning_rate": 0.00016110418194059952, + "loss": 0.905, + "step": 19099 + }, + { + "epoch": 0.49043444210674975, + "grad_norm": 0.79296875, + "learning_rate": 0.00016110064802517203, + "loss": 0.8925, + "step": 19100 + }, + { + "epoch": 0.4904601193026715, + "grad_norm": 0.78125, + "learning_rate": 0.0001610971139879765, + "loss": 0.9705, + "step": 19101 + }, + { + "epoch": 0.49048579649859336, + "grad_norm": 0.7890625, + "learning_rate": 0.00016109357982901988, + "loss": 1.0066, + "step": 19102 + }, + { + "epoch": 0.4905114736945152, + "grad_norm": 0.828125, + "learning_rate": 0.0001610900455483093, + "loss": 0.9011, + "step": 19103 + }, + { + "epoch": 0.49053715089043703, + "grad_norm": 0.875, + "learning_rate": 0.0001610865111458518, + "loss": 1.0112, + "step": 19104 + }, + { + "epoch": 0.4905628280863588, + "grad_norm": 0.7578125, + "learning_rate": 0.0001610829766216544, + "loss": 0.88, + "step": 19105 + }, + { + "epoch": 0.49058850528228065, + "grad_norm": 0.8359375, + "learning_rate": 0.00016107944197572412, + "loss": 0.8308, + "step": 19106 + }, + { + "epoch": 0.4906141824782025, + "grad_norm": 0.72265625, + "learning_rate": 0.00016107590720806805, + "loss": 0.9452, + "step": 19107 + }, + { + "epoch": 0.49063985967412427, + "grad_norm": 0.75, + "learning_rate": 0.0001610723723186932, + "loss": 0.8497, + "step": 19108 + }, + { + "epoch": 0.4906655368700461, + "grad_norm": 0.7109375, + "learning_rate": 0.00016106883730760663, + "loss": 0.846, + "step": 19109 + }, + { + "epoch": 0.49069121406596794, + "grad_norm": 0.7890625, + "learning_rate": 0.00016106530217481537, + "loss": 0.9919, + "step": 19110 + }, + { + "epoch": 0.4907168912618897, + "grad_norm": 0.734375, + "learning_rate": 0.0001610617669203265, + "loss": 0.9411, + "step": 19111 + }, + { + "epoch": 0.49074256845781156, + "grad_norm": 0.73828125, + "learning_rate": 0.00016105823154414703, + "loss": 0.8106, + "step": 19112 + }, + { + "epoch": 0.4907682456537334, + "grad_norm": 0.7421875, + "learning_rate": 0.00016105469604628403, + "loss": 0.8333, + "step": 19113 + }, + { + "epoch": 0.49079392284965523, + "grad_norm": 0.7578125, + "learning_rate": 0.00016105116042674456, + "loss": 0.8255, + "step": 19114 + }, + { + "epoch": 0.490819600045577, + "grad_norm": 0.81640625, + "learning_rate": 0.0001610476246855356, + "loss": 0.9038, + "step": 19115 + }, + { + "epoch": 0.49084527724149885, + "grad_norm": 0.73828125, + "learning_rate": 0.00016104408882266426, + "loss": 0.8793, + "step": 19116 + }, + { + "epoch": 0.4908709544374207, + "grad_norm": 0.765625, + "learning_rate": 0.00016104055283813756, + "loss": 0.9826, + "step": 19117 + }, + { + "epoch": 0.49089663163334246, + "grad_norm": 0.75, + "learning_rate": 0.00016103701673196255, + "loss": 0.9763, + "step": 19118 + }, + { + "epoch": 0.4909223088292643, + "grad_norm": 0.7734375, + "learning_rate": 0.00016103348050414628, + "loss": 0.9707, + "step": 19119 + }, + { + "epoch": 0.49094798602518613, + "grad_norm": 0.77734375, + "learning_rate": 0.0001610299441546958, + "loss": 0.9127, + "step": 19120 + }, + { + "epoch": 0.4909736632211079, + "grad_norm": 0.80078125, + "learning_rate": 0.00016102640768361816, + "loss": 0.801, + "step": 19121 + }, + { + "epoch": 0.49099934041702975, + "grad_norm": 0.76171875, + "learning_rate": 0.00016102287109092037, + "loss": 0.942, + "step": 19122 + }, + { + "epoch": 0.4910250176129516, + "grad_norm": 0.796875, + "learning_rate": 0.00016101933437660956, + "loss": 0.8386, + "step": 19123 + }, + { + "epoch": 0.4910506948088734, + "grad_norm": 0.72265625, + "learning_rate": 0.0001610157975406927, + "loss": 0.8198, + "step": 19124 + }, + { + "epoch": 0.4910763720047952, + "grad_norm": 0.765625, + "learning_rate": 0.00016101226058317687, + "loss": 0.8754, + "step": 19125 + }, + { + "epoch": 0.49110204920071704, + "grad_norm": 0.78515625, + "learning_rate": 0.0001610087235040691, + "loss": 0.8261, + "step": 19126 + }, + { + "epoch": 0.4911277263966389, + "grad_norm": 0.75, + "learning_rate": 0.0001610051863033765, + "loss": 0.9022, + "step": 19127 + }, + { + "epoch": 0.49115340359256066, + "grad_norm": 0.80859375, + "learning_rate": 0.00016100164898110602, + "loss": 0.828, + "step": 19128 + }, + { + "epoch": 0.4911790807884825, + "grad_norm": 0.7734375, + "learning_rate": 0.00016099811153726478, + "loss": 0.8084, + "step": 19129 + }, + { + "epoch": 0.4912047579844043, + "grad_norm": 0.8203125, + "learning_rate": 0.00016099457397185984, + "loss": 0.8798, + "step": 19130 + }, + { + "epoch": 0.4912304351803261, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609910362848982, + "loss": 0.9399, + "step": 19131 + }, + { + "epoch": 0.49125611237624794, + "grad_norm": 0.79296875, + "learning_rate": 0.00016098749847638695, + "loss": 0.8532, + "step": 19132 + }, + { + "epoch": 0.4912817895721698, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609839605463331, + "loss": 0.9515, + "step": 19133 + }, + { + "epoch": 0.4913074667680916, + "grad_norm": 0.7421875, + "learning_rate": 0.00016098042249474377, + "loss": 0.7709, + "step": 19134 + }, + { + "epoch": 0.4913331439640134, + "grad_norm": 0.75390625, + "learning_rate": 0.00016097688432162595, + "loss": 0.9226, + "step": 19135 + }, + { + "epoch": 0.49135882115993523, + "grad_norm": 0.87109375, + "learning_rate": 0.00016097334602698665, + "loss": 0.8569, + "step": 19136 + }, + { + "epoch": 0.49138449835585707, + "grad_norm": 0.81640625, + "learning_rate": 0.00016096980761083304, + "loss": 0.997, + "step": 19137 + }, + { + "epoch": 0.49141017555177885, + "grad_norm": 0.77734375, + "learning_rate": 0.0001609662690731721, + "loss": 0.8688, + "step": 19138 + }, + { + "epoch": 0.4914358527477007, + "grad_norm": 0.81640625, + "learning_rate": 0.00016096273041401089, + "loss": 0.7622, + "step": 19139 + }, + { + "epoch": 0.4914615299436225, + "grad_norm": 0.81640625, + "learning_rate": 0.00016095919163335647, + "loss": 0.9683, + "step": 19140 + }, + { + "epoch": 0.4914872071395443, + "grad_norm": 0.85546875, + "learning_rate": 0.00016095565273121587, + "loss": 0.84, + "step": 19141 + }, + { + "epoch": 0.49151288433546614, + "grad_norm": 0.7421875, + "learning_rate": 0.00016095211370759614, + "loss": 0.8619, + "step": 19142 + }, + { + "epoch": 0.491538561531388, + "grad_norm": 0.7734375, + "learning_rate": 0.0001609485745625044, + "loss": 0.8242, + "step": 19143 + }, + { + "epoch": 0.4915642387273098, + "grad_norm": 0.8046875, + "learning_rate": 0.00016094503529594763, + "loss": 0.9884, + "step": 19144 + }, + { + "epoch": 0.4915899159232316, + "grad_norm": 0.73828125, + "learning_rate": 0.0001609414959079329, + "loss": 0.8562, + "step": 19145 + }, + { + "epoch": 0.4916155931191534, + "grad_norm": 0.8203125, + "learning_rate": 0.0001609379563984673, + "loss": 0.8315, + "step": 19146 + }, + { + "epoch": 0.49164127031507526, + "grad_norm": 0.71484375, + "learning_rate": 0.00016093441676755788, + "loss": 0.9272, + "step": 19147 + }, + { + "epoch": 0.49166694751099704, + "grad_norm": 0.73046875, + "learning_rate": 0.00016093087701521164, + "loss": 0.905, + "step": 19148 + }, + { + "epoch": 0.4916926247069189, + "grad_norm": 0.79296875, + "learning_rate": 0.00016092733714143567, + "loss": 0.964, + "step": 19149 + }, + { + "epoch": 0.4917183019028407, + "grad_norm": 1.1171875, + "learning_rate": 0.000160923797146237, + "loss": 0.7351, + "step": 19150 + }, + { + "epoch": 0.4917439790987625, + "grad_norm": 0.8125, + "learning_rate": 0.00016092025702962273, + "loss": 0.8629, + "step": 19151 + }, + { + "epoch": 0.49176965629468433, + "grad_norm": 0.63671875, + "learning_rate": 0.0001609167167915999, + "loss": 0.7137, + "step": 19152 + }, + { + "epoch": 0.49179533349060617, + "grad_norm": 0.82421875, + "learning_rate": 0.00016091317643217553, + "loss": 1.1063, + "step": 19153 + }, + { + "epoch": 0.491821010686528, + "grad_norm": 0.7578125, + "learning_rate": 0.00016090963595135672, + "loss": 0.8836, + "step": 19154 + }, + { + "epoch": 0.4918466878824498, + "grad_norm": 0.78125, + "learning_rate": 0.0001609060953491505, + "loss": 0.8521, + "step": 19155 + }, + { + "epoch": 0.4918723650783716, + "grad_norm": 0.76953125, + "learning_rate": 0.0001609025546255639, + "loss": 0.8074, + "step": 19156 + }, + { + "epoch": 0.49189804227429346, + "grad_norm": 0.7890625, + "learning_rate": 0.00016089901378060406, + "loss": 0.9321, + "step": 19157 + }, + { + "epoch": 0.49192371947021524, + "grad_norm": 0.734375, + "learning_rate": 0.00016089547281427797, + "loss": 0.8996, + "step": 19158 + }, + { + "epoch": 0.4919493966661371, + "grad_norm": 0.92578125, + "learning_rate": 0.0001608919317265927, + "loss": 0.8614, + "step": 19159 + }, + { + "epoch": 0.4919750738620589, + "grad_norm": 0.76953125, + "learning_rate": 0.00016088839051755534, + "loss": 1.0034, + "step": 19160 + }, + { + "epoch": 0.4920007510579807, + "grad_norm": 0.70703125, + "learning_rate": 0.00016088484918717286, + "loss": 0.8586, + "step": 19161 + }, + { + "epoch": 0.4920264282539025, + "grad_norm": 0.8046875, + "learning_rate": 0.0001608813077354524, + "loss": 0.9322, + "step": 19162 + }, + { + "epoch": 0.49205210544982436, + "grad_norm": 0.7734375, + "learning_rate": 0.00016087776616240102, + "loss": 0.8825, + "step": 19163 + }, + { + "epoch": 0.4920777826457462, + "grad_norm": 0.80078125, + "learning_rate": 0.00016087422446802571, + "loss": 0.8344, + "step": 19164 + }, + { + "epoch": 0.492103459841668, + "grad_norm": 0.80859375, + "learning_rate": 0.00016087068265233358, + "loss": 0.9292, + "step": 19165 + }, + { + "epoch": 0.4921291370375898, + "grad_norm": 0.75, + "learning_rate": 0.0001608671407153317, + "loss": 0.9999, + "step": 19166 + }, + { + "epoch": 0.49215481423351165, + "grad_norm": 0.81640625, + "learning_rate": 0.0001608635986570271, + "loss": 0.8774, + "step": 19167 + }, + { + "epoch": 0.49218049142943343, + "grad_norm": 0.796875, + "learning_rate": 0.00016086005647742682, + "loss": 0.9044, + "step": 19168 + }, + { + "epoch": 0.49220616862535527, + "grad_norm": 0.76171875, + "learning_rate": 0.00016085651417653798, + "loss": 0.8831, + "step": 19169 + }, + { + "epoch": 0.4922318458212771, + "grad_norm": 0.81640625, + "learning_rate": 0.0001608529717543676, + "loss": 0.9588, + "step": 19170 + }, + { + "epoch": 0.4922575230171989, + "grad_norm": 0.80078125, + "learning_rate": 0.0001608494292109227, + "loss": 0.9217, + "step": 19171 + }, + { + "epoch": 0.4922832002131207, + "grad_norm": 0.828125, + "learning_rate": 0.00016084588654621045, + "loss": 0.8489, + "step": 19172 + }, + { + "epoch": 0.49230887740904256, + "grad_norm": 0.703125, + "learning_rate": 0.00016084234376023782, + "loss": 0.8199, + "step": 19173 + }, + { + "epoch": 0.4923345546049644, + "grad_norm": 0.73046875, + "learning_rate": 0.00016083880085301187, + "loss": 0.8642, + "step": 19174 + }, + { + "epoch": 0.49236023180088617, + "grad_norm": 0.80078125, + "learning_rate": 0.0001608352578245397, + "loss": 0.9824, + "step": 19175 + }, + { + "epoch": 0.492385908996808, + "grad_norm": 0.80078125, + "learning_rate": 0.00016083171467482836, + "loss": 0.9208, + "step": 19176 + }, + { + "epoch": 0.49241158619272984, + "grad_norm": 0.7578125, + "learning_rate": 0.00016082817140388489, + "loss": 0.838, + "step": 19177 + }, + { + "epoch": 0.4924372633886516, + "grad_norm": 0.80078125, + "learning_rate": 0.0001608246280117164, + "loss": 0.8504, + "step": 19178 + }, + { + "epoch": 0.49246294058457346, + "grad_norm": 0.76953125, + "learning_rate": 0.00016082108449832992, + "loss": 0.8886, + "step": 19179 + }, + { + "epoch": 0.4924886177804953, + "grad_norm": 0.79296875, + "learning_rate": 0.0001608175408637325, + "loss": 0.975, + "step": 19180 + }, + { + "epoch": 0.4925142949764171, + "grad_norm": 0.77734375, + "learning_rate": 0.00016081399710793118, + "loss": 1.0011, + "step": 19181 + }, + { + "epoch": 0.4925399721723389, + "grad_norm": 0.75390625, + "learning_rate": 0.0001608104532309331, + "loss": 0.975, + "step": 19182 + }, + { + "epoch": 0.49256564936826075, + "grad_norm": 0.8046875, + "learning_rate": 0.0001608069092327453, + "loss": 0.9459, + "step": 19183 + }, + { + "epoch": 0.4925913265641826, + "grad_norm": 0.79296875, + "learning_rate": 0.00016080336511337478, + "loss": 0.8202, + "step": 19184 + }, + { + "epoch": 0.49261700376010437, + "grad_norm": 0.7734375, + "learning_rate": 0.00016079982087282863, + "loss": 0.7942, + "step": 19185 + }, + { + "epoch": 0.4926426809560262, + "grad_norm": 0.796875, + "learning_rate": 0.00016079627651111394, + "loss": 1.0512, + "step": 19186 + }, + { + "epoch": 0.49266835815194804, + "grad_norm": 0.77734375, + "learning_rate": 0.00016079273202823782, + "loss": 0.8241, + "step": 19187 + }, + { + "epoch": 0.4926940353478698, + "grad_norm": 0.7421875, + "learning_rate": 0.0001607891874242072, + "loss": 0.9126, + "step": 19188 + }, + { + "epoch": 0.49271971254379165, + "grad_norm": 0.796875, + "learning_rate": 0.00016078564269902926, + "loss": 0.9143, + "step": 19189 + }, + { + "epoch": 0.4927453897397135, + "grad_norm": 0.73828125, + "learning_rate": 0.00016078209785271103, + "loss": 0.871, + "step": 19190 + }, + { + "epoch": 0.49277106693563527, + "grad_norm": 0.84765625, + "learning_rate": 0.00016077855288525956, + "loss": 1.0322, + "step": 19191 + }, + { + "epoch": 0.4927967441315571, + "grad_norm": 0.80859375, + "learning_rate": 0.0001607750077966819, + "loss": 0.9064, + "step": 19192 + }, + { + "epoch": 0.49282242132747894, + "grad_norm": 0.78515625, + "learning_rate": 0.00016077146258698522, + "loss": 0.8823, + "step": 19193 + }, + { + "epoch": 0.4928480985234008, + "grad_norm": 0.70703125, + "learning_rate": 0.00016076791725617642, + "loss": 0.8994, + "step": 19194 + }, + { + "epoch": 0.49287377571932256, + "grad_norm": 0.80859375, + "learning_rate": 0.00016076437180426268, + "loss": 0.9049, + "step": 19195 + }, + { + "epoch": 0.4928994529152444, + "grad_norm": 0.81640625, + "learning_rate": 0.00016076082623125103, + "loss": 1.0274, + "step": 19196 + }, + { + "epoch": 0.49292513011116623, + "grad_norm": 0.76171875, + "learning_rate": 0.00016075728053714853, + "loss": 0.8927, + "step": 19197 + }, + { + "epoch": 0.492950807307088, + "grad_norm": 0.8046875, + "learning_rate": 0.00016075373472196228, + "loss": 1.0416, + "step": 19198 + }, + { + "epoch": 0.49297648450300985, + "grad_norm": 0.90625, + "learning_rate": 0.00016075018878569934, + "loss": 0.8566, + "step": 19199 + }, + { + "epoch": 0.4930021616989317, + "grad_norm": 0.8046875, + "learning_rate": 0.00016074664272836673, + "loss": 0.8779, + "step": 19200 + }, + { + "epoch": 0.49302783889485347, + "grad_norm": 0.7890625, + "learning_rate": 0.00016074309654997158, + "loss": 0.895, + "step": 19201 + }, + { + "epoch": 0.4930535160907753, + "grad_norm": 0.765625, + "learning_rate": 0.00016073955025052087, + "loss": 0.9032, + "step": 19202 + }, + { + "epoch": 0.49307919328669714, + "grad_norm": 0.71484375, + "learning_rate": 0.00016073600383002178, + "loss": 0.8899, + "step": 19203 + }, + { + "epoch": 0.493104870482619, + "grad_norm": 0.77734375, + "learning_rate": 0.00016073245728848127, + "loss": 0.7969, + "step": 19204 + }, + { + "epoch": 0.49313054767854075, + "grad_norm": 0.73046875, + "learning_rate": 0.0001607289106259065, + "loss": 1.0596, + "step": 19205 + }, + { + "epoch": 0.4931562248744626, + "grad_norm": 0.796875, + "learning_rate": 0.00016072536384230447, + "loss": 0.9947, + "step": 19206 + }, + { + "epoch": 0.4931819020703844, + "grad_norm": 0.7421875, + "learning_rate": 0.00016072181693768228, + "loss": 0.8086, + "step": 19207 + }, + { + "epoch": 0.4932075792663062, + "grad_norm": 0.7578125, + "learning_rate": 0.00016071826991204705, + "loss": 0.7742, + "step": 19208 + }, + { + "epoch": 0.49323325646222804, + "grad_norm": 0.8046875, + "learning_rate": 0.00016071472276540572, + "loss": 0.8326, + "step": 19209 + }, + { + "epoch": 0.4932589336581499, + "grad_norm": 0.80859375, + "learning_rate": 0.00016071117549776545, + "loss": 0.8844, + "step": 19210 + }, + { + "epoch": 0.49328461085407166, + "grad_norm": 0.76953125, + "learning_rate": 0.00016070762810913329, + "loss": 0.8263, + "step": 19211 + }, + { + "epoch": 0.4933102880499935, + "grad_norm": 0.83203125, + "learning_rate": 0.00016070408059951633, + "loss": 0.8346, + "step": 19212 + }, + { + "epoch": 0.49333596524591533, + "grad_norm": 0.74609375, + "learning_rate": 0.0001607005329689216, + "loss": 0.8459, + "step": 19213 + }, + { + "epoch": 0.4933616424418371, + "grad_norm": 0.75390625, + "learning_rate": 0.00016069698521735618, + "loss": 0.9051, + "step": 19214 + }, + { + "epoch": 0.49338731963775895, + "grad_norm": 0.74609375, + "learning_rate": 0.00016069343734482718, + "loss": 0.9269, + "step": 19215 + }, + { + "epoch": 0.4934129968336808, + "grad_norm": 0.76171875, + "learning_rate": 0.00016068988935134164, + "loss": 0.9501, + "step": 19216 + }, + { + "epoch": 0.4934386740296026, + "grad_norm": 0.75390625, + "learning_rate": 0.00016068634123690662, + "loss": 0.8032, + "step": 19217 + }, + { + "epoch": 0.4934643512255244, + "grad_norm": 0.74609375, + "learning_rate": 0.0001606827930015292, + "loss": 0.9353, + "step": 19218 + }, + { + "epoch": 0.49349002842144624, + "grad_norm": 0.76953125, + "learning_rate": 0.0001606792446452165, + "loss": 1.0264, + "step": 19219 + }, + { + "epoch": 0.4935157056173681, + "grad_norm": 0.77734375, + "learning_rate": 0.0001606756961679755, + "loss": 0.9292, + "step": 19220 + }, + { + "epoch": 0.49354138281328985, + "grad_norm": 0.71875, + "learning_rate": 0.00016067214756981333, + "loss": 1.0448, + "step": 19221 + }, + { + "epoch": 0.4935670600092117, + "grad_norm": 0.87109375, + "learning_rate": 0.00016066859885073705, + "loss": 0.9063, + "step": 19222 + }, + { + "epoch": 0.4935927372051335, + "grad_norm": 0.7421875, + "learning_rate": 0.00016066505001075372, + "loss": 0.8816, + "step": 19223 + }, + { + "epoch": 0.4936184144010553, + "grad_norm": 0.796875, + "learning_rate": 0.00016066150104987044, + "loss": 0.9522, + "step": 19224 + }, + { + "epoch": 0.49364409159697714, + "grad_norm": 0.77734375, + "learning_rate": 0.00016065795196809426, + "loss": 0.8807, + "step": 19225 + }, + { + "epoch": 0.493669768792899, + "grad_norm": 0.8046875, + "learning_rate": 0.0001606544027654323, + "loss": 0.914, + "step": 19226 + }, + { + "epoch": 0.4936954459888208, + "grad_norm": 0.7109375, + "learning_rate": 0.0001606508534418915, + "loss": 0.9278, + "step": 19227 + }, + { + "epoch": 0.4937211231847426, + "grad_norm": 0.75, + "learning_rate": 0.00016064730399747914, + "loss": 0.8284, + "step": 19228 + }, + { + "epoch": 0.49374680038066443, + "grad_norm": 0.80078125, + "learning_rate": 0.0001606437544322021, + "loss": 0.934, + "step": 19229 + }, + { + "epoch": 0.49377247757658627, + "grad_norm": 0.72265625, + "learning_rate": 0.00016064020474606755, + "loss": 0.8704, + "step": 19230 + }, + { + "epoch": 0.49379815477250805, + "grad_norm": 0.72265625, + "learning_rate": 0.00016063665493908257, + "loss": 0.8921, + "step": 19231 + }, + { + "epoch": 0.4938238319684299, + "grad_norm": 0.78125, + "learning_rate": 0.00016063310501125423, + "loss": 0.7808, + "step": 19232 + }, + { + "epoch": 0.4938495091643517, + "grad_norm": 0.7421875, + "learning_rate": 0.00016062955496258958, + "loss": 0.8527, + "step": 19233 + }, + { + "epoch": 0.4938751863602735, + "grad_norm": 0.75390625, + "learning_rate": 0.0001606260047930957, + "loss": 0.7969, + "step": 19234 + }, + { + "epoch": 0.49390086355619534, + "grad_norm": 0.75390625, + "learning_rate": 0.00016062245450277964, + "loss": 1.0068, + "step": 19235 + }, + { + "epoch": 0.49392654075211717, + "grad_norm": 0.7265625, + "learning_rate": 0.0001606189040916485, + "loss": 0.8592, + "step": 19236 + }, + { + "epoch": 0.493952217948039, + "grad_norm": 0.78515625, + "learning_rate": 0.0001606153535597094, + "loss": 0.9091, + "step": 19237 + }, + { + "epoch": 0.4939778951439608, + "grad_norm": 0.71875, + "learning_rate": 0.00016061180290696933, + "loss": 0.8641, + "step": 19238 + }, + { + "epoch": 0.4940035723398826, + "grad_norm": 0.74609375, + "learning_rate": 0.00016060825213343546, + "loss": 0.8798, + "step": 19239 + }, + { + "epoch": 0.49402924953580446, + "grad_norm": 0.78125, + "learning_rate": 0.00016060470123911483, + "loss": 0.9374, + "step": 19240 + }, + { + "epoch": 0.49405492673172624, + "grad_norm": 0.734375, + "learning_rate": 0.00016060115022401445, + "loss": 0.8083, + "step": 19241 + }, + { + "epoch": 0.4940806039276481, + "grad_norm": 0.78125, + "learning_rate": 0.00016059759908814146, + "loss": 0.8981, + "step": 19242 + }, + { + "epoch": 0.4941062811235699, + "grad_norm": 0.8046875, + "learning_rate": 0.00016059404783150296, + "loss": 0.8361, + "step": 19243 + }, + { + "epoch": 0.4941319583194917, + "grad_norm": 0.92578125, + "learning_rate": 0.00016059049645410597, + "loss": 1.0318, + "step": 19244 + }, + { + "epoch": 0.49415763551541353, + "grad_norm": 0.74609375, + "learning_rate": 0.0001605869449559576, + "loss": 0.8518, + "step": 19245 + }, + { + "epoch": 0.49418331271133537, + "grad_norm": 0.85546875, + "learning_rate": 0.00016058339333706494, + "loss": 1.0988, + "step": 19246 + }, + { + "epoch": 0.4942089899072572, + "grad_norm": 0.75, + "learning_rate": 0.00016057984159743502, + "loss": 0.9117, + "step": 19247 + }, + { + "epoch": 0.494234667103179, + "grad_norm": 0.890625, + "learning_rate": 0.00016057628973707496, + "loss": 0.8916, + "step": 19248 + }, + { + "epoch": 0.4942603442991008, + "grad_norm": 0.85546875, + "learning_rate": 0.00016057273775599184, + "loss": 0.9275, + "step": 19249 + }, + { + "epoch": 0.49428602149502265, + "grad_norm": 0.86328125, + "learning_rate": 0.0001605691856541927, + "loss": 0.9643, + "step": 19250 + }, + { + "epoch": 0.49431169869094443, + "grad_norm": 0.82421875, + "learning_rate": 0.0001605656334316847, + "loss": 0.8456, + "step": 19251 + }, + { + "epoch": 0.49433737588686627, + "grad_norm": 0.82421875, + "learning_rate": 0.0001605620810884748, + "loss": 0.8586, + "step": 19252 + }, + { + "epoch": 0.4943630530827881, + "grad_norm": 0.7890625, + "learning_rate": 0.00016055852862457016, + "loss": 1.0622, + "step": 19253 + }, + { + "epoch": 0.4943887302787099, + "grad_norm": 0.86328125, + "learning_rate": 0.00016055497603997788, + "loss": 0.986, + "step": 19254 + }, + { + "epoch": 0.4944144074746317, + "grad_norm": 0.69140625, + "learning_rate": 0.00016055142333470495, + "loss": 0.7737, + "step": 19255 + }, + { + "epoch": 0.49444008467055356, + "grad_norm": 0.78125, + "learning_rate": 0.0001605478705087585, + "loss": 0.7916, + "step": 19256 + }, + { + "epoch": 0.4944657618664754, + "grad_norm": 0.77734375, + "learning_rate": 0.00016054431756214565, + "loss": 0.9557, + "step": 19257 + }, + { + "epoch": 0.4944914390623972, + "grad_norm": 0.86328125, + "learning_rate": 0.00016054076449487344, + "loss": 0.8078, + "step": 19258 + }, + { + "epoch": 0.494517116258319, + "grad_norm": 0.77734375, + "learning_rate": 0.00016053721130694895, + "loss": 0.8613, + "step": 19259 + }, + { + "epoch": 0.49454279345424085, + "grad_norm": 0.796875, + "learning_rate": 0.00016053365799837926, + "loss": 0.9121, + "step": 19260 + }, + { + "epoch": 0.49456847065016263, + "grad_norm": 0.8125, + "learning_rate": 0.00016053010456917145, + "loss": 0.9121, + "step": 19261 + }, + { + "epoch": 0.49459414784608446, + "grad_norm": 0.75, + "learning_rate": 0.0001605265510193326, + "loss": 1.0557, + "step": 19262 + }, + { + "epoch": 0.4946198250420063, + "grad_norm": 0.79296875, + "learning_rate": 0.00016052299734886982, + "loss": 0.6789, + "step": 19263 + }, + { + "epoch": 0.4946455022379281, + "grad_norm": 0.83984375, + "learning_rate": 0.0001605194435577902, + "loss": 0.9249, + "step": 19264 + }, + { + "epoch": 0.4946711794338499, + "grad_norm": 0.83203125, + "learning_rate": 0.00016051588964610076, + "loss": 0.9806, + "step": 19265 + }, + { + "epoch": 0.49469685662977175, + "grad_norm": 0.7265625, + "learning_rate": 0.00016051233561380863, + "loss": 0.8169, + "step": 19266 + }, + { + "epoch": 0.4947225338256936, + "grad_norm": 0.75, + "learning_rate": 0.00016050878146092086, + "loss": 0.912, + "step": 19267 + }, + { + "epoch": 0.49474821102161537, + "grad_norm": 0.75, + "learning_rate": 0.00016050522718744458, + "loss": 1.0044, + "step": 19268 + }, + { + "epoch": 0.4947738882175372, + "grad_norm": 0.80859375, + "learning_rate": 0.00016050167279338683, + "loss": 0.8968, + "step": 19269 + }, + { + "epoch": 0.49479956541345904, + "grad_norm": 0.78515625, + "learning_rate": 0.00016049811827875473, + "loss": 0.9878, + "step": 19270 + }, + { + "epoch": 0.4948252426093808, + "grad_norm": 0.76171875, + "learning_rate": 0.00016049456364355534, + "loss": 0.8709, + "step": 19271 + }, + { + "epoch": 0.49485091980530266, + "grad_norm": 0.71875, + "learning_rate": 0.00016049100888779574, + "loss": 0.8782, + "step": 19272 + }, + { + "epoch": 0.4948765970012245, + "grad_norm": 0.7578125, + "learning_rate": 0.00016048745401148304, + "loss": 1.1569, + "step": 19273 + }, + { + "epoch": 0.4949022741971463, + "grad_norm": 0.84375, + "learning_rate": 0.00016048389901462428, + "loss": 0.8734, + "step": 19274 + }, + { + "epoch": 0.4949279513930681, + "grad_norm": 0.765625, + "learning_rate": 0.00016048034389722662, + "loss": 1.0323, + "step": 19275 + }, + { + "epoch": 0.49495362858898995, + "grad_norm": 0.7578125, + "learning_rate": 0.00016047678865929705, + "loss": 1.024, + "step": 19276 + }, + { + "epoch": 0.4949793057849118, + "grad_norm": 0.7421875, + "learning_rate": 0.0001604732333008427, + "loss": 0.8084, + "step": 19277 + }, + { + "epoch": 0.49500498298083356, + "grad_norm": 0.7265625, + "learning_rate": 0.0001604696778218707, + "loss": 0.9287, + "step": 19278 + }, + { + "epoch": 0.4950306601767554, + "grad_norm": 0.89453125, + "learning_rate": 0.00016046612222238809, + "loss": 0.8271, + "step": 19279 + }, + { + "epoch": 0.49505633737267724, + "grad_norm": 0.70703125, + "learning_rate": 0.00016046256650240195, + "loss": 0.7907, + "step": 19280 + }, + { + "epoch": 0.495082014568599, + "grad_norm": 0.7890625, + "learning_rate": 0.00016045901066191935, + "loss": 0.8461, + "step": 19281 + }, + { + "epoch": 0.49510769176452085, + "grad_norm": 0.71875, + "learning_rate": 0.0001604554547009474, + "loss": 0.8153, + "step": 19282 + }, + { + "epoch": 0.4951333689604427, + "grad_norm": 0.7578125, + "learning_rate": 0.0001604518986194932, + "loss": 0.8726, + "step": 19283 + }, + { + "epoch": 0.49515904615636447, + "grad_norm": 0.76171875, + "learning_rate": 0.0001604483424175639, + "loss": 0.8469, + "step": 19284 + }, + { + "epoch": 0.4951847233522863, + "grad_norm": 0.7734375, + "learning_rate": 0.00016044478609516643, + "loss": 0.8621, + "step": 19285 + }, + { + "epoch": 0.49521040054820814, + "grad_norm": 0.8125, + "learning_rate": 0.00016044122965230798, + "loss": 0.8025, + "step": 19286 + }, + { + "epoch": 0.49523607774413, + "grad_norm": 0.7578125, + "learning_rate": 0.00016043767308899563, + "loss": 0.8802, + "step": 19287 + }, + { + "epoch": 0.49526175494005176, + "grad_norm": 0.77734375, + "learning_rate": 0.00016043411640523646, + "loss": 0.9515, + "step": 19288 + }, + { + "epoch": 0.4952874321359736, + "grad_norm": 0.7578125, + "learning_rate": 0.00016043055960103753, + "loss": 0.7889, + "step": 19289 + }, + { + "epoch": 0.49531310933189543, + "grad_norm": 0.76171875, + "learning_rate": 0.00016042700267640596, + "loss": 0.9596, + "step": 19290 + }, + { + "epoch": 0.4953387865278172, + "grad_norm": 0.83203125, + "learning_rate": 0.00016042344563134886, + "loss": 0.8715, + "step": 19291 + }, + { + "epoch": 0.49536446372373905, + "grad_norm": 0.83203125, + "learning_rate": 0.00016041988846587327, + "loss": 0.7875, + "step": 19292 + }, + { + "epoch": 0.4953901409196609, + "grad_norm": 0.73828125, + "learning_rate": 0.00016041633117998632, + "loss": 0.8509, + "step": 19293 + }, + { + "epoch": 0.49541581811558266, + "grad_norm": 0.796875, + "learning_rate": 0.00016041277377369504, + "loss": 0.9065, + "step": 19294 + }, + { + "epoch": 0.4954414953115045, + "grad_norm": 0.75390625, + "learning_rate": 0.00016040921624700655, + "loss": 0.8098, + "step": 19295 + }, + { + "epoch": 0.49546717250742633, + "grad_norm": 0.8046875, + "learning_rate": 0.000160405658599928, + "loss": 0.9903, + "step": 19296 + }, + { + "epoch": 0.49549284970334817, + "grad_norm": 0.765625, + "learning_rate": 0.0001604021008324664, + "loss": 0.8774, + "step": 19297 + }, + { + "epoch": 0.49551852689926995, + "grad_norm": 0.83984375, + "learning_rate": 0.00016039854294462892, + "loss": 0.8713, + "step": 19298 + }, + { + "epoch": 0.4955442040951918, + "grad_norm": 0.71484375, + "learning_rate": 0.00016039498493642252, + "loss": 0.7492, + "step": 19299 + }, + { + "epoch": 0.4955698812911136, + "grad_norm": 0.7890625, + "learning_rate": 0.00016039142680785443, + "loss": 0.8659, + "step": 19300 + }, + { + "epoch": 0.4955955584870354, + "grad_norm": 0.78125, + "learning_rate": 0.00016038786855893165, + "loss": 0.942, + "step": 19301 + }, + { + "epoch": 0.49562123568295724, + "grad_norm": 0.72265625, + "learning_rate": 0.0001603843101896613, + "loss": 0.7701, + "step": 19302 + }, + { + "epoch": 0.4956469128788791, + "grad_norm": 0.765625, + "learning_rate": 0.00016038075170005046, + "loss": 0.8561, + "step": 19303 + }, + { + "epoch": 0.49567259007480086, + "grad_norm": 0.78515625, + "learning_rate": 0.00016037719309010628, + "loss": 0.9052, + "step": 19304 + }, + { + "epoch": 0.4956982672707227, + "grad_norm": 0.76171875, + "learning_rate": 0.0001603736343598358, + "loss": 0.9032, + "step": 19305 + }, + { + "epoch": 0.49572394446664453, + "grad_norm": 0.73046875, + "learning_rate": 0.0001603700755092461, + "loss": 1.0151, + "step": 19306 + }, + { + "epoch": 0.49574962166256636, + "grad_norm": 0.78515625, + "learning_rate": 0.0001603665165383443, + "loss": 0.9269, + "step": 19307 + }, + { + "epoch": 0.49577529885848814, + "grad_norm": 0.87890625, + "learning_rate": 0.0001603629574471375, + "loss": 0.9031, + "step": 19308 + }, + { + "epoch": 0.49580097605441, + "grad_norm": 0.79296875, + "learning_rate": 0.00016035939823563276, + "loss": 0.9239, + "step": 19309 + }, + { + "epoch": 0.4958266532503318, + "grad_norm": 0.78515625, + "learning_rate": 0.00016035583890383718, + "loss": 0.9681, + "step": 19310 + }, + { + "epoch": 0.4958523304462536, + "grad_norm": 0.7734375, + "learning_rate": 0.0001603522794517579, + "loss": 0.7657, + "step": 19311 + }, + { + "epoch": 0.49587800764217543, + "grad_norm": 0.85546875, + "learning_rate": 0.00016034871987940196, + "loss": 0.9157, + "step": 19312 + }, + { + "epoch": 0.49590368483809727, + "grad_norm": 0.73046875, + "learning_rate": 0.00016034516018677647, + "loss": 0.8818, + "step": 19313 + }, + { + "epoch": 0.49592936203401905, + "grad_norm": 0.7578125, + "learning_rate": 0.00016034160037388853, + "loss": 0.8225, + "step": 19314 + }, + { + "epoch": 0.4959550392299409, + "grad_norm": 0.75, + "learning_rate": 0.00016033804044074523, + "loss": 0.849, + "step": 19315 + }, + { + "epoch": 0.4959807164258627, + "grad_norm": 0.78515625, + "learning_rate": 0.00016033448038735366, + "loss": 0.9037, + "step": 19316 + }, + { + "epoch": 0.49600639362178456, + "grad_norm": 0.7578125, + "learning_rate": 0.00016033092021372093, + "loss": 0.8007, + "step": 19317 + }, + { + "epoch": 0.49603207081770634, + "grad_norm": 0.734375, + "learning_rate": 0.00016032735991985413, + "loss": 0.7612, + "step": 19318 + }, + { + "epoch": 0.4960577480136282, + "grad_norm": 0.82421875, + "learning_rate": 0.00016032379950576035, + "loss": 0.9752, + "step": 19319 + }, + { + "epoch": 0.49608342520955, + "grad_norm": 0.76953125, + "learning_rate": 0.00016032023897144667, + "loss": 0.9643, + "step": 19320 + }, + { + "epoch": 0.4961091024054718, + "grad_norm": 0.73828125, + "learning_rate": 0.00016031667831692024, + "loss": 0.787, + "step": 19321 + }, + { + "epoch": 0.4961347796013936, + "grad_norm": 0.69921875, + "learning_rate": 0.00016031311754218807, + "loss": 0.742, + "step": 19322 + }, + { + "epoch": 0.49616045679731546, + "grad_norm": 0.76953125, + "learning_rate": 0.0001603095566472573, + "loss": 0.8527, + "step": 19323 + }, + { + "epoch": 0.49618613399323724, + "grad_norm": 0.75, + "learning_rate": 0.0001603059956321351, + "loss": 0.8082, + "step": 19324 + }, + { + "epoch": 0.4962118111891591, + "grad_norm": 0.75390625, + "learning_rate": 0.00016030243449682846, + "loss": 0.8936, + "step": 19325 + }, + { + "epoch": 0.4962374883850809, + "grad_norm": 0.73828125, + "learning_rate": 0.00016029887324134448, + "loss": 0.8494, + "step": 19326 + }, + { + "epoch": 0.49626316558100275, + "grad_norm": 0.77734375, + "learning_rate": 0.00016029531186569033, + "loss": 0.8724, + "step": 19327 + }, + { + "epoch": 0.49628884277692453, + "grad_norm": 0.765625, + "learning_rate": 0.0001602917503698731, + "loss": 0.9662, + "step": 19328 + }, + { + "epoch": 0.49631451997284637, + "grad_norm": 0.8125, + "learning_rate": 0.0001602881887538998, + "loss": 0.9319, + "step": 19329 + }, + { + "epoch": 0.4963401971687682, + "grad_norm": 0.6953125, + "learning_rate": 0.0001602846270177776, + "loss": 0.7889, + "step": 19330 + }, + { + "epoch": 0.49636587436469, + "grad_norm": 0.7734375, + "learning_rate": 0.0001602810651615136, + "loss": 0.8505, + "step": 19331 + }, + { + "epoch": 0.4963915515606118, + "grad_norm": 0.734375, + "learning_rate": 0.00016027750318511487, + "loss": 0.8615, + "step": 19332 + }, + { + "epoch": 0.49641722875653366, + "grad_norm": 0.7578125, + "learning_rate": 0.00016027394108858854, + "loss": 0.8445, + "step": 19333 + }, + { + "epoch": 0.49644290595245544, + "grad_norm": 0.82421875, + "learning_rate": 0.00016027037887194164, + "loss": 0.8867, + "step": 19334 + }, + { + "epoch": 0.4964685831483773, + "grad_norm": 0.78125, + "learning_rate": 0.00016026681653518137, + "loss": 0.7465, + "step": 19335 + }, + { + "epoch": 0.4964942603442991, + "grad_norm": 0.8671875, + "learning_rate": 0.00016026325407831476, + "loss": 0.941, + "step": 19336 + }, + { + "epoch": 0.49651993754022095, + "grad_norm": 0.82421875, + "learning_rate": 0.00016025969150134894, + "loss": 0.844, + "step": 19337 + }, + { + "epoch": 0.4965456147361427, + "grad_norm": 0.7109375, + "learning_rate": 0.00016025612880429097, + "loss": 0.7837, + "step": 19338 + }, + { + "epoch": 0.49657129193206456, + "grad_norm": 0.81640625, + "learning_rate": 0.00016025256598714801, + "loss": 0.9836, + "step": 19339 + }, + { + "epoch": 0.4965969691279864, + "grad_norm": 0.7890625, + "learning_rate": 0.00016024900304992712, + "loss": 0.7911, + "step": 19340 + }, + { + "epoch": 0.4966226463239082, + "grad_norm": 0.7578125, + "learning_rate": 0.0001602454399926354, + "loss": 0.9034, + "step": 19341 + }, + { + "epoch": 0.49664832351983, + "grad_norm": 0.84765625, + "learning_rate": 0.00016024187681527995, + "loss": 1.0112, + "step": 19342 + }, + { + "epoch": 0.49667400071575185, + "grad_norm": 0.8046875, + "learning_rate": 0.0001602383135178679, + "loss": 0.8532, + "step": 19343 + }, + { + "epoch": 0.49669967791167363, + "grad_norm": 0.84375, + "learning_rate": 0.00016023475010040633, + "loss": 0.8855, + "step": 19344 + }, + { + "epoch": 0.49672535510759547, + "grad_norm": 0.76171875, + "learning_rate": 0.00016023118656290235, + "loss": 0.9065, + "step": 19345 + }, + { + "epoch": 0.4967510323035173, + "grad_norm": 0.8203125, + "learning_rate": 0.00016022762290536304, + "loss": 1.0071, + "step": 19346 + }, + { + "epoch": 0.49677670949943914, + "grad_norm": 0.76171875, + "learning_rate": 0.00016022405912779553, + "loss": 0.919, + "step": 19347 + }, + { + "epoch": 0.4968023866953609, + "grad_norm": 0.765625, + "learning_rate": 0.0001602204952302069, + "loss": 0.9208, + "step": 19348 + }, + { + "epoch": 0.49682806389128276, + "grad_norm": 0.78515625, + "learning_rate": 0.00016021693121260429, + "loss": 0.9327, + "step": 19349 + }, + { + "epoch": 0.4968537410872046, + "grad_norm": 0.7734375, + "learning_rate": 0.00016021336707499473, + "loss": 0.9031, + "step": 19350 + }, + { + "epoch": 0.4968794182831264, + "grad_norm": 0.77734375, + "learning_rate": 0.0001602098028173854, + "loss": 0.9853, + "step": 19351 + }, + { + "epoch": 0.4969050954790482, + "grad_norm": 0.7890625, + "learning_rate": 0.00016020623843978338, + "loss": 0.9811, + "step": 19352 + }, + { + "epoch": 0.49693077267497004, + "grad_norm": 0.8046875, + "learning_rate": 0.00016020267394219575, + "loss": 1.0786, + "step": 19353 + }, + { + "epoch": 0.4969564498708918, + "grad_norm": 0.76953125, + "learning_rate": 0.0001601991093246296, + "loss": 0.8732, + "step": 19354 + }, + { + "epoch": 0.49698212706681366, + "grad_norm": 0.84375, + "learning_rate": 0.0001601955445870921, + "loss": 0.9101, + "step": 19355 + }, + { + "epoch": 0.4970078042627355, + "grad_norm": 0.76953125, + "learning_rate": 0.0001601919797295903, + "loss": 0.9337, + "step": 19356 + }, + { + "epoch": 0.49703348145865733, + "grad_norm": 0.7734375, + "learning_rate": 0.00016018841475213137, + "loss": 0.9579, + "step": 19357 + }, + { + "epoch": 0.4970591586545791, + "grad_norm": 0.703125, + "learning_rate": 0.0001601848496547223, + "loss": 0.8625, + "step": 19358 + }, + { + "epoch": 0.49708483585050095, + "grad_norm": 0.76953125, + "learning_rate": 0.0001601812844373703, + "loss": 0.7726, + "step": 19359 + }, + { + "epoch": 0.4971105130464228, + "grad_norm": 0.77734375, + "learning_rate": 0.00016017771910008239, + "loss": 0.8656, + "step": 19360 + }, + { + "epoch": 0.49713619024234457, + "grad_norm": 0.77734375, + "learning_rate": 0.00016017415364286575, + "loss": 0.8108, + "step": 19361 + }, + { + "epoch": 0.4971618674382664, + "grad_norm": 0.8203125, + "learning_rate": 0.00016017058806572747, + "loss": 0.8559, + "step": 19362 + }, + { + "epoch": 0.49718754463418824, + "grad_norm": 0.78125, + "learning_rate": 0.00016016702236867463, + "loss": 0.8655, + "step": 19363 + }, + { + "epoch": 0.49721322183011, + "grad_norm": 0.76953125, + "learning_rate": 0.00016016345655171435, + "loss": 0.83, + "step": 19364 + }, + { + "epoch": 0.49723889902603186, + "grad_norm": 0.86328125, + "learning_rate": 0.00016015989061485378, + "loss": 0.8988, + "step": 19365 + }, + { + "epoch": 0.4972645762219537, + "grad_norm": 0.74609375, + "learning_rate": 0.00016015632455809994, + "loss": 0.9532, + "step": 19366 + }, + { + "epoch": 0.4972902534178755, + "grad_norm": 0.83984375, + "learning_rate": 0.00016015275838145995, + "loss": 0.8672, + "step": 19367 + }, + { + "epoch": 0.4973159306137973, + "grad_norm": 0.84765625, + "learning_rate": 0.000160149192084941, + "loss": 0.8204, + "step": 19368 + }, + { + "epoch": 0.49734160780971914, + "grad_norm": 0.73046875, + "learning_rate": 0.0001601456256685501, + "loss": 0.8181, + "step": 19369 + }, + { + "epoch": 0.497367285005641, + "grad_norm": 0.73046875, + "learning_rate": 0.00016014205913229446, + "loss": 0.8032, + "step": 19370 + }, + { + "epoch": 0.49739296220156276, + "grad_norm": 0.78515625, + "learning_rate": 0.00016013849247618106, + "loss": 0.7918, + "step": 19371 + }, + { + "epoch": 0.4974186393974846, + "grad_norm": 0.796875, + "learning_rate": 0.00016013492570021712, + "loss": 0.8645, + "step": 19372 + }, + { + "epoch": 0.49744431659340643, + "grad_norm": 0.80078125, + "learning_rate": 0.00016013135880440967, + "loss": 0.8977, + "step": 19373 + }, + { + "epoch": 0.4974699937893282, + "grad_norm": 0.69140625, + "learning_rate": 0.00016012779178876588, + "loss": 0.7334, + "step": 19374 + }, + { + "epoch": 0.49749567098525005, + "grad_norm": 0.83984375, + "learning_rate": 0.00016012422465329284, + "loss": 0.9179, + "step": 19375 + }, + { + "epoch": 0.4975213481811719, + "grad_norm": 0.76953125, + "learning_rate": 0.00016012065739799764, + "loss": 1.0869, + "step": 19376 + }, + { + "epoch": 0.4975470253770937, + "grad_norm": 0.75390625, + "learning_rate": 0.0001601170900228874, + "loss": 1.0858, + "step": 19377 + }, + { + "epoch": 0.4975727025730155, + "grad_norm": 0.7578125, + "learning_rate": 0.00016011352252796922, + "loss": 0.9303, + "step": 19378 + }, + { + "epoch": 0.49759837976893734, + "grad_norm": 0.71484375, + "learning_rate": 0.00016010995491325022, + "loss": 0.8859, + "step": 19379 + }, + { + "epoch": 0.4976240569648592, + "grad_norm": 0.7421875, + "learning_rate": 0.00016010638717873753, + "loss": 0.8667, + "step": 19380 + }, + { + "epoch": 0.49764973416078095, + "grad_norm": 0.6953125, + "learning_rate": 0.00016010281932443823, + "loss": 0.895, + "step": 19381 + }, + { + "epoch": 0.4976754113567028, + "grad_norm": 0.7265625, + "learning_rate": 0.00016009925135035944, + "loss": 0.7871, + "step": 19382 + }, + { + "epoch": 0.4977010885526246, + "grad_norm": 0.73828125, + "learning_rate": 0.00016009568325650827, + "loss": 0.9218, + "step": 19383 + }, + { + "epoch": 0.4977267657485464, + "grad_norm": 0.859375, + "learning_rate": 0.00016009211504289182, + "loss": 0.9464, + "step": 19384 + }, + { + "epoch": 0.49775244294446824, + "grad_norm": 0.76953125, + "learning_rate": 0.00016008854670951725, + "loss": 0.879, + "step": 19385 + }, + { + "epoch": 0.4977781201403901, + "grad_norm": 0.765625, + "learning_rate": 0.00016008497825639162, + "loss": 0.9381, + "step": 19386 + }, + { + "epoch": 0.4978037973363119, + "grad_norm": 0.7578125, + "learning_rate": 0.000160081409683522, + "loss": 0.9331, + "step": 19387 + }, + { + "epoch": 0.4978294745322337, + "grad_norm": 0.72265625, + "learning_rate": 0.0001600778409909156, + "loss": 0.8196, + "step": 19388 + }, + { + "epoch": 0.49785515172815553, + "grad_norm": 0.88671875, + "learning_rate": 0.00016007427217857952, + "loss": 0.6991, + "step": 19389 + }, + { + "epoch": 0.49788082892407737, + "grad_norm": 0.73046875, + "learning_rate": 0.00016007070324652084, + "loss": 0.8784, + "step": 19390 + }, + { + "epoch": 0.49790650611999915, + "grad_norm": 0.80078125, + "learning_rate": 0.0001600671341947466, + "loss": 0.8146, + "step": 19391 + }, + { + "epoch": 0.497932183315921, + "grad_norm": 0.73828125, + "learning_rate": 0.00016006356502326405, + "loss": 0.8817, + "step": 19392 + }, + { + "epoch": 0.4979578605118428, + "grad_norm": 0.6875, + "learning_rate": 0.00016005999573208022, + "loss": 0.7921, + "step": 19393 + }, + { + "epoch": 0.4979835377077646, + "grad_norm": 0.8046875, + "learning_rate": 0.00016005642632120224, + "loss": 1.0463, + "step": 19394 + }, + { + "epoch": 0.49800921490368644, + "grad_norm": 0.80859375, + "learning_rate": 0.00016005285679063724, + "loss": 1.0286, + "step": 19395 + }, + { + "epoch": 0.4980348920996083, + "grad_norm": 0.8359375, + "learning_rate": 0.00016004928714039233, + "loss": 0.9096, + "step": 19396 + }, + { + "epoch": 0.4980605692955301, + "grad_norm": 0.796875, + "learning_rate": 0.00016004571737047462, + "loss": 0.9538, + "step": 19397 + }, + { + "epoch": 0.4980862464914519, + "grad_norm": 0.8203125, + "learning_rate": 0.0001600421474808912, + "loss": 0.7543, + "step": 19398 + }, + { + "epoch": 0.4981119236873737, + "grad_norm": 0.76953125, + "learning_rate": 0.0001600385774716492, + "loss": 0.7958, + "step": 19399 + }, + { + "epoch": 0.49813760088329556, + "grad_norm": 0.703125, + "learning_rate": 0.00016003500734275574, + "loss": 0.8412, + "step": 19400 + }, + { + "epoch": 0.49816327807921734, + "grad_norm": 0.78125, + "learning_rate": 0.00016003143709421793, + "loss": 1.0333, + "step": 19401 + }, + { + "epoch": 0.4981889552751392, + "grad_norm": 0.71484375, + "learning_rate": 0.0001600278667260429, + "loss": 0.8105, + "step": 19402 + }, + { + "epoch": 0.498214632471061, + "grad_norm": 0.79296875, + "learning_rate": 0.00016002429623823775, + "loss": 0.932, + "step": 19403 + }, + { + "epoch": 0.4982403096669828, + "grad_norm": 0.74609375, + "learning_rate": 0.00016002072563080962, + "loss": 1.0813, + "step": 19404 + }, + { + "epoch": 0.49826598686290463, + "grad_norm": 0.7578125, + "learning_rate": 0.00016001715490376558, + "loss": 0.8606, + "step": 19405 + }, + { + "epoch": 0.49829166405882647, + "grad_norm": 0.80078125, + "learning_rate": 0.0001600135840571128, + "loss": 0.9139, + "step": 19406 + }, + { + "epoch": 0.4983173412547483, + "grad_norm": 0.77734375, + "learning_rate": 0.0001600100130908583, + "loss": 0.6974, + "step": 19407 + }, + { + "epoch": 0.4983430184506701, + "grad_norm": 0.92578125, + "learning_rate": 0.0001600064420050093, + "loss": 0.9093, + "step": 19408 + }, + { + "epoch": 0.4983686956465919, + "grad_norm": 0.7265625, + "learning_rate": 0.0001600028707995729, + "loss": 0.9119, + "step": 19409 + }, + { + "epoch": 0.49839437284251376, + "grad_norm": 0.76171875, + "learning_rate": 0.0001599992994745562, + "loss": 0.9255, + "step": 19410 + }, + { + "epoch": 0.49842005003843554, + "grad_norm": 0.75390625, + "learning_rate": 0.0001599957280299663, + "loss": 0.966, + "step": 19411 + }, + { + "epoch": 0.49844572723435737, + "grad_norm": 0.73828125, + "learning_rate": 0.00015999215646581033, + "loss": 0.9035, + "step": 19412 + }, + { + "epoch": 0.4984714044302792, + "grad_norm": 0.82421875, + "learning_rate": 0.00015998858478209543, + "loss": 0.8939, + "step": 19413 + }, + { + "epoch": 0.498497081626201, + "grad_norm": 0.8203125, + "learning_rate": 0.0001599850129788287, + "loss": 0.8076, + "step": 19414 + }, + { + "epoch": 0.4985227588221228, + "grad_norm": 0.83203125, + "learning_rate": 0.00015998144105601724, + "loss": 0.9197, + "step": 19415 + }, + { + "epoch": 0.49854843601804466, + "grad_norm": 0.73828125, + "learning_rate": 0.0001599778690136682, + "loss": 0.7707, + "step": 19416 + }, + { + "epoch": 0.49857411321396644, + "grad_norm": 0.78515625, + "learning_rate": 0.00015997429685178868, + "loss": 0.931, + "step": 19417 + }, + { + "epoch": 0.4985997904098883, + "grad_norm": 0.73828125, + "learning_rate": 0.0001599707245703858, + "loss": 0.9067, + "step": 19418 + }, + { + "epoch": 0.4986254676058101, + "grad_norm": 0.70703125, + "learning_rate": 0.00015996715216946668, + "loss": 0.909, + "step": 19419 + }, + { + "epoch": 0.49865114480173195, + "grad_norm": 0.78515625, + "learning_rate": 0.00015996357964903843, + "loss": 0.8827, + "step": 19420 + }, + { + "epoch": 0.49867682199765373, + "grad_norm": 0.8515625, + "learning_rate": 0.0001599600070091082, + "loss": 0.8674, + "step": 19421 + }, + { + "epoch": 0.49870249919357557, + "grad_norm": 0.734375, + "learning_rate": 0.00015995643424968309, + "loss": 0.9161, + "step": 19422 + }, + { + "epoch": 0.4987281763894974, + "grad_norm": 0.76953125, + "learning_rate": 0.00015995286137077025, + "loss": 1.0166, + "step": 19423 + }, + { + "epoch": 0.4987538535854192, + "grad_norm": 0.74609375, + "learning_rate": 0.00015994928837237673, + "loss": 0.9422, + "step": 19424 + }, + { + "epoch": 0.498779530781341, + "grad_norm": 0.80859375, + "learning_rate": 0.00015994571525450971, + "loss": 0.8038, + "step": 19425 + }, + { + "epoch": 0.49880520797726285, + "grad_norm": 0.703125, + "learning_rate": 0.0001599421420171763, + "loss": 0.7588, + "step": 19426 + }, + { + "epoch": 0.49883088517318463, + "grad_norm": 0.796875, + "learning_rate": 0.00015993856866038358, + "loss": 0.8572, + "step": 19427 + }, + { + "epoch": 0.49885656236910647, + "grad_norm": 0.78125, + "learning_rate": 0.00015993499518413877, + "loss": 0.9722, + "step": 19428 + }, + { + "epoch": 0.4988822395650283, + "grad_norm": 0.87890625, + "learning_rate": 0.0001599314215884489, + "loss": 0.8662, + "step": 19429 + }, + { + "epoch": 0.49890791676095014, + "grad_norm": 0.81640625, + "learning_rate": 0.00015992784787332112, + "loss": 0.8356, + "step": 19430 + }, + { + "epoch": 0.4989335939568719, + "grad_norm": 0.8125, + "learning_rate": 0.00015992427403876253, + "loss": 0.9864, + "step": 19431 + }, + { + "epoch": 0.49895927115279376, + "grad_norm": 0.7421875, + "learning_rate": 0.0001599207000847803, + "loss": 0.843, + "step": 19432 + }, + { + "epoch": 0.4989849483487156, + "grad_norm": 0.91796875, + "learning_rate": 0.0001599171260113815, + "loss": 0.8135, + "step": 19433 + }, + { + "epoch": 0.4990106255446374, + "grad_norm": 0.7890625, + "learning_rate": 0.0001599135518185733, + "loss": 0.9597, + "step": 19434 + }, + { + "epoch": 0.4990363027405592, + "grad_norm": 0.7421875, + "learning_rate": 0.00015990997750636282, + "loss": 0.9187, + "step": 19435 + }, + { + "epoch": 0.49906197993648105, + "grad_norm": 0.74609375, + "learning_rate": 0.00015990640307475716, + "loss": 0.9543, + "step": 19436 + }, + { + "epoch": 0.49908765713240283, + "grad_norm": 0.8125, + "learning_rate": 0.00015990282852376342, + "loss": 1.0355, + "step": 19437 + }, + { + "epoch": 0.49911333432832466, + "grad_norm": 0.76953125, + "learning_rate": 0.00015989925385338877, + "loss": 0.9457, + "step": 19438 + }, + { + "epoch": 0.4991390115242465, + "grad_norm": 0.8046875, + "learning_rate": 0.00015989567906364035, + "loss": 1.0172, + "step": 19439 + }, + { + "epoch": 0.49916468872016834, + "grad_norm": 0.79296875, + "learning_rate": 0.0001598921041545252, + "loss": 0.915, + "step": 19440 + }, + { + "epoch": 0.4991903659160901, + "grad_norm": 0.78125, + "learning_rate": 0.00015988852912605052, + "loss": 0.8978, + "step": 19441 + }, + { + "epoch": 0.49921604311201195, + "grad_norm": 0.7421875, + "learning_rate": 0.0001598849539782234, + "loss": 0.9019, + "step": 19442 + }, + { + "epoch": 0.4992417203079338, + "grad_norm": 0.94140625, + "learning_rate": 0.000159881378711051, + "loss": 0.8214, + "step": 19443 + }, + { + "epoch": 0.49926739750385557, + "grad_norm": 0.796875, + "learning_rate": 0.0001598778033245404, + "loss": 0.8443, + "step": 19444 + }, + { + "epoch": 0.4992930746997774, + "grad_norm": 0.7734375, + "learning_rate": 0.00015987422781869876, + "loss": 0.933, + "step": 19445 + }, + { + "epoch": 0.49931875189569924, + "grad_norm": 0.8046875, + "learning_rate": 0.0001598706521935332, + "loss": 1.0368, + "step": 19446 + }, + { + "epoch": 0.499344429091621, + "grad_norm": 0.73828125, + "learning_rate": 0.00015986707644905082, + "loss": 0.8872, + "step": 19447 + }, + { + "epoch": 0.49937010628754286, + "grad_norm": 0.74609375, + "learning_rate": 0.00015986350058525876, + "loss": 0.886, + "step": 19448 + }, + { + "epoch": 0.4993957834834647, + "grad_norm": 0.75390625, + "learning_rate": 0.0001598599246021642, + "loss": 0.8952, + "step": 19449 + }, + { + "epoch": 0.49942146067938653, + "grad_norm": 0.7265625, + "learning_rate": 0.00015985634849977415, + "loss": 0.9343, + "step": 19450 + }, + { + "epoch": 0.4994471378753083, + "grad_norm": 0.76953125, + "learning_rate": 0.00015985277227809584, + "loss": 0.8478, + "step": 19451 + }, + { + "epoch": 0.49947281507123015, + "grad_norm": 0.83203125, + "learning_rate": 0.00015984919593713636, + "loss": 1.0489, + "step": 19452 + }, + { + "epoch": 0.499498492267152, + "grad_norm": 0.7734375, + "learning_rate": 0.0001598456194769028, + "loss": 0.789, + "step": 19453 + }, + { + "epoch": 0.49952416946307376, + "grad_norm": 0.7421875, + "learning_rate": 0.0001598420428974024, + "loss": 0.9232, + "step": 19454 + }, + { + "epoch": 0.4995498466589956, + "grad_norm": 0.77734375, + "learning_rate": 0.00015983846619864215, + "loss": 0.9259, + "step": 19455 + }, + { + "epoch": 0.49957552385491744, + "grad_norm": 0.6953125, + "learning_rate": 0.00015983488938062928, + "loss": 0.7369, + "step": 19456 + }, + { + "epoch": 0.4996012010508392, + "grad_norm": 0.75, + "learning_rate": 0.00015983131244337085, + "loss": 0.7409, + "step": 19457 + }, + { + "epoch": 0.49962687824676105, + "grad_norm": 0.84375, + "learning_rate": 0.00015982773538687404, + "loss": 1.0464, + "step": 19458 + }, + { + "epoch": 0.4996525554426829, + "grad_norm": 0.828125, + "learning_rate": 0.00015982415821114593, + "loss": 0.9413, + "step": 19459 + }, + { + "epoch": 0.4996782326386047, + "grad_norm": 0.7421875, + "learning_rate": 0.00015982058091619373, + "loss": 0.8818, + "step": 19460 + }, + { + "epoch": 0.4997039098345265, + "grad_norm": 0.8046875, + "learning_rate": 0.00015981700350202447, + "loss": 0.8659, + "step": 19461 + }, + { + "epoch": 0.49972958703044834, + "grad_norm": 0.75390625, + "learning_rate": 0.00015981342596864534, + "loss": 0.8172, + "step": 19462 + }, + { + "epoch": 0.4997552642263702, + "grad_norm": 0.7421875, + "learning_rate": 0.00015980984831606346, + "loss": 1.0051, + "step": 19463 + }, + { + "epoch": 0.49978094142229196, + "grad_norm": 0.7890625, + "learning_rate": 0.00015980627054428594, + "loss": 0.9336, + "step": 19464 + }, + { + "epoch": 0.4998066186182138, + "grad_norm": 0.76171875, + "learning_rate": 0.00015980269265331994, + "loss": 0.8437, + "step": 19465 + }, + { + "epoch": 0.49983229581413563, + "grad_norm": 0.80078125, + "learning_rate": 0.00015979911464317255, + "loss": 0.9827, + "step": 19466 + }, + { + "epoch": 0.4998579730100574, + "grad_norm": 0.76953125, + "learning_rate": 0.00015979553651385096, + "loss": 0.8547, + "step": 19467 + }, + { + "epoch": 0.49988365020597925, + "grad_norm": 0.796875, + "learning_rate": 0.00015979195826536225, + "loss": 0.9387, + "step": 19468 + }, + { + "epoch": 0.4999093274019011, + "grad_norm": 0.83984375, + "learning_rate": 0.00015978837989771358, + "loss": 0.9488, + "step": 19469 + }, + { + "epoch": 0.4999350045978229, + "grad_norm": 0.81640625, + "learning_rate": 0.00015978480141091205, + "loss": 1.0032, + "step": 19470 + }, + { + "epoch": 0.4999606817937447, + "grad_norm": 0.71875, + "learning_rate": 0.00015978122280496483, + "loss": 0.9597, + "step": 19471 + }, + { + "epoch": 0.49998635898966654, + "grad_norm": 0.75, + "learning_rate": 0.000159777644079879, + "loss": 0.8135, + "step": 19472 + }, + { + "epoch": 0.5000120361855883, + "grad_norm": 0.81640625, + "learning_rate": 0.00015977406523566177, + "loss": 0.9455, + "step": 19473 + }, + { + "epoch": 0.5000377133815102, + "grad_norm": 0.734375, + "learning_rate": 0.0001597704862723202, + "loss": 0.9278, + "step": 19474 + }, + { + "epoch": 0.500063390577432, + "grad_norm": 0.80078125, + "learning_rate": 0.00015976690718986144, + "loss": 0.9569, + "step": 19475 + }, + { + "epoch": 0.5000890677733538, + "grad_norm": 0.77734375, + "learning_rate": 0.0001597633279882927, + "loss": 1.0054, + "step": 19476 + }, + { + "epoch": 0.5001147449692757, + "grad_norm": 0.70703125, + "learning_rate": 0.00015975974866762097, + "loss": 0.9169, + "step": 19477 + }, + { + "epoch": 0.5001404221651975, + "grad_norm": 0.765625, + "learning_rate": 0.00015975616922785348, + "loss": 0.9156, + "step": 19478 + }, + { + "epoch": 0.5001660993611192, + "grad_norm": 0.734375, + "learning_rate": 0.00015975258966899734, + "loss": 0.9407, + "step": 19479 + }, + { + "epoch": 0.5001917765570411, + "grad_norm": 0.77734375, + "learning_rate": 0.00015974900999105967, + "loss": 0.8125, + "step": 19480 + }, + { + "epoch": 0.5002174537529629, + "grad_norm": 0.73046875, + "learning_rate": 0.00015974543019404766, + "loss": 0.9276, + "step": 19481 + }, + { + "epoch": 0.5002431309488847, + "grad_norm": 0.72265625, + "learning_rate": 0.0001597418502779684, + "loss": 0.8284, + "step": 19482 + }, + { + "epoch": 0.5002688081448066, + "grad_norm": 0.703125, + "learning_rate": 0.00015973827024282904, + "loss": 0.8018, + "step": 19483 + }, + { + "epoch": 0.5002944853407284, + "grad_norm": 0.80078125, + "learning_rate": 0.00015973469008863665, + "loss": 0.9562, + "step": 19484 + }, + { + "epoch": 0.5003201625366502, + "grad_norm": 0.8359375, + "learning_rate": 0.00015973110981539845, + "loss": 0.9177, + "step": 19485 + }, + { + "epoch": 0.500345839732572, + "grad_norm": 0.75390625, + "learning_rate": 0.00015972752942312155, + "loss": 0.6844, + "step": 19486 + }, + { + "epoch": 0.5003715169284938, + "grad_norm": 0.83984375, + "learning_rate": 0.00015972394891181305, + "loss": 0.9663, + "step": 19487 + }, + { + "epoch": 0.5003971941244156, + "grad_norm": 0.72265625, + "learning_rate": 0.00015972036828148015, + "loss": 0.7562, + "step": 19488 + }, + { + "epoch": 0.5004228713203375, + "grad_norm": 0.7109375, + "learning_rate": 0.00015971678753212994, + "loss": 0.8397, + "step": 19489 + }, + { + "epoch": 0.5004485485162593, + "grad_norm": 0.765625, + "learning_rate": 0.00015971320666376956, + "loss": 1.0024, + "step": 19490 + }, + { + "epoch": 0.5004742257121811, + "grad_norm": 0.77734375, + "learning_rate": 0.00015970962567640618, + "loss": 0.8435, + "step": 19491 + }, + { + "epoch": 0.5004999029081029, + "grad_norm": 0.84375, + "learning_rate": 0.00015970604457004686, + "loss": 0.8043, + "step": 19492 + }, + { + "epoch": 0.5005255801040247, + "grad_norm": 0.74609375, + "learning_rate": 0.0001597024633446988, + "loss": 0.8433, + "step": 19493 + }, + { + "epoch": 0.5005512572999465, + "grad_norm": 0.74609375, + "learning_rate": 0.00015969888200036914, + "loss": 0.9282, + "step": 19494 + }, + { + "epoch": 0.5005769344958684, + "grad_norm": 0.79296875, + "learning_rate": 0.00015969530053706502, + "loss": 0.7962, + "step": 19495 + }, + { + "epoch": 0.5006026116917902, + "grad_norm": 0.73828125, + "learning_rate": 0.00015969171895479354, + "loss": 0.9211, + "step": 19496 + }, + { + "epoch": 0.500628288887712, + "grad_norm": 0.81640625, + "learning_rate": 0.00015968813725356182, + "loss": 1.0181, + "step": 19497 + }, + { + "epoch": 0.5006539660836339, + "grad_norm": 0.82421875, + "learning_rate": 0.00015968455543337708, + "loss": 1.1063, + "step": 19498 + }, + { + "epoch": 0.5006796432795556, + "grad_norm": 0.83203125, + "learning_rate": 0.0001596809734942464, + "loss": 0.7969, + "step": 19499 + }, + { + "epoch": 0.5007053204754774, + "grad_norm": 1.1875, + "learning_rate": 0.00015967739143617692, + "loss": 0.9195, + "step": 19500 + }, + { + "epoch": 0.5007309976713993, + "grad_norm": 0.6796875, + "learning_rate": 0.0001596738092591758, + "loss": 0.7744, + "step": 19501 + }, + { + "epoch": 0.5007566748673211, + "grad_norm": 0.74609375, + "learning_rate": 0.0001596702269632502, + "loss": 0.874, + "step": 19502 + }, + { + "epoch": 0.500782352063243, + "grad_norm": 0.7578125, + "learning_rate": 0.00015966664454840717, + "loss": 0.9949, + "step": 19503 + }, + { + "epoch": 0.5008080292591648, + "grad_norm": 0.7421875, + "learning_rate": 0.00015966306201465393, + "loss": 1.0794, + "step": 19504 + }, + { + "epoch": 0.5008337064550866, + "grad_norm": 0.765625, + "learning_rate": 0.0001596594793619976, + "loss": 0.8305, + "step": 19505 + }, + { + "epoch": 0.5008593836510083, + "grad_norm": 0.83203125, + "learning_rate": 0.00015965589659044528, + "loss": 0.9081, + "step": 19506 + }, + { + "epoch": 0.5008850608469302, + "grad_norm": 0.72265625, + "learning_rate": 0.0001596523137000042, + "loss": 0.8743, + "step": 19507 + }, + { + "epoch": 0.500910738042852, + "grad_norm": 0.79296875, + "learning_rate": 0.00015964873069068144, + "loss": 1.0105, + "step": 19508 + }, + { + "epoch": 0.5009364152387739, + "grad_norm": 0.83984375, + "learning_rate": 0.00015964514756248413, + "loss": 0.9503, + "step": 19509 + }, + { + "epoch": 0.5009620924346957, + "grad_norm": 0.73046875, + "learning_rate": 0.00015964156431541942, + "loss": 0.9449, + "step": 19510 + }, + { + "epoch": 0.5009877696306175, + "grad_norm": 0.828125, + "learning_rate": 0.00015963798094949447, + "loss": 0.9434, + "step": 19511 + }, + { + "epoch": 0.5010134468265393, + "grad_norm": 0.734375, + "learning_rate": 0.0001596343974647164, + "loss": 0.8333, + "step": 19512 + }, + { + "epoch": 0.5010391240224611, + "grad_norm": 0.8125, + "learning_rate": 0.0001596308138610924, + "loss": 1.0177, + "step": 19513 + }, + { + "epoch": 0.5010648012183829, + "grad_norm": 0.77734375, + "learning_rate": 0.00015962723013862956, + "loss": 0.9974, + "step": 19514 + }, + { + "epoch": 0.5010904784143048, + "grad_norm": 0.72265625, + "learning_rate": 0.00015962364629733503, + "loss": 0.9734, + "step": 19515 + }, + { + "epoch": 0.5011161556102266, + "grad_norm": 0.8125, + "learning_rate": 0.00015962006233721595, + "loss": 1.0949, + "step": 19516 + }, + { + "epoch": 0.5011418328061484, + "grad_norm": 0.73046875, + "learning_rate": 0.00015961647825827948, + "loss": 0.7584, + "step": 19517 + }, + { + "epoch": 0.5011675100020703, + "grad_norm": 0.74609375, + "learning_rate": 0.00015961289406053278, + "loss": 0.8218, + "step": 19518 + }, + { + "epoch": 0.501193187197992, + "grad_norm": 0.7734375, + "learning_rate": 0.00015960930974398289, + "loss": 0.8305, + "step": 19519 + }, + { + "epoch": 0.5012188643939138, + "grad_norm": 0.73046875, + "learning_rate": 0.0001596057253086371, + "loss": 0.7455, + "step": 19520 + }, + { + "epoch": 0.5012445415898357, + "grad_norm": 0.79296875, + "learning_rate": 0.00015960214075450246, + "loss": 0.8736, + "step": 19521 + }, + { + "epoch": 0.5012702187857575, + "grad_norm": 0.80859375, + "learning_rate": 0.00015959855608158616, + "loss": 1.0155, + "step": 19522 + }, + { + "epoch": 0.5012958959816793, + "grad_norm": 0.8828125, + "learning_rate": 0.0001595949712898953, + "loss": 0.929, + "step": 19523 + }, + { + "epoch": 0.5013215731776012, + "grad_norm": 0.8046875, + "learning_rate": 0.00015959138637943701, + "loss": 0.9293, + "step": 19524 + }, + { + "epoch": 0.501347250373523, + "grad_norm": 0.8671875, + "learning_rate": 0.0001595878013502185, + "loss": 1.0091, + "step": 19525 + }, + { + "epoch": 0.5013729275694447, + "grad_norm": 0.83984375, + "learning_rate": 0.0001595842162022469, + "loss": 0.9447, + "step": 19526 + }, + { + "epoch": 0.5013986047653666, + "grad_norm": 0.8046875, + "learning_rate": 0.00015958063093552934, + "loss": 1.031, + "step": 19527 + }, + { + "epoch": 0.5014242819612884, + "grad_norm": 0.74609375, + "learning_rate": 0.00015957704555007296, + "loss": 0.9306, + "step": 19528 + }, + { + "epoch": 0.5014499591572102, + "grad_norm": 0.9609375, + "learning_rate": 0.00015957346004588492, + "loss": 0.8514, + "step": 19529 + }, + { + "epoch": 0.5014756363531321, + "grad_norm": 0.80859375, + "learning_rate": 0.00015956987442297232, + "loss": 1.0571, + "step": 19530 + }, + { + "epoch": 0.5015013135490539, + "grad_norm": 0.7421875, + "learning_rate": 0.00015956628868134235, + "loss": 0.9796, + "step": 19531 + }, + { + "epoch": 0.5015269907449756, + "grad_norm": 0.82421875, + "learning_rate": 0.0001595627028210022, + "loss": 0.8596, + "step": 19532 + }, + { + "epoch": 0.5015526679408975, + "grad_norm": 0.78515625, + "learning_rate": 0.0001595591168419589, + "loss": 0.8472, + "step": 19533 + }, + { + "epoch": 0.5015783451368193, + "grad_norm": 0.87109375, + "learning_rate": 0.00015955553074421968, + "loss": 0.962, + "step": 19534 + }, + { + "epoch": 0.5016040223327412, + "grad_norm": 0.84765625, + "learning_rate": 0.00015955194452779166, + "loss": 0.873, + "step": 19535 + }, + { + "epoch": 0.501629699528663, + "grad_norm": 0.84765625, + "learning_rate": 0.00015954835819268203, + "loss": 0.9538, + "step": 19536 + }, + { + "epoch": 0.5016553767245848, + "grad_norm": 0.8359375, + "learning_rate": 0.00015954477173889785, + "loss": 0.9601, + "step": 19537 + }, + { + "epoch": 0.5016810539205067, + "grad_norm": 0.84765625, + "learning_rate": 0.00015954118516644633, + "loss": 0.9668, + "step": 19538 + }, + { + "epoch": 0.5017067311164284, + "grad_norm": 0.8359375, + "learning_rate": 0.00015953759847533463, + "loss": 0.9147, + "step": 19539 + }, + { + "epoch": 0.5017324083123502, + "grad_norm": 0.71875, + "learning_rate": 0.00015953401166556987, + "loss": 0.9435, + "step": 19540 + }, + { + "epoch": 0.5017580855082721, + "grad_norm": 0.7578125, + "learning_rate": 0.0001595304247371592, + "loss": 0.7622, + "step": 19541 + }, + { + "epoch": 0.5017837627041939, + "grad_norm": 0.72265625, + "learning_rate": 0.00015952683769010977, + "loss": 0.8187, + "step": 19542 + }, + { + "epoch": 0.5018094399001157, + "grad_norm": 0.75, + "learning_rate": 0.00015952325052442871, + "loss": 0.8993, + "step": 19543 + }, + { + "epoch": 0.5018351170960376, + "grad_norm": 0.71484375, + "learning_rate": 0.00015951966324012322, + "loss": 1.003, + "step": 19544 + }, + { + "epoch": 0.5018607942919594, + "grad_norm": 0.76953125, + "learning_rate": 0.00015951607583720038, + "loss": 0.8611, + "step": 19545 + }, + { + "epoch": 0.5018864714878811, + "grad_norm": 0.72265625, + "learning_rate": 0.00015951248831566744, + "loss": 0.9499, + "step": 19546 + }, + { + "epoch": 0.501912148683803, + "grad_norm": 0.80078125, + "learning_rate": 0.0001595089006755314, + "loss": 0.9091, + "step": 19547 + }, + { + "epoch": 0.5019378258797248, + "grad_norm": 0.74609375, + "learning_rate": 0.00015950531291679957, + "loss": 0.8886, + "step": 19548 + }, + { + "epoch": 0.5019635030756466, + "grad_norm": 0.8515625, + "learning_rate": 0.00015950172503947897, + "loss": 0.964, + "step": 19549 + }, + { + "epoch": 0.5019891802715685, + "grad_norm": 0.765625, + "learning_rate": 0.00015949813704357687, + "loss": 0.9255, + "step": 19550 + }, + { + "epoch": 0.5020148574674903, + "grad_norm": 0.79296875, + "learning_rate": 0.0001594945489291003, + "loss": 0.9391, + "step": 19551 + }, + { + "epoch": 0.502040534663412, + "grad_norm": 0.77734375, + "learning_rate": 0.00015949096069605649, + "loss": 0.7167, + "step": 19552 + }, + { + "epoch": 0.5020662118593339, + "grad_norm": 0.7734375, + "learning_rate": 0.00015948737234445257, + "loss": 1.1207, + "step": 19553 + }, + { + "epoch": 0.5020918890552557, + "grad_norm": 0.82421875, + "learning_rate": 0.0001594837838742957, + "loss": 0.8505, + "step": 19554 + }, + { + "epoch": 0.5021175662511775, + "grad_norm": 0.796875, + "learning_rate": 0.00015948019528559302, + "loss": 0.8237, + "step": 19555 + }, + { + "epoch": 0.5021432434470994, + "grad_norm": 0.79296875, + "learning_rate": 0.00015947660657835168, + "loss": 0.9162, + "step": 19556 + }, + { + "epoch": 0.5021689206430212, + "grad_norm": 0.75390625, + "learning_rate": 0.00015947301775257882, + "loss": 0.8829, + "step": 19557 + }, + { + "epoch": 0.502194597838943, + "grad_norm": 0.76953125, + "learning_rate": 0.0001594694288082816, + "loss": 0.9616, + "step": 19558 + }, + { + "epoch": 0.5022202750348648, + "grad_norm": 0.82421875, + "learning_rate": 0.00015946583974546722, + "loss": 0.7633, + "step": 19559 + }, + { + "epoch": 0.5022459522307866, + "grad_norm": 0.75390625, + "learning_rate": 0.00015946225056414276, + "loss": 0.9309, + "step": 19560 + }, + { + "epoch": 0.5022716294267084, + "grad_norm": 0.765625, + "learning_rate": 0.0001594586612643154, + "loss": 0.808, + "step": 19561 + }, + { + "epoch": 0.5022973066226303, + "grad_norm": 0.81640625, + "learning_rate": 0.00015945507184599234, + "loss": 0.8985, + "step": 19562 + }, + { + "epoch": 0.5023229838185521, + "grad_norm": 0.73046875, + "learning_rate": 0.00015945148230918067, + "loss": 1.0321, + "step": 19563 + }, + { + "epoch": 0.502348661014474, + "grad_norm": 0.76171875, + "learning_rate": 0.00015944789265388755, + "loss": 0.8338, + "step": 19564 + }, + { + "epoch": 0.5023743382103958, + "grad_norm": 0.70703125, + "learning_rate": 0.00015944430288012017, + "loss": 0.8442, + "step": 19565 + }, + { + "epoch": 0.5024000154063175, + "grad_norm": 0.8125, + "learning_rate": 0.00015944071298788564, + "loss": 1.0108, + "step": 19566 + }, + { + "epoch": 0.5024256926022393, + "grad_norm": 0.73046875, + "learning_rate": 0.00015943712297719119, + "loss": 0.8015, + "step": 19567 + }, + { + "epoch": 0.5024513697981612, + "grad_norm": 0.73828125, + "learning_rate": 0.0001594335328480439, + "loss": 0.8995, + "step": 19568 + }, + { + "epoch": 0.502477046994083, + "grad_norm": 0.80078125, + "learning_rate": 0.00015942994260045092, + "loss": 0.9499, + "step": 19569 + }, + { + "epoch": 0.5025027241900049, + "grad_norm": 0.7421875, + "learning_rate": 0.00015942635223441945, + "loss": 1.0206, + "step": 19570 + }, + { + "epoch": 0.5025284013859267, + "grad_norm": 0.75, + "learning_rate": 0.00015942276174995662, + "loss": 0.9331, + "step": 19571 + }, + { + "epoch": 0.5025540785818484, + "grad_norm": 0.78125, + "learning_rate": 0.0001594191711470696, + "loss": 0.8511, + "step": 19572 + }, + { + "epoch": 0.5025797557777703, + "grad_norm": 0.79296875, + "learning_rate": 0.00015941558042576555, + "loss": 0.8885, + "step": 19573 + }, + { + "epoch": 0.5026054329736921, + "grad_norm": 0.79296875, + "learning_rate": 0.00015941198958605162, + "loss": 1.017, + "step": 19574 + }, + { + "epoch": 0.5026311101696139, + "grad_norm": 0.7578125, + "learning_rate": 0.00015940839862793493, + "loss": 0.8626, + "step": 19575 + }, + { + "epoch": 0.5026567873655358, + "grad_norm": 0.81640625, + "learning_rate": 0.00015940480755142269, + "loss": 1.0084, + "step": 19576 + }, + { + "epoch": 0.5026824645614576, + "grad_norm": 0.71484375, + "learning_rate": 0.00015940121635652204, + "loss": 0.9427, + "step": 19577 + }, + { + "epoch": 0.5027081417573794, + "grad_norm": 0.8203125, + "learning_rate": 0.00015939762504324013, + "loss": 0.8395, + "step": 19578 + }, + { + "epoch": 0.5027338189533012, + "grad_norm": 0.77734375, + "learning_rate": 0.0001593940336115841, + "loss": 0.8066, + "step": 19579 + }, + { + "epoch": 0.502759496149223, + "grad_norm": 0.8125, + "learning_rate": 0.0001593904420615611, + "loss": 0.8563, + "step": 19580 + }, + { + "epoch": 0.5027851733451448, + "grad_norm": 0.828125, + "learning_rate": 0.0001593868503931784, + "loss": 0.8891, + "step": 19581 + }, + { + "epoch": 0.5028108505410667, + "grad_norm": 0.796875, + "learning_rate": 0.00015938325860644298, + "loss": 1.0165, + "step": 19582 + }, + { + "epoch": 0.5028365277369885, + "grad_norm": 0.78125, + "learning_rate": 0.00015937966670136216, + "loss": 0.9513, + "step": 19583 + }, + { + "epoch": 0.5028622049329103, + "grad_norm": 0.78125, + "learning_rate": 0.000159376074677943, + "loss": 0.9154, + "step": 19584 + }, + { + "epoch": 0.5028878821288322, + "grad_norm": 0.76953125, + "learning_rate": 0.0001593724825361927, + "loss": 0.8764, + "step": 19585 + }, + { + "epoch": 0.5029135593247539, + "grad_norm": 0.83984375, + "learning_rate": 0.00015936889027611835, + "loss": 0.8146, + "step": 19586 + }, + { + "epoch": 0.5029392365206757, + "grad_norm": 0.73828125, + "learning_rate": 0.00015936529789772723, + "loss": 0.8492, + "step": 19587 + }, + { + "epoch": 0.5029649137165976, + "grad_norm": 0.8515625, + "learning_rate": 0.0001593617054010264, + "loss": 0.8696, + "step": 19588 + }, + { + "epoch": 0.5029905909125194, + "grad_norm": 0.77734375, + "learning_rate": 0.00015935811278602305, + "loss": 0.9395, + "step": 19589 + }, + { + "epoch": 0.5030162681084412, + "grad_norm": 0.80859375, + "learning_rate": 0.00015935452005272436, + "loss": 0.878, + "step": 19590 + }, + { + "epoch": 0.5030419453043631, + "grad_norm": 0.71484375, + "learning_rate": 0.00015935092720113748, + "loss": 0.7564, + "step": 19591 + }, + { + "epoch": 0.5030676225002848, + "grad_norm": 0.73046875, + "learning_rate": 0.00015934733423126956, + "loss": 0.8285, + "step": 19592 + }, + { + "epoch": 0.5030932996962066, + "grad_norm": 0.73046875, + "learning_rate": 0.00015934374114312774, + "loss": 0.7748, + "step": 19593 + }, + { + "epoch": 0.5031189768921285, + "grad_norm": 0.796875, + "learning_rate": 0.00015934014793671922, + "loss": 0.9577, + "step": 19594 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 0.73046875, + "learning_rate": 0.00015933655461205116, + "loss": 0.9364, + "step": 19595 + }, + { + "epoch": 0.5031703312839721, + "grad_norm": 0.83203125, + "learning_rate": 0.00015933296116913068, + "loss": 0.93, + "step": 19596 + }, + { + "epoch": 0.503196008479894, + "grad_norm": 0.78125, + "learning_rate": 0.00015932936760796497, + "loss": 0.8527, + "step": 19597 + }, + { + "epoch": 0.5032216856758158, + "grad_norm": 0.76953125, + "learning_rate": 0.0001593257739285612, + "loss": 0.8355, + "step": 19598 + }, + { + "epoch": 0.5032473628717375, + "grad_norm": 0.77734375, + "learning_rate": 0.00015932218013092652, + "loss": 0.8497, + "step": 19599 + }, + { + "epoch": 0.5032730400676594, + "grad_norm": 0.74609375, + "learning_rate": 0.00015931858621506811, + "loss": 0.9504, + "step": 19600 + }, + { + "epoch": 0.5032987172635812, + "grad_norm": 0.77734375, + "learning_rate": 0.0001593149921809931, + "loss": 0.9594, + "step": 19601 + }, + { + "epoch": 0.503324394459503, + "grad_norm": 0.8359375, + "learning_rate": 0.00015931139802870866, + "loss": 0.8715, + "step": 19602 + }, + { + "epoch": 0.5033500716554249, + "grad_norm": 0.78515625, + "learning_rate": 0.00015930780375822198, + "loss": 1.1357, + "step": 19603 + }, + { + "epoch": 0.5033757488513467, + "grad_norm": 0.7109375, + "learning_rate": 0.00015930420936954018, + "loss": 0.9472, + "step": 19604 + }, + { + "epoch": 0.5034014260472686, + "grad_norm": 0.8125, + "learning_rate": 0.00015930061486267046, + "loss": 1.072, + "step": 19605 + }, + { + "epoch": 0.5034271032431903, + "grad_norm": 0.69921875, + "learning_rate": 0.00015929702023761996, + "loss": 0.7264, + "step": 19606 + }, + { + "epoch": 0.5034527804391121, + "grad_norm": 0.7421875, + "learning_rate": 0.0001592934254943959, + "loss": 0.9164, + "step": 19607 + }, + { + "epoch": 0.503478457635034, + "grad_norm": 0.8046875, + "learning_rate": 0.00015928983063300537, + "loss": 0.9258, + "step": 19608 + }, + { + "epoch": 0.5035041348309558, + "grad_norm": 0.79296875, + "learning_rate": 0.00015928623565345556, + "loss": 0.8658, + "step": 19609 + }, + { + "epoch": 0.5035298120268776, + "grad_norm": 0.859375, + "learning_rate": 0.0001592826405557536, + "loss": 0.8806, + "step": 19610 + }, + { + "epoch": 0.5035554892227995, + "grad_norm": 0.80859375, + "learning_rate": 0.00015927904533990676, + "loss": 0.9327, + "step": 19611 + }, + { + "epoch": 0.5035811664187212, + "grad_norm": 0.80859375, + "learning_rate": 0.0001592754500059221, + "loss": 0.9364, + "step": 19612 + }, + { + "epoch": 0.503606843614643, + "grad_norm": 0.7421875, + "learning_rate": 0.00015927185455380681, + "loss": 0.8504, + "step": 19613 + }, + { + "epoch": 0.5036325208105649, + "grad_norm": 0.75, + "learning_rate": 0.00015926825898356813, + "loss": 0.7766, + "step": 19614 + }, + { + "epoch": 0.5036581980064867, + "grad_norm": 0.77734375, + "learning_rate": 0.0001592646632952131, + "loss": 1.1258, + "step": 19615 + }, + { + "epoch": 0.5036838752024085, + "grad_norm": 0.69140625, + "learning_rate": 0.00015926106748874897, + "loss": 0.9469, + "step": 19616 + }, + { + "epoch": 0.5037095523983304, + "grad_norm": 0.70703125, + "learning_rate": 0.00015925747156418288, + "loss": 0.9018, + "step": 19617 + }, + { + "epoch": 0.5037352295942522, + "grad_norm": 0.7578125, + "learning_rate": 0.000159253875521522, + "loss": 0.8394, + "step": 19618 + }, + { + "epoch": 0.5037609067901739, + "grad_norm": 0.75390625, + "learning_rate": 0.00015925027936077352, + "loss": 1.0163, + "step": 19619 + }, + { + "epoch": 0.5037865839860958, + "grad_norm": 0.703125, + "learning_rate": 0.00015924668308194456, + "loss": 0.7848, + "step": 19620 + }, + { + "epoch": 0.5038122611820176, + "grad_norm": 0.78125, + "learning_rate": 0.00015924308668504233, + "loss": 0.8342, + "step": 19621 + }, + { + "epoch": 0.5038379383779394, + "grad_norm": 0.7265625, + "learning_rate": 0.00015923949017007398, + "loss": 0.8494, + "step": 19622 + }, + { + "epoch": 0.5038636155738613, + "grad_norm": 0.80078125, + "learning_rate": 0.00015923589353704668, + "loss": 0.8953, + "step": 19623 + }, + { + "epoch": 0.5038892927697831, + "grad_norm": 0.73046875, + "learning_rate": 0.00015923229678596758, + "loss": 0.8324, + "step": 19624 + }, + { + "epoch": 0.5039149699657048, + "grad_norm": 0.7890625, + "learning_rate": 0.00015922869991684386, + "loss": 0.8378, + "step": 19625 + }, + { + "epoch": 0.5039406471616267, + "grad_norm": 0.7578125, + "learning_rate": 0.0001592251029296827, + "loss": 0.8893, + "step": 19626 + }, + { + "epoch": 0.5039663243575485, + "grad_norm": 0.80859375, + "learning_rate": 0.00015922150582449126, + "loss": 0.9535, + "step": 19627 + }, + { + "epoch": 0.5039920015534703, + "grad_norm": 0.86328125, + "learning_rate": 0.00015921790860127673, + "loss": 0.9338, + "step": 19628 + }, + { + "epoch": 0.5040176787493922, + "grad_norm": 0.7890625, + "learning_rate": 0.00015921431126004622, + "loss": 0.8443, + "step": 19629 + }, + { + "epoch": 0.504043355945314, + "grad_norm": 0.80078125, + "learning_rate": 0.00015921071380080694, + "loss": 1.0514, + "step": 19630 + }, + { + "epoch": 0.5040690331412359, + "grad_norm": 0.828125, + "learning_rate": 0.00015920711622356609, + "loss": 0.8387, + "step": 19631 + }, + { + "epoch": 0.5040947103371576, + "grad_norm": 0.765625, + "learning_rate": 0.00015920351852833075, + "loss": 0.8265, + "step": 19632 + }, + { + "epoch": 0.5041203875330794, + "grad_norm": 0.79296875, + "learning_rate": 0.00015919992071510822, + "loss": 0.8511, + "step": 19633 + }, + { + "epoch": 0.5041460647290013, + "grad_norm": 0.87109375, + "learning_rate": 0.00015919632278390555, + "loss": 0.8794, + "step": 19634 + }, + { + "epoch": 0.5041717419249231, + "grad_norm": 0.7578125, + "learning_rate": 0.00015919272473472994, + "loss": 0.7874, + "step": 19635 + }, + { + "epoch": 0.5041974191208449, + "grad_norm": 0.72265625, + "learning_rate": 0.0001591891265675886, + "loss": 1.0227, + "step": 19636 + }, + { + "epoch": 0.5042230963167668, + "grad_norm": 0.75390625, + "learning_rate": 0.00015918552828248867, + "loss": 0.8258, + "step": 19637 + }, + { + "epoch": 0.5042487735126886, + "grad_norm": 0.7734375, + "learning_rate": 0.00015918192987943733, + "loss": 0.925, + "step": 19638 + }, + { + "epoch": 0.5042744507086103, + "grad_norm": 0.83203125, + "learning_rate": 0.00015917833135844177, + "loss": 0.8018, + "step": 19639 + }, + { + "epoch": 0.5043001279045322, + "grad_norm": 0.8125, + "learning_rate": 0.0001591747327195091, + "loss": 0.883, + "step": 19640 + }, + { + "epoch": 0.504325805100454, + "grad_norm": 0.76171875, + "learning_rate": 0.00015917113396264658, + "loss": 0.924, + "step": 19641 + }, + { + "epoch": 0.5043514822963758, + "grad_norm": 0.8125, + "learning_rate": 0.0001591675350878613, + "loss": 0.7788, + "step": 19642 + }, + { + "epoch": 0.5043771594922977, + "grad_norm": 0.78125, + "learning_rate": 0.00015916393609516046, + "loss": 0.8082, + "step": 19643 + }, + { + "epoch": 0.5044028366882195, + "grad_norm": 0.80078125, + "learning_rate": 0.00015916033698455125, + "loss": 0.8971, + "step": 19644 + }, + { + "epoch": 0.5044285138841412, + "grad_norm": 0.75, + "learning_rate": 0.00015915673775604086, + "loss": 0.9235, + "step": 19645 + }, + { + "epoch": 0.5044541910800631, + "grad_norm": 0.76953125, + "learning_rate": 0.0001591531384096364, + "loss": 0.8672, + "step": 19646 + }, + { + "epoch": 0.5044798682759849, + "grad_norm": 0.72265625, + "learning_rate": 0.0001591495389453451, + "loss": 0.883, + "step": 19647 + }, + { + "epoch": 0.5045055454719067, + "grad_norm": 0.76171875, + "learning_rate": 0.0001591459393631741, + "loss": 0.9217, + "step": 19648 + }, + { + "epoch": 0.5045312226678286, + "grad_norm": 0.8125, + "learning_rate": 0.00015914233966313058, + "loss": 1.0045, + "step": 19649 + }, + { + "epoch": 0.5045568998637504, + "grad_norm": 0.7890625, + "learning_rate": 0.00015913873984522176, + "loss": 1.0272, + "step": 19650 + }, + { + "epoch": 0.5045825770596722, + "grad_norm": 0.79296875, + "learning_rate": 0.00015913513990945474, + "loss": 0.8388, + "step": 19651 + }, + { + "epoch": 0.504608254255594, + "grad_norm": 0.78515625, + "learning_rate": 0.0001591315398558367, + "loss": 0.8416, + "step": 19652 + }, + { + "epoch": 0.5046339314515158, + "grad_norm": 0.734375, + "learning_rate": 0.00015912793968437492, + "loss": 0.9213, + "step": 19653 + }, + { + "epoch": 0.5046596086474376, + "grad_norm": 0.98046875, + "learning_rate": 0.00015912433939507646, + "loss": 1.0575, + "step": 19654 + }, + { + "epoch": 0.5046852858433595, + "grad_norm": 0.765625, + "learning_rate": 0.0001591207389879485, + "loss": 0.8532, + "step": 19655 + }, + { + "epoch": 0.5047109630392813, + "grad_norm": 0.8203125, + "learning_rate": 0.00015911713846299827, + "loss": 1.0256, + "step": 19656 + }, + { + "epoch": 0.5047366402352031, + "grad_norm": 0.7421875, + "learning_rate": 0.0001591135378202329, + "loss": 1.0777, + "step": 19657 + }, + { + "epoch": 0.504762317431125, + "grad_norm": 0.796875, + "learning_rate": 0.00015910993705965963, + "loss": 0.8674, + "step": 19658 + }, + { + "epoch": 0.5047879946270467, + "grad_norm": 0.71484375, + "learning_rate": 0.00015910633618128559, + "loss": 0.758, + "step": 19659 + }, + { + "epoch": 0.5048136718229685, + "grad_norm": 0.828125, + "learning_rate": 0.00015910273518511794, + "loss": 0.829, + "step": 19660 + }, + { + "epoch": 0.5048393490188904, + "grad_norm": 0.71484375, + "learning_rate": 0.00015909913407116387, + "loss": 0.7912, + "step": 19661 + }, + { + "epoch": 0.5048650262148122, + "grad_norm": 0.76171875, + "learning_rate": 0.00015909553283943059, + "loss": 0.7748, + "step": 19662 + }, + { + "epoch": 0.504890703410734, + "grad_norm": 0.734375, + "learning_rate": 0.00015909193148992524, + "loss": 0.8781, + "step": 19663 + }, + { + "epoch": 0.5049163806066559, + "grad_norm": 0.76953125, + "learning_rate": 0.00015908833002265503, + "loss": 0.9778, + "step": 19664 + }, + { + "epoch": 0.5049420578025776, + "grad_norm": 0.72265625, + "learning_rate": 0.00015908472843762707, + "loss": 0.9144, + "step": 19665 + }, + { + "epoch": 0.5049677349984995, + "grad_norm": 0.7890625, + "learning_rate": 0.00015908112673484863, + "loss": 0.9038, + "step": 19666 + }, + { + "epoch": 0.5049934121944213, + "grad_norm": 0.765625, + "learning_rate": 0.0001590775249143268, + "loss": 0.9736, + "step": 19667 + }, + { + "epoch": 0.5050190893903431, + "grad_norm": 0.796875, + "learning_rate": 0.00015907392297606884, + "loss": 0.7953, + "step": 19668 + }, + { + "epoch": 0.505044766586265, + "grad_norm": 0.77734375, + "learning_rate": 0.00015907032092008186, + "loss": 1.0116, + "step": 19669 + }, + { + "epoch": 0.5050704437821868, + "grad_norm": 0.8671875, + "learning_rate": 0.00015906671874637307, + "loss": 0.9601, + "step": 19670 + }, + { + "epoch": 0.5050961209781086, + "grad_norm": 0.8515625, + "learning_rate": 0.00015906311645494964, + "loss": 0.9269, + "step": 19671 + }, + { + "epoch": 0.5051217981740304, + "grad_norm": 0.82421875, + "learning_rate": 0.00015905951404581877, + "loss": 0.8846, + "step": 19672 + }, + { + "epoch": 0.5051474753699522, + "grad_norm": 0.953125, + "learning_rate": 0.0001590559115189876, + "loss": 0.8974, + "step": 19673 + }, + { + "epoch": 0.505173152565874, + "grad_norm": 0.71875, + "learning_rate": 0.00015905230887446335, + "loss": 0.8501, + "step": 19674 + }, + { + "epoch": 0.5051988297617959, + "grad_norm": 0.73828125, + "learning_rate": 0.0001590487061122532, + "loss": 0.765, + "step": 19675 + }, + { + "epoch": 0.5052245069577177, + "grad_norm": 0.79296875, + "learning_rate": 0.00015904510323236427, + "loss": 0.8128, + "step": 19676 + }, + { + "epoch": 0.5052501841536395, + "grad_norm": 0.765625, + "learning_rate": 0.0001590415002348038, + "loss": 0.8343, + "step": 19677 + }, + { + "epoch": 0.5052758613495614, + "grad_norm": 0.7265625, + "learning_rate": 0.00015903789711957898, + "loss": 0.9539, + "step": 19678 + }, + { + "epoch": 0.5053015385454831, + "grad_norm": 0.80078125, + "learning_rate": 0.00015903429388669694, + "loss": 0.9282, + "step": 19679 + }, + { + "epoch": 0.5053272157414049, + "grad_norm": 0.77734375, + "learning_rate": 0.00015903069053616487, + "loss": 0.7323, + "step": 19680 + }, + { + "epoch": 0.5053528929373268, + "grad_norm": 0.86328125, + "learning_rate": 0.00015902708706799, + "loss": 1.0715, + "step": 19681 + }, + { + "epoch": 0.5053785701332486, + "grad_norm": 0.76171875, + "learning_rate": 0.00015902348348217945, + "loss": 0.8084, + "step": 19682 + }, + { + "epoch": 0.5054042473291704, + "grad_norm": 0.70703125, + "learning_rate": 0.00015901987977874046, + "loss": 0.9386, + "step": 19683 + }, + { + "epoch": 0.5054299245250923, + "grad_norm": 0.78125, + "learning_rate": 0.00015901627595768015, + "loss": 0.9136, + "step": 19684 + }, + { + "epoch": 0.505455601721014, + "grad_norm": 0.796875, + "learning_rate": 0.00015901267201900575, + "loss": 0.9184, + "step": 19685 + }, + { + "epoch": 0.5054812789169358, + "grad_norm": 0.75390625, + "learning_rate": 0.00015900906796272445, + "loss": 0.7956, + "step": 19686 + }, + { + "epoch": 0.5055069561128577, + "grad_norm": 0.79296875, + "learning_rate": 0.00015900546378884336, + "loss": 0.8756, + "step": 19687 + }, + { + "epoch": 0.5055326333087795, + "grad_norm": 0.77734375, + "learning_rate": 0.0001590018594973697, + "loss": 0.8969, + "step": 19688 + }, + { + "epoch": 0.5055583105047013, + "grad_norm": 0.71875, + "learning_rate": 0.00015899825508831073, + "loss": 0.7725, + "step": 19689 + }, + { + "epoch": 0.5055839877006232, + "grad_norm": 0.8359375, + "learning_rate": 0.00015899465056167352, + "loss": 1.0612, + "step": 19690 + }, + { + "epoch": 0.505609664896545, + "grad_norm": 0.74609375, + "learning_rate": 0.0001589910459174653, + "loss": 0.8229, + "step": 19691 + }, + { + "epoch": 0.5056353420924667, + "grad_norm": 0.74609375, + "learning_rate": 0.00015898744115569325, + "loss": 0.8448, + "step": 19692 + }, + { + "epoch": 0.5056610192883886, + "grad_norm": 0.828125, + "learning_rate": 0.00015898383627636458, + "loss": 0.9897, + "step": 19693 + }, + { + "epoch": 0.5056866964843104, + "grad_norm": 0.73046875, + "learning_rate": 0.00015898023127948645, + "loss": 0.8275, + "step": 19694 + }, + { + "epoch": 0.5057123736802323, + "grad_norm": 0.8984375, + "learning_rate": 0.00015897662616506602, + "loss": 0.9379, + "step": 19695 + }, + { + "epoch": 0.5057380508761541, + "grad_norm": 0.75390625, + "learning_rate": 0.00015897302093311052, + "loss": 0.7783, + "step": 19696 + }, + { + "epoch": 0.5057637280720759, + "grad_norm": 0.7265625, + "learning_rate": 0.00015896941558362712, + "loss": 0.7836, + "step": 19697 + }, + { + "epoch": 0.5057894052679978, + "grad_norm": 0.7421875, + "learning_rate": 0.00015896581011662302, + "loss": 0.8902, + "step": 19698 + }, + { + "epoch": 0.5058150824639195, + "grad_norm": 0.74609375, + "learning_rate": 0.00015896220453210535, + "loss": 0.7569, + "step": 19699 + }, + { + "epoch": 0.5058407596598413, + "grad_norm": 0.765625, + "learning_rate": 0.00015895859883008137, + "loss": 0.9877, + "step": 19700 + }, + { + "epoch": 0.5058664368557632, + "grad_norm": 0.80078125, + "learning_rate": 0.0001589549930105582, + "loss": 0.9875, + "step": 19701 + }, + { + "epoch": 0.505892114051685, + "grad_norm": 0.734375, + "learning_rate": 0.00015895138707354306, + "loss": 1.006, + "step": 19702 + }, + { + "epoch": 0.5059177912476068, + "grad_norm": 0.75, + "learning_rate": 0.00015894778101904313, + "loss": 0.7765, + "step": 19703 + }, + { + "epoch": 0.5059434684435287, + "grad_norm": 0.79296875, + "learning_rate": 0.0001589441748470656, + "loss": 0.8796, + "step": 19704 + }, + { + "epoch": 0.5059691456394504, + "grad_norm": 0.828125, + "learning_rate": 0.00015894056855761765, + "loss": 0.8918, + "step": 19705 + }, + { + "epoch": 0.5059948228353722, + "grad_norm": 0.7734375, + "learning_rate": 0.00015893696215070645, + "loss": 0.8387, + "step": 19706 + }, + { + "epoch": 0.5060205000312941, + "grad_norm": 0.83203125, + "learning_rate": 0.00015893335562633927, + "loss": 1.0203, + "step": 19707 + }, + { + "epoch": 0.5060461772272159, + "grad_norm": 0.80078125, + "learning_rate": 0.00015892974898452317, + "loss": 0.9474, + "step": 19708 + }, + { + "epoch": 0.5060718544231377, + "grad_norm": 0.8046875, + "learning_rate": 0.00015892614222526544, + "loss": 0.8321, + "step": 19709 + }, + { + "epoch": 0.5060975316190596, + "grad_norm": 0.76953125, + "learning_rate": 0.00015892253534857323, + "loss": 0.9837, + "step": 19710 + }, + { + "epoch": 0.5061232088149814, + "grad_norm": 0.8125, + "learning_rate": 0.0001589189283544537, + "loss": 1.0555, + "step": 19711 + }, + { + "epoch": 0.5061488860109031, + "grad_norm": 0.73046875, + "learning_rate": 0.0001589153212429141, + "loss": 0.8536, + "step": 19712 + }, + { + "epoch": 0.506174563206825, + "grad_norm": 0.79296875, + "learning_rate": 0.0001589117140139616, + "loss": 0.9621, + "step": 19713 + }, + { + "epoch": 0.5062002404027468, + "grad_norm": 0.73046875, + "learning_rate": 0.00015890810666760333, + "loss": 0.8254, + "step": 19714 + }, + { + "epoch": 0.5062259175986686, + "grad_norm": 0.76171875, + "learning_rate": 0.00015890449920384655, + "loss": 0.9255, + "step": 19715 + }, + { + "epoch": 0.5062515947945905, + "grad_norm": 0.87890625, + "learning_rate": 0.0001589008916226984, + "loss": 0.8901, + "step": 19716 + }, + { + "epoch": 0.5062772719905123, + "grad_norm": 0.73046875, + "learning_rate": 0.00015889728392416613, + "loss": 0.8581, + "step": 19717 + }, + { + "epoch": 0.5063029491864341, + "grad_norm": 0.84375, + "learning_rate": 0.00015889367610825685, + "loss": 0.9976, + "step": 19718 + }, + { + "epoch": 0.5063286263823559, + "grad_norm": 0.78515625, + "learning_rate": 0.00015889006817497783, + "loss": 0.8941, + "step": 19719 + }, + { + "epoch": 0.5063543035782777, + "grad_norm": 0.8125, + "learning_rate": 0.00015888646012433624, + "loss": 0.9126, + "step": 19720 + }, + { + "epoch": 0.5063799807741995, + "grad_norm": 0.78515625, + "learning_rate": 0.00015888285195633922, + "loss": 0.8972, + "step": 19721 + }, + { + "epoch": 0.5064056579701214, + "grad_norm": 0.7578125, + "learning_rate": 0.000158879243670994, + "loss": 0.8239, + "step": 19722 + }, + { + "epoch": 0.5064313351660432, + "grad_norm": 0.87890625, + "learning_rate": 0.00015887563526830777, + "loss": 1.0193, + "step": 19723 + }, + { + "epoch": 0.506457012361965, + "grad_norm": 0.82421875, + "learning_rate": 0.00015887202674828773, + "loss": 0.9249, + "step": 19724 + }, + { + "epoch": 0.5064826895578868, + "grad_norm": 0.828125, + "learning_rate": 0.00015886841811094105, + "loss": 0.851, + "step": 19725 + }, + { + "epoch": 0.5065083667538086, + "grad_norm": 0.78125, + "learning_rate": 0.00015886480935627491, + "loss": 0.9042, + "step": 19726 + }, + { + "epoch": 0.5065340439497305, + "grad_norm": 0.83203125, + "learning_rate": 0.00015886120048429653, + "loss": 0.9192, + "step": 19727 + }, + { + "epoch": 0.5065597211456523, + "grad_norm": 0.8046875, + "learning_rate": 0.00015885759149501309, + "loss": 0.8427, + "step": 19728 + }, + { + "epoch": 0.5065853983415741, + "grad_norm": 0.83203125, + "learning_rate": 0.0001588539823884318, + "loss": 0.848, + "step": 19729 + }, + { + "epoch": 0.506611075537496, + "grad_norm": 0.78125, + "learning_rate": 0.00015885037316455984, + "loss": 0.8733, + "step": 19730 + }, + { + "epoch": 0.5066367527334178, + "grad_norm": 0.7421875, + "learning_rate": 0.0001588467638234044, + "loss": 0.8913, + "step": 19731 + }, + { + "epoch": 0.5066624299293395, + "grad_norm": 0.72265625, + "learning_rate": 0.0001588431543649727, + "loss": 0.889, + "step": 19732 + }, + { + "epoch": 0.5066881071252614, + "grad_norm": 0.8828125, + "learning_rate": 0.00015883954478927188, + "loss": 0.9985, + "step": 19733 + }, + { + "epoch": 0.5067137843211832, + "grad_norm": 0.8203125, + "learning_rate": 0.00015883593509630916, + "loss": 0.947, + "step": 19734 + }, + { + "epoch": 0.506739461517105, + "grad_norm": 0.78125, + "learning_rate": 0.00015883232528609174, + "loss": 0.8555, + "step": 19735 + }, + { + "epoch": 0.5067651387130269, + "grad_norm": 0.76953125, + "learning_rate": 0.0001588287153586268, + "loss": 0.7639, + "step": 19736 + }, + { + "epoch": 0.5067908159089487, + "grad_norm": 0.75, + "learning_rate": 0.00015882510531392154, + "loss": 0.8848, + "step": 19737 + }, + { + "epoch": 0.5068164931048705, + "grad_norm": 0.75, + "learning_rate": 0.00015882149515198318, + "loss": 0.814, + "step": 19738 + }, + { + "epoch": 0.5068421703007923, + "grad_norm": 0.75, + "learning_rate": 0.0001588178848728189, + "loss": 0.7984, + "step": 19739 + }, + { + "epoch": 0.5068678474967141, + "grad_norm": 0.79296875, + "learning_rate": 0.0001588142744764359, + "loss": 0.8724, + "step": 19740 + }, + { + "epoch": 0.5068935246926359, + "grad_norm": 0.79296875, + "learning_rate": 0.0001588106639628413, + "loss": 0.8547, + "step": 19741 + }, + { + "epoch": 0.5069192018885578, + "grad_norm": 0.80078125, + "learning_rate": 0.0001588070533320424, + "loss": 0.9933, + "step": 19742 + }, + { + "epoch": 0.5069448790844796, + "grad_norm": 0.8125, + "learning_rate": 0.00015880344258404637, + "loss": 1.0168, + "step": 19743 + }, + { + "epoch": 0.5069705562804014, + "grad_norm": 0.76953125, + "learning_rate": 0.00015879983171886036, + "loss": 0.9201, + "step": 19744 + }, + { + "epoch": 0.5069962334763232, + "grad_norm": 0.78515625, + "learning_rate": 0.00015879622073649164, + "loss": 0.9386, + "step": 19745 + }, + { + "epoch": 0.507021910672245, + "grad_norm": 0.80078125, + "learning_rate": 0.00015879260963694734, + "loss": 0.8658, + "step": 19746 + }, + { + "epoch": 0.5070475878681668, + "grad_norm": 0.765625, + "learning_rate": 0.00015878899842023466, + "loss": 0.8742, + "step": 19747 + }, + { + "epoch": 0.5070732650640887, + "grad_norm": 0.76171875, + "learning_rate": 0.00015878538708636084, + "loss": 0.7594, + "step": 19748 + }, + { + "epoch": 0.5070989422600105, + "grad_norm": 0.73828125, + "learning_rate": 0.00015878177563533302, + "loss": 0.8118, + "step": 19749 + }, + { + "epoch": 0.5071246194559323, + "grad_norm": 0.83203125, + "learning_rate": 0.00015877816406715848, + "loss": 0.8535, + "step": 19750 + }, + { + "epoch": 0.5071502966518542, + "grad_norm": 0.7578125, + "learning_rate": 0.00015877455238184432, + "loss": 0.885, + "step": 19751 + }, + { + "epoch": 0.5071759738477759, + "grad_norm": 0.70703125, + "learning_rate": 0.00015877094057939784, + "loss": 0.7393, + "step": 19752 + }, + { + "epoch": 0.5072016510436977, + "grad_norm": 0.77734375, + "learning_rate": 0.00015876732865982614, + "loss": 0.7635, + "step": 19753 + }, + { + "epoch": 0.5072273282396196, + "grad_norm": 0.76953125, + "learning_rate": 0.00015876371662313648, + "loss": 0.8568, + "step": 19754 + }, + { + "epoch": 0.5072530054355414, + "grad_norm": 0.71484375, + "learning_rate": 0.00015876010446933602, + "loss": 0.9677, + "step": 19755 + }, + { + "epoch": 0.5072786826314633, + "grad_norm": 0.7578125, + "learning_rate": 0.00015875649219843202, + "loss": 0.8384, + "step": 19756 + }, + { + "epoch": 0.5073043598273851, + "grad_norm": 0.83984375, + "learning_rate": 0.00015875287981043158, + "loss": 0.9676, + "step": 19757 + }, + { + "epoch": 0.5073300370233069, + "grad_norm": 0.77734375, + "learning_rate": 0.00015874926730534203, + "loss": 0.8633, + "step": 19758 + }, + { + "epoch": 0.5073557142192286, + "grad_norm": 0.83984375, + "learning_rate": 0.00015874565468317045, + "loss": 0.8135, + "step": 19759 + }, + { + "epoch": 0.5073813914151505, + "grad_norm": 0.75, + "learning_rate": 0.0001587420419439241, + "loss": 0.8638, + "step": 19760 + }, + { + "epoch": 0.5074070686110723, + "grad_norm": 0.765625, + "learning_rate": 0.00015873842908761015, + "loss": 0.9123, + "step": 19761 + }, + { + "epoch": 0.5074327458069942, + "grad_norm": 0.73828125, + "learning_rate": 0.00015873481611423582, + "loss": 0.8154, + "step": 19762 + }, + { + "epoch": 0.507458423002916, + "grad_norm": 0.734375, + "learning_rate": 0.00015873120302380833, + "loss": 0.8569, + "step": 19763 + }, + { + "epoch": 0.5074841001988378, + "grad_norm": 0.75, + "learning_rate": 0.00015872758981633485, + "loss": 1.0976, + "step": 19764 + }, + { + "epoch": 0.5075097773947596, + "grad_norm": 0.7265625, + "learning_rate": 0.00015872397649182261, + "loss": 0.8183, + "step": 19765 + }, + { + "epoch": 0.5075354545906814, + "grad_norm": 0.75, + "learning_rate": 0.00015872036305027872, + "loss": 0.8621, + "step": 19766 + }, + { + "epoch": 0.5075611317866032, + "grad_norm": 0.82421875, + "learning_rate": 0.00015871674949171052, + "loss": 0.9297, + "step": 19767 + }, + { + "epoch": 0.5075868089825251, + "grad_norm": 0.7421875, + "learning_rate": 0.0001587131358161251, + "loss": 0.8492, + "step": 19768 + }, + { + "epoch": 0.5076124861784469, + "grad_norm": 0.71875, + "learning_rate": 0.00015870952202352972, + "loss": 0.9092, + "step": 19769 + }, + { + "epoch": 0.5076381633743687, + "grad_norm": 0.76953125, + "learning_rate": 0.00015870590811393155, + "loss": 0.803, + "step": 19770 + }, + { + "epoch": 0.5076638405702906, + "grad_norm": 0.890625, + "learning_rate": 0.00015870229408733786, + "loss": 0.8297, + "step": 19771 + }, + { + "epoch": 0.5076895177662123, + "grad_norm": 0.8203125, + "learning_rate": 0.00015869867994375574, + "loss": 0.8531, + "step": 19772 + }, + { + "epoch": 0.5077151949621341, + "grad_norm": 0.69921875, + "learning_rate": 0.00015869506568319248, + "loss": 0.7561, + "step": 19773 + }, + { + "epoch": 0.507740872158056, + "grad_norm": 0.7890625, + "learning_rate": 0.00015869145130565526, + "loss": 1.0773, + "step": 19774 + }, + { + "epoch": 0.5077665493539778, + "grad_norm": 0.74609375, + "learning_rate": 0.00015868783681115124, + "loss": 0.7801, + "step": 19775 + }, + { + "epoch": 0.5077922265498996, + "grad_norm": 0.828125, + "learning_rate": 0.00015868422219968771, + "loss": 0.9292, + "step": 19776 + }, + { + "epoch": 0.5078179037458215, + "grad_norm": 0.81640625, + "learning_rate": 0.0001586806074712718, + "loss": 0.8734, + "step": 19777 + }, + { + "epoch": 0.5078435809417433, + "grad_norm": 0.7734375, + "learning_rate": 0.00015867699262591076, + "loss": 0.9888, + "step": 19778 + }, + { + "epoch": 0.507869258137665, + "grad_norm": 0.734375, + "learning_rate": 0.00015867337766361177, + "loss": 0.8066, + "step": 19779 + }, + { + "epoch": 0.5078949353335869, + "grad_norm": 0.77734375, + "learning_rate": 0.00015866976258438204, + "loss": 0.8994, + "step": 19780 + }, + { + "epoch": 0.5079206125295087, + "grad_norm": 0.75390625, + "learning_rate": 0.00015866614738822875, + "loss": 0.8238, + "step": 19781 + }, + { + "epoch": 0.5079462897254305, + "grad_norm": 0.8046875, + "learning_rate": 0.00015866253207515914, + "loss": 0.8708, + "step": 19782 + }, + { + "epoch": 0.5079719669213524, + "grad_norm": 0.7734375, + "learning_rate": 0.0001586589166451804, + "loss": 0.8216, + "step": 19783 + }, + { + "epoch": 0.5079976441172742, + "grad_norm": 0.78125, + "learning_rate": 0.00015865530109829975, + "loss": 0.9339, + "step": 19784 + }, + { + "epoch": 0.5080233213131959, + "grad_norm": 0.75, + "learning_rate": 0.0001586516854345244, + "loss": 0.969, + "step": 19785 + }, + { + "epoch": 0.5080489985091178, + "grad_norm": 0.765625, + "learning_rate": 0.00015864806965386148, + "loss": 0.9038, + "step": 19786 + }, + { + "epoch": 0.5080746757050396, + "grad_norm": 0.75, + "learning_rate": 0.00015864445375631832, + "loss": 0.8471, + "step": 19787 + }, + { + "epoch": 0.5081003529009614, + "grad_norm": 0.76171875, + "learning_rate": 0.000158640837741902, + "loss": 0.8639, + "step": 19788 + }, + { + "epoch": 0.5081260300968833, + "grad_norm": 0.7109375, + "learning_rate": 0.00015863722161061982, + "loss": 0.8106, + "step": 19789 + }, + { + "epoch": 0.5081517072928051, + "grad_norm": 0.76171875, + "learning_rate": 0.00015863360536247896, + "loss": 0.925, + "step": 19790 + }, + { + "epoch": 0.508177384488727, + "grad_norm": 0.80078125, + "learning_rate": 0.0001586299889974866, + "loss": 0.945, + "step": 19791 + }, + { + "epoch": 0.5082030616846487, + "grad_norm": 0.79296875, + "learning_rate": 0.00015862637251565, + "loss": 0.9878, + "step": 19792 + }, + { + "epoch": 0.5082287388805705, + "grad_norm": 0.765625, + "learning_rate": 0.0001586227559169763, + "loss": 0.8763, + "step": 19793 + }, + { + "epoch": 0.5082544160764924, + "grad_norm": 0.828125, + "learning_rate": 0.00015861913920147278, + "loss": 0.8696, + "step": 19794 + }, + { + "epoch": 0.5082800932724142, + "grad_norm": 0.73828125, + "learning_rate": 0.00015861552236914655, + "loss": 0.8456, + "step": 19795 + }, + { + "epoch": 0.508305770468336, + "grad_norm": 0.85546875, + "learning_rate": 0.00015861190542000494, + "loss": 0.908, + "step": 19796 + }, + { + "epoch": 0.5083314476642579, + "grad_norm": 0.77734375, + "learning_rate": 0.00015860828835405507, + "loss": 1.0337, + "step": 19797 + }, + { + "epoch": 0.5083571248601797, + "grad_norm": 0.79296875, + "learning_rate": 0.0001586046711713042, + "loss": 0.8549, + "step": 19798 + }, + { + "epoch": 0.5083828020561014, + "grad_norm": 0.78515625, + "learning_rate": 0.0001586010538717595, + "loss": 0.9849, + "step": 19799 + }, + { + "epoch": 0.5084084792520233, + "grad_norm": 0.75390625, + "learning_rate": 0.00015859743645542818, + "loss": 0.7869, + "step": 19800 + }, + { + "epoch": 0.5084341564479451, + "grad_norm": 0.6796875, + "learning_rate": 0.00015859381892231747, + "loss": 0.9345, + "step": 19801 + }, + { + "epoch": 0.5084598336438669, + "grad_norm": 0.8046875, + "learning_rate": 0.00015859020127243455, + "loss": 0.9468, + "step": 19802 + }, + { + "epoch": 0.5084855108397888, + "grad_norm": 0.8671875, + "learning_rate": 0.0001585865835057867, + "loss": 0.8714, + "step": 19803 + }, + { + "epoch": 0.5085111880357106, + "grad_norm": 0.75, + "learning_rate": 0.00015858296562238104, + "loss": 0.9652, + "step": 19804 + }, + { + "epoch": 0.5085368652316323, + "grad_norm": 0.859375, + "learning_rate": 0.00015857934762222487, + "loss": 0.9456, + "step": 19805 + }, + { + "epoch": 0.5085625424275542, + "grad_norm": 0.6796875, + "learning_rate": 0.00015857572950532528, + "loss": 0.8556, + "step": 19806 + }, + { + "epoch": 0.508588219623476, + "grad_norm": 0.78125, + "learning_rate": 0.0001585721112716896, + "loss": 0.8001, + "step": 19807 + }, + { + "epoch": 0.5086138968193978, + "grad_norm": 0.76171875, + "learning_rate": 0.00015856849292132495, + "loss": 0.9251, + "step": 19808 + }, + { + "epoch": 0.5086395740153197, + "grad_norm": 0.8515625, + "learning_rate": 0.00015856487445423866, + "loss": 1.005, + "step": 19809 + }, + { + "epoch": 0.5086652512112415, + "grad_norm": 0.78125, + "learning_rate": 0.0001585612558704378, + "loss": 0.8441, + "step": 19810 + }, + { + "epoch": 0.5086909284071633, + "grad_norm": 0.796875, + "learning_rate": 0.00015855763716992967, + "loss": 0.8062, + "step": 19811 + }, + { + "epoch": 0.5087166056030851, + "grad_norm": 0.7734375, + "learning_rate": 0.00015855401835272146, + "loss": 0.8489, + "step": 19812 + }, + { + "epoch": 0.5087422827990069, + "grad_norm": 0.8046875, + "learning_rate": 0.0001585503994188204, + "loss": 0.7784, + "step": 19813 + }, + { + "epoch": 0.5087679599949287, + "grad_norm": 0.8515625, + "learning_rate": 0.00015854678036823362, + "loss": 0.9679, + "step": 19814 + }, + { + "epoch": 0.5087936371908506, + "grad_norm": 0.7265625, + "learning_rate": 0.00015854316120096846, + "loss": 0.8414, + "step": 19815 + }, + { + "epoch": 0.5088193143867724, + "grad_norm": 0.71484375, + "learning_rate": 0.00015853954191703205, + "loss": 0.9303, + "step": 19816 + }, + { + "epoch": 0.5088449915826943, + "grad_norm": 0.703125, + "learning_rate": 0.0001585359225164316, + "loss": 0.7868, + "step": 19817 + }, + { + "epoch": 0.5088706687786161, + "grad_norm": 0.76171875, + "learning_rate": 0.00015853230299917435, + "loss": 0.8637, + "step": 19818 + }, + { + "epoch": 0.5088963459745378, + "grad_norm": 0.7734375, + "learning_rate": 0.00015852868336526754, + "loss": 0.9117, + "step": 19819 + }, + { + "epoch": 0.5089220231704596, + "grad_norm": 0.78125, + "learning_rate": 0.0001585250636147183, + "loss": 0.7909, + "step": 19820 + }, + { + "epoch": 0.5089477003663815, + "grad_norm": 0.82421875, + "learning_rate": 0.00015852144374753393, + "loss": 0.8896, + "step": 19821 + }, + { + "epoch": 0.5089733775623033, + "grad_norm": 0.75, + "learning_rate": 0.0001585178237637216, + "loss": 0.8227, + "step": 19822 + }, + { + "epoch": 0.5089990547582252, + "grad_norm": 0.765625, + "learning_rate": 0.00015851420366328853, + "loss": 0.8623, + "step": 19823 + }, + { + "epoch": 0.509024731954147, + "grad_norm": 0.796875, + "learning_rate": 0.00015851058344624195, + "loss": 0.7859, + "step": 19824 + }, + { + "epoch": 0.5090504091500687, + "grad_norm": 0.765625, + "learning_rate": 0.00015850696311258905, + "loss": 0.9706, + "step": 19825 + }, + { + "epoch": 0.5090760863459906, + "grad_norm": 0.78515625, + "learning_rate": 0.00015850334266233706, + "loss": 0.939, + "step": 19826 + }, + { + "epoch": 0.5091017635419124, + "grad_norm": 0.8125, + "learning_rate": 0.00015849972209549317, + "loss": 0.8475, + "step": 19827 + }, + { + "epoch": 0.5091274407378342, + "grad_norm": 0.79296875, + "learning_rate": 0.00015849610141206467, + "loss": 0.9012, + "step": 19828 + }, + { + "epoch": 0.5091531179337561, + "grad_norm": 0.76171875, + "learning_rate": 0.00015849248061205868, + "loss": 0.8871, + "step": 19829 + }, + { + "epoch": 0.5091787951296779, + "grad_norm": 0.796875, + "learning_rate": 0.00015848885969548245, + "loss": 0.8405, + "step": 19830 + }, + { + "epoch": 0.5092044723255997, + "grad_norm": 0.76953125, + "learning_rate": 0.00015848523866234326, + "loss": 0.8531, + "step": 19831 + }, + { + "epoch": 0.5092301495215215, + "grad_norm": 0.84375, + "learning_rate": 0.00015848161751264825, + "loss": 1.0523, + "step": 19832 + }, + { + "epoch": 0.5092558267174433, + "grad_norm": 0.734375, + "learning_rate": 0.00015847799624640465, + "loss": 0.8398, + "step": 19833 + }, + { + "epoch": 0.5092815039133651, + "grad_norm": 0.76171875, + "learning_rate": 0.0001584743748636197, + "loss": 0.8274, + "step": 19834 + }, + { + "epoch": 0.509307181109287, + "grad_norm": 0.75, + "learning_rate": 0.00015847075336430057, + "loss": 0.8475, + "step": 19835 + }, + { + "epoch": 0.5093328583052088, + "grad_norm": 0.78125, + "learning_rate": 0.00015846713174845454, + "loss": 0.8213, + "step": 19836 + }, + { + "epoch": 0.5093585355011306, + "grad_norm": 0.7890625, + "learning_rate": 0.0001584635100160888, + "loss": 0.7849, + "step": 19837 + }, + { + "epoch": 0.5093842126970524, + "grad_norm": 0.71875, + "learning_rate": 0.00015845988816721052, + "loss": 0.8268, + "step": 19838 + }, + { + "epoch": 0.5094098898929742, + "grad_norm": 0.78125, + "learning_rate": 0.000158456266201827, + "loss": 0.873, + "step": 19839 + }, + { + "epoch": 0.509435567088896, + "grad_norm": 0.8046875, + "learning_rate": 0.00015845264411994543, + "loss": 0.9794, + "step": 19840 + }, + { + "epoch": 0.5094612442848179, + "grad_norm": 0.7734375, + "learning_rate": 0.00015844902192157302, + "loss": 0.8032, + "step": 19841 + }, + { + "epoch": 0.5094869214807397, + "grad_norm": 0.7734375, + "learning_rate": 0.00015844539960671697, + "loss": 0.823, + "step": 19842 + }, + { + "epoch": 0.5095125986766615, + "grad_norm": 0.8125, + "learning_rate": 0.00015844177717538454, + "loss": 0.8069, + "step": 19843 + }, + { + "epoch": 0.5095382758725834, + "grad_norm": 0.8203125, + "learning_rate": 0.0001584381546275829, + "loss": 1.0138, + "step": 19844 + }, + { + "epoch": 0.5095639530685051, + "grad_norm": 0.80078125, + "learning_rate": 0.0001584345319633193, + "loss": 0.8944, + "step": 19845 + }, + { + "epoch": 0.5095896302644269, + "grad_norm": 0.8203125, + "learning_rate": 0.00015843090918260096, + "loss": 0.7931, + "step": 19846 + }, + { + "epoch": 0.5096153074603488, + "grad_norm": 0.75390625, + "learning_rate": 0.00015842728628543513, + "loss": 1.0101, + "step": 19847 + }, + { + "epoch": 0.5096409846562706, + "grad_norm": 0.69140625, + "learning_rate": 0.00015842366327182896, + "loss": 0.8785, + "step": 19848 + }, + { + "epoch": 0.5096666618521924, + "grad_norm": 0.796875, + "learning_rate": 0.00015842004014178973, + "loss": 1.0017, + "step": 19849 + }, + { + "epoch": 0.5096923390481143, + "grad_norm": 0.76171875, + "learning_rate": 0.0001584164168953246, + "loss": 0.8373, + "step": 19850 + }, + { + "epoch": 0.5097180162440361, + "grad_norm": 0.79296875, + "learning_rate": 0.00015841279353244088, + "loss": 0.9645, + "step": 19851 + }, + { + "epoch": 0.5097436934399578, + "grad_norm": 0.68359375, + "learning_rate": 0.00015840917005314567, + "loss": 0.8105, + "step": 19852 + }, + { + "epoch": 0.5097693706358797, + "grad_norm": 0.82421875, + "learning_rate": 0.00015840554645744632, + "loss": 1.0051, + "step": 19853 + }, + { + "epoch": 0.5097950478318015, + "grad_norm": 0.69140625, + "learning_rate": 0.00015840192274534998, + "loss": 0.9125, + "step": 19854 + }, + { + "epoch": 0.5098207250277234, + "grad_norm": 0.75, + "learning_rate": 0.00015839829891686386, + "loss": 0.8721, + "step": 19855 + }, + { + "epoch": 0.5098464022236452, + "grad_norm": 0.703125, + "learning_rate": 0.00015839467497199522, + "loss": 0.7768, + "step": 19856 + }, + { + "epoch": 0.509872079419567, + "grad_norm": 0.81640625, + "learning_rate": 0.00015839105091075128, + "loss": 0.9598, + "step": 19857 + }, + { + "epoch": 0.5098977566154888, + "grad_norm": 0.7265625, + "learning_rate": 0.00015838742673313924, + "loss": 0.8345, + "step": 19858 + }, + { + "epoch": 0.5099234338114106, + "grad_norm": 0.7890625, + "learning_rate": 0.00015838380243916633, + "loss": 0.8979, + "step": 19859 + }, + { + "epoch": 0.5099491110073324, + "grad_norm": 0.83203125, + "learning_rate": 0.00015838017802883978, + "loss": 0.8897, + "step": 19860 + }, + { + "epoch": 0.5099747882032543, + "grad_norm": 0.859375, + "learning_rate": 0.0001583765535021668, + "loss": 0.8766, + "step": 19861 + }, + { + "epoch": 0.5100004653991761, + "grad_norm": 0.7421875, + "learning_rate": 0.00015837292885915463, + "loss": 0.9461, + "step": 19862 + }, + { + "epoch": 0.5100261425950979, + "grad_norm": 0.7265625, + "learning_rate": 0.00015836930409981047, + "loss": 0.9116, + "step": 19863 + }, + { + "epoch": 0.5100518197910198, + "grad_norm": 0.734375, + "learning_rate": 0.00015836567922414155, + "loss": 0.7574, + "step": 19864 + }, + { + "epoch": 0.5100774969869415, + "grad_norm": 0.83984375, + "learning_rate": 0.00015836205423215514, + "loss": 0.8935, + "step": 19865 + }, + { + "epoch": 0.5101031741828633, + "grad_norm": 0.7265625, + "learning_rate": 0.0001583584291238584, + "loss": 0.8598, + "step": 19866 + }, + { + "epoch": 0.5101288513787852, + "grad_norm": 0.70703125, + "learning_rate": 0.0001583548038992586, + "loss": 0.7441, + "step": 19867 + }, + { + "epoch": 0.510154528574707, + "grad_norm": 0.75, + "learning_rate": 0.00015835117855836294, + "loss": 0.8852, + "step": 19868 + }, + { + "epoch": 0.5101802057706288, + "grad_norm": 0.734375, + "learning_rate": 0.00015834755310117866, + "loss": 0.9333, + "step": 19869 + }, + { + "epoch": 0.5102058829665507, + "grad_norm": 0.74609375, + "learning_rate": 0.000158343927527713, + "loss": 0.8093, + "step": 19870 + }, + { + "epoch": 0.5102315601624725, + "grad_norm": 0.77734375, + "learning_rate": 0.00015834030183797313, + "loss": 0.8811, + "step": 19871 + }, + { + "epoch": 0.5102572373583942, + "grad_norm": 0.828125, + "learning_rate": 0.0001583366760319663, + "loss": 0.8204, + "step": 19872 + }, + { + "epoch": 0.5102829145543161, + "grad_norm": 0.78515625, + "learning_rate": 0.00015833305010969973, + "loss": 0.8768, + "step": 19873 + }, + { + "epoch": 0.5103085917502379, + "grad_norm": 0.76171875, + "learning_rate": 0.0001583294240711807, + "loss": 0.9198, + "step": 19874 + }, + { + "epoch": 0.5103342689461597, + "grad_norm": 0.8515625, + "learning_rate": 0.00015832579791641638, + "loss": 0.8093, + "step": 19875 + }, + { + "epoch": 0.5103599461420816, + "grad_norm": 0.76953125, + "learning_rate": 0.00015832217164541402, + "loss": 0.8639, + "step": 19876 + }, + { + "epoch": 0.5103856233380034, + "grad_norm": 0.7890625, + "learning_rate": 0.00015831854525818085, + "loss": 0.9516, + "step": 19877 + }, + { + "epoch": 0.5104113005339251, + "grad_norm": 0.77734375, + "learning_rate": 0.00015831491875472405, + "loss": 0.8547, + "step": 19878 + }, + { + "epoch": 0.510436977729847, + "grad_norm": 0.6953125, + "learning_rate": 0.00015831129213505091, + "loss": 0.7465, + "step": 19879 + }, + { + "epoch": 0.5104626549257688, + "grad_norm": 0.72265625, + "learning_rate": 0.00015830766539916863, + "loss": 0.917, + "step": 19880 + }, + { + "epoch": 0.5104883321216906, + "grad_norm": 0.8203125, + "learning_rate": 0.00015830403854708445, + "loss": 0.9747, + "step": 19881 + }, + { + "epoch": 0.5105140093176125, + "grad_norm": 0.75, + "learning_rate": 0.00015830041157880557, + "loss": 0.8781, + "step": 19882 + }, + { + "epoch": 0.5105396865135343, + "grad_norm": 0.828125, + "learning_rate": 0.00015829678449433927, + "loss": 0.8331, + "step": 19883 + }, + { + "epoch": 0.5105653637094562, + "grad_norm": 0.8046875, + "learning_rate": 0.0001582931572936927, + "loss": 0.9451, + "step": 19884 + }, + { + "epoch": 0.5105910409053779, + "grad_norm": 0.734375, + "learning_rate": 0.00015828952997687314, + "loss": 0.8449, + "step": 19885 + }, + { + "epoch": 0.5106167181012997, + "grad_norm": 0.76171875, + "learning_rate": 0.00015828590254388783, + "loss": 1.0719, + "step": 19886 + }, + { + "epoch": 0.5106423952972216, + "grad_norm": 0.7109375, + "learning_rate": 0.00015828227499474398, + "loss": 0.7656, + "step": 19887 + }, + { + "epoch": 0.5106680724931434, + "grad_norm": 0.7734375, + "learning_rate": 0.0001582786473294488, + "loss": 0.8944, + "step": 19888 + }, + { + "epoch": 0.5106937496890652, + "grad_norm": 0.87890625, + "learning_rate": 0.00015827501954800957, + "loss": 0.9195, + "step": 19889 + }, + { + "epoch": 0.5107194268849871, + "grad_norm": 0.76953125, + "learning_rate": 0.00015827139165043347, + "loss": 0.9563, + "step": 19890 + }, + { + "epoch": 0.5107451040809089, + "grad_norm": 0.7421875, + "learning_rate": 0.00015826776363672774, + "loss": 0.8327, + "step": 19891 + }, + { + "epoch": 0.5107707812768306, + "grad_norm": 0.83984375, + "learning_rate": 0.00015826413550689965, + "loss": 0.9606, + "step": 19892 + }, + { + "epoch": 0.5107964584727525, + "grad_norm": 0.7421875, + "learning_rate": 0.00015826050726095638, + "loss": 0.9429, + "step": 19893 + }, + { + "epoch": 0.5108221356686743, + "grad_norm": 0.76171875, + "learning_rate": 0.0001582568788989052, + "loss": 0.8923, + "step": 19894 + }, + { + "epoch": 0.5108478128645961, + "grad_norm": 0.73828125, + "learning_rate": 0.0001582532504207533, + "loss": 0.8938, + "step": 19895 + }, + { + "epoch": 0.510873490060518, + "grad_norm": 0.71875, + "learning_rate": 0.00015824962182650796, + "loss": 0.9022, + "step": 19896 + }, + { + "epoch": 0.5108991672564398, + "grad_norm": 0.78125, + "learning_rate": 0.00015824599311617637, + "loss": 0.7739, + "step": 19897 + }, + { + "epoch": 0.5109248444523615, + "grad_norm": 0.71484375, + "learning_rate": 0.00015824236428976579, + "loss": 0.9604, + "step": 19898 + }, + { + "epoch": 0.5109505216482834, + "grad_norm": 0.75, + "learning_rate": 0.0001582387353472834, + "loss": 0.9787, + "step": 19899 + }, + { + "epoch": 0.5109761988442052, + "grad_norm": 0.74609375, + "learning_rate": 0.00015823510628873647, + "loss": 0.8112, + "step": 19900 + }, + { + "epoch": 0.511001876040127, + "grad_norm": 0.8046875, + "learning_rate": 0.0001582314771141323, + "loss": 1.1439, + "step": 19901 + }, + { + "epoch": 0.5110275532360489, + "grad_norm": 0.73046875, + "learning_rate": 0.000158227847823478, + "loss": 0.8674, + "step": 19902 + }, + { + "epoch": 0.5110532304319707, + "grad_norm": 0.8359375, + "learning_rate": 0.0001582242184167809, + "loss": 0.9426, + "step": 19903 + }, + { + "epoch": 0.5110789076278925, + "grad_norm": 0.85546875, + "learning_rate": 0.00015822058889404815, + "loss": 0.9484, + "step": 19904 + }, + { + "epoch": 0.5111045848238143, + "grad_norm": 0.74609375, + "learning_rate": 0.00015821695925528702, + "loss": 1.0335, + "step": 19905 + }, + { + "epoch": 0.5111302620197361, + "grad_norm": 0.73828125, + "learning_rate": 0.00015821332950050477, + "loss": 0.9196, + "step": 19906 + }, + { + "epoch": 0.5111559392156579, + "grad_norm": 0.82421875, + "learning_rate": 0.0001582096996297086, + "loss": 0.8468, + "step": 19907 + }, + { + "epoch": 0.5111816164115798, + "grad_norm": 0.84765625, + "learning_rate": 0.0001582060696429058, + "loss": 1.007, + "step": 19908 + }, + { + "epoch": 0.5112072936075016, + "grad_norm": 0.78125, + "learning_rate": 0.0001582024395401035, + "loss": 0.8459, + "step": 19909 + }, + { + "epoch": 0.5112329708034234, + "grad_norm": 0.765625, + "learning_rate": 0.00015819880932130907, + "loss": 0.8975, + "step": 19910 + }, + { + "epoch": 0.5112586479993453, + "grad_norm": 0.85546875, + "learning_rate": 0.00015819517898652962, + "loss": 0.9994, + "step": 19911 + }, + { + "epoch": 0.511284325195267, + "grad_norm": 0.82421875, + "learning_rate": 0.00015819154853577246, + "loss": 0.8776, + "step": 19912 + }, + { + "epoch": 0.5113100023911888, + "grad_norm": 0.9140625, + "learning_rate": 0.00015818791796904478, + "loss": 0.8613, + "step": 19913 + }, + { + "epoch": 0.5113356795871107, + "grad_norm": 0.79296875, + "learning_rate": 0.00015818428728635385, + "loss": 0.8543, + "step": 19914 + }, + { + "epoch": 0.5113613567830325, + "grad_norm": 0.828125, + "learning_rate": 0.00015818065648770687, + "loss": 0.7963, + "step": 19915 + }, + { + "epoch": 0.5113870339789544, + "grad_norm": 0.859375, + "learning_rate": 0.0001581770255731111, + "loss": 0.9842, + "step": 19916 + }, + { + "epoch": 0.5114127111748762, + "grad_norm": 0.78515625, + "learning_rate": 0.00015817339454257377, + "loss": 0.8423, + "step": 19917 + }, + { + "epoch": 0.5114383883707979, + "grad_norm": 0.83203125, + "learning_rate": 0.00015816976339610213, + "loss": 0.9754, + "step": 19918 + }, + { + "epoch": 0.5114640655667197, + "grad_norm": 0.77734375, + "learning_rate": 0.00015816613213370342, + "loss": 0.9109, + "step": 19919 + }, + { + "epoch": 0.5114897427626416, + "grad_norm": 0.8984375, + "learning_rate": 0.00015816250075538482, + "loss": 0.9397, + "step": 19920 + }, + { + "epoch": 0.5115154199585634, + "grad_norm": 0.71484375, + "learning_rate": 0.00015815886926115366, + "loss": 0.6661, + "step": 19921 + }, + { + "epoch": 0.5115410971544853, + "grad_norm": 0.75390625, + "learning_rate": 0.0001581552376510171, + "loss": 0.7396, + "step": 19922 + }, + { + "epoch": 0.5115667743504071, + "grad_norm": 0.7890625, + "learning_rate": 0.00015815160592498243, + "loss": 0.9081, + "step": 19923 + }, + { + "epoch": 0.5115924515463289, + "grad_norm": 0.8046875, + "learning_rate": 0.00015814797408305684, + "loss": 0.8332, + "step": 19924 + }, + { + "epoch": 0.5116181287422507, + "grad_norm": 0.76171875, + "learning_rate": 0.00015814434212524757, + "loss": 0.8195, + "step": 19925 + }, + { + "epoch": 0.5116438059381725, + "grad_norm": 0.74609375, + "learning_rate": 0.0001581407100515619, + "loss": 1.0353, + "step": 19926 + }, + { + "epoch": 0.5116694831340943, + "grad_norm": 0.79296875, + "learning_rate": 0.00015813707786200705, + "loss": 0.8618, + "step": 19927 + }, + { + "epoch": 0.5116951603300162, + "grad_norm": 0.734375, + "learning_rate": 0.00015813344555659024, + "loss": 0.9655, + "step": 19928 + }, + { + "epoch": 0.511720837525938, + "grad_norm": 0.828125, + "learning_rate": 0.00015812981313531875, + "loss": 0.8537, + "step": 19929 + }, + { + "epoch": 0.5117465147218598, + "grad_norm": 0.828125, + "learning_rate": 0.0001581261805981998, + "loss": 0.9471, + "step": 19930 + }, + { + "epoch": 0.5117721919177817, + "grad_norm": 0.80078125, + "learning_rate": 0.00015812254794524057, + "loss": 1.0888, + "step": 19931 + }, + { + "epoch": 0.5117978691137034, + "grad_norm": 0.76953125, + "learning_rate": 0.00015811891517644838, + "loss": 0.8918, + "step": 19932 + }, + { + "epoch": 0.5118235463096252, + "grad_norm": 0.7890625, + "learning_rate": 0.00015811528229183047, + "loss": 0.9405, + "step": 19933 + }, + { + "epoch": 0.5118492235055471, + "grad_norm": 0.7578125, + "learning_rate": 0.000158111649291394, + "loss": 0.8202, + "step": 19934 + }, + { + "epoch": 0.5118749007014689, + "grad_norm": 0.734375, + "learning_rate": 0.0001581080161751463, + "loss": 0.8828, + "step": 19935 + }, + { + "epoch": 0.5119005778973907, + "grad_norm": 0.86328125, + "learning_rate": 0.00015810438294309458, + "loss": 0.8617, + "step": 19936 + }, + { + "epoch": 0.5119262550933126, + "grad_norm": 0.73828125, + "learning_rate": 0.00015810074959524606, + "loss": 0.8569, + "step": 19937 + }, + { + "epoch": 0.5119519322892343, + "grad_norm": 0.75, + "learning_rate": 0.000158097116131608, + "loss": 0.8477, + "step": 19938 + }, + { + "epoch": 0.5119776094851561, + "grad_norm": 0.83203125, + "learning_rate": 0.0001580934825521876, + "loss": 0.9335, + "step": 19939 + }, + { + "epoch": 0.512003286681078, + "grad_norm": 0.78515625, + "learning_rate": 0.00015808984885699217, + "loss": 0.8304, + "step": 19940 + }, + { + "epoch": 0.5120289638769998, + "grad_norm": 0.80859375, + "learning_rate": 0.0001580862150460289, + "loss": 0.8289, + "step": 19941 + }, + { + "epoch": 0.5120546410729216, + "grad_norm": 0.72265625, + "learning_rate": 0.00015808258111930508, + "loss": 0.8398, + "step": 19942 + }, + { + "epoch": 0.5120803182688435, + "grad_norm": 0.80078125, + "learning_rate": 0.00015807894707682788, + "loss": 0.8574, + "step": 19943 + }, + { + "epoch": 0.5121059954647653, + "grad_norm": 0.85546875, + "learning_rate": 0.00015807531291860464, + "loss": 0.9692, + "step": 19944 + }, + { + "epoch": 0.512131672660687, + "grad_norm": 0.76953125, + "learning_rate": 0.0001580716786446425, + "loss": 1.0133, + "step": 19945 + }, + { + "epoch": 0.5121573498566089, + "grad_norm": 0.74609375, + "learning_rate": 0.00015806804425494878, + "loss": 0.9668, + "step": 19946 + }, + { + "epoch": 0.5121830270525307, + "grad_norm": 0.7890625, + "learning_rate": 0.00015806440974953066, + "loss": 0.9618, + "step": 19947 + }, + { + "epoch": 0.5122087042484526, + "grad_norm": 0.8046875, + "learning_rate": 0.00015806077512839543, + "loss": 1.0259, + "step": 19948 + }, + { + "epoch": 0.5122343814443744, + "grad_norm": 0.72265625, + "learning_rate": 0.00015805714039155032, + "loss": 0.8611, + "step": 19949 + }, + { + "epoch": 0.5122600586402962, + "grad_norm": 0.7890625, + "learning_rate": 0.0001580535055390026, + "loss": 0.8877, + "step": 19950 + }, + { + "epoch": 0.5122857358362181, + "grad_norm": 0.7890625, + "learning_rate": 0.00015804987057075947, + "loss": 0.827, + "step": 19951 + }, + { + "epoch": 0.5123114130321398, + "grad_norm": 0.796875, + "learning_rate": 0.00015804623548682818, + "loss": 0.8342, + "step": 19952 + }, + { + "epoch": 0.5123370902280616, + "grad_norm": 0.7421875, + "learning_rate": 0.00015804260028721601, + "loss": 0.9172, + "step": 19953 + }, + { + "epoch": 0.5123627674239835, + "grad_norm": 0.77734375, + "learning_rate": 0.00015803896497193013, + "loss": 0.863, + "step": 19954 + }, + { + "epoch": 0.5123884446199053, + "grad_norm": 0.69140625, + "learning_rate": 0.0001580353295409779, + "loss": 0.7525, + "step": 19955 + }, + { + "epoch": 0.5124141218158271, + "grad_norm": 2.078125, + "learning_rate": 0.00015803169399436647, + "loss": 0.834, + "step": 19956 + }, + { + "epoch": 0.512439799011749, + "grad_norm": 0.7421875, + "learning_rate": 0.0001580280583321031, + "loss": 0.934, + "step": 19957 + }, + { + "epoch": 0.5124654762076707, + "grad_norm": 0.7265625, + "learning_rate": 0.00015802442255419507, + "loss": 0.8787, + "step": 19958 + }, + { + "epoch": 0.5124911534035925, + "grad_norm": 0.78515625, + "learning_rate": 0.0001580207866606496, + "loss": 0.9979, + "step": 19959 + }, + { + "epoch": 0.5125168305995144, + "grad_norm": 0.875, + "learning_rate": 0.00015801715065147395, + "loss": 0.857, + "step": 19960 + }, + { + "epoch": 0.5125425077954362, + "grad_norm": 0.80078125, + "learning_rate": 0.00015801351452667537, + "loss": 0.8042, + "step": 19961 + }, + { + "epoch": 0.512568184991358, + "grad_norm": 0.79296875, + "learning_rate": 0.00015800987828626107, + "loss": 0.8993, + "step": 19962 + }, + { + "epoch": 0.5125938621872799, + "grad_norm": 0.8046875, + "learning_rate": 0.00015800624193023835, + "loss": 0.9521, + "step": 19963 + }, + { + "epoch": 0.5126195393832017, + "grad_norm": 0.734375, + "learning_rate": 0.0001580026054586144, + "loss": 0.7996, + "step": 19964 + }, + { + "epoch": 0.5126452165791234, + "grad_norm": 0.78125, + "learning_rate": 0.0001579989688713965, + "loss": 0.8692, + "step": 19965 + }, + { + "epoch": 0.5126708937750453, + "grad_norm": 0.78515625, + "learning_rate": 0.00015799533216859192, + "loss": 1.0217, + "step": 19966 + }, + { + "epoch": 0.5126965709709671, + "grad_norm": 0.80859375, + "learning_rate": 0.00015799169535020785, + "loss": 0.9846, + "step": 19967 + }, + { + "epoch": 0.5127222481668889, + "grad_norm": 0.7578125, + "learning_rate": 0.0001579880584162516, + "loss": 0.822, + "step": 19968 + }, + { + "epoch": 0.5127479253628108, + "grad_norm": 0.73046875, + "learning_rate": 0.0001579844213667304, + "loss": 0.6594, + "step": 19969 + }, + { + "epoch": 0.5127736025587326, + "grad_norm": 0.69921875, + "learning_rate": 0.00015798078420165145, + "loss": 0.9764, + "step": 19970 + }, + { + "epoch": 0.5127992797546544, + "grad_norm": 0.80078125, + "learning_rate": 0.00015797714692102207, + "loss": 0.9391, + "step": 19971 + }, + { + "epoch": 0.5128249569505762, + "grad_norm": 0.765625, + "learning_rate": 0.00015797350952484942, + "loss": 0.831, + "step": 19972 + }, + { + "epoch": 0.512850634146498, + "grad_norm": 0.7265625, + "learning_rate": 0.00015796987201314083, + "loss": 0.8145, + "step": 19973 + }, + { + "epoch": 0.5128763113424198, + "grad_norm": 0.7421875, + "learning_rate": 0.00015796623438590353, + "loss": 0.9087, + "step": 19974 + }, + { + "epoch": 0.5129019885383417, + "grad_norm": 0.7578125, + "learning_rate": 0.00015796259664314474, + "loss": 0.9171, + "step": 19975 + }, + { + "epoch": 0.5129276657342635, + "grad_norm": 0.76171875, + "learning_rate": 0.00015795895878487174, + "loss": 0.9701, + "step": 19976 + }, + { + "epoch": 0.5129533429301854, + "grad_norm": 0.74609375, + "learning_rate": 0.00015795532081109176, + "loss": 0.7798, + "step": 19977 + }, + { + "epoch": 0.5129790201261071, + "grad_norm": 0.7421875, + "learning_rate": 0.00015795168272181211, + "loss": 0.9757, + "step": 19978 + }, + { + "epoch": 0.5130046973220289, + "grad_norm": 0.72265625, + "learning_rate": 0.00015794804451703994, + "loss": 0.8604, + "step": 19979 + }, + { + "epoch": 0.5130303745179507, + "grad_norm": 0.7734375, + "learning_rate": 0.00015794440619678257, + "loss": 0.7572, + "step": 19980 + }, + { + "epoch": 0.5130560517138726, + "grad_norm": 0.7734375, + "learning_rate": 0.00015794076776104722, + "loss": 0.9183, + "step": 19981 + }, + { + "epoch": 0.5130817289097944, + "grad_norm": 0.8359375, + "learning_rate": 0.00015793712920984118, + "loss": 0.8901, + "step": 19982 + }, + { + "epoch": 0.5131074061057163, + "grad_norm": 0.75390625, + "learning_rate": 0.00015793349054317167, + "loss": 0.8035, + "step": 19983 + }, + { + "epoch": 0.5131330833016381, + "grad_norm": 0.76953125, + "learning_rate": 0.0001579298517610459, + "loss": 0.9123, + "step": 19984 + }, + { + "epoch": 0.5131587604975598, + "grad_norm": 0.83203125, + "learning_rate": 0.00015792621286347123, + "loss": 0.8863, + "step": 19985 + }, + { + "epoch": 0.5131844376934817, + "grad_norm": 0.75, + "learning_rate": 0.00015792257385045482, + "loss": 0.8997, + "step": 19986 + }, + { + "epoch": 0.5132101148894035, + "grad_norm": 0.734375, + "learning_rate": 0.00015791893472200393, + "loss": 0.8639, + "step": 19987 + }, + { + "epoch": 0.5132357920853253, + "grad_norm": 0.734375, + "learning_rate": 0.0001579152954781259, + "loss": 0.8479, + "step": 19988 + }, + { + "epoch": 0.5132614692812472, + "grad_norm": 0.75, + "learning_rate": 0.00015791165611882786, + "loss": 0.8658, + "step": 19989 + }, + { + "epoch": 0.513287146477169, + "grad_norm": 0.79296875, + "learning_rate": 0.00015790801664411714, + "loss": 0.8573, + "step": 19990 + }, + { + "epoch": 0.5133128236730908, + "grad_norm": 0.796875, + "learning_rate": 0.00015790437705400098, + "loss": 0.9719, + "step": 19991 + }, + { + "epoch": 0.5133385008690126, + "grad_norm": 0.765625, + "learning_rate": 0.0001579007373484866, + "loss": 0.8127, + "step": 19992 + }, + { + "epoch": 0.5133641780649344, + "grad_norm": 0.71875, + "learning_rate": 0.0001578970975275813, + "loss": 0.9123, + "step": 19993 + }, + { + "epoch": 0.5133898552608562, + "grad_norm": 0.78515625, + "learning_rate": 0.00015789345759129232, + "loss": 0.8794, + "step": 19994 + }, + { + "epoch": 0.5134155324567781, + "grad_norm": 0.80078125, + "learning_rate": 0.00015788981753962693, + "loss": 0.8956, + "step": 19995 + }, + { + "epoch": 0.5134412096526999, + "grad_norm": 0.8046875, + "learning_rate": 0.00015788617737259232, + "loss": 0.9075, + "step": 19996 + }, + { + "epoch": 0.5134668868486217, + "grad_norm": 0.86328125, + "learning_rate": 0.0001578825370901958, + "loss": 1.0393, + "step": 19997 + }, + { + "epoch": 0.5134925640445435, + "grad_norm": 0.76953125, + "learning_rate": 0.00015787889669244463, + "loss": 0.8227, + "step": 19998 + }, + { + "epoch": 0.5135182412404653, + "grad_norm": 0.78515625, + "learning_rate": 0.00015787525617934602, + "loss": 0.7602, + "step": 19999 + }, + { + "epoch": 0.5135439184363871, + "grad_norm": 0.76953125, + "learning_rate": 0.00015787161555090725, + "loss": 1.0335, + "step": 20000 + }, + { + "epoch": 0.5135439184363871, + "eval_loss": 0.8822827935218811, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 405.1172, + "eval_samples_per_second": 24.684, + "eval_steps_per_second": 0.773, + "step": 20000 + }, + { + "epoch": 0.513569595632309, + "grad_norm": 0.8671875, + "learning_rate": 0.00015786797480713563, + "loss": 0.9543, + "step": 20001 + }, + { + "epoch": 0.5135952728282308, + "grad_norm": 0.82421875, + "learning_rate": 0.0001578643339480383, + "loss": 0.9423, + "step": 20002 + }, + { + "epoch": 0.5136209500241526, + "grad_norm": 0.87109375, + "learning_rate": 0.0001578606929736226, + "loss": 0.9768, + "step": 20003 + }, + { + "epoch": 0.5136466272200745, + "grad_norm": 0.76171875, + "learning_rate": 0.00015785705188389577, + "loss": 0.8896, + "step": 20004 + }, + { + "epoch": 0.5136723044159962, + "grad_norm": 0.78515625, + "learning_rate": 0.00015785341067886508, + "loss": 0.9152, + "step": 20005 + }, + { + "epoch": 0.513697981611918, + "grad_norm": 0.83984375, + "learning_rate": 0.00015784976935853776, + "loss": 1.0084, + "step": 20006 + }, + { + "epoch": 0.5137236588078399, + "grad_norm": 0.74609375, + "learning_rate": 0.00015784612792292104, + "loss": 0.8207, + "step": 20007 + }, + { + "epoch": 0.5137493360037617, + "grad_norm": 0.78125, + "learning_rate": 0.00015784248637202227, + "loss": 0.9628, + "step": 20008 + }, + { + "epoch": 0.5137750131996836, + "grad_norm": 0.796875, + "learning_rate": 0.0001578388447058486, + "loss": 0.9614, + "step": 20009 + }, + { + "epoch": 0.5138006903956054, + "grad_norm": 0.77734375, + "learning_rate": 0.00015783520292440736, + "loss": 0.9297, + "step": 20010 + }, + { + "epoch": 0.5138263675915272, + "grad_norm": 0.78125, + "learning_rate": 0.00015783156102770578, + "loss": 0.9286, + "step": 20011 + }, + { + "epoch": 0.513852044787449, + "grad_norm": 0.78125, + "learning_rate": 0.00015782791901575112, + "loss": 0.958, + "step": 20012 + }, + { + "epoch": 0.5138777219833708, + "grad_norm": 0.8671875, + "learning_rate": 0.00015782427688855065, + "loss": 0.868, + "step": 20013 + }, + { + "epoch": 0.5139033991792926, + "grad_norm": 0.76171875, + "learning_rate": 0.0001578206346461116, + "loss": 0.8798, + "step": 20014 + }, + { + "epoch": 0.5139290763752145, + "grad_norm": 0.73828125, + "learning_rate": 0.00015781699228844127, + "loss": 0.9009, + "step": 20015 + }, + { + "epoch": 0.5139547535711363, + "grad_norm": 0.7734375, + "learning_rate": 0.00015781334981554687, + "loss": 0.9893, + "step": 20016 + }, + { + "epoch": 0.5139804307670581, + "grad_norm": 0.8203125, + "learning_rate": 0.0001578097072274357, + "loss": 0.8327, + "step": 20017 + }, + { + "epoch": 0.5140061079629799, + "grad_norm": 0.8125, + "learning_rate": 0.00015780606452411503, + "loss": 0.8356, + "step": 20018 + }, + { + "epoch": 0.5140317851589017, + "grad_norm": 0.8046875, + "learning_rate": 0.00015780242170559204, + "loss": 0.8981, + "step": 20019 + }, + { + "epoch": 0.5140574623548235, + "grad_norm": 0.83984375, + "learning_rate": 0.0001577987787718741, + "loss": 0.8442, + "step": 20020 + }, + { + "epoch": 0.5140831395507454, + "grad_norm": 0.7890625, + "learning_rate": 0.00015779513572296842, + "loss": 0.8967, + "step": 20021 + }, + { + "epoch": 0.5141088167466672, + "grad_norm": 0.765625, + "learning_rate": 0.00015779149255888223, + "loss": 0.9106, + "step": 20022 + }, + { + "epoch": 0.514134493942589, + "grad_norm": 0.76171875, + "learning_rate": 0.0001577878492796228, + "loss": 0.8704, + "step": 20023 + }, + { + "epoch": 0.5141601711385109, + "grad_norm": 0.76953125, + "learning_rate": 0.00015778420588519743, + "loss": 0.9321, + "step": 20024 + }, + { + "epoch": 0.5141858483344326, + "grad_norm": 0.890625, + "learning_rate": 0.0001577805623756133, + "loss": 0.8864, + "step": 20025 + }, + { + "epoch": 0.5142115255303544, + "grad_norm": 0.74609375, + "learning_rate": 0.0001577769187508778, + "loss": 0.8453, + "step": 20026 + }, + { + "epoch": 0.5142372027262763, + "grad_norm": 0.73046875, + "learning_rate": 0.0001577732750109981, + "loss": 0.814, + "step": 20027 + }, + { + "epoch": 0.5142628799221981, + "grad_norm": 0.79296875, + "learning_rate": 0.0001577696311559815, + "loss": 0.8222, + "step": 20028 + }, + { + "epoch": 0.5142885571181199, + "grad_norm": 0.81640625, + "learning_rate": 0.00015776598718583523, + "loss": 0.8584, + "step": 20029 + }, + { + "epoch": 0.5143142343140418, + "grad_norm": 0.76953125, + "learning_rate": 0.00015776234310056655, + "loss": 0.808, + "step": 20030 + }, + { + "epoch": 0.5143399115099636, + "grad_norm": 0.82421875, + "learning_rate": 0.00015775869890018274, + "loss": 0.9571, + "step": 20031 + }, + { + "epoch": 0.5143655887058853, + "grad_norm": 0.75390625, + "learning_rate": 0.00015775505458469105, + "loss": 0.8889, + "step": 20032 + }, + { + "epoch": 0.5143912659018072, + "grad_norm": 0.81640625, + "learning_rate": 0.00015775141015409876, + "loss": 0.8422, + "step": 20033 + }, + { + "epoch": 0.514416943097729, + "grad_norm": 0.7421875, + "learning_rate": 0.00015774776560841317, + "loss": 0.8021, + "step": 20034 + }, + { + "epoch": 0.5144426202936508, + "grad_norm": 0.6953125, + "learning_rate": 0.00015774412094764147, + "loss": 0.9959, + "step": 20035 + }, + { + "epoch": 0.5144682974895727, + "grad_norm": 0.7109375, + "learning_rate": 0.00015774047617179093, + "loss": 0.8483, + "step": 20036 + }, + { + "epoch": 0.5144939746854945, + "grad_norm": 0.79296875, + "learning_rate": 0.00015773683128086886, + "loss": 0.8043, + "step": 20037 + }, + { + "epoch": 0.5145196518814162, + "grad_norm": 0.75, + "learning_rate": 0.0001577331862748825, + "loss": 0.7963, + "step": 20038 + }, + { + "epoch": 0.5145453290773381, + "grad_norm": 0.78125, + "learning_rate": 0.0001577295411538391, + "loss": 0.8256, + "step": 20039 + }, + { + "epoch": 0.5145710062732599, + "grad_norm": 0.76953125, + "learning_rate": 0.00015772589591774594, + "loss": 1.0084, + "step": 20040 + }, + { + "epoch": 0.5145966834691817, + "grad_norm": 0.79296875, + "learning_rate": 0.0001577222505666103, + "loss": 0.8635, + "step": 20041 + }, + { + "epoch": 0.5146223606651036, + "grad_norm": 0.78125, + "learning_rate": 0.00015771860510043942, + "loss": 0.886, + "step": 20042 + }, + { + "epoch": 0.5146480378610254, + "grad_norm": 0.80078125, + "learning_rate": 0.00015771495951924058, + "loss": 0.9604, + "step": 20043 + }, + { + "epoch": 0.5146737150569473, + "grad_norm": 0.734375, + "learning_rate": 0.00015771131382302103, + "loss": 0.8191, + "step": 20044 + }, + { + "epoch": 0.514699392252869, + "grad_norm": 0.8125, + "learning_rate": 0.00015770766801178802, + "loss": 0.9271, + "step": 20045 + }, + { + "epoch": 0.5147250694487908, + "grad_norm": 0.73828125, + "learning_rate": 0.00015770402208554888, + "loss": 0.8477, + "step": 20046 + }, + { + "epoch": 0.5147507466447127, + "grad_norm": 0.8828125, + "learning_rate": 0.00015770037604431083, + "loss": 0.9604, + "step": 20047 + }, + { + "epoch": 0.5147764238406345, + "grad_norm": 0.7734375, + "learning_rate": 0.00015769672988808116, + "loss": 0.7656, + "step": 20048 + }, + { + "epoch": 0.5148021010365563, + "grad_norm": 0.83203125, + "learning_rate": 0.00015769308361686707, + "loss": 0.9592, + "step": 20049 + }, + { + "epoch": 0.5148277782324782, + "grad_norm": 0.72265625, + "learning_rate": 0.0001576894372306759, + "loss": 0.7385, + "step": 20050 + }, + { + "epoch": 0.5148534554283999, + "grad_norm": 0.75, + "learning_rate": 0.00015768579072951493, + "loss": 0.8331, + "step": 20051 + }, + { + "epoch": 0.5148791326243217, + "grad_norm": 0.78515625, + "learning_rate": 0.00015768214411339133, + "loss": 0.8108, + "step": 20052 + }, + { + "epoch": 0.5149048098202436, + "grad_norm": 0.78515625, + "learning_rate": 0.00015767849738231244, + "loss": 0.9936, + "step": 20053 + }, + { + "epoch": 0.5149304870161654, + "grad_norm": 0.8125, + "learning_rate": 0.00015767485053628553, + "loss": 0.8628, + "step": 20054 + }, + { + "epoch": 0.5149561642120872, + "grad_norm": 0.77734375, + "learning_rate": 0.0001576712035753179, + "loss": 0.8721, + "step": 20055 + }, + { + "epoch": 0.5149818414080091, + "grad_norm": 0.76953125, + "learning_rate": 0.00015766755649941668, + "loss": 0.9179, + "step": 20056 + }, + { + "epoch": 0.5150075186039309, + "grad_norm": 0.859375, + "learning_rate": 0.00015766390930858928, + "loss": 1.0006, + "step": 20057 + }, + { + "epoch": 0.5150331957998526, + "grad_norm": 0.8125, + "learning_rate": 0.0001576602620028429, + "loss": 0.8832, + "step": 20058 + }, + { + "epoch": 0.5150588729957745, + "grad_norm": 0.71875, + "learning_rate": 0.00015765661458218484, + "loss": 0.7765, + "step": 20059 + }, + { + "epoch": 0.5150845501916963, + "grad_norm": 0.703125, + "learning_rate": 0.00015765296704662235, + "loss": 0.7124, + "step": 20060 + }, + { + "epoch": 0.5151102273876181, + "grad_norm": 0.70703125, + "learning_rate": 0.0001576493193961627, + "loss": 0.8827, + "step": 20061 + }, + { + "epoch": 0.51513590458354, + "grad_norm": 0.875, + "learning_rate": 0.00015764567163081315, + "loss": 0.9194, + "step": 20062 + }, + { + "epoch": 0.5151615817794618, + "grad_norm": 0.76953125, + "learning_rate": 0.000157642023750581, + "loss": 0.9477, + "step": 20063 + }, + { + "epoch": 0.5151872589753836, + "grad_norm": 0.71484375, + "learning_rate": 0.00015763837575547354, + "loss": 0.8719, + "step": 20064 + }, + { + "epoch": 0.5152129361713054, + "grad_norm": 0.76171875, + "learning_rate": 0.00015763472764549795, + "loss": 0.7549, + "step": 20065 + }, + { + "epoch": 0.5152386133672272, + "grad_norm": 0.83203125, + "learning_rate": 0.00015763107942066154, + "loss": 0.9832, + "step": 20066 + }, + { + "epoch": 0.515264290563149, + "grad_norm": 0.75390625, + "learning_rate": 0.00015762743108097167, + "loss": 0.9233, + "step": 20067 + }, + { + "epoch": 0.5152899677590709, + "grad_norm": 0.7734375, + "learning_rate": 0.00015762378262643547, + "loss": 0.7906, + "step": 20068 + }, + { + "epoch": 0.5153156449549927, + "grad_norm": 0.78515625, + "learning_rate": 0.00015762013405706032, + "loss": 0.8641, + "step": 20069 + }, + { + "epoch": 0.5153413221509145, + "grad_norm": 0.734375, + "learning_rate": 0.00015761648537285342, + "loss": 0.7765, + "step": 20070 + }, + { + "epoch": 0.5153669993468363, + "grad_norm": 0.71875, + "learning_rate": 0.00015761283657382205, + "loss": 0.7752, + "step": 20071 + }, + { + "epoch": 0.5153926765427581, + "grad_norm": 0.7578125, + "learning_rate": 0.00015760918765997354, + "loss": 0.8287, + "step": 20072 + }, + { + "epoch": 0.51541835373868, + "grad_norm": 0.734375, + "learning_rate": 0.00015760553863131512, + "loss": 0.9172, + "step": 20073 + }, + { + "epoch": 0.5154440309346018, + "grad_norm": 0.8046875, + "learning_rate": 0.00015760188948785403, + "loss": 0.8513, + "step": 20074 + }, + { + "epoch": 0.5154697081305236, + "grad_norm": 0.7734375, + "learning_rate": 0.0001575982402295976, + "loss": 0.8858, + "step": 20075 + }, + { + "epoch": 0.5154953853264455, + "grad_norm": 0.78125, + "learning_rate": 0.00015759459085655308, + "loss": 0.8604, + "step": 20076 + }, + { + "epoch": 0.5155210625223673, + "grad_norm": 0.78125, + "learning_rate": 0.00015759094136872778, + "loss": 0.9251, + "step": 20077 + }, + { + "epoch": 0.515546739718289, + "grad_norm": 0.75390625, + "learning_rate": 0.00015758729176612888, + "loss": 0.914, + "step": 20078 + }, + { + "epoch": 0.5155724169142109, + "grad_norm": 0.87109375, + "learning_rate": 0.0001575836420487637, + "loss": 0.9156, + "step": 20079 + }, + { + "epoch": 0.5155980941101327, + "grad_norm": 0.76953125, + "learning_rate": 0.00015757999221663958, + "loss": 0.8702, + "step": 20080 + }, + { + "epoch": 0.5156237713060545, + "grad_norm": 0.69921875, + "learning_rate": 0.0001575763422697637, + "loss": 0.8288, + "step": 20081 + }, + { + "epoch": 0.5156494485019764, + "grad_norm": 0.80859375, + "learning_rate": 0.0001575726922081434, + "loss": 0.8292, + "step": 20082 + }, + { + "epoch": 0.5156751256978982, + "grad_norm": 0.8125, + "learning_rate": 0.00015756904203178588, + "loss": 0.9568, + "step": 20083 + }, + { + "epoch": 0.51570080289382, + "grad_norm": 0.77734375, + "learning_rate": 0.00015756539174069847, + "loss": 0.9065, + "step": 20084 + }, + { + "epoch": 0.5157264800897418, + "grad_norm": 0.765625, + "learning_rate": 0.00015756174133488846, + "loss": 0.8109, + "step": 20085 + }, + { + "epoch": 0.5157521572856636, + "grad_norm": 0.7578125, + "learning_rate": 0.0001575580908143631, + "loss": 0.9421, + "step": 20086 + }, + { + "epoch": 0.5157778344815854, + "grad_norm": 0.69921875, + "learning_rate": 0.00015755444017912963, + "loss": 0.8241, + "step": 20087 + }, + { + "epoch": 0.5158035116775073, + "grad_norm": 0.8046875, + "learning_rate": 0.0001575507894291954, + "loss": 0.8555, + "step": 20088 + }, + { + "epoch": 0.5158291888734291, + "grad_norm": 0.78125, + "learning_rate": 0.00015754713856456763, + "loss": 0.8861, + "step": 20089 + }, + { + "epoch": 0.5158548660693509, + "grad_norm": 0.8046875, + "learning_rate": 0.0001575434875852536, + "loss": 1.0477, + "step": 20090 + }, + { + "epoch": 0.5158805432652727, + "grad_norm": 0.734375, + "learning_rate": 0.0001575398364912606, + "loss": 0.847, + "step": 20091 + }, + { + "epoch": 0.5159062204611945, + "grad_norm": 0.87109375, + "learning_rate": 0.00015753618528259588, + "loss": 0.938, + "step": 20092 + }, + { + "epoch": 0.5159318976571163, + "grad_norm": 0.76171875, + "learning_rate": 0.00015753253395926677, + "loss": 0.8579, + "step": 20093 + }, + { + "epoch": 0.5159575748530382, + "grad_norm": 0.828125, + "learning_rate": 0.00015752888252128054, + "loss": 0.8522, + "step": 20094 + }, + { + "epoch": 0.51598325204896, + "grad_norm": 0.75, + "learning_rate": 0.00015752523096864443, + "loss": 0.8758, + "step": 20095 + }, + { + "epoch": 0.5160089292448818, + "grad_norm": 0.80078125, + "learning_rate": 0.0001575215793013657, + "loss": 0.7647, + "step": 20096 + }, + { + "epoch": 0.5160346064408037, + "grad_norm": 0.734375, + "learning_rate": 0.00015751792751945168, + "loss": 0.6714, + "step": 20097 + }, + { + "epoch": 0.5160602836367254, + "grad_norm": 0.75390625, + "learning_rate": 0.00015751427562290962, + "loss": 0.7805, + "step": 20098 + }, + { + "epoch": 0.5160859608326472, + "grad_norm": 0.93359375, + "learning_rate": 0.0001575106236117468, + "loss": 0.9154, + "step": 20099 + }, + { + "epoch": 0.5161116380285691, + "grad_norm": 0.765625, + "learning_rate": 0.00015750697148597055, + "loss": 0.9649, + "step": 20100 + }, + { + "epoch": 0.5161373152244909, + "grad_norm": 0.875, + "learning_rate": 0.000157503319245588, + "loss": 0.8935, + "step": 20101 + }, + { + "epoch": 0.5161629924204127, + "grad_norm": 0.73828125, + "learning_rate": 0.0001574996668906066, + "loss": 0.8271, + "step": 20102 + }, + { + "epoch": 0.5161886696163346, + "grad_norm": 0.859375, + "learning_rate": 0.00015749601442103357, + "loss": 0.8768, + "step": 20103 + }, + { + "epoch": 0.5162143468122564, + "grad_norm": 0.79296875, + "learning_rate": 0.00015749236183687612, + "loss": 0.851, + "step": 20104 + }, + { + "epoch": 0.5162400240081781, + "grad_norm": 0.78515625, + "learning_rate": 0.00015748870913814163, + "loss": 0.866, + "step": 20105 + }, + { + "epoch": 0.5162657012041, + "grad_norm": 0.8203125, + "learning_rate": 0.00015748505632483733, + "loss": 0.7938, + "step": 20106 + }, + { + "epoch": 0.5162913784000218, + "grad_norm": 0.78515625, + "learning_rate": 0.0001574814033969705, + "loss": 0.8876, + "step": 20107 + }, + { + "epoch": 0.5163170555959437, + "grad_norm": 0.73828125, + "learning_rate": 0.00015747775035454843, + "loss": 0.8228, + "step": 20108 + }, + { + "epoch": 0.5163427327918655, + "grad_norm": 0.76953125, + "learning_rate": 0.00015747409719757839, + "loss": 0.7751, + "step": 20109 + }, + { + "epoch": 0.5163684099877873, + "grad_norm": 0.8046875, + "learning_rate": 0.00015747044392606764, + "loss": 0.8654, + "step": 20110 + }, + { + "epoch": 0.516394087183709, + "grad_norm": 0.84765625, + "learning_rate": 0.00015746679054002354, + "loss": 0.9466, + "step": 20111 + }, + { + "epoch": 0.5164197643796309, + "grad_norm": 1.984375, + "learning_rate": 0.0001574631370394533, + "loss": 1.0305, + "step": 20112 + }, + { + "epoch": 0.5164454415755527, + "grad_norm": 0.83984375, + "learning_rate": 0.00015745948342436422, + "loss": 1.02, + "step": 20113 + }, + { + "epoch": 0.5164711187714746, + "grad_norm": 0.8125, + "learning_rate": 0.00015745582969476355, + "loss": 0.917, + "step": 20114 + }, + { + "epoch": 0.5164967959673964, + "grad_norm": 0.78125, + "learning_rate": 0.00015745217585065864, + "loss": 0.7906, + "step": 20115 + }, + { + "epoch": 0.5165224731633182, + "grad_norm": 0.75, + "learning_rate": 0.00015744852189205667, + "loss": 0.9939, + "step": 20116 + }, + { + "epoch": 0.5165481503592401, + "grad_norm": 0.8515625, + "learning_rate": 0.00015744486781896506, + "loss": 0.9495, + "step": 20117 + }, + { + "epoch": 0.5165738275551618, + "grad_norm": 0.91796875, + "learning_rate": 0.00015744121363139098, + "loss": 0.9702, + "step": 20118 + }, + { + "epoch": 0.5165995047510836, + "grad_norm": 0.859375, + "learning_rate": 0.00015743755932934173, + "loss": 0.9105, + "step": 20119 + }, + { + "epoch": 0.5166251819470055, + "grad_norm": 0.7421875, + "learning_rate": 0.00015743390491282467, + "loss": 0.8946, + "step": 20120 + }, + { + "epoch": 0.5166508591429273, + "grad_norm": 0.80078125, + "learning_rate": 0.000157430250381847, + "loss": 0.9154, + "step": 20121 + }, + { + "epoch": 0.5166765363388491, + "grad_norm": 0.7578125, + "learning_rate": 0.00015742659573641602, + "loss": 0.8649, + "step": 20122 + }, + { + "epoch": 0.516702213534771, + "grad_norm": 0.8203125, + "learning_rate": 0.000157422940976539, + "loss": 0.8054, + "step": 20123 + }, + { + "epoch": 0.5167278907306928, + "grad_norm": 0.80078125, + "learning_rate": 0.00015741928610222327, + "loss": 0.9156, + "step": 20124 + }, + { + "epoch": 0.5167535679266145, + "grad_norm": 0.8125, + "learning_rate": 0.0001574156311134761, + "loss": 1.021, + "step": 20125 + }, + { + "epoch": 0.5167792451225364, + "grad_norm": 0.73046875, + "learning_rate": 0.00015741197601030476, + "loss": 0.844, + "step": 20126 + }, + { + "epoch": 0.5168049223184582, + "grad_norm": 0.796875, + "learning_rate": 0.00015740832079271652, + "loss": 0.9242, + "step": 20127 + }, + { + "epoch": 0.51683059951438, + "grad_norm": 0.75, + "learning_rate": 0.00015740466546071871, + "loss": 0.7885, + "step": 20128 + }, + { + "epoch": 0.5168562767103019, + "grad_norm": 0.78515625, + "learning_rate": 0.00015740101001431855, + "loss": 0.8565, + "step": 20129 + }, + { + "epoch": 0.5168819539062237, + "grad_norm": 0.74609375, + "learning_rate": 0.0001573973544535234, + "loss": 0.8753, + "step": 20130 + }, + { + "epoch": 0.5169076311021454, + "grad_norm": 0.83203125, + "learning_rate": 0.00015739369877834048, + "loss": 0.8234, + "step": 20131 + }, + { + "epoch": 0.5169333082980673, + "grad_norm": 0.765625, + "learning_rate": 0.0001573900429887771, + "loss": 0.912, + "step": 20132 + }, + { + "epoch": 0.5169589854939891, + "grad_norm": 0.76953125, + "learning_rate": 0.00015738638708484058, + "loss": 0.8715, + "step": 20133 + }, + { + "epoch": 0.516984662689911, + "grad_norm": 0.8203125, + "learning_rate": 0.00015738273106653816, + "loss": 0.8564, + "step": 20134 + }, + { + "epoch": 0.5170103398858328, + "grad_norm": 0.80078125, + "learning_rate": 0.00015737907493387714, + "loss": 0.9063, + "step": 20135 + }, + { + "epoch": 0.5170360170817546, + "grad_norm": 0.72265625, + "learning_rate": 0.0001573754186868648, + "loss": 0.8159, + "step": 20136 + }, + { + "epoch": 0.5170616942776765, + "grad_norm": 0.8203125, + "learning_rate": 0.00015737176232550842, + "loss": 0.9104, + "step": 20137 + }, + { + "epoch": 0.5170873714735982, + "grad_norm": 0.74609375, + "learning_rate": 0.0001573681058498153, + "loss": 0.9499, + "step": 20138 + }, + { + "epoch": 0.51711304866952, + "grad_norm": 0.7578125, + "learning_rate": 0.00015736444925979276, + "loss": 0.8648, + "step": 20139 + }, + { + "epoch": 0.5171387258654419, + "grad_norm": 0.796875, + "learning_rate": 0.00015736079255544805, + "loss": 0.8455, + "step": 20140 + }, + { + "epoch": 0.5171644030613637, + "grad_norm": 0.84375, + "learning_rate": 0.00015735713573678843, + "loss": 1.0449, + "step": 20141 + }, + { + "epoch": 0.5171900802572855, + "grad_norm": 0.73046875, + "learning_rate": 0.00015735347880382124, + "loss": 0.7938, + "step": 20142 + }, + { + "epoch": 0.5172157574532074, + "grad_norm": 0.77734375, + "learning_rate": 0.00015734982175655374, + "loss": 0.8745, + "step": 20143 + }, + { + "epoch": 0.5172414346491292, + "grad_norm": 0.69140625, + "learning_rate": 0.00015734616459499323, + "loss": 0.9084, + "step": 20144 + }, + { + "epoch": 0.5172671118450509, + "grad_norm": 0.75390625, + "learning_rate": 0.000157342507319147, + "loss": 0.8125, + "step": 20145 + }, + { + "epoch": 0.5172927890409728, + "grad_norm": 0.79296875, + "learning_rate": 0.00015733884992902235, + "loss": 0.8498, + "step": 20146 + }, + { + "epoch": 0.5173184662368946, + "grad_norm": 0.80078125, + "learning_rate": 0.0001573351924246265, + "loss": 0.9545, + "step": 20147 + }, + { + "epoch": 0.5173441434328164, + "grad_norm": 0.95703125, + "learning_rate": 0.00015733153480596682, + "loss": 0.8793, + "step": 20148 + }, + { + "epoch": 0.5173698206287383, + "grad_norm": 0.87109375, + "learning_rate": 0.0001573278770730506, + "loss": 0.9331, + "step": 20149 + }, + { + "epoch": 0.5173954978246601, + "grad_norm": 0.84765625, + "learning_rate": 0.00015732421922588503, + "loss": 0.8854, + "step": 20150 + }, + { + "epoch": 0.5174211750205818, + "grad_norm": 0.69921875, + "learning_rate": 0.0001573205612644775, + "loss": 0.7812, + "step": 20151 + }, + { + "epoch": 0.5174468522165037, + "grad_norm": 0.74609375, + "learning_rate": 0.0001573169031888353, + "loss": 0.7962, + "step": 20152 + }, + { + "epoch": 0.5174725294124255, + "grad_norm": 0.7265625, + "learning_rate": 0.00015731324499896568, + "loss": 0.8402, + "step": 20153 + }, + { + "epoch": 0.5174982066083473, + "grad_norm": 0.7265625, + "learning_rate": 0.00015730958669487593, + "loss": 0.7419, + "step": 20154 + }, + { + "epoch": 0.5175238838042692, + "grad_norm": 0.82421875, + "learning_rate": 0.00015730592827657334, + "loss": 0.933, + "step": 20155 + }, + { + "epoch": 0.517549561000191, + "grad_norm": 0.7734375, + "learning_rate": 0.0001573022697440652, + "loss": 0.8864, + "step": 20156 + }, + { + "epoch": 0.5175752381961128, + "grad_norm": 0.75390625, + "learning_rate": 0.00015729861109735883, + "loss": 0.8164, + "step": 20157 + }, + { + "epoch": 0.5176009153920346, + "grad_norm": 0.7734375, + "learning_rate": 0.00015729495233646152, + "loss": 0.791, + "step": 20158 + }, + { + "epoch": 0.5176265925879564, + "grad_norm": 0.73828125, + "learning_rate": 0.00015729129346138052, + "loss": 0.8996, + "step": 20159 + }, + { + "epoch": 0.5176522697838782, + "grad_norm": 0.81640625, + "learning_rate": 0.00015728763447212316, + "loss": 0.893, + "step": 20160 + }, + { + "epoch": 0.5176779469798001, + "grad_norm": 0.85546875, + "learning_rate": 0.00015728397536869673, + "loss": 0.928, + "step": 20161 + }, + { + "epoch": 0.5177036241757219, + "grad_norm": 0.75, + "learning_rate": 0.0001572803161511085, + "loss": 0.969, + "step": 20162 + }, + { + "epoch": 0.5177293013716437, + "grad_norm": 0.78515625, + "learning_rate": 0.00015727665681936577, + "loss": 0.9015, + "step": 20163 + }, + { + "epoch": 0.5177549785675656, + "grad_norm": 0.82421875, + "learning_rate": 0.00015727299737347582, + "loss": 0.8727, + "step": 20164 + }, + { + "epoch": 0.5177806557634873, + "grad_norm": 0.6875, + "learning_rate": 0.000157269337813446, + "loss": 0.8281, + "step": 20165 + }, + { + "epoch": 0.5178063329594091, + "grad_norm": 0.73828125, + "learning_rate": 0.00015726567813928352, + "loss": 0.8357, + "step": 20166 + }, + { + "epoch": 0.517832010155331, + "grad_norm": 0.72265625, + "learning_rate": 0.00015726201835099574, + "loss": 0.8497, + "step": 20167 + }, + { + "epoch": 0.5178576873512528, + "grad_norm": 0.76953125, + "learning_rate": 0.00015725835844858995, + "loss": 0.7898, + "step": 20168 + }, + { + "epoch": 0.5178833645471747, + "grad_norm": 0.84375, + "learning_rate": 0.0001572546984320734, + "loss": 0.9281, + "step": 20169 + }, + { + "epoch": 0.5179090417430965, + "grad_norm": 0.83984375, + "learning_rate": 0.0001572510383014534, + "loss": 0.9468, + "step": 20170 + }, + { + "epoch": 0.5179347189390182, + "grad_norm": 0.734375, + "learning_rate": 0.00015724737805673726, + "loss": 0.9246, + "step": 20171 + }, + { + "epoch": 0.51796039613494, + "grad_norm": 0.8203125, + "learning_rate": 0.00015724371769793228, + "loss": 0.8438, + "step": 20172 + }, + { + "epoch": 0.5179860733308619, + "grad_norm": 0.76953125, + "learning_rate": 0.00015724005722504573, + "loss": 0.826, + "step": 20173 + }, + { + "epoch": 0.5180117505267837, + "grad_norm": 0.79296875, + "learning_rate": 0.0001572363966380849, + "loss": 0.9039, + "step": 20174 + }, + { + "epoch": 0.5180374277227056, + "grad_norm": 0.8515625, + "learning_rate": 0.0001572327359370571, + "loss": 0.9236, + "step": 20175 + }, + { + "epoch": 0.5180631049186274, + "grad_norm": 0.76171875, + "learning_rate": 0.00015722907512196965, + "loss": 0.8816, + "step": 20176 + }, + { + "epoch": 0.5180887821145492, + "grad_norm": 0.70703125, + "learning_rate": 0.00015722541419282981, + "loss": 0.7409, + "step": 20177 + }, + { + "epoch": 0.518114459310471, + "grad_norm": 0.78125, + "learning_rate": 0.0001572217531496449, + "loss": 0.9864, + "step": 20178 + }, + { + "epoch": 0.5181401365063928, + "grad_norm": 0.82421875, + "learning_rate": 0.0001572180919924222, + "loss": 1.0238, + "step": 20179 + }, + { + "epoch": 0.5181658137023146, + "grad_norm": 0.78125, + "learning_rate": 0.00015721443072116905, + "loss": 1.0321, + "step": 20180 + }, + { + "epoch": 0.5181914908982365, + "grad_norm": 0.8046875, + "learning_rate": 0.00015721076933589265, + "loss": 0.8973, + "step": 20181 + }, + { + "epoch": 0.5182171680941583, + "grad_norm": 0.75390625, + "learning_rate": 0.00015720710783660038, + "loss": 1.0063, + "step": 20182 + }, + { + "epoch": 0.5182428452900801, + "grad_norm": 0.80859375, + "learning_rate": 0.0001572034462232995, + "loss": 0.8885, + "step": 20183 + }, + { + "epoch": 0.518268522486002, + "grad_norm": 0.78125, + "learning_rate": 0.00015719978449599734, + "loss": 0.8799, + "step": 20184 + }, + { + "epoch": 0.5182941996819237, + "grad_norm": 0.79296875, + "learning_rate": 0.00015719612265470114, + "loss": 0.8314, + "step": 20185 + }, + { + "epoch": 0.5183198768778455, + "grad_norm": 0.79296875, + "learning_rate": 0.00015719246069941827, + "loss": 0.7407, + "step": 20186 + }, + { + "epoch": 0.5183455540737674, + "grad_norm": 0.78125, + "learning_rate": 0.00015718879863015596, + "loss": 0.9374, + "step": 20187 + }, + { + "epoch": 0.5183712312696892, + "grad_norm": 0.75, + "learning_rate": 0.00015718513644692156, + "loss": 0.9025, + "step": 20188 + }, + { + "epoch": 0.518396908465611, + "grad_norm": 0.7734375, + "learning_rate": 0.00015718147414972236, + "loss": 1.0008, + "step": 20189 + }, + { + "epoch": 0.5184225856615329, + "grad_norm": 0.80859375, + "learning_rate": 0.0001571778117385656, + "loss": 0.8256, + "step": 20190 + }, + { + "epoch": 0.5184482628574546, + "grad_norm": 0.7265625, + "learning_rate": 0.00015717414921345867, + "loss": 0.8075, + "step": 20191 + }, + { + "epoch": 0.5184739400533764, + "grad_norm": 0.78515625, + "learning_rate": 0.00015717048657440884, + "loss": 0.8551, + "step": 20192 + }, + { + "epoch": 0.5184996172492983, + "grad_norm": 0.828125, + "learning_rate": 0.00015716682382142336, + "loss": 0.9445, + "step": 20193 + }, + { + "epoch": 0.5185252944452201, + "grad_norm": 0.875, + "learning_rate": 0.00015716316095450958, + "loss": 0.8542, + "step": 20194 + }, + { + "epoch": 0.518550971641142, + "grad_norm": 0.83984375, + "learning_rate": 0.00015715949797367475, + "loss": 0.8982, + "step": 20195 + }, + { + "epoch": 0.5185766488370638, + "grad_norm": 0.8203125, + "learning_rate": 0.00015715583487892624, + "loss": 0.968, + "step": 20196 + }, + { + "epoch": 0.5186023260329856, + "grad_norm": 0.73046875, + "learning_rate": 0.0001571521716702713, + "loss": 0.8468, + "step": 20197 + }, + { + "epoch": 0.5186280032289073, + "grad_norm": 0.80078125, + "learning_rate": 0.00015714850834771724, + "loss": 0.8826, + "step": 20198 + }, + { + "epoch": 0.5186536804248292, + "grad_norm": 0.8359375, + "learning_rate": 0.00015714484491127138, + "loss": 0.815, + "step": 20199 + }, + { + "epoch": 0.518679357620751, + "grad_norm": 0.8671875, + "learning_rate": 0.00015714118136094101, + "loss": 0.9325, + "step": 20200 + }, + { + "epoch": 0.5187050348166728, + "grad_norm": 0.80078125, + "learning_rate": 0.00015713751769673342, + "loss": 0.8783, + "step": 20201 + }, + { + "epoch": 0.5187307120125947, + "grad_norm": 0.75, + "learning_rate": 0.0001571338539186559, + "loss": 0.8668, + "step": 20202 + }, + { + "epoch": 0.5187563892085165, + "grad_norm": 0.77734375, + "learning_rate": 0.00015713019002671578, + "loss": 0.8132, + "step": 20203 + }, + { + "epoch": 0.5187820664044384, + "grad_norm": 0.73828125, + "learning_rate": 0.00015712652602092034, + "loss": 0.8158, + "step": 20204 + }, + { + "epoch": 0.5188077436003601, + "grad_norm": 0.7421875, + "learning_rate": 0.00015712286190127696, + "loss": 0.919, + "step": 20205 + }, + { + "epoch": 0.5188334207962819, + "grad_norm": 0.7421875, + "learning_rate": 0.0001571191976677928, + "loss": 0.8091, + "step": 20206 + }, + { + "epoch": 0.5188590979922038, + "grad_norm": 0.78515625, + "learning_rate": 0.00015711553332047528, + "loss": 0.8675, + "step": 20207 + }, + { + "epoch": 0.5188847751881256, + "grad_norm": 0.74609375, + "learning_rate": 0.00015711186885933166, + "loss": 0.8312, + "step": 20208 + }, + { + "epoch": 0.5189104523840474, + "grad_norm": 0.765625, + "learning_rate": 0.00015710820428436922, + "loss": 0.9805, + "step": 20209 + }, + { + "epoch": 0.5189361295799693, + "grad_norm": 0.75390625, + "learning_rate": 0.0001571045395955953, + "loss": 0.8851, + "step": 20210 + }, + { + "epoch": 0.518961806775891, + "grad_norm": 0.8046875, + "learning_rate": 0.00015710087479301716, + "loss": 0.8812, + "step": 20211 + }, + { + "epoch": 0.5189874839718128, + "grad_norm": 0.83984375, + "learning_rate": 0.0001570972098766422, + "loss": 0.8757, + "step": 20212 + }, + { + "epoch": 0.5190131611677347, + "grad_norm": 0.77734375, + "learning_rate": 0.0001570935448464776, + "loss": 0.9717, + "step": 20213 + }, + { + "epoch": 0.5190388383636565, + "grad_norm": 0.7578125, + "learning_rate": 0.00015708987970253075, + "loss": 0.8163, + "step": 20214 + }, + { + "epoch": 0.5190645155595783, + "grad_norm": 0.82421875, + "learning_rate": 0.0001570862144448089, + "loss": 0.7989, + "step": 20215 + }, + { + "epoch": 0.5190901927555002, + "grad_norm": 0.7734375, + "learning_rate": 0.00015708254907331943, + "loss": 0.8237, + "step": 20216 + }, + { + "epoch": 0.519115869951422, + "grad_norm": 0.84765625, + "learning_rate": 0.00015707888358806955, + "loss": 0.9468, + "step": 20217 + }, + { + "epoch": 0.5191415471473437, + "grad_norm": 0.83984375, + "learning_rate": 0.0001570752179890666, + "loss": 1.039, + "step": 20218 + }, + { + "epoch": 0.5191672243432656, + "grad_norm": 0.72265625, + "learning_rate": 0.00015707155227631797, + "loss": 0.7708, + "step": 20219 + }, + { + "epoch": 0.5191929015391874, + "grad_norm": 0.7421875, + "learning_rate": 0.00015706788644983085, + "loss": 0.8897, + "step": 20220 + }, + { + "epoch": 0.5192185787351092, + "grad_norm": 0.78125, + "learning_rate": 0.00015706422050961258, + "loss": 0.9475, + "step": 20221 + }, + { + "epoch": 0.5192442559310311, + "grad_norm": 0.796875, + "learning_rate": 0.00015706055445567047, + "loss": 0.873, + "step": 20222 + }, + { + "epoch": 0.5192699331269529, + "grad_norm": 0.77734375, + "learning_rate": 0.0001570568882880118, + "loss": 0.8928, + "step": 20223 + }, + { + "epoch": 0.5192956103228747, + "grad_norm": 0.7890625, + "learning_rate": 0.000157053222006644, + "loss": 0.8014, + "step": 20224 + }, + { + "epoch": 0.5193212875187965, + "grad_norm": 0.796875, + "learning_rate": 0.00015704955561157423, + "loss": 0.9949, + "step": 20225 + }, + { + "epoch": 0.5193469647147183, + "grad_norm": 0.76171875, + "learning_rate": 0.00015704588910280982, + "loss": 0.8386, + "step": 20226 + }, + { + "epoch": 0.5193726419106401, + "grad_norm": 0.7265625, + "learning_rate": 0.00015704222248035814, + "loss": 0.9489, + "step": 20227 + }, + { + "epoch": 0.519398319106562, + "grad_norm": 0.8125, + "learning_rate": 0.00015703855574422643, + "loss": 0.8478, + "step": 20228 + }, + { + "epoch": 0.5194239963024838, + "grad_norm": 0.80078125, + "learning_rate": 0.00015703488889442206, + "loss": 0.823, + "step": 20229 + }, + { + "epoch": 0.5194496734984057, + "grad_norm": 0.734375, + "learning_rate": 0.0001570312219309523, + "loss": 0.9402, + "step": 20230 + }, + { + "epoch": 0.5194753506943274, + "grad_norm": 0.8125, + "learning_rate": 0.00015702755485382447, + "loss": 0.973, + "step": 20231 + }, + { + "epoch": 0.5195010278902492, + "grad_norm": 0.79296875, + "learning_rate": 0.00015702388766304588, + "loss": 0.9533, + "step": 20232 + }, + { + "epoch": 0.519526705086171, + "grad_norm": 0.7265625, + "learning_rate": 0.00015702022035862383, + "loss": 0.8669, + "step": 20233 + }, + { + "epoch": 0.5195523822820929, + "grad_norm": 0.8125, + "learning_rate": 0.0001570165529405656, + "loss": 0.8627, + "step": 20234 + }, + { + "epoch": 0.5195780594780147, + "grad_norm": 1.640625, + "learning_rate": 0.00015701288540887856, + "loss": 0.9004, + "step": 20235 + }, + { + "epoch": 0.5196037366739366, + "grad_norm": 0.72265625, + "learning_rate": 0.00015700921776356998, + "loss": 0.8678, + "step": 20236 + }, + { + "epoch": 0.5196294138698584, + "grad_norm": 0.74609375, + "learning_rate": 0.00015700555000464718, + "loss": 0.8456, + "step": 20237 + }, + { + "epoch": 0.5196550910657801, + "grad_norm": 0.74609375, + "learning_rate": 0.00015700188213211747, + "loss": 0.8987, + "step": 20238 + }, + { + "epoch": 0.519680768261702, + "grad_norm": 0.76953125, + "learning_rate": 0.00015699821414598818, + "loss": 0.7991, + "step": 20239 + }, + { + "epoch": 0.5197064454576238, + "grad_norm": 0.7734375, + "learning_rate": 0.0001569945460462666, + "loss": 0.8355, + "step": 20240 + }, + { + "epoch": 0.5197321226535456, + "grad_norm": 0.7890625, + "learning_rate": 0.00015699087783296, + "loss": 0.9506, + "step": 20241 + }, + { + "epoch": 0.5197577998494675, + "grad_norm": 0.78125, + "learning_rate": 0.00015698720950607572, + "loss": 1.0322, + "step": 20242 + }, + { + "epoch": 0.5197834770453893, + "grad_norm": 0.7734375, + "learning_rate": 0.00015698354106562112, + "loss": 0.891, + "step": 20243 + }, + { + "epoch": 0.519809154241311, + "grad_norm": 0.703125, + "learning_rate": 0.00015697987251160346, + "loss": 0.9594, + "step": 20244 + }, + { + "epoch": 0.5198348314372329, + "grad_norm": 0.7734375, + "learning_rate": 0.00015697620384403006, + "loss": 0.9634, + "step": 20245 + }, + { + "epoch": 0.5198605086331547, + "grad_norm": 0.7890625, + "learning_rate": 0.00015697253506290822, + "loss": 0.9036, + "step": 20246 + }, + { + "epoch": 0.5198861858290765, + "grad_norm": 0.73046875, + "learning_rate": 0.00015696886616824526, + "loss": 0.8582, + "step": 20247 + }, + { + "epoch": 0.5199118630249984, + "grad_norm": 0.73046875, + "learning_rate": 0.00015696519716004853, + "loss": 0.7527, + "step": 20248 + }, + { + "epoch": 0.5199375402209202, + "grad_norm": 0.8125, + "learning_rate": 0.00015696152803832525, + "loss": 0.9035, + "step": 20249 + }, + { + "epoch": 0.519963217416842, + "grad_norm": 0.78125, + "learning_rate": 0.00015695785880308284, + "loss": 0.9034, + "step": 20250 + }, + { + "epoch": 0.5199888946127638, + "grad_norm": 0.8203125, + "learning_rate": 0.00015695418945432853, + "loss": 0.9321, + "step": 20251 + }, + { + "epoch": 0.5200145718086856, + "grad_norm": 0.79296875, + "learning_rate": 0.0001569505199920697, + "loss": 0.8686, + "step": 20252 + }, + { + "epoch": 0.5200402490046074, + "grad_norm": 0.75, + "learning_rate": 0.0001569468504163136, + "loss": 0.9879, + "step": 20253 + }, + { + "epoch": 0.5200659262005293, + "grad_norm": 0.7265625, + "learning_rate": 0.0001569431807270676, + "loss": 0.8697, + "step": 20254 + }, + { + "epoch": 0.5200916033964511, + "grad_norm": 0.75, + "learning_rate": 0.00015693951092433895, + "loss": 0.8345, + "step": 20255 + }, + { + "epoch": 0.5201172805923729, + "grad_norm": 0.765625, + "learning_rate": 0.00015693584100813501, + "loss": 0.8405, + "step": 20256 + }, + { + "epoch": 0.5201429577882948, + "grad_norm": 0.7734375, + "learning_rate": 0.00015693217097846309, + "loss": 0.8357, + "step": 20257 + }, + { + "epoch": 0.5201686349842165, + "grad_norm": 0.73828125, + "learning_rate": 0.0001569285008353305, + "loss": 0.824, + "step": 20258 + }, + { + "epoch": 0.5201943121801383, + "grad_norm": 0.75390625, + "learning_rate": 0.00015692483057874453, + "loss": 0.8921, + "step": 20259 + }, + { + "epoch": 0.5202199893760602, + "grad_norm": 0.79296875, + "learning_rate": 0.00015692116020871253, + "loss": 0.7858, + "step": 20260 + }, + { + "epoch": 0.520245666571982, + "grad_norm": 0.80078125, + "learning_rate": 0.0001569174897252418, + "loss": 0.8337, + "step": 20261 + }, + { + "epoch": 0.5202713437679038, + "grad_norm": 0.8046875, + "learning_rate": 0.00015691381912833965, + "loss": 0.9346, + "step": 20262 + }, + { + "epoch": 0.5202970209638257, + "grad_norm": 0.78515625, + "learning_rate": 0.00015691014841801338, + "loss": 0.9794, + "step": 20263 + }, + { + "epoch": 0.5203226981597474, + "grad_norm": 0.71875, + "learning_rate": 0.00015690647759427034, + "loss": 0.8148, + "step": 20264 + }, + { + "epoch": 0.5203483753556692, + "grad_norm": 0.82421875, + "learning_rate": 0.00015690280665711786, + "loss": 0.903, + "step": 20265 + }, + { + "epoch": 0.5203740525515911, + "grad_norm": 0.8046875, + "learning_rate": 0.0001568991356065632, + "loss": 0.9742, + "step": 20266 + }, + { + "epoch": 0.5203997297475129, + "grad_norm": 0.74609375, + "learning_rate": 0.0001568954644426137, + "loss": 0.8209, + "step": 20267 + }, + { + "epoch": 0.5204254069434348, + "grad_norm": 0.75, + "learning_rate": 0.0001568917931652767, + "loss": 0.8255, + "step": 20268 + }, + { + "epoch": 0.5204510841393566, + "grad_norm": 0.76953125, + "learning_rate": 0.00015688812177455945, + "loss": 0.8353, + "step": 20269 + }, + { + "epoch": 0.5204767613352784, + "grad_norm": 0.796875, + "learning_rate": 0.00015688445027046933, + "loss": 0.8008, + "step": 20270 + }, + { + "epoch": 0.5205024385312002, + "grad_norm": 0.80859375, + "learning_rate": 0.00015688077865301368, + "loss": 0.9582, + "step": 20271 + }, + { + "epoch": 0.520528115727122, + "grad_norm": 0.72265625, + "learning_rate": 0.00015687710692219973, + "loss": 0.922, + "step": 20272 + }, + { + "epoch": 0.5205537929230438, + "grad_norm": 0.86328125, + "learning_rate": 0.00015687343507803485, + "loss": 0.9121, + "step": 20273 + }, + { + "epoch": 0.5205794701189657, + "grad_norm": 0.82421875, + "learning_rate": 0.00015686976312052634, + "loss": 0.9418, + "step": 20274 + }, + { + "epoch": 0.5206051473148875, + "grad_norm": 0.80859375, + "learning_rate": 0.00015686609104968157, + "loss": 0.9184, + "step": 20275 + }, + { + "epoch": 0.5206308245108093, + "grad_norm": 0.7578125, + "learning_rate": 0.0001568624188655078, + "loss": 0.75, + "step": 20276 + }, + { + "epoch": 0.5206565017067312, + "grad_norm": 0.80078125, + "learning_rate": 0.00015685874656801233, + "loss": 0.9041, + "step": 20277 + }, + { + "epoch": 0.5206821789026529, + "grad_norm": 0.828125, + "learning_rate": 0.00015685507415720258, + "loss": 1.0034, + "step": 20278 + }, + { + "epoch": 0.5207078560985747, + "grad_norm": 0.671875, + "learning_rate": 0.00015685140163308576, + "loss": 0.667, + "step": 20279 + }, + { + "epoch": 0.5207335332944966, + "grad_norm": 0.7578125, + "learning_rate": 0.00015684772899566925, + "loss": 0.8409, + "step": 20280 + }, + { + "epoch": 0.5207592104904184, + "grad_norm": 0.76953125, + "learning_rate": 0.00015684405624496033, + "loss": 0.7806, + "step": 20281 + }, + { + "epoch": 0.5207848876863402, + "grad_norm": 0.72265625, + "learning_rate": 0.00015684038338096637, + "loss": 0.7641, + "step": 20282 + }, + { + "epoch": 0.5208105648822621, + "grad_norm": 0.734375, + "learning_rate": 0.0001568367104036946, + "loss": 0.9036, + "step": 20283 + }, + { + "epoch": 0.5208362420781838, + "grad_norm": 0.73828125, + "learning_rate": 0.00015683303731315248, + "loss": 0.8416, + "step": 20284 + }, + { + "epoch": 0.5208619192741056, + "grad_norm": 0.76953125, + "learning_rate": 0.0001568293641093472, + "loss": 0.9281, + "step": 20285 + }, + { + "epoch": 0.5208875964700275, + "grad_norm": 0.7421875, + "learning_rate": 0.00015682569079228614, + "loss": 0.9329, + "step": 20286 + }, + { + "epoch": 0.5209132736659493, + "grad_norm": 0.7109375, + "learning_rate": 0.0001568220173619766, + "loss": 0.9392, + "step": 20287 + }, + { + "epoch": 0.5209389508618711, + "grad_norm": 0.72265625, + "learning_rate": 0.00015681834381842596, + "loss": 0.8596, + "step": 20288 + }, + { + "epoch": 0.520964628057793, + "grad_norm": 0.7265625, + "learning_rate": 0.00015681467016164145, + "loss": 0.9865, + "step": 20289 + }, + { + "epoch": 0.5209903052537148, + "grad_norm": 0.78515625, + "learning_rate": 0.00015681099639163043, + "loss": 0.8352, + "step": 20290 + }, + { + "epoch": 0.5210159824496365, + "grad_norm": 0.69921875, + "learning_rate": 0.00015680732250840027, + "loss": 0.9093, + "step": 20291 + }, + { + "epoch": 0.5210416596455584, + "grad_norm": 0.796875, + "learning_rate": 0.0001568036485119582, + "loss": 0.8599, + "step": 20292 + }, + { + "epoch": 0.5210673368414802, + "grad_norm": 0.828125, + "learning_rate": 0.0001567999744023116, + "loss": 0.8282, + "step": 20293 + }, + { + "epoch": 0.521093014037402, + "grad_norm": 0.7890625, + "learning_rate": 0.0001567963001794678, + "loss": 0.8728, + "step": 20294 + }, + { + "epoch": 0.5211186912333239, + "grad_norm": 0.81640625, + "learning_rate": 0.00015679262584343407, + "loss": 0.9322, + "step": 20295 + }, + { + "epoch": 0.5211443684292457, + "grad_norm": 0.64453125, + "learning_rate": 0.00015678895139421782, + "loss": 0.7625, + "step": 20296 + }, + { + "epoch": 0.5211700456251676, + "grad_norm": 0.78125, + "learning_rate": 0.00015678527683182626, + "loss": 0.8079, + "step": 20297 + }, + { + "epoch": 0.5211957228210893, + "grad_norm": 0.78515625, + "learning_rate": 0.0001567816021562668, + "loss": 0.9018, + "step": 20298 + }, + { + "epoch": 0.5212214000170111, + "grad_norm": 0.8984375, + "learning_rate": 0.00015677792736754675, + "loss": 0.8657, + "step": 20299 + }, + { + "epoch": 0.521247077212933, + "grad_norm": 0.74609375, + "learning_rate": 0.0001567742524656734, + "loss": 0.928, + "step": 20300 + }, + { + "epoch": 0.5212727544088548, + "grad_norm": 0.74609375, + "learning_rate": 0.0001567705774506541, + "loss": 0.8514, + "step": 20301 + }, + { + "epoch": 0.5212984316047766, + "grad_norm": 0.83984375, + "learning_rate": 0.00015676690232249618, + "loss": 0.8907, + "step": 20302 + }, + { + "epoch": 0.5213241088006985, + "grad_norm": 0.76171875, + "learning_rate": 0.00015676322708120695, + "loss": 0.9035, + "step": 20303 + }, + { + "epoch": 0.5213497859966202, + "grad_norm": 0.76953125, + "learning_rate": 0.00015675955172679372, + "loss": 0.7971, + "step": 20304 + }, + { + "epoch": 0.521375463192542, + "grad_norm": 0.78515625, + "learning_rate": 0.00015675587625926384, + "loss": 0.8111, + "step": 20305 + }, + { + "epoch": 0.5214011403884639, + "grad_norm": 0.76171875, + "learning_rate": 0.00015675220067862462, + "loss": 0.7912, + "step": 20306 + }, + { + "epoch": 0.5214268175843857, + "grad_norm": 0.75, + "learning_rate": 0.00015674852498488338, + "loss": 0.906, + "step": 20307 + }, + { + "epoch": 0.5214524947803075, + "grad_norm": 0.828125, + "learning_rate": 0.0001567448491780475, + "loss": 0.8286, + "step": 20308 + }, + { + "epoch": 0.5214781719762294, + "grad_norm": 0.79296875, + "learning_rate": 0.0001567411732581242, + "loss": 0.7992, + "step": 20309 + }, + { + "epoch": 0.5215038491721512, + "grad_norm": 0.74609375, + "learning_rate": 0.00015673749722512089, + "loss": 0.8427, + "step": 20310 + }, + { + "epoch": 0.5215295263680729, + "grad_norm": 0.734375, + "learning_rate": 0.00015673382107904492, + "loss": 0.886, + "step": 20311 + }, + { + "epoch": 0.5215552035639948, + "grad_norm": 0.85546875, + "learning_rate": 0.0001567301448199035, + "loss": 0.9326, + "step": 20312 + }, + { + "epoch": 0.5215808807599166, + "grad_norm": 0.76171875, + "learning_rate": 0.00015672646844770407, + "loss": 0.7416, + "step": 20313 + }, + { + "epoch": 0.5216065579558384, + "grad_norm": 0.8203125, + "learning_rate": 0.00015672279196245388, + "loss": 0.8739, + "step": 20314 + }, + { + "epoch": 0.5216322351517603, + "grad_norm": 0.86328125, + "learning_rate": 0.0001567191153641603, + "loss": 0.9616, + "step": 20315 + }, + { + "epoch": 0.5216579123476821, + "grad_norm": 0.74609375, + "learning_rate": 0.00015671543865283065, + "loss": 0.8971, + "step": 20316 + }, + { + "epoch": 0.5216835895436039, + "grad_norm": 0.77734375, + "learning_rate": 0.00015671176182847227, + "loss": 0.9812, + "step": 20317 + }, + { + "epoch": 0.5217092667395257, + "grad_norm": 0.84375, + "learning_rate": 0.00015670808489109245, + "loss": 0.8926, + "step": 20318 + }, + { + "epoch": 0.5217349439354475, + "grad_norm": 0.73828125, + "learning_rate": 0.00015670440784069855, + "loss": 0.9117, + "step": 20319 + }, + { + "epoch": 0.5217606211313693, + "grad_norm": 0.8046875, + "learning_rate": 0.00015670073067729787, + "loss": 0.9548, + "step": 20320 + }, + { + "epoch": 0.5217862983272912, + "grad_norm": 0.734375, + "learning_rate": 0.00015669705340089777, + "loss": 0.8618, + "step": 20321 + }, + { + "epoch": 0.521811975523213, + "grad_norm": 0.78515625, + "learning_rate": 0.00015669337601150555, + "loss": 0.9302, + "step": 20322 + }, + { + "epoch": 0.5218376527191348, + "grad_norm": 0.83984375, + "learning_rate": 0.00015668969850912858, + "loss": 0.892, + "step": 20323 + }, + { + "epoch": 0.5218633299150566, + "grad_norm": 0.98828125, + "learning_rate": 0.00015668602089377416, + "loss": 0.8847, + "step": 20324 + }, + { + "epoch": 0.5218890071109784, + "grad_norm": 0.81640625, + "learning_rate": 0.00015668234316544962, + "loss": 0.867, + "step": 20325 + }, + { + "epoch": 0.5219146843069002, + "grad_norm": 0.81640625, + "learning_rate": 0.00015667866532416227, + "loss": 0.8708, + "step": 20326 + }, + { + "epoch": 0.5219403615028221, + "grad_norm": 0.8359375, + "learning_rate": 0.00015667498736991945, + "loss": 0.8787, + "step": 20327 + }, + { + "epoch": 0.5219660386987439, + "grad_norm": 0.82421875, + "learning_rate": 0.00015667130930272852, + "loss": 0.9834, + "step": 20328 + }, + { + "epoch": 0.5219917158946658, + "grad_norm": 0.84375, + "learning_rate": 0.0001566676311225968, + "loss": 0.9068, + "step": 20329 + }, + { + "epoch": 0.5220173930905876, + "grad_norm": 0.71875, + "learning_rate": 0.00015666395282953158, + "loss": 0.9399, + "step": 20330 + }, + { + "epoch": 0.5220430702865093, + "grad_norm": 0.78125, + "learning_rate": 0.00015666027442354024, + "loss": 0.794, + "step": 20331 + }, + { + "epoch": 0.5220687474824312, + "grad_norm": 0.78125, + "learning_rate": 0.00015665659590463008, + "loss": 1.0526, + "step": 20332 + }, + { + "epoch": 0.522094424678353, + "grad_norm": 0.76171875, + "learning_rate": 0.00015665291727280848, + "loss": 0.8838, + "step": 20333 + }, + { + "epoch": 0.5221201018742748, + "grad_norm": 0.8203125, + "learning_rate": 0.0001566492385280827, + "loss": 0.8547, + "step": 20334 + }, + { + "epoch": 0.5221457790701967, + "grad_norm": 0.7890625, + "learning_rate": 0.0001566455596704601, + "loss": 0.7578, + "step": 20335 + }, + { + "epoch": 0.5221714562661185, + "grad_norm": 0.77734375, + "learning_rate": 0.00015664188069994803, + "loss": 0.756, + "step": 20336 + }, + { + "epoch": 0.5221971334620403, + "grad_norm": 0.84765625, + "learning_rate": 0.00015663820161655382, + "loss": 0.9303, + "step": 20337 + }, + { + "epoch": 0.522222810657962, + "grad_norm": 0.80078125, + "learning_rate": 0.00015663452242028474, + "loss": 0.9976, + "step": 20338 + }, + { + "epoch": 0.5222484878538839, + "grad_norm": 0.890625, + "learning_rate": 0.00015663084311114825, + "loss": 0.9562, + "step": 20339 + }, + { + "epoch": 0.5222741650498057, + "grad_norm": 0.78515625, + "learning_rate": 0.00015662716368915154, + "loss": 0.9427, + "step": 20340 + }, + { + "epoch": 0.5222998422457276, + "grad_norm": 0.75390625, + "learning_rate": 0.00015662348415430204, + "loss": 0.9547, + "step": 20341 + }, + { + "epoch": 0.5223255194416494, + "grad_norm": 0.78125, + "learning_rate": 0.00015661980450660702, + "loss": 1.0467, + "step": 20342 + }, + { + "epoch": 0.5223511966375712, + "grad_norm": 0.703125, + "learning_rate": 0.00015661612474607387, + "loss": 0.8308, + "step": 20343 + }, + { + "epoch": 0.522376873833493, + "grad_norm": 0.75390625, + "learning_rate": 0.0001566124448727099, + "loss": 0.8749, + "step": 20344 + }, + { + "epoch": 0.5224025510294148, + "grad_norm": 0.7890625, + "learning_rate": 0.00015660876488652242, + "loss": 0.8116, + "step": 20345 + }, + { + "epoch": 0.5224282282253366, + "grad_norm": 0.79296875, + "learning_rate": 0.0001566050847875188, + "loss": 0.9194, + "step": 20346 + }, + { + "epoch": 0.5224539054212585, + "grad_norm": 0.71875, + "learning_rate": 0.00015660140457570635, + "loss": 0.925, + "step": 20347 + }, + { + "epoch": 0.5224795826171803, + "grad_norm": 0.7421875, + "learning_rate": 0.00015659772425109244, + "loss": 0.9872, + "step": 20348 + }, + { + "epoch": 0.5225052598131021, + "grad_norm": 0.72265625, + "learning_rate": 0.00015659404381368437, + "loss": 0.8331, + "step": 20349 + }, + { + "epoch": 0.522530937009024, + "grad_norm": 0.75, + "learning_rate": 0.00015659036326348948, + "loss": 0.8779, + "step": 20350 + }, + { + "epoch": 0.5225566142049457, + "grad_norm": 0.77734375, + "learning_rate": 0.00015658668260051507, + "loss": 0.8653, + "step": 20351 + }, + { + "epoch": 0.5225822914008675, + "grad_norm": 0.8203125, + "learning_rate": 0.00015658300182476854, + "loss": 0.7875, + "step": 20352 + }, + { + "epoch": 0.5226079685967894, + "grad_norm": 0.73828125, + "learning_rate": 0.00015657932093625723, + "loss": 0.9034, + "step": 20353 + }, + { + "epoch": 0.5226336457927112, + "grad_norm": 0.8046875, + "learning_rate": 0.0001565756399349884, + "loss": 0.8385, + "step": 20354 + }, + { + "epoch": 0.522659322988633, + "grad_norm": 0.80078125, + "learning_rate": 0.00015657195882096944, + "loss": 0.8549, + "step": 20355 + }, + { + "epoch": 0.5226850001845549, + "grad_norm": 0.97265625, + "learning_rate": 0.00015656827759420768, + "loss": 0.7833, + "step": 20356 + }, + { + "epoch": 0.5227106773804767, + "grad_norm": 0.78515625, + "learning_rate": 0.00015656459625471047, + "loss": 0.8219, + "step": 20357 + }, + { + "epoch": 0.5227363545763984, + "grad_norm": 0.765625, + "learning_rate": 0.00015656091480248513, + "loss": 0.8169, + "step": 20358 + }, + { + "epoch": 0.5227620317723203, + "grad_norm": 0.79296875, + "learning_rate": 0.00015655723323753894, + "loss": 0.8923, + "step": 20359 + }, + { + "epoch": 0.5227877089682421, + "grad_norm": 0.78125, + "learning_rate": 0.00015655355155987934, + "loss": 0.8421, + "step": 20360 + }, + { + "epoch": 0.522813386164164, + "grad_norm": 0.7578125, + "learning_rate": 0.00015654986976951362, + "loss": 0.9042, + "step": 20361 + }, + { + "epoch": 0.5228390633600858, + "grad_norm": 0.73046875, + "learning_rate": 0.0001565461878664491, + "loss": 0.8494, + "step": 20362 + }, + { + "epoch": 0.5228647405560076, + "grad_norm": 0.82421875, + "learning_rate": 0.00015654250585069317, + "loss": 0.8955, + "step": 20363 + }, + { + "epoch": 0.5228904177519293, + "grad_norm": 0.7578125, + "learning_rate": 0.0001565388237222531, + "loss": 0.7902, + "step": 20364 + }, + { + "epoch": 0.5229160949478512, + "grad_norm": 0.83984375, + "learning_rate": 0.0001565351414811363, + "loss": 0.885, + "step": 20365 + }, + { + "epoch": 0.522941772143773, + "grad_norm": 0.81640625, + "learning_rate": 0.00015653145912735, + "loss": 0.91, + "step": 20366 + }, + { + "epoch": 0.5229674493396949, + "grad_norm": 0.7734375, + "learning_rate": 0.00015652777666090163, + "loss": 0.92, + "step": 20367 + }, + { + "epoch": 0.5229931265356167, + "grad_norm": 0.82421875, + "learning_rate": 0.00015652409408179857, + "loss": 0.7775, + "step": 20368 + }, + { + "epoch": 0.5230188037315385, + "grad_norm": 0.78125, + "learning_rate": 0.00015652041139004803, + "loss": 0.8368, + "step": 20369 + }, + { + "epoch": 0.5230444809274604, + "grad_norm": 0.8359375, + "learning_rate": 0.00015651672858565745, + "loss": 0.9827, + "step": 20370 + }, + { + "epoch": 0.5230701581233821, + "grad_norm": 0.7421875, + "learning_rate": 0.00015651304566863413, + "loss": 0.9524, + "step": 20371 + }, + { + "epoch": 0.5230958353193039, + "grad_norm": 0.8515625, + "learning_rate": 0.0001565093626389854, + "loss": 0.8939, + "step": 20372 + }, + { + "epoch": 0.5231215125152258, + "grad_norm": 0.84375, + "learning_rate": 0.0001565056794967186, + "loss": 0.9485, + "step": 20373 + }, + { + "epoch": 0.5231471897111476, + "grad_norm": 0.82421875, + "learning_rate": 0.0001565019962418411, + "loss": 0.8663, + "step": 20374 + }, + { + "epoch": 0.5231728669070694, + "grad_norm": 0.78515625, + "learning_rate": 0.00015649831287436024, + "loss": 0.8919, + "step": 20375 + }, + { + "epoch": 0.5231985441029913, + "grad_norm": 0.6640625, + "learning_rate": 0.00015649462939428338, + "loss": 0.721, + "step": 20376 + }, + { + "epoch": 0.5232242212989131, + "grad_norm": 0.71875, + "learning_rate": 0.00015649094580161776, + "loss": 0.849, + "step": 20377 + }, + { + "epoch": 0.5232498984948348, + "grad_norm": 0.765625, + "learning_rate": 0.0001564872620963708, + "loss": 0.7365, + "step": 20378 + }, + { + "epoch": 0.5232755756907567, + "grad_norm": 0.80078125, + "learning_rate": 0.00015648357827854984, + "loss": 0.991, + "step": 20379 + }, + { + "epoch": 0.5233012528866785, + "grad_norm": 0.68359375, + "learning_rate": 0.0001564798943481622, + "loss": 0.7399, + "step": 20380 + }, + { + "epoch": 0.5233269300826003, + "grad_norm": 0.76953125, + "learning_rate": 0.00015647621030521526, + "loss": 0.8614, + "step": 20381 + }, + { + "epoch": 0.5233526072785222, + "grad_norm": 0.81640625, + "learning_rate": 0.00015647252614971633, + "loss": 0.7559, + "step": 20382 + }, + { + "epoch": 0.523378284474444, + "grad_norm": 0.76953125, + "learning_rate": 0.00015646884188167274, + "loss": 0.7664, + "step": 20383 + }, + { + "epoch": 0.5234039616703657, + "grad_norm": 0.828125, + "learning_rate": 0.00015646515750109184, + "loss": 0.9096, + "step": 20384 + }, + { + "epoch": 0.5234296388662876, + "grad_norm": 0.79296875, + "learning_rate": 0.00015646147300798103, + "loss": 0.9321, + "step": 20385 + }, + { + "epoch": 0.5234553160622094, + "grad_norm": 0.76953125, + "learning_rate": 0.00015645778840234757, + "loss": 0.8435, + "step": 20386 + }, + { + "epoch": 0.5234809932581312, + "grad_norm": 0.76953125, + "learning_rate": 0.0001564541036841988, + "loss": 0.8107, + "step": 20387 + }, + { + "epoch": 0.5235066704540531, + "grad_norm": 0.74609375, + "learning_rate": 0.00015645041885354214, + "loss": 0.8778, + "step": 20388 + }, + { + "epoch": 0.5235323476499749, + "grad_norm": 0.703125, + "learning_rate": 0.0001564467339103849, + "loss": 0.8663, + "step": 20389 + }, + { + "epoch": 0.5235580248458968, + "grad_norm": 0.7734375, + "learning_rate": 0.00015644304885473445, + "loss": 0.8578, + "step": 20390 + }, + { + "epoch": 0.5235837020418185, + "grad_norm": 0.796875, + "learning_rate": 0.00015643936368659803, + "loss": 0.7862, + "step": 20391 + }, + { + "epoch": 0.5236093792377403, + "grad_norm": 0.74609375, + "learning_rate": 0.0001564356784059831, + "loss": 0.8723, + "step": 20392 + }, + { + "epoch": 0.5236350564336621, + "grad_norm": 0.83984375, + "learning_rate": 0.00015643199301289696, + "loss": 0.9383, + "step": 20393 + }, + { + "epoch": 0.523660733629584, + "grad_norm": 0.8203125, + "learning_rate": 0.00015642830750734694, + "loss": 0.9543, + "step": 20394 + }, + { + "epoch": 0.5236864108255058, + "grad_norm": 0.7265625, + "learning_rate": 0.0001564246218893404, + "loss": 0.8507, + "step": 20395 + }, + { + "epoch": 0.5237120880214277, + "grad_norm": 0.73828125, + "learning_rate": 0.0001564209361588847, + "loss": 0.8017, + "step": 20396 + }, + { + "epoch": 0.5237377652173495, + "grad_norm": 1.6796875, + "learning_rate": 0.00015641725031598718, + "loss": 0.9575, + "step": 20397 + }, + { + "epoch": 0.5237634424132712, + "grad_norm": 0.75, + "learning_rate": 0.00015641356436065515, + "loss": 0.9777, + "step": 20398 + }, + { + "epoch": 0.523789119609193, + "grad_norm": 0.7578125, + "learning_rate": 0.000156409878292896, + "loss": 0.9351, + "step": 20399 + }, + { + "epoch": 0.5238147968051149, + "grad_norm": 0.74609375, + "learning_rate": 0.00015640619211271703, + "loss": 0.9154, + "step": 20400 + }, + { + "epoch": 0.5238404740010367, + "grad_norm": 0.78515625, + "learning_rate": 0.00015640250582012562, + "loss": 0.9799, + "step": 20401 + }, + { + "epoch": 0.5238661511969586, + "grad_norm": 0.71875, + "learning_rate": 0.00015639881941512912, + "loss": 0.7992, + "step": 20402 + }, + { + "epoch": 0.5238918283928804, + "grad_norm": 0.75, + "learning_rate": 0.00015639513289773487, + "loss": 0.8379, + "step": 20403 + }, + { + "epoch": 0.5239175055888021, + "grad_norm": 0.80078125, + "learning_rate": 0.00015639144626795025, + "loss": 0.8761, + "step": 20404 + }, + { + "epoch": 0.523943182784724, + "grad_norm": 0.75, + "learning_rate": 0.0001563877595257825, + "loss": 1.0213, + "step": 20405 + }, + { + "epoch": 0.5239688599806458, + "grad_norm": 0.75390625, + "learning_rate": 0.00015638407267123912, + "loss": 0.9169, + "step": 20406 + }, + { + "epoch": 0.5239945371765676, + "grad_norm": 0.7578125, + "learning_rate": 0.0001563803857043273, + "loss": 0.7947, + "step": 20407 + }, + { + "epoch": 0.5240202143724895, + "grad_norm": 0.71875, + "learning_rate": 0.00015637669862505452, + "loss": 0.769, + "step": 20408 + }, + { + "epoch": 0.5240458915684113, + "grad_norm": 1.046875, + "learning_rate": 0.00015637301143342803, + "loss": 0.8448, + "step": 20409 + }, + { + "epoch": 0.5240715687643331, + "grad_norm": 0.73828125, + "learning_rate": 0.00015636932412945527, + "loss": 0.8536, + "step": 20410 + }, + { + "epoch": 0.5240972459602549, + "grad_norm": 0.7421875, + "learning_rate": 0.00015636563671314347, + "loss": 0.9717, + "step": 20411 + }, + { + "epoch": 0.5241229231561767, + "grad_norm": 0.765625, + "learning_rate": 0.00015636194918450008, + "loss": 0.7997, + "step": 20412 + }, + { + "epoch": 0.5241486003520985, + "grad_norm": 0.89453125, + "learning_rate": 0.00015635826154353243, + "loss": 0.8922, + "step": 20413 + }, + { + "epoch": 0.5241742775480204, + "grad_norm": 0.76171875, + "learning_rate": 0.00015635457379024786, + "loss": 0.8976, + "step": 20414 + }, + { + "epoch": 0.5241999547439422, + "grad_norm": 0.75390625, + "learning_rate": 0.00015635088592465368, + "loss": 0.9055, + "step": 20415 + }, + { + "epoch": 0.524225631939864, + "grad_norm": 0.7421875, + "learning_rate": 0.0001563471979467573, + "loss": 0.8003, + "step": 20416 + }, + { + "epoch": 0.5242513091357859, + "grad_norm": 0.69140625, + "learning_rate": 0.00015634350985656604, + "loss": 0.9256, + "step": 20417 + }, + { + "epoch": 0.5242769863317076, + "grad_norm": 0.859375, + "learning_rate": 0.00015633982165408727, + "loss": 0.9096, + "step": 20418 + }, + { + "epoch": 0.5243026635276294, + "grad_norm": 0.78515625, + "learning_rate": 0.0001563361333393283, + "loss": 0.9504, + "step": 20419 + }, + { + "epoch": 0.5243283407235513, + "grad_norm": 0.75, + "learning_rate": 0.00015633244491229653, + "loss": 0.7941, + "step": 20420 + }, + { + "epoch": 0.5243540179194731, + "grad_norm": 0.87890625, + "learning_rate": 0.00015632875637299927, + "loss": 0.852, + "step": 20421 + }, + { + "epoch": 0.524379695115395, + "grad_norm": 0.77734375, + "learning_rate": 0.0001563250677214439, + "loss": 0.8729, + "step": 20422 + }, + { + "epoch": 0.5244053723113168, + "grad_norm": 0.79296875, + "learning_rate": 0.00015632137895763774, + "loss": 0.9478, + "step": 20423 + }, + { + "epoch": 0.5244310495072385, + "grad_norm": 0.70703125, + "learning_rate": 0.00015631769008158815, + "loss": 0.8484, + "step": 20424 + }, + { + "epoch": 0.5244567267031603, + "grad_norm": 0.8203125, + "learning_rate": 0.00015631400109330249, + "loss": 0.7969, + "step": 20425 + }, + { + "epoch": 0.5244824038990822, + "grad_norm": 0.75390625, + "learning_rate": 0.00015631031199278813, + "loss": 1.0175, + "step": 20426 + }, + { + "epoch": 0.524508081095004, + "grad_norm": 0.7265625, + "learning_rate": 0.00015630662278005243, + "loss": 0.8882, + "step": 20427 + }, + { + "epoch": 0.5245337582909259, + "grad_norm": 0.82421875, + "learning_rate": 0.00015630293345510267, + "loss": 0.9324, + "step": 20428 + }, + { + "epoch": 0.5245594354868477, + "grad_norm": 0.80078125, + "learning_rate": 0.00015629924401794627, + "loss": 0.9376, + "step": 20429 + }, + { + "epoch": 0.5245851126827695, + "grad_norm": 0.6953125, + "learning_rate": 0.0001562955544685906, + "loss": 0.8679, + "step": 20430 + }, + { + "epoch": 0.5246107898786913, + "grad_norm": 0.83984375, + "learning_rate": 0.00015629186480704292, + "loss": 1.0354, + "step": 20431 + }, + { + "epoch": 0.5246364670746131, + "grad_norm": 0.76953125, + "learning_rate": 0.00015628817503331063, + "loss": 0.8612, + "step": 20432 + }, + { + "epoch": 0.5246621442705349, + "grad_norm": 0.8125, + "learning_rate": 0.00015628448514740112, + "loss": 0.9461, + "step": 20433 + }, + { + "epoch": 0.5246878214664568, + "grad_norm": 0.70703125, + "learning_rate": 0.0001562807951493217, + "loss": 0.8731, + "step": 20434 + }, + { + "epoch": 0.5247134986623786, + "grad_norm": 0.81640625, + "learning_rate": 0.00015627710503907975, + "loss": 0.9043, + "step": 20435 + }, + { + "epoch": 0.5247391758583004, + "grad_norm": 0.75, + "learning_rate": 0.00015627341481668262, + "loss": 0.9026, + "step": 20436 + }, + { + "epoch": 0.5247648530542223, + "grad_norm": 0.7265625, + "learning_rate": 0.00015626972448213767, + "loss": 0.9326, + "step": 20437 + }, + { + "epoch": 0.524790530250144, + "grad_norm": 0.76953125, + "learning_rate": 0.0001562660340354522, + "loss": 0.8555, + "step": 20438 + }, + { + "epoch": 0.5248162074460658, + "grad_norm": 0.796875, + "learning_rate": 0.00015626234347663364, + "loss": 0.8956, + "step": 20439 + }, + { + "epoch": 0.5248418846419877, + "grad_norm": 0.78125, + "learning_rate": 0.0001562586528056893, + "loss": 0.9303, + "step": 20440 + }, + { + "epoch": 0.5248675618379095, + "grad_norm": 0.81640625, + "learning_rate": 0.00015625496202262656, + "loss": 0.8052, + "step": 20441 + }, + { + "epoch": 0.5248932390338313, + "grad_norm": 0.73828125, + "learning_rate": 0.00015625127112745276, + "loss": 0.8358, + "step": 20442 + }, + { + "epoch": 0.5249189162297532, + "grad_norm": 0.79296875, + "learning_rate": 0.00015624758012017522, + "loss": 1.0631, + "step": 20443 + }, + { + "epoch": 0.5249445934256749, + "grad_norm": 0.71875, + "learning_rate": 0.0001562438890008014, + "loss": 0.8641, + "step": 20444 + }, + { + "epoch": 0.5249702706215967, + "grad_norm": 0.86328125, + "learning_rate": 0.0001562401977693385, + "loss": 0.8682, + "step": 20445 + }, + { + "epoch": 0.5249959478175186, + "grad_norm": 0.7421875, + "learning_rate": 0.00015623650642579404, + "loss": 0.8067, + "step": 20446 + }, + { + "epoch": 0.5250216250134404, + "grad_norm": 0.76953125, + "learning_rate": 0.00015623281497017527, + "loss": 0.8244, + "step": 20447 + }, + { + "epoch": 0.5250473022093622, + "grad_norm": 0.765625, + "learning_rate": 0.0001562291234024896, + "loss": 0.9431, + "step": 20448 + }, + { + "epoch": 0.5250729794052841, + "grad_norm": 0.94921875, + "learning_rate": 0.00015622543172274435, + "loss": 0.9424, + "step": 20449 + }, + { + "epoch": 0.5250986566012059, + "grad_norm": 0.98046875, + "learning_rate": 0.0001562217399309469, + "loss": 0.8814, + "step": 20450 + }, + { + "epoch": 0.5251243337971276, + "grad_norm": 0.8125, + "learning_rate": 0.0001562180480271046, + "loss": 0.9735, + "step": 20451 + }, + { + "epoch": 0.5251500109930495, + "grad_norm": 0.7265625, + "learning_rate": 0.0001562143560112248, + "loss": 0.8643, + "step": 20452 + }, + { + "epoch": 0.5251756881889713, + "grad_norm": 0.765625, + "learning_rate": 0.00015621066388331488, + "loss": 0.9995, + "step": 20453 + }, + { + "epoch": 0.5252013653848931, + "grad_norm": 0.76953125, + "learning_rate": 0.0001562069716433822, + "loss": 0.7787, + "step": 20454 + }, + { + "epoch": 0.525227042580815, + "grad_norm": 0.74609375, + "learning_rate": 0.00015620327929143408, + "loss": 0.8764, + "step": 20455 + }, + { + "epoch": 0.5252527197767368, + "grad_norm": 0.7890625, + "learning_rate": 0.00015619958682747787, + "loss": 0.8747, + "step": 20456 + }, + { + "epoch": 0.5252783969726585, + "grad_norm": 0.82421875, + "learning_rate": 0.000156195894251521, + "loss": 0.9965, + "step": 20457 + }, + { + "epoch": 0.5253040741685804, + "grad_norm": 0.7578125, + "learning_rate": 0.0001561922015635708, + "loss": 0.8327, + "step": 20458 + }, + { + "epoch": 0.5253297513645022, + "grad_norm": 0.89453125, + "learning_rate": 0.0001561885087636346, + "loss": 0.7641, + "step": 20459 + }, + { + "epoch": 0.525355428560424, + "grad_norm": 0.75390625, + "learning_rate": 0.00015618481585171975, + "loss": 0.8023, + "step": 20460 + }, + { + "epoch": 0.5253811057563459, + "grad_norm": 0.90625, + "learning_rate": 0.00015618112282783368, + "loss": 0.9725, + "step": 20461 + }, + { + "epoch": 0.5254067829522677, + "grad_norm": 0.75, + "learning_rate": 0.0001561774296919837, + "loss": 0.8489, + "step": 20462 + }, + { + "epoch": 0.5254324601481896, + "grad_norm": 0.77734375, + "learning_rate": 0.00015617373644417716, + "loss": 0.8554, + "step": 20463 + }, + { + "epoch": 0.5254581373441113, + "grad_norm": 0.76171875, + "learning_rate": 0.00015617004308442144, + "loss": 0.8971, + "step": 20464 + }, + { + "epoch": 0.5254838145400331, + "grad_norm": 0.80859375, + "learning_rate": 0.00015616634961272393, + "loss": 0.964, + "step": 20465 + }, + { + "epoch": 0.525509491735955, + "grad_norm": 0.86328125, + "learning_rate": 0.0001561626560290919, + "loss": 0.991, + "step": 20466 + }, + { + "epoch": 0.5255351689318768, + "grad_norm": 0.765625, + "learning_rate": 0.0001561589623335328, + "loss": 0.839, + "step": 20467 + }, + { + "epoch": 0.5255608461277986, + "grad_norm": 0.8203125, + "learning_rate": 0.00015615526852605398, + "loss": 0.859, + "step": 20468 + }, + { + "epoch": 0.5255865233237205, + "grad_norm": 0.8046875, + "learning_rate": 0.00015615157460666278, + "loss": 0.9266, + "step": 20469 + }, + { + "epoch": 0.5256122005196423, + "grad_norm": 0.88671875, + "learning_rate": 0.00015614788057536654, + "loss": 0.8231, + "step": 20470 + }, + { + "epoch": 0.525637877715564, + "grad_norm": 0.77734375, + "learning_rate": 0.00015614418643217267, + "loss": 0.8062, + "step": 20471 + }, + { + "epoch": 0.5256635549114859, + "grad_norm": 0.83203125, + "learning_rate": 0.00015614049217708845, + "loss": 0.8808, + "step": 20472 + }, + { + "epoch": 0.5256892321074077, + "grad_norm": 1.3046875, + "learning_rate": 0.00015613679781012135, + "loss": 1.0441, + "step": 20473 + }, + { + "epoch": 0.5257149093033295, + "grad_norm": 0.79296875, + "learning_rate": 0.00015613310333127868, + "loss": 0.8209, + "step": 20474 + }, + { + "epoch": 0.5257405864992514, + "grad_norm": 0.73046875, + "learning_rate": 0.0001561294087405678, + "loss": 0.8712, + "step": 20475 + }, + { + "epoch": 0.5257662636951732, + "grad_norm": 0.875, + "learning_rate": 0.0001561257140379961, + "loss": 0.9435, + "step": 20476 + }, + { + "epoch": 0.5257919408910949, + "grad_norm": 0.70703125, + "learning_rate": 0.0001561220192235709, + "loss": 0.9028, + "step": 20477 + }, + { + "epoch": 0.5258176180870168, + "grad_norm": 0.73046875, + "learning_rate": 0.00015611832429729958, + "loss": 0.9274, + "step": 20478 + }, + { + "epoch": 0.5258432952829386, + "grad_norm": 0.78515625, + "learning_rate": 0.00015611462925918952, + "loss": 0.8043, + "step": 20479 + }, + { + "epoch": 0.5258689724788604, + "grad_norm": 0.80078125, + "learning_rate": 0.00015611093410924806, + "loss": 0.9054, + "step": 20480 + }, + { + "epoch": 0.5258946496747823, + "grad_norm": 0.79296875, + "learning_rate": 0.0001561072388474826, + "loss": 0.7697, + "step": 20481 + }, + { + "epoch": 0.5259203268707041, + "grad_norm": 0.8828125, + "learning_rate": 0.00015610354347390045, + "loss": 1.0743, + "step": 20482 + }, + { + "epoch": 0.525946004066626, + "grad_norm": 0.7109375, + "learning_rate": 0.00015609984798850904, + "loss": 0.791, + "step": 20483 + }, + { + "epoch": 0.5259716812625477, + "grad_norm": 0.8359375, + "learning_rate": 0.00015609615239131568, + "loss": 0.8724, + "step": 20484 + }, + { + "epoch": 0.5259973584584695, + "grad_norm": 0.76953125, + "learning_rate": 0.00015609245668232773, + "loss": 0.8603, + "step": 20485 + }, + { + "epoch": 0.5260230356543913, + "grad_norm": 0.734375, + "learning_rate": 0.0001560887608615526, + "loss": 0.9658, + "step": 20486 + }, + { + "epoch": 0.5260487128503132, + "grad_norm": 0.76953125, + "learning_rate": 0.00015608506492899765, + "loss": 0.8566, + "step": 20487 + }, + { + "epoch": 0.526074390046235, + "grad_norm": 0.79296875, + "learning_rate": 0.00015608136888467023, + "loss": 0.8632, + "step": 20488 + }, + { + "epoch": 0.5261000672421569, + "grad_norm": 0.9375, + "learning_rate": 0.00015607767272857772, + "loss": 1.0287, + "step": 20489 + }, + { + "epoch": 0.5261257444380787, + "grad_norm": 0.76171875, + "learning_rate": 0.00015607397646072747, + "loss": 0.932, + "step": 20490 + }, + { + "epoch": 0.5261514216340004, + "grad_norm": 0.75, + "learning_rate": 0.00015607028008112682, + "loss": 0.8844, + "step": 20491 + }, + { + "epoch": 0.5261770988299223, + "grad_norm": 0.83203125, + "learning_rate": 0.0001560665835897832, + "loss": 0.902, + "step": 20492 + }, + { + "epoch": 0.5262027760258441, + "grad_norm": 1.0859375, + "learning_rate": 0.00015606288698670392, + "loss": 1.0234, + "step": 20493 + }, + { + "epoch": 0.5262284532217659, + "grad_norm": 0.80859375, + "learning_rate": 0.00015605919027189636, + "loss": 1.0195, + "step": 20494 + }, + { + "epoch": 0.5262541304176878, + "grad_norm": 0.79296875, + "learning_rate": 0.00015605549344536795, + "loss": 0.983, + "step": 20495 + }, + { + "epoch": 0.5262798076136096, + "grad_norm": 0.87890625, + "learning_rate": 0.00015605179650712595, + "loss": 0.9249, + "step": 20496 + }, + { + "epoch": 0.5263054848095313, + "grad_norm": 0.7421875, + "learning_rate": 0.0001560480994571778, + "loss": 0.7981, + "step": 20497 + }, + { + "epoch": 0.5263311620054532, + "grad_norm": 0.80859375, + "learning_rate": 0.00015604440229553088, + "loss": 0.8383, + "step": 20498 + }, + { + "epoch": 0.526356839201375, + "grad_norm": 0.78515625, + "learning_rate": 0.00015604070502219249, + "loss": 1.0308, + "step": 20499 + }, + { + "epoch": 0.5263825163972968, + "grad_norm": 0.7734375, + "learning_rate": 0.00015603700763717004, + "loss": 0.9106, + "step": 20500 + }, + { + "epoch": 0.5264081935932187, + "grad_norm": 0.734375, + "learning_rate": 0.00015603331014047092, + "loss": 0.7536, + "step": 20501 + }, + { + "epoch": 0.5264338707891405, + "grad_norm": 0.7890625, + "learning_rate": 0.00015602961253210245, + "loss": 0.8461, + "step": 20502 + }, + { + "epoch": 0.5264595479850623, + "grad_norm": 0.74609375, + "learning_rate": 0.00015602591481207205, + "loss": 0.824, + "step": 20503 + }, + { + "epoch": 0.5264852251809841, + "grad_norm": 1.03125, + "learning_rate": 0.00015602221698038702, + "loss": 0.878, + "step": 20504 + }, + { + "epoch": 0.5265109023769059, + "grad_norm": 0.73828125, + "learning_rate": 0.0001560185190370548, + "loss": 0.9203, + "step": 20505 + }, + { + "epoch": 0.5265365795728277, + "grad_norm": 0.81640625, + "learning_rate": 0.00015601482098208272, + "loss": 0.9205, + "step": 20506 + }, + { + "epoch": 0.5265622567687496, + "grad_norm": 1.359375, + "learning_rate": 0.00015601112281547817, + "loss": 0.9148, + "step": 20507 + }, + { + "epoch": 0.5265879339646714, + "grad_norm": 0.703125, + "learning_rate": 0.00015600742453724853, + "loss": 0.8358, + "step": 20508 + }, + { + "epoch": 0.5266136111605932, + "grad_norm": 0.78515625, + "learning_rate": 0.00015600372614740112, + "loss": 0.9692, + "step": 20509 + }, + { + "epoch": 0.5266392883565151, + "grad_norm": 0.76953125, + "learning_rate": 0.00015600002764594337, + "loss": 0.9502, + "step": 20510 + }, + { + "epoch": 0.5266649655524368, + "grad_norm": 0.75, + "learning_rate": 0.0001559963290328826, + "loss": 0.8452, + "step": 20511 + }, + { + "epoch": 0.5266906427483586, + "grad_norm": 0.73828125, + "learning_rate": 0.00015599263030822622, + "loss": 0.7794, + "step": 20512 + }, + { + "epoch": 0.5267163199442805, + "grad_norm": 0.84375, + "learning_rate": 0.00015598893147198158, + "loss": 0.8958, + "step": 20513 + }, + { + "epoch": 0.5267419971402023, + "grad_norm": 0.78515625, + "learning_rate": 0.00015598523252415608, + "loss": 0.9318, + "step": 20514 + }, + { + "epoch": 0.5267676743361241, + "grad_norm": 0.76171875, + "learning_rate": 0.00015598153346475704, + "loss": 0.8819, + "step": 20515 + }, + { + "epoch": 0.526793351532046, + "grad_norm": 0.80859375, + "learning_rate": 0.00015597783429379186, + "loss": 0.9529, + "step": 20516 + }, + { + "epoch": 0.5268190287279677, + "grad_norm": 0.76953125, + "learning_rate": 0.00015597413501126792, + "loss": 0.8509, + "step": 20517 + }, + { + "epoch": 0.5268447059238895, + "grad_norm": 0.69140625, + "learning_rate": 0.00015597043561719259, + "loss": 0.8625, + "step": 20518 + }, + { + "epoch": 0.5268703831198114, + "grad_norm": 0.80859375, + "learning_rate": 0.00015596673611157323, + "loss": 0.8237, + "step": 20519 + }, + { + "epoch": 0.5268960603157332, + "grad_norm": 0.7734375, + "learning_rate": 0.00015596303649441723, + "loss": 0.9946, + "step": 20520 + }, + { + "epoch": 0.526921737511655, + "grad_norm": 0.78125, + "learning_rate": 0.00015595933676573192, + "loss": 0.85, + "step": 20521 + }, + { + "epoch": 0.5269474147075769, + "grad_norm": 0.8046875, + "learning_rate": 0.00015595563692552474, + "loss": 0.8176, + "step": 20522 + }, + { + "epoch": 0.5269730919034987, + "grad_norm": 0.75390625, + "learning_rate": 0.00015595193697380303, + "loss": 0.8483, + "step": 20523 + }, + { + "epoch": 0.5269987690994204, + "grad_norm": 0.76953125, + "learning_rate": 0.00015594823691057416, + "loss": 0.9159, + "step": 20524 + }, + { + "epoch": 0.5270244462953423, + "grad_norm": 0.78125, + "learning_rate": 0.0001559445367358455, + "loss": 0.8093, + "step": 20525 + }, + { + "epoch": 0.5270501234912641, + "grad_norm": 0.80078125, + "learning_rate": 0.00015594083644962444, + "loss": 0.967, + "step": 20526 + }, + { + "epoch": 0.527075800687186, + "grad_norm": 0.8046875, + "learning_rate": 0.0001559371360519183, + "loss": 0.8929, + "step": 20527 + }, + { + "epoch": 0.5271014778831078, + "grad_norm": 0.7578125, + "learning_rate": 0.00015593343554273458, + "loss": 0.867, + "step": 20528 + }, + { + "epoch": 0.5271271550790296, + "grad_norm": 0.82421875, + "learning_rate": 0.0001559297349220805, + "loss": 0.8452, + "step": 20529 + }, + { + "epoch": 0.5271528322749515, + "grad_norm": 0.8359375, + "learning_rate": 0.00015592603418996357, + "loss": 0.9803, + "step": 20530 + }, + { + "epoch": 0.5271785094708732, + "grad_norm": 0.73828125, + "learning_rate": 0.00015592233334639111, + "loss": 0.9305, + "step": 20531 + }, + { + "epoch": 0.527204186666795, + "grad_norm": 0.796875, + "learning_rate": 0.00015591863239137042, + "loss": 0.7013, + "step": 20532 + }, + { + "epoch": 0.5272298638627169, + "grad_norm": 0.7578125, + "learning_rate": 0.000155914931324909, + "loss": 0.8411, + "step": 20533 + }, + { + "epoch": 0.5272555410586387, + "grad_norm": 0.74609375, + "learning_rate": 0.0001559112301470142, + "loss": 0.8413, + "step": 20534 + }, + { + "epoch": 0.5272812182545605, + "grad_norm": 0.8515625, + "learning_rate": 0.00015590752885769334, + "loss": 0.8946, + "step": 20535 + }, + { + "epoch": 0.5273068954504824, + "grad_norm": 0.8203125, + "learning_rate": 0.0001559038274569538, + "loss": 0.8465, + "step": 20536 + }, + { + "epoch": 0.5273325726464041, + "grad_norm": 0.78125, + "learning_rate": 0.00015590012594480298, + "loss": 0.8693, + "step": 20537 + }, + { + "epoch": 0.5273582498423259, + "grad_norm": 0.8125, + "learning_rate": 0.0001558964243212483, + "loss": 0.9776, + "step": 20538 + }, + { + "epoch": 0.5273839270382478, + "grad_norm": 0.70703125, + "learning_rate": 0.0001558927225862971, + "loss": 0.9142, + "step": 20539 + }, + { + "epoch": 0.5274096042341696, + "grad_norm": 0.890625, + "learning_rate": 0.0001558890207399567, + "loss": 0.8197, + "step": 20540 + }, + { + "epoch": 0.5274352814300914, + "grad_norm": 0.796875, + "learning_rate": 0.00015588531878223455, + "loss": 0.8598, + "step": 20541 + }, + { + "epoch": 0.5274609586260133, + "grad_norm": 0.8203125, + "learning_rate": 0.00015588161671313802, + "loss": 0.9015, + "step": 20542 + }, + { + "epoch": 0.5274866358219351, + "grad_norm": 0.79296875, + "learning_rate": 0.00015587791453267448, + "loss": 0.8578, + "step": 20543 + }, + { + "epoch": 0.5275123130178568, + "grad_norm": 0.92578125, + "learning_rate": 0.00015587421224085127, + "loss": 0.8773, + "step": 20544 + }, + { + "epoch": 0.5275379902137787, + "grad_norm": 0.8828125, + "learning_rate": 0.0001558705098376758, + "loss": 1.0151, + "step": 20545 + }, + { + "epoch": 0.5275636674097005, + "grad_norm": 0.7734375, + "learning_rate": 0.0001558668073231555, + "loss": 0.8954, + "step": 20546 + }, + { + "epoch": 0.5275893446056223, + "grad_norm": 0.7578125, + "learning_rate": 0.00015586310469729771, + "loss": 0.8787, + "step": 20547 + }, + { + "epoch": 0.5276150218015442, + "grad_norm": 0.765625, + "learning_rate": 0.00015585940196010976, + "loss": 1.0289, + "step": 20548 + }, + { + "epoch": 0.527640698997466, + "grad_norm": 0.82421875, + "learning_rate": 0.00015585569911159906, + "loss": 0.9668, + "step": 20549 + }, + { + "epoch": 0.5276663761933879, + "grad_norm": 0.74609375, + "learning_rate": 0.000155851996151773, + "loss": 0.8554, + "step": 20550 + }, + { + "epoch": 0.5276920533893096, + "grad_norm": 0.859375, + "learning_rate": 0.00015584829308063896, + "loss": 0.878, + "step": 20551 + }, + { + "epoch": 0.5277177305852314, + "grad_norm": 0.7578125, + "learning_rate": 0.00015584458989820435, + "loss": 0.8913, + "step": 20552 + }, + { + "epoch": 0.5277434077811533, + "grad_norm": 0.765625, + "learning_rate": 0.00015584088660447644, + "loss": 0.856, + "step": 20553 + }, + { + "epoch": 0.5277690849770751, + "grad_norm": 0.734375, + "learning_rate": 0.00015583718319946277, + "loss": 0.8525, + "step": 20554 + }, + { + "epoch": 0.5277947621729969, + "grad_norm": 0.8203125, + "learning_rate": 0.0001558334796831706, + "loss": 1.0715, + "step": 20555 + }, + { + "epoch": 0.5278204393689188, + "grad_norm": 0.80078125, + "learning_rate": 0.00015582977605560735, + "loss": 1.0087, + "step": 20556 + }, + { + "epoch": 0.5278461165648405, + "grad_norm": 0.73046875, + "learning_rate": 0.0001558260723167804, + "loss": 0.8775, + "step": 20557 + }, + { + "epoch": 0.5278717937607623, + "grad_norm": 0.7734375, + "learning_rate": 0.00015582236846669714, + "loss": 0.7913, + "step": 20558 + }, + { + "epoch": 0.5278974709566842, + "grad_norm": 0.84375, + "learning_rate": 0.00015581866450536494, + "loss": 0.9198, + "step": 20559 + }, + { + "epoch": 0.527923148152606, + "grad_norm": 0.79296875, + "learning_rate": 0.00015581496043279115, + "loss": 0.8837, + "step": 20560 + }, + { + "epoch": 0.5279488253485278, + "grad_norm": 0.80078125, + "learning_rate": 0.00015581125624898324, + "loss": 0.8224, + "step": 20561 + }, + { + "epoch": 0.5279745025444497, + "grad_norm": 0.8046875, + "learning_rate": 0.0001558075519539485, + "loss": 0.9084, + "step": 20562 + }, + { + "epoch": 0.5280001797403715, + "grad_norm": 0.7890625, + "learning_rate": 0.00015580384754769437, + "loss": 0.9804, + "step": 20563 + }, + { + "epoch": 0.5280258569362932, + "grad_norm": 0.75, + "learning_rate": 0.0001558001430302282, + "loss": 0.8984, + "step": 20564 + }, + { + "epoch": 0.5280515341322151, + "grad_norm": 0.74609375, + "learning_rate": 0.00015579643840155736, + "loss": 0.788, + "step": 20565 + }, + { + "epoch": 0.5280772113281369, + "grad_norm": 0.80859375, + "learning_rate": 0.00015579273366168932, + "loss": 0.8749, + "step": 20566 + }, + { + "epoch": 0.5281028885240587, + "grad_norm": 0.7890625, + "learning_rate": 0.00015578902881063135, + "loss": 0.8258, + "step": 20567 + }, + { + "epoch": 0.5281285657199806, + "grad_norm": 0.828125, + "learning_rate": 0.00015578532384839092, + "loss": 0.9781, + "step": 20568 + }, + { + "epoch": 0.5281542429159024, + "grad_norm": 0.8359375, + "learning_rate": 0.00015578161877497534, + "loss": 1.0521, + "step": 20569 + }, + { + "epoch": 0.5281799201118242, + "grad_norm": 0.78515625, + "learning_rate": 0.00015577791359039207, + "loss": 0.8853, + "step": 20570 + }, + { + "epoch": 0.528205597307746, + "grad_norm": 0.8203125, + "learning_rate": 0.00015577420829464847, + "loss": 0.8622, + "step": 20571 + }, + { + "epoch": 0.5282312745036678, + "grad_norm": 0.78125, + "learning_rate": 0.00015577050288775185, + "loss": 0.84, + "step": 20572 + }, + { + "epoch": 0.5282569516995896, + "grad_norm": 0.75, + "learning_rate": 0.0001557667973697097, + "loss": 0.8967, + "step": 20573 + }, + { + "epoch": 0.5282826288955115, + "grad_norm": 0.7890625, + "learning_rate": 0.00015576309174052938, + "loss": 0.9105, + "step": 20574 + }, + { + "epoch": 0.5283083060914333, + "grad_norm": 0.74609375, + "learning_rate": 0.0001557593860002182, + "loss": 0.9727, + "step": 20575 + }, + { + "epoch": 0.5283339832873551, + "grad_norm": 0.7265625, + "learning_rate": 0.0001557556801487836, + "loss": 0.8813, + "step": 20576 + }, + { + "epoch": 0.5283596604832769, + "grad_norm": 0.70703125, + "learning_rate": 0.000155751974186233, + "loss": 0.8965, + "step": 20577 + }, + { + "epoch": 0.5283853376791987, + "grad_norm": 0.76953125, + "learning_rate": 0.00015574826811257374, + "loss": 1.2244, + "step": 20578 + }, + { + "epoch": 0.5284110148751205, + "grad_norm": 0.703125, + "learning_rate": 0.00015574456192781323, + "loss": 0.812, + "step": 20579 + }, + { + "epoch": 0.5284366920710424, + "grad_norm": 0.7578125, + "learning_rate": 0.00015574085563195884, + "loss": 0.7198, + "step": 20580 + }, + { + "epoch": 0.5284623692669642, + "grad_norm": 0.8046875, + "learning_rate": 0.00015573714922501795, + "loss": 0.9913, + "step": 20581 + }, + { + "epoch": 0.528488046462886, + "grad_norm": 0.765625, + "learning_rate": 0.00015573344270699795, + "loss": 0.873, + "step": 20582 + }, + { + "epoch": 0.5285137236588079, + "grad_norm": 0.83984375, + "learning_rate": 0.00015572973607790624, + "loss": 0.9247, + "step": 20583 + }, + { + "epoch": 0.5285394008547296, + "grad_norm": 0.796875, + "learning_rate": 0.0001557260293377502, + "loss": 0.9728, + "step": 20584 + }, + { + "epoch": 0.5285650780506514, + "grad_norm": 0.71484375, + "learning_rate": 0.00015572232248653721, + "loss": 0.8328, + "step": 20585 + }, + { + "epoch": 0.5285907552465733, + "grad_norm": 0.828125, + "learning_rate": 0.0001557186155242747, + "loss": 0.8585, + "step": 20586 + }, + { + "epoch": 0.5286164324424951, + "grad_norm": 0.7265625, + "learning_rate": 0.00015571490845097, + "loss": 0.8761, + "step": 20587 + }, + { + "epoch": 0.528642109638417, + "grad_norm": 0.89453125, + "learning_rate": 0.0001557112012666305, + "loss": 0.9071, + "step": 20588 + }, + { + "epoch": 0.5286677868343388, + "grad_norm": 0.75390625, + "learning_rate": 0.00015570749397126362, + "loss": 0.7933, + "step": 20589 + }, + { + "epoch": 0.5286934640302606, + "grad_norm": 0.765625, + "learning_rate": 0.00015570378656487675, + "loss": 0.9054, + "step": 20590 + }, + { + "epoch": 0.5287191412261824, + "grad_norm": 0.80859375, + "learning_rate": 0.00015570007904747724, + "loss": 0.9242, + "step": 20591 + }, + { + "epoch": 0.5287448184221042, + "grad_norm": 0.76953125, + "learning_rate": 0.0001556963714190725, + "loss": 0.8388, + "step": 20592 + }, + { + "epoch": 0.528770495618026, + "grad_norm": 0.7890625, + "learning_rate": 0.00015569266367966998, + "loss": 1.0722, + "step": 20593 + }, + { + "epoch": 0.5287961728139479, + "grad_norm": 0.80078125, + "learning_rate": 0.000155688955829277, + "loss": 1.0151, + "step": 20594 + }, + { + "epoch": 0.5288218500098697, + "grad_norm": 0.74609375, + "learning_rate": 0.0001556852478679009, + "loss": 0.8269, + "step": 20595 + }, + { + "epoch": 0.5288475272057915, + "grad_norm": 0.86328125, + "learning_rate": 0.00015568153979554915, + "loss": 0.9687, + "step": 20596 + }, + { + "epoch": 0.5288732044017133, + "grad_norm": 0.78515625, + "learning_rate": 0.00015567783161222915, + "loss": 0.8521, + "step": 20597 + }, + { + "epoch": 0.5288988815976351, + "grad_norm": 1.28125, + "learning_rate": 0.00015567412331794826, + "loss": 0.9329, + "step": 20598 + }, + { + "epoch": 0.5289245587935569, + "grad_norm": 0.734375, + "learning_rate": 0.00015567041491271387, + "loss": 0.789, + "step": 20599 + }, + { + "epoch": 0.5289502359894788, + "grad_norm": 0.859375, + "learning_rate": 0.00015566670639653336, + "loss": 1.0333, + "step": 20600 + }, + { + "epoch": 0.5289759131854006, + "grad_norm": 0.77734375, + "learning_rate": 0.00015566299776941414, + "loss": 0.7858, + "step": 20601 + }, + { + "epoch": 0.5290015903813224, + "grad_norm": 0.86328125, + "learning_rate": 0.00015565928903136356, + "loss": 0.8898, + "step": 20602 + }, + { + "epoch": 0.5290272675772443, + "grad_norm": 0.84765625, + "learning_rate": 0.00015565558018238907, + "loss": 0.8005, + "step": 20603 + }, + { + "epoch": 0.529052944773166, + "grad_norm": 0.8046875, + "learning_rate": 0.00015565187122249804, + "loss": 0.8657, + "step": 20604 + }, + { + "epoch": 0.5290786219690878, + "grad_norm": 0.7265625, + "learning_rate": 0.00015564816215169787, + "loss": 0.8347, + "step": 20605 + }, + { + "epoch": 0.5291042991650097, + "grad_norm": 0.81640625, + "learning_rate": 0.0001556444529699959, + "loss": 0.8452, + "step": 20606 + }, + { + "epoch": 0.5291299763609315, + "grad_norm": 0.85546875, + "learning_rate": 0.0001556407436773996, + "loss": 0.8493, + "step": 20607 + }, + { + "epoch": 0.5291556535568533, + "grad_norm": 1.65625, + "learning_rate": 0.0001556370342739163, + "loss": 0.8906, + "step": 20608 + }, + { + "epoch": 0.5291813307527752, + "grad_norm": 0.80859375, + "learning_rate": 0.00015563332475955344, + "loss": 0.8292, + "step": 20609 + }, + { + "epoch": 0.529207007948697, + "grad_norm": 0.80859375, + "learning_rate": 0.00015562961513431838, + "loss": 0.8687, + "step": 20610 + }, + { + "epoch": 0.5292326851446187, + "grad_norm": 0.828125, + "learning_rate": 0.00015562590539821853, + "loss": 0.7403, + "step": 20611 + }, + { + "epoch": 0.5292583623405406, + "grad_norm": 0.71875, + "learning_rate": 0.00015562219555126125, + "loss": 0.7196, + "step": 20612 + }, + { + "epoch": 0.5292840395364624, + "grad_norm": 0.8203125, + "learning_rate": 0.00015561848559345398, + "loss": 0.9404, + "step": 20613 + }, + { + "epoch": 0.5293097167323843, + "grad_norm": 0.7109375, + "learning_rate": 0.0001556147755248041, + "loss": 0.863, + "step": 20614 + }, + { + "epoch": 0.5293353939283061, + "grad_norm": 0.71875, + "learning_rate": 0.00015561106534531897, + "loss": 0.7353, + "step": 20615 + }, + { + "epoch": 0.5293610711242279, + "grad_norm": 0.8125, + "learning_rate": 0.000155607355055006, + "loss": 0.8851, + "step": 20616 + }, + { + "epoch": 0.5293867483201496, + "grad_norm": 0.71484375, + "learning_rate": 0.00015560364465387264, + "loss": 0.8053, + "step": 20617 + }, + { + "epoch": 0.5294124255160715, + "grad_norm": 0.76171875, + "learning_rate": 0.0001555999341419262, + "loss": 1.0035, + "step": 20618 + }, + { + "epoch": 0.5294381027119933, + "grad_norm": 0.78125, + "learning_rate": 0.00015559622351917412, + "loss": 0.8971, + "step": 20619 + }, + { + "epoch": 0.5294637799079152, + "grad_norm": 0.90625, + "learning_rate": 0.0001555925127856238, + "loss": 0.8823, + "step": 20620 + }, + { + "epoch": 0.529489457103837, + "grad_norm": 0.8515625, + "learning_rate": 0.00015558880194128263, + "loss": 0.8682, + "step": 20621 + }, + { + "epoch": 0.5295151342997588, + "grad_norm": 0.83203125, + "learning_rate": 0.000155585090986158, + "loss": 0.8657, + "step": 20622 + }, + { + "epoch": 0.5295408114956807, + "grad_norm": 0.8046875, + "learning_rate": 0.00015558137992025727, + "loss": 0.9041, + "step": 20623 + }, + { + "epoch": 0.5295664886916024, + "grad_norm": 0.79296875, + "learning_rate": 0.0001555776687435879, + "loss": 0.87, + "step": 20624 + }, + { + "epoch": 0.5295921658875242, + "grad_norm": 0.734375, + "learning_rate": 0.00015557395745615724, + "loss": 0.9288, + "step": 20625 + }, + { + "epoch": 0.5296178430834461, + "grad_norm": 0.8125, + "learning_rate": 0.00015557024605797272, + "loss": 0.9289, + "step": 20626 + }, + { + "epoch": 0.5296435202793679, + "grad_norm": 0.71484375, + "learning_rate": 0.00015556653454904172, + "loss": 0.7873, + "step": 20627 + }, + { + "epoch": 0.5296691974752897, + "grad_norm": 0.78515625, + "learning_rate": 0.00015556282292937163, + "loss": 0.8699, + "step": 20628 + }, + { + "epoch": 0.5296948746712116, + "grad_norm": 0.79296875, + "learning_rate": 0.00015555911119896984, + "loss": 1.0027, + "step": 20629 + }, + { + "epoch": 0.5297205518671334, + "grad_norm": 0.77734375, + "learning_rate": 0.00015555539935784376, + "loss": 0.9284, + "step": 20630 + }, + { + "epoch": 0.5297462290630551, + "grad_norm": 0.7421875, + "learning_rate": 0.0001555516874060008, + "loss": 0.8668, + "step": 20631 + }, + { + "epoch": 0.529771906258977, + "grad_norm": 0.75390625, + "learning_rate": 0.00015554797534344833, + "loss": 0.9629, + "step": 20632 + }, + { + "epoch": 0.5297975834548988, + "grad_norm": 1.2265625, + "learning_rate": 0.0001555442631701938, + "loss": 0.8304, + "step": 20633 + }, + { + "epoch": 0.5298232606508206, + "grad_norm": 0.87109375, + "learning_rate": 0.00015554055088624454, + "loss": 0.9362, + "step": 20634 + }, + { + "epoch": 0.5298489378467425, + "grad_norm": 0.890625, + "learning_rate": 0.000155536838491608, + "loss": 0.9161, + "step": 20635 + }, + { + "epoch": 0.5298746150426643, + "grad_norm": 0.8046875, + "learning_rate": 0.00015553312598629154, + "loss": 0.967, + "step": 20636 + }, + { + "epoch": 0.529900292238586, + "grad_norm": 0.79296875, + "learning_rate": 0.00015552941337030257, + "loss": 0.9438, + "step": 20637 + }, + { + "epoch": 0.5299259694345079, + "grad_norm": 0.71875, + "learning_rate": 0.0001555257006436485, + "loss": 0.8045, + "step": 20638 + }, + { + "epoch": 0.5299516466304297, + "grad_norm": 0.79296875, + "learning_rate": 0.00015552198780633674, + "loss": 0.9449, + "step": 20639 + }, + { + "epoch": 0.5299773238263515, + "grad_norm": 0.80078125, + "learning_rate": 0.00015551827485837466, + "loss": 0.8094, + "step": 20640 + }, + { + "epoch": 0.5300030010222734, + "grad_norm": 0.7734375, + "learning_rate": 0.0001555145617997697, + "loss": 0.844, + "step": 20641 + }, + { + "epoch": 0.5300286782181952, + "grad_norm": 0.81640625, + "learning_rate": 0.0001555108486305292, + "loss": 0.8829, + "step": 20642 + }, + { + "epoch": 0.530054355414117, + "grad_norm": 0.79296875, + "learning_rate": 0.0001555071353506606, + "loss": 0.8948, + "step": 20643 + }, + { + "epoch": 0.5300800326100388, + "grad_norm": 0.89453125, + "learning_rate": 0.0001555034219601713, + "loss": 1.1108, + "step": 20644 + }, + { + "epoch": 0.5301057098059606, + "grad_norm": 0.88671875, + "learning_rate": 0.00015549970845906867, + "loss": 0.9515, + "step": 20645 + }, + { + "epoch": 0.5301313870018824, + "grad_norm": 0.796875, + "learning_rate": 0.00015549599484736016, + "loss": 0.8629, + "step": 20646 + }, + { + "epoch": 0.5301570641978043, + "grad_norm": 0.77734375, + "learning_rate": 0.00015549228112505314, + "loss": 0.8198, + "step": 20647 + }, + { + "epoch": 0.5301827413937261, + "grad_norm": 0.890625, + "learning_rate": 0.00015548856729215502, + "loss": 0.8438, + "step": 20648 + }, + { + "epoch": 0.530208418589648, + "grad_norm": 0.75, + "learning_rate": 0.00015548485334867317, + "loss": 0.8806, + "step": 20649 + }, + { + "epoch": 0.5302340957855698, + "grad_norm": 0.765625, + "learning_rate": 0.00015548113929461505, + "loss": 0.9158, + "step": 20650 + }, + { + "epoch": 0.5302597729814915, + "grad_norm": 0.75390625, + "learning_rate": 0.00015547742512998802, + "loss": 0.8464, + "step": 20651 + }, + { + "epoch": 0.5302854501774134, + "grad_norm": 0.7421875, + "learning_rate": 0.0001554737108547995, + "loss": 0.7621, + "step": 20652 + }, + { + "epoch": 0.5303111273733352, + "grad_norm": 0.74609375, + "learning_rate": 0.00015546999646905688, + "loss": 0.9069, + "step": 20653 + }, + { + "epoch": 0.530336804569257, + "grad_norm": 0.68359375, + "learning_rate": 0.00015546628197276756, + "loss": 0.7756, + "step": 20654 + }, + { + "epoch": 0.5303624817651789, + "grad_norm": 0.7890625, + "learning_rate": 0.00015546256736593896, + "loss": 1.0134, + "step": 20655 + }, + { + "epoch": 0.5303881589611007, + "grad_norm": 0.7734375, + "learning_rate": 0.00015545885264857848, + "loss": 0.9332, + "step": 20656 + }, + { + "epoch": 0.5304138361570224, + "grad_norm": 0.7734375, + "learning_rate": 0.0001554551378206935, + "loss": 0.8057, + "step": 20657 + }, + { + "epoch": 0.5304395133529443, + "grad_norm": 0.80859375, + "learning_rate": 0.00015545142288229144, + "loss": 1.0179, + "step": 20658 + }, + { + "epoch": 0.5304651905488661, + "grad_norm": 0.828125, + "learning_rate": 0.0001554477078333797, + "loss": 0.849, + "step": 20659 + }, + { + "epoch": 0.5304908677447879, + "grad_norm": 0.71875, + "learning_rate": 0.00015544399267396574, + "loss": 0.8559, + "step": 20660 + }, + { + "epoch": 0.5305165449407098, + "grad_norm": 0.79296875, + "learning_rate": 0.00015544027740405685, + "loss": 0.8983, + "step": 20661 + }, + { + "epoch": 0.5305422221366316, + "grad_norm": 0.82421875, + "learning_rate": 0.0001554365620236605, + "loss": 1.0381, + "step": 20662 + }, + { + "epoch": 0.5305678993325534, + "grad_norm": 0.80078125, + "learning_rate": 0.0001554328465327841, + "loss": 0.8997, + "step": 20663 + }, + { + "epoch": 0.5305935765284752, + "grad_norm": 0.73046875, + "learning_rate": 0.00015542913093143505, + "loss": 0.9309, + "step": 20664 + }, + { + "epoch": 0.530619253724397, + "grad_norm": 0.79296875, + "learning_rate": 0.00015542541521962075, + "loss": 0.7741, + "step": 20665 + }, + { + "epoch": 0.5306449309203188, + "grad_norm": 0.765625, + "learning_rate": 0.00015542169939734862, + "loss": 0.9277, + "step": 20666 + }, + { + "epoch": 0.5306706081162407, + "grad_norm": 0.70703125, + "learning_rate": 0.000155417983464626, + "loss": 0.7865, + "step": 20667 + }, + { + "epoch": 0.5306962853121625, + "grad_norm": 0.7265625, + "learning_rate": 0.0001554142674214604, + "loss": 0.8929, + "step": 20668 + }, + { + "epoch": 0.5307219625080843, + "grad_norm": 0.81640625, + "learning_rate": 0.00015541055126785914, + "loss": 0.9193, + "step": 20669 + }, + { + "epoch": 0.5307476397040061, + "grad_norm": 0.7890625, + "learning_rate": 0.00015540683500382964, + "loss": 0.9074, + "step": 20670 + }, + { + "epoch": 0.5307733168999279, + "grad_norm": 0.7578125, + "learning_rate": 0.00015540311862937935, + "loss": 0.7447, + "step": 20671 + }, + { + "epoch": 0.5307989940958497, + "grad_norm": 0.7421875, + "learning_rate": 0.00015539940214451568, + "loss": 0.9456, + "step": 20672 + }, + { + "epoch": 0.5308246712917716, + "grad_norm": 0.83203125, + "learning_rate": 0.00015539568554924596, + "loss": 0.8948, + "step": 20673 + }, + { + "epoch": 0.5308503484876934, + "grad_norm": 0.73046875, + "learning_rate": 0.00015539196884357763, + "loss": 0.7615, + "step": 20674 + }, + { + "epoch": 0.5308760256836152, + "grad_norm": 0.7265625, + "learning_rate": 0.00015538825202751811, + "loss": 0.8723, + "step": 20675 + }, + { + "epoch": 0.5309017028795371, + "grad_norm": 0.73828125, + "learning_rate": 0.00015538453510107483, + "loss": 0.9188, + "step": 20676 + }, + { + "epoch": 0.5309273800754588, + "grad_norm": 0.72265625, + "learning_rate": 0.00015538081806425516, + "loss": 0.9522, + "step": 20677 + }, + { + "epoch": 0.5309530572713806, + "grad_norm": 0.73828125, + "learning_rate": 0.00015537710091706653, + "loss": 0.8867, + "step": 20678 + }, + { + "epoch": 0.5309787344673025, + "grad_norm": 0.75, + "learning_rate": 0.00015537338365951637, + "loss": 0.9785, + "step": 20679 + }, + { + "epoch": 0.5310044116632243, + "grad_norm": 0.7109375, + "learning_rate": 0.00015536966629161204, + "loss": 0.8161, + "step": 20680 + }, + { + "epoch": 0.5310300888591462, + "grad_norm": 0.78515625, + "learning_rate": 0.00015536594881336097, + "loss": 0.9173, + "step": 20681 + }, + { + "epoch": 0.531055766055068, + "grad_norm": 0.78125, + "learning_rate": 0.00015536223122477054, + "loss": 0.9218, + "step": 20682 + }, + { + "epoch": 0.5310814432509898, + "grad_norm": 0.875, + "learning_rate": 0.0001553585135258482, + "loss": 0.9533, + "step": 20683 + }, + { + "epoch": 0.5311071204469116, + "grad_norm": 0.87109375, + "learning_rate": 0.00015535479571660132, + "loss": 0.8797, + "step": 20684 + }, + { + "epoch": 0.5311327976428334, + "grad_norm": 0.765625, + "learning_rate": 0.00015535107779703736, + "loss": 0.8884, + "step": 20685 + }, + { + "epoch": 0.5311584748387552, + "grad_norm": 0.7265625, + "learning_rate": 0.0001553473597671637, + "loss": 0.8979, + "step": 20686 + }, + { + "epoch": 0.5311841520346771, + "grad_norm": 0.8203125, + "learning_rate": 0.00015534364162698777, + "loss": 1.0199, + "step": 20687 + }, + { + "epoch": 0.5312098292305989, + "grad_norm": 0.80859375, + "learning_rate": 0.00015533992337651693, + "loss": 0.9357, + "step": 20688 + }, + { + "epoch": 0.5312355064265207, + "grad_norm": 0.7265625, + "learning_rate": 0.0001553362050157586, + "loss": 0.7848, + "step": 20689 + }, + { + "epoch": 0.5312611836224425, + "grad_norm": 0.7890625, + "learning_rate": 0.00015533248654472025, + "loss": 0.8283, + "step": 20690 + }, + { + "epoch": 0.5312868608183643, + "grad_norm": 0.734375, + "learning_rate": 0.00015532876796340927, + "loss": 0.7752, + "step": 20691 + }, + { + "epoch": 0.5313125380142861, + "grad_norm": 0.88671875, + "learning_rate": 0.00015532504927183303, + "loss": 0.8483, + "step": 20692 + }, + { + "epoch": 0.531338215210208, + "grad_norm": 0.78125, + "learning_rate": 0.00015532133046999895, + "loss": 0.9764, + "step": 20693 + }, + { + "epoch": 0.5313638924061298, + "grad_norm": 0.8125, + "learning_rate": 0.00015531761155791447, + "loss": 0.9815, + "step": 20694 + }, + { + "epoch": 0.5313895696020516, + "grad_norm": 0.76171875, + "learning_rate": 0.000155313892535587, + "loss": 0.9108, + "step": 20695 + }, + { + "epoch": 0.5314152467979735, + "grad_norm": 0.74609375, + "learning_rate": 0.00015531017340302393, + "loss": 1.059, + "step": 20696 + }, + { + "epoch": 0.5314409239938952, + "grad_norm": 0.74609375, + "learning_rate": 0.00015530645416023264, + "loss": 0.7235, + "step": 20697 + }, + { + "epoch": 0.531466601189817, + "grad_norm": 0.80859375, + "learning_rate": 0.0001553027348072206, + "loss": 0.9177, + "step": 20698 + }, + { + "epoch": 0.5314922783857389, + "grad_norm": 0.71875, + "learning_rate": 0.00015529901534399524, + "loss": 0.9313, + "step": 20699 + }, + { + "epoch": 0.5315179555816607, + "grad_norm": 0.82421875, + "learning_rate": 0.00015529529577056392, + "loss": 0.9022, + "step": 20700 + }, + { + "epoch": 0.5315436327775825, + "grad_norm": 0.78125, + "learning_rate": 0.00015529157608693407, + "loss": 0.8746, + "step": 20701 + }, + { + "epoch": 0.5315693099735044, + "grad_norm": 0.796875, + "learning_rate": 0.0001552878562931131, + "loss": 0.9614, + "step": 20702 + }, + { + "epoch": 0.5315949871694262, + "grad_norm": 0.765625, + "learning_rate": 0.00015528413638910845, + "loss": 0.98, + "step": 20703 + }, + { + "epoch": 0.5316206643653479, + "grad_norm": 0.7890625, + "learning_rate": 0.00015528041637492747, + "loss": 0.897, + "step": 20704 + }, + { + "epoch": 0.5316463415612698, + "grad_norm": 0.71875, + "learning_rate": 0.00015527669625057763, + "loss": 0.8808, + "step": 20705 + }, + { + "epoch": 0.5316720187571916, + "grad_norm": 0.76171875, + "learning_rate": 0.00015527297601606633, + "loss": 0.8879, + "step": 20706 + }, + { + "epoch": 0.5316976959531134, + "grad_norm": 0.8203125, + "learning_rate": 0.00015526925567140097, + "loss": 0.9366, + "step": 20707 + }, + { + "epoch": 0.5317233731490353, + "grad_norm": 0.6796875, + "learning_rate": 0.000155265535216589, + "loss": 0.9003, + "step": 20708 + }, + { + "epoch": 0.5317490503449571, + "grad_norm": 0.7265625, + "learning_rate": 0.0001552618146516378, + "loss": 0.8147, + "step": 20709 + }, + { + "epoch": 0.5317747275408788, + "grad_norm": 0.7890625, + "learning_rate": 0.0001552580939765548, + "loss": 1.0481, + "step": 20710 + }, + { + "epoch": 0.5318004047368007, + "grad_norm": 0.7890625, + "learning_rate": 0.0001552543731913474, + "loss": 0.9121, + "step": 20711 + }, + { + "epoch": 0.5318260819327225, + "grad_norm": 0.80078125, + "learning_rate": 0.00015525065229602303, + "loss": 0.9399, + "step": 20712 + }, + { + "epoch": 0.5318517591286444, + "grad_norm": 0.78515625, + "learning_rate": 0.0001552469312905891, + "loss": 0.9203, + "step": 20713 + }, + { + "epoch": 0.5318774363245662, + "grad_norm": 0.75390625, + "learning_rate": 0.00015524321017505304, + "loss": 0.9404, + "step": 20714 + }, + { + "epoch": 0.531903113520488, + "grad_norm": 0.75, + "learning_rate": 0.0001552394889494222, + "loss": 0.8072, + "step": 20715 + }, + { + "epoch": 0.5319287907164099, + "grad_norm": 0.79296875, + "learning_rate": 0.00015523576761370409, + "loss": 0.7702, + "step": 20716 + }, + { + "epoch": 0.5319544679123316, + "grad_norm": 0.78515625, + "learning_rate": 0.00015523204616790608, + "loss": 0.838, + "step": 20717 + }, + { + "epoch": 0.5319801451082534, + "grad_norm": 0.8828125, + "learning_rate": 0.00015522832461203561, + "loss": 0.7991, + "step": 20718 + }, + { + "epoch": 0.5320058223041753, + "grad_norm": 0.76953125, + "learning_rate": 0.00015522460294610006, + "loss": 0.9223, + "step": 20719 + }, + { + "epoch": 0.5320314995000971, + "grad_norm": 0.76953125, + "learning_rate": 0.00015522088117010687, + "loss": 0.862, + "step": 20720 + }, + { + "epoch": 0.5320571766960189, + "grad_norm": 0.796875, + "learning_rate": 0.00015521715928406345, + "loss": 1.0335, + "step": 20721 + }, + { + "epoch": 0.5320828538919408, + "grad_norm": 0.78515625, + "learning_rate": 0.0001552134372879772, + "loss": 0.9937, + "step": 20722 + }, + { + "epoch": 0.5321085310878626, + "grad_norm": 0.78125, + "learning_rate": 0.00015520971518185556, + "loss": 1.1018, + "step": 20723 + }, + { + "epoch": 0.5321342082837843, + "grad_norm": 0.78125, + "learning_rate": 0.00015520599296570597, + "loss": 0.8164, + "step": 20724 + }, + { + "epoch": 0.5321598854797062, + "grad_norm": 0.7890625, + "learning_rate": 0.00015520227063953583, + "loss": 0.8987, + "step": 20725 + }, + { + "epoch": 0.532185562675628, + "grad_norm": 0.81640625, + "learning_rate": 0.0001551985482033525, + "loss": 0.8857, + "step": 20726 + }, + { + "epoch": 0.5322112398715498, + "grad_norm": 0.7578125, + "learning_rate": 0.0001551948256571635, + "loss": 0.9866, + "step": 20727 + }, + { + "epoch": 0.5322369170674717, + "grad_norm": 0.81640625, + "learning_rate": 0.00015519110300097616, + "loss": 0.8878, + "step": 20728 + }, + { + "epoch": 0.5322625942633935, + "grad_norm": 0.7578125, + "learning_rate": 0.00015518738023479798, + "loss": 0.8518, + "step": 20729 + }, + { + "epoch": 0.5322882714593152, + "grad_norm": 0.8046875, + "learning_rate": 0.00015518365735863628, + "loss": 0.8386, + "step": 20730 + }, + { + "epoch": 0.5323139486552371, + "grad_norm": 0.72265625, + "learning_rate": 0.0001551799343724986, + "loss": 0.9289, + "step": 20731 + }, + { + "epoch": 0.5323396258511589, + "grad_norm": 0.7578125, + "learning_rate": 0.00015517621127639225, + "loss": 0.9091, + "step": 20732 + }, + { + "epoch": 0.5323653030470807, + "grad_norm": 0.80078125, + "learning_rate": 0.00015517248807032469, + "loss": 0.9061, + "step": 20733 + }, + { + "epoch": 0.5323909802430026, + "grad_norm": 1.0234375, + "learning_rate": 0.00015516876475430338, + "loss": 0.8099, + "step": 20734 + }, + { + "epoch": 0.5324166574389244, + "grad_norm": 0.76171875, + "learning_rate": 0.0001551650413283357, + "loss": 0.9395, + "step": 20735 + }, + { + "epoch": 0.5324423346348462, + "grad_norm": 0.90234375, + "learning_rate": 0.00015516131779242905, + "loss": 0.9809, + "step": 20736 + }, + { + "epoch": 0.532468011830768, + "grad_norm": 0.73828125, + "learning_rate": 0.00015515759414659087, + "loss": 0.7848, + "step": 20737 + }, + { + "epoch": 0.5324936890266898, + "grad_norm": 0.81640625, + "learning_rate": 0.00015515387039082864, + "loss": 0.8848, + "step": 20738 + }, + { + "epoch": 0.5325193662226116, + "grad_norm": 0.8125, + "learning_rate": 0.00015515014652514968, + "loss": 0.9869, + "step": 20739 + }, + { + "epoch": 0.5325450434185335, + "grad_norm": 0.8203125, + "learning_rate": 0.0001551464225495615, + "loss": 1.0218, + "step": 20740 + }, + { + "epoch": 0.5325707206144553, + "grad_norm": 0.75390625, + "learning_rate": 0.00015514269846407142, + "loss": 0.9175, + "step": 20741 + }, + { + "epoch": 0.5325963978103772, + "grad_norm": 0.8359375, + "learning_rate": 0.00015513897426868697, + "loss": 0.9473, + "step": 20742 + }, + { + "epoch": 0.532622075006299, + "grad_norm": 0.78515625, + "learning_rate": 0.0001551352499634155, + "loss": 0.8356, + "step": 20743 + }, + { + "epoch": 0.5326477522022207, + "grad_norm": 0.828125, + "learning_rate": 0.00015513152554826447, + "loss": 0.9028, + "step": 20744 + }, + { + "epoch": 0.5326734293981426, + "grad_norm": 0.8046875, + "learning_rate": 0.0001551278010232413, + "loss": 0.8701, + "step": 20745 + }, + { + "epoch": 0.5326991065940644, + "grad_norm": 0.828125, + "learning_rate": 0.0001551240763883534, + "loss": 0.8191, + "step": 20746 + }, + { + "epoch": 0.5327247837899862, + "grad_norm": 0.76953125, + "learning_rate": 0.00015512035164360819, + "loss": 0.9483, + "step": 20747 + }, + { + "epoch": 0.5327504609859081, + "grad_norm": 0.80859375, + "learning_rate": 0.0001551166267890131, + "loss": 0.7808, + "step": 20748 + }, + { + "epoch": 0.5327761381818299, + "grad_norm": 0.765625, + "learning_rate": 0.00015511290182457555, + "loss": 0.8337, + "step": 20749 + }, + { + "epoch": 0.5328018153777516, + "grad_norm": 0.85546875, + "learning_rate": 0.00015510917675030293, + "loss": 0.8544, + "step": 20750 + }, + { + "epoch": 0.5328274925736735, + "grad_norm": 0.796875, + "learning_rate": 0.00015510545156620275, + "loss": 0.8817, + "step": 20751 + }, + { + "epoch": 0.5328531697695953, + "grad_norm": 0.7265625, + "learning_rate": 0.00015510172627228238, + "loss": 0.8939, + "step": 20752 + }, + { + "epoch": 0.5328788469655171, + "grad_norm": 0.71484375, + "learning_rate": 0.00015509800086854924, + "loss": 0.9553, + "step": 20753 + }, + { + "epoch": 0.532904524161439, + "grad_norm": 0.765625, + "learning_rate": 0.00015509427535501073, + "loss": 0.9967, + "step": 20754 + }, + { + "epoch": 0.5329302013573608, + "grad_norm": 0.7265625, + "learning_rate": 0.0001550905497316743, + "loss": 0.7886, + "step": 20755 + }, + { + "epoch": 0.5329558785532826, + "grad_norm": 0.71875, + "learning_rate": 0.00015508682399854743, + "loss": 0.9084, + "step": 20756 + }, + { + "epoch": 0.5329815557492044, + "grad_norm": 0.87890625, + "learning_rate": 0.0001550830981556375, + "loss": 0.9748, + "step": 20757 + }, + { + "epoch": 0.5330072329451262, + "grad_norm": 0.78125, + "learning_rate": 0.0001550793722029519, + "loss": 0.8642, + "step": 20758 + }, + { + "epoch": 0.533032910141048, + "grad_norm": 0.8203125, + "learning_rate": 0.00015507564614049808, + "loss": 0.9371, + "step": 20759 + }, + { + "epoch": 0.5330585873369699, + "grad_norm": 0.91015625, + "learning_rate": 0.00015507191996828347, + "loss": 0.7834, + "step": 20760 + }, + { + "epoch": 0.5330842645328917, + "grad_norm": 0.82421875, + "learning_rate": 0.00015506819368631553, + "loss": 0.8519, + "step": 20761 + }, + { + "epoch": 0.5331099417288135, + "grad_norm": 0.89453125, + "learning_rate": 0.00015506446729460165, + "loss": 0.9152, + "step": 20762 + }, + { + "epoch": 0.5331356189247354, + "grad_norm": 0.8984375, + "learning_rate": 0.00015506074079314923, + "loss": 0.9892, + "step": 20763 + }, + { + "epoch": 0.5331612961206571, + "grad_norm": 0.83203125, + "learning_rate": 0.00015505701418196574, + "loss": 0.872, + "step": 20764 + }, + { + "epoch": 0.5331869733165789, + "grad_norm": 0.73046875, + "learning_rate": 0.00015505328746105864, + "loss": 0.7738, + "step": 20765 + }, + { + "epoch": 0.5332126505125008, + "grad_norm": 0.80859375, + "learning_rate": 0.00015504956063043524, + "loss": 0.8837, + "step": 20766 + }, + { + "epoch": 0.5332383277084226, + "grad_norm": 0.80859375, + "learning_rate": 0.00015504583369010307, + "loss": 0.9992, + "step": 20767 + }, + { + "epoch": 0.5332640049043444, + "grad_norm": 0.79296875, + "learning_rate": 0.00015504210664006953, + "loss": 0.9999, + "step": 20768 + }, + { + "epoch": 0.5332896821002663, + "grad_norm": 0.8359375, + "learning_rate": 0.000155038379480342, + "loss": 0.8236, + "step": 20769 + }, + { + "epoch": 0.533315359296188, + "grad_norm": 0.85546875, + "learning_rate": 0.000155034652210928, + "loss": 0.8127, + "step": 20770 + }, + { + "epoch": 0.5333410364921098, + "grad_norm": 0.828125, + "learning_rate": 0.00015503092483183493, + "loss": 0.9315, + "step": 20771 + }, + { + "epoch": 0.5333667136880317, + "grad_norm": 0.75, + "learning_rate": 0.00015502719734307015, + "loss": 0.8538, + "step": 20772 + }, + { + "epoch": 0.5333923908839535, + "grad_norm": 0.828125, + "learning_rate": 0.00015502346974464114, + "loss": 0.8196, + "step": 20773 + }, + { + "epoch": 0.5334180680798754, + "grad_norm": 0.8203125, + "learning_rate": 0.00015501974203655533, + "loss": 0.8558, + "step": 20774 + }, + { + "epoch": 0.5334437452757972, + "grad_norm": 0.734375, + "learning_rate": 0.00015501601421882017, + "loss": 0.9005, + "step": 20775 + }, + { + "epoch": 0.533469422471719, + "grad_norm": 1.2734375, + "learning_rate": 0.00015501228629144305, + "loss": 0.8739, + "step": 20776 + }, + { + "epoch": 0.5334950996676407, + "grad_norm": 0.83203125, + "learning_rate": 0.0001550085582544314, + "loss": 0.8494, + "step": 20777 + }, + { + "epoch": 0.5335207768635626, + "grad_norm": 0.796875, + "learning_rate": 0.00015500483010779268, + "loss": 0.8298, + "step": 20778 + }, + { + "epoch": 0.5335464540594844, + "grad_norm": 0.80859375, + "learning_rate": 0.00015500110185153428, + "loss": 0.8536, + "step": 20779 + }, + { + "epoch": 0.5335721312554063, + "grad_norm": 0.78515625, + "learning_rate": 0.00015499737348566365, + "loss": 0.8839, + "step": 20780 + }, + { + "epoch": 0.5335978084513281, + "grad_norm": 0.859375, + "learning_rate": 0.00015499364501018824, + "loss": 0.9582, + "step": 20781 + }, + { + "epoch": 0.5336234856472499, + "grad_norm": 0.7421875, + "learning_rate": 0.00015498991642511547, + "loss": 0.8186, + "step": 20782 + }, + { + "epoch": 0.5336491628431718, + "grad_norm": 0.7890625, + "learning_rate": 0.00015498618773045273, + "loss": 1.0012, + "step": 20783 + }, + { + "epoch": 0.5336748400390935, + "grad_norm": 0.77734375, + "learning_rate": 0.00015498245892620754, + "loss": 0.8355, + "step": 20784 + }, + { + "epoch": 0.5337005172350153, + "grad_norm": 0.734375, + "learning_rate": 0.00015497873001238724, + "loss": 0.8883, + "step": 20785 + }, + { + "epoch": 0.5337261944309372, + "grad_norm": 0.890625, + "learning_rate": 0.0001549750009889993, + "loss": 0.9519, + "step": 20786 + }, + { + "epoch": 0.533751871626859, + "grad_norm": 0.7734375, + "learning_rate": 0.00015497127185605113, + "loss": 1.0116, + "step": 20787 + }, + { + "epoch": 0.5337775488227808, + "grad_norm": 0.8125, + "learning_rate": 0.00015496754261355022, + "loss": 0.8576, + "step": 20788 + }, + { + "epoch": 0.5338032260187027, + "grad_norm": 0.7265625, + "learning_rate": 0.00015496381326150394, + "loss": 0.7942, + "step": 20789 + }, + { + "epoch": 0.5338289032146244, + "grad_norm": 0.71484375, + "learning_rate": 0.00015496008379991974, + "loss": 0.6974, + "step": 20790 + }, + { + "epoch": 0.5338545804105462, + "grad_norm": 0.8203125, + "learning_rate": 0.00015495635422880507, + "loss": 0.8574, + "step": 20791 + }, + { + "epoch": 0.5338802576064681, + "grad_norm": 0.8515625, + "learning_rate": 0.00015495262454816734, + "loss": 0.8841, + "step": 20792 + }, + { + "epoch": 0.5339059348023899, + "grad_norm": 0.7890625, + "learning_rate": 0.00015494889475801399, + "loss": 0.7962, + "step": 20793 + }, + { + "epoch": 0.5339316119983117, + "grad_norm": 0.7578125, + "learning_rate": 0.00015494516485835248, + "loss": 0.8294, + "step": 20794 + }, + { + "epoch": 0.5339572891942336, + "grad_norm": 0.80078125, + "learning_rate": 0.0001549414348491902, + "loss": 0.8748, + "step": 20795 + }, + { + "epoch": 0.5339829663901554, + "grad_norm": 0.828125, + "learning_rate": 0.00015493770473053463, + "loss": 0.9014, + "step": 20796 + }, + { + "epoch": 0.5340086435860771, + "grad_norm": 0.890625, + "learning_rate": 0.00015493397450239314, + "loss": 0.8615, + "step": 20797 + }, + { + "epoch": 0.534034320781999, + "grad_norm": 0.7890625, + "learning_rate": 0.0001549302441647732, + "loss": 0.8761, + "step": 20798 + }, + { + "epoch": 0.5340599979779208, + "grad_norm": 0.81640625, + "learning_rate": 0.00015492651371768227, + "loss": 0.9473, + "step": 20799 + }, + { + "epoch": 0.5340856751738426, + "grad_norm": 0.80859375, + "learning_rate": 0.00015492278316112774, + "loss": 0.7995, + "step": 20800 + }, + { + "epoch": 0.5341113523697645, + "grad_norm": 0.83203125, + "learning_rate": 0.00015491905249511708, + "loss": 0.7894, + "step": 20801 + }, + { + "epoch": 0.5341370295656863, + "grad_norm": 0.7734375, + "learning_rate": 0.0001549153217196577, + "loss": 0.797, + "step": 20802 + }, + { + "epoch": 0.5341627067616082, + "grad_norm": 0.73046875, + "learning_rate": 0.00015491159083475705, + "loss": 0.953, + "step": 20803 + }, + { + "epoch": 0.5341883839575299, + "grad_norm": 0.76953125, + "learning_rate": 0.00015490785984042258, + "loss": 0.8799, + "step": 20804 + }, + { + "epoch": 0.5342140611534517, + "grad_norm": 0.75, + "learning_rate": 0.0001549041287366617, + "loss": 0.8625, + "step": 20805 + }, + { + "epoch": 0.5342397383493735, + "grad_norm": 0.76953125, + "learning_rate": 0.00015490039752348182, + "loss": 0.8067, + "step": 20806 + }, + { + "epoch": 0.5342654155452954, + "grad_norm": 0.79296875, + "learning_rate": 0.00015489666620089042, + "loss": 0.8176, + "step": 20807 + }, + { + "epoch": 0.5342910927412172, + "grad_norm": 0.66796875, + "learning_rate": 0.0001548929347688949, + "loss": 0.8567, + "step": 20808 + }, + { + "epoch": 0.5343167699371391, + "grad_norm": 0.7421875, + "learning_rate": 0.00015488920322750277, + "loss": 0.9285, + "step": 20809 + }, + { + "epoch": 0.5343424471330608, + "grad_norm": 0.80859375, + "learning_rate": 0.0001548854715767214, + "loss": 0.8604, + "step": 20810 + }, + { + "epoch": 0.5343681243289826, + "grad_norm": 0.859375, + "learning_rate": 0.00015488173981655822, + "loss": 0.8329, + "step": 20811 + }, + { + "epoch": 0.5343938015249045, + "grad_norm": 0.703125, + "learning_rate": 0.0001548780079470207, + "loss": 0.8646, + "step": 20812 + }, + { + "epoch": 0.5344194787208263, + "grad_norm": 0.765625, + "learning_rate": 0.00015487427596811627, + "loss": 0.8358, + "step": 20813 + }, + { + "epoch": 0.5344451559167481, + "grad_norm": 0.84765625, + "learning_rate": 0.00015487054387985236, + "loss": 1.0763, + "step": 20814 + }, + { + "epoch": 0.53447083311267, + "grad_norm": 0.81640625, + "learning_rate": 0.0001548668116822364, + "loss": 0.9413, + "step": 20815 + }, + { + "epoch": 0.5344965103085918, + "grad_norm": 1.1953125, + "learning_rate": 0.0001548630793752759, + "loss": 0.9197, + "step": 20816 + }, + { + "epoch": 0.5345221875045135, + "grad_norm": 0.91796875, + "learning_rate": 0.00015485934695897818, + "loss": 0.942, + "step": 20817 + }, + { + "epoch": 0.5345478647004354, + "grad_norm": 0.80078125, + "learning_rate": 0.00015485561443335074, + "loss": 0.9375, + "step": 20818 + }, + { + "epoch": 0.5345735418963572, + "grad_norm": 0.8515625, + "learning_rate": 0.00015485188179840105, + "loss": 0.91, + "step": 20819 + }, + { + "epoch": 0.534599219092279, + "grad_norm": 0.91796875, + "learning_rate": 0.00015484814905413648, + "loss": 0.8035, + "step": 20820 + }, + { + "epoch": 0.5346248962882009, + "grad_norm": 0.71875, + "learning_rate": 0.0001548444162005645, + "loss": 0.8625, + "step": 20821 + }, + { + "epoch": 0.5346505734841227, + "grad_norm": 0.7421875, + "learning_rate": 0.00015484068323769255, + "loss": 0.928, + "step": 20822 + }, + { + "epoch": 0.5346762506800445, + "grad_norm": 0.77734375, + "learning_rate": 0.00015483695016552812, + "loss": 0.9438, + "step": 20823 + }, + { + "epoch": 0.5347019278759663, + "grad_norm": 0.78515625, + "learning_rate": 0.00015483321698407857, + "loss": 0.8013, + "step": 20824 + }, + { + "epoch": 0.5347276050718881, + "grad_norm": 0.82421875, + "learning_rate": 0.0001548294836933514, + "loss": 0.7706, + "step": 20825 + }, + { + "epoch": 0.5347532822678099, + "grad_norm": 0.734375, + "learning_rate": 0.00015482575029335399, + "loss": 0.7959, + "step": 20826 + }, + { + "epoch": 0.5347789594637318, + "grad_norm": 0.7734375, + "learning_rate": 0.0001548220167840938, + "loss": 0.9671, + "step": 20827 + }, + { + "epoch": 0.5348046366596536, + "grad_norm": 0.77734375, + "learning_rate": 0.00015481828316557831, + "loss": 0.9241, + "step": 20828 + }, + { + "epoch": 0.5348303138555754, + "grad_norm": 0.75390625, + "learning_rate": 0.0001548145494378149, + "loss": 0.8794, + "step": 20829 + }, + { + "epoch": 0.5348559910514972, + "grad_norm": 0.83203125, + "learning_rate": 0.0001548108156008111, + "loss": 0.9337, + "step": 20830 + }, + { + "epoch": 0.534881668247419, + "grad_norm": 0.80078125, + "learning_rate": 0.00015480708165457427, + "loss": 0.8885, + "step": 20831 + }, + { + "epoch": 0.5349073454433408, + "grad_norm": 0.78515625, + "learning_rate": 0.00015480334759911186, + "loss": 0.8132, + "step": 20832 + }, + { + "epoch": 0.5349330226392627, + "grad_norm": 0.7734375, + "learning_rate": 0.00015479961343443135, + "loss": 0.9884, + "step": 20833 + }, + { + "epoch": 0.5349586998351845, + "grad_norm": 0.82421875, + "learning_rate": 0.00015479587916054013, + "loss": 0.9417, + "step": 20834 + }, + { + "epoch": 0.5349843770311064, + "grad_norm": 0.79296875, + "learning_rate": 0.00015479214477744568, + "loss": 0.8627, + "step": 20835 + }, + { + "epoch": 0.5350100542270282, + "grad_norm": 0.87109375, + "learning_rate": 0.00015478841028515548, + "loss": 0.9295, + "step": 20836 + }, + { + "epoch": 0.5350357314229499, + "grad_norm": 0.82421875, + "learning_rate": 0.00015478467568367688, + "loss": 0.8874, + "step": 20837 + }, + { + "epoch": 0.5350614086188717, + "grad_norm": 0.82421875, + "learning_rate": 0.0001547809409730174, + "loss": 0.9747, + "step": 20838 + }, + { + "epoch": 0.5350870858147936, + "grad_norm": 0.7265625, + "learning_rate": 0.00015477720615318443, + "loss": 0.8557, + "step": 20839 + }, + { + "epoch": 0.5351127630107154, + "grad_norm": 0.74609375, + "learning_rate": 0.0001547734712241854, + "loss": 0.8096, + "step": 20840 + }, + { + "epoch": 0.5351384402066373, + "grad_norm": 0.765625, + "learning_rate": 0.00015476973618602785, + "loss": 0.9611, + "step": 20841 + }, + { + "epoch": 0.5351641174025591, + "grad_norm": 0.85546875, + "learning_rate": 0.00015476600103871916, + "loss": 1.0574, + "step": 20842 + }, + { + "epoch": 0.5351897945984809, + "grad_norm": 0.7890625, + "learning_rate": 0.00015476226578226677, + "loss": 0.9441, + "step": 20843 + }, + { + "epoch": 0.5352154717944027, + "grad_norm": 0.71484375, + "learning_rate": 0.00015475853041667815, + "loss": 0.8065, + "step": 20844 + }, + { + "epoch": 0.5352411489903245, + "grad_norm": 0.65625, + "learning_rate": 0.00015475479494196068, + "loss": 0.7885, + "step": 20845 + }, + { + "epoch": 0.5352668261862463, + "grad_norm": 0.84375, + "learning_rate": 0.00015475105935812186, + "loss": 0.8353, + "step": 20846 + }, + { + "epoch": 0.5352925033821682, + "grad_norm": 0.7265625, + "learning_rate": 0.00015474732366516913, + "loss": 0.8313, + "step": 20847 + }, + { + "epoch": 0.53531818057809, + "grad_norm": 0.8046875, + "learning_rate": 0.0001547435878631099, + "loss": 0.8021, + "step": 20848 + }, + { + "epoch": 0.5353438577740118, + "grad_norm": 0.74609375, + "learning_rate": 0.0001547398519519517, + "loss": 0.8646, + "step": 20849 + }, + { + "epoch": 0.5353695349699336, + "grad_norm": 0.765625, + "learning_rate": 0.00015473611593170192, + "loss": 0.7875, + "step": 20850 + }, + { + "epoch": 0.5353952121658554, + "grad_norm": 0.8359375, + "learning_rate": 0.00015473237980236798, + "loss": 0.8829, + "step": 20851 + }, + { + "epoch": 0.5354208893617772, + "grad_norm": 0.796875, + "learning_rate": 0.00015472864356395736, + "loss": 0.9712, + "step": 20852 + }, + { + "epoch": 0.5354465665576991, + "grad_norm": 0.765625, + "learning_rate": 0.00015472490721647747, + "loss": 0.766, + "step": 20853 + }, + { + "epoch": 0.5354722437536209, + "grad_norm": 0.72265625, + "learning_rate": 0.0001547211707599358, + "loss": 0.7529, + "step": 20854 + }, + { + "epoch": 0.5354979209495427, + "grad_norm": 0.83203125, + "learning_rate": 0.00015471743419433976, + "loss": 0.9071, + "step": 20855 + }, + { + "epoch": 0.5355235981454646, + "grad_norm": 0.78125, + "learning_rate": 0.00015471369751969688, + "loss": 0.7046, + "step": 20856 + }, + { + "epoch": 0.5355492753413863, + "grad_norm": 0.78515625, + "learning_rate": 0.0001547099607360145, + "loss": 0.7882, + "step": 20857 + }, + { + "epoch": 0.5355749525373081, + "grad_norm": 0.765625, + "learning_rate": 0.0001547062238433001, + "loss": 0.9125, + "step": 20858 + }, + { + "epoch": 0.53560062973323, + "grad_norm": 0.859375, + "learning_rate": 0.00015470248684156115, + "loss": 0.8014, + "step": 20859 + }, + { + "epoch": 0.5356263069291518, + "grad_norm": 0.765625, + "learning_rate": 0.0001546987497308051, + "loss": 0.9139, + "step": 20860 + }, + { + "epoch": 0.5356519841250736, + "grad_norm": 0.76953125, + "learning_rate": 0.00015469501251103935, + "loss": 0.8096, + "step": 20861 + }, + { + "epoch": 0.5356776613209955, + "grad_norm": 0.76953125, + "learning_rate": 0.0001546912751822714, + "loss": 0.8568, + "step": 20862 + }, + { + "epoch": 0.5357033385169172, + "grad_norm": 0.765625, + "learning_rate": 0.00015468753774450868, + "loss": 0.9645, + "step": 20863 + }, + { + "epoch": 0.535729015712839, + "grad_norm": 0.7734375, + "learning_rate": 0.00015468380019775864, + "loss": 0.9864, + "step": 20864 + }, + { + "epoch": 0.5357546929087609, + "grad_norm": 0.8203125, + "learning_rate": 0.00015468006254202871, + "loss": 0.9454, + "step": 20865 + }, + { + "epoch": 0.5357803701046827, + "grad_norm": 0.78515625, + "learning_rate": 0.00015467632477732636, + "loss": 0.9537, + "step": 20866 + }, + { + "epoch": 0.5358060473006045, + "grad_norm": 0.67578125, + "learning_rate": 0.00015467258690365904, + "loss": 0.642, + "step": 20867 + }, + { + "epoch": 0.5358317244965264, + "grad_norm": 0.7421875, + "learning_rate": 0.0001546688489210342, + "loss": 0.8644, + "step": 20868 + }, + { + "epoch": 0.5358574016924482, + "grad_norm": 0.8515625, + "learning_rate": 0.00015466511082945928, + "loss": 0.985, + "step": 20869 + }, + { + "epoch": 0.53588307888837, + "grad_norm": 0.75390625, + "learning_rate": 0.00015466137262894173, + "loss": 0.7915, + "step": 20870 + }, + { + "epoch": 0.5359087560842918, + "grad_norm": 0.94140625, + "learning_rate": 0.000154657634319489, + "loss": 0.9836, + "step": 20871 + }, + { + "epoch": 0.5359344332802136, + "grad_norm": 0.8203125, + "learning_rate": 0.00015465389590110852, + "loss": 0.8215, + "step": 20872 + }, + { + "epoch": 0.5359601104761355, + "grad_norm": 0.76171875, + "learning_rate": 0.0001546501573738078, + "loss": 0.9579, + "step": 20873 + }, + { + "epoch": 0.5359857876720573, + "grad_norm": 0.78125, + "learning_rate": 0.00015464641873759422, + "loss": 0.7696, + "step": 20874 + }, + { + "epoch": 0.5360114648679791, + "grad_norm": 0.80859375, + "learning_rate": 0.00015464267999247525, + "loss": 0.9687, + "step": 20875 + }, + { + "epoch": 0.536037142063901, + "grad_norm": 0.75, + "learning_rate": 0.00015463894113845842, + "loss": 0.977, + "step": 20876 + }, + { + "epoch": 0.5360628192598227, + "grad_norm": 0.734375, + "learning_rate": 0.00015463520217555107, + "loss": 0.8374, + "step": 20877 + }, + { + "epoch": 0.5360884964557445, + "grad_norm": 0.72265625, + "learning_rate": 0.0001546314631037607, + "loss": 0.8589, + "step": 20878 + }, + { + "epoch": 0.5361141736516664, + "grad_norm": 0.71484375, + "learning_rate": 0.00015462772392309476, + "loss": 0.8301, + "step": 20879 + }, + { + "epoch": 0.5361398508475882, + "grad_norm": 0.8125, + "learning_rate": 0.00015462398463356072, + "loss": 0.899, + "step": 20880 + }, + { + "epoch": 0.53616552804351, + "grad_norm": 0.84765625, + "learning_rate": 0.00015462024523516598, + "loss": 0.7932, + "step": 20881 + }, + { + "epoch": 0.5361912052394319, + "grad_norm": 0.75, + "learning_rate": 0.00015461650572791804, + "loss": 0.7927, + "step": 20882 + }, + { + "epoch": 0.5362168824353536, + "grad_norm": 0.796875, + "learning_rate": 0.00015461276611182436, + "loss": 0.9274, + "step": 20883 + }, + { + "epoch": 0.5362425596312754, + "grad_norm": 0.76953125, + "learning_rate": 0.00015460902638689237, + "loss": 0.9082, + "step": 20884 + }, + { + "epoch": 0.5362682368271973, + "grad_norm": 0.7421875, + "learning_rate": 0.00015460528655312947, + "loss": 0.7624, + "step": 20885 + }, + { + "epoch": 0.5362939140231191, + "grad_norm": 0.7421875, + "learning_rate": 0.0001546015466105432, + "loss": 0.7452, + "step": 20886 + }, + { + "epoch": 0.5363195912190409, + "grad_norm": 0.77734375, + "learning_rate": 0.000154597806559141, + "loss": 0.8084, + "step": 20887 + }, + { + "epoch": 0.5363452684149628, + "grad_norm": 0.734375, + "learning_rate": 0.00015459406639893025, + "loss": 0.871, + "step": 20888 + }, + { + "epoch": 0.5363709456108846, + "grad_norm": 0.69921875, + "learning_rate": 0.00015459032612991853, + "loss": 0.9271, + "step": 20889 + }, + { + "epoch": 0.5363966228068063, + "grad_norm": 0.80078125, + "learning_rate": 0.00015458658575211317, + "loss": 0.9314, + "step": 20890 + }, + { + "epoch": 0.5364223000027282, + "grad_norm": 0.85546875, + "learning_rate": 0.0001545828452655217, + "loss": 0.8698, + "step": 20891 + }, + { + "epoch": 0.53644797719865, + "grad_norm": 0.765625, + "learning_rate": 0.00015457910467015152, + "loss": 0.8385, + "step": 20892 + }, + { + "epoch": 0.5364736543945718, + "grad_norm": 0.86328125, + "learning_rate": 0.00015457536396601014, + "loss": 0.8892, + "step": 20893 + }, + { + "epoch": 0.5364993315904937, + "grad_norm": 0.875, + "learning_rate": 0.00015457162315310499, + "loss": 1.0273, + "step": 20894 + }, + { + "epoch": 0.5365250087864155, + "grad_norm": 0.85546875, + "learning_rate": 0.00015456788223144348, + "loss": 0.8105, + "step": 20895 + }, + { + "epoch": 0.5365506859823373, + "grad_norm": 0.890625, + "learning_rate": 0.00015456414120103317, + "loss": 0.8784, + "step": 20896 + }, + { + "epoch": 0.5365763631782591, + "grad_norm": 0.73046875, + "learning_rate": 0.00015456040006188145, + "loss": 0.8026, + "step": 20897 + }, + { + "epoch": 0.5366020403741809, + "grad_norm": 0.83203125, + "learning_rate": 0.00015455665881399574, + "loss": 0.9281, + "step": 20898 + }, + { + "epoch": 0.5366277175701027, + "grad_norm": 0.765625, + "learning_rate": 0.00015455291745738355, + "loss": 0.8806, + "step": 20899 + }, + { + "epoch": 0.5366533947660246, + "grad_norm": 0.828125, + "learning_rate": 0.00015454917599205234, + "loss": 0.9292, + "step": 20900 + }, + { + "epoch": 0.5366790719619464, + "grad_norm": 0.7890625, + "learning_rate": 0.00015454543441800956, + "loss": 0.9494, + "step": 20901 + }, + { + "epoch": 0.5367047491578683, + "grad_norm": 0.80078125, + "learning_rate": 0.0001545416927352626, + "loss": 0.9284, + "step": 20902 + }, + { + "epoch": 0.53673042635379, + "grad_norm": 0.7734375, + "learning_rate": 0.00015453795094381905, + "loss": 0.8787, + "step": 20903 + }, + { + "epoch": 0.5367561035497118, + "grad_norm": 0.7265625, + "learning_rate": 0.00015453420904368623, + "loss": 0.9056, + "step": 20904 + }, + { + "epoch": 0.5367817807456337, + "grad_norm": 0.70703125, + "learning_rate": 0.00015453046703487169, + "loss": 0.7635, + "step": 20905 + }, + { + "epoch": 0.5368074579415555, + "grad_norm": 0.7421875, + "learning_rate": 0.0001545267249173828, + "loss": 0.8092, + "step": 20906 + }, + { + "epoch": 0.5368331351374773, + "grad_norm": 0.80078125, + "learning_rate": 0.00015452298269122715, + "loss": 0.8991, + "step": 20907 + }, + { + "epoch": 0.5368588123333992, + "grad_norm": 0.76171875, + "learning_rate": 0.0001545192403564121, + "loss": 0.7684, + "step": 20908 + }, + { + "epoch": 0.536884489529321, + "grad_norm": 0.796875, + "learning_rate": 0.0001545154979129451, + "loss": 0.8338, + "step": 20909 + }, + { + "epoch": 0.5369101667252427, + "grad_norm": 0.8125, + "learning_rate": 0.00015451175536083367, + "loss": 0.9107, + "step": 20910 + }, + { + "epoch": 0.5369358439211646, + "grad_norm": 0.7578125, + "learning_rate": 0.0001545080127000852, + "loss": 0.8354, + "step": 20911 + }, + { + "epoch": 0.5369615211170864, + "grad_norm": 0.80078125, + "learning_rate": 0.0001545042699307072, + "loss": 0.8845, + "step": 20912 + }, + { + "epoch": 0.5369871983130082, + "grad_norm": 0.71875, + "learning_rate": 0.00015450052705270713, + "loss": 0.9342, + "step": 20913 + }, + { + "epoch": 0.5370128755089301, + "grad_norm": 0.7890625, + "learning_rate": 0.0001544967840660924, + "loss": 1.0777, + "step": 20914 + }, + { + "epoch": 0.5370385527048519, + "grad_norm": 0.84765625, + "learning_rate": 0.0001544930409708705, + "loss": 0.9867, + "step": 20915 + }, + { + "epoch": 0.5370642299007737, + "grad_norm": 0.87109375, + "learning_rate": 0.00015448929776704895, + "loss": 0.8698, + "step": 20916 + }, + { + "epoch": 0.5370899070966955, + "grad_norm": 0.77734375, + "learning_rate": 0.00015448555445463514, + "loss": 0.8666, + "step": 20917 + }, + { + "epoch": 0.5371155842926173, + "grad_norm": 0.8671875, + "learning_rate": 0.00015448181103363648, + "loss": 0.9343, + "step": 20918 + }, + { + "epoch": 0.5371412614885391, + "grad_norm": 0.78515625, + "learning_rate": 0.00015447806750406052, + "loss": 0.9945, + "step": 20919 + }, + { + "epoch": 0.537166938684461, + "grad_norm": 0.80078125, + "learning_rate": 0.0001544743238659147, + "loss": 0.7972, + "step": 20920 + }, + { + "epoch": 0.5371926158803828, + "grad_norm": 0.74609375, + "learning_rate": 0.00015447058011920647, + "loss": 0.7823, + "step": 20921 + }, + { + "epoch": 0.5372182930763046, + "grad_norm": 0.78125, + "learning_rate": 0.00015446683626394331, + "loss": 0.7898, + "step": 20922 + }, + { + "epoch": 0.5372439702722264, + "grad_norm": 0.71484375, + "learning_rate": 0.00015446309230013268, + "loss": 0.8918, + "step": 20923 + }, + { + "epoch": 0.5372696474681482, + "grad_norm": 0.80078125, + "learning_rate": 0.000154459348227782, + "loss": 0.8591, + "step": 20924 + }, + { + "epoch": 0.53729532466407, + "grad_norm": 0.796875, + "learning_rate": 0.00015445560404689875, + "loss": 0.8955, + "step": 20925 + }, + { + "epoch": 0.5373210018599919, + "grad_norm": 0.7578125, + "learning_rate": 0.00015445185975749038, + "loss": 0.8706, + "step": 20926 + }, + { + "epoch": 0.5373466790559137, + "grad_norm": 0.90625, + "learning_rate": 0.0001544481153595644, + "loss": 1.0065, + "step": 20927 + }, + { + "epoch": 0.5373723562518355, + "grad_norm": 0.79296875, + "learning_rate": 0.00015444437085312824, + "loss": 0.7767, + "step": 20928 + }, + { + "epoch": 0.5373980334477574, + "grad_norm": 0.671875, + "learning_rate": 0.0001544406262381894, + "loss": 0.7959, + "step": 20929 + }, + { + "epoch": 0.5374237106436791, + "grad_norm": 0.7421875, + "learning_rate": 0.00015443688151475527, + "loss": 0.9112, + "step": 20930 + }, + { + "epoch": 0.537449387839601, + "grad_norm": 0.7421875, + "learning_rate": 0.00015443313668283336, + "loss": 0.9134, + "step": 20931 + }, + { + "epoch": 0.5374750650355228, + "grad_norm": 0.78515625, + "learning_rate": 0.00015442939174243115, + "loss": 0.815, + "step": 20932 + }, + { + "epoch": 0.5375007422314446, + "grad_norm": 0.71484375, + "learning_rate": 0.00015442564669355605, + "loss": 0.8717, + "step": 20933 + }, + { + "epoch": 0.5375264194273665, + "grad_norm": 0.7734375, + "learning_rate": 0.00015442190153621554, + "loss": 0.7637, + "step": 20934 + }, + { + "epoch": 0.5375520966232883, + "grad_norm": 0.8359375, + "learning_rate": 0.00015441815627041712, + "loss": 0.8924, + "step": 20935 + }, + { + "epoch": 0.5375777738192101, + "grad_norm": 0.734375, + "learning_rate": 0.0001544144108961682, + "loss": 0.7971, + "step": 20936 + }, + { + "epoch": 0.5376034510151319, + "grad_norm": 1.0546875, + "learning_rate": 0.00015441066541347633, + "loss": 0.8612, + "step": 20937 + }, + { + "epoch": 0.5376291282110537, + "grad_norm": 0.90234375, + "learning_rate": 0.00015440691982234887, + "loss": 0.959, + "step": 20938 + }, + { + "epoch": 0.5376548054069755, + "grad_norm": 0.76953125, + "learning_rate": 0.00015440317412279336, + "loss": 0.9087, + "step": 20939 + }, + { + "epoch": 0.5376804826028974, + "grad_norm": 0.7734375, + "learning_rate": 0.00015439942831481723, + "loss": 0.8434, + "step": 20940 + }, + { + "epoch": 0.5377061597988192, + "grad_norm": 0.81640625, + "learning_rate": 0.00015439568239842795, + "loss": 0.9749, + "step": 20941 + }, + { + "epoch": 0.537731836994741, + "grad_norm": 0.81640625, + "learning_rate": 0.00015439193637363302, + "loss": 0.964, + "step": 20942 + }, + { + "epoch": 0.5377575141906628, + "grad_norm": 0.828125, + "learning_rate": 0.00015438819024043982, + "loss": 0.9473, + "step": 20943 + }, + { + "epoch": 0.5377831913865846, + "grad_norm": 0.859375, + "learning_rate": 0.0001543844439988559, + "loss": 1.0628, + "step": 20944 + }, + { + "epoch": 0.5378088685825064, + "grad_norm": 0.78125, + "learning_rate": 0.0001543806976488887, + "loss": 0.9148, + "step": 20945 + }, + { + "epoch": 0.5378345457784283, + "grad_norm": 0.96484375, + "learning_rate": 0.00015437695119054567, + "loss": 0.8767, + "step": 20946 + }, + { + "epoch": 0.5378602229743501, + "grad_norm": 0.765625, + "learning_rate": 0.00015437320462383428, + "loss": 0.8706, + "step": 20947 + }, + { + "epoch": 0.5378859001702719, + "grad_norm": 0.75, + "learning_rate": 0.000154369457948762, + "loss": 0.762, + "step": 20948 + }, + { + "epoch": 0.5379115773661938, + "grad_norm": 0.734375, + "learning_rate": 0.00015436571116533637, + "loss": 0.9806, + "step": 20949 + }, + { + "epoch": 0.5379372545621155, + "grad_norm": 0.75, + "learning_rate": 0.00015436196427356472, + "loss": 0.8117, + "step": 20950 + }, + { + "epoch": 0.5379629317580373, + "grad_norm": 0.7890625, + "learning_rate": 0.0001543582172734546, + "loss": 0.8062, + "step": 20951 + }, + { + "epoch": 0.5379886089539592, + "grad_norm": 0.78125, + "learning_rate": 0.00015435447016501344, + "loss": 0.8699, + "step": 20952 + }, + { + "epoch": 0.538014286149881, + "grad_norm": 0.82421875, + "learning_rate": 0.00015435072294824878, + "loss": 0.8825, + "step": 20953 + }, + { + "epoch": 0.5380399633458028, + "grad_norm": 0.7109375, + "learning_rate": 0.000154346975623168, + "loss": 0.8058, + "step": 20954 + }, + { + "epoch": 0.5380656405417247, + "grad_norm": 0.74609375, + "learning_rate": 0.00015434322818977863, + "loss": 0.8851, + "step": 20955 + }, + { + "epoch": 0.5380913177376465, + "grad_norm": 0.78515625, + "learning_rate": 0.0001543394806480881, + "loss": 0.889, + "step": 20956 + }, + { + "epoch": 0.5381169949335682, + "grad_norm": 0.80078125, + "learning_rate": 0.00015433573299810388, + "loss": 1.0633, + "step": 20957 + }, + { + "epoch": 0.5381426721294901, + "grad_norm": 0.77734375, + "learning_rate": 0.0001543319852398335, + "loss": 0.9145, + "step": 20958 + }, + { + "epoch": 0.5381683493254119, + "grad_norm": 0.7109375, + "learning_rate": 0.00015432823737328435, + "loss": 0.8231, + "step": 20959 + }, + { + "epoch": 0.5381940265213337, + "grad_norm": 0.7890625, + "learning_rate": 0.00015432448939846393, + "loss": 0.9007, + "step": 20960 + }, + { + "epoch": 0.5382197037172556, + "grad_norm": 0.7890625, + "learning_rate": 0.00015432074131537971, + "loss": 0.8029, + "step": 20961 + }, + { + "epoch": 0.5382453809131774, + "grad_norm": 0.81640625, + "learning_rate": 0.0001543169931240392, + "loss": 0.9901, + "step": 20962 + }, + { + "epoch": 0.5382710581090991, + "grad_norm": 0.7421875, + "learning_rate": 0.00015431324482444975, + "loss": 0.9724, + "step": 20963 + }, + { + "epoch": 0.538296735305021, + "grad_norm": 0.87109375, + "learning_rate": 0.00015430949641661896, + "loss": 0.8547, + "step": 20964 + }, + { + "epoch": 0.5383224125009428, + "grad_norm": 0.75, + "learning_rate": 0.00015430574790055424, + "loss": 0.9452, + "step": 20965 + }, + { + "epoch": 0.5383480896968647, + "grad_norm": 0.75, + "learning_rate": 0.00015430199927626305, + "loss": 0.901, + "step": 20966 + }, + { + "epoch": 0.5383737668927865, + "grad_norm": 0.8046875, + "learning_rate": 0.0001542982505437529, + "loss": 0.8404, + "step": 20967 + }, + { + "epoch": 0.5383994440887083, + "grad_norm": 0.69921875, + "learning_rate": 0.00015429450170303128, + "loss": 0.9647, + "step": 20968 + }, + { + "epoch": 0.5384251212846302, + "grad_norm": 0.765625, + "learning_rate": 0.00015429075275410556, + "loss": 0.8328, + "step": 20969 + }, + { + "epoch": 0.5384507984805519, + "grad_norm": 0.76953125, + "learning_rate": 0.00015428700369698333, + "loss": 0.8174, + "step": 20970 + }, + { + "epoch": 0.5384764756764737, + "grad_norm": 0.74609375, + "learning_rate": 0.00015428325453167196, + "loss": 0.8834, + "step": 20971 + }, + { + "epoch": 0.5385021528723956, + "grad_norm": 0.73046875, + "learning_rate": 0.00015427950525817897, + "loss": 0.8318, + "step": 20972 + }, + { + "epoch": 0.5385278300683174, + "grad_norm": 0.8203125, + "learning_rate": 0.00015427575587651182, + "loss": 0.9911, + "step": 20973 + }, + { + "epoch": 0.5385535072642392, + "grad_norm": 0.8359375, + "learning_rate": 0.00015427200638667803, + "loss": 0.9027, + "step": 20974 + }, + { + "epoch": 0.5385791844601611, + "grad_norm": 0.7890625, + "learning_rate": 0.000154268256788685, + "loss": 0.7324, + "step": 20975 + }, + { + "epoch": 0.5386048616560829, + "grad_norm": 0.76953125, + "learning_rate": 0.00015426450708254028, + "loss": 0.9135, + "step": 20976 + }, + { + "epoch": 0.5386305388520046, + "grad_norm": 0.81640625, + "learning_rate": 0.00015426075726825126, + "loss": 0.9324, + "step": 20977 + }, + { + "epoch": 0.5386562160479265, + "grad_norm": 0.84765625, + "learning_rate": 0.00015425700734582546, + "loss": 0.9445, + "step": 20978 + }, + { + "epoch": 0.5386818932438483, + "grad_norm": 0.765625, + "learning_rate": 0.00015425325731527033, + "loss": 0.8899, + "step": 20979 + }, + { + "epoch": 0.5387075704397701, + "grad_norm": 0.8046875, + "learning_rate": 0.00015424950717659339, + "loss": 0.952, + "step": 20980 + }, + { + "epoch": 0.538733247635692, + "grad_norm": 0.765625, + "learning_rate": 0.00015424575692980208, + "loss": 0.8458, + "step": 20981 + }, + { + "epoch": 0.5387589248316138, + "grad_norm": 0.796875, + "learning_rate": 0.00015424200657490386, + "loss": 0.7843, + "step": 20982 + }, + { + "epoch": 0.5387846020275355, + "grad_norm": 0.76171875, + "learning_rate": 0.00015423825611190624, + "loss": 1.0267, + "step": 20983 + }, + { + "epoch": 0.5388102792234574, + "grad_norm": 0.796875, + "learning_rate": 0.00015423450554081662, + "loss": 0.8919, + "step": 20984 + }, + { + "epoch": 0.5388359564193792, + "grad_norm": 0.76953125, + "learning_rate": 0.0001542307548616426, + "loss": 0.8848, + "step": 20985 + }, + { + "epoch": 0.538861633615301, + "grad_norm": 0.8046875, + "learning_rate": 0.00015422700407439154, + "loss": 0.8318, + "step": 20986 + }, + { + "epoch": 0.5388873108112229, + "grad_norm": 0.8046875, + "learning_rate": 0.00015422325317907096, + "loss": 0.8626, + "step": 20987 + }, + { + "epoch": 0.5389129880071447, + "grad_norm": 0.82421875, + "learning_rate": 0.00015421950217568836, + "loss": 1.0942, + "step": 20988 + }, + { + "epoch": 0.5389386652030665, + "grad_norm": 0.77734375, + "learning_rate": 0.00015421575106425118, + "loss": 0.9535, + "step": 20989 + }, + { + "epoch": 0.5389643423989883, + "grad_norm": 0.7421875, + "learning_rate": 0.00015421199984476692, + "loss": 0.8091, + "step": 20990 + }, + { + "epoch": 0.5389900195949101, + "grad_norm": 0.703125, + "learning_rate": 0.000154208248517243, + "loss": 0.8262, + "step": 20991 + }, + { + "epoch": 0.5390156967908319, + "grad_norm": 0.73828125, + "learning_rate": 0.00015420449708168697, + "loss": 0.755, + "step": 20992 + }, + { + "epoch": 0.5390413739867538, + "grad_norm": 0.7734375, + "learning_rate": 0.00015420074553810627, + "loss": 0.828, + "step": 20993 + }, + { + "epoch": 0.5390670511826756, + "grad_norm": 0.75390625, + "learning_rate": 0.00015419699388650836, + "loss": 0.8397, + "step": 20994 + }, + { + "epoch": 0.5390927283785975, + "grad_norm": 0.765625, + "learning_rate": 0.00015419324212690078, + "loss": 0.8304, + "step": 20995 + }, + { + "epoch": 0.5391184055745193, + "grad_norm": 0.79296875, + "learning_rate": 0.00015418949025929092, + "loss": 0.9304, + "step": 20996 + }, + { + "epoch": 0.539144082770441, + "grad_norm": 0.8515625, + "learning_rate": 0.0001541857382836863, + "loss": 0.7664, + "step": 20997 + }, + { + "epoch": 0.5391697599663628, + "grad_norm": 0.82421875, + "learning_rate": 0.00015418198620009444, + "loss": 0.8932, + "step": 20998 + }, + { + "epoch": 0.5391954371622847, + "grad_norm": 0.80078125, + "learning_rate": 0.00015417823400852272, + "loss": 0.8884, + "step": 20999 + }, + { + "epoch": 0.5392211143582065, + "grad_norm": 0.75390625, + "learning_rate": 0.0001541744817089787, + "loss": 0.817, + "step": 21000 + }, + { + "epoch": 0.5392211143582065, + "eval_loss": 0.8762781023979187, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 409.2644, + "eval_samples_per_second": 24.434, + "eval_steps_per_second": 0.765, + "step": 21000 + }, + { + "epoch": 0.5392467915541284, + "grad_norm": 0.86328125, + "learning_rate": 0.00015417072930146985, + "loss": 0.8817, + "step": 21001 + }, + { + "epoch": 0.5392724687500502, + "grad_norm": 0.7578125, + "learning_rate": 0.00015416697678600363, + "loss": 0.9497, + "step": 21002 + }, + { + "epoch": 0.5392981459459719, + "grad_norm": 0.796875, + "learning_rate": 0.0001541632241625875, + "loss": 1.0918, + "step": 21003 + }, + { + "epoch": 0.5393238231418938, + "grad_norm": 0.734375, + "learning_rate": 0.00015415947143122895, + "loss": 0.9293, + "step": 21004 + }, + { + "epoch": 0.5393495003378156, + "grad_norm": 0.80078125, + "learning_rate": 0.00015415571859193547, + "loss": 0.8585, + "step": 21005 + }, + { + "epoch": 0.5393751775337374, + "grad_norm": 0.77734375, + "learning_rate": 0.00015415196564471458, + "loss": 0.9171, + "step": 21006 + }, + { + "epoch": 0.5394008547296593, + "grad_norm": 0.8359375, + "learning_rate": 0.00015414821258957363, + "loss": 0.8619, + "step": 21007 + }, + { + "epoch": 0.5394265319255811, + "grad_norm": 0.98828125, + "learning_rate": 0.00015414445942652026, + "loss": 0.8109, + "step": 21008 + }, + { + "epoch": 0.5394522091215029, + "grad_norm": 0.7578125, + "learning_rate": 0.00015414070615556185, + "loss": 1.016, + "step": 21009 + }, + { + "epoch": 0.5394778863174247, + "grad_norm": 0.88671875, + "learning_rate": 0.00015413695277670587, + "loss": 0.9347, + "step": 21010 + }, + { + "epoch": 0.5395035635133465, + "grad_norm": 0.80859375, + "learning_rate": 0.00015413319928995987, + "loss": 0.8719, + "step": 21011 + }, + { + "epoch": 0.5395292407092683, + "grad_norm": 0.76171875, + "learning_rate": 0.00015412944569533128, + "loss": 0.8366, + "step": 21012 + }, + { + "epoch": 0.5395549179051902, + "grad_norm": 0.9609375, + "learning_rate": 0.0001541256919928276, + "loss": 0.8151, + "step": 21013 + }, + { + "epoch": 0.539580595101112, + "grad_norm": 0.765625, + "learning_rate": 0.0001541219381824563, + "loss": 0.8264, + "step": 21014 + }, + { + "epoch": 0.5396062722970338, + "grad_norm": 0.79296875, + "learning_rate": 0.00015411818426422485, + "loss": 1.0383, + "step": 21015 + }, + { + "epoch": 0.5396319494929557, + "grad_norm": 0.85546875, + "learning_rate": 0.00015411443023814075, + "loss": 0.8893, + "step": 21016 + }, + { + "epoch": 0.5396576266888774, + "grad_norm": 0.7890625, + "learning_rate": 0.00015411067610421152, + "loss": 0.8305, + "step": 21017 + }, + { + "epoch": 0.5396833038847992, + "grad_norm": 1.0703125, + "learning_rate": 0.00015410692186244455, + "loss": 1.0092, + "step": 21018 + }, + { + "epoch": 0.5397089810807211, + "grad_norm": 0.765625, + "learning_rate": 0.0001541031675128474, + "loss": 0.8072, + "step": 21019 + }, + { + "epoch": 0.5397346582766429, + "grad_norm": 0.85546875, + "learning_rate": 0.00015409941305542752, + "loss": 0.8469, + "step": 21020 + }, + { + "epoch": 0.5397603354725647, + "grad_norm": 0.78515625, + "learning_rate": 0.0001540956584901924, + "loss": 0.8393, + "step": 21021 + }, + { + "epoch": 0.5397860126684866, + "grad_norm": 0.7265625, + "learning_rate": 0.0001540919038171495, + "loss": 0.8519, + "step": 21022 + }, + { + "epoch": 0.5398116898644083, + "grad_norm": 0.75, + "learning_rate": 0.00015408814903630636, + "loss": 0.8535, + "step": 21023 + }, + { + "epoch": 0.5398373670603301, + "grad_norm": 0.75390625, + "learning_rate": 0.0001540843941476704, + "loss": 0.9027, + "step": 21024 + }, + { + "epoch": 0.539863044256252, + "grad_norm": 0.765625, + "learning_rate": 0.00015408063915124914, + "loss": 0.8473, + "step": 21025 + }, + { + "epoch": 0.5398887214521738, + "grad_norm": 0.82421875, + "learning_rate": 0.00015407688404705004, + "loss": 0.9213, + "step": 21026 + }, + { + "epoch": 0.5399143986480957, + "grad_norm": 0.78515625, + "learning_rate": 0.00015407312883508056, + "loss": 1.0015, + "step": 21027 + }, + { + "epoch": 0.5399400758440175, + "grad_norm": 0.84765625, + "learning_rate": 0.00015406937351534827, + "loss": 0.8699, + "step": 21028 + }, + { + "epoch": 0.5399657530399393, + "grad_norm": 0.76171875, + "learning_rate": 0.0001540656180878606, + "loss": 0.9774, + "step": 21029 + }, + { + "epoch": 0.539991430235861, + "grad_norm": 0.76171875, + "learning_rate": 0.00015406186255262502, + "loss": 0.8036, + "step": 21030 + }, + { + "epoch": 0.5400171074317829, + "grad_norm": 0.734375, + "learning_rate": 0.00015405810690964904, + "loss": 1.0175, + "step": 21031 + }, + { + "epoch": 0.5400427846277047, + "grad_norm": 0.7421875, + "learning_rate": 0.0001540543511589401, + "loss": 0.8725, + "step": 21032 + }, + { + "epoch": 0.5400684618236266, + "grad_norm": 0.71875, + "learning_rate": 0.00015405059530050575, + "loss": 0.813, + "step": 21033 + }, + { + "epoch": 0.5400941390195484, + "grad_norm": 0.8671875, + "learning_rate": 0.00015404683933435348, + "loss": 0.8903, + "step": 21034 + }, + { + "epoch": 0.5401198162154702, + "grad_norm": 0.796875, + "learning_rate": 0.0001540430832604907, + "loss": 0.8283, + "step": 21035 + }, + { + "epoch": 0.5401454934113921, + "grad_norm": 0.8203125, + "learning_rate": 0.00015403932707892495, + "loss": 0.9269, + "step": 21036 + }, + { + "epoch": 0.5401711706073138, + "grad_norm": 0.7421875, + "learning_rate": 0.00015403557078966367, + "loss": 0.8277, + "step": 21037 + }, + { + "epoch": 0.5401968478032356, + "grad_norm": 0.80078125, + "learning_rate": 0.00015403181439271443, + "loss": 0.8093, + "step": 21038 + }, + { + "epoch": 0.5402225249991575, + "grad_norm": 0.84375, + "learning_rate": 0.00015402805788808464, + "loss": 0.935, + "step": 21039 + }, + { + "epoch": 0.5402482021950793, + "grad_norm": 0.73046875, + "learning_rate": 0.0001540243012757818, + "loss": 0.8491, + "step": 21040 + }, + { + "epoch": 0.5402738793910011, + "grad_norm": 0.71484375, + "learning_rate": 0.00015402054455581344, + "loss": 0.8391, + "step": 21041 + }, + { + "epoch": 0.540299556586923, + "grad_norm": 0.859375, + "learning_rate": 0.00015401678772818697, + "loss": 1.0084, + "step": 21042 + }, + { + "epoch": 0.5403252337828447, + "grad_norm": 0.84375, + "learning_rate": 0.00015401303079290996, + "loss": 0.7861, + "step": 21043 + }, + { + "epoch": 0.5403509109787665, + "grad_norm": 0.78515625, + "learning_rate": 0.00015400927374998983, + "loss": 0.8915, + "step": 21044 + }, + { + "epoch": 0.5403765881746884, + "grad_norm": 0.8125, + "learning_rate": 0.00015400551659943413, + "loss": 1.0364, + "step": 21045 + }, + { + "epoch": 0.5404022653706102, + "grad_norm": 0.77734375, + "learning_rate": 0.0001540017593412503, + "loss": 0.867, + "step": 21046 + }, + { + "epoch": 0.540427942566532, + "grad_norm": 0.76953125, + "learning_rate": 0.00015399800197544582, + "loss": 0.8654, + "step": 21047 + }, + { + "epoch": 0.5404536197624539, + "grad_norm": 0.76953125, + "learning_rate": 0.00015399424450202823, + "loss": 0.9413, + "step": 21048 + }, + { + "epoch": 0.5404792969583757, + "grad_norm": 0.7265625, + "learning_rate": 0.00015399048692100496, + "loss": 0.7439, + "step": 21049 + }, + { + "epoch": 0.5405049741542974, + "grad_norm": 0.71875, + "learning_rate": 0.00015398672923238354, + "loss": 0.7379, + "step": 21050 + }, + { + "epoch": 0.5405306513502193, + "grad_norm": 0.79296875, + "learning_rate": 0.00015398297143617143, + "loss": 0.8959, + "step": 21051 + }, + { + "epoch": 0.5405563285461411, + "grad_norm": 0.75390625, + "learning_rate": 0.00015397921353237616, + "loss": 0.8883, + "step": 21052 + }, + { + "epoch": 0.5405820057420629, + "grad_norm": 0.74609375, + "learning_rate": 0.00015397545552100515, + "loss": 0.806, + "step": 21053 + }, + { + "epoch": 0.5406076829379848, + "grad_norm": 0.734375, + "learning_rate": 0.000153971697402066, + "loss": 0.862, + "step": 21054 + }, + { + "epoch": 0.5406333601339066, + "grad_norm": 0.7578125, + "learning_rate": 0.0001539679391755661, + "loss": 0.8724, + "step": 21055 + }, + { + "epoch": 0.5406590373298285, + "grad_norm": 0.75, + "learning_rate": 0.00015396418084151297, + "loss": 0.7814, + "step": 21056 + }, + { + "epoch": 0.5406847145257502, + "grad_norm": 0.76171875, + "learning_rate": 0.00015396042239991411, + "loss": 0.809, + "step": 21057 + }, + { + "epoch": 0.540710391721672, + "grad_norm": 0.7109375, + "learning_rate": 0.00015395666385077696, + "loss": 0.7481, + "step": 21058 + }, + { + "epoch": 0.5407360689175938, + "grad_norm": 0.7421875, + "learning_rate": 0.0001539529051941091, + "loss": 0.8411, + "step": 21059 + }, + { + "epoch": 0.5407617461135157, + "grad_norm": 0.77734375, + "learning_rate": 0.00015394914642991795, + "loss": 0.9348, + "step": 21060 + }, + { + "epoch": 0.5407874233094375, + "grad_norm": 0.7890625, + "learning_rate": 0.00015394538755821106, + "loss": 0.8221, + "step": 21061 + }, + { + "epoch": 0.5408131005053594, + "grad_norm": 0.7734375, + "learning_rate": 0.00015394162857899584, + "loss": 0.8843, + "step": 21062 + }, + { + "epoch": 0.5408387777012811, + "grad_norm": 0.7109375, + "learning_rate": 0.00015393786949227987, + "loss": 0.9236, + "step": 21063 + }, + { + "epoch": 0.5408644548972029, + "grad_norm": 0.72265625, + "learning_rate": 0.00015393411029807056, + "loss": 0.8836, + "step": 21064 + }, + { + "epoch": 0.5408901320931248, + "grad_norm": 0.77734375, + "learning_rate": 0.00015393035099637548, + "loss": 0.9711, + "step": 21065 + }, + { + "epoch": 0.5409158092890466, + "grad_norm": 0.76953125, + "learning_rate": 0.00015392659158720205, + "loss": 0.9605, + "step": 21066 + }, + { + "epoch": 0.5409414864849684, + "grad_norm": 0.7578125, + "learning_rate": 0.0001539228320705578, + "loss": 0.7983, + "step": 21067 + }, + { + "epoch": 0.5409671636808903, + "grad_norm": 0.8046875, + "learning_rate": 0.0001539190724464502, + "loss": 0.8341, + "step": 21068 + }, + { + "epoch": 0.5409928408768121, + "grad_norm": 0.828125, + "learning_rate": 0.00015391531271488677, + "loss": 0.8533, + "step": 21069 + }, + { + "epoch": 0.5410185180727338, + "grad_norm": 0.7578125, + "learning_rate": 0.00015391155287587503, + "loss": 0.903, + "step": 21070 + }, + { + "epoch": 0.5410441952686557, + "grad_norm": 0.796875, + "learning_rate": 0.00015390779292942237, + "loss": 0.9244, + "step": 21071 + }, + { + "epoch": 0.5410698724645775, + "grad_norm": 0.78515625, + "learning_rate": 0.0001539040328755364, + "loss": 0.8215, + "step": 21072 + }, + { + "epoch": 0.5410955496604993, + "grad_norm": 0.7890625, + "learning_rate": 0.00015390027271422455, + "loss": 1.0312, + "step": 21073 + }, + { + "epoch": 0.5411212268564212, + "grad_norm": 0.828125, + "learning_rate": 0.0001538965124454943, + "loss": 0.8508, + "step": 21074 + }, + { + "epoch": 0.541146904052343, + "grad_norm": 0.796875, + "learning_rate": 0.0001538927520693532, + "loss": 0.9042, + "step": 21075 + }, + { + "epoch": 0.5411725812482647, + "grad_norm": 0.80859375, + "learning_rate": 0.0001538889915858087, + "loss": 0.7456, + "step": 21076 + }, + { + "epoch": 0.5411982584441866, + "grad_norm": 0.828125, + "learning_rate": 0.00015388523099486832, + "loss": 0.8971, + "step": 21077 + }, + { + "epoch": 0.5412239356401084, + "grad_norm": 0.6796875, + "learning_rate": 0.00015388147029653953, + "loss": 0.7593, + "step": 21078 + }, + { + "epoch": 0.5412496128360302, + "grad_norm": 0.7890625, + "learning_rate": 0.00015387770949082982, + "loss": 0.9246, + "step": 21079 + }, + { + "epoch": 0.5412752900319521, + "grad_norm": 0.796875, + "learning_rate": 0.00015387394857774674, + "loss": 0.9292, + "step": 21080 + }, + { + "epoch": 0.5413009672278739, + "grad_norm": 0.9375, + "learning_rate": 0.0001538701875572977, + "loss": 1.0856, + "step": 21081 + }, + { + "epoch": 0.5413266444237957, + "grad_norm": 0.75390625, + "learning_rate": 0.00015386642642949028, + "loss": 0.8868, + "step": 21082 + }, + { + "epoch": 0.5413523216197175, + "grad_norm": 0.796875, + "learning_rate": 0.00015386266519433193, + "loss": 0.892, + "step": 21083 + }, + { + "epoch": 0.5413779988156393, + "grad_norm": 0.80859375, + "learning_rate": 0.00015385890385183015, + "loss": 1.0303, + "step": 21084 + }, + { + "epoch": 0.5414036760115611, + "grad_norm": 0.734375, + "learning_rate": 0.00015385514240199243, + "loss": 0.868, + "step": 21085 + }, + { + "epoch": 0.541429353207483, + "grad_norm": 0.8125, + "learning_rate": 0.00015385138084482628, + "loss": 0.7713, + "step": 21086 + }, + { + "epoch": 0.5414550304034048, + "grad_norm": 0.6875, + "learning_rate": 0.0001538476191803392, + "loss": 0.7703, + "step": 21087 + }, + { + "epoch": 0.5414807075993266, + "grad_norm": 0.84765625, + "learning_rate": 0.0001538438574085387, + "loss": 0.8177, + "step": 21088 + }, + { + "epoch": 0.5415063847952485, + "grad_norm": 0.76953125, + "learning_rate": 0.00015384009552943222, + "loss": 0.8699, + "step": 21089 + }, + { + "epoch": 0.5415320619911702, + "grad_norm": 0.70703125, + "learning_rate": 0.00015383633354302729, + "loss": 0.9129, + "step": 21090 + }, + { + "epoch": 0.541557739187092, + "grad_norm": 0.83203125, + "learning_rate": 0.00015383257144933142, + "loss": 0.8506, + "step": 21091 + }, + { + "epoch": 0.5415834163830139, + "grad_norm": 0.78125, + "learning_rate": 0.0001538288092483521, + "loss": 0.8762, + "step": 21092 + }, + { + "epoch": 0.5416090935789357, + "grad_norm": 0.76171875, + "learning_rate": 0.00015382504694009684, + "loss": 0.908, + "step": 21093 + }, + { + "epoch": 0.5416347707748576, + "grad_norm": 0.6875, + "learning_rate": 0.00015382128452457315, + "loss": 0.758, + "step": 21094 + }, + { + "epoch": 0.5416604479707794, + "grad_norm": 0.8125, + "learning_rate": 0.00015381752200178841, + "loss": 0.8798, + "step": 21095 + }, + { + "epoch": 0.5416861251667011, + "grad_norm": 0.73828125, + "learning_rate": 0.0001538137593717503, + "loss": 0.9207, + "step": 21096 + }, + { + "epoch": 0.541711802362623, + "grad_norm": 0.78125, + "learning_rate": 0.0001538099966344662, + "loss": 0.932, + "step": 21097 + }, + { + "epoch": 0.5417374795585448, + "grad_norm": 0.80078125, + "learning_rate": 0.0001538062337899436, + "loss": 0.7607, + "step": 21098 + }, + { + "epoch": 0.5417631567544666, + "grad_norm": 0.76953125, + "learning_rate": 0.00015380247083819008, + "loss": 0.7937, + "step": 21099 + }, + { + "epoch": 0.5417888339503885, + "grad_norm": 0.875, + "learning_rate": 0.00015379870777921308, + "loss": 0.9464, + "step": 21100 + }, + { + "epoch": 0.5418145111463103, + "grad_norm": 0.8984375, + "learning_rate": 0.00015379494461302014, + "loss": 0.824, + "step": 21101 + }, + { + "epoch": 0.5418401883422321, + "grad_norm": 0.734375, + "learning_rate": 0.00015379118133961871, + "loss": 0.7017, + "step": 21102 + }, + { + "epoch": 0.5418658655381539, + "grad_norm": 0.7421875, + "learning_rate": 0.0001537874179590163, + "loss": 1.0158, + "step": 21103 + }, + { + "epoch": 0.5418915427340757, + "grad_norm": 0.78515625, + "learning_rate": 0.00015378365447122044, + "loss": 0.8196, + "step": 21104 + }, + { + "epoch": 0.5419172199299975, + "grad_norm": 0.77734375, + "learning_rate": 0.00015377989087623863, + "loss": 0.7587, + "step": 21105 + }, + { + "epoch": 0.5419428971259194, + "grad_norm": 0.73828125, + "learning_rate": 0.00015377612717407834, + "loss": 1.0308, + "step": 21106 + }, + { + "epoch": 0.5419685743218412, + "grad_norm": 0.76171875, + "learning_rate": 0.0001537723633647471, + "loss": 0.9031, + "step": 21107 + }, + { + "epoch": 0.541994251517763, + "grad_norm": 0.75390625, + "learning_rate": 0.00015376859944825236, + "loss": 0.8522, + "step": 21108 + }, + { + "epoch": 0.5420199287136849, + "grad_norm": 0.7421875, + "learning_rate": 0.0001537648354246017, + "loss": 0.8429, + "step": 21109 + }, + { + "epoch": 0.5420456059096066, + "grad_norm": 0.7265625, + "learning_rate": 0.00015376107129380255, + "loss": 0.9276, + "step": 21110 + }, + { + "epoch": 0.5420712831055284, + "grad_norm": 0.828125, + "learning_rate": 0.00015375730705586245, + "loss": 0.9574, + "step": 21111 + }, + { + "epoch": 0.5420969603014503, + "grad_norm": 0.73046875, + "learning_rate": 0.00015375354271078887, + "loss": 0.8616, + "step": 21112 + }, + { + "epoch": 0.5421226374973721, + "grad_norm": 0.81640625, + "learning_rate": 0.0001537497782585894, + "loss": 0.8025, + "step": 21113 + }, + { + "epoch": 0.5421483146932939, + "grad_norm": 0.78125, + "learning_rate": 0.00015374601369927144, + "loss": 0.8506, + "step": 21114 + }, + { + "epoch": 0.5421739918892158, + "grad_norm": 0.7734375, + "learning_rate": 0.00015374224903284253, + "loss": 0.9175, + "step": 21115 + }, + { + "epoch": 0.5421996690851375, + "grad_norm": 0.76953125, + "learning_rate": 0.00015373848425931014, + "loss": 0.8345, + "step": 21116 + }, + { + "epoch": 0.5422253462810593, + "grad_norm": 0.8046875, + "learning_rate": 0.00015373471937868186, + "loss": 0.8724, + "step": 21117 + }, + { + "epoch": 0.5422510234769812, + "grad_norm": 0.7734375, + "learning_rate": 0.0001537309543909651, + "loss": 0.9583, + "step": 21118 + }, + { + "epoch": 0.542276700672903, + "grad_norm": 0.78515625, + "learning_rate": 0.0001537271892961674, + "loss": 0.8698, + "step": 21119 + }, + { + "epoch": 0.5423023778688248, + "grad_norm": 0.78515625, + "learning_rate": 0.0001537234240942963, + "loss": 0.8172, + "step": 21120 + }, + { + "epoch": 0.5423280550647467, + "grad_norm": 0.703125, + "learning_rate": 0.00015371965878535924, + "loss": 0.9236, + "step": 21121 + }, + { + "epoch": 0.5423537322606685, + "grad_norm": 0.7578125, + "learning_rate": 0.00015371589336936377, + "loss": 0.9399, + "step": 21122 + }, + { + "epoch": 0.5423794094565902, + "grad_norm": 0.734375, + "learning_rate": 0.00015371212784631738, + "loss": 0.7591, + "step": 21123 + }, + { + "epoch": 0.5424050866525121, + "grad_norm": 0.76953125, + "learning_rate": 0.00015370836221622754, + "loss": 0.9182, + "step": 21124 + }, + { + "epoch": 0.5424307638484339, + "grad_norm": 0.7109375, + "learning_rate": 0.0001537045964791018, + "loss": 0.8301, + "step": 21125 + }, + { + "epoch": 0.5424564410443558, + "grad_norm": 0.73828125, + "learning_rate": 0.00015370083063494767, + "loss": 0.8306, + "step": 21126 + }, + { + "epoch": 0.5424821182402776, + "grad_norm": 0.765625, + "learning_rate": 0.0001536970646837726, + "loss": 0.9201, + "step": 21127 + }, + { + "epoch": 0.5425077954361994, + "grad_norm": 0.765625, + "learning_rate": 0.00015369329862558416, + "loss": 0.8043, + "step": 21128 + }, + { + "epoch": 0.5425334726321213, + "grad_norm": 0.7265625, + "learning_rate": 0.00015368953246038984, + "loss": 0.9455, + "step": 21129 + }, + { + "epoch": 0.542559149828043, + "grad_norm": 0.81640625, + "learning_rate": 0.00015368576618819707, + "loss": 0.8073, + "step": 21130 + }, + { + "epoch": 0.5425848270239648, + "grad_norm": 0.7109375, + "learning_rate": 0.00015368199980901346, + "loss": 0.731, + "step": 21131 + }, + { + "epoch": 0.5426105042198867, + "grad_norm": 0.796875, + "learning_rate": 0.00015367823332284645, + "loss": 0.8416, + "step": 21132 + }, + { + "epoch": 0.5426361814158085, + "grad_norm": 0.80078125, + "learning_rate": 0.00015367446672970362, + "loss": 0.8644, + "step": 21133 + }, + { + "epoch": 0.5426618586117303, + "grad_norm": 0.7421875, + "learning_rate": 0.0001536707000295924, + "loss": 0.7515, + "step": 21134 + }, + { + "epoch": 0.5426875358076522, + "grad_norm": 0.91796875, + "learning_rate": 0.0001536669332225203, + "loss": 0.7997, + "step": 21135 + }, + { + "epoch": 0.5427132130035739, + "grad_norm": 0.81640625, + "learning_rate": 0.00015366316630849487, + "loss": 0.9216, + "step": 21136 + }, + { + "epoch": 0.5427388901994957, + "grad_norm": 0.83203125, + "learning_rate": 0.0001536593992875236, + "loss": 0.8552, + "step": 21137 + }, + { + "epoch": 0.5427645673954176, + "grad_norm": 0.8046875, + "learning_rate": 0.00015365563215961395, + "loss": 0.879, + "step": 21138 + }, + { + "epoch": 0.5427902445913394, + "grad_norm": 0.80859375, + "learning_rate": 0.00015365186492477358, + "loss": 0.9197, + "step": 21139 + }, + { + "epoch": 0.5428159217872612, + "grad_norm": 0.7421875, + "learning_rate": 0.00015364809758300978, + "loss": 0.904, + "step": 21140 + }, + { + "epoch": 0.5428415989831831, + "grad_norm": 0.77734375, + "learning_rate": 0.00015364433013433023, + "loss": 0.8826, + "step": 21141 + }, + { + "epoch": 0.5428672761791049, + "grad_norm": 0.7734375, + "learning_rate": 0.00015364056257874235, + "loss": 0.9381, + "step": 21142 + }, + { + "epoch": 0.5428929533750266, + "grad_norm": 0.74609375, + "learning_rate": 0.00015363679491625363, + "loss": 0.813, + "step": 21143 + }, + { + "epoch": 0.5429186305709485, + "grad_norm": 0.79296875, + "learning_rate": 0.0001536330271468717, + "loss": 1.0006, + "step": 21144 + }, + { + "epoch": 0.5429443077668703, + "grad_norm": 0.81640625, + "learning_rate": 0.00015362925927060397, + "loss": 0.9544, + "step": 21145 + }, + { + "epoch": 0.5429699849627921, + "grad_norm": 0.7890625, + "learning_rate": 0.00015362549128745796, + "loss": 0.9696, + "step": 21146 + }, + { + "epoch": 0.542995662158714, + "grad_norm": 0.8203125, + "learning_rate": 0.0001536217231974412, + "loss": 0.865, + "step": 21147 + }, + { + "epoch": 0.5430213393546358, + "grad_norm": 0.72265625, + "learning_rate": 0.00015361795500056115, + "loss": 1.0156, + "step": 21148 + }, + { + "epoch": 0.5430470165505576, + "grad_norm": 0.75, + "learning_rate": 0.0001536141866968254, + "loss": 0.7693, + "step": 21149 + }, + { + "epoch": 0.5430726937464794, + "grad_norm": 0.734375, + "learning_rate": 0.0001536104182862414, + "loss": 0.7909, + "step": 21150 + }, + { + "epoch": 0.5430983709424012, + "grad_norm": 0.75390625, + "learning_rate": 0.0001536066497688167, + "loss": 0.8804, + "step": 21151 + }, + { + "epoch": 0.543124048138323, + "grad_norm": 0.734375, + "learning_rate": 0.0001536028811445588, + "loss": 0.9493, + "step": 21152 + }, + { + "epoch": 0.5431497253342449, + "grad_norm": 0.83203125, + "learning_rate": 0.00015359911241347515, + "loss": 0.8945, + "step": 21153 + }, + { + "epoch": 0.5431754025301667, + "grad_norm": 0.78515625, + "learning_rate": 0.00015359534357557334, + "loss": 1.0363, + "step": 21154 + }, + { + "epoch": 0.5432010797260886, + "grad_norm": 0.78125, + "learning_rate": 0.00015359157463086084, + "loss": 0.8619, + "step": 21155 + }, + { + "epoch": 0.5432267569220103, + "grad_norm": 0.734375, + "learning_rate": 0.00015358780557934518, + "loss": 0.8697, + "step": 21156 + }, + { + "epoch": 0.5432524341179321, + "grad_norm": 0.75390625, + "learning_rate": 0.00015358403642103387, + "loss": 0.8067, + "step": 21157 + }, + { + "epoch": 0.543278111313854, + "grad_norm": 0.7578125, + "learning_rate": 0.00015358026715593442, + "loss": 1.0106, + "step": 21158 + }, + { + "epoch": 0.5433037885097758, + "grad_norm": 0.765625, + "learning_rate": 0.00015357649778405431, + "loss": 0.8554, + "step": 21159 + }, + { + "epoch": 0.5433294657056976, + "grad_norm": 0.8203125, + "learning_rate": 0.00015357272830540114, + "loss": 0.9717, + "step": 21160 + }, + { + "epoch": 0.5433551429016195, + "grad_norm": 0.76171875, + "learning_rate": 0.00015356895871998226, + "loss": 0.8338, + "step": 21161 + }, + { + "epoch": 0.5433808200975413, + "grad_norm": 0.7421875, + "learning_rate": 0.00015356518902780536, + "loss": 0.8221, + "step": 21162 + }, + { + "epoch": 0.543406497293463, + "grad_norm": 0.75390625, + "learning_rate": 0.00015356141922887788, + "loss": 0.8718, + "step": 21163 + }, + { + "epoch": 0.5434321744893849, + "grad_norm": 0.80078125, + "learning_rate": 0.00015355764932320728, + "loss": 0.9586, + "step": 21164 + }, + { + "epoch": 0.5434578516853067, + "grad_norm": 0.78125, + "learning_rate": 0.00015355387931080116, + "loss": 0.8523, + "step": 21165 + }, + { + "epoch": 0.5434835288812285, + "grad_norm": 0.82421875, + "learning_rate": 0.00015355010919166704, + "loss": 0.9361, + "step": 21166 + }, + { + "epoch": 0.5435092060771504, + "grad_norm": 0.796875, + "learning_rate": 0.00015354633896581232, + "loss": 0.8198, + "step": 21167 + }, + { + "epoch": 0.5435348832730722, + "grad_norm": 0.75390625, + "learning_rate": 0.0001535425686332446, + "loss": 0.8534, + "step": 21168 + }, + { + "epoch": 0.543560560468994, + "grad_norm": 0.73828125, + "learning_rate": 0.00015353879819397136, + "loss": 0.8078, + "step": 21169 + }, + { + "epoch": 0.5435862376649158, + "grad_norm": 0.73046875, + "learning_rate": 0.00015353502764800013, + "loss": 0.7966, + "step": 21170 + }, + { + "epoch": 0.5436119148608376, + "grad_norm": 0.74609375, + "learning_rate": 0.00015353125699533847, + "loss": 0.9669, + "step": 21171 + }, + { + "epoch": 0.5436375920567594, + "grad_norm": 0.7734375, + "learning_rate": 0.00015352748623599383, + "loss": 0.9485, + "step": 21172 + }, + { + "epoch": 0.5436632692526813, + "grad_norm": 0.73828125, + "learning_rate": 0.00015352371536997374, + "loss": 0.798, + "step": 21173 + }, + { + "epoch": 0.5436889464486031, + "grad_norm": 0.7578125, + "learning_rate": 0.0001535199443972857, + "loss": 0.8441, + "step": 21174 + }, + { + "epoch": 0.5437146236445249, + "grad_norm": 0.8359375, + "learning_rate": 0.00015351617331793728, + "loss": 0.9215, + "step": 21175 + }, + { + "epoch": 0.5437403008404467, + "grad_norm": 0.80078125, + "learning_rate": 0.00015351240213193595, + "loss": 0.8249, + "step": 21176 + }, + { + "epoch": 0.5437659780363685, + "grad_norm": 0.7421875, + "learning_rate": 0.00015350863083928923, + "loss": 0.8569, + "step": 21177 + }, + { + "epoch": 0.5437916552322903, + "grad_norm": 0.765625, + "learning_rate": 0.00015350485944000462, + "loss": 0.9404, + "step": 21178 + }, + { + "epoch": 0.5438173324282122, + "grad_norm": 0.7109375, + "learning_rate": 0.00015350108793408973, + "loss": 0.8517, + "step": 21179 + }, + { + "epoch": 0.543843009624134, + "grad_norm": 0.7421875, + "learning_rate": 0.00015349731632155193, + "loss": 0.7725, + "step": 21180 + }, + { + "epoch": 0.5438686868200558, + "grad_norm": 0.7578125, + "learning_rate": 0.00015349354460239887, + "loss": 0.8772, + "step": 21181 + }, + { + "epoch": 0.5438943640159777, + "grad_norm": 0.7578125, + "learning_rate": 0.00015348977277663796, + "loss": 0.8382, + "step": 21182 + }, + { + "epoch": 0.5439200412118994, + "grad_norm": 0.83203125, + "learning_rate": 0.00015348600084427678, + "loss": 0.8444, + "step": 21183 + }, + { + "epoch": 0.5439457184078212, + "grad_norm": 0.74609375, + "learning_rate": 0.00015348222880532284, + "loss": 0.7827, + "step": 21184 + }, + { + "epoch": 0.5439713956037431, + "grad_norm": 0.7890625, + "learning_rate": 0.00015347845665978363, + "loss": 0.8845, + "step": 21185 + }, + { + "epoch": 0.5439970727996649, + "grad_norm": 1.6015625, + "learning_rate": 0.0001534746844076667, + "loss": 1.0492, + "step": 21186 + }, + { + "epoch": 0.5440227499955868, + "grad_norm": 0.796875, + "learning_rate": 0.00015347091204897959, + "loss": 0.8267, + "step": 21187 + }, + { + "epoch": 0.5440484271915086, + "grad_norm": 0.76953125, + "learning_rate": 0.0001534671395837297, + "loss": 0.769, + "step": 21188 + }, + { + "epoch": 0.5440741043874304, + "grad_norm": 0.7734375, + "learning_rate": 0.0001534633670119247, + "loss": 0.8716, + "step": 21189 + }, + { + "epoch": 0.5440997815833521, + "grad_norm": 0.79296875, + "learning_rate": 0.00015345959433357202, + "loss": 0.7782, + "step": 21190 + }, + { + "epoch": 0.544125458779274, + "grad_norm": 0.82421875, + "learning_rate": 0.00015345582154867917, + "loss": 0.9246, + "step": 21191 + }, + { + "epoch": 0.5441511359751958, + "grad_norm": 0.765625, + "learning_rate": 0.00015345204865725376, + "loss": 0.8382, + "step": 21192 + }, + { + "epoch": 0.5441768131711177, + "grad_norm": 0.765625, + "learning_rate": 0.00015344827565930318, + "loss": 0.7798, + "step": 21193 + }, + { + "epoch": 0.5442024903670395, + "grad_norm": 0.69140625, + "learning_rate": 0.00015344450255483504, + "loss": 0.8172, + "step": 21194 + }, + { + "epoch": 0.5442281675629613, + "grad_norm": 0.78125, + "learning_rate": 0.00015344072934385684, + "loss": 1.0353, + "step": 21195 + }, + { + "epoch": 0.544253844758883, + "grad_norm": 0.7890625, + "learning_rate": 0.00015343695602637608, + "loss": 0.828, + "step": 21196 + }, + { + "epoch": 0.5442795219548049, + "grad_norm": 0.77734375, + "learning_rate": 0.0001534331826024003, + "loss": 0.9274, + "step": 21197 + }, + { + "epoch": 0.5443051991507267, + "grad_norm": 0.8828125, + "learning_rate": 0.00015342940907193698, + "loss": 0.9462, + "step": 21198 + }, + { + "epoch": 0.5443308763466486, + "grad_norm": 0.76953125, + "learning_rate": 0.00015342563543499374, + "loss": 0.8513, + "step": 21199 + }, + { + "epoch": 0.5443565535425704, + "grad_norm": 0.78125, + "learning_rate": 0.000153421861691578, + "loss": 0.8806, + "step": 21200 + }, + { + "epoch": 0.5443822307384922, + "grad_norm": 0.75390625, + "learning_rate": 0.00015341808784169732, + "loss": 0.8384, + "step": 21201 + }, + { + "epoch": 0.5444079079344141, + "grad_norm": 0.79296875, + "learning_rate": 0.0001534143138853592, + "loss": 0.9479, + "step": 21202 + }, + { + "epoch": 0.5444335851303358, + "grad_norm": 0.77734375, + "learning_rate": 0.00015341053982257118, + "loss": 0.85, + "step": 21203 + }, + { + "epoch": 0.5444592623262576, + "grad_norm": 0.80859375, + "learning_rate": 0.0001534067656533408, + "loss": 1.0058, + "step": 21204 + }, + { + "epoch": 0.5444849395221795, + "grad_norm": 0.76171875, + "learning_rate": 0.00015340299137767556, + "loss": 0.8563, + "step": 21205 + }, + { + "epoch": 0.5445106167181013, + "grad_norm": 0.69921875, + "learning_rate": 0.00015339921699558296, + "loss": 0.8346, + "step": 21206 + }, + { + "epoch": 0.5445362939140231, + "grad_norm": 0.75, + "learning_rate": 0.00015339544250707055, + "loss": 0.841, + "step": 21207 + }, + { + "epoch": 0.544561971109945, + "grad_norm": 0.71484375, + "learning_rate": 0.00015339166791214587, + "loss": 0.8266, + "step": 21208 + }, + { + "epoch": 0.5445876483058668, + "grad_norm": 0.828125, + "learning_rate": 0.00015338789321081637, + "loss": 0.7978, + "step": 21209 + }, + { + "epoch": 0.5446133255017885, + "grad_norm": 0.78125, + "learning_rate": 0.00015338411840308967, + "loss": 0.9767, + "step": 21210 + }, + { + "epoch": 0.5446390026977104, + "grad_norm": 0.828125, + "learning_rate": 0.00015338034348897324, + "loss": 0.9999, + "step": 21211 + }, + { + "epoch": 0.5446646798936322, + "grad_norm": 0.7109375, + "learning_rate": 0.0001533765684684746, + "loss": 0.804, + "step": 21212 + }, + { + "epoch": 0.544690357089554, + "grad_norm": 0.7734375, + "learning_rate": 0.0001533727933416013, + "loss": 0.7918, + "step": 21213 + }, + { + "epoch": 0.5447160342854759, + "grad_norm": 0.82421875, + "learning_rate": 0.0001533690181083608, + "loss": 0.9567, + "step": 21214 + }, + { + "epoch": 0.5447417114813977, + "grad_norm": 0.77734375, + "learning_rate": 0.00015336524276876067, + "loss": 0.9151, + "step": 21215 + }, + { + "epoch": 0.5447673886773194, + "grad_norm": 0.8203125, + "learning_rate": 0.00015336146732280846, + "loss": 0.8139, + "step": 21216 + }, + { + "epoch": 0.5447930658732413, + "grad_norm": 0.84375, + "learning_rate": 0.00015335769177051163, + "loss": 0.8267, + "step": 21217 + }, + { + "epoch": 0.5448187430691631, + "grad_norm": 0.80859375, + "learning_rate": 0.00015335391611187777, + "loss": 0.7919, + "step": 21218 + }, + { + "epoch": 0.544844420265085, + "grad_norm": 0.77734375, + "learning_rate": 0.00015335014034691436, + "loss": 0.9667, + "step": 21219 + }, + { + "epoch": 0.5448700974610068, + "grad_norm": 0.73046875, + "learning_rate": 0.00015334636447562896, + "loss": 0.7623, + "step": 21220 + }, + { + "epoch": 0.5448957746569286, + "grad_norm": 0.7578125, + "learning_rate": 0.00015334258849802906, + "loss": 0.8252, + "step": 21221 + }, + { + "epoch": 0.5449214518528505, + "grad_norm": 0.82421875, + "learning_rate": 0.0001533388124141222, + "loss": 0.8784, + "step": 21222 + }, + { + "epoch": 0.5449471290487722, + "grad_norm": 0.8125, + "learning_rate": 0.0001533350362239159, + "loss": 0.9209, + "step": 21223 + }, + { + "epoch": 0.544972806244694, + "grad_norm": 0.78125, + "learning_rate": 0.0001533312599274177, + "loss": 0.7799, + "step": 21224 + }, + { + "epoch": 0.5449984834406159, + "grad_norm": 0.92578125, + "learning_rate": 0.00015332748352463513, + "loss": 0.8932, + "step": 21225 + }, + { + "epoch": 0.5450241606365377, + "grad_norm": 0.8671875, + "learning_rate": 0.00015332370701557566, + "loss": 1.0064, + "step": 21226 + }, + { + "epoch": 0.5450498378324595, + "grad_norm": 0.76171875, + "learning_rate": 0.0001533199304002469, + "loss": 0.8534, + "step": 21227 + }, + { + "epoch": 0.5450755150283814, + "grad_norm": 0.74609375, + "learning_rate": 0.0001533161536786563, + "loss": 0.7949, + "step": 21228 + }, + { + "epoch": 0.5451011922243032, + "grad_norm": 0.8125, + "learning_rate": 0.00015331237685081145, + "loss": 0.8205, + "step": 21229 + }, + { + "epoch": 0.5451268694202249, + "grad_norm": 0.75, + "learning_rate": 0.00015330859991671985, + "loss": 0.9747, + "step": 21230 + }, + { + "epoch": 0.5451525466161468, + "grad_norm": 0.828125, + "learning_rate": 0.000153304822876389, + "loss": 0.9272, + "step": 21231 + }, + { + "epoch": 0.5451782238120686, + "grad_norm": 0.6484375, + "learning_rate": 0.00015330104572982648, + "loss": 0.7186, + "step": 21232 + }, + { + "epoch": 0.5452039010079904, + "grad_norm": 0.7265625, + "learning_rate": 0.00015329726847703977, + "loss": 0.8637, + "step": 21233 + }, + { + "epoch": 0.5452295782039123, + "grad_norm": 0.796875, + "learning_rate": 0.00015329349111803644, + "loss": 0.9042, + "step": 21234 + }, + { + "epoch": 0.5452552553998341, + "grad_norm": 0.7890625, + "learning_rate": 0.00015328971365282396, + "loss": 0.8786, + "step": 21235 + }, + { + "epoch": 0.5452809325957558, + "grad_norm": 0.7578125, + "learning_rate": 0.00015328593608140994, + "loss": 0.9388, + "step": 21236 + }, + { + "epoch": 0.5453066097916777, + "grad_norm": 0.69921875, + "learning_rate": 0.00015328215840380182, + "loss": 0.8447, + "step": 21237 + }, + { + "epoch": 0.5453322869875995, + "grad_norm": 0.75, + "learning_rate": 0.00015327838062000718, + "loss": 0.8731, + "step": 21238 + }, + { + "epoch": 0.5453579641835213, + "grad_norm": 0.81640625, + "learning_rate": 0.0001532746027300336, + "loss": 1.011, + "step": 21239 + }, + { + "epoch": 0.5453836413794432, + "grad_norm": 0.7578125, + "learning_rate": 0.0001532708247338885, + "loss": 0.9045, + "step": 21240 + }, + { + "epoch": 0.545409318575365, + "grad_norm": 0.71484375, + "learning_rate": 0.00015326704663157944, + "loss": 0.7695, + "step": 21241 + }, + { + "epoch": 0.5454349957712868, + "grad_norm": 0.85546875, + "learning_rate": 0.00015326326842311398, + "loss": 0.8117, + "step": 21242 + }, + { + "epoch": 0.5454606729672086, + "grad_norm": 0.69921875, + "learning_rate": 0.00015325949010849965, + "loss": 0.8643, + "step": 21243 + }, + { + "epoch": 0.5454863501631304, + "grad_norm": 0.7265625, + "learning_rate": 0.00015325571168774396, + "loss": 0.8452, + "step": 21244 + }, + { + "epoch": 0.5455120273590522, + "grad_norm": 0.80859375, + "learning_rate": 0.00015325193316085444, + "loss": 0.8821, + "step": 21245 + }, + { + "epoch": 0.5455377045549741, + "grad_norm": 0.81640625, + "learning_rate": 0.00015324815452783864, + "loss": 0.9175, + "step": 21246 + }, + { + "epoch": 0.5455633817508959, + "grad_norm": 0.85546875, + "learning_rate": 0.00015324437578870405, + "loss": 0.8991, + "step": 21247 + }, + { + "epoch": 0.5455890589468178, + "grad_norm": 0.80859375, + "learning_rate": 0.00015324059694345828, + "loss": 0.8684, + "step": 21248 + }, + { + "epoch": 0.5456147361427396, + "grad_norm": 0.72265625, + "learning_rate": 0.00015323681799210876, + "loss": 0.8424, + "step": 21249 + }, + { + "epoch": 0.5456404133386613, + "grad_norm": 0.734375, + "learning_rate": 0.0001532330389346631, + "loss": 0.8079, + "step": 21250 + }, + { + "epoch": 0.5456660905345831, + "grad_norm": 0.75390625, + "learning_rate": 0.0001532292597711288, + "loss": 0.7747, + "step": 21251 + }, + { + "epoch": 0.545691767730505, + "grad_norm": 0.81640625, + "learning_rate": 0.00015322548050151339, + "loss": 1.0366, + "step": 21252 + }, + { + "epoch": 0.5457174449264268, + "grad_norm": 0.85546875, + "learning_rate": 0.00015322170112582438, + "loss": 0.8577, + "step": 21253 + }, + { + "epoch": 0.5457431221223487, + "grad_norm": 0.734375, + "learning_rate": 0.00015321792164406937, + "loss": 0.8184, + "step": 21254 + }, + { + "epoch": 0.5457687993182705, + "grad_norm": 0.79296875, + "learning_rate": 0.00015321414205625585, + "loss": 0.94, + "step": 21255 + }, + { + "epoch": 0.5457944765141922, + "grad_norm": 0.765625, + "learning_rate": 0.0001532103623623913, + "loss": 1.0088, + "step": 21256 + }, + { + "epoch": 0.545820153710114, + "grad_norm": 0.7890625, + "learning_rate": 0.00015320658256248333, + "loss": 0.8687, + "step": 21257 + }, + { + "epoch": 0.5458458309060359, + "grad_norm": 0.7734375, + "learning_rate": 0.00015320280265653947, + "loss": 0.8782, + "step": 21258 + }, + { + "epoch": 0.5458715081019577, + "grad_norm": 0.7578125, + "learning_rate": 0.0001531990226445672, + "loss": 0.8235, + "step": 21259 + }, + { + "epoch": 0.5458971852978796, + "grad_norm": 0.796875, + "learning_rate": 0.00015319524252657409, + "loss": 0.8667, + "step": 21260 + }, + { + "epoch": 0.5459228624938014, + "grad_norm": 0.8359375, + "learning_rate": 0.00015319146230256768, + "loss": 0.8565, + "step": 21261 + }, + { + "epoch": 0.5459485396897232, + "grad_norm": 0.734375, + "learning_rate": 0.00015318768197255552, + "loss": 0.7487, + "step": 21262 + }, + { + "epoch": 0.545974216885645, + "grad_norm": 0.71875, + "learning_rate": 0.00015318390153654506, + "loss": 0.8205, + "step": 21263 + }, + { + "epoch": 0.5459998940815668, + "grad_norm": 0.8203125, + "learning_rate": 0.0001531801209945439, + "loss": 0.9111, + "step": 21264 + }, + { + "epoch": 0.5460255712774886, + "grad_norm": 0.796875, + "learning_rate": 0.00015317634034655954, + "loss": 0.9887, + "step": 21265 + }, + { + "epoch": 0.5460512484734105, + "grad_norm": 0.78515625, + "learning_rate": 0.00015317255959259958, + "loss": 0.8932, + "step": 21266 + }, + { + "epoch": 0.5460769256693323, + "grad_norm": 0.77734375, + "learning_rate": 0.0001531687787326715, + "loss": 0.8941, + "step": 21267 + }, + { + "epoch": 0.5461026028652541, + "grad_norm": 0.78125, + "learning_rate": 0.00015316499776678284, + "loss": 0.8236, + "step": 21268 + }, + { + "epoch": 0.546128280061176, + "grad_norm": 0.81640625, + "learning_rate": 0.00015316121669494114, + "loss": 0.8105, + "step": 21269 + }, + { + "epoch": 0.5461539572570977, + "grad_norm": 0.8203125, + "learning_rate": 0.00015315743551715393, + "loss": 0.881, + "step": 21270 + }, + { + "epoch": 0.5461796344530195, + "grad_norm": 1.1328125, + "learning_rate": 0.00015315365423342877, + "loss": 0.8325, + "step": 21271 + }, + { + "epoch": 0.5462053116489414, + "grad_norm": 0.9140625, + "learning_rate": 0.0001531498728437732, + "loss": 0.9828, + "step": 21272 + }, + { + "epoch": 0.5462309888448632, + "grad_norm": 0.796875, + "learning_rate": 0.0001531460913481947, + "loss": 0.8134, + "step": 21273 + }, + { + "epoch": 0.546256666040785, + "grad_norm": 0.76171875, + "learning_rate": 0.00015314230974670085, + "loss": 0.8473, + "step": 21274 + }, + { + "epoch": 0.5462823432367069, + "grad_norm": 0.75, + "learning_rate": 0.00015313852803929917, + "loss": 0.8003, + "step": 21275 + }, + { + "epoch": 0.5463080204326286, + "grad_norm": 0.75390625, + "learning_rate": 0.0001531347462259972, + "loss": 0.8815, + "step": 21276 + }, + { + "epoch": 0.5463336976285504, + "grad_norm": 0.7421875, + "learning_rate": 0.0001531309643068025, + "loss": 0.8718, + "step": 21277 + }, + { + "epoch": 0.5463593748244723, + "grad_norm": 0.8671875, + "learning_rate": 0.0001531271822817226, + "loss": 0.829, + "step": 21278 + }, + { + "epoch": 0.5463850520203941, + "grad_norm": 0.796875, + "learning_rate": 0.00015312340015076497, + "loss": 0.8093, + "step": 21279 + }, + { + "epoch": 0.546410729216316, + "grad_norm": 0.8359375, + "learning_rate": 0.00015311961791393726, + "loss": 0.9098, + "step": 21280 + }, + { + "epoch": 0.5464364064122378, + "grad_norm": 0.8359375, + "learning_rate": 0.00015311583557124688, + "loss": 0.9769, + "step": 21281 + }, + { + "epoch": 0.5464620836081596, + "grad_norm": 0.87890625, + "learning_rate": 0.0001531120531227015, + "loss": 0.95, + "step": 21282 + }, + { + "epoch": 0.5464877608040813, + "grad_norm": 0.77734375, + "learning_rate": 0.0001531082705683086, + "loss": 0.7959, + "step": 21283 + }, + { + "epoch": 0.5465134380000032, + "grad_norm": 0.80078125, + "learning_rate": 0.00015310448790807567, + "loss": 0.9282, + "step": 21284 + }, + { + "epoch": 0.546539115195925, + "grad_norm": 0.8046875, + "learning_rate": 0.0001531007051420103, + "loss": 0.8742, + "step": 21285 + }, + { + "epoch": 0.5465647923918469, + "grad_norm": 0.79296875, + "learning_rate": 0.00015309692227012006, + "loss": 0.9052, + "step": 21286 + }, + { + "epoch": 0.5465904695877687, + "grad_norm": 0.7734375, + "learning_rate": 0.00015309313929241244, + "loss": 0.8653, + "step": 21287 + }, + { + "epoch": 0.5466161467836905, + "grad_norm": 0.73046875, + "learning_rate": 0.00015308935620889495, + "loss": 0.8081, + "step": 21288 + }, + { + "epoch": 0.5466418239796123, + "grad_norm": 0.8046875, + "learning_rate": 0.0001530855730195752, + "loss": 0.9895, + "step": 21289 + }, + { + "epoch": 0.5466675011755341, + "grad_norm": 0.76953125, + "learning_rate": 0.00015308178972446066, + "loss": 0.8892, + "step": 21290 + }, + { + "epoch": 0.5466931783714559, + "grad_norm": 0.68359375, + "learning_rate": 0.00015307800632355899, + "loss": 0.7593, + "step": 21291 + }, + { + "epoch": 0.5467188555673778, + "grad_norm": 0.796875, + "learning_rate": 0.00015307422281687756, + "loss": 0.9279, + "step": 21292 + }, + { + "epoch": 0.5467445327632996, + "grad_norm": 0.77734375, + "learning_rate": 0.00015307043920442402, + "loss": 0.9057, + "step": 21293 + }, + { + "epoch": 0.5467702099592214, + "grad_norm": 0.890625, + "learning_rate": 0.0001530666554862059, + "loss": 0.9001, + "step": 21294 + }, + { + "epoch": 0.5467958871551433, + "grad_norm": 0.77734375, + "learning_rate": 0.00015306287166223074, + "loss": 1.0153, + "step": 21295 + }, + { + "epoch": 0.546821564351065, + "grad_norm": 0.7578125, + "learning_rate": 0.0001530590877325061, + "loss": 1.0147, + "step": 21296 + }, + { + "epoch": 0.5468472415469868, + "grad_norm": 0.7890625, + "learning_rate": 0.00015305530369703942, + "loss": 0.8254, + "step": 21297 + }, + { + "epoch": 0.5468729187429087, + "grad_norm": 0.82421875, + "learning_rate": 0.00015305151955583837, + "loss": 0.9578, + "step": 21298 + }, + { + "epoch": 0.5468985959388305, + "grad_norm": 0.82421875, + "learning_rate": 0.0001530477353089104, + "loss": 0.8249, + "step": 21299 + }, + { + "epoch": 0.5469242731347523, + "grad_norm": 0.78125, + "learning_rate": 0.0001530439509562631, + "loss": 0.871, + "step": 21300 + }, + { + "epoch": 0.5469499503306742, + "grad_norm": 0.87109375, + "learning_rate": 0.000153040166497904, + "loss": 0.9946, + "step": 21301 + }, + { + "epoch": 0.546975627526596, + "grad_norm": 0.84375, + "learning_rate": 0.00015303638193384062, + "loss": 0.8924, + "step": 21302 + }, + { + "epoch": 0.5470013047225177, + "grad_norm": 0.703125, + "learning_rate": 0.00015303259726408053, + "loss": 0.8059, + "step": 21303 + }, + { + "epoch": 0.5470269819184396, + "grad_norm": 0.828125, + "learning_rate": 0.0001530288124886313, + "loss": 0.8775, + "step": 21304 + }, + { + "epoch": 0.5470526591143614, + "grad_norm": 0.67578125, + "learning_rate": 0.00015302502760750037, + "loss": 0.7498, + "step": 21305 + }, + { + "epoch": 0.5470783363102832, + "grad_norm": 0.71875, + "learning_rate": 0.00015302124262069538, + "loss": 0.8711, + "step": 21306 + }, + { + "epoch": 0.5471040135062051, + "grad_norm": 0.85546875, + "learning_rate": 0.00015301745752822386, + "loss": 0.9878, + "step": 21307 + }, + { + "epoch": 0.5471296907021269, + "grad_norm": 0.73828125, + "learning_rate": 0.0001530136723300933, + "loss": 0.807, + "step": 21308 + }, + { + "epoch": 0.5471553678980486, + "grad_norm": 0.73828125, + "learning_rate": 0.0001530098870263113, + "loss": 0.8766, + "step": 21309 + }, + { + "epoch": 0.5471810450939705, + "grad_norm": 0.84375, + "learning_rate": 0.0001530061016168854, + "loss": 1.2128, + "step": 21310 + }, + { + "epoch": 0.5472067222898923, + "grad_norm": 0.73046875, + "learning_rate": 0.0001530023161018231, + "loss": 0.8605, + "step": 21311 + }, + { + "epoch": 0.5472323994858141, + "grad_norm": 0.77734375, + "learning_rate": 0.00015299853048113203, + "loss": 0.9022, + "step": 21312 + }, + { + "epoch": 0.547258076681736, + "grad_norm": 0.84765625, + "learning_rate": 0.00015299474475481957, + "loss": 0.8503, + "step": 21313 + }, + { + "epoch": 0.5472837538776578, + "grad_norm": 0.765625, + "learning_rate": 0.00015299095892289347, + "loss": 0.8573, + "step": 21314 + }, + { + "epoch": 0.5473094310735797, + "grad_norm": 0.85546875, + "learning_rate": 0.00015298717298536113, + "loss": 0.9675, + "step": 21315 + }, + { + "epoch": 0.5473351082695014, + "grad_norm": 0.7734375, + "learning_rate": 0.00015298338694223014, + "loss": 0.8932, + "step": 21316 + }, + { + "epoch": 0.5473607854654232, + "grad_norm": 0.8046875, + "learning_rate": 0.00015297960079350807, + "loss": 0.9134, + "step": 21317 + }, + { + "epoch": 0.547386462661345, + "grad_norm": 0.875, + "learning_rate": 0.0001529758145392024, + "loss": 1.0424, + "step": 21318 + }, + { + "epoch": 0.5474121398572669, + "grad_norm": 0.77734375, + "learning_rate": 0.00015297202817932075, + "loss": 0.8113, + "step": 21319 + }, + { + "epoch": 0.5474378170531887, + "grad_norm": 0.7421875, + "learning_rate": 0.00015296824171387064, + "loss": 0.8161, + "step": 21320 + }, + { + "epoch": 0.5474634942491106, + "grad_norm": 0.76953125, + "learning_rate": 0.00015296445514285955, + "loss": 0.8372, + "step": 21321 + }, + { + "epoch": 0.5474891714450324, + "grad_norm": 0.7734375, + "learning_rate": 0.00015296066846629515, + "loss": 0.978, + "step": 21322 + }, + { + "epoch": 0.5475148486409541, + "grad_norm": 0.828125, + "learning_rate": 0.0001529568816841849, + "loss": 0.9257, + "step": 21323 + }, + { + "epoch": 0.547540525836876, + "grad_norm": 0.80859375, + "learning_rate": 0.0001529530947965364, + "loss": 0.85, + "step": 21324 + }, + { + "epoch": 0.5475662030327978, + "grad_norm": 0.77734375, + "learning_rate": 0.00015294930780335712, + "loss": 0.8776, + "step": 21325 + }, + { + "epoch": 0.5475918802287196, + "grad_norm": 0.76953125, + "learning_rate": 0.00015294552070465464, + "loss": 1.004, + "step": 21326 + }, + { + "epoch": 0.5476175574246415, + "grad_norm": 0.83203125, + "learning_rate": 0.00015294173350043658, + "loss": 0.8342, + "step": 21327 + }, + { + "epoch": 0.5476432346205633, + "grad_norm": 0.76171875, + "learning_rate": 0.00015293794619071038, + "loss": 0.7821, + "step": 21328 + }, + { + "epoch": 0.547668911816485, + "grad_norm": 0.8359375, + "learning_rate": 0.00015293415877548362, + "loss": 0.8886, + "step": 21329 + }, + { + "epoch": 0.5476945890124069, + "grad_norm": 0.71484375, + "learning_rate": 0.00015293037125476393, + "loss": 0.8391, + "step": 21330 + }, + { + "epoch": 0.5477202662083287, + "grad_norm": 0.71875, + "learning_rate": 0.0001529265836285587, + "loss": 0.873, + "step": 21331 + }, + { + "epoch": 0.5477459434042505, + "grad_norm": 0.7734375, + "learning_rate": 0.00015292279589687566, + "loss": 0.8188, + "step": 21332 + }, + { + "epoch": 0.5477716206001724, + "grad_norm": 0.77734375, + "learning_rate": 0.00015291900805972222, + "loss": 0.8232, + "step": 21333 + }, + { + "epoch": 0.5477972977960942, + "grad_norm": 0.83203125, + "learning_rate": 0.00015291522011710596, + "loss": 0.9349, + "step": 21334 + }, + { + "epoch": 0.547822974992016, + "grad_norm": 0.73046875, + "learning_rate": 0.00015291143206903448, + "loss": 0.8591, + "step": 21335 + }, + { + "epoch": 0.5478486521879378, + "grad_norm": 0.73828125, + "learning_rate": 0.00015290764391551531, + "loss": 0.8059, + "step": 21336 + }, + { + "epoch": 0.5478743293838596, + "grad_norm": 0.7734375, + "learning_rate": 0.000152903855656556, + "loss": 0.8559, + "step": 21337 + }, + { + "epoch": 0.5479000065797814, + "grad_norm": 0.8046875, + "learning_rate": 0.000152900067292164, + "loss": 0.8417, + "step": 21338 + }, + { + "epoch": 0.5479256837757033, + "grad_norm": 0.97265625, + "learning_rate": 0.00015289627882234701, + "loss": 0.9926, + "step": 21339 + }, + { + "epoch": 0.5479513609716251, + "grad_norm": 0.85546875, + "learning_rate": 0.0001528924902471125, + "loss": 0.9524, + "step": 21340 + }, + { + "epoch": 0.547977038167547, + "grad_norm": 0.71875, + "learning_rate": 0.00015288870156646804, + "loss": 0.7746, + "step": 21341 + }, + { + "epoch": 0.5480027153634688, + "grad_norm": 0.71484375, + "learning_rate": 0.00015288491278042115, + "loss": 0.8813, + "step": 21342 + }, + { + "epoch": 0.5480283925593905, + "grad_norm": 0.74609375, + "learning_rate": 0.00015288112388897943, + "loss": 0.911, + "step": 21343 + }, + { + "epoch": 0.5480540697553123, + "grad_norm": 0.890625, + "learning_rate": 0.00015287733489215038, + "loss": 0.9305, + "step": 21344 + }, + { + "epoch": 0.5480797469512342, + "grad_norm": 0.74609375, + "learning_rate": 0.0001528735457899416, + "loss": 0.9039, + "step": 21345 + }, + { + "epoch": 0.548105424147156, + "grad_norm": 0.7421875, + "learning_rate": 0.00015286975658236065, + "loss": 0.8211, + "step": 21346 + }, + { + "epoch": 0.5481311013430779, + "grad_norm": 0.7265625, + "learning_rate": 0.000152865967269415, + "loss": 0.9608, + "step": 21347 + }, + { + "epoch": 0.5481567785389997, + "grad_norm": 0.83203125, + "learning_rate": 0.00015286217785111226, + "loss": 0.9044, + "step": 21348 + }, + { + "epoch": 0.5481824557349214, + "grad_norm": 0.78515625, + "learning_rate": 0.00015285838832746, + "loss": 0.9294, + "step": 21349 + }, + { + "epoch": 0.5482081329308433, + "grad_norm": 0.70703125, + "learning_rate": 0.00015285459869846573, + "loss": 0.9576, + "step": 21350 + }, + { + "epoch": 0.5482338101267651, + "grad_norm": 0.74609375, + "learning_rate": 0.00015285080896413703, + "loss": 0.8435, + "step": 21351 + }, + { + "epoch": 0.5482594873226869, + "grad_norm": 0.75390625, + "learning_rate": 0.00015284701912448144, + "loss": 0.7987, + "step": 21352 + }, + { + "epoch": 0.5482851645186088, + "grad_norm": 0.80078125, + "learning_rate": 0.00015284322917950645, + "loss": 0.8246, + "step": 21353 + }, + { + "epoch": 0.5483108417145306, + "grad_norm": 0.7421875, + "learning_rate": 0.00015283943912921977, + "loss": 0.8726, + "step": 21354 + }, + { + "epoch": 0.5483365189104524, + "grad_norm": 0.79296875, + "learning_rate": 0.0001528356489736288, + "loss": 0.9409, + "step": 21355 + }, + { + "epoch": 0.5483621961063742, + "grad_norm": 0.7265625, + "learning_rate": 0.00015283185871274121, + "loss": 0.8169, + "step": 21356 + }, + { + "epoch": 0.548387873302296, + "grad_norm": 0.7890625, + "learning_rate": 0.00015282806834656447, + "loss": 0.7398, + "step": 21357 + }, + { + "epoch": 0.5484135504982178, + "grad_norm": 0.80078125, + "learning_rate": 0.00015282427787510617, + "loss": 1.0191, + "step": 21358 + }, + { + "epoch": 0.5484392276941397, + "grad_norm": 0.74609375, + "learning_rate": 0.00015282048729837384, + "loss": 0.7969, + "step": 21359 + }, + { + "epoch": 0.5484649048900615, + "grad_norm": 0.828125, + "learning_rate": 0.00015281669661637505, + "loss": 0.9516, + "step": 21360 + }, + { + "epoch": 0.5484905820859833, + "grad_norm": 0.7578125, + "learning_rate": 0.00015281290582911734, + "loss": 0.8262, + "step": 21361 + }, + { + "epoch": 0.5485162592819052, + "grad_norm": 0.80078125, + "learning_rate": 0.0001528091149366083, + "loss": 0.9221, + "step": 21362 + }, + { + "epoch": 0.5485419364778269, + "grad_norm": 0.87890625, + "learning_rate": 0.00015280532393885547, + "loss": 0.9517, + "step": 21363 + }, + { + "epoch": 0.5485676136737487, + "grad_norm": 0.76171875, + "learning_rate": 0.00015280153283586637, + "loss": 0.8856, + "step": 21364 + }, + { + "epoch": 0.5485932908696706, + "grad_norm": 0.80859375, + "learning_rate": 0.00015279774162764865, + "loss": 1.0219, + "step": 21365 + }, + { + "epoch": 0.5486189680655924, + "grad_norm": 0.80078125, + "learning_rate": 0.00015279395031420973, + "loss": 0.9191, + "step": 21366 + }, + { + "epoch": 0.5486446452615142, + "grad_norm": 0.8046875, + "learning_rate": 0.00015279015889555728, + "loss": 0.9426, + "step": 21367 + }, + { + "epoch": 0.5486703224574361, + "grad_norm": 0.75, + "learning_rate": 0.0001527863673716988, + "loss": 0.9566, + "step": 21368 + }, + { + "epoch": 0.5486959996533578, + "grad_norm": 0.78515625, + "learning_rate": 0.00015278257574264186, + "loss": 0.8499, + "step": 21369 + }, + { + "epoch": 0.5487216768492796, + "grad_norm": 0.78125, + "learning_rate": 0.000152778784008394, + "loss": 0.8826, + "step": 21370 + }, + { + "epoch": 0.5487473540452015, + "grad_norm": 0.80859375, + "learning_rate": 0.0001527749921689628, + "loss": 0.9217, + "step": 21371 + }, + { + "epoch": 0.5487730312411233, + "grad_norm": 0.75390625, + "learning_rate": 0.0001527712002243558, + "loss": 0.9436, + "step": 21372 + }, + { + "epoch": 0.5487987084370451, + "grad_norm": 0.828125, + "learning_rate": 0.00015276740817458056, + "loss": 0.8021, + "step": 21373 + }, + { + "epoch": 0.548824385632967, + "grad_norm": 0.78515625, + "learning_rate": 0.00015276361601964465, + "loss": 1.0251, + "step": 21374 + }, + { + "epoch": 0.5488500628288888, + "grad_norm": 0.828125, + "learning_rate": 0.0001527598237595556, + "loss": 0.8214, + "step": 21375 + }, + { + "epoch": 0.5488757400248105, + "grad_norm": 0.8046875, + "learning_rate": 0.00015275603139432105, + "loss": 0.9444, + "step": 21376 + }, + { + "epoch": 0.5489014172207324, + "grad_norm": 0.734375, + "learning_rate": 0.00015275223892394842, + "loss": 0.9931, + "step": 21377 + }, + { + "epoch": 0.5489270944166542, + "grad_norm": 0.7890625, + "learning_rate": 0.0001527484463484454, + "loss": 0.8042, + "step": 21378 + }, + { + "epoch": 0.548952771612576, + "grad_norm": 0.79296875, + "learning_rate": 0.00015274465366781946, + "loss": 0.8281, + "step": 21379 + }, + { + "epoch": 0.5489784488084979, + "grad_norm": 0.73828125, + "learning_rate": 0.00015274086088207817, + "loss": 0.8102, + "step": 21380 + }, + { + "epoch": 0.5490041260044197, + "grad_norm": 0.78125, + "learning_rate": 0.00015273706799122917, + "loss": 0.8004, + "step": 21381 + }, + { + "epoch": 0.5490298032003416, + "grad_norm": 0.828125, + "learning_rate": 0.0001527332749952799, + "loss": 0.9062, + "step": 21382 + }, + { + "epoch": 0.5490554803962633, + "grad_norm": 0.80859375, + "learning_rate": 0.00015272948189423802, + "loss": 0.932, + "step": 21383 + }, + { + "epoch": 0.5490811575921851, + "grad_norm": 0.73046875, + "learning_rate": 0.00015272568868811103, + "loss": 0.892, + "step": 21384 + }, + { + "epoch": 0.549106834788107, + "grad_norm": 0.7890625, + "learning_rate": 0.0001527218953769065, + "loss": 0.8745, + "step": 21385 + }, + { + "epoch": 0.5491325119840288, + "grad_norm": 0.73828125, + "learning_rate": 0.00015271810196063198, + "loss": 0.85, + "step": 21386 + }, + { + "epoch": 0.5491581891799506, + "grad_norm": 0.71875, + "learning_rate": 0.00015271430843929505, + "loss": 0.7984, + "step": 21387 + }, + { + "epoch": 0.5491838663758725, + "grad_norm": 0.7578125, + "learning_rate": 0.0001527105148129033, + "loss": 0.8307, + "step": 21388 + }, + { + "epoch": 0.5492095435717942, + "grad_norm": 0.8828125, + "learning_rate": 0.00015270672108146424, + "loss": 1.0065, + "step": 21389 + }, + { + "epoch": 0.549235220767716, + "grad_norm": 0.796875, + "learning_rate": 0.0001527029272449854, + "loss": 0.9314, + "step": 21390 + }, + { + "epoch": 0.5492608979636379, + "grad_norm": 0.73828125, + "learning_rate": 0.00015269913330347444, + "loss": 0.7534, + "step": 21391 + }, + { + "epoch": 0.5492865751595597, + "grad_norm": 0.7734375, + "learning_rate": 0.00015269533925693884, + "loss": 0.9356, + "step": 21392 + }, + { + "epoch": 0.5493122523554815, + "grad_norm": 0.828125, + "learning_rate": 0.0001526915451053862, + "loss": 1.0246, + "step": 21393 + }, + { + "epoch": 0.5493379295514034, + "grad_norm": 0.78125, + "learning_rate": 0.00015268775084882407, + "loss": 0.9079, + "step": 21394 + }, + { + "epoch": 0.5493636067473252, + "grad_norm": 0.859375, + "learning_rate": 0.00015268395648726, + "loss": 0.9551, + "step": 21395 + }, + { + "epoch": 0.5493892839432469, + "grad_norm": 0.83984375, + "learning_rate": 0.00015268016202070158, + "loss": 0.8084, + "step": 21396 + }, + { + "epoch": 0.5494149611391688, + "grad_norm": 0.77734375, + "learning_rate": 0.00015267636744915636, + "loss": 0.9062, + "step": 21397 + }, + { + "epoch": 0.5494406383350906, + "grad_norm": 0.80859375, + "learning_rate": 0.00015267257277263186, + "loss": 0.8806, + "step": 21398 + }, + { + "epoch": 0.5494663155310124, + "grad_norm": 0.7421875, + "learning_rate": 0.00015266877799113572, + "loss": 0.7679, + "step": 21399 + }, + { + "epoch": 0.5494919927269343, + "grad_norm": 0.80078125, + "learning_rate": 0.00015266498310467544, + "loss": 0.8165, + "step": 21400 + }, + { + "epoch": 0.5495176699228561, + "grad_norm": 0.82421875, + "learning_rate": 0.00015266118811325865, + "loss": 1.0866, + "step": 21401 + }, + { + "epoch": 0.549543347118778, + "grad_norm": 0.78515625, + "learning_rate": 0.00015265739301689285, + "loss": 0.9436, + "step": 21402 + }, + { + "epoch": 0.5495690243146997, + "grad_norm": 0.75390625, + "learning_rate": 0.0001526535978155856, + "loss": 0.9523, + "step": 21403 + }, + { + "epoch": 0.5495947015106215, + "grad_norm": 0.71875, + "learning_rate": 0.0001526498025093445, + "loss": 0.7862, + "step": 21404 + }, + { + "epoch": 0.5496203787065433, + "grad_norm": 0.76171875, + "learning_rate": 0.0001526460070981771, + "loss": 0.926, + "step": 21405 + }, + { + "epoch": 0.5496460559024652, + "grad_norm": 0.84765625, + "learning_rate": 0.0001526422115820909, + "loss": 0.8573, + "step": 21406 + }, + { + "epoch": 0.549671733098387, + "grad_norm": 0.80078125, + "learning_rate": 0.00015263841596109364, + "loss": 0.9768, + "step": 21407 + }, + { + "epoch": 0.5496974102943089, + "grad_norm": 0.73046875, + "learning_rate": 0.0001526346202351927, + "loss": 0.8877, + "step": 21408 + }, + { + "epoch": 0.5497230874902306, + "grad_norm": 0.859375, + "learning_rate": 0.00015263082440439572, + "loss": 0.9678, + "step": 21409 + }, + { + "epoch": 0.5497487646861524, + "grad_norm": 0.765625, + "learning_rate": 0.0001526270284687103, + "loss": 0.8522, + "step": 21410 + }, + { + "epoch": 0.5497744418820742, + "grad_norm": 0.6953125, + "learning_rate": 0.0001526232324281439, + "loss": 0.7648, + "step": 21411 + }, + { + "epoch": 0.5498001190779961, + "grad_norm": 0.7890625, + "learning_rate": 0.00015261943628270422, + "loss": 0.9352, + "step": 21412 + }, + { + "epoch": 0.5498257962739179, + "grad_norm": 0.86328125, + "learning_rate": 0.00015261564003239873, + "loss": 1.0128, + "step": 21413 + }, + { + "epoch": 0.5498514734698398, + "grad_norm": 0.7578125, + "learning_rate": 0.000152611843677235, + "loss": 0.923, + "step": 21414 + }, + { + "epoch": 0.5498771506657616, + "grad_norm": 0.7890625, + "learning_rate": 0.00015260804721722065, + "loss": 0.9276, + "step": 21415 + }, + { + "epoch": 0.5499028278616833, + "grad_norm": 0.91796875, + "learning_rate": 0.0001526042506523632, + "loss": 0.9488, + "step": 21416 + }, + { + "epoch": 0.5499285050576052, + "grad_norm": 0.78515625, + "learning_rate": 0.00015260045398267025, + "loss": 0.8831, + "step": 21417 + }, + { + "epoch": 0.549954182253527, + "grad_norm": 0.70703125, + "learning_rate": 0.0001525966572081493, + "loss": 0.8187, + "step": 21418 + }, + { + "epoch": 0.5499798594494488, + "grad_norm": 0.8515625, + "learning_rate": 0.00015259286032880798, + "loss": 0.7908, + "step": 21419 + }, + { + "epoch": 0.5500055366453707, + "grad_norm": 0.84375, + "learning_rate": 0.00015258906334465386, + "loss": 0.9356, + "step": 21420 + }, + { + "epoch": 0.5500312138412925, + "grad_norm": 0.78125, + "learning_rate": 0.0001525852662556945, + "loss": 0.8975, + "step": 21421 + }, + { + "epoch": 0.5500568910372143, + "grad_norm": 0.77734375, + "learning_rate": 0.00015258146906193743, + "loss": 0.9572, + "step": 21422 + }, + { + "epoch": 0.5500825682331361, + "grad_norm": 0.6796875, + "learning_rate": 0.00015257767176339023, + "loss": 0.7678, + "step": 21423 + }, + { + "epoch": 0.5501082454290579, + "grad_norm": 0.81640625, + "learning_rate": 0.00015257387436006051, + "loss": 1.0644, + "step": 21424 + }, + { + "epoch": 0.5501339226249797, + "grad_norm": 0.79296875, + "learning_rate": 0.0001525700768519558, + "loss": 0.813, + "step": 21425 + }, + { + "epoch": 0.5501595998209016, + "grad_norm": 0.8046875, + "learning_rate": 0.00015256627923908365, + "loss": 0.8082, + "step": 21426 + }, + { + "epoch": 0.5501852770168234, + "grad_norm": 0.77734375, + "learning_rate": 0.00015256248152145167, + "loss": 0.8286, + "step": 21427 + }, + { + "epoch": 0.5502109542127452, + "grad_norm": 0.77734375, + "learning_rate": 0.0001525586836990674, + "loss": 0.8422, + "step": 21428 + }, + { + "epoch": 0.550236631408667, + "grad_norm": 0.80078125, + "learning_rate": 0.00015255488577193847, + "loss": 0.8264, + "step": 21429 + }, + { + "epoch": 0.5502623086045888, + "grad_norm": 0.7734375, + "learning_rate": 0.00015255108774007235, + "loss": 0.8, + "step": 21430 + }, + { + "epoch": 0.5502879858005106, + "grad_norm": 0.78125, + "learning_rate": 0.00015254728960347667, + "loss": 0.8677, + "step": 21431 + }, + { + "epoch": 0.5503136629964325, + "grad_norm": 0.76953125, + "learning_rate": 0.000152543491362159, + "loss": 0.9533, + "step": 21432 + }, + { + "epoch": 0.5503393401923543, + "grad_norm": 0.8046875, + "learning_rate": 0.00015253969301612686, + "loss": 0.9116, + "step": 21433 + }, + { + "epoch": 0.5503650173882761, + "grad_norm": 0.7265625, + "learning_rate": 0.00015253589456538791, + "loss": 0.8257, + "step": 21434 + }, + { + "epoch": 0.550390694584198, + "grad_norm": 0.7578125, + "learning_rate": 0.00015253209600994965, + "loss": 0.8997, + "step": 21435 + }, + { + "epoch": 0.5504163717801197, + "grad_norm": 0.796875, + "learning_rate": 0.00015252829734981965, + "loss": 0.9383, + "step": 21436 + }, + { + "epoch": 0.5504420489760415, + "grad_norm": 0.82421875, + "learning_rate": 0.00015252449858500552, + "loss": 0.9062, + "step": 21437 + }, + { + "epoch": 0.5504677261719634, + "grad_norm": 0.83984375, + "learning_rate": 0.0001525206997155148, + "loss": 0.8802, + "step": 21438 + }, + { + "epoch": 0.5504934033678852, + "grad_norm": 0.83203125, + "learning_rate": 0.00015251690074135507, + "loss": 0.986, + "step": 21439 + }, + { + "epoch": 0.550519080563807, + "grad_norm": 0.765625, + "learning_rate": 0.0001525131016625339, + "loss": 0.7525, + "step": 21440 + }, + { + "epoch": 0.5505447577597289, + "grad_norm": 0.8125, + "learning_rate": 0.00015250930247905884, + "loss": 0.9418, + "step": 21441 + }, + { + "epoch": 0.5505704349556507, + "grad_norm": 0.75390625, + "learning_rate": 0.00015250550319093753, + "loss": 0.8519, + "step": 21442 + }, + { + "epoch": 0.5505961121515724, + "grad_norm": 0.7265625, + "learning_rate": 0.00015250170379817746, + "loss": 0.7391, + "step": 21443 + }, + { + "epoch": 0.5506217893474943, + "grad_norm": 0.7578125, + "learning_rate": 0.00015249790430078624, + "loss": 0.8725, + "step": 21444 + }, + { + "epoch": 0.5506474665434161, + "grad_norm": 0.72265625, + "learning_rate": 0.00015249410469877144, + "loss": 0.838, + "step": 21445 + }, + { + "epoch": 0.550673143739338, + "grad_norm": 0.765625, + "learning_rate": 0.00015249030499214062, + "loss": 0.8236, + "step": 21446 + }, + { + "epoch": 0.5506988209352598, + "grad_norm": 0.75, + "learning_rate": 0.00015248650518090141, + "loss": 1.0406, + "step": 21447 + }, + { + "epoch": 0.5507244981311816, + "grad_norm": 0.73828125, + "learning_rate": 0.0001524827052650613, + "loss": 0.9198, + "step": 21448 + }, + { + "epoch": 0.5507501753271034, + "grad_norm": 0.7890625, + "learning_rate": 0.0001524789052446279, + "loss": 0.8681, + "step": 21449 + }, + { + "epoch": 0.5507758525230252, + "grad_norm": 0.83203125, + "learning_rate": 0.0001524751051196088, + "loss": 1.0472, + "step": 21450 + }, + { + "epoch": 0.550801529718947, + "grad_norm": 0.79296875, + "learning_rate": 0.00015247130489001154, + "loss": 0.8675, + "step": 21451 + }, + { + "epoch": 0.5508272069148689, + "grad_norm": 0.8359375, + "learning_rate": 0.00015246750455584374, + "loss": 0.9721, + "step": 21452 + }, + { + "epoch": 0.5508528841107907, + "grad_norm": 0.79296875, + "learning_rate": 0.0001524637041171129, + "loss": 0.8557, + "step": 21453 + }, + { + "epoch": 0.5508785613067125, + "grad_norm": 0.7421875, + "learning_rate": 0.00015245990357382665, + "loss": 0.8701, + "step": 21454 + }, + { + "epoch": 0.5509042385026344, + "grad_norm": 0.7890625, + "learning_rate": 0.00015245610292599255, + "loss": 0.9247, + "step": 21455 + }, + { + "epoch": 0.5509299156985561, + "grad_norm": 0.7890625, + "learning_rate": 0.00015245230217361816, + "loss": 0.9999, + "step": 21456 + }, + { + "epoch": 0.5509555928944779, + "grad_norm": 0.828125, + "learning_rate": 0.0001524485013167111, + "loss": 0.8932, + "step": 21457 + }, + { + "epoch": 0.5509812700903998, + "grad_norm": 0.7890625, + "learning_rate": 0.0001524447003552789, + "loss": 0.9039, + "step": 21458 + }, + { + "epoch": 0.5510069472863216, + "grad_norm": 0.79296875, + "learning_rate": 0.00015244089928932914, + "loss": 0.9333, + "step": 21459 + }, + { + "epoch": 0.5510326244822434, + "grad_norm": 0.828125, + "learning_rate": 0.00015243709811886942, + "loss": 1.0235, + "step": 21460 + }, + { + "epoch": 0.5510583016781653, + "grad_norm": 0.7890625, + "learning_rate": 0.00015243329684390733, + "loss": 0.8155, + "step": 21461 + }, + { + "epoch": 0.5510839788740871, + "grad_norm": 0.71875, + "learning_rate": 0.00015242949546445038, + "loss": 0.9282, + "step": 21462 + }, + { + "epoch": 0.5511096560700088, + "grad_norm": 0.68359375, + "learning_rate": 0.00015242569398050618, + "loss": 0.7498, + "step": 21463 + }, + { + "epoch": 0.5511353332659307, + "grad_norm": 0.765625, + "learning_rate": 0.00015242189239208235, + "loss": 0.8109, + "step": 21464 + }, + { + "epoch": 0.5511610104618525, + "grad_norm": 0.78125, + "learning_rate": 0.00015241809069918634, + "loss": 0.8797, + "step": 21465 + }, + { + "epoch": 0.5511866876577743, + "grad_norm": 0.76171875, + "learning_rate": 0.00015241428890182585, + "loss": 0.8556, + "step": 21466 + }, + { + "epoch": 0.5512123648536962, + "grad_norm": 0.7421875, + "learning_rate": 0.00015241048700000842, + "loss": 0.8861, + "step": 21467 + }, + { + "epoch": 0.551238042049618, + "grad_norm": 0.828125, + "learning_rate": 0.00015240668499374164, + "loss": 0.8371, + "step": 21468 + }, + { + "epoch": 0.5512637192455397, + "grad_norm": 0.8046875, + "learning_rate": 0.00015240288288303306, + "loss": 0.8539, + "step": 21469 + }, + { + "epoch": 0.5512893964414616, + "grad_norm": 1.0625, + "learning_rate": 0.00015239908066789026, + "loss": 0.9889, + "step": 21470 + }, + { + "epoch": 0.5513150736373834, + "grad_norm": 0.7578125, + "learning_rate": 0.00015239527834832084, + "loss": 0.8675, + "step": 21471 + }, + { + "epoch": 0.5513407508333052, + "grad_norm": 0.80859375, + "learning_rate": 0.00015239147592433235, + "loss": 0.8499, + "step": 21472 + }, + { + "epoch": 0.5513664280292271, + "grad_norm": 0.8046875, + "learning_rate": 0.00015238767339593236, + "loss": 0.8411, + "step": 21473 + }, + { + "epoch": 0.5513921052251489, + "grad_norm": 0.76953125, + "learning_rate": 0.0001523838707631285, + "loss": 0.8881, + "step": 21474 + }, + { + "epoch": 0.5514177824210708, + "grad_norm": 0.765625, + "learning_rate": 0.00015238006802592832, + "loss": 0.8089, + "step": 21475 + }, + { + "epoch": 0.5514434596169925, + "grad_norm": 0.76171875, + "learning_rate": 0.0001523762651843394, + "loss": 0.8275, + "step": 21476 + }, + { + "epoch": 0.5514691368129143, + "grad_norm": 0.73828125, + "learning_rate": 0.00015237246223836928, + "loss": 0.9335, + "step": 21477 + }, + { + "epoch": 0.5514948140088362, + "grad_norm": 0.83203125, + "learning_rate": 0.00015236865918802562, + "loss": 0.9231, + "step": 21478 + }, + { + "epoch": 0.551520491204758, + "grad_norm": 0.7890625, + "learning_rate": 0.00015236485603331592, + "loss": 0.7561, + "step": 21479 + }, + { + "epoch": 0.5515461684006798, + "grad_norm": 0.8125, + "learning_rate": 0.00015236105277424778, + "loss": 0.9833, + "step": 21480 + }, + { + "epoch": 0.5515718455966017, + "grad_norm": 0.765625, + "learning_rate": 0.00015235724941082884, + "loss": 0.7986, + "step": 21481 + }, + { + "epoch": 0.5515975227925234, + "grad_norm": 0.765625, + "learning_rate": 0.0001523534459430666, + "loss": 0.9226, + "step": 21482 + }, + { + "epoch": 0.5516231999884452, + "grad_norm": 0.7578125, + "learning_rate": 0.00015234964237096865, + "loss": 1.0725, + "step": 21483 + }, + { + "epoch": 0.5516488771843671, + "grad_norm": 0.7421875, + "learning_rate": 0.00015234583869454263, + "loss": 0.8001, + "step": 21484 + }, + { + "epoch": 0.5516745543802889, + "grad_norm": 0.81640625, + "learning_rate": 0.00015234203491379606, + "loss": 0.9251, + "step": 21485 + }, + { + "epoch": 0.5517002315762107, + "grad_norm": 0.859375, + "learning_rate": 0.00015233823102873655, + "loss": 0.9787, + "step": 21486 + }, + { + "epoch": 0.5517259087721326, + "grad_norm": 0.82421875, + "learning_rate": 0.00015233442703937167, + "loss": 0.8568, + "step": 21487 + }, + { + "epoch": 0.5517515859680544, + "grad_norm": 0.75390625, + "learning_rate": 0.00015233062294570901, + "loss": 0.8902, + "step": 21488 + }, + { + "epoch": 0.5517772631639761, + "grad_norm": 0.8203125, + "learning_rate": 0.00015232681874775613, + "loss": 0.8986, + "step": 21489 + }, + { + "epoch": 0.551802940359898, + "grad_norm": 0.8359375, + "learning_rate": 0.00015232301444552065, + "loss": 0.793, + "step": 21490 + }, + { + "epoch": 0.5518286175558198, + "grad_norm": 0.83984375, + "learning_rate": 0.0001523192100390101, + "loss": 0.9467, + "step": 21491 + }, + { + "epoch": 0.5518542947517416, + "grad_norm": 0.75, + "learning_rate": 0.00015231540552823212, + "loss": 0.8866, + "step": 21492 + }, + { + "epoch": 0.5518799719476635, + "grad_norm": 0.78515625, + "learning_rate": 0.00015231160091319426, + "loss": 0.8459, + "step": 21493 + }, + { + "epoch": 0.5519056491435853, + "grad_norm": 0.765625, + "learning_rate": 0.0001523077961939041, + "loss": 0.8668, + "step": 21494 + }, + { + "epoch": 0.5519313263395071, + "grad_norm": 0.8828125, + "learning_rate": 0.00015230399137036921, + "loss": 0.9685, + "step": 21495 + }, + { + "epoch": 0.5519570035354289, + "grad_norm": 0.828125, + "learning_rate": 0.00015230018644259722, + "loss": 0.8527, + "step": 21496 + }, + { + "epoch": 0.5519826807313507, + "grad_norm": 0.7421875, + "learning_rate": 0.00015229638141059563, + "loss": 0.8799, + "step": 21497 + }, + { + "epoch": 0.5520083579272725, + "grad_norm": 0.83984375, + "learning_rate": 0.00015229257627437213, + "loss": 0.8109, + "step": 21498 + }, + { + "epoch": 0.5520340351231944, + "grad_norm": 0.859375, + "learning_rate": 0.00015228877103393423, + "loss": 0.8373, + "step": 21499 + }, + { + "epoch": 0.5520597123191162, + "grad_norm": 0.72265625, + "learning_rate": 0.00015228496568928953, + "loss": 0.8122, + "step": 21500 + }, + { + "epoch": 0.552085389515038, + "grad_norm": 0.76953125, + "learning_rate": 0.00015228116024044563, + "loss": 0.939, + "step": 21501 + }, + { + "epoch": 0.5521110667109598, + "grad_norm": 0.8125, + "learning_rate": 0.00015227735468741006, + "loss": 0.8848, + "step": 21502 + }, + { + "epoch": 0.5521367439068816, + "grad_norm": 0.78125, + "learning_rate": 0.00015227354903019048, + "loss": 0.9517, + "step": 21503 + }, + { + "epoch": 0.5521624211028034, + "grad_norm": 0.73828125, + "learning_rate": 0.00015226974326879443, + "loss": 0.6768, + "step": 21504 + }, + { + "epoch": 0.5521880982987253, + "grad_norm": 0.78125, + "learning_rate": 0.0001522659374032295, + "loss": 0.8191, + "step": 21505 + }, + { + "epoch": 0.5522137754946471, + "grad_norm": 0.73046875, + "learning_rate": 0.0001522621314335033, + "loss": 0.9008, + "step": 21506 + }, + { + "epoch": 0.552239452690569, + "grad_norm": 0.6796875, + "learning_rate": 0.00015225832535962334, + "loss": 0.7745, + "step": 21507 + }, + { + "epoch": 0.5522651298864908, + "grad_norm": 0.7734375, + "learning_rate": 0.00015225451918159734, + "loss": 0.8999, + "step": 21508 + }, + { + "epoch": 0.5522908070824125, + "grad_norm": 0.73046875, + "learning_rate": 0.00015225071289943275, + "loss": 0.8372, + "step": 21509 + }, + { + "epoch": 0.5523164842783344, + "grad_norm": 0.76171875, + "learning_rate": 0.0001522469065131372, + "loss": 0.8839, + "step": 21510 + }, + { + "epoch": 0.5523421614742562, + "grad_norm": 0.80078125, + "learning_rate": 0.0001522431000227183, + "loss": 0.8636, + "step": 21511 + }, + { + "epoch": 0.552367838670178, + "grad_norm": 0.76953125, + "learning_rate": 0.00015223929342818364, + "loss": 0.8902, + "step": 21512 + }, + { + "epoch": 0.5523935158660999, + "grad_norm": 0.71875, + "learning_rate": 0.00015223548672954075, + "loss": 0.7035, + "step": 21513 + }, + { + "epoch": 0.5524191930620217, + "grad_norm": 0.74609375, + "learning_rate": 0.0001522316799267973, + "loss": 0.9007, + "step": 21514 + }, + { + "epoch": 0.5524448702579435, + "grad_norm": 0.78125, + "learning_rate": 0.0001522278730199608, + "loss": 0.8482, + "step": 21515 + }, + { + "epoch": 0.5524705474538653, + "grad_norm": 0.796875, + "learning_rate": 0.00015222406600903888, + "loss": 0.9166, + "step": 21516 + }, + { + "epoch": 0.5524962246497871, + "grad_norm": 0.76953125, + "learning_rate": 0.00015222025889403908, + "loss": 0.8869, + "step": 21517 + }, + { + "epoch": 0.5525219018457089, + "grad_norm": 0.76953125, + "learning_rate": 0.00015221645167496904, + "loss": 0.883, + "step": 21518 + }, + { + "epoch": 0.5525475790416308, + "grad_norm": 0.7421875, + "learning_rate": 0.00015221264435183633, + "loss": 1.0411, + "step": 21519 + }, + { + "epoch": 0.5525732562375526, + "grad_norm": 0.71875, + "learning_rate": 0.00015220883692464854, + "loss": 0.9861, + "step": 21520 + }, + { + "epoch": 0.5525989334334744, + "grad_norm": 0.74609375, + "learning_rate": 0.00015220502939341325, + "loss": 0.8554, + "step": 21521 + }, + { + "epoch": 0.5526246106293962, + "grad_norm": 0.8046875, + "learning_rate": 0.00015220122175813806, + "loss": 0.8728, + "step": 21522 + }, + { + "epoch": 0.552650287825318, + "grad_norm": 0.8125, + "learning_rate": 0.00015219741401883052, + "loss": 0.8801, + "step": 21523 + }, + { + "epoch": 0.5526759650212398, + "grad_norm": 0.8046875, + "learning_rate": 0.0001521936061754983, + "loss": 0.9331, + "step": 21524 + }, + { + "epoch": 0.5527016422171617, + "grad_norm": 0.9609375, + "learning_rate": 0.00015218979822814892, + "loss": 0.8692, + "step": 21525 + }, + { + "epoch": 0.5527273194130835, + "grad_norm": 0.875, + "learning_rate": 0.00015218599017678995, + "loss": 0.9058, + "step": 21526 + }, + { + "epoch": 0.5527529966090053, + "grad_norm": 0.7578125, + "learning_rate": 0.00015218218202142907, + "loss": 0.8059, + "step": 21527 + }, + { + "epoch": 0.5527786738049272, + "grad_norm": 0.8203125, + "learning_rate": 0.00015217837376207375, + "loss": 0.9862, + "step": 21528 + }, + { + "epoch": 0.5528043510008489, + "grad_norm": 0.828125, + "learning_rate": 0.0001521745653987317, + "loss": 0.8952, + "step": 21529 + }, + { + "epoch": 0.5528300281967707, + "grad_norm": 0.7734375, + "learning_rate": 0.00015217075693141046, + "loss": 0.9131, + "step": 21530 + }, + { + "epoch": 0.5528557053926926, + "grad_norm": 0.91015625, + "learning_rate": 0.00015216694836011756, + "loss": 0.9277, + "step": 21531 + }, + { + "epoch": 0.5528813825886144, + "grad_norm": 0.74609375, + "learning_rate": 0.00015216313968486066, + "loss": 0.92, + "step": 21532 + }, + { + "epoch": 0.5529070597845362, + "grad_norm": 0.73828125, + "learning_rate": 0.00015215933090564736, + "loss": 0.8916, + "step": 21533 + }, + { + "epoch": 0.5529327369804581, + "grad_norm": 0.765625, + "learning_rate": 0.00015215552202248525, + "loss": 0.8576, + "step": 21534 + }, + { + "epoch": 0.5529584141763799, + "grad_norm": 0.859375, + "learning_rate": 0.00015215171303538182, + "loss": 0.9678, + "step": 21535 + }, + { + "epoch": 0.5529840913723016, + "grad_norm": 0.75, + "learning_rate": 0.0001521479039443448, + "loss": 0.8853, + "step": 21536 + }, + { + "epoch": 0.5530097685682235, + "grad_norm": 0.8046875, + "learning_rate": 0.0001521440947493817, + "loss": 0.905, + "step": 21537 + }, + { + "epoch": 0.5530354457641453, + "grad_norm": 0.72265625, + "learning_rate": 0.00015214028545050011, + "loss": 0.857, + "step": 21538 + }, + { + "epoch": 0.5530611229600672, + "grad_norm": 0.73828125, + "learning_rate": 0.00015213647604770764, + "loss": 0.8802, + "step": 21539 + }, + { + "epoch": 0.553086800155989, + "grad_norm": 0.7734375, + "learning_rate": 0.00015213266654101192, + "loss": 0.8724, + "step": 21540 + }, + { + "epoch": 0.5531124773519108, + "grad_norm": 0.79296875, + "learning_rate": 0.0001521288569304205, + "loss": 0.8258, + "step": 21541 + }, + { + "epoch": 0.5531381545478326, + "grad_norm": 0.71484375, + "learning_rate": 0.00015212504721594093, + "loss": 0.7669, + "step": 21542 + }, + { + "epoch": 0.5531638317437544, + "grad_norm": 0.88671875, + "learning_rate": 0.00015212123739758088, + "loss": 0.8712, + "step": 21543 + }, + { + "epoch": 0.5531895089396762, + "grad_norm": 0.76171875, + "learning_rate": 0.0001521174274753479, + "loss": 0.8663, + "step": 21544 + }, + { + "epoch": 0.5532151861355981, + "grad_norm": 0.78515625, + "learning_rate": 0.0001521136174492496, + "loss": 0.8702, + "step": 21545 + }, + { + "epoch": 0.5532408633315199, + "grad_norm": 0.796875, + "learning_rate": 0.00015210980731929357, + "loss": 0.9372, + "step": 21546 + }, + { + "epoch": 0.5532665405274417, + "grad_norm": 0.828125, + "learning_rate": 0.0001521059970854874, + "loss": 0.8281, + "step": 21547 + }, + { + "epoch": 0.5532922177233636, + "grad_norm": 0.8125, + "learning_rate": 0.00015210218674783873, + "loss": 0.7993, + "step": 21548 + }, + { + "epoch": 0.5533178949192853, + "grad_norm": 0.8125, + "learning_rate": 0.00015209837630635503, + "loss": 0.8192, + "step": 21549 + }, + { + "epoch": 0.5533435721152071, + "grad_norm": 0.7734375, + "learning_rate": 0.00015209456576104402, + "loss": 0.8345, + "step": 21550 + }, + { + "epoch": 0.553369249311129, + "grad_norm": 0.80078125, + "learning_rate": 0.00015209075511191325, + "loss": 0.8663, + "step": 21551 + }, + { + "epoch": 0.5533949265070508, + "grad_norm": 1.0859375, + "learning_rate": 0.0001520869443589703, + "loss": 0.8228, + "step": 21552 + }, + { + "epoch": 0.5534206037029726, + "grad_norm": 0.78125, + "learning_rate": 0.00015208313350222275, + "loss": 0.881, + "step": 21553 + }, + { + "epoch": 0.5534462808988945, + "grad_norm": 0.83984375, + "learning_rate": 0.00015207932254167824, + "loss": 0.9609, + "step": 21554 + }, + { + "epoch": 0.5534719580948163, + "grad_norm": 0.875, + "learning_rate": 0.00015207551147734436, + "loss": 0.9009, + "step": 21555 + }, + { + "epoch": 0.553497635290738, + "grad_norm": 0.8046875, + "learning_rate": 0.00015207170030922867, + "loss": 0.9749, + "step": 21556 + }, + { + "epoch": 0.5535233124866599, + "grad_norm": 0.75, + "learning_rate": 0.00015206788903733876, + "loss": 0.8961, + "step": 21557 + }, + { + "epoch": 0.5535489896825817, + "grad_norm": 0.8203125, + "learning_rate": 0.0001520640776616823, + "loss": 0.9246, + "step": 21558 + }, + { + "epoch": 0.5535746668785035, + "grad_norm": 0.8125, + "learning_rate": 0.0001520602661822668, + "loss": 0.9438, + "step": 21559 + }, + { + "epoch": 0.5536003440744254, + "grad_norm": 0.72265625, + "learning_rate": 0.00015205645459909992, + "loss": 0.8403, + "step": 21560 + }, + { + "epoch": 0.5536260212703472, + "grad_norm": 0.76171875, + "learning_rate": 0.0001520526429121892, + "loss": 0.8991, + "step": 21561 + }, + { + "epoch": 0.5536516984662689, + "grad_norm": 0.7734375, + "learning_rate": 0.0001520488311215423, + "loss": 0.9643, + "step": 21562 + }, + { + "epoch": 0.5536773756621908, + "grad_norm": 0.81640625, + "learning_rate": 0.00015204501922716675, + "loss": 0.9106, + "step": 21563 + }, + { + "epoch": 0.5537030528581126, + "grad_norm": 0.7421875, + "learning_rate": 0.00015204120722907017, + "loss": 0.9326, + "step": 21564 + }, + { + "epoch": 0.5537287300540344, + "grad_norm": 0.82421875, + "learning_rate": 0.0001520373951272602, + "loss": 0.8437, + "step": 21565 + }, + { + "epoch": 0.5537544072499563, + "grad_norm": 0.76953125, + "learning_rate": 0.0001520335829217444, + "loss": 0.8023, + "step": 21566 + }, + { + "epoch": 0.5537800844458781, + "grad_norm": 0.78125, + "learning_rate": 0.00015202977061253036, + "loss": 0.8686, + "step": 21567 + }, + { + "epoch": 0.5538057616418, + "grad_norm": 0.80859375, + "learning_rate": 0.00015202595819962565, + "loss": 0.9253, + "step": 21568 + }, + { + "epoch": 0.5538314388377217, + "grad_norm": 0.76953125, + "learning_rate": 0.00015202214568303796, + "loss": 0.774, + "step": 21569 + }, + { + "epoch": 0.5538571160336435, + "grad_norm": 0.80078125, + "learning_rate": 0.0001520183330627748, + "loss": 0.8016, + "step": 21570 + }, + { + "epoch": 0.5538827932295654, + "grad_norm": 0.78125, + "learning_rate": 0.0001520145203388438, + "loss": 0.8217, + "step": 21571 + }, + { + "epoch": 0.5539084704254872, + "grad_norm": 0.8203125, + "learning_rate": 0.0001520107075112526, + "loss": 0.7748, + "step": 21572 + }, + { + "epoch": 0.553934147621409, + "grad_norm": 0.7734375, + "learning_rate": 0.00015200689458000872, + "loss": 0.9216, + "step": 21573 + }, + { + "epoch": 0.5539598248173309, + "grad_norm": 0.95703125, + "learning_rate": 0.0001520030815451198, + "loss": 0.8015, + "step": 21574 + }, + { + "epoch": 0.5539855020132527, + "grad_norm": 0.890625, + "learning_rate": 0.00015199926840659344, + "loss": 0.8925, + "step": 21575 + }, + { + "epoch": 0.5540111792091744, + "grad_norm": 0.765625, + "learning_rate": 0.00015199545516443724, + "loss": 0.9824, + "step": 21576 + }, + { + "epoch": 0.5540368564050963, + "grad_norm": 0.71484375, + "learning_rate": 0.0001519916418186588, + "loss": 0.9425, + "step": 21577 + }, + { + "epoch": 0.5540625336010181, + "grad_norm": 0.7578125, + "learning_rate": 0.00015198782836926567, + "loss": 0.9527, + "step": 21578 + }, + { + "epoch": 0.5540882107969399, + "grad_norm": 0.82421875, + "learning_rate": 0.00015198401481626554, + "loss": 0.8795, + "step": 21579 + }, + { + "epoch": 0.5541138879928618, + "grad_norm": 0.78515625, + "learning_rate": 0.00015198020115966598, + "loss": 0.7559, + "step": 21580 + }, + { + "epoch": 0.5541395651887836, + "grad_norm": 0.8203125, + "learning_rate": 0.00015197638739947452, + "loss": 0.9592, + "step": 21581 + }, + { + "epoch": 0.5541652423847053, + "grad_norm": 0.76953125, + "learning_rate": 0.00015197257353569886, + "loss": 0.7497, + "step": 21582 + }, + { + "epoch": 0.5541909195806272, + "grad_norm": 0.8359375, + "learning_rate": 0.00015196875956834655, + "loss": 0.9721, + "step": 21583 + }, + { + "epoch": 0.554216596776549, + "grad_norm": 0.73828125, + "learning_rate": 0.00015196494549742516, + "loss": 0.8821, + "step": 21584 + }, + { + "epoch": 0.5542422739724708, + "grad_norm": 0.80078125, + "learning_rate": 0.00015196113132294237, + "loss": 0.8736, + "step": 21585 + }, + { + "epoch": 0.5542679511683927, + "grad_norm": 0.80078125, + "learning_rate": 0.0001519573170449057, + "loss": 0.8581, + "step": 21586 + }, + { + "epoch": 0.5542936283643145, + "grad_norm": 0.76953125, + "learning_rate": 0.00015195350266332283, + "loss": 0.9626, + "step": 21587 + }, + { + "epoch": 0.5543193055602363, + "grad_norm": 0.796875, + "learning_rate": 0.0001519496881782013, + "loss": 0.9283, + "step": 21588 + }, + { + "epoch": 0.5543449827561581, + "grad_norm": 1.65625, + "learning_rate": 0.00015194587358954875, + "loss": 0.8632, + "step": 21589 + }, + { + "epoch": 0.5543706599520799, + "grad_norm": 0.75, + "learning_rate": 0.00015194205889737275, + "loss": 0.868, + "step": 21590 + }, + { + "epoch": 0.5543963371480017, + "grad_norm": 0.74609375, + "learning_rate": 0.0001519382441016809, + "loss": 0.7988, + "step": 21591 + }, + { + "epoch": 0.5544220143439236, + "grad_norm": 0.73046875, + "learning_rate": 0.00015193442920248085, + "loss": 0.8434, + "step": 21592 + }, + { + "epoch": 0.5544476915398454, + "grad_norm": 0.7890625, + "learning_rate": 0.0001519306141997802, + "loss": 0.8094, + "step": 21593 + }, + { + "epoch": 0.5544733687357672, + "grad_norm": 0.7421875, + "learning_rate": 0.00015192679909358648, + "loss": 0.7353, + "step": 21594 + }, + { + "epoch": 0.5544990459316891, + "grad_norm": 0.796875, + "learning_rate": 0.00015192298388390734, + "loss": 0.863, + "step": 21595 + }, + { + "epoch": 0.5545247231276108, + "grad_norm": 0.78125, + "learning_rate": 0.00015191916857075038, + "loss": 0.802, + "step": 21596 + }, + { + "epoch": 0.5545504003235326, + "grad_norm": 0.7734375, + "learning_rate": 0.00015191535315412327, + "loss": 0.9707, + "step": 21597 + }, + { + "epoch": 0.5545760775194545, + "grad_norm": 0.74609375, + "learning_rate": 0.00015191153763403347, + "loss": 0.9006, + "step": 21598 + }, + { + "epoch": 0.5546017547153763, + "grad_norm": 0.75, + "learning_rate": 0.0001519077220104887, + "loss": 0.6936, + "step": 21599 + }, + { + "epoch": 0.5546274319112982, + "grad_norm": 0.7578125, + "learning_rate": 0.00015190390628349654, + "loss": 0.9956, + "step": 21600 + }, + { + "epoch": 0.55465310910722, + "grad_norm": 0.6953125, + "learning_rate": 0.00015190009045306456, + "loss": 0.9802, + "step": 21601 + }, + { + "epoch": 0.5546787863031417, + "grad_norm": 0.81640625, + "learning_rate": 0.00015189627451920042, + "loss": 0.8254, + "step": 21602 + }, + { + "epoch": 0.5547044634990635, + "grad_norm": 0.75390625, + "learning_rate": 0.00015189245848191166, + "loss": 0.7907, + "step": 21603 + }, + { + "epoch": 0.5547301406949854, + "grad_norm": 0.78125, + "learning_rate": 0.00015188864234120592, + "loss": 0.848, + "step": 21604 + }, + { + "epoch": 0.5547558178909072, + "grad_norm": 0.7421875, + "learning_rate": 0.00015188482609709082, + "loss": 0.9417, + "step": 21605 + }, + { + "epoch": 0.5547814950868291, + "grad_norm": 0.7578125, + "learning_rate": 0.00015188100974957393, + "loss": 0.8847, + "step": 21606 + }, + { + "epoch": 0.5548071722827509, + "grad_norm": 0.77734375, + "learning_rate": 0.0001518771932986629, + "loss": 0.8683, + "step": 21607 + }, + { + "epoch": 0.5548328494786727, + "grad_norm": 0.82421875, + "learning_rate": 0.0001518733767443653, + "loss": 0.8472, + "step": 21608 + }, + { + "epoch": 0.5548585266745945, + "grad_norm": 0.7890625, + "learning_rate": 0.00015186956008668873, + "loss": 0.8813, + "step": 21609 + }, + { + "epoch": 0.5548842038705163, + "grad_norm": 0.79296875, + "learning_rate": 0.00015186574332564084, + "loss": 0.9647, + "step": 21610 + }, + { + "epoch": 0.5549098810664381, + "grad_norm": 0.78515625, + "learning_rate": 0.00015186192646122917, + "loss": 0.7904, + "step": 21611 + }, + { + "epoch": 0.55493555826236, + "grad_norm": 0.78125, + "learning_rate": 0.00015185810949346138, + "loss": 0.6925, + "step": 21612 + }, + { + "epoch": 0.5549612354582818, + "grad_norm": 0.828125, + "learning_rate": 0.0001518542924223451, + "loss": 1.0157, + "step": 21613 + }, + { + "epoch": 0.5549869126542036, + "grad_norm": 0.8203125, + "learning_rate": 0.00015185047524788784, + "loss": 0.9518, + "step": 21614 + }, + { + "epoch": 0.5550125898501255, + "grad_norm": 0.8125, + "learning_rate": 0.00015184665797009732, + "loss": 0.8432, + "step": 21615 + }, + { + "epoch": 0.5550382670460472, + "grad_norm": 0.76171875, + "learning_rate": 0.00015184284058898103, + "loss": 0.7999, + "step": 21616 + }, + { + "epoch": 0.555063944241969, + "grad_norm": 0.76953125, + "learning_rate": 0.0001518390231045467, + "loss": 0.7813, + "step": 21617 + }, + { + "epoch": 0.5550896214378909, + "grad_norm": 0.703125, + "learning_rate": 0.00015183520551680186, + "loss": 0.782, + "step": 21618 + }, + { + "epoch": 0.5551152986338127, + "grad_norm": 0.72265625, + "learning_rate": 0.00015183138782575415, + "loss": 0.7184, + "step": 21619 + }, + { + "epoch": 0.5551409758297345, + "grad_norm": 0.8359375, + "learning_rate": 0.00015182757003141115, + "loss": 0.9459, + "step": 21620 + }, + { + "epoch": 0.5551666530256564, + "grad_norm": 0.828125, + "learning_rate": 0.00015182375213378045, + "loss": 0.9671, + "step": 21621 + }, + { + "epoch": 0.5551923302215781, + "grad_norm": 0.796875, + "learning_rate": 0.00015181993413286975, + "loss": 0.8764, + "step": 21622 + }, + { + "epoch": 0.5552180074174999, + "grad_norm": 0.7890625, + "learning_rate": 0.0001518161160286866, + "loss": 0.8126, + "step": 21623 + }, + { + "epoch": 0.5552436846134218, + "grad_norm": 0.7421875, + "learning_rate": 0.0001518122978212386, + "loss": 0.7565, + "step": 21624 + }, + { + "epoch": 0.5552693618093436, + "grad_norm": 0.765625, + "learning_rate": 0.00015180847951053335, + "loss": 0.9081, + "step": 21625 + }, + { + "epoch": 0.5552950390052654, + "grad_norm": 0.76171875, + "learning_rate": 0.0001518046610965785, + "loss": 0.8065, + "step": 21626 + }, + { + "epoch": 0.5553207162011873, + "grad_norm": 0.796875, + "learning_rate": 0.00015180084257938162, + "loss": 0.8276, + "step": 21627 + }, + { + "epoch": 0.5553463933971091, + "grad_norm": 0.7109375, + "learning_rate": 0.00015179702395895037, + "loss": 0.8144, + "step": 21628 + }, + { + "epoch": 0.5553720705930308, + "grad_norm": 0.828125, + "learning_rate": 0.0001517932052352923, + "loss": 0.8888, + "step": 21629 + }, + { + "epoch": 0.5553977477889527, + "grad_norm": 0.74609375, + "learning_rate": 0.00015178938640841508, + "loss": 0.846, + "step": 21630 + }, + { + "epoch": 0.5554234249848745, + "grad_norm": 0.8046875, + "learning_rate": 0.00015178556747832625, + "loss": 0.9029, + "step": 21631 + }, + { + "epoch": 0.5554491021807964, + "grad_norm": 0.75, + "learning_rate": 0.0001517817484450335, + "loss": 0.871, + "step": 21632 + }, + { + "epoch": 0.5554747793767182, + "grad_norm": 0.79296875, + "learning_rate": 0.0001517779293085444, + "loss": 0.8974, + "step": 21633 + }, + { + "epoch": 0.55550045657264, + "grad_norm": 0.81640625, + "learning_rate": 0.00015177411006886657, + "loss": 0.8299, + "step": 21634 + }, + { + "epoch": 0.5555261337685619, + "grad_norm": 0.8515625, + "learning_rate": 0.0001517702907260076, + "loss": 0.9006, + "step": 21635 + }, + { + "epoch": 0.5555518109644836, + "grad_norm": 1.0234375, + "learning_rate": 0.0001517664712799751, + "loss": 0.9948, + "step": 21636 + }, + { + "epoch": 0.5555774881604054, + "grad_norm": 0.79296875, + "learning_rate": 0.00015176265173077672, + "loss": 0.8831, + "step": 21637 + }, + { + "epoch": 0.5556031653563273, + "grad_norm": 0.765625, + "learning_rate": 0.00015175883207842002, + "loss": 0.8605, + "step": 21638 + }, + { + "epoch": 0.5556288425522491, + "grad_norm": 0.828125, + "learning_rate": 0.00015175501232291268, + "loss": 1.1522, + "step": 21639 + }, + { + "epoch": 0.5556545197481709, + "grad_norm": 0.73828125, + "learning_rate": 0.0001517511924642623, + "loss": 0.8043, + "step": 21640 + }, + { + "epoch": 0.5556801969440928, + "grad_norm": 0.81640625, + "learning_rate": 0.0001517473725024764, + "loss": 1.0183, + "step": 21641 + }, + { + "epoch": 0.5557058741400145, + "grad_norm": 0.84765625, + "learning_rate": 0.0001517435524375627, + "loss": 0.9088, + "step": 21642 + }, + { + "epoch": 0.5557315513359363, + "grad_norm": 0.76953125, + "learning_rate": 0.00015173973226952878, + "loss": 0.8781, + "step": 21643 + }, + { + "epoch": 0.5557572285318582, + "grad_norm": 0.8515625, + "learning_rate": 0.00015173591199838223, + "loss": 0.8538, + "step": 21644 + }, + { + "epoch": 0.55578290572778, + "grad_norm": 0.7265625, + "learning_rate": 0.0001517320916241307, + "loss": 0.8839, + "step": 21645 + }, + { + "epoch": 0.5558085829237018, + "grad_norm": 0.765625, + "learning_rate": 0.00015172827114678177, + "loss": 0.8037, + "step": 21646 + }, + { + "epoch": 0.5558342601196237, + "grad_norm": 0.75390625, + "learning_rate": 0.00015172445056634305, + "loss": 0.8634, + "step": 21647 + }, + { + "epoch": 0.5558599373155455, + "grad_norm": 0.71875, + "learning_rate": 0.0001517206298828222, + "loss": 0.7706, + "step": 21648 + }, + { + "epoch": 0.5558856145114672, + "grad_norm": 0.8125, + "learning_rate": 0.0001517168090962268, + "loss": 0.9203, + "step": 21649 + }, + { + "epoch": 0.5559112917073891, + "grad_norm": 0.80078125, + "learning_rate": 0.00015171298820656448, + "loss": 0.8726, + "step": 21650 + }, + { + "epoch": 0.5559369689033109, + "grad_norm": 0.80078125, + "learning_rate": 0.0001517091672138428, + "loss": 0.8598, + "step": 21651 + }, + { + "epoch": 0.5559626460992327, + "grad_norm": 0.74609375, + "learning_rate": 0.00015170534611806948, + "loss": 0.8486, + "step": 21652 + }, + { + "epoch": 0.5559883232951546, + "grad_norm": 0.8828125, + "learning_rate": 0.00015170152491925202, + "loss": 0.8748, + "step": 21653 + }, + { + "epoch": 0.5560140004910764, + "grad_norm": 0.75390625, + "learning_rate": 0.00015169770361739816, + "loss": 0.8613, + "step": 21654 + }, + { + "epoch": 0.5560396776869982, + "grad_norm": 0.828125, + "learning_rate": 0.0001516938822125154, + "loss": 0.9499, + "step": 21655 + }, + { + "epoch": 0.55606535488292, + "grad_norm": 0.77734375, + "learning_rate": 0.0001516900607046114, + "loss": 0.8181, + "step": 21656 + }, + { + "epoch": 0.5560910320788418, + "grad_norm": 0.875, + "learning_rate": 0.00015168623909369376, + "loss": 0.9045, + "step": 21657 + }, + { + "epoch": 0.5561167092747636, + "grad_norm": 0.85546875, + "learning_rate": 0.0001516824173797702, + "loss": 1.0133, + "step": 21658 + }, + { + "epoch": 0.5561423864706855, + "grad_norm": 0.74609375, + "learning_rate": 0.00015167859556284818, + "loss": 0.7366, + "step": 21659 + }, + { + "epoch": 0.5561680636666073, + "grad_norm": 0.75390625, + "learning_rate": 0.0001516747736429354, + "loss": 0.8139, + "step": 21660 + }, + { + "epoch": 0.5561937408625292, + "grad_norm": 0.6796875, + "learning_rate": 0.00015167095162003946, + "loss": 0.6514, + "step": 21661 + }, + { + "epoch": 0.5562194180584509, + "grad_norm": 0.73828125, + "learning_rate": 0.00015166712949416796, + "loss": 0.776, + "step": 21662 + }, + { + "epoch": 0.5562450952543727, + "grad_norm": 0.72265625, + "learning_rate": 0.0001516633072653286, + "loss": 0.8507, + "step": 21663 + }, + { + "epoch": 0.5562707724502945, + "grad_norm": 0.72265625, + "learning_rate": 0.00015165948493352886, + "loss": 0.7641, + "step": 21664 + }, + { + "epoch": 0.5562964496462164, + "grad_norm": 0.83203125, + "learning_rate": 0.00015165566249877648, + "loss": 0.8091, + "step": 21665 + }, + { + "epoch": 0.5563221268421382, + "grad_norm": 0.82421875, + "learning_rate": 0.00015165183996107903, + "loss": 0.954, + "step": 21666 + }, + { + "epoch": 0.5563478040380601, + "grad_norm": 0.81640625, + "learning_rate": 0.00015164801732044412, + "loss": 0.9911, + "step": 21667 + }, + { + "epoch": 0.5563734812339819, + "grad_norm": 0.7734375, + "learning_rate": 0.0001516441945768794, + "loss": 0.8652, + "step": 21668 + }, + { + "epoch": 0.5563991584299036, + "grad_norm": 0.75, + "learning_rate": 0.0001516403717303924, + "loss": 0.8863, + "step": 21669 + }, + { + "epoch": 0.5564248356258255, + "grad_norm": 0.78125, + "learning_rate": 0.00015163654878099086, + "loss": 0.9205, + "step": 21670 + }, + { + "epoch": 0.5564505128217473, + "grad_norm": 0.73046875, + "learning_rate": 0.00015163272572868235, + "loss": 0.8841, + "step": 21671 + }, + { + "epoch": 0.5564761900176691, + "grad_norm": 0.765625, + "learning_rate": 0.00015162890257347446, + "loss": 0.8337, + "step": 21672 + }, + { + "epoch": 0.556501867213591, + "grad_norm": 0.765625, + "learning_rate": 0.00015162507931537484, + "loss": 0.8178, + "step": 21673 + }, + { + "epoch": 0.5565275444095128, + "grad_norm": 0.8203125, + "learning_rate": 0.00015162125595439107, + "loss": 0.9675, + "step": 21674 + }, + { + "epoch": 0.5565532216054346, + "grad_norm": 0.78515625, + "learning_rate": 0.00015161743249053082, + "loss": 0.9047, + "step": 21675 + }, + { + "epoch": 0.5565788988013564, + "grad_norm": 0.71484375, + "learning_rate": 0.0001516136089238017, + "loss": 0.8213, + "step": 21676 + }, + { + "epoch": 0.5566045759972782, + "grad_norm": 0.6953125, + "learning_rate": 0.00015160978525421132, + "loss": 0.6929, + "step": 21677 + }, + { + "epoch": 0.5566302531932, + "grad_norm": 0.7734375, + "learning_rate": 0.00015160596148176727, + "loss": 0.8681, + "step": 21678 + }, + { + "epoch": 0.5566559303891219, + "grad_norm": 0.73046875, + "learning_rate": 0.00015160213760647724, + "loss": 0.8222, + "step": 21679 + }, + { + "epoch": 0.5566816075850437, + "grad_norm": 0.83203125, + "learning_rate": 0.00015159831362834876, + "loss": 0.8973, + "step": 21680 + }, + { + "epoch": 0.5567072847809655, + "grad_norm": 0.73046875, + "learning_rate": 0.00015159448954738956, + "loss": 0.8673, + "step": 21681 + }, + { + "epoch": 0.5567329619768873, + "grad_norm": 0.82421875, + "learning_rate": 0.00015159066536360718, + "loss": 0.8928, + "step": 21682 + }, + { + "epoch": 0.5567586391728091, + "grad_norm": 0.80078125, + "learning_rate": 0.00015158684107700925, + "loss": 0.8839, + "step": 21683 + }, + { + "epoch": 0.5567843163687309, + "grad_norm": 0.7578125, + "learning_rate": 0.00015158301668760342, + "loss": 0.8538, + "step": 21684 + }, + { + "epoch": 0.5568099935646528, + "grad_norm": 0.7421875, + "learning_rate": 0.00015157919219539729, + "loss": 0.8767, + "step": 21685 + }, + { + "epoch": 0.5568356707605746, + "grad_norm": 0.8046875, + "learning_rate": 0.0001515753676003985, + "loss": 0.9575, + "step": 21686 + }, + { + "epoch": 0.5568613479564964, + "grad_norm": 0.7578125, + "learning_rate": 0.00015157154290261463, + "loss": 0.8112, + "step": 21687 + }, + { + "epoch": 0.5568870251524183, + "grad_norm": 0.80859375, + "learning_rate": 0.00015156771810205337, + "loss": 1.006, + "step": 21688 + }, + { + "epoch": 0.55691270234834, + "grad_norm": 0.80859375, + "learning_rate": 0.0001515638931987223, + "loss": 0.9079, + "step": 21689 + }, + { + "epoch": 0.5569383795442618, + "grad_norm": 0.73046875, + "learning_rate": 0.00015156006819262903, + "loss": 0.7452, + "step": 21690 + }, + { + "epoch": 0.5569640567401837, + "grad_norm": 0.73046875, + "learning_rate": 0.00015155624308378122, + "loss": 0.7185, + "step": 21691 + }, + { + "epoch": 0.5569897339361055, + "grad_norm": 0.7734375, + "learning_rate": 0.00015155241787218648, + "loss": 0.9733, + "step": 21692 + }, + { + "epoch": 0.5570154111320273, + "grad_norm": 0.71484375, + "learning_rate": 0.00015154859255785242, + "loss": 0.8218, + "step": 21693 + }, + { + "epoch": 0.5570410883279492, + "grad_norm": 0.8125, + "learning_rate": 0.00015154476714078664, + "loss": 0.8401, + "step": 21694 + }, + { + "epoch": 0.5570667655238709, + "grad_norm": 0.81640625, + "learning_rate": 0.00015154094162099684, + "loss": 0.823, + "step": 21695 + }, + { + "epoch": 0.5570924427197927, + "grad_norm": 0.80078125, + "learning_rate": 0.00015153711599849055, + "loss": 0.9757, + "step": 21696 + }, + { + "epoch": 0.5571181199157146, + "grad_norm": 0.7734375, + "learning_rate": 0.0001515332902732755, + "loss": 0.7936, + "step": 21697 + }, + { + "epoch": 0.5571437971116364, + "grad_norm": 0.79296875, + "learning_rate": 0.00015152946444535923, + "loss": 0.885, + "step": 21698 + }, + { + "epoch": 0.5571694743075583, + "grad_norm": 0.81640625, + "learning_rate": 0.00015152563851474938, + "loss": 0.937, + "step": 21699 + }, + { + "epoch": 0.5571951515034801, + "grad_norm": 0.77734375, + "learning_rate": 0.00015152181248145362, + "loss": 0.9286, + "step": 21700 + }, + { + "epoch": 0.5572208286994019, + "grad_norm": 0.78125, + "learning_rate": 0.00015151798634547951, + "loss": 0.8851, + "step": 21701 + }, + { + "epoch": 0.5572465058953237, + "grad_norm": 0.75390625, + "learning_rate": 0.00015151416010683472, + "loss": 0.7855, + "step": 21702 + }, + { + "epoch": 0.5572721830912455, + "grad_norm": 0.76953125, + "learning_rate": 0.00015151033376552684, + "loss": 0.7819, + "step": 21703 + }, + { + "epoch": 0.5572978602871673, + "grad_norm": 0.7265625, + "learning_rate": 0.00015150650732156354, + "loss": 0.8603, + "step": 21704 + }, + { + "epoch": 0.5573235374830892, + "grad_norm": 0.75390625, + "learning_rate": 0.00015150268077495244, + "loss": 0.9805, + "step": 21705 + }, + { + "epoch": 0.557349214679011, + "grad_norm": 0.76171875, + "learning_rate": 0.00015149885412570112, + "loss": 0.8831, + "step": 21706 + }, + { + "epoch": 0.5573748918749328, + "grad_norm": 0.84765625, + "learning_rate": 0.00015149502737381727, + "loss": 0.9925, + "step": 21707 + }, + { + "epoch": 0.5574005690708547, + "grad_norm": 0.73828125, + "learning_rate": 0.00015149120051930845, + "loss": 0.8254, + "step": 21708 + }, + { + "epoch": 0.5574262462667764, + "grad_norm": 0.65234375, + "learning_rate": 0.00015148737356218234, + "loss": 0.8604, + "step": 21709 + }, + { + "epoch": 0.5574519234626982, + "grad_norm": 0.76171875, + "learning_rate": 0.00015148354650244651, + "loss": 0.9533, + "step": 21710 + }, + { + "epoch": 0.5574776006586201, + "grad_norm": 0.78515625, + "learning_rate": 0.00015147971934010868, + "loss": 0.8316, + "step": 21711 + }, + { + "epoch": 0.5575032778545419, + "grad_norm": 0.83203125, + "learning_rate": 0.00015147589207517638, + "loss": 0.9957, + "step": 21712 + }, + { + "epoch": 0.5575289550504637, + "grad_norm": 0.8359375, + "learning_rate": 0.0001514720647076573, + "loss": 0.8792, + "step": 21713 + }, + { + "epoch": 0.5575546322463856, + "grad_norm": 0.7578125, + "learning_rate": 0.000151468237237559, + "loss": 0.8968, + "step": 21714 + }, + { + "epoch": 0.5575803094423073, + "grad_norm": 0.765625, + "learning_rate": 0.00015146440966488917, + "loss": 0.9332, + "step": 21715 + }, + { + "epoch": 0.5576059866382291, + "grad_norm": 0.734375, + "learning_rate": 0.00015146058198965543, + "loss": 0.8257, + "step": 21716 + }, + { + "epoch": 0.557631663834151, + "grad_norm": 0.81640625, + "learning_rate": 0.0001514567542118654, + "loss": 0.9077, + "step": 21717 + }, + { + "epoch": 0.5576573410300728, + "grad_norm": 0.7265625, + "learning_rate": 0.0001514529263315267, + "loss": 0.8311, + "step": 21718 + }, + { + "epoch": 0.5576830182259946, + "grad_norm": 0.765625, + "learning_rate": 0.000151449098348647, + "loss": 0.9544, + "step": 21719 + }, + { + "epoch": 0.5577086954219165, + "grad_norm": 0.8203125, + "learning_rate": 0.00015144527026323385, + "loss": 0.9388, + "step": 21720 + }, + { + "epoch": 0.5577343726178383, + "grad_norm": 0.78515625, + "learning_rate": 0.00015144144207529493, + "loss": 1.0556, + "step": 21721 + }, + { + "epoch": 0.55776004981376, + "grad_norm": 0.890625, + "learning_rate": 0.00015143761378483788, + "loss": 0.8252, + "step": 21722 + }, + { + "epoch": 0.5577857270096819, + "grad_norm": 0.73046875, + "learning_rate": 0.00015143378539187027, + "loss": 0.8428, + "step": 21723 + }, + { + "epoch": 0.5578114042056037, + "grad_norm": 0.796875, + "learning_rate": 0.0001514299568963998, + "loss": 0.8292, + "step": 21724 + }, + { + "epoch": 0.5578370814015255, + "grad_norm": 0.765625, + "learning_rate": 0.0001514261282984341, + "loss": 0.833, + "step": 21725 + }, + { + "epoch": 0.5578627585974474, + "grad_norm": 0.75390625, + "learning_rate": 0.00015142229959798074, + "loss": 0.9064, + "step": 21726 + }, + { + "epoch": 0.5578884357933692, + "grad_norm": 0.81640625, + "learning_rate": 0.0001514184707950474, + "loss": 1.0103, + "step": 21727 + }, + { + "epoch": 0.5579141129892911, + "grad_norm": 0.76171875, + "learning_rate": 0.00015141464188964167, + "loss": 0.8224, + "step": 21728 + }, + { + "epoch": 0.5579397901852128, + "grad_norm": 0.77734375, + "learning_rate": 0.00015141081288177122, + "loss": 0.8749, + "step": 21729 + }, + { + "epoch": 0.5579654673811346, + "grad_norm": 0.796875, + "learning_rate": 0.00015140698377144366, + "loss": 0.8147, + "step": 21730 + }, + { + "epoch": 0.5579911445770565, + "grad_norm": 0.85546875, + "learning_rate": 0.00015140315455866663, + "loss": 0.9387, + "step": 21731 + }, + { + "epoch": 0.5580168217729783, + "grad_norm": 0.78125, + "learning_rate": 0.00015139932524344777, + "loss": 0.9824, + "step": 21732 + }, + { + "epoch": 0.5580424989689001, + "grad_norm": 0.8671875, + "learning_rate": 0.00015139549582579464, + "loss": 0.9482, + "step": 21733 + }, + { + "epoch": 0.558068176164822, + "grad_norm": 0.78515625, + "learning_rate": 0.00015139166630571496, + "loss": 0.865, + "step": 21734 + }, + { + "epoch": 0.5580938533607437, + "grad_norm": 0.7890625, + "learning_rate": 0.00015138783668321634, + "loss": 0.9659, + "step": 21735 + }, + { + "epoch": 0.5581195305566655, + "grad_norm": 0.80859375, + "learning_rate": 0.0001513840069583064, + "loss": 0.9867, + "step": 21736 + }, + { + "epoch": 0.5581452077525874, + "grad_norm": 0.7421875, + "learning_rate": 0.0001513801771309928, + "loss": 0.9112, + "step": 21737 + }, + { + "epoch": 0.5581708849485092, + "grad_norm": 0.82421875, + "learning_rate": 0.0001513763472012831, + "loss": 0.9434, + "step": 21738 + }, + { + "epoch": 0.558196562144431, + "grad_norm": 0.78125, + "learning_rate": 0.000151372517169185, + "loss": 0.7955, + "step": 21739 + }, + { + "epoch": 0.5582222393403529, + "grad_norm": 0.75, + "learning_rate": 0.00015136868703470615, + "loss": 1.024, + "step": 21740 + }, + { + "epoch": 0.5582479165362747, + "grad_norm": 0.84375, + "learning_rate": 0.0001513648567978541, + "loss": 0.8521, + "step": 21741 + }, + { + "epoch": 0.5582735937321964, + "grad_norm": 0.8046875, + "learning_rate": 0.00015136102645863655, + "loss": 0.9581, + "step": 21742 + }, + { + "epoch": 0.5582992709281183, + "grad_norm": 0.76953125, + "learning_rate": 0.0001513571960170611, + "loss": 0.7561, + "step": 21743 + }, + { + "epoch": 0.5583249481240401, + "grad_norm": 0.8203125, + "learning_rate": 0.00015135336547313544, + "loss": 0.8387, + "step": 21744 + }, + { + "epoch": 0.5583506253199619, + "grad_norm": 0.79296875, + "learning_rate": 0.00015134953482686712, + "loss": 0.9411, + "step": 21745 + }, + { + "epoch": 0.5583763025158838, + "grad_norm": 0.76953125, + "learning_rate": 0.0001513457040782638, + "loss": 0.8493, + "step": 21746 + }, + { + "epoch": 0.5584019797118056, + "grad_norm": 0.7265625, + "learning_rate": 0.00015134187322733315, + "loss": 0.8592, + "step": 21747 + }, + { + "epoch": 0.5584276569077274, + "grad_norm": 0.91015625, + "learning_rate": 0.0001513380422740828, + "loss": 0.8144, + "step": 21748 + }, + { + "epoch": 0.5584533341036492, + "grad_norm": 0.7734375, + "learning_rate": 0.00015133421121852036, + "loss": 1.0346, + "step": 21749 + }, + { + "epoch": 0.558479011299571, + "grad_norm": 0.796875, + "learning_rate": 0.00015133038006065344, + "loss": 0.9976, + "step": 21750 + }, + { + "epoch": 0.5585046884954928, + "grad_norm": 0.7109375, + "learning_rate": 0.0001513265488004898, + "loss": 0.7381, + "step": 21751 + }, + { + "epoch": 0.5585303656914147, + "grad_norm": 0.76171875, + "learning_rate": 0.0001513227174380369, + "loss": 0.8497, + "step": 21752 + }, + { + "epoch": 0.5585560428873365, + "grad_norm": 0.8203125, + "learning_rate": 0.00015131888597330248, + "loss": 0.9217, + "step": 21753 + }, + { + "epoch": 0.5585817200832583, + "grad_norm": 0.77734375, + "learning_rate": 0.00015131505440629415, + "loss": 0.8005, + "step": 21754 + }, + { + "epoch": 0.5586073972791801, + "grad_norm": 0.79296875, + "learning_rate": 0.00015131122273701956, + "loss": 0.9284, + "step": 21755 + }, + { + "epoch": 0.5586330744751019, + "grad_norm": 0.77734375, + "learning_rate": 0.00015130739096548635, + "loss": 0.8363, + "step": 21756 + }, + { + "epoch": 0.5586587516710237, + "grad_norm": 0.7890625, + "learning_rate": 0.00015130355909170215, + "loss": 0.9707, + "step": 21757 + }, + { + "epoch": 0.5586844288669456, + "grad_norm": 0.73828125, + "learning_rate": 0.00015129972711567458, + "loss": 0.9657, + "step": 21758 + }, + { + "epoch": 0.5587101060628674, + "grad_norm": 0.75, + "learning_rate": 0.00015129589503741129, + "loss": 0.7865, + "step": 21759 + }, + { + "epoch": 0.5587357832587893, + "grad_norm": 0.890625, + "learning_rate": 0.0001512920628569199, + "loss": 0.881, + "step": 21760 + }, + { + "epoch": 0.5587614604547111, + "grad_norm": 0.7265625, + "learning_rate": 0.00015128823057420807, + "loss": 0.8165, + "step": 21761 + }, + { + "epoch": 0.5587871376506328, + "grad_norm": 0.76171875, + "learning_rate": 0.00015128439818928345, + "loss": 0.803, + "step": 21762 + }, + { + "epoch": 0.5588128148465547, + "grad_norm": 0.73046875, + "learning_rate": 0.0001512805657021536, + "loss": 0.8125, + "step": 21763 + }, + { + "epoch": 0.5588384920424765, + "grad_norm": 0.84765625, + "learning_rate": 0.0001512767331128263, + "loss": 1.0417, + "step": 21764 + }, + { + "epoch": 0.5588641692383983, + "grad_norm": 0.78125, + "learning_rate": 0.000151272900421309, + "loss": 0.8159, + "step": 21765 + }, + { + "epoch": 0.5588898464343202, + "grad_norm": 0.703125, + "learning_rate": 0.00015126906762760953, + "loss": 0.745, + "step": 21766 + }, + { + "epoch": 0.558915523630242, + "grad_norm": 0.78125, + "learning_rate": 0.0001512652347317354, + "loss": 0.9336, + "step": 21767 + }, + { + "epoch": 0.5589412008261638, + "grad_norm": 0.78515625, + "learning_rate": 0.00015126140173369427, + "loss": 0.812, + "step": 21768 + }, + { + "epoch": 0.5589668780220856, + "grad_norm": 0.8515625, + "learning_rate": 0.00015125756863349383, + "loss": 0.7618, + "step": 21769 + }, + { + "epoch": 0.5589925552180074, + "grad_norm": 0.7890625, + "learning_rate": 0.00015125373543114167, + "loss": 0.8524, + "step": 21770 + }, + { + "epoch": 0.5590182324139292, + "grad_norm": 0.96484375, + "learning_rate": 0.00015124990212664544, + "loss": 0.9137, + "step": 21771 + }, + { + "epoch": 0.5590439096098511, + "grad_norm": 0.82421875, + "learning_rate": 0.00015124606872001284, + "loss": 0.8807, + "step": 21772 + }, + { + "epoch": 0.5590695868057729, + "grad_norm": 0.7578125, + "learning_rate": 0.00015124223521125138, + "loss": 0.8125, + "step": 21773 + }, + { + "epoch": 0.5590952640016947, + "grad_norm": 0.7734375, + "learning_rate": 0.0001512384016003688, + "loss": 0.8237, + "step": 21774 + }, + { + "epoch": 0.5591209411976165, + "grad_norm": 0.8125, + "learning_rate": 0.00015123456788737274, + "loss": 0.7388, + "step": 21775 + }, + { + "epoch": 0.5591466183935383, + "grad_norm": 0.7265625, + "learning_rate": 0.00015123073407227077, + "loss": 0.7827, + "step": 21776 + }, + { + "epoch": 0.5591722955894601, + "grad_norm": 0.8515625, + "learning_rate": 0.00015122690015507062, + "loss": 0.9202, + "step": 21777 + }, + { + "epoch": 0.559197972785382, + "grad_norm": 0.81640625, + "learning_rate": 0.00015122306613577982, + "loss": 0.796, + "step": 21778 + }, + { + "epoch": 0.5592236499813038, + "grad_norm": 0.83984375, + "learning_rate": 0.00015121923201440613, + "loss": 0.9659, + "step": 21779 + }, + { + "epoch": 0.5592493271772256, + "grad_norm": 0.81640625, + "learning_rate": 0.00015121539779095714, + "loss": 0.7714, + "step": 21780 + }, + { + "epoch": 0.5592750043731475, + "grad_norm": 0.78515625, + "learning_rate": 0.00015121156346544046, + "loss": 1.026, + "step": 21781 + }, + { + "epoch": 0.5593006815690692, + "grad_norm": 0.7890625, + "learning_rate": 0.0001512077290378638, + "loss": 0.9426, + "step": 21782 + }, + { + "epoch": 0.559326358764991, + "grad_norm": 0.81640625, + "learning_rate": 0.00015120389450823472, + "loss": 1.02, + "step": 21783 + }, + { + "epoch": 0.5593520359609129, + "grad_norm": 0.8359375, + "learning_rate": 0.00015120005987656092, + "loss": 0.9898, + "step": 21784 + }, + { + "epoch": 0.5593777131568347, + "grad_norm": 0.8203125, + "learning_rate": 0.00015119622514285, + "loss": 0.8609, + "step": 21785 + }, + { + "epoch": 0.5594033903527565, + "grad_norm": 0.77734375, + "learning_rate": 0.00015119239030710966, + "loss": 0.9178, + "step": 21786 + }, + { + "epoch": 0.5594290675486784, + "grad_norm": 0.78515625, + "learning_rate": 0.0001511885553693475, + "loss": 0.9933, + "step": 21787 + }, + { + "epoch": 0.5594547447446002, + "grad_norm": 0.76171875, + "learning_rate": 0.00015118472032957117, + "loss": 0.7203, + "step": 21788 + }, + { + "epoch": 0.5594804219405219, + "grad_norm": 0.91015625, + "learning_rate": 0.00015118088518778833, + "loss": 0.8766, + "step": 21789 + }, + { + "epoch": 0.5595060991364438, + "grad_norm": 0.7578125, + "learning_rate": 0.00015117704994400662, + "loss": 0.9069, + "step": 21790 + }, + { + "epoch": 0.5595317763323656, + "grad_norm": 0.71875, + "learning_rate": 0.00015117321459823363, + "loss": 0.8289, + "step": 21791 + }, + { + "epoch": 0.5595574535282875, + "grad_norm": 0.734375, + "learning_rate": 0.00015116937915047706, + "loss": 0.9021, + "step": 21792 + }, + { + "epoch": 0.5595831307242093, + "grad_norm": 0.77734375, + "learning_rate": 0.00015116554360074456, + "loss": 0.8874, + "step": 21793 + }, + { + "epoch": 0.5596088079201311, + "grad_norm": 0.66796875, + "learning_rate": 0.0001511617079490437, + "loss": 0.9289, + "step": 21794 + }, + { + "epoch": 0.5596344851160528, + "grad_norm": 0.75390625, + "learning_rate": 0.00015115787219538223, + "loss": 0.9204, + "step": 21795 + }, + { + "epoch": 0.5596601623119747, + "grad_norm": 0.79296875, + "learning_rate": 0.00015115403633976775, + "loss": 0.9721, + "step": 21796 + }, + { + "epoch": 0.5596858395078965, + "grad_norm": 0.734375, + "learning_rate": 0.00015115020038220788, + "loss": 0.782, + "step": 21797 + }, + { + "epoch": 0.5597115167038184, + "grad_norm": 0.73046875, + "learning_rate": 0.00015114636432271027, + "loss": 0.8953, + "step": 21798 + }, + { + "epoch": 0.5597371938997402, + "grad_norm": 0.7890625, + "learning_rate": 0.0001511425281612826, + "loss": 0.9382, + "step": 21799 + }, + { + "epoch": 0.559762871095662, + "grad_norm": 0.80859375, + "learning_rate": 0.00015113869189793246, + "loss": 1.0234, + "step": 21800 + }, + { + "epoch": 0.5597885482915839, + "grad_norm": 0.8125, + "learning_rate": 0.00015113485553266753, + "loss": 0.913, + "step": 21801 + }, + { + "epoch": 0.5598142254875056, + "grad_norm": 0.875, + "learning_rate": 0.00015113101906549548, + "loss": 0.849, + "step": 21802 + }, + { + "epoch": 0.5598399026834274, + "grad_norm": 0.7734375, + "learning_rate": 0.00015112718249642388, + "loss": 0.9372, + "step": 21803 + }, + { + "epoch": 0.5598655798793493, + "grad_norm": 0.78515625, + "learning_rate": 0.0001511233458254605, + "loss": 0.9669, + "step": 21804 + }, + { + "epoch": 0.5598912570752711, + "grad_norm": 0.734375, + "learning_rate": 0.00015111950905261282, + "loss": 0.8815, + "step": 21805 + }, + { + "epoch": 0.5599169342711929, + "grad_norm": 0.76171875, + "learning_rate": 0.00015111567217788863, + "loss": 0.672, + "step": 21806 + }, + { + "epoch": 0.5599426114671148, + "grad_norm": 0.8203125, + "learning_rate": 0.0001511118352012955, + "loss": 0.7838, + "step": 21807 + }, + { + "epoch": 0.5599682886630366, + "grad_norm": 0.84375, + "learning_rate": 0.00015110799812284112, + "loss": 0.9445, + "step": 21808 + }, + { + "epoch": 0.5599939658589583, + "grad_norm": 0.734375, + "learning_rate": 0.0001511041609425331, + "loss": 0.7898, + "step": 21809 + }, + { + "epoch": 0.5600196430548802, + "grad_norm": 0.80078125, + "learning_rate": 0.0001511003236603791, + "loss": 0.9416, + "step": 21810 + }, + { + "epoch": 0.560045320250802, + "grad_norm": 0.8203125, + "learning_rate": 0.00015109648627638678, + "loss": 0.8612, + "step": 21811 + }, + { + "epoch": 0.5600709974467238, + "grad_norm": 0.734375, + "learning_rate": 0.00015109264879056383, + "loss": 0.9202, + "step": 21812 + }, + { + "epoch": 0.5600966746426457, + "grad_norm": 0.67578125, + "learning_rate": 0.00015108881120291776, + "loss": 0.8691, + "step": 21813 + }, + { + "epoch": 0.5601223518385675, + "grad_norm": 0.75390625, + "learning_rate": 0.00015108497351345635, + "loss": 1.0395, + "step": 21814 + }, + { + "epoch": 0.5601480290344892, + "grad_norm": 0.80859375, + "learning_rate": 0.00015108113572218722, + "loss": 0.8517, + "step": 21815 + }, + { + "epoch": 0.5601737062304111, + "grad_norm": 0.75, + "learning_rate": 0.00015107729782911793, + "loss": 0.9008, + "step": 21816 + }, + { + "epoch": 0.5601993834263329, + "grad_norm": 0.796875, + "learning_rate": 0.00015107345983425628, + "loss": 0.9533, + "step": 21817 + }, + { + "epoch": 0.5602250606222547, + "grad_norm": 0.76953125, + "learning_rate": 0.00015106962173760976, + "loss": 0.8692, + "step": 21818 + }, + { + "epoch": 0.5602507378181766, + "grad_norm": 0.8203125, + "learning_rate": 0.00015106578353918617, + "loss": 0.76, + "step": 21819 + }, + { + "epoch": 0.5602764150140984, + "grad_norm": 0.8125, + "learning_rate": 0.00015106194523899303, + "loss": 0.8161, + "step": 21820 + }, + { + "epoch": 0.5603020922100203, + "grad_norm": 0.77734375, + "learning_rate": 0.00015105810683703808, + "loss": 0.7919, + "step": 21821 + }, + { + "epoch": 0.560327769405942, + "grad_norm": 0.7734375, + "learning_rate": 0.0001510542683333289, + "loss": 0.7887, + "step": 21822 + }, + { + "epoch": 0.5603534466018638, + "grad_norm": 0.7734375, + "learning_rate": 0.00015105042972787323, + "loss": 0.9665, + "step": 21823 + }, + { + "epoch": 0.5603791237977856, + "grad_norm": 0.7109375, + "learning_rate": 0.00015104659102067864, + "loss": 0.8842, + "step": 21824 + }, + { + "epoch": 0.5604048009937075, + "grad_norm": 0.734375, + "learning_rate": 0.0001510427522117528, + "loss": 0.8375, + "step": 21825 + }, + { + "epoch": 0.5604304781896293, + "grad_norm": 0.75, + "learning_rate": 0.00015103891330110332, + "loss": 0.9522, + "step": 21826 + }, + { + "epoch": 0.5604561553855512, + "grad_norm": 0.73046875, + "learning_rate": 0.00015103507428873794, + "loss": 0.8241, + "step": 21827 + }, + { + "epoch": 0.560481832581473, + "grad_norm": 0.8203125, + "learning_rate": 0.00015103123517466429, + "loss": 0.8458, + "step": 21828 + }, + { + "epoch": 0.5605075097773947, + "grad_norm": 0.8125, + "learning_rate": 0.00015102739595888997, + "loss": 0.9421, + "step": 21829 + }, + { + "epoch": 0.5605331869733166, + "grad_norm": 0.79296875, + "learning_rate": 0.0001510235566414227, + "loss": 0.8773, + "step": 21830 + }, + { + "epoch": 0.5605588641692384, + "grad_norm": 0.6953125, + "learning_rate": 0.00015101971722227003, + "loss": 0.7947, + "step": 21831 + }, + { + "epoch": 0.5605845413651602, + "grad_norm": 0.8046875, + "learning_rate": 0.0001510158777014397, + "loss": 0.7624, + "step": 21832 + }, + { + "epoch": 0.5606102185610821, + "grad_norm": 0.74609375, + "learning_rate": 0.00015101203807893934, + "loss": 0.7541, + "step": 21833 + }, + { + "epoch": 0.5606358957570039, + "grad_norm": 0.76171875, + "learning_rate": 0.00015100819835477656, + "loss": 0.9402, + "step": 21834 + }, + { + "epoch": 0.5606615729529256, + "grad_norm": 0.73046875, + "learning_rate": 0.00015100435852895906, + "loss": 1.0435, + "step": 21835 + }, + { + "epoch": 0.5606872501488475, + "grad_norm": 0.66796875, + "learning_rate": 0.00015100051860149452, + "loss": 0.7581, + "step": 21836 + }, + { + "epoch": 0.5607129273447693, + "grad_norm": 0.75390625, + "learning_rate": 0.00015099667857239055, + "loss": 0.8065, + "step": 21837 + }, + { + "epoch": 0.5607386045406911, + "grad_norm": 0.83203125, + "learning_rate": 0.00015099283844165478, + "loss": 0.9145, + "step": 21838 + }, + { + "epoch": 0.560764281736613, + "grad_norm": 0.859375, + "learning_rate": 0.0001509889982092949, + "loss": 0.9244, + "step": 21839 + }, + { + "epoch": 0.5607899589325348, + "grad_norm": 0.82421875, + "learning_rate": 0.00015098515787531852, + "loss": 0.9774, + "step": 21840 + }, + { + "epoch": 0.5608156361284566, + "grad_norm": 0.7265625, + "learning_rate": 0.00015098131743973336, + "loss": 0.8852, + "step": 21841 + }, + { + "epoch": 0.5608413133243784, + "grad_norm": 0.75390625, + "learning_rate": 0.00015097747690254702, + "loss": 0.9698, + "step": 21842 + }, + { + "epoch": 0.5608669905203002, + "grad_norm": 0.83203125, + "learning_rate": 0.00015097363626376722, + "loss": 0.8916, + "step": 21843 + }, + { + "epoch": 0.560892667716222, + "grad_norm": 0.78125, + "learning_rate": 0.0001509697955234015, + "loss": 0.9046, + "step": 21844 + }, + { + "epoch": 0.5609183449121439, + "grad_norm": 0.87109375, + "learning_rate": 0.00015096595468145764, + "loss": 0.9468, + "step": 21845 + }, + { + "epoch": 0.5609440221080657, + "grad_norm": 0.74609375, + "learning_rate": 0.0001509621137379432, + "loss": 0.7948, + "step": 21846 + }, + { + "epoch": 0.5609696993039875, + "grad_norm": 0.80078125, + "learning_rate": 0.00015095827269286587, + "loss": 0.863, + "step": 21847 + }, + { + "epoch": 0.5609953764999094, + "grad_norm": 0.80859375, + "learning_rate": 0.0001509544315462333, + "loss": 0.8345, + "step": 21848 + }, + { + "epoch": 0.5610210536958311, + "grad_norm": 1.1328125, + "learning_rate": 0.0001509505902980532, + "loss": 0.9747, + "step": 21849 + }, + { + "epoch": 0.5610467308917529, + "grad_norm": 0.71484375, + "learning_rate": 0.00015094674894833315, + "loss": 0.6613, + "step": 21850 + }, + { + "epoch": 0.5610724080876748, + "grad_norm": 0.76171875, + "learning_rate": 0.0001509429074970808, + "loss": 0.7705, + "step": 21851 + }, + { + "epoch": 0.5610980852835966, + "grad_norm": 0.82421875, + "learning_rate": 0.00015093906594430386, + "loss": 0.8962, + "step": 21852 + }, + { + "epoch": 0.5611237624795185, + "grad_norm": 0.78125, + "learning_rate": 0.00015093522429001, + "loss": 0.8796, + "step": 21853 + }, + { + "epoch": 0.5611494396754403, + "grad_norm": 0.79296875, + "learning_rate": 0.00015093138253420677, + "loss": 0.9142, + "step": 21854 + }, + { + "epoch": 0.561175116871362, + "grad_norm": 0.78515625, + "learning_rate": 0.00015092754067690196, + "loss": 0.8302, + "step": 21855 + }, + { + "epoch": 0.5612007940672838, + "grad_norm": 0.73046875, + "learning_rate": 0.00015092369871810312, + "loss": 0.8392, + "step": 21856 + }, + { + "epoch": 0.5612264712632057, + "grad_norm": 0.74609375, + "learning_rate": 0.00015091985665781796, + "loss": 0.8122, + "step": 21857 + }, + { + "epoch": 0.5612521484591275, + "grad_norm": 0.76953125, + "learning_rate": 0.0001509160144960541, + "loss": 0.7722, + "step": 21858 + }, + { + "epoch": 0.5612778256550494, + "grad_norm": 0.82421875, + "learning_rate": 0.00015091217223281927, + "loss": 0.9109, + "step": 21859 + }, + { + "epoch": 0.5613035028509712, + "grad_norm": 0.77734375, + "learning_rate": 0.00015090832986812104, + "loss": 0.8924, + "step": 21860 + }, + { + "epoch": 0.561329180046893, + "grad_norm": 0.7265625, + "learning_rate": 0.0001509044874019671, + "loss": 0.75, + "step": 21861 + }, + { + "epoch": 0.5613548572428148, + "grad_norm": 0.7265625, + "learning_rate": 0.00015090064483436517, + "loss": 0.6874, + "step": 21862 + }, + { + "epoch": 0.5613805344387366, + "grad_norm": 0.75, + "learning_rate": 0.00015089680216532283, + "loss": 0.9774, + "step": 21863 + }, + { + "epoch": 0.5614062116346584, + "grad_norm": 0.79296875, + "learning_rate": 0.00015089295939484774, + "loss": 0.7641, + "step": 21864 + }, + { + "epoch": 0.5614318888305803, + "grad_norm": 0.734375, + "learning_rate": 0.00015088911652294762, + "loss": 0.855, + "step": 21865 + }, + { + "epoch": 0.5614575660265021, + "grad_norm": 0.7421875, + "learning_rate": 0.00015088527354963, + "loss": 0.8733, + "step": 21866 + }, + { + "epoch": 0.5614832432224239, + "grad_norm": 0.7578125, + "learning_rate": 0.00015088143047490272, + "loss": 0.823, + "step": 21867 + }, + { + "epoch": 0.5615089204183458, + "grad_norm": 0.8125, + "learning_rate": 0.0001508775872987733, + "loss": 0.7405, + "step": 21868 + }, + { + "epoch": 0.5615345976142675, + "grad_norm": 0.8359375, + "learning_rate": 0.00015087374402124944, + "loss": 0.8022, + "step": 21869 + }, + { + "epoch": 0.5615602748101893, + "grad_norm": 0.8125, + "learning_rate": 0.00015086990064233885, + "loss": 0.9102, + "step": 21870 + }, + { + "epoch": 0.5615859520061112, + "grad_norm": 0.83984375, + "learning_rate": 0.0001508660571620491, + "loss": 0.9125, + "step": 21871 + }, + { + "epoch": 0.561611629202033, + "grad_norm": 0.72265625, + "learning_rate": 0.0001508622135803879, + "loss": 0.9336, + "step": 21872 + }, + { + "epoch": 0.5616373063979548, + "grad_norm": 0.734375, + "learning_rate": 0.0001508583698973629, + "loss": 0.678, + "step": 21873 + }, + { + "epoch": 0.5616629835938767, + "grad_norm": 0.80078125, + "learning_rate": 0.00015085452611298176, + "loss": 0.9509, + "step": 21874 + }, + { + "epoch": 0.5616886607897984, + "grad_norm": 0.75, + "learning_rate": 0.00015085068222725216, + "loss": 0.7706, + "step": 21875 + }, + { + "epoch": 0.5617143379857202, + "grad_norm": 0.8828125, + "learning_rate": 0.00015084683824018175, + "loss": 0.8882, + "step": 21876 + }, + { + "epoch": 0.5617400151816421, + "grad_norm": 0.8046875, + "learning_rate": 0.00015084299415177817, + "loss": 1.0129, + "step": 21877 + }, + { + "epoch": 0.5617656923775639, + "grad_norm": 0.73046875, + "learning_rate": 0.0001508391499620491, + "loss": 0.8501, + "step": 21878 + }, + { + "epoch": 0.5617913695734857, + "grad_norm": 0.796875, + "learning_rate": 0.00015083530567100217, + "loss": 0.8439, + "step": 21879 + }, + { + "epoch": 0.5618170467694076, + "grad_norm": 0.6875, + "learning_rate": 0.0001508314612786451, + "loss": 0.8231, + "step": 21880 + }, + { + "epoch": 0.5618427239653294, + "grad_norm": 0.7890625, + "learning_rate": 0.0001508276167849855, + "loss": 0.9395, + "step": 21881 + }, + { + "epoch": 0.5618684011612511, + "grad_norm": 0.77734375, + "learning_rate": 0.00015082377219003106, + "loss": 0.912, + "step": 21882 + }, + { + "epoch": 0.561894078357173, + "grad_norm": 0.765625, + "learning_rate": 0.00015081992749378946, + "loss": 0.8373, + "step": 21883 + }, + { + "epoch": 0.5619197555530948, + "grad_norm": 0.8125, + "learning_rate": 0.0001508160826962683, + "loss": 0.9926, + "step": 21884 + }, + { + "epoch": 0.5619454327490166, + "grad_norm": 0.7578125, + "learning_rate": 0.00015081223779747527, + "loss": 0.7756, + "step": 21885 + }, + { + "epoch": 0.5619711099449385, + "grad_norm": 0.78515625, + "learning_rate": 0.00015080839279741807, + "loss": 0.9127, + "step": 21886 + }, + { + "epoch": 0.5619967871408603, + "grad_norm": 0.7578125, + "learning_rate": 0.00015080454769610433, + "loss": 0.8336, + "step": 21887 + }, + { + "epoch": 0.5620224643367822, + "grad_norm": 0.76953125, + "learning_rate": 0.00015080070249354167, + "loss": 0.774, + "step": 21888 + }, + { + "epoch": 0.5620481415327039, + "grad_norm": 0.72265625, + "learning_rate": 0.00015079685718973784, + "loss": 0.8367, + "step": 21889 + }, + { + "epoch": 0.5620738187286257, + "grad_norm": 0.78515625, + "learning_rate": 0.00015079301178470047, + "loss": 0.8201, + "step": 21890 + }, + { + "epoch": 0.5620994959245476, + "grad_norm": 0.79296875, + "learning_rate": 0.00015078916627843717, + "loss": 0.8874, + "step": 21891 + }, + { + "epoch": 0.5621251731204694, + "grad_norm": 0.83203125, + "learning_rate": 0.0001507853206709557, + "loss": 0.8512, + "step": 21892 + }, + { + "epoch": 0.5621508503163912, + "grad_norm": 0.765625, + "learning_rate": 0.00015078147496226362, + "loss": 0.7297, + "step": 21893 + }, + { + "epoch": 0.5621765275123131, + "grad_norm": 0.6953125, + "learning_rate": 0.0001507776291523687, + "loss": 0.8308, + "step": 21894 + }, + { + "epoch": 0.5622022047082348, + "grad_norm": 0.796875, + "learning_rate": 0.00015077378324127854, + "loss": 0.9421, + "step": 21895 + }, + { + "epoch": 0.5622278819041566, + "grad_norm": 0.79296875, + "learning_rate": 0.00015076993722900082, + "loss": 0.7649, + "step": 21896 + }, + { + "epoch": 0.5622535591000785, + "grad_norm": 0.71484375, + "learning_rate": 0.00015076609111554318, + "loss": 0.7589, + "step": 21897 + }, + { + "epoch": 0.5622792362960003, + "grad_norm": 0.8203125, + "learning_rate": 0.0001507622449009133, + "loss": 0.7897, + "step": 21898 + }, + { + "epoch": 0.5623049134919221, + "grad_norm": 0.76171875, + "learning_rate": 0.00015075839858511887, + "loss": 0.7958, + "step": 21899 + }, + { + "epoch": 0.562330590687844, + "grad_norm": 0.7734375, + "learning_rate": 0.00015075455216816754, + "loss": 0.8992, + "step": 21900 + }, + { + "epoch": 0.5623562678837658, + "grad_norm": 0.81640625, + "learning_rate": 0.00015075070565006695, + "loss": 0.8749, + "step": 21901 + }, + { + "epoch": 0.5623819450796875, + "grad_norm": 0.7734375, + "learning_rate": 0.00015074685903082485, + "loss": 0.8612, + "step": 21902 + }, + { + "epoch": 0.5624076222756094, + "grad_norm": 0.79296875, + "learning_rate": 0.00015074301231044875, + "loss": 0.9284, + "step": 21903 + }, + { + "epoch": 0.5624332994715312, + "grad_norm": 0.8359375, + "learning_rate": 0.00015073916548894646, + "loss": 0.8141, + "step": 21904 + }, + { + "epoch": 0.562458976667453, + "grad_norm": 0.87109375, + "learning_rate": 0.00015073531856632562, + "loss": 0.9552, + "step": 21905 + }, + { + "epoch": 0.5624846538633749, + "grad_norm": 0.76171875, + "learning_rate": 0.00015073147154259382, + "loss": 0.9064, + "step": 21906 + }, + { + "epoch": 0.5625103310592967, + "grad_norm": 0.73828125, + "learning_rate": 0.00015072762441775881, + "loss": 0.9303, + "step": 21907 + }, + { + "epoch": 0.5625360082552184, + "grad_norm": 0.8046875, + "learning_rate": 0.00015072377719182821, + "loss": 0.8294, + "step": 21908 + }, + { + "epoch": 0.5625616854511403, + "grad_norm": 0.8046875, + "learning_rate": 0.00015071992986480975, + "loss": 0.8547, + "step": 21909 + }, + { + "epoch": 0.5625873626470621, + "grad_norm": 0.74609375, + "learning_rate": 0.000150716082436711, + "loss": 0.8815, + "step": 21910 + }, + { + "epoch": 0.5626130398429839, + "grad_norm": 0.76171875, + "learning_rate": 0.00015071223490753969, + "loss": 0.9115, + "step": 21911 + }, + { + "epoch": 0.5626387170389058, + "grad_norm": 0.76171875, + "learning_rate": 0.0001507083872773035, + "loss": 0.7672, + "step": 21912 + }, + { + "epoch": 0.5626643942348276, + "grad_norm": 0.8203125, + "learning_rate": 0.00015070453954601004, + "loss": 0.8834, + "step": 21913 + }, + { + "epoch": 0.5626900714307494, + "grad_norm": 0.8046875, + "learning_rate": 0.00015070069171366703, + "loss": 0.7757, + "step": 21914 + }, + { + "epoch": 0.5627157486266712, + "grad_norm": 0.81640625, + "learning_rate": 0.00015069684378028216, + "loss": 1.0008, + "step": 21915 + }, + { + "epoch": 0.562741425822593, + "grad_norm": 0.87109375, + "learning_rate": 0.000150692995745863, + "loss": 0.8913, + "step": 21916 + }, + { + "epoch": 0.5627671030185148, + "grad_norm": 0.7890625, + "learning_rate": 0.0001506891476104173, + "loss": 0.9804, + "step": 21917 + }, + { + "epoch": 0.5627927802144367, + "grad_norm": 0.83203125, + "learning_rate": 0.0001506852993739527, + "loss": 0.9764, + "step": 21918 + }, + { + "epoch": 0.5628184574103585, + "grad_norm": 1.0390625, + "learning_rate": 0.0001506814510364769, + "loss": 0.9725, + "step": 21919 + }, + { + "epoch": 0.5628441346062804, + "grad_norm": 0.6875, + "learning_rate": 0.0001506776025979975, + "loss": 1.0011, + "step": 21920 + }, + { + "epoch": 0.5628698118022022, + "grad_norm": 0.83203125, + "learning_rate": 0.00015067375405852226, + "loss": 0.8178, + "step": 21921 + }, + { + "epoch": 0.5628954889981239, + "grad_norm": 0.70703125, + "learning_rate": 0.00015066990541805878, + "loss": 0.8531, + "step": 21922 + }, + { + "epoch": 0.5629211661940458, + "grad_norm": 0.74609375, + "learning_rate": 0.0001506660566766148, + "loss": 0.8622, + "step": 21923 + }, + { + "epoch": 0.5629468433899676, + "grad_norm": 0.765625, + "learning_rate": 0.0001506622078341979, + "loss": 0.792, + "step": 21924 + }, + { + "epoch": 0.5629725205858894, + "grad_norm": 0.72265625, + "learning_rate": 0.00015065835889081582, + "loss": 0.925, + "step": 21925 + }, + { + "epoch": 0.5629981977818113, + "grad_norm": 0.7890625, + "learning_rate": 0.00015065450984647617, + "loss": 0.9638, + "step": 21926 + }, + { + "epoch": 0.5630238749777331, + "grad_norm": 0.8828125, + "learning_rate": 0.0001506506607011867, + "loss": 1.0525, + "step": 21927 + }, + { + "epoch": 0.5630495521736548, + "grad_norm": 0.91015625, + "learning_rate": 0.00015064681145495504, + "loss": 0.8379, + "step": 21928 + }, + { + "epoch": 0.5630752293695767, + "grad_norm": 0.80078125, + "learning_rate": 0.00015064296210778882, + "loss": 0.8639, + "step": 21929 + }, + { + "epoch": 0.5631009065654985, + "grad_norm": 0.7265625, + "learning_rate": 0.00015063911265969577, + "loss": 0.8793, + "step": 21930 + }, + { + "epoch": 0.5631265837614203, + "grad_norm": 0.78125, + "learning_rate": 0.00015063526311068353, + "loss": 0.9131, + "step": 21931 + }, + { + "epoch": 0.5631522609573422, + "grad_norm": 0.78125, + "learning_rate": 0.00015063141346075983, + "loss": 0.8176, + "step": 21932 + }, + { + "epoch": 0.563177938153264, + "grad_norm": 0.77734375, + "learning_rate": 0.00015062756370993224, + "loss": 0.7453, + "step": 21933 + }, + { + "epoch": 0.5632036153491858, + "grad_norm": 0.8359375, + "learning_rate": 0.00015062371385820853, + "loss": 0.8556, + "step": 21934 + }, + { + "epoch": 0.5632292925451076, + "grad_norm": 0.75390625, + "learning_rate": 0.00015061986390559631, + "loss": 0.9177, + "step": 21935 + }, + { + "epoch": 0.5632549697410294, + "grad_norm": 0.77734375, + "learning_rate": 0.00015061601385210328, + "loss": 0.8284, + "step": 21936 + }, + { + "epoch": 0.5632806469369512, + "grad_norm": 0.796875, + "learning_rate": 0.00015061216369773712, + "loss": 0.8142, + "step": 21937 + }, + { + "epoch": 0.5633063241328731, + "grad_norm": 0.8203125, + "learning_rate": 0.00015060831344250545, + "loss": 0.9712, + "step": 21938 + }, + { + "epoch": 0.5633320013287949, + "grad_norm": 0.74609375, + "learning_rate": 0.000150604463086416, + "loss": 0.9656, + "step": 21939 + }, + { + "epoch": 0.5633576785247167, + "grad_norm": 0.7578125, + "learning_rate": 0.00015060061262947647, + "loss": 1.0044, + "step": 21940 + }, + { + "epoch": 0.5633833557206386, + "grad_norm": 0.765625, + "learning_rate": 0.00015059676207169445, + "loss": 0.7935, + "step": 21941 + }, + { + "epoch": 0.5634090329165603, + "grad_norm": 0.7734375, + "learning_rate": 0.00015059291141307767, + "loss": 0.8878, + "step": 21942 + }, + { + "epoch": 0.5634347101124821, + "grad_norm": 0.890625, + "learning_rate": 0.00015058906065363377, + "loss": 0.9612, + "step": 21943 + }, + { + "epoch": 0.563460387308404, + "grad_norm": 0.7890625, + "learning_rate": 0.00015058520979337043, + "loss": 0.7308, + "step": 21944 + }, + { + "epoch": 0.5634860645043258, + "grad_norm": 0.734375, + "learning_rate": 0.0001505813588322954, + "loss": 0.9517, + "step": 21945 + }, + { + "epoch": 0.5635117417002476, + "grad_norm": 0.8125, + "learning_rate": 0.00015057750777041623, + "loss": 0.9072, + "step": 21946 + }, + { + "epoch": 0.5635374188961695, + "grad_norm": 0.76171875, + "learning_rate": 0.00015057365660774066, + "loss": 0.8819, + "step": 21947 + }, + { + "epoch": 0.5635630960920912, + "grad_norm": 0.7578125, + "learning_rate": 0.00015056980534427635, + "loss": 0.7982, + "step": 21948 + }, + { + "epoch": 0.563588773288013, + "grad_norm": 0.890625, + "learning_rate": 0.00015056595398003103, + "loss": 0.8537, + "step": 21949 + }, + { + "epoch": 0.5636144504839349, + "grad_norm": 0.83203125, + "learning_rate": 0.0001505621025150123, + "loss": 0.8339, + "step": 21950 + }, + { + "epoch": 0.5636401276798567, + "grad_norm": 0.7890625, + "learning_rate": 0.00015055825094922786, + "loss": 0.8901, + "step": 21951 + }, + { + "epoch": 0.5636658048757786, + "grad_norm": 0.78125, + "learning_rate": 0.00015055439928268543, + "loss": 0.8378, + "step": 21952 + }, + { + "epoch": 0.5636914820717004, + "grad_norm": 0.82421875, + "learning_rate": 0.00015055054751539262, + "loss": 0.9075, + "step": 21953 + }, + { + "epoch": 0.5637171592676222, + "grad_norm": 0.83984375, + "learning_rate": 0.00015054669564735712, + "loss": 0.9139, + "step": 21954 + }, + { + "epoch": 0.563742836463544, + "grad_norm": 0.8046875, + "learning_rate": 0.0001505428436785867, + "loss": 0.9615, + "step": 21955 + }, + { + "epoch": 0.5637685136594658, + "grad_norm": 0.75390625, + "learning_rate": 0.00015053899160908887, + "loss": 0.8652, + "step": 21956 + }, + { + "epoch": 0.5637941908553876, + "grad_norm": 0.76953125, + "learning_rate": 0.00015053513943887143, + "loss": 0.8813, + "step": 21957 + }, + { + "epoch": 0.5638198680513095, + "grad_norm": 0.74609375, + "learning_rate": 0.000150531287167942, + "loss": 0.9031, + "step": 21958 + }, + { + "epoch": 0.5638455452472313, + "grad_norm": 0.796875, + "learning_rate": 0.0001505274347963083, + "loss": 0.852, + "step": 21959 + }, + { + "epoch": 0.5638712224431531, + "grad_norm": 0.72265625, + "learning_rate": 0.00015052358232397797, + "loss": 0.7614, + "step": 21960 + }, + { + "epoch": 0.563896899639075, + "grad_norm": 0.8046875, + "learning_rate": 0.00015051972975095877, + "loss": 0.8489, + "step": 21961 + }, + { + "epoch": 0.5639225768349967, + "grad_norm": 0.75390625, + "learning_rate": 0.00015051587707725822, + "loss": 0.8979, + "step": 21962 + }, + { + "epoch": 0.5639482540309185, + "grad_norm": 0.71875, + "learning_rate": 0.00015051202430288414, + "loss": 0.8602, + "step": 21963 + }, + { + "epoch": 0.5639739312268404, + "grad_norm": 0.7265625, + "learning_rate": 0.00015050817142784416, + "loss": 0.8608, + "step": 21964 + }, + { + "epoch": 0.5639996084227622, + "grad_norm": 0.78515625, + "learning_rate": 0.00015050431845214593, + "loss": 0.9742, + "step": 21965 + }, + { + "epoch": 0.564025285618684, + "grad_norm": 0.7265625, + "learning_rate": 0.0001505004653757972, + "loss": 0.8688, + "step": 21966 + }, + { + "epoch": 0.5640509628146059, + "grad_norm": 0.7109375, + "learning_rate": 0.00015049661219880553, + "loss": 0.7505, + "step": 21967 + }, + { + "epoch": 0.5640766400105276, + "grad_norm": 0.71875, + "learning_rate": 0.00015049275892117873, + "loss": 0.9256, + "step": 21968 + }, + { + "epoch": 0.5641023172064494, + "grad_norm": 0.84765625, + "learning_rate": 0.00015048890554292442, + "loss": 0.8121, + "step": 21969 + }, + { + "epoch": 0.5641279944023713, + "grad_norm": 0.76171875, + "learning_rate": 0.00015048505206405026, + "loss": 0.9207, + "step": 21970 + }, + { + "epoch": 0.5641536715982931, + "grad_norm": 0.69140625, + "learning_rate": 0.000150481198484564, + "loss": 0.9012, + "step": 21971 + }, + { + "epoch": 0.5641793487942149, + "grad_norm": 0.74609375, + "learning_rate": 0.0001504773448044732, + "loss": 0.913, + "step": 21972 + }, + { + "epoch": 0.5642050259901368, + "grad_norm": 0.81640625, + "learning_rate": 0.00015047349102378563, + "loss": 0.9972, + "step": 21973 + }, + { + "epoch": 0.5642307031860586, + "grad_norm": 0.76171875, + "learning_rate": 0.000150469637142509, + "loss": 0.8948, + "step": 21974 + }, + { + "epoch": 0.5642563803819803, + "grad_norm": 0.8203125, + "learning_rate": 0.0001504657831606509, + "loss": 0.8872, + "step": 21975 + }, + { + "epoch": 0.5642820575779022, + "grad_norm": 0.78515625, + "learning_rate": 0.00015046192907821907, + "loss": 0.9942, + "step": 21976 + }, + { + "epoch": 0.564307734773824, + "grad_norm": 0.84375, + "learning_rate": 0.00015045807489522117, + "loss": 0.9742, + "step": 21977 + }, + { + "epoch": 0.5643334119697458, + "grad_norm": 0.9453125, + "learning_rate": 0.00015045422061166485, + "loss": 0.9303, + "step": 21978 + }, + { + "epoch": 0.5643590891656677, + "grad_norm": 0.6953125, + "learning_rate": 0.00015045036622755788, + "loss": 0.9219, + "step": 21979 + }, + { + "epoch": 0.5643847663615895, + "grad_norm": 0.8984375, + "learning_rate": 0.00015044651174290786, + "loss": 1.0453, + "step": 21980 + }, + { + "epoch": 0.5644104435575114, + "grad_norm": 0.69921875, + "learning_rate": 0.00015044265715772253, + "loss": 0.8746, + "step": 21981 + }, + { + "epoch": 0.5644361207534331, + "grad_norm": 0.81640625, + "learning_rate": 0.0001504388024720095, + "loss": 0.9433, + "step": 21982 + }, + { + "epoch": 0.5644617979493549, + "grad_norm": 0.7890625, + "learning_rate": 0.0001504349476857765, + "loss": 0.8972, + "step": 21983 + }, + { + "epoch": 0.5644874751452768, + "grad_norm": 0.76171875, + "learning_rate": 0.0001504310927990312, + "loss": 0.7377, + "step": 21984 + }, + { + "epoch": 0.5645131523411986, + "grad_norm": 0.81640625, + "learning_rate": 0.00015042723781178132, + "loss": 0.8781, + "step": 21985 + }, + { + "epoch": 0.5645388295371204, + "grad_norm": 0.8203125, + "learning_rate": 0.00015042338272403446, + "loss": 0.7386, + "step": 21986 + }, + { + "epoch": 0.5645645067330423, + "grad_norm": 0.78515625, + "learning_rate": 0.0001504195275357984, + "loss": 0.8551, + "step": 21987 + }, + { + "epoch": 0.564590183928964, + "grad_norm": 0.76953125, + "learning_rate": 0.00015041567224708074, + "loss": 0.8509, + "step": 21988 + }, + { + "epoch": 0.5646158611248858, + "grad_norm": 0.80078125, + "learning_rate": 0.00015041181685788923, + "loss": 0.859, + "step": 21989 + }, + { + "epoch": 0.5646415383208077, + "grad_norm": 0.75390625, + "learning_rate": 0.00015040796136823152, + "loss": 0.942, + "step": 21990 + }, + { + "epoch": 0.5646672155167295, + "grad_norm": 0.765625, + "learning_rate": 0.00015040410577811528, + "loss": 0.7477, + "step": 21991 + }, + { + "epoch": 0.5646928927126513, + "grad_norm": 0.8984375, + "learning_rate": 0.00015040025008754821, + "loss": 0.9837, + "step": 21992 + }, + { + "epoch": 0.5647185699085732, + "grad_norm": 0.78125, + "learning_rate": 0.00015039639429653803, + "loss": 0.8733, + "step": 21993 + }, + { + "epoch": 0.564744247104495, + "grad_norm": 0.7421875, + "learning_rate": 0.00015039253840509233, + "loss": 0.7548, + "step": 21994 + }, + { + "epoch": 0.5647699243004167, + "grad_norm": 0.8046875, + "learning_rate": 0.0001503886824132189, + "loss": 0.8308, + "step": 21995 + }, + { + "epoch": 0.5647956014963386, + "grad_norm": 0.7265625, + "learning_rate": 0.00015038482632092536, + "loss": 0.7786, + "step": 21996 + }, + { + "epoch": 0.5648212786922604, + "grad_norm": 0.78125, + "learning_rate": 0.0001503809701282194, + "loss": 0.7957, + "step": 21997 + }, + { + "epoch": 0.5648469558881822, + "grad_norm": 0.8203125, + "learning_rate": 0.00015037711383510875, + "loss": 0.8898, + "step": 21998 + }, + { + "epoch": 0.5648726330841041, + "grad_norm": 0.78125, + "learning_rate": 0.00015037325744160103, + "loss": 0.9073, + "step": 21999 + }, + { + "epoch": 0.5648983102800259, + "grad_norm": 0.796875, + "learning_rate": 0.00015036940094770396, + "loss": 0.8524, + "step": 22000 + }, + { + "epoch": 0.5648983102800259, + "eval_loss": 0.8665466904640198, + "eval_model_preparation_time": 0.0065, + "eval_runtime": 417.0215, + "eval_samples_per_second": 23.98, + "eval_steps_per_second": 0.751, + "step": 22000 + }, + { + "epoch": 0.5649239874759477, + "grad_norm": 0.83203125, + "learning_rate": 0.00015036554435342528, + "loss": 0.9165, + "step": 22001 + }, + { + "epoch": 0.5649496646718695, + "grad_norm": 0.8359375, + "learning_rate": 0.00015036168765877251, + "loss": 0.9714, + "step": 22002 + }, + { + "epoch": 0.5649753418677913, + "grad_norm": 0.7890625, + "learning_rate": 0.00015035783086375353, + "loss": 0.84, + "step": 22003 + }, + { + "epoch": 0.5650010190637131, + "grad_norm": 0.76171875, + "learning_rate": 0.0001503539739683759, + "loss": 0.9205, + "step": 22004 + }, + { + "epoch": 0.565026696259635, + "grad_norm": 0.71875, + "learning_rate": 0.00015035011697264738, + "loss": 0.8385, + "step": 22005 + }, + { + "epoch": 0.5650523734555568, + "grad_norm": 0.81640625, + "learning_rate": 0.00015034625987657565, + "loss": 0.7699, + "step": 22006 + }, + { + "epoch": 0.5650780506514786, + "grad_norm": 0.74609375, + "learning_rate": 0.0001503424026801683, + "loss": 0.9317, + "step": 22007 + }, + { + "epoch": 0.5651037278474004, + "grad_norm": 0.7890625, + "learning_rate": 0.00015033854538343315, + "loss": 0.8595, + "step": 22008 + }, + { + "epoch": 0.5651294050433222, + "grad_norm": 0.87109375, + "learning_rate": 0.0001503346879863778, + "loss": 0.9934, + "step": 22009 + }, + { + "epoch": 0.565155082239244, + "grad_norm": 0.828125, + "learning_rate": 0.00015033083048900995, + "loss": 1.0713, + "step": 22010 + }, + { + "epoch": 0.5651807594351659, + "grad_norm": 0.81640625, + "learning_rate": 0.0001503269728913373, + "loss": 0.9404, + "step": 22011 + }, + { + "epoch": 0.5652064366310877, + "grad_norm": 0.8046875, + "learning_rate": 0.0001503231151933676, + "loss": 0.884, + "step": 22012 + }, + { + "epoch": 0.5652321138270096, + "grad_norm": 0.76171875, + "learning_rate": 0.0001503192573951084, + "loss": 0.9168, + "step": 22013 + }, + { + "epoch": 0.5652577910229314, + "grad_norm": 0.69921875, + "learning_rate": 0.00015031539949656751, + "loss": 0.7071, + "step": 22014 + }, + { + "epoch": 0.5652834682188531, + "grad_norm": 0.7578125, + "learning_rate": 0.00015031154149775254, + "loss": 0.8884, + "step": 22015 + }, + { + "epoch": 0.565309145414775, + "grad_norm": 0.7578125, + "learning_rate": 0.00015030768339867128, + "loss": 0.8119, + "step": 22016 + }, + { + "epoch": 0.5653348226106968, + "grad_norm": 0.74609375, + "learning_rate": 0.00015030382519933127, + "loss": 0.8648, + "step": 22017 + }, + { + "epoch": 0.5653604998066186, + "grad_norm": 0.75, + "learning_rate": 0.00015029996689974032, + "loss": 0.8349, + "step": 22018 + }, + { + "epoch": 0.5653861770025405, + "grad_norm": 0.77734375, + "learning_rate": 0.00015029610849990607, + "loss": 1.0208, + "step": 22019 + }, + { + "epoch": 0.5654118541984623, + "grad_norm": 0.79296875, + "learning_rate": 0.0001502922499998362, + "loss": 0.8954, + "step": 22020 + }, + { + "epoch": 0.5654375313943841, + "grad_norm": 0.78125, + "learning_rate": 0.0001502883913995385, + "loss": 0.8785, + "step": 22021 + }, + { + "epoch": 0.5654632085903059, + "grad_norm": 0.703125, + "learning_rate": 0.0001502845326990205, + "loss": 0.902, + "step": 22022 + }, + { + "epoch": 0.5654888857862277, + "grad_norm": 0.7109375, + "learning_rate": 0.00015028067389828999, + "loss": 0.7536, + "step": 22023 + }, + { + "epoch": 0.5655145629821495, + "grad_norm": 0.83203125, + "learning_rate": 0.00015027681499735462, + "loss": 1.0278, + "step": 22024 + }, + { + "epoch": 0.5655402401780714, + "grad_norm": 0.765625, + "learning_rate": 0.00015027295599622214, + "loss": 0.8666, + "step": 22025 + }, + { + "epoch": 0.5655659173739932, + "grad_norm": 0.77734375, + "learning_rate": 0.00015026909689490016, + "loss": 0.8115, + "step": 22026 + }, + { + "epoch": 0.565591594569915, + "grad_norm": 0.828125, + "learning_rate": 0.00015026523769339644, + "loss": 0.9071, + "step": 22027 + }, + { + "epoch": 0.5656172717658368, + "grad_norm": 0.68359375, + "learning_rate": 0.0001502613783917186, + "loss": 0.757, + "step": 22028 + }, + { + "epoch": 0.5656429489617586, + "grad_norm": 0.7265625, + "learning_rate": 0.00015025751898987443, + "loss": 0.819, + "step": 22029 + }, + { + "epoch": 0.5656686261576804, + "grad_norm": 0.80078125, + "learning_rate": 0.00015025365948787153, + "loss": 0.8577, + "step": 22030 + }, + { + "epoch": 0.5656943033536023, + "grad_norm": 0.70703125, + "learning_rate": 0.00015024979988571764, + "loss": 0.8231, + "step": 22031 + }, + { + "epoch": 0.5657199805495241, + "grad_norm": 0.78515625, + "learning_rate": 0.00015024594018342042, + "loss": 0.927, + "step": 22032 + }, + { + "epoch": 0.5657456577454459, + "grad_norm": 3.734375, + "learning_rate": 0.0001502420803809876, + "loss": 0.8263, + "step": 22033 + }, + { + "epoch": 0.5657713349413678, + "grad_norm": 0.83984375, + "learning_rate": 0.00015023822047842685, + "loss": 0.8565, + "step": 22034 + }, + { + "epoch": 0.5657970121372895, + "grad_norm": 0.79296875, + "learning_rate": 0.00015023436047574586, + "loss": 0.9525, + "step": 22035 + }, + { + "epoch": 0.5658226893332113, + "grad_norm": 0.66796875, + "learning_rate": 0.00015023050037295233, + "loss": 0.7642, + "step": 22036 + }, + { + "epoch": 0.5658483665291332, + "grad_norm": 0.83203125, + "learning_rate": 0.00015022664017005392, + "loss": 0.9011, + "step": 22037 + }, + { + "epoch": 0.565874043725055, + "grad_norm": 0.79296875, + "learning_rate": 0.0001502227798670584, + "loss": 1.005, + "step": 22038 + }, + { + "epoch": 0.5658997209209768, + "grad_norm": 0.73046875, + "learning_rate": 0.00015021891946397336, + "loss": 0.8225, + "step": 22039 + }, + { + "epoch": 0.5659253981168987, + "grad_norm": 0.76953125, + "learning_rate": 0.00015021505896080663, + "loss": 0.8085, + "step": 22040 + }, + { + "epoch": 0.5659510753128205, + "grad_norm": 0.8125, + "learning_rate": 0.00015021119835756575, + "loss": 0.8999, + "step": 22041 + }, + { + "epoch": 0.5659767525087422, + "grad_norm": 0.7578125, + "learning_rate": 0.00015020733765425852, + "loss": 0.8771, + "step": 22042 + }, + { + "epoch": 0.5660024297046641, + "grad_norm": 0.734375, + "learning_rate": 0.0001502034768508926, + "loss": 0.9041, + "step": 22043 + }, + { + "epoch": 0.5660281069005859, + "grad_norm": 0.73046875, + "learning_rate": 0.00015019961594747567, + "loss": 0.9391, + "step": 22044 + }, + { + "epoch": 0.5660537840965078, + "grad_norm": 0.7890625, + "learning_rate": 0.00015019575494401545, + "loss": 0.8596, + "step": 22045 + }, + { + "epoch": 0.5660794612924296, + "grad_norm": 0.77734375, + "learning_rate": 0.00015019189384051962, + "loss": 0.8942, + "step": 22046 + }, + { + "epoch": 0.5661051384883514, + "grad_norm": 0.890625, + "learning_rate": 0.0001501880326369959, + "loss": 0.8702, + "step": 22047 + }, + { + "epoch": 0.5661308156842731, + "grad_norm": 0.7734375, + "learning_rate": 0.00015018417133345197, + "loss": 0.8837, + "step": 22048 + }, + { + "epoch": 0.566156492880195, + "grad_norm": 0.7265625, + "learning_rate": 0.0001501803099298955, + "loss": 0.7737, + "step": 22049 + }, + { + "epoch": 0.5661821700761168, + "grad_norm": 0.6953125, + "learning_rate": 0.00015017644842633417, + "loss": 0.7765, + "step": 22050 + }, + { + "epoch": 0.5662078472720387, + "grad_norm": 0.703125, + "learning_rate": 0.00015017258682277577, + "loss": 0.9131, + "step": 22051 + }, + { + "epoch": 0.5662335244679605, + "grad_norm": 0.81640625, + "learning_rate": 0.0001501687251192279, + "loss": 0.8863, + "step": 22052 + }, + { + "epoch": 0.5662592016638823, + "grad_norm": 0.78125, + "learning_rate": 0.00015016486331569833, + "loss": 0.9173, + "step": 22053 + }, + { + "epoch": 0.5662848788598042, + "grad_norm": 0.80078125, + "learning_rate": 0.00015016100141219466, + "loss": 0.9425, + "step": 22054 + }, + { + "epoch": 0.5663105560557259, + "grad_norm": 0.81640625, + "learning_rate": 0.00015015713940872468, + "loss": 1.032, + "step": 22055 + }, + { + "epoch": 0.5663362332516477, + "grad_norm": 0.79296875, + "learning_rate": 0.00015015327730529606, + "loss": 0.7678, + "step": 22056 + }, + { + "epoch": 0.5663619104475696, + "grad_norm": 0.75390625, + "learning_rate": 0.0001501494151019165, + "loss": 0.8134, + "step": 22057 + }, + { + "epoch": 0.5663875876434914, + "grad_norm": 0.8046875, + "learning_rate": 0.00015014555279859362, + "loss": 0.7782, + "step": 22058 + }, + { + "epoch": 0.5664132648394132, + "grad_norm": 0.76171875, + "learning_rate": 0.00015014169039533526, + "loss": 0.825, + "step": 22059 + }, + { + "epoch": 0.5664389420353351, + "grad_norm": 0.80859375, + "learning_rate": 0.000150137827892149, + "loss": 0.8332, + "step": 22060 + }, + { + "epoch": 0.5664646192312569, + "grad_norm": 0.796875, + "learning_rate": 0.0001501339652890426, + "loss": 1.007, + "step": 22061 + }, + { + "epoch": 0.5664902964271786, + "grad_norm": 0.7734375, + "learning_rate": 0.00015013010258602372, + "loss": 0.9497, + "step": 22062 + }, + { + "epoch": 0.5665159736231005, + "grad_norm": 0.78515625, + "learning_rate": 0.00015012623978310005, + "loss": 0.9547, + "step": 22063 + }, + { + "epoch": 0.5665416508190223, + "grad_norm": 0.74609375, + "learning_rate": 0.00015012237688027936, + "loss": 0.9096, + "step": 22064 + }, + { + "epoch": 0.5665673280149441, + "grad_norm": 0.7734375, + "learning_rate": 0.00015011851387756927, + "loss": 0.8293, + "step": 22065 + }, + { + "epoch": 0.566593005210866, + "grad_norm": 0.76953125, + "learning_rate": 0.0001501146507749775, + "loss": 0.9436, + "step": 22066 + }, + { + "epoch": 0.5666186824067878, + "grad_norm": 0.84375, + "learning_rate": 0.0001501107875725118, + "loss": 0.978, + "step": 22067 + }, + { + "epoch": 0.5666443596027095, + "grad_norm": 0.72265625, + "learning_rate": 0.0001501069242701798, + "loss": 0.8314, + "step": 22068 + }, + { + "epoch": 0.5666700367986314, + "grad_norm": 0.8984375, + "learning_rate": 0.00015010306086798923, + "loss": 0.8188, + "step": 22069 + }, + { + "epoch": 0.5666957139945532, + "grad_norm": 0.7578125, + "learning_rate": 0.00015009919736594778, + "loss": 0.6498, + "step": 22070 + }, + { + "epoch": 0.566721391190475, + "grad_norm": 0.734375, + "learning_rate": 0.00015009533376406315, + "loss": 0.793, + "step": 22071 + }, + { + "epoch": 0.5667470683863969, + "grad_norm": 0.8203125, + "learning_rate": 0.00015009147006234304, + "loss": 0.8404, + "step": 22072 + }, + { + "epoch": 0.5667727455823187, + "grad_norm": 0.77734375, + "learning_rate": 0.00015008760626079518, + "loss": 0.9673, + "step": 22073 + }, + { + "epoch": 0.5667984227782406, + "grad_norm": 0.84375, + "learning_rate": 0.00015008374235942723, + "loss": 0.8388, + "step": 22074 + }, + { + "epoch": 0.5668240999741623, + "grad_norm": 0.7578125, + "learning_rate": 0.00015007987835824693, + "loss": 0.9383, + "step": 22075 + }, + { + "epoch": 0.5668497771700841, + "grad_norm": 0.7421875, + "learning_rate": 0.0001500760142572619, + "loss": 0.8666, + "step": 22076 + }, + { + "epoch": 0.566875454366006, + "grad_norm": 0.765625, + "learning_rate": 0.00015007215005647995, + "loss": 0.7468, + "step": 22077 + }, + { + "epoch": 0.5669011315619278, + "grad_norm": 0.90625, + "learning_rate": 0.0001500682857559087, + "loss": 0.7581, + "step": 22078 + }, + { + "epoch": 0.5669268087578496, + "grad_norm": 0.765625, + "learning_rate": 0.00015006442135555588, + "loss": 0.9583, + "step": 22079 + }, + { + "epoch": 0.5669524859537715, + "grad_norm": 0.75, + "learning_rate": 0.00015006055685542924, + "loss": 1.0273, + "step": 22080 + }, + { + "epoch": 0.5669781631496933, + "grad_norm": 0.76953125, + "learning_rate": 0.00015005669225553637, + "loss": 0.8651, + "step": 22081 + }, + { + "epoch": 0.567003840345615, + "grad_norm": 0.88671875, + "learning_rate": 0.00015005282755588506, + "loss": 0.9082, + "step": 22082 + }, + { + "epoch": 0.5670295175415369, + "grad_norm": 0.76171875, + "learning_rate": 0.00015004896275648297, + "loss": 0.9091, + "step": 22083 + }, + { + "epoch": 0.5670551947374587, + "grad_norm": 0.80078125, + "learning_rate": 0.0001500450978573378, + "loss": 0.9166, + "step": 22084 + }, + { + "epoch": 0.5670808719333805, + "grad_norm": 0.765625, + "learning_rate": 0.00015004123285845732, + "loss": 0.8942, + "step": 22085 + }, + { + "epoch": 0.5671065491293024, + "grad_norm": 0.7734375, + "learning_rate": 0.00015003736775984919, + "loss": 0.8594, + "step": 22086 + }, + { + "epoch": 0.5671322263252242, + "grad_norm": 0.75, + "learning_rate": 0.00015003350256152106, + "loss": 0.8903, + "step": 22087 + }, + { + "epoch": 0.5671579035211459, + "grad_norm": 0.7890625, + "learning_rate": 0.0001500296372634807, + "loss": 0.85, + "step": 22088 + }, + { + "epoch": 0.5671835807170678, + "grad_norm": 0.83203125, + "learning_rate": 0.0001500257718657358, + "loss": 0.888, + "step": 22089 + }, + { + "epoch": 0.5672092579129896, + "grad_norm": 0.71484375, + "learning_rate": 0.00015002190636829402, + "loss": 0.8094, + "step": 22090 + }, + { + "epoch": 0.5672349351089114, + "grad_norm": 0.734375, + "learning_rate": 0.00015001804077116314, + "loss": 0.9494, + "step": 22091 + }, + { + "epoch": 0.5672606123048333, + "grad_norm": 0.828125, + "learning_rate": 0.00015001417507435079, + "loss": 0.9728, + "step": 22092 + }, + { + "epoch": 0.5672862895007551, + "grad_norm": 0.75, + "learning_rate": 0.00015001030927786475, + "loss": 0.8438, + "step": 22093 + }, + { + "epoch": 0.5673119666966769, + "grad_norm": 0.8671875, + "learning_rate": 0.00015000644338171267, + "loss": 0.8409, + "step": 22094 + }, + { + "epoch": 0.5673376438925987, + "grad_norm": 0.8125, + "learning_rate": 0.0001500025773859022, + "loss": 0.9566, + "step": 22095 + }, + { + "epoch": 0.5673633210885205, + "grad_norm": 0.765625, + "learning_rate": 0.0001499987112904412, + "loss": 0.8833, + "step": 22096 + }, + { + "epoch": 0.5673889982844423, + "grad_norm": 0.7734375, + "learning_rate": 0.00014999484509533724, + "loss": 0.8973, + "step": 22097 + }, + { + "epoch": 0.5674146754803642, + "grad_norm": 0.7890625, + "learning_rate": 0.00014999097880059805, + "loss": 0.8996, + "step": 22098 + }, + { + "epoch": 0.567440352676286, + "grad_norm": 0.703125, + "learning_rate": 0.0001499871124062314, + "loss": 0.8783, + "step": 22099 + }, + { + "epoch": 0.5674660298722078, + "grad_norm": 0.75390625, + "learning_rate": 0.00014998324591224493, + "loss": 0.8327, + "step": 22100 + }, + { + "epoch": 0.5674917070681296, + "grad_norm": 0.79296875, + "learning_rate": 0.00014997937931864635, + "loss": 0.7001, + "step": 22101 + }, + { + "epoch": 0.5675173842640514, + "grad_norm": 0.76953125, + "learning_rate": 0.0001499755126254434, + "loss": 0.787, + "step": 22102 + }, + { + "epoch": 0.5675430614599732, + "grad_norm": 0.83984375, + "learning_rate": 0.00014997164583264376, + "loss": 0.8946, + "step": 22103 + }, + { + "epoch": 0.5675687386558951, + "grad_norm": 0.82421875, + "learning_rate": 0.00014996777894025516, + "loss": 0.7582, + "step": 22104 + }, + { + "epoch": 0.5675944158518169, + "grad_norm": 0.78515625, + "learning_rate": 0.00014996391194828528, + "loss": 0.9672, + "step": 22105 + }, + { + "epoch": 0.5676200930477387, + "grad_norm": 0.7890625, + "learning_rate": 0.00014996004485674184, + "loss": 0.9372, + "step": 22106 + }, + { + "epoch": 0.5676457702436606, + "grad_norm": 0.7890625, + "learning_rate": 0.00014995617766563256, + "loss": 0.8428, + "step": 22107 + }, + { + "epoch": 0.5676714474395823, + "grad_norm": 0.79296875, + "learning_rate": 0.00014995231037496506, + "loss": 0.8097, + "step": 22108 + }, + { + "epoch": 0.5676971246355041, + "grad_norm": 0.77734375, + "learning_rate": 0.0001499484429847472, + "loss": 0.8067, + "step": 22109 + }, + { + "epoch": 0.567722801831426, + "grad_norm": 0.79296875, + "learning_rate": 0.00014994457549498657, + "loss": 0.8377, + "step": 22110 + }, + { + "epoch": 0.5677484790273478, + "grad_norm": 0.84765625, + "learning_rate": 0.0001499407079056909, + "loss": 0.7864, + "step": 22111 + }, + { + "epoch": 0.5677741562232697, + "grad_norm": 0.82421875, + "learning_rate": 0.00014993684021686793, + "loss": 0.9328, + "step": 22112 + }, + { + "epoch": 0.5677998334191915, + "grad_norm": 0.80859375, + "learning_rate": 0.00014993297242852534, + "loss": 0.8524, + "step": 22113 + }, + { + "epoch": 0.5678255106151133, + "grad_norm": 0.78515625, + "learning_rate": 0.00014992910454067086, + "loss": 0.8842, + "step": 22114 + }, + { + "epoch": 0.567851187811035, + "grad_norm": 0.76953125, + "learning_rate": 0.00014992523655331217, + "loss": 0.85, + "step": 22115 + }, + { + "epoch": 0.5678768650069569, + "grad_norm": 1.234375, + "learning_rate": 0.000149921368466457, + "loss": 0.7942, + "step": 22116 + }, + { + "epoch": 0.5679025422028787, + "grad_norm": 0.7578125, + "learning_rate": 0.00014991750028011305, + "loss": 0.8715, + "step": 22117 + }, + { + "epoch": 0.5679282193988006, + "grad_norm": 0.828125, + "learning_rate": 0.00014991363199428804, + "loss": 0.8782, + "step": 22118 + }, + { + "epoch": 0.5679538965947224, + "grad_norm": 0.73828125, + "learning_rate": 0.00014990976360898964, + "loss": 0.9195, + "step": 22119 + }, + { + "epoch": 0.5679795737906442, + "grad_norm": 0.7421875, + "learning_rate": 0.00014990589512422565, + "loss": 0.9184, + "step": 22120 + }, + { + "epoch": 0.568005250986566, + "grad_norm": 0.76953125, + "learning_rate": 0.00014990202654000364, + "loss": 0.7883, + "step": 22121 + }, + { + "epoch": 0.5680309281824878, + "grad_norm": 0.78515625, + "learning_rate": 0.00014989815785633142, + "loss": 0.8086, + "step": 22122 + }, + { + "epoch": 0.5680566053784096, + "grad_norm": 0.7734375, + "learning_rate": 0.00014989428907321672, + "loss": 0.95, + "step": 22123 + }, + { + "epoch": 0.5680822825743315, + "grad_norm": 0.78125, + "learning_rate": 0.00014989042019066716, + "loss": 0.8511, + "step": 22124 + }, + { + "epoch": 0.5681079597702533, + "grad_norm": 0.80859375, + "learning_rate": 0.00014988655120869054, + "loss": 0.9876, + "step": 22125 + }, + { + "epoch": 0.5681336369661751, + "grad_norm": 0.78125, + "learning_rate": 0.00014988268212729456, + "loss": 0.7855, + "step": 22126 + }, + { + "epoch": 0.568159314162097, + "grad_norm": 0.76171875, + "learning_rate": 0.0001498788129464868, + "loss": 0.8816, + "step": 22127 + }, + { + "epoch": 0.5681849913580187, + "grad_norm": 0.7734375, + "learning_rate": 0.00014987494366627515, + "loss": 0.9041, + "step": 22128 + }, + { + "epoch": 0.5682106685539405, + "grad_norm": 0.78515625, + "learning_rate": 0.0001498710742866672, + "loss": 0.9043, + "step": 22129 + }, + { + "epoch": 0.5682363457498624, + "grad_norm": 0.71875, + "learning_rate": 0.00014986720480767075, + "loss": 0.9168, + "step": 22130 + }, + { + "epoch": 0.5682620229457842, + "grad_norm": 0.81640625, + "learning_rate": 0.00014986333522929343, + "loss": 1.0233, + "step": 22131 + }, + { + "epoch": 0.568287700141706, + "grad_norm": 0.75, + "learning_rate": 0.000149859465551543, + "loss": 0.8929, + "step": 22132 + }, + { + "epoch": 0.5683133773376279, + "grad_norm": 0.76171875, + "learning_rate": 0.00014985559577442717, + "loss": 0.8273, + "step": 22133 + }, + { + "epoch": 0.5683390545335497, + "grad_norm": 0.7734375, + "learning_rate": 0.00014985172589795362, + "loss": 0.8146, + "step": 22134 + }, + { + "epoch": 0.5683647317294714, + "grad_norm": 0.796875, + "learning_rate": 0.00014984785592213006, + "loss": 0.8267, + "step": 22135 + }, + { + "epoch": 0.5683904089253933, + "grad_norm": 0.8671875, + "learning_rate": 0.00014984398584696427, + "loss": 0.7801, + "step": 22136 + }, + { + "epoch": 0.5684160861213151, + "grad_norm": 0.71875, + "learning_rate": 0.0001498401156724639, + "loss": 0.7353, + "step": 22137 + }, + { + "epoch": 0.568441763317237, + "grad_norm": 0.71484375, + "learning_rate": 0.00014983624539863667, + "loss": 0.8987, + "step": 22138 + }, + { + "epoch": 0.5684674405131588, + "grad_norm": 0.81640625, + "learning_rate": 0.00014983237502549034, + "loss": 0.9256, + "step": 22139 + }, + { + "epoch": 0.5684931177090806, + "grad_norm": 0.71875, + "learning_rate": 0.00014982850455303254, + "loss": 0.7974, + "step": 22140 + }, + { + "epoch": 0.5685187949050023, + "grad_norm": 0.71875, + "learning_rate": 0.00014982463398127108, + "loss": 0.7511, + "step": 22141 + }, + { + "epoch": 0.5685444721009242, + "grad_norm": 0.73828125, + "learning_rate": 0.0001498207633102136, + "loss": 0.8521, + "step": 22142 + }, + { + "epoch": 0.568570149296846, + "grad_norm": 0.7734375, + "learning_rate": 0.00014981689253986785, + "loss": 0.7574, + "step": 22143 + }, + { + "epoch": 0.5685958264927679, + "grad_norm": 0.8359375, + "learning_rate": 0.0001498130216702415, + "loss": 0.8812, + "step": 22144 + }, + { + "epoch": 0.5686215036886897, + "grad_norm": 0.96875, + "learning_rate": 0.00014980915070134234, + "loss": 0.8732, + "step": 22145 + }, + { + "epoch": 0.5686471808846115, + "grad_norm": 0.74609375, + "learning_rate": 0.000149805279633178, + "loss": 0.8606, + "step": 22146 + }, + { + "epoch": 0.5686728580805334, + "grad_norm": 0.7421875, + "learning_rate": 0.00014980140846575628, + "loss": 0.7601, + "step": 22147 + }, + { + "epoch": 0.5686985352764551, + "grad_norm": 0.80078125, + "learning_rate": 0.00014979753719908482, + "loss": 0.9706, + "step": 22148 + }, + { + "epoch": 0.5687242124723769, + "grad_norm": 0.81640625, + "learning_rate": 0.00014979366583317137, + "loss": 1.0266, + "step": 22149 + }, + { + "epoch": 0.5687498896682988, + "grad_norm": 0.76953125, + "learning_rate": 0.00014978979436802367, + "loss": 0.9482, + "step": 22150 + }, + { + "epoch": 0.5687755668642206, + "grad_norm": 0.76953125, + "learning_rate": 0.00014978592280364938, + "loss": 0.8083, + "step": 22151 + }, + { + "epoch": 0.5688012440601424, + "grad_norm": 0.73828125, + "learning_rate": 0.00014978205114005627, + "loss": 0.8716, + "step": 22152 + }, + { + "epoch": 0.5688269212560643, + "grad_norm": 0.73046875, + "learning_rate": 0.000149778179377252, + "loss": 0.8899, + "step": 22153 + }, + { + "epoch": 0.5688525984519861, + "grad_norm": 0.7734375, + "learning_rate": 0.00014977430751524434, + "loss": 0.984, + "step": 22154 + }, + { + "epoch": 0.5688782756479078, + "grad_norm": 0.734375, + "learning_rate": 0.00014977043555404097, + "loss": 0.8558, + "step": 22155 + }, + { + "epoch": 0.5689039528438297, + "grad_norm": 0.80078125, + "learning_rate": 0.0001497665634936496, + "loss": 0.8266, + "step": 22156 + }, + { + "epoch": 0.5689296300397515, + "grad_norm": 0.78515625, + "learning_rate": 0.00014976269133407796, + "loss": 0.8611, + "step": 22157 + }, + { + "epoch": 0.5689553072356733, + "grad_norm": 0.875, + "learning_rate": 0.0001497588190753338, + "loss": 0.9996, + "step": 22158 + }, + { + "epoch": 0.5689809844315952, + "grad_norm": 0.88671875, + "learning_rate": 0.0001497549467174248, + "loss": 0.8606, + "step": 22159 + }, + { + "epoch": 0.569006661627517, + "grad_norm": 0.8046875, + "learning_rate": 0.00014975107426035868, + "loss": 0.8138, + "step": 22160 + }, + { + "epoch": 0.5690323388234387, + "grad_norm": 0.8125, + "learning_rate": 0.00014974720170414315, + "loss": 0.8799, + "step": 22161 + }, + { + "epoch": 0.5690580160193606, + "grad_norm": 0.7578125, + "learning_rate": 0.00014974332904878597, + "loss": 0.8341, + "step": 22162 + }, + { + "epoch": 0.5690836932152824, + "grad_norm": 0.78515625, + "learning_rate": 0.00014973945629429483, + "loss": 0.8084, + "step": 22163 + }, + { + "epoch": 0.5691093704112042, + "grad_norm": 0.78125, + "learning_rate": 0.0001497355834406774, + "loss": 0.8569, + "step": 22164 + }, + { + "epoch": 0.5691350476071261, + "grad_norm": 0.80859375, + "learning_rate": 0.00014973171048794152, + "loss": 0.8965, + "step": 22165 + }, + { + "epoch": 0.5691607248030479, + "grad_norm": 0.69921875, + "learning_rate": 0.00014972783743609478, + "loss": 0.7561, + "step": 22166 + }, + { + "epoch": 0.5691864019989697, + "grad_norm": 0.7734375, + "learning_rate": 0.00014972396428514494, + "loss": 0.859, + "step": 22167 + }, + { + "epoch": 0.5692120791948915, + "grad_norm": 0.78125, + "learning_rate": 0.00014972009103509976, + "loss": 0.7791, + "step": 22168 + }, + { + "epoch": 0.5692377563908133, + "grad_norm": 0.7578125, + "learning_rate": 0.0001497162176859669, + "loss": 0.7999, + "step": 22169 + }, + { + "epoch": 0.5692634335867351, + "grad_norm": 0.71484375, + "learning_rate": 0.00014971234423775417, + "loss": 0.741, + "step": 22170 + }, + { + "epoch": 0.569289110782657, + "grad_norm": 0.75390625, + "learning_rate": 0.00014970847069046918, + "loss": 0.9791, + "step": 22171 + }, + { + "epoch": 0.5693147879785788, + "grad_norm": 0.796875, + "learning_rate": 0.0001497045970441197, + "loss": 0.8779, + "step": 22172 + }, + { + "epoch": 0.5693404651745007, + "grad_norm": 0.78125, + "learning_rate": 0.00014970072329871346, + "loss": 0.8605, + "step": 22173 + }, + { + "epoch": 0.5693661423704225, + "grad_norm": 0.73046875, + "learning_rate": 0.00014969684945425818, + "loss": 0.786, + "step": 22174 + }, + { + "epoch": 0.5693918195663442, + "grad_norm": 0.859375, + "learning_rate": 0.00014969297551076154, + "loss": 0.8682, + "step": 22175 + }, + { + "epoch": 0.569417496762266, + "grad_norm": 0.77734375, + "learning_rate": 0.0001496891014682313, + "loss": 0.8206, + "step": 22176 + }, + { + "epoch": 0.5694431739581879, + "grad_norm": 0.77734375, + "learning_rate": 0.00014968522732667516, + "loss": 0.7886, + "step": 22177 + }, + { + "epoch": 0.5694688511541097, + "grad_norm": 0.85546875, + "learning_rate": 0.00014968135308610086, + "loss": 0.9522, + "step": 22178 + }, + { + "epoch": 0.5694945283500316, + "grad_norm": 0.74609375, + "learning_rate": 0.00014967747874651614, + "loss": 0.7874, + "step": 22179 + }, + { + "epoch": 0.5695202055459534, + "grad_norm": 0.73046875, + "learning_rate": 0.00014967360430792864, + "loss": 0.801, + "step": 22180 + }, + { + "epoch": 0.5695458827418751, + "grad_norm": 0.703125, + "learning_rate": 0.00014966972977034616, + "loss": 0.7467, + "step": 22181 + }, + { + "epoch": 0.569571559937797, + "grad_norm": 0.73828125, + "learning_rate": 0.0001496658551337764, + "loss": 0.9848, + "step": 22182 + }, + { + "epoch": 0.5695972371337188, + "grad_norm": 0.8125, + "learning_rate": 0.00014966198039822705, + "loss": 1.033, + "step": 22183 + }, + { + "epoch": 0.5696229143296406, + "grad_norm": 0.77734375, + "learning_rate": 0.00014965810556370588, + "loss": 1.0281, + "step": 22184 + }, + { + "epoch": 0.5696485915255625, + "grad_norm": 0.79296875, + "learning_rate": 0.00014965423063022058, + "loss": 1.0541, + "step": 22185 + }, + { + "epoch": 0.5696742687214843, + "grad_norm": 0.7578125, + "learning_rate": 0.0001496503555977789, + "loss": 0.7452, + "step": 22186 + }, + { + "epoch": 0.5696999459174061, + "grad_norm": 0.72265625, + "learning_rate": 0.00014964648046638854, + "loss": 0.8305, + "step": 22187 + }, + { + "epoch": 0.5697256231133279, + "grad_norm": 0.765625, + "learning_rate": 0.00014964260523605722, + "loss": 0.8694, + "step": 22188 + }, + { + "epoch": 0.5697513003092497, + "grad_norm": 0.8515625, + "learning_rate": 0.00014963872990679267, + "loss": 0.9643, + "step": 22189 + }, + { + "epoch": 0.5697769775051715, + "grad_norm": 0.81640625, + "learning_rate": 0.00014963485447860265, + "loss": 0.9712, + "step": 22190 + }, + { + "epoch": 0.5698026547010934, + "grad_norm": 0.78515625, + "learning_rate": 0.0001496309789514948, + "loss": 0.9219, + "step": 22191 + }, + { + "epoch": 0.5698283318970152, + "grad_norm": 0.8515625, + "learning_rate": 0.00014962710332547694, + "loss": 1.037, + "step": 22192 + }, + { + "epoch": 0.569854009092937, + "grad_norm": 0.8515625, + "learning_rate": 0.00014962322760055672, + "loss": 0.7752, + "step": 22193 + }, + { + "epoch": 0.5698796862888589, + "grad_norm": 0.80078125, + "learning_rate": 0.00014961935177674188, + "loss": 0.9649, + "step": 22194 + }, + { + "epoch": 0.5699053634847806, + "grad_norm": 0.78125, + "learning_rate": 0.00014961547585404017, + "loss": 0.9201, + "step": 22195 + }, + { + "epoch": 0.5699310406807024, + "grad_norm": 0.73046875, + "learning_rate": 0.0001496115998324593, + "loss": 0.8877, + "step": 22196 + }, + { + "epoch": 0.5699567178766243, + "grad_norm": 0.78125, + "learning_rate": 0.00014960772371200697, + "loss": 0.9119, + "step": 22197 + }, + { + "epoch": 0.5699823950725461, + "grad_norm": 0.75390625, + "learning_rate": 0.00014960384749269093, + "loss": 1.0123, + "step": 22198 + }, + { + "epoch": 0.570008072268468, + "grad_norm": 0.79296875, + "learning_rate": 0.00014959997117451894, + "loss": 0.7714, + "step": 22199 + }, + { + "epoch": 0.5700337494643898, + "grad_norm": 0.78125, + "learning_rate": 0.00014959609475749866, + "loss": 0.8405, + "step": 22200 + }, + { + "epoch": 0.5700594266603115, + "grad_norm": 0.828125, + "learning_rate": 0.00014959221824163786, + "loss": 0.9292, + "step": 22201 + }, + { + "epoch": 0.5700851038562333, + "grad_norm": 0.78125, + "learning_rate": 0.00014958834162694421, + "loss": 0.8605, + "step": 22202 + }, + { + "epoch": 0.5701107810521552, + "grad_norm": 0.77734375, + "learning_rate": 0.00014958446491342552, + "loss": 0.8228, + "step": 22203 + }, + { + "epoch": 0.570136458248077, + "grad_norm": 0.7578125, + "learning_rate": 0.00014958058810108943, + "loss": 0.8832, + "step": 22204 + }, + { + "epoch": 0.5701621354439989, + "grad_norm": 0.8046875, + "learning_rate": 0.00014957671118994377, + "loss": 0.8617, + "step": 22205 + }, + { + "epoch": 0.5701878126399207, + "grad_norm": 0.75390625, + "learning_rate": 0.00014957283417999613, + "loss": 0.9558, + "step": 22206 + }, + { + "epoch": 0.5702134898358425, + "grad_norm": 0.73828125, + "learning_rate": 0.00014956895707125435, + "loss": 0.7473, + "step": 22207 + }, + { + "epoch": 0.5702391670317642, + "grad_norm": 0.796875, + "learning_rate": 0.0001495650798637261, + "loss": 0.8397, + "step": 22208 + }, + { + "epoch": 0.5702648442276861, + "grad_norm": 0.8203125, + "learning_rate": 0.00014956120255741912, + "loss": 0.8502, + "step": 22209 + }, + { + "epoch": 0.5702905214236079, + "grad_norm": 0.76953125, + "learning_rate": 0.00014955732515234115, + "loss": 0.7855, + "step": 22210 + }, + { + "epoch": 0.5703161986195298, + "grad_norm": 0.79296875, + "learning_rate": 0.00014955344764849992, + "loss": 0.9003, + "step": 22211 + }, + { + "epoch": 0.5703418758154516, + "grad_norm": 0.73046875, + "learning_rate": 0.00014954957004590313, + "loss": 0.8699, + "step": 22212 + }, + { + "epoch": 0.5703675530113734, + "grad_norm": 0.7578125, + "learning_rate": 0.00014954569234455853, + "loss": 0.7759, + "step": 22213 + }, + { + "epoch": 0.5703932302072953, + "grad_norm": 0.734375, + "learning_rate": 0.00014954181454447386, + "loss": 0.9888, + "step": 22214 + }, + { + "epoch": 0.570418907403217, + "grad_norm": 0.82421875, + "learning_rate": 0.00014953793664565677, + "loss": 1.0352, + "step": 22215 + }, + { + "epoch": 0.5704445845991388, + "grad_norm": 0.75390625, + "learning_rate": 0.00014953405864811506, + "loss": 0.8025, + "step": 22216 + }, + { + "epoch": 0.5704702617950607, + "grad_norm": 0.765625, + "learning_rate": 0.0001495301805518565, + "loss": 0.8007, + "step": 22217 + }, + { + "epoch": 0.5704959389909825, + "grad_norm": 0.74609375, + "learning_rate": 0.00014952630235688874, + "loss": 0.9639, + "step": 22218 + }, + { + "epoch": 0.5705216161869043, + "grad_norm": 0.74609375, + "learning_rate": 0.0001495224240632195, + "loss": 0.8627, + "step": 22219 + }, + { + "epoch": 0.5705472933828262, + "grad_norm": 0.80078125, + "learning_rate": 0.00014951854567085658, + "loss": 0.9722, + "step": 22220 + }, + { + "epoch": 0.5705729705787479, + "grad_norm": 0.7421875, + "learning_rate": 0.00014951466717980764, + "loss": 0.8397, + "step": 22221 + }, + { + "epoch": 0.5705986477746697, + "grad_norm": 0.6875, + "learning_rate": 0.00014951078859008047, + "loss": 0.818, + "step": 22222 + }, + { + "epoch": 0.5706243249705916, + "grad_norm": 0.875, + "learning_rate": 0.00014950690990168274, + "loss": 0.8512, + "step": 22223 + }, + { + "epoch": 0.5706500021665134, + "grad_norm": 0.78515625, + "learning_rate": 0.00014950303111462224, + "loss": 0.9555, + "step": 22224 + }, + { + "epoch": 0.5706756793624352, + "grad_norm": 0.74609375, + "learning_rate": 0.00014949915222890666, + "loss": 0.8453, + "step": 22225 + }, + { + "epoch": 0.5707013565583571, + "grad_norm": 0.78515625, + "learning_rate": 0.00014949527324454372, + "loss": 0.9682, + "step": 22226 + }, + { + "epoch": 0.5707270337542789, + "grad_norm": 0.7890625, + "learning_rate": 0.0001494913941615412, + "loss": 0.8378, + "step": 22227 + }, + { + "epoch": 0.5707527109502006, + "grad_norm": 0.73046875, + "learning_rate": 0.0001494875149799068, + "loss": 0.8829, + "step": 22228 + }, + { + "epoch": 0.5707783881461225, + "grad_norm": 0.87890625, + "learning_rate": 0.00014948363569964825, + "loss": 0.9343, + "step": 22229 + }, + { + "epoch": 0.5708040653420443, + "grad_norm": 0.77734375, + "learning_rate": 0.00014947975632077326, + "loss": 0.821, + "step": 22230 + }, + { + "epoch": 0.5708297425379661, + "grad_norm": 0.82421875, + "learning_rate": 0.00014947587684328964, + "loss": 0.8717, + "step": 22231 + }, + { + "epoch": 0.570855419733888, + "grad_norm": 0.74609375, + "learning_rate": 0.00014947199726720502, + "loss": 1.02, + "step": 22232 + }, + { + "epoch": 0.5708810969298098, + "grad_norm": 0.74609375, + "learning_rate": 0.0001494681175925272, + "loss": 0.8733, + "step": 22233 + }, + { + "epoch": 0.5709067741257317, + "grad_norm": 1.1484375, + "learning_rate": 0.00014946423781926385, + "loss": 0.9519, + "step": 22234 + }, + { + "epoch": 0.5709324513216534, + "grad_norm": 0.8515625, + "learning_rate": 0.00014946035794742278, + "loss": 0.7661, + "step": 22235 + }, + { + "epoch": 0.5709581285175752, + "grad_norm": 0.7578125, + "learning_rate": 0.0001494564779770117, + "loss": 0.8856, + "step": 22236 + }, + { + "epoch": 0.570983805713497, + "grad_norm": 0.7265625, + "learning_rate": 0.0001494525979080383, + "loss": 0.8591, + "step": 22237 + }, + { + "epoch": 0.5710094829094189, + "grad_norm": 0.78125, + "learning_rate": 0.00014944871774051033, + "loss": 0.7818, + "step": 22238 + }, + { + "epoch": 0.5710351601053407, + "grad_norm": 0.7578125, + "learning_rate": 0.00014944483747443554, + "loss": 0.8139, + "step": 22239 + }, + { + "epoch": 0.5710608373012626, + "grad_norm": 0.734375, + "learning_rate": 0.0001494409571098217, + "loss": 0.747, + "step": 22240 + }, + { + "epoch": 0.5710865144971843, + "grad_norm": 0.72265625, + "learning_rate": 0.00014943707664667643, + "loss": 0.7527, + "step": 22241 + }, + { + "epoch": 0.5711121916931061, + "grad_norm": 0.74609375, + "learning_rate": 0.00014943319608500756, + "loss": 1.0666, + "step": 22242 + }, + { + "epoch": 0.571137868889028, + "grad_norm": 0.7578125, + "learning_rate": 0.00014942931542482279, + "loss": 0.8751, + "step": 22243 + }, + { + "epoch": 0.5711635460849498, + "grad_norm": 0.765625, + "learning_rate": 0.00014942543466612987, + "loss": 0.8748, + "step": 22244 + }, + { + "epoch": 0.5711892232808716, + "grad_norm": 0.7578125, + "learning_rate": 0.0001494215538089365, + "loss": 0.8881, + "step": 22245 + }, + { + "epoch": 0.5712149004767935, + "grad_norm": 0.7421875, + "learning_rate": 0.0001494176728532505, + "loss": 0.9972, + "step": 22246 + }, + { + "epoch": 0.5712405776727153, + "grad_norm": 0.8125, + "learning_rate": 0.0001494137917990795, + "loss": 1.0158, + "step": 22247 + }, + { + "epoch": 0.571266254868637, + "grad_norm": 0.77734375, + "learning_rate": 0.00014940991064643126, + "loss": 0.8171, + "step": 22248 + }, + { + "epoch": 0.5712919320645589, + "grad_norm": 0.8046875, + "learning_rate": 0.0001494060293953135, + "loss": 0.8505, + "step": 22249 + }, + { + "epoch": 0.5713176092604807, + "grad_norm": 0.8203125, + "learning_rate": 0.00014940214804573407, + "loss": 0.984, + "step": 22250 + }, + { + "epoch": 0.5713432864564025, + "grad_norm": 0.71875, + "learning_rate": 0.0001493982665977006, + "loss": 0.8429, + "step": 22251 + }, + { + "epoch": 0.5713689636523244, + "grad_norm": 0.7734375, + "learning_rate": 0.00014939438505122082, + "loss": 0.8643, + "step": 22252 + }, + { + "epoch": 0.5713946408482462, + "grad_norm": 0.7734375, + "learning_rate": 0.0001493905034063025, + "loss": 0.7507, + "step": 22253 + }, + { + "epoch": 0.571420318044168, + "grad_norm": 0.69140625, + "learning_rate": 0.00014938662166295337, + "loss": 0.831, + "step": 22254 + }, + { + "epoch": 0.5714459952400898, + "grad_norm": 0.75, + "learning_rate": 0.00014938273982118117, + "loss": 0.9503, + "step": 22255 + }, + { + "epoch": 0.5714716724360116, + "grad_norm": 0.75, + "learning_rate": 0.00014937885788099362, + "loss": 0.9526, + "step": 22256 + }, + { + "epoch": 0.5714973496319334, + "grad_norm": 0.796875, + "learning_rate": 0.00014937497584239847, + "loss": 0.8496, + "step": 22257 + }, + { + "epoch": 0.5715230268278553, + "grad_norm": 0.828125, + "learning_rate": 0.00014937109370540347, + "loss": 0.8912, + "step": 22258 + }, + { + "epoch": 0.5715487040237771, + "grad_norm": 0.7265625, + "learning_rate": 0.00014936721147001632, + "loss": 0.8962, + "step": 22259 + }, + { + "epoch": 0.571574381219699, + "grad_norm": 0.8359375, + "learning_rate": 0.00014936332913624475, + "loss": 0.8186, + "step": 22260 + }, + { + "epoch": 0.5716000584156207, + "grad_norm": 0.80859375, + "learning_rate": 0.0001493594467040966, + "loss": 0.8142, + "step": 22261 + }, + { + "epoch": 0.5716257356115425, + "grad_norm": 0.70703125, + "learning_rate": 0.00014935556417357949, + "loss": 0.745, + "step": 22262 + }, + { + "epoch": 0.5716514128074643, + "grad_norm": 0.73828125, + "learning_rate": 0.00014935168154470117, + "loss": 0.8652, + "step": 22263 + }, + { + "epoch": 0.5716770900033862, + "grad_norm": 0.78515625, + "learning_rate": 0.00014934779881746945, + "loss": 0.9052, + "step": 22264 + }, + { + "epoch": 0.571702767199308, + "grad_norm": 0.79296875, + "learning_rate": 0.00014934391599189198, + "loss": 0.8192, + "step": 22265 + }, + { + "epoch": 0.5717284443952299, + "grad_norm": 0.75390625, + "learning_rate": 0.00014934003306797658, + "loss": 0.8776, + "step": 22266 + }, + { + "epoch": 0.5717541215911517, + "grad_norm": 0.76171875, + "learning_rate": 0.00014933615004573096, + "loss": 0.7933, + "step": 22267 + }, + { + "epoch": 0.5717797987870734, + "grad_norm": 0.80859375, + "learning_rate": 0.0001493322669251628, + "loss": 0.9578, + "step": 22268 + }, + { + "epoch": 0.5718054759829952, + "grad_norm": 0.7578125, + "learning_rate": 0.00014932838370627992, + "loss": 0.979, + "step": 22269 + }, + { + "epoch": 0.5718311531789171, + "grad_norm": 0.76171875, + "learning_rate": 0.00014932450038909005, + "loss": 0.8496, + "step": 22270 + }, + { + "epoch": 0.5718568303748389, + "grad_norm": 0.81640625, + "learning_rate": 0.00014932061697360086, + "loss": 0.8922, + "step": 22271 + }, + { + "epoch": 0.5718825075707608, + "grad_norm": 0.80859375, + "learning_rate": 0.00014931673345982017, + "loss": 0.9054, + "step": 22272 + }, + { + "epoch": 0.5719081847666826, + "grad_norm": 0.76953125, + "learning_rate": 0.00014931284984775567, + "loss": 0.9366, + "step": 22273 + }, + { + "epoch": 0.5719338619626044, + "grad_norm": 0.83203125, + "learning_rate": 0.00014930896613741512, + "loss": 0.769, + "step": 22274 + }, + { + "epoch": 0.5719595391585262, + "grad_norm": 0.765625, + "learning_rate": 0.00014930508232880626, + "loss": 0.8166, + "step": 22275 + }, + { + "epoch": 0.571985216354448, + "grad_norm": 0.7421875, + "learning_rate": 0.0001493011984219368, + "loss": 0.8838, + "step": 22276 + }, + { + "epoch": 0.5720108935503698, + "grad_norm": 0.7265625, + "learning_rate": 0.00014929731441681453, + "loss": 0.9403, + "step": 22277 + }, + { + "epoch": 0.5720365707462917, + "grad_norm": 0.83984375, + "learning_rate": 0.00014929343031344714, + "loss": 0.8625, + "step": 22278 + }, + { + "epoch": 0.5720622479422135, + "grad_norm": 0.74609375, + "learning_rate": 0.00014928954611184242, + "loss": 1.0019, + "step": 22279 + }, + { + "epoch": 0.5720879251381353, + "grad_norm": 0.82421875, + "learning_rate": 0.0001492856618120081, + "loss": 0.8384, + "step": 22280 + }, + { + "epoch": 0.5721136023340571, + "grad_norm": 0.7734375, + "learning_rate": 0.00014928177741395187, + "loss": 0.9296, + "step": 22281 + }, + { + "epoch": 0.5721392795299789, + "grad_norm": 0.75, + "learning_rate": 0.00014927789291768154, + "loss": 0.8142, + "step": 22282 + }, + { + "epoch": 0.5721649567259007, + "grad_norm": 0.7734375, + "learning_rate": 0.0001492740083232048, + "loss": 0.8362, + "step": 22283 + }, + { + "epoch": 0.5721906339218226, + "grad_norm": 0.6953125, + "learning_rate": 0.0001492701236305294, + "loss": 0.7511, + "step": 22284 + }, + { + "epoch": 0.5722163111177444, + "grad_norm": 0.76171875, + "learning_rate": 0.00014926623883966314, + "loss": 0.9829, + "step": 22285 + }, + { + "epoch": 0.5722419883136662, + "grad_norm": 0.76171875, + "learning_rate": 0.0001492623539506137, + "loss": 0.9975, + "step": 22286 + }, + { + "epoch": 0.5722676655095881, + "grad_norm": 0.703125, + "learning_rate": 0.0001492584689633888, + "loss": 0.7935, + "step": 22287 + }, + { + "epoch": 0.5722933427055098, + "grad_norm": 0.7578125, + "learning_rate": 0.00014925458387799628, + "loss": 0.9807, + "step": 22288 + }, + { + "epoch": 0.5723190199014316, + "grad_norm": 0.84765625, + "learning_rate": 0.00014925069869444374, + "loss": 0.9389, + "step": 22289 + }, + { + "epoch": 0.5723446970973535, + "grad_norm": 0.828125, + "learning_rate": 0.0001492468134127391, + "loss": 0.897, + "step": 22290 + }, + { + "epoch": 0.5723703742932753, + "grad_norm": 0.8671875, + "learning_rate": 0.00014924292803288996, + "loss": 0.9954, + "step": 22291 + }, + { + "epoch": 0.5723960514891971, + "grad_norm": 0.765625, + "learning_rate": 0.0001492390425549041, + "loss": 0.9595, + "step": 22292 + }, + { + "epoch": 0.572421728685119, + "grad_norm": 0.80078125, + "learning_rate": 0.00014923515697878932, + "loss": 0.8747, + "step": 22293 + }, + { + "epoch": 0.5724474058810408, + "grad_norm": 0.81640625, + "learning_rate": 0.00014923127130455328, + "loss": 1.0094, + "step": 22294 + }, + { + "epoch": 0.5724730830769625, + "grad_norm": 0.80859375, + "learning_rate": 0.00014922738553220378, + "loss": 0.892, + "step": 22295 + }, + { + "epoch": 0.5724987602728844, + "grad_norm": 0.80078125, + "learning_rate": 0.00014922349966174856, + "loss": 0.9427, + "step": 22296 + }, + { + "epoch": 0.5725244374688062, + "grad_norm": 0.7265625, + "learning_rate": 0.00014921961369319532, + "loss": 0.7936, + "step": 22297 + }, + { + "epoch": 0.572550114664728, + "grad_norm": 0.69921875, + "learning_rate": 0.00014921572762655188, + "loss": 0.9053, + "step": 22298 + }, + { + "epoch": 0.5725757918606499, + "grad_norm": 0.75, + "learning_rate": 0.00014921184146182592, + "loss": 0.8331, + "step": 22299 + }, + { + "epoch": 0.5726014690565717, + "grad_norm": 0.7265625, + "learning_rate": 0.00014920795519902518, + "loss": 0.8232, + "step": 22300 + }, + { + "epoch": 0.5726271462524934, + "grad_norm": 0.7578125, + "learning_rate": 0.00014920406883815745, + "loss": 0.7837, + "step": 22301 + }, + { + "epoch": 0.5726528234484153, + "grad_norm": 0.76953125, + "learning_rate": 0.00014920018237923045, + "loss": 0.8048, + "step": 22302 + }, + { + "epoch": 0.5726785006443371, + "grad_norm": 0.7578125, + "learning_rate": 0.00014919629582225196, + "loss": 0.9123, + "step": 22303 + }, + { + "epoch": 0.572704177840259, + "grad_norm": 1.140625, + "learning_rate": 0.00014919240916722967, + "loss": 0.9333, + "step": 22304 + }, + { + "epoch": 0.5727298550361808, + "grad_norm": 0.78125, + "learning_rate": 0.00014918852241417135, + "loss": 0.8847, + "step": 22305 + }, + { + "epoch": 0.5727555322321026, + "grad_norm": 0.84765625, + "learning_rate": 0.00014918463556308476, + "loss": 0.8816, + "step": 22306 + }, + { + "epoch": 0.5727812094280245, + "grad_norm": 0.91796875, + "learning_rate": 0.00014918074861397762, + "loss": 0.903, + "step": 22307 + }, + { + "epoch": 0.5728068866239462, + "grad_norm": 0.7734375, + "learning_rate": 0.00014917686156685768, + "loss": 0.816, + "step": 22308 + }, + { + "epoch": 0.572832563819868, + "grad_norm": 0.76171875, + "learning_rate": 0.00014917297442173272, + "loss": 0.8664, + "step": 22309 + }, + { + "epoch": 0.5728582410157899, + "grad_norm": 0.75, + "learning_rate": 0.00014916908717861046, + "loss": 0.8131, + "step": 22310 + }, + { + "epoch": 0.5728839182117117, + "grad_norm": 0.7421875, + "learning_rate": 0.00014916519983749867, + "loss": 0.8611, + "step": 22311 + }, + { + "epoch": 0.5729095954076335, + "grad_norm": 0.734375, + "learning_rate": 0.00014916131239840505, + "loss": 0.7636, + "step": 22312 + }, + { + "epoch": 0.5729352726035554, + "grad_norm": 0.70703125, + "learning_rate": 0.00014915742486133738, + "loss": 0.8597, + "step": 22313 + }, + { + "epoch": 0.5729609497994771, + "grad_norm": 0.79296875, + "learning_rate": 0.0001491535372263034, + "loss": 0.8847, + "step": 22314 + }, + { + "epoch": 0.5729866269953989, + "grad_norm": 0.7890625, + "learning_rate": 0.00014914964949331086, + "loss": 0.7955, + "step": 22315 + }, + { + "epoch": 0.5730123041913208, + "grad_norm": 0.80859375, + "learning_rate": 0.0001491457616623675, + "loss": 0.9498, + "step": 22316 + }, + { + "epoch": 0.5730379813872426, + "grad_norm": 0.76953125, + "learning_rate": 0.0001491418737334811, + "loss": 0.8576, + "step": 22317 + }, + { + "epoch": 0.5730636585831644, + "grad_norm": 0.7734375, + "learning_rate": 0.00014913798570665937, + "loss": 0.8614, + "step": 22318 + }, + { + "epoch": 0.5730893357790863, + "grad_norm": 0.83203125, + "learning_rate": 0.0001491340975819101, + "loss": 0.8898, + "step": 22319 + }, + { + "epoch": 0.5731150129750081, + "grad_norm": 0.7890625, + "learning_rate": 0.000149130209359241, + "loss": 0.8357, + "step": 22320 + }, + { + "epoch": 0.5731406901709298, + "grad_norm": 0.76171875, + "learning_rate": 0.0001491263210386598, + "loss": 0.9103, + "step": 22321 + }, + { + "epoch": 0.5731663673668517, + "grad_norm": 0.70703125, + "learning_rate": 0.0001491224326201743, + "loss": 0.7679, + "step": 22322 + }, + { + "epoch": 0.5731920445627735, + "grad_norm": 0.796875, + "learning_rate": 0.0001491185441037922, + "loss": 0.7262, + "step": 22323 + }, + { + "epoch": 0.5732177217586953, + "grad_norm": 0.8046875, + "learning_rate": 0.00014911465548952133, + "loss": 0.8193, + "step": 22324 + }, + { + "epoch": 0.5732433989546172, + "grad_norm": 0.765625, + "learning_rate": 0.00014911076677736937, + "loss": 0.8101, + "step": 22325 + }, + { + "epoch": 0.573269076150539, + "grad_norm": 0.70703125, + "learning_rate": 0.00014910687796734407, + "loss": 0.8831, + "step": 22326 + }, + { + "epoch": 0.5732947533464609, + "grad_norm": 0.73046875, + "learning_rate": 0.0001491029890594532, + "loss": 0.8972, + "step": 22327 + }, + { + "epoch": 0.5733204305423826, + "grad_norm": 0.73828125, + "learning_rate": 0.00014909910005370453, + "loss": 0.9628, + "step": 22328 + }, + { + "epoch": 0.5733461077383044, + "grad_norm": 0.7734375, + "learning_rate": 0.00014909521095010577, + "loss": 0.8914, + "step": 22329 + }, + { + "epoch": 0.5733717849342262, + "grad_norm": 0.7734375, + "learning_rate": 0.00014909132174866473, + "loss": 0.9222, + "step": 22330 + }, + { + "epoch": 0.5733974621301481, + "grad_norm": 1.625, + "learning_rate": 0.00014908743244938905, + "loss": 0.9032, + "step": 22331 + }, + { + "epoch": 0.5734231393260699, + "grad_norm": 0.78515625, + "learning_rate": 0.0001490835430522866, + "loss": 0.8284, + "step": 22332 + }, + { + "epoch": 0.5734488165219918, + "grad_norm": 0.79296875, + "learning_rate": 0.0001490796535573651, + "loss": 0.8344, + "step": 22333 + }, + { + "epoch": 0.5734744937179135, + "grad_norm": 0.80859375, + "learning_rate": 0.0001490757639646322, + "loss": 0.9601, + "step": 22334 + }, + { + "epoch": 0.5735001709138353, + "grad_norm": 0.828125, + "learning_rate": 0.00014907187427409582, + "loss": 0.8533, + "step": 22335 + }, + { + "epoch": 0.5735258481097572, + "grad_norm": 0.8359375, + "learning_rate": 0.0001490679844857636, + "loss": 0.845, + "step": 22336 + }, + { + "epoch": 0.573551525305679, + "grad_norm": 0.85546875, + "learning_rate": 0.00014906409459964333, + "loss": 0.8943, + "step": 22337 + }, + { + "epoch": 0.5735772025016008, + "grad_norm": 0.73046875, + "learning_rate": 0.00014906020461574274, + "loss": 0.844, + "step": 22338 + }, + { + "epoch": 0.5736028796975227, + "grad_norm": 0.7734375, + "learning_rate": 0.00014905631453406956, + "loss": 0.8376, + "step": 22339 + }, + { + "epoch": 0.5736285568934445, + "grad_norm": 0.83203125, + "learning_rate": 0.00014905242435463162, + "loss": 0.8061, + "step": 22340 + }, + { + "epoch": 0.5736542340893662, + "grad_norm": 0.8203125, + "learning_rate": 0.0001490485340774366, + "loss": 0.8967, + "step": 22341 + }, + { + "epoch": 0.5736799112852881, + "grad_norm": 0.86328125, + "learning_rate": 0.0001490446437024923, + "loss": 0.9513, + "step": 22342 + }, + { + "epoch": 0.5737055884812099, + "grad_norm": 0.7890625, + "learning_rate": 0.00014904075322980646, + "loss": 0.8826, + "step": 22343 + }, + { + "epoch": 0.5737312656771317, + "grad_norm": 0.8046875, + "learning_rate": 0.00014903686265938684, + "loss": 0.9492, + "step": 22344 + }, + { + "epoch": 0.5737569428730536, + "grad_norm": 0.78515625, + "learning_rate": 0.00014903297199124116, + "loss": 0.9193, + "step": 22345 + }, + { + "epoch": 0.5737826200689754, + "grad_norm": 0.76171875, + "learning_rate": 0.00014902908122537719, + "loss": 0.9318, + "step": 22346 + }, + { + "epoch": 0.5738082972648972, + "grad_norm": 0.73046875, + "learning_rate": 0.00014902519036180271, + "loss": 0.9924, + "step": 22347 + }, + { + "epoch": 0.573833974460819, + "grad_norm": 0.78515625, + "learning_rate": 0.00014902129940052542, + "loss": 0.8918, + "step": 22348 + }, + { + "epoch": 0.5738596516567408, + "grad_norm": 0.83203125, + "learning_rate": 0.00014901740834155318, + "loss": 0.9347, + "step": 22349 + }, + { + "epoch": 0.5738853288526626, + "grad_norm": 0.73828125, + "learning_rate": 0.0001490135171848936, + "loss": 0.8044, + "step": 22350 + }, + { + "epoch": 0.5739110060485845, + "grad_norm": 0.86328125, + "learning_rate": 0.00014900962593055454, + "loss": 1.1027, + "step": 22351 + }, + { + "epoch": 0.5739366832445063, + "grad_norm": 0.7734375, + "learning_rate": 0.00014900573457854373, + "loss": 0.8631, + "step": 22352 + }, + { + "epoch": 0.5739623604404281, + "grad_norm": 0.78125, + "learning_rate": 0.0001490018431288689, + "loss": 0.8028, + "step": 22353 + }, + { + "epoch": 0.5739880376363499, + "grad_norm": 0.73046875, + "learning_rate": 0.0001489979515815378, + "loss": 0.8339, + "step": 22354 + }, + { + "epoch": 0.5740137148322717, + "grad_norm": 0.83984375, + "learning_rate": 0.00014899405993655824, + "loss": 1.0028, + "step": 22355 + }, + { + "epoch": 0.5740393920281935, + "grad_norm": 0.83203125, + "learning_rate": 0.00014899016819393797, + "loss": 0.8517, + "step": 22356 + }, + { + "epoch": 0.5740650692241154, + "grad_norm": 0.84375, + "learning_rate": 0.00014898627635368466, + "loss": 0.8623, + "step": 22357 + }, + { + "epoch": 0.5740907464200372, + "grad_norm": 0.7109375, + "learning_rate": 0.00014898238441580616, + "loss": 0.9531, + "step": 22358 + }, + { + "epoch": 0.574116423615959, + "grad_norm": 0.78125, + "learning_rate": 0.00014897849238031019, + "loss": 0.8664, + "step": 22359 + }, + { + "epoch": 0.5741421008118809, + "grad_norm": 0.77734375, + "learning_rate": 0.00014897460024720452, + "loss": 0.8968, + "step": 22360 + }, + { + "epoch": 0.5741677780078026, + "grad_norm": 0.73046875, + "learning_rate": 0.00014897070801649686, + "loss": 0.7888, + "step": 22361 + }, + { + "epoch": 0.5741934552037244, + "grad_norm": 0.78125, + "learning_rate": 0.00014896681568819504, + "loss": 0.898, + "step": 22362 + }, + { + "epoch": 0.5742191323996463, + "grad_norm": 0.83984375, + "learning_rate": 0.00014896292326230676, + "loss": 1.0733, + "step": 22363 + }, + { + "epoch": 0.5742448095955681, + "grad_norm": 0.69921875, + "learning_rate": 0.0001489590307388398, + "loss": 0.8125, + "step": 22364 + }, + { + "epoch": 0.57427048679149, + "grad_norm": 0.83203125, + "learning_rate": 0.00014895513811780193, + "loss": 1.0148, + "step": 22365 + }, + { + "epoch": 0.5742961639874118, + "grad_norm": 0.765625, + "learning_rate": 0.00014895124539920086, + "loss": 0.8341, + "step": 22366 + }, + { + "epoch": 0.5743218411833336, + "grad_norm": 0.80859375, + "learning_rate": 0.0001489473525830444, + "loss": 0.8573, + "step": 22367 + }, + { + "epoch": 0.5743475183792554, + "grad_norm": 0.81640625, + "learning_rate": 0.00014894345966934028, + "loss": 0.8637, + "step": 22368 + }, + { + "epoch": 0.5743731955751772, + "grad_norm": 0.75, + "learning_rate": 0.0001489395666580963, + "loss": 1.0015, + "step": 22369 + }, + { + "epoch": 0.574398872771099, + "grad_norm": 0.76953125, + "learning_rate": 0.00014893567354932016, + "loss": 0.7962, + "step": 22370 + }, + { + "epoch": 0.5744245499670209, + "grad_norm": 0.75390625, + "learning_rate": 0.0001489317803430196, + "loss": 0.7873, + "step": 22371 + }, + { + "epoch": 0.5744502271629427, + "grad_norm": 0.8515625, + "learning_rate": 0.00014892788703920249, + "loss": 0.9795, + "step": 22372 + }, + { + "epoch": 0.5744759043588645, + "grad_norm": 0.83203125, + "learning_rate": 0.00014892399363787648, + "loss": 0.888, + "step": 22373 + }, + { + "epoch": 0.5745015815547863, + "grad_norm": 0.76953125, + "learning_rate": 0.00014892010013904935, + "loss": 0.8653, + "step": 22374 + }, + { + "epoch": 0.5745272587507081, + "grad_norm": 0.73828125, + "learning_rate": 0.00014891620654272893, + "loss": 0.8153, + "step": 22375 + }, + { + "epoch": 0.5745529359466299, + "grad_norm": 0.734375, + "learning_rate": 0.00014891231284892294, + "loss": 0.9029, + "step": 22376 + }, + { + "epoch": 0.5745786131425518, + "grad_norm": 0.7578125, + "learning_rate": 0.00014890841905763907, + "loss": 0.9023, + "step": 22377 + }, + { + "epoch": 0.5746042903384736, + "grad_norm": 0.8515625, + "learning_rate": 0.00014890452516888518, + "loss": 0.8439, + "step": 22378 + }, + { + "epoch": 0.5746299675343954, + "grad_norm": 0.8046875, + "learning_rate": 0.00014890063118266894, + "loss": 0.8708, + "step": 22379 + }, + { + "epoch": 0.5746556447303173, + "grad_norm": 0.78125, + "learning_rate": 0.00014889673709899822, + "loss": 1.0059, + "step": 22380 + }, + { + "epoch": 0.574681321926239, + "grad_norm": 0.80859375, + "learning_rate": 0.00014889284291788067, + "loss": 0.8649, + "step": 22381 + }, + { + "epoch": 0.5747069991221608, + "grad_norm": 0.75, + "learning_rate": 0.00014888894863932415, + "loss": 0.8444, + "step": 22382 + }, + { + "epoch": 0.5747326763180827, + "grad_norm": 0.78515625, + "learning_rate": 0.00014888505426333633, + "loss": 0.8365, + "step": 22383 + }, + { + "epoch": 0.5747583535140045, + "grad_norm": 0.79296875, + "learning_rate": 0.00014888115978992504, + "loss": 0.8838, + "step": 22384 + }, + { + "epoch": 0.5747840307099263, + "grad_norm": 0.72265625, + "learning_rate": 0.000148877265219098, + "loss": 0.9126, + "step": 22385 + }, + { + "epoch": 0.5748097079058482, + "grad_norm": 0.7890625, + "learning_rate": 0.000148873370550863, + "loss": 0.9151, + "step": 22386 + }, + { + "epoch": 0.57483538510177, + "grad_norm": 0.76171875, + "learning_rate": 0.00014886947578522777, + "loss": 0.8048, + "step": 22387 + }, + { + "epoch": 0.5748610622976917, + "grad_norm": 0.76953125, + "learning_rate": 0.00014886558092220008, + "loss": 0.9052, + "step": 22388 + }, + { + "epoch": 0.5748867394936136, + "grad_norm": 0.796875, + "learning_rate": 0.00014886168596178772, + "loss": 0.8853, + "step": 22389 + }, + { + "epoch": 0.5749124166895354, + "grad_norm": 0.75390625, + "learning_rate": 0.00014885779090399845, + "loss": 0.8664, + "step": 22390 + }, + { + "epoch": 0.5749380938854572, + "grad_norm": 0.8359375, + "learning_rate": 0.00014885389574884, + "loss": 0.8993, + "step": 22391 + }, + { + "epoch": 0.5749637710813791, + "grad_norm": 0.74609375, + "learning_rate": 0.00014885000049632015, + "loss": 0.895, + "step": 22392 + }, + { + "epoch": 0.5749894482773009, + "grad_norm": 0.72265625, + "learning_rate": 0.00014884610514644665, + "loss": 0.7812, + "step": 22393 + }, + { + "epoch": 0.5750151254732226, + "grad_norm": 0.7734375, + "learning_rate": 0.00014884220969922727, + "loss": 0.8654, + "step": 22394 + }, + { + "epoch": 0.5750408026691445, + "grad_norm": 0.73046875, + "learning_rate": 0.00014883831415466977, + "loss": 0.9379, + "step": 22395 + }, + { + "epoch": 0.5750664798650663, + "grad_norm": 0.77734375, + "learning_rate": 0.00014883441851278194, + "loss": 0.9262, + "step": 22396 + }, + { + "epoch": 0.5750921570609882, + "grad_norm": 0.7890625, + "learning_rate": 0.00014883052277357155, + "loss": 1.0534, + "step": 22397 + }, + { + "epoch": 0.57511783425691, + "grad_norm": 0.8125, + "learning_rate": 0.0001488266269370463, + "loss": 0.8745, + "step": 22398 + }, + { + "epoch": 0.5751435114528318, + "grad_norm": 0.78515625, + "learning_rate": 0.00014882273100321402, + "loss": 0.8597, + "step": 22399 + }, + { + "epoch": 0.5751691886487537, + "grad_norm": 0.75, + "learning_rate": 0.0001488188349720824, + "loss": 0.9571, + "step": 22400 + }, + { + "epoch": 0.5751948658446754, + "grad_norm": 0.78515625, + "learning_rate": 0.00014881493884365932, + "loss": 0.8516, + "step": 22401 + }, + { + "epoch": 0.5752205430405972, + "grad_norm": 0.734375, + "learning_rate": 0.00014881104261795246, + "loss": 1.0052, + "step": 22402 + }, + { + "epoch": 0.5752462202365191, + "grad_norm": 0.74609375, + "learning_rate": 0.00014880714629496956, + "loss": 0.7461, + "step": 22403 + }, + { + "epoch": 0.5752718974324409, + "grad_norm": 0.7890625, + "learning_rate": 0.00014880324987471844, + "loss": 0.94, + "step": 22404 + }, + { + "epoch": 0.5752975746283627, + "grad_norm": 0.7734375, + "learning_rate": 0.00014879935335720688, + "loss": 0.7663, + "step": 22405 + }, + { + "epoch": 0.5753232518242846, + "grad_norm": 0.828125, + "learning_rate": 0.00014879545674244257, + "loss": 0.9639, + "step": 22406 + }, + { + "epoch": 0.5753489290202064, + "grad_norm": 0.79296875, + "learning_rate": 0.00014879156003043336, + "loss": 0.9522, + "step": 22407 + }, + { + "epoch": 0.5753746062161281, + "grad_norm": 0.80859375, + "learning_rate": 0.00014878766322118698, + "loss": 0.7465, + "step": 22408 + }, + { + "epoch": 0.57540028341205, + "grad_norm": 0.70703125, + "learning_rate": 0.0001487837663147112, + "loss": 0.8226, + "step": 22409 + }, + { + "epoch": 0.5754259606079718, + "grad_norm": 0.7734375, + "learning_rate": 0.00014877986931101375, + "loss": 0.8851, + "step": 22410 + }, + { + "epoch": 0.5754516378038936, + "grad_norm": 0.8359375, + "learning_rate": 0.00014877597221010243, + "loss": 0.8164, + "step": 22411 + }, + { + "epoch": 0.5754773149998155, + "grad_norm": 0.8359375, + "learning_rate": 0.000148772075011985, + "loss": 0.8742, + "step": 22412 + }, + { + "epoch": 0.5755029921957373, + "grad_norm": 0.81640625, + "learning_rate": 0.00014876817771666926, + "loss": 0.8853, + "step": 22413 + }, + { + "epoch": 0.575528669391659, + "grad_norm": 0.7109375, + "learning_rate": 0.00014876428032416293, + "loss": 0.7536, + "step": 22414 + }, + { + "epoch": 0.5755543465875809, + "grad_norm": 0.73828125, + "learning_rate": 0.0001487603828344738, + "loss": 0.9458, + "step": 22415 + }, + { + "epoch": 0.5755800237835027, + "grad_norm": 0.70703125, + "learning_rate": 0.00014875648524760961, + "loss": 1.0239, + "step": 22416 + }, + { + "epoch": 0.5756057009794245, + "grad_norm": 0.91796875, + "learning_rate": 0.00014875258756357818, + "loss": 0.8544, + "step": 22417 + }, + { + "epoch": 0.5756313781753464, + "grad_norm": 0.7578125, + "learning_rate": 0.00014874868978238722, + "loss": 0.8965, + "step": 22418 + }, + { + "epoch": 0.5756570553712682, + "grad_norm": 0.8515625, + "learning_rate": 0.0001487447919040445, + "loss": 0.9034, + "step": 22419 + }, + { + "epoch": 0.57568273256719, + "grad_norm": 0.73046875, + "learning_rate": 0.00014874089392855787, + "loss": 0.7984, + "step": 22420 + }, + { + "epoch": 0.5757084097631118, + "grad_norm": 0.81640625, + "learning_rate": 0.00014873699585593502, + "loss": 0.92, + "step": 22421 + }, + { + "epoch": 0.5757340869590336, + "grad_norm": 0.7734375, + "learning_rate": 0.00014873309768618375, + "loss": 0.8601, + "step": 22422 + }, + { + "epoch": 0.5757597641549554, + "grad_norm": 0.8125, + "learning_rate": 0.00014872919941931183, + "loss": 0.9155, + "step": 22423 + }, + { + "epoch": 0.5757854413508773, + "grad_norm": 0.75390625, + "learning_rate": 0.00014872530105532699, + "loss": 0.8899, + "step": 22424 + }, + { + "epoch": 0.5758111185467991, + "grad_norm": 0.8125, + "learning_rate": 0.00014872140259423702, + "loss": 0.8097, + "step": 22425 + }, + { + "epoch": 0.575836795742721, + "grad_norm": 0.81640625, + "learning_rate": 0.00014871750403604972, + "loss": 0.9279, + "step": 22426 + }, + { + "epoch": 0.5758624729386428, + "grad_norm": 2.015625, + "learning_rate": 0.0001487136053807728, + "loss": 0.8349, + "step": 22427 + }, + { + "epoch": 0.5758881501345645, + "grad_norm": 0.75, + "learning_rate": 0.0001487097066284141, + "loss": 1.0322, + "step": 22428 + }, + { + "epoch": 0.5759138273304863, + "grad_norm": 0.66015625, + "learning_rate": 0.00014870580777898135, + "loss": 0.8332, + "step": 22429 + }, + { + "epoch": 0.5759395045264082, + "grad_norm": 0.77734375, + "learning_rate": 0.0001487019088324823, + "loss": 0.9713, + "step": 22430 + }, + { + "epoch": 0.57596518172233, + "grad_norm": 0.82421875, + "learning_rate": 0.00014869800978892478, + "loss": 0.9855, + "step": 22431 + }, + { + "epoch": 0.5759908589182519, + "grad_norm": 0.8203125, + "learning_rate": 0.0001486941106483165, + "loss": 0.8639, + "step": 22432 + }, + { + "epoch": 0.5760165361141737, + "grad_norm": 0.7890625, + "learning_rate": 0.00014869021141066525, + "loss": 0.9361, + "step": 22433 + }, + { + "epoch": 0.5760422133100954, + "grad_norm": 0.75390625, + "learning_rate": 0.00014868631207597883, + "loss": 0.7821, + "step": 22434 + }, + { + "epoch": 0.5760678905060173, + "grad_norm": 0.74609375, + "learning_rate": 0.00014868241264426497, + "loss": 0.7574, + "step": 22435 + }, + { + "epoch": 0.5760935677019391, + "grad_norm": 0.703125, + "learning_rate": 0.00014867851311553147, + "loss": 0.8702, + "step": 22436 + }, + { + "epoch": 0.5761192448978609, + "grad_norm": 0.9453125, + "learning_rate": 0.0001486746134897861, + "loss": 0.8159, + "step": 22437 + }, + { + "epoch": 0.5761449220937828, + "grad_norm": 0.87109375, + "learning_rate": 0.00014867071376703658, + "loss": 0.8299, + "step": 22438 + }, + { + "epoch": 0.5761705992897046, + "grad_norm": 0.75, + "learning_rate": 0.00014866681394729077, + "loss": 0.8158, + "step": 22439 + }, + { + "epoch": 0.5761962764856264, + "grad_norm": 0.91796875, + "learning_rate": 0.00014866291403055638, + "loss": 0.8746, + "step": 22440 + }, + { + "epoch": 0.5762219536815482, + "grad_norm": 0.8203125, + "learning_rate": 0.00014865901401684122, + "loss": 1.0011, + "step": 22441 + }, + { + "epoch": 0.57624763087747, + "grad_norm": 0.78125, + "learning_rate": 0.00014865511390615302, + "loss": 0.894, + "step": 22442 + }, + { + "epoch": 0.5762733080733918, + "grad_norm": 0.69921875, + "learning_rate": 0.0001486512136984995, + "loss": 0.7689, + "step": 22443 + }, + { + "epoch": 0.5762989852693137, + "grad_norm": 0.95703125, + "learning_rate": 0.00014864731339388863, + "loss": 0.8765, + "step": 22444 + }, + { + "epoch": 0.5763246624652355, + "grad_norm": 0.75, + "learning_rate": 0.000148643412992328, + "loss": 0.7243, + "step": 22445 + }, + { + "epoch": 0.5763503396611573, + "grad_norm": 0.6953125, + "learning_rate": 0.00014863951249382545, + "loss": 0.747, + "step": 22446 + }, + { + "epoch": 0.5763760168570792, + "grad_norm": 0.75390625, + "learning_rate": 0.00014863561189838872, + "loss": 0.8847, + "step": 22447 + }, + { + "epoch": 0.5764016940530009, + "grad_norm": 0.83203125, + "learning_rate": 0.00014863171120602564, + "loss": 0.936, + "step": 22448 + }, + { + "epoch": 0.5764273712489227, + "grad_norm": 0.7890625, + "learning_rate": 0.00014862781041674394, + "loss": 0.958, + "step": 22449 + }, + { + "epoch": 0.5764530484448446, + "grad_norm": 0.828125, + "learning_rate": 0.0001486239095305514, + "loss": 0.8594, + "step": 22450 + }, + { + "epoch": 0.5764787256407664, + "grad_norm": 0.74609375, + "learning_rate": 0.00014862000854745582, + "loss": 0.8776, + "step": 22451 + }, + { + "epoch": 0.5765044028366882, + "grad_norm": 0.78125, + "learning_rate": 0.00014861610746746496, + "loss": 0.947, + "step": 22452 + }, + { + "epoch": 0.5765300800326101, + "grad_norm": 0.80859375, + "learning_rate": 0.00014861220629058656, + "loss": 0.9121, + "step": 22453 + }, + { + "epoch": 0.5765557572285318, + "grad_norm": 0.76953125, + "learning_rate": 0.00014860830501682844, + "loss": 0.9368, + "step": 22454 + }, + { + "epoch": 0.5765814344244536, + "grad_norm": 0.73046875, + "learning_rate": 0.00014860440364619838, + "loss": 0.7777, + "step": 22455 + }, + { + "epoch": 0.5766071116203755, + "grad_norm": 0.79296875, + "learning_rate": 0.00014860050217870406, + "loss": 0.8074, + "step": 22456 + }, + { + "epoch": 0.5766327888162973, + "grad_norm": 0.80859375, + "learning_rate": 0.00014859660061435342, + "loss": 0.9892, + "step": 22457 + }, + { + "epoch": 0.5766584660122192, + "grad_norm": 0.76953125, + "learning_rate": 0.00014859269895315408, + "loss": 0.9228, + "step": 22458 + }, + { + "epoch": 0.576684143208141, + "grad_norm": 0.84375, + "learning_rate": 0.0001485887971951139, + "loss": 0.9505, + "step": 22459 + }, + { + "epoch": 0.5767098204040628, + "grad_norm": 0.73046875, + "learning_rate": 0.00014858489534024064, + "loss": 0.8186, + "step": 22460 + }, + { + "epoch": 0.5767354975999845, + "grad_norm": 0.8359375, + "learning_rate": 0.0001485809933885421, + "loss": 0.7582, + "step": 22461 + }, + { + "epoch": 0.5767611747959064, + "grad_norm": 0.796875, + "learning_rate": 0.000148577091340026, + "loss": 0.9281, + "step": 22462 + }, + { + "epoch": 0.5767868519918282, + "grad_norm": 0.85546875, + "learning_rate": 0.00014857318919470012, + "loss": 1.0845, + "step": 22463 + }, + { + "epoch": 0.5768125291877501, + "grad_norm": 0.84765625, + "learning_rate": 0.00014856928695257228, + "loss": 0.8749, + "step": 22464 + }, + { + "epoch": 0.5768382063836719, + "grad_norm": 0.82421875, + "learning_rate": 0.00014856538461365025, + "loss": 1.0501, + "step": 22465 + }, + { + "epoch": 0.5768638835795937, + "grad_norm": 0.73046875, + "learning_rate": 0.00014856148217794182, + "loss": 0.769, + "step": 22466 + }, + { + "epoch": 0.5768895607755156, + "grad_norm": 0.78125, + "learning_rate": 0.0001485575796454547, + "loss": 0.8434, + "step": 22467 + }, + { + "epoch": 0.5769152379714373, + "grad_norm": 0.8515625, + "learning_rate": 0.00014855367701619674, + "loss": 0.8036, + "step": 22468 + }, + { + "epoch": 0.5769409151673591, + "grad_norm": 0.828125, + "learning_rate": 0.00014854977429017567, + "loss": 0.9792, + "step": 22469 + }, + { + "epoch": 0.576966592363281, + "grad_norm": 0.77734375, + "learning_rate": 0.00014854587146739928, + "loss": 0.9008, + "step": 22470 + }, + { + "epoch": 0.5769922695592028, + "grad_norm": 0.80859375, + "learning_rate": 0.00014854196854787537, + "loss": 0.7692, + "step": 22471 + }, + { + "epoch": 0.5770179467551246, + "grad_norm": 0.74609375, + "learning_rate": 0.0001485380655316117, + "loss": 0.8605, + "step": 22472 + }, + { + "epoch": 0.5770436239510465, + "grad_norm": 0.74609375, + "learning_rate": 0.00014853416241861602, + "loss": 0.8362, + "step": 22473 + }, + { + "epoch": 0.5770693011469682, + "grad_norm": 0.72265625, + "learning_rate": 0.00014853025920889617, + "loss": 0.8016, + "step": 22474 + }, + { + "epoch": 0.57709497834289, + "grad_norm": 0.79296875, + "learning_rate": 0.00014852635590245988, + "loss": 0.9318, + "step": 22475 + }, + { + "epoch": 0.5771206555388119, + "grad_norm": 0.79296875, + "learning_rate": 0.00014852245249931495, + "loss": 0.8037, + "step": 22476 + }, + { + "epoch": 0.5771463327347337, + "grad_norm": 0.828125, + "learning_rate": 0.00014851854899946917, + "loss": 0.8317, + "step": 22477 + }, + { + "epoch": 0.5771720099306555, + "grad_norm": 0.73828125, + "learning_rate": 0.00014851464540293027, + "loss": 0.7967, + "step": 22478 + }, + { + "epoch": 0.5771976871265774, + "grad_norm": 0.80859375, + "learning_rate": 0.0001485107417097061, + "loss": 0.9542, + "step": 22479 + }, + { + "epoch": 0.5772233643224992, + "grad_norm": 0.73046875, + "learning_rate": 0.0001485068379198044, + "loss": 0.8535, + "step": 22480 + }, + { + "epoch": 0.5772490415184209, + "grad_norm": 0.80859375, + "learning_rate": 0.00014850293403323296, + "loss": 0.9882, + "step": 22481 + }, + { + "epoch": 0.5772747187143428, + "grad_norm": 0.76171875, + "learning_rate": 0.00014849903004999953, + "loss": 0.862, + "step": 22482 + }, + { + "epoch": 0.5773003959102646, + "grad_norm": 0.77734375, + "learning_rate": 0.0001484951259701119, + "loss": 0.7934, + "step": 22483 + }, + { + "epoch": 0.5773260731061864, + "grad_norm": 0.76953125, + "learning_rate": 0.00014849122179357791, + "loss": 0.7442, + "step": 22484 + }, + { + "epoch": 0.5773517503021083, + "grad_norm": 0.8125, + "learning_rate": 0.00014848731752040525, + "loss": 0.858, + "step": 22485 + }, + { + "epoch": 0.5773774274980301, + "grad_norm": 0.73046875, + "learning_rate": 0.00014848341315060176, + "loss": 0.9237, + "step": 22486 + }, + { + "epoch": 0.577403104693952, + "grad_norm": 0.7578125, + "learning_rate": 0.00014847950868417524, + "loss": 0.7946, + "step": 22487 + }, + { + "epoch": 0.5774287818898737, + "grad_norm": 0.7421875, + "learning_rate": 0.0001484756041211334, + "loss": 0.8499, + "step": 22488 + }, + { + "epoch": 0.5774544590857955, + "grad_norm": 0.77734375, + "learning_rate": 0.0001484716994614841, + "loss": 0.844, + "step": 22489 + }, + { + "epoch": 0.5774801362817173, + "grad_norm": 0.7890625, + "learning_rate": 0.00014846779470523505, + "loss": 0.8167, + "step": 22490 + }, + { + "epoch": 0.5775058134776392, + "grad_norm": 0.8125, + "learning_rate": 0.00014846388985239405, + "loss": 0.9537, + "step": 22491 + }, + { + "epoch": 0.577531490673561, + "grad_norm": 0.75390625, + "learning_rate": 0.0001484599849029689, + "loss": 0.8592, + "step": 22492 + }, + { + "epoch": 0.5775571678694829, + "grad_norm": 0.7109375, + "learning_rate": 0.0001484560798569674, + "loss": 1.0361, + "step": 22493 + }, + { + "epoch": 0.5775828450654046, + "grad_norm": 0.75390625, + "learning_rate": 0.0001484521747143973, + "loss": 0.9053, + "step": 22494 + }, + { + "epoch": 0.5776085222613264, + "grad_norm": 0.89453125, + "learning_rate": 0.0001484482694752664, + "loss": 0.8407, + "step": 22495 + }, + { + "epoch": 0.5776341994572483, + "grad_norm": 0.765625, + "learning_rate": 0.00014844436413958246, + "loss": 0.9822, + "step": 22496 + }, + { + "epoch": 0.5776598766531701, + "grad_norm": 0.8671875, + "learning_rate": 0.0001484404587073533, + "loss": 0.9622, + "step": 22497 + }, + { + "epoch": 0.5776855538490919, + "grad_norm": 0.7578125, + "learning_rate": 0.00014843655317858668, + "loss": 0.8924, + "step": 22498 + }, + { + "epoch": 0.5777112310450138, + "grad_norm": 0.921875, + "learning_rate": 0.00014843264755329033, + "loss": 0.9913, + "step": 22499 + }, + { + "epoch": 0.5777369082409356, + "grad_norm": 0.70703125, + "learning_rate": 0.00014842874183147216, + "loss": 0.8161, + "step": 22500 + }, + { + "epoch": 0.5777625854368573, + "grad_norm": 0.765625, + "learning_rate": 0.00014842483601313985, + "loss": 0.6835, + "step": 22501 + }, + { + "epoch": 0.5777882626327792, + "grad_norm": 0.77734375, + "learning_rate": 0.0001484209300983012, + "loss": 0.8564, + "step": 22502 + }, + { + "epoch": 0.577813939828701, + "grad_norm": 0.75, + "learning_rate": 0.00014841702408696406, + "loss": 0.8837, + "step": 22503 + }, + { + "epoch": 0.5778396170246228, + "grad_norm": 0.7734375, + "learning_rate": 0.0001484131179791361, + "loss": 0.8916, + "step": 22504 + }, + { + "epoch": 0.5778652942205447, + "grad_norm": 0.7734375, + "learning_rate": 0.0001484092117748252, + "loss": 0.938, + "step": 22505 + }, + { + "epoch": 0.5778909714164665, + "grad_norm": 0.84375, + "learning_rate": 0.00014840530547403913, + "loss": 1.011, + "step": 22506 + }, + { + "epoch": 0.5779166486123883, + "grad_norm": 0.80078125, + "learning_rate": 0.00014840139907678566, + "loss": 0.7641, + "step": 22507 + }, + { + "epoch": 0.5779423258083101, + "grad_norm": 0.75390625, + "learning_rate": 0.00014839749258307257, + "loss": 0.8679, + "step": 22508 + }, + { + "epoch": 0.5779680030042319, + "grad_norm": 0.765625, + "learning_rate": 0.00014839358599290763, + "loss": 0.6797, + "step": 22509 + }, + { + "epoch": 0.5779936802001537, + "grad_norm": 0.75, + "learning_rate": 0.00014838967930629862, + "loss": 0.8205, + "step": 22510 + }, + { + "epoch": 0.5780193573960756, + "grad_norm": 0.81640625, + "learning_rate": 0.0001483857725232534, + "loss": 0.8588, + "step": 22511 + }, + { + "epoch": 0.5780450345919974, + "grad_norm": 0.77734375, + "learning_rate": 0.0001483818656437797, + "loss": 0.8139, + "step": 22512 + }, + { + "epoch": 0.5780707117879192, + "grad_norm": 0.72265625, + "learning_rate": 0.00014837795866788528, + "loss": 0.7979, + "step": 22513 + }, + { + "epoch": 0.578096388983841, + "grad_norm": 0.7265625, + "learning_rate": 0.000148374051595578, + "loss": 0.7335, + "step": 22514 + }, + { + "epoch": 0.5781220661797628, + "grad_norm": 0.80859375, + "learning_rate": 0.00014837014442686556, + "loss": 0.8739, + "step": 22515 + }, + { + "epoch": 0.5781477433756846, + "grad_norm": 0.765625, + "learning_rate": 0.00014836623716175585, + "loss": 1.0068, + "step": 22516 + }, + { + "epoch": 0.5781734205716065, + "grad_norm": 0.7890625, + "learning_rate": 0.00014836232980025655, + "loss": 0.8082, + "step": 22517 + }, + { + "epoch": 0.5781990977675283, + "grad_norm": 0.8046875, + "learning_rate": 0.0001483584223423755, + "loss": 0.9442, + "step": 22518 + }, + { + "epoch": 0.5782247749634501, + "grad_norm": 0.75, + "learning_rate": 0.00014835451478812048, + "loss": 0.8191, + "step": 22519 + }, + { + "epoch": 0.578250452159372, + "grad_norm": 0.69140625, + "learning_rate": 0.0001483506071374993, + "loss": 0.8892, + "step": 22520 + }, + { + "epoch": 0.5782761293552937, + "grad_norm": 0.76171875, + "learning_rate": 0.00014834669939051972, + "loss": 0.8901, + "step": 22521 + }, + { + "epoch": 0.5783018065512155, + "grad_norm": 0.75, + "learning_rate": 0.00014834279154718954, + "loss": 0.9316, + "step": 22522 + }, + { + "epoch": 0.5783274837471374, + "grad_norm": 0.7578125, + "learning_rate": 0.00014833888360751653, + "loss": 0.7893, + "step": 22523 + }, + { + "epoch": 0.5783531609430592, + "grad_norm": 0.78515625, + "learning_rate": 0.00014833497557150847, + "loss": 0.8293, + "step": 22524 + }, + { + "epoch": 0.578378838138981, + "grad_norm": 0.79296875, + "learning_rate": 0.00014833106743917322, + "loss": 0.8704, + "step": 22525 + }, + { + "epoch": 0.5784045153349029, + "grad_norm": 0.7890625, + "learning_rate": 0.00014832715921051849, + "loss": 0.8835, + "step": 22526 + }, + { + "epoch": 0.5784301925308246, + "grad_norm": 0.79296875, + "learning_rate": 0.00014832325088555212, + "loss": 0.7468, + "step": 22527 + }, + { + "epoch": 0.5784558697267465, + "grad_norm": 0.78125, + "learning_rate": 0.00014831934246428184, + "loss": 0.8448, + "step": 22528 + }, + { + "epoch": 0.5784815469226683, + "grad_norm": 0.72265625, + "learning_rate": 0.0001483154339467155, + "loss": 0.8682, + "step": 22529 + }, + { + "epoch": 0.5785072241185901, + "grad_norm": 0.84375, + "learning_rate": 0.00014831152533286085, + "loss": 0.8071, + "step": 22530 + }, + { + "epoch": 0.578532901314512, + "grad_norm": 0.80859375, + "learning_rate": 0.0001483076166227257, + "loss": 0.849, + "step": 22531 + }, + { + "epoch": 0.5785585785104338, + "grad_norm": 0.7578125, + "learning_rate": 0.00014830370781631787, + "loss": 1.0144, + "step": 22532 + }, + { + "epoch": 0.5785842557063556, + "grad_norm": 0.7734375, + "learning_rate": 0.00014829979891364508, + "loss": 0.8437, + "step": 22533 + }, + { + "epoch": 0.5786099329022774, + "grad_norm": 0.7109375, + "learning_rate": 0.00014829588991471514, + "loss": 0.7966, + "step": 22534 + }, + { + "epoch": 0.5786356100981992, + "grad_norm": 0.8046875, + "learning_rate": 0.00014829198081953585, + "loss": 1.0093, + "step": 22535 + }, + { + "epoch": 0.578661287294121, + "grad_norm": 0.76171875, + "learning_rate": 0.00014828807162811503, + "loss": 0.7523, + "step": 22536 + }, + { + "epoch": 0.5786869644900429, + "grad_norm": 0.75, + "learning_rate": 0.00014828416234046046, + "loss": 0.8232, + "step": 22537 + }, + { + "epoch": 0.5787126416859647, + "grad_norm": 0.75390625, + "learning_rate": 0.00014828025295657987, + "loss": 0.8333, + "step": 22538 + }, + { + "epoch": 0.5787383188818865, + "grad_norm": 0.9296875, + "learning_rate": 0.00014827634347648112, + "loss": 0.8531, + "step": 22539 + }, + { + "epoch": 0.5787639960778084, + "grad_norm": 0.78125, + "learning_rate": 0.00014827243390017197, + "loss": 0.8192, + "step": 22540 + }, + { + "epoch": 0.5787896732737301, + "grad_norm": 0.77734375, + "learning_rate": 0.00014826852422766023, + "loss": 0.8912, + "step": 22541 + }, + { + "epoch": 0.5788153504696519, + "grad_norm": 0.73046875, + "learning_rate": 0.00014826461445895366, + "loss": 0.9015, + "step": 22542 + }, + { + "epoch": 0.5788410276655738, + "grad_norm": 0.7578125, + "learning_rate": 0.00014826070459406015, + "loss": 0.8799, + "step": 22543 + }, + { + "epoch": 0.5788667048614956, + "grad_norm": 0.75390625, + "learning_rate": 0.00014825679463298733, + "loss": 0.8319, + "step": 22544 + }, + { + "epoch": 0.5788923820574174, + "grad_norm": 0.89453125, + "learning_rate": 0.00014825288457574311, + "loss": 0.8673, + "step": 22545 + }, + { + "epoch": 0.5789180592533393, + "grad_norm": 0.7421875, + "learning_rate": 0.00014824897442233524, + "loss": 0.7855, + "step": 22546 + }, + { + "epoch": 0.578943736449261, + "grad_norm": 0.72265625, + "learning_rate": 0.00014824506417277154, + "loss": 0.8927, + "step": 22547 + }, + { + "epoch": 0.5789694136451828, + "grad_norm": 0.84765625, + "learning_rate": 0.00014824115382705975, + "loss": 0.8189, + "step": 22548 + }, + { + "epoch": 0.5789950908411047, + "grad_norm": 0.82421875, + "learning_rate": 0.00014823724338520775, + "loss": 0.7358, + "step": 22549 + }, + { + "epoch": 0.5790207680370265, + "grad_norm": 0.78125, + "learning_rate": 0.00014823333284722323, + "loss": 1.0435, + "step": 22550 + }, + { + "epoch": 0.5790464452329483, + "grad_norm": 0.796875, + "learning_rate": 0.00014822942221311407, + "loss": 0.8646, + "step": 22551 + }, + { + "epoch": 0.5790721224288702, + "grad_norm": 0.86328125, + "learning_rate": 0.00014822551148288798, + "loss": 0.855, + "step": 22552 + }, + { + "epoch": 0.579097799624792, + "grad_norm": 0.78125, + "learning_rate": 0.00014822160065655287, + "loss": 0.9471, + "step": 22553 + }, + { + "epoch": 0.5791234768207137, + "grad_norm": 0.7578125, + "learning_rate": 0.00014821768973411644, + "loss": 0.8609, + "step": 22554 + }, + { + "epoch": 0.5791491540166356, + "grad_norm": 0.79296875, + "learning_rate": 0.0001482137787155865, + "loss": 0.7335, + "step": 22555 + }, + { + "epoch": 0.5791748312125574, + "grad_norm": 0.80859375, + "learning_rate": 0.00014820986760097083, + "loss": 0.8624, + "step": 22556 + }, + { + "epoch": 0.5792005084084793, + "grad_norm": 0.859375, + "learning_rate": 0.00014820595639027727, + "loss": 1.0604, + "step": 22557 + }, + { + "epoch": 0.5792261856044011, + "grad_norm": 0.80859375, + "learning_rate": 0.00014820204508351358, + "loss": 0.8475, + "step": 22558 + }, + { + "epoch": 0.5792518628003229, + "grad_norm": 0.74609375, + "learning_rate": 0.0001481981336806876, + "loss": 0.7878, + "step": 22559 + }, + { + "epoch": 0.5792775399962448, + "grad_norm": 0.77734375, + "learning_rate": 0.00014819422218180706, + "loss": 0.9117, + "step": 22560 + }, + { + "epoch": 0.5793032171921665, + "grad_norm": 0.78515625, + "learning_rate": 0.00014819031058687984, + "loss": 0.8572, + "step": 22561 + }, + { + "epoch": 0.5793288943880883, + "grad_norm": 0.76953125, + "learning_rate": 0.00014818639889591366, + "loss": 0.7904, + "step": 22562 + }, + { + "epoch": 0.5793545715840102, + "grad_norm": 0.78125, + "learning_rate": 0.0001481824871089163, + "loss": 0.8631, + "step": 22563 + }, + { + "epoch": 0.579380248779932, + "grad_norm": 0.8515625, + "learning_rate": 0.00014817857522589565, + "loss": 0.8639, + "step": 22564 + }, + { + "epoch": 0.5794059259758538, + "grad_norm": 0.7109375, + "learning_rate": 0.00014817466324685942, + "loss": 0.848, + "step": 22565 + }, + { + "epoch": 0.5794316031717757, + "grad_norm": 0.77734375, + "learning_rate": 0.00014817075117181544, + "loss": 0.7825, + "step": 22566 + }, + { + "epoch": 0.5794572803676974, + "grad_norm": 0.796875, + "learning_rate": 0.00014816683900077153, + "loss": 0.9755, + "step": 22567 + }, + { + "epoch": 0.5794829575636192, + "grad_norm": 0.74609375, + "learning_rate": 0.00014816292673373543, + "loss": 1.0038, + "step": 22568 + }, + { + "epoch": 0.5795086347595411, + "grad_norm": 0.80078125, + "learning_rate": 0.000148159014370715, + "loss": 0.7639, + "step": 22569 + }, + { + "epoch": 0.5795343119554629, + "grad_norm": 0.7421875, + "learning_rate": 0.00014815510191171796, + "loss": 0.7618, + "step": 22570 + }, + { + "epoch": 0.5795599891513847, + "grad_norm": 0.73046875, + "learning_rate": 0.00014815118935675216, + "loss": 0.7793, + "step": 22571 + }, + { + "epoch": 0.5795856663473066, + "grad_norm": 0.76171875, + "learning_rate": 0.0001481472767058254, + "loss": 0.9014, + "step": 22572 + }, + { + "epoch": 0.5796113435432284, + "grad_norm": 0.78125, + "learning_rate": 0.00014814336395894547, + "loss": 0.7844, + "step": 22573 + }, + { + "epoch": 0.5796370207391501, + "grad_norm": 0.81640625, + "learning_rate": 0.00014813945111612017, + "loss": 0.8406, + "step": 22574 + }, + { + "epoch": 0.579662697935072, + "grad_norm": 0.75390625, + "learning_rate": 0.0001481355381773573, + "loss": 0.8002, + "step": 22575 + }, + { + "epoch": 0.5796883751309938, + "grad_norm": 0.8046875, + "learning_rate": 0.00014813162514266463, + "loss": 0.9431, + "step": 22576 + }, + { + "epoch": 0.5797140523269156, + "grad_norm": 0.76171875, + "learning_rate": 0.00014812771201204997, + "loss": 0.9836, + "step": 22577 + }, + { + "epoch": 0.5797397295228375, + "grad_norm": 0.8125, + "learning_rate": 0.00014812379878552116, + "loss": 0.9139, + "step": 22578 + }, + { + "epoch": 0.5797654067187593, + "grad_norm": 0.7890625, + "learning_rate": 0.00014811988546308596, + "loss": 0.8713, + "step": 22579 + }, + { + "epoch": 0.5797910839146811, + "grad_norm": 0.80078125, + "learning_rate": 0.00014811597204475216, + "loss": 0.922, + "step": 22580 + }, + { + "epoch": 0.5798167611106029, + "grad_norm": 0.78125, + "learning_rate": 0.00014811205853052756, + "loss": 0.9547, + "step": 22581 + }, + { + "epoch": 0.5798424383065247, + "grad_norm": 0.81640625, + "learning_rate": 0.00014810814492042, + "loss": 0.972, + "step": 22582 + }, + { + "epoch": 0.5798681155024465, + "grad_norm": 0.73046875, + "learning_rate": 0.00014810423121443725, + "loss": 0.6983, + "step": 22583 + }, + { + "epoch": 0.5798937926983684, + "grad_norm": 0.7578125, + "learning_rate": 0.0001481003174125871, + "loss": 0.9075, + "step": 22584 + }, + { + "epoch": 0.5799194698942902, + "grad_norm": 0.796875, + "learning_rate": 0.00014809640351487737, + "loss": 0.9286, + "step": 22585 + }, + { + "epoch": 0.579945147090212, + "grad_norm": 0.75390625, + "learning_rate": 0.00014809248952131588, + "loss": 0.823, + "step": 22586 + }, + { + "epoch": 0.5799708242861338, + "grad_norm": 0.7578125, + "learning_rate": 0.00014808857543191034, + "loss": 0.8477, + "step": 22587 + }, + { + "epoch": 0.5799965014820556, + "grad_norm": 0.6875, + "learning_rate": 0.00014808466124666865, + "loss": 0.9121, + "step": 22588 + }, + { + "epoch": 0.5800221786779775, + "grad_norm": 0.80078125, + "learning_rate": 0.00014808074696559857, + "loss": 1.056, + "step": 22589 + }, + { + "epoch": 0.5800478558738993, + "grad_norm": 0.7890625, + "learning_rate": 0.0001480768325887079, + "loss": 0.8882, + "step": 22590 + }, + { + "epoch": 0.5800735330698211, + "grad_norm": 0.7890625, + "learning_rate": 0.00014807291811600446, + "loss": 0.8583, + "step": 22591 + }, + { + "epoch": 0.580099210265743, + "grad_norm": 0.73828125, + "learning_rate": 0.00014806900354749603, + "loss": 1.0149, + "step": 22592 + }, + { + "epoch": 0.5801248874616648, + "grad_norm": 0.76171875, + "learning_rate": 0.00014806508888319043, + "loss": 0.8168, + "step": 22593 + }, + { + "epoch": 0.5801505646575865, + "grad_norm": 0.80859375, + "learning_rate": 0.00014806117412309545, + "loss": 0.8504, + "step": 22594 + }, + { + "epoch": 0.5801762418535084, + "grad_norm": 0.7578125, + "learning_rate": 0.00014805725926721885, + "loss": 0.9285, + "step": 22595 + }, + { + "epoch": 0.5802019190494302, + "grad_norm": 0.76953125, + "learning_rate": 0.0001480533443155685, + "loss": 0.925, + "step": 22596 + }, + { + "epoch": 0.580227596245352, + "grad_norm": 0.78515625, + "learning_rate": 0.00014804942926815217, + "loss": 0.9185, + "step": 22597 + }, + { + "epoch": 0.5802532734412739, + "grad_norm": 0.71875, + "learning_rate": 0.00014804551412497767, + "loss": 0.7491, + "step": 22598 + }, + { + "epoch": 0.5802789506371957, + "grad_norm": 0.76953125, + "learning_rate": 0.00014804159888605282, + "loss": 0.7686, + "step": 22599 + }, + { + "epoch": 0.5803046278331175, + "grad_norm": 0.83203125, + "learning_rate": 0.0001480376835513854, + "loss": 0.9457, + "step": 22600 + }, + { + "epoch": 0.5803303050290393, + "grad_norm": 0.81640625, + "learning_rate": 0.00014803376812098318, + "loss": 0.892, + "step": 22601 + }, + { + "epoch": 0.5803559822249611, + "grad_norm": 0.765625, + "learning_rate": 0.00014802985259485403, + "loss": 0.9505, + "step": 22602 + }, + { + "epoch": 0.5803816594208829, + "grad_norm": 0.92578125, + "learning_rate": 0.00014802593697300567, + "loss": 0.8569, + "step": 22603 + }, + { + "epoch": 0.5804073366168048, + "grad_norm": 0.78125, + "learning_rate": 0.000148022021255446, + "loss": 0.727, + "step": 22604 + }, + { + "epoch": 0.5804330138127266, + "grad_norm": 0.77734375, + "learning_rate": 0.0001480181054421828, + "loss": 0.8234, + "step": 22605 + }, + { + "epoch": 0.5804586910086484, + "grad_norm": 0.83203125, + "learning_rate": 0.00014801418953322383, + "loss": 0.8918, + "step": 22606 + }, + { + "epoch": 0.5804843682045702, + "grad_norm": 0.7890625, + "learning_rate": 0.00014801027352857692, + "loss": 0.8074, + "step": 22607 + }, + { + "epoch": 0.580510045400492, + "grad_norm": 0.8125, + "learning_rate": 0.00014800635742824987, + "loss": 0.7839, + "step": 22608 + }, + { + "epoch": 0.5805357225964138, + "grad_norm": 0.765625, + "learning_rate": 0.0001480024412322505, + "loss": 0.7783, + "step": 22609 + }, + { + "epoch": 0.5805613997923357, + "grad_norm": 0.93359375, + "learning_rate": 0.0001479985249405866, + "loss": 0.7638, + "step": 22610 + }, + { + "epoch": 0.5805870769882575, + "grad_norm": 0.65234375, + "learning_rate": 0.00014799460855326592, + "loss": 0.9242, + "step": 22611 + }, + { + "epoch": 0.5806127541841793, + "grad_norm": 0.75390625, + "learning_rate": 0.00014799069207029639, + "loss": 0.8344, + "step": 22612 + }, + { + "epoch": 0.5806384313801012, + "grad_norm": 0.78125, + "learning_rate": 0.00014798677549168572, + "loss": 0.9113, + "step": 22613 + }, + { + "epoch": 0.5806641085760229, + "grad_norm": 0.734375, + "learning_rate": 0.00014798285881744173, + "loss": 1.0157, + "step": 22614 + }, + { + "epoch": 0.5806897857719447, + "grad_norm": 0.7890625, + "learning_rate": 0.00014797894204757225, + "loss": 0.8295, + "step": 22615 + }, + { + "epoch": 0.5807154629678666, + "grad_norm": 0.80859375, + "learning_rate": 0.00014797502518208505, + "loss": 0.8412, + "step": 22616 + }, + { + "epoch": 0.5807411401637884, + "grad_norm": 0.73828125, + "learning_rate": 0.000147971108220988, + "loss": 0.8369, + "step": 22617 + }, + { + "epoch": 0.5807668173597103, + "grad_norm": 0.72265625, + "learning_rate": 0.00014796719116428884, + "loss": 0.8747, + "step": 22618 + }, + { + "epoch": 0.5807924945556321, + "grad_norm": 0.74609375, + "learning_rate": 0.00014796327401199542, + "loss": 0.899, + "step": 22619 + }, + { + "epoch": 0.5808181717515539, + "grad_norm": 0.76953125, + "learning_rate": 0.0001479593567641155, + "loss": 0.9014, + "step": 22620 + }, + { + "epoch": 0.5808438489474756, + "grad_norm": 0.8671875, + "learning_rate": 0.00014795543942065691, + "loss": 0.9897, + "step": 22621 + }, + { + "epoch": 0.5808695261433975, + "grad_norm": 0.83203125, + "learning_rate": 0.00014795152198162747, + "loss": 0.9147, + "step": 22622 + }, + { + "epoch": 0.5808952033393193, + "grad_norm": 0.72265625, + "learning_rate": 0.00014794760444703498, + "loss": 0.7267, + "step": 22623 + }, + { + "epoch": 0.5809208805352412, + "grad_norm": 0.84375, + "learning_rate": 0.00014794368681688726, + "loss": 0.9704, + "step": 22624 + }, + { + "epoch": 0.580946557731163, + "grad_norm": 0.76171875, + "learning_rate": 0.00014793976909119208, + "loss": 0.8635, + "step": 22625 + }, + { + "epoch": 0.5809722349270848, + "grad_norm": 0.78515625, + "learning_rate": 0.0001479358512699573, + "loss": 0.7235, + "step": 22626 + }, + { + "epoch": 0.5809979121230066, + "grad_norm": 0.79296875, + "learning_rate": 0.00014793193335319068, + "loss": 0.8851, + "step": 22627 + }, + { + "epoch": 0.5810235893189284, + "grad_norm": 0.8984375, + "learning_rate": 0.00014792801534090004, + "loss": 0.8371, + "step": 22628 + }, + { + "epoch": 0.5810492665148502, + "grad_norm": 0.7578125, + "learning_rate": 0.0001479240972330932, + "loss": 0.8657, + "step": 22629 + }, + { + "epoch": 0.5810749437107721, + "grad_norm": 0.9375, + "learning_rate": 0.00014792017902977796, + "loss": 0.8446, + "step": 22630 + }, + { + "epoch": 0.5811006209066939, + "grad_norm": 0.7734375, + "learning_rate": 0.00014791626073096215, + "loss": 0.9261, + "step": 22631 + }, + { + "epoch": 0.5811262981026157, + "grad_norm": 0.828125, + "learning_rate": 0.00014791234233665353, + "loss": 0.8585, + "step": 22632 + }, + { + "epoch": 0.5811519752985376, + "grad_norm": 0.79296875, + "learning_rate": 0.00014790842384686, + "loss": 0.9749, + "step": 22633 + }, + { + "epoch": 0.5811776524944593, + "grad_norm": 0.859375, + "learning_rate": 0.00014790450526158927, + "loss": 1.0179, + "step": 22634 + }, + { + "epoch": 0.5812033296903811, + "grad_norm": 0.75, + "learning_rate": 0.00014790058658084915, + "loss": 0.7862, + "step": 22635 + }, + { + "epoch": 0.581229006886303, + "grad_norm": 0.796875, + "learning_rate": 0.00014789666780464753, + "loss": 0.8838, + "step": 22636 + }, + { + "epoch": 0.5812546840822248, + "grad_norm": 0.77734375, + "learning_rate": 0.00014789274893299217, + "loss": 0.8476, + "step": 22637 + }, + { + "epoch": 0.5812803612781466, + "grad_norm": 0.74609375, + "learning_rate": 0.00014788882996589087, + "loss": 0.8502, + "step": 22638 + }, + { + "epoch": 0.5813060384740685, + "grad_norm": 0.78125, + "learning_rate": 0.0001478849109033515, + "loss": 0.8991, + "step": 22639 + }, + { + "epoch": 0.5813317156699903, + "grad_norm": 0.7890625, + "learning_rate": 0.00014788099174538178, + "loss": 0.9465, + "step": 22640 + }, + { + "epoch": 0.581357392865912, + "grad_norm": 0.7265625, + "learning_rate": 0.0001478770724919896, + "loss": 0.7154, + "step": 22641 + }, + { + "epoch": 0.5813830700618339, + "grad_norm": 0.83203125, + "learning_rate": 0.00014787315314318273, + "loss": 0.9172, + "step": 22642 + }, + { + "epoch": 0.5814087472577557, + "grad_norm": 0.84375, + "learning_rate": 0.000147869233698969, + "loss": 0.8189, + "step": 22643 + }, + { + "epoch": 0.5814344244536775, + "grad_norm": 0.7734375, + "learning_rate": 0.00014786531415935617, + "loss": 0.8377, + "step": 22644 + }, + { + "epoch": 0.5814601016495994, + "grad_norm": 0.80859375, + "learning_rate": 0.00014786139452435215, + "loss": 0.9237, + "step": 22645 + }, + { + "epoch": 0.5814857788455212, + "grad_norm": 0.82421875, + "learning_rate": 0.0001478574747939647, + "loss": 0.9684, + "step": 22646 + }, + { + "epoch": 0.5815114560414429, + "grad_norm": 0.74609375, + "learning_rate": 0.00014785355496820156, + "loss": 0.7881, + "step": 22647 + }, + { + "epoch": 0.5815371332373648, + "grad_norm": 0.73046875, + "learning_rate": 0.00014784963504707062, + "loss": 0.7999, + "step": 22648 + }, + { + "epoch": 0.5815628104332866, + "grad_norm": 0.8125, + "learning_rate": 0.00014784571503057974, + "loss": 0.9233, + "step": 22649 + }, + { + "epoch": 0.5815884876292085, + "grad_norm": 0.81640625, + "learning_rate": 0.0001478417949187366, + "loss": 0.912, + "step": 22650 + }, + { + "epoch": 0.5816141648251303, + "grad_norm": 0.83984375, + "learning_rate": 0.00014783787471154912, + "loss": 1.0036, + "step": 22651 + }, + { + "epoch": 0.5816398420210521, + "grad_norm": 0.83984375, + "learning_rate": 0.0001478339544090251, + "loss": 0.8682, + "step": 22652 + }, + { + "epoch": 0.581665519216974, + "grad_norm": 0.83203125, + "learning_rate": 0.00014783003401117227, + "loss": 0.9629, + "step": 22653 + }, + { + "epoch": 0.5816911964128957, + "grad_norm": 0.78515625, + "learning_rate": 0.00014782611351799857, + "loss": 0.7797, + "step": 22654 + }, + { + "epoch": 0.5817168736088175, + "grad_norm": 0.77734375, + "learning_rate": 0.0001478221929295117, + "loss": 0.8653, + "step": 22655 + }, + { + "epoch": 0.5817425508047394, + "grad_norm": 0.81640625, + "learning_rate": 0.00014781827224571954, + "loss": 0.7566, + "step": 22656 + }, + { + "epoch": 0.5817682280006612, + "grad_norm": 0.80859375, + "learning_rate": 0.00014781435146662988, + "loss": 0.9018, + "step": 22657 + }, + { + "epoch": 0.581793905196583, + "grad_norm": 0.82421875, + "learning_rate": 0.00014781043059225053, + "loss": 0.8035, + "step": 22658 + }, + { + "epoch": 0.5818195823925049, + "grad_norm": 0.8046875, + "learning_rate": 0.0001478065096225893, + "loss": 0.8626, + "step": 22659 + }, + { + "epoch": 0.5818452595884267, + "grad_norm": 0.734375, + "learning_rate": 0.00014780258855765402, + "loss": 0.889, + "step": 22660 + }, + { + "epoch": 0.5818709367843484, + "grad_norm": 0.7890625, + "learning_rate": 0.00014779866739745253, + "loss": 0.8682, + "step": 22661 + }, + { + "epoch": 0.5818966139802703, + "grad_norm": 0.82421875, + "learning_rate": 0.00014779474614199257, + "loss": 0.8253, + "step": 22662 + }, + { + "epoch": 0.5819222911761921, + "grad_norm": 0.85546875, + "learning_rate": 0.000147790824791282, + "loss": 0.9299, + "step": 22663 + }, + { + "epoch": 0.5819479683721139, + "grad_norm": 0.79296875, + "learning_rate": 0.00014778690334532865, + "loss": 0.8551, + "step": 22664 + }, + { + "epoch": 0.5819736455680358, + "grad_norm": 0.7109375, + "learning_rate": 0.00014778298180414035, + "loss": 0.8531, + "step": 22665 + }, + { + "epoch": 0.5819993227639576, + "grad_norm": 0.734375, + "learning_rate": 0.00014777906016772482, + "loss": 0.8253, + "step": 22666 + }, + { + "epoch": 0.5820249999598793, + "grad_norm": 0.83203125, + "learning_rate": 0.00014777513843608998, + "loss": 0.8516, + "step": 22667 + }, + { + "epoch": 0.5820506771558012, + "grad_norm": 0.78515625, + "learning_rate": 0.0001477712166092436, + "loss": 0.8541, + "step": 22668 + }, + { + "epoch": 0.582076354351723, + "grad_norm": 0.77734375, + "learning_rate": 0.00014776729468719347, + "loss": 0.8646, + "step": 22669 + }, + { + "epoch": 0.5821020315476448, + "grad_norm": 0.73828125, + "learning_rate": 0.00014776337266994747, + "loss": 0.7053, + "step": 22670 + }, + { + "epoch": 0.5821277087435667, + "grad_norm": 0.82421875, + "learning_rate": 0.00014775945055751338, + "loss": 0.8331, + "step": 22671 + }, + { + "epoch": 0.5821533859394885, + "grad_norm": 0.74609375, + "learning_rate": 0.00014775552834989903, + "loss": 0.9914, + "step": 22672 + }, + { + "epoch": 0.5821790631354103, + "grad_norm": 0.796875, + "learning_rate": 0.00014775160604711224, + "loss": 0.935, + "step": 22673 + }, + { + "epoch": 0.5822047403313321, + "grad_norm": 0.765625, + "learning_rate": 0.00014774768364916077, + "loss": 0.8769, + "step": 22674 + }, + { + "epoch": 0.5822304175272539, + "grad_norm": 0.85546875, + "learning_rate": 0.00014774376115605245, + "loss": 0.9922, + "step": 22675 + }, + { + "epoch": 0.5822560947231757, + "grad_norm": 0.70703125, + "learning_rate": 0.00014773983856779518, + "loss": 0.7864, + "step": 22676 + }, + { + "epoch": 0.5822817719190976, + "grad_norm": 1.25, + "learning_rate": 0.00014773591588439672, + "loss": 0.89, + "step": 22677 + }, + { + "epoch": 0.5823074491150194, + "grad_norm": 0.7734375, + "learning_rate": 0.0001477319931058649, + "loss": 0.9164, + "step": 22678 + }, + { + "epoch": 0.5823331263109413, + "grad_norm": 0.79296875, + "learning_rate": 0.00014772807023220752, + "loss": 0.8291, + "step": 22679 + }, + { + "epoch": 0.5823588035068631, + "grad_norm": 0.71484375, + "learning_rate": 0.0001477241472634324, + "loss": 0.8047, + "step": 22680 + }, + { + "epoch": 0.5823844807027848, + "grad_norm": 0.796875, + "learning_rate": 0.00014772022419954736, + "loss": 0.9616, + "step": 22681 + }, + { + "epoch": 0.5824101578987066, + "grad_norm": 0.8515625, + "learning_rate": 0.00014771630104056023, + "loss": 0.8793, + "step": 22682 + }, + { + "epoch": 0.5824358350946285, + "grad_norm": 0.7421875, + "learning_rate": 0.0001477123777864788, + "loss": 0.7344, + "step": 22683 + }, + { + "epoch": 0.5824615122905503, + "grad_norm": 0.77734375, + "learning_rate": 0.00014770845443731095, + "loss": 0.7891, + "step": 22684 + }, + { + "epoch": 0.5824871894864722, + "grad_norm": 0.75390625, + "learning_rate": 0.00014770453099306445, + "loss": 0.7162, + "step": 22685 + }, + { + "epoch": 0.582512866682394, + "grad_norm": 0.7734375, + "learning_rate": 0.00014770060745374712, + "loss": 0.8706, + "step": 22686 + }, + { + "epoch": 0.5825385438783157, + "grad_norm": 0.76171875, + "learning_rate": 0.00014769668381936678, + "loss": 0.8515, + "step": 22687 + }, + { + "epoch": 0.5825642210742376, + "grad_norm": 0.69921875, + "learning_rate": 0.00014769276008993126, + "loss": 0.8791, + "step": 22688 + }, + { + "epoch": 0.5825898982701594, + "grad_norm": 0.75390625, + "learning_rate": 0.0001476888362654484, + "loss": 0.8377, + "step": 22689 + }, + { + "epoch": 0.5826155754660812, + "grad_norm": 0.76171875, + "learning_rate": 0.000147684912345926, + "loss": 0.8401, + "step": 22690 + }, + { + "epoch": 0.5826412526620031, + "grad_norm": 0.68359375, + "learning_rate": 0.00014768098833137184, + "loss": 0.868, + "step": 22691 + }, + { + "epoch": 0.5826669298579249, + "grad_norm": 1.203125, + "learning_rate": 0.00014767706422179382, + "loss": 0.7672, + "step": 22692 + }, + { + "epoch": 0.5826926070538467, + "grad_norm": 0.77734375, + "learning_rate": 0.00014767314001719965, + "loss": 0.9723, + "step": 22693 + }, + { + "epoch": 0.5827182842497685, + "grad_norm": 0.72265625, + "learning_rate": 0.00014766921571759725, + "loss": 0.9201, + "step": 22694 + }, + { + "epoch": 0.5827439614456903, + "grad_norm": 0.76171875, + "learning_rate": 0.00014766529132299446, + "loss": 0.736, + "step": 22695 + }, + { + "epoch": 0.5827696386416121, + "grad_norm": 0.8046875, + "learning_rate": 0.000147661366833399, + "loss": 0.8256, + "step": 22696 + }, + { + "epoch": 0.582795315837534, + "grad_norm": 0.765625, + "learning_rate": 0.00014765744224881874, + "loss": 0.7343, + "step": 22697 + }, + { + "epoch": 0.5828209930334558, + "grad_norm": 0.75390625, + "learning_rate": 0.00014765351756926154, + "loss": 0.852, + "step": 22698 + }, + { + "epoch": 0.5828466702293776, + "grad_norm": 0.7578125, + "learning_rate": 0.00014764959279473514, + "loss": 0.9284, + "step": 22699 + }, + { + "epoch": 0.5828723474252995, + "grad_norm": 0.703125, + "learning_rate": 0.00014764566792524743, + "loss": 0.7738, + "step": 22700 + }, + { + "epoch": 0.5828980246212212, + "grad_norm": 0.78515625, + "learning_rate": 0.0001476417429608062, + "loss": 0.973, + "step": 22701 + }, + { + "epoch": 0.582923701817143, + "grad_norm": 0.84765625, + "learning_rate": 0.00014763781790141928, + "loss": 0.7469, + "step": 22702 + }, + { + "epoch": 0.5829493790130649, + "grad_norm": 0.7265625, + "learning_rate": 0.0001476338927470945, + "loss": 0.7573, + "step": 22703 + }, + { + "epoch": 0.5829750562089867, + "grad_norm": 0.80859375, + "learning_rate": 0.0001476299674978397, + "loss": 0.8856, + "step": 22704 + }, + { + "epoch": 0.5830007334049085, + "grad_norm": 0.8046875, + "learning_rate": 0.00014762604215366263, + "loss": 0.9121, + "step": 22705 + }, + { + "epoch": 0.5830264106008304, + "grad_norm": 0.8125, + "learning_rate": 0.00014762211671457114, + "loss": 0.8856, + "step": 22706 + }, + { + "epoch": 0.5830520877967521, + "grad_norm": 1.109375, + "learning_rate": 0.00014761819118057313, + "loss": 0.9111, + "step": 22707 + }, + { + "epoch": 0.5830777649926739, + "grad_norm": 0.84375, + "learning_rate": 0.00014761426555167634, + "loss": 0.9532, + "step": 22708 + }, + { + "epoch": 0.5831034421885958, + "grad_norm": 0.765625, + "learning_rate": 0.0001476103398278886, + "loss": 0.847, + "step": 22709 + }, + { + "epoch": 0.5831291193845176, + "grad_norm": 0.76953125, + "learning_rate": 0.0001476064140092178, + "loss": 0.9309, + "step": 22710 + }, + { + "epoch": 0.5831547965804394, + "grad_norm": 0.765625, + "learning_rate": 0.00014760248809567172, + "loss": 0.8802, + "step": 22711 + }, + { + "epoch": 0.5831804737763613, + "grad_norm": 0.74609375, + "learning_rate": 0.00014759856208725812, + "loss": 0.9573, + "step": 22712 + }, + { + "epoch": 0.5832061509722831, + "grad_norm": 0.75, + "learning_rate": 0.00014759463598398495, + "loss": 0.9337, + "step": 22713 + }, + { + "epoch": 0.5832318281682048, + "grad_norm": 0.78515625, + "learning_rate": 0.0001475907097858599, + "loss": 0.9, + "step": 22714 + }, + { + "epoch": 0.5832575053641267, + "grad_norm": 0.73828125, + "learning_rate": 0.00014758678349289093, + "loss": 0.9487, + "step": 22715 + }, + { + "epoch": 0.5832831825600485, + "grad_norm": 0.74609375, + "learning_rate": 0.00014758285710508577, + "loss": 0.8117, + "step": 22716 + }, + { + "epoch": 0.5833088597559704, + "grad_norm": 0.73828125, + "learning_rate": 0.00014757893062245227, + "loss": 0.6994, + "step": 22717 + }, + { + "epoch": 0.5833345369518922, + "grad_norm": 0.6875, + "learning_rate": 0.0001475750040449983, + "loss": 0.8298, + "step": 22718 + }, + { + "epoch": 0.583360214147814, + "grad_norm": 0.82421875, + "learning_rate": 0.00014757107737273158, + "loss": 0.8861, + "step": 22719 + }, + { + "epoch": 0.5833858913437358, + "grad_norm": 0.69921875, + "learning_rate": 0.00014756715060566006, + "loss": 0.8398, + "step": 22720 + }, + { + "epoch": 0.5834115685396576, + "grad_norm": 0.68359375, + "learning_rate": 0.00014756322374379145, + "loss": 0.7395, + "step": 22721 + }, + { + "epoch": 0.5834372457355794, + "grad_norm": 0.82421875, + "learning_rate": 0.00014755929678713368, + "loss": 0.8511, + "step": 22722 + }, + { + "epoch": 0.5834629229315013, + "grad_norm": 0.85546875, + "learning_rate": 0.0001475553697356945, + "loss": 0.7319, + "step": 22723 + }, + { + "epoch": 0.5834886001274231, + "grad_norm": 0.73828125, + "learning_rate": 0.00014755144258948177, + "loss": 0.8116, + "step": 22724 + }, + { + "epoch": 0.5835142773233449, + "grad_norm": 0.76953125, + "learning_rate": 0.0001475475153485033, + "loss": 0.8964, + "step": 22725 + }, + { + "epoch": 0.5835399545192668, + "grad_norm": 0.7890625, + "learning_rate": 0.00014754358801276696, + "loss": 0.9996, + "step": 22726 + }, + { + "epoch": 0.5835656317151885, + "grad_norm": 0.78515625, + "learning_rate": 0.00014753966058228052, + "loss": 0.893, + "step": 22727 + }, + { + "epoch": 0.5835913089111103, + "grad_norm": 0.80859375, + "learning_rate": 0.00014753573305705182, + "loss": 0.9353, + "step": 22728 + }, + { + "epoch": 0.5836169861070322, + "grad_norm": 0.78515625, + "learning_rate": 0.00014753180543708872, + "loss": 0.945, + "step": 22729 + }, + { + "epoch": 0.583642663302954, + "grad_norm": 0.76953125, + "learning_rate": 0.000147527877722399, + "loss": 0.7606, + "step": 22730 + }, + { + "epoch": 0.5836683404988758, + "grad_norm": 0.8203125, + "learning_rate": 0.00014752394991299051, + "loss": 0.9568, + "step": 22731 + }, + { + "epoch": 0.5836940176947977, + "grad_norm": 0.84765625, + "learning_rate": 0.00014752002200887115, + "loss": 0.9015, + "step": 22732 + }, + { + "epoch": 0.5837196948907195, + "grad_norm": 0.8125, + "learning_rate": 0.00014751609401004857, + "loss": 0.832, + "step": 22733 + }, + { + "epoch": 0.5837453720866412, + "grad_norm": 0.69140625, + "learning_rate": 0.0001475121659165308, + "loss": 0.8994, + "step": 22734 + }, + { + "epoch": 0.5837710492825631, + "grad_norm": 0.7734375, + "learning_rate": 0.00014750823772832555, + "loss": 0.867, + "step": 22735 + }, + { + "epoch": 0.5837967264784849, + "grad_norm": 0.8203125, + "learning_rate": 0.00014750430944544061, + "loss": 0.8526, + "step": 22736 + }, + { + "epoch": 0.5838224036744067, + "grad_norm": 0.74609375, + "learning_rate": 0.000147500381067884, + "loss": 0.8389, + "step": 22737 + }, + { + "epoch": 0.5838480808703286, + "grad_norm": 0.6953125, + "learning_rate": 0.00014749645259566328, + "loss": 0.7503, + "step": 22738 + }, + { + "epoch": 0.5838737580662504, + "grad_norm": 0.82421875, + "learning_rate": 0.0001474925240287865, + "loss": 0.8875, + "step": 22739 + }, + { + "epoch": 0.5838994352621721, + "grad_norm": 0.7421875, + "learning_rate": 0.0001474885953672614, + "loss": 0.8811, + "step": 22740 + }, + { + "epoch": 0.583925112458094, + "grad_norm": 0.76953125, + "learning_rate": 0.0001474846666110958, + "loss": 0.9631, + "step": 22741 + }, + { + "epoch": 0.5839507896540158, + "grad_norm": 0.9140625, + "learning_rate": 0.0001474807377602976, + "loss": 0.9754, + "step": 22742 + }, + { + "epoch": 0.5839764668499376, + "grad_norm": 0.828125, + "learning_rate": 0.00014747680881487451, + "loss": 0.8752, + "step": 22743 + }, + { + "epoch": 0.5840021440458595, + "grad_norm": 0.8203125, + "learning_rate": 0.00014747287977483447, + "loss": 0.8868, + "step": 22744 + }, + { + "epoch": 0.5840278212417813, + "grad_norm": 0.703125, + "learning_rate": 0.00014746895064018527, + "loss": 0.8357, + "step": 22745 + }, + { + "epoch": 0.5840534984377032, + "grad_norm": 0.80078125, + "learning_rate": 0.0001474650214109347, + "loss": 0.7249, + "step": 22746 + }, + { + "epoch": 0.5840791756336249, + "grad_norm": 0.75390625, + "learning_rate": 0.00014746109208709066, + "loss": 0.8556, + "step": 22747 + }, + { + "epoch": 0.5841048528295467, + "grad_norm": 0.6953125, + "learning_rate": 0.00014745716266866097, + "loss": 0.8466, + "step": 22748 + }, + { + "epoch": 0.5841305300254686, + "grad_norm": 0.74609375, + "learning_rate": 0.0001474532331556534, + "loss": 0.7211, + "step": 22749 + }, + { + "epoch": 0.5841562072213904, + "grad_norm": 0.71484375, + "learning_rate": 0.00014744930354807585, + "loss": 0.8315, + "step": 22750 + }, + { + "epoch": 0.5841818844173122, + "grad_norm": 0.7578125, + "learning_rate": 0.00014744537384593612, + "loss": 0.8652, + "step": 22751 + }, + { + "epoch": 0.5842075616132341, + "grad_norm": 0.828125, + "learning_rate": 0.00014744144404924204, + "loss": 0.8806, + "step": 22752 + }, + { + "epoch": 0.5842332388091559, + "grad_norm": 0.80859375, + "learning_rate": 0.00014743751415800146, + "loss": 0.8951, + "step": 22753 + }, + { + "epoch": 0.5842589160050776, + "grad_norm": 0.75, + "learning_rate": 0.00014743358417222216, + "loss": 0.841, + "step": 22754 + }, + { + "epoch": 0.5842845932009995, + "grad_norm": 0.74609375, + "learning_rate": 0.00014742965409191206, + "loss": 0.8146, + "step": 22755 + }, + { + "epoch": 0.5843102703969213, + "grad_norm": 1.0625, + "learning_rate": 0.00014742572391707894, + "loss": 0.7662, + "step": 22756 + }, + { + "epoch": 0.5843359475928431, + "grad_norm": 0.75390625, + "learning_rate": 0.00014742179364773063, + "loss": 0.8914, + "step": 22757 + }, + { + "epoch": 0.584361624788765, + "grad_norm": 0.76171875, + "learning_rate": 0.00014741786328387497, + "loss": 0.8978, + "step": 22758 + }, + { + "epoch": 0.5843873019846868, + "grad_norm": 0.7578125, + "learning_rate": 0.00014741393282551976, + "loss": 0.9002, + "step": 22759 + }, + { + "epoch": 0.5844129791806085, + "grad_norm": 0.8203125, + "learning_rate": 0.00014741000227267288, + "loss": 0.8619, + "step": 22760 + }, + { + "epoch": 0.5844386563765304, + "grad_norm": 0.8359375, + "learning_rate": 0.00014740607162534218, + "loss": 0.832, + "step": 22761 + }, + { + "epoch": 0.5844643335724522, + "grad_norm": 0.73828125, + "learning_rate": 0.00014740214088353542, + "loss": 0.8152, + "step": 22762 + }, + { + "epoch": 0.584490010768374, + "grad_norm": 0.73046875, + "learning_rate": 0.00014739821004726048, + "loss": 0.7629, + "step": 22763 + }, + { + "epoch": 0.5845156879642959, + "grad_norm": 0.765625, + "learning_rate": 0.0001473942791165252, + "loss": 0.7701, + "step": 22764 + }, + { + "epoch": 0.5845413651602177, + "grad_norm": 0.80078125, + "learning_rate": 0.0001473903480913374, + "loss": 0.9385, + "step": 22765 + }, + { + "epoch": 0.5845670423561395, + "grad_norm": 0.734375, + "learning_rate": 0.00014738641697170491, + "loss": 0.8109, + "step": 22766 + }, + { + "epoch": 0.5845927195520613, + "grad_norm": 0.7421875, + "learning_rate": 0.0001473824857576356, + "loss": 0.7606, + "step": 22767 + }, + { + "epoch": 0.5846183967479831, + "grad_norm": 0.78125, + "learning_rate": 0.00014737855444913725, + "loss": 0.8438, + "step": 22768 + }, + { + "epoch": 0.5846440739439049, + "grad_norm": 1.1796875, + "learning_rate": 0.00014737462304621773, + "loss": 0.7132, + "step": 22769 + }, + { + "epoch": 0.5846697511398268, + "grad_norm": 0.73828125, + "learning_rate": 0.00014737069154888488, + "loss": 0.8216, + "step": 22770 + }, + { + "epoch": 0.5846954283357486, + "grad_norm": 0.80078125, + "learning_rate": 0.00014736675995714649, + "loss": 0.9594, + "step": 22771 + }, + { + "epoch": 0.5847211055316704, + "grad_norm": 0.73046875, + "learning_rate": 0.00014736282827101046, + "loss": 0.9642, + "step": 22772 + }, + { + "epoch": 0.5847467827275923, + "grad_norm": 0.82421875, + "learning_rate": 0.00014735889649048454, + "loss": 0.9694, + "step": 22773 + }, + { + "epoch": 0.584772459923514, + "grad_norm": 0.7265625, + "learning_rate": 0.00014735496461557666, + "loss": 0.8521, + "step": 22774 + }, + { + "epoch": 0.5847981371194358, + "grad_norm": 0.7421875, + "learning_rate": 0.00014735103264629462, + "loss": 0.9414, + "step": 22775 + }, + { + "epoch": 0.5848238143153577, + "grad_norm": 0.73046875, + "learning_rate": 0.0001473471005826462, + "loss": 0.8605, + "step": 22776 + }, + { + "epoch": 0.5848494915112795, + "grad_norm": 0.73046875, + "learning_rate": 0.00014734316842463936, + "loss": 0.8456, + "step": 22777 + }, + { + "epoch": 0.5848751687072014, + "grad_norm": 0.73828125, + "learning_rate": 0.00014733923617228178, + "loss": 0.8227, + "step": 22778 + }, + { + "epoch": 0.5849008459031232, + "grad_norm": 0.80859375, + "learning_rate": 0.00014733530382558143, + "loss": 0.9428, + "step": 22779 + }, + { + "epoch": 0.5849265230990449, + "grad_norm": 0.75390625, + "learning_rate": 0.00014733137138454606, + "loss": 0.7874, + "step": 22780 + }, + { + "epoch": 0.5849522002949668, + "grad_norm": 0.734375, + "learning_rate": 0.00014732743884918358, + "loss": 0.8012, + "step": 22781 + }, + { + "epoch": 0.5849778774908886, + "grad_norm": 0.76953125, + "learning_rate": 0.00014732350621950177, + "loss": 0.8216, + "step": 22782 + }, + { + "epoch": 0.5850035546868104, + "grad_norm": 0.8515625, + "learning_rate": 0.0001473195734955085, + "loss": 0.942, + "step": 22783 + }, + { + "epoch": 0.5850292318827323, + "grad_norm": 0.83203125, + "learning_rate": 0.00014731564067721158, + "loss": 0.9541, + "step": 22784 + }, + { + "epoch": 0.5850549090786541, + "grad_norm": 0.71875, + "learning_rate": 0.00014731170776461887, + "loss": 0.8061, + "step": 22785 + }, + { + "epoch": 0.5850805862745759, + "grad_norm": 0.71875, + "learning_rate": 0.0001473077747577382, + "loss": 0.813, + "step": 22786 + }, + { + "epoch": 0.5851062634704977, + "grad_norm": 0.76953125, + "learning_rate": 0.0001473038416565774, + "loss": 0.916, + "step": 22787 + }, + { + "epoch": 0.5851319406664195, + "grad_norm": 0.8203125, + "learning_rate": 0.00014729990846114432, + "loss": 0.9105, + "step": 22788 + }, + { + "epoch": 0.5851576178623413, + "grad_norm": 0.76953125, + "learning_rate": 0.00014729597517144677, + "loss": 0.7342, + "step": 22789 + }, + { + "epoch": 0.5851832950582632, + "grad_norm": 0.78515625, + "learning_rate": 0.00014729204178749267, + "loss": 0.7915, + "step": 22790 + }, + { + "epoch": 0.585208972254185, + "grad_norm": 0.78515625, + "learning_rate": 0.00014728810830928976, + "loss": 0.8461, + "step": 22791 + }, + { + "epoch": 0.5852346494501068, + "grad_norm": 0.80859375, + "learning_rate": 0.00014728417473684595, + "loss": 0.7981, + "step": 22792 + }, + { + "epoch": 0.5852603266460287, + "grad_norm": 0.87109375, + "learning_rate": 0.00014728024107016902, + "loss": 1.0055, + "step": 22793 + }, + { + "epoch": 0.5852860038419504, + "grad_norm": 0.80078125, + "learning_rate": 0.00014727630730926685, + "loss": 0.8577, + "step": 22794 + }, + { + "epoch": 0.5853116810378722, + "grad_norm": 0.75390625, + "learning_rate": 0.0001472723734541473, + "loss": 0.9996, + "step": 22795 + }, + { + "epoch": 0.5853373582337941, + "grad_norm": 0.76953125, + "learning_rate": 0.00014726843950481813, + "loss": 0.8089, + "step": 22796 + }, + { + "epoch": 0.5853630354297159, + "grad_norm": 0.796875, + "learning_rate": 0.00014726450546128728, + "loss": 0.9786, + "step": 22797 + }, + { + "epoch": 0.5853887126256377, + "grad_norm": 0.76171875, + "learning_rate": 0.0001472605713235625, + "loss": 0.9803, + "step": 22798 + }, + { + "epoch": 0.5854143898215596, + "grad_norm": 0.83984375, + "learning_rate": 0.0001472566370916517, + "loss": 0.8156, + "step": 22799 + }, + { + "epoch": 0.5854400670174813, + "grad_norm": 0.69921875, + "learning_rate": 0.00014725270276556265, + "loss": 0.8272, + "step": 22800 + }, + { + "epoch": 0.5854657442134031, + "grad_norm": 0.76953125, + "learning_rate": 0.00014724876834530329, + "loss": 0.8806, + "step": 22801 + }, + { + "epoch": 0.585491421409325, + "grad_norm": 0.8828125, + "learning_rate": 0.00014724483383088134, + "loss": 0.8808, + "step": 22802 + }, + { + "epoch": 0.5855170986052468, + "grad_norm": 0.7421875, + "learning_rate": 0.00014724089922230475, + "loss": 0.8577, + "step": 22803 + }, + { + "epoch": 0.5855427758011686, + "grad_norm": 0.71484375, + "learning_rate": 0.0001472369645195813, + "loss": 0.9254, + "step": 22804 + }, + { + "epoch": 0.5855684529970905, + "grad_norm": 0.77734375, + "learning_rate": 0.00014723302972271884, + "loss": 0.8757, + "step": 22805 + }, + { + "epoch": 0.5855941301930123, + "grad_norm": 0.765625, + "learning_rate": 0.00014722909483172523, + "loss": 0.9265, + "step": 22806 + }, + { + "epoch": 0.585619807388934, + "grad_norm": 0.73828125, + "learning_rate": 0.0001472251598466083, + "loss": 0.7717, + "step": 22807 + }, + { + "epoch": 0.5856454845848559, + "grad_norm": 0.78515625, + "learning_rate": 0.0001472212247673759, + "loss": 0.9767, + "step": 22808 + }, + { + "epoch": 0.5856711617807777, + "grad_norm": 0.82421875, + "learning_rate": 0.00014721728959403584, + "loss": 0.9129, + "step": 22809 + }, + { + "epoch": 0.5856968389766996, + "grad_norm": 0.828125, + "learning_rate": 0.000147213354326596, + "loss": 0.8188, + "step": 22810 + }, + { + "epoch": 0.5857225161726214, + "grad_norm": 0.859375, + "learning_rate": 0.00014720941896506423, + "loss": 0.8984, + "step": 22811 + }, + { + "epoch": 0.5857481933685432, + "grad_norm": 0.84765625, + "learning_rate": 0.00014720548350944832, + "loss": 0.7949, + "step": 22812 + }, + { + "epoch": 0.5857738705644651, + "grad_norm": 0.86328125, + "learning_rate": 0.00014720154795975614, + "loss": 0.9452, + "step": 22813 + }, + { + "epoch": 0.5857995477603868, + "grad_norm": 0.8828125, + "learning_rate": 0.00014719761231599555, + "loss": 0.9491, + "step": 22814 + }, + { + "epoch": 0.5858252249563086, + "grad_norm": 0.921875, + "learning_rate": 0.0001471936765781744, + "loss": 0.9075, + "step": 22815 + }, + { + "epoch": 0.5858509021522305, + "grad_norm": 0.765625, + "learning_rate": 0.0001471897407463005, + "loss": 0.8005, + "step": 22816 + }, + { + "epoch": 0.5858765793481523, + "grad_norm": 0.76171875, + "learning_rate": 0.0001471858048203817, + "loss": 0.8755, + "step": 22817 + }, + { + "epoch": 0.5859022565440741, + "grad_norm": 0.72265625, + "learning_rate": 0.00014718186880042586, + "loss": 0.8936, + "step": 22818 + }, + { + "epoch": 0.585927933739996, + "grad_norm": 0.79296875, + "learning_rate": 0.00014717793268644082, + "loss": 0.989, + "step": 22819 + }, + { + "epoch": 0.5859536109359177, + "grad_norm": 0.81640625, + "learning_rate": 0.00014717399647843444, + "loss": 0.7895, + "step": 22820 + }, + { + "epoch": 0.5859792881318395, + "grad_norm": 0.75390625, + "learning_rate": 0.0001471700601764145, + "loss": 0.8423, + "step": 22821 + }, + { + "epoch": 0.5860049653277614, + "grad_norm": 0.8671875, + "learning_rate": 0.00014716612378038894, + "loss": 0.8686, + "step": 22822 + }, + { + "epoch": 0.5860306425236832, + "grad_norm": 0.81640625, + "learning_rate": 0.00014716218729036556, + "loss": 0.8551, + "step": 22823 + }, + { + "epoch": 0.586056319719605, + "grad_norm": 0.84765625, + "learning_rate": 0.00014715825070635215, + "loss": 0.8951, + "step": 22824 + }, + { + "epoch": 0.5860819969155269, + "grad_norm": 0.7578125, + "learning_rate": 0.00014715431402835666, + "loss": 0.7385, + "step": 22825 + }, + { + "epoch": 0.5861076741114487, + "grad_norm": 0.70703125, + "learning_rate": 0.00014715037725638683, + "loss": 0.8815, + "step": 22826 + }, + { + "epoch": 0.5861333513073704, + "grad_norm": 0.7578125, + "learning_rate": 0.0001471464403904506, + "loss": 0.8479, + "step": 22827 + }, + { + "epoch": 0.5861590285032923, + "grad_norm": 0.7890625, + "learning_rate": 0.00014714250343055577, + "loss": 0.8487, + "step": 22828 + }, + { + "epoch": 0.5861847056992141, + "grad_norm": 0.859375, + "learning_rate": 0.00014713856637671015, + "loss": 0.9969, + "step": 22829 + }, + { + "epoch": 0.5862103828951359, + "grad_norm": 0.75, + "learning_rate": 0.00014713462922892167, + "loss": 0.8526, + "step": 22830 + }, + { + "epoch": 0.5862360600910578, + "grad_norm": 0.859375, + "learning_rate": 0.00014713069198719807, + "loss": 0.7766, + "step": 22831 + }, + { + "epoch": 0.5862617372869796, + "grad_norm": 0.76171875, + "learning_rate": 0.00014712675465154733, + "loss": 0.7156, + "step": 22832 + }, + { + "epoch": 0.5862874144829014, + "grad_norm": 0.7578125, + "learning_rate": 0.0001471228172219772, + "loss": 0.8272, + "step": 22833 + }, + { + "epoch": 0.5863130916788232, + "grad_norm": 0.73828125, + "learning_rate": 0.0001471188796984955, + "loss": 0.8428, + "step": 22834 + }, + { + "epoch": 0.586338768874745, + "grad_norm": 0.76953125, + "learning_rate": 0.0001471149420811102, + "loss": 0.937, + "step": 22835 + }, + { + "epoch": 0.5863644460706668, + "grad_norm": 0.7421875, + "learning_rate": 0.00014711100436982908, + "loss": 0.8503, + "step": 22836 + }, + { + "epoch": 0.5863901232665887, + "grad_norm": 0.734375, + "learning_rate": 0.00014710706656465993, + "loss": 0.8845, + "step": 22837 + }, + { + "epoch": 0.5864158004625105, + "grad_norm": 0.734375, + "learning_rate": 0.00014710312866561068, + "loss": 0.9157, + "step": 22838 + }, + { + "epoch": 0.5864414776584324, + "grad_norm": 0.73828125, + "learning_rate": 0.00014709919067268915, + "loss": 0.9055, + "step": 22839 + }, + { + "epoch": 0.5864671548543541, + "grad_norm": 0.765625, + "learning_rate": 0.0001470952525859032, + "loss": 0.8747, + "step": 22840 + }, + { + "epoch": 0.5864928320502759, + "grad_norm": 0.75, + "learning_rate": 0.00014709131440526063, + "loss": 0.8882, + "step": 22841 + }, + { + "epoch": 0.5865185092461978, + "grad_norm": 0.8125, + "learning_rate": 0.00014708737613076936, + "loss": 0.8039, + "step": 22842 + }, + { + "epoch": 0.5865441864421196, + "grad_norm": 0.78125, + "learning_rate": 0.0001470834377624372, + "loss": 0.9289, + "step": 22843 + }, + { + "epoch": 0.5865698636380414, + "grad_norm": 0.7578125, + "learning_rate": 0.000147079499300272, + "loss": 0.871, + "step": 22844 + }, + { + "epoch": 0.5865955408339633, + "grad_norm": 0.796875, + "learning_rate": 0.00014707556074428156, + "loss": 1.0548, + "step": 22845 + }, + { + "epoch": 0.5866212180298851, + "grad_norm": 0.796875, + "learning_rate": 0.00014707162209447384, + "loss": 0.8728, + "step": 22846 + }, + { + "epoch": 0.5866468952258068, + "grad_norm": 0.734375, + "learning_rate": 0.00014706768335085658, + "loss": 0.8578, + "step": 22847 + }, + { + "epoch": 0.5866725724217287, + "grad_norm": 0.74609375, + "learning_rate": 0.00014706374451343773, + "loss": 0.7763, + "step": 22848 + }, + { + "epoch": 0.5866982496176505, + "grad_norm": 0.76953125, + "learning_rate": 0.00014705980558222506, + "loss": 0.9587, + "step": 22849 + }, + { + "epoch": 0.5867239268135723, + "grad_norm": 0.796875, + "learning_rate": 0.00014705586655722644, + "loss": 0.9199, + "step": 22850 + }, + { + "epoch": 0.5867496040094942, + "grad_norm": 0.7109375, + "learning_rate": 0.00014705192743844975, + "loss": 0.8932, + "step": 22851 + }, + { + "epoch": 0.586775281205416, + "grad_norm": 0.7890625, + "learning_rate": 0.00014704798822590283, + "loss": 0.78, + "step": 22852 + }, + { + "epoch": 0.5868009584013378, + "grad_norm": 0.79296875, + "learning_rate": 0.00014704404891959346, + "loss": 0.8234, + "step": 22853 + }, + { + "epoch": 0.5868266355972596, + "grad_norm": 0.76953125, + "learning_rate": 0.0001470401095195296, + "loss": 0.8581, + "step": 22854 + }, + { + "epoch": 0.5868523127931814, + "grad_norm": 0.80078125, + "learning_rate": 0.00014703617002571906, + "loss": 0.9372, + "step": 22855 + }, + { + "epoch": 0.5868779899891032, + "grad_norm": 0.81640625, + "learning_rate": 0.00014703223043816967, + "loss": 0.8116, + "step": 22856 + }, + { + "epoch": 0.5869036671850251, + "grad_norm": 0.8203125, + "learning_rate": 0.0001470282907568893, + "loss": 0.8759, + "step": 22857 + }, + { + "epoch": 0.5869293443809469, + "grad_norm": 0.78515625, + "learning_rate": 0.00014702435098188575, + "loss": 0.8245, + "step": 22858 + }, + { + "epoch": 0.5869550215768687, + "grad_norm": 0.875, + "learning_rate": 0.00014702041111316696, + "loss": 0.9439, + "step": 22859 + }, + { + "epoch": 0.5869806987727905, + "grad_norm": 0.7734375, + "learning_rate": 0.0001470164711507407, + "loss": 0.8417, + "step": 22860 + }, + { + "epoch": 0.5870063759687123, + "grad_norm": 0.80078125, + "learning_rate": 0.00014701253109461487, + "loss": 0.9175, + "step": 22861 + }, + { + "epoch": 0.5870320531646341, + "grad_norm": 0.87109375, + "learning_rate": 0.00014700859094479734, + "loss": 0.8704, + "step": 22862 + }, + { + "epoch": 0.587057730360556, + "grad_norm": 0.8046875, + "learning_rate": 0.0001470046507012959, + "loss": 0.8284, + "step": 22863 + }, + { + "epoch": 0.5870834075564778, + "grad_norm": 0.76171875, + "learning_rate": 0.0001470007103641185, + "loss": 0.8304, + "step": 22864 + }, + { + "epoch": 0.5871090847523996, + "grad_norm": 0.96875, + "learning_rate": 0.00014699676993327286, + "loss": 0.805, + "step": 22865 + }, + { + "epoch": 0.5871347619483215, + "grad_norm": 0.75, + "learning_rate": 0.00014699282940876693, + "loss": 0.8169, + "step": 22866 + }, + { + "epoch": 0.5871604391442432, + "grad_norm": 0.796875, + "learning_rate": 0.00014698888879060854, + "loss": 0.8341, + "step": 22867 + }, + { + "epoch": 0.587186116340165, + "grad_norm": 0.7734375, + "learning_rate": 0.00014698494807880555, + "loss": 0.9295, + "step": 22868 + }, + { + "epoch": 0.5872117935360869, + "grad_norm": 0.73828125, + "learning_rate": 0.0001469810072733658, + "loss": 0.8212, + "step": 22869 + }, + { + "epoch": 0.5872374707320087, + "grad_norm": 0.77734375, + "learning_rate": 0.00014697706637429713, + "loss": 0.7839, + "step": 22870 + }, + { + "epoch": 0.5872631479279306, + "grad_norm": 0.8203125, + "learning_rate": 0.0001469731253816074, + "loss": 0.8574, + "step": 22871 + }, + { + "epoch": 0.5872888251238524, + "grad_norm": 0.7421875, + "learning_rate": 0.0001469691842953045, + "loss": 1.0107, + "step": 22872 + }, + { + "epoch": 0.5873145023197742, + "grad_norm": 0.77734375, + "learning_rate": 0.00014696524311539624, + "loss": 0.9343, + "step": 22873 + }, + { + "epoch": 0.587340179515696, + "grad_norm": 0.75390625, + "learning_rate": 0.0001469613018418905, + "loss": 0.999, + "step": 22874 + }, + { + "epoch": 0.5873658567116178, + "grad_norm": 0.87890625, + "learning_rate": 0.00014695736047479514, + "loss": 0.8387, + "step": 22875 + }, + { + "epoch": 0.5873915339075396, + "grad_norm": 0.80859375, + "learning_rate": 0.00014695341901411796, + "loss": 0.9044, + "step": 22876 + }, + { + "epoch": 0.5874172111034615, + "grad_norm": 0.7578125, + "learning_rate": 0.0001469494774598669, + "loss": 0.8821, + "step": 22877 + }, + { + "epoch": 0.5874428882993833, + "grad_norm": 0.80078125, + "learning_rate": 0.00014694553581204978, + "loss": 0.8704, + "step": 22878 + }, + { + "epoch": 0.5874685654953051, + "grad_norm": 0.734375, + "learning_rate": 0.0001469415940706744, + "loss": 0.7906, + "step": 22879 + }, + { + "epoch": 0.5874942426912269, + "grad_norm": 0.75390625, + "learning_rate": 0.00014693765223574868, + "loss": 0.8797, + "step": 22880 + }, + { + "epoch": 0.5875199198871487, + "grad_norm": 0.7421875, + "learning_rate": 0.0001469337103072805, + "loss": 0.8124, + "step": 22881 + }, + { + "epoch": 0.5875455970830705, + "grad_norm": 0.8515625, + "learning_rate": 0.00014692976828527762, + "loss": 0.9457, + "step": 22882 + }, + { + "epoch": 0.5875712742789924, + "grad_norm": 0.73828125, + "learning_rate": 0.000146925826169748, + "loss": 0.8831, + "step": 22883 + }, + { + "epoch": 0.5875969514749142, + "grad_norm": 0.76953125, + "learning_rate": 0.00014692188396069942, + "loss": 0.9763, + "step": 22884 + }, + { + "epoch": 0.587622628670836, + "grad_norm": 0.79296875, + "learning_rate": 0.00014691794165813975, + "loss": 0.9044, + "step": 22885 + }, + { + "epoch": 0.5876483058667579, + "grad_norm": 0.7578125, + "learning_rate": 0.0001469139992620769, + "loss": 0.7856, + "step": 22886 + }, + { + "epoch": 0.5876739830626796, + "grad_norm": 0.765625, + "learning_rate": 0.00014691005677251864, + "loss": 0.8581, + "step": 22887 + }, + { + "epoch": 0.5876996602586014, + "grad_norm": 0.7890625, + "learning_rate": 0.00014690611418947288, + "loss": 0.9954, + "step": 22888 + }, + { + "epoch": 0.5877253374545233, + "grad_norm": 0.8046875, + "learning_rate": 0.0001469021715129475, + "loss": 1.0253, + "step": 22889 + }, + { + "epoch": 0.5877510146504451, + "grad_norm": 0.734375, + "learning_rate": 0.00014689822874295035, + "loss": 0.9754, + "step": 22890 + }, + { + "epoch": 0.5877766918463669, + "grad_norm": 0.8125, + "learning_rate": 0.00014689428587948922, + "loss": 0.9059, + "step": 22891 + }, + { + "epoch": 0.5878023690422888, + "grad_norm": 0.77734375, + "learning_rate": 0.000146890342922572, + "loss": 0.8948, + "step": 22892 + }, + { + "epoch": 0.5878280462382106, + "grad_norm": 0.7578125, + "learning_rate": 0.0001468863998722066, + "loss": 0.8845, + "step": 22893 + }, + { + "epoch": 0.5878537234341323, + "grad_norm": 0.76953125, + "learning_rate": 0.00014688245672840085, + "loss": 0.9821, + "step": 22894 + }, + { + "epoch": 0.5878794006300542, + "grad_norm": 0.77734375, + "learning_rate": 0.00014687851349116257, + "loss": 0.9402, + "step": 22895 + }, + { + "epoch": 0.587905077825976, + "grad_norm": 0.734375, + "learning_rate": 0.00014687457016049967, + "loss": 0.7864, + "step": 22896 + }, + { + "epoch": 0.5879307550218978, + "grad_norm": 0.75, + "learning_rate": 0.00014687062673641999, + "loss": 0.8715, + "step": 22897 + }, + { + "epoch": 0.5879564322178197, + "grad_norm": 0.76171875, + "learning_rate": 0.00014686668321893133, + "loss": 0.9135, + "step": 22898 + }, + { + "epoch": 0.5879821094137415, + "grad_norm": 0.72265625, + "learning_rate": 0.00014686273960804163, + "loss": 0.7444, + "step": 22899 + }, + { + "epoch": 0.5880077866096632, + "grad_norm": 0.765625, + "learning_rate": 0.00014685879590375876, + "loss": 0.955, + "step": 22900 + }, + { + "epoch": 0.5880334638055851, + "grad_norm": 0.74609375, + "learning_rate": 0.0001468548521060905, + "loss": 0.7954, + "step": 22901 + }, + { + "epoch": 0.5880591410015069, + "grad_norm": 0.78125, + "learning_rate": 0.0001468509082150448, + "loss": 0.8674, + "step": 22902 + }, + { + "epoch": 0.5880848181974287, + "grad_norm": 0.73828125, + "learning_rate": 0.00014684696423062943, + "loss": 0.7899, + "step": 22903 + }, + { + "epoch": 0.5881104953933506, + "grad_norm": 0.86328125, + "learning_rate": 0.0001468430201528523, + "loss": 0.9699, + "step": 22904 + }, + { + "epoch": 0.5881361725892724, + "grad_norm": 0.71484375, + "learning_rate": 0.00014683907598172128, + "loss": 0.7476, + "step": 22905 + }, + { + "epoch": 0.5881618497851943, + "grad_norm": 0.77734375, + "learning_rate": 0.00014683513171724418, + "loss": 0.7948, + "step": 22906 + }, + { + "epoch": 0.588187526981116, + "grad_norm": 0.84765625, + "learning_rate": 0.00014683118735942888, + "loss": 0.9689, + "step": 22907 + }, + { + "epoch": 0.5882132041770378, + "grad_norm": 0.7890625, + "learning_rate": 0.0001468272429082833, + "loss": 0.918, + "step": 22908 + }, + { + "epoch": 0.5882388813729597, + "grad_norm": 0.69921875, + "learning_rate": 0.00014682329836381526, + "loss": 0.8673, + "step": 22909 + }, + { + "epoch": 0.5882645585688815, + "grad_norm": 0.73046875, + "learning_rate": 0.0001468193537260326, + "loss": 0.841, + "step": 22910 + }, + { + "epoch": 0.5882902357648033, + "grad_norm": 0.83203125, + "learning_rate": 0.00014681540899494317, + "loss": 0.8698, + "step": 22911 + }, + { + "epoch": 0.5883159129607252, + "grad_norm": 0.8125, + "learning_rate": 0.00014681146417055486, + "loss": 0.8483, + "step": 22912 + }, + { + "epoch": 0.588341590156647, + "grad_norm": 0.78125, + "learning_rate": 0.00014680751925287558, + "loss": 0.8891, + "step": 22913 + }, + { + "epoch": 0.5883672673525687, + "grad_norm": 0.76171875, + "learning_rate": 0.0001468035742419131, + "loss": 0.8629, + "step": 22914 + }, + { + "epoch": 0.5883929445484906, + "grad_norm": 0.75, + "learning_rate": 0.00014679962913767536, + "loss": 0.7449, + "step": 22915 + }, + { + "epoch": 0.5884186217444124, + "grad_norm": 0.77734375, + "learning_rate": 0.00014679568394017014, + "loss": 1.0193, + "step": 22916 + }, + { + "epoch": 0.5884442989403342, + "grad_norm": 0.76953125, + "learning_rate": 0.00014679173864940538, + "loss": 0.8733, + "step": 22917 + }, + { + "epoch": 0.5884699761362561, + "grad_norm": 0.76171875, + "learning_rate": 0.00014678779326538893, + "loss": 0.8709, + "step": 22918 + }, + { + "epoch": 0.5884956533321779, + "grad_norm": 0.78125, + "learning_rate": 0.00014678384778812857, + "loss": 0.831, + "step": 22919 + }, + { + "epoch": 0.5885213305280996, + "grad_norm": 0.79296875, + "learning_rate": 0.00014677990221763227, + "loss": 0.9353, + "step": 22920 + }, + { + "epoch": 0.5885470077240215, + "grad_norm": 0.83203125, + "learning_rate": 0.0001467759565539079, + "loss": 0.8507, + "step": 22921 + }, + { + "epoch": 0.5885726849199433, + "grad_norm": 0.75, + "learning_rate": 0.0001467720107969632, + "loss": 0.7982, + "step": 22922 + }, + { + "epoch": 0.5885983621158651, + "grad_norm": 0.69921875, + "learning_rate": 0.0001467680649468061, + "loss": 0.7743, + "step": 22923 + }, + { + "epoch": 0.588624039311787, + "grad_norm": 0.78515625, + "learning_rate": 0.00014676411900344452, + "loss": 0.9028, + "step": 22924 + }, + { + "epoch": 0.5886497165077088, + "grad_norm": 0.828125, + "learning_rate": 0.00014676017296688624, + "loss": 0.9112, + "step": 22925 + }, + { + "epoch": 0.5886753937036306, + "grad_norm": 0.78515625, + "learning_rate": 0.00014675622683713916, + "loss": 0.8, + "step": 22926 + }, + { + "epoch": 0.5887010708995524, + "grad_norm": 0.7578125, + "learning_rate": 0.00014675228061421117, + "loss": 0.8128, + "step": 22927 + }, + { + "epoch": 0.5887267480954742, + "grad_norm": 0.73828125, + "learning_rate": 0.0001467483342981101, + "loss": 0.88, + "step": 22928 + }, + { + "epoch": 0.588752425291396, + "grad_norm": 0.7890625, + "learning_rate": 0.00014674438788884382, + "loss": 0.9329, + "step": 22929 + }, + { + "epoch": 0.5887781024873179, + "grad_norm": 0.9140625, + "learning_rate": 0.0001467404413864202, + "loss": 0.9823, + "step": 22930 + }, + { + "epoch": 0.5888037796832397, + "grad_norm": 0.73046875, + "learning_rate": 0.0001467364947908471, + "loss": 0.9374, + "step": 22931 + }, + { + "epoch": 0.5888294568791616, + "grad_norm": 0.85546875, + "learning_rate": 0.00014673254810213237, + "loss": 0.935, + "step": 22932 + }, + { + "epoch": 0.5888551340750833, + "grad_norm": 0.74609375, + "learning_rate": 0.0001467286013202839, + "loss": 0.8797, + "step": 22933 + }, + { + "epoch": 0.5888808112710051, + "grad_norm": 0.8046875, + "learning_rate": 0.00014672465444530954, + "loss": 0.8957, + "step": 22934 + }, + { + "epoch": 0.588906488466927, + "grad_norm": 0.78515625, + "learning_rate": 0.0001467207074772172, + "loss": 0.8319, + "step": 22935 + }, + { + "epoch": 0.5889321656628488, + "grad_norm": 0.90234375, + "learning_rate": 0.0001467167604160147, + "loss": 0.9663, + "step": 22936 + }, + { + "epoch": 0.5889578428587706, + "grad_norm": 0.78515625, + "learning_rate": 0.0001467128132617099, + "loss": 0.901, + "step": 22937 + }, + { + "epoch": 0.5889835200546925, + "grad_norm": 0.73046875, + "learning_rate": 0.0001467088660143107, + "loss": 0.849, + "step": 22938 + }, + { + "epoch": 0.5890091972506143, + "grad_norm": 0.80078125, + "learning_rate": 0.00014670491867382492, + "loss": 0.9142, + "step": 22939 + }, + { + "epoch": 0.589034874446536, + "grad_norm": 0.8125, + "learning_rate": 0.0001467009712402605, + "loss": 0.9311, + "step": 22940 + }, + { + "epoch": 0.5890605516424579, + "grad_norm": 0.8046875, + "learning_rate": 0.00014669702371362522, + "loss": 0.8803, + "step": 22941 + }, + { + "epoch": 0.5890862288383797, + "grad_norm": 0.84765625, + "learning_rate": 0.00014669307609392702, + "loss": 1.003, + "step": 22942 + }, + { + "epoch": 0.5891119060343015, + "grad_norm": 0.8203125, + "learning_rate": 0.0001466891283811737, + "loss": 0.8222, + "step": 22943 + }, + { + "epoch": 0.5891375832302234, + "grad_norm": 0.76171875, + "learning_rate": 0.0001466851805753732, + "loss": 0.8617, + "step": 22944 + }, + { + "epoch": 0.5891632604261452, + "grad_norm": 0.86328125, + "learning_rate": 0.00014668123267653333, + "loss": 0.7172, + "step": 22945 + }, + { + "epoch": 0.589188937622067, + "grad_norm": 0.74609375, + "learning_rate": 0.00014667728468466197, + "loss": 0.9552, + "step": 22946 + }, + { + "epoch": 0.5892146148179888, + "grad_norm": 0.74609375, + "learning_rate": 0.00014667333659976702, + "loss": 0.8354, + "step": 22947 + }, + { + "epoch": 0.5892402920139106, + "grad_norm": 0.80859375, + "learning_rate": 0.00014666938842185635, + "loss": 0.879, + "step": 22948 + }, + { + "epoch": 0.5892659692098324, + "grad_norm": 0.671875, + "learning_rate": 0.00014666544015093778, + "loss": 0.8641, + "step": 22949 + }, + { + "epoch": 0.5892916464057543, + "grad_norm": 0.80859375, + "learning_rate": 0.0001466614917870192, + "loss": 0.7997, + "step": 22950 + }, + { + "epoch": 0.5893173236016761, + "grad_norm": 0.76953125, + "learning_rate": 0.00014665754333010849, + "loss": 0.9223, + "step": 22951 + }, + { + "epoch": 0.5893430007975979, + "grad_norm": 0.8125, + "learning_rate": 0.0001466535947802135, + "loss": 0.8998, + "step": 22952 + }, + { + "epoch": 0.5893686779935197, + "grad_norm": 0.8046875, + "learning_rate": 0.00014664964613734212, + "loss": 0.9446, + "step": 22953 + }, + { + "epoch": 0.5893943551894415, + "grad_norm": 0.765625, + "learning_rate": 0.0001466456974015022, + "loss": 1.0392, + "step": 22954 + }, + { + "epoch": 0.5894200323853633, + "grad_norm": 0.7265625, + "learning_rate": 0.00014664174857270164, + "loss": 0.8231, + "step": 22955 + }, + { + "epoch": 0.5894457095812852, + "grad_norm": 0.81640625, + "learning_rate": 0.00014663779965094827, + "loss": 0.8625, + "step": 22956 + }, + { + "epoch": 0.589471386777207, + "grad_norm": 0.8203125, + "learning_rate": 0.00014663385063624996, + "loss": 0.9166, + "step": 22957 + }, + { + "epoch": 0.5894970639731288, + "grad_norm": 0.76171875, + "learning_rate": 0.00014662990152861463, + "loss": 0.8304, + "step": 22958 + }, + { + "epoch": 0.5895227411690507, + "grad_norm": 0.7265625, + "learning_rate": 0.00014662595232805008, + "loss": 0.8602, + "step": 22959 + }, + { + "epoch": 0.5895484183649724, + "grad_norm": 0.8359375, + "learning_rate": 0.00014662200303456422, + "loss": 0.7682, + "step": 22960 + }, + { + "epoch": 0.5895740955608942, + "grad_norm": 0.79296875, + "learning_rate": 0.00014661805364816498, + "loss": 0.8305, + "step": 22961 + }, + { + "epoch": 0.5895997727568161, + "grad_norm": 0.83203125, + "learning_rate": 0.00014661410416886013, + "loss": 0.8786, + "step": 22962 + }, + { + "epoch": 0.5896254499527379, + "grad_norm": 0.73828125, + "learning_rate": 0.00014661015459665759, + "loss": 0.7405, + "step": 22963 + }, + { + "epoch": 0.5896511271486597, + "grad_norm": 0.765625, + "learning_rate": 0.0001466062049315652, + "loss": 0.8916, + "step": 22964 + }, + { + "epoch": 0.5896768043445816, + "grad_norm": 0.7734375, + "learning_rate": 0.00014660225517359088, + "loss": 0.9308, + "step": 22965 + }, + { + "epoch": 0.5897024815405034, + "grad_norm": 0.734375, + "learning_rate": 0.0001465983053227425, + "loss": 0.8032, + "step": 22966 + }, + { + "epoch": 0.5897281587364251, + "grad_norm": 0.8046875, + "learning_rate": 0.00014659435537902783, + "loss": 0.8321, + "step": 22967 + }, + { + "epoch": 0.589753835932347, + "grad_norm": 1.0, + "learning_rate": 0.00014659040534245488, + "loss": 0.9147, + "step": 22968 + }, + { + "epoch": 0.5897795131282688, + "grad_norm": 0.6796875, + "learning_rate": 0.00014658645521303145, + "loss": 0.8422, + "step": 22969 + }, + { + "epoch": 0.5898051903241907, + "grad_norm": 0.75390625, + "learning_rate": 0.00014658250499076538, + "loss": 0.7459, + "step": 22970 + }, + { + "epoch": 0.5898308675201125, + "grad_norm": 0.84765625, + "learning_rate": 0.00014657855467566463, + "loss": 0.8848, + "step": 22971 + }, + { + "epoch": 0.5898565447160343, + "grad_norm": 0.78125, + "learning_rate": 0.00014657460426773704, + "loss": 0.9206, + "step": 22972 + }, + { + "epoch": 0.589882221911956, + "grad_norm": 0.74609375, + "learning_rate": 0.00014657065376699041, + "loss": 0.876, + "step": 22973 + }, + { + "epoch": 0.5899078991078779, + "grad_norm": 0.7890625, + "learning_rate": 0.00014656670317343275, + "loss": 0.9468, + "step": 22974 + }, + { + "epoch": 0.5899335763037997, + "grad_norm": 0.8203125, + "learning_rate": 0.0001465627524870718, + "loss": 0.8129, + "step": 22975 + }, + { + "epoch": 0.5899592534997216, + "grad_norm": 0.7109375, + "learning_rate": 0.00014655880170791552, + "loss": 0.7611, + "step": 22976 + }, + { + "epoch": 0.5899849306956434, + "grad_norm": 0.765625, + "learning_rate": 0.00014655485083597174, + "loss": 0.7233, + "step": 22977 + }, + { + "epoch": 0.5900106078915652, + "grad_norm": 0.7578125, + "learning_rate": 0.00014655089987124833, + "loss": 0.7742, + "step": 22978 + }, + { + "epoch": 0.5900362850874871, + "grad_norm": 0.75, + "learning_rate": 0.0001465469488137532, + "loss": 0.9841, + "step": 22979 + }, + { + "epoch": 0.5900619622834088, + "grad_norm": 0.72265625, + "learning_rate": 0.0001465429976634942, + "loss": 0.8253, + "step": 22980 + }, + { + "epoch": 0.5900876394793306, + "grad_norm": 0.70703125, + "learning_rate": 0.00014653904642047925, + "loss": 0.83, + "step": 22981 + }, + { + "epoch": 0.5901133166752525, + "grad_norm": 0.74609375, + "learning_rate": 0.00014653509508471613, + "loss": 0.766, + "step": 22982 + }, + { + "epoch": 0.5901389938711743, + "grad_norm": 0.8125, + "learning_rate": 0.00014653114365621278, + "loss": 0.9354, + "step": 22983 + }, + { + "epoch": 0.5901646710670961, + "grad_norm": 0.73828125, + "learning_rate": 0.0001465271921349771, + "loss": 0.919, + "step": 22984 + }, + { + "epoch": 0.590190348263018, + "grad_norm": 0.7890625, + "learning_rate": 0.0001465232405210169, + "loss": 0.8047, + "step": 22985 + }, + { + "epoch": 0.5902160254589398, + "grad_norm": 0.69921875, + "learning_rate": 0.00014651928881434008, + "loss": 0.8444, + "step": 22986 + }, + { + "epoch": 0.5902417026548615, + "grad_norm": 0.7578125, + "learning_rate": 0.00014651533701495455, + "loss": 0.8422, + "step": 22987 + }, + { + "epoch": 0.5902673798507834, + "grad_norm": 0.75390625, + "learning_rate": 0.0001465113851228681, + "loss": 0.8155, + "step": 22988 + }, + { + "epoch": 0.5902930570467052, + "grad_norm": 0.765625, + "learning_rate": 0.0001465074331380887, + "loss": 0.8115, + "step": 22989 + }, + { + "epoch": 0.590318734242627, + "grad_norm": 0.83203125, + "learning_rate": 0.0001465034810606242, + "loss": 0.848, + "step": 22990 + }, + { + "epoch": 0.5903444114385489, + "grad_norm": 0.81640625, + "learning_rate": 0.00014649952889048243, + "loss": 0.8743, + "step": 22991 + }, + { + "epoch": 0.5903700886344707, + "grad_norm": 0.81640625, + "learning_rate": 0.0001464955766276713, + "loss": 0.8121, + "step": 22992 + }, + { + "epoch": 0.5903957658303924, + "grad_norm": 0.8359375, + "learning_rate": 0.00014649162427219872, + "loss": 0.8615, + "step": 22993 + }, + { + "epoch": 0.5904214430263143, + "grad_norm": 0.83203125, + "learning_rate": 0.0001464876718240725, + "loss": 0.8613, + "step": 22994 + }, + { + "epoch": 0.5904471202222361, + "grad_norm": 0.7578125, + "learning_rate": 0.00014648371928330055, + "loss": 0.8944, + "step": 22995 + }, + { + "epoch": 0.590472797418158, + "grad_norm": 0.7578125, + "learning_rate": 0.00014647976664989077, + "loss": 0.8189, + "step": 22996 + }, + { + "epoch": 0.5904984746140798, + "grad_norm": 0.7734375, + "learning_rate": 0.000146475813923851, + "loss": 0.7911, + "step": 22997 + }, + { + "epoch": 0.5905241518100016, + "grad_norm": 0.78125, + "learning_rate": 0.00014647186110518912, + "loss": 0.8511, + "step": 22998 + }, + { + "epoch": 0.5905498290059235, + "grad_norm": 0.7421875, + "learning_rate": 0.000146467908193913, + "loss": 0.8178, + "step": 22999 + }, + { + "epoch": 0.5905755062018452, + "grad_norm": 0.74609375, + "learning_rate": 0.0001464639551900306, + "loss": 0.8476, + "step": 23000 + }, + { + "epoch": 0.5905755062018452, + "eval_loss": 0.8621730208396912, + "eval_runtime": 391.923, + "eval_samples_per_second": 25.515, + "eval_steps_per_second": 0.799, + "step": 23000 + }, + { + "epoch": 0.590601183397767, + "grad_norm": 0.76953125, + "learning_rate": 0.00014646000209354968, + "loss": 0.746, + "step": 23001 + }, + { + "epoch": 0.5906268605936889, + "grad_norm": 0.7578125, + "learning_rate": 0.0001464560489044782, + "loss": 0.7591, + "step": 23002 + }, + { + "epoch": 0.5906525377896107, + "grad_norm": 0.796875, + "learning_rate": 0.00014645209562282402, + "loss": 0.9481, + "step": 23003 + }, + { + "epoch": 0.5906782149855325, + "grad_norm": 0.81640625, + "learning_rate": 0.00014644814224859496, + "loss": 0.8734, + "step": 23004 + }, + { + "epoch": 0.5907038921814544, + "grad_norm": 0.73828125, + "learning_rate": 0.000146444188781799, + "loss": 0.7481, + "step": 23005 + }, + { + "epoch": 0.5907295693773762, + "grad_norm": 0.78125, + "learning_rate": 0.00014644023522244394, + "loss": 0.9059, + "step": 23006 + }, + { + "epoch": 0.5907552465732979, + "grad_norm": 0.79296875, + "learning_rate": 0.0001464362815705377, + "loss": 0.9199, + "step": 23007 + }, + { + "epoch": 0.5907809237692198, + "grad_norm": 0.75, + "learning_rate": 0.00014643232782608813, + "loss": 0.8031, + "step": 23008 + }, + { + "epoch": 0.5908066009651416, + "grad_norm": 0.75, + "learning_rate": 0.00014642837398910316, + "loss": 1.0016, + "step": 23009 + }, + { + "epoch": 0.5908322781610634, + "grad_norm": 0.8515625, + "learning_rate": 0.00014642442005959058, + "loss": 0.8877, + "step": 23010 + }, + { + "epoch": 0.5908579553569853, + "grad_norm": 0.73828125, + "learning_rate": 0.00014642046603755837, + "loss": 0.8038, + "step": 23011 + }, + { + "epoch": 0.5908836325529071, + "grad_norm": 0.84375, + "learning_rate": 0.0001464165119230143, + "loss": 0.9201, + "step": 23012 + }, + { + "epoch": 0.5909093097488288, + "grad_norm": 0.7421875, + "learning_rate": 0.00014641255771596638, + "loss": 0.8513, + "step": 23013 + }, + { + "epoch": 0.5909349869447507, + "grad_norm": 0.765625, + "learning_rate": 0.00014640860341642243, + "loss": 0.9406, + "step": 23014 + }, + { + "epoch": 0.5909606641406725, + "grad_norm": 0.8125, + "learning_rate": 0.0001464046490243903, + "loss": 0.9762, + "step": 23015 + }, + { + "epoch": 0.5909863413365943, + "grad_norm": 0.8203125, + "learning_rate": 0.00014640069453987787, + "loss": 0.895, + "step": 23016 + }, + { + "epoch": 0.5910120185325162, + "grad_norm": 0.796875, + "learning_rate": 0.00014639673996289312, + "loss": 0.8512, + "step": 23017 + }, + { + "epoch": 0.591037695728438, + "grad_norm": 0.7734375, + "learning_rate": 0.0001463927852934438, + "loss": 0.8962, + "step": 23018 + }, + { + "epoch": 0.5910633729243598, + "grad_norm": 0.76953125, + "learning_rate": 0.00014638883053153784, + "loss": 0.8217, + "step": 23019 + }, + { + "epoch": 0.5910890501202816, + "grad_norm": 0.80078125, + "learning_rate": 0.00014638487567718316, + "loss": 0.8719, + "step": 23020 + }, + { + "epoch": 0.5911147273162034, + "grad_norm": 0.75390625, + "learning_rate": 0.00014638092073038762, + "loss": 0.7874, + "step": 23021 + }, + { + "epoch": 0.5911404045121252, + "grad_norm": 0.8125, + "learning_rate": 0.00014637696569115907, + "loss": 0.9057, + "step": 23022 + }, + { + "epoch": 0.5911660817080471, + "grad_norm": 0.78125, + "learning_rate": 0.00014637301055950543, + "loss": 0.8308, + "step": 23023 + }, + { + "epoch": 0.5911917589039689, + "grad_norm": 0.7890625, + "learning_rate": 0.00014636905533543457, + "loss": 0.8643, + "step": 23024 + }, + { + "epoch": 0.5912174360998907, + "grad_norm": 0.6796875, + "learning_rate": 0.00014636510001895435, + "loss": 0.7303, + "step": 23025 + }, + { + "epoch": 0.5912431132958126, + "grad_norm": 0.765625, + "learning_rate": 0.00014636114461007269, + "loss": 0.8082, + "step": 23026 + }, + { + "epoch": 0.5912687904917343, + "grad_norm": 0.734375, + "learning_rate": 0.00014635718910879748, + "loss": 0.8215, + "step": 23027 + }, + { + "epoch": 0.5912944676876561, + "grad_norm": 0.6875, + "learning_rate": 0.00014635323351513653, + "loss": 0.7981, + "step": 23028 + }, + { + "epoch": 0.591320144883578, + "grad_norm": 0.796875, + "learning_rate": 0.00014634927782909783, + "loss": 0.8464, + "step": 23029 + }, + { + "epoch": 0.5913458220794998, + "grad_norm": 0.75390625, + "learning_rate": 0.00014634532205068919, + "loss": 0.7013, + "step": 23030 + }, + { + "epoch": 0.5913714992754217, + "grad_norm": 0.84765625, + "learning_rate": 0.00014634136617991845, + "loss": 0.9444, + "step": 23031 + }, + { + "epoch": 0.5913971764713435, + "grad_norm": 0.8828125, + "learning_rate": 0.0001463374102167936, + "loss": 0.9214, + "step": 23032 + }, + { + "epoch": 0.5914228536672652, + "grad_norm": 0.8671875, + "learning_rate": 0.0001463334541613225, + "loss": 0.8499, + "step": 23033 + }, + { + "epoch": 0.591448530863187, + "grad_norm": 0.87890625, + "learning_rate": 0.000146329498013513, + "loss": 0.7896, + "step": 23034 + }, + { + "epoch": 0.5914742080591089, + "grad_norm": 0.72265625, + "learning_rate": 0.00014632554177337296, + "loss": 0.9047, + "step": 23035 + }, + { + "epoch": 0.5914998852550307, + "grad_norm": 0.72265625, + "learning_rate": 0.00014632158544091033, + "loss": 0.8621, + "step": 23036 + }, + { + "epoch": 0.5915255624509526, + "grad_norm": 0.73828125, + "learning_rate": 0.00014631762901613294, + "loss": 0.88, + "step": 23037 + }, + { + "epoch": 0.5915512396468744, + "grad_norm": 0.80859375, + "learning_rate": 0.00014631367249904874, + "loss": 1.0037, + "step": 23038 + }, + { + "epoch": 0.5915769168427962, + "grad_norm": 0.78515625, + "learning_rate": 0.00014630971588966554, + "loss": 0.9264, + "step": 23039 + }, + { + "epoch": 0.591602594038718, + "grad_norm": 0.78515625, + "learning_rate": 0.00014630575918799128, + "loss": 0.8267, + "step": 23040 + }, + { + "epoch": 0.5916282712346398, + "grad_norm": 0.7734375, + "learning_rate": 0.0001463018023940338, + "loss": 0.7935, + "step": 23041 + }, + { + "epoch": 0.5916539484305616, + "grad_norm": 0.828125, + "learning_rate": 0.00014629784550780105, + "loss": 0.8472, + "step": 23042 + }, + { + "epoch": 0.5916796256264835, + "grad_norm": 0.73046875, + "learning_rate": 0.00014629388852930084, + "loss": 0.7845, + "step": 23043 + }, + { + "epoch": 0.5917053028224053, + "grad_norm": 0.8046875, + "learning_rate": 0.0001462899314585411, + "loss": 0.8555, + "step": 23044 + }, + { + "epoch": 0.5917309800183271, + "grad_norm": 0.76171875, + "learning_rate": 0.00014628597429552968, + "loss": 0.9038, + "step": 23045 + }, + { + "epoch": 0.591756657214249, + "grad_norm": 0.7890625, + "learning_rate": 0.00014628201704027457, + "loss": 0.9346, + "step": 23046 + }, + { + "epoch": 0.5917823344101707, + "grad_norm": 0.78515625, + "learning_rate": 0.00014627805969278352, + "loss": 0.8692, + "step": 23047 + }, + { + "epoch": 0.5918080116060925, + "grad_norm": 0.7421875, + "learning_rate": 0.00014627410225306447, + "loss": 0.7117, + "step": 23048 + }, + { + "epoch": 0.5918336888020144, + "grad_norm": 0.7265625, + "learning_rate": 0.00014627014472112538, + "loss": 0.81, + "step": 23049 + }, + { + "epoch": 0.5918593659979362, + "grad_norm": 0.68359375, + "learning_rate": 0.000146266187096974, + "loss": 0.8483, + "step": 23050 + }, + { + "epoch": 0.591885043193858, + "grad_norm": 0.78515625, + "learning_rate": 0.00014626222938061832, + "loss": 0.8484, + "step": 23051 + }, + { + "epoch": 0.5919107203897799, + "grad_norm": 0.90625, + "learning_rate": 0.00014625827157206618, + "loss": 0.7581, + "step": 23052 + }, + { + "epoch": 0.5919363975857016, + "grad_norm": 0.8125, + "learning_rate": 0.00014625431367132553, + "loss": 0.9994, + "step": 23053 + }, + { + "epoch": 0.5919620747816234, + "grad_norm": 0.765625, + "learning_rate": 0.00014625035567840416, + "loss": 0.8771, + "step": 23054 + }, + { + "epoch": 0.5919877519775453, + "grad_norm": 0.859375, + "learning_rate": 0.00014624639759331, + "loss": 0.9142, + "step": 23055 + }, + { + "epoch": 0.5920134291734671, + "grad_norm": 0.82421875, + "learning_rate": 0.00014624243941605098, + "loss": 0.9578, + "step": 23056 + }, + { + "epoch": 0.592039106369389, + "grad_norm": 0.75390625, + "learning_rate": 0.00014623848114663496, + "loss": 0.855, + "step": 23057 + }, + { + "epoch": 0.5920647835653108, + "grad_norm": 0.7421875, + "learning_rate": 0.0001462345227850698, + "loss": 0.7136, + "step": 23058 + }, + { + "epoch": 0.5920904607612326, + "grad_norm": 0.80859375, + "learning_rate": 0.00014623056433136342, + "loss": 0.9748, + "step": 23059 + }, + { + "epoch": 0.5921161379571543, + "grad_norm": 0.796875, + "learning_rate": 0.00014622660578552372, + "loss": 0.7558, + "step": 23060 + }, + { + "epoch": 0.5921418151530762, + "grad_norm": 0.83984375, + "learning_rate": 0.00014622264714755855, + "loss": 0.9139, + "step": 23061 + }, + { + "epoch": 0.592167492348998, + "grad_norm": 0.7890625, + "learning_rate": 0.00014621868841747584, + "loss": 0.9273, + "step": 23062 + }, + { + "epoch": 0.5921931695449199, + "grad_norm": 0.71484375, + "learning_rate": 0.00014621472959528342, + "loss": 0.9422, + "step": 23063 + }, + { + "epoch": 0.5922188467408417, + "grad_norm": 0.8203125, + "learning_rate": 0.00014621077068098924, + "loss": 0.8857, + "step": 23064 + }, + { + "epoch": 0.5922445239367635, + "grad_norm": 0.77734375, + "learning_rate": 0.00014620681167460116, + "loss": 0.8015, + "step": 23065 + }, + { + "epoch": 0.5922702011326854, + "grad_norm": 0.80078125, + "learning_rate": 0.0001462028525761271, + "loss": 0.8262, + "step": 23066 + }, + { + "epoch": 0.5922958783286071, + "grad_norm": 0.796875, + "learning_rate": 0.00014619889338557494, + "loss": 0.9073, + "step": 23067 + }, + { + "epoch": 0.5923215555245289, + "grad_norm": 0.921875, + "learning_rate": 0.0001461949341029525, + "loss": 0.973, + "step": 23068 + }, + { + "epoch": 0.5923472327204508, + "grad_norm": 0.75390625, + "learning_rate": 0.00014619097472826778, + "loss": 0.9525, + "step": 23069 + }, + { + "epoch": 0.5923729099163726, + "grad_norm": 0.78515625, + "learning_rate": 0.0001461870152615286, + "loss": 0.9169, + "step": 23070 + }, + { + "epoch": 0.5923985871122944, + "grad_norm": 0.828125, + "learning_rate": 0.00014618305570274285, + "loss": 0.9671, + "step": 23071 + }, + { + "epoch": 0.5924242643082163, + "grad_norm": 0.8125, + "learning_rate": 0.00014617909605191848, + "loss": 0.8714, + "step": 23072 + }, + { + "epoch": 0.592449941504138, + "grad_norm": 0.74609375, + "learning_rate": 0.00014617513630906334, + "loss": 0.8097, + "step": 23073 + }, + { + "epoch": 0.5924756187000598, + "grad_norm": 0.78125, + "learning_rate": 0.00014617117647418532, + "loss": 0.9666, + "step": 23074 + }, + { + "epoch": 0.5925012958959817, + "grad_norm": 0.765625, + "learning_rate": 0.0001461672165472923, + "loss": 0.8166, + "step": 23075 + }, + { + "epoch": 0.5925269730919035, + "grad_norm": 0.78515625, + "learning_rate": 0.00014616325652839221, + "loss": 0.8195, + "step": 23076 + }, + { + "epoch": 0.5925526502878253, + "grad_norm": 0.79296875, + "learning_rate": 0.00014615929641749288, + "loss": 0.897, + "step": 23077 + }, + { + "epoch": 0.5925783274837472, + "grad_norm": 0.77734375, + "learning_rate": 0.0001461553362146023, + "loss": 1.0052, + "step": 23078 + }, + { + "epoch": 0.592604004679669, + "grad_norm": 0.828125, + "learning_rate": 0.00014615137591972828, + "loss": 0.9006, + "step": 23079 + }, + { + "epoch": 0.5926296818755907, + "grad_norm": 0.7734375, + "learning_rate": 0.00014614741553287873, + "loss": 0.8106, + "step": 23080 + }, + { + "epoch": 0.5926553590715126, + "grad_norm": 0.84375, + "learning_rate": 0.00014614345505406152, + "loss": 1.0553, + "step": 23081 + }, + { + "epoch": 0.5926810362674344, + "grad_norm": 0.78125, + "learning_rate": 0.0001461394944832846, + "loss": 0.9198, + "step": 23082 + }, + { + "epoch": 0.5927067134633562, + "grad_norm": 0.828125, + "learning_rate": 0.00014613553382055586, + "loss": 0.9052, + "step": 23083 + }, + { + "epoch": 0.5927323906592781, + "grad_norm": 0.703125, + "learning_rate": 0.00014613157306588313, + "loss": 0.8815, + "step": 23084 + }, + { + "epoch": 0.5927580678551999, + "grad_norm": 0.87109375, + "learning_rate": 0.00014612761221927435, + "loss": 0.9097, + "step": 23085 + }, + { + "epoch": 0.5927837450511217, + "grad_norm": 0.77734375, + "learning_rate": 0.00014612365128073744, + "loss": 0.9651, + "step": 23086 + }, + { + "epoch": 0.5928094222470435, + "grad_norm": 0.7578125, + "learning_rate": 0.0001461196902502802, + "loss": 1.0846, + "step": 23087 + }, + { + "epoch": 0.5928350994429653, + "grad_norm": 0.7890625, + "learning_rate": 0.00014611572912791063, + "loss": 0.8225, + "step": 23088 + }, + { + "epoch": 0.5928607766388871, + "grad_norm": 0.75390625, + "learning_rate": 0.00014611176791363655, + "loss": 0.9031, + "step": 23089 + }, + { + "epoch": 0.592886453834809, + "grad_norm": 0.8203125, + "learning_rate": 0.0001461078066074659, + "loss": 0.9554, + "step": 23090 + }, + { + "epoch": 0.5929121310307308, + "grad_norm": 0.7421875, + "learning_rate": 0.00014610384520940655, + "loss": 0.7522, + "step": 23091 + }, + { + "epoch": 0.5929378082266527, + "grad_norm": 0.7734375, + "learning_rate": 0.0001460998837194664, + "loss": 0.9158, + "step": 23092 + }, + { + "epoch": 0.5929634854225744, + "grad_norm": 0.78125, + "learning_rate": 0.00014609592213765335, + "loss": 0.8651, + "step": 23093 + }, + { + "epoch": 0.5929891626184962, + "grad_norm": 0.8203125, + "learning_rate": 0.0001460919604639753, + "loss": 0.8814, + "step": 23094 + }, + { + "epoch": 0.593014839814418, + "grad_norm": 0.84765625, + "learning_rate": 0.00014608799869844012, + "loss": 0.9524, + "step": 23095 + }, + { + "epoch": 0.5930405170103399, + "grad_norm": 0.80078125, + "learning_rate": 0.00014608403684105572, + "loss": 0.9232, + "step": 23096 + }, + { + "epoch": 0.5930661942062617, + "grad_norm": 0.8125, + "learning_rate": 0.00014608007489183003, + "loss": 0.7519, + "step": 23097 + }, + { + "epoch": 0.5930918714021836, + "grad_norm": 0.7421875, + "learning_rate": 0.00014607611285077086, + "loss": 0.6918, + "step": 23098 + }, + { + "epoch": 0.5931175485981054, + "grad_norm": 0.7890625, + "learning_rate": 0.0001460721507178862, + "loss": 0.7769, + "step": 23099 + }, + { + "epoch": 0.5931432257940271, + "grad_norm": 0.734375, + "learning_rate": 0.00014606818849318386, + "loss": 0.8679, + "step": 23100 + }, + { + "epoch": 0.593168902989949, + "grad_norm": 0.7890625, + "learning_rate": 0.00014606422617667185, + "loss": 0.9415, + "step": 23101 + }, + { + "epoch": 0.5931945801858708, + "grad_norm": 0.7578125, + "learning_rate": 0.00014606026376835795, + "loss": 1.0096, + "step": 23102 + }, + { + "epoch": 0.5932202573817926, + "grad_norm": 0.78125, + "learning_rate": 0.0001460563012682501, + "loss": 0.8326, + "step": 23103 + }, + { + "epoch": 0.5932459345777145, + "grad_norm": 0.80078125, + "learning_rate": 0.00014605233867635625, + "loss": 0.9003, + "step": 23104 + }, + { + "epoch": 0.5932716117736363, + "grad_norm": 0.82421875, + "learning_rate": 0.0001460483759926842, + "loss": 0.9282, + "step": 23105 + }, + { + "epoch": 0.5932972889695581, + "grad_norm": 0.734375, + "learning_rate": 0.0001460444132172419, + "loss": 0.8697, + "step": 23106 + }, + { + "epoch": 0.5933229661654799, + "grad_norm": 0.75, + "learning_rate": 0.00014604045035003727, + "loss": 0.7725, + "step": 23107 + }, + { + "epoch": 0.5933486433614017, + "grad_norm": 0.76171875, + "learning_rate": 0.00014603648739107815, + "loss": 0.8377, + "step": 23108 + }, + { + "epoch": 0.5933743205573235, + "grad_norm": 0.80078125, + "learning_rate": 0.00014603252434037249, + "loss": 0.8976, + "step": 23109 + }, + { + "epoch": 0.5933999977532454, + "grad_norm": 0.80078125, + "learning_rate": 0.0001460285611979282, + "loss": 0.8209, + "step": 23110 + }, + { + "epoch": 0.5934256749491672, + "grad_norm": 0.76171875, + "learning_rate": 0.00014602459796375308, + "loss": 0.9716, + "step": 23111 + }, + { + "epoch": 0.593451352145089, + "grad_norm": 0.890625, + "learning_rate": 0.00014602063463785514, + "loss": 0.941, + "step": 23112 + }, + { + "epoch": 0.5934770293410108, + "grad_norm": 0.85546875, + "learning_rate": 0.0001460166712202422, + "loss": 0.9788, + "step": 23113 + }, + { + "epoch": 0.5935027065369326, + "grad_norm": 0.7734375, + "learning_rate": 0.00014601270771092222, + "loss": 0.8116, + "step": 23114 + }, + { + "epoch": 0.5935283837328544, + "grad_norm": 0.7109375, + "learning_rate": 0.00014600874410990305, + "loss": 0.8317, + "step": 23115 + }, + { + "epoch": 0.5935540609287763, + "grad_norm": 0.75, + "learning_rate": 0.0001460047804171926, + "loss": 0.8212, + "step": 23116 + }, + { + "epoch": 0.5935797381246981, + "grad_norm": 0.72265625, + "learning_rate": 0.0001460008166327988, + "loss": 0.805, + "step": 23117 + }, + { + "epoch": 0.59360541532062, + "grad_norm": 0.859375, + "learning_rate": 0.00014599685275672953, + "loss": 0.8545, + "step": 23118 + }, + { + "epoch": 0.5936310925165418, + "grad_norm": 0.8125, + "learning_rate": 0.00014599288878899266, + "loss": 0.7701, + "step": 23119 + }, + { + "epoch": 0.5936567697124635, + "grad_norm": 0.83203125, + "learning_rate": 0.00014598892472959615, + "loss": 0.9612, + "step": 23120 + }, + { + "epoch": 0.5936824469083853, + "grad_norm": 0.76171875, + "learning_rate": 0.00014598496057854785, + "loss": 0.9143, + "step": 23121 + }, + { + "epoch": 0.5937081241043072, + "grad_norm": 0.72265625, + "learning_rate": 0.00014598099633585568, + "loss": 0.8527, + "step": 23122 + }, + { + "epoch": 0.593733801300229, + "grad_norm": 0.75390625, + "learning_rate": 0.00014597703200152756, + "loss": 0.8058, + "step": 23123 + }, + { + "epoch": 0.5937594784961508, + "grad_norm": 0.74609375, + "learning_rate": 0.0001459730675755713, + "loss": 0.8476, + "step": 23124 + }, + { + "epoch": 0.5937851556920727, + "grad_norm": 0.8125, + "learning_rate": 0.00014596910305799494, + "loss": 0.9054, + "step": 23125 + }, + { + "epoch": 0.5938108328879945, + "grad_norm": 0.75390625, + "learning_rate": 0.00014596513844880628, + "loss": 0.8998, + "step": 23126 + }, + { + "epoch": 0.5938365100839162, + "grad_norm": 0.73828125, + "learning_rate": 0.00014596117374801324, + "loss": 0.8465, + "step": 23127 + }, + { + "epoch": 0.5938621872798381, + "grad_norm": 0.734375, + "learning_rate": 0.00014595720895562377, + "loss": 0.8442, + "step": 23128 + }, + { + "epoch": 0.5938878644757599, + "grad_norm": 0.7265625, + "learning_rate": 0.0001459532440716457, + "loss": 0.9071, + "step": 23129 + }, + { + "epoch": 0.5939135416716818, + "grad_norm": 0.828125, + "learning_rate": 0.000145949279096087, + "loss": 0.7619, + "step": 23130 + }, + { + "epoch": 0.5939392188676036, + "grad_norm": 0.75390625, + "learning_rate": 0.0001459453140289555, + "loss": 0.8116, + "step": 23131 + }, + { + "epoch": 0.5939648960635254, + "grad_norm": 0.7265625, + "learning_rate": 0.00014594134887025914, + "loss": 0.8488, + "step": 23132 + }, + { + "epoch": 0.5939905732594472, + "grad_norm": 0.7421875, + "learning_rate": 0.00014593738362000583, + "loss": 0.8451, + "step": 23133 + }, + { + "epoch": 0.594016250455369, + "grad_norm": 0.86328125, + "learning_rate": 0.0001459334182782035, + "loss": 0.8365, + "step": 23134 + }, + { + "epoch": 0.5940419276512908, + "grad_norm": 0.7734375, + "learning_rate": 0.00014592945284485996, + "loss": 0.8534, + "step": 23135 + }, + { + "epoch": 0.5940676048472127, + "grad_norm": 0.85546875, + "learning_rate": 0.0001459254873199832, + "loss": 0.8525, + "step": 23136 + }, + { + "epoch": 0.5940932820431345, + "grad_norm": 0.734375, + "learning_rate": 0.00014592152170358108, + "loss": 0.706, + "step": 23137 + }, + { + "epoch": 0.5941189592390563, + "grad_norm": 0.7265625, + "learning_rate": 0.00014591755599566153, + "loss": 0.8278, + "step": 23138 + }, + { + "epoch": 0.5941446364349782, + "grad_norm": 0.734375, + "learning_rate": 0.00014591359019623245, + "loss": 0.9142, + "step": 23139 + }, + { + "epoch": 0.5941703136308999, + "grad_norm": 0.8046875, + "learning_rate": 0.0001459096243053017, + "loss": 0.7561, + "step": 23140 + }, + { + "epoch": 0.5941959908268217, + "grad_norm": 0.8046875, + "learning_rate": 0.00014590565832287724, + "loss": 0.7729, + "step": 23141 + }, + { + "epoch": 0.5942216680227436, + "grad_norm": 0.7734375, + "learning_rate": 0.00014590169224896696, + "loss": 0.9258, + "step": 23142 + }, + { + "epoch": 0.5942473452186654, + "grad_norm": 0.7890625, + "learning_rate": 0.0001458977260835787, + "loss": 0.7936, + "step": 23143 + }, + { + "epoch": 0.5942730224145872, + "grad_norm": 0.76953125, + "learning_rate": 0.00014589375982672044, + "loss": 0.9287, + "step": 23144 + }, + { + "epoch": 0.5942986996105091, + "grad_norm": 0.80078125, + "learning_rate": 0.00014588979347840008, + "loss": 0.8055, + "step": 23145 + }, + { + "epoch": 0.5943243768064308, + "grad_norm": 0.765625, + "learning_rate": 0.00014588582703862552, + "loss": 0.7808, + "step": 23146 + }, + { + "epoch": 0.5943500540023526, + "grad_norm": 0.8125, + "learning_rate": 0.00014588186050740464, + "loss": 0.9362, + "step": 23147 + }, + { + "epoch": 0.5943757311982745, + "grad_norm": 0.796875, + "learning_rate": 0.00014587789388474534, + "loss": 0.8029, + "step": 23148 + }, + { + "epoch": 0.5944014083941963, + "grad_norm": 0.80078125, + "learning_rate": 0.00014587392717065552, + "loss": 0.8605, + "step": 23149 + }, + { + "epoch": 0.5944270855901181, + "grad_norm": 0.77734375, + "learning_rate": 0.00014586996036514315, + "loss": 0.8775, + "step": 23150 + }, + { + "epoch": 0.59445276278604, + "grad_norm": 0.75, + "learning_rate": 0.00014586599346821607, + "loss": 0.9038, + "step": 23151 + }, + { + "epoch": 0.5944784399819618, + "grad_norm": 0.7109375, + "learning_rate": 0.00014586202647988222, + "loss": 0.8807, + "step": 23152 + }, + { + "epoch": 0.5945041171778835, + "grad_norm": 0.70703125, + "learning_rate": 0.0001458580594001495, + "loss": 0.7704, + "step": 23153 + }, + { + "epoch": 0.5945297943738054, + "grad_norm": 0.77734375, + "learning_rate": 0.0001458540922290258, + "loss": 0.8325, + "step": 23154 + }, + { + "epoch": 0.5945554715697272, + "grad_norm": 0.7578125, + "learning_rate": 0.00014585012496651903, + "loss": 0.7496, + "step": 23155 + }, + { + "epoch": 0.594581148765649, + "grad_norm": 0.75, + "learning_rate": 0.00014584615761263712, + "loss": 0.8021, + "step": 23156 + }, + { + "epoch": 0.5946068259615709, + "grad_norm": 0.8046875, + "learning_rate": 0.00014584219016738794, + "loss": 0.8547, + "step": 23157 + }, + { + "epoch": 0.5946325031574927, + "grad_norm": 0.7421875, + "learning_rate": 0.00014583822263077942, + "loss": 0.9033, + "step": 23158 + }, + { + "epoch": 0.5946581803534146, + "grad_norm": 0.765625, + "learning_rate": 0.0001458342550028195, + "loss": 0.8641, + "step": 23159 + }, + { + "epoch": 0.5946838575493363, + "grad_norm": 0.76953125, + "learning_rate": 0.000145830287283516, + "loss": 0.8261, + "step": 23160 + }, + { + "epoch": 0.5947095347452581, + "grad_norm": 0.80859375, + "learning_rate": 0.0001458263194728769, + "loss": 0.8827, + "step": 23161 + }, + { + "epoch": 0.59473521194118, + "grad_norm": 0.78125, + "learning_rate": 0.00014582235157091007, + "loss": 0.8215, + "step": 23162 + }, + { + "epoch": 0.5947608891371018, + "grad_norm": 0.76953125, + "learning_rate": 0.00014581838357762345, + "loss": 0.7998, + "step": 23163 + }, + { + "epoch": 0.5947865663330236, + "grad_norm": 0.765625, + "learning_rate": 0.0001458144154930249, + "loss": 0.8369, + "step": 23164 + }, + { + "epoch": 0.5948122435289455, + "grad_norm": 0.79296875, + "learning_rate": 0.0001458104473171224, + "loss": 0.976, + "step": 23165 + }, + { + "epoch": 0.5948379207248672, + "grad_norm": 0.89453125, + "learning_rate": 0.00014580647904992379, + "loss": 1.0442, + "step": 23166 + }, + { + "epoch": 0.594863597920789, + "grad_norm": 0.7421875, + "learning_rate": 0.000145802510691437, + "loss": 0.9052, + "step": 23167 + }, + { + "epoch": 0.5948892751167109, + "grad_norm": 0.76171875, + "learning_rate": 0.00014579854224166998, + "loss": 0.9363, + "step": 23168 + }, + { + "epoch": 0.5949149523126327, + "grad_norm": 0.79296875, + "learning_rate": 0.00014579457370063055, + "loss": 0.8773, + "step": 23169 + }, + { + "epoch": 0.5949406295085545, + "grad_norm": 0.796875, + "learning_rate": 0.0001457906050683267, + "loss": 0.8647, + "step": 23170 + }, + { + "epoch": 0.5949663067044764, + "grad_norm": 0.82421875, + "learning_rate": 0.00014578663634476634, + "loss": 0.9438, + "step": 23171 + }, + { + "epoch": 0.5949919839003982, + "grad_norm": 0.75390625, + "learning_rate": 0.00014578266752995732, + "loss": 0.7805, + "step": 23172 + }, + { + "epoch": 0.5950176610963199, + "grad_norm": 0.68359375, + "learning_rate": 0.00014577869862390756, + "loss": 0.8304, + "step": 23173 + }, + { + "epoch": 0.5950433382922418, + "grad_norm": 0.7734375, + "learning_rate": 0.000145774729626625, + "loss": 0.8697, + "step": 23174 + }, + { + "epoch": 0.5950690154881636, + "grad_norm": 0.7890625, + "learning_rate": 0.00014577076053811755, + "loss": 0.7847, + "step": 23175 + }, + { + "epoch": 0.5950946926840854, + "grad_norm": 0.71484375, + "learning_rate": 0.0001457667913583931, + "loss": 0.8541, + "step": 23176 + }, + { + "epoch": 0.5951203698800073, + "grad_norm": 0.76171875, + "learning_rate": 0.00014576282208745957, + "loss": 0.9455, + "step": 23177 + }, + { + "epoch": 0.5951460470759291, + "grad_norm": 0.75390625, + "learning_rate": 0.0001457588527253249, + "loss": 0.7001, + "step": 23178 + }, + { + "epoch": 0.5951717242718509, + "grad_norm": 0.9140625, + "learning_rate": 0.00014575488327199694, + "loss": 0.9108, + "step": 23179 + }, + { + "epoch": 0.5951974014677727, + "grad_norm": 0.77734375, + "learning_rate": 0.00014575091372748361, + "loss": 0.8739, + "step": 23180 + }, + { + "epoch": 0.5952230786636945, + "grad_norm": 0.796875, + "learning_rate": 0.00014574694409179287, + "loss": 0.9191, + "step": 23181 + }, + { + "epoch": 0.5952487558596163, + "grad_norm": 0.80078125, + "learning_rate": 0.0001457429743649326, + "loss": 0.8377, + "step": 23182 + }, + { + "epoch": 0.5952744330555382, + "grad_norm": 0.796875, + "learning_rate": 0.0001457390045469107, + "loss": 0.9169, + "step": 23183 + }, + { + "epoch": 0.59530011025146, + "grad_norm": 0.8125, + "learning_rate": 0.00014573503463773512, + "loss": 0.8268, + "step": 23184 + }, + { + "epoch": 0.5953257874473818, + "grad_norm": 0.80859375, + "learning_rate": 0.00014573106463741374, + "loss": 0.8883, + "step": 23185 + }, + { + "epoch": 0.5953514646433036, + "grad_norm": 0.85546875, + "learning_rate": 0.0001457270945459545, + "loss": 0.8776, + "step": 23186 + }, + { + "epoch": 0.5953771418392254, + "grad_norm": 1.0078125, + "learning_rate": 0.00014572312436336523, + "loss": 0.7838, + "step": 23187 + }, + { + "epoch": 0.5954028190351472, + "grad_norm": 0.80859375, + "learning_rate": 0.00014571915408965392, + "loss": 0.7304, + "step": 23188 + }, + { + "epoch": 0.5954284962310691, + "grad_norm": 0.859375, + "learning_rate": 0.00014571518372482852, + "loss": 0.8598, + "step": 23189 + }, + { + "epoch": 0.5954541734269909, + "grad_norm": 0.75390625, + "learning_rate": 0.00014571121326889682, + "loss": 0.9159, + "step": 23190 + }, + { + "epoch": 0.5954798506229128, + "grad_norm": 0.76171875, + "learning_rate": 0.00014570724272186685, + "loss": 0.9159, + "step": 23191 + }, + { + "epoch": 0.5955055278188346, + "grad_norm": 0.70703125, + "learning_rate": 0.00014570327208374647, + "loss": 0.7895, + "step": 23192 + }, + { + "epoch": 0.5955312050147563, + "grad_norm": 0.80078125, + "learning_rate": 0.00014569930135454356, + "loss": 0.8975, + "step": 23193 + }, + { + "epoch": 0.5955568822106782, + "grad_norm": 0.70703125, + "learning_rate": 0.0001456953305342661, + "loss": 0.7901, + "step": 23194 + }, + { + "epoch": 0.5955825594066, + "grad_norm": 0.7421875, + "learning_rate": 0.00014569135962292198, + "loss": 0.7715, + "step": 23195 + }, + { + "epoch": 0.5956082366025218, + "grad_norm": 0.74609375, + "learning_rate": 0.00014568738862051907, + "loss": 0.8346, + "step": 23196 + }, + { + "epoch": 0.5956339137984437, + "grad_norm": 0.78125, + "learning_rate": 0.00014568341752706535, + "loss": 0.9523, + "step": 23197 + }, + { + "epoch": 0.5956595909943655, + "grad_norm": 0.75, + "learning_rate": 0.0001456794463425687, + "loss": 0.8427, + "step": 23198 + }, + { + "epoch": 0.5956852681902873, + "grad_norm": 0.84375, + "learning_rate": 0.00014567547506703704, + "loss": 0.9077, + "step": 23199 + }, + { + "epoch": 0.5957109453862091, + "grad_norm": 0.796875, + "learning_rate": 0.0001456715037004783, + "loss": 0.8786, + "step": 23200 + }, + { + "epoch": 0.5957366225821309, + "grad_norm": 0.7265625, + "learning_rate": 0.00014566753224290035, + "loss": 0.9127, + "step": 23201 + }, + { + "epoch": 0.5957622997780527, + "grad_norm": 0.79296875, + "learning_rate": 0.00014566356069431116, + "loss": 0.8439, + "step": 23202 + }, + { + "epoch": 0.5957879769739746, + "grad_norm": 0.796875, + "learning_rate": 0.0001456595890547186, + "loss": 0.8513, + "step": 23203 + }, + { + "epoch": 0.5958136541698964, + "grad_norm": 0.67578125, + "learning_rate": 0.00014565561732413063, + "loss": 0.8423, + "step": 23204 + }, + { + "epoch": 0.5958393313658182, + "grad_norm": 0.73828125, + "learning_rate": 0.0001456516455025551, + "loss": 0.8861, + "step": 23205 + }, + { + "epoch": 0.59586500856174, + "grad_norm": 0.80859375, + "learning_rate": 0.00014564767359, + "loss": 0.8486, + "step": 23206 + }, + { + "epoch": 0.5958906857576618, + "grad_norm": 0.77734375, + "learning_rate": 0.00014564370158647318, + "loss": 0.9112, + "step": 23207 + }, + { + "epoch": 0.5959163629535836, + "grad_norm": 0.765625, + "learning_rate": 0.0001456397294919826, + "loss": 0.8753, + "step": 23208 + }, + { + "epoch": 0.5959420401495055, + "grad_norm": 0.6953125, + "learning_rate": 0.00014563575730653614, + "loss": 0.7918, + "step": 23209 + }, + { + "epoch": 0.5959677173454273, + "grad_norm": 0.66796875, + "learning_rate": 0.00014563178503014177, + "loss": 0.8367, + "step": 23210 + }, + { + "epoch": 0.5959933945413491, + "grad_norm": 0.7109375, + "learning_rate": 0.0001456278126628074, + "loss": 0.8503, + "step": 23211 + }, + { + "epoch": 0.596019071737271, + "grad_norm": 0.7421875, + "learning_rate": 0.00014562384020454084, + "loss": 0.7748, + "step": 23212 + }, + { + "epoch": 0.5960447489331927, + "grad_norm": 0.75390625, + "learning_rate": 0.00014561986765535014, + "loss": 0.8527, + "step": 23213 + }, + { + "epoch": 0.5960704261291145, + "grad_norm": 0.765625, + "learning_rate": 0.00014561589501524313, + "loss": 0.8263, + "step": 23214 + }, + { + "epoch": 0.5960961033250364, + "grad_norm": 0.74609375, + "learning_rate": 0.00014561192228422783, + "loss": 0.7245, + "step": 23215 + }, + { + "epoch": 0.5961217805209582, + "grad_norm": 0.76953125, + "learning_rate": 0.000145607949462312, + "loss": 0.8288, + "step": 23216 + }, + { + "epoch": 0.59614745771688, + "grad_norm": 0.76953125, + "learning_rate": 0.00014560397654950373, + "loss": 0.8042, + "step": 23217 + }, + { + "epoch": 0.5961731349128019, + "grad_norm": 0.7890625, + "learning_rate": 0.0001456000035458108, + "loss": 0.915, + "step": 23218 + }, + { + "epoch": 0.5961988121087237, + "grad_norm": 0.7109375, + "learning_rate": 0.00014559603045124123, + "loss": 0.8547, + "step": 23219 + }, + { + "epoch": 0.5962244893046454, + "grad_norm": 0.78125, + "learning_rate": 0.00014559205726580283, + "loss": 0.9365, + "step": 23220 + }, + { + "epoch": 0.5962501665005673, + "grad_norm": 0.75390625, + "learning_rate": 0.0001455880839895036, + "loss": 0.7994, + "step": 23221 + }, + { + "epoch": 0.5962758436964891, + "grad_norm": 0.75, + "learning_rate": 0.00014558411062235146, + "loss": 0.8516, + "step": 23222 + }, + { + "epoch": 0.596301520892411, + "grad_norm": 0.734375, + "learning_rate": 0.00014558013716435428, + "loss": 0.7813, + "step": 23223 + }, + { + "epoch": 0.5963271980883328, + "grad_norm": 0.8203125, + "learning_rate": 0.00014557616361552002, + "loss": 0.9846, + "step": 23224 + }, + { + "epoch": 0.5963528752842546, + "grad_norm": 0.75390625, + "learning_rate": 0.0001455721899758566, + "loss": 0.89, + "step": 23225 + }, + { + "epoch": 0.5963785524801763, + "grad_norm": 0.70703125, + "learning_rate": 0.00014556821624537187, + "loss": 0.7442, + "step": 23226 + }, + { + "epoch": 0.5964042296760982, + "grad_norm": 0.8203125, + "learning_rate": 0.00014556424242407383, + "loss": 0.8421, + "step": 23227 + }, + { + "epoch": 0.59642990687202, + "grad_norm": 0.7109375, + "learning_rate": 0.00014556026851197034, + "loss": 0.7783, + "step": 23228 + }, + { + "epoch": 0.5964555840679419, + "grad_norm": 0.78515625, + "learning_rate": 0.0001455562945090694, + "loss": 0.9931, + "step": 23229 + }, + { + "epoch": 0.5964812612638637, + "grad_norm": 0.7734375, + "learning_rate": 0.00014555232041537885, + "loss": 0.8432, + "step": 23230 + }, + { + "epoch": 0.5965069384597855, + "grad_norm": 0.79296875, + "learning_rate": 0.00014554834623090667, + "loss": 0.9104, + "step": 23231 + }, + { + "epoch": 0.5965326156557074, + "grad_norm": 0.8359375, + "learning_rate": 0.00014554437195566072, + "loss": 0.9593, + "step": 23232 + }, + { + "epoch": 0.5965582928516291, + "grad_norm": 0.703125, + "learning_rate": 0.00014554039758964894, + "loss": 0.7752, + "step": 23233 + }, + { + "epoch": 0.5965839700475509, + "grad_norm": 0.75390625, + "learning_rate": 0.0001455364231328793, + "loss": 0.8846, + "step": 23234 + }, + { + "epoch": 0.5966096472434728, + "grad_norm": 0.80078125, + "learning_rate": 0.00014553244858535967, + "loss": 0.8447, + "step": 23235 + }, + { + "epoch": 0.5966353244393946, + "grad_norm": 1.0, + "learning_rate": 0.00014552847394709798, + "loss": 0.8914, + "step": 23236 + }, + { + "epoch": 0.5966610016353164, + "grad_norm": 0.734375, + "learning_rate": 0.00014552449921810217, + "loss": 0.8346, + "step": 23237 + }, + { + "epoch": 0.5966866788312383, + "grad_norm": 0.765625, + "learning_rate": 0.0001455205243983801, + "loss": 0.7587, + "step": 23238 + }, + { + "epoch": 0.5967123560271601, + "grad_norm": 0.796875, + "learning_rate": 0.00014551654948793976, + "loss": 0.8523, + "step": 23239 + }, + { + "epoch": 0.5967380332230818, + "grad_norm": 0.84765625, + "learning_rate": 0.00014551257448678904, + "loss": 1.0012, + "step": 23240 + }, + { + "epoch": 0.5967637104190037, + "grad_norm": 0.8046875, + "learning_rate": 0.0001455085993949359, + "loss": 0.8157, + "step": 23241 + }, + { + "epoch": 0.5967893876149255, + "grad_norm": 0.80859375, + "learning_rate": 0.0001455046242123882, + "loss": 0.9054, + "step": 23242 + }, + { + "epoch": 0.5968150648108473, + "grad_norm": 0.8203125, + "learning_rate": 0.00014550064893915392, + "loss": 0.9572, + "step": 23243 + }, + { + "epoch": 0.5968407420067692, + "grad_norm": 0.8671875, + "learning_rate": 0.00014549667357524094, + "loss": 0.9327, + "step": 23244 + }, + { + "epoch": 0.596866419202691, + "grad_norm": 0.7578125, + "learning_rate": 0.0001454926981206572, + "loss": 0.7808, + "step": 23245 + }, + { + "epoch": 0.5968920963986127, + "grad_norm": 0.76953125, + "learning_rate": 0.00014548872257541062, + "loss": 0.8634, + "step": 23246 + }, + { + "epoch": 0.5969177735945346, + "grad_norm": 0.7734375, + "learning_rate": 0.00014548474693950914, + "loss": 1.0206, + "step": 23247 + }, + { + "epoch": 0.5969434507904564, + "grad_norm": 0.85546875, + "learning_rate": 0.00014548077121296068, + "loss": 0.8181, + "step": 23248 + }, + { + "epoch": 0.5969691279863782, + "grad_norm": 0.74609375, + "learning_rate": 0.0001454767953957731, + "loss": 0.7918, + "step": 23249 + }, + { + "epoch": 0.5969948051823001, + "grad_norm": 0.7265625, + "learning_rate": 0.00014547281948795442, + "loss": 0.8765, + "step": 23250 + }, + { + "epoch": 0.5970204823782219, + "grad_norm": 0.68359375, + "learning_rate": 0.00014546884348951247, + "loss": 0.7348, + "step": 23251 + }, + { + "epoch": 0.5970461595741438, + "grad_norm": 0.7734375, + "learning_rate": 0.0001454648674004553, + "loss": 0.9442, + "step": 23252 + }, + { + "epoch": 0.5970718367700655, + "grad_norm": 0.77734375, + "learning_rate": 0.0001454608912207907, + "loss": 0.8584, + "step": 23253 + }, + { + "epoch": 0.5970975139659873, + "grad_norm": 0.8046875, + "learning_rate": 0.00014545691495052664, + "loss": 0.8447, + "step": 23254 + }, + { + "epoch": 0.5971231911619092, + "grad_norm": 0.7890625, + "learning_rate": 0.0001454529385896711, + "loss": 0.8273, + "step": 23255 + }, + { + "epoch": 0.597148868357831, + "grad_norm": 0.79296875, + "learning_rate": 0.0001454489621382319, + "loss": 0.8778, + "step": 23256 + }, + { + "epoch": 0.5971745455537528, + "grad_norm": 0.890625, + "learning_rate": 0.00014544498559621708, + "loss": 0.9051, + "step": 23257 + }, + { + "epoch": 0.5972002227496747, + "grad_norm": 0.796875, + "learning_rate": 0.00014544100896363447, + "loss": 0.7942, + "step": 23258 + }, + { + "epoch": 0.5972258999455965, + "grad_norm": 0.7578125, + "learning_rate": 0.00014543703224049207, + "loss": 0.8532, + "step": 23259 + }, + { + "epoch": 0.5972515771415182, + "grad_norm": 0.734375, + "learning_rate": 0.00014543305542679774, + "loss": 0.8932, + "step": 23260 + }, + { + "epoch": 0.5972772543374401, + "grad_norm": 0.7578125, + "learning_rate": 0.00014542907852255945, + "loss": 1.0044, + "step": 23261 + }, + { + "epoch": 0.5973029315333619, + "grad_norm": 0.85546875, + "learning_rate": 0.0001454251015277851, + "loss": 0.8754, + "step": 23262 + }, + { + "epoch": 0.5973286087292837, + "grad_norm": 0.77734375, + "learning_rate": 0.00014542112444248265, + "loss": 0.9286, + "step": 23263 + }, + { + "epoch": 0.5973542859252056, + "grad_norm": 0.7421875, + "learning_rate": 0.00014541714726665997, + "loss": 0.8782, + "step": 23264 + }, + { + "epoch": 0.5973799631211274, + "grad_norm": 0.73828125, + "learning_rate": 0.00014541317000032504, + "loss": 0.8697, + "step": 23265 + }, + { + "epoch": 0.5974056403170491, + "grad_norm": 0.75390625, + "learning_rate": 0.00014540919264348573, + "loss": 0.7967, + "step": 23266 + }, + { + "epoch": 0.597431317512971, + "grad_norm": 0.859375, + "learning_rate": 0.00014540521519615005, + "loss": 0.8493, + "step": 23267 + }, + { + "epoch": 0.5974569947088928, + "grad_norm": 0.73828125, + "learning_rate": 0.00014540123765832585, + "loss": 0.8135, + "step": 23268 + }, + { + "epoch": 0.5974826719048146, + "grad_norm": 0.8359375, + "learning_rate": 0.00014539726003002108, + "loss": 0.8748, + "step": 23269 + }, + { + "epoch": 0.5975083491007365, + "grad_norm": 0.828125, + "learning_rate": 0.0001453932823112437, + "loss": 0.9235, + "step": 23270 + }, + { + "epoch": 0.5975340262966583, + "grad_norm": 0.79296875, + "learning_rate": 0.00014538930450200158, + "loss": 0.8732, + "step": 23271 + }, + { + "epoch": 0.5975597034925801, + "grad_norm": 0.828125, + "learning_rate": 0.00014538532660230268, + "loss": 0.868, + "step": 23272 + }, + { + "epoch": 0.5975853806885019, + "grad_norm": 0.82421875, + "learning_rate": 0.00014538134861215491, + "loss": 0.8916, + "step": 23273 + }, + { + "epoch": 0.5976110578844237, + "grad_norm": 0.87890625, + "learning_rate": 0.00014537737053156623, + "loss": 0.9053, + "step": 23274 + }, + { + "epoch": 0.5976367350803455, + "grad_norm": 0.77734375, + "learning_rate": 0.00014537339236054457, + "loss": 0.7913, + "step": 23275 + }, + { + "epoch": 0.5976624122762674, + "grad_norm": 0.8046875, + "learning_rate": 0.0001453694140990978, + "loss": 0.9069, + "step": 23276 + }, + { + "epoch": 0.5976880894721892, + "grad_norm": 0.8203125, + "learning_rate": 0.00014536543574723392, + "loss": 0.8794, + "step": 23277 + }, + { + "epoch": 0.597713766668111, + "grad_norm": 0.78125, + "learning_rate": 0.0001453614573049608, + "loss": 0.8094, + "step": 23278 + }, + { + "epoch": 0.5977394438640329, + "grad_norm": 0.73046875, + "learning_rate": 0.0001453574787722864, + "loss": 0.8693, + "step": 23279 + }, + { + "epoch": 0.5977651210599546, + "grad_norm": 0.7265625, + "learning_rate": 0.00014535350014921865, + "loss": 0.7505, + "step": 23280 + }, + { + "epoch": 0.5977907982558764, + "grad_norm": 0.78125, + "learning_rate": 0.00014534952143576544, + "loss": 0.8998, + "step": 23281 + }, + { + "epoch": 0.5978164754517983, + "grad_norm": 0.75390625, + "learning_rate": 0.00014534554263193476, + "loss": 0.9534, + "step": 23282 + }, + { + "epoch": 0.5978421526477201, + "grad_norm": 0.83203125, + "learning_rate": 0.0001453415637377345, + "loss": 0.8865, + "step": 23283 + }, + { + "epoch": 0.597867829843642, + "grad_norm": 0.70703125, + "learning_rate": 0.0001453375847531726, + "loss": 0.7374, + "step": 23284 + }, + { + "epoch": 0.5978935070395638, + "grad_norm": 0.78515625, + "learning_rate": 0.000145333605678257, + "loss": 0.8768, + "step": 23285 + }, + { + "epoch": 0.5979191842354855, + "grad_norm": 0.8515625, + "learning_rate": 0.0001453296265129956, + "loss": 0.9043, + "step": 23286 + }, + { + "epoch": 0.5979448614314073, + "grad_norm": 0.7265625, + "learning_rate": 0.00014532564725739636, + "loss": 0.8675, + "step": 23287 + }, + { + "epoch": 0.5979705386273292, + "grad_norm": 0.84375, + "learning_rate": 0.0001453216679114672, + "loss": 0.8636, + "step": 23288 + }, + { + "epoch": 0.597996215823251, + "grad_norm": 0.7890625, + "learning_rate": 0.00014531768847521602, + "loss": 0.8064, + "step": 23289 + }, + { + "epoch": 0.5980218930191729, + "grad_norm": 0.75, + "learning_rate": 0.00014531370894865084, + "loss": 0.8057, + "step": 23290 + }, + { + "epoch": 0.5980475702150947, + "grad_norm": 0.72265625, + "learning_rate": 0.0001453097293317795, + "loss": 0.9123, + "step": 23291 + }, + { + "epoch": 0.5980732474110165, + "grad_norm": 0.78125, + "learning_rate": 0.00014530574962460998, + "loss": 0.8646, + "step": 23292 + }, + { + "epoch": 0.5980989246069383, + "grad_norm": 0.80859375, + "learning_rate": 0.00014530176982715016, + "loss": 0.8733, + "step": 23293 + }, + { + "epoch": 0.5981246018028601, + "grad_norm": 0.90234375, + "learning_rate": 0.00014529778993940798, + "loss": 0.8175, + "step": 23294 + }, + { + "epoch": 0.5981502789987819, + "grad_norm": 0.8046875, + "learning_rate": 0.00014529380996139145, + "loss": 0.8918, + "step": 23295 + }, + { + "epoch": 0.5981759561947038, + "grad_norm": 0.8046875, + "learning_rate": 0.00014528982989310847, + "loss": 0.8095, + "step": 23296 + }, + { + "epoch": 0.5982016333906256, + "grad_norm": 0.73828125, + "learning_rate": 0.0001452858497345669, + "loss": 0.7931, + "step": 23297 + }, + { + "epoch": 0.5982273105865474, + "grad_norm": 0.78125, + "learning_rate": 0.00014528186948577473, + "loss": 0.8438, + "step": 23298 + }, + { + "epoch": 0.5982529877824693, + "grad_norm": 0.75, + "learning_rate": 0.00014527788914673988, + "loss": 0.7399, + "step": 23299 + }, + { + "epoch": 0.598278664978391, + "grad_norm": 0.78125, + "learning_rate": 0.00014527390871747032, + "loss": 0.7914, + "step": 23300 + }, + { + "epoch": 0.5983043421743128, + "grad_norm": 0.765625, + "learning_rate": 0.00014526992819797392, + "loss": 0.9919, + "step": 23301 + }, + { + "epoch": 0.5983300193702347, + "grad_norm": 0.7734375, + "learning_rate": 0.00014526594758825862, + "loss": 0.7493, + "step": 23302 + }, + { + "epoch": 0.5983556965661565, + "grad_norm": 0.8046875, + "learning_rate": 0.00014526196688833244, + "loss": 0.8804, + "step": 23303 + }, + { + "epoch": 0.5983813737620783, + "grad_norm": 0.796875, + "learning_rate": 0.0001452579860982032, + "loss": 0.7727, + "step": 23304 + }, + { + "epoch": 0.5984070509580002, + "grad_norm": 0.75, + "learning_rate": 0.00014525400521787888, + "loss": 0.9135, + "step": 23305 + }, + { + "epoch": 0.5984327281539219, + "grad_norm": 0.77734375, + "learning_rate": 0.00014525002424736744, + "loss": 0.8286, + "step": 23306 + }, + { + "epoch": 0.5984584053498437, + "grad_norm": 0.8203125, + "learning_rate": 0.0001452460431866768, + "loss": 0.8362, + "step": 23307 + }, + { + "epoch": 0.5984840825457656, + "grad_norm": 0.7578125, + "learning_rate": 0.00014524206203581484, + "loss": 0.8278, + "step": 23308 + }, + { + "epoch": 0.5985097597416874, + "grad_norm": 0.8046875, + "learning_rate": 0.00014523808079478957, + "loss": 0.7552, + "step": 23309 + }, + { + "epoch": 0.5985354369376092, + "grad_norm": 0.80859375, + "learning_rate": 0.0001452340994636089, + "loss": 0.8911, + "step": 23310 + }, + { + "epoch": 0.5985611141335311, + "grad_norm": 0.78125, + "learning_rate": 0.00014523011804228072, + "loss": 0.7939, + "step": 23311 + }, + { + "epoch": 0.5985867913294529, + "grad_norm": 0.77734375, + "learning_rate": 0.00014522613653081303, + "loss": 0.8601, + "step": 23312 + }, + { + "epoch": 0.5986124685253746, + "grad_norm": 0.76953125, + "learning_rate": 0.0001452221549292137, + "loss": 0.9154, + "step": 23313 + }, + { + "epoch": 0.5986381457212965, + "grad_norm": 0.75, + "learning_rate": 0.00014521817323749072, + "loss": 0.7316, + "step": 23314 + }, + { + "epoch": 0.5986638229172183, + "grad_norm": 0.7578125, + "learning_rate": 0.00014521419145565201, + "loss": 0.7656, + "step": 23315 + }, + { + "epoch": 0.5986895001131401, + "grad_norm": 0.71875, + "learning_rate": 0.0001452102095837055, + "loss": 0.7886, + "step": 23316 + }, + { + "epoch": 0.598715177309062, + "grad_norm": 0.765625, + "learning_rate": 0.00014520622762165914, + "loss": 0.7707, + "step": 23317 + }, + { + "epoch": 0.5987408545049838, + "grad_norm": 0.88671875, + "learning_rate": 0.0001452022455695208, + "loss": 0.7976, + "step": 23318 + }, + { + "epoch": 0.5987665317009057, + "grad_norm": 0.72265625, + "learning_rate": 0.00014519826342729853, + "loss": 0.819, + "step": 23319 + }, + { + "epoch": 0.5987922088968274, + "grad_norm": 0.83984375, + "learning_rate": 0.0001451942811950002, + "loss": 0.8003, + "step": 23320 + }, + { + "epoch": 0.5988178860927492, + "grad_norm": 0.8359375, + "learning_rate": 0.00014519029887263368, + "loss": 0.8722, + "step": 23321 + }, + { + "epoch": 0.598843563288671, + "grad_norm": 0.81640625, + "learning_rate": 0.00014518631646020703, + "loss": 1.0247, + "step": 23322 + }, + { + "epoch": 0.5988692404845929, + "grad_norm": 0.78125, + "learning_rate": 0.00014518233395772814, + "loss": 0.8639, + "step": 23323 + }, + { + "epoch": 0.5988949176805147, + "grad_norm": 0.7421875, + "learning_rate": 0.00014517835136520493, + "loss": 0.8675, + "step": 23324 + }, + { + "epoch": 0.5989205948764366, + "grad_norm": 0.80078125, + "learning_rate": 0.00014517436868264537, + "loss": 0.7865, + "step": 23325 + }, + { + "epoch": 0.5989462720723583, + "grad_norm": 0.7578125, + "learning_rate": 0.00014517038591005732, + "loss": 0.8495, + "step": 23326 + }, + { + "epoch": 0.5989719492682801, + "grad_norm": 0.7265625, + "learning_rate": 0.0001451664030474488, + "loss": 0.847, + "step": 23327 + }, + { + "epoch": 0.598997626464202, + "grad_norm": 0.68359375, + "learning_rate": 0.0001451624200948277, + "loss": 0.7112, + "step": 23328 + }, + { + "epoch": 0.5990233036601238, + "grad_norm": 0.69140625, + "learning_rate": 0.00014515843705220201, + "loss": 0.9072, + "step": 23329 + }, + { + "epoch": 0.5990489808560456, + "grad_norm": 0.7734375, + "learning_rate": 0.00014515445391957962, + "loss": 0.9089, + "step": 23330 + }, + { + "epoch": 0.5990746580519675, + "grad_norm": 0.828125, + "learning_rate": 0.00014515047069696847, + "loss": 0.7738, + "step": 23331 + }, + { + "epoch": 0.5991003352478893, + "grad_norm": 0.86328125, + "learning_rate": 0.0001451464873843765, + "loss": 0.8417, + "step": 23332 + }, + { + "epoch": 0.599126012443811, + "grad_norm": 0.8203125, + "learning_rate": 0.00014514250398181167, + "loss": 0.9999, + "step": 23333 + }, + { + "epoch": 0.5991516896397329, + "grad_norm": 0.7578125, + "learning_rate": 0.00014513852048928194, + "loss": 0.9047, + "step": 23334 + }, + { + "epoch": 0.5991773668356547, + "grad_norm": 0.7578125, + "learning_rate": 0.00014513453690679517, + "loss": 0.8703, + "step": 23335 + }, + { + "epoch": 0.5992030440315765, + "grad_norm": 0.7890625, + "learning_rate": 0.0001451305532343594, + "loss": 0.8755, + "step": 23336 + }, + { + "epoch": 0.5992287212274984, + "grad_norm": 0.75, + "learning_rate": 0.00014512656947198246, + "loss": 0.7051, + "step": 23337 + }, + { + "epoch": 0.5992543984234202, + "grad_norm": 0.796875, + "learning_rate": 0.00014512258561967238, + "loss": 0.8336, + "step": 23338 + }, + { + "epoch": 0.5992800756193419, + "grad_norm": 0.74609375, + "learning_rate": 0.00014511860167743703, + "loss": 0.8335, + "step": 23339 + }, + { + "epoch": 0.5993057528152638, + "grad_norm": 0.79296875, + "learning_rate": 0.0001451146176452844, + "loss": 0.8224, + "step": 23340 + }, + { + "epoch": 0.5993314300111856, + "grad_norm": 0.828125, + "learning_rate": 0.0001451106335232224, + "loss": 1.015, + "step": 23341 + }, + { + "epoch": 0.5993571072071074, + "grad_norm": 0.8203125, + "learning_rate": 0.00014510664931125902, + "loss": 0.875, + "step": 23342 + }, + { + "epoch": 0.5993827844030293, + "grad_norm": 0.74609375, + "learning_rate": 0.00014510266500940213, + "loss": 0.9334, + "step": 23343 + }, + { + "epoch": 0.5994084615989511, + "grad_norm": 0.7265625, + "learning_rate": 0.00014509868061765972, + "loss": 0.8123, + "step": 23344 + }, + { + "epoch": 0.599434138794873, + "grad_norm": 0.78515625, + "learning_rate": 0.00014509469613603967, + "loss": 0.8607, + "step": 23345 + }, + { + "epoch": 0.5994598159907947, + "grad_norm": 0.90234375, + "learning_rate": 0.00014509071156455002, + "loss": 0.9291, + "step": 23346 + }, + { + "epoch": 0.5994854931867165, + "grad_norm": 0.72265625, + "learning_rate": 0.00014508672690319863, + "loss": 0.8621, + "step": 23347 + }, + { + "epoch": 0.5995111703826383, + "grad_norm": 0.796875, + "learning_rate": 0.00014508274215199344, + "loss": 0.8664, + "step": 23348 + }, + { + "epoch": 0.5995368475785602, + "grad_norm": 0.73828125, + "learning_rate": 0.00014507875731094248, + "loss": 0.871, + "step": 23349 + }, + { + "epoch": 0.599562524774482, + "grad_norm": 0.8046875, + "learning_rate": 0.00014507477238005358, + "loss": 0.9054, + "step": 23350 + }, + { + "epoch": 0.5995882019704039, + "grad_norm": 0.74609375, + "learning_rate": 0.00014507078735933476, + "loss": 0.8527, + "step": 23351 + }, + { + "epoch": 0.5996138791663257, + "grad_norm": 0.72265625, + "learning_rate": 0.0001450668022487939, + "loss": 0.8699, + "step": 23352 + }, + { + "epoch": 0.5996395563622474, + "grad_norm": 0.78515625, + "learning_rate": 0.00014506281704843897, + "loss": 0.8618, + "step": 23353 + }, + { + "epoch": 0.5996652335581693, + "grad_norm": 0.75, + "learning_rate": 0.00014505883175827794, + "loss": 0.8746, + "step": 23354 + }, + { + "epoch": 0.5996909107540911, + "grad_norm": 0.75, + "learning_rate": 0.00014505484637831873, + "loss": 0.846, + "step": 23355 + }, + { + "epoch": 0.5997165879500129, + "grad_norm": 0.83203125, + "learning_rate": 0.00014505086090856926, + "loss": 1.0232, + "step": 23356 + }, + { + "epoch": 0.5997422651459348, + "grad_norm": 0.75390625, + "learning_rate": 0.00014504687534903752, + "loss": 0.9108, + "step": 23357 + }, + { + "epoch": 0.5997679423418566, + "grad_norm": 0.796875, + "learning_rate": 0.0001450428896997314, + "loss": 0.8448, + "step": 23358 + }, + { + "epoch": 0.5997936195377783, + "grad_norm": 0.83984375, + "learning_rate": 0.00014503890396065888, + "loss": 0.9082, + "step": 23359 + }, + { + "epoch": 0.5998192967337002, + "grad_norm": 0.8046875, + "learning_rate": 0.0001450349181318279, + "loss": 0.884, + "step": 23360 + }, + { + "epoch": 0.599844973929622, + "grad_norm": 0.76171875, + "learning_rate": 0.0001450309322132464, + "loss": 0.8952, + "step": 23361 + }, + { + "epoch": 0.5998706511255438, + "grad_norm": 0.7890625, + "learning_rate": 0.00014502694620492233, + "loss": 0.8392, + "step": 23362 + }, + { + "epoch": 0.5998963283214657, + "grad_norm": 0.859375, + "learning_rate": 0.00014502296010686358, + "loss": 0.9688, + "step": 23363 + }, + { + "epoch": 0.5999220055173875, + "grad_norm": 0.8046875, + "learning_rate": 0.00014501897391907818, + "loss": 0.8365, + "step": 23364 + }, + { + "epoch": 0.5999476827133093, + "grad_norm": 0.7890625, + "learning_rate": 0.00014501498764157402, + "loss": 0.7104, + "step": 23365 + }, + { + "epoch": 0.5999733599092311, + "grad_norm": 0.74609375, + "learning_rate": 0.00014501100127435902, + "loss": 0.8445, + "step": 23366 + }, + { + "epoch": 0.5999990371051529, + "grad_norm": 0.72265625, + "learning_rate": 0.00014500701481744122, + "loss": 0.877, + "step": 23367 + }, + { + "epoch": 0.6000247143010747, + "grad_norm": 0.765625, + "learning_rate": 0.0001450030282708285, + "loss": 0.8621, + "step": 23368 + }, + { + "epoch": 0.6000503914969966, + "grad_norm": 0.88671875, + "learning_rate": 0.00014499904163452876, + "loss": 0.805, + "step": 23369 + }, + { + "epoch": 0.6000760686929184, + "grad_norm": 0.78515625, + "learning_rate": 0.00014499505490855001, + "loss": 0.8873, + "step": 23370 + }, + { + "epoch": 0.6001017458888402, + "grad_norm": 0.7734375, + "learning_rate": 0.00014499106809290018, + "loss": 0.7673, + "step": 23371 + }, + { + "epoch": 0.6001274230847621, + "grad_norm": 0.8671875, + "learning_rate": 0.00014498708118758724, + "loss": 0.9577, + "step": 23372 + }, + { + "epoch": 0.6001531002806838, + "grad_norm": 0.85546875, + "learning_rate": 0.0001449830941926191, + "loss": 0.7974, + "step": 23373 + }, + { + "epoch": 0.6001787774766056, + "grad_norm": 0.81640625, + "learning_rate": 0.0001449791071080037, + "loss": 0.8812, + "step": 23374 + }, + { + "epoch": 0.6002044546725275, + "grad_norm": 0.80078125, + "learning_rate": 0.00014497511993374904, + "loss": 0.8343, + "step": 23375 + }, + { + "epoch": 0.6002301318684493, + "grad_norm": 0.72265625, + "learning_rate": 0.00014497113266986297, + "loss": 0.9001, + "step": 23376 + }, + { + "epoch": 0.6002558090643711, + "grad_norm": 0.75390625, + "learning_rate": 0.00014496714531635354, + "loss": 0.8985, + "step": 23377 + }, + { + "epoch": 0.600281486260293, + "grad_norm": 0.7421875, + "learning_rate": 0.00014496315787322864, + "loss": 0.9547, + "step": 23378 + }, + { + "epoch": 0.6003071634562147, + "grad_norm": 0.72265625, + "learning_rate": 0.00014495917034049622, + "loss": 0.8742, + "step": 23379 + }, + { + "epoch": 0.6003328406521365, + "grad_norm": 0.796875, + "learning_rate": 0.00014495518271816424, + "loss": 1.0577, + "step": 23380 + }, + { + "epoch": 0.6003585178480584, + "grad_norm": 0.8671875, + "learning_rate": 0.00014495119500624066, + "loss": 0.8353, + "step": 23381 + }, + { + "epoch": 0.6003841950439802, + "grad_norm": 0.703125, + "learning_rate": 0.0001449472072047334, + "loss": 0.9049, + "step": 23382 + }, + { + "epoch": 0.600409872239902, + "grad_norm": 0.734375, + "learning_rate": 0.0001449432193136504, + "loss": 0.8914, + "step": 23383 + }, + { + "epoch": 0.6004355494358239, + "grad_norm": 0.79296875, + "learning_rate": 0.00014493923133299965, + "loss": 0.8183, + "step": 23384 + }, + { + "epoch": 0.6004612266317457, + "grad_norm": 0.796875, + "learning_rate": 0.00014493524326278902, + "loss": 0.9175, + "step": 23385 + }, + { + "epoch": 0.6004869038276675, + "grad_norm": 0.8203125, + "learning_rate": 0.00014493125510302657, + "loss": 0.9686, + "step": 23386 + }, + { + "epoch": 0.6005125810235893, + "grad_norm": 0.7421875, + "learning_rate": 0.00014492726685372013, + "loss": 0.8426, + "step": 23387 + }, + { + "epoch": 0.6005382582195111, + "grad_norm": 0.79296875, + "learning_rate": 0.00014492327851487778, + "loss": 0.9168, + "step": 23388 + }, + { + "epoch": 0.600563935415433, + "grad_norm": 0.75, + "learning_rate": 0.00014491929008650733, + "loss": 0.918, + "step": 23389 + }, + { + "epoch": 0.6005896126113548, + "grad_norm": 0.76171875, + "learning_rate": 0.0001449153015686168, + "loss": 0.9017, + "step": 23390 + }, + { + "epoch": 0.6006152898072766, + "grad_norm": 0.84765625, + "learning_rate": 0.00014491131296121416, + "loss": 0.76, + "step": 23391 + }, + { + "epoch": 0.6006409670031985, + "grad_norm": 0.84765625, + "learning_rate": 0.0001449073242643073, + "loss": 0.8577, + "step": 23392 + }, + { + "epoch": 0.6006666441991202, + "grad_norm": 0.7734375, + "learning_rate": 0.0001449033354779042, + "loss": 0.8712, + "step": 23393 + }, + { + "epoch": 0.600692321395042, + "grad_norm": 0.77734375, + "learning_rate": 0.00014489934660201283, + "loss": 0.8829, + "step": 23394 + }, + { + "epoch": 0.6007179985909639, + "grad_norm": 0.77734375, + "learning_rate": 0.00014489535763664111, + "loss": 0.9193, + "step": 23395 + }, + { + "epoch": 0.6007436757868857, + "grad_norm": 0.9375, + "learning_rate": 0.00014489136858179702, + "loss": 0.8058, + "step": 23396 + }, + { + "epoch": 0.6007693529828075, + "grad_norm": 0.8515625, + "learning_rate": 0.00014488737943748844, + "loss": 0.9832, + "step": 23397 + }, + { + "epoch": 0.6007950301787294, + "grad_norm": 0.765625, + "learning_rate": 0.0001448833902037234, + "loss": 0.7628, + "step": 23398 + }, + { + "epoch": 0.6008207073746511, + "grad_norm": 0.7109375, + "learning_rate": 0.0001448794008805098, + "loss": 0.8989, + "step": 23399 + }, + { + "epoch": 0.6008463845705729, + "grad_norm": 0.80078125, + "learning_rate": 0.00014487541146785562, + "loss": 0.7966, + "step": 23400 + }, + { + "epoch": 0.6008720617664948, + "grad_norm": 0.7578125, + "learning_rate": 0.0001448714219657688, + "loss": 0.8734, + "step": 23401 + }, + { + "epoch": 0.6008977389624166, + "grad_norm": 0.76953125, + "learning_rate": 0.0001448674323742573, + "loss": 0.905, + "step": 23402 + }, + { + "epoch": 0.6009234161583384, + "grad_norm": 0.73828125, + "learning_rate": 0.00014486344269332903, + "loss": 0.831, + "step": 23403 + }, + { + "epoch": 0.6009490933542603, + "grad_norm": 0.80078125, + "learning_rate": 0.000144859452922992, + "loss": 0.9701, + "step": 23404 + }, + { + "epoch": 0.6009747705501821, + "grad_norm": 0.7578125, + "learning_rate": 0.0001448554630632541, + "loss": 0.7835, + "step": 23405 + }, + { + "epoch": 0.6010004477461038, + "grad_norm": 0.69921875, + "learning_rate": 0.00014485147311412333, + "loss": 0.8016, + "step": 23406 + }, + { + "epoch": 0.6010261249420257, + "grad_norm": 0.796875, + "learning_rate": 0.00014484748307560762, + "loss": 0.8203, + "step": 23407 + }, + { + "epoch": 0.6010518021379475, + "grad_norm": 0.7421875, + "learning_rate": 0.00014484349294771495, + "loss": 0.9104, + "step": 23408 + }, + { + "epoch": 0.6010774793338693, + "grad_norm": 0.82421875, + "learning_rate": 0.00014483950273045323, + "loss": 0.8449, + "step": 23409 + }, + { + "epoch": 0.6011031565297912, + "grad_norm": 0.7890625, + "learning_rate": 0.00014483551242383045, + "loss": 0.8649, + "step": 23410 + }, + { + "epoch": 0.601128833725713, + "grad_norm": 0.72265625, + "learning_rate": 0.0001448315220278545, + "loss": 0.8516, + "step": 23411 + }, + { + "epoch": 0.6011545109216349, + "grad_norm": 0.8203125, + "learning_rate": 0.00014482753154253342, + "loss": 0.8927, + "step": 23412 + }, + { + "epoch": 0.6011801881175566, + "grad_norm": 0.7265625, + "learning_rate": 0.0001448235409678751, + "loss": 0.7768, + "step": 23413 + }, + { + "epoch": 0.6012058653134784, + "grad_norm": 0.7109375, + "learning_rate": 0.00014481955030388753, + "loss": 0.8034, + "step": 23414 + }, + { + "epoch": 0.6012315425094003, + "grad_norm": 0.74609375, + "learning_rate": 0.00014481555955057863, + "loss": 0.9366, + "step": 23415 + }, + { + "epoch": 0.6012572197053221, + "grad_norm": 0.76171875, + "learning_rate": 0.00014481156870795636, + "loss": 0.8346, + "step": 23416 + }, + { + "epoch": 0.6012828969012439, + "grad_norm": 0.796875, + "learning_rate": 0.00014480757777602868, + "loss": 0.888, + "step": 23417 + }, + { + "epoch": 0.6013085740971658, + "grad_norm": 0.8203125, + "learning_rate": 0.00014480358675480356, + "loss": 0.9571, + "step": 23418 + }, + { + "epoch": 0.6013342512930875, + "grad_norm": 0.78515625, + "learning_rate": 0.00014479959564428895, + "loss": 0.7987, + "step": 23419 + }, + { + "epoch": 0.6013599284890093, + "grad_norm": 0.73828125, + "learning_rate": 0.00014479560444449275, + "loss": 0.8436, + "step": 23420 + }, + { + "epoch": 0.6013856056849312, + "grad_norm": 0.82421875, + "learning_rate": 0.00014479161315542302, + "loss": 0.8907, + "step": 23421 + }, + { + "epoch": 0.601411282880853, + "grad_norm": 0.86328125, + "learning_rate": 0.0001447876217770876, + "loss": 0.7635, + "step": 23422 + }, + { + "epoch": 0.6014369600767748, + "grad_norm": 0.7578125, + "learning_rate": 0.0001447836303094945, + "loss": 0.8823, + "step": 23423 + }, + { + "epoch": 0.6014626372726967, + "grad_norm": 0.8671875, + "learning_rate": 0.00014477963875265167, + "loss": 0.865, + "step": 23424 + }, + { + "epoch": 0.6014883144686185, + "grad_norm": 0.734375, + "learning_rate": 0.00014477564710656707, + "loss": 0.9064, + "step": 23425 + }, + { + "epoch": 0.6015139916645402, + "grad_norm": 0.74609375, + "learning_rate": 0.00014477165537124867, + "loss": 0.9235, + "step": 23426 + }, + { + "epoch": 0.6015396688604621, + "grad_norm": 0.796875, + "learning_rate": 0.0001447676635467044, + "loss": 0.9126, + "step": 23427 + }, + { + "epoch": 0.6015653460563839, + "grad_norm": 0.78515625, + "learning_rate": 0.00014476367163294222, + "loss": 0.9153, + "step": 23428 + }, + { + "epoch": 0.6015910232523057, + "grad_norm": 0.73828125, + "learning_rate": 0.0001447596796299701, + "loss": 0.7824, + "step": 23429 + }, + { + "epoch": 0.6016167004482276, + "grad_norm": 0.75, + "learning_rate": 0.00014475568753779592, + "loss": 0.8522, + "step": 23430 + }, + { + "epoch": 0.6016423776441494, + "grad_norm": 0.703125, + "learning_rate": 0.00014475169535642775, + "loss": 0.7166, + "step": 23431 + }, + { + "epoch": 0.6016680548400712, + "grad_norm": 0.80078125, + "learning_rate": 0.00014474770308587346, + "loss": 0.9812, + "step": 23432 + }, + { + "epoch": 0.601693732035993, + "grad_norm": 0.87890625, + "learning_rate": 0.00014474371072614105, + "loss": 0.8679, + "step": 23433 + }, + { + "epoch": 0.6017194092319148, + "grad_norm": 0.6875, + "learning_rate": 0.0001447397182772385, + "loss": 0.6627, + "step": 23434 + }, + { + "epoch": 0.6017450864278366, + "grad_norm": 0.8046875, + "learning_rate": 0.0001447357257391737, + "loss": 0.7953, + "step": 23435 + }, + { + "epoch": 0.6017707636237585, + "grad_norm": 0.79296875, + "learning_rate": 0.00014473173311195465, + "loss": 0.7718, + "step": 23436 + }, + { + "epoch": 0.6017964408196803, + "grad_norm": 0.77734375, + "learning_rate": 0.0001447277403955893, + "loss": 0.7774, + "step": 23437 + }, + { + "epoch": 0.6018221180156021, + "grad_norm": 0.75, + "learning_rate": 0.00014472374759008557, + "loss": 0.8043, + "step": 23438 + }, + { + "epoch": 0.6018477952115239, + "grad_norm": 0.9140625, + "learning_rate": 0.0001447197546954515, + "loss": 0.786, + "step": 23439 + }, + { + "epoch": 0.6018734724074457, + "grad_norm": 0.78125, + "learning_rate": 0.00014471576171169498, + "loss": 0.8342, + "step": 23440 + }, + { + "epoch": 0.6018991496033675, + "grad_norm": 0.8046875, + "learning_rate": 0.00014471176863882398, + "loss": 0.8524, + "step": 23441 + }, + { + "epoch": 0.6019248267992894, + "grad_norm": 0.84765625, + "learning_rate": 0.00014470777547684647, + "loss": 0.8997, + "step": 23442 + }, + { + "epoch": 0.6019505039952112, + "grad_norm": 0.91796875, + "learning_rate": 0.0001447037822257704, + "loss": 0.8375, + "step": 23443 + }, + { + "epoch": 0.601976181191133, + "grad_norm": 0.79296875, + "learning_rate": 0.0001446997888856037, + "loss": 0.9144, + "step": 23444 + }, + { + "epoch": 0.6020018583870549, + "grad_norm": 0.80078125, + "learning_rate": 0.0001446957954563544, + "loss": 0.8413, + "step": 23445 + }, + { + "epoch": 0.6020275355829766, + "grad_norm": 0.74609375, + "learning_rate": 0.0001446918019380304, + "loss": 0.8602, + "step": 23446 + }, + { + "epoch": 0.6020532127788984, + "grad_norm": 0.72265625, + "learning_rate": 0.00014468780833063968, + "loss": 0.8295, + "step": 23447 + }, + { + "epoch": 0.6020788899748203, + "grad_norm": 0.828125, + "learning_rate": 0.0001446838146341902, + "loss": 0.9579, + "step": 23448 + }, + { + "epoch": 0.6021045671707421, + "grad_norm": 0.859375, + "learning_rate": 0.00014467982084868992, + "loss": 0.823, + "step": 23449 + }, + { + "epoch": 0.602130244366664, + "grad_norm": 0.796875, + "learning_rate": 0.0001446758269741468, + "loss": 0.902, + "step": 23450 + }, + { + "epoch": 0.6021559215625858, + "grad_norm": 0.87109375, + "learning_rate": 0.00014467183301056875, + "loss": 0.9288, + "step": 23451 + }, + { + "epoch": 0.6021815987585076, + "grad_norm": 0.8046875, + "learning_rate": 0.0001446678389579638, + "loss": 0.9151, + "step": 23452 + }, + { + "epoch": 0.6022072759544294, + "grad_norm": 0.75, + "learning_rate": 0.00014466384481633986, + "loss": 0.8835, + "step": 23453 + }, + { + "epoch": 0.6022329531503512, + "grad_norm": 0.79296875, + "learning_rate": 0.00014465985058570495, + "loss": 0.852, + "step": 23454 + }, + { + "epoch": 0.602258630346273, + "grad_norm": 0.79296875, + "learning_rate": 0.00014465585626606697, + "loss": 0.7351, + "step": 23455 + }, + { + "epoch": 0.6022843075421949, + "grad_norm": 0.73046875, + "learning_rate": 0.0001446518618574339, + "loss": 0.7695, + "step": 23456 + }, + { + "epoch": 0.6023099847381167, + "grad_norm": 0.74609375, + "learning_rate": 0.0001446478673598137, + "loss": 0.794, + "step": 23457 + }, + { + "epoch": 0.6023356619340385, + "grad_norm": 0.70703125, + "learning_rate": 0.00014464387277321434, + "loss": 0.7213, + "step": 23458 + }, + { + "epoch": 0.6023613391299603, + "grad_norm": 0.77734375, + "learning_rate": 0.00014463987809764376, + "loss": 0.7774, + "step": 23459 + }, + { + "epoch": 0.6023870163258821, + "grad_norm": 0.734375, + "learning_rate": 0.00014463588333310995, + "loss": 0.6975, + "step": 23460 + }, + { + "epoch": 0.6024126935218039, + "grad_norm": 0.7734375, + "learning_rate": 0.00014463188847962087, + "loss": 0.9045, + "step": 23461 + }, + { + "epoch": 0.6024383707177258, + "grad_norm": 0.734375, + "learning_rate": 0.00014462789353718444, + "loss": 0.7834, + "step": 23462 + }, + { + "epoch": 0.6024640479136476, + "grad_norm": 0.75390625, + "learning_rate": 0.00014462389850580868, + "loss": 0.8079, + "step": 23463 + }, + { + "epoch": 0.6024897251095694, + "grad_norm": 0.76953125, + "learning_rate": 0.0001446199033855015, + "loss": 0.7146, + "step": 23464 + }, + { + "epoch": 0.6025154023054913, + "grad_norm": 0.78515625, + "learning_rate": 0.00014461590817627088, + "loss": 0.7575, + "step": 23465 + }, + { + "epoch": 0.602541079501413, + "grad_norm": 0.73046875, + "learning_rate": 0.0001446119128781248, + "loss": 1.0124, + "step": 23466 + }, + { + "epoch": 0.6025667566973348, + "grad_norm": 0.70703125, + "learning_rate": 0.00014460791749107117, + "loss": 0.7624, + "step": 23467 + }, + { + "epoch": 0.6025924338932567, + "grad_norm": 0.8046875, + "learning_rate": 0.00014460392201511805, + "loss": 0.8239, + "step": 23468 + }, + { + "epoch": 0.6026181110891785, + "grad_norm": 0.77734375, + "learning_rate": 0.00014459992645027333, + "loss": 0.7729, + "step": 23469 + }, + { + "epoch": 0.6026437882851003, + "grad_norm": 0.76953125, + "learning_rate": 0.00014459593079654495, + "loss": 0.9347, + "step": 23470 + }, + { + "epoch": 0.6026694654810222, + "grad_norm": 0.70703125, + "learning_rate": 0.00014459193505394092, + "loss": 0.7462, + "step": 23471 + }, + { + "epoch": 0.602695142676944, + "grad_norm": 0.7421875, + "learning_rate": 0.0001445879392224692, + "loss": 0.8462, + "step": 23472 + }, + { + "epoch": 0.6027208198728657, + "grad_norm": 0.8359375, + "learning_rate": 0.0001445839433021377, + "loss": 1.0053, + "step": 23473 + }, + { + "epoch": 0.6027464970687876, + "grad_norm": 0.7265625, + "learning_rate": 0.0001445799472929545, + "loss": 0.8835, + "step": 23474 + }, + { + "epoch": 0.6027721742647094, + "grad_norm": 0.8515625, + "learning_rate": 0.00014457595119492746, + "loss": 0.903, + "step": 23475 + }, + { + "epoch": 0.6027978514606313, + "grad_norm": 0.81640625, + "learning_rate": 0.00014457195500806457, + "loss": 0.9062, + "step": 23476 + }, + { + "epoch": 0.6028235286565531, + "grad_norm": 0.90234375, + "learning_rate": 0.00014456795873237383, + "loss": 0.933, + "step": 23477 + }, + { + "epoch": 0.6028492058524749, + "grad_norm": 0.75, + "learning_rate": 0.00014456396236786314, + "loss": 0.8142, + "step": 23478 + }, + { + "epoch": 0.6028748830483966, + "grad_norm": 0.7734375, + "learning_rate": 0.0001445599659145405, + "loss": 0.8684, + "step": 23479 + }, + { + "epoch": 0.6029005602443185, + "grad_norm": 0.74609375, + "learning_rate": 0.0001445559693724139, + "loss": 0.8985, + "step": 23480 + }, + { + "epoch": 0.6029262374402403, + "grad_norm": 0.7890625, + "learning_rate": 0.00014455197274149126, + "loss": 0.8932, + "step": 23481 + }, + { + "epoch": 0.6029519146361622, + "grad_norm": 0.8046875, + "learning_rate": 0.00014454797602178056, + "loss": 0.8146, + "step": 23482 + }, + { + "epoch": 0.602977591832084, + "grad_norm": 0.734375, + "learning_rate": 0.00014454397921328976, + "loss": 0.9146, + "step": 23483 + }, + { + "epoch": 0.6030032690280058, + "grad_norm": 0.74609375, + "learning_rate": 0.00014453998231602687, + "loss": 0.8482, + "step": 23484 + }, + { + "epoch": 0.6030289462239277, + "grad_norm": 0.78125, + "learning_rate": 0.00014453598532999977, + "loss": 0.9448, + "step": 23485 + }, + { + "epoch": 0.6030546234198494, + "grad_norm": 0.87109375, + "learning_rate": 0.00014453198825521647, + "loss": 0.929, + "step": 23486 + }, + { + "epoch": 0.6030803006157712, + "grad_norm": 0.80078125, + "learning_rate": 0.000144527991091685, + "loss": 0.8558, + "step": 23487 + }, + { + "epoch": 0.6031059778116931, + "grad_norm": 0.796875, + "learning_rate": 0.00014452399383941322, + "loss": 0.7749, + "step": 23488 + }, + { + "epoch": 0.6031316550076149, + "grad_norm": 0.76171875, + "learning_rate": 0.00014451999649840914, + "loss": 0.9622, + "step": 23489 + }, + { + "epoch": 0.6031573322035367, + "grad_norm": 0.7265625, + "learning_rate": 0.00014451599906868077, + "loss": 1.0348, + "step": 23490 + }, + { + "epoch": 0.6031830093994586, + "grad_norm": 0.80859375, + "learning_rate": 0.00014451200155023597, + "loss": 0.8585, + "step": 23491 + }, + { + "epoch": 0.6032086865953804, + "grad_norm": 0.75, + "learning_rate": 0.00014450800394308285, + "loss": 0.9669, + "step": 23492 + }, + { + "epoch": 0.6032343637913021, + "grad_norm": 0.71875, + "learning_rate": 0.00014450400624722924, + "loss": 0.8323, + "step": 23493 + }, + { + "epoch": 0.603260040987224, + "grad_norm": 0.69140625, + "learning_rate": 0.00014450000846268318, + "loss": 0.751, + "step": 23494 + }, + { + "epoch": 0.6032857181831458, + "grad_norm": 0.890625, + "learning_rate": 0.00014449601058945266, + "loss": 0.7946, + "step": 23495 + }, + { + "epoch": 0.6033113953790676, + "grad_norm": 0.828125, + "learning_rate": 0.00014449201262754555, + "loss": 0.797, + "step": 23496 + }, + { + "epoch": 0.6033370725749895, + "grad_norm": 0.66796875, + "learning_rate": 0.0001444880145769699, + "loss": 0.8353, + "step": 23497 + }, + { + "epoch": 0.6033627497709113, + "grad_norm": 0.765625, + "learning_rate": 0.00014448401643773367, + "loss": 0.8841, + "step": 23498 + }, + { + "epoch": 0.603388426966833, + "grad_norm": 0.80859375, + "learning_rate": 0.0001444800182098448, + "loss": 0.9013, + "step": 23499 + }, + { + "epoch": 0.6034141041627549, + "grad_norm": 0.71875, + "learning_rate": 0.00014447601989331127, + "loss": 0.9441, + "step": 23500 + }, + { + "epoch": 0.6034397813586767, + "grad_norm": 0.74609375, + "learning_rate": 0.00014447202148814105, + "loss": 0.8567, + "step": 23501 + }, + { + "epoch": 0.6034654585545985, + "grad_norm": 0.7421875, + "learning_rate": 0.00014446802299434213, + "loss": 0.8201, + "step": 23502 + }, + { + "epoch": 0.6034911357505204, + "grad_norm": 0.828125, + "learning_rate": 0.00014446402441192243, + "loss": 0.9339, + "step": 23503 + }, + { + "epoch": 0.6035168129464422, + "grad_norm": 0.77734375, + "learning_rate": 0.00014446002574088995, + "loss": 0.8334, + "step": 23504 + }, + { + "epoch": 0.603542490142364, + "grad_norm": 0.73828125, + "learning_rate": 0.00014445602698125265, + "loss": 0.9343, + "step": 23505 + }, + { + "epoch": 0.6035681673382858, + "grad_norm": 0.90625, + "learning_rate": 0.00014445202813301853, + "loss": 0.9394, + "step": 23506 + }, + { + "epoch": 0.6035938445342076, + "grad_norm": 0.7578125, + "learning_rate": 0.00014444802919619552, + "loss": 0.9003, + "step": 23507 + }, + { + "epoch": 0.6036195217301294, + "grad_norm": 0.81640625, + "learning_rate": 0.00014444403017079162, + "loss": 0.7661, + "step": 23508 + }, + { + "epoch": 0.6036451989260513, + "grad_norm": 0.7265625, + "learning_rate": 0.00014444003105681477, + "loss": 0.7767, + "step": 23509 + }, + { + "epoch": 0.6036708761219731, + "grad_norm": 0.80078125, + "learning_rate": 0.00014443603185427292, + "loss": 0.9284, + "step": 23510 + }, + { + "epoch": 0.603696553317895, + "grad_norm": 0.81640625, + "learning_rate": 0.0001444320325631741, + "loss": 0.7329, + "step": 23511 + }, + { + "epoch": 0.6037222305138168, + "grad_norm": 0.80078125, + "learning_rate": 0.00014442803318352623, + "loss": 0.7911, + "step": 23512 + }, + { + "epoch": 0.6037479077097385, + "grad_norm": 0.84765625, + "learning_rate": 0.00014442403371533734, + "loss": 1.099, + "step": 23513 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.79296875, + "learning_rate": 0.00014442003415861537, + "loss": 0.9188, + "step": 23514 + }, + { + "epoch": 0.6037992621015822, + "grad_norm": 0.76171875, + "learning_rate": 0.00014441603451336826, + "loss": 0.8195, + "step": 23515 + }, + { + "epoch": 0.603824939297504, + "grad_norm": 0.73828125, + "learning_rate": 0.000144412034779604, + "loss": 0.7624, + "step": 23516 + }, + { + "epoch": 0.6038506164934259, + "grad_norm": 0.76953125, + "learning_rate": 0.00014440803495733057, + "loss": 0.9374, + "step": 23517 + }, + { + "epoch": 0.6038762936893477, + "grad_norm": 0.7421875, + "learning_rate": 0.00014440403504655593, + "loss": 0.8217, + "step": 23518 + }, + { + "epoch": 0.6039019708852694, + "grad_norm": 1.3984375, + "learning_rate": 0.00014440003504728808, + "loss": 0.848, + "step": 23519 + }, + { + "epoch": 0.6039276480811913, + "grad_norm": 0.83984375, + "learning_rate": 0.00014439603495953495, + "loss": 1.0887, + "step": 23520 + }, + { + "epoch": 0.6039533252771131, + "grad_norm": 0.69921875, + "learning_rate": 0.00014439203478330454, + "loss": 0.8481, + "step": 23521 + }, + { + "epoch": 0.6039790024730349, + "grad_norm": 0.7265625, + "learning_rate": 0.00014438803451860484, + "loss": 0.841, + "step": 23522 + }, + { + "epoch": 0.6040046796689568, + "grad_norm": 0.69921875, + "learning_rate": 0.00014438403416544375, + "loss": 0.8586, + "step": 23523 + }, + { + "epoch": 0.6040303568648786, + "grad_norm": 0.75390625, + "learning_rate": 0.00014438003372382934, + "loss": 0.803, + "step": 23524 + }, + { + "epoch": 0.6040560340608004, + "grad_norm": 0.7578125, + "learning_rate": 0.0001443760331937695, + "loss": 0.9392, + "step": 23525 + }, + { + "epoch": 0.6040817112567222, + "grad_norm": 0.78515625, + "learning_rate": 0.0001443720325752722, + "loss": 0.7961, + "step": 23526 + }, + { + "epoch": 0.604107388452644, + "grad_norm": 0.7578125, + "learning_rate": 0.00014436803186834553, + "loss": 0.8249, + "step": 23527 + }, + { + "epoch": 0.6041330656485658, + "grad_norm": 0.73046875, + "learning_rate": 0.00014436403107299733, + "loss": 0.7732, + "step": 23528 + }, + { + "epoch": 0.6041587428444877, + "grad_norm": 0.7890625, + "learning_rate": 0.00014436003018923564, + "loss": 0.872, + "step": 23529 + }, + { + "epoch": 0.6041844200404095, + "grad_norm": 0.8125, + "learning_rate": 0.00014435602921706838, + "loss": 0.8637, + "step": 23530 + }, + { + "epoch": 0.6042100972363313, + "grad_norm": 0.79296875, + "learning_rate": 0.0001443520281565036, + "loss": 0.7904, + "step": 23531 + }, + { + "epoch": 0.6042357744322532, + "grad_norm": 0.84375, + "learning_rate": 0.00014434802700754923, + "loss": 0.7443, + "step": 23532 + }, + { + "epoch": 0.6042614516281749, + "grad_norm": 0.80859375, + "learning_rate": 0.00014434402577021325, + "loss": 0.8989, + "step": 23533 + }, + { + "epoch": 0.6042871288240967, + "grad_norm": 0.859375, + "learning_rate": 0.0001443400244445036, + "loss": 0.7901, + "step": 23534 + }, + { + "epoch": 0.6043128060200186, + "grad_norm": 0.73046875, + "learning_rate": 0.00014433602303042832, + "loss": 0.8248, + "step": 23535 + }, + { + "epoch": 0.6043384832159404, + "grad_norm": 0.71875, + "learning_rate": 0.00014433202152799532, + "loss": 0.81, + "step": 23536 + }, + { + "epoch": 0.6043641604118623, + "grad_norm": 0.83203125, + "learning_rate": 0.00014432801993721262, + "loss": 0.875, + "step": 23537 + }, + { + "epoch": 0.6043898376077841, + "grad_norm": 0.7734375, + "learning_rate": 0.00014432401825808822, + "loss": 0.7121, + "step": 23538 + }, + { + "epoch": 0.6044155148037058, + "grad_norm": 0.8125, + "learning_rate": 0.00014432001649063, + "loss": 0.9134, + "step": 23539 + }, + { + "epoch": 0.6044411919996276, + "grad_norm": 0.7890625, + "learning_rate": 0.00014431601463484603, + "loss": 0.8163, + "step": 23540 + }, + { + "epoch": 0.6044668691955495, + "grad_norm": 0.81640625, + "learning_rate": 0.00014431201269074422, + "loss": 0.7659, + "step": 23541 + }, + { + "epoch": 0.6044925463914713, + "grad_norm": 1.0390625, + "learning_rate": 0.00014430801065833256, + "loss": 1.0016, + "step": 23542 + }, + { + "epoch": 0.6045182235873932, + "grad_norm": 0.86328125, + "learning_rate": 0.00014430400853761906, + "loss": 0.9281, + "step": 23543 + }, + { + "epoch": 0.604543900783315, + "grad_norm": 0.7890625, + "learning_rate": 0.00014430000632861166, + "loss": 1.0529, + "step": 23544 + }, + { + "epoch": 0.6045695779792368, + "grad_norm": 0.73046875, + "learning_rate": 0.00014429600403131839, + "loss": 0.9453, + "step": 23545 + }, + { + "epoch": 0.6045952551751586, + "grad_norm": 0.72265625, + "learning_rate": 0.00014429200164574717, + "loss": 0.7868, + "step": 23546 + }, + { + "epoch": 0.6046209323710804, + "grad_norm": 0.80859375, + "learning_rate": 0.00014428799917190594, + "loss": 1.078, + "step": 23547 + }, + { + "epoch": 0.6046466095670022, + "grad_norm": 0.7578125, + "learning_rate": 0.00014428399660980278, + "loss": 0.8813, + "step": 23548 + }, + { + "epoch": 0.6046722867629241, + "grad_norm": 0.765625, + "learning_rate": 0.00014427999395944557, + "loss": 0.9135, + "step": 23549 + }, + { + "epoch": 0.6046979639588459, + "grad_norm": 0.765625, + "learning_rate": 0.00014427599122084235, + "loss": 0.7611, + "step": 23550 + }, + { + "epoch": 0.6047236411547677, + "grad_norm": 0.77734375, + "learning_rate": 0.0001442719883940011, + "loss": 0.9976, + "step": 23551 + }, + { + "epoch": 0.6047493183506895, + "grad_norm": 0.70703125, + "learning_rate": 0.00014426798547892978, + "loss": 0.903, + "step": 23552 + }, + { + "epoch": 0.6047749955466113, + "grad_norm": 0.86328125, + "learning_rate": 0.00014426398247563635, + "loss": 0.7943, + "step": 23553 + }, + { + "epoch": 0.6048006727425331, + "grad_norm": 0.8046875, + "learning_rate": 0.0001442599793841288, + "loss": 0.7886, + "step": 23554 + }, + { + "epoch": 0.604826349938455, + "grad_norm": 0.7109375, + "learning_rate": 0.0001442559762044151, + "loss": 0.9368, + "step": 23555 + }, + { + "epoch": 0.6048520271343768, + "grad_norm": 0.75390625, + "learning_rate": 0.00014425197293650325, + "loss": 0.8803, + "step": 23556 + }, + { + "epoch": 0.6048777043302986, + "grad_norm": 0.82421875, + "learning_rate": 0.00014424796958040123, + "loss": 0.9271, + "step": 23557 + }, + { + "epoch": 0.6049033815262205, + "grad_norm": 0.828125, + "learning_rate": 0.00014424396613611697, + "loss": 0.8705, + "step": 23558 + }, + { + "epoch": 0.6049290587221422, + "grad_norm": 0.86328125, + "learning_rate": 0.0001442399626036585, + "loss": 0.9042, + "step": 23559 + }, + { + "epoch": 0.604954735918064, + "grad_norm": 0.73046875, + "learning_rate": 0.00014423595898303376, + "loss": 0.8102, + "step": 23560 + }, + { + "epoch": 0.6049804131139859, + "grad_norm": 0.77734375, + "learning_rate": 0.0001442319552742508, + "loss": 0.7532, + "step": 23561 + }, + { + "epoch": 0.6050060903099077, + "grad_norm": 0.80078125, + "learning_rate": 0.0001442279514773175, + "loss": 0.8936, + "step": 23562 + }, + { + "epoch": 0.6050317675058295, + "grad_norm": 0.79296875, + "learning_rate": 0.0001442239475922419, + "loss": 0.9799, + "step": 23563 + }, + { + "epoch": 0.6050574447017514, + "grad_norm": 0.73828125, + "learning_rate": 0.00014421994361903196, + "loss": 0.8644, + "step": 23564 + }, + { + "epoch": 0.6050831218976732, + "grad_norm": 0.78515625, + "learning_rate": 0.0001442159395576957, + "loss": 0.9065, + "step": 23565 + }, + { + "epoch": 0.6051087990935949, + "grad_norm": 3.625, + "learning_rate": 0.00014421193540824104, + "loss": 0.9882, + "step": 23566 + }, + { + "epoch": 0.6051344762895168, + "grad_norm": 0.8046875, + "learning_rate": 0.000144207931170676, + "loss": 0.9292, + "step": 23567 + }, + { + "epoch": 0.6051601534854386, + "grad_norm": 0.78125, + "learning_rate": 0.0001442039268450085, + "loss": 0.719, + "step": 23568 + }, + { + "epoch": 0.6051858306813604, + "grad_norm": 0.83984375, + "learning_rate": 0.0001441999224312466, + "loss": 0.956, + "step": 23569 + }, + { + "epoch": 0.6052115078772823, + "grad_norm": 0.69921875, + "learning_rate": 0.00014419591792939827, + "loss": 0.8105, + "step": 23570 + }, + { + "epoch": 0.6052371850732041, + "grad_norm": 0.78515625, + "learning_rate": 0.00014419191333947144, + "loss": 0.8475, + "step": 23571 + }, + { + "epoch": 0.6052628622691258, + "grad_norm": 0.73828125, + "learning_rate": 0.00014418790866147413, + "loss": 0.8006, + "step": 23572 + }, + { + "epoch": 0.6052885394650477, + "grad_norm": 0.76171875, + "learning_rate": 0.0001441839038954143, + "loss": 0.8909, + "step": 23573 + }, + { + "epoch": 0.6053142166609695, + "grad_norm": 0.80859375, + "learning_rate": 0.00014417989904129996, + "loss": 1.0197, + "step": 23574 + }, + { + "epoch": 0.6053398938568914, + "grad_norm": 0.8125, + "learning_rate": 0.00014417589409913907, + "loss": 0.9002, + "step": 23575 + }, + { + "epoch": 0.6053655710528132, + "grad_norm": 0.68359375, + "learning_rate": 0.0001441718890689396, + "loss": 0.8222, + "step": 23576 + }, + { + "epoch": 0.605391248248735, + "grad_norm": 0.7421875, + "learning_rate": 0.00014416788395070954, + "loss": 0.8794, + "step": 23577 + }, + { + "epoch": 0.6054169254446569, + "grad_norm": 0.76953125, + "learning_rate": 0.0001441638787444569, + "loss": 0.774, + "step": 23578 + }, + { + "epoch": 0.6054426026405786, + "grad_norm": 0.80078125, + "learning_rate": 0.00014415987345018963, + "loss": 0.9633, + "step": 23579 + }, + { + "epoch": 0.6054682798365004, + "grad_norm": 0.7734375, + "learning_rate": 0.0001441558680679157, + "loss": 0.972, + "step": 23580 + }, + { + "epoch": 0.6054939570324223, + "grad_norm": 0.73828125, + "learning_rate": 0.0001441518625976431, + "loss": 0.9172, + "step": 23581 + }, + { + "epoch": 0.6055196342283441, + "grad_norm": 0.7890625, + "learning_rate": 0.00014414785703937988, + "loss": 0.8642, + "step": 23582 + }, + { + "epoch": 0.6055453114242659, + "grad_norm": 0.78125, + "learning_rate": 0.00014414385139313395, + "loss": 0.8903, + "step": 23583 + }, + { + "epoch": 0.6055709886201878, + "grad_norm": 0.86328125, + "learning_rate": 0.00014413984565891328, + "loss": 0.8597, + "step": 23584 + }, + { + "epoch": 0.6055966658161096, + "grad_norm": 0.70703125, + "learning_rate": 0.00014413583983672592, + "loss": 0.8739, + "step": 23585 + }, + { + "epoch": 0.6056223430120313, + "grad_norm": 0.80859375, + "learning_rate": 0.00014413183392657982, + "loss": 0.9216, + "step": 23586 + }, + { + "epoch": 0.6056480202079532, + "grad_norm": 0.86328125, + "learning_rate": 0.00014412782792848293, + "loss": 0.915, + "step": 23587 + }, + { + "epoch": 0.605673697403875, + "grad_norm": 0.82421875, + "learning_rate": 0.00014412382184244332, + "loss": 0.9253, + "step": 23588 + }, + { + "epoch": 0.6056993745997968, + "grad_norm": 0.78125, + "learning_rate": 0.00014411981566846884, + "loss": 0.9169, + "step": 23589 + }, + { + "epoch": 0.6057250517957187, + "grad_norm": 0.72265625, + "learning_rate": 0.00014411580940656762, + "loss": 0.7249, + "step": 23590 + }, + { + "epoch": 0.6057507289916405, + "grad_norm": 0.74609375, + "learning_rate": 0.00014411180305674753, + "loss": 0.7553, + "step": 23591 + }, + { + "epoch": 0.6057764061875622, + "grad_norm": 0.7265625, + "learning_rate": 0.00014410779661901666, + "loss": 0.7947, + "step": 23592 + }, + { + "epoch": 0.6058020833834841, + "grad_norm": 0.765625, + "learning_rate": 0.0001441037900933829, + "loss": 0.9413, + "step": 23593 + }, + { + "epoch": 0.6058277605794059, + "grad_norm": 0.82421875, + "learning_rate": 0.00014409978347985425, + "loss": 0.9546, + "step": 23594 + }, + { + "epoch": 0.6058534377753277, + "grad_norm": 0.76171875, + "learning_rate": 0.00014409577677843873, + "loss": 0.8844, + "step": 23595 + }, + { + "epoch": 0.6058791149712496, + "grad_norm": 0.75, + "learning_rate": 0.0001440917699891443, + "loss": 0.8791, + "step": 23596 + }, + { + "epoch": 0.6059047921671714, + "grad_norm": 0.7734375, + "learning_rate": 0.00014408776311197898, + "loss": 0.8864, + "step": 23597 + }, + { + "epoch": 0.6059304693630932, + "grad_norm": 0.81640625, + "learning_rate": 0.0001440837561469507, + "loss": 0.9435, + "step": 23598 + }, + { + "epoch": 0.605956146559015, + "grad_norm": 0.74609375, + "learning_rate": 0.00014407974909406754, + "loss": 0.7866, + "step": 23599 + }, + { + "epoch": 0.6059818237549368, + "grad_norm": 0.7734375, + "learning_rate": 0.00014407574195333736, + "loss": 0.7575, + "step": 23600 + }, + { + "epoch": 0.6060075009508586, + "grad_norm": 0.8203125, + "learning_rate": 0.00014407173472476823, + "loss": 0.9226, + "step": 23601 + }, + { + "epoch": 0.6060331781467805, + "grad_norm": 0.73828125, + "learning_rate": 0.0001440677274083681, + "loss": 0.9301, + "step": 23602 + }, + { + "epoch": 0.6060588553427023, + "grad_norm": 0.75390625, + "learning_rate": 0.00014406372000414498, + "loss": 0.8521, + "step": 23603 + }, + { + "epoch": 0.6060845325386242, + "grad_norm": 0.796875, + "learning_rate": 0.00014405971251210683, + "loss": 0.9027, + "step": 23604 + }, + { + "epoch": 0.606110209734546, + "grad_norm": 0.72265625, + "learning_rate": 0.00014405570493226168, + "loss": 0.8838, + "step": 23605 + }, + { + "epoch": 0.6061358869304677, + "grad_norm": 0.76171875, + "learning_rate": 0.00014405169726461748, + "loss": 0.8185, + "step": 23606 + }, + { + "epoch": 0.6061615641263896, + "grad_norm": 0.76953125, + "learning_rate": 0.00014404768950918224, + "loss": 0.9273, + "step": 23607 + }, + { + "epoch": 0.6061872413223114, + "grad_norm": 0.8359375, + "learning_rate": 0.00014404368166596389, + "loss": 0.8971, + "step": 23608 + }, + { + "epoch": 0.6062129185182332, + "grad_norm": 0.75390625, + "learning_rate": 0.0001440396737349705, + "loss": 0.8885, + "step": 23609 + }, + { + "epoch": 0.6062385957141551, + "grad_norm": 0.82421875, + "learning_rate": 0.00014403566571621, + "loss": 0.8918, + "step": 23610 + }, + { + "epoch": 0.6062642729100769, + "grad_norm": 0.75, + "learning_rate": 0.0001440316576096904, + "loss": 0.9083, + "step": 23611 + }, + { + "epoch": 0.6062899501059986, + "grad_norm": 0.71484375, + "learning_rate": 0.0001440276494154197, + "loss": 0.8661, + "step": 23612 + }, + { + "epoch": 0.6063156273019205, + "grad_norm": 0.703125, + "learning_rate": 0.00014402364113340585, + "loss": 0.8781, + "step": 23613 + }, + { + "epoch": 0.6063413044978423, + "grad_norm": 0.75390625, + "learning_rate": 0.00014401963276365688, + "loss": 0.8477, + "step": 23614 + }, + { + "epoch": 0.6063669816937641, + "grad_norm": 0.75390625, + "learning_rate": 0.00014401562430618075, + "loss": 0.7514, + "step": 23615 + }, + { + "epoch": 0.606392658889686, + "grad_norm": 0.73046875, + "learning_rate": 0.00014401161576098543, + "loss": 0.9122, + "step": 23616 + }, + { + "epoch": 0.6064183360856078, + "grad_norm": 0.703125, + "learning_rate": 0.00014400760712807896, + "loss": 0.7628, + "step": 23617 + }, + { + "epoch": 0.6064440132815296, + "grad_norm": 0.78515625, + "learning_rate": 0.00014400359840746933, + "loss": 0.94, + "step": 23618 + }, + { + "epoch": 0.6064696904774514, + "grad_norm": 0.72265625, + "learning_rate": 0.00014399958959916446, + "loss": 0.8409, + "step": 23619 + }, + { + "epoch": 0.6064953676733732, + "grad_norm": 0.73046875, + "learning_rate": 0.0001439955807031724, + "loss": 0.8406, + "step": 23620 + }, + { + "epoch": 0.606521044869295, + "grad_norm": 0.8125, + "learning_rate": 0.0001439915717195011, + "loss": 0.8205, + "step": 23621 + }, + { + "epoch": 0.6065467220652169, + "grad_norm": 0.72265625, + "learning_rate": 0.0001439875626481586, + "loss": 0.7737, + "step": 23622 + }, + { + "epoch": 0.6065723992611387, + "grad_norm": 0.7265625, + "learning_rate": 0.00014398355348915287, + "loss": 0.79, + "step": 23623 + }, + { + "epoch": 0.6065980764570605, + "grad_norm": 0.7109375, + "learning_rate": 0.00014397954424249184, + "loss": 0.7617, + "step": 23624 + }, + { + "epoch": 0.6066237536529824, + "grad_norm": 0.77734375, + "learning_rate": 0.00014397553490818363, + "loss": 0.9388, + "step": 23625 + }, + { + "epoch": 0.6066494308489041, + "grad_norm": 0.79296875, + "learning_rate": 0.0001439715254862361, + "loss": 0.8064, + "step": 23626 + }, + { + "epoch": 0.6066751080448259, + "grad_norm": 0.8046875, + "learning_rate": 0.0001439675159766573, + "loss": 0.8846, + "step": 23627 + }, + { + "epoch": 0.6067007852407478, + "grad_norm": 0.78125, + "learning_rate": 0.0001439635063794552, + "loss": 0.9024, + "step": 23628 + }, + { + "epoch": 0.6067264624366696, + "grad_norm": 0.7734375, + "learning_rate": 0.0001439594966946378, + "loss": 0.8827, + "step": 23629 + }, + { + "epoch": 0.6067521396325914, + "grad_norm": 0.75390625, + "learning_rate": 0.0001439554869222131, + "loss": 0.9958, + "step": 23630 + }, + { + "epoch": 0.6067778168285133, + "grad_norm": 0.83984375, + "learning_rate": 0.0001439514770621891, + "loss": 0.8664, + "step": 23631 + }, + { + "epoch": 0.606803494024435, + "grad_norm": 0.8203125, + "learning_rate": 0.00014394746711457378, + "loss": 0.937, + "step": 23632 + }, + { + "epoch": 0.6068291712203568, + "grad_norm": 0.79296875, + "learning_rate": 0.0001439434570793751, + "loss": 1.0113, + "step": 23633 + }, + { + "epoch": 0.6068548484162787, + "grad_norm": 0.75390625, + "learning_rate": 0.00014393944695660112, + "loss": 0.8352, + "step": 23634 + }, + { + "epoch": 0.6068805256122005, + "grad_norm": 0.8125, + "learning_rate": 0.00014393543674625975, + "loss": 0.893, + "step": 23635 + }, + { + "epoch": 0.6069062028081224, + "grad_norm": 0.73046875, + "learning_rate": 0.00014393142644835905, + "loss": 0.7334, + "step": 23636 + }, + { + "epoch": 0.6069318800040442, + "grad_norm": 0.71484375, + "learning_rate": 0.00014392741606290694, + "loss": 0.8051, + "step": 23637 + }, + { + "epoch": 0.606957557199966, + "grad_norm": 0.75, + "learning_rate": 0.00014392340558991152, + "loss": 0.7268, + "step": 23638 + }, + { + "epoch": 0.6069832343958877, + "grad_norm": 0.8125, + "learning_rate": 0.0001439193950293807, + "loss": 0.8763, + "step": 23639 + }, + { + "epoch": 0.6070089115918096, + "grad_norm": 0.75, + "learning_rate": 0.00014391538438132247, + "loss": 0.8061, + "step": 23640 + }, + { + "epoch": 0.6070345887877314, + "grad_norm": 0.76953125, + "learning_rate": 0.00014391137364574486, + "loss": 0.8963, + "step": 23641 + }, + { + "epoch": 0.6070602659836533, + "grad_norm": 0.7734375, + "learning_rate": 0.00014390736282265587, + "loss": 0.9038, + "step": 23642 + }, + { + "epoch": 0.6070859431795751, + "grad_norm": 0.77734375, + "learning_rate": 0.00014390335191206342, + "loss": 0.8655, + "step": 23643 + }, + { + "epoch": 0.6071116203754969, + "grad_norm": 0.79296875, + "learning_rate": 0.00014389934091397558, + "loss": 0.9044, + "step": 23644 + }, + { + "epoch": 0.6071372975714188, + "grad_norm": 0.75390625, + "learning_rate": 0.00014389532982840035, + "loss": 0.7556, + "step": 23645 + }, + { + "epoch": 0.6071629747673405, + "grad_norm": 0.73828125, + "learning_rate": 0.00014389131865534568, + "loss": 0.9005, + "step": 23646 + }, + { + "epoch": 0.6071886519632623, + "grad_norm": 0.8203125, + "learning_rate": 0.00014388730739481957, + "loss": 0.9461, + "step": 23647 + }, + { + "epoch": 0.6072143291591842, + "grad_norm": 0.7734375, + "learning_rate": 0.00014388329604682999, + "loss": 0.8506, + "step": 23648 + }, + { + "epoch": 0.607240006355106, + "grad_norm": 1.125, + "learning_rate": 0.000143879284611385, + "loss": 0.9591, + "step": 23649 + }, + { + "epoch": 0.6072656835510278, + "grad_norm": 0.85546875, + "learning_rate": 0.00014387527308849256, + "loss": 0.9326, + "step": 23650 + }, + { + "epoch": 0.6072913607469497, + "grad_norm": 0.73828125, + "learning_rate": 0.00014387126147816066, + "loss": 0.8206, + "step": 23651 + }, + { + "epoch": 0.6073170379428714, + "grad_norm": 0.75390625, + "learning_rate": 0.0001438672497803973, + "loss": 0.9387, + "step": 23652 + }, + { + "epoch": 0.6073427151387932, + "grad_norm": 0.7890625, + "learning_rate": 0.00014386323799521044, + "loss": 0.9166, + "step": 23653 + }, + { + "epoch": 0.6073683923347151, + "grad_norm": 0.75, + "learning_rate": 0.00014385922612260817, + "loss": 0.8084, + "step": 23654 + }, + { + "epoch": 0.6073940695306369, + "grad_norm": 0.72265625, + "learning_rate": 0.00014385521416259838, + "loss": 0.7543, + "step": 23655 + }, + { + "epoch": 0.6074197467265587, + "grad_norm": 0.79296875, + "learning_rate": 0.0001438512021151891, + "loss": 0.8456, + "step": 23656 + }, + { + "epoch": 0.6074454239224806, + "grad_norm": 0.79296875, + "learning_rate": 0.00014384718998038836, + "loss": 1.0281, + "step": 23657 + }, + { + "epoch": 0.6074711011184024, + "grad_norm": 0.765625, + "learning_rate": 0.00014384317775820412, + "loss": 0.8723, + "step": 23658 + }, + { + "epoch": 0.6074967783143241, + "grad_norm": 1.09375, + "learning_rate": 0.00014383916544864438, + "loss": 0.8538, + "step": 23659 + }, + { + "epoch": 0.607522455510246, + "grad_norm": 0.73046875, + "learning_rate": 0.0001438351530517172, + "loss": 0.8889, + "step": 23660 + }, + { + "epoch": 0.6075481327061678, + "grad_norm": 0.8046875, + "learning_rate": 0.00014383114056743044, + "loss": 0.8578, + "step": 23661 + }, + { + "epoch": 0.6075738099020896, + "grad_norm": 0.73046875, + "learning_rate": 0.00014382712799579223, + "loss": 0.7513, + "step": 23662 + }, + { + "epoch": 0.6075994870980115, + "grad_norm": 0.78125, + "learning_rate": 0.0001438231153368105, + "loss": 0.8968, + "step": 23663 + }, + { + "epoch": 0.6076251642939333, + "grad_norm": 0.828125, + "learning_rate": 0.00014381910259049323, + "loss": 0.9697, + "step": 23664 + }, + { + "epoch": 0.6076508414898552, + "grad_norm": 0.73828125, + "learning_rate": 0.0001438150897568485, + "loss": 0.693, + "step": 23665 + }, + { + "epoch": 0.6076765186857769, + "grad_norm": 0.74609375, + "learning_rate": 0.0001438110768358842, + "loss": 0.9423, + "step": 23666 + }, + { + "epoch": 0.6077021958816987, + "grad_norm": 0.73828125, + "learning_rate": 0.00014380706382760843, + "loss": 0.8172, + "step": 23667 + }, + { + "epoch": 0.6077278730776206, + "grad_norm": 0.75390625, + "learning_rate": 0.0001438030507320291, + "loss": 0.8261, + "step": 23668 + }, + { + "epoch": 0.6077535502735424, + "grad_norm": 0.72265625, + "learning_rate": 0.00014379903754915425, + "loss": 0.7365, + "step": 23669 + }, + { + "epoch": 0.6077792274694642, + "grad_norm": 0.77734375, + "learning_rate": 0.0001437950242789919, + "loss": 0.841, + "step": 23670 + }, + { + "epoch": 0.6078049046653861, + "grad_norm": 0.796875, + "learning_rate": 0.00014379101092155003, + "loss": 0.861, + "step": 23671 + }, + { + "epoch": 0.6078305818613078, + "grad_norm": 0.74609375, + "learning_rate": 0.00014378699747683658, + "loss": 0.7722, + "step": 23672 + }, + { + "epoch": 0.6078562590572296, + "grad_norm": 0.71484375, + "learning_rate": 0.00014378298394485966, + "loss": 0.741, + "step": 23673 + }, + { + "epoch": 0.6078819362531515, + "grad_norm": 0.8046875, + "learning_rate": 0.00014377897032562714, + "loss": 0.8701, + "step": 23674 + }, + { + "epoch": 0.6079076134490733, + "grad_norm": 0.8125, + "learning_rate": 0.00014377495661914712, + "loss": 0.7948, + "step": 23675 + }, + { + "epoch": 0.6079332906449951, + "grad_norm": 0.72265625, + "learning_rate": 0.0001437709428254276, + "loss": 0.7827, + "step": 23676 + }, + { + "epoch": 0.607958967840917, + "grad_norm": 0.84375, + "learning_rate": 0.0001437669289444765, + "loss": 0.8444, + "step": 23677 + }, + { + "epoch": 0.6079846450368388, + "grad_norm": 0.765625, + "learning_rate": 0.0001437629149763019, + "loss": 0.8568, + "step": 23678 + }, + { + "epoch": 0.6080103222327605, + "grad_norm": 0.77734375, + "learning_rate": 0.00014375890092091175, + "loss": 0.8782, + "step": 23679 + }, + { + "epoch": 0.6080359994286824, + "grad_norm": 0.80859375, + "learning_rate": 0.00014375488677831403, + "loss": 0.8597, + "step": 23680 + }, + { + "epoch": 0.6080616766246042, + "grad_norm": 0.75, + "learning_rate": 0.0001437508725485168, + "loss": 0.799, + "step": 23681 + }, + { + "epoch": 0.608087353820526, + "grad_norm": 0.83984375, + "learning_rate": 0.00014374685823152804, + "loss": 0.9119, + "step": 23682 + }, + { + "epoch": 0.6081130310164479, + "grad_norm": 0.7578125, + "learning_rate": 0.00014374284382735573, + "loss": 0.8158, + "step": 23683 + }, + { + "epoch": 0.6081387082123697, + "grad_norm": 0.7890625, + "learning_rate": 0.0001437388293360079, + "loss": 0.9207, + "step": 23684 + }, + { + "epoch": 0.6081643854082915, + "grad_norm": 0.7890625, + "learning_rate": 0.0001437348147574925, + "loss": 0.7764, + "step": 23685 + }, + { + "epoch": 0.6081900626042133, + "grad_norm": 0.79296875, + "learning_rate": 0.0001437308000918176, + "loss": 0.8378, + "step": 23686 + }, + { + "epoch": 0.6082157398001351, + "grad_norm": 0.81640625, + "learning_rate": 0.00014372678533899116, + "loss": 0.871, + "step": 23687 + }, + { + "epoch": 0.6082414169960569, + "grad_norm": 0.71484375, + "learning_rate": 0.00014372277049902116, + "loss": 0.8259, + "step": 23688 + }, + { + "epoch": 0.6082670941919788, + "grad_norm": 0.73046875, + "learning_rate": 0.00014371875557191564, + "loss": 0.7836, + "step": 23689 + }, + { + "epoch": 0.6082927713879006, + "grad_norm": 0.7734375, + "learning_rate": 0.00014371474055768261, + "loss": 0.9272, + "step": 23690 + }, + { + "epoch": 0.6083184485838224, + "grad_norm": 0.8046875, + "learning_rate": 0.00014371072545633004, + "loss": 0.8238, + "step": 23691 + }, + { + "epoch": 0.6083441257797442, + "grad_norm": 0.8359375, + "learning_rate": 0.00014370671026786596, + "loss": 0.8447, + "step": 23692 + }, + { + "epoch": 0.608369802975666, + "grad_norm": 0.80859375, + "learning_rate": 0.0001437026949922983, + "loss": 0.9223, + "step": 23693 + }, + { + "epoch": 0.6083954801715878, + "grad_norm": 0.7890625, + "learning_rate": 0.00014369867962963517, + "loss": 0.8559, + "step": 23694 + }, + { + "epoch": 0.6084211573675097, + "grad_norm": 0.73828125, + "learning_rate": 0.0001436946641798845, + "loss": 0.7912, + "step": 23695 + }, + { + "epoch": 0.6084468345634315, + "grad_norm": 0.8359375, + "learning_rate": 0.00014369064864305428, + "loss": 0.9342, + "step": 23696 + }, + { + "epoch": 0.6084725117593534, + "grad_norm": 0.7421875, + "learning_rate": 0.0001436866330191526, + "loss": 0.8712, + "step": 23697 + }, + { + "epoch": 0.6084981889552752, + "grad_norm": 0.78515625, + "learning_rate": 0.00014368261730818738, + "loss": 0.7947, + "step": 23698 + }, + { + "epoch": 0.6085238661511969, + "grad_norm": 0.78125, + "learning_rate": 0.00014367860151016663, + "loss": 0.945, + "step": 23699 + }, + { + "epoch": 0.6085495433471187, + "grad_norm": 0.66796875, + "learning_rate": 0.0001436745856250984, + "loss": 0.7737, + "step": 23700 + }, + { + "epoch": 0.6085752205430406, + "grad_norm": 0.81640625, + "learning_rate": 0.00014367056965299067, + "loss": 1.0066, + "step": 23701 + }, + { + "epoch": 0.6086008977389624, + "grad_norm": 0.76953125, + "learning_rate": 0.00014366655359385143, + "loss": 0.8487, + "step": 23702 + }, + { + "epoch": 0.6086265749348843, + "grad_norm": 0.7734375, + "learning_rate": 0.00014366253744768867, + "loss": 0.7924, + "step": 23703 + }, + { + "epoch": 0.6086522521308061, + "grad_norm": 0.80859375, + "learning_rate": 0.00014365852121451047, + "loss": 0.9206, + "step": 23704 + }, + { + "epoch": 0.6086779293267279, + "grad_norm": 0.8125, + "learning_rate": 0.00014365450489432476, + "loss": 0.8365, + "step": 23705 + }, + { + "epoch": 0.6087036065226497, + "grad_norm": 0.8125, + "learning_rate": 0.00014365048848713951, + "loss": 0.8664, + "step": 23706 + }, + { + "epoch": 0.6087292837185715, + "grad_norm": 0.76953125, + "learning_rate": 0.00014364647199296285, + "loss": 0.8641, + "step": 23707 + }, + { + "epoch": 0.6087549609144933, + "grad_norm": 0.796875, + "learning_rate": 0.00014364245541180267, + "loss": 0.7807, + "step": 23708 + }, + { + "epoch": 0.6087806381104152, + "grad_norm": 0.78125, + "learning_rate": 0.00014363843874366702, + "loss": 0.87, + "step": 23709 + }, + { + "epoch": 0.608806315306337, + "grad_norm": 0.8359375, + "learning_rate": 0.00014363442198856393, + "loss": 0.8401, + "step": 23710 + }, + { + "epoch": 0.6088319925022588, + "grad_norm": 0.80078125, + "learning_rate": 0.00014363040514650137, + "loss": 0.8454, + "step": 23711 + }, + { + "epoch": 0.6088576696981806, + "grad_norm": 0.77734375, + "learning_rate": 0.00014362638821748734, + "loss": 0.8721, + "step": 23712 + }, + { + "epoch": 0.6088833468941024, + "grad_norm": 0.78125, + "learning_rate": 0.00014362237120152987, + "loss": 0.7337, + "step": 23713 + }, + { + "epoch": 0.6089090240900242, + "grad_norm": 0.8203125, + "learning_rate": 0.00014361835409863697, + "loss": 0.946, + "step": 23714 + }, + { + "epoch": 0.6089347012859461, + "grad_norm": 0.796875, + "learning_rate": 0.00014361433690881662, + "loss": 0.969, + "step": 23715 + }, + { + "epoch": 0.6089603784818679, + "grad_norm": 0.8203125, + "learning_rate": 0.00014361031963207682, + "loss": 0.814, + "step": 23716 + }, + { + "epoch": 0.6089860556777897, + "grad_norm": 0.7890625, + "learning_rate": 0.00014360630226842563, + "loss": 0.7944, + "step": 23717 + }, + { + "epoch": 0.6090117328737116, + "grad_norm": 0.8125, + "learning_rate": 0.000143602284817871, + "loss": 0.9396, + "step": 23718 + }, + { + "epoch": 0.6090374100696333, + "grad_norm": 0.76953125, + "learning_rate": 0.00014359826728042093, + "loss": 0.8113, + "step": 23719 + }, + { + "epoch": 0.6090630872655551, + "grad_norm": 0.8125, + "learning_rate": 0.00014359424965608347, + "loss": 0.8872, + "step": 23720 + }, + { + "epoch": 0.609088764461477, + "grad_norm": 0.828125, + "learning_rate": 0.0001435902319448666, + "loss": 0.9282, + "step": 23721 + }, + { + "epoch": 0.6091144416573988, + "grad_norm": 0.93359375, + "learning_rate": 0.00014358621414677834, + "loss": 0.9652, + "step": 23722 + }, + { + "epoch": 0.6091401188533206, + "grad_norm": 0.859375, + "learning_rate": 0.0001435821962618267, + "loss": 1.0336, + "step": 23723 + }, + { + "epoch": 0.6091657960492425, + "grad_norm": 0.73828125, + "learning_rate": 0.00014357817829001968, + "loss": 0.7516, + "step": 23724 + }, + { + "epoch": 0.6091914732451643, + "grad_norm": 0.7890625, + "learning_rate": 0.00014357416023136528, + "loss": 0.8517, + "step": 23725 + }, + { + "epoch": 0.609217150441086, + "grad_norm": 0.81640625, + "learning_rate": 0.0001435701420858715, + "loss": 0.7906, + "step": 23726 + }, + { + "epoch": 0.6092428276370079, + "grad_norm": 0.71875, + "learning_rate": 0.0001435661238535464, + "loss": 0.8614, + "step": 23727 + }, + { + "epoch": 0.6092685048329297, + "grad_norm": 0.9609375, + "learning_rate": 0.00014356210553439792, + "loss": 0.8609, + "step": 23728 + }, + { + "epoch": 0.6092941820288515, + "grad_norm": 0.75390625, + "learning_rate": 0.00014355808712843412, + "loss": 0.8248, + "step": 23729 + }, + { + "epoch": 0.6093198592247734, + "grad_norm": 0.73828125, + "learning_rate": 0.00014355406863566296, + "loss": 0.8028, + "step": 23730 + }, + { + "epoch": 0.6093455364206952, + "grad_norm": 0.734375, + "learning_rate": 0.0001435500500560925, + "loss": 0.8071, + "step": 23731 + }, + { + "epoch": 0.609371213616617, + "grad_norm": 0.73828125, + "learning_rate": 0.0001435460313897307, + "loss": 0.786, + "step": 23732 + }, + { + "epoch": 0.6093968908125388, + "grad_norm": 0.796875, + "learning_rate": 0.0001435420126365856, + "loss": 0.7896, + "step": 23733 + }, + { + "epoch": 0.6094225680084606, + "grad_norm": 0.765625, + "learning_rate": 0.00014353799379666522, + "loss": 0.8689, + "step": 23734 + }, + { + "epoch": 0.6094482452043825, + "grad_norm": 0.76953125, + "learning_rate": 0.00014353397486997755, + "loss": 0.8838, + "step": 23735 + }, + { + "epoch": 0.6094739224003043, + "grad_norm": 0.7578125, + "learning_rate": 0.00014352995585653058, + "loss": 0.6839, + "step": 23736 + }, + { + "epoch": 0.6094995995962261, + "grad_norm": 0.73046875, + "learning_rate": 0.00014352593675633237, + "loss": 0.758, + "step": 23737 + }, + { + "epoch": 0.609525276792148, + "grad_norm": 0.70703125, + "learning_rate": 0.00014352191756939085, + "loss": 0.7625, + "step": 23738 + }, + { + "epoch": 0.6095509539880697, + "grad_norm": 0.7578125, + "learning_rate": 0.00014351789829571411, + "loss": 0.8367, + "step": 23739 + }, + { + "epoch": 0.6095766311839915, + "grad_norm": 0.8046875, + "learning_rate": 0.00014351387893531015, + "loss": 0.8926, + "step": 23740 + }, + { + "epoch": 0.6096023083799134, + "grad_norm": 0.76171875, + "learning_rate": 0.00014350985948818692, + "loss": 0.9099, + "step": 23741 + }, + { + "epoch": 0.6096279855758352, + "grad_norm": 0.72265625, + "learning_rate": 0.0001435058399543525, + "loss": 0.7713, + "step": 23742 + }, + { + "epoch": 0.609653662771757, + "grad_norm": 0.7890625, + "learning_rate": 0.00014350182033381485, + "loss": 0.9796, + "step": 23743 + }, + { + "epoch": 0.6096793399676789, + "grad_norm": 0.80859375, + "learning_rate": 0.000143497800626582, + "loss": 0.936, + "step": 23744 + }, + { + "epoch": 0.6097050171636007, + "grad_norm": 0.7578125, + "learning_rate": 0.000143493780832662, + "loss": 0.7795, + "step": 23745 + }, + { + "epoch": 0.6097306943595224, + "grad_norm": 0.8203125, + "learning_rate": 0.0001434897609520628, + "loss": 1.0038, + "step": 23746 + }, + { + "epoch": 0.6097563715554443, + "grad_norm": 0.72265625, + "learning_rate": 0.0001434857409847924, + "loss": 0.7233, + "step": 23747 + }, + { + "epoch": 0.6097820487513661, + "grad_norm": 0.74609375, + "learning_rate": 0.0001434817209308589, + "loss": 0.8328, + "step": 23748 + }, + { + "epoch": 0.6098077259472879, + "grad_norm": 0.734375, + "learning_rate": 0.00014347770079027022, + "loss": 0.8181, + "step": 23749 + }, + { + "epoch": 0.6098334031432098, + "grad_norm": 0.7578125, + "learning_rate": 0.00014347368056303448, + "loss": 0.7553, + "step": 23750 + }, + { + "epoch": 0.6098590803391316, + "grad_norm": 0.83203125, + "learning_rate": 0.00014346966024915954, + "loss": 0.8575, + "step": 23751 + }, + { + "epoch": 0.6098847575350533, + "grad_norm": 0.80078125, + "learning_rate": 0.00014346563984865354, + "loss": 0.8628, + "step": 23752 + }, + { + "epoch": 0.6099104347309752, + "grad_norm": 0.83984375, + "learning_rate": 0.00014346161936152442, + "loss": 1.0265, + "step": 23753 + }, + { + "epoch": 0.609936111926897, + "grad_norm": 0.8125, + "learning_rate": 0.0001434575987877802, + "loss": 1.0427, + "step": 23754 + }, + { + "epoch": 0.6099617891228188, + "grad_norm": 0.77734375, + "learning_rate": 0.00014345357812742894, + "loss": 0.8214, + "step": 23755 + }, + { + "epoch": 0.6099874663187407, + "grad_norm": 0.76953125, + "learning_rate": 0.00014344955738047866, + "loss": 0.9012, + "step": 23756 + }, + { + "epoch": 0.6100131435146625, + "grad_norm": 0.73828125, + "learning_rate": 0.0001434455365469373, + "loss": 0.8174, + "step": 23757 + }, + { + "epoch": 0.6100388207105844, + "grad_norm": 0.6875, + "learning_rate": 0.0001434415156268129, + "loss": 0.7777, + "step": 23758 + }, + { + "epoch": 0.6100644979065061, + "grad_norm": 0.8359375, + "learning_rate": 0.00014343749462011348, + "loss": 0.8814, + "step": 23759 + }, + { + "epoch": 0.6100901751024279, + "grad_norm": 0.7265625, + "learning_rate": 0.00014343347352684707, + "loss": 0.7857, + "step": 23760 + }, + { + "epoch": 0.6101158522983497, + "grad_norm": 0.83203125, + "learning_rate": 0.00014342945234702164, + "loss": 0.8356, + "step": 23761 + }, + { + "epoch": 0.6101415294942716, + "grad_norm": 0.75390625, + "learning_rate": 0.00014342543108064528, + "loss": 0.7752, + "step": 23762 + }, + { + "epoch": 0.6101672066901934, + "grad_norm": 0.796875, + "learning_rate": 0.00014342140972772594, + "loss": 0.8165, + "step": 23763 + }, + { + "epoch": 0.6101928838861153, + "grad_norm": 0.70703125, + "learning_rate": 0.00014341738828827167, + "loss": 0.7685, + "step": 23764 + }, + { + "epoch": 0.610218561082037, + "grad_norm": 0.76171875, + "learning_rate": 0.00014341336676229044, + "loss": 1.0174, + "step": 23765 + }, + { + "epoch": 0.6102442382779588, + "grad_norm": 0.76953125, + "learning_rate": 0.0001434093451497903, + "loss": 0.73, + "step": 23766 + }, + { + "epoch": 0.6102699154738807, + "grad_norm": 0.7890625, + "learning_rate": 0.00014340532345077928, + "loss": 0.8522, + "step": 23767 + }, + { + "epoch": 0.6102955926698025, + "grad_norm": 0.7265625, + "learning_rate": 0.00014340130166526534, + "loss": 0.867, + "step": 23768 + }, + { + "epoch": 0.6103212698657243, + "grad_norm": 0.796875, + "learning_rate": 0.00014339727979325654, + "loss": 1.0468, + "step": 23769 + }, + { + "epoch": 0.6103469470616462, + "grad_norm": 0.75390625, + "learning_rate": 0.00014339325783476088, + "loss": 0.9242, + "step": 23770 + }, + { + "epoch": 0.610372624257568, + "grad_norm": 0.828125, + "learning_rate": 0.00014338923578978638, + "loss": 0.9147, + "step": 23771 + }, + { + "epoch": 0.6103983014534897, + "grad_norm": 0.85546875, + "learning_rate": 0.00014338521365834104, + "loss": 0.8476, + "step": 23772 + }, + { + "epoch": 0.6104239786494116, + "grad_norm": 0.6875, + "learning_rate": 0.00014338119144043288, + "loss": 0.8872, + "step": 23773 + }, + { + "epoch": 0.6104496558453334, + "grad_norm": 0.8125, + "learning_rate": 0.00014337716913606996, + "loss": 0.9632, + "step": 23774 + }, + { + "epoch": 0.6104753330412552, + "grad_norm": 0.765625, + "learning_rate": 0.00014337314674526023, + "loss": 0.8361, + "step": 23775 + }, + { + "epoch": 0.6105010102371771, + "grad_norm": 0.78125, + "learning_rate": 0.00014336912426801178, + "loss": 0.9681, + "step": 23776 + }, + { + "epoch": 0.6105266874330989, + "grad_norm": 0.7734375, + "learning_rate": 0.00014336510170433254, + "loss": 0.8173, + "step": 23777 + }, + { + "epoch": 0.6105523646290207, + "grad_norm": 0.796875, + "learning_rate": 0.00014336107905423058, + "loss": 0.8216, + "step": 23778 + }, + { + "epoch": 0.6105780418249425, + "grad_norm": 0.80078125, + "learning_rate": 0.0001433570563177139, + "loss": 0.7298, + "step": 23779 + }, + { + "epoch": 0.6106037190208643, + "grad_norm": 0.8984375, + "learning_rate": 0.00014335303349479053, + "loss": 0.9855, + "step": 23780 + }, + { + "epoch": 0.6106293962167861, + "grad_norm": 0.8203125, + "learning_rate": 0.00014334901058546845, + "loss": 0.8321, + "step": 23781 + }, + { + "epoch": 0.610655073412708, + "grad_norm": 0.71484375, + "learning_rate": 0.00014334498758975577, + "loss": 0.8113, + "step": 23782 + }, + { + "epoch": 0.6106807506086298, + "grad_norm": 0.80859375, + "learning_rate": 0.0001433409645076604, + "loss": 0.8203, + "step": 23783 + }, + { + "epoch": 0.6107064278045516, + "grad_norm": 0.71484375, + "learning_rate": 0.00014333694133919042, + "loss": 0.8603, + "step": 23784 + }, + { + "epoch": 0.6107321050004734, + "grad_norm": 0.78125, + "learning_rate": 0.00014333291808435382, + "loss": 0.8284, + "step": 23785 + }, + { + "epoch": 0.6107577821963952, + "grad_norm": 0.8046875, + "learning_rate": 0.00014332889474315863, + "loss": 0.9514, + "step": 23786 + }, + { + "epoch": 0.610783459392317, + "grad_norm": 0.73828125, + "learning_rate": 0.00014332487131561288, + "loss": 0.8316, + "step": 23787 + }, + { + "epoch": 0.6108091365882389, + "grad_norm": 0.7421875, + "learning_rate": 0.00014332084780172455, + "loss": 0.8984, + "step": 23788 + }, + { + "epoch": 0.6108348137841607, + "grad_norm": 0.76171875, + "learning_rate": 0.0001433168242015017, + "loss": 0.8286, + "step": 23789 + }, + { + "epoch": 0.6108604909800825, + "grad_norm": 0.8046875, + "learning_rate": 0.00014331280051495235, + "loss": 0.9535, + "step": 23790 + }, + { + "epoch": 0.6108861681760044, + "grad_norm": 0.828125, + "learning_rate": 0.00014330877674208447, + "loss": 0.7915, + "step": 23791 + }, + { + "epoch": 0.6109118453719261, + "grad_norm": 0.81640625, + "learning_rate": 0.00014330475288290613, + "loss": 0.8302, + "step": 23792 + }, + { + "epoch": 0.610937522567848, + "grad_norm": 0.7890625, + "learning_rate": 0.00014330072893742533, + "loss": 0.9261, + "step": 23793 + }, + { + "epoch": 0.6109631997637698, + "grad_norm": 0.8203125, + "learning_rate": 0.00014329670490565002, + "loss": 0.9067, + "step": 23794 + }, + { + "epoch": 0.6109888769596916, + "grad_norm": 0.80859375, + "learning_rate": 0.00014329268078758838, + "loss": 0.8045, + "step": 23795 + }, + { + "epoch": 0.6110145541556135, + "grad_norm": 0.72265625, + "learning_rate": 0.00014328865658324832, + "loss": 0.7663, + "step": 23796 + }, + { + "epoch": 0.6110402313515353, + "grad_norm": 0.80078125, + "learning_rate": 0.00014328463229263788, + "loss": 0.942, + "step": 23797 + }, + { + "epoch": 0.6110659085474571, + "grad_norm": 0.7890625, + "learning_rate": 0.00014328060791576506, + "loss": 0.8341, + "step": 23798 + }, + { + "epoch": 0.6110915857433789, + "grad_norm": 0.8671875, + "learning_rate": 0.00014327658345263786, + "loss": 0.9214, + "step": 23799 + }, + { + "epoch": 0.6111172629393007, + "grad_norm": 0.76953125, + "learning_rate": 0.0001432725589032644, + "loss": 0.8713, + "step": 23800 + }, + { + "epoch": 0.6111429401352225, + "grad_norm": 0.70703125, + "learning_rate": 0.0001432685342676526, + "loss": 0.7993, + "step": 23801 + }, + { + "epoch": 0.6111686173311444, + "grad_norm": 0.74609375, + "learning_rate": 0.0001432645095458105, + "loss": 0.9252, + "step": 23802 + }, + { + "epoch": 0.6111942945270662, + "grad_norm": 0.7734375, + "learning_rate": 0.00014326048473774623, + "loss": 1.0657, + "step": 23803 + }, + { + "epoch": 0.611219971722988, + "grad_norm": 0.76953125, + "learning_rate": 0.00014325645984346766, + "loss": 0.9577, + "step": 23804 + }, + { + "epoch": 0.6112456489189098, + "grad_norm": 0.79296875, + "learning_rate": 0.00014325243486298288, + "loss": 0.8021, + "step": 23805 + }, + { + "epoch": 0.6112713261148316, + "grad_norm": 0.859375, + "learning_rate": 0.0001432484097962999, + "loss": 0.9251, + "step": 23806 + }, + { + "epoch": 0.6112970033107534, + "grad_norm": 0.78125, + "learning_rate": 0.00014324438464342676, + "loss": 0.8279, + "step": 23807 + }, + { + "epoch": 0.6113226805066753, + "grad_norm": 0.7578125, + "learning_rate": 0.00014324035940437143, + "loss": 0.8651, + "step": 23808 + }, + { + "epoch": 0.6113483577025971, + "grad_norm": 1.0, + "learning_rate": 0.00014323633407914204, + "loss": 0.7724, + "step": 23809 + }, + { + "epoch": 0.6113740348985189, + "grad_norm": 0.75390625, + "learning_rate": 0.00014323230866774647, + "loss": 0.8766, + "step": 23810 + }, + { + "epoch": 0.6113997120944408, + "grad_norm": 0.75390625, + "learning_rate": 0.00014322828317019285, + "loss": 0.8302, + "step": 23811 + }, + { + "epoch": 0.6114253892903625, + "grad_norm": 0.78125, + "learning_rate": 0.00014322425758648918, + "loss": 0.8015, + "step": 23812 + }, + { + "epoch": 0.6114510664862843, + "grad_norm": 0.69921875, + "learning_rate": 0.00014322023191664342, + "loss": 0.8653, + "step": 23813 + }, + { + "epoch": 0.6114767436822062, + "grad_norm": 0.71484375, + "learning_rate": 0.00014321620616066367, + "loss": 0.8061, + "step": 23814 + }, + { + "epoch": 0.611502420878128, + "grad_norm": 0.80078125, + "learning_rate": 0.00014321218031855795, + "loss": 0.8892, + "step": 23815 + }, + { + "epoch": 0.6115280980740498, + "grad_norm": 0.75390625, + "learning_rate": 0.0001432081543903342, + "loss": 0.7901, + "step": 23816 + }, + { + "epoch": 0.6115537752699717, + "grad_norm": 0.796875, + "learning_rate": 0.00014320412837600058, + "loss": 0.9256, + "step": 23817 + }, + { + "epoch": 0.6115794524658935, + "grad_norm": 0.71484375, + "learning_rate": 0.00014320010227556495, + "loss": 0.7163, + "step": 23818 + }, + { + "epoch": 0.6116051296618152, + "grad_norm": 0.76953125, + "learning_rate": 0.00014319607608903547, + "loss": 0.8926, + "step": 23819 + }, + { + "epoch": 0.6116308068577371, + "grad_norm": 0.78125, + "learning_rate": 0.0001431920498164201, + "loss": 0.7668, + "step": 23820 + }, + { + "epoch": 0.6116564840536589, + "grad_norm": 0.78125, + "learning_rate": 0.00014318802345772688, + "loss": 0.8242, + "step": 23821 + }, + { + "epoch": 0.6116821612495807, + "grad_norm": 0.82421875, + "learning_rate": 0.00014318399701296385, + "loss": 0.8779, + "step": 23822 + }, + { + "epoch": 0.6117078384455026, + "grad_norm": 0.73046875, + "learning_rate": 0.00014317997048213897, + "loss": 0.8245, + "step": 23823 + }, + { + "epoch": 0.6117335156414244, + "grad_norm": 0.82421875, + "learning_rate": 0.00014317594386526033, + "loss": 0.8988, + "step": 23824 + }, + { + "epoch": 0.6117591928373461, + "grad_norm": 0.7734375, + "learning_rate": 0.00014317191716233592, + "loss": 0.857, + "step": 23825 + }, + { + "epoch": 0.611784870033268, + "grad_norm": 0.76171875, + "learning_rate": 0.0001431678903733738, + "loss": 0.9419, + "step": 23826 + }, + { + "epoch": 0.6118105472291898, + "grad_norm": 0.8671875, + "learning_rate": 0.00014316386349838198, + "loss": 0.8681, + "step": 23827 + }, + { + "epoch": 0.6118362244251117, + "grad_norm": 0.7890625, + "learning_rate": 0.00014315983653736847, + "loss": 0.9692, + "step": 23828 + }, + { + "epoch": 0.6118619016210335, + "grad_norm": 0.7578125, + "learning_rate": 0.0001431558094903413, + "loss": 0.8646, + "step": 23829 + }, + { + "epoch": 0.6118875788169553, + "grad_norm": 0.73828125, + "learning_rate": 0.0001431517823573085, + "loss": 0.8675, + "step": 23830 + }, + { + "epoch": 0.6119132560128772, + "grad_norm": 0.765625, + "learning_rate": 0.0001431477551382781, + "loss": 0.8311, + "step": 23831 + }, + { + "epoch": 0.6119389332087989, + "grad_norm": 0.9140625, + "learning_rate": 0.0001431437278332581, + "loss": 0.9592, + "step": 23832 + }, + { + "epoch": 0.6119646104047207, + "grad_norm": 0.796875, + "learning_rate": 0.00014313970044225658, + "loss": 0.896, + "step": 23833 + }, + { + "epoch": 0.6119902876006426, + "grad_norm": 0.7421875, + "learning_rate": 0.0001431356729652815, + "loss": 0.8311, + "step": 23834 + }, + { + "epoch": 0.6120159647965644, + "grad_norm": 0.76953125, + "learning_rate": 0.00014313164540234095, + "loss": 0.9673, + "step": 23835 + }, + { + "epoch": 0.6120416419924862, + "grad_norm": 0.796875, + "learning_rate": 0.0001431276177534429, + "loss": 0.921, + "step": 23836 + }, + { + "epoch": 0.6120673191884081, + "grad_norm": 0.7578125, + "learning_rate": 0.00014312359001859544, + "loss": 0.7974, + "step": 23837 + }, + { + "epoch": 0.6120929963843299, + "grad_norm": 0.796875, + "learning_rate": 0.00014311956219780655, + "loss": 0.9459, + "step": 23838 + }, + { + "epoch": 0.6121186735802516, + "grad_norm": 0.75390625, + "learning_rate": 0.00014311553429108425, + "loss": 0.8298, + "step": 23839 + }, + { + "epoch": 0.6121443507761735, + "grad_norm": 0.7890625, + "learning_rate": 0.0001431115062984366, + "loss": 0.74, + "step": 23840 + }, + { + "epoch": 0.6121700279720953, + "grad_norm": 0.76171875, + "learning_rate": 0.00014310747821987162, + "loss": 0.8641, + "step": 23841 + }, + { + "epoch": 0.6121957051680171, + "grad_norm": 0.78515625, + "learning_rate": 0.0001431034500553973, + "loss": 0.8385, + "step": 23842 + }, + { + "epoch": 0.612221382363939, + "grad_norm": 0.828125, + "learning_rate": 0.0001430994218050217, + "loss": 0.844, + "step": 23843 + }, + { + "epoch": 0.6122470595598608, + "grad_norm": 0.78125, + "learning_rate": 0.00014309539346875287, + "loss": 0.82, + "step": 23844 + }, + { + "epoch": 0.6122727367557825, + "grad_norm": 0.78515625, + "learning_rate": 0.0001430913650465988, + "loss": 0.7853, + "step": 23845 + }, + { + "epoch": 0.6122984139517044, + "grad_norm": 0.7109375, + "learning_rate": 0.00014308733653856752, + "loss": 0.8881, + "step": 23846 + }, + { + "epoch": 0.6123240911476262, + "grad_norm": 0.84375, + "learning_rate": 0.0001430833079446671, + "loss": 0.8634, + "step": 23847 + }, + { + "epoch": 0.612349768343548, + "grad_norm": 0.77734375, + "learning_rate": 0.00014307927926490553, + "loss": 1.1508, + "step": 23848 + }, + { + "epoch": 0.6123754455394699, + "grad_norm": 0.75, + "learning_rate": 0.00014307525049929085, + "loss": 0.9155, + "step": 23849 + }, + { + "epoch": 0.6124011227353917, + "grad_norm": 0.67578125, + "learning_rate": 0.00014307122164783107, + "loss": 0.87, + "step": 23850 + }, + { + "epoch": 0.6124267999313135, + "grad_norm": 0.76171875, + "learning_rate": 0.00014306719271053425, + "loss": 0.8448, + "step": 23851 + }, + { + "epoch": 0.6124524771272353, + "grad_norm": 0.73046875, + "learning_rate": 0.00014306316368740838, + "loss": 0.845, + "step": 23852 + }, + { + "epoch": 0.6124781543231571, + "grad_norm": 0.7734375, + "learning_rate": 0.0001430591345784615, + "loss": 0.8363, + "step": 23853 + }, + { + "epoch": 0.612503831519079, + "grad_norm": 0.71484375, + "learning_rate": 0.00014305510538370172, + "loss": 0.8548, + "step": 23854 + }, + { + "epoch": 0.6125295087150008, + "grad_norm": 0.80859375, + "learning_rate": 0.00014305107610313697, + "loss": 0.9292, + "step": 23855 + }, + { + "epoch": 0.6125551859109226, + "grad_norm": 0.78125, + "learning_rate": 0.00014304704673677533, + "loss": 0.9033, + "step": 23856 + }, + { + "epoch": 0.6125808631068445, + "grad_norm": 0.6875, + "learning_rate": 0.0001430430172846248, + "loss": 0.7634, + "step": 23857 + }, + { + "epoch": 0.6126065403027663, + "grad_norm": 0.78515625, + "learning_rate": 0.0001430389877466934, + "loss": 0.8985, + "step": 23858 + }, + { + "epoch": 0.612632217498688, + "grad_norm": 0.7421875, + "learning_rate": 0.0001430349581229892, + "loss": 0.9281, + "step": 23859 + }, + { + "epoch": 0.6126578946946099, + "grad_norm": 0.7421875, + "learning_rate": 0.00014303092841352023, + "loss": 0.9385, + "step": 23860 + }, + { + "epoch": 0.6126835718905317, + "grad_norm": 0.80078125, + "learning_rate": 0.0001430268986182945, + "loss": 0.8637, + "step": 23861 + }, + { + "epoch": 0.6127092490864535, + "grad_norm": 0.7578125, + "learning_rate": 0.00014302286873732006, + "loss": 0.8288, + "step": 23862 + }, + { + "epoch": 0.6127349262823754, + "grad_norm": 0.734375, + "learning_rate": 0.0001430188387706049, + "loss": 0.757, + "step": 23863 + }, + { + "epoch": 0.6127606034782972, + "grad_norm": 0.80859375, + "learning_rate": 0.00014301480871815712, + "loss": 0.899, + "step": 23864 + }, + { + "epoch": 0.6127862806742189, + "grad_norm": 0.84375, + "learning_rate": 0.00014301077857998472, + "loss": 0.8449, + "step": 23865 + }, + { + "epoch": 0.6128119578701408, + "grad_norm": 0.8203125, + "learning_rate": 0.00014300674835609564, + "loss": 1.002, + "step": 23866 + }, + { + "epoch": 0.6128376350660626, + "grad_norm": 0.796875, + "learning_rate": 0.00014300271804649809, + "loss": 0.8432, + "step": 23867 + }, + { + "epoch": 0.6128633122619844, + "grad_norm": 0.76171875, + "learning_rate": 0.00014299868765119995, + "loss": 0.7391, + "step": 23868 + }, + { + "epoch": 0.6128889894579063, + "grad_norm": 0.765625, + "learning_rate": 0.00014299465717020933, + "loss": 0.905, + "step": 23869 + }, + { + "epoch": 0.6129146666538281, + "grad_norm": 0.73046875, + "learning_rate": 0.00014299062660353423, + "loss": 0.8968, + "step": 23870 + }, + { + "epoch": 0.6129403438497499, + "grad_norm": 0.7890625, + "learning_rate": 0.0001429865959511827, + "loss": 0.8766, + "step": 23871 + }, + { + "epoch": 0.6129660210456717, + "grad_norm": 0.76171875, + "learning_rate": 0.00014298256521316278, + "loss": 0.8146, + "step": 23872 + }, + { + "epoch": 0.6129916982415935, + "grad_norm": 0.828125, + "learning_rate": 0.00014297853438948247, + "loss": 1.0314, + "step": 23873 + }, + { + "epoch": 0.6130173754375153, + "grad_norm": 0.74609375, + "learning_rate": 0.00014297450348014983, + "loss": 0.801, + "step": 23874 + }, + { + "epoch": 0.6130430526334372, + "grad_norm": 0.75390625, + "learning_rate": 0.0001429704724851729, + "loss": 0.8538, + "step": 23875 + }, + { + "epoch": 0.613068729829359, + "grad_norm": 0.765625, + "learning_rate": 0.00014296644140455967, + "loss": 0.8617, + "step": 23876 + }, + { + "epoch": 0.6130944070252808, + "grad_norm": 0.7578125, + "learning_rate": 0.00014296241023831826, + "loss": 0.7339, + "step": 23877 + }, + { + "epoch": 0.6131200842212027, + "grad_norm": 0.75, + "learning_rate": 0.00014295837898645658, + "loss": 0.7521, + "step": 23878 + }, + { + "epoch": 0.6131457614171244, + "grad_norm": 0.7421875, + "learning_rate": 0.00014295434764898275, + "loss": 0.7661, + "step": 23879 + }, + { + "epoch": 0.6131714386130462, + "grad_norm": 0.71484375, + "learning_rate": 0.0001429503162259048, + "loss": 0.8971, + "step": 23880 + }, + { + "epoch": 0.6131971158089681, + "grad_norm": 0.75, + "learning_rate": 0.00014294628471723075, + "loss": 0.8428, + "step": 23881 + }, + { + "epoch": 0.6132227930048899, + "grad_norm": 0.7890625, + "learning_rate": 0.00014294225312296862, + "loss": 0.8838, + "step": 23882 + }, + { + "epoch": 0.6132484702008117, + "grad_norm": 0.7734375, + "learning_rate": 0.0001429382214431265, + "loss": 0.7773, + "step": 23883 + }, + { + "epoch": 0.6132741473967336, + "grad_norm": 0.734375, + "learning_rate": 0.00014293418967771234, + "loss": 0.7066, + "step": 23884 + }, + { + "epoch": 0.6132998245926553, + "grad_norm": 0.82421875, + "learning_rate": 0.00014293015782673423, + "loss": 0.9433, + "step": 23885 + }, + { + "epoch": 0.6133255017885771, + "grad_norm": 0.75390625, + "learning_rate": 0.0001429261258902002, + "loss": 0.8607, + "step": 23886 + }, + { + "epoch": 0.613351178984499, + "grad_norm": 0.703125, + "learning_rate": 0.00014292209386811825, + "loss": 0.8917, + "step": 23887 + }, + { + "epoch": 0.6133768561804208, + "grad_norm": 0.79296875, + "learning_rate": 0.0001429180617604965, + "loss": 0.8175, + "step": 23888 + }, + { + "epoch": 0.6134025333763427, + "grad_norm": 0.81640625, + "learning_rate": 0.00014291402956734289, + "loss": 1.0187, + "step": 23889 + }, + { + "epoch": 0.6134282105722645, + "grad_norm": 0.80859375, + "learning_rate": 0.00014290999728866548, + "loss": 0.893, + "step": 23890 + }, + { + "epoch": 0.6134538877681863, + "grad_norm": 0.75390625, + "learning_rate": 0.00014290596492447236, + "loss": 0.8377, + "step": 23891 + }, + { + "epoch": 0.613479564964108, + "grad_norm": 0.74609375, + "learning_rate": 0.00014290193247477152, + "loss": 0.8097, + "step": 23892 + }, + { + "epoch": 0.6135052421600299, + "grad_norm": 0.82421875, + "learning_rate": 0.00014289789993957095, + "loss": 0.778, + "step": 23893 + }, + { + "epoch": 0.6135309193559517, + "grad_norm": 0.71875, + "learning_rate": 0.0001428938673188788, + "loss": 0.8354, + "step": 23894 + }, + { + "epoch": 0.6135565965518736, + "grad_norm": 0.796875, + "learning_rate": 0.00014288983461270305, + "loss": 0.8823, + "step": 23895 + }, + { + "epoch": 0.6135822737477954, + "grad_norm": 0.82421875, + "learning_rate": 0.0001428858018210517, + "loss": 0.9909, + "step": 23896 + }, + { + "epoch": 0.6136079509437172, + "grad_norm": 0.7734375, + "learning_rate": 0.00014288176894393284, + "loss": 0.8594, + "step": 23897 + }, + { + "epoch": 0.6136336281396391, + "grad_norm": 0.71484375, + "learning_rate": 0.00014287773598135447, + "loss": 0.8005, + "step": 23898 + }, + { + "epoch": 0.6136593053355608, + "grad_norm": 0.77734375, + "learning_rate": 0.00014287370293332465, + "loss": 0.8369, + "step": 23899 + }, + { + "epoch": 0.6136849825314826, + "grad_norm": 0.76953125, + "learning_rate": 0.00014286966979985144, + "loss": 0.808, + "step": 23900 + }, + { + "epoch": 0.6137106597274045, + "grad_norm": 0.8046875, + "learning_rate": 0.00014286563658094282, + "loss": 0.7828, + "step": 23901 + }, + { + "epoch": 0.6137363369233263, + "grad_norm": 0.77734375, + "learning_rate": 0.0001428616032766069, + "loss": 0.7648, + "step": 23902 + }, + { + "epoch": 0.6137620141192481, + "grad_norm": 0.6953125, + "learning_rate": 0.0001428575698868516, + "loss": 0.7267, + "step": 23903 + }, + { + "epoch": 0.61378769131517, + "grad_norm": 0.79296875, + "learning_rate": 0.0001428535364116851, + "loss": 0.936, + "step": 23904 + }, + { + "epoch": 0.6138133685110917, + "grad_norm": 0.73828125, + "learning_rate": 0.00014284950285111536, + "loss": 0.8981, + "step": 23905 + }, + { + "epoch": 0.6138390457070135, + "grad_norm": 0.7421875, + "learning_rate": 0.0001428454692051504, + "loss": 0.8234, + "step": 23906 + }, + { + "epoch": 0.6138647229029354, + "grad_norm": 0.859375, + "learning_rate": 0.00014284143547379835, + "loss": 0.7524, + "step": 23907 + }, + { + "epoch": 0.6138904000988572, + "grad_norm": 0.75390625, + "learning_rate": 0.00014283740165706713, + "loss": 0.9312, + "step": 23908 + }, + { + "epoch": 0.613916077294779, + "grad_norm": 0.7578125, + "learning_rate": 0.0001428333677549649, + "loss": 0.8051, + "step": 23909 + }, + { + "epoch": 0.6139417544907009, + "grad_norm": 0.765625, + "learning_rate": 0.00014282933376749958, + "loss": 0.995, + "step": 23910 + }, + { + "epoch": 0.6139674316866227, + "grad_norm": 0.7578125, + "learning_rate": 0.00014282529969467927, + "loss": 0.9025, + "step": 23911 + }, + { + "epoch": 0.6139931088825444, + "grad_norm": 0.7109375, + "learning_rate": 0.00014282126553651203, + "loss": 0.7645, + "step": 23912 + }, + { + "epoch": 0.6140187860784663, + "grad_norm": 0.78125, + "learning_rate": 0.00014281723129300588, + "loss": 0.8651, + "step": 23913 + }, + { + "epoch": 0.6140444632743881, + "grad_norm": 0.7421875, + "learning_rate": 0.00014281319696416881, + "loss": 0.8193, + "step": 23914 + }, + { + "epoch": 0.6140701404703099, + "grad_norm": 0.84375, + "learning_rate": 0.00014280916255000897, + "loss": 0.8387, + "step": 23915 + }, + { + "epoch": 0.6140958176662318, + "grad_norm": 0.76953125, + "learning_rate": 0.00014280512805053429, + "loss": 0.7469, + "step": 23916 + }, + { + "epoch": 0.6141214948621536, + "grad_norm": 0.70703125, + "learning_rate": 0.00014280109346575287, + "loss": 0.8944, + "step": 23917 + }, + { + "epoch": 0.6141471720580755, + "grad_norm": 0.75390625, + "learning_rate": 0.00014279705879567277, + "loss": 0.8032, + "step": 23918 + }, + { + "epoch": 0.6141728492539972, + "grad_norm": 0.69921875, + "learning_rate": 0.00014279302404030192, + "loss": 0.8211, + "step": 23919 + }, + { + "epoch": 0.614198526449919, + "grad_norm": 0.71875, + "learning_rate": 0.00014278898919964852, + "loss": 0.9442, + "step": 23920 + }, + { + "epoch": 0.6142242036458408, + "grad_norm": 0.80859375, + "learning_rate": 0.00014278495427372053, + "loss": 0.8086, + "step": 23921 + }, + { + "epoch": 0.6142498808417627, + "grad_norm": 0.7265625, + "learning_rate": 0.00014278091926252592, + "loss": 0.8237, + "step": 23922 + }, + { + "epoch": 0.6142755580376845, + "grad_norm": 0.75390625, + "learning_rate": 0.00014277688416607285, + "loss": 0.6878, + "step": 23923 + }, + { + "epoch": 0.6143012352336064, + "grad_norm": 0.76171875, + "learning_rate": 0.00014277284898436926, + "loss": 0.8397, + "step": 23924 + }, + { + "epoch": 0.6143269124295281, + "grad_norm": 0.76953125, + "learning_rate": 0.0001427688137174233, + "loss": 0.8872, + "step": 23925 + }, + { + "epoch": 0.6143525896254499, + "grad_norm": 0.75, + "learning_rate": 0.00014276477836524295, + "loss": 0.7989, + "step": 23926 + }, + { + "epoch": 0.6143782668213718, + "grad_norm": 0.83203125, + "learning_rate": 0.00014276074292783624, + "loss": 0.8652, + "step": 23927 + }, + { + "epoch": 0.6144039440172936, + "grad_norm": 0.7734375, + "learning_rate": 0.00014275670740521127, + "loss": 0.7569, + "step": 23928 + }, + { + "epoch": 0.6144296212132154, + "grad_norm": 0.72265625, + "learning_rate": 0.00014275267179737603, + "loss": 0.7923, + "step": 23929 + }, + { + "epoch": 0.6144552984091373, + "grad_norm": 0.75, + "learning_rate": 0.00014274863610433855, + "loss": 0.9358, + "step": 23930 + }, + { + "epoch": 0.6144809756050591, + "grad_norm": 0.71875, + "learning_rate": 0.00014274460032610693, + "loss": 0.8424, + "step": 23931 + }, + { + "epoch": 0.6145066528009808, + "grad_norm": 0.7734375, + "learning_rate": 0.0001427405644626892, + "loss": 0.8341, + "step": 23932 + }, + { + "epoch": 0.6145323299969027, + "grad_norm": 0.78515625, + "learning_rate": 0.00014273652851409333, + "loss": 0.8215, + "step": 23933 + }, + { + "epoch": 0.6145580071928245, + "grad_norm": 0.8046875, + "learning_rate": 0.00014273249248032745, + "loss": 0.8093, + "step": 23934 + }, + { + "epoch": 0.6145836843887463, + "grad_norm": 0.8125, + "learning_rate": 0.00014272845636139955, + "loss": 0.9178, + "step": 23935 + }, + { + "epoch": 0.6146093615846682, + "grad_norm": 0.7890625, + "learning_rate": 0.00014272442015731774, + "loss": 0.8021, + "step": 23936 + }, + { + "epoch": 0.61463503878059, + "grad_norm": 0.7734375, + "learning_rate": 0.00014272038386809, + "loss": 0.89, + "step": 23937 + }, + { + "epoch": 0.6146607159765118, + "grad_norm": 0.81640625, + "learning_rate": 0.00014271634749372437, + "loss": 0.7541, + "step": 23938 + }, + { + "epoch": 0.6146863931724336, + "grad_norm": 0.875, + "learning_rate": 0.00014271231103422892, + "loss": 0.812, + "step": 23939 + }, + { + "epoch": 0.6147120703683554, + "grad_norm": 0.91796875, + "learning_rate": 0.00014270827448961172, + "loss": 0.9719, + "step": 23940 + }, + { + "epoch": 0.6147377475642772, + "grad_norm": 0.8828125, + "learning_rate": 0.00014270423785988075, + "loss": 0.7722, + "step": 23941 + }, + { + "epoch": 0.6147634247601991, + "grad_norm": 0.765625, + "learning_rate": 0.00014270020114504414, + "loss": 0.9643, + "step": 23942 + }, + { + "epoch": 0.6147891019561209, + "grad_norm": 0.828125, + "learning_rate": 0.00014269616434510986, + "loss": 0.844, + "step": 23943 + }, + { + "epoch": 0.6148147791520427, + "grad_norm": 0.703125, + "learning_rate": 0.00014269212746008598, + "loss": 0.7306, + "step": 23944 + }, + { + "epoch": 0.6148404563479645, + "grad_norm": 0.69921875, + "learning_rate": 0.00014268809048998055, + "loss": 0.8183, + "step": 23945 + }, + { + "epoch": 0.6148661335438863, + "grad_norm": 0.80078125, + "learning_rate": 0.0001426840534348016, + "loss": 0.7511, + "step": 23946 + }, + { + "epoch": 0.6148918107398081, + "grad_norm": 0.83984375, + "learning_rate": 0.00014268001629455724, + "loss": 0.9205, + "step": 23947 + }, + { + "epoch": 0.61491748793573, + "grad_norm": 0.81640625, + "learning_rate": 0.0001426759790692554, + "loss": 0.7823, + "step": 23948 + }, + { + "epoch": 0.6149431651316518, + "grad_norm": 0.8203125, + "learning_rate": 0.00014267194175890418, + "loss": 0.8354, + "step": 23949 + }, + { + "epoch": 0.6149688423275737, + "grad_norm": 0.77734375, + "learning_rate": 0.0001426679043635117, + "loss": 0.8542, + "step": 23950 + }, + { + "epoch": 0.6149945195234955, + "grad_norm": 0.7421875, + "learning_rate": 0.0001426638668830859, + "loss": 0.8327, + "step": 23951 + }, + { + "epoch": 0.6150201967194172, + "grad_norm": 0.80859375, + "learning_rate": 0.00014265982931763483, + "loss": 0.9049, + "step": 23952 + }, + { + "epoch": 0.615045873915339, + "grad_norm": 0.73828125, + "learning_rate": 0.00014265579166716666, + "loss": 0.7834, + "step": 23953 + }, + { + "epoch": 0.6150715511112609, + "grad_norm": 0.80859375, + "learning_rate": 0.0001426517539316893, + "loss": 0.9434, + "step": 23954 + }, + { + "epoch": 0.6150972283071827, + "grad_norm": 0.734375, + "learning_rate": 0.00014264771611121086, + "loss": 0.8221, + "step": 23955 + }, + { + "epoch": 0.6151229055031046, + "grad_norm": 0.7421875, + "learning_rate": 0.00014264367820573934, + "loss": 0.8473, + "step": 23956 + }, + { + "epoch": 0.6151485826990264, + "grad_norm": 0.8125, + "learning_rate": 0.00014263964021528287, + "loss": 0.8623, + "step": 23957 + }, + { + "epoch": 0.6151742598949482, + "grad_norm": 0.77734375, + "learning_rate": 0.0001426356021398494, + "loss": 0.952, + "step": 23958 + }, + { + "epoch": 0.61519993709087, + "grad_norm": 0.74609375, + "learning_rate": 0.00014263156397944707, + "loss": 0.8413, + "step": 23959 + }, + { + "epoch": 0.6152256142867918, + "grad_norm": 0.75390625, + "learning_rate": 0.00014262752573408386, + "loss": 0.7085, + "step": 23960 + }, + { + "epoch": 0.6152512914827136, + "grad_norm": 0.7734375, + "learning_rate": 0.00014262348740376785, + "loss": 0.8494, + "step": 23961 + }, + { + "epoch": 0.6152769686786355, + "grad_norm": 0.875, + "learning_rate": 0.0001426194489885071, + "loss": 0.8497, + "step": 23962 + }, + { + "epoch": 0.6153026458745573, + "grad_norm": 0.78125, + "learning_rate": 0.0001426154104883096, + "loss": 0.86, + "step": 23963 + }, + { + "epoch": 0.6153283230704791, + "grad_norm": 0.796875, + "learning_rate": 0.00014261137190318347, + "loss": 1.0108, + "step": 23964 + }, + { + "epoch": 0.6153540002664009, + "grad_norm": 0.6953125, + "learning_rate": 0.0001426073332331367, + "loss": 0.7511, + "step": 23965 + }, + { + "epoch": 0.6153796774623227, + "grad_norm": 2.734375, + "learning_rate": 0.00014260329447817735, + "loss": 0.863, + "step": 23966 + }, + { + "epoch": 0.6154053546582445, + "grad_norm": 0.7578125, + "learning_rate": 0.0001425992556383135, + "loss": 0.7872, + "step": 23967 + }, + { + "epoch": 0.6154310318541664, + "grad_norm": 0.83203125, + "learning_rate": 0.00014259521671355322, + "loss": 0.8755, + "step": 23968 + }, + { + "epoch": 0.6154567090500882, + "grad_norm": 0.80859375, + "learning_rate": 0.00014259117770390448, + "loss": 0.9195, + "step": 23969 + }, + { + "epoch": 0.61548238624601, + "grad_norm": 0.796875, + "learning_rate": 0.00014258713860937537, + "loss": 0.9195, + "step": 23970 + }, + { + "epoch": 0.6155080634419319, + "grad_norm": 0.8046875, + "learning_rate": 0.00014258309942997394, + "loss": 0.754, + "step": 23971 + }, + { + "epoch": 0.6155337406378536, + "grad_norm": 0.796875, + "learning_rate": 0.00014257906016570824, + "loss": 0.7847, + "step": 23972 + }, + { + "epoch": 0.6155594178337754, + "grad_norm": 0.78515625, + "learning_rate": 0.0001425750208165863, + "loss": 0.8725, + "step": 23973 + }, + { + "epoch": 0.6155850950296973, + "grad_norm": 0.734375, + "learning_rate": 0.00014257098138261627, + "loss": 0.8409, + "step": 23974 + }, + { + "epoch": 0.6156107722256191, + "grad_norm": 0.78515625, + "learning_rate": 0.00014256694186380603, + "loss": 0.8206, + "step": 23975 + }, + { + "epoch": 0.6156364494215409, + "grad_norm": 0.7890625, + "learning_rate": 0.00014256290226016377, + "loss": 0.8796, + "step": 23976 + }, + { + "epoch": 0.6156621266174628, + "grad_norm": 0.7421875, + "learning_rate": 0.00014255886257169749, + "loss": 0.7652, + "step": 23977 + }, + { + "epoch": 0.6156878038133845, + "grad_norm": 0.7734375, + "learning_rate": 0.0001425548227984152, + "loss": 0.7988, + "step": 23978 + }, + { + "epoch": 0.6157134810093063, + "grad_norm": 0.78515625, + "learning_rate": 0.00014255078294032504, + "loss": 0.9619, + "step": 23979 + }, + { + "epoch": 0.6157391582052282, + "grad_norm": 0.7578125, + "learning_rate": 0.00014254674299743497, + "loss": 0.9292, + "step": 23980 + }, + { + "epoch": 0.61576483540115, + "grad_norm": 0.74609375, + "learning_rate": 0.0001425427029697531, + "loss": 0.857, + "step": 23981 + }, + { + "epoch": 0.6157905125970718, + "grad_norm": 0.8515625, + "learning_rate": 0.0001425386628572875, + "loss": 0.9754, + "step": 23982 + }, + { + "epoch": 0.6158161897929937, + "grad_norm": 0.74609375, + "learning_rate": 0.00014253462266004612, + "loss": 0.8452, + "step": 23983 + }, + { + "epoch": 0.6158418669889155, + "grad_norm": 0.734375, + "learning_rate": 0.00014253058237803713, + "loss": 0.7938, + "step": 23984 + }, + { + "epoch": 0.6158675441848372, + "grad_norm": 0.77734375, + "learning_rate": 0.00014252654201126852, + "loss": 0.7936, + "step": 23985 + }, + { + "epoch": 0.6158932213807591, + "grad_norm": 0.7421875, + "learning_rate": 0.00014252250155974837, + "loss": 0.8302, + "step": 23986 + }, + { + "epoch": 0.6159188985766809, + "grad_norm": 0.69140625, + "learning_rate": 0.0001425184610234847, + "loss": 0.7964, + "step": 23987 + }, + { + "epoch": 0.6159445757726028, + "grad_norm": 0.77734375, + "learning_rate": 0.00014251442040248555, + "loss": 0.9662, + "step": 23988 + }, + { + "epoch": 0.6159702529685246, + "grad_norm": 0.80078125, + "learning_rate": 0.00014251037969675902, + "loss": 0.8869, + "step": 23989 + }, + { + "epoch": 0.6159959301644464, + "grad_norm": 0.78125, + "learning_rate": 0.00014250633890631318, + "loss": 0.9177, + "step": 23990 + }, + { + "epoch": 0.6160216073603683, + "grad_norm": 0.79296875, + "learning_rate": 0.000142502298031156, + "loss": 0.9413, + "step": 23991 + }, + { + "epoch": 0.61604728455629, + "grad_norm": 0.76171875, + "learning_rate": 0.00014249825707129558, + "loss": 0.754, + "step": 23992 + }, + { + "epoch": 0.6160729617522118, + "grad_norm": 0.76953125, + "learning_rate": 0.00014249421602674, + "loss": 0.823, + "step": 23993 + }, + { + "epoch": 0.6160986389481337, + "grad_norm": 0.796875, + "learning_rate": 0.00014249017489749728, + "loss": 0.9117, + "step": 23994 + }, + { + "epoch": 0.6161243161440555, + "grad_norm": 0.75390625, + "learning_rate": 0.00014248613368357546, + "loss": 0.8413, + "step": 23995 + }, + { + "epoch": 0.6161499933399773, + "grad_norm": 0.734375, + "learning_rate": 0.00014248209238498262, + "loss": 0.7889, + "step": 23996 + }, + { + "epoch": 0.6161756705358992, + "grad_norm": 0.74609375, + "learning_rate": 0.00014247805100172682, + "loss": 0.7985, + "step": 23997 + }, + { + "epoch": 0.6162013477318209, + "grad_norm": 0.7734375, + "learning_rate": 0.00014247400953381608, + "loss": 0.8393, + "step": 23998 + }, + { + "epoch": 0.6162270249277427, + "grad_norm": 0.78515625, + "learning_rate": 0.00014246996798125848, + "loss": 0.8489, + "step": 23999 + }, + { + "epoch": 0.6162527021236646, + "grad_norm": 0.73828125, + "learning_rate": 0.0001424659263440621, + "loss": 0.7069, + "step": 24000 + }, + { + "epoch": 0.6162527021236646, + "eval_loss": 0.8535081744194031, + "eval_runtime": 388.1277, + "eval_samples_per_second": 25.765, + "eval_steps_per_second": 0.806, + "step": 24000 + }, + { + "epoch": 0.6162783793195864, + "grad_norm": 0.7265625, + "learning_rate": 0.00014246188462223492, + "loss": 0.955, + "step": 24001 + }, + { + "epoch": 0.6163040565155082, + "grad_norm": 0.73046875, + "learning_rate": 0.0001424578428157851, + "loss": 0.8169, + "step": 24002 + }, + { + "epoch": 0.6163297337114301, + "grad_norm": 0.75, + "learning_rate": 0.00014245380092472057, + "loss": 0.8568, + "step": 24003 + }, + { + "epoch": 0.6163554109073519, + "grad_norm": 0.796875, + "learning_rate": 0.00014244975894904947, + "loss": 0.7798, + "step": 24004 + }, + { + "epoch": 0.6163810881032736, + "grad_norm": 0.71875, + "learning_rate": 0.00014244571688877986, + "loss": 0.8201, + "step": 24005 + }, + { + "epoch": 0.6164067652991955, + "grad_norm": 0.81640625, + "learning_rate": 0.00014244167474391977, + "loss": 0.9603, + "step": 24006 + }, + { + "epoch": 0.6164324424951173, + "grad_norm": 0.8359375, + "learning_rate": 0.00014243763251447724, + "loss": 0.8757, + "step": 24007 + }, + { + "epoch": 0.6164581196910391, + "grad_norm": 0.765625, + "learning_rate": 0.00014243359020046033, + "loss": 0.786, + "step": 24008 + }, + { + "epoch": 0.616483796886961, + "grad_norm": 0.828125, + "learning_rate": 0.00014242954780187712, + "loss": 1.0145, + "step": 24009 + }, + { + "epoch": 0.6165094740828828, + "grad_norm": 0.75390625, + "learning_rate": 0.00014242550531873565, + "loss": 0.8242, + "step": 24010 + }, + { + "epoch": 0.6165351512788046, + "grad_norm": 0.68359375, + "learning_rate": 0.00014242146275104397, + "loss": 0.7463, + "step": 24011 + }, + { + "epoch": 0.6165608284747264, + "grad_norm": 0.734375, + "learning_rate": 0.00014241742009881017, + "loss": 0.9347, + "step": 24012 + }, + { + "epoch": 0.6165865056706482, + "grad_norm": 0.77734375, + "learning_rate": 0.00014241337736204228, + "loss": 0.9205, + "step": 24013 + }, + { + "epoch": 0.61661218286657, + "grad_norm": 0.68359375, + "learning_rate": 0.00014240933454074837, + "loss": 0.9183, + "step": 24014 + }, + { + "epoch": 0.6166378600624919, + "grad_norm": 0.8046875, + "learning_rate": 0.00014240529163493646, + "loss": 0.8968, + "step": 24015 + }, + { + "epoch": 0.6166635372584137, + "grad_norm": 0.7890625, + "learning_rate": 0.00014240124864461464, + "loss": 1.0386, + "step": 24016 + }, + { + "epoch": 0.6166892144543356, + "grad_norm": 0.828125, + "learning_rate": 0.00014239720556979098, + "loss": 0.9445, + "step": 24017 + }, + { + "epoch": 0.6167148916502573, + "grad_norm": 0.80859375, + "learning_rate": 0.00014239316241047346, + "loss": 0.9104, + "step": 24018 + }, + { + "epoch": 0.6167405688461791, + "grad_norm": 0.7890625, + "learning_rate": 0.00014238911916667025, + "loss": 0.8307, + "step": 24019 + }, + { + "epoch": 0.616766246042101, + "grad_norm": 0.96484375, + "learning_rate": 0.00014238507583838936, + "loss": 0.8825, + "step": 24020 + }, + { + "epoch": 0.6167919232380228, + "grad_norm": 0.78125, + "learning_rate": 0.00014238103242563883, + "loss": 0.785, + "step": 24021 + }, + { + "epoch": 0.6168176004339446, + "grad_norm": 0.71484375, + "learning_rate": 0.00014237698892842674, + "loss": 0.9341, + "step": 24022 + }, + { + "epoch": 0.6168432776298665, + "grad_norm": 0.75, + "learning_rate": 0.00014237294534676112, + "loss": 0.7622, + "step": 24023 + }, + { + "epoch": 0.6168689548257883, + "grad_norm": 0.828125, + "learning_rate": 0.00014236890168065004, + "loss": 0.8937, + "step": 24024 + }, + { + "epoch": 0.61689463202171, + "grad_norm": 0.76953125, + "learning_rate": 0.0001423648579301016, + "loss": 0.727, + "step": 24025 + }, + { + "epoch": 0.6169203092176319, + "grad_norm": 0.76171875, + "learning_rate": 0.00014236081409512382, + "loss": 0.9327, + "step": 24026 + }, + { + "epoch": 0.6169459864135537, + "grad_norm": 0.8203125, + "learning_rate": 0.00014235677017572474, + "loss": 0.8425, + "step": 24027 + }, + { + "epoch": 0.6169716636094755, + "grad_norm": 0.796875, + "learning_rate": 0.00014235272617191245, + "loss": 0.8411, + "step": 24028 + }, + { + "epoch": 0.6169973408053974, + "grad_norm": 0.76953125, + "learning_rate": 0.000142348682083695, + "loss": 0.943, + "step": 24029 + }, + { + "epoch": 0.6170230180013192, + "grad_norm": 0.6640625, + "learning_rate": 0.00014234463791108046, + "loss": 0.763, + "step": 24030 + }, + { + "epoch": 0.617048695197241, + "grad_norm": 0.71875, + "learning_rate": 0.00014234059365407685, + "loss": 0.8596, + "step": 24031 + }, + { + "epoch": 0.6170743723931628, + "grad_norm": 0.7578125, + "learning_rate": 0.00014233654931269233, + "loss": 0.8665, + "step": 24032 + }, + { + "epoch": 0.6171000495890846, + "grad_norm": 0.78515625, + "learning_rate": 0.00014233250488693484, + "loss": 0.7613, + "step": 24033 + }, + { + "epoch": 0.6171257267850064, + "grad_norm": 0.78515625, + "learning_rate": 0.00014232846037681252, + "loss": 0.8755, + "step": 24034 + }, + { + "epoch": 0.6171514039809283, + "grad_norm": 0.75390625, + "learning_rate": 0.00014232441578233338, + "loss": 0.8267, + "step": 24035 + }, + { + "epoch": 0.6171770811768501, + "grad_norm": 0.88671875, + "learning_rate": 0.0001423203711035055, + "loss": 0.8681, + "step": 24036 + }, + { + "epoch": 0.6172027583727719, + "grad_norm": 0.7890625, + "learning_rate": 0.00014231632634033697, + "loss": 0.7878, + "step": 24037 + }, + { + "epoch": 0.6172284355686937, + "grad_norm": 0.7734375, + "learning_rate": 0.00014231228149283578, + "loss": 0.7909, + "step": 24038 + }, + { + "epoch": 0.6172541127646155, + "grad_norm": 0.765625, + "learning_rate": 0.00014230823656101007, + "loss": 0.9142, + "step": 24039 + }, + { + "epoch": 0.6172797899605373, + "grad_norm": 0.81640625, + "learning_rate": 0.00014230419154486788, + "loss": 0.8563, + "step": 24040 + }, + { + "epoch": 0.6173054671564592, + "grad_norm": 0.71875, + "learning_rate": 0.0001423001464444172, + "loss": 0.7778, + "step": 24041 + }, + { + "epoch": 0.617331144352381, + "grad_norm": 0.8125, + "learning_rate": 0.0001422961012596662, + "loss": 0.9206, + "step": 24042 + }, + { + "epoch": 0.6173568215483028, + "grad_norm": 0.69921875, + "learning_rate": 0.00014229205599062286, + "loss": 0.8704, + "step": 24043 + }, + { + "epoch": 0.6173824987442247, + "grad_norm": 0.7421875, + "learning_rate": 0.0001422880106372953, + "loss": 0.9154, + "step": 24044 + }, + { + "epoch": 0.6174081759401464, + "grad_norm": 0.78125, + "learning_rate": 0.00014228396519969154, + "loss": 0.9271, + "step": 24045 + }, + { + "epoch": 0.6174338531360682, + "grad_norm": 0.74609375, + "learning_rate": 0.0001422799196778197, + "loss": 0.8856, + "step": 24046 + }, + { + "epoch": 0.6174595303319901, + "grad_norm": 0.765625, + "learning_rate": 0.00014227587407168773, + "loss": 0.872, + "step": 24047 + }, + { + "epoch": 0.6174852075279119, + "grad_norm": 0.7734375, + "learning_rate": 0.00014227182838130379, + "loss": 0.9717, + "step": 24048 + }, + { + "epoch": 0.6175108847238338, + "grad_norm": 0.80078125, + "learning_rate": 0.00014226778260667587, + "loss": 0.9607, + "step": 24049 + }, + { + "epoch": 0.6175365619197556, + "grad_norm": 0.765625, + "learning_rate": 0.00014226373674781215, + "loss": 0.9681, + "step": 24050 + }, + { + "epoch": 0.6175622391156774, + "grad_norm": 0.74609375, + "learning_rate": 0.0001422596908047206, + "loss": 0.9095, + "step": 24051 + }, + { + "epoch": 0.6175879163115991, + "grad_norm": 0.765625, + "learning_rate": 0.00014225564477740927, + "loss": 0.7881, + "step": 24052 + }, + { + "epoch": 0.617613593507521, + "grad_norm": 0.74609375, + "learning_rate": 0.00014225159866588632, + "loss": 0.8113, + "step": 24053 + }, + { + "epoch": 0.6176392707034428, + "grad_norm": 0.91796875, + "learning_rate": 0.0001422475524701597, + "loss": 1.0303, + "step": 24054 + }, + { + "epoch": 0.6176649478993647, + "grad_norm": 0.76171875, + "learning_rate": 0.00014224350619023753, + "loss": 0.8097, + "step": 24055 + }, + { + "epoch": 0.6176906250952865, + "grad_norm": 0.7421875, + "learning_rate": 0.00014223945982612786, + "loss": 0.8139, + "step": 24056 + }, + { + "epoch": 0.6177163022912083, + "grad_norm": 0.72265625, + "learning_rate": 0.0001422354133778388, + "loss": 0.7276, + "step": 24057 + }, + { + "epoch": 0.61774197948713, + "grad_norm": 0.7421875, + "learning_rate": 0.00014223136684537833, + "loss": 0.9204, + "step": 24058 + }, + { + "epoch": 0.6177676566830519, + "grad_norm": 0.7890625, + "learning_rate": 0.0001422273202287546, + "loss": 0.8448, + "step": 24059 + }, + { + "epoch": 0.6177933338789737, + "grad_norm": 0.671875, + "learning_rate": 0.0001422232735279756, + "loss": 0.7436, + "step": 24060 + }, + { + "epoch": 0.6178190110748956, + "grad_norm": 0.80859375, + "learning_rate": 0.00014221922674304945, + "loss": 0.8206, + "step": 24061 + }, + { + "epoch": 0.6178446882708174, + "grad_norm": 0.8359375, + "learning_rate": 0.0001422151798739842, + "loss": 0.8919, + "step": 24062 + }, + { + "epoch": 0.6178703654667392, + "grad_norm": 0.7734375, + "learning_rate": 0.0001422111329207879, + "loss": 0.8378, + "step": 24063 + }, + { + "epoch": 0.6178960426626611, + "grad_norm": 0.796875, + "learning_rate": 0.00014220708588346862, + "loss": 0.8014, + "step": 24064 + }, + { + "epoch": 0.6179217198585828, + "grad_norm": 0.73046875, + "learning_rate": 0.00014220303876203444, + "loss": 0.7597, + "step": 24065 + }, + { + "epoch": 0.6179473970545046, + "grad_norm": 0.75, + "learning_rate": 0.00014219899155649344, + "loss": 0.6869, + "step": 24066 + }, + { + "epoch": 0.6179730742504265, + "grad_norm": 0.7734375, + "learning_rate": 0.00014219494426685362, + "loss": 0.7804, + "step": 24067 + }, + { + "epoch": 0.6179987514463483, + "grad_norm": 0.796875, + "learning_rate": 0.0001421908968931231, + "loss": 0.8078, + "step": 24068 + }, + { + "epoch": 0.6180244286422701, + "grad_norm": 0.796875, + "learning_rate": 0.00014218684943530994, + "loss": 0.8935, + "step": 24069 + }, + { + "epoch": 0.618050105838192, + "grad_norm": 0.8125, + "learning_rate": 0.0001421828018934222, + "loss": 0.8353, + "step": 24070 + }, + { + "epoch": 0.6180757830341138, + "grad_norm": 0.82421875, + "learning_rate": 0.00014217875426746795, + "loss": 0.8178, + "step": 24071 + }, + { + "epoch": 0.6181014602300355, + "grad_norm": 0.78125, + "learning_rate": 0.00014217470655745524, + "loss": 0.8116, + "step": 24072 + }, + { + "epoch": 0.6181271374259574, + "grad_norm": 0.75, + "learning_rate": 0.00014217065876339215, + "loss": 0.7541, + "step": 24073 + }, + { + "epoch": 0.6181528146218792, + "grad_norm": 0.7890625, + "learning_rate": 0.00014216661088528677, + "loss": 0.8726, + "step": 24074 + }, + { + "epoch": 0.618178491817801, + "grad_norm": 0.7734375, + "learning_rate": 0.0001421625629231471, + "loss": 0.8922, + "step": 24075 + }, + { + "epoch": 0.6182041690137229, + "grad_norm": 0.78125, + "learning_rate": 0.00014215851487698128, + "loss": 0.9765, + "step": 24076 + }, + { + "epoch": 0.6182298462096447, + "grad_norm": 0.7734375, + "learning_rate": 0.00014215446674679732, + "loss": 0.876, + "step": 24077 + }, + { + "epoch": 0.6182555234055664, + "grad_norm": 0.75390625, + "learning_rate": 0.0001421504185326034, + "loss": 0.8905, + "step": 24078 + }, + { + "epoch": 0.6182812006014883, + "grad_norm": 0.765625, + "learning_rate": 0.00014214637023440743, + "loss": 0.7847, + "step": 24079 + }, + { + "epoch": 0.6183068777974101, + "grad_norm": 0.7734375, + "learning_rate": 0.00014214232185221753, + "loss": 0.8508, + "step": 24080 + }, + { + "epoch": 0.618332554993332, + "grad_norm": 0.77734375, + "learning_rate": 0.00014213827338604182, + "loss": 0.9267, + "step": 24081 + }, + { + "epoch": 0.6183582321892538, + "grad_norm": 0.69921875, + "learning_rate": 0.00014213422483588836, + "loss": 0.7335, + "step": 24082 + }, + { + "epoch": 0.6183839093851756, + "grad_norm": 0.78125, + "learning_rate": 0.0001421301762017652, + "loss": 0.823, + "step": 24083 + }, + { + "epoch": 0.6184095865810975, + "grad_norm": 0.71875, + "learning_rate": 0.00014212612748368034, + "loss": 0.6867, + "step": 24084 + }, + { + "epoch": 0.6184352637770192, + "grad_norm": 0.7734375, + "learning_rate": 0.000142122078681642, + "loss": 0.8424, + "step": 24085 + }, + { + "epoch": 0.618460940972941, + "grad_norm": 0.84765625, + "learning_rate": 0.0001421180297956581, + "loss": 0.85, + "step": 24086 + }, + { + "epoch": 0.6184866181688629, + "grad_norm": 0.7421875, + "learning_rate": 0.00014211398082573676, + "loss": 0.7758, + "step": 24087 + }, + { + "epoch": 0.6185122953647847, + "grad_norm": 0.8203125, + "learning_rate": 0.0001421099317718861, + "loss": 0.8025, + "step": 24088 + }, + { + "epoch": 0.6185379725607065, + "grad_norm": 0.66015625, + "learning_rate": 0.00014210588263411413, + "loss": 0.7958, + "step": 24089 + }, + { + "epoch": 0.6185636497566284, + "grad_norm": 0.90234375, + "learning_rate": 0.00014210183341242894, + "loss": 0.9067, + "step": 24090 + }, + { + "epoch": 0.6185893269525502, + "grad_norm": 0.80078125, + "learning_rate": 0.00014209778410683864, + "loss": 0.9677, + "step": 24091 + }, + { + "epoch": 0.6186150041484719, + "grad_norm": 0.77734375, + "learning_rate": 0.0001420937347173512, + "loss": 0.7912, + "step": 24092 + }, + { + "epoch": 0.6186406813443938, + "grad_norm": 0.75390625, + "learning_rate": 0.0001420896852439748, + "loss": 0.9731, + "step": 24093 + }, + { + "epoch": 0.6186663585403156, + "grad_norm": 0.796875, + "learning_rate": 0.00014208563568671742, + "loss": 0.8258, + "step": 24094 + }, + { + "epoch": 0.6186920357362374, + "grad_norm": 0.75390625, + "learning_rate": 0.00014208158604558718, + "loss": 0.7907, + "step": 24095 + }, + { + "epoch": 0.6187177129321593, + "grad_norm": 0.71875, + "learning_rate": 0.00014207753632059213, + "loss": 0.7692, + "step": 24096 + }, + { + "epoch": 0.6187433901280811, + "grad_norm": 0.72265625, + "learning_rate": 0.00014207348651174034, + "loss": 0.7722, + "step": 24097 + }, + { + "epoch": 0.6187690673240028, + "grad_norm": 0.66015625, + "learning_rate": 0.00014206943661903993, + "loss": 0.8462, + "step": 24098 + }, + { + "epoch": 0.6187947445199247, + "grad_norm": 0.7421875, + "learning_rate": 0.00014206538664249894, + "loss": 0.9011, + "step": 24099 + }, + { + "epoch": 0.6188204217158465, + "grad_norm": 0.76171875, + "learning_rate": 0.00014206133658212542, + "loss": 0.8164, + "step": 24100 + }, + { + "epoch": 0.6188460989117683, + "grad_norm": 0.76171875, + "learning_rate": 0.00014205728643792745, + "loss": 0.8746, + "step": 24101 + }, + { + "epoch": 0.6188717761076902, + "grad_norm": 0.7890625, + "learning_rate": 0.0001420532362099131, + "loss": 0.8844, + "step": 24102 + }, + { + "epoch": 0.618897453303612, + "grad_norm": 0.7890625, + "learning_rate": 0.00014204918589809045, + "loss": 0.8886, + "step": 24103 + }, + { + "epoch": 0.6189231304995338, + "grad_norm": 0.796875, + "learning_rate": 0.00014204513550246758, + "loss": 0.7275, + "step": 24104 + }, + { + "epoch": 0.6189488076954556, + "grad_norm": 0.76953125, + "learning_rate": 0.00014204108502305254, + "loss": 0.8238, + "step": 24105 + }, + { + "epoch": 0.6189744848913774, + "grad_norm": 0.8125, + "learning_rate": 0.00014203703445985342, + "loss": 1.0573, + "step": 24106 + }, + { + "epoch": 0.6190001620872992, + "grad_norm": 0.75, + "learning_rate": 0.0001420329838128783, + "loss": 0.909, + "step": 24107 + }, + { + "epoch": 0.6190258392832211, + "grad_norm": 0.76953125, + "learning_rate": 0.0001420289330821352, + "loss": 0.6738, + "step": 24108 + }, + { + "epoch": 0.6190515164791429, + "grad_norm": 0.79296875, + "learning_rate": 0.00014202488226763228, + "loss": 0.6968, + "step": 24109 + }, + { + "epoch": 0.6190771936750648, + "grad_norm": 0.86328125, + "learning_rate": 0.00014202083136937756, + "loss": 0.7834, + "step": 24110 + }, + { + "epoch": 0.6191028708709866, + "grad_norm": 0.7109375, + "learning_rate": 0.00014201678038737908, + "loss": 0.7251, + "step": 24111 + }, + { + "epoch": 0.6191285480669083, + "grad_norm": 0.7890625, + "learning_rate": 0.00014201272932164502, + "loss": 0.8858, + "step": 24112 + }, + { + "epoch": 0.6191542252628301, + "grad_norm": 0.74609375, + "learning_rate": 0.00014200867817218334, + "loss": 0.8791, + "step": 24113 + }, + { + "epoch": 0.619179902458752, + "grad_norm": 0.8046875, + "learning_rate": 0.00014200462693900217, + "loss": 0.9492, + "step": 24114 + }, + { + "epoch": 0.6192055796546738, + "grad_norm": 0.7734375, + "learning_rate": 0.00014200057562210956, + "loss": 0.852, + "step": 24115 + }, + { + "epoch": 0.6192312568505957, + "grad_norm": 0.78515625, + "learning_rate": 0.00014199652422151358, + "loss": 0.8957, + "step": 24116 + }, + { + "epoch": 0.6192569340465175, + "grad_norm": 0.76171875, + "learning_rate": 0.00014199247273722233, + "loss": 0.8908, + "step": 24117 + }, + { + "epoch": 0.6192826112424392, + "grad_norm": 0.75, + "learning_rate": 0.0001419884211692439, + "loss": 0.8311, + "step": 24118 + }, + { + "epoch": 0.619308288438361, + "grad_norm": 0.80078125, + "learning_rate": 0.00014198436951758634, + "loss": 0.9164, + "step": 24119 + }, + { + "epoch": 0.6193339656342829, + "grad_norm": 0.734375, + "learning_rate": 0.00014198031778225772, + "loss": 0.8523, + "step": 24120 + }, + { + "epoch": 0.6193596428302047, + "grad_norm": 0.8515625, + "learning_rate": 0.0001419762659632661, + "loss": 0.8844, + "step": 24121 + }, + { + "epoch": 0.6193853200261266, + "grad_norm": 0.73828125, + "learning_rate": 0.00014197221406061957, + "loss": 0.8315, + "step": 24122 + }, + { + "epoch": 0.6194109972220484, + "grad_norm": 0.79296875, + "learning_rate": 0.00014196816207432623, + "loss": 0.7732, + "step": 24123 + }, + { + "epoch": 0.6194366744179702, + "grad_norm": 0.78515625, + "learning_rate": 0.00014196411000439414, + "loss": 0.869, + "step": 24124 + }, + { + "epoch": 0.619462351613892, + "grad_norm": 0.78515625, + "learning_rate": 0.0001419600578508314, + "loss": 0.901, + "step": 24125 + }, + { + "epoch": 0.6194880288098138, + "grad_norm": 0.75390625, + "learning_rate": 0.00014195600561364596, + "loss": 0.8406, + "step": 24126 + }, + { + "epoch": 0.6195137060057356, + "grad_norm": 0.8046875, + "learning_rate": 0.00014195195329284605, + "loss": 0.896, + "step": 24127 + }, + { + "epoch": 0.6195393832016575, + "grad_norm": 0.79296875, + "learning_rate": 0.0001419479008884397, + "loss": 0.9681, + "step": 24128 + }, + { + "epoch": 0.6195650603975793, + "grad_norm": 0.83203125, + "learning_rate": 0.00014194384840043493, + "loss": 0.9114, + "step": 24129 + }, + { + "epoch": 0.6195907375935011, + "grad_norm": 0.75390625, + "learning_rate": 0.0001419397958288399, + "loss": 0.9282, + "step": 24130 + }, + { + "epoch": 0.619616414789423, + "grad_norm": 0.81640625, + "learning_rate": 0.00014193574317366263, + "loss": 1.0562, + "step": 24131 + }, + { + "epoch": 0.6196420919853447, + "grad_norm": 0.796875, + "learning_rate": 0.00014193169043491118, + "loss": 1.0004, + "step": 24132 + }, + { + "epoch": 0.6196677691812665, + "grad_norm": 0.76953125, + "learning_rate": 0.00014192763761259372, + "loss": 0.8772, + "step": 24133 + }, + { + "epoch": 0.6196934463771884, + "grad_norm": 0.76171875, + "learning_rate": 0.0001419235847067182, + "loss": 0.8123, + "step": 24134 + }, + { + "epoch": 0.6197191235731102, + "grad_norm": 0.75, + "learning_rate": 0.00014191953171729282, + "loss": 0.7759, + "step": 24135 + }, + { + "epoch": 0.619744800769032, + "grad_norm": 0.76171875, + "learning_rate": 0.00014191547864432556, + "loss": 0.7693, + "step": 24136 + }, + { + "epoch": 0.6197704779649539, + "grad_norm": 0.734375, + "learning_rate": 0.00014191142548782454, + "loss": 0.7583, + "step": 24137 + }, + { + "epoch": 0.6197961551608756, + "grad_norm": 0.72265625, + "learning_rate": 0.0001419073722477979, + "loss": 0.871, + "step": 24138 + }, + { + "epoch": 0.6198218323567974, + "grad_norm": 0.8046875, + "learning_rate": 0.00014190331892425357, + "loss": 0.6903, + "step": 24139 + }, + { + "epoch": 0.6198475095527193, + "grad_norm": 0.7890625, + "learning_rate": 0.00014189926551719974, + "loss": 0.847, + "step": 24140 + }, + { + "epoch": 0.6198731867486411, + "grad_norm": 0.7265625, + "learning_rate": 0.00014189521202664446, + "loss": 0.7369, + "step": 24141 + }, + { + "epoch": 0.619898863944563, + "grad_norm": 0.80859375, + "learning_rate": 0.0001418911584525958, + "loss": 0.979, + "step": 24142 + }, + { + "epoch": 0.6199245411404848, + "grad_norm": 0.77734375, + "learning_rate": 0.00014188710479506183, + "loss": 0.8958, + "step": 24143 + }, + { + "epoch": 0.6199502183364066, + "grad_norm": 0.75, + "learning_rate": 0.00014188305105405067, + "loss": 0.8082, + "step": 24144 + }, + { + "epoch": 0.6199758955323283, + "grad_norm": 0.77734375, + "learning_rate": 0.0001418789972295704, + "loss": 0.7373, + "step": 24145 + }, + { + "epoch": 0.6200015727282502, + "grad_norm": 0.7734375, + "learning_rate": 0.00014187494332162904, + "loss": 0.8214, + "step": 24146 + }, + { + "epoch": 0.620027249924172, + "grad_norm": 0.7578125, + "learning_rate": 0.0001418708893302347, + "loss": 0.8463, + "step": 24147 + }, + { + "epoch": 0.6200529271200939, + "grad_norm": 0.796875, + "learning_rate": 0.00014186683525539543, + "loss": 0.7425, + "step": 24148 + }, + { + "epoch": 0.6200786043160157, + "grad_norm": 0.76953125, + "learning_rate": 0.00014186278109711936, + "loss": 0.8083, + "step": 24149 + }, + { + "epoch": 0.6201042815119375, + "grad_norm": 0.734375, + "learning_rate": 0.00014185872685541456, + "loss": 0.922, + "step": 24150 + }, + { + "epoch": 0.6201299587078594, + "grad_norm": 0.8046875, + "learning_rate": 0.00014185467253028912, + "loss": 0.8633, + "step": 24151 + }, + { + "epoch": 0.6201556359037811, + "grad_norm": 0.8359375, + "learning_rate": 0.00014185061812175105, + "loss": 0.9772, + "step": 24152 + }, + { + "epoch": 0.6201813130997029, + "grad_norm": 0.734375, + "learning_rate": 0.0001418465636298085, + "loss": 0.8045, + "step": 24153 + }, + { + "epoch": 0.6202069902956248, + "grad_norm": 0.83984375, + "learning_rate": 0.00014184250905446955, + "loss": 0.8826, + "step": 24154 + }, + { + "epoch": 0.6202326674915466, + "grad_norm": 0.80859375, + "learning_rate": 0.00014183845439574223, + "loss": 1.018, + "step": 24155 + }, + { + "epoch": 0.6202583446874684, + "grad_norm": 0.78515625, + "learning_rate": 0.00014183439965363465, + "loss": 0.7751, + "step": 24156 + }, + { + "epoch": 0.6202840218833903, + "grad_norm": 0.75, + "learning_rate": 0.00014183034482815492, + "loss": 0.8843, + "step": 24157 + }, + { + "epoch": 0.620309699079312, + "grad_norm": 0.796875, + "learning_rate": 0.00014182628991931106, + "loss": 0.8724, + "step": 24158 + }, + { + "epoch": 0.6203353762752338, + "grad_norm": 0.81640625, + "learning_rate": 0.0001418222349271112, + "loss": 0.9328, + "step": 24159 + }, + { + "epoch": 0.6203610534711557, + "grad_norm": 0.76171875, + "learning_rate": 0.0001418181798515634, + "loss": 0.8166, + "step": 24160 + }, + { + "epoch": 0.6203867306670775, + "grad_norm": 0.7578125, + "learning_rate": 0.0001418141246926757, + "loss": 0.7853, + "step": 24161 + }, + { + "epoch": 0.6204124078629993, + "grad_norm": 0.7421875, + "learning_rate": 0.00014181006945045632, + "loss": 0.8433, + "step": 24162 + }, + { + "epoch": 0.6204380850589212, + "grad_norm": 0.73828125, + "learning_rate": 0.0001418060141249132, + "loss": 0.8394, + "step": 24163 + }, + { + "epoch": 0.620463762254843, + "grad_norm": 0.9921875, + "learning_rate": 0.00014180195871605445, + "loss": 0.9202, + "step": 24164 + }, + { + "epoch": 0.6204894394507647, + "grad_norm": 0.71484375, + "learning_rate": 0.0001417979032238882, + "loss": 0.8508, + "step": 24165 + }, + { + "epoch": 0.6205151166466866, + "grad_norm": 0.82421875, + "learning_rate": 0.00014179384764842248, + "loss": 0.8218, + "step": 24166 + }, + { + "epoch": 0.6205407938426084, + "grad_norm": 0.7578125, + "learning_rate": 0.0001417897919896654, + "loss": 0.922, + "step": 24167 + }, + { + "epoch": 0.6205664710385302, + "grad_norm": 0.734375, + "learning_rate": 0.00014178573624762505, + "loss": 0.7886, + "step": 24168 + }, + { + "epoch": 0.6205921482344521, + "grad_norm": 0.7421875, + "learning_rate": 0.0001417816804223095, + "loss": 0.8247, + "step": 24169 + }, + { + "epoch": 0.6206178254303739, + "grad_norm": 0.79296875, + "learning_rate": 0.00014177762451372686, + "loss": 0.9198, + "step": 24170 + }, + { + "epoch": 0.6206435026262956, + "grad_norm": 0.7421875, + "learning_rate": 0.00014177356852188518, + "loss": 0.8279, + "step": 24171 + }, + { + "epoch": 0.6206691798222175, + "grad_norm": 0.7421875, + "learning_rate": 0.0001417695124467925, + "loss": 0.7155, + "step": 24172 + }, + { + "epoch": 0.6206948570181393, + "grad_norm": 0.80078125, + "learning_rate": 0.00014176545628845702, + "loss": 0.8452, + "step": 24173 + }, + { + "epoch": 0.6207205342140611, + "grad_norm": 0.7578125, + "learning_rate": 0.00014176140004688672, + "loss": 0.8729, + "step": 24174 + }, + { + "epoch": 0.620746211409983, + "grad_norm": 0.8515625, + "learning_rate": 0.00014175734372208972, + "loss": 0.8587, + "step": 24175 + }, + { + "epoch": 0.6207718886059048, + "grad_norm": 0.77734375, + "learning_rate": 0.0001417532873140741, + "loss": 0.8745, + "step": 24176 + }, + { + "epoch": 0.6207975658018267, + "grad_norm": 0.76171875, + "learning_rate": 0.00014174923082284796, + "loss": 0.7521, + "step": 24177 + }, + { + "epoch": 0.6208232429977484, + "grad_norm": 0.75390625, + "learning_rate": 0.0001417451742484194, + "loss": 0.8374, + "step": 24178 + }, + { + "epoch": 0.6208489201936702, + "grad_norm": 0.76171875, + "learning_rate": 0.0001417411175907965, + "loss": 0.8777, + "step": 24179 + }, + { + "epoch": 0.620874597389592, + "grad_norm": 0.80078125, + "learning_rate": 0.00014173706084998726, + "loss": 0.8511, + "step": 24180 + }, + { + "epoch": 0.6209002745855139, + "grad_norm": 0.80078125, + "learning_rate": 0.00014173300402599984, + "loss": 0.7418, + "step": 24181 + }, + { + "epoch": 0.6209259517814357, + "grad_norm": 0.74609375, + "learning_rate": 0.00014172894711884232, + "loss": 0.7679, + "step": 24182 + }, + { + "epoch": 0.6209516289773576, + "grad_norm": 0.71875, + "learning_rate": 0.00014172489012852276, + "loss": 0.8518, + "step": 24183 + }, + { + "epoch": 0.6209773061732794, + "grad_norm": 0.83984375, + "learning_rate": 0.00014172083305504933, + "loss": 1.0647, + "step": 24184 + }, + { + "epoch": 0.6210029833692011, + "grad_norm": 0.83984375, + "learning_rate": 0.00014171677589843, + "loss": 0.8026, + "step": 24185 + }, + { + "epoch": 0.621028660565123, + "grad_norm": 0.78125, + "learning_rate": 0.0001417127186586729, + "loss": 0.8576, + "step": 24186 + }, + { + "epoch": 0.6210543377610448, + "grad_norm": 0.8359375, + "learning_rate": 0.00014170866133578612, + "loss": 0.8124, + "step": 24187 + }, + { + "epoch": 0.6210800149569666, + "grad_norm": 0.83984375, + "learning_rate": 0.00014170460392977775, + "loss": 0.8787, + "step": 24188 + }, + { + "epoch": 0.6211056921528885, + "grad_norm": 0.796875, + "learning_rate": 0.00014170054644065586, + "loss": 0.8988, + "step": 24189 + }, + { + "epoch": 0.6211313693488103, + "grad_norm": 0.734375, + "learning_rate": 0.0001416964888684286, + "loss": 0.8365, + "step": 24190 + }, + { + "epoch": 0.621157046544732, + "grad_norm": 0.69921875, + "learning_rate": 0.00014169243121310397, + "loss": 0.6968, + "step": 24191 + }, + { + "epoch": 0.6211827237406539, + "grad_norm": 0.859375, + "learning_rate": 0.00014168837347469006, + "loss": 0.8961, + "step": 24192 + }, + { + "epoch": 0.6212084009365757, + "grad_norm": 0.7578125, + "learning_rate": 0.000141684315653195, + "loss": 0.7515, + "step": 24193 + }, + { + "epoch": 0.6212340781324975, + "grad_norm": 0.78125, + "learning_rate": 0.00014168025774862687, + "loss": 0.9541, + "step": 24194 + }, + { + "epoch": 0.6212597553284194, + "grad_norm": 0.8359375, + "learning_rate": 0.00014167619976099377, + "loss": 0.9487, + "step": 24195 + }, + { + "epoch": 0.6212854325243412, + "grad_norm": 0.765625, + "learning_rate": 0.00014167214169030377, + "loss": 0.9544, + "step": 24196 + }, + { + "epoch": 0.621311109720263, + "grad_norm": 0.80078125, + "learning_rate": 0.00014166808353656497, + "loss": 0.8636, + "step": 24197 + }, + { + "epoch": 0.6213367869161848, + "grad_norm": 0.7578125, + "learning_rate": 0.00014166402529978538, + "loss": 0.7669, + "step": 24198 + }, + { + "epoch": 0.6213624641121066, + "grad_norm": 0.74609375, + "learning_rate": 0.00014165996697997318, + "loss": 0.8034, + "step": 24199 + }, + { + "epoch": 0.6213881413080284, + "grad_norm": 0.7578125, + "learning_rate": 0.00014165590857713646, + "loss": 0.8043, + "step": 24200 + }, + { + "epoch": 0.6214138185039503, + "grad_norm": 0.78515625, + "learning_rate": 0.00014165185009128324, + "loss": 0.7653, + "step": 24201 + }, + { + "epoch": 0.6214394956998721, + "grad_norm": 0.76953125, + "learning_rate": 0.00014164779152242164, + "loss": 0.8248, + "step": 24202 + }, + { + "epoch": 0.621465172895794, + "grad_norm": 0.7890625, + "learning_rate": 0.00014164373287055978, + "loss": 0.8311, + "step": 24203 + }, + { + "epoch": 0.6214908500917158, + "grad_norm": 0.87109375, + "learning_rate": 0.00014163967413570574, + "loss": 0.9767, + "step": 24204 + }, + { + "epoch": 0.6215165272876375, + "grad_norm": 0.7890625, + "learning_rate": 0.00014163561531786755, + "loss": 0.883, + "step": 24205 + }, + { + "epoch": 0.6215422044835593, + "grad_norm": 0.80859375, + "learning_rate": 0.00014163155641705336, + "loss": 0.8805, + "step": 24206 + }, + { + "epoch": 0.6215678816794812, + "grad_norm": 0.75390625, + "learning_rate": 0.00014162749743327124, + "loss": 0.8574, + "step": 24207 + }, + { + "epoch": 0.621593558875403, + "grad_norm": 0.7578125, + "learning_rate": 0.00014162343836652928, + "loss": 0.9026, + "step": 24208 + }, + { + "epoch": 0.6216192360713249, + "grad_norm": 0.75390625, + "learning_rate": 0.00014161937921683553, + "loss": 0.8156, + "step": 24209 + }, + { + "epoch": 0.6216449132672467, + "grad_norm": 0.78125, + "learning_rate": 0.00014161531998419817, + "loss": 0.8751, + "step": 24210 + }, + { + "epoch": 0.6216705904631684, + "grad_norm": 0.83984375, + "learning_rate": 0.0001416112606686252, + "loss": 0.8589, + "step": 24211 + }, + { + "epoch": 0.6216962676590903, + "grad_norm": 0.70703125, + "learning_rate": 0.00014160720127012475, + "loss": 0.7776, + "step": 24212 + }, + { + "epoch": 0.6217219448550121, + "grad_norm": 0.79296875, + "learning_rate": 0.00014160314178870493, + "loss": 0.8419, + "step": 24213 + }, + { + "epoch": 0.6217476220509339, + "grad_norm": 0.81640625, + "learning_rate": 0.00014159908222437378, + "loss": 0.7544, + "step": 24214 + }, + { + "epoch": 0.6217732992468558, + "grad_norm": 0.7578125, + "learning_rate": 0.0001415950225771394, + "loss": 0.8246, + "step": 24215 + }, + { + "epoch": 0.6217989764427776, + "grad_norm": 0.71875, + "learning_rate": 0.00014159096284700994, + "loss": 0.9198, + "step": 24216 + }, + { + "epoch": 0.6218246536386994, + "grad_norm": 0.7734375, + "learning_rate": 0.00014158690303399344, + "loss": 0.8772, + "step": 24217 + }, + { + "epoch": 0.6218503308346212, + "grad_norm": 0.796875, + "learning_rate": 0.000141582843138098, + "loss": 0.9149, + "step": 24218 + }, + { + "epoch": 0.621876008030543, + "grad_norm": 0.77734375, + "learning_rate": 0.0001415787831593317, + "loss": 0.8626, + "step": 24219 + }, + { + "epoch": 0.6219016852264648, + "grad_norm": 0.875, + "learning_rate": 0.00014157472309770262, + "loss": 0.9073, + "step": 24220 + }, + { + "epoch": 0.6219273624223867, + "grad_norm": 0.77734375, + "learning_rate": 0.0001415706629532189, + "loss": 0.7848, + "step": 24221 + }, + { + "epoch": 0.6219530396183085, + "grad_norm": 0.73828125, + "learning_rate": 0.00014156660272588857, + "loss": 0.8437, + "step": 24222 + }, + { + "epoch": 0.6219787168142303, + "grad_norm": 0.75, + "learning_rate": 0.0001415625424157198, + "loss": 0.9148, + "step": 24223 + }, + { + "epoch": 0.6220043940101522, + "grad_norm": 0.8359375, + "learning_rate": 0.00014155848202272064, + "loss": 0.9787, + "step": 24224 + }, + { + "epoch": 0.6220300712060739, + "grad_norm": 0.76171875, + "learning_rate": 0.00014155442154689915, + "loss": 0.7762, + "step": 24225 + }, + { + "epoch": 0.6220557484019957, + "grad_norm": 0.84765625, + "learning_rate": 0.00014155036098826348, + "loss": 0.8013, + "step": 24226 + }, + { + "epoch": 0.6220814255979176, + "grad_norm": 0.765625, + "learning_rate": 0.00014154630034682165, + "loss": 0.895, + "step": 24227 + }, + { + "epoch": 0.6221071027938394, + "grad_norm": 0.7890625, + "learning_rate": 0.00014154223962258182, + "loss": 0.8735, + "step": 24228 + }, + { + "epoch": 0.6221327799897612, + "grad_norm": 0.75390625, + "learning_rate": 0.00014153817881555207, + "loss": 0.7943, + "step": 24229 + }, + { + "epoch": 0.6221584571856831, + "grad_norm": 0.83203125, + "learning_rate": 0.00014153411792574048, + "loss": 0.9595, + "step": 24230 + }, + { + "epoch": 0.6221841343816048, + "grad_norm": 0.9140625, + "learning_rate": 0.00014153005695315515, + "loss": 0.9026, + "step": 24231 + }, + { + "epoch": 0.6222098115775266, + "grad_norm": 0.7578125, + "learning_rate": 0.00014152599589780412, + "loss": 0.8, + "step": 24232 + }, + { + "epoch": 0.6222354887734485, + "grad_norm": 0.75, + "learning_rate": 0.00014152193475969557, + "loss": 0.8399, + "step": 24233 + }, + { + "epoch": 0.6222611659693703, + "grad_norm": 0.76953125, + "learning_rate": 0.00014151787353883756, + "loss": 0.9052, + "step": 24234 + }, + { + "epoch": 0.6222868431652921, + "grad_norm": 0.75390625, + "learning_rate": 0.00014151381223523817, + "loss": 0.855, + "step": 24235 + }, + { + "epoch": 0.622312520361214, + "grad_norm": 0.77734375, + "learning_rate": 0.0001415097508489055, + "loss": 0.9464, + "step": 24236 + }, + { + "epoch": 0.6223381975571358, + "grad_norm": 0.73828125, + "learning_rate": 0.00014150568937984766, + "loss": 0.7827, + "step": 24237 + }, + { + "epoch": 0.6223638747530575, + "grad_norm": 0.7734375, + "learning_rate": 0.0001415016278280727, + "loss": 0.8737, + "step": 24238 + }, + { + "epoch": 0.6223895519489794, + "grad_norm": 0.671875, + "learning_rate": 0.00014149756619358876, + "loss": 0.691, + "step": 24239 + }, + { + "epoch": 0.6224152291449012, + "grad_norm": 0.859375, + "learning_rate": 0.00014149350447640392, + "loss": 0.8493, + "step": 24240 + }, + { + "epoch": 0.622440906340823, + "grad_norm": 0.98046875, + "learning_rate": 0.00014148944267652625, + "loss": 0.8988, + "step": 24241 + }, + { + "epoch": 0.6224665835367449, + "grad_norm": 0.80078125, + "learning_rate": 0.0001414853807939639, + "loss": 0.8879, + "step": 24242 + }, + { + "epoch": 0.6224922607326667, + "grad_norm": 0.7578125, + "learning_rate": 0.00014148131882872495, + "loss": 0.8266, + "step": 24243 + }, + { + "epoch": 0.6225179379285886, + "grad_norm": 0.796875, + "learning_rate": 0.00014147725678081744, + "loss": 0.8632, + "step": 24244 + }, + { + "epoch": 0.6225436151245103, + "grad_norm": 0.81640625, + "learning_rate": 0.00014147319465024954, + "loss": 0.8252, + "step": 24245 + }, + { + "epoch": 0.6225692923204321, + "grad_norm": 0.796875, + "learning_rate": 0.00014146913243702925, + "loss": 0.8221, + "step": 24246 + }, + { + "epoch": 0.622594969516354, + "grad_norm": 0.765625, + "learning_rate": 0.00014146507014116478, + "loss": 0.836, + "step": 24247 + }, + { + "epoch": 0.6226206467122758, + "grad_norm": 0.74609375, + "learning_rate": 0.00014146100776266416, + "loss": 0.8048, + "step": 24248 + }, + { + "epoch": 0.6226463239081976, + "grad_norm": 0.76953125, + "learning_rate": 0.0001414569453015355, + "loss": 0.779, + "step": 24249 + }, + { + "epoch": 0.6226720011041195, + "grad_norm": 0.76953125, + "learning_rate": 0.00014145288275778687, + "loss": 0.9531, + "step": 24250 + }, + { + "epoch": 0.6226976783000412, + "grad_norm": 0.765625, + "learning_rate": 0.00014144882013142639, + "loss": 0.9299, + "step": 24251 + }, + { + "epoch": 0.622723355495963, + "grad_norm": 0.7578125, + "learning_rate": 0.00014144475742246216, + "loss": 0.8666, + "step": 24252 + }, + { + "epoch": 0.6227490326918849, + "grad_norm": 0.77734375, + "learning_rate": 0.0001414406946309023, + "loss": 0.8893, + "step": 24253 + }, + { + "epoch": 0.6227747098878067, + "grad_norm": 0.74609375, + "learning_rate": 0.00014143663175675487, + "loss": 0.8922, + "step": 24254 + }, + { + "epoch": 0.6228003870837285, + "grad_norm": 0.71484375, + "learning_rate": 0.00014143256880002795, + "loss": 0.9105, + "step": 24255 + }, + { + "epoch": 0.6228260642796504, + "grad_norm": 0.7265625, + "learning_rate": 0.0001414285057607297, + "loss": 0.8911, + "step": 24256 + }, + { + "epoch": 0.6228517414755722, + "grad_norm": 0.94140625, + "learning_rate": 0.00014142444263886812, + "loss": 0.9166, + "step": 24257 + }, + { + "epoch": 0.6228774186714939, + "grad_norm": 0.734375, + "learning_rate": 0.00014142037943445143, + "loss": 0.7381, + "step": 24258 + }, + { + "epoch": 0.6229030958674158, + "grad_norm": 0.703125, + "learning_rate": 0.0001414163161474876, + "loss": 0.7561, + "step": 24259 + }, + { + "epoch": 0.6229287730633376, + "grad_norm": 0.78515625, + "learning_rate": 0.00014141225277798487, + "loss": 0.8264, + "step": 24260 + }, + { + "epoch": 0.6229544502592594, + "grad_norm": 0.80078125, + "learning_rate": 0.00014140818932595122, + "loss": 0.9615, + "step": 24261 + }, + { + "epoch": 0.6229801274551813, + "grad_norm": 0.73046875, + "learning_rate": 0.0001414041257913948, + "loss": 0.9645, + "step": 24262 + }, + { + "epoch": 0.6230058046511031, + "grad_norm": 0.80078125, + "learning_rate": 0.0001414000621743237, + "loss": 0.8702, + "step": 24263 + }, + { + "epoch": 0.623031481847025, + "grad_norm": 0.76171875, + "learning_rate": 0.000141395998474746, + "loss": 0.9448, + "step": 24264 + }, + { + "epoch": 0.6230571590429467, + "grad_norm": 0.828125, + "learning_rate": 0.00014139193469266982, + "loss": 0.8084, + "step": 24265 + }, + { + "epoch": 0.6230828362388685, + "grad_norm": 0.8046875, + "learning_rate": 0.00014138787082810327, + "loss": 0.9772, + "step": 24266 + }, + { + "epoch": 0.6231085134347903, + "grad_norm": 0.70703125, + "learning_rate": 0.0001413838068810544, + "loss": 0.9775, + "step": 24267 + }, + { + "epoch": 0.6231341906307122, + "grad_norm": 0.859375, + "learning_rate": 0.00014137974285153136, + "loss": 1.0677, + "step": 24268 + }, + { + "epoch": 0.623159867826634, + "grad_norm": 0.875, + "learning_rate": 0.00014137567873954223, + "loss": 0.8087, + "step": 24269 + }, + { + "epoch": 0.6231855450225559, + "grad_norm": 0.73046875, + "learning_rate": 0.00014137161454509512, + "loss": 0.8105, + "step": 24270 + }, + { + "epoch": 0.6232112222184776, + "grad_norm": 0.765625, + "learning_rate": 0.0001413675502681981, + "loss": 0.8656, + "step": 24271 + }, + { + "epoch": 0.6232368994143994, + "grad_norm": 0.83984375, + "learning_rate": 0.00014136348590885927, + "loss": 0.9144, + "step": 24272 + }, + { + "epoch": 0.6232625766103213, + "grad_norm": 0.83984375, + "learning_rate": 0.0001413594214670868, + "loss": 0.7969, + "step": 24273 + }, + { + "epoch": 0.6232882538062431, + "grad_norm": 0.76953125, + "learning_rate": 0.0001413553569428887, + "loss": 0.944, + "step": 24274 + }, + { + "epoch": 0.6233139310021649, + "grad_norm": 0.74609375, + "learning_rate": 0.00014135129233627315, + "loss": 0.817, + "step": 24275 + }, + { + "epoch": 0.6233396081980868, + "grad_norm": 0.7734375, + "learning_rate": 0.00014134722764724817, + "loss": 0.782, + "step": 24276 + }, + { + "epoch": 0.6233652853940086, + "grad_norm": 0.7578125, + "learning_rate": 0.00014134316287582193, + "loss": 0.7299, + "step": 24277 + }, + { + "epoch": 0.6233909625899303, + "grad_norm": 0.875, + "learning_rate": 0.00014133909802200244, + "loss": 1.019, + "step": 24278 + }, + { + "epoch": 0.6234166397858522, + "grad_norm": 0.8203125, + "learning_rate": 0.0001413350330857979, + "loss": 0.8571, + "step": 24279 + }, + { + "epoch": 0.623442316981774, + "grad_norm": 0.796875, + "learning_rate": 0.0001413309680672164, + "loss": 0.9532, + "step": 24280 + }, + { + "epoch": 0.6234679941776958, + "grad_norm": 0.86328125, + "learning_rate": 0.000141326902966266, + "loss": 0.8578, + "step": 24281 + }, + { + "epoch": 0.6234936713736177, + "grad_norm": 0.81640625, + "learning_rate": 0.00014132283778295479, + "loss": 0.8752, + "step": 24282 + }, + { + "epoch": 0.6235193485695395, + "grad_norm": 0.83203125, + "learning_rate": 0.00014131877251729092, + "loss": 0.8811, + "step": 24283 + }, + { + "epoch": 0.6235450257654613, + "grad_norm": 0.87109375, + "learning_rate": 0.00014131470716928247, + "loss": 0.7433, + "step": 24284 + }, + { + "epoch": 0.6235707029613831, + "grad_norm": 0.7578125, + "learning_rate": 0.00014131064173893753, + "loss": 0.816, + "step": 24285 + }, + { + "epoch": 0.6235963801573049, + "grad_norm": 0.8359375, + "learning_rate": 0.0001413065762262642, + "loss": 0.9858, + "step": 24286 + }, + { + "epoch": 0.6236220573532267, + "grad_norm": 0.80078125, + "learning_rate": 0.00014130251063127064, + "loss": 0.7614, + "step": 24287 + }, + { + "epoch": 0.6236477345491486, + "grad_norm": 0.78125, + "learning_rate": 0.00014129844495396488, + "loss": 0.8698, + "step": 24288 + }, + { + "epoch": 0.6236734117450704, + "grad_norm": 0.7890625, + "learning_rate": 0.00014129437919435504, + "loss": 0.84, + "step": 24289 + }, + { + "epoch": 0.6236990889409922, + "grad_norm": 0.8125, + "learning_rate": 0.00014129031335244924, + "loss": 0.8216, + "step": 24290 + }, + { + "epoch": 0.623724766136914, + "grad_norm": 0.74609375, + "learning_rate": 0.00014128624742825555, + "loss": 0.8642, + "step": 24291 + }, + { + "epoch": 0.6237504433328358, + "grad_norm": 0.83203125, + "learning_rate": 0.00014128218142178215, + "loss": 0.8959, + "step": 24292 + }, + { + "epoch": 0.6237761205287576, + "grad_norm": 0.78125, + "learning_rate": 0.00014127811533303706, + "loss": 0.8511, + "step": 24293 + }, + { + "epoch": 0.6238017977246795, + "grad_norm": 0.84765625, + "learning_rate": 0.0001412740491620284, + "loss": 0.8815, + "step": 24294 + }, + { + "epoch": 0.6238274749206013, + "grad_norm": 0.8046875, + "learning_rate": 0.00014126998290876433, + "loss": 0.7994, + "step": 24295 + }, + { + "epoch": 0.6238531521165231, + "grad_norm": 0.8046875, + "learning_rate": 0.00014126591657325288, + "loss": 0.8149, + "step": 24296 + }, + { + "epoch": 0.623878829312445, + "grad_norm": 0.76171875, + "learning_rate": 0.0001412618501555022, + "loss": 0.9334, + "step": 24297 + }, + { + "epoch": 0.6239045065083667, + "grad_norm": 0.875, + "learning_rate": 0.00014125778365552038, + "loss": 0.8418, + "step": 24298 + }, + { + "epoch": 0.6239301837042885, + "grad_norm": 0.765625, + "learning_rate": 0.0001412537170733155, + "loss": 0.7598, + "step": 24299 + }, + { + "epoch": 0.6239558609002104, + "grad_norm": 0.83203125, + "learning_rate": 0.00014124965040889572, + "loss": 0.9901, + "step": 24300 + }, + { + "epoch": 0.6239815380961322, + "grad_norm": 0.796875, + "learning_rate": 0.00014124558366226912, + "loss": 0.8324, + "step": 24301 + }, + { + "epoch": 0.624007215292054, + "grad_norm": 0.79296875, + "learning_rate": 0.00014124151683344376, + "loss": 0.8303, + "step": 24302 + }, + { + "epoch": 0.6240328924879759, + "grad_norm": 0.77734375, + "learning_rate": 0.00014123744992242782, + "loss": 0.7856, + "step": 24303 + }, + { + "epoch": 0.6240585696838977, + "grad_norm": 0.78125, + "learning_rate": 0.00014123338292922934, + "loss": 0.9087, + "step": 24304 + }, + { + "epoch": 0.6240842468798194, + "grad_norm": 0.7578125, + "learning_rate": 0.00014122931585385646, + "loss": 0.9604, + "step": 24305 + }, + { + "epoch": 0.6241099240757413, + "grad_norm": 0.8203125, + "learning_rate": 0.0001412252486963173, + "loss": 0.8544, + "step": 24306 + }, + { + "epoch": 0.6241356012716631, + "grad_norm": 0.7265625, + "learning_rate": 0.0001412211814566199, + "loss": 0.791, + "step": 24307 + }, + { + "epoch": 0.624161278467585, + "grad_norm": 0.78515625, + "learning_rate": 0.00014121711413477242, + "loss": 0.9505, + "step": 24308 + }, + { + "epoch": 0.6241869556635068, + "grad_norm": 0.73046875, + "learning_rate": 0.00014121304673078298, + "loss": 0.8155, + "step": 24309 + }, + { + "epoch": 0.6242126328594286, + "grad_norm": 0.78125, + "learning_rate": 0.00014120897924465962, + "loss": 0.9, + "step": 24310 + }, + { + "epoch": 0.6242383100553504, + "grad_norm": 0.75, + "learning_rate": 0.00014120491167641052, + "loss": 0.8261, + "step": 24311 + }, + { + "epoch": 0.6242639872512722, + "grad_norm": 0.73046875, + "learning_rate": 0.00014120084402604376, + "loss": 0.8075, + "step": 24312 + }, + { + "epoch": 0.624289664447194, + "grad_norm": 0.7890625, + "learning_rate": 0.00014119677629356741, + "loss": 0.8057, + "step": 24313 + }, + { + "epoch": 0.6243153416431159, + "grad_norm": 0.8125, + "learning_rate": 0.00014119270847898964, + "loss": 0.9433, + "step": 24314 + }, + { + "epoch": 0.6243410188390377, + "grad_norm": 0.828125, + "learning_rate": 0.0001411886405823185, + "loss": 0.7985, + "step": 24315 + }, + { + "epoch": 0.6243666960349595, + "grad_norm": 0.83203125, + "learning_rate": 0.00014118457260356212, + "loss": 0.9378, + "step": 24316 + }, + { + "epoch": 0.6243923732308814, + "grad_norm": 0.859375, + "learning_rate": 0.00014118050454272861, + "loss": 0.7966, + "step": 24317 + }, + { + "epoch": 0.6244180504268031, + "grad_norm": 0.765625, + "learning_rate": 0.00014117643639982607, + "loss": 0.86, + "step": 24318 + }, + { + "epoch": 0.6244437276227249, + "grad_norm": 0.80078125, + "learning_rate": 0.00014117236817486262, + "loss": 0.8725, + "step": 24319 + }, + { + "epoch": 0.6244694048186468, + "grad_norm": 0.8359375, + "learning_rate": 0.00014116829986784638, + "loss": 0.9039, + "step": 24320 + }, + { + "epoch": 0.6244950820145686, + "grad_norm": 0.7734375, + "learning_rate": 0.00014116423147878538, + "loss": 0.9289, + "step": 24321 + }, + { + "epoch": 0.6245207592104904, + "grad_norm": 0.74609375, + "learning_rate": 0.00014116016300768784, + "loss": 0.8957, + "step": 24322 + }, + { + "epoch": 0.6245464364064123, + "grad_norm": 0.75390625, + "learning_rate": 0.0001411560944545618, + "loss": 0.8335, + "step": 24323 + }, + { + "epoch": 0.6245721136023341, + "grad_norm": 0.83203125, + "learning_rate": 0.00014115202581941536, + "loss": 0.8996, + "step": 24324 + }, + { + "epoch": 0.6245977907982558, + "grad_norm": 0.78515625, + "learning_rate": 0.00014114795710225666, + "loss": 0.8349, + "step": 24325 + }, + { + "epoch": 0.6246234679941777, + "grad_norm": 0.78515625, + "learning_rate": 0.0001411438883030938, + "loss": 0.8432, + "step": 24326 + }, + { + "epoch": 0.6246491451900995, + "grad_norm": 0.76953125, + "learning_rate": 0.00014113981942193488, + "loss": 0.7524, + "step": 24327 + }, + { + "epoch": 0.6246748223860213, + "grad_norm": 0.765625, + "learning_rate": 0.00014113575045878802, + "loss": 0.8496, + "step": 24328 + }, + { + "epoch": 0.6247004995819432, + "grad_norm": 0.890625, + "learning_rate": 0.00014113168141366135, + "loss": 0.9547, + "step": 24329 + }, + { + "epoch": 0.624726176777865, + "grad_norm": 0.78515625, + "learning_rate": 0.00014112761228656293, + "loss": 0.7956, + "step": 24330 + }, + { + "epoch": 0.6247518539737867, + "grad_norm": 0.75, + "learning_rate": 0.00014112354307750088, + "loss": 0.6684, + "step": 24331 + }, + { + "epoch": 0.6247775311697086, + "grad_norm": 0.80078125, + "learning_rate": 0.00014111947378648334, + "loss": 0.8303, + "step": 24332 + }, + { + "epoch": 0.6248032083656304, + "grad_norm": 0.84765625, + "learning_rate": 0.00014111540441351841, + "loss": 0.8181, + "step": 24333 + }, + { + "epoch": 0.6248288855615522, + "grad_norm": 0.84765625, + "learning_rate": 0.00014111133495861417, + "loss": 1.0613, + "step": 24334 + }, + { + "epoch": 0.6248545627574741, + "grad_norm": 0.7890625, + "learning_rate": 0.00014110726542177878, + "loss": 0.9302, + "step": 24335 + }, + { + "epoch": 0.6248802399533959, + "grad_norm": 0.76953125, + "learning_rate": 0.00014110319580302032, + "loss": 0.88, + "step": 24336 + }, + { + "epoch": 0.6249059171493178, + "grad_norm": 0.8203125, + "learning_rate": 0.0001410991261023469, + "loss": 0.9552, + "step": 24337 + }, + { + "epoch": 0.6249315943452395, + "grad_norm": 0.77734375, + "learning_rate": 0.00014109505631976663, + "loss": 0.896, + "step": 24338 + }, + { + "epoch": 0.6249572715411613, + "grad_norm": 0.78515625, + "learning_rate": 0.00014109098645528762, + "loss": 0.8813, + "step": 24339 + }, + { + "epoch": 0.6249829487370832, + "grad_norm": 0.73828125, + "learning_rate": 0.00014108691650891798, + "loss": 0.9414, + "step": 24340 + }, + { + "epoch": 0.625008625933005, + "grad_norm": 0.8359375, + "learning_rate": 0.00014108284648066586, + "loss": 0.9295, + "step": 24341 + }, + { + "epoch": 0.6250343031289268, + "grad_norm": 0.82421875, + "learning_rate": 0.00014107877637053933, + "loss": 0.8952, + "step": 24342 + }, + { + "epoch": 0.6250599803248487, + "grad_norm": 0.71875, + "learning_rate": 0.00014107470617854648, + "loss": 0.9043, + "step": 24343 + }, + { + "epoch": 0.6250856575207705, + "grad_norm": 0.7734375, + "learning_rate": 0.00014107063590469547, + "loss": 0.7999, + "step": 24344 + }, + { + "epoch": 0.6251113347166922, + "grad_norm": 0.73046875, + "learning_rate": 0.00014106656554899438, + "loss": 0.8254, + "step": 24345 + }, + { + "epoch": 0.6251370119126141, + "grad_norm": 0.703125, + "learning_rate": 0.00014106249511145133, + "loss": 0.8416, + "step": 24346 + }, + { + "epoch": 0.6251626891085359, + "grad_norm": 0.859375, + "learning_rate": 0.00014105842459207442, + "loss": 0.9014, + "step": 24347 + }, + { + "epoch": 0.6251883663044577, + "grad_norm": 0.77734375, + "learning_rate": 0.00014105435399087185, + "loss": 0.8443, + "step": 24348 + }, + { + "epoch": 0.6252140435003796, + "grad_norm": 0.84765625, + "learning_rate": 0.00014105028330785164, + "loss": 0.8565, + "step": 24349 + }, + { + "epoch": 0.6252397206963014, + "grad_norm": 0.7265625, + "learning_rate": 0.00014104621254302187, + "loss": 0.9235, + "step": 24350 + }, + { + "epoch": 0.6252653978922231, + "grad_norm": 0.7421875, + "learning_rate": 0.00014104214169639073, + "loss": 0.8314, + "step": 24351 + }, + { + "epoch": 0.625291075088145, + "grad_norm": 0.7890625, + "learning_rate": 0.00014103807076796634, + "loss": 0.7892, + "step": 24352 + }, + { + "epoch": 0.6253167522840668, + "grad_norm": 0.73046875, + "learning_rate": 0.00014103399975775674, + "loss": 0.9154, + "step": 24353 + }, + { + "epoch": 0.6253424294799886, + "grad_norm": 0.81640625, + "learning_rate": 0.0001410299286657701, + "loss": 0.8248, + "step": 24354 + }, + { + "epoch": 0.6253681066759105, + "grad_norm": 0.796875, + "learning_rate": 0.00014102585749201454, + "loss": 0.8294, + "step": 24355 + }, + { + "epoch": 0.6253937838718323, + "grad_norm": 0.78515625, + "learning_rate": 0.00014102178623649815, + "loss": 0.8897, + "step": 24356 + }, + { + "epoch": 0.6254194610677541, + "grad_norm": 0.82421875, + "learning_rate": 0.00014101771489922902, + "loss": 0.8767, + "step": 24357 + }, + { + "epoch": 0.6254451382636759, + "grad_norm": 0.7890625, + "learning_rate": 0.0001410136434802153, + "loss": 0.8511, + "step": 24358 + }, + { + "epoch": 0.6254708154595977, + "grad_norm": 0.8046875, + "learning_rate": 0.00014100957197946508, + "loss": 0.9645, + "step": 24359 + }, + { + "epoch": 0.6254964926555195, + "grad_norm": 0.76953125, + "learning_rate": 0.00014100550039698652, + "loss": 0.9936, + "step": 24360 + }, + { + "epoch": 0.6255221698514414, + "grad_norm": 0.79296875, + "learning_rate": 0.00014100142873278766, + "loss": 0.9003, + "step": 24361 + }, + { + "epoch": 0.6255478470473632, + "grad_norm": 0.73828125, + "learning_rate": 0.0001409973569868767, + "loss": 0.7884, + "step": 24362 + }, + { + "epoch": 0.625573524243285, + "grad_norm": 0.71484375, + "learning_rate": 0.00014099328515926167, + "loss": 0.7705, + "step": 24363 + }, + { + "epoch": 0.6255992014392069, + "grad_norm": 0.73046875, + "learning_rate": 0.00014098921324995077, + "loss": 0.78, + "step": 24364 + }, + { + "epoch": 0.6256248786351286, + "grad_norm": 0.80078125, + "learning_rate": 0.00014098514125895204, + "loss": 0.8605, + "step": 24365 + }, + { + "epoch": 0.6256505558310504, + "grad_norm": 0.7890625, + "learning_rate": 0.0001409810691862736, + "loss": 0.9364, + "step": 24366 + }, + { + "epoch": 0.6256762330269723, + "grad_norm": 0.80859375, + "learning_rate": 0.00014097699703192364, + "loss": 0.8436, + "step": 24367 + }, + { + "epoch": 0.6257019102228941, + "grad_norm": 0.84765625, + "learning_rate": 0.0001409729247959102, + "loss": 0.8707, + "step": 24368 + }, + { + "epoch": 0.625727587418816, + "grad_norm": 0.84375, + "learning_rate": 0.00014096885247824142, + "loss": 0.7894, + "step": 24369 + }, + { + "epoch": 0.6257532646147378, + "grad_norm": 0.76171875, + "learning_rate": 0.00014096478007892546, + "loss": 0.8423, + "step": 24370 + }, + { + "epoch": 0.6257789418106595, + "grad_norm": 0.6953125, + "learning_rate": 0.00014096070759797032, + "loss": 0.7757, + "step": 24371 + }, + { + "epoch": 0.6258046190065814, + "grad_norm": 0.79296875, + "learning_rate": 0.00014095663503538425, + "loss": 0.9246, + "step": 24372 + }, + { + "epoch": 0.6258302962025032, + "grad_norm": 0.734375, + "learning_rate": 0.00014095256239117528, + "loss": 0.7786, + "step": 24373 + }, + { + "epoch": 0.625855973398425, + "grad_norm": 0.84765625, + "learning_rate": 0.00014094848966535154, + "loss": 0.8711, + "step": 24374 + }, + { + "epoch": 0.6258816505943469, + "grad_norm": 0.73046875, + "learning_rate": 0.00014094441685792123, + "loss": 0.731, + "step": 24375 + }, + { + "epoch": 0.6259073277902687, + "grad_norm": 0.88671875, + "learning_rate": 0.00014094034396889234, + "loss": 0.9169, + "step": 24376 + }, + { + "epoch": 0.6259330049861905, + "grad_norm": 0.71875, + "learning_rate": 0.000140936270998273, + "loss": 0.8921, + "step": 24377 + }, + { + "epoch": 0.6259586821821123, + "grad_norm": 0.87109375, + "learning_rate": 0.00014093219794607145, + "loss": 0.881, + "step": 24378 + }, + { + "epoch": 0.6259843593780341, + "grad_norm": 0.80859375, + "learning_rate": 0.00014092812481229566, + "loss": 0.8627, + "step": 24379 + }, + { + "epoch": 0.6260100365739559, + "grad_norm": 0.82421875, + "learning_rate": 0.00014092405159695388, + "loss": 1.0435, + "step": 24380 + }, + { + "epoch": 0.6260357137698778, + "grad_norm": 0.8359375, + "learning_rate": 0.00014091997830005415, + "loss": 0.8762, + "step": 24381 + }, + { + "epoch": 0.6260613909657996, + "grad_norm": 0.8203125, + "learning_rate": 0.00014091590492160456, + "loss": 0.9776, + "step": 24382 + }, + { + "epoch": 0.6260870681617214, + "grad_norm": 0.74609375, + "learning_rate": 0.00014091183146161328, + "loss": 0.8507, + "step": 24383 + }, + { + "epoch": 0.6261127453576432, + "grad_norm": 0.75, + "learning_rate": 0.0001409077579200884, + "loss": 0.7035, + "step": 24384 + }, + { + "epoch": 0.626138422553565, + "grad_norm": 0.75, + "learning_rate": 0.0001409036842970381, + "loss": 0.8573, + "step": 24385 + }, + { + "epoch": 0.6261640997494868, + "grad_norm": 0.76171875, + "learning_rate": 0.00014089961059247043, + "loss": 0.8451, + "step": 24386 + }, + { + "epoch": 0.6261897769454087, + "grad_norm": 0.796875, + "learning_rate": 0.0001408955368063935, + "loss": 0.8771, + "step": 24387 + }, + { + "epoch": 0.6262154541413305, + "grad_norm": 0.76953125, + "learning_rate": 0.0001408914629388155, + "loss": 0.692, + "step": 24388 + }, + { + "epoch": 0.6262411313372523, + "grad_norm": 0.7734375, + "learning_rate": 0.00014088738898974452, + "loss": 0.8508, + "step": 24389 + }, + { + "epoch": 0.6262668085331742, + "grad_norm": 0.78515625, + "learning_rate": 0.0001408833149591886, + "loss": 0.9726, + "step": 24390 + }, + { + "epoch": 0.6262924857290959, + "grad_norm": 0.7265625, + "learning_rate": 0.00014087924084715597, + "loss": 0.7891, + "step": 24391 + }, + { + "epoch": 0.6263181629250177, + "grad_norm": 0.72265625, + "learning_rate": 0.0001408751666536547, + "loss": 0.8388, + "step": 24392 + }, + { + "epoch": 0.6263438401209396, + "grad_norm": 0.8203125, + "learning_rate": 0.00014087109237869293, + "loss": 0.8717, + "step": 24393 + }, + { + "epoch": 0.6263695173168614, + "grad_norm": 0.875, + "learning_rate": 0.00014086701802227877, + "loss": 0.9285, + "step": 24394 + }, + { + "epoch": 0.6263951945127832, + "grad_norm": 0.6953125, + "learning_rate": 0.00014086294358442032, + "loss": 0.8531, + "step": 24395 + }, + { + "epoch": 0.6264208717087051, + "grad_norm": 0.77734375, + "learning_rate": 0.00014085886906512572, + "loss": 0.8943, + "step": 24396 + }, + { + "epoch": 0.6264465489046269, + "grad_norm": 0.76953125, + "learning_rate": 0.00014085479446440305, + "loss": 0.851, + "step": 24397 + }, + { + "epoch": 0.6264722261005486, + "grad_norm": 0.7734375, + "learning_rate": 0.0001408507197822605, + "loss": 0.8154, + "step": 24398 + }, + { + "epoch": 0.6264979032964705, + "grad_norm": 0.7109375, + "learning_rate": 0.00014084664501870614, + "loss": 0.7864, + "step": 24399 + }, + { + "epoch": 0.6265235804923923, + "grad_norm": 0.7421875, + "learning_rate": 0.00014084257017374813, + "loss": 0.7032, + "step": 24400 + }, + { + "epoch": 0.6265492576883142, + "grad_norm": 0.765625, + "learning_rate": 0.00014083849524739456, + "loss": 0.7901, + "step": 24401 + }, + { + "epoch": 0.626574934884236, + "grad_norm": 0.78125, + "learning_rate": 0.00014083442023965357, + "loss": 0.937, + "step": 24402 + }, + { + "epoch": 0.6266006120801578, + "grad_norm": 0.77734375, + "learning_rate": 0.00014083034515053323, + "loss": 0.9043, + "step": 24403 + }, + { + "epoch": 0.6266262892760796, + "grad_norm": 0.77734375, + "learning_rate": 0.00014082626998004172, + "loss": 0.7283, + "step": 24404 + }, + { + "epoch": 0.6266519664720014, + "grad_norm": 1.2109375, + "learning_rate": 0.00014082219472818714, + "loss": 0.8051, + "step": 24405 + }, + { + "epoch": 0.6266776436679232, + "grad_norm": 0.80078125, + "learning_rate": 0.00014081811939497757, + "loss": 0.792, + "step": 24406 + }, + { + "epoch": 0.6267033208638451, + "grad_norm": 0.79296875, + "learning_rate": 0.00014081404398042123, + "loss": 0.9146, + "step": 24407 + }, + { + "epoch": 0.6267289980597669, + "grad_norm": 0.640625, + "learning_rate": 0.00014080996848452618, + "loss": 0.738, + "step": 24408 + }, + { + "epoch": 0.6267546752556887, + "grad_norm": 0.75390625, + "learning_rate": 0.00014080589290730057, + "loss": 0.7455, + "step": 24409 + }, + { + "epoch": 0.6267803524516106, + "grad_norm": 0.78515625, + "learning_rate": 0.00014080181724875247, + "loss": 0.8952, + "step": 24410 + }, + { + "epoch": 0.6268060296475323, + "grad_norm": 0.78125, + "learning_rate": 0.00014079774150889003, + "loss": 0.8441, + "step": 24411 + }, + { + "epoch": 0.6268317068434541, + "grad_norm": 0.78125, + "learning_rate": 0.0001407936656877214, + "loss": 0.7441, + "step": 24412 + }, + { + "epoch": 0.626857384039376, + "grad_norm": 0.73828125, + "learning_rate": 0.00014078958978525465, + "loss": 0.9081, + "step": 24413 + }, + { + "epoch": 0.6268830612352978, + "grad_norm": 0.74609375, + "learning_rate": 0.00014078551380149797, + "loss": 0.8055, + "step": 24414 + }, + { + "epoch": 0.6269087384312196, + "grad_norm": 0.74609375, + "learning_rate": 0.00014078143773645942, + "loss": 0.8299, + "step": 24415 + }, + { + "epoch": 0.6269344156271415, + "grad_norm": 0.734375, + "learning_rate": 0.00014077736159014714, + "loss": 0.7619, + "step": 24416 + }, + { + "epoch": 0.6269600928230633, + "grad_norm": 0.7734375, + "learning_rate": 0.00014077328536256926, + "loss": 0.7798, + "step": 24417 + }, + { + "epoch": 0.626985770018985, + "grad_norm": 0.7734375, + "learning_rate": 0.00014076920905373394, + "loss": 0.8642, + "step": 24418 + }, + { + "epoch": 0.6270114472149069, + "grad_norm": 0.76953125, + "learning_rate": 0.00014076513266364922, + "loss": 0.8838, + "step": 24419 + }, + { + "epoch": 0.6270371244108287, + "grad_norm": 0.8046875, + "learning_rate": 0.00014076105619232333, + "loss": 0.9241, + "step": 24420 + }, + { + "epoch": 0.6270628016067505, + "grad_norm": 0.76953125, + "learning_rate": 0.0001407569796397643, + "loss": 0.8855, + "step": 24421 + }, + { + "epoch": 0.6270884788026724, + "grad_norm": 0.81640625, + "learning_rate": 0.0001407529030059803, + "loss": 0.8016, + "step": 24422 + }, + { + "epoch": 0.6271141559985942, + "grad_norm": 0.734375, + "learning_rate": 0.00014074882629097943, + "loss": 0.8214, + "step": 24423 + }, + { + "epoch": 0.6271398331945159, + "grad_norm": 0.7578125, + "learning_rate": 0.00014074474949476984, + "loss": 0.7402, + "step": 24424 + }, + { + "epoch": 0.6271655103904378, + "grad_norm": 0.8359375, + "learning_rate": 0.00014074067261735966, + "loss": 0.8628, + "step": 24425 + }, + { + "epoch": 0.6271911875863596, + "grad_norm": 0.77734375, + "learning_rate": 0.000140736595658757, + "loss": 0.7869, + "step": 24426 + }, + { + "epoch": 0.6272168647822814, + "grad_norm": 0.75, + "learning_rate": 0.00014073251861896997, + "loss": 0.9645, + "step": 24427 + }, + { + "epoch": 0.6272425419782033, + "grad_norm": 0.75390625, + "learning_rate": 0.0001407284414980067, + "loss": 0.8106, + "step": 24428 + }, + { + "epoch": 0.6272682191741251, + "grad_norm": 0.76953125, + "learning_rate": 0.00014072436429587534, + "loss": 0.9283, + "step": 24429 + }, + { + "epoch": 0.627293896370047, + "grad_norm": 0.76171875, + "learning_rate": 0.000140720287012584, + "loss": 0.8579, + "step": 24430 + }, + { + "epoch": 0.6273195735659687, + "grad_norm": 0.75390625, + "learning_rate": 0.00014071620964814082, + "loss": 0.8949, + "step": 24431 + }, + { + "epoch": 0.6273452507618905, + "grad_norm": 0.78125, + "learning_rate": 0.0001407121322025539, + "loss": 1.0064, + "step": 24432 + }, + { + "epoch": 0.6273709279578124, + "grad_norm": 0.79296875, + "learning_rate": 0.00014070805467583135, + "loss": 0.8225, + "step": 24433 + }, + { + "epoch": 0.6273966051537342, + "grad_norm": 0.78125, + "learning_rate": 0.00014070397706798134, + "loss": 0.9745, + "step": 24434 + }, + { + "epoch": 0.627422282349656, + "grad_norm": 0.8359375, + "learning_rate": 0.000140699899379012, + "loss": 0.9223, + "step": 24435 + }, + { + "epoch": 0.6274479595455779, + "grad_norm": 0.7890625, + "learning_rate": 0.00014069582160893142, + "loss": 0.787, + "step": 24436 + }, + { + "epoch": 0.6274736367414997, + "grad_norm": 0.7109375, + "learning_rate": 0.00014069174375774777, + "loss": 0.9523, + "step": 24437 + }, + { + "epoch": 0.6274993139374214, + "grad_norm": 0.80078125, + "learning_rate": 0.0001406876658254691, + "loss": 0.9326, + "step": 24438 + }, + { + "epoch": 0.6275249911333433, + "grad_norm": 0.78125, + "learning_rate": 0.00014068358781210363, + "loss": 0.8521, + "step": 24439 + }, + { + "epoch": 0.6275506683292651, + "grad_norm": 0.80859375, + "learning_rate": 0.00014067950971765944, + "loss": 0.799, + "step": 24440 + }, + { + "epoch": 0.6275763455251869, + "grad_norm": 0.7890625, + "learning_rate": 0.00014067543154214467, + "loss": 0.8527, + "step": 24441 + }, + { + "epoch": 0.6276020227211088, + "grad_norm": 0.8203125, + "learning_rate": 0.00014067135328556742, + "loss": 0.7793, + "step": 24442 + }, + { + "epoch": 0.6276276999170306, + "grad_norm": 0.78125, + "learning_rate": 0.0001406672749479358, + "loss": 0.9426, + "step": 24443 + }, + { + "epoch": 0.6276533771129523, + "grad_norm": 0.81640625, + "learning_rate": 0.00014066319652925803, + "loss": 1.0014, + "step": 24444 + }, + { + "epoch": 0.6276790543088742, + "grad_norm": 0.77734375, + "learning_rate": 0.00014065911802954218, + "loss": 0.7542, + "step": 24445 + }, + { + "epoch": 0.627704731504796, + "grad_norm": 0.796875, + "learning_rate": 0.00014065503944879634, + "loss": 0.7471, + "step": 24446 + }, + { + "epoch": 0.6277304087007178, + "grad_norm": 0.7734375, + "learning_rate": 0.00014065096078702872, + "loss": 0.7923, + "step": 24447 + }, + { + "epoch": 0.6277560858966397, + "grad_norm": 0.8046875, + "learning_rate": 0.0001406468820442474, + "loss": 0.9969, + "step": 24448 + }, + { + "epoch": 0.6277817630925615, + "grad_norm": 0.82421875, + "learning_rate": 0.0001406428032204605, + "loss": 0.794, + "step": 24449 + }, + { + "epoch": 0.6278074402884833, + "grad_norm": 0.79296875, + "learning_rate": 0.00014063872431567618, + "loss": 0.7874, + "step": 24450 + }, + { + "epoch": 0.6278331174844051, + "grad_norm": 0.81640625, + "learning_rate": 0.00014063464532990255, + "loss": 0.7871, + "step": 24451 + }, + { + "epoch": 0.6278587946803269, + "grad_norm": 0.88671875, + "learning_rate": 0.00014063056626314771, + "loss": 0.7434, + "step": 24452 + }, + { + "epoch": 0.6278844718762487, + "grad_norm": 0.8671875, + "learning_rate": 0.0001406264871154199, + "loss": 1.0572, + "step": 24453 + }, + { + "epoch": 0.6279101490721706, + "grad_norm": 0.7265625, + "learning_rate": 0.0001406224078867271, + "loss": 0.7982, + "step": 24454 + }, + { + "epoch": 0.6279358262680924, + "grad_norm": 0.82421875, + "learning_rate": 0.00014061832857707753, + "loss": 1.0864, + "step": 24455 + }, + { + "epoch": 0.6279615034640142, + "grad_norm": 0.73828125, + "learning_rate": 0.0001406142491864793, + "loss": 0.7428, + "step": 24456 + }, + { + "epoch": 0.6279871806599361, + "grad_norm": 0.734375, + "learning_rate": 0.00014061016971494053, + "loss": 0.9193, + "step": 24457 + }, + { + "epoch": 0.6280128578558578, + "grad_norm": 0.75, + "learning_rate": 0.00014060609016246938, + "loss": 0.873, + "step": 24458 + }, + { + "epoch": 0.6280385350517796, + "grad_norm": 0.84765625, + "learning_rate": 0.00014060201052907393, + "loss": 0.8576, + "step": 24459 + }, + { + "epoch": 0.6280642122477015, + "grad_norm": 0.859375, + "learning_rate": 0.00014059793081476238, + "loss": 0.9509, + "step": 24460 + }, + { + "epoch": 0.6280898894436233, + "grad_norm": 0.875, + "learning_rate": 0.00014059385101954281, + "loss": 0.9432, + "step": 24461 + }, + { + "epoch": 0.6281155666395452, + "grad_norm": 0.7421875, + "learning_rate": 0.00014058977114342336, + "loss": 0.8341, + "step": 24462 + }, + { + "epoch": 0.628141243835467, + "grad_norm": 0.8359375, + "learning_rate": 0.00014058569118641215, + "loss": 0.9027, + "step": 24463 + }, + { + "epoch": 0.6281669210313887, + "grad_norm": 0.82421875, + "learning_rate": 0.00014058161114851732, + "loss": 0.8031, + "step": 24464 + }, + { + "epoch": 0.6281925982273106, + "grad_norm": 0.75390625, + "learning_rate": 0.00014057753102974702, + "loss": 0.7975, + "step": 24465 + }, + { + "epoch": 0.6282182754232324, + "grad_norm": 0.78125, + "learning_rate": 0.00014057345083010937, + "loss": 0.9196, + "step": 24466 + }, + { + "epoch": 0.6282439526191542, + "grad_norm": 0.83203125, + "learning_rate": 0.00014056937054961248, + "loss": 1.012, + "step": 24467 + }, + { + "epoch": 0.6282696298150761, + "grad_norm": 0.80078125, + "learning_rate": 0.00014056529018826452, + "loss": 0.9244, + "step": 24468 + }, + { + "epoch": 0.6282953070109979, + "grad_norm": 0.90625, + "learning_rate": 0.00014056120974607358, + "loss": 0.9083, + "step": 24469 + }, + { + "epoch": 0.6283209842069197, + "grad_norm": 0.7734375, + "learning_rate": 0.00014055712922304782, + "loss": 0.8945, + "step": 24470 + }, + { + "epoch": 0.6283466614028415, + "grad_norm": 0.6484375, + "learning_rate": 0.00014055304861919536, + "loss": 0.5537, + "step": 24471 + }, + { + "epoch": 0.6283723385987633, + "grad_norm": 0.796875, + "learning_rate": 0.00014054896793452436, + "loss": 0.8216, + "step": 24472 + }, + { + "epoch": 0.6283980157946851, + "grad_norm": 0.8203125, + "learning_rate": 0.00014054488716904292, + "loss": 0.7972, + "step": 24473 + }, + { + "epoch": 0.628423692990607, + "grad_norm": 0.75390625, + "learning_rate": 0.00014054080632275916, + "loss": 0.8212, + "step": 24474 + }, + { + "epoch": 0.6284493701865288, + "grad_norm": 0.79296875, + "learning_rate": 0.00014053672539568126, + "loss": 0.8929, + "step": 24475 + }, + { + "epoch": 0.6284750473824506, + "grad_norm": 0.78125, + "learning_rate": 0.00014053264438781732, + "loss": 0.7911, + "step": 24476 + }, + { + "epoch": 0.6285007245783725, + "grad_norm": 0.76953125, + "learning_rate": 0.0001405285632991755, + "loss": 0.828, + "step": 24477 + }, + { + "epoch": 0.6285264017742942, + "grad_norm": 0.7265625, + "learning_rate": 0.00014052448212976385, + "loss": 0.71, + "step": 24478 + }, + { + "epoch": 0.628552078970216, + "grad_norm": 0.78125, + "learning_rate": 0.0001405204008795906, + "loss": 0.9157, + "step": 24479 + }, + { + "epoch": 0.6285777561661379, + "grad_norm": 0.703125, + "learning_rate": 0.00014051631954866388, + "loss": 0.7946, + "step": 24480 + }, + { + "epoch": 0.6286034333620597, + "grad_norm": 0.83984375, + "learning_rate": 0.0001405122381369918, + "loss": 0.867, + "step": 24481 + }, + { + "epoch": 0.6286291105579815, + "grad_norm": 1.1875, + "learning_rate": 0.00014050815664458247, + "loss": 0.8523, + "step": 24482 + }, + { + "epoch": 0.6286547877539034, + "grad_norm": 0.73828125, + "learning_rate": 0.00014050407507144402, + "loss": 0.9618, + "step": 24483 + }, + { + "epoch": 0.6286804649498251, + "grad_norm": 0.78515625, + "learning_rate": 0.00014049999341758464, + "loss": 0.8374, + "step": 24484 + }, + { + "epoch": 0.6287061421457469, + "grad_norm": 0.79296875, + "learning_rate": 0.00014049591168301242, + "loss": 0.9075, + "step": 24485 + }, + { + "epoch": 0.6287318193416688, + "grad_norm": 0.7578125, + "learning_rate": 0.00014049182986773548, + "loss": 0.946, + "step": 24486 + }, + { + "epoch": 0.6287574965375906, + "grad_norm": 0.75390625, + "learning_rate": 0.00014048774797176204, + "loss": 0.8623, + "step": 24487 + }, + { + "epoch": 0.6287831737335124, + "grad_norm": 0.76171875, + "learning_rate": 0.0001404836659951001, + "loss": 0.8637, + "step": 24488 + }, + { + "epoch": 0.6288088509294343, + "grad_norm": 0.84375, + "learning_rate": 0.00014047958393775795, + "loss": 0.9578, + "step": 24489 + }, + { + "epoch": 0.6288345281253561, + "grad_norm": 0.7734375, + "learning_rate": 0.00014047550179974362, + "loss": 0.78, + "step": 24490 + }, + { + "epoch": 0.6288602053212778, + "grad_norm": 0.76953125, + "learning_rate": 0.00014047141958106524, + "loss": 0.859, + "step": 24491 + }, + { + "epoch": 0.6288858825171997, + "grad_norm": 0.76953125, + "learning_rate": 0.00014046733728173097, + "loss": 0.8154, + "step": 24492 + }, + { + "epoch": 0.6289115597131215, + "grad_norm": 0.8125, + "learning_rate": 0.000140463254901749, + "loss": 0.8412, + "step": 24493 + }, + { + "epoch": 0.6289372369090434, + "grad_norm": 0.9765625, + "learning_rate": 0.0001404591724411274, + "loss": 1.0551, + "step": 24494 + }, + { + "epoch": 0.6289629141049652, + "grad_norm": 0.72265625, + "learning_rate": 0.00014045508989987432, + "loss": 0.8524, + "step": 24495 + }, + { + "epoch": 0.628988591300887, + "grad_norm": 0.8125, + "learning_rate": 0.00014045100727799788, + "loss": 0.7497, + "step": 24496 + }, + { + "epoch": 0.6290142684968089, + "grad_norm": 0.80859375, + "learning_rate": 0.0001404469245755063, + "loss": 0.9722, + "step": 24497 + }, + { + "epoch": 0.6290399456927306, + "grad_norm": 0.7578125, + "learning_rate": 0.0001404428417924076, + "loss": 0.8619, + "step": 24498 + }, + { + "epoch": 0.6290656228886524, + "grad_norm": 0.80078125, + "learning_rate": 0.00014043875892870994, + "loss": 0.9146, + "step": 24499 + }, + { + "epoch": 0.6290913000845743, + "grad_norm": 0.76953125, + "learning_rate": 0.00014043467598442156, + "loss": 0.918, + "step": 24500 + }, + { + "epoch": 0.6291169772804961, + "grad_norm": 0.69921875, + "learning_rate": 0.00014043059295955045, + "loss": 0.7359, + "step": 24501 + }, + { + "epoch": 0.6291426544764179, + "grad_norm": 0.7421875, + "learning_rate": 0.00014042650985410487, + "loss": 0.7632, + "step": 24502 + }, + { + "epoch": 0.6291683316723398, + "grad_norm": 0.71484375, + "learning_rate": 0.0001404224266680929, + "loss": 0.7431, + "step": 24503 + }, + { + "epoch": 0.6291940088682615, + "grad_norm": 0.84375, + "learning_rate": 0.00014041834340152267, + "loss": 0.897, + "step": 24504 + }, + { + "epoch": 0.6292196860641833, + "grad_norm": 0.7734375, + "learning_rate": 0.00014041426005440236, + "loss": 0.87, + "step": 24505 + }, + { + "epoch": 0.6292453632601052, + "grad_norm": 0.75, + "learning_rate": 0.00014041017662674008, + "loss": 0.7628, + "step": 24506 + }, + { + "epoch": 0.629271040456027, + "grad_norm": 0.94140625, + "learning_rate": 0.00014040609311854395, + "loss": 0.914, + "step": 24507 + }, + { + "epoch": 0.6292967176519488, + "grad_norm": 0.8203125, + "learning_rate": 0.0001404020095298221, + "loss": 0.755, + "step": 24508 + }, + { + "epoch": 0.6293223948478707, + "grad_norm": 0.74609375, + "learning_rate": 0.0001403979258605827, + "loss": 0.9047, + "step": 24509 + }, + { + "epoch": 0.6293480720437925, + "grad_norm": 0.73046875, + "learning_rate": 0.00014039384211083394, + "loss": 0.8247, + "step": 24510 + }, + { + "epoch": 0.6293737492397142, + "grad_norm": 0.82421875, + "learning_rate": 0.00014038975828058387, + "loss": 0.8841, + "step": 24511 + }, + { + "epoch": 0.6293994264356361, + "grad_norm": 0.76953125, + "learning_rate": 0.00014038567436984063, + "loss": 0.9118, + "step": 24512 + }, + { + "epoch": 0.6294251036315579, + "grad_norm": 0.734375, + "learning_rate": 0.00014038159037861245, + "loss": 0.8733, + "step": 24513 + }, + { + "epoch": 0.6294507808274797, + "grad_norm": 0.7265625, + "learning_rate": 0.00014037750630690738, + "loss": 0.8245, + "step": 24514 + }, + { + "epoch": 0.6294764580234016, + "grad_norm": 0.8125, + "learning_rate": 0.00014037342215473357, + "loss": 0.9181, + "step": 24515 + }, + { + "epoch": 0.6295021352193234, + "grad_norm": 0.8203125, + "learning_rate": 0.00014036933792209918, + "loss": 0.9199, + "step": 24516 + }, + { + "epoch": 0.6295278124152452, + "grad_norm": 0.75, + "learning_rate": 0.00014036525360901236, + "loss": 0.9168, + "step": 24517 + }, + { + "epoch": 0.629553489611167, + "grad_norm": 0.7421875, + "learning_rate": 0.0001403611692154812, + "loss": 0.8676, + "step": 24518 + }, + { + "epoch": 0.6295791668070888, + "grad_norm": 0.8203125, + "learning_rate": 0.00014035708474151393, + "loss": 0.8514, + "step": 24519 + }, + { + "epoch": 0.6296048440030106, + "grad_norm": 0.8125, + "learning_rate": 0.00014035300018711862, + "loss": 0.8184, + "step": 24520 + }, + { + "epoch": 0.6296305211989325, + "grad_norm": 0.828125, + "learning_rate": 0.0001403489155523034, + "loss": 0.9168, + "step": 24521 + }, + { + "epoch": 0.6296561983948543, + "grad_norm": 0.74609375, + "learning_rate": 0.00014034483083707645, + "loss": 0.6503, + "step": 24522 + }, + { + "epoch": 0.6296818755907762, + "grad_norm": 0.77734375, + "learning_rate": 0.0001403407460414459, + "loss": 0.9104, + "step": 24523 + }, + { + "epoch": 0.6297075527866979, + "grad_norm": 0.79296875, + "learning_rate": 0.0001403366611654199, + "loss": 0.9351, + "step": 24524 + }, + { + "epoch": 0.6297332299826197, + "grad_norm": 0.82421875, + "learning_rate": 0.00014033257620900655, + "loss": 0.8123, + "step": 24525 + }, + { + "epoch": 0.6297589071785415, + "grad_norm": 0.7109375, + "learning_rate": 0.00014032849117221404, + "loss": 0.7982, + "step": 24526 + }, + { + "epoch": 0.6297845843744634, + "grad_norm": 0.78125, + "learning_rate": 0.0001403244060550505, + "loss": 0.8673, + "step": 24527 + }, + { + "epoch": 0.6298102615703852, + "grad_norm": 0.80078125, + "learning_rate": 0.00014032032085752402, + "loss": 0.9323, + "step": 24528 + }, + { + "epoch": 0.6298359387663071, + "grad_norm": 0.78515625, + "learning_rate": 0.00014031623557964282, + "loss": 0.8521, + "step": 24529 + }, + { + "epoch": 0.6298616159622289, + "grad_norm": 0.76171875, + "learning_rate": 0.000140312150221415, + "loss": 0.8215, + "step": 24530 + }, + { + "epoch": 0.6298872931581506, + "grad_norm": 0.75390625, + "learning_rate": 0.00014030806478284867, + "loss": 0.8361, + "step": 24531 + }, + { + "epoch": 0.6299129703540725, + "grad_norm": 0.80859375, + "learning_rate": 0.00014030397926395203, + "loss": 0.9533, + "step": 24532 + }, + { + "epoch": 0.6299386475499943, + "grad_norm": 0.80859375, + "learning_rate": 0.0001402998936647332, + "loss": 0.7879, + "step": 24533 + }, + { + "epoch": 0.6299643247459161, + "grad_norm": 0.77734375, + "learning_rate": 0.00014029580798520034, + "loss": 0.934, + "step": 24534 + }, + { + "epoch": 0.629990001941838, + "grad_norm": 0.78125, + "learning_rate": 0.0001402917222253616, + "loss": 0.8667, + "step": 24535 + }, + { + "epoch": 0.6300156791377598, + "grad_norm": 0.70703125, + "learning_rate": 0.00014028763638522501, + "loss": 0.7079, + "step": 24536 + }, + { + "epoch": 0.6300413563336816, + "grad_norm": 0.78515625, + "learning_rate": 0.00014028355046479885, + "loss": 0.824, + "step": 24537 + }, + { + "epoch": 0.6300670335296034, + "grad_norm": 0.8671875, + "learning_rate": 0.00014027946446409123, + "loss": 0.9322, + "step": 24538 + }, + { + "epoch": 0.6300927107255252, + "grad_norm": 0.73046875, + "learning_rate": 0.00014027537838311025, + "loss": 0.8466, + "step": 24539 + }, + { + "epoch": 0.630118387921447, + "grad_norm": 0.8046875, + "learning_rate": 0.0001402712922218641, + "loss": 0.8209, + "step": 24540 + }, + { + "epoch": 0.6301440651173689, + "grad_norm": 0.828125, + "learning_rate": 0.00014026720598036088, + "loss": 0.7459, + "step": 24541 + }, + { + "epoch": 0.6301697423132907, + "grad_norm": 0.7890625, + "learning_rate": 0.00014026311965860877, + "loss": 0.8379, + "step": 24542 + }, + { + "epoch": 0.6301954195092125, + "grad_norm": 0.71875, + "learning_rate": 0.00014025903325661589, + "loss": 0.8066, + "step": 24543 + }, + { + "epoch": 0.6302210967051343, + "grad_norm": 0.78515625, + "learning_rate": 0.00014025494677439036, + "loss": 0.8796, + "step": 24544 + }, + { + "epoch": 0.6302467739010561, + "grad_norm": 0.8515625, + "learning_rate": 0.00014025086021194044, + "loss": 0.8543, + "step": 24545 + }, + { + "epoch": 0.6302724510969779, + "grad_norm": 0.82421875, + "learning_rate": 0.0001402467735692741, + "loss": 0.8297, + "step": 24546 + }, + { + "epoch": 0.6302981282928998, + "grad_norm": 0.7578125, + "learning_rate": 0.00014024268684639966, + "loss": 0.7455, + "step": 24547 + }, + { + "epoch": 0.6303238054888216, + "grad_norm": 0.69921875, + "learning_rate": 0.00014023860004332513, + "loss": 0.8255, + "step": 24548 + }, + { + "epoch": 0.6303494826847434, + "grad_norm": 0.7890625, + "learning_rate": 0.0001402345131600587, + "loss": 0.8484, + "step": 24549 + }, + { + "epoch": 0.6303751598806653, + "grad_norm": 0.80078125, + "learning_rate": 0.00014023042619660855, + "loss": 0.9114, + "step": 24550 + }, + { + "epoch": 0.630400837076587, + "grad_norm": 0.8046875, + "learning_rate": 0.00014022633915298277, + "loss": 0.8474, + "step": 24551 + }, + { + "epoch": 0.6304265142725088, + "grad_norm": 0.78125, + "learning_rate": 0.00014022225202918955, + "loss": 0.821, + "step": 24552 + }, + { + "epoch": 0.6304521914684307, + "grad_norm": 0.7890625, + "learning_rate": 0.00014021816482523701, + "loss": 0.8924, + "step": 24553 + }, + { + "epoch": 0.6304778686643525, + "grad_norm": 0.80859375, + "learning_rate": 0.00014021407754113331, + "loss": 0.8352, + "step": 24554 + }, + { + "epoch": 0.6305035458602744, + "grad_norm": 0.74609375, + "learning_rate": 0.00014020999017688655, + "loss": 0.8709, + "step": 24555 + }, + { + "epoch": 0.6305292230561962, + "grad_norm": 0.71484375, + "learning_rate": 0.00014020590273250492, + "loss": 0.7928, + "step": 24556 + }, + { + "epoch": 0.630554900252118, + "grad_norm": 0.7734375, + "learning_rate": 0.0001402018152079966, + "loss": 0.8661, + "step": 24557 + }, + { + "epoch": 0.6305805774480397, + "grad_norm": 0.7734375, + "learning_rate": 0.00014019772760336964, + "loss": 0.8766, + "step": 24558 + }, + { + "epoch": 0.6306062546439616, + "grad_norm": 0.75, + "learning_rate": 0.00014019363991863232, + "loss": 0.9129, + "step": 24559 + }, + { + "epoch": 0.6306319318398834, + "grad_norm": 0.73046875, + "learning_rate": 0.0001401895521537926, + "loss": 0.8161, + "step": 24560 + }, + { + "epoch": 0.6306576090358053, + "grad_norm": 0.75390625, + "learning_rate": 0.0001401854643088588, + "loss": 0.8878, + "step": 24561 + }, + { + "epoch": 0.6306832862317271, + "grad_norm": 0.74609375, + "learning_rate": 0.000140181376383839, + "loss": 0.8077, + "step": 24562 + }, + { + "epoch": 0.6307089634276489, + "grad_norm": 0.6640625, + "learning_rate": 0.00014017728837874128, + "loss": 0.7964, + "step": 24563 + }, + { + "epoch": 0.6307346406235707, + "grad_norm": 0.80078125, + "learning_rate": 0.00014017320029357394, + "loss": 0.8855, + "step": 24564 + }, + { + "epoch": 0.6307603178194925, + "grad_norm": 0.765625, + "learning_rate": 0.000140169112128345, + "loss": 0.7473, + "step": 24565 + }, + { + "epoch": 0.6307859950154143, + "grad_norm": 0.80078125, + "learning_rate": 0.00014016502388306264, + "loss": 0.8243, + "step": 24566 + }, + { + "epoch": 0.6308116722113362, + "grad_norm": 0.74609375, + "learning_rate": 0.00014016093555773503, + "loss": 0.7906, + "step": 24567 + }, + { + "epoch": 0.630837349407258, + "grad_norm": 0.79296875, + "learning_rate": 0.00014015684715237028, + "loss": 0.8902, + "step": 24568 + }, + { + "epoch": 0.6308630266031798, + "grad_norm": 0.83984375, + "learning_rate": 0.0001401527586669766, + "loss": 0.8426, + "step": 24569 + }, + { + "epoch": 0.6308887037991017, + "grad_norm": 0.8046875, + "learning_rate": 0.0001401486701015621, + "loss": 0.8495, + "step": 24570 + }, + { + "epoch": 0.6309143809950234, + "grad_norm": 0.76953125, + "learning_rate": 0.0001401445814561349, + "loss": 0.8001, + "step": 24571 + }, + { + "epoch": 0.6309400581909452, + "grad_norm": 0.80859375, + "learning_rate": 0.00014014049273070319, + "loss": 0.9166, + "step": 24572 + }, + { + "epoch": 0.6309657353868671, + "grad_norm": 0.76171875, + "learning_rate": 0.00014013640392527508, + "loss": 0.814, + "step": 24573 + }, + { + "epoch": 0.6309914125827889, + "grad_norm": 0.71484375, + "learning_rate": 0.00014013231503985876, + "loss": 0.7683, + "step": 24574 + }, + { + "epoch": 0.6310170897787107, + "grad_norm": 0.8515625, + "learning_rate": 0.00014012822607446237, + "loss": 0.89, + "step": 24575 + }, + { + "epoch": 0.6310427669746326, + "grad_norm": 0.71484375, + "learning_rate": 0.000140124137029094, + "loss": 0.8057, + "step": 24576 + }, + { + "epoch": 0.6310684441705544, + "grad_norm": 0.8046875, + "learning_rate": 0.0001401200479037619, + "loss": 0.8365, + "step": 24577 + }, + { + "epoch": 0.6310941213664761, + "grad_norm": 0.7890625, + "learning_rate": 0.0001401159586984742, + "loss": 0.8573, + "step": 24578 + }, + { + "epoch": 0.631119798562398, + "grad_norm": 0.796875, + "learning_rate": 0.00014011186941323897, + "loss": 0.811, + "step": 24579 + }, + { + "epoch": 0.6311454757583198, + "grad_norm": 0.828125, + "learning_rate": 0.0001401077800480644, + "loss": 0.9418, + "step": 24580 + }, + { + "epoch": 0.6311711529542416, + "grad_norm": 0.87890625, + "learning_rate": 0.00014010369060295865, + "loss": 0.8585, + "step": 24581 + }, + { + "epoch": 0.6311968301501635, + "grad_norm": 0.7734375, + "learning_rate": 0.0001400996010779299, + "loss": 0.9736, + "step": 24582 + }, + { + "epoch": 0.6312225073460853, + "grad_norm": 0.78515625, + "learning_rate": 0.00014009551147298622, + "loss": 0.9077, + "step": 24583 + }, + { + "epoch": 0.631248184542007, + "grad_norm": 0.8515625, + "learning_rate": 0.00014009142178813584, + "loss": 0.9429, + "step": 24584 + }, + { + "epoch": 0.6312738617379289, + "grad_norm": 0.75390625, + "learning_rate": 0.0001400873320233869, + "loss": 0.8965, + "step": 24585 + }, + { + "epoch": 0.6312995389338507, + "grad_norm": 0.75390625, + "learning_rate": 0.00014008324217874745, + "loss": 0.7412, + "step": 24586 + }, + { + "epoch": 0.6313252161297725, + "grad_norm": 0.75, + "learning_rate": 0.00014007915225422579, + "loss": 0.8793, + "step": 24587 + }, + { + "epoch": 0.6313508933256944, + "grad_norm": 0.76953125, + "learning_rate": 0.00014007506224983, + "loss": 0.8328, + "step": 24588 + }, + { + "epoch": 0.6313765705216162, + "grad_norm": 0.75, + "learning_rate": 0.00014007097216556819, + "loss": 0.8499, + "step": 24589 + }, + { + "epoch": 0.6314022477175381, + "grad_norm": 0.80078125, + "learning_rate": 0.00014006688200144857, + "loss": 0.8627, + "step": 24590 + }, + { + "epoch": 0.6314279249134598, + "grad_norm": 0.73046875, + "learning_rate": 0.00014006279175747928, + "loss": 0.9046, + "step": 24591 + }, + { + "epoch": 0.6314536021093816, + "grad_norm": 0.71875, + "learning_rate": 0.00014005870143366845, + "loss": 0.8345, + "step": 24592 + }, + { + "epoch": 0.6314792793053035, + "grad_norm": 0.74609375, + "learning_rate": 0.00014005461103002426, + "loss": 0.9249, + "step": 24593 + }, + { + "epoch": 0.6315049565012253, + "grad_norm": 0.80859375, + "learning_rate": 0.00014005052054655485, + "loss": 0.9506, + "step": 24594 + }, + { + "epoch": 0.6315306336971471, + "grad_norm": 0.828125, + "learning_rate": 0.00014004642998326834, + "loss": 0.9573, + "step": 24595 + }, + { + "epoch": 0.631556310893069, + "grad_norm": 0.80078125, + "learning_rate": 0.00014004233934017296, + "loss": 0.8981, + "step": 24596 + }, + { + "epoch": 0.6315819880889907, + "grad_norm": 0.7421875, + "learning_rate": 0.0001400382486172768, + "loss": 0.8264, + "step": 24597 + }, + { + "epoch": 0.6316076652849125, + "grad_norm": 0.78125, + "learning_rate": 0.000140034157814588, + "loss": 0.8742, + "step": 24598 + }, + { + "epoch": 0.6316333424808344, + "grad_norm": 0.71484375, + "learning_rate": 0.00014003006693211477, + "loss": 0.8214, + "step": 24599 + }, + { + "epoch": 0.6316590196767562, + "grad_norm": 0.7890625, + "learning_rate": 0.00014002597596986523, + "loss": 0.9855, + "step": 24600 + }, + { + "epoch": 0.631684696872678, + "grad_norm": 0.77734375, + "learning_rate": 0.00014002188492784755, + "loss": 0.9422, + "step": 24601 + }, + { + "epoch": 0.6317103740685999, + "grad_norm": 0.796875, + "learning_rate": 0.00014001779380606982, + "loss": 0.8455, + "step": 24602 + }, + { + "epoch": 0.6317360512645217, + "grad_norm": 0.74609375, + "learning_rate": 0.00014001370260454025, + "loss": 0.8582, + "step": 24603 + }, + { + "epoch": 0.6317617284604434, + "grad_norm": 0.796875, + "learning_rate": 0.00014000961132326699, + "loss": 0.9861, + "step": 24604 + }, + { + "epoch": 0.6317874056563653, + "grad_norm": 0.67578125, + "learning_rate": 0.0001400055199622582, + "loss": 0.8414, + "step": 24605 + }, + { + "epoch": 0.6318130828522871, + "grad_norm": 0.80078125, + "learning_rate": 0.00014000142852152204, + "loss": 0.9278, + "step": 24606 + }, + { + "epoch": 0.6318387600482089, + "grad_norm": 0.7890625, + "learning_rate": 0.00013999733700106664, + "loss": 0.9317, + "step": 24607 + }, + { + "epoch": 0.6318644372441308, + "grad_norm": 0.71484375, + "learning_rate": 0.00013999324540090013, + "loss": 0.7144, + "step": 24608 + }, + { + "epoch": 0.6318901144400526, + "grad_norm": 0.76171875, + "learning_rate": 0.00013998915372103073, + "loss": 0.7821, + "step": 24609 + }, + { + "epoch": 0.6319157916359744, + "grad_norm": 0.7578125, + "learning_rate": 0.00013998506196146655, + "loss": 0.7276, + "step": 24610 + }, + { + "epoch": 0.6319414688318962, + "grad_norm": 0.77734375, + "learning_rate": 0.0001399809701222157, + "loss": 0.8253, + "step": 24611 + }, + { + "epoch": 0.631967146027818, + "grad_norm": 0.79296875, + "learning_rate": 0.00013997687820328648, + "loss": 0.7747, + "step": 24612 + }, + { + "epoch": 0.6319928232237398, + "grad_norm": 0.84375, + "learning_rate": 0.0001399727862046869, + "loss": 0.8962, + "step": 24613 + }, + { + "epoch": 0.6320185004196617, + "grad_norm": 0.76953125, + "learning_rate": 0.00013996869412642517, + "loss": 0.8482, + "step": 24614 + }, + { + "epoch": 0.6320441776155835, + "grad_norm": 0.82421875, + "learning_rate": 0.00013996460196850948, + "loss": 0.8679, + "step": 24615 + }, + { + "epoch": 0.6320698548115053, + "grad_norm": 0.82421875, + "learning_rate": 0.00013996050973094786, + "loss": 0.8403, + "step": 24616 + }, + { + "epoch": 0.6320955320074271, + "grad_norm": 0.8125, + "learning_rate": 0.00013995641741374864, + "loss": 0.8737, + "step": 24617 + }, + { + "epoch": 0.6321212092033489, + "grad_norm": 0.79296875, + "learning_rate": 0.00013995232501691986, + "loss": 0.8341, + "step": 24618 + }, + { + "epoch": 0.6321468863992707, + "grad_norm": 0.8046875, + "learning_rate": 0.0001399482325404697, + "loss": 0.7401, + "step": 24619 + }, + { + "epoch": 0.6321725635951926, + "grad_norm": 0.79296875, + "learning_rate": 0.00013994413998440634, + "loss": 0.8562, + "step": 24620 + }, + { + "epoch": 0.6321982407911144, + "grad_norm": 0.71875, + "learning_rate": 0.00013994004734873788, + "loss": 0.8372, + "step": 24621 + }, + { + "epoch": 0.6322239179870363, + "grad_norm": 0.87890625, + "learning_rate": 0.00013993595463347255, + "loss": 0.9861, + "step": 24622 + }, + { + "epoch": 0.6322495951829581, + "grad_norm": 0.77734375, + "learning_rate": 0.00013993186183861847, + "loss": 0.8969, + "step": 24623 + }, + { + "epoch": 0.6322752723788798, + "grad_norm": 0.80078125, + "learning_rate": 0.00013992776896418375, + "loss": 0.8749, + "step": 24624 + }, + { + "epoch": 0.6323009495748017, + "grad_norm": 0.76953125, + "learning_rate": 0.00013992367601017665, + "loss": 0.8954, + "step": 24625 + }, + { + "epoch": 0.6323266267707235, + "grad_norm": 0.734375, + "learning_rate": 0.00013991958297660524, + "loss": 0.7387, + "step": 24626 + }, + { + "epoch": 0.6323523039666453, + "grad_norm": 0.75390625, + "learning_rate": 0.00013991548986347772, + "loss": 0.8787, + "step": 24627 + }, + { + "epoch": 0.6323779811625672, + "grad_norm": 0.7421875, + "learning_rate": 0.0001399113966708022, + "loss": 0.7712, + "step": 24628 + }, + { + "epoch": 0.632403658358489, + "grad_norm": 0.7890625, + "learning_rate": 0.0001399073033985869, + "loss": 0.9084, + "step": 24629 + }, + { + "epoch": 0.6324293355544108, + "grad_norm": 0.83203125, + "learning_rate": 0.00013990321004683995, + "loss": 0.9877, + "step": 24630 + }, + { + "epoch": 0.6324550127503326, + "grad_norm": 0.75, + "learning_rate": 0.00013989911661556955, + "loss": 0.8948, + "step": 24631 + }, + { + "epoch": 0.6324806899462544, + "grad_norm": 0.7109375, + "learning_rate": 0.00013989502310478373, + "loss": 0.6447, + "step": 24632 + }, + { + "epoch": 0.6325063671421762, + "grad_norm": 0.78515625, + "learning_rate": 0.00013989092951449079, + "loss": 0.8179, + "step": 24633 + }, + { + "epoch": 0.6325320443380981, + "grad_norm": 0.77734375, + "learning_rate": 0.0001398868358446988, + "loss": 0.7741, + "step": 24634 + }, + { + "epoch": 0.6325577215340199, + "grad_norm": 0.828125, + "learning_rate": 0.00013988274209541593, + "loss": 0.9271, + "step": 24635 + }, + { + "epoch": 0.6325833987299417, + "grad_norm": 0.6875, + "learning_rate": 0.0001398786482666504, + "loss": 0.8735, + "step": 24636 + }, + { + "epoch": 0.6326090759258635, + "grad_norm": 0.8046875, + "learning_rate": 0.00013987455435841031, + "loss": 0.9137, + "step": 24637 + }, + { + "epoch": 0.6326347531217853, + "grad_norm": 0.84375, + "learning_rate": 0.00013987046037070385, + "loss": 0.834, + "step": 24638 + }, + { + "epoch": 0.6326604303177071, + "grad_norm": 0.7265625, + "learning_rate": 0.00013986636630353917, + "loss": 0.8521, + "step": 24639 + }, + { + "epoch": 0.632686107513629, + "grad_norm": 0.77734375, + "learning_rate": 0.0001398622721569244, + "loss": 0.8998, + "step": 24640 + }, + { + "epoch": 0.6327117847095508, + "grad_norm": 0.78515625, + "learning_rate": 0.00013985817793086774, + "loss": 0.8911, + "step": 24641 + }, + { + "epoch": 0.6327374619054726, + "grad_norm": 0.7421875, + "learning_rate": 0.0001398540836253773, + "loss": 0.9181, + "step": 24642 + }, + { + "epoch": 0.6327631391013945, + "grad_norm": 0.8046875, + "learning_rate": 0.0001398499892404613, + "loss": 0.8371, + "step": 24643 + }, + { + "epoch": 0.6327888162973162, + "grad_norm": 0.76953125, + "learning_rate": 0.00013984589477612788, + "loss": 0.8041, + "step": 24644 + }, + { + "epoch": 0.632814493493238, + "grad_norm": 0.76171875, + "learning_rate": 0.0001398418002323852, + "loss": 0.8066, + "step": 24645 + }, + { + "epoch": 0.6328401706891599, + "grad_norm": 0.73828125, + "learning_rate": 0.00013983770560924133, + "loss": 1.0044, + "step": 24646 + }, + { + "epoch": 0.6328658478850817, + "grad_norm": 0.81640625, + "learning_rate": 0.0001398336109067046, + "loss": 0.9514, + "step": 24647 + }, + { + "epoch": 0.6328915250810035, + "grad_norm": 0.7421875, + "learning_rate": 0.00013982951612478303, + "loss": 0.8274, + "step": 24648 + }, + { + "epoch": 0.6329172022769254, + "grad_norm": 0.78125, + "learning_rate": 0.00013982542126348487, + "loss": 0.8888, + "step": 24649 + }, + { + "epoch": 0.6329428794728472, + "grad_norm": 0.7734375, + "learning_rate": 0.0001398213263228182, + "loss": 0.9012, + "step": 24650 + }, + { + "epoch": 0.632968556668769, + "grad_norm": 0.80078125, + "learning_rate": 0.00013981723130279127, + "loss": 0.8855, + "step": 24651 + }, + { + "epoch": 0.6329942338646908, + "grad_norm": 0.81640625, + "learning_rate": 0.00013981313620341215, + "loss": 0.9481, + "step": 24652 + }, + { + "epoch": 0.6330199110606126, + "grad_norm": 0.69921875, + "learning_rate": 0.0001398090410246891, + "loss": 0.8604, + "step": 24653 + }, + { + "epoch": 0.6330455882565345, + "grad_norm": 0.68359375, + "learning_rate": 0.00013980494576663018, + "loss": 0.8457, + "step": 24654 + }, + { + "epoch": 0.6330712654524563, + "grad_norm": 0.765625, + "learning_rate": 0.0001398008504292436, + "loss": 0.8438, + "step": 24655 + }, + { + "epoch": 0.6330969426483781, + "grad_norm": 0.828125, + "learning_rate": 0.00013979675501253752, + "loss": 0.8448, + "step": 24656 + }, + { + "epoch": 0.6331226198442998, + "grad_norm": 0.84375, + "learning_rate": 0.0001397926595165201, + "loss": 0.8547, + "step": 24657 + }, + { + "epoch": 0.6331482970402217, + "grad_norm": 0.7734375, + "learning_rate": 0.00013978856394119955, + "loss": 0.7906, + "step": 24658 + }, + { + "epoch": 0.6331739742361435, + "grad_norm": 0.79296875, + "learning_rate": 0.00013978446828658396, + "loss": 0.8336, + "step": 24659 + }, + { + "epoch": 0.6331996514320654, + "grad_norm": 0.7734375, + "learning_rate": 0.00013978037255268151, + "loss": 0.8769, + "step": 24660 + }, + { + "epoch": 0.6332253286279872, + "grad_norm": 0.8359375, + "learning_rate": 0.00013977627673950039, + "loss": 0.8281, + "step": 24661 + }, + { + "epoch": 0.633251005823909, + "grad_norm": 0.83203125, + "learning_rate": 0.00013977218084704874, + "loss": 0.9133, + "step": 24662 + }, + { + "epoch": 0.6332766830198309, + "grad_norm": 0.86328125, + "learning_rate": 0.0001397680848753347, + "loss": 1.0172, + "step": 24663 + }, + { + "epoch": 0.6333023602157526, + "grad_norm": 0.765625, + "learning_rate": 0.0001397639888243665, + "loss": 0.9491, + "step": 24664 + }, + { + "epoch": 0.6333280374116744, + "grad_norm": 0.796875, + "learning_rate": 0.00013975989269415223, + "loss": 0.8657, + "step": 24665 + }, + { + "epoch": 0.6333537146075963, + "grad_norm": 0.734375, + "learning_rate": 0.00013975579648470006, + "loss": 0.8394, + "step": 24666 + }, + { + "epoch": 0.6333793918035181, + "grad_norm": 0.79296875, + "learning_rate": 0.00013975170019601822, + "loss": 0.7663, + "step": 24667 + }, + { + "epoch": 0.6334050689994399, + "grad_norm": 0.80859375, + "learning_rate": 0.00013974760382811483, + "loss": 0.7635, + "step": 24668 + }, + { + "epoch": 0.6334307461953618, + "grad_norm": 0.82421875, + "learning_rate": 0.000139743507380998, + "loss": 0.8651, + "step": 24669 + }, + { + "epoch": 0.6334564233912836, + "grad_norm": 0.7890625, + "learning_rate": 0.00013973941085467604, + "loss": 0.8098, + "step": 24670 + }, + { + "epoch": 0.6334821005872053, + "grad_norm": 0.8515625, + "learning_rate": 0.00013973531424915699, + "loss": 0.8218, + "step": 24671 + }, + { + "epoch": 0.6335077777831272, + "grad_norm": 0.78125, + "learning_rate": 0.00013973121756444905, + "loss": 0.844, + "step": 24672 + }, + { + "epoch": 0.633533454979049, + "grad_norm": 0.70703125, + "learning_rate": 0.00013972712080056036, + "loss": 0.9001, + "step": 24673 + }, + { + "epoch": 0.6335591321749708, + "grad_norm": 0.71484375, + "learning_rate": 0.0001397230239574991, + "loss": 0.6969, + "step": 24674 + }, + { + "epoch": 0.6335848093708927, + "grad_norm": 0.8125, + "learning_rate": 0.00013971892703527346, + "loss": 0.9096, + "step": 24675 + }, + { + "epoch": 0.6336104865668145, + "grad_norm": 0.78515625, + "learning_rate": 0.00013971483003389162, + "loss": 0.9846, + "step": 24676 + }, + { + "epoch": 0.6336361637627362, + "grad_norm": 0.7421875, + "learning_rate": 0.00013971073295336166, + "loss": 0.8333, + "step": 24677 + }, + { + "epoch": 0.6336618409586581, + "grad_norm": 0.73046875, + "learning_rate": 0.0001397066357936918, + "loss": 0.9491, + "step": 24678 + }, + { + "epoch": 0.6336875181545799, + "grad_norm": 0.71484375, + "learning_rate": 0.00013970253855489023, + "loss": 0.9016, + "step": 24679 + }, + { + "epoch": 0.6337131953505017, + "grad_norm": 0.82421875, + "learning_rate": 0.00013969844123696507, + "loss": 0.9646, + "step": 24680 + }, + { + "epoch": 0.6337388725464236, + "grad_norm": 0.73828125, + "learning_rate": 0.00013969434383992452, + "loss": 0.7868, + "step": 24681 + }, + { + "epoch": 0.6337645497423454, + "grad_norm": 0.78515625, + "learning_rate": 0.00013969024636377667, + "loss": 0.9155, + "step": 24682 + }, + { + "epoch": 0.6337902269382673, + "grad_norm": 0.74609375, + "learning_rate": 0.00013968614880852982, + "loss": 0.7287, + "step": 24683 + }, + { + "epoch": 0.633815904134189, + "grad_norm": 0.8046875, + "learning_rate": 0.00013968205117419205, + "loss": 0.7896, + "step": 24684 + }, + { + "epoch": 0.6338415813301108, + "grad_norm": 0.828125, + "learning_rate": 0.00013967795346077152, + "loss": 0.9028, + "step": 24685 + }, + { + "epoch": 0.6338672585260327, + "grad_norm": 0.8046875, + "learning_rate": 0.0001396738556682764, + "loss": 0.8943, + "step": 24686 + }, + { + "epoch": 0.6338929357219545, + "grad_norm": 0.8125, + "learning_rate": 0.00013966975779671487, + "loss": 0.8987, + "step": 24687 + }, + { + "epoch": 0.6339186129178763, + "grad_norm": 0.84765625, + "learning_rate": 0.00013966565984609508, + "loss": 0.8489, + "step": 24688 + }, + { + "epoch": 0.6339442901137982, + "grad_norm": 0.70703125, + "learning_rate": 0.00013966156181642524, + "loss": 0.7454, + "step": 24689 + }, + { + "epoch": 0.63396996730972, + "grad_norm": 0.828125, + "learning_rate": 0.00013965746370771348, + "loss": 0.8238, + "step": 24690 + }, + { + "epoch": 0.6339956445056417, + "grad_norm": 0.8046875, + "learning_rate": 0.00013965336551996803, + "loss": 0.8642, + "step": 24691 + }, + { + "epoch": 0.6340213217015636, + "grad_norm": 0.78515625, + "learning_rate": 0.00013964926725319694, + "loss": 0.8363, + "step": 24692 + }, + { + "epoch": 0.6340469988974854, + "grad_norm": 0.87109375, + "learning_rate": 0.00013964516890740844, + "loss": 0.8518, + "step": 24693 + }, + { + "epoch": 0.6340726760934072, + "grad_norm": 0.76953125, + "learning_rate": 0.00013964107048261071, + "loss": 0.7632, + "step": 24694 + }, + { + "epoch": 0.6340983532893291, + "grad_norm": 0.7890625, + "learning_rate": 0.00013963697197881195, + "loss": 1.0131, + "step": 24695 + }, + { + "epoch": 0.6341240304852509, + "grad_norm": 0.73828125, + "learning_rate": 0.00013963287339602021, + "loss": 0.7929, + "step": 24696 + }, + { + "epoch": 0.6341497076811726, + "grad_norm": 0.7265625, + "learning_rate": 0.0001396287747342438, + "loss": 0.7695, + "step": 24697 + }, + { + "epoch": 0.6341753848770945, + "grad_norm": 0.7421875, + "learning_rate": 0.0001396246759934908, + "loss": 0.7849, + "step": 24698 + }, + { + "epoch": 0.6342010620730163, + "grad_norm": 0.70703125, + "learning_rate": 0.00013962057717376939, + "loss": 0.8016, + "step": 24699 + }, + { + "epoch": 0.6342267392689381, + "grad_norm": 0.8203125, + "learning_rate": 0.00013961647827508776, + "loss": 0.9331, + "step": 24700 + }, + { + "epoch": 0.63425241646486, + "grad_norm": 0.7890625, + "learning_rate": 0.00013961237929745405, + "loss": 0.8691, + "step": 24701 + }, + { + "epoch": 0.6342780936607818, + "grad_norm": 0.80859375, + "learning_rate": 0.00013960828024087645, + "loss": 0.9435, + "step": 24702 + }, + { + "epoch": 0.6343037708567036, + "grad_norm": 0.765625, + "learning_rate": 0.00013960418110536314, + "loss": 0.8557, + "step": 24703 + }, + { + "epoch": 0.6343294480526254, + "grad_norm": 0.8046875, + "learning_rate": 0.00013960008189092227, + "loss": 0.9063, + "step": 24704 + }, + { + "epoch": 0.6343551252485472, + "grad_norm": 0.76171875, + "learning_rate": 0.00013959598259756203, + "loss": 0.8563, + "step": 24705 + }, + { + "epoch": 0.634380802444469, + "grad_norm": 0.83203125, + "learning_rate": 0.00013959188322529054, + "loss": 0.7841, + "step": 24706 + }, + { + "epoch": 0.6344064796403909, + "grad_norm": 0.796875, + "learning_rate": 0.00013958778377411602, + "loss": 0.8516, + "step": 24707 + }, + { + "epoch": 0.6344321568363127, + "grad_norm": 0.80859375, + "learning_rate": 0.00013958368424404663, + "loss": 0.958, + "step": 24708 + }, + { + "epoch": 0.6344578340322345, + "grad_norm": 0.80078125, + "learning_rate": 0.0001395795846350905, + "loss": 0.8228, + "step": 24709 + }, + { + "epoch": 0.6344835112281564, + "grad_norm": 0.79296875, + "learning_rate": 0.00013957548494725587, + "loss": 0.7977, + "step": 24710 + }, + { + "epoch": 0.6345091884240781, + "grad_norm": 0.73828125, + "learning_rate": 0.00013957138518055088, + "loss": 0.8498, + "step": 24711 + }, + { + "epoch": 0.6345348656199999, + "grad_norm": 0.734375, + "learning_rate": 0.00013956728533498364, + "loss": 0.9139, + "step": 24712 + }, + { + "epoch": 0.6345605428159218, + "grad_norm": 0.80859375, + "learning_rate": 0.00013956318541056244, + "loss": 0.801, + "step": 24713 + }, + { + "epoch": 0.6345862200118436, + "grad_norm": 0.73046875, + "learning_rate": 0.00013955908540729531, + "loss": 0.7661, + "step": 24714 + }, + { + "epoch": 0.6346118972077655, + "grad_norm": 0.828125, + "learning_rate": 0.00013955498532519057, + "loss": 0.7193, + "step": 24715 + }, + { + "epoch": 0.6346375744036873, + "grad_norm": 0.7890625, + "learning_rate": 0.00013955088516425627, + "loss": 0.7833, + "step": 24716 + }, + { + "epoch": 0.634663251599609, + "grad_norm": 0.765625, + "learning_rate": 0.00013954678492450066, + "loss": 1.0295, + "step": 24717 + }, + { + "epoch": 0.6346889287955308, + "grad_norm": 0.87109375, + "learning_rate": 0.00013954268460593188, + "loss": 0.86, + "step": 24718 + }, + { + "epoch": 0.6347146059914527, + "grad_norm": 0.8203125, + "learning_rate": 0.0001395385842085581, + "loss": 0.9835, + "step": 24719 + }, + { + "epoch": 0.6347402831873745, + "grad_norm": 0.796875, + "learning_rate": 0.00013953448373238745, + "loss": 0.7315, + "step": 24720 + }, + { + "epoch": 0.6347659603832964, + "grad_norm": 0.78125, + "learning_rate": 0.00013953038317742817, + "loss": 0.9203, + "step": 24721 + }, + { + "epoch": 0.6347916375792182, + "grad_norm": 0.6875, + "learning_rate": 0.0001395262825436884, + "loss": 0.7878, + "step": 24722 + }, + { + "epoch": 0.63481731477514, + "grad_norm": 0.796875, + "learning_rate": 0.00013952218183117636, + "loss": 0.9123, + "step": 24723 + }, + { + "epoch": 0.6348429919710618, + "grad_norm": 0.71875, + "learning_rate": 0.00013951808103990017, + "loss": 0.7586, + "step": 24724 + }, + { + "epoch": 0.6348686691669836, + "grad_norm": 0.796875, + "learning_rate": 0.00013951398016986798, + "loss": 0.9161, + "step": 24725 + }, + { + "epoch": 0.6348943463629054, + "grad_norm": 0.78515625, + "learning_rate": 0.000139509879221088, + "loss": 0.8997, + "step": 24726 + }, + { + "epoch": 0.6349200235588273, + "grad_norm": 0.7265625, + "learning_rate": 0.00013950577819356842, + "loss": 0.8463, + "step": 24727 + }, + { + "epoch": 0.6349457007547491, + "grad_norm": 0.765625, + "learning_rate": 0.00013950167708731738, + "loss": 0.868, + "step": 24728 + }, + { + "epoch": 0.6349713779506709, + "grad_norm": 0.72265625, + "learning_rate": 0.00013949757590234308, + "loss": 0.8319, + "step": 24729 + }, + { + "epoch": 0.6349970551465928, + "grad_norm": 0.87890625, + "learning_rate": 0.00013949347463865368, + "loss": 0.9655, + "step": 24730 + }, + { + "epoch": 0.6350227323425145, + "grad_norm": 0.85546875, + "learning_rate": 0.00013948937329625734, + "loss": 0.9435, + "step": 24731 + }, + { + "epoch": 0.6350484095384363, + "grad_norm": 0.7109375, + "learning_rate": 0.00013948527187516226, + "loss": 0.7671, + "step": 24732 + }, + { + "epoch": 0.6350740867343582, + "grad_norm": 0.87890625, + "learning_rate": 0.00013948117037537657, + "loss": 0.9403, + "step": 24733 + }, + { + "epoch": 0.63509976393028, + "grad_norm": 0.77734375, + "learning_rate": 0.00013947706879690848, + "loss": 0.8484, + "step": 24734 + }, + { + "epoch": 0.6351254411262018, + "grad_norm": 0.69140625, + "learning_rate": 0.0001394729671397662, + "loss": 0.7164, + "step": 24735 + }, + { + "epoch": 0.6351511183221237, + "grad_norm": 0.87890625, + "learning_rate": 0.0001394688654039578, + "loss": 0.8685, + "step": 24736 + }, + { + "epoch": 0.6351767955180454, + "grad_norm": 0.76171875, + "learning_rate": 0.00013946476358949158, + "loss": 0.8479, + "step": 24737 + }, + { + "epoch": 0.6352024727139672, + "grad_norm": 0.70703125, + "learning_rate": 0.0001394606616963756, + "loss": 0.7931, + "step": 24738 + }, + { + "epoch": 0.6352281499098891, + "grad_norm": 0.8046875, + "learning_rate": 0.00013945655972461812, + "loss": 0.923, + "step": 24739 + }, + { + "epoch": 0.6352538271058109, + "grad_norm": 0.69921875, + "learning_rate": 0.00013945245767422727, + "loss": 0.8519, + "step": 24740 + }, + { + "epoch": 0.6352795043017327, + "grad_norm": 0.76953125, + "learning_rate": 0.00013944835554521122, + "loss": 0.8999, + "step": 24741 + }, + { + "epoch": 0.6353051814976546, + "grad_norm": 0.83203125, + "learning_rate": 0.00013944425333757817, + "loss": 0.8237, + "step": 24742 + }, + { + "epoch": 0.6353308586935764, + "grad_norm": 0.75390625, + "learning_rate": 0.0001394401510513363, + "loss": 0.8202, + "step": 24743 + }, + { + "epoch": 0.6353565358894981, + "grad_norm": 0.87109375, + "learning_rate": 0.00013943604868649377, + "loss": 0.8713, + "step": 24744 + }, + { + "epoch": 0.63538221308542, + "grad_norm": 0.86328125, + "learning_rate": 0.00013943194624305876, + "loss": 0.7803, + "step": 24745 + }, + { + "epoch": 0.6354078902813418, + "grad_norm": 0.82421875, + "learning_rate": 0.0001394278437210394, + "loss": 0.7929, + "step": 24746 + }, + { + "epoch": 0.6354335674772636, + "grad_norm": 0.86328125, + "learning_rate": 0.00013942374112044394, + "loss": 0.9366, + "step": 24747 + }, + { + "epoch": 0.6354592446731855, + "grad_norm": 0.7890625, + "learning_rate": 0.00013941963844128053, + "loss": 0.9538, + "step": 24748 + }, + { + "epoch": 0.6354849218691073, + "grad_norm": 0.87109375, + "learning_rate": 0.00013941553568355732, + "loss": 0.8826, + "step": 24749 + }, + { + "epoch": 0.6355105990650292, + "grad_norm": 0.81640625, + "learning_rate": 0.00013941143284728255, + "loss": 0.8922, + "step": 24750 + }, + { + "epoch": 0.6355362762609509, + "grad_norm": 0.84375, + "learning_rate": 0.00013940732993246433, + "loss": 1.0224, + "step": 24751 + }, + { + "epoch": 0.6355619534568727, + "grad_norm": 0.8515625, + "learning_rate": 0.00013940322693911086, + "loss": 0.8316, + "step": 24752 + }, + { + "epoch": 0.6355876306527946, + "grad_norm": 0.79296875, + "learning_rate": 0.00013939912386723034, + "loss": 0.9174, + "step": 24753 + }, + { + "epoch": 0.6356133078487164, + "grad_norm": 0.75390625, + "learning_rate": 0.00013939502071683087, + "loss": 0.8322, + "step": 24754 + }, + { + "epoch": 0.6356389850446382, + "grad_norm": 0.78125, + "learning_rate": 0.0001393909174879207, + "loss": 0.748, + "step": 24755 + }, + { + "epoch": 0.6356646622405601, + "grad_norm": 0.78515625, + "learning_rate": 0.00013938681418050806, + "loss": 0.8646, + "step": 24756 + }, + { + "epoch": 0.6356903394364818, + "grad_norm": 0.83203125, + "learning_rate": 0.00013938271079460098, + "loss": 0.9922, + "step": 24757 + }, + { + "epoch": 0.6357160166324036, + "grad_norm": 0.72265625, + "learning_rate": 0.00013937860733020772, + "loss": 0.8291, + "step": 24758 + }, + { + "epoch": 0.6357416938283255, + "grad_norm": 0.75, + "learning_rate": 0.00013937450378733647, + "loss": 0.7776, + "step": 24759 + }, + { + "epoch": 0.6357673710242473, + "grad_norm": 0.87890625, + "learning_rate": 0.0001393704001659954, + "loss": 0.8791, + "step": 24760 + }, + { + "epoch": 0.6357930482201691, + "grad_norm": 0.7734375, + "learning_rate": 0.0001393662964661927, + "loss": 0.8948, + "step": 24761 + }, + { + "epoch": 0.635818725416091, + "grad_norm": 0.828125, + "learning_rate": 0.00013936219268793643, + "loss": 0.8137, + "step": 24762 + }, + { + "epoch": 0.6358444026120128, + "grad_norm": 0.83984375, + "learning_rate": 0.00013935808883123497, + "loss": 0.895, + "step": 24763 + }, + { + "epoch": 0.6358700798079345, + "grad_norm": 0.77734375, + "learning_rate": 0.00013935398489609637, + "loss": 0.7704, + "step": 24764 + }, + { + "epoch": 0.6358957570038564, + "grad_norm": 0.84375, + "learning_rate": 0.0001393498808825288, + "loss": 0.8691, + "step": 24765 + }, + { + "epoch": 0.6359214341997782, + "grad_norm": 0.7734375, + "learning_rate": 0.00013934577679054048, + "loss": 0.8922, + "step": 24766 + }, + { + "epoch": 0.6359471113957, + "grad_norm": 0.73828125, + "learning_rate": 0.00013934167262013962, + "loss": 0.7609, + "step": 24767 + }, + { + "epoch": 0.6359727885916219, + "grad_norm": 0.875, + "learning_rate": 0.0001393375683713343, + "loss": 0.9691, + "step": 24768 + }, + { + "epoch": 0.6359984657875437, + "grad_norm": 0.78515625, + "learning_rate": 0.00013933346404413282, + "loss": 0.7625, + "step": 24769 + }, + { + "epoch": 0.6360241429834655, + "grad_norm": 0.76953125, + "learning_rate": 0.00013932935963854325, + "loss": 0.8688, + "step": 24770 + }, + { + "epoch": 0.6360498201793873, + "grad_norm": 0.75390625, + "learning_rate": 0.00013932525515457383, + "loss": 0.9614, + "step": 24771 + }, + { + "epoch": 0.6360754973753091, + "grad_norm": 0.83984375, + "learning_rate": 0.00013932115059223275, + "loss": 0.8408, + "step": 24772 + }, + { + "epoch": 0.6361011745712309, + "grad_norm": 0.72265625, + "learning_rate": 0.00013931704595152814, + "loss": 0.8997, + "step": 24773 + }, + { + "epoch": 0.6361268517671528, + "grad_norm": 0.70703125, + "learning_rate": 0.00013931294123246826, + "loss": 0.8467, + "step": 24774 + }, + { + "epoch": 0.6361525289630746, + "grad_norm": 1.1171875, + "learning_rate": 0.00013930883643506118, + "loss": 0.9053, + "step": 24775 + }, + { + "epoch": 0.6361782061589965, + "grad_norm": 0.78515625, + "learning_rate": 0.0001393047315593152, + "loss": 0.9444, + "step": 24776 + }, + { + "epoch": 0.6362038833549182, + "grad_norm": 0.7734375, + "learning_rate": 0.0001393006266052384, + "loss": 0.861, + "step": 24777 + }, + { + "epoch": 0.63622956055084, + "grad_norm": 0.859375, + "learning_rate": 0.00013929652157283897, + "loss": 0.7412, + "step": 24778 + }, + { + "epoch": 0.6362552377467618, + "grad_norm": 0.82421875, + "learning_rate": 0.00013929241646212516, + "loss": 1.0427, + "step": 24779 + }, + { + "epoch": 0.6362809149426837, + "grad_norm": 0.7734375, + "learning_rate": 0.00013928831127310514, + "loss": 0.7937, + "step": 24780 + }, + { + "epoch": 0.6363065921386055, + "grad_norm": 1.1171875, + "learning_rate": 0.00013928420600578702, + "loss": 0.8663, + "step": 24781 + }, + { + "epoch": 0.6363322693345274, + "grad_norm": 0.8125, + "learning_rate": 0.000139280100660179, + "loss": 0.8128, + "step": 24782 + }, + { + "epoch": 0.6363579465304492, + "grad_norm": 0.85546875, + "learning_rate": 0.0001392759952362894, + "loss": 0.9224, + "step": 24783 + }, + { + "epoch": 0.6363836237263709, + "grad_norm": 0.7109375, + "learning_rate": 0.0001392718897341262, + "loss": 0.7956, + "step": 24784 + }, + { + "epoch": 0.6364093009222928, + "grad_norm": 0.76171875, + "learning_rate": 0.00013926778415369767, + "loss": 0.8668, + "step": 24785 + }, + { + "epoch": 0.6364349781182146, + "grad_norm": 0.859375, + "learning_rate": 0.000139263678495012, + "loss": 0.8552, + "step": 24786 + }, + { + "epoch": 0.6364606553141364, + "grad_norm": 0.765625, + "learning_rate": 0.0001392595727580774, + "loss": 0.8366, + "step": 24787 + }, + { + "epoch": 0.6364863325100583, + "grad_norm": 0.71484375, + "learning_rate": 0.00013925546694290198, + "loss": 0.7696, + "step": 24788 + }, + { + "epoch": 0.6365120097059801, + "grad_norm": 0.75, + "learning_rate": 0.000139251361049494, + "loss": 0.8225, + "step": 24789 + }, + { + "epoch": 0.6365376869019018, + "grad_norm": 0.8828125, + "learning_rate": 0.0001392472550778616, + "loss": 0.8067, + "step": 24790 + }, + { + "epoch": 0.6365633640978237, + "grad_norm": 0.81640625, + "learning_rate": 0.0001392431490280129, + "loss": 1.0387, + "step": 24791 + }, + { + "epoch": 0.6365890412937455, + "grad_norm": 0.84375, + "learning_rate": 0.00013923904289995622, + "loss": 0.8587, + "step": 24792 + }, + { + "epoch": 0.6366147184896673, + "grad_norm": 0.71484375, + "learning_rate": 0.00013923493669369964, + "loss": 0.7751, + "step": 24793 + }, + { + "epoch": 0.6366403956855892, + "grad_norm": 0.7578125, + "learning_rate": 0.00013923083040925138, + "loss": 0.7656, + "step": 24794 + }, + { + "epoch": 0.636666072881511, + "grad_norm": 0.7265625, + "learning_rate": 0.00013922672404661961, + "loss": 0.6701, + "step": 24795 + }, + { + "epoch": 0.6366917500774328, + "grad_norm": 0.734375, + "learning_rate": 0.00013922261760581254, + "loss": 0.7746, + "step": 24796 + }, + { + "epoch": 0.6367174272733546, + "grad_norm": 0.7734375, + "learning_rate": 0.00013921851108683833, + "loss": 0.8174, + "step": 24797 + }, + { + "epoch": 0.6367431044692764, + "grad_norm": 0.77734375, + "learning_rate": 0.00013921440448970518, + "loss": 0.9979, + "step": 24798 + }, + { + "epoch": 0.6367687816651982, + "grad_norm": 0.6953125, + "learning_rate": 0.00013921029781442123, + "loss": 0.7318, + "step": 24799 + }, + { + "epoch": 0.6367944588611201, + "grad_norm": 0.81640625, + "learning_rate": 0.00013920619106099472, + "loss": 0.9635, + "step": 24800 + }, + { + "epoch": 0.6368201360570419, + "grad_norm": 0.81640625, + "learning_rate": 0.00013920208422943384, + "loss": 0.8934, + "step": 24801 + }, + { + "epoch": 0.6368458132529637, + "grad_norm": 0.72265625, + "learning_rate": 0.0001391979773197467, + "loss": 0.7813, + "step": 24802 + }, + { + "epoch": 0.6368714904488856, + "grad_norm": 0.78515625, + "learning_rate": 0.00013919387033194157, + "loss": 0.8566, + "step": 24803 + }, + { + "epoch": 0.6368971676448073, + "grad_norm": 0.78515625, + "learning_rate": 0.00013918976326602657, + "loss": 1.0198, + "step": 24804 + }, + { + "epoch": 0.6369228448407291, + "grad_norm": 0.734375, + "learning_rate": 0.00013918565612200992, + "loss": 0.7631, + "step": 24805 + }, + { + "epoch": 0.636948522036651, + "grad_norm": 0.80078125, + "learning_rate": 0.0001391815488998998, + "loss": 0.8625, + "step": 24806 + }, + { + "epoch": 0.6369741992325728, + "grad_norm": 0.82421875, + "learning_rate": 0.00013917744159970438, + "loss": 0.9533, + "step": 24807 + }, + { + "epoch": 0.6369998764284946, + "grad_norm": 0.7578125, + "learning_rate": 0.00013917333422143187, + "loss": 0.8649, + "step": 24808 + }, + { + "epoch": 0.6370255536244165, + "grad_norm": 0.75, + "learning_rate": 0.00013916922676509045, + "loss": 0.7305, + "step": 24809 + }, + { + "epoch": 0.6370512308203382, + "grad_norm": 0.69921875, + "learning_rate": 0.0001391651192306883, + "loss": 0.6638, + "step": 24810 + }, + { + "epoch": 0.63707690801626, + "grad_norm": 0.75390625, + "learning_rate": 0.0001391610116182336, + "loss": 0.8968, + "step": 24811 + }, + { + "epoch": 0.6371025852121819, + "grad_norm": 0.7578125, + "learning_rate": 0.00013915690392773454, + "loss": 0.8447, + "step": 24812 + }, + { + "epoch": 0.6371282624081037, + "grad_norm": 0.796875, + "learning_rate": 0.00013915279615919928, + "loss": 0.8441, + "step": 24813 + }, + { + "epoch": 0.6371539396040256, + "grad_norm": 0.75390625, + "learning_rate": 0.00013914868831263604, + "loss": 0.8694, + "step": 24814 + }, + { + "epoch": 0.6371796167999474, + "grad_norm": 0.87890625, + "learning_rate": 0.00013914458038805306, + "loss": 0.8386, + "step": 24815 + }, + { + "epoch": 0.6372052939958692, + "grad_norm": 0.8671875, + "learning_rate": 0.0001391404723854584, + "loss": 0.9995, + "step": 24816 + }, + { + "epoch": 0.637230971191791, + "grad_norm": 0.7578125, + "learning_rate": 0.00013913636430486036, + "loss": 0.811, + "step": 24817 + }, + { + "epoch": 0.6372566483877128, + "grad_norm": 0.765625, + "learning_rate": 0.00013913225614626705, + "loss": 0.8836, + "step": 24818 + }, + { + "epoch": 0.6372823255836346, + "grad_norm": 0.76953125, + "learning_rate": 0.00013912814790968672, + "loss": 0.8666, + "step": 24819 + }, + { + "epoch": 0.6373080027795565, + "grad_norm": 0.7109375, + "learning_rate": 0.0001391240395951275, + "loss": 0.8169, + "step": 24820 + }, + { + "epoch": 0.6373336799754783, + "grad_norm": 0.8671875, + "learning_rate": 0.00013911993120259758, + "loss": 0.7867, + "step": 24821 + }, + { + "epoch": 0.6373593571714001, + "grad_norm": 0.8203125, + "learning_rate": 0.00013911582273210525, + "loss": 0.8394, + "step": 24822 + }, + { + "epoch": 0.637385034367322, + "grad_norm": 0.7421875, + "learning_rate": 0.00013911171418365853, + "loss": 0.7505, + "step": 24823 + }, + { + "epoch": 0.6374107115632437, + "grad_norm": 0.7578125, + "learning_rate": 0.00013910760555726573, + "loss": 0.8039, + "step": 24824 + }, + { + "epoch": 0.6374363887591655, + "grad_norm": 0.77734375, + "learning_rate": 0.00013910349685293506, + "loss": 0.7345, + "step": 24825 + }, + { + "epoch": 0.6374620659550874, + "grad_norm": 0.73046875, + "learning_rate": 0.00013909938807067456, + "loss": 0.8351, + "step": 24826 + }, + { + "epoch": 0.6374877431510092, + "grad_norm": 0.76171875, + "learning_rate": 0.00013909527921049256, + "loss": 0.822, + "step": 24827 + }, + { + "epoch": 0.637513420346931, + "grad_norm": 0.78515625, + "learning_rate": 0.0001390911702723972, + "loss": 0.9052, + "step": 24828 + }, + { + "epoch": 0.6375390975428529, + "grad_norm": 0.8203125, + "learning_rate": 0.00013908706125639666, + "loss": 0.8949, + "step": 24829 + }, + { + "epoch": 0.6375647747387746, + "grad_norm": 0.77734375, + "learning_rate": 0.00013908295216249914, + "loss": 0.7083, + "step": 24830 + }, + { + "epoch": 0.6375904519346964, + "grad_norm": 0.7578125, + "learning_rate": 0.0001390788429907128, + "loss": 0.7835, + "step": 24831 + }, + { + "epoch": 0.6376161291306183, + "grad_norm": 0.80078125, + "learning_rate": 0.00013907473374104589, + "loss": 0.9204, + "step": 24832 + }, + { + "epoch": 0.6376418063265401, + "grad_norm": 0.78125, + "learning_rate": 0.00013907062441350659, + "loss": 0.7764, + "step": 24833 + }, + { + "epoch": 0.6376674835224619, + "grad_norm": 0.77734375, + "learning_rate": 0.000139066515008103, + "loss": 0.7908, + "step": 24834 + }, + { + "epoch": 0.6376931607183838, + "grad_norm": 0.76171875, + "learning_rate": 0.00013906240552484343, + "loss": 0.8834, + "step": 24835 + }, + { + "epoch": 0.6377188379143056, + "grad_norm": 0.76953125, + "learning_rate": 0.00013905829596373598, + "loss": 0.6344, + "step": 24836 + }, + { + "epoch": 0.6377445151102273, + "grad_norm": 0.7890625, + "learning_rate": 0.0001390541863247889, + "loss": 0.7641, + "step": 24837 + }, + { + "epoch": 0.6377701923061492, + "grad_norm": 0.80859375, + "learning_rate": 0.00013905007660801032, + "loss": 0.8934, + "step": 24838 + }, + { + "epoch": 0.637795869502071, + "grad_norm": 0.91015625, + "learning_rate": 0.0001390459668134085, + "loss": 1.095, + "step": 24839 + }, + { + "epoch": 0.6378215466979928, + "grad_norm": 0.7734375, + "learning_rate": 0.00013904185694099157, + "loss": 0.9742, + "step": 24840 + }, + { + "epoch": 0.6378472238939147, + "grad_norm": 0.76953125, + "learning_rate": 0.00013903774699076778, + "loss": 0.9655, + "step": 24841 + }, + { + "epoch": 0.6378729010898365, + "grad_norm": 0.74609375, + "learning_rate": 0.00013903363696274524, + "loss": 0.841, + "step": 24842 + }, + { + "epoch": 0.6378985782857584, + "grad_norm": 0.78125, + "learning_rate": 0.00013902952685693223, + "loss": 0.8266, + "step": 24843 + }, + { + "epoch": 0.6379242554816801, + "grad_norm": 0.828125, + "learning_rate": 0.00013902541667333687, + "loss": 0.867, + "step": 24844 + }, + { + "epoch": 0.6379499326776019, + "grad_norm": 0.78125, + "learning_rate": 0.00013902130641196738, + "loss": 0.8601, + "step": 24845 + }, + { + "epoch": 0.6379756098735238, + "grad_norm": 0.78515625, + "learning_rate": 0.00013901719607283198, + "loss": 0.9478, + "step": 24846 + }, + { + "epoch": 0.6380012870694456, + "grad_norm": 0.85546875, + "learning_rate": 0.00013901308565593878, + "loss": 0.925, + "step": 24847 + }, + { + "epoch": 0.6380269642653674, + "grad_norm": 0.75390625, + "learning_rate": 0.00013900897516129607, + "loss": 0.8921, + "step": 24848 + }, + { + "epoch": 0.6380526414612893, + "grad_norm": 0.73046875, + "learning_rate": 0.000139004864588912, + "loss": 0.7944, + "step": 24849 + }, + { + "epoch": 0.638078318657211, + "grad_norm": 0.72265625, + "learning_rate": 0.00013900075393879471, + "loss": 0.8242, + "step": 24850 + }, + { + "epoch": 0.6381039958531328, + "grad_norm": 0.73828125, + "learning_rate": 0.0001389966432109525, + "loss": 0.9163, + "step": 24851 + }, + { + "epoch": 0.6381296730490547, + "grad_norm": 0.80859375, + "learning_rate": 0.00013899253240539348, + "loss": 0.8903, + "step": 24852 + }, + { + "epoch": 0.6381553502449765, + "grad_norm": 0.73828125, + "learning_rate": 0.00013898842152212584, + "loss": 0.7839, + "step": 24853 + }, + { + "epoch": 0.6381810274408983, + "grad_norm": 0.79296875, + "learning_rate": 0.00013898431056115783, + "loss": 0.6807, + "step": 24854 + }, + { + "epoch": 0.6382067046368202, + "grad_norm": 0.75, + "learning_rate": 0.00013898019952249756, + "loss": 0.8621, + "step": 24855 + }, + { + "epoch": 0.638232381832742, + "grad_norm": 0.76953125, + "learning_rate": 0.00013897608840615332, + "loss": 0.8717, + "step": 24856 + }, + { + "epoch": 0.6382580590286637, + "grad_norm": 0.79296875, + "learning_rate": 0.00013897197721213326, + "loss": 0.9622, + "step": 24857 + }, + { + "epoch": 0.6382837362245856, + "grad_norm": 0.79296875, + "learning_rate": 0.00013896786594044556, + "loss": 0.8985, + "step": 24858 + }, + { + "epoch": 0.6383094134205074, + "grad_norm": 0.8125, + "learning_rate": 0.0001389637545910984, + "loss": 0.9275, + "step": 24859 + }, + { + "epoch": 0.6383350906164292, + "grad_norm": 0.78515625, + "learning_rate": 0.00013895964316410004, + "loss": 0.8407, + "step": 24860 + }, + { + "epoch": 0.6383607678123511, + "grad_norm": 0.84375, + "learning_rate": 0.00013895553165945858, + "loss": 0.8768, + "step": 24861 + }, + { + "epoch": 0.6383864450082729, + "grad_norm": 0.765625, + "learning_rate": 0.00013895142007718233, + "loss": 0.7267, + "step": 24862 + }, + { + "epoch": 0.6384121222041947, + "grad_norm": 0.81640625, + "learning_rate": 0.00013894730841727934, + "loss": 0.9651, + "step": 24863 + }, + { + "epoch": 0.6384377994001165, + "grad_norm": 0.73828125, + "learning_rate": 0.00013894319667975794, + "loss": 0.7543, + "step": 24864 + }, + { + "epoch": 0.6384634765960383, + "grad_norm": 0.8046875, + "learning_rate": 0.00013893908486462624, + "loss": 0.839, + "step": 24865 + }, + { + "epoch": 0.6384891537919601, + "grad_norm": 0.78125, + "learning_rate": 0.00013893497297189246, + "loss": 0.8401, + "step": 24866 + }, + { + "epoch": 0.638514830987882, + "grad_norm": 0.7109375, + "learning_rate": 0.0001389308610015648, + "loss": 0.7894, + "step": 24867 + }, + { + "epoch": 0.6385405081838038, + "grad_norm": 0.70703125, + "learning_rate": 0.00013892674895365148, + "loss": 0.8679, + "step": 24868 + }, + { + "epoch": 0.6385661853797256, + "grad_norm": 0.80859375, + "learning_rate": 0.00013892263682816064, + "loss": 0.8656, + "step": 24869 + }, + { + "epoch": 0.6385918625756474, + "grad_norm": 0.78515625, + "learning_rate": 0.0001389185246251005, + "loss": 0.9246, + "step": 24870 + }, + { + "epoch": 0.6386175397715692, + "grad_norm": 0.83203125, + "learning_rate": 0.00013891441234447925, + "loss": 0.9122, + "step": 24871 + }, + { + "epoch": 0.638643216967491, + "grad_norm": 0.7890625, + "learning_rate": 0.0001389102999863051, + "loss": 0.79, + "step": 24872 + }, + { + "epoch": 0.6386688941634129, + "grad_norm": 0.87109375, + "learning_rate": 0.00013890618755058626, + "loss": 0.9725, + "step": 24873 + }, + { + "epoch": 0.6386945713593347, + "grad_norm": 0.74609375, + "learning_rate": 0.00013890207503733083, + "loss": 0.8718, + "step": 24874 + }, + { + "epoch": 0.6387202485552566, + "grad_norm": 0.72265625, + "learning_rate": 0.00013889796244654718, + "loss": 0.8329, + "step": 24875 + }, + { + "epoch": 0.6387459257511784, + "grad_norm": 0.75390625, + "learning_rate": 0.0001388938497782433, + "loss": 0.8464, + "step": 24876 + }, + { + "epoch": 0.6387716029471001, + "grad_norm": 0.8359375, + "learning_rate": 0.00013888973703242758, + "loss": 0.947, + "step": 24877 + }, + { + "epoch": 0.638797280143022, + "grad_norm": 0.734375, + "learning_rate": 0.0001388856242091081, + "loss": 0.7921, + "step": 24878 + }, + { + "epoch": 0.6388229573389438, + "grad_norm": 0.828125, + "learning_rate": 0.00013888151130829303, + "loss": 0.8355, + "step": 24879 + }, + { + "epoch": 0.6388486345348656, + "grad_norm": 0.78515625, + "learning_rate": 0.00013887739832999066, + "loss": 0.8264, + "step": 24880 + }, + { + "epoch": 0.6388743117307875, + "grad_norm": 0.75, + "learning_rate": 0.0001388732852742092, + "loss": 0.7042, + "step": 24881 + }, + { + "epoch": 0.6388999889267093, + "grad_norm": 0.765625, + "learning_rate": 0.00013886917214095672, + "loss": 0.7957, + "step": 24882 + }, + { + "epoch": 0.6389256661226311, + "grad_norm": 0.7578125, + "learning_rate": 0.0001388650589302415, + "loss": 0.7711, + "step": 24883 + }, + { + "epoch": 0.6389513433185529, + "grad_norm": 0.82421875, + "learning_rate": 0.00013886094564207174, + "loss": 0.877, + "step": 24884 + }, + { + "epoch": 0.6389770205144747, + "grad_norm": 0.7578125, + "learning_rate": 0.00013885683227645563, + "loss": 0.8188, + "step": 24885 + }, + { + "epoch": 0.6390026977103965, + "grad_norm": 0.78515625, + "learning_rate": 0.00013885271883340138, + "loss": 0.7266, + "step": 24886 + }, + { + "epoch": 0.6390283749063184, + "grad_norm": 0.8828125, + "learning_rate": 0.00013884860531291712, + "loss": 1.044, + "step": 24887 + }, + { + "epoch": 0.6390540521022402, + "grad_norm": 0.73046875, + "learning_rate": 0.00013884449171501118, + "loss": 0.8701, + "step": 24888 + }, + { + "epoch": 0.639079729298162, + "grad_norm": 0.7421875, + "learning_rate": 0.00013884037803969164, + "loss": 0.7798, + "step": 24889 + }, + { + "epoch": 0.6391054064940838, + "grad_norm": 0.953125, + "learning_rate": 0.00013883626428696667, + "loss": 0.9032, + "step": 24890 + }, + { + "epoch": 0.6391310836900056, + "grad_norm": 0.78125, + "learning_rate": 0.0001388321504568446, + "loss": 0.9256, + "step": 24891 + }, + { + "epoch": 0.6391567608859274, + "grad_norm": 0.79296875, + "learning_rate": 0.00013882803654933357, + "loss": 0.9384, + "step": 24892 + }, + { + "epoch": 0.6391824380818493, + "grad_norm": 0.84375, + "learning_rate": 0.00013882392256444172, + "loss": 0.9053, + "step": 24893 + }, + { + "epoch": 0.6392081152777711, + "grad_norm": 0.82421875, + "learning_rate": 0.00013881980850217738, + "loss": 0.7956, + "step": 24894 + }, + { + "epoch": 0.6392337924736929, + "grad_norm": 0.828125, + "learning_rate": 0.00013881569436254858, + "loss": 0.8644, + "step": 24895 + }, + { + "epoch": 0.6392594696696148, + "grad_norm": 0.9296875, + "learning_rate": 0.00013881158014556367, + "loss": 0.8383, + "step": 24896 + }, + { + "epoch": 0.6392851468655365, + "grad_norm": 0.8046875, + "learning_rate": 0.00013880746585123073, + "loss": 0.847, + "step": 24897 + }, + { + "epoch": 0.6393108240614583, + "grad_norm": 0.90234375, + "learning_rate": 0.00013880335147955806, + "loss": 0.9024, + "step": 24898 + }, + { + "epoch": 0.6393365012573802, + "grad_norm": 0.83984375, + "learning_rate": 0.0001387992370305538, + "loss": 1.0401, + "step": 24899 + }, + { + "epoch": 0.639362178453302, + "grad_norm": 0.6953125, + "learning_rate": 0.00013879512250422618, + "loss": 0.7956, + "step": 24900 + }, + { + "epoch": 0.6393878556492238, + "grad_norm": 0.80859375, + "learning_rate": 0.00013879100790058338, + "loss": 0.9358, + "step": 24901 + }, + { + "epoch": 0.6394135328451457, + "grad_norm": 0.88671875, + "learning_rate": 0.00013878689321963357, + "loss": 0.9416, + "step": 24902 + }, + { + "epoch": 0.6394392100410675, + "grad_norm": 0.81640625, + "learning_rate": 0.000138782778461385, + "loss": 0.9516, + "step": 24903 + }, + { + "epoch": 0.6394648872369892, + "grad_norm": 0.7890625, + "learning_rate": 0.0001387786636258459, + "loss": 0.9392, + "step": 24904 + }, + { + "epoch": 0.6394905644329111, + "grad_norm": 0.72265625, + "learning_rate": 0.00013877454871302438, + "loss": 0.7631, + "step": 24905 + }, + { + "epoch": 0.6395162416288329, + "grad_norm": 0.69921875, + "learning_rate": 0.0001387704337229287, + "loss": 0.7991, + "step": 24906 + }, + { + "epoch": 0.6395419188247548, + "grad_norm": 0.73828125, + "learning_rate": 0.00013876631865556704, + "loss": 0.8079, + "step": 24907 + }, + { + "epoch": 0.6395675960206766, + "grad_norm": 0.75390625, + "learning_rate": 0.0001387622035109476, + "loss": 0.6531, + "step": 24908 + }, + { + "epoch": 0.6395932732165984, + "grad_norm": 0.7265625, + "learning_rate": 0.00013875808828907863, + "loss": 0.847, + "step": 24909 + }, + { + "epoch": 0.6396189504125201, + "grad_norm": 0.7421875, + "learning_rate": 0.00013875397298996828, + "loss": 0.793, + "step": 24910 + }, + { + "epoch": 0.639644627608442, + "grad_norm": 0.76953125, + "learning_rate": 0.00013874985761362472, + "loss": 0.7649, + "step": 24911 + }, + { + "epoch": 0.6396703048043638, + "grad_norm": 0.7578125, + "learning_rate": 0.00013874574216005624, + "loss": 0.8272, + "step": 24912 + }, + { + "epoch": 0.6396959820002857, + "grad_norm": 0.7890625, + "learning_rate": 0.000138741626629271, + "loss": 0.8842, + "step": 24913 + }, + { + "epoch": 0.6397216591962075, + "grad_norm": 0.8359375, + "learning_rate": 0.00013873751102127718, + "loss": 0.8881, + "step": 24914 + }, + { + "epoch": 0.6397473363921293, + "grad_norm": 0.75390625, + "learning_rate": 0.000138733395336083, + "loss": 0.9095, + "step": 24915 + }, + { + "epoch": 0.6397730135880512, + "grad_norm": 0.71875, + "learning_rate": 0.00013872927957369666, + "loss": 0.8272, + "step": 24916 + }, + { + "epoch": 0.6397986907839729, + "grad_norm": 0.7890625, + "learning_rate": 0.00013872516373412635, + "loss": 0.8398, + "step": 24917 + }, + { + "epoch": 0.6398243679798947, + "grad_norm": 0.78515625, + "learning_rate": 0.0001387210478173803, + "loss": 0.9307, + "step": 24918 + }, + { + "epoch": 0.6398500451758166, + "grad_norm": 0.76171875, + "learning_rate": 0.0001387169318234667, + "loss": 0.7743, + "step": 24919 + }, + { + "epoch": 0.6398757223717384, + "grad_norm": 0.765625, + "learning_rate": 0.00013871281575239376, + "loss": 0.8108, + "step": 24920 + }, + { + "epoch": 0.6399013995676602, + "grad_norm": 0.765625, + "learning_rate": 0.00013870869960416968, + "loss": 0.8807, + "step": 24921 + }, + { + "epoch": 0.6399270767635821, + "grad_norm": 0.79296875, + "learning_rate": 0.00013870458337880264, + "loss": 0.8448, + "step": 24922 + }, + { + "epoch": 0.6399527539595039, + "grad_norm": 0.765625, + "learning_rate": 0.00013870046707630088, + "loss": 0.8136, + "step": 24923 + }, + { + "epoch": 0.6399784311554256, + "grad_norm": 0.765625, + "learning_rate": 0.00013869635069667255, + "loss": 0.7996, + "step": 24924 + }, + { + "epoch": 0.6400041083513475, + "grad_norm": 0.7890625, + "learning_rate": 0.00013869223423992592, + "loss": 0.8936, + "step": 24925 + }, + { + "epoch": 0.6400297855472693, + "grad_norm": 0.7890625, + "learning_rate": 0.00013868811770606916, + "loss": 0.803, + "step": 24926 + }, + { + "epoch": 0.6400554627431911, + "grad_norm": 0.75390625, + "learning_rate": 0.0001386840010951105, + "loss": 0.922, + "step": 24927 + }, + { + "epoch": 0.640081139939113, + "grad_norm": 0.8203125, + "learning_rate": 0.00013867988440705806, + "loss": 0.8159, + "step": 24928 + }, + { + "epoch": 0.6401068171350348, + "grad_norm": 0.7890625, + "learning_rate": 0.00013867576764192013, + "loss": 0.9242, + "step": 24929 + }, + { + "epoch": 0.6401324943309565, + "grad_norm": 0.8125, + "learning_rate": 0.00013867165079970489, + "loss": 0.8704, + "step": 24930 + }, + { + "epoch": 0.6401581715268784, + "grad_norm": 0.79296875, + "learning_rate": 0.00013866753388042056, + "loss": 0.8169, + "step": 24931 + }, + { + "epoch": 0.6401838487228002, + "grad_norm": 0.78125, + "learning_rate": 0.0001386634168840753, + "loss": 0.7643, + "step": 24932 + }, + { + "epoch": 0.640209525918722, + "grad_norm": 0.7578125, + "learning_rate": 0.00013865929981067736, + "loss": 0.8341, + "step": 24933 + }, + { + "epoch": 0.6402352031146439, + "grad_norm": 0.75390625, + "learning_rate": 0.00013865518266023493, + "loss": 0.8687, + "step": 24934 + }, + { + "epoch": 0.6402608803105657, + "grad_norm": 0.80859375, + "learning_rate": 0.0001386510654327562, + "loss": 0.8346, + "step": 24935 + }, + { + "epoch": 0.6402865575064876, + "grad_norm": 0.80859375, + "learning_rate": 0.0001386469481282494, + "loss": 0.8798, + "step": 24936 + }, + { + "epoch": 0.6403122347024093, + "grad_norm": 0.734375, + "learning_rate": 0.00013864283074672273, + "loss": 0.8621, + "step": 24937 + }, + { + "epoch": 0.6403379118983311, + "grad_norm": 0.734375, + "learning_rate": 0.00013863871328818433, + "loss": 0.8111, + "step": 24938 + }, + { + "epoch": 0.640363589094253, + "grad_norm": 0.75390625, + "learning_rate": 0.00013863459575264253, + "loss": 0.8218, + "step": 24939 + }, + { + "epoch": 0.6403892662901748, + "grad_norm": 0.7578125, + "learning_rate": 0.00013863047814010546, + "loss": 0.9322, + "step": 24940 + }, + { + "epoch": 0.6404149434860966, + "grad_norm": 1.25, + "learning_rate": 0.00013862636045058134, + "loss": 0.7907, + "step": 24941 + }, + { + "epoch": 0.6404406206820185, + "grad_norm": 0.78125, + "learning_rate": 0.00013862224268407837, + "loss": 0.8085, + "step": 24942 + }, + { + "epoch": 0.6404662978779403, + "grad_norm": 0.8671875, + "learning_rate": 0.00013861812484060473, + "loss": 0.8951, + "step": 24943 + }, + { + "epoch": 0.640491975073862, + "grad_norm": 0.75, + "learning_rate": 0.0001386140069201687, + "loss": 0.7893, + "step": 24944 + }, + { + "epoch": 0.6405176522697839, + "grad_norm": 0.7734375, + "learning_rate": 0.0001386098889227784, + "loss": 0.7397, + "step": 24945 + }, + { + "epoch": 0.6405433294657057, + "grad_norm": 0.8359375, + "learning_rate": 0.00013860577084844208, + "loss": 0.9002, + "step": 24946 + }, + { + "epoch": 0.6405690066616275, + "grad_norm": 0.8046875, + "learning_rate": 0.00013860165269716798, + "loss": 0.9332, + "step": 24947 + }, + { + "epoch": 0.6405946838575494, + "grad_norm": 0.88671875, + "learning_rate": 0.00013859753446896425, + "loss": 0.8046, + "step": 24948 + }, + { + "epoch": 0.6406203610534712, + "grad_norm": 0.734375, + "learning_rate": 0.00013859341616383913, + "loss": 0.8029, + "step": 24949 + }, + { + "epoch": 0.6406460382493929, + "grad_norm": 0.7109375, + "learning_rate": 0.00013858929778180082, + "loss": 0.8296, + "step": 24950 + }, + { + "epoch": 0.6406717154453148, + "grad_norm": 0.82421875, + "learning_rate": 0.0001385851793228575, + "loss": 0.8663, + "step": 24951 + }, + { + "epoch": 0.6406973926412366, + "grad_norm": 0.94921875, + "learning_rate": 0.00013858106078701743, + "loss": 0.7386, + "step": 24952 + }, + { + "epoch": 0.6407230698371584, + "grad_norm": 0.72265625, + "learning_rate": 0.0001385769421742888, + "loss": 0.8298, + "step": 24953 + }, + { + "epoch": 0.6407487470330803, + "grad_norm": 0.87890625, + "learning_rate": 0.00013857282348467978, + "loss": 0.8703, + "step": 24954 + }, + { + "epoch": 0.6407744242290021, + "grad_norm": 0.7109375, + "learning_rate": 0.00013856870471819864, + "loss": 0.8758, + "step": 24955 + }, + { + "epoch": 0.6408001014249239, + "grad_norm": 0.84375, + "learning_rate": 0.0001385645858748535, + "loss": 0.9068, + "step": 24956 + }, + { + "epoch": 0.6408257786208457, + "grad_norm": 0.82421875, + "learning_rate": 0.00013856046695465265, + "loss": 0.8158, + "step": 24957 + }, + { + "epoch": 0.6408514558167675, + "grad_norm": 0.73828125, + "learning_rate": 0.0001385563479576043, + "loss": 0.8702, + "step": 24958 + }, + { + "epoch": 0.6408771330126893, + "grad_norm": 0.8515625, + "learning_rate": 0.0001385522288837166, + "loss": 0.8047, + "step": 24959 + }, + { + "epoch": 0.6409028102086112, + "grad_norm": 0.9375, + "learning_rate": 0.00013854810973299784, + "loss": 0.9074, + "step": 24960 + }, + { + "epoch": 0.640928487404533, + "grad_norm": 0.859375, + "learning_rate": 0.00013854399050545612, + "loss": 0.8421, + "step": 24961 + }, + { + "epoch": 0.6409541646004548, + "grad_norm": 0.80078125, + "learning_rate": 0.00013853987120109973, + "loss": 0.8721, + "step": 24962 + }, + { + "epoch": 0.6409798417963767, + "grad_norm": 0.796875, + "learning_rate": 0.0001385357518199369, + "loss": 0.8766, + "step": 24963 + }, + { + "epoch": 0.6410055189922984, + "grad_norm": 0.75, + "learning_rate": 0.0001385316323619757, + "loss": 0.9027, + "step": 24964 + }, + { + "epoch": 0.6410311961882202, + "grad_norm": 0.87890625, + "learning_rate": 0.0001385275128272245, + "loss": 0.8869, + "step": 24965 + }, + { + "epoch": 0.6410568733841421, + "grad_norm": 0.84765625, + "learning_rate": 0.00013852339321569144, + "loss": 0.8005, + "step": 24966 + }, + { + "epoch": 0.6410825505800639, + "grad_norm": 0.75390625, + "learning_rate": 0.00013851927352738475, + "loss": 0.8691, + "step": 24967 + }, + { + "epoch": 0.6411082277759858, + "grad_norm": 0.7109375, + "learning_rate": 0.0001385151537623126, + "loss": 0.8325, + "step": 24968 + }, + { + "epoch": 0.6411339049719076, + "grad_norm": 0.71875, + "learning_rate": 0.00013851103392048325, + "loss": 0.693, + "step": 24969 + }, + { + "epoch": 0.6411595821678293, + "grad_norm": 0.78125, + "learning_rate": 0.0001385069140019049, + "loss": 0.8065, + "step": 24970 + }, + { + "epoch": 0.6411852593637511, + "grad_norm": 0.7109375, + "learning_rate": 0.00013850279400658572, + "loss": 0.8881, + "step": 24971 + }, + { + "epoch": 0.641210936559673, + "grad_norm": 0.76953125, + "learning_rate": 0.00013849867393453394, + "loss": 0.8886, + "step": 24972 + }, + { + "epoch": 0.6412366137555948, + "grad_norm": 0.82421875, + "learning_rate": 0.00013849455378575783, + "loss": 0.8603, + "step": 24973 + }, + { + "epoch": 0.6412622909515167, + "grad_norm": 0.796875, + "learning_rate": 0.00013849043356026553, + "loss": 0.9791, + "step": 24974 + }, + { + "epoch": 0.6412879681474385, + "grad_norm": 0.79296875, + "learning_rate": 0.00013848631325806521, + "loss": 0.7771, + "step": 24975 + }, + { + "epoch": 0.6413136453433603, + "grad_norm": 0.8203125, + "learning_rate": 0.00013848219287916522, + "loss": 1.0107, + "step": 24976 + }, + { + "epoch": 0.641339322539282, + "grad_norm": 0.7421875, + "learning_rate": 0.0001384780724235737, + "loss": 0.9431, + "step": 24977 + }, + { + "epoch": 0.6413649997352039, + "grad_norm": 0.71484375, + "learning_rate": 0.0001384739518912988, + "loss": 0.782, + "step": 24978 + }, + { + "epoch": 0.6413906769311257, + "grad_norm": 0.828125, + "learning_rate": 0.00013846983128234882, + "loss": 0.8653, + "step": 24979 + }, + { + "epoch": 0.6414163541270476, + "grad_norm": 0.80078125, + "learning_rate": 0.00013846571059673195, + "loss": 0.8462, + "step": 24980 + }, + { + "epoch": 0.6414420313229694, + "grad_norm": 0.765625, + "learning_rate": 0.00013846158983445638, + "loss": 0.9144, + "step": 24981 + }, + { + "epoch": 0.6414677085188912, + "grad_norm": 0.78125, + "learning_rate": 0.00013845746899553032, + "loss": 0.8392, + "step": 24982 + }, + { + "epoch": 0.6414933857148131, + "grad_norm": 0.7421875, + "learning_rate": 0.00013845334807996204, + "loss": 0.9596, + "step": 24983 + }, + { + "epoch": 0.6415190629107348, + "grad_norm": 0.77734375, + "learning_rate": 0.0001384492270877597, + "loss": 0.8884, + "step": 24984 + }, + { + "epoch": 0.6415447401066566, + "grad_norm": 0.7890625, + "learning_rate": 0.00013844510601893152, + "loss": 0.9264, + "step": 24985 + }, + { + "epoch": 0.6415704173025785, + "grad_norm": 0.74609375, + "learning_rate": 0.0001384409848734857, + "loss": 0.9151, + "step": 24986 + }, + { + "epoch": 0.6415960944985003, + "grad_norm": 0.75390625, + "learning_rate": 0.00013843686365143048, + "loss": 0.9301, + "step": 24987 + }, + { + "epoch": 0.6416217716944221, + "grad_norm": 0.8203125, + "learning_rate": 0.00013843274235277403, + "loss": 0.7932, + "step": 24988 + }, + { + "epoch": 0.641647448890344, + "grad_norm": 0.765625, + "learning_rate": 0.00013842862097752465, + "loss": 0.9124, + "step": 24989 + }, + { + "epoch": 0.6416731260862657, + "grad_norm": 0.75390625, + "learning_rate": 0.00013842449952569048, + "loss": 0.7777, + "step": 24990 + }, + { + "epoch": 0.6416988032821875, + "grad_norm": 0.73828125, + "learning_rate": 0.00013842037799727974, + "loss": 0.8239, + "step": 24991 + }, + { + "epoch": 0.6417244804781094, + "grad_norm": 0.796875, + "learning_rate": 0.00013841625639230066, + "loss": 0.9103, + "step": 24992 + }, + { + "epoch": 0.6417501576740312, + "grad_norm": 0.7734375, + "learning_rate": 0.00013841213471076146, + "loss": 0.8549, + "step": 24993 + }, + { + "epoch": 0.641775834869953, + "grad_norm": 0.78125, + "learning_rate": 0.00013840801295267037, + "loss": 0.8183, + "step": 24994 + }, + { + "epoch": 0.6418015120658749, + "grad_norm": 1.0546875, + "learning_rate": 0.00013840389111803553, + "loss": 0.7827, + "step": 24995 + }, + { + "epoch": 0.6418271892617967, + "grad_norm": 0.78125, + "learning_rate": 0.0001383997692068652, + "loss": 1.0176, + "step": 24996 + }, + { + "epoch": 0.6418528664577184, + "grad_norm": 0.73828125, + "learning_rate": 0.00013839564721916764, + "loss": 0.8375, + "step": 24997 + }, + { + "epoch": 0.6418785436536403, + "grad_norm": 0.7734375, + "learning_rate": 0.00013839152515495102, + "loss": 0.881, + "step": 24998 + }, + { + "epoch": 0.6419042208495621, + "grad_norm": 0.765625, + "learning_rate": 0.00013838740301422354, + "loss": 0.759, + "step": 24999 + }, + { + "epoch": 0.641929898045484, + "grad_norm": 0.76171875, + "learning_rate": 0.00013838328079699347, + "loss": 0.904, + "step": 25000 + }, + { + "epoch": 0.641929898045484, + "eval_loss": 0.8474920988082886, + "eval_runtime": 384.9961, + "eval_samples_per_second": 25.974, + "eval_steps_per_second": 0.813, + "step": 25000 + }, + { + "epoch": 0.6419555752414058, + "grad_norm": 0.74609375, + "learning_rate": 0.00013837915850326894, + "loss": 0.792, + "step": 25001 + }, + { + "epoch": 0.6419812524373276, + "grad_norm": 0.8671875, + "learning_rate": 0.00013837503613305823, + "loss": 0.8383, + "step": 25002 + }, + { + "epoch": 0.6420069296332493, + "grad_norm": 0.83203125, + "learning_rate": 0.00013837091368636956, + "loss": 0.9426, + "step": 25003 + }, + { + "epoch": 0.6420326068291712, + "grad_norm": 0.73046875, + "learning_rate": 0.00013836679116321107, + "loss": 0.8764, + "step": 25004 + }, + { + "epoch": 0.642058284025093, + "grad_norm": 0.79296875, + "learning_rate": 0.00013836266856359107, + "loss": 0.9005, + "step": 25005 + }, + { + "epoch": 0.6420839612210149, + "grad_norm": 0.94921875, + "learning_rate": 0.00013835854588751777, + "loss": 0.8923, + "step": 25006 + }, + { + "epoch": 0.6421096384169367, + "grad_norm": 0.77734375, + "learning_rate": 0.0001383544231349993, + "loss": 0.7546, + "step": 25007 + }, + { + "epoch": 0.6421353156128585, + "grad_norm": 0.70703125, + "learning_rate": 0.00013835030030604398, + "loss": 0.8194, + "step": 25008 + }, + { + "epoch": 0.6421609928087804, + "grad_norm": 0.80859375, + "learning_rate": 0.00013834617740065988, + "loss": 0.9364, + "step": 25009 + }, + { + "epoch": 0.6421866700047021, + "grad_norm": 0.828125, + "learning_rate": 0.0001383420544188554, + "loss": 0.8578, + "step": 25010 + }, + { + "epoch": 0.6422123472006239, + "grad_norm": 0.8515625, + "learning_rate": 0.00013833793136063866, + "loss": 0.9584, + "step": 25011 + }, + { + "epoch": 0.6422380243965458, + "grad_norm": 0.83203125, + "learning_rate": 0.00013833380822601786, + "loss": 0.7816, + "step": 25012 + }, + { + "epoch": 0.6422637015924676, + "grad_norm": 0.7578125, + "learning_rate": 0.0001383296850150013, + "loss": 0.9183, + "step": 25013 + }, + { + "epoch": 0.6422893787883894, + "grad_norm": 1.0703125, + "learning_rate": 0.00013832556172759704, + "loss": 0.7789, + "step": 25014 + }, + { + "epoch": 0.6423150559843113, + "grad_norm": 0.7734375, + "learning_rate": 0.00013832143836381346, + "loss": 0.8996, + "step": 25015 + }, + { + "epoch": 0.6423407331802331, + "grad_norm": 0.80859375, + "learning_rate": 0.0001383173149236587, + "loss": 0.8965, + "step": 25016 + }, + { + "epoch": 0.6423664103761548, + "grad_norm": 0.7578125, + "learning_rate": 0.00013831319140714102, + "loss": 0.7865, + "step": 25017 + }, + { + "epoch": 0.6423920875720767, + "grad_norm": 0.7734375, + "learning_rate": 0.00013830906781426858, + "loss": 0.8328, + "step": 25018 + }, + { + "epoch": 0.6424177647679985, + "grad_norm": 0.75, + "learning_rate": 0.00013830494414504964, + "loss": 0.7054, + "step": 25019 + }, + { + "epoch": 0.6424434419639203, + "grad_norm": 0.765625, + "learning_rate": 0.0001383008203994924, + "loss": 0.8392, + "step": 25020 + }, + { + "epoch": 0.6424691191598422, + "grad_norm": 0.796875, + "learning_rate": 0.00013829669657760508, + "loss": 0.9371, + "step": 25021 + }, + { + "epoch": 0.642494796355764, + "grad_norm": 0.6953125, + "learning_rate": 0.00013829257267939594, + "loss": 0.7808, + "step": 25022 + }, + { + "epoch": 0.6425204735516857, + "grad_norm": 0.76171875, + "learning_rate": 0.0001382884487048731, + "loss": 0.9282, + "step": 25023 + }, + { + "epoch": 0.6425461507476076, + "grad_norm": 0.79296875, + "learning_rate": 0.00013828432465404488, + "loss": 0.8297, + "step": 25024 + }, + { + "epoch": 0.6425718279435294, + "grad_norm": 0.7734375, + "learning_rate": 0.00013828020052691946, + "loss": 0.8177, + "step": 25025 + }, + { + "epoch": 0.6425975051394512, + "grad_norm": 0.81640625, + "learning_rate": 0.00013827607632350505, + "loss": 0.8816, + "step": 25026 + }, + { + "epoch": 0.6426231823353731, + "grad_norm": 0.734375, + "learning_rate": 0.00013827195204380987, + "loss": 0.8499, + "step": 25027 + }, + { + "epoch": 0.6426488595312949, + "grad_norm": 0.71875, + "learning_rate": 0.00013826782768784214, + "loss": 0.7515, + "step": 25028 + }, + { + "epoch": 0.6426745367272167, + "grad_norm": 0.7890625, + "learning_rate": 0.00013826370325561012, + "loss": 0.7683, + "step": 25029 + }, + { + "epoch": 0.6427002139231385, + "grad_norm": 0.734375, + "learning_rate": 0.00013825957874712196, + "loss": 0.8735, + "step": 25030 + }, + { + "epoch": 0.6427258911190603, + "grad_norm": 0.77734375, + "learning_rate": 0.00013825545416238592, + "loss": 0.8805, + "step": 25031 + }, + { + "epoch": 0.6427515683149821, + "grad_norm": 0.984375, + "learning_rate": 0.00013825132950141028, + "loss": 0.9207, + "step": 25032 + }, + { + "epoch": 0.642777245510904, + "grad_norm": 0.80078125, + "learning_rate": 0.0001382472047642031, + "loss": 0.9315, + "step": 25033 + }, + { + "epoch": 0.6428029227068258, + "grad_norm": 1.1171875, + "learning_rate": 0.00013824307995077275, + "loss": 0.9103, + "step": 25034 + }, + { + "epoch": 0.6428285999027477, + "grad_norm": 0.75390625, + "learning_rate": 0.00013823895506112738, + "loss": 0.8657, + "step": 25035 + }, + { + "epoch": 0.6428542770986695, + "grad_norm": 0.73046875, + "learning_rate": 0.00013823483009527522, + "loss": 0.7459, + "step": 25036 + }, + { + "epoch": 0.6428799542945912, + "grad_norm": 0.75390625, + "learning_rate": 0.0001382307050532245, + "loss": 0.8434, + "step": 25037 + }, + { + "epoch": 0.642905631490513, + "grad_norm": 0.76953125, + "learning_rate": 0.00013822657993498347, + "loss": 0.7914, + "step": 25038 + }, + { + "epoch": 0.6429313086864349, + "grad_norm": 0.73828125, + "learning_rate": 0.0001382224547405603, + "loss": 0.692, + "step": 25039 + }, + { + "epoch": 0.6429569858823567, + "grad_norm": 0.81640625, + "learning_rate": 0.00013821832946996323, + "loss": 0.8697, + "step": 25040 + }, + { + "epoch": 0.6429826630782786, + "grad_norm": 0.7421875, + "learning_rate": 0.00013821420412320044, + "loss": 0.8541, + "step": 25041 + }, + { + "epoch": 0.6430083402742004, + "grad_norm": 0.765625, + "learning_rate": 0.00013821007870028025, + "loss": 0.8687, + "step": 25042 + }, + { + "epoch": 0.6430340174701221, + "grad_norm": 0.78515625, + "learning_rate": 0.0001382059532012108, + "loss": 0.8522, + "step": 25043 + }, + { + "epoch": 0.643059694666044, + "grad_norm": 0.72265625, + "learning_rate": 0.00013820182762600033, + "loss": 0.7915, + "step": 25044 + }, + { + "epoch": 0.6430853718619658, + "grad_norm": 0.78125, + "learning_rate": 0.0001381977019746571, + "loss": 0.8885, + "step": 25045 + }, + { + "epoch": 0.6431110490578876, + "grad_norm": 0.765625, + "learning_rate": 0.00013819357624718928, + "loss": 0.8746, + "step": 25046 + }, + { + "epoch": 0.6431367262538095, + "grad_norm": 0.8125, + "learning_rate": 0.00013818945044360512, + "loss": 0.9158, + "step": 25047 + }, + { + "epoch": 0.6431624034497313, + "grad_norm": 0.9921875, + "learning_rate": 0.00013818532456391283, + "loss": 0.9226, + "step": 25048 + }, + { + "epoch": 0.6431880806456531, + "grad_norm": 0.8359375, + "learning_rate": 0.00013818119860812062, + "loss": 0.7716, + "step": 25049 + }, + { + "epoch": 0.6432137578415749, + "grad_norm": 0.82421875, + "learning_rate": 0.00013817707257623676, + "loss": 0.9057, + "step": 25050 + }, + { + "epoch": 0.6432394350374967, + "grad_norm": 0.796875, + "learning_rate": 0.00013817294646826942, + "loss": 0.7789, + "step": 25051 + }, + { + "epoch": 0.6432651122334185, + "grad_norm": 0.7421875, + "learning_rate": 0.00013816882028422686, + "loss": 0.8207, + "step": 25052 + }, + { + "epoch": 0.6432907894293404, + "grad_norm": 0.76171875, + "learning_rate": 0.00013816469402411728, + "loss": 0.7757, + "step": 25053 + }, + { + "epoch": 0.6433164666252622, + "grad_norm": 0.77734375, + "learning_rate": 0.00013816056768794894, + "loss": 0.7825, + "step": 25054 + }, + { + "epoch": 0.643342143821184, + "grad_norm": 0.8046875, + "learning_rate": 0.00013815644127572997, + "loss": 0.8232, + "step": 25055 + }, + { + "epoch": 0.6433678210171059, + "grad_norm": 0.75, + "learning_rate": 0.00013815231478746872, + "loss": 0.9622, + "step": 25056 + }, + { + "epoch": 0.6433934982130276, + "grad_norm": 0.78515625, + "learning_rate": 0.00013814818822317331, + "loss": 0.9579, + "step": 25057 + }, + { + "epoch": 0.6434191754089494, + "grad_norm": 0.84375, + "learning_rate": 0.00013814406158285203, + "loss": 1.0074, + "step": 25058 + }, + { + "epoch": 0.6434448526048713, + "grad_norm": 0.74609375, + "learning_rate": 0.0001381399348665131, + "loss": 0.8178, + "step": 25059 + }, + { + "epoch": 0.6434705298007931, + "grad_norm": 0.79296875, + "learning_rate": 0.00013813580807416472, + "loss": 0.8631, + "step": 25060 + }, + { + "epoch": 0.643496206996715, + "grad_norm": 0.796875, + "learning_rate": 0.0001381316812058151, + "loss": 0.8654, + "step": 25061 + }, + { + "epoch": 0.6435218841926368, + "grad_norm": 0.734375, + "learning_rate": 0.00013812755426147248, + "loss": 0.824, + "step": 25062 + }, + { + "epoch": 0.6435475613885585, + "grad_norm": 0.77734375, + "learning_rate": 0.00013812342724114508, + "loss": 0.7897, + "step": 25063 + }, + { + "epoch": 0.6435732385844803, + "grad_norm": 0.75390625, + "learning_rate": 0.00013811930014484116, + "loss": 0.9345, + "step": 25064 + }, + { + "epoch": 0.6435989157804022, + "grad_norm": 0.81640625, + "learning_rate": 0.0001381151729725689, + "loss": 0.9837, + "step": 25065 + }, + { + "epoch": 0.643624592976324, + "grad_norm": 0.77734375, + "learning_rate": 0.00013811104572433657, + "loss": 0.8734, + "step": 25066 + }, + { + "epoch": 0.6436502701722459, + "grad_norm": 0.7265625, + "learning_rate": 0.00013810691840015235, + "loss": 0.7954, + "step": 25067 + }, + { + "epoch": 0.6436759473681677, + "grad_norm": 0.73046875, + "learning_rate": 0.00013810279100002446, + "loss": 0.814, + "step": 25068 + }, + { + "epoch": 0.6437016245640895, + "grad_norm": 0.7265625, + "learning_rate": 0.0001380986635239612, + "loss": 0.7705, + "step": 25069 + }, + { + "epoch": 0.6437273017600113, + "grad_norm": 0.7734375, + "learning_rate": 0.0001380945359719707, + "loss": 0.7278, + "step": 25070 + }, + { + "epoch": 0.6437529789559331, + "grad_norm": 0.73046875, + "learning_rate": 0.00013809040834406123, + "loss": 0.7941, + "step": 25071 + }, + { + "epoch": 0.6437786561518549, + "grad_norm": 0.75, + "learning_rate": 0.00013808628064024107, + "loss": 0.8291, + "step": 25072 + }, + { + "epoch": 0.6438043333477768, + "grad_norm": 0.9375, + "learning_rate": 0.00013808215286051836, + "loss": 0.826, + "step": 25073 + }, + { + "epoch": 0.6438300105436986, + "grad_norm": 0.8359375, + "learning_rate": 0.00013807802500490134, + "loss": 0.9419, + "step": 25074 + }, + { + "epoch": 0.6438556877396204, + "grad_norm": 0.7578125, + "learning_rate": 0.0001380738970733983, + "loss": 0.7796, + "step": 25075 + }, + { + "epoch": 0.6438813649355423, + "grad_norm": 0.765625, + "learning_rate": 0.00013806976906601735, + "loss": 0.7788, + "step": 25076 + }, + { + "epoch": 0.643907042131464, + "grad_norm": 0.8203125, + "learning_rate": 0.00013806564098276685, + "loss": 0.8536, + "step": 25077 + }, + { + "epoch": 0.6439327193273858, + "grad_norm": 0.8828125, + "learning_rate": 0.00013806151282365497, + "loss": 0.8229, + "step": 25078 + }, + { + "epoch": 0.6439583965233077, + "grad_norm": 0.76953125, + "learning_rate": 0.0001380573845886899, + "loss": 0.9013, + "step": 25079 + }, + { + "epoch": 0.6439840737192295, + "grad_norm": 0.8671875, + "learning_rate": 0.00013805325627787993, + "loss": 0.9898, + "step": 25080 + }, + { + "epoch": 0.6440097509151513, + "grad_norm": 0.6875, + "learning_rate": 0.00013804912789123323, + "loss": 0.731, + "step": 25081 + }, + { + "epoch": 0.6440354281110732, + "grad_norm": 0.82421875, + "learning_rate": 0.00013804499942875808, + "loss": 0.8439, + "step": 25082 + }, + { + "epoch": 0.6440611053069949, + "grad_norm": 0.734375, + "learning_rate": 0.00013804087089046266, + "loss": 0.8486, + "step": 25083 + }, + { + "epoch": 0.6440867825029167, + "grad_norm": 0.87890625, + "learning_rate": 0.00013803674227635523, + "loss": 0.934, + "step": 25084 + }, + { + "epoch": 0.6441124596988386, + "grad_norm": 0.69921875, + "learning_rate": 0.000138032613586444, + "loss": 0.7008, + "step": 25085 + }, + { + "epoch": 0.6441381368947604, + "grad_norm": 0.8203125, + "learning_rate": 0.00013802848482073721, + "loss": 0.9467, + "step": 25086 + }, + { + "epoch": 0.6441638140906822, + "grad_norm": 0.76171875, + "learning_rate": 0.0001380243559792431, + "loss": 0.7205, + "step": 25087 + }, + { + "epoch": 0.6441894912866041, + "grad_norm": 0.7578125, + "learning_rate": 0.00013802022706196988, + "loss": 0.827, + "step": 25088 + }, + { + "epoch": 0.6442151684825259, + "grad_norm": 0.76953125, + "learning_rate": 0.00013801609806892576, + "loss": 0.8682, + "step": 25089 + }, + { + "epoch": 0.6442408456784476, + "grad_norm": 0.73828125, + "learning_rate": 0.000138011969000119, + "loss": 0.8373, + "step": 25090 + }, + { + "epoch": 0.6442665228743695, + "grad_norm": 0.796875, + "learning_rate": 0.00013800783985555784, + "loss": 0.8356, + "step": 25091 + }, + { + "epoch": 0.6442922000702913, + "grad_norm": 0.7734375, + "learning_rate": 0.00013800371063525047, + "loss": 0.9684, + "step": 25092 + }, + { + "epoch": 0.6443178772662131, + "grad_norm": 0.87109375, + "learning_rate": 0.00013799958133920516, + "loss": 0.8851, + "step": 25093 + }, + { + "epoch": 0.644343554462135, + "grad_norm": 0.73046875, + "learning_rate": 0.00013799545196743006, + "loss": 0.6948, + "step": 25094 + }, + { + "epoch": 0.6443692316580568, + "grad_norm": 0.8359375, + "learning_rate": 0.00013799132251993352, + "loss": 0.8573, + "step": 25095 + }, + { + "epoch": 0.6443949088539787, + "grad_norm": 0.91015625, + "learning_rate": 0.0001379871929967237, + "loss": 0.7979, + "step": 25096 + }, + { + "epoch": 0.6444205860499004, + "grad_norm": 0.76953125, + "learning_rate": 0.00013798306339780878, + "loss": 0.9127, + "step": 25097 + }, + { + "epoch": 0.6444462632458222, + "grad_norm": 0.8125, + "learning_rate": 0.0001379789337231971, + "loss": 0.7129, + "step": 25098 + }, + { + "epoch": 0.644471940441744, + "grad_norm": 0.890625, + "learning_rate": 0.00013797480397289685, + "loss": 0.9269, + "step": 25099 + }, + { + "epoch": 0.6444976176376659, + "grad_norm": 0.81640625, + "learning_rate": 0.00013797067414691617, + "loss": 0.9873, + "step": 25100 + }, + { + "epoch": 0.6445232948335877, + "grad_norm": 0.75390625, + "learning_rate": 0.00013796654424526344, + "loss": 0.7991, + "step": 25101 + }, + { + "epoch": 0.6445489720295096, + "grad_norm": 0.7890625, + "learning_rate": 0.0001379624142679468, + "loss": 0.9966, + "step": 25102 + }, + { + "epoch": 0.6445746492254313, + "grad_norm": 0.74609375, + "learning_rate": 0.00013795828421497443, + "loss": 0.7595, + "step": 25103 + }, + { + "epoch": 0.6446003264213531, + "grad_norm": 0.76953125, + "learning_rate": 0.0001379541540863547, + "loss": 0.8369, + "step": 25104 + }, + { + "epoch": 0.644626003617275, + "grad_norm": 0.7265625, + "learning_rate": 0.00013795002388209576, + "loss": 0.7474, + "step": 25105 + }, + { + "epoch": 0.6446516808131968, + "grad_norm": 0.8359375, + "learning_rate": 0.00013794589360220584, + "loss": 0.7917, + "step": 25106 + }, + { + "epoch": 0.6446773580091186, + "grad_norm": 0.73828125, + "learning_rate": 0.00013794176324669318, + "loss": 0.7819, + "step": 25107 + }, + { + "epoch": 0.6447030352050405, + "grad_norm": 0.828125, + "learning_rate": 0.000137937632815566, + "loss": 0.9319, + "step": 25108 + }, + { + "epoch": 0.6447287124009623, + "grad_norm": 0.91015625, + "learning_rate": 0.00013793350230883257, + "loss": 0.8049, + "step": 25109 + }, + { + "epoch": 0.644754389596884, + "grad_norm": 0.8046875, + "learning_rate": 0.0001379293717265011, + "loss": 0.8189, + "step": 25110 + }, + { + "epoch": 0.6447800667928059, + "grad_norm": 0.8125, + "learning_rate": 0.0001379252410685798, + "loss": 0.917, + "step": 25111 + }, + { + "epoch": 0.6448057439887277, + "grad_norm": 0.80859375, + "learning_rate": 0.00013792111033507695, + "loss": 0.8252, + "step": 25112 + }, + { + "epoch": 0.6448314211846495, + "grad_norm": 0.94921875, + "learning_rate": 0.0001379169795260007, + "loss": 0.783, + "step": 25113 + }, + { + "epoch": 0.6448570983805714, + "grad_norm": 0.7734375, + "learning_rate": 0.00013791284864135938, + "loss": 0.743, + "step": 25114 + }, + { + "epoch": 0.6448827755764932, + "grad_norm": 0.69140625, + "learning_rate": 0.00013790871768116117, + "loss": 0.8098, + "step": 25115 + }, + { + "epoch": 0.644908452772415, + "grad_norm": 0.76171875, + "learning_rate": 0.00013790458664541427, + "loss": 0.8458, + "step": 25116 + }, + { + "epoch": 0.6449341299683368, + "grad_norm": 0.98046875, + "learning_rate": 0.000137900455534127, + "loss": 0.8252, + "step": 25117 + }, + { + "epoch": 0.6449598071642586, + "grad_norm": 0.83984375, + "learning_rate": 0.00013789632434730754, + "loss": 0.902, + "step": 25118 + }, + { + "epoch": 0.6449854843601804, + "grad_norm": 0.8515625, + "learning_rate": 0.00013789219308496411, + "loss": 0.7989, + "step": 25119 + }, + { + "epoch": 0.6450111615561023, + "grad_norm": 0.82421875, + "learning_rate": 0.00013788806174710498, + "loss": 0.8998, + "step": 25120 + }, + { + "epoch": 0.6450368387520241, + "grad_norm": 0.796875, + "learning_rate": 0.00013788393033373835, + "loss": 0.9596, + "step": 25121 + }, + { + "epoch": 0.645062515947946, + "grad_norm": 0.734375, + "learning_rate": 0.00013787979884487248, + "loss": 0.8665, + "step": 25122 + }, + { + "epoch": 0.6450881931438677, + "grad_norm": 0.79296875, + "learning_rate": 0.00013787566728051557, + "loss": 0.7832, + "step": 25123 + }, + { + "epoch": 0.6451138703397895, + "grad_norm": 0.74609375, + "learning_rate": 0.0001378715356406759, + "loss": 0.9074, + "step": 25124 + }, + { + "epoch": 0.6451395475357113, + "grad_norm": 0.78125, + "learning_rate": 0.00013786740392536168, + "loss": 0.8672, + "step": 25125 + }, + { + "epoch": 0.6451652247316332, + "grad_norm": 0.7890625, + "learning_rate": 0.00013786327213458112, + "loss": 0.8528, + "step": 25126 + }, + { + "epoch": 0.645190901927555, + "grad_norm": 0.77734375, + "learning_rate": 0.0001378591402683425, + "loss": 0.8145, + "step": 25127 + }, + { + "epoch": 0.6452165791234769, + "grad_norm": 0.75390625, + "learning_rate": 0.00013785500832665405, + "loss": 0.9143, + "step": 25128 + }, + { + "epoch": 0.6452422563193987, + "grad_norm": 0.765625, + "learning_rate": 0.00013785087630952396, + "loss": 1.0317, + "step": 25129 + }, + { + "epoch": 0.6452679335153204, + "grad_norm": 0.83984375, + "learning_rate": 0.0001378467442169605, + "loss": 0.7844, + "step": 25130 + }, + { + "epoch": 0.6452936107112422, + "grad_norm": 0.69140625, + "learning_rate": 0.00013784261204897191, + "loss": 0.7177, + "step": 25131 + }, + { + "epoch": 0.6453192879071641, + "grad_norm": 0.72265625, + "learning_rate": 0.0001378384798055664, + "loss": 0.7269, + "step": 25132 + }, + { + "epoch": 0.6453449651030859, + "grad_norm": 0.75, + "learning_rate": 0.00013783434748675222, + "loss": 0.8651, + "step": 25133 + }, + { + "epoch": 0.6453706422990078, + "grad_norm": 0.73828125, + "learning_rate": 0.00013783021509253759, + "loss": 0.8662, + "step": 25134 + }, + { + "epoch": 0.6453963194949296, + "grad_norm": 0.79296875, + "learning_rate": 0.0001378260826229308, + "loss": 0.792, + "step": 25135 + }, + { + "epoch": 0.6454219966908514, + "grad_norm": 0.80859375, + "learning_rate": 0.00013782195007793998, + "loss": 0.9707, + "step": 25136 + }, + { + "epoch": 0.6454476738867732, + "grad_norm": 0.8046875, + "learning_rate": 0.00013781781745757346, + "loss": 0.7156, + "step": 25137 + }, + { + "epoch": 0.645473351082695, + "grad_norm": 0.70703125, + "learning_rate": 0.00013781368476183947, + "loss": 0.8004, + "step": 25138 + }, + { + "epoch": 0.6454990282786168, + "grad_norm": 0.73828125, + "learning_rate": 0.0001378095519907462, + "loss": 0.7658, + "step": 25139 + }, + { + "epoch": 0.6455247054745387, + "grad_norm": 0.73828125, + "learning_rate": 0.0001378054191443019, + "loss": 0.7994, + "step": 25140 + }, + { + "epoch": 0.6455503826704605, + "grad_norm": 0.6953125, + "learning_rate": 0.00013780128622251482, + "loss": 0.7634, + "step": 25141 + }, + { + "epoch": 0.6455760598663823, + "grad_norm": 0.71484375, + "learning_rate": 0.00013779715322539323, + "loss": 0.7435, + "step": 25142 + }, + { + "epoch": 0.6456017370623041, + "grad_norm": 0.78515625, + "learning_rate": 0.00013779302015294524, + "loss": 0.8891, + "step": 25143 + }, + { + "epoch": 0.6456274142582259, + "grad_norm": 0.8125, + "learning_rate": 0.00013778888700517926, + "loss": 0.8722, + "step": 25144 + }, + { + "epoch": 0.6456530914541477, + "grad_norm": 0.765625, + "learning_rate": 0.0001377847537821034, + "loss": 0.7993, + "step": 25145 + }, + { + "epoch": 0.6456787686500696, + "grad_norm": 0.78125, + "learning_rate": 0.00013778062048372595, + "loss": 0.7797, + "step": 25146 + }, + { + "epoch": 0.6457044458459914, + "grad_norm": 0.7890625, + "learning_rate": 0.00013777648711005514, + "loss": 0.817, + "step": 25147 + }, + { + "epoch": 0.6457301230419132, + "grad_norm": 0.75, + "learning_rate": 0.00013777235366109917, + "loss": 0.9045, + "step": 25148 + }, + { + "epoch": 0.6457558002378351, + "grad_norm": 0.8203125, + "learning_rate": 0.00013776822013686636, + "loss": 0.9062, + "step": 25149 + }, + { + "epoch": 0.6457814774337568, + "grad_norm": 0.76953125, + "learning_rate": 0.00013776408653736488, + "loss": 0.8028, + "step": 25150 + }, + { + "epoch": 0.6458071546296786, + "grad_norm": 0.828125, + "learning_rate": 0.000137759952862603, + "loss": 0.8185, + "step": 25151 + }, + { + "epoch": 0.6458328318256005, + "grad_norm": 0.8046875, + "learning_rate": 0.0001377558191125889, + "loss": 0.8625, + "step": 25152 + }, + { + "epoch": 0.6458585090215223, + "grad_norm": 0.83984375, + "learning_rate": 0.0001377516852873309, + "loss": 0.9987, + "step": 25153 + }, + { + "epoch": 0.6458841862174441, + "grad_norm": 0.77734375, + "learning_rate": 0.0001377475513868372, + "loss": 0.8259, + "step": 25154 + }, + { + "epoch": 0.645909863413366, + "grad_norm": 0.796875, + "learning_rate": 0.00013774341741111602, + "loss": 0.9701, + "step": 25155 + }, + { + "epoch": 0.6459355406092878, + "grad_norm": 0.796875, + "learning_rate": 0.00013773928336017563, + "loss": 0.8583, + "step": 25156 + }, + { + "epoch": 0.6459612178052095, + "grad_norm": 0.796875, + "learning_rate": 0.00013773514923402428, + "loss": 0.9051, + "step": 25157 + }, + { + "epoch": 0.6459868950011314, + "grad_norm": 0.75390625, + "learning_rate": 0.00013773101503267016, + "loss": 0.8709, + "step": 25158 + }, + { + "epoch": 0.6460125721970532, + "grad_norm": 0.79296875, + "learning_rate": 0.00013772688075612153, + "loss": 0.8519, + "step": 25159 + }, + { + "epoch": 0.646038249392975, + "grad_norm": 1.1875, + "learning_rate": 0.00013772274640438668, + "loss": 0.8306, + "step": 25160 + }, + { + "epoch": 0.6460639265888969, + "grad_norm": 0.76171875, + "learning_rate": 0.00013771861197747374, + "loss": 0.9317, + "step": 25161 + }, + { + "epoch": 0.6460896037848187, + "grad_norm": 0.77734375, + "learning_rate": 0.00013771447747539106, + "loss": 0.8018, + "step": 25162 + }, + { + "epoch": 0.6461152809807404, + "grad_norm": 0.8125, + "learning_rate": 0.00013771034289814682, + "loss": 0.8556, + "step": 25163 + }, + { + "epoch": 0.6461409581766623, + "grad_norm": 0.76171875, + "learning_rate": 0.00013770620824574926, + "loss": 0.8477, + "step": 25164 + }, + { + "epoch": 0.6461666353725841, + "grad_norm": 0.6875, + "learning_rate": 0.00013770207351820665, + "loss": 0.7179, + "step": 25165 + }, + { + "epoch": 0.646192312568506, + "grad_norm": 0.82421875, + "learning_rate": 0.00013769793871552718, + "loss": 0.7515, + "step": 25166 + }, + { + "epoch": 0.6462179897644278, + "grad_norm": 0.76953125, + "learning_rate": 0.00013769380383771916, + "loss": 0.8149, + "step": 25167 + }, + { + "epoch": 0.6462436669603496, + "grad_norm": 0.7734375, + "learning_rate": 0.0001376896688847908, + "loss": 0.8605, + "step": 25168 + }, + { + "epoch": 0.6462693441562715, + "grad_norm": 0.77734375, + "learning_rate": 0.0001376855338567503, + "loss": 0.8216, + "step": 25169 + }, + { + "epoch": 0.6462950213521932, + "grad_norm": 0.71875, + "learning_rate": 0.00013768139875360594, + "loss": 0.7876, + "step": 25170 + }, + { + "epoch": 0.646320698548115, + "grad_norm": 0.76953125, + "learning_rate": 0.000137677263575366, + "loss": 0.9225, + "step": 25171 + }, + { + "epoch": 0.6463463757440369, + "grad_norm": 0.72265625, + "learning_rate": 0.00013767312832203862, + "loss": 0.7184, + "step": 25172 + }, + { + "epoch": 0.6463720529399587, + "grad_norm": 0.8359375, + "learning_rate": 0.00013766899299363214, + "loss": 0.7527, + "step": 25173 + }, + { + "epoch": 0.6463977301358805, + "grad_norm": 0.70703125, + "learning_rate": 0.00013766485759015472, + "loss": 0.8769, + "step": 25174 + }, + { + "epoch": 0.6464234073318024, + "grad_norm": 0.9453125, + "learning_rate": 0.00013766072211161468, + "loss": 0.8256, + "step": 25175 + }, + { + "epoch": 0.6464490845277242, + "grad_norm": 0.7421875, + "learning_rate": 0.0001376565865580202, + "loss": 0.9438, + "step": 25176 + }, + { + "epoch": 0.6464747617236459, + "grad_norm": 0.9609375, + "learning_rate": 0.00013765245092937955, + "loss": 0.7397, + "step": 25177 + }, + { + "epoch": 0.6465004389195678, + "grad_norm": 0.78125, + "learning_rate": 0.000137648315225701, + "loss": 0.7263, + "step": 25178 + }, + { + "epoch": 0.6465261161154896, + "grad_norm": 0.7890625, + "learning_rate": 0.0001376441794469927, + "loss": 0.8494, + "step": 25179 + }, + { + "epoch": 0.6465517933114114, + "grad_norm": 0.78125, + "learning_rate": 0.00013764004359326297, + "loss": 0.8722, + "step": 25180 + }, + { + "epoch": 0.6465774705073333, + "grad_norm": 0.73828125, + "learning_rate": 0.00013763590766452003, + "loss": 1.0094, + "step": 25181 + }, + { + "epoch": 0.6466031477032551, + "grad_norm": 0.8125, + "learning_rate": 0.00013763177166077216, + "loss": 0.9433, + "step": 25182 + }, + { + "epoch": 0.6466288248991768, + "grad_norm": 0.765625, + "learning_rate": 0.00013762763558202755, + "loss": 1.0309, + "step": 25183 + }, + { + "epoch": 0.6466545020950987, + "grad_norm": 0.828125, + "learning_rate": 0.00013762349942829448, + "loss": 0.8462, + "step": 25184 + }, + { + "epoch": 0.6466801792910205, + "grad_norm": 0.7578125, + "learning_rate": 0.0001376193631995811, + "loss": 0.9336, + "step": 25185 + }, + { + "epoch": 0.6467058564869423, + "grad_norm": 0.7890625, + "learning_rate": 0.0001376152268958958, + "loss": 0.7647, + "step": 25186 + }, + { + "epoch": 0.6467315336828642, + "grad_norm": 0.7734375, + "learning_rate": 0.00013761109051724671, + "loss": 0.7401, + "step": 25187 + }, + { + "epoch": 0.646757210878786, + "grad_norm": 0.78125, + "learning_rate": 0.00013760695406364213, + "loss": 0.8149, + "step": 25188 + }, + { + "epoch": 0.6467828880747079, + "grad_norm": 0.76171875, + "learning_rate": 0.0001376028175350903, + "loss": 0.9836, + "step": 25189 + }, + { + "epoch": 0.6468085652706296, + "grad_norm": 0.67578125, + "learning_rate": 0.00013759868093159944, + "loss": 0.7256, + "step": 25190 + }, + { + "epoch": 0.6468342424665514, + "grad_norm": 0.78515625, + "learning_rate": 0.00013759454425317781, + "loss": 0.7426, + "step": 25191 + }, + { + "epoch": 0.6468599196624732, + "grad_norm": 0.75, + "learning_rate": 0.00013759040749983362, + "loss": 0.8309, + "step": 25192 + }, + { + "epoch": 0.6468855968583951, + "grad_norm": 0.73828125, + "learning_rate": 0.00013758627067157517, + "loss": 0.8531, + "step": 25193 + }, + { + "epoch": 0.6469112740543169, + "grad_norm": 0.8125, + "learning_rate": 0.00013758213376841068, + "loss": 0.967, + "step": 25194 + }, + { + "epoch": 0.6469369512502388, + "grad_norm": 0.78515625, + "learning_rate": 0.0001375779967903484, + "loss": 0.8181, + "step": 25195 + }, + { + "epoch": 0.6469626284461606, + "grad_norm": 0.67578125, + "learning_rate": 0.00013757385973739653, + "loss": 0.7432, + "step": 25196 + }, + { + "epoch": 0.6469883056420823, + "grad_norm": 0.7109375, + "learning_rate": 0.0001375697226095634, + "loss": 0.9185, + "step": 25197 + }, + { + "epoch": 0.6470139828380042, + "grad_norm": 0.73828125, + "learning_rate": 0.0001375655854068572, + "loss": 0.8082, + "step": 25198 + }, + { + "epoch": 0.647039660033926, + "grad_norm": 0.76953125, + "learning_rate": 0.00013756144812928614, + "loss": 0.7711, + "step": 25199 + }, + { + "epoch": 0.6470653372298478, + "grad_norm": 0.74609375, + "learning_rate": 0.00013755731077685855, + "loss": 0.8645, + "step": 25200 + }, + { + "epoch": 0.6470910144257697, + "grad_norm": 0.84375, + "learning_rate": 0.00013755317334958258, + "loss": 0.8453, + "step": 25201 + }, + { + "epoch": 0.6471166916216915, + "grad_norm": 0.89453125, + "learning_rate": 0.00013754903584746656, + "loss": 0.8532, + "step": 25202 + }, + { + "epoch": 0.6471423688176132, + "grad_norm": 0.92578125, + "learning_rate": 0.0001375448982705187, + "loss": 0.758, + "step": 25203 + }, + { + "epoch": 0.6471680460135351, + "grad_norm": 0.796875, + "learning_rate": 0.00013754076061874728, + "loss": 0.8418, + "step": 25204 + }, + { + "epoch": 0.6471937232094569, + "grad_norm": 0.75390625, + "learning_rate": 0.0001375366228921605, + "loss": 0.7524, + "step": 25205 + }, + { + "epoch": 0.6472194004053787, + "grad_norm": 0.91015625, + "learning_rate": 0.00013753248509076657, + "loss": 0.7678, + "step": 25206 + }, + { + "epoch": 0.6472450776013006, + "grad_norm": 0.78515625, + "learning_rate": 0.00013752834721457384, + "loss": 0.7911, + "step": 25207 + }, + { + "epoch": 0.6472707547972224, + "grad_norm": 0.7890625, + "learning_rate": 0.00013752420926359049, + "loss": 0.8406, + "step": 25208 + }, + { + "epoch": 0.6472964319931442, + "grad_norm": 0.84375, + "learning_rate": 0.00013752007123782477, + "loss": 0.8892, + "step": 25209 + }, + { + "epoch": 0.647322109189066, + "grad_norm": 0.98828125, + "learning_rate": 0.00013751593313728497, + "loss": 0.7886, + "step": 25210 + }, + { + "epoch": 0.6473477863849878, + "grad_norm": 0.73046875, + "learning_rate": 0.00013751179496197927, + "loss": 0.902, + "step": 25211 + }, + { + "epoch": 0.6473734635809096, + "grad_norm": 0.765625, + "learning_rate": 0.00013750765671191598, + "loss": 0.8989, + "step": 25212 + }, + { + "epoch": 0.6473991407768315, + "grad_norm": 0.84375, + "learning_rate": 0.0001375035183871033, + "loss": 0.927, + "step": 25213 + }, + { + "epoch": 0.6474248179727533, + "grad_norm": 0.77734375, + "learning_rate": 0.00013749937998754948, + "loss": 0.916, + "step": 25214 + }, + { + "epoch": 0.6474504951686751, + "grad_norm": 0.77734375, + "learning_rate": 0.0001374952415132628, + "loss": 0.8348, + "step": 25215 + }, + { + "epoch": 0.6474761723645969, + "grad_norm": 0.76953125, + "learning_rate": 0.0001374911029642515, + "loss": 0.8636, + "step": 25216 + }, + { + "epoch": 0.6475018495605187, + "grad_norm": 0.8359375, + "learning_rate": 0.0001374869643405238, + "loss": 0.9451, + "step": 25217 + }, + { + "epoch": 0.6475275267564405, + "grad_norm": 0.859375, + "learning_rate": 0.000137482825642088, + "loss": 0.8917, + "step": 25218 + }, + { + "epoch": 0.6475532039523624, + "grad_norm": 0.71875, + "learning_rate": 0.00013747868686895227, + "loss": 0.8309, + "step": 25219 + }, + { + "epoch": 0.6475788811482842, + "grad_norm": 0.75390625, + "learning_rate": 0.00013747454802112493, + "loss": 0.7224, + "step": 25220 + }, + { + "epoch": 0.647604558344206, + "grad_norm": 0.80078125, + "learning_rate": 0.0001374704090986142, + "loss": 0.7609, + "step": 25221 + }, + { + "epoch": 0.6476302355401279, + "grad_norm": 0.81640625, + "learning_rate": 0.0001374662701014283, + "loss": 0.9017, + "step": 25222 + }, + { + "epoch": 0.6476559127360496, + "grad_norm": 0.8125, + "learning_rate": 0.00013746213102957557, + "loss": 0.8444, + "step": 25223 + }, + { + "epoch": 0.6476815899319714, + "grad_norm": 0.8125, + "learning_rate": 0.00013745799188306416, + "loss": 0.8554, + "step": 25224 + }, + { + "epoch": 0.6477072671278933, + "grad_norm": 0.7578125, + "learning_rate": 0.00013745385266190235, + "loss": 0.7776, + "step": 25225 + }, + { + "epoch": 0.6477329443238151, + "grad_norm": 0.80859375, + "learning_rate": 0.0001374497133660984, + "loss": 0.9426, + "step": 25226 + }, + { + "epoch": 0.647758621519737, + "grad_norm": 0.79296875, + "learning_rate": 0.00013744557399566053, + "loss": 0.8544, + "step": 25227 + }, + { + "epoch": 0.6477842987156588, + "grad_norm": 0.7734375, + "learning_rate": 0.00013744143455059703, + "loss": 0.8346, + "step": 25228 + }, + { + "epoch": 0.6478099759115806, + "grad_norm": 0.7578125, + "learning_rate": 0.00013743729503091614, + "loss": 0.7173, + "step": 25229 + }, + { + "epoch": 0.6478356531075024, + "grad_norm": 0.7890625, + "learning_rate": 0.00013743315543662612, + "loss": 0.7846, + "step": 25230 + }, + { + "epoch": 0.6478613303034242, + "grad_norm": 0.75390625, + "learning_rate": 0.00013742901576773518, + "loss": 0.8756, + "step": 25231 + }, + { + "epoch": 0.647887007499346, + "grad_norm": 0.75390625, + "learning_rate": 0.0001374248760242516, + "loss": 0.8379, + "step": 25232 + }, + { + "epoch": 0.6479126846952679, + "grad_norm": 0.76953125, + "learning_rate": 0.00013742073620618358, + "loss": 0.8419, + "step": 25233 + }, + { + "epoch": 0.6479383618911897, + "grad_norm": 0.8359375, + "learning_rate": 0.00013741659631353947, + "loss": 0.863, + "step": 25234 + }, + { + "epoch": 0.6479640390871115, + "grad_norm": 0.7734375, + "learning_rate": 0.00013741245634632746, + "loss": 0.8429, + "step": 25235 + }, + { + "epoch": 0.6479897162830333, + "grad_norm": 0.74609375, + "learning_rate": 0.00013740831630455579, + "loss": 0.778, + "step": 25236 + }, + { + "epoch": 0.6480153934789551, + "grad_norm": 0.796875, + "learning_rate": 0.00013740417618823272, + "loss": 0.8313, + "step": 25237 + }, + { + "epoch": 0.6480410706748769, + "grad_norm": 0.73828125, + "learning_rate": 0.0001374000359973665, + "loss": 0.8314, + "step": 25238 + }, + { + "epoch": 0.6480667478707988, + "grad_norm": 0.75390625, + "learning_rate": 0.00013739589573196542, + "loss": 0.7156, + "step": 25239 + }, + { + "epoch": 0.6480924250667206, + "grad_norm": 0.75390625, + "learning_rate": 0.00013739175539203766, + "loss": 0.7901, + "step": 25240 + }, + { + "epoch": 0.6481181022626424, + "grad_norm": 0.82421875, + "learning_rate": 0.0001373876149775915, + "loss": 0.8591, + "step": 25241 + }, + { + "epoch": 0.6481437794585643, + "grad_norm": 0.8125, + "learning_rate": 0.00013738347448863521, + "loss": 0.8449, + "step": 25242 + }, + { + "epoch": 0.648169456654486, + "grad_norm": 0.8203125, + "learning_rate": 0.0001373793339251771, + "loss": 0.853, + "step": 25243 + }, + { + "epoch": 0.6481951338504078, + "grad_norm": 0.74609375, + "learning_rate": 0.00013737519328722527, + "loss": 0.8895, + "step": 25244 + }, + { + "epoch": 0.6482208110463297, + "grad_norm": 0.72265625, + "learning_rate": 0.0001373710525747881, + "loss": 0.7283, + "step": 25245 + }, + { + "epoch": 0.6482464882422515, + "grad_norm": 0.69921875, + "learning_rate": 0.00013736691178787379, + "loss": 0.7821, + "step": 25246 + }, + { + "epoch": 0.6482721654381733, + "grad_norm": 0.859375, + "learning_rate": 0.00013736277092649056, + "loss": 0.8915, + "step": 25247 + }, + { + "epoch": 0.6482978426340952, + "grad_norm": 0.765625, + "learning_rate": 0.00013735862999064678, + "loss": 0.9375, + "step": 25248 + }, + { + "epoch": 0.648323519830017, + "grad_norm": 0.83984375, + "learning_rate": 0.00013735448898035055, + "loss": 0.91, + "step": 25249 + }, + { + "epoch": 0.6483491970259387, + "grad_norm": 0.74609375, + "learning_rate": 0.00013735034789561025, + "loss": 0.7986, + "step": 25250 + }, + { + "epoch": 0.6483748742218606, + "grad_norm": 0.8203125, + "learning_rate": 0.00013734620673643404, + "loss": 0.8174, + "step": 25251 + }, + { + "epoch": 0.6484005514177824, + "grad_norm": 0.75, + "learning_rate": 0.00013734206550283025, + "loss": 0.8746, + "step": 25252 + }, + { + "epoch": 0.6484262286137042, + "grad_norm": 0.87109375, + "learning_rate": 0.00013733792419480707, + "loss": 0.8863, + "step": 25253 + }, + { + "epoch": 0.6484519058096261, + "grad_norm": 0.796875, + "learning_rate": 0.00013733378281237277, + "loss": 0.8321, + "step": 25254 + }, + { + "epoch": 0.6484775830055479, + "grad_norm": 0.75390625, + "learning_rate": 0.00013732964135553566, + "loss": 0.9151, + "step": 25255 + }, + { + "epoch": 0.6485032602014696, + "grad_norm": 0.75390625, + "learning_rate": 0.00013732549982430392, + "loss": 0.8445, + "step": 25256 + }, + { + "epoch": 0.6485289373973915, + "grad_norm": 0.7890625, + "learning_rate": 0.00013732135821868582, + "loss": 0.8493, + "step": 25257 + }, + { + "epoch": 0.6485546145933133, + "grad_norm": 0.86328125, + "learning_rate": 0.00013731721653868962, + "loss": 0.8554, + "step": 25258 + }, + { + "epoch": 0.6485802917892352, + "grad_norm": 0.84375, + "learning_rate": 0.0001373130747843236, + "loss": 0.9813, + "step": 25259 + }, + { + "epoch": 0.648605968985157, + "grad_norm": 0.84375, + "learning_rate": 0.00013730893295559596, + "loss": 0.9134, + "step": 25260 + }, + { + "epoch": 0.6486316461810788, + "grad_norm": 0.76171875, + "learning_rate": 0.00013730479105251504, + "loss": 0.7726, + "step": 25261 + }, + { + "epoch": 0.6486573233770007, + "grad_norm": 0.71484375, + "learning_rate": 0.00013730064907508898, + "loss": 0.8127, + "step": 25262 + }, + { + "epoch": 0.6486830005729224, + "grad_norm": 0.74609375, + "learning_rate": 0.00013729650702332618, + "loss": 0.7979, + "step": 25263 + }, + { + "epoch": 0.6487086777688442, + "grad_norm": 0.78515625, + "learning_rate": 0.00013729236489723476, + "loss": 0.9021, + "step": 25264 + }, + { + "epoch": 0.6487343549647661, + "grad_norm": 0.8125, + "learning_rate": 0.000137288222696823, + "loss": 0.9618, + "step": 25265 + }, + { + "epoch": 0.6487600321606879, + "grad_norm": 0.7734375, + "learning_rate": 0.0001372840804220992, + "loss": 0.8003, + "step": 25266 + }, + { + "epoch": 0.6487857093566097, + "grad_norm": 0.75390625, + "learning_rate": 0.0001372799380730716, + "loss": 0.7921, + "step": 25267 + }, + { + "epoch": 0.6488113865525316, + "grad_norm": 0.77734375, + "learning_rate": 0.00013727579564974844, + "loss": 0.837, + "step": 25268 + }, + { + "epoch": 0.6488370637484534, + "grad_norm": 0.8828125, + "learning_rate": 0.00013727165315213804, + "loss": 0.8806, + "step": 25269 + }, + { + "epoch": 0.6488627409443751, + "grad_norm": 0.796875, + "learning_rate": 0.00013726751058024854, + "loss": 0.7885, + "step": 25270 + }, + { + "epoch": 0.648888418140297, + "grad_norm": 0.85546875, + "learning_rate": 0.00013726336793408827, + "loss": 0.9729, + "step": 25271 + }, + { + "epoch": 0.6489140953362188, + "grad_norm": 0.79296875, + "learning_rate": 0.0001372592252136655, + "loss": 0.8562, + "step": 25272 + }, + { + "epoch": 0.6489397725321406, + "grad_norm": 0.83203125, + "learning_rate": 0.0001372550824189884, + "loss": 0.8278, + "step": 25273 + }, + { + "epoch": 0.6489654497280625, + "grad_norm": 0.79296875, + "learning_rate": 0.00013725093955006533, + "loss": 0.8107, + "step": 25274 + }, + { + "epoch": 0.6489911269239843, + "grad_norm": 0.78515625, + "learning_rate": 0.0001372467966069045, + "loss": 0.7904, + "step": 25275 + }, + { + "epoch": 0.649016804119906, + "grad_norm": 0.83203125, + "learning_rate": 0.0001372426535895142, + "loss": 0.7392, + "step": 25276 + }, + { + "epoch": 0.6490424813158279, + "grad_norm": 0.76953125, + "learning_rate": 0.0001372385104979026, + "loss": 0.9405, + "step": 25277 + }, + { + "epoch": 0.6490681585117497, + "grad_norm": 0.7109375, + "learning_rate": 0.00013723436733207805, + "loss": 0.9727, + "step": 25278 + }, + { + "epoch": 0.6490938357076715, + "grad_norm": 0.83203125, + "learning_rate": 0.00013723022409204875, + "loss": 0.9506, + "step": 25279 + }, + { + "epoch": 0.6491195129035934, + "grad_norm": 0.765625, + "learning_rate": 0.000137226080777823, + "loss": 0.7834, + "step": 25280 + }, + { + "epoch": 0.6491451900995152, + "grad_norm": 0.8125, + "learning_rate": 0.000137221937389409, + "loss": 1.003, + "step": 25281 + }, + { + "epoch": 0.649170867295437, + "grad_norm": 0.828125, + "learning_rate": 0.00013721779392681508, + "loss": 0.8163, + "step": 25282 + }, + { + "epoch": 0.6491965444913588, + "grad_norm": 0.79296875, + "learning_rate": 0.0001372136503900494, + "loss": 0.8513, + "step": 25283 + }, + { + "epoch": 0.6492222216872806, + "grad_norm": 0.875, + "learning_rate": 0.00013720950677912033, + "loss": 0.8631, + "step": 25284 + }, + { + "epoch": 0.6492478988832024, + "grad_norm": 0.76953125, + "learning_rate": 0.00013720536309403604, + "loss": 0.8273, + "step": 25285 + }, + { + "epoch": 0.6492735760791243, + "grad_norm": 0.78515625, + "learning_rate": 0.00013720121933480484, + "loss": 0.9312, + "step": 25286 + }, + { + "epoch": 0.6492992532750461, + "grad_norm": 0.87109375, + "learning_rate": 0.00013719707550143495, + "loss": 0.8187, + "step": 25287 + }, + { + "epoch": 0.649324930470968, + "grad_norm": 0.76953125, + "learning_rate": 0.00013719293159393468, + "loss": 0.8737, + "step": 25288 + }, + { + "epoch": 0.6493506076668898, + "grad_norm": 0.7421875, + "learning_rate": 0.00013718878761231223, + "loss": 0.8214, + "step": 25289 + }, + { + "epoch": 0.6493762848628115, + "grad_norm": 0.67578125, + "learning_rate": 0.00013718464355657592, + "loss": 0.7403, + "step": 25290 + }, + { + "epoch": 0.6494019620587334, + "grad_norm": 0.70703125, + "learning_rate": 0.00013718049942673395, + "loss": 0.7515, + "step": 25291 + }, + { + "epoch": 0.6494276392546552, + "grad_norm": 0.859375, + "learning_rate": 0.0001371763552227946, + "loss": 0.8756, + "step": 25292 + }, + { + "epoch": 0.649453316450577, + "grad_norm": 0.7890625, + "learning_rate": 0.00013717221094476612, + "loss": 1.0629, + "step": 25293 + }, + { + "epoch": 0.6494789936464989, + "grad_norm": 0.80078125, + "learning_rate": 0.0001371680665926568, + "loss": 0.8477, + "step": 25294 + }, + { + "epoch": 0.6495046708424207, + "grad_norm": 0.75390625, + "learning_rate": 0.0001371639221664749, + "loss": 0.7958, + "step": 25295 + }, + { + "epoch": 0.6495303480383424, + "grad_norm": 0.8046875, + "learning_rate": 0.00013715977766622862, + "loss": 0.8357, + "step": 25296 + }, + { + "epoch": 0.6495560252342643, + "grad_norm": 0.7734375, + "learning_rate": 0.0001371556330919263, + "loss": 0.8375, + "step": 25297 + }, + { + "epoch": 0.6495817024301861, + "grad_norm": 0.76953125, + "learning_rate": 0.00013715148844357616, + "loss": 0.7745, + "step": 25298 + }, + { + "epoch": 0.6496073796261079, + "grad_norm": 0.8203125, + "learning_rate": 0.0001371473437211864, + "loss": 0.7372, + "step": 25299 + }, + { + "epoch": 0.6496330568220298, + "grad_norm": 0.83984375, + "learning_rate": 0.00013714319892476538, + "loss": 0.8717, + "step": 25300 + }, + { + "epoch": 0.6496587340179516, + "grad_norm": 0.7578125, + "learning_rate": 0.00013713905405432134, + "loss": 0.7514, + "step": 25301 + }, + { + "epoch": 0.6496844112138734, + "grad_norm": 0.73046875, + "learning_rate": 0.0001371349091098625, + "loss": 0.9245, + "step": 25302 + }, + { + "epoch": 0.6497100884097952, + "grad_norm": 0.82421875, + "learning_rate": 0.00013713076409139712, + "loss": 0.9084, + "step": 25303 + }, + { + "epoch": 0.649735765605717, + "grad_norm": 0.83203125, + "learning_rate": 0.00013712661899893352, + "loss": 0.8907, + "step": 25304 + }, + { + "epoch": 0.6497614428016388, + "grad_norm": 0.7734375, + "learning_rate": 0.0001371224738324799, + "loss": 0.8802, + "step": 25305 + }, + { + "epoch": 0.6497871199975607, + "grad_norm": 0.7578125, + "learning_rate": 0.0001371183285920445, + "loss": 0.8881, + "step": 25306 + }, + { + "epoch": 0.6498127971934825, + "grad_norm": 0.73828125, + "learning_rate": 0.00013711418327763566, + "loss": 0.7871, + "step": 25307 + }, + { + "epoch": 0.6498384743894043, + "grad_norm": 0.7578125, + "learning_rate": 0.00013711003788926165, + "loss": 0.9131, + "step": 25308 + }, + { + "epoch": 0.6498641515853262, + "grad_norm": 0.77734375, + "learning_rate": 0.00013710589242693066, + "loss": 0.915, + "step": 25309 + }, + { + "epoch": 0.6498898287812479, + "grad_norm": 0.85546875, + "learning_rate": 0.00013710174689065096, + "loss": 0.8808, + "step": 25310 + }, + { + "epoch": 0.6499155059771697, + "grad_norm": 0.80859375, + "learning_rate": 0.00013709760128043083, + "loss": 0.903, + "step": 25311 + }, + { + "epoch": 0.6499411831730916, + "grad_norm": 0.80859375, + "learning_rate": 0.00013709345559627856, + "loss": 0.8343, + "step": 25312 + }, + { + "epoch": 0.6499668603690134, + "grad_norm": 0.828125, + "learning_rate": 0.00013708930983820234, + "loss": 0.7951, + "step": 25313 + }, + { + "epoch": 0.6499925375649352, + "grad_norm": 0.86328125, + "learning_rate": 0.0001370851640062105, + "loss": 0.8855, + "step": 25314 + }, + { + "epoch": 0.6500182147608571, + "grad_norm": 0.9296875, + "learning_rate": 0.00013708101810031129, + "loss": 0.8466, + "step": 25315 + }, + { + "epoch": 0.6500438919567788, + "grad_norm": 0.80078125, + "learning_rate": 0.00013707687212051293, + "loss": 0.8539, + "step": 25316 + }, + { + "epoch": 0.6500695691527006, + "grad_norm": 0.8046875, + "learning_rate": 0.00013707272606682376, + "loss": 0.8718, + "step": 25317 + }, + { + "epoch": 0.6500952463486225, + "grad_norm": 0.87890625, + "learning_rate": 0.00013706857993925196, + "loss": 0.977, + "step": 25318 + }, + { + "epoch": 0.6501209235445443, + "grad_norm": 0.75390625, + "learning_rate": 0.00013706443373780582, + "loss": 0.6853, + "step": 25319 + }, + { + "epoch": 0.6501466007404662, + "grad_norm": 0.796875, + "learning_rate": 0.00013706028746249363, + "loss": 0.9151, + "step": 25320 + }, + { + "epoch": 0.650172277936388, + "grad_norm": 0.703125, + "learning_rate": 0.00013705614111332365, + "loss": 0.727, + "step": 25321 + }, + { + "epoch": 0.6501979551323098, + "grad_norm": 0.76171875, + "learning_rate": 0.0001370519946903041, + "loss": 0.9129, + "step": 25322 + }, + { + "epoch": 0.6502236323282315, + "grad_norm": 0.8203125, + "learning_rate": 0.00013704784819344327, + "loss": 0.8574, + "step": 25323 + }, + { + "epoch": 0.6502493095241534, + "grad_norm": 0.796875, + "learning_rate": 0.00013704370162274945, + "loss": 0.891, + "step": 25324 + }, + { + "epoch": 0.6502749867200752, + "grad_norm": 0.7109375, + "learning_rate": 0.00013703955497823084, + "loss": 0.77, + "step": 25325 + }, + { + "epoch": 0.6503006639159971, + "grad_norm": 0.72265625, + "learning_rate": 0.00013703540825989578, + "loss": 0.8782, + "step": 25326 + }, + { + "epoch": 0.6503263411119189, + "grad_norm": 0.74609375, + "learning_rate": 0.00013703126146775247, + "loss": 0.7488, + "step": 25327 + }, + { + "epoch": 0.6503520183078407, + "grad_norm": 0.78515625, + "learning_rate": 0.0001370271146018092, + "loss": 0.871, + "step": 25328 + }, + { + "epoch": 0.6503776955037626, + "grad_norm": 0.7734375, + "learning_rate": 0.00013702296766207427, + "loss": 0.8812, + "step": 25329 + }, + { + "epoch": 0.6504033726996843, + "grad_norm": 0.7421875, + "learning_rate": 0.0001370188206485559, + "loss": 0.7828, + "step": 25330 + }, + { + "epoch": 0.6504290498956061, + "grad_norm": 0.78515625, + "learning_rate": 0.00013701467356126235, + "loss": 0.9096, + "step": 25331 + }, + { + "epoch": 0.650454727091528, + "grad_norm": 0.8046875, + "learning_rate": 0.0001370105264002019, + "loss": 0.8746, + "step": 25332 + }, + { + "epoch": 0.6504804042874498, + "grad_norm": 0.86328125, + "learning_rate": 0.00013700637916538283, + "loss": 0.8504, + "step": 25333 + }, + { + "epoch": 0.6505060814833716, + "grad_norm": 0.80078125, + "learning_rate": 0.00013700223185681334, + "loss": 0.8321, + "step": 25334 + }, + { + "epoch": 0.6505317586792935, + "grad_norm": 0.80078125, + "learning_rate": 0.00013699808447450182, + "loss": 0.8822, + "step": 25335 + }, + { + "epoch": 0.6505574358752152, + "grad_norm": 0.75, + "learning_rate": 0.0001369939370184564, + "loss": 0.9152, + "step": 25336 + }, + { + "epoch": 0.650583113071137, + "grad_norm": 0.8515625, + "learning_rate": 0.00013698978948868543, + "loss": 0.87, + "step": 25337 + }, + { + "epoch": 0.6506087902670589, + "grad_norm": 0.78125, + "learning_rate": 0.00013698564188519715, + "loss": 0.7638, + "step": 25338 + }, + { + "epoch": 0.6506344674629807, + "grad_norm": 0.84765625, + "learning_rate": 0.00013698149420799982, + "loss": 0.8468, + "step": 25339 + }, + { + "epoch": 0.6506601446589025, + "grad_norm": 0.78125, + "learning_rate": 0.0001369773464571017, + "loss": 0.8516, + "step": 25340 + }, + { + "epoch": 0.6506858218548244, + "grad_norm": 0.7890625, + "learning_rate": 0.0001369731986325111, + "loss": 0.9131, + "step": 25341 + }, + { + "epoch": 0.6507114990507462, + "grad_norm": 0.76171875, + "learning_rate": 0.00013696905073423625, + "loss": 0.7637, + "step": 25342 + }, + { + "epoch": 0.6507371762466679, + "grad_norm": 0.7109375, + "learning_rate": 0.00013696490276228542, + "loss": 0.7535, + "step": 25343 + }, + { + "epoch": 0.6507628534425898, + "grad_norm": 0.74609375, + "learning_rate": 0.00013696075471666684, + "loss": 0.8304, + "step": 25344 + }, + { + "epoch": 0.6507885306385116, + "grad_norm": 0.7578125, + "learning_rate": 0.00013695660659738884, + "loss": 0.7963, + "step": 25345 + }, + { + "epoch": 0.6508142078344334, + "grad_norm": 0.67578125, + "learning_rate": 0.00013695245840445968, + "loss": 0.8172, + "step": 25346 + }, + { + "epoch": 0.6508398850303553, + "grad_norm": 0.83984375, + "learning_rate": 0.00013694831013788758, + "loss": 0.7258, + "step": 25347 + }, + { + "epoch": 0.6508655622262771, + "grad_norm": 0.76953125, + "learning_rate": 0.00013694416179768087, + "loss": 0.8585, + "step": 25348 + }, + { + "epoch": 0.650891239422199, + "grad_norm": 0.74609375, + "learning_rate": 0.00013694001338384777, + "loss": 0.966, + "step": 25349 + }, + { + "epoch": 0.6509169166181207, + "grad_norm": 0.74609375, + "learning_rate": 0.00013693586489639653, + "loss": 0.7769, + "step": 25350 + }, + { + "epoch": 0.6509425938140425, + "grad_norm": 0.74609375, + "learning_rate": 0.00013693171633533547, + "loss": 0.899, + "step": 25351 + }, + { + "epoch": 0.6509682710099643, + "grad_norm": 0.75, + "learning_rate": 0.00013692756770067283, + "loss": 0.7928, + "step": 25352 + }, + { + "epoch": 0.6509939482058862, + "grad_norm": 0.8203125, + "learning_rate": 0.00013692341899241688, + "loss": 0.8696, + "step": 25353 + }, + { + "epoch": 0.651019625401808, + "grad_norm": 0.79296875, + "learning_rate": 0.0001369192702105759, + "loss": 0.8896, + "step": 25354 + }, + { + "epoch": 0.6510453025977299, + "grad_norm": 0.85546875, + "learning_rate": 0.00013691512135515817, + "loss": 0.8301, + "step": 25355 + }, + { + "epoch": 0.6510709797936516, + "grad_norm": 0.73046875, + "learning_rate": 0.0001369109724261719, + "loss": 0.8542, + "step": 25356 + }, + { + "epoch": 0.6510966569895734, + "grad_norm": 0.796875, + "learning_rate": 0.0001369068234236254, + "loss": 0.8312, + "step": 25357 + }, + { + "epoch": 0.6511223341854953, + "grad_norm": 0.76953125, + "learning_rate": 0.00013690267434752695, + "loss": 0.8596, + "step": 25358 + }, + { + "epoch": 0.6511480113814171, + "grad_norm": 0.765625, + "learning_rate": 0.0001368985251978848, + "loss": 0.7708, + "step": 25359 + }, + { + "epoch": 0.6511736885773389, + "grad_norm": 0.77734375, + "learning_rate": 0.0001368943759747072, + "loss": 0.7824, + "step": 25360 + }, + { + "epoch": 0.6511993657732608, + "grad_norm": 0.8125, + "learning_rate": 0.0001368902266780025, + "loss": 0.803, + "step": 25361 + }, + { + "epoch": 0.6512250429691826, + "grad_norm": 0.796875, + "learning_rate": 0.00013688607730777888, + "loss": 0.952, + "step": 25362 + }, + { + "epoch": 0.6512507201651043, + "grad_norm": 0.734375, + "learning_rate": 0.00013688192786404462, + "loss": 0.7274, + "step": 25363 + }, + { + "epoch": 0.6512763973610262, + "grad_norm": 0.703125, + "learning_rate": 0.00013687777834680802, + "loss": 0.7084, + "step": 25364 + }, + { + "epoch": 0.651302074556948, + "grad_norm": 0.83203125, + "learning_rate": 0.00013687362875607734, + "loss": 0.9834, + "step": 25365 + }, + { + "epoch": 0.6513277517528698, + "grad_norm": 0.7890625, + "learning_rate": 0.00013686947909186085, + "loss": 1.0087, + "step": 25366 + }, + { + "epoch": 0.6513534289487917, + "grad_norm": 0.796875, + "learning_rate": 0.0001368653293541668, + "loss": 0.8499, + "step": 25367 + }, + { + "epoch": 0.6513791061447135, + "grad_norm": 0.7578125, + "learning_rate": 0.00013686117954300351, + "loss": 0.7777, + "step": 25368 + }, + { + "epoch": 0.6514047833406353, + "grad_norm": 0.9609375, + "learning_rate": 0.0001368570296583792, + "loss": 0.8412, + "step": 25369 + }, + { + "epoch": 0.6514304605365571, + "grad_norm": 0.7578125, + "learning_rate": 0.00013685287970030218, + "loss": 0.7913, + "step": 25370 + }, + { + "epoch": 0.6514561377324789, + "grad_norm": 0.77734375, + "learning_rate": 0.00013684872966878068, + "loss": 0.745, + "step": 25371 + }, + { + "epoch": 0.6514818149284007, + "grad_norm": 0.75390625, + "learning_rate": 0.000136844579563823, + "loss": 0.8515, + "step": 25372 + }, + { + "epoch": 0.6515074921243226, + "grad_norm": 0.78515625, + "learning_rate": 0.0001368404293854374, + "loss": 0.898, + "step": 25373 + }, + { + "epoch": 0.6515331693202444, + "grad_norm": 0.78125, + "learning_rate": 0.00013683627913363216, + "loss": 0.8524, + "step": 25374 + }, + { + "epoch": 0.6515588465161662, + "grad_norm": 0.734375, + "learning_rate": 0.00013683212880841552, + "loss": 0.6683, + "step": 25375 + }, + { + "epoch": 0.651584523712088, + "grad_norm": 0.7890625, + "learning_rate": 0.00013682797840979576, + "loss": 0.8587, + "step": 25376 + }, + { + "epoch": 0.6516102009080098, + "grad_norm": 0.80078125, + "learning_rate": 0.00013682382793778121, + "loss": 0.8764, + "step": 25377 + }, + { + "epoch": 0.6516358781039316, + "grad_norm": 0.84765625, + "learning_rate": 0.00013681967739238008, + "loss": 0.932, + "step": 25378 + }, + { + "epoch": 0.6516615552998535, + "grad_norm": 0.76953125, + "learning_rate": 0.00013681552677360066, + "loss": 0.8378, + "step": 25379 + }, + { + "epoch": 0.6516872324957753, + "grad_norm": 0.78515625, + "learning_rate": 0.0001368113760814512, + "loss": 0.862, + "step": 25380 + }, + { + "epoch": 0.6517129096916972, + "grad_norm": 0.77734375, + "learning_rate": 0.00013680722531594003, + "loss": 0.7325, + "step": 25381 + }, + { + "epoch": 0.651738586887619, + "grad_norm": 0.67578125, + "learning_rate": 0.00013680307447707537, + "loss": 0.8017, + "step": 25382 + }, + { + "epoch": 0.6517642640835407, + "grad_norm": 0.765625, + "learning_rate": 0.0001367989235648655, + "loss": 0.8831, + "step": 25383 + }, + { + "epoch": 0.6517899412794625, + "grad_norm": 0.8125, + "learning_rate": 0.0001367947725793187, + "loss": 0.8214, + "step": 25384 + }, + { + "epoch": 0.6518156184753844, + "grad_norm": 0.73046875, + "learning_rate": 0.00013679062152044323, + "loss": 0.7757, + "step": 25385 + }, + { + "epoch": 0.6518412956713062, + "grad_norm": 0.8125, + "learning_rate": 0.0001367864703882474, + "loss": 0.6855, + "step": 25386 + }, + { + "epoch": 0.6518669728672281, + "grad_norm": 0.71484375, + "learning_rate": 0.00013678231918273944, + "loss": 0.8605, + "step": 25387 + }, + { + "epoch": 0.6518926500631499, + "grad_norm": 0.78125, + "learning_rate": 0.00013677816790392766, + "loss": 0.7933, + "step": 25388 + }, + { + "epoch": 0.6519183272590717, + "grad_norm": 0.7421875, + "learning_rate": 0.00013677401655182031, + "loss": 0.794, + "step": 25389 + }, + { + "epoch": 0.6519440044549935, + "grad_norm": 0.77734375, + "learning_rate": 0.00013676986512642565, + "loss": 0.9102, + "step": 25390 + }, + { + "epoch": 0.6519696816509153, + "grad_norm": 0.70703125, + "learning_rate": 0.00013676571362775198, + "loss": 0.8297, + "step": 25391 + }, + { + "epoch": 0.6519953588468371, + "grad_norm": 0.7109375, + "learning_rate": 0.00013676156205580755, + "loss": 0.6215, + "step": 25392 + }, + { + "epoch": 0.652021036042759, + "grad_norm": 0.67578125, + "learning_rate": 0.00013675741041060065, + "loss": 0.7614, + "step": 25393 + }, + { + "epoch": 0.6520467132386808, + "grad_norm": 0.796875, + "learning_rate": 0.00013675325869213958, + "loss": 0.9036, + "step": 25394 + }, + { + "epoch": 0.6520723904346026, + "grad_norm": 0.73828125, + "learning_rate": 0.00013674910690043254, + "loss": 0.9698, + "step": 25395 + }, + { + "epoch": 0.6520980676305244, + "grad_norm": 0.80078125, + "learning_rate": 0.00013674495503548787, + "loss": 0.7786, + "step": 25396 + }, + { + "epoch": 0.6521237448264462, + "grad_norm": 0.72265625, + "learning_rate": 0.00013674080309731385, + "loss": 0.8018, + "step": 25397 + }, + { + "epoch": 0.652149422022368, + "grad_norm": 0.7109375, + "learning_rate": 0.0001367366510859187, + "loss": 0.86, + "step": 25398 + }, + { + "epoch": 0.6521750992182899, + "grad_norm": 0.8125, + "learning_rate": 0.00013673249900131074, + "loss": 0.9284, + "step": 25399 + }, + { + "epoch": 0.6522007764142117, + "grad_norm": 0.7421875, + "learning_rate": 0.0001367283468434982, + "loss": 0.7815, + "step": 25400 + }, + { + "epoch": 0.6522264536101335, + "grad_norm": 0.70703125, + "learning_rate": 0.0001367241946124894, + "loss": 0.853, + "step": 25401 + }, + { + "epoch": 0.6522521308060554, + "grad_norm": 0.76953125, + "learning_rate": 0.00013672004230829262, + "loss": 0.7796, + "step": 25402 + }, + { + "epoch": 0.6522778080019771, + "grad_norm": 0.76171875, + "learning_rate": 0.00013671588993091608, + "loss": 0.924, + "step": 25403 + }, + { + "epoch": 0.6523034851978989, + "grad_norm": 0.734375, + "learning_rate": 0.0001367117374803681, + "loss": 0.8069, + "step": 25404 + }, + { + "epoch": 0.6523291623938208, + "grad_norm": 0.84765625, + "learning_rate": 0.00013670758495665694, + "loss": 0.8412, + "step": 25405 + }, + { + "epoch": 0.6523548395897426, + "grad_norm": 0.8203125, + "learning_rate": 0.00013670343235979083, + "loss": 0.924, + "step": 25406 + }, + { + "epoch": 0.6523805167856644, + "grad_norm": 0.78125, + "learning_rate": 0.00013669927968977817, + "loss": 0.6844, + "step": 25407 + }, + { + "epoch": 0.6524061939815863, + "grad_norm": 0.8515625, + "learning_rate": 0.00013669512694662714, + "loss": 0.9204, + "step": 25408 + }, + { + "epoch": 0.652431871177508, + "grad_norm": 0.7421875, + "learning_rate": 0.00013669097413034604, + "loss": 0.6929, + "step": 25409 + }, + { + "epoch": 0.6524575483734298, + "grad_norm": 0.765625, + "learning_rate": 0.00013668682124094315, + "loss": 0.8871, + "step": 25410 + }, + { + "epoch": 0.6524832255693517, + "grad_norm": 0.7265625, + "learning_rate": 0.00013668266827842669, + "loss": 0.7748, + "step": 25411 + }, + { + "epoch": 0.6525089027652735, + "grad_norm": 0.7265625, + "learning_rate": 0.00013667851524280504, + "loss": 0.8742, + "step": 25412 + }, + { + "epoch": 0.6525345799611953, + "grad_norm": 0.7421875, + "learning_rate": 0.0001366743621340864, + "loss": 0.7503, + "step": 25413 + }, + { + "epoch": 0.6525602571571172, + "grad_norm": 0.73046875, + "learning_rate": 0.00013667020895227907, + "loss": 0.8798, + "step": 25414 + }, + { + "epoch": 0.652585934353039, + "grad_norm": 0.7421875, + "learning_rate": 0.00013666605569739133, + "loss": 0.7904, + "step": 25415 + }, + { + "epoch": 0.6526116115489607, + "grad_norm": 0.7265625, + "learning_rate": 0.00013666190236943143, + "loss": 0.796, + "step": 25416 + }, + { + "epoch": 0.6526372887448826, + "grad_norm": 0.7578125, + "learning_rate": 0.00013665774896840768, + "loss": 0.8594, + "step": 25417 + }, + { + "epoch": 0.6526629659408044, + "grad_norm": 0.7734375, + "learning_rate": 0.00013665359549432836, + "loss": 0.7487, + "step": 25418 + }, + { + "epoch": 0.6526886431367263, + "grad_norm": 1.03125, + "learning_rate": 0.00013664944194720173, + "loss": 0.9537, + "step": 25419 + }, + { + "epoch": 0.6527143203326481, + "grad_norm": 0.796875, + "learning_rate": 0.00013664528832703612, + "loss": 0.7627, + "step": 25420 + }, + { + "epoch": 0.6527399975285699, + "grad_norm": 1.1328125, + "learning_rate": 0.00013664113463383968, + "loss": 0.8332, + "step": 25421 + }, + { + "epoch": 0.6527656747244918, + "grad_norm": 0.76171875, + "learning_rate": 0.0001366369808676208, + "loss": 0.8911, + "step": 25422 + }, + { + "epoch": 0.6527913519204135, + "grad_norm": 0.77734375, + "learning_rate": 0.00013663282702838772, + "loss": 0.8457, + "step": 25423 + }, + { + "epoch": 0.6528170291163353, + "grad_norm": 0.6953125, + "learning_rate": 0.00013662867311614875, + "loss": 0.7155, + "step": 25424 + }, + { + "epoch": 0.6528427063122572, + "grad_norm": 0.80859375, + "learning_rate": 0.00013662451913091212, + "loss": 0.9981, + "step": 25425 + }, + { + "epoch": 0.652868383508179, + "grad_norm": 0.73046875, + "learning_rate": 0.00013662036507268614, + "loss": 0.8473, + "step": 25426 + }, + { + "epoch": 0.6528940607041008, + "grad_norm": 0.79296875, + "learning_rate": 0.0001366162109414791, + "loss": 0.8208, + "step": 25427 + }, + { + "epoch": 0.6529197379000227, + "grad_norm": 0.75390625, + "learning_rate": 0.00013661205673729925, + "loss": 0.8326, + "step": 25428 + }, + { + "epoch": 0.6529454150959444, + "grad_norm": 0.859375, + "learning_rate": 0.00013660790246015485, + "loss": 0.8468, + "step": 25429 + }, + { + "epoch": 0.6529710922918662, + "grad_norm": 0.79296875, + "learning_rate": 0.00013660374811005422, + "loss": 0.8579, + "step": 25430 + }, + { + "epoch": 0.6529967694877881, + "grad_norm": 0.77734375, + "learning_rate": 0.00013659959368700564, + "loss": 0.896, + "step": 25431 + }, + { + "epoch": 0.6530224466837099, + "grad_norm": 0.7578125, + "learning_rate": 0.00013659543919101734, + "loss": 0.7598, + "step": 25432 + }, + { + "epoch": 0.6530481238796317, + "grad_norm": 0.734375, + "learning_rate": 0.00013659128462209768, + "loss": 0.7557, + "step": 25433 + }, + { + "epoch": 0.6530738010755536, + "grad_norm": 0.71484375, + "learning_rate": 0.00013658712998025488, + "loss": 0.8349, + "step": 25434 + }, + { + "epoch": 0.6530994782714754, + "grad_norm": 0.78515625, + "learning_rate": 0.00013658297526549724, + "loss": 0.8321, + "step": 25435 + }, + { + "epoch": 0.6531251554673971, + "grad_norm": 0.77734375, + "learning_rate": 0.00013657882047783302, + "loss": 0.8802, + "step": 25436 + }, + { + "epoch": 0.653150832663319, + "grad_norm": 0.75390625, + "learning_rate": 0.0001365746656172705, + "loss": 0.8313, + "step": 25437 + }, + { + "epoch": 0.6531765098592408, + "grad_norm": 0.76171875, + "learning_rate": 0.00013657051068381802, + "loss": 0.8839, + "step": 25438 + }, + { + "epoch": 0.6532021870551626, + "grad_norm": 0.796875, + "learning_rate": 0.0001365663556774838, + "loss": 0.8775, + "step": 25439 + }, + { + "epoch": 0.6532278642510845, + "grad_norm": 0.8359375, + "learning_rate": 0.00013656220059827612, + "loss": 0.8998, + "step": 25440 + }, + { + "epoch": 0.6532535414470063, + "grad_norm": 0.8125, + "learning_rate": 0.0001365580454462033, + "loss": 0.7981, + "step": 25441 + }, + { + "epoch": 0.6532792186429282, + "grad_norm": 0.80078125, + "learning_rate": 0.00013655389022127357, + "loss": 0.7489, + "step": 25442 + }, + { + "epoch": 0.6533048958388499, + "grad_norm": 0.71875, + "learning_rate": 0.00013654973492349527, + "loss": 0.7028, + "step": 25443 + }, + { + "epoch": 0.6533305730347717, + "grad_norm": 0.84765625, + "learning_rate": 0.00013654557955287662, + "loss": 0.8907, + "step": 25444 + }, + { + "epoch": 0.6533562502306935, + "grad_norm": 0.77734375, + "learning_rate": 0.00013654142410942594, + "loss": 0.906, + "step": 25445 + }, + { + "epoch": 0.6533819274266154, + "grad_norm": 0.84375, + "learning_rate": 0.0001365372685931515, + "loss": 0.8997, + "step": 25446 + }, + { + "epoch": 0.6534076046225372, + "grad_norm": 0.75, + "learning_rate": 0.0001365331130040616, + "loss": 0.7619, + "step": 25447 + }, + { + "epoch": 0.653433281818459, + "grad_norm": 0.78125, + "learning_rate": 0.00013652895734216446, + "loss": 0.9015, + "step": 25448 + }, + { + "epoch": 0.6534589590143808, + "grad_norm": 0.78125, + "learning_rate": 0.00013652480160746846, + "loss": 0.8512, + "step": 25449 + }, + { + "epoch": 0.6534846362103026, + "grad_norm": 1.2421875, + "learning_rate": 0.00013652064579998182, + "loss": 0.8376, + "step": 25450 + }, + { + "epoch": 0.6535103134062245, + "grad_norm": 0.76953125, + "learning_rate": 0.0001365164899197128, + "loss": 0.8335, + "step": 25451 + }, + { + "epoch": 0.6535359906021463, + "grad_norm": 0.75, + "learning_rate": 0.00013651233396666976, + "loss": 0.9078, + "step": 25452 + }, + { + "epoch": 0.6535616677980681, + "grad_norm": 0.71875, + "learning_rate": 0.00013650817794086091, + "loss": 0.8774, + "step": 25453 + }, + { + "epoch": 0.65358734499399, + "grad_norm": 0.80078125, + "learning_rate": 0.00013650402184229457, + "loss": 0.7892, + "step": 25454 + }, + { + "epoch": 0.6536130221899118, + "grad_norm": 0.79296875, + "learning_rate": 0.00013649986567097901, + "loss": 0.8263, + "step": 25455 + }, + { + "epoch": 0.6536386993858335, + "grad_norm": 0.828125, + "learning_rate": 0.0001364957094269225, + "loss": 0.8426, + "step": 25456 + }, + { + "epoch": 0.6536643765817554, + "grad_norm": 0.80078125, + "learning_rate": 0.00013649155311013336, + "loss": 0.8135, + "step": 25457 + }, + { + "epoch": 0.6536900537776772, + "grad_norm": 0.75390625, + "learning_rate": 0.00013648739672061984, + "loss": 0.8121, + "step": 25458 + }, + { + "epoch": 0.653715730973599, + "grad_norm": 0.72265625, + "learning_rate": 0.0001364832402583902, + "loss": 0.7698, + "step": 25459 + }, + { + "epoch": 0.6537414081695209, + "grad_norm": 0.7890625, + "learning_rate": 0.00013647908372345285, + "loss": 0.8363, + "step": 25460 + }, + { + "epoch": 0.6537670853654427, + "grad_norm": 0.71484375, + "learning_rate": 0.00013647492711581588, + "loss": 0.7311, + "step": 25461 + }, + { + "epoch": 0.6537927625613645, + "grad_norm": 0.84375, + "learning_rate": 0.00013647077043548773, + "loss": 0.7691, + "step": 25462 + }, + { + "epoch": 0.6538184397572863, + "grad_norm": 0.8046875, + "learning_rate": 0.00013646661368247663, + "loss": 0.8763, + "step": 25463 + }, + { + "epoch": 0.6538441169532081, + "grad_norm": 0.765625, + "learning_rate": 0.00013646245685679082, + "loss": 0.755, + "step": 25464 + }, + { + "epoch": 0.6538697941491299, + "grad_norm": 0.73828125, + "learning_rate": 0.00013645829995843865, + "loss": 0.8856, + "step": 25465 + }, + { + "epoch": 0.6538954713450518, + "grad_norm": 0.81640625, + "learning_rate": 0.0001364541429874284, + "loss": 0.8195, + "step": 25466 + }, + { + "epoch": 0.6539211485409736, + "grad_norm": 0.80078125, + "learning_rate": 0.00013644998594376835, + "loss": 0.909, + "step": 25467 + }, + { + "epoch": 0.6539468257368954, + "grad_norm": 0.86328125, + "learning_rate": 0.00013644582882746671, + "loss": 0.8429, + "step": 25468 + }, + { + "epoch": 0.6539725029328172, + "grad_norm": 0.7890625, + "learning_rate": 0.00013644167163853188, + "loss": 0.824, + "step": 25469 + }, + { + "epoch": 0.653998180128739, + "grad_norm": 0.8125, + "learning_rate": 0.00013643751437697205, + "loss": 0.9275, + "step": 25470 + }, + { + "epoch": 0.6540238573246608, + "grad_norm": 0.7421875, + "learning_rate": 0.0001364333570427956, + "loss": 0.8452, + "step": 25471 + }, + { + "epoch": 0.6540495345205827, + "grad_norm": 0.71484375, + "learning_rate": 0.00013642919963601072, + "loss": 0.8607, + "step": 25472 + }, + { + "epoch": 0.6540752117165045, + "grad_norm": 0.81640625, + "learning_rate": 0.00013642504215662576, + "loss": 0.9245, + "step": 25473 + }, + { + "epoch": 0.6541008889124263, + "grad_norm": 0.77734375, + "learning_rate": 0.00013642088460464894, + "loss": 0.8505, + "step": 25474 + }, + { + "epoch": 0.6541265661083482, + "grad_norm": 0.734375, + "learning_rate": 0.0001364167269800886, + "loss": 0.8338, + "step": 25475 + }, + { + "epoch": 0.6541522433042699, + "grad_norm": 0.765625, + "learning_rate": 0.00013641256928295303, + "loss": 0.8112, + "step": 25476 + }, + { + "epoch": 0.6541779205001917, + "grad_norm": 0.77734375, + "learning_rate": 0.0001364084115132505, + "loss": 0.8603, + "step": 25477 + }, + { + "epoch": 0.6542035976961136, + "grad_norm": 0.75, + "learning_rate": 0.00013640425367098926, + "loss": 0.8957, + "step": 25478 + }, + { + "epoch": 0.6542292748920354, + "grad_norm": 0.79296875, + "learning_rate": 0.00013640009575617768, + "loss": 0.8923, + "step": 25479 + }, + { + "epoch": 0.6542549520879573, + "grad_norm": 0.80859375, + "learning_rate": 0.00013639593776882395, + "loss": 0.8647, + "step": 25480 + }, + { + "epoch": 0.6542806292838791, + "grad_norm": 0.80078125, + "learning_rate": 0.00013639177970893644, + "loss": 0.8771, + "step": 25481 + }, + { + "epoch": 0.6543063064798009, + "grad_norm": 0.82421875, + "learning_rate": 0.0001363876215765234, + "loss": 0.7829, + "step": 25482 + }, + { + "epoch": 0.6543319836757227, + "grad_norm": 0.703125, + "learning_rate": 0.0001363834633715931, + "loss": 0.8555, + "step": 25483 + }, + { + "epoch": 0.6543576608716445, + "grad_norm": 0.79296875, + "learning_rate": 0.00013637930509415383, + "loss": 0.8296, + "step": 25484 + }, + { + "epoch": 0.6543833380675663, + "grad_norm": 0.80078125, + "learning_rate": 0.00013637514674421392, + "loss": 0.9583, + "step": 25485 + }, + { + "epoch": 0.6544090152634882, + "grad_norm": 0.7578125, + "learning_rate": 0.00013637098832178164, + "loss": 0.7909, + "step": 25486 + }, + { + "epoch": 0.65443469245941, + "grad_norm": 0.8359375, + "learning_rate": 0.00013636682982686522, + "loss": 1.0382, + "step": 25487 + }, + { + "epoch": 0.6544603696553318, + "grad_norm": 0.7734375, + "learning_rate": 0.000136362671259473, + "loss": 0.7375, + "step": 25488 + }, + { + "epoch": 0.6544860468512536, + "grad_norm": 0.78515625, + "learning_rate": 0.0001363585126196133, + "loss": 0.8039, + "step": 25489 + }, + { + "epoch": 0.6545117240471754, + "grad_norm": 0.859375, + "learning_rate": 0.00013635435390729433, + "loss": 0.9324, + "step": 25490 + }, + { + "epoch": 0.6545374012430972, + "grad_norm": 0.73046875, + "learning_rate": 0.00013635019512252443, + "loss": 0.8521, + "step": 25491 + }, + { + "epoch": 0.6545630784390191, + "grad_norm": 0.765625, + "learning_rate": 0.00013634603626531188, + "loss": 0.9122, + "step": 25492 + }, + { + "epoch": 0.6545887556349409, + "grad_norm": 0.76171875, + "learning_rate": 0.00013634187733566497, + "loss": 0.9341, + "step": 25493 + }, + { + "epoch": 0.6546144328308627, + "grad_norm": 0.82421875, + "learning_rate": 0.00013633771833359196, + "loss": 0.9222, + "step": 25494 + }, + { + "epoch": 0.6546401100267846, + "grad_norm": 0.765625, + "learning_rate": 0.00013633355925910117, + "loss": 0.7926, + "step": 25495 + }, + { + "epoch": 0.6546657872227063, + "grad_norm": 0.8203125, + "learning_rate": 0.0001363294001122009, + "loss": 0.777, + "step": 25496 + }, + { + "epoch": 0.6546914644186281, + "grad_norm": 0.78515625, + "learning_rate": 0.00013632524089289937, + "loss": 0.9196, + "step": 25497 + }, + { + "epoch": 0.65471714161455, + "grad_norm": 0.81640625, + "learning_rate": 0.00013632108160120497, + "loss": 0.9141, + "step": 25498 + }, + { + "epoch": 0.6547428188104718, + "grad_norm": 0.78515625, + "learning_rate": 0.0001363169222371259, + "loss": 0.9346, + "step": 25499 + }, + { + "epoch": 0.6547684960063936, + "grad_norm": 0.7734375, + "learning_rate": 0.00013631276280067048, + "loss": 0.8939, + "step": 25500 + }, + { + "epoch": 0.6547941732023155, + "grad_norm": 0.7734375, + "learning_rate": 0.00013630860329184703, + "loss": 0.9627, + "step": 25501 + }, + { + "epoch": 0.6548198503982373, + "grad_norm": 0.73046875, + "learning_rate": 0.0001363044437106638, + "loss": 0.7876, + "step": 25502 + }, + { + "epoch": 0.654845527594159, + "grad_norm": 0.83984375, + "learning_rate": 0.0001363002840571291, + "loss": 0.8848, + "step": 25503 + }, + { + "epoch": 0.6548712047900809, + "grad_norm": 0.74609375, + "learning_rate": 0.00013629612433125118, + "loss": 0.8252, + "step": 25504 + }, + { + "epoch": 0.6548968819860027, + "grad_norm": 0.7890625, + "learning_rate": 0.00013629196453303837, + "loss": 0.9021, + "step": 25505 + }, + { + "epoch": 0.6549225591819245, + "grad_norm": 0.77734375, + "learning_rate": 0.000136287804662499, + "loss": 0.7657, + "step": 25506 + }, + { + "epoch": 0.6549482363778464, + "grad_norm": 0.79296875, + "learning_rate": 0.0001362836447196413, + "loss": 0.856, + "step": 25507 + }, + { + "epoch": 0.6549739135737682, + "grad_norm": 0.76953125, + "learning_rate": 0.00013627948470447353, + "loss": 0.7844, + "step": 25508 + }, + { + "epoch": 0.6549995907696899, + "grad_norm": 0.78515625, + "learning_rate": 0.00013627532461700406, + "loss": 0.8667, + "step": 25509 + }, + { + "epoch": 0.6550252679656118, + "grad_norm": 0.76953125, + "learning_rate": 0.00013627116445724113, + "loss": 0.8562, + "step": 25510 + }, + { + "epoch": 0.6550509451615336, + "grad_norm": 0.80078125, + "learning_rate": 0.00013626700422519306, + "loss": 0.8307, + "step": 25511 + }, + { + "epoch": 0.6550766223574555, + "grad_norm": 0.72265625, + "learning_rate": 0.0001362628439208681, + "loss": 0.8399, + "step": 25512 + }, + { + "epoch": 0.6551022995533773, + "grad_norm": 0.7890625, + "learning_rate": 0.00013625868354427462, + "loss": 0.7886, + "step": 25513 + }, + { + "epoch": 0.6551279767492991, + "grad_norm": 0.87890625, + "learning_rate": 0.0001362545230954208, + "loss": 0.6971, + "step": 25514 + }, + { + "epoch": 0.655153653945221, + "grad_norm": 0.79296875, + "learning_rate": 0.00013625036257431503, + "loss": 0.684, + "step": 25515 + }, + { + "epoch": 0.6551793311411427, + "grad_norm": 0.796875, + "learning_rate": 0.00013624620198096556, + "loss": 0.849, + "step": 25516 + }, + { + "epoch": 0.6552050083370645, + "grad_norm": 1.09375, + "learning_rate": 0.00013624204131538066, + "loss": 0.8005, + "step": 25517 + }, + { + "epoch": 0.6552306855329864, + "grad_norm": 0.796875, + "learning_rate": 0.00013623788057756864, + "loss": 0.7337, + "step": 25518 + }, + { + "epoch": 0.6552563627289082, + "grad_norm": 0.82421875, + "learning_rate": 0.00013623371976753784, + "loss": 0.7276, + "step": 25519 + }, + { + "epoch": 0.65528203992483, + "grad_norm": 0.8203125, + "learning_rate": 0.00013622955888529647, + "loss": 0.8791, + "step": 25520 + }, + { + "epoch": 0.6553077171207519, + "grad_norm": 0.8046875, + "learning_rate": 0.0001362253979308529, + "loss": 0.9121, + "step": 25521 + }, + { + "epoch": 0.6553333943166737, + "grad_norm": 0.76953125, + "learning_rate": 0.00013622123690421536, + "loss": 0.7913, + "step": 25522 + }, + { + "epoch": 0.6553590715125954, + "grad_norm": 0.7890625, + "learning_rate": 0.00013621707580539213, + "loss": 0.902, + "step": 25523 + }, + { + "epoch": 0.6553847487085173, + "grad_norm": 0.83203125, + "learning_rate": 0.00013621291463439158, + "loss": 0.7166, + "step": 25524 + }, + { + "epoch": 0.6554104259044391, + "grad_norm": 0.83203125, + "learning_rate": 0.00013620875339122196, + "loss": 0.8008, + "step": 25525 + }, + { + "epoch": 0.6554361031003609, + "grad_norm": 0.80859375, + "learning_rate": 0.00013620459207589158, + "loss": 0.8952, + "step": 25526 + }, + { + "epoch": 0.6554617802962828, + "grad_norm": 0.765625, + "learning_rate": 0.00013620043068840871, + "loss": 0.6721, + "step": 25527 + }, + { + "epoch": 0.6554874574922046, + "grad_norm": 0.76953125, + "learning_rate": 0.00013619626922878162, + "loss": 0.897, + "step": 25528 + }, + { + "epoch": 0.6555131346881263, + "grad_norm": 0.74609375, + "learning_rate": 0.00013619210769701866, + "loss": 0.8233, + "step": 25529 + }, + { + "epoch": 0.6555388118840482, + "grad_norm": 0.87890625, + "learning_rate": 0.00013618794609312812, + "loss": 0.92, + "step": 25530 + }, + { + "epoch": 0.65556448907997, + "grad_norm": 0.7890625, + "learning_rate": 0.00013618378441711824, + "loss": 0.8461, + "step": 25531 + }, + { + "epoch": 0.6555901662758918, + "grad_norm": 1.171875, + "learning_rate": 0.00013617962266899736, + "loss": 0.8406, + "step": 25532 + }, + { + "epoch": 0.6556158434718137, + "grad_norm": 0.84375, + "learning_rate": 0.00013617546084877374, + "loss": 0.7566, + "step": 25533 + }, + { + "epoch": 0.6556415206677355, + "grad_norm": 0.8203125, + "learning_rate": 0.0001361712989564557, + "loss": 0.8248, + "step": 25534 + }, + { + "epoch": 0.6556671978636573, + "grad_norm": 0.69921875, + "learning_rate": 0.00013616713699205155, + "loss": 0.8019, + "step": 25535 + }, + { + "epoch": 0.6556928750595791, + "grad_norm": 0.765625, + "learning_rate": 0.00013616297495556954, + "loss": 0.8019, + "step": 25536 + }, + { + "epoch": 0.6557185522555009, + "grad_norm": 0.703125, + "learning_rate": 0.000136158812847018, + "loss": 0.6367, + "step": 25537 + }, + { + "epoch": 0.6557442294514227, + "grad_norm": 0.80859375, + "learning_rate": 0.00013615465066640523, + "loss": 0.8964, + "step": 25538 + }, + { + "epoch": 0.6557699066473446, + "grad_norm": 0.7109375, + "learning_rate": 0.00013615048841373947, + "loss": 0.8376, + "step": 25539 + }, + { + "epoch": 0.6557955838432664, + "grad_norm": 0.69921875, + "learning_rate": 0.0001361463260890291, + "loss": 0.7495, + "step": 25540 + }, + { + "epoch": 0.6558212610391883, + "grad_norm": 0.69140625, + "learning_rate": 0.0001361421636922823, + "loss": 0.8548, + "step": 25541 + }, + { + "epoch": 0.6558469382351101, + "grad_norm": 0.76171875, + "learning_rate": 0.0001361380012235075, + "loss": 0.9724, + "step": 25542 + }, + { + "epoch": 0.6558726154310318, + "grad_norm": 0.78515625, + "learning_rate": 0.00013613383868271288, + "loss": 0.9534, + "step": 25543 + }, + { + "epoch": 0.6558982926269536, + "grad_norm": 0.84375, + "learning_rate": 0.0001361296760699068, + "loss": 0.8691, + "step": 25544 + }, + { + "epoch": 0.6559239698228755, + "grad_norm": 0.8828125, + "learning_rate": 0.00013612551338509755, + "loss": 0.8515, + "step": 25545 + }, + { + "epoch": 0.6559496470187973, + "grad_norm": 0.97265625, + "learning_rate": 0.0001361213506282934, + "loss": 0.9065, + "step": 25546 + }, + { + "epoch": 0.6559753242147192, + "grad_norm": 0.82421875, + "learning_rate": 0.00013611718779950267, + "loss": 0.8624, + "step": 25547 + }, + { + "epoch": 0.656001001410641, + "grad_norm": 0.80859375, + "learning_rate": 0.00013611302489873364, + "loss": 0.8034, + "step": 25548 + }, + { + "epoch": 0.6560266786065627, + "grad_norm": 0.7421875, + "learning_rate": 0.0001361088619259946, + "loss": 0.8043, + "step": 25549 + }, + { + "epoch": 0.6560523558024846, + "grad_norm": 0.8359375, + "learning_rate": 0.00013610469888129386, + "loss": 0.8244, + "step": 25550 + }, + { + "epoch": 0.6560780329984064, + "grad_norm": 0.7734375, + "learning_rate": 0.00013610053576463974, + "loss": 0.9414, + "step": 25551 + }, + { + "epoch": 0.6561037101943282, + "grad_norm": 0.7265625, + "learning_rate": 0.00013609637257604054, + "loss": 0.7471, + "step": 25552 + }, + { + "epoch": 0.6561293873902501, + "grad_norm": 0.8125, + "learning_rate": 0.00013609220931550447, + "loss": 0.8064, + "step": 25553 + }, + { + "epoch": 0.6561550645861719, + "grad_norm": 0.7734375, + "learning_rate": 0.00013608804598303994, + "loss": 0.8551, + "step": 25554 + }, + { + "epoch": 0.6561807417820937, + "grad_norm": 0.76953125, + "learning_rate": 0.00013608388257865513, + "loss": 0.9071, + "step": 25555 + }, + { + "epoch": 0.6562064189780155, + "grad_norm": 0.78125, + "learning_rate": 0.00013607971910235847, + "loss": 0.7457, + "step": 25556 + }, + { + "epoch": 0.6562320961739373, + "grad_norm": 0.765625, + "learning_rate": 0.00013607555555415814, + "loss": 0.7585, + "step": 25557 + }, + { + "epoch": 0.6562577733698591, + "grad_norm": 0.796875, + "learning_rate": 0.00013607139193406252, + "loss": 0.7372, + "step": 25558 + }, + { + "epoch": 0.656283450565781, + "grad_norm": 0.76171875, + "learning_rate": 0.00013606722824207988, + "loss": 0.8071, + "step": 25559 + }, + { + "epoch": 0.6563091277617028, + "grad_norm": 0.82421875, + "learning_rate": 0.00013606306447821846, + "loss": 0.8588, + "step": 25560 + }, + { + "epoch": 0.6563348049576246, + "grad_norm": 0.734375, + "learning_rate": 0.00013605890064248666, + "loss": 0.7999, + "step": 25561 + }, + { + "epoch": 0.6563604821535465, + "grad_norm": 0.85546875, + "learning_rate": 0.0001360547367348927, + "loss": 0.8295, + "step": 25562 + }, + { + "epoch": 0.6563861593494682, + "grad_norm": 0.796875, + "learning_rate": 0.0001360505727554449, + "loss": 0.9251, + "step": 25563 + }, + { + "epoch": 0.65641183654539, + "grad_norm": 0.73828125, + "learning_rate": 0.00013604640870415157, + "loss": 0.7227, + "step": 25564 + }, + { + "epoch": 0.6564375137413119, + "grad_norm": 0.87890625, + "learning_rate": 0.00013604224458102103, + "loss": 0.8035, + "step": 25565 + }, + { + "epoch": 0.6564631909372337, + "grad_norm": 0.8671875, + "learning_rate": 0.00013603808038606155, + "loss": 0.8197, + "step": 25566 + }, + { + "epoch": 0.6564888681331555, + "grad_norm": 0.80859375, + "learning_rate": 0.0001360339161192814, + "loss": 0.8398, + "step": 25567 + }, + { + "epoch": 0.6565145453290774, + "grad_norm": 1.140625, + "learning_rate": 0.0001360297517806889, + "loss": 0.8974, + "step": 25568 + }, + { + "epoch": 0.6565402225249991, + "grad_norm": 0.76953125, + "learning_rate": 0.00013602558737029242, + "loss": 0.899, + "step": 25569 + }, + { + "epoch": 0.6565658997209209, + "grad_norm": 0.8125, + "learning_rate": 0.00013602142288810017, + "loss": 0.8617, + "step": 25570 + }, + { + "epoch": 0.6565915769168428, + "grad_norm": 0.75, + "learning_rate": 0.00013601725833412045, + "loss": 0.8659, + "step": 25571 + }, + { + "epoch": 0.6566172541127646, + "grad_norm": 0.8125, + "learning_rate": 0.00013601309370836164, + "loss": 0.8027, + "step": 25572 + }, + { + "epoch": 0.6566429313086865, + "grad_norm": 0.765625, + "learning_rate": 0.00013600892901083193, + "loss": 0.786, + "step": 25573 + }, + { + "epoch": 0.6566686085046083, + "grad_norm": 0.80859375, + "learning_rate": 0.00013600476424153972, + "loss": 0.8731, + "step": 25574 + }, + { + "epoch": 0.6566942857005301, + "grad_norm": 0.8203125, + "learning_rate": 0.00013600059940049327, + "loss": 0.8126, + "step": 25575 + }, + { + "epoch": 0.6567199628964518, + "grad_norm": 0.70703125, + "learning_rate": 0.0001359964344877008, + "loss": 0.7638, + "step": 25576 + }, + { + "epoch": 0.6567456400923737, + "grad_norm": 0.81640625, + "learning_rate": 0.00013599226950317077, + "loss": 0.9317, + "step": 25577 + }, + { + "epoch": 0.6567713172882955, + "grad_norm": 0.734375, + "learning_rate": 0.00013598810444691138, + "loss": 0.7859, + "step": 25578 + }, + { + "epoch": 0.6567969944842174, + "grad_norm": 0.7265625, + "learning_rate": 0.00013598393931893093, + "loss": 0.7665, + "step": 25579 + }, + { + "epoch": 0.6568226716801392, + "grad_norm": 0.8203125, + "learning_rate": 0.00013597977411923778, + "loss": 0.8678, + "step": 25580 + }, + { + "epoch": 0.656848348876061, + "grad_norm": 0.796875, + "learning_rate": 0.00013597560884784016, + "loss": 0.7489, + "step": 25581 + }, + { + "epoch": 0.6568740260719829, + "grad_norm": 0.796875, + "learning_rate": 0.00013597144350474639, + "loss": 0.7766, + "step": 25582 + }, + { + "epoch": 0.6568997032679046, + "grad_norm": 0.8359375, + "learning_rate": 0.00013596727808996481, + "loss": 0.7766, + "step": 25583 + }, + { + "epoch": 0.6569253804638264, + "grad_norm": 0.83203125, + "learning_rate": 0.00013596311260350366, + "loss": 0.8359, + "step": 25584 + }, + { + "epoch": 0.6569510576597483, + "grad_norm": 0.8359375, + "learning_rate": 0.0001359589470453713, + "loss": 0.9078, + "step": 25585 + }, + { + "epoch": 0.6569767348556701, + "grad_norm": 0.7734375, + "learning_rate": 0.000135954781415576, + "loss": 0.7915, + "step": 25586 + }, + { + "epoch": 0.6570024120515919, + "grad_norm": 0.77734375, + "learning_rate": 0.00013595061571412609, + "loss": 0.9107, + "step": 25587 + }, + { + "epoch": 0.6570280892475138, + "grad_norm": 0.76953125, + "learning_rate": 0.00013594644994102982, + "loss": 0.8064, + "step": 25588 + }, + { + "epoch": 0.6570537664434355, + "grad_norm": 0.70703125, + "learning_rate": 0.00013594228409629551, + "loss": 0.865, + "step": 25589 + }, + { + "epoch": 0.6570794436393573, + "grad_norm": 0.78515625, + "learning_rate": 0.00013593811817993152, + "loss": 0.736, + "step": 25590 + }, + { + "epoch": 0.6571051208352792, + "grad_norm": 0.70703125, + "learning_rate": 0.00013593395219194611, + "loss": 0.827, + "step": 25591 + }, + { + "epoch": 0.657130798031201, + "grad_norm": 0.76171875, + "learning_rate": 0.00013592978613234753, + "loss": 0.8767, + "step": 25592 + }, + { + "epoch": 0.6571564752271228, + "grad_norm": 0.78515625, + "learning_rate": 0.00013592562000114416, + "loss": 0.8366, + "step": 25593 + }, + { + "epoch": 0.6571821524230447, + "grad_norm": 1.2421875, + "learning_rate": 0.00013592145379834425, + "loss": 0.7554, + "step": 25594 + }, + { + "epoch": 0.6572078296189665, + "grad_norm": 0.796875, + "learning_rate": 0.00013591728752395616, + "loss": 0.8209, + "step": 25595 + }, + { + "epoch": 0.6572335068148882, + "grad_norm": 0.79296875, + "learning_rate": 0.00013591312117798815, + "loss": 0.861, + "step": 25596 + }, + { + "epoch": 0.6572591840108101, + "grad_norm": 0.76953125, + "learning_rate": 0.0001359089547604485, + "loss": 0.7638, + "step": 25597 + }, + { + "epoch": 0.6572848612067319, + "grad_norm": 0.765625, + "learning_rate": 0.0001359047882713456, + "loss": 0.8931, + "step": 25598 + }, + { + "epoch": 0.6573105384026537, + "grad_norm": 0.796875, + "learning_rate": 0.0001359006217106877, + "loss": 0.8482, + "step": 25599 + }, + { + "epoch": 0.6573362155985756, + "grad_norm": 0.8125, + "learning_rate": 0.00013589645507848305, + "loss": 0.8979, + "step": 25600 + }, + { + "epoch": 0.6573618927944974, + "grad_norm": 0.82421875, + "learning_rate": 0.00013589228837474005, + "loss": 0.851, + "step": 25601 + }, + { + "epoch": 0.6573875699904193, + "grad_norm": 0.78515625, + "learning_rate": 0.00013588812159946696, + "loss": 0.7947, + "step": 25602 + }, + { + "epoch": 0.657413247186341, + "grad_norm": 0.78125, + "learning_rate": 0.00013588395475267205, + "loss": 0.9239, + "step": 25603 + }, + { + "epoch": 0.6574389243822628, + "grad_norm": 0.76171875, + "learning_rate": 0.0001358797878343637, + "loss": 0.8734, + "step": 25604 + }, + { + "epoch": 0.6574646015781846, + "grad_norm": 0.78125, + "learning_rate": 0.00013587562084455018, + "loss": 0.8259, + "step": 25605 + }, + { + "epoch": 0.6574902787741065, + "grad_norm": 0.81640625, + "learning_rate": 0.00013587145378323974, + "loss": 0.8445, + "step": 25606 + }, + { + "epoch": 0.6575159559700283, + "grad_norm": 0.69140625, + "learning_rate": 0.0001358672866504408, + "loss": 0.9848, + "step": 25607 + }, + { + "epoch": 0.6575416331659502, + "grad_norm": 0.8125, + "learning_rate": 0.00013586311944616152, + "loss": 0.8696, + "step": 25608 + }, + { + "epoch": 0.6575673103618719, + "grad_norm": 0.78125, + "learning_rate": 0.00013585895217041035, + "loss": 0.7143, + "step": 25609 + }, + { + "epoch": 0.6575929875577937, + "grad_norm": 0.7734375, + "learning_rate": 0.0001358547848231955, + "loss": 0.8156, + "step": 25610 + }, + { + "epoch": 0.6576186647537156, + "grad_norm": 0.75, + "learning_rate": 0.0001358506174045253, + "loss": 0.8822, + "step": 25611 + }, + { + "epoch": 0.6576443419496374, + "grad_norm": 0.77734375, + "learning_rate": 0.00013584644991440804, + "loss": 0.9456, + "step": 25612 + }, + { + "epoch": 0.6576700191455592, + "grad_norm": 0.80859375, + "learning_rate": 0.00013584228235285207, + "loss": 0.8427, + "step": 25613 + }, + { + "epoch": 0.6576956963414811, + "grad_norm": 0.765625, + "learning_rate": 0.00013583811471986566, + "loss": 0.7437, + "step": 25614 + }, + { + "epoch": 0.6577213735374029, + "grad_norm": 0.74609375, + "learning_rate": 0.00013583394701545713, + "loss": 0.9078, + "step": 25615 + }, + { + "epoch": 0.6577470507333246, + "grad_norm": 0.7734375, + "learning_rate": 0.00013582977923963475, + "loss": 0.7373, + "step": 25616 + }, + { + "epoch": 0.6577727279292465, + "grad_norm": 0.78125, + "learning_rate": 0.0001358256113924069, + "loss": 0.8509, + "step": 25617 + }, + { + "epoch": 0.6577984051251683, + "grad_norm": 0.71875, + "learning_rate": 0.00013582144347378184, + "loss": 0.7965, + "step": 25618 + }, + { + "epoch": 0.6578240823210901, + "grad_norm": 0.90625, + "learning_rate": 0.00013581727548376787, + "loss": 0.9578, + "step": 25619 + }, + { + "epoch": 0.657849759517012, + "grad_norm": 0.796875, + "learning_rate": 0.00013581310742237328, + "loss": 0.8443, + "step": 25620 + }, + { + "epoch": 0.6578754367129338, + "grad_norm": 0.8359375, + "learning_rate": 0.00013580893928960642, + "loss": 1.0388, + "step": 25621 + }, + { + "epoch": 0.6579011139088555, + "grad_norm": 0.77734375, + "learning_rate": 0.00013580477108547558, + "loss": 0.8751, + "step": 25622 + }, + { + "epoch": 0.6579267911047774, + "grad_norm": 0.73828125, + "learning_rate": 0.00013580060280998907, + "loss": 0.7617, + "step": 25623 + }, + { + "epoch": 0.6579524683006992, + "grad_norm": 0.8046875, + "learning_rate": 0.0001357964344631552, + "loss": 0.8926, + "step": 25624 + }, + { + "epoch": 0.657978145496621, + "grad_norm": 0.72265625, + "learning_rate": 0.00013579226604498224, + "loss": 0.8376, + "step": 25625 + }, + { + "epoch": 0.6580038226925429, + "grad_norm": 0.796875, + "learning_rate": 0.00013578809755547856, + "loss": 0.8405, + "step": 25626 + }, + { + "epoch": 0.6580294998884647, + "grad_norm": 0.765625, + "learning_rate": 0.0001357839289946524, + "loss": 0.8423, + "step": 25627 + }, + { + "epoch": 0.6580551770843865, + "grad_norm": 0.8125, + "learning_rate": 0.00013577976036251212, + "loss": 0.9177, + "step": 25628 + }, + { + "epoch": 0.6580808542803083, + "grad_norm": 0.7421875, + "learning_rate": 0.00013577559165906599, + "loss": 0.7035, + "step": 25629 + }, + { + "epoch": 0.6581065314762301, + "grad_norm": 0.74609375, + "learning_rate": 0.00013577142288432237, + "loss": 0.8574, + "step": 25630 + }, + { + "epoch": 0.6581322086721519, + "grad_norm": 0.80859375, + "learning_rate": 0.00013576725403828953, + "loss": 0.9205, + "step": 25631 + }, + { + "epoch": 0.6581578858680738, + "grad_norm": 0.8125, + "learning_rate": 0.0001357630851209758, + "loss": 0.8618, + "step": 25632 + }, + { + "epoch": 0.6581835630639956, + "grad_norm": 0.8125, + "learning_rate": 0.00013575891613238942, + "loss": 0.9491, + "step": 25633 + }, + { + "epoch": 0.6582092402599174, + "grad_norm": 0.76953125, + "learning_rate": 0.00013575474707253876, + "loss": 0.8183, + "step": 25634 + }, + { + "epoch": 0.6582349174558393, + "grad_norm": 0.83984375, + "learning_rate": 0.00013575057794143214, + "loss": 0.9315, + "step": 25635 + }, + { + "epoch": 0.658260594651761, + "grad_norm": 0.79296875, + "learning_rate": 0.00013574640873907786, + "loss": 0.7968, + "step": 25636 + }, + { + "epoch": 0.6582862718476828, + "grad_norm": 0.7734375, + "learning_rate": 0.0001357422394654842, + "loss": 0.9135, + "step": 25637 + }, + { + "epoch": 0.6583119490436047, + "grad_norm": 0.79296875, + "learning_rate": 0.00013573807012065952, + "loss": 0.9402, + "step": 25638 + }, + { + "epoch": 0.6583376262395265, + "grad_norm": 0.7578125, + "learning_rate": 0.00013573390070461205, + "loss": 0.8787, + "step": 25639 + }, + { + "epoch": 0.6583633034354484, + "grad_norm": 0.828125, + "learning_rate": 0.00013572973121735014, + "loss": 0.7565, + "step": 25640 + }, + { + "epoch": 0.6583889806313702, + "grad_norm": 0.71484375, + "learning_rate": 0.00013572556165888212, + "loss": 0.7647, + "step": 25641 + }, + { + "epoch": 0.6584146578272919, + "grad_norm": 0.84375, + "learning_rate": 0.00013572139202921627, + "loss": 0.8588, + "step": 25642 + }, + { + "epoch": 0.6584403350232138, + "grad_norm": 0.859375, + "learning_rate": 0.00013571722232836092, + "loss": 0.788, + "step": 25643 + }, + { + "epoch": 0.6584660122191356, + "grad_norm": 0.78515625, + "learning_rate": 0.0001357130525563244, + "loss": 0.9333, + "step": 25644 + }, + { + "epoch": 0.6584916894150574, + "grad_norm": 0.8203125, + "learning_rate": 0.00013570888271311496, + "loss": 0.8321, + "step": 25645 + }, + { + "epoch": 0.6585173666109793, + "grad_norm": 0.80078125, + "learning_rate": 0.00013570471279874096, + "loss": 0.8535, + "step": 25646 + }, + { + "epoch": 0.6585430438069011, + "grad_norm": 0.85546875, + "learning_rate": 0.00013570054281321069, + "loss": 0.9919, + "step": 25647 + }, + { + "epoch": 0.6585687210028229, + "grad_norm": 0.78125, + "learning_rate": 0.0001356963727565324, + "loss": 0.9916, + "step": 25648 + }, + { + "epoch": 0.6585943981987447, + "grad_norm": 0.7265625, + "learning_rate": 0.00013569220262871454, + "loss": 0.8211, + "step": 25649 + }, + { + "epoch": 0.6586200753946665, + "grad_norm": 0.796875, + "learning_rate": 0.00013568803242976533, + "loss": 0.7977, + "step": 25650 + }, + { + "epoch": 0.6586457525905883, + "grad_norm": 0.81640625, + "learning_rate": 0.00013568386215969308, + "loss": 0.9089, + "step": 25651 + }, + { + "epoch": 0.6586714297865102, + "grad_norm": 0.71484375, + "learning_rate": 0.00013567969181850614, + "loss": 0.8681, + "step": 25652 + }, + { + "epoch": 0.658697106982432, + "grad_norm": 0.75390625, + "learning_rate": 0.00013567552140621278, + "loss": 0.8417, + "step": 25653 + }, + { + "epoch": 0.6587227841783538, + "grad_norm": 0.734375, + "learning_rate": 0.00013567135092282132, + "loss": 0.7049, + "step": 25654 + }, + { + "epoch": 0.6587484613742757, + "grad_norm": 0.796875, + "learning_rate": 0.0001356671803683401, + "loss": 0.9245, + "step": 25655 + }, + { + "epoch": 0.6587741385701974, + "grad_norm": 0.953125, + "learning_rate": 0.0001356630097427774, + "loss": 0.8029, + "step": 25656 + }, + { + "epoch": 0.6587998157661192, + "grad_norm": 0.74609375, + "learning_rate": 0.00013565883904614157, + "loss": 0.7321, + "step": 25657 + }, + { + "epoch": 0.6588254929620411, + "grad_norm": 0.75390625, + "learning_rate": 0.00013565466827844083, + "loss": 0.8703, + "step": 25658 + }, + { + "epoch": 0.6588511701579629, + "grad_norm": 0.84765625, + "learning_rate": 0.0001356504974396836, + "loss": 0.92, + "step": 25659 + }, + { + "epoch": 0.6588768473538847, + "grad_norm": 0.80078125, + "learning_rate": 0.00013564632652987815, + "loss": 0.7837, + "step": 25660 + }, + { + "epoch": 0.6589025245498066, + "grad_norm": 0.8359375, + "learning_rate": 0.00013564215554903277, + "loss": 0.8987, + "step": 25661 + }, + { + "epoch": 0.6589282017457283, + "grad_norm": 0.71875, + "learning_rate": 0.0001356379844971558, + "loss": 0.9242, + "step": 25662 + }, + { + "epoch": 0.6589538789416501, + "grad_norm": 0.8046875, + "learning_rate": 0.00013563381337425558, + "loss": 1.0749, + "step": 25663 + }, + { + "epoch": 0.658979556137572, + "grad_norm": 0.8125, + "learning_rate": 0.00013562964218034036, + "loss": 0.9238, + "step": 25664 + }, + { + "epoch": 0.6590052333334938, + "grad_norm": 0.75390625, + "learning_rate": 0.00013562547091541848, + "loss": 0.7948, + "step": 25665 + }, + { + "epoch": 0.6590309105294156, + "grad_norm": 0.87109375, + "learning_rate": 0.00013562129957949825, + "loss": 0.7906, + "step": 25666 + }, + { + "epoch": 0.6590565877253375, + "grad_norm": 0.72265625, + "learning_rate": 0.000135617128172588, + "loss": 0.7191, + "step": 25667 + }, + { + "epoch": 0.6590822649212593, + "grad_norm": 0.75, + "learning_rate": 0.00013561295669469603, + "loss": 0.8556, + "step": 25668 + }, + { + "epoch": 0.659107942117181, + "grad_norm": 0.76171875, + "learning_rate": 0.00013560878514583065, + "loss": 0.9281, + "step": 25669 + }, + { + "epoch": 0.6591336193131029, + "grad_norm": 0.828125, + "learning_rate": 0.0001356046135260002, + "loss": 0.8755, + "step": 25670 + }, + { + "epoch": 0.6591592965090247, + "grad_norm": 0.828125, + "learning_rate": 0.00013560044183521294, + "loss": 0.8529, + "step": 25671 + }, + { + "epoch": 0.6591849737049466, + "grad_norm": 0.75, + "learning_rate": 0.0001355962700734772, + "loss": 0.7357, + "step": 25672 + }, + { + "epoch": 0.6592106509008684, + "grad_norm": 0.78125, + "learning_rate": 0.00013559209824080137, + "loss": 0.9173, + "step": 25673 + }, + { + "epoch": 0.6592363280967902, + "grad_norm": 0.7421875, + "learning_rate": 0.00013558792633719367, + "loss": 0.7893, + "step": 25674 + }, + { + "epoch": 0.6592620052927121, + "grad_norm": 0.796875, + "learning_rate": 0.00013558375436266244, + "loss": 0.9103, + "step": 25675 + }, + { + "epoch": 0.6592876824886338, + "grad_norm": 0.796875, + "learning_rate": 0.000135579582317216, + "loss": 0.9198, + "step": 25676 + }, + { + "epoch": 0.6593133596845556, + "grad_norm": 0.76171875, + "learning_rate": 0.00013557541020086267, + "loss": 0.9461, + "step": 25677 + }, + { + "epoch": 0.6593390368804775, + "grad_norm": 0.75390625, + "learning_rate": 0.00013557123801361077, + "loss": 0.8916, + "step": 25678 + }, + { + "epoch": 0.6593647140763993, + "grad_norm": 1.0859375, + "learning_rate": 0.00013556706575546863, + "loss": 0.979, + "step": 25679 + }, + { + "epoch": 0.6593903912723211, + "grad_norm": 0.81640625, + "learning_rate": 0.0001355628934264445, + "loss": 0.7707, + "step": 25680 + }, + { + "epoch": 0.659416068468243, + "grad_norm": 0.84375, + "learning_rate": 0.00013555872102654674, + "loss": 0.908, + "step": 25681 + }, + { + "epoch": 0.6594417456641647, + "grad_norm": 0.73046875, + "learning_rate": 0.00013555454855578364, + "loss": 0.7824, + "step": 25682 + }, + { + "epoch": 0.6594674228600865, + "grad_norm": 0.78515625, + "learning_rate": 0.0001355503760141636, + "loss": 0.811, + "step": 25683 + }, + { + "epoch": 0.6594931000560084, + "grad_norm": 0.828125, + "learning_rate": 0.00013554620340169483, + "loss": 0.8659, + "step": 25684 + }, + { + "epoch": 0.6595187772519302, + "grad_norm": 0.7734375, + "learning_rate": 0.0001355420307183857, + "loss": 0.7821, + "step": 25685 + }, + { + "epoch": 0.659544454447852, + "grad_norm": 0.7734375, + "learning_rate": 0.0001355378579642445, + "loss": 0.8256, + "step": 25686 + }, + { + "epoch": 0.6595701316437739, + "grad_norm": 0.83203125, + "learning_rate": 0.00013553368513927958, + "loss": 0.8457, + "step": 25687 + }, + { + "epoch": 0.6595958088396957, + "grad_norm": 0.7578125, + "learning_rate": 0.0001355295122434992, + "loss": 0.9246, + "step": 25688 + }, + { + "epoch": 0.6596214860356174, + "grad_norm": 0.80859375, + "learning_rate": 0.00013552533927691174, + "loss": 0.8852, + "step": 25689 + }, + { + "epoch": 0.6596471632315393, + "grad_norm": 0.81640625, + "learning_rate": 0.0001355211662395255, + "loss": 0.9251, + "step": 25690 + }, + { + "epoch": 0.6596728404274611, + "grad_norm": 0.71875, + "learning_rate": 0.00013551699313134875, + "loss": 0.8519, + "step": 25691 + }, + { + "epoch": 0.6596985176233829, + "grad_norm": 0.7890625, + "learning_rate": 0.00013551281995238988, + "loss": 0.8813, + "step": 25692 + }, + { + "epoch": 0.6597241948193048, + "grad_norm": 0.83984375, + "learning_rate": 0.00013550864670265711, + "loss": 0.8083, + "step": 25693 + }, + { + "epoch": 0.6597498720152266, + "grad_norm": 0.78515625, + "learning_rate": 0.00013550447338215885, + "loss": 0.7517, + "step": 25694 + }, + { + "epoch": 0.6597755492111484, + "grad_norm": 0.8125, + "learning_rate": 0.0001355002999909034, + "loss": 0.8995, + "step": 25695 + }, + { + "epoch": 0.6598012264070702, + "grad_norm": 0.7578125, + "learning_rate": 0.00013549612652889902, + "loss": 0.8192, + "step": 25696 + }, + { + "epoch": 0.659826903602992, + "grad_norm": 0.74609375, + "learning_rate": 0.0001354919529961541, + "loss": 0.903, + "step": 25697 + }, + { + "epoch": 0.6598525807989138, + "grad_norm": 0.83203125, + "learning_rate": 0.0001354877793926769, + "loss": 0.8829, + "step": 25698 + }, + { + "epoch": 0.6598782579948357, + "grad_norm": 0.76953125, + "learning_rate": 0.00013548360571847575, + "loss": 0.8324, + "step": 25699 + }, + { + "epoch": 0.6599039351907575, + "grad_norm": 0.75, + "learning_rate": 0.00013547943197355904, + "loss": 0.7717, + "step": 25700 + }, + { + "epoch": 0.6599296123866794, + "grad_norm": 0.83203125, + "learning_rate": 0.00013547525815793497, + "loss": 0.8133, + "step": 25701 + }, + { + "epoch": 0.6599552895826011, + "grad_norm": 0.75390625, + "learning_rate": 0.00013547108427161192, + "loss": 0.8104, + "step": 25702 + }, + { + "epoch": 0.6599809667785229, + "grad_norm": 0.74609375, + "learning_rate": 0.00013546691031459822, + "loss": 0.8196, + "step": 25703 + }, + { + "epoch": 0.6600066439744448, + "grad_norm": 0.8359375, + "learning_rate": 0.00013546273628690217, + "loss": 0.8905, + "step": 25704 + }, + { + "epoch": 0.6600323211703666, + "grad_norm": 0.796875, + "learning_rate": 0.0001354585621885321, + "loss": 0.9, + "step": 25705 + }, + { + "epoch": 0.6600579983662884, + "grad_norm": 0.75, + "learning_rate": 0.00013545438801949628, + "loss": 0.6657, + "step": 25706 + }, + { + "epoch": 0.6600836755622103, + "grad_norm": 0.859375, + "learning_rate": 0.0001354502137798031, + "loss": 0.7707, + "step": 25707 + }, + { + "epoch": 0.6601093527581321, + "grad_norm": 0.83203125, + "learning_rate": 0.00013544603946946085, + "loss": 0.8718, + "step": 25708 + }, + { + "epoch": 0.6601350299540538, + "grad_norm": 0.78515625, + "learning_rate": 0.0001354418650884778, + "loss": 0.9238, + "step": 25709 + }, + { + "epoch": 0.6601607071499757, + "grad_norm": 0.77734375, + "learning_rate": 0.00013543769063686237, + "loss": 0.7713, + "step": 25710 + }, + { + "epoch": 0.6601863843458975, + "grad_norm": 0.8359375, + "learning_rate": 0.0001354335161146228, + "loss": 0.8273, + "step": 25711 + }, + { + "epoch": 0.6602120615418193, + "grad_norm": 0.78125, + "learning_rate": 0.00013542934152176742, + "loss": 0.8279, + "step": 25712 + }, + { + "epoch": 0.6602377387377412, + "grad_norm": 0.78125, + "learning_rate": 0.0001354251668583046, + "loss": 0.884, + "step": 25713 + }, + { + "epoch": 0.660263415933663, + "grad_norm": 0.765625, + "learning_rate": 0.00013542099212424257, + "loss": 0.7808, + "step": 25714 + }, + { + "epoch": 0.6602890931295848, + "grad_norm": 0.76171875, + "learning_rate": 0.00013541681731958974, + "loss": 0.966, + "step": 25715 + }, + { + "epoch": 0.6603147703255066, + "grad_norm": 0.80859375, + "learning_rate": 0.0001354126424443544, + "loss": 0.8486, + "step": 25716 + }, + { + "epoch": 0.6603404475214284, + "grad_norm": 0.75, + "learning_rate": 0.00013540846749854484, + "loss": 0.7742, + "step": 25717 + }, + { + "epoch": 0.6603661247173502, + "grad_norm": 0.8046875, + "learning_rate": 0.00013540429248216942, + "loss": 0.8734, + "step": 25718 + }, + { + "epoch": 0.6603918019132721, + "grad_norm": 0.79296875, + "learning_rate": 0.00013540011739523642, + "loss": 0.7164, + "step": 25719 + }, + { + "epoch": 0.6604174791091939, + "grad_norm": 0.78515625, + "learning_rate": 0.0001353959422377542, + "loss": 0.9115, + "step": 25720 + }, + { + "epoch": 0.6604431563051157, + "grad_norm": 0.8203125, + "learning_rate": 0.00013539176700973107, + "loss": 0.7116, + "step": 25721 + }, + { + "epoch": 0.6604688335010375, + "grad_norm": 0.7578125, + "learning_rate": 0.0001353875917111753, + "loss": 0.7923, + "step": 25722 + }, + { + "epoch": 0.6604945106969593, + "grad_norm": 0.75390625, + "learning_rate": 0.00013538341634209534, + "loss": 0.7967, + "step": 25723 + }, + { + "epoch": 0.6605201878928811, + "grad_norm": 0.8671875, + "learning_rate": 0.00013537924090249937, + "loss": 0.8475, + "step": 25724 + }, + { + "epoch": 0.660545865088803, + "grad_norm": 0.88671875, + "learning_rate": 0.00013537506539239576, + "loss": 0.884, + "step": 25725 + }, + { + "epoch": 0.6605715422847248, + "grad_norm": 0.734375, + "learning_rate": 0.00013537088981179288, + "loss": 0.8427, + "step": 25726 + }, + { + "epoch": 0.6605972194806466, + "grad_norm": 0.8203125, + "learning_rate": 0.00013536671416069897, + "loss": 0.8941, + "step": 25727 + }, + { + "epoch": 0.6606228966765685, + "grad_norm": 0.7421875, + "learning_rate": 0.0001353625384391224, + "loss": 0.7128, + "step": 25728 + }, + { + "epoch": 0.6606485738724902, + "grad_norm": 0.75390625, + "learning_rate": 0.0001353583626470715, + "loss": 0.6822, + "step": 25729 + }, + { + "epoch": 0.660674251068412, + "grad_norm": 0.79296875, + "learning_rate": 0.00013535418678455458, + "loss": 0.972, + "step": 25730 + }, + { + "epoch": 0.6606999282643339, + "grad_norm": 0.79296875, + "learning_rate": 0.00013535001085157998, + "loss": 0.8549, + "step": 25731 + }, + { + "epoch": 0.6607256054602557, + "grad_norm": 0.8125, + "learning_rate": 0.00013534583484815595, + "loss": 0.7556, + "step": 25732 + }, + { + "epoch": 0.6607512826561776, + "grad_norm": 0.74609375, + "learning_rate": 0.00013534165877429087, + "loss": 0.8583, + "step": 25733 + }, + { + "epoch": 0.6607769598520994, + "grad_norm": 0.7890625, + "learning_rate": 0.0001353374826299931, + "loss": 1.0131, + "step": 25734 + }, + { + "epoch": 0.6608026370480212, + "grad_norm": 0.765625, + "learning_rate": 0.0001353333064152709, + "loss": 0.8174, + "step": 25735 + }, + { + "epoch": 0.660828314243943, + "grad_norm": 0.77734375, + "learning_rate": 0.0001353291301301326, + "loss": 0.8458, + "step": 25736 + }, + { + "epoch": 0.6608539914398648, + "grad_norm": 0.75390625, + "learning_rate": 0.00013532495377458655, + "loss": 0.7704, + "step": 25737 + }, + { + "epoch": 0.6608796686357866, + "grad_norm": 0.7421875, + "learning_rate": 0.00013532077734864105, + "loss": 0.8757, + "step": 25738 + }, + { + "epoch": 0.6609053458317085, + "grad_norm": 0.75390625, + "learning_rate": 0.00013531660085230442, + "loss": 0.8956, + "step": 25739 + }, + { + "epoch": 0.6609310230276303, + "grad_norm": 0.7578125, + "learning_rate": 0.00013531242428558503, + "loss": 0.8516, + "step": 25740 + }, + { + "epoch": 0.6609567002235521, + "grad_norm": 0.828125, + "learning_rate": 0.00013530824764849112, + "loss": 0.9739, + "step": 25741 + }, + { + "epoch": 0.6609823774194739, + "grad_norm": 0.8125, + "learning_rate": 0.0001353040709410311, + "loss": 0.8385, + "step": 25742 + }, + { + "epoch": 0.6610080546153957, + "grad_norm": 0.84375, + "learning_rate": 0.00013529989416321322, + "loss": 0.8525, + "step": 25743 + }, + { + "epoch": 0.6610337318113175, + "grad_norm": 0.7734375, + "learning_rate": 0.0001352957173150459, + "loss": 0.9517, + "step": 25744 + }, + { + "epoch": 0.6610594090072394, + "grad_norm": 0.8203125, + "learning_rate": 0.00013529154039653736, + "loss": 0.9762, + "step": 25745 + }, + { + "epoch": 0.6610850862031612, + "grad_norm": 0.7421875, + "learning_rate": 0.00013528736340769594, + "loss": 0.7462, + "step": 25746 + }, + { + "epoch": 0.661110763399083, + "grad_norm": 0.78515625, + "learning_rate": 0.00013528318634853004, + "loss": 0.8333, + "step": 25747 + }, + { + "epoch": 0.6611364405950049, + "grad_norm": 0.765625, + "learning_rate": 0.00013527900921904792, + "loss": 0.8793, + "step": 25748 + }, + { + "epoch": 0.6611621177909266, + "grad_norm": 0.75390625, + "learning_rate": 0.0001352748320192579, + "loss": 0.7812, + "step": 25749 + }, + { + "epoch": 0.6611877949868484, + "grad_norm": 0.75, + "learning_rate": 0.00013527065474916837, + "loss": 0.7192, + "step": 25750 + }, + { + "epoch": 0.6612134721827703, + "grad_norm": 0.78515625, + "learning_rate": 0.0001352664774087876, + "loss": 0.8636, + "step": 25751 + }, + { + "epoch": 0.6612391493786921, + "grad_norm": 0.74609375, + "learning_rate": 0.0001352622999981239, + "loss": 0.9525, + "step": 25752 + }, + { + "epoch": 0.6612648265746139, + "grad_norm": 0.875, + "learning_rate": 0.00013525812251718566, + "loss": 0.831, + "step": 25753 + }, + { + "epoch": 0.6612905037705358, + "grad_norm": 0.81640625, + "learning_rate": 0.00013525394496598112, + "loss": 0.8325, + "step": 25754 + }, + { + "epoch": 0.6613161809664576, + "grad_norm": 0.74609375, + "learning_rate": 0.00013524976734451867, + "loss": 0.7839, + "step": 25755 + }, + { + "epoch": 0.6613418581623793, + "grad_norm": 0.7734375, + "learning_rate": 0.00013524558965280665, + "loss": 0.8264, + "step": 25756 + }, + { + "epoch": 0.6613675353583012, + "grad_norm": 1.109375, + "learning_rate": 0.00013524141189085332, + "loss": 0.8477, + "step": 25757 + }, + { + "epoch": 0.661393212554223, + "grad_norm": 0.7421875, + "learning_rate": 0.00013523723405866705, + "loss": 0.8825, + "step": 25758 + }, + { + "epoch": 0.6614188897501448, + "grad_norm": 0.80859375, + "learning_rate": 0.00013523305615625614, + "loss": 0.7006, + "step": 25759 + }, + { + "epoch": 0.6614445669460667, + "grad_norm": 0.85546875, + "learning_rate": 0.00013522887818362894, + "loss": 0.796, + "step": 25760 + }, + { + "epoch": 0.6614702441419885, + "grad_norm": 0.78125, + "learning_rate": 0.0001352247001407938, + "loss": 0.8029, + "step": 25761 + }, + { + "epoch": 0.6614959213379102, + "grad_norm": 1.09375, + "learning_rate": 0.00013522052202775896, + "loss": 0.9925, + "step": 25762 + }, + { + "epoch": 0.6615215985338321, + "grad_norm": 0.8359375, + "learning_rate": 0.00013521634384453286, + "loss": 0.916, + "step": 25763 + }, + { + "epoch": 0.6615472757297539, + "grad_norm": 0.7890625, + "learning_rate": 0.00013521216559112372, + "loss": 0.829, + "step": 25764 + }, + { + "epoch": 0.6615729529256758, + "grad_norm": 0.734375, + "learning_rate": 0.00013520798726753995, + "loss": 0.752, + "step": 25765 + }, + { + "epoch": 0.6615986301215976, + "grad_norm": 0.84765625, + "learning_rate": 0.0001352038088737898, + "loss": 0.8807, + "step": 25766 + }, + { + "epoch": 0.6616243073175194, + "grad_norm": 0.765625, + "learning_rate": 0.00013519963040988165, + "loss": 0.8579, + "step": 25767 + }, + { + "epoch": 0.6616499845134413, + "grad_norm": 0.78125, + "learning_rate": 0.00013519545187582382, + "loss": 0.78, + "step": 25768 + }, + { + "epoch": 0.661675661709363, + "grad_norm": 0.8125, + "learning_rate": 0.00013519127327162468, + "loss": 0.8542, + "step": 25769 + }, + { + "epoch": 0.6617013389052848, + "grad_norm": 0.76171875, + "learning_rate": 0.00013518709459729245, + "loss": 0.9472, + "step": 25770 + }, + { + "epoch": 0.6617270161012067, + "grad_norm": 0.80078125, + "learning_rate": 0.00013518291585283554, + "loss": 0.984, + "step": 25771 + }, + { + "epoch": 0.6617526932971285, + "grad_norm": 0.796875, + "learning_rate": 0.00013517873703826228, + "loss": 0.9862, + "step": 25772 + }, + { + "epoch": 0.6617783704930503, + "grad_norm": 0.765625, + "learning_rate": 0.00013517455815358092, + "loss": 0.8502, + "step": 25773 + }, + { + "epoch": 0.6618040476889722, + "grad_norm": 0.76953125, + "learning_rate": 0.00013517037919879988, + "loss": 0.7846, + "step": 25774 + }, + { + "epoch": 0.661829724884894, + "grad_norm": 0.78515625, + "learning_rate": 0.00013516620017392747, + "loss": 0.769, + "step": 25775 + }, + { + "epoch": 0.6618554020808157, + "grad_norm": 0.79296875, + "learning_rate": 0.00013516202107897196, + "loss": 0.8483, + "step": 25776 + }, + { + "epoch": 0.6618810792767376, + "grad_norm": 0.86328125, + "learning_rate": 0.00013515784191394173, + "loss": 0.8713, + "step": 25777 + }, + { + "epoch": 0.6619067564726594, + "grad_norm": 1.109375, + "learning_rate": 0.00013515366267884512, + "loss": 0.8186, + "step": 25778 + }, + { + "epoch": 0.6619324336685812, + "grad_norm": 0.7265625, + "learning_rate": 0.0001351494833736904, + "loss": 0.7436, + "step": 25779 + }, + { + "epoch": 0.6619581108645031, + "grad_norm": 0.8359375, + "learning_rate": 0.00013514530399848598, + "loss": 0.826, + "step": 25780 + }, + { + "epoch": 0.6619837880604249, + "grad_norm": 0.6875, + "learning_rate": 0.0001351411245532401, + "loss": 0.722, + "step": 25781 + }, + { + "epoch": 0.6620094652563466, + "grad_norm": 0.828125, + "learning_rate": 0.00013513694503796114, + "loss": 0.799, + "step": 25782 + }, + { + "epoch": 0.6620351424522685, + "grad_norm": 0.71875, + "learning_rate": 0.00013513276545265741, + "loss": 0.7094, + "step": 25783 + }, + { + "epoch": 0.6620608196481903, + "grad_norm": 0.796875, + "learning_rate": 0.0001351285857973373, + "loss": 0.8282, + "step": 25784 + }, + { + "epoch": 0.6620864968441121, + "grad_norm": 0.73828125, + "learning_rate": 0.00013512440607200906, + "loss": 0.7689, + "step": 25785 + }, + { + "epoch": 0.662112174040034, + "grad_norm": 0.7734375, + "learning_rate": 0.00013512022627668103, + "loss": 0.8808, + "step": 25786 + }, + { + "epoch": 0.6621378512359558, + "grad_norm": 0.72265625, + "learning_rate": 0.0001351160464113616, + "loss": 0.8885, + "step": 25787 + }, + { + "epoch": 0.6621635284318776, + "grad_norm": 0.8046875, + "learning_rate": 0.00013511186647605908, + "loss": 0.794, + "step": 25788 + }, + { + "epoch": 0.6621892056277994, + "grad_norm": 0.82421875, + "learning_rate": 0.00013510768647078173, + "loss": 0.8239, + "step": 25789 + }, + { + "epoch": 0.6622148828237212, + "grad_norm": 0.703125, + "learning_rate": 0.00013510350639553797, + "loss": 0.7798, + "step": 25790 + }, + { + "epoch": 0.662240560019643, + "grad_norm": 0.71875, + "learning_rate": 0.00013509932625033606, + "loss": 0.7513, + "step": 25791 + }, + { + "epoch": 0.6622662372155649, + "grad_norm": 0.859375, + "learning_rate": 0.00013509514603518436, + "loss": 0.8747, + "step": 25792 + }, + { + "epoch": 0.6622919144114867, + "grad_norm": 0.75, + "learning_rate": 0.00013509096575009123, + "loss": 0.7926, + "step": 25793 + }, + { + "epoch": 0.6623175916074086, + "grad_norm": 0.76953125, + "learning_rate": 0.00013508678539506496, + "loss": 0.8346, + "step": 25794 + }, + { + "epoch": 0.6623432688033304, + "grad_norm": 0.7890625, + "learning_rate": 0.00013508260497011393, + "loss": 0.8208, + "step": 25795 + }, + { + "epoch": 0.6623689459992521, + "grad_norm": 0.74609375, + "learning_rate": 0.0001350784244752464, + "loss": 0.8652, + "step": 25796 + }, + { + "epoch": 0.662394623195174, + "grad_norm": 0.828125, + "learning_rate": 0.00013507424391047076, + "loss": 0.8935, + "step": 25797 + }, + { + "epoch": 0.6624203003910958, + "grad_norm": 0.8125, + "learning_rate": 0.0001350700632757953, + "loss": 0.9544, + "step": 25798 + }, + { + "epoch": 0.6624459775870176, + "grad_norm": 0.8046875, + "learning_rate": 0.00013506588257122836, + "loss": 0.8308, + "step": 25799 + }, + { + "epoch": 0.6624716547829395, + "grad_norm": 0.83203125, + "learning_rate": 0.00013506170179677834, + "loss": 1.0129, + "step": 25800 + }, + { + "epoch": 0.6624973319788613, + "grad_norm": 0.81640625, + "learning_rate": 0.00013505752095245346, + "loss": 0.9229, + "step": 25801 + }, + { + "epoch": 0.662523009174783, + "grad_norm": 0.75390625, + "learning_rate": 0.00013505334003826214, + "loss": 0.9207, + "step": 25802 + }, + { + "epoch": 0.6625486863707049, + "grad_norm": 0.78515625, + "learning_rate": 0.00013504915905421266, + "loss": 0.8027, + "step": 25803 + }, + { + "epoch": 0.6625743635666267, + "grad_norm": 0.85546875, + "learning_rate": 0.00013504497800031337, + "loss": 0.7828, + "step": 25804 + }, + { + "epoch": 0.6626000407625485, + "grad_norm": 0.7734375, + "learning_rate": 0.00013504079687657263, + "loss": 0.8478, + "step": 25805 + }, + { + "epoch": 0.6626257179584704, + "grad_norm": 0.83984375, + "learning_rate": 0.00013503661568299872, + "loss": 0.8814, + "step": 25806 + }, + { + "epoch": 0.6626513951543922, + "grad_norm": 0.734375, + "learning_rate": 0.0001350324344196, + "loss": 0.896, + "step": 25807 + }, + { + "epoch": 0.662677072350314, + "grad_norm": 0.90625, + "learning_rate": 0.00013502825308638484, + "loss": 0.8947, + "step": 25808 + }, + { + "epoch": 0.6627027495462358, + "grad_norm": 0.73046875, + "learning_rate": 0.0001350240716833615, + "loss": 0.7848, + "step": 25809 + }, + { + "epoch": 0.6627284267421576, + "grad_norm": 0.859375, + "learning_rate": 0.00013501989021053837, + "loss": 0.7885, + "step": 25810 + }, + { + "epoch": 0.6627541039380794, + "grad_norm": 0.765625, + "learning_rate": 0.00013501570866792374, + "loss": 0.9409, + "step": 25811 + }, + { + "epoch": 0.6627797811340013, + "grad_norm": 0.8125, + "learning_rate": 0.00013501152705552597, + "loss": 0.8995, + "step": 25812 + }, + { + "epoch": 0.6628054583299231, + "grad_norm": 0.8203125, + "learning_rate": 0.0001350073453733534, + "loss": 0.8507, + "step": 25813 + }, + { + "epoch": 0.6628311355258449, + "grad_norm": 0.78515625, + "learning_rate": 0.00013500316362141435, + "loss": 0.824, + "step": 25814 + }, + { + "epoch": 0.6628568127217668, + "grad_norm": 0.7890625, + "learning_rate": 0.00013499898179971718, + "loss": 0.7345, + "step": 25815 + }, + { + "epoch": 0.6628824899176885, + "grad_norm": 0.75390625, + "learning_rate": 0.00013499479990827014, + "loss": 0.8928, + "step": 25816 + }, + { + "epoch": 0.6629081671136103, + "grad_norm": 0.76171875, + "learning_rate": 0.00013499061794708167, + "loss": 0.9776, + "step": 25817 + }, + { + "epoch": 0.6629338443095322, + "grad_norm": 0.8046875, + "learning_rate": 0.00013498643591616005, + "loss": 0.9846, + "step": 25818 + }, + { + "epoch": 0.662959521505454, + "grad_norm": 0.875, + "learning_rate": 0.00013498225381551363, + "loss": 0.8493, + "step": 25819 + }, + { + "epoch": 0.6629851987013758, + "grad_norm": 0.84765625, + "learning_rate": 0.00013497807164515073, + "loss": 0.9105, + "step": 25820 + }, + { + "epoch": 0.6630108758972977, + "grad_norm": 0.80078125, + "learning_rate": 0.00013497388940507968, + "loss": 0.7653, + "step": 25821 + }, + { + "epoch": 0.6630365530932194, + "grad_norm": 0.76171875, + "learning_rate": 0.00013496970709530887, + "loss": 0.8447, + "step": 25822 + }, + { + "epoch": 0.6630622302891412, + "grad_norm": 0.78515625, + "learning_rate": 0.00013496552471584653, + "loss": 0.8635, + "step": 25823 + }, + { + "epoch": 0.6630879074850631, + "grad_norm": 0.765625, + "learning_rate": 0.0001349613422667011, + "loss": 0.9066, + "step": 25824 + }, + { + "epoch": 0.6631135846809849, + "grad_norm": 0.84765625, + "learning_rate": 0.0001349571597478809, + "loss": 0.8167, + "step": 25825 + }, + { + "epoch": 0.6631392618769067, + "grad_norm": 0.82421875, + "learning_rate": 0.00013495297715939418, + "loss": 0.8194, + "step": 25826 + }, + { + "epoch": 0.6631649390728286, + "grad_norm": 0.81640625, + "learning_rate": 0.00013494879450124935, + "loss": 0.8599, + "step": 25827 + }, + { + "epoch": 0.6631906162687504, + "grad_norm": 0.80859375, + "learning_rate": 0.00013494461177345474, + "loss": 0.884, + "step": 25828 + }, + { + "epoch": 0.6632162934646721, + "grad_norm": 0.77734375, + "learning_rate": 0.0001349404289760187, + "loss": 0.8485, + "step": 25829 + }, + { + "epoch": 0.663241970660594, + "grad_norm": 0.80859375, + "learning_rate": 0.0001349362461089495, + "loss": 0.848, + "step": 25830 + }, + { + "epoch": 0.6632676478565158, + "grad_norm": 0.7734375, + "learning_rate": 0.0001349320631722555, + "loss": 0.8748, + "step": 25831 + }, + { + "epoch": 0.6632933250524377, + "grad_norm": 0.7890625, + "learning_rate": 0.00013492788016594508, + "loss": 0.8494, + "step": 25832 + }, + { + "epoch": 0.6633190022483595, + "grad_norm": 0.75390625, + "learning_rate": 0.00013492369709002658, + "loss": 0.8623, + "step": 25833 + }, + { + "epoch": 0.6633446794442813, + "grad_norm": 0.84375, + "learning_rate": 0.00013491951394450826, + "loss": 0.9083, + "step": 25834 + }, + { + "epoch": 0.663370356640203, + "grad_norm": 0.77734375, + "learning_rate": 0.00013491533072939856, + "loss": 0.8198, + "step": 25835 + }, + { + "epoch": 0.6633960338361249, + "grad_norm": 0.796875, + "learning_rate": 0.0001349111474447057, + "loss": 0.7569, + "step": 25836 + }, + { + "epoch": 0.6634217110320467, + "grad_norm": 0.83203125, + "learning_rate": 0.00013490696409043808, + "loss": 0.9288, + "step": 25837 + }, + { + "epoch": 0.6634473882279686, + "grad_norm": 0.76953125, + "learning_rate": 0.00013490278066660406, + "loss": 0.7842, + "step": 25838 + }, + { + "epoch": 0.6634730654238904, + "grad_norm": 0.8125, + "learning_rate": 0.00013489859717321194, + "loss": 0.7827, + "step": 25839 + }, + { + "epoch": 0.6634987426198122, + "grad_norm": 0.76953125, + "learning_rate": 0.00013489441361027007, + "loss": 0.8637, + "step": 25840 + }, + { + "epoch": 0.6635244198157341, + "grad_norm": 0.75, + "learning_rate": 0.00013489022997778682, + "loss": 0.9282, + "step": 25841 + }, + { + "epoch": 0.6635500970116558, + "grad_norm": 0.73046875, + "learning_rate": 0.00013488604627577043, + "loss": 0.7217, + "step": 25842 + }, + { + "epoch": 0.6635757742075776, + "grad_norm": 0.7265625, + "learning_rate": 0.00013488186250422933, + "loss": 0.7066, + "step": 25843 + }, + { + "epoch": 0.6636014514034995, + "grad_norm": 0.765625, + "learning_rate": 0.00013487767866317182, + "loss": 0.8645, + "step": 25844 + }, + { + "epoch": 0.6636271285994213, + "grad_norm": 0.765625, + "learning_rate": 0.00013487349475260627, + "loss": 0.8147, + "step": 25845 + }, + { + "epoch": 0.6636528057953431, + "grad_norm": 0.83984375, + "learning_rate": 0.000134869310772541, + "loss": 0.9545, + "step": 25846 + }, + { + "epoch": 0.663678482991265, + "grad_norm": 0.75, + "learning_rate": 0.0001348651267229843, + "loss": 0.8899, + "step": 25847 + }, + { + "epoch": 0.6637041601871868, + "grad_norm": 0.796875, + "learning_rate": 0.0001348609426039446, + "loss": 0.8827, + "step": 25848 + }, + { + "epoch": 0.6637298373831085, + "grad_norm": 0.8515625, + "learning_rate": 0.0001348567584154302, + "loss": 1.0587, + "step": 25849 + }, + { + "epoch": 0.6637555145790304, + "grad_norm": 0.80859375, + "learning_rate": 0.00013485257415744937, + "loss": 0.8487, + "step": 25850 + }, + { + "epoch": 0.6637811917749522, + "grad_norm": 0.796875, + "learning_rate": 0.00013484838983001052, + "loss": 0.9043, + "step": 25851 + }, + { + "epoch": 0.663806868970874, + "grad_norm": 0.82421875, + "learning_rate": 0.000134844205433122, + "loss": 0.8325, + "step": 25852 + }, + { + "epoch": 0.6638325461667959, + "grad_norm": 0.7734375, + "learning_rate": 0.0001348400209667921, + "loss": 0.8508, + "step": 25853 + }, + { + "epoch": 0.6638582233627177, + "grad_norm": 0.73046875, + "learning_rate": 0.0001348358364310292, + "loss": 0.8507, + "step": 25854 + }, + { + "epoch": 0.6638839005586394, + "grad_norm": 0.8125, + "learning_rate": 0.00013483165182584164, + "loss": 0.961, + "step": 25855 + }, + { + "epoch": 0.6639095777545613, + "grad_norm": 0.76953125, + "learning_rate": 0.00013482746715123774, + "loss": 0.8525, + "step": 25856 + }, + { + "epoch": 0.6639352549504831, + "grad_norm": 0.77734375, + "learning_rate": 0.00013482328240722584, + "loss": 0.9089, + "step": 25857 + }, + { + "epoch": 0.663960932146405, + "grad_norm": 0.84375, + "learning_rate": 0.00013481909759381427, + "loss": 0.9026, + "step": 25858 + }, + { + "epoch": 0.6639866093423268, + "grad_norm": 0.7890625, + "learning_rate": 0.0001348149127110114, + "loss": 0.8321, + "step": 25859 + }, + { + "epoch": 0.6640122865382486, + "grad_norm": 0.84375, + "learning_rate": 0.00013481072775882555, + "loss": 0.83, + "step": 25860 + }, + { + "epoch": 0.6640379637341705, + "grad_norm": 0.70703125, + "learning_rate": 0.00013480654273726508, + "loss": 0.7562, + "step": 25861 + }, + { + "epoch": 0.6640636409300922, + "grad_norm": 0.734375, + "learning_rate": 0.00013480235764633827, + "loss": 0.805, + "step": 25862 + }, + { + "epoch": 0.664089318126014, + "grad_norm": 0.734375, + "learning_rate": 0.00013479817248605354, + "loss": 0.9422, + "step": 25863 + }, + { + "epoch": 0.6641149953219359, + "grad_norm": 0.7890625, + "learning_rate": 0.0001347939872564192, + "loss": 0.7428, + "step": 25864 + }, + { + "epoch": 0.6641406725178577, + "grad_norm": 0.72265625, + "learning_rate": 0.00013478980195744355, + "loss": 0.8752, + "step": 25865 + }, + { + "epoch": 0.6641663497137795, + "grad_norm": 0.828125, + "learning_rate": 0.00013478561658913498, + "loss": 0.854, + "step": 25866 + }, + { + "epoch": 0.6641920269097014, + "grad_norm": 0.81640625, + "learning_rate": 0.00013478143115150184, + "loss": 0.9429, + "step": 25867 + }, + { + "epoch": 0.6642177041056232, + "grad_norm": 0.7421875, + "learning_rate": 0.00013477724564455243, + "loss": 0.822, + "step": 25868 + }, + { + "epoch": 0.6642433813015449, + "grad_norm": 0.72265625, + "learning_rate": 0.00013477306006829512, + "loss": 0.6943, + "step": 25869 + }, + { + "epoch": 0.6642690584974668, + "grad_norm": 0.6875, + "learning_rate": 0.00013476887442273825, + "loss": 0.8764, + "step": 25870 + }, + { + "epoch": 0.6642947356933886, + "grad_norm": 0.82421875, + "learning_rate": 0.00013476468870789011, + "loss": 0.7987, + "step": 25871 + }, + { + "epoch": 0.6643204128893104, + "grad_norm": 0.71875, + "learning_rate": 0.00013476050292375913, + "loss": 0.8248, + "step": 25872 + }, + { + "epoch": 0.6643460900852323, + "grad_norm": 0.72265625, + "learning_rate": 0.0001347563170703536, + "loss": 0.7467, + "step": 25873 + }, + { + "epoch": 0.6643717672811541, + "grad_norm": 0.80078125, + "learning_rate": 0.00013475213114768188, + "loss": 0.8277, + "step": 25874 + }, + { + "epoch": 0.6643974444770758, + "grad_norm": 0.734375, + "learning_rate": 0.0001347479451557523, + "loss": 0.8588, + "step": 25875 + }, + { + "epoch": 0.6644231216729977, + "grad_norm": 0.76171875, + "learning_rate": 0.00013474375909457316, + "loss": 0.8636, + "step": 25876 + }, + { + "epoch": 0.6644487988689195, + "grad_norm": 0.75390625, + "learning_rate": 0.00013473957296415291, + "loss": 0.905, + "step": 25877 + }, + { + "epoch": 0.6644744760648413, + "grad_norm": 0.84375, + "learning_rate": 0.0001347353867644998, + "loss": 0.8017, + "step": 25878 + }, + { + "epoch": 0.6645001532607632, + "grad_norm": 0.80859375, + "learning_rate": 0.0001347312004956222, + "loss": 0.8788, + "step": 25879 + }, + { + "epoch": 0.664525830456685, + "grad_norm": 0.765625, + "learning_rate": 0.00013472701415752843, + "loss": 0.8126, + "step": 25880 + }, + { + "epoch": 0.6645515076526068, + "grad_norm": 0.8203125, + "learning_rate": 0.0001347228277502269, + "loss": 0.8781, + "step": 25881 + }, + { + "epoch": 0.6645771848485286, + "grad_norm": 0.7890625, + "learning_rate": 0.0001347186412737259, + "loss": 0.7736, + "step": 25882 + }, + { + "epoch": 0.6646028620444504, + "grad_norm": 0.76953125, + "learning_rate": 0.0001347144547280338, + "loss": 0.9665, + "step": 25883 + }, + { + "epoch": 0.6646285392403722, + "grad_norm": 0.7734375, + "learning_rate": 0.0001347102681131589, + "loss": 0.8136, + "step": 25884 + }, + { + "epoch": 0.6646542164362941, + "grad_norm": 0.7109375, + "learning_rate": 0.00013470608142910953, + "loss": 0.8221, + "step": 25885 + }, + { + "epoch": 0.6646798936322159, + "grad_norm": 0.8203125, + "learning_rate": 0.00013470189467589414, + "loss": 0.9821, + "step": 25886 + }, + { + "epoch": 0.6647055708281377, + "grad_norm": 0.8359375, + "learning_rate": 0.000134697707853521, + "loss": 0.9764, + "step": 25887 + }, + { + "epoch": 0.6647312480240596, + "grad_norm": 0.77734375, + "learning_rate": 0.00013469352096199844, + "loss": 0.8837, + "step": 25888 + }, + { + "epoch": 0.6647569252199813, + "grad_norm": 0.71875, + "learning_rate": 0.00013468933400133487, + "loss": 0.7343, + "step": 25889 + }, + { + "epoch": 0.6647826024159031, + "grad_norm": 0.7890625, + "learning_rate": 0.00013468514697153854, + "loss": 0.8306, + "step": 25890 + }, + { + "epoch": 0.664808279611825, + "grad_norm": 0.7890625, + "learning_rate": 0.00013468095987261786, + "loss": 0.781, + "step": 25891 + }, + { + "epoch": 0.6648339568077468, + "grad_norm": 0.79296875, + "learning_rate": 0.00013467677270458114, + "loss": 0.8499, + "step": 25892 + }, + { + "epoch": 0.6648596340036687, + "grad_norm": 0.77734375, + "learning_rate": 0.0001346725854674368, + "loss": 0.8351, + "step": 25893 + }, + { + "epoch": 0.6648853111995905, + "grad_norm": 0.75390625, + "learning_rate": 0.0001346683981611931, + "loss": 0.7862, + "step": 25894 + }, + { + "epoch": 0.6649109883955122, + "grad_norm": 0.83203125, + "learning_rate": 0.0001346642107858584, + "loss": 0.747, + "step": 25895 + }, + { + "epoch": 0.664936665591434, + "grad_norm": 0.84375, + "learning_rate": 0.00013466002334144107, + "loss": 0.8638, + "step": 25896 + }, + { + "epoch": 0.6649623427873559, + "grad_norm": 0.81640625, + "learning_rate": 0.00013465583582794946, + "loss": 0.9118, + "step": 25897 + }, + { + "epoch": 0.6649880199832777, + "grad_norm": 0.75390625, + "learning_rate": 0.00013465164824539186, + "loss": 0.8671, + "step": 25898 + }, + { + "epoch": 0.6650136971791996, + "grad_norm": 0.76171875, + "learning_rate": 0.0001346474605937767, + "loss": 0.8652, + "step": 25899 + }, + { + "epoch": 0.6650393743751214, + "grad_norm": 0.89453125, + "learning_rate": 0.00013464327287311228, + "loss": 0.8958, + "step": 25900 + }, + { + "epoch": 0.6650650515710432, + "grad_norm": 0.73046875, + "learning_rate": 0.0001346390850834069, + "loss": 0.7894, + "step": 25901 + }, + { + "epoch": 0.665090728766965, + "grad_norm": 0.75, + "learning_rate": 0.000134634897224669, + "loss": 0.8777, + "step": 25902 + }, + { + "epoch": 0.6651164059628868, + "grad_norm": 0.84765625, + "learning_rate": 0.00013463070929690683, + "loss": 0.8025, + "step": 25903 + }, + { + "epoch": 0.6651420831588086, + "grad_norm": 0.80859375, + "learning_rate": 0.00013462652130012882, + "loss": 0.7733, + "step": 25904 + }, + { + "epoch": 0.6651677603547305, + "grad_norm": 0.7265625, + "learning_rate": 0.0001346223332343433, + "loss": 0.7527, + "step": 25905 + }, + { + "epoch": 0.6651934375506523, + "grad_norm": 0.796875, + "learning_rate": 0.00013461814509955854, + "loss": 0.8003, + "step": 25906 + }, + { + "epoch": 0.6652191147465741, + "grad_norm": 0.69140625, + "learning_rate": 0.000134613956895783, + "loss": 0.8026, + "step": 25907 + }, + { + "epoch": 0.665244791942496, + "grad_norm": 0.953125, + "learning_rate": 0.00013460976862302493, + "loss": 0.8802, + "step": 25908 + }, + { + "epoch": 0.6652704691384177, + "grad_norm": 0.81640625, + "learning_rate": 0.00013460558028129273, + "loss": 0.7465, + "step": 25909 + }, + { + "epoch": 0.6652961463343395, + "grad_norm": 0.828125, + "learning_rate": 0.00013460139187059474, + "loss": 0.7065, + "step": 25910 + }, + { + "epoch": 0.6653218235302614, + "grad_norm": 0.81640625, + "learning_rate": 0.0001345972033909393, + "loss": 0.9346, + "step": 25911 + }, + { + "epoch": 0.6653475007261832, + "grad_norm": 0.75390625, + "learning_rate": 0.00013459301484233476, + "loss": 0.948, + "step": 25912 + }, + { + "epoch": 0.665373177922105, + "grad_norm": 0.75390625, + "learning_rate": 0.00013458882622478945, + "loss": 0.8321, + "step": 25913 + }, + { + "epoch": 0.6653988551180269, + "grad_norm": 0.80078125, + "learning_rate": 0.00013458463753831176, + "loss": 0.7826, + "step": 25914 + }, + { + "epoch": 0.6654245323139486, + "grad_norm": 0.78515625, + "learning_rate": 0.00013458044878291, + "loss": 0.8232, + "step": 25915 + }, + { + "epoch": 0.6654502095098704, + "grad_norm": 0.828125, + "learning_rate": 0.00013457625995859251, + "loss": 0.8743, + "step": 25916 + }, + { + "epoch": 0.6654758867057923, + "grad_norm": 1.0546875, + "learning_rate": 0.0001345720710653677, + "loss": 0.9277, + "step": 25917 + }, + { + "epoch": 0.6655015639017141, + "grad_norm": 0.9140625, + "learning_rate": 0.00013456788210324384, + "loss": 0.7684, + "step": 25918 + }, + { + "epoch": 0.665527241097636, + "grad_norm": 0.73046875, + "learning_rate": 0.0001345636930722293, + "loss": 0.959, + "step": 25919 + }, + { + "epoch": 0.6655529182935578, + "grad_norm": 0.765625, + "learning_rate": 0.0001345595039723325, + "loss": 0.8366, + "step": 25920 + }, + { + "epoch": 0.6655785954894796, + "grad_norm": 0.72265625, + "learning_rate": 0.0001345553148035617, + "loss": 0.8473, + "step": 25921 + }, + { + "epoch": 0.6656042726854013, + "grad_norm": 0.88671875, + "learning_rate": 0.00013455112556592526, + "loss": 0.8486, + "step": 25922 + }, + { + "epoch": 0.6656299498813232, + "grad_norm": 0.7890625, + "learning_rate": 0.00013454693625943156, + "loss": 0.8213, + "step": 25923 + }, + { + "epoch": 0.665655627077245, + "grad_norm": 0.8203125, + "learning_rate": 0.00013454274688408892, + "loss": 0.8263, + "step": 25924 + }, + { + "epoch": 0.6656813042731669, + "grad_norm": 0.796875, + "learning_rate": 0.00013453855743990572, + "loss": 0.8228, + "step": 25925 + }, + { + "epoch": 0.6657069814690887, + "grad_norm": 0.8046875, + "learning_rate": 0.00013453436792689033, + "loss": 0.7956, + "step": 25926 + }, + { + "epoch": 0.6657326586650105, + "grad_norm": 0.76171875, + "learning_rate": 0.00013453017834505105, + "loss": 0.8085, + "step": 25927 + }, + { + "epoch": 0.6657583358609324, + "grad_norm": 0.7890625, + "learning_rate": 0.00013452598869439621, + "loss": 0.7926, + "step": 25928 + }, + { + "epoch": 0.6657840130568541, + "grad_norm": 0.75390625, + "learning_rate": 0.0001345217989749342, + "loss": 0.8449, + "step": 25929 + }, + { + "epoch": 0.6658096902527759, + "grad_norm": 0.8828125, + "learning_rate": 0.00013451760918667337, + "loss": 0.8296, + "step": 25930 + }, + { + "epoch": 0.6658353674486978, + "grad_norm": 0.7109375, + "learning_rate": 0.00013451341932962208, + "loss": 0.73, + "step": 25931 + }, + { + "epoch": 0.6658610446446196, + "grad_norm": 0.83203125, + "learning_rate": 0.00013450922940378863, + "loss": 0.8265, + "step": 25932 + }, + { + "epoch": 0.6658867218405414, + "grad_norm": 0.921875, + "learning_rate": 0.00013450503940918144, + "loss": 0.8324, + "step": 25933 + }, + { + "epoch": 0.6659123990364633, + "grad_norm": 0.8203125, + "learning_rate": 0.00013450084934580882, + "loss": 0.79, + "step": 25934 + }, + { + "epoch": 0.665938076232385, + "grad_norm": 0.7734375, + "learning_rate": 0.00013449665921367912, + "loss": 0.7155, + "step": 25935 + }, + { + "epoch": 0.6659637534283068, + "grad_norm": 0.8046875, + "learning_rate": 0.00013449246901280066, + "loss": 0.9473, + "step": 25936 + }, + { + "epoch": 0.6659894306242287, + "grad_norm": 0.87109375, + "learning_rate": 0.00013448827874318187, + "loss": 1.0054, + "step": 25937 + }, + { + "epoch": 0.6660151078201505, + "grad_norm": 0.7578125, + "learning_rate": 0.000134484088404831, + "loss": 0.8229, + "step": 25938 + }, + { + "epoch": 0.6660407850160723, + "grad_norm": 0.77734375, + "learning_rate": 0.00013447989799775652, + "loss": 0.8343, + "step": 25939 + }, + { + "epoch": 0.6660664622119942, + "grad_norm": 0.80078125, + "learning_rate": 0.0001344757075219667, + "loss": 0.8612, + "step": 25940 + }, + { + "epoch": 0.666092139407916, + "grad_norm": 0.7890625, + "learning_rate": 0.00013447151697746986, + "loss": 0.8849, + "step": 25941 + }, + { + "epoch": 0.6661178166038377, + "grad_norm": 0.84765625, + "learning_rate": 0.00013446732636427446, + "loss": 0.9461, + "step": 25942 + }, + { + "epoch": 0.6661434937997596, + "grad_norm": 0.7421875, + "learning_rate": 0.00013446313568238875, + "loss": 0.7852, + "step": 25943 + }, + { + "epoch": 0.6661691709956814, + "grad_norm": 0.8125, + "learning_rate": 0.00013445894493182116, + "loss": 0.8535, + "step": 25944 + }, + { + "epoch": 0.6661948481916032, + "grad_norm": 0.83203125, + "learning_rate": 0.00013445475411258, + "loss": 0.8565, + "step": 25945 + }, + { + "epoch": 0.6662205253875251, + "grad_norm": 0.7890625, + "learning_rate": 0.0001344505632246736, + "loss": 0.8271, + "step": 25946 + }, + { + "epoch": 0.6662462025834469, + "grad_norm": 0.7578125, + "learning_rate": 0.00013444637226811036, + "loss": 0.8003, + "step": 25947 + }, + { + "epoch": 0.6662718797793687, + "grad_norm": 0.71484375, + "learning_rate": 0.0001344421812428986, + "loss": 0.7726, + "step": 25948 + }, + { + "epoch": 0.6662975569752905, + "grad_norm": 0.8359375, + "learning_rate": 0.0001344379901490467, + "loss": 0.8752, + "step": 25949 + }, + { + "epoch": 0.6663232341712123, + "grad_norm": 0.78125, + "learning_rate": 0.000134433798986563, + "loss": 0.8175, + "step": 25950 + }, + { + "epoch": 0.6663489113671341, + "grad_norm": 0.8671875, + "learning_rate": 0.0001344296077554558, + "loss": 0.8853, + "step": 25951 + }, + { + "epoch": 0.666374588563056, + "grad_norm": 0.75, + "learning_rate": 0.00013442541645573356, + "loss": 0.899, + "step": 25952 + }, + { + "epoch": 0.6664002657589778, + "grad_norm": 0.77734375, + "learning_rate": 0.00013442122508740455, + "loss": 1.0328, + "step": 25953 + }, + { + "epoch": 0.6664259429548997, + "grad_norm": 0.75, + "learning_rate": 0.00013441703365047716, + "loss": 0.9021, + "step": 25954 + }, + { + "epoch": 0.6664516201508214, + "grad_norm": 0.75, + "learning_rate": 0.00013441284214495974, + "loss": 0.7801, + "step": 25955 + }, + { + "epoch": 0.6664772973467432, + "grad_norm": 0.72265625, + "learning_rate": 0.00013440865057086062, + "loss": 0.8387, + "step": 25956 + }, + { + "epoch": 0.666502974542665, + "grad_norm": 0.83203125, + "learning_rate": 0.00013440445892818815, + "loss": 0.9245, + "step": 25957 + }, + { + "epoch": 0.6665286517385869, + "grad_norm": 0.78125, + "learning_rate": 0.0001344002672169507, + "loss": 0.8231, + "step": 25958 + }, + { + "epoch": 0.6665543289345087, + "grad_norm": 0.734375, + "learning_rate": 0.00013439607543715664, + "loss": 0.8297, + "step": 25959 + }, + { + "epoch": 0.6665800061304306, + "grad_norm": 0.80859375, + "learning_rate": 0.00013439188358881432, + "loss": 0.9012, + "step": 25960 + }, + { + "epoch": 0.6666056833263524, + "grad_norm": 0.7734375, + "learning_rate": 0.00013438769167193205, + "loss": 0.8257, + "step": 25961 + }, + { + "epoch": 0.6666313605222741, + "grad_norm": 0.77734375, + "learning_rate": 0.00013438349968651825, + "loss": 0.7993, + "step": 25962 + }, + { + "epoch": 0.666657037718196, + "grad_norm": 0.8671875, + "learning_rate": 0.00013437930763258123, + "loss": 0.9162, + "step": 25963 + }, + { + "epoch": 0.6666827149141178, + "grad_norm": 0.796875, + "learning_rate": 0.00013437511551012935, + "loss": 0.8129, + "step": 25964 + }, + { + "epoch": 0.6667083921100396, + "grad_norm": 0.8125, + "learning_rate": 0.000134370923319171, + "loss": 0.8456, + "step": 25965 + }, + { + "epoch": 0.6667340693059615, + "grad_norm": 0.859375, + "learning_rate": 0.0001343667310597145, + "loss": 0.9872, + "step": 25966 + }, + { + "epoch": 0.6667597465018833, + "grad_norm": 0.84765625, + "learning_rate": 0.00013436253873176814, + "loss": 0.8319, + "step": 25967 + }, + { + "epoch": 0.6667854236978051, + "grad_norm": 0.81640625, + "learning_rate": 0.0001343583463353404, + "loss": 0.7869, + "step": 25968 + }, + { + "epoch": 0.6668111008937269, + "grad_norm": 0.828125, + "learning_rate": 0.00013435415387043958, + "loss": 0.8768, + "step": 25969 + }, + { + "epoch": 0.6668367780896487, + "grad_norm": 0.8828125, + "learning_rate": 0.00013434996133707402, + "loss": 0.8208, + "step": 25970 + }, + { + "epoch": 0.6668624552855705, + "grad_norm": 0.78515625, + "learning_rate": 0.00013434576873525212, + "loss": 0.8658, + "step": 25971 + }, + { + "epoch": 0.6668881324814924, + "grad_norm": 0.796875, + "learning_rate": 0.00013434157606498215, + "loss": 0.8108, + "step": 25972 + }, + { + "epoch": 0.6669138096774142, + "grad_norm": 2.96875, + "learning_rate": 0.00013433738332627258, + "loss": 0.7483, + "step": 25973 + }, + { + "epoch": 0.666939486873336, + "grad_norm": 0.796875, + "learning_rate": 0.00013433319051913166, + "loss": 0.9029, + "step": 25974 + }, + { + "epoch": 0.6669651640692578, + "grad_norm": 0.73828125, + "learning_rate": 0.00013432899764356782, + "loss": 0.7651, + "step": 25975 + }, + { + "epoch": 0.6669908412651796, + "grad_norm": 0.83984375, + "learning_rate": 0.00013432480469958936, + "loss": 0.7685, + "step": 25976 + }, + { + "epoch": 0.6670165184611014, + "grad_norm": 0.78125, + "learning_rate": 0.00013432061168720471, + "loss": 0.8565, + "step": 25977 + }, + { + "epoch": 0.6670421956570233, + "grad_norm": 0.84765625, + "learning_rate": 0.00013431641860642211, + "loss": 0.8557, + "step": 25978 + }, + { + "epoch": 0.6670678728529451, + "grad_norm": 0.73828125, + "learning_rate": 0.00013431222545725008, + "loss": 0.803, + "step": 25979 + }, + { + "epoch": 0.667093550048867, + "grad_norm": 0.89453125, + "learning_rate": 0.00013430803223969684, + "loss": 0.9118, + "step": 25980 + }, + { + "epoch": 0.6671192272447888, + "grad_norm": 0.7421875, + "learning_rate": 0.0001343038389537708, + "loss": 0.7777, + "step": 25981 + }, + { + "epoch": 0.6671449044407105, + "grad_norm": 0.78515625, + "learning_rate": 0.00013429964559948028, + "loss": 0.7917, + "step": 25982 + }, + { + "epoch": 0.6671705816366323, + "grad_norm": 0.796875, + "learning_rate": 0.00013429545217683367, + "loss": 0.7904, + "step": 25983 + }, + { + "epoch": 0.6671962588325542, + "grad_norm": 0.74609375, + "learning_rate": 0.00013429125868583933, + "loss": 0.8979, + "step": 25984 + }, + { + "epoch": 0.667221936028476, + "grad_norm": 0.828125, + "learning_rate": 0.0001342870651265056, + "loss": 1.0113, + "step": 25985 + }, + { + "epoch": 0.6672476132243979, + "grad_norm": 0.76171875, + "learning_rate": 0.00013428287149884087, + "loss": 0.8752, + "step": 25986 + }, + { + "epoch": 0.6672732904203197, + "grad_norm": 0.7265625, + "learning_rate": 0.0001342786778028535, + "loss": 0.7966, + "step": 25987 + }, + { + "epoch": 0.6672989676162415, + "grad_norm": 0.7578125, + "learning_rate": 0.00013427448403855174, + "loss": 0.8328, + "step": 25988 + }, + { + "epoch": 0.6673246448121632, + "grad_norm": 0.71875, + "learning_rate": 0.00013427029020594407, + "loss": 0.7805, + "step": 25989 + }, + { + "epoch": 0.6673503220080851, + "grad_norm": 0.90625, + "learning_rate": 0.0001342660963050388, + "loss": 0.8608, + "step": 25990 + }, + { + "epoch": 0.6673759992040069, + "grad_norm": 0.87109375, + "learning_rate": 0.0001342619023358443, + "loss": 0.8476, + "step": 25991 + }, + { + "epoch": 0.6674016763999288, + "grad_norm": 0.7578125, + "learning_rate": 0.00013425770829836894, + "loss": 0.8197, + "step": 25992 + }, + { + "epoch": 0.6674273535958506, + "grad_norm": 0.7578125, + "learning_rate": 0.00013425351419262108, + "loss": 0.8893, + "step": 25993 + }, + { + "epoch": 0.6674530307917724, + "grad_norm": 0.72265625, + "learning_rate": 0.00013424932001860902, + "loss": 0.7713, + "step": 25994 + }, + { + "epoch": 0.6674787079876942, + "grad_norm": 0.765625, + "learning_rate": 0.00013424512577634115, + "loss": 0.8835, + "step": 25995 + }, + { + "epoch": 0.667504385183616, + "grad_norm": 0.76171875, + "learning_rate": 0.00013424093146582585, + "loss": 0.7312, + "step": 25996 + }, + { + "epoch": 0.6675300623795378, + "grad_norm": 0.74609375, + "learning_rate": 0.0001342367370870715, + "loss": 0.6951, + "step": 25997 + }, + { + "epoch": 0.6675557395754597, + "grad_norm": 0.828125, + "learning_rate": 0.00013423254264008637, + "loss": 0.8904, + "step": 25998 + }, + { + "epoch": 0.6675814167713815, + "grad_norm": 0.7734375, + "learning_rate": 0.00013422834812487893, + "loss": 0.8393, + "step": 25999 + }, + { + "epoch": 0.6676070939673033, + "grad_norm": 0.73828125, + "learning_rate": 0.00013422415354145744, + "loss": 0.7981, + "step": 26000 + }, + { + "epoch": 0.6676070939673033, + "eval_loss": 0.8428560495376587, + "eval_runtime": 384.8264, + "eval_samples_per_second": 25.986, + "eval_steps_per_second": 0.813, + "step": 26000 + }, + { + "epoch": 0.6676327711632252, + "grad_norm": 0.71484375, + "learning_rate": 0.00013421995888983031, + "loss": 0.7936, + "step": 26001 + }, + { + "epoch": 0.6676584483591469, + "grad_norm": 0.80078125, + "learning_rate": 0.00013421576417000592, + "loss": 0.7887, + "step": 26002 + }, + { + "epoch": 0.6676841255550687, + "grad_norm": 0.73046875, + "learning_rate": 0.00013421156938199258, + "loss": 0.6647, + "step": 26003 + }, + { + "epoch": 0.6677098027509906, + "grad_norm": 0.80859375, + "learning_rate": 0.00013420737452579868, + "loss": 0.9763, + "step": 26004 + }, + { + "epoch": 0.6677354799469124, + "grad_norm": 0.74609375, + "learning_rate": 0.00013420317960143258, + "loss": 0.7615, + "step": 26005 + }, + { + "epoch": 0.6677611571428342, + "grad_norm": 0.78125, + "learning_rate": 0.00013419898460890263, + "loss": 0.8959, + "step": 26006 + }, + { + "epoch": 0.6677868343387561, + "grad_norm": 0.83203125, + "learning_rate": 0.0001341947895482172, + "loss": 0.852, + "step": 26007 + }, + { + "epoch": 0.6678125115346779, + "grad_norm": 0.75390625, + "learning_rate": 0.0001341905944193846, + "loss": 0.8634, + "step": 26008 + }, + { + "epoch": 0.6678381887305996, + "grad_norm": 0.7890625, + "learning_rate": 0.00013418639922241327, + "loss": 0.9053, + "step": 26009 + }, + { + "epoch": 0.6678638659265215, + "grad_norm": 0.87109375, + "learning_rate": 0.00013418220395731154, + "loss": 0.9855, + "step": 26010 + }, + { + "epoch": 0.6678895431224433, + "grad_norm": 0.82421875, + "learning_rate": 0.00013417800862408776, + "loss": 0.792, + "step": 26011 + }, + { + "epoch": 0.6679152203183651, + "grad_norm": 0.78125, + "learning_rate": 0.00013417381322275028, + "loss": 0.9099, + "step": 26012 + }, + { + "epoch": 0.667940897514287, + "grad_norm": 0.76953125, + "learning_rate": 0.0001341696177533075, + "loss": 0.9016, + "step": 26013 + }, + { + "epoch": 0.6679665747102088, + "grad_norm": 0.82421875, + "learning_rate": 0.00013416542221576775, + "loss": 0.8905, + "step": 26014 + }, + { + "epoch": 0.6679922519061305, + "grad_norm": 0.77734375, + "learning_rate": 0.0001341612266101394, + "loss": 0.8712, + "step": 26015 + }, + { + "epoch": 0.6680179291020524, + "grad_norm": 0.78125, + "learning_rate": 0.00013415703093643078, + "loss": 0.9079, + "step": 26016 + }, + { + "epoch": 0.6680436062979742, + "grad_norm": 0.7890625, + "learning_rate": 0.0001341528351946503, + "loss": 0.8828, + "step": 26017 + }, + { + "epoch": 0.668069283493896, + "grad_norm": 0.734375, + "learning_rate": 0.00013414863938480633, + "loss": 0.834, + "step": 26018 + }, + { + "epoch": 0.6680949606898179, + "grad_norm": 0.84765625, + "learning_rate": 0.0001341444435069072, + "loss": 0.9479, + "step": 26019 + }, + { + "epoch": 0.6681206378857397, + "grad_norm": 0.80859375, + "learning_rate": 0.00013414024756096127, + "loss": 0.7778, + "step": 26020 + }, + { + "epoch": 0.6681463150816616, + "grad_norm": 0.75, + "learning_rate": 0.0001341360515469769, + "loss": 0.8422, + "step": 26021 + }, + { + "epoch": 0.6681719922775833, + "grad_norm": 0.79296875, + "learning_rate": 0.00013413185546496247, + "loss": 0.8179, + "step": 26022 + }, + { + "epoch": 0.6681976694735051, + "grad_norm": 0.76953125, + "learning_rate": 0.00013412765931492633, + "loss": 0.8081, + "step": 26023 + }, + { + "epoch": 0.668223346669427, + "grad_norm": 0.7578125, + "learning_rate": 0.00013412346309687686, + "loss": 0.9196, + "step": 26024 + }, + { + "epoch": 0.6682490238653488, + "grad_norm": 0.8203125, + "learning_rate": 0.0001341192668108224, + "loss": 1.0207, + "step": 26025 + }, + { + "epoch": 0.6682747010612706, + "grad_norm": 0.74609375, + "learning_rate": 0.00013411507045677133, + "loss": 0.8243, + "step": 26026 + }, + { + "epoch": 0.6683003782571925, + "grad_norm": 0.828125, + "learning_rate": 0.00013411087403473202, + "loss": 0.7471, + "step": 26027 + }, + { + "epoch": 0.6683260554531142, + "grad_norm": 0.83203125, + "learning_rate": 0.0001341066775447128, + "loss": 0.9201, + "step": 26028 + }, + { + "epoch": 0.668351732649036, + "grad_norm": 0.8203125, + "learning_rate": 0.00013410248098672203, + "loss": 0.8743, + "step": 26029 + }, + { + "epoch": 0.6683774098449579, + "grad_norm": 2.96875, + "learning_rate": 0.00013409828436076814, + "loss": 0.7889, + "step": 26030 + }, + { + "epoch": 0.6684030870408797, + "grad_norm": 1.1640625, + "learning_rate": 0.00013409408766685942, + "loss": 0.8608, + "step": 26031 + }, + { + "epoch": 0.6684287642368015, + "grad_norm": 0.75, + "learning_rate": 0.0001340898909050043, + "loss": 0.8012, + "step": 26032 + }, + { + "epoch": 0.6684544414327234, + "grad_norm": 0.74609375, + "learning_rate": 0.00013408569407521108, + "loss": 0.8274, + "step": 26033 + }, + { + "epoch": 0.6684801186286452, + "grad_norm": 0.82421875, + "learning_rate": 0.00013408149717748815, + "loss": 0.81, + "step": 26034 + }, + { + "epoch": 0.6685057958245669, + "grad_norm": 0.98828125, + "learning_rate": 0.00013407730021184388, + "loss": 0.9413, + "step": 26035 + }, + { + "epoch": 0.6685314730204888, + "grad_norm": 0.87890625, + "learning_rate": 0.0001340731031782866, + "loss": 0.9493, + "step": 26036 + }, + { + "epoch": 0.6685571502164106, + "grad_norm": 0.78125, + "learning_rate": 0.00013406890607682475, + "loss": 0.7982, + "step": 26037 + }, + { + "epoch": 0.6685828274123324, + "grad_norm": 0.75, + "learning_rate": 0.00013406470890746662, + "loss": 0.7834, + "step": 26038 + }, + { + "epoch": 0.6686085046082543, + "grad_norm": 0.75, + "learning_rate": 0.00013406051167022063, + "loss": 0.8567, + "step": 26039 + }, + { + "epoch": 0.6686341818041761, + "grad_norm": 0.8125, + "learning_rate": 0.00013405631436509506, + "loss": 0.9336, + "step": 26040 + }, + { + "epoch": 0.668659859000098, + "grad_norm": 0.7890625, + "learning_rate": 0.0001340521169920984, + "loss": 0.888, + "step": 26041 + }, + { + "epoch": 0.6686855361960197, + "grad_norm": 0.73046875, + "learning_rate": 0.0001340479195512389, + "loss": 0.771, + "step": 26042 + }, + { + "epoch": 0.6687112133919415, + "grad_norm": 0.84375, + "learning_rate": 0.00013404372204252498, + "loss": 0.8565, + "step": 26043 + }, + { + "epoch": 0.6687368905878633, + "grad_norm": 0.75, + "learning_rate": 0.00013403952446596498, + "loss": 0.809, + "step": 26044 + }, + { + "epoch": 0.6687625677837852, + "grad_norm": 0.95703125, + "learning_rate": 0.00013403532682156735, + "loss": 0.9777, + "step": 26045 + }, + { + "epoch": 0.668788244979707, + "grad_norm": 0.7421875, + "learning_rate": 0.0001340311291093403, + "loss": 0.7878, + "step": 26046 + }, + { + "epoch": 0.6688139221756288, + "grad_norm": 0.8125, + "learning_rate": 0.00013402693132929234, + "loss": 0.8521, + "step": 26047 + }, + { + "epoch": 0.6688395993715506, + "grad_norm": 0.82421875, + "learning_rate": 0.00013402273348143179, + "loss": 0.8076, + "step": 26048 + }, + { + "epoch": 0.6688652765674724, + "grad_norm": 0.77734375, + "learning_rate": 0.00013401853556576696, + "loss": 0.8893, + "step": 26049 + }, + { + "epoch": 0.6688909537633942, + "grad_norm": 0.86328125, + "learning_rate": 0.00013401433758230627, + "loss": 0.9369, + "step": 26050 + }, + { + "epoch": 0.6689166309593161, + "grad_norm": 0.7421875, + "learning_rate": 0.0001340101395310581, + "loss": 0.7725, + "step": 26051 + }, + { + "epoch": 0.6689423081552379, + "grad_norm": 0.78125, + "learning_rate": 0.0001340059414120308, + "loss": 0.9071, + "step": 26052 + }, + { + "epoch": 0.6689679853511598, + "grad_norm": 0.75, + "learning_rate": 0.0001340017432252327, + "loss": 0.9261, + "step": 26053 + }, + { + "epoch": 0.6689936625470816, + "grad_norm": 0.81640625, + "learning_rate": 0.0001339975449706722, + "loss": 0.7855, + "step": 26054 + }, + { + "epoch": 0.6690193397430033, + "grad_norm": 0.76953125, + "learning_rate": 0.0001339933466483577, + "loss": 0.7539, + "step": 26055 + }, + { + "epoch": 0.6690450169389252, + "grad_norm": 0.76953125, + "learning_rate": 0.0001339891482582975, + "loss": 0.6943, + "step": 26056 + }, + { + "epoch": 0.669070694134847, + "grad_norm": 0.78125, + "learning_rate": 0.0001339849498005, + "loss": 0.8073, + "step": 26057 + }, + { + "epoch": 0.6690963713307688, + "grad_norm": 0.7734375, + "learning_rate": 0.00013398075127497362, + "loss": 0.8166, + "step": 26058 + }, + { + "epoch": 0.6691220485266907, + "grad_norm": 0.7734375, + "learning_rate": 0.0001339765526817266, + "loss": 0.7849, + "step": 26059 + }, + { + "epoch": 0.6691477257226125, + "grad_norm": 0.83984375, + "learning_rate": 0.0001339723540207674, + "loss": 1.0086, + "step": 26060 + }, + { + "epoch": 0.6691734029185343, + "grad_norm": 0.734375, + "learning_rate": 0.00013396815529210436, + "loss": 0.735, + "step": 26061 + }, + { + "epoch": 0.6691990801144561, + "grad_norm": 1.1171875, + "learning_rate": 0.00013396395649574588, + "loss": 0.7684, + "step": 26062 + }, + { + "epoch": 0.6692247573103779, + "grad_norm": 0.80078125, + "learning_rate": 0.00013395975763170025, + "loss": 0.8982, + "step": 26063 + }, + { + "epoch": 0.6692504345062997, + "grad_norm": 0.72265625, + "learning_rate": 0.00013395555869997593, + "loss": 0.8446, + "step": 26064 + }, + { + "epoch": 0.6692761117022216, + "grad_norm": 0.73828125, + "learning_rate": 0.00013395135970058127, + "loss": 0.7509, + "step": 26065 + }, + { + "epoch": 0.6693017888981434, + "grad_norm": 0.7578125, + "learning_rate": 0.00013394716063352459, + "loss": 0.7541, + "step": 26066 + }, + { + "epoch": 0.6693274660940652, + "grad_norm": 0.7265625, + "learning_rate": 0.00013394296149881428, + "loss": 0.8932, + "step": 26067 + }, + { + "epoch": 0.669353143289987, + "grad_norm": 0.8203125, + "learning_rate": 0.0001339387622964587, + "loss": 0.792, + "step": 26068 + }, + { + "epoch": 0.6693788204859088, + "grad_norm": 0.8125, + "learning_rate": 0.00013393456302646626, + "loss": 0.9226, + "step": 26069 + }, + { + "epoch": 0.6694044976818306, + "grad_norm": 0.75390625, + "learning_rate": 0.00013393036368884533, + "loss": 0.7573, + "step": 26070 + }, + { + "epoch": 0.6694301748777525, + "grad_norm": 0.765625, + "learning_rate": 0.0001339261642836042, + "loss": 0.898, + "step": 26071 + }, + { + "epoch": 0.6694558520736743, + "grad_norm": 0.71875, + "learning_rate": 0.00013392196481075132, + "loss": 0.8069, + "step": 26072 + }, + { + "epoch": 0.6694815292695961, + "grad_norm": 0.84765625, + "learning_rate": 0.000133917765270295, + "loss": 1.039, + "step": 26073 + }, + { + "epoch": 0.669507206465518, + "grad_norm": 0.82421875, + "learning_rate": 0.00013391356566224365, + "loss": 0.9047, + "step": 26074 + }, + { + "epoch": 0.6695328836614397, + "grad_norm": 0.890625, + "learning_rate": 0.0001339093659866056, + "loss": 0.8636, + "step": 26075 + }, + { + "epoch": 0.6695585608573615, + "grad_norm": 0.6875, + "learning_rate": 0.00013390516624338928, + "loss": 0.7832, + "step": 26076 + }, + { + "epoch": 0.6695842380532834, + "grad_norm": 0.77734375, + "learning_rate": 0.00013390096643260303, + "loss": 0.7874, + "step": 26077 + }, + { + "epoch": 0.6696099152492052, + "grad_norm": 0.70703125, + "learning_rate": 0.0001338967665542552, + "loss": 0.7735, + "step": 26078 + }, + { + "epoch": 0.669635592445127, + "grad_norm": 0.796875, + "learning_rate": 0.00013389256660835421, + "loss": 0.7964, + "step": 26079 + }, + { + "epoch": 0.6696612696410489, + "grad_norm": 0.828125, + "learning_rate": 0.00013388836659490835, + "loss": 0.8452, + "step": 26080 + }, + { + "epoch": 0.6696869468369707, + "grad_norm": 0.76171875, + "learning_rate": 0.00013388416651392607, + "loss": 0.8318, + "step": 26081 + }, + { + "epoch": 0.6697126240328924, + "grad_norm": 0.8359375, + "learning_rate": 0.0001338799663654157, + "loss": 0.8627, + "step": 26082 + }, + { + "epoch": 0.6697383012288143, + "grad_norm": 0.7734375, + "learning_rate": 0.0001338757661493856, + "loss": 0.7606, + "step": 26083 + }, + { + "epoch": 0.6697639784247361, + "grad_norm": 0.8515625, + "learning_rate": 0.00013387156586584415, + "loss": 0.8776, + "step": 26084 + }, + { + "epoch": 0.669789655620658, + "grad_norm": 0.75, + "learning_rate": 0.00013386736551479977, + "loss": 0.9072, + "step": 26085 + }, + { + "epoch": 0.6698153328165798, + "grad_norm": 0.71875, + "learning_rate": 0.00013386316509626076, + "loss": 0.7279, + "step": 26086 + }, + { + "epoch": 0.6698410100125016, + "grad_norm": 0.7421875, + "learning_rate": 0.00013385896461023554, + "loss": 0.8001, + "step": 26087 + }, + { + "epoch": 0.6698666872084234, + "grad_norm": 0.8125, + "learning_rate": 0.00013385476405673242, + "loss": 0.7714, + "step": 26088 + }, + { + "epoch": 0.6698923644043452, + "grad_norm": 1.890625, + "learning_rate": 0.00013385056343575985, + "loss": 0.8339, + "step": 26089 + }, + { + "epoch": 0.669918041600267, + "grad_norm": 0.86328125, + "learning_rate": 0.00013384636274732615, + "loss": 0.8902, + "step": 26090 + }, + { + "epoch": 0.6699437187961889, + "grad_norm": 0.7890625, + "learning_rate": 0.00013384216199143972, + "loss": 0.8835, + "step": 26091 + }, + { + "epoch": 0.6699693959921107, + "grad_norm": 0.7578125, + "learning_rate": 0.0001338379611681089, + "loss": 0.9276, + "step": 26092 + }, + { + "epoch": 0.6699950731880325, + "grad_norm": 0.80859375, + "learning_rate": 0.00013383376027734212, + "loss": 0.7834, + "step": 26093 + }, + { + "epoch": 0.6700207503839544, + "grad_norm": 0.82421875, + "learning_rate": 0.00013382955931914766, + "loss": 0.8661, + "step": 26094 + }, + { + "epoch": 0.6700464275798761, + "grad_norm": 0.6953125, + "learning_rate": 0.000133825358293534, + "loss": 0.8233, + "step": 26095 + }, + { + "epoch": 0.6700721047757979, + "grad_norm": 0.7578125, + "learning_rate": 0.0001338211572005094, + "loss": 0.7954, + "step": 26096 + }, + { + "epoch": 0.6700977819717198, + "grad_norm": 0.765625, + "learning_rate": 0.0001338169560400823, + "loss": 0.871, + "step": 26097 + }, + { + "epoch": 0.6701234591676416, + "grad_norm": 0.79296875, + "learning_rate": 0.0001338127548122611, + "loss": 0.7942, + "step": 26098 + }, + { + "epoch": 0.6701491363635634, + "grad_norm": 0.71484375, + "learning_rate": 0.00013380855351705412, + "loss": 0.8372, + "step": 26099 + }, + { + "epoch": 0.6701748135594853, + "grad_norm": 0.7578125, + "learning_rate": 0.0001338043521544697, + "loss": 0.8091, + "step": 26100 + }, + { + "epoch": 0.6702004907554071, + "grad_norm": 0.8046875, + "learning_rate": 0.0001338001507245163, + "loss": 0.9792, + "step": 26101 + }, + { + "epoch": 0.6702261679513288, + "grad_norm": 0.76171875, + "learning_rate": 0.00013379594922720227, + "loss": 0.9328, + "step": 26102 + }, + { + "epoch": 0.6702518451472507, + "grad_norm": 0.76171875, + "learning_rate": 0.00013379174766253593, + "loss": 0.8843, + "step": 26103 + }, + { + "epoch": 0.6702775223431725, + "grad_norm": 0.890625, + "learning_rate": 0.0001337875460305257, + "loss": 0.9591, + "step": 26104 + }, + { + "epoch": 0.6703031995390943, + "grad_norm": 0.7734375, + "learning_rate": 0.00013378334433117997, + "loss": 0.7536, + "step": 26105 + }, + { + "epoch": 0.6703288767350162, + "grad_norm": 0.73046875, + "learning_rate": 0.00013377914256450704, + "loss": 0.959, + "step": 26106 + }, + { + "epoch": 0.670354553930938, + "grad_norm": 0.8515625, + "learning_rate": 0.00013377494073051537, + "loss": 0.9222, + "step": 26107 + }, + { + "epoch": 0.6703802311268597, + "grad_norm": 0.73046875, + "learning_rate": 0.00013377073882921324, + "loss": 0.8312, + "step": 26108 + }, + { + "epoch": 0.6704059083227816, + "grad_norm": 0.7421875, + "learning_rate": 0.00013376653686060914, + "loss": 0.7764, + "step": 26109 + }, + { + "epoch": 0.6704315855187034, + "grad_norm": 0.796875, + "learning_rate": 0.00013376233482471136, + "loss": 0.8749, + "step": 26110 + }, + { + "epoch": 0.6704572627146252, + "grad_norm": 0.8203125, + "learning_rate": 0.00013375813272152828, + "loss": 0.8759, + "step": 26111 + }, + { + "epoch": 0.6704829399105471, + "grad_norm": 0.84765625, + "learning_rate": 0.0001337539305510683, + "loss": 0.8419, + "step": 26112 + }, + { + "epoch": 0.6705086171064689, + "grad_norm": 0.78515625, + "learning_rate": 0.0001337497283133398, + "loss": 0.8228, + "step": 26113 + }, + { + "epoch": 0.6705342943023908, + "grad_norm": 0.78515625, + "learning_rate": 0.00013374552600835113, + "loss": 0.7438, + "step": 26114 + }, + { + "epoch": 0.6705599714983125, + "grad_norm": 0.7421875, + "learning_rate": 0.00013374132363611066, + "loss": 0.7605, + "step": 26115 + }, + { + "epoch": 0.6705856486942343, + "grad_norm": 0.8203125, + "learning_rate": 0.00013373712119662677, + "loss": 0.9047, + "step": 26116 + }, + { + "epoch": 0.6706113258901562, + "grad_norm": 0.83203125, + "learning_rate": 0.0001337329186899079, + "loss": 0.781, + "step": 26117 + }, + { + "epoch": 0.670637003086078, + "grad_norm": 0.74609375, + "learning_rate": 0.00013372871611596235, + "loss": 0.8303, + "step": 26118 + }, + { + "epoch": 0.6706626802819998, + "grad_norm": 0.73046875, + "learning_rate": 0.0001337245134747985, + "loss": 0.8807, + "step": 26119 + }, + { + "epoch": 0.6706883574779217, + "grad_norm": 0.75, + "learning_rate": 0.00013372031076642476, + "loss": 0.9097, + "step": 26120 + }, + { + "epoch": 0.6707140346738435, + "grad_norm": 0.8515625, + "learning_rate": 0.00013371610799084945, + "loss": 0.8257, + "step": 26121 + }, + { + "epoch": 0.6707397118697652, + "grad_norm": 0.80859375, + "learning_rate": 0.00013371190514808098, + "loss": 0.8818, + "step": 26122 + }, + { + "epoch": 0.6707653890656871, + "grad_norm": 0.76953125, + "learning_rate": 0.00013370770223812775, + "loss": 0.8234, + "step": 26123 + }, + { + "epoch": 0.6707910662616089, + "grad_norm": 0.78515625, + "learning_rate": 0.00013370349926099815, + "loss": 0.8695, + "step": 26124 + }, + { + "epoch": 0.6708167434575307, + "grad_norm": 0.8203125, + "learning_rate": 0.00013369929621670048, + "loss": 0.8102, + "step": 26125 + }, + { + "epoch": 0.6708424206534526, + "grad_norm": 0.859375, + "learning_rate": 0.00013369509310524316, + "loss": 0.9363, + "step": 26126 + }, + { + "epoch": 0.6708680978493744, + "grad_norm": 0.81640625, + "learning_rate": 0.00013369088992663456, + "loss": 0.8639, + "step": 26127 + }, + { + "epoch": 0.6708937750452961, + "grad_norm": 0.80078125, + "learning_rate": 0.00013368668668088307, + "loss": 0.8237, + "step": 26128 + }, + { + "epoch": 0.670919452241218, + "grad_norm": 0.765625, + "learning_rate": 0.00013368248336799703, + "loss": 0.7234, + "step": 26129 + }, + { + "epoch": 0.6709451294371398, + "grad_norm": 0.78515625, + "learning_rate": 0.00013367827998798487, + "loss": 0.9015, + "step": 26130 + }, + { + "epoch": 0.6709708066330616, + "grad_norm": 0.7265625, + "learning_rate": 0.00013367407654085495, + "loss": 0.7741, + "step": 26131 + }, + { + "epoch": 0.6709964838289835, + "grad_norm": 0.80078125, + "learning_rate": 0.00013366987302661562, + "loss": 0.8099, + "step": 26132 + }, + { + "epoch": 0.6710221610249053, + "grad_norm": 0.78515625, + "learning_rate": 0.0001336656694452753, + "loss": 0.8692, + "step": 26133 + }, + { + "epoch": 0.6710478382208271, + "grad_norm": 0.76171875, + "learning_rate": 0.0001336614657968423, + "loss": 0.8551, + "step": 26134 + }, + { + "epoch": 0.6710735154167489, + "grad_norm": 0.75, + "learning_rate": 0.00013365726208132506, + "loss": 0.8369, + "step": 26135 + }, + { + "epoch": 0.6710991926126707, + "grad_norm": 0.83203125, + "learning_rate": 0.00013365305829873193, + "loss": 0.822, + "step": 26136 + }, + { + "epoch": 0.6711248698085925, + "grad_norm": 0.80859375, + "learning_rate": 0.0001336488544490713, + "loss": 0.9702, + "step": 26137 + }, + { + "epoch": 0.6711505470045144, + "grad_norm": 0.703125, + "learning_rate": 0.00013364465053235155, + "loss": 0.8104, + "step": 26138 + }, + { + "epoch": 0.6711762242004362, + "grad_norm": 0.76171875, + "learning_rate": 0.00013364044654858105, + "loss": 0.8946, + "step": 26139 + }, + { + "epoch": 0.671201901396358, + "grad_norm": 0.765625, + "learning_rate": 0.00013363624249776817, + "loss": 0.8135, + "step": 26140 + }, + { + "epoch": 0.6712275785922799, + "grad_norm": 0.78125, + "learning_rate": 0.00013363203837992132, + "loss": 0.8112, + "step": 26141 + }, + { + "epoch": 0.6712532557882016, + "grad_norm": 0.7109375, + "learning_rate": 0.0001336278341950488, + "loss": 0.7355, + "step": 26142 + }, + { + "epoch": 0.6712789329841234, + "grad_norm": 0.7578125, + "learning_rate": 0.00013362362994315908, + "loss": 0.6864, + "step": 26143 + }, + { + "epoch": 0.6713046101800453, + "grad_norm": 0.78125, + "learning_rate": 0.00013361942562426052, + "loss": 1.0435, + "step": 26144 + }, + { + "epoch": 0.6713302873759671, + "grad_norm": 0.75390625, + "learning_rate": 0.00013361522123836147, + "loss": 0.7383, + "step": 26145 + }, + { + "epoch": 0.671355964571889, + "grad_norm": 0.75390625, + "learning_rate": 0.0001336110167854703, + "loss": 0.8218, + "step": 26146 + }, + { + "epoch": 0.6713816417678108, + "grad_norm": 0.73046875, + "learning_rate": 0.00013360681226559545, + "loss": 0.773, + "step": 26147 + }, + { + "epoch": 0.6714073189637325, + "grad_norm": 0.765625, + "learning_rate": 0.00013360260767874519, + "loss": 0.9, + "step": 26148 + }, + { + "epoch": 0.6714329961596543, + "grad_norm": 0.81640625, + "learning_rate": 0.00013359840302492804, + "loss": 0.8475, + "step": 26149 + }, + { + "epoch": 0.6714586733555762, + "grad_norm": 0.84375, + "learning_rate": 0.00013359419830415227, + "loss": 0.8462, + "step": 26150 + }, + { + "epoch": 0.671484350551498, + "grad_norm": 0.80859375, + "learning_rate": 0.0001335899935164263, + "loss": 0.8975, + "step": 26151 + }, + { + "epoch": 0.6715100277474199, + "grad_norm": 0.78125, + "learning_rate": 0.00013358578866175851, + "loss": 0.8731, + "step": 26152 + }, + { + "epoch": 0.6715357049433417, + "grad_norm": 0.765625, + "learning_rate": 0.00013358158374015728, + "loss": 0.7614, + "step": 26153 + }, + { + "epoch": 0.6715613821392635, + "grad_norm": 0.88671875, + "learning_rate": 0.00013357737875163098, + "loss": 0.8326, + "step": 26154 + }, + { + "epoch": 0.6715870593351853, + "grad_norm": 0.76953125, + "learning_rate": 0.000133573173696188, + "loss": 0.8459, + "step": 26155 + }, + { + "epoch": 0.6716127365311071, + "grad_norm": 0.9921875, + "learning_rate": 0.00013356896857383673, + "loss": 0.7246, + "step": 26156 + }, + { + "epoch": 0.6716384137270289, + "grad_norm": 0.85546875, + "learning_rate": 0.00013356476338458555, + "loss": 0.811, + "step": 26157 + }, + { + "epoch": 0.6716640909229508, + "grad_norm": 0.7734375, + "learning_rate": 0.00013356055812844278, + "loss": 0.7807, + "step": 26158 + }, + { + "epoch": 0.6716897681188726, + "grad_norm": 0.85546875, + "learning_rate": 0.0001335563528054169, + "loss": 0.9988, + "step": 26159 + }, + { + "epoch": 0.6717154453147944, + "grad_norm": 0.91796875, + "learning_rate": 0.0001335521474155162, + "loss": 0.979, + "step": 26160 + }, + { + "epoch": 0.6717411225107163, + "grad_norm": 0.765625, + "learning_rate": 0.00013354794195874912, + "loss": 0.7775, + "step": 26161 + }, + { + "epoch": 0.671766799706638, + "grad_norm": 0.83203125, + "learning_rate": 0.000133543736435124, + "loss": 0.7728, + "step": 26162 + }, + { + "epoch": 0.6717924769025598, + "grad_norm": 0.81640625, + "learning_rate": 0.0001335395308446493, + "loss": 0.7759, + "step": 26163 + }, + { + "epoch": 0.6718181540984817, + "grad_norm": 0.75390625, + "learning_rate": 0.00013353532518733328, + "loss": 0.7066, + "step": 26164 + }, + { + "epoch": 0.6718438312944035, + "grad_norm": 0.8125, + "learning_rate": 0.00013353111946318443, + "loss": 0.7983, + "step": 26165 + }, + { + "epoch": 0.6718695084903253, + "grad_norm": 0.8125, + "learning_rate": 0.00013352691367221107, + "loss": 0.8607, + "step": 26166 + }, + { + "epoch": 0.6718951856862472, + "grad_norm": 0.9921875, + "learning_rate": 0.0001335227078144216, + "loss": 0.7652, + "step": 26167 + }, + { + "epoch": 0.6719208628821689, + "grad_norm": 0.80078125, + "learning_rate": 0.0001335185018898244, + "loss": 0.8934, + "step": 26168 + }, + { + "epoch": 0.6719465400780907, + "grad_norm": 0.9140625, + "learning_rate": 0.00013351429589842785, + "loss": 0.8406, + "step": 26169 + }, + { + "epoch": 0.6719722172740126, + "grad_norm": 0.80859375, + "learning_rate": 0.00013351008984024035, + "loss": 0.8643, + "step": 26170 + }, + { + "epoch": 0.6719978944699344, + "grad_norm": 1.265625, + "learning_rate": 0.00013350588371527025, + "loss": 0.8144, + "step": 26171 + }, + { + "epoch": 0.6720235716658562, + "grad_norm": 0.83203125, + "learning_rate": 0.00013350167752352595, + "loss": 0.8348, + "step": 26172 + }, + { + "epoch": 0.6720492488617781, + "grad_norm": 0.7265625, + "learning_rate": 0.00013349747126501586, + "loss": 0.754, + "step": 26173 + }, + { + "epoch": 0.6720749260576999, + "grad_norm": 0.75, + "learning_rate": 0.0001334932649397483, + "loss": 0.8768, + "step": 26174 + }, + { + "epoch": 0.6721006032536216, + "grad_norm": 0.828125, + "learning_rate": 0.0001334890585477317, + "loss": 0.8986, + "step": 26175 + }, + { + "epoch": 0.6721262804495435, + "grad_norm": 0.70703125, + "learning_rate": 0.00013348485208897445, + "loss": 0.8051, + "step": 26176 + }, + { + "epoch": 0.6721519576454653, + "grad_norm": 0.69140625, + "learning_rate": 0.00013348064556348491, + "loss": 0.7261, + "step": 26177 + }, + { + "epoch": 0.6721776348413872, + "grad_norm": 0.765625, + "learning_rate": 0.00013347643897127146, + "loss": 0.8981, + "step": 26178 + }, + { + "epoch": 0.672203312037309, + "grad_norm": 0.78125, + "learning_rate": 0.00013347223231234247, + "loss": 0.8231, + "step": 26179 + }, + { + "epoch": 0.6722289892332308, + "grad_norm": 0.75390625, + "learning_rate": 0.0001334680255867064, + "loss": 0.7588, + "step": 26180 + }, + { + "epoch": 0.6722546664291527, + "grad_norm": 0.83203125, + "learning_rate": 0.00013346381879437153, + "loss": 0.8007, + "step": 26181 + }, + { + "epoch": 0.6722803436250744, + "grad_norm": 0.875, + "learning_rate": 0.00013345961193534628, + "loss": 0.9615, + "step": 26182 + }, + { + "epoch": 0.6723060208209962, + "grad_norm": 0.7265625, + "learning_rate": 0.00013345540500963912, + "loss": 0.8129, + "step": 26183 + }, + { + "epoch": 0.6723316980169181, + "grad_norm": 0.85546875, + "learning_rate": 0.0001334511980172583, + "loss": 0.9984, + "step": 26184 + }, + { + "epoch": 0.6723573752128399, + "grad_norm": 0.74609375, + "learning_rate": 0.00013344699095821228, + "loss": 0.9372, + "step": 26185 + }, + { + "epoch": 0.6723830524087617, + "grad_norm": 0.79296875, + "learning_rate": 0.0001334427838325094, + "loss": 0.8464, + "step": 26186 + }, + { + "epoch": 0.6724087296046836, + "grad_norm": 0.7734375, + "learning_rate": 0.00013343857664015811, + "loss": 0.8968, + "step": 26187 + }, + { + "epoch": 0.6724344068006053, + "grad_norm": 0.72265625, + "learning_rate": 0.00013343436938116672, + "loss": 0.7526, + "step": 26188 + }, + { + "epoch": 0.6724600839965271, + "grad_norm": 0.84375, + "learning_rate": 0.00013343016205554368, + "loss": 0.8181, + "step": 26189 + }, + { + "epoch": 0.672485761192449, + "grad_norm": 0.83203125, + "learning_rate": 0.00013342595466329734, + "loss": 0.9861, + "step": 26190 + }, + { + "epoch": 0.6725114383883708, + "grad_norm": 0.80859375, + "learning_rate": 0.00013342174720443612, + "loss": 0.7553, + "step": 26191 + }, + { + "epoch": 0.6725371155842926, + "grad_norm": 0.8125, + "learning_rate": 0.00013341753967896836, + "loss": 0.9368, + "step": 26192 + }, + { + "epoch": 0.6725627927802145, + "grad_norm": 0.76953125, + "learning_rate": 0.00013341333208690244, + "loss": 0.846, + "step": 26193 + }, + { + "epoch": 0.6725884699761363, + "grad_norm": 0.79296875, + "learning_rate": 0.00013340912442824678, + "loss": 0.8791, + "step": 26194 + }, + { + "epoch": 0.672614147172058, + "grad_norm": 0.90625, + "learning_rate": 0.00013340491670300975, + "loss": 0.7482, + "step": 26195 + }, + { + "epoch": 0.6726398243679799, + "grad_norm": 0.73046875, + "learning_rate": 0.00013340070891119975, + "loss": 0.7628, + "step": 26196 + }, + { + "epoch": 0.6726655015639017, + "grad_norm": 0.7734375, + "learning_rate": 0.00013339650105282516, + "loss": 0.8387, + "step": 26197 + }, + { + "epoch": 0.6726911787598235, + "grad_norm": 0.734375, + "learning_rate": 0.00013339229312789433, + "loss": 0.8092, + "step": 26198 + }, + { + "epoch": 0.6727168559557454, + "grad_norm": 0.73046875, + "learning_rate": 0.00013338808513641568, + "loss": 0.8238, + "step": 26199 + }, + { + "epoch": 0.6727425331516672, + "grad_norm": 0.7578125, + "learning_rate": 0.00013338387707839761, + "loss": 0.8639, + "step": 26200 + }, + { + "epoch": 0.672768210347589, + "grad_norm": 0.74609375, + "learning_rate": 0.00013337966895384846, + "loss": 0.7962, + "step": 26201 + }, + { + "epoch": 0.6727938875435108, + "grad_norm": 0.8046875, + "learning_rate": 0.00013337546076277668, + "loss": 0.8196, + "step": 26202 + }, + { + "epoch": 0.6728195647394326, + "grad_norm": 0.75390625, + "learning_rate": 0.00013337125250519063, + "loss": 0.8327, + "step": 26203 + }, + { + "epoch": 0.6728452419353544, + "grad_norm": 0.81640625, + "learning_rate": 0.00013336704418109868, + "loss": 0.9455, + "step": 26204 + }, + { + "epoch": 0.6728709191312763, + "grad_norm": 0.796875, + "learning_rate": 0.0001333628357905092, + "loss": 0.713, + "step": 26205 + }, + { + "epoch": 0.6728965963271981, + "grad_norm": 0.82421875, + "learning_rate": 0.0001333586273334306, + "loss": 0.8583, + "step": 26206 + }, + { + "epoch": 0.67292227352312, + "grad_norm": 0.7578125, + "learning_rate": 0.00013335441880987128, + "loss": 0.8253, + "step": 26207 + }, + { + "epoch": 0.6729479507190417, + "grad_norm": 0.8359375, + "learning_rate": 0.00013335021021983963, + "loss": 0.933, + "step": 26208 + }, + { + "epoch": 0.6729736279149635, + "grad_norm": 0.83203125, + "learning_rate": 0.000133346001563344, + "loss": 0.8161, + "step": 26209 + }, + { + "epoch": 0.6729993051108853, + "grad_norm": 0.86328125, + "learning_rate": 0.00013334179284039282, + "loss": 0.9011, + "step": 26210 + }, + { + "epoch": 0.6730249823068072, + "grad_norm": 0.7421875, + "learning_rate": 0.00013333758405099444, + "loss": 0.8165, + "step": 26211 + }, + { + "epoch": 0.673050659502729, + "grad_norm": 0.7578125, + "learning_rate": 0.0001333333751951573, + "loss": 0.7488, + "step": 26212 + }, + { + "epoch": 0.6730763366986509, + "grad_norm": 0.7578125, + "learning_rate": 0.00013332916627288973, + "loss": 0.7511, + "step": 26213 + }, + { + "epoch": 0.6731020138945727, + "grad_norm": 0.765625, + "learning_rate": 0.00013332495728420014, + "loss": 0.7946, + "step": 26214 + }, + { + "epoch": 0.6731276910904944, + "grad_norm": 0.828125, + "learning_rate": 0.00013332074822909693, + "loss": 0.7961, + "step": 26215 + }, + { + "epoch": 0.6731533682864163, + "grad_norm": 0.71875, + "learning_rate": 0.00013331653910758848, + "loss": 0.8272, + "step": 26216 + }, + { + "epoch": 0.6731790454823381, + "grad_norm": 0.72265625, + "learning_rate": 0.00013331232991968317, + "loss": 0.8322, + "step": 26217 + }, + { + "epoch": 0.6732047226782599, + "grad_norm": 0.73046875, + "learning_rate": 0.00013330812066538942, + "loss": 0.8718, + "step": 26218 + }, + { + "epoch": 0.6732303998741818, + "grad_norm": 0.7578125, + "learning_rate": 0.00013330391134471552, + "loss": 0.7709, + "step": 26219 + }, + { + "epoch": 0.6732560770701036, + "grad_norm": 0.765625, + "learning_rate": 0.00013329970195767002, + "loss": 0.8379, + "step": 26220 + }, + { + "epoch": 0.6732817542660254, + "grad_norm": 0.78125, + "learning_rate": 0.00013329549250426116, + "loss": 0.8876, + "step": 26221 + }, + { + "epoch": 0.6733074314619472, + "grad_norm": 0.74609375, + "learning_rate": 0.0001332912829844974, + "loss": 0.7567, + "step": 26222 + }, + { + "epoch": 0.673333108657869, + "grad_norm": 0.984375, + "learning_rate": 0.00013328707339838717, + "loss": 0.8785, + "step": 26223 + }, + { + "epoch": 0.6733587858537908, + "grad_norm": 0.80078125, + "learning_rate": 0.00013328286374593876, + "loss": 0.8268, + "step": 26224 + }, + { + "epoch": 0.6733844630497127, + "grad_norm": 0.8203125, + "learning_rate": 0.00013327865402716063, + "loss": 0.8809, + "step": 26225 + }, + { + "epoch": 0.6734101402456345, + "grad_norm": 0.74609375, + "learning_rate": 0.00013327444424206114, + "loss": 0.8358, + "step": 26226 + }, + { + "epoch": 0.6734358174415563, + "grad_norm": 0.7421875, + "learning_rate": 0.0001332702343906487, + "loss": 0.7992, + "step": 26227 + }, + { + "epoch": 0.6734614946374781, + "grad_norm": 0.81640625, + "learning_rate": 0.00013326602447293165, + "loss": 1.0073, + "step": 26228 + }, + { + "epoch": 0.6734871718333999, + "grad_norm": 0.76171875, + "learning_rate": 0.0001332618144889185, + "loss": 0.8745, + "step": 26229 + }, + { + "epoch": 0.6735128490293217, + "grad_norm": 0.97265625, + "learning_rate": 0.00013325760443861747, + "loss": 0.8212, + "step": 26230 + }, + { + "epoch": 0.6735385262252436, + "grad_norm": 0.74609375, + "learning_rate": 0.0001332533943220371, + "loss": 0.7265, + "step": 26231 + }, + { + "epoch": 0.6735642034211654, + "grad_norm": 0.70703125, + "learning_rate": 0.0001332491841391857, + "loss": 0.8545, + "step": 26232 + }, + { + "epoch": 0.6735898806170872, + "grad_norm": 0.734375, + "learning_rate": 0.00013324497389007167, + "loss": 0.7439, + "step": 26233 + }, + { + "epoch": 0.6736155578130091, + "grad_norm": 0.79296875, + "learning_rate": 0.0001332407635747034, + "loss": 0.8633, + "step": 26234 + }, + { + "epoch": 0.6736412350089308, + "grad_norm": 0.828125, + "learning_rate": 0.00013323655319308933, + "loss": 0.9243, + "step": 26235 + }, + { + "epoch": 0.6736669122048526, + "grad_norm": 0.8203125, + "learning_rate": 0.00013323234274523776, + "loss": 0.8236, + "step": 26236 + }, + { + "epoch": 0.6736925894007745, + "grad_norm": 0.8046875, + "learning_rate": 0.00013322813223115717, + "loss": 1.0274, + "step": 26237 + }, + { + "epoch": 0.6737182665966963, + "grad_norm": 0.82421875, + "learning_rate": 0.00013322392165085592, + "loss": 0.8453, + "step": 26238 + }, + { + "epoch": 0.6737439437926181, + "grad_norm": 0.83984375, + "learning_rate": 0.00013321971100434236, + "loss": 0.7608, + "step": 26239 + }, + { + "epoch": 0.67376962098854, + "grad_norm": 0.8203125, + "learning_rate": 0.00013321550029162492, + "loss": 0.7949, + "step": 26240 + }, + { + "epoch": 0.6737952981844617, + "grad_norm": 0.73046875, + "learning_rate": 0.000133211289512712, + "loss": 0.7772, + "step": 26241 + }, + { + "epoch": 0.6738209753803835, + "grad_norm": 0.734375, + "learning_rate": 0.000133207078667612, + "loss": 0.7252, + "step": 26242 + }, + { + "epoch": 0.6738466525763054, + "grad_norm": 0.8203125, + "learning_rate": 0.00013320286775633328, + "loss": 0.8736, + "step": 26243 + }, + { + "epoch": 0.6738723297722272, + "grad_norm": 0.83984375, + "learning_rate": 0.00013319865677888424, + "loss": 0.8403, + "step": 26244 + }, + { + "epoch": 0.673898006968149, + "grad_norm": 0.76953125, + "learning_rate": 0.00013319444573527326, + "loss": 0.9831, + "step": 26245 + }, + { + "epoch": 0.6739236841640709, + "grad_norm": 1.046875, + "learning_rate": 0.00013319023462550876, + "loss": 0.827, + "step": 26246 + }, + { + "epoch": 0.6739493613599927, + "grad_norm": 0.81640625, + "learning_rate": 0.00013318602344959915, + "loss": 0.9503, + "step": 26247 + }, + { + "epoch": 0.6739750385559145, + "grad_norm": 0.75390625, + "learning_rate": 0.00013318181220755274, + "loss": 0.7221, + "step": 26248 + }, + { + "epoch": 0.6740007157518363, + "grad_norm": 0.78515625, + "learning_rate": 0.000133177600899378, + "loss": 0.8203, + "step": 26249 + }, + { + "epoch": 0.6740263929477581, + "grad_norm": 0.87890625, + "learning_rate": 0.00013317338952508332, + "loss": 0.817, + "step": 26250 + }, + { + "epoch": 0.67405207014368, + "grad_norm": 0.73046875, + "learning_rate": 0.00013316917808467704, + "loss": 0.8067, + "step": 26251 + }, + { + "epoch": 0.6740777473396018, + "grad_norm": 0.796875, + "learning_rate": 0.0001331649665781676, + "loss": 0.7974, + "step": 26252 + }, + { + "epoch": 0.6741034245355236, + "grad_norm": 0.90625, + "learning_rate": 0.00013316075500556336, + "loss": 0.7912, + "step": 26253 + }, + { + "epoch": 0.6741291017314455, + "grad_norm": 0.8125, + "learning_rate": 0.00013315654336687276, + "loss": 0.8372, + "step": 26254 + }, + { + "epoch": 0.6741547789273672, + "grad_norm": 0.83984375, + "learning_rate": 0.00013315233166210415, + "loss": 0.9095, + "step": 26255 + }, + { + "epoch": 0.674180456123289, + "grad_norm": 0.7890625, + "learning_rate": 0.00013314811989126592, + "loss": 0.8245, + "step": 26256 + }, + { + "epoch": 0.6742061333192109, + "grad_norm": 0.9609375, + "learning_rate": 0.00013314390805436652, + "loss": 0.7326, + "step": 26257 + }, + { + "epoch": 0.6742318105151327, + "grad_norm": 0.8203125, + "learning_rate": 0.0001331396961514143, + "loss": 0.9882, + "step": 26258 + }, + { + "epoch": 0.6742574877110545, + "grad_norm": 0.76171875, + "learning_rate": 0.0001331354841824176, + "loss": 0.9018, + "step": 26259 + }, + { + "epoch": 0.6742831649069764, + "grad_norm": 0.77734375, + "learning_rate": 0.00013313127214738493, + "loss": 0.8371, + "step": 26260 + }, + { + "epoch": 0.6743088421028981, + "grad_norm": 0.8125, + "learning_rate": 0.0001331270600463246, + "loss": 0.8273, + "step": 26261 + }, + { + "epoch": 0.6743345192988199, + "grad_norm": 0.78125, + "learning_rate": 0.00013312284787924506, + "loss": 0.8745, + "step": 26262 + }, + { + "epoch": 0.6743601964947418, + "grad_norm": 0.734375, + "learning_rate": 0.00013311863564615466, + "loss": 0.7393, + "step": 26263 + }, + { + "epoch": 0.6743858736906636, + "grad_norm": 0.79296875, + "learning_rate": 0.00013311442334706182, + "loss": 0.8561, + "step": 26264 + }, + { + "epoch": 0.6744115508865854, + "grad_norm": 0.74609375, + "learning_rate": 0.0001331102109819749, + "loss": 0.7767, + "step": 26265 + }, + { + "epoch": 0.6744372280825073, + "grad_norm": 0.734375, + "learning_rate": 0.00013310599855090235, + "loss": 0.7769, + "step": 26266 + }, + { + "epoch": 0.6744629052784291, + "grad_norm": 0.8671875, + "learning_rate": 0.00013310178605385249, + "loss": 0.8452, + "step": 26267 + }, + { + "epoch": 0.6744885824743508, + "grad_norm": 0.8046875, + "learning_rate": 0.00013309757349083378, + "loss": 0.745, + "step": 26268 + }, + { + "epoch": 0.6745142596702727, + "grad_norm": 0.7109375, + "learning_rate": 0.00013309336086185465, + "loss": 0.8636, + "step": 26269 + }, + { + "epoch": 0.6745399368661945, + "grad_norm": 0.83203125, + "learning_rate": 0.00013308914816692336, + "loss": 0.8853, + "step": 26270 + }, + { + "epoch": 0.6745656140621163, + "grad_norm": 0.8125, + "learning_rate": 0.00013308493540604844, + "loss": 0.8558, + "step": 26271 + }, + { + "epoch": 0.6745912912580382, + "grad_norm": 0.72265625, + "learning_rate": 0.00013308072257923822, + "loss": 0.8637, + "step": 26272 + }, + { + "epoch": 0.67461696845396, + "grad_norm": 0.84765625, + "learning_rate": 0.0001330765096865011, + "loss": 0.7888, + "step": 26273 + }, + { + "epoch": 0.6746426456498819, + "grad_norm": 0.8046875, + "learning_rate": 0.0001330722967278455, + "loss": 0.8689, + "step": 26274 + }, + { + "epoch": 0.6746683228458036, + "grad_norm": 0.6953125, + "learning_rate": 0.0001330680837032798, + "loss": 0.842, + "step": 26275 + }, + { + "epoch": 0.6746940000417254, + "grad_norm": 0.85546875, + "learning_rate": 0.0001330638706128124, + "loss": 0.8305, + "step": 26276 + }, + { + "epoch": 0.6747196772376473, + "grad_norm": 0.94140625, + "learning_rate": 0.00013305965745645165, + "loss": 0.9713, + "step": 26277 + }, + { + "epoch": 0.6747453544335691, + "grad_norm": 0.78515625, + "learning_rate": 0.000133055444234206, + "loss": 0.8242, + "step": 26278 + }, + { + "epoch": 0.6747710316294909, + "grad_norm": 0.77734375, + "learning_rate": 0.00013305123094608387, + "loss": 0.8117, + "step": 26279 + }, + { + "epoch": 0.6747967088254128, + "grad_norm": 0.83203125, + "learning_rate": 0.00013304701759209362, + "loss": 0.8256, + "step": 26280 + }, + { + "epoch": 0.6748223860213345, + "grad_norm": 0.80859375, + "learning_rate": 0.00013304280417224363, + "loss": 0.8423, + "step": 26281 + }, + { + "epoch": 0.6748480632172563, + "grad_norm": 0.859375, + "learning_rate": 0.00013303859068654233, + "loss": 0.8456, + "step": 26282 + }, + { + "epoch": 0.6748737404131782, + "grad_norm": 0.71484375, + "learning_rate": 0.00013303437713499806, + "loss": 0.8412, + "step": 26283 + }, + { + "epoch": 0.6748994176091, + "grad_norm": 0.8046875, + "learning_rate": 0.0001330301635176193, + "loss": 0.7872, + "step": 26284 + }, + { + "epoch": 0.6749250948050218, + "grad_norm": 0.9453125, + "learning_rate": 0.0001330259498344144, + "loss": 0.9143, + "step": 26285 + }, + { + "epoch": 0.6749507720009437, + "grad_norm": 0.84375, + "learning_rate": 0.00013302173608539176, + "loss": 0.9663, + "step": 26286 + }, + { + "epoch": 0.6749764491968655, + "grad_norm": 0.79296875, + "learning_rate": 0.00013301752227055978, + "loss": 0.8508, + "step": 26287 + }, + { + "epoch": 0.6750021263927872, + "grad_norm": 0.8125, + "learning_rate": 0.0001330133083899269, + "loss": 0.8787, + "step": 26288 + }, + { + "epoch": 0.6750278035887091, + "grad_norm": 0.75, + "learning_rate": 0.00013300909444350145, + "loss": 0.826, + "step": 26289 + }, + { + "epoch": 0.6750534807846309, + "grad_norm": 0.76953125, + "learning_rate": 0.00013300488043129188, + "loss": 0.6995, + "step": 26290 + }, + { + "epoch": 0.6750791579805527, + "grad_norm": 0.7265625, + "learning_rate": 0.0001330006663533065, + "loss": 0.6995, + "step": 26291 + }, + { + "epoch": 0.6751048351764746, + "grad_norm": 0.99609375, + "learning_rate": 0.00013299645220955382, + "loss": 0.8698, + "step": 26292 + }, + { + "epoch": 0.6751305123723964, + "grad_norm": 0.80078125, + "learning_rate": 0.0001329922380000422, + "loss": 0.797, + "step": 26293 + }, + { + "epoch": 0.6751561895683182, + "grad_norm": 0.72265625, + "learning_rate": 0.00013298802372478, + "loss": 0.8622, + "step": 26294 + }, + { + "epoch": 0.67518186676424, + "grad_norm": 0.75, + "learning_rate": 0.00013298380938377567, + "loss": 0.8679, + "step": 26295 + }, + { + "epoch": 0.6752075439601618, + "grad_norm": 0.80859375, + "learning_rate": 0.00013297959497703756, + "loss": 0.9301, + "step": 26296 + }, + { + "epoch": 0.6752332211560836, + "grad_norm": 0.765625, + "learning_rate": 0.00013297538050457413, + "loss": 0.7999, + "step": 26297 + }, + { + "epoch": 0.6752588983520055, + "grad_norm": 0.734375, + "learning_rate": 0.00013297116596639374, + "loss": 0.7503, + "step": 26298 + }, + { + "epoch": 0.6752845755479273, + "grad_norm": 0.7265625, + "learning_rate": 0.00013296695136250474, + "loss": 0.7202, + "step": 26299 + }, + { + "epoch": 0.6753102527438491, + "grad_norm": 0.703125, + "learning_rate": 0.00013296273669291566, + "loss": 0.7772, + "step": 26300 + }, + { + "epoch": 0.6753359299397709, + "grad_norm": 0.75390625, + "learning_rate": 0.0001329585219576348, + "loss": 0.8496, + "step": 26301 + }, + { + "epoch": 0.6753616071356927, + "grad_norm": 1.2109375, + "learning_rate": 0.00013295430715667056, + "loss": 0.8493, + "step": 26302 + }, + { + "epoch": 0.6753872843316145, + "grad_norm": 0.8984375, + "learning_rate": 0.0001329500922900314, + "loss": 0.8821, + "step": 26303 + }, + { + "epoch": 0.6754129615275364, + "grad_norm": 0.86328125, + "learning_rate": 0.00013294587735772562, + "loss": 0.9405, + "step": 26304 + }, + { + "epoch": 0.6754386387234582, + "grad_norm": 0.7421875, + "learning_rate": 0.00013294166235976174, + "loss": 0.7986, + "step": 26305 + }, + { + "epoch": 0.67546431591938, + "grad_norm": 0.8046875, + "learning_rate": 0.0001329374472961481, + "loss": 0.9417, + "step": 26306 + }, + { + "epoch": 0.6754899931153019, + "grad_norm": 0.65625, + "learning_rate": 0.00013293323216689306, + "loss": 0.7499, + "step": 26307 + }, + { + "epoch": 0.6755156703112236, + "grad_norm": 0.83984375, + "learning_rate": 0.00013292901697200513, + "loss": 0.8893, + "step": 26308 + }, + { + "epoch": 0.6755413475071455, + "grad_norm": 0.70703125, + "learning_rate": 0.0001329248017114926, + "loss": 0.6777, + "step": 26309 + }, + { + "epoch": 0.6755670247030673, + "grad_norm": 0.82421875, + "learning_rate": 0.0001329205863853639, + "loss": 0.8337, + "step": 26310 + }, + { + "epoch": 0.6755927018989891, + "grad_norm": 0.80859375, + "learning_rate": 0.00013291637099362746, + "loss": 0.8527, + "step": 26311 + }, + { + "epoch": 0.675618379094911, + "grad_norm": 0.78125, + "learning_rate": 0.0001329121555362917, + "loss": 0.9142, + "step": 26312 + }, + { + "epoch": 0.6756440562908328, + "grad_norm": 0.74609375, + "learning_rate": 0.00013290794001336492, + "loss": 0.8851, + "step": 26313 + }, + { + "epoch": 0.6756697334867546, + "grad_norm": 0.81640625, + "learning_rate": 0.00013290372442485563, + "loss": 0.8, + "step": 26314 + }, + { + "epoch": 0.6756954106826764, + "grad_norm": 0.80859375, + "learning_rate": 0.0001328995087707722, + "loss": 0.9511, + "step": 26315 + }, + { + "epoch": 0.6757210878785982, + "grad_norm": 0.76171875, + "learning_rate": 0.000132895293051123, + "loss": 0.7039, + "step": 26316 + }, + { + "epoch": 0.67574676507452, + "grad_norm": 0.8671875, + "learning_rate": 0.00013289107726591648, + "loss": 0.9427, + "step": 26317 + }, + { + "epoch": 0.6757724422704419, + "grad_norm": 0.78125, + "learning_rate": 0.00013288686141516096, + "loss": 0.8268, + "step": 26318 + }, + { + "epoch": 0.6757981194663637, + "grad_norm": 0.796875, + "learning_rate": 0.00013288264549886494, + "loss": 1.0028, + "step": 26319 + }, + { + "epoch": 0.6758237966622855, + "grad_norm": 0.7890625, + "learning_rate": 0.00013287842951703678, + "loss": 0.8569, + "step": 26320 + }, + { + "epoch": 0.6758494738582073, + "grad_norm": 0.82421875, + "learning_rate": 0.00013287421346968486, + "loss": 0.8812, + "step": 26321 + }, + { + "epoch": 0.6758751510541291, + "grad_norm": 0.76953125, + "learning_rate": 0.0001328699973568176, + "loss": 0.7661, + "step": 26322 + }, + { + "epoch": 0.6759008282500509, + "grad_norm": 0.73046875, + "learning_rate": 0.0001328657811784434, + "loss": 0.7708, + "step": 26323 + }, + { + "epoch": 0.6759265054459728, + "grad_norm": 0.86328125, + "learning_rate": 0.00013286156493457068, + "loss": 0.8384, + "step": 26324 + }, + { + "epoch": 0.6759521826418946, + "grad_norm": 0.81640625, + "learning_rate": 0.00013285734862520784, + "loss": 0.8229, + "step": 26325 + }, + { + "epoch": 0.6759778598378164, + "grad_norm": 0.8671875, + "learning_rate": 0.00013285313225036325, + "loss": 0.8987, + "step": 26326 + }, + { + "epoch": 0.6760035370337383, + "grad_norm": 0.7734375, + "learning_rate": 0.00013284891581004534, + "loss": 0.8226, + "step": 26327 + }, + { + "epoch": 0.67602921422966, + "grad_norm": 0.77734375, + "learning_rate": 0.00013284469930426252, + "loss": 0.8646, + "step": 26328 + }, + { + "epoch": 0.6760548914255818, + "grad_norm": 0.76953125, + "learning_rate": 0.0001328404827330232, + "loss": 0.8904, + "step": 26329 + }, + { + "epoch": 0.6760805686215037, + "grad_norm": 0.8515625, + "learning_rate": 0.00013283626609633572, + "loss": 0.8195, + "step": 26330 + }, + { + "epoch": 0.6761062458174255, + "grad_norm": 0.7578125, + "learning_rate": 0.00013283204939420852, + "loss": 0.922, + "step": 26331 + }, + { + "epoch": 0.6761319230133473, + "grad_norm": 0.8515625, + "learning_rate": 0.00013282783262665008, + "loss": 0.8737, + "step": 26332 + }, + { + "epoch": 0.6761576002092692, + "grad_norm": 1.0078125, + "learning_rate": 0.00013282361579366867, + "loss": 1.0637, + "step": 26333 + }, + { + "epoch": 0.676183277405191, + "grad_norm": 0.7734375, + "learning_rate": 0.00013281939889527277, + "loss": 0.8686, + "step": 26334 + }, + { + "epoch": 0.6762089546011127, + "grad_norm": 0.8671875, + "learning_rate": 0.0001328151819314708, + "loss": 0.8041, + "step": 26335 + }, + { + "epoch": 0.6762346317970346, + "grad_norm": 0.8203125, + "learning_rate": 0.00013281096490227112, + "loss": 0.9102, + "step": 26336 + }, + { + "epoch": 0.6762603089929564, + "grad_norm": 0.8203125, + "learning_rate": 0.00013280674780768214, + "loss": 0.846, + "step": 26337 + }, + { + "epoch": 0.6762859861888783, + "grad_norm": 1.6875, + "learning_rate": 0.0001328025306477123, + "loss": 0.7601, + "step": 26338 + }, + { + "epoch": 0.6763116633848001, + "grad_norm": 0.7890625, + "learning_rate": 0.00013279831342236995, + "loss": 0.7935, + "step": 26339 + }, + { + "epoch": 0.6763373405807219, + "grad_norm": 0.734375, + "learning_rate": 0.00013279409613166355, + "loss": 0.9336, + "step": 26340 + }, + { + "epoch": 0.6763630177766436, + "grad_norm": 0.78515625, + "learning_rate": 0.00013278987877560146, + "loss": 0.8131, + "step": 26341 + }, + { + "epoch": 0.6763886949725655, + "grad_norm": 0.79296875, + "learning_rate": 0.00013278566135419212, + "loss": 0.856, + "step": 26342 + }, + { + "epoch": 0.6764143721684873, + "grad_norm": 0.796875, + "learning_rate": 0.00013278144386744391, + "loss": 0.7829, + "step": 26343 + }, + { + "epoch": 0.6764400493644092, + "grad_norm": 0.73046875, + "learning_rate": 0.00013277722631536523, + "loss": 0.9097, + "step": 26344 + }, + { + "epoch": 0.676465726560331, + "grad_norm": 1.0859375, + "learning_rate": 0.0001327730086979645, + "loss": 0.861, + "step": 26345 + }, + { + "epoch": 0.6764914037562528, + "grad_norm": 0.75, + "learning_rate": 0.00013276879101525017, + "loss": 0.8151, + "step": 26346 + }, + { + "epoch": 0.6765170809521747, + "grad_norm": 0.75390625, + "learning_rate": 0.00013276457326723054, + "loss": 0.8984, + "step": 26347 + }, + { + "epoch": 0.6765427581480964, + "grad_norm": 0.73828125, + "learning_rate": 0.00013276035545391413, + "loss": 0.866, + "step": 26348 + }, + { + "epoch": 0.6765684353440182, + "grad_norm": 0.76953125, + "learning_rate": 0.00013275613757530926, + "loss": 0.9318, + "step": 26349 + }, + { + "epoch": 0.6765941125399401, + "grad_norm": 0.796875, + "learning_rate": 0.00013275191963142436, + "loss": 0.7921, + "step": 26350 + }, + { + "epoch": 0.6766197897358619, + "grad_norm": 0.7890625, + "learning_rate": 0.00013274770162226786, + "loss": 0.7839, + "step": 26351 + }, + { + "epoch": 0.6766454669317837, + "grad_norm": 0.7890625, + "learning_rate": 0.00013274348354784814, + "loss": 0.9844, + "step": 26352 + }, + { + "epoch": 0.6766711441277056, + "grad_norm": 0.81640625, + "learning_rate": 0.00013273926540817362, + "loss": 0.7455, + "step": 26353 + }, + { + "epoch": 0.6766968213236274, + "grad_norm": 0.765625, + "learning_rate": 0.00013273504720325268, + "loss": 0.8465, + "step": 26354 + }, + { + "epoch": 0.6767224985195491, + "grad_norm": 0.76953125, + "learning_rate": 0.00013273082893309375, + "loss": 0.7937, + "step": 26355 + }, + { + "epoch": 0.676748175715471, + "grad_norm": 0.7421875, + "learning_rate": 0.0001327266105977053, + "loss": 0.7638, + "step": 26356 + }, + { + "epoch": 0.6767738529113928, + "grad_norm": 0.76171875, + "learning_rate": 0.00013272239219709562, + "loss": 0.8708, + "step": 26357 + }, + { + "epoch": 0.6767995301073146, + "grad_norm": 0.78125, + "learning_rate": 0.00013271817373127314, + "loss": 0.8147, + "step": 26358 + }, + { + "epoch": 0.6768252073032365, + "grad_norm": 0.80078125, + "learning_rate": 0.0001327139552002463, + "loss": 0.8061, + "step": 26359 + }, + { + "epoch": 0.6768508844991583, + "grad_norm": 0.75, + "learning_rate": 0.00013270973660402355, + "loss": 0.7523, + "step": 26360 + }, + { + "epoch": 0.67687656169508, + "grad_norm": 0.875, + "learning_rate": 0.00013270551794261322, + "loss": 0.9512, + "step": 26361 + }, + { + "epoch": 0.6769022388910019, + "grad_norm": 0.80859375, + "learning_rate": 0.00013270129921602377, + "loss": 0.8222, + "step": 26362 + }, + { + "epoch": 0.6769279160869237, + "grad_norm": 0.90234375, + "learning_rate": 0.00013269708042426356, + "loss": 0.9443, + "step": 26363 + }, + { + "epoch": 0.6769535932828455, + "grad_norm": 0.765625, + "learning_rate": 0.00013269286156734103, + "loss": 0.8212, + "step": 26364 + }, + { + "epoch": 0.6769792704787674, + "grad_norm": 0.8203125, + "learning_rate": 0.00013268864264526456, + "loss": 0.9028, + "step": 26365 + }, + { + "epoch": 0.6770049476746892, + "grad_norm": 0.78515625, + "learning_rate": 0.00013268442365804259, + "loss": 0.8296, + "step": 26366 + }, + { + "epoch": 0.677030624870611, + "grad_norm": 0.75390625, + "learning_rate": 0.00013268020460568356, + "loss": 0.8024, + "step": 26367 + }, + { + "epoch": 0.6770563020665328, + "grad_norm": 0.80859375, + "learning_rate": 0.00013267598548819578, + "loss": 0.8062, + "step": 26368 + }, + { + "epoch": 0.6770819792624546, + "grad_norm": 0.7734375, + "learning_rate": 0.00013267176630558773, + "loss": 0.9007, + "step": 26369 + }, + { + "epoch": 0.6771076564583765, + "grad_norm": 0.984375, + "learning_rate": 0.0001326675470578678, + "loss": 0.806, + "step": 26370 + }, + { + "epoch": 0.6771333336542983, + "grad_norm": 0.74609375, + "learning_rate": 0.0001326633277450444, + "loss": 0.8198, + "step": 26371 + }, + { + "epoch": 0.6771590108502201, + "grad_norm": 0.73046875, + "learning_rate": 0.00013265910836712593, + "loss": 0.7871, + "step": 26372 + }, + { + "epoch": 0.677184688046142, + "grad_norm": 0.76953125, + "learning_rate": 0.00013265488892412082, + "loss": 0.8797, + "step": 26373 + }, + { + "epoch": 0.6772103652420638, + "grad_norm": 0.76171875, + "learning_rate": 0.00013265066941603747, + "loss": 0.9356, + "step": 26374 + }, + { + "epoch": 0.6772360424379855, + "grad_norm": 0.78125, + "learning_rate": 0.0001326464498428843, + "loss": 0.8842, + "step": 26375 + }, + { + "epoch": 0.6772617196339074, + "grad_norm": 0.921875, + "learning_rate": 0.00013264223020466965, + "loss": 0.6711, + "step": 26376 + }, + { + "epoch": 0.6772873968298292, + "grad_norm": 0.7578125, + "learning_rate": 0.00013263801050140202, + "loss": 0.7952, + "step": 26377 + }, + { + "epoch": 0.677313074025751, + "grad_norm": 0.7265625, + "learning_rate": 0.00013263379073308977, + "loss": 0.7431, + "step": 26378 + }, + { + "epoch": 0.6773387512216729, + "grad_norm": 0.79296875, + "learning_rate": 0.00013262957089974132, + "loss": 0.8564, + "step": 26379 + }, + { + "epoch": 0.6773644284175947, + "grad_norm": 0.80078125, + "learning_rate": 0.0001326253510013651, + "loss": 0.8393, + "step": 26380 + }, + { + "epoch": 0.6773901056135164, + "grad_norm": 0.7421875, + "learning_rate": 0.00013262113103796951, + "loss": 0.91, + "step": 26381 + }, + { + "epoch": 0.6774157828094383, + "grad_norm": 0.83984375, + "learning_rate": 0.00013261691100956293, + "loss": 0.8512, + "step": 26382 + }, + { + "epoch": 0.6774414600053601, + "grad_norm": 0.7890625, + "learning_rate": 0.0001326126909161538, + "loss": 1.0009, + "step": 26383 + }, + { + "epoch": 0.6774671372012819, + "grad_norm": 0.80859375, + "learning_rate": 0.00013260847075775053, + "loss": 0.7716, + "step": 26384 + }, + { + "epoch": 0.6774928143972038, + "grad_norm": 0.82421875, + "learning_rate": 0.00013260425053436152, + "loss": 0.8009, + "step": 26385 + }, + { + "epoch": 0.6775184915931256, + "grad_norm": 0.79296875, + "learning_rate": 0.00013260003024599519, + "loss": 0.8583, + "step": 26386 + }, + { + "epoch": 0.6775441687890474, + "grad_norm": 0.8125, + "learning_rate": 0.00013259580989265994, + "loss": 0.8845, + "step": 26387 + }, + { + "epoch": 0.6775698459849692, + "grad_norm": 0.75, + "learning_rate": 0.0001325915894743642, + "loss": 0.8026, + "step": 26388 + }, + { + "epoch": 0.677595523180891, + "grad_norm": 0.74609375, + "learning_rate": 0.00013258736899111634, + "loss": 0.7554, + "step": 26389 + }, + { + "epoch": 0.6776212003768128, + "grad_norm": 0.73046875, + "learning_rate": 0.00013258314844292478, + "loss": 0.7782, + "step": 26390 + }, + { + "epoch": 0.6776468775727347, + "grad_norm": 0.7734375, + "learning_rate": 0.00013257892782979797, + "loss": 0.8095, + "step": 26391 + }, + { + "epoch": 0.6776725547686565, + "grad_norm": 0.8828125, + "learning_rate": 0.0001325747071517443, + "loss": 0.9718, + "step": 26392 + }, + { + "epoch": 0.6776982319645783, + "grad_norm": 0.7578125, + "learning_rate": 0.0001325704864087722, + "loss": 0.8521, + "step": 26393 + }, + { + "epoch": 0.6777239091605002, + "grad_norm": 0.828125, + "learning_rate": 0.00013256626560089005, + "loss": 0.9129, + "step": 26394 + }, + { + "epoch": 0.6777495863564219, + "grad_norm": 0.765625, + "learning_rate": 0.00013256204472810628, + "loss": 0.7453, + "step": 26395 + }, + { + "epoch": 0.6777752635523437, + "grad_norm": 0.83203125, + "learning_rate": 0.00013255782379042926, + "loss": 0.8577, + "step": 26396 + }, + { + "epoch": 0.6778009407482656, + "grad_norm": 0.75, + "learning_rate": 0.0001325536027878675, + "loss": 0.8216, + "step": 26397 + }, + { + "epoch": 0.6778266179441874, + "grad_norm": 2.3125, + "learning_rate": 0.00013254938172042928, + "loss": 0.7033, + "step": 26398 + }, + { + "epoch": 0.6778522951401093, + "grad_norm": 0.7265625, + "learning_rate": 0.00013254516058812312, + "loss": 0.8142, + "step": 26399 + }, + { + "epoch": 0.6778779723360311, + "grad_norm": 0.80078125, + "learning_rate": 0.00013254093939095738, + "loss": 0.8953, + "step": 26400 + }, + { + "epoch": 0.6779036495319528, + "grad_norm": 0.74609375, + "learning_rate": 0.00013253671812894053, + "loss": 0.8454, + "step": 26401 + }, + { + "epoch": 0.6779293267278746, + "grad_norm": 0.7734375, + "learning_rate": 0.0001325324968020809, + "loss": 0.8391, + "step": 26402 + }, + { + "epoch": 0.6779550039237965, + "grad_norm": 0.76171875, + "learning_rate": 0.00013252827541038694, + "loss": 0.8294, + "step": 26403 + }, + { + "epoch": 0.6779806811197183, + "grad_norm": 0.7578125, + "learning_rate": 0.00013252405395386708, + "loss": 0.7372, + "step": 26404 + }, + { + "epoch": 0.6780063583156402, + "grad_norm": 0.671875, + "learning_rate": 0.00013251983243252972, + "loss": 0.7605, + "step": 26405 + }, + { + "epoch": 0.678032035511562, + "grad_norm": 0.74609375, + "learning_rate": 0.00013251561084638326, + "loss": 0.7064, + "step": 26406 + }, + { + "epoch": 0.6780577127074838, + "grad_norm": 0.69921875, + "learning_rate": 0.00013251138919543617, + "loss": 0.7891, + "step": 26407 + }, + { + "epoch": 0.6780833899034056, + "grad_norm": 0.79296875, + "learning_rate": 0.00013250716747969675, + "loss": 0.9035, + "step": 26408 + }, + { + "epoch": 0.6781090670993274, + "grad_norm": 0.85546875, + "learning_rate": 0.0001325029456991735, + "loss": 0.8373, + "step": 26409 + }, + { + "epoch": 0.6781347442952492, + "grad_norm": 0.83203125, + "learning_rate": 0.00013249872385387486, + "loss": 0.8872, + "step": 26410 + }, + { + "epoch": 0.6781604214911711, + "grad_norm": 0.73046875, + "learning_rate": 0.00013249450194380912, + "loss": 0.8434, + "step": 26411 + }, + { + "epoch": 0.6781860986870929, + "grad_norm": 1.0625, + "learning_rate": 0.00013249027996898484, + "loss": 0.7786, + "step": 26412 + }, + { + "epoch": 0.6782117758830147, + "grad_norm": 0.80078125, + "learning_rate": 0.00013248605792941037, + "loss": 0.8559, + "step": 26413 + }, + { + "epoch": 0.6782374530789366, + "grad_norm": 0.80859375, + "learning_rate": 0.00013248183582509408, + "loss": 0.8142, + "step": 26414 + }, + { + "epoch": 0.6782631302748583, + "grad_norm": 0.7421875, + "learning_rate": 0.00013247761365604445, + "loss": 0.8209, + "step": 26415 + }, + { + "epoch": 0.6782888074707801, + "grad_norm": 0.8203125, + "learning_rate": 0.00013247339142226984, + "loss": 0.8859, + "step": 26416 + }, + { + "epoch": 0.678314484666702, + "grad_norm": 0.8984375, + "learning_rate": 0.00013246916912377872, + "loss": 0.8432, + "step": 26417 + }, + { + "epoch": 0.6783401618626238, + "grad_norm": 0.79296875, + "learning_rate": 0.0001324649467605795, + "loss": 0.803, + "step": 26418 + }, + { + "epoch": 0.6783658390585456, + "grad_norm": 0.72265625, + "learning_rate": 0.00013246072433268052, + "loss": 0.8241, + "step": 26419 + }, + { + "epoch": 0.6783915162544675, + "grad_norm": 2.71875, + "learning_rate": 0.0001324565018400903, + "loss": 0.768, + "step": 26420 + }, + { + "epoch": 0.6784171934503892, + "grad_norm": 0.73046875, + "learning_rate": 0.00013245227928281717, + "loss": 0.827, + "step": 26421 + }, + { + "epoch": 0.678442870646311, + "grad_norm": 0.765625, + "learning_rate": 0.0001324480566608696, + "loss": 0.7488, + "step": 26422 + }, + { + "epoch": 0.6784685478422329, + "grad_norm": 0.81640625, + "learning_rate": 0.00013244383397425597, + "loss": 0.8931, + "step": 26423 + }, + { + "epoch": 0.6784942250381547, + "grad_norm": 0.73828125, + "learning_rate": 0.0001324396112229847, + "loss": 0.7704, + "step": 26424 + }, + { + "epoch": 0.6785199022340765, + "grad_norm": 0.7890625, + "learning_rate": 0.00013243538840706422, + "loss": 0.8609, + "step": 26425 + }, + { + "epoch": 0.6785455794299984, + "grad_norm": 0.69921875, + "learning_rate": 0.00013243116552650295, + "loss": 0.8505, + "step": 26426 + }, + { + "epoch": 0.6785712566259202, + "grad_norm": 0.78515625, + "learning_rate": 0.0001324269425813093, + "loss": 0.9167, + "step": 26427 + }, + { + "epoch": 0.6785969338218419, + "grad_norm": 0.796875, + "learning_rate": 0.00013242271957149168, + "loss": 0.7041, + "step": 26428 + }, + { + "epoch": 0.6786226110177638, + "grad_norm": 0.78125, + "learning_rate": 0.0001324184964970585, + "loss": 0.8577, + "step": 26429 + }, + { + "epoch": 0.6786482882136856, + "grad_norm": 0.9453125, + "learning_rate": 0.00013241427335801818, + "loss": 0.8841, + "step": 26430 + }, + { + "epoch": 0.6786739654096074, + "grad_norm": 0.73046875, + "learning_rate": 0.00013241005015437917, + "loss": 0.7001, + "step": 26431 + }, + { + "epoch": 0.6786996426055293, + "grad_norm": 0.7734375, + "learning_rate": 0.00013240582688614982, + "loss": 0.8988, + "step": 26432 + }, + { + "epoch": 0.6787253198014511, + "grad_norm": 0.80859375, + "learning_rate": 0.00013240160355333864, + "loss": 0.8752, + "step": 26433 + }, + { + "epoch": 0.678750996997373, + "grad_norm": 0.7578125, + "learning_rate": 0.00013239738015595394, + "loss": 0.9008, + "step": 26434 + }, + { + "epoch": 0.6787766741932947, + "grad_norm": 0.7265625, + "learning_rate": 0.0001323931566940042, + "loss": 0.7536, + "step": 26435 + }, + { + "epoch": 0.6788023513892165, + "grad_norm": 0.796875, + "learning_rate": 0.00013238893316749783, + "loss": 0.9062, + "step": 26436 + }, + { + "epoch": 0.6788280285851384, + "grad_norm": 0.78125, + "learning_rate": 0.00013238470957644324, + "loss": 0.8462, + "step": 26437 + }, + { + "epoch": 0.6788537057810602, + "grad_norm": 0.79296875, + "learning_rate": 0.00013238048592084884, + "loss": 0.9023, + "step": 26438 + }, + { + "epoch": 0.678879382976982, + "grad_norm": 0.8203125, + "learning_rate": 0.00013237626220072307, + "loss": 0.6853, + "step": 26439 + }, + { + "epoch": 0.6789050601729039, + "grad_norm": 0.83984375, + "learning_rate": 0.00013237203841607432, + "loss": 0.8985, + "step": 26440 + }, + { + "epoch": 0.6789307373688256, + "grad_norm": 0.828125, + "learning_rate": 0.00013236781456691106, + "loss": 0.8881, + "step": 26441 + }, + { + "epoch": 0.6789564145647474, + "grad_norm": 0.7265625, + "learning_rate": 0.00013236359065324163, + "loss": 0.8059, + "step": 26442 + }, + { + "epoch": 0.6789820917606693, + "grad_norm": 0.8046875, + "learning_rate": 0.0001323593666750745, + "loss": 0.8224, + "step": 26443 + }, + { + "epoch": 0.6790077689565911, + "grad_norm": 0.9296875, + "learning_rate": 0.00013235514263241807, + "loss": 1.032, + "step": 26444 + }, + { + "epoch": 0.6790334461525129, + "grad_norm": 0.828125, + "learning_rate": 0.00013235091852528076, + "loss": 0.8024, + "step": 26445 + }, + { + "epoch": 0.6790591233484348, + "grad_norm": 0.76171875, + "learning_rate": 0.000132346694353671, + "loss": 0.9076, + "step": 26446 + }, + { + "epoch": 0.6790848005443566, + "grad_norm": 0.80859375, + "learning_rate": 0.00013234247011759722, + "loss": 0.9643, + "step": 26447 + }, + { + "epoch": 0.6791104777402783, + "grad_norm": 0.80078125, + "learning_rate": 0.00013233824581706778, + "loss": 0.8361, + "step": 26448 + }, + { + "epoch": 0.6791361549362002, + "grad_norm": 0.73828125, + "learning_rate": 0.00013233402145209114, + "loss": 0.8154, + "step": 26449 + }, + { + "epoch": 0.679161832132122, + "grad_norm": 0.74609375, + "learning_rate": 0.00013232979702267574, + "loss": 0.7559, + "step": 26450 + }, + { + "epoch": 0.6791875093280438, + "grad_norm": 0.78515625, + "learning_rate": 0.00013232557252882993, + "loss": 0.928, + "step": 26451 + }, + { + "epoch": 0.6792131865239657, + "grad_norm": 0.72265625, + "learning_rate": 0.00013232134797056225, + "loss": 0.8635, + "step": 26452 + }, + { + "epoch": 0.6792388637198875, + "grad_norm": 0.8125, + "learning_rate": 0.00013231712334788097, + "loss": 0.8475, + "step": 26453 + }, + { + "epoch": 0.6792645409158092, + "grad_norm": 0.78125, + "learning_rate": 0.00013231289866079464, + "loss": 0.9353, + "step": 26454 + }, + { + "epoch": 0.6792902181117311, + "grad_norm": 0.77734375, + "learning_rate": 0.00013230867390931158, + "loss": 0.9021, + "step": 26455 + }, + { + "epoch": 0.6793158953076529, + "grad_norm": 0.828125, + "learning_rate": 0.00013230444909344025, + "loss": 0.9222, + "step": 26456 + }, + { + "epoch": 0.6793415725035747, + "grad_norm": 0.76171875, + "learning_rate": 0.0001323002242131891, + "loss": 0.7592, + "step": 26457 + }, + { + "epoch": 0.6793672496994966, + "grad_norm": 0.875, + "learning_rate": 0.00013229599926856654, + "loss": 0.8372, + "step": 26458 + }, + { + "epoch": 0.6793929268954184, + "grad_norm": 0.75390625, + "learning_rate": 0.0001322917742595809, + "loss": 0.6468, + "step": 26459 + }, + { + "epoch": 0.6794186040913403, + "grad_norm": 0.8359375, + "learning_rate": 0.00013228754918624073, + "loss": 0.9766, + "step": 26460 + }, + { + "epoch": 0.679444281287262, + "grad_norm": 0.73828125, + "learning_rate": 0.00013228332404855437, + "loss": 0.7536, + "step": 26461 + }, + { + "epoch": 0.6794699584831838, + "grad_norm": 0.76953125, + "learning_rate": 0.00013227909884653026, + "loss": 0.769, + "step": 26462 + }, + { + "epoch": 0.6794956356791056, + "grad_norm": 0.9921875, + "learning_rate": 0.00013227487358017683, + "loss": 0.7995, + "step": 26463 + }, + { + "epoch": 0.6795213128750275, + "grad_norm": 0.8125, + "learning_rate": 0.00013227064824950245, + "loss": 0.9038, + "step": 26464 + }, + { + "epoch": 0.6795469900709493, + "grad_norm": 0.73046875, + "learning_rate": 0.00013226642285451564, + "loss": 0.8772, + "step": 26465 + }, + { + "epoch": 0.6795726672668712, + "grad_norm": 0.796875, + "learning_rate": 0.00013226219739522475, + "loss": 0.8252, + "step": 26466 + }, + { + "epoch": 0.679598344462793, + "grad_norm": 0.77734375, + "learning_rate": 0.00013225797187163822, + "loss": 0.7834, + "step": 26467 + }, + { + "epoch": 0.6796240216587147, + "grad_norm": 0.84375, + "learning_rate": 0.00013225374628376445, + "loss": 0.8435, + "step": 26468 + }, + { + "epoch": 0.6796496988546366, + "grad_norm": 0.8828125, + "learning_rate": 0.00013224952063161188, + "loss": 0.8045, + "step": 26469 + }, + { + "epoch": 0.6796753760505584, + "grad_norm": 0.78125, + "learning_rate": 0.00013224529491518891, + "loss": 0.8489, + "step": 26470 + }, + { + "epoch": 0.6797010532464802, + "grad_norm": 0.76171875, + "learning_rate": 0.000132241069134504, + "loss": 0.7246, + "step": 26471 + }, + { + "epoch": 0.6797267304424021, + "grad_norm": 0.7578125, + "learning_rate": 0.00013223684328956555, + "loss": 0.8116, + "step": 26472 + }, + { + "epoch": 0.6797524076383239, + "grad_norm": 0.84375, + "learning_rate": 0.000132232617380382, + "loss": 0.931, + "step": 26473 + }, + { + "epoch": 0.6797780848342456, + "grad_norm": 0.88671875, + "learning_rate": 0.00013222839140696174, + "loss": 0.8288, + "step": 26474 + }, + { + "epoch": 0.6798037620301675, + "grad_norm": 0.875, + "learning_rate": 0.0001322241653693132, + "loss": 0.9639, + "step": 26475 + }, + { + "epoch": 0.6798294392260893, + "grad_norm": 0.84375, + "learning_rate": 0.00013221993926744482, + "loss": 0.8692, + "step": 26476 + }, + { + "epoch": 0.6798551164220111, + "grad_norm": 0.7890625, + "learning_rate": 0.000132215713101365, + "loss": 0.9845, + "step": 26477 + }, + { + "epoch": 0.679880793617933, + "grad_norm": 0.73046875, + "learning_rate": 0.00013221148687108219, + "loss": 0.8727, + "step": 26478 + }, + { + "epoch": 0.6799064708138548, + "grad_norm": 0.79296875, + "learning_rate": 0.0001322072605766048, + "loss": 0.9201, + "step": 26479 + }, + { + "epoch": 0.6799321480097766, + "grad_norm": 0.78515625, + "learning_rate": 0.00013220303421794123, + "loss": 0.9149, + "step": 26480 + }, + { + "epoch": 0.6799578252056984, + "grad_norm": 0.80859375, + "learning_rate": 0.00013219880779509992, + "loss": 0.7766, + "step": 26481 + }, + { + "epoch": 0.6799835024016202, + "grad_norm": 0.80078125, + "learning_rate": 0.0001321945813080893, + "loss": 0.9169, + "step": 26482 + }, + { + "epoch": 0.680009179597542, + "grad_norm": 0.6875, + "learning_rate": 0.0001321903547569178, + "loss": 0.672, + "step": 26483 + }, + { + "epoch": 0.6800348567934639, + "grad_norm": 0.8125, + "learning_rate": 0.0001321861281415938, + "loss": 0.8868, + "step": 26484 + }, + { + "epoch": 0.6800605339893857, + "grad_norm": 0.79296875, + "learning_rate": 0.0001321819014621258, + "loss": 0.8782, + "step": 26485 + }, + { + "epoch": 0.6800862111853075, + "grad_norm": 0.76171875, + "learning_rate": 0.00013217767471852216, + "loss": 0.7666, + "step": 26486 + }, + { + "epoch": 0.6801118883812294, + "grad_norm": 0.80078125, + "learning_rate": 0.00013217344791079133, + "loss": 0.8865, + "step": 26487 + }, + { + "epoch": 0.6801375655771511, + "grad_norm": 0.89453125, + "learning_rate": 0.00013216922103894167, + "loss": 0.957, + "step": 26488 + }, + { + "epoch": 0.6801632427730729, + "grad_norm": 0.75, + "learning_rate": 0.0001321649941029817, + "loss": 0.899, + "step": 26489 + }, + { + "epoch": 0.6801889199689948, + "grad_norm": 0.73046875, + "learning_rate": 0.0001321607671029198, + "loss": 0.7215, + "step": 26490 + }, + { + "epoch": 0.6802145971649166, + "grad_norm": 0.88671875, + "learning_rate": 0.00013215654003876436, + "loss": 0.7672, + "step": 26491 + }, + { + "epoch": 0.6802402743608384, + "grad_norm": 0.8046875, + "learning_rate": 0.00013215231291052393, + "loss": 0.8127, + "step": 26492 + }, + { + "epoch": 0.6802659515567603, + "grad_norm": 0.9140625, + "learning_rate": 0.00013214808571820677, + "loss": 1.1086, + "step": 26493 + }, + { + "epoch": 0.680291628752682, + "grad_norm": 0.76171875, + "learning_rate": 0.0001321438584618214, + "loss": 0.8897, + "step": 26494 + }, + { + "epoch": 0.6803173059486038, + "grad_norm": 0.76953125, + "learning_rate": 0.00013213963114137623, + "loss": 0.7848, + "step": 26495 + }, + { + "epoch": 0.6803429831445257, + "grad_norm": 0.7890625, + "learning_rate": 0.00013213540375687965, + "loss": 0.8391, + "step": 26496 + }, + { + "epoch": 0.6803686603404475, + "grad_norm": 0.71875, + "learning_rate": 0.00013213117630834012, + "loss": 0.7138, + "step": 26497 + }, + { + "epoch": 0.6803943375363694, + "grad_norm": 0.8203125, + "learning_rate": 0.0001321269487957661, + "loss": 0.8476, + "step": 26498 + }, + { + "epoch": 0.6804200147322912, + "grad_norm": 0.7421875, + "learning_rate": 0.00013212272121916594, + "loss": 0.8102, + "step": 26499 + }, + { + "epoch": 0.680445691928213, + "grad_norm": 0.76171875, + "learning_rate": 0.00013211849357854812, + "loss": 0.7082, + "step": 26500 + }, + { + "epoch": 0.6804713691241348, + "grad_norm": 0.7421875, + "learning_rate": 0.00013211426587392099, + "loss": 0.6846, + "step": 26501 + }, + { + "epoch": 0.6804970463200566, + "grad_norm": 0.765625, + "learning_rate": 0.00013211003810529307, + "loss": 0.8405, + "step": 26502 + }, + { + "epoch": 0.6805227235159784, + "grad_norm": 0.75390625, + "learning_rate": 0.00013210581027267275, + "loss": 0.7175, + "step": 26503 + }, + { + "epoch": 0.6805484007119003, + "grad_norm": 0.8046875, + "learning_rate": 0.00013210158237606843, + "loss": 0.945, + "step": 26504 + }, + { + "epoch": 0.6805740779078221, + "grad_norm": 0.77734375, + "learning_rate": 0.00013209735441548857, + "loss": 0.8477, + "step": 26505 + }, + { + "epoch": 0.6805997551037439, + "grad_norm": 0.80078125, + "learning_rate": 0.0001320931263909416, + "loss": 0.8236, + "step": 26506 + }, + { + "epoch": 0.6806254322996658, + "grad_norm": 0.7734375, + "learning_rate": 0.0001320888983024359, + "loss": 0.7678, + "step": 26507 + }, + { + "epoch": 0.6806511094955875, + "grad_norm": 0.828125, + "learning_rate": 0.00013208467014997992, + "loss": 0.9368, + "step": 26508 + }, + { + "epoch": 0.6806767866915093, + "grad_norm": 0.78125, + "learning_rate": 0.0001320804419335821, + "loss": 0.7824, + "step": 26509 + }, + { + "epoch": 0.6807024638874312, + "grad_norm": 0.8046875, + "learning_rate": 0.00013207621365325084, + "loss": 0.8165, + "step": 26510 + }, + { + "epoch": 0.680728141083353, + "grad_norm": 0.75390625, + "learning_rate": 0.00013207198530899462, + "loss": 0.7544, + "step": 26511 + }, + { + "epoch": 0.6807538182792748, + "grad_norm": 0.82421875, + "learning_rate": 0.00013206775690082184, + "loss": 0.7286, + "step": 26512 + }, + { + "epoch": 0.6807794954751967, + "grad_norm": 0.78515625, + "learning_rate": 0.00013206352842874087, + "loss": 0.8101, + "step": 26513 + }, + { + "epoch": 0.6808051726711184, + "grad_norm": 0.73828125, + "learning_rate": 0.0001320592998927602, + "loss": 0.7763, + "step": 26514 + }, + { + "epoch": 0.6808308498670402, + "grad_norm": 0.80078125, + "learning_rate": 0.00013205507129288824, + "loss": 0.7776, + "step": 26515 + }, + { + "epoch": 0.6808565270629621, + "grad_norm": 0.73046875, + "learning_rate": 0.00013205084262913342, + "loss": 0.7959, + "step": 26516 + }, + { + "epoch": 0.6808822042588839, + "grad_norm": 0.76953125, + "learning_rate": 0.00013204661390150415, + "loss": 0.8107, + "step": 26517 + }, + { + "epoch": 0.6809078814548057, + "grad_norm": 0.765625, + "learning_rate": 0.0001320423851100089, + "loss": 0.8549, + "step": 26518 + }, + { + "epoch": 0.6809335586507276, + "grad_norm": 0.7265625, + "learning_rate": 0.00013203815625465606, + "loss": 0.796, + "step": 26519 + }, + { + "epoch": 0.6809592358466494, + "grad_norm": 0.69140625, + "learning_rate": 0.00013203392733545407, + "loss": 0.6583, + "step": 26520 + }, + { + "epoch": 0.6809849130425711, + "grad_norm": 0.7578125, + "learning_rate": 0.00013202969835241136, + "loss": 0.7927, + "step": 26521 + }, + { + "epoch": 0.681010590238493, + "grad_norm": 0.734375, + "learning_rate": 0.00013202546930553635, + "loss": 0.7441, + "step": 26522 + }, + { + "epoch": 0.6810362674344148, + "grad_norm": 0.80078125, + "learning_rate": 0.00013202124019483748, + "loss": 0.8918, + "step": 26523 + }, + { + "epoch": 0.6810619446303366, + "grad_norm": 0.76953125, + "learning_rate": 0.00013201701102032317, + "loss": 0.9145, + "step": 26524 + }, + { + "epoch": 0.6810876218262585, + "grad_norm": 0.77734375, + "learning_rate": 0.00013201278178200186, + "loss": 0.9161, + "step": 26525 + }, + { + "epoch": 0.6811132990221803, + "grad_norm": 0.88671875, + "learning_rate": 0.00013200855247988194, + "loss": 0.8058, + "step": 26526 + }, + { + "epoch": 0.6811389762181022, + "grad_norm": 0.82421875, + "learning_rate": 0.00013200432311397188, + "loss": 0.8467, + "step": 26527 + }, + { + "epoch": 0.6811646534140239, + "grad_norm": 0.7890625, + "learning_rate": 0.00013200009368428007, + "loss": 0.8118, + "step": 26528 + }, + { + "epoch": 0.6811903306099457, + "grad_norm": 0.8046875, + "learning_rate": 0.000131995864190815, + "loss": 0.9785, + "step": 26529 + }, + { + "epoch": 0.6812160078058676, + "grad_norm": 0.7578125, + "learning_rate": 0.00013199163463358505, + "loss": 0.7786, + "step": 26530 + }, + { + "epoch": 0.6812416850017894, + "grad_norm": 0.82421875, + "learning_rate": 0.00013198740501259865, + "loss": 0.7613, + "step": 26531 + }, + { + "epoch": 0.6812673621977112, + "grad_norm": 0.73828125, + "learning_rate": 0.00013198317532786427, + "loss": 0.8499, + "step": 26532 + }, + { + "epoch": 0.6812930393936331, + "grad_norm": 0.87890625, + "learning_rate": 0.0001319789455793903, + "loss": 0.9403, + "step": 26533 + }, + { + "epoch": 0.6813187165895548, + "grad_norm": 0.8203125, + "learning_rate": 0.00013197471576718517, + "loss": 0.8673, + "step": 26534 + }, + { + "epoch": 0.6813443937854766, + "grad_norm": 0.84375, + "learning_rate": 0.00013197048589125732, + "loss": 0.9986, + "step": 26535 + }, + { + "epoch": 0.6813700709813985, + "grad_norm": 0.7109375, + "learning_rate": 0.00013196625595161517, + "loss": 0.8294, + "step": 26536 + }, + { + "epoch": 0.6813957481773203, + "grad_norm": 0.703125, + "learning_rate": 0.0001319620259482672, + "loss": 0.7357, + "step": 26537 + }, + { + "epoch": 0.6814214253732421, + "grad_norm": 0.890625, + "learning_rate": 0.00013195779588122178, + "loss": 0.9094, + "step": 26538 + }, + { + "epoch": 0.681447102569164, + "grad_norm": 0.82421875, + "learning_rate": 0.00013195356575048734, + "loss": 0.7872, + "step": 26539 + }, + { + "epoch": 0.6814727797650858, + "grad_norm": 0.84375, + "learning_rate": 0.00013194933555607234, + "loss": 0.7227, + "step": 26540 + }, + { + "epoch": 0.6814984569610075, + "grad_norm": 0.78125, + "learning_rate": 0.00013194510529798521, + "loss": 0.7131, + "step": 26541 + }, + { + "epoch": 0.6815241341569294, + "grad_norm": 0.72265625, + "learning_rate": 0.00013194087497623436, + "loss": 0.8768, + "step": 26542 + }, + { + "epoch": 0.6815498113528512, + "grad_norm": 0.77734375, + "learning_rate": 0.00013193664459082823, + "loss": 0.9291, + "step": 26543 + }, + { + "epoch": 0.681575488548773, + "grad_norm": 0.77734375, + "learning_rate": 0.00013193241414177523, + "loss": 0.9402, + "step": 26544 + }, + { + "epoch": 0.6816011657446949, + "grad_norm": 0.79296875, + "learning_rate": 0.00013192818362908388, + "loss": 0.8536, + "step": 26545 + }, + { + "epoch": 0.6816268429406167, + "grad_norm": 0.80859375, + "learning_rate": 0.00013192395305276249, + "loss": 0.843, + "step": 26546 + }, + { + "epoch": 0.6816525201365385, + "grad_norm": 0.83203125, + "learning_rate": 0.00013191972241281957, + "loss": 0.8622, + "step": 26547 + }, + { + "epoch": 0.6816781973324603, + "grad_norm": 0.8125, + "learning_rate": 0.0001319154917092635, + "loss": 0.8977, + "step": 26548 + }, + { + "epoch": 0.6817038745283821, + "grad_norm": 0.91015625, + "learning_rate": 0.00013191126094210274, + "loss": 0.8523, + "step": 26549 + }, + { + "epoch": 0.6817295517243039, + "grad_norm": 0.9453125, + "learning_rate": 0.00013190703011134572, + "loss": 0.9071, + "step": 26550 + }, + { + "epoch": 0.6817552289202258, + "grad_norm": 0.7265625, + "learning_rate": 0.00013190279921700091, + "loss": 0.9197, + "step": 26551 + }, + { + "epoch": 0.6817809061161476, + "grad_norm": 0.81640625, + "learning_rate": 0.00013189856825907668, + "loss": 0.8649, + "step": 26552 + }, + { + "epoch": 0.6818065833120694, + "grad_norm": 0.77734375, + "learning_rate": 0.00013189433723758147, + "loss": 0.8129, + "step": 26553 + }, + { + "epoch": 0.6818322605079912, + "grad_norm": 0.78515625, + "learning_rate": 0.00013189010615252374, + "loss": 0.9473, + "step": 26554 + }, + { + "epoch": 0.681857937703913, + "grad_norm": 0.796875, + "learning_rate": 0.0001318858750039119, + "loss": 0.7475, + "step": 26555 + }, + { + "epoch": 0.6818836148998348, + "grad_norm": 0.8046875, + "learning_rate": 0.0001318816437917544, + "loss": 1.0872, + "step": 26556 + }, + { + "epoch": 0.6819092920957567, + "grad_norm": 0.7734375, + "learning_rate": 0.00013187741251605964, + "loss": 0.8448, + "step": 26557 + }, + { + "epoch": 0.6819349692916785, + "grad_norm": 0.79296875, + "learning_rate": 0.00013187318117683614, + "loss": 0.7932, + "step": 26558 + }, + { + "epoch": 0.6819606464876004, + "grad_norm": 0.9609375, + "learning_rate": 0.00013186894977409224, + "loss": 0.9481, + "step": 26559 + }, + { + "epoch": 0.6819863236835222, + "grad_norm": 0.76171875, + "learning_rate": 0.00013186471830783634, + "loss": 0.8829, + "step": 26560 + }, + { + "epoch": 0.6820120008794439, + "grad_norm": 0.73828125, + "learning_rate": 0.000131860486778077, + "loss": 0.758, + "step": 26561 + }, + { + "epoch": 0.6820376780753657, + "grad_norm": 0.81640625, + "learning_rate": 0.0001318562551848226, + "loss": 0.7714, + "step": 26562 + }, + { + "epoch": 0.6820633552712876, + "grad_norm": 0.8046875, + "learning_rate": 0.0001318520235280815, + "loss": 0.948, + "step": 26563 + }, + { + "epoch": 0.6820890324672094, + "grad_norm": 0.78125, + "learning_rate": 0.00013184779180786224, + "loss": 0.8053, + "step": 26564 + }, + { + "epoch": 0.6821147096631313, + "grad_norm": 0.81640625, + "learning_rate": 0.0001318435600241732, + "loss": 0.8987, + "step": 26565 + }, + { + "epoch": 0.6821403868590531, + "grad_norm": 0.7265625, + "learning_rate": 0.0001318393281770228, + "loss": 0.716, + "step": 26566 + }, + { + "epoch": 0.6821660640549749, + "grad_norm": 0.7734375, + "learning_rate": 0.00013183509626641952, + "loss": 0.9491, + "step": 26567 + }, + { + "epoch": 0.6821917412508967, + "grad_norm": 0.74609375, + "learning_rate": 0.00013183086429237174, + "loss": 0.8562, + "step": 26568 + }, + { + "epoch": 0.6822174184468185, + "grad_norm": 0.7890625, + "learning_rate": 0.00013182663225488797, + "loss": 0.8369, + "step": 26569 + }, + { + "epoch": 0.6822430956427403, + "grad_norm": 0.73046875, + "learning_rate": 0.00013182240015397655, + "loss": 0.7419, + "step": 26570 + }, + { + "epoch": 0.6822687728386622, + "grad_norm": 0.71875, + "learning_rate": 0.000131818167989646, + "loss": 0.8221, + "step": 26571 + }, + { + "epoch": 0.682294450034584, + "grad_norm": 0.76953125, + "learning_rate": 0.0001318139357619047, + "loss": 0.7978, + "step": 26572 + }, + { + "epoch": 0.6823201272305058, + "grad_norm": 0.76171875, + "learning_rate": 0.0001318097034707611, + "loss": 0.7413, + "step": 26573 + }, + { + "epoch": 0.6823458044264276, + "grad_norm": 0.79296875, + "learning_rate": 0.00013180547111622362, + "loss": 0.9105, + "step": 26574 + }, + { + "epoch": 0.6823714816223494, + "grad_norm": 0.7109375, + "learning_rate": 0.00013180123869830073, + "loss": 0.8445, + "step": 26575 + }, + { + "epoch": 0.6823971588182712, + "grad_norm": 0.89453125, + "learning_rate": 0.00013179700621700083, + "loss": 0.8169, + "step": 26576 + }, + { + "epoch": 0.6824228360141931, + "grad_norm": 0.80078125, + "learning_rate": 0.00013179277367233235, + "loss": 0.8167, + "step": 26577 + }, + { + "epoch": 0.6824485132101149, + "grad_norm": 0.7890625, + "learning_rate": 0.00013178854106430377, + "loss": 0.7722, + "step": 26578 + }, + { + "epoch": 0.6824741904060367, + "grad_norm": 0.828125, + "learning_rate": 0.0001317843083929235, + "loss": 0.9609, + "step": 26579 + }, + { + "epoch": 0.6824998676019586, + "grad_norm": 0.76953125, + "learning_rate": 0.0001317800756582, + "loss": 0.9489, + "step": 26580 + }, + { + "epoch": 0.6825255447978803, + "grad_norm": 0.83984375, + "learning_rate": 0.00013177584286014162, + "loss": 0.9862, + "step": 26581 + }, + { + "epoch": 0.6825512219938021, + "grad_norm": 0.7109375, + "learning_rate": 0.0001317716099987569, + "loss": 0.7465, + "step": 26582 + }, + { + "epoch": 0.682576899189724, + "grad_norm": 0.80078125, + "learning_rate": 0.0001317673770740542, + "loss": 0.7929, + "step": 26583 + }, + { + "epoch": 0.6826025763856458, + "grad_norm": 0.8828125, + "learning_rate": 0.000131763144086042, + "loss": 0.7724, + "step": 26584 + }, + { + "epoch": 0.6826282535815676, + "grad_norm": 0.71875, + "learning_rate": 0.00013175891103472872, + "loss": 0.7361, + "step": 26585 + }, + { + "epoch": 0.6826539307774895, + "grad_norm": 0.796875, + "learning_rate": 0.00013175467792012283, + "loss": 0.8296, + "step": 26586 + }, + { + "epoch": 0.6826796079734113, + "grad_norm": 0.703125, + "learning_rate": 0.00013175044474223268, + "loss": 0.8802, + "step": 26587 + }, + { + "epoch": 0.682705285169333, + "grad_norm": 0.78515625, + "learning_rate": 0.00013174621150106682, + "loss": 0.8134, + "step": 26588 + }, + { + "epoch": 0.6827309623652549, + "grad_norm": 0.8359375, + "learning_rate": 0.00013174197819663356, + "loss": 0.8591, + "step": 26589 + }, + { + "epoch": 0.6827566395611767, + "grad_norm": 0.8203125, + "learning_rate": 0.00013173774482894146, + "loss": 0.8457, + "step": 26590 + }, + { + "epoch": 0.6827823167570986, + "grad_norm": 0.75390625, + "learning_rate": 0.0001317335113979989, + "loss": 0.8545, + "step": 26591 + }, + { + "epoch": 0.6828079939530204, + "grad_norm": 0.8203125, + "learning_rate": 0.0001317292779038143, + "loss": 0.8631, + "step": 26592 + }, + { + "epoch": 0.6828336711489422, + "grad_norm": 0.79296875, + "learning_rate": 0.00013172504434639612, + "loss": 0.8374, + "step": 26593 + }, + { + "epoch": 0.682859348344864, + "grad_norm": 0.75390625, + "learning_rate": 0.00013172081072575277, + "loss": 0.7184, + "step": 26594 + }, + { + "epoch": 0.6828850255407858, + "grad_norm": 0.796875, + "learning_rate": 0.00013171657704189274, + "loss": 0.7093, + "step": 26595 + }, + { + "epoch": 0.6829107027367076, + "grad_norm": 0.8046875, + "learning_rate": 0.00013171234329482445, + "loss": 0.7977, + "step": 26596 + }, + { + "epoch": 0.6829363799326295, + "grad_norm": 0.765625, + "learning_rate": 0.00013170810948455626, + "loss": 0.7155, + "step": 26597 + }, + { + "epoch": 0.6829620571285513, + "grad_norm": 0.80078125, + "learning_rate": 0.00013170387561109673, + "loss": 0.7622, + "step": 26598 + }, + { + "epoch": 0.6829877343244731, + "grad_norm": 0.8828125, + "learning_rate": 0.00013169964167445425, + "loss": 0.8445, + "step": 26599 + }, + { + "epoch": 0.683013411520395, + "grad_norm": 0.765625, + "learning_rate": 0.0001316954076746372, + "loss": 0.845, + "step": 26600 + }, + { + "epoch": 0.6830390887163167, + "grad_norm": 0.9765625, + "learning_rate": 0.0001316911736116541, + "loss": 0.8214, + "step": 26601 + }, + { + "epoch": 0.6830647659122385, + "grad_norm": 0.80859375, + "learning_rate": 0.0001316869394855133, + "loss": 0.8129, + "step": 26602 + }, + { + "epoch": 0.6830904431081604, + "grad_norm": 0.796875, + "learning_rate": 0.00013168270529622338, + "loss": 0.8227, + "step": 26603 + }, + { + "epoch": 0.6831161203040822, + "grad_norm": 0.7109375, + "learning_rate": 0.00013167847104379266, + "loss": 0.8564, + "step": 26604 + }, + { + "epoch": 0.683141797500004, + "grad_norm": 0.86328125, + "learning_rate": 0.00013167423672822955, + "loss": 0.8463, + "step": 26605 + }, + { + "epoch": 0.6831674746959259, + "grad_norm": 0.75, + "learning_rate": 0.00013167000234954262, + "loss": 0.8819, + "step": 26606 + }, + { + "epoch": 0.6831931518918477, + "grad_norm": 0.7265625, + "learning_rate": 0.0001316657679077402, + "loss": 0.8634, + "step": 26607 + }, + { + "epoch": 0.6832188290877694, + "grad_norm": 0.9375, + "learning_rate": 0.00013166153340283073, + "loss": 0.8305, + "step": 26608 + }, + { + "epoch": 0.6832445062836913, + "grad_norm": 0.86328125, + "learning_rate": 0.00013165729883482275, + "loss": 0.844, + "step": 26609 + }, + { + "epoch": 0.6832701834796131, + "grad_norm": 0.859375, + "learning_rate": 0.0001316530642037246, + "loss": 0.8213, + "step": 26610 + }, + { + "epoch": 0.6832958606755349, + "grad_norm": 0.8671875, + "learning_rate": 0.0001316488295095448, + "loss": 0.7456, + "step": 26611 + }, + { + "epoch": 0.6833215378714568, + "grad_norm": 0.8828125, + "learning_rate": 0.0001316445947522917, + "loss": 0.9356, + "step": 26612 + }, + { + "epoch": 0.6833472150673786, + "grad_norm": 0.75390625, + "learning_rate": 0.00013164035993197375, + "loss": 0.7296, + "step": 26613 + }, + { + "epoch": 0.6833728922633003, + "grad_norm": 0.76171875, + "learning_rate": 0.00013163612504859946, + "loss": 0.8054, + "step": 26614 + }, + { + "epoch": 0.6833985694592222, + "grad_norm": 0.77734375, + "learning_rate": 0.00013163189010217721, + "loss": 0.8234, + "step": 26615 + }, + { + "epoch": 0.683424246655144, + "grad_norm": 0.828125, + "learning_rate": 0.00013162765509271547, + "loss": 0.8836, + "step": 26616 + }, + { + "epoch": 0.6834499238510658, + "grad_norm": 0.94921875, + "learning_rate": 0.0001316234200202227, + "loss": 0.8398, + "step": 26617 + }, + { + "epoch": 0.6834756010469877, + "grad_norm": 0.76171875, + "learning_rate": 0.00013161918488470726, + "loss": 0.7819, + "step": 26618 + }, + { + "epoch": 0.6835012782429095, + "grad_norm": 0.77734375, + "learning_rate": 0.00013161494968617768, + "loss": 0.8147, + "step": 26619 + }, + { + "epoch": 0.6835269554388314, + "grad_norm": 0.76171875, + "learning_rate": 0.00013161071442464235, + "loss": 0.8438, + "step": 26620 + }, + { + "epoch": 0.6835526326347531, + "grad_norm": 1.140625, + "learning_rate": 0.0001316064791001097, + "loss": 0.8383, + "step": 26621 + }, + { + "epoch": 0.6835783098306749, + "grad_norm": 0.65625, + "learning_rate": 0.0001316022437125882, + "loss": 0.7027, + "step": 26622 + }, + { + "epoch": 0.6836039870265967, + "grad_norm": 0.79296875, + "learning_rate": 0.0001315980082620863, + "loss": 0.8089, + "step": 26623 + }, + { + "epoch": 0.6836296642225186, + "grad_norm": 0.81640625, + "learning_rate": 0.0001315937727486124, + "loss": 0.7193, + "step": 26624 + }, + { + "epoch": 0.6836553414184404, + "grad_norm": 0.7734375, + "learning_rate": 0.000131589537172175, + "loss": 0.7554, + "step": 26625 + }, + { + "epoch": 0.6836810186143623, + "grad_norm": 0.796875, + "learning_rate": 0.0001315853015327825, + "loss": 0.8394, + "step": 26626 + }, + { + "epoch": 0.6837066958102841, + "grad_norm": 0.84765625, + "learning_rate": 0.0001315810658304433, + "loss": 0.9157, + "step": 26627 + }, + { + "epoch": 0.6837323730062058, + "grad_norm": 0.78125, + "learning_rate": 0.00013157683006516594, + "loss": 0.833, + "step": 26628 + }, + { + "epoch": 0.6837580502021277, + "grad_norm": 0.72265625, + "learning_rate": 0.0001315725942369588, + "loss": 0.7206, + "step": 26629 + }, + { + "epoch": 0.6837837273980495, + "grad_norm": 0.77734375, + "learning_rate": 0.00013156835834583032, + "loss": 0.8397, + "step": 26630 + }, + { + "epoch": 0.6838094045939713, + "grad_norm": 0.734375, + "learning_rate": 0.00013156412239178898, + "loss": 0.8554, + "step": 26631 + }, + { + "epoch": 0.6838350817898932, + "grad_norm": 0.796875, + "learning_rate": 0.00013155988637484316, + "loss": 0.855, + "step": 26632 + }, + { + "epoch": 0.683860758985815, + "grad_norm": 0.7421875, + "learning_rate": 0.00013155565029500136, + "loss": 0.8896, + "step": 26633 + }, + { + "epoch": 0.6838864361817367, + "grad_norm": 0.796875, + "learning_rate": 0.00013155141415227198, + "loss": 0.8109, + "step": 26634 + }, + { + "epoch": 0.6839121133776586, + "grad_norm": 0.765625, + "learning_rate": 0.0001315471779466635, + "loss": 0.8312, + "step": 26635 + }, + { + "epoch": 0.6839377905735804, + "grad_norm": 0.84375, + "learning_rate": 0.00013154294167818433, + "loss": 0.9622, + "step": 26636 + }, + { + "epoch": 0.6839634677695022, + "grad_norm": 0.80078125, + "learning_rate": 0.00013153870534684292, + "loss": 0.851, + "step": 26637 + }, + { + "epoch": 0.6839891449654241, + "grad_norm": 0.78125, + "learning_rate": 0.00013153446895264773, + "loss": 0.9827, + "step": 26638 + }, + { + "epoch": 0.6840148221613459, + "grad_norm": 0.73046875, + "learning_rate": 0.0001315302324956072, + "loss": 0.8658, + "step": 26639 + }, + { + "epoch": 0.6840404993572677, + "grad_norm": 0.79296875, + "learning_rate": 0.00013152599597572974, + "loss": 0.8773, + "step": 26640 + }, + { + "epoch": 0.6840661765531895, + "grad_norm": 0.83203125, + "learning_rate": 0.00013152175939302384, + "loss": 0.9731, + "step": 26641 + }, + { + "epoch": 0.6840918537491113, + "grad_norm": 0.69921875, + "learning_rate": 0.00013151752274749793, + "loss": 0.7186, + "step": 26642 + }, + { + "epoch": 0.6841175309450331, + "grad_norm": 0.8125, + "learning_rate": 0.00013151328603916042, + "loss": 0.8385, + "step": 26643 + }, + { + "epoch": 0.684143208140955, + "grad_norm": 0.72265625, + "learning_rate": 0.00013150904926801982, + "loss": 0.7045, + "step": 26644 + }, + { + "epoch": 0.6841688853368768, + "grad_norm": 0.8359375, + "learning_rate": 0.00013150481243408448, + "loss": 0.7759, + "step": 26645 + }, + { + "epoch": 0.6841945625327986, + "grad_norm": 0.84375, + "learning_rate": 0.0001315005755373629, + "loss": 0.7838, + "step": 26646 + }, + { + "epoch": 0.6842202397287204, + "grad_norm": 0.7578125, + "learning_rate": 0.00013149633857786354, + "loss": 0.8104, + "step": 26647 + }, + { + "epoch": 0.6842459169246422, + "grad_norm": 0.859375, + "learning_rate": 0.0001314921015555948, + "loss": 0.8589, + "step": 26648 + }, + { + "epoch": 0.684271594120564, + "grad_norm": 0.72265625, + "learning_rate": 0.00013148786447056518, + "loss": 0.9313, + "step": 26649 + }, + { + "epoch": 0.6842972713164859, + "grad_norm": 0.734375, + "learning_rate": 0.00013148362732278308, + "loss": 0.8147, + "step": 26650 + }, + { + "epoch": 0.6843229485124077, + "grad_norm": 0.79296875, + "learning_rate": 0.00013147939011225694, + "loss": 0.8166, + "step": 26651 + }, + { + "epoch": 0.6843486257083295, + "grad_norm": 0.8515625, + "learning_rate": 0.00013147515283899521, + "loss": 0.9459, + "step": 26652 + }, + { + "epoch": 0.6843743029042514, + "grad_norm": 0.71875, + "learning_rate": 0.00013147091550300637, + "loss": 0.7479, + "step": 26653 + }, + { + "epoch": 0.6843999801001731, + "grad_norm": 0.76953125, + "learning_rate": 0.0001314666781042988, + "loss": 0.8571, + "step": 26654 + }, + { + "epoch": 0.684425657296095, + "grad_norm": 0.8515625, + "learning_rate": 0.000131462440642881, + "loss": 0.9417, + "step": 26655 + }, + { + "epoch": 0.6844513344920168, + "grad_norm": 0.8671875, + "learning_rate": 0.00013145820311876144, + "loss": 0.8204, + "step": 26656 + }, + { + "epoch": 0.6844770116879386, + "grad_norm": 0.7578125, + "learning_rate": 0.0001314539655319485, + "loss": 0.8752, + "step": 26657 + }, + { + "epoch": 0.6845026888838605, + "grad_norm": 0.84765625, + "learning_rate": 0.0001314497278824506, + "loss": 0.8753, + "step": 26658 + }, + { + "epoch": 0.6845283660797823, + "grad_norm": 0.83203125, + "learning_rate": 0.00013144549017027628, + "loss": 0.9382, + "step": 26659 + }, + { + "epoch": 0.6845540432757041, + "grad_norm": 0.75390625, + "learning_rate": 0.00013144125239543394, + "loss": 0.8271, + "step": 26660 + }, + { + "epoch": 0.6845797204716259, + "grad_norm": 0.796875, + "learning_rate": 0.000131437014557932, + "loss": 0.8969, + "step": 26661 + }, + { + "epoch": 0.6846053976675477, + "grad_norm": 0.76171875, + "learning_rate": 0.00013143277665777893, + "loss": 0.8532, + "step": 26662 + }, + { + "epoch": 0.6846310748634695, + "grad_norm": 0.74609375, + "learning_rate": 0.0001314285386949832, + "loss": 0.7375, + "step": 26663 + }, + { + "epoch": 0.6846567520593914, + "grad_norm": 0.71484375, + "learning_rate": 0.00013142430066955324, + "loss": 0.7495, + "step": 26664 + }, + { + "epoch": 0.6846824292553132, + "grad_norm": 0.7421875, + "learning_rate": 0.00013142006258149745, + "loss": 0.6703, + "step": 26665 + }, + { + "epoch": 0.684708106451235, + "grad_norm": 0.73828125, + "learning_rate": 0.00013141582443082435, + "loss": 0.9208, + "step": 26666 + }, + { + "epoch": 0.6847337836471568, + "grad_norm": 0.80859375, + "learning_rate": 0.00013141158621754232, + "loss": 0.9397, + "step": 26667 + }, + { + "epoch": 0.6847594608430786, + "grad_norm": 0.8046875, + "learning_rate": 0.00013140734794165986, + "loss": 0.7836, + "step": 26668 + }, + { + "epoch": 0.6847851380390004, + "grad_norm": 0.90625, + "learning_rate": 0.00013140310960318535, + "loss": 0.8126, + "step": 26669 + }, + { + "epoch": 0.6848108152349223, + "grad_norm": 0.7890625, + "learning_rate": 0.00013139887120212733, + "loss": 0.853, + "step": 26670 + }, + { + "epoch": 0.6848364924308441, + "grad_norm": 0.7578125, + "learning_rate": 0.00013139463273849415, + "loss": 0.7712, + "step": 26671 + }, + { + "epoch": 0.6848621696267659, + "grad_norm": 0.80078125, + "learning_rate": 0.00013139039421229432, + "loss": 0.8154, + "step": 26672 + }, + { + "epoch": 0.6848878468226878, + "grad_norm": 0.78515625, + "learning_rate": 0.0001313861556235363, + "loss": 0.7949, + "step": 26673 + }, + { + "epoch": 0.6849135240186095, + "grad_norm": 0.7421875, + "learning_rate": 0.00013138191697222845, + "loss": 0.7206, + "step": 26674 + }, + { + "epoch": 0.6849392012145313, + "grad_norm": 0.73828125, + "learning_rate": 0.00013137767825837932, + "loss": 0.759, + "step": 26675 + }, + { + "epoch": 0.6849648784104532, + "grad_norm": 0.8046875, + "learning_rate": 0.00013137343948199732, + "loss": 0.8588, + "step": 26676 + }, + { + "epoch": 0.684990555606375, + "grad_norm": 0.82421875, + "learning_rate": 0.00013136920064309087, + "loss": 0.8185, + "step": 26677 + }, + { + "epoch": 0.6850162328022968, + "grad_norm": 0.7734375, + "learning_rate": 0.00013136496174166843, + "loss": 0.8553, + "step": 26678 + }, + { + "epoch": 0.6850419099982187, + "grad_norm": 0.703125, + "learning_rate": 0.00013136072277773845, + "loss": 0.7599, + "step": 26679 + }, + { + "epoch": 0.6850675871941405, + "grad_norm": 0.84375, + "learning_rate": 0.0001313564837513094, + "loss": 0.8678, + "step": 26680 + }, + { + "epoch": 0.6850932643900622, + "grad_norm": 0.73046875, + "learning_rate": 0.00013135224466238968, + "loss": 0.8276, + "step": 26681 + }, + { + "epoch": 0.6851189415859841, + "grad_norm": 0.78515625, + "learning_rate": 0.0001313480055109878, + "loss": 0.8819, + "step": 26682 + }, + { + "epoch": 0.6851446187819059, + "grad_norm": 0.7109375, + "learning_rate": 0.00013134376629711218, + "loss": 0.8196, + "step": 26683 + }, + { + "epoch": 0.6851702959778277, + "grad_norm": 0.74609375, + "learning_rate": 0.00013133952702077125, + "loss": 0.8836, + "step": 26684 + }, + { + "epoch": 0.6851959731737496, + "grad_norm": 0.77734375, + "learning_rate": 0.00013133528768197346, + "loss": 0.7223, + "step": 26685 + }, + { + "epoch": 0.6852216503696714, + "grad_norm": 0.765625, + "learning_rate": 0.0001313310482807273, + "loss": 0.8283, + "step": 26686 + }, + { + "epoch": 0.6852473275655931, + "grad_norm": 0.7890625, + "learning_rate": 0.00013132680881704117, + "loss": 0.8003, + "step": 26687 + }, + { + "epoch": 0.685273004761515, + "grad_norm": 0.77734375, + "learning_rate": 0.00013132256929092355, + "loss": 0.8494, + "step": 26688 + }, + { + "epoch": 0.6852986819574368, + "grad_norm": 0.7578125, + "learning_rate": 0.00013131832970238286, + "loss": 0.8497, + "step": 26689 + }, + { + "epoch": 0.6853243591533587, + "grad_norm": 0.73828125, + "learning_rate": 0.0001313140900514276, + "loss": 0.8335, + "step": 26690 + }, + { + "epoch": 0.6853500363492805, + "grad_norm": 0.734375, + "learning_rate": 0.00013130985033806618, + "loss": 0.869, + "step": 26691 + }, + { + "epoch": 0.6853757135452023, + "grad_norm": 0.7890625, + "learning_rate": 0.00013130561056230706, + "loss": 0.7838, + "step": 26692 + }, + { + "epoch": 0.6854013907411242, + "grad_norm": 0.78515625, + "learning_rate": 0.00013130137072415866, + "loss": 0.9254, + "step": 26693 + }, + { + "epoch": 0.6854270679370459, + "grad_norm": 0.72265625, + "learning_rate": 0.00013129713082362946, + "loss": 0.7513, + "step": 26694 + }, + { + "epoch": 0.6854527451329677, + "grad_norm": 0.87109375, + "learning_rate": 0.00013129289086072794, + "loss": 0.8283, + "step": 26695 + }, + { + "epoch": 0.6854784223288896, + "grad_norm": 0.84765625, + "learning_rate": 0.0001312886508354625, + "loss": 0.6823, + "step": 26696 + }, + { + "epoch": 0.6855040995248114, + "grad_norm": 0.84765625, + "learning_rate": 0.0001312844107478416, + "loss": 0.9094, + "step": 26697 + }, + { + "epoch": 0.6855297767207332, + "grad_norm": 0.7890625, + "learning_rate": 0.00013128017059787367, + "loss": 0.6803, + "step": 26698 + }, + { + "epoch": 0.6855554539166551, + "grad_norm": 0.77734375, + "learning_rate": 0.0001312759303855672, + "loss": 0.8522, + "step": 26699 + }, + { + "epoch": 0.6855811311125769, + "grad_norm": 0.75390625, + "learning_rate": 0.00013127169011093064, + "loss": 0.7814, + "step": 26700 + }, + { + "epoch": 0.6856068083084986, + "grad_norm": 0.78125, + "learning_rate": 0.0001312674497739724, + "loss": 0.9144, + "step": 26701 + }, + { + "epoch": 0.6856324855044205, + "grad_norm": 0.78515625, + "learning_rate": 0.000131263209374701, + "loss": 0.8447, + "step": 26702 + }, + { + "epoch": 0.6856581627003423, + "grad_norm": 0.80078125, + "learning_rate": 0.00013125896891312482, + "loss": 0.9791, + "step": 26703 + }, + { + "epoch": 0.6856838398962641, + "grad_norm": 0.6796875, + "learning_rate": 0.00013125472838925234, + "loss": 0.8586, + "step": 26704 + }, + { + "epoch": 0.685709517092186, + "grad_norm": 0.90625, + "learning_rate": 0.00013125048780309202, + "loss": 0.8, + "step": 26705 + }, + { + "epoch": 0.6857351942881078, + "grad_norm": 0.76171875, + "learning_rate": 0.00013124624715465228, + "loss": 0.8624, + "step": 26706 + }, + { + "epoch": 0.6857608714840295, + "grad_norm": 0.75390625, + "learning_rate": 0.0001312420064439416, + "loss": 0.8979, + "step": 26707 + }, + { + "epoch": 0.6857865486799514, + "grad_norm": 0.70703125, + "learning_rate": 0.00013123776567096845, + "loss": 0.8023, + "step": 26708 + }, + { + "epoch": 0.6858122258758732, + "grad_norm": 0.74609375, + "learning_rate": 0.00013123352483574121, + "loss": 0.7861, + "step": 26709 + }, + { + "epoch": 0.685837903071795, + "grad_norm": 0.77734375, + "learning_rate": 0.00013122928393826842, + "loss": 0.8613, + "step": 26710 + }, + { + "epoch": 0.6858635802677169, + "grad_norm": 0.796875, + "learning_rate": 0.00013122504297855843, + "loss": 0.8087, + "step": 26711 + }, + { + "epoch": 0.6858892574636387, + "grad_norm": 0.77734375, + "learning_rate": 0.0001312208019566198, + "loss": 0.6896, + "step": 26712 + }, + { + "epoch": 0.6859149346595605, + "grad_norm": 0.78515625, + "learning_rate": 0.00013121656087246091, + "loss": 0.8923, + "step": 26713 + }, + { + "epoch": 0.6859406118554823, + "grad_norm": 0.83203125, + "learning_rate": 0.00013121231972609024, + "loss": 0.8425, + "step": 26714 + }, + { + "epoch": 0.6859662890514041, + "grad_norm": 0.84765625, + "learning_rate": 0.00013120807851751625, + "loss": 0.905, + "step": 26715 + }, + { + "epoch": 0.685991966247326, + "grad_norm": 0.99609375, + "learning_rate": 0.00013120383724674737, + "loss": 0.8624, + "step": 26716 + }, + { + "epoch": 0.6860176434432478, + "grad_norm": 0.74609375, + "learning_rate": 0.00013119959591379208, + "loss": 0.8075, + "step": 26717 + }, + { + "epoch": 0.6860433206391696, + "grad_norm": 0.7578125, + "learning_rate": 0.00013119535451865878, + "loss": 0.7922, + "step": 26718 + }, + { + "epoch": 0.6860689978350915, + "grad_norm": 0.76953125, + "learning_rate": 0.00013119111306135594, + "loss": 0.8075, + "step": 26719 + }, + { + "epoch": 0.6860946750310133, + "grad_norm": 0.8046875, + "learning_rate": 0.00013118687154189206, + "loss": 0.9635, + "step": 26720 + }, + { + "epoch": 0.686120352226935, + "grad_norm": 0.73828125, + "learning_rate": 0.00013118262996027557, + "loss": 0.9683, + "step": 26721 + }, + { + "epoch": 0.6861460294228569, + "grad_norm": 0.74609375, + "learning_rate": 0.00013117838831651487, + "loss": 0.7599, + "step": 26722 + }, + { + "epoch": 0.6861717066187787, + "grad_norm": 0.73828125, + "learning_rate": 0.00013117414661061854, + "loss": 0.7276, + "step": 26723 + }, + { + "epoch": 0.6861973838147005, + "grad_norm": 0.7734375, + "learning_rate": 0.0001311699048425949, + "loss": 0.7434, + "step": 26724 + }, + { + "epoch": 0.6862230610106224, + "grad_norm": 0.70703125, + "learning_rate": 0.00013116566301245246, + "loss": 0.764, + "step": 26725 + }, + { + "epoch": 0.6862487382065442, + "grad_norm": 0.78515625, + "learning_rate": 0.00013116142112019967, + "loss": 0.8057, + "step": 26726 + }, + { + "epoch": 0.6862744154024659, + "grad_norm": 0.75, + "learning_rate": 0.00013115717916584497, + "loss": 0.8803, + "step": 26727 + }, + { + "epoch": 0.6863000925983878, + "grad_norm": 0.734375, + "learning_rate": 0.00013115293714939682, + "loss": 0.7659, + "step": 26728 + }, + { + "epoch": 0.6863257697943096, + "grad_norm": 0.78125, + "learning_rate": 0.00013114869507086372, + "loss": 0.7385, + "step": 26729 + }, + { + "epoch": 0.6863514469902314, + "grad_norm": 0.8125, + "learning_rate": 0.00013114445293025407, + "loss": 0.9016, + "step": 26730 + }, + { + "epoch": 0.6863771241861533, + "grad_norm": 0.72265625, + "learning_rate": 0.0001311402107275763, + "loss": 0.7456, + "step": 26731 + }, + { + "epoch": 0.6864028013820751, + "grad_norm": 0.7734375, + "learning_rate": 0.00013113596846283895, + "loss": 0.7849, + "step": 26732 + }, + { + "epoch": 0.6864284785779969, + "grad_norm": 0.80859375, + "learning_rate": 0.0001311317261360504, + "loss": 0.8104, + "step": 26733 + }, + { + "epoch": 0.6864541557739187, + "grad_norm": 0.88671875, + "learning_rate": 0.00013112748374721916, + "loss": 0.9181, + "step": 26734 + }, + { + "epoch": 0.6864798329698405, + "grad_norm": 0.72265625, + "learning_rate": 0.00013112324129635364, + "loss": 0.8595, + "step": 26735 + }, + { + "epoch": 0.6865055101657623, + "grad_norm": 0.7890625, + "learning_rate": 0.00013111899878346232, + "loss": 0.7788, + "step": 26736 + }, + { + "epoch": 0.6865311873616842, + "grad_norm": 0.80859375, + "learning_rate": 0.00013111475620855364, + "loss": 0.8111, + "step": 26737 + }, + { + "epoch": 0.686556864557606, + "grad_norm": 0.72265625, + "learning_rate": 0.00013111051357163605, + "loss": 0.8616, + "step": 26738 + }, + { + "epoch": 0.6865825417535278, + "grad_norm": 0.796875, + "learning_rate": 0.00013110627087271803, + "loss": 0.8381, + "step": 26739 + }, + { + "epoch": 0.6866082189494497, + "grad_norm": 0.7734375, + "learning_rate": 0.000131102028111808, + "loss": 0.8351, + "step": 26740 + }, + { + "epoch": 0.6866338961453714, + "grad_norm": 0.796875, + "learning_rate": 0.00013109778528891446, + "loss": 0.9152, + "step": 26741 + }, + { + "epoch": 0.6866595733412932, + "grad_norm": 0.81640625, + "learning_rate": 0.00013109354240404585, + "loss": 0.8425, + "step": 26742 + }, + { + "epoch": 0.6866852505372151, + "grad_norm": 0.8203125, + "learning_rate": 0.0001310892994572106, + "loss": 0.8364, + "step": 26743 + }, + { + "epoch": 0.6867109277331369, + "grad_norm": 0.79296875, + "learning_rate": 0.00013108505644841717, + "loss": 0.7685, + "step": 26744 + }, + { + "epoch": 0.6867366049290587, + "grad_norm": 0.77734375, + "learning_rate": 0.00013108081337767405, + "loss": 0.971, + "step": 26745 + }, + { + "epoch": 0.6867622821249806, + "grad_norm": 0.828125, + "learning_rate": 0.0001310765702449897, + "loss": 0.8035, + "step": 26746 + }, + { + "epoch": 0.6867879593209023, + "grad_norm": 0.79296875, + "learning_rate": 0.00013107232705037252, + "loss": 0.8641, + "step": 26747 + }, + { + "epoch": 0.6868136365168241, + "grad_norm": 0.796875, + "learning_rate": 0.00013106808379383103, + "loss": 0.8245, + "step": 26748 + }, + { + "epoch": 0.686839313712746, + "grad_norm": 1.0, + "learning_rate": 0.00013106384047537363, + "loss": 0.8224, + "step": 26749 + }, + { + "epoch": 0.6868649909086678, + "grad_norm": 0.75390625, + "learning_rate": 0.00013105959709500882, + "loss": 0.8726, + "step": 26750 + }, + { + "epoch": 0.6868906681045897, + "grad_norm": 0.859375, + "learning_rate": 0.00013105535365274499, + "loss": 0.9096, + "step": 26751 + }, + { + "epoch": 0.6869163453005115, + "grad_norm": 0.8828125, + "learning_rate": 0.0001310511101485907, + "loss": 0.9887, + "step": 26752 + }, + { + "epoch": 0.6869420224964333, + "grad_norm": 0.8125, + "learning_rate": 0.00013104686658255434, + "loss": 0.8489, + "step": 26753 + }, + { + "epoch": 0.686967699692355, + "grad_norm": 0.78125, + "learning_rate": 0.00013104262295464433, + "loss": 0.9027, + "step": 26754 + }, + { + "epoch": 0.6869933768882769, + "grad_norm": 1.0078125, + "learning_rate": 0.00013103837926486925, + "loss": 0.9083, + "step": 26755 + }, + { + "epoch": 0.6870190540841987, + "grad_norm": 0.71484375, + "learning_rate": 0.00013103413551323744, + "loss": 0.8962, + "step": 26756 + }, + { + "epoch": 0.6870447312801206, + "grad_norm": 0.76171875, + "learning_rate": 0.00013102989169975742, + "loss": 0.8, + "step": 26757 + }, + { + "epoch": 0.6870704084760424, + "grad_norm": 0.80078125, + "learning_rate": 0.00013102564782443762, + "loss": 0.8349, + "step": 26758 + }, + { + "epoch": 0.6870960856719642, + "grad_norm": 0.8125, + "learning_rate": 0.0001310214038872865, + "loss": 0.7807, + "step": 26759 + }, + { + "epoch": 0.6871217628678861, + "grad_norm": 0.75390625, + "learning_rate": 0.00013101715988831253, + "loss": 0.8322, + "step": 26760 + }, + { + "epoch": 0.6871474400638078, + "grad_norm": 0.84375, + "learning_rate": 0.00013101291582752418, + "loss": 0.9378, + "step": 26761 + }, + { + "epoch": 0.6871731172597296, + "grad_norm": 0.8515625, + "learning_rate": 0.00013100867170492988, + "loss": 0.9034, + "step": 26762 + }, + { + "epoch": 0.6871987944556515, + "grad_norm": 0.86328125, + "learning_rate": 0.0001310044275205381, + "loss": 0.9549, + "step": 26763 + }, + { + "epoch": 0.6872244716515733, + "grad_norm": 0.75, + "learning_rate": 0.00013100018327435726, + "loss": 0.8164, + "step": 26764 + }, + { + "epoch": 0.6872501488474951, + "grad_norm": 0.796875, + "learning_rate": 0.00013099593896639592, + "loss": 0.7626, + "step": 26765 + }, + { + "epoch": 0.687275826043417, + "grad_norm": 0.82421875, + "learning_rate": 0.00013099169459666242, + "loss": 0.9608, + "step": 26766 + }, + { + "epoch": 0.6873015032393387, + "grad_norm": 0.80859375, + "learning_rate": 0.00013098745016516526, + "loss": 0.8403, + "step": 26767 + }, + { + "epoch": 0.6873271804352605, + "grad_norm": 0.7421875, + "learning_rate": 0.00013098320567191295, + "loss": 0.7987, + "step": 26768 + }, + { + "epoch": 0.6873528576311824, + "grad_norm": 0.76953125, + "learning_rate": 0.00013097896111691394, + "loss": 0.8291, + "step": 26769 + }, + { + "epoch": 0.6873785348271042, + "grad_norm": 0.69140625, + "learning_rate": 0.0001309747165001766, + "loss": 0.8359, + "step": 26770 + }, + { + "epoch": 0.687404212023026, + "grad_norm": 0.77734375, + "learning_rate": 0.00013097047182170947, + "loss": 0.8726, + "step": 26771 + }, + { + "epoch": 0.6874298892189479, + "grad_norm": 0.8046875, + "learning_rate": 0.000130966227081521, + "loss": 0.7616, + "step": 26772 + }, + { + "epoch": 0.6874555664148697, + "grad_norm": 0.80859375, + "learning_rate": 0.0001309619822796196, + "loss": 0.8936, + "step": 26773 + }, + { + "epoch": 0.6874812436107914, + "grad_norm": 0.765625, + "learning_rate": 0.0001309577374160138, + "loss": 0.7312, + "step": 26774 + }, + { + "epoch": 0.6875069208067133, + "grad_norm": 1.0703125, + "learning_rate": 0.000130953492490712, + "loss": 0.8168, + "step": 26775 + }, + { + "epoch": 0.6875325980026351, + "grad_norm": 0.74609375, + "learning_rate": 0.0001309492475037227, + "loss": 0.7474, + "step": 26776 + }, + { + "epoch": 0.687558275198557, + "grad_norm": 0.74609375, + "learning_rate": 0.00013094500245505438, + "loss": 0.8113, + "step": 26777 + }, + { + "epoch": 0.6875839523944788, + "grad_norm": 0.74609375, + "learning_rate": 0.00013094075734471541, + "loss": 0.7377, + "step": 26778 + }, + { + "epoch": 0.6876096295904006, + "grad_norm": 0.7890625, + "learning_rate": 0.00013093651217271432, + "loss": 0.9297, + "step": 26779 + }, + { + "epoch": 0.6876353067863225, + "grad_norm": 0.74609375, + "learning_rate": 0.0001309322669390596, + "loss": 0.8293, + "step": 26780 + }, + { + "epoch": 0.6876609839822442, + "grad_norm": 0.72265625, + "learning_rate": 0.00013092802164375956, + "loss": 0.7746, + "step": 26781 + }, + { + "epoch": 0.687686661178166, + "grad_norm": 0.78125, + "learning_rate": 0.00013092377628682288, + "loss": 0.7933, + "step": 26782 + }, + { + "epoch": 0.6877123383740879, + "grad_norm": 0.85546875, + "learning_rate": 0.00013091953086825787, + "loss": 0.7091, + "step": 26783 + }, + { + "epoch": 0.6877380155700097, + "grad_norm": 0.77734375, + "learning_rate": 0.00013091528538807299, + "loss": 0.76, + "step": 26784 + }, + { + "epoch": 0.6877636927659315, + "grad_norm": 0.80859375, + "learning_rate": 0.00013091103984627677, + "loss": 0.8538, + "step": 26785 + }, + { + "epoch": 0.6877893699618534, + "grad_norm": 0.83203125, + "learning_rate": 0.00013090679424287763, + "loss": 0.875, + "step": 26786 + }, + { + "epoch": 0.6878150471577751, + "grad_norm": 0.81640625, + "learning_rate": 0.00013090254857788403, + "loss": 0.8452, + "step": 26787 + }, + { + "epoch": 0.6878407243536969, + "grad_norm": 0.79296875, + "learning_rate": 0.00013089830285130448, + "loss": 0.7725, + "step": 26788 + }, + { + "epoch": 0.6878664015496188, + "grad_norm": 0.82421875, + "learning_rate": 0.0001308940570631474, + "loss": 0.8668, + "step": 26789 + }, + { + "epoch": 0.6878920787455406, + "grad_norm": 0.78125, + "learning_rate": 0.00013088981121342123, + "loss": 0.8852, + "step": 26790 + }, + { + "epoch": 0.6879177559414624, + "grad_norm": 0.7734375, + "learning_rate": 0.00013088556530213444, + "loss": 0.7253, + "step": 26791 + }, + { + "epoch": 0.6879434331373843, + "grad_norm": 0.81640625, + "learning_rate": 0.00013088131932929557, + "loss": 0.8883, + "step": 26792 + }, + { + "epoch": 0.6879691103333061, + "grad_norm": 0.83203125, + "learning_rate": 0.000130877073294913, + "loss": 0.8523, + "step": 26793 + }, + { + "epoch": 0.6879947875292278, + "grad_norm": 0.8359375, + "learning_rate": 0.00013087282719899515, + "loss": 1.027, + "step": 26794 + }, + { + "epoch": 0.6880204647251497, + "grad_norm": 0.76953125, + "learning_rate": 0.00013086858104155062, + "loss": 0.919, + "step": 26795 + }, + { + "epoch": 0.6880461419210715, + "grad_norm": 0.7890625, + "learning_rate": 0.00013086433482258775, + "loss": 0.8703, + "step": 26796 + }, + { + "epoch": 0.6880718191169933, + "grad_norm": 0.76953125, + "learning_rate": 0.00013086008854211508, + "loss": 0.861, + "step": 26797 + }, + { + "epoch": 0.6880974963129152, + "grad_norm": 0.72265625, + "learning_rate": 0.00013085584220014101, + "loss": 0.7316, + "step": 26798 + }, + { + "epoch": 0.688123173508837, + "grad_norm": 0.7265625, + "learning_rate": 0.00013085159579667407, + "loss": 0.8574, + "step": 26799 + }, + { + "epoch": 0.6881488507047588, + "grad_norm": 0.8125, + "learning_rate": 0.00013084734933172264, + "loss": 0.8409, + "step": 26800 + }, + { + "epoch": 0.6881745279006806, + "grad_norm": 0.76953125, + "learning_rate": 0.00013084310280529526, + "loss": 0.8665, + "step": 26801 + }, + { + "epoch": 0.6882002050966024, + "grad_norm": 0.73046875, + "learning_rate": 0.00013083885621740038, + "loss": 0.8155, + "step": 26802 + }, + { + "epoch": 0.6882258822925242, + "grad_norm": 0.84375, + "learning_rate": 0.00013083460956804643, + "loss": 0.799, + "step": 26803 + }, + { + "epoch": 0.6882515594884461, + "grad_norm": 0.80078125, + "learning_rate": 0.00013083036285724186, + "loss": 0.854, + "step": 26804 + }, + { + "epoch": 0.6882772366843679, + "grad_norm": 0.796875, + "learning_rate": 0.00013082611608499517, + "loss": 0.6999, + "step": 26805 + }, + { + "epoch": 0.6883029138802897, + "grad_norm": 0.85546875, + "learning_rate": 0.00013082186925131482, + "loss": 0.8933, + "step": 26806 + }, + { + "epoch": 0.6883285910762115, + "grad_norm": 0.828125, + "learning_rate": 0.0001308176223562093, + "loss": 0.7693, + "step": 26807 + }, + { + "epoch": 0.6883542682721333, + "grad_norm": 0.765625, + "learning_rate": 0.00013081337539968703, + "loss": 0.7167, + "step": 26808 + }, + { + "epoch": 0.6883799454680551, + "grad_norm": 0.80859375, + "learning_rate": 0.00013080912838175647, + "loss": 0.907, + "step": 26809 + }, + { + "epoch": 0.688405622663977, + "grad_norm": 0.83984375, + "learning_rate": 0.0001308048813024261, + "loss": 0.8408, + "step": 26810 + }, + { + "epoch": 0.6884312998598988, + "grad_norm": 0.74609375, + "learning_rate": 0.00013080063416170439, + "loss": 0.7567, + "step": 26811 + }, + { + "epoch": 0.6884569770558207, + "grad_norm": 0.7265625, + "learning_rate": 0.0001307963869595998, + "loss": 0.8393, + "step": 26812 + }, + { + "epoch": 0.6884826542517425, + "grad_norm": 0.796875, + "learning_rate": 0.0001307921396961208, + "loss": 0.7639, + "step": 26813 + }, + { + "epoch": 0.6885083314476642, + "grad_norm": 0.78515625, + "learning_rate": 0.00013078789237127582, + "loss": 0.7936, + "step": 26814 + }, + { + "epoch": 0.688534008643586, + "grad_norm": 0.828125, + "learning_rate": 0.00013078364498507336, + "loss": 0.919, + "step": 26815 + }, + { + "epoch": 0.6885596858395079, + "grad_norm": 0.73046875, + "learning_rate": 0.00013077939753752192, + "loss": 0.7389, + "step": 26816 + }, + { + "epoch": 0.6885853630354297, + "grad_norm": 0.8359375, + "learning_rate": 0.00013077515002862987, + "loss": 0.9264, + "step": 26817 + }, + { + "epoch": 0.6886110402313516, + "grad_norm": 0.75390625, + "learning_rate": 0.00013077090245840572, + "loss": 0.7664, + "step": 26818 + }, + { + "epoch": 0.6886367174272734, + "grad_norm": 0.83203125, + "learning_rate": 0.00013076665482685798, + "loss": 0.8087, + "step": 26819 + }, + { + "epoch": 0.6886623946231952, + "grad_norm": 0.75, + "learning_rate": 0.00013076240713399506, + "loss": 0.8247, + "step": 26820 + }, + { + "epoch": 0.688688071819117, + "grad_norm": 0.8203125, + "learning_rate": 0.00013075815937982544, + "loss": 0.9767, + "step": 26821 + }, + { + "epoch": 0.6887137490150388, + "grad_norm": 0.80859375, + "learning_rate": 0.00013075391156435758, + "loss": 0.8503, + "step": 26822 + }, + { + "epoch": 0.6887394262109606, + "grad_norm": 0.7578125, + "learning_rate": 0.00013074966368759996, + "loss": 0.69, + "step": 26823 + }, + { + "epoch": 0.6887651034068825, + "grad_norm": 0.84375, + "learning_rate": 0.00013074541574956106, + "loss": 0.7128, + "step": 26824 + }, + { + "epoch": 0.6887907806028043, + "grad_norm": 0.76171875, + "learning_rate": 0.0001307411677502493, + "loss": 0.8301, + "step": 26825 + }, + { + "epoch": 0.6888164577987261, + "grad_norm": 0.7578125, + "learning_rate": 0.00013073691968967314, + "loss": 0.833, + "step": 26826 + }, + { + "epoch": 0.6888421349946479, + "grad_norm": 0.78515625, + "learning_rate": 0.0001307326715678411, + "loss": 0.7984, + "step": 26827 + }, + { + "epoch": 0.6888678121905697, + "grad_norm": 0.77734375, + "learning_rate": 0.00013072842338476164, + "loss": 0.9096, + "step": 26828 + }, + { + "epoch": 0.6888934893864915, + "grad_norm": 0.79296875, + "learning_rate": 0.0001307241751404432, + "loss": 0.841, + "step": 26829 + }, + { + "epoch": 0.6889191665824134, + "grad_norm": 0.76171875, + "learning_rate": 0.00013071992683489427, + "loss": 0.9461, + "step": 26830 + }, + { + "epoch": 0.6889448437783352, + "grad_norm": 0.74609375, + "learning_rate": 0.00013071567846812326, + "loss": 0.7744, + "step": 26831 + }, + { + "epoch": 0.688970520974257, + "grad_norm": 0.71484375, + "learning_rate": 0.00013071143004013872, + "loss": 0.8555, + "step": 26832 + }, + { + "epoch": 0.6889961981701789, + "grad_norm": 1.3125, + "learning_rate": 0.00013070718155094903, + "loss": 1.0153, + "step": 26833 + }, + { + "epoch": 0.6890218753661006, + "grad_norm": 0.76953125, + "learning_rate": 0.00013070293300056272, + "loss": 0.8674, + "step": 26834 + }, + { + "epoch": 0.6890475525620224, + "grad_norm": 0.71484375, + "learning_rate": 0.00013069868438898827, + "loss": 0.8737, + "step": 26835 + }, + { + "epoch": 0.6890732297579443, + "grad_norm": 0.703125, + "learning_rate": 0.00013069443571623405, + "loss": 0.8302, + "step": 26836 + }, + { + "epoch": 0.6890989069538661, + "grad_norm": 0.82421875, + "learning_rate": 0.00013069018698230863, + "loss": 0.8477, + "step": 26837 + }, + { + "epoch": 0.6891245841497879, + "grad_norm": 0.80859375, + "learning_rate": 0.00013068593818722047, + "loss": 0.8388, + "step": 26838 + }, + { + "epoch": 0.6891502613457098, + "grad_norm": 0.734375, + "learning_rate": 0.00013068168933097796, + "loss": 0.7828, + "step": 26839 + }, + { + "epoch": 0.6891759385416316, + "grad_norm": 0.75390625, + "learning_rate": 0.00013067744041358963, + "loss": 0.7653, + "step": 26840 + }, + { + "epoch": 0.6892016157375533, + "grad_norm": 0.8671875, + "learning_rate": 0.00013067319143506391, + "loss": 0.8623, + "step": 26841 + }, + { + "epoch": 0.6892272929334752, + "grad_norm": 0.84375, + "learning_rate": 0.0001306689423954093, + "loss": 0.7802, + "step": 26842 + }, + { + "epoch": 0.689252970129397, + "grad_norm": 0.78125, + "learning_rate": 0.0001306646932946343, + "loss": 0.7063, + "step": 26843 + }, + { + "epoch": 0.6892786473253188, + "grad_norm": 0.79296875, + "learning_rate": 0.0001306604441327473, + "loss": 0.8229, + "step": 26844 + }, + { + "epoch": 0.6893043245212407, + "grad_norm": 0.7890625, + "learning_rate": 0.0001306561949097568, + "loss": 0.8729, + "step": 26845 + }, + { + "epoch": 0.6893300017171625, + "grad_norm": 0.796875, + "learning_rate": 0.0001306519456256713, + "loss": 0.9071, + "step": 26846 + }, + { + "epoch": 0.6893556789130842, + "grad_norm": 0.72265625, + "learning_rate": 0.0001306476962804992, + "loss": 0.8865, + "step": 26847 + }, + { + "epoch": 0.6893813561090061, + "grad_norm": 0.7890625, + "learning_rate": 0.00013064344687424906, + "loss": 0.7769, + "step": 26848 + }, + { + "epoch": 0.6894070333049279, + "grad_norm": 0.77734375, + "learning_rate": 0.00013063919740692928, + "loss": 0.8785, + "step": 26849 + }, + { + "epoch": 0.6894327105008498, + "grad_norm": 0.92578125, + "learning_rate": 0.00013063494787854833, + "loss": 0.937, + "step": 26850 + }, + { + "epoch": 0.6894583876967716, + "grad_norm": 0.8203125, + "learning_rate": 0.0001306306982891147, + "loss": 0.9039, + "step": 26851 + }, + { + "epoch": 0.6894840648926934, + "grad_norm": 0.76171875, + "learning_rate": 0.00013062644863863685, + "loss": 0.7444, + "step": 26852 + }, + { + "epoch": 0.6895097420886153, + "grad_norm": 0.7109375, + "learning_rate": 0.00013062219892712325, + "loss": 0.7462, + "step": 26853 + }, + { + "epoch": 0.689535419284537, + "grad_norm": 0.7578125, + "learning_rate": 0.00013061794915458243, + "loss": 0.8565, + "step": 26854 + }, + { + "epoch": 0.6895610964804588, + "grad_norm": 1.0078125, + "learning_rate": 0.00013061369932102274, + "loss": 0.8604, + "step": 26855 + }, + { + "epoch": 0.6895867736763807, + "grad_norm": 0.8359375, + "learning_rate": 0.00013060944942645274, + "loss": 0.7634, + "step": 26856 + }, + { + "epoch": 0.6896124508723025, + "grad_norm": 0.8671875, + "learning_rate": 0.00013060519947088087, + "loss": 0.8992, + "step": 26857 + }, + { + "epoch": 0.6896381280682243, + "grad_norm": 0.82421875, + "learning_rate": 0.00013060094945431557, + "loss": 0.8791, + "step": 26858 + }, + { + "epoch": 0.6896638052641462, + "grad_norm": 0.86328125, + "learning_rate": 0.0001305966993767654, + "loss": 0.8493, + "step": 26859 + }, + { + "epoch": 0.6896894824600679, + "grad_norm": 0.76953125, + "learning_rate": 0.00013059244923823874, + "loss": 0.7878, + "step": 26860 + }, + { + "epoch": 0.6897151596559897, + "grad_norm": 0.74609375, + "learning_rate": 0.0001305881990387441, + "loss": 0.8174, + "step": 26861 + }, + { + "epoch": 0.6897408368519116, + "grad_norm": 0.734375, + "learning_rate": 0.00013058394877828993, + "loss": 0.9012, + "step": 26862 + }, + { + "epoch": 0.6897665140478334, + "grad_norm": 0.86328125, + "learning_rate": 0.00013057969845688472, + "loss": 0.9825, + "step": 26863 + }, + { + "epoch": 0.6897921912437552, + "grad_norm": 0.78125, + "learning_rate": 0.0001305754480745369, + "loss": 0.818, + "step": 26864 + }, + { + "epoch": 0.6898178684396771, + "grad_norm": 0.89453125, + "learning_rate": 0.00013057119763125504, + "loss": 0.8783, + "step": 26865 + }, + { + "epoch": 0.6898435456355989, + "grad_norm": 0.76953125, + "learning_rate": 0.0001305669471270475, + "loss": 0.7901, + "step": 26866 + }, + { + "epoch": 0.6898692228315206, + "grad_norm": 0.9140625, + "learning_rate": 0.00013056269656192283, + "loss": 0.8687, + "step": 26867 + }, + { + "epoch": 0.6898949000274425, + "grad_norm": 0.80859375, + "learning_rate": 0.00013055844593588942, + "loss": 0.9122, + "step": 26868 + }, + { + "epoch": 0.6899205772233643, + "grad_norm": 0.76171875, + "learning_rate": 0.00013055419524895582, + "loss": 0.731, + "step": 26869 + }, + { + "epoch": 0.6899462544192861, + "grad_norm": 0.76171875, + "learning_rate": 0.00013054994450113048, + "loss": 0.8333, + "step": 26870 + }, + { + "epoch": 0.689971931615208, + "grad_norm": 0.78515625, + "learning_rate": 0.00013054569369242181, + "loss": 0.9154, + "step": 26871 + }, + { + "epoch": 0.6899976088111298, + "grad_norm": 0.76953125, + "learning_rate": 0.00013054144282283837, + "loss": 0.8735, + "step": 26872 + }, + { + "epoch": 0.6900232860070517, + "grad_norm": 0.76953125, + "learning_rate": 0.0001305371918923886, + "loss": 0.847, + "step": 26873 + }, + { + "epoch": 0.6900489632029734, + "grad_norm": 0.765625, + "learning_rate": 0.00013053294090108094, + "loss": 0.8093, + "step": 26874 + }, + { + "epoch": 0.6900746403988952, + "grad_norm": 0.7578125, + "learning_rate": 0.0001305286898489239, + "loss": 0.8805, + "step": 26875 + }, + { + "epoch": 0.690100317594817, + "grad_norm": 0.79296875, + "learning_rate": 0.00013052443873592593, + "loss": 0.7654, + "step": 26876 + }, + { + "epoch": 0.6901259947907389, + "grad_norm": 0.703125, + "learning_rate": 0.0001305201875620955, + "loss": 0.7588, + "step": 26877 + }, + { + "epoch": 0.6901516719866607, + "grad_norm": 0.78515625, + "learning_rate": 0.0001305159363274411, + "loss": 0.8215, + "step": 26878 + }, + { + "epoch": 0.6901773491825826, + "grad_norm": 0.76171875, + "learning_rate": 0.0001305116850319712, + "loss": 0.8391, + "step": 26879 + }, + { + "epoch": 0.6902030263785043, + "grad_norm": 0.90234375, + "learning_rate": 0.00013050743367569432, + "loss": 0.8484, + "step": 26880 + }, + { + "epoch": 0.6902287035744261, + "grad_norm": 0.78125, + "learning_rate": 0.0001305031822586188, + "loss": 0.9136, + "step": 26881 + }, + { + "epoch": 0.690254380770348, + "grad_norm": 0.70703125, + "learning_rate": 0.00013049893078075325, + "loss": 0.8485, + "step": 26882 + }, + { + "epoch": 0.6902800579662698, + "grad_norm": 0.703125, + "learning_rate": 0.00013049467924210604, + "loss": 0.7208, + "step": 26883 + }, + { + "epoch": 0.6903057351621916, + "grad_norm": 0.88671875, + "learning_rate": 0.0001304904276426857, + "loss": 0.8664, + "step": 26884 + }, + { + "epoch": 0.6903314123581135, + "grad_norm": 0.8359375, + "learning_rate": 0.00013048617598250073, + "loss": 0.8053, + "step": 26885 + }, + { + "epoch": 0.6903570895540353, + "grad_norm": 0.83984375, + "learning_rate": 0.00013048192426155954, + "loss": 0.8918, + "step": 26886 + }, + { + "epoch": 0.690382766749957, + "grad_norm": 0.765625, + "learning_rate": 0.00013047767247987064, + "loss": 0.9044, + "step": 26887 + }, + { + "epoch": 0.6904084439458789, + "grad_norm": 0.76171875, + "learning_rate": 0.00013047342063744245, + "loss": 0.8008, + "step": 26888 + }, + { + "epoch": 0.6904341211418007, + "grad_norm": 0.7578125, + "learning_rate": 0.00013046916873428352, + "loss": 0.723, + "step": 26889 + }, + { + "epoch": 0.6904597983377225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001304649167704023, + "loss": 0.8452, + "step": 26890 + }, + { + "epoch": 0.6904854755336444, + "grad_norm": 0.7578125, + "learning_rate": 0.00013046066474580722, + "loss": 0.7076, + "step": 26891 + }, + { + "epoch": 0.6905111527295662, + "grad_norm": 0.796875, + "learning_rate": 0.0001304564126605068, + "loss": 0.6931, + "step": 26892 + }, + { + "epoch": 0.690536829925488, + "grad_norm": 0.8203125, + "learning_rate": 0.0001304521605145095, + "loss": 1.0514, + "step": 26893 + }, + { + "epoch": 0.6905625071214098, + "grad_norm": 0.87890625, + "learning_rate": 0.00013044790830782382, + "loss": 0.9045, + "step": 26894 + }, + { + "epoch": 0.6905881843173316, + "grad_norm": 0.75, + "learning_rate": 0.00013044365604045818, + "loss": 0.9138, + "step": 26895 + }, + { + "epoch": 0.6906138615132534, + "grad_norm": 0.73046875, + "learning_rate": 0.00013043940371242107, + "loss": 0.8212, + "step": 26896 + }, + { + "epoch": 0.6906395387091753, + "grad_norm": 0.7890625, + "learning_rate": 0.000130435151323721, + "loss": 0.891, + "step": 26897 + }, + { + "epoch": 0.6906652159050971, + "grad_norm": 0.73828125, + "learning_rate": 0.00013043089887436642, + "loss": 0.7875, + "step": 26898 + }, + { + "epoch": 0.6906908931010189, + "grad_norm": 0.75390625, + "learning_rate": 0.0001304266463643658, + "loss": 0.7602, + "step": 26899 + }, + { + "epoch": 0.6907165702969407, + "grad_norm": 0.84375, + "learning_rate": 0.00013042239379372765, + "loss": 0.9375, + "step": 26900 + }, + { + "epoch": 0.6907422474928625, + "grad_norm": 0.7421875, + "learning_rate": 0.0001304181411624604, + "loss": 0.8082, + "step": 26901 + }, + { + "epoch": 0.6907679246887843, + "grad_norm": 0.796875, + "learning_rate": 0.00013041388847057254, + "loss": 0.9276, + "step": 26902 + }, + { + "epoch": 0.6907936018847062, + "grad_norm": 0.7578125, + "learning_rate": 0.00013040963571807253, + "loss": 0.9034, + "step": 26903 + }, + { + "epoch": 0.690819279080628, + "grad_norm": 0.7421875, + "learning_rate": 0.00013040538290496888, + "loss": 0.9547, + "step": 26904 + }, + { + "epoch": 0.6908449562765498, + "grad_norm": 1.0390625, + "learning_rate": 0.00013040113003127005, + "loss": 0.9169, + "step": 26905 + }, + { + "epoch": 0.6908706334724717, + "grad_norm": 0.83984375, + "learning_rate": 0.00013039687709698452, + "loss": 0.9069, + "step": 26906 + }, + { + "epoch": 0.6908963106683934, + "grad_norm": 0.81640625, + "learning_rate": 0.00013039262410212077, + "loss": 0.9278, + "step": 26907 + }, + { + "epoch": 0.6909219878643152, + "grad_norm": 0.88671875, + "learning_rate": 0.00013038837104668722, + "loss": 0.7863, + "step": 26908 + }, + { + "epoch": 0.6909476650602371, + "grad_norm": 0.8359375, + "learning_rate": 0.00013038411793069244, + "loss": 0.747, + "step": 26909 + }, + { + "epoch": 0.6909733422561589, + "grad_norm": 0.76953125, + "learning_rate": 0.00013037986475414483, + "loss": 0.8906, + "step": 26910 + }, + { + "epoch": 0.6909990194520808, + "grad_norm": 0.78125, + "learning_rate": 0.00013037561151705288, + "loss": 0.9618, + "step": 26911 + }, + { + "epoch": 0.6910246966480026, + "grad_norm": 0.67578125, + "learning_rate": 0.0001303713582194251, + "loss": 0.8012, + "step": 26912 + }, + { + "epoch": 0.6910503738439244, + "grad_norm": 0.8359375, + "learning_rate": 0.00013036710486126994, + "loss": 0.8235, + "step": 26913 + }, + { + "epoch": 0.6910760510398462, + "grad_norm": 0.7421875, + "learning_rate": 0.0001303628514425959, + "loss": 0.8683, + "step": 26914 + }, + { + "epoch": 0.691101728235768, + "grad_norm": 0.7890625, + "learning_rate": 0.00013035859796341147, + "loss": 0.9761, + "step": 26915 + }, + { + "epoch": 0.6911274054316898, + "grad_norm": 0.83203125, + "learning_rate": 0.000130354344423725, + "loss": 0.9681, + "step": 26916 + }, + { + "epoch": 0.6911530826276117, + "grad_norm": 0.7578125, + "learning_rate": 0.00013035009082354513, + "loss": 0.7844, + "step": 26917 + }, + { + "epoch": 0.6911787598235335, + "grad_norm": 0.74609375, + "learning_rate": 0.00013034583716288028, + "loss": 0.8832, + "step": 26918 + }, + { + "epoch": 0.6912044370194553, + "grad_norm": 0.828125, + "learning_rate": 0.00013034158344173887, + "loss": 0.746, + "step": 26919 + }, + { + "epoch": 0.6912301142153771, + "grad_norm": 0.6953125, + "learning_rate": 0.00013033732966012947, + "loss": 0.9254, + "step": 26920 + }, + { + "epoch": 0.6912557914112989, + "grad_norm": 0.796875, + "learning_rate": 0.0001303330758180605, + "loss": 0.7922, + "step": 26921 + }, + { + "epoch": 0.6912814686072207, + "grad_norm": 0.8046875, + "learning_rate": 0.00013032882191554043, + "loss": 0.8753, + "step": 26922 + }, + { + "epoch": 0.6913071458031426, + "grad_norm": 0.859375, + "learning_rate": 0.00013032456795257778, + "loss": 0.7433, + "step": 26923 + }, + { + "epoch": 0.6913328229990644, + "grad_norm": 0.76953125, + "learning_rate": 0.000130320313929181, + "loss": 0.6986, + "step": 26924 + }, + { + "epoch": 0.6913585001949862, + "grad_norm": 0.73828125, + "learning_rate": 0.00013031605984535857, + "loss": 0.8164, + "step": 26925 + }, + { + "epoch": 0.6913841773909081, + "grad_norm": 0.76953125, + "learning_rate": 0.00013031180570111897, + "loss": 0.8312, + "step": 26926 + }, + { + "epoch": 0.6914098545868298, + "grad_norm": 0.8203125, + "learning_rate": 0.0001303075514964707, + "loss": 0.7112, + "step": 26927 + }, + { + "epoch": 0.6914355317827516, + "grad_norm": 0.79296875, + "learning_rate": 0.00013030329723142222, + "loss": 0.8098, + "step": 26928 + }, + { + "epoch": 0.6914612089786735, + "grad_norm": 0.796875, + "learning_rate": 0.00013029904290598194, + "loss": 0.9593, + "step": 26929 + }, + { + "epoch": 0.6914868861745953, + "grad_norm": 0.7578125, + "learning_rate": 0.00013029478852015847, + "loss": 0.8465, + "step": 26930 + }, + { + "epoch": 0.6915125633705171, + "grad_norm": 0.7421875, + "learning_rate": 0.0001302905340739602, + "loss": 0.8389, + "step": 26931 + }, + { + "epoch": 0.691538240566439, + "grad_norm": 0.79296875, + "learning_rate": 0.00013028627956739562, + "loss": 0.8486, + "step": 26932 + }, + { + "epoch": 0.6915639177623608, + "grad_norm": 0.83984375, + "learning_rate": 0.00013028202500047327, + "loss": 0.8586, + "step": 26933 + }, + { + "epoch": 0.6915895949582825, + "grad_norm": 0.71484375, + "learning_rate": 0.00013027777037320152, + "loss": 0.7866, + "step": 26934 + }, + { + "epoch": 0.6916152721542044, + "grad_norm": 0.7109375, + "learning_rate": 0.0001302735156855889, + "loss": 0.8147, + "step": 26935 + }, + { + "epoch": 0.6916409493501262, + "grad_norm": 0.75390625, + "learning_rate": 0.00013026926093764397, + "loss": 0.7997, + "step": 26936 + }, + { + "epoch": 0.691666626546048, + "grad_norm": 0.77734375, + "learning_rate": 0.0001302650061293751, + "loss": 0.8511, + "step": 26937 + }, + { + "epoch": 0.6916923037419699, + "grad_norm": 0.78125, + "learning_rate": 0.0001302607512607908, + "loss": 0.8288, + "step": 26938 + }, + { + "epoch": 0.6917179809378917, + "grad_norm": 0.7421875, + "learning_rate": 0.00013025649633189955, + "loss": 0.7588, + "step": 26939 + }, + { + "epoch": 0.6917436581338134, + "grad_norm": 0.76953125, + "learning_rate": 0.00013025224134270987, + "loss": 0.8416, + "step": 26940 + }, + { + "epoch": 0.6917693353297353, + "grad_norm": 0.83203125, + "learning_rate": 0.00013024798629323017, + "loss": 0.9662, + "step": 26941 + }, + { + "epoch": 0.6917950125256571, + "grad_norm": 0.77734375, + "learning_rate": 0.00013024373118346896, + "loss": 0.8684, + "step": 26942 + }, + { + "epoch": 0.691820689721579, + "grad_norm": 0.99609375, + "learning_rate": 0.00013023947601343474, + "loss": 0.858, + "step": 26943 + }, + { + "epoch": 0.6918463669175008, + "grad_norm": 0.828125, + "learning_rate": 0.000130235220783136, + "loss": 1.0202, + "step": 26944 + }, + { + "epoch": 0.6918720441134226, + "grad_norm": 0.75390625, + "learning_rate": 0.00013023096549258115, + "loss": 0.7869, + "step": 26945 + }, + { + "epoch": 0.6918977213093445, + "grad_norm": 0.70703125, + "learning_rate": 0.00013022671014177877, + "loss": 0.6975, + "step": 26946 + }, + { + "epoch": 0.6919233985052662, + "grad_norm": 0.7578125, + "learning_rate": 0.00013022245473073726, + "loss": 0.8294, + "step": 26947 + }, + { + "epoch": 0.691949075701188, + "grad_norm": 0.76171875, + "learning_rate": 0.0001302181992594651, + "loss": 0.7974, + "step": 26948 + }, + { + "epoch": 0.6919747528971099, + "grad_norm": 0.8515625, + "learning_rate": 0.00013021394372797083, + "loss": 0.922, + "step": 26949 + }, + { + "epoch": 0.6920004300930317, + "grad_norm": 0.77734375, + "learning_rate": 0.0001302096881362629, + "loss": 0.8165, + "step": 26950 + }, + { + "epoch": 0.6920261072889535, + "grad_norm": 0.81640625, + "learning_rate": 0.00013020543248434978, + "loss": 0.87, + "step": 26951 + }, + { + "epoch": 0.6920517844848754, + "grad_norm": 0.8984375, + "learning_rate": 0.00013020117677223998, + "loss": 0.8382, + "step": 26952 + }, + { + "epoch": 0.6920774616807972, + "grad_norm": 0.81640625, + "learning_rate": 0.00013019692099994193, + "loss": 0.8478, + "step": 26953 + }, + { + "epoch": 0.6921031388767189, + "grad_norm": 0.74609375, + "learning_rate": 0.0001301926651674642, + "loss": 0.812, + "step": 26954 + }, + { + "epoch": 0.6921288160726408, + "grad_norm": 0.7421875, + "learning_rate": 0.00013018840927481518, + "loss": 0.7743, + "step": 26955 + }, + { + "epoch": 0.6921544932685626, + "grad_norm": 0.796875, + "learning_rate": 0.00013018415332200335, + "loss": 0.9425, + "step": 26956 + }, + { + "epoch": 0.6921801704644844, + "grad_norm": 0.8125, + "learning_rate": 0.00013017989730903726, + "loss": 0.8236, + "step": 26957 + }, + { + "epoch": 0.6922058476604063, + "grad_norm": 0.73046875, + "learning_rate": 0.0001301756412359254, + "loss": 0.775, + "step": 26958 + }, + { + "epoch": 0.6922315248563281, + "grad_norm": 0.8125, + "learning_rate": 0.00013017138510267615, + "loss": 0.8714, + "step": 26959 + }, + { + "epoch": 0.6922572020522498, + "grad_norm": 0.76953125, + "learning_rate": 0.0001301671289092981, + "loss": 0.9375, + "step": 26960 + }, + { + "epoch": 0.6922828792481717, + "grad_norm": 0.78515625, + "learning_rate": 0.00013016287265579964, + "loss": 0.7737, + "step": 26961 + }, + { + "epoch": 0.6923085564440935, + "grad_norm": 0.79296875, + "learning_rate": 0.00013015861634218935, + "loss": 0.777, + "step": 26962 + }, + { + "epoch": 0.6923342336400153, + "grad_norm": 0.8359375, + "learning_rate": 0.00013015435996847566, + "loss": 0.8358, + "step": 26963 + }, + { + "epoch": 0.6923599108359372, + "grad_norm": 0.796875, + "learning_rate": 0.00013015010353466702, + "loss": 0.8756, + "step": 26964 + }, + { + "epoch": 0.692385588031859, + "grad_norm": 0.8046875, + "learning_rate": 0.00013014584704077196, + "loss": 1.0232, + "step": 26965 + }, + { + "epoch": 0.6924112652277808, + "grad_norm": 0.78125, + "learning_rate": 0.00013014159048679896, + "loss": 0.8421, + "step": 26966 + }, + { + "epoch": 0.6924369424237026, + "grad_norm": 0.72265625, + "learning_rate": 0.00013013733387275649, + "loss": 0.7253, + "step": 26967 + }, + { + "epoch": 0.6924626196196244, + "grad_norm": 0.86328125, + "learning_rate": 0.000130133077198653, + "loss": 0.9746, + "step": 26968 + }, + { + "epoch": 0.6924882968155462, + "grad_norm": 0.7109375, + "learning_rate": 0.00013012882046449704, + "loss": 0.826, + "step": 26969 + }, + { + "epoch": 0.6925139740114681, + "grad_norm": 0.828125, + "learning_rate": 0.00013012456367029709, + "loss": 0.7927, + "step": 26970 + }, + { + "epoch": 0.6925396512073899, + "grad_norm": 0.73046875, + "learning_rate": 0.00013012030681606158, + "loss": 0.6965, + "step": 26971 + }, + { + "epoch": 0.6925653284033118, + "grad_norm": 0.83984375, + "learning_rate": 0.00013011604990179903, + "loss": 0.9028, + "step": 26972 + }, + { + "epoch": 0.6925910055992336, + "grad_norm": 0.83984375, + "learning_rate": 0.0001301117929275179, + "loss": 0.7899, + "step": 26973 + }, + { + "epoch": 0.6926166827951553, + "grad_norm": 0.7421875, + "learning_rate": 0.00013010753589322668, + "loss": 0.787, + "step": 26974 + }, + { + "epoch": 0.6926423599910772, + "grad_norm": 0.78125, + "learning_rate": 0.00013010327879893385, + "loss": 0.8875, + "step": 26975 + }, + { + "epoch": 0.692668037186999, + "grad_norm": 0.79296875, + "learning_rate": 0.00013009902164464794, + "loss": 0.8262, + "step": 26976 + }, + { + "epoch": 0.6926937143829208, + "grad_norm": 0.6796875, + "learning_rate": 0.00013009476443037735, + "loss": 0.8111, + "step": 26977 + }, + { + "epoch": 0.6927193915788427, + "grad_norm": 0.9375, + "learning_rate": 0.00013009050715613065, + "loss": 0.8512, + "step": 26978 + }, + { + "epoch": 0.6927450687747645, + "grad_norm": 0.76953125, + "learning_rate": 0.00013008624982191632, + "loss": 0.7515, + "step": 26979 + }, + { + "epoch": 0.6927707459706862, + "grad_norm": 0.7890625, + "learning_rate": 0.00013008199242774276, + "loss": 0.8526, + "step": 26980 + }, + { + "epoch": 0.692796423166608, + "grad_norm": 0.79296875, + "learning_rate": 0.00013007773497361851, + "loss": 0.7401, + "step": 26981 + }, + { + "epoch": 0.6928221003625299, + "grad_norm": 0.73828125, + "learning_rate": 0.00013007347745955207, + "loss": 0.8983, + "step": 26982 + }, + { + "epoch": 0.6928477775584517, + "grad_norm": 0.83203125, + "learning_rate": 0.00013006921988555187, + "loss": 0.7823, + "step": 26983 + }, + { + "epoch": 0.6928734547543736, + "grad_norm": 0.75390625, + "learning_rate": 0.00013006496225162646, + "loss": 0.8548, + "step": 26984 + }, + { + "epoch": 0.6928991319502954, + "grad_norm": 0.85546875, + "learning_rate": 0.0001300607045577843, + "loss": 0.7555, + "step": 26985 + }, + { + "epoch": 0.6929248091462172, + "grad_norm": 0.79296875, + "learning_rate": 0.00013005644680403388, + "loss": 0.9061, + "step": 26986 + }, + { + "epoch": 0.692950486342139, + "grad_norm": 0.81640625, + "learning_rate": 0.00013005218899038367, + "loss": 0.7394, + "step": 26987 + }, + { + "epoch": 0.6929761635380608, + "grad_norm": 0.72265625, + "learning_rate": 0.00013004793111684213, + "loss": 0.8244, + "step": 26988 + }, + { + "epoch": 0.6930018407339826, + "grad_norm": 0.859375, + "learning_rate": 0.00013004367318341782, + "loss": 0.8804, + "step": 26989 + }, + { + "epoch": 0.6930275179299045, + "grad_norm": 0.828125, + "learning_rate": 0.00013003941519011916, + "loss": 0.8565, + "step": 26990 + }, + { + "epoch": 0.6930531951258263, + "grad_norm": 0.80078125, + "learning_rate": 0.00013003515713695465, + "loss": 0.7662, + "step": 26991 + }, + { + "epoch": 0.6930788723217481, + "grad_norm": 0.87890625, + "learning_rate": 0.00013003089902393283, + "loss": 0.8637, + "step": 26992 + }, + { + "epoch": 0.69310454951767, + "grad_norm": 0.7890625, + "learning_rate": 0.0001300266408510621, + "loss": 0.8273, + "step": 26993 + }, + { + "epoch": 0.6931302267135917, + "grad_norm": 0.78515625, + "learning_rate": 0.00013002238261835097, + "loss": 0.7686, + "step": 26994 + }, + { + "epoch": 0.6931559039095135, + "grad_norm": 0.68359375, + "learning_rate": 0.00013001812432580798, + "loss": 0.6451, + "step": 26995 + }, + { + "epoch": 0.6931815811054354, + "grad_norm": 0.72265625, + "learning_rate": 0.00013001386597344156, + "loss": 0.853, + "step": 26996 + }, + { + "epoch": 0.6932072583013572, + "grad_norm": 0.76953125, + "learning_rate": 0.00013000960756126025, + "loss": 0.8902, + "step": 26997 + }, + { + "epoch": 0.693232935497279, + "grad_norm": 0.93359375, + "learning_rate": 0.00013000534908927248, + "loss": 0.82, + "step": 26998 + }, + { + "epoch": 0.6932586126932009, + "grad_norm": 0.85546875, + "learning_rate": 0.00013000109055748675, + "loss": 1.0165, + "step": 26999 + }, + { + "epoch": 0.6932842898891226, + "grad_norm": 0.84765625, + "learning_rate": 0.00012999683196591156, + "loss": 0.9192, + "step": 27000 + }, + { + "epoch": 0.6932842898891226, + "eval_loss": 0.8348429799079895, + "eval_runtime": 388.4811, + "eval_samples_per_second": 25.741, + "eval_steps_per_second": 0.806, + "step": 27000 + }, + { + "epoch": 0.6933099670850444, + "grad_norm": 0.76953125, + "learning_rate": 0.0001299925733145554, + "loss": 0.8545, + "step": 27001 + }, + { + "epoch": 0.6933356442809663, + "grad_norm": 0.7421875, + "learning_rate": 0.00012998831460342674, + "loss": 0.8844, + "step": 27002 + }, + { + "epoch": 0.6933613214768881, + "grad_norm": 0.796875, + "learning_rate": 0.00012998405583253408, + "loss": 0.8758, + "step": 27003 + }, + { + "epoch": 0.69338699867281, + "grad_norm": 0.87109375, + "learning_rate": 0.00012997979700188589, + "loss": 0.9657, + "step": 27004 + }, + { + "epoch": 0.6934126758687318, + "grad_norm": 0.86328125, + "learning_rate": 0.00012997553811149075, + "loss": 0.9708, + "step": 27005 + }, + { + "epoch": 0.6934383530646536, + "grad_norm": 0.77734375, + "learning_rate": 0.00012997127916135697, + "loss": 0.7353, + "step": 27006 + }, + { + "epoch": 0.6934640302605753, + "grad_norm": 0.8046875, + "learning_rate": 0.0001299670201514932, + "loss": 0.8765, + "step": 27007 + }, + { + "epoch": 0.6934897074564972, + "grad_norm": 0.7734375, + "learning_rate": 0.00012996276108190785, + "loss": 0.7747, + "step": 27008 + }, + { + "epoch": 0.693515384652419, + "grad_norm": 0.77734375, + "learning_rate": 0.00012995850195260942, + "loss": 0.9435, + "step": 27009 + }, + { + "epoch": 0.6935410618483409, + "grad_norm": 0.7421875, + "learning_rate": 0.0001299542427636064, + "loss": 0.9104, + "step": 27010 + }, + { + "epoch": 0.6935667390442627, + "grad_norm": 0.96875, + "learning_rate": 0.0001299499835149073, + "loss": 0.82, + "step": 27011 + }, + { + "epoch": 0.6935924162401845, + "grad_norm": 0.734375, + "learning_rate": 0.00012994572420652056, + "loss": 0.7521, + "step": 27012 + }, + { + "epoch": 0.6936180934361064, + "grad_norm": 0.7890625, + "learning_rate": 0.0001299414648384547, + "loss": 0.8337, + "step": 27013 + }, + { + "epoch": 0.6936437706320281, + "grad_norm": 0.78515625, + "learning_rate": 0.00012993720541071822, + "loss": 0.8509, + "step": 27014 + }, + { + "epoch": 0.6936694478279499, + "grad_norm": 0.77734375, + "learning_rate": 0.00012993294592331957, + "loss": 0.806, + "step": 27015 + }, + { + "epoch": 0.6936951250238718, + "grad_norm": 0.6875, + "learning_rate": 0.00012992868637626729, + "loss": 0.8181, + "step": 27016 + }, + { + "epoch": 0.6937208022197936, + "grad_norm": 0.71484375, + "learning_rate": 0.0001299244267695698, + "loss": 0.7755, + "step": 27017 + }, + { + "epoch": 0.6937464794157154, + "grad_norm": 0.96484375, + "learning_rate": 0.00012992016710323568, + "loss": 0.8631, + "step": 27018 + }, + { + "epoch": 0.6937721566116373, + "grad_norm": 0.76171875, + "learning_rate": 0.0001299159073772734, + "loss": 0.6687, + "step": 27019 + }, + { + "epoch": 0.693797833807559, + "grad_norm": 0.796875, + "learning_rate": 0.00012991164759169134, + "loss": 0.9351, + "step": 27020 + }, + { + "epoch": 0.6938235110034808, + "grad_norm": 0.82421875, + "learning_rate": 0.00012990738774649812, + "loss": 0.8974, + "step": 27021 + }, + { + "epoch": 0.6938491881994027, + "grad_norm": 0.76171875, + "learning_rate": 0.00012990312784170213, + "loss": 0.7767, + "step": 27022 + }, + { + "epoch": 0.6938748653953245, + "grad_norm": 0.6953125, + "learning_rate": 0.00012989886787731194, + "loss": 0.7514, + "step": 27023 + }, + { + "epoch": 0.6939005425912463, + "grad_norm": 0.71875, + "learning_rate": 0.000129894607853336, + "loss": 0.8103, + "step": 27024 + }, + { + "epoch": 0.6939262197871682, + "grad_norm": 0.89453125, + "learning_rate": 0.00012989034776978282, + "loss": 1.0924, + "step": 27025 + }, + { + "epoch": 0.69395189698309, + "grad_norm": 0.72265625, + "learning_rate": 0.00012988608762666085, + "loss": 0.7654, + "step": 27026 + }, + { + "epoch": 0.6939775741790117, + "grad_norm": 0.77734375, + "learning_rate": 0.00012988182742397863, + "loss": 0.8888, + "step": 27027 + }, + { + "epoch": 0.6940032513749336, + "grad_norm": 0.84375, + "learning_rate": 0.00012987756716174463, + "loss": 0.788, + "step": 27028 + }, + { + "epoch": 0.6940289285708554, + "grad_norm": 0.78125, + "learning_rate": 0.0001298733068399673, + "loss": 0.869, + "step": 27029 + }, + { + "epoch": 0.6940546057667772, + "grad_norm": 0.78125, + "learning_rate": 0.00012986904645865521, + "loss": 0.741, + "step": 27030 + }, + { + "epoch": 0.6940802829626991, + "grad_norm": 0.81640625, + "learning_rate": 0.00012986478601781677, + "loss": 0.7718, + "step": 27031 + }, + { + "epoch": 0.6941059601586209, + "grad_norm": 0.7265625, + "learning_rate": 0.00012986052551746058, + "loss": 0.8663, + "step": 27032 + }, + { + "epoch": 0.6941316373545428, + "grad_norm": 0.6953125, + "learning_rate": 0.00012985626495759498, + "loss": 0.8549, + "step": 27033 + }, + { + "epoch": 0.6941573145504645, + "grad_norm": 0.8203125, + "learning_rate": 0.00012985200433822858, + "loss": 0.8789, + "step": 27034 + }, + { + "epoch": 0.6941829917463863, + "grad_norm": 0.875, + "learning_rate": 0.00012984774365936983, + "loss": 0.8344, + "step": 27035 + }, + { + "epoch": 0.6942086689423081, + "grad_norm": 0.8359375, + "learning_rate": 0.0001298434829210272, + "loss": 0.8636, + "step": 27036 + }, + { + "epoch": 0.69423434613823, + "grad_norm": 0.81640625, + "learning_rate": 0.00012983922212320923, + "loss": 0.9247, + "step": 27037 + }, + { + "epoch": 0.6942600233341518, + "grad_norm": 0.82421875, + "learning_rate": 0.0001298349612659244, + "loss": 0.9421, + "step": 27038 + }, + { + "epoch": 0.6942857005300737, + "grad_norm": 0.78125, + "learning_rate": 0.00012983070034918115, + "loss": 0.7907, + "step": 27039 + }, + { + "epoch": 0.6943113777259954, + "grad_norm": 0.78515625, + "learning_rate": 0.000129826439372988, + "loss": 0.9031, + "step": 27040 + }, + { + "epoch": 0.6943370549219172, + "grad_norm": 0.90625, + "learning_rate": 0.0001298221783373535, + "loss": 0.9106, + "step": 27041 + }, + { + "epoch": 0.694362732117839, + "grad_norm": 0.734375, + "learning_rate": 0.00012981791724228604, + "loss": 0.8701, + "step": 27042 + }, + { + "epoch": 0.6943884093137609, + "grad_norm": 0.7734375, + "learning_rate": 0.0001298136560877942, + "loss": 0.7764, + "step": 27043 + }, + { + "epoch": 0.6944140865096827, + "grad_norm": 0.7734375, + "learning_rate": 0.00012980939487388643, + "loss": 0.8586, + "step": 27044 + }, + { + "epoch": 0.6944397637056046, + "grad_norm": 0.7890625, + "learning_rate": 0.00012980513360057123, + "loss": 0.9678, + "step": 27045 + }, + { + "epoch": 0.6944654409015264, + "grad_norm": 0.8671875, + "learning_rate": 0.00012980087226785708, + "loss": 0.8284, + "step": 27046 + }, + { + "epoch": 0.6944911180974481, + "grad_norm": 0.80859375, + "learning_rate": 0.0001297966108757525, + "loss": 0.8503, + "step": 27047 + }, + { + "epoch": 0.69451679529337, + "grad_norm": 0.8046875, + "learning_rate": 0.00012979234942426594, + "loss": 0.7764, + "step": 27048 + }, + { + "epoch": 0.6945424724892918, + "grad_norm": 0.859375, + "learning_rate": 0.00012978808791340592, + "loss": 0.9258, + "step": 27049 + }, + { + "epoch": 0.6945681496852136, + "grad_norm": 0.75390625, + "learning_rate": 0.00012978382634318096, + "loss": 0.7947, + "step": 27050 + }, + { + "epoch": 0.6945938268811355, + "grad_norm": 0.76953125, + "learning_rate": 0.00012977956471359954, + "loss": 0.8257, + "step": 27051 + }, + { + "epoch": 0.6946195040770573, + "grad_norm": 0.765625, + "learning_rate": 0.0001297753030246701, + "loss": 0.8572, + "step": 27052 + }, + { + "epoch": 0.6946451812729791, + "grad_norm": 0.73828125, + "learning_rate": 0.00012977104127640118, + "loss": 0.7765, + "step": 27053 + }, + { + "epoch": 0.6946708584689009, + "grad_norm": 0.75, + "learning_rate": 0.00012976677946880124, + "loss": 0.7698, + "step": 27054 + }, + { + "epoch": 0.6946965356648227, + "grad_norm": 0.77734375, + "learning_rate": 0.00012976251760187882, + "loss": 1.0086, + "step": 27055 + }, + { + "epoch": 0.6947222128607445, + "grad_norm": 0.734375, + "learning_rate": 0.0001297582556756424, + "loss": 0.7012, + "step": 27056 + }, + { + "epoch": 0.6947478900566664, + "grad_norm": 0.73828125, + "learning_rate": 0.00012975399369010045, + "loss": 0.7293, + "step": 27057 + }, + { + "epoch": 0.6947735672525882, + "grad_norm": 0.75390625, + "learning_rate": 0.0001297497316452615, + "loss": 0.7826, + "step": 27058 + }, + { + "epoch": 0.69479924444851, + "grad_norm": 0.7265625, + "learning_rate": 0.000129745469541134, + "loss": 0.8508, + "step": 27059 + }, + { + "epoch": 0.6948249216444318, + "grad_norm": 0.80078125, + "learning_rate": 0.00012974120737772649, + "loss": 0.7806, + "step": 27060 + }, + { + "epoch": 0.6948505988403536, + "grad_norm": 0.78125, + "learning_rate": 0.0001297369451550474, + "loss": 0.8171, + "step": 27061 + }, + { + "epoch": 0.6948762760362754, + "grad_norm": 0.8046875, + "learning_rate": 0.00012973268287310528, + "loss": 0.7946, + "step": 27062 + }, + { + "epoch": 0.6949019532321973, + "grad_norm": 0.72265625, + "learning_rate": 0.0001297284205319086, + "loss": 0.7521, + "step": 27063 + }, + { + "epoch": 0.6949276304281191, + "grad_norm": 0.77734375, + "learning_rate": 0.0001297241581314659, + "loss": 0.8402, + "step": 27064 + }, + { + "epoch": 0.694953307624041, + "grad_norm": 0.74609375, + "learning_rate": 0.0001297198956717856, + "loss": 0.8223, + "step": 27065 + }, + { + "epoch": 0.6949789848199628, + "grad_norm": 0.78515625, + "learning_rate": 0.0001297156331528763, + "loss": 0.7447, + "step": 27066 + }, + { + "epoch": 0.6950046620158845, + "grad_norm": 0.83203125, + "learning_rate": 0.00012971137057474636, + "loss": 0.8058, + "step": 27067 + }, + { + "epoch": 0.6950303392118063, + "grad_norm": 0.73046875, + "learning_rate": 0.00012970710793740436, + "loss": 0.79, + "step": 27068 + }, + { + "epoch": 0.6950560164077282, + "grad_norm": 0.88671875, + "learning_rate": 0.00012970284524085876, + "loss": 0.9817, + "step": 27069 + }, + { + "epoch": 0.69508169360365, + "grad_norm": 0.8125, + "learning_rate": 0.0001296985824851181, + "loss": 0.9347, + "step": 27070 + }, + { + "epoch": 0.6951073707995719, + "grad_norm": 0.80078125, + "learning_rate": 0.00012969431967019084, + "loss": 0.8792, + "step": 27071 + }, + { + "epoch": 0.6951330479954937, + "grad_norm": 0.8359375, + "learning_rate": 0.00012969005679608548, + "loss": 0.9413, + "step": 27072 + }, + { + "epoch": 0.6951587251914154, + "grad_norm": 0.80078125, + "learning_rate": 0.0001296857938628105, + "loss": 0.7548, + "step": 27073 + }, + { + "epoch": 0.6951844023873373, + "grad_norm": 0.7421875, + "learning_rate": 0.00012968153087037444, + "loss": 0.8963, + "step": 27074 + }, + { + "epoch": 0.6952100795832591, + "grad_norm": 0.80859375, + "learning_rate": 0.00012967726781878578, + "loss": 0.8185, + "step": 27075 + }, + { + "epoch": 0.6952357567791809, + "grad_norm": 0.72265625, + "learning_rate": 0.00012967300470805297, + "loss": 0.7172, + "step": 27076 + }, + { + "epoch": 0.6952614339751028, + "grad_norm": 0.73046875, + "learning_rate": 0.00012966874153818455, + "loss": 0.8589, + "step": 27077 + }, + { + "epoch": 0.6952871111710246, + "grad_norm": 0.8046875, + "learning_rate": 0.000129664478309189, + "loss": 0.9193, + "step": 27078 + }, + { + "epoch": 0.6953127883669464, + "grad_norm": 0.7734375, + "learning_rate": 0.0001296602150210749, + "loss": 0.8078, + "step": 27079 + }, + { + "epoch": 0.6953384655628682, + "grad_norm": 0.76171875, + "learning_rate": 0.0001296559516738506, + "loss": 0.8478, + "step": 27080 + }, + { + "epoch": 0.69536414275879, + "grad_norm": 0.8125, + "learning_rate": 0.00012965168826752467, + "loss": 0.8114, + "step": 27081 + }, + { + "epoch": 0.6953898199547118, + "grad_norm": 0.8203125, + "learning_rate": 0.00012964742480210562, + "loss": 0.7694, + "step": 27082 + }, + { + "epoch": 0.6954154971506337, + "grad_norm": 0.8125, + "learning_rate": 0.00012964316127760193, + "loss": 0.818, + "step": 27083 + }, + { + "epoch": 0.6954411743465555, + "grad_norm": 0.74609375, + "learning_rate": 0.0001296388976940221, + "loss": 0.782, + "step": 27084 + }, + { + "epoch": 0.6954668515424773, + "grad_norm": 0.80859375, + "learning_rate": 0.0001296346340513746, + "loss": 0.9571, + "step": 27085 + }, + { + "epoch": 0.6954925287383992, + "grad_norm": 0.71875, + "learning_rate": 0.00012963037034966797, + "loss": 0.8164, + "step": 27086 + }, + { + "epoch": 0.6955182059343209, + "grad_norm": 0.75390625, + "learning_rate": 0.00012962610658891067, + "loss": 0.8194, + "step": 27087 + }, + { + "epoch": 0.6955438831302427, + "grad_norm": 0.85546875, + "learning_rate": 0.00012962184276911124, + "loss": 0.8997, + "step": 27088 + }, + { + "epoch": 0.6955695603261646, + "grad_norm": 0.71875, + "learning_rate": 0.00012961757889027814, + "loss": 0.7293, + "step": 27089 + }, + { + "epoch": 0.6955952375220864, + "grad_norm": 0.75390625, + "learning_rate": 0.00012961331495241988, + "loss": 0.8559, + "step": 27090 + }, + { + "epoch": 0.6956209147180082, + "grad_norm": 0.8671875, + "learning_rate": 0.000129609050955545, + "loss": 0.8741, + "step": 27091 + }, + { + "epoch": 0.6956465919139301, + "grad_norm": 0.78515625, + "learning_rate": 0.0001296047868996619, + "loss": 0.9075, + "step": 27092 + }, + { + "epoch": 0.6956722691098518, + "grad_norm": 0.7421875, + "learning_rate": 0.00012960052278477915, + "loss": 0.7209, + "step": 27093 + }, + { + "epoch": 0.6956979463057736, + "grad_norm": 0.734375, + "learning_rate": 0.00012959625861090523, + "loss": 0.7788, + "step": 27094 + }, + { + "epoch": 0.6957236235016955, + "grad_norm": 0.79296875, + "learning_rate": 0.00012959199437804865, + "loss": 0.8057, + "step": 27095 + }, + { + "epoch": 0.6957493006976173, + "grad_norm": 0.7578125, + "learning_rate": 0.0001295877300862179, + "loss": 0.7896, + "step": 27096 + }, + { + "epoch": 0.6957749778935391, + "grad_norm": 0.83203125, + "learning_rate": 0.00012958346573542148, + "loss": 0.8345, + "step": 27097 + }, + { + "epoch": 0.695800655089461, + "grad_norm": 0.75390625, + "learning_rate": 0.0001295792013256679, + "loss": 0.8428, + "step": 27098 + }, + { + "epoch": 0.6958263322853828, + "grad_norm": 0.72265625, + "learning_rate": 0.0001295749368569656, + "loss": 0.7341, + "step": 27099 + }, + { + "epoch": 0.6958520094813045, + "grad_norm": 0.765625, + "learning_rate": 0.00012957067232932312, + "loss": 0.8406, + "step": 27100 + }, + { + "epoch": 0.6958776866772264, + "grad_norm": 0.82421875, + "learning_rate": 0.00012956640774274897, + "loss": 0.873, + "step": 27101 + }, + { + "epoch": 0.6959033638731482, + "grad_norm": 0.7109375, + "learning_rate": 0.00012956214309725166, + "loss": 0.7333, + "step": 27102 + }, + { + "epoch": 0.69592904106907, + "grad_norm": 0.828125, + "learning_rate": 0.00012955787839283967, + "loss": 0.7694, + "step": 27103 + }, + { + "epoch": 0.6959547182649919, + "grad_norm": 0.76953125, + "learning_rate": 0.0001295536136295215, + "loss": 0.926, + "step": 27104 + }, + { + "epoch": 0.6959803954609137, + "grad_norm": 1.0390625, + "learning_rate": 0.00012954934880730562, + "loss": 0.803, + "step": 27105 + }, + { + "epoch": 0.6960060726568356, + "grad_norm": 0.703125, + "learning_rate": 0.0001295450839262006, + "loss": 0.769, + "step": 27106 + }, + { + "epoch": 0.6960317498527573, + "grad_norm": 0.734375, + "learning_rate": 0.00012954081898621487, + "loss": 0.8019, + "step": 27107 + }, + { + "epoch": 0.6960574270486791, + "grad_norm": 0.703125, + "learning_rate": 0.00012953655398735694, + "loss": 0.6727, + "step": 27108 + }, + { + "epoch": 0.696083104244601, + "grad_norm": 0.80078125, + "learning_rate": 0.00012953228892963534, + "loss": 0.891, + "step": 27109 + }, + { + "epoch": 0.6961087814405228, + "grad_norm": 0.98828125, + "learning_rate": 0.00012952802381305855, + "loss": 1.0149, + "step": 27110 + }, + { + "epoch": 0.6961344586364446, + "grad_norm": 0.84765625, + "learning_rate": 0.0001295237586376351, + "loss": 0.9459, + "step": 27111 + }, + { + "epoch": 0.6961601358323665, + "grad_norm": 0.76953125, + "learning_rate": 0.00012951949340337345, + "loss": 0.9327, + "step": 27112 + }, + { + "epoch": 0.6961858130282882, + "grad_norm": 0.7734375, + "learning_rate": 0.00012951522811028212, + "loss": 0.7506, + "step": 27113 + }, + { + "epoch": 0.69621149022421, + "grad_norm": 0.83984375, + "learning_rate": 0.0001295109627583696, + "loss": 0.8449, + "step": 27114 + }, + { + "epoch": 0.6962371674201319, + "grad_norm": 0.7734375, + "learning_rate": 0.0001295066973476444, + "loss": 0.7301, + "step": 27115 + }, + { + "epoch": 0.6962628446160537, + "grad_norm": 0.84765625, + "learning_rate": 0.000129502431878115, + "loss": 0.8828, + "step": 27116 + }, + { + "epoch": 0.6962885218119755, + "grad_norm": 0.80078125, + "learning_rate": 0.00012949816634978998, + "loss": 0.8546, + "step": 27117 + }, + { + "epoch": 0.6963141990078974, + "grad_norm": 0.87890625, + "learning_rate": 0.00012949390076267772, + "loss": 0.9524, + "step": 27118 + }, + { + "epoch": 0.6963398762038192, + "grad_norm": 0.9296875, + "learning_rate": 0.00012948963511678677, + "loss": 0.8795, + "step": 27119 + }, + { + "epoch": 0.6963655533997409, + "grad_norm": 0.80078125, + "learning_rate": 0.00012948536941212568, + "loss": 0.7989, + "step": 27120 + }, + { + "epoch": 0.6963912305956628, + "grad_norm": 0.7421875, + "learning_rate": 0.00012948110364870288, + "loss": 0.8227, + "step": 27121 + }, + { + "epoch": 0.6964169077915846, + "grad_norm": 0.80859375, + "learning_rate": 0.00012947683782652694, + "loss": 0.9172, + "step": 27122 + }, + { + "epoch": 0.6964425849875064, + "grad_norm": 0.85546875, + "learning_rate": 0.0001294725719456063, + "loss": 0.8579, + "step": 27123 + }, + { + "epoch": 0.6964682621834283, + "grad_norm": 0.8515625, + "learning_rate": 0.0001294683060059495, + "loss": 0.8172, + "step": 27124 + }, + { + "epoch": 0.6964939393793501, + "grad_norm": 0.79296875, + "learning_rate": 0.00012946404000756503, + "loss": 0.8463, + "step": 27125 + }, + { + "epoch": 0.696519616575272, + "grad_norm": 0.90625, + "learning_rate": 0.00012945977395046137, + "loss": 0.804, + "step": 27126 + }, + { + "epoch": 0.6965452937711937, + "grad_norm": 0.78515625, + "learning_rate": 0.00012945550783464706, + "loss": 0.6487, + "step": 27127 + }, + { + "epoch": 0.6965709709671155, + "grad_norm": 0.76171875, + "learning_rate": 0.00012945124166013056, + "loss": 0.775, + "step": 27128 + }, + { + "epoch": 0.6965966481630373, + "grad_norm": 0.796875, + "learning_rate": 0.00012944697542692038, + "loss": 0.8164, + "step": 27129 + }, + { + "epoch": 0.6966223253589592, + "grad_norm": 0.76171875, + "learning_rate": 0.00012944270913502513, + "loss": 0.8864, + "step": 27130 + }, + { + "epoch": 0.696648002554881, + "grad_norm": 0.7109375, + "learning_rate": 0.00012943844278445315, + "loss": 0.7699, + "step": 27131 + }, + { + "epoch": 0.6966736797508029, + "grad_norm": 0.7734375, + "learning_rate": 0.000129434176375213, + "loss": 0.8068, + "step": 27132 + }, + { + "epoch": 0.6966993569467246, + "grad_norm": 0.73828125, + "learning_rate": 0.00012942990990731323, + "loss": 0.923, + "step": 27133 + }, + { + "epoch": 0.6967250341426464, + "grad_norm": 0.76953125, + "learning_rate": 0.00012942564338076228, + "loss": 0.8791, + "step": 27134 + }, + { + "epoch": 0.6967507113385683, + "grad_norm": 0.71875, + "learning_rate": 0.00012942137679556868, + "loss": 0.8497, + "step": 27135 + }, + { + "epoch": 0.6967763885344901, + "grad_norm": 0.8125, + "learning_rate": 0.00012941711015174097, + "loss": 0.802, + "step": 27136 + }, + { + "epoch": 0.6968020657304119, + "grad_norm": 0.84375, + "learning_rate": 0.0001294128434492876, + "loss": 0.839, + "step": 27137 + }, + { + "epoch": 0.6968277429263338, + "grad_norm": 0.8359375, + "learning_rate": 0.0001294085766882171, + "loss": 0.8539, + "step": 27138 + }, + { + "epoch": 0.6968534201222556, + "grad_norm": 0.828125, + "learning_rate": 0.00012940430986853793, + "loss": 0.8493, + "step": 27139 + }, + { + "epoch": 0.6968790973181773, + "grad_norm": 0.7890625, + "learning_rate": 0.00012940004299025865, + "loss": 0.7705, + "step": 27140 + }, + { + "epoch": 0.6969047745140992, + "grad_norm": 0.78515625, + "learning_rate": 0.00012939577605338776, + "loss": 0.8685, + "step": 27141 + }, + { + "epoch": 0.696930451710021, + "grad_norm": 0.84375, + "learning_rate": 0.0001293915090579337, + "loss": 0.8756, + "step": 27142 + }, + { + "epoch": 0.6969561289059428, + "grad_norm": 0.76171875, + "learning_rate": 0.00012938724200390507, + "loss": 0.8203, + "step": 27143 + }, + { + "epoch": 0.6969818061018647, + "grad_norm": 0.82421875, + "learning_rate": 0.00012938297489131027, + "loss": 0.8961, + "step": 27144 + }, + { + "epoch": 0.6970074832977865, + "grad_norm": 0.69140625, + "learning_rate": 0.00012937870772015785, + "loss": 0.7882, + "step": 27145 + }, + { + "epoch": 0.6970331604937083, + "grad_norm": 0.77734375, + "learning_rate": 0.00012937444049045634, + "loss": 0.8874, + "step": 27146 + }, + { + "epoch": 0.6970588376896301, + "grad_norm": 0.7421875, + "learning_rate": 0.00012937017320221425, + "loss": 0.866, + "step": 27147 + }, + { + "epoch": 0.6970845148855519, + "grad_norm": 0.76171875, + "learning_rate": 0.00012936590585544002, + "loss": 0.8365, + "step": 27148 + }, + { + "epoch": 0.6971101920814737, + "grad_norm": 0.81640625, + "learning_rate": 0.00012936163845014218, + "loss": 1.0474, + "step": 27149 + }, + { + "epoch": 0.6971358692773956, + "grad_norm": 0.80859375, + "learning_rate": 0.0001293573709863293, + "loss": 0.8689, + "step": 27150 + }, + { + "epoch": 0.6971615464733174, + "grad_norm": 0.80078125, + "learning_rate": 0.0001293531034640098, + "loss": 0.7873, + "step": 27151 + }, + { + "epoch": 0.6971872236692392, + "grad_norm": 0.7734375, + "learning_rate": 0.00012934883588319225, + "loss": 0.9756, + "step": 27152 + }, + { + "epoch": 0.697212900865161, + "grad_norm": 0.73828125, + "learning_rate": 0.00012934456824388507, + "loss": 0.7695, + "step": 27153 + }, + { + "epoch": 0.6972385780610828, + "grad_norm": 0.78515625, + "learning_rate": 0.00012934030054609683, + "loss": 0.7668, + "step": 27154 + }, + { + "epoch": 0.6972642552570046, + "grad_norm": 0.74609375, + "learning_rate": 0.00012933603278983603, + "loss": 0.8134, + "step": 27155 + }, + { + "epoch": 0.6972899324529265, + "grad_norm": 0.7578125, + "learning_rate": 0.00012933176497511117, + "loss": 0.7391, + "step": 27156 + }, + { + "epoch": 0.6973156096488483, + "grad_norm": 0.78125, + "learning_rate": 0.00012932749710193076, + "loss": 0.79, + "step": 27157 + }, + { + "epoch": 0.6973412868447701, + "grad_norm": 0.82421875, + "learning_rate": 0.0001293232291703033, + "loss": 0.9059, + "step": 27158 + }, + { + "epoch": 0.697366964040692, + "grad_norm": 0.72265625, + "learning_rate": 0.00012931896118023725, + "loss": 0.7583, + "step": 27159 + }, + { + "epoch": 0.6973926412366137, + "grad_norm": 0.74609375, + "learning_rate": 0.00012931469313174121, + "loss": 0.8677, + "step": 27160 + }, + { + "epoch": 0.6974183184325355, + "grad_norm": 0.82421875, + "learning_rate": 0.00012931042502482357, + "loss": 0.9937, + "step": 27161 + }, + { + "epoch": 0.6974439956284574, + "grad_norm": 0.796875, + "learning_rate": 0.00012930615685949296, + "loss": 0.8672, + "step": 27162 + }, + { + "epoch": 0.6974696728243792, + "grad_norm": 0.76953125, + "learning_rate": 0.0001293018886357578, + "loss": 0.868, + "step": 27163 + }, + { + "epoch": 0.697495350020301, + "grad_norm": 0.78125, + "learning_rate": 0.00012929762035362666, + "loss": 0.7687, + "step": 27164 + }, + { + "epoch": 0.6975210272162229, + "grad_norm": 0.765625, + "learning_rate": 0.00012929335201310798, + "loss": 0.8666, + "step": 27165 + }, + { + "epoch": 0.6975467044121447, + "grad_norm": 0.77734375, + "learning_rate": 0.00012928908361421027, + "loss": 0.8152, + "step": 27166 + }, + { + "epoch": 0.6975723816080664, + "grad_norm": 0.765625, + "learning_rate": 0.0001292848151569421, + "loss": 0.7166, + "step": 27167 + }, + { + "epoch": 0.6975980588039883, + "grad_norm": 0.76171875, + "learning_rate": 0.00012928054664131192, + "loss": 0.8754, + "step": 27168 + }, + { + "epoch": 0.6976237359999101, + "grad_norm": 0.84375, + "learning_rate": 0.00012927627806732828, + "loss": 0.8971, + "step": 27169 + }, + { + "epoch": 0.697649413195832, + "grad_norm": 0.75390625, + "learning_rate": 0.00012927200943499965, + "loss": 0.8266, + "step": 27170 + }, + { + "epoch": 0.6976750903917538, + "grad_norm": 0.75390625, + "learning_rate": 0.00012926774074433452, + "loss": 0.8928, + "step": 27171 + }, + { + "epoch": 0.6977007675876756, + "grad_norm": 0.71875, + "learning_rate": 0.00012926347199534145, + "loss": 0.7127, + "step": 27172 + }, + { + "epoch": 0.6977264447835974, + "grad_norm": 0.76171875, + "learning_rate": 0.00012925920318802896, + "loss": 0.802, + "step": 27173 + }, + { + "epoch": 0.6977521219795192, + "grad_norm": 0.76953125, + "learning_rate": 0.00012925493432240545, + "loss": 0.7147, + "step": 27174 + }, + { + "epoch": 0.697777799175441, + "grad_norm": 0.74609375, + "learning_rate": 0.00012925066539847953, + "loss": 0.9097, + "step": 27175 + }, + { + "epoch": 0.6978034763713629, + "grad_norm": 0.89453125, + "learning_rate": 0.0001292463964162597, + "loss": 0.9255, + "step": 27176 + }, + { + "epoch": 0.6978291535672847, + "grad_norm": 0.6640625, + "learning_rate": 0.00012924212737575443, + "loss": 0.7324, + "step": 27177 + }, + { + "epoch": 0.6978548307632065, + "grad_norm": 0.84375, + "learning_rate": 0.00012923785827697224, + "loss": 0.9719, + "step": 27178 + }, + { + "epoch": 0.6978805079591284, + "grad_norm": 0.9140625, + "learning_rate": 0.0001292335891199216, + "loss": 0.8078, + "step": 27179 + }, + { + "epoch": 0.6979061851550501, + "grad_norm": 0.76953125, + "learning_rate": 0.00012922931990461112, + "loss": 0.7359, + "step": 27180 + }, + { + "epoch": 0.6979318623509719, + "grad_norm": 0.81640625, + "learning_rate": 0.00012922505063104922, + "loss": 0.9068, + "step": 27181 + }, + { + "epoch": 0.6979575395468938, + "grad_norm": 0.86328125, + "learning_rate": 0.0001292207812992444, + "loss": 1.0258, + "step": 27182 + }, + { + "epoch": 0.6979832167428156, + "grad_norm": 0.765625, + "learning_rate": 0.00012921651190920524, + "loss": 0.8386, + "step": 27183 + }, + { + "epoch": 0.6980088939387374, + "grad_norm": 0.828125, + "learning_rate": 0.00012921224246094024, + "loss": 0.7955, + "step": 27184 + }, + { + "epoch": 0.6980345711346593, + "grad_norm": 0.8125, + "learning_rate": 0.00012920797295445783, + "loss": 0.8423, + "step": 27185 + }, + { + "epoch": 0.6980602483305811, + "grad_norm": 0.78515625, + "learning_rate": 0.00012920370338976656, + "loss": 0.8348, + "step": 27186 + }, + { + "epoch": 0.6980859255265028, + "grad_norm": 0.71484375, + "learning_rate": 0.000129199433766875, + "loss": 0.7872, + "step": 27187 + }, + { + "epoch": 0.6981116027224247, + "grad_norm": 0.7578125, + "learning_rate": 0.00012919516408579156, + "loss": 0.7446, + "step": 27188 + }, + { + "epoch": 0.6981372799183465, + "grad_norm": 0.78515625, + "learning_rate": 0.00012919089434652482, + "loss": 0.8663, + "step": 27189 + }, + { + "epoch": 0.6981629571142683, + "grad_norm": 0.76953125, + "learning_rate": 0.00012918662454908323, + "loss": 0.7469, + "step": 27190 + }, + { + "epoch": 0.6981886343101902, + "grad_norm": 0.80078125, + "learning_rate": 0.0001291823546934754, + "loss": 0.8609, + "step": 27191 + }, + { + "epoch": 0.698214311506112, + "grad_norm": 0.83203125, + "learning_rate": 0.00012917808477970973, + "loss": 0.8737, + "step": 27192 + }, + { + "epoch": 0.6982399887020337, + "grad_norm": 0.78515625, + "learning_rate": 0.00012917381480779473, + "loss": 1.0503, + "step": 27193 + }, + { + "epoch": 0.6982656658979556, + "grad_norm": 0.734375, + "learning_rate": 0.000129169544777739, + "loss": 0.7871, + "step": 27194 + }, + { + "epoch": 0.6982913430938774, + "grad_norm": 0.76953125, + "learning_rate": 0.000129165274689551, + "loss": 0.7906, + "step": 27195 + }, + { + "epoch": 0.6983170202897993, + "grad_norm": 0.78125, + "learning_rate": 0.00012916100454323927, + "loss": 0.7662, + "step": 27196 + }, + { + "epoch": 0.6983426974857211, + "grad_norm": 0.7734375, + "learning_rate": 0.00012915673433881226, + "loss": 0.94, + "step": 27197 + }, + { + "epoch": 0.6983683746816429, + "grad_norm": 0.69921875, + "learning_rate": 0.0001291524640762785, + "loss": 0.823, + "step": 27198 + }, + { + "epoch": 0.6983940518775648, + "grad_norm": 0.7578125, + "learning_rate": 0.00012914819375564655, + "loss": 0.7989, + "step": 27199 + }, + { + "epoch": 0.6984197290734865, + "grad_norm": 0.734375, + "learning_rate": 0.00012914392337692486, + "loss": 0.8225, + "step": 27200 + }, + { + "epoch": 0.6984454062694083, + "grad_norm": 0.76171875, + "learning_rate": 0.00012913965294012193, + "loss": 0.7991, + "step": 27201 + }, + { + "epoch": 0.6984710834653302, + "grad_norm": 0.734375, + "learning_rate": 0.00012913538244524637, + "loss": 0.7978, + "step": 27202 + }, + { + "epoch": 0.698496760661252, + "grad_norm": 0.80859375, + "learning_rate": 0.00012913111189230658, + "loss": 0.8922, + "step": 27203 + }, + { + "epoch": 0.6985224378571738, + "grad_norm": 0.73046875, + "learning_rate": 0.00012912684128131113, + "loss": 0.7513, + "step": 27204 + }, + { + "epoch": 0.6985481150530957, + "grad_norm": 0.6875, + "learning_rate": 0.00012912257061226852, + "loss": 0.7092, + "step": 27205 + }, + { + "epoch": 0.6985737922490175, + "grad_norm": 0.85546875, + "learning_rate": 0.00012911829988518725, + "loss": 0.8263, + "step": 27206 + }, + { + "epoch": 0.6985994694449392, + "grad_norm": 0.83203125, + "learning_rate": 0.00012911402910007583, + "loss": 0.794, + "step": 27207 + }, + { + "epoch": 0.6986251466408611, + "grad_norm": 0.8984375, + "learning_rate": 0.0001291097582569428, + "loss": 0.9705, + "step": 27208 + }, + { + "epoch": 0.6986508238367829, + "grad_norm": 0.73046875, + "learning_rate": 0.00012910548735579663, + "loss": 0.8997, + "step": 27209 + }, + { + "epoch": 0.6986765010327047, + "grad_norm": 0.80078125, + "learning_rate": 0.0001291012163966459, + "loss": 0.8592, + "step": 27210 + }, + { + "epoch": 0.6987021782286266, + "grad_norm": 0.81640625, + "learning_rate": 0.000129096945379499, + "loss": 0.8957, + "step": 27211 + }, + { + "epoch": 0.6987278554245484, + "grad_norm": 0.7734375, + "learning_rate": 0.00012909267430436455, + "loss": 0.9969, + "step": 27212 + }, + { + "epoch": 0.6987535326204701, + "grad_norm": 0.71875, + "learning_rate": 0.00012908840317125107, + "loss": 0.798, + "step": 27213 + }, + { + "epoch": 0.698779209816392, + "grad_norm": 0.71875, + "learning_rate": 0.00012908413198016696, + "loss": 0.7422, + "step": 27214 + }, + { + "epoch": 0.6988048870123138, + "grad_norm": 0.734375, + "learning_rate": 0.00012907986073112083, + "loss": 0.8007, + "step": 27215 + }, + { + "epoch": 0.6988305642082356, + "grad_norm": 0.80078125, + "learning_rate": 0.0001290755894241212, + "loss": 0.727, + "step": 27216 + }, + { + "epoch": 0.6988562414041575, + "grad_norm": 0.75390625, + "learning_rate": 0.0001290713180591765, + "loss": 0.7798, + "step": 27217 + }, + { + "epoch": 0.6988819186000793, + "grad_norm": 0.8125, + "learning_rate": 0.00012906704663629532, + "loss": 0.7836, + "step": 27218 + }, + { + "epoch": 0.6989075957960011, + "grad_norm": 0.859375, + "learning_rate": 0.0001290627751554861, + "loss": 0.9023, + "step": 27219 + }, + { + "epoch": 0.6989332729919229, + "grad_norm": 0.6875, + "learning_rate": 0.00012905850361675744, + "loss": 0.8027, + "step": 27220 + }, + { + "epoch": 0.6989589501878447, + "grad_norm": 0.72265625, + "learning_rate": 0.0001290542320201178, + "loss": 0.825, + "step": 27221 + }, + { + "epoch": 0.6989846273837665, + "grad_norm": 0.796875, + "learning_rate": 0.0001290499603655757, + "loss": 0.7519, + "step": 27222 + }, + { + "epoch": 0.6990103045796884, + "grad_norm": 0.73828125, + "learning_rate": 0.0001290456886531396, + "loss": 0.7995, + "step": 27223 + }, + { + "epoch": 0.6990359817756102, + "grad_norm": 0.83203125, + "learning_rate": 0.00012904141688281816, + "loss": 0.8109, + "step": 27224 + }, + { + "epoch": 0.699061658971532, + "grad_norm": 0.76953125, + "learning_rate": 0.00012903714505461971, + "loss": 0.8666, + "step": 27225 + }, + { + "epoch": 0.6990873361674539, + "grad_norm": 0.7890625, + "learning_rate": 0.00012903287316855288, + "loss": 0.9417, + "step": 27226 + }, + { + "epoch": 0.6991130133633756, + "grad_norm": 0.8046875, + "learning_rate": 0.00012902860122462615, + "loss": 0.7482, + "step": 27227 + }, + { + "epoch": 0.6991386905592974, + "grad_norm": 0.8359375, + "learning_rate": 0.00012902432922284808, + "loss": 0.781, + "step": 27228 + }, + { + "epoch": 0.6991643677552193, + "grad_norm": 0.8046875, + "learning_rate": 0.00012902005716322715, + "loss": 0.8272, + "step": 27229 + }, + { + "epoch": 0.6991900449511411, + "grad_norm": 0.76171875, + "learning_rate": 0.0001290157850457718, + "loss": 0.9114, + "step": 27230 + }, + { + "epoch": 0.699215722147063, + "grad_norm": 0.8359375, + "learning_rate": 0.00012901151287049068, + "loss": 0.7871, + "step": 27231 + }, + { + "epoch": 0.6992413993429848, + "grad_norm": 0.72265625, + "learning_rate": 0.0001290072406373922, + "loss": 0.8095, + "step": 27232 + }, + { + "epoch": 0.6992670765389065, + "grad_norm": 0.8046875, + "learning_rate": 0.0001290029683464849, + "loss": 0.7888, + "step": 27233 + }, + { + "epoch": 0.6992927537348284, + "grad_norm": 0.78515625, + "learning_rate": 0.00012899869599777733, + "loss": 0.8676, + "step": 27234 + }, + { + "epoch": 0.6993184309307502, + "grad_norm": 0.73046875, + "learning_rate": 0.00012899442359127794, + "loss": 0.7082, + "step": 27235 + }, + { + "epoch": 0.699344108126672, + "grad_norm": 0.8828125, + "learning_rate": 0.00012899015112699533, + "loss": 0.7966, + "step": 27236 + }, + { + "epoch": 0.6993697853225939, + "grad_norm": 0.796875, + "learning_rate": 0.00012898587860493796, + "loss": 0.8929, + "step": 27237 + }, + { + "epoch": 0.6993954625185157, + "grad_norm": 0.73046875, + "learning_rate": 0.0001289816060251143, + "loss": 0.9214, + "step": 27238 + }, + { + "epoch": 0.6994211397144375, + "grad_norm": 0.84765625, + "learning_rate": 0.00012897733338753298, + "loss": 0.804, + "step": 27239 + }, + { + "epoch": 0.6994468169103593, + "grad_norm": 0.81640625, + "learning_rate": 0.00012897306069220243, + "loss": 0.8823, + "step": 27240 + }, + { + "epoch": 0.6994724941062811, + "grad_norm": 0.8125, + "learning_rate": 0.00012896878793913115, + "loss": 0.9599, + "step": 27241 + }, + { + "epoch": 0.6994981713022029, + "grad_norm": 0.90234375, + "learning_rate": 0.00012896451512832776, + "loss": 0.9454, + "step": 27242 + }, + { + "epoch": 0.6995238484981248, + "grad_norm": 0.75390625, + "learning_rate": 0.00012896024225980067, + "loss": 0.8452, + "step": 27243 + }, + { + "epoch": 0.6995495256940466, + "grad_norm": 0.77734375, + "learning_rate": 0.00012895596933355842, + "loss": 0.8056, + "step": 27244 + }, + { + "epoch": 0.6995752028899684, + "grad_norm": 0.75, + "learning_rate": 0.00012895169634960957, + "loss": 0.7975, + "step": 27245 + }, + { + "epoch": 0.6996008800858903, + "grad_norm": 0.71484375, + "learning_rate": 0.00012894742330796257, + "loss": 0.7161, + "step": 27246 + }, + { + "epoch": 0.699626557281812, + "grad_norm": 0.765625, + "learning_rate": 0.00012894315020862602, + "loss": 0.8238, + "step": 27247 + }, + { + "epoch": 0.6996522344777338, + "grad_norm": 0.9921875, + "learning_rate": 0.00012893887705160833, + "loss": 0.8939, + "step": 27248 + }, + { + "epoch": 0.6996779116736557, + "grad_norm": 0.74609375, + "learning_rate": 0.00012893460383691814, + "loss": 0.7716, + "step": 27249 + }, + { + "epoch": 0.6997035888695775, + "grad_norm": 0.83984375, + "learning_rate": 0.00012893033056456384, + "loss": 0.8194, + "step": 27250 + }, + { + "epoch": 0.6997292660654993, + "grad_norm": 0.765625, + "learning_rate": 0.000128926057234554, + "loss": 0.855, + "step": 27251 + }, + { + "epoch": 0.6997549432614212, + "grad_norm": 0.859375, + "learning_rate": 0.00012892178384689716, + "loss": 0.7114, + "step": 27252 + }, + { + "epoch": 0.6997806204573429, + "grad_norm": 0.734375, + "learning_rate": 0.00012891751040160183, + "loss": 0.7814, + "step": 27253 + }, + { + "epoch": 0.6998062976532647, + "grad_norm": 0.7421875, + "learning_rate": 0.0001289132368986765, + "loss": 0.8026, + "step": 27254 + }, + { + "epoch": 0.6998319748491866, + "grad_norm": 0.75, + "learning_rate": 0.00012890896333812973, + "loss": 0.811, + "step": 27255 + }, + { + "epoch": 0.6998576520451084, + "grad_norm": 0.7734375, + "learning_rate": 0.00012890468971996996, + "loss": 0.8879, + "step": 27256 + }, + { + "epoch": 0.6998833292410302, + "grad_norm": 0.8046875, + "learning_rate": 0.00012890041604420578, + "loss": 0.782, + "step": 27257 + }, + { + "epoch": 0.6999090064369521, + "grad_norm": 0.72265625, + "learning_rate": 0.00012889614231084568, + "loss": 0.8644, + "step": 27258 + }, + { + "epoch": 0.6999346836328739, + "grad_norm": 0.74609375, + "learning_rate": 0.00012889186851989815, + "loss": 0.7974, + "step": 27259 + }, + { + "epoch": 0.6999603608287956, + "grad_norm": 0.76953125, + "learning_rate": 0.00012888759467137177, + "loss": 0.7931, + "step": 27260 + }, + { + "epoch": 0.6999860380247175, + "grad_norm": 0.7265625, + "learning_rate": 0.00012888332076527504, + "loss": 0.7452, + "step": 27261 + }, + { + "epoch": 0.7000117152206393, + "grad_norm": 0.73046875, + "learning_rate": 0.0001288790468016164, + "loss": 0.6989, + "step": 27262 + }, + { + "epoch": 0.7000373924165612, + "grad_norm": 0.78515625, + "learning_rate": 0.0001288747727804045, + "loss": 0.8554, + "step": 27263 + }, + { + "epoch": 0.700063069612483, + "grad_norm": 0.72265625, + "learning_rate": 0.00012887049870164774, + "loss": 0.9377, + "step": 27264 + }, + { + "epoch": 0.7000887468084048, + "grad_norm": 0.828125, + "learning_rate": 0.0001288662245653547, + "loss": 0.9093, + "step": 27265 + }, + { + "epoch": 0.7001144240043266, + "grad_norm": 0.7421875, + "learning_rate": 0.00012886195037153388, + "loss": 0.8294, + "step": 27266 + }, + { + "epoch": 0.7001401012002484, + "grad_norm": 0.8359375, + "learning_rate": 0.00012885767612019378, + "loss": 0.9004, + "step": 27267 + }, + { + "epoch": 0.7001657783961702, + "grad_norm": 0.6953125, + "learning_rate": 0.00012885340181134297, + "loss": 0.8684, + "step": 27268 + }, + { + "epoch": 0.7001914555920921, + "grad_norm": 0.86328125, + "learning_rate": 0.00012884912744498994, + "loss": 0.8517, + "step": 27269 + }, + { + "epoch": 0.7002171327880139, + "grad_norm": 0.89453125, + "learning_rate": 0.00012884485302114314, + "loss": 0.8353, + "step": 27270 + }, + { + "epoch": 0.7002428099839357, + "grad_norm": 0.90625, + "learning_rate": 0.0001288405785398112, + "loss": 0.9446, + "step": 27271 + }, + { + "epoch": 0.7002684871798576, + "grad_norm": 0.875, + "learning_rate": 0.0001288363040010026, + "loss": 0.851, + "step": 27272 + }, + { + "epoch": 0.7002941643757793, + "grad_norm": 0.796875, + "learning_rate": 0.00012883202940472586, + "loss": 0.7917, + "step": 27273 + }, + { + "epoch": 0.7003198415717011, + "grad_norm": 0.6796875, + "learning_rate": 0.00012882775475098947, + "loss": 0.7029, + "step": 27274 + }, + { + "epoch": 0.700345518767623, + "grad_norm": 0.81640625, + "learning_rate": 0.00012882348003980196, + "loss": 0.8629, + "step": 27275 + }, + { + "epoch": 0.7003711959635448, + "grad_norm": 0.75390625, + "learning_rate": 0.00012881920527117187, + "loss": 0.8171, + "step": 27276 + }, + { + "epoch": 0.7003968731594666, + "grad_norm": 0.80859375, + "learning_rate": 0.00012881493044510772, + "loss": 1.0428, + "step": 27277 + }, + { + "epoch": 0.7004225503553885, + "grad_norm": 0.73046875, + "learning_rate": 0.000128810655561618, + "loss": 0.8013, + "step": 27278 + }, + { + "epoch": 0.7004482275513103, + "grad_norm": 0.68359375, + "learning_rate": 0.00012880638062071125, + "loss": 0.7218, + "step": 27279 + }, + { + "epoch": 0.700473904747232, + "grad_norm": 0.76953125, + "learning_rate": 0.000128802105622396, + "loss": 0.8777, + "step": 27280 + }, + { + "epoch": 0.7004995819431539, + "grad_norm": 0.78515625, + "learning_rate": 0.00012879783056668072, + "loss": 0.8899, + "step": 27281 + }, + { + "epoch": 0.7005252591390757, + "grad_norm": 0.76171875, + "learning_rate": 0.000128793555453574, + "loss": 0.758, + "step": 27282 + }, + { + "epoch": 0.7005509363349975, + "grad_norm": 0.80078125, + "learning_rate": 0.0001287892802830843, + "loss": 0.8472, + "step": 27283 + }, + { + "epoch": 0.7005766135309194, + "grad_norm": 0.70703125, + "learning_rate": 0.00012878500505522018, + "loss": 0.822, + "step": 27284 + }, + { + "epoch": 0.7006022907268412, + "grad_norm": 0.77734375, + "learning_rate": 0.00012878072976999012, + "loss": 0.7062, + "step": 27285 + }, + { + "epoch": 0.7006279679227629, + "grad_norm": 0.80078125, + "learning_rate": 0.00012877645442740268, + "loss": 0.9807, + "step": 27286 + }, + { + "epoch": 0.7006536451186848, + "grad_norm": 0.75, + "learning_rate": 0.00012877217902746636, + "loss": 0.8333, + "step": 27287 + }, + { + "epoch": 0.7006793223146066, + "grad_norm": 0.85546875, + "learning_rate": 0.0001287679035701897, + "loss": 1.1637, + "step": 27288 + }, + { + "epoch": 0.7007049995105284, + "grad_norm": 0.69140625, + "learning_rate": 0.00012876362805558123, + "loss": 0.8045, + "step": 27289 + }, + { + "epoch": 0.7007306767064503, + "grad_norm": 0.734375, + "learning_rate": 0.0001287593524836494, + "loss": 0.8546, + "step": 27290 + }, + { + "epoch": 0.7007563539023721, + "grad_norm": 0.72265625, + "learning_rate": 0.0001287550768544028, + "loss": 0.8057, + "step": 27291 + }, + { + "epoch": 0.700782031098294, + "grad_norm": 0.8203125, + "learning_rate": 0.00012875080116784996, + "loss": 0.865, + "step": 27292 + }, + { + "epoch": 0.7008077082942157, + "grad_norm": 0.7734375, + "learning_rate": 0.00012874652542399932, + "loss": 0.9225, + "step": 27293 + }, + { + "epoch": 0.7008333854901375, + "grad_norm": 0.93359375, + "learning_rate": 0.00012874224962285946, + "loss": 0.8728, + "step": 27294 + }, + { + "epoch": 0.7008590626860594, + "grad_norm": 0.7578125, + "learning_rate": 0.00012873797376443893, + "loss": 0.9588, + "step": 27295 + }, + { + "epoch": 0.7008847398819812, + "grad_norm": 0.82421875, + "learning_rate": 0.00012873369784874617, + "loss": 0.9032, + "step": 27296 + }, + { + "epoch": 0.700910417077903, + "grad_norm": 0.80859375, + "learning_rate": 0.00012872942187578976, + "loss": 1.0169, + "step": 27297 + }, + { + "epoch": 0.7009360942738249, + "grad_norm": 0.765625, + "learning_rate": 0.0001287251458455782, + "loss": 0.7879, + "step": 27298 + }, + { + "epoch": 0.7009617714697467, + "grad_norm": 0.8203125, + "learning_rate": 0.00012872086975812002, + "loss": 0.8679, + "step": 27299 + }, + { + "epoch": 0.7009874486656684, + "grad_norm": 0.71875, + "learning_rate": 0.00012871659361342376, + "loss": 0.9028, + "step": 27300 + }, + { + "epoch": 0.7010131258615903, + "grad_norm": 0.78125, + "learning_rate": 0.00012871231741149792, + "loss": 0.831, + "step": 27301 + }, + { + "epoch": 0.7010388030575121, + "grad_norm": 0.78125, + "learning_rate": 0.000128708041152351, + "loss": 0.7064, + "step": 27302 + }, + { + "epoch": 0.7010644802534339, + "grad_norm": 0.734375, + "learning_rate": 0.00012870376483599158, + "loss": 0.7862, + "step": 27303 + }, + { + "epoch": 0.7010901574493558, + "grad_norm": 0.76171875, + "learning_rate": 0.0001286994884624281, + "loss": 0.8035, + "step": 27304 + }, + { + "epoch": 0.7011158346452776, + "grad_norm": 0.76171875, + "learning_rate": 0.0001286952120316692, + "loss": 0.88, + "step": 27305 + }, + { + "epoch": 0.7011415118411993, + "grad_norm": 0.828125, + "learning_rate": 0.0001286909355437233, + "loss": 0.7665, + "step": 27306 + }, + { + "epoch": 0.7011671890371212, + "grad_norm": 0.734375, + "learning_rate": 0.00012868665899859894, + "loss": 0.7904, + "step": 27307 + }, + { + "epoch": 0.701192866233043, + "grad_norm": 0.796875, + "learning_rate": 0.0001286823823963047, + "loss": 0.8987, + "step": 27308 + }, + { + "epoch": 0.7012185434289648, + "grad_norm": 0.7421875, + "learning_rate": 0.00012867810573684904, + "loss": 0.9696, + "step": 27309 + }, + { + "epoch": 0.7012442206248867, + "grad_norm": 0.80859375, + "learning_rate": 0.0001286738290202405, + "loss": 0.8171, + "step": 27310 + }, + { + "epoch": 0.7012698978208085, + "grad_norm": 0.77734375, + "learning_rate": 0.00012866955224648762, + "loss": 0.7681, + "step": 27311 + }, + { + "epoch": 0.7012955750167303, + "grad_norm": 0.7578125, + "learning_rate": 0.00012866527541559889, + "loss": 0.6988, + "step": 27312 + }, + { + "epoch": 0.7013212522126521, + "grad_norm": 0.80078125, + "learning_rate": 0.00012866099852758286, + "loss": 0.8421, + "step": 27313 + }, + { + "epoch": 0.7013469294085739, + "grad_norm": 0.828125, + "learning_rate": 0.0001286567215824481, + "loss": 0.9205, + "step": 27314 + }, + { + "epoch": 0.7013726066044957, + "grad_norm": 0.6953125, + "learning_rate": 0.00012865244458020302, + "loss": 0.6702, + "step": 27315 + }, + { + "epoch": 0.7013982838004176, + "grad_norm": 0.71484375, + "learning_rate": 0.00012864816752085624, + "loss": 0.7489, + "step": 27316 + }, + { + "epoch": 0.7014239609963394, + "grad_norm": 0.74609375, + "learning_rate": 0.00012864389040441625, + "loss": 0.8246, + "step": 27317 + }, + { + "epoch": 0.7014496381922612, + "grad_norm": 0.74609375, + "learning_rate": 0.00012863961323089152, + "loss": 0.7355, + "step": 27318 + }, + { + "epoch": 0.7014753153881831, + "grad_norm": 0.8125, + "learning_rate": 0.0001286353360002907, + "loss": 0.86, + "step": 27319 + }, + { + "epoch": 0.7015009925841048, + "grad_norm": 0.80859375, + "learning_rate": 0.00012863105871262223, + "loss": 0.747, + "step": 27320 + }, + { + "epoch": 0.7015266697800266, + "grad_norm": 0.765625, + "learning_rate": 0.0001286267813678946, + "loss": 0.9022, + "step": 27321 + }, + { + "epoch": 0.7015523469759485, + "grad_norm": 0.76953125, + "learning_rate": 0.00012862250396611643, + "loss": 0.9501, + "step": 27322 + }, + { + "epoch": 0.7015780241718703, + "grad_norm": 0.76171875, + "learning_rate": 0.00012861822650729615, + "loss": 0.8026, + "step": 27323 + }, + { + "epoch": 0.7016037013677922, + "grad_norm": 0.76171875, + "learning_rate": 0.00012861394899144236, + "loss": 0.8358, + "step": 27324 + }, + { + "epoch": 0.701629378563714, + "grad_norm": 0.7421875, + "learning_rate": 0.00012860967141856358, + "loss": 0.8595, + "step": 27325 + }, + { + "epoch": 0.7016550557596357, + "grad_norm": 0.75390625, + "learning_rate": 0.00012860539378866824, + "loss": 0.8057, + "step": 27326 + }, + { + "epoch": 0.7016807329555576, + "grad_norm": 0.76171875, + "learning_rate": 0.00012860111610176497, + "loss": 0.7464, + "step": 27327 + }, + { + "epoch": 0.7017064101514794, + "grad_norm": 0.83203125, + "learning_rate": 0.00012859683835786227, + "loss": 0.982, + "step": 27328 + }, + { + "epoch": 0.7017320873474012, + "grad_norm": 0.74609375, + "learning_rate": 0.00012859256055696867, + "loss": 0.8366, + "step": 27329 + }, + { + "epoch": 0.7017577645433231, + "grad_norm": 0.7109375, + "learning_rate": 0.00012858828269909263, + "loss": 0.7883, + "step": 27330 + }, + { + "epoch": 0.7017834417392449, + "grad_norm": 0.70703125, + "learning_rate": 0.00012858400478424275, + "loss": 0.7571, + "step": 27331 + }, + { + "epoch": 0.7018091189351667, + "grad_norm": 0.7578125, + "learning_rate": 0.00012857972681242755, + "loss": 0.8326, + "step": 27332 + }, + { + "epoch": 0.7018347961310885, + "grad_norm": 0.74609375, + "learning_rate": 0.0001285754487836555, + "loss": 0.7762, + "step": 27333 + }, + { + "epoch": 0.7018604733270103, + "grad_norm": 0.76953125, + "learning_rate": 0.0001285711706979352, + "loss": 0.7908, + "step": 27334 + }, + { + "epoch": 0.7018861505229321, + "grad_norm": 0.765625, + "learning_rate": 0.00012856689255527512, + "loss": 0.8036, + "step": 27335 + }, + { + "epoch": 0.701911827718854, + "grad_norm": 0.78515625, + "learning_rate": 0.00012856261435568376, + "loss": 0.8585, + "step": 27336 + }, + { + "epoch": 0.7019375049147758, + "grad_norm": 0.796875, + "learning_rate": 0.00012855833609916976, + "loss": 0.7269, + "step": 27337 + }, + { + "epoch": 0.7019631821106976, + "grad_norm": 0.75, + "learning_rate": 0.00012855405778574154, + "loss": 0.7251, + "step": 27338 + }, + { + "epoch": 0.7019888593066195, + "grad_norm": 0.7890625, + "learning_rate": 0.00012854977941540767, + "loss": 0.7665, + "step": 27339 + }, + { + "epoch": 0.7020145365025412, + "grad_norm": 0.85546875, + "learning_rate": 0.00012854550098817665, + "loss": 0.8889, + "step": 27340 + }, + { + "epoch": 0.702040213698463, + "grad_norm": 0.77734375, + "learning_rate": 0.00012854122250405706, + "loss": 0.8934, + "step": 27341 + }, + { + "epoch": 0.7020658908943849, + "grad_norm": 0.8515625, + "learning_rate": 0.00012853694396305738, + "loss": 0.9413, + "step": 27342 + }, + { + "epoch": 0.7020915680903067, + "grad_norm": 0.828125, + "learning_rate": 0.00012853266536518612, + "loss": 0.8464, + "step": 27343 + }, + { + "epoch": 0.7021172452862285, + "grad_norm": 0.8359375, + "learning_rate": 0.00012852838671045187, + "loss": 0.8577, + "step": 27344 + }, + { + "epoch": 0.7021429224821504, + "grad_norm": 0.7890625, + "learning_rate": 0.00012852410799886309, + "loss": 0.693, + "step": 27345 + }, + { + "epoch": 0.7021685996780721, + "grad_norm": 0.76171875, + "learning_rate": 0.0001285198292304284, + "loss": 0.7301, + "step": 27346 + }, + { + "epoch": 0.7021942768739939, + "grad_norm": 0.80859375, + "learning_rate": 0.00012851555040515618, + "loss": 1.0019, + "step": 27347 + }, + { + "epoch": 0.7022199540699158, + "grad_norm": 0.73046875, + "learning_rate": 0.00012851127152305511, + "loss": 0.7471, + "step": 27348 + }, + { + "epoch": 0.7022456312658376, + "grad_norm": 0.79296875, + "learning_rate": 0.00012850699258413364, + "loss": 0.798, + "step": 27349 + }, + { + "epoch": 0.7022713084617594, + "grad_norm": 0.8359375, + "learning_rate": 0.00012850271358840027, + "loss": 0.9157, + "step": 27350 + }, + { + "epoch": 0.7022969856576813, + "grad_norm": 0.79296875, + "learning_rate": 0.00012849843453586362, + "loss": 0.8495, + "step": 27351 + }, + { + "epoch": 0.7023226628536031, + "grad_norm": 0.796875, + "learning_rate": 0.00012849415542653213, + "loss": 0.8797, + "step": 27352 + }, + { + "epoch": 0.7023483400495248, + "grad_norm": 0.8359375, + "learning_rate": 0.00012848987626041435, + "loss": 0.8244, + "step": 27353 + }, + { + "epoch": 0.7023740172454467, + "grad_norm": 0.74609375, + "learning_rate": 0.00012848559703751888, + "loss": 0.8418, + "step": 27354 + }, + { + "epoch": 0.7023996944413685, + "grad_norm": 0.796875, + "learning_rate": 0.00012848131775785413, + "loss": 0.777, + "step": 27355 + }, + { + "epoch": 0.7024253716372904, + "grad_norm": 0.9609375, + "learning_rate": 0.00012847703842142871, + "loss": 0.8472, + "step": 27356 + }, + { + "epoch": 0.7024510488332122, + "grad_norm": 0.76171875, + "learning_rate": 0.00012847275902825114, + "loss": 0.9153, + "step": 27357 + }, + { + "epoch": 0.702476726029134, + "grad_norm": 0.81640625, + "learning_rate": 0.0001284684795783299, + "loss": 0.876, + "step": 27358 + }, + { + "epoch": 0.7025024032250559, + "grad_norm": 0.86328125, + "learning_rate": 0.00012846420007167354, + "loss": 0.862, + "step": 27359 + }, + { + "epoch": 0.7025280804209776, + "grad_norm": 0.7578125, + "learning_rate": 0.00012845992050829064, + "loss": 0.8266, + "step": 27360 + }, + { + "epoch": 0.7025537576168994, + "grad_norm": 0.7890625, + "learning_rate": 0.0001284556408881897, + "loss": 0.7683, + "step": 27361 + }, + { + "epoch": 0.7025794348128213, + "grad_norm": 0.78515625, + "learning_rate": 0.0001284513612113792, + "loss": 0.8933, + "step": 27362 + }, + { + "epoch": 0.7026051120087431, + "grad_norm": 0.83203125, + "learning_rate": 0.0001284470814778677, + "loss": 0.8799, + "step": 27363 + }, + { + "epoch": 0.7026307892046649, + "grad_norm": 0.796875, + "learning_rate": 0.00012844280168766375, + "loss": 0.8689, + "step": 27364 + }, + { + "epoch": 0.7026564664005868, + "grad_norm": 0.75, + "learning_rate": 0.0001284385218407759, + "loss": 0.9278, + "step": 27365 + }, + { + "epoch": 0.7026821435965085, + "grad_norm": 0.83984375, + "learning_rate": 0.0001284342419372126, + "loss": 0.7155, + "step": 27366 + }, + { + "epoch": 0.7027078207924303, + "grad_norm": 0.7109375, + "learning_rate": 0.00012842996197698247, + "loss": 0.8312, + "step": 27367 + }, + { + "epoch": 0.7027334979883522, + "grad_norm": 0.82421875, + "learning_rate": 0.00012842568196009394, + "loss": 0.9166, + "step": 27368 + }, + { + "epoch": 0.702759175184274, + "grad_norm": 0.78515625, + "learning_rate": 0.00012842140188655562, + "loss": 0.797, + "step": 27369 + }, + { + "epoch": 0.7027848523801958, + "grad_norm": 0.84765625, + "learning_rate": 0.000128417121756376, + "loss": 0.7981, + "step": 27370 + }, + { + "epoch": 0.7028105295761177, + "grad_norm": 0.77734375, + "learning_rate": 0.00012841284156956363, + "loss": 0.7923, + "step": 27371 + }, + { + "epoch": 0.7028362067720395, + "grad_norm": 0.81640625, + "learning_rate": 0.00012840856132612704, + "loss": 0.8084, + "step": 27372 + }, + { + "epoch": 0.7028618839679612, + "grad_norm": 0.76953125, + "learning_rate": 0.0001284042810260748, + "loss": 0.7433, + "step": 27373 + }, + { + "epoch": 0.7028875611638831, + "grad_norm": 0.828125, + "learning_rate": 0.00012840000066941532, + "loss": 0.888, + "step": 27374 + }, + { + "epoch": 0.7029132383598049, + "grad_norm": 0.73828125, + "learning_rate": 0.0001283957202561572, + "loss": 0.7657, + "step": 27375 + }, + { + "epoch": 0.7029389155557267, + "grad_norm": 0.828125, + "learning_rate": 0.00012839143978630904, + "loss": 0.725, + "step": 27376 + }, + { + "epoch": 0.7029645927516486, + "grad_norm": 0.83984375, + "learning_rate": 0.00012838715925987925, + "loss": 0.9112, + "step": 27377 + }, + { + "epoch": 0.7029902699475704, + "grad_norm": 0.75, + "learning_rate": 0.00012838287867687645, + "loss": 0.8555, + "step": 27378 + }, + { + "epoch": 0.7030159471434922, + "grad_norm": 0.7578125, + "learning_rate": 0.00012837859803730908, + "loss": 0.6447, + "step": 27379 + }, + { + "epoch": 0.703041624339414, + "grad_norm": 0.796875, + "learning_rate": 0.0001283743173411858, + "loss": 0.7558, + "step": 27380 + }, + { + "epoch": 0.7030673015353358, + "grad_norm": 0.7578125, + "learning_rate": 0.00012837003658851503, + "loss": 0.8854, + "step": 27381 + }, + { + "epoch": 0.7030929787312576, + "grad_norm": 0.73828125, + "learning_rate": 0.00012836575577930537, + "loss": 0.791, + "step": 27382 + }, + { + "epoch": 0.7031186559271795, + "grad_norm": 0.80078125, + "learning_rate": 0.0001283614749135653, + "loss": 0.7961, + "step": 27383 + }, + { + "epoch": 0.7031443331231013, + "grad_norm": 0.83203125, + "learning_rate": 0.00012835719399130336, + "loss": 0.879, + "step": 27384 + }, + { + "epoch": 0.7031700103190232, + "grad_norm": 0.7109375, + "learning_rate": 0.0001283529130125281, + "loss": 0.7689, + "step": 27385 + }, + { + "epoch": 0.7031956875149449, + "grad_norm": 0.84765625, + "learning_rate": 0.00012834863197724804, + "loss": 0.8158, + "step": 27386 + }, + { + "epoch": 0.7032213647108667, + "grad_norm": 0.73046875, + "learning_rate": 0.00012834435088547174, + "loss": 0.7861, + "step": 27387 + }, + { + "epoch": 0.7032470419067886, + "grad_norm": 0.734375, + "learning_rate": 0.0001283400697372077, + "loss": 0.7973, + "step": 27388 + }, + { + "epoch": 0.7032727191027104, + "grad_norm": 0.8671875, + "learning_rate": 0.00012833578853246443, + "loss": 0.8056, + "step": 27389 + }, + { + "epoch": 0.7032983962986322, + "grad_norm": 0.75, + "learning_rate": 0.00012833150727125053, + "loss": 0.9349, + "step": 27390 + }, + { + "epoch": 0.7033240734945541, + "grad_norm": 0.80859375, + "learning_rate": 0.0001283272259535745, + "loss": 0.7807, + "step": 27391 + }, + { + "epoch": 0.7033497506904759, + "grad_norm": 0.87890625, + "learning_rate": 0.00012832294457944482, + "loss": 0.8716, + "step": 27392 + }, + { + "epoch": 0.7033754278863976, + "grad_norm": 0.76953125, + "learning_rate": 0.00012831866314887014, + "loss": 0.7879, + "step": 27393 + }, + { + "epoch": 0.7034011050823195, + "grad_norm": 0.88671875, + "learning_rate": 0.0001283143816618589, + "loss": 0.825, + "step": 27394 + }, + { + "epoch": 0.7034267822782413, + "grad_norm": 0.75, + "learning_rate": 0.00012831010011841963, + "loss": 0.802, + "step": 27395 + }, + { + "epoch": 0.7034524594741631, + "grad_norm": 0.8203125, + "learning_rate": 0.0001283058185185609, + "loss": 0.7747, + "step": 27396 + }, + { + "epoch": 0.703478136670085, + "grad_norm": 0.81640625, + "learning_rate": 0.00012830153686229124, + "loss": 0.9454, + "step": 27397 + }, + { + "epoch": 0.7035038138660068, + "grad_norm": 0.76171875, + "learning_rate": 0.00012829725514961913, + "loss": 0.8095, + "step": 27398 + }, + { + "epoch": 0.7035294910619286, + "grad_norm": 0.7265625, + "learning_rate": 0.00012829297338055317, + "loss": 0.7196, + "step": 27399 + }, + { + "epoch": 0.7035551682578504, + "grad_norm": 0.78125, + "learning_rate": 0.0001282886915551019, + "loss": 0.9866, + "step": 27400 + }, + { + "epoch": 0.7035808454537722, + "grad_norm": 0.7734375, + "learning_rate": 0.0001282844096732738, + "loss": 0.7999, + "step": 27401 + }, + { + "epoch": 0.703606522649694, + "grad_norm": 0.77734375, + "learning_rate": 0.00012828012773507744, + "loss": 0.9051, + "step": 27402 + }, + { + "epoch": 0.7036321998456159, + "grad_norm": 0.87109375, + "learning_rate": 0.0001282758457405213, + "loss": 0.9046, + "step": 27403 + }, + { + "epoch": 0.7036578770415377, + "grad_norm": 0.79296875, + "learning_rate": 0.00012827156368961398, + "loss": 0.8828, + "step": 27404 + }, + { + "epoch": 0.7036835542374595, + "grad_norm": 0.84765625, + "learning_rate": 0.000128267281582364, + "loss": 0.791, + "step": 27405 + }, + { + "epoch": 0.7037092314333813, + "grad_norm": 0.78125, + "learning_rate": 0.00012826299941877983, + "loss": 0.8858, + "step": 27406 + }, + { + "epoch": 0.7037349086293031, + "grad_norm": 0.7734375, + "learning_rate": 0.00012825871719887016, + "loss": 0.8229, + "step": 27407 + }, + { + "epoch": 0.7037605858252249, + "grad_norm": 0.8046875, + "learning_rate": 0.00012825443492264332, + "loss": 0.7842, + "step": 27408 + }, + { + "epoch": 0.7037862630211468, + "grad_norm": 0.8125, + "learning_rate": 0.00012825015259010798, + "loss": 0.8479, + "step": 27409 + }, + { + "epoch": 0.7038119402170686, + "grad_norm": 0.77734375, + "learning_rate": 0.00012824587020127264, + "loss": 0.8499, + "step": 27410 + }, + { + "epoch": 0.7038376174129904, + "grad_norm": 0.6640625, + "learning_rate": 0.0001282415877561458, + "loss": 0.7515, + "step": 27411 + }, + { + "epoch": 0.7038632946089123, + "grad_norm": 0.78125, + "learning_rate": 0.00012823730525473608, + "loss": 0.9866, + "step": 27412 + }, + { + "epoch": 0.703888971804834, + "grad_norm": 0.734375, + "learning_rate": 0.00012823302269705192, + "loss": 0.8632, + "step": 27413 + }, + { + "epoch": 0.7039146490007558, + "grad_norm": 0.734375, + "learning_rate": 0.00012822874008310193, + "loss": 0.9019, + "step": 27414 + }, + { + "epoch": 0.7039403261966777, + "grad_norm": 0.76953125, + "learning_rate": 0.0001282244574128946, + "loss": 0.8532, + "step": 27415 + }, + { + "epoch": 0.7039660033925995, + "grad_norm": 0.82421875, + "learning_rate": 0.00012822017468643843, + "loss": 0.8807, + "step": 27416 + }, + { + "epoch": 0.7039916805885214, + "grad_norm": 0.73828125, + "learning_rate": 0.00012821589190374207, + "loss": 0.9231, + "step": 27417 + }, + { + "epoch": 0.7040173577844432, + "grad_norm": 0.75, + "learning_rate": 0.00012821160906481395, + "loss": 0.9148, + "step": 27418 + }, + { + "epoch": 0.704043034980365, + "grad_norm": 0.78515625, + "learning_rate": 0.00012820732616966262, + "loss": 0.8379, + "step": 27419 + }, + { + "epoch": 0.7040687121762867, + "grad_norm": 0.734375, + "learning_rate": 0.0001282030432182967, + "loss": 0.8003, + "step": 27420 + }, + { + "epoch": 0.7040943893722086, + "grad_norm": 0.73046875, + "learning_rate": 0.0001281987602107246, + "loss": 0.7873, + "step": 27421 + }, + { + "epoch": 0.7041200665681304, + "grad_norm": 0.734375, + "learning_rate": 0.00012819447714695496, + "loss": 0.8964, + "step": 27422 + }, + { + "epoch": 0.7041457437640523, + "grad_norm": 0.75, + "learning_rate": 0.00012819019402699627, + "loss": 0.9465, + "step": 27423 + }, + { + "epoch": 0.7041714209599741, + "grad_norm": 0.765625, + "learning_rate": 0.00012818591085085704, + "loss": 0.7966, + "step": 27424 + }, + { + "epoch": 0.7041970981558959, + "grad_norm": 0.81640625, + "learning_rate": 0.00012818162761854584, + "loss": 0.8315, + "step": 27425 + }, + { + "epoch": 0.7042227753518177, + "grad_norm": 0.74609375, + "learning_rate": 0.00012817734433007123, + "loss": 0.8689, + "step": 27426 + }, + { + "epoch": 0.7042484525477395, + "grad_norm": 0.75390625, + "learning_rate": 0.0001281730609854417, + "loss": 0.9425, + "step": 27427 + }, + { + "epoch": 0.7042741297436613, + "grad_norm": 0.90625, + "learning_rate": 0.0001281687775846658, + "loss": 0.9166, + "step": 27428 + }, + { + "epoch": 0.7042998069395832, + "grad_norm": 0.984375, + "learning_rate": 0.0001281644941277521, + "loss": 0.7772, + "step": 27429 + }, + { + "epoch": 0.704325484135505, + "grad_norm": 0.765625, + "learning_rate": 0.00012816021061470907, + "loss": 0.7137, + "step": 27430 + }, + { + "epoch": 0.7043511613314268, + "grad_norm": 0.75, + "learning_rate": 0.0001281559270455453, + "loss": 0.8446, + "step": 27431 + }, + { + "epoch": 0.7043768385273487, + "grad_norm": 0.83203125, + "learning_rate": 0.00012815164342026928, + "loss": 0.8508, + "step": 27432 + }, + { + "epoch": 0.7044025157232704, + "grad_norm": 0.796875, + "learning_rate": 0.00012814735973888964, + "loss": 0.7535, + "step": 27433 + }, + { + "epoch": 0.7044281929191922, + "grad_norm": 0.75, + "learning_rate": 0.00012814307600141485, + "loss": 0.7368, + "step": 27434 + }, + { + "epoch": 0.7044538701151141, + "grad_norm": 0.75390625, + "learning_rate": 0.0001281387922078534, + "loss": 0.7736, + "step": 27435 + }, + { + "epoch": 0.7044795473110359, + "grad_norm": 0.75390625, + "learning_rate": 0.0001281345083582139, + "loss": 0.968, + "step": 27436 + }, + { + "epoch": 0.7045052245069577, + "grad_norm": 0.80859375, + "learning_rate": 0.00012813022445250485, + "loss": 0.8564, + "step": 27437 + }, + { + "epoch": 0.7045309017028796, + "grad_norm": 0.8203125, + "learning_rate": 0.00012812594049073484, + "loss": 0.8142, + "step": 27438 + }, + { + "epoch": 0.7045565788988014, + "grad_norm": 0.84375, + "learning_rate": 0.00012812165647291236, + "loss": 0.7227, + "step": 27439 + }, + { + "epoch": 0.7045822560947231, + "grad_norm": 0.74609375, + "learning_rate": 0.00012811737239904595, + "loss": 0.7702, + "step": 27440 + }, + { + "epoch": 0.704607933290645, + "grad_norm": 0.7734375, + "learning_rate": 0.00012811308826914418, + "loss": 0.7502, + "step": 27441 + }, + { + "epoch": 0.7046336104865668, + "grad_norm": 0.76171875, + "learning_rate": 0.00012810880408321553, + "loss": 0.9132, + "step": 27442 + }, + { + "epoch": 0.7046592876824886, + "grad_norm": 0.77734375, + "learning_rate": 0.00012810451984126856, + "loss": 0.8284, + "step": 27443 + }, + { + "epoch": 0.7046849648784105, + "grad_norm": 0.83984375, + "learning_rate": 0.00012810023554331185, + "loss": 0.7877, + "step": 27444 + }, + { + "epoch": 0.7047106420743323, + "grad_norm": 0.765625, + "learning_rate": 0.00012809595118935392, + "loss": 0.8397, + "step": 27445 + }, + { + "epoch": 0.704736319270254, + "grad_norm": 0.76171875, + "learning_rate": 0.0001280916667794033, + "loss": 0.8123, + "step": 27446 + }, + { + "epoch": 0.7047619964661759, + "grad_norm": 0.796875, + "learning_rate": 0.00012808738231346852, + "loss": 0.7637, + "step": 27447 + }, + { + "epoch": 0.7047876736620977, + "grad_norm": 0.859375, + "learning_rate": 0.00012808309779155808, + "loss": 0.7733, + "step": 27448 + }, + { + "epoch": 0.7048133508580195, + "grad_norm": 0.75390625, + "learning_rate": 0.00012807881321368062, + "loss": 0.7576, + "step": 27449 + }, + { + "epoch": 0.7048390280539414, + "grad_norm": 0.76953125, + "learning_rate": 0.0001280745285798446, + "loss": 0.7701, + "step": 27450 + }, + { + "epoch": 0.7048647052498632, + "grad_norm": 0.8203125, + "learning_rate": 0.00012807024389005855, + "loss": 0.8217, + "step": 27451 + }, + { + "epoch": 0.7048903824457851, + "grad_norm": 0.7578125, + "learning_rate": 0.00012806595914433108, + "loss": 0.8844, + "step": 27452 + }, + { + "epoch": 0.7049160596417068, + "grad_norm": 0.796875, + "learning_rate": 0.00012806167434267066, + "loss": 1.0064, + "step": 27453 + }, + { + "epoch": 0.7049417368376286, + "grad_norm": 0.8359375, + "learning_rate": 0.00012805738948508586, + "loss": 0.7964, + "step": 27454 + }, + { + "epoch": 0.7049674140335505, + "grad_norm": 0.703125, + "learning_rate": 0.00012805310457158527, + "loss": 0.739, + "step": 27455 + }, + { + "epoch": 0.7049930912294723, + "grad_norm": 0.78125, + "learning_rate": 0.00012804881960217732, + "loss": 0.9082, + "step": 27456 + }, + { + "epoch": 0.7050187684253941, + "grad_norm": 0.70703125, + "learning_rate": 0.00012804453457687063, + "loss": 0.8059, + "step": 27457 + }, + { + "epoch": 0.705044445621316, + "grad_norm": 0.8046875, + "learning_rate": 0.00012804024949567371, + "loss": 0.7195, + "step": 27458 + }, + { + "epoch": 0.7050701228172378, + "grad_norm": 0.74609375, + "learning_rate": 0.0001280359643585951, + "loss": 0.8669, + "step": 27459 + }, + { + "epoch": 0.7050958000131595, + "grad_norm": 0.82421875, + "learning_rate": 0.00012803167916564337, + "loss": 1.0048, + "step": 27460 + }, + { + "epoch": 0.7051214772090814, + "grad_norm": 0.734375, + "learning_rate": 0.00012802739391682699, + "loss": 0.9292, + "step": 27461 + }, + { + "epoch": 0.7051471544050032, + "grad_norm": 0.83203125, + "learning_rate": 0.0001280231086121546, + "loss": 0.9358, + "step": 27462 + }, + { + "epoch": 0.705172831600925, + "grad_norm": 0.765625, + "learning_rate": 0.00012801882325163465, + "loss": 0.7792, + "step": 27463 + }, + { + "epoch": 0.7051985087968469, + "grad_norm": 0.734375, + "learning_rate": 0.00012801453783527572, + "loss": 0.7641, + "step": 27464 + }, + { + "epoch": 0.7052241859927687, + "grad_norm": 0.81640625, + "learning_rate": 0.00012801025236308634, + "loss": 0.852, + "step": 27465 + }, + { + "epoch": 0.7052498631886904, + "grad_norm": 0.78125, + "learning_rate": 0.00012800596683507507, + "loss": 0.854, + "step": 27466 + }, + { + "epoch": 0.7052755403846123, + "grad_norm": 0.77734375, + "learning_rate": 0.00012800168125125044, + "loss": 0.8684, + "step": 27467 + }, + { + "epoch": 0.7053012175805341, + "grad_norm": 0.73046875, + "learning_rate": 0.000127997395611621, + "loss": 0.7614, + "step": 27468 + }, + { + "epoch": 0.7053268947764559, + "grad_norm": 0.83984375, + "learning_rate": 0.00012799310991619525, + "loss": 0.9294, + "step": 27469 + }, + { + "epoch": 0.7053525719723778, + "grad_norm": 0.8046875, + "learning_rate": 0.00012798882416498178, + "loss": 0.9476, + "step": 27470 + }, + { + "epoch": 0.7053782491682996, + "grad_norm": 0.7734375, + "learning_rate": 0.0001279845383579891, + "loss": 0.8185, + "step": 27471 + }, + { + "epoch": 0.7054039263642214, + "grad_norm": 0.76953125, + "learning_rate": 0.00012798025249522577, + "loss": 0.787, + "step": 27472 + }, + { + "epoch": 0.7054296035601432, + "grad_norm": 0.75390625, + "learning_rate": 0.00012797596657670033, + "loss": 0.8307, + "step": 27473 + }, + { + "epoch": 0.705455280756065, + "grad_norm": 0.78125, + "learning_rate": 0.00012797168060242132, + "loss": 0.8755, + "step": 27474 + }, + { + "epoch": 0.7054809579519868, + "grad_norm": 0.7578125, + "learning_rate": 0.00012796739457239723, + "loss": 0.7762, + "step": 27475 + }, + { + "epoch": 0.7055066351479087, + "grad_norm": 0.7265625, + "learning_rate": 0.0001279631084866367, + "loss": 0.8045, + "step": 27476 + }, + { + "epoch": 0.7055323123438305, + "grad_norm": 0.82421875, + "learning_rate": 0.00012795882234514818, + "loss": 0.8099, + "step": 27477 + }, + { + "epoch": 0.7055579895397524, + "grad_norm": 0.7265625, + "learning_rate": 0.00012795453614794028, + "loss": 0.814, + "step": 27478 + }, + { + "epoch": 0.7055836667356741, + "grad_norm": 0.83203125, + "learning_rate": 0.00012795024989502155, + "loss": 0.9553, + "step": 27479 + }, + { + "epoch": 0.7056093439315959, + "grad_norm": 0.80078125, + "learning_rate": 0.00012794596358640042, + "loss": 0.8861, + "step": 27480 + }, + { + "epoch": 0.7056350211275177, + "grad_norm": 0.73828125, + "learning_rate": 0.00012794167722208555, + "loss": 0.8515, + "step": 27481 + }, + { + "epoch": 0.7056606983234396, + "grad_norm": 0.78515625, + "learning_rate": 0.00012793739080208547, + "loss": 0.7613, + "step": 27482 + }, + { + "epoch": 0.7056863755193614, + "grad_norm": 0.78515625, + "learning_rate": 0.00012793310432640865, + "loss": 0.8724, + "step": 27483 + }, + { + "epoch": 0.7057120527152833, + "grad_norm": 0.7109375, + "learning_rate": 0.00012792881779506368, + "loss": 0.7622, + "step": 27484 + }, + { + "epoch": 0.7057377299112051, + "grad_norm": 0.8203125, + "learning_rate": 0.0001279245312080591, + "loss": 0.8662, + "step": 27485 + }, + { + "epoch": 0.7057634071071268, + "grad_norm": 1.015625, + "learning_rate": 0.00012792024456540347, + "loss": 0.8431, + "step": 27486 + }, + { + "epoch": 0.7057890843030487, + "grad_norm": 0.78125, + "learning_rate": 0.00012791595786710533, + "loss": 0.7418, + "step": 27487 + }, + { + "epoch": 0.7058147614989705, + "grad_norm": 0.74609375, + "learning_rate": 0.00012791167111317317, + "loss": 0.8417, + "step": 27488 + }, + { + "epoch": 0.7058404386948923, + "grad_norm": 0.73828125, + "learning_rate": 0.0001279073843036156, + "loss": 0.8355, + "step": 27489 + }, + { + "epoch": 0.7058661158908142, + "grad_norm": 0.81640625, + "learning_rate": 0.00012790309743844112, + "loss": 0.9422, + "step": 27490 + }, + { + "epoch": 0.705891793086736, + "grad_norm": 0.70703125, + "learning_rate": 0.00012789881051765827, + "loss": 0.6774, + "step": 27491 + }, + { + "epoch": 0.7059174702826578, + "grad_norm": 0.765625, + "learning_rate": 0.00012789452354127563, + "loss": 0.8725, + "step": 27492 + }, + { + "epoch": 0.7059431474785796, + "grad_norm": 0.75, + "learning_rate": 0.00012789023650930172, + "loss": 0.7472, + "step": 27493 + }, + { + "epoch": 0.7059688246745014, + "grad_norm": 0.78125, + "learning_rate": 0.00012788594942174508, + "loss": 0.8071, + "step": 27494 + }, + { + "epoch": 0.7059945018704232, + "grad_norm": 0.8125, + "learning_rate": 0.00012788166227861426, + "loss": 0.8263, + "step": 27495 + }, + { + "epoch": 0.7060201790663451, + "grad_norm": 0.76953125, + "learning_rate": 0.00012787737507991782, + "loss": 0.8502, + "step": 27496 + }, + { + "epoch": 0.7060458562622669, + "grad_norm": 0.87109375, + "learning_rate": 0.00012787308782566428, + "loss": 0.9177, + "step": 27497 + }, + { + "epoch": 0.7060715334581887, + "grad_norm": 0.83984375, + "learning_rate": 0.0001278688005158622, + "loss": 0.9769, + "step": 27498 + }, + { + "epoch": 0.7060972106541105, + "grad_norm": 0.71484375, + "learning_rate": 0.00012786451315052012, + "loss": 0.8166, + "step": 27499 + }, + { + "epoch": 0.7061228878500323, + "grad_norm": 0.765625, + "learning_rate": 0.00012786022572964658, + "loss": 0.7703, + "step": 27500 + }, + { + "epoch": 0.7061485650459541, + "grad_norm": 0.71484375, + "learning_rate": 0.00012785593825325012, + "loss": 0.9251, + "step": 27501 + }, + { + "epoch": 0.706174242241876, + "grad_norm": 0.83203125, + "learning_rate": 0.00012785165072133929, + "loss": 0.7908, + "step": 27502 + }, + { + "epoch": 0.7061999194377978, + "grad_norm": 0.7734375, + "learning_rate": 0.00012784736313392264, + "loss": 0.9393, + "step": 27503 + }, + { + "epoch": 0.7062255966337196, + "grad_norm": 0.78515625, + "learning_rate": 0.00012784307549100872, + "loss": 0.7962, + "step": 27504 + }, + { + "epoch": 0.7062512738296415, + "grad_norm": 0.79296875, + "learning_rate": 0.00012783878779260607, + "loss": 0.8227, + "step": 27505 + }, + { + "epoch": 0.7062769510255632, + "grad_norm": 0.80859375, + "learning_rate": 0.00012783450003872322, + "loss": 0.8073, + "step": 27506 + }, + { + "epoch": 0.706302628221485, + "grad_norm": 0.82421875, + "learning_rate": 0.00012783021222936874, + "loss": 0.8682, + "step": 27507 + }, + { + "epoch": 0.7063283054174069, + "grad_norm": 0.76953125, + "learning_rate": 0.0001278259243645511, + "loss": 0.852, + "step": 27508 + }, + { + "epoch": 0.7063539826133287, + "grad_norm": 0.8515625, + "learning_rate": 0.00012782163644427895, + "loss": 0.8016, + "step": 27509 + }, + { + "epoch": 0.7063796598092505, + "grad_norm": 0.73046875, + "learning_rate": 0.00012781734846856082, + "loss": 0.8636, + "step": 27510 + }, + { + "epoch": 0.7064053370051724, + "grad_norm": 0.7734375, + "learning_rate": 0.0001278130604374052, + "loss": 0.9689, + "step": 27511 + }, + { + "epoch": 0.7064310142010942, + "grad_norm": 0.7734375, + "learning_rate": 0.00012780877235082068, + "loss": 0.785, + "step": 27512 + }, + { + "epoch": 0.706456691397016, + "grad_norm": 0.83984375, + "learning_rate": 0.00012780448420881575, + "loss": 0.7869, + "step": 27513 + }, + { + "epoch": 0.7064823685929378, + "grad_norm": 0.83984375, + "learning_rate": 0.00012780019601139902, + "loss": 0.8029, + "step": 27514 + }, + { + "epoch": 0.7065080457888596, + "grad_norm": 0.75, + "learning_rate": 0.000127795907758579, + "loss": 0.8572, + "step": 27515 + }, + { + "epoch": 0.7065337229847815, + "grad_norm": 0.7421875, + "learning_rate": 0.00012779161945036428, + "loss": 0.9805, + "step": 27516 + }, + { + "epoch": 0.7065594001807033, + "grad_norm": 0.76953125, + "learning_rate": 0.0001277873310867633, + "loss": 0.8402, + "step": 27517 + }, + { + "epoch": 0.7065850773766251, + "grad_norm": 0.73828125, + "learning_rate": 0.00012778304266778477, + "loss": 0.8891, + "step": 27518 + }, + { + "epoch": 0.7066107545725469, + "grad_norm": 0.81640625, + "learning_rate": 0.0001277787541934371, + "loss": 0.8814, + "step": 27519 + }, + { + "epoch": 0.7066364317684687, + "grad_norm": 0.80078125, + "learning_rate": 0.0001277744656637289, + "loss": 0.7922, + "step": 27520 + }, + { + "epoch": 0.7066621089643905, + "grad_norm": 0.80859375, + "learning_rate": 0.00012777017707866865, + "loss": 0.8268, + "step": 27521 + }, + { + "epoch": 0.7066877861603124, + "grad_norm": 0.84765625, + "learning_rate": 0.000127765888438265, + "loss": 0.727, + "step": 27522 + }, + { + "epoch": 0.7067134633562342, + "grad_norm": 0.80078125, + "learning_rate": 0.0001277615997425264, + "loss": 0.7806, + "step": 27523 + }, + { + "epoch": 0.706739140552156, + "grad_norm": 0.78515625, + "learning_rate": 0.00012775731099146147, + "loss": 0.8765, + "step": 27524 + }, + { + "epoch": 0.7067648177480779, + "grad_norm": 0.7578125, + "learning_rate": 0.0001277530221850787, + "loss": 0.7771, + "step": 27525 + }, + { + "epoch": 0.7067904949439996, + "grad_norm": 0.76171875, + "learning_rate": 0.0001277487333233867, + "loss": 0.7786, + "step": 27526 + }, + { + "epoch": 0.7068161721399214, + "grad_norm": 0.796875, + "learning_rate": 0.00012774444440639398, + "loss": 0.8963, + "step": 27527 + }, + { + "epoch": 0.7068418493358433, + "grad_norm": 0.73046875, + "learning_rate": 0.00012774015543410904, + "loss": 0.8822, + "step": 27528 + }, + { + "epoch": 0.7068675265317651, + "grad_norm": 0.8515625, + "learning_rate": 0.0001277358664065405, + "loss": 0.8719, + "step": 27529 + }, + { + "epoch": 0.7068932037276869, + "grad_norm": 0.78125, + "learning_rate": 0.0001277315773236969, + "loss": 0.7711, + "step": 27530 + }, + { + "epoch": 0.7069188809236088, + "grad_norm": 0.73828125, + "learning_rate": 0.00012772728818558678, + "loss": 0.7691, + "step": 27531 + }, + { + "epoch": 0.7069445581195306, + "grad_norm": 0.88671875, + "learning_rate": 0.00012772299899221865, + "loss": 0.8581, + "step": 27532 + }, + { + "epoch": 0.7069702353154523, + "grad_norm": 0.76953125, + "learning_rate": 0.0001277187097436011, + "loss": 0.7911, + "step": 27533 + }, + { + "epoch": 0.7069959125113742, + "grad_norm": 0.75390625, + "learning_rate": 0.00012771442043974264, + "loss": 0.7785, + "step": 27534 + }, + { + "epoch": 0.707021589707296, + "grad_norm": 0.78125, + "learning_rate": 0.00012771013108065188, + "loss": 0.8266, + "step": 27535 + }, + { + "epoch": 0.7070472669032178, + "grad_norm": 0.83984375, + "learning_rate": 0.0001277058416663373, + "loss": 0.8395, + "step": 27536 + }, + { + "epoch": 0.7070729440991397, + "grad_norm": 0.796875, + "learning_rate": 0.00012770155219680752, + "loss": 0.8121, + "step": 27537 + }, + { + "epoch": 0.7070986212950615, + "grad_norm": 0.84765625, + "learning_rate": 0.00012769726267207102, + "loss": 0.7187, + "step": 27538 + }, + { + "epoch": 0.7071242984909832, + "grad_norm": 0.81640625, + "learning_rate": 0.00012769297309213638, + "loss": 0.8048, + "step": 27539 + }, + { + "epoch": 0.7071499756869051, + "grad_norm": 0.76953125, + "learning_rate": 0.00012768868345701216, + "loss": 0.9022, + "step": 27540 + }, + { + "epoch": 0.7071756528828269, + "grad_norm": 0.73828125, + "learning_rate": 0.00012768439376670686, + "loss": 0.8983, + "step": 27541 + }, + { + "epoch": 0.7072013300787487, + "grad_norm": 0.7109375, + "learning_rate": 0.0001276801040212291, + "loss": 0.8131, + "step": 27542 + }, + { + "epoch": 0.7072270072746706, + "grad_norm": 0.81640625, + "learning_rate": 0.00012767581422058738, + "loss": 0.8159, + "step": 27543 + }, + { + "epoch": 0.7072526844705924, + "grad_norm": 0.7578125, + "learning_rate": 0.00012767152436479027, + "loss": 0.897, + "step": 27544 + }, + { + "epoch": 0.7072783616665143, + "grad_norm": 0.7578125, + "learning_rate": 0.00012766723445384632, + "loss": 0.8261, + "step": 27545 + }, + { + "epoch": 0.707304038862436, + "grad_norm": 0.73828125, + "learning_rate": 0.00012766294448776403, + "loss": 0.87, + "step": 27546 + }, + { + "epoch": 0.7073297160583578, + "grad_norm": 0.85546875, + "learning_rate": 0.00012765865446655203, + "loss": 0.9679, + "step": 27547 + }, + { + "epoch": 0.7073553932542797, + "grad_norm": 0.79296875, + "learning_rate": 0.00012765436439021884, + "loss": 0.9759, + "step": 27548 + }, + { + "epoch": 0.7073810704502015, + "grad_norm": 0.703125, + "learning_rate": 0.00012765007425877294, + "loss": 0.6806, + "step": 27549 + }, + { + "epoch": 0.7074067476461233, + "grad_norm": 0.75390625, + "learning_rate": 0.000127645784072223, + "loss": 0.7772, + "step": 27550 + }, + { + "epoch": 0.7074324248420452, + "grad_norm": 0.7265625, + "learning_rate": 0.00012764149383057748, + "loss": 0.8224, + "step": 27551 + }, + { + "epoch": 0.707458102037967, + "grad_norm": 0.70703125, + "learning_rate": 0.00012763720353384497, + "loss": 0.7678, + "step": 27552 + }, + { + "epoch": 0.7074837792338887, + "grad_norm": 0.75390625, + "learning_rate": 0.000127632913182034, + "loss": 0.7965, + "step": 27553 + }, + { + "epoch": 0.7075094564298106, + "grad_norm": 0.828125, + "learning_rate": 0.00012762862277515313, + "loss": 0.9505, + "step": 27554 + }, + { + "epoch": 0.7075351336257324, + "grad_norm": 0.86328125, + "learning_rate": 0.00012762433231321092, + "loss": 0.8418, + "step": 27555 + }, + { + "epoch": 0.7075608108216542, + "grad_norm": 0.8359375, + "learning_rate": 0.0001276200417962159, + "loss": 0.7854, + "step": 27556 + }, + { + "epoch": 0.7075864880175761, + "grad_norm": 0.7890625, + "learning_rate": 0.00012761575122417665, + "loss": 1.0013, + "step": 27557 + }, + { + "epoch": 0.7076121652134979, + "grad_norm": 0.74609375, + "learning_rate": 0.0001276114605971017, + "loss": 0.7969, + "step": 27558 + }, + { + "epoch": 0.7076378424094196, + "grad_norm": 0.80859375, + "learning_rate": 0.0001276071699149996, + "loss": 0.7566, + "step": 27559 + }, + { + "epoch": 0.7076635196053415, + "grad_norm": 0.81640625, + "learning_rate": 0.00012760287917787887, + "loss": 0.8809, + "step": 27560 + }, + { + "epoch": 0.7076891968012633, + "grad_norm": 0.796875, + "learning_rate": 0.00012759858838574814, + "loss": 0.8778, + "step": 27561 + }, + { + "epoch": 0.7077148739971851, + "grad_norm": 0.76953125, + "learning_rate": 0.00012759429753861587, + "loss": 0.8022, + "step": 27562 + }, + { + "epoch": 0.707740551193107, + "grad_norm": 0.8359375, + "learning_rate": 0.0001275900066364907, + "loss": 0.8393, + "step": 27563 + }, + { + "epoch": 0.7077662283890288, + "grad_norm": 0.88671875, + "learning_rate": 0.00012758571567938115, + "loss": 0.9885, + "step": 27564 + }, + { + "epoch": 0.7077919055849506, + "grad_norm": 0.7890625, + "learning_rate": 0.00012758142466729572, + "loss": 0.8553, + "step": 27565 + }, + { + "epoch": 0.7078175827808724, + "grad_norm": 0.75, + "learning_rate": 0.00012757713360024304, + "loss": 0.736, + "step": 27566 + }, + { + "epoch": 0.7078432599767942, + "grad_norm": 0.7109375, + "learning_rate": 0.0001275728424782316, + "loss": 0.771, + "step": 27567 + }, + { + "epoch": 0.707868937172716, + "grad_norm": 0.75, + "learning_rate": 0.00012756855130126996, + "loss": 0.7653, + "step": 27568 + }, + { + "epoch": 0.7078946143686379, + "grad_norm": 0.79296875, + "learning_rate": 0.0001275642600693667, + "loss": 0.8526, + "step": 27569 + }, + { + "epoch": 0.7079202915645597, + "grad_norm": 0.89453125, + "learning_rate": 0.00012755996878253037, + "loss": 0.915, + "step": 27570 + }, + { + "epoch": 0.7079459687604815, + "grad_norm": 0.72265625, + "learning_rate": 0.00012755567744076953, + "loss": 0.8592, + "step": 27571 + }, + { + "epoch": 0.7079716459564034, + "grad_norm": 0.81640625, + "learning_rate": 0.0001275513860440927, + "loss": 0.8111, + "step": 27572 + }, + { + "epoch": 0.7079973231523251, + "grad_norm": 0.79296875, + "learning_rate": 0.0001275470945925084, + "loss": 0.7868, + "step": 27573 + }, + { + "epoch": 0.708023000348247, + "grad_norm": 0.82421875, + "learning_rate": 0.0001275428030860253, + "loss": 0.8195, + "step": 27574 + }, + { + "epoch": 0.7080486775441688, + "grad_norm": 0.7890625, + "learning_rate": 0.00012753851152465183, + "loss": 0.8459, + "step": 27575 + }, + { + "epoch": 0.7080743547400906, + "grad_norm": 0.73828125, + "learning_rate": 0.0001275342199083966, + "loss": 0.8034, + "step": 27576 + }, + { + "epoch": 0.7081000319360125, + "grad_norm": 0.82421875, + "learning_rate": 0.00012752992823726817, + "loss": 0.8003, + "step": 27577 + }, + { + "epoch": 0.7081257091319343, + "grad_norm": 0.84765625, + "learning_rate": 0.00012752563651127508, + "loss": 0.8026, + "step": 27578 + }, + { + "epoch": 0.708151386327856, + "grad_norm": 0.734375, + "learning_rate": 0.0001275213447304259, + "loss": 0.6254, + "step": 27579 + }, + { + "epoch": 0.7081770635237778, + "grad_norm": 0.7109375, + "learning_rate": 0.00012751705289472916, + "loss": 0.7673, + "step": 27580 + }, + { + "epoch": 0.7082027407196997, + "grad_norm": 0.79296875, + "learning_rate": 0.00012751276100419337, + "loss": 0.8615, + "step": 27581 + }, + { + "epoch": 0.7082284179156215, + "grad_norm": 0.79296875, + "learning_rate": 0.00012750846905882715, + "loss": 0.809, + "step": 27582 + }, + { + "epoch": 0.7082540951115434, + "grad_norm": 0.8046875, + "learning_rate": 0.0001275041770586391, + "loss": 0.7856, + "step": 27583 + }, + { + "epoch": 0.7082797723074652, + "grad_norm": 0.7890625, + "learning_rate": 0.00012749988500363764, + "loss": 0.7984, + "step": 27584 + }, + { + "epoch": 0.708305449503387, + "grad_norm": 0.7578125, + "learning_rate": 0.00012749559289383143, + "loss": 0.8364, + "step": 27585 + }, + { + "epoch": 0.7083311266993088, + "grad_norm": 0.80859375, + "learning_rate": 0.00012749130072922896, + "loss": 0.6848, + "step": 27586 + }, + { + "epoch": 0.7083568038952306, + "grad_norm": 0.7890625, + "learning_rate": 0.00012748700850983882, + "loss": 0.8544, + "step": 27587 + }, + { + "epoch": 0.7083824810911524, + "grad_norm": 0.76171875, + "learning_rate": 0.0001274827162356696, + "loss": 0.9597, + "step": 27588 + }, + { + "epoch": 0.7084081582870743, + "grad_norm": 0.80859375, + "learning_rate": 0.00012747842390672972, + "loss": 0.7614, + "step": 27589 + }, + { + "epoch": 0.7084338354829961, + "grad_norm": 0.84765625, + "learning_rate": 0.0001274741315230279, + "loss": 0.9058, + "step": 27590 + }, + { + "epoch": 0.7084595126789179, + "grad_norm": 0.7265625, + "learning_rate": 0.00012746983908457258, + "loss": 0.877, + "step": 27591 + }, + { + "epoch": 0.7084851898748398, + "grad_norm": 0.8515625, + "learning_rate": 0.00012746554659137237, + "loss": 0.8316, + "step": 27592 + }, + { + "epoch": 0.7085108670707615, + "grad_norm": 0.76171875, + "learning_rate": 0.0001274612540434358, + "loss": 0.8177, + "step": 27593 + }, + { + "epoch": 0.7085365442666833, + "grad_norm": 0.77734375, + "learning_rate": 0.00012745696144077142, + "loss": 0.8284, + "step": 27594 + }, + { + "epoch": 0.7085622214626052, + "grad_norm": 0.91796875, + "learning_rate": 0.0001274526687833878, + "loss": 0.9127, + "step": 27595 + }, + { + "epoch": 0.708587898658527, + "grad_norm": 0.75, + "learning_rate": 0.00012744837607129352, + "loss": 0.7009, + "step": 27596 + }, + { + "epoch": 0.7086135758544488, + "grad_norm": 0.7890625, + "learning_rate": 0.00012744408330449708, + "loss": 0.9618, + "step": 27597 + }, + { + "epoch": 0.7086392530503707, + "grad_norm": 0.79296875, + "learning_rate": 0.00012743979048300707, + "loss": 0.7447, + "step": 27598 + }, + { + "epoch": 0.7086649302462924, + "grad_norm": 0.75390625, + "learning_rate": 0.00012743549760683203, + "loss": 0.7246, + "step": 27599 + }, + { + "epoch": 0.7086906074422142, + "grad_norm": 0.8671875, + "learning_rate": 0.00012743120467598052, + "loss": 0.9863, + "step": 27600 + }, + { + "epoch": 0.7087162846381361, + "grad_norm": 0.75, + "learning_rate": 0.0001274269116904611, + "loss": 0.8134, + "step": 27601 + }, + { + "epoch": 0.7087419618340579, + "grad_norm": 0.76171875, + "learning_rate": 0.00012742261865028232, + "loss": 0.7738, + "step": 27602 + }, + { + "epoch": 0.7087676390299797, + "grad_norm": 0.76171875, + "learning_rate": 0.00012741832555545273, + "loss": 0.8125, + "step": 27603 + }, + { + "epoch": 0.7087933162259016, + "grad_norm": 0.80859375, + "learning_rate": 0.00012741403240598093, + "loss": 0.8844, + "step": 27604 + }, + { + "epoch": 0.7088189934218234, + "grad_norm": 0.76171875, + "learning_rate": 0.00012740973920187537, + "loss": 0.9122, + "step": 27605 + }, + { + "epoch": 0.7088446706177451, + "grad_norm": 0.78125, + "learning_rate": 0.00012740544594314475, + "loss": 0.7789, + "step": 27606 + }, + { + "epoch": 0.708870347813667, + "grad_norm": 0.77734375, + "learning_rate": 0.00012740115262979752, + "loss": 0.7547, + "step": 27607 + }, + { + "epoch": 0.7088960250095888, + "grad_norm": 0.83984375, + "learning_rate": 0.00012739685926184223, + "loss": 0.8416, + "step": 27608 + }, + { + "epoch": 0.7089217022055107, + "grad_norm": 0.7265625, + "learning_rate": 0.00012739256583928752, + "loss": 0.8204, + "step": 27609 + }, + { + "epoch": 0.7089473794014325, + "grad_norm": 0.78125, + "learning_rate": 0.00012738827236214193, + "loss": 0.8267, + "step": 27610 + }, + { + "epoch": 0.7089730565973543, + "grad_norm": 0.73046875, + "learning_rate": 0.00012738397883041392, + "loss": 0.7241, + "step": 27611 + }, + { + "epoch": 0.7089987337932762, + "grad_norm": 0.76953125, + "learning_rate": 0.00012737968524411216, + "loss": 0.8131, + "step": 27612 + }, + { + "epoch": 0.7090244109891979, + "grad_norm": 0.83984375, + "learning_rate": 0.00012737539160324513, + "loss": 0.9167, + "step": 27613 + }, + { + "epoch": 0.7090500881851197, + "grad_norm": 0.8359375, + "learning_rate": 0.00012737109790782146, + "loss": 0.8137, + "step": 27614 + }, + { + "epoch": 0.7090757653810416, + "grad_norm": 0.76171875, + "learning_rate": 0.00012736680415784962, + "loss": 0.7348, + "step": 27615 + }, + { + "epoch": 0.7091014425769634, + "grad_norm": 0.72265625, + "learning_rate": 0.0001273625103533382, + "loss": 0.7768, + "step": 27616 + }, + { + "epoch": 0.7091271197728852, + "grad_norm": 0.8203125, + "learning_rate": 0.00012735821649429582, + "loss": 0.8839, + "step": 27617 + }, + { + "epoch": 0.7091527969688071, + "grad_norm": 0.94921875, + "learning_rate": 0.00012735392258073097, + "loss": 0.8879, + "step": 27618 + }, + { + "epoch": 0.7091784741647288, + "grad_norm": 0.828125, + "learning_rate": 0.0001273496286126522, + "loss": 0.8452, + "step": 27619 + }, + { + "epoch": 0.7092041513606506, + "grad_norm": 0.8125, + "learning_rate": 0.00012734533459006812, + "loss": 0.6916, + "step": 27620 + }, + { + "epoch": 0.7092298285565725, + "grad_norm": 0.7578125, + "learning_rate": 0.00012734104051298722, + "loss": 0.8197, + "step": 27621 + }, + { + "epoch": 0.7092555057524943, + "grad_norm": 0.9140625, + "learning_rate": 0.00012733674638141813, + "loss": 0.7827, + "step": 27622 + }, + { + "epoch": 0.7092811829484161, + "grad_norm": 0.96484375, + "learning_rate": 0.00012733245219536939, + "loss": 0.9173, + "step": 27623 + }, + { + "epoch": 0.709306860144338, + "grad_norm": 0.79296875, + "learning_rate": 0.0001273281579548495, + "loss": 0.8022, + "step": 27624 + }, + { + "epoch": 0.7093325373402598, + "grad_norm": 0.79296875, + "learning_rate": 0.00012732386365986708, + "loss": 0.8089, + "step": 27625 + }, + { + "epoch": 0.7093582145361815, + "grad_norm": 0.7734375, + "learning_rate": 0.00012731956931043064, + "loss": 0.7762, + "step": 27626 + }, + { + "epoch": 0.7093838917321034, + "grad_norm": 0.7265625, + "learning_rate": 0.0001273152749065488, + "loss": 0.7254, + "step": 27627 + }, + { + "epoch": 0.7094095689280252, + "grad_norm": 0.79296875, + "learning_rate": 0.0001273109804482301, + "loss": 0.8523, + "step": 27628 + }, + { + "epoch": 0.709435246123947, + "grad_norm": 0.83984375, + "learning_rate": 0.00012730668593548302, + "loss": 0.823, + "step": 27629 + }, + { + "epoch": 0.7094609233198689, + "grad_norm": 0.765625, + "learning_rate": 0.00012730239136831624, + "loss": 0.7734, + "step": 27630 + }, + { + "epoch": 0.7094866005157907, + "grad_norm": 0.765625, + "learning_rate": 0.00012729809674673822, + "loss": 0.7644, + "step": 27631 + }, + { + "epoch": 0.7095122777117125, + "grad_norm": 1.0625, + "learning_rate": 0.00012729380207075758, + "loss": 0.9395, + "step": 27632 + }, + { + "epoch": 0.7095379549076343, + "grad_norm": 0.7421875, + "learning_rate": 0.0001272895073403829, + "loss": 0.7792, + "step": 27633 + }, + { + "epoch": 0.7095636321035561, + "grad_norm": 0.82421875, + "learning_rate": 0.0001272852125556226, + "loss": 0.8839, + "step": 27634 + }, + { + "epoch": 0.7095893092994779, + "grad_norm": 0.8046875, + "learning_rate": 0.0001272809177164854, + "loss": 0.8078, + "step": 27635 + }, + { + "epoch": 0.7096149864953998, + "grad_norm": 0.80078125, + "learning_rate": 0.00012727662282297977, + "loss": 0.9905, + "step": 27636 + }, + { + "epoch": 0.7096406636913216, + "grad_norm": 0.79296875, + "learning_rate": 0.00012727232787511432, + "loss": 0.8326, + "step": 27637 + }, + { + "epoch": 0.7096663408872435, + "grad_norm": 0.8125, + "learning_rate": 0.0001272680328728976, + "loss": 0.966, + "step": 27638 + }, + { + "epoch": 0.7096920180831652, + "grad_norm": 0.8046875, + "learning_rate": 0.0001272637378163381, + "loss": 0.7764, + "step": 27639 + }, + { + "epoch": 0.709717695279087, + "grad_norm": 0.8359375, + "learning_rate": 0.00012725944270544444, + "loss": 0.8511, + "step": 27640 + }, + { + "epoch": 0.7097433724750088, + "grad_norm": 0.8359375, + "learning_rate": 0.0001272551475402252, + "loss": 0.8657, + "step": 27641 + }, + { + "epoch": 0.7097690496709307, + "grad_norm": 0.7734375, + "learning_rate": 0.00012725085232068888, + "loss": 0.7589, + "step": 27642 + }, + { + "epoch": 0.7097947268668525, + "grad_norm": 0.75, + "learning_rate": 0.00012724655704684414, + "loss": 0.7533, + "step": 27643 + }, + { + "epoch": 0.7098204040627744, + "grad_norm": 0.7578125, + "learning_rate": 0.00012724226171869943, + "loss": 0.8245, + "step": 27644 + }, + { + "epoch": 0.7098460812586962, + "grad_norm": 0.796875, + "learning_rate": 0.00012723796633626336, + "loss": 0.7941, + "step": 27645 + }, + { + "epoch": 0.7098717584546179, + "grad_norm": 0.7109375, + "learning_rate": 0.00012723367089954448, + "loss": 0.7686, + "step": 27646 + }, + { + "epoch": 0.7098974356505398, + "grad_norm": 0.828125, + "learning_rate": 0.00012722937540855136, + "loss": 0.8846, + "step": 27647 + }, + { + "epoch": 0.7099231128464616, + "grad_norm": 0.73046875, + "learning_rate": 0.00012722507986329254, + "loss": 0.8758, + "step": 27648 + }, + { + "epoch": 0.7099487900423834, + "grad_norm": 0.78515625, + "learning_rate": 0.00012722078426377658, + "loss": 0.9541, + "step": 27649 + }, + { + "epoch": 0.7099744672383053, + "grad_norm": 0.7734375, + "learning_rate": 0.0001272164886100121, + "loss": 0.6981, + "step": 27650 + }, + { + "epoch": 0.7100001444342271, + "grad_norm": 0.6953125, + "learning_rate": 0.0001272121929020076, + "loss": 0.6826, + "step": 27651 + }, + { + "epoch": 0.7100258216301489, + "grad_norm": 0.80859375, + "learning_rate": 0.00012720789713977163, + "loss": 0.9983, + "step": 27652 + }, + { + "epoch": 0.7100514988260707, + "grad_norm": 0.75, + "learning_rate": 0.00012720360132331278, + "loss": 0.8366, + "step": 27653 + }, + { + "epoch": 0.7100771760219925, + "grad_norm": 0.78515625, + "learning_rate": 0.00012719930545263965, + "loss": 0.9011, + "step": 27654 + }, + { + "epoch": 0.7101028532179143, + "grad_norm": 0.76953125, + "learning_rate": 0.00012719500952776075, + "loss": 0.9565, + "step": 27655 + }, + { + "epoch": 0.7101285304138362, + "grad_norm": 0.71484375, + "learning_rate": 0.00012719071354868464, + "loss": 0.7432, + "step": 27656 + }, + { + "epoch": 0.710154207609758, + "grad_norm": 0.84375, + "learning_rate": 0.0001271864175154199, + "loss": 0.878, + "step": 27657 + }, + { + "epoch": 0.7101798848056798, + "grad_norm": 0.78125, + "learning_rate": 0.00012718212142797506, + "loss": 1.033, + "step": 27658 + }, + { + "epoch": 0.7102055620016016, + "grad_norm": 0.75390625, + "learning_rate": 0.00012717782528635872, + "loss": 0.8244, + "step": 27659 + }, + { + "epoch": 0.7102312391975234, + "grad_norm": 0.8203125, + "learning_rate": 0.00012717352909057946, + "loss": 0.8755, + "step": 27660 + }, + { + "epoch": 0.7102569163934452, + "grad_norm": 0.80078125, + "learning_rate": 0.0001271692328406458, + "loss": 0.8416, + "step": 27661 + }, + { + "epoch": 0.7102825935893671, + "grad_norm": 0.8671875, + "learning_rate": 0.0001271649365365663, + "loss": 0.9164, + "step": 27662 + }, + { + "epoch": 0.7103082707852889, + "grad_norm": 0.78515625, + "learning_rate": 0.0001271606401783495, + "loss": 0.7898, + "step": 27663 + }, + { + "epoch": 0.7103339479812107, + "grad_norm": 0.73046875, + "learning_rate": 0.00012715634376600407, + "loss": 0.8516, + "step": 27664 + }, + { + "epoch": 0.7103596251771326, + "grad_norm": 0.8203125, + "learning_rate": 0.00012715204729953848, + "loss": 0.9209, + "step": 27665 + }, + { + "epoch": 0.7103853023730543, + "grad_norm": 0.71875, + "learning_rate": 0.00012714775077896125, + "loss": 0.7688, + "step": 27666 + }, + { + "epoch": 0.7104109795689761, + "grad_norm": 0.7890625, + "learning_rate": 0.0001271434542042811, + "loss": 0.9712, + "step": 27667 + }, + { + "epoch": 0.710436656764898, + "grad_norm": 0.796875, + "learning_rate": 0.00012713915757550645, + "loss": 0.878, + "step": 27668 + }, + { + "epoch": 0.7104623339608198, + "grad_norm": 0.765625, + "learning_rate": 0.00012713486089264587, + "loss": 0.8712, + "step": 27669 + }, + { + "epoch": 0.7104880111567417, + "grad_norm": 0.77734375, + "learning_rate": 0.00012713056415570803, + "loss": 0.8606, + "step": 27670 + }, + { + "epoch": 0.7105136883526635, + "grad_norm": 0.74609375, + "learning_rate": 0.0001271262673647014, + "loss": 0.8288, + "step": 27671 + }, + { + "epoch": 0.7105393655485853, + "grad_norm": 0.77734375, + "learning_rate": 0.00012712197051963458, + "loss": 0.9312, + "step": 27672 + }, + { + "epoch": 0.710565042744507, + "grad_norm": 0.6796875, + "learning_rate": 0.0001271176736205161, + "loss": 0.743, + "step": 27673 + }, + { + "epoch": 0.7105907199404289, + "grad_norm": 0.81640625, + "learning_rate": 0.00012711337666735455, + "loss": 1.0312, + "step": 27674 + }, + { + "epoch": 0.7106163971363507, + "grad_norm": 0.77734375, + "learning_rate": 0.0001271090796601585, + "loss": 0.7654, + "step": 27675 + }, + { + "epoch": 0.7106420743322726, + "grad_norm": 0.89453125, + "learning_rate": 0.0001271047825989365, + "loss": 0.742, + "step": 27676 + }, + { + "epoch": 0.7106677515281944, + "grad_norm": 0.73828125, + "learning_rate": 0.00012710048548369713, + "loss": 0.8489, + "step": 27677 + }, + { + "epoch": 0.7106934287241162, + "grad_norm": 0.70703125, + "learning_rate": 0.00012709618831444893, + "loss": 0.7765, + "step": 27678 + }, + { + "epoch": 0.710719105920038, + "grad_norm": 0.79296875, + "learning_rate": 0.00012709189109120044, + "loss": 1.006, + "step": 27679 + }, + { + "epoch": 0.7107447831159598, + "grad_norm": 1.4140625, + "learning_rate": 0.0001270875938139603, + "loss": 0.8726, + "step": 27680 + }, + { + "epoch": 0.7107704603118816, + "grad_norm": 0.79296875, + "learning_rate": 0.00012708329648273705, + "loss": 0.7837, + "step": 27681 + }, + { + "epoch": 0.7107961375078035, + "grad_norm": 0.8359375, + "learning_rate": 0.0001270789990975392, + "loss": 0.7822, + "step": 27682 + }, + { + "epoch": 0.7108218147037253, + "grad_norm": 0.84765625, + "learning_rate": 0.0001270747016583754, + "loss": 0.7022, + "step": 27683 + }, + { + "epoch": 0.7108474918996471, + "grad_norm": 0.73046875, + "learning_rate": 0.00012707040416525412, + "loss": 0.7603, + "step": 27684 + }, + { + "epoch": 0.710873169095569, + "grad_norm": 0.93359375, + "learning_rate": 0.00012706610661818396, + "loss": 0.8589, + "step": 27685 + }, + { + "epoch": 0.7108988462914907, + "grad_norm": 0.7265625, + "learning_rate": 0.0001270618090171735, + "loss": 0.8469, + "step": 27686 + }, + { + "epoch": 0.7109245234874125, + "grad_norm": 0.7578125, + "learning_rate": 0.00012705751136223129, + "loss": 0.7194, + "step": 27687 + }, + { + "epoch": 0.7109502006833344, + "grad_norm": 0.8671875, + "learning_rate": 0.00012705321365336593, + "loss": 0.8006, + "step": 27688 + }, + { + "epoch": 0.7109758778792562, + "grad_norm": 0.7890625, + "learning_rate": 0.000127048915890586, + "loss": 0.7512, + "step": 27689 + }, + { + "epoch": 0.711001555075178, + "grad_norm": 0.79296875, + "learning_rate": 0.00012704461807389996, + "loss": 0.8143, + "step": 27690 + }, + { + "epoch": 0.7110272322710999, + "grad_norm": 0.83203125, + "learning_rate": 0.00012704032020331645, + "loss": 1.0095, + "step": 27691 + }, + { + "epoch": 0.7110529094670216, + "grad_norm": 0.7421875, + "learning_rate": 0.00012703602227884405, + "loss": 1.0343, + "step": 27692 + }, + { + "epoch": 0.7110785866629434, + "grad_norm": 0.79296875, + "learning_rate": 0.00012703172430049124, + "loss": 0.8144, + "step": 27693 + }, + { + "epoch": 0.7111042638588653, + "grad_norm": 0.71484375, + "learning_rate": 0.00012702742626826669, + "loss": 0.8402, + "step": 27694 + }, + { + "epoch": 0.7111299410547871, + "grad_norm": 0.76171875, + "learning_rate": 0.0001270231281821789, + "loss": 0.7118, + "step": 27695 + }, + { + "epoch": 0.7111556182507089, + "grad_norm": 0.7734375, + "learning_rate": 0.0001270188300422365, + "loss": 0.8158, + "step": 27696 + }, + { + "epoch": 0.7111812954466308, + "grad_norm": 0.80078125, + "learning_rate": 0.00012701453184844801, + "loss": 0.8032, + "step": 27697 + }, + { + "epoch": 0.7112069726425526, + "grad_norm": 0.76953125, + "learning_rate": 0.00012701023360082195, + "loss": 0.8701, + "step": 27698 + }, + { + "epoch": 0.7112326498384743, + "grad_norm": 0.7265625, + "learning_rate": 0.00012700593529936695, + "loss": 0.8289, + "step": 27699 + }, + { + "epoch": 0.7112583270343962, + "grad_norm": 0.83203125, + "learning_rate": 0.00012700163694409159, + "loss": 0.9478, + "step": 27700 + }, + { + "epoch": 0.711284004230318, + "grad_norm": 0.79296875, + "learning_rate": 0.00012699733853500434, + "loss": 0.9622, + "step": 27701 + }, + { + "epoch": 0.7113096814262398, + "grad_norm": 0.734375, + "learning_rate": 0.0001269930400721139, + "loss": 0.81, + "step": 27702 + }, + { + "epoch": 0.7113353586221617, + "grad_norm": 0.74609375, + "learning_rate": 0.00012698874155542874, + "loss": 0.8943, + "step": 27703 + }, + { + "epoch": 0.7113610358180835, + "grad_norm": 0.89453125, + "learning_rate": 0.00012698444298495746, + "loss": 0.8409, + "step": 27704 + }, + { + "epoch": 0.7113867130140054, + "grad_norm": 0.84375, + "learning_rate": 0.00012698014436070864, + "loss": 0.7887, + "step": 27705 + }, + { + "epoch": 0.7114123902099271, + "grad_norm": 0.828125, + "learning_rate": 0.0001269758456826908, + "loss": 0.8764, + "step": 27706 + }, + { + "epoch": 0.7114380674058489, + "grad_norm": 0.80078125, + "learning_rate": 0.00012697154695091257, + "loss": 0.8343, + "step": 27707 + }, + { + "epoch": 0.7114637446017708, + "grad_norm": 0.875, + "learning_rate": 0.00012696724816538247, + "loss": 0.8419, + "step": 27708 + }, + { + "epoch": 0.7114894217976926, + "grad_norm": 0.734375, + "learning_rate": 0.00012696294932610906, + "loss": 0.7524, + "step": 27709 + }, + { + "epoch": 0.7115150989936144, + "grad_norm": 0.79296875, + "learning_rate": 0.00012695865043310095, + "loss": 0.8757, + "step": 27710 + }, + { + "epoch": 0.7115407761895363, + "grad_norm": 0.84765625, + "learning_rate": 0.00012695435148636666, + "loss": 0.8607, + "step": 27711 + }, + { + "epoch": 0.711566453385458, + "grad_norm": 0.71875, + "learning_rate": 0.0001269500524859148, + "loss": 0.7702, + "step": 27712 + }, + { + "epoch": 0.7115921305813798, + "grad_norm": 0.77734375, + "learning_rate": 0.00012694575343175391, + "loss": 0.8466, + "step": 27713 + }, + { + "epoch": 0.7116178077773017, + "grad_norm": 0.79296875, + "learning_rate": 0.00012694145432389257, + "loss": 0.8171, + "step": 27714 + }, + { + "epoch": 0.7116434849732235, + "grad_norm": 0.734375, + "learning_rate": 0.00012693715516233934, + "loss": 0.7184, + "step": 27715 + }, + { + "epoch": 0.7116691621691453, + "grad_norm": 0.71484375, + "learning_rate": 0.0001269328559471028, + "loss": 0.7196, + "step": 27716 + }, + { + "epoch": 0.7116948393650672, + "grad_norm": 0.734375, + "learning_rate": 0.00012692855667819156, + "loss": 0.7896, + "step": 27717 + }, + { + "epoch": 0.711720516560989, + "grad_norm": 0.796875, + "learning_rate": 0.00012692425735561408, + "loss": 0.9383, + "step": 27718 + }, + { + "epoch": 0.7117461937569107, + "grad_norm": 0.98828125, + "learning_rate": 0.00012691995797937898, + "loss": 0.816, + "step": 27719 + }, + { + "epoch": 0.7117718709528326, + "grad_norm": 0.7578125, + "learning_rate": 0.00012691565854949487, + "loss": 0.8402, + "step": 27720 + }, + { + "epoch": 0.7117975481487544, + "grad_norm": 0.828125, + "learning_rate": 0.00012691135906597027, + "loss": 0.9498, + "step": 27721 + }, + { + "epoch": 0.7118232253446762, + "grad_norm": 0.7421875, + "learning_rate": 0.00012690705952881376, + "loss": 0.847, + "step": 27722 + }, + { + "epoch": 0.7118489025405981, + "grad_norm": 0.71484375, + "learning_rate": 0.00012690275993803394, + "loss": 0.8325, + "step": 27723 + }, + { + "epoch": 0.7118745797365199, + "grad_norm": 0.7265625, + "learning_rate": 0.0001268984602936393, + "loss": 0.8015, + "step": 27724 + }, + { + "epoch": 0.7119002569324417, + "grad_norm": 0.76171875, + "learning_rate": 0.00012689416059563847, + "loss": 0.7514, + "step": 27725 + }, + { + "epoch": 0.7119259341283635, + "grad_norm": 0.7890625, + "learning_rate": 0.00012688986084404005, + "loss": 0.7829, + "step": 27726 + }, + { + "epoch": 0.7119516113242853, + "grad_norm": 0.81640625, + "learning_rate": 0.0001268855610388525, + "loss": 0.6726, + "step": 27727 + }, + { + "epoch": 0.7119772885202071, + "grad_norm": 0.71484375, + "learning_rate": 0.0001268812611800845, + "loss": 0.8424, + "step": 27728 + }, + { + "epoch": 0.712002965716129, + "grad_norm": 0.7109375, + "learning_rate": 0.0001268769612677446, + "loss": 0.7067, + "step": 27729 + }, + { + "epoch": 0.7120286429120508, + "grad_norm": 0.96484375, + "learning_rate": 0.0001268726613018413, + "loss": 0.8295, + "step": 27730 + }, + { + "epoch": 0.7120543201079726, + "grad_norm": 0.7578125, + "learning_rate": 0.00012686836128238322, + "loss": 0.8597, + "step": 27731 + }, + { + "epoch": 0.7120799973038944, + "grad_norm": 0.76171875, + "learning_rate": 0.00012686406120937893, + "loss": 0.8251, + "step": 27732 + }, + { + "epoch": 0.7121056744998162, + "grad_norm": 0.84375, + "learning_rate": 0.00012685976108283697, + "loss": 0.8726, + "step": 27733 + }, + { + "epoch": 0.712131351695738, + "grad_norm": 0.7578125, + "learning_rate": 0.00012685546090276595, + "loss": 0.9316, + "step": 27734 + }, + { + "epoch": 0.7121570288916599, + "grad_norm": 0.76953125, + "learning_rate": 0.00012685116066917443, + "loss": 0.8041, + "step": 27735 + }, + { + "epoch": 0.7121827060875817, + "grad_norm": 0.80078125, + "learning_rate": 0.00012684686038207097, + "loss": 0.8514, + "step": 27736 + }, + { + "epoch": 0.7122083832835036, + "grad_norm": 0.8515625, + "learning_rate": 0.00012684256004146413, + "loss": 0.9287, + "step": 27737 + }, + { + "epoch": 0.7122340604794254, + "grad_norm": 0.75, + "learning_rate": 0.0001268382596473625, + "loss": 0.8345, + "step": 27738 + }, + { + "epoch": 0.7122597376753471, + "grad_norm": 0.75390625, + "learning_rate": 0.00012683395919977465, + "loss": 0.7524, + "step": 27739 + }, + { + "epoch": 0.712285414871269, + "grad_norm": 0.77734375, + "learning_rate": 0.00012682965869870912, + "loss": 0.781, + "step": 27740 + }, + { + "epoch": 0.7123110920671908, + "grad_norm": 0.83984375, + "learning_rate": 0.0001268253581441745, + "loss": 0.7735, + "step": 27741 + }, + { + "epoch": 0.7123367692631126, + "grad_norm": 0.83203125, + "learning_rate": 0.0001268210575361794, + "loss": 0.8083, + "step": 27742 + }, + { + "epoch": 0.7123624464590345, + "grad_norm": 0.75390625, + "learning_rate": 0.00012681675687473235, + "loss": 0.8561, + "step": 27743 + }, + { + "epoch": 0.7123881236549563, + "grad_norm": 0.84765625, + "learning_rate": 0.0001268124561598419, + "loss": 0.8611, + "step": 27744 + }, + { + "epoch": 0.7124138008508781, + "grad_norm": 0.765625, + "learning_rate": 0.00012680815539151667, + "loss": 0.7203, + "step": 27745 + }, + { + "epoch": 0.7124394780467999, + "grad_norm": 0.90234375, + "learning_rate": 0.0001268038545697652, + "loss": 0.8054, + "step": 27746 + }, + { + "epoch": 0.7124651552427217, + "grad_norm": 0.83203125, + "learning_rate": 0.00012679955369459606, + "loss": 0.8479, + "step": 27747 + }, + { + "epoch": 0.7124908324386435, + "grad_norm": 0.76953125, + "learning_rate": 0.00012679525276601786, + "loss": 0.8711, + "step": 27748 + }, + { + "epoch": 0.7125165096345654, + "grad_norm": 0.7734375, + "learning_rate": 0.0001267909517840391, + "loss": 0.8158, + "step": 27749 + }, + { + "epoch": 0.7125421868304872, + "grad_norm": 0.77734375, + "learning_rate": 0.00012678665074866845, + "loss": 0.8653, + "step": 27750 + }, + { + "epoch": 0.712567864026409, + "grad_norm": 0.76953125, + "learning_rate": 0.00012678234965991437, + "loss": 0.9317, + "step": 27751 + }, + { + "epoch": 0.7125935412223308, + "grad_norm": 0.69921875, + "learning_rate": 0.0001267780485177855, + "loss": 0.7509, + "step": 27752 + }, + { + "epoch": 0.7126192184182526, + "grad_norm": 0.765625, + "learning_rate": 0.0001267737473222904, + "loss": 0.9129, + "step": 27753 + }, + { + "epoch": 0.7126448956141744, + "grad_norm": 0.71484375, + "learning_rate": 0.00012676944607343765, + "loss": 0.7227, + "step": 27754 + }, + { + "epoch": 0.7126705728100963, + "grad_norm": 0.796875, + "learning_rate": 0.00012676514477123584, + "loss": 0.851, + "step": 27755 + }, + { + "epoch": 0.7126962500060181, + "grad_norm": 0.72265625, + "learning_rate": 0.00012676084341569342, + "loss": 0.7137, + "step": 27756 + }, + { + "epoch": 0.7127219272019399, + "grad_norm": 0.79296875, + "learning_rate": 0.00012675654200681912, + "loss": 0.8741, + "step": 27757 + }, + { + "epoch": 0.7127476043978618, + "grad_norm": 0.78125, + "learning_rate": 0.00012675224054462146, + "loss": 0.8655, + "step": 27758 + }, + { + "epoch": 0.7127732815937835, + "grad_norm": 0.79296875, + "learning_rate": 0.00012674793902910897, + "loss": 0.7653, + "step": 27759 + }, + { + "epoch": 0.7127989587897053, + "grad_norm": 0.7734375, + "learning_rate": 0.00012674363746029027, + "loss": 0.7827, + "step": 27760 + }, + { + "epoch": 0.7128246359856272, + "grad_norm": 0.7421875, + "learning_rate": 0.00012673933583817393, + "loss": 0.778, + "step": 27761 + }, + { + "epoch": 0.712850313181549, + "grad_norm": 0.78515625, + "learning_rate": 0.00012673503416276848, + "loss": 0.9418, + "step": 27762 + }, + { + "epoch": 0.7128759903774708, + "grad_norm": 0.81640625, + "learning_rate": 0.00012673073243408254, + "loss": 0.9155, + "step": 27763 + }, + { + "epoch": 0.7129016675733927, + "grad_norm": 0.82421875, + "learning_rate": 0.00012672643065212466, + "loss": 0.7603, + "step": 27764 + }, + { + "epoch": 0.7129273447693145, + "grad_norm": 0.74609375, + "learning_rate": 0.00012672212881690342, + "loss": 0.9146, + "step": 27765 + }, + { + "epoch": 0.7129530219652362, + "grad_norm": 0.73828125, + "learning_rate": 0.00012671782692842739, + "loss": 0.8152, + "step": 27766 + }, + { + "epoch": 0.7129786991611581, + "grad_norm": 0.75390625, + "learning_rate": 0.00012671352498670513, + "loss": 0.7405, + "step": 27767 + }, + { + "epoch": 0.7130043763570799, + "grad_norm": 0.76171875, + "learning_rate": 0.0001267092229917453, + "loss": 0.7766, + "step": 27768 + }, + { + "epoch": 0.7130300535530018, + "grad_norm": 0.83203125, + "learning_rate": 0.00012670492094355633, + "loss": 0.8866, + "step": 27769 + }, + { + "epoch": 0.7130557307489236, + "grad_norm": 0.796875, + "learning_rate": 0.00012670061884214687, + "loss": 0.9536, + "step": 27770 + }, + { + "epoch": 0.7130814079448454, + "grad_norm": 0.86328125, + "learning_rate": 0.0001266963166875255, + "loss": 0.7436, + "step": 27771 + }, + { + "epoch": 0.7131070851407671, + "grad_norm": 0.77734375, + "learning_rate": 0.00012669201447970075, + "loss": 0.777, + "step": 27772 + }, + { + "epoch": 0.713132762336689, + "grad_norm": 0.8203125, + "learning_rate": 0.00012668771221868125, + "loss": 0.8118, + "step": 27773 + }, + { + "epoch": 0.7131584395326108, + "grad_norm": 0.75390625, + "learning_rate": 0.0001266834099044756, + "loss": 0.8684, + "step": 27774 + }, + { + "epoch": 0.7131841167285327, + "grad_norm": 0.76171875, + "learning_rate": 0.00012667910753709225, + "loss": 0.7915, + "step": 27775 + }, + { + "epoch": 0.7132097939244545, + "grad_norm": 0.7734375, + "learning_rate": 0.00012667480511653988, + "loss": 0.9003, + "step": 27776 + }, + { + "epoch": 0.7132354711203763, + "grad_norm": 0.796875, + "learning_rate": 0.00012667050264282705, + "loss": 0.7529, + "step": 27777 + }, + { + "epoch": 0.7132611483162982, + "grad_norm": 0.6953125, + "learning_rate": 0.0001266662001159623, + "loss": 0.8259, + "step": 27778 + }, + { + "epoch": 0.7132868255122199, + "grad_norm": 0.85546875, + "learning_rate": 0.0001266618975359542, + "loss": 0.8324, + "step": 27779 + }, + { + "epoch": 0.7133125027081417, + "grad_norm": 0.96484375, + "learning_rate": 0.00012665759490281138, + "loss": 0.7331, + "step": 27780 + }, + { + "epoch": 0.7133381799040636, + "grad_norm": 0.7578125, + "learning_rate": 0.00012665329221654237, + "loss": 0.8479, + "step": 27781 + }, + { + "epoch": 0.7133638570999854, + "grad_norm": 0.8046875, + "learning_rate": 0.00012664898947715577, + "loss": 0.8477, + "step": 27782 + }, + { + "epoch": 0.7133895342959072, + "grad_norm": 0.8203125, + "learning_rate": 0.0001266446866846601, + "loss": 0.8443, + "step": 27783 + }, + { + "epoch": 0.7134152114918291, + "grad_norm": 0.70703125, + "learning_rate": 0.00012664038383906402, + "loss": 0.7232, + "step": 27784 + }, + { + "epoch": 0.7134408886877509, + "grad_norm": 0.80078125, + "learning_rate": 0.00012663608094037602, + "loss": 0.8184, + "step": 27785 + }, + { + "epoch": 0.7134665658836726, + "grad_norm": 0.8515625, + "learning_rate": 0.00012663177798860473, + "loss": 0.8444, + "step": 27786 + }, + { + "epoch": 0.7134922430795945, + "grad_norm": 0.72265625, + "learning_rate": 0.00012662747498375874, + "loss": 0.8404, + "step": 27787 + }, + { + "epoch": 0.7135179202755163, + "grad_norm": 0.8125, + "learning_rate": 0.00012662317192584655, + "loss": 0.8876, + "step": 27788 + }, + { + "epoch": 0.7135435974714381, + "grad_norm": 0.7734375, + "learning_rate": 0.00012661886881487684, + "loss": 0.7594, + "step": 27789 + }, + { + "epoch": 0.71356927466736, + "grad_norm": 0.765625, + "learning_rate": 0.00012661456565085812, + "loss": 0.7512, + "step": 27790 + }, + { + "epoch": 0.7135949518632818, + "grad_norm": 0.8203125, + "learning_rate": 0.00012661026243379893, + "loss": 0.7886, + "step": 27791 + }, + { + "epoch": 0.7136206290592035, + "grad_norm": 0.95703125, + "learning_rate": 0.00012660595916370792, + "loss": 0.7379, + "step": 27792 + }, + { + "epoch": 0.7136463062551254, + "grad_norm": 0.7890625, + "learning_rate": 0.00012660165584059361, + "loss": 0.7361, + "step": 27793 + }, + { + "epoch": 0.7136719834510472, + "grad_norm": 0.8125, + "learning_rate": 0.00012659735246446462, + "loss": 0.7467, + "step": 27794 + }, + { + "epoch": 0.713697660646969, + "grad_norm": 0.8203125, + "learning_rate": 0.00012659304903532955, + "loss": 0.921, + "step": 27795 + }, + { + "epoch": 0.7137233378428909, + "grad_norm": 0.73828125, + "learning_rate": 0.00012658874555319689, + "loss": 0.8094, + "step": 27796 + }, + { + "epoch": 0.7137490150388127, + "grad_norm": 0.78515625, + "learning_rate": 0.00012658444201807528, + "loss": 0.8026, + "step": 27797 + }, + { + "epoch": 0.7137746922347346, + "grad_norm": 0.69140625, + "learning_rate": 0.00012658013842997327, + "loss": 0.7832, + "step": 27798 + }, + { + "epoch": 0.7138003694306563, + "grad_norm": 0.72265625, + "learning_rate": 0.00012657583478889944, + "loss": 0.9466, + "step": 27799 + }, + { + "epoch": 0.7138260466265781, + "grad_norm": 0.80078125, + "learning_rate": 0.0001265715310948624, + "loss": 0.8318, + "step": 27800 + }, + { + "epoch": 0.7138517238225, + "grad_norm": 0.84765625, + "learning_rate": 0.00012656722734787068, + "loss": 0.9237, + "step": 27801 + }, + { + "epoch": 0.7138774010184218, + "grad_norm": 0.7734375, + "learning_rate": 0.00012656292354793286, + "loss": 0.8635, + "step": 27802 + }, + { + "epoch": 0.7139030782143436, + "grad_norm": 0.7578125, + "learning_rate": 0.00012655861969505756, + "loss": 0.9285, + "step": 27803 + }, + { + "epoch": 0.7139287554102655, + "grad_norm": 0.72265625, + "learning_rate": 0.0001265543157892533, + "loss": 0.793, + "step": 27804 + }, + { + "epoch": 0.7139544326061873, + "grad_norm": 0.7265625, + "learning_rate": 0.0001265500118305287, + "loss": 0.714, + "step": 27805 + }, + { + "epoch": 0.713980109802109, + "grad_norm": 0.7265625, + "learning_rate": 0.00012654570781889235, + "loss": 0.7752, + "step": 27806 + }, + { + "epoch": 0.7140057869980309, + "grad_norm": 0.75390625, + "learning_rate": 0.00012654140375435274, + "loss": 0.8225, + "step": 27807 + }, + { + "epoch": 0.7140314641939527, + "grad_norm": 0.77734375, + "learning_rate": 0.0001265370996369186, + "loss": 0.8587, + "step": 27808 + }, + { + "epoch": 0.7140571413898745, + "grad_norm": 0.8359375, + "learning_rate": 0.00012653279546659835, + "loss": 0.8018, + "step": 27809 + }, + { + "epoch": 0.7140828185857964, + "grad_norm": 0.734375, + "learning_rate": 0.00012652849124340066, + "loss": 0.8899, + "step": 27810 + }, + { + "epoch": 0.7141084957817182, + "grad_norm": 0.8046875, + "learning_rate": 0.00012652418696733407, + "loss": 0.899, + "step": 27811 + }, + { + "epoch": 0.7141341729776399, + "grad_norm": 0.73046875, + "learning_rate": 0.00012651988263840715, + "loss": 0.7512, + "step": 27812 + }, + { + "epoch": 0.7141598501735618, + "grad_norm": 0.80078125, + "learning_rate": 0.0001265155782566285, + "loss": 0.7852, + "step": 27813 + }, + { + "epoch": 0.7141855273694836, + "grad_norm": 0.84765625, + "learning_rate": 0.00012651127382200679, + "loss": 0.8367, + "step": 27814 + }, + { + "epoch": 0.7142112045654054, + "grad_norm": 0.8359375, + "learning_rate": 0.00012650696933455042, + "loss": 0.8794, + "step": 27815 + }, + { + "epoch": 0.7142368817613273, + "grad_norm": 0.703125, + "learning_rate": 0.00012650266479426807, + "loss": 0.7524, + "step": 27816 + }, + { + "epoch": 0.7142625589572491, + "grad_norm": 0.76953125, + "learning_rate": 0.00012649836020116832, + "loss": 0.8558, + "step": 27817 + }, + { + "epoch": 0.7142882361531709, + "grad_norm": 0.88671875, + "learning_rate": 0.00012649405555525972, + "loss": 0.7952, + "step": 27818 + }, + { + "epoch": 0.7143139133490927, + "grad_norm": 0.75, + "learning_rate": 0.00012648975085655085, + "loss": 0.7814, + "step": 27819 + }, + { + "epoch": 0.7143395905450145, + "grad_norm": 0.7890625, + "learning_rate": 0.0001264854461050503, + "loss": 0.9092, + "step": 27820 + }, + { + "epoch": 0.7143652677409363, + "grad_norm": 0.7890625, + "learning_rate": 0.00012648114130076667, + "loss": 0.8346, + "step": 27821 + }, + { + "epoch": 0.7143909449368582, + "grad_norm": 0.7734375, + "learning_rate": 0.00012647683644370853, + "loss": 0.9136, + "step": 27822 + }, + { + "epoch": 0.71441662213278, + "grad_norm": 0.74609375, + "learning_rate": 0.00012647253153388438, + "loss": 0.8321, + "step": 27823 + }, + { + "epoch": 0.7144422993287018, + "grad_norm": 0.890625, + "learning_rate": 0.0001264682265713029, + "loss": 0.7993, + "step": 27824 + }, + { + "epoch": 0.7144679765246237, + "grad_norm": 0.78515625, + "learning_rate": 0.00012646392155597267, + "loss": 0.9992, + "step": 27825 + }, + { + "epoch": 0.7144936537205454, + "grad_norm": 0.78515625, + "learning_rate": 0.0001264596164879022, + "loss": 0.917, + "step": 27826 + }, + { + "epoch": 0.7145193309164672, + "grad_norm": 0.83984375, + "learning_rate": 0.00012645531136710013, + "loss": 0.8211, + "step": 27827 + }, + { + "epoch": 0.7145450081123891, + "grad_norm": 0.7578125, + "learning_rate": 0.00012645100619357498, + "loss": 0.7921, + "step": 27828 + }, + { + "epoch": 0.7145706853083109, + "grad_norm": 0.7890625, + "learning_rate": 0.00012644670096733537, + "loss": 0.9801, + "step": 27829 + }, + { + "epoch": 0.7145963625042328, + "grad_norm": 0.7734375, + "learning_rate": 0.0001264423956883899, + "loss": 0.8694, + "step": 27830 + }, + { + "epoch": 0.7146220397001546, + "grad_norm": 0.80078125, + "learning_rate": 0.0001264380903567471, + "loss": 0.8199, + "step": 27831 + }, + { + "epoch": 0.7146477168960763, + "grad_norm": 0.8671875, + "learning_rate": 0.0001264337849724156, + "loss": 0.8759, + "step": 27832 + }, + { + "epoch": 0.7146733940919981, + "grad_norm": 0.796875, + "learning_rate": 0.0001264294795354039, + "loss": 0.7603, + "step": 27833 + }, + { + "epoch": 0.71469907128792, + "grad_norm": 0.81640625, + "learning_rate": 0.0001264251740457207, + "loss": 0.8108, + "step": 27834 + }, + { + "epoch": 0.7147247484838418, + "grad_norm": 0.76953125, + "learning_rate": 0.0001264208685033745, + "loss": 0.885, + "step": 27835 + }, + { + "epoch": 0.7147504256797637, + "grad_norm": 0.73828125, + "learning_rate": 0.00012641656290837385, + "loss": 0.8375, + "step": 27836 + }, + { + "epoch": 0.7147761028756855, + "grad_norm": 0.85546875, + "learning_rate": 0.0001264122572607274, + "loss": 0.797, + "step": 27837 + }, + { + "epoch": 0.7148017800716073, + "grad_norm": 0.78125, + "learning_rate": 0.00012640795156044373, + "loss": 0.8029, + "step": 27838 + }, + { + "epoch": 0.714827457267529, + "grad_norm": 0.828125, + "learning_rate": 0.00012640364580753137, + "loss": 0.9302, + "step": 27839 + }, + { + "epoch": 0.7148531344634509, + "grad_norm": 0.76171875, + "learning_rate": 0.00012639934000199893, + "loss": 0.7825, + "step": 27840 + }, + { + "epoch": 0.7148788116593727, + "grad_norm": 0.74609375, + "learning_rate": 0.00012639503414385502, + "loss": 0.7501, + "step": 27841 + }, + { + "epoch": 0.7149044888552946, + "grad_norm": 0.7578125, + "learning_rate": 0.0001263907282331082, + "loss": 0.8071, + "step": 27842 + }, + { + "epoch": 0.7149301660512164, + "grad_norm": 0.765625, + "learning_rate": 0.000126386422269767, + "loss": 0.7862, + "step": 27843 + }, + { + "epoch": 0.7149558432471382, + "grad_norm": 0.75390625, + "learning_rate": 0.00012638211625384005, + "loss": 0.8565, + "step": 27844 + }, + { + "epoch": 0.7149815204430601, + "grad_norm": 0.765625, + "learning_rate": 0.00012637781018533595, + "loss": 0.9533, + "step": 27845 + }, + { + "epoch": 0.7150071976389818, + "grad_norm": 0.73046875, + "learning_rate": 0.00012637350406426322, + "loss": 0.7014, + "step": 27846 + }, + { + "epoch": 0.7150328748349036, + "grad_norm": 0.7890625, + "learning_rate": 0.00012636919789063051, + "loss": 0.8355, + "step": 27847 + }, + { + "epoch": 0.7150585520308255, + "grad_norm": 0.75390625, + "learning_rate": 0.0001263648916644464, + "loss": 0.8324, + "step": 27848 + }, + { + "epoch": 0.7150842292267473, + "grad_norm": 0.7421875, + "learning_rate": 0.00012636058538571937, + "loss": 0.7854, + "step": 27849 + }, + { + "epoch": 0.7151099064226691, + "grad_norm": 0.796875, + "learning_rate": 0.0001263562790544581, + "loss": 0.7332, + "step": 27850 + }, + { + "epoch": 0.715135583618591, + "grad_norm": 0.87890625, + "learning_rate": 0.00012635197267067116, + "loss": 0.7146, + "step": 27851 + }, + { + "epoch": 0.7151612608145127, + "grad_norm": 0.7421875, + "learning_rate": 0.0001263476662343671, + "loss": 0.7808, + "step": 27852 + }, + { + "epoch": 0.7151869380104345, + "grad_norm": 0.8046875, + "learning_rate": 0.00012634335974555453, + "loss": 0.7902, + "step": 27853 + }, + { + "epoch": 0.7152126152063564, + "grad_norm": 0.8359375, + "learning_rate": 0.00012633905320424206, + "loss": 0.8376, + "step": 27854 + }, + { + "epoch": 0.7152382924022782, + "grad_norm": 0.83203125, + "learning_rate": 0.0001263347466104382, + "loss": 0.783, + "step": 27855 + }, + { + "epoch": 0.7152639695982, + "grad_norm": 0.71484375, + "learning_rate": 0.00012633043996415158, + "loss": 0.7144, + "step": 27856 + }, + { + "epoch": 0.7152896467941219, + "grad_norm": 0.7265625, + "learning_rate": 0.00012632613326539076, + "loss": 0.8526, + "step": 27857 + }, + { + "epoch": 0.7153153239900437, + "grad_norm": 0.796875, + "learning_rate": 0.0001263218265141643, + "loss": 0.8917, + "step": 27858 + }, + { + "epoch": 0.7153410011859654, + "grad_norm": 0.8125, + "learning_rate": 0.0001263175197104809, + "loss": 0.9077, + "step": 27859 + }, + { + "epoch": 0.7153666783818873, + "grad_norm": 0.8046875, + "learning_rate": 0.000126313212854349, + "loss": 0.9859, + "step": 27860 + }, + { + "epoch": 0.7153923555778091, + "grad_norm": 0.859375, + "learning_rate": 0.00012630890594577727, + "loss": 0.9453, + "step": 27861 + }, + { + "epoch": 0.715418032773731, + "grad_norm": 0.81640625, + "learning_rate": 0.00012630459898477426, + "loss": 0.8613, + "step": 27862 + }, + { + "epoch": 0.7154437099696528, + "grad_norm": 0.75, + "learning_rate": 0.00012630029197134853, + "loss": 0.7365, + "step": 27863 + }, + { + "epoch": 0.7154693871655746, + "grad_norm": 0.8046875, + "learning_rate": 0.00012629598490550872, + "loss": 0.8938, + "step": 27864 + }, + { + "epoch": 0.7154950643614965, + "grad_norm": 0.75, + "learning_rate": 0.0001262916777872634, + "loss": 0.8452, + "step": 27865 + }, + { + "epoch": 0.7155207415574182, + "grad_norm": 0.75, + "learning_rate": 0.00012628737061662112, + "loss": 0.702, + "step": 27866 + }, + { + "epoch": 0.71554641875334, + "grad_norm": 0.86328125, + "learning_rate": 0.00012628306339359052, + "loss": 0.9339, + "step": 27867 + }, + { + "epoch": 0.7155720959492619, + "grad_norm": 0.76953125, + "learning_rate": 0.00012627875611818012, + "loss": 0.866, + "step": 27868 + }, + { + "epoch": 0.7155977731451837, + "grad_norm": 0.7734375, + "learning_rate": 0.00012627444879039854, + "loss": 0.8363, + "step": 27869 + }, + { + "epoch": 0.7156234503411055, + "grad_norm": 0.78125, + "learning_rate": 0.00012627014141025438, + "loss": 0.9078, + "step": 27870 + }, + { + "epoch": 0.7156491275370274, + "grad_norm": 0.8359375, + "learning_rate": 0.00012626583397775613, + "loss": 0.8938, + "step": 27871 + }, + { + "epoch": 0.7156748047329491, + "grad_norm": 0.7890625, + "learning_rate": 0.0001262615264929125, + "loss": 1.0054, + "step": 27872 + }, + { + "epoch": 0.7157004819288709, + "grad_norm": 0.80859375, + "learning_rate": 0.00012625721895573203, + "loss": 0.7519, + "step": 27873 + }, + { + "epoch": 0.7157261591247928, + "grad_norm": 0.74609375, + "learning_rate": 0.00012625291136622328, + "loss": 0.8797, + "step": 27874 + }, + { + "epoch": 0.7157518363207146, + "grad_norm": 0.78125, + "learning_rate": 0.00012624860372439483, + "loss": 0.7826, + "step": 27875 + }, + { + "epoch": 0.7157775135166364, + "grad_norm": 0.77734375, + "learning_rate": 0.0001262442960302553, + "loss": 0.8402, + "step": 27876 + }, + { + "epoch": 0.7158031907125583, + "grad_norm": 0.828125, + "learning_rate": 0.00012623998828381326, + "loss": 0.8082, + "step": 27877 + }, + { + "epoch": 0.7158288679084801, + "grad_norm": 0.7734375, + "learning_rate": 0.0001262356804850773, + "loss": 0.8686, + "step": 27878 + }, + { + "epoch": 0.7158545451044018, + "grad_norm": 0.76953125, + "learning_rate": 0.00012623137263405597, + "loss": 0.7348, + "step": 27879 + }, + { + "epoch": 0.7158802223003237, + "grad_norm": 0.7265625, + "learning_rate": 0.0001262270647307579, + "loss": 0.7724, + "step": 27880 + }, + { + "epoch": 0.7159058994962455, + "grad_norm": 0.84375, + "learning_rate": 0.00012622275677519166, + "loss": 0.8942, + "step": 27881 + }, + { + "epoch": 0.7159315766921673, + "grad_norm": 0.8828125, + "learning_rate": 0.00012621844876736584, + "loss": 0.9164, + "step": 27882 + }, + { + "epoch": 0.7159572538880892, + "grad_norm": 0.81640625, + "learning_rate": 0.00012621414070728904, + "loss": 0.8655, + "step": 27883 + }, + { + "epoch": 0.715982931084011, + "grad_norm": 0.8203125, + "learning_rate": 0.00012620983259496976, + "loss": 0.7907, + "step": 27884 + }, + { + "epoch": 0.7160086082799327, + "grad_norm": 0.8828125, + "learning_rate": 0.00012620552443041668, + "loss": 0.9725, + "step": 27885 + }, + { + "epoch": 0.7160342854758546, + "grad_norm": 0.828125, + "learning_rate": 0.00012620121621363838, + "loss": 0.9504, + "step": 27886 + }, + { + "epoch": 0.7160599626717764, + "grad_norm": 0.7421875, + "learning_rate": 0.00012619690794464343, + "loss": 0.6951, + "step": 27887 + }, + { + "epoch": 0.7160856398676982, + "grad_norm": 0.77734375, + "learning_rate": 0.0001261925996234404, + "loss": 0.8042, + "step": 27888 + }, + { + "epoch": 0.7161113170636201, + "grad_norm": 0.83203125, + "learning_rate": 0.00012618829125003787, + "loss": 0.8827, + "step": 27889 + }, + { + "epoch": 0.7161369942595419, + "grad_norm": 0.8125, + "learning_rate": 0.00012618398282444442, + "loss": 0.7389, + "step": 27890 + }, + { + "epoch": 0.7161626714554638, + "grad_norm": 0.91015625, + "learning_rate": 0.0001261796743466687, + "loss": 0.8271, + "step": 27891 + }, + { + "epoch": 0.7161883486513855, + "grad_norm": 0.95703125, + "learning_rate": 0.00012617536581671922, + "loss": 0.8201, + "step": 27892 + }, + { + "epoch": 0.7162140258473073, + "grad_norm": 0.8203125, + "learning_rate": 0.00012617105723460465, + "loss": 0.8837, + "step": 27893 + }, + { + "epoch": 0.7162397030432291, + "grad_norm": 0.7578125, + "learning_rate": 0.0001261667486003335, + "loss": 0.9062, + "step": 27894 + }, + { + "epoch": 0.716265380239151, + "grad_norm": 0.75, + "learning_rate": 0.00012616243991391435, + "loss": 0.8557, + "step": 27895 + }, + { + "epoch": 0.7162910574350728, + "grad_norm": 0.73046875, + "learning_rate": 0.00012615813117535586, + "loss": 0.7544, + "step": 27896 + }, + { + "epoch": 0.7163167346309947, + "grad_norm": 0.71875, + "learning_rate": 0.00012615382238466657, + "loss": 0.7582, + "step": 27897 + }, + { + "epoch": 0.7163424118269165, + "grad_norm": 0.765625, + "learning_rate": 0.00012614951354185508, + "loss": 0.816, + "step": 27898 + }, + { + "epoch": 0.7163680890228382, + "grad_norm": 0.8515625, + "learning_rate": 0.00012614520464692997, + "loss": 0.9047, + "step": 27899 + }, + { + "epoch": 0.71639376621876, + "grad_norm": 0.8125, + "learning_rate": 0.0001261408956998998, + "loss": 0.8184, + "step": 27900 + }, + { + "epoch": 0.7164194434146819, + "grad_norm": 0.83203125, + "learning_rate": 0.00012613658670077322, + "loss": 0.8586, + "step": 27901 + }, + { + "epoch": 0.7164451206106037, + "grad_norm": 0.7734375, + "learning_rate": 0.00012613227764955877, + "loss": 0.8724, + "step": 27902 + }, + { + "epoch": 0.7164707978065256, + "grad_norm": 0.84375, + "learning_rate": 0.00012612796854626505, + "loss": 0.9057, + "step": 27903 + }, + { + "epoch": 0.7164964750024474, + "grad_norm": 0.74609375, + "learning_rate": 0.00012612365939090064, + "loss": 0.7916, + "step": 27904 + }, + { + "epoch": 0.7165221521983691, + "grad_norm": 0.7421875, + "learning_rate": 0.00012611935018347415, + "loss": 0.7664, + "step": 27905 + }, + { + "epoch": 0.716547829394291, + "grad_norm": 0.7734375, + "learning_rate": 0.00012611504092399417, + "loss": 0.8948, + "step": 27906 + }, + { + "epoch": 0.7165735065902128, + "grad_norm": 0.7421875, + "learning_rate": 0.00012611073161246928, + "loss": 0.7305, + "step": 27907 + }, + { + "epoch": 0.7165991837861346, + "grad_norm": 0.90234375, + "learning_rate": 0.000126106422248908, + "loss": 0.9373, + "step": 27908 + }, + { + "epoch": 0.7166248609820565, + "grad_norm": 0.68359375, + "learning_rate": 0.00012610211283331903, + "loss": 0.7131, + "step": 27909 + }, + { + "epoch": 0.7166505381779783, + "grad_norm": 0.76953125, + "learning_rate": 0.00012609780336571088, + "loss": 0.9358, + "step": 27910 + }, + { + "epoch": 0.7166762153739001, + "grad_norm": 0.76171875, + "learning_rate": 0.00012609349384609217, + "loss": 0.9136, + "step": 27911 + }, + { + "epoch": 0.7167018925698219, + "grad_norm": 0.7890625, + "learning_rate": 0.0001260891842744715, + "loss": 0.8598, + "step": 27912 + }, + { + "epoch": 0.7167275697657437, + "grad_norm": 0.84375, + "learning_rate": 0.00012608487465085743, + "loss": 0.828, + "step": 27913 + }, + { + "epoch": 0.7167532469616655, + "grad_norm": 0.89453125, + "learning_rate": 0.00012608056497525857, + "loss": 0.9336, + "step": 27914 + }, + { + "epoch": 0.7167789241575874, + "grad_norm": 0.78515625, + "learning_rate": 0.0001260762552476835, + "loss": 0.772, + "step": 27915 + }, + { + "epoch": 0.7168046013535092, + "grad_norm": 0.734375, + "learning_rate": 0.0001260719454681408, + "loss": 0.8281, + "step": 27916 + }, + { + "epoch": 0.716830278549431, + "grad_norm": 0.79296875, + "learning_rate": 0.00012606763563663905, + "loss": 0.7381, + "step": 27917 + }, + { + "epoch": 0.7168559557453529, + "grad_norm": 0.78515625, + "learning_rate": 0.0001260633257531869, + "loss": 0.9643, + "step": 27918 + }, + { + "epoch": 0.7168816329412746, + "grad_norm": 0.78515625, + "learning_rate": 0.00012605901581779286, + "loss": 0.7925, + "step": 27919 + }, + { + "epoch": 0.7169073101371964, + "grad_norm": 0.79296875, + "learning_rate": 0.00012605470583046556, + "loss": 0.9332, + "step": 27920 + }, + { + "epoch": 0.7169329873331183, + "grad_norm": 0.71875, + "learning_rate": 0.0001260503957912136, + "loss": 0.693, + "step": 27921 + }, + { + "epoch": 0.7169586645290401, + "grad_norm": 0.6953125, + "learning_rate": 0.00012604608570004555, + "loss": 0.7935, + "step": 27922 + }, + { + "epoch": 0.716984341724962, + "grad_norm": 0.7734375, + "learning_rate": 0.00012604177555697, + "loss": 0.7769, + "step": 27923 + }, + { + "epoch": 0.7170100189208838, + "grad_norm": 0.828125, + "learning_rate": 0.00012603746536199553, + "loss": 0.8582, + "step": 27924 + }, + { + "epoch": 0.7170356961168055, + "grad_norm": 0.78515625, + "learning_rate": 0.00012603315511513076, + "loss": 0.7335, + "step": 27925 + }, + { + "epoch": 0.7170613733127273, + "grad_norm": 0.73046875, + "learning_rate": 0.00012602884481638428, + "loss": 0.8448, + "step": 27926 + }, + { + "epoch": 0.7170870505086492, + "grad_norm": 0.7421875, + "learning_rate": 0.00012602453446576467, + "loss": 0.808, + "step": 27927 + }, + { + "epoch": 0.717112727704571, + "grad_norm": 0.76171875, + "learning_rate": 0.00012602022406328048, + "loss": 0.9395, + "step": 27928 + }, + { + "epoch": 0.7171384049004929, + "grad_norm": 0.79296875, + "learning_rate": 0.0001260159136089403, + "loss": 0.9141, + "step": 27929 + }, + { + "epoch": 0.7171640820964147, + "grad_norm": 0.76171875, + "learning_rate": 0.0001260116031027528, + "loss": 0.8113, + "step": 27930 + }, + { + "epoch": 0.7171897592923365, + "grad_norm": 0.79296875, + "learning_rate": 0.00012600729254472654, + "loss": 0.8408, + "step": 27931 + }, + { + "epoch": 0.7172154364882583, + "grad_norm": 0.859375, + "learning_rate": 0.00012600298193487006, + "loss": 0.7723, + "step": 27932 + }, + { + "epoch": 0.7172411136841801, + "grad_norm": 0.796875, + "learning_rate": 0.00012599867127319203, + "loss": 0.7449, + "step": 27933 + }, + { + "epoch": 0.7172667908801019, + "grad_norm": 0.765625, + "learning_rate": 0.00012599436055970098, + "loss": 0.7869, + "step": 27934 + }, + { + "epoch": 0.7172924680760238, + "grad_norm": 0.69140625, + "learning_rate": 0.0001259900497944055, + "loss": 0.7085, + "step": 27935 + }, + { + "epoch": 0.7173181452719456, + "grad_norm": 0.7421875, + "learning_rate": 0.00012598573897731422, + "loss": 0.7622, + "step": 27936 + }, + { + "epoch": 0.7173438224678674, + "grad_norm": 0.79296875, + "learning_rate": 0.00012598142810843566, + "loss": 0.9057, + "step": 27937 + }, + { + "epoch": 0.7173694996637893, + "grad_norm": 0.7265625, + "learning_rate": 0.0001259771171877785, + "loss": 0.9156, + "step": 27938 + }, + { + "epoch": 0.717395176859711, + "grad_norm": 0.76953125, + "learning_rate": 0.0001259728062153513, + "loss": 0.8381, + "step": 27939 + }, + { + "epoch": 0.7174208540556328, + "grad_norm": 0.75390625, + "learning_rate": 0.00012596849519116264, + "loss": 0.7489, + "step": 27940 + }, + { + "epoch": 0.7174465312515547, + "grad_norm": 0.73046875, + "learning_rate": 0.0001259641841152211, + "loss": 0.7446, + "step": 27941 + }, + { + "epoch": 0.7174722084474765, + "grad_norm": 0.79296875, + "learning_rate": 0.0001259598729875353, + "loss": 0.8169, + "step": 27942 + }, + { + "epoch": 0.7174978856433983, + "grad_norm": 0.7890625, + "learning_rate": 0.00012595556180811378, + "loss": 0.8598, + "step": 27943 + }, + { + "epoch": 0.7175235628393202, + "grad_norm": 0.87109375, + "learning_rate": 0.0001259512505769652, + "loss": 0.977, + "step": 27944 + }, + { + "epoch": 0.7175492400352419, + "grad_norm": 0.7578125, + "learning_rate": 0.00012594693929409814, + "loss": 0.775, + "step": 27945 + }, + { + "epoch": 0.7175749172311637, + "grad_norm": 0.77734375, + "learning_rate": 0.00012594262795952114, + "loss": 0.797, + "step": 27946 + }, + { + "epoch": 0.7176005944270856, + "grad_norm": 0.8515625, + "learning_rate": 0.00012593831657324285, + "loss": 0.8285, + "step": 27947 + }, + { + "epoch": 0.7176262716230074, + "grad_norm": 0.75, + "learning_rate": 0.00012593400513527184, + "loss": 0.7386, + "step": 27948 + }, + { + "epoch": 0.7176519488189292, + "grad_norm": 0.7578125, + "learning_rate": 0.0001259296936456167, + "loss": 0.8943, + "step": 27949 + }, + { + "epoch": 0.7176776260148511, + "grad_norm": 0.82421875, + "learning_rate": 0.000125925382104286, + "loss": 0.7946, + "step": 27950 + }, + { + "epoch": 0.7177033032107729, + "grad_norm": 0.7890625, + "learning_rate": 0.00012592107051128835, + "loss": 0.9236, + "step": 27951 + }, + { + "epoch": 0.7177289804066946, + "grad_norm": 0.72265625, + "learning_rate": 0.00012591675886663238, + "loss": 0.8386, + "step": 27952 + }, + { + "epoch": 0.7177546576026165, + "grad_norm": 0.75, + "learning_rate": 0.00012591244717032662, + "loss": 0.8654, + "step": 27953 + }, + { + "epoch": 0.7177803347985383, + "grad_norm": 0.80078125, + "learning_rate": 0.0001259081354223797, + "loss": 0.9845, + "step": 27954 + }, + { + "epoch": 0.7178060119944601, + "grad_norm": 0.71484375, + "learning_rate": 0.00012590382362280024, + "loss": 0.8802, + "step": 27955 + }, + { + "epoch": 0.717831689190382, + "grad_norm": 0.8671875, + "learning_rate": 0.00012589951177159676, + "loss": 0.9093, + "step": 27956 + }, + { + "epoch": 0.7178573663863038, + "grad_norm": 0.7109375, + "learning_rate": 0.00012589519986877791, + "loss": 0.8878, + "step": 27957 + }, + { + "epoch": 0.7178830435822257, + "grad_norm": 0.70703125, + "learning_rate": 0.0001258908879143523, + "loss": 0.8788, + "step": 27958 + }, + { + "epoch": 0.7179087207781474, + "grad_norm": 0.92578125, + "learning_rate": 0.00012588657590832845, + "loss": 0.8138, + "step": 27959 + }, + { + "epoch": 0.7179343979740692, + "grad_norm": 0.7421875, + "learning_rate": 0.00012588226385071497, + "loss": 0.8927, + "step": 27960 + }, + { + "epoch": 0.717960075169991, + "grad_norm": 0.78515625, + "learning_rate": 0.00012587795174152052, + "loss": 0.7984, + "step": 27961 + }, + { + "epoch": 0.7179857523659129, + "grad_norm": 0.7734375, + "learning_rate": 0.0001258736395807536, + "loss": 0.7515, + "step": 27962 + }, + { + "epoch": 0.7180114295618347, + "grad_norm": 0.82421875, + "learning_rate": 0.0001258693273684229, + "loss": 0.8414, + "step": 27963 + }, + { + "epoch": 0.7180371067577566, + "grad_norm": 0.765625, + "learning_rate": 0.00012586501510453694, + "loss": 0.8023, + "step": 27964 + }, + { + "epoch": 0.7180627839536783, + "grad_norm": 0.78515625, + "learning_rate": 0.00012586070278910436, + "loss": 0.8077, + "step": 27965 + }, + { + "epoch": 0.7180884611496001, + "grad_norm": 0.84765625, + "learning_rate": 0.00012585639042213373, + "loss": 0.8445, + "step": 27966 + }, + { + "epoch": 0.718114138345522, + "grad_norm": 0.78515625, + "learning_rate": 0.00012585207800363366, + "loss": 0.85, + "step": 27967 + }, + { + "epoch": 0.7181398155414438, + "grad_norm": 0.90625, + "learning_rate": 0.00012584776553361271, + "loss": 0.8903, + "step": 27968 + }, + { + "epoch": 0.7181654927373656, + "grad_norm": 0.72265625, + "learning_rate": 0.0001258434530120795, + "loss": 0.8251, + "step": 27969 + }, + { + "epoch": 0.7181911699332875, + "grad_norm": 0.78125, + "learning_rate": 0.00012583914043904262, + "loss": 0.8305, + "step": 27970 + }, + { + "epoch": 0.7182168471292093, + "grad_norm": 0.71484375, + "learning_rate": 0.0001258348278145107, + "loss": 0.7236, + "step": 27971 + }, + { + "epoch": 0.718242524325131, + "grad_norm": 0.8046875, + "learning_rate": 0.00012583051513849228, + "loss": 0.899, + "step": 27972 + }, + { + "epoch": 0.7182682015210529, + "grad_norm": 0.78515625, + "learning_rate": 0.00012582620241099598, + "loss": 0.8911, + "step": 27973 + }, + { + "epoch": 0.7182938787169747, + "grad_norm": 0.71484375, + "learning_rate": 0.0001258218896320304, + "loss": 0.9855, + "step": 27974 + }, + { + "epoch": 0.7183195559128965, + "grad_norm": 0.79296875, + "learning_rate": 0.0001258175768016041, + "loss": 0.9305, + "step": 27975 + }, + { + "epoch": 0.7183452331088184, + "grad_norm": 0.78125, + "learning_rate": 0.0001258132639197257, + "loss": 0.8422, + "step": 27976 + }, + { + "epoch": 0.7183709103047402, + "grad_norm": 0.7734375, + "learning_rate": 0.00012580895098640381, + "loss": 0.8357, + "step": 27977 + }, + { + "epoch": 0.718396587500662, + "grad_norm": 0.7890625, + "learning_rate": 0.00012580463800164704, + "loss": 0.8347, + "step": 27978 + }, + { + "epoch": 0.7184222646965838, + "grad_norm": 0.76953125, + "learning_rate": 0.00012580032496546396, + "loss": 0.7738, + "step": 27979 + }, + { + "epoch": 0.7184479418925056, + "grad_norm": 0.8046875, + "learning_rate": 0.00012579601187786313, + "loss": 0.8614, + "step": 27980 + }, + { + "epoch": 0.7184736190884274, + "grad_norm": 0.8203125, + "learning_rate": 0.00012579169873885317, + "loss": 0.8869, + "step": 27981 + }, + { + "epoch": 0.7184992962843493, + "grad_norm": 0.72265625, + "learning_rate": 0.0001257873855484427, + "loss": 0.7907, + "step": 27982 + }, + { + "epoch": 0.7185249734802711, + "grad_norm": 0.80859375, + "learning_rate": 0.0001257830723066403, + "loss": 0.9146, + "step": 27983 + }, + { + "epoch": 0.718550650676193, + "grad_norm": 0.82421875, + "learning_rate": 0.00012577875901345457, + "loss": 0.8621, + "step": 27984 + }, + { + "epoch": 0.7185763278721147, + "grad_norm": 0.78125, + "learning_rate": 0.0001257744456688941, + "loss": 0.7861, + "step": 27985 + }, + { + "epoch": 0.7186020050680365, + "grad_norm": 0.7421875, + "learning_rate": 0.00012577013227296746, + "loss": 0.9259, + "step": 27986 + }, + { + "epoch": 0.7186276822639583, + "grad_norm": 0.8125, + "learning_rate": 0.00012576581882568332, + "loss": 0.8149, + "step": 27987 + }, + { + "epoch": 0.7186533594598802, + "grad_norm": 0.82421875, + "learning_rate": 0.0001257615053270502, + "loss": 0.7818, + "step": 27988 + }, + { + "epoch": 0.718679036655802, + "grad_norm": 0.7734375, + "learning_rate": 0.00012575719177707674, + "loss": 0.9653, + "step": 27989 + }, + { + "epoch": 0.7187047138517239, + "grad_norm": 0.88671875, + "learning_rate": 0.00012575287817577152, + "loss": 0.9845, + "step": 27990 + }, + { + "epoch": 0.7187303910476457, + "grad_norm": 0.7890625, + "learning_rate": 0.00012574856452314312, + "loss": 0.8394, + "step": 27991 + }, + { + "epoch": 0.7187560682435674, + "grad_norm": 0.83984375, + "learning_rate": 0.00012574425081920022, + "loss": 0.8208, + "step": 27992 + }, + { + "epoch": 0.7187817454394893, + "grad_norm": 0.77734375, + "learning_rate": 0.0001257399370639513, + "loss": 0.9521, + "step": 27993 + }, + { + "epoch": 0.7188074226354111, + "grad_norm": 0.85546875, + "learning_rate": 0.000125735623257405, + "loss": 0.8719, + "step": 27994 + }, + { + "epoch": 0.7188330998313329, + "grad_norm": 0.80078125, + "learning_rate": 0.00012573130939956998, + "loss": 0.8455, + "step": 27995 + }, + { + "epoch": 0.7188587770272548, + "grad_norm": 0.7578125, + "learning_rate": 0.0001257269954904547, + "loss": 0.8838, + "step": 27996 + }, + { + "epoch": 0.7188844542231766, + "grad_norm": 0.85546875, + "learning_rate": 0.0001257226815300679, + "loss": 0.7844, + "step": 27997 + }, + { + "epoch": 0.7189101314190984, + "grad_norm": 0.73046875, + "learning_rate": 0.00012571836751841816, + "loss": 0.8294, + "step": 27998 + }, + { + "epoch": 0.7189358086150202, + "grad_norm": 0.765625, + "learning_rate": 0.00012571405345551396, + "loss": 0.8136, + "step": 27999 + }, + { + "epoch": 0.718961485810942, + "grad_norm": 0.7890625, + "learning_rate": 0.00012570973934136404, + "loss": 0.7894, + "step": 28000 + }, + { + "epoch": 0.718961485810942, + "eval_loss": 0.8289976716041565, + "eval_runtime": 388.3635, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 0.806, + "step": 28000 + }, + { + "epoch": 0.7189871630068638, + "grad_norm": 0.796875, + "learning_rate": 0.00012570542517597685, + "loss": 0.8733, + "step": 28001 + }, + { + "epoch": 0.7190128402027857, + "grad_norm": 0.75, + "learning_rate": 0.00012570111095936115, + "loss": 0.8158, + "step": 28002 + }, + { + "epoch": 0.7190385173987075, + "grad_norm": 0.87890625, + "learning_rate": 0.0001256967966915254, + "loss": 0.8502, + "step": 28003 + }, + { + "epoch": 0.7190641945946293, + "grad_norm": 0.671875, + "learning_rate": 0.00012569248237247828, + "loss": 0.638, + "step": 28004 + }, + { + "epoch": 0.7190898717905511, + "grad_norm": 0.74609375, + "learning_rate": 0.0001256881680022284, + "loss": 0.7704, + "step": 28005 + }, + { + "epoch": 0.7191155489864729, + "grad_norm": 0.7109375, + "learning_rate": 0.00012568385358078426, + "loss": 0.7349, + "step": 28006 + }, + { + "epoch": 0.7191412261823947, + "grad_norm": 0.75, + "learning_rate": 0.00012567953910815454, + "loss": 0.8097, + "step": 28007 + }, + { + "epoch": 0.7191669033783166, + "grad_norm": 0.75390625, + "learning_rate": 0.00012567522458434781, + "loss": 0.8769, + "step": 28008 + }, + { + "epoch": 0.7191925805742384, + "grad_norm": 0.79296875, + "learning_rate": 0.0001256709100093727, + "loss": 0.8821, + "step": 28009 + }, + { + "epoch": 0.7192182577701602, + "grad_norm": 0.90625, + "learning_rate": 0.00012566659538323777, + "loss": 0.9129, + "step": 28010 + }, + { + "epoch": 0.7192439349660821, + "grad_norm": 0.7578125, + "learning_rate": 0.00012566228070595164, + "loss": 0.846, + "step": 28011 + }, + { + "epoch": 0.7192696121620038, + "grad_norm": 0.76953125, + "learning_rate": 0.0001256579659775229, + "loss": 0.8552, + "step": 28012 + }, + { + "epoch": 0.7192952893579256, + "grad_norm": 0.82421875, + "learning_rate": 0.00012565365119796015, + "loss": 0.8976, + "step": 28013 + }, + { + "epoch": 0.7193209665538475, + "grad_norm": 0.828125, + "learning_rate": 0.00012564933636727198, + "loss": 0.9187, + "step": 28014 + }, + { + "epoch": 0.7193466437497693, + "grad_norm": 0.84375, + "learning_rate": 0.00012564502148546702, + "loss": 0.7776, + "step": 28015 + }, + { + "epoch": 0.7193723209456911, + "grad_norm": 0.7734375, + "learning_rate": 0.00012564070655255385, + "loss": 0.7882, + "step": 28016 + }, + { + "epoch": 0.719397998141613, + "grad_norm": 0.78515625, + "learning_rate": 0.00012563639156854103, + "loss": 0.8869, + "step": 28017 + }, + { + "epoch": 0.7194236753375348, + "grad_norm": 0.875, + "learning_rate": 0.00012563207653343728, + "loss": 0.7569, + "step": 28018 + }, + { + "epoch": 0.7194493525334565, + "grad_norm": 0.71484375, + "learning_rate": 0.00012562776144725105, + "loss": 0.7495, + "step": 28019 + }, + { + "epoch": 0.7194750297293784, + "grad_norm": 0.78125, + "learning_rate": 0.000125623446309991, + "loss": 0.856, + "step": 28020 + }, + { + "epoch": 0.7195007069253002, + "grad_norm": 0.83203125, + "learning_rate": 0.00012561913112166576, + "loss": 1.0173, + "step": 28021 + }, + { + "epoch": 0.719526384121222, + "grad_norm": 0.7265625, + "learning_rate": 0.0001256148158822839, + "loss": 0.9168, + "step": 28022 + }, + { + "epoch": 0.7195520613171439, + "grad_norm": 0.73046875, + "learning_rate": 0.00012561050059185403, + "loss": 0.7953, + "step": 28023 + }, + { + "epoch": 0.7195777385130657, + "grad_norm": 0.703125, + "learning_rate": 0.00012560618525038475, + "loss": 0.7969, + "step": 28024 + }, + { + "epoch": 0.7196034157089874, + "grad_norm": 0.85546875, + "learning_rate": 0.00012560186985788465, + "loss": 1.0084, + "step": 28025 + }, + { + "epoch": 0.7196290929049093, + "grad_norm": 0.70703125, + "learning_rate": 0.00012559755441436234, + "loss": 0.8254, + "step": 28026 + }, + { + "epoch": 0.7196547701008311, + "grad_norm": 0.76953125, + "learning_rate": 0.0001255932389198264, + "loss": 0.8942, + "step": 28027 + }, + { + "epoch": 0.719680447296753, + "grad_norm": 0.80078125, + "learning_rate": 0.00012558892337428544, + "loss": 0.8738, + "step": 28028 + }, + { + "epoch": 0.7197061244926748, + "grad_norm": 0.8984375, + "learning_rate": 0.0001255846077777481, + "loss": 0.8627, + "step": 28029 + }, + { + "epoch": 0.7197318016885966, + "grad_norm": 0.82421875, + "learning_rate": 0.00012558029213022293, + "loss": 0.9021, + "step": 28030 + }, + { + "epoch": 0.7197574788845185, + "grad_norm": 0.71875, + "learning_rate": 0.00012557597643171855, + "loss": 0.7512, + "step": 28031 + }, + { + "epoch": 0.7197831560804402, + "grad_norm": 0.81640625, + "learning_rate": 0.00012557166068224358, + "loss": 0.8728, + "step": 28032 + }, + { + "epoch": 0.719808833276362, + "grad_norm": 0.7578125, + "learning_rate": 0.00012556734488180656, + "loss": 0.8481, + "step": 28033 + }, + { + "epoch": 0.7198345104722839, + "grad_norm": 0.765625, + "learning_rate": 0.00012556302903041615, + "loss": 0.888, + "step": 28034 + }, + { + "epoch": 0.7198601876682057, + "grad_norm": 0.765625, + "learning_rate": 0.00012555871312808095, + "loss": 0.7781, + "step": 28035 + }, + { + "epoch": 0.7198858648641275, + "grad_norm": 0.83984375, + "learning_rate": 0.0001255543971748095, + "loss": 0.7048, + "step": 28036 + }, + { + "epoch": 0.7199115420600494, + "grad_norm": 0.7421875, + "learning_rate": 0.00012555008117061048, + "loss": 0.8595, + "step": 28037 + }, + { + "epoch": 0.7199372192559712, + "grad_norm": 0.69140625, + "learning_rate": 0.00012554576511549247, + "loss": 0.8011, + "step": 28038 + }, + { + "epoch": 0.7199628964518929, + "grad_norm": 0.8046875, + "learning_rate": 0.000125541449009464, + "loss": 0.8055, + "step": 28039 + }, + { + "epoch": 0.7199885736478148, + "grad_norm": 0.91015625, + "learning_rate": 0.0001255371328525338, + "loss": 0.9242, + "step": 28040 + }, + { + "epoch": 0.7200142508437366, + "grad_norm": 0.73828125, + "learning_rate": 0.00012553281664471033, + "loss": 0.8012, + "step": 28041 + }, + { + "epoch": 0.7200399280396584, + "grad_norm": 0.8125, + "learning_rate": 0.0001255285003860023, + "loss": 0.8612, + "step": 28042 + }, + { + "epoch": 0.7200656052355803, + "grad_norm": 0.76171875, + "learning_rate": 0.0001255241840764183, + "loss": 0.8089, + "step": 28043 + }, + { + "epoch": 0.7200912824315021, + "grad_norm": 0.78125, + "learning_rate": 0.00012551986771596686, + "loss": 0.9439, + "step": 28044 + }, + { + "epoch": 0.7201169596274238, + "grad_norm": 0.8515625, + "learning_rate": 0.00012551555130465666, + "loss": 0.8543, + "step": 28045 + }, + { + "epoch": 0.7201426368233457, + "grad_norm": 0.70703125, + "learning_rate": 0.00012551123484249625, + "loss": 0.7881, + "step": 28046 + }, + { + "epoch": 0.7201683140192675, + "grad_norm": 0.7265625, + "learning_rate": 0.00012550691832949426, + "loss": 0.8298, + "step": 28047 + }, + { + "epoch": 0.7201939912151893, + "grad_norm": 0.765625, + "learning_rate": 0.0001255026017656593, + "loss": 0.8877, + "step": 28048 + }, + { + "epoch": 0.7202196684111112, + "grad_norm": 0.7578125, + "learning_rate": 0.00012549828515099994, + "loss": 0.8812, + "step": 28049 + }, + { + "epoch": 0.720245345607033, + "grad_norm": 0.78515625, + "learning_rate": 0.00012549396848552478, + "loss": 0.8438, + "step": 28050 + }, + { + "epoch": 0.7202710228029549, + "grad_norm": 0.75, + "learning_rate": 0.0001254896517692425, + "loss": 0.7826, + "step": 28051 + }, + { + "epoch": 0.7202966999988766, + "grad_norm": 0.75390625, + "learning_rate": 0.0001254853350021616, + "loss": 0.8552, + "step": 28052 + }, + { + "epoch": 0.7203223771947984, + "grad_norm": 0.80078125, + "learning_rate": 0.00012548101818429076, + "loss": 0.8737, + "step": 28053 + }, + { + "epoch": 0.7203480543907202, + "grad_norm": 0.703125, + "learning_rate": 0.00012547670131563853, + "loss": 0.7996, + "step": 28054 + }, + { + "epoch": 0.7203737315866421, + "grad_norm": 0.81640625, + "learning_rate": 0.00012547238439621357, + "loss": 0.8374, + "step": 28055 + }, + { + "epoch": 0.7203994087825639, + "grad_norm": 0.8828125, + "learning_rate": 0.00012546806742602443, + "loss": 0.7705, + "step": 28056 + }, + { + "epoch": 0.7204250859784858, + "grad_norm": 0.734375, + "learning_rate": 0.00012546375040507968, + "loss": 0.7851, + "step": 28057 + }, + { + "epoch": 0.7204507631744076, + "grad_norm": 0.734375, + "learning_rate": 0.00012545943333338805, + "loss": 0.66, + "step": 28058 + }, + { + "epoch": 0.7204764403703293, + "grad_norm": 0.71875, + "learning_rate": 0.00012545511621095808, + "loss": 0.8041, + "step": 28059 + }, + { + "epoch": 0.7205021175662512, + "grad_norm": 0.8125, + "learning_rate": 0.0001254507990377983, + "loss": 0.8347, + "step": 28060 + }, + { + "epoch": 0.720527794762173, + "grad_norm": 0.77734375, + "learning_rate": 0.00012544648181391738, + "loss": 0.7751, + "step": 28061 + }, + { + "epoch": 0.7205534719580948, + "grad_norm": 0.7421875, + "learning_rate": 0.00012544216453932395, + "loss": 0.8316, + "step": 28062 + }, + { + "epoch": 0.7205791491540167, + "grad_norm": 0.75, + "learning_rate": 0.00012543784721402655, + "loss": 0.7539, + "step": 28063 + }, + { + "epoch": 0.7206048263499385, + "grad_norm": 0.87109375, + "learning_rate": 0.0001254335298380339, + "loss": 0.8018, + "step": 28064 + }, + { + "epoch": 0.7206305035458602, + "grad_norm": 1.09375, + "learning_rate": 0.00012542921241135444, + "loss": 0.7839, + "step": 28065 + }, + { + "epoch": 0.7206561807417821, + "grad_norm": 0.75390625, + "learning_rate": 0.00012542489493399687, + "loss": 0.8527, + "step": 28066 + }, + { + "epoch": 0.7206818579377039, + "grad_norm": 0.796875, + "learning_rate": 0.0001254205774059698, + "loss": 0.7249, + "step": 28067 + }, + { + "epoch": 0.7207075351336257, + "grad_norm": 0.75390625, + "learning_rate": 0.0001254162598272818, + "loss": 0.9159, + "step": 28068 + }, + { + "epoch": 0.7207332123295476, + "grad_norm": 0.73828125, + "learning_rate": 0.0001254119421979415, + "loss": 0.7119, + "step": 28069 + }, + { + "epoch": 0.7207588895254694, + "grad_norm": 0.8359375, + "learning_rate": 0.0001254076245179575, + "loss": 0.8238, + "step": 28070 + }, + { + "epoch": 0.7207845667213912, + "grad_norm": 0.765625, + "learning_rate": 0.0001254033067873384, + "loss": 0.869, + "step": 28071 + }, + { + "epoch": 0.720810243917313, + "grad_norm": 0.72265625, + "learning_rate": 0.0001253989890060928, + "loss": 0.7722, + "step": 28072 + }, + { + "epoch": 0.7208359211132348, + "grad_norm": 0.796875, + "learning_rate": 0.00012539467117422928, + "loss": 0.7978, + "step": 28073 + }, + { + "epoch": 0.7208615983091566, + "grad_norm": 0.71875, + "learning_rate": 0.00012539035329175652, + "loss": 0.6615, + "step": 28074 + }, + { + "epoch": 0.7208872755050785, + "grad_norm": 0.75390625, + "learning_rate": 0.00012538603535868306, + "loss": 0.775, + "step": 28075 + }, + { + "epoch": 0.7209129527010003, + "grad_norm": 0.8203125, + "learning_rate": 0.00012538171737501753, + "loss": 0.9177, + "step": 28076 + }, + { + "epoch": 0.7209386298969221, + "grad_norm": 0.80078125, + "learning_rate": 0.00012537739934076852, + "loss": 0.8486, + "step": 28077 + }, + { + "epoch": 0.720964307092844, + "grad_norm": 0.828125, + "learning_rate": 0.00012537308125594465, + "loss": 0.8007, + "step": 28078 + }, + { + "epoch": 0.7209899842887657, + "grad_norm": 0.80078125, + "learning_rate": 0.0001253687631205545, + "loss": 0.8882, + "step": 28079 + }, + { + "epoch": 0.7210156614846875, + "grad_norm": 0.74609375, + "learning_rate": 0.00012536444493460675, + "loss": 0.8187, + "step": 28080 + }, + { + "epoch": 0.7210413386806094, + "grad_norm": 0.69921875, + "learning_rate": 0.0001253601266981099, + "loss": 0.7951, + "step": 28081 + }, + { + "epoch": 0.7210670158765312, + "grad_norm": 0.7890625, + "learning_rate": 0.00012535580841107263, + "loss": 0.8665, + "step": 28082 + }, + { + "epoch": 0.721092693072453, + "grad_norm": 0.78515625, + "learning_rate": 0.00012535149007350353, + "loss": 0.7967, + "step": 28083 + }, + { + "epoch": 0.7211183702683749, + "grad_norm": 0.796875, + "learning_rate": 0.00012534717168541119, + "loss": 1.0446, + "step": 28084 + }, + { + "epoch": 0.7211440474642966, + "grad_norm": 0.7265625, + "learning_rate": 0.00012534285324680423, + "loss": 0.8271, + "step": 28085 + }, + { + "epoch": 0.7211697246602184, + "grad_norm": 0.8203125, + "learning_rate": 0.00012533853475769124, + "loss": 0.8782, + "step": 28086 + }, + { + "epoch": 0.7211954018561403, + "grad_norm": 0.7890625, + "learning_rate": 0.00012533421621808085, + "loss": 0.7966, + "step": 28087 + }, + { + "epoch": 0.7212210790520621, + "grad_norm": 0.78125, + "learning_rate": 0.00012532989762798166, + "loss": 0.784, + "step": 28088 + }, + { + "epoch": 0.721246756247984, + "grad_norm": 0.80078125, + "learning_rate": 0.00012532557898740226, + "loss": 0.8507, + "step": 28089 + }, + { + "epoch": 0.7212724334439058, + "grad_norm": 0.7578125, + "learning_rate": 0.0001253212602963513, + "loss": 0.7766, + "step": 28090 + }, + { + "epoch": 0.7212981106398276, + "grad_norm": 0.9375, + "learning_rate": 0.00012531694155483734, + "loss": 0.7118, + "step": 28091 + }, + { + "epoch": 0.7213237878357494, + "grad_norm": 0.796875, + "learning_rate": 0.00012531262276286898, + "loss": 0.7967, + "step": 28092 + }, + { + "epoch": 0.7213494650316712, + "grad_norm": 0.796875, + "learning_rate": 0.00012530830392045488, + "loss": 0.9531, + "step": 28093 + }, + { + "epoch": 0.721375142227593, + "grad_norm": 0.7890625, + "learning_rate": 0.00012530398502760357, + "loss": 0.952, + "step": 28094 + }, + { + "epoch": 0.7214008194235149, + "grad_norm": 0.80078125, + "learning_rate": 0.00012529966608432376, + "loss": 0.7714, + "step": 28095 + }, + { + "epoch": 0.7214264966194367, + "grad_norm": 0.75390625, + "learning_rate": 0.00012529534709062397, + "loss": 0.9484, + "step": 28096 + }, + { + "epoch": 0.7214521738153585, + "grad_norm": 0.765625, + "learning_rate": 0.00012529102804651284, + "loss": 0.7779, + "step": 28097 + }, + { + "epoch": 0.7214778510112803, + "grad_norm": 0.8203125, + "learning_rate": 0.000125286708951999, + "loss": 0.7864, + "step": 28098 + }, + { + "epoch": 0.7215035282072021, + "grad_norm": 0.83984375, + "learning_rate": 0.00012528238980709098, + "loss": 0.8166, + "step": 28099 + }, + { + "epoch": 0.7215292054031239, + "grad_norm": 0.83984375, + "learning_rate": 0.00012527807061179749, + "loss": 0.8496, + "step": 28100 + }, + { + "epoch": 0.7215548825990458, + "grad_norm": 0.78515625, + "learning_rate": 0.00012527375136612708, + "loss": 0.7118, + "step": 28101 + }, + { + "epoch": 0.7215805597949676, + "grad_norm": 0.82421875, + "learning_rate": 0.00012526943207008832, + "loss": 0.8097, + "step": 28102 + }, + { + "epoch": 0.7216062369908894, + "grad_norm": 0.75, + "learning_rate": 0.00012526511272368992, + "loss": 0.8196, + "step": 28103 + }, + { + "epoch": 0.7216319141868113, + "grad_norm": 0.7421875, + "learning_rate": 0.00012526079332694044, + "loss": 0.7466, + "step": 28104 + }, + { + "epoch": 0.721657591382733, + "grad_norm": 0.78125, + "learning_rate": 0.00012525647387984844, + "loss": 0.7374, + "step": 28105 + }, + { + "epoch": 0.7216832685786548, + "grad_norm": 0.7890625, + "learning_rate": 0.0001252521543824226, + "loss": 0.7971, + "step": 28106 + }, + { + "epoch": 0.7217089457745767, + "grad_norm": 0.7890625, + "learning_rate": 0.00012524783483467146, + "loss": 0.7636, + "step": 28107 + }, + { + "epoch": 0.7217346229704985, + "grad_norm": 0.74609375, + "learning_rate": 0.0001252435152366037, + "loss": 0.7835, + "step": 28108 + }, + { + "epoch": 0.7217603001664203, + "grad_norm": 0.76953125, + "learning_rate": 0.00012523919558822788, + "loss": 0.7454, + "step": 28109 + }, + { + "epoch": 0.7217859773623422, + "grad_norm": 0.88671875, + "learning_rate": 0.00012523487588955262, + "loss": 0.8619, + "step": 28110 + }, + { + "epoch": 0.721811654558264, + "grad_norm": 0.77734375, + "learning_rate": 0.00012523055614058654, + "loss": 0.7362, + "step": 28111 + }, + { + "epoch": 0.7218373317541857, + "grad_norm": 0.8125, + "learning_rate": 0.00012522623634133825, + "loss": 0.9257, + "step": 28112 + }, + { + "epoch": 0.7218630089501076, + "grad_norm": 0.7734375, + "learning_rate": 0.00012522191649181632, + "loss": 0.8725, + "step": 28113 + }, + { + "epoch": 0.7218886861460294, + "grad_norm": 0.84375, + "learning_rate": 0.00012521759659202942, + "loss": 0.8157, + "step": 28114 + }, + { + "epoch": 0.7219143633419512, + "grad_norm": 0.80859375, + "learning_rate": 0.0001252132766419861, + "loss": 0.9176, + "step": 28115 + }, + { + "epoch": 0.7219400405378731, + "grad_norm": 0.76953125, + "learning_rate": 0.000125208956641695, + "loss": 0.793, + "step": 28116 + }, + { + "epoch": 0.7219657177337949, + "grad_norm": 0.79296875, + "learning_rate": 0.00012520463659116478, + "loss": 0.7368, + "step": 28117 + }, + { + "epoch": 0.7219913949297166, + "grad_norm": 0.74609375, + "learning_rate": 0.0001252003164904039, + "loss": 0.8392, + "step": 28118 + }, + { + "epoch": 0.7220170721256385, + "grad_norm": 0.80078125, + "learning_rate": 0.00012519599633942113, + "loss": 0.7761, + "step": 28119 + }, + { + "epoch": 0.7220427493215603, + "grad_norm": 0.80078125, + "learning_rate": 0.00012519167613822503, + "loss": 0.7085, + "step": 28120 + }, + { + "epoch": 0.7220684265174822, + "grad_norm": 0.75, + "learning_rate": 0.00012518735588682415, + "loss": 0.8804, + "step": 28121 + }, + { + "epoch": 0.722094103713404, + "grad_norm": 0.77734375, + "learning_rate": 0.00012518303558522716, + "loss": 0.7573, + "step": 28122 + }, + { + "epoch": 0.7221197809093258, + "grad_norm": 0.7890625, + "learning_rate": 0.00012517871523344266, + "loss": 0.8122, + "step": 28123 + }, + { + "epoch": 0.7221454581052477, + "grad_norm": 0.77734375, + "learning_rate": 0.00012517439483147928, + "loss": 0.8898, + "step": 28124 + }, + { + "epoch": 0.7221711353011694, + "grad_norm": 0.8203125, + "learning_rate": 0.0001251700743793456, + "loss": 0.7932, + "step": 28125 + }, + { + "epoch": 0.7221968124970912, + "grad_norm": 0.78125, + "learning_rate": 0.0001251657538770502, + "loss": 0.8126, + "step": 28126 + }, + { + "epoch": 0.7222224896930131, + "grad_norm": 0.7265625, + "learning_rate": 0.00012516143332460172, + "loss": 0.8014, + "step": 28127 + }, + { + "epoch": 0.7222481668889349, + "grad_norm": 0.78125, + "learning_rate": 0.00012515711272200882, + "loss": 0.7806, + "step": 28128 + }, + { + "epoch": 0.7222738440848567, + "grad_norm": 0.81640625, + "learning_rate": 0.00012515279206928002, + "loss": 0.932, + "step": 28129 + }, + { + "epoch": 0.7222995212807786, + "grad_norm": 0.75, + "learning_rate": 0.00012514847136642403, + "loss": 0.8343, + "step": 28130 + }, + { + "epoch": 0.7223251984767004, + "grad_norm": 0.796875, + "learning_rate": 0.00012514415061344936, + "loss": 0.915, + "step": 28131 + }, + { + "epoch": 0.7223508756726221, + "grad_norm": 0.78125, + "learning_rate": 0.00012513982981036468, + "loss": 0.7741, + "step": 28132 + }, + { + "epoch": 0.722376552868544, + "grad_norm": 0.73828125, + "learning_rate": 0.0001251355089571786, + "loss": 0.7882, + "step": 28133 + }, + { + "epoch": 0.7224022300644658, + "grad_norm": 0.79296875, + "learning_rate": 0.00012513118805389971, + "loss": 0.828, + "step": 28134 + }, + { + "epoch": 0.7224279072603876, + "grad_norm": 0.70703125, + "learning_rate": 0.00012512686710053664, + "loss": 0.7474, + "step": 28135 + }, + { + "epoch": 0.7224535844563095, + "grad_norm": 0.78125, + "learning_rate": 0.00012512254609709802, + "loss": 0.8882, + "step": 28136 + }, + { + "epoch": 0.7224792616522313, + "grad_norm": 0.7265625, + "learning_rate": 0.0001251182250435924, + "loss": 0.8204, + "step": 28137 + }, + { + "epoch": 0.722504938848153, + "grad_norm": 0.71875, + "learning_rate": 0.00012511390394002845, + "loss": 0.9029, + "step": 28138 + }, + { + "epoch": 0.7225306160440749, + "grad_norm": 0.76171875, + "learning_rate": 0.0001251095827864147, + "loss": 0.7308, + "step": 28139 + }, + { + "epoch": 0.7225562932399967, + "grad_norm": 0.78515625, + "learning_rate": 0.00012510526158275988, + "loss": 0.9126, + "step": 28140 + }, + { + "epoch": 0.7225819704359185, + "grad_norm": 0.73046875, + "learning_rate": 0.00012510094032907253, + "loss": 0.8427, + "step": 28141 + }, + { + "epoch": 0.7226076476318404, + "grad_norm": 0.76171875, + "learning_rate": 0.00012509661902536124, + "loss": 0.7899, + "step": 28142 + }, + { + "epoch": 0.7226333248277622, + "grad_norm": 0.76953125, + "learning_rate": 0.00012509229767163472, + "loss": 0.885, + "step": 28143 + }, + { + "epoch": 0.722659002023684, + "grad_norm": 0.6953125, + "learning_rate": 0.0001250879762679015, + "loss": 0.8667, + "step": 28144 + }, + { + "epoch": 0.7226846792196058, + "grad_norm": 0.75390625, + "learning_rate": 0.00012508365481417018, + "loss": 0.7803, + "step": 28145 + }, + { + "epoch": 0.7227103564155276, + "grad_norm": 0.78515625, + "learning_rate": 0.00012507933331044942, + "loss": 0.7612, + "step": 28146 + }, + { + "epoch": 0.7227360336114494, + "grad_norm": 0.83984375, + "learning_rate": 0.00012507501175674777, + "loss": 0.9542, + "step": 28147 + }, + { + "epoch": 0.7227617108073713, + "grad_norm": 0.703125, + "learning_rate": 0.00012507069015307395, + "loss": 0.79, + "step": 28148 + }, + { + "epoch": 0.7227873880032931, + "grad_norm": 0.80078125, + "learning_rate": 0.00012506636849943646, + "loss": 0.8401, + "step": 28149 + }, + { + "epoch": 0.722813065199215, + "grad_norm": 0.7734375, + "learning_rate": 0.000125062046795844, + "loss": 0.763, + "step": 28150 + }, + { + "epoch": 0.7228387423951368, + "grad_norm": 0.8203125, + "learning_rate": 0.00012505772504230514, + "loss": 0.9193, + "step": 28151 + }, + { + "epoch": 0.7228644195910585, + "grad_norm": 0.73046875, + "learning_rate": 0.0001250534032388285, + "loss": 0.8085, + "step": 28152 + }, + { + "epoch": 0.7228900967869804, + "grad_norm": 0.85546875, + "learning_rate": 0.00012504908138542264, + "loss": 0.8186, + "step": 28153 + }, + { + "epoch": 0.7229157739829022, + "grad_norm": 0.7890625, + "learning_rate": 0.0001250447594820963, + "loss": 0.871, + "step": 28154 + }, + { + "epoch": 0.722941451178824, + "grad_norm": 0.796875, + "learning_rate": 0.00012504043752885796, + "loss": 0.8373, + "step": 28155 + }, + { + "epoch": 0.7229671283747459, + "grad_norm": 0.86328125, + "learning_rate": 0.0001250361155257163, + "loss": 0.8286, + "step": 28156 + }, + { + "epoch": 0.7229928055706677, + "grad_norm": 0.8046875, + "learning_rate": 0.00012503179347267995, + "loss": 0.7325, + "step": 28157 + }, + { + "epoch": 0.7230184827665894, + "grad_norm": 0.75390625, + "learning_rate": 0.00012502747136975746, + "loss": 0.8305, + "step": 28158 + }, + { + "epoch": 0.7230441599625113, + "grad_norm": 0.78125, + "learning_rate": 0.00012502314921695751, + "loss": 0.8319, + "step": 28159 + }, + { + "epoch": 0.7230698371584331, + "grad_norm": 0.7421875, + "learning_rate": 0.00012501882701428866, + "loss": 0.8722, + "step": 28160 + }, + { + "epoch": 0.7230955143543549, + "grad_norm": 0.79296875, + "learning_rate": 0.00012501450476175955, + "loss": 0.7816, + "step": 28161 + }, + { + "epoch": 0.7231211915502768, + "grad_norm": 0.94921875, + "learning_rate": 0.0001250101824593788, + "loss": 0.8898, + "step": 28162 + }, + { + "epoch": 0.7231468687461986, + "grad_norm": 0.73828125, + "learning_rate": 0.000125005860107155, + "loss": 0.8359, + "step": 28163 + }, + { + "epoch": 0.7231725459421204, + "grad_norm": 0.7890625, + "learning_rate": 0.00012500153770509683, + "loss": 0.8115, + "step": 28164 + }, + { + "epoch": 0.7231982231380422, + "grad_norm": 0.69921875, + "learning_rate": 0.0001249972152532128, + "loss": 0.8437, + "step": 28165 + }, + { + "epoch": 0.723223900333964, + "grad_norm": 0.81640625, + "learning_rate": 0.0001249928927515116, + "loss": 0.8008, + "step": 28166 + }, + { + "epoch": 0.7232495775298858, + "grad_norm": 0.921875, + "learning_rate": 0.00012498857020000183, + "loss": 0.959, + "step": 28167 + }, + { + "epoch": 0.7232752547258077, + "grad_norm": 0.78125, + "learning_rate": 0.00012498424759869206, + "loss": 0.8654, + "step": 28168 + }, + { + "epoch": 0.7233009319217295, + "grad_norm": 0.80078125, + "learning_rate": 0.00012497992494759098, + "loss": 0.7671, + "step": 28169 + }, + { + "epoch": 0.7233266091176513, + "grad_norm": 0.734375, + "learning_rate": 0.00012497560224670716, + "loss": 0.7232, + "step": 28170 + }, + { + "epoch": 0.7233522863135732, + "grad_norm": 0.8046875, + "learning_rate": 0.00012497127949604922, + "loss": 0.8646, + "step": 28171 + }, + { + "epoch": 0.7233779635094949, + "grad_norm": 0.75390625, + "learning_rate": 0.00012496695669562578, + "loss": 0.7727, + "step": 28172 + }, + { + "epoch": 0.7234036407054167, + "grad_norm": 0.78125, + "learning_rate": 0.00012496263384544545, + "loss": 0.823, + "step": 28173 + }, + { + "epoch": 0.7234293179013386, + "grad_norm": 0.7578125, + "learning_rate": 0.0001249583109455168, + "loss": 0.8189, + "step": 28174 + }, + { + "epoch": 0.7234549950972604, + "grad_norm": 0.73828125, + "learning_rate": 0.00012495398799584854, + "loss": 0.7668, + "step": 28175 + }, + { + "epoch": 0.7234806722931822, + "grad_norm": 0.78515625, + "learning_rate": 0.00012494966499644922, + "loss": 0.7194, + "step": 28176 + }, + { + "epoch": 0.7235063494891041, + "grad_norm": 0.8203125, + "learning_rate": 0.00012494534194732747, + "loss": 0.803, + "step": 28177 + }, + { + "epoch": 0.7235320266850258, + "grad_norm": 0.76953125, + "learning_rate": 0.0001249410188484919, + "loss": 0.9249, + "step": 28178 + }, + { + "epoch": 0.7235577038809476, + "grad_norm": 0.87109375, + "learning_rate": 0.00012493669569995115, + "loss": 0.8826, + "step": 28179 + }, + { + "epoch": 0.7235833810768695, + "grad_norm": 0.8046875, + "learning_rate": 0.00012493237250171382, + "loss": 0.7843, + "step": 28180 + }, + { + "epoch": 0.7236090582727913, + "grad_norm": 0.71875, + "learning_rate": 0.00012492804925378853, + "loss": 0.8569, + "step": 28181 + }, + { + "epoch": 0.7236347354687132, + "grad_norm": 0.734375, + "learning_rate": 0.00012492372595618385, + "loss": 0.9736, + "step": 28182 + }, + { + "epoch": 0.723660412664635, + "grad_norm": 0.73828125, + "learning_rate": 0.00012491940260890848, + "loss": 0.8662, + "step": 28183 + }, + { + "epoch": 0.7236860898605568, + "grad_norm": 0.78125, + "learning_rate": 0.00012491507921197099, + "loss": 0.7595, + "step": 28184 + }, + { + "epoch": 0.7237117670564785, + "grad_norm": 0.7421875, + "learning_rate": 0.00012491075576537996, + "loss": 0.8982, + "step": 28185 + }, + { + "epoch": 0.7237374442524004, + "grad_norm": 0.92578125, + "learning_rate": 0.00012490643226914408, + "loss": 0.8309, + "step": 28186 + }, + { + "epoch": 0.7237631214483222, + "grad_norm": 0.73828125, + "learning_rate": 0.00012490210872327187, + "loss": 0.8972, + "step": 28187 + }, + { + "epoch": 0.7237887986442441, + "grad_norm": 0.69921875, + "learning_rate": 0.00012489778512777205, + "loss": 0.7511, + "step": 28188 + }, + { + "epoch": 0.7238144758401659, + "grad_norm": 0.7578125, + "learning_rate": 0.00012489346148265325, + "loss": 0.842, + "step": 28189 + }, + { + "epoch": 0.7238401530360877, + "grad_norm": 0.80078125, + "learning_rate": 0.00012488913778792394, + "loss": 0.8501, + "step": 28190 + }, + { + "epoch": 0.7238658302320096, + "grad_norm": 0.69140625, + "learning_rate": 0.00012488481404359286, + "loss": 0.8579, + "step": 28191 + }, + { + "epoch": 0.7238915074279313, + "grad_norm": 0.7734375, + "learning_rate": 0.00012488049024966862, + "loss": 0.9152, + "step": 28192 + }, + { + "epoch": 0.7239171846238531, + "grad_norm": 0.91015625, + "learning_rate": 0.00012487616640615977, + "loss": 0.8593, + "step": 28193 + }, + { + "epoch": 0.723942861819775, + "grad_norm": 0.77734375, + "learning_rate": 0.000124871842513075, + "loss": 0.8707, + "step": 28194 + }, + { + "epoch": 0.7239685390156968, + "grad_norm": 0.78125, + "learning_rate": 0.00012486751857042285, + "loss": 0.8043, + "step": 28195 + }, + { + "epoch": 0.7239942162116186, + "grad_norm": 0.8515625, + "learning_rate": 0.000124863194578212, + "loss": 0.8251, + "step": 28196 + }, + { + "epoch": 0.7240198934075405, + "grad_norm": 0.83203125, + "learning_rate": 0.00012485887053645107, + "loss": 0.8776, + "step": 28197 + }, + { + "epoch": 0.7240455706034622, + "grad_norm": 0.8359375, + "learning_rate": 0.00012485454644514863, + "loss": 0.8242, + "step": 28198 + }, + { + "epoch": 0.724071247799384, + "grad_norm": 0.7421875, + "learning_rate": 0.00012485022230431333, + "loss": 0.7787, + "step": 28199 + }, + { + "epoch": 0.7240969249953059, + "grad_norm": 0.7265625, + "learning_rate": 0.0001248458981139538, + "loss": 0.7264, + "step": 28200 + }, + { + "epoch": 0.7241226021912277, + "grad_norm": 0.83203125, + "learning_rate": 0.00012484157387407862, + "loss": 0.9453, + "step": 28201 + }, + { + "epoch": 0.7241482793871495, + "grad_norm": 0.95703125, + "learning_rate": 0.00012483724958469648, + "loss": 0.812, + "step": 28202 + }, + { + "epoch": 0.7241739565830714, + "grad_norm": 0.8046875, + "learning_rate": 0.00012483292524581588, + "loss": 0.9243, + "step": 28203 + }, + { + "epoch": 0.7241996337789932, + "grad_norm": 0.84375, + "learning_rate": 0.0001248286008574455, + "loss": 0.7922, + "step": 28204 + }, + { + "epoch": 0.7242253109749149, + "grad_norm": 0.92578125, + "learning_rate": 0.000124824276419594, + "loss": 0.8714, + "step": 28205 + }, + { + "epoch": 0.7242509881708368, + "grad_norm": 0.828125, + "learning_rate": 0.00012481995193226993, + "loss": 0.8426, + "step": 28206 + }, + { + "epoch": 0.7242766653667586, + "grad_norm": 0.74609375, + "learning_rate": 0.00012481562739548197, + "loss": 0.7925, + "step": 28207 + }, + { + "epoch": 0.7243023425626804, + "grad_norm": 0.79296875, + "learning_rate": 0.00012481130280923868, + "loss": 0.9107, + "step": 28208 + }, + { + "epoch": 0.7243280197586023, + "grad_norm": 0.7890625, + "learning_rate": 0.0001248069781735487, + "loss": 0.8587, + "step": 28209 + }, + { + "epoch": 0.7243536969545241, + "grad_norm": 0.765625, + "learning_rate": 0.0001248026534884207, + "loss": 0.9149, + "step": 28210 + }, + { + "epoch": 0.724379374150446, + "grad_norm": 0.765625, + "learning_rate": 0.00012479832875386318, + "loss": 0.8921, + "step": 28211 + }, + { + "epoch": 0.7244050513463677, + "grad_norm": 0.8203125, + "learning_rate": 0.0001247940039698849, + "loss": 0.7404, + "step": 28212 + }, + { + "epoch": 0.7244307285422895, + "grad_norm": 0.75, + "learning_rate": 0.00012478967913649437, + "loss": 0.759, + "step": 28213 + }, + { + "epoch": 0.7244564057382114, + "grad_norm": 0.80859375, + "learning_rate": 0.00012478535425370025, + "loss": 0.8525, + "step": 28214 + }, + { + "epoch": 0.7244820829341332, + "grad_norm": 0.79296875, + "learning_rate": 0.0001247810293215112, + "loss": 0.9254, + "step": 28215 + }, + { + "epoch": 0.724507760130055, + "grad_norm": 0.7890625, + "learning_rate": 0.00012477670433993574, + "loss": 0.8296, + "step": 28216 + }, + { + "epoch": 0.7245334373259769, + "grad_norm": 0.77734375, + "learning_rate": 0.00012477237930898257, + "loss": 0.936, + "step": 28217 + }, + { + "epoch": 0.7245591145218986, + "grad_norm": 0.78515625, + "learning_rate": 0.00012476805422866028, + "loss": 0.7901, + "step": 28218 + }, + { + "epoch": 0.7245847917178204, + "grad_norm": 0.77734375, + "learning_rate": 0.0001247637290989775, + "loss": 0.8487, + "step": 28219 + }, + { + "epoch": 0.7246104689137423, + "grad_norm": 0.78125, + "learning_rate": 0.00012475940391994282, + "loss": 0.8036, + "step": 28220 + }, + { + "epoch": 0.7246361461096641, + "grad_norm": 0.75390625, + "learning_rate": 0.00012475507869156492, + "loss": 0.8209, + "step": 28221 + }, + { + "epoch": 0.7246618233055859, + "grad_norm": 0.7578125, + "learning_rate": 0.00012475075341385238, + "loss": 0.8051, + "step": 28222 + }, + { + "epoch": 0.7246875005015078, + "grad_norm": 0.765625, + "learning_rate": 0.00012474642808681381, + "loss": 0.9628, + "step": 28223 + }, + { + "epoch": 0.7247131776974296, + "grad_norm": 0.76953125, + "learning_rate": 0.00012474210271045785, + "loss": 0.7571, + "step": 28224 + }, + { + "epoch": 0.7247388548933513, + "grad_norm": 0.8125, + "learning_rate": 0.00012473777728479312, + "loss": 0.9767, + "step": 28225 + }, + { + "epoch": 0.7247645320892732, + "grad_norm": 0.796875, + "learning_rate": 0.00012473345180982824, + "loss": 0.9211, + "step": 28226 + }, + { + "epoch": 0.724790209285195, + "grad_norm": 0.80859375, + "learning_rate": 0.0001247291262855718, + "loss": 0.811, + "step": 28227 + }, + { + "epoch": 0.7248158864811168, + "grad_norm": 0.80078125, + "learning_rate": 0.00012472480071203248, + "loss": 0.7424, + "step": 28228 + }, + { + "epoch": 0.7248415636770387, + "grad_norm": 0.79296875, + "learning_rate": 0.00012472047508921887, + "loss": 0.7979, + "step": 28229 + }, + { + "epoch": 0.7248672408729605, + "grad_norm": 0.8203125, + "learning_rate": 0.00012471614941713953, + "loss": 0.8837, + "step": 28230 + }, + { + "epoch": 0.7248929180688823, + "grad_norm": 0.7421875, + "learning_rate": 0.0001247118236958032, + "loss": 0.8392, + "step": 28231 + }, + { + "epoch": 0.7249185952648041, + "grad_norm": 0.8359375, + "learning_rate": 0.0001247074979252184, + "loss": 0.9365, + "step": 28232 + }, + { + "epoch": 0.7249442724607259, + "grad_norm": 0.74609375, + "learning_rate": 0.00012470317210539378, + "loss": 0.8306, + "step": 28233 + }, + { + "epoch": 0.7249699496566477, + "grad_norm": 0.828125, + "learning_rate": 0.000124698846236338, + "loss": 0.8131, + "step": 28234 + }, + { + "epoch": 0.7249956268525696, + "grad_norm": 0.8046875, + "learning_rate": 0.00012469452031805962, + "loss": 0.8184, + "step": 28235 + }, + { + "epoch": 0.7250213040484914, + "grad_norm": 0.80859375, + "learning_rate": 0.0001246901943505673, + "loss": 0.8637, + "step": 28236 + }, + { + "epoch": 0.7250469812444132, + "grad_norm": 0.79296875, + "learning_rate": 0.00012468586833386968, + "loss": 0.8725, + "step": 28237 + }, + { + "epoch": 0.725072658440335, + "grad_norm": 0.72265625, + "learning_rate": 0.0001246815422679753, + "loss": 0.7312, + "step": 28238 + }, + { + "epoch": 0.7250983356362568, + "grad_norm": 0.8046875, + "learning_rate": 0.0001246772161528929, + "loss": 0.7211, + "step": 28239 + }, + { + "epoch": 0.7251240128321786, + "grad_norm": 1.0078125, + "learning_rate": 0.000124672889988631, + "loss": 0.8001, + "step": 28240 + }, + { + "epoch": 0.7251496900281005, + "grad_norm": 0.70703125, + "learning_rate": 0.00012466856377519824, + "loss": 0.7656, + "step": 28241 + }, + { + "epoch": 0.7251753672240223, + "grad_norm": 0.94140625, + "learning_rate": 0.00012466423751260333, + "loss": 0.7813, + "step": 28242 + }, + { + "epoch": 0.7252010444199442, + "grad_norm": 0.83203125, + "learning_rate": 0.00012465991120085473, + "loss": 0.9227, + "step": 28243 + }, + { + "epoch": 0.725226721615866, + "grad_norm": 0.78125, + "learning_rate": 0.00012465558483996123, + "loss": 0.8256, + "step": 28244 + }, + { + "epoch": 0.7252523988117877, + "grad_norm": 0.87890625, + "learning_rate": 0.00012465125842993135, + "loss": 0.898, + "step": 28245 + }, + { + "epoch": 0.7252780760077095, + "grad_norm": 0.78125, + "learning_rate": 0.00012464693197077373, + "loss": 0.9104, + "step": 28246 + }, + { + "epoch": 0.7253037532036314, + "grad_norm": 0.73828125, + "learning_rate": 0.000124642605462497, + "loss": 0.8095, + "step": 28247 + }, + { + "epoch": 0.7253294303995532, + "grad_norm": 0.80078125, + "learning_rate": 0.0001246382789051098, + "loss": 0.8749, + "step": 28248 + }, + { + "epoch": 0.7253551075954751, + "grad_norm": 0.80078125, + "learning_rate": 0.00012463395229862074, + "loss": 0.9571, + "step": 28249 + }, + { + "epoch": 0.7253807847913969, + "grad_norm": 0.80078125, + "learning_rate": 0.00012462962564303845, + "loss": 0.8301, + "step": 28250 + }, + { + "epoch": 0.7254064619873187, + "grad_norm": 0.79296875, + "learning_rate": 0.00012462529893837146, + "loss": 0.8472, + "step": 28251 + }, + { + "epoch": 0.7254321391832405, + "grad_norm": 0.80859375, + "learning_rate": 0.00012462097218462857, + "loss": 0.907, + "step": 28252 + }, + { + "epoch": 0.7254578163791623, + "grad_norm": 0.7734375, + "learning_rate": 0.00012461664538181825, + "loss": 0.9452, + "step": 28253 + }, + { + "epoch": 0.7254834935750841, + "grad_norm": 0.87109375, + "learning_rate": 0.00012461231852994918, + "loss": 0.8228, + "step": 28254 + }, + { + "epoch": 0.725509170771006, + "grad_norm": 0.78125, + "learning_rate": 0.00012460799162903002, + "loss": 0.7552, + "step": 28255 + }, + { + "epoch": 0.7255348479669278, + "grad_norm": 0.7734375, + "learning_rate": 0.0001246036646790693, + "loss": 0.7595, + "step": 28256 + }, + { + "epoch": 0.7255605251628496, + "grad_norm": 0.78515625, + "learning_rate": 0.00012459933768007577, + "loss": 0.7719, + "step": 28257 + }, + { + "epoch": 0.7255862023587714, + "grad_norm": 0.77734375, + "learning_rate": 0.00012459501063205797, + "loss": 0.8704, + "step": 28258 + }, + { + "epoch": 0.7256118795546932, + "grad_norm": 0.75, + "learning_rate": 0.00012459068353502448, + "loss": 0.8066, + "step": 28259 + }, + { + "epoch": 0.725637556750615, + "grad_norm": 0.76171875, + "learning_rate": 0.00012458635638898401, + "loss": 0.7938, + "step": 28260 + }, + { + "epoch": 0.7256632339465369, + "grad_norm": 0.7890625, + "learning_rate": 0.00012458202919394515, + "loss": 0.7634, + "step": 28261 + }, + { + "epoch": 0.7256889111424587, + "grad_norm": 0.77734375, + "learning_rate": 0.00012457770194991652, + "loss": 0.8307, + "step": 28262 + }, + { + "epoch": 0.7257145883383805, + "grad_norm": 0.69921875, + "learning_rate": 0.0001245733746569068, + "loss": 0.7907, + "step": 28263 + }, + { + "epoch": 0.7257402655343024, + "grad_norm": 0.828125, + "learning_rate": 0.00012456904731492449, + "loss": 0.8117, + "step": 28264 + }, + { + "epoch": 0.7257659427302241, + "grad_norm": 0.76953125, + "learning_rate": 0.00012456471992397834, + "loss": 0.7977, + "step": 28265 + }, + { + "epoch": 0.7257916199261459, + "grad_norm": 0.79296875, + "learning_rate": 0.00012456039248407688, + "loss": 0.8511, + "step": 28266 + }, + { + "epoch": 0.7258172971220678, + "grad_norm": 0.79296875, + "learning_rate": 0.0001245560649952288, + "loss": 0.841, + "step": 28267 + }, + { + "epoch": 0.7258429743179896, + "grad_norm": 0.79296875, + "learning_rate": 0.00012455173745744272, + "loss": 0.8319, + "step": 28268 + }, + { + "epoch": 0.7258686515139114, + "grad_norm": 0.8046875, + "learning_rate": 0.00012454740987072723, + "loss": 0.7482, + "step": 28269 + }, + { + "epoch": 0.7258943287098333, + "grad_norm": 0.828125, + "learning_rate": 0.00012454308223509096, + "loss": 0.7017, + "step": 28270 + }, + { + "epoch": 0.7259200059057551, + "grad_norm": 0.78125, + "learning_rate": 0.00012453875455054256, + "loss": 0.9517, + "step": 28271 + }, + { + "epoch": 0.7259456831016768, + "grad_norm": 0.6796875, + "learning_rate": 0.0001245344268170906, + "loss": 0.6883, + "step": 28272 + }, + { + "epoch": 0.7259713602975987, + "grad_norm": 0.828125, + "learning_rate": 0.0001245300990347438, + "loss": 0.884, + "step": 28273 + }, + { + "epoch": 0.7259970374935205, + "grad_norm": 0.8125, + "learning_rate": 0.00012452577120351067, + "loss": 0.9185, + "step": 28274 + }, + { + "epoch": 0.7260227146894424, + "grad_norm": 0.8125, + "learning_rate": 0.00012452144332339993, + "loss": 0.8111, + "step": 28275 + }, + { + "epoch": 0.7260483918853642, + "grad_norm": 0.765625, + "learning_rate": 0.00012451711539442016, + "loss": 0.8322, + "step": 28276 + }, + { + "epoch": 0.726074069081286, + "grad_norm": 0.83984375, + "learning_rate": 0.00012451278741658, + "loss": 0.8328, + "step": 28277 + }, + { + "epoch": 0.7260997462772077, + "grad_norm": 1.6640625, + "learning_rate": 0.00012450845938988806, + "loss": 0.8907, + "step": 28278 + }, + { + "epoch": 0.7261254234731296, + "grad_norm": 0.71875, + "learning_rate": 0.000124504131314353, + "loss": 0.8309, + "step": 28279 + }, + { + "epoch": 0.7261511006690514, + "grad_norm": 0.703125, + "learning_rate": 0.00012449980318998338, + "loss": 0.7669, + "step": 28280 + }, + { + "epoch": 0.7261767778649733, + "grad_norm": 0.71875, + "learning_rate": 0.0001244954750167879, + "loss": 0.8231, + "step": 28281 + }, + { + "epoch": 0.7262024550608951, + "grad_norm": 0.8203125, + "learning_rate": 0.00012449114679477513, + "loss": 0.8726, + "step": 28282 + }, + { + "epoch": 0.7262281322568169, + "grad_norm": 0.80859375, + "learning_rate": 0.0001244868185239537, + "loss": 0.8623, + "step": 28283 + }, + { + "epoch": 0.7262538094527388, + "grad_norm": 1.0, + "learning_rate": 0.0001244824902043323, + "loss": 0.753, + "step": 28284 + }, + { + "epoch": 0.7262794866486605, + "grad_norm": 0.7421875, + "learning_rate": 0.00012447816183591947, + "loss": 0.8453, + "step": 28285 + }, + { + "epoch": 0.7263051638445823, + "grad_norm": 0.76171875, + "learning_rate": 0.00012447383341872387, + "loss": 0.8218, + "step": 28286 + }, + { + "epoch": 0.7263308410405042, + "grad_norm": 0.70703125, + "learning_rate": 0.00012446950495275416, + "loss": 0.7893, + "step": 28287 + }, + { + "epoch": 0.726356518236426, + "grad_norm": 0.8046875, + "learning_rate": 0.0001244651764380189, + "loss": 0.8497, + "step": 28288 + }, + { + "epoch": 0.7263821954323478, + "grad_norm": 0.828125, + "learning_rate": 0.0001244608478745268, + "loss": 0.882, + "step": 28289 + }, + { + "epoch": 0.7264078726282697, + "grad_norm": 0.7734375, + "learning_rate": 0.00012445651926228644, + "loss": 0.8982, + "step": 28290 + }, + { + "epoch": 0.7264335498241915, + "grad_norm": 0.76171875, + "learning_rate": 0.0001244521906013064, + "loss": 0.7937, + "step": 28291 + }, + { + "epoch": 0.7264592270201132, + "grad_norm": 0.81640625, + "learning_rate": 0.00012444786189159536, + "loss": 0.8195, + "step": 28292 + }, + { + "epoch": 0.7264849042160351, + "grad_norm": 0.80859375, + "learning_rate": 0.00012444353313316198, + "loss": 0.8569, + "step": 28293 + }, + { + "epoch": 0.7265105814119569, + "grad_norm": 0.8046875, + "learning_rate": 0.00012443920432601483, + "loss": 0.8654, + "step": 28294 + }, + { + "epoch": 0.7265362586078787, + "grad_norm": 0.76953125, + "learning_rate": 0.00012443487547016255, + "loss": 0.934, + "step": 28295 + }, + { + "epoch": 0.7265619358038006, + "grad_norm": 0.69921875, + "learning_rate": 0.00012443054656561376, + "loss": 0.7637, + "step": 28296 + }, + { + "epoch": 0.7265876129997224, + "grad_norm": 0.7890625, + "learning_rate": 0.00012442621761237708, + "loss": 0.94, + "step": 28297 + }, + { + "epoch": 0.7266132901956441, + "grad_norm": 0.76171875, + "learning_rate": 0.0001244218886104612, + "loss": 0.7847, + "step": 28298 + }, + { + "epoch": 0.726638967391566, + "grad_norm": 0.8671875, + "learning_rate": 0.00012441755955987466, + "loss": 0.9342, + "step": 28299 + }, + { + "epoch": 0.7266646445874878, + "grad_norm": 0.74609375, + "learning_rate": 0.00012441323046062617, + "loss": 0.8146, + "step": 28300 + }, + { + "epoch": 0.7266903217834096, + "grad_norm": 0.8984375, + "learning_rate": 0.0001244089013127243, + "loss": 0.8427, + "step": 28301 + }, + { + "epoch": 0.7267159989793315, + "grad_norm": 0.80078125, + "learning_rate": 0.0001244045721161777, + "loss": 0.8068, + "step": 28302 + }, + { + "epoch": 0.7267416761752533, + "grad_norm": 0.84375, + "learning_rate": 0.00012440024287099498, + "loss": 0.8938, + "step": 28303 + }, + { + "epoch": 0.7267673533711752, + "grad_norm": 0.73046875, + "learning_rate": 0.00012439591357718476, + "loss": 0.7363, + "step": 28304 + }, + { + "epoch": 0.7267930305670969, + "grad_norm": 0.82421875, + "learning_rate": 0.00012439158423475572, + "loss": 0.9281, + "step": 28305 + }, + { + "epoch": 0.7268187077630187, + "grad_norm": 0.70703125, + "learning_rate": 0.00012438725484371644, + "loss": 0.8973, + "step": 28306 + }, + { + "epoch": 0.7268443849589405, + "grad_norm": 0.74609375, + "learning_rate": 0.00012438292540407558, + "loss": 0.717, + "step": 28307 + }, + { + "epoch": 0.7268700621548624, + "grad_norm": 0.7421875, + "learning_rate": 0.00012437859591584176, + "loss": 0.8416, + "step": 28308 + }, + { + "epoch": 0.7268957393507842, + "grad_norm": 0.78125, + "learning_rate": 0.0001243742663790236, + "loss": 0.9053, + "step": 28309 + }, + { + "epoch": 0.7269214165467061, + "grad_norm": 0.85546875, + "learning_rate": 0.00012436993679362966, + "loss": 0.9302, + "step": 28310 + }, + { + "epoch": 0.7269470937426278, + "grad_norm": 0.8203125, + "learning_rate": 0.00012436560715966869, + "loss": 0.8548, + "step": 28311 + }, + { + "epoch": 0.7269727709385496, + "grad_norm": 0.82421875, + "learning_rate": 0.00012436127747714927, + "loss": 0.7594, + "step": 28312 + }, + { + "epoch": 0.7269984481344715, + "grad_norm": 0.73828125, + "learning_rate": 0.00012435694774608002, + "loss": 0.897, + "step": 28313 + }, + { + "epoch": 0.7270241253303933, + "grad_norm": 0.828125, + "learning_rate": 0.0001243526179664696, + "loss": 0.9095, + "step": 28314 + }, + { + "epoch": 0.7270498025263151, + "grad_norm": 0.76953125, + "learning_rate": 0.00012434828813832658, + "loss": 0.7812, + "step": 28315 + }, + { + "epoch": 0.727075479722237, + "grad_norm": 0.71484375, + "learning_rate": 0.00012434395826165962, + "loss": 0.7631, + "step": 28316 + }, + { + "epoch": 0.7271011569181588, + "grad_norm": 0.7890625, + "learning_rate": 0.00012433962833647736, + "loss": 0.8439, + "step": 28317 + }, + { + "epoch": 0.7271268341140805, + "grad_norm": 0.8046875, + "learning_rate": 0.0001243352983627884, + "loss": 0.7533, + "step": 28318 + }, + { + "epoch": 0.7271525113100024, + "grad_norm": 0.88671875, + "learning_rate": 0.0001243309683406014, + "loss": 1.0389, + "step": 28319 + }, + { + "epoch": 0.7271781885059242, + "grad_norm": 0.703125, + "learning_rate": 0.00012432663826992502, + "loss": 0.7097, + "step": 28320 + }, + { + "epoch": 0.727203865701846, + "grad_norm": 0.76171875, + "learning_rate": 0.0001243223081507678, + "loss": 0.8844, + "step": 28321 + }, + { + "epoch": 0.7272295428977679, + "grad_norm": 0.7109375, + "learning_rate": 0.0001243179779831384, + "loss": 0.6785, + "step": 28322 + }, + { + "epoch": 0.7272552200936897, + "grad_norm": 0.79296875, + "learning_rate": 0.00012431364776704552, + "loss": 0.8554, + "step": 28323 + }, + { + "epoch": 0.7272808972896115, + "grad_norm": 0.73828125, + "learning_rate": 0.0001243093175024977, + "loss": 0.7745, + "step": 28324 + }, + { + "epoch": 0.7273065744855333, + "grad_norm": 0.83984375, + "learning_rate": 0.00012430498718950364, + "loss": 1.0062, + "step": 28325 + }, + { + "epoch": 0.7273322516814551, + "grad_norm": 0.7421875, + "learning_rate": 0.00012430065682807187, + "loss": 0.876, + "step": 28326 + }, + { + "epoch": 0.7273579288773769, + "grad_norm": 0.80859375, + "learning_rate": 0.00012429632641821115, + "loss": 0.9707, + "step": 28327 + }, + { + "epoch": 0.7273836060732988, + "grad_norm": 0.734375, + "learning_rate": 0.00012429199595993, + "loss": 0.6398, + "step": 28328 + }, + { + "epoch": 0.7274092832692206, + "grad_norm": 0.73828125, + "learning_rate": 0.00012428766545323714, + "loss": 0.8749, + "step": 28329 + }, + { + "epoch": 0.7274349604651424, + "grad_norm": 0.8203125, + "learning_rate": 0.00012428333489814114, + "loss": 0.8828, + "step": 28330 + }, + { + "epoch": 0.7274606376610642, + "grad_norm": 0.80078125, + "learning_rate": 0.00012427900429465063, + "loss": 0.8694, + "step": 28331 + }, + { + "epoch": 0.727486314856986, + "grad_norm": 0.734375, + "learning_rate": 0.00012427467364277428, + "loss": 0.9016, + "step": 28332 + }, + { + "epoch": 0.7275119920529078, + "grad_norm": 0.82421875, + "learning_rate": 0.00012427034294252068, + "loss": 0.892, + "step": 28333 + }, + { + "epoch": 0.7275376692488297, + "grad_norm": 0.7421875, + "learning_rate": 0.0001242660121938985, + "loss": 0.8073, + "step": 28334 + }, + { + "epoch": 0.7275633464447515, + "grad_norm": 0.81640625, + "learning_rate": 0.00012426168139691637, + "loss": 0.8791, + "step": 28335 + }, + { + "epoch": 0.7275890236406733, + "grad_norm": 0.8046875, + "learning_rate": 0.00012425735055158284, + "loss": 0.8722, + "step": 28336 + }, + { + "epoch": 0.7276147008365952, + "grad_norm": 0.78125, + "learning_rate": 0.00012425301965790663, + "loss": 0.8855, + "step": 28337 + }, + { + "epoch": 0.7276403780325169, + "grad_norm": 0.7734375, + "learning_rate": 0.00012424868871589633, + "loss": 0.8537, + "step": 28338 + }, + { + "epoch": 0.7276660552284387, + "grad_norm": 0.87890625, + "learning_rate": 0.0001242443577255606, + "loss": 0.7779, + "step": 28339 + }, + { + "epoch": 0.7276917324243606, + "grad_norm": 0.79296875, + "learning_rate": 0.00012424002668690807, + "loss": 0.8314, + "step": 28340 + }, + { + "epoch": 0.7277174096202824, + "grad_norm": 0.83203125, + "learning_rate": 0.00012423569559994732, + "loss": 0.9875, + "step": 28341 + }, + { + "epoch": 0.7277430868162043, + "grad_norm": 0.796875, + "learning_rate": 0.00012423136446468704, + "loss": 0.6615, + "step": 28342 + }, + { + "epoch": 0.7277687640121261, + "grad_norm": 0.77734375, + "learning_rate": 0.00012422703328113585, + "loss": 0.921, + "step": 28343 + }, + { + "epoch": 0.7277944412080479, + "grad_norm": 0.8203125, + "learning_rate": 0.00012422270204930232, + "loss": 0.8421, + "step": 28344 + }, + { + "epoch": 0.7278201184039697, + "grad_norm": 0.74609375, + "learning_rate": 0.0001242183707691952, + "loss": 0.7407, + "step": 28345 + }, + { + "epoch": 0.7278457955998915, + "grad_norm": 1.109375, + "learning_rate": 0.00012421403944082302, + "loss": 0.8747, + "step": 28346 + }, + { + "epoch": 0.7278714727958133, + "grad_norm": 0.81640625, + "learning_rate": 0.00012420970806419445, + "loss": 0.7731, + "step": 28347 + }, + { + "epoch": 0.7278971499917352, + "grad_norm": 0.83984375, + "learning_rate": 0.00012420537663931813, + "loss": 0.7787, + "step": 28348 + }, + { + "epoch": 0.727922827187657, + "grad_norm": 0.8125, + "learning_rate": 0.00012420104516620265, + "loss": 0.7477, + "step": 28349 + }, + { + "epoch": 0.7279485043835788, + "grad_norm": 0.73828125, + "learning_rate": 0.0001241967136448567, + "loss": 0.7431, + "step": 28350 + }, + { + "epoch": 0.7279741815795006, + "grad_norm": 0.7734375, + "learning_rate": 0.00012419238207528888, + "loss": 0.9111, + "step": 28351 + }, + { + "epoch": 0.7279998587754224, + "grad_norm": 0.796875, + "learning_rate": 0.0001241880504575078, + "loss": 0.8442, + "step": 28352 + }, + { + "epoch": 0.7280255359713442, + "grad_norm": 0.7734375, + "learning_rate": 0.00012418371879152217, + "loss": 0.8502, + "step": 28353 + }, + { + "epoch": 0.7280512131672661, + "grad_norm": 0.8046875, + "learning_rate": 0.0001241793870773406, + "loss": 0.8958, + "step": 28354 + }, + { + "epoch": 0.7280768903631879, + "grad_norm": 0.75, + "learning_rate": 0.0001241750553149716, + "loss": 0.7733, + "step": 28355 + }, + { + "epoch": 0.7281025675591097, + "grad_norm": 0.81640625, + "learning_rate": 0.00012417072350442396, + "loss": 0.7028, + "step": 28356 + }, + { + "epoch": 0.7281282447550316, + "grad_norm": 0.73828125, + "learning_rate": 0.00012416639164570623, + "loss": 0.8718, + "step": 28357 + }, + { + "epoch": 0.7281539219509533, + "grad_norm": 0.75, + "learning_rate": 0.00012416205973882707, + "loss": 0.7889, + "step": 28358 + }, + { + "epoch": 0.7281795991468751, + "grad_norm": 0.75, + "learning_rate": 0.00012415772778379507, + "loss": 0.8298, + "step": 28359 + }, + { + "epoch": 0.728205276342797, + "grad_norm": 0.79296875, + "learning_rate": 0.00012415339578061897, + "loss": 0.7242, + "step": 28360 + }, + { + "epoch": 0.7282309535387188, + "grad_norm": 0.80078125, + "learning_rate": 0.0001241490637293073, + "loss": 0.8758, + "step": 28361 + }, + { + "epoch": 0.7282566307346406, + "grad_norm": 0.71484375, + "learning_rate": 0.00012414473162986872, + "loss": 0.7849, + "step": 28362 + }, + { + "epoch": 0.7282823079305625, + "grad_norm": 0.73828125, + "learning_rate": 0.00012414039948231188, + "loss": 0.7565, + "step": 28363 + }, + { + "epoch": 0.7283079851264843, + "grad_norm": 0.81640625, + "learning_rate": 0.00012413606728664542, + "loss": 0.7806, + "step": 28364 + }, + { + "epoch": 0.728333662322406, + "grad_norm": 0.84375, + "learning_rate": 0.00012413173504287794, + "loss": 0.9482, + "step": 28365 + }, + { + "epoch": 0.7283593395183279, + "grad_norm": 0.7421875, + "learning_rate": 0.00012412740275101808, + "loss": 0.8259, + "step": 28366 + }, + { + "epoch": 0.7283850167142497, + "grad_norm": 0.89453125, + "learning_rate": 0.0001241230704110745, + "loss": 0.8324, + "step": 28367 + }, + { + "epoch": 0.7284106939101715, + "grad_norm": 0.78125, + "learning_rate": 0.00012411873802305583, + "loss": 0.6892, + "step": 28368 + }, + { + "epoch": 0.7284363711060934, + "grad_norm": 0.80078125, + "learning_rate": 0.0001241144055869707, + "loss": 0.8631, + "step": 28369 + }, + { + "epoch": 0.7284620483020152, + "grad_norm": 0.66796875, + "learning_rate": 0.00012411007310282772, + "loss": 0.7222, + "step": 28370 + }, + { + "epoch": 0.7284877254979369, + "grad_norm": 0.84375, + "learning_rate": 0.0001241057405706355, + "loss": 0.7974, + "step": 28371 + }, + { + "epoch": 0.7285134026938588, + "grad_norm": 0.78125, + "learning_rate": 0.00012410140799040277, + "loss": 0.8855, + "step": 28372 + }, + { + "epoch": 0.7285390798897806, + "grad_norm": 0.79296875, + "learning_rate": 0.00012409707536213812, + "loss": 0.8127, + "step": 28373 + }, + { + "epoch": 0.7285647570857025, + "grad_norm": 0.8125, + "learning_rate": 0.00012409274268585017, + "loss": 0.8058, + "step": 28374 + }, + { + "epoch": 0.7285904342816243, + "grad_norm": 0.796875, + "learning_rate": 0.00012408840996154755, + "loss": 0.7845, + "step": 28375 + }, + { + "epoch": 0.7286161114775461, + "grad_norm": 0.859375, + "learning_rate": 0.00012408407718923886, + "loss": 0.7875, + "step": 28376 + }, + { + "epoch": 0.728641788673468, + "grad_norm": 0.84375, + "learning_rate": 0.00012407974436893284, + "loss": 0.9444, + "step": 28377 + }, + { + "epoch": 0.7286674658693897, + "grad_norm": 0.77734375, + "learning_rate": 0.00012407541150063806, + "loss": 0.7765, + "step": 28378 + }, + { + "epoch": 0.7286931430653115, + "grad_norm": 0.83984375, + "learning_rate": 0.00012407107858436312, + "loss": 0.8063, + "step": 28379 + }, + { + "epoch": 0.7287188202612334, + "grad_norm": 0.8203125, + "learning_rate": 0.00012406674562011674, + "loss": 0.6881, + "step": 28380 + }, + { + "epoch": 0.7287444974571552, + "grad_norm": 0.765625, + "learning_rate": 0.00012406241260790748, + "loss": 0.7864, + "step": 28381 + }, + { + "epoch": 0.728770174653077, + "grad_norm": 0.80078125, + "learning_rate": 0.000124058079547744, + "loss": 0.852, + "step": 28382 + }, + { + "epoch": 0.7287958518489989, + "grad_norm": 0.72265625, + "learning_rate": 0.00012405374643963498, + "loss": 0.7822, + "step": 28383 + }, + { + "epoch": 0.7288215290449207, + "grad_norm": 0.77734375, + "learning_rate": 0.00012404941328358898, + "loss": 0.9152, + "step": 28384 + }, + { + "epoch": 0.7288472062408424, + "grad_norm": 0.65234375, + "learning_rate": 0.0001240450800796147, + "loss": 0.7494, + "step": 28385 + }, + { + "epoch": 0.7288728834367643, + "grad_norm": 0.75, + "learning_rate": 0.00012404074682772074, + "loss": 0.7598, + "step": 28386 + }, + { + "epoch": 0.7288985606326861, + "grad_norm": 0.7890625, + "learning_rate": 0.00012403641352791572, + "loss": 0.9656, + "step": 28387 + }, + { + "epoch": 0.7289242378286079, + "grad_norm": 0.72265625, + "learning_rate": 0.0001240320801802083, + "loss": 0.8113, + "step": 28388 + }, + { + "epoch": 0.7289499150245298, + "grad_norm": 0.8203125, + "learning_rate": 0.00012402774678460713, + "loss": 0.9991, + "step": 28389 + }, + { + "epoch": 0.7289755922204516, + "grad_norm": 0.7890625, + "learning_rate": 0.00012402341334112084, + "loss": 0.884, + "step": 28390 + }, + { + "epoch": 0.7290012694163733, + "grad_norm": 0.71875, + "learning_rate": 0.00012401907984975805, + "loss": 0.8484, + "step": 28391 + }, + { + "epoch": 0.7290269466122952, + "grad_norm": 0.76171875, + "learning_rate": 0.00012401474631052738, + "loss": 0.8523, + "step": 28392 + }, + { + "epoch": 0.729052623808217, + "grad_norm": 0.78515625, + "learning_rate": 0.00012401041272343754, + "loss": 0.7695, + "step": 28393 + }, + { + "epoch": 0.7290783010041388, + "grad_norm": 0.8359375, + "learning_rate": 0.00012400607908849708, + "loss": 0.8507, + "step": 28394 + }, + { + "epoch": 0.7291039782000607, + "grad_norm": 0.80859375, + "learning_rate": 0.00012400174540571469, + "loss": 0.8486, + "step": 28395 + }, + { + "epoch": 0.7291296553959825, + "grad_norm": 0.7265625, + "learning_rate": 0.00012399741167509897, + "loss": 0.8622, + "step": 28396 + }, + { + "epoch": 0.7291553325919043, + "grad_norm": 0.81640625, + "learning_rate": 0.00012399307789665858, + "loss": 0.8783, + "step": 28397 + }, + { + "epoch": 0.7291810097878261, + "grad_norm": 0.765625, + "learning_rate": 0.00012398874407040214, + "loss": 0.8379, + "step": 28398 + }, + { + "epoch": 0.7292066869837479, + "grad_norm": 0.7890625, + "learning_rate": 0.00012398441019633832, + "loss": 0.8341, + "step": 28399 + }, + { + "epoch": 0.7292323641796697, + "grad_norm": 0.7265625, + "learning_rate": 0.00012398007627447574, + "loss": 0.6617, + "step": 28400 + }, + { + "epoch": 0.7292580413755916, + "grad_norm": 0.78515625, + "learning_rate": 0.00012397574230482304, + "loss": 0.739, + "step": 28401 + }, + { + "epoch": 0.7292837185715134, + "grad_norm": 0.75390625, + "learning_rate": 0.00012397140828738885, + "loss": 0.9061, + "step": 28402 + }, + { + "epoch": 0.7293093957674353, + "grad_norm": 0.75390625, + "learning_rate": 0.00012396707422218176, + "loss": 0.818, + "step": 28403 + }, + { + "epoch": 0.7293350729633571, + "grad_norm": 0.83203125, + "learning_rate": 0.00012396274010921052, + "loss": 1.0368, + "step": 28404 + }, + { + "epoch": 0.7293607501592788, + "grad_norm": 0.77734375, + "learning_rate": 0.0001239584059484837, + "loss": 0.936, + "step": 28405 + }, + { + "epoch": 0.7293864273552007, + "grad_norm": 0.79296875, + "learning_rate": 0.00012395407174000991, + "loss": 0.8972, + "step": 28406 + }, + { + "epoch": 0.7294121045511225, + "grad_norm": 0.84375, + "learning_rate": 0.00012394973748379784, + "loss": 0.8207, + "step": 28407 + }, + { + "epoch": 0.7294377817470443, + "grad_norm": 0.78125, + "learning_rate": 0.00012394540317985604, + "loss": 0.7664, + "step": 28408 + }, + { + "epoch": 0.7294634589429662, + "grad_norm": 0.734375, + "learning_rate": 0.0001239410688281933, + "loss": 0.7299, + "step": 28409 + }, + { + "epoch": 0.729489136138888, + "grad_norm": 0.6796875, + "learning_rate": 0.00012393673442881814, + "loss": 0.7253, + "step": 28410 + }, + { + "epoch": 0.7295148133348097, + "grad_norm": 0.8359375, + "learning_rate": 0.0001239323999817392, + "loss": 0.8685, + "step": 28411 + }, + { + "epoch": 0.7295404905307316, + "grad_norm": 0.73046875, + "learning_rate": 0.0001239280654869652, + "loss": 0.8531, + "step": 28412 + }, + { + "epoch": 0.7295661677266534, + "grad_norm": 0.8125, + "learning_rate": 0.00012392373094450472, + "loss": 0.8698, + "step": 28413 + }, + { + "epoch": 0.7295918449225752, + "grad_norm": 0.765625, + "learning_rate": 0.0001239193963543664, + "loss": 0.8699, + "step": 28414 + }, + { + "epoch": 0.7296175221184971, + "grad_norm": 0.7734375, + "learning_rate": 0.0001239150617165589, + "loss": 0.8394, + "step": 28415 + }, + { + "epoch": 0.7296431993144189, + "grad_norm": 0.82421875, + "learning_rate": 0.0001239107270310908, + "loss": 0.8524, + "step": 28416 + }, + { + "epoch": 0.7296688765103407, + "grad_norm": 0.77734375, + "learning_rate": 0.0001239063922979708, + "loss": 0.8111, + "step": 28417 + }, + { + "epoch": 0.7296945537062625, + "grad_norm": 0.74609375, + "learning_rate": 0.0001239020575172075, + "loss": 0.7569, + "step": 28418 + }, + { + "epoch": 0.7297202309021843, + "grad_norm": 0.77734375, + "learning_rate": 0.0001238977226888096, + "loss": 0.7463, + "step": 28419 + }, + { + "epoch": 0.7297459080981061, + "grad_norm": 0.76953125, + "learning_rate": 0.00012389338781278568, + "loss": 0.7413, + "step": 28420 + }, + { + "epoch": 0.729771585294028, + "grad_norm": 0.859375, + "learning_rate": 0.00012388905288914436, + "loss": 0.9043, + "step": 28421 + }, + { + "epoch": 0.7297972624899498, + "grad_norm": 0.8515625, + "learning_rate": 0.00012388471791789434, + "loss": 0.8986, + "step": 28422 + }, + { + "epoch": 0.7298229396858716, + "grad_norm": 0.8046875, + "learning_rate": 0.00012388038289904427, + "loss": 0.9179, + "step": 28423 + }, + { + "epoch": 0.7298486168817935, + "grad_norm": 0.75390625, + "learning_rate": 0.00012387604783260268, + "loss": 0.752, + "step": 28424 + }, + { + "epoch": 0.7298742940777152, + "grad_norm": 0.80078125, + "learning_rate": 0.00012387171271857836, + "loss": 0.7543, + "step": 28425 + }, + { + "epoch": 0.729899971273637, + "grad_norm": 0.84765625, + "learning_rate": 0.00012386737755697985, + "loss": 0.8401, + "step": 28426 + }, + { + "epoch": 0.7299256484695589, + "grad_norm": 0.7890625, + "learning_rate": 0.0001238630423478158, + "loss": 0.9404, + "step": 28427 + }, + { + "epoch": 0.7299513256654807, + "grad_norm": 0.7890625, + "learning_rate": 0.0001238587070910949, + "loss": 0.871, + "step": 28428 + }, + { + "epoch": 0.7299770028614025, + "grad_norm": 0.8046875, + "learning_rate": 0.00012385437178682567, + "loss": 0.6871, + "step": 28429 + }, + { + "epoch": 0.7300026800573244, + "grad_norm": 0.859375, + "learning_rate": 0.0001238500364350169, + "loss": 0.8295, + "step": 28430 + }, + { + "epoch": 0.7300283572532461, + "grad_norm": 0.75390625, + "learning_rate": 0.00012384570103567713, + "loss": 0.8367, + "step": 28431 + }, + { + "epoch": 0.7300540344491679, + "grad_norm": 0.84375, + "learning_rate": 0.00012384136558881505, + "loss": 0.9891, + "step": 28432 + }, + { + "epoch": 0.7300797116450898, + "grad_norm": 0.77734375, + "learning_rate": 0.00012383703009443927, + "loss": 0.8062, + "step": 28433 + }, + { + "epoch": 0.7301053888410116, + "grad_norm": 0.8125, + "learning_rate": 0.00012383269455255844, + "loss": 0.987, + "step": 28434 + }, + { + "epoch": 0.7301310660369335, + "grad_norm": 0.78125, + "learning_rate": 0.00012382835896318124, + "loss": 0.8537, + "step": 28435 + }, + { + "epoch": 0.7301567432328553, + "grad_norm": 0.796875, + "learning_rate": 0.00012382402332631625, + "loss": 0.7896, + "step": 28436 + }, + { + "epoch": 0.7301824204287771, + "grad_norm": 0.7890625, + "learning_rate": 0.0001238196876419721, + "loss": 0.9747, + "step": 28437 + }, + { + "epoch": 0.7302080976246988, + "grad_norm": 1.0234375, + "learning_rate": 0.0001238153519101575, + "loss": 0.8199, + "step": 28438 + }, + { + "epoch": 0.7302337748206207, + "grad_norm": 0.78515625, + "learning_rate": 0.00012381101613088107, + "loss": 0.7653, + "step": 28439 + }, + { + "epoch": 0.7302594520165425, + "grad_norm": 0.7734375, + "learning_rate": 0.00012380668030415138, + "loss": 0.7676, + "step": 28440 + }, + { + "epoch": 0.7302851292124644, + "grad_norm": 0.7109375, + "learning_rate": 0.0001238023444299772, + "loss": 0.7253, + "step": 28441 + }, + { + "epoch": 0.7303108064083862, + "grad_norm": 0.77734375, + "learning_rate": 0.00012379800850836702, + "loss": 0.7433, + "step": 28442 + }, + { + "epoch": 0.730336483604308, + "grad_norm": 0.76171875, + "learning_rate": 0.0001237936725393296, + "loss": 0.9457, + "step": 28443 + }, + { + "epoch": 0.7303621608002299, + "grad_norm": 0.74609375, + "learning_rate": 0.00012378933652287352, + "loss": 0.9042, + "step": 28444 + }, + { + "epoch": 0.7303878379961516, + "grad_norm": 0.81640625, + "learning_rate": 0.00012378500045900746, + "loss": 0.8266, + "step": 28445 + }, + { + "epoch": 0.7304135151920734, + "grad_norm": 0.7109375, + "learning_rate": 0.00012378066434774005, + "loss": 0.8123, + "step": 28446 + }, + { + "epoch": 0.7304391923879953, + "grad_norm": 0.765625, + "learning_rate": 0.00012377632818907992, + "loss": 0.8343, + "step": 28447 + }, + { + "epoch": 0.7304648695839171, + "grad_norm": 0.80078125, + "learning_rate": 0.00012377199198303568, + "loss": 0.9525, + "step": 28448 + }, + { + "epoch": 0.7304905467798389, + "grad_norm": 0.7890625, + "learning_rate": 0.00012376765572961603, + "loss": 0.756, + "step": 28449 + }, + { + "epoch": 0.7305162239757608, + "grad_norm": 0.7578125, + "learning_rate": 0.00012376331942882962, + "loss": 0.8546, + "step": 28450 + }, + { + "epoch": 0.7305419011716825, + "grad_norm": 0.78125, + "learning_rate": 0.00012375898308068502, + "loss": 0.6763, + "step": 28451 + }, + { + "epoch": 0.7305675783676043, + "grad_norm": 0.765625, + "learning_rate": 0.00012375464668519095, + "loss": 0.8218, + "step": 28452 + }, + { + "epoch": 0.7305932555635262, + "grad_norm": 0.78125, + "learning_rate": 0.00012375031024235597, + "loss": 0.8279, + "step": 28453 + }, + { + "epoch": 0.730618932759448, + "grad_norm": 0.81640625, + "learning_rate": 0.0001237459737521888, + "loss": 0.7873, + "step": 28454 + }, + { + "epoch": 0.7306446099553698, + "grad_norm": 0.81640625, + "learning_rate": 0.00012374163721469805, + "loss": 0.9447, + "step": 28455 + }, + { + "epoch": 0.7306702871512917, + "grad_norm": 0.765625, + "learning_rate": 0.00012373730062989233, + "loss": 0.8102, + "step": 28456 + }, + { + "epoch": 0.7306959643472135, + "grad_norm": 0.87109375, + "learning_rate": 0.00012373296399778035, + "loss": 0.884, + "step": 28457 + }, + { + "epoch": 0.7307216415431352, + "grad_norm": 0.9453125, + "learning_rate": 0.0001237286273183707, + "loss": 0.7545, + "step": 28458 + }, + { + "epoch": 0.7307473187390571, + "grad_norm": 0.78515625, + "learning_rate": 0.00012372429059167206, + "loss": 0.7647, + "step": 28459 + }, + { + "epoch": 0.7307729959349789, + "grad_norm": 0.921875, + "learning_rate": 0.000123719953817693, + "loss": 0.7913, + "step": 28460 + }, + { + "epoch": 0.7307986731309007, + "grad_norm": 0.72265625, + "learning_rate": 0.00012371561699644226, + "loss": 0.7834, + "step": 28461 + }, + { + "epoch": 0.7308243503268226, + "grad_norm": 0.81640625, + "learning_rate": 0.0001237112801279284, + "loss": 0.8531, + "step": 28462 + }, + { + "epoch": 0.7308500275227444, + "grad_norm": 0.83984375, + "learning_rate": 0.00012370694321216013, + "loss": 0.8001, + "step": 28463 + }, + { + "epoch": 0.7308757047186663, + "grad_norm": 0.78515625, + "learning_rate": 0.00012370260624914607, + "loss": 0.8492, + "step": 28464 + }, + { + "epoch": 0.730901381914588, + "grad_norm": 0.76953125, + "learning_rate": 0.00012369826923889487, + "loss": 0.7945, + "step": 28465 + }, + { + "epoch": 0.7309270591105098, + "grad_norm": 0.7734375, + "learning_rate": 0.00012369393218141512, + "loss": 0.9625, + "step": 28466 + }, + { + "epoch": 0.7309527363064316, + "grad_norm": 0.78125, + "learning_rate": 0.0001236895950767155, + "loss": 0.8312, + "step": 28467 + }, + { + "epoch": 0.7309784135023535, + "grad_norm": 0.77734375, + "learning_rate": 0.0001236852579248047, + "loss": 0.7804, + "step": 28468 + }, + { + "epoch": 0.7310040906982753, + "grad_norm": 0.84765625, + "learning_rate": 0.00012368092072569128, + "loss": 0.8481, + "step": 28469 + }, + { + "epoch": 0.7310297678941972, + "grad_norm": 0.73046875, + "learning_rate": 0.00012367658347938392, + "loss": 0.8203, + "step": 28470 + }, + { + "epoch": 0.7310554450901189, + "grad_norm": 0.80859375, + "learning_rate": 0.00012367224618589133, + "loss": 0.9948, + "step": 28471 + }, + { + "epoch": 0.7310811222860407, + "grad_norm": 0.75390625, + "learning_rate": 0.00012366790884522203, + "loss": 0.8492, + "step": 28472 + }, + { + "epoch": 0.7311067994819626, + "grad_norm": 0.74609375, + "learning_rate": 0.00012366357145738476, + "loss": 0.7136, + "step": 28473 + }, + { + "epoch": 0.7311324766778844, + "grad_norm": 0.69921875, + "learning_rate": 0.00012365923402238811, + "loss": 0.7693, + "step": 28474 + }, + { + "epoch": 0.7311581538738062, + "grad_norm": 0.7421875, + "learning_rate": 0.00012365489654024073, + "loss": 0.7814, + "step": 28475 + }, + { + "epoch": 0.7311838310697281, + "grad_norm": 0.76171875, + "learning_rate": 0.0001236505590109513, + "loss": 0.7765, + "step": 28476 + }, + { + "epoch": 0.7312095082656499, + "grad_norm": 0.7578125, + "learning_rate": 0.00012364622143452844, + "loss": 0.8425, + "step": 28477 + }, + { + "epoch": 0.7312351854615716, + "grad_norm": 0.984375, + "learning_rate": 0.00012364188381098083, + "loss": 0.8416, + "step": 28478 + }, + { + "epoch": 0.7312608626574935, + "grad_norm": 0.7421875, + "learning_rate": 0.00012363754614031703, + "loss": 0.7385, + "step": 28479 + }, + { + "epoch": 0.7312865398534153, + "grad_norm": 0.859375, + "learning_rate": 0.00012363320842254574, + "loss": 0.959, + "step": 28480 + }, + { + "epoch": 0.7313122170493371, + "grad_norm": 0.7421875, + "learning_rate": 0.00012362887065767562, + "loss": 0.8415, + "step": 28481 + }, + { + "epoch": 0.731337894245259, + "grad_norm": 0.8203125, + "learning_rate": 0.0001236245328457153, + "loss": 0.8039, + "step": 28482 + }, + { + "epoch": 0.7313635714411808, + "grad_norm": 0.71875, + "learning_rate": 0.00012362019498667339, + "loss": 0.7228, + "step": 28483 + }, + { + "epoch": 0.7313892486371026, + "grad_norm": 0.734375, + "learning_rate": 0.00012361585708055859, + "loss": 0.7261, + "step": 28484 + }, + { + "epoch": 0.7314149258330244, + "grad_norm": 0.76953125, + "learning_rate": 0.00012361151912737952, + "loss": 0.8922, + "step": 28485 + }, + { + "epoch": 0.7314406030289462, + "grad_norm": 0.80859375, + "learning_rate": 0.00012360718112714483, + "loss": 0.9269, + "step": 28486 + }, + { + "epoch": 0.731466280224868, + "grad_norm": 0.75, + "learning_rate": 0.00012360284307986314, + "loss": 0.7491, + "step": 28487 + }, + { + "epoch": 0.7314919574207899, + "grad_norm": 0.8203125, + "learning_rate": 0.00012359850498554312, + "loss": 0.8542, + "step": 28488 + }, + { + "epoch": 0.7315176346167117, + "grad_norm": 0.7734375, + "learning_rate": 0.0001235941668441934, + "loss": 0.879, + "step": 28489 + }, + { + "epoch": 0.7315433118126335, + "grad_norm": 0.7734375, + "learning_rate": 0.00012358982865582268, + "loss": 0.9328, + "step": 28490 + }, + { + "epoch": 0.7315689890085553, + "grad_norm": 0.75, + "learning_rate": 0.0001235854904204395, + "loss": 0.7637, + "step": 28491 + }, + { + "epoch": 0.7315946662044771, + "grad_norm": 0.8203125, + "learning_rate": 0.00012358115213805264, + "loss": 0.8517, + "step": 28492 + }, + { + "epoch": 0.7316203434003989, + "grad_norm": 0.91796875, + "learning_rate": 0.00012357681380867062, + "loss": 0.7679, + "step": 28493 + }, + { + "epoch": 0.7316460205963208, + "grad_norm": 0.8203125, + "learning_rate": 0.00012357247543230216, + "loss": 0.8119, + "step": 28494 + }, + { + "epoch": 0.7316716977922426, + "grad_norm": 0.76171875, + "learning_rate": 0.0001235681370089559, + "loss": 0.7247, + "step": 28495 + }, + { + "epoch": 0.7316973749881645, + "grad_norm": 1.015625, + "learning_rate": 0.0001235637985386404, + "loss": 0.8265, + "step": 28496 + }, + { + "epoch": 0.7317230521840863, + "grad_norm": 0.90625, + "learning_rate": 0.00012355946002136445, + "loss": 0.9099, + "step": 28497 + }, + { + "epoch": 0.731748729380008, + "grad_norm": 0.796875, + "learning_rate": 0.0001235551214571366, + "loss": 0.7643, + "step": 28498 + }, + { + "epoch": 0.7317744065759298, + "grad_norm": 0.84765625, + "learning_rate": 0.00012355078284596556, + "loss": 0.9523, + "step": 28499 + }, + { + "epoch": 0.7318000837718517, + "grad_norm": 0.7578125, + "learning_rate": 0.0001235464441878599, + "loss": 0.7859, + "step": 28500 + }, + { + "epoch": 0.7318257609677735, + "grad_norm": 0.80859375, + "learning_rate": 0.0001235421054828283, + "loss": 0.8776, + "step": 28501 + }, + { + "epoch": 0.7318514381636954, + "grad_norm": 0.76953125, + "learning_rate": 0.00012353776673087942, + "loss": 0.7367, + "step": 28502 + }, + { + "epoch": 0.7318771153596172, + "grad_norm": 0.875, + "learning_rate": 0.0001235334279320219, + "loss": 0.8008, + "step": 28503 + }, + { + "epoch": 0.7319027925555389, + "grad_norm": 0.83203125, + "learning_rate": 0.00012352908908626434, + "loss": 0.8111, + "step": 28504 + }, + { + "epoch": 0.7319284697514608, + "grad_norm": 0.84765625, + "learning_rate": 0.0001235247501936155, + "loss": 0.8057, + "step": 28505 + }, + { + "epoch": 0.7319541469473826, + "grad_norm": 0.7890625, + "learning_rate": 0.00012352041125408394, + "loss": 0.7882, + "step": 28506 + }, + { + "epoch": 0.7319798241433044, + "grad_norm": 0.80078125, + "learning_rate": 0.0001235160722676783, + "loss": 1.0049, + "step": 28507 + }, + { + "epoch": 0.7320055013392263, + "grad_norm": 0.95703125, + "learning_rate": 0.00012351173323440728, + "loss": 0.821, + "step": 28508 + }, + { + "epoch": 0.7320311785351481, + "grad_norm": 0.84765625, + "learning_rate": 0.00012350739415427944, + "loss": 0.9259, + "step": 28509 + }, + { + "epoch": 0.7320568557310699, + "grad_norm": 0.7265625, + "learning_rate": 0.00012350305502730356, + "loss": 0.7239, + "step": 28510 + }, + { + "epoch": 0.7320825329269917, + "grad_norm": 0.80859375, + "learning_rate": 0.00012349871585348818, + "loss": 0.9068, + "step": 28511 + }, + { + "epoch": 0.7321082101229135, + "grad_norm": 0.79296875, + "learning_rate": 0.000123494376632842, + "loss": 0.7294, + "step": 28512 + }, + { + "epoch": 0.7321338873188353, + "grad_norm": 0.75390625, + "learning_rate": 0.00012349003736537362, + "loss": 0.8333, + "step": 28513 + }, + { + "epoch": 0.7321595645147572, + "grad_norm": 0.78125, + "learning_rate": 0.00012348569805109173, + "loss": 0.9304, + "step": 28514 + }, + { + "epoch": 0.732185241710679, + "grad_norm": 0.796875, + "learning_rate": 0.000123481358690005, + "loss": 0.87, + "step": 28515 + }, + { + "epoch": 0.7322109189066008, + "grad_norm": 0.7265625, + "learning_rate": 0.000123477019282122, + "loss": 0.7854, + "step": 28516 + }, + { + "epoch": 0.7322365961025227, + "grad_norm": 0.83984375, + "learning_rate": 0.0001234726798274514, + "loss": 0.894, + "step": 28517 + }, + { + "epoch": 0.7322622732984444, + "grad_norm": 0.7109375, + "learning_rate": 0.00012346834032600195, + "loss": 0.9063, + "step": 28518 + }, + { + "epoch": 0.7322879504943662, + "grad_norm": 0.734375, + "learning_rate": 0.00012346400077778219, + "loss": 0.6577, + "step": 28519 + }, + { + "epoch": 0.7323136276902881, + "grad_norm": 0.75390625, + "learning_rate": 0.00012345966118280076, + "loss": 0.7862, + "step": 28520 + }, + { + "epoch": 0.7323393048862099, + "grad_norm": 0.765625, + "learning_rate": 0.00012345532154106636, + "loss": 0.7889, + "step": 28521 + }, + { + "epoch": 0.7323649820821317, + "grad_norm": 0.71875, + "learning_rate": 0.00012345098185258762, + "loss": 0.7877, + "step": 28522 + }, + { + "epoch": 0.7323906592780536, + "grad_norm": 0.71875, + "learning_rate": 0.00012344664211737317, + "loss": 0.7939, + "step": 28523 + }, + { + "epoch": 0.7324163364739753, + "grad_norm": 0.921875, + "learning_rate": 0.00012344230233543173, + "loss": 0.9082, + "step": 28524 + }, + { + "epoch": 0.7324420136698971, + "grad_norm": 0.84765625, + "learning_rate": 0.00012343796250677185, + "loss": 0.8081, + "step": 28525 + }, + { + "epoch": 0.732467690865819, + "grad_norm": 0.7890625, + "learning_rate": 0.00012343362263140227, + "loss": 0.7833, + "step": 28526 + }, + { + "epoch": 0.7324933680617408, + "grad_norm": 0.8671875, + "learning_rate": 0.00012342928270933157, + "loss": 0.776, + "step": 28527 + }, + { + "epoch": 0.7325190452576626, + "grad_norm": 0.7734375, + "learning_rate": 0.00012342494274056843, + "loss": 0.7934, + "step": 28528 + }, + { + "epoch": 0.7325447224535845, + "grad_norm": 0.796875, + "learning_rate": 0.00012342060272512153, + "loss": 0.8061, + "step": 28529 + }, + { + "epoch": 0.7325703996495063, + "grad_norm": 0.82421875, + "learning_rate": 0.00012341626266299945, + "loss": 0.9275, + "step": 28530 + }, + { + "epoch": 0.732596076845428, + "grad_norm": 0.7734375, + "learning_rate": 0.00012341192255421087, + "loss": 0.9095, + "step": 28531 + }, + { + "epoch": 0.7326217540413499, + "grad_norm": 0.765625, + "learning_rate": 0.00012340758239876446, + "loss": 0.9038, + "step": 28532 + }, + { + "epoch": 0.7326474312372717, + "grad_norm": 0.79296875, + "learning_rate": 0.00012340324219666885, + "loss": 0.7918, + "step": 28533 + }, + { + "epoch": 0.7326731084331936, + "grad_norm": 0.75, + "learning_rate": 0.0001233989019479327, + "loss": 0.8538, + "step": 28534 + }, + { + "epoch": 0.7326987856291154, + "grad_norm": 0.75, + "learning_rate": 0.00012339456165256463, + "loss": 0.7182, + "step": 28535 + }, + { + "epoch": 0.7327244628250372, + "grad_norm": 0.796875, + "learning_rate": 0.0001233902213105733, + "loss": 0.7603, + "step": 28536 + }, + { + "epoch": 0.7327501400209591, + "grad_norm": 0.79296875, + "learning_rate": 0.0001233858809219674, + "loss": 0.8438, + "step": 28537 + }, + { + "epoch": 0.7327758172168808, + "grad_norm": 0.7578125, + "learning_rate": 0.00012338154048675555, + "loss": 0.7693, + "step": 28538 + }, + { + "epoch": 0.7328014944128026, + "grad_norm": 0.71875, + "learning_rate": 0.0001233772000049464, + "loss": 0.7628, + "step": 28539 + }, + { + "epoch": 0.7328271716087245, + "grad_norm": 0.77734375, + "learning_rate": 0.0001233728594765486, + "loss": 0.9498, + "step": 28540 + }, + { + "epoch": 0.7328528488046463, + "grad_norm": 0.76171875, + "learning_rate": 0.00012336851890157078, + "loss": 0.8498, + "step": 28541 + }, + { + "epoch": 0.7328785260005681, + "grad_norm": 0.77734375, + "learning_rate": 0.00012336417828002165, + "loss": 0.7815, + "step": 28542 + }, + { + "epoch": 0.73290420319649, + "grad_norm": 0.8203125, + "learning_rate": 0.0001233598376119098, + "loss": 0.8858, + "step": 28543 + }, + { + "epoch": 0.7329298803924117, + "grad_norm": 0.77734375, + "learning_rate": 0.0001233554968972439, + "loss": 0.779, + "step": 28544 + }, + { + "epoch": 0.7329555575883335, + "grad_norm": 0.75, + "learning_rate": 0.00012335115613603263, + "loss": 0.9383, + "step": 28545 + }, + { + "epoch": 0.7329812347842554, + "grad_norm": 0.78125, + "learning_rate": 0.00012334681532828458, + "loss": 0.816, + "step": 28546 + }, + { + "epoch": 0.7330069119801772, + "grad_norm": 0.75390625, + "learning_rate": 0.00012334247447400848, + "loss": 0.7874, + "step": 28547 + }, + { + "epoch": 0.733032589176099, + "grad_norm": 0.72265625, + "learning_rate": 0.00012333813357321292, + "loss": 0.8522, + "step": 28548 + }, + { + "epoch": 0.7330582663720209, + "grad_norm": 0.7265625, + "learning_rate": 0.00012333379262590653, + "loss": 0.7346, + "step": 28549 + }, + { + "epoch": 0.7330839435679427, + "grad_norm": 0.79296875, + "learning_rate": 0.00012332945163209802, + "loss": 0.8804, + "step": 28550 + }, + { + "epoch": 0.7331096207638644, + "grad_norm": 0.75, + "learning_rate": 0.00012332511059179606, + "loss": 0.7892, + "step": 28551 + }, + { + "epoch": 0.7331352979597863, + "grad_norm": 0.77734375, + "learning_rate": 0.00012332076950500922, + "loss": 0.7664, + "step": 28552 + }, + { + "epoch": 0.7331609751557081, + "grad_norm": 0.796875, + "learning_rate": 0.00012331642837174622, + "loss": 0.7524, + "step": 28553 + }, + { + "epoch": 0.7331866523516299, + "grad_norm": 0.76171875, + "learning_rate": 0.00012331208719201567, + "loss": 0.7421, + "step": 28554 + }, + { + "epoch": 0.7332123295475518, + "grad_norm": 0.765625, + "learning_rate": 0.0001233077459658262, + "loss": 0.8682, + "step": 28555 + }, + { + "epoch": 0.7332380067434736, + "grad_norm": 0.79296875, + "learning_rate": 0.00012330340469318655, + "loss": 0.9411, + "step": 28556 + }, + { + "epoch": 0.7332636839393954, + "grad_norm": 0.8046875, + "learning_rate": 0.0001232990633741053, + "loss": 0.772, + "step": 28557 + }, + { + "epoch": 0.7332893611353172, + "grad_norm": 0.73828125, + "learning_rate": 0.00012329472200859113, + "loss": 0.7596, + "step": 28558 + }, + { + "epoch": 0.733315038331239, + "grad_norm": 0.70703125, + "learning_rate": 0.00012329038059665268, + "loss": 0.6362, + "step": 28559 + }, + { + "epoch": 0.7333407155271608, + "grad_norm": 0.74609375, + "learning_rate": 0.0001232860391382986, + "loss": 0.8234, + "step": 28560 + }, + { + "epoch": 0.7333663927230827, + "grad_norm": 0.78515625, + "learning_rate": 0.00012328169763353754, + "loss": 0.8547, + "step": 28561 + }, + { + "epoch": 0.7333920699190045, + "grad_norm": 0.75390625, + "learning_rate": 0.00012327735608237816, + "loss": 0.8415, + "step": 28562 + }, + { + "epoch": 0.7334177471149264, + "grad_norm": 0.82421875, + "learning_rate": 0.00012327301448482913, + "loss": 0.8706, + "step": 28563 + }, + { + "epoch": 0.7334434243108481, + "grad_norm": 0.765625, + "learning_rate": 0.0001232686728408991, + "loss": 0.7195, + "step": 28564 + }, + { + "epoch": 0.7334691015067699, + "grad_norm": 0.7890625, + "learning_rate": 0.00012326433115059665, + "loss": 0.762, + "step": 28565 + }, + { + "epoch": 0.7334947787026918, + "grad_norm": 0.75390625, + "learning_rate": 0.00012325998941393053, + "loss": 1.0202, + "step": 28566 + }, + { + "epoch": 0.7335204558986136, + "grad_norm": 0.87109375, + "learning_rate": 0.00012325564763090936, + "loss": 0.9126, + "step": 28567 + }, + { + "epoch": 0.7335461330945354, + "grad_norm": 0.80078125, + "learning_rate": 0.00012325130580154173, + "loss": 0.8428, + "step": 28568 + }, + { + "epoch": 0.7335718102904573, + "grad_norm": 0.78125, + "learning_rate": 0.0001232469639258364, + "loss": 0.7775, + "step": 28569 + }, + { + "epoch": 0.7335974874863791, + "grad_norm": 0.7890625, + "learning_rate": 0.00012324262200380198, + "loss": 0.6664, + "step": 28570 + }, + { + "epoch": 0.7336231646823008, + "grad_norm": 0.8359375, + "learning_rate": 0.00012323828003544708, + "loss": 0.9924, + "step": 28571 + }, + { + "epoch": 0.7336488418782227, + "grad_norm": 0.77734375, + "learning_rate": 0.0001232339380207804, + "loss": 0.8399, + "step": 28572 + }, + { + "epoch": 0.7336745190741445, + "grad_norm": 0.77734375, + "learning_rate": 0.00012322959595981056, + "loss": 0.874, + "step": 28573 + }, + { + "epoch": 0.7337001962700663, + "grad_norm": 0.83984375, + "learning_rate": 0.00012322525385254627, + "loss": 0.7455, + "step": 28574 + }, + { + "epoch": 0.7337258734659882, + "grad_norm": 0.734375, + "learning_rate": 0.0001232209116989961, + "loss": 0.7452, + "step": 28575 + }, + { + "epoch": 0.73375155066191, + "grad_norm": 0.81640625, + "learning_rate": 0.0001232165694991688, + "loss": 0.8813, + "step": 28576 + }, + { + "epoch": 0.7337772278578318, + "grad_norm": 0.78125, + "learning_rate": 0.00012321222725307298, + "loss": 0.7973, + "step": 28577 + }, + { + "epoch": 0.7338029050537536, + "grad_norm": 0.9296875, + "learning_rate": 0.00012320788496071722, + "loss": 0.9181, + "step": 28578 + }, + { + "epoch": 0.7338285822496754, + "grad_norm": 0.765625, + "learning_rate": 0.0001232035426221103, + "loss": 0.6999, + "step": 28579 + }, + { + "epoch": 0.7338542594455972, + "grad_norm": 0.8046875, + "learning_rate": 0.00012319920023726082, + "loss": 0.8613, + "step": 28580 + }, + { + "epoch": 0.7338799366415191, + "grad_norm": 0.80859375, + "learning_rate": 0.00012319485780617735, + "loss": 0.7483, + "step": 28581 + }, + { + "epoch": 0.7339056138374409, + "grad_norm": 0.765625, + "learning_rate": 0.0001231905153288687, + "loss": 0.7973, + "step": 28582 + }, + { + "epoch": 0.7339312910333627, + "grad_norm": 0.73828125, + "learning_rate": 0.00012318617280534345, + "loss": 0.7677, + "step": 28583 + }, + { + "epoch": 0.7339569682292845, + "grad_norm": 0.7734375, + "learning_rate": 0.00012318183023561023, + "loss": 0.8471, + "step": 28584 + }, + { + "epoch": 0.7339826454252063, + "grad_norm": 0.703125, + "learning_rate": 0.00012317748761967774, + "loss": 0.8733, + "step": 28585 + }, + { + "epoch": 0.7340083226211281, + "grad_norm": 0.84765625, + "learning_rate": 0.00012317314495755455, + "loss": 0.8597, + "step": 28586 + }, + { + "epoch": 0.73403399981705, + "grad_norm": 0.74609375, + "learning_rate": 0.00012316880224924942, + "loss": 0.7913, + "step": 28587 + }, + { + "epoch": 0.7340596770129718, + "grad_norm": 0.80078125, + "learning_rate": 0.00012316445949477097, + "loss": 0.8784, + "step": 28588 + }, + { + "epoch": 0.7340853542088936, + "grad_norm": 0.80078125, + "learning_rate": 0.0001231601166941278, + "loss": 0.9384, + "step": 28589 + }, + { + "epoch": 0.7341110314048155, + "grad_norm": 0.78515625, + "learning_rate": 0.00012315577384732865, + "loss": 0.8394, + "step": 28590 + }, + { + "epoch": 0.7341367086007372, + "grad_norm": 0.80078125, + "learning_rate": 0.00012315143095438213, + "loss": 0.7851, + "step": 28591 + }, + { + "epoch": 0.734162385796659, + "grad_norm": 0.69921875, + "learning_rate": 0.0001231470880152969, + "loss": 0.8798, + "step": 28592 + }, + { + "epoch": 0.7341880629925809, + "grad_norm": 0.828125, + "learning_rate": 0.0001231427450300816, + "loss": 0.8658, + "step": 28593 + }, + { + "epoch": 0.7342137401885027, + "grad_norm": 0.734375, + "learning_rate": 0.00012313840199874488, + "loss": 0.742, + "step": 28594 + }, + { + "epoch": 0.7342394173844246, + "grad_norm": 0.77734375, + "learning_rate": 0.00012313405892129545, + "loss": 0.7428, + "step": 28595 + }, + { + "epoch": 0.7342650945803464, + "grad_norm": 0.78515625, + "learning_rate": 0.00012312971579774194, + "loss": 0.8189, + "step": 28596 + }, + { + "epoch": 0.7342907717762682, + "grad_norm": 0.81640625, + "learning_rate": 0.00012312537262809298, + "loss": 0.8323, + "step": 28597 + }, + { + "epoch": 0.73431644897219, + "grad_norm": 0.7578125, + "learning_rate": 0.00012312102941235722, + "loss": 0.8311, + "step": 28598 + }, + { + "epoch": 0.7343421261681118, + "grad_norm": 0.7421875, + "learning_rate": 0.00012311668615054336, + "loss": 0.8343, + "step": 28599 + }, + { + "epoch": 0.7343678033640336, + "grad_norm": 0.85546875, + "learning_rate": 0.00012311234284266003, + "loss": 0.8837, + "step": 28600 + }, + { + "epoch": 0.7343934805599555, + "grad_norm": 0.76953125, + "learning_rate": 0.00012310799948871587, + "loss": 0.7279, + "step": 28601 + }, + { + "epoch": 0.7344191577558773, + "grad_norm": 0.7421875, + "learning_rate": 0.00012310365608871957, + "loss": 0.629, + "step": 28602 + }, + { + "epoch": 0.7344448349517991, + "grad_norm": 0.765625, + "learning_rate": 0.0001230993126426798, + "loss": 0.81, + "step": 28603 + }, + { + "epoch": 0.7344705121477209, + "grad_norm": 0.82421875, + "learning_rate": 0.00012309496915060514, + "loss": 0.8832, + "step": 28604 + }, + { + "epoch": 0.7344961893436427, + "grad_norm": 0.77734375, + "learning_rate": 0.0001230906256125043, + "loss": 0.7754, + "step": 28605 + }, + { + "epoch": 0.7345218665395645, + "grad_norm": 0.9140625, + "learning_rate": 0.00012308628202838595, + "loss": 0.8616, + "step": 28606 + }, + { + "epoch": 0.7345475437354864, + "grad_norm": 0.88671875, + "learning_rate": 0.0001230819383982587, + "loss": 0.899, + "step": 28607 + }, + { + "epoch": 0.7345732209314082, + "grad_norm": 0.6953125, + "learning_rate": 0.00012307759472213124, + "loss": 0.7731, + "step": 28608 + }, + { + "epoch": 0.73459889812733, + "grad_norm": 0.921875, + "learning_rate": 0.00012307325100001224, + "loss": 0.8377, + "step": 28609 + }, + { + "epoch": 0.7346245753232519, + "grad_norm": 0.7890625, + "learning_rate": 0.0001230689072319103, + "loss": 0.8515, + "step": 28610 + }, + { + "epoch": 0.7346502525191736, + "grad_norm": 0.7578125, + "learning_rate": 0.00012306456341783416, + "loss": 0.8862, + "step": 28611 + }, + { + "epoch": 0.7346759297150954, + "grad_norm": 0.74609375, + "learning_rate": 0.0001230602195577924, + "loss": 0.8165, + "step": 28612 + }, + { + "epoch": 0.7347016069110173, + "grad_norm": 0.75390625, + "learning_rate": 0.00012305587565179368, + "loss": 0.8524, + "step": 28613 + }, + { + "epoch": 0.7347272841069391, + "grad_norm": 0.81640625, + "learning_rate": 0.00012305153169984672, + "loss": 0.935, + "step": 28614 + }, + { + "epoch": 0.7347529613028609, + "grad_norm": 0.73046875, + "learning_rate": 0.00012304718770196014, + "loss": 0.8556, + "step": 28615 + }, + { + "epoch": 0.7347786384987828, + "grad_norm": 0.80078125, + "learning_rate": 0.00012304284365814255, + "loss": 0.876, + "step": 28616 + }, + { + "epoch": 0.7348043156947046, + "grad_norm": 0.73828125, + "learning_rate": 0.00012303849956840271, + "loss": 0.8092, + "step": 28617 + }, + { + "epoch": 0.7348299928906263, + "grad_norm": 0.8046875, + "learning_rate": 0.0001230341554327492, + "loss": 0.8881, + "step": 28618 + }, + { + "epoch": 0.7348556700865482, + "grad_norm": 0.80078125, + "learning_rate": 0.00012302981125119069, + "loss": 0.7499, + "step": 28619 + }, + { + "epoch": 0.73488134728247, + "grad_norm": 0.8125, + "learning_rate": 0.00012302546702373584, + "loss": 0.7853, + "step": 28620 + }, + { + "epoch": 0.7349070244783918, + "grad_norm": 0.796875, + "learning_rate": 0.00012302112275039333, + "loss": 0.9886, + "step": 28621 + }, + { + "epoch": 0.7349327016743137, + "grad_norm": 0.76171875, + "learning_rate": 0.0001230167784311718, + "loss": 0.8245, + "step": 28622 + }, + { + "epoch": 0.7349583788702355, + "grad_norm": 0.69140625, + "learning_rate": 0.0001230124340660799, + "loss": 0.853, + "step": 28623 + }, + { + "epoch": 0.7349840560661572, + "grad_norm": 0.75390625, + "learning_rate": 0.00012300808965512634, + "loss": 0.7133, + "step": 28624 + }, + { + "epoch": 0.7350097332620791, + "grad_norm": 0.75, + "learning_rate": 0.0001230037451983197, + "loss": 0.8906, + "step": 28625 + }, + { + "epoch": 0.7350354104580009, + "grad_norm": 0.6953125, + "learning_rate": 0.00012299940069566867, + "loss": 0.6775, + "step": 28626 + }, + { + "epoch": 0.7350610876539228, + "grad_norm": 0.73046875, + "learning_rate": 0.0001229950561471819, + "loss": 0.8446, + "step": 28627 + }, + { + "epoch": 0.7350867648498446, + "grad_norm": 0.9140625, + "learning_rate": 0.0001229907115528681, + "loss": 0.8912, + "step": 28628 + }, + { + "epoch": 0.7351124420457664, + "grad_norm": 0.765625, + "learning_rate": 0.00012298636691273585, + "loss": 0.8331, + "step": 28629 + }, + { + "epoch": 0.7351381192416883, + "grad_norm": 0.765625, + "learning_rate": 0.0001229820222267939, + "loss": 0.8824, + "step": 28630 + }, + { + "epoch": 0.73516379643761, + "grad_norm": 0.765625, + "learning_rate": 0.0001229776774950508, + "loss": 0.8227, + "step": 28631 + }, + { + "epoch": 0.7351894736335318, + "grad_norm": 0.7578125, + "learning_rate": 0.00012297333271751529, + "loss": 0.7821, + "step": 28632 + }, + { + "epoch": 0.7352151508294537, + "grad_norm": 0.84375, + "learning_rate": 0.000122968987894196, + "loss": 0.889, + "step": 28633 + }, + { + "epoch": 0.7352408280253755, + "grad_norm": 0.703125, + "learning_rate": 0.00012296464302510155, + "loss": 0.8284, + "step": 28634 + }, + { + "epoch": 0.7352665052212973, + "grad_norm": 0.8203125, + "learning_rate": 0.0001229602981102407, + "loss": 0.805, + "step": 28635 + }, + { + "epoch": 0.7352921824172192, + "grad_norm": 0.71484375, + "learning_rate": 0.00012295595314962203, + "loss": 0.7779, + "step": 28636 + }, + { + "epoch": 0.735317859613141, + "grad_norm": 0.86328125, + "learning_rate": 0.00012295160814325423, + "loss": 0.8211, + "step": 28637 + }, + { + "epoch": 0.7353435368090627, + "grad_norm": 0.73046875, + "learning_rate": 0.00012294726309114594, + "loss": 0.8558, + "step": 28638 + }, + { + "epoch": 0.7353692140049846, + "grad_norm": 0.82421875, + "learning_rate": 0.00012294291799330582, + "loss": 0.8403, + "step": 28639 + }, + { + "epoch": 0.7353948912009064, + "grad_norm": 0.68359375, + "learning_rate": 0.00012293857284974253, + "loss": 0.6835, + "step": 28640 + }, + { + "epoch": 0.7354205683968282, + "grad_norm": 0.8046875, + "learning_rate": 0.00012293422766046475, + "loss": 0.885, + "step": 28641 + }, + { + "epoch": 0.7354462455927501, + "grad_norm": 0.78515625, + "learning_rate": 0.0001229298824254811, + "loss": 0.8212, + "step": 28642 + }, + { + "epoch": 0.7354719227886719, + "grad_norm": 0.72265625, + "learning_rate": 0.00012292553714480034, + "loss": 0.9199, + "step": 28643 + }, + { + "epoch": 0.7354975999845936, + "grad_norm": 0.7421875, + "learning_rate": 0.000122921191818431, + "loss": 0.9445, + "step": 28644 + }, + { + "epoch": 0.7355232771805155, + "grad_norm": 0.8203125, + "learning_rate": 0.00012291684644638178, + "loss": 0.7915, + "step": 28645 + }, + { + "epoch": 0.7355489543764373, + "grad_norm": 0.7109375, + "learning_rate": 0.00012291250102866138, + "loss": 0.726, + "step": 28646 + }, + { + "epoch": 0.7355746315723591, + "grad_norm": 0.82421875, + "learning_rate": 0.00012290815556527843, + "loss": 1.0016, + "step": 28647 + }, + { + "epoch": 0.735600308768281, + "grad_norm": 0.890625, + "learning_rate": 0.0001229038100562416, + "loss": 0.8886, + "step": 28648 + }, + { + "epoch": 0.7356259859642028, + "grad_norm": 0.78515625, + "learning_rate": 0.00012289946450155955, + "loss": 0.7883, + "step": 28649 + }, + { + "epoch": 0.7356516631601246, + "grad_norm": 0.8046875, + "learning_rate": 0.0001228951189012409, + "loss": 0.8418, + "step": 28650 + }, + { + "epoch": 0.7356773403560464, + "grad_norm": 0.83984375, + "learning_rate": 0.0001228907732552944, + "loss": 0.9194, + "step": 28651 + }, + { + "epoch": 0.7357030175519682, + "grad_norm": 0.80078125, + "learning_rate": 0.00012288642756372862, + "loss": 0.8334, + "step": 28652 + }, + { + "epoch": 0.73572869474789, + "grad_norm": 0.8046875, + "learning_rate": 0.00012288208182655225, + "loss": 0.8124, + "step": 28653 + }, + { + "epoch": 0.7357543719438119, + "grad_norm": 0.76953125, + "learning_rate": 0.00012287773604377397, + "loss": 0.8188, + "step": 28654 + }, + { + "epoch": 0.7357800491397337, + "grad_norm": 0.74609375, + "learning_rate": 0.00012287339021540246, + "loss": 0.8521, + "step": 28655 + }, + { + "epoch": 0.7358057263356556, + "grad_norm": 0.72265625, + "learning_rate": 0.00012286904434144632, + "loss": 0.8719, + "step": 28656 + }, + { + "epoch": 0.7358314035315774, + "grad_norm": 0.7421875, + "learning_rate": 0.00012286469842191425, + "loss": 0.9548, + "step": 28657 + }, + { + "epoch": 0.7358570807274991, + "grad_norm": 0.84375, + "learning_rate": 0.00012286035245681487, + "loss": 0.9268, + "step": 28658 + }, + { + "epoch": 0.735882757923421, + "grad_norm": 0.8125, + "learning_rate": 0.0001228560064461569, + "loss": 0.7973, + "step": 28659 + }, + { + "epoch": 0.7359084351193428, + "grad_norm": 0.76171875, + "learning_rate": 0.00012285166038994896, + "loss": 0.779, + "step": 28660 + }, + { + "epoch": 0.7359341123152646, + "grad_norm": 0.76953125, + "learning_rate": 0.00012284731428819972, + "loss": 0.8067, + "step": 28661 + }, + { + "epoch": 0.7359597895111865, + "grad_norm": 0.703125, + "learning_rate": 0.0001228429681409179, + "loss": 0.7257, + "step": 28662 + }, + { + "epoch": 0.7359854667071083, + "grad_norm": 0.8046875, + "learning_rate": 0.00012283862194811208, + "loss": 0.8866, + "step": 28663 + }, + { + "epoch": 0.73601114390303, + "grad_norm": 0.73046875, + "learning_rate": 0.0001228342757097909, + "loss": 0.8119, + "step": 28664 + }, + { + "epoch": 0.7360368210989519, + "grad_norm": 0.765625, + "learning_rate": 0.00012282992942596314, + "loss": 0.7728, + "step": 28665 + }, + { + "epoch": 0.7360624982948737, + "grad_norm": 0.76171875, + "learning_rate": 0.00012282558309663734, + "loss": 0.709, + "step": 28666 + }, + { + "epoch": 0.7360881754907955, + "grad_norm": 0.76171875, + "learning_rate": 0.00012282123672182225, + "loss": 0.8424, + "step": 28667 + }, + { + "epoch": 0.7361138526867174, + "grad_norm": 0.7109375, + "learning_rate": 0.00012281689030152647, + "loss": 0.8919, + "step": 28668 + }, + { + "epoch": 0.7361395298826392, + "grad_norm": 0.8359375, + "learning_rate": 0.00012281254383575874, + "loss": 0.8582, + "step": 28669 + }, + { + "epoch": 0.736165207078561, + "grad_norm": 0.78515625, + "learning_rate": 0.00012280819732452762, + "loss": 0.8167, + "step": 28670 + }, + { + "epoch": 0.7361908842744828, + "grad_norm": 0.75390625, + "learning_rate": 0.00012280385076784183, + "loss": 0.803, + "step": 28671 + }, + { + "epoch": 0.7362165614704046, + "grad_norm": 0.73046875, + "learning_rate": 0.00012279950416571, + "loss": 0.7945, + "step": 28672 + }, + { + "epoch": 0.7362422386663264, + "grad_norm": 0.8203125, + "learning_rate": 0.00012279515751814087, + "loss": 0.909, + "step": 28673 + }, + { + "epoch": 0.7362679158622483, + "grad_norm": 0.78125, + "learning_rate": 0.00012279081082514304, + "loss": 0.6981, + "step": 28674 + }, + { + "epoch": 0.7362935930581701, + "grad_norm": 0.73046875, + "learning_rate": 0.00012278646408672516, + "loss": 0.7686, + "step": 28675 + }, + { + "epoch": 0.7363192702540919, + "grad_norm": 0.79296875, + "learning_rate": 0.00012278211730289593, + "loss": 0.8907, + "step": 28676 + }, + { + "epoch": 0.7363449474500138, + "grad_norm": 0.7578125, + "learning_rate": 0.000122777770473664, + "loss": 0.7875, + "step": 28677 + }, + { + "epoch": 0.7363706246459355, + "grad_norm": 0.7578125, + "learning_rate": 0.00012277342359903805, + "loss": 0.8017, + "step": 28678 + }, + { + "epoch": 0.7363963018418573, + "grad_norm": 0.734375, + "learning_rate": 0.00012276907667902668, + "loss": 0.8161, + "step": 28679 + }, + { + "epoch": 0.7364219790377792, + "grad_norm": 0.78515625, + "learning_rate": 0.00012276472971363861, + "loss": 0.8499, + "step": 28680 + }, + { + "epoch": 0.736447656233701, + "grad_norm": 0.78125, + "learning_rate": 0.0001227603827028825, + "loss": 0.8481, + "step": 28681 + }, + { + "epoch": 0.7364733334296228, + "grad_norm": 0.72265625, + "learning_rate": 0.00012275603564676702, + "loss": 0.8739, + "step": 28682 + }, + { + "epoch": 0.7364990106255447, + "grad_norm": 0.71875, + "learning_rate": 0.0001227516885453008, + "loss": 0.8566, + "step": 28683 + }, + { + "epoch": 0.7365246878214664, + "grad_norm": 0.70703125, + "learning_rate": 0.0001227473413984925, + "loss": 0.6273, + "step": 28684 + }, + { + "epoch": 0.7365503650173882, + "grad_norm": 0.734375, + "learning_rate": 0.00012274299420635083, + "loss": 0.7224, + "step": 28685 + }, + { + "epoch": 0.7365760422133101, + "grad_norm": 0.76953125, + "learning_rate": 0.00012273864696888442, + "loss": 0.9003, + "step": 28686 + }, + { + "epoch": 0.7366017194092319, + "grad_norm": 0.7734375, + "learning_rate": 0.00012273429968610194, + "loss": 0.7783, + "step": 28687 + }, + { + "epoch": 0.7366273966051538, + "grad_norm": 0.81640625, + "learning_rate": 0.00012272995235801203, + "loss": 0.846, + "step": 28688 + }, + { + "epoch": 0.7366530738010756, + "grad_norm": 0.74609375, + "learning_rate": 0.00012272560498462345, + "loss": 0.8052, + "step": 28689 + }, + { + "epoch": 0.7366787509969974, + "grad_norm": 0.76171875, + "learning_rate": 0.0001227212575659447, + "loss": 0.7892, + "step": 28690 + }, + { + "epoch": 0.7367044281929191, + "grad_norm": 0.8046875, + "learning_rate": 0.0001227169101019846, + "loss": 0.7345, + "step": 28691 + }, + { + "epoch": 0.736730105388841, + "grad_norm": 0.77734375, + "learning_rate": 0.00012271256259275174, + "loss": 0.8335, + "step": 28692 + }, + { + "epoch": 0.7367557825847628, + "grad_norm": 0.80078125, + "learning_rate": 0.00012270821503825477, + "loss": 1.0742, + "step": 28693 + }, + { + "epoch": 0.7367814597806847, + "grad_norm": 0.84765625, + "learning_rate": 0.00012270386743850237, + "loss": 0.8119, + "step": 28694 + }, + { + "epoch": 0.7368071369766065, + "grad_norm": 0.79296875, + "learning_rate": 0.00012269951979350326, + "loss": 0.8761, + "step": 28695 + }, + { + "epoch": 0.7368328141725283, + "grad_norm": 0.77734375, + "learning_rate": 0.00012269517210326602, + "loss": 0.8426, + "step": 28696 + }, + { + "epoch": 0.7368584913684502, + "grad_norm": 0.8203125, + "learning_rate": 0.00012269082436779937, + "loss": 0.8702, + "step": 28697 + }, + { + "epoch": 0.7368841685643719, + "grad_norm": 0.79296875, + "learning_rate": 0.00012268647658711195, + "loss": 0.7217, + "step": 28698 + }, + { + "epoch": 0.7369098457602937, + "grad_norm": 0.76171875, + "learning_rate": 0.00012268212876121243, + "loss": 0.9575, + "step": 28699 + }, + { + "epoch": 0.7369355229562156, + "grad_norm": 0.75, + "learning_rate": 0.00012267778089010948, + "loss": 0.7862, + "step": 28700 + }, + { + "epoch": 0.7369612001521374, + "grad_norm": 0.8046875, + "learning_rate": 0.00012267343297381176, + "loss": 0.8532, + "step": 28701 + }, + { + "epoch": 0.7369868773480592, + "grad_norm": 0.7421875, + "learning_rate": 0.00012266908501232793, + "loss": 0.8453, + "step": 28702 + }, + { + "epoch": 0.7370125545439811, + "grad_norm": 0.75390625, + "learning_rate": 0.00012266473700566668, + "loss": 0.7468, + "step": 28703 + }, + { + "epoch": 0.7370382317399028, + "grad_norm": 0.73828125, + "learning_rate": 0.00012266038895383664, + "loss": 0.848, + "step": 28704 + }, + { + "epoch": 0.7370639089358246, + "grad_norm": 0.75390625, + "learning_rate": 0.0001226560408568465, + "loss": 0.8349, + "step": 28705 + }, + { + "epoch": 0.7370895861317465, + "grad_norm": 0.75, + "learning_rate": 0.0001226516927147049, + "loss": 0.7726, + "step": 28706 + }, + { + "epoch": 0.7371152633276683, + "grad_norm": 0.86328125, + "learning_rate": 0.00012264734452742054, + "loss": 0.7621, + "step": 28707 + }, + { + "epoch": 0.7371409405235901, + "grad_norm": 0.85546875, + "learning_rate": 0.00012264299629500207, + "loss": 0.8226, + "step": 28708 + }, + { + "epoch": 0.737166617719512, + "grad_norm": 0.734375, + "learning_rate": 0.00012263864801745813, + "loss": 0.8664, + "step": 28709 + }, + { + "epoch": 0.7371922949154338, + "grad_norm": 0.84375, + "learning_rate": 0.00012263429969479745, + "loss": 0.8619, + "step": 28710 + }, + { + "epoch": 0.7372179721113555, + "grad_norm": 0.79296875, + "learning_rate": 0.00012262995132702862, + "loss": 0.8417, + "step": 28711 + }, + { + "epoch": 0.7372436493072774, + "grad_norm": 0.73828125, + "learning_rate": 0.00012262560291416037, + "loss": 0.6849, + "step": 28712 + }, + { + "epoch": 0.7372693265031992, + "grad_norm": 0.78125, + "learning_rate": 0.0001226212544562013, + "loss": 0.8563, + "step": 28713 + }, + { + "epoch": 0.737295003699121, + "grad_norm": 0.70703125, + "learning_rate": 0.00012261690595316013, + "loss": 0.8822, + "step": 28714 + }, + { + "epoch": 0.7373206808950429, + "grad_norm": 0.71875, + "learning_rate": 0.00012261255740504554, + "loss": 0.7, + "step": 28715 + }, + { + "epoch": 0.7373463580909647, + "grad_norm": 0.84765625, + "learning_rate": 0.00012260820881186612, + "loss": 0.9837, + "step": 28716 + }, + { + "epoch": 0.7373720352868864, + "grad_norm": 0.77734375, + "learning_rate": 0.00012260386017363058, + "loss": 0.8377, + "step": 28717 + }, + { + "epoch": 0.7373977124828083, + "grad_norm": 0.80078125, + "learning_rate": 0.00012259951149034762, + "loss": 0.8742, + "step": 28718 + }, + { + "epoch": 0.7374233896787301, + "grad_norm": 0.80859375, + "learning_rate": 0.00012259516276202586, + "loss": 0.7644, + "step": 28719 + }, + { + "epoch": 0.737449066874652, + "grad_norm": 0.81640625, + "learning_rate": 0.000122590813988674, + "loss": 0.7443, + "step": 28720 + }, + { + "epoch": 0.7374747440705738, + "grad_norm": 0.78515625, + "learning_rate": 0.00012258646517030068, + "loss": 0.7887, + "step": 28721 + }, + { + "epoch": 0.7375004212664956, + "grad_norm": 0.77734375, + "learning_rate": 0.00012258211630691455, + "loss": 0.9203, + "step": 28722 + }, + { + "epoch": 0.7375260984624175, + "grad_norm": 0.73046875, + "learning_rate": 0.00012257776739852433, + "loss": 0.817, + "step": 28723 + }, + { + "epoch": 0.7375517756583392, + "grad_norm": 0.7890625, + "learning_rate": 0.00012257341844513863, + "loss": 0.7913, + "step": 28724 + }, + { + "epoch": 0.737577452854261, + "grad_norm": 0.76953125, + "learning_rate": 0.00012256906944676618, + "loss": 0.858, + "step": 28725 + }, + { + "epoch": 0.7376031300501829, + "grad_norm": 0.94140625, + "learning_rate": 0.00012256472040341563, + "loss": 0.8486, + "step": 28726 + }, + { + "epoch": 0.7376288072461047, + "grad_norm": 0.84765625, + "learning_rate": 0.00012256037131509558, + "loss": 0.8906, + "step": 28727 + }, + { + "epoch": 0.7376544844420265, + "grad_norm": 0.77734375, + "learning_rate": 0.00012255602218181477, + "loss": 0.7984, + "step": 28728 + }, + { + "epoch": 0.7376801616379484, + "grad_norm": 0.83203125, + "learning_rate": 0.00012255167300358186, + "loss": 0.7542, + "step": 28729 + }, + { + "epoch": 0.7377058388338702, + "grad_norm": 0.859375, + "learning_rate": 0.00012254732378040549, + "loss": 0.9218, + "step": 28730 + }, + { + "epoch": 0.7377315160297919, + "grad_norm": 0.7578125, + "learning_rate": 0.0001225429745122943, + "loss": 0.6525, + "step": 28731 + }, + { + "epoch": 0.7377571932257138, + "grad_norm": 0.8125, + "learning_rate": 0.00012253862519925705, + "loss": 0.9186, + "step": 28732 + }, + { + "epoch": 0.7377828704216356, + "grad_norm": 0.78515625, + "learning_rate": 0.00012253427584130232, + "loss": 0.8494, + "step": 28733 + }, + { + "epoch": 0.7378085476175574, + "grad_norm": 0.73828125, + "learning_rate": 0.00012252992643843884, + "loss": 0.837, + "step": 28734 + }, + { + "epoch": 0.7378342248134793, + "grad_norm": 0.83984375, + "learning_rate": 0.00012252557699067527, + "loss": 0.7736, + "step": 28735 + }, + { + "epoch": 0.7378599020094011, + "grad_norm": 0.80078125, + "learning_rate": 0.0001225212274980202, + "loss": 0.8226, + "step": 28736 + }, + { + "epoch": 0.7378855792053228, + "grad_norm": 0.6796875, + "learning_rate": 0.00012251687796048242, + "loss": 0.6984, + "step": 28737 + }, + { + "epoch": 0.7379112564012447, + "grad_norm": 0.69140625, + "learning_rate": 0.0001225125283780705, + "loss": 0.7244, + "step": 28738 + }, + { + "epoch": 0.7379369335971665, + "grad_norm": 0.7421875, + "learning_rate": 0.00012250817875079318, + "loss": 0.8455, + "step": 28739 + }, + { + "epoch": 0.7379626107930883, + "grad_norm": 0.7421875, + "learning_rate": 0.00012250382907865904, + "loss": 0.668, + "step": 28740 + }, + { + "epoch": 0.7379882879890102, + "grad_norm": 0.77734375, + "learning_rate": 0.00012249947936167683, + "loss": 0.952, + "step": 28741 + }, + { + "epoch": 0.738013965184932, + "grad_norm": 0.7734375, + "learning_rate": 0.0001224951295998552, + "loss": 0.7549, + "step": 28742 + }, + { + "epoch": 0.7380396423808538, + "grad_norm": 0.78125, + "learning_rate": 0.00012249077979320278, + "loss": 0.7716, + "step": 28743 + }, + { + "epoch": 0.7380653195767756, + "grad_norm": 0.7734375, + "learning_rate": 0.0001224864299417283, + "loss": 0.7063, + "step": 28744 + }, + { + "epoch": 0.7380909967726974, + "grad_norm": 0.8203125, + "learning_rate": 0.0001224820800454404, + "loss": 0.9054, + "step": 28745 + }, + { + "epoch": 0.7381166739686192, + "grad_norm": 0.828125, + "learning_rate": 0.00012247773010434768, + "loss": 0.8301, + "step": 28746 + }, + { + "epoch": 0.7381423511645411, + "grad_norm": 0.7890625, + "learning_rate": 0.00012247338011845896, + "loss": 0.8025, + "step": 28747 + }, + { + "epoch": 0.7381680283604629, + "grad_norm": 0.74609375, + "learning_rate": 0.00012246903008778277, + "loss": 0.7626, + "step": 28748 + }, + { + "epoch": 0.7381937055563847, + "grad_norm": 0.7734375, + "learning_rate": 0.00012246468001232786, + "loss": 0.8813, + "step": 28749 + }, + { + "epoch": 0.7382193827523066, + "grad_norm": 0.78515625, + "learning_rate": 0.00012246032989210286, + "loss": 0.77, + "step": 28750 + }, + { + "epoch": 0.7382450599482283, + "grad_norm": 0.796875, + "learning_rate": 0.00012245597972711643, + "loss": 0.7953, + "step": 28751 + }, + { + "epoch": 0.7382707371441501, + "grad_norm": 0.76171875, + "learning_rate": 0.00012245162951737728, + "loss": 0.8236, + "step": 28752 + }, + { + "epoch": 0.738296414340072, + "grad_norm": 0.7578125, + "learning_rate": 0.0001224472792628941, + "loss": 0.8208, + "step": 28753 + }, + { + "epoch": 0.7383220915359938, + "grad_norm": 0.78515625, + "learning_rate": 0.00012244292896367544, + "loss": 0.7966, + "step": 28754 + }, + { + "epoch": 0.7383477687319157, + "grad_norm": 0.734375, + "learning_rate": 0.00012243857861973011, + "loss": 0.865, + "step": 28755 + }, + { + "epoch": 0.7383734459278375, + "grad_norm": 0.7734375, + "learning_rate": 0.00012243422823106668, + "loss": 0.854, + "step": 28756 + }, + { + "epoch": 0.7383991231237592, + "grad_norm": 0.76171875, + "learning_rate": 0.00012242987779769387, + "loss": 0.8954, + "step": 28757 + }, + { + "epoch": 0.738424800319681, + "grad_norm": 0.7421875, + "learning_rate": 0.00012242552731962038, + "loss": 0.8258, + "step": 28758 + }, + { + "epoch": 0.7384504775156029, + "grad_norm": 0.77734375, + "learning_rate": 0.0001224211767968548, + "loss": 0.7394, + "step": 28759 + }, + { + "epoch": 0.7384761547115247, + "grad_norm": 0.80078125, + "learning_rate": 0.00012241682622940586, + "loss": 0.7682, + "step": 28760 + }, + { + "epoch": 0.7385018319074466, + "grad_norm": 0.74609375, + "learning_rate": 0.0001224124756172822, + "loss": 0.6467, + "step": 28761 + }, + { + "epoch": 0.7385275091033684, + "grad_norm": 0.7109375, + "learning_rate": 0.00012240812496049249, + "loss": 0.7645, + "step": 28762 + }, + { + "epoch": 0.7385531862992902, + "grad_norm": 0.81640625, + "learning_rate": 0.00012240377425904543, + "loss": 0.8281, + "step": 28763 + }, + { + "epoch": 0.738578863495212, + "grad_norm": 0.796875, + "learning_rate": 0.00012239942351294965, + "loss": 0.8085, + "step": 28764 + }, + { + "epoch": 0.7386045406911338, + "grad_norm": 0.75, + "learning_rate": 0.00012239507272221385, + "loss": 0.8461, + "step": 28765 + }, + { + "epoch": 0.7386302178870556, + "grad_norm": 0.765625, + "learning_rate": 0.0001223907218868467, + "loss": 0.8951, + "step": 28766 + }, + { + "epoch": 0.7386558950829775, + "grad_norm": 0.76953125, + "learning_rate": 0.00012238637100685687, + "loss": 0.8135, + "step": 28767 + }, + { + "epoch": 0.7386815722788993, + "grad_norm": 0.76171875, + "learning_rate": 0.00012238202008225303, + "loss": 0.7616, + "step": 28768 + }, + { + "epoch": 0.7387072494748211, + "grad_norm": 0.765625, + "learning_rate": 0.00012237766911304382, + "loss": 0.8703, + "step": 28769 + }, + { + "epoch": 0.738732926670743, + "grad_norm": 0.82421875, + "learning_rate": 0.00012237331809923795, + "loss": 0.8327, + "step": 28770 + }, + { + "epoch": 0.7387586038666647, + "grad_norm": 0.73828125, + "learning_rate": 0.00012236896704084408, + "loss": 0.691, + "step": 28771 + }, + { + "epoch": 0.7387842810625865, + "grad_norm": 0.71484375, + "learning_rate": 0.00012236461593787085, + "loss": 0.7707, + "step": 28772 + }, + { + "epoch": 0.7388099582585084, + "grad_norm": 0.8046875, + "learning_rate": 0.000122360264790327, + "loss": 0.8298, + "step": 28773 + }, + { + "epoch": 0.7388356354544302, + "grad_norm": 0.77734375, + "learning_rate": 0.00012235591359822118, + "loss": 0.9096, + "step": 28774 + }, + { + "epoch": 0.738861312650352, + "grad_norm": 1.0078125, + "learning_rate": 0.000122351562361562, + "loss": 0.8051, + "step": 28775 + }, + { + "epoch": 0.7388869898462739, + "grad_norm": 0.76171875, + "learning_rate": 0.0001223472110803582, + "loss": 0.8421, + "step": 28776 + }, + { + "epoch": 0.7389126670421956, + "grad_norm": 0.78515625, + "learning_rate": 0.0001223428597546184, + "loss": 0.8118, + "step": 28777 + }, + { + "epoch": 0.7389383442381174, + "grad_norm": 0.77734375, + "learning_rate": 0.0001223385083843513, + "loss": 0.8607, + "step": 28778 + }, + { + "epoch": 0.7389640214340393, + "grad_norm": 0.8046875, + "learning_rate": 0.0001223341569695656, + "loss": 0.8671, + "step": 28779 + }, + { + "epoch": 0.7389896986299611, + "grad_norm": 0.7578125, + "learning_rate": 0.00012232980551026994, + "loss": 0.879, + "step": 28780 + }, + { + "epoch": 0.739015375825883, + "grad_norm": 0.765625, + "learning_rate": 0.00012232545400647297, + "loss": 0.7722, + "step": 28781 + }, + { + "epoch": 0.7390410530218048, + "grad_norm": 0.8203125, + "learning_rate": 0.0001223211024581834, + "loss": 0.7967, + "step": 28782 + }, + { + "epoch": 0.7390667302177266, + "grad_norm": 0.79296875, + "learning_rate": 0.0001223167508654099, + "loss": 0.7549, + "step": 28783 + }, + { + "epoch": 0.7390924074136483, + "grad_norm": 0.703125, + "learning_rate": 0.0001223123992281611, + "loss": 0.7814, + "step": 28784 + }, + { + "epoch": 0.7391180846095702, + "grad_norm": 0.78515625, + "learning_rate": 0.00012230804754644574, + "loss": 0.81, + "step": 28785 + }, + { + "epoch": 0.739143761805492, + "grad_norm": 0.7265625, + "learning_rate": 0.00012230369582027242, + "loss": 0.9171, + "step": 28786 + }, + { + "epoch": 0.7391694390014139, + "grad_norm": 0.703125, + "learning_rate": 0.00012229934404964986, + "loss": 0.7822, + "step": 28787 + }, + { + "epoch": 0.7391951161973357, + "grad_norm": 0.77734375, + "learning_rate": 0.00012229499223458674, + "loss": 0.6845, + "step": 28788 + }, + { + "epoch": 0.7392207933932575, + "grad_norm": 0.76171875, + "learning_rate": 0.0001222906403750917, + "loss": 0.756, + "step": 28789 + }, + { + "epoch": 0.7392464705891794, + "grad_norm": 0.93359375, + "learning_rate": 0.00012228628847117345, + "loss": 0.7746, + "step": 28790 + }, + { + "epoch": 0.7392721477851011, + "grad_norm": 0.76953125, + "learning_rate": 0.0001222819365228406, + "loss": 0.7845, + "step": 28791 + }, + { + "epoch": 0.7392978249810229, + "grad_norm": 0.7890625, + "learning_rate": 0.0001222775845301019, + "loss": 0.8828, + "step": 28792 + }, + { + "epoch": 0.7393235021769448, + "grad_norm": 0.74609375, + "learning_rate": 0.00012227323249296596, + "loss": 0.805, + "step": 28793 + }, + { + "epoch": 0.7393491793728666, + "grad_norm": 0.76953125, + "learning_rate": 0.0001222688804114415, + "loss": 0.9301, + "step": 28794 + }, + { + "epoch": 0.7393748565687884, + "grad_norm": 0.828125, + "learning_rate": 0.00012226452828553716, + "loss": 0.7108, + "step": 28795 + }, + { + "epoch": 0.7394005337647103, + "grad_norm": 0.7734375, + "learning_rate": 0.00012226017611526163, + "loss": 0.7634, + "step": 28796 + }, + { + "epoch": 0.739426210960632, + "grad_norm": 0.75, + "learning_rate": 0.00012225582390062358, + "loss": 0.8014, + "step": 28797 + }, + { + "epoch": 0.7394518881565538, + "grad_norm": 0.76171875, + "learning_rate": 0.00012225147164163168, + "loss": 0.8239, + "step": 28798 + }, + { + "epoch": 0.7394775653524757, + "grad_norm": 0.86328125, + "learning_rate": 0.00012224711933829461, + "loss": 0.8077, + "step": 28799 + }, + { + "epoch": 0.7395032425483975, + "grad_norm": 0.765625, + "learning_rate": 0.00012224276699062103, + "loss": 0.7615, + "step": 28800 + }, + { + "epoch": 0.7395289197443193, + "grad_norm": 0.76953125, + "learning_rate": 0.00012223841459861964, + "loss": 0.8484, + "step": 28801 + }, + { + "epoch": 0.7395545969402412, + "grad_norm": 0.76953125, + "learning_rate": 0.0001222340621622991, + "loss": 0.74, + "step": 28802 + }, + { + "epoch": 0.739580274136163, + "grad_norm": 0.9140625, + "learning_rate": 0.0001222297096816681, + "loss": 0.8486, + "step": 28803 + }, + { + "epoch": 0.7396059513320847, + "grad_norm": 0.77734375, + "learning_rate": 0.00012222535715673523, + "loss": 0.7482, + "step": 28804 + }, + { + "epoch": 0.7396316285280066, + "grad_norm": 0.7265625, + "learning_rate": 0.0001222210045875093, + "loss": 0.8597, + "step": 28805 + }, + { + "epoch": 0.7396573057239284, + "grad_norm": 0.8125, + "learning_rate": 0.00012221665197399887, + "loss": 0.8808, + "step": 28806 + }, + { + "epoch": 0.7396829829198502, + "grad_norm": 0.7421875, + "learning_rate": 0.0001222122993162127, + "loss": 0.8302, + "step": 28807 + }, + { + "epoch": 0.7397086601157721, + "grad_norm": 0.72265625, + "learning_rate": 0.0001222079466141594, + "loss": 0.859, + "step": 28808 + }, + { + "epoch": 0.7397343373116939, + "grad_norm": 0.7890625, + "learning_rate": 0.00012220359386784765, + "loss": 0.7504, + "step": 28809 + }, + { + "epoch": 0.7397600145076157, + "grad_norm": 0.80078125, + "learning_rate": 0.00012219924107728617, + "loss": 0.7597, + "step": 28810 + }, + { + "epoch": 0.7397856917035375, + "grad_norm": 0.79296875, + "learning_rate": 0.0001221948882424836, + "loss": 0.8605, + "step": 28811 + }, + { + "epoch": 0.7398113688994593, + "grad_norm": 0.8203125, + "learning_rate": 0.00012219053536344863, + "loss": 0.7982, + "step": 28812 + }, + { + "epoch": 0.7398370460953811, + "grad_norm": 0.84375, + "learning_rate": 0.00012218618244018992, + "loss": 0.9032, + "step": 28813 + }, + { + "epoch": 0.739862723291303, + "grad_norm": 0.8125, + "learning_rate": 0.00012218182947271617, + "loss": 0.8841, + "step": 28814 + }, + { + "epoch": 0.7398884004872248, + "grad_norm": 0.79296875, + "learning_rate": 0.00012217747646103602, + "loss": 0.8266, + "step": 28815 + }, + { + "epoch": 0.7399140776831467, + "grad_norm": 0.81640625, + "learning_rate": 0.00012217312340515816, + "loss": 0.9114, + "step": 28816 + }, + { + "epoch": 0.7399397548790684, + "grad_norm": 0.7890625, + "learning_rate": 0.00012216877030509128, + "loss": 0.8877, + "step": 28817 + }, + { + "epoch": 0.7399654320749902, + "grad_norm": 0.83203125, + "learning_rate": 0.00012216441716084403, + "loss": 0.6915, + "step": 28818 + }, + { + "epoch": 0.739991109270912, + "grad_norm": 0.7734375, + "learning_rate": 0.0001221600639724251, + "loss": 0.8115, + "step": 28819 + }, + { + "epoch": 0.7400167864668339, + "grad_norm": 0.91015625, + "learning_rate": 0.00012215571073984319, + "loss": 0.8417, + "step": 28820 + }, + { + "epoch": 0.7400424636627557, + "grad_norm": 1.1328125, + "learning_rate": 0.00012215135746310692, + "loss": 0.91, + "step": 28821 + }, + { + "epoch": 0.7400681408586776, + "grad_norm": 0.83203125, + "learning_rate": 0.000122147004142225, + "loss": 0.7721, + "step": 28822 + }, + { + "epoch": 0.7400938180545994, + "grad_norm": 0.8359375, + "learning_rate": 0.00012214265077720607, + "loss": 0.8709, + "step": 28823 + }, + { + "epoch": 0.7401194952505211, + "grad_norm": 0.82421875, + "learning_rate": 0.00012213829736805888, + "loss": 0.7371, + "step": 28824 + }, + { + "epoch": 0.740145172446443, + "grad_norm": 0.76171875, + "learning_rate": 0.00012213394391479208, + "loss": 0.8729, + "step": 28825 + }, + { + "epoch": 0.7401708496423648, + "grad_norm": 0.77734375, + "learning_rate": 0.00012212959041741427, + "loss": 0.8311, + "step": 28826 + }, + { + "epoch": 0.7401965268382866, + "grad_norm": 0.7578125, + "learning_rate": 0.00012212523687593423, + "loss": 0.7867, + "step": 28827 + }, + { + "epoch": 0.7402222040342085, + "grad_norm": 0.8671875, + "learning_rate": 0.0001221208832903606, + "loss": 0.7449, + "step": 28828 + }, + { + "epoch": 0.7402478812301303, + "grad_norm": 0.76953125, + "learning_rate": 0.000122116529660702, + "loss": 0.8068, + "step": 28829 + }, + { + "epoch": 0.7402735584260521, + "grad_norm": 0.77734375, + "learning_rate": 0.0001221121759869672, + "loss": 0.8209, + "step": 28830 + }, + { + "epoch": 0.7402992356219739, + "grad_norm": 0.74609375, + "learning_rate": 0.00012210782226916483, + "loss": 0.8849, + "step": 28831 + }, + { + "epoch": 0.7403249128178957, + "grad_norm": 0.76953125, + "learning_rate": 0.00012210346850730354, + "loss": 0.8721, + "step": 28832 + }, + { + "epoch": 0.7403505900138175, + "grad_norm": 0.85546875, + "learning_rate": 0.00012209911470139205, + "loss": 0.8282, + "step": 28833 + }, + { + "epoch": 0.7403762672097394, + "grad_norm": 0.79296875, + "learning_rate": 0.000122094760851439, + "loss": 0.8121, + "step": 28834 + }, + { + "epoch": 0.7404019444056612, + "grad_norm": 0.73828125, + "learning_rate": 0.00012209040695745313, + "loss": 0.7939, + "step": 28835 + }, + { + "epoch": 0.740427621601583, + "grad_norm": 0.75, + "learning_rate": 0.000122086053019443, + "loss": 0.6591, + "step": 28836 + }, + { + "epoch": 0.7404532987975048, + "grad_norm": 0.75390625, + "learning_rate": 0.00012208169903741743, + "loss": 0.7758, + "step": 28837 + }, + { + "epoch": 0.7404789759934266, + "grad_norm": 0.83984375, + "learning_rate": 0.00012207734501138502, + "loss": 0.9941, + "step": 28838 + }, + { + "epoch": 0.7405046531893484, + "grad_norm": 0.74609375, + "learning_rate": 0.00012207299094135442, + "loss": 0.7745, + "step": 28839 + }, + { + "epoch": 0.7405303303852703, + "grad_norm": 0.76953125, + "learning_rate": 0.00012206863682733441, + "loss": 0.7716, + "step": 28840 + }, + { + "epoch": 0.7405560075811921, + "grad_norm": 0.78125, + "learning_rate": 0.00012206428266933351, + "loss": 0.7951, + "step": 28841 + }, + { + "epoch": 0.740581684777114, + "grad_norm": 0.703125, + "learning_rate": 0.00012205992846736056, + "loss": 0.7342, + "step": 28842 + }, + { + "epoch": 0.7406073619730358, + "grad_norm": 0.75390625, + "learning_rate": 0.00012205557422142414, + "loss": 0.8724, + "step": 28843 + }, + { + "epoch": 0.7406330391689575, + "grad_norm": 0.79296875, + "learning_rate": 0.00012205121993153292, + "loss": 0.8098, + "step": 28844 + }, + { + "epoch": 0.7406587163648793, + "grad_norm": 0.796875, + "learning_rate": 0.00012204686559769564, + "loss": 0.7644, + "step": 28845 + }, + { + "epoch": 0.7406843935608012, + "grad_norm": 0.765625, + "learning_rate": 0.00012204251121992096, + "loss": 0.7534, + "step": 28846 + }, + { + "epoch": 0.740710070756723, + "grad_norm": 0.765625, + "learning_rate": 0.00012203815679821756, + "loss": 0.7451, + "step": 28847 + }, + { + "epoch": 0.7407357479526449, + "grad_norm": 0.7890625, + "learning_rate": 0.00012203380233259407, + "loss": 0.9046, + "step": 28848 + }, + { + "epoch": 0.7407614251485667, + "grad_norm": 0.7734375, + "learning_rate": 0.0001220294478230592, + "loss": 0.7567, + "step": 28849 + }, + { + "epoch": 0.7407871023444885, + "grad_norm": 0.82421875, + "learning_rate": 0.00012202509326962167, + "loss": 0.7301, + "step": 28850 + }, + { + "epoch": 0.7408127795404102, + "grad_norm": 0.85546875, + "learning_rate": 0.00012202073867229007, + "loss": 0.9079, + "step": 28851 + }, + { + "epoch": 0.7408384567363321, + "grad_norm": 0.81640625, + "learning_rate": 0.00012201638403107314, + "loss": 0.9045, + "step": 28852 + }, + { + "epoch": 0.7408641339322539, + "grad_norm": 0.78125, + "learning_rate": 0.00012201202934597957, + "loss": 0.7545, + "step": 28853 + }, + { + "epoch": 0.7408898111281758, + "grad_norm": 0.8125, + "learning_rate": 0.00012200767461701801, + "loss": 0.917, + "step": 28854 + }, + { + "epoch": 0.7409154883240976, + "grad_norm": 0.72265625, + "learning_rate": 0.00012200331984419712, + "loss": 0.8657, + "step": 28855 + }, + { + "epoch": 0.7409411655200194, + "grad_norm": 0.7578125, + "learning_rate": 0.00012199896502752562, + "loss": 0.7239, + "step": 28856 + }, + { + "epoch": 0.7409668427159412, + "grad_norm": 0.734375, + "learning_rate": 0.00012199461016701217, + "loss": 0.8282, + "step": 28857 + }, + { + "epoch": 0.740992519911863, + "grad_norm": 0.74609375, + "learning_rate": 0.0001219902552626654, + "loss": 0.7255, + "step": 28858 + }, + { + "epoch": 0.7410181971077848, + "grad_norm": 0.76171875, + "learning_rate": 0.0001219859003144941, + "loss": 0.749, + "step": 28859 + }, + { + "epoch": 0.7410438743037067, + "grad_norm": 0.8671875, + "learning_rate": 0.00012198154532250687, + "loss": 0.7901, + "step": 28860 + }, + { + "epoch": 0.7410695514996285, + "grad_norm": 0.7734375, + "learning_rate": 0.0001219771902867124, + "loss": 0.8642, + "step": 28861 + }, + { + "epoch": 0.7410952286955503, + "grad_norm": 0.83203125, + "learning_rate": 0.00012197283520711937, + "loss": 0.8806, + "step": 28862 + }, + { + "epoch": 0.7411209058914722, + "grad_norm": 0.85546875, + "learning_rate": 0.00012196848008373647, + "loss": 0.8335, + "step": 28863 + }, + { + "epoch": 0.7411465830873939, + "grad_norm": 0.83984375, + "learning_rate": 0.00012196412491657237, + "loss": 0.9122, + "step": 28864 + }, + { + "epoch": 0.7411722602833157, + "grad_norm": 0.76171875, + "learning_rate": 0.00012195976970563576, + "loss": 0.8454, + "step": 28865 + }, + { + "epoch": 0.7411979374792376, + "grad_norm": 0.75, + "learning_rate": 0.00012195541445093533, + "loss": 0.8456, + "step": 28866 + }, + { + "epoch": 0.7412236146751594, + "grad_norm": 0.859375, + "learning_rate": 0.0001219510591524797, + "loss": 0.8906, + "step": 28867 + }, + { + "epoch": 0.7412492918710812, + "grad_norm": 0.76171875, + "learning_rate": 0.00012194670381027762, + "loss": 0.7876, + "step": 28868 + }, + { + "epoch": 0.7412749690670031, + "grad_norm": 0.81640625, + "learning_rate": 0.00012194234842433774, + "loss": 0.8552, + "step": 28869 + }, + { + "epoch": 0.7413006462629249, + "grad_norm": 0.77734375, + "learning_rate": 0.00012193799299466873, + "loss": 0.8836, + "step": 28870 + }, + { + "epoch": 0.7413263234588466, + "grad_norm": 0.8046875, + "learning_rate": 0.00012193363752127927, + "loss": 0.8229, + "step": 28871 + }, + { + "epoch": 0.7413520006547685, + "grad_norm": 0.80859375, + "learning_rate": 0.00012192928200417808, + "loss": 0.8054, + "step": 28872 + }, + { + "epoch": 0.7413776778506903, + "grad_norm": 0.8359375, + "learning_rate": 0.0001219249264433738, + "loss": 0.9297, + "step": 28873 + }, + { + "epoch": 0.7414033550466121, + "grad_norm": 0.859375, + "learning_rate": 0.00012192057083887514, + "loss": 0.7937, + "step": 28874 + }, + { + "epoch": 0.741429032242534, + "grad_norm": 0.7890625, + "learning_rate": 0.00012191621519069074, + "loss": 0.8101, + "step": 28875 + }, + { + "epoch": 0.7414547094384558, + "grad_norm": 0.8515625, + "learning_rate": 0.00012191185949882928, + "loss": 0.7995, + "step": 28876 + }, + { + "epoch": 0.7414803866343775, + "grad_norm": 0.75390625, + "learning_rate": 0.00012190750376329952, + "loss": 0.8189, + "step": 28877 + }, + { + "epoch": 0.7415060638302994, + "grad_norm": 0.8125, + "learning_rate": 0.00012190314798411005, + "loss": 0.839, + "step": 28878 + }, + { + "epoch": 0.7415317410262212, + "grad_norm": 0.8515625, + "learning_rate": 0.00012189879216126958, + "loss": 0.8803, + "step": 28879 + }, + { + "epoch": 0.741557418222143, + "grad_norm": 0.73828125, + "learning_rate": 0.00012189443629478683, + "loss": 0.7128, + "step": 28880 + }, + { + "epoch": 0.7415830954180649, + "grad_norm": 0.8359375, + "learning_rate": 0.0001218900803846704, + "loss": 0.8901, + "step": 28881 + }, + { + "epoch": 0.7416087726139867, + "grad_norm": 0.87109375, + "learning_rate": 0.00012188572443092905, + "loss": 0.7187, + "step": 28882 + }, + { + "epoch": 0.7416344498099086, + "grad_norm": 0.75390625, + "learning_rate": 0.0001218813684335714, + "loss": 0.9312, + "step": 28883 + }, + { + "epoch": 0.7416601270058303, + "grad_norm": 0.88671875, + "learning_rate": 0.00012187701239260619, + "loss": 0.8245, + "step": 28884 + }, + { + "epoch": 0.7416858042017521, + "grad_norm": 0.75, + "learning_rate": 0.00012187265630804205, + "loss": 0.8149, + "step": 28885 + }, + { + "epoch": 0.741711481397674, + "grad_norm": 0.765625, + "learning_rate": 0.00012186830017988768, + "loss": 0.7798, + "step": 28886 + }, + { + "epoch": 0.7417371585935958, + "grad_norm": 0.75390625, + "learning_rate": 0.0001218639440081518, + "loss": 0.7549, + "step": 28887 + }, + { + "epoch": 0.7417628357895176, + "grad_norm": 0.76171875, + "learning_rate": 0.00012185958779284302, + "loss": 0.9289, + "step": 28888 + }, + { + "epoch": 0.7417885129854395, + "grad_norm": 0.82421875, + "learning_rate": 0.00012185523153397005, + "loss": 0.7243, + "step": 28889 + }, + { + "epoch": 0.7418141901813613, + "grad_norm": 0.73046875, + "learning_rate": 0.0001218508752315416, + "loss": 0.7948, + "step": 28890 + }, + { + "epoch": 0.741839867377283, + "grad_norm": 0.7734375, + "learning_rate": 0.00012184651888556631, + "loss": 0.8921, + "step": 28891 + }, + { + "epoch": 0.7418655445732049, + "grad_norm": 0.74609375, + "learning_rate": 0.00012184216249605287, + "loss": 0.9306, + "step": 28892 + }, + { + "epoch": 0.7418912217691267, + "grad_norm": 0.80078125, + "learning_rate": 0.00012183780606301003, + "loss": 0.8514, + "step": 28893 + }, + { + "epoch": 0.7419168989650485, + "grad_norm": 0.796875, + "learning_rate": 0.00012183344958644638, + "loss": 0.8277, + "step": 28894 + }, + { + "epoch": 0.7419425761609704, + "grad_norm": 0.734375, + "learning_rate": 0.00012182909306637062, + "loss": 0.7663, + "step": 28895 + }, + { + "epoch": 0.7419682533568922, + "grad_norm": 0.78125, + "learning_rate": 0.00012182473650279147, + "loss": 0.7551, + "step": 28896 + }, + { + "epoch": 0.7419939305528139, + "grad_norm": 0.7421875, + "learning_rate": 0.00012182037989571757, + "loss": 0.7307, + "step": 28897 + }, + { + "epoch": 0.7420196077487358, + "grad_norm": 0.75, + "learning_rate": 0.00012181602324515764, + "loss": 0.8813, + "step": 28898 + }, + { + "epoch": 0.7420452849446576, + "grad_norm": 0.765625, + "learning_rate": 0.00012181166655112037, + "loss": 0.8435, + "step": 28899 + }, + { + "epoch": 0.7420709621405794, + "grad_norm": 0.74609375, + "learning_rate": 0.00012180730981361439, + "loss": 0.8167, + "step": 28900 + }, + { + "epoch": 0.7420966393365013, + "grad_norm": 0.79296875, + "learning_rate": 0.00012180295303264841, + "loss": 0.9262, + "step": 28901 + }, + { + "epoch": 0.7421223165324231, + "grad_norm": 0.7578125, + "learning_rate": 0.00012179859620823113, + "loss": 0.8366, + "step": 28902 + }, + { + "epoch": 0.742147993728345, + "grad_norm": 0.796875, + "learning_rate": 0.00012179423934037119, + "loss": 0.9848, + "step": 28903 + }, + { + "epoch": 0.7421736709242667, + "grad_norm": 0.83984375, + "learning_rate": 0.00012178988242907734, + "loss": 1.0096, + "step": 28904 + }, + { + "epoch": 0.7421993481201885, + "grad_norm": 0.85546875, + "learning_rate": 0.0001217855254743582, + "loss": 0.8606, + "step": 28905 + }, + { + "epoch": 0.7422250253161103, + "grad_norm": 0.7265625, + "learning_rate": 0.00012178116847622248, + "loss": 0.8788, + "step": 28906 + }, + { + "epoch": 0.7422507025120322, + "grad_norm": 0.7890625, + "learning_rate": 0.00012177681143467886, + "loss": 0.9463, + "step": 28907 + }, + { + "epoch": 0.742276379707954, + "grad_norm": 0.84375, + "learning_rate": 0.00012177245434973602, + "loss": 0.8221, + "step": 28908 + }, + { + "epoch": 0.7423020569038759, + "grad_norm": 0.71875, + "learning_rate": 0.00012176809722140263, + "loss": 0.7966, + "step": 28909 + }, + { + "epoch": 0.7423277340997977, + "grad_norm": 0.84765625, + "learning_rate": 0.0001217637400496874, + "loss": 0.8208, + "step": 28910 + }, + { + "epoch": 0.7423534112957194, + "grad_norm": 0.7734375, + "learning_rate": 0.00012175938283459898, + "loss": 0.8643, + "step": 28911 + }, + { + "epoch": 0.7423790884916412, + "grad_norm": 1.03125, + "learning_rate": 0.0001217550255761461, + "loss": 0.794, + "step": 28912 + }, + { + "epoch": 0.7424047656875631, + "grad_norm": 0.7734375, + "learning_rate": 0.00012175066827433743, + "loss": 0.73, + "step": 28913 + }, + { + "epoch": 0.7424304428834849, + "grad_norm": 0.76171875, + "learning_rate": 0.00012174631092918162, + "loss": 0.7922, + "step": 28914 + }, + { + "epoch": 0.7424561200794068, + "grad_norm": 0.78515625, + "learning_rate": 0.00012174195354068741, + "loss": 0.9719, + "step": 28915 + }, + { + "epoch": 0.7424817972753286, + "grad_norm": 0.83984375, + "learning_rate": 0.00012173759610886342, + "loss": 0.8597, + "step": 28916 + }, + { + "epoch": 0.7425074744712503, + "grad_norm": 0.734375, + "learning_rate": 0.00012173323863371837, + "loss": 0.8978, + "step": 28917 + }, + { + "epoch": 0.7425331516671722, + "grad_norm": 0.73828125, + "learning_rate": 0.00012172888111526095, + "loss": 0.9154, + "step": 28918 + }, + { + "epoch": 0.742558828863094, + "grad_norm": 0.83203125, + "learning_rate": 0.00012172452355349982, + "loss": 0.8941, + "step": 28919 + }, + { + "epoch": 0.7425845060590158, + "grad_norm": 0.85546875, + "learning_rate": 0.00012172016594844369, + "loss": 0.9617, + "step": 28920 + }, + { + "epoch": 0.7426101832549377, + "grad_norm": 0.75390625, + "learning_rate": 0.00012171580830010119, + "loss": 0.7203, + "step": 28921 + }, + { + "epoch": 0.7426358604508595, + "grad_norm": 0.70703125, + "learning_rate": 0.00012171145060848108, + "loss": 0.8168, + "step": 28922 + }, + { + "epoch": 0.7426615376467813, + "grad_norm": 0.7265625, + "learning_rate": 0.00012170709287359203, + "loss": 0.6673, + "step": 28923 + }, + { + "epoch": 0.7426872148427031, + "grad_norm": 0.74609375, + "learning_rate": 0.00012170273509544265, + "loss": 0.7401, + "step": 28924 + }, + { + "epoch": 0.7427128920386249, + "grad_norm": 0.7578125, + "learning_rate": 0.00012169837727404172, + "loss": 0.8454, + "step": 28925 + }, + { + "epoch": 0.7427385692345467, + "grad_norm": 0.7265625, + "learning_rate": 0.00012169401940939788, + "loss": 0.7929, + "step": 28926 + }, + { + "epoch": 0.7427642464304686, + "grad_norm": 0.79296875, + "learning_rate": 0.00012168966150151981, + "loss": 0.7781, + "step": 28927 + }, + { + "epoch": 0.7427899236263904, + "grad_norm": 0.75, + "learning_rate": 0.00012168530355041623, + "loss": 0.8472, + "step": 28928 + }, + { + "epoch": 0.7428156008223122, + "grad_norm": 0.83984375, + "learning_rate": 0.00012168094555609574, + "loss": 0.7237, + "step": 28929 + }, + { + "epoch": 0.742841278018234, + "grad_norm": 0.84375, + "learning_rate": 0.00012167658751856714, + "loss": 0.8164, + "step": 28930 + }, + { + "epoch": 0.7428669552141558, + "grad_norm": 0.7109375, + "learning_rate": 0.00012167222943783904, + "loss": 0.8717, + "step": 28931 + }, + { + "epoch": 0.7428926324100776, + "grad_norm": 0.7421875, + "learning_rate": 0.00012166787131392014, + "loss": 0.8029, + "step": 28932 + }, + { + "epoch": 0.7429183096059995, + "grad_norm": 0.7421875, + "learning_rate": 0.00012166351314681916, + "loss": 0.7552, + "step": 28933 + }, + { + "epoch": 0.7429439868019213, + "grad_norm": 0.78125, + "learning_rate": 0.0001216591549365447, + "loss": 0.8081, + "step": 28934 + }, + { + "epoch": 0.7429696639978431, + "grad_norm": 0.75390625, + "learning_rate": 0.00012165479668310554, + "loss": 0.8648, + "step": 28935 + }, + { + "epoch": 0.742995341193765, + "grad_norm": 0.79296875, + "learning_rate": 0.00012165043838651031, + "loss": 0.8288, + "step": 28936 + }, + { + "epoch": 0.7430210183896867, + "grad_norm": 0.74609375, + "learning_rate": 0.00012164608004676772, + "loss": 0.7406, + "step": 28937 + }, + { + "epoch": 0.7430466955856085, + "grad_norm": 0.76953125, + "learning_rate": 0.00012164172166388643, + "loss": 0.7695, + "step": 28938 + }, + { + "epoch": 0.7430723727815304, + "grad_norm": 0.76953125, + "learning_rate": 0.0001216373632378752, + "loss": 0.7296, + "step": 28939 + }, + { + "epoch": 0.7430980499774522, + "grad_norm": 0.7578125, + "learning_rate": 0.0001216330047687426, + "loss": 0.8797, + "step": 28940 + }, + { + "epoch": 0.743123727173374, + "grad_norm": 0.78125, + "learning_rate": 0.0001216286462564974, + "loss": 0.7964, + "step": 28941 + }, + { + "epoch": 0.7431494043692959, + "grad_norm": 0.75390625, + "learning_rate": 0.00012162428770114826, + "loss": 0.755, + "step": 28942 + }, + { + "epoch": 0.7431750815652177, + "grad_norm": 0.7890625, + "learning_rate": 0.00012161992910270386, + "loss": 0.9727, + "step": 28943 + }, + { + "epoch": 0.7432007587611394, + "grad_norm": 0.8203125, + "learning_rate": 0.0001216155704611729, + "loss": 0.8801, + "step": 28944 + }, + { + "epoch": 0.7432264359570613, + "grad_norm": 0.7109375, + "learning_rate": 0.00012161121177656404, + "loss": 0.7293, + "step": 28945 + }, + { + "epoch": 0.7432521131529831, + "grad_norm": 0.7109375, + "learning_rate": 0.00012160685304888602, + "loss": 0.7957, + "step": 28946 + }, + { + "epoch": 0.743277790348905, + "grad_norm": 0.75390625, + "learning_rate": 0.0001216024942781475, + "loss": 0.7432, + "step": 28947 + }, + { + "epoch": 0.7433034675448268, + "grad_norm": 0.7734375, + "learning_rate": 0.00012159813546435712, + "loss": 0.7085, + "step": 28948 + }, + { + "epoch": 0.7433291447407486, + "grad_norm": 1.03125, + "learning_rate": 0.00012159377660752362, + "loss": 0.8484, + "step": 28949 + }, + { + "epoch": 0.7433548219366704, + "grad_norm": 0.80859375, + "learning_rate": 0.00012158941770765569, + "loss": 0.8019, + "step": 28950 + }, + { + "epoch": 0.7433804991325922, + "grad_norm": 0.78125, + "learning_rate": 0.00012158505876476199, + "loss": 0.8656, + "step": 28951 + }, + { + "epoch": 0.743406176328514, + "grad_norm": 0.78125, + "learning_rate": 0.00012158069977885122, + "loss": 0.7746, + "step": 28952 + }, + { + "epoch": 0.7434318535244359, + "grad_norm": 0.76171875, + "learning_rate": 0.00012157634074993204, + "loss": 0.7171, + "step": 28953 + }, + { + "epoch": 0.7434575307203577, + "grad_norm": 0.7734375, + "learning_rate": 0.0001215719816780132, + "loss": 0.7883, + "step": 28954 + }, + { + "epoch": 0.7434832079162795, + "grad_norm": 0.73046875, + "learning_rate": 0.00012156762256310335, + "loss": 0.7895, + "step": 28955 + }, + { + "epoch": 0.7435088851122014, + "grad_norm": 0.8203125, + "learning_rate": 0.00012156326340521114, + "loss": 0.8493, + "step": 28956 + }, + { + "epoch": 0.7435345623081231, + "grad_norm": 0.84765625, + "learning_rate": 0.00012155890420434533, + "loss": 0.8337, + "step": 28957 + }, + { + "epoch": 0.7435602395040449, + "grad_norm": 0.80859375, + "learning_rate": 0.00012155454496051455, + "loss": 0.8412, + "step": 28958 + }, + { + "epoch": 0.7435859166999668, + "grad_norm": 0.76953125, + "learning_rate": 0.00012155018567372752, + "loss": 0.7875, + "step": 28959 + }, + { + "epoch": 0.7436115938958886, + "grad_norm": 0.82421875, + "learning_rate": 0.00012154582634399292, + "loss": 0.8274, + "step": 28960 + }, + { + "epoch": 0.7436372710918104, + "grad_norm": 0.76953125, + "learning_rate": 0.00012154146697131941, + "loss": 0.9705, + "step": 28961 + }, + { + "epoch": 0.7436629482877323, + "grad_norm": 0.7578125, + "learning_rate": 0.00012153710755571573, + "loss": 0.6963, + "step": 28962 + }, + { + "epoch": 0.7436886254836541, + "grad_norm": 0.83984375, + "learning_rate": 0.00012153274809719053, + "loss": 0.9789, + "step": 28963 + }, + { + "epoch": 0.7437143026795758, + "grad_norm": 0.8125, + "learning_rate": 0.00012152838859575248, + "loss": 0.8561, + "step": 28964 + }, + { + "epoch": 0.7437399798754977, + "grad_norm": 0.828125, + "learning_rate": 0.00012152402905141036, + "loss": 0.8496, + "step": 28965 + }, + { + "epoch": 0.7437656570714195, + "grad_norm": 0.80078125, + "learning_rate": 0.00012151966946417272, + "loss": 0.8086, + "step": 28966 + }, + { + "epoch": 0.7437913342673413, + "grad_norm": 0.77734375, + "learning_rate": 0.00012151530983404837, + "loss": 0.757, + "step": 28967 + }, + { + "epoch": 0.7438170114632632, + "grad_norm": 0.703125, + "learning_rate": 0.00012151095016104595, + "loss": 0.8284, + "step": 28968 + }, + { + "epoch": 0.743842688659185, + "grad_norm": 0.68359375, + "learning_rate": 0.00012150659044517412, + "loss": 0.7779, + "step": 28969 + }, + { + "epoch": 0.7438683658551067, + "grad_norm": 0.8203125, + "learning_rate": 0.00012150223068644162, + "loss": 0.8112, + "step": 28970 + }, + { + "epoch": 0.7438940430510286, + "grad_norm": 0.75390625, + "learning_rate": 0.00012149787088485713, + "loss": 0.8104, + "step": 28971 + }, + { + "epoch": 0.7439197202469504, + "grad_norm": 0.8203125, + "learning_rate": 0.00012149351104042932, + "loss": 0.9034, + "step": 28972 + }, + { + "epoch": 0.7439453974428722, + "grad_norm": 0.8203125, + "learning_rate": 0.00012148915115316688, + "loss": 0.83, + "step": 28973 + }, + { + "epoch": 0.7439710746387941, + "grad_norm": 0.86328125, + "learning_rate": 0.00012148479122307847, + "loss": 0.8773, + "step": 28974 + }, + { + "epoch": 0.7439967518347159, + "grad_norm": 0.8046875, + "learning_rate": 0.00012148043125017283, + "loss": 0.8636, + "step": 28975 + }, + { + "epoch": 0.7440224290306378, + "grad_norm": 0.80078125, + "learning_rate": 0.00012147607123445865, + "loss": 0.9144, + "step": 28976 + }, + { + "epoch": 0.7440481062265595, + "grad_norm": 0.76953125, + "learning_rate": 0.00012147171117594458, + "loss": 0.8029, + "step": 28977 + }, + { + "epoch": 0.7440737834224813, + "grad_norm": 0.69921875, + "learning_rate": 0.00012146735107463935, + "loss": 0.7964, + "step": 28978 + }, + { + "epoch": 0.7440994606184032, + "grad_norm": 0.71875, + "learning_rate": 0.00012146299093055163, + "loss": 0.7782, + "step": 28979 + }, + { + "epoch": 0.744125137814325, + "grad_norm": 0.703125, + "learning_rate": 0.00012145863074369008, + "loss": 0.8349, + "step": 28980 + }, + { + "epoch": 0.7441508150102468, + "grad_norm": 0.90234375, + "learning_rate": 0.00012145427051406344, + "loss": 0.7359, + "step": 28981 + }, + { + "epoch": 0.7441764922061687, + "grad_norm": 0.765625, + "learning_rate": 0.00012144991024168038, + "loss": 0.9569, + "step": 28982 + }, + { + "epoch": 0.7442021694020905, + "grad_norm": 0.75, + "learning_rate": 0.00012144554992654956, + "loss": 0.8219, + "step": 28983 + }, + { + "epoch": 0.7442278465980122, + "grad_norm": 0.765625, + "learning_rate": 0.00012144118956867971, + "loss": 0.8179, + "step": 28984 + }, + { + "epoch": 0.7442535237939341, + "grad_norm": 0.7890625, + "learning_rate": 0.0001214368291680795, + "loss": 0.8038, + "step": 28985 + }, + { + "epoch": 0.7442792009898559, + "grad_norm": 0.73828125, + "learning_rate": 0.00012143246872475764, + "loss": 0.7191, + "step": 28986 + }, + { + "epoch": 0.7443048781857777, + "grad_norm": 0.796875, + "learning_rate": 0.00012142810823872281, + "loss": 0.8161, + "step": 28987 + }, + { + "epoch": 0.7443305553816996, + "grad_norm": 0.75390625, + "learning_rate": 0.00012142374770998366, + "loss": 0.6849, + "step": 28988 + }, + { + "epoch": 0.7443562325776214, + "grad_norm": 0.80078125, + "learning_rate": 0.00012141938713854894, + "loss": 0.9828, + "step": 28989 + }, + { + "epoch": 0.7443819097735431, + "grad_norm": 0.84765625, + "learning_rate": 0.00012141502652442734, + "loss": 0.7973, + "step": 28990 + }, + { + "epoch": 0.744407586969465, + "grad_norm": 0.80078125, + "learning_rate": 0.00012141066586762751, + "loss": 0.802, + "step": 28991 + }, + { + "epoch": 0.7444332641653868, + "grad_norm": 0.7265625, + "learning_rate": 0.00012140630516815814, + "loss": 0.7882, + "step": 28992 + }, + { + "epoch": 0.7444589413613086, + "grad_norm": 0.79296875, + "learning_rate": 0.00012140194442602794, + "loss": 0.798, + "step": 28993 + }, + { + "epoch": 0.7444846185572305, + "grad_norm": 0.7578125, + "learning_rate": 0.0001213975836412456, + "loss": 0.7491, + "step": 28994 + }, + { + "epoch": 0.7445102957531523, + "grad_norm": 0.82421875, + "learning_rate": 0.00012139322281381981, + "loss": 0.7667, + "step": 28995 + }, + { + "epoch": 0.7445359729490741, + "grad_norm": 0.75390625, + "learning_rate": 0.00012138886194375927, + "loss": 0.899, + "step": 28996 + }, + { + "epoch": 0.7445616501449959, + "grad_norm": 0.796875, + "learning_rate": 0.00012138450103107266, + "loss": 0.7297, + "step": 28997 + }, + { + "epoch": 0.7445873273409177, + "grad_norm": 0.75, + "learning_rate": 0.00012138014007576866, + "loss": 0.7889, + "step": 28998 + }, + { + "epoch": 0.7446130045368395, + "grad_norm": 0.78125, + "learning_rate": 0.00012137577907785598, + "loss": 0.9599, + "step": 28999 + }, + { + "epoch": 0.7446386817327614, + "grad_norm": 0.75, + "learning_rate": 0.00012137141803734331, + "loss": 0.7178, + "step": 29000 + }, + { + "epoch": 0.7446386817327614, + "eval_loss": 0.8222226500511169, + "eval_runtime": 395.4366, + "eval_samples_per_second": 25.289, + "eval_steps_per_second": 0.792, + "step": 29000 + }, + { + "epoch": 0.7446643589286832, + "grad_norm": 0.72265625, + "learning_rate": 0.00012136705695423932, + "loss": 0.7878, + "step": 29001 + }, + { + "epoch": 0.744690036124605, + "grad_norm": 0.8828125, + "learning_rate": 0.0001213626958285527, + "loss": 0.845, + "step": 29002 + }, + { + "epoch": 0.7447157133205269, + "grad_norm": 0.7734375, + "learning_rate": 0.00012135833466029218, + "loss": 0.81, + "step": 29003 + }, + { + "epoch": 0.7447413905164486, + "grad_norm": 0.7265625, + "learning_rate": 0.00012135397344946642, + "loss": 0.8514, + "step": 29004 + }, + { + "epoch": 0.7447670677123704, + "grad_norm": 0.76171875, + "learning_rate": 0.00012134961219608416, + "loss": 0.89, + "step": 29005 + }, + { + "epoch": 0.7447927449082923, + "grad_norm": 0.81640625, + "learning_rate": 0.00012134525090015399, + "loss": 0.767, + "step": 29006 + }, + { + "epoch": 0.7448184221042141, + "grad_norm": 0.796875, + "learning_rate": 0.0001213408895616847, + "loss": 0.8595, + "step": 29007 + }, + { + "epoch": 0.744844099300136, + "grad_norm": 0.9296875, + "learning_rate": 0.00012133652818068494, + "loss": 0.8183, + "step": 29008 + }, + { + "epoch": 0.7448697764960578, + "grad_norm": 0.82421875, + "learning_rate": 0.00012133216675716339, + "loss": 0.8639, + "step": 29009 + }, + { + "epoch": 0.7448954536919795, + "grad_norm": 0.75390625, + "learning_rate": 0.00012132780529112877, + "loss": 0.8525, + "step": 29010 + }, + { + "epoch": 0.7449211308879014, + "grad_norm": 0.87109375, + "learning_rate": 0.00012132344378258977, + "loss": 0.8043, + "step": 29011 + }, + { + "epoch": 0.7449468080838232, + "grad_norm": 0.78515625, + "learning_rate": 0.00012131908223155506, + "loss": 0.7321, + "step": 29012 + }, + { + "epoch": 0.744972485279745, + "grad_norm": 0.88671875, + "learning_rate": 0.00012131472063803335, + "loss": 0.815, + "step": 29013 + }, + { + "epoch": 0.7449981624756669, + "grad_norm": 0.84765625, + "learning_rate": 0.00012131035900203332, + "loss": 0.8416, + "step": 29014 + }, + { + "epoch": 0.7450238396715887, + "grad_norm": 0.74609375, + "learning_rate": 0.00012130599732356366, + "loss": 0.9004, + "step": 29015 + }, + { + "epoch": 0.7450495168675105, + "grad_norm": 0.7109375, + "learning_rate": 0.0001213016356026331, + "loss": 0.76, + "step": 29016 + }, + { + "epoch": 0.7450751940634323, + "grad_norm": 0.796875, + "learning_rate": 0.00012129727383925027, + "loss": 0.793, + "step": 29017 + }, + { + "epoch": 0.7451008712593541, + "grad_norm": 0.7890625, + "learning_rate": 0.00012129291203342394, + "loss": 0.8701, + "step": 29018 + }, + { + "epoch": 0.7451265484552759, + "grad_norm": 0.85546875, + "learning_rate": 0.00012128855018516274, + "loss": 0.9796, + "step": 29019 + }, + { + "epoch": 0.7451522256511978, + "grad_norm": 0.8359375, + "learning_rate": 0.00012128418829447539, + "loss": 0.9429, + "step": 29020 + }, + { + "epoch": 0.7451779028471196, + "grad_norm": 0.80078125, + "learning_rate": 0.00012127982636137057, + "loss": 0.7862, + "step": 29021 + }, + { + "epoch": 0.7452035800430414, + "grad_norm": 0.73828125, + "learning_rate": 0.00012127546438585695, + "loss": 0.8347, + "step": 29022 + }, + { + "epoch": 0.7452292572389633, + "grad_norm": 0.75, + "learning_rate": 0.00012127110236794328, + "loss": 0.7715, + "step": 29023 + }, + { + "epoch": 0.745254934434885, + "grad_norm": 0.765625, + "learning_rate": 0.00012126674030763823, + "loss": 0.7888, + "step": 29024 + }, + { + "epoch": 0.7452806116308068, + "grad_norm": 0.78125, + "learning_rate": 0.00012126237820495048, + "loss": 0.7698, + "step": 29025 + }, + { + "epoch": 0.7453062888267287, + "grad_norm": 0.88671875, + "learning_rate": 0.00012125801605988871, + "loss": 0.8403, + "step": 29026 + }, + { + "epoch": 0.7453319660226505, + "grad_norm": 0.75, + "learning_rate": 0.00012125365387246165, + "loss": 0.8533, + "step": 29027 + }, + { + "epoch": 0.7453576432185723, + "grad_norm": 0.8046875, + "learning_rate": 0.00012124929164267797, + "loss": 0.8405, + "step": 29028 + }, + { + "epoch": 0.7453833204144942, + "grad_norm": 0.8515625, + "learning_rate": 0.00012124492937054639, + "loss": 0.9236, + "step": 29029 + }, + { + "epoch": 0.7454089976104159, + "grad_norm": 0.75, + "learning_rate": 0.00012124056705607558, + "loss": 0.7794, + "step": 29030 + }, + { + "epoch": 0.7454346748063377, + "grad_norm": 0.75, + "learning_rate": 0.00012123620469927423, + "loss": 0.8591, + "step": 29031 + }, + { + "epoch": 0.7454603520022596, + "grad_norm": 1.03125, + "learning_rate": 0.00012123184230015106, + "loss": 0.8384, + "step": 29032 + }, + { + "epoch": 0.7454860291981814, + "grad_norm": 0.8203125, + "learning_rate": 0.00012122747985871472, + "loss": 0.8409, + "step": 29033 + }, + { + "epoch": 0.7455117063941032, + "grad_norm": 0.7265625, + "learning_rate": 0.00012122311737497396, + "loss": 0.723, + "step": 29034 + }, + { + "epoch": 0.7455373835900251, + "grad_norm": 0.734375, + "learning_rate": 0.00012121875484893742, + "loss": 0.8291, + "step": 29035 + }, + { + "epoch": 0.7455630607859469, + "grad_norm": 0.765625, + "learning_rate": 0.0001212143922806138, + "loss": 0.7325, + "step": 29036 + }, + { + "epoch": 0.7455887379818686, + "grad_norm": 0.80859375, + "learning_rate": 0.00012121002967001188, + "loss": 0.8461, + "step": 29037 + }, + { + "epoch": 0.7456144151777905, + "grad_norm": 0.76171875, + "learning_rate": 0.00012120566701714023, + "loss": 0.8645, + "step": 29038 + }, + { + "epoch": 0.7456400923737123, + "grad_norm": 0.87890625, + "learning_rate": 0.0001212013043220076, + "loss": 0.8381, + "step": 29039 + }, + { + "epoch": 0.7456657695696342, + "grad_norm": 0.70703125, + "learning_rate": 0.00012119694158462271, + "loss": 0.6952, + "step": 29040 + }, + { + "epoch": 0.745691446765556, + "grad_norm": 0.80078125, + "learning_rate": 0.00012119257880499422, + "loss": 0.7362, + "step": 29041 + }, + { + "epoch": 0.7457171239614778, + "grad_norm": 0.81640625, + "learning_rate": 0.00012118821598313084, + "loss": 0.8542, + "step": 29042 + }, + { + "epoch": 0.7457428011573997, + "grad_norm": 0.8671875, + "learning_rate": 0.00012118385311904125, + "loss": 0.8121, + "step": 29043 + }, + { + "epoch": 0.7457684783533214, + "grad_norm": 0.765625, + "learning_rate": 0.00012117949021273416, + "loss": 0.8062, + "step": 29044 + }, + { + "epoch": 0.7457941555492432, + "grad_norm": 0.76171875, + "learning_rate": 0.00012117512726421827, + "loss": 0.9317, + "step": 29045 + }, + { + "epoch": 0.7458198327451651, + "grad_norm": 0.75, + "learning_rate": 0.00012117076427350225, + "loss": 0.8727, + "step": 29046 + }, + { + "epoch": 0.7458455099410869, + "grad_norm": 0.82421875, + "learning_rate": 0.0001211664012405948, + "loss": 0.7826, + "step": 29047 + }, + { + "epoch": 0.7458711871370087, + "grad_norm": 0.78515625, + "learning_rate": 0.00012116203816550464, + "loss": 0.7142, + "step": 29048 + }, + { + "epoch": 0.7458968643329306, + "grad_norm": 0.77734375, + "learning_rate": 0.00012115767504824043, + "loss": 0.8468, + "step": 29049 + }, + { + "epoch": 0.7459225415288523, + "grad_norm": 0.85546875, + "learning_rate": 0.00012115331188881092, + "loss": 0.8231, + "step": 29050 + }, + { + "epoch": 0.7459482187247741, + "grad_norm": 0.796875, + "learning_rate": 0.00012114894868722475, + "loss": 0.7126, + "step": 29051 + }, + { + "epoch": 0.745973895920696, + "grad_norm": 0.734375, + "learning_rate": 0.00012114458544349063, + "loss": 0.8648, + "step": 29052 + }, + { + "epoch": 0.7459995731166178, + "grad_norm": 0.78125, + "learning_rate": 0.00012114022215761726, + "loss": 0.8073, + "step": 29053 + }, + { + "epoch": 0.7460252503125396, + "grad_norm": 0.76953125, + "learning_rate": 0.00012113585882961334, + "loss": 0.7626, + "step": 29054 + }, + { + "epoch": 0.7460509275084615, + "grad_norm": 0.79296875, + "learning_rate": 0.00012113149545948758, + "loss": 0.8101, + "step": 29055 + }, + { + "epoch": 0.7460766047043833, + "grad_norm": 0.80078125, + "learning_rate": 0.00012112713204724863, + "loss": 0.7461, + "step": 29056 + }, + { + "epoch": 0.746102281900305, + "grad_norm": 0.76953125, + "learning_rate": 0.00012112276859290526, + "loss": 0.7522, + "step": 29057 + }, + { + "epoch": 0.7461279590962269, + "grad_norm": 0.828125, + "learning_rate": 0.00012111840509646608, + "loss": 0.8358, + "step": 29058 + }, + { + "epoch": 0.7461536362921487, + "grad_norm": 0.7734375, + "learning_rate": 0.00012111404155793982, + "loss": 0.8463, + "step": 29059 + }, + { + "epoch": 0.7461793134880705, + "grad_norm": 0.74609375, + "learning_rate": 0.00012110967797733521, + "loss": 0.8368, + "step": 29060 + }, + { + "epoch": 0.7462049906839924, + "grad_norm": 0.8828125, + "learning_rate": 0.0001211053143546609, + "loss": 0.9075, + "step": 29061 + }, + { + "epoch": 0.7462306678799142, + "grad_norm": 0.8046875, + "learning_rate": 0.00012110095068992562, + "loss": 0.8826, + "step": 29062 + }, + { + "epoch": 0.746256345075836, + "grad_norm": 0.76171875, + "learning_rate": 0.00012109658698313803, + "loss": 0.8764, + "step": 29063 + }, + { + "epoch": 0.7462820222717578, + "grad_norm": 0.7890625, + "learning_rate": 0.00012109222323430689, + "loss": 0.8301, + "step": 29064 + }, + { + "epoch": 0.7463076994676796, + "grad_norm": 0.76953125, + "learning_rate": 0.00012108785944344081, + "loss": 0.9575, + "step": 29065 + }, + { + "epoch": 0.7463333766636014, + "grad_norm": 1.7734375, + "learning_rate": 0.00012108349561054855, + "loss": 0.8462, + "step": 29066 + }, + { + "epoch": 0.7463590538595233, + "grad_norm": 0.765625, + "learning_rate": 0.00012107913173563878, + "loss": 0.7985, + "step": 29067 + }, + { + "epoch": 0.7463847310554451, + "grad_norm": 0.8125, + "learning_rate": 0.0001210747678187202, + "loss": 0.8056, + "step": 29068 + }, + { + "epoch": 0.746410408251367, + "grad_norm": 0.80859375, + "learning_rate": 0.00012107040385980154, + "loss": 0.8361, + "step": 29069 + }, + { + "epoch": 0.7464360854472887, + "grad_norm": 0.79296875, + "learning_rate": 0.00012106603985889144, + "loss": 0.822, + "step": 29070 + }, + { + "epoch": 0.7464617626432105, + "grad_norm": 0.75, + "learning_rate": 0.00012106167581599863, + "loss": 0.7792, + "step": 29071 + }, + { + "epoch": 0.7464874398391323, + "grad_norm": 0.76953125, + "learning_rate": 0.00012105731173113182, + "loss": 0.8084, + "step": 29072 + }, + { + "epoch": 0.7465131170350542, + "grad_norm": 0.7734375, + "learning_rate": 0.00012105294760429965, + "loss": 0.8239, + "step": 29073 + }, + { + "epoch": 0.746538794230976, + "grad_norm": 0.73046875, + "learning_rate": 0.00012104858343551089, + "loss": 0.6579, + "step": 29074 + }, + { + "epoch": 0.7465644714268979, + "grad_norm": 0.72265625, + "learning_rate": 0.0001210442192247742, + "loss": 0.713, + "step": 29075 + }, + { + "epoch": 0.7465901486228197, + "grad_norm": 0.74609375, + "learning_rate": 0.00012103985497209826, + "loss": 0.7369, + "step": 29076 + }, + { + "epoch": 0.7466158258187414, + "grad_norm": 0.7578125, + "learning_rate": 0.00012103549067749185, + "loss": 0.69, + "step": 29077 + }, + { + "epoch": 0.7466415030146633, + "grad_norm": 0.79296875, + "learning_rate": 0.00012103112634096354, + "loss": 0.9809, + "step": 29078 + }, + { + "epoch": 0.7466671802105851, + "grad_norm": 0.796875, + "learning_rate": 0.00012102676196252213, + "loss": 0.8245, + "step": 29079 + }, + { + "epoch": 0.7466928574065069, + "grad_norm": 0.70703125, + "learning_rate": 0.00012102239754217628, + "loss": 0.7205, + "step": 29080 + }, + { + "epoch": 0.7467185346024288, + "grad_norm": 0.828125, + "learning_rate": 0.00012101803307993466, + "loss": 0.8925, + "step": 29081 + }, + { + "epoch": 0.7467442117983506, + "grad_norm": 0.84375, + "learning_rate": 0.00012101366857580605, + "loss": 0.9112, + "step": 29082 + }, + { + "epoch": 0.7467698889942724, + "grad_norm": 0.75390625, + "learning_rate": 0.00012100930402979906, + "loss": 0.8562, + "step": 29083 + }, + { + "epoch": 0.7467955661901942, + "grad_norm": 0.78125, + "learning_rate": 0.00012100493944192246, + "loss": 0.8213, + "step": 29084 + }, + { + "epoch": 0.746821243386116, + "grad_norm": 0.7734375, + "learning_rate": 0.00012100057481218487, + "loss": 0.8185, + "step": 29085 + }, + { + "epoch": 0.7468469205820378, + "grad_norm": 0.7734375, + "learning_rate": 0.00012099621014059503, + "loss": 0.8432, + "step": 29086 + }, + { + "epoch": 0.7468725977779597, + "grad_norm": 0.75, + "learning_rate": 0.00012099184542716169, + "loss": 0.744, + "step": 29087 + }, + { + "epoch": 0.7468982749738815, + "grad_norm": 0.73046875, + "learning_rate": 0.00012098748067189345, + "loss": 0.8802, + "step": 29088 + }, + { + "epoch": 0.7469239521698033, + "grad_norm": 0.77734375, + "learning_rate": 0.00012098311587479907, + "loss": 0.7565, + "step": 29089 + }, + { + "epoch": 0.7469496293657251, + "grad_norm": 0.68359375, + "learning_rate": 0.00012097875103588727, + "loss": 0.8472, + "step": 29090 + }, + { + "epoch": 0.7469753065616469, + "grad_norm": 0.79296875, + "learning_rate": 0.00012097438615516665, + "loss": 0.8682, + "step": 29091 + }, + { + "epoch": 0.7470009837575687, + "grad_norm": 0.765625, + "learning_rate": 0.00012097002123264603, + "loss": 0.7984, + "step": 29092 + }, + { + "epoch": 0.7470266609534906, + "grad_norm": 0.7890625, + "learning_rate": 0.00012096565626833403, + "loss": 0.8506, + "step": 29093 + }, + { + "epoch": 0.7470523381494124, + "grad_norm": 0.8125, + "learning_rate": 0.00012096129126223937, + "loss": 0.8541, + "step": 29094 + }, + { + "epoch": 0.7470780153453342, + "grad_norm": 0.7421875, + "learning_rate": 0.00012095692621437074, + "loss": 0.6858, + "step": 29095 + }, + { + "epoch": 0.7471036925412561, + "grad_norm": 0.76171875, + "learning_rate": 0.00012095256112473685, + "loss": 0.8003, + "step": 29096 + }, + { + "epoch": 0.7471293697371778, + "grad_norm": 0.828125, + "learning_rate": 0.00012094819599334641, + "loss": 0.8833, + "step": 29097 + }, + { + "epoch": 0.7471550469330996, + "grad_norm": 0.80078125, + "learning_rate": 0.00012094383082020812, + "loss": 0.9121, + "step": 29098 + }, + { + "epoch": 0.7471807241290215, + "grad_norm": 0.75, + "learning_rate": 0.00012093946560533065, + "loss": 0.8491, + "step": 29099 + }, + { + "epoch": 0.7472064013249433, + "grad_norm": 0.6953125, + "learning_rate": 0.0001209351003487227, + "loss": 0.7705, + "step": 29100 + }, + { + "epoch": 0.7472320785208652, + "grad_norm": 0.734375, + "learning_rate": 0.00012093073505039303, + "loss": 0.7934, + "step": 29101 + }, + { + "epoch": 0.747257755716787, + "grad_norm": 0.74609375, + "learning_rate": 0.00012092636971035025, + "loss": 0.869, + "step": 29102 + }, + { + "epoch": 0.7472834329127088, + "grad_norm": 0.796875, + "learning_rate": 0.00012092200432860314, + "loss": 0.7442, + "step": 29103 + }, + { + "epoch": 0.7473091101086305, + "grad_norm": 0.78125, + "learning_rate": 0.00012091763890516037, + "loss": 0.9381, + "step": 29104 + }, + { + "epoch": 0.7473347873045524, + "grad_norm": 0.82421875, + "learning_rate": 0.00012091327344003059, + "loss": 0.8262, + "step": 29105 + }, + { + "epoch": 0.7473604645004742, + "grad_norm": 0.7421875, + "learning_rate": 0.00012090890793322258, + "loss": 0.7763, + "step": 29106 + }, + { + "epoch": 0.7473861416963961, + "grad_norm": 0.76171875, + "learning_rate": 0.00012090454238474498, + "loss": 0.8319, + "step": 29107 + }, + { + "epoch": 0.7474118188923179, + "grad_norm": 0.7734375, + "learning_rate": 0.00012090017679460653, + "loss": 0.9221, + "step": 29108 + }, + { + "epoch": 0.7474374960882397, + "grad_norm": 0.8203125, + "learning_rate": 0.00012089581116281591, + "loss": 0.9828, + "step": 29109 + }, + { + "epoch": 0.7474631732841615, + "grad_norm": 0.7265625, + "learning_rate": 0.00012089144548938185, + "loss": 0.7177, + "step": 29110 + }, + { + "epoch": 0.7474888504800833, + "grad_norm": 0.796875, + "learning_rate": 0.00012088707977431302, + "loss": 0.928, + "step": 29111 + }, + { + "epoch": 0.7475145276760051, + "grad_norm": 0.86328125, + "learning_rate": 0.00012088271401761813, + "loss": 0.8537, + "step": 29112 + }, + { + "epoch": 0.747540204871927, + "grad_norm": 0.78125, + "learning_rate": 0.00012087834821930584, + "loss": 0.8757, + "step": 29113 + }, + { + "epoch": 0.7475658820678488, + "grad_norm": 0.86328125, + "learning_rate": 0.00012087398237938492, + "loss": 0.8144, + "step": 29114 + }, + { + "epoch": 0.7475915592637706, + "grad_norm": 0.86328125, + "learning_rate": 0.00012086961649786404, + "loss": 0.913, + "step": 29115 + }, + { + "epoch": 0.7476172364596925, + "grad_norm": 0.8046875, + "learning_rate": 0.00012086525057475188, + "loss": 1.0232, + "step": 29116 + }, + { + "epoch": 0.7476429136556142, + "grad_norm": 0.7421875, + "learning_rate": 0.00012086088461005718, + "loss": 0.8063, + "step": 29117 + }, + { + "epoch": 0.747668590851536, + "grad_norm": 0.72265625, + "learning_rate": 0.0001208565186037886, + "loss": 0.8103, + "step": 29118 + }, + { + "epoch": 0.7476942680474579, + "grad_norm": 0.80859375, + "learning_rate": 0.00012085215255595486, + "loss": 0.8914, + "step": 29119 + }, + { + "epoch": 0.7477199452433797, + "grad_norm": 0.76953125, + "learning_rate": 0.0001208477864665647, + "loss": 0.7988, + "step": 29120 + }, + { + "epoch": 0.7477456224393015, + "grad_norm": 0.7265625, + "learning_rate": 0.00012084342033562672, + "loss": 0.9089, + "step": 29121 + }, + { + "epoch": 0.7477712996352234, + "grad_norm": 1.03125, + "learning_rate": 0.00012083905416314974, + "loss": 0.8647, + "step": 29122 + }, + { + "epoch": 0.7477969768311451, + "grad_norm": 0.76171875, + "learning_rate": 0.00012083468794914241, + "loss": 0.8203, + "step": 29123 + }, + { + "epoch": 0.7478226540270669, + "grad_norm": 0.87109375, + "learning_rate": 0.00012083032169361342, + "loss": 0.8392, + "step": 29124 + }, + { + "epoch": 0.7478483312229888, + "grad_norm": 0.75390625, + "learning_rate": 0.00012082595539657146, + "loss": 0.8454, + "step": 29125 + }, + { + "epoch": 0.7478740084189106, + "grad_norm": 0.74609375, + "learning_rate": 0.00012082158905802526, + "loss": 0.8073, + "step": 29126 + }, + { + "epoch": 0.7478996856148324, + "grad_norm": 0.71875, + "learning_rate": 0.00012081722267798352, + "loss": 0.7803, + "step": 29127 + }, + { + "epoch": 0.7479253628107543, + "grad_norm": 0.86328125, + "learning_rate": 0.00012081285625645495, + "loss": 0.8851, + "step": 29128 + }, + { + "epoch": 0.7479510400066761, + "grad_norm": 0.8828125, + "learning_rate": 0.0001208084897934482, + "loss": 0.8979, + "step": 29129 + }, + { + "epoch": 0.7479767172025978, + "grad_norm": 0.75, + "learning_rate": 0.00012080412328897205, + "loss": 0.7674, + "step": 29130 + }, + { + "epoch": 0.7480023943985197, + "grad_norm": 0.77734375, + "learning_rate": 0.00012079975674303514, + "loss": 0.8659, + "step": 29131 + }, + { + "epoch": 0.7480280715944415, + "grad_norm": 0.76953125, + "learning_rate": 0.0001207953901556462, + "loss": 0.8999, + "step": 29132 + }, + { + "epoch": 0.7480537487903633, + "grad_norm": 0.78515625, + "learning_rate": 0.00012079102352681392, + "loss": 0.7333, + "step": 29133 + }, + { + "epoch": 0.7480794259862852, + "grad_norm": 0.69140625, + "learning_rate": 0.000120786656856547, + "loss": 0.7341, + "step": 29134 + }, + { + "epoch": 0.748105103182207, + "grad_norm": 0.75, + "learning_rate": 0.00012078229014485418, + "loss": 0.8, + "step": 29135 + }, + { + "epoch": 0.7481307803781289, + "grad_norm": 0.78515625, + "learning_rate": 0.00012077792339174412, + "loss": 0.8454, + "step": 29136 + }, + { + "epoch": 0.7481564575740506, + "grad_norm": 0.7734375, + "learning_rate": 0.00012077355659722553, + "loss": 0.755, + "step": 29137 + }, + { + "epoch": 0.7481821347699724, + "grad_norm": 0.76171875, + "learning_rate": 0.00012076918976130713, + "loss": 0.8459, + "step": 29138 + }, + { + "epoch": 0.7482078119658943, + "grad_norm": 0.8671875, + "learning_rate": 0.00012076482288399758, + "loss": 0.8307, + "step": 29139 + }, + { + "epoch": 0.7482334891618161, + "grad_norm": 0.89453125, + "learning_rate": 0.00012076045596530564, + "loss": 0.9164, + "step": 29140 + }, + { + "epoch": 0.7482591663577379, + "grad_norm": 0.77734375, + "learning_rate": 0.00012075608900523998, + "loss": 0.8372, + "step": 29141 + }, + { + "epoch": 0.7482848435536598, + "grad_norm": 0.8828125, + "learning_rate": 0.00012075172200380929, + "loss": 0.8917, + "step": 29142 + }, + { + "epoch": 0.7483105207495815, + "grad_norm": 0.75, + "learning_rate": 0.00012074735496102234, + "loss": 0.7565, + "step": 29143 + }, + { + "epoch": 0.7483361979455033, + "grad_norm": 0.76953125, + "learning_rate": 0.00012074298787688776, + "loss": 0.9182, + "step": 29144 + }, + { + "epoch": 0.7483618751414252, + "grad_norm": 0.75390625, + "learning_rate": 0.00012073862075141427, + "loss": 0.7273, + "step": 29145 + }, + { + "epoch": 0.748387552337347, + "grad_norm": 0.7265625, + "learning_rate": 0.00012073425358461059, + "loss": 0.7853, + "step": 29146 + }, + { + "epoch": 0.7484132295332688, + "grad_norm": 0.71484375, + "learning_rate": 0.00012072988637648541, + "loss": 0.8137, + "step": 29147 + }, + { + "epoch": 0.7484389067291907, + "grad_norm": 0.78515625, + "learning_rate": 0.00012072551912704744, + "loss": 0.6577, + "step": 29148 + }, + { + "epoch": 0.7484645839251125, + "grad_norm": 0.73046875, + "learning_rate": 0.00012072115183630541, + "loss": 0.8115, + "step": 29149 + }, + { + "epoch": 0.7484902611210342, + "grad_norm": 0.80859375, + "learning_rate": 0.00012071678450426796, + "loss": 0.763, + "step": 29150 + }, + { + "epoch": 0.7485159383169561, + "grad_norm": 0.84765625, + "learning_rate": 0.00012071241713094386, + "loss": 0.7527, + "step": 29151 + }, + { + "epoch": 0.7485416155128779, + "grad_norm": 0.75390625, + "learning_rate": 0.00012070804971634176, + "loss": 0.7193, + "step": 29152 + }, + { + "epoch": 0.7485672927087997, + "grad_norm": 0.7734375, + "learning_rate": 0.00012070368226047037, + "loss": 0.8556, + "step": 29153 + }, + { + "epoch": 0.7485929699047216, + "grad_norm": 0.84765625, + "learning_rate": 0.00012069931476333844, + "loss": 1.0043, + "step": 29154 + }, + { + "epoch": 0.7486186471006434, + "grad_norm": 0.75390625, + "learning_rate": 0.00012069494722495465, + "loss": 0.7657, + "step": 29155 + }, + { + "epoch": 0.7486443242965652, + "grad_norm": 0.75390625, + "learning_rate": 0.00012069057964532768, + "loss": 0.7769, + "step": 29156 + }, + { + "epoch": 0.748670001492487, + "grad_norm": 0.8046875, + "learning_rate": 0.00012068621202446628, + "loss": 0.8746, + "step": 29157 + }, + { + "epoch": 0.7486956786884088, + "grad_norm": 0.82421875, + "learning_rate": 0.00012068184436237911, + "loss": 0.8141, + "step": 29158 + }, + { + "epoch": 0.7487213558843306, + "grad_norm": 0.765625, + "learning_rate": 0.00012067747665907489, + "loss": 0.6913, + "step": 29159 + }, + { + "epoch": 0.7487470330802525, + "grad_norm": 0.80078125, + "learning_rate": 0.00012067310891456233, + "loss": 0.79, + "step": 29160 + }, + { + "epoch": 0.7487727102761743, + "grad_norm": 0.82421875, + "learning_rate": 0.00012066874112885012, + "loss": 0.9181, + "step": 29161 + }, + { + "epoch": 0.7487983874720961, + "grad_norm": 0.7890625, + "learning_rate": 0.00012066437330194702, + "loss": 0.8998, + "step": 29162 + }, + { + "epoch": 0.7488240646680179, + "grad_norm": 0.8046875, + "learning_rate": 0.00012066000543386163, + "loss": 0.8666, + "step": 29163 + }, + { + "epoch": 0.7488497418639397, + "grad_norm": 0.83203125, + "learning_rate": 0.00012065563752460276, + "loss": 0.9453, + "step": 29164 + }, + { + "epoch": 0.7488754190598615, + "grad_norm": 0.80078125, + "learning_rate": 0.00012065126957417905, + "loss": 0.7695, + "step": 29165 + }, + { + "epoch": 0.7489010962557834, + "grad_norm": 0.76171875, + "learning_rate": 0.00012064690158259921, + "loss": 0.8083, + "step": 29166 + }, + { + "epoch": 0.7489267734517052, + "grad_norm": 0.80078125, + "learning_rate": 0.00012064253354987198, + "loss": 0.7965, + "step": 29167 + }, + { + "epoch": 0.748952450647627, + "grad_norm": 0.76953125, + "learning_rate": 0.00012063816547600606, + "loss": 0.7606, + "step": 29168 + }, + { + "epoch": 0.7489781278435489, + "grad_norm": 0.828125, + "learning_rate": 0.00012063379736101013, + "loss": 0.9127, + "step": 29169 + }, + { + "epoch": 0.7490038050394706, + "grad_norm": 0.78515625, + "learning_rate": 0.00012062942920489291, + "loss": 0.868, + "step": 29170 + }, + { + "epoch": 0.7490294822353925, + "grad_norm": 0.78125, + "learning_rate": 0.00012062506100766306, + "loss": 0.8451, + "step": 29171 + }, + { + "epoch": 0.7490551594313143, + "grad_norm": 0.74609375, + "learning_rate": 0.00012062069276932936, + "loss": 0.8197, + "step": 29172 + }, + { + "epoch": 0.7490808366272361, + "grad_norm": 0.796875, + "learning_rate": 0.0001206163244899005, + "loss": 0.9657, + "step": 29173 + }, + { + "epoch": 0.749106513823158, + "grad_norm": 0.7734375, + "learning_rate": 0.00012061195616938513, + "loss": 0.9224, + "step": 29174 + }, + { + "epoch": 0.7491321910190798, + "grad_norm": 0.7734375, + "learning_rate": 0.000120607587807792, + "loss": 0.8121, + "step": 29175 + }, + { + "epoch": 0.7491578682150016, + "grad_norm": 0.8046875, + "learning_rate": 0.00012060321940512984, + "loss": 0.8234, + "step": 29176 + }, + { + "epoch": 0.7491835454109234, + "grad_norm": 0.7265625, + "learning_rate": 0.00012059885096140731, + "loss": 0.8342, + "step": 29177 + }, + { + "epoch": 0.7492092226068452, + "grad_norm": 0.7734375, + "learning_rate": 0.00012059448247663311, + "loss": 0.9134, + "step": 29178 + }, + { + "epoch": 0.749234899802767, + "grad_norm": 0.7578125, + "learning_rate": 0.00012059011395081598, + "loss": 1.0274, + "step": 29179 + }, + { + "epoch": 0.7492605769986889, + "grad_norm": 0.81640625, + "learning_rate": 0.00012058574538396463, + "loss": 0.7888, + "step": 29180 + }, + { + "epoch": 0.7492862541946107, + "grad_norm": 0.94140625, + "learning_rate": 0.00012058137677608772, + "loss": 0.862, + "step": 29181 + }, + { + "epoch": 0.7493119313905325, + "grad_norm": 0.75, + "learning_rate": 0.00012057700812719401, + "loss": 0.6983, + "step": 29182 + }, + { + "epoch": 0.7493376085864543, + "grad_norm": 0.76171875, + "learning_rate": 0.00012057263943729218, + "loss": 0.8728, + "step": 29183 + }, + { + "epoch": 0.7493632857823761, + "grad_norm": 0.765625, + "learning_rate": 0.00012056827070639091, + "loss": 0.8994, + "step": 29184 + }, + { + "epoch": 0.7493889629782979, + "grad_norm": 0.7421875, + "learning_rate": 0.00012056390193449896, + "loss": 0.8543, + "step": 29185 + }, + { + "epoch": 0.7494146401742198, + "grad_norm": 0.79296875, + "learning_rate": 0.00012055953312162501, + "loss": 0.9872, + "step": 29186 + }, + { + "epoch": 0.7494403173701416, + "grad_norm": 0.875, + "learning_rate": 0.00012055516426777775, + "loss": 0.7844, + "step": 29187 + }, + { + "epoch": 0.7494659945660634, + "grad_norm": 0.80859375, + "learning_rate": 0.00012055079537296592, + "loss": 0.8821, + "step": 29188 + }, + { + "epoch": 0.7494916717619853, + "grad_norm": 0.8515625, + "learning_rate": 0.00012054642643719822, + "loss": 0.8975, + "step": 29189 + }, + { + "epoch": 0.749517348957907, + "grad_norm": 0.7265625, + "learning_rate": 0.00012054205746048332, + "loss": 0.8307, + "step": 29190 + }, + { + "epoch": 0.7495430261538288, + "grad_norm": 0.79296875, + "learning_rate": 0.00012053768844282998, + "loss": 0.7736, + "step": 29191 + }, + { + "epoch": 0.7495687033497507, + "grad_norm": 0.78515625, + "learning_rate": 0.00012053331938424688, + "loss": 0.7424, + "step": 29192 + }, + { + "epoch": 0.7495943805456725, + "grad_norm": 0.75390625, + "learning_rate": 0.0001205289502847427, + "loss": 0.7836, + "step": 29193 + }, + { + "epoch": 0.7496200577415943, + "grad_norm": 0.7578125, + "learning_rate": 0.00012052458114432621, + "loss": 0.7344, + "step": 29194 + }, + { + "epoch": 0.7496457349375162, + "grad_norm": 0.72265625, + "learning_rate": 0.00012052021196300607, + "loss": 0.8195, + "step": 29195 + }, + { + "epoch": 0.749671412133438, + "grad_norm": 0.71484375, + "learning_rate": 0.00012051584274079102, + "loss": 0.7834, + "step": 29196 + }, + { + "epoch": 0.7496970893293597, + "grad_norm": 0.69140625, + "learning_rate": 0.00012051147347768971, + "loss": 0.7353, + "step": 29197 + }, + { + "epoch": 0.7497227665252816, + "grad_norm": 0.80078125, + "learning_rate": 0.00012050710417371092, + "loss": 0.7775, + "step": 29198 + }, + { + "epoch": 0.7497484437212034, + "grad_norm": 0.76171875, + "learning_rate": 0.00012050273482886329, + "loss": 0.9002, + "step": 29199 + }, + { + "epoch": 0.7497741209171253, + "grad_norm": 0.7578125, + "learning_rate": 0.0001204983654431556, + "loss": 0.7365, + "step": 29200 + }, + { + "epoch": 0.7497997981130471, + "grad_norm": 0.8046875, + "learning_rate": 0.0001204939960165965, + "loss": 0.972, + "step": 29201 + }, + { + "epoch": 0.7498254753089689, + "grad_norm": 0.7578125, + "learning_rate": 0.00012048962654919473, + "loss": 0.6976, + "step": 29202 + }, + { + "epoch": 0.7498511525048907, + "grad_norm": 0.87890625, + "learning_rate": 0.00012048525704095896, + "loss": 0.8622, + "step": 29203 + }, + { + "epoch": 0.7498768297008125, + "grad_norm": 0.796875, + "learning_rate": 0.00012048088749189794, + "loss": 0.8046, + "step": 29204 + }, + { + "epoch": 0.7499025068967343, + "grad_norm": 0.75, + "learning_rate": 0.00012047651790202037, + "loss": 0.8626, + "step": 29205 + }, + { + "epoch": 0.7499281840926562, + "grad_norm": 0.7578125, + "learning_rate": 0.00012047214827133493, + "loss": 0.9059, + "step": 29206 + }, + { + "epoch": 0.749953861288578, + "grad_norm": 0.7734375, + "learning_rate": 0.00012046777859985035, + "loss": 0.8239, + "step": 29207 + }, + { + "epoch": 0.7499795384844998, + "grad_norm": 0.828125, + "learning_rate": 0.00012046340888757534, + "loss": 0.7699, + "step": 29208 + }, + { + "epoch": 0.7500052156804217, + "grad_norm": 0.80859375, + "learning_rate": 0.00012045903913451862, + "loss": 1.0981, + "step": 29209 + }, + { + "epoch": 0.7500308928763434, + "grad_norm": 0.78125, + "learning_rate": 0.00012045466934068888, + "loss": 0.8401, + "step": 29210 + }, + { + "epoch": 0.7500565700722652, + "grad_norm": 0.90234375, + "learning_rate": 0.00012045029950609481, + "loss": 0.8724, + "step": 29211 + }, + { + "epoch": 0.7500822472681871, + "grad_norm": 0.80078125, + "learning_rate": 0.00012044592963074515, + "loss": 0.809, + "step": 29212 + }, + { + "epoch": 0.7501079244641089, + "grad_norm": 0.73046875, + "learning_rate": 0.0001204415597146486, + "loss": 0.7883, + "step": 29213 + }, + { + "epoch": 0.7501336016600307, + "grad_norm": 0.77734375, + "learning_rate": 0.00012043718975781386, + "loss": 0.9797, + "step": 29214 + }, + { + "epoch": 0.7501592788559526, + "grad_norm": 0.78515625, + "learning_rate": 0.00012043281976024967, + "loss": 0.8734, + "step": 29215 + }, + { + "epoch": 0.7501849560518744, + "grad_norm": 0.765625, + "learning_rate": 0.0001204284497219647, + "loss": 0.7778, + "step": 29216 + }, + { + "epoch": 0.7502106332477961, + "grad_norm": 0.73828125, + "learning_rate": 0.00012042407964296767, + "loss": 0.7502, + "step": 29217 + }, + { + "epoch": 0.750236310443718, + "grad_norm": 0.82421875, + "learning_rate": 0.00012041970952326732, + "loss": 0.8578, + "step": 29218 + }, + { + "epoch": 0.7502619876396398, + "grad_norm": 0.72265625, + "learning_rate": 0.00012041533936287228, + "loss": 0.8104, + "step": 29219 + }, + { + "epoch": 0.7502876648355616, + "grad_norm": 0.734375, + "learning_rate": 0.00012041096916179135, + "loss": 0.7381, + "step": 29220 + }, + { + "epoch": 0.7503133420314835, + "grad_norm": 0.7265625, + "learning_rate": 0.00012040659892003321, + "loss": 0.6719, + "step": 29221 + }, + { + "epoch": 0.7503390192274053, + "grad_norm": 0.77734375, + "learning_rate": 0.00012040222863760655, + "loss": 0.9064, + "step": 29222 + }, + { + "epoch": 0.750364696423327, + "grad_norm": 0.75, + "learning_rate": 0.0001203978583145201, + "loss": 0.9081, + "step": 29223 + }, + { + "epoch": 0.7503903736192489, + "grad_norm": 0.828125, + "learning_rate": 0.00012039348795078253, + "loss": 0.8567, + "step": 29224 + }, + { + "epoch": 0.7504160508151707, + "grad_norm": 0.81640625, + "learning_rate": 0.00012038911754640261, + "loss": 0.7224, + "step": 29225 + }, + { + "epoch": 0.7504417280110925, + "grad_norm": 0.73046875, + "learning_rate": 0.00012038474710138903, + "loss": 0.8211, + "step": 29226 + }, + { + "epoch": 0.7504674052070144, + "grad_norm": 0.75, + "learning_rate": 0.00012038037661575045, + "loss": 0.7743, + "step": 29227 + }, + { + "epoch": 0.7504930824029362, + "grad_norm": 0.78515625, + "learning_rate": 0.00012037600608949569, + "loss": 0.7765, + "step": 29228 + }, + { + "epoch": 0.750518759598858, + "grad_norm": 0.7578125, + "learning_rate": 0.00012037163552263333, + "loss": 0.8254, + "step": 29229 + }, + { + "epoch": 0.7505444367947798, + "grad_norm": 0.80859375, + "learning_rate": 0.00012036726491517214, + "loss": 0.8486, + "step": 29230 + }, + { + "epoch": 0.7505701139907016, + "grad_norm": 0.79296875, + "learning_rate": 0.00012036289426712087, + "loss": 0.8489, + "step": 29231 + }, + { + "epoch": 0.7505957911866235, + "grad_norm": 0.72265625, + "learning_rate": 0.00012035852357848813, + "loss": 0.7996, + "step": 29232 + }, + { + "epoch": 0.7506214683825453, + "grad_norm": 0.77734375, + "learning_rate": 0.00012035415284928273, + "loss": 0.7691, + "step": 29233 + }, + { + "epoch": 0.7506471455784671, + "grad_norm": 0.69921875, + "learning_rate": 0.00012034978207951336, + "loss": 0.7647, + "step": 29234 + }, + { + "epoch": 0.750672822774389, + "grad_norm": 0.7578125, + "learning_rate": 0.0001203454112691887, + "loss": 0.9242, + "step": 29235 + }, + { + "epoch": 0.7506984999703108, + "grad_norm": 0.76953125, + "learning_rate": 0.00012034104041831746, + "loss": 0.7522, + "step": 29236 + }, + { + "epoch": 0.7507241771662325, + "grad_norm": 0.7421875, + "learning_rate": 0.00012033666952690838, + "loss": 0.8771, + "step": 29237 + }, + { + "epoch": 0.7507498543621544, + "grad_norm": 0.73046875, + "learning_rate": 0.00012033229859497013, + "loss": 0.7689, + "step": 29238 + }, + { + "epoch": 0.7507755315580762, + "grad_norm": 0.703125, + "learning_rate": 0.00012032792762251147, + "loss": 0.8056, + "step": 29239 + }, + { + "epoch": 0.750801208753998, + "grad_norm": 0.8046875, + "learning_rate": 0.00012032355660954108, + "loss": 0.8324, + "step": 29240 + }, + { + "epoch": 0.7508268859499199, + "grad_norm": 0.71484375, + "learning_rate": 0.00012031918555606769, + "loss": 0.7297, + "step": 29241 + }, + { + "epoch": 0.7508525631458417, + "grad_norm": 0.85546875, + "learning_rate": 0.00012031481446209998, + "loss": 0.752, + "step": 29242 + }, + { + "epoch": 0.7508782403417634, + "grad_norm": 0.76953125, + "learning_rate": 0.00012031044332764667, + "loss": 0.8061, + "step": 29243 + }, + { + "epoch": 0.7509039175376853, + "grad_norm": 0.79296875, + "learning_rate": 0.0001203060721527165, + "loss": 0.706, + "step": 29244 + }, + { + "epoch": 0.7509295947336071, + "grad_norm": 0.84375, + "learning_rate": 0.00012030170093731816, + "loss": 1.0384, + "step": 29245 + }, + { + "epoch": 0.7509552719295289, + "grad_norm": 0.8125, + "learning_rate": 0.00012029732968146034, + "loss": 0.7782, + "step": 29246 + }, + { + "epoch": 0.7509809491254508, + "grad_norm": 0.75, + "learning_rate": 0.0001202929583851518, + "loss": 0.8059, + "step": 29247 + }, + { + "epoch": 0.7510066263213726, + "grad_norm": 0.8046875, + "learning_rate": 0.00012028858704840122, + "loss": 0.8944, + "step": 29248 + }, + { + "epoch": 0.7510323035172944, + "grad_norm": 0.8125, + "learning_rate": 0.00012028421567121732, + "loss": 0.9224, + "step": 29249 + }, + { + "epoch": 0.7510579807132162, + "grad_norm": 0.7734375, + "learning_rate": 0.00012027984425360883, + "loss": 0.7353, + "step": 29250 + }, + { + "epoch": 0.751083657909138, + "grad_norm": 0.80078125, + "learning_rate": 0.00012027547279558442, + "loss": 0.8851, + "step": 29251 + }, + { + "epoch": 0.7511093351050598, + "grad_norm": 0.77734375, + "learning_rate": 0.00012027110129715282, + "loss": 0.8471, + "step": 29252 + }, + { + "epoch": 0.7511350123009817, + "grad_norm": 0.77734375, + "learning_rate": 0.00012026672975832276, + "loss": 0.7939, + "step": 29253 + }, + { + "epoch": 0.7511606894969035, + "grad_norm": 0.7890625, + "learning_rate": 0.00012026235817910294, + "loss": 0.7313, + "step": 29254 + }, + { + "epoch": 0.7511863666928253, + "grad_norm": 0.7578125, + "learning_rate": 0.00012025798655950207, + "loss": 0.8671, + "step": 29255 + }, + { + "epoch": 0.7512120438887472, + "grad_norm": 0.8046875, + "learning_rate": 0.00012025361489952884, + "loss": 0.8958, + "step": 29256 + }, + { + "epoch": 0.7512377210846689, + "grad_norm": 0.8203125, + "learning_rate": 0.00012024924319919199, + "loss": 0.7595, + "step": 29257 + }, + { + "epoch": 0.7512633982805907, + "grad_norm": 0.73828125, + "learning_rate": 0.00012024487145850025, + "loss": 0.893, + "step": 29258 + }, + { + "epoch": 0.7512890754765126, + "grad_norm": 0.7578125, + "learning_rate": 0.00012024049967746229, + "loss": 0.8157, + "step": 29259 + }, + { + "epoch": 0.7513147526724344, + "grad_norm": 0.78125, + "learning_rate": 0.00012023612785608685, + "loss": 0.8649, + "step": 29260 + }, + { + "epoch": 0.7513404298683563, + "grad_norm": 0.76171875, + "learning_rate": 0.00012023175599438265, + "loss": 0.7129, + "step": 29261 + }, + { + "epoch": 0.7513661070642781, + "grad_norm": 0.75, + "learning_rate": 0.00012022738409235837, + "loss": 0.7924, + "step": 29262 + }, + { + "epoch": 0.7513917842601998, + "grad_norm": 0.82421875, + "learning_rate": 0.00012022301215002276, + "loss": 0.798, + "step": 29263 + }, + { + "epoch": 0.7514174614561216, + "grad_norm": 0.75, + "learning_rate": 0.00012021864016738447, + "loss": 0.8759, + "step": 29264 + }, + { + "epoch": 0.7514431386520435, + "grad_norm": 0.75390625, + "learning_rate": 0.00012021426814445229, + "loss": 0.715, + "step": 29265 + }, + { + "epoch": 0.7514688158479653, + "grad_norm": 0.75, + "learning_rate": 0.00012020989608123489, + "loss": 0.7702, + "step": 29266 + }, + { + "epoch": 0.7514944930438872, + "grad_norm": 0.7421875, + "learning_rate": 0.00012020552397774099, + "loss": 0.7203, + "step": 29267 + }, + { + "epoch": 0.751520170239809, + "grad_norm": 0.7734375, + "learning_rate": 0.00012020115183397936, + "loss": 0.8805, + "step": 29268 + }, + { + "epoch": 0.7515458474357308, + "grad_norm": 0.71484375, + "learning_rate": 0.00012019677964995862, + "loss": 0.7699, + "step": 29269 + }, + { + "epoch": 0.7515715246316526, + "grad_norm": 0.7265625, + "learning_rate": 0.00012019240742568747, + "loss": 0.754, + "step": 29270 + }, + { + "epoch": 0.7515972018275744, + "grad_norm": 0.7265625, + "learning_rate": 0.00012018803516117473, + "loss": 0.8499, + "step": 29271 + }, + { + "epoch": 0.7516228790234962, + "grad_norm": 0.74609375, + "learning_rate": 0.00012018366285642903, + "loss": 0.9202, + "step": 29272 + }, + { + "epoch": 0.7516485562194181, + "grad_norm": 0.75, + "learning_rate": 0.00012017929051145916, + "loss": 0.804, + "step": 29273 + }, + { + "epoch": 0.7516742334153399, + "grad_norm": 0.75, + "learning_rate": 0.00012017491812627377, + "loss": 0.7202, + "step": 29274 + }, + { + "epoch": 0.7516999106112617, + "grad_norm": 0.75390625, + "learning_rate": 0.00012017054570088157, + "loss": 0.7804, + "step": 29275 + }, + { + "epoch": 0.7517255878071836, + "grad_norm": 0.79296875, + "learning_rate": 0.0001201661732352913, + "loss": 0.8128, + "step": 29276 + }, + { + "epoch": 0.7517512650031053, + "grad_norm": 0.72265625, + "learning_rate": 0.00012016180072951168, + "loss": 0.7847, + "step": 29277 + }, + { + "epoch": 0.7517769421990271, + "grad_norm": 0.80859375, + "learning_rate": 0.0001201574281835514, + "loss": 0.8137, + "step": 29278 + }, + { + "epoch": 0.751802619394949, + "grad_norm": 0.8203125, + "learning_rate": 0.00012015305559741918, + "loss": 0.9218, + "step": 29279 + }, + { + "epoch": 0.7518282965908708, + "grad_norm": 0.7265625, + "learning_rate": 0.00012014868297112376, + "loss": 0.8287, + "step": 29280 + }, + { + "epoch": 0.7518539737867926, + "grad_norm": 0.78515625, + "learning_rate": 0.00012014431030467384, + "loss": 0.7358, + "step": 29281 + }, + { + "epoch": 0.7518796509827145, + "grad_norm": 0.79296875, + "learning_rate": 0.00012013993759807811, + "loss": 0.8587, + "step": 29282 + }, + { + "epoch": 0.7519053281786362, + "grad_norm": 0.80078125, + "learning_rate": 0.00012013556485134529, + "loss": 0.8703, + "step": 29283 + }, + { + "epoch": 0.751931005374558, + "grad_norm": 0.7890625, + "learning_rate": 0.00012013119206448413, + "loss": 0.8381, + "step": 29284 + }, + { + "epoch": 0.7519566825704799, + "grad_norm": 0.7578125, + "learning_rate": 0.00012012681923750332, + "loss": 0.8084, + "step": 29285 + }, + { + "epoch": 0.7519823597664017, + "grad_norm": 0.79296875, + "learning_rate": 0.0001201224463704116, + "loss": 0.772, + "step": 29286 + }, + { + "epoch": 0.7520080369623235, + "grad_norm": 0.69921875, + "learning_rate": 0.00012011807346321764, + "loss": 0.8449, + "step": 29287 + }, + { + "epoch": 0.7520337141582454, + "grad_norm": 0.828125, + "learning_rate": 0.00012011370051593016, + "loss": 0.8468, + "step": 29288 + }, + { + "epoch": 0.7520593913541672, + "grad_norm": 0.7578125, + "learning_rate": 0.00012010932752855793, + "loss": 0.7869, + "step": 29289 + }, + { + "epoch": 0.7520850685500889, + "grad_norm": 0.71875, + "learning_rate": 0.00012010495450110961, + "loss": 0.8776, + "step": 29290 + }, + { + "epoch": 0.7521107457460108, + "grad_norm": 0.78515625, + "learning_rate": 0.0001201005814335939, + "loss": 0.8172, + "step": 29291 + }, + { + "epoch": 0.7521364229419326, + "grad_norm": 0.75390625, + "learning_rate": 0.00012009620832601957, + "loss": 0.7716, + "step": 29292 + }, + { + "epoch": 0.7521621001378545, + "grad_norm": 0.80078125, + "learning_rate": 0.00012009183517839533, + "loss": 0.8182, + "step": 29293 + }, + { + "epoch": 0.7521877773337763, + "grad_norm": 0.7265625, + "learning_rate": 0.00012008746199072987, + "loss": 0.8316, + "step": 29294 + }, + { + "epoch": 0.7522134545296981, + "grad_norm": 0.94140625, + "learning_rate": 0.00012008308876303191, + "loss": 0.8252, + "step": 29295 + }, + { + "epoch": 0.75223913172562, + "grad_norm": 0.87890625, + "learning_rate": 0.00012007871549531016, + "loss": 0.8732, + "step": 29296 + }, + { + "epoch": 0.7522648089215417, + "grad_norm": 0.8203125, + "learning_rate": 0.00012007434218757334, + "loss": 0.8923, + "step": 29297 + }, + { + "epoch": 0.7522904861174635, + "grad_norm": 0.80859375, + "learning_rate": 0.0001200699688398302, + "loss": 0.8436, + "step": 29298 + }, + { + "epoch": 0.7523161633133854, + "grad_norm": 0.81640625, + "learning_rate": 0.00012006559545208938, + "loss": 0.8325, + "step": 29299 + }, + { + "epoch": 0.7523418405093072, + "grad_norm": 0.796875, + "learning_rate": 0.00012006122202435966, + "loss": 0.8981, + "step": 29300 + }, + { + "epoch": 0.752367517705229, + "grad_norm": 0.8125, + "learning_rate": 0.00012005684855664974, + "loss": 0.8558, + "step": 29301 + }, + { + "epoch": 0.7523931949011509, + "grad_norm": 0.7890625, + "learning_rate": 0.00012005247504896835, + "loss": 0.9046, + "step": 29302 + }, + { + "epoch": 0.7524188720970726, + "grad_norm": 0.828125, + "learning_rate": 0.00012004810150132418, + "loss": 0.8777, + "step": 29303 + }, + { + "epoch": 0.7524445492929944, + "grad_norm": 0.8046875, + "learning_rate": 0.00012004372791372591, + "loss": 0.8223, + "step": 29304 + }, + { + "epoch": 0.7524702264889163, + "grad_norm": 0.8046875, + "learning_rate": 0.00012003935428618233, + "loss": 0.8542, + "step": 29305 + }, + { + "epoch": 0.7524959036848381, + "grad_norm": 0.8984375, + "learning_rate": 0.00012003498061870215, + "loss": 0.7422, + "step": 29306 + }, + { + "epoch": 0.7525215808807599, + "grad_norm": 0.78125, + "learning_rate": 0.00012003060691129406, + "loss": 0.7094, + "step": 29307 + }, + { + "epoch": 0.7525472580766818, + "grad_norm": 0.69140625, + "learning_rate": 0.00012002623316396678, + "loss": 0.7492, + "step": 29308 + }, + { + "epoch": 0.7525729352726036, + "grad_norm": 0.79296875, + "learning_rate": 0.00012002185937672898, + "loss": 0.833, + "step": 29309 + }, + { + "epoch": 0.7525986124685253, + "grad_norm": 0.84375, + "learning_rate": 0.00012001748554958947, + "loss": 0.8604, + "step": 29310 + }, + { + "epoch": 0.7526242896644472, + "grad_norm": 0.75390625, + "learning_rate": 0.00012001311168255691, + "loss": 0.8222, + "step": 29311 + }, + { + "epoch": 0.752649966860369, + "grad_norm": 0.8046875, + "learning_rate": 0.00012000873777563999, + "loss": 0.7986, + "step": 29312 + }, + { + "epoch": 0.7526756440562908, + "grad_norm": 0.82421875, + "learning_rate": 0.0001200043638288475, + "loss": 0.8001, + "step": 29313 + }, + { + "epoch": 0.7527013212522127, + "grad_norm": 0.80859375, + "learning_rate": 0.00011999998984218814, + "loss": 0.8068, + "step": 29314 + }, + { + "epoch": 0.7527269984481345, + "grad_norm": 0.80859375, + "learning_rate": 0.00011999561581567058, + "loss": 0.7341, + "step": 29315 + }, + { + "epoch": 0.7527526756440563, + "grad_norm": 0.75, + "learning_rate": 0.00011999124174930356, + "loss": 0.7443, + "step": 29316 + }, + { + "epoch": 0.7527783528399781, + "grad_norm": 0.75390625, + "learning_rate": 0.00011998686764309581, + "loss": 0.7865, + "step": 29317 + }, + { + "epoch": 0.7528040300358999, + "grad_norm": 0.84765625, + "learning_rate": 0.00011998249349705601, + "loss": 0.8333, + "step": 29318 + }, + { + "epoch": 0.7528297072318217, + "grad_norm": 0.76953125, + "learning_rate": 0.00011997811931119293, + "loss": 0.7594, + "step": 29319 + }, + { + "epoch": 0.7528553844277436, + "grad_norm": 0.7890625, + "learning_rate": 0.00011997374508551528, + "loss": 0.8347, + "step": 29320 + }, + { + "epoch": 0.7528810616236654, + "grad_norm": 0.73828125, + "learning_rate": 0.00011996937082003175, + "loss": 0.7649, + "step": 29321 + }, + { + "epoch": 0.7529067388195873, + "grad_norm": 0.6796875, + "learning_rate": 0.00011996499651475107, + "loss": 0.8573, + "step": 29322 + }, + { + "epoch": 0.752932416015509, + "grad_norm": 0.86328125, + "learning_rate": 0.00011996062216968194, + "loss": 0.7392, + "step": 29323 + }, + { + "epoch": 0.7529580932114308, + "grad_norm": 0.77734375, + "learning_rate": 0.0001199562477848331, + "loss": 0.8485, + "step": 29324 + }, + { + "epoch": 0.7529837704073526, + "grad_norm": 0.79296875, + "learning_rate": 0.00011995187336021325, + "loss": 0.7929, + "step": 29325 + }, + { + "epoch": 0.7530094476032745, + "grad_norm": 0.77734375, + "learning_rate": 0.00011994749889583112, + "loss": 0.7877, + "step": 29326 + }, + { + "epoch": 0.7530351247991963, + "grad_norm": 0.75390625, + "learning_rate": 0.00011994312439169546, + "loss": 0.7733, + "step": 29327 + }, + { + "epoch": 0.7530608019951182, + "grad_norm": 0.8125, + "learning_rate": 0.0001199387498478149, + "loss": 0.7616, + "step": 29328 + }, + { + "epoch": 0.75308647919104, + "grad_norm": 0.87109375, + "learning_rate": 0.00011993437526419826, + "loss": 0.8676, + "step": 29329 + }, + { + "epoch": 0.7531121563869617, + "grad_norm": 0.7421875, + "learning_rate": 0.0001199300006408542, + "loss": 0.8643, + "step": 29330 + }, + { + "epoch": 0.7531378335828836, + "grad_norm": 0.73046875, + "learning_rate": 0.00011992562597779143, + "loss": 0.7532, + "step": 29331 + }, + { + "epoch": 0.7531635107788054, + "grad_norm": 0.8515625, + "learning_rate": 0.0001199212512750187, + "loss": 0.8099, + "step": 29332 + }, + { + "epoch": 0.7531891879747272, + "grad_norm": 0.76953125, + "learning_rate": 0.0001199168765325447, + "loss": 0.8487, + "step": 29333 + }, + { + "epoch": 0.7532148651706491, + "grad_norm": 0.91015625, + "learning_rate": 0.0001199125017503782, + "loss": 0.8071, + "step": 29334 + }, + { + "epoch": 0.7532405423665709, + "grad_norm": 0.8359375, + "learning_rate": 0.00011990812692852786, + "loss": 0.9166, + "step": 29335 + }, + { + "epoch": 0.7532662195624926, + "grad_norm": 0.765625, + "learning_rate": 0.00011990375206700242, + "loss": 0.9159, + "step": 29336 + }, + { + "epoch": 0.7532918967584145, + "grad_norm": 0.80859375, + "learning_rate": 0.00011989937716581061, + "loss": 0.836, + "step": 29337 + }, + { + "epoch": 0.7533175739543363, + "grad_norm": 0.78515625, + "learning_rate": 0.00011989500222496113, + "loss": 0.7281, + "step": 29338 + }, + { + "epoch": 0.7533432511502581, + "grad_norm": 0.765625, + "learning_rate": 0.00011989062724446269, + "loss": 0.76, + "step": 29339 + }, + { + "epoch": 0.75336892834618, + "grad_norm": 0.75, + "learning_rate": 0.00011988625222432408, + "loss": 0.8437, + "step": 29340 + }, + { + "epoch": 0.7533946055421018, + "grad_norm": 0.75, + "learning_rate": 0.0001198818771645539, + "loss": 0.7505, + "step": 29341 + }, + { + "epoch": 0.7534202827380236, + "grad_norm": 0.796875, + "learning_rate": 0.00011987750206516097, + "loss": 0.7699, + "step": 29342 + }, + { + "epoch": 0.7534459599339454, + "grad_norm": 0.7578125, + "learning_rate": 0.00011987312692615398, + "loss": 0.8462, + "step": 29343 + }, + { + "epoch": 0.7534716371298672, + "grad_norm": 0.78125, + "learning_rate": 0.00011986875174754162, + "loss": 0.7153, + "step": 29344 + }, + { + "epoch": 0.753497314325789, + "grad_norm": 0.77734375, + "learning_rate": 0.00011986437652933265, + "loss": 0.8903, + "step": 29345 + }, + { + "epoch": 0.7535229915217109, + "grad_norm": 0.796875, + "learning_rate": 0.00011986000127153578, + "loss": 0.85, + "step": 29346 + }, + { + "epoch": 0.7535486687176327, + "grad_norm": 0.76171875, + "learning_rate": 0.00011985562597415971, + "loss": 0.8546, + "step": 29347 + }, + { + "epoch": 0.7535743459135545, + "grad_norm": 0.85546875, + "learning_rate": 0.00011985125063721319, + "loss": 0.9883, + "step": 29348 + }, + { + "epoch": 0.7536000231094764, + "grad_norm": 0.68359375, + "learning_rate": 0.00011984687526070486, + "loss": 0.8065, + "step": 29349 + }, + { + "epoch": 0.7536257003053981, + "grad_norm": 0.75, + "learning_rate": 0.00011984249984464355, + "loss": 0.6921, + "step": 29350 + }, + { + "epoch": 0.7536513775013199, + "grad_norm": 0.7734375, + "learning_rate": 0.00011983812438903794, + "loss": 0.8479, + "step": 29351 + }, + { + "epoch": 0.7536770546972418, + "grad_norm": 0.703125, + "learning_rate": 0.00011983374889389669, + "loss": 0.7565, + "step": 29352 + }, + { + "epoch": 0.7537027318931636, + "grad_norm": 0.80859375, + "learning_rate": 0.00011982937335922863, + "loss": 0.7751, + "step": 29353 + }, + { + "epoch": 0.7537284090890854, + "grad_norm": 0.80859375, + "learning_rate": 0.0001198249977850424, + "loss": 0.8438, + "step": 29354 + }, + { + "epoch": 0.7537540862850073, + "grad_norm": 0.73046875, + "learning_rate": 0.00011982062217134671, + "loss": 0.7179, + "step": 29355 + }, + { + "epoch": 0.753779763480929, + "grad_norm": 0.82421875, + "learning_rate": 0.00011981624651815035, + "loss": 0.7588, + "step": 29356 + }, + { + "epoch": 0.7538054406768508, + "grad_norm": 0.86328125, + "learning_rate": 0.00011981187082546197, + "loss": 0.8889, + "step": 29357 + }, + { + "epoch": 0.7538311178727727, + "grad_norm": 0.875, + "learning_rate": 0.00011980749509329033, + "loss": 0.8068, + "step": 29358 + }, + { + "epoch": 0.7538567950686945, + "grad_norm": 0.78515625, + "learning_rate": 0.00011980311932164414, + "loss": 0.7722, + "step": 29359 + }, + { + "epoch": 0.7538824722646164, + "grad_norm": 0.7578125, + "learning_rate": 0.00011979874351053215, + "loss": 0.7726, + "step": 29360 + }, + { + "epoch": 0.7539081494605382, + "grad_norm": 0.8671875, + "learning_rate": 0.00011979436765996303, + "loss": 0.8589, + "step": 29361 + }, + { + "epoch": 0.75393382665646, + "grad_norm": 0.8203125, + "learning_rate": 0.00011978999176994552, + "loss": 0.8869, + "step": 29362 + }, + { + "epoch": 0.7539595038523818, + "grad_norm": 0.72265625, + "learning_rate": 0.00011978561584048833, + "loss": 0.7433, + "step": 29363 + }, + { + "epoch": 0.7539851810483036, + "grad_norm": 0.6796875, + "learning_rate": 0.00011978123987160021, + "loss": 0.736, + "step": 29364 + }, + { + "epoch": 0.7540108582442254, + "grad_norm": 0.765625, + "learning_rate": 0.00011977686386328987, + "loss": 0.7286, + "step": 29365 + }, + { + "epoch": 0.7540365354401473, + "grad_norm": 0.79296875, + "learning_rate": 0.00011977248781556605, + "loss": 0.8793, + "step": 29366 + }, + { + "epoch": 0.7540622126360691, + "grad_norm": 0.6875, + "learning_rate": 0.00011976811172843742, + "loss": 0.7918, + "step": 29367 + }, + { + "epoch": 0.7540878898319909, + "grad_norm": 0.75390625, + "learning_rate": 0.00011976373560191271, + "loss": 0.6669, + "step": 29368 + }, + { + "epoch": 0.7541135670279128, + "grad_norm": 0.76171875, + "learning_rate": 0.0001197593594360007, + "loss": 0.9133, + "step": 29369 + }, + { + "epoch": 0.7541392442238345, + "grad_norm": 0.73046875, + "learning_rate": 0.00011975498323071006, + "loss": 0.8632, + "step": 29370 + }, + { + "epoch": 0.7541649214197563, + "grad_norm": 0.765625, + "learning_rate": 0.0001197506069860495, + "loss": 0.7965, + "step": 29371 + }, + { + "epoch": 0.7541905986156782, + "grad_norm": 0.984375, + "learning_rate": 0.00011974623070202777, + "loss": 0.8062, + "step": 29372 + }, + { + "epoch": 0.7542162758116, + "grad_norm": 0.82421875, + "learning_rate": 0.00011974185437865362, + "loss": 0.9052, + "step": 29373 + }, + { + "epoch": 0.7542419530075218, + "grad_norm": 0.7265625, + "learning_rate": 0.00011973747801593571, + "loss": 0.7117, + "step": 29374 + }, + { + "epoch": 0.7542676302034437, + "grad_norm": 0.76953125, + "learning_rate": 0.00011973310161388281, + "loss": 0.7487, + "step": 29375 + }, + { + "epoch": 0.7542933073993654, + "grad_norm": 0.7578125, + "learning_rate": 0.00011972872517250358, + "loss": 0.8818, + "step": 29376 + }, + { + "epoch": 0.7543189845952872, + "grad_norm": 0.70703125, + "learning_rate": 0.0001197243486918068, + "loss": 0.7754, + "step": 29377 + }, + { + "epoch": 0.7543446617912091, + "grad_norm": 0.83984375, + "learning_rate": 0.0001197199721718012, + "loss": 0.819, + "step": 29378 + }, + { + "epoch": 0.7543703389871309, + "grad_norm": 0.7734375, + "learning_rate": 0.00011971559561249546, + "loss": 0.7982, + "step": 29379 + }, + { + "epoch": 0.7543960161830527, + "grad_norm": 0.73828125, + "learning_rate": 0.00011971121901389832, + "loss": 0.8345, + "step": 29380 + }, + { + "epoch": 0.7544216933789746, + "grad_norm": 0.9140625, + "learning_rate": 0.00011970684237601848, + "loss": 0.8704, + "step": 29381 + }, + { + "epoch": 0.7544473705748964, + "grad_norm": 0.77734375, + "learning_rate": 0.0001197024656988647, + "loss": 0.7144, + "step": 29382 + }, + { + "epoch": 0.7544730477708181, + "grad_norm": 0.8125, + "learning_rate": 0.0001196980889824457, + "loss": 0.7299, + "step": 29383 + }, + { + "epoch": 0.75449872496674, + "grad_norm": 0.82421875, + "learning_rate": 0.00011969371222677015, + "loss": 0.8374, + "step": 29384 + }, + { + "epoch": 0.7545244021626618, + "grad_norm": 0.80859375, + "learning_rate": 0.00011968933543184683, + "loss": 0.813, + "step": 29385 + }, + { + "epoch": 0.7545500793585836, + "grad_norm": 0.8125, + "learning_rate": 0.00011968495859768447, + "loss": 0.8571, + "step": 29386 + }, + { + "epoch": 0.7545757565545055, + "grad_norm": 0.83984375, + "learning_rate": 0.00011968058172429175, + "loss": 0.7733, + "step": 29387 + }, + { + "epoch": 0.7546014337504273, + "grad_norm": 0.76171875, + "learning_rate": 0.00011967620481167741, + "loss": 0.7465, + "step": 29388 + }, + { + "epoch": 0.7546271109463492, + "grad_norm": 0.8515625, + "learning_rate": 0.00011967182785985014, + "loss": 0.7819, + "step": 29389 + }, + { + "epoch": 0.7546527881422709, + "grad_norm": 0.74609375, + "learning_rate": 0.00011966745086881873, + "loss": 0.993, + "step": 29390 + }, + { + "epoch": 0.7546784653381927, + "grad_norm": 0.75390625, + "learning_rate": 0.00011966307383859185, + "loss": 0.9212, + "step": 29391 + }, + { + "epoch": 0.7547041425341146, + "grad_norm": 0.8515625, + "learning_rate": 0.00011965869676917823, + "loss": 0.8168, + "step": 29392 + }, + { + "epoch": 0.7547298197300364, + "grad_norm": 0.78515625, + "learning_rate": 0.00011965431966058665, + "loss": 0.7165, + "step": 29393 + }, + { + "epoch": 0.7547554969259582, + "grad_norm": 0.9375, + "learning_rate": 0.00011964994251282573, + "loss": 0.6951, + "step": 29394 + }, + { + "epoch": 0.7547811741218801, + "grad_norm": 0.7265625, + "learning_rate": 0.00011964556532590428, + "loss": 0.8789, + "step": 29395 + }, + { + "epoch": 0.7548068513178018, + "grad_norm": 0.69921875, + "learning_rate": 0.000119641188099831, + "loss": 0.8058, + "step": 29396 + }, + { + "epoch": 0.7548325285137236, + "grad_norm": 0.78515625, + "learning_rate": 0.00011963681083461457, + "loss": 0.7645, + "step": 29397 + }, + { + "epoch": 0.7548582057096455, + "grad_norm": 0.7890625, + "learning_rate": 0.0001196324335302638, + "loss": 0.8052, + "step": 29398 + }, + { + "epoch": 0.7548838829055673, + "grad_norm": 0.859375, + "learning_rate": 0.00011962805618678735, + "loss": 1.025, + "step": 29399 + }, + { + "epoch": 0.7549095601014891, + "grad_norm": 0.74609375, + "learning_rate": 0.00011962367880419391, + "loss": 0.8706, + "step": 29400 + }, + { + "epoch": 0.754935237297411, + "grad_norm": 0.75, + "learning_rate": 0.00011961930138249231, + "loss": 0.7582, + "step": 29401 + }, + { + "epoch": 0.7549609144933328, + "grad_norm": 0.84375, + "learning_rate": 0.00011961492392169119, + "loss": 0.8809, + "step": 29402 + }, + { + "epoch": 0.7549865916892545, + "grad_norm": 0.7734375, + "learning_rate": 0.00011961054642179928, + "loss": 0.7397, + "step": 29403 + }, + { + "epoch": 0.7550122688851764, + "grad_norm": 0.79296875, + "learning_rate": 0.00011960616888282535, + "loss": 0.8694, + "step": 29404 + }, + { + "epoch": 0.7550379460810982, + "grad_norm": 0.83203125, + "learning_rate": 0.00011960179130477809, + "loss": 0.885, + "step": 29405 + }, + { + "epoch": 0.75506362327702, + "grad_norm": 0.80859375, + "learning_rate": 0.00011959741368766621, + "loss": 1.0298, + "step": 29406 + }, + { + "epoch": 0.7550893004729419, + "grad_norm": 0.7890625, + "learning_rate": 0.00011959303603149847, + "loss": 0.7637, + "step": 29407 + }, + { + "epoch": 0.7551149776688637, + "grad_norm": 0.78125, + "learning_rate": 0.00011958865833628357, + "loss": 0.7529, + "step": 29408 + }, + { + "epoch": 0.7551406548647855, + "grad_norm": 0.80859375, + "learning_rate": 0.00011958428060203026, + "loss": 0.831, + "step": 29409 + }, + { + "epoch": 0.7551663320607073, + "grad_norm": 0.87109375, + "learning_rate": 0.00011957990282874724, + "loss": 0.8588, + "step": 29410 + }, + { + "epoch": 0.7551920092566291, + "grad_norm": 0.75, + "learning_rate": 0.00011957552501644325, + "loss": 0.8233, + "step": 29411 + }, + { + "epoch": 0.7552176864525509, + "grad_norm": 0.7421875, + "learning_rate": 0.000119571147165127, + "loss": 0.81, + "step": 29412 + }, + { + "epoch": 0.7552433636484728, + "grad_norm": 0.71875, + "learning_rate": 0.00011956676927480721, + "loss": 0.8356, + "step": 29413 + }, + { + "epoch": 0.7552690408443946, + "grad_norm": 0.82421875, + "learning_rate": 0.00011956239134549262, + "loss": 0.7696, + "step": 29414 + }, + { + "epoch": 0.7552947180403164, + "grad_norm": 0.796875, + "learning_rate": 0.000119558013377192, + "loss": 0.7707, + "step": 29415 + }, + { + "epoch": 0.7553203952362382, + "grad_norm": 0.72265625, + "learning_rate": 0.00011955363536991395, + "loss": 0.7359, + "step": 29416 + }, + { + "epoch": 0.75534607243216, + "grad_norm": 0.73046875, + "learning_rate": 0.0001195492573236673, + "loss": 0.7758, + "step": 29417 + }, + { + "epoch": 0.7553717496280818, + "grad_norm": 0.82421875, + "learning_rate": 0.00011954487923846076, + "loss": 0.7995, + "step": 29418 + }, + { + "epoch": 0.7553974268240037, + "grad_norm": 0.77734375, + "learning_rate": 0.00011954050111430306, + "loss": 0.7854, + "step": 29419 + }, + { + "epoch": 0.7554231040199255, + "grad_norm": 0.8515625, + "learning_rate": 0.00011953612295120286, + "loss": 0.7997, + "step": 29420 + }, + { + "epoch": 0.7554487812158474, + "grad_norm": 0.75390625, + "learning_rate": 0.00011953174474916896, + "loss": 0.7945, + "step": 29421 + }, + { + "epoch": 0.7554744584117692, + "grad_norm": 0.7890625, + "learning_rate": 0.00011952736650821003, + "loss": 0.7378, + "step": 29422 + }, + { + "epoch": 0.7555001356076909, + "grad_norm": 0.80078125, + "learning_rate": 0.00011952298822833486, + "loss": 0.8366, + "step": 29423 + }, + { + "epoch": 0.7555258128036128, + "grad_norm": 0.8671875, + "learning_rate": 0.0001195186099095521, + "loss": 0.8896, + "step": 29424 + }, + { + "epoch": 0.7555514899995346, + "grad_norm": 0.78125, + "learning_rate": 0.00011951423155187055, + "loss": 0.8053, + "step": 29425 + }, + { + "epoch": 0.7555771671954564, + "grad_norm": 0.82421875, + "learning_rate": 0.00011950985315529887, + "loss": 0.9484, + "step": 29426 + }, + { + "epoch": 0.7556028443913783, + "grad_norm": 0.75390625, + "learning_rate": 0.00011950547471984583, + "loss": 0.8017, + "step": 29427 + }, + { + "epoch": 0.7556285215873001, + "grad_norm": 0.7265625, + "learning_rate": 0.00011950109624552015, + "loss": 0.793, + "step": 29428 + }, + { + "epoch": 0.7556541987832219, + "grad_norm": 0.87109375, + "learning_rate": 0.00011949671773233051, + "loss": 0.7964, + "step": 29429 + }, + { + "epoch": 0.7556798759791437, + "grad_norm": 0.92578125, + "learning_rate": 0.0001194923391802857, + "loss": 0.8843, + "step": 29430 + }, + { + "epoch": 0.7557055531750655, + "grad_norm": 0.859375, + "learning_rate": 0.00011948796058939442, + "loss": 0.8751, + "step": 29431 + }, + { + "epoch": 0.7557312303709873, + "grad_norm": 0.828125, + "learning_rate": 0.0001194835819596654, + "loss": 0.9262, + "step": 29432 + }, + { + "epoch": 0.7557569075669092, + "grad_norm": 0.734375, + "learning_rate": 0.00011947920329110735, + "loss": 0.7524, + "step": 29433 + }, + { + "epoch": 0.755782584762831, + "grad_norm": 0.94140625, + "learning_rate": 0.000119474824583729, + "loss": 0.9523, + "step": 29434 + }, + { + "epoch": 0.7558082619587528, + "grad_norm": 0.7734375, + "learning_rate": 0.00011947044583753911, + "loss": 0.83, + "step": 29435 + }, + { + "epoch": 0.7558339391546746, + "grad_norm": 0.796875, + "learning_rate": 0.00011946606705254635, + "loss": 0.9, + "step": 29436 + }, + { + "epoch": 0.7558596163505964, + "grad_norm": 0.734375, + "learning_rate": 0.00011946168822875948, + "loss": 0.6987, + "step": 29437 + }, + { + "epoch": 0.7558852935465182, + "grad_norm": 0.74609375, + "learning_rate": 0.00011945730936618724, + "loss": 0.8154, + "step": 29438 + }, + { + "epoch": 0.7559109707424401, + "grad_norm": 0.76171875, + "learning_rate": 0.00011945293046483836, + "loss": 0.9294, + "step": 29439 + }, + { + "epoch": 0.7559366479383619, + "grad_norm": 0.73046875, + "learning_rate": 0.0001194485515247215, + "loss": 0.6925, + "step": 29440 + }, + { + "epoch": 0.7559623251342837, + "grad_norm": 0.78125, + "learning_rate": 0.00011944417254584545, + "loss": 0.773, + "step": 29441 + }, + { + "epoch": 0.7559880023302056, + "grad_norm": 0.78125, + "learning_rate": 0.00011943979352821894, + "loss": 0.9402, + "step": 29442 + }, + { + "epoch": 0.7560136795261273, + "grad_norm": 0.8046875, + "learning_rate": 0.00011943541447185065, + "loss": 0.7383, + "step": 29443 + }, + { + "epoch": 0.7560393567220491, + "grad_norm": 0.77734375, + "learning_rate": 0.00011943103537674933, + "loss": 0.7609, + "step": 29444 + }, + { + "epoch": 0.756065033917971, + "grad_norm": 0.89453125, + "learning_rate": 0.00011942665624292375, + "loss": 0.9369, + "step": 29445 + }, + { + "epoch": 0.7560907111138928, + "grad_norm": 0.75390625, + "learning_rate": 0.00011942227707038258, + "loss": 0.7812, + "step": 29446 + }, + { + "epoch": 0.7561163883098146, + "grad_norm": 0.76171875, + "learning_rate": 0.00011941789785913457, + "loss": 0.7937, + "step": 29447 + }, + { + "epoch": 0.7561420655057365, + "grad_norm": 0.7109375, + "learning_rate": 0.0001194135186091884, + "loss": 0.829, + "step": 29448 + }, + { + "epoch": 0.7561677427016583, + "grad_norm": 0.83203125, + "learning_rate": 0.00011940913932055289, + "loss": 0.8894, + "step": 29449 + }, + { + "epoch": 0.75619341989758, + "grad_norm": 0.83984375, + "learning_rate": 0.00011940475999323673, + "loss": 0.7825, + "step": 29450 + }, + { + "epoch": 0.7562190970935019, + "grad_norm": 0.73046875, + "learning_rate": 0.0001194003806272486, + "loss": 0.6487, + "step": 29451 + }, + { + "epoch": 0.7562447742894237, + "grad_norm": 0.87109375, + "learning_rate": 0.0001193960012225973, + "loss": 0.8759, + "step": 29452 + }, + { + "epoch": 0.7562704514853456, + "grad_norm": 0.67578125, + "learning_rate": 0.00011939162177929146, + "loss": 0.652, + "step": 29453 + }, + { + "epoch": 0.7562961286812674, + "grad_norm": 0.6953125, + "learning_rate": 0.00011938724229733993, + "loss": 0.7781, + "step": 29454 + }, + { + "epoch": 0.7563218058771892, + "grad_norm": 0.74609375, + "learning_rate": 0.00011938286277675135, + "loss": 0.7247, + "step": 29455 + }, + { + "epoch": 0.756347483073111, + "grad_norm": 0.80859375, + "learning_rate": 0.00011937848321753446, + "loss": 0.9183, + "step": 29456 + }, + { + "epoch": 0.7563731602690328, + "grad_norm": 0.765625, + "learning_rate": 0.00011937410361969803, + "loss": 0.777, + "step": 29457 + }, + { + "epoch": 0.7563988374649546, + "grad_norm": 0.90234375, + "learning_rate": 0.00011936972398325075, + "loss": 0.8722, + "step": 29458 + }, + { + "epoch": 0.7564245146608765, + "grad_norm": 0.859375, + "learning_rate": 0.00011936534430820137, + "loss": 0.7669, + "step": 29459 + }, + { + "epoch": 0.7564501918567983, + "grad_norm": 0.8046875, + "learning_rate": 0.00011936096459455862, + "loss": 0.8477, + "step": 29460 + }, + { + "epoch": 0.7564758690527201, + "grad_norm": 0.765625, + "learning_rate": 0.00011935658484233117, + "loss": 0.7809, + "step": 29461 + }, + { + "epoch": 0.756501546248642, + "grad_norm": 0.8125, + "learning_rate": 0.00011935220505152785, + "loss": 0.8764, + "step": 29462 + }, + { + "epoch": 0.7565272234445637, + "grad_norm": 0.80078125, + "learning_rate": 0.00011934782522215729, + "loss": 0.9189, + "step": 29463 + }, + { + "epoch": 0.7565529006404855, + "grad_norm": 0.8046875, + "learning_rate": 0.00011934344535422828, + "loss": 0.9295, + "step": 29464 + }, + { + "epoch": 0.7565785778364074, + "grad_norm": 0.71875, + "learning_rate": 0.00011933906544774953, + "loss": 0.799, + "step": 29465 + }, + { + "epoch": 0.7566042550323292, + "grad_norm": 0.80859375, + "learning_rate": 0.00011933468550272977, + "loss": 0.7479, + "step": 29466 + }, + { + "epoch": 0.756629932228251, + "grad_norm": 0.8671875, + "learning_rate": 0.00011933030551917772, + "loss": 0.8158, + "step": 29467 + }, + { + "epoch": 0.7566556094241729, + "grad_norm": 0.828125, + "learning_rate": 0.00011932592549710213, + "loss": 0.8688, + "step": 29468 + }, + { + "epoch": 0.7566812866200947, + "grad_norm": 0.76953125, + "learning_rate": 0.0001193215454365117, + "loss": 0.8332, + "step": 29469 + }, + { + "epoch": 0.7567069638160164, + "grad_norm": 0.828125, + "learning_rate": 0.0001193171653374152, + "loss": 0.9022, + "step": 29470 + }, + { + "epoch": 0.7567326410119383, + "grad_norm": 0.73046875, + "learning_rate": 0.00011931278519982133, + "loss": 0.8279, + "step": 29471 + }, + { + "epoch": 0.7567583182078601, + "grad_norm": 0.73828125, + "learning_rate": 0.00011930840502373881, + "loss": 0.7372, + "step": 29472 + }, + { + "epoch": 0.7567839954037819, + "grad_norm": 0.7890625, + "learning_rate": 0.0001193040248091764, + "loss": 0.8417, + "step": 29473 + }, + { + "epoch": 0.7568096725997038, + "grad_norm": 0.7421875, + "learning_rate": 0.00011929964455614278, + "loss": 0.7956, + "step": 29474 + }, + { + "epoch": 0.7568353497956256, + "grad_norm": 0.7578125, + "learning_rate": 0.00011929526426464672, + "loss": 0.8227, + "step": 29475 + }, + { + "epoch": 0.7568610269915473, + "grad_norm": 0.80859375, + "learning_rate": 0.00011929088393469697, + "loss": 0.8972, + "step": 29476 + }, + { + "epoch": 0.7568867041874692, + "grad_norm": 0.703125, + "learning_rate": 0.0001192865035663022, + "loss": 0.6895, + "step": 29477 + }, + { + "epoch": 0.756912381383391, + "grad_norm": 0.765625, + "learning_rate": 0.0001192821231594712, + "loss": 0.7879, + "step": 29478 + }, + { + "epoch": 0.7569380585793128, + "grad_norm": 0.80078125, + "learning_rate": 0.00011927774271421266, + "loss": 0.8228, + "step": 29479 + }, + { + "epoch": 0.7569637357752347, + "grad_norm": 0.8125, + "learning_rate": 0.00011927336223053529, + "loss": 0.7467, + "step": 29480 + }, + { + "epoch": 0.7569894129711565, + "grad_norm": 0.8046875, + "learning_rate": 0.00011926898170844789, + "loss": 0.881, + "step": 29481 + }, + { + "epoch": 0.7570150901670784, + "grad_norm": 0.75, + "learning_rate": 0.00011926460114795912, + "loss": 0.9, + "step": 29482 + }, + { + "epoch": 0.7570407673630001, + "grad_norm": 0.75390625, + "learning_rate": 0.00011926022054907775, + "loss": 0.8251, + "step": 29483 + }, + { + "epoch": 0.7570664445589219, + "grad_norm": 0.76953125, + "learning_rate": 0.0001192558399118125, + "loss": 0.9437, + "step": 29484 + }, + { + "epoch": 0.7570921217548437, + "grad_norm": 0.77734375, + "learning_rate": 0.00011925145923617211, + "loss": 0.8841, + "step": 29485 + }, + { + "epoch": 0.7571177989507656, + "grad_norm": 0.7890625, + "learning_rate": 0.00011924707852216532, + "loss": 0.7778, + "step": 29486 + }, + { + "epoch": 0.7571434761466874, + "grad_norm": 0.765625, + "learning_rate": 0.00011924269776980081, + "loss": 0.912, + "step": 29487 + }, + { + "epoch": 0.7571691533426093, + "grad_norm": 0.7890625, + "learning_rate": 0.00011923831697908733, + "loss": 0.8586, + "step": 29488 + }, + { + "epoch": 0.7571948305385311, + "grad_norm": 0.6875, + "learning_rate": 0.00011923393615003365, + "loss": 0.587, + "step": 29489 + }, + { + "epoch": 0.7572205077344528, + "grad_norm": 0.6953125, + "learning_rate": 0.00011922955528264847, + "loss": 0.6675, + "step": 29490 + }, + { + "epoch": 0.7572461849303747, + "grad_norm": 0.83203125, + "learning_rate": 0.00011922517437694052, + "loss": 0.7779, + "step": 29491 + }, + { + "epoch": 0.7572718621262965, + "grad_norm": 0.7265625, + "learning_rate": 0.00011922079343291854, + "loss": 0.801, + "step": 29492 + }, + { + "epoch": 0.7572975393222183, + "grad_norm": 0.7578125, + "learning_rate": 0.00011921641245059124, + "loss": 0.8574, + "step": 29493 + }, + { + "epoch": 0.7573232165181402, + "grad_norm": 0.82421875, + "learning_rate": 0.00011921203142996735, + "loss": 0.8232, + "step": 29494 + }, + { + "epoch": 0.757348893714062, + "grad_norm": 0.69921875, + "learning_rate": 0.00011920765037105566, + "loss": 0.6746, + "step": 29495 + }, + { + "epoch": 0.7573745709099837, + "grad_norm": 0.76953125, + "learning_rate": 0.0001192032692738648, + "loss": 0.9019, + "step": 29496 + }, + { + "epoch": 0.7574002481059056, + "grad_norm": 0.875, + "learning_rate": 0.0001191988881384036, + "loss": 0.8902, + "step": 29497 + }, + { + "epoch": 0.7574259253018274, + "grad_norm": 0.76171875, + "learning_rate": 0.00011919450696468075, + "loss": 0.908, + "step": 29498 + }, + { + "epoch": 0.7574516024977492, + "grad_norm": 0.76953125, + "learning_rate": 0.00011919012575270499, + "loss": 0.8704, + "step": 29499 + }, + { + "epoch": 0.7574772796936711, + "grad_norm": 0.6796875, + "learning_rate": 0.00011918574450248503, + "loss": 0.7494, + "step": 29500 + }, + { + "epoch": 0.7575029568895929, + "grad_norm": 0.75390625, + "learning_rate": 0.0001191813632140296, + "loss": 0.8292, + "step": 29501 + }, + { + "epoch": 0.7575286340855147, + "grad_norm": 0.8203125, + "learning_rate": 0.00011917698188734747, + "loss": 0.8402, + "step": 29502 + }, + { + "epoch": 0.7575543112814365, + "grad_norm": 0.796875, + "learning_rate": 0.00011917260052244734, + "loss": 0.8443, + "step": 29503 + }, + { + "epoch": 0.7575799884773583, + "grad_norm": 0.828125, + "learning_rate": 0.00011916821911933796, + "loss": 0.8771, + "step": 29504 + }, + { + "epoch": 0.7576056656732801, + "grad_norm": 0.77734375, + "learning_rate": 0.00011916383767802803, + "loss": 0.9069, + "step": 29505 + }, + { + "epoch": 0.757631342869202, + "grad_norm": 0.75390625, + "learning_rate": 0.00011915945619852629, + "loss": 0.8096, + "step": 29506 + }, + { + "epoch": 0.7576570200651238, + "grad_norm": 0.77734375, + "learning_rate": 0.0001191550746808415, + "loss": 0.7735, + "step": 29507 + }, + { + "epoch": 0.7576826972610456, + "grad_norm": 0.78515625, + "learning_rate": 0.0001191506931249824, + "loss": 0.8795, + "step": 29508 + }, + { + "epoch": 0.7577083744569675, + "grad_norm": 0.828125, + "learning_rate": 0.00011914631153095766, + "loss": 0.8357, + "step": 29509 + }, + { + "epoch": 0.7577340516528892, + "grad_norm": 0.796875, + "learning_rate": 0.00011914192989877607, + "loss": 0.8827, + "step": 29510 + }, + { + "epoch": 0.757759728848811, + "grad_norm": 0.7421875, + "learning_rate": 0.00011913754822844634, + "loss": 0.8492, + "step": 29511 + }, + { + "epoch": 0.7577854060447329, + "grad_norm": 0.8046875, + "learning_rate": 0.00011913316651997721, + "loss": 0.8865, + "step": 29512 + }, + { + "epoch": 0.7578110832406547, + "grad_norm": 0.75390625, + "learning_rate": 0.00011912878477337741, + "loss": 0.7504, + "step": 29513 + }, + { + "epoch": 0.7578367604365766, + "grad_norm": 0.82421875, + "learning_rate": 0.00011912440298865566, + "loss": 0.7609, + "step": 29514 + }, + { + "epoch": 0.7578624376324984, + "grad_norm": 0.765625, + "learning_rate": 0.00011912002116582071, + "loss": 0.8853, + "step": 29515 + }, + { + "epoch": 0.7578881148284201, + "grad_norm": 0.8828125, + "learning_rate": 0.00011911563930488128, + "loss": 0.8292, + "step": 29516 + }, + { + "epoch": 0.757913792024342, + "grad_norm": 0.8125, + "learning_rate": 0.00011911125740584607, + "loss": 0.7824, + "step": 29517 + }, + { + "epoch": 0.7579394692202638, + "grad_norm": 0.79296875, + "learning_rate": 0.00011910687546872394, + "loss": 0.8226, + "step": 29518 + }, + { + "epoch": 0.7579651464161856, + "grad_norm": 0.78125, + "learning_rate": 0.00011910249349352347, + "loss": 0.7966, + "step": 29519 + }, + { + "epoch": 0.7579908236121075, + "grad_norm": 0.828125, + "learning_rate": 0.00011909811148025344, + "loss": 0.805, + "step": 29520 + }, + { + "epoch": 0.7580165008080293, + "grad_norm": 0.8046875, + "learning_rate": 0.00011909372942892265, + "loss": 0.9018, + "step": 29521 + }, + { + "epoch": 0.7580421780039511, + "grad_norm": 0.77734375, + "learning_rate": 0.00011908934733953974, + "loss": 0.6967, + "step": 29522 + }, + { + "epoch": 0.7580678551998729, + "grad_norm": 0.71484375, + "learning_rate": 0.00011908496521211352, + "loss": 0.7911, + "step": 29523 + }, + { + "epoch": 0.7580935323957947, + "grad_norm": 0.8125, + "learning_rate": 0.00011908058304665268, + "loss": 0.8461, + "step": 29524 + }, + { + "epoch": 0.7581192095917165, + "grad_norm": 0.77734375, + "learning_rate": 0.00011907620084316595, + "loss": 0.731, + "step": 29525 + }, + { + "epoch": 0.7581448867876384, + "grad_norm": 0.8125, + "learning_rate": 0.00011907181860166208, + "loss": 0.7601, + "step": 29526 + }, + { + "epoch": 0.7581705639835602, + "grad_norm": 0.8125, + "learning_rate": 0.0001190674363221498, + "loss": 0.848, + "step": 29527 + }, + { + "epoch": 0.758196241179482, + "grad_norm": 0.75390625, + "learning_rate": 0.00011906305400463782, + "loss": 0.8155, + "step": 29528 + }, + { + "epoch": 0.7582219183754039, + "grad_norm": 0.79296875, + "learning_rate": 0.00011905867164913493, + "loss": 0.8462, + "step": 29529 + }, + { + "epoch": 0.7582475955713256, + "grad_norm": 0.734375, + "learning_rate": 0.0001190542892556498, + "loss": 0.7822, + "step": 29530 + }, + { + "epoch": 0.7582732727672474, + "grad_norm": 0.7421875, + "learning_rate": 0.00011904990682419121, + "loss": 0.7813, + "step": 29531 + }, + { + "epoch": 0.7582989499631693, + "grad_norm": 0.80859375, + "learning_rate": 0.0001190455243547679, + "loss": 0.727, + "step": 29532 + }, + { + "epoch": 0.7583246271590911, + "grad_norm": 0.765625, + "learning_rate": 0.00011904114184738853, + "loss": 0.9176, + "step": 29533 + }, + { + "epoch": 0.7583503043550129, + "grad_norm": 0.75390625, + "learning_rate": 0.0001190367593020619, + "loss": 0.7779, + "step": 29534 + }, + { + "epoch": 0.7583759815509348, + "grad_norm": 0.76171875, + "learning_rate": 0.00011903237671879675, + "loss": 0.8279, + "step": 29535 + }, + { + "epoch": 0.7584016587468565, + "grad_norm": 0.828125, + "learning_rate": 0.00011902799409760175, + "loss": 0.8934, + "step": 29536 + }, + { + "epoch": 0.7584273359427783, + "grad_norm": 0.80078125, + "learning_rate": 0.00011902361143848573, + "loss": 0.7396, + "step": 29537 + }, + { + "epoch": 0.7584530131387002, + "grad_norm": 0.859375, + "learning_rate": 0.00011901922874145734, + "loss": 0.8849, + "step": 29538 + }, + { + "epoch": 0.758478690334622, + "grad_norm": 0.73046875, + "learning_rate": 0.00011901484600652536, + "loss": 0.737, + "step": 29539 + }, + { + "epoch": 0.7585043675305438, + "grad_norm": 0.7734375, + "learning_rate": 0.00011901046323369848, + "loss": 0.8118, + "step": 29540 + }, + { + "epoch": 0.7585300447264657, + "grad_norm": 0.81640625, + "learning_rate": 0.00011900608042298548, + "loss": 0.7493, + "step": 29541 + }, + { + "epoch": 0.7585557219223875, + "grad_norm": 0.8046875, + "learning_rate": 0.00011900169757439509, + "loss": 0.931, + "step": 29542 + }, + { + "epoch": 0.7585813991183092, + "grad_norm": 0.71484375, + "learning_rate": 0.00011899731468793603, + "loss": 0.7743, + "step": 29543 + }, + { + "epoch": 0.7586070763142311, + "grad_norm": 0.80859375, + "learning_rate": 0.00011899293176361703, + "loss": 0.9863, + "step": 29544 + }, + { + "epoch": 0.7586327535101529, + "grad_norm": 0.84375, + "learning_rate": 0.00011898854880144685, + "loss": 0.845, + "step": 29545 + }, + { + "epoch": 0.7586584307060747, + "grad_norm": 0.78125, + "learning_rate": 0.00011898416580143417, + "loss": 0.817, + "step": 29546 + }, + { + "epoch": 0.7586841079019966, + "grad_norm": 0.76171875, + "learning_rate": 0.0001189797827635878, + "loss": 0.8208, + "step": 29547 + }, + { + "epoch": 0.7587097850979184, + "grad_norm": 0.765625, + "learning_rate": 0.00011897539968791642, + "loss": 0.7291, + "step": 29548 + }, + { + "epoch": 0.7587354622938401, + "grad_norm": 0.796875, + "learning_rate": 0.00011897101657442877, + "loss": 0.7581, + "step": 29549 + }, + { + "epoch": 0.758761139489762, + "grad_norm": 0.97265625, + "learning_rate": 0.00011896663342313363, + "loss": 0.9013, + "step": 29550 + }, + { + "epoch": 0.7587868166856838, + "grad_norm": 0.91796875, + "learning_rate": 0.00011896225023403968, + "loss": 0.7185, + "step": 29551 + }, + { + "epoch": 0.7588124938816057, + "grad_norm": 0.70703125, + "learning_rate": 0.0001189578670071557, + "loss": 0.7157, + "step": 29552 + }, + { + "epoch": 0.7588381710775275, + "grad_norm": 0.8125, + "learning_rate": 0.00011895348374249038, + "loss": 0.7929, + "step": 29553 + }, + { + "epoch": 0.7588638482734493, + "grad_norm": 0.734375, + "learning_rate": 0.00011894910044005248, + "loss": 0.8629, + "step": 29554 + }, + { + "epoch": 0.7588895254693712, + "grad_norm": 0.80859375, + "learning_rate": 0.00011894471709985074, + "loss": 0.8156, + "step": 29555 + }, + { + "epoch": 0.7589152026652929, + "grad_norm": 0.84375, + "learning_rate": 0.00011894033372189388, + "loss": 0.7817, + "step": 29556 + }, + { + "epoch": 0.7589408798612147, + "grad_norm": 0.83984375, + "learning_rate": 0.00011893595030619067, + "loss": 0.902, + "step": 29557 + }, + { + "epoch": 0.7589665570571366, + "grad_norm": 0.76953125, + "learning_rate": 0.00011893156685274982, + "loss": 0.814, + "step": 29558 + }, + { + "epoch": 0.7589922342530584, + "grad_norm": 0.8046875, + "learning_rate": 0.00011892718336158004, + "loss": 0.9633, + "step": 29559 + }, + { + "epoch": 0.7590179114489802, + "grad_norm": 0.875, + "learning_rate": 0.00011892279983269011, + "loss": 1.0013, + "step": 29560 + }, + { + "epoch": 0.7590435886449021, + "grad_norm": 0.7578125, + "learning_rate": 0.00011891841626608875, + "loss": 0.8655, + "step": 29561 + }, + { + "epoch": 0.7590692658408239, + "grad_norm": 0.79296875, + "learning_rate": 0.00011891403266178467, + "loss": 0.732, + "step": 29562 + }, + { + "epoch": 0.7590949430367456, + "grad_norm": 0.84765625, + "learning_rate": 0.00011890964901978667, + "loss": 0.8697, + "step": 29563 + }, + { + "epoch": 0.7591206202326675, + "grad_norm": 0.796875, + "learning_rate": 0.00011890526534010345, + "loss": 0.8703, + "step": 29564 + }, + { + "epoch": 0.7591462974285893, + "grad_norm": 0.70703125, + "learning_rate": 0.00011890088162274371, + "loss": 0.827, + "step": 29565 + }, + { + "epoch": 0.7591719746245111, + "grad_norm": 0.7578125, + "learning_rate": 0.00011889649786771623, + "loss": 0.8202, + "step": 29566 + }, + { + "epoch": 0.759197651820433, + "grad_norm": 0.76171875, + "learning_rate": 0.00011889211407502976, + "loss": 0.8049, + "step": 29567 + }, + { + "epoch": 0.7592233290163548, + "grad_norm": 0.91796875, + "learning_rate": 0.00011888773024469298, + "loss": 0.7869, + "step": 29568 + }, + { + "epoch": 0.7592490062122765, + "grad_norm": 0.78515625, + "learning_rate": 0.00011888334637671467, + "loss": 0.8113, + "step": 29569 + }, + { + "epoch": 0.7592746834081984, + "grad_norm": 0.734375, + "learning_rate": 0.00011887896247110357, + "loss": 0.7951, + "step": 29570 + }, + { + "epoch": 0.7593003606041202, + "grad_norm": 0.86328125, + "learning_rate": 0.00011887457852786841, + "loss": 0.7108, + "step": 29571 + }, + { + "epoch": 0.759326037800042, + "grad_norm": 0.80078125, + "learning_rate": 0.0001188701945470179, + "loss": 0.8825, + "step": 29572 + }, + { + "epoch": 0.7593517149959639, + "grad_norm": 1.1015625, + "learning_rate": 0.00011886581052856081, + "loss": 0.8004, + "step": 29573 + }, + { + "epoch": 0.7593773921918857, + "grad_norm": 0.8203125, + "learning_rate": 0.00011886142647250584, + "loss": 0.8126, + "step": 29574 + }, + { + "epoch": 0.7594030693878075, + "grad_norm": 0.8359375, + "learning_rate": 0.00011885704237886178, + "loss": 0.7738, + "step": 29575 + }, + { + "epoch": 0.7594287465837293, + "grad_norm": 0.828125, + "learning_rate": 0.00011885265824763731, + "loss": 0.8103, + "step": 29576 + }, + { + "epoch": 0.7594544237796511, + "grad_norm": 0.7578125, + "learning_rate": 0.00011884827407884123, + "loss": 0.8488, + "step": 29577 + }, + { + "epoch": 0.759480100975573, + "grad_norm": 0.87109375, + "learning_rate": 0.00011884388987248221, + "loss": 0.7994, + "step": 29578 + }, + { + "epoch": 0.7595057781714948, + "grad_norm": 0.69921875, + "learning_rate": 0.00011883950562856902, + "loss": 0.7797, + "step": 29579 + }, + { + "epoch": 0.7595314553674166, + "grad_norm": 0.796875, + "learning_rate": 0.00011883512134711043, + "loss": 0.8351, + "step": 29580 + }, + { + "epoch": 0.7595571325633385, + "grad_norm": 0.77734375, + "learning_rate": 0.00011883073702811512, + "loss": 0.9027, + "step": 29581 + }, + { + "epoch": 0.7595828097592603, + "grad_norm": 0.8203125, + "learning_rate": 0.00011882635267159186, + "loss": 0.7516, + "step": 29582 + }, + { + "epoch": 0.759608486955182, + "grad_norm": 0.75390625, + "learning_rate": 0.00011882196827754936, + "loss": 0.7091, + "step": 29583 + }, + { + "epoch": 0.7596341641511039, + "grad_norm": 0.86328125, + "learning_rate": 0.0001188175838459964, + "loss": 0.9281, + "step": 29584 + }, + { + "epoch": 0.7596598413470257, + "grad_norm": 0.82421875, + "learning_rate": 0.00011881319937694169, + "loss": 0.9404, + "step": 29585 + }, + { + "epoch": 0.7596855185429475, + "grad_norm": 0.796875, + "learning_rate": 0.00011880881487039394, + "loss": 0.82, + "step": 29586 + }, + { + "epoch": 0.7597111957388694, + "grad_norm": 0.84765625, + "learning_rate": 0.00011880443032636197, + "loss": 0.8717, + "step": 29587 + }, + { + "epoch": 0.7597368729347912, + "grad_norm": 0.86328125, + "learning_rate": 0.00011880004574485447, + "loss": 0.9435, + "step": 29588 + }, + { + "epoch": 0.7597625501307129, + "grad_norm": 0.76171875, + "learning_rate": 0.00011879566112588012, + "loss": 0.7501, + "step": 29589 + }, + { + "epoch": 0.7597882273266348, + "grad_norm": 0.75, + "learning_rate": 0.0001187912764694478, + "loss": 0.8388, + "step": 29590 + }, + { + "epoch": 0.7598139045225566, + "grad_norm": 0.78125, + "learning_rate": 0.00011878689177556608, + "loss": 0.8916, + "step": 29591 + }, + { + "epoch": 0.7598395817184784, + "grad_norm": 0.82421875, + "learning_rate": 0.00011878250704424383, + "loss": 0.9661, + "step": 29592 + }, + { + "epoch": 0.7598652589144003, + "grad_norm": 0.86328125, + "learning_rate": 0.00011877812227548975, + "loss": 0.8623, + "step": 29593 + }, + { + "epoch": 0.7598909361103221, + "grad_norm": 0.7890625, + "learning_rate": 0.0001187737374693125, + "loss": 0.8944, + "step": 29594 + }, + { + "epoch": 0.7599166133062439, + "grad_norm": 0.83203125, + "learning_rate": 0.00011876935262572095, + "loss": 0.7985, + "step": 29595 + }, + { + "epoch": 0.7599422905021657, + "grad_norm": 1.078125, + "learning_rate": 0.00011876496774472377, + "loss": 0.7994, + "step": 29596 + }, + { + "epoch": 0.7599679676980875, + "grad_norm": 0.91015625, + "learning_rate": 0.00011876058282632969, + "loss": 0.7715, + "step": 29597 + }, + { + "epoch": 0.7599936448940093, + "grad_norm": 0.80859375, + "learning_rate": 0.00011875619787054747, + "loss": 0.7804, + "step": 29598 + }, + { + "epoch": 0.7600193220899312, + "grad_norm": 0.76953125, + "learning_rate": 0.00011875181287738583, + "loss": 0.8041, + "step": 29599 + }, + { + "epoch": 0.760044999285853, + "grad_norm": 0.73046875, + "learning_rate": 0.00011874742784685353, + "loss": 0.7557, + "step": 29600 + }, + { + "epoch": 0.7600706764817748, + "grad_norm": 0.859375, + "learning_rate": 0.00011874304277895931, + "loss": 0.7508, + "step": 29601 + }, + { + "epoch": 0.7600963536776967, + "grad_norm": 0.765625, + "learning_rate": 0.00011873865767371186, + "loss": 0.9117, + "step": 29602 + }, + { + "epoch": 0.7601220308736184, + "grad_norm": 0.83984375, + "learning_rate": 0.00011873427253112002, + "loss": 0.8854, + "step": 29603 + }, + { + "epoch": 0.7601477080695402, + "grad_norm": 0.80859375, + "learning_rate": 0.00011872988735119244, + "loss": 0.7361, + "step": 29604 + }, + { + "epoch": 0.7601733852654621, + "grad_norm": 0.765625, + "learning_rate": 0.00011872550213393788, + "loss": 0.7512, + "step": 29605 + }, + { + "epoch": 0.7601990624613839, + "grad_norm": 0.75, + "learning_rate": 0.00011872111687936506, + "loss": 0.8061, + "step": 29606 + }, + { + "epoch": 0.7602247396573057, + "grad_norm": 0.7421875, + "learning_rate": 0.00011871673158748275, + "loss": 0.8257, + "step": 29607 + }, + { + "epoch": 0.7602504168532276, + "grad_norm": 0.75390625, + "learning_rate": 0.00011871234625829973, + "loss": 0.8425, + "step": 29608 + }, + { + "epoch": 0.7602760940491493, + "grad_norm": 0.84375, + "learning_rate": 0.00011870796089182467, + "loss": 0.876, + "step": 29609 + }, + { + "epoch": 0.7603017712450711, + "grad_norm": 0.83203125, + "learning_rate": 0.00011870357548806635, + "loss": 0.918, + "step": 29610 + }, + { + "epoch": 0.760327448440993, + "grad_norm": 0.81640625, + "learning_rate": 0.00011869919004703347, + "loss": 0.9158, + "step": 29611 + }, + { + "epoch": 0.7603531256369148, + "grad_norm": 0.8125, + "learning_rate": 0.00011869480456873481, + "loss": 0.8913, + "step": 29612 + }, + { + "epoch": 0.7603788028328367, + "grad_norm": 0.765625, + "learning_rate": 0.00011869041905317906, + "loss": 0.8553, + "step": 29613 + }, + { + "epoch": 0.7604044800287585, + "grad_norm": 0.76953125, + "learning_rate": 0.00011868603350037502, + "loss": 1.0044, + "step": 29614 + }, + { + "epoch": 0.7604301572246803, + "grad_norm": 0.77734375, + "learning_rate": 0.00011868164791033141, + "loss": 0.7123, + "step": 29615 + }, + { + "epoch": 0.760455834420602, + "grad_norm": 0.73046875, + "learning_rate": 0.00011867726228305696, + "loss": 0.7668, + "step": 29616 + }, + { + "epoch": 0.7604815116165239, + "grad_norm": 0.72265625, + "learning_rate": 0.0001186728766185604, + "loss": 0.8511, + "step": 29617 + }, + { + "epoch": 0.7605071888124457, + "grad_norm": 0.8125, + "learning_rate": 0.00011866849091685046, + "loss": 0.8067, + "step": 29618 + }, + { + "epoch": 0.7605328660083676, + "grad_norm": 0.80078125, + "learning_rate": 0.00011866410517793594, + "loss": 0.9186, + "step": 29619 + }, + { + "epoch": 0.7605585432042894, + "grad_norm": 0.76171875, + "learning_rate": 0.00011865971940182554, + "loss": 0.8156, + "step": 29620 + }, + { + "epoch": 0.7605842204002112, + "grad_norm": 0.76171875, + "learning_rate": 0.00011865533358852801, + "loss": 0.7884, + "step": 29621 + }, + { + "epoch": 0.7606098975961331, + "grad_norm": 0.8046875, + "learning_rate": 0.00011865094773805207, + "loss": 0.7809, + "step": 29622 + }, + { + "epoch": 0.7606355747920548, + "grad_norm": 0.796875, + "learning_rate": 0.00011864656185040648, + "loss": 0.8493, + "step": 29623 + }, + { + "epoch": 0.7606612519879766, + "grad_norm": 0.8359375, + "learning_rate": 0.00011864217592559998, + "loss": 0.9608, + "step": 29624 + }, + { + "epoch": 0.7606869291838985, + "grad_norm": 0.7578125, + "learning_rate": 0.00011863778996364132, + "loss": 0.873, + "step": 29625 + }, + { + "epoch": 0.7607126063798203, + "grad_norm": 0.75, + "learning_rate": 0.00011863340396453919, + "loss": 0.8369, + "step": 29626 + }, + { + "epoch": 0.7607382835757421, + "grad_norm": 0.83984375, + "learning_rate": 0.00011862901792830242, + "loss": 0.917, + "step": 29627 + }, + { + "epoch": 0.760763960771664, + "grad_norm": 0.78515625, + "learning_rate": 0.00011862463185493966, + "loss": 0.9107, + "step": 29628 + }, + { + "epoch": 0.7607896379675857, + "grad_norm": 0.82421875, + "learning_rate": 0.0001186202457444597, + "loss": 0.81, + "step": 29629 + }, + { + "epoch": 0.7608153151635075, + "grad_norm": 0.8515625, + "learning_rate": 0.00011861585959687129, + "loss": 0.9391, + "step": 29630 + }, + { + "epoch": 0.7608409923594294, + "grad_norm": 0.8984375, + "learning_rate": 0.00011861147341218313, + "loss": 0.9255, + "step": 29631 + }, + { + "epoch": 0.7608666695553512, + "grad_norm": 0.76953125, + "learning_rate": 0.00011860708719040401, + "loss": 0.7677, + "step": 29632 + }, + { + "epoch": 0.760892346751273, + "grad_norm": 0.8046875, + "learning_rate": 0.00011860270093154264, + "loss": 0.9313, + "step": 29633 + }, + { + "epoch": 0.7609180239471949, + "grad_norm": 1.0234375, + "learning_rate": 0.00011859831463560773, + "loss": 0.7943, + "step": 29634 + }, + { + "epoch": 0.7609437011431167, + "grad_norm": 0.7734375, + "learning_rate": 0.00011859392830260809, + "loss": 0.8114, + "step": 29635 + }, + { + "epoch": 0.7609693783390384, + "grad_norm": 0.8515625, + "learning_rate": 0.00011858954193255245, + "loss": 0.7928, + "step": 29636 + }, + { + "epoch": 0.7609950555349603, + "grad_norm": 0.734375, + "learning_rate": 0.00011858515552544951, + "loss": 0.8945, + "step": 29637 + }, + { + "epoch": 0.7610207327308821, + "grad_norm": 0.75, + "learning_rate": 0.00011858076908130804, + "loss": 0.8845, + "step": 29638 + }, + { + "epoch": 0.761046409926804, + "grad_norm": 0.7421875, + "learning_rate": 0.00011857638260013675, + "loss": 0.7678, + "step": 29639 + }, + { + "epoch": 0.7610720871227258, + "grad_norm": 0.7265625, + "learning_rate": 0.00011857199608194443, + "loss": 0.7665, + "step": 29640 + }, + { + "epoch": 0.7610977643186476, + "grad_norm": 0.9140625, + "learning_rate": 0.0001185676095267398, + "loss": 0.7421, + "step": 29641 + }, + { + "epoch": 0.7611234415145695, + "grad_norm": 0.90625, + "learning_rate": 0.00011856322293453162, + "loss": 0.8348, + "step": 29642 + }, + { + "epoch": 0.7611491187104912, + "grad_norm": 0.84375, + "learning_rate": 0.00011855883630532858, + "loss": 0.8219, + "step": 29643 + }, + { + "epoch": 0.761174795906413, + "grad_norm": 0.99609375, + "learning_rate": 0.00011855444963913944, + "loss": 0.8781, + "step": 29644 + }, + { + "epoch": 0.7612004731023349, + "grad_norm": 0.765625, + "learning_rate": 0.000118550062935973, + "loss": 0.6755, + "step": 29645 + }, + { + "epoch": 0.7612261502982567, + "grad_norm": 0.77734375, + "learning_rate": 0.00011854567619583797, + "loss": 0.8092, + "step": 29646 + }, + { + "epoch": 0.7612518274941785, + "grad_norm": 0.78515625, + "learning_rate": 0.00011854128941874303, + "loss": 0.8459, + "step": 29647 + }, + { + "epoch": 0.7612775046901004, + "grad_norm": 0.91015625, + "learning_rate": 0.000118536902604697, + "loss": 0.8118, + "step": 29648 + }, + { + "epoch": 0.7613031818860221, + "grad_norm": 0.828125, + "learning_rate": 0.00011853251575370863, + "loss": 0.9656, + "step": 29649 + }, + { + "epoch": 0.7613288590819439, + "grad_norm": 0.75, + "learning_rate": 0.00011852812886578659, + "loss": 0.8209, + "step": 29650 + }, + { + "epoch": 0.7613545362778658, + "grad_norm": 0.921875, + "learning_rate": 0.00011852374194093967, + "loss": 0.9116, + "step": 29651 + }, + { + "epoch": 0.7613802134737876, + "grad_norm": 0.73828125, + "learning_rate": 0.00011851935497917661, + "loss": 0.7842, + "step": 29652 + }, + { + "epoch": 0.7614058906697094, + "grad_norm": 0.73828125, + "learning_rate": 0.00011851496798050615, + "loss": 0.7387, + "step": 29653 + }, + { + "epoch": 0.7614315678656313, + "grad_norm": 0.77734375, + "learning_rate": 0.00011851058094493703, + "loss": 0.8917, + "step": 29654 + }, + { + "epoch": 0.7614572450615531, + "grad_norm": 0.83203125, + "learning_rate": 0.00011850619387247796, + "loss": 0.9528, + "step": 29655 + }, + { + "epoch": 0.7614829222574748, + "grad_norm": 0.765625, + "learning_rate": 0.00011850180676313778, + "loss": 0.7974, + "step": 29656 + }, + { + "epoch": 0.7615085994533967, + "grad_norm": 0.79296875, + "learning_rate": 0.00011849741961692515, + "loss": 0.821, + "step": 29657 + }, + { + "epoch": 0.7615342766493185, + "grad_norm": 1.109375, + "learning_rate": 0.00011849303243384881, + "loss": 0.9151, + "step": 29658 + }, + { + "epoch": 0.7615599538452403, + "grad_norm": 0.81640625, + "learning_rate": 0.00011848864521391754, + "loss": 0.942, + "step": 29659 + }, + { + "epoch": 0.7615856310411622, + "grad_norm": 0.83203125, + "learning_rate": 0.00011848425795714009, + "loss": 0.8926, + "step": 29660 + }, + { + "epoch": 0.761611308237084, + "grad_norm": 0.7578125, + "learning_rate": 0.00011847987066352515, + "loss": 0.877, + "step": 29661 + }, + { + "epoch": 0.7616369854330058, + "grad_norm": 0.75390625, + "learning_rate": 0.00011847548333308153, + "loss": 0.8396, + "step": 29662 + }, + { + "epoch": 0.7616626626289276, + "grad_norm": 0.7421875, + "learning_rate": 0.00011847109596581792, + "loss": 0.8642, + "step": 29663 + }, + { + "epoch": 0.7616883398248494, + "grad_norm": 0.75390625, + "learning_rate": 0.00011846670856174307, + "loss": 0.7017, + "step": 29664 + }, + { + "epoch": 0.7617140170207712, + "grad_norm": 0.85546875, + "learning_rate": 0.00011846232112086577, + "loss": 0.966, + "step": 29665 + }, + { + "epoch": 0.7617396942166931, + "grad_norm": 0.8046875, + "learning_rate": 0.00011845793364319468, + "loss": 0.8883, + "step": 29666 + }, + { + "epoch": 0.7617653714126149, + "grad_norm": 0.77734375, + "learning_rate": 0.00011845354612873864, + "loss": 0.844, + "step": 29667 + }, + { + "epoch": 0.7617910486085367, + "grad_norm": 0.875, + "learning_rate": 0.00011844915857750635, + "loss": 0.8149, + "step": 29668 + }, + { + "epoch": 0.7618167258044585, + "grad_norm": 0.78125, + "learning_rate": 0.00011844477098950655, + "loss": 0.8961, + "step": 29669 + }, + { + "epoch": 0.7618424030003803, + "grad_norm": 0.796875, + "learning_rate": 0.00011844038336474798, + "loss": 0.8397, + "step": 29670 + }, + { + "epoch": 0.7618680801963021, + "grad_norm": 0.82421875, + "learning_rate": 0.0001184359957032394, + "loss": 0.8147, + "step": 29671 + }, + { + "epoch": 0.761893757392224, + "grad_norm": 0.828125, + "learning_rate": 0.00011843160800498954, + "loss": 0.9409, + "step": 29672 + }, + { + "epoch": 0.7619194345881458, + "grad_norm": 0.796875, + "learning_rate": 0.00011842722027000715, + "loss": 0.9758, + "step": 29673 + }, + { + "epoch": 0.7619451117840677, + "grad_norm": 0.86328125, + "learning_rate": 0.00011842283249830095, + "loss": 0.744, + "step": 29674 + }, + { + "epoch": 0.7619707889799895, + "grad_norm": 0.8671875, + "learning_rate": 0.00011841844468987977, + "loss": 0.7875, + "step": 29675 + }, + { + "epoch": 0.7619964661759112, + "grad_norm": 0.77734375, + "learning_rate": 0.00011841405684475225, + "loss": 0.8106, + "step": 29676 + }, + { + "epoch": 0.762022143371833, + "grad_norm": 0.74609375, + "learning_rate": 0.00011840966896292718, + "loss": 0.8785, + "step": 29677 + }, + { + "epoch": 0.7620478205677549, + "grad_norm": 0.765625, + "learning_rate": 0.0001184052810444133, + "loss": 0.9061, + "step": 29678 + }, + { + "epoch": 0.7620734977636767, + "grad_norm": 0.796875, + "learning_rate": 0.00011840089308921935, + "loss": 0.8578, + "step": 29679 + }, + { + "epoch": 0.7620991749595986, + "grad_norm": 0.7109375, + "learning_rate": 0.00011839650509735412, + "loss": 0.7046, + "step": 29680 + }, + { + "epoch": 0.7621248521555204, + "grad_norm": 0.83984375, + "learning_rate": 0.00011839211706882629, + "loss": 0.7492, + "step": 29681 + }, + { + "epoch": 0.7621505293514422, + "grad_norm": 0.73828125, + "learning_rate": 0.00011838772900364463, + "loss": 0.7824, + "step": 29682 + }, + { + "epoch": 0.762176206547364, + "grad_norm": 0.9140625, + "learning_rate": 0.0001183833409018179, + "loss": 1.0306, + "step": 29683 + }, + { + "epoch": 0.7622018837432858, + "grad_norm": 0.765625, + "learning_rate": 0.00011837895276335482, + "loss": 0.86, + "step": 29684 + }, + { + "epoch": 0.7622275609392076, + "grad_norm": 0.84375, + "learning_rate": 0.00011837456458826416, + "loss": 0.8747, + "step": 29685 + }, + { + "epoch": 0.7622532381351295, + "grad_norm": 0.82421875, + "learning_rate": 0.00011837017637655466, + "loss": 0.7489, + "step": 29686 + }, + { + "epoch": 0.7622789153310513, + "grad_norm": 0.76953125, + "learning_rate": 0.00011836578812823502, + "loss": 0.8051, + "step": 29687 + }, + { + "epoch": 0.7623045925269731, + "grad_norm": 0.80859375, + "learning_rate": 0.00011836139984331404, + "loss": 0.8127, + "step": 29688 + }, + { + "epoch": 0.7623302697228949, + "grad_norm": 0.78125, + "learning_rate": 0.0001183570115218005, + "loss": 0.7403, + "step": 29689 + }, + { + "epoch": 0.7623559469188167, + "grad_norm": 0.75, + "learning_rate": 0.00011835262316370301, + "loss": 0.7934, + "step": 29690 + }, + { + "epoch": 0.7623816241147385, + "grad_norm": 0.80859375, + "learning_rate": 0.00011834823476903045, + "loss": 0.7784, + "step": 29691 + }, + { + "epoch": 0.7624073013106604, + "grad_norm": 0.8515625, + "learning_rate": 0.00011834384633779152, + "loss": 0.8097, + "step": 29692 + }, + { + "epoch": 0.7624329785065822, + "grad_norm": 0.75390625, + "learning_rate": 0.00011833945786999492, + "loss": 0.8301, + "step": 29693 + }, + { + "epoch": 0.762458655702504, + "grad_norm": 0.80078125, + "learning_rate": 0.00011833506936564947, + "loss": 0.8233, + "step": 29694 + }, + { + "epoch": 0.7624843328984259, + "grad_norm": 0.84765625, + "learning_rate": 0.00011833068082476389, + "loss": 0.8077, + "step": 29695 + }, + { + "epoch": 0.7625100100943476, + "grad_norm": 0.88671875, + "learning_rate": 0.0001183262922473469, + "loss": 0.8467, + "step": 29696 + }, + { + "epoch": 0.7625356872902694, + "grad_norm": 0.77734375, + "learning_rate": 0.00011832190363340727, + "loss": 0.7386, + "step": 29697 + }, + { + "epoch": 0.7625613644861913, + "grad_norm": 0.83984375, + "learning_rate": 0.00011831751498295374, + "loss": 0.9225, + "step": 29698 + }, + { + "epoch": 0.7625870416821131, + "grad_norm": 0.8046875, + "learning_rate": 0.00011831312629599504, + "loss": 0.7504, + "step": 29699 + }, + { + "epoch": 0.762612718878035, + "grad_norm": 0.7890625, + "learning_rate": 0.00011830873757253997, + "loss": 0.7221, + "step": 29700 + }, + { + "epoch": 0.7626383960739568, + "grad_norm": 0.83203125, + "learning_rate": 0.0001183043488125972, + "loss": 0.7909, + "step": 29701 + }, + { + "epoch": 0.7626640732698786, + "grad_norm": 0.8671875, + "learning_rate": 0.00011829996001617558, + "loss": 0.8516, + "step": 29702 + }, + { + "epoch": 0.7626897504658003, + "grad_norm": 0.7890625, + "learning_rate": 0.00011829557118328374, + "loss": 0.7875, + "step": 29703 + }, + { + "epoch": 0.7627154276617222, + "grad_norm": 0.78125, + "learning_rate": 0.00011829118231393048, + "loss": 0.7869, + "step": 29704 + }, + { + "epoch": 0.762741104857644, + "grad_norm": 0.75390625, + "learning_rate": 0.00011828679340812456, + "loss": 0.7547, + "step": 29705 + }, + { + "epoch": 0.7627667820535659, + "grad_norm": 0.74609375, + "learning_rate": 0.00011828240446587468, + "loss": 0.8278, + "step": 29706 + }, + { + "epoch": 0.7627924592494877, + "grad_norm": 0.71875, + "learning_rate": 0.00011827801548718966, + "loss": 0.7534, + "step": 29707 + }, + { + "epoch": 0.7628181364454095, + "grad_norm": 0.765625, + "learning_rate": 0.0001182736264720782, + "loss": 0.7955, + "step": 29708 + }, + { + "epoch": 0.7628438136413312, + "grad_norm": 0.76171875, + "learning_rate": 0.00011826923742054906, + "loss": 0.8003, + "step": 29709 + }, + { + "epoch": 0.7628694908372531, + "grad_norm": 0.8046875, + "learning_rate": 0.00011826484833261098, + "loss": 0.8574, + "step": 29710 + }, + { + "epoch": 0.7628951680331749, + "grad_norm": 0.7890625, + "learning_rate": 0.00011826045920827267, + "loss": 0.8038, + "step": 29711 + }, + { + "epoch": 0.7629208452290968, + "grad_norm": 0.83203125, + "learning_rate": 0.00011825607004754293, + "loss": 0.775, + "step": 29712 + }, + { + "epoch": 0.7629465224250186, + "grad_norm": 0.69140625, + "learning_rate": 0.00011825168085043051, + "loss": 0.7952, + "step": 29713 + }, + { + "epoch": 0.7629721996209404, + "grad_norm": 0.76953125, + "learning_rate": 0.00011824729161694413, + "loss": 0.864, + "step": 29714 + }, + { + "epoch": 0.7629978768168623, + "grad_norm": 0.74609375, + "learning_rate": 0.00011824290234709256, + "loss": 0.7627, + "step": 29715 + }, + { + "epoch": 0.763023554012784, + "grad_norm": 0.765625, + "learning_rate": 0.00011823851304088449, + "loss": 0.8037, + "step": 29716 + }, + { + "epoch": 0.7630492312087058, + "grad_norm": 0.7890625, + "learning_rate": 0.00011823412369832875, + "loss": 0.7993, + "step": 29717 + }, + { + "epoch": 0.7630749084046277, + "grad_norm": 0.7578125, + "learning_rate": 0.00011822973431943404, + "loss": 0.727, + "step": 29718 + }, + { + "epoch": 0.7631005856005495, + "grad_norm": 1.0078125, + "learning_rate": 0.00011822534490420909, + "loss": 0.7968, + "step": 29719 + }, + { + "epoch": 0.7631262627964713, + "grad_norm": 0.8046875, + "learning_rate": 0.00011822095545266272, + "loss": 0.8399, + "step": 29720 + }, + { + "epoch": 0.7631519399923932, + "grad_norm": 0.79296875, + "learning_rate": 0.00011821656596480362, + "loss": 0.8687, + "step": 29721 + }, + { + "epoch": 0.763177617188315, + "grad_norm": 0.7578125, + "learning_rate": 0.00011821217644064053, + "loss": 0.8599, + "step": 29722 + }, + { + "epoch": 0.7632032943842367, + "grad_norm": 0.83203125, + "learning_rate": 0.00011820778688018223, + "loss": 0.7894, + "step": 29723 + }, + { + "epoch": 0.7632289715801586, + "grad_norm": 0.69921875, + "learning_rate": 0.00011820339728343744, + "loss": 0.7024, + "step": 29724 + }, + { + "epoch": 0.7632546487760804, + "grad_norm": 0.80078125, + "learning_rate": 0.00011819900765041494, + "loss": 0.7752, + "step": 29725 + }, + { + "epoch": 0.7632803259720022, + "grad_norm": 0.77734375, + "learning_rate": 0.00011819461798112345, + "loss": 0.7545, + "step": 29726 + }, + { + "epoch": 0.7633060031679241, + "grad_norm": 0.81640625, + "learning_rate": 0.00011819022827557174, + "loss": 0.7375, + "step": 29727 + }, + { + "epoch": 0.7633316803638459, + "grad_norm": 0.8125, + "learning_rate": 0.00011818583853376857, + "loss": 0.8421, + "step": 29728 + }, + { + "epoch": 0.7633573575597676, + "grad_norm": 0.87109375, + "learning_rate": 0.00011818144875572263, + "loss": 0.8915, + "step": 29729 + }, + { + "epoch": 0.7633830347556895, + "grad_norm": 0.78515625, + "learning_rate": 0.00011817705894144271, + "loss": 0.9012, + "step": 29730 + }, + { + "epoch": 0.7634087119516113, + "grad_norm": 0.83203125, + "learning_rate": 0.00011817266909093756, + "loss": 0.8712, + "step": 29731 + }, + { + "epoch": 0.7634343891475331, + "grad_norm": 0.78125, + "learning_rate": 0.00011816827920421591, + "loss": 0.8886, + "step": 29732 + }, + { + "epoch": 0.763460066343455, + "grad_norm": 0.76953125, + "learning_rate": 0.00011816388928128655, + "loss": 0.7582, + "step": 29733 + }, + { + "epoch": 0.7634857435393768, + "grad_norm": 0.8125, + "learning_rate": 0.00011815949932215817, + "loss": 0.8265, + "step": 29734 + }, + { + "epoch": 0.7635114207352987, + "grad_norm": 0.9140625, + "learning_rate": 0.00011815510932683957, + "loss": 0.9084, + "step": 29735 + }, + { + "epoch": 0.7635370979312204, + "grad_norm": 0.7890625, + "learning_rate": 0.00011815071929533948, + "loss": 0.8273, + "step": 29736 + }, + { + "epoch": 0.7635627751271422, + "grad_norm": 0.77734375, + "learning_rate": 0.00011814632922766664, + "loss": 0.6866, + "step": 29737 + }, + { + "epoch": 0.763588452323064, + "grad_norm": 0.7890625, + "learning_rate": 0.00011814193912382979, + "loss": 0.7548, + "step": 29738 + }, + { + "epoch": 0.7636141295189859, + "grad_norm": 0.77734375, + "learning_rate": 0.00011813754898383769, + "loss": 0.8678, + "step": 29739 + }, + { + "epoch": 0.7636398067149077, + "grad_norm": 0.89453125, + "learning_rate": 0.0001181331588076991, + "loss": 0.883, + "step": 29740 + }, + { + "epoch": 0.7636654839108296, + "grad_norm": 0.82421875, + "learning_rate": 0.00011812876859542278, + "loss": 0.8852, + "step": 29741 + }, + { + "epoch": 0.7636911611067513, + "grad_norm": 0.7265625, + "learning_rate": 0.00011812437834701746, + "loss": 0.8013, + "step": 29742 + }, + { + "epoch": 0.7637168383026731, + "grad_norm": 0.75390625, + "learning_rate": 0.00011811998806249188, + "loss": 0.7816, + "step": 29743 + }, + { + "epoch": 0.763742515498595, + "grad_norm": 0.765625, + "learning_rate": 0.00011811559774185478, + "loss": 0.7703, + "step": 29744 + }, + { + "epoch": 0.7637681926945168, + "grad_norm": 0.69921875, + "learning_rate": 0.00011811120738511496, + "loss": 0.7893, + "step": 29745 + }, + { + "epoch": 0.7637938698904386, + "grad_norm": 1.59375, + "learning_rate": 0.00011810681699228112, + "loss": 0.7114, + "step": 29746 + }, + { + "epoch": 0.7638195470863605, + "grad_norm": 0.82421875, + "learning_rate": 0.00011810242656336204, + "loss": 0.897, + "step": 29747 + }, + { + "epoch": 0.7638452242822823, + "grad_norm": 0.76171875, + "learning_rate": 0.00011809803609836648, + "loss": 0.8497, + "step": 29748 + }, + { + "epoch": 0.763870901478204, + "grad_norm": 0.83984375, + "learning_rate": 0.00011809364559730314, + "loss": 0.9254, + "step": 29749 + }, + { + "epoch": 0.7638965786741259, + "grad_norm": 0.83984375, + "learning_rate": 0.00011808925506018082, + "loss": 0.9753, + "step": 29750 + }, + { + "epoch": 0.7639222558700477, + "grad_norm": 0.765625, + "learning_rate": 0.00011808486448700821, + "loss": 0.7072, + "step": 29751 + }, + { + "epoch": 0.7639479330659695, + "grad_norm": 0.765625, + "learning_rate": 0.00011808047387779412, + "loss": 0.8117, + "step": 29752 + }, + { + "epoch": 0.7639736102618914, + "grad_norm": 0.78125, + "learning_rate": 0.0001180760832325473, + "loss": 0.7815, + "step": 29753 + }, + { + "epoch": 0.7639992874578132, + "grad_norm": 0.76953125, + "learning_rate": 0.00011807169255127646, + "loss": 0.8677, + "step": 29754 + }, + { + "epoch": 0.764024964653735, + "grad_norm": 0.796875, + "learning_rate": 0.0001180673018339904, + "loss": 0.8093, + "step": 29755 + }, + { + "epoch": 0.7640506418496568, + "grad_norm": 0.83203125, + "learning_rate": 0.00011806291108069778, + "loss": 0.8882, + "step": 29756 + }, + { + "epoch": 0.7640763190455786, + "grad_norm": 0.7421875, + "learning_rate": 0.00011805852029140745, + "loss": 0.7103, + "step": 29757 + }, + { + "epoch": 0.7641019962415004, + "grad_norm": 0.796875, + "learning_rate": 0.0001180541294661281, + "loss": 0.8076, + "step": 29758 + }, + { + "epoch": 0.7641276734374223, + "grad_norm": 0.8515625, + "learning_rate": 0.0001180497386048685, + "loss": 0.9949, + "step": 29759 + }, + { + "epoch": 0.7641533506333441, + "grad_norm": 0.77734375, + "learning_rate": 0.00011804534770763742, + "loss": 0.903, + "step": 29760 + }, + { + "epoch": 0.7641790278292659, + "grad_norm": 0.796875, + "learning_rate": 0.0001180409567744436, + "loss": 0.9097, + "step": 29761 + }, + { + "epoch": 0.7642047050251877, + "grad_norm": 0.74609375, + "learning_rate": 0.00011803656580529576, + "loss": 0.8786, + "step": 29762 + }, + { + "epoch": 0.7642303822211095, + "grad_norm": 0.75390625, + "learning_rate": 0.0001180321748002027, + "loss": 0.868, + "step": 29763 + }, + { + "epoch": 0.7642560594170313, + "grad_norm": 0.71875, + "learning_rate": 0.0001180277837591731, + "loss": 0.8313, + "step": 29764 + }, + { + "epoch": 0.7642817366129532, + "grad_norm": 0.79296875, + "learning_rate": 0.00011802339268221578, + "loss": 0.8081, + "step": 29765 + }, + { + "epoch": 0.764307413808875, + "grad_norm": 0.7421875, + "learning_rate": 0.00011801900156933948, + "loss": 0.8713, + "step": 29766 + }, + { + "epoch": 0.7643330910047968, + "grad_norm": 0.796875, + "learning_rate": 0.00011801461042055293, + "loss": 0.8308, + "step": 29767 + }, + { + "epoch": 0.7643587682007187, + "grad_norm": 0.8125, + "learning_rate": 0.00011801021923586489, + "loss": 0.7833, + "step": 29768 + }, + { + "epoch": 0.7643844453966404, + "grad_norm": 0.81640625, + "learning_rate": 0.00011800582801528408, + "loss": 0.946, + "step": 29769 + }, + { + "epoch": 0.7644101225925622, + "grad_norm": 0.84765625, + "learning_rate": 0.00011800143675881932, + "loss": 0.804, + "step": 29770 + }, + { + "epoch": 0.7644357997884841, + "grad_norm": 0.7421875, + "learning_rate": 0.00011799704546647931, + "loss": 0.8336, + "step": 29771 + }, + { + "epoch": 0.7644614769844059, + "grad_norm": 0.875, + "learning_rate": 0.00011799265413827281, + "loss": 0.8776, + "step": 29772 + }, + { + "epoch": 0.7644871541803278, + "grad_norm": 0.8203125, + "learning_rate": 0.00011798826277420856, + "loss": 0.904, + "step": 29773 + }, + { + "epoch": 0.7645128313762496, + "grad_norm": 0.7734375, + "learning_rate": 0.00011798387137429539, + "loss": 0.7394, + "step": 29774 + }, + { + "epoch": 0.7645385085721714, + "grad_norm": 0.85546875, + "learning_rate": 0.00011797947993854193, + "loss": 0.9334, + "step": 29775 + }, + { + "epoch": 0.7645641857680932, + "grad_norm": 0.796875, + "learning_rate": 0.000117975088466957, + "loss": 0.901, + "step": 29776 + }, + { + "epoch": 0.764589862964015, + "grad_norm": 0.828125, + "learning_rate": 0.00011797069695954934, + "loss": 0.8454, + "step": 29777 + }, + { + "epoch": 0.7646155401599368, + "grad_norm": 0.77734375, + "learning_rate": 0.00011796630541632769, + "loss": 0.8556, + "step": 29778 + }, + { + "epoch": 0.7646412173558587, + "grad_norm": 0.8515625, + "learning_rate": 0.00011796191383730084, + "loss": 0.7542, + "step": 29779 + }, + { + "epoch": 0.7646668945517805, + "grad_norm": 0.6953125, + "learning_rate": 0.00011795752222247751, + "loss": 0.7629, + "step": 29780 + }, + { + "epoch": 0.7646925717477023, + "grad_norm": 0.73828125, + "learning_rate": 0.0001179531305718665, + "loss": 0.7739, + "step": 29781 + }, + { + "epoch": 0.7647182489436241, + "grad_norm": 0.71875, + "learning_rate": 0.00011794873888547649, + "loss": 0.692, + "step": 29782 + }, + { + "epoch": 0.7647439261395459, + "grad_norm": 0.79296875, + "learning_rate": 0.00011794434716331624, + "loss": 0.8061, + "step": 29783 + }, + { + "epoch": 0.7647696033354677, + "grad_norm": 1.1328125, + "learning_rate": 0.00011793995540539455, + "loss": 0.8068, + "step": 29784 + }, + { + "epoch": 0.7647952805313896, + "grad_norm": 0.89453125, + "learning_rate": 0.00011793556361172016, + "loss": 0.8687, + "step": 29785 + }, + { + "epoch": 0.7648209577273114, + "grad_norm": 0.83984375, + "learning_rate": 0.00011793117178230177, + "loss": 0.8094, + "step": 29786 + }, + { + "epoch": 0.7648466349232332, + "grad_norm": 0.70703125, + "learning_rate": 0.00011792677991714825, + "loss": 0.8433, + "step": 29787 + }, + { + "epoch": 0.7648723121191551, + "grad_norm": 0.76953125, + "learning_rate": 0.0001179223880162682, + "loss": 0.8792, + "step": 29788 + }, + { + "epoch": 0.7648979893150768, + "grad_norm": 0.81640625, + "learning_rate": 0.00011791799607967049, + "loss": 0.7337, + "step": 29789 + }, + { + "epoch": 0.7649236665109986, + "grad_norm": 0.8203125, + "learning_rate": 0.00011791360410736383, + "loss": 0.8427, + "step": 29790 + }, + { + "epoch": 0.7649493437069205, + "grad_norm": 0.79296875, + "learning_rate": 0.00011790921209935697, + "loss": 0.8732, + "step": 29791 + }, + { + "epoch": 0.7649750209028423, + "grad_norm": 0.84375, + "learning_rate": 0.00011790482005565866, + "loss": 0.9006, + "step": 29792 + }, + { + "epoch": 0.7650006980987641, + "grad_norm": 0.71484375, + "learning_rate": 0.00011790042797627767, + "loss": 0.7013, + "step": 29793 + }, + { + "epoch": 0.765026375294686, + "grad_norm": 0.88671875, + "learning_rate": 0.00011789603586122275, + "loss": 0.8537, + "step": 29794 + }, + { + "epoch": 0.7650520524906078, + "grad_norm": 0.83984375, + "learning_rate": 0.00011789164371050265, + "loss": 0.9967, + "step": 29795 + }, + { + "epoch": 0.7650777296865295, + "grad_norm": 0.8359375, + "learning_rate": 0.0001178872515241261, + "loss": 0.7766, + "step": 29796 + }, + { + "epoch": 0.7651034068824514, + "grad_norm": 0.91796875, + "learning_rate": 0.00011788285930210188, + "loss": 0.8354, + "step": 29797 + }, + { + "epoch": 0.7651290840783732, + "grad_norm": 1.1484375, + "learning_rate": 0.00011787846704443874, + "loss": 0.7935, + "step": 29798 + }, + { + "epoch": 0.765154761274295, + "grad_norm": 0.875, + "learning_rate": 0.00011787407475114541, + "loss": 0.8664, + "step": 29799 + }, + { + "epoch": 0.7651804384702169, + "grad_norm": 0.81640625, + "learning_rate": 0.00011786968242223071, + "loss": 1.0147, + "step": 29800 + }, + { + "epoch": 0.7652061156661387, + "grad_norm": 0.734375, + "learning_rate": 0.0001178652900577033, + "loss": 0.8274, + "step": 29801 + }, + { + "epoch": 0.7652317928620604, + "grad_norm": 0.77734375, + "learning_rate": 0.00011786089765757203, + "loss": 0.7683, + "step": 29802 + }, + { + "epoch": 0.7652574700579823, + "grad_norm": 0.77734375, + "learning_rate": 0.00011785650522184556, + "loss": 0.7457, + "step": 29803 + }, + { + "epoch": 0.7652831472539041, + "grad_norm": 0.83984375, + "learning_rate": 0.0001178521127505327, + "loss": 0.8832, + "step": 29804 + }, + { + "epoch": 0.765308824449826, + "grad_norm": 0.828125, + "learning_rate": 0.00011784772024364219, + "loss": 1.0101, + "step": 29805 + }, + { + "epoch": 0.7653345016457478, + "grad_norm": 0.8046875, + "learning_rate": 0.0001178433277011828, + "loss": 0.7465, + "step": 29806 + }, + { + "epoch": 0.7653601788416696, + "grad_norm": 0.7890625, + "learning_rate": 0.00011783893512316326, + "loss": 0.8728, + "step": 29807 + }, + { + "epoch": 0.7653858560375915, + "grad_norm": 0.76171875, + "learning_rate": 0.00011783454250959234, + "loss": 0.8414, + "step": 29808 + }, + { + "epoch": 0.7654115332335132, + "grad_norm": 0.71484375, + "learning_rate": 0.00011783014986047875, + "loss": 0.7319, + "step": 29809 + }, + { + "epoch": 0.765437210429435, + "grad_norm": 0.7890625, + "learning_rate": 0.00011782575717583133, + "loss": 0.8425, + "step": 29810 + }, + { + "epoch": 0.7654628876253569, + "grad_norm": 0.765625, + "learning_rate": 0.00011782136445565879, + "loss": 0.8021, + "step": 29811 + }, + { + "epoch": 0.7654885648212787, + "grad_norm": 0.8515625, + "learning_rate": 0.00011781697169996983, + "loss": 0.761, + "step": 29812 + }, + { + "epoch": 0.7655142420172005, + "grad_norm": 0.83203125, + "learning_rate": 0.00011781257890877331, + "loss": 0.9952, + "step": 29813 + }, + { + "epoch": 0.7655399192131224, + "grad_norm": 0.765625, + "learning_rate": 0.00011780818608207791, + "loss": 0.7227, + "step": 29814 + }, + { + "epoch": 0.7655655964090442, + "grad_norm": 0.83984375, + "learning_rate": 0.0001178037932198924, + "loss": 0.7478, + "step": 29815 + }, + { + "epoch": 0.7655912736049659, + "grad_norm": 0.73828125, + "learning_rate": 0.00011779940032222554, + "loss": 0.7793, + "step": 29816 + }, + { + "epoch": 0.7656169508008878, + "grad_norm": 0.76953125, + "learning_rate": 0.00011779500738908607, + "loss": 0.7468, + "step": 29817 + }, + { + "epoch": 0.7656426279968096, + "grad_norm": 0.7265625, + "learning_rate": 0.00011779061442048276, + "loss": 0.764, + "step": 29818 + }, + { + "epoch": 0.7656683051927314, + "grad_norm": 0.87890625, + "learning_rate": 0.00011778622141642437, + "loss": 0.8214, + "step": 29819 + }, + { + "epoch": 0.7656939823886533, + "grad_norm": 0.7578125, + "learning_rate": 0.00011778182837691965, + "loss": 0.8275, + "step": 29820 + }, + { + "epoch": 0.7657196595845751, + "grad_norm": 0.7265625, + "learning_rate": 0.00011777743530197736, + "loss": 0.8865, + "step": 29821 + }, + { + "epoch": 0.7657453367804968, + "grad_norm": 0.77734375, + "learning_rate": 0.00011777304219160623, + "loss": 0.7702, + "step": 29822 + }, + { + "epoch": 0.7657710139764187, + "grad_norm": 0.88671875, + "learning_rate": 0.00011776864904581503, + "loss": 0.8616, + "step": 29823 + }, + { + "epoch": 0.7657966911723405, + "grad_norm": 0.73828125, + "learning_rate": 0.00011776425586461253, + "loss": 0.7016, + "step": 29824 + }, + { + "epoch": 0.7658223683682623, + "grad_norm": 0.8203125, + "learning_rate": 0.00011775986264800747, + "loss": 0.9037, + "step": 29825 + }, + { + "epoch": 0.7658480455641842, + "grad_norm": 0.7734375, + "learning_rate": 0.0001177554693960086, + "loss": 0.8383, + "step": 29826 + }, + { + "epoch": 0.765873722760106, + "grad_norm": 0.7578125, + "learning_rate": 0.00011775107610862472, + "loss": 0.8033, + "step": 29827 + }, + { + "epoch": 0.7658993999560278, + "grad_norm": 0.765625, + "learning_rate": 0.00011774668278586449, + "loss": 0.8058, + "step": 29828 + }, + { + "epoch": 0.7659250771519496, + "grad_norm": 0.8125, + "learning_rate": 0.00011774228942773678, + "loss": 0.8354, + "step": 29829 + }, + { + "epoch": 0.7659507543478714, + "grad_norm": 0.74609375, + "learning_rate": 0.00011773789603425024, + "loss": 0.8557, + "step": 29830 + }, + { + "epoch": 0.7659764315437932, + "grad_norm": 0.74609375, + "learning_rate": 0.00011773350260541369, + "loss": 0.8867, + "step": 29831 + }, + { + "epoch": 0.7660021087397151, + "grad_norm": 0.703125, + "learning_rate": 0.0001177291091412359, + "loss": 0.7868, + "step": 29832 + }, + { + "epoch": 0.7660277859356369, + "grad_norm": 0.7734375, + "learning_rate": 0.00011772471564172558, + "loss": 0.9708, + "step": 29833 + }, + { + "epoch": 0.7660534631315588, + "grad_norm": 0.79296875, + "learning_rate": 0.0001177203221068915, + "loss": 0.8303, + "step": 29834 + }, + { + "epoch": 0.7660791403274806, + "grad_norm": 0.796875, + "learning_rate": 0.00011771592853674243, + "loss": 0.8612, + "step": 29835 + }, + { + "epoch": 0.7661048175234023, + "grad_norm": 0.76171875, + "learning_rate": 0.00011771153493128708, + "loss": 0.7896, + "step": 29836 + }, + { + "epoch": 0.7661304947193242, + "grad_norm": 0.96484375, + "learning_rate": 0.00011770714129053426, + "loss": 0.8219, + "step": 29837 + }, + { + "epoch": 0.766156171915246, + "grad_norm": 0.7109375, + "learning_rate": 0.00011770274761449274, + "loss": 0.846, + "step": 29838 + }, + { + "epoch": 0.7661818491111678, + "grad_norm": 0.671875, + "learning_rate": 0.00011769835390317118, + "loss": 0.8026, + "step": 29839 + }, + { + "epoch": 0.7662075263070897, + "grad_norm": 0.73828125, + "learning_rate": 0.00011769396015657846, + "loss": 0.7751, + "step": 29840 + }, + { + "epoch": 0.7662332035030115, + "grad_norm": 0.78125, + "learning_rate": 0.00011768956637472323, + "loss": 0.9117, + "step": 29841 + }, + { + "epoch": 0.7662588806989332, + "grad_norm": 0.75, + "learning_rate": 0.00011768517255761434, + "loss": 0.7826, + "step": 29842 + }, + { + "epoch": 0.7662845578948551, + "grad_norm": 0.8203125, + "learning_rate": 0.00011768077870526047, + "loss": 0.8264, + "step": 29843 + }, + { + "epoch": 0.7663102350907769, + "grad_norm": 0.7265625, + "learning_rate": 0.00011767638481767038, + "loss": 0.8288, + "step": 29844 + }, + { + "epoch": 0.7663359122866987, + "grad_norm": 0.796875, + "learning_rate": 0.00011767199089485288, + "loss": 0.9057, + "step": 29845 + }, + { + "epoch": 0.7663615894826206, + "grad_norm": 0.8125, + "learning_rate": 0.0001176675969368167, + "loss": 0.7868, + "step": 29846 + }, + { + "epoch": 0.7663872666785424, + "grad_norm": 0.8671875, + "learning_rate": 0.00011766320294357059, + "loss": 0.7317, + "step": 29847 + }, + { + "epoch": 0.7664129438744642, + "grad_norm": 0.78125, + "learning_rate": 0.00011765880891512331, + "loss": 0.8497, + "step": 29848 + }, + { + "epoch": 0.766438621070386, + "grad_norm": 0.73046875, + "learning_rate": 0.0001176544148514836, + "loss": 0.7359, + "step": 29849 + }, + { + "epoch": 0.7664642982663078, + "grad_norm": 0.80078125, + "learning_rate": 0.00011765002075266028, + "loss": 1.012, + "step": 29850 + }, + { + "epoch": 0.7664899754622296, + "grad_norm": 0.78515625, + "learning_rate": 0.00011764562661866203, + "loss": 0.8249, + "step": 29851 + }, + { + "epoch": 0.7665156526581515, + "grad_norm": 0.81640625, + "learning_rate": 0.00011764123244949765, + "loss": 0.8494, + "step": 29852 + }, + { + "epoch": 0.7665413298540733, + "grad_norm": 0.74609375, + "learning_rate": 0.0001176368382451759, + "loss": 0.8111, + "step": 29853 + }, + { + "epoch": 0.7665670070499951, + "grad_norm": 0.78515625, + "learning_rate": 0.00011763244400570552, + "loss": 0.7741, + "step": 29854 + }, + { + "epoch": 0.766592684245917, + "grad_norm": 0.81640625, + "learning_rate": 0.00011762804973109525, + "loss": 0.8204, + "step": 29855 + }, + { + "epoch": 0.7666183614418387, + "grad_norm": 0.796875, + "learning_rate": 0.00011762365542135389, + "loss": 0.9695, + "step": 29856 + }, + { + "epoch": 0.7666440386377605, + "grad_norm": 0.8125, + "learning_rate": 0.00011761926107649015, + "loss": 0.7786, + "step": 29857 + }, + { + "epoch": 0.7666697158336824, + "grad_norm": 0.87109375, + "learning_rate": 0.00011761486669651285, + "loss": 0.7588, + "step": 29858 + }, + { + "epoch": 0.7666953930296042, + "grad_norm": 0.77734375, + "learning_rate": 0.00011761047228143071, + "loss": 0.7975, + "step": 29859 + }, + { + "epoch": 0.766721070225526, + "grad_norm": 0.76171875, + "learning_rate": 0.00011760607783125245, + "loss": 0.7813, + "step": 29860 + }, + { + "epoch": 0.7667467474214479, + "grad_norm": 0.76171875, + "learning_rate": 0.00011760168334598691, + "loss": 0.6868, + "step": 29861 + }, + { + "epoch": 0.7667724246173696, + "grad_norm": 0.84765625, + "learning_rate": 0.00011759728882564278, + "loss": 0.8009, + "step": 29862 + }, + { + "epoch": 0.7667981018132914, + "grad_norm": 0.828125, + "learning_rate": 0.00011759289427022884, + "loss": 0.6955, + "step": 29863 + }, + { + "epoch": 0.7668237790092133, + "grad_norm": 0.828125, + "learning_rate": 0.00011758849967975387, + "loss": 0.8985, + "step": 29864 + }, + { + "epoch": 0.7668494562051351, + "grad_norm": 0.8671875, + "learning_rate": 0.00011758410505422661, + "loss": 0.8673, + "step": 29865 + }, + { + "epoch": 0.766875133401057, + "grad_norm": 0.796875, + "learning_rate": 0.0001175797103936558, + "loss": 0.7348, + "step": 29866 + }, + { + "epoch": 0.7669008105969788, + "grad_norm": 0.76171875, + "learning_rate": 0.00011757531569805022, + "loss": 0.8595, + "step": 29867 + }, + { + "epoch": 0.7669264877929006, + "grad_norm": 0.7734375, + "learning_rate": 0.00011757092096741862, + "loss": 0.7547, + "step": 29868 + }, + { + "epoch": 0.7669521649888223, + "grad_norm": 0.8359375, + "learning_rate": 0.00011756652620176977, + "loss": 0.7622, + "step": 29869 + }, + { + "epoch": 0.7669778421847442, + "grad_norm": 0.86328125, + "learning_rate": 0.00011756213140111243, + "loss": 0.8418, + "step": 29870 + }, + { + "epoch": 0.767003519380666, + "grad_norm": 0.8125, + "learning_rate": 0.00011755773656545533, + "loss": 0.8047, + "step": 29871 + }, + { + "epoch": 0.7670291965765879, + "grad_norm": 0.828125, + "learning_rate": 0.00011755334169480725, + "loss": 0.85, + "step": 29872 + }, + { + "epoch": 0.7670548737725097, + "grad_norm": 0.7890625, + "learning_rate": 0.00011754894678917696, + "loss": 0.776, + "step": 29873 + }, + { + "epoch": 0.7670805509684315, + "grad_norm": 0.83203125, + "learning_rate": 0.00011754455184857318, + "loss": 0.7944, + "step": 29874 + }, + { + "epoch": 0.7671062281643534, + "grad_norm": 0.765625, + "learning_rate": 0.00011754015687300473, + "loss": 0.849, + "step": 29875 + }, + { + "epoch": 0.7671319053602751, + "grad_norm": 0.83984375, + "learning_rate": 0.00011753576186248028, + "loss": 0.8878, + "step": 29876 + }, + { + "epoch": 0.7671575825561969, + "grad_norm": 0.81640625, + "learning_rate": 0.00011753136681700868, + "loss": 0.875, + "step": 29877 + }, + { + "epoch": 0.7671832597521188, + "grad_norm": 0.72265625, + "learning_rate": 0.00011752697173659863, + "loss": 0.7373, + "step": 29878 + }, + { + "epoch": 0.7672089369480406, + "grad_norm": 0.86328125, + "learning_rate": 0.00011752257662125892, + "loss": 0.8621, + "step": 29879 + }, + { + "epoch": 0.7672346141439624, + "grad_norm": 0.84375, + "learning_rate": 0.00011751818147099831, + "loss": 0.7845, + "step": 29880 + }, + { + "epoch": 0.7672602913398843, + "grad_norm": 0.8671875, + "learning_rate": 0.0001175137862858255, + "loss": 0.896, + "step": 29881 + }, + { + "epoch": 0.767285968535806, + "grad_norm": 0.73046875, + "learning_rate": 0.00011750939106574934, + "loss": 0.8485, + "step": 29882 + }, + { + "epoch": 0.7673116457317278, + "grad_norm": 0.8046875, + "learning_rate": 0.00011750499581077854, + "loss": 0.7653, + "step": 29883 + }, + { + "epoch": 0.7673373229276497, + "grad_norm": 1.171875, + "learning_rate": 0.00011750060052092185, + "loss": 0.7525, + "step": 29884 + }, + { + "epoch": 0.7673630001235715, + "grad_norm": 0.87109375, + "learning_rate": 0.00011749620519618805, + "loss": 0.7859, + "step": 29885 + }, + { + "epoch": 0.7673886773194933, + "grad_norm": 0.75, + "learning_rate": 0.0001174918098365859, + "loss": 0.9643, + "step": 29886 + }, + { + "epoch": 0.7674143545154152, + "grad_norm": 0.76171875, + "learning_rate": 0.00011748741444212414, + "loss": 0.8422, + "step": 29887 + }, + { + "epoch": 0.767440031711337, + "grad_norm": 0.765625, + "learning_rate": 0.00011748301901281156, + "loss": 0.8923, + "step": 29888 + }, + { + "epoch": 0.7674657089072587, + "grad_norm": 0.6953125, + "learning_rate": 0.00011747862354865686, + "loss": 0.7742, + "step": 29889 + }, + { + "epoch": 0.7674913861031806, + "grad_norm": 0.71875, + "learning_rate": 0.0001174742280496689, + "loss": 0.8824, + "step": 29890 + }, + { + "epoch": 0.7675170632991024, + "grad_norm": 0.87109375, + "learning_rate": 0.00011746983251585635, + "loss": 0.8841, + "step": 29891 + }, + { + "epoch": 0.7675427404950242, + "grad_norm": 0.7578125, + "learning_rate": 0.00011746543694722802, + "loss": 0.903, + "step": 29892 + }, + { + "epoch": 0.7675684176909461, + "grad_norm": 0.78515625, + "learning_rate": 0.00011746104134379263, + "loss": 0.9706, + "step": 29893 + }, + { + "epoch": 0.7675940948868679, + "grad_norm": 0.7421875, + "learning_rate": 0.00011745664570555896, + "loss": 0.7651, + "step": 29894 + }, + { + "epoch": 0.7676197720827898, + "grad_norm": 0.984375, + "learning_rate": 0.0001174522500325358, + "loss": 0.9971, + "step": 29895 + }, + { + "epoch": 0.7676454492787115, + "grad_norm": 0.84765625, + "learning_rate": 0.00011744785432473186, + "loss": 0.9636, + "step": 29896 + }, + { + "epoch": 0.7676711264746333, + "grad_norm": 0.8359375, + "learning_rate": 0.00011744345858215591, + "loss": 0.8182, + "step": 29897 + }, + { + "epoch": 0.7676968036705552, + "grad_norm": 0.7734375, + "learning_rate": 0.00011743906280481675, + "loss": 0.8278, + "step": 29898 + }, + { + "epoch": 0.767722480866477, + "grad_norm": 0.79296875, + "learning_rate": 0.00011743466699272314, + "loss": 0.9247, + "step": 29899 + }, + { + "epoch": 0.7677481580623988, + "grad_norm": 0.875, + "learning_rate": 0.00011743027114588375, + "loss": 0.9051, + "step": 29900 + }, + { + "epoch": 0.7677738352583207, + "grad_norm": 0.79296875, + "learning_rate": 0.00011742587526430744, + "loss": 0.7857, + "step": 29901 + }, + { + "epoch": 0.7677995124542424, + "grad_norm": 0.76953125, + "learning_rate": 0.00011742147934800294, + "loss": 0.8059, + "step": 29902 + }, + { + "epoch": 0.7678251896501642, + "grad_norm": 0.81640625, + "learning_rate": 0.00011741708339697897, + "loss": 0.7969, + "step": 29903 + }, + { + "epoch": 0.767850866846086, + "grad_norm": 0.76171875, + "learning_rate": 0.00011741268741124433, + "loss": 0.7716, + "step": 29904 + }, + { + "epoch": 0.7678765440420079, + "grad_norm": 0.82421875, + "learning_rate": 0.0001174082913908078, + "loss": 0.8525, + "step": 29905 + }, + { + "epoch": 0.7679022212379297, + "grad_norm": 0.78515625, + "learning_rate": 0.00011740389533567813, + "loss": 0.8181, + "step": 29906 + }, + { + "epoch": 0.7679278984338516, + "grad_norm": 0.83203125, + "learning_rate": 0.00011739949924586407, + "loss": 0.7995, + "step": 29907 + }, + { + "epoch": 0.7679535756297734, + "grad_norm": 0.86328125, + "learning_rate": 0.00011739510312137434, + "loss": 0.9623, + "step": 29908 + }, + { + "epoch": 0.7679792528256951, + "grad_norm": 0.734375, + "learning_rate": 0.00011739070696221777, + "loss": 0.7263, + "step": 29909 + }, + { + "epoch": 0.768004930021617, + "grad_norm": 0.7421875, + "learning_rate": 0.00011738631076840307, + "loss": 0.7362, + "step": 29910 + }, + { + "epoch": 0.7680306072175388, + "grad_norm": 0.796875, + "learning_rate": 0.00011738191453993902, + "loss": 0.9888, + "step": 29911 + }, + { + "epoch": 0.7680562844134606, + "grad_norm": 0.85546875, + "learning_rate": 0.00011737751827683442, + "loss": 0.9098, + "step": 29912 + }, + { + "epoch": 0.7680819616093825, + "grad_norm": 0.80078125, + "learning_rate": 0.00011737312197909798, + "loss": 0.9247, + "step": 29913 + }, + { + "epoch": 0.7681076388053043, + "grad_norm": 0.8046875, + "learning_rate": 0.00011736872564673846, + "loss": 0.7093, + "step": 29914 + }, + { + "epoch": 0.7681333160012261, + "grad_norm": 0.78125, + "learning_rate": 0.00011736432927976467, + "loss": 0.7888, + "step": 29915 + }, + { + "epoch": 0.7681589931971479, + "grad_norm": 0.7734375, + "learning_rate": 0.0001173599328781853, + "loss": 0.8651, + "step": 29916 + }, + { + "epoch": 0.7681846703930697, + "grad_norm": 0.8046875, + "learning_rate": 0.00011735553644200918, + "loss": 0.9165, + "step": 29917 + }, + { + "epoch": 0.7682103475889915, + "grad_norm": 0.765625, + "learning_rate": 0.00011735113997124505, + "loss": 0.6885, + "step": 29918 + }, + { + "epoch": 0.7682360247849134, + "grad_norm": 0.69921875, + "learning_rate": 0.00011734674346590165, + "loss": 0.8629, + "step": 29919 + }, + { + "epoch": 0.7682617019808352, + "grad_norm": 0.8203125, + "learning_rate": 0.00011734234692598778, + "loss": 0.9594, + "step": 29920 + }, + { + "epoch": 0.768287379176757, + "grad_norm": 0.74609375, + "learning_rate": 0.00011733795035151214, + "loss": 0.7808, + "step": 29921 + }, + { + "epoch": 0.7683130563726788, + "grad_norm": 0.7734375, + "learning_rate": 0.00011733355374248353, + "loss": 0.908, + "step": 29922 + }, + { + "epoch": 0.7683387335686006, + "grad_norm": 0.80078125, + "learning_rate": 0.00011732915709891074, + "loss": 0.8409, + "step": 29923 + }, + { + "epoch": 0.7683644107645224, + "grad_norm": 0.68359375, + "learning_rate": 0.00011732476042080249, + "loss": 0.7307, + "step": 29924 + }, + { + "epoch": 0.7683900879604443, + "grad_norm": 0.90625, + "learning_rate": 0.0001173203637081676, + "loss": 0.7148, + "step": 29925 + }, + { + "epoch": 0.7684157651563661, + "grad_norm": 1.0078125, + "learning_rate": 0.00011731596696101474, + "loss": 0.8598, + "step": 29926 + }, + { + "epoch": 0.768441442352288, + "grad_norm": 0.7421875, + "learning_rate": 0.00011731157017935275, + "loss": 0.7623, + "step": 29927 + }, + { + "epoch": 0.7684671195482098, + "grad_norm": 0.82421875, + "learning_rate": 0.00011730717336319035, + "loss": 0.8853, + "step": 29928 + }, + { + "epoch": 0.7684927967441315, + "grad_norm": 0.81640625, + "learning_rate": 0.00011730277651253629, + "loss": 0.7509, + "step": 29929 + }, + { + "epoch": 0.7685184739400533, + "grad_norm": 0.7890625, + "learning_rate": 0.0001172983796273994, + "loss": 0.7535, + "step": 29930 + }, + { + "epoch": 0.7685441511359752, + "grad_norm": 0.80859375, + "learning_rate": 0.0001172939827077884, + "loss": 0.8938, + "step": 29931 + }, + { + "epoch": 0.768569828331897, + "grad_norm": 0.7578125, + "learning_rate": 0.00011728958575371205, + "loss": 0.6213, + "step": 29932 + }, + { + "epoch": 0.7685955055278189, + "grad_norm": 0.765625, + "learning_rate": 0.00011728518876517913, + "loss": 0.8239, + "step": 29933 + }, + { + "epoch": 0.7686211827237407, + "grad_norm": 0.78515625, + "learning_rate": 0.00011728079174219835, + "loss": 0.8265, + "step": 29934 + }, + { + "epoch": 0.7686468599196625, + "grad_norm": 0.75390625, + "learning_rate": 0.00011727639468477853, + "loss": 0.7555, + "step": 29935 + }, + { + "epoch": 0.7686725371155843, + "grad_norm": 0.72265625, + "learning_rate": 0.00011727199759292844, + "loss": 0.7815, + "step": 29936 + }, + { + "epoch": 0.7686982143115061, + "grad_norm": 0.79296875, + "learning_rate": 0.00011726760046665677, + "loss": 0.8343, + "step": 29937 + }, + { + "epoch": 0.7687238915074279, + "grad_norm": 0.82421875, + "learning_rate": 0.0001172632033059724, + "loss": 0.8178, + "step": 29938 + }, + { + "epoch": 0.7687495687033498, + "grad_norm": 0.75390625, + "learning_rate": 0.000117258806110884, + "loss": 0.8368, + "step": 29939 + }, + { + "epoch": 0.7687752458992716, + "grad_norm": 0.86328125, + "learning_rate": 0.00011725440888140031, + "loss": 0.8907, + "step": 29940 + }, + { + "epoch": 0.7688009230951934, + "grad_norm": 0.71484375, + "learning_rate": 0.0001172500116175302, + "loss": 0.7789, + "step": 29941 + }, + { + "epoch": 0.7688266002911152, + "grad_norm": 0.97265625, + "learning_rate": 0.00011724561431928234, + "loss": 0.7526, + "step": 29942 + }, + { + "epoch": 0.768852277487037, + "grad_norm": 0.76171875, + "learning_rate": 0.00011724121698666554, + "loss": 0.8159, + "step": 29943 + }, + { + "epoch": 0.7688779546829588, + "grad_norm": 0.80859375, + "learning_rate": 0.00011723681961968855, + "loss": 0.8207, + "step": 29944 + }, + { + "epoch": 0.7689036318788807, + "grad_norm": 0.8046875, + "learning_rate": 0.00011723242221836015, + "loss": 0.8013, + "step": 29945 + }, + { + "epoch": 0.7689293090748025, + "grad_norm": 0.7734375, + "learning_rate": 0.00011722802478268908, + "loss": 0.7546, + "step": 29946 + }, + { + "epoch": 0.7689549862707243, + "grad_norm": 0.77734375, + "learning_rate": 0.00011722362731268412, + "loss": 0.8275, + "step": 29947 + }, + { + "epoch": 0.7689806634666462, + "grad_norm": 0.76953125, + "learning_rate": 0.000117219229808354, + "loss": 0.7515, + "step": 29948 + }, + { + "epoch": 0.7690063406625679, + "grad_norm": 0.87890625, + "learning_rate": 0.00011721483226970753, + "loss": 0.7878, + "step": 29949 + }, + { + "epoch": 0.7690320178584897, + "grad_norm": 0.80078125, + "learning_rate": 0.00011721043469675346, + "loss": 0.7058, + "step": 29950 + }, + { + "epoch": 0.7690576950544116, + "grad_norm": 0.83203125, + "learning_rate": 0.00011720603708950053, + "loss": 0.9879, + "step": 29951 + }, + { + "epoch": 0.7690833722503334, + "grad_norm": 0.8203125, + "learning_rate": 0.00011720163944795755, + "loss": 0.895, + "step": 29952 + }, + { + "epoch": 0.7691090494462552, + "grad_norm": 0.86328125, + "learning_rate": 0.00011719724177213322, + "loss": 0.8563, + "step": 29953 + }, + { + "epoch": 0.7691347266421771, + "grad_norm": 0.7890625, + "learning_rate": 0.00011719284406203637, + "loss": 0.7738, + "step": 29954 + }, + { + "epoch": 0.7691604038380988, + "grad_norm": 0.86328125, + "learning_rate": 0.00011718844631767573, + "loss": 0.8405, + "step": 29955 + }, + { + "epoch": 0.7691860810340206, + "grad_norm": 0.8046875, + "learning_rate": 0.00011718404853906004, + "loss": 0.9064, + "step": 29956 + }, + { + "epoch": 0.7692117582299425, + "grad_norm": 0.7578125, + "learning_rate": 0.00011717965072619812, + "loss": 0.8494, + "step": 29957 + }, + { + "epoch": 0.7692374354258643, + "grad_norm": 0.74609375, + "learning_rate": 0.00011717525287909871, + "loss": 0.8395, + "step": 29958 + }, + { + "epoch": 0.7692631126217861, + "grad_norm": 0.80078125, + "learning_rate": 0.00011717085499777059, + "loss": 0.8778, + "step": 29959 + }, + { + "epoch": 0.769288789817708, + "grad_norm": 0.78515625, + "learning_rate": 0.00011716645708222249, + "loss": 0.8094, + "step": 29960 + }, + { + "epoch": 0.7693144670136298, + "grad_norm": 0.765625, + "learning_rate": 0.00011716205913246317, + "loss": 0.7337, + "step": 29961 + }, + { + "epoch": 0.7693401442095515, + "grad_norm": 0.78515625, + "learning_rate": 0.00011715766114850141, + "loss": 0.8226, + "step": 29962 + }, + { + "epoch": 0.7693658214054734, + "grad_norm": 0.85546875, + "learning_rate": 0.00011715326313034602, + "loss": 0.7435, + "step": 29963 + }, + { + "epoch": 0.7693914986013952, + "grad_norm": 0.7734375, + "learning_rate": 0.00011714886507800571, + "loss": 0.7505, + "step": 29964 + }, + { + "epoch": 0.769417175797317, + "grad_norm": 0.875, + "learning_rate": 0.00011714446699148926, + "loss": 0.7975, + "step": 29965 + }, + { + "epoch": 0.7694428529932389, + "grad_norm": 0.828125, + "learning_rate": 0.00011714006887080544, + "loss": 0.8411, + "step": 29966 + }, + { + "epoch": 0.7694685301891607, + "grad_norm": 0.81640625, + "learning_rate": 0.000117135670715963, + "loss": 0.8858, + "step": 29967 + }, + { + "epoch": 0.7694942073850826, + "grad_norm": 0.77734375, + "learning_rate": 0.00011713127252697073, + "loss": 1.0038, + "step": 29968 + }, + { + "epoch": 0.7695198845810043, + "grad_norm": 0.8203125, + "learning_rate": 0.00011712687430383738, + "loss": 0.7833, + "step": 29969 + }, + { + "epoch": 0.7695455617769261, + "grad_norm": 0.76953125, + "learning_rate": 0.00011712247604657169, + "loss": 0.8204, + "step": 29970 + }, + { + "epoch": 0.769571238972848, + "grad_norm": 0.80078125, + "learning_rate": 0.00011711807775518249, + "loss": 0.7453, + "step": 29971 + }, + { + "epoch": 0.7695969161687698, + "grad_norm": 0.6875, + "learning_rate": 0.00011711367942967851, + "loss": 0.7202, + "step": 29972 + }, + { + "epoch": 0.7696225933646916, + "grad_norm": 0.85546875, + "learning_rate": 0.00011710928107006848, + "loss": 0.8466, + "step": 29973 + }, + { + "epoch": 0.7696482705606135, + "grad_norm": 0.8046875, + "learning_rate": 0.0001171048826763612, + "loss": 0.7398, + "step": 29974 + }, + { + "epoch": 0.7696739477565352, + "grad_norm": 0.77734375, + "learning_rate": 0.00011710048424856545, + "loss": 0.791, + "step": 29975 + }, + { + "epoch": 0.769699624952457, + "grad_norm": 0.81640625, + "learning_rate": 0.00011709608578668999, + "loss": 0.8683, + "step": 29976 + }, + { + "epoch": 0.7697253021483789, + "grad_norm": 0.84375, + "learning_rate": 0.00011709168729074353, + "loss": 0.899, + "step": 29977 + }, + { + "epoch": 0.7697509793443007, + "grad_norm": 0.71875, + "learning_rate": 0.00011708728876073494, + "loss": 0.7678, + "step": 29978 + }, + { + "epoch": 0.7697766565402225, + "grad_norm": 0.703125, + "learning_rate": 0.00011708289019667291, + "loss": 0.67, + "step": 29979 + }, + { + "epoch": 0.7698023337361444, + "grad_norm": 0.76953125, + "learning_rate": 0.00011707849159856618, + "loss": 0.7722, + "step": 29980 + }, + { + "epoch": 0.7698280109320662, + "grad_norm": 0.81640625, + "learning_rate": 0.00011707409296642362, + "loss": 0.8183, + "step": 29981 + }, + { + "epoch": 0.7698536881279879, + "grad_norm": 0.77734375, + "learning_rate": 0.00011706969430025387, + "loss": 0.8844, + "step": 29982 + }, + { + "epoch": 0.7698793653239098, + "grad_norm": 0.796875, + "learning_rate": 0.0001170652956000658, + "loss": 0.9964, + "step": 29983 + }, + { + "epoch": 0.7699050425198316, + "grad_norm": 0.6875, + "learning_rate": 0.00011706089686586816, + "loss": 0.7419, + "step": 29984 + }, + { + "epoch": 0.7699307197157534, + "grad_norm": 0.77734375, + "learning_rate": 0.00011705649809766965, + "loss": 0.7426, + "step": 29985 + }, + { + "epoch": 0.7699563969116753, + "grad_norm": 0.7890625, + "learning_rate": 0.00011705209929547911, + "loss": 0.8123, + "step": 29986 + }, + { + "epoch": 0.7699820741075971, + "grad_norm": 0.80859375, + "learning_rate": 0.00011704770045930527, + "loss": 0.874, + "step": 29987 + }, + { + "epoch": 0.770007751303519, + "grad_norm": 0.890625, + "learning_rate": 0.00011704330158915688, + "loss": 0.9085, + "step": 29988 + }, + { + "epoch": 0.7700334284994407, + "grad_norm": 0.7890625, + "learning_rate": 0.00011703890268504276, + "loss": 0.8623, + "step": 29989 + }, + { + "epoch": 0.7700591056953625, + "grad_norm": 0.74609375, + "learning_rate": 0.00011703450374697164, + "loss": 0.8031, + "step": 29990 + }, + { + "epoch": 0.7700847828912843, + "grad_norm": 0.8359375, + "learning_rate": 0.00011703010477495229, + "loss": 0.8134, + "step": 29991 + }, + { + "epoch": 0.7701104600872062, + "grad_norm": 0.98046875, + "learning_rate": 0.00011702570576899347, + "loss": 0.8462, + "step": 29992 + }, + { + "epoch": 0.770136137283128, + "grad_norm": 0.80859375, + "learning_rate": 0.00011702130672910394, + "loss": 0.8216, + "step": 29993 + }, + { + "epoch": 0.7701618144790499, + "grad_norm": 0.765625, + "learning_rate": 0.00011701690765529251, + "loss": 0.8113, + "step": 29994 + }, + { + "epoch": 0.7701874916749716, + "grad_norm": 0.765625, + "learning_rate": 0.00011701250854756793, + "loss": 0.753, + "step": 29995 + }, + { + "epoch": 0.7702131688708934, + "grad_norm": 0.80078125, + "learning_rate": 0.00011700810940593891, + "loss": 0.8585, + "step": 29996 + }, + { + "epoch": 0.7702388460668153, + "grad_norm": 0.76171875, + "learning_rate": 0.00011700371023041432, + "loss": 0.7659, + "step": 29997 + }, + { + "epoch": 0.7702645232627371, + "grad_norm": 0.73828125, + "learning_rate": 0.00011699931102100285, + "loss": 0.7572, + "step": 29998 + }, + { + "epoch": 0.7702902004586589, + "grad_norm": 0.796875, + "learning_rate": 0.00011699491177771329, + "loss": 0.96, + "step": 29999 + }, + { + "epoch": 0.7703158776545808, + "grad_norm": 0.7890625, + "learning_rate": 0.00011699051250055442, + "loss": 0.8763, + "step": 30000 + }, + { + "epoch": 0.7703158776545808, + "eval_loss": 0.8166946172714233, + "eval_runtime": 387.1891, + "eval_samples_per_second": 25.827, + "eval_steps_per_second": 0.808, + "step": 30000 + }, + { + "epoch": 0.7703415548505026, + "grad_norm": 0.7890625, + "learning_rate": 0.00011698611318953494, + "loss": 0.9043, + "step": 30001 + }, + { + "epoch": 0.7703672320464243, + "grad_norm": 0.83984375, + "learning_rate": 0.00011698171384466372, + "loss": 0.8628, + "step": 30002 + }, + { + "epoch": 0.7703929092423462, + "grad_norm": 0.7890625, + "learning_rate": 0.00011697731446594946, + "loss": 0.7379, + "step": 30003 + }, + { + "epoch": 0.770418586438268, + "grad_norm": 0.80859375, + "learning_rate": 0.00011697291505340095, + "loss": 0.8892, + "step": 30004 + }, + { + "epoch": 0.7704442636341898, + "grad_norm": 0.7734375, + "learning_rate": 0.00011696851560702697, + "loss": 0.7717, + "step": 30005 + }, + { + "epoch": 0.7704699408301117, + "grad_norm": 0.78125, + "learning_rate": 0.00011696411612683624, + "loss": 0.9251, + "step": 30006 + }, + { + "epoch": 0.7704956180260335, + "grad_norm": 0.75, + "learning_rate": 0.00011695971661283758, + "loss": 0.8244, + "step": 30007 + }, + { + "epoch": 0.7705212952219553, + "grad_norm": 0.77734375, + "learning_rate": 0.00011695531706503973, + "loss": 0.7736, + "step": 30008 + }, + { + "epoch": 0.7705469724178771, + "grad_norm": 0.703125, + "learning_rate": 0.00011695091748345146, + "loss": 0.7899, + "step": 30009 + }, + { + "epoch": 0.7705726496137989, + "grad_norm": 0.8515625, + "learning_rate": 0.00011694651786808155, + "loss": 0.9701, + "step": 30010 + }, + { + "epoch": 0.7705983268097207, + "grad_norm": 0.76171875, + "learning_rate": 0.00011694211821893877, + "loss": 0.8843, + "step": 30011 + }, + { + "epoch": 0.7706240040056426, + "grad_norm": 0.71484375, + "learning_rate": 0.00011693771853603186, + "loss": 0.7525, + "step": 30012 + }, + { + "epoch": 0.7706496812015644, + "grad_norm": 0.7578125, + "learning_rate": 0.00011693331881936961, + "loss": 0.7702, + "step": 30013 + }, + { + "epoch": 0.7706753583974862, + "grad_norm": 0.8046875, + "learning_rate": 0.00011692891906896078, + "loss": 0.8504, + "step": 30014 + }, + { + "epoch": 0.770701035593408, + "grad_norm": 0.78515625, + "learning_rate": 0.00011692451928481416, + "loss": 0.7562, + "step": 30015 + }, + { + "epoch": 0.7707267127893298, + "grad_norm": 0.78515625, + "learning_rate": 0.00011692011946693851, + "loss": 0.8657, + "step": 30016 + }, + { + "epoch": 0.7707523899852516, + "grad_norm": 0.66015625, + "learning_rate": 0.00011691571961534258, + "loss": 0.7423, + "step": 30017 + }, + { + "epoch": 0.7707780671811735, + "grad_norm": 0.75390625, + "learning_rate": 0.00011691131973003515, + "loss": 0.7602, + "step": 30018 + }, + { + "epoch": 0.7708037443770953, + "grad_norm": 0.78125, + "learning_rate": 0.00011690691981102495, + "loss": 0.6729, + "step": 30019 + }, + { + "epoch": 0.7708294215730171, + "grad_norm": 0.70703125, + "learning_rate": 0.00011690251985832083, + "loss": 0.833, + "step": 30020 + }, + { + "epoch": 0.770855098768939, + "grad_norm": 0.8125, + "learning_rate": 0.0001168981198719315, + "loss": 0.7808, + "step": 30021 + }, + { + "epoch": 0.7708807759648607, + "grad_norm": 0.76953125, + "learning_rate": 0.00011689371985186572, + "loss": 0.8767, + "step": 30022 + }, + { + "epoch": 0.7709064531607825, + "grad_norm": 0.84765625, + "learning_rate": 0.00011688931979813234, + "loss": 0.8849, + "step": 30023 + }, + { + "epoch": 0.7709321303567044, + "grad_norm": 0.7890625, + "learning_rate": 0.00011688491971074005, + "loss": 0.8478, + "step": 30024 + }, + { + "epoch": 0.7709578075526262, + "grad_norm": 0.82421875, + "learning_rate": 0.00011688051958969761, + "loss": 0.955, + "step": 30025 + }, + { + "epoch": 0.770983484748548, + "grad_norm": 0.69921875, + "learning_rate": 0.00011687611943501384, + "loss": 0.7435, + "step": 30026 + }, + { + "epoch": 0.7710091619444699, + "grad_norm": 0.7734375, + "learning_rate": 0.0001168717192466975, + "loss": 0.8514, + "step": 30027 + }, + { + "epoch": 0.7710348391403917, + "grad_norm": 0.90625, + "learning_rate": 0.00011686731902475731, + "loss": 0.8603, + "step": 30028 + }, + { + "epoch": 0.7710605163363135, + "grad_norm": 0.73046875, + "learning_rate": 0.00011686291876920212, + "loss": 0.8189, + "step": 30029 + }, + { + "epoch": 0.7710861935322353, + "grad_norm": 0.84375, + "learning_rate": 0.00011685851848004064, + "loss": 0.8393, + "step": 30030 + }, + { + "epoch": 0.7711118707281571, + "grad_norm": 0.80859375, + "learning_rate": 0.00011685411815728166, + "loss": 0.9101, + "step": 30031 + }, + { + "epoch": 0.771137547924079, + "grad_norm": 0.796875, + "learning_rate": 0.00011684971780093395, + "loss": 0.7456, + "step": 30032 + }, + { + "epoch": 0.7711632251200008, + "grad_norm": 0.7734375, + "learning_rate": 0.00011684531741100623, + "loss": 0.7718, + "step": 30033 + }, + { + "epoch": 0.7711889023159226, + "grad_norm": 0.78125, + "learning_rate": 0.00011684091698750737, + "loss": 0.8574, + "step": 30034 + }, + { + "epoch": 0.7712145795118444, + "grad_norm": 0.7734375, + "learning_rate": 0.00011683651653044605, + "loss": 0.746, + "step": 30035 + }, + { + "epoch": 0.7712402567077662, + "grad_norm": 0.7890625, + "learning_rate": 0.00011683211603983106, + "loss": 0.8103, + "step": 30036 + }, + { + "epoch": 0.771265933903688, + "grad_norm": 0.7265625, + "learning_rate": 0.00011682771551567125, + "loss": 0.747, + "step": 30037 + }, + { + "epoch": 0.7712916110996099, + "grad_norm": 0.75390625, + "learning_rate": 0.00011682331495797524, + "loss": 0.8967, + "step": 30038 + }, + { + "epoch": 0.7713172882955317, + "grad_norm": 0.6796875, + "learning_rate": 0.00011681891436675194, + "loss": 0.7668, + "step": 30039 + }, + { + "epoch": 0.7713429654914535, + "grad_norm": 0.78125, + "learning_rate": 0.00011681451374201004, + "loss": 0.8492, + "step": 30040 + }, + { + "epoch": 0.7713686426873754, + "grad_norm": 0.7890625, + "learning_rate": 0.00011681011308375833, + "loss": 0.7893, + "step": 30041 + }, + { + "epoch": 0.7713943198832971, + "grad_norm": 0.8046875, + "learning_rate": 0.00011680571239200559, + "loss": 0.7726, + "step": 30042 + }, + { + "epoch": 0.7714199970792189, + "grad_norm": 0.7890625, + "learning_rate": 0.0001168013116667606, + "loss": 0.8355, + "step": 30043 + }, + { + "epoch": 0.7714456742751408, + "grad_norm": 0.78125, + "learning_rate": 0.0001167969109080321, + "loss": 0.8336, + "step": 30044 + }, + { + "epoch": 0.7714713514710626, + "grad_norm": 1.25, + "learning_rate": 0.00011679251011582888, + "loss": 0.7411, + "step": 30045 + }, + { + "epoch": 0.7714970286669844, + "grad_norm": 0.765625, + "learning_rate": 0.00011678810929015968, + "loss": 0.94, + "step": 30046 + }, + { + "epoch": 0.7715227058629063, + "grad_norm": 0.734375, + "learning_rate": 0.00011678370843103333, + "loss": 0.7919, + "step": 30047 + }, + { + "epoch": 0.7715483830588281, + "grad_norm": 0.75390625, + "learning_rate": 0.00011677930753845854, + "loss": 0.9011, + "step": 30048 + }, + { + "epoch": 0.7715740602547498, + "grad_norm": 0.89453125, + "learning_rate": 0.0001167749066124441, + "loss": 0.8682, + "step": 30049 + }, + { + "epoch": 0.7715997374506717, + "grad_norm": 0.8671875, + "learning_rate": 0.00011677050565299881, + "loss": 0.7754, + "step": 30050 + }, + { + "epoch": 0.7716254146465935, + "grad_norm": 0.77734375, + "learning_rate": 0.0001167661046601314, + "loss": 0.8891, + "step": 30051 + }, + { + "epoch": 0.7716510918425153, + "grad_norm": 0.79296875, + "learning_rate": 0.00011676170363385066, + "loss": 0.8523, + "step": 30052 + }, + { + "epoch": 0.7716767690384372, + "grad_norm": 0.77734375, + "learning_rate": 0.00011675730257416537, + "loss": 0.7726, + "step": 30053 + }, + { + "epoch": 0.771702446234359, + "grad_norm": 0.8359375, + "learning_rate": 0.00011675290148108426, + "loss": 0.8504, + "step": 30054 + }, + { + "epoch": 0.7717281234302807, + "grad_norm": 0.75, + "learning_rate": 0.00011674850035461618, + "loss": 0.8265, + "step": 30055 + }, + { + "epoch": 0.7717538006262026, + "grad_norm": 0.79296875, + "learning_rate": 0.00011674409919476981, + "loss": 0.8213, + "step": 30056 + }, + { + "epoch": 0.7717794778221244, + "grad_norm": 0.85546875, + "learning_rate": 0.00011673969800155397, + "loss": 0.894, + "step": 30057 + }, + { + "epoch": 0.7718051550180463, + "grad_norm": 0.7109375, + "learning_rate": 0.00011673529677497745, + "loss": 0.8669, + "step": 30058 + }, + { + "epoch": 0.7718308322139681, + "grad_norm": 0.8046875, + "learning_rate": 0.00011673089551504897, + "loss": 0.8284, + "step": 30059 + }, + { + "epoch": 0.7718565094098899, + "grad_norm": 0.85546875, + "learning_rate": 0.00011672649422177733, + "loss": 0.9102, + "step": 30060 + }, + { + "epoch": 0.7718821866058118, + "grad_norm": 0.76953125, + "learning_rate": 0.0001167220928951713, + "loss": 0.8371, + "step": 30061 + }, + { + "epoch": 0.7719078638017335, + "grad_norm": 0.79296875, + "learning_rate": 0.00011671769153523964, + "loss": 0.8783, + "step": 30062 + }, + { + "epoch": 0.7719335409976553, + "grad_norm": 0.734375, + "learning_rate": 0.00011671329014199117, + "loss": 0.6811, + "step": 30063 + }, + { + "epoch": 0.7719592181935772, + "grad_norm": 0.73828125, + "learning_rate": 0.0001167088887154346, + "loss": 0.7786, + "step": 30064 + }, + { + "epoch": 0.771984895389499, + "grad_norm": 0.83984375, + "learning_rate": 0.0001167044872555787, + "loss": 0.9021, + "step": 30065 + }, + { + "epoch": 0.7720105725854208, + "grad_norm": 0.7265625, + "learning_rate": 0.00011670008576243228, + "loss": 0.8812, + "step": 30066 + }, + { + "epoch": 0.7720362497813427, + "grad_norm": 0.74609375, + "learning_rate": 0.00011669568423600407, + "loss": 0.7498, + "step": 30067 + }, + { + "epoch": 0.7720619269772645, + "grad_norm": 0.7421875, + "learning_rate": 0.00011669128267630291, + "loss": 0.7948, + "step": 30068 + }, + { + "epoch": 0.7720876041731862, + "grad_norm": 0.83984375, + "learning_rate": 0.00011668688108333754, + "loss": 0.8072, + "step": 30069 + }, + { + "epoch": 0.7721132813691081, + "grad_norm": 0.75390625, + "learning_rate": 0.0001166824794571167, + "loss": 0.7135, + "step": 30070 + }, + { + "epoch": 0.7721389585650299, + "grad_norm": 0.6953125, + "learning_rate": 0.0001166780777976492, + "loss": 0.6453, + "step": 30071 + }, + { + "epoch": 0.7721646357609517, + "grad_norm": 0.7578125, + "learning_rate": 0.0001166736761049438, + "loss": 0.9271, + "step": 30072 + }, + { + "epoch": 0.7721903129568736, + "grad_norm": 0.77734375, + "learning_rate": 0.00011666927437900924, + "loss": 0.764, + "step": 30073 + }, + { + "epoch": 0.7722159901527954, + "grad_norm": 0.83203125, + "learning_rate": 0.00011666487261985435, + "loss": 0.7596, + "step": 30074 + }, + { + "epoch": 0.7722416673487171, + "grad_norm": 0.796875, + "learning_rate": 0.00011666047082748786, + "loss": 0.8317, + "step": 30075 + }, + { + "epoch": 0.772267344544639, + "grad_norm": 0.7578125, + "learning_rate": 0.00011665606900191855, + "loss": 0.9552, + "step": 30076 + }, + { + "epoch": 0.7722930217405608, + "grad_norm": 0.69140625, + "learning_rate": 0.00011665166714315523, + "loss": 0.6694, + "step": 30077 + }, + { + "epoch": 0.7723186989364826, + "grad_norm": 0.7578125, + "learning_rate": 0.00011664726525120659, + "loss": 0.7313, + "step": 30078 + }, + { + "epoch": 0.7723443761324045, + "grad_norm": 0.87890625, + "learning_rate": 0.0001166428633260815, + "loss": 0.8446, + "step": 30079 + }, + { + "epoch": 0.7723700533283263, + "grad_norm": 0.703125, + "learning_rate": 0.00011663846136778866, + "loss": 0.9042, + "step": 30080 + }, + { + "epoch": 0.7723957305242481, + "grad_norm": 0.8203125, + "learning_rate": 0.00011663405937633686, + "loss": 0.7959, + "step": 30081 + }, + { + "epoch": 0.7724214077201699, + "grad_norm": 0.85546875, + "learning_rate": 0.00011662965735173492, + "loss": 0.7625, + "step": 30082 + }, + { + "epoch": 0.7724470849160917, + "grad_norm": 0.77734375, + "learning_rate": 0.00011662525529399157, + "loss": 0.8829, + "step": 30083 + }, + { + "epoch": 0.7724727621120135, + "grad_norm": 0.8203125, + "learning_rate": 0.00011662085320311557, + "loss": 0.8594, + "step": 30084 + }, + { + "epoch": 0.7724984393079354, + "grad_norm": 0.984375, + "learning_rate": 0.00011661645107911575, + "loss": 0.9346, + "step": 30085 + }, + { + "epoch": 0.7725241165038572, + "grad_norm": 0.859375, + "learning_rate": 0.00011661204892200077, + "loss": 0.8562, + "step": 30086 + }, + { + "epoch": 0.772549793699779, + "grad_norm": 0.77734375, + "learning_rate": 0.00011660764673177953, + "loss": 0.8079, + "step": 30087 + }, + { + "epoch": 0.7725754708957009, + "grad_norm": 0.79296875, + "learning_rate": 0.00011660324450846074, + "loss": 0.9401, + "step": 30088 + }, + { + "epoch": 0.7726011480916226, + "grad_norm": 0.8203125, + "learning_rate": 0.00011659884225205317, + "loss": 0.8932, + "step": 30089 + }, + { + "epoch": 0.7726268252875444, + "grad_norm": 0.75, + "learning_rate": 0.00011659443996256565, + "loss": 0.7211, + "step": 30090 + }, + { + "epoch": 0.7726525024834663, + "grad_norm": 0.80078125, + "learning_rate": 0.00011659003764000684, + "loss": 0.8701, + "step": 30091 + }, + { + "epoch": 0.7726781796793881, + "grad_norm": 0.7578125, + "learning_rate": 0.00011658563528438564, + "loss": 0.7493, + "step": 30092 + }, + { + "epoch": 0.77270385687531, + "grad_norm": 0.78515625, + "learning_rate": 0.00011658123289571075, + "loss": 0.8829, + "step": 30093 + }, + { + "epoch": 0.7727295340712318, + "grad_norm": 0.84375, + "learning_rate": 0.00011657683047399094, + "loss": 0.8485, + "step": 30094 + }, + { + "epoch": 0.7727552112671535, + "grad_norm": 0.79296875, + "learning_rate": 0.00011657242801923503, + "loss": 0.8413, + "step": 30095 + }, + { + "epoch": 0.7727808884630754, + "grad_norm": 0.765625, + "learning_rate": 0.00011656802553145176, + "loss": 0.8186, + "step": 30096 + }, + { + "epoch": 0.7728065656589972, + "grad_norm": 0.72265625, + "learning_rate": 0.00011656362301064993, + "loss": 0.7943, + "step": 30097 + }, + { + "epoch": 0.772832242854919, + "grad_norm": 0.7578125, + "learning_rate": 0.0001165592204568383, + "loss": 0.737, + "step": 30098 + }, + { + "epoch": 0.7728579200508409, + "grad_norm": 0.84375, + "learning_rate": 0.00011655481787002558, + "loss": 0.704, + "step": 30099 + }, + { + "epoch": 0.7728835972467627, + "grad_norm": 0.78125, + "learning_rate": 0.00011655041525022066, + "loss": 0.8598, + "step": 30100 + }, + { + "epoch": 0.7729092744426845, + "grad_norm": 0.84375, + "learning_rate": 0.00011654601259743223, + "loss": 0.8651, + "step": 30101 + }, + { + "epoch": 0.7729349516386063, + "grad_norm": 0.72265625, + "learning_rate": 0.0001165416099116691, + "loss": 0.8216, + "step": 30102 + }, + { + "epoch": 0.7729606288345281, + "grad_norm": 0.7578125, + "learning_rate": 0.00011653720719294005, + "loss": 0.8411, + "step": 30103 + }, + { + "epoch": 0.7729863060304499, + "grad_norm": 0.7421875, + "learning_rate": 0.00011653280444125383, + "loss": 0.7035, + "step": 30104 + }, + { + "epoch": 0.7730119832263718, + "grad_norm": 0.84765625, + "learning_rate": 0.0001165284016566192, + "loss": 0.8918, + "step": 30105 + }, + { + "epoch": 0.7730376604222936, + "grad_norm": 0.953125, + "learning_rate": 0.00011652399883904499, + "loss": 0.8446, + "step": 30106 + }, + { + "epoch": 0.7730633376182154, + "grad_norm": 0.80078125, + "learning_rate": 0.00011651959598853992, + "loss": 0.7674, + "step": 30107 + }, + { + "epoch": 0.7730890148141373, + "grad_norm": 0.73046875, + "learning_rate": 0.00011651519310511281, + "loss": 0.8007, + "step": 30108 + }, + { + "epoch": 0.773114692010059, + "grad_norm": 0.88671875, + "learning_rate": 0.00011651079018877243, + "loss": 1.0272, + "step": 30109 + }, + { + "epoch": 0.7731403692059808, + "grad_norm": 0.8828125, + "learning_rate": 0.00011650638723952747, + "loss": 0.7283, + "step": 30110 + }, + { + "epoch": 0.7731660464019027, + "grad_norm": 0.76171875, + "learning_rate": 0.00011650198425738683, + "loss": 0.7363, + "step": 30111 + }, + { + "epoch": 0.7731917235978245, + "grad_norm": 0.7734375, + "learning_rate": 0.00011649758124235921, + "loss": 0.7997, + "step": 30112 + }, + { + "epoch": 0.7732174007937463, + "grad_norm": 0.7421875, + "learning_rate": 0.00011649317819445337, + "loss": 0.9806, + "step": 30113 + }, + { + "epoch": 0.7732430779896682, + "grad_norm": 0.8671875, + "learning_rate": 0.00011648877511367814, + "loss": 0.8341, + "step": 30114 + }, + { + "epoch": 0.7732687551855899, + "grad_norm": 0.796875, + "learning_rate": 0.00011648437200004229, + "loss": 0.8114, + "step": 30115 + }, + { + "epoch": 0.7732944323815117, + "grad_norm": 0.703125, + "learning_rate": 0.00011647996885355456, + "loss": 0.7204, + "step": 30116 + }, + { + "epoch": 0.7733201095774336, + "grad_norm": 0.828125, + "learning_rate": 0.00011647556567422376, + "loss": 0.8806, + "step": 30117 + }, + { + "epoch": 0.7733457867733554, + "grad_norm": 0.7421875, + "learning_rate": 0.00011647116246205861, + "loss": 0.6407, + "step": 30118 + }, + { + "epoch": 0.7733714639692773, + "grad_norm": 0.76171875, + "learning_rate": 0.00011646675921706794, + "loss": 0.7663, + "step": 30119 + }, + { + "epoch": 0.7733971411651991, + "grad_norm": 0.7734375, + "learning_rate": 0.0001164623559392605, + "loss": 0.8295, + "step": 30120 + }, + { + "epoch": 0.7734228183611209, + "grad_norm": 0.71875, + "learning_rate": 0.00011645795262864505, + "loss": 0.7292, + "step": 30121 + }, + { + "epoch": 0.7734484955570426, + "grad_norm": 0.8125, + "learning_rate": 0.0001164535492852304, + "loss": 0.7487, + "step": 30122 + }, + { + "epoch": 0.7734741727529645, + "grad_norm": 0.765625, + "learning_rate": 0.00011644914590902534, + "loss": 0.8637, + "step": 30123 + }, + { + "epoch": 0.7734998499488863, + "grad_norm": 0.78125, + "learning_rate": 0.00011644474250003861, + "loss": 0.8238, + "step": 30124 + }, + { + "epoch": 0.7735255271448082, + "grad_norm": 0.7734375, + "learning_rate": 0.00011644033905827897, + "loss": 0.926, + "step": 30125 + }, + { + "epoch": 0.77355120434073, + "grad_norm": 0.8359375, + "learning_rate": 0.00011643593558375522, + "loss": 0.8989, + "step": 30126 + }, + { + "epoch": 0.7735768815366518, + "grad_norm": 0.76953125, + "learning_rate": 0.00011643153207647616, + "loss": 0.7405, + "step": 30127 + }, + { + "epoch": 0.7736025587325737, + "grad_norm": 0.765625, + "learning_rate": 0.00011642712853645054, + "loss": 0.757, + "step": 30128 + }, + { + "epoch": 0.7736282359284954, + "grad_norm": 0.71484375, + "learning_rate": 0.00011642272496368714, + "loss": 0.7673, + "step": 30129 + }, + { + "epoch": 0.7736539131244172, + "grad_norm": 0.8125, + "learning_rate": 0.0001164183213581947, + "loss": 0.7787, + "step": 30130 + }, + { + "epoch": 0.7736795903203391, + "grad_norm": 0.75390625, + "learning_rate": 0.00011641391771998205, + "loss": 0.831, + "step": 30131 + }, + { + "epoch": 0.7737052675162609, + "grad_norm": 0.765625, + "learning_rate": 0.00011640951404905793, + "loss": 0.7893, + "step": 30132 + }, + { + "epoch": 0.7737309447121827, + "grad_norm": 0.8515625, + "learning_rate": 0.00011640511034543115, + "loss": 0.8195, + "step": 30133 + }, + { + "epoch": 0.7737566219081046, + "grad_norm": 0.796875, + "learning_rate": 0.00011640070660911044, + "loss": 0.794, + "step": 30134 + }, + { + "epoch": 0.7737822991040263, + "grad_norm": 0.75390625, + "learning_rate": 0.00011639630284010463, + "loss": 0.7827, + "step": 30135 + }, + { + "epoch": 0.7738079762999481, + "grad_norm": 0.78515625, + "learning_rate": 0.00011639189903842247, + "loss": 0.8281, + "step": 30136 + }, + { + "epoch": 0.77383365349587, + "grad_norm": 0.8203125, + "learning_rate": 0.00011638749520407274, + "loss": 0.8896, + "step": 30137 + }, + { + "epoch": 0.7738593306917918, + "grad_norm": 0.7578125, + "learning_rate": 0.0001163830913370642, + "loss": 0.8291, + "step": 30138 + }, + { + "epoch": 0.7738850078877136, + "grad_norm": 0.828125, + "learning_rate": 0.00011637868743740563, + "loss": 0.765, + "step": 30139 + }, + { + "epoch": 0.7739106850836355, + "grad_norm": 0.77734375, + "learning_rate": 0.00011637428350510582, + "loss": 0.8051, + "step": 30140 + }, + { + "epoch": 0.7739363622795573, + "grad_norm": 0.78515625, + "learning_rate": 0.00011636987954017356, + "loss": 0.809, + "step": 30141 + }, + { + "epoch": 0.773962039475479, + "grad_norm": 0.82421875, + "learning_rate": 0.00011636547554261761, + "loss": 0.692, + "step": 30142 + }, + { + "epoch": 0.7739877166714009, + "grad_norm": 0.7890625, + "learning_rate": 0.00011636107151244673, + "loss": 0.889, + "step": 30143 + }, + { + "epoch": 0.7740133938673227, + "grad_norm": 0.65625, + "learning_rate": 0.00011635666744966969, + "loss": 0.6886, + "step": 30144 + }, + { + "epoch": 0.7740390710632445, + "grad_norm": 0.75, + "learning_rate": 0.00011635226335429532, + "loss": 0.7648, + "step": 30145 + }, + { + "epoch": 0.7740647482591664, + "grad_norm": 0.84765625, + "learning_rate": 0.00011634785922633238, + "loss": 0.82, + "step": 30146 + }, + { + "epoch": 0.7740904254550882, + "grad_norm": 0.76953125, + "learning_rate": 0.00011634345506578958, + "loss": 0.7763, + "step": 30147 + }, + { + "epoch": 0.77411610265101, + "grad_norm": 0.80859375, + "learning_rate": 0.00011633905087267578, + "loss": 0.9132, + "step": 30148 + }, + { + "epoch": 0.7741417798469318, + "grad_norm": 0.78125, + "learning_rate": 0.00011633464664699978, + "loss": 0.8173, + "step": 30149 + }, + { + "epoch": 0.7741674570428536, + "grad_norm": 0.85546875, + "learning_rate": 0.00011633024238877022, + "loss": 0.8494, + "step": 30150 + }, + { + "epoch": 0.7741931342387754, + "grad_norm": 0.796875, + "learning_rate": 0.000116325838097996, + "loss": 0.8775, + "step": 30151 + }, + { + "epoch": 0.7742188114346973, + "grad_norm": 0.82421875, + "learning_rate": 0.00011632143377468588, + "loss": 0.8756, + "step": 30152 + }, + { + "epoch": 0.7742444886306191, + "grad_norm": 0.8828125, + "learning_rate": 0.00011631702941884856, + "loss": 0.7634, + "step": 30153 + }, + { + "epoch": 0.774270165826541, + "grad_norm": 0.796875, + "learning_rate": 0.00011631262503049292, + "loss": 0.9482, + "step": 30154 + }, + { + "epoch": 0.7742958430224627, + "grad_norm": 0.8515625, + "learning_rate": 0.00011630822060962769, + "loss": 0.8704, + "step": 30155 + }, + { + "epoch": 0.7743215202183845, + "grad_norm": 0.80859375, + "learning_rate": 0.00011630381615626165, + "loss": 0.8172, + "step": 30156 + }, + { + "epoch": 0.7743471974143064, + "grad_norm": 0.73828125, + "learning_rate": 0.00011629941167040357, + "loss": 0.8139, + "step": 30157 + }, + { + "epoch": 0.7743728746102282, + "grad_norm": 0.76171875, + "learning_rate": 0.00011629500715206221, + "loss": 0.7941, + "step": 30158 + }, + { + "epoch": 0.77439855180615, + "grad_norm": 0.7578125, + "learning_rate": 0.0001162906026012464, + "loss": 0.8831, + "step": 30159 + }, + { + "epoch": 0.7744242290020719, + "grad_norm": 0.8671875, + "learning_rate": 0.00011628619801796488, + "loss": 0.8252, + "step": 30160 + }, + { + "epoch": 0.7744499061979937, + "grad_norm": 0.7578125, + "learning_rate": 0.00011628179340222643, + "loss": 0.7631, + "step": 30161 + }, + { + "epoch": 0.7744755833939154, + "grad_norm": 0.8203125, + "learning_rate": 0.00011627738875403986, + "loss": 0.9224, + "step": 30162 + }, + { + "epoch": 0.7745012605898373, + "grad_norm": 0.73828125, + "learning_rate": 0.00011627298407341392, + "loss": 0.8093, + "step": 30163 + }, + { + "epoch": 0.7745269377857591, + "grad_norm": 0.76953125, + "learning_rate": 0.00011626857936035738, + "loss": 0.8781, + "step": 30164 + }, + { + "epoch": 0.7745526149816809, + "grad_norm": 0.74609375, + "learning_rate": 0.00011626417461487904, + "loss": 0.7273, + "step": 30165 + }, + { + "epoch": 0.7745782921776028, + "grad_norm": 0.7265625, + "learning_rate": 0.00011625976983698765, + "loss": 0.8793, + "step": 30166 + }, + { + "epoch": 0.7746039693735246, + "grad_norm": 0.8125, + "learning_rate": 0.00011625536502669204, + "loss": 0.8249, + "step": 30167 + }, + { + "epoch": 0.7746296465694463, + "grad_norm": 0.765625, + "learning_rate": 0.00011625096018400095, + "loss": 0.6469, + "step": 30168 + }, + { + "epoch": 0.7746553237653682, + "grad_norm": 0.84765625, + "learning_rate": 0.00011624655530892314, + "loss": 0.8561, + "step": 30169 + }, + { + "epoch": 0.77468100096129, + "grad_norm": 0.83203125, + "learning_rate": 0.00011624215040146745, + "loss": 0.8952, + "step": 30170 + }, + { + "epoch": 0.7747066781572118, + "grad_norm": 1.453125, + "learning_rate": 0.00011623774546164256, + "loss": 0.7216, + "step": 30171 + }, + { + "epoch": 0.7747323553531337, + "grad_norm": 0.82421875, + "learning_rate": 0.00011623334048945736, + "loss": 0.8855, + "step": 30172 + }, + { + "epoch": 0.7747580325490555, + "grad_norm": 0.7734375, + "learning_rate": 0.00011622893548492058, + "loss": 0.7416, + "step": 30173 + }, + { + "epoch": 0.7747837097449773, + "grad_norm": 0.77734375, + "learning_rate": 0.00011622453044804096, + "loss": 0.8571, + "step": 30174 + }, + { + "epoch": 0.7748093869408991, + "grad_norm": 0.76171875, + "learning_rate": 0.00011622012537882736, + "loss": 0.7271, + "step": 30175 + }, + { + "epoch": 0.7748350641368209, + "grad_norm": 0.8046875, + "learning_rate": 0.00011621572027728848, + "loss": 0.8602, + "step": 30176 + }, + { + "epoch": 0.7748607413327427, + "grad_norm": 0.7109375, + "learning_rate": 0.00011621131514343315, + "loss": 0.8421, + "step": 30177 + }, + { + "epoch": 0.7748864185286646, + "grad_norm": 0.921875, + "learning_rate": 0.00011620690997727015, + "loss": 0.8411, + "step": 30178 + }, + { + "epoch": 0.7749120957245864, + "grad_norm": 0.76953125, + "learning_rate": 0.00011620250477880821, + "loss": 0.7404, + "step": 30179 + }, + { + "epoch": 0.7749377729205082, + "grad_norm": 0.79296875, + "learning_rate": 0.00011619809954805616, + "loss": 0.7991, + "step": 30180 + }, + { + "epoch": 0.7749634501164301, + "grad_norm": 0.7421875, + "learning_rate": 0.00011619369428502276, + "loss": 0.798, + "step": 30181 + }, + { + "epoch": 0.7749891273123518, + "grad_norm": 0.75390625, + "learning_rate": 0.0001161892889897168, + "loss": 0.7325, + "step": 30182 + }, + { + "epoch": 0.7750148045082736, + "grad_norm": 0.75, + "learning_rate": 0.00011618488366214703, + "loss": 0.692, + "step": 30183 + }, + { + "epoch": 0.7750404817041955, + "grad_norm": 0.7578125, + "learning_rate": 0.00011618047830232226, + "loss": 0.7939, + "step": 30184 + }, + { + "epoch": 0.7750661589001173, + "grad_norm": 0.79296875, + "learning_rate": 0.00011617607291025125, + "loss": 0.7997, + "step": 30185 + }, + { + "epoch": 0.7750918360960392, + "grad_norm": 0.76953125, + "learning_rate": 0.00011617166748594279, + "loss": 0.7677, + "step": 30186 + }, + { + "epoch": 0.775117513291961, + "grad_norm": 0.76953125, + "learning_rate": 0.00011616726202940565, + "loss": 0.8706, + "step": 30187 + }, + { + "epoch": 0.7751431904878827, + "grad_norm": 0.70703125, + "learning_rate": 0.00011616285654064865, + "loss": 0.8507, + "step": 30188 + }, + { + "epoch": 0.7751688676838046, + "grad_norm": 0.80859375, + "learning_rate": 0.00011615845101968053, + "loss": 0.81, + "step": 30189 + }, + { + "epoch": 0.7751945448797264, + "grad_norm": 0.7734375, + "learning_rate": 0.00011615404546651003, + "loss": 0.8785, + "step": 30190 + }, + { + "epoch": 0.7752202220756482, + "grad_norm": 0.78515625, + "learning_rate": 0.00011614963988114602, + "loss": 0.7318, + "step": 30191 + }, + { + "epoch": 0.7752458992715701, + "grad_norm": 0.7421875, + "learning_rate": 0.0001161452342635972, + "loss": 0.7277, + "step": 30192 + }, + { + "epoch": 0.7752715764674919, + "grad_norm": 0.8125, + "learning_rate": 0.00011614082861387243, + "loss": 0.8511, + "step": 30193 + }, + { + "epoch": 0.7752972536634137, + "grad_norm": 0.7578125, + "learning_rate": 0.00011613642293198044, + "loss": 0.7619, + "step": 30194 + }, + { + "epoch": 0.7753229308593355, + "grad_norm": 0.76953125, + "learning_rate": 0.00011613201721793, + "loss": 0.6576, + "step": 30195 + }, + { + "epoch": 0.7753486080552573, + "grad_norm": 0.79296875, + "learning_rate": 0.00011612761147172992, + "loss": 0.9245, + "step": 30196 + }, + { + "epoch": 0.7753742852511791, + "grad_norm": 0.8125, + "learning_rate": 0.00011612320569338894, + "loss": 0.8048, + "step": 30197 + }, + { + "epoch": 0.775399962447101, + "grad_norm": 0.859375, + "learning_rate": 0.0001161187998829159, + "loss": 0.9123, + "step": 30198 + }, + { + "epoch": 0.7754256396430228, + "grad_norm": 0.796875, + "learning_rate": 0.00011611439404031952, + "loss": 0.8425, + "step": 30199 + }, + { + "epoch": 0.7754513168389446, + "grad_norm": 0.86328125, + "learning_rate": 0.00011610998816560863, + "loss": 0.8225, + "step": 30200 + }, + { + "epoch": 0.7754769940348665, + "grad_norm": 0.8203125, + "learning_rate": 0.00011610558225879198, + "loss": 0.8934, + "step": 30201 + }, + { + "epoch": 0.7755026712307882, + "grad_norm": 0.765625, + "learning_rate": 0.00011610117631987835, + "loss": 0.7919, + "step": 30202 + }, + { + "epoch": 0.77552834842671, + "grad_norm": 0.9375, + "learning_rate": 0.00011609677034887653, + "loss": 0.8549, + "step": 30203 + }, + { + "epoch": 0.7755540256226319, + "grad_norm": 0.83203125, + "learning_rate": 0.00011609236434579534, + "loss": 0.727, + "step": 30204 + }, + { + "epoch": 0.7755797028185537, + "grad_norm": 0.859375, + "learning_rate": 0.00011608795831064347, + "loss": 1.0123, + "step": 30205 + }, + { + "epoch": 0.7756053800144755, + "grad_norm": 0.8671875, + "learning_rate": 0.00011608355224342977, + "loss": 0.9394, + "step": 30206 + }, + { + "epoch": 0.7756310572103974, + "grad_norm": 0.80078125, + "learning_rate": 0.00011607914614416303, + "loss": 0.9006, + "step": 30207 + }, + { + "epoch": 0.7756567344063191, + "grad_norm": 0.80859375, + "learning_rate": 0.00011607474001285197, + "loss": 0.7703, + "step": 30208 + }, + { + "epoch": 0.7756824116022409, + "grad_norm": 1.125, + "learning_rate": 0.00011607033384950543, + "loss": 0.9776, + "step": 30209 + }, + { + "epoch": 0.7757080887981628, + "grad_norm": 0.828125, + "learning_rate": 0.00011606592765413216, + "loss": 0.7874, + "step": 30210 + }, + { + "epoch": 0.7757337659940846, + "grad_norm": 0.7421875, + "learning_rate": 0.00011606152142674092, + "loss": 0.8948, + "step": 30211 + }, + { + "epoch": 0.7757594431900064, + "grad_norm": 0.98046875, + "learning_rate": 0.00011605711516734055, + "loss": 0.8895, + "step": 30212 + }, + { + "epoch": 0.7757851203859283, + "grad_norm": 0.8359375, + "learning_rate": 0.00011605270887593978, + "loss": 0.7804, + "step": 30213 + }, + { + "epoch": 0.7758107975818501, + "grad_norm": 0.79296875, + "learning_rate": 0.0001160483025525474, + "loss": 0.9291, + "step": 30214 + }, + { + "epoch": 0.7758364747777718, + "grad_norm": 0.77734375, + "learning_rate": 0.00011604389619717226, + "loss": 0.7666, + "step": 30215 + }, + { + "epoch": 0.7758621519736937, + "grad_norm": 0.76953125, + "learning_rate": 0.00011603948980982302, + "loss": 0.7709, + "step": 30216 + }, + { + "epoch": 0.7758878291696155, + "grad_norm": 0.765625, + "learning_rate": 0.00011603508339050857, + "loss": 0.796, + "step": 30217 + }, + { + "epoch": 0.7759135063655374, + "grad_norm": 0.73046875, + "learning_rate": 0.00011603067693923764, + "loss": 0.7955, + "step": 30218 + }, + { + "epoch": 0.7759391835614592, + "grad_norm": 0.796875, + "learning_rate": 0.00011602627045601897, + "loss": 0.9251, + "step": 30219 + }, + { + "epoch": 0.775964860757381, + "grad_norm": 0.8046875, + "learning_rate": 0.00011602186394086146, + "loss": 0.8551, + "step": 30220 + }, + { + "epoch": 0.7759905379533029, + "grad_norm": 0.79296875, + "learning_rate": 0.00011601745739377379, + "loss": 0.9045, + "step": 30221 + }, + { + "epoch": 0.7760162151492246, + "grad_norm": 0.8125, + "learning_rate": 0.00011601305081476477, + "loss": 0.8173, + "step": 30222 + }, + { + "epoch": 0.7760418923451464, + "grad_norm": 0.7578125, + "learning_rate": 0.00011600864420384319, + "loss": 0.8862, + "step": 30223 + }, + { + "epoch": 0.7760675695410683, + "grad_norm": 0.7890625, + "learning_rate": 0.00011600423756101781, + "loss": 0.899, + "step": 30224 + }, + { + "epoch": 0.7760932467369901, + "grad_norm": 0.76953125, + "learning_rate": 0.00011599983088629747, + "loss": 0.7974, + "step": 30225 + }, + { + "epoch": 0.7761189239329119, + "grad_norm": 0.74609375, + "learning_rate": 0.00011599542417969089, + "loss": 0.8388, + "step": 30226 + }, + { + "epoch": 0.7761446011288338, + "grad_norm": 0.78515625, + "learning_rate": 0.00011599101744120687, + "loss": 0.7187, + "step": 30227 + }, + { + "epoch": 0.7761702783247555, + "grad_norm": 0.76953125, + "learning_rate": 0.00011598661067085424, + "loss": 0.8215, + "step": 30228 + }, + { + "epoch": 0.7761959555206773, + "grad_norm": 0.79296875, + "learning_rate": 0.00011598220386864167, + "loss": 0.8495, + "step": 30229 + }, + { + "epoch": 0.7762216327165992, + "grad_norm": 0.85546875, + "learning_rate": 0.00011597779703457804, + "loss": 0.9045, + "step": 30230 + }, + { + "epoch": 0.776247309912521, + "grad_norm": 0.7890625, + "learning_rate": 0.00011597339016867212, + "loss": 0.778, + "step": 30231 + }, + { + "epoch": 0.7762729871084428, + "grad_norm": 0.84375, + "learning_rate": 0.00011596898327093266, + "loss": 0.7782, + "step": 30232 + }, + { + "epoch": 0.7762986643043647, + "grad_norm": 0.79296875, + "learning_rate": 0.00011596457634136846, + "loss": 0.7232, + "step": 30233 + }, + { + "epoch": 0.7763243415002865, + "grad_norm": 0.71484375, + "learning_rate": 0.00011596016937998833, + "loss": 0.7594, + "step": 30234 + }, + { + "epoch": 0.7763500186962082, + "grad_norm": 0.79296875, + "learning_rate": 0.000115955762386801, + "loss": 0.9056, + "step": 30235 + }, + { + "epoch": 0.7763756958921301, + "grad_norm": 0.734375, + "learning_rate": 0.00011595135536181527, + "loss": 0.8811, + "step": 30236 + }, + { + "epoch": 0.7764013730880519, + "grad_norm": 0.82421875, + "learning_rate": 0.00011594694830503994, + "loss": 0.8109, + "step": 30237 + }, + { + "epoch": 0.7764270502839737, + "grad_norm": 0.79296875, + "learning_rate": 0.00011594254121648376, + "loss": 0.8244, + "step": 30238 + }, + { + "epoch": 0.7764527274798956, + "grad_norm": 0.7421875, + "learning_rate": 0.00011593813409615555, + "loss": 0.7894, + "step": 30239 + }, + { + "epoch": 0.7764784046758174, + "grad_norm": 0.81640625, + "learning_rate": 0.00011593372694406408, + "loss": 0.7458, + "step": 30240 + }, + { + "epoch": 0.7765040818717392, + "grad_norm": 0.7734375, + "learning_rate": 0.00011592931976021815, + "loss": 0.8203, + "step": 30241 + }, + { + "epoch": 0.776529759067661, + "grad_norm": 0.8359375, + "learning_rate": 0.00011592491254462652, + "loss": 0.9035, + "step": 30242 + }, + { + "epoch": 0.7765554362635828, + "grad_norm": 0.7734375, + "learning_rate": 0.00011592050529729794, + "loss": 0.94, + "step": 30243 + }, + { + "epoch": 0.7765811134595046, + "grad_norm": 0.81640625, + "learning_rate": 0.00011591609801824126, + "loss": 0.7837, + "step": 30244 + }, + { + "epoch": 0.7766067906554265, + "grad_norm": 0.8046875, + "learning_rate": 0.00011591169070746523, + "loss": 0.803, + "step": 30245 + }, + { + "epoch": 0.7766324678513483, + "grad_norm": 0.77734375, + "learning_rate": 0.00011590728336497863, + "loss": 0.8117, + "step": 30246 + }, + { + "epoch": 0.7766581450472702, + "grad_norm": 0.7734375, + "learning_rate": 0.00011590287599079028, + "loss": 0.8138, + "step": 30247 + }, + { + "epoch": 0.7766838222431919, + "grad_norm": 0.84375, + "learning_rate": 0.00011589846858490889, + "loss": 0.7781, + "step": 30248 + }, + { + "epoch": 0.7767094994391137, + "grad_norm": 0.859375, + "learning_rate": 0.00011589406114734333, + "loss": 0.8779, + "step": 30249 + }, + { + "epoch": 0.7767351766350356, + "grad_norm": 0.73828125, + "learning_rate": 0.00011588965367810234, + "loss": 0.8025, + "step": 30250 + }, + { + "epoch": 0.7767608538309574, + "grad_norm": 0.796875, + "learning_rate": 0.00011588524617719466, + "loss": 0.8937, + "step": 30251 + }, + { + "epoch": 0.7767865310268792, + "grad_norm": 0.81640625, + "learning_rate": 0.00011588083864462915, + "loss": 0.783, + "step": 30252 + }, + { + "epoch": 0.7768122082228011, + "grad_norm": 0.71875, + "learning_rate": 0.00011587643108041457, + "loss": 0.6687, + "step": 30253 + }, + { + "epoch": 0.7768378854187229, + "grad_norm": 0.72265625, + "learning_rate": 0.0001158720234845597, + "loss": 0.7829, + "step": 30254 + }, + { + "epoch": 0.7768635626146446, + "grad_norm": 0.828125, + "learning_rate": 0.00011586761585707331, + "loss": 0.858, + "step": 30255 + }, + { + "epoch": 0.7768892398105665, + "grad_norm": 0.82421875, + "learning_rate": 0.00011586320819796418, + "loss": 0.9224, + "step": 30256 + }, + { + "epoch": 0.7769149170064883, + "grad_norm": 0.76171875, + "learning_rate": 0.00011585880050724114, + "loss": 0.73, + "step": 30257 + }, + { + "epoch": 0.7769405942024101, + "grad_norm": 0.78125, + "learning_rate": 0.00011585439278491292, + "loss": 0.8432, + "step": 30258 + }, + { + "epoch": 0.776966271398332, + "grad_norm": 0.74609375, + "learning_rate": 0.00011584998503098832, + "loss": 0.8113, + "step": 30259 + }, + { + "epoch": 0.7769919485942538, + "grad_norm": 0.8203125, + "learning_rate": 0.00011584557724547615, + "loss": 1.0128, + "step": 30260 + }, + { + "epoch": 0.7770176257901756, + "grad_norm": 0.76953125, + "learning_rate": 0.00011584116942838519, + "loss": 0.8139, + "step": 30261 + }, + { + "epoch": 0.7770433029860974, + "grad_norm": 0.83984375, + "learning_rate": 0.00011583676157972419, + "loss": 0.9412, + "step": 30262 + }, + { + "epoch": 0.7770689801820192, + "grad_norm": 0.7734375, + "learning_rate": 0.00011583235369950196, + "loss": 0.7242, + "step": 30263 + }, + { + "epoch": 0.777094657377941, + "grad_norm": 0.80078125, + "learning_rate": 0.00011582794578772727, + "loss": 0.7517, + "step": 30264 + }, + { + "epoch": 0.7771203345738629, + "grad_norm": 0.78515625, + "learning_rate": 0.00011582353784440891, + "loss": 0.8317, + "step": 30265 + }, + { + "epoch": 0.7771460117697847, + "grad_norm": 0.83203125, + "learning_rate": 0.00011581912986955567, + "loss": 0.8617, + "step": 30266 + }, + { + "epoch": 0.7771716889657065, + "grad_norm": 0.73046875, + "learning_rate": 0.00011581472186317637, + "loss": 0.8203, + "step": 30267 + }, + { + "epoch": 0.7771973661616283, + "grad_norm": 0.73828125, + "learning_rate": 0.00011581031382527973, + "loss": 0.7218, + "step": 30268 + }, + { + "epoch": 0.7772230433575501, + "grad_norm": 0.7734375, + "learning_rate": 0.00011580590575587454, + "loss": 0.756, + "step": 30269 + }, + { + "epoch": 0.7772487205534719, + "grad_norm": 0.76953125, + "learning_rate": 0.00011580149765496964, + "loss": 0.8754, + "step": 30270 + }, + { + "epoch": 0.7772743977493938, + "grad_norm": 0.73828125, + "learning_rate": 0.00011579708952257378, + "loss": 0.9336, + "step": 30271 + }, + { + "epoch": 0.7773000749453156, + "grad_norm": 0.80078125, + "learning_rate": 0.00011579268135869572, + "loss": 0.9453, + "step": 30272 + }, + { + "epoch": 0.7773257521412374, + "grad_norm": 0.77734375, + "learning_rate": 0.00011578827316334429, + "loss": 0.9001, + "step": 30273 + }, + { + "epoch": 0.7773514293371593, + "grad_norm": 0.81640625, + "learning_rate": 0.00011578386493652828, + "loss": 0.8717, + "step": 30274 + }, + { + "epoch": 0.777377106533081, + "grad_norm": 0.9140625, + "learning_rate": 0.00011577945667825643, + "loss": 0.734, + "step": 30275 + }, + { + "epoch": 0.7774027837290028, + "grad_norm": 0.85546875, + "learning_rate": 0.00011577504838853754, + "loss": 0.8833, + "step": 30276 + }, + { + "epoch": 0.7774284609249247, + "grad_norm": 0.7578125, + "learning_rate": 0.00011577064006738043, + "loss": 0.7233, + "step": 30277 + }, + { + "epoch": 0.7774541381208465, + "grad_norm": 1.03125, + "learning_rate": 0.00011576623171479384, + "loss": 0.7796, + "step": 30278 + }, + { + "epoch": 0.7774798153167684, + "grad_norm": 0.80859375, + "learning_rate": 0.00011576182333078658, + "loss": 0.7565, + "step": 30279 + }, + { + "epoch": 0.7775054925126902, + "grad_norm": 0.77734375, + "learning_rate": 0.00011575741491536743, + "loss": 0.9595, + "step": 30280 + }, + { + "epoch": 0.777531169708612, + "grad_norm": 0.90234375, + "learning_rate": 0.00011575300646854517, + "loss": 0.8866, + "step": 30281 + }, + { + "epoch": 0.7775568469045337, + "grad_norm": 0.7421875, + "learning_rate": 0.0001157485979903286, + "loss": 0.7172, + "step": 30282 + }, + { + "epoch": 0.7775825241004556, + "grad_norm": 0.88671875, + "learning_rate": 0.00011574418948072647, + "loss": 0.9082, + "step": 30283 + }, + { + "epoch": 0.7776082012963774, + "grad_norm": 0.75, + "learning_rate": 0.00011573978093974763, + "loss": 0.7525, + "step": 30284 + }, + { + "epoch": 0.7776338784922993, + "grad_norm": 0.82421875, + "learning_rate": 0.00011573537236740082, + "loss": 0.8912, + "step": 30285 + }, + { + "epoch": 0.7776595556882211, + "grad_norm": 0.796875, + "learning_rate": 0.00011573096376369479, + "loss": 0.9076, + "step": 30286 + }, + { + "epoch": 0.7776852328841429, + "grad_norm": 0.8125, + "learning_rate": 0.00011572655512863843, + "loss": 0.8963, + "step": 30287 + }, + { + "epoch": 0.7777109100800647, + "grad_norm": 0.765625, + "learning_rate": 0.00011572214646224045, + "loss": 0.7281, + "step": 30288 + }, + { + "epoch": 0.7777365872759865, + "grad_norm": 0.83203125, + "learning_rate": 0.00011571773776450964, + "loss": 0.836, + "step": 30289 + }, + { + "epoch": 0.7777622644719083, + "grad_norm": 0.79296875, + "learning_rate": 0.0001157133290354548, + "loss": 0.982, + "step": 30290 + }, + { + "epoch": 0.7777879416678302, + "grad_norm": 0.796875, + "learning_rate": 0.0001157089202750847, + "loss": 0.8624, + "step": 30291 + }, + { + "epoch": 0.777813618863752, + "grad_norm": 0.82421875, + "learning_rate": 0.00011570451148340817, + "loss": 0.8051, + "step": 30292 + }, + { + "epoch": 0.7778392960596738, + "grad_norm": 0.734375, + "learning_rate": 0.00011570010266043397, + "loss": 0.9161, + "step": 30293 + }, + { + "epoch": 0.7778649732555957, + "grad_norm": 0.8125, + "learning_rate": 0.00011569569380617086, + "loss": 0.8128, + "step": 30294 + }, + { + "epoch": 0.7778906504515174, + "grad_norm": 0.81640625, + "learning_rate": 0.00011569128492062766, + "loss": 0.724, + "step": 30295 + }, + { + "epoch": 0.7779163276474392, + "grad_norm": 0.7890625, + "learning_rate": 0.00011568687600381314, + "loss": 0.8316, + "step": 30296 + }, + { + "epoch": 0.7779420048433611, + "grad_norm": 0.7578125, + "learning_rate": 0.00011568246705573611, + "loss": 0.7565, + "step": 30297 + }, + { + "epoch": 0.7779676820392829, + "grad_norm": 0.73046875, + "learning_rate": 0.00011567805807640532, + "loss": 0.8245, + "step": 30298 + }, + { + "epoch": 0.7779933592352047, + "grad_norm": 0.7421875, + "learning_rate": 0.00011567364906582958, + "loss": 0.8129, + "step": 30299 + }, + { + "epoch": 0.7780190364311266, + "grad_norm": 1.1796875, + "learning_rate": 0.00011566924002401772, + "loss": 0.8866, + "step": 30300 + }, + { + "epoch": 0.7780447136270484, + "grad_norm": 0.7734375, + "learning_rate": 0.0001156648309509784, + "loss": 0.7898, + "step": 30301 + }, + { + "epoch": 0.7780703908229701, + "grad_norm": 0.85546875, + "learning_rate": 0.00011566042184672052, + "loss": 0.9796, + "step": 30302 + }, + { + "epoch": 0.778096068018892, + "grad_norm": 0.76953125, + "learning_rate": 0.00011565601271125286, + "loss": 0.6649, + "step": 30303 + }, + { + "epoch": 0.7781217452148138, + "grad_norm": 0.83203125, + "learning_rate": 0.00011565160354458414, + "loss": 0.8086, + "step": 30304 + }, + { + "epoch": 0.7781474224107356, + "grad_norm": 0.76953125, + "learning_rate": 0.00011564719434672322, + "loss": 0.7994, + "step": 30305 + }, + { + "epoch": 0.7781730996066575, + "grad_norm": 0.81640625, + "learning_rate": 0.00011564278511767883, + "loss": 0.7988, + "step": 30306 + }, + { + "epoch": 0.7781987768025793, + "grad_norm": 0.87890625, + "learning_rate": 0.0001156383758574598, + "loss": 0.8648, + "step": 30307 + }, + { + "epoch": 0.778224453998501, + "grad_norm": 0.796875, + "learning_rate": 0.00011563396656607491, + "loss": 0.7779, + "step": 30308 + }, + { + "epoch": 0.7782501311944229, + "grad_norm": 0.7890625, + "learning_rate": 0.0001156295572435329, + "loss": 0.7931, + "step": 30309 + }, + { + "epoch": 0.7782758083903447, + "grad_norm": 0.7421875, + "learning_rate": 0.00011562514788984262, + "loss": 0.9285, + "step": 30310 + }, + { + "epoch": 0.7783014855862666, + "grad_norm": 0.80078125, + "learning_rate": 0.00011562073850501285, + "loss": 0.8389, + "step": 30311 + }, + { + "epoch": 0.7783271627821884, + "grad_norm": 0.7734375, + "learning_rate": 0.00011561632908905231, + "loss": 0.7894, + "step": 30312 + }, + { + "epoch": 0.7783528399781102, + "grad_norm": 0.76953125, + "learning_rate": 0.00011561191964196989, + "loss": 0.8957, + "step": 30313 + }, + { + "epoch": 0.7783785171740321, + "grad_norm": 0.77734375, + "learning_rate": 0.00011560751016377431, + "loss": 0.8795, + "step": 30314 + }, + { + "epoch": 0.7784041943699538, + "grad_norm": 0.8125, + "learning_rate": 0.00011560310065447432, + "loss": 0.8164, + "step": 30315 + }, + { + "epoch": 0.7784298715658756, + "grad_norm": 0.77734375, + "learning_rate": 0.00011559869111407882, + "loss": 0.7459, + "step": 30316 + }, + { + "epoch": 0.7784555487617975, + "grad_norm": 0.7890625, + "learning_rate": 0.0001155942815425965, + "loss": 0.7938, + "step": 30317 + }, + { + "epoch": 0.7784812259577193, + "grad_norm": 0.77734375, + "learning_rate": 0.00011558987194003621, + "loss": 0.9072, + "step": 30318 + }, + { + "epoch": 0.7785069031536411, + "grad_norm": 0.8046875, + "learning_rate": 0.00011558546230640672, + "loss": 0.8049, + "step": 30319 + }, + { + "epoch": 0.778532580349563, + "grad_norm": 0.7578125, + "learning_rate": 0.0001155810526417168, + "loss": 0.8281, + "step": 30320 + }, + { + "epoch": 0.7785582575454848, + "grad_norm": 0.79296875, + "learning_rate": 0.00011557664294597526, + "loss": 0.8073, + "step": 30321 + }, + { + "epoch": 0.7785839347414065, + "grad_norm": 0.9140625, + "learning_rate": 0.00011557223321919086, + "loss": 0.834, + "step": 30322 + }, + { + "epoch": 0.7786096119373284, + "grad_norm": 0.796875, + "learning_rate": 0.00011556782346137239, + "loss": 0.8481, + "step": 30323 + }, + { + "epoch": 0.7786352891332502, + "grad_norm": 0.796875, + "learning_rate": 0.00011556341367252868, + "loss": 0.789, + "step": 30324 + }, + { + "epoch": 0.778660966329172, + "grad_norm": 0.71484375, + "learning_rate": 0.00011555900385266849, + "loss": 0.775, + "step": 30325 + }, + { + "epoch": 0.7786866435250939, + "grad_norm": 0.76953125, + "learning_rate": 0.00011555459400180063, + "loss": 0.7845, + "step": 30326 + }, + { + "epoch": 0.7787123207210157, + "grad_norm": 0.84375, + "learning_rate": 0.00011555018411993383, + "loss": 0.8683, + "step": 30327 + }, + { + "epoch": 0.7787379979169374, + "grad_norm": 0.7578125, + "learning_rate": 0.00011554577420707693, + "loss": 0.8753, + "step": 30328 + }, + { + "epoch": 0.7787636751128593, + "grad_norm": 0.80859375, + "learning_rate": 0.00011554136426323871, + "loss": 0.8318, + "step": 30329 + }, + { + "epoch": 0.7787893523087811, + "grad_norm": 0.84765625, + "learning_rate": 0.00011553695428842797, + "loss": 0.7895, + "step": 30330 + }, + { + "epoch": 0.7788150295047029, + "grad_norm": 0.75, + "learning_rate": 0.00011553254428265346, + "loss": 0.8025, + "step": 30331 + }, + { + "epoch": 0.7788407067006248, + "grad_norm": 0.82421875, + "learning_rate": 0.000115528134245924, + "loss": 0.8618, + "step": 30332 + }, + { + "epoch": 0.7788663838965466, + "grad_norm": 0.80078125, + "learning_rate": 0.00011552372417824837, + "loss": 0.8296, + "step": 30333 + }, + { + "epoch": 0.7788920610924684, + "grad_norm": 1.46875, + "learning_rate": 0.00011551931407963536, + "loss": 0.7677, + "step": 30334 + }, + { + "epoch": 0.7789177382883902, + "grad_norm": 0.7890625, + "learning_rate": 0.00011551490395009377, + "loss": 0.8486, + "step": 30335 + }, + { + "epoch": 0.778943415484312, + "grad_norm": 0.75390625, + "learning_rate": 0.00011551049378963236, + "loss": 0.6766, + "step": 30336 + }, + { + "epoch": 0.7789690926802338, + "grad_norm": 0.72265625, + "learning_rate": 0.00011550608359825994, + "loss": 0.7376, + "step": 30337 + }, + { + "epoch": 0.7789947698761557, + "grad_norm": 0.76953125, + "learning_rate": 0.00011550167337598532, + "loss": 0.7295, + "step": 30338 + }, + { + "epoch": 0.7790204470720775, + "grad_norm": 0.7890625, + "learning_rate": 0.00011549726312281723, + "loss": 0.7983, + "step": 30339 + }, + { + "epoch": 0.7790461242679994, + "grad_norm": 0.76171875, + "learning_rate": 0.00011549285283876455, + "loss": 0.7295, + "step": 30340 + }, + { + "epoch": 0.7790718014639212, + "grad_norm": 0.7265625, + "learning_rate": 0.00011548844252383595, + "loss": 0.7151, + "step": 30341 + }, + { + "epoch": 0.7790974786598429, + "grad_norm": 0.9375, + "learning_rate": 0.00011548403217804031, + "loss": 0.8789, + "step": 30342 + }, + { + "epoch": 0.7791231558557647, + "grad_norm": 0.703125, + "learning_rate": 0.0001154796218013864, + "loss": 0.6107, + "step": 30343 + }, + { + "epoch": 0.7791488330516866, + "grad_norm": 0.80859375, + "learning_rate": 0.00011547521139388298, + "loss": 0.8328, + "step": 30344 + }, + { + "epoch": 0.7791745102476084, + "grad_norm": 0.7734375, + "learning_rate": 0.00011547080095553888, + "loss": 0.7869, + "step": 30345 + }, + { + "epoch": 0.7792001874435303, + "grad_norm": 0.77734375, + "learning_rate": 0.00011546639048636287, + "loss": 0.8008, + "step": 30346 + }, + { + "epoch": 0.7792258646394521, + "grad_norm": 0.7421875, + "learning_rate": 0.00011546197998636374, + "loss": 0.7453, + "step": 30347 + }, + { + "epoch": 0.7792515418353738, + "grad_norm": 0.734375, + "learning_rate": 0.00011545756945555029, + "loss": 0.7856, + "step": 30348 + }, + { + "epoch": 0.7792772190312957, + "grad_norm": 0.75390625, + "learning_rate": 0.00011545315889393126, + "loss": 0.7527, + "step": 30349 + }, + { + "epoch": 0.7793028962272175, + "grad_norm": 0.875, + "learning_rate": 0.0001154487483015155, + "loss": 0.8979, + "step": 30350 + }, + { + "epoch": 0.7793285734231393, + "grad_norm": 0.92578125, + "learning_rate": 0.00011544433767831181, + "loss": 0.8014, + "step": 30351 + }, + { + "epoch": 0.7793542506190612, + "grad_norm": 0.82421875, + "learning_rate": 0.00011543992702432891, + "loss": 0.8, + "step": 30352 + }, + { + "epoch": 0.779379927814983, + "grad_norm": 0.80859375, + "learning_rate": 0.00011543551633957568, + "loss": 0.9254, + "step": 30353 + }, + { + "epoch": 0.7794056050109048, + "grad_norm": 0.765625, + "learning_rate": 0.00011543110562406081, + "loss": 0.7802, + "step": 30354 + }, + { + "epoch": 0.7794312822068266, + "grad_norm": 0.8203125, + "learning_rate": 0.00011542669487779317, + "loss": 0.7551, + "step": 30355 + }, + { + "epoch": 0.7794569594027484, + "grad_norm": 0.8359375, + "learning_rate": 0.00011542228410078151, + "loss": 0.9451, + "step": 30356 + }, + { + "epoch": 0.7794826365986702, + "grad_norm": 0.78515625, + "learning_rate": 0.00011541787329303462, + "loss": 0.8038, + "step": 30357 + }, + { + "epoch": 0.7795083137945921, + "grad_norm": 1.40625, + "learning_rate": 0.0001154134624545613, + "loss": 0.8844, + "step": 30358 + }, + { + "epoch": 0.7795339909905139, + "grad_norm": 0.6953125, + "learning_rate": 0.00011540905158537038, + "loss": 0.7721, + "step": 30359 + }, + { + "epoch": 0.7795596681864357, + "grad_norm": 0.73828125, + "learning_rate": 0.00011540464068547059, + "loss": 0.8045, + "step": 30360 + }, + { + "epoch": 0.7795853453823575, + "grad_norm": 0.98046875, + "learning_rate": 0.00011540022975487073, + "loss": 0.8157, + "step": 30361 + }, + { + "epoch": 0.7796110225782793, + "grad_norm": 0.80078125, + "learning_rate": 0.00011539581879357961, + "loss": 0.7327, + "step": 30362 + }, + { + "epoch": 0.7796366997742011, + "grad_norm": 0.69140625, + "learning_rate": 0.00011539140780160602, + "loss": 0.6964, + "step": 30363 + }, + { + "epoch": 0.779662376970123, + "grad_norm": 0.77734375, + "learning_rate": 0.00011538699677895874, + "loss": 0.7514, + "step": 30364 + }, + { + "epoch": 0.7796880541660448, + "grad_norm": 0.71875, + "learning_rate": 0.00011538258572564657, + "loss": 0.8225, + "step": 30365 + }, + { + "epoch": 0.7797137313619666, + "grad_norm": 0.8203125, + "learning_rate": 0.00011537817464167831, + "loss": 0.8597, + "step": 30366 + }, + { + "epoch": 0.7797394085578885, + "grad_norm": 0.8046875, + "learning_rate": 0.00011537376352706271, + "loss": 0.9234, + "step": 30367 + }, + { + "epoch": 0.7797650857538102, + "grad_norm": 0.73828125, + "learning_rate": 0.00011536935238180857, + "loss": 0.8826, + "step": 30368 + }, + { + "epoch": 0.779790762949732, + "grad_norm": 0.8046875, + "learning_rate": 0.00011536494120592474, + "loss": 0.7424, + "step": 30369 + }, + { + "epoch": 0.7798164401456539, + "grad_norm": 0.7578125, + "learning_rate": 0.00011536052999941996, + "loss": 0.7193, + "step": 30370 + }, + { + "epoch": 0.7798421173415757, + "grad_norm": 0.82421875, + "learning_rate": 0.00011535611876230301, + "loss": 0.8454, + "step": 30371 + }, + { + "epoch": 0.7798677945374975, + "grad_norm": 0.71875, + "learning_rate": 0.00011535170749458275, + "loss": 0.7794, + "step": 30372 + }, + { + "epoch": 0.7798934717334194, + "grad_norm": 0.78125, + "learning_rate": 0.00011534729619626788, + "loss": 0.9343, + "step": 30373 + }, + { + "epoch": 0.7799191489293412, + "grad_norm": 0.74609375, + "learning_rate": 0.00011534288486736724, + "loss": 0.7929, + "step": 30374 + }, + { + "epoch": 0.779944826125263, + "grad_norm": 0.8203125, + "learning_rate": 0.00011533847350788965, + "loss": 0.8957, + "step": 30375 + }, + { + "epoch": 0.7799705033211848, + "grad_norm": 0.81640625, + "learning_rate": 0.0001153340621178438, + "loss": 0.9556, + "step": 30376 + }, + { + "epoch": 0.7799961805171066, + "grad_norm": 0.75390625, + "learning_rate": 0.0001153296506972386, + "loss": 0.7498, + "step": 30377 + }, + { + "epoch": 0.7800218577130285, + "grad_norm": 0.8125, + "learning_rate": 0.00011532523924608278, + "loss": 0.8445, + "step": 30378 + }, + { + "epoch": 0.7800475349089503, + "grad_norm": 0.828125, + "learning_rate": 0.00011532082776438515, + "loss": 0.8238, + "step": 30379 + }, + { + "epoch": 0.7800732121048721, + "grad_norm": 0.75390625, + "learning_rate": 0.00011531641625215451, + "loss": 0.7957, + "step": 30380 + }, + { + "epoch": 0.7800988893007939, + "grad_norm": 0.75390625, + "learning_rate": 0.0001153120047093996, + "loss": 0.7635, + "step": 30381 + }, + { + "epoch": 0.7801245664967157, + "grad_norm": 0.79296875, + "learning_rate": 0.00011530759313612927, + "loss": 0.833, + "step": 30382 + }, + { + "epoch": 0.7801502436926375, + "grad_norm": 0.75390625, + "learning_rate": 0.00011530318153235229, + "loss": 0.795, + "step": 30383 + }, + { + "epoch": 0.7801759208885594, + "grad_norm": 0.70703125, + "learning_rate": 0.00011529876989807741, + "loss": 0.7694, + "step": 30384 + }, + { + "epoch": 0.7802015980844812, + "grad_norm": 0.76953125, + "learning_rate": 0.00011529435823331352, + "loss": 0.896, + "step": 30385 + }, + { + "epoch": 0.780227275280403, + "grad_norm": 0.734375, + "learning_rate": 0.00011528994653806933, + "loss": 0.797, + "step": 30386 + }, + { + "epoch": 0.7802529524763249, + "grad_norm": 0.8515625, + "learning_rate": 0.00011528553481235367, + "loss": 0.7874, + "step": 30387 + }, + { + "epoch": 0.7802786296722466, + "grad_norm": 0.72265625, + "learning_rate": 0.00011528112305617531, + "loss": 0.7932, + "step": 30388 + }, + { + "epoch": 0.7803043068681684, + "grad_norm": 0.78125, + "learning_rate": 0.00011527671126954303, + "loss": 0.7348, + "step": 30389 + }, + { + "epoch": 0.7803299840640903, + "grad_norm": 0.8046875, + "learning_rate": 0.00011527229945246568, + "loss": 0.7412, + "step": 30390 + }, + { + "epoch": 0.7803556612600121, + "grad_norm": 0.73828125, + "learning_rate": 0.000115267887604952, + "loss": 0.8298, + "step": 30391 + }, + { + "epoch": 0.7803813384559339, + "grad_norm": 0.74609375, + "learning_rate": 0.00011526347572701081, + "loss": 0.9006, + "step": 30392 + }, + { + "epoch": 0.7804070156518558, + "grad_norm": 0.76953125, + "learning_rate": 0.0001152590638186509, + "loss": 0.826, + "step": 30393 + }, + { + "epoch": 0.7804326928477776, + "grad_norm": 0.8046875, + "learning_rate": 0.00011525465187988103, + "loss": 0.9139, + "step": 30394 + }, + { + "epoch": 0.7804583700436993, + "grad_norm": 0.80078125, + "learning_rate": 0.00011525023991071003, + "loss": 1.0151, + "step": 30395 + }, + { + "epoch": 0.7804840472396212, + "grad_norm": 0.75, + "learning_rate": 0.00011524582791114666, + "loss": 0.7512, + "step": 30396 + }, + { + "epoch": 0.780509724435543, + "grad_norm": 0.890625, + "learning_rate": 0.00011524141588119975, + "loss": 0.8582, + "step": 30397 + }, + { + "epoch": 0.7805354016314648, + "grad_norm": 0.78125, + "learning_rate": 0.00011523700382087806, + "loss": 0.6795, + "step": 30398 + }, + { + "epoch": 0.7805610788273867, + "grad_norm": 0.74609375, + "learning_rate": 0.00011523259173019042, + "loss": 0.7664, + "step": 30399 + }, + { + "epoch": 0.7805867560233085, + "grad_norm": 0.75390625, + "learning_rate": 0.00011522817960914558, + "loss": 0.7978, + "step": 30400 + }, + { + "epoch": 0.7806124332192302, + "grad_norm": 0.75390625, + "learning_rate": 0.00011522376745775238, + "loss": 0.7744, + "step": 30401 + }, + { + "epoch": 0.7806381104151521, + "grad_norm": 0.828125, + "learning_rate": 0.00011521935527601956, + "loss": 0.8299, + "step": 30402 + }, + { + "epoch": 0.7806637876110739, + "grad_norm": 0.75, + "learning_rate": 0.00011521494306395595, + "loss": 0.8956, + "step": 30403 + }, + { + "epoch": 0.7806894648069957, + "grad_norm": 0.78515625, + "learning_rate": 0.00011521053082157034, + "loss": 0.9022, + "step": 30404 + }, + { + "epoch": 0.7807151420029176, + "grad_norm": 0.78125, + "learning_rate": 0.0001152061185488715, + "loss": 0.7722, + "step": 30405 + }, + { + "epoch": 0.7807408191988394, + "grad_norm": 1.1328125, + "learning_rate": 0.00011520170624586826, + "loss": 0.8163, + "step": 30406 + }, + { + "epoch": 0.7807664963947613, + "grad_norm": 0.7265625, + "learning_rate": 0.00011519729391256939, + "loss": 0.7037, + "step": 30407 + }, + { + "epoch": 0.780792173590683, + "grad_norm": 0.79296875, + "learning_rate": 0.00011519288154898366, + "loss": 0.8668, + "step": 30408 + }, + { + "epoch": 0.7808178507866048, + "grad_norm": 0.8046875, + "learning_rate": 0.00011518846915511991, + "loss": 0.7014, + "step": 30409 + }, + { + "epoch": 0.7808435279825267, + "grad_norm": 0.7890625, + "learning_rate": 0.0001151840567309869, + "loss": 0.8733, + "step": 30410 + }, + { + "epoch": 0.7808692051784485, + "grad_norm": 0.734375, + "learning_rate": 0.00011517964427659345, + "loss": 0.8451, + "step": 30411 + }, + { + "epoch": 0.7808948823743703, + "grad_norm": 0.75, + "learning_rate": 0.00011517523179194835, + "loss": 0.7753, + "step": 30412 + }, + { + "epoch": 0.7809205595702922, + "grad_norm": 0.84375, + "learning_rate": 0.00011517081927706037, + "loss": 0.9811, + "step": 30413 + }, + { + "epoch": 0.780946236766214, + "grad_norm": 0.796875, + "learning_rate": 0.0001151664067319383, + "loss": 0.7808, + "step": 30414 + }, + { + "epoch": 0.7809719139621357, + "grad_norm": 0.7421875, + "learning_rate": 0.000115161994156591, + "loss": 0.8537, + "step": 30415 + }, + { + "epoch": 0.7809975911580576, + "grad_norm": 0.6953125, + "learning_rate": 0.00011515758155102716, + "loss": 0.7508, + "step": 30416 + }, + { + "epoch": 0.7810232683539794, + "grad_norm": 0.75390625, + "learning_rate": 0.00011515316891525564, + "loss": 0.8848, + "step": 30417 + }, + { + "epoch": 0.7810489455499012, + "grad_norm": 0.82421875, + "learning_rate": 0.00011514875624928526, + "loss": 0.726, + "step": 30418 + }, + { + "epoch": 0.7810746227458231, + "grad_norm": 0.80078125, + "learning_rate": 0.00011514434355312476, + "loss": 0.8104, + "step": 30419 + }, + { + "epoch": 0.7811002999417449, + "grad_norm": 0.7578125, + "learning_rate": 0.00011513993082678293, + "loss": 0.8743, + "step": 30420 + }, + { + "epoch": 0.7811259771376666, + "grad_norm": 0.77734375, + "learning_rate": 0.00011513551807026858, + "loss": 0.8408, + "step": 30421 + }, + { + "epoch": 0.7811516543335885, + "grad_norm": 0.75, + "learning_rate": 0.00011513110528359054, + "loss": 0.7815, + "step": 30422 + }, + { + "epoch": 0.7811773315295103, + "grad_norm": 0.78125, + "learning_rate": 0.00011512669246675758, + "loss": 0.7097, + "step": 30423 + }, + { + "epoch": 0.7812030087254321, + "grad_norm": 0.75, + "learning_rate": 0.00011512227961977844, + "loss": 0.7316, + "step": 30424 + }, + { + "epoch": 0.781228685921354, + "grad_norm": 0.7734375, + "learning_rate": 0.00011511786674266203, + "loss": 0.7666, + "step": 30425 + }, + { + "epoch": 0.7812543631172758, + "grad_norm": 0.73828125, + "learning_rate": 0.00011511345383541704, + "loss": 0.8459, + "step": 30426 + }, + { + "epoch": 0.7812800403131976, + "grad_norm": 0.8984375, + "learning_rate": 0.0001151090408980523, + "loss": 0.8564, + "step": 30427 + }, + { + "epoch": 0.7813057175091194, + "grad_norm": 0.81640625, + "learning_rate": 0.00011510462793057661, + "loss": 0.8543, + "step": 30428 + }, + { + "epoch": 0.7813313947050412, + "grad_norm": 0.80859375, + "learning_rate": 0.00011510021493299875, + "loss": 0.7189, + "step": 30429 + }, + { + "epoch": 0.781357071900963, + "grad_norm": 0.71484375, + "learning_rate": 0.00011509580190532754, + "loss": 0.7347, + "step": 30430 + }, + { + "epoch": 0.7813827490968849, + "grad_norm": 0.73828125, + "learning_rate": 0.00011509138884757176, + "loss": 0.8251, + "step": 30431 + }, + { + "epoch": 0.7814084262928067, + "grad_norm": 0.76953125, + "learning_rate": 0.00011508697575974021, + "loss": 1.0226, + "step": 30432 + }, + { + "epoch": 0.7814341034887285, + "grad_norm": 0.79296875, + "learning_rate": 0.00011508256264184168, + "loss": 0.7558, + "step": 30433 + }, + { + "epoch": 0.7814597806846504, + "grad_norm": 0.98828125, + "learning_rate": 0.00011507814949388493, + "loss": 0.8211, + "step": 30434 + }, + { + "epoch": 0.7814854578805721, + "grad_norm": 0.74609375, + "learning_rate": 0.00011507373631587883, + "loss": 0.8959, + "step": 30435 + }, + { + "epoch": 0.781511135076494, + "grad_norm": 0.69921875, + "learning_rate": 0.00011506932310783212, + "loss": 0.7417, + "step": 30436 + }, + { + "epoch": 0.7815368122724158, + "grad_norm": 0.7890625, + "learning_rate": 0.0001150649098697536, + "loss": 0.8102, + "step": 30437 + }, + { + "epoch": 0.7815624894683376, + "grad_norm": 0.67578125, + "learning_rate": 0.0001150604966016521, + "loss": 0.785, + "step": 30438 + }, + { + "epoch": 0.7815881666642595, + "grad_norm": 0.828125, + "learning_rate": 0.00011505608330353638, + "loss": 0.8212, + "step": 30439 + }, + { + "epoch": 0.7816138438601813, + "grad_norm": 0.69140625, + "learning_rate": 0.00011505166997541525, + "loss": 0.7253, + "step": 30440 + }, + { + "epoch": 0.781639521056103, + "grad_norm": 0.74609375, + "learning_rate": 0.00011504725661729749, + "loss": 0.8434, + "step": 30441 + }, + { + "epoch": 0.7816651982520249, + "grad_norm": 0.78125, + "learning_rate": 0.0001150428432291919, + "loss": 0.8956, + "step": 30442 + }, + { + "epoch": 0.7816908754479467, + "grad_norm": 0.86328125, + "learning_rate": 0.00011503842981110731, + "loss": 0.892, + "step": 30443 + }, + { + "epoch": 0.7817165526438685, + "grad_norm": 0.8046875, + "learning_rate": 0.00011503401636305247, + "loss": 0.9032, + "step": 30444 + }, + { + "epoch": 0.7817422298397904, + "grad_norm": 0.7421875, + "learning_rate": 0.00011502960288503619, + "loss": 0.716, + "step": 30445 + }, + { + "epoch": 0.7817679070357122, + "grad_norm": 0.8046875, + "learning_rate": 0.0001150251893770673, + "loss": 0.8743, + "step": 30446 + }, + { + "epoch": 0.781793584231634, + "grad_norm": 0.75, + "learning_rate": 0.00011502077583915454, + "loss": 0.7999, + "step": 30447 + }, + { + "epoch": 0.7818192614275558, + "grad_norm": 0.83984375, + "learning_rate": 0.0001150163622713067, + "loss": 0.7325, + "step": 30448 + }, + { + "epoch": 0.7818449386234776, + "grad_norm": 0.7734375, + "learning_rate": 0.00011501194867353265, + "loss": 0.814, + "step": 30449 + }, + { + "epoch": 0.7818706158193994, + "grad_norm": 0.78125, + "learning_rate": 0.00011500753504584115, + "loss": 0.8604, + "step": 30450 + }, + { + "epoch": 0.7818962930153213, + "grad_norm": 0.79296875, + "learning_rate": 0.00011500312138824097, + "loss": 0.9909, + "step": 30451 + }, + { + "epoch": 0.7819219702112431, + "grad_norm": 0.81640625, + "learning_rate": 0.00011499870770074093, + "loss": 0.8629, + "step": 30452 + }, + { + "epoch": 0.7819476474071649, + "grad_norm": 0.859375, + "learning_rate": 0.00011499429398334978, + "loss": 0.7347, + "step": 30453 + }, + { + "epoch": 0.7819733246030868, + "grad_norm": 0.73828125, + "learning_rate": 0.0001149898802360764, + "loss": 0.8404, + "step": 30454 + }, + { + "epoch": 0.7819990017990085, + "grad_norm": 0.7890625, + "learning_rate": 0.00011498546645892953, + "loss": 0.8232, + "step": 30455 + }, + { + "epoch": 0.7820246789949303, + "grad_norm": 0.71875, + "learning_rate": 0.00011498105265191797, + "loss": 0.8056, + "step": 30456 + }, + { + "epoch": 0.7820503561908522, + "grad_norm": 0.78125, + "learning_rate": 0.00011497663881505055, + "loss": 0.7956, + "step": 30457 + }, + { + "epoch": 0.782076033386774, + "grad_norm": 0.73828125, + "learning_rate": 0.00011497222494833601, + "loss": 0.7951, + "step": 30458 + }, + { + "epoch": 0.7821017105826958, + "grad_norm": 0.9140625, + "learning_rate": 0.0001149678110517832, + "loss": 0.8656, + "step": 30459 + }, + { + "epoch": 0.7821273877786177, + "grad_norm": 0.86328125, + "learning_rate": 0.0001149633971254009, + "loss": 0.9576, + "step": 30460 + }, + { + "epoch": 0.7821530649745394, + "grad_norm": 1.078125, + "learning_rate": 0.00011495898316919789, + "loss": 0.8476, + "step": 30461 + }, + { + "epoch": 0.7821787421704612, + "grad_norm": 0.7578125, + "learning_rate": 0.00011495456918318296, + "loss": 0.9053, + "step": 30462 + }, + { + "epoch": 0.7822044193663831, + "grad_norm": 0.74609375, + "learning_rate": 0.00011495015516736496, + "loss": 0.8045, + "step": 30463 + }, + { + "epoch": 0.7822300965623049, + "grad_norm": 0.765625, + "learning_rate": 0.00011494574112175262, + "loss": 0.7231, + "step": 30464 + }, + { + "epoch": 0.7822557737582267, + "grad_norm": 0.81640625, + "learning_rate": 0.00011494132704635481, + "loss": 0.882, + "step": 30465 + }, + { + "epoch": 0.7822814509541486, + "grad_norm": 0.8359375, + "learning_rate": 0.00011493691294118025, + "loss": 0.8164, + "step": 30466 + }, + { + "epoch": 0.7823071281500704, + "grad_norm": 0.8203125, + "learning_rate": 0.00011493249880623777, + "loss": 0.7927, + "step": 30467 + }, + { + "epoch": 0.7823328053459921, + "grad_norm": 0.89453125, + "learning_rate": 0.00011492808464153619, + "loss": 0.884, + "step": 30468 + }, + { + "epoch": 0.782358482541914, + "grad_norm": 0.8984375, + "learning_rate": 0.00011492367044708425, + "loss": 0.8557, + "step": 30469 + }, + { + "epoch": 0.7823841597378358, + "grad_norm": 0.72265625, + "learning_rate": 0.00011491925622289082, + "loss": 0.7164, + "step": 30470 + }, + { + "epoch": 0.7824098369337577, + "grad_norm": 0.859375, + "learning_rate": 0.00011491484196896465, + "loss": 0.8405, + "step": 30471 + }, + { + "epoch": 0.7824355141296795, + "grad_norm": 0.8125, + "learning_rate": 0.00011491042768531454, + "loss": 0.8065, + "step": 30472 + }, + { + "epoch": 0.7824611913256013, + "grad_norm": 0.83984375, + "learning_rate": 0.0001149060133719493, + "loss": 0.7972, + "step": 30473 + }, + { + "epoch": 0.7824868685215232, + "grad_norm": 0.78515625, + "learning_rate": 0.00011490159902887772, + "loss": 0.9546, + "step": 30474 + }, + { + "epoch": 0.7825125457174449, + "grad_norm": 0.875, + "learning_rate": 0.00011489718465610861, + "loss": 1.0135, + "step": 30475 + }, + { + "epoch": 0.7825382229133667, + "grad_norm": 0.6953125, + "learning_rate": 0.00011489277025365075, + "loss": 0.7942, + "step": 30476 + }, + { + "epoch": 0.7825639001092886, + "grad_norm": 0.8515625, + "learning_rate": 0.00011488835582151292, + "loss": 0.8796, + "step": 30477 + }, + { + "epoch": 0.7825895773052104, + "grad_norm": 0.80859375, + "learning_rate": 0.00011488394135970399, + "loss": 0.8681, + "step": 30478 + }, + { + "epoch": 0.7826152545011322, + "grad_norm": 0.75, + "learning_rate": 0.00011487952686823267, + "loss": 0.7494, + "step": 30479 + }, + { + "epoch": 0.7826409316970541, + "grad_norm": 0.82421875, + "learning_rate": 0.00011487511234710781, + "loss": 0.8994, + "step": 30480 + }, + { + "epoch": 0.7826666088929758, + "grad_norm": 0.78515625, + "learning_rate": 0.00011487069779633818, + "loss": 0.7736, + "step": 30481 + }, + { + "epoch": 0.7826922860888976, + "grad_norm": 0.82421875, + "learning_rate": 0.0001148662832159326, + "loss": 0.7796, + "step": 30482 + }, + { + "epoch": 0.7827179632848195, + "grad_norm": 0.7265625, + "learning_rate": 0.00011486186860589986, + "loss": 0.8311, + "step": 30483 + }, + { + "epoch": 0.7827436404807413, + "grad_norm": 0.76953125, + "learning_rate": 0.0001148574539662488, + "loss": 0.8295, + "step": 30484 + }, + { + "epoch": 0.7827693176766631, + "grad_norm": 0.796875, + "learning_rate": 0.00011485303929698811, + "loss": 0.7103, + "step": 30485 + }, + { + "epoch": 0.782794994872585, + "grad_norm": 0.80859375, + "learning_rate": 0.00011484862459812671, + "loss": 0.8436, + "step": 30486 + }, + { + "epoch": 0.7828206720685068, + "grad_norm": 0.8359375, + "learning_rate": 0.00011484420986967333, + "loss": 0.8898, + "step": 30487 + }, + { + "epoch": 0.7828463492644285, + "grad_norm": 0.8359375, + "learning_rate": 0.00011483979511163674, + "loss": 0.9227, + "step": 30488 + }, + { + "epoch": 0.7828720264603504, + "grad_norm": 0.6953125, + "learning_rate": 0.0001148353803240258, + "loss": 0.6912, + "step": 30489 + }, + { + "epoch": 0.7828977036562722, + "grad_norm": 0.7890625, + "learning_rate": 0.00011483096550684929, + "loss": 0.9064, + "step": 30490 + }, + { + "epoch": 0.782923380852194, + "grad_norm": 0.81640625, + "learning_rate": 0.00011482655066011603, + "loss": 0.8597, + "step": 30491 + }, + { + "epoch": 0.7829490580481159, + "grad_norm": 0.8984375, + "learning_rate": 0.00011482213578383478, + "loss": 0.8583, + "step": 30492 + }, + { + "epoch": 0.7829747352440377, + "grad_norm": 0.80859375, + "learning_rate": 0.00011481772087801434, + "loss": 0.788, + "step": 30493 + }, + { + "epoch": 0.7830004124399595, + "grad_norm": 0.83984375, + "learning_rate": 0.00011481330594266352, + "loss": 0.8764, + "step": 30494 + }, + { + "epoch": 0.7830260896358813, + "grad_norm": 0.83984375, + "learning_rate": 0.00011480889097779113, + "loss": 0.8488, + "step": 30495 + }, + { + "epoch": 0.7830517668318031, + "grad_norm": 0.77734375, + "learning_rate": 0.00011480447598340594, + "loss": 0.7882, + "step": 30496 + }, + { + "epoch": 0.783077444027725, + "grad_norm": 0.78125, + "learning_rate": 0.00011480006095951681, + "loss": 0.8151, + "step": 30497 + }, + { + "epoch": 0.7831031212236468, + "grad_norm": 0.72265625, + "learning_rate": 0.00011479564590613245, + "loss": 0.7402, + "step": 30498 + }, + { + "epoch": 0.7831287984195686, + "grad_norm": 0.7265625, + "learning_rate": 0.00011479123082326173, + "loss": 0.8974, + "step": 30499 + }, + { + "epoch": 0.7831544756154905, + "grad_norm": 0.875, + "learning_rate": 0.00011478681571091342, + "loss": 0.8778, + "step": 30500 + }, + { + "epoch": 0.7831801528114122, + "grad_norm": 0.79296875, + "learning_rate": 0.00011478240056909628, + "loss": 0.8035, + "step": 30501 + }, + { + "epoch": 0.783205830007334, + "grad_norm": 0.6796875, + "learning_rate": 0.00011477798539781921, + "loss": 0.7161, + "step": 30502 + }, + { + "epoch": 0.7832315072032559, + "grad_norm": 0.765625, + "learning_rate": 0.00011477357019709091, + "loss": 0.7886, + "step": 30503 + }, + { + "epoch": 0.7832571843991777, + "grad_norm": 0.71484375, + "learning_rate": 0.00011476915496692025, + "loss": 0.6722, + "step": 30504 + }, + { + "epoch": 0.7832828615950995, + "grad_norm": 0.86328125, + "learning_rate": 0.00011476473970731598, + "loss": 0.7586, + "step": 30505 + }, + { + "epoch": 0.7833085387910214, + "grad_norm": 0.74609375, + "learning_rate": 0.00011476032441828692, + "loss": 0.7913, + "step": 30506 + }, + { + "epoch": 0.7833342159869432, + "grad_norm": 0.80859375, + "learning_rate": 0.00011475590909984187, + "loss": 0.8676, + "step": 30507 + }, + { + "epoch": 0.7833598931828649, + "grad_norm": 0.9453125, + "learning_rate": 0.00011475149375198961, + "loss": 0.8556, + "step": 30508 + }, + { + "epoch": 0.7833855703787868, + "grad_norm": 0.74609375, + "learning_rate": 0.00011474707837473897, + "loss": 0.7801, + "step": 30509 + }, + { + "epoch": 0.7834112475747086, + "grad_norm": 0.8125, + "learning_rate": 0.00011474266296809872, + "loss": 0.814, + "step": 30510 + }, + { + "epoch": 0.7834369247706304, + "grad_norm": 0.76953125, + "learning_rate": 0.00011473824753207772, + "loss": 0.7334, + "step": 30511 + }, + { + "epoch": 0.7834626019665523, + "grad_norm": 0.8203125, + "learning_rate": 0.00011473383206668469, + "loss": 0.7757, + "step": 30512 + }, + { + "epoch": 0.7834882791624741, + "grad_norm": 0.78125, + "learning_rate": 0.00011472941657192846, + "loss": 0.9249, + "step": 30513 + }, + { + "epoch": 0.7835139563583959, + "grad_norm": 0.859375, + "learning_rate": 0.00011472500104781782, + "loss": 0.7912, + "step": 30514 + }, + { + "epoch": 0.7835396335543177, + "grad_norm": 0.82421875, + "learning_rate": 0.00011472058549436162, + "loss": 0.7697, + "step": 30515 + }, + { + "epoch": 0.7835653107502395, + "grad_norm": 0.7421875, + "learning_rate": 0.0001147161699115686, + "loss": 0.8195, + "step": 30516 + }, + { + "epoch": 0.7835909879461613, + "grad_norm": 0.79296875, + "learning_rate": 0.0001147117542994476, + "loss": 0.7987, + "step": 30517 + }, + { + "epoch": 0.7836166651420832, + "grad_norm": 0.81640625, + "learning_rate": 0.0001147073386580074, + "loss": 0.8341, + "step": 30518 + }, + { + "epoch": 0.783642342338005, + "grad_norm": 0.8046875, + "learning_rate": 0.00011470292298725676, + "loss": 0.8134, + "step": 30519 + }, + { + "epoch": 0.7836680195339268, + "grad_norm": 0.8046875, + "learning_rate": 0.00011469850728720457, + "loss": 0.8111, + "step": 30520 + }, + { + "epoch": 0.7836936967298486, + "grad_norm": 0.74609375, + "learning_rate": 0.00011469409155785957, + "loss": 0.7731, + "step": 30521 + }, + { + "epoch": 0.7837193739257704, + "grad_norm": 0.78125, + "learning_rate": 0.00011468967579923055, + "loss": 0.8556, + "step": 30522 + }, + { + "epoch": 0.7837450511216922, + "grad_norm": 0.83203125, + "learning_rate": 0.00011468526001132636, + "loss": 0.7488, + "step": 30523 + }, + { + "epoch": 0.7837707283176141, + "grad_norm": 0.80859375, + "learning_rate": 0.00011468084419415579, + "loss": 0.8122, + "step": 30524 + }, + { + "epoch": 0.7837964055135359, + "grad_norm": 0.73828125, + "learning_rate": 0.00011467642834772758, + "loss": 0.8032, + "step": 30525 + }, + { + "epoch": 0.7838220827094577, + "grad_norm": 0.765625, + "learning_rate": 0.00011467201247205062, + "loss": 0.8714, + "step": 30526 + }, + { + "epoch": 0.7838477599053796, + "grad_norm": 0.6875, + "learning_rate": 0.00011466759656713364, + "loss": 0.7266, + "step": 30527 + }, + { + "epoch": 0.7838734371013013, + "grad_norm": 0.80859375, + "learning_rate": 0.00011466318063298544, + "loss": 0.8175, + "step": 30528 + }, + { + "epoch": 0.7838991142972231, + "grad_norm": 0.81640625, + "learning_rate": 0.00011465876466961487, + "loss": 0.9972, + "step": 30529 + }, + { + "epoch": 0.783924791493145, + "grad_norm": 0.76171875, + "learning_rate": 0.0001146543486770307, + "loss": 0.8509, + "step": 30530 + }, + { + "epoch": 0.7839504686890668, + "grad_norm": 0.7265625, + "learning_rate": 0.00011464993265524175, + "loss": 0.7778, + "step": 30531 + }, + { + "epoch": 0.7839761458849887, + "grad_norm": 0.76953125, + "learning_rate": 0.00011464551660425682, + "loss": 0.7437, + "step": 30532 + }, + { + "epoch": 0.7840018230809105, + "grad_norm": 0.82421875, + "learning_rate": 0.00011464110052408465, + "loss": 0.8375, + "step": 30533 + }, + { + "epoch": 0.7840275002768323, + "grad_norm": 0.890625, + "learning_rate": 0.00011463668441473415, + "loss": 0.8813, + "step": 30534 + }, + { + "epoch": 0.784053177472754, + "grad_norm": 0.86328125, + "learning_rate": 0.00011463226827621402, + "loss": 0.846, + "step": 30535 + }, + { + "epoch": 0.7840788546686759, + "grad_norm": 0.796875, + "learning_rate": 0.00011462785210853308, + "loss": 0.7738, + "step": 30536 + }, + { + "epoch": 0.7841045318645977, + "grad_norm": 0.76953125, + "learning_rate": 0.00011462343591170021, + "loss": 0.7869, + "step": 30537 + }, + { + "epoch": 0.7841302090605196, + "grad_norm": 0.78515625, + "learning_rate": 0.0001146190196857241, + "loss": 0.8012, + "step": 30538 + }, + { + "epoch": 0.7841558862564414, + "grad_norm": 0.75, + "learning_rate": 0.00011461460343061365, + "loss": 0.9034, + "step": 30539 + }, + { + "epoch": 0.7841815634523632, + "grad_norm": 0.78515625, + "learning_rate": 0.0001146101871463776, + "loss": 0.7308, + "step": 30540 + }, + { + "epoch": 0.784207240648285, + "grad_norm": 0.76171875, + "learning_rate": 0.00011460577083302475, + "loss": 0.8392, + "step": 30541 + }, + { + "epoch": 0.7842329178442068, + "grad_norm": 0.7890625, + "learning_rate": 0.00011460135449056393, + "loss": 0.8299, + "step": 30542 + }, + { + "epoch": 0.7842585950401286, + "grad_norm": 0.77734375, + "learning_rate": 0.00011459693811900392, + "loss": 0.7521, + "step": 30543 + }, + { + "epoch": 0.7842842722360505, + "grad_norm": 0.80078125, + "learning_rate": 0.00011459252171835355, + "loss": 0.82, + "step": 30544 + }, + { + "epoch": 0.7843099494319723, + "grad_norm": 0.82421875, + "learning_rate": 0.0001145881052886216, + "loss": 0.6407, + "step": 30545 + }, + { + "epoch": 0.7843356266278941, + "grad_norm": 0.796875, + "learning_rate": 0.00011458368882981685, + "loss": 0.7772, + "step": 30546 + }, + { + "epoch": 0.784361303823816, + "grad_norm": 0.75390625, + "learning_rate": 0.00011457927234194814, + "loss": 0.8174, + "step": 30547 + }, + { + "epoch": 0.7843869810197377, + "grad_norm": 0.78515625, + "learning_rate": 0.00011457485582502427, + "loss": 0.8461, + "step": 30548 + }, + { + "epoch": 0.7844126582156595, + "grad_norm": 0.7734375, + "learning_rate": 0.00011457043927905399, + "loss": 0.7449, + "step": 30549 + }, + { + "epoch": 0.7844383354115814, + "grad_norm": 0.8828125, + "learning_rate": 0.00011456602270404619, + "loss": 0.7863, + "step": 30550 + }, + { + "epoch": 0.7844640126075032, + "grad_norm": 0.79296875, + "learning_rate": 0.0001145616061000096, + "loss": 0.8652, + "step": 30551 + }, + { + "epoch": 0.784489689803425, + "grad_norm": 0.8515625, + "learning_rate": 0.00011455718946695304, + "loss": 0.8962, + "step": 30552 + }, + { + "epoch": 0.7845153669993469, + "grad_norm": 0.82421875, + "learning_rate": 0.00011455277280488532, + "loss": 0.7601, + "step": 30553 + }, + { + "epoch": 0.7845410441952687, + "grad_norm": 0.7890625, + "learning_rate": 0.00011454835611381522, + "loss": 0.7642, + "step": 30554 + }, + { + "epoch": 0.7845667213911904, + "grad_norm": 0.69140625, + "learning_rate": 0.00011454393939375159, + "loss": 0.6921, + "step": 30555 + }, + { + "epoch": 0.7845923985871123, + "grad_norm": 0.9140625, + "learning_rate": 0.0001145395226447032, + "loss": 0.8834, + "step": 30556 + }, + { + "epoch": 0.7846180757830341, + "grad_norm": 0.7421875, + "learning_rate": 0.00011453510586667885, + "loss": 0.8863, + "step": 30557 + }, + { + "epoch": 0.7846437529789559, + "grad_norm": 0.76171875, + "learning_rate": 0.00011453068905968736, + "loss": 0.9802, + "step": 30558 + }, + { + "epoch": 0.7846694301748778, + "grad_norm": 0.81640625, + "learning_rate": 0.00011452627222373748, + "loss": 0.7092, + "step": 30559 + }, + { + "epoch": 0.7846951073707996, + "grad_norm": 0.82421875, + "learning_rate": 0.00011452185535883808, + "loss": 0.8262, + "step": 30560 + }, + { + "epoch": 0.7847207845667213, + "grad_norm": 0.765625, + "learning_rate": 0.00011451743846499794, + "loss": 0.6468, + "step": 30561 + }, + { + "epoch": 0.7847464617626432, + "grad_norm": 0.80859375, + "learning_rate": 0.00011451302154222583, + "loss": 0.73, + "step": 30562 + }, + { + "epoch": 0.784772138958565, + "grad_norm": 0.8125, + "learning_rate": 0.00011450860459053062, + "loss": 0.848, + "step": 30563 + }, + { + "epoch": 0.7847978161544868, + "grad_norm": 0.8125, + "learning_rate": 0.00011450418760992103, + "loss": 0.869, + "step": 30564 + }, + { + "epoch": 0.7848234933504087, + "grad_norm": 0.77734375, + "learning_rate": 0.0001144997706004059, + "loss": 0.8291, + "step": 30565 + }, + { + "epoch": 0.7848491705463305, + "grad_norm": 0.875, + "learning_rate": 0.00011449535356199407, + "loss": 0.8372, + "step": 30566 + }, + { + "epoch": 0.7848748477422524, + "grad_norm": 0.765625, + "learning_rate": 0.00011449093649469429, + "loss": 0.8539, + "step": 30567 + }, + { + "epoch": 0.7849005249381741, + "grad_norm": 0.8046875, + "learning_rate": 0.00011448651939851538, + "loss": 0.8829, + "step": 30568 + }, + { + "epoch": 0.7849262021340959, + "grad_norm": 0.7578125, + "learning_rate": 0.00011448210227346616, + "loss": 0.8713, + "step": 30569 + }, + { + "epoch": 0.7849518793300178, + "grad_norm": 0.734375, + "learning_rate": 0.00011447768511955543, + "loss": 0.745, + "step": 30570 + }, + { + "epoch": 0.7849775565259396, + "grad_norm": 0.796875, + "learning_rate": 0.00011447326793679196, + "loss": 0.724, + "step": 30571 + }, + { + "epoch": 0.7850032337218614, + "grad_norm": 0.73046875, + "learning_rate": 0.00011446885072518458, + "loss": 0.8006, + "step": 30572 + }, + { + "epoch": 0.7850289109177833, + "grad_norm": 0.78125, + "learning_rate": 0.00011446443348474207, + "loss": 0.7398, + "step": 30573 + }, + { + "epoch": 0.785054588113705, + "grad_norm": 0.73046875, + "learning_rate": 0.00011446001621547328, + "loss": 0.7117, + "step": 30574 + }, + { + "epoch": 0.7850802653096268, + "grad_norm": 0.83984375, + "learning_rate": 0.00011445559891738697, + "loss": 0.9415, + "step": 30575 + }, + { + "epoch": 0.7851059425055487, + "grad_norm": 0.7109375, + "learning_rate": 0.00011445118159049197, + "loss": 0.7837, + "step": 30576 + }, + { + "epoch": 0.7851316197014705, + "grad_norm": 0.8203125, + "learning_rate": 0.00011444676423479705, + "loss": 0.7735, + "step": 30577 + }, + { + "epoch": 0.7851572968973923, + "grad_norm": 0.79296875, + "learning_rate": 0.00011444234685031103, + "loss": 0.8639, + "step": 30578 + }, + { + "epoch": 0.7851829740933142, + "grad_norm": 0.734375, + "learning_rate": 0.00011443792943704274, + "loss": 0.8213, + "step": 30579 + }, + { + "epoch": 0.785208651289236, + "grad_norm": 0.78125, + "learning_rate": 0.00011443351199500097, + "loss": 0.9146, + "step": 30580 + }, + { + "epoch": 0.7852343284851577, + "grad_norm": 0.7421875, + "learning_rate": 0.00011442909452419446, + "loss": 0.8136, + "step": 30581 + }, + { + "epoch": 0.7852600056810796, + "grad_norm": 0.71875, + "learning_rate": 0.0001144246770246321, + "loss": 0.8383, + "step": 30582 + }, + { + "epoch": 0.7852856828770014, + "grad_norm": 0.8203125, + "learning_rate": 0.00011442025949632267, + "loss": 0.8776, + "step": 30583 + }, + { + "epoch": 0.7853113600729232, + "grad_norm": 0.79296875, + "learning_rate": 0.00011441584193927497, + "loss": 0.8656, + "step": 30584 + }, + { + "epoch": 0.7853370372688451, + "grad_norm": 0.796875, + "learning_rate": 0.00011441142435349779, + "loss": 0.876, + "step": 30585 + }, + { + "epoch": 0.7853627144647669, + "grad_norm": 0.71484375, + "learning_rate": 0.00011440700673899993, + "loss": 0.6842, + "step": 30586 + }, + { + "epoch": 0.7853883916606887, + "grad_norm": 0.73046875, + "learning_rate": 0.00011440258909579022, + "loss": 0.8741, + "step": 30587 + }, + { + "epoch": 0.7854140688566105, + "grad_norm": 0.7890625, + "learning_rate": 0.00011439817142387745, + "loss": 0.7939, + "step": 30588 + }, + { + "epoch": 0.7854397460525323, + "grad_norm": 1.0546875, + "learning_rate": 0.00011439375372327043, + "loss": 0.7722, + "step": 30589 + }, + { + "epoch": 0.7854654232484541, + "grad_norm": 0.7890625, + "learning_rate": 0.00011438933599397797, + "loss": 0.8258, + "step": 30590 + }, + { + "epoch": 0.785491100444376, + "grad_norm": 0.77734375, + "learning_rate": 0.00011438491823600883, + "loss": 0.8176, + "step": 30591 + }, + { + "epoch": 0.7855167776402978, + "grad_norm": 0.75390625, + "learning_rate": 0.00011438050044937187, + "loss": 0.6891, + "step": 30592 + }, + { + "epoch": 0.7855424548362197, + "grad_norm": 0.84375, + "learning_rate": 0.00011437608263407587, + "loss": 0.8792, + "step": 30593 + }, + { + "epoch": 0.7855681320321414, + "grad_norm": 0.8359375, + "learning_rate": 0.00011437166479012962, + "loss": 0.8256, + "step": 30594 + }, + { + "epoch": 0.7855938092280632, + "grad_norm": 0.82421875, + "learning_rate": 0.00011436724691754195, + "loss": 0.9401, + "step": 30595 + }, + { + "epoch": 0.785619486423985, + "grad_norm": 0.8671875, + "learning_rate": 0.00011436282901632164, + "loss": 0.7258, + "step": 30596 + }, + { + "epoch": 0.7856451636199069, + "grad_norm": 0.796875, + "learning_rate": 0.00011435841108647755, + "loss": 0.7893, + "step": 30597 + }, + { + "epoch": 0.7856708408158287, + "grad_norm": 0.76171875, + "learning_rate": 0.00011435399312801842, + "loss": 0.9096, + "step": 30598 + }, + { + "epoch": 0.7856965180117506, + "grad_norm": 0.8203125, + "learning_rate": 0.00011434957514095306, + "loss": 0.8173, + "step": 30599 + }, + { + "epoch": 0.7857221952076724, + "grad_norm": 0.7890625, + "learning_rate": 0.00011434515712529032, + "loss": 0.762, + "step": 30600 + }, + { + "epoch": 0.7857478724035941, + "grad_norm": 0.74609375, + "learning_rate": 0.00011434073908103897, + "loss": 0.7034, + "step": 30601 + }, + { + "epoch": 0.785773549599516, + "grad_norm": 0.78125, + "learning_rate": 0.00011433632100820782, + "loss": 0.903, + "step": 30602 + }, + { + "epoch": 0.7857992267954378, + "grad_norm": 0.74609375, + "learning_rate": 0.00011433190290680569, + "loss": 0.8489, + "step": 30603 + }, + { + "epoch": 0.7858249039913596, + "grad_norm": 0.71875, + "learning_rate": 0.00011432748477684134, + "loss": 0.719, + "step": 30604 + }, + { + "epoch": 0.7858505811872815, + "grad_norm": 0.8125, + "learning_rate": 0.00011432306661832363, + "loss": 0.7829, + "step": 30605 + }, + { + "epoch": 0.7858762583832033, + "grad_norm": 0.74609375, + "learning_rate": 0.00011431864843126135, + "loss": 0.7262, + "step": 30606 + }, + { + "epoch": 0.7859019355791251, + "grad_norm": 0.74609375, + "learning_rate": 0.00011431423021566328, + "loss": 0.7485, + "step": 30607 + }, + { + "epoch": 0.7859276127750469, + "grad_norm": 0.78515625, + "learning_rate": 0.00011430981197153824, + "loss": 0.8099, + "step": 30608 + }, + { + "epoch": 0.7859532899709687, + "grad_norm": 0.70703125, + "learning_rate": 0.00011430539369889508, + "loss": 0.7544, + "step": 30609 + }, + { + "epoch": 0.7859789671668905, + "grad_norm": 0.6875, + "learning_rate": 0.00011430097539774252, + "loss": 0.7426, + "step": 30610 + }, + { + "epoch": 0.7860046443628124, + "grad_norm": 0.796875, + "learning_rate": 0.00011429655706808939, + "loss": 0.8056, + "step": 30611 + }, + { + "epoch": 0.7860303215587342, + "grad_norm": 0.76171875, + "learning_rate": 0.00011429213870994455, + "loss": 0.7955, + "step": 30612 + }, + { + "epoch": 0.786055998754656, + "grad_norm": 0.87109375, + "learning_rate": 0.00011428772032331677, + "loss": 0.8276, + "step": 30613 + }, + { + "epoch": 0.7860816759505778, + "grad_norm": 0.796875, + "learning_rate": 0.00011428330190821484, + "loss": 0.7607, + "step": 30614 + }, + { + "epoch": 0.7861073531464996, + "grad_norm": 0.76953125, + "learning_rate": 0.00011427888346464754, + "loss": 0.8384, + "step": 30615 + }, + { + "epoch": 0.7861330303424214, + "grad_norm": 0.78515625, + "learning_rate": 0.0001142744649926238, + "loss": 0.775, + "step": 30616 + }, + { + "epoch": 0.7861587075383433, + "grad_norm": 0.734375, + "learning_rate": 0.00011427004649215229, + "loss": 0.8106, + "step": 30617 + }, + { + "epoch": 0.7861843847342651, + "grad_norm": 0.7890625, + "learning_rate": 0.00011426562796324188, + "loss": 0.8585, + "step": 30618 + }, + { + "epoch": 0.7862100619301869, + "grad_norm": 0.75, + "learning_rate": 0.00011426120940590135, + "loss": 0.8746, + "step": 30619 + }, + { + "epoch": 0.7862357391261088, + "grad_norm": 0.6875, + "learning_rate": 0.00011425679082013952, + "loss": 0.766, + "step": 30620 + }, + { + "epoch": 0.7862614163220305, + "grad_norm": 0.80078125, + "learning_rate": 0.00011425237220596519, + "loss": 0.9759, + "step": 30621 + }, + { + "epoch": 0.7862870935179523, + "grad_norm": 0.7421875, + "learning_rate": 0.0001142479535633872, + "loss": 0.8661, + "step": 30622 + }, + { + "epoch": 0.7863127707138742, + "grad_norm": 0.78515625, + "learning_rate": 0.00011424353489241428, + "loss": 0.8124, + "step": 30623 + }, + { + "epoch": 0.786338447909796, + "grad_norm": 0.8046875, + "learning_rate": 0.0001142391161930553, + "loss": 0.8323, + "step": 30624 + }, + { + "epoch": 0.7863641251057178, + "grad_norm": 0.859375, + "learning_rate": 0.00011423469746531905, + "loss": 0.7543, + "step": 30625 + }, + { + "epoch": 0.7863898023016397, + "grad_norm": 0.765625, + "learning_rate": 0.00011423027870921433, + "loss": 0.7656, + "step": 30626 + }, + { + "epoch": 0.7864154794975615, + "grad_norm": 0.8203125, + "learning_rate": 0.00011422585992474996, + "loss": 0.8155, + "step": 30627 + }, + { + "epoch": 0.7864411566934832, + "grad_norm": 0.76953125, + "learning_rate": 0.00011422144111193473, + "loss": 0.9489, + "step": 30628 + }, + { + "epoch": 0.7864668338894051, + "grad_norm": 0.7421875, + "learning_rate": 0.00011421702227077746, + "loss": 0.8063, + "step": 30629 + }, + { + "epoch": 0.7864925110853269, + "grad_norm": 0.82421875, + "learning_rate": 0.00011421260340128697, + "loss": 0.9321, + "step": 30630 + }, + { + "epoch": 0.7865181882812488, + "grad_norm": 0.765625, + "learning_rate": 0.00011420818450347199, + "loss": 0.8717, + "step": 30631 + }, + { + "epoch": 0.7865438654771706, + "grad_norm": 0.81640625, + "learning_rate": 0.00011420376557734141, + "loss": 0.8555, + "step": 30632 + }, + { + "epoch": 0.7865695426730924, + "grad_norm": 0.74609375, + "learning_rate": 0.00011419934662290401, + "loss": 0.797, + "step": 30633 + }, + { + "epoch": 0.7865952198690142, + "grad_norm": 0.8359375, + "learning_rate": 0.00011419492764016858, + "loss": 0.934, + "step": 30634 + }, + { + "epoch": 0.786620897064936, + "grad_norm": 0.8359375, + "learning_rate": 0.00011419050862914397, + "loss": 0.7662, + "step": 30635 + }, + { + "epoch": 0.7866465742608578, + "grad_norm": 0.76171875, + "learning_rate": 0.00011418608958983895, + "loss": 0.7304, + "step": 30636 + }, + { + "epoch": 0.7866722514567797, + "grad_norm": 0.76171875, + "learning_rate": 0.00011418167052226232, + "loss": 0.8735, + "step": 30637 + }, + { + "epoch": 0.7866979286527015, + "grad_norm": 0.80859375, + "learning_rate": 0.00011417725142642291, + "loss": 0.8421, + "step": 30638 + }, + { + "epoch": 0.7867236058486233, + "grad_norm": 0.85546875, + "learning_rate": 0.00011417283230232949, + "loss": 0.8847, + "step": 30639 + }, + { + "epoch": 0.7867492830445452, + "grad_norm": 0.84375, + "learning_rate": 0.00011416841314999094, + "loss": 0.8643, + "step": 30640 + }, + { + "epoch": 0.7867749602404669, + "grad_norm": 0.72265625, + "learning_rate": 0.000114163993969416, + "loss": 0.813, + "step": 30641 + }, + { + "epoch": 0.7868006374363887, + "grad_norm": 0.765625, + "learning_rate": 0.00011415957476061352, + "loss": 0.8844, + "step": 30642 + }, + { + "epoch": 0.7868263146323106, + "grad_norm": 0.75, + "learning_rate": 0.00011415515552359226, + "loss": 0.8507, + "step": 30643 + }, + { + "epoch": 0.7868519918282324, + "grad_norm": 0.8359375, + "learning_rate": 0.00011415073625836105, + "loss": 0.7927, + "step": 30644 + }, + { + "epoch": 0.7868776690241542, + "grad_norm": 0.79296875, + "learning_rate": 0.00011414631696492872, + "loss": 0.688, + "step": 30645 + }, + { + "epoch": 0.7869033462200761, + "grad_norm": 0.84375, + "learning_rate": 0.00011414189764330408, + "loss": 0.7382, + "step": 30646 + }, + { + "epoch": 0.7869290234159979, + "grad_norm": 0.83203125, + "learning_rate": 0.00011413747829349586, + "loss": 0.8817, + "step": 30647 + }, + { + "epoch": 0.7869547006119196, + "grad_norm": 0.73828125, + "learning_rate": 0.00011413305891551296, + "loss": 0.7377, + "step": 30648 + }, + { + "epoch": 0.7869803778078415, + "grad_norm": 0.80859375, + "learning_rate": 0.00011412863950936417, + "loss": 0.976, + "step": 30649 + }, + { + "epoch": 0.7870060550037633, + "grad_norm": 1.2265625, + "learning_rate": 0.00011412422007505823, + "loss": 0.8877, + "step": 30650 + }, + { + "epoch": 0.7870317321996851, + "grad_norm": 0.7734375, + "learning_rate": 0.00011411980061260401, + "loss": 0.7825, + "step": 30651 + }, + { + "epoch": 0.787057409395607, + "grad_norm": 0.76171875, + "learning_rate": 0.00011411538112201032, + "loss": 0.7715, + "step": 30652 + }, + { + "epoch": 0.7870830865915288, + "grad_norm": 0.80078125, + "learning_rate": 0.00011411096160328592, + "loss": 0.8298, + "step": 30653 + }, + { + "epoch": 0.7871087637874505, + "grad_norm": 0.69140625, + "learning_rate": 0.00011410654205643967, + "loss": 0.7316, + "step": 30654 + }, + { + "epoch": 0.7871344409833724, + "grad_norm": 0.7578125, + "learning_rate": 0.00011410212248148036, + "loss": 0.7649, + "step": 30655 + }, + { + "epoch": 0.7871601181792942, + "grad_norm": 0.67578125, + "learning_rate": 0.0001140977028784168, + "loss": 0.69, + "step": 30656 + }, + { + "epoch": 0.787185795375216, + "grad_norm": 0.78125, + "learning_rate": 0.00011409328324725777, + "loss": 0.7628, + "step": 30657 + }, + { + "epoch": 0.7872114725711379, + "grad_norm": 0.8203125, + "learning_rate": 0.00011408886358801209, + "loss": 0.7696, + "step": 30658 + }, + { + "epoch": 0.7872371497670597, + "grad_norm": 0.75, + "learning_rate": 0.0001140844439006886, + "loss": 0.7457, + "step": 30659 + }, + { + "epoch": 0.7872628269629816, + "grad_norm": 0.80078125, + "learning_rate": 0.00011408002418529609, + "loss": 0.9355, + "step": 30660 + }, + { + "epoch": 0.7872885041589033, + "grad_norm": 0.80859375, + "learning_rate": 0.00011407560444184333, + "loss": 0.8644, + "step": 30661 + }, + { + "epoch": 0.7873141813548251, + "grad_norm": 0.76171875, + "learning_rate": 0.00011407118467033921, + "loss": 0.7435, + "step": 30662 + }, + { + "epoch": 0.787339858550747, + "grad_norm": 0.859375, + "learning_rate": 0.00011406676487079246, + "loss": 0.8137, + "step": 30663 + }, + { + "epoch": 0.7873655357466688, + "grad_norm": 0.7421875, + "learning_rate": 0.00011406234504321193, + "loss": 0.772, + "step": 30664 + }, + { + "epoch": 0.7873912129425906, + "grad_norm": 0.79296875, + "learning_rate": 0.00011405792518760641, + "loss": 0.8802, + "step": 30665 + }, + { + "epoch": 0.7874168901385125, + "grad_norm": 0.8125, + "learning_rate": 0.0001140535053039847, + "loss": 0.7791, + "step": 30666 + }, + { + "epoch": 0.7874425673344343, + "grad_norm": 0.76171875, + "learning_rate": 0.00011404908539235564, + "loss": 0.7547, + "step": 30667 + }, + { + "epoch": 0.787468244530356, + "grad_norm": 0.8046875, + "learning_rate": 0.00011404466545272803, + "loss": 0.8372, + "step": 30668 + }, + { + "epoch": 0.7874939217262779, + "grad_norm": 0.76953125, + "learning_rate": 0.00011404024548511066, + "loss": 0.744, + "step": 30669 + }, + { + "epoch": 0.7875195989221997, + "grad_norm": 0.79296875, + "learning_rate": 0.00011403582548951237, + "loss": 0.8175, + "step": 30670 + }, + { + "epoch": 0.7875452761181215, + "grad_norm": 0.77734375, + "learning_rate": 0.00011403140546594192, + "loss": 0.7415, + "step": 30671 + }, + { + "epoch": 0.7875709533140434, + "grad_norm": 0.75390625, + "learning_rate": 0.00011402698541440812, + "loss": 0.7657, + "step": 30672 + }, + { + "epoch": 0.7875966305099652, + "grad_norm": 0.78125, + "learning_rate": 0.00011402256533491986, + "loss": 0.7393, + "step": 30673 + }, + { + "epoch": 0.7876223077058869, + "grad_norm": 0.70703125, + "learning_rate": 0.00011401814522748586, + "loss": 0.9229, + "step": 30674 + }, + { + "epoch": 0.7876479849018088, + "grad_norm": 0.80078125, + "learning_rate": 0.000114013725092115, + "loss": 0.7939, + "step": 30675 + }, + { + "epoch": 0.7876736620977306, + "grad_norm": 0.8125, + "learning_rate": 0.000114009304928816, + "loss": 0.7207, + "step": 30676 + }, + { + "epoch": 0.7876993392936524, + "grad_norm": 0.765625, + "learning_rate": 0.00011400488473759776, + "loss": 0.8106, + "step": 30677 + }, + { + "epoch": 0.7877250164895743, + "grad_norm": 0.765625, + "learning_rate": 0.00011400046451846904, + "loss": 0.9373, + "step": 30678 + }, + { + "epoch": 0.7877506936854961, + "grad_norm": 0.79296875, + "learning_rate": 0.00011399604427143864, + "loss": 0.7959, + "step": 30679 + }, + { + "epoch": 0.7877763708814179, + "grad_norm": 0.84765625, + "learning_rate": 0.00011399162399651542, + "loss": 0.8679, + "step": 30680 + }, + { + "epoch": 0.7878020480773397, + "grad_norm": 0.75390625, + "learning_rate": 0.00011398720369370815, + "loss": 0.8884, + "step": 30681 + }, + { + "epoch": 0.7878277252732615, + "grad_norm": 0.7734375, + "learning_rate": 0.00011398278336302563, + "loss": 0.8736, + "step": 30682 + }, + { + "epoch": 0.7878534024691833, + "grad_norm": 0.8125, + "learning_rate": 0.00011397836300447671, + "loss": 0.9408, + "step": 30683 + }, + { + "epoch": 0.7878790796651052, + "grad_norm": 0.7265625, + "learning_rate": 0.00011397394261807015, + "loss": 0.7465, + "step": 30684 + }, + { + "epoch": 0.787904756861027, + "grad_norm": 0.83203125, + "learning_rate": 0.00011396952220381481, + "loss": 0.9032, + "step": 30685 + }, + { + "epoch": 0.7879304340569488, + "grad_norm": 0.78515625, + "learning_rate": 0.00011396510176171944, + "loss": 0.8332, + "step": 30686 + }, + { + "epoch": 0.7879561112528707, + "grad_norm": 0.7734375, + "learning_rate": 0.00011396068129179289, + "loss": 0.7414, + "step": 30687 + }, + { + "epoch": 0.7879817884487924, + "grad_norm": 1.0234375, + "learning_rate": 0.000113956260794044, + "loss": 0.8404, + "step": 30688 + }, + { + "epoch": 0.7880074656447142, + "grad_norm": 0.7890625, + "learning_rate": 0.00011395184026848154, + "loss": 0.8199, + "step": 30689 + }, + { + "epoch": 0.7880331428406361, + "grad_norm": 0.81640625, + "learning_rate": 0.00011394741971511427, + "loss": 0.6912, + "step": 30690 + }, + { + "epoch": 0.7880588200365579, + "grad_norm": 0.796875, + "learning_rate": 0.00011394299913395109, + "loss": 0.808, + "step": 30691 + }, + { + "epoch": 0.7880844972324798, + "grad_norm": 0.8359375, + "learning_rate": 0.00011393857852500074, + "loss": 0.9055, + "step": 30692 + }, + { + "epoch": 0.7881101744284016, + "grad_norm": 0.81640625, + "learning_rate": 0.00011393415788827212, + "loss": 0.8823, + "step": 30693 + }, + { + "epoch": 0.7881358516243233, + "grad_norm": 0.79296875, + "learning_rate": 0.00011392973722377393, + "loss": 0.8672, + "step": 30694 + }, + { + "epoch": 0.7881615288202451, + "grad_norm": 0.80078125, + "learning_rate": 0.00011392531653151506, + "loss": 0.7878, + "step": 30695 + }, + { + "epoch": 0.788187206016167, + "grad_norm": 0.74609375, + "learning_rate": 0.00011392089581150429, + "loss": 0.8511, + "step": 30696 + }, + { + "epoch": 0.7882128832120888, + "grad_norm": 0.765625, + "learning_rate": 0.00011391647506375042, + "loss": 0.8211, + "step": 30697 + }, + { + "epoch": 0.7882385604080107, + "grad_norm": 0.73828125, + "learning_rate": 0.00011391205428826226, + "loss": 0.803, + "step": 30698 + }, + { + "epoch": 0.7882642376039325, + "grad_norm": 0.7734375, + "learning_rate": 0.00011390763348504867, + "loss": 0.7447, + "step": 30699 + }, + { + "epoch": 0.7882899147998543, + "grad_norm": 0.7734375, + "learning_rate": 0.0001139032126541184, + "loss": 0.8143, + "step": 30700 + }, + { + "epoch": 0.788315591995776, + "grad_norm": 0.76953125, + "learning_rate": 0.00011389879179548031, + "loss": 1.016, + "step": 30701 + }, + { + "epoch": 0.7883412691916979, + "grad_norm": 0.80078125, + "learning_rate": 0.00011389437090914314, + "loss": 0.8477, + "step": 30702 + }, + { + "epoch": 0.7883669463876197, + "grad_norm": 0.8203125, + "learning_rate": 0.00011388994999511576, + "loss": 0.85, + "step": 30703 + }, + { + "epoch": 0.7883926235835416, + "grad_norm": 0.78515625, + "learning_rate": 0.00011388552905340697, + "loss": 0.7348, + "step": 30704 + }, + { + "epoch": 0.7884183007794634, + "grad_norm": 0.734375, + "learning_rate": 0.00011388110808402559, + "loss": 0.7796, + "step": 30705 + }, + { + "epoch": 0.7884439779753852, + "grad_norm": 0.75390625, + "learning_rate": 0.00011387668708698036, + "loss": 0.6896, + "step": 30706 + }, + { + "epoch": 0.7884696551713071, + "grad_norm": 0.82421875, + "learning_rate": 0.00011387226606228021, + "loss": 0.8857, + "step": 30707 + }, + { + "epoch": 0.7884953323672288, + "grad_norm": 0.7109375, + "learning_rate": 0.00011386784500993386, + "loss": 0.7492, + "step": 30708 + }, + { + "epoch": 0.7885210095631506, + "grad_norm": 0.77734375, + "learning_rate": 0.00011386342392995015, + "loss": 0.857, + "step": 30709 + }, + { + "epoch": 0.7885466867590725, + "grad_norm": 0.7578125, + "learning_rate": 0.0001138590028223379, + "loss": 0.9205, + "step": 30710 + }, + { + "epoch": 0.7885723639549943, + "grad_norm": 0.7890625, + "learning_rate": 0.00011385458168710588, + "loss": 0.8628, + "step": 30711 + }, + { + "epoch": 0.7885980411509161, + "grad_norm": 0.89453125, + "learning_rate": 0.00011385016052426294, + "loss": 0.935, + "step": 30712 + }, + { + "epoch": 0.788623718346838, + "grad_norm": 0.78125, + "learning_rate": 0.0001138457393338179, + "loss": 0.7508, + "step": 30713 + }, + { + "epoch": 0.7886493955427597, + "grad_norm": 0.74609375, + "learning_rate": 0.00011384131811577954, + "loss": 0.8118, + "step": 30714 + }, + { + "epoch": 0.7886750727386815, + "grad_norm": 0.80859375, + "learning_rate": 0.00011383689687015669, + "loss": 0.6798, + "step": 30715 + }, + { + "epoch": 0.7887007499346034, + "grad_norm": 0.76171875, + "learning_rate": 0.00011383247559695813, + "loss": 0.7223, + "step": 30716 + }, + { + "epoch": 0.7887264271305252, + "grad_norm": 0.79296875, + "learning_rate": 0.00011382805429619273, + "loss": 0.8957, + "step": 30717 + }, + { + "epoch": 0.788752104326447, + "grad_norm": 0.73046875, + "learning_rate": 0.00011382363296786924, + "loss": 0.7034, + "step": 30718 + }, + { + "epoch": 0.7887777815223689, + "grad_norm": 0.81640625, + "learning_rate": 0.0001138192116119965, + "loss": 0.87, + "step": 30719 + }, + { + "epoch": 0.7888034587182907, + "grad_norm": 0.7734375, + "learning_rate": 0.00011381479022858333, + "loss": 0.8194, + "step": 30720 + }, + { + "epoch": 0.7888291359142124, + "grad_norm": 0.796875, + "learning_rate": 0.00011381036881763854, + "loss": 0.7853, + "step": 30721 + }, + { + "epoch": 0.7888548131101343, + "grad_norm": 0.83203125, + "learning_rate": 0.00011380594737917092, + "loss": 0.7397, + "step": 30722 + }, + { + "epoch": 0.7888804903060561, + "grad_norm": 0.78515625, + "learning_rate": 0.00011380152591318929, + "loss": 0.8172, + "step": 30723 + }, + { + "epoch": 0.788906167501978, + "grad_norm": 0.75390625, + "learning_rate": 0.00011379710441970247, + "loss": 0.829, + "step": 30724 + }, + { + "epoch": 0.7889318446978998, + "grad_norm": 0.796875, + "learning_rate": 0.00011379268289871925, + "loss": 0.7777, + "step": 30725 + }, + { + "epoch": 0.7889575218938216, + "grad_norm": 0.796875, + "learning_rate": 0.00011378826135024849, + "loss": 0.8149, + "step": 30726 + }, + { + "epoch": 0.7889831990897435, + "grad_norm": 0.78515625, + "learning_rate": 0.00011378383977429893, + "loss": 0.81, + "step": 30727 + }, + { + "epoch": 0.7890088762856652, + "grad_norm": 0.8125, + "learning_rate": 0.0001137794181708795, + "loss": 0.8596, + "step": 30728 + }, + { + "epoch": 0.789034553481587, + "grad_norm": 0.8203125, + "learning_rate": 0.00011377499653999886, + "loss": 0.9267, + "step": 30729 + }, + { + "epoch": 0.7890602306775089, + "grad_norm": 0.71484375, + "learning_rate": 0.00011377057488166592, + "loss": 0.8288, + "step": 30730 + }, + { + "epoch": 0.7890859078734307, + "grad_norm": 0.79296875, + "learning_rate": 0.00011376615319588948, + "loss": 0.871, + "step": 30731 + }, + { + "epoch": 0.7891115850693525, + "grad_norm": 0.8046875, + "learning_rate": 0.0001137617314826783, + "loss": 0.9605, + "step": 30732 + }, + { + "epoch": 0.7891372622652744, + "grad_norm": 0.7109375, + "learning_rate": 0.00011375730974204126, + "loss": 0.8379, + "step": 30733 + }, + { + "epoch": 0.7891629394611961, + "grad_norm": 0.828125, + "learning_rate": 0.00011375288797398718, + "loss": 0.7914, + "step": 30734 + }, + { + "epoch": 0.7891886166571179, + "grad_norm": 0.77734375, + "learning_rate": 0.00011374846617852479, + "loss": 0.8529, + "step": 30735 + }, + { + "epoch": 0.7892142938530398, + "grad_norm": 0.73046875, + "learning_rate": 0.00011374404435566297, + "loss": 0.7116, + "step": 30736 + }, + { + "epoch": 0.7892399710489616, + "grad_norm": 0.7890625, + "learning_rate": 0.0001137396225054105, + "loss": 0.845, + "step": 30737 + }, + { + "epoch": 0.7892656482448834, + "grad_norm": 0.765625, + "learning_rate": 0.00011373520062777619, + "loss": 0.8244, + "step": 30738 + }, + { + "epoch": 0.7892913254408053, + "grad_norm": 0.73046875, + "learning_rate": 0.0001137307787227689, + "loss": 0.7789, + "step": 30739 + }, + { + "epoch": 0.7893170026367271, + "grad_norm": 0.828125, + "learning_rate": 0.00011372635679039736, + "loss": 0.9257, + "step": 30740 + }, + { + "epoch": 0.7893426798326488, + "grad_norm": 0.86328125, + "learning_rate": 0.00011372193483067048, + "loss": 0.9608, + "step": 30741 + }, + { + "epoch": 0.7893683570285707, + "grad_norm": 0.79296875, + "learning_rate": 0.00011371751284359701, + "loss": 0.885, + "step": 30742 + }, + { + "epoch": 0.7893940342244925, + "grad_norm": 0.78515625, + "learning_rate": 0.00011371309082918575, + "loss": 0.8141, + "step": 30743 + }, + { + "epoch": 0.7894197114204143, + "grad_norm": 0.76953125, + "learning_rate": 0.00011370866878744557, + "loss": 0.8615, + "step": 30744 + }, + { + "epoch": 0.7894453886163362, + "grad_norm": 0.74609375, + "learning_rate": 0.00011370424671838524, + "loss": 0.872, + "step": 30745 + }, + { + "epoch": 0.789471065812258, + "grad_norm": 0.7421875, + "learning_rate": 0.00011369982462201356, + "loss": 0.7872, + "step": 30746 + }, + { + "epoch": 0.7894967430081798, + "grad_norm": 0.80078125, + "learning_rate": 0.00011369540249833943, + "loss": 0.7566, + "step": 30747 + }, + { + "epoch": 0.7895224202041016, + "grad_norm": 0.80078125, + "learning_rate": 0.00011369098034737154, + "loss": 0.8207, + "step": 30748 + }, + { + "epoch": 0.7895480974000234, + "grad_norm": 0.828125, + "learning_rate": 0.00011368655816911879, + "loss": 0.831, + "step": 30749 + }, + { + "epoch": 0.7895737745959452, + "grad_norm": 0.84765625, + "learning_rate": 0.00011368213596358997, + "loss": 0.8124, + "step": 30750 + }, + { + "epoch": 0.7895994517918671, + "grad_norm": 0.8359375, + "learning_rate": 0.00011367771373079386, + "loss": 0.8897, + "step": 30751 + }, + { + "epoch": 0.7896251289877889, + "grad_norm": 0.7734375, + "learning_rate": 0.00011367329147073933, + "loss": 0.9315, + "step": 30752 + }, + { + "epoch": 0.7896508061837108, + "grad_norm": 0.80859375, + "learning_rate": 0.00011366886918343516, + "loss": 0.7042, + "step": 30753 + }, + { + "epoch": 0.7896764833796325, + "grad_norm": 0.71484375, + "learning_rate": 0.00011366444686889015, + "loss": 0.6921, + "step": 30754 + }, + { + "epoch": 0.7897021605755543, + "grad_norm": 0.80859375, + "learning_rate": 0.00011366002452711315, + "loss": 0.7652, + "step": 30755 + }, + { + "epoch": 0.7897278377714761, + "grad_norm": 0.7890625, + "learning_rate": 0.00011365560215811295, + "loss": 0.9018, + "step": 30756 + }, + { + "epoch": 0.789753514967398, + "grad_norm": 0.7265625, + "learning_rate": 0.00011365117976189836, + "loss": 0.8685, + "step": 30757 + }, + { + "epoch": 0.7897791921633198, + "grad_norm": 0.77734375, + "learning_rate": 0.00011364675733847823, + "loss": 0.7941, + "step": 30758 + }, + { + "epoch": 0.7898048693592417, + "grad_norm": 0.7421875, + "learning_rate": 0.00011364233488786129, + "loss": 0.8021, + "step": 30759 + }, + { + "epoch": 0.7898305465551635, + "grad_norm": 0.73046875, + "learning_rate": 0.00011363791241005648, + "loss": 0.72, + "step": 30760 + }, + { + "epoch": 0.7898562237510852, + "grad_norm": 0.76171875, + "learning_rate": 0.0001136334899050725, + "loss": 0.7969, + "step": 30761 + }, + { + "epoch": 0.789881900947007, + "grad_norm": 0.796875, + "learning_rate": 0.0001136290673729182, + "loss": 1.0157, + "step": 30762 + }, + { + "epoch": 0.7899075781429289, + "grad_norm": 0.77734375, + "learning_rate": 0.00011362464481360242, + "loss": 0.676, + "step": 30763 + }, + { + "epoch": 0.7899332553388507, + "grad_norm": 0.9296875, + "learning_rate": 0.00011362022222713394, + "loss": 0.8721, + "step": 30764 + }, + { + "epoch": 0.7899589325347726, + "grad_norm": 0.74609375, + "learning_rate": 0.0001136157996135216, + "loss": 0.7072, + "step": 30765 + }, + { + "epoch": 0.7899846097306944, + "grad_norm": 0.76953125, + "learning_rate": 0.0001136113769727742, + "loss": 0.7991, + "step": 30766 + }, + { + "epoch": 0.7900102869266162, + "grad_norm": 0.81640625, + "learning_rate": 0.00011360695430490056, + "loss": 0.7842, + "step": 30767 + }, + { + "epoch": 0.790035964122538, + "grad_norm": 0.71875, + "learning_rate": 0.00011360253160990948, + "loss": 0.8618, + "step": 30768 + }, + { + "epoch": 0.7900616413184598, + "grad_norm": 0.80859375, + "learning_rate": 0.00011359810888780977, + "loss": 0.7963, + "step": 30769 + }, + { + "epoch": 0.7900873185143816, + "grad_norm": 0.77734375, + "learning_rate": 0.00011359368613861028, + "loss": 0.7295, + "step": 30770 + }, + { + "epoch": 0.7901129957103035, + "grad_norm": 0.7421875, + "learning_rate": 0.00011358926336231979, + "loss": 0.9164, + "step": 30771 + }, + { + "epoch": 0.7901386729062253, + "grad_norm": 0.78515625, + "learning_rate": 0.00011358484055894711, + "loss": 0.7819, + "step": 30772 + }, + { + "epoch": 0.7901643501021471, + "grad_norm": 0.7890625, + "learning_rate": 0.0001135804177285011, + "loss": 0.8444, + "step": 30773 + }, + { + "epoch": 0.7901900272980689, + "grad_norm": 0.76953125, + "learning_rate": 0.00011357599487099056, + "loss": 0.755, + "step": 30774 + }, + { + "epoch": 0.7902157044939907, + "grad_norm": 0.76171875, + "learning_rate": 0.00011357157198642424, + "loss": 0.8381, + "step": 30775 + }, + { + "epoch": 0.7902413816899125, + "grad_norm": 0.75390625, + "learning_rate": 0.00011356714907481103, + "loss": 0.8207, + "step": 30776 + }, + { + "epoch": 0.7902670588858344, + "grad_norm": 0.7734375, + "learning_rate": 0.00011356272613615969, + "loss": 0.8288, + "step": 30777 + }, + { + "epoch": 0.7902927360817562, + "grad_norm": 0.8828125, + "learning_rate": 0.00011355830317047909, + "loss": 0.8836, + "step": 30778 + }, + { + "epoch": 0.790318413277678, + "grad_norm": 0.84765625, + "learning_rate": 0.00011355388017777802, + "loss": 0.8315, + "step": 30779 + }, + { + "epoch": 0.7903440904735999, + "grad_norm": 0.80859375, + "learning_rate": 0.00011354945715806526, + "loss": 0.7951, + "step": 30780 + }, + { + "epoch": 0.7903697676695216, + "grad_norm": 0.82421875, + "learning_rate": 0.00011354503411134969, + "loss": 0.9045, + "step": 30781 + }, + { + "epoch": 0.7903954448654434, + "grad_norm": 0.75, + "learning_rate": 0.00011354061103764008, + "loss": 0.9238, + "step": 30782 + }, + { + "epoch": 0.7904211220613653, + "grad_norm": 0.78515625, + "learning_rate": 0.00011353618793694525, + "loss": 0.8588, + "step": 30783 + }, + { + "epoch": 0.7904467992572871, + "grad_norm": 0.73828125, + "learning_rate": 0.00011353176480927401, + "loss": 0.7932, + "step": 30784 + }, + { + "epoch": 0.790472476453209, + "grad_norm": 0.78515625, + "learning_rate": 0.0001135273416546352, + "loss": 0.7845, + "step": 30785 + }, + { + "epoch": 0.7904981536491308, + "grad_norm": 0.7734375, + "learning_rate": 0.00011352291847303762, + "loss": 0.7196, + "step": 30786 + }, + { + "epoch": 0.7905238308450525, + "grad_norm": 0.8984375, + "learning_rate": 0.0001135184952644901, + "loss": 0.8941, + "step": 30787 + }, + { + "epoch": 0.7905495080409743, + "grad_norm": 0.78515625, + "learning_rate": 0.00011351407202900141, + "loss": 0.8181, + "step": 30788 + }, + { + "epoch": 0.7905751852368962, + "grad_norm": 0.80078125, + "learning_rate": 0.00011350964876658042, + "loss": 0.9779, + "step": 30789 + }, + { + "epoch": 0.790600862432818, + "grad_norm": 0.765625, + "learning_rate": 0.0001135052254772359, + "loss": 0.7217, + "step": 30790 + }, + { + "epoch": 0.7906265396287399, + "grad_norm": 0.8359375, + "learning_rate": 0.00011350080216097668, + "loss": 0.915, + "step": 30791 + }, + { + "epoch": 0.7906522168246617, + "grad_norm": 0.75390625, + "learning_rate": 0.00011349637881781159, + "loss": 0.7524, + "step": 30792 + }, + { + "epoch": 0.7906778940205835, + "grad_norm": 0.80859375, + "learning_rate": 0.00011349195544774944, + "loss": 0.7629, + "step": 30793 + }, + { + "epoch": 0.7907035712165053, + "grad_norm": 0.81640625, + "learning_rate": 0.00011348753205079905, + "loss": 0.8554, + "step": 30794 + }, + { + "epoch": 0.7907292484124271, + "grad_norm": 0.80859375, + "learning_rate": 0.00011348310862696922, + "loss": 0.8311, + "step": 30795 + }, + { + "epoch": 0.7907549256083489, + "grad_norm": 0.73828125, + "learning_rate": 0.00011347868517626876, + "loss": 0.6811, + "step": 30796 + }, + { + "epoch": 0.7907806028042708, + "grad_norm": 0.7265625, + "learning_rate": 0.00011347426169870652, + "loss": 0.7953, + "step": 30797 + }, + { + "epoch": 0.7908062800001926, + "grad_norm": 0.80078125, + "learning_rate": 0.00011346983819429129, + "loss": 0.6943, + "step": 30798 + }, + { + "epoch": 0.7908319571961144, + "grad_norm": 0.765625, + "learning_rate": 0.00011346541466303187, + "loss": 0.7645, + "step": 30799 + }, + { + "epoch": 0.7908576343920363, + "grad_norm": 0.78125, + "learning_rate": 0.00011346099110493714, + "loss": 0.7841, + "step": 30800 + }, + { + "epoch": 0.790883311587958, + "grad_norm": 0.7421875, + "learning_rate": 0.00011345656752001581, + "loss": 0.7796, + "step": 30801 + }, + { + "epoch": 0.7909089887838798, + "grad_norm": 0.78515625, + "learning_rate": 0.00011345214390827679, + "loss": 0.8259, + "step": 30802 + }, + { + "epoch": 0.7909346659798017, + "grad_norm": 0.8203125, + "learning_rate": 0.00011344772026972886, + "loss": 0.8854, + "step": 30803 + }, + { + "epoch": 0.7909603431757235, + "grad_norm": 0.77734375, + "learning_rate": 0.00011344329660438082, + "loss": 0.7493, + "step": 30804 + }, + { + "epoch": 0.7909860203716453, + "grad_norm": 0.8671875, + "learning_rate": 0.00011343887291224151, + "loss": 0.8491, + "step": 30805 + }, + { + "epoch": 0.7910116975675672, + "grad_norm": 0.8515625, + "learning_rate": 0.00011343444919331976, + "loss": 0.8651, + "step": 30806 + }, + { + "epoch": 0.7910373747634889, + "grad_norm": 0.7734375, + "learning_rate": 0.00011343002544762436, + "loss": 0.8097, + "step": 30807 + }, + { + "epoch": 0.7910630519594107, + "grad_norm": 0.88671875, + "learning_rate": 0.00011342560167516414, + "loss": 0.8246, + "step": 30808 + }, + { + "epoch": 0.7910887291553326, + "grad_norm": 0.80078125, + "learning_rate": 0.00011342117787594788, + "loss": 0.8333, + "step": 30809 + }, + { + "epoch": 0.7911144063512544, + "grad_norm": 0.7578125, + "learning_rate": 0.00011341675404998444, + "loss": 0.7551, + "step": 30810 + }, + { + "epoch": 0.7911400835471762, + "grad_norm": 0.703125, + "learning_rate": 0.00011341233019728263, + "loss": 0.7732, + "step": 30811 + }, + { + "epoch": 0.7911657607430981, + "grad_norm": 0.80078125, + "learning_rate": 0.00011340790631785123, + "loss": 0.8054, + "step": 30812 + }, + { + "epoch": 0.7911914379390199, + "grad_norm": 0.76953125, + "learning_rate": 0.00011340348241169914, + "loss": 0.8531, + "step": 30813 + }, + { + "epoch": 0.7912171151349416, + "grad_norm": 0.8359375, + "learning_rate": 0.00011339905847883507, + "loss": 0.865, + "step": 30814 + }, + { + "epoch": 0.7912427923308635, + "grad_norm": 0.82421875, + "learning_rate": 0.00011339463451926789, + "loss": 0.8264, + "step": 30815 + }, + { + "epoch": 0.7912684695267853, + "grad_norm": 0.8125, + "learning_rate": 0.00011339021053300643, + "loss": 1.0226, + "step": 30816 + }, + { + "epoch": 0.7912941467227071, + "grad_norm": 0.8203125, + "learning_rate": 0.00011338578652005946, + "loss": 0.7977, + "step": 30817 + }, + { + "epoch": 0.791319823918629, + "grad_norm": 0.8125, + "learning_rate": 0.00011338136248043585, + "loss": 0.8096, + "step": 30818 + }, + { + "epoch": 0.7913455011145508, + "grad_norm": 0.8359375, + "learning_rate": 0.00011337693841414441, + "loss": 0.7039, + "step": 30819 + }, + { + "epoch": 0.7913711783104727, + "grad_norm": 0.8359375, + "learning_rate": 0.00011337251432119394, + "loss": 0.7547, + "step": 30820 + }, + { + "epoch": 0.7913968555063944, + "grad_norm": 0.765625, + "learning_rate": 0.00011336809020159323, + "loss": 0.8874, + "step": 30821 + }, + { + "epoch": 0.7914225327023162, + "grad_norm": 0.71484375, + "learning_rate": 0.00011336366605535114, + "loss": 0.6995, + "step": 30822 + }, + { + "epoch": 0.791448209898238, + "grad_norm": 0.72265625, + "learning_rate": 0.00011335924188247645, + "loss": 0.8522, + "step": 30823 + }, + { + "epoch": 0.7914738870941599, + "grad_norm": 0.8203125, + "learning_rate": 0.00011335481768297802, + "loss": 0.8153, + "step": 30824 + }, + { + "epoch": 0.7914995642900817, + "grad_norm": 0.73046875, + "learning_rate": 0.00011335039345686464, + "loss": 0.8536, + "step": 30825 + }, + { + "epoch": 0.7915252414860036, + "grad_norm": 0.78515625, + "learning_rate": 0.00011334596920414513, + "loss": 0.7622, + "step": 30826 + }, + { + "epoch": 0.7915509186819253, + "grad_norm": 0.78125, + "learning_rate": 0.00011334154492482831, + "loss": 0.8693, + "step": 30827 + }, + { + "epoch": 0.7915765958778471, + "grad_norm": 0.7734375, + "learning_rate": 0.00011333712061892299, + "loss": 0.8417, + "step": 30828 + }, + { + "epoch": 0.791602273073769, + "grad_norm": 0.76171875, + "learning_rate": 0.00011333269628643799, + "loss": 0.8273, + "step": 30829 + }, + { + "epoch": 0.7916279502696908, + "grad_norm": 0.82421875, + "learning_rate": 0.00011332827192738215, + "loss": 0.9534, + "step": 30830 + }, + { + "epoch": 0.7916536274656126, + "grad_norm": 0.83203125, + "learning_rate": 0.00011332384754176424, + "loss": 0.8581, + "step": 30831 + }, + { + "epoch": 0.7916793046615345, + "grad_norm": 0.765625, + "learning_rate": 0.00011331942312959312, + "loss": 0.8654, + "step": 30832 + }, + { + "epoch": 0.7917049818574563, + "grad_norm": 0.8203125, + "learning_rate": 0.0001133149986908776, + "loss": 0.7018, + "step": 30833 + }, + { + "epoch": 0.791730659053378, + "grad_norm": 0.78515625, + "learning_rate": 0.00011331057422562649, + "loss": 0.9075, + "step": 30834 + }, + { + "epoch": 0.7917563362492999, + "grad_norm": 0.7578125, + "learning_rate": 0.00011330614973384861, + "loss": 0.7739, + "step": 30835 + }, + { + "epoch": 0.7917820134452217, + "grad_norm": 0.79296875, + "learning_rate": 0.00011330172521555276, + "loss": 0.864, + "step": 30836 + }, + { + "epoch": 0.7918076906411435, + "grad_norm": 0.77734375, + "learning_rate": 0.00011329730067074778, + "loss": 0.8792, + "step": 30837 + }, + { + "epoch": 0.7918333678370654, + "grad_norm": 0.81640625, + "learning_rate": 0.00011329287609944248, + "loss": 0.7787, + "step": 30838 + }, + { + "epoch": 0.7918590450329872, + "grad_norm": 0.79296875, + "learning_rate": 0.00011328845150164569, + "loss": 0.7741, + "step": 30839 + }, + { + "epoch": 0.791884722228909, + "grad_norm": 0.79296875, + "learning_rate": 0.00011328402687736621, + "loss": 0.8154, + "step": 30840 + }, + { + "epoch": 0.7919103994248308, + "grad_norm": 0.76171875, + "learning_rate": 0.00011327960222661284, + "loss": 0.7114, + "step": 30841 + }, + { + "epoch": 0.7919360766207526, + "grad_norm": 0.8125, + "learning_rate": 0.00011327517754939445, + "loss": 0.8435, + "step": 30842 + }, + { + "epoch": 0.7919617538166744, + "grad_norm": 0.73828125, + "learning_rate": 0.00011327075284571984, + "loss": 0.732, + "step": 30843 + }, + { + "epoch": 0.7919874310125963, + "grad_norm": 0.84765625, + "learning_rate": 0.00011326632811559778, + "loss": 0.856, + "step": 30844 + }, + { + "epoch": 0.7920131082085181, + "grad_norm": 0.79296875, + "learning_rate": 0.00011326190335903715, + "loss": 0.7687, + "step": 30845 + }, + { + "epoch": 0.79203878540444, + "grad_norm": 0.82421875, + "learning_rate": 0.00011325747857604676, + "loss": 0.7779, + "step": 30846 + }, + { + "epoch": 0.7920644626003617, + "grad_norm": 0.78515625, + "learning_rate": 0.0001132530537666354, + "loss": 0.753, + "step": 30847 + }, + { + "epoch": 0.7920901397962835, + "grad_norm": 0.73828125, + "learning_rate": 0.0001132486289308119, + "loss": 0.7507, + "step": 30848 + }, + { + "epoch": 0.7921158169922053, + "grad_norm": 0.8203125, + "learning_rate": 0.00011324420406858504, + "loss": 0.8298, + "step": 30849 + }, + { + "epoch": 0.7921414941881272, + "grad_norm": 0.85546875, + "learning_rate": 0.00011323977917996373, + "loss": 0.7341, + "step": 30850 + }, + { + "epoch": 0.792167171384049, + "grad_norm": 0.75390625, + "learning_rate": 0.00011323535426495672, + "loss": 0.7068, + "step": 30851 + }, + { + "epoch": 0.7921928485799709, + "grad_norm": 0.84375, + "learning_rate": 0.00011323092932357285, + "loss": 0.986, + "step": 30852 + }, + { + "epoch": 0.7922185257758927, + "grad_norm": 0.75390625, + "learning_rate": 0.00011322650435582095, + "loss": 0.8436, + "step": 30853 + }, + { + "epoch": 0.7922442029718144, + "grad_norm": 0.9375, + "learning_rate": 0.00011322207936170977, + "loss": 0.7983, + "step": 30854 + }, + { + "epoch": 0.7922698801677363, + "grad_norm": 0.7890625, + "learning_rate": 0.00011321765434124823, + "loss": 0.8183, + "step": 30855 + }, + { + "epoch": 0.7922955573636581, + "grad_norm": 0.7890625, + "learning_rate": 0.00011321322929444507, + "loss": 0.7747, + "step": 30856 + }, + { + "epoch": 0.7923212345595799, + "grad_norm": 0.734375, + "learning_rate": 0.00011320880422130913, + "loss": 0.7398, + "step": 30857 + }, + { + "epoch": 0.7923469117555018, + "grad_norm": 0.78515625, + "learning_rate": 0.00011320437912184927, + "loss": 0.7517, + "step": 30858 + }, + { + "epoch": 0.7923725889514236, + "grad_norm": 0.7578125, + "learning_rate": 0.00011319995399607429, + "loss": 0.8369, + "step": 30859 + }, + { + "epoch": 0.7923982661473454, + "grad_norm": 0.8515625, + "learning_rate": 0.00011319552884399293, + "loss": 0.7921, + "step": 30860 + }, + { + "epoch": 0.7924239433432672, + "grad_norm": 0.6953125, + "learning_rate": 0.00011319110366561412, + "loss": 0.7472, + "step": 30861 + }, + { + "epoch": 0.792449620539189, + "grad_norm": 0.74609375, + "learning_rate": 0.00011318667846094661, + "loss": 0.8121, + "step": 30862 + }, + { + "epoch": 0.7924752977351108, + "grad_norm": 0.765625, + "learning_rate": 0.00011318225322999923, + "loss": 0.8067, + "step": 30863 + }, + { + "epoch": 0.7925009749310327, + "grad_norm": 0.79296875, + "learning_rate": 0.00011317782797278082, + "loss": 0.8227, + "step": 30864 + }, + { + "epoch": 0.7925266521269545, + "grad_norm": 0.859375, + "learning_rate": 0.0001131734026893002, + "loss": 0.8536, + "step": 30865 + }, + { + "epoch": 0.7925523293228763, + "grad_norm": 0.7890625, + "learning_rate": 0.0001131689773795662, + "loss": 0.7945, + "step": 30866 + }, + { + "epoch": 0.7925780065187981, + "grad_norm": 0.74609375, + "learning_rate": 0.0001131645520435876, + "loss": 0.801, + "step": 30867 + }, + { + "epoch": 0.7926036837147199, + "grad_norm": 0.75, + "learning_rate": 0.00011316012668137322, + "loss": 0.8203, + "step": 30868 + }, + { + "epoch": 0.7926293609106417, + "grad_norm": 0.73828125, + "learning_rate": 0.0001131557012929319, + "loss": 0.7758, + "step": 30869 + }, + { + "epoch": 0.7926550381065636, + "grad_norm": 0.84375, + "learning_rate": 0.00011315127587827246, + "loss": 0.8876, + "step": 30870 + }, + { + "epoch": 0.7926807153024854, + "grad_norm": 0.796875, + "learning_rate": 0.00011314685043740369, + "loss": 0.864, + "step": 30871 + }, + { + "epoch": 0.7927063924984072, + "grad_norm": 0.81640625, + "learning_rate": 0.0001131424249703345, + "loss": 0.7572, + "step": 30872 + }, + { + "epoch": 0.7927320696943291, + "grad_norm": 0.78125, + "learning_rate": 0.0001131379994770736, + "loss": 0.8422, + "step": 30873 + }, + { + "epoch": 0.7927577468902508, + "grad_norm": 0.7265625, + "learning_rate": 0.00011313357395762984, + "loss": 0.7321, + "step": 30874 + }, + { + "epoch": 0.7927834240861726, + "grad_norm": 0.76171875, + "learning_rate": 0.0001131291484120121, + "loss": 0.6491, + "step": 30875 + }, + { + "epoch": 0.7928091012820945, + "grad_norm": 0.8125, + "learning_rate": 0.0001131247228402291, + "loss": 0.8276, + "step": 30876 + }, + { + "epoch": 0.7928347784780163, + "grad_norm": 0.796875, + "learning_rate": 0.00011312029724228974, + "loss": 0.9032, + "step": 30877 + }, + { + "epoch": 0.7928604556739381, + "grad_norm": 0.73046875, + "learning_rate": 0.0001131158716182028, + "loss": 0.79, + "step": 30878 + }, + { + "epoch": 0.79288613286986, + "grad_norm": 0.74609375, + "learning_rate": 0.00011311144596797714, + "loss": 0.7147, + "step": 30879 + }, + { + "epoch": 0.7929118100657818, + "grad_norm": 0.828125, + "learning_rate": 0.00011310702029162155, + "loss": 0.7986, + "step": 30880 + }, + { + "epoch": 0.7929374872617035, + "grad_norm": 0.765625, + "learning_rate": 0.00011310259458914482, + "loss": 0.7748, + "step": 30881 + }, + { + "epoch": 0.7929631644576254, + "grad_norm": 0.80859375, + "learning_rate": 0.00011309816886055582, + "loss": 0.7675, + "step": 30882 + }, + { + "epoch": 0.7929888416535472, + "grad_norm": 0.859375, + "learning_rate": 0.00011309374310586337, + "loss": 0.8129, + "step": 30883 + }, + { + "epoch": 0.793014518849469, + "grad_norm": 0.734375, + "learning_rate": 0.00011308931732507624, + "loss": 0.8096, + "step": 30884 + }, + { + "epoch": 0.7930401960453909, + "grad_norm": 0.83203125, + "learning_rate": 0.00011308489151820331, + "loss": 0.8947, + "step": 30885 + }, + { + "epoch": 0.7930658732413127, + "grad_norm": 0.86328125, + "learning_rate": 0.00011308046568525336, + "loss": 0.7757, + "step": 30886 + }, + { + "epoch": 0.7930915504372344, + "grad_norm": 0.75, + "learning_rate": 0.00011307603982623522, + "loss": 0.7685, + "step": 30887 + }, + { + "epoch": 0.7931172276331563, + "grad_norm": 0.67578125, + "learning_rate": 0.00011307161394115773, + "loss": 0.7531, + "step": 30888 + }, + { + "epoch": 0.7931429048290781, + "grad_norm": 1.0078125, + "learning_rate": 0.00011306718803002966, + "loss": 0.8048, + "step": 30889 + }, + { + "epoch": 0.793168582025, + "grad_norm": 0.7890625, + "learning_rate": 0.00011306276209285991, + "loss": 0.7968, + "step": 30890 + }, + { + "epoch": 0.7931942592209218, + "grad_norm": 0.734375, + "learning_rate": 0.00011305833612965724, + "loss": 0.8244, + "step": 30891 + }, + { + "epoch": 0.7932199364168436, + "grad_norm": 0.765625, + "learning_rate": 0.00011305391014043047, + "loss": 0.7334, + "step": 30892 + }, + { + "epoch": 0.7932456136127655, + "grad_norm": 0.7109375, + "learning_rate": 0.00011304948412518847, + "loss": 0.7198, + "step": 30893 + }, + { + "epoch": 0.7932712908086872, + "grad_norm": 0.765625, + "learning_rate": 0.00011304505808393999, + "loss": 0.8259, + "step": 30894 + }, + { + "epoch": 0.793296968004609, + "grad_norm": 0.7578125, + "learning_rate": 0.0001130406320166939, + "loss": 0.9114, + "step": 30895 + }, + { + "epoch": 0.7933226452005309, + "grad_norm": 0.76953125, + "learning_rate": 0.00011303620592345901, + "loss": 0.666, + "step": 30896 + }, + { + "epoch": 0.7933483223964527, + "grad_norm": 0.765625, + "learning_rate": 0.00011303177980424414, + "loss": 0.7691, + "step": 30897 + }, + { + "epoch": 0.7933739995923745, + "grad_norm": 0.84765625, + "learning_rate": 0.0001130273536590581, + "loss": 0.8242, + "step": 30898 + }, + { + "epoch": 0.7933996767882964, + "grad_norm": 0.7265625, + "learning_rate": 0.00011302292748790975, + "loss": 0.7603, + "step": 30899 + }, + { + "epoch": 0.7934253539842182, + "grad_norm": 0.74609375, + "learning_rate": 0.00011301850129080785, + "loss": 0.8356, + "step": 30900 + }, + { + "epoch": 0.7934510311801399, + "grad_norm": 0.87890625, + "learning_rate": 0.00011301407506776128, + "loss": 0.9621, + "step": 30901 + }, + { + "epoch": 0.7934767083760618, + "grad_norm": 0.8203125, + "learning_rate": 0.00011300964881877878, + "loss": 0.8764, + "step": 30902 + }, + { + "epoch": 0.7935023855719836, + "grad_norm": 0.74609375, + "learning_rate": 0.00011300522254386927, + "loss": 0.7105, + "step": 30903 + }, + { + "epoch": 0.7935280627679054, + "grad_norm": 0.72265625, + "learning_rate": 0.00011300079624304152, + "loss": 0.7946, + "step": 30904 + }, + { + "epoch": 0.7935537399638273, + "grad_norm": 0.8515625, + "learning_rate": 0.00011299636991630436, + "loss": 0.8287, + "step": 30905 + }, + { + "epoch": 0.7935794171597491, + "grad_norm": 0.72265625, + "learning_rate": 0.00011299194356366663, + "loss": 0.818, + "step": 30906 + }, + { + "epoch": 0.7936050943556708, + "grad_norm": 0.78515625, + "learning_rate": 0.00011298751718513709, + "loss": 0.8384, + "step": 30907 + }, + { + "epoch": 0.7936307715515927, + "grad_norm": 0.82421875, + "learning_rate": 0.0001129830907807246, + "loss": 0.8108, + "step": 30908 + }, + { + "epoch": 0.7936564487475145, + "grad_norm": 0.765625, + "learning_rate": 0.000112978664350438, + "loss": 0.8183, + "step": 30909 + }, + { + "epoch": 0.7936821259434363, + "grad_norm": 0.7734375, + "learning_rate": 0.00011297423789428612, + "loss": 0.8717, + "step": 30910 + }, + { + "epoch": 0.7937078031393582, + "grad_norm": 0.8203125, + "learning_rate": 0.0001129698114122777, + "loss": 0.9536, + "step": 30911 + }, + { + "epoch": 0.79373348033528, + "grad_norm": 0.78125, + "learning_rate": 0.00011296538490442168, + "loss": 0.8002, + "step": 30912 + }, + { + "epoch": 0.7937591575312019, + "grad_norm": 0.8515625, + "learning_rate": 0.00011296095837072676, + "loss": 0.7869, + "step": 30913 + }, + { + "epoch": 0.7937848347271236, + "grad_norm": 0.76953125, + "learning_rate": 0.00011295653181120185, + "loss": 0.888, + "step": 30914 + }, + { + "epoch": 0.7938105119230454, + "grad_norm": 0.7421875, + "learning_rate": 0.00011295210522585575, + "loss": 0.7124, + "step": 30915 + }, + { + "epoch": 0.7938361891189673, + "grad_norm": 0.72265625, + "learning_rate": 0.00011294767861469723, + "loss": 0.7962, + "step": 30916 + }, + { + "epoch": 0.7938618663148891, + "grad_norm": 0.71875, + "learning_rate": 0.0001129432519777352, + "loss": 0.7579, + "step": 30917 + }, + { + "epoch": 0.7938875435108109, + "grad_norm": 0.7265625, + "learning_rate": 0.00011293882531497844, + "loss": 0.6848, + "step": 30918 + }, + { + "epoch": 0.7939132207067328, + "grad_norm": 0.69921875, + "learning_rate": 0.00011293439862643577, + "loss": 0.7578, + "step": 30919 + }, + { + "epoch": 0.7939388979026546, + "grad_norm": 0.84765625, + "learning_rate": 0.000112929971912116, + "loss": 0.8039, + "step": 30920 + }, + { + "epoch": 0.7939645750985763, + "grad_norm": 0.796875, + "learning_rate": 0.00011292554517202796, + "loss": 0.7739, + "step": 30921 + }, + { + "epoch": 0.7939902522944982, + "grad_norm": 0.8125, + "learning_rate": 0.00011292111840618048, + "loss": 0.76, + "step": 30922 + }, + { + "epoch": 0.79401592949042, + "grad_norm": 0.73828125, + "learning_rate": 0.00011291669161458238, + "loss": 0.7876, + "step": 30923 + }, + { + "epoch": 0.7940416066863418, + "grad_norm": 0.765625, + "learning_rate": 0.00011291226479724246, + "loss": 0.7586, + "step": 30924 + }, + { + "epoch": 0.7940672838822637, + "grad_norm": 0.78125, + "learning_rate": 0.00011290783795416959, + "loss": 0.8675, + "step": 30925 + }, + { + "epoch": 0.7940929610781855, + "grad_norm": 0.7734375, + "learning_rate": 0.00011290341108537254, + "loss": 0.802, + "step": 30926 + }, + { + "epoch": 0.7941186382741072, + "grad_norm": 0.82421875, + "learning_rate": 0.00011289898419086019, + "loss": 0.7226, + "step": 30927 + }, + { + "epoch": 0.7941443154700291, + "grad_norm": 0.75390625, + "learning_rate": 0.00011289455727064132, + "loss": 0.7795, + "step": 30928 + }, + { + "epoch": 0.7941699926659509, + "grad_norm": 0.83203125, + "learning_rate": 0.00011289013032472472, + "loss": 0.8987, + "step": 30929 + }, + { + "epoch": 0.7941956698618727, + "grad_norm": 0.76953125, + "learning_rate": 0.0001128857033531193, + "loss": 0.9149, + "step": 30930 + }, + { + "epoch": 0.7942213470577946, + "grad_norm": 0.8359375, + "learning_rate": 0.00011288127635583383, + "loss": 0.8947, + "step": 30931 + }, + { + "epoch": 0.7942470242537164, + "grad_norm": 0.78515625, + "learning_rate": 0.00011287684933287715, + "loss": 0.8175, + "step": 30932 + }, + { + "epoch": 0.7942727014496382, + "grad_norm": 0.75, + "learning_rate": 0.00011287242228425804, + "loss": 0.7157, + "step": 30933 + }, + { + "epoch": 0.79429837864556, + "grad_norm": 0.81640625, + "learning_rate": 0.00011286799520998536, + "loss": 0.7839, + "step": 30934 + }, + { + "epoch": 0.7943240558414818, + "grad_norm": 0.80078125, + "learning_rate": 0.00011286356811006797, + "loss": 1.0121, + "step": 30935 + }, + { + "epoch": 0.7943497330374036, + "grad_norm": 0.76171875, + "learning_rate": 0.00011285914098451462, + "loss": 0.7743, + "step": 30936 + }, + { + "epoch": 0.7943754102333255, + "grad_norm": 0.73046875, + "learning_rate": 0.00011285471383333415, + "loss": 0.8771, + "step": 30937 + }, + { + "epoch": 0.7944010874292473, + "grad_norm": 0.8671875, + "learning_rate": 0.00011285028665653543, + "loss": 0.8089, + "step": 30938 + }, + { + "epoch": 0.7944267646251691, + "grad_norm": 0.78125, + "learning_rate": 0.00011284585945412725, + "loss": 0.8696, + "step": 30939 + }, + { + "epoch": 0.794452441821091, + "grad_norm": 0.7578125, + "learning_rate": 0.00011284143222611838, + "loss": 0.8459, + "step": 30940 + }, + { + "epoch": 0.7944781190170127, + "grad_norm": 0.78515625, + "learning_rate": 0.00011283700497251775, + "loss": 0.8382, + "step": 30941 + }, + { + "epoch": 0.7945037962129345, + "grad_norm": 0.77734375, + "learning_rate": 0.00011283257769333408, + "loss": 0.8328, + "step": 30942 + }, + { + "epoch": 0.7945294734088564, + "grad_norm": 0.7421875, + "learning_rate": 0.00011282815038857628, + "loss": 0.8108, + "step": 30943 + }, + { + "epoch": 0.7945551506047782, + "grad_norm": 0.8046875, + "learning_rate": 0.00011282372305825312, + "loss": 0.8564, + "step": 30944 + }, + { + "epoch": 0.7945808278007, + "grad_norm": 0.7734375, + "learning_rate": 0.00011281929570237347, + "loss": 0.9079, + "step": 30945 + }, + { + "epoch": 0.7946065049966219, + "grad_norm": 0.828125, + "learning_rate": 0.0001128148683209461, + "loss": 0.9022, + "step": 30946 + }, + { + "epoch": 0.7946321821925436, + "grad_norm": 0.7578125, + "learning_rate": 0.00011281044091397986, + "loss": 0.8215, + "step": 30947 + }, + { + "epoch": 0.7946578593884654, + "grad_norm": 0.8046875, + "learning_rate": 0.00011280601348148354, + "loss": 0.9121, + "step": 30948 + }, + { + "epoch": 0.7946835365843873, + "grad_norm": 0.8203125, + "learning_rate": 0.000112801586023466, + "loss": 0.9243, + "step": 30949 + }, + { + "epoch": 0.7947092137803091, + "grad_norm": 0.75, + "learning_rate": 0.00011279715853993606, + "loss": 0.7874, + "step": 30950 + }, + { + "epoch": 0.794734890976231, + "grad_norm": 0.734375, + "learning_rate": 0.00011279273103090257, + "loss": 0.7456, + "step": 30951 + }, + { + "epoch": 0.7947605681721528, + "grad_norm": 0.67578125, + "learning_rate": 0.0001127883034963743, + "loss": 0.6913, + "step": 30952 + }, + { + "epoch": 0.7947862453680746, + "grad_norm": 0.79296875, + "learning_rate": 0.00011278387593636009, + "loss": 0.7981, + "step": 30953 + }, + { + "epoch": 0.7948119225639964, + "grad_norm": 0.80859375, + "learning_rate": 0.00011277944835086876, + "loss": 0.8466, + "step": 30954 + }, + { + "epoch": 0.7948375997599182, + "grad_norm": 0.84765625, + "learning_rate": 0.00011277502073990917, + "loss": 0.8828, + "step": 30955 + }, + { + "epoch": 0.79486327695584, + "grad_norm": 0.734375, + "learning_rate": 0.0001127705931034901, + "loss": 0.741, + "step": 30956 + }, + { + "epoch": 0.7948889541517619, + "grad_norm": 0.75390625, + "learning_rate": 0.00011276616544162038, + "loss": 0.8088, + "step": 30957 + }, + { + "epoch": 0.7949146313476837, + "grad_norm": 0.765625, + "learning_rate": 0.00011276173775430887, + "loss": 0.7693, + "step": 30958 + }, + { + "epoch": 0.7949403085436055, + "grad_norm": 0.77734375, + "learning_rate": 0.00011275731004156438, + "loss": 0.9144, + "step": 30959 + }, + { + "epoch": 0.7949659857395274, + "grad_norm": 0.703125, + "learning_rate": 0.00011275288230339569, + "loss": 0.649, + "step": 30960 + }, + { + "epoch": 0.7949916629354491, + "grad_norm": 0.74609375, + "learning_rate": 0.00011274845453981167, + "loss": 0.8274, + "step": 30961 + }, + { + "epoch": 0.7950173401313709, + "grad_norm": 0.7890625, + "learning_rate": 0.00011274402675082114, + "loss": 0.8369, + "step": 30962 + }, + { + "epoch": 0.7950430173272928, + "grad_norm": 0.890625, + "learning_rate": 0.00011273959893643291, + "loss": 0.9019, + "step": 30963 + }, + { + "epoch": 0.7950686945232146, + "grad_norm": 0.859375, + "learning_rate": 0.00011273517109665582, + "loss": 0.7985, + "step": 30964 + }, + { + "epoch": 0.7950943717191364, + "grad_norm": 0.78515625, + "learning_rate": 0.00011273074323149868, + "loss": 0.7809, + "step": 30965 + }, + { + "epoch": 0.7951200489150583, + "grad_norm": 0.7421875, + "learning_rate": 0.00011272631534097029, + "loss": 0.7422, + "step": 30966 + }, + { + "epoch": 0.79514572611098, + "grad_norm": 0.81640625, + "learning_rate": 0.00011272188742507954, + "loss": 0.9021, + "step": 30967 + }, + { + "epoch": 0.7951714033069018, + "grad_norm": 0.74609375, + "learning_rate": 0.0001127174594838352, + "loss": 0.7594, + "step": 30968 + }, + { + "epoch": 0.7951970805028237, + "grad_norm": 0.72265625, + "learning_rate": 0.0001127130315172461, + "loss": 0.6265, + "step": 30969 + }, + { + "epoch": 0.7952227576987455, + "grad_norm": 0.69921875, + "learning_rate": 0.00011270860352532109, + "loss": 0.8132, + "step": 30970 + }, + { + "epoch": 0.7952484348946673, + "grad_norm": 0.81640625, + "learning_rate": 0.000112704175508069, + "loss": 0.8058, + "step": 30971 + }, + { + "epoch": 0.7952741120905892, + "grad_norm": 0.78125, + "learning_rate": 0.00011269974746549863, + "loss": 0.7837, + "step": 30972 + }, + { + "epoch": 0.795299789286511, + "grad_norm": 0.8125, + "learning_rate": 0.00011269531939761879, + "loss": 0.9555, + "step": 30973 + }, + { + "epoch": 0.7953254664824327, + "grad_norm": 0.80859375, + "learning_rate": 0.00011269089130443833, + "loss": 0.7276, + "step": 30974 + }, + { + "epoch": 0.7953511436783546, + "grad_norm": 0.80078125, + "learning_rate": 0.00011268646318596607, + "loss": 0.7846, + "step": 30975 + }, + { + "epoch": 0.7953768208742764, + "grad_norm": 0.80859375, + "learning_rate": 0.00011268203504221085, + "loss": 0.7668, + "step": 30976 + }, + { + "epoch": 0.7954024980701982, + "grad_norm": 0.8046875, + "learning_rate": 0.00011267760687318147, + "loss": 0.8198, + "step": 30977 + }, + { + "epoch": 0.7954281752661201, + "grad_norm": 0.77734375, + "learning_rate": 0.00011267317867888679, + "loss": 0.8105, + "step": 30978 + }, + { + "epoch": 0.7954538524620419, + "grad_norm": 0.78515625, + "learning_rate": 0.00011266875045933556, + "loss": 0.8145, + "step": 30979 + }, + { + "epoch": 0.7954795296579638, + "grad_norm": 0.88671875, + "learning_rate": 0.00011266432221453667, + "loss": 0.7344, + "step": 30980 + }, + { + "epoch": 0.7955052068538855, + "grad_norm": 0.74609375, + "learning_rate": 0.00011265989394449894, + "loss": 0.7383, + "step": 30981 + }, + { + "epoch": 0.7955308840498073, + "grad_norm": 0.83984375, + "learning_rate": 0.00011265546564923117, + "loss": 0.834, + "step": 30982 + }, + { + "epoch": 0.7955565612457292, + "grad_norm": 0.8671875, + "learning_rate": 0.00011265103732874223, + "loss": 0.8106, + "step": 30983 + }, + { + "epoch": 0.795582238441651, + "grad_norm": 0.8046875, + "learning_rate": 0.00011264660898304091, + "loss": 0.8898, + "step": 30984 + }, + { + "epoch": 0.7956079156375728, + "grad_norm": 0.76953125, + "learning_rate": 0.00011264218061213603, + "loss": 0.8565, + "step": 30985 + }, + { + "epoch": 0.7956335928334947, + "grad_norm": 0.703125, + "learning_rate": 0.00011263775221603643, + "loss": 0.7075, + "step": 30986 + }, + { + "epoch": 0.7956592700294164, + "grad_norm": 0.8828125, + "learning_rate": 0.00011263332379475092, + "loss": 0.8038, + "step": 30987 + }, + { + "epoch": 0.7956849472253382, + "grad_norm": 0.80078125, + "learning_rate": 0.00011262889534828833, + "loss": 0.8391, + "step": 30988 + }, + { + "epoch": 0.7957106244212601, + "grad_norm": 0.8125, + "learning_rate": 0.0001126244668766575, + "loss": 0.8589, + "step": 30989 + }, + { + "epoch": 0.7957363016171819, + "grad_norm": 0.73828125, + "learning_rate": 0.00011262003837986725, + "loss": 0.8061, + "step": 30990 + }, + { + "epoch": 0.7957619788131037, + "grad_norm": 0.8515625, + "learning_rate": 0.00011261560985792643, + "loss": 0.8282, + "step": 30991 + }, + { + "epoch": 0.7957876560090256, + "grad_norm": 0.86328125, + "learning_rate": 0.00011261118131084383, + "loss": 0.8691, + "step": 30992 + }, + { + "epoch": 0.7958133332049474, + "grad_norm": 0.78515625, + "learning_rate": 0.00011260675273862824, + "loss": 0.7615, + "step": 30993 + }, + { + "epoch": 0.7958390104008691, + "grad_norm": 0.7421875, + "learning_rate": 0.00011260232414128858, + "loss": 0.8982, + "step": 30994 + }, + { + "epoch": 0.795864687596791, + "grad_norm": 0.81640625, + "learning_rate": 0.0001125978955188336, + "loss": 0.8538, + "step": 30995 + }, + { + "epoch": 0.7958903647927128, + "grad_norm": 0.84765625, + "learning_rate": 0.00011259346687127215, + "loss": 0.7223, + "step": 30996 + }, + { + "epoch": 0.7959160419886346, + "grad_norm": 0.8203125, + "learning_rate": 0.00011258903819861308, + "loss": 0.9238, + "step": 30997 + }, + { + "epoch": 0.7959417191845565, + "grad_norm": 0.7578125, + "learning_rate": 0.00011258460950086518, + "loss": 0.7115, + "step": 30998 + }, + { + "epoch": 0.7959673963804783, + "grad_norm": 0.8125, + "learning_rate": 0.0001125801807780373, + "loss": 0.878, + "step": 30999 + }, + { + "epoch": 0.7959930735764, + "grad_norm": 0.7421875, + "learning_rate": 0.00011257575203013826, + "loss": 0.7818, + "step": 31000 + }, + { + "epoch": 0.7959930735764, + "eval_loss": 0.8121166229248047, + "eval_runtime": 386.01, + "eval_samples_per_second": 25.906, + "eval_steps_per_second": 0.811, + "step": 31000 + }, + { + "epoch": 0.7960187507723219, + "grad_norm": 0.74609375, + "learning_rate": 0.00011257132325717685, + "loss": 0.8611, + "step": 31001 + }, + { + "epoch": 0.7960444279682437, + "grad_norm": 0.8671875, + "learning_rate": 0.00011256689445916195, + "loss": 0.7293, + "step": 31002 + }, + { + "epoch": 0.7960701051641655, + "grad_norm": 0.79296875, + "learning_rate": 0.00011256246563610237, + "loss": 0.7429, + "step": 31003 + }, + { + "epoch": 0.7960957823600874, + "grad_norm": 0.78125, + "learning_rate": 0.00011255803678800694, + "loss": 0.707, + "step": 31004 + }, + { + "epoch": 0.7961214595560092, + "grad_norm": 0.67578125, + "learning_rate": 0.00011255360791488445, + "loss": 0.73, + "step": 31005 + }, + { + "epoch": 0.796147136751931, + "grad_norm": 0.78515625, + "learning_rate": 0.00011254917901674375, + "loss": 0.9402, + "step": 31006 + }, + { + "epoch": 0.7961728139478528, + "grad_norm": 0.8203125, + "learning_rate": 0.00011254475009359368, + "loss": 0.9244, + "step": 31007 + }, + { + "epoch": 0.7961984911437746, + "grad_norm": 0.77734375, + "learning_rate": 0.00011254032114544309, + "loss": 0.8564, + "step": 31008 + }, + { + "epoch": 0.7962241683396964, + "grad_norm": 0.75, + "learning_rate": 0.00011253589217230071, + "loss": 0.6669, + "step": 31009 + }, + { + "epoch": 0.7962498455356183, + "grad_norm": 0.75390625, + "learning_rate": 0.00011253146317417548, + "loss": 0.8773, + "step": 31010 + }, + { + "epoch": 0.7962755227315401, + "grad_norm": 0.9609375, + "learning_rate": 0.00011252703415107615, + "loss": 0.8266, + "step": 31011 + }, + { + "epoch": 0.796301199927462, + "grad_norm": 0.8359375, + "learning_rate": 0.00011252260510301158, + "loss": 0.8239, + "step": 31012 + }, + { + "epoch": 0.7963268771233838, + "grad_norm": 0.8515625, + "learning_rate": 0.00011251817602999061, + "loss": 0.7589, + "step": 31013 + }, + { + "epoch": 0.7963525543193055, + "grad_norm": 0.765625, + "learning_rate": 0.00011251374693202201, + "loss": 0.828, + "step": 31014 + }, + { + "epoch": 0.7963782315152274, + "grad_norm": 0.734375, + "learning_rate": 0.00011250931780911467, + "loss": 0.8063, + "step": 31015 + }, + { + "epoch": 0.7964039087111492, + "grad_norm": 0.8828125, + "learning_rate": 0.00011250488866127737, + "loss": 0.8652, + "step": 31016 + }, + { + "epoch": 0.796429585907071, + "grad_norm": 0.8046875, + "learning_rate": 0.00011250045948851897, + "loss": 0.7832, + "step": 31017 + }, + { + "epoch": 0.7964552631029929, + "grad_norm": 0.78125, + "learning_rate": 0.0001124960302908483, + "loss": 0.8997, + "step": 31018 + }, + { + "epoch": 0.7964809402989147, + "grad_norm": 0.76171875, + "learning_rate": 0.00011249160106827412, + "loss": 0.7589, + "step": 31019 + }, + { + "epoch": 0.7965066174948364, + "grad_norm": 0.83203125, + "learning_rate": 0.00011248717182080535, + "loss": 0.7797, + "step": 31020 + }, + { + "epoch": 0.7965322946907583, + "grad_norm": 0.80859375, + "learning_rate": 0.00011248274254845074, + "loss": 0.7523, + "step": 31021 + }, + { + "epoch": 0.7965579718866801, + "grad_norm": 0.88671875, + "learning_rate": 0.00011247831325121918, + "loss": 0.8527, + "step": 31022 + }, + { + "epoch": 0.7965836490826019, + "grad_norm": 0.80859375, + "learning_rate": 0.00011247388392911948, + "loss": 0.7541, + "step": 31023 + }, + { + "epoch": 0.7966093262785238, + "grad_norm": 0.8046875, + "learning_rate": 0.00011246945458216044, + "loss": 0.8994, + "step": 31024 + }, + { + "epoch": 0.7966350034744456, + "grad_norm": 0.7890625, + "learning_rate": 0.0001124650252103509, + "loss": 0.7947, + "step": 31025 + }, + { + "epoch": 0.7966606806703674, + "grad_norm": 0.80859375, + "learning_rate": 0.00011246059581369968, + "loss": 0.8625, + "step": 31026 + }, + { + "epoch": 0.7966863578662892, + "grad_norm": 0.75390625, + "learning_rate": 0.00011245616639221561, + "loss": 0.8258, + "step": 31027 + }, + { + "epoch": 0.796712035062211, + "grad_norm": 0.75390625, + "learning_rate": 0.00011245173694590756, + "loss": 0.7992, + "step": 31028 + }, + { + "epoch": 0.7967377122581328, + "grad_norm": 0.8046875, + "learning_rate": 0.00011244730747478431, + "loss": 0.8627, + "step": 31029 + }, + { + "epoch": 0.7967633894540547, + "grad_norm": 0.8359375, + "learning_rate": 0.0001124428779788547, + "loss": 0.7532, + "step": 31030 + }, + { + "epoch": 0.7967890666499765, + "grad_norm": 0.80078125, + "learning_rate": 0.00011243844845812756, + "loss": 0.9, + "step": 31031 + }, + { + "epoch": 0.7968147438458983, + "grad_norm": 0.8125, + "learning_rate": 0.00011243401891261171, + "loss": 0.8468, + "step": 31032 + }, + { + "epoch": 0.7968404210418202, + "grad_norm": 0.81640625, + "learning_rate": 0.00011242958934231597, + "loss": 0.8713, + "step": 31033 + }, + { + "epoch": 0.7968660982377419, + "grad_norm": 0.75, + "learning_rate": 0.00011242515974724921, + "loss": 0.7309, + "step": 31034 + }, + { + "epoch": 0.7968917754336637, + "grad_norm": 0.82421875, + "learning_rate": 0.00011242073012742023, + "loss": 0.8453, + "step": 31035 + }, + { + "epoch": 0.7969174526295856, + "grad_norm": 0.77734375, + "learning_rate": 0.00011241630048283782, + "loss": 0.7868, + "step": 31036 + }, + { + "epoch": 0.7969431298255074, + "grad_norm": 0.81640625, + "learning_rate": 0.0001124118708135109, + "loss": 0.8314, + "step": 31037 + }, + { + "epoch": 0.7969688070214292, + "grad_norm": 0.8515625, + "learning_rate": 0.00011240744111944819, + "loss": 0.9473, + "step": 31038 + }, + { + "epoch": 0.7969944842173511, + "grad_norm": 0.7890625, + "learning_rate": 0.00011240301140065859, + "loss": 0.8876, + "step": 31039 + }, + { + "epoch": 0.7970201614132728, + "grad_norm": 0.78125, + "learning_rate": 0.00011239858165715091, + "loss": 0.8108, + "step": 31040 + }, + { + "epoch": 0.7970458386091946, + "grad_norm": 0.8203125, + "learning_rate": 0.00011239415188893396, + "loss": 0.6794, + "step": 31041 + }, + { + "epoch": 0.7970715158051165, + "grad_norm": 0.83984375, + "learning_rate": 0.00011238972209601662, + "loss": 0.8224, + "step": 31042 + }, + { + "epoch": 0.7970971930010383, + "grad_norm": 0.75, + "learning_rate": 0.00011238529227840766, + "loss": 0.7785, + "step": 31043 + }, + { + "epoch": 0.7971228701969602, + "grad_norm": 0.8125, + "learning_rate": 0.00011238086243611594, + "loss": 1.0446, + "step": 31044 + }, + { + "epoch": 0.797148547392882, + "grad_norm": 0.78125, + "learning_rate": 0.0001123764325691503, + "loss": 0.8809, + "step": 31045 + }, + { + "epoch": 0.7971742245888038, + "grad_norm": 0.74609375, + "learning_rate": 0.00011237200267751949, + "loss": 0.8473, + "step": 31046 + }, + { + "epoch": 0.7971999017847256, + "grad_norm": 0.83203125, + "learning_rate": 0.00011236757276123245, + "loss": 0.8369, + "step": 31047 + }, + { + "epoch": 0.7972255789806474, + "grad_norm": 0.80078125, + "learning_rate": 0.00011236314282029791, + "loss": 0.8157, + "step": 31048 + }, + { + "epoch": 0.7972512561765692, + "grad_norm": 0.81640625, + "learning_rate": 0.00011235871285472476, + "loss": 0.7902, + "step": 31049 + }, + { + "epoch": 0.7972769333724911, + "grad_norm": 0.76953125, + "learning_rate": 0.00011235428286452186, + "loss": 0.8048, + "step": 31050 + }, + { + "epoch": 0.7973026105684129, + "grad_norm": 0.78515625, + "learning_rate": 0.00011234985284969792, + "loss": 0.8113, + "step": 31051 + }, + { + "epoch": 0.7973282877643347, + "grad_norm": 0.79296875, + "learning_rate": 0.00011234542281026188, + "loss": 0.7016, + "step": 31052 + }, + { + "epoch": 0.7973539649602566, + "grad_norm": 0.859375, + "learning_rate": 0.00011234099274622252, + "loss": 0.7497, + "step": 31053 + }, + { + "epoch": 0.7973796421561783, + "grad_norm": 0.7734375, + "learning_rate": 0.00011233656265758866, + "loss": 0.807, + "step": 31054 + }, + { + "epoch": 0.7974053193521001, + "grad_norm": 0.78515625, + "learning_rate": 0.00011233213254436914, + "loss": 0.8559, + "step": 31055 + }, + { + "epoch": 0.797430996548022, + "grad_norm": 0.67578125, + "learning_rate": 0.00011232770240657282, + "loss": 0.7079, + "step": 31056 + }, + { + "epoch": 0.7974566737439438, + "grad_norm": 0.7734375, + "learning_rate": 0.00011232327224420852, + "loss": 0.962, + "step": 31057 + }, + { + "epoch": 0.7974823509398656, + "grad_norm": 0.78515625, + "learning_rate": 0.000112318842057285, + "loss": 0.7749, + "step": 31058 + }, + { + "epoch": 0.7975080281357875, + "grad_norm": 0.76171875, + "learning_rate": 0.00011231441184581115, + "loss": 0.7731, + "step": 31059 + }, + { + "epoch": 0.7975337053317092, + "grad_norm": 0.76171875, + "learning_rate": 0.00011230998160979582, + "loss": 0.8015, + "step": 31060 + }, + { + "epoch": 0.797559382527631, + "grad_norm": 0.79296875, + "learning_rate": 0.00011230555134924778, + "loss": 0.9038, + "step": 31061 + }, + { + "epoch": 0.7975850597235529, + "grad_norm": 0.765625, + "learning_rate": 0.00011230112106417589, + "loss": 0.739, + "step": 31062 + }, + { + "epoch": 0.7976107369194747, + "grad_norm": 0.8203125, + "learning_rate": 0.00011229669075458902, + "loss": 0.8179, + "step": 31063 + }, + { + "epoch": 0.7976364141153965, + "grad_norm": 0.765625, + "learning_rate": 0.00011229226042049589, + "loss": 0.9568, + "step": 31064 + }, + { + "epoch": 0.7976620913113184, + "grad_norm": 0.7578125, + "learning_rate": 0.00011228783006190546, + "loss": 0.7365, + "step": 31065 + }, + { + "epoch": 0.7976877685072402, + "grad_norm": 0.796875, + "learning_rate": 0.00011228339967882646, + "loss": 0.8387, + "step": 31066 + }, + { + "epoch": 0.7977134457031619, + "grad_norm": 0.78125, + "learning_rate": 0.00011227896927126775, + "loss": 0.8992, + "step": 31067 + }, + { + "epoch": 0.7977391228990838, + "grad_norm": 0.7890625, + "learning_rate": 0.00011227453883923818, + "loss": 0.7254, + "step": 31068 + }, + { + "epoch": 0.7977648000950056, + "grad_norm": 0.71875, + "learning_rate": 0.00011227010838274654, + "loss": 0.815, + "step": 31069 + }, + { + "epoch": 0.7977904772909274, + "grad_norm": 0.796875, + "learning_rate": 0.00011226567790180172, + "loss": 0.7121, + "step": 31070 + }, + { + "epoch": 0.7978161544868493, + "grad_norm": 0.83984375, + "learning_rate": 0.00011226124739641248, + "loss": 0.7386, + "step": 31071 + }, + { + "epoch": 0.7978418316827711, + "grad_norm": 0.82421875, + "learning_rate": 0.0001122568168665877, + "loss": 0.8107, + "step": 31072 + }, + { + "epoch": 0.797867508878693, + "grad_norm": 0.69921875, + "learning_rate": 0.00011225238631233616, + "loss": 0.7919, + "step": 31073 + }, + { + "epoch": 0.7978931860746147, + "grad_norm": 0.796875, + "learning_rate": 0.00011224795573366675, + "loss": 0.8033, + "step": 31074 + }, + { + "epoch": 0.7979188632705365, + "grad_norm": 0.69921875, + "learning_rate": 0.00011224352513058827, + "loss": 0.8154, + "step": 31075 + }, + { + "epoch": 0.7979445404664584, + "grad_norm": 0.87890625, + "learning_rate": 0.00011223909450310954, + "loss": 0.8552, + "step": 31076 + }, + { + "epoch": 0.7979702176623802, + "grad_norm": 0.77734375, + "learning_rate": 0.00011223466385123942, + "loss": 0.9105, + "step": 31077 + }, + { + "epoch": 0.797995894858302, + "grad_norm": 0.77734375, + "learning_rate": 0.00011223023317498667, + "loss": 0.8027, + "step": 31078 + }, + { + "epoch": 0.7980215720542239, + "grad_norm": 0.80859375, + "learning_rate": 0.00011222580247436021, + "loss": 0.8348, + "step": 31079 + }, + { + "epoch": 0.7980472492501456, + "grad_norm": 0.82421875, + "learning_rate": 0.00011222137174936882, + "loss": 0.8909, + "step": 31080 + }, + { + "epoch": 0.7980729264460674, + "grad_norm": 0.8671875, + "learning_rate": 0.00011221694100002134, + "loss": 0.828, + "step": 31081 + }, + { + "epoch": 0.7980986036419893, + "grad_norm": 0.76953125, + "learning_rate": 0.00011221251022632659, + "loss": 0.7743, + "step": 31082 + }, + { + "epoch": 0.7981242808379111, + "grad_norm": 0.78515625, + "learning_rate": 0.00011220807942829343, + "loss": 0.7755, + "step": 31083 + }, + { + "epoch": 0.7981499580338329, + "grad_norm": 0.8359375, + "learning_rate": 0.00011220364860593069, + "loss": 0.9157, + "step": 31084 + }, + { + "epoch": 0.7981756352297548, + "grad_norm": 0.79296875, + "learning_rate": 0.00011219921775924716, + "loss": 0.8558, + "step": 31085 + }, + { + "epoch": 0.7982013124256766, + "grad_norm": 0.84375, + "learning_rate": 0.00011219478688825166, + "loss": 0.6581, + "step": 31086 + }, + { + "epoch": 0.7982269896215983, + "grad_norm": 0.80859375, + "learning_rate": 0.00011219035599295309, + "loss": 0.9517, + "step": 31087 + }, + { + "epoch": 0.7982526668175202, + "grad_norm": 0.71875, + "learning_rate": 0.00011218592507336024, + "loss": 0.8381, + "step": 31088 + }, + { + "epoch": 0.798278344013442, + "grad_norm": 0.8359375, + "learning_rate": 0.00011218149412948191, + "loss": 0.757, + "step": 31089 + }, + { + "epoch": 0.7983040212093638, + "grad_norm": 0.734375, + "learning_rate": 0.00011217706316132699, + "loss": 0.6842, + "step": 31090 + }, + { + "epoch": 0.7983296984052857, + "grad_norm": 0.82421875, + "learning_rate": 0.00011217263216890426, + "loss": 0.8365, + "step": 31091 + }, + { + "epoch": 0.7983553756012075, + "grad_norm": 0.77734375, + "learning_rate": 0.00011216820115222259, + "loss": 0.8519, + "step": 31092 + }, + { + "epoch": 0.7983810527971293, + "grad_norm": 0.74609375, + "learning_rate": 0.0001121637701112908, + "loss": 0.8127, + "step": 31093 + }, + { + "epoch": 0.7984067299930511, + "grad_norm": 0.83203125, + "learning_rate": 0.0001121593390461177, + "loss": 1.0024, + "step": 31094 + }, + { + "epoch": 0.7984324071889729, + "grad_norm": 0.76953125, + "learning_rate": 0.00011215490795671215, + "loss": 0.8303, + "step": 31095 + }, + { + "epoch": 0.7984580843848947, + "grad_norm": 0.765625, + "learning_rate": 0.00011215047684308298, + "loss": 0.733, + "step": 31096 + }, + { + "epoch": 0.7984837615808166, + "grad_norm": 0.80078125, + "learning_rate": 0.00011214604570523898, + "loss": 0.8497, + "step": 31097 + }, + { + "epoch": 0.7985094387767384, + "grad_norm": 0.765625, + "learning_rate": 0.00011214161454318901, + "loss": 0.7135, + "step": 31098 + }, + { + "epoch": 0.7985351159726602, + "grad_norm": 0.75, + "learning_rate": 0.0001121371833569419, + "loss": 0.791, + "step": 31099 + }, + { + "epoch": 0.798560793168582, + "grad_norm": 0.7734375, + "learning_rate": 0.0001121327521465065, + "loss": 0.8302, + "step": 31100 + }, + { + "epoch": 0.7985864703645038, + "grad_norm": 0.88671875, + "learning_rate": 0.0001121283209118916, + "loss": 0.8822, + "step": 31101 + }, + { + "epoch": 0.7986121475604256, + "grad_norm": 0.8203125, + "learning_rate": 0.00011212388965310605, + "loss": 0.8125, + "step": 31102 + }, + { + "epoch": 0.7986378247563475, + "grad_norm": 0.8671875, + "learning_rate": 0.00011211945837015871, + "loss": 0.7778, + "step": 31103 + }, + { + "epoch": 0.7986635019522693, + "grad_norm": 0.90625, + "learning_rate": 0.00011211502706305834, + "loss": 0.8869, + "step": 31104 + }, + { + "epoch": 0.7986891791481912, + "grad_norm": 0.71484375, + "learning_rate": 0.00011211059573181386, + "loss": 0.7724, + "step": 31105 + }, + { + "epoch": 0.798714856344113, + "grad_norm": 0.75, + "learning_rate": 0.00011210616437643404, + "loss": 0.7969, + "step": 31106 + }, + { + "epoch": 0.7987405335400347, + "grad_norm": 0.76171875, + "learning_rate": 0.0001121017329969277, + "loss": 0.8261, + "step": 31107 + }, + { + "epoch": 0.7987662107359565, + "grad_norm": 0.8359375, + "learning_rate": 0.00011209730159330374, + "loss": 0.918, + "step": 31108 + }, + { + "epoch": 0.7987918879318784, + "grad_norm": 0.80859375, + "learning_rate": 0.00011209287016557096, + "loss": 0.6913, + "step": 31109 + }, + { + "epoch": 0.7988175651278002, + "grad_norm": 0.765625, + "learning_rate": 0.00011208843871373814, + "loss": 0.7682, + "step": 31110 + }, + { + "epoch": 0.7988432423237221, + "grad_norm": 0.80078125, + "learning_rate": 0.00011208400723781415, + "loss": 0.8254, + "step": 31111 + }, + { + "epoch": 0.7988689195196439, + "grad_norm": 0.84375, + "learning_rate": 0.00011207957573780787, + "loss": 0.8294, + "step": 31112 + }, + { + "epoch": 0.7988945967155657, + "grad_norm": 0.8046875, + "learning_rate": 0.00011207514421372805, + "loss": 0.8673, + "step": 31113 + }, + { + "epoch": 0.7989202739114875, + "grad_norm": 0.93359375, + "learning_rate": 0.00011207071266558357, + "loss": 0.83, + "step": 31114 + }, + { + "epoch": 0.7989459511074093, + "grad_norm": 0.71875, + "learning_rate": 0.00011206628109338324, + "loss": 0.8009, + "step": 31115 + }, + { + "epoch": 0.7989716283033311, + "grad_norm": 0.84765625, + "learning_rate": 0.00011206184949713593, + "loss": 0.8058, + "step": 31116 + }, + { + "epoch": 0.798997305499253, + "grad_norm": 0.79296875, + "learning_rate": 0.00011205741787685045, + "loss": 0.834, + "step": 31117 + }, + { + "epoch": 0.7990229826951748, + "grad_norm": 0.75, + "learning_rate": 0.00011205298623253557, + "loss": 0.7256, + "step": 31118 + }, + { + "epoch": 0.7990486598910966, + "grad_norm": 0.80078125, + "learning_rate": 0.00011204855456420022, + "loss": 0.8019, + "step": 31119 + }, + { + "epoch": 0.7990743370870184, + "grad_norm": 0.79296875, + "learning_rate": 0.00011204412287185317, + "loss": 0.8656, + "step": 31120 + }, + { + "epoch": 0.7991000142829402, + "grad_norm": 0.80078125, + "learning_rate": 0.00011203969115550327, + "loss": 0.7432, + "step": 31121 + }, + { + "epoch": 0.799125691478862, + "grad_norm": 0.8125, + "learning_rate": 0.0001120352594151594, + "loss": 0.7652, + "step": 31122 + }, + { + "epoch": 0.7991513686747839, + "grad_norm": 0.7890625, + "learning_rate": 0.00011203082765083029, + "loss": 0.7818, + "step": 31123 + }, + { + "epoch": 0.7991770458707057, + "grad_norm": 0.76171875, + "learning_rate": 0.00011202639586252484, + "loss": 0.7825, + "step": 31124 + }, + { + "epoch": 0.7992027230666275, + "grad_norm": 0.7734375, + "learning_rate": 0.00011202196405025188, + "loss": 0.8542, + "step": 31125 + }, + { + "epoch": 0.7992284002625494, + "grad_norm": 0.78125, + "learning_rate": 0.0001120175322140202, + "loss": 0.7235, + "step": 31126 + }, + { + "epoch": 0.7992540774584711, + "grad_norm": 0.8515625, + "learning_rate": 0.0001120131003538387, + "loss": 0.7653, + "step": 31127 + }, + { + "epoch": 0.7992797546543929, + "grad_norm": 0.87890625, + "learning_rate": 0.00011200866846971616, + "loss": 0.9077, + "step": 31128 + }, + { + "epoch": 0.7993054318503148, + "grad_norm": 0.77734375, + "learning_rate": 0.00011200423656166146, + "loss": 0.744, + "step": 31129 + }, + { + "epoch": 0.7993311090462366, + "grad_norm": 0.7421875, + "learning_rate": 0.00011199980462968335, + "loss": 0.759, + "step": 31130 + }, + { + "epoch": 0.7993567862421584, + "grad_norm": 0.7890625, + "learning_rate": 0.00011199537267379074, + "loss": 0.7641, + "step": 31131 + }, + { + "epoch": 0.7993824634380803, + "grad_norm": 0.8046875, + "learning_rate": 0.00011199094069399243, + "loss": 0.8474, + "step": 31132 + }, + { + "epoch": 0.7994081406340021, + "grad_norm": 0.78125, + "learning_rate": 0.00011198650869029728, + "loss": 0.8161, + "step": 31133 + }, + { + "epoch": 0.7994338178299238, + "grad_norm": 0.79296875, + "learning_rate": 0.00011198207666271406, + "loss": 1.0011, + "step": 31134 + }, + { + "epoch": 0.7994594950258457, + "grad_norm": 0.7265625, + "learning_rate": 0.00011197764461125168, + "loss": 0.7805, + "step": 31135 + }, + { + "epoch": 0.7994851722217675, + "grad_norm": 0.83984375, + "learning_rate": 0.00011197321253591892, + "loss": 0.7634, + "step": 31136 + }, + { + "epoch": 0.7995108494176894, + "grad_norm": 0.76953125, + "learning_rate": 0.00011196878043672463, + "loss": 0.7947, + "step": 31137 + }, + { + "epoch": 0.7995365266136112, + "grad_norm": 0.7734375, + "learning_rate": 0.00011196434831367765, + "loss": 0.9936, + "step": 31138 + }, + { + "epoch": 0.799562203809533, + "grad_norm": 0.80859375, + "learning_rate": 0.00011195991616678678, + "loss": 0.9479, + "step": 31139 + }, + { + "epoch": 0.7995878810054547, + "grad_norm": 0.875, + "learning_rate": 0.0001119554839960609, + "loss": 0.8846, + "step": 31140 + }, + { + "epoch": 0.7996135582013766, + "grad_norm": 0.875, + "learning_rate": 0.00011195105180150883, + "loss": 0.8503, + "step": 31141 + }, + { + "epoch": 0.7996392353972984, + "grad_norm": 0.8359375, + "learning_rate": 0.0001119466195831394, + "loss": 0.9199, + "step": 31142 + }, + { + "epoch": 0.7996649125932203, + "grad_norm": 0.796875, + "learning_rate": 0.00011194218734096141, + "loss": 0.8568, + "step": 31143 + }, + { + "epoch": 0.7996905897891421, + "grad_norm": 0.78125, + "learning_rate": 0.0001119377550749837, + "loss": 0.9072, + "step": 31144 + }, + { + "epoch": 0.7997162669850639, + "grad_norm": 0.75390625, + "learning_rate": 0.00011193332278521516, + "loss": 0.9273, + "step": 31145 + }, + { + "epoch": 0.7997419441809858, + "grad_norm": 0.83984375, + "learning_rate": 0.00011192889047166458, + "loss": 0.904, + "step": 31146 + }, + { + "epoch": 0.7997676213769075, + "grad_norm": 0.765625, + "learning_rate": 0.00011192445813434076, + "loss": 0.6912, + "step": 31147 + }, + { + "epoch": 0.7997932985728293, + "grad_norm": 0.78515625, + "learning_rate": 0.00011192002577325262, + "loss": 0.846, + "step": 31148 + }, + { + "epoch": 0.7998189757687512, + "grad_norm": 0.75, + "learning_rate": 0.00011191559338840894, + "loss": 0.8739, + "step": 31149 + }, + { + "epoch": 0.799844652964673, + "grad_norm": 0.8671875, + "learning_rate": 0.00011191116097981853, + "loss": 0.9591, + "step": 31150 + }, + { + "epoch": 0.7998703301605948, + "grad_norm": 0.75, + "learning_rate": 0.00011190672854749028, + "loss": 0.7577, + "step": 31151 + }, + { + "epoch": 0.7998960073565167, + "grad_norm": 0.8125, + "learning_rate": 0.00011190229609143296, + "loss": 0.8407, + "step": 31152 + }, + { + "epoch": 0.7999216845524385, + "grad_norm": 0.75390625, + "learning_rate": 0.00011189786361165548, + "loss": 0.8808, + "step": 31153 + }, + { + "epoch": 0.7999473617483602, + "grad_norm": 0.89453125, + "learning_rate": 0.00011189343110816662, + "loss": 0.9339, + "step": 31154 + }, + { + "epoch": 0.7999730389442821, + "grad_norm": 0.75, + "learning_rate": 0.00011188899858097523, + "loss": 0.789, + "step": 31155 + }, + { + "epoch": 0.7999987161402039, + "grad_norm": 0.73046875, + "learning_rate": 0.00011188456603009014, + "loss": 0.7595, + "step": 31156 + }, + { + "epoch": 0.8000243933361257, + "grad_norm": 0.75390625, + "learning_rate": 0.00011188013345552017, + "loss": 0.8743, + "step": 31157 + }, + { + "epoch": 0.8000500705320476, + "grad_norm": 0.71484375, + "learning_rate": 0.00011187570085727416, + "loss": 0.6794, + "step": 31158 + }, + { + "epoch": 0.8000757477279694, + "grad_norm": 0.7421875, + "learning_rate": 0.00011187126823536098, + "loss": 0.8968, + "step": 31159 + }, + { + "epoch": 0.8001014249238911, + "grad_norm": 0.80078125, + "learning_rate": 0.00011186683558978941, + "loss": 0.8828, + "step": 31160 + }, + { + "epoch": 0.800127102119813, + "grad_norm": 0.7265625, + "learning_rate": 0.00011186240292056831, + "loss": 0.7428, + "step": 31161 + }, + { + "epoch": 0.8001527793157348, + "grad_norm": 0.77734375, + "learning_rate": 0.00011185797022770655, + "loss": 0.7329, + "step": 31162 + }, + { + "epoch": 0.8001784565116566, + "grad_norm": 0.80078125, + "learning_rate": 0.00011185353751121288, + "loss": 0.7455, + "step": 31163 + }, + { + "epoch": 0.8002041337075785, + "grad_norm": 0.76171875, + "learning_rate": 0.0001118491047710962, + "loss": 0.7923, + "step": 31164 + }, + { + "epoch": 0.8002298109035003, + "grad_norm": 0.75390625, + "learning_rate": 0.00011184467200736533, + "loss": 0.7507, + "step": 31165 + }, + { + "epoch": 0.8002554880994222, + "grad_norm": 0.79296875, + "learning_rate": 0.00011184023922002907, + "loss": 0.8057, + "step": 31166 + }, + { + "epoch": 0.8002811652953439, + "grad_norm": 0.84375, + "learning_rate": 0.0001118358064090963, + "loss": 0.7379, + "step": 31167 + }, + { + "epoch": 0.8003068424912657, + "grad_norm": 0.73828125, + "learning_rate": 0.00011183137357457585, + "loss": 0.8234, + "step": 31168 + }, + { + "epoch": 0.8003325196871875, + "grad_norm": 0.765625, + "learning_rate": 0.00011182694071647653, + "loss": 0.7629, + "step": 31169 + }, + { + "epoch": 0.8003581968831094, + "grad_norm": 0.87109375, + "learning_rate": 0.0001118225078348072, + "loss": 0.7261, + "step": 31170 + }, + { + "epoch": 0.8003838740790312, + "grad_norm": 0.9609375, + "learning_rate": 0.00011181807492957666, + "loss": 0.7847, + "step": 31171 + }, + { + "epoch": 0.8004095512749531, + "grad_norm": 0.78125, + "learning_rate": 0.00011181364200079376, + "loss": 0.8114, + "step": 31172 + }, + { + "epoch": 0.8004352284708749, + "grad_norm": 0.78515625, + "learning_rate": 0.00011180920904846734, + "loss": 0.8056, + "step": 31173 + }, + { + "epoch": 0.8004609056667966, + "grad_norm": 0.78515625, + "learning_rate": 0.00011180477607260625, + "loss": 0.831, + "step": 31174 + }, + { + "epoch": 0.8004865828627185, + "grad_norm": 1.4375, + "learning_rate": 0.00011180034307321932, + "loss": 0.7177, + "step": 31175 + }, + { + "epoch": 0.8005122600586403, + "grad_norm": 0.81640625, + "learning_rate": 0.00011179591005031533, + "loss": 0.8903, + "step": 31176 + }, + { + "epoch": 0.8005379372545621, + "grad_norm": 0.734375, + "learning_rate": 0.00011179147700390317, + "loss": 0.7163, + "step": 31177 + }, + { + "epoch": 0.800563614450484, + "grad_norm": 0.8203125, + "learning_rate": 0.00011178704393399169, + "loss": 0.876, + "step": 31178 + }, + { + "epoch": 0.8005892916464058, + "grad_norm": 0.8046875, + "learning_rate": 0.00011178261084058965, + "loss": 0.902, + "step": 31179 + }, + { + "epoch": 0.8006149688423275, + "grad_norm": 0.79296875, + "learning_rate": 0.00011177817772370598, + "loss": 0.7961, + "step": 31180 + }, + { + "epoch": 0.8006406460382494, + "grad_norm": 0.76171875, + "learning_rate": 0.00011177374458334943, + "loss": 0.7817, + "step": 31181 + }, + { + "epoch": 0.8006663232341712, + "grad_norm": 0.8203125, + "learning_rate": 0.00011176931141952888, + "loss": 0.8456, + "step": 31182 + }, + { + "epoch": 0.800692000430093, + "grad_norm": 0.79296875, + "learning_rate": 0.00011176487823225319, + "loss": 0.8838, + "step": 31183 + }, + { + "epoch": 0.8007176776260149, + "grad_norm": 0.75, + "learning_rate": 0.00011176044502153111, + "loss": 0.7149, + "step": 31184 + }, + { + "epoch": 0.8007433548219367, + "grad_norm": 0.765625, + "learning_rate": 0.00011175601178737154, + "loss": 0.8563, + "step": 31185 + }, + { + "epoch": 0.8007690320178585, + "grad_norm": 0.8046875, + "learning_rate": 0.0001117515785297833, + "loss": 0.7379, + "step": 31186 + }, + { + "epoch": 0.8007947092137803, + "grad_norm": 0.8359375, + "learning_rate": 0.00011174714524877523, + "loss": 0.8819, + "step": 31187 + }, + { + "epoch": 0.8008203864097021, + "grad_norm": 0.77734375, + "learning_rate": 0.0001117427119443562, + "loss": 0.7321, + "step": 31188 + }, + { + "epoch": 0.8008460636056239, + "grad_norm": 0.8203125, + "learning_rate": 0.00011173827861653495, + "loss": 0.8901, + "step": 31189 + }, + { + "epoch": 0.8008717408015458, + "grad_norm": 0.76953125, + "learning_rate": 0.00011173384526532038, + "loss": 0.7836, + "step": 31190 + }, + { + "epoch": 0.8008974179974676, + "grad_norm": 0.83203125, + "learning_rate": 0.00011172941189072132, + "loss": 1.0131, + "step": 31191 + }, + { + "epoch": 0.8009230951933894, + "grad_norm": 0.7578125, + "learning_rate": 0.00011172497849274659, + "loss": 0.8841, + "step": 31192 + }, + { + "epoch": 0.8009487723893112, + "grad_norm": 0.71484375, + "learning_rate": 0.00011172054507140505, + "loss": 0.7767, + "step": 31193 + }, + { + "epoch": 0.800974449585233, + "grad_norm": 0.80859375, + "learning_rate": 0.00011171611162670557, + "loss": 0.9495, + "step": 31194 + }, + { + "epoch": 0.8010001267811548, + "grad_norm": 0.890625, + "learning_rate": 0.00011171167815865689, + "loss": 0.9129, + "step": 31195 + }, + { + "epoch": 0.8010258039770767, + "grad_norm": 0.70703125, + "learning_rate": 0.0001117072446672679, + "loss": 0.7605, + "step": 31196 + }, + { + "epoch": 0.8010514811729985, + "grad_norm": 0.79296875, + "learning_rate": 0.00011170281115254742, + "loss": 0.8397, + "step": 31197 + }, + { + "epoch": 0.8010771583689204, + "grad_norm": 0.75390625, + "learning_rate": 0.00011169837761450428, + "loss": 0.7396, + "step": 31198 + }, + { + "epoch": 0.8011028355648422, + "grad_norm": 0.765625, + "learning_rate": 0.00011169394405314734, + "loss": 0.7778, + "step": 31199 + }, + { + "epoch": 0.8011285127607639, + "grad_norm": 0.76171875, + "learning_rate": 0.00011168951046848546, + "loss": 0.812, + "step": 31200 + }, + { + "epoch": 0.8011541899566857, + "grad_norm": 0.76171875, + "learning_rate": 0.00011168507686052742, + "loss": 0.7891, + "step": 31201 + }, + { + "epoch": 0.8011798671526076, + "grad_norm": 0.796875, + "learning_rate": 0.0001116806432292821, + "loss": 0.8013, + "step": 31202 + }, + { + "epoch": 0.8012055443485294, + "grad_norm": 0.66015625, + "learning_rate": 0.00011167620957475824, + "loss": 0.6219, + "step": 31203 + }, + { + "epoch": 0.8012312215444513, + "grad_norm": 0.8359375, + "learning_rate": 0.00011167177589696481, + "loss": 0.8364, + "step": 31204 + }, + { + "epoch": 0.8012568987403731, + "grad_norm": 0.72265625, + "learning_rate": 0.00011166734219591058, + "loss": 0.7966, + "step": 31205 + }, + { + "epoch": 0.8012825759362949, + "grad_norm": 0.8046875, + "learning_rate": 0.00011166290847160437, + "loss": 0.7794, + "step": 31206 + }, + { + "epoch": 0.8013082531322167, + "grad_norm": 0.73828125, + "learning_rate": 0.00011165847472405504, + "loss": 0.8072, + "step": 31207 + }, + { + "epoch": 0.8013339303281385, + "grad_norm": 0.82421875, + "learning_rate": 0.00011165404095327145, + "loss": 0.9305, + "step": 31208 + }, + { + "epoch": 0.8013596075240603, + "grad_norm": 0.734375, + "learning_rate": 0.0001116496071592624, + "loss": 0.8029, + "step": 31209 + }, + { + "epoch": 0.8013852847199822, + "grad_norm": 0.765625, + "learning_rate": 0.00011164517334203672, + "loss": 0.8203, + "step": 31210 + }, + { + "epoch": 0.801410961915904, + "grad_norm": 0.76953125, + "learning_rate": 0.00011164073950160326, + "loss": 0.787, + "step": 31211 + }, + { + "epoch": 0.8014366391118258, + "grad_norm": 0.80859375, + "learning_rate": 0.00011163630563797088, + "loss": 0.7167, + "step": 31212 + }, + { + "epoch": 0.8014623163077476, + "grad_norm": 0.765625, + "learning_rate": 0.00011163187175114838, + "loss": 0.7565, + "step": 31213 + }, + { + "epoch": 0.8014879935036694, + "grad_norm": 0.87890625, + "learning_rate": 0.0001116274378411446, + "loss": 0.8024, + "step": 31214 + }, + { + "epoch": 0.8015136706995912, + "grad_norm": 0.8046875, + "learning_rate": 0.0001116230039079684, + "loss": 0.8583, + "step": 31215 + }, + { + "epoch": 0.8015393478955131, + "grad_norm": 0.7890625, + "learning_rate": 0.0001116185699516286, + "loss": 0.8499, + "step": 31216 + }, + { + "epoch": 0.8015650250914349, + "grad_norm": 0.80078125, + "learning_rate": 0.00011161413597213404, + "loss": 0.7731, + "step": 31217 + }, + { + "epoch": 0.8015907022873567, + "grad_norm": 0.7890625, + "learning_rate": 0.00011160970196949357, + "loss": 0.7345, + "step": 31218 + }, + { + "epoch": 0.8016163794832786, + "grad_norm": 0.83984375, + "learning_rate": 0.00011160526794371597, + "loss": 0.8216, + "step": 31219 + }, + { + "epoch": 0.8016420566792003, + "grad_norm": 0.76953125, + "learning_rate": 0.00011160083389481015, + "loss": 0.7915, + "step": 31220 + }, + { + "epoch": 0.8016677338751221, + "grad_norm": 0.6796875, + "learning_rate": 0.00011159639982278491, + "loss": 0.8327, + "step": 31221 + }, + { + "epoch": 0.801693411071044, + "grad_norm": 0.85546875, + "learning_rate": 0.00011159196572764912, + "loss": 0.7842, + "step": 31222 + }, + { + "epoch": 0.8017190882669658, + "grad_norm": 0.765625, + "learning_rate": 0.00011158753160941158, + "loss": 0.8674, + "step": 31223 + }, + { + "epoch": 0.8017447654628876, + "grad_norm": 0.8359375, + "learning_rate": 0.00011158309746808108, + "loss": 0.8157, + "step": 31224 + }, + { + "epoch": 0.8017704426588095, + "grad_norm": 0.7734375, + "learning_rate": 0.00011157866330366657, + "loss": 0.7017, + "step": 31225 + }, + { + "epoch": 0.8017961198547313, + "grad_norm": 0.8984375, + "learning_rate": 0.00011157422911617684, + "loss": 0.7781, + "step": 31226 + }, + { + "epoch": 0.801821797050653, + "grad_norm": 0.80078125, + "learning_rate": 0.00011156979490562067, + "loss": 0.8302, + "step": 31227 + }, + { + "epoch": 0.8018474742465749, + "grad_norm": 0.78515625, + "learning_rate": 0.00011156536067200698, + "loss": 0.813, + "step": 31228 + }, + { + "epoch": 0.8018731514424967, + "grad_norm": 0.73828125, + "learning_rate": 0.00011156092641534455, + "loss": 0.7662, + "step": 31229 + }, + { + "epoch": 0.8018988286384185, + "grad_norm": 0.78515625, + "learning_rate": 0.00011155649213564223, + "loss": 0.8085, + "step": 31230 + }, + { + "epoch": 0.8019245058343404, + "grad_norm": 0.80078125, + "learning_rate": 0.00011155205783290889, + "loss": 0.7703, + "step": 31231 + }, + { + "epoch": 0.8019501830302622, + "grad_norm": 0.828125, + "learning_rate": 0.0001115476235071533, + "loss": 0.8923, + "step": 31232 + }, + { + "epoch": 0.801975860226184, + "grad_norm": 0.77734375, + "learning_rate": 0.0001115431891583844, + "loss": 0.7379, + "step": 31233 + }, + { + "epoch": 0.8020015374221058, + "grad_norm": 0.82421875, + "learning_rate": 0.00011153875478661095, + "loss": 0.8632, + "step": 31234 + }, + { + "epoch": 0.8020272146180276, + "grad_norm": 0.76171875, + "learning_rate": 0.0001115343203918418, + "loss": 0.7402, + "step": 31235 + }, + { + "epoch": 0.8020528918139495, + "grad_norm": 0.97265625, + "learning_rate": 0.00011152988597408579, + "loss": 0.7898, + "step": 31236 + }, + { + "epoch": 0.8020785690098713, + "grad_norm": 0.859375, + "learning_rate": 0.00011152545153335174, + "loss": 0.882, + "step": 31237 + }, + { + "epoch": 0.8021042462057931, + "grad_norm": 0.85546875, + "learning_rate": 0.0001115210170696485, + "loss": 0.8332, + "step": 31238 + }, + { + "epoch": 0.802129923401715, + "grad_norm": 0.8125, + "learning_rate": 0.00011151658258298494, + "loss": 0.8668, + "step": 31239 + }, + { + "epoch": 0.8021556005976367, + "grad_norm": 0.75390625, + "learning_rate": 0.00011151214807336985, + "loss": 0.7059, + "step": 31240 + }, + { + "epoch": 0.8021812777935585, + "grad_norm": 0.859375, + "learning_rate": 0.00011150771354081213, + "loss": 0.8581, + "step": 31241 + }, + { + "epoch": 0.8022069549894804, + "grad_norm": 0.7734375, + "learning_rate": 0.00011150327898532056, + "loss": 0.7899, + "step": 31242 + }, + { + "epoch": 0.8022326321854022, + "grad_norm": 0.71875, + "learning_rate": 0.00011149884440690395, + "loss": 0.7824, + "step": 31243 + }, + { + "epoch": 0.802258309381324, + "grad_norm": 0.8125, + "learning_rate": 0.00011149440980557122, + "loss": 0.8297, + "step": 31244 + }, + { + "epoch": 0.8022839865772459, + "grad_norm": 0.7890625, + "learning_rate": 0.00011148997518133117, + "loss": 0.8739, + "step": 31245 + }, + { + "epoch": 0.8023096637731677, + "grad_norm": 0.84765625, + "learning_rate": 0.00011148554053419262, + "loss": 0.7916, + "step": 31246 + }, + { + "epoch": 0.8023353409690894, + "grad_norm": 0.84765625, + "learning_rate": 0.00011148110586416445, + "loss": 1.0033, + "step": 31247 + }, + { + "epoch": 0.8023610181650113, + "grad_norm": 0.8046875, + "learning_rate": 0.00011147667117125544, + "loss": 0.7572, + "step": 31248 + }, + { + "epoch": 0.8023866953609331, + "grad_norm": 0.8828125, + "learning_rate": 0.00011147223645547448, + "loss": 0.8845, + "step": 31249 + }, + { + "epoch": 0.8024123725568549, + "grad_norm": 0.796875, + "learning_rate": 0.00011146780171683042, + "loss": 0.8943, + "step": 31250 + }, + { + "epoch": 0.8024380497527768, + "grad_norm": 0.84375, + "learning_rate": 0.00011146336695533202, + "loss": 0.9117, + "step": 31251 + }, + { + "epoch": 0.8024637269486986, + "grad_norm": 0.8203125, + "learning_rate": 0.00011145893217098817, + "loss": 0.8424, + "step": 31252 + }, + { + "epoch": 0.8024894041446203, + "grad_norm": 0.75, + "learning_rate": 0.00011145449736380773, + "loss": 0.7928, + "step": 31253 + }, + { + "epoch": 0.8025150813405422, + "grad_norm": 0.7890625, + "learning_rate": 0.00011145006253379951, + "loss": 0.8108, + "step": 31254 + }, + { + "epoch": 0.802540758536464, + "grad_norm": 0.80859375, + "learning_rate": 0.00011144562768097233, + "loss": 0.9099, + "step": 31255 + }, + { + "epoch": 0.8025664357323858, + "grad_norm": 0.76953125, + "learning_rate": 0.00011144119280533503, + "loss": 0.7443, + "step": 31256 + }, + { + "epoch": 0.8025921129283077, + "grad_norm": 0.734375, + "learning_rate": 0.0001114367579068965, + "loss": 0.6807, + "step": 31257 + }, + { + "epoch": 0.8026177901242295, + "grad_norm": 0.77734375, + "learning_rate": 0.00011143232298566554, + "loss": 0.7984, + "step": 31258 + }, + { + "epoch": 0.8026434673201513, + "grad_norm": 0.77734375, + "learning_rate": 0.00011142788804165096, + "loss": 0.8226, + "step": 31259 + }, + { + "epoch": 0.8026691445160731, + "grad_norm": 0.87109375, + "learning_rate": 0.0001114234530748617, + "loss": 0.8502, + "step": 31260 + }, + { + "epoch": 0.8026948217119949, + "grad_norm": 0.8515625, + "learning_rate": 0.00011141901808530648, + "loss": 0.905, + "step": 31261 + }, + { + "epoch": 0.8027204989079167, + "grad_norm": 0.8515625, + "learning_rate": 0.0001114145830729942, + "loss": 0.6318, + "step": 31262 + }, + { + "epoch": 0.8027461761038386, + "grad_norm": 0.77734375, + "learning_rate": 0.00011141014803793368, + "loss": 0.8963, + "step": 31263 + }, + { + "epoch": 0.8027718532997604, + "grad_norm": 0.6796875, + "learning_rate": 0.00011140571298013376, + "loss": 0.6852, + "step": 31264 + }, + { + "epoch": 0.8027975304956823, + "grad_norm": 0.72265625, + "learning_rate": 0.00011140127789960329, + "loss": 0.8018, + "step": 31265 + }, + { + "epoch": 0.8028232076916041, + "grad_norm": 0.8203125, + "learning_rate": 0.00011139684279635112, + "loss": 0.8235, + "step": 31266 + }, + { + "epoch": 0.8028488848875258, + "grad_norm": 0.796875, + "learning_rate": 0.00011139240767038606, + "loss": 0.7624, + "step": 31267 + }, + { + "epoch": 0.8028745620834477, + "grad_norm": 0.79296875, + "learning_rate": 0.00011138797252171698, + "loss": 0.822, + "step": 31268 + }, + { + "epoch": 0.8029002392793695, + "grad_norm": 0.93359375, + "learning_rate": 0.00011138353735035265, + "loss": 0.8804, + "step": 31269 + }, + { + "epoch": 0.8029259164752913, + "grad_norm": 0.765625, + "learning_rate": 0.00011137910215630198, + "loss": 0.7561, + "step": 31270 + }, + { + "epoch": 0.8029515936712132, + "grad_norm": 0.74609375, + "learning_rate": 0.00011137466693957381, + "loss": 0.7841, + "step": 31271 + }, + { + "epoch": 0.802977270867135, + "grad_norm": 0.75390625, + "learning_rate": 0.00011137023170017693, + "loss": 0.8388, + "step": 31272 + }, + { + "epoch": 0.8030029480630567, + "grad_norm": 0.796875, + "learning_rate": 0.00011136579643812024, + "loss": 0.8293, + "step": 31273 + }, + { + "epoch": 0.8030286252589786, + "grad_norm": 0.7578125, + "learning_rate": 0.00011136136115341253, + "loss": 0.7188, + "step": 31274 + }, + { + "epoch": 0.8030543024549004, + "grad_norm": 0.79296875, + "learning_rate": 0.00011135692584606263, + "loss": 0.8149, + "step": 31275 + }, + { + "epoch": 0.8030799796508222, + "grad_norm": 0.74609375, + "learning_rate": 0.00011135249051607942, + "loss": 0.7062, + "step": 31276 + }, + { + "epoch": 0.8031056568467441, + "grad_norm": 0.79296875, + "learning_rate": 0.00011134805516347168, + "loss": 0.7815, + "step": 31277 + }, + { + "epoch": 0.8031313340426659, + "grad_norm": 0.8671875, + "learning_rate": 0.00011134361978824835, + "loss": 0.7481, + "step": 31278 + }, + { + "epoch": 0.8031570112385877, + "grad_norm": 0.859375, + "learning_rate": 0.00011133918439041819, + "loss": 1.0314, + "step": 31279 + }, + { + "epoch": 0.8031826884345095, + "grad_norm": 0.83984375, + "learning_rate": 0.00011133474896999007, + "loss": 0.8458, + "step": 31280 + }, + { + "epoch": 0.8032083656304313, + "grad_norm": 0.8046875, + "learning_rate": 0.00011133031352697281, + "loss": 0.8374, + "step": 31281 + }, + { + "epoch": 0.8032340428263531, + "grad_norm": 0.703125, + "learning_rate": 0.00011132587806137527, + "loss": 0.7612, + "step": 31282 + }, + { + "epoch": 0.803259720022275, + "grad_norm": 0.890625, + "learning_rate": 0.00011132144257320625, + "loss": 0.8692, + "step": 31283 + }, + { + "epoch": 0.8032853972181968, + "grad_norm": 0.7890625, + "learning_rate": 0.00011131700706247463, + "loss": 0.7994, + "step": 31284 + }, + { + "epoch": 0.8033110744141186, + "grad_norm": 0.77734375, + "learning_rate": 0.00011131257152918926, + "loss": 0.9498, + "step": 31285 + }, + { + "epoch": 0.8033367516100405, + "grad_norm": 0.82421875, + "learning_rate": 0.00011130813597335891, + "loss": 0.834, + "step": 31286 + }, + { + "epoch": 0.8033624288059622, + "grad_norm": 0.78515625, + "learning_rate": 0.00011130370039499252, + "loss": 0.8033, + "step": 31287 + }, + { + "epoch": 0.803388106001884, + "grad_norm": 0.73828125, + "learning_rate": 0.00011129926479409884, + "loss": 0.8042, + "step": 31288 + }, + { + "epoch": 0.8034137831978059, + "grad_norm": 0.67578125, + "learning_rate": 0.00011129482917068676, + "loss": 0.7272, + "step": 31289 + }, + { + "epoch": 0.8034394603937277, + "grad_norm": 0.77734375, + "learning_rate": 0.00011129039352476512, + "loss": 0.7683, + "step": 31290 + }, + { + "epoch": 0.8034651375896495, + "grad_norm": 0.8046875, + "learning_rate": 0.00011128595785634271, + "loss": 0.778, + "step": 31291 + }, + { + "epoch": 0.8034908147855714, + "grad_norm": 0.796875, + "learning_rate": 0.00011128152216542842, + "loss": 0.8003, + "step": 31292 + }, + { + "epoch": 0.8035164919814931, + "grad_norm": 0.78125, + "learning_rate": 0.00011127708645203108, + "loss": 0.8224, + "step": 31293 + }, + { + "epoch": 0.8035421691774149, + "grad_norm": 0.8203125, + "learning_rate": 0.00011127265071615953, + "loss": 0.9056, + "step": 31294 + }, + { + "epoch": 0.8035678463733368, + "grad_norm": 0.80078125, + "learning_rate": 0.0001112682149578226, + "loss": 0.7599, + "step": 31295 + }, + { + "epoch": 0.8035935235692586, + "grad_norm": 0.74609375, + "learning_rate": 0.00011126377917702913, + "loss": 0.8761, + "step": 31296 + }, + { + "epoch": 0.8036192007651805, + "grad_norm": 0.7578125, + "learning_rate": 0.00011125934337378797, + "loss": 0.8427, + "step": 31297 + }, + { + "epoch": 0.8036448779611023, + "grad_norm": 0.734375, + "learning_rate": 0.00011125490754810794, + "loss": 0.7499, + "step": 31298 + }, + { + "epoch": 0.8036705551570241, + "grad_norm": 0.87109375, + "learning_rate": 0.00011125047169999792, + "loss": 0.8219, + "step": 31299 + }, + { + "epoch": 0.8036962323529458, + "grad_norm": 0.80859375, + "learning_rate": 0.00011124603582946674, + "loss": 0.8602, + "step": 31300 + }, + { + "epoch": 0.8037219095488677, + "grad_norm": 0.78515625, + "learning_rate": 0.00011124159993652319, + "loss": 0.7473, + "step": 31301 + }, + { + "epoch": 0.8037475867447895, + "grad_norm": 1.0078125, + "learning_rate": 0.00011123716402117615, + "loss": 0.8305, + "step": 31302 + }, + { + "epoch": 0.8037732639407114, + "grad_norm": 0.79296875, + "learning_rate": 0.00011123272808343448, + "loss": 0.7608, + "step": 31303 + }, + { + "epoch": 0.8037989411366332, + "grad_norm": 0.75, + "learning_rate": 0.00011122829212330696, + "loss": 0.9022, + "step": 31304 + }, + { + "epoch": 0.803824618332555, + "grad_norm": 0.84375, + "learning_rate": 0.0001112238561408025, + "loss": 0.9232, + "step": 31305 + }, + { + "epoch": 0.8038502955284769, + "grad_norm": 0.828125, + "learning_rate": 0.00011121942013592992, + "loss": 0.8816, + "step": 31306 + }, + { + "epoch": 0.8038759727243986, + "grad_norm": 0.80859375, + "learning_rate": 0.00011121498410869801, + "loss": 0.8702, + "step": 31307 + }, + { + "epoch": 0.8039016499203204, + "grad_norm": 0.83203125, + "learning_rate": 0.00011121054805911568, + "loss": 0.7947, + "step": 31308 + }, + { + "epoch": 0.8039273271162423, + "grad_norm": 0.765625, + "learning_rate": 0.00011120611198719173, + "loss": 0.9225, + "step": 31309 + }, + { + "epoch": 0.8039530043121641, + "grad_norm": 0.73828125, + "learning_rate": 0.00011120167589293503, + "loss": 0.6656, + "step": 31310 + }, + { + "epoch": 0.8039786815080859, + "grad_norm": 0.75, + "learning_rate": 0.00011119723977635437, + "loss": 0.7239, + "step": 31311 + }, + { + "epoch": 0.8040043587040078, + "grad_norm": 0.8359375, + "learning_rate": 0.00011119280363745864, + "loss": 0.8729, + "step": 31312 + }, + { + "epoch": 0.8040300358999295, + "grad_norm": 0.703125, + "learning_rate": 0.00011118836747625667, + "loss": 0.7709, + "step": 31313 + }, + { + "epoch": 0.8040557130958513, + "grad_norm": 0.71484375, + "learning_rate": 0.00011118393129275726, + "loss": 0.7436, + "step": 31314 + }, + { + "epoch": 0.8040813902917732, + "grad_norm": 0.8125, + "learning_rate": 0.00011117949508696932, + "loss": 0.8732, + "step": 31315 + }, + { + "epoch": 0.804107067487695, + "grad_norm": 0.79296875, + "learning_rate": 0.00011117505885890164, + "loss": 0.8441, + "step": 31316 + }, + { + "epoch": 0.8041327446836168, + "grad_norm": 0.81640625, + "learning_rate": 0.00011117062260856308, + "loss": 0.8718, + "step": 31317 + }, + { + "epoch": 0.8041584218795387, + "grad_norm": 0.7734375, + "learning_rate": 0.0001111661863359625, + "loss": 0.8045, + "step": 31318 + }, + { + "epoch": 0.8041840990754605, + "grad_norm": 0.796875, + "learning_rate": 0.00011116175004110872, + "loss": 0.8479, + "step": 31319 + }, + { + "epoch": 0.8042097762713822, + "grad_norm": 0.87890625, + "learning_rate": 0.00011115731372401055, + "loss": 0.7411, + "step": 31320 + }, + { + "epoch": 0.8042354534673041, + "grad_norm": 0.7265625, + "learning_rate": 0.00011115287738467686, + "loss": 0.7932, + "step": 31321 + }, + { + "epoch": 0.8042611306632259, + "grad_norm": 1.0703125, + "learning_rate": 0.00011114844102311652, + "loss": 0.8519, + "step": 31322 + }, + { + "epoch": 0.8042868078591477, + "grad_norm": 0.7578125, + "learning_rate": 0.00011114400463933832, + "loss": 0.9141, + "step": 31323 + }, + { + "epoch": 0.8043124850550696, + "grad_norm": 1.046875, + "learning_rate": 0.00011113956823335113, + "loss": 0.8598, + "step": 31324 + }, + { + "epoch": 0.8043381622509914, + "grad_norm": 0.81640625, + "learning_rate": 0.0001111351318051638, + "loss": 0.7984, + "step": 31325 + }, + { + "epoch": 0.8043638394469133, + "grad_norm": 0.78125, + "learning_rate": 0.00011113069535478515, + "loss": 0.8168, + "step": 31326 + }, + { + "epoch": 0.804389516642835, + "grad_norm": 0.74609375, + "learning_rate": 0.00011112625888222403, + "loss": 0.9297, + "step": 31327 + }, + { + "epoch": 0.8044151938387568, + "grad_norm": 0.78515625, + "learning_rate": 0.00011112182238748927, + "loss": 0.8622, + "step": 31328 + }, + { + "epoch": 0.8044408710346787, + "grad_norm": 0.81640625, + "learning_rate": 0.00011111738587058973, + "loss": 0.7316, + "step": 31329 + }, + { + "epoch": 0.8044665482306005, + "grad_norm": 0.75390625, + "learning_rate": 0.00011111294933153427, + "loss": 0.8149, + "step": 31330 + }, + { + "epoch": 0.8044922254265223, + "grad_norm": 0.83203125, + "learning_rate": 0.00011110851277033166, + "loss": 0.9185, + "step": 31331 + }, + { + "epoch": 0.8045179026224442, + "grad_norm": 0.6953125, + "learning_rate": 0.00011110407618699081, + "loss": 0.7046, + "step": 31332 + }, + { + "epoch": 0.8045435798183659, + "grad_norm": 0.76171875, + "learning_rate": 0.00011109963958152055, + "loss": 0.8389, + "step": 31333 + }, + { + "epoch": 0.8045692570142877, + "grad_norm": 0.73828125, + "learning_rate": 0.0001110952029539297, + "loss": 0.7398, + "step": 31334 + }, + { + "epoch": 0.8045949342102096, + "grad_norm": 0.78515625, + "learning_rate": 0.00011109076630422713, + "loss": 0.809, + "step": 31335 + }, + { + "epoch": 0.8046206114061314, + "grad_norm": 0.78515625, + "learning_rate": 0.00011108632963242163, + "loss": 0.8308, + "step": 31336 + }, + { + "epoch": 0.8046462886020532, + "grad_norm": 0.7734375, + "learning_rate": 0.00011108189293852209, + "loss": 0.7306, + "step": 31337 + }, + { + "epoch": 0.8046719657979751, + "grad_norm": 0.77734375, + "learning_rate": 0.00011107745622253734, + "loss": 0.9142, + "step": 31338 + }, + { + "epoch": 0.8046976429938969, + "grad_norm": 0.77734375, + "learning_rate": 0.00011107301948447623, + "loss": 0.7894, + "step": 31339 + }, + { + "epoch": 0.8047233201898186, + "grad_norm": 0.765625, + "learning_rate": 0.00011106858272434759, + "loss": 0.8178, + "step": 31340 + }, + { + "epoch": 0.8047489973857405, + "grad_norm": 0.91015625, + "learning_rate": 0.00011106414594216023, + "loss": 0.9009, + "step": 31341 + }, + { + "epoch": 0.8047746745816623, + "grad_norm": 0.73046875, + "learning_rate": 0.00011105970913792307, + "loss": 0.709, + "step": 31342 + }, + { + "epoch": 0.8048003517775841, + "grad_norm": 1.046875, + "learning_rate": 0.0001110552723116449, + "loss": 0.9397, + "step": 31343 + }, + { + "epoch": 0.804826028973506, + "grad_norm": 0.72265625, + "learning_rate": 0.00011105083546333454, + "loss": 0.9221, + "step": 31344 + }, + { + "epoch": 0.8048517061694278, + "grad_norm": 0.8203125, + "learning_rate": 0.0001110463985930009, + "loss": 0.8746, + "step": 31345 + }, + { + "epoch": 0.8048773833653496, + "grad_norm": 0.75, + "learning_rate": 0.0001110419617006528, + "loss": 0.7652, + "step": 31346 + }, + { + "epoch": 0.8049030605612714, + "grad_norm": 0.6953125, + "learning_rate": 0.00011103752478629903, + "loss": 0.7956, + "step": 31347 + }, + { + "epoch": 0.8049287377571932, + "grad_norm": 0.85546875, + "learning_rate": 0.0001110330878499485, + "loss": 0.8106, + "step": 31348 + }, + { + "epoch": 0.804954414953115, + "grad_norm": 0.74609375, + "learning_rate": 0.00011102865089160999, + "loss": 0.8696, + "step": 31349 + }, + { + "epoch": 0.8049800921490369, + "grad_norm": 0.75390625, + "learning_rate": 0.00011102421391129238, + "loss": 0.6691, + "step": 31350 + }, + { + "epoch": 0.8050057693449587, + "grad_norm": 0.74609375, + "learning_rate": 0.00011101977690900454, + "loss": 0.6807, + "step": 31351 + }, + { + "epoch": 0.8050314465408805, + "grad_norm": 0.734375, + "learning_rate": 0.00011101533988475525, + "loss": 0.6452, + "step": 31352 + }, + { + "epoch": 0.8050571237368023, + "grad_norm": 0.8125, + "learning_rate": 0.0001110109028385534, + "loss": 0.7139, + "step": 31353 + }, + { + "epoch": 0.8050828009327241, + "grad_norm": 0.69921875, + "learning_rate": 0.00011100646577040778, + "loss": 0.7015, + "step": 31354 + }, + { + "epoch": 0.8051084781286459, + "grad_norm": 0.76953125, + "learning_rate": 0.0001110020286803273, + "loss": 0.8149, + "step": 31355 + }, + { + "epoch": 0.8051341553245678, + "grad_norm": 0.91015625, + "learning_rate": 0.00011099759156832077, + "loss": 0.9466, + "step": 31356 + }, + { + "epoch": 0.8051598325204896, + "grad_norm": 0.77734375, + "learning_rate": 0.00011099315443439702, + "loss": 0.8751, + "step": 31357 + }, + { + "epoch": 0.8051855097164115, + "grad_norm": 0.859375, + "learning_rate": 0.0001109887172785649, + "loss": 0.8818, + "step": 31358 + }, + { + "epoch": 0.8052111869123333, + "grad_norm": 0.9453125, + "learning_rate": 0.00011098428010083331, + "loss": 0.8399, + "step": 31359 + }, + { + "epoch": 0.805236864108255, + "grad_norm": 0.84765625, + "learning_rate": 0.000110979842901211, + "loss": 0.7977, + "step": 31360 + }, + { + "epoch": 0.8052625413041768, + "grad_norm": 0.82421875, + "learning_rate": 0.00011097540567970686, + "loss": 0.8941, + "step": 31361 + }, + { + "epoch": 0.8052882185000987, + "grad_norm": 0.74609375, + "learning_rate": 0.00011097096843632972, + "loss": 0.7019, + "step": 31362 + }, + { + "epoch": 0.8053138956960205, + "grad_norm": 0.75, + "learning_rate": 0.00011096653117108842, + "loss": 0.879, + "step": 31363 + }, + { + "epoch": 0.8053395728919424, + "grad_norm": 0.83984375, + "learning_rate": 0.00011096209388399183, + "loss": 0.8582, + "step": 31364 + }, + { + "epoch": 0.8053652500878642, + "grad_norm": 0.74609375, + "learning_rate": 0.0001109576565750488, + "loss": 0.7599, + "step": 31365 + }, + { + "epoch": 0.805390927283786, + "grad_norm": 0.80078125, + "learning_rate": 0.00011095321924426813, + "loss": 0.8626, + "step": 31366 + }, + { + "epoch": 0.8054166044797078, + "grad_norm": 0.83984375, + "learning_rate": 0.00011094878189165869, + "loss": 0.739, + "step": 31367 + }, + { + "epoch": 0.8054422816756296, + "grad_norm": 0.7421875, + "learning_rate": 0.0001109443445172293, + "loss": 0.7978, + "step": 31368 + }, + { + "epoch": 0.8054679588715514, + "grad_norm": 0.70703125, + "learning_rate": 0.00011093990712098883, + "loss": 0.8174, + "step": 31369 + }, + { + "epoch": 0.8054936360674733, + "grad_norm": 1.1640625, + "learning_rate": 0.0001109354697029461, + "loss": 0.7884, + "step": 31370 + }, + { + "epoch": 0.8055193132633951, + "grad_norm": 0.765625, + "learning_rate": 0.00011093103226310997, + "loss": 0.8312, + "step": 31371 + }, + { + "epoch": 0.8055449904593169, + "grad_norm": 0.796875, + "learning_rate": 0.00011092659480148933, + "loss": 0.745, + "step": 31372 + }, + { + "epoch": 0.8055706676552387, + "grad_norm": 0.875, + "learning_rate": 0.0001109221573180929, + "loss": 0.8429, + "step": 31373 + }, + { + "epoch": 0.8055963448511605, + "grad_norm": 0.78515625, + "learning_rate": 0.00011091771981292965, + "loss": 0.7821, + "step": 31374 + }, + { + "epoch": 0.8056220220470823, + "grad_norm": 0.734375, + "learning_rate": 0.00011091328228600834, + "loss": 0.8308, + "step": 31375 + }, + { + "epoch": 0.8056476992430042, + "grad_norm": 0.7890625, + "learning_rate": 0.00011090884473733785, + "loss": 0.8827, + "step": 31376 + }, + { + "epoch": 0.805673376438926, + "grad_norm": 0.765625, + "learning_rate": 0.00011090440716692703, + "loss": 0.6211, + "step": 31377 + }, + { + "epoch": 0.8056990536348478, + "grad_norm": 0.75, + "learning_rate": 0.0001108999695747847, + "loss": 0.7863, + "step": 31378 + }, + { + "epoch": 0.8057247308307697, + "grad_norm": 0.8046875, + "learning_rate": 0.00011089553196091973, + "loss": 0.8407, + "step": 31379 + }, + { + "epoch": 0.8057504080266914, + "grad_norm": 0.859375, + "learning_rate": 0.00011089109432534095, + "loss": 0.9605, + "step": 31380 + }, + { + "epoch": 0.8057760852226132, + "grad_norm": 0.765625, + "learning_rate": 0.00011088665666805718, + "loss": 0.67, + "step": 31381 + }, + { + "epoch": 0.8058017624185351, + "grad_norm": 0.8984375, + "learning_rate": 0.00011088221898907729, + "loss": 0.8443, + "step": 31382 + }, + { + "epoch": 0.8058274396144569, + "grad_norm": 0.8203125, + "learning_rate": 0.00011087778128841014, + "loss": 0.7968, + "step": 31383 + }, + { + "epoch": 0.8058531168103787, + "grad_norm": 0.85546875, + "learning_rate": 0.00011087334356606451, + "loss": 0.9463, + "step": 31384 + }, + { + "epoch": 0.8058787940063006, + "grad_norm": 0.84375, + "learning_rate": 0.00011086890582204936, + "loss": 0.8202, + "step": 31385 + }, + { + "epoch": 0.8059044712022224, + "grad_norm": 0.7578125, + "learning_rate": 0.0001108644680563734, + "loss": 0.765, + "step": 31386 + }, + { + "epoch": 0.8059301483981441, + "grad_norm": 0.7734375, + "learning_rate": 0.00011086003026904556, + "loss": 0.7907, + "step": 31387 + }, + { + "epoch": 0.805955825594066, + "grad_norm": 0.79296875, + "learning_rate": 0.00011085559246007469, + "loss": 0.8128, + "step": 31388 + }, + { + "epoch": 0.8059815027899878, + "grad_norm": 0.79296875, + "learning_rate": 0.00011085115462946956, + "loss": 0.812, + "step": 31389 + }, + { + "epoch": 0.8060071799859096, + "grad_norm": 0.73046875, + "learning_rate": 0.00011084671677723907, + "loss": 0.8034, + "step": 31390 + }, + { + "epoch": 0.8060328571818315, + "grad_norm": 0.77734375, + "learning_rate": 0.00011084227890339207, + "loss": 0.8924, + "step": 31391 + }, + { + "epoch": 0.8060585343777533, + "grad_norm": 0.92578125, + "learning_rate": 0.00011083784100793737, + "loss": 0.8629, + "step": 31392 + }, + { + "epoch": 0.806084211573675, + "grad_norm": 0.8046875, + "learning_rate": 0.00011083340309088386, + "loss": 0.9321, + "step": 31393 + }, + { + "epoch": 0.8061098887695969, + "grad_norm": 0.7109375, + "learning_rate": 0.0001108289651522403, + "loss": 0.7368, + "step": 31394 + }, + { + "epoch": 0.8061355659655187, + "grad_norm": 0.74609375, + "learning_rate": 0.00011082452719201564, + "loss": 0.8032, + "step": 31395 + }, + { + "epoch": 0.8061612431614406, + "grad_norm": 0.8125, + "learning_rate": 0.00011082008921021867, + "loss": 0.9547, + "step": 31396 + }, + { + "epoch": 0.8061869203573624, + "grad_norm": 0.80078125, + "learning_rate": 0.0001108156512068582, + "loss": 0.8236, + "step": 31397 + }, + { + "epoch": 0.8062125975532842, + "grad_norm": 0.7578125, + "learning_rate": 0.00011081121318194316, + "loss": 0.8944, + "step": 31398 + }, + { + "epoch": 0.8062382747492061, + "grad_norm": 0.76953125, + "learning_rate": 0.00011080677513548234, + "loss": 0.7723, + "step": 31399 + }, + { + "epoch": 0.8062639519451278, + "grad_norm": 0.81640625, + "learning_rate": 0.00011080233706748457, + "loss": 0.8076, + "step": 31400 + }, + { + "epoch": 0.8062896291410496, + "grad_norm": 0.76953125, + "learning_rate": 0.00011079789897795874, + "loss": 0.7016, + "step": 31401 + }, + { + "epoch": 0.8063153063369715, + "grad_norm": 1.0078125, + "learning_rate": 0.00011079346086691365, + "loss": 0.8431, + "step": 31402 + }, + { + "epoch": 0.8063409835328933, + "grad_norm": 0.77734375, + "learning_rate": 0.0001107890227343582, + "loss": 0.9705, + "step": 31403 + }, + { + "epoch": 0.8063666607288151, + "grad_norm": 0.74609375, + "learning_rate": 0.00011078458458030119, + "loss": 0.8807, + "step": 31404 + }, + { + "epoch": 0.806392337924737, + "grad_norm": 0.75, + "learning_rate": 0.00011078014640475147, + "loss": 0.7907, + "step": 31405 + }, + { + "epoch": 0.8064180151206587, + "grad_norm": 0.84375, + "learning_rate": 0.0001107757082077179, + "loss": 0.8744, + "step": 31406 + }, + { + "epoch": 0.8064436923165805, + "grad_norm": 0.81640625, + "learning_rate": 0.00011077126998920932, + "loss": 0.7846, + "step": 31407 + }, + { + "epoch": 0.8064693695125024, + "grad_norm": 1.0078125, + "learning_rate": 0.00011076683174923454, + "loss": 0.7975, + "step": 31408 + }, + { + "epoch": 0.8064950467084242, + "grad_norm": 0.78515625, + "learning_rate": 0.00011076239348780246, + "loss": 0.6583, + "step": 31409 + }, + { + "epoch": 0.806520723904346, + "grad_norm": 0.7578125, + "learning_rate": 0.00011075795520492191, + "loss": 0.7916, + "step": 31410 + }, + { + "epoch": 0.8065464011002679, + "grad_norm": 0.87109375, + "learning_rate": 0.00011075351690060174, + "loss": 0.7819, + "step": 31411 + }, + { + "epoch": 0.8065720782961897, + "grad_norm": 0.76171875, + "learning_rate": 0.00011074907857485075, + "loss": 0.8713, + "step": 31412 + }, + { + "epoch": 0.8065977554921114, + "grad_norm": 0.81640625, + "learning_rate": 0.00011074464022767783, + "loss": 0.7388, + "step": 31413 + }, + { + "epoch": 0.8066234326880333, + "grad_norm": 0.75, + "learning_rate": 0.00011074020185909182, + "loss": 0.7469, + "step": 31414 + }, + { + "epoch": 0.8066491098839551, + "grad_norm": 0.7890625, + "learning_rate": 0.00011073576346910155, + "loss": 0.8334, + "step": 31415 + }, + { + "epoch": 0.8066747870798769, + "grad_norm": 0.80078125, + "learning_rate": 0.00011073132505771585, + "loss": 0.711, + "step": 31416 + }, + { + "epoch": 0.8067004642757988, + "grad_norm": 0.8046875, + "learning_rate": 0.00011072688662494361, + "loss": 0.8387, + "step": 31417 + }, + { + "epoch": 0.8067261414717206, + "grad_norm": 0.80078125, + "learning_rate": 0.00011072244817079366, + "loss": 0.8155, + "step": 31418 + }, + { + "epoch": 0.8067518186676425, + "grad_norm": 0.796875, + "learning_rate": 0.00011071800969527484, + "loss": 0.833, + "step": 31419 + }, + { + "epoch": 0.8067774958635642, + "grad_norm": 0.67578125, + "learning_rate": 0.000110713571198396, + "loss": 0.6818, + "step": 31420 + }, + { + "epoch": 0.806803173059486, + "grad_norm": 0.7578125, + "learning_rate": 0.00011070913268016597, + "loss": 0.8318, + "step": 31421 + }, + { + "epoch": 0.8068288502554078, + "grad_norm": 0.93359375, + "learning_rate": 0.00011070469414059359, + "loss": 0.7978, + "step": 31422 + }, + { + "epoch": 0.8068545274513297, + "grad_norm": 0.83203125, + "learning_rate": 0.00011070025557968775, + "loss": 0.7421, + "step": 31423 + }, + { + "epoch": 0.8068802046472515, + "grad_norm": 0.8359375, + "learning_rate": 0.00011069581699745723, + "loss": 0.7881, + "step": 31424 + }, + { + "epoch": 0.8069058818431734, + "grad_norm": 0.734375, + "learning_rate": 0.00011069137839391097, + "loss": 0.8244, + "step": 31425 + }, + { + "epoch": 0.8069315590390951, + "grad_norm": 0.83984375, + "learning_rate": 0.0001106869397690577, + "loss": 0.7626, + "step": 31426 + }, + { + "epoch": 0.8069572362350169, + "grad_norm": 0.87109375, + "learning_rate": 0.00011068250112290636, + "loss": 0.7689, + "step": 31427 + }, + { + "epoch": 0.8069829134309388, + "grad_norm": 0.9453125, + "learning_rate": 0.00011067806245546575, + "loss": 0.7877, + "step": 31428 + }, + { + "epoch": 0.8070085906268606, + "grad_norm": 0.94921875, + "learning_rate": 0.00011067362376674473, + "loss": 0.6957, + "step": 31429 + }, + { + "epoch": 0.8070342678227824, + "grad_norm": 0.7734375, + "learning_rate": 0.00011066918505675213, + "loss": 0.7735, + "step": 31430 + }, + { + "epoch": 0.8070599450187043, + "grad_norm": 0.75, + "learning_rate": 0.00011066474632549681, + "loss": 0.6936, + "step": 31431 + }, + { + "epoch": 0.8070856222146261, + "grad_norm": 0.7890625, + "learning_rate": 0.00011066030757298764, + "loss": 0.8661, + "step": 31432 + }, + { + "epoch": 0.8071112994105478, + "grad_norm": 0.75390625, + "learning_rate": 0.00011065586879923342, + "loss": 0.8981, + "step": 31433 + }, + { + "epoch": 0.8071369766064697, + "grad_norm": 0.7109375, + "learning_rate": 0.00011065143000424301, + "loss": 0.6864, + "step": 31434 + }, + { + "epoch": 0.8071626538023915, + "grad_norm": 0.765625, + "learning_rate": 0.00011064699118802526, + "loss": 0.8917, + "step": 31435 + }, + { + "epoch": 0.8071883309983133, + "grad_norm": 0.79296875, + "learning_rate": 0.00011064255235058904, + "loss": 0.9199, + "step": 31436 + }, + { + "epoch": 0.8072140081942352, + "grad_norm": 0.76171875, + "learning_rate": 0.00011063811349194313, + "loss": 0.7242, + "step": 31437 + }, + { + "epoch": 0.807239685390157, + "grad_norm": 0.8359375, + "learning_rate": 0.00011063367461209649, + "loss": 0.8392, + "step": 31438 + }, + { + "epoch": 0.8072653625860788, + "grad_norm": 0.7734375, + "learning_rate": 0.00011062923571105784, + "loss": 0.8025, + "step": 31439 + }, + { + "epoch": 0.8072910397820006, + "grad_norm": 0.8359375, + "learning_rate": 0.0001106247967888361, + "loss": 0.7499, + "step": 31440 + }, + { + "epoch": 0.8073167169779224, + "grad_norm": 0.82421875, + "learning_rate": 0.00011062035784544011, + "loss": 0.816, + "step": 31441 + }, + { + "epoch": 0.8073423941738442, + "grad_norm": 0.82421875, + "learning_rate": 0.00011061591888087868, + "loss": 0.8885, + "step": 31442 + }, + { + "epoch": 0.8073680713697661, + "grad_norm": 0.8359375, + "learning_rate": 0.00011061147989516072, + "loss": 0.8326, + "step": 31443 + }, + { + "epoch": 0.8073937485656879, + "grad_norm": 0.77734375, + "learning_rate": 0.00011060704088829504, + "loss": 0.865, + "step": 31444 + }, + { + "epoch": 0.8074194257616097, + "grad_norm": 0.7890625, + "learning_rate": 0.00011060260186029046, + "loss": 0.8698, + "step": 31445 + }, + { + "epoch": 0.8074451029575315, + "grad_norm": 0.8828125, + "learning_rate": 0.00011059816281115585, + "loss": 0.9424, + "step": 31446 + }, + { + "epoch": 0.8074707801534533, + "grad_norm": 0.796875, + "learning_rate": 0.00011059372374090007, + "loss": 0.9428, + "step": 31447 + }, + { + "epoch": 0.8074964573493751, + "grad_norm": 0.73828125, + "learning_rate": 0.00011058928464953193, + "loss": 0.7956, + "step": 31448 + }, + { + "epoch": 0.807522134545297, + "grad_norm": 0.82421875, + "learning_rate": 0.00011058484553706034, + "loss": 0.8346, + "step": 31449 + }, + { + "epoch": 0.8075478117412188, + "grad_norm": 0.734375, + "learning_rate": 0.00011058040640349407, + "loss": 0.7544, + "step": 31450 + }, + { + "epoch": 0.8075734889371406, + "grad_norm": 1.0546875, + "learning_rate": 0.00011057596724884205, + "loss": 0.8232, + "step": 31451 + }, + { + "epoch": 0.8075991661330625, + "grad_norm": 0.85546875, + "learning_rate": 0.00011057152807311308, + "loss": 0.8284, + "step": 31452 + }, + { + "epoch": 0.8076248433289842, + "grad_norm": 0.75, + "learning_rate": 0.00011056708887631598, + "loss": 0.8684, + "step": 31453 + }, + { + "epoch": 0.807650520524906, + "grad_norm": 0.8203125, + "learning_rate": 0.00011056264965845964, + "loss": 0.7999, + "step": 31454 + }, + { + "epoch": 0.8076761977208279, + "grad_norm": 0.8203125, + "learning_rate": 0.00011055821041955288, + "loss": 0.8277, + "step": 31455 + }, + { + "epoch": 0.8077018749167497, + "grad_norm": 0.76953125, + "learning_rate": 0.00011055377115960457, + "loss": 0.7168, + "step": 31456 + }, + { + "epoch": 0.8077275521126716, + "grad_norm": 0.74609375, + "learning_rate": 0.00011054933187862356, + "loss": 0.7089, + "step": 31457 + }, + { + "epoch": 0.8077532293085934, + "grad_norm": 0.76953125, + "learning_rate": 0.00011054489257661866, + "loss": 0.7964, + "step": 31458 + }, + { + "epoch": 0.8077789065045152, + "grad_norm": 0.7734375, + "learning_rate": 0.00011054045325359877, + "loss": 0.7783, + "step": 31459 + }, + { + "epoch": 0.807804583700437, + "grad_norm": 0.7578125, + "learning_rate": 0.00011053601390957269, + "loss": 0.8122, + "step": 31460 + }, + { + "epoch": 0.8078302608963588, + "grad_norm": 0.75390625, + "learning_rate": 0.00011053157454454928, + "loss": 0.8252, + "step": 31461 + }, + { + "epoch": 0.8078559380922806, + "grad_norm": 0.7421875, + "learning_rate": 0.0001105271351585374, + "loss": 0.8023, + "step": 31462 + }, + { + "epoch": 0.8078816152882025, + "grad_norm": 0.77734375, + "learning_rate": 0.0001105226957515459, + "loss": 0.984, + "step": 31463 + }, + { + "epoch": 0.8079072924841243, + "grad_norm": 0.80078125, + "learning_rate": 0.00011051825632358362, + "loss": 0.8822, + "step": 31464 + }, + { + "epoch": 0.8079329696800461, + "grad_norm": 0.76171875, + "learning_rate": 0.0001105138168746594, + "loss": 0.7149, + "step": 31465 + }, + { + "epoch": 0.8079586468759679, + "grad_norm": 0.765625, + "learning_rate": 0.00011050937740478206, + "loss": 0.8233, + "step": 31466 + }, + { + "epoch": 0.8079843240718897, + "grad_norm": 0.71875, + "learning_rate": 0.00011050493791396052, + "loss": 0.7176, + "step": 31467 + }, + { + "epoch": 0.8080100012678115, + "grad_norm": 0.7734375, + "learning_rate": 0.00011050049840220358, + "loss": 0.8695, + "step": 31468 + }, + { + "epoch": 0.8080356784637334, + "grad_norm": 0.80859375, + "learning_rate": 0.00011049605886952008, + "loss": 0.7712, + "step": 31469 + }, + { + "epoch": 0.8080613556596552, + "grad_norm": 0.859375, + "learning_rate": 0.0001104916193159189, + "loss": 0.8798, + "step": 31470 + }, + { + "epoch": 0.808087032855577, + "grad_norm": 0.703125, + "learning_rate": 0.00011048717974140888, + "loss": 0.7617, + "step": 31471 + }, + { + "epoch": 0.8081127100514989, + "grad_norm": 0.80078125, + "learning_rate": 0.00011048274014599884, + "loss": 0.7824, + "step": 31472 + }, + { + "epoch": 0.8081383872474206, + "grad_norm": 0.765625, + "learning_rate": 0.00011047830052969765, + "loss": 0.8338, + "step": 31473 + }, + { + "epoch": 0.8081640644433424, + "grad_norm": 0.7109375, + "learning_rate": 0.00011047386089251415, + "loss": 0.788, + "step": 31474 + }, + { + "epoch": 0.8081897416392643, + "grad_norm": 0.74609375, + "learning_rate": 0.00011046942123445718, + "loss": 0.8774, + "step": 31475 + }, + { + "epoch": 0.8082154188351861, + "grad_norm": 0.77734375, + "learning_rate": 0.00011046498155553561, + "loss": 0.7601, + "step": 31476 + }, + { + "epoch": 0.8082410960311079, + "grad_norm": 0.7734375, + "learning_rate": 0.0001104605418557583, + "loss": 0.7286, + "step": 31477 + }, + { + "epoch": 0.8082667732270298, + "grad_norm": 0.79296875, + "learning_rate": 0.00011045610213513406, + "loss": 0.8313, + "step": 31478 + }, + { + "epoch": 0.8082924504229516, + "grad_norm": 0.73828125, + "learning_rate": 0.00011045166239367172, + "loss": 0.8219, + "step": 31479 + }, + { + "epoch": 0.8083181276188733, + "grad_norm": 0.80078125, + "learning_rate": 0.0001104472226313802, + "loss": 0.7539, + "step": 31480 + }, + { + "epoch": 0.8083438048147952, + "grad_norm": 0.78515625, + "learning_rate": 0.00011044278284826832, + "loss": 0.8272, + "step": 31481 + }, + { + "epoch": 0.808369482010717, + "grad_norm": 0.84375, + "learning_rate": 0.00011043834304434486, + "loss": 0.8546, + "step": 31482 + }, + { + "epoch": 0.8083951592066388, + "grad_norm": 0.78125, + "learning_rate": 0.00011043390321961877, + "loss": 0.7921, + "step": 31483 + }, + { + "epoch": 0.8084208364025607, + "grad_norm": 0.85546875, + "learning_rate": 0.00011042946337409888, + "loss": 0.798, + "step": 31484 + }, + { + "epoch": 0.8084465135984825, + "grad_norm": 0.82421875, + "learning_rate": 0.00011042502350779395, + "loss": 0.8221, + "step": 31485 + }, + { + "epoch": 0.8084721907944042, + "grad_norm": 0.76953125, + "learning_rate": 0.00011042058362071293, + "loss": 0.7424, + "step": 31486 + }, + { + "epoch": 0.8084978679903261, + "grad_norm": 0.765625, + "learning_rate": 0.00011041614371286461, + "loss": 0.8626, + "step": 31487 + }, + { + "epoch": 0.8085235451862479, + "grad_norm": 0.84765625, + "learning_rate": 0.00011041170378425785, + "loss": 0.8361, + "step": 31488 + }, + { + "epoch": 0.8085492223821698, + "grad_norm": 0.7578125, + "learning_rate": 0.00011040726383490152, + "loss": 0.6061, + "step": 31489 + }, + { + "epoch": 0.8085748995780916, + "grad_norm": 0.7578125, + "learning_rate": 0.00011040282386480445, + "loss": 0.8542, + "step": 31490 + }, + { + "epoch": 0.8086005767740134, + "grad_norm": 0.8515625, + "learning_rate": 0.0001103983838739755, + "loss": 0.8117, + "step": 31491 + }, + { + "epoch": 0.8086262539699353, + "grad_norm": 0.8125, + "learning_rate": 0.00011039394386242349, + "loss": 0.7701, + "step": 31492 + }, + { + "epoch": 0.808651931165857, + "grad_norm": 0.8203125, + "learning_rate": 0.0001103895038301573, + "loss": 0.9789, + "step": 31493 + }, + { + "epoch": 0.8086776083617788, + "grad_norm": 0.80078125, + "learning_rate": 0.00011038506377718576, + "loss": 0.6676, + "step": 31494 + }, + { + "epoch": 0.8087032855577007, + "grad_norm": 0.7265625, + "learning_rate": 0.00011038062370351774, + "loss": 0.7555, + "step": 31495 + }, + { + "epoch": 0.8087289627536225, + "grad_norm": 0.8203125, + "learning_rate": 0.00011037618360916204, + "loss": 0.8823, + "step": 31496 + }, + { + "epoch": 0.8087546399495443, + "grad_norm": 0.73046875, + "learning_rate": 0.0001103717434941276, + "loss": 0.8067, + "step": 31497 + }, + { + "epoch": 0.8087803171454662, + "grad_norm": 0.78515625, + "learning_rate": 0.00011036730335842316, + "loss": 0.7675, + "step": 31498 + }, + { + "epoch": 0.808805994341388, + "grad_norm": 0.78515625, + "learning_rate": 0.00011036286320205767, + "loss": 0.9288, + "step": 31499 + }, + { + "epoch": 0.8088316715373097, + "grad_norm": 0.80859375, + "learning_rate": 0.00011035842302503989, + "loss": 0.8282, + "step": 31500 + }, + { + "epoch": 0.8088573487332316, + "grad_norm": 0.7734375, + "learning_rate": 0.0001103539828273787, + "loss": 0.8096, + "step": 31501 + }, + { + "epoch": 0.8088830259291534, + "grad_norm": 0.7265625, + "learning_rate": 0.00011034954260908299, + "loss": 0.7423, + "step": 31502 + }, + { + "epoch": 0.8089087031250752, + "grad_norm": 0.77734375, + "learning_rate": 0.00011034510237016158, + "loss": 0.8499, + "step": 31503 + }, + { + "epoch": 0.8089343803209971, + "grad_norm": 0.7421875, + "learning_rate": 0.0001103406621106233, + "loss": 0.8016, + "step": 31504 + }, + { + "epoch": 0.8089600575169189, + "grad_norm": 0.79296875, + "learning_rate": 0.00011033622183047702, + "loss": 0.9324, + "step": 31505 + }, + { + "epoch": 0.8089857347128406, + "grad_norm": 0.75, + "learning_rate": 0.00011033178152973155, + "loss": 0.7452, + "step": 31506 + }, + { + "epoch": 0.8090114119087625, + "grad_norm": 0.86328125, + "learning_rate": 0.00011032734120839581, + "loss": 0.7736, + "step": 31507 + }, + { + "epoch": 0.8090370891046843, + "grad_norm": 0.8203125, + "learning_rate": 0.0001103229008664786, + "loss": 0.7573, + "step": 31508 + }, + { + "epoch": 0.8090627663006061, + "grad_norm": 0.828125, + "learning_rate": 0.00011031846050398878, + "loss": 0.8094, + "step": 31509 + }, + { + "epoch": 0.809088443496528, + "grad_norm": 0.7734375, + "learning_rate": 0.00011031402012093523, + "loss": 0.7216, + "step": 31510 + }, + { + "epoch": 0.8091141206924498, + "grad_norm": 0.765625, + "learning_rate": 0.00011030957971732673, + "loss": 0.762, + "step": 31511 + }, + { + "epoch": 0.8091397978883716, + "grad_norm": 0.89453125, + "learning_rate": 0.00011030513929317219, + "loss": 0.8943, + "step": 31512 + }, + { + "epoch": 0.8091654750842934, + "grad_norm": 0.8125, + "learning_rate": 0.00011030069884848044, + "loss": 0.8957, + "step": 31513 + }, + { + "epoch": 0.8091911522802152, + "grad_norm": 0.7421875, + "learning_rate": 0.00011029625838326029, + "loss": 0.773, + "step": 31514 + }, + { + "epoch": 0.809216829476137, + "grad_norm": 0.6875, + "learning_rate": 0.00011029181789752066, + "loss": 0.7971, + "step": 31515 + }, + { + "epoch": 0.8092425066720589, + "grad_norm": 0.7890625, + "learning_rate": 0.00011028737739127037, + "loss": 0.7477, + "step": 31516 + }, + { + "epoch": 0.8092681838679807, + "grad_norm": 0.76953125, + "learning_rate": 0.00011028293686451827, + "loss": 0.8419, + "step": 31517 + }, + { + "epoch": 0.8092938610639026, + "grad_norm": 0.7578125, + "learning_rate": 0.0001102784963172732, + "loss": 0.7246, + "step": 31518 + }, + { + "epoch": 0.8093195382598244, + "grad_norm": 0.703125, + "learning_rate": 0.00011027405574954399, + "loss": 0.7362, + "step": 31519 + }, + { + "epoch": 0.8093452154557461, + "grad_norm": 0.76171875, + "learning_rate": 0.00011026961516133954, + "loss": 0.8066, + "step": 31520 + }, + { + "epoch": 0.809370892651668, + "grad_norm": 0.77734375, + "learning_rate": 0.00011026517455266867, + "loss": 0.8439, + "step": 31521 + }, + { + "epoch": 0.8093965698475898, + "grad_norm": 0.74609375, + "learning_rate": 0.00011026073392354022, + "loss": 0.6982, + "step": 31522 + }, + { + "epoch": 0.8094222470435116, + "grad_norm": 0.76953125, + "learning_rate": 0.0001102562932739631, + "loss": 0.7759, + "step": 31523 + }, + { + "epoch": 0.8094479242394335, + "grad_norm": 0.75, + "learning_rate": 0.00011025185260394609, + "loss": 0.8217, + "step": 31524 + }, + { + "epoch": 0.8094736014353553, + "grad_norm": 0.9453125, + "learning_rate": 0.00011024741191349804, + "loss": 0.8722, + "step": 31525 + }, + { + "epoch": 0.809499278631277, + "grad_norm": 0.78515625, + "learning_rate": 0.00011024297120262783, + "loss": 0.9497, + "step": 31526 + }, + { + "epoch": 0.8095249558271989, + "grad_norm": 0.83203125, + "learning_rate": 0.0001102385304713443, + "loss": 0.8798, + "step": 31527 + }, + { + "epoch": 0.8095506330231207, + "grad_norm": 0.73046875, + "learning_rate": 0.00011023408971965632, + "loss": 0.7925, + "step": 31528 + }, + { + "epoch": 0.8095763102190425, + "grad_norm": 0.73046875, + "learning_rate": 0.00011022964894757273, + "loss": 0.7512, + "step": 31529 + }, + { + "epoch": 0.8096019874149644, + "grad_norm": 0.7578125, + "learning_rate": 0.00011022520815510235, + "loss": 0.8125, + "step": 31530 + }, + { + "epoch": 0.8096276646108862, + "grad_norm": 0.89453125, + "learning_rate": 0.00011022076734225408, + "loss": 0.7428, + "step": 31531 + }, + { + "epoch": 0.809653341806808, + "grad_norm": 0.7890625, + "learning_rate": 0.00011021632650903674, + "loss": 0.9154, + "step": 31532 + }, + { + "epoch": 0.8096790190027298, + "grad_norm": 0.67578125, + "learning_rate": 0.00011021188565545917, + "loss": 0.8474, + "step": 31533 + }, + { + "epoch": 0.8097046961986516, + "grad_norm": 0.703125, + "learning_rate": 0.00011020744478153025, + "loss": 0.706, + "step": 31534 + }, + { + "epoch": 0.8097303733945734, + "grad_norm": 0.8125, + "learning_rate": 0.00011020300388725879, + "loss": 0.8293, + "step": 31535 + }, + { + "epoch": 0.8097560505904953, + "grad_norm": 0.8046875, + "learning_rate": 0.0001101985629726537, + "loss": 0.9983, + "step": 31536 + }, + { + "epoch": 0.8097817277864171, + "grad_norm": 0.76953125, + "learning_rate": 0.00011019412203772377, + "loss": 0.8602, + "step": 31537 + }, + { + "epoch": 0.8098074049823389, + "grad_norm": 0.703125, + "learning_rate": 0.00011018968108247786, + "loss": 0.6723, + "step": 31538 + }, + { + "epoch": 0.8098330821782608, + "grad_norm": 0.8984375, + "learning_rate": 0.00011018524010692486, + "loss": 0.739, + "step": 31539 + }, + { + "epoch": 0.8098587593741825, + "grad_norm": 0.8046875, + "learning_rate": 0.0001101807991110736, + "loss": 0.9388, + "step": 31540 + }, + { + "epoch": 0.8098844365701043, + "grad_norm": 0.78125, + "learning_rate": 0.0001101763580949329, + "loss": 0.6947, + "step": 31541 + }, + { + "epoch": 0.8099101137660262, + "grad_norm": 0.73046875, + "learning_rate": 0.00011017191705851166, + "loss": 0.9208, + "step": 31542 + }, + { + "epoch": 0.809935790961948, + "grad_norm": 0.68359375, + "learning_rate": 0.00011016747600181871, + "loss": 0.7196, + "step": 31543 + }, + { + "epoch": 0.8099614681578698, + "grad_norm": 0.7265625, + "learning_rate": 0.00011016303492486291, + "loss": 0.8306, + "step": 31544 + }, + { + "epoch": 0.8099871453537917, + "grad_norm": 0.78125, + "learning_rate": 0.00011015859382765308, + "loss": 0.8707, + "step": 31545 + }, + { + "epoch": 0.8100128225497134, + "grad_norm": 0.83203125, + "learning_rate": 0.00011015415271019809, + "loss": 0.82, + "step": 31546 + }, + { + "epoch": 0.8100384997456352, + "grad_norm": 0.81640625, + "learning_rate": 0.0001101497115725068, + "loss": 0.9097, + "step": 31547 + }, + { + "epoch": 0.8100641769415571, + "grad_norm": 0.8046875, + "learning_rate": 0.00011014527041458804, + "loss": 0.7968, + "step": 31548 + }, + { + "epoch": 0.8100898541374789, + "grad_norm": 0.75390625, + "learning_rate": 0.00011014082923645066, + "loss": 0.7837, + "step": 31549 + }, + { + "epoch": 0.8101155313334008, + "grad_norm": 0.9375, + "learning_rate": 0.00011013638803810358, + "loss": 0.8065, + "step": 31550 + }, + { + "epoch": 0.8101412085293226, + "grad_norm": 0.75390625, + "learning_rate": 0.00011013194681955552, + "loss": 0.7722, + "step": 31551 + }, + { + "epoch": 0.8101668857252444, + "grad_norm": 0.7734375, + "learning_rate": 0.00011012750558081546, + "loss": 0.8538, + "step": 31552 + }, + { + "epoch": 0.8101925629211661, + "grad_norm": 0.8125, + "learning_rate": 0.00011012306432189219, + "loss": 0.7767, + "step": 31553 + }, + { + "epoch": 0.810218240117088, + "grad_norm": 0.828125, + "learning_rate": 0.00011011862304279453, + "loss": 0.9028, + "step": 31554 + }, + { + "epoch": 0.8102439173130098, + "grad_norm": 0.73046875, + "learning_rate": 0.00011011418174353138, + "loss": 0.7894, + "step": 31555 + }, + { + "epoch": 0.8102695945089317, + "grad_norm": 0.82421875, + "learning_rate": 0.00011010974042411162, + "loss": 1.0, + "step": 31556 + }, + { + "epoch": 0.8102952717048535, + "grad_norm": 0.72265625, + "learning_rate": 0.00011010529908454403, + "loss": 0.756, + "step": 31557 + }, + { + "epoch": 0.8103209489007753, + "grad_norm": 0.82421875, + "learning_rate": 0.00011010085772483749, + "loss": 0.8854, + "step": 31558 + }, + { + "epoch": 0.8103466260966972, + "grad_norm": 0.8125, + "learning_rate": 0.00011009641634500086, + "loss": 0.7589, + "step": 31559 + }, + { + "epoch": 0.8103723032926189, + "grad_norm": 0.796875, + "learning_rate": 0.00011009197494504298, + "loss": 0.8991, + "step": 31560 + }, + { + "epoch": 0.8103979804885407, + "grad_norm": 0.7109375, + "learning_rate": 0.00011008753352497271, + "loss": 0.766, + "step": 31561 + }, + { + "epoch": 0.8104236576844626, + "grad_norm": 0.71875, + "learning_rate": 0.00011008309208479888, + "loss": 0.8683, + "step": 31562 + }, + { + "epoch": 0.8104493348803844, + "grad_norm": 0.828125, + "learning_rate": 0.0001100786506245304, + "loss": 0.8657, + "step": 31563 + }, + { + "epoch": 0.8104750120763062, + "grad_norm": 0.80078125, + "learning_rate": 0.00011007420914417603, + "loss": 0.7814, + "step": 31564 + }, + { + "epoch": 0.8105006892722281, + "grad_norm": 0.73046875, + "learning_rate": 0.0001100697676437447, + "loss": 0.8169, + "step": 31565 + }, + { + "epoch": 0.8105263664681498, + "grad_norm": 0.83984375, + "learning_rate": 0.00011006532612324523, + "loss": 0.7773, + "step": 31566 + }, + { + "epoch": 0.8105520436640716, + "grad_norm": 0.87890625, + "learning_rate": 0.00011006088458268646, + "loss": 0.7355, + "step": 31567 + }, + { + "epoch": 0.8105777208599935, + "grad_norm": 0.81640625, + "learning_rate": 0.00011005644302207727, + "loss": 0.9944, + "step": 31568 + }, + { + "epoch": 0.8106033980559153, + "grad_norm": 0.8359375, + "learning_rate": 0.00011005200144142651, + "loss": 0.8594, + "step": 31569 + }, + { + "epoch": 0.8106290752518371, + "grad_norm": 0.95703125, + "learning_rate": 0.000110047559840743, + "loss": 0.8363, + "step": 31570 + }, + { + "epoch": 0.810654752447759, + "grad_norm": 1.0078125, + "learning_rate": 0.00011004311822003561, + "loss": 0.808, + "step": 31571 + }, + { + "epoch": 0.8106804296436808, + "grad_norm": 0.7421875, + "learning_rate": 0.0001100386765793132, + "loss": 0.7425, + "step": 31572 + }, + { + "epoch": 0.8107061068396025, + "grad_norm": 0.86328125, + "learning_rate": 0.0001100342349185846, + "loss": 0.8657, + "step": 31573 + }, + { + "epoch": 0.8107317840355244, + "grad_norm": 0.7890625, + "learning_rate": 0.00011002979323785869, + "loss": 0.904, + "step": 31574 + }, + { + "epoch": 0.8107574612314462, + "grad_norm": 0.8203125, + "learning_rate": 0.00011002535153714429, + "loss": 0.7872, + "step": 31575 + }, + { + "epoch": 0.810783138427368, + "grad_norm": 0.76953125, + "learning_rate": 0.00011002090981645031, + "loss": 0.7817, + "step": 31576 + }, + { + "epoch": 0.8108088156232899, + "grad_norm": 0.75390625, + "learning_rate": 0.00011001646807578554, + "loss": 0.7773, + "step": 31577 + }, + { + "epoch": 0.8108344928192117, + "grad_norm": 0.8359375, + "learning_rate": 0.00011001202631515884, + "loss": 0.8516, + "step": 31578 + }, + { + "epoch": 0.8108601700151336, + "grad_norm": 0.78515625, + "learning_rate": 0.00011000758453457908, + "loss": 0.8857, + "step": 31579 + }, + { + "epoch": 0.8108858472110553, + "grad_norm": 0.859375, + "learning_rate": 0.00011000314273405512, + "loss": 0.8755, + "step": 31580 + }, + { + "epoch": 0.8109115244069771, + "grad_norm": 0.765625, + "learning_rate": 0.00010999870091359579, + "loss": 0.8088, + "step": 31581 + }, + { + "epoch": 0.810937201602899, + "grad_norm": 0.8671875, + "learning_rate": 0.00010999425907320997, + "loss": 0.9824, + "step": 31582 + }, + { + "epoch": 0.8109628787988208, + "grad_norm": 0.7421875, + "learning_rate": 0.00010998981721290647, + "loss": 0.7925, + "step": 31583 + }, + { + "epoch": 0.8109885559947426, + "grad_norm": 0.83203125, + "learning_rate": 0.00010998537533269418, + "loss": 0.8058, + "step": 31584 + }, + { + "epoch": 0.8110142331906645, + "grad_norm": 0.796875, + "learning_rate": 0.00010998093343258196, + "loss": 0.9347, + "step": 31585 + }, + { + "epoch": 0.8110399103865862, + "grad_norm": 0.76953125, + "learning_rate": 0.00010997649151257858, + "loss": 0.7656, + "step": 31586 + }, + { + "epoch": 0.811065587582508, + "grad_norm": 0.6953125, + "learning_rate": 0.000109972049572693, + "loss": 0.734, + "step": 31587 + }, + { + "epoch": 0.8110912647784299, + "grad_norm": 0.99609375, + "learning_rate": 0.00010996760761293403, + "loss": 0.8024, + "step": 31588 + }, + { + "epoch": 0.8111169419743517, + "grad_norm": 0.76171875, + "learning_rate": 0.00010996316563331051, + "loss": 0.7125, + "step": 31589 + }, + { + "epoch": 0.8111426191702735, + "grad_norm": 0.94140625, + "learning_rate": 0.0001099587236338313, + "loss": 0.7687, + "step": 31590 + }, + { + "epoch": 0.8111682963661954, + "grad_norm": 0.76171875, + "learning_rate": 0.00010995428161450523, + "loss": 0.7722, + "step": 31591 + }, + { + "epoch": 0.8111939735621172, + "grad_norm": 0.78125, + "learning_rate": 0.0001099498395753412, + "loss": 0.7934, + "step": 31592 + }, + { + "epoch": 0.8112196507580389, + "grad_norm": 0.7578125, + "learning_rate": 0.00010994539751634805, + "loss": 0.8062, + "step": 31593 + }, + { + "epoch": 0.8112453279539608, + "grad_norm": 0.79296875, + "learning_rate": 0.00010994095543753457, + "loss": 0.7416, + "step": 31594 + }, + { + "epoch": 0.8112710051498826, + "grad_norm": 0.79296875, + "learning_rate": 0.00010993651333890974, + "loss": 0.8752, + "step": 31595 + }, + { + "epoch": 0.8112966823458044, + "grad_norm": 0.8359375, + "learning_rate": 0.00010993207122048227, + "loss": 0.6447, + "step": 31596 + }, + { + "epoch": 0.8113223595417263, + "grad_norm": 0.8359375, + "learning_rate": 0.00010992762908226113, + "loss": 0.847, + "step": 31597 + }, + { + "epoch": 0.8113480367376481, + "grad_norm": 0.76171875, + "learning_rate": 0.0001099231869242551, + "loss": 0.7175, + "step": 31598 + }, + { + "epoch": 0.8113737139335699, + "grad_norm": 0.75, + "learning_rate": 0.00010991874474647303, + "loss": 0.8463, + "step": 31599 + }, + { + "epoch": 0.8113993911294917, + "grad_norm": 0.78515625, + "learning_rate": 0.00010991430254892383, + "loss": 0.7183, + "step": 31600 + }, + { + "epoch": 0.8114250683254135, + "grad_norm": 0.80078125, + "learning_rate": 0.0001099098603316163, + "loss": 0.7065, + "step": 31601 + }, + { + "epoch": 0.8114507455213353, + "grad_norm": 0.83203125, + "learning_rate": 0.00010990541809455934, + "loss": 0.9064, + "step": 31602 + }, + { + "epoch": 0.8114764227172572, + "grad_norm": 0.76953125, + "learning_rate": 0.00010990097583776175, + "loss": 0.8623, + "step": 31603 + }, + { + "epoch": 0.811502099913179, + "grad_norm": 0.7734375, + "learning_rate": 0.00010989653356123242, + "loss": 0.8666, + "step": 31604 + }, + { + "epoch": 0.8115277771091008, + "grad_norm": 0.78125, + "learning_rate": 0.00010989209126498019, + "loss": 0.8401, + "step": 31605 + }, + { + "epoch": 0.8115534543050226, + "grad_norm": 0.765625, + "learning_rate": 0.00010988764894901394, + "loss": 0.9263, + "step": 31606 + }, + { + "epoch": 0.8115791315009444, + "grad_norm": 0.74609375, + "learning_rate": 0.00010988320661334243, + "loss": 0.8432, + "step": 31607 + }, + { + "epoch": 0.8116048086968662, + "grad_norm": 0.8046875, + "learning_rate": 0.00010987876425797465, + "loss": 0.7418, + "step": 31608 + }, + { + "epoch": 0.8116304858927881, + "grad_norm": 0.7421875, + "learning_rate": 0.00010987432188291939, + "loss": 0.8307, + "step": 31609 + }, + { + "epoch": 0.8116561630887099, + "grad_norm": 0.78515625, + "learning_rate": 0.00010986987948818546, + "loss": 0.7799, + "step": 31610 + }, + { + "epoch": 0.8116818402846318, + "grad_norm": 0.83984375, + "learning_rate": 0.00010986543707378177, + "loss": 0.718, + "step": 31611 + }, + { + "epoch": 0.8117075174805536, + "grad_norm": 0.7421875, + "learning_rate": 0.00010986099463971716, + "loss": 0.7264, + "step": 31612 + }, + { + "epoch": 0.8117331946764753, + "grad_norm": 0.74609375, + "learning_rate": 0.00010985655218600044, + "loss": 0.7342, + "step": 31613 + }, + { + "epoch": 0.8117588718723971, + "grad_norm": 0.78125, + "learning_rate": 0.00010985210971264055, + "loss": 0.7947, + "step": 31614 + }, + { + "epoch": 0.811784549068319, + "grad_norm": 0.75, + "learning_rate": 0.00010984766721964628, + "loss": 0.7655, + "step": 31615 + }, + { + "epoch": 0.8118102262642408, + "grad_norm": 0.7421875, + "learning_rate": 0.00010984322470702648, + "loss": 0.8638, + "step": 31616 + }, + { + "epoch": 0.8118359034601627, + "grad_norm": 0.79296875, + "learning_rate": 0.00010983878217479004, + "loss": 0.7816, + "step": 31617 + }, + { + "epoch": 0.8118615806560845, + "grad_norm": 0.83203125, + "learning_rate": 0.00010983433962294578, + "loss": 0.7691, + "step": 31618 + }, + { + "epoch": 0.8118872578520062, + "grad_norm": 0.76953125, + "learning_rate": 0.0001098298970515026, + "loss": 0.7223, + "step": 31619 + }, + { + "epoch": 0.811912935047928, + "grad_norm": 0.75390625, + "learning_rate": 0.0001098254544604693, + "loss": 0.8464, + "step": 31620 + }, + { + "epoch": 0.8119386122438499, + "grad_norm": 0.73046875, + "learning_rate": 0.00010982101184985473, + "loss": 0.772, + "step": 31621 + }, + { + "epoch": 0.8119642894397717, + "grad_norm": 0.828125, + "learning_rate": 0.00010981656921966785, + "loss": 0.785, + "step": 31622 + }, + { + "epoch": 0.8119899666356936, + "grad_norm": 0.77734375, + "learning_rate": 0.00010981212656991735, + "loss": 0.8542, + "step": 31623 + }, + { + "epoch": 0.8120156438316154, + "grad_norm": 0.7890625, + "learning_rate": 0.00010980768390061222, + "loss": 0.7495, + "step": 31624 + }, + { + "epoch": 0.8120413210275372, + "grad_norm": 0.80078125, + "learning_rate": 0.00010980324121176124, + "loss": 0.8153, + "step": 31625 + }, + { + "epoch": 0.812066998223459, + "grad_norm": 0.87890625, + "learning_rate": 0.0001097987985033733, + "loss": 0.7769, + "step": 31626 + }, + { + "epoch": 0.8120926754193808, + "grad_norm": 0.7734375, + "learning_rate": 0.00010979435577545723, + "loss": 0.8359, + "step": 31627 + }, + { + "epoch": 0.8121183526153026, + "grad_norm": 0.765625, + "learning_rate": 0.00010978991302802188, + "loss": 0.9273, + "step": 31628 + }, + { + "epoch": 0.8121440298112245, + "grad_norm": 0.765625, + "learning_rate": 0.00010978547026107615, + "loss": 0.774, + "step": 31629 + }, + { + "epoch": 0.8121697070071463, + "grad_norm": 0.73046875, + "learning_rate": 0.00010978102747462884, + "loss": 0.8002, + "step": 31630 + }, + { + "epoch": 0.8121953842030681, + "grad_norm": 0.7421875, + "learning_rate": 0.00010977658466868883, + "loss": 0.8098, + "step": 31631 + }, + { + "epoch": 0.81222106139899, + "grad_norm": 0.8203125, + "learning_rate": 0.00010977214184326496, + "loss": 0.7531, + "step": 31632 + }, + { + "epoch": 0.8122467385949117, + "grad_norm": 0.80078125, + "learning_rate": 0.00010976769899836611, + "loss": 0.8559, + "step": 31633 + }, + { + "epoch": 0.8122724157908335, + "grad_norm": 0.82421875, + "learning_rate": 0.00010976325613400109, + "loss": 0.8083, + "step": 31634 + }, + { + "epoch": 0.8122980929867554, + "grad_norm": 0.74609375, + "learning_rate": 0.00010975881325017884, + "loss": 0.8068, + "step": 31635 + }, + { + "epoch": 0.8123237701826772, + "grad_norm": 0.7421875, + "learning_rate": 0.00010975437034690812, + "loss": 0.7696, + "step": 31636 + }, + { + "epoch": 0.812349447378599, + "grad_norm": 0.79296875, + "learning_rate": 0.0001097499274241978, + "loss": 0.8361, + "step": 31637 + }, + { + "epoch": 0.8123751245745209, + "grad_norm": 0.72265625, + "learning_rate": 0.0001097454844820568, + "loss": 0.8221, + "step": 31638 + }, + { + "epoch": 0.8124008017704426, + "grad_norm": 0.7890625, + "learning_rate": 0.00010974104152049389, + "loss": 0.8255, + "step": 31639 + }, + { + "epoch": 0.8124264789663644, + "grad_norm": 0.78125, + "learning_rate": 0.00010973659853951799, + "loss": 0.7509, + "step": 31640 + }, + { + "epoch": 0.8124521561622863, + "grad_norm": 0.76953125, + "learning_rate": 0.00010973215553913791, + "loss": 0.9137, + "step": 31641 + }, + { + "epoch": 0.8124778333582081, + "grad_norm": 0.875, + "learning_rate": 0.00010972771251936256, + "loss": 0.8238, + "step": 31642 + }, + { + "epoch": 0.81250351055413, + "grad_norm": 0.80078125, + "learning_rate": 0.00010972326948020073, + "loss": 0.7918, + "step": 31643 + }, + { + "epoch": 0.8125291877500518, + "grad_norm": 0.80859375, + "learning_rate": 0.0001097188264216613, + "loss": 0.8129, + "step": 31644 + }, + { + "epoch": 0.8125548649459736, + "grad_norm": 0.83203125, + "learning_rate": 0.00010971438334375313, + "loss": 0.8347, + "step": 31645 + }, + { + "epoch": 0.8125805421418953, + "grad_norm": 0.80078125, + "learning_rate": 0.00010970994024648507, + "loss": 0.739, + "step": 31646 + }, + { + "epoch": 0.8126062193378172, + "grad_norm": 0.75390625, + "learning_rate": 0.00010970549712986595, + "loss": 0.724, + "step": 31647 + }, + { + "epoch": 0.812631896533739, + "grad_norm": 0.8046875, + "learning_rate": 0.00010970105399390471, + "loss": 0.8531, + "step": 31648 + }, + { + "epoch": 0.8126575737296609, + "grad_norm": 0.7265625, + "learning_rate": 0.00010969661083861011, + "loss": 0.7945, + "step": 31649 + }, + { + "epoch": 0.8126832509255827, + "grad_norm": 0.71875, + "learning_rate": 0.00010969216766399104, + "loss": 0.729, + "step": 31650 + }, + { + "epoch": 0.8127089281215045, + "grad_norm": 0.796875, + "learning_rate": 0.00010968772447005636, + "loss": 0.711, + "step": 31651 + }, + { + "epoch": 0.8127346053174264, + "grad_norm": 0.71875, + "learning_rate": 0.0001096832812568149, + "loss": 0.6993, + "step": 31652 + }, + { + "epoch": 0.8127602825133481, + "grad_norm": 0.80859375, + "learning_rate": 0.00010967883802427556, + "loss": 0.7761, + "step": 31653 + }, + { + "epoch": 0.8127859597092699, + "grad_norm": 0.734375, + "learning_rate": 0.00010967439477244719, + "loss": 0.7667, + "step": 31654 + }, + { + "epoch": 0.8128116369051918, + "grad_norm": 0.8125, + "learning_rate": 0.00010966995150133859, + "loss": 0.7846, + "step": 31655 + }, + { + "epoch": 0.8128373141011136, + "grad_norm": 0.76953125, + "learning_rate": 0.00010966550821095867, + "loss": 0.9835, + "step": 31656 + }, + { + "epoch": 0.8128629912970354, + "grad_norm": 0.734375, + "learning_rate": 0.00010966106490131627, + "loss": 0.8387, + "step": 31657 + }, + { + "epoch": 0.8128886684929573, + "grad_norm": 0.83203125, + "learning_rate": 0.0001096566215724202, + "loss": 0.8227, + "step": 31658 + }, + { + "epoch": 0.812914345688879, + "grad_norm": 0.8671875, + "learning_rate": 0.00010965217822427938, + "loss": 0.8409, + "step": 31659 + }, + { + "epoch": 0.8129400228848008, + "grad_norm": 0.74609375, + "learning_rate": 0.00010964773485690265, + "loss": 0.7803, + "step": 31660 + }, + { + "epoch": 0.8129657000807227, + "grad_norm": 0.77734375, + "learning_rate": 0.00010964329147029886, + "loss": 0.7809, + "step": 31661 + }, + { + "epoch": 0.8129913772766445, + "grad_norm": 0.79296875, + "learning_rate": 0.00010963884806447686, + "loss": 0.7866, + "step": 31662 + }, + { + "epoch": 0.8130170544725663, + "grad_norm": 0.76171875, + "learning_rate": 0.00010963440463944547, + "loss": 0.7598, + "step": 31663 + }, + { + "epoch": 0.8130427316684882, + "grad_norm": 0.734375, + "learning_rate": 0.00010962996119521362, + "loss": 0.7633, + "step": 31664 + }, + { + "epoch": 0.81306840886441, + "grad_norm": 0.78515625, + "learning_rate": 0.0001096255177317901, + "loss": 0.8275, + "step": 31665 + }, + { + "epoch": 0.8130940860603317, + "grad_norm": 0.75390625, + "learning_rate": 0.0001096210742491838, + "loss": 0.7312, + "step": 31666 + }, + { + "epoch": 0.8131197632562536, + "grad_norm": 0.8515625, + "learning_rate": 0.00010961663074740358, + "loss": 0.8513, + "step": 31667 + }, + { + "epoch": 0.8131454404521754, + "grad_norm": 0.890625, + "learning_rate": 0.0001096121872264583, + "loss": 0.8312, + "step": 31668 + }, + { + "epoch": 0.8131711176480972, + "grad_norm": 0.78125, + "learning_rate": 0.00010960774368635678, + "loss": 0.8782, + "step": 31669 + }, + { + "epoch": 0.8131967948440191, + "grad_norm": 0.8046875, + "learning_rate": 0.00010960330012710789, + "loss": 0.8321, + "step": 31670 + }, + { + "epoch": 0.8132224720399409, + "grad_norm": 0.828125, + "learning_rate": 0.00010959885654872049, + "loss": 0.8035, + "step": 31671 + }, + { + "epoch": 0.8132481492358627, + "grad_norm": 0.78515625, + "learning_rate": 0.00010959441295120343, + "loss": 0.9024, + "step": 31672 + }, + { + "epoch": 0.8132738264317845, + "grad_norm": 0.72265625, + "learning_rate": 0.0001095899693345656, + "loss": 0.8272, + "step": 31673 + }, + { + "epoch": 0.8132995036277063, + "grad_norm": 0.79296875, + "learning_rate": 0.00010958552569881577, + "loss": 0.7872, + "step": 31674 + }, + { + "epoch": 0.8133251808236281, + "grad_norm": 0.78515625, + "learning_rate": 0.00010958108204396294, + "loss": 0.7961, + "step": 31675 + }, + { + "epoch": 0.81335085801955, + "grad_norm": 0.76953125, + "learning_rate": 0.00010957663837001582, + "loss": 0.822, + "step": 31676 + }, + { + "epoch": 0.8133765352154718, + "grad_norm": 0.7578125, + "learning_rate": 0.00010957219467698332, + "loss": 0.8084, + "step": 31677 + }, + { + "epoch": 0.8134022124113937, + "grad_norm": 0.796875, + "learning_rate": 0.00010956775096487433, + "loss": 0.8375, + "step": 31678 + }, + { + "epoch": 0.8134278896073154, + "grad_norm": 0.8203125, + "learning_rate": 0.00010956330723369765, + "loss": 0.7925, + "step": 31679 + }, + { + "epoch": 0.8134535668032372, + "grad_norm": 0.7890625, + "learning_rate": 0.00010955886348346218, + "loss": 0.9387, + "step": 31680 + }, + { + "epoch": 0.813479243999159, + "grad_norm": 0.75, + "learning_rate": 0.00010955441971417675, + "loss": 0.7316, + "step": 31681 + }, + { + "epoch": 0.8135049211950809, + "grad_norm": 0.87890625, + "learning_rate": 0.00010954997592585023, + "loss": 0.7918, + "step": 31682 + }, + { + "epoch": 0.8135305983910027, + "grad_norm": 0.796875, + "learning_rate": 0.00010954553211849148, + "loss": 0.8128, + "step": 31683 + }, + { + "epoch": 0.8135562755869246, + "grad_norm": 0.75, + "learning_rate": 0.00010954108829210931, + "loss": 0.6971, + "step": 31684 + }, + { + "epoch": 0.8135819527828464, + "grad_norm": 0.78125, + "learning_rate": 0.00010953664444671266, + "loss": 0.7878, + "step": 31685 + }, + { + "epoch": 0.8136076299787681, + "grad_norm": 0.79296875, + "learning_rate": 0.0001095322005823103, + "loss": 0.6638, + "step": 31686 + }, + { + "epoch": 0.81363330717469, + "grad_norm": 0.78125, + "learning_rate": 0.00010952775669891113, + "loss": 0.8603, + "step": 31687 + }, + { + "epoch": 0.8136589843706118, + "grad_norm": 0.77734375, + "learning_rate": 0.00010952331279652406, + "loss": 0.8221, + "step": 31688 + }, + { + "epoch": 0.8136846615665336, + "grad_norm": 0.76171875, + "learning_rate": 0.00010951886887515781, + "loss": 0.7566, + "step": 31689 + }, + { + "epoch": 0.8137103387624555, + "grad_norm": 0.7734375, + "learning_rate": 0.00010951442493482136, + "loss": 0.8211, + "step": 31690 + }, + { + "epoch": 0.8137360159583773, + "grad_norm": 0.8359375, + "learning_rate": 0.00010950998097552352, + "loss": 0.8516, + "step": 31691 + }, + { + "epoch": 0.8137616931542991, + "grad_norm": 0.7734375, + "learning_rate": 0.00010950553699727311, + "loss": 0.6632, + "step": 31692 + }, + { + "epoch": 0.8137873703502209, + "grad_norm": 0.83984375, + "learning_rate": 0.00010950109300007904, + "loss": 0.8472, + "step": 31693 + }, + { + "epoch": 0.8138130475461427, + "grad_norm": 0.74609375, + "learning_rate": 0.0001094966489839502, + "loss": 0.8266, + "step": 31694 + }, + { + "epoch": 0.8138387247420645, + "grad_norm": 0.79296875, + "learning_rate": 0.00010949220494889534, + "loss": 0.809, + "step": 31695 + }, + { + "epoch": 0.8138644019379864, + "grad_norm": 0.76171875, + "learning_rate": 0.00010948776089492338, + "loss": 0.8797, + "step": 31696 + }, + { + "epoch": 0.8138900791339082, + "grad_norm": 0.734375, + "learning_rate": 0.00010948331682204318, + "loss": 0.8126, + "step": 31697 + }, + { + "epoch": 0.81391575632983, + "grad_norm": 0.83984375, + "learning_rate": 0.00010947887273026358, + "loss": 0.8286, + "step": 31698 + }, + { + "epoch": 0.8139414335257518, + "grad_norm": 0.69921875, + "learning_rate": 0.00010947442861959345, + "loss": 0.772, + "step": 31699 + }, + { + "epoch": 0.8139671107216736, + "grad_norm": 0.7890625, + "learning_rate": 0.0001094699844900416, + "loss": 0.7282, + "step": 31700 + }, + { + "epoch": 0.8139927879175954, + "grad_norm": 0.8359375, + "learning_rate": 0.000109465540341617, + "loss": 0.8059, + "step": 31701 + }, + { + "epoch": 0.8140184651135173, + "grad_norm": 0.75, + "learning_rate": 0.0001094610961743284, + "loss": 0.6996, + "step": 31702 + }, + { + "epoch": 0.8140441423094391, + "grad_norm": 0.85546875, + "learning_rate": 0.00010945665198818466, + "loss": 0.8874, + "step": 31703 + }, + { + "epoch": 0.814069819505361, + "grad_norm": 0.80078125, + "learning_rate": 0.0001094522077831947, + "loss": 0.754, + "step": 31704 + }, + { + "epoch": 0.8140954967012828, + "grad_norm": 0.75, + "learning_rate": 0.00010944776355936734, + "loss": 0.7967, + "step": 31705 + }, + { + "epoch": 0.8141211738972045, + "grad_norm": 0.7890625, + "learning_rate": 0.00010944331931671141, + "loss": 0.8683, + "step": 31706 + }, + { + "epoch": 0.8141468510931263, + "grad_norm": 0.79296875, + "learning_rate": 0.00010943887505523586, + "loss": 0.7872, + "step": 31707 + }, + { + "epoch": 0.8141725282890482, + "grad_norm": 0.84765625, + "learning_rate": 0.00010943443077494944, + "loss": 0.8023, + "step": 31708 + }, + { + "epoch": 0.81419820548497, + "grad_norm": 0.82421875, + "learning_rate": 0.00010942998647586105, + "loss": 0.8372, + "step": 31709 + }, + { + "epoch": 0.8142238826808919, + "grad_norm": 0.69921875, + "learning_rate": 0.00010942554215797955, + "loss": 0.7217, + "step": 31710 + }, + { + "epoch": 0.8142495598768137, + "grad_norm": 0.73046875, + "learning_rate": 0.0001094210978213138, + "loss": 0.7501, + "step": 31711 + }, + { + "epoch": 0.8142752370727355, + "grad_norm": 0.78125, + "learning_rate": 0.00010941665346587266, + "loss": 0.8309, + "step": 31712 + }, + { + "epoch": 0.8143009142686572, + "grad_norm": 0.71875, + "learning_rate": 0.00010941220909166499, + "loss": 0.7709, + "step": 31713 + }, + { + "epoch": 0.8143265914645791, + "grad_norm": 0.8984375, + "learning_rate": 0.00010940776469869962, + "loss": 0.8853, + "step": 31714 + }, + { + "epoch": 0.8143522686605009, + "grad_norm": 0.86328125, + "learning_rate": 0.00010940332028698543, + "loss": 0.9305, + "step": 31715 + }, + { + "epoch": 0.8143779458564228, + "grad_norm": 0.7734375, + "learning_rate": 0.00010939887585653126, + "loss": 0.7354, + "step": 31716 + }, + { + "epoch": 0.8144036230523446, + "grad_norm": 0.8359375, + "learning_rate": 0.000109394431407346, + "loss": 0.8837, + "step": 31717 + }, + { + "epoch": 0.8144293002482664, + "grad_norm": 0.83203125, + "learning_rate": 0.00010938998693943849, + "loss": 0.945, + "step": 31718 + }, + { + "epoch": 0.8144549774441882, + "grad_norm": 0.76953125, + "learning_rate": 0.00010938554245281755, + "loss": 0.8581, + "step": 31719 + }, + { + "epoch": 0.81448065464011, + "grad_norm": 0.73828125, + "learning_rate": 0.00010938109794749209, + "loss": 0.6963, + "step": 31720 + }, + { + "epoch": 0.8145063318360318, + "grad_norm": 0.80078125, + "learning_rate": 0.00010937665342347096, + "loss": 0.8301, + "step": 31721 + }, + { + "epoch": 0.8145320090319537, + "grad_norm": 0.73046875, + "learning_rate": 0.00010937220888076301, + "loss": 0.7662, + "step": 31722 + }, + { + "epoch": 0.8145576862278755, + "grad_norm": 0.796875, + "learning_rate": 0.0001093677643193771, + "loss": 0.83, + "step": 31723 + }, + { + "epoch": 0.8145833634237973, + "grad_norm": 0.8359375, + "learning_rate": 0.00010936331973932203, + "loss": 0.8184, + "step": 31724 + }, + { + "epoch": 0.8146090406197192, + "grad_norm": 0.76171875, + "learning_rate": 0.00010935887514060677, + "loss": 0.7024, + "step": 31725 + }, + { + "epoch": 0.8146347178156409, + "grad_norm": 0.890625, + "learning_rate": 0.00010935443052324007, + "loss": 0.9734, + "step": 31726 + }, + { + "epoch": 0.8146603950115627, + "grad_norm": 0.73828125, + "learning_rate": 0.00010934998588723089, + "loss": 0.8067, + "step": 31727 + }, + { + "epoch": 0.8146860722074846, + "grad_norm": 0.734375, + "learning_rate": 0.000109345541232588, + "loss": 0.7632, + "step": 31728 + }, + { + "epoch": 0.8147117494034064, + "grad_norm": 0.80859375, + "learning_rate": 0.00010934109655932028, + "loss": 0.7222, + "step": 31729 + }, + { + "epoch": 0.8147374265993282, + "grad_norm": 0.7578125, + "learning_rate": 0.0001093366518674366, + "loss": 0.7582, + "step": 31730 + }, + { + "epoch": 0.8147631037952501, + "grad_norm": 0.7890625, + "learning_rate": 0.00010933220715694583, + "loss": 0.7545, + "step": 31731 + }, + { + "epoch": 0.8147887809911719, + "grad_norm": 0.78125, + "learning_rate": 0.0001093277624278568, + "loss": 0.7338, + "step": 31732 + }, + { + "epoch": 0.8148144581870936, + "grad_norm": 0.765625, + "learning_rate": 0.00010932331768017838, + "loss": 0.7214, + "step": 31733 + }, + { + "epoch": 0.8148401353830155, + "grad_norm": 0.83984375, + "learning_rate": 0.00010931887291391949, + "loss": 0.9544, + "step": 31734 + }, + { + "epoch": 0.8148658125789373, + "grad_norm": 0.7890625, + "learning_rate": 0.00010931442812908885, + "loss": 0.8585, + "step": 31735 + }, + { + "epoch": 0.8148914897748591, + "grad_norm": 0.80859375, + "learning_rate": 0.00010930998332569544, + "loss": 0.9245, + "step": 31736 + }, + { + "epoch": 0.814917166970781, + "grad_norm": 0.73046875, + "learning_rate": 0.00010930553850374803, + "loss": 0.8314, + "step": 31737 + }, + { + "epoch": 0.8149428441667028, + "grad_norm": 0.80859375, + "learning_rate": 0.00010930109366325557, + "loss": 0.7719, + "step": 31738 + }, + { + "epoch": 0.8149685213626245, + "grad_norm": 0.8046875, + "learning_rate": 0.00010929664880422685, + "loss": 0.7796, + "step": 31739 + }, + { + "epoch": 0.8149941985585464, + "grad_norm": 0.73046875, + "learning_rate": 0.00010929220392667074, + "loss": 0.7802, + "step": 31740 + }, + { + "epoch": 0.8150198757544682, + "grad_norm": 0.828125, + "learning_rate": 0.00010928775903059613, + "loss": 0.9524, + "step": 31741 + }, + { + "epoch": 0.81504555295039, + "grad_norm": 0.84765625, + "learning_rate": 0.00010928331411601185, + "loss": 0.7878, + "step": 31742 + }, + { + "epoch": 0.8150712301463119, + "grad_norm": 0.87109375, + "learning_rate": 0.00010927886918292676, + "loss": 0.8679, + "step": 31743 + }, + { + "epoch": 0.8150969073422337, + "grad_norm": 0.84375, + "learning_rate": 0.0001092744242313497, + "loss": 0.8071, + "step": 31744 + }, + { + "epoch": 0.8151225845381556, + "grad_norm": 0.87109375, + "learning_rate": 0.00010926997926128958, + "loss": 0.7785, + "step": 31745 + }, + { + "epoch": 0.8151482617340773, + "grad_norm": 0.80078125, + "learning_rate": 0.0001092655342727552, + "loss": 0.9033, + "step": 31746 + }, + { + "epoch": 0.8151739389299991, + "grad_norm": 0.734375, + "learning_rate": 0.0001092610892657555, + "loss": 0.8866, + "step": 31747 + }, + { + "epoch": 0.815199616125921, + "grad_norm": 0.78515625, + "learning_rate": 0.00010925664424029922, + "loss": 0.7253, + "step": 31748 + }, + { + "epoch": 0.8152252933218428, + "grad_norm": 0.7265625, + "learning_rate": 0.00010925219919639533, + "loss": 0.8103, + "step": 31749 + }, + { + "epoch": 0.8152509705177646, + "grad_norm": 0.78125, + "learning_rate": 0.00010924775413405264, + "loss": 0.7379, + "step": 31750 + }, + { + "epoch": 0.8152766477136865, + "grad_norm": 0.7734375, + "learning_rate": 0.00010924330905327997, + "loss": 0.8072, + "step": 31751 + }, + { + "epoch": 0.8153023249096083, + "grad_norm": 0.68359375, + "learning_rate": 0.00010923886395408626, + "loss": 0.7383, + "step": 31752 + }, + { + "epoch": 0.81532800210553, + "grad_norm": 0.7578125, + "learning_rate": 0.00010923441883648031, + "loss": 0.8002, + "step": 31753 + }, + { + "epoch": 0.8153536793014519, + "grad_norm": 0.74609375, + "learning_rate": 0.00010922997370047104, + "loss": 0.7702, + "step": 31754 + }, + { + "epoch": 0.8153793564973737, + "grad_norm": 0.85546875, + "learning_rate": 0.00010922552854606722, + "loss": 0.9148, + "step": 31755 + }, + { + "epoch": 0.8154050336932955, + "grad_norm": 0.7734375, + "learning_rate": 0.00010922108337327774, + "loss": 0.8039, + "step": 31756 + }, + { + "epoch": 0.8154307108892174, + "grad_norm": 0.8125, + "learning_rate": 0.00010921663818211152, + "loss": 0.8812, + "step": 31757 + }, + { + "epoch": 0.8154563880851392, + "grad_norm": 0.8203125, + "learning_rate": 0.00010921219297257734, + "loss": 0.8185, + "step": 31758 + }, + { + "epoch": 0.8154820652810609, + "grad_norm": 0.7890625, + "learning_rate": 0.0001092077477446841, + "loss": 0.7358, + "step": 31759 + }, + { + "epoch": 0.8155077424769828, + "grad_norm": 0.7265625, + "learning_rate": 0.00010920330249844069, + "loss": 0.7804, + "step": 31760 + }, + { + "epoch": 0.8155334196729046, + "grad_norm": 0.734375, + "learning_rate": 0.00010919885723385586, + "loss": 0.7741, + "step": 31761 + }, + { + "epoch": 0.8155590968688264, + "grad_norm": 0.7890625, + "learning_rate": 0.00010919441195093858, + "loss": 0.717, + "step": 31762 + }, + { + "epoch": 0.8155847740647483, + "grad_norm": 0.83203125, + "learning_rate": 0.00010918996664969768, + "loss": 0.7968, + "step": 31763 + }, + { + "epoch": 0.8156104512606701, + "grad_norm": 0.7734375, + "learning_rate": 0.00010918552133014194, + "loss": 0.7791, + "step": 31764 + }, + { + "epoch": 0.815636128456592, + "grad_norm": 0.74609375, + "learning_rate": 0.00010918107599228036, + "loss": 0.803, + "step": 31765 + }, + { + "epoch": 0.8156618056525137, + "grad_norm": 0.77734375, + "learning_rate": 0.00010917663063612169, + "loss": 0.8384, + "step": 31766 + }, + { + "epoch": 0.8156874828484355, + "grad_norm": 0.765625, + "learning_rate": 0.00010917218526167485, + "loss": 0.8375, + "step": 31767 + }, + { + "epoch": 0.8157131600443573, + "grad_norm": 0.72265625, + "learning_rate": 0.00010916773986894867, + "loss": 0.7206, + "step": 31768 + }, + { + "epoch": 0.8157388372402792, + "grad_norm": 0.7890625, + "learning_rate": 0.00010916329445795197, + "loss": 0.7903, + "step": 31769 + }, + { + "epoch": 0.815764514436201, + "grad_norm": 0.80078125, + "learning_rate": 0.00010915884902869366, + "loss": 0.889, + "step": 31770 + }, + { + "epoch": 0.8157901916321229, + "grad_norm": 0.8046875, + "learning_rate": 0.00010915440358118263, + "loss": 0.8089, + "step": 31771 + }, + { + "epoch": 0.8158158688280447, + "grad_norm": 0.8359375, + "learning_rate": 0.00010914995811542765, + "loss": 0.7759, + "step": 31772 + }, + { + "epoch": 0.8158415460239664, + "grad_norm": 0.82421875, + "learning_rate": 0.0001091455126314377, + "loss": 0.9072, + "step": 31773 + }, + { + "epoch": 0.8158672232198882, + "grad_norm": 0.75, + "learning_rate": 0.00010914106712922154, + "loss": 0.7467, + "step": 31774 + }, + { + "epoch": 0.8158929004158101, + "grad_norm": 0.8046875, + "learning_rate": 0.00010913662160878804, + "loss": 0.8, + "step": 31775 + }, + { + "epoch": 0.8159185776117319, + "grad_norm": 0.90234375, + "learning_rate": 0.00010913217607014608, + "loss": 0.859, + "step": 31776 + }, + { + "epoch": 0.8159442548076538, + "grad_norm": 0.88671875, + "learning_rate": 0.00010912773051330451, + "loss": 0.7376, + "step": 31777 + }, + { + "epoch": 0.8159699320035756, + "grad_norm": 0.76171875, + "learning_rate": 0.00010912328493827221, + "loss": 0.7185, + "step": 31778 + }, + { + "epoch": 0.8159956091994973, + "grad_norm": 0.76171875, + "learning_rate": 0.00010911883934505804, + "loss": 0.7921, + "step": 31779 + }, + { + "epoch": 0.8160212863954192, + "grad_norm": 0.8515625, + "learning_rate": 0.00010911439373367086, + "loss": 0.8326, + "step": 31780 + }, + { + "epoch": 0.816046963591341, + "grad_norm": 0.79296875, + "learning_rate": 0.00010910994810411948, + "loss": 0.7805, + "step": 31781 + }, + { + "epoch": 0.8160726407872628, + "grad_norm": 0.76171875, + "learning_rate": 0.00010910550245641283, + "loss": 0.6591, + "step": 31782 + }, + { + "epoch": 0.8160983179831847, + "grad_norm": 0.78125, + "learning_rate": 0.00010910105679055968, + "loss": 0.8324, + "step": 31783 + }, + { + "epoch": 0.8161239951791065, + "grad_norm": 0.78515625, + "learning_rate": 0.000109096611106569, + "loss": 0.8457, + "step": 31784 + }, + { + "epoch": 0.8161496723750283, + "grad_norm": 0.78515625, + "learning_rate": 0.00010909216540444958, + "loss": 0.8116, + "step": 31785 + }, + { + "epoch": 0.8161753495709501, + "grad_norm": 0.75390625, + "learning_rate": 0.0001090877196842103, + "loss": 0.7475, + "step": 31786 + }, + { + "epoch": 0.8162010267668719, + "grad_norm": 0.7421875, + "learning_rate": 0.00010908327394586001, + "loss": 0.7556, + "step": 31787 + }, + { + "epoch": 0.8162267039627937, + "grad_norm": 0.70703125, + "learning_rate": 0.00010907882818940756, + "loss": 0.6976, + "step": 31788 + }, + { + "epoch": 0.8162523811587156, + "grad_norm": 2.546875, + "learning_rate": 0.00010907438241486185, + "loss": 0.8754, + "step": 31789 + }, + { + "epoch": 0.8162780583546374, + "grad_norm": 0.74609375, + "learning_rate": 0.0001090699366222317, + "loss": 0.8245, + "step": 31790 + }, + { + "epoch": 0.8163037355505592, + "grad_norm": 0.7734375, + "learning_rate": 0.00010906549081152597, + "loss": 0.7564, + "step": 31791 + }, + { + "epoch": 0.8163294127464811, + "grad_norm": 0.83984375, + "learning_rate": 0.00010906104498275358, + "loss": 0.8536, + "step": 31792 + }, + { + "epoch": 0.8163550899424028, + "grad_norm": 0.8515625, + "learning_rate": 0.00010905659913592332, + "loss": 0.7415, + "step": 31793 + }, + { + "epoch": 0.8163807671383246, + "grad_norm": 0.76953125, + "learning_rate": 0.00010905215327104408, + "loss": 0.7282, + "step": 31794 + }, + { + "epoch": 0.8164064443342465, + "grad_norm": 0.73828125, + "learning_rate": 0.00010904770738812474, + "loss": 0.827, + "step": 31795 + }, + { + "epoch": 0.8164321215301683, + "grad_norm": 0.75390625, + "learning_rate": 0.00010904326148717407, + "loss": 0.7895, + "step": 31796 + }, + { + "epoch": 0.8164577987260901, + "grad_norm": 0.734375, + "learning_rate": 0.00010903881556820106, + "loss": 0.7554, + "step": 31797 + }, + { + "epoch": 0.816483475922012, + "grad_norm": 0.75390625, + "learning_rate": 0.00010903436963121447, + "loss": 0.8177, + "step": 31798 + }, + { + "epoch": 0.8165091531179337, + "grad_norm": 0.76953125, + "learning_rate": 0.00010902992367622322, + "loss": 0.7212, + "step": 31799 + }, + { + "epoch": 0.8165348303138555, + "grad_norm": 0.796875, + "learning_rate": 0.00010902547770323615, + "loss": 0.7475, + "step": 31800 + }, + { + "epoch": 0.8165605075097774, + "grad_norm": 0.7578125, + "learning_rate": 0.0001090210317122621, + "loss": 0.7415, + "step": 31801 + }, + { + "epoch": 0.8165861847056992, + "grad_norm": 0.76953125, + "learning_rate": 0.00010901658570330995, + "loss": 0.7083, + "step": 31802 + }, + { + "epoch": 0.816611861901621, + "grad_norm": 0.7734375, + "learning_rate": 0.00010901213967638857, + "loss": 0.8747, + "step": 31803 + }, + { + "epoch": 0.8166375390975429, + "grad_norm": 0.8125, + "learning_rate": 0.0001090076936315068, + "loss": 0.7532, + "step": 31804 + }, + { + "epoch": 0.8166632162934647, + "grad_norm": 0.890625, + "learning_rate": 0.00010900324756867353, + "loss": 0.7721, + "step": 31805 + }, + { + "epoch": 0.8166888934893864, + "grad_norm": 0.73046875, + "learning_rate": 0.00010899880148789758, + "loss": 0.787, + "step": 31806 + }, + { + "epoch": 0.8167145706853083, + "grad_norm": 0.8046875, + "learning_rate": 0.00010899435538918785, + "loss": 0.7857, + "step": 31807 + }, + { + "epoch": 0.8167402478812301, + "grad_norm": 0.78125, + "learning_rate": 0.00010898990927255318, + "loss": 0.7307, + "step": 31808 + }, + { + "epoch": 0.816765925077152, + "grad_norm": 0.8046875, + "learning_rate": 0.0001089854631380024, + "loss": 0.7699, + "step": 31809 + }, + { + "epoch": 0.8167916022730738, + "grad_norm": 0.83984375, + "learning_rate": 0.00010898101698554443, + "loss": 0.9532, + "step": 31810 + }, + { + "epoch": 0.8168172794689956, + "grad_norm": 0.8046875, + "learning_rate": 0.00010897657081518811, + "loss": 0.8197, + "step": 31811 + }, + { + "epoch": 0.8168429566649174, + "grad_norm": 0.765625, + "learning_rate": 0.00010897212462694226, + "loss": 0.8464, + "step": 31812 + }, + { + "epoch": 0.8168686338608392, + "grad_norm": 0.8046875, + "learning_rate": 0.00010896767842081584, + "loss": 0.8159, + "step": 31813 + }, + { + "epoch": 0.816894311056761, + "grad_norm": 0.90234375, + "learning_rate": 0.0001089632321968176, + "loss": 0.7748, + "step": 31814 + }, + { + "epoch": 0.8169199882526829, + "grad_norm": 0.83984375, + "learning_rate": 0.00010895878595495645, + "loss": 0.6935, + "step": 31815 + }, + { + "epoch": 0.8169456654486047, + "grad_norm": 0.71484375, + "learning_rate": 0.00010895433969524127, + "loss": 0.6898, + "step": 31816 + }, + { + "epoch": 0.8169713426445265, + "grad_norm": 0.76953125, + "learning_rate": 0.00010894989341768088, + "loss": 0.8052, + "step": 31817 + }, + { + "epoch": 0.8169970198404484, + "grad_norm": 0.796875, + "learning_rate": 0.00010894544712228417, + "loss": 0.8591, + "step": 31818 + }, + { + "epoch": 0.8170226970363701, + "grad_norm": 0.8515625, + "learning_rate": 0.00010894100080906, + "loss": 0.8335, + "step": 31819 + }, + { + "epoch": 0.8170483742322919, + "grad_norm": 0.81640625, + "learning_rate": 0.0001089365544780172, + "loss": 0.7582, + "step": 31820 + }, + { + "epoch": 0.8170740514282138, + "grad_norm": 0.8359375, + "learning_rate": 0.00010893210812916466, + "loss": 0.9124, + "step": 31821 + }, + { + "epoch": 0.8170997286241356, + "grad_norm": 0.7109375, + "learning_rate": 0.00010892766176251125, + "loss": 0.7545, + "step": 31822 + }, + { + "epoch": 0.8171254058200574, + "grad_norm": 0.8359375, + "learning_rate": 0.0001089232153780658, + "loss": 0.7767, + "step": 31823 + }, + { + "epoch": 0.8171510830159793, + "grad_norm": 0.796875, + "learning_rate": 0.00010891876897583719, + "loss": 0.7228, + "step": 31824 + }, + { + "epoch": 0.8171767602119011, + "grad_norm": 0.80078125, + "learning_rate": 0.00010891432255583426, + "loss": 0.9184, + "step": 31825 + }, + { + "epoch": 0.8172024374078228, + "grad_norm": 0.8359375, + "learning_rate": 0.00010890987611806595, + "loss": 0.8652, + "step": 31826 + }, + { + "epoch": 0.8172281146037447, + "grad_norm": 0.77734375, + "learning_rate": 0.00010890542966254105, + "loss": 0.8274, + "step": 31827 + }, + { + "epoch": 0.8172537917996665, + "grad_norm": 0.79296875, + "learning_rate": 0.00010890098318926838, + "loss": 0.8636, + "step": 31828 + }, + { + "epoch": 0.8172794689955883, + "grad_norm": 0.75, + "learning_rate": 0.0001088965366982569, + "loss": 0.8004, + "step": 31829 + }, + { + "epoch": 0.8173051461915102, + "grad_norm": 0.83984375, + "learning_rate": 0.0001088920901895154, + "loss": 0.8317, + "step": 31830 + }, + { + "epoch": 0.817330823387432, + "grad_norm": 0.79296875, + "learning_rate": 0.00010888764366305278, + "loss": 1.1532, + "step": 31831 + }, + { + "epoch": 0.8173565005833537, + "grad_norm": 0.71484375, + "learning_rate": 0.00010888319711887791, + "loss": 0.8582, + "step": 31832 + }, + { + "epoch": 0.8173821777792756, + "grad_norm": 0.828125, + "learning_rate": 0.00010887875055699959, + "loss": 0.9292, + "step": 31833 + }, + { + "epoch": 0.8174078549751974, + "grad_norm": 0.88671875, + "learning_rate": 0.00010887430397742673, + "loss": 0.8865, + "step": 31834 + }, + { + "epoch": 0.8174335321711192, + "grad_norm": 0.87890625, + "learning_rate": 0.00010886985738016822, + "loss": 0.9082, + "step": 31835 + }, + { + "epoch": 0.8174592093670411, + "grad_norm": 0.71875, + "learning_rate": 0.00010886541076523282, + "loss": 0.7573, + "step": 31836 + }, + { + "epoch": 0.8174848865629629, + "grad_norm": 0.75, + "learning_rate": 0.00010886096413262951, + "loss": 0.7755, + "step": 31837 + }, + { + "epoch": 0.8175105637588848, + "grad_norm": 0.77734375, + "learning_rate": 0.00010885651748236709, + "loss": 0.8099, + "step": 31838 + }, + { + "epoch": 0.8175362409548065, + "grad_norm": 0.765625, + "learning_rate": 0.00010885207081445443, + "loss": 0.801, + "step": 31839 + }, + { + "epoch": 0.8175619181507283, + "grad_norm": 0.87109375, + "learning_rate": 0.0001088476241289004, + "loss": 0.8802, + "step": 31840 + }, + { + "epoch": 0.8175875953466502, + "grad_norm": 0.734375, + "learning_rate": 0.00010884317742571381, + "loss": 0.8206, + "step": 31841 + }, + { + "epoch": 0.817613272542572, + "grad_norm": 1.0546875, + "learning_rate": 0.00010883873070490362, + "loss": 0.8417, + "step": 31842 + }, + { + "epoch": 0.8176389497384938, + "grad_norm": 0.796875, + "learning_rate": 0.00010883428396647861, + "loss": 0.7533, + "step": 31843 + }, + { + "epoch": 0.8176646269344157, + "grad_norm": 0.74609375, + "learning_rate": 0.00010882983721044765, + "loss": 0.8186, + "step": 31844 + }, + { + "epoch": 0.8176903041303375, + "grad_norm": 0.80078125, + "learning_rate": 0.00010882539043681968, + "loss": 0.8277, + "step": 31845 + }, + { + "epoch": 0.8177159813262592, + "grad_norm": 0.828125, + "learning_rate": 0.00010882094364560346, + "loss": 0.8853, + "step": 31846 + }, + { + "epoch": 0.8177416585221811, + "grad_norm": 1.0390625, + "learning_rate": 0.00010881649683680792, + "loss": 0.7916, + "step": 31847 + }, + { + "epoch": 0.8177673357181029, + "grad_norm": 0.8125, + "learning_rate": 0.0001088120500104419, + "loss": 0.8797, + "step": 31848 + }, + { + "epoch": 0.8177930129140247, + "grad_norm": 0.80859375, + "learning_rate": 0.00010880760316651422, + "loss": 0.7661, + "step": 31849 + }, + { + "epoch": 0.8178186901099466, + "grad_norm": 0.80078125, + "learning_rate": 0.00010880315630503381, + "loss": 0.9016, + "step": 31850 + }, + { + "epoch": 0.8178443673058684, + "grad_norm": 0.7578125, + "learning_rate": 0.00010879870942600951, + "loss": 0.7157, + "step": 31851 + }, + { + "epoch": 0.8178700445017901, + "grad_norm": 0.73828125, + "learning_rate": 0.0001087942625294502, + "loss": 0.8141, + "step": 31852 + }, + { + "epoch": 0.817895721697712, + "grad_norm": 0.74609375, + "learning_rate": 0.00010878981561536468, + "loss": 0.8027, + "step": 31853 + }, + { + "epoch": 0.8179213988936338, + "grad_norm": 0.7109375, + "learning_rate": 0.00010878536868376186, + "loss": 0.7814, + "step": 31854 + }, + { + "epoch": 0.8179470760895556, + "grad_norm": 0.7734375, + "learning_rate": 0.0001087809217346506, + "loss": 0.6808, + "step": 31855 + }, + { + "epoch": 0.8179727532854775, + "grad_norm": 0.73046875, + "learning_rate": 0.00010877647476803976, + "loss": 0.7224, + "step": 31856 + }, + { + "epoch": 0.8179984304813993, + "grad_norm": 0.80859375, + "learning_rate": 0.00010877202778393816, + "loss": 0.9234, + "step": 31857 + }, + { + "epoch": 0.8180241076773211, + "grad_norm": 0.7421875, + "learning_rate": 0.00010876758078235474, + "loss": 0.8324, + "step": 31858 + }, + { + "epoch": 0.8180497848732429, + "grad_norm": 0.77734375, + "learning_rate": 0.00010876313376329835, + "loss": 0.8399, + "step": 31859 + }, + { + "epoch": 0.8180754620691647, + "grad_norm": 0.71484375, + "learning_rate": 0.00010875868672677776, + "loss": 0.8002, + "step": 31860 + }, + { + "epoch": 0.8181011392650865, + "grad_norm": 0.890625, + "learning_rate": 0.00010875423967280193, + "loss": 0.9219, + "step": 31861 + }, + { + "epoch": 0.8181268164610084, + "grad_norm": 0.8828125, + "learning_rate": 0.0001087497926013797, + "loss": 0.7553, + "step": 31862 + }, + { + "epoch": 0.8181524936569302, + "grad_norm": 0.77734375, + "learning_rate": 0.00010874534551251991, + "loss": 0.7808, + "step": 31863 + }, + { + "epoch": 0.818178170852852, + "grad_norm": 0.828125, + "learning_rate": 0.00010874089840623145, + "loss": 0.8846, + "step": 31864 + }, + { + "epoch": 0.8182038480487739, + "grad_norm": 0.828125, + "learning_rate": 0.00010873645128252316, + "loss": 0.8293, + "step": 31865 + }, + { + "epoch": 0.8182295252446956, + "grad_norm": 0.78125, + "learning_rate": 0.00010873200414140392, + "loss": 0.8822, + "step": 31866 + }, + { + "epoch": 0.8182552024406174, + "grad_norm": 0.80859375, + "learning_rate": 0.00010872755698288258, + "loss": 0.763, + "step": 31867 + }, + { + "epoch": 0.8182808796365393, + "grad_norm": 0.76953125, + "learning_rate": 0.00010872310980696798, + "loss": 0.8715, + "step": 31868 + }, + { + "epoch": 0.8183065568324611, + "grad_norm": 0.8671875, + "learning_rate": 0.00010871866261366904, + "loss": 0.7513, + "step": 31869 + }, + { + "epoch": 0.818332234028383, + "grad_norm": 0.7265625, + "learning_rate": 0.0001087142154029946, + "loss": 0.7336, + "step": 31870 + }, + { + "epoch": 0.8183579112243048, + "grad_norm": 0.796875, + "learning_rate": 0.00010870976817495349, + "loss": 0.829, + "step": 31871 + }, + { + "epoch": 0.8183835884202265, + "grad_norm": 0.75390625, + "learning_rate": 0.00010870532092955463, + "loss": 0.8279, + "step": 31872 + }, + { + "epoch": 0.8184092656161484, + "grad_norm": 0.80859375, + "learning_rate": 0.00010870087366680681, + "loss": 0.7112, + "step": 31873 + }, + { + "epoch": 0.8184349428120702, + "grad_norm": 0.8515625, + "learning_rate": 0.00010869642638671897, + "loss": 0.897, + "step": 31874 + }, + { + "epoch": 0.818460620007992, + "grad_norm": 0.79296875, + "learning_rate": 0.00010869197908929993, + "loss": 0.7578, + "step": 31875 + }, + { + "epoch": 0.8184862972039139, + "grad_norm": 0.765625, + "learning_rate": 0.00010868753177455852, + "loss": 0.7516, + "step": 31876 + }, + { + "epoch": 0.8185119743998357, + "grad_norm": 0.7890625, + "learning_rate": 0.0001086830844425037, + "loss": 0.8638, + "step": 31877 + }, + { + "epoch": 0.8185376515957575, + "grad_norm": 0.84765625, + "learning_rate": 0.00010867863709314425, + "loss": 0.6715, + "step": 31878 + }, + { + "epoch": 0.8185633287916793, + "grad_norm": 0.77734375, + "learning_rate": 0.00010867418972648909, + "loss": 0.7824, + "step": 31879 + }, + { + "epoch": 0.8185890059876011, + "grad_norm": 0.83203125, + "learning_rate": 0.00010866974234254704, + "loss": 0.8563, + "step": 31880 + }, + { + "epoch": 0.8186146831835229, + "grad_norm": 0.796875, + "learning_rate": 0.00010866529494132694, + "loss": 0.8257, + "step": 31881 + }, + { + "epoch": 0.8186403603794448, + "grad_norm": 0.77734375, + "learning_rate": 0.00010866084752283771, + "loss": 0.7851, + "step": 31882 + }, + { + "epoch": 0.8186660375753666, + "grad_norm": 0.8671875, + "learning_rate": 0.00010865640008708821, + "loss": 0.9331, + "step": 31883 + }, + { + "epoch": 0.8186917147712884, + "grad_norm": 0.73828125, + "learning_rate": 0.00010865195263408727, + "loss": 0.7456, + "step": 31884 + }, + { + "epoch": 0.8187173919672103, + "grad_norm": 0.75390625, + "learning_rate": 0.00010864750516384381, + "loss": 0.8787, + "step": 31885 + }, + { + "epoch": 0.818743069163132, + "grad_norm": 0.83984375, + "learning_rate": 0.0001086430576763666, + "loss": 0.8721, + "step": 31886 + }, + { + "epoch": 0.8187687463590538, + "grad_norm": 0.71484375, + "learning_rate": 0.00010863861017166459, + "loss": 0.6906, + "step": 31887 + }, + { + "epoch": 0.8187944235549757, + "grad_norm": 0.828125, + "learning_rate": 0.0001086341626497466, + "loss": 0.8131, + "step": 31888 + }, + { + "epoch": 0.8188201007508975, + "grad_norm": 0.75390625, + "learning_rate": 0.00010862971511062148, + "loss": 0.7613, + "step": 31889 + }, + { + "epoch": 0.8188457779468193, + "grad_norm": 0.83203125, + "learning_rate": 0.00010862526755429813, + "loss": 0.8202, + "step": 31890 + }, + { + "epoch": 0.8188714551427412, + "grad_norm": 0.75390625, + "learning_rate": 0.00010862081998078543, + "loss": 0.8505, + "step": 31891 + }, + { + "epoch": 0.8188971323386629, + "grad_norm": 0.80859375, + "learning_rate": 0.0001086163723900922, + "loss": 0.7806, + "step": 31892 + }, + { + "epoch": 0.8189228095345847, + "grad_norm": 0.74609375, + "learning_rate": 0.0001086119247822273, + "loss": 0.8795, + "step": 31893 + }, + { + "epoch": 0.8189484867305066, + "grad_norm": 0.84765625, + "learning_rate": 0.00010860747715719962, + "loss": 0.8107, + "step": 31894 + }, + { + "epoch": 0.8189741639264284, + "grad_norm": 0.85546875, + "learning_rate": 0.00010860302951501802, + "loss": 0.8831, + "step": 31895 + }, + { + "epoch": 0.8189998411223502, + "grad_norm": 0.78515625, + "learning_rate": 0.00010859858185569136, + "loss": 0.8825, + "step": 31896 + }, + { + "epoch": 0.8190255183182721, + "grad_norm": 0.8046875, + "learning_rate": 0.0001085941341792285, + "loss": 0.902, + "step": 31897 + }, + { + "epoch": 0.8190511955141939, + "grad_norm": 0.734375, + "learning_rate": 0.00010858968648563834, + "loss": 0.7937, + "step": 31898 + }, + { + "epoch": 0.8190768727101156, + "grad_norm": 0.74609375, + "learning_rate": 0.00010858523877492967, + "loss": 0.7035, + "step": 31899 + }, + { + "epoch": 0.8191025499060375, + "grad_norm": 0.73046875, + "learning_rate": 0.00010858079104711138, + "loss": 0.7809, + "step": 31900 + }, + { + "epoch": 0.8191282271019593, + "grad_norm": 0.9453125, + "learning_rate": 0.00010857634330219239, + "loss": 0.7158, + "step": 31901 + }, + { + "epoch": 0.8191539042978812, + "grad_norm": 0.8515625, + "learning_rate": 0.00010857189554018148, + "loss": 0.8631, + "step": 31902 + }, + { + "epoch": 0.819179581493803, + "grad_norm": 0.78125, + "learning_rate": 0.0001085674477610876, + "loss": 0.8565, + "step": 31903 + }, + { + "epoch": 0.8192052586897248, + "grad_norm": 0.828125, + "learning_rate": 0.00010856299996491954, + "loss": 0.8467, + "step": 31904 + }, + { + "epoch": 0.8192309358856467, + "grad_norm": 0.8046875, + "learning_rate": 0.00010855855215168622, + "loss": 0.9383, + "step": 31905 + }, + { + "epoch": 0.8192566130815684, + "grad_norm": 0.79296875, + "learning_rate": 0.00010855410432139648, + "loss": 0.8057, + "step": 31906 + }, + { + "epoch": 0.8192822902774902, + "grad_norm": 0.76953125, + "learning_rate": 0.00010854965647405918, + "loss": 0.6829, + "step": 31907 + }, + { + "epoch": 0.8193079674734121, + "grad_norm": 0.76953125, + "learning_rate": 0.00010854520860968316, + "loss": 0.7472, + "step": 31908 + }, + { + "epoch": 0.8193336446693339, + "grad_norm": 0.86328125, + "learning_rate": 0.00010854076072827733, + "loss": 0.8328, + "step": 31909 + }, + { + "epoch": 0.8193593218652557, + "grad_norm": 0.76953125, + "learning_rate": 0.00010853631282985053, + "loss": 0.9002, + "step": 31910 + }, + { + "epoch": 0.8193849990611776, + "grad_norm": 0.765625, + "learning_rate": 0.00010853186491441162, + "loss": 0.7115, + "step": 31911 + }, + { + "epoch": 0.8194106762570993, + "grad_norm": 0.765625, + "learning_rate": 0.00010852741698196951, + "loss": 0.7548, + "step": 31912 + }, + { + "epoch": 0.8194363534530211, + "grad_norm": 0.6796875, + "learning_rate": 0.00010852296903253298, + "loss": 0.8092, + "step": 31913 + }, + { + "epoch": 0.819462030648943, + "grad_norm": 0.796875, + "learning_rate": 0.00010851852106611096, + "loss": 0.716, + "step": 31914 + }, + { + "epoch": 0.8194877078448648, + "grad_norm": 0.6875, + "learning_rate": 0.00010851407308271231, + "loss": 0.6818, + "step": 31915 + }, + { + "epoch": 0.8195133850407866, + "grad_norm": 0.86328125, + "learning_rate": 0.00010850962508234586, + "loss": 0.7548, + "step": 31916 + }, + { + "epoch": 0.8195390622367085, + "grad_norm": 0.88671875, + "learning_rate": 0.00010850517706502053, + "loss": 0.9745, + "step": 31917 + }, + { + "epoch": 0.8195647394326303, + "grad_norm": 0.83203125, + "learning_rate": 0.00010850072903074512, + "loss": 0.8075, + "step": 31918 + }, + { + "epoch": 0.819590416628552, + "grad_norm": 0.8125, + "learning_rate": 0.00010849628097952856, + "loss": 0.893, + "step": 31919 + }, + { + "epoch": 0.8196160938244739, + "grad_norm": 0.7890625, + "learning_rate": 0.00010849183291137965, + "loss": 0.8976, + "step": 31920 + }, + { + "epoch": 0.8196417710203957, + "grad_norm": 0.7421875, + "learning_rate": 0.00010848738482630727, + "loss": 0.8737, + "step": 31921 + }, + { + "epoch": 0.8196674482163175, + "grad_norm": 0.83984375, + "learning_rate": 0.00010848293672432033, + "loss": 0.8027, + "step": 31922 + }, + { + "epoch": 0.8196931254122394, + "grad_norm": 0.70703125, + "learning_rate": 0.00010847848860542764, + "loss": 0.6459, + "step": 31923 + }, + { + "epoch": 0.8197188026081612, + "grad_norm": 0.83203125, + "learning_rate": 0.00010847404046963812, + "loss": 0.7747, + "step": 31924 + }, + { + "epoch": 0.819744479804083, + "grad_norm": 0.87109375, + "learning_rate": 0.00010846959231696057, + "loss": 0.7203, + "step": 31925 + }, + { + "epoch": 0.8197701570000048, + "grad_norm": 0.79296875, + "learning_rate": 0.00010846514414740389, + "loss": 0.7178, + "step": 31926 + }, + { + "epoch": 0.8197958341959266, + "grad_norm": 0.94921875, + "learning_rate": 0.00010846069596097696, + "loss": 0.7692, + "step": 31927 + }, + { + "epoch": 0.8198215113918484, + "grad_norm": 0.85546875, + "learning_rate": 0.00010845624775768861, + "loss": 0.646, + "step": 31928 + }, + { + "epoch": 0.8198471885877703, + "grad_norm": 0.84375, + "learning_rate": 0.00010845179953754771, + "loss": 0.936, + "step": 31929 + }, + { + "epoch": 0.8198728657836921, + "grad_norm": 0.734375, + "learning_rate": 0.00010844735130056318, + "loss": 0.721, + "step": 31930 + }, + { + "epoch": 0.819898542979614, + "grad_norm": 0.7890625, + "learning_rate": 0.00010844290304674383, + "loss": 0.8613, + "step": 31931 + }, + { + "epoch": 0.8199242201755357, + "grad_norm": 0.7890625, + "learning_rate": 0.00010843845477609853, + "loss": 0.7167, + "step": 31932 + }, + { + "epoch": 0.8199498973714575, + "grad_norm": 0.796875, + "learning_rate": 0.00010843400648863615, + "loss": 0.8414, + "step": 31933 + }, + { + "epoch": 0.8199755745673794, + "grad_norm": 3.421875, + "learning_rate": 0.00010842955818436554, + "loss": 0.8029, + "step": 31934 + }, + { + "epoch": 0.8200012517633012, + "grad_norm": 0.7265625, + "learning_rate": 0.00010842510986329559, + "loss": 0.7132, + "step": 31935 + }, + { + "epoch": 0.820026928959223, + "grad_norm": 0.8359375, + "learning_rate": 0.00010842066152543517, + "loss": 0.7542, + "step": 31936 + }, + { + "epoch": 0.8200526061551449, + "grad_norm": 0.76171875, + "learning_rate": 0.00010841621317079311, + "loss": 0.6575, + "step": 31937 + }, + { + "epoch": 0.8200782833510667, + "grad_norm": 0.76953125, + "learning_rate": 0.00010841176479937836, + "loss": 0.8672, + "step": 31938 + }, + { + "epoch": 0.8201039605469884, + "grad_norm": 0.76171875, + "learning_rate": 0.00010840731641119966, + "loss": 0.7921, + "step": 31939 + }, + { + "epoch": 0.8201296377429103, + "grad_norm": 0.734375, + "learning_rate": 0.00010840286800626595, + "loss": 0.7512, + "step": 31940 + }, + { + "epoch": 0.8201553149388321, + "grad_norm": 0.80859375, + "learning_rate": 0.0001083984195845861, + "loss": 0.7581, + "step": 31941 + }, + { + "epoch": 0.8201809921347539, + "grad_norm": 0.8671875, + "learning_rate": 0.00010839397114616891, + "loss": 0.7654, + "step": 31942 + }, + { + "epoch": 0.8202066693306758, + "grad_norm": 0.87890625, + "learning_rate": 0.00010838952269102337, + "loss": 0.8391, + "step": 31943 + }, + { + "epoch": 0.8202323465265976, + "grad_norm": 0.765625, + "learning_rate": 0.00010838507421915824, + "loss": 0.6914, + "step": 31944 + }, + { + "epoch": 0.8202580237225194, + "grad_norm": 0.7421875, + "learning_rate": 0.00010838062573058238, + "loss": 0.7903, + "step": 31945 + }, + { + "epoch": 0.8202837009184412, + "grad_norm": 0.7109375, + "learning_rate": 0.00010837617722530474, + "loss": 0.7295, + "step": 31946 + }, + { + "epoch": 0.820309378114363, + "grad_norm": 0.8203125, + "learning_rate": 0.0001083717287033341, + "loss": 0.7429, + "step": 31947 + }, + { + "epoch": 0.8203350553102848, + "grad_norm": 0.8125, + "learning_rate": 0.00010836728016467934, + "loss": 0.8899, + "step": 31948 + }, + { + "epoch": 0.8203607325062067, + "grad_norm": 0.8125, + "learning_rate": 0.0001083628316093494, + "loss": 0.8482, + "step": 31949 + }, + { + "epoch": 0.8203864097021285, + "grad_norm": 0.8125, + "learning_rate": 0.00010835838303735304, + "loss": 0.8212, + "step": 31950 + }, + { + "epoch": 0.8204120868980503, + "grad_norm": 0.79296875, + "learning_rate": 0.00010835393444869923, + "loss": 0.8915, + "step": 31951 + }, + { + "epoch": 0.8204377640939721, + "grad_norm": 0.84765625, + "learning_rate": 0.00010834948584339677, + "loss": 0.7732, + "step": 31952 + }, + { + "epoch": 0.8204634412898939, + "grad_norm": 0.73046875, + "learning_rate": 0.00010834503722145451, + "loss": 0.7118, + "step": 31953 + }, + { + "epoch": 0.8204891184858157, + "grad_norm": 0.78515625, + "learning_rate": 0.00010834058858288136, + "loss": 0.7737, + "step": 31954 + }, + { + "epoch": 0.8205147956817376, + "grad_norm": 0.76953125, + "learning_rate": 0.00010833613992768619, + "loss": 0.8131, + "step": 31955 + }, + { + "epoch": 0.8205404728776594, + "grad_norm": 0.828125, + "learning_rate": 0.0001083316912558778, + "loss": 0.9175, + "step": 31956 + }, + { + "epoch": 0.8205661500735812, + "grad_norm": 0.76953125, + "learning_rate": 0.00010832724256746516, + "loss": 0.8171, + "step": 31957 + }, + { + "epoch": 0.8205918272695031, + "grad_norm": 0.77734375, + "learning_rate": 0.00010832279386245702, + "loss": 0.8443, + "step": 31958 + }, + { + "epoch": 0.8206175044654248, + "grad_norm": 0.85546875, + "learning_rate": 0.00010831834514086237, + "loss": 0.8395, + "step": 31959 + }, + { + "epoch": 0.8206431816613466, + "grad_norm": 0.8046875, + "learning_rate": 0.00010831389640268997, + "loss": 0.881, + "step": 31960 + }, + { + "epoch": 0.8206688588572685, + "grad_norm": 0.81640625, + "learning_rate": 0.00010830944764794871, + "loss": 0.7472, + "step": 31961 + }, + { + "epoch": 0.8206945360531903, + "grad_norm": 0.8203125, + "learning_rate": 0.00010830499887664749, + "loss": 0.8018, + "step": 31962 + }, + { + "epoch": 0.8207202132491122, + "grad_norm": 0.79296875, + "learning_rate": 0.00010830055008879516, + "loss": 0.8236, + "step": 31963 + }, + { + "epoch": 0.820745890445034, + "grad_norm": 0.83984375, + "learning_rate": 0.00010829610128440058, + "loss": 0.8639, + "step": 31964 + }, + { + "epoch": 0.8207715676409558, + "grad_norm": 0.890625, + "learning_rate": 0.00010829165246347264, + "loss": 0.8926, + "step": 31965 + }, + { + "epoch": 0.8207972448368775, + "grad_norm": 1.0, + "learning_rate": 0.00010828720362602014, + "loss": 0.8875, + "step": 31966 + }, + { + "epoch": 0.8208229220327994, + "grad_norm": 0.7890625, + "learning_rate": 0.000108282754772052, + "loss": 0.9123, + "step": 31967 + }, + { + "epoch": 0.8208485992287212, + "grad_norm": 0.7734375, + "learning_rate": 0.00010827830590157712, + "loss": 0.8669, + "step": 31968 + }, + { + "epoch": 0.8208742764246431, + "grad_norm": 0.71875, + "learning_rate": 0.00010827385701460429, + "loss": 0.8888, + "step": 31969 + }, + { + "epoch": 0.8208999536205649, + "grad_norm": 0.83203125, + "learning_rate": 0.00010826940811114243, + "loss": 0.8535, + "step": 31970 + }, + { + "epoch": 0.8209256308164867, + "grad_norm": 0.79296875, + "learning_rate": 0.00010826495919120036, + "loss": 0.7879, + "step": 31971 + }, + { + "epoch": 0.8209513080124085, + "grad_norm": 0.8203125, + "learning_rate": 0.000108260510254787, + "loss": 0.7997, + "step": 31972 + }, + { + "epoch": 0.8209769852083303, + "grad_norm": 0.72265625, + "learning_rate": 0.00010825606130191117, + "loss": 0.8259, + "step": 31973 + }, + { + "epoch": 0.8210026624042521, + "grad_norm": 0.75, + "learning_rate": 0.00010825161233258177, + "loss": 0.818, + "step": 31974 + }, + { + "epoch": 0.821028339600174, + "grad_norm": 0.73046875, + "learning_rate": 0.00010824716334680765, + "loss": 0.6897, + "step": 31975 + }, + { + "epoch": 0.8210540167960958, + "grad_norm": 0.79296875, + "learning_rate": 0.00010824271434459767, + "loss": 0.8567, + "step": 31976 + }, + { + "epoch": 0.8210796939920176, + "grad_norm": 0.82421875, + "learning_rate": 0.00010823826532596071, + "loss": 0.74, + "step": 31977 + }, + { + "epoch": 0.8211053711879395, + "grad_norm": 0.80078125, + "learning_rate": 0.00010823381629090563, + "loss": 0.6991, + "step": 31978 + }, + { + "epoch": 0.8211310483838612, + "grad_norm": 0.8125, + "learning_rate": 0.00010822936723944129, + "loss": 0.6765, + "step": 31979 + }, + { + "epoch": 0.821156725579783, + "grad_norm": 0.8671875, + "learning_rate": 0.00010822491817157656, + "loss": 0.8661, + "step": 31980 + }, + { + "epoch": 0.8211824027757049, + "grad_norm": 0.75390625, + "learning_rate": 0.00010822046908732033, + "loss": 0.752, + "step": 31981 + }, + { + "epoch": 0.8212080799716267, + "grad_norm": 0.8671875, + "learning_rate": 0.00010821601998668143, + "loss": 0.8659, + "step": 31982 + }, + { + "epoch": 0.8212337571675485, + "grad_norm": 0.80078125, + "learning_rate": 0.00010821157086966875, + "loss": 0.8424, + "step": 31983 + }, + { + "epoch": 0.8212594343634704, + "grad_norm": 0.8203125, + "learning_rate": 0.00010820712173629119, + "loss": 0.8598, + "step": 31984 + }, + { + "epoch": 0.8212851115593922, + "grad_norm": 0.7890625, + "learning_rate": 0.00010820267258655754, + "loss": 0.9003, + "step": 31985 + }, + { + "epoch": 0.8213107887553139, + "grad_norm": 0.80859375, + "learning_rate": 0.00010819822342047672, + "loss": 0.9167, + "step": 31986 + }, + { + "epoch": 0.8213364659512358, + "grad_norm": 0.765625, + "learning_rate": 0.00010819377423805755, + "loss": 0.9082, + "step": 31987 + }, + { + "epoch": 0.8213621431471576, + "grad_norm": 0.8203125, + "learning_rate": 0.00010818932503930895, + "loss": 0.86, + "step": 31988 + }, + { + "epoch": 0.8213878203430794, + "grad_norm": 0.81640625, + "learning_rate": 0.00010818487582423977, + "loss": 0.8929, + "step": 31989 + }, + { + "epoch": 0.8214134975390013, + "grad_norm": 0.7890625, + "learning_rate": 0.00010818042659285885, + "loss": 0.857, + "step": 31990 + }, + { + "epoch": 0.8214391747349231, + "grad_norm": 0.72265625, + "learning_rate": 0.0001081759773451751, + "loss": 0.7665, + "step": 31991 + }, + { + "epoch": 0.8214648519308448, + "grad_norm": 0.796875, + "learning_rate": 0.00010817152808119737, + "loss": 0.774, + "step": 31992 + }, + { + "epoch": 0.8214905291267667, + "grad_norm": 0.95703125, + "learning_rate": 0.00010816707880093451, + "loss": 0.83, + "step": 31993 + }, + { + "epoch": 0.8215162063226885, + "grad_norm": 0.828125, + "learning_rate": 0.0001081626295043954, + "loss": 0.9938, + "step": 31994 + }, + { + "epoch": 0.8215418835186103, + "grad_norm": 0.83984375, + "learning_rate": 0.00010815818019158892, + "loss": 0.8117, + "step": 31995 + }, + { + "epoch": 0.8215675607145322, + "grad_norm": 0.67578125, + "learning_rate": 0.0001081537308625239, + "loss": 0.6413, + "step": 31996 + }, + { + "epoch": 0.821593237910454, + "grad_norm": 0.8125, + "learning_rate": 0.00010814928151720927, + "loss": 0.6991, + "step": 31997 + }, + { + "epoch": 0.8216189151063759, + "grad_norm": 0.796875, + "learning_rate": 0.0001081448321556538, + "loss": 0.818, + "step": 31998 + }, + { + "epoch": 0.8216445923022976, + "grad_norm": 0.73046875, + "learning_rate": 0.00010814038277786644, + "loss": 0.7625, + "step": 31999 + }, + { + "epoch": 0.8216702694982194, + "grad_norm": 0.984375, + "learning_rate": 0.00010813593338385605, + "loss": 0.784, + "step": 32000 + }, + { + "epoch": 0.8216702694982194, + "eval_loss": 0.8049712181091309, + "eval_runtime": 354.7977, + "eval_samples_per_second": 28.185, + "eval_steps_per_second": 0.882, + "step": 32000 + }, + { + "epoch": 0.8216959466941413, + "grad_norm": 0.7109375, + "learning_rate": 0.00010813148397363146, + "loss": 0.7735, + "step": 32001 + }, + { + "epoch": 0.8217216238900631, + "grad_norm": 0.8515625, + "learning_rate": 0.00010812703454720157, + "loss": 0.7665, + "step": 32002 + }, + { + "epoch": 0.8217473010859849, + "grad_norm": 0.8359375, + "learning_rate": 0.00010812258510457523, + "loss": 0.9308, + "step": 32003 + }, + { + "epoch": 0.8217729782819068, + "grad_norm": 0.7421875, + "learning_rate": 0.00010811813564576132, + "loss": 0.8966, + "step": 32004 + }, + { + "epoch": 0.8217986554778286, + "grad_norm": 0.8046875, + "learning_rate": 0.0001081136861707687, + "loss": 0.8567, + "step": 32005 + }, + { + "epoch": 0.8218243326737503, + "grad_norm": 0.828125, + "learning_rate": 0.0001081092366796062, + "loss": 0.8934, + "step": 32006 + }, + { + "epoch": 0.8218500098696722, + "grad_norm": 0.77734375, + "learning_rate": 0.00010810478717228277, + "loss": 0.7904, + "step": 32007 + }, + { + "epoch": 0.821875687065594, + "grad_norm": 0.7890625, + "learning_rate": 0.0001081003376488072, + "loss": 0.8318, + "step": 32008 + }, + { + "epoch": 0.8219013642615158, + "grad_norm": 0.8125, + "learning_rate": 0.00010809588810918838, + "loss": 0.9321, + "step": 32009 + }, + { + "epoch": 0.8219270414574377, + "grad_norm": 0.8046875, + "learning_rate": 0.00010809143855343524, + "loss": 0.7949, + "step": 32010 + }, + { + "epoch": 0.8219527186533595, + "grad_norm": 0.72265625, + "learning_rate": 0.00010808698898155654, + "loss": 0.8113, + "step": 32011 + }, + { + "epoch": 0.8219783958492812, + "grad_norm": 0.73046875, + "learning_rate": 0.00010808253939356123, + "loss": 0.8532, + "step": 32012 + }, + { + "epoch": 0.8220040730452031, + "grad_norm": 0.7734375, + "learning_rate": 0.00010807808978945815, + "loss": 0.7903, + "step": 32013 + }, + { + "epoch": 0.8220297502411249, + "grad_norm": 0.78125, + "learning_rate": 0.00010807364016925614, + "loss": 0.6978, + "step": 32014 + }, + { + "epoch": 0.8220554274370467, + "grad_norm": 0.7421875, + "learning_rate": 0.00010806919053296412, + "loss": 0.78, + "step": 32015 + }, + { + "epoch": 0.8220811046329686, + "grad_norm": 0.76953125, + "learning_rate": 0.00010806474088059093, + "loss": 0.7363, + "step": 32016 + }, + { + "epoch": 0.8221067818288904, + "grad_norm": 0.875, + "learning_rate": 0.00010806029121214543, + "loss": 0.6926, + "step": 32017 + }, + { + "epoch": 0.8221324590248122, + "grad_norm": 0.82421875, + "learning_rate": 0.00010805584152763651, + "loss": 0.7708, + "step": 32018 + }, + { + "epoch": 0.822158136220734, + "grad_norm": 0.78125, + "learning_rate": 0.000108051391827073, + "loss": 0.8914, + "step": 32019 + }, + { + "epoch": 0.8221838134166558, + "grad_norm": 0.8125, + "learning_rate": 0.00010804694211046382, + "loss": 0.9737, + "step": 32020 + }, + { + "epoch": 0.8222094906125776, + "grad_norm": 0.76953125, + "learning_rate": 0.0001080424923778178, + "loss": 0.7912, + "step": 32021 + }, + { + "epoch": 0.8222351678084995, + "grad_norm": 0.76953125, + "learning_rate": 0.00010803804262914381, + "loss": 0.6696, + "step": 32022 + }, + { + "epoch": 0.8222608450044213, + "grad_norm": 0.80078125, + "learning_rate": 0.00010803359286445076, + "loss": 0.7793, + "step": 32023 + }, + { + "epoch": 0.8222865222003432, + "grad_norm": 0.8203125, + "learning_rate": 0.00010802914308374745, + "loss": 0.8912, + "step": 32024 + }, + { + "epoch": 0.8223121993962649, + "grad_norm": 0.81640625, + "learning_rate": 0.00010802469328704281, + "loss": 0.9256, + "step": 32025 + }, + { + "epoch": 0.8223378765921867, + "grad_norm": 0.828125, + "learning_rate": 0.00010802024347434568, + "loss": 0.7623, + "step": 32026 + }, + { + "epoch": 0.8223635537881085, + "grad_norm": 0.69921875, + "learning_rate": 0.00010801579364566488, + "loss": 0.7149, + "step": 32027 + }, + { + "epoch": 0.8223892309840304, + "grad_norm": 0.74609375, + "learning_rate": 0.00010801134380100937, + "loss": 0.8747, + "step": 32028 + }, + { + "epoch": 0.8224149081799522, + "grad_norm": 0.9140625, + "learning_rate": 0.00010800689394038799, + "loss": 0.8093, + "step": 32029 + }, + { + "epoch": 0.8224405853758741, + "grad_norm": 0.78125, + "learning_rate": 0.00010800244406380957, + "loss": 0.8196, + "step": 32030 + }, + { + "epoch": 0.8224662625717959, + "grad_norm": 0.859375, + "learning_rate": 0.000107997994171283, + "loss": 0.9003, + "step": 32031 + }, + { + "epoch": 0.8224919397677176, + "grad_norm": 0.81640625, + "learning_rate": 0.00010799354426281715, + "loss": 0.7016, + "step": 32032 + }, + { + "epoch": 0.8225176169636395, + "grad_norm": 0.8828125, + "learning_rate": 0.0001079890943384209, + "loss": 0.9134, + "step": 32033 + }, + { + "epoch": 0.8225432941595613, + "grad_norm": 0.78125, + "learning_rate": 0.0001079846443981031, + "loss": 0.7569, + "step": 32034 + }, + { + "epoch": 0.8225689713554831, + "grad_norm": 0.79296875, + "learning_rate": 0.00010798019444187264, + "loss": 0.8211, + "step": 32035 + }, + { + "epoch": 0.822594648551405, + "grad_norm": 0.84375, + "learning_rate": 0.00010797574446973835, + "loss": 0.8556, + "step": 32036 + }, + { + "epoch": 0.8226203257473268, + "grad_norm": 0.8046875, + "learning_rate": 0.00010797129448170915, + "loss": 0.9542, + "step": 32037 + }, + { + "epoch": 0.8226460029432486, + "grad_norm": 0.8125, + "learning_rate": 0.00010796684447779383, + "loss": 0.7812, + "step": 32038 + }, + { + "epoch": 0.8226716801391704, + "grad_norm": 0.8046875, + "learning_rate": 0.00010796239445800136, + "loss": 0.9663, + "step": 32039 + }, + { + "epoch": 0.8226973573350922, + "grad_norm": 0.83984375, + "learning_rate": 0.00010795794442234053, + "loss": 0.8134, + "step": 32040 + }, + { + "epoch": 0.822723034531014, + "grad_norm": 0.8046875, + "learning_rate": 0.00010795349437082023, + "loss": 0.852, + "step": 32041 + }, + { + "epoch": 0.8227487117269359, + "grad_norm": 0.8515625, + "learning_rate": 0.00010794904430344934, + "loss": 0.8002, + "step": 32042 + }, + { + "epoch": 0.8227743889228577, + "grad_norm": 0.734375, + "learning_rate": 0.00010794459422023673, + "loss": 0.8205, + "step": 32043 + }, + { + "epoch": 0.8228000661187795, + "grad_norm": 0.7734375, + "learning_rate": 0.00010794014412119127, + "loss": 0.8467, + "step": 32044 + }, + { + "epoch": 0.8228257433147013, + "grad_norm": 0.76953125, + "learning_rate": 0.00010793569400632181, + "loss": 0.8878, + "step": 32045 + }, + { + "epoch": 0.8228514205106231, + "grad_norm": 0.828125, + "learning_rate": 0.00010793124387563722, + "loss": 0.7657, + "step": 32046 + }, + { + "epoch": 0.8228770977065449, + "grad_norm": 0.78515625, + "learning_rate": 0.00010792679372914638, + "loss": 0.8345, + "step": 32047 + }, + { + "epoch": 0.8229027749024668, + "grad_norm": 0.76953125, + "learning_rate": 0.00010792234356685818, + "loss": 0.7746, + "step": 32048 + }, + { + "epoch": 0.8229284520983886, + "grad_norm": 0.75390625, + "learning_rate": 0.00010791789338878144, + "loss": 0.7398, + "step": 32049 + }, + { + "epoch": 0.8229541292943104, + "grad_norm": 0.83984375, + "learning_rate": 0.00010791344319492506, + "loss": 0.9238, + "step": 32050 + }, + { + "epoch": 0.8229798064902323, + "grad_norm": 0.8125, + "learning_rate": 0.00010790899298529791, + "loss": 0.8151, + "step": 32051 + }, + { + "epoch": 0.823005483686154, + "grad_norm": 0.74609375, + "learning_rate": 0.00010790454275990885, + "loss": 0.826, + "step": 32052 + }, + { + "epoch": 0.8230311608820758, + "grad_norm": 0.859375, + "learning_rate": 0.00010790009251876674, + "loss": 0.837, + "step": 32053 + }, + { + "epoch": 0.8230568380779977, + "grad_norm": 0.796875, + "learning_rate": 0.00010789564226188045, + "loss": 0.7407, + "step": 32054 + }, + { + "epoch": 0.8230825152739195, + "grad_norm": 0.73046875, + "learning_rate": 0.00010789119198925888, + "loss": 0.6859, + "step": 32055 + }, + { + "epoch": 0.8231081924698413, + "grad_norm": 0.87109375, + "learning_rate": 0.00010788674170091087, + "loss": 0.9053, + "step": 32056 + }, + { + "epoch": 0.8231338696657632, + "grad_norm": 0.75390625, + "learning_rate": 0.00010788229139684529, + "loss": 0.7445, + "step": 32057 + }, + { + "epoch": 0.823159546861685, + "grad_norm": 0.765625, + "learning_rate": 0.00010787784107707104, + "loss": 0.8215, + "step": 32058 + }, + { + "epoch": 0.8231852240576067, + "grad_norm": 0.75390625, + "learning_rate": 0.00010787339074159693, + "loss": 0.8556, + "step": 32059 + }, + { + "epoch": 0.8232109012535286, + "grad_norm": 0.7890625, + "learning_rate": 0.00010786894039043187, + "loss": 0.788, + "step": 32060 + }, + { + "epoch": 0.8232365784494504, + "grad_norm": 0.67578125, + "learning_rate": 0.00010786449002358475, + "loss": 0.7603, + "step": 32061 + }, + { + "epoch": 0.8232622556453723, + "grad_norm": 0.7890625, + "learning_rate": 0.00010786003964106437, + "loss": 0.9431, + "step": 32062 + }, + { + "epoch": 0.8232879328412941, + "grad_norm": 0.75390625, + "learning_rate": 0.00010785558924287969, + "loss": 0.7733, + "step": 32063 + }, + { + "epoch": 0.8233136100372159, + "grad_norm": 0.8984375, + "learning_rate": 0.0001078511388290395, + "loss": 0.8595, + "step": 32064 + }, + { + "epoch": 0.8233392872331377, + "grad_norm": 0.7109375, + "learning_rate": 0.0001078466883995527, + "loss": 0.8266, + "step": 32065 + }, + { + "epoch": 0.8233649644290595, + "grad_norm": 1.0234375, + "learning_rate": 0.00010784223795442816, + "loss": 0.9543, + "step": 32066 + }, + { + "epoch": 0.8233906416249813, + "grad_norm": 0.81640625, + "learning_rate": 0.00010783778749367475, + "loss": 0.8866, + "step": 32067 + }, + { + "epoch": 0.8234163188209032, + "grad_norm": 0.76953125, + "learning_rate": 0.00010783333701730134, + "loss": 0.8145, + "step": 32068 + }, + { + "epoch": 0.823441996016825, + "grad_norm": 0.7109375, + "learning_rate": 0.00010782888652531682, + "loss": 0.8294, + "step": 32069 + }, + { + "epoch": 0.8234676732127468, + "grad_norm": 0.75, + "learning_rate": 0.00010782443601772998, + "loss": 0.7437, + "step": 32070 + }, + { + "epoch": 0.8234933504086687, + "grad_norm": 0.78125, + "learning_rate": 0.0001078199854945498, + "loss": 0.8607, + "step": 32071 + }, + { + "epoch": 0.8235190276045904, + "grad_norm": 0.80078125, + "learning_rate": 0.00010781553495578507, + "loss": 0.8437, + "step": 32072 + }, + { + "epoch": 0.8235447048005122, + "grad_norm": 1.5, + "learning_rate": 0.00010781108440144467, + "loss": 0.7576, + "step": 32073 + }, + { + "epoch": 0.8235703819964341, + "grad_norm": 0.68359375, + "learning_rate": 0.00010780663383153751, + "loss": 0.7342, + "step": 32074 + }, + { + "epoch": 0.8235960591923559, + "grad_norm": 0.7890625, + "learning_rate": 0.0001078021832460724, + "loss": 0.8569, + "step": 32075 + }, + { + "epoch": 0.8236217363882777, + "grad_norm": 0.73828125, + "learning_rate": 0.00010779773264505831, + "loss": 0.8089, + "step": 32076 + }, + { + "epoch": 0.8236474135841996, + "grad_norm": 0.75390625, + "learning_rate": 0.000107793282028504, + "loss": 0.8413, + "step": 32077 + }, + { + "epoch": 0.8236730907801214, + "grad_norm": 0.78125, + "learning_rate": 0.00010778883139641838, + "loss": 0.7008, + "step": 32078 + }, + { + "epoch": 0.8236987679760431, + "grad_norm": 0.74609375, + "learning_rate": 0.00010778438074881033, + "loss": 0.8329, + "step": 32079 + }, + { + "epoch": 0.823724445171965, + "grad_norm": 0.78125, + "learning_rate": 0.00010777993008568872, + "loss": 0.7728, + "step": 32080 + }, + { + "epoch": 0.8237501223678868, + "grad_norm": 0.8984375, + "learning_rate": 0.00010777547940706236, + "loss": 0.7218, + "step": 32081 + }, + { + "epoch": 0.8237757995638086, + "grad_norm": 0.88671875, + "learning_rate": 0.00010777102871294025, + "loss": 0.7968, + "step": 32082 + }, + { + "epoch": 0.8238014767597305, + "grad_norm": 0.73828125, + "learning_rate": 0.00010776657800333114, + "loss": 0.8345, + "step": 32083 + }, + { + "epoch": 0.8238271539556523, + "grad_norm": 0.79296875, + "learning_rate": 0.00010776212727824395, + "loss": 0.8531, + "step": 32084 + }, + { + "epoch": 0.823852831151574, + "grad_norm": 0.81640625, + "learning_rate": 0.00010775767653768753, + "loss": 0.7807, + "step": 32085 + }, + { + "epoch": 0.8238785083474959, + "grad_norm": 0.8515625, + "learning_rate": 0.00010775322578167078, + "loss": 0.8684, + "step": 32086 + }, + { + "epoch": 0.8239041855434177, + "grad_norm": 0.8125, + "learning_rate": 0.00010774877501020252, + "loss": 1.0198, + "step": 32087 + }, + { + "epoch": 0.8239298627393395, + "grad_norm": 0.69921875, + "learning_rate": 0.00010774432422329168, + "loss": 0.6619, + "step": 32088 + }, + { + "epoch": 0.8239555399352614, + "grad_norm": 0.7265625, + "learning_rate": 0.00010773987342094712, + "loss": 0.7474, + "step": 32089 + }, + { + "epoch": 0.8239812171311832, + "grad_norm": 0.89453125, + "learning_rate": 0.00010773542260317766, + "loss": 0.8663, + "step": 32090 + }, + { + "epoch": 0.824006894327105, + "grad_norm": 0.83203125, + "learning_rate": 0.00010773097176999218, + "loss": 0.8773, + "step": 32091 + }, + { + "epoch": 0.8240325715230268, + "grad_norm": 0.70703125, + "learning_rate": 0.00010772652092139961, + "loss": 0.7866, + "step": 32092 + }, + { + "epoch": 0.8240582487189486, + "grad_norm": 0.76171875, + "learning_rate": 0.00010772207005740878, + "loss": 0.7513, + "step": 32093 + }, + { + "epoch": 0.8240839259148705, + "grad_norm": 0.7890625, + "learning_rate": 0.00010771761917802854, + "loss": 0.836, + "step": 32094 + }, + { + "epoch": 0.8241096031107923, + "grad_norm": 0.8125, + "learning_rate": 0.0001077131682832678, + "loss": 0.998, + "step": 32095 + }, + { + "epoch": 0.8241352803067141, + "grad_norm": 0.78515625, + "learning_rate": 0.00010770871737313542, + "loss": 0.8285, + "step": 32096 + }, + { + "epoch": 0.824160957502636, + "grad_norm": 0.796875, + "learning_rate": 0.00010770426644764025, + "loss": 0.8839, + "step": 32097 + }, + { + "epoch": 0.8241866346985578, + "grad_norm": 0.796875, + "learning_rate": 0.00010769981550679118, + "loss": 0.8422, + "step": 32098 + }, + { + "epoch": 0.8242123118944795, + "grad_norm": 0.75390625, + "learning_rate": 0.00010769536455059703, + "loss": 0.7357, + "step": 32099 + }, + { + "epoch": 0.8242379890904014, + "grad_norm": 0.75, + "learning_rate": 0.00010769091357906676, + "loss": 0.8062, + "step": 32100 + }, + { + "epoch": 0.8242636662863232, + "grad_norm": 0.75, + "learning_rate": 0.0001076864625922092, + "loss": 0.7139, + "step": 32101 + }, + { + "epoch": 0.824289343482245, + "grad_norm": 0.796875, + "learning_rate": 0.00010768201159003319, + "loss": 0.9173, + "step": 32102 + }, + { + "epoch": 0.8243150206781669, + "grad_norm": 0.7890625, + "learning_rate": 0.00010767756057254766, + "loss": 0.8527, + "step": 32103 + }, + { + "epoch": 0.8243406978740887, + "grad_norm": 0.78515625, + "learning_rate": 0.00010767310953976137, + "loss": 0.6615, + "step": 32104 + }, + { + "epoch": 0.8243663750700104, + "grad_norm": 0.83984375, + "learning_rate": 0.00010766865849168333, + "loss": 0.7286, + "step": 32105 + }, + { + "epoch": 0.8243920522659323, + "grad_norm": 0.765625, + "learning_rate": 0.00010766420742832233, + "loss": 0.7097, + "step": 32106 + }, + { + "epoch": 0.8244177294618541, + "grad_norm": 0.83984375, + "learning_rate": 0.00010765975634968723, + "loss": 0.8185, + "step": 32107 + }, + { + "epoch": 0.8244434066577759, + "grad_norm": 0.8203125, + "learning_rate": 0.00010765530525578698, + "loss": 0.7868, + "step": 32108 + }, + { + "epoch": 0.8244690838536978, + "grad_norm": 0.734375, + "learning_rate": 0.00010765085414663039, + "loss": 0.724, + "step": 32109 + }, + { + "epoch": 0.8244947610496196, + "grad_norm": 0.73046875, + "learning_rate": 0.0001076464030222263, + "loss": 0.7742, + "step": 32110 + }, + { + "epoch": 0.8245204382455414, + "grad_norm": 0.8828125, + "learning_rate": 0.00010764195188258365, + "loss": 0.8268, + "step": 32111 + }, + { + "epoch": 0.8245461154414632, + "grad_norm": 0.74609375, + "learning_rate": 0.00010763750072771126, + "loss": 0.7452, + "step": 32112 + }, + { + "epoch": 0.824571792637385, + "grad_norm": 0.80078125, + "learning_rate": 0.00010763304955761804, + "loss": 0.8928, + "step": 32113 + }, + { + "epoch": 0.8245974698333068, + "grad_norm": 0.83984375, + "learning_rate": 0.00010762859837231283, + "loss": 0.812, + "step": 32114 + }, + { + "epoch": 0.8246231470292287, + "grad_norm": 0.6953125, + "learning_rate": 0.00010762414717180454, + "loss": 0.7712, + "step": 32115 + }, + { + "epoch": 0.8246488242251505, + "grad_norm": 0.80078125, + "learning_rate": 0.00010761969595610198, + "loss": 0.8068, + "step": 32116 + }, + { + "epoch": 0.8246745014210723, + "grad_norm": 0.7265625, + "learning_rate": 0.00010761524472521407, + "loss": 0.6897, + "step": 32117 + }, + { + "epoch": 0.8247001786169942, + "grad_norm": 0.8046875, + "learning_rate": 0.00010761079347914964, + "loss": 0.8289, + "step": 32118 + }, + { + "epoch": 0.8247258558129159, + "grad_norm": 0.83203125, + "learning_rate": 0.0001076063422179176, + "loss": 0.8612, + "step": 32119 + }, + { + "epoch": 0.8247515330088377, + "grad_norm": 0.82421875, + "learning_rate": 0.00010760189094152684, + "loss": 0.8237, + "step": 32120 + }, + { + "epoch": 0.8247772102047596, + "grad_norm": 0.77734375, + "learning_rate": 0.00010759743964998614, + "loss": 0.6882, + "step": 32121 + }, + { + "epoch": 0.8248028874006814, + "grad_norm": 0.7265625, + "learning_rate": 0.00010759298834330449, + "loss": 0.8169, + "step": 32122 + }, + { + "epoch": 0.8248285645966033, + "grad_norm": 0.7265625, + "learning_rate": 0.00010758853702149066, + "loss": 0.8299, + "step": 32123 + }, + { + "epoch": 0.8248542417925251, + "grad_norm": 0.8203125, + "learning_rate": 0.00010758408568455356, + "loss": 0.8608, + "step": 32124 + }, + { + "epoch": 0.8248799189884468, + "grad_norm": 0.7890625, + "learning_rate": 0.00010757963433250209, + "loss": 0.8436, + "step": 32125 + }, + { + "epoch": 0.8249055961843687, + "grad_norm": 0.7734375, + "learning_rate": 0.00010757518296534505, + "loss": 0.7078, + "step": 32126 + }, + { + "epoch": 0.8249312733802905, + "grad_norm": 0.75390625, + "learning_rate": 0.00010757073158309139, + "loss": 0.7733, + "step": 32127 + }, + { + "epoch": 0.8249569505762123, + "grad_norm": 0.82421875, + "learning_rate": 0.00010756628018574995, + "loss": 0.7966, + "step": 32128 + }, + { + "epoch": 0.8249826277721342, + "grad_norm": 0.76171875, + "learning_rate": 0.00010756182877332958, + "loss": 0.8344, + "step": 32129 + }, + { + "epoch": 0.825008304968056, + "grad_norm": 0.76171875, + "learning_rate": 0.00010755737734583917, + "loss": 0.7655, + "step": 32130 + }, + { + "epoch": 0.8250339821639778, + "grad_norm": 0.7890625, + "learning_rate": 0.00010755292590328758, + "loss": 0.912, + "step": 32131 + }, + { + "epoch": 0.8250596593598996, + "grad_norm": 0.7109375, + "learning_rate": 0.0001075484744456837, + "loss": 0.8383, + "step": 32132 + }, + { + "epoch": 0.8250853365558214, + "grad_norm": 0.828125, + "learning_rate": 0.00010754402297303641, + "loss": 0.7179, + "step": 32133 + }, + { + "epoch": 0.8251110137517432, + "grad_norm": 0.8125, + "learning_rate": 0.00010753957148535452, + "loss": 0.756, + "step": 32134 + }, + { + "epoch": 0.8251366909476651, + "grad_norm": 0.78125, + "learning_rate": 0.000107535119982647, + "loss": 0.8871, + "step": 32135 + }, + { + "epoch": 0.8251623681435869, + "grad_norm": 0.74609375, + "learning_rate": 0.00010753066846492264, + "loss": 0.6321, + "step": 32136 + }, + { + "epoch": 0.8251880453395087, + "grad_norm": 0.7578125, + "learning_rate": 0.00010752621693219034, + "loss": 0.7616, + "step": 32137 + }, + { + "epoch": 0.8252137225354306, + "grad_norm": 0.78125, + "learning_rate": 0.00010752176538445897, + "loss": 0.8422, + "step": 32138 + }, + { + "epoch": 0.8252393997313523, + "grad_norm": 0.6640625, + "learning_rate": 0.00010751731382173738, + "loss": 0.6712, + "step": 32139 + }, + { + "epoch": 0.8252650769272741, + "grad_norm": 0.75, + "learning_rate": 0.0001075128622440345, + "loss": 0.8294, + "step": 32140 + }, + { + "epoch": 0.825290754123196, + "grad_norm": 0.80078125, + "learning_rate": 0.00010750841065135912, + "loss": 0.8468, + "step": 32141 + }, + { + "epoch": 0.8253164313191178, + "grad_norm": 0.7578125, + "learning_rate": 0.00010750395904372019, + "loss": 0.7765, + "step": 32142 + }, + { + "epoch": 0.8253421085150396, + "grad_norm": 0.8203125, + "learning_rate": 0.00010749950742112656, + "loss": 0.8896, + "step": 32143 + }, + { + "epoch": 0.8253677857109615, + "grad_norm": 0.74609375, + "learning_rate": 0.00010749505578358705, + "loss": 0.8069, + "step": 32144 + }, + { + "epoch": 0.8253934629068832, + "grad_norm": 0.78515625, + "learning_rate": 0.00010749060413111058, + "loss": 0.7951, + "step": 32145 + }, + { + "epoch": 0.825419140102805, + "grad_norm": 0.8671875, + "learning_rate": 0.00010748615246370604, + "loss": 0.8321, + "step": 32146 + }, + { + "epoch": 0.8254448172987269, + "grad_norm": 0.703125, + "learning_rate": 0.00010748170078138225, + "loss": 0.8442, + "step": 32147 + }, + { + "epoch": 0.8254704944946487, + "grad_norm": 0.765625, + "learning_rate": 0.00010747724908414811, + "loss": 0.8647, + "step": 32148 + }, + { + "epoch": 0.8254961716905705, + "grad_norm": 0.84375, + "learning_rate": 0.00010747279737201248, + "loss": 0.8925, + "step": 32149 + }, + { + "epoch": 0.8255218488864924, + "grad_norm": 0.74609375, + "learning_rate": 0.00010746834564498427, + "loss": 0.8259, + "step": 32150 + }, + { + "epoch": 0.8255475260824142, + "grad_norm": 0.8984375, + "learning_rate": 0.0001074638939030723, + "loss": 0.8597, + "step": 32151 + }, + { + "epoch": 0.8255732032783359, + "grad_norm": 0.83984375, + "learning_rate": 0.00010745944214628545, + "loss": 0.7865, + "step": 32152 + }, + { + "epoch": 0.8255988804742578, + "grad_norm": 0.73046875, + "learning_rate": 0.00010745499037463262, + "loss": 0.7405, + "step": 32153 + }, + { + "epoch": 0.8256245576701796, + "grad_norm": 0.859375, + "learning_rate": 0.00010745053858812268, + "loss": 0.808, + "step": 32154 + }, + { + "epoch": 0.8256502348661015, + "grad_norm": 1.2734375, + "learning_rate": 0.00010744608678676446, + "loss": 0.7196, + "step": 32155 + }, + { + "epoch": 0.8256759120620233, + "grad_norm": 0.78515625, + "learning_rate": 0.00010744163497056689, + "loss": 0.8351, + "step": 32156 + }, + { + "epoch": 0.8257015892579451, + "grad_norm": 0.8046875, + "learning_rate": 0.00010743718313953881, + "loss": 0.7931, + "step": 32157 + }, + { + "epoch": 0.825727266453867, + "grad_norm": 0.80078125, + "learning_rate": 0.0001074327312936891, + "loss": 0.8661, + "step": 32158 + }, + { + "epoch": 0.8257529436497887, + "grad_norm": 0.80078125, + "learning_rate": 0.0001074282794330266, + "loss": 0.8463, + "step": 32159 + }, + { + "epoch": 0.8257786208457105, + "grad_norm": 0.734375, + "learning_rate": 0.00010742382755756024, + "loss": 0.7318, + "step": 32160 + }, + { + "epoch": 0.8258042980416324, + "grad_norm": 0.7109375, + "learning_rate": 0.00010741937566729885, + "loss": 0.7939, + "step": 32161 + }, + { + "epoch": 0.8258299752375542, + "grad_norm": 1.140625, + "learning_rate": 0.0001074149237622513, + "loss": 0.8574, + "step": 32162 + }, + { + "epoch": 0.825855652433476, + "grad_norm": 0.83203125, + "learning_rate": 0.0001074104718424265, + "loss": 0.8697, + "step": 32163 + }, + { + "epoch": 0.8258813296293979, + "grad_norm": 0.76953125, + "learning_rate": 0.00010740601990783328, + "loss": 0.8595, + "step": 32164 + }, + { + "epoch": 0.8259070068253196, + "grad_norm": 0.71875, + "learning_rate": 0.00010740156795848055, + "loss": 0.8426, + "step": 32165 + }, + { + "epoch": 0.8259326840212414, + "grad_norm": 0.78125, + "learning_rate": 0.00010739711599437714, + "loss": 0.7199, + "step": 32166 + }, + { + "epoch": 0.8259583612171633, + "grad_norm": 0.796875, + "learning_rate": 0.00010739266401553196, + "loss": 0.7466, + "step": 32167 + }, + { + "epoch": 0.8259840384130851, + "grad_norm": 0.75390625, + "learning_rate": 0.00010738821202195388, + "loss": 0.8096, + "step": 32168 + }, + { + "epoch": 0.8260097156090069, + "grad_norm": 0.88671875, + "learning_rate": 0.00010738376001365176, + "loss": 0.8201, + "step": 32169 + }, + { + "epoch": 0.8260353928049288, + "grad_norm": 0.80078125, + "learning_rate": 0.00010737930799063446, + "loss": 0.7796, + "step": 32170 + }, + { + "epoch": 0.8260610700008506, + "grad_norm": 0.85546875, + "learning_rate": 0.00010737485595291086, + "loss": 0.8236, + "step": 32171 + }, + { + "epoch": 0.8260867471967723, + "grad_norm": 0.8125, + "learning_rate": 0.00010737040390048985, + "loss": 0.8014, + "step": 32172 + }, + { + "epoch": 0.8261124243926942, + "grad_norm": 0.828125, + "learning_rate": 0.0001073659518333803, + "loss": 0.8476, + "step": 32173 + }, + { + "epoch": 0.826138101588616, + "grad_norm": 0.703125, + "learning_rate": 0.00010736149975159104, + "loss": 0.7491, + "step": 32174 + }, + { + "epoch": 0.8261637787845378, + "grad_norm": 0.80859375, + "learning_rate": 0.00010735704765513102, + "loss": 0.7194, + "step": 32175 + }, + { + "epoch": 0.8261894559804597, + "grad_norm": 0.6953125, + "learning_rate": 0.00010735259554400903, + "loss": 0.701, + "step": 32176 + }, + { + "epoch": 0.8262151331763815, + "grad_norm": 0.8203125, + "learning_rate": 0.000107348143418234, + "loss": 0.8709, + "step": 32177 + }, + { + "epoch": 0.8262408103723033, + "grad_norm": 0.75390625, + "learning_rate": 0.00010734369127781478, + "loss": 0.9641, + "step": 32178 + }, + { + "epoch": 0.8262664875682251, + "grad_norm": 0.6953125, + "learning_rate": 0.00010733923912276023, + "loss": 0.7528, + "step": 32179 + }, + { + "epoch": 0.8262921647641469, + "grad_norm": 0.75, + "learning_rate": 0.00010733478695307925, + "loss": 0.7245, + "step": 32180 + }, + { + "epoch": 0.8263178419600687, + "grad_norm": 0.79296875, + "learning_rate": 0.00010733033476878072, + "loss": 0.6733, + "step": 32181 + }, + { + "epoch": 0.8263435191559906, + "grad_norm": 0.79296875, + "learning_rate": 0.0001073258825698735, + "loss": 0.7588, + "step": 32182 + }, + { + "epoch": 0.8263691963519124, + "grad_norm": 0.8125, + "learning_rate": 0.00010732143035636644, + "loss": 0.7798, + "step": 32183 + }, + { + "epoch": 0.8263948735478343, + "grad_norm": 0.71875, + "learning_rate": 0.00010731697812826842, + "loss": 0.6646, + "step": 32184 + }, + { + "epoch": 0.826420550743756, + "grad_norm": 0.69921875, + "learning_rate": 0.00010731252588558834, + "loss": 0.7391, + "step": 32185 + }, + { + "epoch": 0.8264462279396778, + "grad_norm": 0.7265625, + "learning_rate": 0.00010730807362833506, + "loss": 0.665, + "step": 32186 + }, + { + "epoch": 0.8264719051355996, + "grad_norm": 0.83984375, + "learning_rate": 0.00010730362135651741, + "loss": 0.8878, + "step": 32187 + }, + { + "epoch": 0.8264975823315215, + "grad_norm": 0.75390625, + "learning_rate": 0.00010729916907014436, + "loss": 0.8459, + "step": 32188 + }, + { + "epoch": 0.8265232595274433, + "grad_norm": 0.71875, + "learning_rate": 0.0001072947167692247, + "loss": 0.7608, + "step": 32189 + }, + { + "epoch": 0.8265489367233652, + "grad_norm": 0.765625, + "learning_rate": 0.00010729026445376733, + "loss": 0.7976, + "step": 32190 + }, + { + "epoch": 0.826574613919287, + "grad_norm": 0.78515625, + "learning_rate": 0.00010728581212378112, + "loss": 0.8236, + "step": 32191 + }, + { + "epoch": 0.8266002911152087, + "grad_norm": 0.70703125, + "learning_rate": 0.00010728135977927491, + "loss": 0.7767, + "step": 32192 + }, + { + "epoch": 0.8266259683111306, + "grad_norm": 0.75390625, + "learning_rate": 0.00010727690742025766, + "loss": 0.821, + "step": 32193 + }, + { + "epoch": 0.8266516455070524, + "grad_norm": 0.85546875, + "learning_rate": 0.00010727245504673819, + "loss": 0.7029, + "step": 32194 + }, + { + "epoch": 0.8266773227029742, + "grad_norm": 0.7890625, + "learning_rate": 0.00010726800265872534, + "loss": 0.8698, + "step": 32195 + }, + { + "epoch": 0.8267029998988961, + "grad_norm": 0.8125, + "learning_rate": 0.00010726355025622805, + "loss": 0.8466, + "step": 32196 + }, + { + "epoch": 0.8267286770948179, + "grad_norm": 0.76171875, + "learning_rate": 0.00010725909783925514, + "loss": 0.9167, + "step": 32197 + }, + { + "epoch": 0.8267543542907397, + "grad_norm": 0.8203125, + "learning_rate": 0.00010725464540781549, + "loss": 0.8216, + "step": 32198 + }, + { + "epoch": 0.8267800314866615, + "grad_norm": 0.6796875, + "learning_rate": 0.00010725019296191803, + "loss": 0.7405, + "step": 32199 + }, + { + "epoch": 0.8268057086825833, + "grad_norm": 0.7109375, + "learning_rate": 0.00010724574050157154, + "loss": 0.7875, + "step": 32200 + }, + { + "epoch": 0.8268313858785051, + "grad_norm": 0.734375, + "learning_rate": 0.000107241288026785, + "loss": 0.8717, + "step": 32201 + }, + { + "epoch": 0.826857063074427, + "grad_norm": 0.7578125, + "learning_rate": 0.00010723683553756721, + "loss": 0.8466, + "step": 32202 + }, + { + "epoch": 0.8268827402703488, + "grad_norm": 0.79296875, + "learning_rate": 0.00010723238303392702, + "loss": 0.8613, + "step": 32203 + }, + { + "epoch": 0.8269084174662706, + "grad_norm": 0.82421875, + "learning_rate": 0.00010722793051587336, + "loss": 0.8171, + "step": 32204 + }, + { + "epoch": 0.8269340946621924, + "grad_norm": 0.75, + "learning_rate": 0.0001072234779834151, + "loss": 0.7365, + "step": 32205 + }, + { + "epoch": 0.8269597718581142, + "grad_norm": 0.7734375, + "learning_rate": 0.00010721902543656108, + "loss": 0.8305, + "step": 32206 + }, + { + "epoch": 0.826985449054036, + "grad_norm": 0.80859375, + "learning_rate": 0.00010721457287532024, + "loss": 0.8677, + "step": 32207 + }, + { + "epoch": 0.8270111262499579, + "grad_norm": 0.734375, + "learning_rate": 0.00010721012029970136, + "loss": 0.718, + "step": 32208 + }, + { + "epoch": 0.8270368034458797, + "grad_norm": 0.76953125, + "learning_rate": 0.00010720566770971341, + "loss": 0.8496, + "step": 32209 + }, + { + "epoch": 0.8270624806418015, + "grad_norm": 0.7578125, + "learning_rate": 0.00010720121510536518, + "loss": 0.7491, + "step": 32210 + }, + { + "epoch": 0.8270881578377234, + "grad_norm": 0.8203125, + "learning_rate": 0.00010719676248666557, + "loss": 0.7936, + "step": 32211 + }, + { + "epoch": 0.8271138350336451, + "grad_norm": 0.828125, + "learning_rate": 0.0001071923098536235, + "loss": 0.818, + "step": 32212 + }, + { + "epoch": 0.8271395122295669, + "grad_norm": 0.703125, + "learning_rate": 0.00010718785720624777, + "loss": 0.7368, + "step": 32213 + }, + { + "epoch": 0.8271651894254888, + "grad_norm": 0.8125, + "learning_rate": 0.00010718340454454732, + "loss": 0.8093, + "step": 32214 + }, + { + "epoch": 0.8271908666214106, + "grad_norm": 0.79296875, + "learning_rate": 0.00010717895186853097, + "loss": 0.7451, + "step": 32215 + }, + { + "epoch": 0.8272165438173325, + "grad_norm": 0.8203125, + "learning_rate": 0.00010717449917820761, + "loss": 0.7926, + "step": 32216 + }, + { + "epoch": 0.8272422210132543, + "grad_norm": 0.890625, + "learning_rate": 0.00010717004647358615, + "loss": 0.8652, + "step": 32217 + }, + { + "epoch": 0.8272678982091761, + "grad_norm": 0.84765625, + "learning_rate": 0.00010716559375467543, + "loss": 0.8232, + "step": 32218 + }, + { + "epoch": 0.8272935754050978, + "grad_norm": 0.74609375, + "learning_rate": 0.00010716114102148428, + "loss": 0.8863, + "step": 32219 + }, + { + "epoch": 0.8273192526010197, + "grad_norm": 0.86328125, + "learning_rate": 0.00010715668827402172, + "loss": 0.9441, + "step": 32220 + }, + { + "epoch": 0.8273449297969415, + "grad_norm": 0.82421875, + "learning_rate": 0.00010715223551229645, + "loss": 0.9411, + "step": 32221 + }, + { + "epoch": 0.8273706069928634, + "grad_norm": 0.7265625, + "learning_rate": 0.00010714778273631745, + "loss": 0.7865, + "step": 32222 + }, + { + "epoch": 0.8273962841887852, + "grad_norm": 0.76953125, + "learning_rate": 0.00010714332994609355, + "loss": 0.7014, + "step": 32223 + }, + { + "epoch": 0.827421961384707, + "grad_norm": 0.78515625, + "learning_rate": 0.00010713887714163362, + "loss": 0.7468, + "step": 32224 + }, + { + "epoch": 0.8274476385806288, + "grad_norm": 0.80078125, + "learning_rate": 0.00010713442432294658, + "loss": 0.7318, + "step": 32225 + }, + { + "epoch": 0.8274733157765506, + "grad_norm": 0.73828125, + "learning_rate": 0.0001071299714900413, + "loss": 0.7961, + "step": 32226 + }, + { + "epoch": 0.8274989929724724, + "grad_norm": 0.78125, + "learning_rate": 0.0001071255186429266, + "loss": 0.8442, + "step": 32227 + }, + { + "epoch": 0.8275246701683943, + "grad_norm": 0.77734375, + "learning_rate": 0.00010712106578161138, + "loss": 0.8876, + "step": 32228 + }, + { + "epoch": 0.8275503473643161, + "grad_norm": 0.73046875, + "learning_rate": 0.00010711661290610453, + "loss": 0.6859, + "step": 32229 + }, + { + "epoch": 0.8275760245602379, + "grad_norm": 0.75, + "learning_rate": 0.00010711216001641491, + "loss": 0.8399, + "step": 32230 + }, + { + "epoch": 0.8276017017561598, + "grad_norm": 0.81640625, + "learning_rate": 0.0001071077071125514, + "loss": 0.8636, + "step": 32231 + }, + { + "epoch": 0.8276273789520815, + "grad_norm": 0.77734375, + "learning_rate": 0.00010710325419452285, + "loss": 0.8705, + "step": 32232 + }, + { + "epoch": 0.8276530561480033, + "grad_norm": 0.91015625, + "learning_rate": 0.0001070988012623382, + "loss": 0.9239, + "step": 32233 + }, + { + "epoch": 0.8276787333439252, + "grad_norm": 0.71484375, + "learning_rate": 0.00010709434831600629, + "loss": 0.7649, + "step": 32234 + }, + { + "epoch": 0.827704410539847, + "grad_norm": 0.83984375, + "learning_rate": 0.00010708989535553593, + "loss": 0.7491, + "step": 32235 + }, + { + "epoch": 0.8277300877357688, + "grad_norm": 0.8203125, + "learning_rate": 0.00010708544238093608, + "loss": 0.8073, + "step": 32236 + }, + { + "epoch": 0.8277557649316907, + "grad_norm": 0.79296875, + "learning_rate": 0.00010708098939221556, + "loss": 0.8664, + "step": 32237 + }, + { + "epoch": 0.8277814421276124, + "grad_norm": 0.7734375, + "learning_rate": 0.00010707653638938328, + "loss": 0.8351, + "step": 32238 + }, + { + "epoch": 0.8278071193235342, + "grad_norm": 0.859375, + "learning_rate": 0.00010707208337244811, + "loss": 0.7869, + "step": 32239 + }, + { + "epoch": 0.8278327965194561, + "grad_norm": 0.91796875, + "learning_rate": 0.00010706763034141891, + "loss": 0.7425, + "step": 32240 + }, + { + "epoch": 0.8278584737153779, + "grad_norm": 0.765625, + "learning_rate": 0.00010706317729630458, + "loss": 0.7937, + "step": 32241 + }, + { + "epoch": 0.8278841509112997, + "grad_norm": 0.69921875, + "learning_rate": 0.00010705872423711396, + "loss": 0.7817, + "step": 32242 + }, + { + "epoch": 0.8279098281072216, + "grad_norm": 0.75390625, + "learning_rate": 0.00010705427116385592, + "loss": 0.7265, + "step": 32243 + }, + { + "epoch": 0.8279355053031434, + "grad_norm": 0.76953125, + "learning_rate": 0.00010704981807653938, + "loss": 0.6907, + "step": 32244 + }, + { + "epoch": 0.8279611824990651, + "grad_norm": 0.8515625, + "learning_rate": 0.00010704536497517319, + "loss": 0.9452, + "step": 32245 + }, + { + "epoch": 0.827986859694987, + "grad_norm": 0.8046875, + "learning_rate": 0.0001070409118597662, + "loss": 0.7744, + "step": 32246 + }, + { + "epoch": 0.8280125368909088, + "grad_norm": 0.8125, + "learning_rate": 0.00010703645873032735, + "loss": 0.7639, + "step": 32247 + }, + { + "epoch": 0.8280382140868306, + "grad_norm": 0.77734375, + "learning_rate": 0.00010703200558686542, + "loss": 0.8013, + "step": 32248 + }, + { + "epoch": 0.8280638912827525, + "grad_norm": 0.8046875, + "learning_rate": 0.00010702755242938937, + "loss": 0.8777, + "step": 32249 + }, + { + "epoch": 0.8280895684786743, + "grad_norm": 0.76171875, + "learning_rate": 0.00010702309925790805, + "loss": 0.7458, + "step": 32250 + }, + { + "epoch": 0.8281152456745962, + "grad_norm": 0.859375, + "learning_rate": 0.00010701864607243031, + "loss": 0.8793, + "step": 32251 + }, + { + "epoch": 0.8281409228705179, + "grad_norm": 0.8359375, + "learning_rate": 0.00010701419287296504, + "loss": 0.8553, + "step": 32252 + }, + { + "epoch": 0.8281666000664397, + "grad_norm": 0.95703125, + "learning_rate": 0.00010700973965952115, + "loss": 0.7764, + "step": 32253 + }, + { + "epoch": 0.8281922772623616, + "grad_norm": 0.79296875, + "learning_rate": 0.00010700528643210745, + "loss": 0.8326, + "step": 32254 + }, + { + "epoch": 0.8282179544582834, + "grad_norm": 0.8125, + "learning_rate": 0.00010700083319073286, + "loss": 0.8153, + "step": 32255 + }, + { + "epoch": 0.8282436316542052, + "grad_norm": 0.82421875, + "learning_rate": 0.0001069963799354062, + "loss": 0.8466, + "step": 32256 + }, + { + "epoch": 0.8282693088501271, + "grad_norm": 0.78125, + "learning_rate": 0.00010699192666613645, + "loss": 0.7805, + "step": 32257 + }, + { + "epoch": 0.8282949860460488, + "grad_norm": 0.7578125, + "learning_rate": 0.0001069874733829324, + "loss": 0.8096, + "step": 32258 + }, + { + "epoch": 0.8283206632419706, + "grad_norm": 0.80859375, + "learning_rate": 0.00010698302008580292, + "loss": 0.7973, + "step": 32259 + }, + { + "epoch": 0.8283463404378925, + "grad_norm": 0.77734375, + "learning_rate": 0.00010697856677475697, + "loss": 0.8185, + "step": 32260 + }, + { + "epoch": 0.8283720176338143, + "grad_norm": 0.6875, + "learning_rate": 0.0001069741134498033, + "loss": 0.805, + "step": 32261 + }, + { + "epoch": 0.8283976948297361, + "grad_norm": 0.9609375, + "learning_rate": 0.00010696966011095091, + "loss": 0.6773, + "step": 32262 + }, + { + "epoch": 0.828423372025658, + "grad_norm": 0.75390625, + "learning_rate": 0.00010696520675820858, + "loss": 0.8061, + "step": 32263 + }, + { + "epoch": 0.8284490492215798, + "grad_norm": 0.75, + "learning_rate": 0.00010696075339158522, + "loss": 0.8458, + "step": 32264 + }, + { + "epoch": 0.8284747264175015, + "grad_norm": 0.83203125, + "learning_rate": 0.00010695630001108971, + "loss": 0.8613, + "step": 32265 + }, + { + "epoch": 0.8285004036134234, + "grad_norm": 0.75, + "learning_rate": 0.00010695184661673094, + "loss": 0.7585, + "step": 32266 + }, + { + "epoch": 0.8285260808093452, + "grad_norm": 0.73828125, + "learning_rate": 0.00010694739320851777, + "loss": 0.8359, + "step": 32267 + }, + { + "epoch": 0.828551758005267, + "grad_norm": 0.76953125, + "learning_rate": 0.00010694293978645906, + "loss": 0.8876, + "step": 32268 + }, + { + "epoch": 0.8285774352011889, + "grad_norm": 0.80078125, + "learning_rate": 0.00010693848635056369, + "loss": 0.8106, + "step": 32269 + }, + { + "epoch": 0.8286031123971107, + "grad_norm": 0.70703125, + "learning_rate": 0.00010693403290084057, + "loss": 0.7623, + "step": 32270 + }, + { + "epoch": 0.8286287895930325, + "grad_norm": 0.7421875, + "learning_rate": 0.00010692957943729854, + "loss": 0.6612, + "step": 32271 + }, + { + "epoch": 0.8286544667889543, + "grad_norm": 0.78125, + "learning_rate": 0.00010692512595994646, + "loss": 0.8845, + "step": 32272 + }, + { + "epoch": 0.8286801439848761, + "grad_norm": 0.859375, + "learning_rate": 0.00010692067246879327, + "loss": 0.8559, + "step": 32273 + }, + { + "epoch": 0.8287058211807979, + "grad_norm": 0.8671875, + "learning_rate": 0.00010691621896384776, + "loss": 0.8264, + "step": 32274 + }, + { + "epoch": 0.8287314983767198, + "grad_norm": 0.8046875, + "learning_rate": 0.00010691176544511888, + "loss": 0.7379, + "step": 32275 + }, + { + "epoch": 0.8287571755726416, + "grad_norm": 0.8984375, + "learning_rate": 0.00010690731191261547, + "loss": 0.9017, + "step": 32276 + }, + { + "epoch": 0.8287828527685634, + "grad_norm": 0.82421875, + "learning_rate": 0.00010690285836634639, + "loss": 0.8984, + "step": 32277 + }, + { + "epoch": 0.8288085299644852, + "grad_norm": 0.80078125, + "learning_rate": 0.00010689840480632056, + "loss": 0.7248, + "step": 32278 + }, + { + "epoch": 0.828834207160407, + "grad_norm": 0.828125, + "learning_rate": 0.00010689395123254684, + "loss": 0.8356, + "step": 32279 + }, + { + "epoch": 0.8288598843563288, + "grad_norm": 0.765625, + "learning_rate": 0.00010688949764503408, + "loss": 0.6223, + "step": 32280 + }, + { + "epoch": 0.8288855615522507, + "grad_norm": 0.7265625, + "learning_rate": 0.00010688504404379119, + "loss": 0.738, + "step": 32281 + }, + { + "epoch": 0.8289112387481725, + "grad_norm": 0.765625, + "learning_rate": 0.00010688059042882704, + "loss": 0.8111, + "step": 32282 + }, + { + "epoch": 0.8289369159440944, + "grad_norm": 0.8125, + "learning_rate": 0.00010687613680015043, + "loss": 0.7928, + "step": 32283 + }, + { + "epoch": 0.8289625931400162, + "grad_norm": 0.83984375, + "learning_rate": 0.00010687168315777036, + "loss": 0.8028, + "step": 32284 + }, + { + "epoch": 0.8289882703359379, + "grad_norm": 0.75, + "learning_rate": 0.00010686722950169561, + "loss": 0.8228, + "step": 32285 + }, + { + "epoch": 0.8290139475318598, + "grad_norm": 0.90234375, + "learning_rate": 0.00010686277583193513, + "loss": 0.8099, + "step": 32286 + }, + { + "epoch": 0.8290396247277816, + "grad_norm": 0.87109375, + "learning_rate": 0.00010685832214849776, + "loss": 0.8845, + "step": 32287 + }, + { + "epoch": 0.8290653019237034, + "grad_norm": 0.81640625, + "learning_rate": 0.00010685386845139233, + "loss": 0.8738, + "step": 32288 + }, + { + "epoch": 0.8290909791196253, + "grad_norm": 0.890625, + "learning_rate": 0.00010684941474062779, + "loss": 0.8242, + "step": 32289 + }, + { + "epoch": 0.8291166563155471, + "grad_norm": 0.78515625, + "learning_rate": 0.00010684496101621297, + "loss": 0.8531, + "step": 32290 + }, + { + "epoch": 0.8291423335114689, + "grad_norm": 0.7578125, + "learning_rate": 0.00010684050727815675, + "loss": 0.8346, + "step": 32291 + }, + { + "epoch": 0.8291680107073907, + "grad_norm": 0.7109375, + "learning_rate": 0.00010683605352646807, + "loss": 0.728, + "step": 32292 + }, + { + "epoch": 0.8291936879033125, + "grad_norm": 0.78515625, + "learning_rate": 0.0001068315997611557, + "loss": 0.8193, + "step": 32293 + }, + { + "epoch": 0.8292193650992343, + "grad_norm": 0.7109375, + "learning_rate": 0.00010682714598222861, + "loss": 0.8723, + "step": 32294 + }, + { + "epoch": 0.8292450422951562, + "grad_norm": 0.81640625, + "learning_rate": 0.0001068226921896956, + "loss": 0.7759, + "step": 32295 + }, + { + "epoch": 0.829270719491078, + "grad_norm": 0.86328125, + "learning_rate": 0.00010681823838356557, + "loss": 0.7732, + "step": 32296 + }, + { + "epoch": 0.8292963966869998, + "grad_norm": 0.73828125, + "learning_rate": 0.00010681378456384745, + "loss": 0.9504, + "step": 32297 + }, + { + "epoch": 0.8293220738829216, + "grad_norm": 0.8125, + "learning_rate": 0.00010680933073055004, + "loss": 0.8229, + "step": 32298 + }, + { + "epoch": 0.8293477510788434, + "grad_norm": 0.77734375, + "learning_rate": 0.00010680487688368229, + "loss": 0.7302, + "step": 32299 + }, + { + "epoch": 0.8293734282747652, + "grad_norm": 0.8359375, + "learning_rate": 0.00010680042302325299, + "loss": 0.8759, + "step": 32300 + }, + { + "epoch": 0.8293991054706871, + "grad_norm": 0.7578125, + "learning_rate": 0.00010679596914927106, + "loss": 0.8211, + "step": 32301 + }, + { + "epoch": 0.8294247826666089, + "grad_norm": 0.7734375, + "learning_rate": 0.0001067915152617454, + "loss": 0.9093, + "step": 32302 + }, + { + "epoch": 0.8294504598625307, + "grad_norm": 0.84765625, + "learning_rate": 0.00010678706136068486, + "loss": 1.0493, + "step": 32303 + }, + { + "epoch": 0.8294761370584526, + "grad_norm": 0.78515625, + "learning_rate": 0.00010678260744609831, + "loss": 0.8064, + "step": 32304 + }, + { + "epoch": 0.8295018142543743, + "grad_norm": 0.87890625, + "learning_rate": 0.00010677815351799465, + "loss": 0.9878, + "step": 32305 + }, + { + "epoch": 0.8295274914502961, + "grad_norm": 0.74609375, + "learning_rate": 0.00010677369957638275, + "loss": 0.7527, + "step": 32306 + }, + { + "epoch": 0.829553168646218, + "grad_norm": 0.796875, + "learning_rate": 0.00010676924562127146, + "loss": 0.7706, + "step": 32307 + }, + { + "epoch": 0.8295788458421398, + "grad_norm": 0.76953125, + "learning_rate": 0.00010676479165266968, + "loss": 0.8646, + "step": 32308 + }, + { + "epoch": 0.8296045230380616, + "grad_norm": 0.93359375, + "learning_rate": 0.00010676033767058626, + "loss": 0.87, + "step": 32309 + }, + { + "epoch": 0.8296302002339835, + "grad_norm": 0.76953125, + "learning_rate": 0.00010675588367503013, + "loss": 0.8184, + "step": 32310 + }, + { + "epoch": 0.8296558774299053, + "grad_norm": 0.74609375, + "learning_rate": 0.00010675142966601012, + "loss": 0.7899, + "step": 32311 + }, + { + "epoch": 0.829681554625827, + "grad_norm": 0.80078125, + "learning_rate": 0.00010674697564353513, + "loss": 0.839, + "step": 32312 + }, + { + "epoch": 0.8297072318217489, + "grad_norm": 0.734375, + "learning_rate": 0.00010674252160761404, + "loss": 0.7376, + "step": 32313 + }, + { + "epoch": 0.8297329090176707, + "grad_norm": 0.74609375, + "learning_rate": 0.00010673806755825565, + "loss": 0.809, + "step": 32314 + }, + { + "epoch": 0.8297585862135926, + "grad_norm": 0.80078125, + "learning_rate": 0.00010673361349546898, + "loss": 0.8281, + "step": 32315 + }, + { + "epoch": 0.8297842634095144, + "grad_norm": 0.81640625, + "learning_rate": 0.00010672915941926279, + "loss": 0.901, + "step": 32316 + }, + { + "epoch": 0.8298099406054362, + "grad_norm": 0.83984375, + "learning_rate": 0.00010672470532964598, + "loss": 0.9121, + "step": 32317 + }, + { + "epoch": 0.829835617801358, + "grad_norm": 0.796875, + "learning_rate": 0.00010672025122662745, + "loss": 0.8072, + "step": 32318 + }, + { + "epoch": 0.8298612949972798, + "grad_norm": 0.78515625, + "learning_rate": 0.00010671579711021609, + "loss": 0.7612, + "step": 32319 + }, + { + "epoch": 0.8298869721932016, + "grad_norm": 0.76953125, + "learning_rate": 0.00010671134298042072, + "loss": 0.8152, + "step": 32320 + }, + { + "epoch": 0.8299126493891235, + "grad_norm": 0.85546875, + "learning_rate": 0.00010670688883725027, + "loss": 0.8185, + "step": 32321 + }, + { + "epoch": 0.8299383265850453, + "grad_norm": 0.76171875, + "learning_rate": 0.00010670243468071359, + "loss": 0.762, + "step": 32322 + }, + { + "epoch": 0.8299640037809671, + "grad_norm": 0.77734375, + "learning_rate": 0.00010669798051081954, + "loss": 0.7862, + "step": 32323 + }, + { + "epoch": 0.829989680976889, + "grad_norm": 0.9921875, + "learning_rate": 0.00010669352632757705, + "loss": 0.8782, + "step": 32324 + }, + { + "epoch": 0.8300153581728107, + "grad_norm": 0.79296875, + "learning_rate": 0.00010668907213099495, + "loss": 0.8672, + "step": 32325 + }, + { + "epoch": 0.8300410353687325, + "grad_norm": 0.8984375, + "learning_rate": 0.00010668461792108218, + "loss": 0.8594, + "step": 32326 + }, + { + "epoch": 0.8300667125646544, + "grad_norm": 0.74609375, + "learning_rate": 0.00010668016369784753, + "loss": 0.7636, + "step": 32327 + }, + { + "epoch": 0.8300923897605762, + "grad_norm": 0.7890625, + "learning_rate": 0.00010667570946129989, + "loss": 0.6686, + "step": 32328 + }, + { + "epoch": 0.830118066956498, + "grad_norm": 0.7265625, + "learning_rate": 0.0001066712552114482, + "loss": 0.8393, + "step": 32329 + }, + { + "epoch": 0.8301437441524199, + "grad_norm": 1.2265625, + "learning_rate": 0.0001066668009483013, + "loss": 0.6909, + "step": 32330 + }, + { + "epoch": 0.8301694213483417, + "grad_norm": 0.8125, + "learning_rate": 0.00010666234667186805, + "loss": 0.8681, + "step": 32331 + }, + { + "epoch": 0.8301950985442634, + "grad_norm": 0.77734375, + "learning_rate": 0.00010665789238215738, + "loss": 0.8595, + "step": 32332 + }, + { + "epoch": 0.8302207757401853, + "grad_norm": 0.76953125, + "learning_rate": 0.00010665343807917809, + "loss": 0.7922, + "step": 32333 + }, + { + "epoch": 0.8302464529361071, + "grad_norm": 0.75390625, + "learning_rate": 0.00010664898376293913, + "loss": 0.8374, + "step": 32334 + }, + { + "epoch": 0.8302721301320289, + "grad_norm": 0.859375, + "learning_rate": 0.00010664452943344934, + "loss": 0.8957, + "step": 32335 + }, + { + "epoch": 0.8302978073279508, + "grad_norm": 0.796875, + "learning_rate": 0.00010664007509071759, + "loss": 0.7546, + "step": 32336 + }, + { + "epoch": 0.8303234845238726, + "grad_norm": 0.76953125, + "learning_rate": 0.00010663562073475277, + "loss": 0.7971, + "step": 32337 + }, + { + "epoch": 0.8303491617197943, + "grad_norm": 0.81640625, + "learning_rate": 0.00010663116636556376, + "loss": 0.905, + "step": 32338 + }, + { + "epoch": 0.8303748389157162, + "grad_norm": 0.81640625, + "learning_rate": 0.00010662671198315946, + "loss": 0.9377, + "step": 32339 + }, + { + "epoch": 0.830400516111638, + "grad_norm": 0.83203125, + "learning_rate": 0.00010662225758754869, + "loss": 0.7852, + "step": 32340 + }, + { + "epoch": 0.8304261933075598, + "grad_norm": 0.7109375, + "learning_rate": 0.00010661780317874035, + "loss": 0.8035, + "step": 32341 + }, + { + "epoch": 0.8304518705034817, + "grad_norm": 0.7421875, + "learning_rate": 0.00010661334875674336, + "loss": 0.7859, + "step": 32342 + }, + { + "epoch": 0.8304775476994035, + "grad_norm": 0.74609375, + "learning_rate": 0.00010660889432156656, + "loss": 0.7991, + "step": 32343 + }, + { + "epoch": 0.8305032248953254, + "grad_norm": 0.83984375, + "learning_rate": 0.0001066044398732188, + "loss": 0.7539, + "step": 32344 + }, + { + "epoch": 0.8305289020912471, + "grad_norm": 1.1015625, + "learning_rate": 0.00010659998541170903, + "loss": 0.7834, + "step": 32345 + }, + { + "epoch": 0.8305545792871689, + "grad_norm": 0.8359375, + "learning_rate": 0.00010659553093704604, + "loss": 0.8262, + "step": 32346 + }, + { + "epoch": 0.8305802564830908, + "grad_norm": 0.72265625, + "learning_rate": 0.00010659107644923878, + "loss": 0.8004, + "step": 32347 + }, + { + "epoch": 0.8306059336790126, + "grad_norm": 0.9921875, + "learning_rate": 0.0001065866219482961, + "loss": 0.7187, + "step": 32348 + }, + { + "epoch": 0.8306316108749344, + "grad_norm": 0.7421875, + "learning_rate": 0.00010658216743422686, + "loss": 0.6926, + "step": 32349 + }, + { + "epoch": 0.8306572880708563, + "grad_norm": 0.7734375, + "learning_rate": 0.00010657771290703997, + "loss": 0.8096, + "step": 32350 + }, + { + "epoch": 0.8306829652667781, + "grad_norm": 0.765625, + "learning_rate": 0.0001065732583667443, + "loss": 0.7566, + "step": 32351 + }, + { + "epoch": 0.8307086424626998, + "grad_norm": 0.76953125, + "learning_rate": 0.0001065688038133487, + "loss": 0.7528, + "step": 32352 + }, + { + "epoch": 0.8307343196586217, + "grad_norm": 0.78515625, + "learning_rate": 0.0001065643492468621, + "loss": 0.7567, + "step": 32353 + }, + { + "epoch": 0.8307599968545435, + "grad_norm": 0.75, + "learning_rate": 0.0001065598946672933, + "loss": 0.9645, + "step": 32354 + }, + { + "epoch": 0.8307856740504653, + "grad_norm": 0.82421875, + "learning_rate": 0.00010655544007465124, + "loss": 0.7782, + "step": 32355 + }, + { + "epoch": 0.8308113512463872, + "grad_norm": 0.8984375, + "learning_rate": 0.0001065509854689448, + "loss": 0.8817, + "step": 32356 + }, + { + "epoch": 0.830837028442309, + "grad_norm": 1.2109375, + "learning_rate": 0.0001065465308501828, + "loss": 0.8971, + "step": 32357 + }, + { + "epoch": 0.8308627056382307, + "grad_norm": 0.703125, + "learning_rate": 0.00010654207621837419, + "loss": 0.6486, + "step": 32358 + }, + { + "epoch": 0.8308883828341526, + "grad_norm": 0.83203125, + "learning_rate": 0.00010653762157352782, + "loss": 0.9089, + "step": 32359 + }, + { + "epoch": 0.8309140600300744, + "grad_norm": 0.7578125, + "learning_rate": 0.00010653316691565251, + "loss": 0.7327, + "step": 32360 + }, + { + "epoch": 0.8309397372259962, + "grad_norm": 0.73046875, + "learning_rate": 0.00010652871224475723, + "loss": 0.7903, + "step": 32361 + }, + { + "epoch": 0.8309654144219181, + "grad_norm": 0.76953125, + "learning_rate": 0.00010652425756085078, + "loss": 0.7677, + "step": 32362 + }, + { + "epoch": 0.8309910916178399, + "grad_norm": 0.75390625, + "learning_rate": 0.00010651980286394212, + "loss": 0.8427, + "step": 32363 + }, + { + "epoch": 0.8310167688137617, + "grad_norm": 0.671875, + "learning_rate": 0.00010651534815404007, + "loss": 0.8265, + "step": 32364 + }, + { + "epoch": 0.8310424460096835, + "grad_norm": 0.74609375, + "learning_rate": 0.0001065108934311535, + "loss": 0.7579, + "step": 32365 + }, + { + "epoch": 0.8310681232056053, + "grad_norm": 0.7578125, + "learning_rate": 0.00010650643869529133, + "loss": 0.7228, + "step": 32366 + }, + { + "epoch": 0.8310938004015271, + "grad_norm": 0.828125, + "learning_rate": 0.00010650198394646241, + "loss": 0.7671, + "step": 32367 + }, + { + "epoch": 0.831119477597449, + "grad_norm": 0.72265625, + "learning_rate": 0.00010649752918467559, + "loss": 0.7101, + "step": 32368 + }, + { + "epoch": 0.8311451547933708, + "grad_norm": 0.734375, + "learning_rate": 0.00010649307440993982, + "loss": 0.7425, + "step": 32369 + }, + { + "epoch": 0.8311708319892926, + "grad_norm": 0.70703125, + "learning_rate": 0.00010648861962226393, + "loss": 0.6688, + "step": 32370 + }, + { + "epoch": 0.8311965091852145, + "grad_norm": 0.78125, + "learning_rate": 0.00010648416482165679, + "loss": 0.7683, + "step": 32371 + }, + { + "epoch": 0.8312221863811362, + "grad_norm": 0.7890625, + "learning_rate": 0.00010647971000812736, + "loss": 0.698, + "step": 32372 + }, + { + "epoch": 0.831247863577058, + "grad_norm": 0.94140625, + "learning_rate": 0.00010647525518168438, + "loss": 0.7923, + "step": 32373 + }, + { + "epoch": 0.8312735407729799, + "grad_norm": 0.76171875, + "learning_rate": 0.00010647080034233683, + "loss": 0.7768, + "step": 32374 + }, + { + "epoch": 0.8312992179689017, + "grad_norm": 0.85546875, + "learning_rate": 0.00010646634549009356, + "loss": 0.8519, + "step": 32375 + }, + { + "epoch": 0.8313248951648236, + "grad_norm": 0.7109375, + "learning_rate": 0.00010646189062496341, + "loss": 0.7101, + "step": 32376 + }, + { + "epoch": 0.8313505723607454, + "grad_norm": 0.765625, + "learning_rate": 0.00010645743574695535, + "loss": 0.9155, + "step": 32377 + }, + { + "epoch": 0.8313762495566671, + "grad_norm": 0.828125, + "learning_rate": 0.00010645298085607818, + "loss": 0.7986, + "step": 32378 + }, + { + "epoch": 0.831401926752589, + "grad_norm": 0.74609375, + "learning_rate": 0.00010644852595234081, + "loss": 0.7419, + "step": 32379 + }, + { + "epoch": 0.8314276039485108, + "grad_norm": 0.71484375, + "learning_rate": 0.0001064440710357521, + "loss": 0.8817, + "step": 32380 + }, + { + "epoch": 0.8314532811444326, + "grad_norm": 0.7890625, + "learning_rate": 0.00010643961610632091, + "loss": 0.9198, + "step": 32381 + }, + { + "epoch": 0.8314789583403545, + "grad_norm": 0.75, + "learning_rate": 0.0001064351611640562, + "loss": 0.7433, + "step": 32382 + }, + { + "epoch": 0.8315046355362763, + "grad_norm": 0.70703125, + "learning_rate": 0.00010643070620896678, + "loss": 0.7971, + "step": 32383 + }, + { + "epoch": 0.8315303127321981, + "grad_norm": 0.78515625, + "learning_rate": 0.00010642625124106151, + "loss": 0.7904, + "step": 32384 + }, + { + "epoch": 0.8315559899281199, + "grad_norm": 0.73828125, + "learning_rate": 0.00010642179626034937, + "loss": 0.796, + "step": 32385 + }, + { + "epoch": 0.8315816671240417, + "grad_norm": 0.6875, + "learning_rate": 0.00010641734126683911, + "loss": 0.7571, + "step": 32386 + }, + { + "epoch": 0.8316073443199635, + "grad_norm": 0.78515625, + "learning_rate": 0.00010641288626053968, + "loss": 0.7442, + "step": 32387 + }, + { + "epoch": 0.8316330215158854, + "grad_norm": 0.82421875, + "learning_rate": 0.00010640843124145998, + "loss": 0.8192, + "step": 32388 + }, + { + "epoch": 0.8316586987118072, + "grad_norm": 0.80078125, + "learning_rate": 0.0001064039762096088, + "loss": 0.795, + "step": 32389 + }, + { + "epoch": 0.831684375907729, + "grad_norm": 0.83203125, + "learning_rate": 0.00010639952116499512, + "loss": 0.7872, + "step": 32390 + }, + { + "epoch": 0.8317100531036509, + "grad_norm": 0.78515625, + "learning_rate": 0.00010639506610762775, + "loss": 0.8413, + "step": 32391 + }, + { + "epoch": 0.8317357302995726, + "grad_norm": 0.80859375, + "learning_rate": 0.00010639061103751559, + "loss": 0.8816, + "step": 32392 + }, + { + "epoch": 0.8317614074954944, + "grad_norm": 0.78515625, + "learning_rate": 0.00010638615595466755, + "loss": 0.7461, + "step": 32393 + }, + { + "epoch": 0.8317870846914163, + "grad_norm": 0.86328125, + "learning_rate": 0.00010638170085909245, + "loss": 0.8274, + "step": 32394 + }, + { + "epoch": 0.8318127618873381, + "grad_norm": 0.83203125, + "learning_rate": 0.00010637724575079919, + "loss": 0.8035, + "step": 32395 + }, + { + "epoch": 0.8318384390832599, + "grad_norm": 0.734375, + "learning_rate": 0.00010637279062979668, + "loss": 0.7741, + "step": 32396 + }, + { + "epoch": 0.8318641162791818, + "grad_norm": 0.796875, + "learning_rate": 0.00010636833549609372, + "loss": 0.7925, + "step": 32397 + }, + { + "epoch": 0.8318897934751035, + "grad_norm": 0.82421875, + "learning_rate": 0.00010636388034969933, + "loss": 0.8026, + "step": 32398 + }, + { + "epoch": 0.8319154706710253, + "grad_norm": 0.8046875, + "learning_rate": 0.00010635942519062223, + "loss": 0.8146, + "step": 32399 + }, + { + "epoch": 0.8319411478669472, + "grad_norm": 0.76171875, + "learning_rate": 0.0001063549700188714, + "loss": 0.8329, + "step": 32400 + }, + { + "epoch": 0.831966825062869, + "grad_norm": 0.85546875, + "learning_rate": 0.0001063505148344557, + "loss": 0.7733, + "step": 32401 + }, + { + "epoch": 0.8319925022587908, + "grad_norm": 0.7265625, + "learning_rate": 0.00010634605963738395, + "loss": 0.644, + "step": 32402 + }, + { + "epoch": 0.8320181794547127, + "grad_norm": 0.80859375, + "learning_rate": 0.00010634160442766512, + "loss": 0.7729, + "step": 32403 + }, + { + "epoch": 0.8320438566506345, + "grad_norm": 0.71875, + "learning_rate": 0.00010633714920530804, + "loss": 0.78, + "step": 32404 + }, + { + "epoch": 0.8320695338465562, + "grad_norm": 0.9765625, + "learning_rate": 0.0001063326939703216, + "loss": 0.8014, + "step": 32405 + }, + { + "epoch": 0.8320952110424781, + "grad_norm": 0.8046875, + "learning_rate": 0.00010632823872271464, + "loss": 0.9201, + "step": 32406 + }, + { + "epoch": 0.8321208882383999, + "grad_norm": 0.7421875, + "learning_rate": 0.00010632378346249612, + "loss": 0.8847, + "step": 32407 + }, + { + "epoch": 0.8321465654343217, + "grad_norm": 0.78515625, + "learning_rate": 0.00010631932818967483, + "loss": 0.7221, + "step": 32408 + }, + { + "epoch": 0.8321722426302436, + "grad_norm": 0.828125, + "learning_rate": 0.0001063148729042597, + "loss": 0.929, + "step": 32409 + }, + { + "epoch": 0.8321979198261654, + "grad_norm": 0.76953125, + "learning_rate": 0.00010631041760625957, + "loss": 0.7268, + "step": 32410 + }, + { + "epoch": 0.8322235970220873, + "grad_norm": 0.82421875, + "learning_rate": 0.00010630596229568341, + "loss": 0.7714, + "step": 32411 + }, + { + "epoch": 0.832249274218009, + "grad_norm": 0.71875, + "learning_rate": 0.00010630150697254002, + "loss": 0.681, + "step": 32412 + }, + { + "epoch": 0.8322749514139308, + "grad_norm": 0.7109375, + "learning_rate": 0.00010629705163683825, + "loss": 0.7293, + "step": 32413 + }, + { + "epoch": 0.8323006286098527, + "grad_norm": 0.734375, + "learning_rate": 0.00010629259628858707, + "loss": 0.6948, + "step": 32414 + }, + { + "epoch": 0.8323263058057745, + "grad_norm": 0.8125, + "learning_rate": 0.00010628814092779531, + "loss": 0.8043, + "step": 32415 + }, + { + "epoch": 0.8323519830016963, + "grad_norm": 1.6875, + "learning_rate": 0.00010628368555447182, + "loss": 0.8699, + "step": 32416 + }, + { + "epoch": 0.8323776601976182, + "grad_norm": 0.8046875, + "learning_rate": 0.00010627923016862555, + "loss": 0.9149, + "step": 32417 + }, + { + "epoch": 0.8324033373935399, + "grad_norm": 0.78515625, + "learning_rate": 0.00010627477477026532, + "loss": 0.7086, + "step": 32418 + }, + { + "epoch": 0.8324290145894617, + "grad_norm": 0.73046875, + "learning_rate": 0.00010627031935940003, + "loss": 0.8245, + "step": 32419 + }, + { + "epoch": 0.8324546917853836, + "grad_norm": 0.828125, + "learning_rate": 0.0001062658639360386, + "loss": 0.7808, + "step": 32420 + }, + { + "epoch": 0.8324803689813054, + "grad_norm": 0.80859375, + "learning_rate": 0.00010626140850018982, + "loss": 0.8, + "step": 32421 + }, + { + "epoch": 0.8325060461772272, + "grad_norm": 0.8046875, + "learning_rate": 0.00010625695305186263, + "loss": 0.7768, + "step": 32422 + }, + { + "epoch": 0.8325317233731491, + "grad_norm": 0.828125, + "learning_rate": 0.0001062524975910659, + "loss": 0.8761, + "step": 32423 + }, + { + "epoch": 0.8325574005690709, + "grad_norm": 0.75, + "learning_rate": 0.00010624804211780853, + "loss": 0.7816, + "step": 32424 + }, + { + "epoch": 0.8325830777649926, + "grad_norm": 0.734375, + "learning_rate": 0.00010624358663209935, + "loss": 0.8197, + "step": 32425 + }, + { + "epoch": 0.8326087549609145, + "grad_norm": 0.76171875, + "learning_rate": 0.00010623913113394726, + "loss": 0.7703, + "step": 32426 + }, + { + "epoch": 0.8326344321568363, + "grad_norm": 0.7421875, + "learning_rate": 0.00010623467562336116, + "loss": 0.7102, + "step": 32427 + }, + { + "epoch": 0.8326601093527581, + "grad_norm": 0.83203125, + "learning_rate": 0.00010623022010034991, + "loss": 0.778, + "step": 32428 + }, + { + "epoch": 0.83268578654868, + "grad_norm": 0.7578125, + "learning_rate": 0.0001062257645649224, + "loss": 0.8694, + "step": 32429 + }, + { + "epoch": 0.8327114637446018, + "grad_norm": 0.85546875, + "learning_rate": 0.00010622130901708748, + "loss": 0.8371, + "step": 32430 + }, + { + "epoch": 0.8327371409405235, + "grad_norm": 0.78515625, + "learning_rate": 0.00010621685345685409, + "loss": 0.8816, + "step": 32431 + }, + { + "epoch": 0.8327628181364454, + "grad_norm": 0.76171875, + "learning_rate": 0.00010621239788423105, + "loss": 0.7316, + "step": 32432 + }, + { + "epoch": 0.8327884953323672, + "grad_norm": 0.76171875, + "learning_rate": 0.00010620794229922728, + "loss": 0.727, + "step": 32433 + }, + { + "epoch": 0.832814172528289, + "grad_norm": 0.8046875, + "learning_rate": 0.00010620348670185163, + "loss": 0.8963, + "step": 32434 + }, + { + "epoch": 0.8328398497242109, + "grad_norm": 0.796875, + "learning_rate": 0.00010619903109211298, + "loss": 0.7943, + "step": 32435 + }, + { + "epoch": 0.8328655269201327, + "grad_norm": 0.78515625, + "learning_rate": 0.00010619457547002024, + "loss": 0.8822, + "step": 32436 + }, + { + "epoch": 0.8328912041160546, + "grad_norm": 0.875, + "learning_rate": 0.00010619011983558226, + "loss": 0.8039, + "step": 32437 + }, + { + "epoch": 0.8329168813119763, + "grad_norm": 0.78515625, + "learning_rate": 0.00010618566418880796, + "loss": 0.9556, + "step": 32438 + }, + { + "epoch": 0.8329425585078981, + "grad_norm": 0.70703125, + "learning_rate": 0.00010618120852970612, + "loss": 0.7983, + "step": 32439 + }, + { + "epoch": 0.83296823570382, + "grad_norm": 0.7578125, + "learning_rate": 0.00010617675285828574, + "loss": 0.8859, + "step": 32440 + }, + { + "epoch": 0.8329939128997418, + "grad_norm": 0.7734375, + "learning_rate": 0.00010617229717455566, + "loss": 0.8042, + "step": 32441 + }, + { + "epoch": 0.8330195900956636, + "grad_norm": 0.75390625, + "learning_rate": 0.00010616784147852472, + "loss": 0.8905, + "step": 32442 + }, + { + "epoch": 0.8330452672915855, + "grad_norm": 0.73046875, + "learning_rate": 0.00010616338577020184, + "loss": 0.8045, + "step": 32443 + }, + { + "epoch": 0.8330709444875073, + "grad_norm": 0.796875, + "learning_rate": 0.00010615893004959591, + "loss": 0.9232, + "step": 32444 + }, + { + "epoch": 0.833096621683429, + "grad_norm": 0.80078125, + "learning_rate": 0.00010615447431671575, + "loss": 0.7523, + "step": 32445 + }, + { + "epoch": 0.8331222988793509, + "grad_norm": 0.765625, + "learning_rate": 0.0001061500185715703, + "loss": 0.8671, + "step": 32446 + }, + { + "epoch": 0.8331479760752727, + "grad_norm": 0.7109375, + "learning_rate": 0.00010614556281416843, + "loss": 0.7847, + "step": 32447 + }, + { + "epoch": 0.8331736532711945, + "grad_norm": 0.703125, + "learning_rate": 0.00010614110704451899, + "loss": 0.8123, + "step": 32448 + }, + { + "epoch": 0.8331993304671164, + "grad_norm": 0.7578125, + "learning_rate": 0.00010613665126263086, + "loss": 0.8682, + "step": 32449 + }, + { + "epoch": 0.8332250076630382, + "grad_norm": 0.734375, + "learning_rate": 0.00010613219546851296, + "loss": 0.7499, + "step": 32450 + }, + { + "epoch": 0.8332506848589599, + "grad_norm": 0.671875, + "learning_rate": 0.00010612773966217417, + "loss": 0.7499, + "step": 32451 + }, + { + "epoch": 0.8332763620548818, + "grad_norm": 0.69921875, + "learning_rate": 0.00010612328384362333, + "loss": 0.6795, + "step": 32452 + }, + { + "epoch": 0.8333020392508036, + "grad_norm": 0.81640625, + "learning_rate": 0.00010611882801286932, + "loss": 0.8263, + "step": 32453 + }, + { + "epoch": 0.8333277164467254, + "grad_norm": 0.79296875, + "learning_rate": 0.00010611437216992106, + "loss": 0.8951, + "step": 32454 + }, + { + "epoch": 0.8333533936426473, + "grad_norm": 0.796875, + "learning_rate": 0.00010610991631478742, + "loss": 0.7173, + "step": 32455 + }, + { + "epoch": 0.8333790708385691, + "grad_norm": 0.8046875, + "learning_rate": 0.00010610546044747723, + "loss": 0.8496, + "step": 32456 + }, + { + "epoch": 0.8334047480344909, + "grad_norm": 0.765625, + "learning_rate": 0.00010610100456799945, + "loss": 0.8692, + "step": 32457 + }, + { + "epoch": 0.8334304252304127, + "grad_norm": 0.81640625, + "learning_rate": 0.00010609654867636287, + "loss": 0.9239, + "step": 32458 + }, + { + "epoch": 0.8334561024263345, + "grad_norm": 0.765625, + "learning_rate": 0.00010609209277257646, + "loss": 0.7745, + "step": 32459 + }, + { + "epoch": 0.8334817796222563, + "grad_norm": 0.7421875, + "learning_rate": 0.00010608763685664907, + "loss": 0.8197, + "step": 32460 + }, + { + "epoch": 0.8335074568181782, + "grad_norm": 0.734375, + "learning_rate": 0.00010608318092858951, + "loss": 0.7594, + "step": 32461 + }, + { + "epoch": 0.8335331340141, + "grad_norm": 0.76953125, + "learning_rate": 0.00010607872498840676, + "loss": 0.796, + "step": 32462 + }, + { + "epoch": 0.8335588112100218, + "grad_norm": 0.77734375, + "learning_rate": 0.00010607426903610966, + "loss": 0.8971, + "step": 32463 + }, + { + "epoch": 0.8335844884059437, + "grad_norm": 0.9765625, + "learning_rate": 0.00010606981307170712, + "loss": 0.7092, + "step": 32464 + }, + { + "epoch": 0.8336101656018654, + "grad_norm": 0.75390625, + "learning_rate": 0.00010606535709520794, + "loss": 0.6655, + "step": 32465 + }, + { + "epoch": 0.8336358427977872, + "grad_norm": 0.76171875, + "learning_rate": 0.00010606090110662105, + "loss": 0.8382, + "step": 32466 + }, + { + "epoch": 0.8336615199937091, + "grad_norm": 0.765625, + "learning_rate": 0.00010605644510595536, + "loss": 0.7614, + "step": 32467 + }, + { + "epoch": 0.8336871971896309, + "grad_norm": 0.84375, + "learning_rate": 0.00010605198909321971, + "loss": 0.8904, + "step": 32468 + }, + { + "epoch": 0.8337128743855527, + "grad_norm": 0.7109375, + "learning_rate": 0.00010604753306842299, + "loss": 0.7185, + "step": 32469 + }, + { + "epoch": 0.8337385515814746, + "grad_norm": 0.796875, + "learning_rate": 0.00010604307703157413, + "loss": 0.8514, + "step": 32470 + }, + { + "epoch": 0.8337642287773963, + "grad_norm": 0.79296875, + "learning_rate": 0.00010603862098268189, + "loss": 0.7581, + "step": 32471 + }, + { + "epoch": 0.8337899059733181, + "grad_norm": 0.69140625, + "learning_rate": 0.00010603416492175527, + "loss": 0.7862, + "step": 32472 + }, + { + "epoch": 0.83381558316924, + "grad_norm": 0.73046875, + "learning_rate": 0.0001060297088488031, + "loss": 0.7262, + "step": 32473 + }, + { + "epoch": 0.8338412603651618, + "grad_norm": 0.8046875, + "learning_rate": 0.00010602525276383424, + "loss": 0.7695, + "step": 32474 + }, + { + "epoch": 0.8338669375610837, + "grad_norm": 0.74609375, + "learning_rate": 0.0001060207966668576, + "loss": 0.7751, + "step": 32475 + }, + { + "epoch": 0.8338926147570055, + "grad_norm": 0.6875, + "learning_rate": 0.00010601634055788208, + "loss": 0.717, + "step": 32476 + }, + { + "epoch": 0.8339182919529273, + "grad_norm": 0.77734375, + "learning_rate": 0.00010601188443691652, + "loss": 0.8179, + "step": 32477 + }, + { + "epoch": 0.833943969148849, + "grad_norm": 0.73828125, + "learning_rate": 0.00010600742830396984, + "loss": 0.6764, + "step": 32478 + }, + { + "epoch": 0.8339696463447709, + "grad_norm": 0.71875, + "learning_rate": 0.00010600297215905086, + "loss": 0.8252, + "step": 32479 + }, + { + "epoch": 0.8339953235406927, + "grad_norm": 0.76953125, + "learning_rate": 0.00010599851600216852, + "loss": 0.8205, + "step": 32480 + }, + { + "epoch": 0.8340210007366146, + "grad_norm": 0.890625, + "learning_rate": 0.00010599405983333168, + "loss": 0.8355, + "step": 32481 + }, + { + "epoch": 0.8340466779325364, + "grad_norm": 0.7578125, + "learning_rate": 0.0001059896036525492, + "loss": 0.6627, + "step": 32482 + }, + { + "epoch": 0.8340723551284582, + "grad_norm": 0.75, + "learning_rate": 0.00010598514745983001, + "loss": 0.8084, + "step": 32483 + }, + { + "epoch": 0.8340980323243801, + "grad_norm": 0.76953125, + "learning_rate": 0.00010598069125518298, + "loss": 0.8848, + "step": 32484 + }, + { + "epoch": 0.8341237095203018, + "grad_norm": 0.6953125, + "learning_rate": 0.00010597623503861691, + "loss": 0.8148, + "step": 32485 + }, + { + "epoch": 0.8341493867162236, + "grad_norm": 0.8046875, + "learning_rate": 0.00010597177881014077, + "loss": 0.8814, + "step": 32486 + }, + { + "epoch": 0.8341750639121455, + "grad_norm": 0.82421875, + "learning_rate": 0.00010596732256976343, + "loss": 0.8194, + "step": 32487 + }, + { + "epoch": 0.8342007411080673, + "grad_norm": 0.80078125, + "learning_rate": 0.00010596286631749375, + "loss": 0.7647, + "step": 32488 + }, + { + "epoch": 0.8342264183039891, + "grad_norm": 0.78125, + "learning_rate": 0.00010595841005334061, + "loss": 0.8701, + "step": 32489 + }, + { + "epoch": 0.834252095499911, + "grad_norm": 0.796875, + "learning_rate": 0.0001059539537773129, + "loss": 0.7842, + "step": 32490 + }, + { + "epoch": 0.8342777726958327, + "grad_norm": 0.78515625, + "learning_rate": 0.00010594949748941949, + "loss": 0.7477, + "step": 32491 + }, + { + "epoch": 0.8343034498917545, + "grad_norm": 0.7890625, + "learning_rate": 0.00010594504118966929, + "loss": 0.756, + "step": 32492 + }, + { + "epoch": 0.8343291270876764, + "grad_norm": 0.8046875, + "learning_rate": 0.00010594058487807112, + "loss": 0.8575, + "step": 32493 + }, + { + "epoch": 0.8343548042835982, + "grad_norm": 0.73828125, + "learning_rate": 0.00010593612855463395, + "loss": 0.7991, + "step": 32494 + }, + { + "epoch": 0.83438048147952, + "grad_norm": 0.765625, + "learning_rate": 0.00010593167221936657, + "loss": 0.7736, + "step": 32495 + }, + { + "epoch": 0.8344061586754419, + "grad_norm": 0.7421875, + "learning_rate": 0.00010592721587227792, + "loss": 0.7995, + "step": 32496 + }, + { + "epoch": 0.8344318358713637, + "grad_norm": 0.7578125, + "learning_rate": 0.0001059227595133769, + "loss": 0.8448, + "step": 32497 + }, + { + "epoch": 0.8344575130672854, + "grad_norm": 0.82421875, + "learning_rate": 0.0001059183031426723, + "loss": 0.7065, + "step": 32498 + }, + { + "epoch": 0.8344831902632073, + "grad_norm": 0.7421875, + "learning_rate": 0.00010591384676017309, + "loss": 0.8447, + "step": 32499 + }, + { + "epoch": 0.8345088674591291, + "grad_norm": 0.875, + "learning_rate": 0.0001059093903658881, + "loss": 0.6958, + "step": 32500 + }, + { + "epoch": 0.834534544655051, + "grad_norm": 0.84375, + "learning_rate": 0.00010590493395982622, + "loss": 0.688, + "step": 32501 + }, + { + "epoch": 0.8345602218509728, + "grad_norm": 0.80859375, + "learning_rate": 0.00010590047754199634, + "loss": 0.8068, + "step": 32502 + }, + { + "epoch": 0.8345858990468946, + "grad_norm": 0.81640625, + "learning_rate": 0.00010589602111240737, + "loss": 0.8487, + "step": 32503 + }, + { + "epoch": 0.8346115762428165, + "grad_norm": 0.76171875, + "learning_rate": 0.00010589156467106816, + "loss": 0.7164, + "step": 32504 + }, + { + "epoch": 0.8346372534387382, + "grad_norm": 0.765625, + "learning_rate": 0.00010588710821798757, + "loss": 0.824, + "step": 32505 + }, + { + "epoch": 0.83466293063466, + "grad_norm": 0.7578125, + "learning_rate": 0.00010588265175317451, + "loss": 0.738, + "step": 32506 + }, + { + "epoch": 0.8346886078305819, + "grad_norm": 0.84765625, + "learning_rate": 0.00010587819527663787, + "loss": 0.8087, + "step": 32507 + }, + { + "epoch": 0.8347142850265037, + "grad_norm": 0.84375, + "learning_rate": 0.00010587373878838651, + "loss": 0.8835, + "step": 32508 + }, + { + "epoch": 0.8347399622224255, + "grad_norm": 0.84765625, + "learning_rate": 0.0001058692822884293, + "loss": 0.8697, + "step": 32509 + }, + { + "epoch": 0.8347656394183474, + "grad_norm": 0.85546875, + "learning_rate": 0.0001058648257767752, + "loss": 0.7385, + "step": 32510 + }, + { + "epoch": 0.8347913166142691, + "grad_norm": 0.80078125, + "learning_rate": 0.00010586036925343297, + "loss": 0.7992, + "step": 32511 + }, + { + "epoch": 0.8348169938101909, + "grad_norm": 0.79296875, + "learning_rate": 0.00010585591271841157, + "loss": 0.7642, + "step": 32512 + }, + { + "epoch": 0.8348426710061128, + "grad_norm": 0.7734375, + "learning_rate": 0.00010585145617171988, + "loss": 0.8374, + "step": 32513 + }, + { + "epoch": 0.8348683482020346, + "grad_norm": 0.76953125, + "learning_rate": 0.00010584699961336675, + "loss": 0.8752, + "step": 32514 + }, + { + "epoch": 0.8348940253979564, + "grad_norm": 0.796875, + "learning_rate": 0.00010584254304336107, + "loss": 0.7665, + "step": 32515 + }, + { + "epoch": 0.8349197025938783, + "grad_norm": 0.80078125, + "learning_rate": 0.00010583808646171176, + "loss": 1.0727, + "step": 32516 + }, + { + "epoch": 0.8349453797898001, + "grad_norm": 0.8046875, + "learning_rate": 0.00010583362986842767, + "loss": 0.6935, + "step": 32517 + }, + { + "epoch": 0.8349710569857218, + "grad_norm": 0.828125, + "learning_rate": 0.00010582917326351766, + "loss": 0.9505, + "step": 32518 + }, + { + "epoch": 0.8349967341816437, + "grad_norm": 0.83984375, + "learning_rate": 0.00010582471664699063, + "loss": 0.8412, + "step": 32519 + }, + { + "epoch": 0.8350224113775655, + "grad_norm": 0.78125, + "learning_rate": 0.00010582026001885547, + "loss": 0.9183, + "step": 32520 + }, + { + "epoch": 0.8350480885734873, + "grad_norm": 0.8203125, + "learning_rate": 0.0001058158033791211, + "loss": 0.7523, + "step": 32521 + }, + { + "epoch": 0.8350737657694092, + "grad_norm": 0.71875, + "learning_rate": 0.00010581134672779629, + "loss": 0.7212, + "step": 32522 + }, + { + "epoch": 0.835099442965331, + "grad_norm": 0.796875, + "learning_rate": 0.00010580689006489006, + "loss": 0.7444, + "step": 32523 + }, + { + "epoch": 0.8351251201612528, + "grad_norm": 0.78515625, + "learning_rate": 0.00010580243339041117, + "loss": 0.6902, + "step": 32524 + }, + { + "epoch": 0.8351507973571746, + "grad_norm": 0.75390625, + "learning_rate": 0.00010579797670436858, + "loss": 0.7605, + "step": 32525 + }, + { + "epoch": 0.8351764745530964, + "grad_norm": 0.7734375, + "learning_rate": 0.00010579352000677114, + "loss": 0.7185, + "step": 32526 + }, + { + "epoch": 0.8352021517490182, + "grad_norm": 0.7734375, + "learning_rate": 0.00010578906329762772, + "loss": 0.7207, + "step": 32527 + }, + { + "epoch": 0.8352278289449401, + "grad_norm": 0.7421875, + "learning_rate": 0.00010578460657694724, + "loss": 0.7526, + "step": 32528 + }, + { + "epoch": 0.8352535061408619, + "grad_norm": 0.77734375, + "learning_rate": 0.00010578014984473858, + "loss": 0.8223, + "step": 32529 + }, + { + "epoch": 0.8352791833367837, + "grad_norm": 0.77734375, + "learning_rate": 0.00010577569310101057, + "loss": 0.8463, + "step": 32530 + }, + { + "epoch": 0.8353048605327055, + "grad_norm": 0.83984375, + "learning_rate": 0.00010577123634577215, + "loss": 0.9487, + "step": 32531 + }, + { + "epoch": 0.8353305377286273, + "grad_norm": 0.77734375, + "learning_rate": 0.00010576677957903216, + "loss": 0.7787, + "step": 32532 + }, + { + "epoch": 0.8353562149245491, + "grad_norm": 0.7265625, + "learning_rate": 0.0001057623228007995, + "loss": 0.7932, + "step": 32533 + }, + { + "epoch": 0.835381892120471, + "grad_norm": 0.76171875, + "learning_rate": 0.00010575786601108305, + "loss": 0.8791, + "step": 32534 + }, + { + "epoch": 0.8354075693163928, + "grad_norm": 0.80078125, + "learning_rate": 0.00010575340920989167, + "loss": 0.8108, + "step": 32535 + }, + { + "epoch": 0.8354332465123147, + "grad_norm": 0.7578125, + "learning_rate": 0.00010574895239723432, + "loss": 0.6893, + "step": 32536 + }, + { + "epoch": 0.8354589237082365, + "grad_norm": 0.77734375, + "learning_rate": 0.00010574449557311981, + "loss": 0.8261, + "step": 32537 + }, + { + "epoch": 0.8354846009041582, + "grad_norm": 0.6953125, + "learning_rate": 0.00010574003873755703, + "loss": 0.7541, + "step": 32538 + }, + { + "epoch": 0.83551027810008, + "grad_norm": 0.7421875, + "learning_rate": 0.00010573558189055485, + "loss": 0.8813, + "step": 32539 + }, + { + "epoch": 0.8355359552960019, + "grad_norm": 0.78515625, + "learning_rate": 0.00010573112503212221, + "loss": 0.7467, + "step": 32540 + }, + { + "epoch": 0.8355616324919237, + "grad_norm": 0.90625, + "learning_rate": 0.00010572666816226793, + "loss": 0.7665, + "step": 32541 + }, + { + "epoch": 0.8355873096878456, + "grad_norm": 0.86328125, + "learning_rate": 0.00010572221128100093, + "loss": 0.7887, + "step": 32542 + }, + { + "epoch": 0.8356129868837674, + "grad_norm": 0.875, + "learning_rate": 0.00010571775438833007, + "loss": 0.8596, + "step": 32543 + }, + { + "epoch": 0.8356386640796892, + "grad_norm": 0.79296875, + "learning_rate": 0.00010571329748426427, + "loss": 0.8107, + "step": 32544 + }, + { + "epoch": 0.835664341275611, + "grad_norm": 0.796875, + "learning_rate": 0.00010570884056881236, + "loss": 0.7185, + "step": 32545 + }, + { + "epoch": 0.8356900184715328, + "grad_norm": 0.734375, + "learning_rate": 0.00010570438364198323, + "loss": 0.7527, + "step": 32546 + }, + { + "epoch": 0.8357156956674546, + "grad_norm": 0.7578125, + "learning_rate": 0.00010569992670378582, + "loss": 0.8544, + "step": 32547 + }, + { + "epoch": 0.8357413728633765, + "grad_norm": 0.74609375, + "learning_rate": 0.00010569546975422895, + "loss": 0.6826, + "step": 32548 + }, + { + "epoch": 0.8357670500592983, + "grad_norm": 0.77734375, + "learning_rate": 0.00010569101279332153, + "loss": 0.7508, + "step": 32549 + }, + { + "epoch": 0.8357927272552201, + "grad_norm": 0.7578125, + "learning_rate": 0.00010568655582107243, + "loss": 0.846, + "step": 32550 + }, + { + "epoch": 0.8358184044511419, + "grad_norm": 0.78125, + "learning_rate": 0.00010568209883749051, + "loss": 0.6836, + "step": 32551 + }, + { + "epoch": 0.8358440816470637, + "grad_norm": 0.73046875, + "learning_rate": 0.00010567764184258471, + "loss": 0.826, + "step": 32552 + }, + { + "epoch": 0.8358697588429855, + "grad_norm": 0.91015625, + "learning_rate": 0.00010567318483636388, + "loss": 0.8357, + "step": 32553 + }, + { + "epoch": 0.8358954360389074, + "grad_norm": 0.83203125, + "learning_rate": 0.0001056687278188369, + "loss": 0.8679, + "step": 32554 + }, + { + "epoch": 0.8359211132348292, + "grad_norm": 0.75, + "learning_rate": 0.00010566427079001267, + "loss": 0.7398, + "step": 32555 + }, + { + "epoch": 0.835946790430751, + "grad_norm": 0.83203125, + "learning_rate": 0.00010565981374990004, + "loss": 0.7915, + "step": 32556 + }, + { + "epoch": 0.8359724676266729, + "grad_norm": 0.73828125, + "learning_rate": 0.00010565535669850795, + "loss": 0.735, + "step": 32557 + }, + { + "epoch": 0.8359981448225946, + "grad_norm": 0.73046875, + "learning_rate": 0.0001056508996358452, + "loss": 0.7148, + "step": 32558 + }, + { + "epoch": 0.8360238220185164, + "grad_norm": 0.73046875, + "learning_rate": 0.00010564644256192073, + "loss": 0.7739, + "step": 32559 + }, + { + "epoch": 0.8360494992144383, + "grad_norm": 0.76171875, + "learning_rate": 0.00010564198547674342, + "loss": 0.9415, + "step": 32560 + }, + { + "epoch": 0.8360751764103601, + "grad_norm": 0.81640625, + "learning_rate": 0.00010563752838032215, + "loss": 0.897, + "step": 32561 + }, + { + "epoch": 0.836100853606282, + "grad_norm": 0.83984375, + "learning_rate": 0.00010563307127266577, + "loss": 0.8096, + "step": 32562 + }, + { + "epoch": 0.8361265308022038, + "grad_norm": 0.8046875, + "learning_rate": 0.00010562861415378322, + "loss": 0.8568, + "step": 32563 + }, + { + "epoch": 0.8361522079981256, + "grad_norm": 0.75390625, + "learning_rate": 0.00010562415702368331, + "loss": 0.7998, + "step": 32564 + }, + { + "epoch": 0.8361778851940473, + "grad_norm": 0.86328125, + "learning_rate": 0.00010561969988237499, + "loss": 0.8025, + "step": 32565 + }, + { + "epoch": 0.8362035623899692, + "grad_norm": 0.76953125, + "learning_rate": 0.00010561524272986711, + "loss": 0.7638, + "step": 32566 + }, + { + "epoch": 0.836229239585891, + "grad_norm": 0.76171875, + "learning_rate": 0.00010561078556616855, + "loss": 0.7301, + "step": 32567 + }, + { + "epoch": 0.8362549167818129, + "grad_norm": 0.7890625, + "learning_rate": 0.00010560632839128821, + "loss": 0.7888, + "step": 32568 + }, + { + "epoch": 0.8362805939777347, + "grad_norm": 0.83203125, + "learning_rate": 0.000105601871205235, + "loss": 0.8217, + "step": 32569 + }, + { + "epoch": 0.8363062711736565, + "grad_norm": 0.80078125, + "learning_rate": 0.00010559741400801773, + "loss": 0.8943, + "step": 32570 + }, + { + "epoch": 0.8363319483695782, + "grad_norm": 1.1875, + "learning_rate": 0.00010559295679964532, + "loss": 0.7049, + "step": 32571 + }, + { + "epoch": 0.8363576255655001, + "grad_norm": 0.77734375, + "learning_rate": 0.00010558849958012662, + "loss": 0.7055, + "step": 32572 + }, + { + "epoch": 0.8363833027614219, + "grad_norm": 0.7265625, + "learning_rate": 0.0001055840423494706, + "loss": 0.5579, + "step": 32573 + }, + { + "epoch": 0.8364089799573438, + "grad_norm": 0.71875, + "learning_rate": 0.00010557958510768606, + "loss": 0.7784, + "step": 32574 + }, + { + "epoch": 0.8364346571532656, + "grad_norm": 0.7890625, + "learning_rate": 0.0001055751278547819, + "loss": 0.7433, + "step": 32575 + }, + { + "epoch": 0.8364603343491874, + "grad_norm": 0.734375, + "learning_rate": 0.00010557067059076706, + "loss": 0.8211, + "step": 32576 + }, + { + "epoch": 0.8364860115451093, + "grad_norm": 0.78515625, + "learning_rate": 0.00010556621331565035, + "loss": 0.873, + "step": 32577 + }, + { + "epoch": 0.836511688741031, + "grad_norm": 0.7734375, + "learning_rate": 0.00010556175602944067, + "loss": 0.7742, + "step": 32578 + }, + { + "epoch": 0.8365373659369528, + "grad_norm": 0.859375, + "learning_rate": 0.00010555729873214692, + "loss": 0.8653, + "step": 32579 + }, + { + "epoch": 0.8365630431328747, + "grad_norm": 0.8046875, + "learning_rate": 0.00010555284142377799, + "loss": 0.8836, + "step": 32580 + }, + { + "epoch": 0.8365887203287965, + "grad_norm": 0.82421875, + "learning_rate": 0.0001055483841043427, + "loss": 0.9081, + "step": 32581 + }, + { + "epoch": 0.8366143975247183, + "grad_norm": 0.828125, + "learning_rate": 0.00010554392677385005, + "loss": 1.0299, + "step": 32582 + }, + { + "epoch": 0.8366400747206402, + "grad_norm": 0.765625, + "learning_rate": 0.00010553946943230881, + "loss": 0.8226, + "step": 32583 + }, + { + "epoch": 0.836665751916562, + "grad_norm": 0.9140625, + "learning_rate": 0.00010553501207972791, + "loss": 0.9042, + "step": 32584 + }, + { + "epoch": 0.8366914291124837, + "grad_norm": 0.765625, + "learning_rate": 0.00010553055471611626, + "loss": 0.8416, + "step": 32585 + }, + { + "epoch": 0.8367171063084056, + "grad_norm": 0.71875, + "learning_rate": 0.00010552609734148267, + "loss": 0.8339, + "step": 32586 + }, + { + "epoch": 0.8367427835043274, + "grad_norm": 0.8515625, + "learning_rate": 0.00010552163995583611, + "loss": 0.7554, + "step": 32587 + }, + { + "epoch": 0.8367684607002492, + "grad_norm": 0.82421875, + "learning_rate": 0.00010551718255918539, + "loss": 0.8006, + "step": 32588 + }, + { + "epoch": 0.8367941378961711, + "grad_norm": 0.890625, + "learning_rate": 0.00010551272515153944, + "loss": 0.9957, + "step": 32589 + }, + { + "epoch": 0.8368198150920929, + "grad_norm": 0.8359375, + "learning_rate": 0.00010550826773290711, + "loss": 0.8383, + "step": 32590 + }, + { + "epoch": 0.8368454922880146, + "grad_norm": 0.78125, + "learning_rate": 0.0001055038103032973, + "loss": 0.764, + "step": 32591 + }, + { + "epoch": 0.8368711694839365, + "grad_norm": 0.73828125, + "learning_rate": 0.00010549935286271893, + "loss": 0.7348, + "step": 32592 + }, + { + "epoch": 0.8368968466798583, + "grad_norm": 0.77734375, + "learning_rate": 0.00010549489541118083, + "loss": 0.8552, + "step": 32593 + }, + { + "epoch": 0.8369225238757801, + "grad_norm": 0.76171875, + "learning_rate": 0.00010549043794869187, + "loss": 0.8452, + "step": 32594 + }, + { + "epoch": 0.836948201071702, + "grad_norm": 0.76171875, + "learning_rate": 0.000105485980475261, + "loss": 0.7577, + "step": 32595 + }, + { + "epoch": 0.8369738782676238, + "grad_norm": 0.76953125, + "learning_rate": 0.00010548152299089704, + "loss": 0.8653, + "step": 32596 + }, + { + "epoch": 0.8369995554635457, + "grad_norm": 0.765625, + "learning_rate": 0.00010547706549560892, + "loss": 0.7365, + "step": 32597 + }, + { + "epoch": 0.8370252326594674, + "grad_norm": 0.78125, + "learning_rate": 0.00010547260798940548, + "loss": 0.8168, + "step": 32598 + }, + { + "epoch": 0.8370509098553892, + "grad_norm": 0.88671875, + "learning_rate": 0.00010546815047229563, + "loss": 0.7685, + "step": 32599 + }, + { + "epoch": 0.837076587051311, + "grad_norm": 0.75390625, + "learning_rate": 0.00010546369294428826, + "loss": 0.699, + "step": 32600 + }, + { + "epoch": 0.8371022642472329, + "grad_norm": 0.73828125, + "learning_rate": 0.00010545923540539226, + "loss": 0.7044, + "step": 32601 + }, + { + "epoch": 0.8371279414431547, + "grad_norm": 0.8125, + "learning_rate": 0.00010545477785561648, + "loss": 0.8864, + "step": 32602 + }, + { + "epoch": 0.8371536186390766, + "grad_norm": 0.78515625, + "learning_rate": 0.00010545032029496982, + "loss": 0.7205, + "step": 32603 + }, + { + "epoch": 0.8371792958349984, + "grad_norm": 0.70703125, + "learning_rate": 0.00010544586272346116, + "loss": 0.7795, + "step": 32604 + }, + { + "epoch": 0.8372049730309201, + "grad_norm": 0.7421875, + "learning_rate": 0.00010544140514109939, + "loss": 0.8373, + "step": 32605 + }, + { + "epoch": 0.837230650226842, + "grad_norm": 0.86328125, + "learning_rate": 0.00010543694754789339, + "loss": 0.8373, + "step": 32606 + }, + { + "epoch": 0.8372563274227638, + "grad_norm": 0.7890625, + "learning_rate": 0.00010543248994385204, + "loss": 0.7916, + "step": 32607 + }, + { + "epoch": 0.8372820046186856, + "grad_norm": 0.75390625, + "learning_rate": 0.00010542803232898426, + "loss": 0.8489, + "step": 32608 + }, + { + "epoch": 0.8373076818146075, + "grad_norm": 0.83203125, + "learning_rate": 0.00010542357470329889, + "loss": 0.8145, + "step": 32609 + }, + { + "epoch": 0.8373333590105293, + "grad_norm": 0.79296875, + "learning_rate": 0.00010541911706680478, + "loss": 0.817, + "step": 32610 + }, + { + "epoch": 0.837359036206451, + "grad_norm": 1.921875, + "learning_rate": 0.00010541465941951092, + "loss": 0.6683, + "step": 32611 + }, + { + "epoch": 0.8373847134023729, + "grad_norm": 0.74609375, + "learning_rate": 0.00010541020176142608, + "loss": 0.8103, + "step": 32612 + }, + { + "epoch": 0.8374103905982947, + "grad_norm": 0.7734375, + "learning_rate": 0.00010540574409255923, + "loss": 0.7649, + "step": 32613 + }, + { + "epoch": 0.8374360677942165, + "grad_norm": 0.79296875, + "learning_rate": 0.00010540128641291922, + "loss": 0.7404, + "step": 32614 + }, + { + "epoch": 0.8374617449901384, + "grad_norm": 0.765625, + "learning_rate": 0.00010539682872251493, + "loss": 0.7542, + "step": 32615 + }, + { + "epoch": 0.8374874221860602, + "grad_norm": 0.671875, + "learning_rate": 0.00010539237102135524, + "loss": 0.8065, + "step": 32616 + }, + { + "epoch": 0.837513099381982, + "grad_norm": 0.765625, + "learning_rate": 0.00010538791330944906, + "loss": 0.7963, + "step": 32617 + }, + { + "epoch": 0.8375387765779038, + "grad_norm": 0.81640625, + "learning_rate": 0.00010538345558680522, + "loss": 0.6875, + "step": 32618 + }, + { + "epoch": 0.8375644537738256, + "grad_norm": 0.7578125, + "learning_rate": 0.00010537899785343266, + "loss": 0.8599, + "step": 32619 + }, + { + "epoch": 0.8375901309697474, + "grad_norm": 0.765625, + "learning_rate": 0.00010537454010934027, + "loss": 0.7664, + "step": 32620 + }, + { + "epoch": 0.8376158081656693, + "grad_norm": 0.984375, + "learning_rate": 0.00010537008235453685, + "loss": 0.881, + "step": 32621 + }, + { + "epoch": 0.8376414853615911, + "grad_norm": 0.82421875, + "learning_rate": 0.0001053656245890314, + "loss": 0.9415, + "step": 32622 + }, + { + "epoch": 0.837667162557513, + "grad_norm": 0.74609375, + "learning_rate": 0.0001053611668128327, + "loss": 0.6943, + "step": 32623 + }, + { + "epoch": 0.8376928397534348, + "grad_norm": 0.7421875, + "learning_rate": 0.0001053567090259497, + "loss": 0.7981, + "step": 32624 + }, + { + "epoch": 0.8377185169493565, + "grad_norm": 0.83203125, + "learning_rate": 0.00010535225122839127, + "loss": 0.8374, + "step": 32625 + }, + { + "epoch": 0.8377441941452783, + "grad_norm": 0.7578125, + "learning_rate": 0.00010534779342016626, + "loss": 0.8968, + "step": 32626 + }, + { + "epoch": 0.8377698713412002, + "grad_norm": 0.78515625, + "learning_rate": 0.0001053433356012836, + "loss": 0.8741, + "step": 32627 + }, + { + "epoch": 0.837795548537122, + "grad_norm": 0.7421875, + "learning_rate": 0.00010533887777175215, + "loss": 0.71, + "step": 32628 + }, + { + "epoch": 0.8378212257330439, + "grad_norm": 0.7890625, + "learning_rate": 0.00010533441993158082, + "loss": 0.8338, + "step": 32629 + }, + { + "epoch": 0.8378469029289657, + "grad_norm": 0.76171875, + "learning_rate": 0.00010532996208077845, + "loss": 0.7117, + "step": 32630 + }, + { + "epoch": 0.8378725801248874, + "grad_norm": 0.8125, + "learning_rate": 0.00010532550421935392, + "loss": 0.8216, + "step": 32631 + }, + { + "epoch": 0.8378982573208092, + "grad_norm": 0.82421875, + "learning_rate": 0.00010532104634731619, + "loss": 0.9723, + "step": 32632 + }, + { + "epoch": 0.8379239345167311, + "grad_norm": 0.76953125, + "learning_rate": 0.00010531658846467408, + "loss": 0.7862, + "step": 32633 + }, + { + "epoch": 0.8379496117126529, + "grad_norm": 0.77734375, + "learning_rate": 0.00010531213057143647, + "loss": 0.8903, + "step": 32634 + }, + { + "epoch": 0.8379752889085748, + "grad_norm": 0.78515625, + "learning_rate": 0.0001053076726676123, + "loss": 0.9095, + "step": 32635 + }, + { + "epoch": 0.8380009661044966, + "grad_norm": 0.85546875, + "learning_rate": 0.00010530321475321039, + "loss": 0.8989, + "step": 32636 + }, + { + "epoch": 0.8380266433004184, + "grad_norm": 0.8203125, + "learning_rate": 0.00010529875682823967, + "loss": 0.7437, + "step": 32637 + }, + { + "epoch": 0.8380523204963402, + "grad_norm": 0.8125, + "learning_rate": 0.00010529429889270899, + "loss": 0.7711, + "step": 32638 + }, + { + "epoch": 0.838077997692262, + "grad_norm": 0.76953125, + "learning_rate": 0.00010528984094662724, + "loss": 0.7808, + "step": 32639 + }, + { + "epoch": 0.8381036748881838, + "grad_norm": 0.875, + "learning_rate": 0.00010528538299000332, + "loss": 0.7354, + "step": 32640 + }, + { + "epoch": 0.8381293520841057, + "grad_norm": 0.8203125, + "learning_rate": 0.00010528092502284614, + "loss": 0.8709, + "step": 32641 + }, + { + "epoch": 0.8381550292800275, + "grad_norm": 0.79296875, + "learning_rate": 0.00010527646704516455, + "loss": 0.7465, + "step": 32642 + }, + { + "epoch": 0.8381807064759493, + "grad_norm": 0.86328125, + "learning_rate": 0.00010527200905696742, + "loss": 0.8781, + "step": 32643 + }, + { + "epoch": 0.8382063836718711, + "grad_norm": 0.83984375, + "learning_rate": 0.00010526755105826363, + "loss": 0.7008, + "step": 32644 + }, + { + "epoch": 0.8382320608677929, + "grad_norm": 0.765625, + "learning_rate": 0.00010526309304906212, + "loss": 0.8542, + "step": 32645 + }, + { + "epoch": 0.8382577380637147, + "grad_norm": 0.6875, + "learning_rate": 0.00010525863502937173, + "loss": 0.7264, + "step": 32646 + }, + { + "epoch": 0.8382834152596366, + "grad_norm": 0.71875, + "learning_rate": 0.00010525417699920135, + "loss": 0.8354, + "step": 32647 + }, + { + "epoch": 0.8383090924555584, + "grad_norm": 0.85546875, + "learning_rate": 0.00010524971895855989, + "loss": 0.8122, + "step": 32648 + }, + { + "epoch": 0.8383347696514802, + "grad_norm": 0.77734375, + "learning_rate": 0.00010524526090745619, + "loss": 0.7137, + "step": 32649 + }, + { + "epoch": 0.8383604468474021, + "grad_norm": 0.8125, + "learning_rate": 0.00010524080284589917, + "loss": 0.9018, + "step": 32650 + }, + { + "epoch": 0.8383861240433238, + "grad_norm": 0.66015625, + "learning_rate": 0.0001052363447738977, + "loss": 0.6798, + "step": 32651 + }, + { + "epoch": 0.8384118012392456, + "grad_norm": 0.77734375, + "learning_rate": 0.00010523188669146066, + "loss": 0.7307, + "step": 32652 + }, + { + "epoch": 0.8384374784351675, + "grad_norm": 0.69921875, + "learning_rate": 0.00010522742859859696, + "loss": 0.7643, + "step": 32653 + }, + { + "epoch": 0.8384631556310893, + "grad_norm": 0.81640625, + "learning_rate": 0.00010522297049531549, + "loss": 0.7294, + "step": 32654 + }, + { + "epoch": 0.8384888328270111, + "grad_norm": 0.7421875, + "learning_rate": 0.00010521851238162505, + "loss": 0.7143, + "step": 32655 + }, + { + "epoch": 0.838514510022933, + "grad_norm": 0.83984375, + "learning_rate": 0.00010521405425753464, + "loss": 0.9013, + "step": 32656 + }, + { + "epoch": 0.8385401872188548, + "grad_norm": 0.71875, + "learning_rate": 0.00010520959612305305, + "loss": 0.6896, + "step": 32657 + }, + { + "epoch": 0.8385658644147765, + "grad_norm": 0.76171875, + "learning_rate": 0.00010520513797818922, + "loss": 0.8157, + "step": 32658 + }, + { + "epoch": 0.8385915416106984, + "grad_norm": 0.7265625, + "learning_rate": 0.00010520067982295202, + "loss": 0.62, + "step": 32659 + }, + { + "epoch": 0.8386172188066202, + "grad_norm": 0.72265625, + "learning_rate": 0.00010519622165735033, + "loss": 0.733, + "step": 32660 + }, + { + "epoch": 0.838642896002542, + "grad_norm": 0.76171875, + "learning_rate": 0.00010519176348139307, + "loss": 0.8467, + "step": 32661 + }, + { + "epoch": 0.8386685731984639, + "grad_norm": 0.86328125, + "learning_rate": 0.00010518730529508909, + "loss": 0.7031, + "step": 32662 + }, + { + "epoch": 0.8386942503943857, + "grad_norm": 0.8125, + "learning_rate": 0.00010518284709844724, + "loss": 0.9785, + "step": 32663 + }, + { + "epoch": 0.8387199275903074, + "grad_norm": 0.8125, + "learning_rate": 0.00010517838889147646, + "loss": 0.8325, + "step": 32664 + }, + { + "epoch": 0.8387456047862293, + "grad_norm": 0.8515625, + "learning_rate": 0.00010517393067418564, + "loss": 0.7703, + "step": 32665 + }, + { + "epoch": 0.8387712819821511, + "grad_norm": 0.7421875, + "learning_rate": 0.00010516947244658361, + "loss": 0.883, + "step": 32666 + }, + { + "epoch": 0.838796959178073, + "grad_norm": 0.78515625, + "learning_rate": 0.00010516501420867932, + "loss": 0.7273, + "step": 32667 + }, + { + "epoch": 0.8388226363739948, + "grad_norm": 0.79296875, + "learning_rate": 0.00010516055596048162, + "loss": 0.8128, + "step": 32668 + }, + { + "epoch": 0.8388483135699166, + "grad_norm": 0.71484375, + "learning_rate": 0.00010515609770199938, + "loss": 0.8152, + "step": 32669 + }, + { + "epoch": 0.8388739907658385, + "grad_norm": 0.88671875, + "learning_rate": 0.00010515163943324151, + "loss": 0.809, + "step": 32670 + }, + { + "epoch": 0.8388996679617602, + "grad_norm": 0.78125, + "learning_rate": 0.00010514718115421689, + "loss": 0.8526, + "step": 32671 + }, + { + "epoch": 0.838925345157682, + "grad_norm": 0.83203125, + "learning_rate": 0.0001051427228649344, + "loss": 0.9659, + "step": 32672 + }, + { + "epoch": 0.8389510223536039, + "grad_norm": 0.7578125, + "learning_rate": 0.00010513826456540295, + "loss": 0.7801, + "step": 32673 + }, + { + "epoch": 0.8389766995495257, + "grad_norm": 0.80078125, + "learning_rate": 0.00010513380625563136, + "loss": 0.7792, + "step": 32674 + }, + { + "epoch": 0.8390023767454475, + "grad_norm": 0.82421875, + "learning_rate": 0.00010512934793562861, + "loss": 0.8051, + "step": 32675 + }, + { + "epoch": 0.8390280539413694, + "grad_norm": 0.8203125, + "learning_rate": 0.00010512488960540348, + "loss": 0.7076, + "step": 32676 + }, + { + "epoch": 0.8390537311372912, + "grad_norm": 0.7421875, + "learning_rate": 0.00010512043126496494, + "loss": 0.8714, + "step": 32677 + }, + { + "epoch": 0.8390794083332129, + "grad_norm": 0.83203125, + "learning_rate": 0.00010511597291432183, + "loss": 0.868, + "step": 32678 + }, + { + "epoch": 0.8391050855291348, + "grad_norm": 0.890625, + "learning_rate": 0.00010511151455348302, + "loss": 0.883, + "step": 32679 + }, + { + "epoch": 0.8391307627250566, + "grad_norm": 0.83984375, + "learning_rate": 0.00010510705618245749, + "loss": 0.8499, + "step": 32680 + }, + { + "epoch": 0.8391564399209784, + "grad_norm": 0.8671875, + "learning_rate": 0.00010510259780125403, + "loss": 0.9627, + "step": 32681 + }, + { + "epoch": 0.8391821171169003, + "grad_norm": 0.77734375, + "learning_rate": 0.00010509813940988154, + "loss": 0.8189, + "step": 32682 + }, + { + "epoch": 0.8392077943128221, + "grad_norm": 0.7578125, + "learning_rate": 0.00010509368100834893, + "loss": 0.7242, + "step": 32683 + }, + { + "epoch": 0.8392334715087438, + "grad_norm": 1.046875, + "learning_rate": 0.00010508922259666505, + "loss": 0.8133, + "step": 32684 + }, + { + "epoch": 0.8392591487046657, + "grad_norm": 0.703125, + "learning_rate": 0.00010508476417483883, + "loss": 0.8411, + "step": 32685 + }, + { + "epoch": 0.8392848259005875, + "grad_norm": 0.7578125, + "learning_rate": 0.00010508030574287915, + "loss": 0.6868, + "step": 32686 + }, + { + "epoch": 0.8393105030965093, + "grad_norm": 0.71875, + "learning_rate": 0.00010507584730079487, + "loss": 0.7282, + "step": 32687 + }, + { + "epoch": 0.8393361802924312, + "grad_norm": 0.86328125, + "learning_rate": 0.00010507138884859487, + "loss": 0.883, + "step": 32688 + }, + { + "epoch": 0.839361857488353, + "grad_norm": 0.81640625, + "learning_rate": 0.00010506693038628806, + "loss": 0.7902, + "step": 32689 + }, + { + "epoch": 0.8393875346842748, + "grad_norm": 0.79296875, + "learning_rate": 0.0001050624719138833, + "loss": 0.7985, + "step": 32690 + }, + { + "epoch": 0.8394132118801966, + "grad_norm": 0.80859375, + "learning_rate": 0.00010505801343138953, + "loss": 0.917, + "step": 32691 + }, + { + "epoch": 0.8394388890761184, + "grad_norm": 0.76953125, + "learning_rate": 0.00010505355493881555, + "loss": 0.7669, + "step": 32692 + }, + { + "epoch": 0.8394645662720402, + "grad_norm": 0.7109375, + "learning_rate": 0.00010504909643617032, + "loss": 0.8185, + "step": 32693 + }, + { + "epoch": 0.8394902434679621, + "grad_norm": 0.78125, + "learning_rate": 0.0001050446379234627, + "loss": 0.8825, + "step": 32694 + }, + { + "epoch": 0.8395159206638839, + "grad_norm": 0.80859375, + "learning_rate": 0.00010504017940070154, + "loss": 0.7344, + "step": 32695 + }, + { + "epoch": 0.8395415978598058, + "grad_norm": 0.78515625, + "learning_rate": 0.0001050357208678958, + "loss": 0.8244, + "step": 32696 + }, + { + "epoch": 0.8395672750557276, + "grad_norm": 0.8046875, + "learning_rate": 0.00010503126232505428, + "loss": 0.9066, + "step": 32697 + }, + { + "epoch": 0.8395929522516493, + "grad_norm": 0.828125, + "learning_rate": 0.00010502680377218593, + "loss": 0.7099, + "step": 32698 + }, + { + "epoch": 0.8396186294475712, + "grad_norm": 0.76171875, + "learning_rate": 0.00010502234520929961, + "loss": 0.7607, + "step": 32699 + }, + { + "epoch": 0.839644306643493, + "grad_norm": 0.82421875, + "learning_rate": 0.00010501788663640422, + "loss": 0.832, + "step": 32700 + }, + { + "epoch": 0.8396699838394148, + "grad_norm": 0.76171875, + "learning_rate": 0.00010501342805350865, + "loss": 0.7997, + "step": 32701 + }, + { + "epoch": 0.8396956610353367, + "grad_norm": 0.8828125, + "learning_rate": 0.00010500896946062175, + "loss": 0.8892, + "step": 32702 + }, + { + "epoch": 0.8397213382312585, + "grad_norm": 0.74609375, + "learning_rate": 0.00010500451085775241, + "loss": 0.6667, + "step": 32703 + }, + { + "epoch": 0.8397470154271802, + "grad_norm": 0.72265625, + "learning_rate": 0.00010500005224490955, + "loss": 0.763, + "step": 32704 + }, + { + "epoch": 0.8397726926231021, + "grad_norm": 0.9296875, + "learning_rate": 0.00010499559362210206, + "loss": 0.7548, + "step": 32705 + }, + { + "epoch": 0.8397983698190239, + "grad_norm": 0.80078125, + "learning_rate": 0.00010499113498933874, + "loss": 0.6877, + "step": 32706 + }, + { + "epoch": 0.8398240470149457, + "grad_norm": 0.796875, + "learning_rate": 0.00010498667634662859, + "loss": 0.8939, + "step": 32707 + }, + { + "epoch": 0.8398497242108676, + "grad_norm": 0.7734375, + "learning_rate": 0.00010498221769398044, + "loss": 0.9743, + "step": 32708 + }, + { + "epoch": 0.8398754014067894, + "grad_norm": 0.80078125, + "learning_rate": 0.00010497775903140317, + "loss": 0.8343, + "step": 32709 + }, + { + "epoch": 0.8399010786027112, + "grad_norm": 0.671875, + "learning_rate": 0.00010497330035890569, + "loss": 0.8037, + "step": 32710 + }, + { + "epoch": 0.839926755798633, + "grad_norm": 0.8125, + "learning_rate": 0.00010496884167649684, + "loss": 0.8907, + "step": 32711 + }, + { + "epoch": 0.8399524329945548, + "grad_norm": 0.77734375, + "learning_rate": 0.00010496438298418556, + "loss": 0.7974, + "step": 32712 + }, + { + "epoch": 0.8399781101904766, + "grad_norm": 0.76171875, + "learning_rate": 0.00010495992428198071, + "loss": 0.7049, + "step": 32713 + }, + { + "epoch": 0.8400037873863985, + "grad_norm": 0.80859375, + "learning_rate": 0.00010495546556989119, + "loss": 0.8268, + "step": 32714 + }, + { + "epoch": 0.8400294645823203, + "grad_norm": 0.78515625, + "learning_rate": 0.00010495100684792586, + "loss": 0.837, + "step": 32715 + }, + { + "epoch": 0.8400551417782421, + "grad_norm": 0.76953125, + "learning_rate": 0.00010494654811609362, + "loss": 0.8208, + "step": 32716 + }, + { + "epoch": 0.840080818974164, + "grad_norm": 0.76171875, + "learning_rate": 0.00010494208937440337, + "loss": 0.8898, + "step": 32717 + }, + { + "epoch": 0.8401064961700857, + "grad_norm": 0.80078125, + "learning_rate": 0.00010493763062286398, + "loss": 0.7238, + "step": 32718 + }, + { + "epoch": 0.8401321733660075, + "grad_norm": 0.8359375, + "learning_rate": 0.0001049331718614843, + "loss": 0.8174, + "step": 32719 + }, + { + "epoch": 0.8401578505619294, + "grad_norm": 0.8203125, + "learning_rate": 0.00010492871309027332, + "loss": 0.9516, + "step": 32720 + }, + { + "epoch": 0.8401835277578512, + "grad_norm": 0.73046875, + "learning_rate": 0.0001049242543092398, + "loss": 0.7733, + "step": 32721 + }, + { + "epoch": 0.840209204953773, + "grad_norm": 0.83984375, + "learning_rate": 0.00010491979551839273, + "loss": 0.7897, + "step": 32722 + }, + { + "epoch": 0.8402348821496949, + "grad_norm": 0.78515625, + "learning_rate": 0.00010491533671774094, + "loss": 0.9366, + "step": 32723 + }, + { + "epoch": 0.8402605593456166, + "grad_norm": 0.83203125, + "learning_rate": 0.0001049108779072933, + "loss": 0.7225, + "step": 32724 + }, + { + "epoch": 0.8402862365415384, + "grad_norm": 0.91796875, + "learning_rate": 0.00010490641908705876, + "loss": 0.8676, + "step": 32725 + }, + { + "epoch": 0.8403119137374603, + "grad_norm": 0.76953125, + "learning_rate": 0.00010490196025704615, + "loss": 0.7321, + "step": 32726 + }, + { + "epoch": 0.8403375909333821, + "grad_norm": 0.7734375, + "learning_rate": 0.0001048975014172644, + "loss": 0.7824, + "step": 32727 + }, + { + "epoch": 0.840363268129304, + "grad_norm": 0.74609375, + "learning_rate": 0.00010489304256772235, + "loss": 0.8614, + "step": 32728 + }, + { + "epoch": 0.8403889453252258, + "grad_norm": 0.79296875, + "learning_rate": 0.00010488858370842888, + "loss": 0.7481, + "step": 32729 + }, + { + "epoch": 0.8404146225211476, + "grad_norm": 0.67578125, + "learning_rate": 0.00010488412483939297, + "loss": 0.8119, + "step": 32730 + }, + { + "epoch": 0.8404402997170694, + "grad_norm": 0.80078125, + "learning_rate": 0.0001048796659606234, + "loss": 0.9102, + "step": 32731 + }, + { + "epoch": 0.8404659769129912, + "grad_norm": 0.7578125, + "learning_rate": 0.00010487520707212908, + "loss": 0.6847, + "step": 32732 + }, + { + "epoch": 0.840491654108913, + "grad_norm": 0.85546875, + "learning_rate": 0.00010487074817391898, + "loss": 0.7811, + "step": 32733 + }, + { + "epoch": 0.8405173313048349, + "grad_norm": 0.79296875, + "learning_rate": 0.00010486628926600187, + "loss": 0.7684, + "step": 32734 + }, + { + "epoch": 0.8405430085007567, + "grad_norm": 0.78125, + "learning_rate": 0.00010486183034838666, + "loss": 0.8372, + "step": 32735 + }, + { + "epoch": 0.8405686856966785, + "grad_norm": 0.765625, + "learning_rate": 0.0001048573714210823, + "loss": 0.776, + "step": 32736 + }, + { + "epoch": 0.8405943628926004, + "grad_norm": 0.7734375, + "learning_rate": 0.0001048529124840976, + "loss": 0.854, + "step": 32737 + }, + { + "epoch": 0.8406200400885221, + "grad_norm": 0.81640625, + "learning_rate": 0.00010484845353744153, + "loss": 0.8611, + "step": 32738 + }, + { + "epoch": 0.8406457172844439, + "grad_norm": 0.80859375, + "learning_rate": 0.00010484399458112293, + "loss": 0.8716, + "step": 32739 + }, + { + "epoch": 0.8406713944803658, + "grad_norm": 0.75, + "learning_rate": 0.00010483953561515065, + "loss": 0.7157, + "step": 32740 + }, + { + "epoch": 0.8406970716762876, + "grad_norm": 0.69140625, + "learning_rate": 0.00010483507663953365, + "loss": 0.6504, + "step": 32741 + }, + { + "epoch": 0.8407227488722094, + "grad_norm": 0.75390625, + "learning_rate": 0.00010483061765428077, + "loss": 0.887, + "step": 32742 + }, + { + "epoch": 0.8407484260681313, + "grad_norm": 0.796875, + "learning_rate": 0.00010482615865940086, + "loss": 0.9372, + "step": 32743 + }, + { + "epoch": 0.840774103264053, + "grad_norm": 0.71484375, + "learning_rate": 0.00010482169965490292, + "loss": 0.7786, + "step": 32744 + }, + { + "epoch": 0.8407997804599748, + "grad_norm": 0.796875, + "learning_rate": 0.00010481724064079572, + "loss": 0.7936, + "step": 32745 + }, + { + "epoch": 0.8408254576558967, + "grad_norm": 0.78515625, + "learning_rate": 0.00010481278161708823, + "loss": 0.8959, + "step": 32746 + }, + { + "epoch": 0.8408511348518185, + "grad_norm": 0.80078125, + "learning_rate": 0.00010480832258378927, + "loss": 0.7494, + "step": 32747 + }, + { + "epoch": 0.8408768120477403, + "grad_norm": 0.73046875, + "learning_rate": 0.00010480386354090776, + "loss": 0.6241, + "step": 32748 + }, + { + "epoch": 0.8409024892436622, + "grad_norm": 0.875, + "learning_rate": 0.0001047994044884526, + "loss": 0.8782, + "step": 32749 + }, + { + "epoch": 0.840928166439584, + "grad_norm": 0.734375, + "learning_rate": 0.00010479494542643264, + "loss": 0.8091, + "step": 32750 + }, + { + "epoch": 0.8409538436355057, + "grad_norm": 0.828125, + "learning_rate": 0.0001047904863548568, + "loss": 0.8579, + "step": 32751 + }, + { + "epoch": 0.8409795208314276, + "grad_norm": 0.765625, + "learning_rate": 0.00010478602727373394, + "loss": 0.7632, + "step": 32752 + }, + { + "epoch": 0.8410051980273494, + "grad_norm": 0.76953125, + "learning_rate": 0.00010478156818307299, + "loss": 0.9024, + "step": 32753 + }, + { + "epoch": 0.8410308752232712, + "grad_norm": 0.7421875, + "learning_rate": 0.00010477710908288281, + "loss": 0.77, + "step": 32754 + }, + { + "epoch": 0.8410565524191931, + "grad_norm": 0.79296875, + "learning_rate": 0.00010477264997317227, + "loss": 0.7983, + "step": 32755 + }, + { + "epoch": 0.8410822296151149, + "grad_norm": 0.77734375, + "learning_rate": 0.00010476819085395022, + "loss": 0.7683, + "step": 32756 + }, + { + "epoch": 0.8411079068110368, + "grad_norm": 0.75390625, + "learning_rate": 0.00010476373172522564, + "loss": 0.8253, + "step": 32757 + }, + { + "epoch": 0.8411335840069585, + "grad_norm": 0.76171875, + "learning_rate": 0.0001047592725870074, + "loss": 0.8284, + "step": 32758 + }, + { + "epoch": 0.8411592612028803, + "grad_norm": 0.87109375, + "learning_rate": 0.00010475481343930431, + "loss": 0.8478, + "step": 32759 + }, + { + "epoch": 0.8411849383988022, + "grad_norm": 0.78125, + "learning_rate": 0.00010475035428212534, + "loss": 0.7274, + "step": 32760 + }, + { + "epoch": 0.841210615594724, + "grad_norm": 0.86328125, + "learning_rate": 0.00010474589511547932, + "loss": 0.8095, + "step": 32761 + }, + { + "epoch": 0.8412362927906458, + "grad_norm": 0.77734375, + "learning_rate": 0.00010474143593937518, + "loss": 0.7646, + "step": 32762 + }, + { + "epoch": 0.8412619699865677, + "grad_norm": 0.82421875, + "learning_rate": 0.0001047369767538218, + "loss": 0.8052, + "step": 32763 + }, + { + "epoch": 0.8412876471824894, + "grad_norm": 0.76953125, + "learning_rate": 0.000104732517558828, + "loss": 0.787, + "step": 32764 + }, + { + "epoch": 0.8413133243784112, + "grad_norm": 0.76953125, + "learning_rate": 0.00010472805835440276, + "loss": 0.7498, + "step": 32765 + }, + { + "epoch": 0.8413390015743331, + "grad_norm": 0.69921875, + "learning_rate": 0.00010472359914055491, + "loss": 0.7135, + "step": 32766 + }, + { + "epoch": 0.8413646787702549, + "grad_norm": 0.79296875, + "learning_rate": 0.00010471913991729337, + "loss": 0.9518, + "step": 32767 + }, + { + "epoch": 0.8413903559661767, + "grad_norm": 0.7421875, + "learning_rate": 0.000104714680684627, + "loss": 0.8408, + "step": 32768 + }, + { + "epoch": 0.8414160331620986, + "grad_norm": 0.76171875, + "learning_rate": 0.00010471022144256468, + "loss": 0.81, + "step": 32769 + }, + { + "epoch": 0.8414417103580204, + "grad_norm": 0.74609375, + "learning_rate": 0.00010470576219111534, + "loss": 0.8227, + "step": 32770 + }, + { + "epoch": 0.8414673875539421, + "grad_norm": 0.765625, + "learning_rate": 0.00010470130293028784, + "loss": 0.8085, + "step": 32771 + }, + { + "epoch": 0.841493064749864, + "grad_norm": 0.75390625, + "learning_rate": 0.00010469684366009105, + "loss": 0.853, + "step": 32772 + }, + { + "epoch": 0.8415187419457858, + "grad_norm": 1.453125, + "learning_rate": 0.0001046923843805339, + "loss": 0.7088, + "step": 32773 + }, + { + "epoch": 0.8415444191417076, + "grad_norm": 0.79296875, + "learning_rate": 0.00010468792509162523, + "loss": 0.8255, + "step": 32774 + }, + { + "epoch": 0.8415700963376295, + "grad_norm": 0.79296875, + "learning_rate": 0.00010468346579337396, + "loss": 0.7494, + "step": 32775 + }, + { + "epoch": 0.8415957735335513, + "grad_norm": 0.734375, + "learning_rate": 0.00010467900648578896, + "loss": 0.8762, + "step": 32776 + }, + { + "epoch": 0.8416214507294731, + "grad_norm": 0.85546875, + "learning_rate": 0.00010467454716887912, + "loss": 0.7039, + "step": 32777 + }, + { + "epoch": 0.8416471279253949, + "grad_norm": 0.7578125, + "learning_rate": 0.00010467008784265333, + "loss": 0.6961, + "step": 32778 + }, + { + "epoch": 0.8416728051213167, + "grad_norm": 0.7578125, + "learning_rate": 0.00010466562850712052, + "loss": 0.7101, + "step": 32779 + }, + { + "epoch": 0.8416984823172385, + "grad_norm": 0.828125, + "learning_rate": 0.00010466116916228948, + "loss": 0.7252, + "step": 32780 + }, + { + "epoch": 0.8417241595131604, + "grad_norm": 0.80859375, + "learning_rate": 0.00010465670980816917, + "loss": 0.7778, + "step": 32781 + }, + { + "epoch": 0.8417498367090822, + "grad_norm": 0.79296875, + "learning_rate": 0.00010465225044476845, + "loss": 0.9585, + "step": 32782 + }, + { + "epoch": 0.841775513905004, + "grad_norm": 0.82421875, + "learning_rate": 0.00010464779107209622, + "loss": 0.9216, + "step": 32783 + }, + { + "epoch": 0.8418011911009258, + "grad_norm": 0.7578125, + "learning_rate": 0.00010464333169016136, + "loss": 0.8299, + "step": 32784 + }, + { + "epoch": 0.8418268682968476, + "grad_norm": 0.7421875, + "learning_rate": 0.00010463887229897273, + "loss": 0.7882, + "step": 32785 + }, + { + "epoch": 0.8418525454927694, + "grad_norm": 0.76171875, + "learning_rate": 0.00010463441289853931, + "loss": 0.7251, + "step": 32786 + }, + { + "epoch": 0.8418782226886913, + "grad_norm": 0.78515625, + "learning_rate": 0.00010462995348886991, + "loss": 0.8116, + "step": 32787 + }, + { + "epoch": 0.8419038998846131, + "grad_norm": 0.78125, + "learning_rate": 0.00010462549406997338, + "loss": 1.0279, + "step": 32788 + }, + { + "epoch": 0.841929577080535, + "grad_norm": 0.765625, + "learning_rate": 0.0001046210346418587, + "loss": 0.8418, + "step": 32789 + }, + { + "epoch": 0.8419552542764568, + "grad_norm": 0.74609375, + "learning_rate": 0.00010461657520453472, + "loss": 0.8795, + "step": 32790 + }, + { + "epoch": 0.8419809314723785, + "grad_norm": 0.953125, + "learning_rate": 0.00010461211575801028, + "loss": 0.8402, + "step": 32791 + }, + { + "epoch": 0.8420066086683003, + "grad_norm": 0.77734375, + "learning_rate": 0.00010460765630229434, + "loss": 0.8053, + "step": 32792 + }, + { + "epoch": 0.8420322858642222, + "grad_norm": 0.734375, + "learning_rate": 0.00010460319683739576, + "loss": 0.9389, + "step": 32793 + }, + { + "epoch": 0.842057963060144, + "grad_norm": 0.765625, + "learning_rate": 0.00010459873736332344, + "loss": 0.8042, + "step": 32794 + }, + { + "epoch": 0.8420836402560659, + "grad_norm": 0.73828125, + "learning_rate": 0.00010459427788008622, + "loss": 0.7818, + "step": 32795 + }, + { + "epoch": 0.8421093174519877, + "grad_norm": 0.90234375, + "learning_rate": 0.00010458981838769302, + "loss": 0.7543, + "step": 32796 + }, + { + "epoch": 0.8421349946479095, + "grad_norm": 0.7734375, + "learning_rate": 0.00010458535888615275, + "loss": 0.826, + "step": 32797 + }, + { + "epoch": 0.8421606718438313, + "grad_norm": 0.8515625, + "learning_rate": 0.00010458089937547426, + "loss": 0.8933, + "step": 32798 + }, + { + "epoch": 0.8421863490397531, + "grad_norm": 0.796875, + "learning_rate": 0.00010457643985566645, + "loss": 0.817, + "step": 32799 + }, + { + "epoch": 0.8422120262356749, + "grad_norm": 0.734375, + "learning_rate": 0.00010457198032673823, + "loss": 0.8757, + "step": 32800 + }, + { + "epoch": 0.8422377034315968, + "grad_norm": 0.75, + "learning_rate": 0.00010456752078869842, + "loss": 0.7749, + "step": 32801 + }, + { + "epoch": 0.8422633806275186, + "grad_norm": 0.91015625, + "learning_rate": 0.000104563061241556, + "loss": 0.8866, + "step": 32802 + }, + { + "epoch": 0.8422890578234404, + "grad_norm": 0.7578125, + "learning_rate": 0.0001045586016853198, + "loss": 0.964, + "step": 32803 + }, + { + "epoch": 0.8423147350193622, + "grad_norm": 0.84765625, + "learning_rate": 0.00010455414211999867, + "loss": 0.8873, + "step": 32804 + }, + { + "epoch": 0.842340412215284, + "grad_norm": 0.71875, + "learning_rate": 0.0001045496825456016, + "loss": 0.7246, + "step": 32805 + }, + { + "epoch": 0.8423660894112058, + "grad_norm": 0.7734375, + "learning_rate": 0.00010454522296213742, + "loss": 0.6967, + "step": 32806 + }, + { + "epoch": 0.8423917666071277, + "grad_norm": 0.75390625, + "learning_rate": 0.00010454076336961502, + "loss": 0.8505, + "step": 32807 + }, + { + "epoch": 0.8424174438030495, + "grad_norm": 0.79296875, + "learning_rate": 0.00010453630376804329, + "loss": 0.7712, + "step": 32808 + }, + { + "epoch": 0.8424431209989713, + "grad_norm": 0.7734375, + "learning_rate": 0.0001045318441574311, + "loss": 0.7744, + "step": 32809 + }, + { + "epoch": 0.8424687981948932, + "grad_norm": 0.72265625, + "learning_rate": 0.00010452738453778734, + "loss": 0.6428, + "step": 32810 + }, + { + "epoch": 0.8424944753908149, + "grad_norm": 0.78125, + "learning_rate": 0.00010452292490912094, + "loss": 0.7949, + "step": 32811 + }, + { + "epoch": 0.8425201525867367, + "grad_norm": 0.74609375, + "learning_rate": 0.00010451846527144076, + "loss": 0.7125, + "step": 32812 + }, + { + "epoch": 0.8425458297826586, + "grad_norm": 0.80859375, + "learning_rate": 0.00010451400562475569, + "loss": 0.8487, + "step": 32813 + }, + { + "epoch": 0.8425715069785804, + "grad_norm": 0.82421875, + "learning_rate": 0.00010450954596907458, + "loss": 0.7342, + "step": 32814 + }, + { + "epoch": 0.8425971841745022, + "grad_norm": 1.296875, + "learning_rate": 0.00010450508630440639, + "loss": 0.7226, + "step": 32815 + }, + { + "epoch": 0.8426228613704241, + "grad_norm": 0.78515625, + "learning_rate": 0.00010450062663075994, + "loss": 0.809, + "step": 32816 + }, + { + "epoch": 0.8426485385663459, + "grad_norm": 0.80859375, + "learning_rate": 0.00010449616694814418, + "loss": 0.7569, + "step": 32817 + }, + { + "epoch": 0.8426742157622676, + "grad_norm": 0.6953125, + "learning_rate": 0.00010449170725656794, + "loss": 0.655, + "step": 32818 + }, + { + "epoch": 0.8426998929581895, + "grad_norm": 0.828125, + "learning_rate": 0.00010448724755604015, + "loss": 0.9641, + "step": 32819 + }, + { + "epoch": 0.8427255701541113, + "grad_norm": 0.83984375, + "learning_rate": 0.00010448278784656966, + "loss": 0.8582, + "step": 32820 + }, + { + "epoch": 0.8427512473500332, + "grad_norm": 0.734375, + "learning_rate": 0.0001044783281281654, + "loss": 0.7246, + "step": 32821 + }, + { + "epoch": 0.842776924545955, + "grad_norm": 0.78125, + "learning_rate": 0.00010447386840083621, + "loss": 0.7618, + "step": 32822 + }, + { + "epoch": 0.8428026017418768, + "grad_norm": 0.7578125, + "learning_rate": 0.00010446940866459102, + "loss": 0.7929, + "step": 32823 + }, + { + "epoch": 0.8428282789377985, + "grad_norm": 0.80078125, + "learning_rate": 0.00010446494891943872, + "loss": 0.8713, + "step": 32824 + }, + { + "epoch": 0.8428539561337204, + "grad_norm": 0.71875, + "learning_rate": 0.00010446048916538818, + "loss": 0.7527, + "step": 32825 + }, + { + "epoch": 0.8428796333296422, + "grad_norm": 0.921875, + "learning_rate": 0.00010445602940244828, + "loss": 0.7926, + "step": 32826 + }, + { + "epoch": 0.842905310525564, + "grad_norm": 0.83984375, + "learning_rate": 0.0001044515696306279, + "loss": 0.7276, + "step": 32827 + }, + { + "epoch": 0.8429309877214859, + "grad_norm": 0.8359375, + "learning_rate": 0.00010444710984993595, + "loss": 0.7462, + "step": 32828 + }, + { + "epoch": 0.8429566649174077, + "grad_norm": 0.84375, + "learning_rate": 0.00010444265006038132, + "loss": 0.9116, + "step": 32829 + }, + { + "epoch": 0.8429823421133296, + "grad_norm": 0.78515625, + "learning_rate": 0.00010443819026197288, + "loss": 0.7966, + "step": 32830 + }, + { + "epoch": 0.8430080193092513, + "grad_norm": 0.796875, + "learning_rate": 0.00010443373045471952, + "loss": 0.8543, + "step": 32831 + }, + { + "epoch": 0.8430336965051731, + "grad_norm": 0.7578125, + "learning_rate": 0.00010442927063863017, + "loss": 0.8301, + "step": 32832 + }, + { + "epoch": 0.843059373701095, + "grad_norm": 1.4453125, + "learning_rate": 0.00010442481081371364, + "loss": 0.7813, + "step": 32833 + }, + { + "epoch": 0.8430850508970168, + "grad_norm": 0.77734375, + "learning_rate": 0.00010442035097997889, + "loss": 0.7947, + "step": 32834 + }, + { + "epoch": 0.8431107280929386, + "grad_norm": 0.78515625, + "learning_rate": 0.00010441589113743477, + "loss": 0.7702, + "step": 32835 + }, + { + "epoch": 0.8431364052888605, + "grad_norm": 0.8359375, + "learning_rate": 0.00010441143128609016, + "loss": 0.8257, + "step": 32836 + }, + { + "epoch": 0.8431620824847823, + "grad_norm": 0.65234375, + "learning_rate": 0.00010440697142595402, + "loss": 0.7042, + "step": 32837 + }, + { + "epoch": 0.843187759680704, + "grad_norm": 0.765625, + "learning_rate": 0.00010440251155703513, + "loss": 0.8274, + "step": 32838 + }, + { + "epoch": 0.8432134368766259, + "grad_norm": 0.765625, + "learning_rate": 0.00010439805167934247, + "loss": 0.7446, + "step": 32839 + }, + { + "epoch": 0.8432391140725477, + "grad_norm": 0.73046875, + "learning_rate": 0.00010439359179288488, + "loss": 0.7838, + "step": 32840 + }, + { + "epoch": 0.8432647912684695, + "grad_norm": 0.796875, + "learning_rate": 0.00010438913189767123, + "loss": 0.7284, + "step": 32841 + }, + { + "epoch": 0.8432904684643914, + "grad_norm": 0.703125, + "learning_rate": 0.00010438467199371047, + "loss": 0.6515, + "step": 32842 + }, + { + "epoch": 0.8433161456603132, + "grad_norm": 0.703125, + "learning_rate": 0.00010438021208101146, + "loss": 0.8598, + "step": 32843 + }, + { + "epoch": 0.8433418228562349, + "grad_norm": 0.77734375, + "learning_rate": 0.00010437575215958305, + "loss": 0.7946, + "step": 32844 + }, + { + "epoch": 0.8433675000521568, + "grad_norm": 0.76953125, + "learning_rate": 0.00010437129222943418, + "loss": 0.8823, + "step": 32845 + }, + { + "epoch": 0.8433931772480786, + "grad_norm": 0.8125, + "learning_rate": 0.00010436683229057371, + "loss": 0.6337, + "step": 32846 + }, + { + "epoch": 0.8434188544440004, + "grad_norm": 0.78515625, + "learning_rate": 0.00010436237234301054, + "loss": 0.76, + "step": 32847 + }, + { + "epoch": 0.8434445316399223, + "grad_norm": 0.73828125, + "learning_rate": 0.00010435791238675357, + "loss": 0.6883, + "step": 32848 + }, + { + "epoch": 0.8434702088358441, + "grad_norm": 0.734375, + "learning_rate": 0.00010435345242181165, + "loss": 0.8242, + "step": 32849 + }, + { + "epoch": 0.843495886031766, + "grad_norm": 0.7421875, + "learning_rate": 0.00010434899244819372, + "loss": 0.8782, + "step": 32850 + }, + { + "epoch": 0.8435215632276877, + "grad_norm": 0.85546875, + "learning_rate": 0.00010434453246590863, + "loss": 0.8495, + "step": 32851 + }, + { + "epoch": 0.8435472404236095, + "grad_norm": 0.80859375, + "learning_rate": 0.00010434007247496527, + "loss": 0.77, + "step": 32852 + }, + { + "epoch": 0.8435729176195313, + "grad_norm": 0.8046875, + "learning_rate": 0.00010433561247537255, + "loss": 0.9027, + "step": 32853 + }, + { + "epoch": 0.8435985948154532, + "grad_norm": 0.7734375, + "learning_rate": 0.00010433115246713934, + "loss": 0.7869, + "step": 32854 + }, + { + "epoch": 0.843624272011375, + "grad_norm": 0.79296875, + "learning_rate": 0.00010432669245027453, + "loss": 0.8645, + "step": 32855 + }, + { + "epoch": 0.8436499492072969, + "grad_norm": 0.71875, + "learning_rate": 0.00010432223242478703, + "loss": 0.732, + "step": 32856 + }, + { + "epoch": 0.8436756264032186, + "grad_norm": 0.71484375, + "learning_rate": 0.00010431777239068567, + "loss": 0.6805, + "step": 32857 + }, + { + "epoch": 0.8437013035991404, + "grad_norm": 0.74609375, + "learning_rate": 0.00010431331234797943, + "loss": 0.7062, + "step": 32858 + }, + { + "epoch": 0.8437269807950623, + "grad_norm": 0.84375, + "learning_rate": 0.0001043088522966771, + "loss": 0.948, + "step": 32859 + }, + { + "epoch": 0.8437526579909841, + "grad_norm": 0.7421875, + "learning_rate": 0.00010430439223678767, + "loss": 0.8577, + "step": 32860 + }, + { + "epoch": 0.8437783351869059, + "grad_norm": 0.70703125, + "learning_rate": 0.00010429993216831995, + "loss": 0.7637, + "step": 32861 + }, + { + "epoch": 0.8438040123828278, + "grad_norm": 0.76953125, + "learning_rate": 0.00010429547209128283, + "loss": 0.7725, + "step": 32862 + }, + { + "epoch": 0.8438296895787496, + "grad_norm": 0.77734375, + "learning_rate": 0.00010429101200568525, + "loss": 0.857, + "step": 32863 + }, + { + "epoch": 0.8438553667746713, + "grad_norm": 0.80859375, + "learning_rate": 0.00010428655191153607, + "loss": 0.9614, + "step": 32864 + }, + { + "epoch": 0.8438810439705932, + "grad_norm": 0.87890625, + "learning_rate": 0.00010428209180884418, + "loss": 0.804, + "step": 32865 + }, + { + "epoch": 0.843906721166515, + "grad_norm": 0.91796875, + "learning_rate": 0.00010427763169761847, + "loss": 0.8909, + "step": 32866 + }, + { + "epoch": 0.8439323983624368, + "grad_norm": 0.73046875, + "learning_rate": 0.00010427317157786782, + "loss": 0.7254, + "step": 32867 + }, + { + "epoch": 0.8439580755583587, + "grad_norm": 0.75390625, + "learning_rate": 0.00010426871144960111, + "loss": 0.7871, + "step": 32868 + }, + { + "epoch": 0.8439837527542805, + "grad_norm": 0.8203125, + "learning_rate": 0.00010426425131282727, + "loss": 0.8527, + "step": 32869 + }, + { + "epoch": 0.8440094299502023, + "grad_norm": 0.81640625, + "learning_rate": 0.00010425979116755515, + "loss": 0.8379, + "step": 32870 + }, + { + "epoch": 0.8440351071461241, + "grad_norm": 0.72265625, + "learning_rate": 0.00010425533101379366, + "loss": 0.7023, + "step": 32871 + }, + { + "epoch": 0.8440607843420459, + "grad_norm": 0.90625, + "learning_rate": 0.00010425087085155168, + "loss": 0.7516, + "step": 32872 + }, + { + "epoch": 0.8440864615379677, + "grad_norm": 0.72265625, + "learning_rate": 0.00010424641068083804, + "loss": 0.871, + "step": 32873 + }, + { + "epoch": 0.8441121387338896, + "grad_norm": 0.74609375, + "learning_rate": 0.00010424195050166176, + "loss": 0.8095, + "step": 32874 + }, + { + "epoch": 0.8441378159298114, + "grad_norm": 0.80859375, + "learning_rate": 0.00010423749031403163, + "loss": 0.7652, + "step": 32875 + }, + { + "epoch": 0.8441634931257332, + "grad_norm": 0.99609375, + "learning_rate": 0.00010423303011795655, + "loss": 0.8709, + "step": 32876 + }, + { + "epoch": 0.844189170321655, + "grad_norm": 0.76953125, + "learning_rate": 0.00010422856991344545, + "loss": 0.8302, + "step": 32877 + }, + { + "epoch": 0.8442148475175768, + "grad_norm": 0.74609375, + "learning_rate": 0.00010422410970050718, + "loss": 0.8264, + "step": 32878 + }, + { + "epoch": 0.8442405247134986, + "grad_norm": 0.81640625, + "learning_rate": 0.00010421964947915066, + "loss": 0.8353, + "step": 32879 + }, + { + "epoch": 0.8442662019094205, + "grad_norm": 0.76953125, + "learning_rate": 0.00010421518924938474, + "loss": 0.7326, + "step": 32880 + }, + { + "epoch": 0.8442918791053423, + "grad_norm": 0.8359375, + "learning_rate": 0.00010421072901121832, + "loss": 0.8401, + "step": 32881 + }, + { + "epoch": 0.8443175563012641, + "grad_norm": 0.8359375, + "learning_rate": 0.0001042062687646603, + "loss": 0.888, + "step": 32882 + }, + { + "epoch": 0.844343233497186, + "grad_norm": 0.72265625, + "learning_rate": 0.00010420180850971958, + "loss": 0.7355, + "step": 32883 + }, + { + "epoch": 0.8443689106931077, + "grad_norm": 0.7265625, + "learning_rate": 0.00010419734824640501, + "loss": 0.8034, + "step": 32884 + }, + { + "epoch": 0.8443945878890295, + "grad_norm": 0.66015625, + "learning_rate": 0.00010419288797472555, + "loss": 0.7537, + "step": 32885 + }, + { + "epoch": 0.8444202650849514, + "grad_norm": 0.8203125, + "learning_rate": 0.00010418842769469002, + "loss": 0.916, + "step": 32886 + }, + { + "epoch": 0.8444459422808732, + "grad_norm": 1.0703125, + "learning_rate": 0.00010418396740630731, + "loss": 0.7631, + "step": 32887 + }, + { + "epoch": 0.844471619476795, + "grad_norm": 0.76171875, + "learning_rate": 0.00010417950710958637, + "loss": 0.7932, + "step": 32888 + }, + { + "epoch": 0.8444972966727169, + "grad_norm": 0.80859375, + "learning_rate": 0.00010417504680453601, + "loss": 0.7393, + "step": 32889 + }, + { + "epoch": 0.8445229738686387, + "grad_norm": 0.6640625, + "learning_rate": 0.00010417058649116518, + "loss": 0.7704, + "step": 32890 + }, + { + "epoch": 0.8445486510645605, + "grad_norm": 0.734375, + "learning_rate": 0.00010416612616948276, + "loss": 0.7555, + "step": 32891 + }, + { + "epoch": 0.8445743282604823, + "grad_norm": 0.80078125, + "learning_rate": 0.0001041616658394976, + "loss": 0.8847, + "step": 32892 + }, + { + "epoch": 0.8446000054564041, + "grad_norm": 0.7734375, + "learning_rate": 0.00010415720550121863, + "loss": 0.8878, + "step": 32893 + }, + { + "epoch": 0.844625682652326, + "grad_norm": 0.72265625, + "learning_rate": 0.00010415274515465472, + "loss": 0.8276, + "step": 32894 + }, + { + "epoch": 0.8446513598482478, + "grad_norm": 0.94140625, + "learning_rate": 0.00010414828479981477, + "loss": 0.7551, + "step": 32895 + }, + { + "epoch": 0.8446770370441696, + "grad_norm": 0.8046875, + "learning_rate": 0.00010414382443670767, + "loss": 0.8034, + "step": 32896 + }, + { + "epoch": 0.8447027142400914, + "grad_norm": 0.77734375, + "learning_rate": 0.00010413936406534229, + "loss": 0.7691, + "step": 32897 + }, + { + "epoch": 0.8447283914360132, + "grad_norm": 0.765625, + "learning_rate": 0.00010413490368572753, + "loss": 0.7622, + "step": 32898 + }, + { + "epoch": 0.844754068631935, + "grad_norm": 0.76171875, + "learning_rate": 0.0001041304432978723, + "loss": 0.7777, + "step": 32899 + }, + { + "epoch": 0.8447797458278569, + "grad_norm": 0.75390625, + "learning_rate": 0.00010412598290178543, + "loss": 0.7557, + "step": 32900 + }, + { + "epoch": 0.8448054230237787, + "grad_norm": 0.73828125, + "learning_rate": 0.00010412152249747588, + "loss": 0.8251, + "step": 32901 + }, + { + "epoch": 0.8448311002197005, + "grad_norm": 0.8359375, + "learning_rate": 0.00010411706208495249, + "loss": 0.8268, + "step": 32902 + }, + { + "epoch": 0.8448567774156224, + "grad_norm": 0.69921875, + "learning_rate": 0.00010411260166422419, + "loss": 0.7858, + "step": 32903 + }, + { + "epoch": 0.8448824546115441, + "grad_norm": 0.84375, + "learning_rate": 0.00010410814123529986, + "loss": 0.8385, + "step": 32904 + }, + { + "epoch": 0.8449081318074659, + "grad_norm": 0.80859375, + "learning_rate": 0.00010410368079818833, + "loss": 0.7568, + "step": 32905 + }, + { + "epoch": 0.8449338090033878, + "grad_norm": 0.78515625, + "learning_rate": 0.00010409922035289856, + "loss": 0.8553, + "step": 32906 + }, + { + "epoch": 0.8449594861993096, + "grad_norm": 0.7265625, + "learning_rate": 0.00010409475989943942, + "loss": 0.7648, + "step": 32907 + }, + { + "epoch": 0.8449851633952314, + "grad_norm": 0.98046875, + "learning_rate": 0.00010409029943781976, + "loss": 0.7464, + "step": 32908 + }, + { + "epoch": 0.8450108405911533, + "grad_norm": 0.80859375, + "learning_rate": 0.00010408583896804853, + "loss": 0.7538, + "step": 32909 + }, + { + "epoch": 0.8450365177870751, + "grad_norm": 0.7734375, + "learning_rate": 0.00010408137849013457, + "loss": 0.7454, + "step": 32910 + }, + { + "epoch": 0.8450621949829968, + "grad_norm": 0.7734375, + "learning_rate": 0.00010407691800408683, + "loss": 0.8379, + "step": 32911 + }, + { + "epoch": 0.8450878721789187, + "grad_norm": 0.77734375, + "learning_rate": 0.00010407245750991415, + "loss": 0.8781, + "step": 32912 + }, + { + "epoch": 0.8451135493748405, + "grad_norm": 0.77734375, + "learning_rate": 0.00010406799700762542, + "loss": 0.8803, + "step": 32913 + }, + { + "epoch": 0.8451392265707623, + "grad_norm": 0.85546875, + "learning_rate": 0.00010406353649722953, + "loss": 0.95, + "step": 32914 + }, + { + "epoch": 0.8451649037666842, + "grad_norm": 0.79296875, + "learning_rate": 0.00010405907597873539, + "loss": 0.832, + "step": 32915 + }, + { + "epoch": 0.845190580962606, + "grad_norm": 0.80859375, + "learning_rate": 0.00010405461545215186, + "loss": 0.853, + "step": 32916 + }, + { + "epoch": 0.8452162581585277, + "grad_norm": 0.78515625, + "learning_rate": 0.00010405015491748789, + "loss": 0.8438, + "step": 32917 + }, + { + "epoch": 0.8452419353544496, + "grad_norm": 0.890625, + "learning_rate": 0.00010404569437475229, + "loss": 0.768, + "step": 32918 + }, + { + "epoch": 0.8452676125503714, + "grad_norm": 0.8515625, + "learning_rate": 0.00010404123382395402, + "loss": 0.7919, + "step": 32919 + }, + { + "epoch": 0.8452932897462933, + "grad_norm": 0.78515625, + "learning_rate": 0.00010403677326510191, + "loss": 0.839, + "step": 32920 + }, + { + "epoch": 0.8453189669422151, + "grad_norm": 0.87890625, + "learning_rate": 0.00010403231269820486, + "loss": 0.7049, + "step": 32921 + }, + { + "epoch": 0.8453446441381369, + "grad_norm": 0.765625, + "learning_rate": 0.00010402785212327179, + "loss": 0.7371, + "step": 32922 + }, + { + "epoch": 0.8453703213340588, + "grad_norm": 0.83203125, + "learning_rate": 0.00010402339154031158, + "loss": 0.8224, + "step": 32923 + }, + { + "epoch": 0.8453959985299805, + "grad_norm": 0.78515625, + "learning_rate": 0.00010401893094933313, + "loss": 0.7448, + "step": 32924 + }, + { + "epoch": 0.8454216757259023, + "grad_norm": 0.80859375, + "learning_rate": 0.00010401447035034528, + "loss": 0.7314, + "step": 32925 + }, + { + "epoch": 0.8454473529218242, + "grad_norm": 0.828125, + "learning_rate": 0.00010401000974335697, + "loss": 0.894, + "step": 32926 + }, + { + "epoch": 0.845473030117746, + "grad_norm": 0.8203125, + "learning_rate": 0.00010400554912837708, + "loss": 0.7847, + "step": 32927 + }, + { + "epoch": 0.8454987073136678, + "grad_norm": 0.8671875, + "learning_rate": 0.00010400108850541447, + "loss": 0.7641, + "step": 32928 + }, + { + "epoch": 0.8455243845095897, + "grad_norm": 0.88671875, + "learning_rate": 0.00010399662787447806, + "loss": 0.9721, + "step": 32929 + }, + { + "epoch": 0.8455500617055115, + "grad_norm": 0.828125, + "learning_rate": 0.00010399216723557675, + "loss": 0.7948, + "step": 32930 + }, + { + "epoch": 0.8455757389014332, + "grad_norm": 0.8046875, + "learning_rate": 0.0001039877065887194, + "loss": 0.6818, + "step": 32931 + }, + { + "epoch": 0.8456014160973551, + "grad_norm": 0.828125, + "learning_rate": 0.00010398324593391491, + "loss": 0.8446, + "step": 32932 + }, + { + "epoch": 0.8456270932932769, + "grad_norm": 0.74609375, + "learning_rate": 0.00010397878527117217, + "loss": 0.7074, + "step": 32933 + }, + { + "epoch": 0.8456527704891987, + "grad_norm": 0.8125, + "learning_rate": 0.00010397432460050007, + "loss": 0.8202, + "step": 32934 + }, + { + "epoch": 0.8456784476851206, + "grad_norm": 0.7734375, + "learning_rate": 0.0001039698639219075, + "loss": 0.7355, + "step": 32935 + }, + { + "epoch": 0.8457041248810424, + "grad_norm": 0.82421875, + "learning_rate": 0.00010396540323540336, + "loss": 0.7794, + "step": 32936 + }, + { + "epoch": 0.8457298020769641, + "grad_norm": 0.73828125, + "learning_rate": 0.00010396094254099653, + "loss": 0.7228, + "step": 32937 + }, + { + "epoch": 0.845755479272886, + "grad_norm": 0.7421875, + "learning_rate": 0.00010395648183869589, + "loss": 0.7455, + "step": 32938 + }, + { + "epoch": 0.8457811564688078, + "grad_norm": 0.83203125, + "learning_rate": 0.00010395202112851033, + "loss": 0.8347, + "step": 32939 + }, + { + "epoch": 0.8458068336647296, + "grad_norm": 0.8203125, + "learning_rate": 0.00010394756041044876, + "loss": 0.7724, + "step": 32940 + }, + { + "epoch": 0.8458325108606515, + "grad_norm": 0.734375, + "learning_rate": 0.00010394309968452006, + "loss": 0.6446, + "step": 32941 + }, + { + "epoch": 0.8458581880565733, + "grad_norm": 0.7890625, + "learning_rate": 0.00010393863895073311, + "loss": 0.9066, + "step": 32942 + }, + { + "epoch": 0.8458838652524951, + "grad_norm": 0.80859375, + "learning_rate": 0.00010393417820909681, + "loss": 0.8886, + "step": 32943 + }, + { + "epoch": 0.8459095424484169, + "grad_norm": 0.81640625, + "learning_rate": 0.00010392971745962008, + "loss": 0.8251, + "step": 32944 + }, + { + "epoch": 0.8459352196443387, + "grad_norm": 0.76953125, + "learning_rate": 0.00010392525670231173, + "loss": 0.9506, + "step": 32945 + }, + { + "epoch": 0.8459608968402605, + "grad_norm": 0.7890625, + "learning_rate": 0.00010392079593718071, + "loss": 0.8679, + "step": 32946 + }, + { + "epoch": 0.8459865740361824, + "grad_norm": 0.7578125, + "learning_rate": 0.0001039163351642359, + "loss": 0.821, + "step": 32947 + }, + { + "epoch": 0.8460122512321042, + "grad_norm": 0.8203125, + "learning_rate": 0.00010391187438348619, + "loss": 0.7055, + "step": 32948 + }, + { + "epoch": 0.846037928428026, + "grad_norm": 0.8046875, + "learning_rate": 0.00010390741359494047, + "loss": 0.7868, + "step": 32949 + }, + { + "epoch": 0.8460636056239479, + "grad_norm": 0.8125, + "learning_rate": 0.00010390295279860764, + "loss": 0.658, + "step": 32950 + }, + { + "epoch": 0.8460892828198696, + "grad_norm": 0.78125, + "learning_rate": 0.00010389849199449658, + "loss": 0.7969, + "step": 32951 + }, + { + "epoch": 0.8461149600157915, + "grad_norm": 0.84765625, + "learning_rate": 0.00010389403118261616, + "loss": 0.8587, + "step": 32952 + }, + { + "epoch": 0.8461406372117133, + "grad_norm": 0.84765625, + "learning_rate": 0.00010388957036297528, + "loss": 0.7498, + "step": 32953 + }, + { + "epoch": 0.8461663144076351, + "grad_norm": 0.8125, + "learning_rate": 0.00010388510953558286, + "loss": 0.8458, + "step": 32954 + }, + { + "epoch": 0.846191991603557, + "grad_norm": 0.77734375, + "learning_rate": 0.00010388064870044776, + "loss": 0.8323, + "step": 32955 + }, + { + "epoch": 0.8462176687994788, + "grad_norm": 0.7265625, + "learning_rate": 0.00010387618785757886, + "loss": 0.6908, + "step": 32956 + }, + { + "epoch": 0.8462433459954005, + "grad_norm": 1.25, + "learning_rate": 0.00010387172700698511, + "loss": 0.697, + "step": 32957 + }, + { + "epoch": 0.8462690231913224, + "grad_norm": 0.71484375, + "learning_rate": 0.0001038672661486753, + "loss": 0.76, + "step": 32958 + }, + { + "epoch": 0.8462947003872442, + "grad_norm": 0.78125, + "learning_rate": 0.00010386280528265842, + "loss": 0.9801, + "step": 32959 + }, + { + "epoch": 0.846320377583166, + "grad_norm": 0.80859375, + "learning_rate": 0.00010385834440894331, + "loss": 0.7898, + "step": 32960 + }, + { + "epoch": 0.8463460547790879, + "grad_norm": 0.8359375, + "learning_rate": 0.00010385388352753885, + "loss": 0.8491, + "step": 32961 + }, + { + "epoch": 0.8463717319750097, + "grad_norm": 0.75, + "learning_rate": 0.00010384942263845397, + "loss": 0.6825, + "step": 32962 + }, + { + "epoch": 0.8463974091709315, + "grad_norm": 0.79296875, + "learning_rate": 0.00010384496174169753, + "loss": 0.8412, + "step": 32963 + }, + { + "epoch": 0.8464230863668533, + "grad_norm": 0.80859375, + "learning_rate": 0.00010384050083727844, + "loss": 0.8741, + "step": 32964 + }, + { + "epoch": 0.8464487635627751, + "grad_norm": 0.78125, + "learning_rate": 0.00010383603992520557, + "loss": 0.8592, + "step": 32965 + }, + { + "epoch": 0.8464744407586969, + "grad_norm": 0.70703125, + "learning_rate": 0.00010383157900548782, + "loss": 0.7834, + "step": 32966 + }, + { + "epoch": 0.8465001179546188, + "grad_norm": 0.76171875, + "learning_rate": 0.00010382711807813406, + "loss": 0.7379, + "step": 32967 + }, + { + "epoch": 0.8465257951505406, + "grad_norm": 0.8046875, + "learning_rate": 0.00010382265714315322, + "loss": 0.7251, + "step": 32968 + }, + { + "epoch": 0.8465514723464624, + "grad_norm": 0.76953125, + "learning_rate": 0.00010381819620055415, + "loss": 0.7659, + "step": 32969 + }, + { + "epoch": 0.8465771495423843, + "grad_norm": 0.8125, + "learning_rate": 0.00010381373525034581, + "loss": 0.725, + "step": 32970 + }, + { + "epoch": 0.846602826738306, + "grad_norm": 0.73046875, + "learning_rate": 0.00010380927429253697, + "loss": 0.8745, + "step": 32971 + }, + { + "epoch": 0.8466285039342278, + "grad_norm": 0.83203125, + "learning_rate": 0.00010380481332713665, + "loss": 0.752, + "step": 32972 + }, + { + "epoch": 0.8466541811301497, + "grad_norm": 0.8515625, + "learning_rate": 0.00010380035235415367, + "loss": 0.735, + "step": 32973 + }, + { + "epoch": 0.8466798583260715, + "grad_norm": 0.78125, + "learning_rate": 0.00010379589137359689, + "loss": 0.6988, + "step": 32974 + }, + { + "epoch": 0.8467055355219933, + "grad_norm": 0.78515625, + "learning_rate": 0.00010379143038547527, + "loss": 0.8713, + "step": 32975 + }, + { + "epoch": 0.8467312127179152, + "grad_norm": 0.77734375, + "learning_rate": 0.00010378696938979768, + "loss": 0.8549, + "step": 32976 + }, + { + "epoch": 0.8467568899138369, + "grad_norm": 0.73046875, + "learning_rate": 0.000103782508386573, + "loss": 0.7611, + "step": 32977 + }, + { + "epoch": 0.8467825671097587, + "grad_norm": 0.77734375, + "learning_rate": 0.00010377804737581011, + "loss": 0.8166, + "step": 32978 + }, + { + "epoch": 0.8468082443056806, + "grad_norm": 0.7734375, + "learning_rate": 0.0001037735863575179, + "loss": 0.7803, + "step": 32979 + }, + { + "epoch": 0.8468339215016024, + "grad_norm": 0.76171875, + "learning_rate": 0.0001037691253317053, + "loss": 0.7833, + "step": 32980 + }, + { + "epoch": 0.8468595986975243, + "grad_norm": 0.78515625, + "learning_rate": 0.00010376466429838116, + "loss": 0.7655, + "step": 32981 + }, + { + "epoch": 0.8468852758934461, + "grad_norm": 0.76953125, + "learning_rate": 0.00010376020325755438, + "loss": 0.8757, + "step": 32982 + }, + { + "epoch": 0.8469109530893679, + "grad_norm": 0.71875, + "learning_rate": 0.00010375574220923389, + "loss": 0.6869, + "step": 32983 + }, + { + "epoch": 0.8469366302852896, + "grad_norm": 0.796875, + "learning_rate": 0.00010375128115342852, + "loss": 0.8, + "step": 32984 + }, + { + "epoch": 0.8469623074812115, + "grad_norm": 0.73046875, + "learning_rate": 0.00010374682009014718, + "loss": 0.7587, + "step": 32985 + }, + { + "epoch": 0.8469879846771333, + "grad_norm": 0.81640625, + "learning_rate": 0.00010374235901939877, + "loss": 1.0166, + "step": 32986 + }, + { + "epoch": 0.8470136618730552, + "grad_norm": 0.7578125, + "learning_rate": 0.00010373789794119216, + "loss": 0.8028, + "step": 32987 + }, + { + "epoch": 0.847039339068977, + "grad_norm": 0.73046875, + "learning_rate": 0.00010373343685553627, + "loss": 0.7116, + "step": 32988 + }, + { + "epoch": 0.8470650162648988, + "grad_norm": 0.75, + "learning_rate": 0.00010372897576244001, + "loss": 0.778, + "step": 32989 + }, + { + "epoch": 0.8470906934608207, + "grad_norm": 0.79296875, + "learning_rate": 0.0001037245146619122, + "loss": 0.6842, + "step": 32990 + }, + { + "epoch": 0.8471163706567424, + "grad_norm": 0.75390625, + "learning_rate": 0.00010372005355396179, + "loss": 0.765, + "step": 32991 + }, + { + "epoch": 0.8471420478526642, + "grad_norm": 0.76953125, + "learning_rate": 0.00010371559243859763, + "loss": 0.8581, + "step": 32992 + }, + { + "epoch": 0.8471677250485861, + "grad_norm": 0.75390625, + "learning_rate": 0.00010371113131582864, + "loss": 0.7515, + "step": 32993 + }, + { + "epoch": 0.8471934022445079, + "grad_norm": 0.82421875, + "learning_rate": 0.00010370667018566369, + "loss": 0.7125, + "step": 32994 + }, + { + "epoch": 0.8472190794404297, + "grad_norm": 0.7265625, + "learning_rate": 0.0001037022090481117, + "loss": 0.8156, + "step": 32995 + }, + { + "epoch": 0.8472447566363516, + "grad_norm": 0.8046875, + "learning_rate": 0.00010369774790318152, + "loss": 0.9542, + "step": 32996 + }, + { + "epoch": 0.8472704338322733, + "grad_norm": 0.80859375, + "learning_rate": 0.00010369328675088208, + "loss": 0.8312, + "step": 32997 + }, + { + "epoch": 0.8472961110281951, + "grad_norm": 0.78515625, + "learning_rate": 0.00010368882559122223, + "loss": 0.9105, + "step": 32998 + }, + { + "epoch": 0.847321788224117, + "grad_norm": 0.74609375, + "learning_rate": 0.00010368436442421092, + "loss": 0.766, + "step": 32999 + }, + { + "epoch": 0.8473474654200388, + "grad_norm": 0.83203125, + "learning_rate": 0.000103679903249857, + "loss": 0.8133, + "step": 33000 + }, + { + "epoch": 0.8473474654200388, + "eval_loss": 0.8028781414031982, + "eval_runtime": 351.1822, + "eval_samples_per_second": 28.475, + "eval_steps_per_second": 0.891, + "step": 33000 + }, + { + "epoch": 0.8473731426159606, + "grad_norm": 0.8203125, + "learning_rate": 0.0001036754420681693, + "loss": 0.7612, + "step": 33001 + }, + { + "epoch": 0.8473988198118825, + "grad_norm": 0.76953125, + "learning_rate": 0.00010367098087915686, + "loss": 0.737, + "step": 33002 + }, + { + "epoch": 0.8474244970078043, + "grad_norm": 0.8359375, + "learning_rate": 0.00010366651968282843, + "loss": 0.7196, + "step": 33003 + }, + { + "epoch": 0.847450174203726, + "grad_norm": 0.73046875, + "learning_rate": 0.00010366205847919299, + "loss": 0.6937, + "step": 33004 + }, + { + "epoch": 0.8474758513996479, + "grad_norm": 0.76953125, + "learning_rate": 0.00010365759726825941, + "loss": 0.9479, + "step": 33005 + }, + { + "epoch": 0.8475015285955697, + "grad_norm": 0.76171875, + "learning_rate": 0.00010365313605003653, + "loss": 0.8484, + "step": 33006 + }, + { + "epoch": 0.8475272057914915, + "grad_norm": 0.78515625, + "learning_rate": 0.00010364867482453328, + "loss": 0.6601, + "step": 33007 + }, + { + "epoch": 0.8475528829874134, + "grad_norm": 0.73828125, + "learning_rate": 0.00010364421359175858, + "loss": 0.7863, + "step": 33008 + }, + { + "epoch": 0.8475785601833352, + "grad_norm": 0.7734375, + "learning_rate": 0.00010363975235172128, + "loss": 0.8184, + "step": 33009 + }, + { + "epoch": 0.847604237379257, + "grad_norm": 0.75, + "learning_rate": 0.00010363529110443028, + "loss": 0.81, + "step": 33010 + }, + { + "epoch": 0.8476299145751788, + "grad_norm": 0.84375, + "learning_rate": 0.00010363082984989447, + "loss": 0.912, + "step": 33011 + }, + { + "epoch": 0.8476555917711006, + "grad_norm": 0.8203125, + "learning_rate": 0.00010362636858812275, + "loss": 0.8423, + "step": 33012 + }, + { + "epoch": 0.8476812689670224, + "grad_norm": 0.7890625, + "learning_rate": 0.00010362190731912402, + "loss": 0.6282, + "step": 33013 + }, + { + "epoch": 0.8477069461629443, + "grad_norm": 0.76171875, + "learning_rate": 0.00010361744604290713, + "loss": 0.7137, + "step": 33014 + }, + { + "epoch": 0.8477326233588661, + "grad_norm": 0.87890625, + "learning_rate": 0.000103612984759481, + "loss": 0.9701, + "step": 33015 + }, + { + "epoch": 0.847758300554788, + "grad_norm": 0.7578125, + "learning_rate": 0.00010360852346885453, + "loss": 0.882, + "step": 33016 + }, + { + "epoch": 0.8477839777507097, + "grad_norm": 0.765625, + "learning_rate": 0.0001036040621710366, + "loss": 0.8773, + "step": 33017 + }, + { + "epoch": 0.8478096549466315, + "grad_norm": 0.734375, + "learning_rate": 0.00010359960086603611, + "loss": 0.7849, + "step": 33018 + }, + { + "epoch": 0.8478353321425534, + "grad_norm": 0.8046875, + "learning_rate": 0.00010359513955386189, + "loss": 0.7611, + "step": 33019 + }, + { + "epoch": 0.8478610093384752, + "grad_norm": 0.79296875, + "learning_rate": 0.00010359067823452294, + "loss": 0.7451, + "step": 33020 + }, + { + "epoch": 0.847886686534397, + "grad_norm": 0.703125, + "learning_rate": 0.00010358621690802806, + "loss": 0.69, + "step": 33021 + }, + { + "epoch": 0.8479123637303189, + "grad_norm": 0.83203125, + "learning_rate": 0.00010358175557438616, + "loss": 0.7327, + "step": 33022 + }, + { + "epoch": 0.8479380409262407, + "grad_norm": 0.76953125, + "learning_rate": 0.0001035772942336062, + "loss": 0.8881, + "step": 33023 + }, + { + "epoch": 0.8479637181221624, + "grad_norm": 0.78515625, + "learning_rate": 0.00010357283288569697, + "loss": 0.7718, + "step": 33024 + }, + { + "epoch": 0.8479893953180843, + "grad_norm": 0.75, + "learning_rate": 0.0001035683715306674, + "loss": 0.6928, + "step": 33025 + }, + { + "epoch": 0.8480150725140061, + "grad_norm": 0.73828125, + "learning_rate": 0.00010356391016852643, + "loss": 0.7148, + "step": 33026 + }, + { + "epoch": 0.8480407497099279, + "grad_norm": 0.84375, + "learning_rate": 0.00010355944879928286, + "loss": 0.8795, + "step": 33027 + }, + { + "epoch": 0.8480664269058498, + "grad_norm": 0.79296875, + "learning_rate": 0.00010355498742294567, + "loss": 0.837, + "step": 33028 + }, + { + "epoch": 0.8480921041017716, + "grad_norm": 0.8203125, + "learning_rate": 0.0001035505260395237, + "loss": 0.8496, + "step": 33029 + }, + { + "epoch": 0.8481177812976934, + "grad_norm": 0.73828125, + "learning_rate": 0.00010354606464902585, + "loss": 0.7195, + "step": 33030 + }, + { + "epoch": 0.8481434584936152, + "grad_norm": 0.8125, + "learning_rate": 0.00010354160325146101, + "loss": 0.7689, + "step": 33031 + }, + { + "epoch": 0.848169135689537, + "grad_norm": 0.78515625, + "learning_rate": 0.00010353714184683809, + "loss": 0.6372, + "step": 33032 + }, + { + "epoch": 0.8481948128854588, + "grad_norm": 0.78125, + "learning_rate": 0.00010353268043516593, + "loss": 0.7723, + "step": 33033 + }, + { + "epoch": 0.8482204900813807, + "grad_norm": 0.875, + "learning_rate": 0.00010352821901645348, + "loss": 0.8477, + "step": 33034 + }, + { + "epoch": 0.8482461672773025, + "grad_norm": 0.81640625, + "learning_rate": 0.0001035237575907096, + "loss": 0.9396, + "step": 33035 + }, + { + "epoch": 0.8482718444732243, + "grad_norm": 0.796875, + "learning_rate": 0.00010351929615794323, + "loss": 0.7041, + "step": 33036 + }, + { + "epoch": 0.8482975216691461, + "grad_norm": 0.79296875, + "learning_rate": 0.00010351483471816319, + "loss": 0.8339, + "step": 33037 + }, + { + "epoch": 0.8483231988650679, + "grad_norm": 0.79296875, + "learning_rate": 0.00010351037327137838, + "loss": 0.6995, + "step": 33038 + }, + { + "epoch": 0.8483488760609897, + "grad_norm": 0.76171875, + "learning_rate": 0.00010350591181759775, + "loss": 0.7836, + "step": 33039 + }, + { + "epoch": 0.8483745532569116, + "grad_norm": 0.87109375, + "learning_rate": 0.00010350145035683013, + "loss": 0.8048, + "step": 33040 + }, + { + "epoch": 0.8484002304528334, + "grad_norm": 0.78125, + "learning_rate": 0.00010349698888908443, + "loss": 0.6613, + "step": 33041 + }, + { + "epoch": 0.8484259076487553, + "grad_norm": 0.78125, + "learning_rate": 0.00010349252741436959, + "loss": 0.841, + "step": 33042 + }, + { + "epoch": 0.8484515848446771, + "grad_norm": 0.828125, + "learning_rate": 0.0001034880659326944, + "loss": 0.918, + "step": 33043 + }, + { + "epoch": 0.8484772620405988, + "grad_norm": 0.82421875, + "learning_rate": 0.00010348360444406783, + "loss": 0.848, + "step": 33044 + }, + { + "epoch": 0.8485029392365206, + "grad_norm": 0.76171875, + "learning_rate": 0.00010347914294849879, + "loss": 0.732, + "step": 33045 + }, + { + "epoch": 0.8485286164324425, + "grad_norm": 0.8828125, + "learning_rate": 0.00010347468144599609, + "loss": 0.8398, + "step": 33046 + }, + { + "epoch": 0.8485542936283643, + "grad_norm": 0.76171875, + "learning_rate": 0.00010347021993656867, + "loss": 0.7968, + "step": 33047 + }, + { + "epoch": 0.8485799708242862, + "grad_norm": 0.859375, + "learning_rate": 0.00010346575842022541, + "loss": 0.8746, + "step": 33048 + }, + { + "epoch": 0.848605648020208, + "grad_norm": 0.71484375, + "learning_rate": 0.00010346129689697523, + "loss": 0.7124, + "step": 33049 + }, + { + "epoch": 0.8486313252161297, + "grad_norm": 0.84765625, + "learning_rate": 0.000103456835366827, + "loss": 0.882, + "step": 33050 + }, + { + "epoch": 0.8486570024120516, + "grad_norm": 0.7265625, + "learning_rate": 0.00010345237382978956, + "loss": 0.6794, + "step": 33051 + }, + { + "epoch": 0.8486826796079734, + "grad_norm": 0.7734375, + "learning_rate": 0.0001034479122858719, + "loss": 0.8842, + "step": 33052 + }, + { + "epoch": 0.8487083568038952, + "grad_norm": 0.8515625, + "learning_rate": 0.00010344345073508286, + "loss": 0.9102, + "step": 33053 + }, + { + "epoch": 0.8487340339998171, + "grad_norm": 0.78125, + "learning_rate": 0.00010343898917743128, + "loss": 0.769, + "step": 33054 + }, + { + "epoch": 0.8487597111957389, + "grad_norm": 1.234375, + "learning_rate": 0.00010343452761292618, + "loss": 0.7539, + "step": 33055 + }, + { + "epoch": 0.8487853883916607, + "grad_norm": 0.8125, + "learning_rate": 0.00010343006604157635, + "loss": 0.7536, + "step": 33056 + }, + { + "epoch": 0.8488110655875825, + "grad_norm": 0.8359375, + "learning_rate": 0.00010342560446339068, + "loss": 0.932, + "step": 33057 + }, + { + "epoch": 0.8488367427835043, + "grad_norm": 0.8359375, + "learning_rate": 0.00010342114287837811, + "loss": 0.7412, + "step": 33058 + }, + { + "epoch": 0.8488624199794261, + "grad_norm": 0.84375, + "learning_rate": 0.00010341668128654751, + "loss": 0.9348, + "step": 33059 + }, + { + "epoch": 0.848888097175348, + "grad_norm": 0.78515625, + "learning_rate": 0.00010341221968790778, + "loss": 0.8616, + "step": 33060 + }, + { + "epoch": 0.8489137743712698, + "grad_norm": 0.703125, + "learning_rate": 0.0001034077580824678, + "loss": 0.7647, + "step": 33061 + }, + { + "epoch": 0.8489394515671916, + "grad_norm": 0.8671875, + "learning_rate": 0.00010340329647023648, + "loss": 0.7538, + "step": 33062 + }, + { + "epoch": 0.8489651287631135, + "grad_norm": 0.84375, + "learning_rate": 0.00010339883485122269, + "loss": 0.7598, + "step": 33063 + }, + { + "epoch": 0.8489908059590352, + "grad_norm": 0.78125, + "learning_rate": 0.0001033943732254353, + "loss": 0.798, + "step": 33064 + }, + { + "epoch": 0.849016483154957, + "grad_norm": 0.765625, + "learning_rate": 0.00010338991159288327, + "loss": 0.8098, + "step": 33065 + }, + { + "epoch": 0.8490421603508789, + "grad_norm": 1.953125, + "learning_rate": 0.00010338544995357544, + "loss": 0.8596, + "step": 33066 + }, + { + "epoch": 0.8490678375468007, + "grad_norm": 0.8046875, + "learning_rate": 0.00010338098830752068, + "loss": 0.8745, + "step": 33067 + }, + { + "epoch": 0.8490935147427225, + "grad_norm": 0.71875, + "learning_rate": 0.00010337652665472795, + "loss": 0.7776, + "step": 33068 + }, + { + "epoch": 0.8491191919386444, + "grad_norm": 0.7890625, + "learning_rate": 0.00010337206499520613, + "loss": 0.8043, + "step": 33069 + }, + { + "epoch": 0.8491448691345661, + "grad_norm": 0.82421875, + "learning_rate": 0.00010336760332896405, + "loss": 0.7361, + "step": 33070 + }, + { + "epoch": 0.8491705463304879, + "grad_norm": 0.81640625, + "learning_rate": 0.00010336314165601066, + "loss": 0.7195, + "step": 33071 + }, + { + "epoch": 0.8491962235264098, + "grad_norm": 0.79296875, + "learning_rate": 0.0001033586799763548, + "loss": 0.9218, + "step": 33072 + }, + { + "epoch": 0.8492219007223316, + "grad_norm": 0.78515625, + "learning_rate": 0.00010335421829000542, + "loss": 0.8262, + "step": 33073 + }, + { + "epoch": 0.8492475779182534, + "grad_norm": 0.82421875, + "learning_rate": 0.0001033497565969714, + "loss": 0.6999, + "step": 33074 + }, + { + "epoch": 0.8492732551141753, + "grad_norm": 0.80078125, + "learning_rate": 0.00010334529489726162, + "loss": 0.8547, + "step": 33075 + }, + { + "epoch": 0.8492989323100971, + "grad_norm": 0.69921875, + "learning_rate": 0.00010334083319088495, + "loss": 0.7493, + "step": 33076 + }, + { + "epoch": 0.8493246095060188, + "grad_norm": 0.76171875, + "learning_rate": 0.00010333637147785029, + "loss": 0.7621, + "step": 33077 + }, + { + "epoch": 0.8493502867019407, + "grad_norm": 0.8046875, + "learning_rate": 0.00010333190975816655, + "loss": 0.734, + "step": 33078 + }, + { + "epoch": 0.8493759638978625, + "grad_norm": 0.83203125, + "learning_rate": 0.00010332744803184262, + "loss": 0.8539, + "step": 33079 + }, + { + "epoch": 0.8494016410937844, + "grad_norm": 0.7734375, + "learning_rate": 0.0001033229862988874, + "loss": 0.6305, + "step": 33080 + }, + { + "epoch": 0.8494273182897062, + "grad_norm": 0.7578125, + "learning_rate": 0.00010331852455930974, + "loss": 0.9046, + "step": 33081 + }, + { + "epoch": 0.849452995485628, + "grad_norm": 0.8046875, + "learning_rate": 0.00010331406281311859, + "loss": 0.9005, + "step": 33082 + }, + { + "epoch": 0.8494786726815499, + "grad_norm": 0.78515625, + "learning_rate": 0.0001033096010603228, + "loss": 0.8578, + "step": 33083 + }, + { + "epoch": 0.8495043498774716, + "grad_norm": 0.7734375, + "learning_rate": 0.00010330513930093126, + "loss": 0.8394, + "step": 33084 + }, + { + "epoch": 0.8495300270733934, + "grad_norm": 1.484375, + "learning_rate": 0.0001033006775349529, + "loss": 0.818, + "step": 33085 + }, + { + "epoch": 0.8495557042693153, + "grad_norm": 0.734375, + "learning_rate": 0.00010329621576239656, + "loss": 0.7048, + "step": 33086 + }, + { + "epoch": 0.8495813814652371, + "grad_norm": 0.6875, + "learning_rate": 0.00010329175398327117, + "loss": 0.7424, + "step": 33087 + }, + { + "epoch": 0.8496070586611589, + "grad_norm": 0.703125, + "learning_rate": 0.00010328729219758564, + "loss": 0.6765, + "step": 33088 + }, + { + "epoch": 0.8496327358570808, + "grad_norm": 0.828125, + "learning_rate": 0.0001032828304053488, + "loss": 0.9625, + "step": 33089 + }, + { + "epoch": 0.8496584130530025, + "grad_norm": 0.78125, + "learning_rate": 0.00010327836860656961, + "loss": 0.8732, + "step": 33090 + }, + { + "epoch": 0.8496840902489243, + "grad_norm": 0.80078125, + "learning_rate": 0.00010327390680125689, + "loss": 0.8248, + "step": 33091 + }, + { + "epoch": 0.8497097674448462, + "grad_norm": 0.90234375, + "learning_rate": 0.00010326944498941957, + "loss": 0.8154, + "step": 33092 + }, + { + "epoch": 0.849735444640768, + "grad_norm": 0.7890625, + "learning_rate": 0.00010326498317106657, + "loss": 0.7624, + "step": 33093 + }, + { + "epoch": 0.8497611218366898, + "grad_norm": 0.75390625, + "learning_rate": 0.00010326052134620672, + "loss": 0.7714, + "step": 33094 + }, + { + "epoch": 0.8497867990326117, + "grad_norm": 0.76953125, + "learning_rate": 0.000103256059514849, + "loss": 0.8754, + "step": 33095 + }, + { + "epoch": 0.8498124762285335, + "grad_norm": 0.94140625, + "learning_rate": 0.00010325159767700219, + "loss": 1.058, + "step": 33096 + }, + { + "epoch": 0.8498381534244552, + "grad_norm": 0.75390625, + "learning_rate": 0.00010324713583267526, + "loss": 0.7572, + "step": 33097 + }, + { + "epoch": 0.8498638306203771, + "grad_norm": 0.7421875, + "learning_rate": 0.0001032426739818771, + "loss": 0.7507, + "step": 33098 + }, + { + "epoch": 0.8498895078162989, + "grad_norm": 0.74609375, + "learning_rate": 0.00010323821212461655, + "loss": 0.7001, + "step": 33099 + }, + { + "epoch": 0.8499151850122207, + "grad_norm": 0.67578125, + "learning_rate": 0.00010323375026090256, + "loss": 0.7881, + "step": 33100 + }, + { + "epoch": 0.8499408622081426, + "grad_norm": 0.78125, + "learning_rate": 0.000103229288390744, + "loss": 0.7931, + "step": 33101 + }, + { + "epoch": 0.8499665394040644, + "grad_norm": 0.8515625, + "learning_rate": 0.00010322482651414975, + "loss": 0.7996, + "step": 33102 + }, + { + "epoch": 0.8499922165999863, + "grad_norm": 0.8203125, + "learning_rate": 0.00010322036463112872, + "loss": 0.7683, + "step": 33103 + }, + { + "epoch": 0.850017893795908, + "grad_norm": 0.76171875, + "learning_rate": 0.00010321590274168976, + "loss": 0.8877, + "step": 33104 + }, + { + "epoch": 0.8500435709918298, + "grad_norm": 0.74609375, + "learning_rate": 0.00010321144084584182, + "loss": 0.675, + "step": 33105 + }, + { + "epoch": 0.8500692481877516, + "grad_norm": 0.76171875, + "learning_rate": 0.00010320697894359379, + "loss": 0.749, + "step": 33106 + }, + { + "epoch": 0.8500949253836735, + "grad_norm": 0.765625, + "learning_rate": 0.00010320251703495449, + "loss": 0.8524, + "step": 33107 + }, + { + "epoch": 0.8501206025795953, + "grad_norm": 0.79296875, + "learning_rate": 0.00010319805511993292, + "loss": 0.7159, + "step": 33108 + }, + { + "epoch": 0.8501462797755172, + "grad_norm": 0.91015625, + "learning_rate": 0.00010319359319853787, + "loss": 0.9811, + "step": 33109 + }, + { + "epoch": 0.8501719569714389, + "grad_norm": 0.8125, + "learning_rate": 0.00010318913127077832, + "loss": 0.8444, + "step": 33110 + }, + { + "epoch": 0.8501976341673607, + "grad_norm": 0.7890625, + "learning_rate": 0.00010318466933666311, + "loss": 0.8022, + "step": 33111 + }, + { + "epoch": 0.8502233113632826, + "grad_norm": 0.79296875, + "learning_rate": 0.0001031802073962011, + "loss": 0.7618, + "step": 33112 + }, + { + "epoch": 0.8502489885592044, + "grad_norm": 0.7265625, + "learning_rate": 0.00010317574544940125, + "loss": 0.7145, + "step": 33113 + }, + { + "epoch": 0.8502746657551262, + "grad_norm": 0.79296875, + "learning_rate": 0.00010317128349627243, + "loss": 0.8847, + "step": 33114 + }, + { + "epoch": 0.8503003429510481, + "grad_norm": 0.8046875, + "learning_rate": 0.00010316682153682354, + "loss": 0.8411, + "step": 33115 + }, + { + "epoch": 0.8503260201469699, + "grad_norm": 0.88671875, + "learning_rate": 0.00010316235957106346, + "loss": 0.7983, + "step": 33116 + }, + { + "epoch": 0.8503516973428916, + "grad_norm": 0.73046875, + "learning_rate": 0.00010315789759900107, + "loss": 0.8482, + "step": 33117 + }, + { + "epoch": 0.8503773745388135, + "grad_norm": 0.765625, + "learning_rate": 0.00010315343562064526, + "loss": 0.7908, + "step": 33118 + }, + { + "epoch": 0.8504030517347353, + "grad_norm": 0.8125, + "learning_rate": 0.00010314897363600496, + "loss": 0.7589, + "step": 33119 + }, + { + "epoch": 0.8504287289306571, + "grad_norm": 0.9296875, + "learning_rate": 0.00010314451164508901, + "loss": 0.8358, + "step": 33120 + }, + { + "epoch": 0.850454406126579, + "grad_norm": 0.8203125, + "learning_rate": 0.00010314004964790638, + "loss": 0.809, + "step": 33121 + }, + { + "epoch": 0.8504800833225008, + "grad_norm": 0.7109375, + "learning_rate": 0.00010313558764446589, + "loss": 0.7881, + "step": 33122 + }, + { + "epoch": 0.8505057605184226, + "grad_norm": 0.83203125, + "learning_rate": 0.00010313112563477645, + "loss": 0.861, + "step": 33123 + }, + { + "epoch": 0.8505314377143444, + "grad_norm": 0.66015625, + "learning_rate": 0.00010312666361884697, + "loss": 0.623, + "step": 33124 + }, + { + "epoch": 0.8505571149102662, + "grad_norm": 0.79296875, + "learning_rate": 0.00010312220159668633, + "loss": 1.0032, + "step": 33125 + }, + { + "epoch": 0.850582792106188, + "grad_norm": 0.8125, + "learning_rate": 0.00010311773956830342, + "loss": 0.8185, + "step": 33126 + }, + { + "epoch": 0.8506084693021099, + "grad_norm": 0.78515625, + "learning_rate": 0.00010311327753370714, + "loss": 0.7767, + "step": 33127 + }, + { + "epoch": 0.8506341464980317, + "grad_norm": 0.76953125, + "learning_rate": 0.00010310881549290639, + "loss": 0.8367, + "step": 33128 + }, + { + "epoch": 0.8506598236939535, + "grad_norm": 0.76171875, + "learning_rate": 0.00010310435344591003, + "loss": 0.88, + "step": 33129 + }, + { + "epoch": 0.8506855008898753, + "grad_norm": 0.74609375, + "learning_rate": 0.000103099891392727, + "loss": 0.8355, + "step": 33130 + }, + { + "epoch": 0.8507111780857971, + "grad_norm": 0.890625, + "learning_rate": 0.00010309542933336613, + "loss": 0.7916, + "step": 33131 + }, + { + "epoch": 0.8507368552817189, + "grad_norm": 0.78125, + "learning_rate": 0.00010309096726783637, + "loss": 0.812, + "step": 33132 + }, + { + "epoch": 0.8507625324776408, + "grad_norm": 0.875, + "learning_rate": 0.0001030865051961466, + "loss": 0.7649, + "step": 33133 + }, + { + "epoch": 0.8507882096735626, + "grad_norm": 0.8125, + "learning_rate": 0.00010308204311830569, + "loss": 0.7816, + "step": 33134 + }, + { + "epoch": 0.8508138868694844, + "grad_norm": 0.8203125, + "learning_rate": 0.00010307758103432257, + "loss": 0.9167, + "step": 33135 + }, + { + "epoch": 0.8508395640654063, + "grad_norm": 0.7109375, + "learning_rate": 0.00010307311894420608, + "loss": 0.8023, + "step": 33136 + }, + { + "epoch": 0.850865241261328, + "grad_norm": 0.73828125, + "learning_rate": 0.00010306865684796514, + "loss": 0.7993, + "step": 33137 + }, + { + "epoch": 0.8508909184572498, + "grad_norm": 0.73828125, + "learning_rate": 0.00010306419474560867, + "loss": 0.7784, + "step": 33138 + }, + { + "epoch": 0.8509165956531717, + "grad_norm": 0.7890625, + "learning_rate": 0.00010305973263714551, + "loss": 0.7262, + "step": 33139 + }, + { + "epoch": 0.8509422728490935, + "grad_norm": 0.76171875, + "learning_rate": 0.0001030552705225846, + "loss": 0.8375, + "step": 33140 + }, + { + "epoch": 0.8509679500450154, + "grad_norm": 0.78515625, + "learning_rate": 0.0001030508084019348, + "loss": 0.7864, + "step": 33141 + }, + { + "epoch": 0.8509936272409372, + "grad_norm": 0.78515625, + "learning_rate": 0.00010304634627520502, + "loss": 0.7416, + "step": 33142 + }, + { + "epoch": 0.851019304436859, + "grad_norm": 0.796875, + "learning_rate": 0.00010304188414240416, + "loss": 0.8757, + "step": 33143 + }, + { + "epoch": 0.8510449816327808, + "grad_norm": 0.7578125, + "learning_rate": 0.00010303742200354108, + "loss": 0.778, + "step": 33144 + }, + { + "epoch": 0.8510706588287026, + "grad_norm": 0.7578125, + "learning_rate": 0.00010303295985862468, + "loss": 0.8849, + "step": 33145 + }, + { + "epoch": 0.8510963360246244, + "grad_norm": 0.83203125, + "learning_rate": 0.00010302849770766389, + "loss": 0.6633, + "step": 33146 + }, + { + "epoch": 0.8511220132205463, + "grad_norm": 0.7421875, + "learning_rate": 0.00010302403555066755, + "loss": 0.759, + "step": 33147 + }, + { + "epoch": 0.8511476904164681, + "grad_norm": 0.75, + "learning_rate": 0.00010301957338764463, + "loss": 0.7047, + "step": 33148 + }, + { + "epoch": 0.8511733676123899, + "grad_norm": 0.78515625, + "learning_rate": 0.00010301511121860392, + "loss": 0.7875, + "step": 33149 + }, + { + "epoch": 0.8511990448083117, + "grad_norm": 0.83203125, + "learning_rate": 0.0001030106490435544, + "loss": 0.8629, + "step": 33150 + }, + { + "epoch": 0.8512247220042335, + "grad_norm": 0.80078125, + "learning_rate": 0.00010300618686250492, + "loss": 0.7561, + "step": 33151 + }, + { + "epoch": 0.8512503992001553, + "grad_norm": 0.7890625, + "learning_rate": 0.00010300172467546436, + "loss": 0.8314, + "step": 33152 + }, + { + "epoch": 0.8512760763960772, + "grad_norm": 0.73046875, + "learning_rate": 0.00010299726248244166, + "loss": 0.9179, + "step": 33153 + }, + { + "epoch": 0.851301753591999, + "grad_norm": 0.92578125, + "learning_rate": 0.0001029928002834457, + "loss": 0.904, + "step": 33154 + }, + { + "epoch": 0.8513274307879208, + "grad_norm": 0.76171875, + "learning_rate": 0.00010298833807848533, + "loss": 0.8245, + "step": 33155 + }, + { + "epoch": 0.8513531079838427, + "grad_norm": 0.828125, + "learning_rate": 0.00010298387586756948, + "loss": 0.7226, + "step": 33156 + }, + { + "epoch": 0.8513787851797644, + "grad_norm": 0.80859375, + "learning_rate": 0.00010297941365070705, + "loss": 0.8257, + "step": 33157 + }, + { + "epoch": 0.8514044623756862, + "grad_norm": 0.8359375, + "learning_rate": 0.0001029749514279069, + "loss": 0.9155, + "step": 33158 + }, + { + "epoch": 0.8514301395716081, + "grad_norm": 0.8125, + "learning_rate": 0.00010297048919917794, + "loss": 0.7413, + "step": 33159 + }, + { + "epoch": 0.8514558167675299, + "grad_norm": 0.78125, + "learning_rate": 0.00010296602696452904, + "loss": 0.7856, + "step": 33160 + }, + { + "epoch": 0.8514814939634517, + "grad_norm": 0.8046875, + "learning_rate": 0.00010296156472396919, + "loss": 0.7884, + "step": 33161 + }, + { + "epoch": 0.8515071711593736, + "grad_norm": 0.80078125, + "learning_rate": 0.00010295710247750716, + "loss": 0.7998, + "step": 33162 + }, + { + "epoch": 0.8515328483552954, + "grad_norm": 0.796875, + "learning_rate": 0.00010295264022515188, + "loss": 0.7874, + "step": 33163 + }, + { + "epoch": 0.8515585255512171, + "grad_norm": 0.703125, + "learning_rate": 0.00010294817796691228, + "loss": 0.8099, + "step": 33164 + }, + { + "epoch": 0.851584202747139, + "grad_norm": 0.890625, + "learning_rate": 0.00010294371570279722, + "loss": 0.8533, + "step": 33165 + }, + { + "epoch": 0.8516098799430608, + "grad_norm": 0.8046875, + "learning_rate": 0.00010293925343281556, + "loss": 0.8971, + "step": 33166 + }, + { + "epoch": 0.8516355571389826, + "grad_norm": 0.83203125, + "learning_rate": 0.0001029347911569763, + "loss": 0.8114, + "step": 33167 + }, + { + "epoch": 0.8516612343349045, + "grad_norm": 0.78125, + "learning_rate": 0.00010293032887528821, + "loss": 0.8551, + "step": 33168 + }, + { + "epoch": 0.8516869115308263, + "grad_norm": 0.77734375, + "learning_rate": 0.00010292586658776029, + "loss": 0.8355, + "step": 33169 + }, + { + "epoch": 0.851712588726748, + "grad_norm": 0.83203125, + "learning_rate": 0.00010292140429440136, + "loss": 0.7535, + "step": 33170 + }, + { + "epoch": 0.8517382659226699, + "grad_norm": 0.84375, + "learning_rate": 0.0001029169419952203, + "loss": 0.7572, + "step": 33171 + }, + { + "epoch": 0.8517639431185917, + "grad_norm": 0.83984375, + "learning_rate": 0.00010291247969022608, + "loss": 0.7942, + "step": 33172 + }, + { + "epoch": 0.8517896203145136, + "grad_norm": 0.78125, + "learning_rate": 0.00010290801737942757, + "loss": 0.8275, + "step": 33173 + }, + { + "epoch": 0.8518152975104354, + "grad_norm": 0.8046875, + "learning_rate": 0.0001029035550628336, + "loss": 0.7839, + "step": 33174 + }, + { + "epoch": 0.8518409747063572, + "grad_norm": 0.74609375, + "learning_rate": 0.00010289909274045313, + "loss": 0.7346, + "step": 33175 + }, + { + "epoch": 0.8518666519022791, + "grad_norm": 0.77734375, + "learning_rate": 0.000102894630412295, + "loss": 0.7564, + "step": 33176 + }, + { + "epoch": 0.8518923290982008, + "grad_norm": 0.73828125, + "learning_rate": 0.00010289016807836817, + "loss": 0.7574, + "step": 33177 + }, + { + "epoch": 0.8519180062941226, + "grad_norm": 0.83203125, + "learning_rate": 0.0001028857057386815, + "loss": 0.936, + "step": 33178 + }, + { + "epoch": 0.8519436834900445, + "grad_norm": 0.84765625, + "learning_rate": 0.00010288124339324383, + "loss": 0.9364, + "step": 33179 + }, + { + "epoch": 0.8519693606859663, + "grad_norm": 0.7578125, + "learning_rate": 0.00010287678104206415, + "loss": 0.9654, + "step": 33180 + }, + { + "epoch": 0.8519950378818881, + "grad_norm": 0.80078125, + "learning_rate": 0.00010287231868515131, + "loss": 0.8099, + "step": 33181 + }, + { + "epoch": 0.85202071507781, + "grad_norm": 0.765625, + "learning_rate": 0.00010286785632251416, + "loss": 0.7619, + "step": 33182 + }, + { + "epoch": 0.8520463922737318, + "grad_norm": 0.82421875, + "learning_rate": 0.00010286339395416167, + "loss": 0.8491, + "step": 33183 + }, + { + "epoch": 0.8520720694696535, + "grad_norm": 0.77734375, + "learning_rate": 0.00010285893158010265, + "loss": 0.8883, + "step": 33184 + }, + { + "epoch": 0.8520977466655754, + "grad_norm": 0.953125, + "learning_rate": 0.00010285446920034607, + "loss": 0.9075, + "step": 33185 + }, + { + "epoch": 0.8521234238614972, + "grad_norm": 0.74609375, + "learning_rate": 0.00010285000681490078, + "loss": 0.9204, + "step": 33186 + }, + { + "epoch": 0.852149101057419, + "grad_norm": 0.796875, + "learning_rate": 0.0001028455444237757, + "loss": 0.9936, + "step": 33187 + }, + { + "epoch": 0.8521747782533409, + "grad_norm": 0.80078125, + "learning_rate": 0.00010284108202697969, + "loss": 0.6776, + "step": 33188 + }, + { + "epoch": 0.8522004554492627, + "grad_norm": 0.83203125, + "learning_rate": 0.00010283661962452164, + "loss": 0.9133, + "step": 33189 + }, + { + "epoch": 0.8522261326451844, + "grad_norm": 0.78515625, + "learning_rate": 0.00010283215721641048, + "loss": 0.8325, + "step": 33190 + }, + { + "epoch": 0.8522518098411063, + "grad_norm": 0.8984375, + "learning_rate": 0.0001028276948026551, + "loss": 0.9086, + "step": 33191 + }, + { + "epoch": 0.8522774870370281, + "grad_norm": 0.8046875, + "learning_rate": 0.00010282323238326437, + "loss": 0.7638, + "step": 33192 + }, + { + "epoch": 0.8523031642329499, + "grad_norm": 0.8203125, + "learning_rate": 0.0001028187699582472, + "loss": 0.8366, + "step": 33193 + }, + { + "epoch": 0.8523288414288718, + "grad_norm": 0.8046875, + "learning_rate": 0.0001028143075276125, + "loss": 0.8409, + "step": 33194 + }, + { + "epoch": 0.8523545186247936, + "grad_norm": 0.7734375, + "learning_rate": 0.00010280984509136909, + "loss": 0.7265, + "step": 33195 + }, + { + "epoch": 0.8523801958207154, + "grad_norm": 0.8203125, + "learning_rate": 0.00010280538264952593, + "loss": 0.817, + "step": 33196 + }, + { + "epoch": 0.8524058730166372, + "grad_norm": 0.83984375, + "learning_rate": 0.00010280092020209188, + "loss": 0.7327, + "step": 33197 + }, + { + "epoch": 0.852431550212559, + "grad_norm": 0.7421875, + "learning_rate": 0.00010279645774907589, + "loss": 0.7675, + "step": 33198 + }, + { + "epoch": 0.8524572274084808, + "grad_norm": 0.8515625, + "learning_rate": 0.0001027919952904868, + "loss": 0.8418, + "step": 33199 + }, + { + "epoch": 0.8524829046044027, + "grad_norm": 0.796875, + "learning_rate": 0.00010278753282633351, + "loss": 0.941, + "step": 33200 + }, + { + "epoch": 0.8525085818003245, + "grad_norm": 0.7734375, + "learning_rate": 0.00010278307035662491, + "loss": 0.7907, + "step": 33201 + }, + { + "epoch": 0.8525342589962464, + "grad_norm": 0.80859375, + "learning_rate": 0.0001027786078813699, + "loss": 0.7795, + "step": 33202 + }, + { + "epoch": 0.8525599361921682, + "grad_norm": 0.7421875, + "learning_rate": 0.00010277414540057738, + "loss": 0.7361, + "step": 33203 + }, + { + "epoch": 0.8525856133880899, + "grad_norm": 0.8359375, + "learning_rate": 0.00010276968291425624, + "loss": 0.7697, + "step": 33204 + }, + { + "epoch": 0.8526112905840117, + "grad_norm": 0.6875, + "learning_rate": 0.00010276522042241538, + "loss": 0.6244, + "step": 33205 + }, + { + "epoch": 0.8526369677799336, + "grad_norm": 0.765625, + "learning_rate": 0.00010276075792506365, + "loss": 0.7625, + "step": 33206 + }, + { + "epoch": 0.8526626449758554, + "grad_norm": 0.7265625, + "learning_rate": 0.00010275629542221005, + "loss": 0.6669, + "step": 33207 + }, + { + "epoch": 0.8526883221717773, + "grad_norm": 0.81640625, + "learning_rate": 0.00010275183291386335, + "loss": 0.8562, + "step": 33208 + }, + { + "epoch": 0.8527139993676991, + "grad_norm": 0.953125, + "learning_rate": 0.0001027473704000325, + "loss": 1.011, + "step": 33209 + }, + { + "epoch": 0.8527396765636208, + "grad_norm": 0.75, + "learning_rate": 0.00010274290788072641, + "loss": 0.8583, + "step": 33210 + }, + { + "epoch": 0.8527653537595427, + "grad_norm": 0.828125, + "learning_rate": 0.00010273844535595392, + "loss": 0.9363, + "step": 33211 + }, + { + "epoch": 0.8527910309554645, + "grad_norm": 0.85546875, + "learning_rate": 0.00010273398282572397, + "loss": 0.8763, + "step": 33212 + }, + { + "epoch": 0.8528167081513863, + "grad_norm": 0.6796875, + "learning_rate": 0.00010272952029004546, + "loss": 0.7557, + "step": 33213 + }, + { + "epoch": 0.8528423853473082, + "grad_norm": 0.69921875, + "learning_rate": 0.00010272505774892724, + "loss": 0.6707, + "step": 33214 + }, + { + "epoch": 0.85286806254323, + "grad_norm": 0.83984375, + "learning_rate": 0.00010272059520237822, + "loss": 0.7952, + "step": 33215 + }, + { + "epoch": 0.8528937397391518, + "grad_norm": 0.81640625, + "learning_rate": 0.0001027161326504073, + "loss": 0.8933, + "step": 33216 + }, + { + "epoch": 0.8529194169350736, + "grad_norm": 0.71875, + "learning_rate": 0.00010271167009302339, + "loss": 0.8016, + "step": 33217 + }, + { + "epoch": 0.8529450941309954, + "grad_norm": 0.765625, + "learning_rate": 0.00010270720753023538, + "loss": 0.7619, + "step": 33218 + }, + { + "epoch": 0.8529707713269172, + "grad_norm": 0.84375, + "learning_rate": 0.0001027027449620521, + "loss": 0.9145, + "step": 33219 + }, + { + "epoch": 0.8529964485228391, + "grad_norm": 0.8359375, + "learning_rate": 0.00010269828238848256, + "loss": 0.7323, + "step": 33220 + }, + { + "epoch": 0.8530221257187609, + "grad_norm": 0.75, + "learning_rate": 0.00010269381980953552, + "loss": 0.8664, + "step": 33221 + }, + { + "epoch": 0.8530478029146827, + "grad_norm": 0.828125, + "learning_rate": 0.00010268935722521998, + "loss": 0.8492, + "step": 33222 + }, + { + "epoch": 0.8530734801106046, + "grad_norm": 0.75, + "learning_rate": 0.00010268489463554479, + "loss": 0.8311, + "step": 33223 + }, + { + "epoch": 0.8530991573065263, + "grad_norm": 0.8671875, + "learning_rate": 0.00010268043204051883, + "loss": 0.961, + "step": 33224 + }, + { + "epoch": 0.8531248345024481, + "grad_norm": 0.76171875, + "learning_rate": 0.00010267596944015101, + "loss": 0.757, + "step": 33225 + }, + { + "epoch": 0.85315051169837, + "grad_norm": 0.69921875, + "learning_rate": 0.00010267150683445026, + "loss": 0.6751, + "step": 33226 + }, + { + "epoch": 0.8531761888942918, + "grad_norm": 0.73828125, + "learning_rate": 0.00010266704422342542, + "loss": 0.7676, + "step": 33227 + }, + { + "epoch": 0.8532018660902136, + "grad_norm": 0.7421875, + "learning_rate": 0.0001026625816070854, + "loss": 0.8503, + "step": 33228 + }, + { + "epoch": 0.8532275432861355, + "grad_norm": 0.7890625, + "learning_rate": 0.00010265811898543906, + "loss": 0.8462, + "step": 33229 + }, + { + "epoch": 0.8532532204820572, + "grad_norm": 0.76953125, + "learning_rate": 0.00010265365635849538, + "loss": 0.6942, + "step": 33230 + }, + { + "epoch": 0.853278897677979, + "grad_norm": 0.79296875, + "learning_rate": 0.00010264919372626318, + "loss": 0.7146, + "step": 33231 + }, + { + "epoch": 0.8533045748739009, + "grad_norm": 0.796875, + "learning_rate": 0.00010264473108875136, + "loss": 0.7493, + "step": 33232 + }, + { + "epoch": 0.8533302520698227, + "grad_norm": 0.79296875, + "learning_rate": 0.00010264026844596888, + "loss": 0.8592, + "step": 33233 + }, + { + "epoch": 0.8533559292657446, + "grad_norm": 0.828125, + "learning_rate": 0.00010263580579792453, + "loss": 0.7381, + "step": 33234 + }, + { + "epoch": 0.8533816064616664, + "grad_norm": 0.75390625, + "learning_rate": 0.0001026313431446273, + "loss": 0.8666, + "step": 33235 + }, + { + "epoch": 0.8534072836575882, + "grad_norm": 0.8203125, + "learning_rate": 0.00010262688048608602, + "loss": 0.8579, + "step": 33236 + }, + { + "epoch": 0.85343296085351, + "grad_norm": 0.86328125, + "learning_rate": 0.0001026224178223096, + "loss": 0.879, + "step": 33237 + }, + { + "epoch": 0.8534586380494318, + "grad_norm": 0.78125, + "learning_rate": 0.00010261795515330695, + "loss": 0.7962, + "step": 33238 + }, + { + "epoch": 0.8534843152453536, + "grad_norm": 0.79296875, + "learning_rate": 0.00010261349247908693, + "loss": 0.7941, + "step": 33239 + }, + { + "epoch": 0.8535099924412755, + "grad_norm": 0.8359375, + "learning_rate": 0.0001026090297996585, + "loss": 0.7468, + "step": 33240 + }, + { + "epoch": 0.8535356696371973, + "grad_norm": 0.81640625, + "learning_rate": 0.00010260456711503048, + "loss": 0.8157, + "step": 33241 + }, + { + "epoch": 0.8535613468331191, + "grad_norm": 0.89453125, + "learning_rate": 0.00010260010442521179, + "loss": 0.7725, + "step": 33242 + }, + { + "epoch": 0.853587024029041, + "grad_norm": 0.73828125, + "learning_rate": 0.00010259564173021132, + "loss": 0.8257, + "step": 33243 + }, + { + "epoch": 0.8536127012249627, + "grad_norm": 0.83203125, + "learning_rate": 0.00010259117903003798, + "loss": 0.7907, + "step": 33244 + }, + { + "epoch": 0.8536383784208845, + "grad_norm": 0.79296875, + "learning_rate": 0.00010258671632470064, + "loss": 0.7606, + "step": 33245 + }, + { + "epoch": 0.8536640556168064, + "grad_norm": 0.8046875, + "learning_rate": 0.00010258225361420826, + "loss": 0.8204, + "step": 33246 + }, + { + "epoch": 0.8536897328127282, + "grad_norm": 0.75, + "learning_rate": 0.00010257779089856965, + "loss": 0.7245, + "step": 33247 + }, + { + "epoch": 0.85371541000865, + "grad_norm": 0.984375, + "learning_rate": 0.0001025733281777937, + "loss": 0.9581, + "step": 33248 + }, + { + "epoch": 0.8537410872045719, + "grad_norm": 0.796875, + "learning_rate": 0.00010256886545188938, + "loss": 0.7701, + "step": 33249 + }, + { + "epoch": 0.8537667644004936, + "grad_norm": 0.8359375, + "learning_rate": 0.00010256440272086554, + "loss": 0.905, + "step": 33250 + }, + { + "epoch": 0.8537924415964154, + "grad_norm": 0.81640625, + "learning_rate": 0.00010255993998473106, + "loss": 0.743, + "step": 33251 + }, + { + "epoch": 0.8538181187923373, + "grad_norm": 0.81640625, + "learning_rate": 0.00010255547724349486, + "loss": 0.923, + "step": 33252 + }, + { + "epoch": 0.8538437959882591, + "grad_norm": 0.86328125, + "learning_rate": 0.00010255101449716584, + "loss": 0.8737, + "step": 33253 + }, + { + "epoch": 0.8538694731841809, + "grad_norm": 0.80078125, + "learning_rate": 0.00010254655174575288, + "loss": 0.7466, + "step": 33254 + }, + { + "epoch": 0.8538951503801028, + "grad_norm": 0.7578125, + "learning_rate": 0.00010254208898926485, + "loss": 0.7388, + "step": 33255 + }, + { + "epoch": 0.8539208275760246, + "grad_norm": 0.7734375, + "learning_rate": 0.00010253762622771066, + "loss": 0.7165, + "step": 33256 + }, + { + "epoch": 0.8539465047719463, + "grad_norm": 0.85546875, + "learning_rate": 0.00010253316346109922, + "loss": 0.86, + "step": 33257 + }, + { + "epoch": 0.8539721819678682, + "grad_norm": 0.7734375, + "learning_rate": 0.00010252870068943943, + "loss": 0.9042, + "step": 33258 + }, + { + "epoch": 0.85399785916379, + "grad_norm": 0.7265625, + "learning_rate": 0.00010252423791274015, + "loss": 0.8389, + "step": 33259 + }, + { + "epoch": 0.8540235363597118, + "grad_norm": 0.796875, + "learning_rate": 0.0001025197751310103, + "loss": 0.7292, + "step": 33260 + }, + { + "epoch": 0.8540492135556337, + "grad_norm": 0.8359375, + "learning_rate": 0.00010251531234425875, + "loss": 0.8027, + "step": 33261 + }, + { + "epoch": 0.8540748907515555, + "grad_norm": 0.78125, + "learning_rate": 0.00010251084955249444, + "loss": 0.8076, + "step": 33262 + }, + { + "epoch": 0.8541005679474772, + "grad_norm": 0.80078125, + "learning_rate": 0.00010250638675572621, + "loss": 0.8029, + "step": 33263 + }, + { + "epoch": 0.8541262451433991, + "grad_norm": 0.8359375, + "learning_rate": 0.00010250192395396297, + "loss": 0.9085, + "step": 33264 + }, + { + "epoch": 0.8541519223393209, + "grad_norm": 0.8046875, + "learning_rate": 0.00010249746114721364, + "loss": 0.773, + "step": 33265 + }, + { + "epoch": 0.8541775995352427, + "grad_norm": 0.7734375, + "learning_rate": 0.0001024929983354871, + "loss": 0.8341, + "step": 33266 + }, + { + "epoch": 0.8542032767311646, + "grad_norm": 0.75390625, + "learning_rate": 0.00010248853551879223, + "loss": 0.7629, + "step": 33267 + }, + { + "epoch": 0.8542289539270864, + "grad_norm": 0.8046875, + "learning_rate": 0.00010248407269713793, + "loss": 0.8893, + "step": 33268 + }, + { + "epoch": 0.8542546311230083, + "grad_norm": 0.74609375, + "learning_rate": 0.00010247960987053308, + "loss": 0.8164, + "step": 33269 + }, + { + "epoch": 0.85428030831893, + "grad_norm": 0.7578125, + "learning_rate": 0.00010247514703898664, + "loss": 0.7772, + "step": 33270 + }, + { + "epoch": 0.8543059855148518, + "grad_norm": 0.859375, + "learning_rate": 0.00010247068420250742, + "loss": 0.8473, + "step": 33271 + }, + { + "epoch": 0.8543316627107737, + "grad_norm": 0.78125, + "learning_rate": 0.00010246622136110435, + "loss": 0.8628, + "step": 33272 + }, + { + "epoch": 0.8543573399066955, + "grad_norm": 0.90234375, + "learning_rate": 0.00010246175851478636, + "loss": 0.7941, + "step": 33273 + }, + { + "epoch": 0.8543830171026173, + "grad_norm": 0.80859375, + "learning_rate": 0.00010245729566356226, + "loss": 0.8018, + "step": 33274 + }, + { + "epoch": 0.8544086942985392, + "grad_norm": 0.6953125, + "learning_rate": 0.00010245283280744102, + "loss": 0.6974, + "step": 33275 + }, + { + "epoch": 0.854434371494461, + "grad_norm": 0.77734375, + "learning_rate": 0.0001024483699464315, + "loss": 0.7753, + "step": 33276 + }, + { + "epoch": 0.8544600486903827, + "grad_norm": 0.81640625, + "learning_rate": 0.00010244390708054256, + "loss": 0.764, + "step": 33277 + }, + { + "epoch": 0.8544857258863046, + "grad_norm": 0.86328125, + "learning_rate": 0.0001024394442097832, + "loss": 0.9462, + "step": 33278 + }, + { + "epoch": 0.8545114030822264, + "grad_norm": 0.7421875, + "learning_rate": 0.00010243498133416222, + "loss": 0.7589, + "step": 33279 + }, + { + "epoch": 0.8545370802781482, + "grad_norm": 0.890625, + "learning_rate": 0.00010243051845368854, + "loss": 0.8219, + "step": 33280 + }, + { + "epoch": 0.8545627574740701, + "grad_norm": 0.77734375, + "learning_rate": 0.00010242605556837106, + "loss": 0.7284, + "step": 33281 + }, + { + "epoch": 0.8545884346699919, + "grad_norm": 0.7734375, + "learning_rate": 0.00010242159267821866, + "loss": 0.7232, + "step": 33282 + }, + { + "epoch": 0.8546141118659136, + "grad_norm": 0.76953125, + "learning_rate": 0.00010241712978324024, + "loss": 0.8775, + "step": 33283 + }, + { + "epoch": 0.8546397890618355, + "grad_norm": 0.79296875, + "learning_rate": 0.00010241266688344473, + "loss": 0.8073, + "step": 33284 + }, + { + "epoch": 0.8546654662577573, + "grad_norm": 0.76953125, + "learning_rate": 0.00010240820397884094, + "loss": 0.8352, + "step": 33285 + }, + { + "epoch": 0.8546911434536791, + "grad_norm": 0.84375, + "learning_rate": 0.00010240374106943789, + "loss": 0.7619, + "step": 33286 + }, + { + "epoch": 0.854716820649601, + "grad_norm": 0.8046875, + "learning_rate": 0.00010239927815524437, + "loss": 0.7549, + "step": 33287 + }, + { + "epoch": 0.8547424978455228, + "grad_norm": 0.76171875, + "learning_rate": 0.00010239481523626928, + "loss": 0.8991, + "step": 33288 + }, + { + "epoch": 0.8547681750414446, + "grad_norm": 0.78125, + "learning_rate": 0.00010239035231252156, + "loss": 0.8373, + "step": 33289 + }, + { + "epoch": 0.8547938522373664, + "grad_norm": 0.78125, + "learning_rate": 0.00010238588938401009, + "loss": 0.863, + "step": 33290 + }, + { + "epoch": 0.8548195294332882, + "grad_norm": 0.8046875, + "learning_rate": 0.00010238142645074373, + "loss": 0.896, + "step": 33291 + }, + { + "epoch": 0.85484520662921, + "grad_norm": 0.81640625, + "learning_rate": 0.00010237696351273144, + "loss": 0.7339, + "step": 33292 + }, + { + "epoch": 0.8548708838251319, + "grad_norm": 0.8359375, + "learning_rate": 0.00010237250056998205, + "loss": 0.9171, + "step": 33293 + }, + { + "epoch": 0.8548965610210537, + "grad_norm": 0.74609375, + "learning_rate": 0.00010236803762250448, + "loss": 0.7235, + "step": 33294 + }, + { + "epoch": 0.8549222382169755, + "grad_norm": 0.78125, + "learning_rate": 0.00010236357467030763, + "loss": 0.8733, + "step": 33295 + }, + { + "epoch": 0.8549479154128974, + "grad_norm": 0.8671875, + "learning_rate": 0.00010235911171340039, + "loss": 0.9368, + "step": 33296 + }, + { + "epoch": 0.8549735926088191, + "grad_norm": 0.91796875, + "learning_rate": 0.00010235464875179166, + "loss": 0.9699, + "step": 33297 + }, + { + "epoch": 0.854999269804741, + "grad_norm": 0.67578125, + "learning_rate": 0.00010235018578549034, + "loss": 0.8075, + "step": 33298 + }, + { + "epoch": 0.8550249470006628, + "grad_norm": 0.92578125, + "learning_rate": 0.0001023457228145053, + "loss": 0.7665, + "step": 33299 + }, + { + "epoch": 0.8550506241965846, + "grad_norm": 0.88671875, + "learning_rate": 0.00010234125983884544, + "loss": 0.732, + "step": 33300 + }, + { + "epoch": 0.8550763013925065, + "grad_norm": 0.7578125, + "learning_rate": 0.00010233679685851964, + "loss": 0.9053, + "step": 33301 + }, + { + "epoch": 0.8551019785884283, + "grad_norm": 0.71484375, + "learning_rate": 0.00010233233387353685, + "loss": 0.787, + "step": 33302 + }, + { + "epoch": 0.85512765578435, + "grad_norm": 0.82421875, + "learning_rate": 0.00010232787088390592, + "loss": 0.805, + "step": 33303 + }, + { + "epoch": 0.8551533329802719, + "grad_norm": 0.78125, + "learning_rate": 0.00010232340788963573, + "loss": 0.7531, + "step": 33304 + }, + { + "epoch": 0.8551790101761937, + "grad_norm": 0.78515625, + "learning_rate": 0.00010231894489073524, + "loss": 0.8298, + "step": 33305 + }, + { + "epoch": 0.8552046873721155, + "grad_norm": 0.74609375, + "learning_rate": 0.00010231448188721326, + "loss": 0.8581, + "step": 33306 + }, + { + "epoch": 0.8552303645680374, + "grad_norm": 0.83203125, + "learning_rate": 0.00010231001887907876, + "loss": 0.8328, + "step": 33307 + }, + { + "epoch": 0.8552560417639592, + "grad_norm": 0.8203125, + "learning_rate": 0.00010230555586634059, + "loss": 0.7119, + "step": 33308 + }, + { + "epoch": 0.855281718959881, + "grad_norm": 0.7734375, + "learning_rate": 0.00010230109284900764, + "loss": 0.7702, + "step": 33309 + }, + { + "epoch": 0.8553073961558028, + "grad_norm": 0.7890625, + "learning_rate": 0.00010229662982708884, + "loss": 0.7712, + "step": 33310 + }, + { + "epoch": 0.8553330733517246, + "grad_norm": 0.81640625, + "learning_rate": 0.00010229216680059306, + "loss": 0.78, + "step": 33311 + }, + { + "epoch": 0.8553587505476464, + "grad_norm": 0.78515625, + "learning_rate": 0.00010228770376952919, + "loss": 0.7869, + "step": 33312 + }, + { + "epoch": 0.8553844277435683, + "grad_norm": 0.79296875, + "learning_rate": 0.00010228324073390615, + "loss": 0.8289, + "step": 33313 + }, + { + "epoch": 0.8554101049394901, + "grad_norm": 0.7265625, + "learning_rate": 0.0001022787776937328, + "loss": 0.8546, + "step": 33314 + }, + { + "epoch": 0.8554357821354119, + "grad_norm": 0.98046875, + "learning_rate": 0.00010227431464901804, + "loss": 0.8608, + "step": 33315 + }, + { + "epoch": 0.8554614593313338, + "grad_norm": 0.76171875, + "learning_rate": 0.00010226985159977081, + "loss": 0.7682, + "step": 33316 + }, + { + "epoch": 0.8554871365272555, + "grad_norm": 0.7421875, + "learning_rate": 0.00010226538854599994, + "loss": 0.8769, + "step": 33317 + }, + { + "epoch": 0.8555128137231773, + "grad_norm": 0.81640625, + "learning_rate": 0.00010226092548771438, + "loss": 0.8165, + "step": 33318 + }, + { + "epoch": 0.8555384909190992, + "grad_norm": 0.8203125, + "learning_rate": 0.00010225646242492301, + "loss": 0.8137, + "step": 33319 + }, + { + "epoch": 0.855564168115021, + "grad_norm": 0.9296875, + "learning_rate": 0.00010225199935763467, + "loss": 0.7524, + "step": 33320 + }, + { + "epoch": 0.8555898453109428, + "grad_norm": 0.7734375, + "learning_rate": 0.00010224753628585834, + "loss": 0.8147, + "step": 33321 + }, + { + "epoch": 0.8556155225068647, + "grad_norm": 0.75, + "learning_rate": 0.00010224307320960285, + "loss": 0.8382, + "step": 33322 + }, + { + "epoch": 0.8556411997027864, + "grad_norm": 0.78125, + "learning_rate": 0.00010223861012887713, + "loss": 0.7145, + "step": 33323 + }, + { + "epoch": 0.8556668768987082, + "grad_norm": 0.8125, + "learning_rate": 0.00010223414704369006, + "loss": 0.7953, + "step": 33324 + }, + { + "epoch": 0.8556925540946301, + "grad_norm": 0.7890625, + "learning_rate": 0.00010222968395405053, + "loss": 0.9286, + "step": 33325 + }, + { + "epoch": 0.8557182312905519, + "grad_norm": 1.0, + "learning_rate": 0.00010222522085996745, + "loss": 0.8497, + "step": 33326 + }, + { + "epoch": 0.8557439084864737, + "grad_norm": 0.8125, + "learning_rate": 0.0001022207577614497, + "loss": 0.8466, + "step": 33327 + }, + { + "epoch": 0.8557695856823956, + "grad_norm": 0.8828125, + "learning_rate": 0.00010221629465850617, + "loss": 0.8745, + "step": 33328 + }, + { + "epoch": 0.8557952628783174, + "grad_norm": 0.81640625, + "learning_rate": 0.00010221183155114577, + "loss": 0.8803, + "step": 33329 + }, + { + "epoch": 0.8558209400742391, + "grad_norm": 0.796875, + "learning_rate": 0.00010220736843937742, + "loss": 0.853, + "step": 33330 + }, + { + "epoch": 0.855846617270161, + "grad_norm": 0.8046875, + "learning_rate": 0.00010220290532320993, + "loss": 0.9978, + "step": 33331 + }, + { + "epoch": 0.8558722944660828, + "grad_norm": 0.8515625, + "learning_rate": 0.00010219844220265232, + "loss": 0.8735, + "step": 33332 + }, + { + "epoch": 0.8558979716620047, + "grad_norm": 0.78515625, + "learning_rate": 0.00010219397907771333, + "loss": 0.8043, + "step": 33333 + }, + { + "epoch": 0.8559236488579265, + "grad_norm": 0.7734375, + "learning_rate": 0.00010218951594840198, + "loss": 0.7616, + "step": 33334 + }, + { + "epoch": 0.8559493260538483, + "grad_norm": 0.7734375, + "learning_rate": 0.00010218505281472715, + "loss": 0.8842, + "step": 33335 + }, + { + "epoch": 0.8559750032497702, + "grad_norm": 0.7421875, + "learning_rate": 0.00010218058967669766, + "loss": 0.7137, + "step": 33336 + }, + { + "epoch": 0.8560006804456919, + "grad_norm": 0.83203125, + "learning_rate": 0.00010217612653432247, + "loss": 0.8016, + "step": 33337 + }, + { + "epoch": 0.8560263576416137, + "grad_norm": 0.7734375, + "learning_rate": 0.00010217166338761046, + "loss": 0.8039, + "step": 33338 + }, + { + "epoch": 0.8560520348375356, + "grad_norm": 0.84765625, + "learning_rate": 0.00010216720023657052, + "loss": 0.9226, + "step": 33339 + }, + { + "epoch": 0.8560777120334574, + "grad_norm": 0.734375, + "learning_rate": 0.00010216273708121155, + "loss": 0.8708, + "step": 33340 + }, + { + "epoch": 0.8561033892293792, + "grad_norm": 0.78515625, + "learning_rate": 0.00010215827392154242, + "loss": 0.8174, + "step": 33341 + }, + { + "epoch": 0.8561290664253011, + "grad_norm": 0.77734375, + "learning_rate": 0.00010215381075757206, + "loss": 0.8499, + "step": 33342 + }, + { + "epoch": 0.8561547436212228, + "grad_norm": 0.734375, + "learning_rate": 0.00010214934758930936, + "loss": 0.7569, + "step": 33343 + }, + { + "epoch": 0.8561804208171446, + "grad_norm": 0.76171875, + "learning_rate": 0.00010214488441676318, + "loss": 0.8722, + "step": 33344 + }, + { + "epoch": 0.8562060980130665, + "grad_norm": 0.79296875, + "learning_rate": 0.00010214042123994247, + "loss": 0.7297, + "step": 33345 + }, + { + "epoch": 0.8562317752089883, + "grad_norm": 0.75390625, + "learning_rate": 0.00010213595805885605, + "loss": 0.732, + "step": 33346 + }, + { + "epoch": 0.8562574524049101, + "grad_norm": 0.78125, + "learning_rate": 0.0001021314948735129, + "loss": 0.8547, + "step": 33347 + }, + { + "epoch": 0.856283129600832, + "grad_norm": 0.79296875, + "learning_rate": 0.00010212703168392186, + "loss": 0.8521, + "step": 33348 + }, + { + "epoch": 0.8563088067967538, + "grad_norm": 0.8125, + "learning_rate": 0.0001021225684900918, + "loss": 0.862, + "step": 33349 + }, + { + "epoch": 0.8563344839926755, + "grad_norm": 0.73828125, + "learning_rate": 0.00010211810529203171, + "loss": 0.7371, + "step": 33350 + }, + { + "epoch": 0.8563601611885974, + "grad_norm": 0.703125, + "learning_rate": 0.0001021136420897504, + "loss": 0.7247, + "step": 33351 + }, + { + "epoch": 0.8563858383845192, + "grad_norm": 0.80078125, + "learning_rate": 0.00010210917888325682, + "loss": 0.8571, + "step": 33352 + }, + { + "epoch": 0.856411515580441, + "grad_norm": 0.76953125, + "learning_rate": 0.00010210471567255981, + "loss": 0.9839, + "step": 33353 + }, + { + "epoch": 0.8564371927763629, + "grad_norm": 0.73828125, + "learning_rate": 0.00010210025245766828, + "loss": 0.7827, + "step": 33354 + }, + { + "epoch": 0.8564628699722847, + "grad_norm": 0.79296875, + "learning_rate": 0.00010209578923859115, + "loss": 0.7442, + "step": 33355 + }, + { + "epoch": 0.8564885471682065, + "grad_norm": 0.78515625, + "learning_rate": 0.00010209132601533733, + "loss": 0.9006, + "step": 33356 + }, + { + "epoch": 0.8565142243641283, + "grad_norm": 0.87109375, + "learning_rate": 0.00010208686278791564, + "loss": 0.8365, + "step": 33357 + }, + { + "epoch": 0.8565399015600501, + "grad_norm": 0.78125, + "learning_rate": 0.00010208239955633507, + "loss": 0.774, + "step": 33358 + }, + { + "epoch": 0.856565578755972, + "grad_norm": 0.82421875, + "learning_rate": 0.00010207793632060443, + "loss": 0.7261, + "step": 33359 + }, + { + "epoch": 0.8565912559518938, + "grad_norm": 0.78125, + "learning_rate": 0.00010207347308073265, + "loss": 0.8634, + "step": 33360 + }, + { + "epoch": 0.8566169331478156, + "grad_norm": 0.78515625, + "learning_rate": 0.00010206900983672864, + "loss": 0.8111, + "step": 33361 + }, + { + "epoch": 0.8566426103437375, + "grad_norm": 0.80859375, + "learning_rate": 0.00010206454658860125, + "loss": 0.9059, + "step": 33362 + }, + { + "epoch": 0.8566682875396592, + "grad_norm": 0.79296875, + "learning_rate": 0.00010206008333635944, + "loss": 0.8643, + "step": 33363 + }, + { + "epoch": 0.856693964735581, + "grad_norm": 0.8125, + "learning_rate": 0.00010205562008001206, + "loss": 0.7817, + "step": 33364 + }, + { + "epoch": 0.8567196419315029, + "grad_norm": 0.85546875, + "learning_rate": 0.00010205115681956802, + "loss": 0.7495, + "step": 33365 + }, + { + "epoch": 0.8567453191274247, + "grad_norm": 0.765625, + "learning_rate": 0.00010204669355503622, + "loss": 0.7134, + "step": 33366 + }, + { + "epoch": 0.8567709963233465, + "grad_norm": 0.76171875, + "learning_rate": 0.00010204223028642552, + "loss": 0.6861, + "step": 33367 + }, + { + "epoch": 0.8567966735192684, + "grad_norm": 0.8359375, + "learning_rate": 0.00010203776701374483, + "loss": 0.8953, + "step": 33368 + }, + { + "epoch": 0.8568223507151902, + "grad_norm": 0.82421875, + "learning_rate": 0.00010203330373700308, + "loss": 0.7287, + "step": 33369 + }, + { + "epoch": 0.8568480279111119, + "grad_norm": 0.7890625, + "learning_rate": 0.00010202884045620912, + "loss": 0.8548, + "step": 33370 + }, + { + "epoch": 0.8568737051070338, + "grad_norm": 0.85546875, + "learning_rate": 0.0001020243771713719, + "loss": 0.9223, + "step": 33371 + }, + { + "epoch": 0.8568993823029556, + "grad_norm": 0.84765625, + "learning_rate": 0.00010201991388250027, + "loss": 0.8257, + "step": 33372 + }, + { + "epoch": 0.8569250594988774, + "grad_norm": 0.70703125, + "learning_rate": 0.0001020154505896031, + "loss": 0.6907, + "step": 33373 + }, + { + "epoch": 0.8569507366947993, + "grad_norm": 0.86328125, + "learning_rate": 0.00010201098729268934, + "loss": 0.8683, + "step": 33374 + }, + { + "epoch": 0.8569764138907211, + "grad_norm": 0.8125, + "learning_rate": 0.00010200652399176786, + "loss": 0.6949, + "step": 33375 + }, + { + "epoch": 0.8570020910866429, + "grad_norm": 0.7421875, + "learning_rate": 0.00010200206068684755, + "loss": 0.834, + "step": 33376 + }, + { + "epoch": 0.8570277682825647, + "grad_norm": 0.765625, + "learning_rate": 0.00010199759737793732, + "loss": 0.7975, + "step": 33377 + }, + { + "epoch": 0.8570534454784865, + "grad_norm": 0.89453125, + "learning_rate": 0.00010199313406504608, + "loss": 0.7775, + "step": 33378 + }, + { + "epoch": 0.8570791226744083, + "grad_norm": 0.76171875, + "learning_rate": 0.0001019886707481827, + "loss": 0.8996, + "step": 33379 + }, + { + "epoch": 0.8571047998703302, + "grad_norm": 0.80078125, + "learning_rate": 0.00010198420742735606, + "loss": 0.7748, + "step": 33380 + }, + { + "epoch": 0.857130477066252, + "grad_norm": 0.84765625, + "learning_rate": 0.00010197974410257507, + "loss": 0.8875, + "step": 33381 + }, + { + "epoch": 0.8571561542621738, + "grad_norm": 0.72265625, + "learning_rate": 0.00010197528077384864, + "loss": 0.7321, + "step": 33382 + }, + { + "epoch": 0.8571818314580956, + "grad_norm": 0.7734375, + "learning_rate": 0.00010197081744118566, + "loss": 0.7985, + "step": 33383 + }, + { + "epoch": 0.8572075086540174, + "grad_norm": 0.7734375, + "learning_rate": 0.000101966354104595, + "loss": 0.8422, + "step": 33384 + }, + { + "epoch": 0.8572331858499392, + "grad_norm": 0.79296875, + "learning_rate": 0.0001019618907640856, + "loss": 0.6942, + "step": 33385 + }, + { + "epoch": 0.8572588630458611, + "grad_norm": 0.8046875, + "learning_rate": 0.0001019574274196663, + "loss": 0.7512, + "step": 33386 + }, + { + "epoch": 0.8572845402417829, + "grad_norm": 0.74609375, + "learning_rate": 0.00010195296407134605, + "loss": 0.6551, + "step": 33387 + }, + { + "epoch": 0.8573102174377047, + "grad_norm": 0.78515625, + "learning_rate": 0.00010194850071913371, + "loss": 0.7554, + "step": 33388 + }, + { + "epoch": 0.8573358946336266, + "grad_norm": 0.78125, + "learning_rate": 0.00010194403736303817, + "loss": 0.6943, + "step": 33389 + }, + { + "epoch": 0.8573615718295483, + "grad_norm": 0.8984375, + "learning_rate": 0.00010193957400306834, + "loss": 0.7658, + "step": 33390 + }, + { + "epoch": 0.8573872490254701, + "grad_norm": 0.76171875, + "learning_rate": 0.00010193511063923313, + "loss": 0.8425, + "step": 33391 + }, + { + "epoch": 0.857412926221392, + "grad_norm": 0.82421875, + "learning_rate": 0.00010193064727154143, + "loss": 0.8249, + "step": 33392 + }, + { + "epoch": 0.8574386034173138, + "grad_norm": 0.8359375, + "learning_rate": 0.00010192618390000213, + "loss": 0.6821, + "step": 33393 + }, + { + "epoch": 0.8574642806132357, + "grad_norm": 0.81640625, + "learning_rate": 0.00010192172052462407, + "loss": 0.8193, + "step": 33394 + }, + { + "epoch": 0.8574899578091575, + "grad_norm": 0.78125, + "learning_rate": 0.00010191725714541623, + "loss": 0.8258, + "step": 33395 + }, + { + "epoch": 0.8575156350050793, + "grad_norm": 0.9765625, + "learning_rate": 0.00010191279376238747, + "loss": 0.8561, + "step": 33396 + }, + { + "epoch": 0.857541312201001, + "grad_norm": 0.828125, + "learning_rate": 0.00010190833037554665, + "loss": 0.7547, + "step": 33397 + }, + { + "epoch": 0.8575669893969229, + "grad_norm": 0.796875, + "learning_rate": 0.00010190386698490275, + "loss": 0.7642, + "step": 33398 + }, + { + "epoch": 0.8575926665928447, + "grad_norm": 0.81640625, + "learning_rate": 0.00010189940359046459, + "loss": 0.8905, + "step": 33399 + }, + { + "epoch": 0.8576183437887666, + "grad_norm": 0.73046875, + "learning_rate": 0.00010189494019224109, + "loss": 0.7932, + "step": 33400 + }, + { + "epoch": 0.8576440209846884, + "grad_norm": 0.90234375, + "learning_rate": 0.00010189047679024115, + "loss": 0.8079, + "step": 33401 + }, + { + "epoch": 0.8576696981806102, + "grad_norm": 0.7578125, + "learning_rate": 0.00010188601338447366, + "loss": 0.6998, + "step": 33402 + }, + { + "epoch": 0.857695375376532, + "grad_norm": 0.76171875, + "learning_rate": 0.0001018815499749475, + "loss": 0.8152, + "step": 33403 + }, + { + "epoch": 0.8577210525724538, + "grad_norm": 0.828125, + "learning_rate": 0.00010187708656167164, + "loss": 0.7674, + "step": 33404 + }, + { + "epoch": 0.8577467297683756, + "grad_norm": 0.73828125, + "learning_rate": 0.00010187262314465484, + "loss": 0.7402, + "step": 33405 + }, + { + "epoch": 0.8577724069642975, + "grad_norm": 0.91015625, + "learning_rate": 0.0001018681597239061, + "loss": 0.7695, + "step": 33406 + }, + { + "epoch": 0.8577980841602193, + "grad_norm": 0.73828125, + "learning_rate": 0.00010186369629943429, + "loss": 0.8346, + "step": 33407 + }, + { + "epoch": 0.8578237613561411, + "grad_norm": 0.8203125, + "learning_rate": 0.0001018592328712483, + "loss": 0.8637, + "step": 33408 + }, + { + "epoch": 0.857849438552063, + "grad_norm": 0.84375, + "learning_rate": 0.00010185476943935703, + "loss": 0.8129, + "step": 33409 + }, + { + "epoch": 0.8578751157479847, + "grad_norm": 0.8203125, + "learning_rate": 0.00010185030600376934, + "loss": 0.8006, + "step": 33410 + }, + { + "epoch": 0.8579007929439065, + "grad_norm": 0.73828125, + "learning_rate": 0.00010184584256449423, + "loss": 0.752, + "step": 33411 + }, + { + "epoch": 0.8579264701398284, + "grad_norm": 0.80859375, + "learning_rate": 0.00010184137912154048, + "loss": 0.8354, + "step": 33412 + }, + { + "epoch": 0.8579521473357502, + "grad_norm": 0.765625, + "learning_rate": 0.000101836915674917, + "loss": 0.8439, + "step": 33413 + }, + { + "epoch": 0.857977824531672, + "grad_norm": 0.765625, + "learning_rate": 0.00010183245222463275, + "loss": 0.7347, + "step": 33414 + }, + { + "epoch": 0.8580035017275939, + "grad_norm": 0.71875, + "learning_rate": 0.0001018279887706966, + "loss": 0.6543, + "step": 33415 + }, + { + "epoch": 0.8580291789235157, + "grad_norm": 0.75390625, + "learning_rate": 0.00010182352531311739, + "loss": 0.6297, + "step": 33416 + }, + { + "epoch": 0.8580548561194374, + "grad_norm": 0.76171875, + "learning_rate": 0.0001018190618519041, + "loss": 0.7406, + "step": 33417 + }, + { + "epoch": 0.8580805333153593, + "grad_norm": 0.71484375, + "learning_rate": 0.00010181459838706555, + "loss": 0.7687, + "step": 33418 + }, + { + "epoch": 0.8581062105112811, + "grad_norm": 0.78125, + "learning_rate": 0.00010181013491861068, + "loss": 0.7467, + "step": 33419 + }, + { + "epoch": 0.858131887707203, + "grad_norm": 0.7734375, + "learning_rate": 0.00010180567144654838, + "loss": 0.9095, + "step": 33420 + }, + { + "epoch": 0.8581575649031248, + "grad_norm": 0.76171875, + "learning_rate": 0.00010180120797088752, + "loss": 0.8221, + "step": 33421 + }, + { + "epoch": 0.8581832420990466, + "grad_norm": 0.88671875, + "learning_rate": 0.00010179674449163705, + "loss": 0.8406, + "step": 33422 + }, + { + "epoch": 0.8582089192949683, + "grad_norm": 0.7109375, + "learning_rate": 0.00010179228100880583, + "loss": 0.7104, + "step": 33423 + }, + { + "epoch": 0.8582345964908902, + "grad_norm": 0.796875, + "learning_rate": 0.00010178781752240274, + "loss": 0.8162, + "step": 33424 + }, + { + "epoch": 0.858260273686812, + "grad_norm": 0.6953125, + "learning_rate": 0.00010178335403243668, + "loss": 0.8605, + "step": 33425 + }, + { + "epoch": 0.8582859508827339, + "grad_norm": 0.828125, + "learning_rate": 0.00010177889053891656, + "loss": 0.8557, + "step": 33426 + }, + { + "epoch": 0.8583116280786557, + "grad_norm": 0.90234375, + "learning_rate": 0.00010177442704185127, + "loss": 0.7888, + "step": 33427 + }, + { + "epoch": 0.8583373052745775, + "grad_norm": 0.77734375, + "learning_rate": 0.00010176996354124973, + "loss": 0.8279, + "step": 33428 + }, + { + "epoch": 0.8583629824704994, + "grad_norm": 0.80078125, + "learning_rate": 0.0001017655000371208, + "loss": 0.8716, + "step": 33429 + }, + { + "epoch": 0.8583886596664211, + "grad_norm": 0.77734375, + "learning_rate": 0.00010176103652947339, + "loss": 0.6794, + "step": 33430 + }, + { + "epoch": 0.8584143368623429, + "grad_norm": 0.76171875, + "learning_rate": 0.00010175657301831638, + "loss": 0.7752, + "step": 33431 + }, + { + "epoch": 0.8584400140582648, + "grad_norm": 0.7578125, + "learning_rate": 0.0001017521095036587, + "loss": 0.8375, + "step": 33432 + }, + { + "epoch": 0.8584656912541866, + "grad_norm": 0.7890625, + "learning_rate": 0.00010174764598550921, + "loss": 0.8103, + "step": 33433 + }, + { + "epoch": 0.8584913684501084, + "grad_norm": 0.734375, + "learning_rate": 0.00010174318246387682, + "loss": 0.6703, + "step": 33434 + }, + { + "epoch": 0.8585170456460303, + "grad_norm": 0.8046875, + "learning_rate": 0.00010173871893877042, + "loss": 0.791, + "step": 33435 + }, + { + "epoch": 0.8585427228419521, + "grad_norm": 0.79296875, + "learning_rate": 0.00010173425541019892, + "loss": 0.8755, + "step": 33436 + }, + { + "epoch": 0.8585684000378738, + "grad_norm": 0.765625, + "learning_rate": 0.00010172979187817122, + "loss": 0.6202, + "step": 33437 + }, + { + "epoch": 0.8585940772337957, + "grad_norm": 0.77734375, + "learning_rate": 0.00010172532834269619, + "loss": 0.7207, + "step": 33438 + }, + { + "epoch": 0.8586197544297175, + "grad_norm": 0.75390625, + "learning_rate": 0.00010172086480378271, + "loss": 0.7792, + "step": 33439 + }, + { + "epoch": 0.8586454316256393, + "grad_norm": 0.7734375, + "learning_rate": 0.00010171640126143972, + "loss": 0.6752, + "step": 33440 + }, + { + "epoch": 0.8586711088215612, + "grad_norm": 0.80078125, + "learning_rate": 0.0001017119377156761, + "loss": 0.836, + "step": 33441 + }, + { + "epoch": 0.858696786017483, + "grad_norm": 0.76171875, + "learning_rate": 0.00010170747416650072, + "loss": 0.7297, + "step": 33442 + }, + { + "epoch": 0.8587224632134047, + "grad_norm": 0.6875, + "learning_rate": 0.00010170301061392253, + "loss": 0.7783, + "step": 33443 + }, + { + "epoch": 0.8587481404093266, + "grad_norm": 0.7890625, + "learning_rate": 0.00010169854705795041, + "loss": 0.7998, + "step": 33444 + }, + { + "epoch": 0.8587738176052484, + "grad_norm": 0.80859375, + "learning_rate": 0.00010169408349859322, + "loss": 0.8408, + "step": 33445 + }, + { + "epoch": 0.8587994948011702, + "grad_norm": 0.73828125, + "learning_rate": 0.00010168961993585986, + "loss": 0.8961, + "step": 33446 + }, + { + "epoch": 0.8588251719970921, + "grad_norm": 0.73046875, + "learning_rate": 0.00010168515636975924, + "loss": 0.7094, + "step": 33447 + }, + { + "epoch": 0.8588508491930139, + "grad_norm": 0.80859375, + "learning_rate": 0.00010168069280030026, + "loss": 0.7438, + "step": 33448 + }, + { + "epoch": 0.8588765263889357, + "grad_norm": 0.765625, + "learning_rate": 0.00010167622922749183, + "loss": 0.7029, + "step": 33449 + }, + { + "epoch": 0.8589022035848575, + "grad_norm": 0.7265625, + "learning_rate": 0.00010167176565134279, + "loss": 0.7763, + "step": 33450 + }, + { + "epoch": 0.8589278807807793, + "grad_norm": 0.77734375, + "learning_rate": 0.00010166730207186213, + "loss": 0.7051, + "step": 33451 + }, + { + "epoch": 0.8589535579767011, + "grad_norm": 0.80078125, + "learning_rate": 0.00010166283848905864, + "loss": 0.8068, + "step": 33452 + }, + { + "epoch": 0.858979235172623, + "grad_norm": 0.78125, + "learning_rate": 0.00010165837490294126, + "loss": 0.7086, + "step": 33453 + }, + { + "epoch": 0.8590049123685448, + "grad_norm": 0.78515625, + "learning_rate": 0.00010165391131351891, + "loss": 0.7151, + "step": 33454 + }, + { + "epoch": 0.8590305895644667, + "grad_norm": 0.80078125, + "learning_rate": 0.00010164944772080045, + "loss": 0.9115, + "step": 33455 + }, + { + "epoch": 0.8590562667603885, + "grad_norm": 0.7578125, + "learning_rate": 0.00010164498412479479, + "loss": 0.7438, + "step": 33456 + }, + { + "epoch": 0.8590819439563102, + "grad_norm": 0.7890625, + "learning_rate": 0.00010164052052551085, + "loss": 0.6802, + "step": 33457 + }, + { + "epoch": 0.859107621152232, + "grad_norm": 0.796875, + "learning_rate": 0.00010163605692295748, + "loss": 0.8804, + "step": 33458 + }, + { + "epoch": 0.8591332983481539, + "grad_norm": 0.82421875, + "learning_rate": 0.0001016315933171436, + "loss": 0.7503, + "step": 33459 + }, + { + "epoch": 0.8591589755440757, + "grad_norm": 0.77734375, + "learning_rate": 0.00010162712970807811, + "loss": 0.7263, + "step": 33460 + }, + { + "epoch": 0.8591846527399976, + "grad_norm": 0.71484375, + "learning_rate": 0.00010162266609576987, + "loss": 0.6923, + "step": 33461 + }, + { + "epoch": 0.8592103299359194, + "grad_norm": 0.76953125, + "learning_rate": 0.00010161820248022782, + "loss": 0.7385, + "step": 33462 + }, + { + "epoch": 0.8592360071318411, + "grad_norm": 0.7578125, + "learning_rate": 0.00010161373886146084, + "loss": 0.8586, + "step": 33463 + }, + { + "epoch": 0.859261684327763, + "grad_norm": 0.796875, + "learning_rate": 0.00010160927523947783, + "loss": 0.7629, + "step": 33464 + }, + { + "epoch": 0.8592873615236848, + "grad_norm": 0.81640625, + "learning_rate": 0.00010160481161428767, + "loss": 0.8334, + "step": 33465 + }, + { + "epoch": 0.8593130387196066, + "grad_norm": 0.7734375, + "learning_rate": 0.00010160034798589925, + "loss": 0.8692, + "step": 33466 + }, + { + "epoch": 0.8593387159155285, + "grad_norm": 0.8203125, + "learning_rate": 0.00010159588435432152, + "loss": 0.7203, + "step": 33467 + }, + { + "epoch": 0.8593643931114503, + "grad_norm": 0.71484375, + "learning_rate": 0.00010159142071956331, + "loss": 0.6158, + "step": 33468 + }, + { + "epoch": 0.8593900703073721, + "grad_norm": 0.90234375, + "learning_rate": 0.00010158695708163355, + "loss": 0.8598, + "step": 33469 + }, + { + "epoch": 0.8594157475032939, + "grad_norm": 0.796875, + "learning_rate": 0.00010158249344054116, + "loss": 0.8489, + "step": 33470 + }, + { + "epoch": 0.8594414246992157, + "grad_norm": 0.8125, + "learning_rate": 0.00010157802979629494, + "loss": 0.6827, + "step": 33471 + }, + { + "epoch": 0.8594671018951375, + "grad_norm": 0.80078125, + "learning_rate": 0.00010157356614890389, + "loss": 0.73, + "step": 33472 + }, + { + "epoch": 0.8594927790910594, + "grad_norm": 0.859375, + "learning_rate": 0.00010156910249837685, + "loss": 0.849, + "step": 33473 + }, + { + "epoch": 0.8595184562869812, + "grad_norm": 0.77734375, + "learning_rate": 0.00010156463884472273, + "loss": 0.8964, + "step": 33474 + }, + { + "epoch": 0.859544133482903, + "grad_norm": 0.73828125, + "learning_rate": 0.00010156017518795042, + "loss": 0.8193, + "step": 33475 + }, + { + "epoch": 0.8595698106788248, + "grad_norm": 0.82421875, + "learning_rate": 0.00010155571152806884, + "loss": 0.8055, + "step": 33476 + }, + { + "epoch": 0.8595954878747466, + "grad_norm": 0.796875, + "learning_rate": 0.00010155124786508687, + "loss": 0.7812, + "step": 33477 + }, + { + "epoch": 0.8596211650706684, + "grad_norm": 0.875, + "learning_rate": 0.0001015467841990134, + "loss": 0.845, + "step": 33478 + }, + { + "epoch": 0.8596468422665903, + "grad_norm": 0.78515625, + "learning_rate": 0.0001015423205298573, + "loss": 0.8094, + "step": 33479 + }, + { + "epoch": 0.8596725194625121, + "grad_norm": 0.7734375, + "learning_rate": 0.00010153785685762754, + "loss": 0.8017, + "step": 33480 + }, + { + "epoch": 0.8596981966584339, + "grad_norm": 0.82421875, + "learning_rate": 0.00010153339318233294, + "loss": 0.8115, + "step": 33481 + }, + { + "epoch": 0.8597238738543558, + "grad_norm": 0.8046875, + "learning_rate": 0.00010152892950398243, + "loss": 0.8125, + "step": 33482 + }, + { + "epoch": 0.8597495510502775, + "grad_norm": 0.76171875, + "learning_rate": 0.00010152446582258492, + "loss": 0.8575, + "step": 33483 + }, + { + "epoch": 0.8597752282461993, + "grad_norm": 0.765625, + "learning_rate": 0.00010152000213814925, + "loss": 0.7567, + "step": 33484 + }, + { + "epoch": 0.8598009054421212, + "grad_norm": 0.8125, + "learning_rate": 0.00010151553845068438, + "loss": 0.873, + "step": 33485 + }, + { + "epoch": 0.859826582638043, + "grad_norm": 0.77734375, + "learning_rate": 0.00010151107476019918, + "loss": 0.7552, + "step": 33486 + }, + { + "epoch": 0.8598522598339648, + "grad_norm": 0.75390625, + "learning_rate": 0.00010150661106670251, + "loss": 0.8045, + "step": 33487 + }, + { + "epoch": 0.8598779370298867, + "grad_norm": 0.7578125, + "learning_rate": 0.00010150214737020337, + "loss": 0.7616, + "step": 33488 + }, + { + "epoch": 0.8599036142258085, + "grad_norm": 0.77734375, + "learning_rate": 0.00010149768367071053, + "loss": 0.8152, + "step": 33489 + }, + { + "epoch": 0.8599292914217302, + "grad_norm": 0.7265625, + "learning_rate": 0.00010149321996823296, + "loss": 0.6982, + "step": 33490 + }, + { + "epoch": 0.8599549686176521, + "grad_norm": 0.75390625, + "learning_rate": 0.00010148875626277953, + "loss": 0.7562, + "step": 33491 + }, + { + "epoch": 0.8599806458135739, + "grad_norm": 0.77734375, + "learning_rate": 0.00010148429255435916, + "loss": 0.8202, + "step": 33492 + }, + { + "epoch": 0.8600063230094958, + "grad_norm": 0.81640625, + "learning_rate": 0.00010147982884298071, + "loss": 0.8875, + "step": 33493 + }, + { + "epoch": 0.8600320002054176, + "grad_norm": 0.75390625, + "learning_rate": 0.0001014753651286531, + "loss": 0.773, + "step": 33494 + }, + { + "epoch": 0.8600576774013394, + "grad_norm": 0.7890625, + "learning_rate": 0.0001014709014113852, + "loss": 0.7704, + "step": 33495 + }, + { + "epoch": 0.8600833545972612, + "grad_norm": 0.80859375, + "learning_rate": 0.00010146643769118598, + "loss": 0.8, + "step": 33496 + }, + { + "epoch": 0.860109031793183, + "grad_norm": 0.8125, + "learning_rate": 0.00010146197396806425, + "loss": 0.7699, + "step": 33497 + }, + { + "epoch": 0.8601347089891048, + "grad_norm": 0.73046875, + "learning_rate": 0.00010145751024202891, + "loss": 0.7895, + "step": 33498 + }, + { + "epoch": 0.8601603861850267, + "grad_norm": 0.79296875, + "learning_rate": 0.00010145304651308893, + "loss": 0.8719, + "step": 33499 + }, + { + "epoch": 0.8601860633809485, + "grad_norm": 0.75390625, + "learning_rate": 0.00010144858278125316, + "loss": 0.7046, + "step": 33500 + }, + { + "epoch": 0.8602117405768703, + "grad_norm": 0.828125, + "learning_rate": 0.00010144411904653045, + "loss": 0.8562, + "step": 33501 + }, + { + "epoch": 0.8602374177727922, + "grad_norm": 0.734375, + "learning_rate": 0.00010143965530892976, + "loss": 0.7723, + "step": 33502 + }, + { + "epoch": 0.8602630949687139, + "grad_norm": 0.796875, + "learning_rate": 0.00010143519156846, + "loss": 0.7631, + "step": 33503 + }, + { + "epoch": 0.8602887721646357, + "grad_norm": 0.7421875, + "learning_rate": 0.00010143072782513, + "loss": 0.6906, + "step": 33504 + }, + { + "epoch": 0.8603144493605576, + "grad_norm": 0.7890625, + "learning_rate": 0.0001014262640789487, + "loss": 0.6822, + "step": 33505 + }, + { + "epoch": 0.8603401265564794, + "grad_norm": 1.015625, + "learning_rate": 0.00010142180032992496, + "loss": 0.9178, + "step": 33506 + }, + { + "epoch": 0.8603658037524012, + "grad_norm": 0.79296875, + "learning_rate": 0.00010141733657806774, + "loss": 0.7045, + "step": 33507 + }, + { + "epoch": 0.8603914809483231, + "grad_norm": 0.79296875, + "learning_rate": 0.00010141287282338587, + "loss": 0.8498, + "step": 33508 + }, + { + "epoch": 0.8604171581442449, + "grad_norm": 0.80859375, + "learning_rate": 0.00010140840906588829, + "loss": 0.8143, + "step": 33509 + }, + { + "epoch": 0.8604428353401666, + "grad_norm": 0.8515625, + "learning_rate": 0.00010140394530558386, + "loss": 0.9625, + "step": 33510 + }, + { + "epoch": 0.8604685125360885, + "grad_norm": 0.76953125, + "learning_rate": 0.00010139948154248149, + "loss": 0.7657, + "step": 33511 + }, + { + "epoch": 0.8604941897320103, + "grad_norm": 0.75390625, + "learning_rate": 0.00010139501777659011, + "loss": 0.8004, + "step": 33512 + }, + { + "epoch": 0.8605198669279321, + "grad_norm": 0.79296875, + "learning_rate": 0.00010139055400791856, + "loss": 0.9242, + "step": 33513 + }, + { + "epoch": 0.860545544123854, + "grad_norm": 0.8125, + "learning_rate": 0.00010138609023647576, + "loss": 0.7363, + "step": 33514 + }, + { + "epoch": 0.8605712213197758, + "grad_norm": 0.80078125, + "learning_rate": 0.0001013816264622706, + "loss": 0.7558, + "step": 33515 + }, + { + "epoch": 0.8605968985156975, + "grad_norm": 0.703125, + "learning_rate": 0.00010137716268531201, + "loss": 0.7902, + "step": 33516 + }, + { + "epoch": 0.8606225757116194, + "grad_norm": 0.85546875, + "learning_rate": 0.00010137269890560885, + "loss": 0.746, + "step": 33517 + }, + { + "epoch": 0.8606482529075412, + "grad_norm": 0.79296875, + "learning_rate": 0.00010136823512317005, + "loss": 0.8646, + "step": 33518 + }, + { + "epoch": 0.860673930103463, + "grad_norm": 0.7578125, + "learning_rate": 0.00010136377133800443, + "loss": 0.8528, + "step": 33519 + }, + { + "epoch": 0.8606996072993849, + "grad_norm": 0.77734375, + "learning_rate": 0.00010135930755012095, + "loss": 0.8999, + "step": 33520 + }, + { + "epoch": 0.8607252844953067, + "grad_norm": 0.73828125, + "learning_rate": 0.0001013548437595285, + "loss": 0.7046, + "step": 33521 + }, + { + "epoch": 0.8607509616912286, + "grad_norm": 0.88671875, + "learning_rate": 0.000101350379966236, + "loss": 0.874, + "step": 33522 + }, + { + "epoch": 0.8607766388871503, + "grad_norm": 0.8359375, + "learning_rate": 0.00010134591617025229, + "loss": 0.8797, + "step": 33523 + }, + { + "epoch": 0.8608023160830721, + "grad_norm": 0.74609375, + "learning_rate": 0.00010134145237158628, + "loss": 0.7431, + "step": 33524 + }, + { + "epoch": 0.860827993278994, + "grad_norm": 0.79296875, + "learning_rate": 0.00010133698857024687, + "loss": 0.7568, + "step": 33525 + }, + { + "epoch": 0.8608536704749158, + "grad_norm": 0.7578125, + "learning_rate": 0.000101332524766243, + "loss": 0.7912, + "step": 33526 + }, + { + "epoch": 0.8608793476708376, + "grad_norm": 0.76171875, + "learning_rate": 0.00010132806095958347, + "loss": 0.7666, + "step": 33527 + }, + { + "epoch": 0.8609050248667595, + "grad_norm": 0.796875, + "learning_rate": 0.00010132359715027728, + "loss": 0.8446, + "step": 33528 + }, + { + "epoch": 0.8609307020626813, + "grad_norm": 0.79296875, + "learning_rate": 0.00010131913333833329, + "loss": 0.7994, + "step": 33529 + }, + { + "epoch": 0.860956379258603, + "grad_norm": 0.76171875, + "learning_rate": 0.00010131466952376037, + "loss": 0.8349, + "step": 33530 + }, + { + "epoch": 0.8609820564545249, + "grad_norm": 0.75390625, + "learning_rate": 0.00010131020570656741, + "loss": 0.7579, + "step": 33531 + }, + { + "epoch": 0.8610077336504467, + "grad_norm": 0.82421875, + "learning_rate": 0.00010130574188676333, + "loss": 0.7221, + "step": 33532 + }, + { + "epoch": 0.8610334108463685, + "grad_norm": 0.80078125, + "learning_rate": 0.00010130127806435705, + "loss": 0.7908, + "step": 33533 + }, + { + "epoch": 0.8610590880422904, + "grad_norm": 0.74609375, + "learning_rate": 0.00010129681423935744, + "loss": 0.7538, + "step": 33534 + }, + { + "epoch": 0.8610847652382122, + "grad_norm": 0.7890625, + "learning_rate": 0.00010129235041177339, + "loss": 1.0019, + "step": 33535 + }, + { + "epoch": 0.8611104424341339, + "grad_norm": 0.83203125, + "learning_rate": 0.00010128788658161381, + "loss": 0.7713, + "step": 33536 + }, + { + "epoch": 0.8611361196300558, + "grad_norm": 0.78125, + "learning_rate": 0.00010128342274888759, + "loss": 0.8299, + "step": 33537 + }, + { + "epoch": 0.8611617968259776, + "grad_norm": 0.75, + "learning_rate": 0.00010127895891360359, + "loss": 0.6624, + "step": 33538 + }, + { + "epoch": 0.8611874740218994, + "grad_norm": 0.88671875, + "learning_rate": 0.00010127449507577078, + "loss": 0.8212, + "step": 33539 + }, + { + "epoch": 0.8612131512178213, + "grad_norm": 1.015625, + "learning_rate": 0.000101270031235398, + "loss": 0.898, + "step": 33540 + }, + { + "epoch": 0.8612388284137431, + "grad_norm": 0.75, + "learning_rate": 0.00010126556739249415, + "loss": 0.8344, + "step": 33541 + }, + { + "epoch": 0.8612645056096649, + "grad_norm": 0.77734375, + "learning_rate": 0.00010126110354706817, + "loss": 0.7703, + "step": 33542 + }, + { + "epoch": 0.8612901828055867, + "grad_norm": 0.7421875, + "learning_rate": 0.00010125663969912888, + "loss": 0.7538, + "step": 33543 + }, + { + "epoch": 0.8613158600015085, + "grad_norm": 0.7890625, + "learning_rate": 0.00010125217584868528, + "loss": 0.7979, + "step": 33544 + }, + { + "epoch": 0.8613415371974303, + "grad_norm": 0.7890625, + "learning_rate": 0.00010124771199574616, + "loss": 0.7294, + "step": 33545 + }, + { + "epoch": 0.8613672143933522, + "grad_norm": 0.875, + "learning_rate": 0.00010124324814032047, + "loss": 0.841, + "step": 33546 + }, + { + "epoch": 0.861392891589274, + "grad_norm": 0.7890625, + "learning_rate": 0.00010123878428241711, + "loss": 0.8588, + "step": 33547 + }, + { + "epoch": 0.8614185687851958, + "grad_norm": 0.765625, + "learning_rate": 0.00010123432042204495, + "loss": 0.8222, + "step": 33548 + }, + { + "epoch": 0.8614442459811177, + "grad_norm": 0.79296875, + "learning_rate": 0.00010122985655921294, + "loss": 0.7154, + "step": 33549 + }, + { + "epoch": 0.8614699231770394, + "grad_norm": 0.75390625, + "learning_rate": 0.0001012253926939299, + "loss": 0.7262, + "step": 33550 + }, + { + "epoch": 0.8614956003729612, + "grad_norm": 0.80078125, + "learning_rate": 0.00010122092882620475, + "loss": 0.825, + "step": 33551 + }, + { + "epoch": 0.8615212775688831, + "grad_norm": 0.7578125, + "learning_rate": 0.00010121646495604642, + "loss": 0.7348, + "step": 33552 + }, + { + "epoch": 0.8615469547648049, + "grad_norm": 0.765625, + "learning_rate": 0.00010121200108346381, + "loss": 0.6944, + "step": 33553 + }, + { + "epoch": 0.8615726319607268, + "grad_norm": 0.7890625, + "learning_rate": 0.00010120753720846574, + "loss": 0.7349, + "step": 33554 + }, + { + "epoch": 0.8615983091566486, + "grad_norm": 0.828125, + "learning_rate": 0.00010120307333106121, + "loss": 0.8909, + "step": 33555 + }, + { + "epoch": 0.8616239863525703, + "grad_norm": 1.0078125, + "learning_rate": 0.000101198609451259, + "loss": 0.8614, + "step": 33556 + }, + { + "epoch": 0.8616496635484922, + "grad_norm": 0.765625, + "learning_rate": 0.00010119414556906812, + "loss": 0.8176, + "step": 33557 + }, + { + "epoch": 0.861675340744414, + "grad_norm": 0.76171875, + "learning_rate": 0.00010118968168449741, + "loss": 0.8203, + "step": 33558 + }, + { + "epoch": 0.8617010179403358, + "grad_norm": 0.7265625, + "learning_rate": 0.00010118521779755575, + "loss": 0.748, + "step": 33559 + }, + { + "epoch": 0.8617266951362577, + "grad_norm": 0.76171875, + "learning_rate": 0.00010118075390825207, + "loss": 0.8925, + "step": 33560 + }, + { + "epoch": 0.8617523723321795, + "grad_norm": 0.765625, + "learning_rate": 0.00010117629001659527, + "loss": 0.7878, + "step": 33561 + }, + { + "epoch": 0.8617780495281013, + "grad_norm": 0.84765625, + "learning_rate": 0.00010117182612259421, + "loss": 0.723, + "step": 33562 + }, + { + "epoch": 0.8618037267240231, + "grad_norm": 0.81640625, + "learning_rate": 0.00010116736222625782, + "loss": 0.8284, + "step": 33563 + }, + { + "epoch": 0.8618294039199449, + "grad_norm": 0.80078125, + "learning_rate": 0.00010116289832759496, + "loss": 0.709, + "step": 33564 + }, + { + "epoch": 0.8618550811158667, + "grad_norm": 1.15625, + "learning_rate": 0.00010115843442661456, + "loss": 0.6913, + "step": 33565 + }, + { + "epoch": 0.8618807583117886, + "grad_norm": 0.81640625, + "learning_rate": 0.00010115397052332554, + "loss": 0.8487, + "step": 33566 + }, + { + "epoch": 0.8619064355077104, + "grad_norm": 0.80078125, + "learning_rate": 0.0001011495066177367, + "loss": 0.7681, + "step": 33567 + }, + { + "epoch": 0.8619321127036322, + "grad_norm": 0.72265625, + "learning_rate": 0.00010114504270985704, + "loss": 0.6829, + "step": 33568 + }, + { + "epoch": 0.8619577898995541, + "grad_norm": 0.8046875, + "learning_rate": 0.00010114057879969543, + "loss": 0.7352, + "step": 33569 + }, + { + "epoch": 0.8619834670954758, + "grad_norm": 0.82421875, + "learning_rate": 0.00010113611488726069, + "loss": 0.8592, + "step": 33570 + }, + { + "epoch": 0.8620091442913976, + "grad_norm": 0.765625, + "learning_rate": 0.0001011316509725618, + "loss": 0.7936, + "step": 33571 + }, + { + "epoch": 0.8620348214873195, + "grad_norm": 0.75390625, + "learning_rate": 0.00010112718705560764, + "loss": 0.755, + "step": 33572 + }, + { + "epoch": 0.8620604986832413, + "grad_norm": 0.87109375, + "learning_rate": 0.00010112272313640709, + "loss": 0.7645, + "step": 33573 + }, + { + "epoch": 0.8620861758791631, + "grad_norm": 0.7578125, + "learning_rate": 0.00010111825921496907, + "loss": 0.697, + "step": 33574 + }, + { + "epoch": 0.862111853075085, + "grad_norm": 0.76953125, + "learning_rate": 0.00010111379529130245, + "loss": 0.7497, + "step": 33575 + }, + { + "epoch": 0.8621375302710067, + "grad_norm": 0.77734375, + "learning_rate": 0.00010110933136541614, + "loss": 0.862, + "step": 33576 + }, + { + "epoch": 0.8621632074669285, + "grad_norm": 0.73046875, + "learning_rate": 0.00010110486743731904, + "loss": 0.7221, + "step": 33577 + }, + { + "epoch": 0.8621888846628504, + "grad_norm": 0.8125, + "learning_rate": 0.00010110040350702001, + "loss": 0.8201, + "step": 33578 + }, + { + "epoch": 0.8622145618587722, + "grad_norm": 0.7890625, + "learning_rate": 0.00010109593957452801, + "loss": 0.7856, + "step": 33579 + }, + { + "epoch": 0.862240239054694, + "grad_norm": 0.734375, + "learning_rate": 0.00010109147563985188, + "loss": 0.7438, + "step": 33580 + }, + { + "epoch": 0.8622659162506159, + "grad_norm": 0.7421875, + "learning_rate": 0.00010108701170300052, + "loss": 0.7674, + "step": 33581 + }, + { + "epoch": 0.8622915934465377, + "grad_norm": 0.73828125, + "learning_rate": 0.00010108254776398289, + "loss": 0.7675, + "step": 33582 + }, + { + "epoch": 0.8623172706424594, + "grad_norm": 0.84375, + "learning_rate": 0.00010107808382280781, + "loss": 0.7575, + "step": 33583 + }, + { + "epoch": 0.8623429478383813, + "grad_norm": 0.7578125, + "learning_rate": 0.00010107361987948422, + "loss": 0.7526, + "step": 33584 + }, + { + "epoch": 0.8623686250343031, + "grad_norm": 0.77734375, + "learning_rate": 0.000101069155934021, + "loss": 0.8855, + "step": 33585 + }, + { + "epoch": 0.862394302230225, + "grad_norm": 0.7421875, + "learning_rate": 0.00010106469198642703, + "loss": 0.8606, + "step": 33586 + }, + { + "epoch": 0.8624199794261468, + "grad_norm": 0.8125, + "learning_rate": 0.00010106022803671124, + "loss": 0.8965, + "step": 33587 + }, + { + "epoch": 0.8624456566220686, + "grad_norm": 0.76171875, + "learning_rate": 0.00010105576408488251, + "loss": 0.7677, + "step": 33588 + }, + { + "epoch": 0.8624713338179905, + "grad_norm": 0.8125, + "learning_rate": 0.00010105130013094975, + "loss": 0.7725, + "step": 33589 + }, + { + "epoch": 0.8624970110139122, + "grad_norm": 0.73828125, + "learning_rate": 0.00010104683617492183, + "loss": 0.7912, + "step": 33590 + }, + { + "epoch": 0.862522688209834, + "grad_norm": 0.76171875, + "learning_rate": 0.00010104237221680765, + "loss": 0.736, + "step": 33591 + }, + { + "epoch": 0.8625483654057559, + "grad_norm": 0.75, + "learning_rate": 0.00010103790825661615, + "loss": 0.8556, + "step": 33592 + }, + { + "epoch": 0.8625740426016777, + "grad_norm": 0.984375, + "learning_rate": 0.00010103344429435617, + "loss": 0.9308, + "step": 33593 + }, + { + "epoch": 0.8625997197975995, + "grad_norm": 0.8359375, + "learning_rate": 0.0001010289803300366, + "loss": 0.8506, + "step": 33594 + }, + { + "epoch": 0.8626253969935214, + "grad_norm": 0.76953125, + "learning_rate": 0.00010102451636366644, + "loss": 0.7746, + "step": 33595 + }, + { + "epoch": 0.8626510741894431, + "grad_norm": 0.86328125, + "learning_rate": 0.00010102005239525443, + "loss": 0.7944, + "step": 33596 + }, + { + "epoch": 0.8626767513853649, + "grad_norm": 1.0, + "learning_rate": 0.0001010155884248096, + "loss": 0.9054, + "step": 33597 + }, + { + "epoch": 0.8627024285812868, + "grad_norm": 0.734375, + "learning_rate": 0.00010101112445234078, + "loss": 0.6532, + "step": 33598 + }, + { + "epoch": 0.8627281057772086, + "grad_norm": 0.78125, + "learning_rate": 0.00010100666047785686, + "loss": 0.8262, + "step": 33599 + }, + { + "epoch": 0.8627537829731304, + "grad_norm": 0.765625, + "learning_rate": 0.00010100219650136678, + "loss": 0.813, + "step": 33600 + }, + { + "epoch": 0.8627794601690523, + "grad_norm": 0.734375, + "learning_rate": 0.00010099773252287941, + "loss": 0.632, + "step": 33601 + }, + { + "epoch": 0.8628051373649741, + "grad_norm": 0.79296875, + "learning_rate": 0.00010099326854240365, + "loss": 0.8439, + "step": 33602 + }, + { + "epoch": 0.8628308145608958, + "grad_norm": 0.8046875, + "learning_rate": 0.00010098880455994841, + "loss": 0.8341, + "step": 33603 + }, + { + "epoch": 0.8628564917568177, + "grad_norm": 0.76171875, + "learning_rate": 0.00010098434057552253, + "loss": 0.8445, + "step": 33604 + }, + { + "epoch": 0.8628821689527395, + "grad_norm": 0.82421875, + "learning_rate": 0.00010097987658913499, + "loss": 0.8424, + "step": 33605 + }, + { + "epoch": 0.8629078461486613, + "grad_norm": 0.7578125, + "learning_rate": 0.00010097541260079462, + "loss": 0.7027, + "step": 33606 + }, + { + "epoch": 0.8629335233445832, + "grad_norm": 0.76171875, + "learning_rate": 0.00010097094861051032, + "loss": 0.8145, + "step": 33607 + }, + { + "epoch": 0.862959200540505, + "grad_norm": 0.796875, + "learning_rate": 0.00010096648461829108, + "loss": 0.7954, + "step": 33608 + }, + { + "epoch": 0.8629848777364268, + "grad_norm": 0.75, + "learning_rate": 0.00010096202062414566, + "loss": 0.8195, + "step": 33609 + }, + { + "epoch": 0.8630105549323486, + "grad_norm": 0.73046875, + "learning_rate": 0.00010095755662808304, + "loss": 0.8044, + "step": 33610 + }, + { + "epoch": 0.8630362321282704, + "grad_norm": 0.7890625, + "learning_rate": 0.0001009530926301121, + "loss": 0.8153, + "step": 33611 + }, + { + "epoch": 0.8630619093241922, + "grad_norm": 0.78125, + "learning_rate": 0.0001009486286302417, + "loss": 0.7962, + "step": 33612 + }, + { + "epoch": 0.8630875865201141, + "grad_norm": 0.8515625, + "learning_rate": 0.00010094416462848082, + "loss": 0.7382, + "step": 33613 + }, + { + "epoch": 0.8631132637160359, + "grad_norm": 0.8046875, + "learning_rate": 0.00010093970062483829, + "loss": 1.0214, + "step": 33614 + }, + { + "epoch": 0.8631389409119578, + "grad_norm": 0.8125, + "learning_rate": 0.00010093523661932301, + "loss": 0.8097, + "step": 33615 + }, + { + "epoch": 0.8631646181078795, + "grad_norm": 0.76171875, + "learning_rate": 0.0001009307726119439, + "loss": 0.8502, + "step": 33616 + }, + { + "epoch": 0.8631902953038013, + "grad_norm": 0.73046875, + "learning_rate": 0.00010092630860270984, + "loss": 0.7275, + "step": 33617 + }, + { + "epoch": 0.8632159724997231, + "grad_norm": 0.8125, + "learning_rate": 0.0001009218445916297, + "loss": 0.9218, + "step": 33618 + }, + { + "epoch": 0.863241649695645, + "grad_norm": 0.7578125, + "learning_rate": 0.00010091738057871243, + "loss": 0.8296, + "step": 33619 + }, + { + "epoch": 0.8632673268915668, + "grad_norm": 0.69140625, + "learning_rate": 0.00010091291656396688, + "loss": 0.722, + "step": 33620 + }, + { + "epoch": 0.8632930040874887, + "grad_norm": 0.80078125, + "learning_rate": 0.00010090845254740205, + "loss": 0.6821, + "step": 33621 + }, + { + "epoch": 0.8633186812834105, + "grad_norm": 0.80859375, + "learning_rate": 0.0001009039885290267, + "loss": 0.8521, + "step": 33622 + }, + { + "epoch": 0.8633443584793322, + "grad_norm": 0.9375, + "learning_rate": 0.00010089952450884976, + "loss": 0.6976, + "step": 33623 + }, + { + "epoch": 0.863370035675254, + "grad_norm": 0.7421875, + "learning_rate": 0.00010089506048688018, + "loss": 0.9209, + "step": 33624 + }, + { + "epoch": 0.8633957128711759, + "grad_norm": 0.79296875, + "learning_rate": 0.00010089059646312682, + "loss": 0.7613, + "step": 33625 + }, + { + "epoch": 0.8634213900670977, + "grad_norm": 0.75390625, + "learning_rate": 0.00010088613243759856, + "loss": 0.6799, + "step": 33626 + }, + { + "epoch": 0.8634470672630196, + "grad_norm": 0.796875, + "learning_rate": 0.00010088166841030436, + "loss": 0.7434, + "step": 33627 + }, + { + "epoch": 0.8634727444589414, + "grad_norm": 0.84375, + "learning_rate": 0.00010087720438125305, + "loss": 0.7806, + "step": 33628 + }, + { + "epoch": 0.8634984216548632, + "grad_norm": 0.7734375, + "learning_rate": 0.00010087274035045356, + "loss": 0.7201, + "step": 33629 + }, + { + "epoch": 0.863524098850785, + "grad_norm": 0.7578125, + "learning_rate": 0.00010086827631791478, + "loss": 0.766, + "step": 33630 + }, + { + "epoch": 0.8635497760467068, + "grad_norm": 0.76171875, + "learning_rate": 0.00010086381228364556, + "loss": 0.8499, + "step": 33631 + }, + { + "epoch": 0.8635754532426286, + "grad_norm": 0.69921875, + "learning_rate": 0.00010085934824765488, + "loss": 0.676, + "step": 33632 + }, + { + "epoch": 0.8636011304385505, + "grad_norm": 0.73046875, + "learning_rate": 0.00010085488420995159, + "loss": 0.7336, + "step": 33633 + }, + { + "epoch": 0.8636268076344723, + "grad_norm": 0.8125, + "learning_rate": 0.00010085042017054462, + "loss": 0.8474, + "step": 33634 + }, + { + "epoch": 0.8636524848303941, + "grad_norm": 0.74609375, + "learning_rate": 0.0001008459561294428, + "loss": 0.8577, + "step": 33635 + }, + { + "epoch": 0.8636781620263159, + "grad_norm": 0.82421875, + "learning_rate": 0.00010084149208665508, + "loss": 0.8626, + "step": 33636 + }, + { + "epoch": 0.8637038392222377, + "grad_norm": 0.7890625, + "learning_rate": 0.00010083702804219035, + "loss": 0.8088, + "step": 33637 + }, + { + "epoch": 0.8637295164181595, + "grad_norm": 0.83984375, + "learning_rate": 0.0001008325639960575, + "loss": 0.79, + "step": 33638 + }, + { + "epoch": 0.8637551936140814, + "grad_norm": 0.79296875, + "learning_rate": 0.00010082809994826538, + "loss": 0.8237, + "step": 33639 + }, + { + "epoch": 0.8637808708100032, + "grad_norm": 0.7734375, + "learning_rate": 0.00010082363589882297, + "loss": 0.6847, + "step": 33640 + }, + { + "epoch": 0.863806548005925, + "grad_norm": 0.82421875, + "learning_rate": 0.00010081917184773915, + "loss": 0.797, + "step": 33641 + }, + { + "epoch": 0.8638322252018469, + "grad_norm": 0.8203125, + "learning_rate": 0.00010081470779502276, + "loss": 0.7324, + "step": 33642 + }, + { + "epoch": 0.8638579023977686, + "grad_norm": 0.7890625, + "learning_rate": 0.00010081024374068275, + "loss": 0.8961, + "step": 33643 + }, + { + "epoch": 0.8638835795936904, + "grad_norm": 0.8046875, + "learning_rate": 0.00010080577968472798, + "loss": 0.9567, + "step": 33644 + }, + { + "epoch": 0.8639092567896123, + "grad_norm": 0.8359375, + "learning_rate": 0.00010080131562716737, + "loss": 0.7196, + "step": 33645 + }, + { + "epoch": 0.8639349339855341, + "grad_norm": 0.80859375, + "learning_rate": 0.00010079685156800982, + "loss": 0.7985, + "step": 33646 + }, + { + "epoch": 0.863960611181456, + "grad_norm": 0.78515625, + "learning_rate": 0.00010079238750726421, + "loss": 0.7947, + "step": 33647 + }, + { + "epoch": 0.8639862883773778, + "grad_norm": 0.7734375, + "learning_rate": 0.00010078792344493946, + "loss": 0.738, + "step": 33648 + }, + { + "epoch": 0.8640119655732996, + "grad_norm": 0.87109375, + "learning_rate": 0.00010078345938104441, + "loss": 0.8478, + "step": 33649 + }, + { + "epoch": 0.8640376427692213, + "grad_norm": 0.76171875, + "learning_rate": 0.00010077899531558801, + "loss": 0.844, + "step": 33650 + }, + { + "epoch": 0.8640633199651432, + "grad_norm": 0.765625, + "learning_rate": 0.00010077453124857917, + "loss": 0.8427, + "step": 33651 + }, + { + "epoch": 0.864088997161065, + "grad_norm": 0.8046875, + "learning_rate": 0.0001007700671800267, + "loss": 0.6925, + "step": 33652 + }, + { + "epoch": 0.8641146743569869, + "grad_norm": 0.7421875, + "learning_rate": 0.00010076560310993961, + "loss": 0.659, + "step": 33653 + }, + { + "epoch": 0.8641403515529087, + "grad_norm": 0.84765625, + "learning_rate": 0.00010076113903832676, + "loss": 0.8384, + "step": 33654 + }, + { + "epoch": 0.8641660287488305, + "grad_norm": 0.87109375, + "learning_rate": 0.00010075667496519698, + "loss": 0.7792, + "step": 33655 + }, + { + "epoch": 0.8641917059447523, + "grad_norm": 0.91796875, + "learning_rate": 0.00010075221089055923, + "loss": 0.8556, + "step": 33656 + }, + { + "epoch": 0.8642173831406741, + "grad_norm": 0.74609375, + "learning_rate": 0.00010074774681442239, + "loss": 0.7219, + "step": 33657 + }, + { + "epoch": 0.8642430603365959, + "grad_norm": 0.80859375, + "learning_rate": 0.00010074328273679537, + "loss": 0.8077, + "step": 33658 + }, + { + "epoch": 0.8642687375325178, + "grad_norm": 0.82421875, + "learning_rate": 0.00010073881865768703, + "loss": 0.757, + "step": 33659 + }, + { + "epoch": 0.8642944147284396, + "grad_norm": 0.7421875, + "learning_rate": 0.00010073435457710631, + "loss": 0.8415, + "step": 33660 + }, + { + "epoch": 0.8643200919243614, + "grad_norm": 0.8984375, + "learning_rate": 0.00010072989049506212, + "loss": 0.8145, + "step": 33661 + }, + { + "epoch": 0.8643457691202833, + "grad_norm": 0.8203125, + "learning_rate": 0.00010072542641156328, + "loss": 0.8888, + "step": 33662 + }, + { + "epoch": 0.864371446316205, + "grad_norm": 0.828125, + "learning_rate": 0.00010072096232661871, + "loss": 0.8777, + "step": 33663 + }, + { + "epoch": 0.8643971235121268, + "grad_norm": 0.953125, + "learning_rate": 0.00010071649824023737, + "loss": 0.8745, + "step": 33664 + }, + { + "epoch": 0.8644228007080487, + "grad_norm": 0.828125, + "learning_rate": 0.00010071203415242811, + "loss": 0.8141, + "step": 33665 + }, + { + "epoch": 0.8644484779039705, + "grad_norm": 0.78125, + "learning_rate": 0.00010070757006319982, + "loss": 0.8441, + "step": 33666 + }, + { + "epoch": 0.8644741550998923, + "grad_norm": 0.89453125, + "learning_rate": 0.00010070310597256143, + "loss": 0.8359, + "step": 33667 + }, + { + "epoch": 0.8644998322958142, + "grad_norm": 0.78515625, + "learning_rate": 0.00010069864188052176, + "loss": 0.9175, + "step": 33668 + }, + { + "epoch": 0.8645255094917359, + "grad_norm": 0.9296875, + "learning_rate": 0.0001006941777870898, + "loss": 0.9175, + "step": 33669 + }, + { + "epoch": 0.8645511866876577, + "grad_norm": 0.7734375, + "learning_rate": 0.0001006897136922744, + "loss": 0.7018, + "step": 33670 + }, + { + "epoch": 0.8645768638835796, + "grad_norm": 0.890625, + "learning_rate": 0.00010068524959608446, + "loss": 0.8735, + "step": 33671 + }, + { + "epoch": 0.8646025410795014, + "grad_norm": 0.76953125, + "learning_rate": 0.00010068078549852888, + "loss": 0.7232, + "step": 33672 + }, + { + "epoch": 0.8646282182754232, + "grad_norm": 0.8125, + "learning_rate": 0.00010067632139961655, + "loss": 0.6903, + "step": 33673 + }, + { + "epoch": 0.8646538954713451, + "grad_norm": 0.7578125, + "learning_rate": 0.0001006718572993564, + "loss": 0.8199, + "step": 33674 + }, + { + "epoch": 0.8646795726672669, + "grad_norm": 0.70703125, + "learning_rate": 0.00010066739319775726, + "loss": 0.7708, + "step": 33675 + }, + { + "epoch": 0.8647052498631886, + "grad_norm": 0.73828125, + "learning_rate": 0.00010066292909482808, + "loss": 0.8374, + "step": 33676 + }, + { + "epoch": 0.8647309270591105, + "grad_norm": 0.8203125, + "learning_rate": 0.00010065846499057774, + "loss": 0.7318, + "step": 33677 + }, + { + "epoch": 0.8647566042550323, + "grad_norm": 0.8828125, + "learning_rate": 0.00010065400088501515, + "loss": 0.756, + "step": 33678 + }, + { + "epoch": 0.8647822814509541, + "grad_norm": 0.79296875, + "learning_rate": 0.00010064953677814918, + "loss": 0.7993, + "step": 33679 + }, + { + "epoch": 0.864807958646876, + "grad_norm": 0.75390625, + "learning_rate": 0.00010064507266998877, + "loss": 0.6879, + "step": 33680 + }, + { + "epoch": 0.8648336358427978, + "grad_norm": 0.7734375, + "learning_rate": 0.00010064060856054276, + "loss": 0.8153, + "step": 33681 + }, + { + "epoch": 0.8648593130387197, + "grad_norm": 0.8125, + "learning_rate": 0.0001006361444498201, + "loss": 0.8625, + "step": 33682 + }, + { + "epoch": 0.8648849902346414, + "grad_norm": 0.7265625, + "learning_rate": 0.00010063168033782963, + "loss": 0.7103, + "step": 33683 + }, + { + "epoch": 0.8649106674305632, + "grad_norm": 0.71484375, + "learning_rate": 0.00010062721622458028, + "loss": 0.7844, + "step": 33684 + }, + { + "epoch": 0.864936344626485, + "grad_norm": 0.92578125, + "learning_rate": 0.00010062275211008096, + "loss": 0.8272, + "step": 33685 + }, + { + "epoch": 0.8649620218224069, + "grad_norm": 0.81640625, + "learning_rate": 0.00010061828799434057, + "loss": 0.8332, + "step": 33686 + }, + { + "epoch": 0.8649876990183287, + "grad_norm": 1.2421875, + "learning_rate": 0.00010061382387736797, + "loss": 0.9293, + "step": 33687 + }, + { + "epoch": 0.8650133762142506, + "grad_norm": 0.81640625, + "learning_rate": 0.00010060935975917208, + "loss": 0.6887, + "step": 33688 + }, + { + "epoch": 0.8650390534101723, + "grad_norm": 0.77734375, + "learning_rate": 0.00010060489563976177, + "loss": 0.8267, + "step": 33689 + }, + { + "epoch": 0.8650647306060941, + "grad_norm": 0.765625, + "learning_rate": 0.00010060043151914598, + "loss": 0.7603, + "step": 33690 + }, + { + "epoch": 0.865090407802016, + "grad_norm": 0.75390625, + "learning_rate": 0.00010059596739733357, + "loss": 0.7835, + "step": 33691 + }, + { + "epoch": 0.8651160849979378, + "grad_norm": 0.7734375, + "learning_rate": 0.00010059150327433345, + "loss": 0.6776, + "step": 33692 + }, + { + "epoch": 0.8651417621938596, + "grad_norm": 0.71875, + "learning_rate": 0.00010058703915015454, + "loss": 0.7247, + "step": 33693 + }, + { + "epoch": 0.8651674393897815, + "grad_norm": 0.73828125, + "learning_rate": 0.00010058257502480568, + "loss": 0.7481, + "step": 33694 + }, + { + "epoch": 0.8651931165857033, + "grad_norm": 0.80078125, + "learning_rate": 0.00010057811089829583, + "loss": 0.7945, + "step": 33695 + }, + { + "epoch": 0.865218793781625, + "grad_norm": 0.734375, + "learning_rate": 0.00010057364677063385, + "loss": 0.9378, + "step": 33696 + }, + { + "epoch": 0.8652444709775469, + "grad_norm": 0.8125, + "learning_rate": 0.00010056918264182861, + "loss": 0.7814, + "step": 33697 + }, + { + "epoch": 0.8652701481734687, + "grad_norm": 0.7734375, + "learning_rate": 0.00010056471851188909, + "loss": 0.8672, + "step": 33698 + }, + { + "epoch": 0.8652958253693905, + "grad_norm": 0.765625, + "learning_rate": 0.00010056025438082413, + "loss": 0.7908, + "step": 33699 + }, + { + "epoch": 0.8653215025653124, + "grad_norm": 0.79296875, + "learning_rate": 0.00010055579024864261, + "loss": 0.7757, + "step": 33700 + }, + { + "epoch": 0.8653471797612342, + "grad_norm": 0.87109375, + "learning_rate": 0.00010055132611535348, + "loss": 0.9153, + "step": 33701 + }, + { + "epoch": 0.865372856957156, + "grad_norm": 0.8203125, + "learning_rate": 0.00010054686198096559, + "loss": 0.8504, + "step": 33702 + }, + { + "epoch": 0.8653985341530778, + "grad_norm": 0.671875, + "learning_rate": 0.00010054239784548782, + "loss": 0.7553, + "step": 33703 + }, + { + "epoch": 0.8654242113489996, + "grad_norm": 0.72265625, + "learning_rate": 0.00010053793370892915, + "loss": 0.777, + "step": 33704 + }, + { + "epoch": 0.8654498885449214, + "grad_norm": 0.7734375, + "learning_rate": 0.00010053346957129841, + "loss": 0.8139, + "step": 33705 + }, + { + "epoch": 0.8654755657408433, + "grad_norm": 0.73828125, + "learning_rate": 0.00010052900543260448, + "loss": 0.846, + "step": 33706 + }, + { + "epoch": 0.8655012429367651, + "grad_norm": 0.83203125, + "learning_rate": 0.00010052454129285635, + "loss": 0.8086, + "step": 33707 + }, + { + "epoch": 0.865526920132687, + "grad_norm": 0.84375, + "learning_rate": 0.00010052007715206282, + "loss": 0.8792, + "step": 33708 + }, + { + "epoch": 0.8655525973286087, + "grad_norm": 0.7734375, + "learning_rate": 0.00010051561301023285, + "loss": 0.8467, + "step": 33709 + }, + { + "epoch": 0.8655782745245305, + "grad_norm": 0.83203125, + "learning_rate": 0.0001005111488673753, + "loss": 0.911, + "step": 33710 + }, + { + "epoch": 0.8656039517204523, + "grad_norm": 0.859375, + "learning_rate": 0.00010050668472349904, + "loss": 0.6956, + "step": 33711 + }, + { + "epoch": 0.8656296289163742, + "grad_norm": 0.75, + "learning_rate": 0.00010050222057861303, + "loss": 0.8004, + "step": 33712 + }, + { + "epoch": 0.865655306112296, + "grad_norm": 0.8515625, + "learning_rate": 0.00010049775643272615, + "loss": 0.8686, + "step": 33713 + }, + { + "epoch": 0.8656809833082179, + "grad_norm": 0.828125, + "learning_rate": 0.00010049329228584727, + "loss": 0.8155, + "step": 33714 + }, + { + "epoch": 0.8657066605041397, + "grad_norm": 0.8515625, + "learning_rate": 0.00010048882813798533, + "loss": 0.8638, + "step": 33715 + }, + { + "epoch": 0.8657323377000614, + "grad_norm": 0.8046875, + "learning_rate": 0.00010048436398914916, + "loss": 0.8278, + "step": 33716 + }, + { + "epoch": 0.8657580148959833, + "grad_norm": 0.74609375, + "learning_rate": 0.00010047989983934771, + "loss": 0.8868, + "step": 33717 + }, + { + "epoch": 0.8657836920919051, + "grad_norm": 0.7890625, + "learning_rate": 0.00010047543568858988, + "loss": 0.7537, + "step": 33718 + }, + { + "epoch": 0.8658093692878269, + "grad_norm": 0.80078125, + "learning_rate": 0.00010047097153688452, + "loss": 0.8158, + "step": 33719 + }, + { + "epoch": 0.8658350464837488, + "grad_norm": 0.78515625, + "learning_rate": 0.00010046650738424059, + "loss": 0.8198, + "step": 33720 + }, + { + "epoch": 0.8658607236796706, + "grad_norm": 0.7578125, + "learning_rate": 0.00010046204323066692, + "loss": 0.7852, + "step": 33721 + }, + { + "epoch": 0.8658864008755924, + "grad_norm": 0.76171875, + "learning_rate": 0.00010045757907617247, + "loss": 0.7148, + "step": 33722 + }, + { + "epoch": 0.8659120780715142, + "grad_norm": 0.74609375, + "learning_rate": 0.0001004531149207661, + "loss": 0.9004, + "step": 33723 + }, + { + "epoch": 0.865937755267436, + "grad_norm": 0.84765625, + "learning_rate": 0.00010044865076445668, + "loss": 0.8062, + "step": 33724 + }, + { + "epoch": 0.8659634324633578, + "grad_norm": 0.765625, + "learning_rate": 0.00010044418660725318, + "loss": 0.7584, + "step": 33725 + }, + { + "epoch": 0.8659891096592797, + "grad_norm": 0.8828125, + "learning_rate": 0.00010043972244916445, + "loss": 0.8554, + "step": 33726 + }, + { + "epoch": 0.8660147868552015, + "grad_norm": 0.7421875, + "learning_rate": 0.00010043525829019939, + "loss": 0.7452, + "step": 33727 + }, + { + "epoch": 0.8660404640511233, + "grad_norm": 0.88671875, + "learning_rate": 0.00010043079413036689, + "loss": 0.8542, + "step": 33728 + }, + { + "epoch": 0.8660661412470451, + "grad_norm": 0.765625, + "learning_rate": 0.00010042632996967584, + "loss": 0.7738, + "step": 33729 + }, + { + "epoch": 0.8660918184429669, + "grad_norm": 0.85546875, + "learning_rate": 0.0001004218658081352, + "loss": 0.8187, + "step": 33730 + }, + { + "epoch": 0.8661174956388887, + "grad_norm": 0.88671875, + "learning_rate": 0.00010041740164575379, + "loss": 0.8095, + "step": 33731 + }, + { + "epoch": 0.8661431728348106, + "grad_norm": 0.734375, + "learning_rate": 0.00010041293748254053, + "loss": 0.7717, + "step": 33732 + }, + { + "epoch": 0.8661688500307324, + "grad_norm": 0.765625, + "learning_rate": 0.00010040847331850436, + "loss": 0.7406, + "step": 33733 + }, + { + "epoch": 0.8661945272266542, + "grad_norm": 0.8515625, + "learning_rate": 0.0001004040091536541, + "loss": 0.8036, + "step": 33734 + }, + { + "epoch": 0.8662202044225761, + "grad_norm": 0.78515625, + "learning_rate": 0.00010039954498799868, + "loss": 0.8447, + "step": 33735 + }, + { + "epoch": 0.8662458816184978, + "grad_norm": 0.7890625, + "learning_rate": 0.00010039508082154702, + "loss": 0.9081, + "step": 33736 + }, + { + "epoch": 0.8662715588144196, + "grad_norm": 0.87109375, + "learning_rate": 0.000100390616654308, + "loss": 0.7166, + "step": 33737 + }, + { + "epoch": 0.8662972360103415, + "grad_norm": 0.8125, + "learning_rate": 0.00010038615248629053, + "loss": 0.6903, + "step": 33738 + }, + { + "epoch": 0.8663229132062633, + "grad_norm": 0.7890625, + "learning_rate": 0.00010038168831750347, + "loss": 0.8951, + "step": 33739 + }, + { + "epoch": 0.8663485904021851, + "grad_norm": 0.78515625, + "learning_rate": 0.00010037722414795576, + "loss": 0.8383, + "step": 33740 + }, + { + "epoch": 0.866374267598107, + "grad_norm": 0.80859375, + "learning_rate": 0.00010037275997765627, + "loss": 0.7594, + "step": 33741 + }, + { + "epoch": 0.8663999447940288, + "grad_norm": 0.80078125, + "learning_rate": 0.00010036829580661392, + "loss": 0.7792, + "step": 33742 + }, + { + "epoch": 0.8664256219899505, + "grad_norm": 0.7734375, + "learning_rate": 0.00010036383163483753, + "loss": 0.8747, + "step": 33743 + }, + { + "epoch": 0.8664512991858724, + "grad_norm": 0.85546875, + "learning_rate": 0.00010035936746233609, + "loss": 0.7299, + "step": 33744 + }, + { + "epoch": 0.8664769763817942, + "grad_norm": 0.78125, + "learning_rate": 0.00010035490328911848, + "loss": 0.8112, + "step": 33745 + }, + { + "epoch": 0.866502653577716, + "grad_norm": 0.7734375, + "learning_rate": 0.00010035043911519357, + "loss": 0.8147, + "step": 33746 + }, + { + "epoch": 0.8665283307736379, + "grad_norm": 0.859375, + "learning_rate": 0.00010034597494057028, + "loss": 0.9064, + "step": 33747 + }, + { + "epoch": 0.8665540079695597, + "grad_norm": 0.8203125, + "learning_rate": 0.00010034151076525746, + "loss": 0.8624, + "step": 33748 + }, + { + "epoch": 0.8665796851654815, + "grad_norm": 0.80859375, + "learning_rate": 0.00010033704658926406, + "loss": 0.6802, + "step": 33749 + }, + { + "epoch": 0.8666053623614033, + "grad_norm": 0.80078125, + "learning_rate": 0.00010033258241259897, + "loss": 0.8504, + "step": 33750 + }, + { + "epoch": 0.8666310395573251, + "grad_norm": 0.78125, + "learning_rate": 0.00010032811823527104, + "loss": 0.7881, + "step": 33751 + }, + { + "epoch": 0.866656716753247, + "grad_norm": 0.80078125, + "learning_rate": 0.00010032365405728921, + "loss": 0.8177, + "step": 33752 + }, + { + "epoch": 0.8666823939491688, + "grad_norm": 0.90234375, + "learning_rate": 0.00010031918987866238, + "loss": 0.7079, + "step": 33753 + }, + { + "epoch": 0.8667080711450906, + "grad_norm": 0.734375, + "learning_rate": 0.00010031472569939943, + "loss": 0.6867, + "step": 33754 + }, + { + "epoch": 0.8667337483410125, + "grad_norm": 0.70703125, + "learning_rate": 0.00010031026151950926, + "loss": 0.6074, + "step": 33755 + }, + { + "epoch": 0.8667594255369342, + "grad_norm": 0.80859375, + "learning_rate": 0.00010030579733900076, + "loss": 0.7111, + "step": 33756 + }, + { + "epoch": 0.866785102732856, + "grad_norm": 0.8125, + "learning_rate": 0.00010030133315788283, + "loss": 0.9138, + "step": 33757 + }, + { + "epoch": 0.8668107799287779, + "grad_norm": 0.8515625, + "learning_rate": 0.0001002968689761644, + "loss": 0.7685, + "step": 33758 + }, + { + "epoch": 0.8668364571246997, + "grad_norm": 0.80078125, + "learning_rate": 0.00010029240479385431, + "loss": 0.835, + "step": 33759 + }, + { + "epoch": 0.8668621343206215, + "grad_norm": 0.91015625, + "learning_rate": 0.0001002879406109615, + "loss": 0.8112, + "step": 33760 + }, + { + "epoch": 0.8668878115165434, + "grad_norm": 0.9140625, + "learning_rate": 0.00010028347642749483, + "loss": 0.9586, + "step": 33761 + }, + { + "epoch": 0.8669134887124652, + "grad_norm": 0.80078125, + "learning_rate": 0.00010027901224346324, + "loss": 0.8432, + "step": 33762 + }, + { + "epoch": 0.8669391659083869, + "grad_norm": 0.76171875, + "learning_rate": 0.00010027454805887557, + "loss": 0.7927, + "step": 33763 + }, + { + "epoch": 0.8669648431043088, + "grad_norm": 1.09375, + "learning_rate": 0.00010027008387374074, + "loss": 0.6705, + "step": 33764 + }, + { + "epoch": 0.8669905203002306, + "grad_norm": 0.796875, + "learning_rate": 0.00010026561968806771, + "loss": 0.8856, + "step": 33765 + }, + { + "epoch": 0.8670161974961524, + "grad_norm": 0.859375, + "learning_rate": 0.0001002611555018653, + "loss": 0.7255, + "step": 33766 + }, + { + "epoch": 0.8670418746920743, + "grad_norm": 0.76171875, + "learning_rate": 0.00010025669131514243, + "loss": 0.706, + "step": 33767 + }, + { + "epoch": 0.8670675518879961, + "grad_norm": 0.80859375, + "learning_rate": 0.00010025222712790802, + "loss": 0.8543, + "step": 33768 + }, + { + "epoch": 0.8670932290839178, + "grad_norm": 0.8671875, + "learning_rate": 0.00010024776294017091, + "loss": 0.7693, + "step": 33769 + }, + { + "epoch": 0.8671189062798397, + "grad_norm": 0.765625, + "learning_rate": 0.00010024329875194005, + "loss": 0.7379, + "step": 33770 + }, + { + "epoch": 0.8671445834757615, + "grad_norm": 0.83203125, + "learning_rate": 0.00010023883456322431, + "loss": 0.8112, + "step": 33771 + }, + { + "epoch": 0.8671702606716833, + "grad_norm": 0.890625, + "learning_rate": 0.00010023437037403261, + "loss": 0.7935, + "step": 33772 + }, + { + "epoch": 0.8671959378676052, + "grad_norm": 0.8515625, + "learning_rate": 0.00010022990618437383, + "loss": 0.9012, + "step": 33773 + }, + { + "epoch": 0.867221615063527, + "grad_norm": 0.76171875, + "learning_rate": 0.00010022544199425682, + "loss": 0.8236, + "step": 33774 + }, + { + "epoch": 0.8672472922594489, + "grad_norm": 0.80859375, + "learning_rate": 0.00010022097780369057, + "loss": 0.7736, + "step": 33775 + }, + { + "epoch": 0.8672729694553706, + "grad_norm": 0.81640625, + "learning_rate": 0.00010021651361268394, + "loss": 0.8178, + "step": 33776 + }, + { + "epoch": 0.8672986466512924, + "grad_norm": 0.78125, + "learning_rate": 0.00010021204942124578, + "loss": 0.688, + "step": 33777 + }, + { + "epoch": 0.8673243238472143, + "grad_norm": 0.85546875, + "learning_rate": 0.00010020758522938505, + "loss": 0.782, + "step": 33778 + }, + { + "epoch": 0.8673500010431361, + "grad_norm": 0.8984375, + "learning_rate": 0.00010020312103711062, + "loss": 0.7529, + "step": 33779 + }, + { + "epoch": 0.8673756782390579, + "grad_norm": 0.82421875, + "learning_rate": 0.00010019865684443138, + "loss": 0.765, + "step": 33780 + }, + { + "epoch": 0.8674013554349798, + "grad_norm": 0.8125, + "learning_rate": 0.00010019419265135624, + "loss": 0.9677, + "step": 33781 + }, + { + "epoch": 0.8674270326309016, + "grad_norm": 0.79296875, + "learning_rate": 0.00010018972845789408, + "loss": 0.9204, + "step": 33782 + }, + { + "epoch": 0.8674527098268233, + "grad_norm": 0.9453125, + "learning_rate": 0.00010018526426405381, + "loss": 0.735, + "step": 33783 + }, + { + "epoch": 0.8674783870227452, + "grad_norm": 0.75, + "learning_rate": 0.00010018080006984434, + "loss": 0.7939, + "step": 33784 + }, + { + "epoch": 0.867504064218667, + "grad_norm": 0.78515625, + "learning_rate": 0.00010017633587527453, + "loss": 0.9264, + "step": 33785 + }, + { + "epoch": 0.8675297414145888, + "grad_norm": 0.796875, + "learning_rate": 0.00010017187168035333, + "loss": 0.912, + "step": 33786 + }, + { + "epoch": 0.8675554186105107, + "grad_norm": 0.82421875, + "learning_rate": 0.00010016740748508962, + "loss": 0.8498, + "step": 33787 + }, + { + "epoch": 0.8675810958064325, + "grad_norm": 0.8046875, + "learning_rate": 0.00010016294328949223, + "loss": 0.9468, + "step": 33788 + }, + { + "epoch": 0.8676067730023542, + "grad_norm": 0.79296875, + "learning_rate": 0.00010015847909357013, + "loss": 0.8168, + "step": 33789 + }, + { + "epoch": 0.8676324501982761, + "grad_norm": 0.76953125, + "learning_rate": 0.0001001540148973322, + "loss": 0.7546, + "step": 33790 + }, + { + "epoch": 0.8676581273941979, + "grad_norm": 0.78125, + "learning_rate": 0.0001001495507007873, + "loss": 0.8173, + "step": 33791 + }, + { + "epoch": 0.8676838045901197, + "grad_norm": 0.75, + "learning_rate": 0.00010014508650394441, + "loss": 0.7807, + "step": 33792 + }, + { + "epoch": 0.8677094817860416, + "grad_norm": 0.83203125, + "learning_rate": 0.00010014062230681234, + "loss": 0.7302, + "step": 33793 + }, + { + "epoch": 0.8677351589819634, + "grad_norm": 0.83984375, + "learning_rate": 0.00010013615810940005, + "loss": 0.7942, + "step": 33794 + }, + { + "epoch": 0.8677608361778852, + "grad_norm": 0.8828125, + "learning_rate": 0.0001001316939117164, + "loss": 0.8734, + "step": 33795 + }, + { + "epoch": 0.867786513373807, + "grad_norm": 0.796875, + "learning_rate": 0.00010012722971377026, + "loss": 0.8162, + "step": 33796 + }, + { + "epoch": 0.8678121905697288, + "grad_norm": 0.75390625, + "learning_rate": 0.0001001227655155706, + "loss": 0.7821, + "step": 33797 + }, + { + "epoch": 0.8678378677656506, + "grad_norm": 0.90234375, + "learning_rate": 0.00010011830131712627, + "loss": 0.7108, + "step": 33798 + }, + { + "epoch": 0.8678635449615725, + "grad_norm": 0.80859375, + "learning_rate": 0.0001001138371184462, + "loss": 0.7471, + "step": 33799 + }, + { + "epoch": 0.8678892221574943, + "grad_norm": 0.7578125, + "learning_rate": 0.00010010937291953923, + "loss": 0.8084, + "step": 33800 + }, + { + "epoch": 0.8679148993534161, + "grad_norm": 0.7734375, + "learning_rate": 0.0001001049087204143, + "loss": 0.7591, + "step": 33801 + }, + { + "epoch": 0.867940576549338, + "grad_norm": 0.8125, + "learning_rate": 0.0001001004445210803, + "loss": 0.7972, + "step": 33802 + }, + { + "epoch": 0.8679662537452597, + "grad_norm": 0.78125, + "learning_rate": 0.00010009598032154612, + "loss": 0.7672, + "step": 33803 + }, + { + "epoch": 0.8679919309411815, + "grad_norm": 0.6953125, + "learning_rate": 0.00010009151612182064, + "loss": 0.7097, + "step": 33804 + }, + { + "epoch": 0.8680176081371034, + "grad_norm": 0.74609375, + "learning_rate": 0.0001000870519219128, + "loss": 0.7378, + "step": 33805 + }, + { + "epoch": 0.8680432853330252, + "grad_norm": 0.69921875, + "learning_rate": 0.00010008258772183148, + "loss": 0.6747, + "step": 33806 + }, + { + "epoch": 0.868068962528947, + "grad_norm": 0.734375, + "learning_rate": 0.00010007812352158556, + "loss": 0.7246, + "step": 33807 + }, + { + "epoch": 0.8680946397248689, + "grad_norm": 0.75390625, + "learning_rate": 0.00010007365932118395, + "loss": 0.8544, + "step": 33808 + }, + { + "epoch": 0.8681203169207906, + "grad_norm": 0.82421875, + "learning_rate": 0.00010006919512063553, + "loss": 0.9641, + "step": 33809 + }, + { + "epoch": 0.8681459941167124, + "grad_norm": 0.75390625, + "learning_rate": 0.00010006473091994921, + "loss": 0.7709, + "step": 33810 + }, + { + "epoch": 0.8681716713126343, + "grad_norm": 0.81640625, + "learning_rate": 0.00010006026671913391, + "loss": 0.7698, + "step": 33811 + }, + { + "epoch": 0.8681973485085561, + "grad_norm": 0.76171875, + "learning_rate": 0.0001000558025181985, + "loss": 0.8435, + "step": 33812 + }, + { + "epoch": 0.868223025704478, + "grad_norm": 0.8671875, + "learning_rate": 0.00010005133831715188, + "loss": 0.8995, + "step": 33813 + }, + { + "epoch": 0.8682487029003998, + "grad_norm": 0.796875, + "learning_rate": 0.00010004687411600291, + "loss": 0.7179, + "step": 33814 + }, + { + "epoch": 0.8682743800963216, + "grad_norm": 0.796875, + "learning_rate": 0.00010004240991476057, + "loss": 0.8529, + "step": 33815 + }, + { + "epoch": 0.8683000572922434, + "grad_norm": 0.9296875, + "learning_rate": 0.0001000379457134337, + "loss": 0.9026, + "step": 33816 + }, + { + "epoch": 0.8683257344881652, + "grad_norm": 0.73828125, + "learning_rate": 0.00010003348151203118, + "loss": 0.679, + "step": 33817 + }, + { + "epoch": 0.868351411684087, + "grad_norm": 0.78515625, + "learning_rate": 0.00010002901731056197, + "loss": 0.7278, + "step": 33818 + }, + { + "epoch": 0.8683770888800089, + "grad_norm": 0.79296875, + "learning_rate": 0.00010002455310903491, + "loss": 0.8204, + "step": 33819 + }, + { + "epoch": 0.8684027660759307, + "grad_norm": 0.7578125, + "learning_rate": 0.00010002008890745893, + "loss": 0.8067, + "step": 33820 + }, + { + "epoch": 0.8684284432718525, + "grad_norm": 0.7890625, + "learning_rate": 0.00010001562470584292, + "loss": 0.7597, + "step": 33821 + }, + { + "epoch": 0.8684541204677744, + "grad_norm": 0.7734375, + "learning_rate": 0.00010001116050419574, + "loss": 0.7647, + "step": 33822 + }, + { + "epoch": 0.8684797976636961, + "grad_norm": 0.8046875, + "learning_rate": 0.00010000669630252634, + "loss": 0.8634, + "step": 33823 + }, + { + "epoch": 0.8685054748596179, + "grad_norm": 0.8671875, + "learning_rate": 0.00010000223210084361, + "loss": 1.0001, + "step": 33824 + }, + { + "epoch": 0.8685311520555398, + "grad_norm": 0.8515625, + "learning_rate": 9.999776789915641e-05, + "loss": 0.8899, + "step": 33825 + }, + { + "epoch": 0.8685568292514616, + "grad_norm": 0.7421875, + "learning_rate": 9.999330369747365e-05, + "loss": 0.8175, + "step": 33826 + }, + { + "epoch": 0.8685825064473834, + "grad_norm": 0.81640625, + "learning_rate": 9.998883949580427e-05, + "loss": 0.7803, + "step": 33827 + }, + { + "epoch": 0.8686081836433053, + "grad_norm": 0.7578125, + "learning_rate": 9.998437529415713e-05, + "loss": 0.8173, + "step": 33828 + }, + { + "epoch": 0.868633860839227, + "grad_norm": 0.7109375, + "learning_rate": 9.997991109254109e-05, + "loss": 0.8522, + "step": 33829 + }, + { + "epoch": 0.8686595380351488, + "grad_norm": 0.85546875, + "learning_rate": 9.99754468909651e-05, + "loss": 0.9029, + "step": 33830 + }, + { + "epoch": 0.8686852152310707, + "grad_norm": 0.7890625, + "learning_rate": 9.997098268943804e-05, + "loss": 0.7486, + "step": 33831 + }, + { + "epoch": 0.8687108924269925, + "grad_norm": 0.80078125, + "learning_rate": 9.996651848796884e-05, + "loss": 0.837, + "step": 33832 + }, + { + "epoch": 0.8687365696229143, + "grad_norm": 0.80078125, + "learning_rate": 9.99620542865663e-05, + "loss": 0.8335, + "step": 33833 + }, + { + "epoch": 0.8687622468188362, + "grad_norm": 0.7578125, + "learning_rate": 9.995759008523947e-05, + "loss": 0.7603, + "step": 33834 + }, + { + "epoch": 0.868787924014758, + "grad_norm": 0.765625, + "learning_rate": 9.995312588399711e-05, + "loss": 0.7205, + "step": 33835 + }, + { + "epoch": 0.8688136012106797, + "grad_norm": 0.7890625, + "learning_rate": 9.994866168284816e-05, + "loss": 0.8225, + "step": 33836 + }, + { + "epoch": 0.8688392784066016, + "grad_norm": 0.8125, + "learning_rate": 9.994419748180155e-05, + "loss": 0.7681, + "step": 33837 + }, + { + "epoch": 0.8688649556025234, + "grad_norm": 0.78515625, + "learning_rate": 9.993973328086611e-05, + "loss": 0.8311, + "step": 33838 + }, + { + "epoch": 0.8688906327984453, + "grad_norm": 0.76171875, + "learning_rate": 9.993526908005081e-05, + "loss": 0.7696, + "step": 33839 + }, + { + "epoch": 0.8689163099943671, + "grad_norm": 0.81640625, + "learning_rate": 9.993080487936448e-05, + "loss": 0.8626, + "step": 33840 + }, + { + "epoch": 0.8689419871902889, + "grad_norm": 0.8125, + "learning_rate": 9.992634067881608e-05, + "loss": 0.7796, + "step": 33841 + }, + { + "epoch": 0.8689676643862108, + "grad_norm": 0.859375, + "learning_rate": 9.992187647841449e-05, + "loss": 0.8077, + "step": 33842 + }, + { + "epoch": 0.8689933415821325, + "grad_norm": 0.828125, + "learning_rate": 9.991741227816855e-05, + "loss": 0.691, + "step": 33843 + }, + { + "epoch": 0.8690190187780543, + "grad_norm": 0.828125, + "learning_rate": 9.991294807808722e-05, + "loss": 0.9269, + "step": 33844 + }, + { + "epoch": 0.8690446959739762, + "grad_norm": 0.71875, + "learning_rate": 9.990848387817937e-05, + "loss": 0.694, + "step": 33845 + }, + { + "epoch": 0.869070373169898, + "grad_norm": 0.796875, + "learning_rate": 9.99040196784539e-05, + "loss": 0.8903, + "step": 33846 + }, + { + "epoch": 0.8690960503658198, + "grad_norm": 0.8515625, + "learning_rate": 9.989955547891971e-05, + "loss": 0.848, + "step": 33847 + }, + { + "epoch": 0.8691217275617417, + "grad_norm": 0.8125, + "learning_rate": 9.989509127958574e-05, + "loss": 0.7426, + "step": 33848 + }, + { + "epoch": 0.8691474047576634, + "grad_norm": 0.79296875, + "learning_rate": 9.989062708046081e-05, + "loss": 0.8282, + "step": 33849 + }, + { + "epoch": 0.8691730819535852, + "grad_norm": 0.73046875, + "learning_rate": 9.988616288155384e-05, + "loss": 0.8011, + "step": 33850 + }, + { + "epoch": 0.8691987591495071, + "grad_norm": 0.77734375, + "learning_rate": 9.988169868287375e-05, + "loss": 0.8131, + "step": 33851 + }, + { + "epoch": 0.8692244363454289, + "grad_norm": 0.8515625, + "learning_rate": 9.987723448442941e-05, + "loss": 0.7411, + "step": 33852 + }, + { + "epoch": 0.8692501135413507, + "grad_norm": 0.859375, + "learning_rate": 9.987277028622976e-05, + "loss": 0.7912, + "step": 33853 + }, + { + "epoch": 0.8692757907372726, + "grad_norm": 0.81640625, + "learning_rate": 9.986830608828364e-05, + "loss": 0.897, + "step": 33854 + }, + { + "epoch": 0.8693014679331944, + "grad_norm": 0.78125, + "learning_rate": 9.986384189059999e-05, + "loss": 0.663, + "step": 33855 + }, + { + "epoch": 0.8693271451291161, + "grad_norm": 0.75390625, + "learning_rate": 9.98593776931877e-05, + "loss": 0.8348, + "step": 33856 + }, + { + "epoch": 0.869352822325038, + "grad_norm": 0.828125, + "learning_rate": 9.98549134960556e-05, + "loss": 0.8212, + "step": 33857 + }, + { + "epoch": 0.8693784995209598, + "grad_norm": 0.80859375, + "learning_rate": 9.985044929921272e-05, + "loss": 0.7368, + "step": 33858 + }, + { + "epoch": 0.8694041767168816, + "grad_norm": 0.76171875, + "learning_rate": 9.984598510266783e-05, + "loss": 0.8394, + "step": 33859 + }, + { + "epoch": 0.8694298539128035, + "grad_norm": 0.7421875, + "learning_rate": 9.98415209064299e-05, + "loss": 0.8213, + "step": 33860 + }, + { + "epoch": 0.8694555311087253, + "grad_norm": 0.7578125, + "learning_rate": 9.983705671050783e-05, + "loss": 0.818, + "step": 33861 + }, + { + "epoch": 0.8694812083046471, + "grad_norm": 0.8125, + "learning_rate": 9.983259251491042e-05, + "loss": 0.917, + "step": 33862 + }, + { + "epoch": 0.8695068855005689, + "grad_norm": 0.80078125, + "learning_rate": 9.982812831964669e-05, + "loss": 0.7449, + "step": 33863 + }, + { + "epoch": 0.8695325626964907, + "grad_norm": 0.83984375, + "learning_rate": 9.982366412472547e-05, + "loss": 0.7509, + "step": 33864 + }, + { + "epoch": 0.8695582398924125, + "grad_norm": 0.81640625, + "learning_rate": 9.981919993015567e-05, + "loss": 0.8608, + "step": 33865 + }, + { + "epoch": 0.8695839170883344, + "grad_norm": 0.96484375, + "learning_rate": 9.981473573594618e-05, + "loss": 0.8049, + "step": 33866 + }, + { + "epoch": 0.8696095942842562, + "grad_norm": 0.88671875, + "learning_rate": 9.981027154210595e-05, + "loss": 0.9434, + "step": 33867 + }, + { + "epoch": 0.869635271480178, + "grad_norm": 0.7890625, + "learning_rate": 9.98058073486438e-05, + "loss": 0.8305, + "step": 33868 + }, + { + "epoch": 0.8696609486760998, + "grad_norm": 0.83984375, + "learning_rate": 9.980134315556863e-05, + "loss": 0.818, + "step": 33869 + }, + { + "epoch": 0.8696866258720216, + "grad_norm": 0.79296875, + "learning_rate": 9.97968789628894e-05, + "loss": 0.7997, + "step": 33870 + }, + { + "epoch": 0.8697123030679434, + "grad_norm": 0.796875, + "learning_rate": 9.979241477061496e-05, + "loss": 0.8209, + "step": 33871 + }, + { + "epoch": 0.8697379802638653, + "grad_norm": 0.79296875, + "learning_rate": 9.978795057875426e-05, + "loss": 0.6936, + "step": 33872 + }, + { + "epoch": 0.8697636574597871, + "grad_norm": 0.796875, + "learning_rate": 9.978348638731607e-05, + "loss": 0.7329, + "step": 33873 + }, + { + "epoch": 0.869789334655709, + "grad_norm": 0.8515625, + "learning_rate": 9.977902219630944e-05, + "loss": 0.7569, + "step": 33874 + }, + { + "epoch": 0.8698150118516308, + "grad_norm": 0.75390625, + "learning_rate": 9.977455800574319e-05, + "loss": 0.7119, + "step": 33875 + }, + { + "epoch": 0.8698406890475525, + "grad_norm": 0.921875, + "learning_rate": 9.97700938156262e-05, + "loss": 0.7527, + "step": 33876 + }, + { + "epoch": 0.8698663662434744, + "grad_norm": 0.796875, + "learning_rate": 9.976562962596744e-05, + "loss": 0.8445, + "step": 33877 + }, + { + "epoch": 0.8698920434393962, + "grad_norm": 0.7890625, + "learning_rate": 9.976116543677571e-05, + "loss": 0.8255, + "step": 33878 + }, + { + "epoch": 0.869917720635318, + "grad_norm": 0.81640625, + "learning_rate": 9.975670124805997e-05, + "loss": 0.9914, + "step": 33879 + }, + { + "epoch": 0.8699433978312399, + "grad_norm": 0.74609375, + "learning_rate": 9.975223705982909e-05, + "loss": 0.7788, + "step": 33880 + }, + { + "epoch": 0.8699690750271617, + "grad_norm": 0.81640625, + "learning_rate": 9.9747772872092e-05, + "loss": 0.8252, + "step": 33881 + }, + { + "epoch": 0.8699947522230834, + "grad_norm": 0.75390625, + "learning_rate": 9.974330868485759e-05, + "loss": 0.7994, + "step": 33882 + }, + { + "epoch": 0.8700204294190053, + "grad_norm": 0.83984375, + "learning_rate": 9.973884449813471e-05, + "loss": 0.7781, + "step": 33883 + }, + { + "epoch": 0.8700461066149271, + "grad_norm": 0.8046875, + "learning_rate": 9.97343803119323e-05, + "loss": 0.851, + "step": 33884 + }, + { + "epoch": 0.8700717838108489, + "grad_norm": 0.72265625, + "learning_rate": 9.972991612625924e-05, + "loss": 0.7481, + "step": 33885 + }, + { + "epoch": 0.8700974610067708, + "grad_norm": 0.79296875, + "learning_rate": 9.972545194112444e-05, + "loss": 0.8697, + "step": 33886 + }, + { + "epoch": 0.8701231382026926, + "grad_norm": 0.859375, + "learning_rate": 9.972098775653682e-05, + "loss": 0.7987, + "step": 33887 + }, + { + "epoch": 0.8701488153986144, + "grad_norm": 0.78515625, + "learning_rate": 9.971652357250521e-05, + "loss": 0.7739, + "step": 33888 + }, + { + "epoch": 0.8701744925945362, + "grad_norm": 0.80859375, + "learning_rate": 9.971205938903855e-05, + "loss": 0.6997, + "step": 33889 + }, + { + "epoch": 0.870200169790458, + "grad_norm": 0.86328125, + "learning_rate": 9.970759520614571e-05, + "loss": 0.9154, + "step": 33890 + }, + { + "epoch": 0.8702258469863798, + "grad_norm": 0.72265625, + "learning_rate": 9.970313102383562e-05, + "loss": 0.6962, + "step": 33891 + }, + { + "epoch": 0.8702515241823017, + "grad_norm": 0.8046875, + "learning_rate": 9.969866684211716e-05, + "loss": 0.7766, + "step": 33892 + }, + { + "epoch": 0.8702772013782235, + "grad_norm": 0.7734375, + "learning_rate": 9.969420266099925e-05, + "loss": 0.8263, + "step": 33893 + }, + { + "epoch": 0.8703028785741453, + "grad_norm": 0.83203125, + "learning_rate": 9.968973848049078e-05, + "loss": 0.8562, + "step": 33894 + }, + { + "epoch": 0.8703285557700672, + "grad_norm": 0.8515625, + "learning_rate": 9.968527430060059e-05, + "loss": 0.8436, + "step": 33895 + }, + { + "epoch": 0.8703542329659889, + "grad_norm": 0.734375, + "learning_rate": 9.968081012133763e-05, + "loss": 0.8107, + "step": 33896 + }, + { + "epoch": 0.8703799101619107, + "grad_norm": 0.78125, + "learning_rate": 9.967634594271078e-05, + "loss": 0.7247, + "step": 33897 + }, + { + "epoch": 0.8704055873578326, + "grad_norm": 0.84765625, + "learning_rate": 9.9671881764729e-05, + "loss": 0.9047, + "step": 33898 + }, + { + "epoch": 0.8704312645537544, + "grad_norm": 0.75, + "learning_rate": 9.966741758740105e-05, + "loss": 0.6583, + "step": 33899 + }, + { + "epoch": 0.8704569417496762, + "grad_norm": 0.77734375, + "learning_rate": 9.966295341073596e-05, + "loss": 0.8093, + "step": 33900 + }, + { + "epoch": 0.8704826189455981, + "grad_norm": 0.7734375, + "learning_rate": 9.965848923474258e-05, + "loss": 0.8615, + "step": 33901 + }, + { + "epoch": 0.8705082961415198, + "grad_norm": 0.8671875, + "learning_rate": 9.965402505942973e-05, + "loss": 0.972, + "step": 33902 + }, + { + "epoch": 0.8705339733374416, + "grad_norm": 0.765625, + "learning_rate": 9.964956088480646e-05, + "loss": 0.8374, + "step": 33903 + }, + { + "epoch": 0.8705596505333635, + "grad_norm": 0.76953125, + "learning_rate": 9.964509671088153e-05, + "loss": 0.8092, + "step": 33904 + }, + { + "epoch": 0.8705853277292853, + "grad_norm": 0.7890625, + "learning_rate": 9.964063253766392e-05, + "loss": 0.7475, + "step": 33905 + }, + { + "epoch": 0.8706110049252072, + "grad_norm": 0.765625, + "learning_rate": 9.963616836516246e-05, + "loss": 0.9008, + "step": 33906 + }, + { + "epoch": 0.870636682121129, + "grad_norm": 0.76953125, + "learning_rate": 9.963170419338612e-05, + "loss": 0.8799, + "step": 33907 + }, + { + "epoch": 0.8706623593170508, + "grad_norm": 0.796875, + "learning_rate": 9.962724002234377e-05, + "loss": 0.783, + "step": 33908 + }, + { + "epoch": 0.8706880365129726, + "grad_norm": 0.7578125, + "learning_rate": 9.962277585204426e-05, + "loss": 0.8408, + "step": 33909 + }, + { + "epoch": 0.8707137137088944, + "grad_norm": 0.80859375, + "learning_rate": 9.961831168249655e-05, + "loss": 0.8837, + "step": 33910 + }, + { + "epoch": 0.8707393909048162, + "grad_norm": 0.8203125, + "learning_rate": 9.961384751370948e-05, + "loss": 0.8216, + "step": 33911 + }, + { + "epoch": 0.8707650681007381, + "grad_norm": 0.76953125, + "learning_rate": 9.960938334569202e-05, + "loss": 0.7251, + "step": 33912 + }, + { + "epoch": 0.8707907452966599, + "grad_norm": 0.78125, + "learning_rate": 9.960491917845296e-05, + "loss": 0.7586, + "step": 33913 + }, + { + "epoch": 0.8708164224925817, + "grad_norm": 0.81640625, + "learning_rate": 9.960045501200134e-05, + "loss": 0.8009, + "step": 33914 + }, + { + "epoch": 0.8708420996885036, + "grad_norm": 0.984375, + "learning_rate": 9.959599084634593e-05, + "loss": 0.8014, + "step": 33915 + }, + { + "epoch": 0.8708677768844253, + "grad_norm": 0.81640625, + "learning_rate": 9.959152668149567e-05, + "loss": 0.7514, + "step": 33916 + }, + { + "epoch": 0.8708934540803471, + "grad_norm": 0.78515625, + "learning_rate": 9.958706251745949e-05, + "loss": 0.7558, + "step": 33917 + }, + { + "epoch": 0.870919131276269, + "grad_norm": 0.82421875, + "learning_rate": 9.95825983542462e-05, + "loss": 0.9098, + "step": 33918 + }, + { + "epoch": 0.8709448084721908, + "grad_norm": 0.90234375, + "learning_rate": 9.957813419186483e-05, + "loss": 0.884, + "step": 33919 + }, + { + "epoch": 0.8709704856681126, + "grad_norm": 0.76953125, + "learning_rate": 9.957367003032417e-05, + "loss": 0.8379, + "step": 33920 + }, + { + "epoch": 0.8709961628640345, + "grad_norm": 0.73828125, + "learning_rate": 9.956920586963312e-05, + "loss": 0.7773, + "step": 33921 + }, + { + "epoch": 0.8710218400599562, + "grad_norm": 0.796875, + "learning_rate": 9.956474170980066e-05, + "loss": 0.8437, + "step": 33922 + }, + { + "epoch": 0.871047517255878, + "grad_norm": 0.8359375, + "learning_rate": 9.956027755083556e-05, + "loss": 0.7769, + "step": 33923 + }, + { + "epoch": 0.8710731944517999, + "grad_norm": 0.74609375, + "learning_rate": 9.955581339274684e-05, + "loss": 0.745, + "step": 33924 + }, + { + "epoch": 0.8710988716477217, + "grad_norm": 0.7578125, + "learning_rate": 9.95513492355433e-05, + "loss": 0.8886, + "step": 33925 + }, + { + "epoch": 0.8711245488436435, + "grad_norm": 0.78125, + "learning_rate": 9.954688507923391e-05, + "loss": 0.7842, + "step": 33926 + }, + { + "epoch": 0.8711502260395654, + "grad_norm": 0.8125, + "learning_rate": 9.954242092382758e-05, + "loss": 0.8284, + "step": 33927 + }, + { + "epoch": 0.8711759032354872, + "grad_norm": 0.84765625, + "learning_rate": 9.953795676933309e-05, + "loss": 0.8068, + "step": 33928 + }, + { + "epoch": 0.8712015804314089, + "grad_norm": 0.75390625, + "learning_rate": 9.953349261575943e-05, + "loss": 0.7672, + "step": 33929 + }, + { + "epoch": 0.8712272576273308, + "grad_norm": 0.74609375, + "learning_rate": 9.952902846311548e-05, + "loss": 0.8168, + "step": 33930 + }, + { + "epoch": 0.8712529348232526, + "grad_norm": 0.7265625, + "learning_rate": 9.952456431141015e-05, + "loss": 0.7648, + "step": 33931 + }, + { + "epoch": 0.8712786120191744, + "grad_norm": 0.79296875, + "learning_rate": 9.952010016065228e-05, + "loss": 0.8716, + "step": 33932 + }, + { + "epoch": 0.8713042892150963, + "grad_norm": 0.86328125, + "learning_rate": 9.951563601085086e-05, + "loss": 0.7107, + "step": 33933 + }, + { + "epoch": 0.8713299664110181, + "grad_norm": 0.7890625, + "learning_rate": 9.951117186201472e-05, + "loss": 0.8387, + "step": 33934 + }, + { + "epoch": 0.87135564360694, + "grad_norm": 0.8828125, + "learning_rate": 9.950670771415275e-05, + "loss": 0.7807, + "step": 33935 + }, + { + "epoch": 0.8713813208028617, + "grad_norm": 0.84375, + "learning_rate": 9.950224356727388e-05, + "loss": 0.8453, + "step": 33936 + }, + { + "epoch": 0.8714069979987835, + "grad_norm": 0.72265625, + "learning_rate": 9.949777942138697e-05, + "loss": 0.8393, + "step": 33937 + }, + { + "epoch": 0.8714326751947054, + "grad_norm": 0.78515625, + "learning_rate": 9.949331527650098e-05, + "loss": 0.7919, + "step": 33938 + }, + { + "epoch": 0.8714583523906272, + "grad_norm": 0.78515625, + "learning_rate": 9.948885113262474e-05, + "loss": 0.8369, + "step": 33939 + }, + { + "epoch": 0.871484029586549, + "grad_norm": 0.78125, + "learning_rate": 9.948438698976719e-05, + "loss": 0.7536, + "step": 33940 + }, + { + "epoch": 0.8715097067824709, + "grad_norm": 0.765625, + "learning_rate": 9.947992284793723e-05, + "loss": 0.7797, + "step": 33941 + }, + { + "epoch": 0.8715353839783926, + "grad_norm": 0.82421875, + "learning_rate": 9.947545870714366e-05, + "loss": 0.9116, + "step": 33942 + }, + { + "epoch": 0.8715610611743144, + "grad_norm": 0.81640625, + "learning_rate": 9.947099456739553e-05, + "loss": 0.8074, + "step": 33943 + }, + { + "epoch": 0.8715867383702363, + "grad_norm": 0.87890625, + "learning_rate": 9.946653042870161e-05, + "loss": 0.8638, + "step": 33944 + }, + { + "epoch": 0.8716124155661581, + "grad_norm": 0.79296875, + "learning_rate": 9.946206629107088e-05, + "loss": 0.8264, + "step": 33945 + }, + { + "epoch": 0.8716380927620799, + "grad_norm": 0.76953125, + "learning_rate": 9.945760215451216e-05, + "loss": 0.7279, + "step": 33946 + }, + { + "epoch": 0.8716637699580018, + "grad_norm": 0.80859375, + "learning_rate": 9.945313801903443e-05, + "loss": 0.8413, + "step": 33947 + }, + { + "epoch": 0.8716894471539236, + "grad_norm": 0.77734375, + "learning_rate": 9.944867388464657e-05, + "loss": 0.8007, + "step": 33948 + }, + { + "epoch": 0.8717151243498453, + "grad_norm": 0.8046875, + "learning_rate": 9.944420975135742e-05, + "loss": 0.7267, + "step": 33949 + }, + { + "epoch": 0.8717408015457672, + "grad_norm": 0.7265625, + "learning_rate": 9.943974561917589e-05, + "loss": 0.6926, + "step": 33950 + }, + { + "epoch": 0.871766478741689, + "grad_norm": 0.80078125, + "learning_rate": 9.943528148811091e-05, + "loss": 0.7843, + "step": 33951 + }, + { + "epoch": 0.8717921559376108, + "grad_norm": 0.859375, + "learning_rate": 9.94308173581714e-05, + "loss": 0.8803, + "step": 33952 + }, + { + "epoch": 0.8718178331335327, + "grad_norm": 0.7890625, + "learning_rate": 9.942635322936619e-05, + "loss": 0.8855, + "step": 33953 + }, + { + "epoch": 0.8718435103294545, + "grad_norm": 0.80078125, + "learning_rate": 9.942188910170419e-05, + "loss": 0.6861, + "step": 33954 + }, + { + "epoch": 0.8718691875253763, + "grad_norm": 0.78125, + "learning_rate": 9.941742497519433e-05, + "loss": 0.7164, + "step": 33955 + }, + { + "epoch": 0.8718948647212981, + "grad_norm": 0.7421875, + "learning_rate": 9.941296084984546e-05, + "loss": 0.6083, + "step": 33956 + }, + { + "epoch": 0.8719205419172199, + "grad_norm": 0.76171875, + "learning_rate": 9.940849672566657e-05, + "loss": 0.849, + "step": 33957 + }, + { + "epoch": 0.8719462191131417, + "grad_norm": 0.828125, + "learning_rate": 9.940403260266642e-05, + "loss": 0.8539, + "step": 33958 + }, + { + "epoch": 0.8719718963090636, + "grad_norm": 0.76953125, + "learning_rate": 9.939956848085404e-05, + "loss": 0.732, + "step": 33959 + }, + { + "epoch": 0.8719975735049854, + "grad_norm": 0.83203125, + "learning_rate": 9.939510436023826e-05, + "loss": 0.7268, + "step": 33960 + }, + { + "epoch": 0.8720232507009072, + "grad_norm": 0.75, + "learning_rate": 9.939064024082793e-05, + "loss": 0.7289, + "step": 33961 + }, + { + "epoch": 0.872048927896829, + "grad_norm": 0.75390625, + "learning_rate": 9.938617612263206e-05, + "loss": 0.7051, + "step": 33962 + }, + { + "epoch": 0.8720746050927508, + "grad_norm": 0.73828125, + "learning_rate": 9.938171200565946e-05, + "loss": 0.8264, + "step": 33963 + }, + { + "epoch": 0.8721002822886726, + "grad_norm": 0.734375, + "learning_rate": 9.937724788991905e-05, + "loss": 0.7569, + "step": 33964 + }, + { + "epoch": 0.8721259594845945, + "grad_norm": 0.765625, + "learning_rate": 9.937278377541971e-05, + "loss": 0.8359, + "step": 33965 + }, + { + "epoch": 0.8721516366805163, + "grad_norm": 0.76953125, + "learning_rate": 9.936831966217038e-05, + "loss": 0.6583, + "step": 33966 + }, + { + "epoch": 0.8721773138764382, + "grad_norm": 0.7421875, + "learning_rate": 9.936385555017995e-05, + "loss": 0.8218, + "step": 33967 + }, + { + "epoch": 0.87220299107236, + "grad_norm": 0.7578125, + "learning_rate": 9.935939143945726e-05, + "loss": 0.9379, + "step": 33968 + }, + { + "epoch": 0.8722286682682817, + "grad_norm": 0.94921875, + "learning_rate": 9.935492733001127e-05, + "loss": 0.864, + "step": 33969 + }, + { + "epoch": 0.8722543454642036, + "grad_norm": 0.80078125, + "learning_rate": 9.935046322185083e-05, + "loss": 0.8021, + "step": 33970 + }, + { + "epoch": 0.8722800226601254, + "grad_norm": 0.80859375, + "learning_rate": 9.934599911498487e-05, + "loss": 0.8271, + "step": 33971 + }, + { + "epoch": 0.8723056998560472, + "grad_norm": 0.75, + "learning_rate": 9.934153500942225e-05, + "loss": 0.7345, + "step": 33972 + }, + { + "epoch": 0.8723313770519691, + "grad_norm": 0.94140625, + "learning_rate": 9.933707090517195e-05, + "loss": 0.7958, + "step": 33973 + }, + { + "epoch": 0.8723570542478909, + "grad_norm": 0.91796875, + "learning_rate": 9.933260680224276e-05, + "loss": 0.9048, + "step": 33974 + }, + { + "epoch": 0.8723827314438127, + "grad_norm": 0.77734375, + "learning_rate": 9.932814270064363e-05, + "loss": 0.8337, + "step": 33975 + }, + { + "epoch": 0.8724084086397345, + "grad_norm": 0.7890625, + "learning_rate": 9.932367860038346e-05, + "loss": 0.6949, + "step": 33976 + }, + { + "epoch": 0.8724340858356563, + "grad_norm": 0.80078125, + "learning_rate": 9.931921450147113e-05, + "loss": 0.6667, + "step": 33977 + }, + { + "epoch": 0.8724597630315781, + "grad_norm": 0.734375, + "learning_rate": 9.931475040391557e-05, + "loss": 0.7417, + "step": 33978 + }, + { + "epoch": 0.8724854402275, + "grad_norm": 0.74609375, + "learning_rate": 9.931028630772562e-05, + "loss": 0.8293, + "step": 33979 + }, + { + "epoch": 0.8725111174234218, + "grad_norm": 0.84375, + "learning_rate": 9.930582221291021e-05, + "loss": 0.762, + "step": 33980 + }, + { + "epoch": 0.8725367946193436, + "grad_norm": 0.78125, + "learning_rate": 9.930135811947825e-05, + "loss": 0.7521, + "step": 33981 + }, + { + "epoch": 0.8725624718152654, + "grad_norm": 0.77734375, + "learning_rate": 9.929689402743861e-05, + "loss": 0.7087, + "step": 33982 + }, + { + "epoch": 0.8725881490111872, + "grad_norm": 0.765625, + "learning_rate": 9.929242993680022e-05, + "loss": 0.8726, + "step": 33983 + }, + { + "epoch": 0.872613826207109, + "grad_norm": 0.796875, + "learning_rate": 9.928796584757191e-05, + "loss": 0.769, + "step": 33984 + }, + { + "epoch": 0.8726395034030309, + "grad_norm": 0.78125, + "learning_rate": 9.928350175976264e-05, + "loss": 0.8426, + "step": 33985 + }, + { + "epoch": 0.8726651805989527, + "grad_norm": 0.8515625, + "learning_rate": 9.927903767338131e-05, + "loss": 0.7839, + "step": 33986 + }, + { + "epoch": 0.8726908577948745, + "grad_norm": 0.78515625, + "learning_rate": 9.927457358843674e-05, + "loss": 0.7055, + "step": 33987 + }, + { + "epoch": 0.8727165349907964, + "grad_norm": 0.828125, + "learning_rate": 9.927010950493793e-05, + "loss": 0.7774, + "step": 33988 + }, + { + "epoch": 0.8727422121867181, + "grad_norm": 0.75, + "learning_rate": 9.92656454228937e-05, + "loss": 0.7367, + "step": 33989 + }, + { + "epoch": 0.8727678893826399, + "grad_norm": 0.76171875, + "learning_rate": 9.926118134231298e-05, + "loss": 0.7824, + "step": 33990 + }, + { + "epoch": 0.8727935665785618, + "grad_norm": 0.84375, + "learning_rate": 9.925671726320464e-05, + "loss": 0.7697, + "step": 33991 + }, + { + "epoch": 0.8728192437744836, + "grad_norm": 0.8203125, + "learning_rate": 9.925225318557764e-05, + "loss": 0.6766, + "step": 33992 + }, + { + "epoch": 0.8728449209704054, + "grad_norm": 0.7421875, + "learning_rate": 9.92477891094408e-05, + "loss": 0.7869, + "step": 33993 + }, + { + "epoch": 0.8728705981663273, + "grad_norm": 0.7578125, + "learning_rate": 9.924332503480304e-05, + "loss": 0.7402, + "step": 33994 + }, + { + "epoch": 0.8728962753622491, + "grad_norm": 0.80859375, + "learning_rate": 9.923886096167328e-05, + "loss": 0.772, + "step": 33995 + }, + { + "epoch": 0.8729219525581708, + "grad_norm": 0.76953125, + "learning_rate": 9.923439689006037e-05, + "loss": 0.7367, + "step": 33996 + }, + { + "epoch": 0.8729476297540927, + "grad_norm": 0.7578125, + "learning_rate": 9.92299328199733e-05, + "loss": 0.7693, + "step": 33997 + }, + { + "epoch": 0.8729733069500145, + "grad_norm": 0.83984375, + "learning_rate": 9.922546875142083e-05, + "loss": 0.9476, + "step": 33998 + }, + { + "epoch": 0.8729989841459364, + "grad_norm": 0.7421875, + "learning_rate": 9.9221004684412e-05, + "loss": 0.8536, + "step": 33999 + }, + { + "epoch": 0.8730246613418582, + "grad_norm": 0.76171875, + "learning_rate": 9.921654061895563e-05, + "loss": 0.873, + "step": 34000 + }, + { + "epoch": 0.8730246613418582, + "eval_loss": 0.7939178943634033, + "eval_runtime": 351.5162, + "eval_samples_per_second": 28.448, + "eval_steps_per_second": 0.89, + "step": 34000 + }, + { + "epoch": 0.9003331616170857, + "grad_norm": 0.9375, + "learning_rate": 9.921207655506057e-05, + "loss": 0.8261, + "step": 34001 + }, + { + "epoch": 0.90035964122538, + "grad_norm": 0.77734375, + "learning_rate": 9.37613815364211e-05, + "loss": 0.7983, + "step": 34002 + }, + { + "epoch": 0.9003861208336743, + "grad_norm": 0.75390625, + "learning_rate": 9.375678677003054e-05, + "loss": 0.7575, + "step": 34003 + }, + { + "epoch": 0.9004126004419687, + "grad_norm": 0.83203125, + "learning_rate": 9.375219201687211e-05, + "loss": 0.8773, + "step": 34004 + }, + { + "epoch": 0.9004390800502631, + "grad_norm": 0.78125, + "learning_rate": 9.374759727695556e-05, + "loss": 0.7975, + "step": 34005 + }, + { + "epoch": 0.9004655596585575, + "grad_norm": 0.82421875, + "learning_rate": 9.374300255029065e-05, + "loss": 0.9247, + "step": 34006 + }, + { + "epoch": 0.9004920392668518, + "grad_norm": 0.79296875, + "learning_rate": 9.373840783688708e-05, + "loss": 0.8328, + "step": 34007 + }, + { + "epoch": 0.9005185188751462, + "grad_norm": 0.78125, + "learning_rate": 9.373381313675459e-05, + "loss": 0.7105, + "step": 34008 + }, + { + "epoch": 0.9005449984834406, + "grad_norm": 0.84765625, + "learning_rate": 9.372921844990291e-05, + "loss": 0.8798, + "step": 34009 + }, + { + "epoch": 0.900571478091735, + "grad_norm": 0.8359375, + "learning_rate": 9.372462377634182e-05, + "loss": 0.8332, + "step": 34010 + }, + { + "epoch": 0.9005979577000294, + "grad_norm": 0.76953125, + "learning_rate": 9.372002911608101e-05, + "loss": 0.9123, + "step": 34011 + }, + { + "epoch": 0.9006244373083238, + "grad_norm": 0.80078125, + "learning_rate": 9.371543446913026e-05, + "loss": 0.7355, + "step": 34012 + }, + { + "epoch": 0.9006509169166181, + "grad_norm": 0.7265625, + "learning_rate": 9.371083983549928e-05, + "loss": 0.7831, + "step": 34013 + }, + { + "epoch": 0.9006773965249125, + "grad_norm": 0.83203125, + "learning_rate": 9.370624521519776e-05, + "loss": 0.7978, + "step": 34014 + }, + { + "epoch": 0.9007038761332069, + "grad_norm": 0.80859375, + "learning_rate": 9.370165060823556e-05, + "loss": 0.9161, + "step": 34015 + }, + { + "epoch": 0.9007303557415013, + "grad_norm": 0.73046875, + "learning_rate": 9.369705601462234e-05, + "loss": 0.6855, + "step": 34016 + }, + { + "epoch": 0.9007568353497957, + "grad_norm": 0.81640625, + "learning_rate": 9.369246143436784e-05, + "loss": 0.7878, + "step": 34017 + }, + { + "epoch": 0.90078331495809, + "grad_norm": 0.82421875, + "learning_rate": 9.368786686748182e-05, + "loss": 0.8564, + "step": 34018 + }, + { + "epoch": 0.9008097945663843, + "grad_norm": 0.8359375, + "learning_rate": 9.368327231397396e-05, + "loss": 0.826, + "step": 34019 + }, + { + "epoch": 0.9008362741746787, + "grad_norm": 0.73828125, + "learning_rate": 9.367867777385408e-05, + "loss": 0.8193, + "step": 34020 + }, + { + "epoch": 0.9008627537829731, + "grad_norm": 0.8515625, + "learning_rate": 9.36740832471319e-05, + "loss": 0.839, + "step": 34021 + }, + { + "epoch": 0.9008892333912675, + "grad_norm": 0.76953125, + "learning_rate": 9.366948873381713e-05, + "loss": 0.8047, + "step": 34022 + }, + { + "epoch": 0.9009157129995619, + "grad_norm": 0.765625, + "learning_rate": 9.366489423391951e-05, + "loss": 0.8113, + "step": 34023 + }, + { + "epoch": 0.9009421926078562, + "grad_norm": 0.8125, + "learning_rate": 9.366029974744874e-05, + "loss": 0.7647, + "step": 34024 + }, + { + "epoch": 0.9009686722161506, + "grad_norm": 0.765625, + "learning_rate": 9.365570527441465e-05, + "loss": 0.9213, + "step": 34025 + }, + { + "epoch": 0.900995151824445, + "grad_norm": 0.80078125, + "learning_rate": 9.365111081482692e-05, + "loss": 0.9405, + "step": 34026 + }, + { + "epoch": 0.9010216314327394, + "grad_norm": 0.77734375, + "learning_rate": 9.364651636869531e-05, + "loss": 0.8492, + "step": 34027 + }, + { + "epoch": 0.9010481110410338, + "grad_norm": 0.73828125, + "learning_rate": 9.364192193602955e-05, + "loss": 0.979, + "step": 34028 + }, + { + "epoch": 0.9010745906493282, + "grad_norm": 0.80859375, + "learning_rate": 9.363732751683934e-05, + "loss": 0.8707, + "step": 34029 + }, + { + "epoch": 0.9011010702576225, + "grad_norm": 0.8515625, + "learning_rate": 9.363273311113448e-05, + "loss": 0.7831, + "step": 34030 + }, + { + "epoch": 0.9011275498659169, + "grad_norm": 0.7734375, + "learning_rate": 9.362813871892467e-05, + "loss": 0.7289, + "step": 34031 + }, + { + "epoch": 0.9011540294742113, + "grad_norm": 0.8125, + "learning_rate": 9.362354434021968e-05, + "loss": 0.856, + "step": 34032 + }, + { + "epoch": 0.9011805090825057, + "grad_norm": 0.80859375, + "learning_rate": 9.36189499750292e-05, + "loss": 0.7669, + "step": 34033 + }, + { + "epoch": 0.9012069886908001, + "grad_norm": 0.8203125, + "learning_rate": 9.361435562336294e-05, + "loss": 0.8075, + "step": 34034 + }, + { + "epoch": 0.9012334682990943, + "grad_norm": 0.8359375, + "learning_rate": 9.360976128523074e-05, + "loss": 0.9438, + "step": 34035 + }, + { + "epoch": 0.9012599479073887, + "grad_norm": 0.90625, + "learning_rate": 9.36051669606423e-05, + "loss": 0.7972, + "step": 34036 + }, + { + "epoch": 0.9012864275156831, + "grad_norm": 0.77734375, + "learning_rate": 9.360057264960732e-05, + "loss": 0.7387, + "step": 34037 + }, + { + "epoch": 0.9013129071239775, + "grad_norm": 0.8125, + "learning_rate": 9.359597835213558e-05, + "loss": 0.8335, + "step": 34038 + }, + { + "epoch": 0.9013393867322719, + "grad_norm": 0.76171875, + "learning_rate": 9.359138406823675e-05, + "loss": 0.794, + "step": 34039 + }, + { + "epoch": 0.9013658663405663, + "grad_norm": 0.734375, + "learning_rate": 9.358678979792065e-05, + "loss": 0.6944, + "step": 34040 + }, + { + "epoch": 0.9013923459488606, + "grad_norm": 0.7421875, + "learning_rate": 9.358219554119698e-05, + "loss": 0.8513, + "step": 34041 + }, + { + "epoch": 0.901418825557155, + "grad_norm": 0.7890625, + "learning_rate": 9.357760129807549e-05, + "loss": 0.7877, + "step": 34042 + }, + { + "epoch": 0.9014453051654494, + "grad_norm": 0.8125, + "learning_rate": 9.357300706856591e-05, + "loss": 0.8157, + "step": 34043 + }, + { + "epoch": 0.9014717847737438, + "grad_norm": 0.76171875, + "learning_rate": 9.356841285267792e-05, + "loss": 0.8114, + "step": 34044 + }, + { + "epoch": 0.9014982643820382, + "grad_norm": 0.7734375, + "learning_rate": 9.356381865042136e-05, + "loss": 0.78, + "step": 34045 + }, + { + "epoch": 0.9015247439903326, + "grad_norm": 0.71484375, + "learning_rate": 9.355922446180593e-05, + "loss": 0.7193, + "step": 34046 + }, + { + "epoch": 0.9015512235986269, + "grad_norm": 0.81640625, + "learning_rate": 9.355463028684133e-05, + "loss": 0.8143, + "step": 34047 + }, + { + "epoch": 0.9015777032069213, + "grad_norm": 0.73046875, + "learning_rate": 9.355003612553735e-05, + "loss": 0.6852, + "step": 34048 + }, + { + "epoch": 0.9016041828152157, + "grad_norm": 0.734375, + "learning_rate": 9.354544197790366e-05, + "loss": 0.9107, + "step": 34049 + }, + { + "epoch": 0.9016306624235101, + "grad_norm": 0.796875, + "learning_rate": 9.354084784395006e-05, + "loss": 0.7172, + "step": 34050 + }, + { + "epoch": 0.9016571420318045, + "grad_norm": 0.82421875, + "learning_rate": 9.353625372368627e-05, + "loss": 0.8369, + "step": 34051 + }, + { + "epoch": 0.9016836216400987, + "grad_norm": 0.7890625, + "learning_rate": 9.353165961712201e-05, + "loss": 0.7303, + "step": 34052 + }, + { + "epoch": 0.9017101012483931, + "grad_norm": 0.734375, + "learning_rate": 9.352706552426702e-05, + "loss": 0.8402, + "step": 34053 + }, + { + "epoch": 0.9017365808566875, + "grad_norm": 0.76953125, + "learning_rate": 9.352247144513104e-05, + "loss": 0.843, + "step": 34054 + }, + { + "epoch": 0.9017630604649819, + "grad_norm": 0.7578125, + "learning_rate": 9.351787737972382e-05, + "loss": 0.7733, + "step": 34055 + }, + { + "epoch": 0.9017895400732763, + "grad_norm": 0.77734375, + "learning_rate": 9.35132833280551e-05, + "loss": 0.8591, + "step": 34056 + }, + { + "epoch": 0.9018160196815707, + "grad_norm": 0.80859375, + "learning_rate": 9.350868929013463e-05, + "loss": 0.7701, + "step": 34057 + }, + { + "epoch": 0.901842499289865, + "grad_norm": 0.73828125, + "learning_rate": 9.350409526597208e-05, + "loss": 0.7464, + "step": 34058 + }, + { + "epoch": 0.9018689788981594, + "grad_norm": 0.83203125, + "learning_rate": 9.34995012555772e-05, + "loss": 0.9021, + "step": 34059 + }, + { + "epoch": 0.9018954585064538, + "grad_norm": 0.76953125, + "learning_rate": 9.349490725895981e-05, + "loss": 0.8115, + "step": 34060 + }, + { + "epoch": 0.9019219381147482, + "grad_norm": 0.75, + "learning_rate": 9.349031327612958e-05, + "loss": 0.7385, + "step": 34061 + }, + { + "epoch": 0.9019484177230426, + "grad_norm": 0.73046875, + "learning_rate": 9.348571930709627e-05, + "loss": 0.8168, + "step": 34062 + }, + { + "epoch": 0.901974897331337, + "grad_norm": 0.875, + "learning_rate": 9.348112535186959e-05, + "loss": 0.8003, + "step": 34063 + }, + { + "epoch": 0.9020013769396313, + "grad_norm": 0.734375, + "learning_rate": 9.347653141045927e-05, + "loss": 0.8043, + "step": 34064 + }, + { + "epoch": 0.9020278565479257, + "grad_norm": 0.70703125, + "learning_rate": 9.347193748287509e-05, + "loss": 0.7872, + "step": 34065 + }, + { + "epoch": 0.9020543361562201, + "grad_norm": 0.80859375, + "learning_rate": 9.346734356912678e-05, + "loss": 0.8423, + "step": 34066 + }, + { + "epoch": 0.9020808157645145, + "grad_norm": 0.765625, + "learning_rate": 9.346274966922406e-05, + "loss": 0.821, + "step": 34067 + }, + { + "epoch": 0.9021072953728088, + "grad_norm": 0.7421875, + "learning_rate": 9.345815578317665e-05, + "loss": 0.7687, + "step": 34068 + }, + { + "epoch": 0.9021337749811031, + "grad_norm": 0.79296875, + "learning_rate": 9.345356191099431e-05, + "loss": 0.8486, + "step": 34069 + }, + { + "epoch": 0.9021602545893975, + "grad_norm": 0.88671875, + "learning_rate": 9.344896805268678e-05, + "loss": 0.77, + "step": 34070 + }, + { + "epoch": 0.9021867341976919, + "grad_norm": 0.71484375, + "learning_rate": 9.344437420826378e-05, + "loss": 0.7192, + "step": 34071 + }, + { + "epoch": 0.9022132138059863, + "grad_norm": 0.77734375, + "learning_rate": 9.343978037773506e-05, + "loss": 0.7758, + "step": 34072 + }, + { + "epoch": 0.9022396934142807, + "grad_norm": 0.8125, + "learning_rate": 9.343518656111034e-05, + "loss": 0.8598, + "step": 34073 + }, + { + "epoch": 0.9022661730225751, + "grad_norm": 0.8359375, + "learning_rate": 9.343059275839938e-05, + "loss": 0.7443, + "step": 34074 + }, + { + "epoch": 0.9022926526308694, + "grad_norm": 0.78125, + "learning_rate": 9.34259989696119e-05, + "loss": 0.9682, + "step": 34075 + }, + { + "epoch": 0.9023191322391638, + "grad_norm": 0.76171875, + "learning_rate": 9.342140519475764e-05, + "loss": 0.8645, + "step": 34076 + }, + { + "epoch": 0.9023456118474582, + "grad_norm": 0.75390625, + "learning_rate": 9.341681143384633e-05, + "loss": 0.8468, + "step": 34077 + }, + { + "epoch": 0.9023720914557526, + "grad_norm": 0.7734375, + "learning_rate": 9.341221768688773e-05, + "loss": 0.8955, + "step": 34078 + }, + { + "epoch": 0.902398571064047, + "grad_norm": 0.72265625, + "learning_rate": 9.340762395389149e-05, + "loss": 0.7644, + "step": 34079 + }, + { + "epoch": 0.9024250506723414, + "grad_norm": 0.859375, + "learning_rate": 9.340303023486748e-05, + "loss": 0.7209, + "step": 34080 + }, + { + "epoch": 0.9024515302806357, + "grad_norm": 0.73828125, + "learning_rate": 9.339843652982538e-05, + "loss": 0.7913, + "step": 34081 + }, + { + "epoch": 0.9024780098889301, + "grad_norm": 0.83984375, + "learning_rate": 9.339384283877489e-05, + "loss": 0.7222, + "step": 34082 + }, + { + "epoch": 0.9025044894972245, + "grad_norm": 0.80859375, + "learning_rate": 9.338924916172578e-05, + "loss": 0.7736, + "step": 34083 + }, + { + "epoch": 0.9025309691055188, + "grad_norm": 0.80859375, + "learning_rate": 9.338465549868773e-05, + "loss": 0.894, + "step": 34084 + }, + { + "epoch": 0.9025574487138132, + "grad_norm": 0.7734375, + "learning_rate": 9.338006184967058e-05, + "loss": 0.7517, + "step": 34085 + }, + { + "epoch": 0.9025839283221075, + "grad_norm": 0.81640625, + "learning_rate": 9.337546821468402e-05, + "loss": 0.8015, + "step": 34086 + }, + { + "epoch": 0.9026104079304019, + "grad_norm": 0.82421875, + "learning_rate": 9.337087459373776e-05, + "loss": 0.7904, + "step": 34087 + }, + { + "epoch": 0.9026368875386963, + "grad_norm": 0.82421875, + "learning_rate": 9.336628098684154e-05, + "loss": 0.8362, + "step": 34088 + }, + { + "epoch": 0.9026633671469907, + "grad_norm": 0.828125, + "learning_rate": 9.336168739400511e-05, + "loss": 0.7772, + "step": 34089 + }, + { + "epoch": 0.9026898467552851, + "grad_norm": 0.734375, + "learning_rate": 9.335709381523819e-05, + "loss": 0.8347, + "step": 34090 + }, + { + "epoch": 0.9027163263635795, + "grad_norm": 0.76171875, + "learning_rate": 9.335250025055056e-05, + "loss": 0.7925, + "step": 34091 + }, + { + "epoch": 0.9027428059718738, + "grad_norm": 0.80859375, + "learning_rate": 9.334790669995192e-05, + "loss": 0.8202, + "step": 34092 + }, + { + "epoch": 0.9027692855801682, + "grad_norm": 0.828125, + "learning_rate": 9.334331316345203e-05, + "loss": 0.925, + "step": 34093 + }, + { + "epoch": 0.9027957651884626, + "grad_norm": 0.78125, + "learning_rate": 9.333871964106056e-05, + "loss": 0.8528, + "step": 34094 + }, + { + "epoch": 0.902822244796757, + "grad_norm": 0.734375, + "learning_rate": 9.333412613278733e-05, + "loss": 0.7673, + "step": 34095 + }, + { + "epoch": 0.9028487244050514, + "grad_norm": 0.7734375, + "learning_rate": 9.332953263864204e-05, + "loss": 0.8261, + "step": 34096 + }, + { + "epoch": 0.9028752040133458, + "grad_norm": 0.78125, + "learning_rate": 9.332493915863442e-05, + "loss": 0.8273, + "step": 34097 + }, + { + "epoch": 0.9029016836216401, + "grad_norm": 0.8046875, + "learning_rate": 9.33203456927742e-05, + "loss": 0.7104, + "step": 34098 + }, + { + "epoch": 0.9029281632299345, + "grad_norm": 0.78515625, + "learning_rate": 9.331575224107108e-05, + "loss": 0.8512, + "step": 34099 + }, + { + "epoch": 0.9029546428382289, + "grad_norm": 0.76953125, + "learning_rate": 9.33111588035349e-05, + "loss": 0.81, + "step": 34100 + }, + { + "epoch": 0.9029811224465232, + "grad_norm": 0.84765625, + "learning_rate": 9.330656538017534e-05, + "loss": 0.7634, + "step": 34101 + }, + { + "epoch": 0.9030076020548176, + "grad_norm": 0.75, + "learning_rate": 9.330197197100212e-05, + "loss": 0.8982, + "step": 34102 + }, + { + "epoch": 0.9030340816631119, + "grad_norm": 0.796875, + "learning_rate": 9.329737857602499e-05, + "loss": 0.9136, + "step": 34103 + }, + { + "epoch": 0.9030605612714063, + "grad_norm": 0.7265625, + "learning_rate": 9.329278519525364e-05, + "loss": 0.7782, + "step": 34104 + }, + { + "epoch": 0.9030870408797007, + "grad_norm": 0.83203125, + "learning_rate": 9.32881918286979e-05, + "loss": 0.7378, + "step": 34105 + }, + { + "epoch": 0.9031135204879951, + "grad_norm": 0.78515625, + "learning_rate": 9.328359847636744e-05, + "loss": 0.6718, + "step": 34106 + }, + { + "epoch": 0.9031400000962895, + "grad_norm": 0.8203125, + "learning_rate": 9.327900513827201e-05, + "loss": 0.7471, + "step": 34107 + }, + { + "epoch": 0.9031664797045839, + "grad_norm": 0.7578125, + "learning_rate": 9.327441181442136e-05, + "loss": 0.8971, + "step": 34108 + }, + { + "epoch": 0.9031929593128782, + "grad_norm": 0.7265625, + "learning_rate": 9.326981850482515e-05, + "loss": 0.8756, + "step": 34109 + }, + { + "epoch": 0.9032194389211726, + "grad_norm": 0.80078125, + "learning_rate": 9.326522520949322e-05, + "loss": 0.7393, + "step": 34110 + }, + { + "epoch": 0.903245918529467, + "grad_norm": 0.7578125, + "learning_rate": 9.326063192843528e-05, + "loss": 0.7259, + "step": 34111 + }, + { + "epoch": 0.9032723981377614, + "grad_norm": 0.75, + "learning_rate": 9.325603866166102e-05, + "loss": 0.7555, + "step": 34112 + }, + { + "epoch": 0.9032988777460558, + "grad_norm": 0.72265625, + "learning_rate": 9.325144540918022e-05, + "loss": 0.702, + "step": 34113 + }, + { + "epoch": 0.9033253573543502, + "grad_norm": 1.3125, + "learning_rate": 9.324685217100255e-05, + "loss": 0.8074, + "step": 34114 + }, + { + "epoch": 0.9033518369626445, + "grad_norm": 0.84375, + "learning_rate": 9.324225894713783e-05, + "loss": 0.8667, + "step": 34115 + }, + { + "epoch": 0.9033783165709389, + "grad_norm": 0.79296875, + "learning_rate": 9.323766573759576e-05, + "loss": 0.8376, + "step": 34116 + }, + { + "epoch": 0.9034047961792332, + "grad_norm": 0.67578125, + "learning_rate": 9.323307254238607e-05, + "loss": 0.6783, + "step": 34117 + }, + { + "epoch": 0.9034312757875276, + "grad_norm": 0.7578125, + "learning_rate": 9.322847936151848e-05, + "loss": 0.7361, + "step": 34118 + }, + { + "epoch": 0.903457755395822, + "grad_norm": 0.83984375, + "learning_rate": 9.322388619500271e-05, + "loss": 0.8048, + "step": 34119 + }, + { + "epoch": 0.9034842350041163, + "grad_norm": 0.70703125, + "learning_rate": 9.321929304284856e-05, + "loss": 0.7703, + "step": 34120 + }, + { + "epoch": 0.9035107146124107, + "grad_norm": 0.73046875, + "learning_rate": 9.321469990506575e-05, + "loss": 0.8334, + "step": 34121 + }, + { + "epoch": 0.9035371942207051, + "grad_norm": 0.8203125, + "learning_rate": 9.321010678166398e-05, + "loss": 0.8936, + "step": 34122 + }, + { + "epoch": 0.9035636738289995, + "grad_norm": 0.7265625, + "learning_rate": 9.3205513672653e-05, + "loss": 0.7326, + "step": 34123 + }, + { + "epoch": 0.9035901534372939, + "grad_norm": 0.7890625, + "learning_rate": 9.320092057804251e-05, + "loss": 0.8221, + "step": 34124 + }, + { + "epoch": 0.9036166330455883, + "grad_norm": 0.80859375, + "learning_rate": 9.319632749784232e-05, + "loss": 0.7571, + "step": 34125 + }, + { + "epoch": 0.9036431126538826, + "grad_norm": 0.79296875, + "learning_rate": 9.319173443206213e-05, + "loss": 0.8527, + "step": 34126 + }, + { + "epoch": 0.903669592262177, + "grad_norm": 0.734375, + "learning_rate": 9.318714138071166e-05, + "loss": 0.8267, + "step": 34127 + }, + { + "epoch": 0.9036960718704714, + "grad_norm": 0.76953125, + "learning_rate": 9.318254834380065e-05, + "loss": 0.8282, + "step": 34128 + }, + { + "epoch": 0.9037225514787658, + "grad_norm": 0.76171875, + "learning_rate": 9.317795532133882e-05, + "loss": 0.7298, + "step": 34129 + }, + { + "epoch": 0.9037490310870602, + "grad_norm": 0.83203125, + "learning_rate": 9.317336231333595e-05, + "loss": 0.8041, + "step": 34130 + }, + { + "epoch": 0.9037755106953546, + "grad_norm": 0.84375, + "learning_rate": 9.316876931980174e-05, + "loss": 0.8411, + "step": 34131 + }, + { + "epoch": 0.9038019903036489, + "grad_norm": 0.76171875, + "learning_rate": 9.316417634074595e-05, + "loss": 0.7461, + "step": 34132 + }, + { + "epoch": 0.9038284699119432, + "grad_norm": 0.84375, + "learning_rate": 9.315958337617828e-05, + "loss": 0.8039, + "step": 34133 + }, + { + "epoch": 0.9038549495202376, + "grad_norm": 0.8125, + "learning_rate": 9.315499042610847e-05, + "loss": 0.7957, + "step": 34134 + }, + { + "epoch": 0.903881429128532, + "grad_norm": 0.828125, + "learning_rate": 9.31503974905463e-05, + "loss": 0.7786, + "step": 34135 + }, + { + "epoch": 0.9039079087368264, + "grad_norm": 0.7890625, + "learning_rate": 9.314580456950147e-05, + "loss": 0.8989, + "step": 34136 + }, + { + "epoch": 0.9039343883451207, + "grad_norm": 0.765625, + "learning_rate": 9.31412116629837e-05, + "loss": 0.8293, + "step": 34137 + }, + { + "epoch": 0.9039608679534151, + "grad_norm": 0.73828125, + "learning_rate": 9.313661877100271e-05, + "loss": 0.8275, + "step": 34138 + }, + { + "epoch": 0.9039873475617095, + "grad_norm": 0.7734375, + "learning_rate": 9.313202589356831e-05, + "loss": 0.7015, + "step": 34139 + }, + { + "epoch": 0.9040138271700039, + "grad_norm": 0.76171875, + "learning_rate": 9.312743303069015e-05, + "loss": 0.7964, + "step": 34140 + }, + { + "epoch": 0.9040403067782983, + "grad_norm": 0.8046875, + "learning_rate": 9.312284018237804e-05, + "loss": 0.886, + "step": 34141 + }, + { + "epoch": 0.9040667863865927, + "grad_norm": 0.78125, + "learning_rate": 9.311824734864166e-05, + "loss": 0.8524, + "step": 34142 + }, + { + "epoch": 0.904093265994887, + "grad_norm": 0.79296875, + "learning_rate": 9.311365452949077e-05, + "loss": 0.8021, + "step": 34143 + }, + { + "epoch": 0.9041197456031814, + "grad_norm": 0.78515625, + "learning_rate": 9.310906172493509e-05, + "loss": 0.7836, + "step": 34144 + }, + { + "epoch": 0.9041462252114758, + "grad_norm": 0.6953125, + "learning_rate": 9.310446893498433e-05, + "loss": 0.728, + "step": 34145 + }, + { + "epoch": 0.9041727048197702, + "grad_norm": 0.7734375, + "learning_rate": 9.309987615964829e-05, + "loss": 0.9355, + "step": 34146 + }, + { + "epoch": 0.9041991844280646, + "grad_norm": 0.78125, + "learning_rate": 9.309528339893667e-05, + "loss": 0.7903, + "step": 34147 + }, + { + "epoch": 0.904225664036359, + "grad_norm": 0.84375, + "learning_rate": 9.309069065285921e-05, + "loss": 0.7515, + "step": 34148 + }, + { + "epoch": 0.9042521436446533, + "grad_norm": 0.76953125, + "learning_rate": 9.308609792142562e-05, + "loss": 0.709, + "step": 34149 + }, + { + "epoch": 0.9042786232529476, + "grad_norm": 0.73828125, + "learning_rate": 9.30815052046456e-05, + "loss": 0.7722, + "step": 34150 + }, + { + "epoch": 0.904305102861242, + "grad_norm": 0.77734375, + "learning_rate": 9.3076912502529e-05, + "loss": 0.8319, + "step": 34151 + }, + { + "epoch": 0.9043315824695364, + "grad_norm": 0.78515625, + "learning_rate": 9.307231981508547e-05, + "loss": 0.7675, + "step": 34152 + }, + { + "epoch": 0.9043580620778308, + "grad_norm": 0.875, + "learning_rate": 9.306772714232478e-05, + "loss": 0.8085, + "step": 34153 + }, + { + "epoch": 0.9043845416861251, + "grad_norm": 0.80859375, + "learning_rate": 9.306313448425663e-05, + "loss": 0.7605, + "step": 34154 + }, + { + "epoch": 0.9044110212944195, + "grad_norm": 0.73828125, + "learning_rate": 9.305854184089076e-05, + "loss": 0.6866, + "step": 34155 + }, + { + "epoch": 0.9044375009027139, + "grad_norm": 0.77734375, + "learning_rate": 9.305394921223693e-05, + "loss": 0.8051, + "step": 34156 + }, + { + "epoch": 0.9044639805110083, + "grad_norm": 0.8125, + "learning_rate": 9.304935659830484e-05, + "loss": 0.8098, + "step": 34157 + }, + { + "epoch": 0.9044904601193027, + "grad_norm": 0.78515625, + "learning_rate": 9.304476399910425e-05, + "loss": 0.7585, + "step": 34158 + }, + { + "epoch": 0.9045169397275971, + "grad_norm": 0.78125, + "learning_rate": 9.30401714146449e-05, + "loss": 0.7964, + "step": 34159 + }, + { + "epoch": 0.9045434193358914, + "grad_norm": 0.84375, + "learning_rate": 9.303557884493648e-05, + "loss": 0.7715, + "step": 34160 + }, + { + "epoch": 0.9045698989441858, + "grad_norm": 0.796875, + "learning_rate": 9.303098628998878e-05, + "loss": 0.7426, + "step": 34161 + }, + { + "epoch": 0.9045963785524802, + "grad_norm": 0.796875, + "learning_rate": 9.30263937498115e-05, + "loss": 0.8797, + "step": 34162 + }, + { + "epoch": 0.9046228581607746, + "grad_norm": 1.046875, + "learning_rate": 9.302180122441439e-05, + "loss": 0.821, + "step": 34163 + }, + { + "epoch": 0.904649337769069, + "grad_norm": 0.76171875, + "learning_rate": 9.301720871380716e-05, + "loss": 0.8275, + "step": 34164 + }, + { + "epoch": 0.9046758173773634, + "grad_norm": 0.73828125, + "learning_rate": 9.301261621799952e-05, + "loss": 0.7868, + "step": 34165 + }, + { + "epoch": 0.9047022969856576, + "grad_norm": 0.76171875, + "learning_rate": 9.300802373700128e-05, + "loss": 0.803, + "step": 34166 + }, + { + "epoch": 0.904728776593952, + "grad_norm": 0.79296875, + "learning_rate": 9.300343127082215e-05, + "loss": 0.7773, + "step": 34167 + }, + { + "epoch": 0.9047552562022464, + "grad_norm": 0.76953125, + "learning_rate": 9.299883881947183e-05, + "loss": 0.8182, + "step": 34168 + }, + { + "epoch": 0.9047817358105408, + "grad_norm": 0.765625, + "learning_rate": 9.299424638296007e-05, + "loss": 0.7944, + "step": 34169 + }, + { + "epoch": 0.9048082154188352, + "grad_norm": 0.78125, + "learning_rate": 9.298965396129656e-05, + "loss": 0.7318, + "step": 34170 + }, + { + "epoch": 0.9048346950271295, + "grad_norm": 0.796875, + "learning_rate": 9.298506155449114e-05, + "loss": 0.8418, + "step": 34171 + }, + { + "epoch": 0.9048611746354239, + "grad_norm": 0.734375, + "learning_rate": 9.298046916255346e-05, + "loss": 0.8676, + "step": 34172 + }, + { + "epoch": 0.9048876542437183, + "grad_norm": 0.83984375, + "learning_rate": 9.297587678549329e-05, + "loss": 0.8541, + "step": 34173 + }, + { + "epoch": 0.9049141338520127, + "grad_norm": 0.78125, + "learning_rate": 9.297128442332033e-05, + "loss": 0.8284, + "step": 34174 + }, + { + "epoch": 0.9049406134603071, + "grad_norm": 0.875, + "learning_rate": 9.29666920760443e-05, + "loss": 0.8408, + "step": 34175 + }, + { + "epoch": 0.9049670930686015, + "grad_norm": 0.7890625, + "learning_rate": 9.296209974367501e-05, + "loss": 0.7739, + "step": 34176 + }, + { + "epoch": 0.9049935726768958, + "grad_norm": 0.82421875, + "learning_rate": 9.295750742622215e-05, + "loss": 0.8086, + "step": 34177 + }, + { + "epoch": 0.9050200522851902, + "grad_norm": 0.8125, + "learning_rate": 9.295291512369544e-05, + "loss": 0.7777, + "step": 34178 + }, + { + "epoch": 0.9050465318934846, + "grad_norm": 0.83203125, + "learning_rate": 9.294832283610463e-05, + "loss": 0.8589, + "step": 34179 + }, + { + "epoch": 0.905073011501779, + "grad_norm": 0.77734375, + "learning_rate": 9.294373056345941e-05, + "loss": 0.8004, + "step": 34180 + }, + { + "epoch": 0.9050994911100734, + "grad_norm": 0.74609375, + "learning_rate": 9.293913830576959e-05, + "loss": 0.7206, + "step": 34181 + }, + { + "epoch": 0.9051259707183678, + "grad_norm": 0.77734375, + "learning_rate": 9.293454606304485e-05, + "loss": 0.8799, + "step": 34182 + }, + { + "epoch": 0.905152450326662, + "grad_norm": 0.84375, + "learning_rate": 9.292995383529496e-05, + "loss": 0.8016, + "step": 34183 + }, + { + "epoch": 0.9051789299349564, + "grad_norm": 0.76171875, + "learning_rate": 9.29253616225296e-05, + "loss": 0.8197, + "step": 34184 + }, + { + "epoch": 0.9052054095432508, + "grad_norm": 0.73828125, + "learning_rate": 9.29207694247585e-05, + "loss": 0.7336, + "step": 34185 + }, + { + "epoch": 0.9052318891515452, + "grad_norm": 0.7734375, + "learning_rate": 9.291617724199147e-05, + "loss": 0.7063, + "step": 34186 + }, + { + "epoch": 0.9052583687598396, + "grad_norm": 0.80859375, + "learning_rate": 9.291158507423821e-05, + "loss": 0.8214, + "step": 34187 + }, + { + "epoch": 0.9052848483681339, + "grad_norm": 0.78125, + "learning_rate": 9.290699292150842e-05, + "loss": 0.8227, + "step": 34188 + }, + { + "epoch": 0.9053113279764283, + "grad_norm": 0.8515625, + "learning_rate": 9.290240078381184e-05, + "loss": 0.7521, + "step": 34189 + }, + { + "epoch": 0.9053378075847227, + "grad_norm": 0.8125, + "learning_rate": 9.28978086611582e-05, + "loss": 0.7607, + "step": 34190 + }, + { + "epoch": 0.9053642871930171, + "grad_norm": 0.765625, + "learning_rate": 9.289321655355728e-05, + "loss": 0.7533, + "step": 34191 + }, + { + "epoch": 0.9053907668013115, + "grad_norm": 0.7890625, + "learning_rate": 9.288862446101879e-05, + "loss": 0.7935, + "step": 34192 + }, + { + "epoch": 0.9054172464096059, + "grad_norm": 0.84765625, + "learning_rate": 9.288403238355245e-05, + "loss": 0.8118, + "step": 34193 + }, + { + "epoch": 0.9054437260179002, + "grad_norm": 0.83984375, + "learning_rate": 9.2879440321168e-05, + "loss": 0.9613, + "step": 34194 + }, + { + "epoch": 0.9054702056261946, + "grad_norm": 0.7734375, + "learning_rate": 9.287484827387511e-05, + "loss": 0.8393, + "step": 34195 + }, + { + "epoch": 0.905496685234489, + "grad_norm": 0.71484375, + "learning_rate": 9.287025624168363e-05, + "loss": 0.7187, + "step": 34196 + }, + { + "epoch": 0.9055231648427834, + "grad_norm": 1.921875, + "learning_rate": 9.286566422460323e-05, + "loss": 0.8526, + "step": 34197 + }, + { + "epoch": 0.9055496444510778, + "grad_norm": 0.77734375, + "learning_rate": 9.286107222264365e-05, + "loss": 0.7242, + "step": 34198 + }, + { + "epoch": 0.905576124059372, + "grad_norm": 0.72265625, + "learning_rate": 9.285648023581461e-05, + "loss": 0.7209, + "step": 34199 + }, + { + "epoch": 0.9056026036676664, + "grad_norm": 0.80078125, + "learning_rate": 9.285188826412584e-05, + "loss": 0.7716, + "step": 34200 + }, + { + "epoch": 0.9056290832759608, + "grad_norm": 0.73828125, + "learning_rate": 9.284729630758711e-05, + "loss": 0.8227, + "step": 34201 + }, + { + "epoch": 0.9056555628842552, + "grad_norm": 0.79296875, + "learning_rate": 9.284270436620812e-05, + "loss": 0.7918, + "step": 34202 + }, + { + "epoch": 0.9056820424925496, + "grad_norm": 0.7265625, + "learning_rate": 9.28381124399986e-05, + "loss": 0.7834, + "step": 34203 + }, + { + "epoch": 0.905708522100844, + "grad_norm": 0.73828125, + "learning_rate": 9.28335205289683e-05, + "loss": 0.7959, + "step": 34204 + }, + { + "epoch": 0.9057350017091383, + "grad_norm": 0.75390625, + "learning_rate": 9.28289286331269e-05, + "loss": 0.8138, + "step": 34205 + }, + { + "epoch": 0.9057614813174327, + "grad_norm": 1.1796875, + "learning_rate": 9.282433675248422e-05, + "loss": 0.7561, + "step": 34206 + }, + { + "epoch": 0.9057879609257271, + "grad_norm": 0.734375, + "learning_rate": 9.281974488704996e-05, + "loss": 0.7843, + "step": 34207 + }, + { + "epoch": 0.9058144405340215, + "grad_norm": 0.71875, + "learning_rate": 9.281515303683381e-05, + "loss": 0.8091, + "step": 34208 + }, + { + "epoch": 0.9058409201423159, + "grad_norm": 0.7421875, + "learning_rate": 9.281056120184556e-05, + "loss": 0.7824, + "step": 34209 + }, + { + "epoch": 0.9058673997506103, + "grad_norm": 0.7265625, + "learning_rate": 9.280596938209486e-05, + "loss": 0.7369, + "step": 34210 + }, + { + "epoch": 0.9058938793589046, + "grad_norm": 0.78515625, + "learning_rate": 9.280137757759154e-05, + "loss": 0.7581, + "step": 34211 + }, + { + "epoch": 0.905920358967199, + "grad_norm": 0.78515625, + "learning_rate": 9.279678578834529e-05, + "loss": 0.8393, + "step": 34212 + }, + { + "epoch": 0.9059468385754934, + "grad_norm": 0.81640625, + "learning_rate": 9.279219401436584e-05, + "loss": 0.8418, + "step": 34213 + }, + { + "epoch": 0.9059733181837878, + "grad_norm": 0.88671875, + "learning_rate": 9.278760225566292e-05, + "loss": 0.7305, + "step": 34214 + }, + { + "epoch": 0.9059997977920821, + "grad_norm": 0.8359375, + "learning_rate": 9.278301051224622e-05, + "loss": 0.9067, + "step": 34215 + }, + { + "epoch": 0.9060262774003764, + "grad_norm": 0.7578125, + "learning_rate": 9.277841878412558e-05, + "loss": 0.812, + "step": 34216 + }, + { + "epoch": 0.9060527570086708, + "grad_norm": 0.73828125, + "learning_rate": 9.277382707131065e-05, + "loss": 0.8103, + "step": 34217 + }, + { + "epoch": 0.9060792366169652, + "grad_norm": 0.82421875, + "learning_rate": 9.276923537381119e-05, + "loss": 0.7672, + "step": 34218 + }, + { + "epoch": 0.9061057162252596, + "grad_norm": 0.7890625, + "learning_rate": 9.27646436916369e-05, + "loss": 0.7893, + "step": 34219 + }, + { + "epoch": 0.906132195833554, + "grad_norm": 0.7578125, + "learning_rate": 9.276005202479754e-05, + "loss": 0.8045, + "step": 34220 + }, + { + "epoch": 0.9061586754418484, + "grad_norm": 0.78515625, + "learning_rate": 9.275546037330285e-05, + "loss": 0.7699, + "step": 34221 + }, + { + "epoch": 0.9061851550501427, + "grad_norm": 0.8671875, + "learning_rate": 9.275086873716255e-05, + "loss": 0.7832, + "step": 34222 + }, + { + "epoch": 0.9062116346584371, + "grad_norm": 0.7578125, + "learning_rate": 9.274627711638635e-05, + "loss": 0.7641, + "step": 34223 + }, + { + "epoch": 0.9062381142667315, + "grad_norm": 0.765625, + "learning_rate": 9.2741685510984e-05, + "loss": 0.7253, + "step": 34224 + }, + { + "epoch": 0.9062645938750259, + "grad_norm": 0.76171875, + "learning_rate": 9.273709392096522e-05, + "loss": 0.9423, + "step": 34225 + }, + { + "epoch": 0.9062910734833203, + "grad_norm": 0.7890625, + "learning_rate": 9.273250234633978e-05, + "loss": 0.7247, + "step": 34226 + }, + { + "epoch": 0.9063175530916147, + "grad_norm": 0.7890625, + "learning_rate": 9.272791078711739e-05, + "loss": 0.8953, + "step": 34227 + }, + { + "epoch": 0.906344032699909, + "grad_norm": 0.8203125, + "learning_rate": 9.272331924330777e-05, + "loss": 0.6533, + "step": 34228 + }, + { + "epoch": 0.9063705123082034, + "grad_norm": 0.8203125, + "learning_rate": 9.271872771492067e-05, + "loss": 0.8332, + "step": 34229 + }, + { + "epoch": 0.9063969919164978, + "grad_norm": 0.83984375, + "learning_rate": 9.271413620196576e-05, + "loss": 0.7974, + "step": 34230 + }, + { + "epoch": 0.9064234715247922, + "grad_norm": 0.7578125, + "learning_rate": 9.270954470445287e-05, + "loss": 0.7993, + "step": 34231 + }, + { + "epoch": 0.9064499511330865, + "grad_norm": 0.8984375, + "learning_rate": 9.270495322239167e-05, + "loss": 0.8885, + "step": 34232 + }, + { + "epoch": 0.9064764307413808, + "grad_norm": 0.77734375, + "learning_rate": 9.270036175579192e-05, + "loss": 0.8636, + "step": 34233 + }, + { + "epoch": 0.9065029103496752, + "grad_norm": 0.79296875, + "learning_rate": 9.269577030466334e-05, + "loss": 0.8709, + "step": 34234 + }, + { + "epoch": 0.9065293899579696, + "grad_norm": 0.76953125, + "learning_rate": 9.26911788690156e-05, + "loss": 0.8122, + "step": 34235 + }, + { + "epoch": 0.906555869566264, + "grad_norm": 0.75, + "learning_rate": 9.268658744885853e-05, + "loss": 0.9009, + "step": 34236 + }, + { + "epoch": 0.9065823491745584, + "grad_norm": 0.7734375, + "learning_rate": 9.268199604420182e-05, + "loss": 0.7351, + "step": 34237 + }, + { + "epoch": 0.9066088287828528, + "grad_norm": 0.80859375, + "learning_rate": 9.267740465505523e-05, + "loss": 0.7782, + "step": 34238 + }, + { + "epoch": 0.9066353083911471, + "grad_norm": 0.8359375, + "learning_rate": 9.267281328142843e-05, + "loss": 0.8772, + "step": 34239 + }, + { + "epoch": 0.9066617879994415, + "grad_norm": 0.828125, + "learning_rate": 9.266822192333117e-05, + "loss": 0.8858, + "step": 34240 + }, + { + "epoch": 0.9066882676077359, + "grad_norm": 0.75, + "learning_rate": 9.266363058077322e-05, + "loss": 0.7671, + "step": 34241 + }, + { + "epoch": 0.9067147472160303, + "grad_norm": 0.76953125, + "learning_rate": 9.265903925376426e-05, + "loss": 0.8802, + "step": 34242 + }, + { + "epoch": 0.9067412268243247, + "grad_norm": 0.82421875, + "learning_rate": 9.265444794231407e-05, + "loss": 0.7855, + "step": 34243 + }, + { + "epoch": 0.9067677064326191, + "grad_norm": 0.83984375, + "learning_rate": 9.264985664643235e-05, + "loss": 0.8006, + "step": 34244 + }, + { + "epoch": 0.9067941860409134, + "grad_norm": 0.84375, + "learning_rate": 9.264526536612883e-05, + "loss": 0.8195, + "step": 34245 + }, + { + "epoch": 0.9068206656492078, + "grad_norm": 0.80859375, + "learning_rate": 9.264067410141327e-05, + "loss": 0.8601, + "step": 34246 + }, + { + "epoch": 0.9068471452575022, + "grad_norm": 0.765625, + "learning_rate": 9.263608285229536e-05, + "loss": 0.7587, + "step": 34247 + }, + { + "epoch": 0.9068736248657965, + "grad_norm": 0.765625, + "learning_rate": 9.263149161878488e-05, + "loss": 0.734, + "step": 34248 + }, + { + "epoch": 0.9069001044740909, + "grad_norm": 0.81640625, + "learning_rate": 9.262690040089153e-05, + "loss": 0.8657, + "step": 34249 + }, + { + "epoch": 0.9069265840823852, + "grad_norm": 0.77734375, + "learning_rate": 9.262230919862498e-05, + "loss": 0.7377, + "step": 34250 + }, + { + "epoch": 0.9069530636906796, + "grad_norm": 0.7734375, + "learning_rate": 9.261771801199509e-05, + "loss": 0.743, + "step": 34251 + }, + { + "epoch": 0.906979543298974, + "grad_norm": 0.984375, + "learning_rate": 9.26131268410115e-05, + "loss": 0.7849, + "step": 34252 + }, + { + "epoch": 0.9070060229072684, + "grad_norm": 0.796875, + "learning_rate": 9.260853568568398e-05, + "loss": 0.8502, + "step": 34253 + }, + { + "epoch": 0.9070325025155628, + "grad_norm": 0.81640625, + "learning_rate": 9.260394454602224e-05, + "loss": 0.708, + "step": 34254 + }, + { + "epoch": 0.9070589821238572, + "grad_norm": 0.92578125, + "learning_rate": 9.259935342203598e-05, + "loss": 0.7466, + "step": 34255 + }, + { + "epoch": 0.9070854617321515, + "grad_norm": 0.79296875, + "learning_rate": 9.2594762313735e-05, + "loss": 0.7995, + "step": 34256 + }, + { + "epoch": 0.9071119413404459, + "grad_norm": 0.7421875, + "learning_rate": 9.259017122112902e-05, + "loss": 0.8172, + "step": 34257 + }, + { + "epoch": 0.9071384209487403, + "grad_norm": 0.875, + "learning_rate": 9.258558014422774e-05, + "loss": 1.0501, + "step": 34258 + }, + { + "epoch": 0.9071649005570347, + "grad_norm": 0.75390625, + "learning_rate": 9.258098908304089e-05, + "loss": 0.6502, + "step": 34259 + }, + { + "epoch": 0.9071913801653291, + "grad_norm": 0.89453125, + "learning_rate": 9.257639803757819e-05, + "loss": 0.7795, + "step": 34260 + }, + { + "epoch": 0.9072178597736235, + "grad_norm": 0.796875, + "learning_rate": 9.257180700784939e-05, + "loss": 0.8296, + "step": 34261 + }, + { + "epoch": 0.9072443393819178, + "grad_norm": 0.734375, + "learning_rate": 9.256721599386424e-05, + "loss": 0.6949, + "step": 34262 + }, + { + "epoch": 0.9072708189902122, + "grad_norm": 0.953125, + "learning_rate": 9.256262499563246e-05, + "loss": 0.8817, + "step": 34263 + }, + { + "epoch": 0.9072972985985065, + "grad_norm": 0.7265625, + "learning_rate": 9.255803401316376e-05, + "loss": 0.9082, + "step": 34264 + }, + { + "epoch": 0.9073237782068009, + "grad_norm": 0.75390625, + "learning_rate": 9.255344304646787e-05, + "loss": 0.6917, + "step": 34265 + }, + { + "epoch": 0.9073502578150953, + "grad_norm": 0.78125, + "learning_rate": 9.254885209555455e-05, + "loss": 0.9274, + "step": 34266 + }, + { + "epoch": 0.9073767374233896, + "grad_norm": 0.76171875, + "learning_rate": 9.254426116043352e-05, + "loss": 0.8457, + "step": 34267 + }, + { + "epoch": 0.907403217031684, + "grad_norm": 0.7890625, + "learning_rate": 9.253967024111448e-05, + "loss": 0.8369, + "step": 34268 + }, + { + "epoch": 0.9074296966399784, + "grad_norm": 0.79296875, + "learning_rate": 9.253507933760721e-05, + "loss": 0.8276, + "step": 34269 + }, + { + "epoch": 0.9074561762482728, + "grad_norm": 0.796875, + "learning_rate": 9.253048844992135e-05, + "loss": 0.7671, + "step": 34270 + }, + { + "epoch": 0.9074826558565672, + "grad_norm": 0.7734375, + "learning_rate": 9.252589757806675e-05, + "loss": 0.8052, + "step": 34271 + }, + { + "epoch": 0.9075091354648616, + "grad_norm": 0.7734375, + "learning_rate": 9.252130672205307e-05, + "loss": 0.7711, + "step": 34272 + }, + { + "epoch": 0.9075356150731559, + "grad_norm": 0.76953125, + "learning_rate": 9.251671588189006e-05, + "loss": 0.744, + "step": 34273 + }, + { + "epoch": 0.9075620946814503, + "grad_norm": 0.7109375, + "learning_rate": 9.251212505758744e-05, + "loss": 0.7115, + "step": 34274 + }, + { + "epoch": 0.9075885742897447, + "grad_norm": 0.7890625, + "learning_rate": 9.25075342491549e-05, + "loss": 0.8437, + "step": 34275 + }, + { + "epoch": 0.9076150538980391, + "grad_norm": 0.76171875, + "learning_rate": 9.250294345660225e-05, + "loss": 0.8167, + "step": 34276 + }, + { + "epoch": 0.9076415335063335, + "grad_norm": 0.78125, + "learning_rate": 9.249835267993919e-05, + "loss": 0.7912, + "step": 34277 + }, + { + "epoch": 0.9076680131146279, + "grad_norm": 0.80078125, + "learning_rate": 9.249376191917545e-05, + "loss": 0.7877, + "step": 34278 + }, + { + "epoch": 0.9076944927229222, + "grad_norm": 0.765625, + "learning_rate": 9.248917117432075e-05, + "loss": 0.8865, + "step": 34279 + }, + { + "epoch": 0.9077209723312166, + "grad_norm": 0.83203125, + "learning_rate": 9.248458044538475e-05, + "loss": 0.9462, + "step": 34280 + }, + { + "epoch": 0.9077474519395109, + "grad_norm": 0.75, + "learning_rate": 9.247998973237733e-05, + "loss": 0.7696, + "step": 34281 + }, + { + "epoch": 0.9077739315478053, + "grad_norm": 0.890625, + "learning_rate": 9.247539903530813e-05, + "loss": 0.7618, + "step": 34282 + }, + { + "epoch": 0.9078004111560997, + "grad_norm": 0.8515625, + "learning_rate": 9.24708083541869e-05, + "loss": 0.739, + "step": 34283 + }, + { + "epoch": 0.907826890764394, + "grad_norm": 0.75390625, + "learning_rate": 9.246621768902335e-05, + "loss": 0.8335, + "step": 34284 + }, + { + "epoch": 0.9078533703726884, + "grad_norm": 0.78125, + "learning_rate": 9.246162703982723e-05, + "loss": 0.8746, + "step": 34285 + }, + { + "epoch": 0.9078798499809828, + "grad_norm": 0.78125, + "learning_rate": 9.245703640660824e-05, + "loss": 0.7859, + "step": 34286 + }, + { + "epoch": 0.9079063295892772, + "grad_norm": 0.8515625, + "learning_rate": 9.245244578937615e-05, + "loss": 0.7938, + "step": 34287 + }, + { + "epoch": 0.9079328091975716, + "grad_norm": 0.73046875, + "learning_rate": 9.244785518814066e-05, + "loss": 0.6714, + "step": 34288 + }, + { + "epoch": 0.907959288805866, + "grad_norm": 0.75390625, + "learning_rate": 9.24432646029115e-05, + "loss": 0.7387, + "step": 34289 + }, + { + "epoch": 0.9079857684141603, + "grad_norm": 0.7890625, + "learning_rate": 9.243867403369842e-05, + "loss": 0.7778, + "step": 34290 + }, + { + "epoch": 0.9080122480224547, + "grad_norm": 0.73828125, + "learning_rate": 9.243408348051111e-05, + "loss": 0.8826, + "step": 34291 + }, + { + "epoch": 0.9080387276307491, + "grad_norm": 0.71875, + "learning_rate": 9.242949294335937e-05, + "loss": 0.7575, + "step": 34292 + }, + { + "epoch": 0.9080652072390435, + "grad_norm": 0.73046875, + "learning_rate": 9.242490242225287e-05, + "loss": 0.7322, + "step": 34293 + }, + { + "epoch": 0.9080916868473379, + "grad_norm": 0.8515625, + "learning_rate": 9.242031191720136e-05, + "loss": 0.8613, + "step": 34294 + }, + { + "epoch": 0.9081181664556323, + "grad_norm": 0.75, + "learning_rate": 9.241572142821457e-05, + "loss": 0.7883, + "step": 34295 + }, + { + "epoch": 0.9081446460639266, + "grad_norm": 0.79296875, + "learning_rate": 9.241113095530218e-05, + "loss": 0.7829, + "step": 34296 + }, + { + "epoch": 0.9081711256722209, + "grad_norm": 0.73046875, + "learning_rate": 9.2406540498474e-05, + "loss": 0.7546, + "step": 34297 + }, + { + "epoch": 0.9081976052805153, + "grad_norm": 0.765625, + "learning_rate": 9.240195005773974e-05, + "loss": 0.8346, + "step": 34298 + }, + { + "epoch": 0.9082240848888097, + "grad_norm": 0.84375, + "learning_rate": 9.23973596331091e-05, + "loss": 0.775, + "step": 34299 + }, + { + "epoch": 0.9082505644971041, + "grad_norm": 0.71875, + "learning_rate": 9.239276922459183e-05, + "loss": 0.7478, + "step": 34300 + }, + { + "epoch": 0.9082770441053984, + "grad_norm": 0.7734375, + "learning_rate": 9.23881788321976e-05, + "loss": 0.8358, + "step": 34301 + }, + { + "epoch": 0.9083035237136928, + "grad_norm": 0.80859375, + "learning_rate": 9.238358845593623e-05, + "loss": 0.7422, + "step": 34302 + }, + { + "epoch": 0.9083300033219872, + "grad_norm": 0.6953125, + "learning_rate": 9.237899809581742e-05, + "loss": 0.7386, + "step": 34303 + }, + { + "epoch": 0.9083564829302816, + "grad_norm": 0.76953125, + "learning_rate": 9.237440775185089e-05, + "loss": 0.7534, + "step": 34304 + }, + { + "epoch": 0.908382962538576, + "grad_norm": 0.74609375, + "learning_rate": 9.236981742404636e-05, + "loss": 0.658, + "step": 34305 + }, + { + "epoch": 0.9084094421468704, + "grad_norm": 0.7734375, + "learning_rate": 9.236522711241354e-05, + "loss": 0.8831, + "step": 34306 + }, + { + "epoch": 0.9084359217551647, + "grad_norm": 0.83984375, + "learning_rate": 9.23606368169622e-05, + "loss": 0.9022, + "step": 34307 + }, + { + "epoch": 0.9084624013634591, + "grad_norm": 0.765625, + "learning_rate": 9.235604653770209e-05, + "loss": 0.6691, + "step": 34308 + }, + { + "epoch": 0.9084888809717535, + "grad_norm": 0.69140625, + "learning_rate": 9.235145627464283e-05, + "loss": 0.7314, + "step": 34309 + }, + { + "epoch": 0.9085153605800479, + "grad_norm": 0.72265625, + "learning_rate": 9.234686602779428e-05, + "loss": 0.7823, + "step": 34310 + }, + { + "epoch": 0.9085418401883423, + "grad_norm": 0.80078125, + "learning_rate": 9.234227579716607e-05, + "loss": 0.7558, + "step": 34311 + }, + { + "epoch": 0.9085683197966367, + "grad_norm": 0.84375, + "learning_rate": 9.2337685582768e-05, + "loss": 0.7393, + "step": 34312 + }, + { + "epoch": 0.9085947994049309, + "grad_norm": 0.671875, + "learning_rate": 9.233309538460977e-05, + "loss": 0.7856, + "step": 34313 + }, + { + "epoch": 0.9086212790132253, + "grad_norm": 0.86328125, + "learning_rate": 9.23285052027011e-05, + "loss": 0.6888, + "step": 34314 + }, + { + "epoch": 0.9086477586215197, + "grad_norm": 0.8515625, + "learning_rate": 9.232391503705172e-05, + "loss": 0.88, + "step": 34315 + }, + { + "epoch": 0.9086742382298141, + "grad_norm": 0.71484375, + "learning_rate": 9.231932488767133e-05, + "loss": 0.6952, + "step": 34316 + }, + { + "epoch": 0.9087007178381085, + "grad_norm": 0.7734375, + "learning_rate": 9.231473475456972e-05, + "loss": 0.8605, + "step": 34317 + }, + { + "epoch": 0.9087271974464028, + "grad_norm": 0.71484375, + "learning_rate": 9.231014463775661e-05, + "loss": 0.7952, + "step": 34318 + }, + { + "epoch": 0.9087536770546972, + "grad_norm": 0.73828125, + "learning_rate": 9.23055545372417e-05, + "loss": 0.6836, + "step": 34319 + }, + { + "epoch": 0.9087801566629916, + "grad_norm": 0.6875, + "learning_rate": 9.230096445303472e-05, + "loss": 0.7462, + "step": 34320 + }, + { + "epoch": 0.908806636271286, + "grad_norm": 0.7578125, + "learning_rate": 9.229637438514536e-05, + "loss": 0.7557, + "step": 34321 + }, + { + "epoch": 0.9088331158795804, + "grad_norm": 0.78515625, + "learning_rate": 9.229178433358345e-05, + "loss": 0.7355, + "step": 34322 + }, + { + "epoch": 0.9088595954878748, + "grad_norm": 0.75, + "learning_rate": 9.228719429835867e-05, + "loss": 0.9682, + "step": 34323 + }, + { + "epoch": 0.9088860750961691, + "grad_norm": 0.66015625, + "learning_rate": 9.228260427948074e-05, + "loss": 0.712, + "step": 34324 + }, + { + "epoch": 0.9089125547044635, + "grad_norm": 0.83203125, + "learning_rate": 9.227801427695936e-05, + "loss": 0.7771, + "step": 34325 + }, + { + "epoch": 0.9089390343127579, + "grad_norm": 0.7890625, + "learning_rate": 9.227342429080429e-05, + "loss": 0.8743, + "step": 34326 + }, + { + "epoch": 0.9089655139210523, + "grad_norm": 0.80859375, + "learning_rate": 9.226883432102528e-05, + "loss": 0.7838, + "step": 34327 + }, + { + "epoch": 0.9089919935293467, + "grad_norm": 0.7890625, + "learning_rate": 9.2264244367632e-05, + "loss": 0.7133, + "step": 34328 + }, + { + "epoch": 0.9090184731376411, + "grad_norm": 0.85546875, + "learning_rate": 9.225965443063423e-05, + "loss": 0.8145, + "step": 34329 + }, + { + "epoch": 0.9090449527459353, + "grad_norm": 0.88671875, + "learning_rate": 9.22550645100417e-05, + "loss": 0.7405, + "step": 34330 + }, + { + "epoch": 0.9090714323542297, + "grad_norm": 0.7421875, + "learning_rate": 9.225047460586408e-05, + "loss": 0.6086, + "step": 34331 + }, + { + "epoch": 0.9090979119625241, + "grad_norm": 0.8203125, + "learning_rate": 9.224588471811116e-05, + "loss": 0.8481, + "step": 34332 + }, + { + "epoch": 0.9091243915708185, + "grad_norm": 0.765625, + "learning_rate": 9.224129484679265e-05, + "loss": 0.8907, + "step": 34333 + }, + { + "epoch": 0.9091508711791129, + "grad_norm": 0.78515625, + "learning_rate": 9.223670499191827e-05, + "loss": 0.9103, + "step": 34334 + }, + { + "epoch": 0.9091773507874072, + "grad_norm": 0.75, + "learning_rate": 9.223211515349775e-05, + "loss": 0.7212, + "step": 34335 + }, + { + "epoch": 0.9092038303957016, + "grad_norm": 0.78125, + "learning_rate": 9.222752533154077e-05, + "loss": 0.6895, + "step": 34336 + }, + { + "epoch": 0.909230310003996, + "grad_norm": 0.734375, + "learning_rate": 9.222293552605715e-05, + "loss": 0.648, + "step": 34337 + }, + { + "epoch": 0.9092567896122904, + "grad_norm": 0.7421875, + "learning_rate": 9.221834573705658e-05, + "loss": 0.7368, + "step": 34338 + }, + { + "epoch": 0.9092832692205848, + "grad_norm": 0.7734375, + "learning_rate": 9.221375596454878e-05, + "loss": 0.6579, + "step": 34339 + }, + { + "epoch": 0.9093097488288792, + "grad_norm": 0.828125, + "learning_rate": 9.220916620854347e-05, + "loss": 0.8758, + "step": 34340 + }, + { + "epoch": 0.9093362284371735, + "grad_norm": 0.77734375, + "learning_rate": 9.220457646905035e-05, + "loss": 0.8487, + "step": 34341 + }, + { + "epoch": 0.9093627080454679, + "grad_norm": 0.8046875, + "learning_rate": 9.219998674607923e-05, + "loss": 0.8747, + "step": 34342 + }, + { + "epoch": 0.9093891876537623, + "grad_norm": 0.8125, + "learning_rate": 9.219539703963978e-05, + "loss": 0.7587, + "step": 34343 + }, + { + "epoch": 0.9094156672620567, + "grad_norm": 0.859375, + "learning_rate": 9.219080734974176e-05, + "loss": 0.7978, + "step": 34344 + }, + { + "epoch": 0.9094421468703511, + "grad_norm": 0.73046875, + "learning_rate": 9.218621767639487e-05, + "loss": 0.7593, + "step": 34345 + }, + { + "epoch": 0.9094686264786453, + "grad_norm": 0.94140625, + "learning_rate": 9.21816280196088e-05, + "loss": 0.8092, + "step": 34346 + }, + { + "epoch": 0.9094951060869397, + "grad_norm": 0.765625, + "learning_rate": 9.217703837939337e-05, + "loss": 0.8136, + "step": 34347 + }, + { + "epoch": 0.9095215856952341, + "grad_norm": 0.82421875, + "learning_rate": 9.217244875575825e-05, + "loss": 0.7992, + "step": 34348 + }, + { + "epoch": 0.9095480653035285, + "grad_norm": 0.76953125, + "learning_rate": 9.216785914871318e-05, + "loss": 0.8365, + "step": 34349 + }, + { + "epoch": 0.9095745449118229, + "grad_norm": 0.7421875, + "learning_rate": 9.216326955826788e-05, + "loss": 0.7636, + "step": 34350 + }, + { + "epoch": 0.9096010245201173, + "grad_norm": 0.76953125, + "learning_rate": 9.215867998443208e-05, + "loss": 0.8378, + "step": 34351 + }, + { + "epoch": 0.9096275041284116, + "grad_norm": 0.765625, + "learning_rate": 9.215409042721552e-05, + "loss": 0.8144, + "step": 34352 + }, + { + "epoch": 0.909653983736706, + "grad_norm": 0.73046875, + "learning_rate": 9.214950088662791e-05, + "loss": 0.7831, + "step": 34353 + }, + { + "epoch": 0.9096804633450004, + "grad_norm": 0.77734375, + "learning_rate": 9.2144911362679e-05, + "loss": 0.8265, + "step": 34354 + }, + { + "epoch": 0.9097069429532948, + "grad_norm": 0.79296875, + "learning_rate": 9.214032185537849e-05, + "loss": 0.7817, + "step": 34355 + }, + { + "epoch": 0.9097334225615892, + "grad_norm": 0.734375, + "learning_rate": 9.213573236473607e-05, + "loss": 0.815, + "step": 34356 + }, + { + "epoch": 0.9097599021698836, + "grad_norm": 0.8046875, + "learning_rate": 9.213114289076156e-05, + "loss": 0.8273, + "step": 34357 + }, + { + "epoch": 0.9097863817781779, + "grad_norm": 0.80078125, + "learning_rate": 9.212655343346466e-05, + "loss": 0.8541, + "step": 34358 + }, + { + "epoch": 0.9098128613864723, + "grad_norm": 0.796875, + "learning_rate": 9.212196399285506e-05, + "loss": 0.7732, + "step": 34359 + }, + { + "epoch": 0.9098393409947667, + "grad_norm": 0.7578125, + "learning_rate": 9.211737456894253e-05, + "loss": 0.794, + "step": 34360 + }, + { + "epoch": 0.9098658206030611, + "grad_norm": 0.7578125, + "learning_rate": 9.211278516173671e-05, + "loss": 0.7188, + "step": 34361 + }, + { + "epoch": 0.9098923002113554, + "grad_norm": 0.9140625, + "learning_rate": 9.210819577124745e-05, + "loss": 0.7147, + "step": 34362 + }, + { + "epoch": 0.9099187798196497, + "grad_norm": 0.7734375, + "learning_rate": 9.21036063974844e-05, + "loss": 0.8136, + "step": 34363 + }, + { + "epoch": 0.9099452594279441, + "grad_norm": 0.77734375, + "learning_rate": 9.209901704045732e-05, + "loss": 0.6773, + "step": 34364 + }, + { + "epoch": 0.9099717390362385, + "grad_norm": 0.71875, + "learning_rate": 9.209442770017591e-05, + "loss": 0.765, + "step": 34365 + }, + { + "epoch": 0.9099982186445329, + "grad_norm": 0.71484375, + "learning_rate": 9.208983837664987e-05, + "loss": 0.667, + "step": 34366 + }, + { + "epoch": 0.9100246982528273, + "grad_norm": 0.82421875, + "learning_rate": 9.208524906988902e-05, + "loss": 0.8288, + "step": 34367 + }, + { + "epoch": 0.9100511778611217, + "grad_norm": 0.7578125, + "learning_rate": 9.2080659779903e-05, + "loss": 0.9128, + "step": 34368 + }, + { + "epoch": 0.910077657469416, + "grad_norm": 0.8125, + "learning_rate": 9.207607050670161e-05, + "loss": 0.7431, + "step": 34369 + }, + { + "epoch": 0.9101041370777104, + "grad_norm": 0.75, + "learning_rate": 9.20714812502945e-05, + "loss": 0.8587, + "step": 34370 + }, + { + "epoch": 0.9101306166860048, + "grad_norm": 0.796875, + "learning_rate": 9.206689201069144e-05, + "loss": 0.7925, + "step": 34371 + }, + { + "epoch": 0.9101570962942992, + "grad_norm": 0.75390625, + "learning_rate": 9.206230278790216e-05, + "loss": 0.7632, + "step": 34372 + }, + { + "epoch": 0.9101835759025936, + "grad_norm": 0.8046875, + "learning_rate": 9.205771358193637e-05, + "loss": 0.8939, + "step": 34373 + }, + { + "epoch": 0.910210055510888, + "grad_norm": 0.7578125, + "learning_rate": 9.20531243928038e-05, + "loss": 0.773, + "step": 34374 + }, + { + "epoch": 0.9102365351191823, + "grad_norm": 0.78125, + "learning_rate": 9.204853522051419e-05, + "loss": 0.7586, + "step": 34375 + }, + { + "epoch": 0.9102630147274767, + "grad_norm": 0.78125, + "learning_rate": 9.20439460650772e-05, + "loss": 0.6883, + "step": 34376 + }, + { + "epoch": 0.9102894943357711, + "grad_norm": 0.78515625, + "learning_rate": 9.203935692650267e-05, + "loss": 0.7411, + "step": 34377 + }, + { + "epoch": 0.9103159739440655, + "grad_norm": 0.76953125, + "learning_rate": 9.203476780480026e-05, + "loss": 0.8977, + "step": 34378 + }, + { + "epoch": 0.9103424535523598, + "grad_norm": 0.81640625, + "learning_rate": 9.20301786999797e-05, + "loss": 0.7482, + "step": 34379 + }, + { + "epoch": 0.9103689331606541, + "grad_norm": 0.75390625, + "learning_rate": 9.202558961205073e-05, + "loss": 0.7638, + "step": 34380 + }, + { + "epoch": 0.9103954127689485, + "grad_norm": 0.796875, + "learning_rate": 9.2021000541023e-05, + "loss": 0.8754, + "step": 34381 + }, + { + "epoch": 0.9104218923772429, + "grad_norm": 0.78125, + "learning_rate": 9.201641148690638e-05, + "loss": 0.7568, + "step": 34382 + }, + { + "epoch": 0.9104483719855373, + "grad_norm": 0.81640625, + "learning_rate": 9.201182244971049e-05, + "loss": 0.7877, + "step": 34383 + }, + { + "epoch": 0.9104748515938317, + "grad_norm": 0.87890625, + "learning_rate": 9.20072334294451e-05, + "loss": 0.8294, + "step": 34384 + }, + { + "epoch": 0.9105013312021261, + "grad_norm": 0.77734375, + "learning_rate": 9.200264442611993e-05, + "loss": 0.8324, + "step": 34385 + }, + { + "epoch": 0.9105278108104204, + "grad_norm": 0.75, + "learning_rate": 9.199805543974464e-05, + "loss": 0.7688, + "step": 34386 + }, + { + "epoch": 0.9105542904187148, + "grad_norm": 0.80078125, + "learning_rate": 9.199346647032906e-05, + "loss": 0.827, + "step": 34387 + }, + { + "epoch": 0.9105807700270092, + "grad_norm": 0.77734375, + "learning_rate": 9.198887751788289e-05, + "loss": 0.7007, + "step": 34388 + }, + { + "epoch": 0.9106072496353036, + "grad_norm": 0.75, + "learning_rate": 9.198428858241581e-05, + "loss": 0.8158, + "step": 34389 + }, + { + "epoch": 0.910633729243598, + "grad_norm": 0.765625, + "learning_rate": 9.19796996639376e-05, + "loss": 0.898, + "step": 34390 + }, + { + "epoch": 0.9106602088518924, + "grad_norm": 0.703125, + "learning_rate": 9.197511076245791e-05, + "loss": 0.703, + "step": 34391 + }, + { + "epoch": 0.9106866884601867, + "grad_norm": 0.82421875, + "learning_rate": 9.197052187798653e-05, + "loss": 0.7907, + "step": 34392 + }, + { + "epoch": 0.9107131680684811, + "grad_norm": 0.76953125, + "learning_rate": 9.19659330105332e-05, + "loss": 0.8798, + "step": 34393 + }, + { + "epoch": 0.9107396476767755, + "grad_norm": 0.7265625, + "learning_rate": 9.196134416010758e-05, + "loss": 0.803, + "step": 34394 + }, + { + "epoch": 0.9107661272850698, + "grad_norm": 0.6796875, + "learning_rate": 9.195675532671945e-05, + "loss": 0.7933, + "step": 34395 + }, + { + "epoch": 0.9107926068933642, + "grad_norm": 0.80078125, + "learning_rate": 9.195216651037849e-05, + "loss": 0.7305, + "step": 34396 + }, + { + "epoch": 0.9108190865016585, + "grad_norm": 0.70703125, + "learning_rate": 9.194757771109449e-05, + "loss": 0.6835, + "step": 34397 + }, + { + "epoch": 0.9108455661099529, + "grad_norm": 0.828125, + "learning_rate": 9.194298892887712e-05, + "loss": 0.7647, + "step": 34398 + }, + { + "epoch": 0.9108720457182473, + "grad_norm": 0.765625, + "learning_rate": 9.193840016373613e-05, + "loss": 0.9137, + "step": 34399 + }, + { + "epoch": 0.9108985253265417, + "grad_norm": 0.79296875, + "learning_rate": 9.193381141568126e-05, + "loss": 0.8619, + "step": 34400 + }, + { + "epoch": 0.9109250049348361, + "grad_norm": 0.828125, + "learning_rate": 9.192922268472215e-05, + "loss": 0.807, + "step": 34401 + }, + { + "epoch": 0.9109514845431305, + "grad_norm": 0.78515625, + "learning_rate": 9.192463397086865e-05, + "loss": 0.8715, + "step": 34402 + }, + { + "epoch": 0.9109779641514248, + "grad_norm": 0.71875, + "learning_rate": 9.192004527413042e-05, + "loss": 0.8823, + "step": 34403 + }, + { + "epoch": 0.9110044437597192, + "grad_norm": 0.8046875, + "learning_rate": 9.191545659451719e-05, + "loss": 0.7956, + "step": 34404 + }, + { + "epoch": 0.9110309233680136, + "grad_norm": 0.765625, + "learning_rate": 9.191086793203869e-05, + "loss": 0.8123, + "step": 34405 + }, + { + "epoch": 0.911057402976308, + "grad_norm": 0.73046875, + "learning_rate": 9.19062792867046e-05, + "loss": 0.7838, + "step": 34406 + }, + { + "epoch": 0.9110838825846024, + "grad_norm": 0.80078125, + "learning_rate": 9.190169065852473e-05, + "loss": 0.6993, + "step": 34407 + }, + { + "epoch": 0.9111103621928968, + "grad_norm": 0.765625, + "learning_rate": 9.189710204750877e-05, + "loss": 0.8318, + "step": 34408 + }, + { + "epoch": 0.9111368418011911, + "grad_norm": 0.77734375, + "learning_rate": 9.189251345366643e-05, + "loss": 0.8549, + "step": 34409 + }, + { + "epoch": 0.9111633214094855, + "grad_norm": 0.79296875, + "learning_rate": 9.188792487700744e-05, + "loss": 0.719, + "step": 34410 + }, + { + "epoch": 0.9111898010177798, + "grad_norm": 0.74609375, + "learning_rate": 9.188333631754151e-05, + "loss": 0.7785, + "step": 34411 + }, + { + "epoch": 0.9112162806260742, + "grad_norm": 0.6796875, + "learning_rate": 9.187874777527841e-05, + "loss": 0.764, + "step": 34412 + }, + { + "epoch": 0.9112427602343686, + "grad_norm": 0.89453125, + "learning_rate": 9.187415925022783e-05, + "loss": 0.8445, + "step": 34413 + }, + { + "epoch": 0.911269239842663, + "grad_norm": 0.7890625, + "learning_rate": 9.186957074239952e-05, + "loss": 0.7332, + "step": 34414 + }, + { + "epoch": 0.9112957194509573, + "grad_norm": 0.79296875, + "learning_rate": 9.186498225180318e-05, + "loss": 0.7495, + "step": 34415 + }, + { + "epoch": 0.9113221990592517, + "grad_norm": 0.7578125, + "learning_rate": 9.186039377844853e-05, + "loss": 0.7544, + "step": 34416 + }, + { + "epoch": 0.9113486786675461, + "grad_norm": 0.72265625, + "learning_rate": 9.185580532234533e-05, + "loss": 0.7104, + "step": 34417 + }, + { + "epoch": 0.9113751582758405, + "grad_norm": 0.8046875, + "learning_rate": 9.18512168835033e-05, + "loss": 0.8042, + "step": 34418 + }, + { + "epoch": 0.9114016378841349, + "grad_norm": 0.796875, + "learning_rate": 9.184662846193213e-05, + "loss": 0.8751, + "step": 34419 + }, + { + "epoch": 0.9114281174924292, + "grad_norm": 0.87890625, + "learning_rate": 9.184204005764158e-05, + "loss": 0.7549, + "step": 34420 + }, + { + "epoch": 0.9114545971007236, + "grad_norm": 0.796875, + "learning_rate": 9.18374516706413e-05, + "loss": 0.7843, + "step": 34421 + }, + { + "epoch": 0.911481076709018, + "grad_norm": 0.8046875, + "learning_rate": 9.183286330094113e-05, + "loss": 0.7942, + "step": 34422 + }, + { + "epoch": 0.9115075563173124, + "grad_norm": 0.70703125, + "learning_rate": 9.182827494855075e-05, + "loss": 0.8914, + "step": 34423 + }, + { + "epoch": 0.9115340359256068, + "grad_norm": 0.7421875, + "learning_rate": 9.182368661347984e-05, + "loss": 0.8631, + "step": 34424 + }, + { + "epoch": 0.9115605155339012, + "grad_norm": 0.98828125, + "learning_rate": 9.181909829573818e-05, + "loss": 0.8003, + "step": 34425 + }, + { + "epoch": 0.9115869951421955, + "grad_norm": 0.78125, + "learning_rate": 9.181450999533549e-05, + "loss": 0.8452, + "step": 34426 + }, + { + "epoch": 0.9116134747504899, + "grad_norm": 0.76171875, + "learning_rate": 9.180992171228141e-05, + "loss": 0.9028, + "step": 34427 + }, + { + "epoch": 0.9116399543587842, + "grad_norm": 0.79296875, + "learning_rate": 9.180533344658578e-05, + "loss": 0.8542, + "step": 34428 + }, + { + "epoch": 0.9116664339670786, + "grad_norm": 0.7890625, + "learning_rate": 9.18007451982583e-05, + "loss": 0.8145, + "step": 34429 + }, + { + "epoch": 0.911692913575373, + "grad_norm": 0.80078125, + "learning_rate": 9.179615696730863e-05, + "loss": 0.8928, + "step": 34430 + }, + { + "epoch": 0.9117193931836673, + "grad_norm": 0.7578125, + "learning_rate": 9.179156875374658e-05, + "loss": 0.7603, + "step": 34431 + }, + { + "epoch": 0.9117458727919617, + "grad_norm": 0.89453125, + "learning_rate": 9.178698055758175e-05, + "loss": 0.8035, + "step": 34432 + }, + { + "epoch": 0.9117723524002561, + "grad_norm": 0.75, + "learning_rate": 9.178239237882401e-05, + "loss": 0.7646, + "step": 34433 + }, + { + "epoch": 0.9117988320085505, + "grad_norm": 0.87109375, + "learning_rate": 9.177780421748301e-05, + "loss": 0.7786, + "step": 34434 + }, + { + "epoch": 0.9118253116168449, + "grad_norm": 0.7734375, + "learning_rate": 9.177321607356849e-05, + "loss": 0.7298, + "step": 34435 + }, + { + "epoch": 0.9118517912251393, + "grad_norm": 0.70703125, + "learning_rate": 9.176862794709017e-05, + "loss": 0.7306, + "step": 34436 + }, + { + "epoch": 0.9118782708334336, + "grad_norm": 0.87109375, + "learning_rate": 9.176403983805775e-05, + "loss": 0.8442, + "step": 34437 + }, + { + "epoch": 0.911904750441728, + "grad_norm": 0.8203125, + "learning_rate": 9.175945174648099e-05, + "loss": 0.88, + "step": 34438 + }, + { + "epoch": 0.9119312300500224, + "grad_norm": 0.8203125, + "learning_rate": 9.175486367236961e-05, + "loss": 0.8494, + "step": 34439 + }, + { + "epoch": 0.9119577096583168, + "grad_norm": 0.72265625, + "learning_rate": 9.175027561573333e-05, + "loss": 0.7362, + "step": 34440 + }, + { + "epoch": 0.9119841892666112, + "grad_norm": 0.7890625, + "learning_rate": 9.174568757658186e-05, + "loss": 0.8505, + "step": 34441 + }, + { + "epoch": 0.9120106688749056, + "grad_norm": 0.73828125, + "learning_rate": 9.174109955492489e-05, + "loss": 0.7823, + "step": 34442 + }, + { + "epoch": 0.9120371484831999, + "grad_norm": 0.82421875, + "learning_rate": 9.173651155077225e-05, + "loss": 0.7448, + "step": 34443 + }, + { + "epoch": 0.9120636280914942, + "grad_norm": 0.765625, + "learning_rate": 9.173192356413357e-05, + "loss": 0.7237, + "step": 34444 + }, + { + "epoch": 0.9120901076997886, + "grad_norm": 0.76953125, + "learning_rate": 9.172733559501862e-05, + "loss": 0.8869, + "step": 34445 + }, + { + "epoch": 0.912116587308083, + "grad_norm": 0.75, + "learning_rate": 9.172274764343711e-05, + "loss": 0.6991, + "step": 34446 + }, + { + "epoch": 0.9121430669163774, + "grad_norm": 0.76171875, + "learning_rate": 9.171815970939872e-05, + "loss": 0.8708, + "step": 34447 + }, + { + "epoch": 0.9121695465246717, + "grad_norm": 0.8046875, + "learning_rate": 9.171357179291326e-05, + "loss": 0.7599, + "step": 34448 + }, + { + "epoch": 0.9121960261329661, + "grad_norm": 0.7578125, + "learning_rate": 9.170898389399042e-05, + "loss": 0.7022, + "step": 34449 + }, + { + "epoch": 0.9122225057412605, + "grad_norm": 0.796875, + "learning_rate": 9.17043960126399e-05, + "loss": 0.8258, + "step": 34450 + }, + { + "epoch": 0.9122489853495549, + "grad_norm": 0.72265625, + "learning_rate": 9.169980814887145e-05, + "loss": 0.8571, + "step": 34451 + }, + { + "epoch": 0.9122754649578493, + "grad_norm": 0.76953125, + "learning_rate": 9.169522030269473e-05, + "loss": 0.7026, + "step": 34452 + }, + { + "epoch": 0.9123019445661437, + "grad_norm": 0.79296875, + "learning_rate": 9.169063247411957e-05, + "loss": 0.7619, + "step": 34453 + }, + { + "epoch": 0.912328424174438, + "grad_norm": 0.7734375, + "learning_rate": 9.168604466315562e-05, + "loss": 0.7905, + "step": 34454 + }, + { + "epoch": 0.9123549037827324, + "grad_norm": 0.828125, + "learning_rate": 9.168145686981264e-05, + "loss": 0.776, + "step": 34455 + }, + { + "epoch": 0.9123813833910268, + "grad_norm": 0.8203125, + "learning_rate": 9.167686909410032e-05, + "loss": 0.8804, + "step": 34456 + }, + { + "epoch": 0.9124078629993212, + "grad_norm": 0.828125, + "learning_rate": 9.16722813360284e-05, + "loss": 0.8235, + "step": 34457 + }, + { + "epoch": 0.9124343426076156, + "grad_norm": 0.84765625, + "learning_rate": 9.166769359560662e-05, + "loss": 0.7567, + "step": 34458 + }, + { + "epoch": 0.91246082221591, + "grad_norm": 0.8046875, + "learning_rate": 9.166310587284468e-05, + "loss": 0.8532, + "step": 34459 + }, + { + "epoch": 0.9124873018242042, + "grad_norm": 0.83984375, + "learning_rate": 9.165851816775231e-05, + "loss": 0.7684, + "step": 34460 + }, + { + "epoch": 0.9125137814324986, + "grad_norm": 0.765625, + "learning_rate": 9.16539304803392e-05, + "loss": 0.7581, + "step": 34461 + }, + { + "epoch": 0.912540261040793, + "grad_norm": 0.75390625, + "learning_rate": 9.164934281061513e-05, + "loss": 0.7439, + "step": 34462 + }, + { + "epoch": 0.9125667406490874, + "grad_norm": 0.78125, + "learning_rate": 9.164475515858983e-05, + "loss": 0.7928, + "step": 34463 + }, + { + "epoch": 0.9125932202573818, + "grad_norm": 0.75390625, + "learning_rate": 9.164016752427297e-05, + "loss": 0.7705, + "step": 34464 + }, + { + "epoch": 0.9126196998656761, + "grad_norm": 1.6640625, + "learning_rate": 9.16355799076743e-05, + "loss": 0.7925, + "step": 34465 + }, + { + "epoch": 0.9126461794739705, + "grad_norm": 0.74609375, + "learning_rate": 9.163099230880356e-05, + "loss": 0.7705, + "step": 34466 + }, + { + "epoch": 0.9126726590822649, + "grad_norm": 0.68359375, + "learning_rate": 9.162640472767038e-05, + "loss": 0.8106, + "step": 34467 + }, + { + "epoch": 0.9126991386905593, + "grad_norm": 0.7421875, + "learning_rate": 9.162181716428463e-05, + "loss": 0.7167, + "step": 34468 + }, + { + "epoch": 0.9127256182988537, + "grad_norm": 0.86328125, + "learning_rate": 9.161722961865596e-05, + "loss": 0.8709, + "step": 34469 + }, + { + "epoch": 0.9127520979071481, + "grad_norm": 0.8046875, + "learning_rate": 9.161264209079407e-05, + "loss": 0.8115, + "step": 34470 + }, + { + "epoch": 0.9127785775154424, + "grad_norm": 0.91796875, + "learning_rate": 9.160805458070871e-05, + "loss": 0.9012, + "step": 34471 + }, + { + "epoch": 0.9128050571237368, + "grad_norm": 0.72265625, + "learning_rate": 9.160346708840957e-05, + "loss": 0.7992, + "step": 34472 + }, + { + "epoch": 0.9128315367320312, + "grad_norm": 0.7421875, + "learning_rate": 9.159887961390642e-05, + "loss": 0.698, + "step": 34473 + }, + { + "epoch": 0.9128580163403256, + "grad_norm": 0.6875, + "learning_rate": 9.159429215720898e-05, + "loss": 0.741, + "step": 34474 + }, + { + "epoch": 0.91288449594862, + "grad_norm": 0.8125, + "learning_rate": 9.158970471832697e-05, + "loss": 0.8469, + "step": 34475 + }, + { + "epoch": 0.9129109755569144, + "grad_norm": 0.8125, + "learning_rate": 9.158511729727008e-05, + "loss": 0.8362, + "step": 34476 + }, + { + "epoch": 0.9129374551652086, + "grad_norm": 0.79296875, + "learning_rate": 9.158052989404803e-05, + "loss": 0.8402, + "step": 34477 + }, + { + "epoch": 0.912963934773503, + "grad_norm": 0.85546875, + "learning_rate": 9.157594250867061e-05, + "loss": 0.8562, + "step": 34478 + }, + { + "epoch": 0.9129904143817974, + "grad_norm": 0.74609375, + "learning_rate": 9.157135514114747e-05, + "loss": 0.8003, + "step": 34479 + }, + { + "epoch": 0.9130168939900918, + "grad_norm": 0.77734375, + "learning_rate": 9.156676779148836e-05, + "loss": 0.8477, + "step": 34480 + }, + { + "epoch": 0.9130433735983862, + "grad_norm": 0.76953125, + "learning_rate": 9.156218045970302e-05, + "loss": 0.8291, + "step": 34481 + }, + { + "epoch": 0.9130698532066805, + "grad_norm": 0.78125, + "learning_rate": 9.155759314580113e-05, + "loss": 0.8261, + "step": 34482 + }, + { + "epoch": 0.9130963328149749, + "grad_norm": 0.71875, + "learning_rate": 9.155300584979246e-05, + "loss": 0.7559, + "step": 34483 + }, + { + "epoch": 0.9131228124232693, + "grad_norm": 0.890625, + "learning_rate": 9.154841857168672e-05, + "loss": 0.7971, + "step": 34484 + }, + { + "epoch": 0.9131492920315637, + "grad_norm": 0.72265625, + "learning_rate": 9.154383131149362e-05, + "loss": 0.6765, + "step": 34485 + }, + { + "epoch": 0.9131757716398581, + "grad_norm": 0.734375, + "learning_rate": 9.153924406922289e-05, + "loss": 0.8235, + "step": 34486 + }, + { + "epoch": 0.9132022512481525, + "grad_norm": 0.8125, + "learning_rate": 9.15346568448842e-05, + "loss": 0.8129, + "step": 34487 + }, + { + "epoch": 0.9132287308564468, + "grad_norm": 0.76171875, + "learning_rate": 9.153006963848736e-05, + "loss": 0.7782, + "step": 34488 + }, + { + "epoch": 0.9132552104647412, + "grad_norm": 0.796875, + "learning_rate": 9.152548245004205e-05, + "loss": 0.9135, + "step": 34489 + }, + { + "epoch": 0.9132816900730356, + "grad_norm": 0.76171875, + "learning_rate": 9.1520895279558e-05, + "loss": 0.7343, + "step": 34490 + }, + { + "epoch": 0.91330816968133, + "grad_norm": 0.78515625, + "learning_rate": 9.151630812704494e-05, + "loss": 0.8022, + "step": 34491 + }, + { + "epoch": 0.9133346492896244, + "grad_norm": 0.7578125, + "learning_rate": 9.151172099251253e-05, + "loss": 0.9558, + "step": 34492 + }, + { + "epoch": 0.9133611288979187, + "grad_norm": 0.765625, + "learning_rate": 9.150713387597059e-05, + "loss": 0.7229, + "step": 34493 + }, + { + "epoch": 0.913387608506213, + "grad_norm": 0.76171875, + "learning_rate": 9.150254677742877e-05, + "loss": 0.8922, + "step": 34494 + }, + { + "epoch": 0.9134140881145074, + "grad_norm": 0.69921875, + "learning_rate": 9.149795969689684e-05, + "loss": 0.7319, + "step": 34495 + }, + { + "epoch": 0.9134405677228018, + "grad_norm": 0.8046875, + "learning_rate": 9.149337263438449e-05, + "loss": 0.8913, + "step": 34496 + }, + { + "epoch": 0.9134670473310962, + "grad_norm": 0.73046875, + "learning_rate": 9.148878558990144e-05, + "loss": 1.0285, + "step": 34497 + }, + { + "epoch": 0.9134935269393906, + "grad_norm": 0.7421875, + "learning_rate": 9.148419856345743e-05, + "loss": 0.811, + "step": 34498 + }, + { + "epoch": 0.913520006547685, + "grad_norm": 0.9296875, + "learning_rate": 9.147961155506216e-05, + "loss": 0.7756, + "step": 34499 + }, + { + "epoch": 0.9135464861559793, + "grad_norm": 0.87109375, + "learning_rate": 9.147502456472539e-05, + "loss": 0.8173, + "step": 34500 + }, + { + "epoch": 0.9135729657642737, + "grad_norm": 0.7578125, + "learning_rate": 9.147043759245682e-05, + "loss": 0.7302, + "step": 34501 + }, + { + "epoch": 0.9135994453725681, + "grad_norm": 0.80859375, + "learning_rate": 9.146585063826614e-05, + "loss": 0.7956, + "step": 34502 + }, + { + "epoch": 0.9136259249808625, + "grad_norm": 0.80078125, + "learning_rate": 9.146126370216312e-05, + "loss": 0.8818, + "step": 34503 + }, + { + "epoch": 0.9136524045891569, + "grad_norm": 0.734375, + "learning_rate": 9.145667678415747e-05, + "loss": 0.7899, + "step": 34504 + }, + { + "epoch": 0.9136788841974512, + "grad_norm": 0.81640625, + "learning_rate": 9.145208988425891e-05, + "loss": 0.7801, + "step": 34505 + }, + { + "epoch": 0.9137053638057456, + "grad_norm": 0.81640625, + "learning_rate": 9.144750300247715e-05, + "loss": 0.862, + "step": 34506 + }, + { + "epoch": 0.91373184341404, + "grad_norm": 0.78515625, + "learning_rate": 9.144291613882188e-05, + "loss": 0.7501, + "step": 34507 + }, + { + "epoch": 0.9137583230223344, + "grad_norm": 0.734375, + "learning_rate": 9.14383292933029e-05, + "loss": 0.7994, + "step": 34508 + }, + { + "epoch": 0.9137848026306287, + "grad_norm": 0.80859375, + "learning_rate": 9.143374246592989e-05, + "loss": 0.8182, + "step": 34509 + }, + { + "epoch": 0.913811282238923, + "grad_norm": 0.7109375, + "learning_rate": 9.142915565671257e-05, + "loss": 0.8392, + "step": 34510 + }, + { + "epoch": 0.9138377618472174, + "grad_norm": 0.75390625, + "learning_rate": 9.142456886566067e-05, + "loss": 0.7294, + "step": 34511 + }, + { + "epoch": 0.9138642414555118, + "grad_norm": 0.72265625, + "learning_rate": 9.141998209278386e-05, + "loss": 0.806, + "step": 34512 + }, + { + "epoch": 0.9138907210638062, + "grad_norm": 0.7734375, + "learning_rate": 9.141539533809195e-05, + "loss": 0.7727, + "step": 34513 + }, + { + "epoch": 0.9139172006721006, + "grad_norm": 0.8125, + "learning_rate": 9.141080860159464e-05, + "loss": 0.7173, + "step": 34514 + }, + { + "epoch": 0.913943680280395, + "grad_norm": 0.796875, + "learning_rate": 9.140622188330161e-05, + "loss": 0.8204, + "step": 34515 + }, + { + "epoch": 0.9139701598886893, + "grad_norm": 0.765625, + "learning_rate": 9.14016351832226e-05, + "loss": 0.7844, + "step": 34516 + }, + { + "epoch": 0.9139966394969837, + "grad_norm": 0.7109375, + "learning_rate": 9.139704850136729e-05, + "loss": 0.7687, + "step": 34517 + }, + { + "epoch": 0.9140231191052781, + "grad_norm": 0.73828125, + "learning_rate": 9.139246183774549e-05, + "loss": 0.7759, + "step": 34518 + }, + { + "epoch": 0.9140495987135725, + "grad_norm": 0.81640625, + "learning_rate": 9.138787519236687e-05, + "loss": 0.8672, + "step": 34519 + }, + { + "epoch": 0.9140760783218669, + "grad_norm": 0.73046875, + "learning_rate": 9.138328856524116e-05, + "loss": 0.7785, + "step": 34520 + }, + { + "epoch": 0.9141025579301613, + "grad_norm": 0.79296875, + "learning_rate": 9.137870195637809e-05, + "loss": 0.8307, + "step": 34521 + }, + { + "epoch": 0.9141290375384556, + "grad_norm": 0.765625, + "learning_rate": 9.137411536578734e-05, + "loss": 0.6681, + "step": 34522 + }, + { + "epoch": 0.91415551714675, + "grad_norm": 0.7578125, + "learning_rate": 9.136952879347868e-05, + "loss": 0.8198, + "step": 34523 + }, + { + "epoch": 0.9141819967550444, + "grad_norm": 0.75, + "learning_rate": 9.13649422394618e-05, + "loss": 0.8619, + "step": 34524 + }, + { + "epoch": 0.9142084763633388, + "grad_norm": 0.8125, + "learning_rate": 9.136035570374645e-05, + "loss": 0.7955, + "step": 34525 + }, + { + "epoch": 0.9142349559716331, + "grad_norm": 0.75390625, + "learning_rate": 9.135576918634231e-05, + "loss": 0.7599, + "step": 34526 + }, + { + "epoch": 0.9142614355799275, + "grad_norm": 0.76171875, + "learning_rate": 9.135118268725909e-05, + "loss": 0.7868, + "step": 34527 + }, + { + "epoch": 0.9142879151882218, + "grad_norm": 0.828125, + "learning_rate": 9.13465962065066e-05, + "loss": 0.72, + "step": 34528 + }, + { + "epoch": 0.9143143947965162, + "grad_norm": 0.76171875, + "learning_rate": 9.13420097440945e-05, + "loss": 0.7915, + "step": 34529 + }, + { + "epoch": 0.9143408744048106, + "grad_norm": 0.76171875, + "learning_rate": 9.13374233000325e-05, + "loss": 0.8681, + "step": 34530 + }, + { + "epoch": 0.914367354013105, + "grad_norm": 0.7265625, + "learning_rate": 9.133283687433035e-05, + "loss": 0.7675, + "step": 34531 + }, + { + "epoch": 0.9143938336213994, + "grad_norm": 0.81640625, + "learning_rate": 9.13282504669977e-05, + "loss": 0.8702, + "step": 34532 + }, + { + "epoch": 0.9144203132296937, + "grad_norm": 0.765625, + "learning_rate": 9.132366407804438e-05, + "loss": 0.6989, + "step": 34533 + }, + { + "epoch": 0.9144467928379881, + "grad_norm": 0.88671875, + "learning_rate": 9.131907770748006e-05, + "loss": 0.971, + "step": 34534 + }, + { + "epoch": 0.9144732724462825, + "grad_norm": 0.74609375, + "learning_rate": 9.131449135531445e-05, + "loss": 0.7079, + "step": 34535 + }, + { + "epoch": 0.9144997520545769, + "grad_norm": 0.78125, + "learning_rate": 9.130990502155729e-05, + "loss": 0.6942, + "step": 34536 + }, + { + "epoch": 0.9145262316628713, + "grad_norm": 0.83203125, + "learning_rate": 9.130531870621824e-05, + "loss": 0.7901, + "step": 34537 + }, + { + "epoch": 0.9145527112711657, + "grad_norm": 0.74609375, + "learning_rate": 9.130073240930712e-05, + "loss": 0.8573, + "step": 34538 + }, + { + "epoch": 0.91457919087946, + "grad_norm": 0.81640625, + "learning_rate": 9.129614613083359e-05, + "loss": 0.7235, + "step": 34539 + }, + { + "epoch": 0.9146056704877544, + "grad_norm": 0.85546875, + "learning_rate": 9.129155987080739e-05, + "loss": 0.8377, + "step": 34540 + }, + { + "epoch": 0.9146321500960488, + "grad_norm": 0.76171875, + "learning_rate": 9.128697362923822e-05, + "loss": 0.7017, + "step": 34541 + }, + { + "epoch": 0.9146586297043431, + "grad_norm": 0.79296875, + "learning_rate": 9.12823874061358e-05, + "loss": 0.8029, + "step": 34542 + }, + { + "epoch": 0.9146851093126375, + "grad_norm": 0.765625, + "learning_rate": 9.127780120150988e-05, + "loss": 0.7887, + "step": 34543 + }, + { + "epoch": 0.9147115889209319, + "grad_norm": 0.76171875, + "learning_rate": 9.127321501537018e-05, + "loss": 0.7656, + "step": 34544 + }, + { + "epoch": 0.9147380685292262, + "grad_norm": 0.8125, + "learning_rate": 9.126862884772638e-05, + "loss": 0.8211, + "step": 34545 + }, + { + "epoch": 0.9147645481375206, + "grad_norm": 0.75390625, + "learning_rate": 9.126404269858821e-05, + "loss": 0.8006, + "step": 34546 + }, + { + "epoch": 0.914791027745815, + "grad_norm": 0.7734375, + "learning_rate": 9.125945656796539e-05, + "loss": 0.8547, + "step": 34547 + }, + { + "epoch": 0.9148175073541094, + "grad_norm": 0.703125, + "learning_rate": 9.125487045586767e-05, + "loss": 0.8003, + "step": 34548 + }, + { + "epoch": 0.9148439869624038, + "grad_norm": 0.703125, + "learning_rate": 9.125028436230478e-05, + "loss": 0.7698, + "step": 34549 + }, + { + "epoch": 0.9148704665706981, + "grad_norm": 0.80859375, + "learning_rate": 9.124569828728639e-05, + "loss": 0.7967, + "step": 34550 + }, + { + "epoch": 0.9148969461789925, + "grad_norm": 0.80859375, + "learning_rate": 9.124111223082224e-05, + "loss": 0.8235, + "step": 34551 + }, + { + "epoch": 0.9149234257872869, + "grad_norm": 0.9296875, + "learning_rate": 9.123652619292202e-05, + "loss": 0.7796, + "step": 34552 + }, + { + "epoch": 0.9149499053955813, + "grad_norm": 0.75390625, + "learning_rate": 9.123194017359552e-05, + "loss": 0.7913, + "step": 34553 + }, + { + "epoch": 0.9149763850038757, + "grad_norm": 0.7734375, + "learning_rate": 9.122735417285242e-05, + "loss": 0.8125, + "step": 34554 + }, + { + "epoch": 0.9150028646121701, + "grad_norm": 0.79296875, + "learning_rate": 9.122276819070244e-05, + "loss": 0.7914, + "step": 34555 + }, + { + "epoch": 0.9150293442204644, + "grad_norm": 0.78125, + "learning_rate": 9.121818222715531e-05, + "loss": 0.7174, + "step": 34556 + }, + { + "epoch": 0.9150558238287588, + "grad_norm": 0.94140625, + "learning_rate": 9.12135962822207e-05, + "loss": 0.7865, + "step": 34557 + }, + { + "epoch": 0.9150823034370531, + "grad_norm": 0.8359375, + "learning_rate": 9.120901035590839e-05, + "loss": 0.9108, + "step": 34558 + }, + { + "epoch": 0.9151087830453475, + "grad_norm": 0.91796875, + "learning_rate": 9.120442444822811e-05, + "loss": 0.7954, + "step": 34559 + }, + { + "epoch": 0.9151352626536419, + "grad_norm": 0.83984375, + "learning_rate": 9.119983855918953e-05, + "loss": 0.8844, + "step": 34560 + }, + { + "epoch": 0.9151617422619363, + "grad_norm": 0.78515625, + "learning_rate": 9.11952526888024e-05, + "loss": 0.859, + "step": 34561 + }, + { + "epoch": 0.9151882218702306, + "grad_norm": 0.765625, + "learning_rate": 9.11906668370764e-05, + "loss": 0.7951, + "step": 34562 + }, + { + "epoch": 0.915214701478525, + "grad_norm": 0.82421875, + "learning_rate": 9.118608100402131e-05, + "loss": 0.7506, + "step": 34563 + }, + { + "epoch": 0.9152411810868194, + "grad_norm": 0.7890625, + "learning_rate": 9.118149518964681e-05, + "loss": 0.7615, + "step": 34564 + }, + { + "epoch": 0.9152676606951138, + "grad_norm": 0.80859375, + "learning_rate": 9.11769093939626e-05, + "loss": 0.8297, + "step": 34565 + }, + { + "epoch": 0.9152941403034082, + "grad_norm": 0.80078125, + "learning_rate": 9.117232361697844e-05, + "loss": 0.7695, + "step": 34566 + }, + { + "epoch": 0.9153206199117025, + "grad_norm": 0.80859375, + "learning_rate": 9.116773785870406e-05, + "loss": 0.8357, + "step": 34567 + }, + { + "epoch": 0.9153470995199969, + "grad_norm": 0.765625, + "learning_rate": 9.116315211914913e-05, + "loss": 0.7644, + "step": 34568 + }, + { + "epoch": 0.9153735791282913, + "grad_norm": 0.86328125, + "learning_rate": 9.11585663983234e-05, + "loss": 0.8292, + "step": 34569 + }, + { + "epoch": 0.9154000587365857, + "grad_norm": 0.71484375, + "learning_rate": 9.11539806962366e-05, + "loss": 0.7056, + "step": 34570 + }, + { + "epoch": 0.9154265383448801, + "grad_norm": 0.78515625, + "learning_rate": 9.114939501289841e-05, + "loss": 0.8853, + "step": 34571 + }, + { + "epoch": 0.9154530179531745, + "grad_norm": 0.88671875, + "learning_rate": 9.114480934831858e-05, + "loss": 0.8438, + "step": 34572 + }, + { + "epoch": 0.9154794975614688, + "grad_norm": 0.74609375, + "learning_rate": 9.11402237025068e-05, + "loss": 0.7578, + "step": 34573 + }, + { + "epoch": 0.9155059771697632, + "grad_norm": 0.78515625, + "learning_rate": 9.113563807547282e-05, + "loss": 0.7825, + "step": 34574 + }, + { + "epoch": 0.9155324567780575, + "grad_norm": 0.74609375, + "learning_rate": 9.113105246722637e-05, + "loss": 0.7351, + "step": 34575 + }, + { + "epoch": 0.9155589363863519, + "grad_norm": 0.7734375, + "learning_rate": 9.112646687777714e-05, + "loss": 0.8057, + "step": 34576 + }, + { + "epoch": 0.9155854159946463, + "grad_norm": 0.77734375, + "learning_rate": 9.112188130713485e-05, + "loss": 0.7687, + "step": 34577 + }, + { + "epoch": 0.9156118956029407, + "grad_norm": 0.9375, + "learning_rate": 9.111729575530919e-05, + "loss": 0.8393, + "step": 34578 + }, + { + "epoch": 0.915638375211235, + "grad_norm": 0.78125, + "learning_rate": 9.111271022230995e-05, + "loss": 0.8282, + "step": 34579 + }, + { + "epoch": 0.9156648548195294, + "grad_norm": 0.7265625, + "learning_rate": 9.110812470814683e-05, + "loss": 0.7011, + "step": 34580 + }, + { + "epoch": 0.9156913344278238, + "grad_norm": 0.78125, + "learning_rate": 9.110353921282951e-05, + "loss": 0.8101, + "step": 34581 + }, + { + "epoch": 0.9157178140361182, + "grad_norm": 0.74609375, + "learning_rate": 9.109895373636774e-05, + "loss": 0.7246, + "step": 34582 + }, + { + "epoch": 0.9157442936444126, + "grad_norm": 0.79296875, + "learning_rate": 9.10943682787712e-05, + "loss": 0.8001, + "step": 34583 + }, + { + "epoch": 0.915770773252707, + "grad_norm": 0.796875, + "learning_rate": 9.108978284004966e-05, + "loss": 0.82, + "step": 34584 + }, + { + "epoch": 0.9157972528610013, + "grad_norm": 0.828125, + "learning_rate": 9.108519742021281e-05, + "loss": 0.9098, + "step": 34585 + }, + { + "epoch": 0.9158237324692957, + "grad_norm": 0.734375, + "learning_rate": 9.108061201927039e-05, + "loss": 0.7448, + "step": 34586 + }, + { + "epoch": 0.9158502120775901, + "grad_norm": 0.72265625, + "learning_rate": 9.10760266372321e-05, + "loss": 0.7617, + "step": 34587 + }, + { + "epoch": 0.9158766916858845, + "grad_norm": 0.77734375, + "learning_rate": 9.107144127410764e-05, + "loss": 0.7587, + "step": 34588 + }, + { + "epoch": 0.9159031712941789, + "grad_norm": 0.86328125, + "learning_rate": 9.106685592990677e-05, + "loss": 0.7988, + "step": 34589 + }, + { + "epoch": 0.9159296509024732, + "grad_norm": 0.8046875, + "learning_rate": 9.106227060463919e-05, + "loss": 0.8165, + "step": 34590 + }, + { + "epoch": 0.9159561305107675, + "grad_norm": 0.73046875, + "learning_rate": 9.105768529831462e-05, + "loss": 0.8149, + "step": 34591 + }, + { + "epoch": 0.9159826101190619, + "grad_norm": 0.69921875, + "learning_rate": 9.105310001094276e-05, + "loss": 0.7492, + "step": 34592 + }, + { + "epoch": 0.9160090897273563, + "grad_norm": 0.7265625, + "learning_rate": 9.10485147425333e-05, + "loss": 0.7476, + "step": 34593 + }, + { + "epoch": 0.9160355693356507, + "grad_norm": 0.85546875, + "learning_rate": 9.104392949309607e-05, + "loss": 0.9355, + "step": 34594 + }, + { + "epoch": 0.916062048943945, + "grad_norm": 0.78515625, + "learning_rate": 9.10393442626407e-05, + "loss": 0.7556, + "step": 34595 + }, + { + "epoch": 0.9160885285522394, + "grad_norm": 0.79296875, + "learning_rate": 9.103475905117693e-05, + "loss": 0.8257, + "step": 34596 + }, + { + "epoch": 0.9161150081605338, + "grad_norm": 0.72265625, + "learning_rate": 9.103017385871448e-05, + "loss": 0.6644, + "step": 34597 + }, + { + "epoch": 0.9161414877688282, + "grad_norm": 0.76171875, + "learning_rate": 9.102558868526302e-05, + "loss": 0.8479, + "step": 34598 + }, + { + "epoch": 0.9161679673771226, + "grad_norm": 0.79296875, + "learning_rate": 9.102100353083236e-05, + "loss": 0.5966, + "step": 34599 + }, + { + "epoch": 0.916194446985417, + "grad_norm": 0.8203125, + "learning_rate": 9.101641839543216e-05, + "loss": 0.8193, + "step": 34600 + }, + { + "epoch": 0.9162209265937113, + "grad_norm": 0.76171875, + "learning_rate": 9.101183327907214e-05, + "loss": 0.861, + "step": 34601 + }, + { + "epoch": 0.9162474062020057, + "grad_norm": 0.7265625, + "learning_rate": 9.100724818176205e-05, + "loss": 0.7225, + "step": 34602 + }, + { + "epoch": 0.9162738858103001, + "grad_norm": 0.7890625, + "learning_rate": 9.100266310351152e-05, + "loss": 0.7401, + "step": 34603 + }, + { + "epoch": 0.9163003654185945, + "grad_norm": 0.75, + "learning_rate": 9.099807804433038e-05, + "loss": 0.7608, + "step": 34604 + }, + { + "epoch": 0.9163268450268889, + "grad_norm": 0.75, + "learning_rate": 9.09934930042283e-05, + "loss": 0.7255, + "step": 34605 + }, + { + "epoch": 0.9163533246351833, + "grad_norm": 0.8828125, + "learning_rate": 9.0988907983215e-05, + "loss": 0.8801, + "step": 34606 + }, + { + "epoch": 0.9163798042434775, + "grad_norm": 0.77734375, + "learning_rate": 9.09843229813002e-05, + "loss": 0.7649, + "step": 34607 + }, + { + "epoch": 0.9164062838517719, + "grad_norm": 0.8828125, + "learning_rate": 9.097973799849357e-05, + "loss": 0.857, + "step": 34608 + }, + { + "epoch": 0.9164327634600663, + "grad_norm": 0.78125, + "learning_rate": 9.097515303480491e-05, + "loss": 0.8684, + "step": 34609 + }, + { + "epoch": 0.9164592430683607, + "grad_norm": 0.796875, + "learning_rate": 9.097056809024389e-05, + "loss": 0.7084, + "step": 34610 + }, + { + "epoch": 0.9164857226766551, + "grad_norm": 0.80859375, + "learning_rate": 9.096598316482023e-05, + "loss": 0.8207, + "step": 34611 + }, + { + "epoch": 0.9165122022849495, + "grad_norm": 0.82421875, + "learning_rate": 9.096139825854366e-05, + "loss": 0.8284, + "step": 34612 + }, + { + "epoch": 0.9165386818932438, + "grad_norm": 0.78515625, + "learning_rate": 9.095681337142386e-05, + "loss": 0.809, + "step": 34613 + }, + { + "epoch": 0.9165651615015382, + "grad_norm": 0.75, + "learning_rate": 9.09522285034706e-05, + "loss": 0.6337, + "step": 34614 + }, + { + "epoch": 0.9165916411098326, + "grad_norm": 0.8046875, + "learning_rate": 9.094764365469357e-05, + "loss": 0.7539, + "step": 34615 + }, + { + "epoch": 0.916618120718127, + "grad_norm": 0.89453125, + "learning_rate": 9.09430588251025e-05, + "loss": 0.8479, + "step": 34616 + }, + { + "epoch": 0.9166446003264214, + "grad_norm": 0.73828125, + "learning_rate": 9.093847401470711e-05, + "loss": 0.7515, + "step": 34617 + }, + { + "epoch": 0.9166710799347157, + "grad_norm": 0.7421875, + "learning_rate": 9.093388922351705e-05, + "loss": 0.7964, + "step": 34618 + }, + { + "epoch": 0.9166975595430101, + "grad_norm": 0.671875, + "learning_rate": 9.092930445154214e-05, + "loss": 0.7988, + "step": 34619 + }, + { + "epoch": 0.9167240391513045, + "grad_norm": 0.86328125, + "learning_rate": 9.092471969879204e-05, + "loss": 0.9186, + "step": 34620 + }, + { + "epoch": 0.9167505187595989, + "grad_norm": 0.70703125, + "learning_rate": 9.092013496527651e-05, + "loss": 0.7425, + "step": 34621 + }, + { + "epoch": 0.9167769983678933, + "grad_norm": 0.7578125, + "learning_rate": 9.091555025100521e-05, + "loss": 0.8618, + "step": 34622 + }, + { + "epoch": 0.9168034779761877, + "grad_norm": 0.78515625, + "learning_rate": 9.091096555598783e-05, + "loss": 0.7029, + "step": 34623 + }, + { + "epoch": 0.9168299575844819, + "grad_norm": 0.8515625, + "learning_rate": 9.09063808802342e-05, + "loss": 0.7948, + "step": 34624 + }, + { + "epoch": 0.9168564371927763, + "grad_norm": 0.80078125, + "learning_rate": 9.090179622375398e-05, + "loss": 0.8505, + "step": 34625 + }, + { + "epoch": 0.9168829168010707, + "grad_norm": 0.7421875, + "learning_rate": 9.089721158655686e-05, + "loss": 0.7304, + "step": 34626 + }, + { + "epoch": 0.9169093964093651, + "grad_norm": 0.7578125, + "learning_rate": 9.089262696865261e-05, + "loss": 0.8145, + "step": 34627 + }, + { + "epoch": 0.9169358760176595, + "grad_norm": 0.80859375, + "learning_rate": 9.088804237005087e-05, + "loss": 0.7888, + "step": 34628 + }, + { + "epoch": 0.9169623556259539, + "grad_norm": 0.79296875, + "learning_rate": 9.088345779076144e-05, + "loss": 0.7913, + "step": 34629 + }, + { + "epoch": 0.9169888352342482, + "grad_norm": 0.76171875, + "learning_rate": 9.0878873230794e-05, + "loss": 0.8183, + "step": 34630 + }, + { + "epoch": 0.9170153148425426, + "grad_norm": 0.78515625, + "learning_rate": 9.087428869015824e-05, + "loss": 0.6891, + "step": 34631 + }, + { + "epoch": 0.917041794450837, + "grad_norm": 0.7265625, + "learning_rate": 9.086970416886391e-05, + "loss": 0.8453, + "step": 34632 + }, + { + "epoch": 0.9170682740591314, + "grad_norm": 0.83203125, + "learning_rate": 9.086511966692072e-05, + "loss": 0.8763, + "step": 34633 + }, + { + "epoch": 0.9170947536674258, + "grad_norm": 0.74609375, + "learning_rate": 9.08605351843384e-05, + "loss": 0.7748, + "step": 34634 + }, + { + "epoch": 0.9171212332757201, + "grad_norm": 0.71484375, + "learning_rate": 9.085595072112664e-05, + "loss": 0.7582, + "step": 34635 + }, + { + "epoch": 0.9171477128840145, + "grad_norm": 0.73828125, + "learning_rate": 9.08513662772952e-05, + "loss": 0.8062, + "step": 34636 + }, + { + "epoch": 0.9171741924923089, + "grad_norm": 0.78515625, + "learning_rate": 9.084678185285374e-05, + "loss": 0.7843, + "step": 34637 + }, + { + "epoch": 0.9172006721006033, + "grad_norm": 0.73828125, + "learning_rate": 9.084219744781197e-05, + "loss": 0.6886, + "step": 34638 + }, + { + "epoch": 0.9172271517088977, + "grad_norm": 0.83203125, + "learning_rate": 9.083761306217969e-05, + "loss": 0.8333, + "step": 34639 + }, + { + "epoch": 0.917253631317192, + "grad_norm": 0.734375, + "learning_rate": 9.083302869596656e-05, + "loss": 0.7607, + "step": 34640 + }, + { + "epoch": 0.9172801109254863, + "grad_norm": 0.7734375, + "learning_rate": 9.082844434918229e-05, + "loss": 0.8214, + "step": 34641 + }, + { + "epoch": 0.9173065905337807, + "grad_norm": 0.7734375, + "learning_rate": 9.082386002183663e-05, + "loss": 0.8331, + "step": 34642 + }, + { + "epoch": 0.9173330701420751, + "grad_norm": 0.765625, + "learning_rate": 9.081927571393921e-05, + "loss": 0.7969, + "step": 34643 + }, + { + "epoch": 0.9173595497503695, + "grad_norm": 0.76171875, + "learning_rate": 9.081469142549986e-05, + "loss": 0.8276, + "step": 34644 + }, + { + "epoch": 0.9173860293586639, + "grad_norm": 0.75, + "learning_rate": 9.081010715652825e-05, + "loss": 0.6867, + "step": 34645 + }, + { + "epoch": 0.9174125089669583, + "grad_norm": 0.89453125, + "learning_rate": 9.080552290703409e-05, + "loss": 0.809, + "step": 34646 + }, + { + "epoch": 0.9174389885752526, + "grad_norm": 0.7421875, + "learning_rate": 9.080093867702711e-05, + "loss": 0.7716, + "step": 34647 + }, + { + "epoch": 0.917465468183547, + "grad_norm": 0.734375, + "learning_rate": 9.079635446651697e-05, + "loss": 0.7912, + "step": 34648 + }, + { + "epoch": 0.9174919477918414, + "grad_norm": 0.8046875, + "learning_rate": 9.079177027551346e-05, + "loss": 0.8517, + "step": 34649 + }, + { + "epoch": 0.9175184274001358, + "grad_norm": 0.73828125, + "learning_rate": 9.078718610402628e-05, + "loss": 0.8471, + "step": 34650 + }, + { + "epoch": 0.9175449070084302, + "grad_norm": 0.796875, + "learning_rate": 9.078260195206511e-05, + "loss": 0.7757, + "step": 34651 + }, + { + "epoch": 0.9175713866167245, + "grad_norm": 0.82421875, + "learning_rate": 9.077801781963969e-05, + "loss": 0.759, + "step": 34652 + }, + { + "epoch": 0.9175978662250189, + "grad_norm": 0.81640625, + "learning_rate": 9.077343370675972e-05, + "loss": 0.8707, + "step": 34653 + }, + { + "epoch": 0.9176243458333133, + "grad_norm": 0.81640625, + "learning_rate": 9.076884961343497e-05, + "loss": 0.89, + "step": 34654 + }, + { + "epoch": 0.9176508254416077, + "grad_norm": 0.79296875, + "learning_rate": 9.07642655396751e-05, + "loss": 0.8459, + "step": 34655 + }, + { + "epoch": 0.917677305049902, + "grad_norm": 0.83984375, + "learning_rate": 9.075968148548984e-05, + "loss": 0.8293, + "step": 34656 + }, + { + "epoch": 0.9177037846581964, + "grad_norm": 0.73046875, + "learning_rate": 9.075509745088892e-05, + "loss": 0.7789, + "step": 34657 + }, + { + "epoch": 0.9177302642664907, + "grad_norm": 0.796875, + "learning_rate": 9.075051343588198e-05, + "loss": 0.7615, + "step": 34658 + }, + { + "epoch": 0.9177567438747851, + "grad_norm": 0.7109375, + "learning_rate": 9.074592944047884e-05, + "loss": 0.6987, + "step": 34659 + }, + { + "epoch": 0.9177832234830795, + "grad_norm": 0.80859375, + "learning_rate": 9.07413454646892e-05, + "loss": 0.7447, + "step": 34660 + }, + { + "epoch": 0.9178097030913739, + "grad_norm": 0.80078125, + "learning_rate": 9.073676150852272e-05, + "loss": 0.857, + "step": 34661 + }, + { + "epoch": 0.9178361826996683, + "grad_norm": 0.796875, + "learning_rate": 9.073217757198917e-05, + "loss": 0.8316, + "step": 34662 + }, + { + "epoch": 0.9178626623079627, + "grad_norm": 0.68359375, + "learning_rate": 9.072759365509818e-05, + "loss": 0.7405, + "step": 34663 + }, + { + "epoch": 0.917889141916257, + "grad_norm": 0.83984375, + "learning_rate": 9.072300975785958e-05, + "loss": 0.88, + "step": 34664 + }, + { + "epoch": 0.9179156215245514, + "grad_norm": 0.86328125, + "learning_rate": 9.071842588028302e-05, + "loss": 0.9168, + "step": 34665 + }, + { + "epoch": 0.9179421011328458, + "grad_norm": 0.765625, + "learning_rate": 9.071384202237824e-05, + "loss": 0.6854, + "step": 34666 + }, + { + "epoch": 0.9179685807411402, + "grad_norm": 0.71875, + "learning_rate": 9.070925818415492e-05, + "loss": 0.817, + "step": 34667 + }, + { + "epoch": 0.9179950603494346, + "grad_norm": 0.8125, + "learning_rate": 9.070467436562278e-05, + "loss": 0.8757, + "step": 34668 + }, + { + "epoch": 0.918021539957729, + "grad_norm": 0.80078125, + "learning_rate": 9.070009056679156e-05, + "loss": 0.7164, + "step": 34669 + }, + { + "epoch": 0.9180480195660233, + "grad_norm": 0.72265625, + "learning_rate": 9.069550678767099e-05, + "loss": 0.7223, + "step": 34670 + }, + { + "epoch": 0.9180744991743177, + "grad_norm": 0.765625, + "learning_rate": 9.069092302827076e-05, + "loss": 0.7825, + "step": 34671 + }, + { + "epoch": 0.9181009787826121, + "grad_norm": 0.85546875, + "learning_rate": 9.068633928860058e-05, + "loss": 0.8221, + "step": 34672 + }, + { + "epoch": 0.9181274583909064, + "grad_norm": 0.75390625, + "learning_rate": 9.068175556867015e-05, + "loss": 0.6925, + "step": 34673 + }, + { + "epoch": 0.9181539379992008, + "grad_norm": 0.74609375, + "learning_rate": 9.067717186848923e-05, + "loss": 0.7826, + "step": 34674 + }, + { + "epoch": 0.9181804176074951, + "grad_norm": 0.765625, + "learning_rate": 9.067258818806752e-05, + "loss": 0.785, + "step": 34675 + }, + { + "epoch": 0.9182068972157895, + "grad_norm": 0.71484375, + "learning_rate": 9.066800452741472e-05, + "loss": 0.772, + "step": 34676 + }, + { + "epoch": 0.9182333768240839, + "grad_norm": 0.78515625, + "learning_rate": 9.066342088654055e-05, + "loss": 0.7975, + "step": 34677 + }, + { + "epoch": 0.9182598564323783, + "grad_norm": 0.734375, + "learning_rate": 9.065883726545468e-05, + "loss": 0.6468, + "step": 34678 + }, + { + "epoch": 0.9182863360406727, + "grad_norm": 0.80859375, + "learning_rate": 9.06542536641669e-05, + "loss": 0.8079, + "step": 34679 + }, + { + "epoch": 0.918312815648967, + "grad_norm": 0.88671875, + "learning_rate": 9.064967008268692e-05, + "loss": 0.9184, + "step": 34680 + }, + { + "epoch": 0.9183392952572614, + "grad_norm": 0.734375, + "learning_rate": 9.064508652102442e-05, + "loss": 0.8228, + "step": 34681 + }, + { + "epoch": 0.9183657748655558, + "grad_norm": 0.74609375, + "learning_rate": 9.064050297918914e-05, + "loss": 0.6551, + "step": 34682 + }, + { + "epoch": 0.9183922544738502, + "grad_norm": 0.76171875, + "learning_rate": 9.063591945719071e-05, + "loss": 0.7582, + "step": 34683 + }, + { + "epoch": 0.9184187340821446, + "grad_norm": 0.73046875, + "learning_rate": 9.063133595503897e-05, + "loss": 0.6384, + "step": 34684 + }, + { + "epoch": 0.918445213690439, + "grad_norm": 0.765625, + "learning_rate": 9.062675247274358e-05, + "loss": 0.7845, + "step": 34685 + }, + { + "epoch": 0.9184716932987333, + "grad_norm": 0.8515625, + "learning_rate": 9.062216901031424e-05, + "loss": 0.7253, + "step": 34686 + }, + { + "epoch": 0.9184981729070277, + "grad_norm": 0.76171875, + "learning_rate": 9.061758556776069e-05, + "loss": 0.8972, + "step": 34687 + }, + { + "epoch": 0.9185246525153221, + "grad_norm": 0.7578125, + "learning_rate": 9.061300214509258e-05, + "loss": 0.7862, + "step": 34688 + }, + { + "epoch": 0.9185511321236164, + "grad_norm": 0.7578125, + "learning_rate": 9.060841874231972e-05, + "loss": 0.8276, + "step": 34689 + }, + { + "epoch": 0.9185776117319108, + "grad_norm": 0.84765625, + "learning_rate": 9.060383535945178e-05, + "loss": 0.8095, + "step": 34690 + }, + { + "epoch": 0.9186040913402052, + "grad_norm": 0.8671875, + "learning_rate": 9.059925199649847e-05, + "loss": 0.8606, + "step": 34691 + }, + { + "epoch": 0.9186305709484995, + "grad_norm": 0.79296875, + "learning_rate": 9.059466865346953e-05, + "loss": 0.8565, + "step": 34692 + }, + { + "epoch": 0.9186570505567939, + "grad_norm": 0.85546875, + "learning_rate": 9.059008533037461e-05, + "loss": 0.7678, + "step": 34693 + }, + { + "epoch": 0.9186835301650883, + "grad_norm": 0.72265625, + "learning_rate": 9.058550202722349e-05, + "loss": 0.7942, + "step": 34694 + }, + { + "epoch": 0.9187100097733827, + "grad_norm": 0.7578125, + "learning_rate": 9.058091874402586e-05, + "loss": 0.8788, + "step": 34695 + }, + { + "epoch": 0.9187364893816771, + "grad_norm": 0.78125, + "learning_rate": 9.057633548079144e-05, + "loss": 0.7717, + "step": 34696 + }, + { + "epoch": 0.9187629689899715, + "grad_norm": 0.83203125, + "learning_rate": 9.057175223752993e-05, + "loss": 0.7505, + "step": 34697 + }, + { + "epoch": 0.9187894485982658, + "grad_norm": 0.9296875, + "learning_rate": 9.056716901425102e-05, + "loss": 0.8552, + "step": 34698 + }, + { + "epoch": 0.9188159282065602, + "grad_norm": 0.7109375, + "learning_rate": 9.056258581096448e-05, + "loss": 0.8432, + "step": 34699 + }, + { + "epoch": 0.9188424078148546, + "grad_norm": 0.69921875, + "learning_rate": 9.055800262768001e-05, + "loss": 0.6567, + "step": 34700 + }, + { + "epoch": 0.918868887423149, + "grad_norm": 0.7890625, + "learning_rate": 9.055341946440733e-05, + "loss": 0.7208, + "step": 34701 + }, + { + "epoch": 0.9188953670314434, + "grad_norm": 0.7421875, + "learning_rate": 9.054883632115611e-05, + "loss": 0.8901, + "step": 34702 + }, + { + "epoch": 0.9189218466397377, + "grad_norm": 0.71484375, + "learning_rate": 9.054425319793607e-05, + "loss": 0.6446, + "step": 34703 + }, + { + "epoch": 0.9189483262480321, + "grad_norm": 0.7734375, + "learning_rate": 9.0539670094757e-05, + "loss": 0.8494, + "step": 34704 + }, + { + "epoch": 0.9189748058563264, + "grad_norm": 0.703125, + "learning_rate": 9.053508701162853e-05, + "loss": 0.8313, + "step": 34705 + }, + { + "epoch": 0.9190012854646208, + "grad_norm": 0.7421875, + "learning_rate": 9.05305039485604e-05, + "loss": 0.7158, + "step": 34706 + }, + { + "epoch": 0.9190277650729152, + "grad_norm": 0.8359375, + "learning_rate": 9.052592090556234e-05, + "loss": 0.8442, + "step": 34707 + }, + { + "epoch": 0.9190542446812096, + "grad_norm": 0.7890625, + "learning_rate": 9.052133788264405e-05, + "loss": 0.7564, + "step": 34708 + }, + { + "epoch": 0.9190807242895039, + "grad_norm": 0.7265625, + "learning_rate": 9.051675487981519e-05, + "loss": 0.7582, + "step": 34709 + }, + { + "epoch": 0.9191072038977983, + "grad_norm": 0.8125, + "learning_rate": 9.051217189708557e-05, + "loss": 0.7554, + "step": 34710 + }, + { + "epoch": 0.9191336835060927, + "grad_norm": 0.7421875, + "learning_rate": 9.050758893446487e-05, + "loss": 0.6294, + "step": 34711 + }, + { + "epoch": 0.9191601631143871, + "grad_norm": 0.7734375, + "learning_rate": 9.050300599196278e-05, + "loss": 0.8562, + "step": 34712 + }, + { + "epoch": 0.9191866427226815, + "grad_norm": 0.84375, + "learning_rate": 9.049842306958905e-05, + "loss": 0.7458, + "step": 34713 + }, + { + "epoch": 0.9192131223309759, + "grad_norm": 0.7890625, + "learning_rate": 9.049384016735332e-05, + "loss": 0.7989, + "step": 34714 + }, + { + "epoch": 0.9192396019392702, + "grad_norm": 0.6953125, + "learning_rate": 9.048925728526538e-05, + "loss": 0.647, + "step": 34715 + }, + { + "epoch": 0.9192660815475646, + "grad_norm": 0.9453125, + "learning_rate": 9.048467442333492e-05, + "loss": 0.8804, + "step": 34716 + }, + { + "epoch": 0.919292561155859, + "grad_norm": 0.78515625, + "learning_rate": 9.048009158157165e-05, + "loss": 0.7385, + "step": 34717 + }, + { + "epoch": 0.9193190407641534, + "grad_norm": 0.77734375, + "learning_rate": 9.047550875998526e-05, + "loss": 0.8028, + "step": 34718 + }, + { + "epoch": 0.9193455203724478, + "grad_norm": 0.80078125, + "learning_rate": 9.047092595858549e-05, + "loss": 0.8589, + "step": 34719 + }, + { + "epoch": 0.9193719999807421, + "grad_norm": 0.79296875, + "learning_rate": 9.046634317738206e-05, + "loss": 0.6464, + "step": 34720 + }, + { + "epoch": 0.9193984795890365, + "grad_norm": 0.77734375, + "learning_rate": 9.046176041638467e-05, + "loss": 0.8471, + "step": 34721 + }, + { + "epoch": 0.9194249591973308, + "grad_norm": 0.8046875, + "learning_rate": 9.045717767560304e-05, + "loss": 0.7235, + "step": 34722 + }, + { + "epoch": 0.9194514388056252, + "grad_norm": 0.734375, + "learning_rate": 9.045259495504687e-05, + "loss": 0.8442, + "step": 34723 + }, + { + "epoch": 0.9194779184139196, + "grad_norm": 0.75, + "learning_rate": 9.044801225472584e-05, + "loss": 0.7273, + "step": 34724 + }, + { + "epoch": 0.919504398022214, + "grad_norm": 0.71875, + "learning_rate": 9.044342957464974e-05, + "loss": 0.682, + "step": 34725 + }, + { + "epoch": 0.9195308776305083, + "grad_norm": 0.75390625, + "learning_rate": 9.043884691482824e-05, + "loss": 0.7482, + "step": 34726 + }, + { + "epoch": 0.9195573572388027, + "grad_norm": 0.78515625, + "learning_rate": 9.043426427527107e-05, + "loss": 0.7294, + "step": 34727 + }, + { + "epoch": 0.9195838368470971, + "grad_norm": 0.77734375, + "learning_rate": 9.042968165598793e-05, + "loss": 0.6909, + "step": 34728 + }, + { + "epoch": 0.9196103164553915, + "grad_norm": 0.65625, + "learning_rate": 9.04250990569885e-05, + "loss": 0.6381, + "step": 34729 + }, + { + "epoch": 0.9196367960636859, + "grad_norm": 0.85546875, + "learning_rate": 9.042051647828255e-05, + "loss": 0.7428, + "step": 34730 + }, + { + "epoch": 0.9196632756719803, + "grad_norm": 0.7734375, + "learning_rate": 9.041593391987977e-05, + "loss": 0.7139, + "step": 34731 + }, + { + "epoch": 0.9196897552802746, + "grad_norm": 0.68359375, + "learning_rate": 9.041135138178987e-05, + "loss": 0.6919, + "step": 34732 + }, + { + "epoch": 0.919716234888569, + "grad_norm": 0.79296875, + "learning_rate": 9.040676886402257e-05, + "loss": 0.8241, + "step": 34733 + }, + { + "epoch": 0.9197427144968634, + "grad_norm": 0.8046875, + "learning_rate": 9.040218636658755e-05, + "loss": 0.8773, + "step": 34734 + }, + { + "epoch": 0.9197691941051578, + "grad_norm": 0.796875, + "learning_rate": 9.039760388949456e-05, + "loss": 0.6723, + "step": 34735 + }, + { + "epoch": 0.9197956737134522, + "grad_norm": 0.71875, + "learning_rate": 9.039302143275329e-05, + "loss": 0.662, + "step": 34736 + }, + { + "epoch": 0.9198221533217465, + "grad_norm": 0.7421875, + "learning_rate": 9.038843899637348e-05, + "loss": 0.7754, + "step": 34737 + }, + { + "epoch": 0.9198486329300408, + "grad_norm": 0.7578125, + "learning_rate": 9.038385658036482e-05, + "loss": 0.7093, + "step": 34738 + }, + { + "epoch": 0.9198751125383352, + "grad_norm": 0.796875, + "learning_rate": 9.037927418473702e-05, + "loss": 0.7238, + "step": 34739 + }, + { + "epoch": 0.9199015921466296, + "grad_norm": 0.83984375, + "learning_rate": 9.03746918094998e-05, + "loss": 0.7673, + "step": 34740 + }, + { + "epoch": 0.919928071754924, + "grad_norm": 0.859375, + "learning_rate": 9.03701094546629e-05, + "loss": 0.9249, + "step": 34741 + }, + { + "epoch": 0.9199545513632184, + "grad_norm": 0.79296875, + "learning_rate": 9.036552712023598e-05, + "loss": 0.8788, + "step": 34742 + }, + { + "epoch": 0.9199810309715127, + "grad_norm": 0.77734375, + "learning_rate": 9.036094480622876e-05, + "loss": 0.7448, + "step": 34743 + }, + { + "epoch": 0.9200075105798071, + "grad_norm": 0.828125, + "learning_rate": 9.035636251265093e-05, + "loss": 0.7677, + "step": 34744 + }, + { + "epoch": 0.9200339901881015, + "grad_norm": 0.71484375, + "learning_rate": 9.035178023951231e-05, + "loss": 0.8119, + "step": 34745 + }, + { + "epoch": 0.9200604697963959, + "grad_norm": 0.73046875, + "learning_rate": 9.034719798682251e-05, + "loss": 0.9036, + "step": 34746 + }, + { + "epoch": 0.9200869494046903, + "grad_norm": 0.875, + "learning_rate": 9.034261575459129e-05, + "loss": 0.9467, + "step": 34747 + }, + { + "epoch": 0.9201134290129847, + "grad_norm": 0.81640625, + "learning_rate": 9.033803354282833e-05, + "loss": 0.8196, + "step": 34748 + }, + { + "epoch": 0.920139908621279, + "grad_norm": 0.71484375, + "learning_rate": 9.033345135154333e-05, + "loss": 0.7727, + "step": 34749 + }, + { + "epoch": 0.9201663882295734, + "grad_norm": 0.8125, + "learning_rate": 9.032886918074606e-05, + "loss": 0.8375, + "step": 34750 + }, + { + "epoch": 0.9201928678378678, + "grad_norm": 0.78515625, + "learning_rate": 9.03242870304462e-05, + "loss": 0.7195, + "step": 34751 + }, + { + "epoch": 0.9202193474461622, + "grad_norm": 0.7890625, + "learning_rate": 9.031970490065346e-05, + "loss": 0.7655, + "step": 34752 + }, + { + "epoch": 0.9202458270544566, + "grad_norm": 0.8046875, + "learning_rate": 9.031512279137755e-05, + "loss": 0.7689, + "step": 34753 + }, + { + "epoch": 0.920272306662751, + "grad_norm": 0.96484375, + "learning_rate": 9.031054070262816e-05, + "loss": 0.7952, + "step": 34754 + }, + { + "epoch": 0.9202987862710452, + "grad_norm": 0.7578125, + "learning_rate": 9.030595863441505e-05, + "loss": 0.728, + "step": 34755 + }, + { + "epoch": 0.9203252658793396, + "grad_norm": 0.75, + "learning_rate": 9.03013765867479e-05, + "loss": 0.7577, + "step": 34756 + }, + { + "epoch": 0.920351745487634, + "grad_norm": 0.765625, + "learning_rate": 9.029679455963643e-05, + "loss": 0.6966, + "step": 34757 + }, + { + "epoch": 0.9203782250959284, + "grad_norm": 0.76953125, + "learning_rate": 9.029221255309036e-05, + "loss": 0.7568, + "step": 34758 + }, + { + "epoch": 0.9204047047042228, + "grad_norm": 0.82421875, + "learning_rate": 9.028763056711936e-05, + "loss": 0.7966, + "step": 34759 + }, + { + "epoch": 0.9204311843125171, + "grad_norm": 0.76953125, + "learning_rate": 9.02830486017332e-05, + "loss": 0.8331, + "step": 34760 + }, + { + "epoch": 0.9204576639208115, + "grad_norm": 0.796875, + "learning_rate": 9.027846665694157e-05, + "loss": 0.8842, + "step": 34761 + }, + { + "epoch": 0.9204841435291059, + "grad_norm": 0.8046875, + "learning_rate": 9.027388473275416e-05, + "loss": 0.7588, + "step": 34762 + }, + { + "epoch": 0.9205106231374003, + "grad_norm": 0.765625, + "learning_rate": 9.02693028291807e-05, + "loss": 0.7655, + "step": 34763 + }, + { + "epoch": 0.9205371027456947, + "grad_norm": 0.80078125, + "learning_rate": 9.026472094623086e-05, + "loss": 0.8182, + "step": 34764 + }, + { + "epoch": 0.920563582353989, + "grad_norm": 0.7265625, + "learning_rate": 9.026013908391442e-05, + "loss": 0.7502, + "step": 34765 + }, + { + "epoch": 0.9205900619622834, + "grad_norm": 0.77734375, + "learning_rate": 9.025555724224106e-05, + "loss": 0.8038, + "step": 34766 + }, + { + "epoch": 0.9206165415705778, + "grad_norm": 0.77734375, + "learning_rate": 9.025097542122049e-05, + "loss": 0.8698, + "step": 34767 + }, + { + "epoch": 0.9206430211788722, + "grad_norm": 0.87109375, + "learning_rate": 9.024639362086243e-05, + "loss": 0.8111, + "step": 34768 + }, + { + "epoch": 0.9206695007871666, + "grad_norm": 0.80078125, + "learning_rate": 9.024181184117652e-05, + "loss": 0.78, + "step": 34769 + }, + { + "epoch": 0.920695980395461, + "grad_norm": 0.796875, + "learning_rate": 9.02372300821726e-05, + "loss": 0.8327, + "step": 34770 + }, + { + "epoch": 0.9207224600037552, + "grad_norm": 0.765625, + "learning_rate": 9.023264834386029e-05, + "loss": 0.7619, + "step": 34771 + }, + { + "epoch": 0.9207489396120496, + "grad_norm": 1.0859375, + "learning_rate": 9.022806662624934e-05, + "loss": 0.7801, + "step": 34772 + }, + { + "epoch": 0.920775419220344, + "grad_norm": 0.859375, + "learning_rate": 9.022348492934944e-05, + "loss": 0.7453, + "step": 34773 + }, + { + "epoch": 0.9208018988286384, + "grad_norm": 0.77734375, + "learning_rate": 9.021890325317025e-05, + "loss": 0.7752, + "step": 34774 + }, + { + "epoch": 0.9208283784369328, + "grad_norm": 0.83984375, + "learning_rate": 9.021432159772158e-05, + "loss": 0.7024, + "step": 34775 + }, + { + "epoch": 0.9208548580452272, + "grad_norm": 0.7421875, + "learning_rate": 9.020973996301312e-05, + "loss": 0.6178, + "step": 34776 + }, + { + "epoch": 0.9208813376535215, + "grad_norm": 0.75390625, + "learning_rate": 9.020515834905453e-05, + "loss": 0.7034, + "step": 34777 + }, + { + "epoch": 0.9209078172618159, + "grad_norm": 0.75390625, + "learning_rate": 9.020057675585556e-05, + "loss": 0.7149, + "step": 34778 + }, + { + "epoch": 0.9209342968701103, + "grad_norm": 0.765625, + "learning_rate": 9.019599518342589e-05, + "loss": 0.8838, + "step": 34779 + }, + { + "epoch": 0.9209607764784047, + "grad_norm": 0.78515625, + "learning_rate": 9.019141363177525e-05, + "loss": 0.7886, + "step": 34780 + }, + { + "epoch": 0.9209872560866991, + "grad_norm": 0.71875, + "learning_rate": 9.018683210091337e-05, + "loss": 0.8214, + "step": 34781 + }, + { + "epoch": 0.9210137356949935, + "grad_norm": 0.73828125, + "learning_rate": 9.018225059084992e-05, + "loss": 0.7799, + "step": 34782 + }, + { + "epoch": 0.9210402153032878, + "grad_norm": 0.77734375, + "learning_rate": 9.017766910159465e-05, + "loss": 0.8285, + "step": 34783 + }, + { + "epoch": 0.9210666949115822, + "grad_norm": 0.7421875, + "learning_rate": 9.017308763315718e-05, + "loss": 0.7407, + "step": 34784 + }, + { + "epoch": 0.9210931745198766, + "grad_norm": 0.68359375, + "learning_rate": 9.016850618554736e-05, + "loss": 0.7962, + "step": 34785 + }, + { + "epoch": 0.921119654128171, + "grad_norm": 0.828125, + "learning_rate": 9.016392475877482e-05, + "loss": 0.8762, + "step": 34786 + }, + { + "epoch": 0.9211461337364653, + "grad_norm": 1.09375, + "learning_rate": 9.015934335284926e-05, + "loss": 0.7466, + "step": 34787 + }, + { + "epoch": 0.9211726133447596, + "grad_norm": 0.77734375, + "learning_rate": 9.015476196778042e-05, + "loss": 0.7748, + "step": 34788 + }, + { + "epoch": 0.921199092953054, + "grad_norm": 0.93359375, + "learning_rate": 9.015018060357796e-05, + "loss": 0.8636, + "step": 34789 + }, + { + "epoch": 0.9212255725613484, + "grad_norm": 0.83203125, + "learning_rate": 9.014559926025168e-05, + "loss": 0.7923, + "step": 34790 + }, + { + "epoch": 0.9212520521696428, + "grad_norm": 0.734375, + "learning_rate": 9.014101793781122e-05, + "loss": 0.7795, + "step": 34791 + }, + { + "epoch": 0.9212785317779372, + "grad_norm": 0.8515625, + "learning_rate": 9.013643663626633e-05, + "loss": 0.7613, + "step": 34792 + }, + { + "epoch": 0.9213050113862316, + "grad_norm": 0.7578125, + "learning_rate": 9.01318553556267e-05, + "loss": 0.7822, + "step": 34793 + }, + { + "epoch": 0.9213314909945259, + "grad_norm": 0.78125, + "learning_rate": 9.012727409590197e-05, + "loss": 0.767, + "step": 34794 + }, + { + "epoch": 0.9213579706028203, + "grad_norm": 0.71875, + "learning_rate": 9.012269285710198e-05, + "loss": 0.7358, + "step": 34795 + }, + { + "epoch": 0.9213844502111147, + "grad_norm": 0.8125, + "learning_rate": 9.011811163923637e-05, + "loss": 0.8555, + "step": 34796 + }, + { + "epoch": 0.9214109298194091, + "grad_norm": 0.79296875, + "learning_rate": 9.011353044231486e-05, + "loss": 0.7666, + "step": 34797 + }, + { + "epoch": 0.9214374094277035, + "grad_norm": 0.80859375, + "learning_rate": 9.010894926634717e-05, + "loss": 0.7218, + "step": 34798 + }, + { + "epoch": 0.9214638890359979, + "grad_norm": 0.77734375, + "learning_rate": 9.010436811134295e-05, + "loss": 0.8592, + "step": 34799 + }, + { + "epoch": 0.9214903686442922, + "grad_norm": 0.71875, + "learning_rate": 9.009978697731199e-05, + "loss": 0.7896, + "step": 34800 + }, + { + "epoch": 0.9215168482525866, + "grad_norm": 0.8203125, + "learning_rate": 9.009520586426397e-05, + "loss": 0.8694, + "step": 34801 + }, + { + "epoch": 0.921543327860881, + "grad_norm": 0.80078125, + "learning_rate": 9.00906247722086e-05, + "loss": 0.8129, + "step": 34802 + }, + { + "epoch": 0.9215698074691754, + "grad_norm": 0.765625, + "learning_rate": 9.008604370115555e-05, + "loss": 0.8012, + "step": 34803 + }, + { + "epoch": 0.9215962870774697, + "grad_norm": 0.734375, + "learning_rate": 9.008146265111457e-05, + "loss": 0.6521, + "step": 34804 + }, + { + "epoch": 0.921622766685764, + "grad_norm": 0.71484375, + "learning_rate": 9.007688162209538e-05, + "loss": 0.679, + "step": 34805 + }, + { + "epoch": 0.9216492462940584, + "grad_norm": 0.75, + "learning_rate": 9.007230061410768e-05, + "loss": 0.7052, + "step": 34806 + }, + { + "epoch": 0.9216757259023528, + "grad_norm": 0.8125, + "learning_rate": 9.006771962716117e-05, + "loss": 0.8309, + "step": 34807 + }, + { + "epoch": 0.9217022055106472, + "grad_norm": 0.765625, + "learning_rate": 9.006313866126554e-05, + "loss": 0.867, + "step": 34808 + }, + { + "epoch": 0.9217286851189416, + "grad_norm": 0.7734375, + "learning_rate": 9.00585577164305e-05, + "loss": 0.7527, + "step": 34809 + }, + { + "epoch": 0.921755164727236, + "grad_norm": 0.72265625, + "learning_rate": 9.005397679266582e-05, + "loss": 0.7352, + "step": 34810 + }, + { + "epoch": 0.9217816443355303, + "grad_norm": 0.7421875, + "learning_rate": 9.004939588998116e-05, + "loss": 0.7055, + "step": 34811 + }, + { + "epoch": 0.9218081239438247, + "grad_norm": 0.70703125, + "learning_rate": 9.004481500838624e-05, + "loss": 0.694, + "step": 34812 + }, + { + "epoch": 0.9218346035521191, + "grad_norm": 0.81640625, + "learning_rate": 9.004023414789077e-05, + "loss": 0.6905, + "step": 34813 + }, + { + "epoch": 0.9218610831604135, + "grad_norm": 0.859375, + "learning_rate": 9.003565330850441e-05, + "loss": 0.9185, + "step": 34814 + }, + { + "epoch": 0.9218875627687079, + "grad_norm": 0.7265625, + "learning_rate": 9.003107249023695e-05, + "loss": 0.8328, + "step": 34815 + }, + { + "epoch": 0.9219140423770023, + "grad_norm": 0.76171875, + "learning_rate": 9.002649169309806e-05, + "loss": 0.8218, + "step": 34816 + }, + { + "epoch": 0.9219405219852966, + "grad_norm": 0.80859375, + "learning_rate": 9.002191091709746e-05, + "loss": 0.7828, + "step": 34817 + }, + { + "epoch": 0.921967001593591, + "grad_norm": 0.8515625, + "learning_rate": 9.001733016224484e-05, + "loss": 0.7512, + "step": 34818 + }, + { + "epoch": 0.9219934812018854, + "grad_norm": 0.73046875, + "learning_rate": 9.00127494285499e-05, + "loss": 0.7736, + "step": 34819 + }, + { + "epoch": 0.9220199608101797, + "grad_norm": 0.78125, + "learning_rate": 9.00081687160224e-05, + "loss": 0.8995, + "step": 34820 + }, + { + "epoch": 0.9220464404184741, + "grad_norm": 0.7734375, + "learning_rate": 9.0003588024672e-05, + "loss": 0.8792, + "step": 34821 + }, + { + "epoch": 0.9220729200267684, + "grad_norm": 0.68359375, + "learning_rate": 8.999900735450842e-05, + "loss": 0.7805, + "step": 34822 + }, + { + "epoch": 0.9220993996350628, + "grad_norm": 0.74609375, + "learning_rate": 8.999442670554136e-05, + "loss": 0.8894, + "step": 34823 + }, + { + "epoch": 0.9221258792433572, + "grad_norm": 0.765625, + "learning_rate": 8.998984607778054e-05, + "loss": 0.826, + "step": 34824 + }, + { + "epoch": 0.9221523588516516, + "grad_norm": 0.77734375, + "learning_rate": 8.99852654712357e-05, + "loss": 0.7679, + "step": 34825 + }, + { + "epoch": 0.922178838459946, + "grad_norm": 0.796875, + "learning_rate": 8.998068488591651e-05, + "loss": 1.0175, + "step": 34826 + }, + { + "epoch": 0.9222053180682404, + "grad_norm": 0.82421875, + "learning_rate": 8.997610432183269e-05, + "loss": 0.799, + "step": 34827 + }, + { + "epoch": 0.9222317976765347, + "grad_norm": 0.75390625, + "learning_rate": 8.997152377899394e-05, + "loss": 0.6676, + "step": 34828 + }, + { + "epoch": 0.9222582772848291, + "grad_norm": 0.8203125, + "learning_rate": 8.996694325740992e-05, + "loss": 0.7582, + "step": 34829 + }, + { + "epoch": 0.9222847568931235, + "grad_norm": 0.8671875, + "learning_rate": 8.996236275709044e-05, + "loss": 0.8602, + "step": 34830 + }, + { + "epoch": 0.9223112365014179, + "grad_norm": 0.828125, + "learning_rate": 8.995778227804516e-05, + "loss": 0.7935, + "step": 34831 + }, + { + "epoch": 0.9223377161097123, + "grad_norm": 0.83984375, + "learning_rate": 8.995320182028379e-05, + "loss": 0.8086, + "step": 34832 + }, + { + "epoch": 0.9223641957180067, + "grad_norm": 0.8359375, + "learning_rate": 8.994862138381604e-05, + "loss": 0.7856, + "step": 34833 + }, + { + "epoch": 0.922390675326301, + "grad_norm": 0.7734375, + "learning_rate": 8.994404096865154e-05, + "loss": 0.6813, + "step": 34834 + }, + { + "epoch": 0.9224171549345954, + "grad_norm": 0.796875, + "learning_rate": 8.993946057480013e-05, + "loss": 0.8336, + "step": 34835 + }, + { + "epoch": 0.9224436345428897, + "grad_norm": 0.88671875, + "learning_rate": 8.993488020227147e-05, + "loss": 0.7268, + "step": 34836 + }, + { + "epoch": 0.9224701141511841, + "grad_norm": 1.015625, + "learning_rate": 8.993029985107525e-05, + "loss": 0.8106, + "step": 34837 + }, + { + "epoch": 0.9224965937594785, + "grad_norm": 0.77734375, + "learning_rate": 8.992571952122119e-05, + "loss": 0.9092, + "step": 34838 + }, + { + "epoch": 0.9225230733677728, + "grad_norm": 0.71875, + "learning_rate": 8.992113921271896e-05, + "loss": 0.8633, + "step": 34839 + }, + { + "epoch": 0.9225495529760672, + "grad_norm": 0.7890625, + "learning_rate": 8.99165589255783e-05, + "loss": 0.8249, + "step": 34840 + }, + { + "epoch": 0.9225760325843616, + "grad_norm": 0.72265625, + "learning_rate": 8.991197865980894e-05, + "loss": 0.7883, + "step": 34841 + }, + { + "epoch": 0.922602512192656, + "grad_norm": 0.8046875, + "learning_rate": 8.990739841542056e-05, + "loss": 0.8524, + "step": 34842 + }, + { + "epoch": 0.9226289918009504, + "grad_norm": 0.7421875, + "learning_rate": 8.990281819242288e-05, + "loss": 0.703, + "step": 34843 + }, + { + "epoch": 0.9226554714092448, + "grad_norm": 0.8046875, + "learning_rate": 8.989823799082557e-05, + "loss": 0.8488, + "step": 34844 + }, + { + "epoch": 0.9226819510175391, + "grad_norm": 0.81640625, + "learning_rate": 8.98936578106384e-05, + "loss": 0.756, + "step": 34845 + }, + { + "epoch": 0.9227084306258335, + "grad_norm": 0.83203125, + "learning_rate": 8.988907765187104e-05, + "loss": 0.7471, + "step": 34846 + }, + { + "epoch": 0.9227349102341279, + "grad_norm": 0.796875, + "learning_rate": 8.98844975145332e-05, + "loss": 0.7788, + "step": 34847 + }, + { + "epoch": 0.9227613898424223, + "grad_norm": 0.7421875, + "learning_rate": 8.987991739863458e-05, + "loss": 0.6973, + "step": 34848 + }, + { + "epoch": 0.9227878694507167, + "grad_norm": 0.7734375, + "learning_rate": 8.987533730418487e-05, + "loss": 0.7953, + "step": 34849 + }, + { + "epoch": 0.922814349059011, + "grad_norm": 0.7578125, + "learning_rate": 8.987075723119384e-05, + "loss": 0.8552, + "step": 34850 + }, + { + "epoch": 0.9228408286673054, + "grad_norm": 0.734375, + "learning_rate": 8.986617717967116e-05, + "loss": 0.716, + "step": 34851 + }, + { + "epoch": 0.9228673082755998, + "grad_norm": 0.859375, + "learning_rate": 8.986159714962654e-05, + "loss": 0.7607, + "step": 34852 + }, + { + "epoch": 0.9228937878838941, + "grad_norm": 0.796875, + "learning_rate": 8.985701714106968e-05, + "loss": 0.7589, + "step": 34853 + }, + { + "epoch": 0.9229202674921885, + "grad_norm": 0.87109375, + "learning_rate": 8.98524371540103e-05, + "loss": 0.9253, + "step": 34854 + }, + { + "epoch": 0.9229467471004829, + "grad_norm": 0.8515625, + "learning_rate": 8.984785718845805e-05, + "loss": 0.8458, + "step": 34855 + }, + { + "epoch": 0.9229732267087772, + "grad_norm": 0.77734375, + "learning_rate": 8.984327724442275e-05, + "loss": 0.7396, + "step": 34856 + }, + { + "epoch": 0.9229997063170716, + "grad_norm": 0.859375, + "learning_rate": 8.983869732191401e-05, + "loss": 0.9039, + "step": 34857 + }, + { + "epoch": 0.923026185925366, + "grad_norm": 0.7578125, + "learning_rate": 8.983411742094159e-05, + "loss": 0.7363, + "step": 34858 + }, + { + "epoch": 0.9230526655336604, + "grad_norm": 3.078125, + "learning_rate": 8.982953754151518e-05, + "loss": 0.86, + "step": 34859 + }, + { + "epoch": 0.9230791451419548, + "grad_norm": 0.6484375, + "learning_rate": 8.982495768364442e-05, + "loss": 0.7122, + "step": 34860 + }, + { + "epoch": 0.9231056247502492, + "grad_norm": 0.77734375, + "learning_rate": 8.982037784733914e-05, + "loss": 0.7814, + "step": 34861 + }, + { + "epoch": 0.9231321043585435, + "grad_norm": 0.90234375, + "learning_rate": 8.981579803260898e-05, + "loss": 0.8379, + "step": 34862 + }, + { + "epoch": 0.9231585839668379, + "grad_norm": 0.8125, + "learning_rate": 8.981121823946366e-05, + "loss": 0.8841, + "step": 34863 + }, + { + "epoch": 0.9231850635751323, + "grad_norm": 0.7734375, + "learning_rate": 8.980663846791288e-05, + "loss": 0.84, + "step": 34864 + }, + { + "epoch": 0.9232115431834267, + "grad_norm": 0.76953125, + "learning_rate": 8.980205871796633e-05, + "loss": 0.798, + "step": 34865 + }, + { + "epoch": 0.9232380227917211, + "grad_norm": 0.7421875, + "learning_rate": 8.979747898963376e-05, + "loss": 0.9187, + "step": 34866 + }, + { + "epoch": 0.9232645024000155, + "grad_norm": 0.74609375, + "learning_rate": 8.979289928292483e-05, + "loss": 0.8793, + "step": 34867 + }, + { + "epoch": 0.9232909820083098, + "grad_norm": 0.74609375, + "learning_rate": 8.978831959784929e-05, + "loss": 0.8269, + "step": 34868 + }, + { + "epoch": 0.9233174616166041, + "grad_norm": 0.78125, + "learning_rate": 8.97837399344168e-05, + "loss": 0.9406, + "step": 34869 + }, + { + "epoch": 0.9233439412248985, + "grad_norm": 0.77734375, + "learning_rate": 8.977916029263707e-05, + "loss": 0.7323, + "step": 34870 + }, + { + "epoch": 0.9233704208331929, + "grad_norm": 0.84375, + "learning_rate": 8.977458067251984e-05, + "loss": 0.7311, + "step": 34871 + }, + { + "epoch": 0.9233969004414873, + "grad_norm": 0.71484375, + "learning_rate": 8.977000107407483e-05, + "loss": 0.7621, + "step": 34872 + }, + { + "epoch": 0.9234233800497816, + "grad_norm": 0.79296875, + "learning_rate": 8.97654214973117e-05, + "loss": 0.8018, + "step": 34873 + }, + { + "epoch": 0.923449859658076, + "grad_norm": 0.74609375, + "learning_rate": 8.976084194224018e-05, + "loss": 0.7804, + "step": 34874 + }, + { + "epoch": 0.9234763392663704, + "grad_norm": 0.75, + "learning_rate": 8.975626240886991e-05, + "loss": 0.7234, + "step": 34875 + }, + { + "epoch": 0.9235028188746648, + "grad_norm": 0.76953125, + "learning_rate": 8.975168289721073e-05, + "loss": 0.7445, + "step": 34876 + }, + { + "epoch": 0.9235292984829592, + "grad_norm": 0.75390625, + "learning_rate": 8.974710340727225e-05, + "loss": 0.8509, + "step": 34877 + }, + { + "epoch": 0.9235557780912536, + "grad_norm": 0.80078125, + "learning_rate": 8.974252393906422e-05, + "loss": 0.8644, + "step": 34878 + }, + { + "epoch": 0.9235822576995479, + "grad_norm": 0.7890625, + "learning_rate": 8.97379444925963e-05, + "loss": 0.7794, + "step": 34879 + }, + { + "epoch": 0.9236087373078423, + "grad_norm": 0.78515625, + "learning_rate": 8.973336506787819e-05, + "loss": 0.6756, + "step": 34880 + }, + { + "epoch": 0.9236352169161367, + "grad_norm": 0.71484375, + "learning_rate": 8.972878566491967e-05, + "loss": 0.6981, + "step": 34881 + }, + { + "epoch": 0.9236616965244311, + "grad_norm": 0.80859375, + "learning_rate": 8.97242062837304e-05, + "loss": 0.815, + "step": 34882 + }, + { + "epoch": 0.9236881761327255, + "grad_norm": 0.78125, + "learning_rate": 8.971962692432007e-05, + "loss": 0.8645, + "step": 34883 + }, + { + "epoch": 0.9237146557410199, + "grad_norm": 0.859375, + "learning_rate": 8.971504758669843e-05, + "loss": 0.9575, + "step": 34884 + }, + { + "epoch": 0.9237411353493141, + "grad_norm": 0.765625, + "learning_rate": 8.971046827087513e-05, + "loss": 0.6643, + "step": 34885 + }, + { + "epoch": 0.9237676149576085, + "grad_norm": 1.0, + "learning_rate": 8.970588897685992e-05, + "loss": 0.7289, + "step": 34886 + }, + { + "epoch": 0.9237940945659029, + "grad_norm": 0.68359375, + "learning_rate": 8.97013097046625e-05, + "loss": 0.5745, + "step": 34887 + }, + { + "epoch": 0.9238205741741973, + "grad_norm": 0.7890625, + "learning_rate": 8.969673045429254e-05, + "loss": 0.7549, + "step": 34888 + }, + { + "epoch": 0.9238470537824917, + "grad_norm": 0.8125, + "learning_rate": 8.969215122575977e-05, + "loss": 0.8056, + "step": 34889 + }, + { + "epoch": 0.923873533390786, + "grad_norm": 0.76171875, + "learning_rate": 8.96875720190739e-05, + "loss": 0.7991, + "step": 34890 + }, + { + "epoch": 0.9239000129990804, + "grad_norm": 0.77734375, + "learning_rate": 8.968299283424464e-05, + "loss": 0.8495, + "step": 34891 + }, + { + "epoch": 0.9239264926073748, + "grad_norm": 0.765625, + "learning_rate": 8.967841367128171e-05, + "loss": 0.754, + "step": 34892 + }, + { + "epoch": 0.9239529722156692, + "grad_norm": 0.7890625, + "learning_rate": 8.967383453019477e-05, + "loss": 0.7036, + "step": 34893 + }, + { + "epoch": 0.9239794518239636, + "grad_norm": 0.73046875, + "learning_rate": 8.966925541099354e-05, + "loss": 0.8455, + "step": 34894 + }, + { + "epoch": 0.924005931432258, + "grad_norm": 0.80078125, + "learning_rate": 8.966467631368771e-05, + "loss": 0.8121, + "step": 34895 + }, + { + "epoch": 0.9240324110405523, + "grad_norm": 0.7265625, + "learning_rate": 8.966009723828705e-05, + "loss": 0.8682, + "step": 34896 + }, + { + "epoch": 0.9240588906488467, + "grad_norm": 0.796875, + "learning_rate": 8.965551818480122e-05, + "loss": 0.7309, + "step": 34897 + }, + { + "epoch": 0.9240853702571411, + "grad_norm": 0.80859375, + "learning_rate": 8.965093915323991e-05, + "loss": 0.748, + "step": 34898 + }, + { + "epoch": 0.9241118498654355, + "grad_norm": 0.7578125, + "learning_rate": 8.964636014361286e-05, + "loss": 0.7259, + "step": 34899 + }, + { + "epoch": 0.9241383294737299, + "grad_norm": 0.81640625, + "learning_rate": 8.964178115592971e-05, + "loss": 0.8757, + "step": 34900 + }, + { + "epoch": 0.9241648090820243, + "grad_norm": 0.734375, + "learning_rate": 8.963720219020026e-05, + "loss": 0.7742, + "step": 34901 + }, + { + "epoch": 0.9241912886903185, + "grad_norm": 0.734375, + "learning_rate": 8.963262324643417e-05, + "loss": 0.7835, + "step": 34902 + }, + { + "epoch": 0.9242177682986129, + "grad_norm": 0.75390625, + "learning_rate": 8.962804432464114e-05, + "loss": 0.7418, + "step": 34903 + }, + { + "epoch": 0.9242442479069073, + "grad_norm": 0.7578125, + "learning_rate": 8.962346542483087e-05, + "loss": 0.7717, + "step": 34904 + }, + { + "epoch": 0.9242707275152017, + "grad_norm": 0.76171875, + "learning_rate": 8.961888654701306e-05, + "loss": 0.818, + "step": 34905 + }, + { + "epoch": 0.9242972071234961, + "grad_norm": 0.7265625, + "learning_rate": 8.961430769119744e-05, + "loss": 0.7806, + "step": 34906 + }, + { + "epoch": 0.9243236867317904, + "grad_norm": 0.84765625, + "learning_rate": 8.960972885739369e-05, + "loss": 0.7654, + "step": 34907 + }, + { + "epoch": 0.9243501663400848, + "grad_norm": 0.75, + "learning_rate": 8.960515004561154e-05, + "loss": 0.8321, + "step": 34908 + }, + { + "epoch": 0.9243766459483792, + "grad_norm": 0.86328125, + "learning_rate": 8.960057125586067e-05, + "loss": 0.8881, + "step": 34909 + }, + { + "epoch": 0.9244031255566736, + "grad_norm": 0.77734375, + "learning_rate": 8.95959924881508e-05, + "loss": 0.7191, + "step": 34910 + }, + { + "epoch": 0.924429605164968, + "grad_norm": 0.8125, + "learning_rate": 8.959141374249162e-05, + "loss": 0.7742, + "step": 34911 + }, + { + "epoch": 0.9244560847732624, + "grad_norm": 0.7890625, + "learning_rate": 8.958683501889286e-05, + "loss": 0.794, + "step": 34912 + }, + { + "epoch": 0.9244825643815567, + "grad_norm": 0.78515625, + "learning_rate": 8.958225631736421e-05, + "loss": 0.7603, + "step": 34913 + }, + { + "epoch": 0.9245090439898511, + "grad_norm": 0.79296875, + "learning_rate": 8.957767763791537e-05, + "loss": 0.8675, + "step": 34914 + }, + { + "epoch": 0.9245355235981455, + "grad_norm": 0.7109375, + "learning_rate": 8.9573098980556e-05, + "loss": 0.8208, + "step": 34915 + }, + { + "epoch": 0.9245620032064399, + "grad_norm": 0.80078125, + "learning_rate": 8.95685203452959e-05, + "loss": 0.7845, + "step": 34916 + }, + { + "epoch": 0.9245884828147343, + "grad_norm": 0.84375, + "learning_rate": 8.956394173214472e-05, + "loss": 0.8903, + "step": 34917 + }, + { + "epoch": 0.9246149624230285, + "grad_norm": 0.8046875, + "learning_rate": 8.955936314111218e-05, + "loss": 0.7856, + "step": 34918 + }, + { + "epoch": 0.9246414420313229, + "grad_norm": 0.71875, + "learning_rate": 8.955478457220796e-05, + "loss": 0.739, + "step": 34919 + }, + { + "epoch": 0.9246679216396173, + "grad_norm": 0.78515625, + "learning_rate": 8.955020602544172e-05, + "loss": 0.8305, + "step": 34920 + }, + { + "epoch": 0.9246944012479117, + "grad_norm": 0.890625, + "learning_rate": 8.954562750082329e-05, + "loss": 0.9689, + "step": 34921 + }, + { + "epoch": 0.9247208808562061, + "grad_norm": 0.78515625, + "learning_rate": 8.95410489983623e-05, + "loss": 0.8027, + "step": 34922 + }, + { + "epoch": 0.9247473604645005, + "grad_norm": 0.8046875, + "learning_rate": 8.953647051806845e-05, + "loss": 0.8225, + "step": 34923 + }, + { + "epoch": 0.9247738400727948, + "grad_norm": 0.734375, + "learning_rate": 8.953189205995147e-05, + "loss": 0.8446, + "step": 34924 + }, + { + "epoch": 0.9248003196810892, + "grad_norm": 0.76171875, + "learning_rate": 8.9527313624021e-05, + "loss": 0.7823, + "step": 34925 + }, + { + "epoch": 0.9248267992893836, + "grad_norm": 0.7734375, + "learning_rate": 8.952273521028682e-05, + "loss": 0.8235, + "step": 34926 + }, + { + "epoch": 0.924853278897678, + "grad_norm": 0.828125, + "learning_rate": 8.95181568187586e-05, + "loss": 0.7967, + "step": 34927 + }, + { + "epoch": 0.9248797585059724, + "grad_norm": 0.796875, + "learning_rate": 8.951357844944605e-05, + "loss": 0.8747, + "step": 34928 + }, + { + "epoch": 0.9249062381142668, + "grad_norm": 0.7734375, + "learning_rate": 8.950900010235889e-05, + "loss": 0.6616, + "step": 34929 + }, + { + "epoch": 0.9249327177225611, + "grad_norm": 0.72265625, + "learning_rate": 8.950442177750676e-05, + "loss": 0.7551, + "step": 34930 + }, + { + "epoch": 0.9249591973308555, + "grad_norm": 0.734375, + "learning_rate": 8.949984347489944e-05, + "loss": 0.8427, + "step": 34931 + }, + { + "epoch": 0.9249856769391499, + "grad_norm": 0.6875, + "learning_rate": 8.94952651945466e-05, + "loss": 0.7542, + "step": 34932 + }, + { + "epoch": 0.9250121565474443, + "grad_norm": 0.76953125, + "learning_rate": 8.949068693645795e-05, + "loss": 0.7341, + "step": 34933 + }, + { + "epoch": 0.9250386361557386, + "grad_norm": 0.8515625, + "learning_rate": 8.948610870064319e-05, + "loss": 0.7917, + "step": 34934 + }, + { + "epoch": 0.9250651157640329, + "grad_norm": 0.79296875, + "learning_rate": 8.948153048711198e-05, + "loss": 0.793, + "step": 34935 + }, + { + "epoch": 0.9250915953723273, + "grad_norm": 0.80859375, + "learning_rate": 8.94769522958741e-05, + "loss": 0.7795, + "step": 34936 + }, + { + "epoch": 0.9251180749806217, + "grad_norm": 0.75390625, + "learning_rate": 8.947237412693921e-05, + "loss": 0.7801, + "step": 34937 + }, + { + "epoch": 0.9251445545889161, + "grad_norm": 0.80078125, + "learning_rate": 8.946779598031703e-05, + "loss": 0.7572, + "step": 34938 + }, + { + "epoch": 0.9251710341972105, + "grad_norm": 0.7890625, + "learning_rate": 8.946321785601725e-05, + "loss": 0.6653, + "step": 34939 + }, + { + "epoch": 0.9251975138055049, + "grad_norm": 0.734375, + "learning_rate": 8.945863975404955e-05, + "loss": 0.7781, + "step": 34940 + }, + { + "epoch": 0.9252239934137992, + "grad_norm": 0.8671875, + "learning_rate": 8.94540616744237e-05, + "loss": 0.6919, + "step": 34941 + }, + { + "epoch": 0.9252504730220936, + "grad_norm": 0.76171875, + "learning_rate": 8.944948361714936e-05, + "loss": 0.7681, + "step": 34942 + }, + { + "epoch": 0.925276952630388, + "grad_norm": 0.82421875, + "learning_rate": 8.944490558223623e-05, + "loss": 0.7517, + "step": 34943 + }, + { + "epoch": 0.9253034322386824, + "grad_norm": 0.77734375, + "learning_rate": 8.944032756969401e-05, + "loss": 0.8899, + "step": 34944 + }, + { + "epoch": 0.9253299118469768, + "grad_norm": 0.77734375, + "learning_rate": 8.94357495795324e-05, + "loss": 0.865, + "step": 34945 + }, + { + "epoch": 0.9253563914552712, + "grad_norm": 0.796875, + "learning_rate": 8.943117161176113e-05, + "loss": 0.8369, + "step": 34946 + }, + { + "epoch": 0.9253828710635655, + "grad_norm": 0.7734375, + "learning_rate": 8.942659366638989e-05, + "loss": 0.7524, + "step": 34947 + }, + { + "epoch": 0.9254093506718599, + "grad_norm": 0.8046875, + "learning_rate": 8.942201574342838e-05, + "loss": 0.8079, + "step": 34948 + }, + { + "epoch": 0.9254358302801543, + "grad_norm": 0.78515625, + "learning_rate": 8.941743784288632e-05, + "loss": 0.7206, + "step": 34949 + }, + { + "epoch": 0.9254623098884487, + "grad_norm": 0.765625, + "learning_rate": 8.941285996477336e-05, + "loss": 0.8855, + "step": 34950 + }, + { + "epoch": 0.925488789496743, + "grad_norm": 0.82421875, + "learning_rate": 8.940828210909927e-05, + "loss": 0.7276, + "step": 34951 + }, + { + "epoch": 0.9255152691050373, + "grad_norm": 0.82421875, + "learning_rate": 8.940370427587373e-05, + "loss": 0.7193, + "step": 34952 + }, + { + "epoch": 0.9255417487133317, + "grad_norm": 0.79296875, + "learning_rate": 8.939912646510642e-05, + "loss": 0.7828, + "step": 34953 + }, + { + "epoch": 0.9255682283216261, + "grad_norm": 0.76171875, + "learning_rate": 8.939454867680705e-05, + "loss": 0.8321, + "step": 34954 + }, + { + "epoch": 0.9255947079299205, + "grad_norm": 0.703125, + "learning_rate": 8.938997091098527e-05, + "loss": 0.7833, + "step": 34955 + }, + { + "epoch": 0.9256211875382149, + "grad_norm": 0.77734375, + "learning_rate": 8.938539316765091e-05, + "loss": 0.7393, + "step": 34956 + }, + { + "epoch": 0.9256476671465093, + "grad_norm": 0.73046875, + "learning_rate": 8.93808154468136e-05, + "loss": 0.8035, + "step": 34957 + }, + { + "epoch": 0.9256741467548036, + "grad_norm": 0.83203125, + "learning_rate": 8.937623774848304e-05, + "loss": 0.821, + "step": 34958 + }, + { + "epoch": 0.925700626363098, + "grad_norm": 0.79296875, + "learning_rate": 8.937166007266892e-05, + "loss": 0.8113, + "step": 34959 + }, + { + "epoch": 0.9257271059713924, + "grad_norm": 0.77734375, + "learning_rate": 8.936708241938094e-05, + "loss": 0.7998, + "step": 34960 + }, + { + "epoch": 0.9257535855796868, + "grad_norm": 0.78515625, + "learning_rate": 8.936250478862887e-05, + "loss": 0.6753, + "step": 34961 + }, + { + "epoch": 0.9257800651879812, + "grad_norm": 0.8671875, + "learning_rate": 8.935792718042234e-05, + "loss": 0.814, + "step": 34962 + }, + { + "epoch": 0.9258065447962756, + "grad_norm": 0.77734375, + "learning_rate": 8.935334959477108e-05, + "loss": 0.9401, + "step": 34963 + }, + { + "epoch": 0.9258330244045699, + "grad_norm": 0.77734375, + "learning_rate": 8.93487720316848e-05, + "loss": 0.7117, + "step": 34964 + }, + { + "epoch": 0.9258595040128643, + "grad_norm": 0.71484375, + "learning_rate": 8.934419449117314e-05, + "loss": 0.7169, + "step": 34965 + }, + { + "epoch": 0.9258859836211587, + "grad_norm": 0.71484375, + "learning_rate": 8.933961697324588e-05, + "loss": 0.7343, + "step": 34966 + }, + { + "epoch": 0.925912463229453, + "grad_norm": 0.78125, + "learning_rate": 8.93350394779127e-05, + "loss": 0.8064, + "step": 34967 + }, + { + "epoch": 0.9259389428377474, + "grad_norm": 0.78125, + "learning_rate": 8.933046200518331e-05, + "loss": 0.8782, + "step": 34968 + }, + { + "epoch": 0.9259654224460417, + "grad_norm": 0.6875, + "learning_rate": 8.932588455506738e-05, + "loss": 0.7362, + "step": 34969 + }, + { + "epoch": 0.9259919020543361, + "grad_norm": 0.828125, + "learning_rate": 8.93213071275746e-05, + "loss": 0.7198, + "step": 34970 + }, + { + "epoch": 0.9260183816626305, + "grad_norm": 0.77734375, + "learning_rate": 8.931672972271475e-05, + "loss": 0.8426, + "step": 34971 + }, + { + "epoch": 0.9260448612709249, + "grad_norm": 0.83203125, + "learning_rate": 8.931215234049745e-05, + "loss": 0.8489, + "step": 34972 + }, + { + "epoch": 0.9260713408792193, + "grad_norm": 0.83203125, + "learning_rate": 8.930757498093244e-05, + "loss": 0.7726, + "step": 34973 + }, + { + "epoch": 0.9260978204875137, + "grad_norm": 0.8125, + "learning_rate": 8.93029976440294e-05, + "loss": 0.8225, + "step": 34974 + }, + { + "epoch": 0.926124300095808, + "grad_norm": 0.72265625, + "learning_rate": 8.929842032979804e-05, + "loss": 0.9152, + "step": 34975 + }, + { + "epoch": 0.9261507797041024, + "grad_norm": 1.0234375, + "learning_rate": 8.92938430382481e-05, + "loss": 0.8253, + "step": 34976 + }, + { + "epoch": 0.9261772593123968, + "grad_norm": 0.76953125, + "learning_rate": 8.928926576938924e-05, + "loss": 0.7249, + "step": 34977 + }, + { + "epoch": 0.9262037389206912, + "grad_norm": 0.78515625, + "learning_rate": 8.928468852323115e-05, + "loss": 0.7107, + "step": 34978 + }, + { + "epoch": 0.9262302185289856, + "grad_norm": 0.7734375, + "learning_rate": 8.928011129978357e-05, + "loss": 0.7771, + "step": 34979 + }, + { + "epoch": 0.92625669813728, + "grad_norm": 0.765625, + "learning_rate": 8.927553409905613e-05, + "loss": 0.8562, + "step": 34980 + }, + { + "epoch": 0.9262831777455743, + "grad_norm": 0.7265625, + "learning_rate": 8.927095692105863e-05, + "loss": 0.7069, + "step": 34981 + }, + { + "epoch": 0.9263096573538687, + "grad_norm": 0.74609375, + "learning_rate": 8.926637976580072e-05, + "loss": 0.8663, + "step": 34982 + }, + { + "epoch": 0.926336136962163, + "grad_norm": 0.796875, + "learning_rate": 8.926180263329212e-05, + "loss": 0.8252, + "step": 34983 + }, + { + "epoch": 0.9263626165704574, + "grad_norm": 0.73046875, + "learning_rate": 8.92572255235425e-05, + "loss": 0.7971, + "step": 34984 + }, + { + "epoch": 0.9263890961787518, + "grad_norm": 0.75390625, + "learning_rate": 8.925264843656153e-05, + "loss": 0.6756, + "step": 34985 + }, + { + "epoch": 0.9264155757870461, + "grad_norm": 0.86328125, + "learning_rate": 8.924807137235902e-05, + "loss": 0.8019, + "step": 34986 + }, + { + "epoch": 0.9264420553953405, + "grad_norm": 0.87109375, + "learning_rate": 8.924349433094459e-05, + "loss": 0.877, + "step": 34987 + }, + { + "epoch": 0.9264685350036349, + "grad_norm": 1.484375, + "learning_rate": 8.923891731232796e-05, + "loss": 0.7116, + "step": 34988 + }, + { + "epoch": 0.9264950146119293, + "grad_norm": 0.7578125, + "learning_rate": 8.923434031651883e-05, + "loss": 0.7098, + "step": 34989 + }, + { + "epoch": 0.9265214942202237, + "grad_norm": 0.66015625, + "learning_rate": 8.922976334352688e-05, + "loss": 0.6848, + "step": 34990 + }, + { + "epoch": 0.9265479738285181, + "grad_norm": 0.81640625, + "learning_rate": 8.922518639336186e-05, + "loss": 0.7496, + "step": 34991 + }, + { + "epoch": 0.9265744534368124, + "grad_norm": 0.84375, + "learning_rate": 8.922060946603343e-05, + "loss": 0.7526, + "step": 34992 + }, + { + "epoch": 0.9266009330451068, + "grad_norm": 0.76953125, + "learning_rate": 8.92160325615513e-05, + "loss": 0.8034, + "step": 34993 + }, + { + "epoch": 0.9266274126534012, + "grad_norm": 0.81640625, + "learning_rate": 8.921145567992517e-05, + "loss": 0.8831, + "step": 34994 + }, + { + "epoch": 0.9266538922616956, + "grad_norm": 0.796875, + "learning_rate": 8.920687882116477e-05, + "loss": 0.716, + "step": 34995 + }, + { + "epoch": 0.92668037186999, + "grad_norm": 0.796875, + "learning_rate": 8.920230198527972e-05, + "loss": 0.8092, + "step": 34996 + }, + { + "epoch": 0.9267068514782844, + "grad_norm": 0.78125, + "learning_rate": 8.919772517227982e-05, + "loss": 0.7097, + "step": 34997 + }, + { + "epoch": 0.9267333310865787, + "grad_norm": 0.84375, + "learning_rate": 8.919314838217472e-05, + "loss": 0.8158, + "step": 34998 + }, + { + "epoch": 0.9267598106948731, + "grad_norm": 0.73828125, + "learning_rate": 8.918857161497412e-05, + "loss": 0.7256, + "step": 34999 + }, + { + "epoch": 0.9267862903031674, + "grad_norm": 0.7421875, + "learning_rate": 8.918399487068772e-05, + "loss": 0.7456, + "step": 35000 + }, + { + "epoch": 0.9267862903031674, + "eval_loss": 0.7882528305053711, + "eval_runtime": 252.9514, + "eval_samples_per_second": 39.533, + "eval_steps_per_second": 0.826, + "step": 35000 + }, + { + "epoch": 0.9268127699114618, + "grad_norm": 0.78515625, + "learning_rate": 8.917941814932519e-05, + "loss": 0.8181, + "step": 35001 + }, + { + "epoch": 0.9268392495197562, + "grad_norm": 0.81640625, + "learning_rate": 8.917484145089629e-05, + "loss": 0.8445, + "step": 35002 + }, + { + "epoch": 0.9268657291280505, + "grad_norm": 0.765625, + "learning_rate": 8.917026477541072e-05, + "loss": 0.7937, + "step": 35003 + }, + { + "epoch": 0.9268922087363449, + "grad_norm": 0.78515625, + "learning_rate": 8.916568812287814e-05, + "loss": 0.7859, + "step": 35004 + }, + { + "epoch": 0.9269186883446393, + "grad_norm": 0.7421875, + "learning_rate": 8.916111149330827e-05, + "loss": 0.7546, + "step": 35005 + }, + { + "epoch": 0.9269451679529337, + "grad_norm": 0.828125, + "learning_rate": 8.915653488671077e-05, + "loss": 0.9477, + "step": 35006 + }, + { + "epoch": 0.9269716475612281, + "grad_norm": 0.8046875, + "learning_rate": 8.91519583030954e-05, + "loss": 0.6848, + "step": 35007 + }, + { + "epoch": 0.9269981271695225, + "grad_norm": 0.828125, + "learning_rate": 8.914738174247184e-05, + "loss": 0.7627, + "step": 35008 + }, + { + "epoch": 0.9270246067778168, + "grad_norm": 0.78125, + "learning_rate": 8.914280520484979e-05, + "loss": 0.7443, + "step": 35009 + }, + { + "epoch": 0.9270510863861112, + "grad_norm": 0.89453125, + "learning_rate": 8.913822869023893e-05, + "loss": 0.8584, + "step": 35010 + }, + { + "epoch": 0.9270775659944056, + "grad_norm": 0.765625, + "learning_rate": 8.913365219864894e-05, + "loss": 0.8282, + "step": 35011 + }, + { + "epoch": 0.9271040456027, + "grad_norm": 0.74609375, + "learning_rate": 8.91290757300896e-05, + "loss": 0.7495, + "step": 35012 + }, + { + "epoch": 0.9271305252109944, + "grad_norm": 0.8125, + "learning_rate": 8.912449928457057e-05, + "loss": 0.7597, + "step": 35013 + }, + { + "epoch": 0.9271570048192888, + "grad_norm": 0.85546875, + "learning_rate": 8.911992286210153e-05, + "loss": 0.8581, + "step": 35014 + }, + { + "epoch": 0.9271834844275831, + "grad_norm": 0.76171875, + "learning_rate": 8.911534646269218e-05, + "loss": 0.8291, + "step": 35015 + }, + { + "epoch": 0.9272099640358774, + "grad_norm": 0.75390625, + "learning_rate": 8.911077008635222e-05, + "loss": 0.7828, + "step": 35016 + }, + { + "epoch": 0.9272364436441718, + "grad_norm": 0.84375, + "learning_rate": 8.910619373309139e-05, + "loss": 0.8714, + "step": 35017 + }, + { + "epoch": 0.9272629232524662, + "grad_norm": 0.8515625, + "learning_rate": 8.910161740291936e-05, + "loss": 0.7823, + "step": 35018 + }, + { + "epoch": 0.9272894028607606, + "grad_norm": 0.828125, + "learning_rate": 8.909704109584582e-05, + "loss": 0.6801, + "step": 35019 + }, + { + "epoch": 0.9273158824690549, + "grad_norm": 0.77734375, + "learning_rate": 8.909246481188048e-05, + "loss": 0.8952, + "step": 35020 + }, + { + "epoch": 0.9273423620773493, + "grad_norm": 0.78515625, + "learning_rate": 8.9087888551033e-05, + "loss": 0.8124, + "step": 35021 + }, + { + "epoch": 0.9273688416856437, + "grad_norm": 0.76171875, + "learning_rate": 8.908331231331315e-05, + "loss": 0.7473, + "step": 35022 + }, + { + "epoch": 0.9273953212939381, + "grad_norm": 0.859375, + "learning_rate": 8.907873609873061e-05, + "loss": 0.7249, + "step": 35023 + }, + { + "epoch": 0.9274218009022325, + "grad_norm": 0.77734375, + "learning_rate": 8.907415990729506e-05, + "loss": 0.7476, + "step": 35024 + }, + { + "epoch": 0.9274482805105269, + "grad_norm": 0.78515625, + "learning_rate": 8.906958373901619e-05, + "loss": 0.8355, + "step": 35025 + }, + { + "epoch": 0.9274747601188212, + "grad_norm": 0.7734375, + "learning_rate": 8.906500759390368e-05, + "loss": 0.8982, + "step": 35026 + }, + { + "epoch": 0.9275012397271156, + "grad_norm": 0.73046875, + "learning_rate": 8.90604314719673e-05, + "loss": 0.7375, + "step": 35027 + }, + { + "epoch": 0.92752771933541, + "grad_norm": 0.78125, + "learning_rate": 8.905585537321671e-05, + "loss": 0.7527, + "step": 35028 + }, + { + "epoch": 0.9275541989437044, + "grad_norm": 0.890625, + "learning_rate": 8.905127929766162e-05, + "loss": 0.7397, + "step": 35029 + }, + { + "epoch": 0.9275806785519988, + "grad_norm": 0.78515625, + "learning_rate": 8.90467032453117e-05, + "loss": 0.737, + "step": 35030 + }, + { + "epoch": 0.9276071581602932, + "grad_norm": 0.80078125, + "learning_rate": 8.904212721617664e-05, + "loss": 0.924, + "step": 35031 + }, + { + "epoch": 0.9276336377685874, + "grad_norm": 0.77734375, + "learning_rate": 8.90375512102662e-05, + "loss": 0.7979, + "step": 35032 + }, + { + "epoch": 0.9276601173768818, + "grad_norm": 0.75, + "learning_rate": 8.903297522759005e-05, + "loss": 0.6898, + "step": 35033 + }, + { + "epoch": 0.9276865969851762, + "grad_norm": 0.73046875, + "learning_rate": 8.902839926815787e-05, + "loss": 0.617, + "step": 35034 + }, + { + "epoch": 0.9277130765934706, + "grad_norm": 0.8046875, + "learning_rate": 8.902382333197938e-05, + "loss": 0.9083, + "step": 35035 + }, + { + "epoch": 0.927739556201765, + "grad_norm": 0.8203125, + "learning_rate": 8.901924741906425e-05, + "loss": 0.8214, + "step": 35036 + }, + { + "epoch": 0.9277660358100593, + "grad_norm": 0.8203125, + "learning_rate": 8.901467152942221e-05, + "loss": 0.7215, + "step": 35037 + }, + { + "epoch": 0.9277925154183537, + "grad_norm": 0.80859375, + "learning_rate": 8.901009566306295e-05, + "loss": 0.7997, + "step": 35038 + }, + { + "epoch": 0.9278189950266481, + "grad_norm": 0.7265625, + "learning_rate": 8.900551981999616e-05, + "loss": 0.7635, + "step": 35039 + }, + { + "epoch": 0.9278454746349425, + "grad_norm": 0.72265625, + "learning_rate": 8.900094400023155e-05, + "loss": 0.7572, + "step": 35040 + }, + { + "epoch": 0.9278719542432369, + "grad_norm": 0.8515625, + "learning_rate": 8.899636820377876e-05, + "loss": 0.784, + "step": 35041 + }, + { + "epoch": 0.9278984338515313, + "grad_norm": 0.7578125, + "learning_rate": 8.899179243064759e-05, + "loss": 0.863, + "step": 35042 + }, + { + "epoch": 0.9279249134598256, + "grad_norm": 0.796875, + "learning_rate": 8.898721668084768e-05, + "loss": 0.8093, + "step": 35043 + }, + { + "epoch": 0.92795139306812, + "grad_norm": 0.96875, + "learning_rate": 8.898264095438874e-05, + "loss": 0.8484, + "step": 35044 + }, + { + "epoch": 0.9279778726764144, + "grad_norm": 0.7890625, + "learning_rate": 8.897806525128045e-05, + "loss": 0.8408, + "step": 35045 + }, + { + "epoch": 0.9280043522847088, + "grad_norm": 0.72265625, + "learning_rate": 8.89734895715325e-05, + "loss": 0.7135, + "step": 35046 + }, + { + "epoch": 0.9280308318930032, + "grad_norm": 0.78125, + "learning_rate": 8.896891391515463e-05, + "loss": 0.7796, + "step": 35047 + }, + { + "epoch": 0.9280573115012976, + "grad_norm": 0.75390625, + "learning_rate": 8.896433828215653e-05, + "loss": 0.7668, + "step": 35048 + }, + { + "epoch": 0.9280837911095918, + "grad_norm": 0.7578125, + "learning_rate": 8.895976267254788e-05, + "loss": 0.7944, + "step": 35049 + }, + { + "epoch": 0.9281102707178862, + "grad_norm": 0.75, + "learning_rate": 8.895518708633838e-05, + "loss": 0.8075, + "step": 35050 + }, + { + "epoch": 0.9281367503261806, + "grad_norm": 0.79296875, + "learning_rate": 8.895061152353768e-05, + "loss": 0.7363, + "step": 35051 + }, + { + "epoch": 0.928163229934475, + "grad_norm": 0.70703125, + "learning_rate": 8.894603598415558e-05, + "loss": 0.8274, + "step": 35052 + }, + { + "epoch": 0.9281897095427694, + "grad_norm": 0.796875, + "learning_rate": 8.894146046820174e-05, + "loss": 0.7818, + "step": 35053 + }, + { + "epoch": 0.9282161891510637, + "grad_norm": 0.7734375, + "learning_rate": 8.893688497568582e-05, + "loss": 0.7784, + "step": 35054 + }, + { + "epoch": 0.9282426687593581, + "grad_norm": 0.74609375, + "learning_rate": 8.893230950661755e-05, + "loss": 0.836, + "step": 35055 + }, + { + "epoch": 0.9282691483676525, + "grad_norm": 0.796875, + "learning_rate": 8.892773406100658e-05, + "loss": 0.8144, + "step": 35056 + }, + { + "epoch": 0.9282956279759469, + "grad_norm": 0.7890625, + "learning_rate": 8.892315863886268e-05, + "loss": 0.771, + "step": 35057 + }, + { + "epoch": 0.9283221075842413, + "grad_norm": 0.81640625, + "learning_rate": 8.891858324019552e-05, + "loss": 0.8606, + "step": 35058 + }, + { + "epoch": 0.9283485871925357, + "grad_norm": 0.83984375, + "learning_rate": 8.891400786501476e-05, + "loss": 0.8162, + "step": 35059 + }, + { + "epoch": 0.92837506680083, + "grad_norm": 0.76171875, + "learning_rate": 8.890943251333014e-05, + "loss": 0.8104, + "step": 35060 + }, + { + "epoch": 0.9284015464091244, + "grad_norm": 0.76953125, + "learning_rate": 8.890485718515132e-05, + "loss": 0.803, + "step": 35061 + }, + { + "epoch": 0.9284280260174188, + "grad_norm": 0.7734375, + "learning_rate": 8.890028188048806e-05, + "loss": 0.7644, + "step": 35062 + }, + { + "epoch": 0.9284545056257132, + "grad_norm": 0.78125, + "learning_rate": 8.889570659935002e-05, + "loss": 0.7302, + "step": 35063 + }, + { + "epoch": 0.9284809852340076, + "grad_norm": 0.80859375, + "learning_rate": 8.889113134174689e-05, + "loss": 0.8063, + "step": 35064 + }, + { + "epoch": 0.9285074648423018, + "grad_norm": 0.8359375, + "learning_rate": 8.888655610768836e-05, + "loss": 0.7728, + "step": 35065 + }, + { + "epoch": 0.9285339444505962, + "grad_norm": 0.81640625, + "learning_rate": 8.88819808971841e-05, + "loss": 0.8521, + "step": 35066 + }, + { + "epoch": 0.9285604240588906, + "grad_norm": 0.87890625, + "learning_rate": 8.88774057102439e-05, + "loss": 0.7422, + "step": 35067 + }, + { + "epoch": 0.928586903667185, + "grad_norm": 0.75, + "learning_rate": 8.887283054687738e-05, + "loss": 0.7258, + "step": 35068 + }, + { + "epoch": 0.9286133832754794, + "grad_norm": 0.984375, + "learning_rate": 8.886825540709428e-05, + "loss": 0.858, + "step": 35069 + }, + { + "epoch": 0.9286398628837738, + "grad_norm": 0.79296875, + "learning_rate": 8.886368029090428e-05, + "loss": 0.8671, + "step": 35070 + }, + { + "epoch": 0.9286663424920681, + "grad_norm": 0.79296875, + "learning_rate": 8.8859105198317e-05, + "loss": 0.8131, + "step": 35071 + }, + { + "epoch": 0.9286928221003625, + "grad_norm": 0.76953125, + "learning_rate": 8.885453012934228e-05, + "loss": 0.6768, + "step": 35072 + }, + { + "epoch": 0.9287193017086569, + "grad_norm": 0.796875, + "learning_rate": 8.884995508398975e-05, + "loss": 0.9119, + "step": 35073 + }, + { + "epoch": 0.9287457813169513, + "grad_norm": 0.8203125, + "learning_rate": 8.884538006226909e-05, + "loss": 0.7747, + "step": 35074 + }, + { + "epoch": 0.9287722609252457, + "grad_norm": 0.74609375, + "learning_rate": 8.884080506419002e-05, + "loss": 0.6689, + "step": 35075 + }, + { + "epoch": 0.9287987405335401, + "grad_norm": 0.74609375, + "learning_rate": 8.883623008976219e-05, + "loss": 0.7972, + "step": 35076 + }, + { + "epoch": 0.9288252201418344, + "grad_norm": 0.79296875, + "learning_rate": 8.883165513899536e-05, + "loss": 0.8735, + "step": 35077 + }, + { + "epoch": 0.9288516997501288, + "grad_norm": 0.82421875, + "learning_rate": 8.882708021189917e-05, + "loss": 0.7392, + "step": 35078 + }, + { + "epoch": 0.9288781793584232, + "grad_norm": 0.81640625, + "learning_rate": 8.882250530848338e-05, + "loss": 0.8702, + "step": 35079 + }, + { + "epoch": 0.9289046589667176, + "grad_norm": 0.75, + "learning_rate": 8.881793042875764e-05, + "loss": 0.7885, + "step": 35080 + }, + { + "epoch": 0.9289311385750119, + "grad_norm": 0.828125, + "learning_rate": 8.881335557273165e-05, + "loss": 0.9156, + "step": 35081 + }, + { + "epoch": 0.9289576181833062, + "grad_norm": 0.8125, + "learning_rate": 8.880878074041512e-05, + "loss": 0.8429, + "step": 35082 + }, + { + "epoch": 0.9289840977916006, + "grad_norm": 0.796875, + "learning_rate": 8.880420593181775e-05, + "loss": 0.8381, + "step": 35083 + }, + { + "epoch": 0.929010577399895, + "grad_norm": 0.75390625, + "learning_rate": 8.879963114694923e-05, + "loss": 0.7421, + "step": 35084 + }, + { + "epoch": 0.9290370570081894, + "grad_norm": 0.8515625, + "learning_rate": 8.879505638581922e-05, + "loss": 0.7964, + "step": 35085 + }, + { + "epoch": 0.9290635366164838, + "grad_norm": 0.78125, + "learning_rate": 8.879048164843744e-05, + "loss": 0.7835, + "step": 35086 + }, + { + "epoch": 0.9290900162247782, + "grad_norm": 0.72265625, + "learning_rate": 8.878590693481362e-05, + "loss": 0.6858, + "step": 35087 + }, + { + "epoch": 0.9291164958330725, + "grad_norm": 0.76171875, + "learning_rate": 8.878133224495743e-05, + "loss": 0.8152, + "step": 35088 + }, + { + "epoch": 0.9291429754413669, + "grad_norm": 0.6875, + "learning_rate": 8.877675757887858e-05, + "loss": 0.7781, + "step": 35089 + }, + { + "epoch": 0.9291694550496613, + "grad_norm": 0.76171875, + "learning_rate": 8.877218293658673e-05, + "loss": 0.7748, + "step": 35090 + }, + { + "epoch": 0.9291959346579557, + "grad_norm": 0.75390625, + "learning_rate": 8.876760831809156e-05, + "loss": 0.8203, + "step": 35091 + }, + { + "epoch": 0.9292224142662501, + "grad_norm": 0.78515625, + "learning_rate": 8.876303372340286e-05, + "loss": 0.8042, + "step": 35092 + }, + { + "epoch": 0.9292488938745445, + "grad_norm": 0.82421875, + "learning_rate": 8.875845915253025e-05, + "loss": 0.7569, + "step": 35093 + }, + { + "epoch": 0.9292753734828388, + "grad_norm": 0.80078125, + "learning_rate": 8.875388460548345e-05, + "loss": 0.8347, + "step": 35094 + }, + { + "epoch": 0.9293018530911332, + "grad_norm": 0.8671875, + "learning_rate": 8.874931008227214e-05, + "loss": 0.8721, + "step": 35095 + }, + { + "epoch": 0.9293283326994276, + "grad_norm": 0.72265625, + "learning_rate": 8.874473558290602e-05, + "loss": 0.7601, + "step": 35096 + }, + { + "epoch": 0.929354812307722, + "grad_norm": 0.80859375, + "learning_rate": 8.874016110739477e-05, + "loss": 0.8251, + "step": 35097 + }, + { + "epoch": 0.9293812919160163, + "grad_norm": 0.76953125, + "learning_rate": 8.873558665574813e-05, + "loss": 0.7878, + "step": 35098 + }, + { + "epoch": 0.9294077715243106, + "grad_norm": 0.83203125, + "learning_rate": 8.873101222797578e-05, + "loss": 0.8295, + "step": 35099 + }, + { + "epoch": 0.929434251132605, + "grad_norm": 0.77734375, + "learning_rate": 8.87264378240874e-05, + "loss": 0.8483, + "step": 35100 + }, + { + "epoch": 0.9294607307408994, + "grad_norm": 0.90625, + "learning_rate": 8.872186344409268e-05, + "loss": 0.8379, + "step": 35101 + }, + { + "epoch": 0.9294872103491938, + "grad_norm": 0.73828125, + "learning_rate": 8.871728908800134e-05, + "loss": 0.7371, + "step": 35102 + }, + { + "epoch": 0.9295136899574882, + "grad_norm": 0.76171875, + "learning_rate": 8.871271475582305e-05, + "loss": 0.7158, + "step": 35103 + }, + { + "epoch": 0.9295401695657826, + "grad_norm": 0.80078125, + "learning_rate": 8.870814044756754e-05, + "loss": 0.8548, + "step": 35104 + }, + { + "epoch": 0.9295666491740769, + "grad_norm": 0.8125, + "learning_rate": 8.870356616324447e-05, + "loss": 0.7514, + "step": 35105 + }, + { + "epoch": 0.9295931287823713, + "grad_norm": 0.7890625, + "learning_rate": 8.86989919028635e-05, + "loss": 0.7399, + "step": 35106 + }, + { + "epoch": 0.9296196083906657, + "grad_norm": 0.80078125, + "learning_rate": 8.86944176664344e-05, + "loss": 0.6481, + "step": 35107 + }, + { + "epoch": 0.9296460879989601, + "grad_norm": 0.78125, + "learning_rate": 8.868984345396684e-05, + "loss": 0.8339, + "step": 35108 + }, + { + "epoch": 0.9296725676072545, + "grad_norm": 0.765625, + "learning_rate": 8.868526926547053e-05, + "loss": 0.7862, + "step": 35109 + }, + { + "epoch": 0.9296990472155489, + "grad_norm": 0.80859375, + "learning_rate": 8.868069510095513e-05, + "loss": 0.8588, + "step": 35110 + }, + { + "epoch": 0.9297255268238432, + "grad_norm": 0.734375, + "learning_rate": 8.867612096043031e-05, + "loss": 0.7332, + "step": 35111 + }, + { + "epoch": 0.9297520064321376, + "grad_norm": 0.80078125, + "learning_rate": 8.867154684390586e-05, + "loss": 0.9167, + "step": 35112 + }, + { + "epoch": 0.929778486040432, + "grad_norm": 0.8984375, + "learning_rate": 8.86669727513914e-05, + "loss": 0.799, + "step": 35113 + }, + { + "epoch": 0.9298049656487263, + "grad_norm": 0.76171875, + "learning_rate": 8.866239868289666e-05, + "loss": 0.6258, + "step": 35114 + }, + { + "epoch": 0.9298314452570207, + "grad_norm": 0.75390625, + "learning_rate": 8.86578246384313e-05, + "loss": 0.7397, + "step": 35115 + }, + { + "epoch": 0.929857924865315, + "grad_norm": 0.8359375, + "learning_rate": 8.865325061800499e-05, + "loss": 0.9208, + "step": 35116 + }, + { + "epoch": 0.9298844044736094, + "grad_norm": 0.828125, + "learning_rate": 8.864867662162752e-05, + "loss": 0.8417, + "step": 35117 + }, + { + "epoch": 0.9299108840819038, + "grad_norm": 0.81640625, + "learning_rate": 8.864410264930854e-05, + "loss": 0.7791, + "step": 35118 + }, + { + "epoch": 0.9299373636901982, + "grad_norm": 0.71484375, + "learning_rate": 8.863952870105772e-05, + "loss": 0.9516, + "step": 35119 + }, + { + "epoch": 0.9299638432984926, + "grad_norm": 0.765625, + "learning_rate": 8.863495477688477e-05, + "loss": 0.6605, + "step": 35120 + }, + { + "epoch": 0.929990322906787, + "grad_norm": 0.90625, + "learning_rate": 8.863038087679936e-05, + "loss": 0.8321, + "step": 35121 + }, + { + "epoch": 0.9300168025150813, + "grad_norm": 0.8046875, + "learning_rate": 8.862580700081123e-05, + "loss": 0.8554, + "step": 35122 + }, + { + "epoch": 0.9300432821233757, + "grad_norm": 0.74609375, + "learning_rate": 8.862123314893007e-05, + "loss": 0.7519, + "step": 35123 + }, + { + "epoch": 0.9300697617316701, + "grad_norm": 0.84375, + "learning_rate": 8.861665932116554e-05, + "loss": 0.8216, + "step": 35124 + }, + { + "epoch": 0.9300962413399645, + "grad_norm": 0.75390625, + "learning_rate": 8.861208551752736e-05, + "loss": 0.8471, + "step": 35125 + }, + { + "epoch": 0.9301227209482589, + "grad_norm": 0.82421875, + "learning_rate": 8.860751173802515e-05, + "loss": 0.7924, + "step": 35126 + }, + { + "epoch": 0.9301492005565533, + "grad_norm": 0.76953125, + "learning_rate": 8.860293798266873e-05, + "loss": 0.7692, + "step": 35127 + }, + { + "epoch": 0.9301756801648476, + "grad_norm": 0.83203125, + "learning_rate": 8.859836425146773e-05, + "loss": 0.8298, + "step": 35128 + }, + { + "epoch": 0.930202159773142, + "grad_norm": 0.71484375, + "learning_rate": 8.859379054443185e-05, + "loss": 0.7072, + "step": 35129 + }, + { + "epoch": 0.9302286393814363, + "grad_norm": 0.734375, + "learning_rate": 8.858921686157076e-05, + "loss": 0.8342, + "step": 35130 + }, + { + "epoch": 0.9302551189897307, + "grad_norm": 0.70703125, + "learning_rate": 8.858464320289414e-05, + "loss": 0.7468, + "step": 35131 + }, + { + "epoch": 0.9302815985980251, + "grad_norm": 0.8359375, + "learning_rate": 8.858006956841176e-05, + "loss": 0.8586, + "step": 35132 + }, + { + "epoch": 0.9303080782063194, + "grad_norm": 0.76171875, + "learning_rate": 8.857549595813327e-05, + "loss": 0.7276, + "step": 35133 + }, + { + "epoch": 0.9303345578146138, + "grad_norm": 0.8125, + "learning_rate": 8.857092237206836e-05, + "loss": 0.8244, + "step": 35134 + }, + { + "epoch": 0.9303610374229082, + "grad_norm": 0.77734375, + "learning_rate": 8.856634881022673e-05, + "loss": 0.8044, + "step": 35135 + }, + { + "epoch": 0.9303875170312026, + "grad_norm": 0.75390625, + "learning_rate": 8.856177527261807e-05, + "loss": 0.8399, + "step": 35136 + }, + { + "epoch": 0.930413996639497, + "grad_norm": 0.7890625, + "learning_rate": 8.855720175925203e-05, + "loss": 0.8316, + "step": 35137 + }, + { + "epoch": 0.9304404762477914, + "grad_norm": 0.80078125, + "learning_rate": 8.855262827013838e-05, + "loss": 0.8201, + "step": 35138 + }, + { + "epoch": 0.9304669558560857, + "grad_norm": 0.80859375, + "learning_rate": 8.85480548052868e-05, + "loss": 0.8839, + "step": 35139 + }, + { + "epoch": 0.9304934354643801, + "grad_norm": 0.77734375, + "learning_rate": 8.854348136470695e-05, + "loss": 0.8153, + "step": 35140 + }, + { + "epoch": 0.9305199150726745, + "grad_norm": 0.78515625, + "learning_rate": 8.853890794840854e-05, + "loss": 0.8287, + "step": 35141 + }, + { + "epoch": 0.9305463946809689, + "grad_norm": 0.71875, + "learning_rate": 8.853433455640124e-05, + "loss": 0.8052, + "step": 35142 + }, + { + "epoch": 0.9305728742892633, + "grad_norm": 0.84765625, + "learning_rate": 8.852976118869478e-05, + "loss": 0.8602, + "step": 35143 + }, + { + "epoch": 0.9305993538975577, + "grad_norm": 0.78515625, + "learning_rate": 8.852518784529884e-05, + "loss": 0.8506, + "step": 35144 + }, + { + "epoch": 0.930625833505852, + "grad_norm": 0.77734375, + "learning_rate": 8.852061452622308e-05, + "loss": 0.6993, + "step": 35145 + }, + { + "epoch": 0.9306523131141464, + "grad_norm": 0.765625, + "learning_rate": 8.851604123147725e-05, + "loss": 0.7407, + "step": 35146 + }, + { + "epoch": 0.9306787927224407, + "grad_norm": 0.7734375, + "learning_rate": 8.851146796107098e-05, + "loss": 0.8717, + "step": 35147 + }, + { + "epoch": 0.9307052723307351, + "grad_norm": 0.79296875, + "learning_rate": 8.850689471501403e-05, + "loss": 0.7279, + "step": 35148 + }, + { + "epoch": 0.9307317519390295, + "grad_norm": 0.73046875, + "learning_rate": 8.850232149331606e-05, + "loss": 0.7586, + "step": 35149 + }, + { + "epoch": 0.9307582315473238, + "grad_norm": 0.80859375, + "learning_rate": 8.849774829598676e-05, + "loss": 0.7543, + "step": 35150 + }, + { + "epoch": 0.9307847111556182, + "grad_norm": 0.75, + "learning_rate": 8.849317512303583e-05, + "loss": 0.6649, + "step": 35151 + }, + { + "epoch": 0.9308111907639126, + "grad_norm": 0.76953125, + "learning_rate": 8.848860197447291e-05, + "loss": 0.7259, + "step": 35152 + }, + { + "epoch": 0.930837670372207, + "grad_norm": 0.8125, + "learning_rate": 8.848402885030778e-05, + "loss": 0.9223, + "step": 35153 + }, + { + "epoch": 0.9308641499805014, + "grad_norm": 0.86328125, + "learning_rate": 8.84794557505501e-05, + "loss": 0.882, + "step": 35154 + }, + { + "epoch": 0.9308906295887958, + "grad_norm": 0.8359375, + "learning_rate": 8.847488267520954e-05, + "loss": 0.7464, + "step": 35155 + }, + { + "epoch": 0.9309171091970901, + "grad_norm": 0.78125, + "learning_rate": 8.847030962429582e-05, + "loss": 0.7747, + "step": 35156 + }, + { + "epoch": 0.9309435888053845, + "grad_norm": 0.76171875, + "learning_rate": 8.846573659781859e-05, + "loss": 0.7477, + "step": 35157 + }, + { + "epoch": 0.9309700684136789, + "grad_norm": 0.73828125, + "learning_rate": 8.846116359578758e-05, + "loss": 0.681, + "step": 35158 + }, + { + "epoch": 0.9309965480219733, + "grad_norm": 0.79296875, + "learning_rate": 8.845659061821251e-05, + "loss": 0.8143, + "step": 35159 + }, + { + "epoch": 0.9310230276302677, + "grad_norm": 0.859375, + "learning_rate": 8.845201766510301e-05, + "loss": 0.7205, + "step": 35160 + }, + { + "epoch": 0.9310495072385621, + "grad_norm": 0.94921875, + "learning_rate": 8.844744473646882e-05, + "loss": 0.7032, + "step": 35161 + }, + { + "epoch": 0.9310759868468564, + "grad_norm": 0.75, + "learning_rate": 8.844287183231957e-05, + "loss": 0.7934, + "step": 35162 + }, + { + "epoch": 0.9311024664551507, + "grad_norm": 0.7890625, + "learning_rate": 8.843829895266503e-05, + "loss": 0.7665, + "step": 35163 + }, + { + "epoch": 0.9311289460634451, + "grad_norm": 0.8203125, + "learning_rate": 8.843372609751482e-05, + "loss": 0.824, + "step": 35164 + }, + { + "epoch": 0.9311554256717395, + "grad_norm": 0.76953125, + "learning_rate": 8.84291532668787e-05, + "loss": 0.7027, + "step": 35165 + }, + { + "epoch": 0.9311819052800339, + "grad_norm": 0.76171875, + "learning_rate": 8.842458046076632e-05, + "loss": 0.7922, + "step": 35166 + }, + { + "epoch": 0.9312083848883282, + "grad_norm": 0.83984375, + "learning_rate": 8.842000767918736e-05, + "loss": 0.7047, + "step": 35167 + }, + { + "epoch": 0.9312348644966226, + "grad_norm": 0.8359375, + "learning_rate": 8.841543492215156e-05, + "loss": 0.7605, + "step": 35168 + }, + { + "epoch": 0.931261344104917, + "grad_norm": 0.796875, + "learning_rate": 8.841086218966857e-05, + "loss": 0.8376, + "step": 35169 + }, + { + "epoch": 0.9312878237132114, + "grad_norm": 0.890625, + "learning_rate": 8.84062894817481e-05, + "loss": 0.9529, + "step": 35170 + }, + { + "epoch": 0.9313143033215058, + "grad_norm": 0.78515625, + "learning_rate": 8.840171679839983e-05, + "loss": 0.7968, + "step": 35171 + }, + { + "epoch": 0.9313407829298002, + "grad_norm": 0.76171875, + "learning_rate": 8.839714413963343e-05, + "loss": 0.6876, + "step": 35172 + }, + { + "epoch": 0.9313672625380945, + "grad_norm": 0.7421875, + "learning_rate": 8.839257150545866e-05, + "loss": 0.7149, + "step": 35173 + }, + { + "epoch": 0.9313937421463889, + "grad_norm": 0.734375, + "learning_rate": 8.838799889588518e-05, + "loss": 0.7744, + "step": 35174 + }, + { + "epoch": 0.9314202217546833, + "grad_norm": 0.76953125, + "learning_rate": 8.838342631092265e-05, + "loss": 0.8307, + "step": 35175 + }, + { + "epoch": 0.9314467013629777, + "grad_norm": 0.71875, + "learning_rate": 8.83788537505808e-05, + "loss": 0.853, + "step": 35176 + }, + { + "epoch": 0.9314731809712721, + "grad_norm": 0.81640625, + "learning_rate": 8.837428121486925e-05, + "loss": 0.7995, + "step": 35177 + }, + { + "epoch": 0.9314996605795665, + "grad_norm": 0.70703125, + "learning_rate": 8.836970870379779e-05, + "loss": 0.8126, + "step": 35178 + }, + { + "epoch": 0.9315261401878607, + "grad_norm": 0.89453125, + "learning_rate": 8.836513621737608e-05, + "loss": 0.8902, + "step": 35179 + }, + { + "epoch": 0.9315526197961551, + "grad_norm": 0.8125, + "learning_rate": 8.83605637556138e-05, + "loss": 0.8198, + "step": 35180 + }, + { + "epoch": 0.9315790994044495, + "grad_norm": 0.703125, + "learning_rate": 8.835599131852063e-05, + "loss": 0.8254, + "step": 35181 + }, + { + "epoch": 0.9316055790127439, + "grad_norm": 0.78125, + "learning_rate": 8.835141890610624e-05, + "loss": 0.7739, + "step": 35182 + }, + { + "epoch": 0.9316320586210383, + "grad_norm": 0.703125, + "learning_rate": 8.834684651838037e-05, + "loss": 0.7403, + "step": 35183 + }, + { + "epoch": 0.9316585382293326, + "grad_norm": 0.75390625, + "learning_rate": 8.83422741553527e-05, + "loss": 0.7083, + "step": 35184 + }, + { + "epoch": 0.931685017837627, + "grad_norm": 0.76171875, + "learning_rate": 8.833770181703293e-05, + "loss": 0.808, + "step": 35185 + }, + { + "epoch": 0.9317114974459214, + "grad_norm": 0.79296875, + "learning_rate": 8.833312950343073e-05, + "loss": 0.8173, + "step": 35186 + }, + { + "epoch": 0.9317379770542158, + "grad_norm": 0.7265625, + "learning_rate": 8.832855721455577e-05, + "loss": 0.8166, + "step": 35187 + }, + { + "epoch": 0.9317644566625102, + "grad_norm": 0.7734375, + "learning_rate": 8.832398495041779e-05, + "loss": 0.7587, + "step": 35188 + }, + { + "epoch": 0.9317909362708046, + "grad_norm": 0.8359375, + "learning_rate": 8.831941271102645e-05, + "loss": 0.9162, + "step": 35189 + }, + { + "epoch": 0.9318174158790989, + "grad_norm": 0.80859375, + "learning_rate": 8.831484049639146e-05, + "loss": 0.6923, + "step": 35190 + }, + { + "epoch": 0.9318438954873933, + "grad_norm": 0.76171875, + "learning_rate": 8.831026830652248e-05, + "loss": 0.7248, + "step": 35191 + }, + { + "epoch": 0.9318703750956877, + "grad_norm": 0.734375, + "learning_rate": 8.830569614142919e-05, + "loss": 0.7995, + "step": 35192 + }, + { + "epoch": 0.9318968547039821, + "grad_norm": 0.72265625, + "learning_rate": 8.830112400112134e-05, + "loss": 0.6961, + "step": 35193 + }, + { + "epoch": 0.9319233343122765, + "grad_norm": 0.828125, + "learning_rate": 8.82965518856086e-05, + "loss": 0.7397, + "step": 35194 + }, + { + "epoch": 0.9319498139205709, + "grad_norm": 0.7734375, + "learning_rate": 8.829197979490065e-05, + "loss": 0.6424, + "step": 35195 + }, + { + "epoch": 0.9319762935288651, + "grad_norm": 0.83203125, + "learning_rate": 8.828740772900716e-05, + "loss": 0.814, + "step": 35196 + }, + { + "epoch": 0.9320027731371595, + "grad_norm": 0.77734375, + "learning_rate": 8.828283568793782e-05, + "loss": 0.8598, + "step": 35197 + }, + { + "epoch": 0.9320292527454539, + "grad_norm": 0.84375, + "learning_rate": 8.827826367170236e-05, + "loss": 0.8354, + "step": 35198 + }, + { + "epoch": 0.9320557323537483, + "grad_norm": 0.875, + "learning_rate": 8.827369168031048e-05, + "loss": 0.835, + "step": 35199 + }, + { + "epoch": 0.9320822119620427, + "grad_norm": 0.83203125, + "learning_rate": 8.826911971377182e-05, + "loss": 0.8858, + "step": 35200 + }, + { + "epoch": 0.932108691570337, + "grad_norm": 0.70703125, + "learning_rate": 8.82645477720961e-05, + "loss": 0.7804, + "step": 35201 + }, + { + "epoch": 0.9321351711786314, + "grad_norm": 0.75390625, + "learning_rate": 8.825997585529295e-05, + "loss": 0.8038, + "step": 35202 + }, + { + "epoch": 0.9321616507869258, + "grad_norm": 0.84765625, + "learning_rate": 8.825540396337215e-05, + "loss": 0.8483, + "step": 35203 + }, + { + "epoch": 0.9321881303952202, + "grad_norm": 0.9609375, + "learning_rate": 8.825083209634336e-05, + "loss": 0.7119, + "step": 35204 + }, + { + "epoch": 0.9322146100035146, + "grad_norm": 0.75390625, + "learning_rate": 8.824626025421626e-05, + "loss": 0.7749, + "step": 35205 + }, + { + "epoch": 0.932241089611809, + "grad_norm": 0.80078125, + "learning_rate": 8.824168843700052e-05, + "loss": 0.77, + "step": 35206 + }, + { + "epoch": 0.9322675692201033, + "grad_norm": 0.828125, + "learning_rate": 8.823711664470585e-05, + "loss": 0.8858, + "step": 35207 + }, + { + "epoch": 0.9322940488283977, + "grad_norm": 0.73828125, + "learning_rate": 8.823254487734194e-05, + "loss": 0.824, + "step": 35208 + }, + { + "epoch": 0.9323205284366921, + "grad_norm": 0.78515625, + "learning_rate": 8.82279731349185e-05, + "loss": 0.7382, + "step": 35209 + }, + { + "epoch": 0.9323470080449865, + "grad_norm": 0.828125, + "learning_rate": 8.822340141744518e-05, + "loss": 0.7482, + "step": 35210 + }, + { + "epoch": 0.9323734876532809, + "grad_norm": 0.8359375, + "learning_rate": 8.82188297249317e-05, + "loss": 0.8862, + "step": 35211 + }, + { + "epoch": 0.9323999672615751, + "grad_norm": 0.75, + "learning_rate": 8.82142580573877e-05, + "loss": 0.831, + "step": 35212 + }, + { + "epoch": 0.9324264468698695, + "grad_norm": 0.76953125, + "learning_rate": 8.820968641482294e-05, + "loss": 0.9107, + "step": 35213 + }, + { + "epoch": 0.9324529264781639, + "grad_norm": 0.78125, + "learning_rate": 8.820511479724707e-05, + "loss": 0.8441, + "step": 35214 + }, + { + "epoch": 0.9324794060864583, + "grad_norm": 0.765625, + "learning_rate": 8.820054320466978e-05, + "loss": 0.8265, + "step": 35215 + }, + { + "epoch": 0.9325058856947527, + "grad_norm": 0.7578125, + "learning_rate": 8.819597163710078e-05, + "loss": 0.8503, + "step": 35216 + }, + { + "epoch": 0.9325323653030471, + "grad_norm": 0.7734375, + "learning_rate": 8.819140009454969e-05, + "loss": 0.9335, + "step": 35217 + }, + { + "epoch": 0.9325588449113414, + "grad_norm": 0.83203125, + "learning_rate": 8.81868285770263e-05, + "loss": 0.7558, + "step": 35218 + }, + { + "epoch": 0.9325853245196358, + "grad_norm": 0.7890625, + "learning_rate": 8.818225708454026e-05, + "loss": 0.6889, + "step": 35219 + }, + { + "epoch": 0.9326118041279302, + "grad_norm": 0.73828125, + "learning_rate": 8.817768561710125e-05, + "loss": 0.6994, + "step": 35220 + }, + { + "epoch": 0.9326382837362246, + "grad_norm": 0.8671875, + "learning_rate": 8.817311417471894e-05, + "loss": 0.909, + "step": 35221 + }, + { + "epoch": 0.932664763344519, + "grad_norm": 0.859375, + "learning_rate": 8.816854275740301e-05, + "loss": 0.7341, + "step": 35222 + }, + { + "epoch": 0.9326912429528134, + "grad_norm": 0.71875, + "learning_rate": 8.816397136516322e-05, + "loss": 0.6286, + "step": 35223 + }, + { + "epoch": 0.9327177225611077, + "grad_norm": 0.80859375, + "learning_rate": 8.815939999800922e-05, + "loss": 0.7808, + "step": 35224 + }, + { + "epoch": 0.9327442021694021, + "grad_norm": 0.796875, + "learning_rate": 8.815482865595068e-05, + "loss": 0.7507, + "step": 35225 + }, + { + "epoch": 0.9327706817776965, + "grad_norm": 0.8046875, + "learning_rate": 8.815025733899732e-05, + "loss": 0.7894, + "step": 35226 + }, + { + "epoch": 0.9327971613859909, + "grad_norm": 0.8203125, + "learning_rate": 8.814568604715878e-05, + "loss": 0.772, + "step": 35227 + }, + { + "epoch": 0.9328236409942852, + "grad_norm": 0.7734375, + "learning_rate": 8.814111478044481e-05, + "loss": 0.8749, + "step": 35228 + }, + { + "epoch": 0.9328501206025795, + "grad_norm": 0.78125, + "learning_rate": 8.813654353886506e-05, + "loss": 0.7347, + "step": 35229 + }, + { + "epoch": 0.9328766002108739, + "grad_norm": 0.80078125, + "learning_rate": 8.813197232242922e-05, + "loss": 1.0664, + "step": 35230 + }, + { + "epoch": 0.9329030798191683, + "grad_norm": 0.84765625, + "learning_rate": 8.812740113114701e-05, + "loss": 0.8136, + "step": 35231 + }, + { + "epoch": 0.9329295594274627, + "grad_norm": 0.79296875, + "learning_rate": 8.812282996502805e-05, + "loss": 0.7441, + "step": 35232 + }, + { + "epoch": 0.9329560390357571, + "grad_norm": 0.76953125, + "learning_rate": 8.81182588240821e-05, + "loss": 0.7735, + "step": 35233 + }, + { + "epoch": 0.9329825186440515, + "grad_norm": 0.79296875, + "learning_rate": 8.811368770831884e-05, + "loss": 0.7804, + "step": 35234 + }, + { + "epoch": 0.9330089982523458, + "grad_norm": 0.8046875, + "learning_rate": 8.810911661774792e-05, + "loss": 0.7243, + "step": 35235 + }, + { + "epoch": 0.9330354778606402, + "grad_norm": 0.76953125, + "learning_rate": 8.810454555237906e-05, + "loss": 0.8517, + "step": 35236 + }, + { + "epoch": 0.9330619574689346, + "grad_norm": 0.76171875, + "learning_rate": 8.809997451222189e-05, + "loss": 0.7934, + "step": 35237 + }, + { + "epoch": 0.933088437077229, + "grad_norm": 0.7421875, + "learning_rate": 8.809540349728619e-05, + "loss": 0.8419, + "step": 35238 + }, + { + "epoch": 0.9331149166855234, + "grad_norm": 0.82421875, + "learning_rate": 8.80908325075816e-05, + "loss": 0.8179, + "step": 35239 + }, + { + "epoch": 0.9331413962938178, + "grad_norm": 1.2109375, + "learning_rate": 8.808626154311781e-05, + "loss": 0.8058, + "step": 35240 + }, + { + "epoch": 0.9331678759021121, + "grad_norm": 0.83203125, + "learning_rate": 8.808169060390449e-05, + "loss": 0.8305, + "step": 35241 + }, + { + "epoch": 0.9331943555104065, + "grad_norm": 0.76171875, + "learning_rate": 8.807711968995133e-05, + "loss": 0.79, + "step": 35242 + }, + { + "epoch": 0.9332208351187009, + "grad_norm": 0.76171875, + "learning_rate": 8.807254880126807e-05, + "loss": 0.6897, + "step": 35243 + }, + { + "epoch": 0.9332473147269953, + "grad_norm": 0.79296875, + "learning_rate": 8.806797793786434e-05, + "loss": 0.7554, + "step": 35244 + }, + { + "epoch": 0.9332737943352896, + "grad_norm": 0.7265625, + "learning_rate": 8.806340709974987e-05, + "loss": 0.7171, + "step": 35245 + }, + { + "epoch": 0.933300273943584, + "grad_norm": 0.8046875, + "learning_rate": 8.805883628693432e-05, + "loss": 0.7394, + "step": 35246 + }, + { + "epoch": 0.9333267535518783, + "grad_norm": 0.76171875, + "learning_rate": 8.805426549942737e-05, + "loss": 0.7611, + "step": 35247 + }, + { + "epoch": 0.9333532331601727, + "grad_norm": 0.7421875, + "learning_rate": 8.804969473723874e-05, + "loss": 0.861, + "step": 35248 + }, + { + "epoch": 0.9333797127684671, + "grad_norm": 0.80078125, + "learning_rate": 8.804512400037807e-05, + "loss": 0.8681, + "step": 35249 + }, + { + "epoch": 0.9334061923767615, + "grad_norm": 0.859375, + "learning_rate": 8.80405532888551e-05, + "loss": 0.7375, + "step": 35250 + }, + { + "epoch": 0.9334326719850559, + "grad_norm": 0.7890625, + "learning_rate": 8.803598260267949e-05, + "loss": 0.7759, + "step": 35251 + }, + { + "epoch": 0.9334591515933502, + "grad_norm": 0.8203125, + "learning_rate": 8.803141194186091e-05, + "loss": 0.8972, + "step": 35252 + }, + { + "epoch": 0.9334856312016446, + "grad_norm": 0.78125, + "learning_rate": 8.80268413064091e-05, + "loss": 0.8488, + "step": 35253 + }, + { + "epoch": 0.933512110809939, + "grad_norm": 0.828125, + "learning_rate": 8.802227069633371e-05, + "loss": 0.8579, + "step": 35254 + }, + { + "epoch": 0.9335385904182334, + "grad_norm": 0.8125, + "learning_rate": 8.801770011164443e-05, + "loss": 0.7793, + "step": 35255 + }, + { + "epoch": 0.9335650700265278, + "grad_norm": 0.8203125, + "learning_rate": 8.801312955235094e-05, + "loss": 0.8765, + "step": 35256 + }, + { + "epoch": 0.9335915496348222, + "grad_norm": 0.7265625, + "learning_rate": 8.800855901846292e-05, + "loss": 0.8654, + "step": 35257 + }, + { + "epoch": 0.9336180292431165, + "grad_norm": 0.890625, + "learning_rate": 8.80039885099901e-05, + "loss": 0.7538, + "step": 35258 + }, + { + "epoch": 0.9336445088514109, + "grad_norm": 0.78515625, + "learning_rate": 8.799941802694213e-05, + "loss": 0.7369, + "step": 35259 + }, + { + "epoch": 0.9336709884597053, + "grad_norm": 0.765625, + "learning_rate": 8.799484756932872e-05, + "loss": 0.8319, + "step": 35260 + }, + { + "epoch": 0.9336974680679996, + "grad_norm": 0.80859375, + "learning_rate": 8.799027713715956e-05, + "loss": 0.841, + "step": 35261 + }, + { + "epoch": 0.933723947676294, + "grad_norm": 0.83984375, + "learning_rate": 8.798570673044426e-05, + "loss": 0.857, + "step": 35262 + }, + { + "epoch": 0.9337504272845883, + "grad_norm": 1.375, + "learning_rate": 8.798113634919262e-05, + "loss": 1.0359, + "step": 35263 + }, + { + "epoch": 0.9337769068928827, + "grad_norm": 0.71484375, + "learning_rate": 8.797656599341426e-05, + "loss": 0.802, + "step": 35264 + }, + { + "epoch": 0.9338033865011771, + "grad_norm": 0.94921875, + "learning_rate": 8.79719956631189e-05, + "loss": 0.874, + "step": 35265 + }, + { + "epoch": 0.9338298661094715, + "grad_norm": 0.78125, + "learning_rate": 8.796742535831619e-05, + "loss": 0.7697, + "step": 35266 + }, + { + "epoch": 0.9338563457177659, + "grad_norm": 0.81640625, + "learning_rate": 8.796285507901582e-05, + "loss": 0.8761, + "step": 35267 + }, + { + "epoch": 0.9338828253260603, + "grad_norm": 0.75390625, + "learning_rate": 8.79582848252275e-05, + "loss": 0.6701, + "step": 35268 + }, + { + "epoch": 0.9339093049343546, + "grad_norm": 0.74609375, + "learning_rate": 8.795371459696091e-05, + "loss": 0.7756, + "step": 35269 + }, + { + "epoch": 0.933935784542649, + "grad_norm": 0.73828125, + "learning_rate": 8.794914439422575e-05, + "loss": 0.6645, + "step": 35270 + }, + { + "epoch": 0.9339622641509434, + "grad_norm": 0.78515625, + "learning_rate": 8.794457421703168e-05, + "loss": 0.9246, + "step": 35271 + }, + { + "epoch": 0.9339887437592378, + "grad_norm": 0.71875, + "learning_rate": 8.794000406538838e-05, + "loss": 0.7448, + "step": 35272 + }, + { + "epoch": 0.9340152233675322, + "grad_norm": 0.78125, + "learning_rate": 8.793543393930557e-05, + "loss": 0.8511, + "step": 35273 + }, + { + "epoch": 0.9340417029758266, + "grad_norm": 0.83984375, + "learning_rate": 8.793086383879292e-05, + "loss": 0.8037, + "step": 35274 + }, + { + "epoch": 0.9340681825841209, + "grad_norm": 0.78515625, + "learning_rate": 8.792629376386012e-05, + "loss": 0.8013, + "step": 35275 + }, + { + "epoch": 0.9340946621924153, + "grad_norm": 0.75, + "learning_rate": 8.792172371451684e-05, + "loss": 0.8569, + "step": 35276 + }, + { + "epoch": 0.9341211418007097, + "grad_norm": 0.80078125, + "learning_rate": 8.791715369077277e-05, + "loss": 0.7789, + "step": 35277 + }, + { + "epoch": 0.934147621409004, + "grad_norm": 0.73046875, + "learning_rate": 8.791258369263758e-05, + "loss": 0.7124, + "step": 35278 + }, + { + "epoch": 0.9341741010172984, + "grad_norm": 0.7734375, + "learning_rate": 8.790801372012101e-05, + "loss": 0.7889, + "step": 35279 + }, + { + "epoch": 0.9342005806255927, + "grad_norm": 0.765625, + "learning_rate": 8.790344377323271e-05, + "loss": 0.8546, + "step": 35280 + }, + { + "epoch": 0.9342270602338871, + "grad_norm": 0.796875, + "learning_rate": 8.789887385198237e-05, + "loss": 0.7583, + "step": 35281 + }, + { + "epoch": 0.9342535398421815, + "grad_norm": 0.86328125, + "learning_rate": 8.789430395637967e-05, + "loss": 0.7466, + "step": 35282 + }, + { + "epoch": 0.9342800194504759, + "grad_norm": 0.72265625, + "learning_rate": 8.788973408643427e-05, + "loss": 0.7363, + "step": 35283 + }, + { + "epoch": 0.9343064990587703, + "grad_norm": 0.84375, + "learning_rate": 8.788516424215593e-05, + "loss": 0.7668, + "step": 35284 + }, + { + "epoch": 0.9343329786670647, + "grad_norm": 0.796875, + "learning_rate": 8.788059442355429e-05, + "loss": 0.7797, + "step": 35285 + }, + { + "epoch": 0.934359458275359, + "grad_norm": 0.734375, + "learning_rate": 8.787602463063904e-05, + "loss": 0.7256, + "step": 35286 + }, + { + "epoch": 0.9343859378836534, + "grad_norm": 0.77734375, + "learning_rate": 8.787145486341984e-05, + "loss": 0.7555, + "step": 35287 + }, + { + "epoch": 0.9344124174919478, + "grad_norm": 0.80078125, + "learning_rate": 8.786688512190638e-05, + "loss": 0.7863, + "step": 35288 + }, + { + "epoch": 0.9344388971002422, + "grad_norm": 0.7265625, + "learning_rate": 8.78623154061084e-05, + "loss": 0.8198, + "step": 35289 + }, + { + "epoch": 0.9344653767085366, + "grad_norm": 0.71484375, + "learning_rate": 8.785774571603555e-05, + "loss": 0.8032, + "step": 35290 + }, + { + "epoch": 0.934491856316831, + "grad_norm": 0.8203125, + "learning_rate": 8.785317605169751e-05, + "loss": 0.8278, + "step": 35291 + }, + { + "epoch": 0.9345183359251253, + "grad_norm": 0.8359375, + "learning_rate": 8.784860641310398e-05, + "loss": 0.8862, + "step": 35292 + }, + { + "epoch": 0.9345448155334197, + "grad_norm": 0.8359375, + "learning_rate": 8.78440368002646e-05, + "loss": 0.882, + "step": 35293 + }, + { + "epoch": 0.934571295141714, + "grad_norm": 0.81640625, + "learning_rate": 8.783946721318911e-05, + "loss": 0.7834, + "step": 35294 + }, + { + "epoch": 0.9345977747500084, + "grad_norm": 0.73828125, + "learning_rate": 8.783489765188719e-05, + "loss": 0.7319, + "step": 35295 + }, + { + "epoch": 0.9346242543583028, + "grad_norm": 0.6796875, + "learning_rate": 8.783032811636848e-05, + "loss": 0.7012, + "step": 35296 + }, + { + "epoch": 0.9346507339665971, + "grad_norm": 0.78515625, + "learning_rate": 8.78257586066427e-05, + "loss": 0.7518, + "step": 35297 + }, + { + "epoch": 0.9346772135748915, + "grad_norm": 0.8125, + "learning_rate": 8.782118912271952e-05, + "loss": 0.8635, + "step": 35298 + }, + { + "epoch": 0.9347036931831859, + "grad_norm": 0.78515625, + "learning_rate": 8.781661966460865e-05, + "loss": 0.782, + "step": 35299 + }, + { + "epoch": 0.9347301727914803, + "grad_norm": 0.765625, + "learning_rate": 8.781205023231976e-05, + "loss": 0.7881, + "step": 35300 + }, + { + "epoch": 0.9347566523997747, + "grad_norm": 0.765625, + "learning_rate": 8.780748082586255e-05, + "loss": 0.7891, + "step": 35301 + }, + { + "epoch": 0.9347831320080691, + "grad_norm": 0.77734375, + "learning_rate": 8.780291144524667e-05, + "loss": 0.8324, + "step": 35302 + }, + { + "epoch": 0.9348096116163634, + "grad_norm": 0.75, + "learning_rate": 8.779834209048177e-05, + "loss": 0.8039, + "step": 35303 + }, + { + "epoch": 0.9348360912246578, + "grad_norm": 0.7421875, + "learning_rate": 8.779377276157765e-05, + "loss": 0.7951, + "step": 35304 + }, + { + "epoch": 0.9348625708329522, + "grad_norm": 0.796875, + "learning_rate": 8.778920345854391e-05, + "loss": 0.7877, + "step": 35305 + }, + { + "epoch": 0.9348890504412466, + "grad_norm": 0.85546875, + "learning_rate": 8.778463418139026e-05, + "loss": 0.7105, + "step": 35306 + }, + { + "epoch": 0.934915530049541, + "grad_norm": 0.82421875, + "learning_rate": 8.778006493012641e-05, + "loss": 0.7467, + "step": 35307 + }, + { + "epoch": 0.9349420096578354, + "grad_norm": 0.7578125, + "learning_rate": 8.777549570476194e-05, + "loss": 0.7659, + "step": 35308 + }, + { + "epoch": 0.9349684892661297, + "grad_norm": 0.89453125, + "learning_rate": 8.777092650530666e-05, + "loss": 0.9027, + "step": 35309 + }, + { + "epoch": 0.934994968874424, + "grad_norm": 0.8125, + "learning_rate": 8.77663573317702e-05, + "loss": 0.76, + "step": 35310 + }, + { + "epoch": 0.9350214484827184, + "grad_norm": 0.75390625, + "learning_rate": 8.776178818416225e-05, + "loss": 0.8751, + "step": 35311 + }, + { + "epoch": 0.9350479280910128, + "grad_norm": 0.72265625, + "learning_rate": 8.775721906249248e-05, + "loss": 0.7808, + "step": 35312 + }, + { + "epoch": 0.9350744076993072, + "grad_norm": 0.7734375, + "learning_rate": 8.775264996677057e-05, + "loss": 0.7857, + "step": 35313 + }, + { + "epoch": 0.9351008873076015, + "grad_norm": 0.7890625, + "learning_rate": 8.774808089700624e-05, + "loss": 0.8647, + "step": 35314 + }, + { + "epoch": 0.9351273669158959, + "grad_norm": 0.75390625, + "learning_rate": 8.774351185320915e-05, + "loss": 0.6811, + "step": 35315 + }, + { + "epoch": 0.9351538465241903, + "grad_norm": 0.73046875, + "learning_rate": 8.773894283538898e-05, + "loss": 0.7511, + "step": 35316 + }, + { + "epoch": 0.9351803261324847, + "grad_norm": 0.8125, + "learning_rate": 8.773437384355543e-05, + "loss": 0.7875, + "step": 35317 + }, + { + "epoch": 0.9352068057407791, + "grad_norm": 0.8046875, + "learning_rate": 8.772980487771814e-05, + "loss": 0.7538, + "step": 35318 + }, + { + "epoch": 0.9352332853490735, + "grad_norm": 0.79296875, + "learning_rate": 8.772523593788687e-05, + "loss": 0.8461, + "step": 35319 + }, + { + "epoch": 0.9352597649573678, + "grad_norm": 0.765625, + "learning_rate": 8.772066702407125e-05, + "loss": 0.8091, + "step": 35320 + }, + { + "epoch": 0.9352862445656622, + "grad_norm": 0.7890625, + "learning_rate": 8.771609813628097e-05, + "loss": 0.8071, + "step": 35321 + }, + { + "epoch": 0.9353127241739566, + "grad_norm": 0.8046875, + "learning_rate": 8.771152927452572e-05, + "loss": 0.7601, + "step": 35322 + }, + { + "epoch": 0.935339203782251, + "grad_norm": 0.828125, + "learning_rate": 8.770696043881514e-05, + "loss": 0.8598, + "step": 35323 + }, + { + "epoch": 0.9353656833905454, + "grad_norm": 0.76953125, + "learning_rate": 8.770239162915902e-05, + "loss": 0.7559, + "step": 35324 + }, + { + "epoch": 0.9353921629988398, + "grad_norm": 0.8515625, + "learning_rate": 8.769782284556695e-05, + "loss": 0.6412, + "step": 35325 + }, + { + "epoch": 0.9354186426071341, + "grad_norm": 0.80859375, + "learning_rate": 8.769325408804864e-05, + "loss": 0.8331, + "step": 35326 + }, + { + "epoch": 0.9354451222154284, + "grad_norm": 0.76953125, + "learning_rate": 8.768868535661379e-05, + "loss": 0.7963, + "step": 35327 + }, + { + "epoch": 0.9354716018237228, + "grad_norm": 0.78515625, + "learning_rate": 8.768411665127203e-05, + "loss": 0.855, + "step": 35328 + }, + { + "epoch": 0.9354980814320172, + "grad_norm": 0.80078125, + "learning_rate": 8.76795479720331e-05, + "loss": 0.7651, + "step": 35329 + }, + { + "epoch": 0.9355245610403116, + "grad_norm": 0.79296875, + "learning_rate": 8.767497931890666e-05, + "loss": 0.905, + "step": 35330 + }, + { + "epoch": 0.935551040648606, + "grad_norm": 0.7421875, + "learning_rate": 8.767041069190242e-05, + "loss": 0.8121, + "step": 35331 + }, + { + "epoch": 0.9355775202569003, + "grad_norm": 0.93359375, + "learning_rate": 8.766584209103004e-05, + "loss": 0.7327, + "step": 35332 + }, + { + "epoch": 0.9356039998651947, + "grad_norm": 0.7421875, + "learning_rate": 8.766127351629917e-05, + "loss": 0.7547, + "step": 35333 + }, + { + "epoch": 0.9356304794734891, + "grad_norm": 0.80078125, + "learning_rate": 8.765670496771952e-05, + "loss": 0.8524, + "step": 35334 + }, + { + "epoch": 0.9356569590817835, + "grad_norm": 0.84375, + "learning_rate": 8.765213644530083e-05, + "loss": 0.7902, + "step": 35335 + }, + { + "epoch": 0.9356834386900779, + "grad_norm": 0.6796875, + "learning_rate": 8.76475679490527e-05, + "loss": 0.8565, + "step": 35336 + }, + { + "epoch": 0.9357099182983722, + "grad_norm": 0.84765625, + "learning_rate": 8.764299947898486e-05, + "loss": 0.7823, + "step": 35337 + }, + { + "epoch": 0.9357363979066666, + "grad_norm": 0.69140625, + "learning_rate": 8.763843103510694e-05, + "loss": 0.7444, + "step": 35338 + }, + { + "epoch": 0.935762877514961, + "grad_norm": 0.734375, + "learning_rate": 8.763386261742869e-05, + "loss": 0.7239, + "step": 35339 + }, + { + "epoch": 0.9357893571232554, + "grad_norm": 0.7734375, + "learning_rate": 8.762929422595976e-05, + "loss": 0.8989, + "step": 35340 + }, + { + "epoch": 0.9358158367315498, + "grad_norm": 0.7109375, + "learning_rate": 8.762472586070984e-05, + "loss": 0.7746, + "step": 35341 + }, + { + "epoch": 0.9358423163398442, + "grad_norm": 0.75, + "learning_rate": 8.76201575216886e-05, + "loss": 0.8323, + "step": 35342 + }, + { + "epoch": 0.9358687959481384, + "grad_norm": 0.98828125, + "learning_rate": 8.761558920890569e-05, + "loss": 0.7848, + "step": 35343 + }, + { + "epoch": 0.9358952755564328, + "grad_norm": 0.7890625, + "learning_rate": 8.761102092237086e-05, + "loss": 0.9672, + "step": 35344 + }, + { + "epoch": 0.9359217551647272, + "grad_norm": 0.7578125, + "learning_rate": 8.760645266209378e-05, + "loss": 0.7999, + "step": 35345 + }, + { + "epoch": 0.9359482347730216, + "grad_norm": 0.91015625, + "learning_rate": 8.76018844280841e-05, + "loss": 0.8599, + "step": 35346 + }, + { + "epoch": 0.935974714381316, + "grad_norm": 0.76171875, + "learning_rate": 8.759731622035152e-05, + "loss": 0.7402, + "step": 35347 + }, + { + "epoch": 0.9360011939896103, + "grad_norm": 0.671875, + "learning_rate": 8.759274803890569e-05, + "loss": 0.6316, + "step": 35348 + }, + { + "epoch": 0.9360276735979047, + "grad_norm": 0.859375, + "learning_rate": 8.758817988375634e-05, + "loss": 0.6896, + "step": 35349 + }, + { + "epoch": 0.9360541532061991, + "grad_norm": 0.79296875, + "learning_rate": 8.758361175491317e-05, + "loss": 0.6964, + "step": 35350 + }, + { + "epoch": 0.9360806328144935, + "grad_norm": 0.7734375, + "learning_rate": 8.757904365238579e-05, + "loss": 0.6611, + "step": 35351 + }, + { + "epoch": 0.9361071124227879, + "grad_norm": 0.7578125, + "learning_rate": 8.757447557618393e-05, + "loss": 0.8074, + "step": 35352 + }, + { + "epoch": 0.9361335920310823, + "grad_norm": 0.69921875, + "learning_rate": 8.756990752631723e-05, + "loss": 0.8702, + "step": 35353 + }, + { + "epoch": 0.9361600716393766, + "grad_norm": 0.82421875, + "learning_rate": 8.756533950279543e-05, + "loss": 0.9535, + "step": 35354 + }, + { + "epoch": 0.936186551247671, + "grad_norm": 0.77734375, + "learning_rate": 8.756077150562818e-05, + "loss": 0.8412, + "step": 35355 + }, + { + "epoch": 0.9362130308559654, + "grad_norm": 0.8046875, + "learning_rate": 8.755620353482517e-05, + "loss": 0.7486, + "step": 35356 + }, + { + "epoch": 0.9362395104642598, + "grad_norm": 0.7890625, + "learning_rate": 8.755163559039606e-05, + "loss": 0.84, + "step": 35357 + }, + { + "epoch": 0.9362659900725542, + "grad_norm": 0.75, + "learning_rate": 8.754706767235055e-05, + "loss": 0.6627, + "step": 35358 + }, + { + "epoch": 0.9362924696808484, + "grad_norm": 0.796875, + "learning_rate": 8.754249978069832e-05, + "loss": 0.7923, + "step": 35359 + }, + { + "epoch": 0.9363189492891428, + "grad_norm": 0.76953125, + "learning_rate": 8.753793191544905e-05, + "loss": 0.8897, + "step": 35360 + }, + { + "epoch": 0.9363454288974372, + "grad_norm": 0.8046875, + "learning_rate": 8.753336407661244e-05, + "loss": 0.7635, + "step": 35361 + }, + { + "epoch": 0.9363719085057316, + "grad_norm": 0.82421875, + "learning_rate": 8.752879626419814e-05, + "loss": 0.9094, + "step": 35362 + }, + { + "epoch": 0.936398388114026, + "grad_norm": 0.85546875, + "learning_rate": 8.75242284782158e-05, + "loss": 0.8463, + "step": 35363 + }, + { + "epoch": 0.9364248677223204, + "grad_norm": 1.0859375, + "learning_rate": 8.751966071867519e-05, + "loss": 0.8545, + "step": 35364 + }, + { + "epoch": 0.9364513473306147, + "grad_norm": 0.80078125, + "learning_rate": 8.751509298558595e-05, + "loss": 0.8499, + "step": 35365 + }, + { + "epoch": 0.9364778269389091, + "grad_norm": 0.80859375, + "learning_rate": 8.751052527895775e-05, + "loss": 0.7811, + "step": 35366 + }, + { + "epoch": 0.9365043065472035, + "grad_norm": 0.76953125, + "learning_rate": 8.750595759880027e-05, + "loss": 0.7702, + "step": 35367 + }, + { + "epoch": 0.9365307861554979, + "grad_norm": 0.75390625, + "learning_rate": 8.750138994512316e-05, + "loss": 0.8221, + "step": 35368 + }, + { + "epoch": 0.9365572657637923, + "grad_norm": 0.7265625, + "learning_rate": 8.749682231793619e-05, + "loss": 0.7341, + "step": 35369 + }, + { + "epoch": 0.9365837453720867, + "grad_norm": 0.796875, + "learning_rate": 8.749225471724898e-05, + "loss": 0.9293, + "step": 35370 + }, + { + "epoch": 0.936610224980381, + "grad_norm": 0.87109375, + "learning_rate": 8.748768714307124e-05, + "loss": 0.8685, + "step": 35371 + }, + { + "epoch": 0.9366367045886754, + "grad_norm": 0.77734375, + "learning_rate": 8.748311959541262e-05, + "loss": 0.8571, + "step": 35372 + }, + { + "epoch": 0.9366631841969698, + "grad_norm": 0.69140625, + "learning_rate": 8.747855207428277e-05, + "loss": 0.7081, + "step": 35373 + }, + { + "epoch": 0.9366896638052642, + "grad_norm": 0.82421875, + "learning_rate": 8.747398457969145e-05, + "loss": 0.8442, + "step": 35374 + }, + { + "epoch": 0.9367161434135586, + "grad_norm": 0.75, + "learning_rate": 8.746941711164831e-05, + "loss": 0.8031, + "step": 35375 + }, + { + "epoch": 0.9367426230218528, + "grad_norm": 0.76953125, + "learning_rate": 8.746484967016303e-05, + "loss": 0.7495, + "step": 35376 + }, + { + "epoch": 0.9367691026301472, + "grad_norm": 0.71875, + "learning_rate": 8.746028225524528e-05, + "loss": 0.709, + "step": 35377 + }, + { + "epoch": 0.9367955822384416, + "grad_norm": 0.71875, + "learning_rate": 8.745571486690471e-05, + "loss": 0.7854, + "step": 35378 + }, + { + "epoch": 0.936822061846736, + "grad_norm": 0.77734375, + "learning_rate": 8.745114750515109e-05, + "loss": 0.7902, + "step": 35379 + }, + { + "epoch": 0.9368485414550304, + "grad_norm": 0.7421875, + "learning_rate": 8.744658016999401e-05, + "loss": 0.7346, + "step": 35380 + }, + { + "epoch": 0.9368750210633248, + "grad_norm": 0.80859375, + "learning_rate": 8.74420128614432e-05, + "loss": 0.7952, + "step": 35381 + }, + { + "epoch": 0.9369015006716191, + "grad_norm": 0.79296875, + "learning_rate": 8.743744557950833e-05, + "loss": 0.7666, + "step": 35382 + }, + { + "epoch": 0.9369279802799135, + "grad_norm": 0.76171875, + "learning_rate": 8.743287832419903e-05, + "loss": 0.7747, + "step": 35383 + }, + { + "epoch": 0.9369544598882079, + "grad_norm": 0.7890625, + "learning_rate": 8.742831109552508e-05, + "loss": 0.8081, + "step": 35384 + }, + { + "epoch": 0.9369809394965023, + "grad_norm": 0.80078125, + "learning_rate": 8.742374389349608e-05, + "loss": 0.8974, + "step": 35385 + }, + { + "epoch": 0.9370074191047967, + "grad_norm": 0.8125, + "learning_rate": 8.741917671812177e-05, + "loss": 0.691, + "step": 35386 + }, + { + "epoch": 0.9370338987130911, + "grad_norm": 0.79296875, + "learning_rate": 8.741460956941177e-05, + "loss": 0.8177, + "step": 35387 + }, + { + "epoch": 0.9370603783213854, + "grad_norm": 0.953125, + "learning_rate": 8.741004244737574e-05, + "loss": 0.8088, + "step": 35388 + }, + { + "epoch": 0.9370868579296798, + "grad_norm": 0.69140625, + "learning_rate": 8.740547535202346e-05, + "loss": 0.6692, + "step": 35389 + }, + { + "epoch": 0.9371133375379742, + "grad_norm": 0.87109375, + "learning_rate": 8.740090828336456e-05, + "loss": 0.8628, + "step": 35390 + }, + { + "epoch": 0.9371398171462686, + "grad_norm": 0.8046875, + "learning_rate": 8.73963412414087e-05, + "loss": 0.8614, + "step": 35391 + }, + { + "epoch": 0.9371662967545629, + "grad_norm": 0.74609375, + "learning_rate": 8.739177422616558e-05, + "loss": 0.8388, + "step": 35392 + }, + { + "epoch": 0.9371927763628572, + "grad_norm": 0.765625, + "learning_rate": 8.738720723764485e-05, + "loss": 0.6995, + "step": 35393 + }, + { + "epoch": 0.9372192559711516, + "grad_norm": 0.72265625, + "learning_rate": 8.738264027585623e-05, + "loss": 0.6672, + "step": 35394 + }, + { + "epoch": 0.937245735579446, + "grad_norm": 0.78515625, + "learning_rate": 8.737807334080938e-05, + "loss": 0.7032, + "step": 35395 + }, + { + "epoch": 0.9372722151877404, + "grad_norm": 0.79296875, + "learning_rate": 8.737350643251401e-05, + "loss": 0.8079, + "step": 35396 + }, + { + "epoch": 0.9372986947960348, + "grad_norm": 0.79296875, + "learning_rate": 8.736893955097975e-05, + "loss": 0.8977, + "step": 35397 + }, + { + "epoch": 0.9373251744043292, + "grad_norm": 0.7578125, + "learning_rate": 8.73643726962163e-05, + "loss": 0.8255, + "step": 35398 + }, + { + "epoch": 0.9373516540126235, + "grad_norm": 0.84765625, + "learning_rate": 8.735980586823334e-05, + "loss": 0.8183, + "step": 35399 + }, + { + "epoch": 0.9373781336209179, + "grad_norm": 0.7734375, + "learning_rate": 8.735523906704056e-05, + "loss": 0.8407, + "step": 35400 + }, + { + "epoch": 0.9374046132292123, + "grad_norm": 0.75, + "learning_rate": 8.735067229264759e-05, + "loss": 0.7602, + "step": 35401 + }, + { + "epoch": 0.9374310928375067, + "grad_norm": 0.84765625, + "learning_rate": 8.734610554506417e-05, + "loss": 0.8766, + "step": 35402 + }, + { + "epoch": 0.9374575724458011, + "grad_norm": 0.765625, + "learning_rate": 8.734153882429995e-05, + "loss": 0.804, + "step": 35403 + }, + { + "epoch": 0.9374840520540955, + "grad_norm": 0.87890625, + "learning_rate": 8.733697213036463e-05, + "loss": 0.8824, + "step": 35404 + }, + { + "epoch": 0.9375105316623898, + "grad_norm": 0.77734375, + "learning_rate": 8.733240546326788e-05, + "loss": 0.7507, + "step": 35405 + }, + { + "epoch": 0.9375370112706842, + "grad_norm": 0.69140625, + "learning_rate": 8.732783882301937e-05, + "loss": 0.6994, + "step": 35406 + }, + { + "epoch": 0.9375634908789786, + "grad_norm": 0.765625, + "learning_rate": 8.732327220962877e-05, + "loss": 0.7097, + "step": 35407 + }, + { + "epoch": 0.9375899704872729, + "grad_norm": 0.83203125, + "learning_rate": 8.731870562310574e-05, + "loss": 0.7853, + "step": 35408 + }, + { + "epoch": 0.9376164500955673, + "grad_norm": 0.859375, + "learning_rate": 8.731413906346003e-05, + "loss": 0.7966, + "step": 35409 + }, + { + "epoch": 0.9376429297038616, + "grad_norm": 0.70703125, + "learning_rate": 8.730957253070127e-05, + "loss": 0.702, + "step": 35410 + }, + { + "epoch": 0.937669409312156, + "grad_norm": 0.78515625, + "learning_rate": 8.730500602483915e-05, + "loss": 0.6759, + "step": 35411 + }, + { + "epoch": 0.9376958889204504, + "grad_norm": 0.875, + "learning_rate": 8.730043954588335e-05, + "loss": 0.8434, + "step": 35412 + }, + { + "epoch": 0.9377223685287448, + "grad_norm": 0.76171875, + "learning_rate": 8.729587309384349e-05, + "loss": 0.8855, + "step": 35413 + }, + { + "epoch": 0.9377488481370392, + "grad_norm": 1.4765625, + "learning_rate": 8.729130666872935e-05, + "loss": 0.7442, + "step": 35414 + }, + { + "epoch": 0.9377753277453336, + "grad_norm": 0.84375, + "learning_rate": 8.728674027055057e-05, + "loss": 0.8619, + "step": 35415 + }, + { + "epoch": 0.937801807353628, + "grad_norm": 0.765625, + "learning_rate": 8.728217389931679e-05, + "loss": 0.8205, + "step": 35416 + }, + { + "epoch": 0.9378282869619223, + "grad_norm": 0.76171875, + "learning_rate": 8.727760755503773e-05, + "loss": 0.7417, + "step": 35417 + }, + { + "epoch": 0.9378547665702167, + "grad_norm": 0.73828125, + "learning_rate": 8.727304123772304e-05, + "loss": 0.8084, + "step": 35418 + }, + { + "epoch": 0.9378812461785111, + "grad_norm": 0.75, + "learning_rate": 8.726847494738241e-05, + "loss": 0.6905, + "step": 35419 + }, + { + "epoch": 0.9379077257868055, + "grad_norm": 0.77734375, + "learning_rate": 8.726390868402552e-05, + "loss": 0.8236, + "step": 35420 + }, + { + "epoch": 0.9379342053950999, + "grad_norm": 0.796875, + "learning_rate": 8.725934244766206e-05, + "loss": 0.8361, + "step": 35421 + }, + { + "epoch": 0.9379606850033942, + "grad_norm": 0.75390625, + "learning_rate": 8.72547762383017e-05, + "loss": 0.8857, + "step": 35422 + }, + { + "epoch": 0.9379871646116886, + "grad_norm": 0.76171875, + "learning_rate": 8.725021005595412e-05, + "loss": 0.7997, + "step": 35423 + }, + { + "epoch": 0.938013644219983, + "grad_norm": 0.796875, + "learning_rate": 8.724564390062895e-05, + "loss": 0.6988, + "step": 35424 + }, + { + "epoch": 0.9380401238282773, + "grad_norm": 0.82421875, + "learning_rate": 8.724107777233595e-05, + "loss": 0.8514, + "step": 35425 + }, + { + "epoch": 0.9380666034365717, + "grad_norm": 0.8671875, + "learning_rate": 8.723651167108474e-05, + "loss": 0.9237, + "step": 35426 + }, + { + "epoch": 0.938093083044866, + "grad_norm": 0.75390625, + "learning_rate": 8.723194559688502e-05, + "loss": 0.7734, + "step": 35427 + }, + { + "epoch": 0.9381195626531604, + "grad_norm": 0.8515625, + "learning_rate": 8.722737954974647e-05, + "loss": 0.7074, + "step": 35428 + }, + { + "epoch": 0.9381460422614548, + "grad_norm": 0.84765625, + "learning_rate": 8.722281352967871e-05, + "loss": 0.8962, + "step": 35429 + }, + { + "epoch": 0.9381725218697492, + "grad_norm": 0.859375, + "learning_rate": 8.721824753669149e-05, + "loss": 0.7083, + "step": 35430 + }, + { + "epoch": 0.9381990014780436, + "grad_norm": 0.796875, + "learning_rate": 8.72136815707945e-05, + "loss": 0.9243, + "step": 35431 + }, + { + "epoch": 0.938225481086338, + "grad_norm": 0.8515625, + "learning_rate": 8.720911563199736e-05, + "loss": 0.8019, + "step": 35432 + }, + { + "epoch": 0.9382519606946323, + "grad_norm": 0.78125, + "learning_rate": 8.720454972030977e-05, + "loss": 0.7916, + "step": 35433 + }, + { + "epoch": 0.9382784403029267, + "grad_norm": 0.73828125, + "learning_rate": 8.719998383574137e-05, + "loss": 0.8259, + "step": 35434 + }, + { + "epoch": 0.9383049199112211, + "grad_norm": 0.82421875, + "learning_rate": 8.719541797830191e-05, + "loss": 0.8393, + "step": 35435 + }, + { + "epoch": 0.9383313995195155, + "grad_norm": 0.69921875, + "learning_rate": 8.719085214800102e-05, + "loss": 0.6216, + "step": 35436 + }, + { + "epoch": 0.9383578791278099, + "grad_norm": 0.9140625, + "learning_rate": 8.71862863448484e-05, + "loss": 0.8121, + "step": 35437 + }, + { + "epoch": 0.9383843587361043, + "grad_norm": 0.7890625, + "learning_rate": 8.718172056885371e-05, + "loss": 0.7839, + "step": 35438 + }, + { + "epoch": 0.9384108383443986, + "grad_norm": 0.7578125, + "learning_rate": 8.717715482002659e-05, + "loss": 0.7685, + "step": 35439 + }, + { + "epoch": 0.938437317952693, + "grad_norm": 0.79296875, + "learning_rate": 8.71725890983768e-05, + "loss": 0.9, + "step": 35440 + }, + { + "epoch": 0.9384637975609873, + "grad_norm": 0.82421875, + "learning_rate": 8.716802340391397e-05, + "loss": 0.9407, + "step": 35441 + }, + { + "epoch": 0.9384902771692817, + "grad_norm": 0.7578125, + "learning_rate": 8.716345773664779e-05, + "loss": 0.7728, + "step": 35442 + }, + { + "epoch": 0.9385167567775761, + "grad_norm": 0.8671875, + "learning_rate": 8.715889209658791e-05, + "loss": 0.7487, + "step": 35443 + }, + { + "epoch": 0.9385432363858704, + "grad_norm": 0.7109375, + "learning_rate": 8.715432648374402e-05, + "loss": 0.8067, + "step": 35444 + }, + { + "epoch": 0.9385697159941648, + "grad_norm": 0.7578125, + "learning_rate": 8.714976089812581e-05, + "loss": 0.7413, + "step": 35445 + }, + { + "epoch": 0.9385961956024592, + "grad_norm": 0.75, + "learning_rate": 8.714519533974295e-05, + "loss": 0.6105, + "step": 35446 + }, + { + "epoch": 0.9386226752107536, + "grad_norm": 0.80078125, + "learning_rate": 8.714062980860513e-05, + "loss": 0.8751, + "step": 35447 + }, + { + "epoch": 0.938649154819048, + "grad_norm": 0.75, + "learning_rate": 8.713606430472199e-05, + "loss": 0.7594, + "step": 35448 + }, + { + "epoch": 0.9386756344273424, + "grad_norm": 0.74609375, + "learning_rate": 8.713149882810319e-05, + "loss": 0.7338, + "step": 35449 + }, + { + "epoch": 0.9387021140356367, + "grad_norm": 0.75, + "learning_rate": 8.712693337875848e-05, + "loss": 0.8008, + "step": 35450 + }, + { + "epoch": 0.9387285936439311, + "grad_norm": 1.4375, + "learning_rate": 8.71223679566975e-05, + "loss": 0.8545, + "step": 35451 + }, + { + "epoch": 0.9387550732522255, + "grad_norm": 0.78515625, + "learning_rate": 8.711780256192994e-05, + "loss": 0.8156, + "step": 35452 + }, + { + "epoch": 0.9387815528605199, + "grad_norm": 0.80078125, + "learning_rate": 8.711323719446545e-05, + "loss": 0.7959, + "step": 35453 + }, + { + "epoch": 0.9388080324688143, + "grad_norm": 0.8359375, + "learning_rate": 8.710867185431367e-05, + "loss": 0.7208, + "step": 35454 + }, + { + "epoch": 0.9388345120771087, + "grad_norm": 0.75390625, + "learning_rate": 8.710410654148437e-05, + "loss": 0.7122, + "step": 35455 + }, + { + "epoch": 0.938860991685403, + "grad_norm": 0.85546875, + "learning_rate": 8.709954125598719e-05, + "loss": 0.9611, + "step": 35456 + }, + { + "epoch": 0.9388874712936973, + "grad_norm": 0.83984375, + "learning_rate": 8.709497599783178e-05, + "loss": 0.796, + "step": 35457 + }, + { + "epoch": 0.9389139509019917, + "grad_norm": 0.7890625, + "learning_rate": 8.709041076702784e-05, + "loss": 0.8357, + "step": 35458 + }, + { + "epoch": 0.9389404305102861, + "grad_norm": 0.6953125, + "learning_rate": 8.7085845563585e-05, + "loss": 0.6733, + "step": 35459 + }, + { + "epoch": 0.9389669101185805, + "grad_norm": 0.83984375, + "learning_rate": 8.708128038751299e-05, + "loss": 0.7521, + "step": 35460 + }, + { + "epoch": 0.9389933897268748, + "grad_norm": 0.96484375, + "learning_rate": 8.707671523882149e-05, + "loss": 0.9317, + "step": 35461 + }, + { + "epoch": 0.9390198693351692, + "grad_norm": 0.79296875, + "learning_rate": 8.707215011752015e-05, + "loss": 0.8527, + "step": 35462 + }, + { + "epoch": 0.9390463489434636, + "grad_norm": 0.73828125, + "learning_rate": 8.706758502361863e-05, + "loss": 0.7587, + "step": 35463 + }, + { + "epoch": 0.939072828551758, + "grad_norm": 0.796875, + "learning_rate": 8.706301995712663e-05, + "loss": 0.7994, + "step": 35464 + }, + { + "epoch": 0.9390993081600524, + "grad_norm": 0.76953125, + "learning_rate": 8.705845491805382e-05, + "loss": 0.8647, + "step": 35465 + }, + { + "epoch": 0.9391257877683468, + "grad_norm": 0.7734375, + "learning_rate": 8.705388990640988e-05, + "loss": 0.7404, + "step": 35466 + }, + { + "epoch": 0.9391522673766411, + "grad_norm": 0.7265625, + "learning_rate": 8.704932492220448e-05, + "loss": 0.752, + "step": 35467 + }, + { + "epoch": 0.9391787469849355, + "grad_norm": 0.77734375, + "learning_rate": 8.704475996544729e-05, + "loss": 0.8235, + "step": 35468 + }, + { + "epoch": 0.9392052265932299, + "grad_norm": 0.84765625, + "learning_rate": 8.704019503614798e-05, + "loss": 0.8748, + "step": 35469 + }, + { + "epoch": 0.9392317062015243, + "grad_norm": 0.8046875, + "learning_rate": 8.703563013431626e-05, + "loss": 0.8218, + "step": 35470 + }, + { + "epoch": 0.9392581858098187, + "grad_norm": 0.7265625, + "learning_rate": 8.703106525996177e-05, + "loss": 0.6756, + "step": 35471 + }, + { + "epoch": 0.9392846654181131, + "grad_norm": 0.8046875, + "learning_rate": 8.702650041309421e-05, + "loss": 0.777, + "step": 35472 + }, + { + "epoch": 0.9393111450264074, + "grad_norm": 0.74609375, + "learning_rate": 8.702193559372324e-05, + "loss": 0.7783, + "step": 35473 + }, + { + "epoch": 0.9393376246347017, + "grad_norm": 0.8125, + "learning_rate": 8.701737080185848e-05, + "loss": 0.9069, + "step": 35474 + }, + { + "epoch": 0.9393641042429961, + "grad_norm": 0.83203125, + "learning_rate": 8.701280603750971e-05, + "loss": 0.8421, + "step": 35475 + }, + { + "epoch": 0.9393905838512905, + "grad_norm": 0.9296875, + "learning_rate": 8.700824130068655e-05, + "loss": 0.8975, + "step": 35476 + }, + { + "epoch": 0.9394170634595849, + "grad_norm": 0.765625, + "learning_rate": 8.700367659139869e-05, + "loss": 0.7317, + "step": 35477 + }, + { + "epoch": 0.9394435430678792, + "grad_norm": 0.82421875, + "learning_rate": 8.699911190965579e-05, + "loss": 0.7365, + "step": 35478 + }, + { + "epoch": 0.9394700226761736, + "grad_norm": 0.7734375, + "learning_rate": 8.699454725546749e-05, + "loss": 0.7912, + "step": 35479 + }, + { + "epoch": 0.939496502284468, + "grad_norm": 0.8203125, + "learning_rate": 8.698998262884354e-05, + "loss": 0.874, + "step": 35480 + }, + { + "epoch": 0.9395229818927624, + "grad_norm": 0.796875, + "learning_rate": 8.698541802979358e-05, + "loss": 0.674, + "step": 35481 + }, + { + "epoch": 0.9395494615010568, + "grad_norm": 0.796875, + "learning_rate": 8.69808534583273e-05, + "loss": 0.7908, + "step": 35482 + }, + { + "epoch": 0.9395759411093512, + "grad_norm": 0.7265625, + "learning_rate": 8.697628891445433e-05, + "loss": 0.7436, + "step": 35483 + }, + { + "epoch": 0.9396024207176455, + "grad_norm": 0.7734375, + "learning_rate": 8.697172439818436e-05, + "loss": 0.7476, + "step": 35484 + }, + { + "epoch": 0.9396289003259399, + "grad_norm": 0.79296875, + "learning_rate": 8.696715990952709e-05, + "loss": 0.7929, + "step": 35485 + }, + { + "epoch": 0.9396553799342343, + "grad_norm": 0.8671875, + "learning_rate": 8.69625954484922e-05, + "loss": 0.6278, + "step": 35486 + }, + { + "epoch": 0.9396818595425287, + "grad_norm": 0.80859375, + "learning_rate": 8.69580310150893e-05, + "loss": 0.749, + "step": 35487 + }, + { + "epoch": 0.9397083391508231, + "grad_norm": 0.80859375, + "learning_rate": 8.695346660932815e-05, + "loss": 0.8144, + "step": 35488 + }, + { + "epoch": 0.9397348187591175, + "grad_norm": 0.84375, + "learning_rate": 8.694890223121835e-05, + "loss": 0.899, + "step": 35489 + }, + { + "epoch": 0.9397612983674117, + "grad_norm": 0.90625, + "learning_rate": 8.694433788076963e-05, + "loss": 0.7798, + "step": 35490 + }, + { + "epoch": 0.9397877779757061, + "grad_norm": 0.84765625, + "learning_rate": 8.693977355799163e-05, + "loss": 0.8542, + "step": 35491 + }, + { + "epoch": 0.9398142575840005, + "grad_norm": 0.75, + "learning_rate": 8.693520926289405e-05, + "loss": 0.8691, + "step": 35492 + }, + { + "epoch": 0.9398407371922949, + "grad_norm": 0.8203125, + "learning_rate": 8.693064499548653e-05, + "loss": 0.8366, + "step": 35493 + }, + { + "epoch": 0.9398672168005893, + "grad_norm": 0.82421875, + "learning_rate": 8.692608075577873e-05, + "loss": 0.777, + "step": 35494 + }, + { + "epoch": 0.9398936964088836, + "grad_norm": 0.82421875, + "learning_rate": 8.692151654378038e-05, + "loss": 0.8436, + "step": 35495 + }, + { + "epoch": 0.939920176017178, + "grad_norm": 0.80859375, + "learning_rate": 8.691695235950115e-05, + "loss": 0.8924, + "step": 35496 + }, + { + "epoch": 0.9399466556254724, + "grad_norm": 0.87109375, + "learning_rate": 8.691238820295069e-05, + "loss": 0.8207, + "step": 35497 + }, + { + "epoch": 0.9399731352337668, + "grad_norm": 0.796875, + "learning_rate": 8.690782407413865e-05, + "loss": 0.8768, + "step": 35498 + }, + { + "epoch": 0.9399996148420612, + "grad_norm": 0.8046875, + "learning_rate": 8.690325997307472e-05, + "loss": 0.8163, + "step": 35499 + }, + { + "epoch": 0.9400260944503556, + "grad_norm": 0.73828125, + "learning_rate": 8.68986958997686e-05, + "loss": 0.8709, + "step": 35500 + }, + { + "epoch": 0.94005257405865, + "grad_norm": 0.79296875, + "learning_rate": 8.689413185422995e-05, + "loss": 0.7927, + "step": 35501 + }, + { + "epoch": 0.9400790536669443, + "grad_norm": 0.77734375, + "learning_rate": 8.688956783646844e-05, + "loss": 0.702, + "step": 35502 + }, + { + "epoch": 0.9401055332752387, + "grad_norm": 0.7734375, + "learning_rate": 8.688500384649375e-05, + "loss": 0.8155, + "step": 35503 + }, + { + "epoch": 0.9401320128835331, + "grad_norm": 0.73046875, + "learning_rate": 8.688043988431552e-05, + "loss": 0.8398, + "step": 35504 + }, + { + "epoch": 0.9401584924918275, + "grad_norm": 0.7734375, + "learning_rate": 8.687587594994344e-05, + "loss": 0.8769, + "step": 35505 + }, + { + "epoch": 0.9401849721001218, + "grad_norm": 0.78125, + "learning_rate": 8.687131204338721e-05, + "loss": 0.818, + "step": 35506 + }, + { + "epoch": 0.9402114517084161, + "grad_norm": 0.9296875, + "learning_rate": 8.68667481646565e-05, + "loss": 0.8238, + "step": 35507 + }, + { + "epoch": 0.9402379313167105, + "grad_norm": 0.78125, + "learning_rate": 8.686218431376096e-05, + "loss": 0.781, + "step": 35508 + }, + { + "epoch": 0.9402644109250049, + "grad_norm": 0.74609375, + "learning_rate": 8.685762049071023e-05, + "loss": 0.827, + "step": 35509 + }, + { + "epoch": 0.9402908905332993, + "grad_norm": 0.79296875, + "learning_rate": 8.685305669551406e-05, + "loss": 0.7691, + "step": 35510 + }, + { + "epoch": 0.9403173701415937, + "grad_norm": 0.8828125, + "learning_rate": 8.68484929281821e-05, + "loss": 0.794, + "step": 35511 + }, + { + "epoch": 0.940343849749888, + "grad_norm": 0.9765625, + "learning_rate": 8.684392918872398e-05, + "loss": 0.8917, + "step": 35512 + }, + { + "epoch": 0.9403703293581824, + "grad_norm": 0.828125, + "learning_rate": 8.683936547714941e-05, + "loss": 0.8838, + "step": 35513 + }, + { + "epoch": 0.9403968089664768, + "grad_norm": 0.734375, + "learning_rate": 8.683480179346801e-05, + "loss": 0.7415, + "step": 35514 + }, + { + "epoch": 0.9404232885747712, + "grad_norm": 0.8203125, + "learning_rate": 8.683023813768955e-05, + "loss": 0.7504, + "step": 35515 + }, + { + "epoch": 0.9404497681830656, + "grad_norm": 0.953125, + "learning_rate": 8.682567450982363e-05, + "loss": 0.8263, + "step": 35516 + }, + { + "epoch": 0.94047624779136, + "grad_norm": 0.83984375, + "learning_rate": 8.682111090987994e-05, + "loss": 0.9313, + "step": 35517 + }, + { + "epoch": 0.9405027273996543, + "grad_norm": 0.77734375, + "learning_rate": 8.681654733786817e-05, + "loss": 0.7998, + "step": 35518 + }, + { + "epoch": 0.9405292070079487, + "grad_norm": 1.0078125, + "learning_rate": 8.681198379379791e-05, + "loss": 0.8664, + "step": 35519 + }, + { + "epoch": 0.9405556866162431, + "grad_norm": 0.765625, + "learning_rate": 8.680742027767895e-05, + "loss": 0.8165, + "step": 35520 + }, + { + "epoch": 0.9405821662245375, + "grad_norm": 0.796875, + "learning_rate": 8.680285678952091e-05, + "loss": 0.7337, + "step": 35521 + }, + { + "epoch": 0.9406086458328319, + "grad_norm": 0.7578125, + "learning_rate": 8.679829332933346e-05, + "loss": 0.7572, + "step": 35522 + }, + { + "epoch": 0.9406351254411262, + "grad_norm": 0.78515625, + "learning_rate": 8.679372989712628e-05, + "loss": 0.7867, + "step": 35523 + }, + { + "epoch": 0.9406616050494205, + "grad_norm": 0.77734375, + "learning_rate": 8.678916649290898e-05, + "loss": 0.7964, + "step": 35524 + }, + { + "epoch": 0.9406880846577149, + "grad_norm": 0.80859375, + "learning_rate": 8.678460311669133e-05, + "loss": 0.7854, + "step": 35525 + }, + { + "epoch": 0.9407145642660093, + "grad_norm": 0.69140625, + "learning_rate": 8.678003976848296e-05, + "loss": 0.699, + "step": 35526 + }, + { + "epoch": 0.9407410438743037, + "grad_norm": 0.734375, + "learning_rate": 8.677547644829353e-05, + "loss": 0.6677, + "step": 35527 + }, + { + "epoch": 0.9407675234825981, + "grad_norm": 0.78515625, + "learning_rate": 8.677091315613275e-05, + "loss": 0.7465, + "step": 35528 + }, + { + "epoch": 0.9407940030908925, + "grad_norm": 0.75390625, + "learning_rate": 8.676634989201022e-05, + "loss": 0.8187, + "step": 35529 + }, + { + "epoch": 0.9408204826991868, + "grad_norm": 0.79296875, + "learning_rate": 8.676178665593568e-05, + "loss": 0.7287, + "step": 35530 + }, + { + "epoch": 0.9408469623074812, + "grad_norm": 0.80078125, + "learning_rate": 8.675722344791877e-05, + "loss": 0.7247, + "step": 35531 + }, + { + "epoch": 0.9408734419157756, + "grad_norm": 0.76171875, + "learning_rate": 8.675266026796919e-05, + "loss": 0.7342, + "step": 35532 + }, + { + "epoch": 0.94089992152407, + "grad_norm": 0.75, + "learning_rate": 8.674809711609657e-05, + "loss": 0.7696, + "step": 35533 + }, + { + "epoch": 0.9409264011323644, + "grad_norm": 0.78515625, + "learning_rate": 8.674353399231057e-05, + "loss": 0.796, + "step": 35534 + }, + { + "epoch": 0.9409528807406587, + "grad_norm": 0.76171875, + "learning_rate": 8.673897089662092e-05, + "loss": 0.6744, + "step": 35535 + }, + { + "epoch": 0.9409793603489531, + "grad_norm": 0.7734375, + "learning_rate": 8.673440782903728e-05, + "loss": 0.7775, + "step": 35536 + }, + { + "epoch": 0.9410058399572475, + "grad_norm": 0.80859375, + "learning_rate": 8.672984478956929e-05, + "loss": 0.7835, + "step": 35537 + }, + { + "epoch": 0.9410323195655419, + "grad_norm": 0.8046875, + "learning_rate": 8.672528177822664e-05, + "loss": 0.8102, + "step": 35538 + }, + { + "epoch": 0.9410587991738362, + "grad_norm": 0.76171875, + "learning_rate": 8.672071879501894e-05, + "loss": 0.6051, + "step": 35539 + }, + { + "epoch": 0.9410852787821306, + "grad_norm": 0.72265625, + "learning_rate": 8.6716155839956e-05, + "loss": 0.6604, + "step": 35540 + }, + { + "epoch": 0.9411117583904249, + "grad_norm": 0.80078125, + "learning_rate": 8.671159291304738e-05, + "loss": 0.8248, + "step": 35541 + }, + { + "epoch": 0.9411382379987193, + "grad_norm": 0.734375, + "learning_rate": 8.670703001430278e-05, + "loss": 0.8278, + "step": 35542 + }, + { + "epoch": 0.9411647176070137, + "grad_norm": 0.7734375, + "learning_rate": 8.670246714373187e-05, + "loss": 0.9386, + "step": 35543 + }, + { + "epoch": 0.9411911972153081, + "grad_norm": 0.75, + "learning_rate": 8.669790430134428e-05, + "loss": 0.7534, + "step": 35544 + }, + { + "epoch": 0.9412176768236025, + "grad_norm": 0.76171875, + "learning_rate": 8.669334148714976e-05, + "loss": 0.8349, + "step": 35545 + }, + { + "epoch": 0.9412441564318969, + "grad_norm": 0.7890625, + "learning_rate": 8.668877870115795e-05, + "loss": 0.7817, + "step": 35546 + }, + { + "epoch": 0.9412706360401912, + "grad_norm": 0.71484375, + "learning_rate": 8.668421594337851e-05, + "loss": 0.6702, + "step": 35547 + }, + { + "epoch": 0.9412971156484856, + "grad_norm": 0.71875, + "learning_rate": 8.667965321382112e-05, + "loss": 0.7491, + "step": 35548 + }, + { + "epoch": 0.94132359525678, + "grad_norm": 0.828125, + "learning_rate": 8.667509051249541e-05, + "loss": 0.7827, + "step": 35549 + }, + { + "epoch": 0.9413500748650744, + "grad_norm": 0.74609375, + "learning_rate": 8.667052783941112e-05, + "loss": 0.647, + "step": 35550 + }, + { + "epoch": 0.9413765544733688, + "grad_norm": 0.8125, + "learning_rate": 8.666596519457787e-05, + "loss": 0.756, + "step": 35551 + }, + { + "epoch": 0.9414030340816631, + "grad_norm": 0.7890625, + "learning_rate": 8.666140257800534e-05, + "loss": 0.8157, + "step": 35552 + }, + { + "epoch": 0.9414295136899575, + "grad_norm": 0.73828125, + "learning_rate": 8.665683998970322e-05, + "loss": 0.7003, + "step": 35553 + }, + { + "epoch": 0.9414559932982519, + "grad_norm": 0.984375, + "learning_rate": 8.665227742968111e-05, + "loss": 0.7691, + "step": 35554 + }, + { + "epoch": 0.9414824729065462, + "grad_norm": 0.765625, + "learning_rate": 8.66477148979488e-05, + "loss": 0.7157, + "step": 35555 + }, + { + "epoch": 0.9415089525148406, + "grad_norm": 0.79296875, + "learning_rate": 8.664315239451587e-05, + "loss": 0.8078, + "step": 35556 + }, + { + "epoch": 0.941535432123135, + "grad_norm": 0.765625, + "learning_rate": 8.663858991939203e-05, + "loss": 0.7928, + "step": 35557 + }, + { + "epoch": 0.9415619117314293, + "grad_norm": 0.74609375, + "learning_rate": 8.663402747258693e-05, + "loss": 0.6584, + "step": 35558 + }, + { + "epoch": 0.9415883913397237, + "grad_norm": 0.7265625, + "learning_rate": 8.66294650541102e-05, + "loss": 0.7452, + "step": 35559 + }, + { + "epoch": 0.9416148709480181, + "grad_norm": 0.80078125, + "learning_rate": 8.662490266397161e-05, + "loss": 0.8749, + "step": 35560 + }, + { + "epoch": 0.9416413505563125, + "grad_norm": 0.79296875, + "learning_rate": 8.662034030218076e-05, + "loss": 0.8485, + "step": 35561 + }, + { + "epoch": 0.9416678301646069, + "grad_norm": 0.7265625, + "learning_rate": 8.661577796874734e-05, + "loss": 0.7054, + "step": 35562 + }, + { + "epoch": 0.9416943097729013, + "grad_norm": 0.80859375, + "learning_rate": 8.6611215663681e-05, + "loss": 0.7832, + "step": 35563 + }, + { + "epoch": 0.9417207893811956, + "grad_norm": 0.9140625, + "learning_rate": 8.660665338699145e-05, + "loss": 0.7061, + "step": 35564 + }, + { + "epoch": 0.94174726898949, + "grad_norm": 1.890625, + "learning_rate": 8.660209113868827e-05, + "loss": 0.776, + "step": 35565 + }, + { + "epoch": 0.9417737485977844, + "grad_norm": 0.75390625, + "learning_rate": 8.659752891878123e-05, + "loss": 0.7342, + "step": 35566 + }, + { + "epoch": 0.9418002282060788, + "grad_norm": 0.8828125, + "learning_rate": 8.659296672727997e-05, + "loss": 0.7114, + "step": 35567 + }, + { + "epoch": 0.9418267078143732, + "grad_norm": 0.84765625, + "learning_rate": 8.658840456419416e-05, + "loss": 0.7995, + "step": 35568 + }, + { + "epoch": 0.9418531874226675, + "grad_norm": 0.7890625, + "learning_rate": 8.658384242953345e-05, + "loss": 0.7943, + "step": 35569 + }, + { + "epoch": 0.9418796670309619, + "grad_norm": 0.8046875, + "learning_rate": 8.65792803233075e-05, + "loss": 0.7784, + "step": 35570 + }, + { + "epoch": 0.9419061466392563, + "grad_norm": 0.75390625, + "learning_rate": 8.657471824552602e-05, + "loss": 0.8696, + "step": 35571 + }, + { + "epoch": 0.9419326262475506, + "grad_norm": 0.80078125, + "learning_rate": 8.657015619619864e-05, + "loss": 0.7834, + "step": 35572 + }, + { + "epoch": 0.941959105855845, + "grad_norm": 0.79296875, + "learning_rate": 8.656559417533505e-05, + "loss": 0.7926, + "step": 35573 + }, + { + "epoch": 0.9419855854641394, + "grad_norm": 0.8359375, + "learning_rate": 8.656103218294493e-05, + "loss": 0.8631, + "step": 35574 + }, + { + "epoch": 0.9420120650724337, + "grad_norm": 0.796875, + "learning_rate": 8.65564702190379e-05, + "loss": 0.777, + "step": 35575 + }, + { + "epoch": 0.9420385446807281, + "grad_norm": 0.8359375, + "learning_rate": 8.65519082836237e-05, + "loss": 0.8495, + "step": 35576 + }, + { + "epoch": 0.9420650242890225, + "grad_norm": 0.72265625, + "learning_rate": 8.654734637671196e-05, + "loss": 0.7563, + "step": 35577 + }, + { + "epoch": 0.9420915038973169, + "grad_norm": 0.86328125, + "learning_rate": 8.654278449831234e-05, + "loss": 0.7595, + "step": 35578 + }, + { + "epoch": 0.9421179835056113, + "grad_norm": 0.83203125, + "learning_rate": 8.653822264843454e-05, + "loss": 0.7834, + "step": 35579 + }, + { + "epoch": 0.9421444631139057, + "grad_norm": 0.89453125, + "learning_rate": 8.653366082708815e-05, + "loss": 0.8059, + "step": 35580 + }, + { + "epoch": 0.9421709427222, + "grad_norm": 0.6953125, + "learning_rate": 8.652909903428293e-05, + "loss": 0.7263, + "step": 35581 + }, + { + "epoch": 0.9421974223304944, + "grad_norm": 0.78125, + "learning_rate": 8.652453727002854e-05, + "loss": 0.7872, + "step": 35582 + }, + { + "epoch": 0.9422239019387888, + "grad_norm": 0.80859375, + "learning_rate": 8.651997553433459e-05, + "loss": 0.7062, + "step": 35583 + }, + { + "epoch": 0.9422503815470832, + "grad_norm": 0.734375, + "learning_rate": 8.651541382721082e-05, + "loss": 0.8145, + "step": 35584 + }, + { + "epoch": 0.9422768611553776, + "grad_norm": 0.86328125, + "learning_rate": 8.651085214866679e-05, + "loss": 0.8775, + "step": 35585 + }, + { + "epoch": 0.942303340763672, + "grad_norm": 0.74609375, + "learning_rate": 8.650629049871229e-05, + "loss": 0.7801, + "step": 35586 + }, + { + "epoch": 0.9423298203719663, + "grad_norm": 0.7578125, + "learning_rate": 8.650172887735693e-05, + "loss": 0.6849, + "step": 35587 + }, + { + "epoch": 0.9423562999802606, + "grad_norm": 0.82421875, + "learning_rate": 8.649716728461038e-05, + "loss": 0.8074, + "step": 35588 + }, + { + "epoch": 0.942382779588555, + "grad_norm": 0.78515625, + "learning_rate": 8.649260572048233e-05, + "loss": 0.6695, + "step": 35589 + }, + { + "epoch": 0.9424092591968494, + "grad_norm": 0.79296875, + "learning_rate": 8.648804418498239e-05, + "loss": 0.7172, + "step": 35590 + }, + { + "epoch": 0.9424357388051438, + "grad_norm": 0.8203125, + "learning_rate": 8.648348267812027e-05, + "loss": 0.8341, + "step": 35591 + }, + { + "epoch": 0.9424622184134381, + "grad_norm": 0.796875, + "learning_rate": 8.647892119990566e-05, + "loss": 0.8239, + "step": 35592 + }, + { + "epoch": 0.9424886980217325, + "grad_norm": 0.76171875, + "learning_rate": 8.647435975034822e-05, + "loss": 0.8195, + "step": 35593 + }, + { + "epoch": 0.9425151776300269, + "grad_norm": 0.8125, + "learning_rate": 8.646979832945758e-05, + "loss": 0.7708, + "step": 35594 + }, + { + "epoch": 0.9425416572383213, + "grad_norm": 0.69140625, + "learning_rate": 8.64652369372434e-05, + "loss": 0.7597, + "step": 35595 + }, + { + "epoch": 0.9425681368466157, + "grad_norm": 0.75390625, + "learning_rate": 8.646067557371543e-05, + "loss": 0.8225, + "step": 35596 + }, + { + "epoch": 0.94259461645491, + "grad_norm": 0.80078125, + "learning_rate": 8.645611423888327e-05, + "loss": 0.7998, + "step": 35597 + }, + { + "epoch": 0.9426210960632044, + "grad_norm": 0.75390625, + "learning_rate": 8.645155293275659e-05, + "loss": 0.7372, + "step": 35598 + }, + { + "epoch": 0.9426475756714988, + "grad_norm": 0.78515625, + "learning_rate": 8.644699165534508e-05, + "loss": 0.8403, + "step": 35599 + }, + { + "epoch": 0.9426740552797932, + "grad_norm": 0.76171875, + "learning_rate": 8.644243040665836e-05, + "loss": 0.792, + "step": 35600 + }, + { + "epoch": 0.9427005348880876, + "grad_norm": 0.86328125, + "learning_rate": 8.643786918670617e-05, + "loss": 0.8314, + "step": 35601 + }, + { + "epoch": 0.942727014496382, + "grad_norm": 0.90625, + "learning_rate": 8.643330799549814e-05, + "loss": 0.6899, + "step": 35602 + }, + { + "epoch": 0.9427534941046763, + "grad_norm": 0.85546875, + "learning_rate": 8.642874683304394e-05, + "loss": 0.9921, + "step": 35603 + }, + { + "epoch": 0.9427799737129706, + "grad_norm": 0.8203125, + "learning_rate": 8.642418569935323e-05, + "loss": 0.8641, + "step": 35604 + }, + { + "epoch": 0.942806453321265, + "grad_norm": 0.8359375, + "learning_rate": 8.641962459443565e-05, + "loss": 0.7791, + "step": 35605 + }, + { + "epoch": 0.9428329329295594, + "grad_norm": 0.7734375, + "learning_rate": 8.641506351830094e-05, + "loss": 0.7203, + "step": 35606 + }, + { + "epoch": 0.9428594125378538, + "grad_norm": 0.76171875, + "learning_rate": 8.641050247095872e-05, + "loss": 0.8752, + "step": 35607 + }, + { + "epoch": 0.9428858921461482, + "grad_norm": 0.84765625, + "learning_rate": 8.640594145241867e-05, + "loss": 0.841, + "step": 35608 + }, + { + "epoch": 0.9429123717544425, + "grad_norm": 0.8125, + "learning_rate": 8.640138046269043e-05, + "loss": 0.7413, + "step": 35609 + }, + { + "epoch": 0.9429388513627369, + "grad_norm": 0.7734375, + "learning_rate": 8.639681950178367e-05, + "loss": 0.7926, + "step": 35610 + }, + { + "epoch": 0.9429653309710313, + "grad_norm": 0.8828125, + "learning_rate": 8.63922585697081e-05, + "loss": 0.7542, + "step": 35611 + }, + { + "epoch": 0.9429918105793257, + "grad_norm": 0.78125, + "learning_rate": 8.638769766647336e-05, + "loss": 0.8748, + "step": 35612 + }, + { + "epoch": 0.9430182901876201, + "grad_norm": 0.77734375, + "learning_rate": 8.638313679208912e-05, + "loss": 0.7075, + "step": 35613 + }, + { + "epoch": 0.9430447697959145, + "grad_norm": 0.7890625, + "learning_rate": 8.637857594656505e-05, + "loss": 0.883, + "step": 35614 + }, + { + "epoch": 0.9430712494042088, + "grad_norm": 0.69921875, + "learning_rate": 8.637401512991079e-05, + "loss": 0.6577, + "step": 35615 + }, + { + "epoch": 0.9430977290125032, + "grad_norm": 0.68359375, + "learning_rate": 8.636945434213603e-05, + "loss": 0.6436, + "step": 35616 + }, + { + "epoch": 0.9431242086207976, + "grad_norm": 0.72265625, + "learning_rate": 8.636489358325046e-05, + "loss": 0.7416, + "step": 35617 + }, + { + "epoch": 0.943150688229092, + "grad_norm": 0.7578125, + "learning_rate": 8.63603328532637e-05, + "loss": 0.8862, + "step": 35618 + }, + { + "epoch": 0.9431771678373864, + "grad_norm": 0.82421875, + "learning_rate": 8.635577215218545e-05, + "loss": 0.8429, + "step": 35619 + }, + { + "epoch": 0.9432036474456807, + "grad_norm": 0.74609375, + "learning_rate": 8.635121148002529e-05, + "loss": 0.7801, + "step": 35620 + }, + { + "epoch": 0.943230127053975, + "grad_norm": 0.76171875, + "learning_rate": 8.634665083679302e-05, + "loss": 0.831, + "step": 35621 + }, + { + "epoch": 0.9432566066622694, + "grad_norm": 0.859375, + "learning_rate": 8.634209022249824e-05, + "loss": 0.7736, + "step": 35622 + }, + { + "epoch": 0.9432830862705638, + "grad_norm": 0.765625, + "learning_rate": 8.633752963715062e-05, + "loss": 0.8502, + "step": 35623 + }, + { + "epoch": 0.9433095658788582, + "grad_norm": 0.81640625, + "learning_rate": 8.633296908075982e-05, + "loss": 0.786, + "step": 35624 + }, + { + "epoch": 0.9433360454871526, + "grad_norm": 0.765625, + "learning_rate": 8.632840855333547e-05, + "loss": 0.7872, + "step": 35625 + }, + { + "epoch": 0.9433625250954469, + "grad_norm": 0.7734375, + "learning_rate": 8.632384805488732e-05, + "loss": 0.8067, + "step": 35626 + }, + { + "epoch": 0.9433890047037413, + "grad_norm": 0.6953125, + "learning_rate": 8.6319287585425e-05, + "loss": 0.7699, + "step": 35627 + }, + { + "epoch": 0.9434154843120357, + "grad_norm": 0.70703125, + "learning_rate": 8.631472714495817e-05, + "loss": 0.8227, + "step": 35628 + }, + { + "epoch": 0.9434419639203301, + "grad_norm": 0.85546875, + "learning_rate": 8.631016673349648e-05, + "loss": 0.8129, + "step": 35629 + }, + { + "epoch": 0.9434684435286245, + "grad_norm": 0.83203125, + "learning_rate": 8.630560635104956e-05, + "loss": 0.7438, + "step": 35630 + }, + { + "epoch": 0.9434949231369189, + "grad_norm": 0.7421875, + "learning_rate": 8.630104599762719e-05, + "loss": 0.6933, + "step": 35631 + }, + { + "epoch": 0.9435214027452132, + "grad_norm": 0.81640625, + "learning_rate": 8.629648567323896e-05, + "loss": 0.9004, + "step": 35632 + }, + { + "epoch": 0.9435478823535076, + "grad_norm": 0.80859375, + "learning_rate": 8.629192537789454e-05, + "loss": 0.7226, + "step": 35633 + }, + { + "epoch": 0.943574361961802, + "grad_norm": 0.80078125, + "learning_rate": 8.62873651116036e-05, + "loss": 0.7509, + "step": 35634 + }, + { + "epoch": 0.9436008415700964, + "grad_norm": 0.75, + "learning_rate": 8.628280487437579e-05, + "loss": 0.7535, + "step": 35635 + }, + { + "epoch": 0.9436273211783908, + "grad_norm": 0.97265625, + "learning_rate": 8.627824466622081e-05, + "loss": 0.8201, + "step": 35636 + }, + { + "epoch": 0.943653800786685, + "grad_norm": 0.78515625, + "learning_rate": 8.627368448714831e-05, + "loss": 0.7987, + "step": 35637 + }, + { + "epoch": 0.9436802803949794, + "grad_norm": 0.78125, + "learning_rate": 8.626912433716795e-05, + "loss": 0.7264, + "step": 35638 + }, + { + "epoch": 0.9437067600032738, + "grad_norm": 0.75, + "learning_rate": 8.626456421628938e-05, + "loss": 0.6662, + "step": 35639 + }, + { + "epoch": 0.9437332396115682, + "grad_norm": 0.7890625, + "learning_rate": 8.626000412452226e-05, + "loss": 0.7558, + "step": 35640 + }, + { + "epoch": 0.9437597192198626, + "grad_norm": 0.765625, + "learning_rate": 8.625544406187632e-05, + "loss": 0.7344, + "step": 35641 + }, + { + "epoch": 0.943786198828157, + "grad_norm": 0.8125, + "learning_rate": 8.625088402836118e-05, + "loss": 0.7352, + "step": 35642 + }, + { + "epoch": 0.9438126784364513, + "grad_norm": 0.84375, + "learning_rate": 8.624632402398649e-05, + "loss": 0.7282, + "step": 35643 + }, + { + "epoch": 0.9438391580447457, + "grad_norm": 0.8515625, + "learning_rate": 8.624176404876194e-05, + "loss": 0.6849, + "step": 35644 + }, + { + "epoch": 0.9438656376530401, + "grad_norm": 0.671875, + "learning_rate": 8.623720410269714e-05, + "loss": 0.7647, + "step": 35645 + }, + { + "epoch": 0.9438921172613345, + "grad_norm": 0.7890625, + "learning_rate": 8.623264418580185e-05, + "loss": 0.7408, + "step": 35646 + }, + { + "epoch": 0.9439185968696289, + "grad_norm": 0.74609375, + "learning_rate": 8.622808429808567e-05, + "loss": 0.7736, + "step": 35647 + }, + { + "epoch": 0.9439450764779233, + "grad_norm": 0.99609375, + "learning_rate": 8.622352443955829e-05, + "loss": 0.8401, + "step": 35648 + }, + { + "epoch": 0.9439715560862176, + "grad_norm": 0.74609375, + "learning_rate": 8.621896461022936e-05, + "loss": 0.7741, + "step": 35649 + }, + { + "epoch": 0.943998035694512, + "grad_norm": 0.8203125, + "learning_rate": 8.62144048101085e-05, + "loss": 0.7005, + "step": 35650 + }, + { + "epoch": 0.9440245153028064, + "grad_norm": 0.75390625, + "learning_rate": 8.620984503920547e-05, + "loss": 0.755, + "step": 35651 + }, + { + "epoch": 0.9440509949111008, + "grad_norm": 0.875, + "learning_rate": 8.620528529752987e-05, + "loss": 0.8112, + "step": 35652 + }, + { + "epoch": 0.944077474519395, + "grad_norm": 0.7578125, + "learning_rate": 8.62007255850914e-05, + "loss": 0.6913, + "step": 35653 + }, + { + "epoch": 0.9441039541276894, + "grad_norm": 0.82421875, + "learning_rate": 8.619616590189968e-05, + "loss": 0.7723, + "step": 35654 + }, + { + "epoch": 0.9441304337359838, + "grad_norm": 0.8359375, + "learning_rate": 8.619160624796439e-05, + "loss": 0.7779, + "step": 35655 + }, + { + "epoch": 0.9441569133442782, + "grad_norm": 0.8125, + "learning_rate": 8.618704662329522e-05, + "loss": 0.7425, + "step": 35656 + }, + { + "epoch": 0.9441833929525726, + "grad_norm": 0.765625, + "learning_rate": 8.618248702790181e-05, + "loss": 0.8143, + "step": 35657 + }, + { + "epoch": 0.944209872560867, + "grad_norm": 0.8046875, + "learning_rate": 8.617792746179383e-05, + "loss": 0.7851, + "step": 35658 + }, + { + "epoch": 0.9442363521691614, + "grad_norm": 0.7265625, + "learning_rate": 8.617336792498094e-05, + "loss": 0.7628, + "step": 35659 + }, + { + "epoch": 0.9442628317774557, + "grad_norm": 0.76953125, + "learning_rate": 8.61688084174728e-05, + "loss": 0.8271, + "step": 35660 + }, + { + "epoch": 0.9442893113857501, + "grad_norm": 0.765625, + "learning_rate": 8.616424893927909e-05, + "loss": 0.7114, + "step": 35661 + }, + { + "epoch": 0.9443157909940445, + "grad_norm": 0.765625, + "learning_rate": 8.615968949040947e-05, + "loss": 0.6988, + "step": 35662 + }, + { + "epoch": 0.9443422706023389, + "grad_norm": 0.83984375, + "learning_rate": 8.615513007087359e-05, + "loss": 0.7889, + "step": 35663 + }, + { + "epoch": 0.9443687502106333, + "grad_norm": 0.77734375, + "learning_rate": 8.615057068068113e-05, + "loss": 0.7106, + "step": 35664 + }, + { + "epoch": 0.9443952298189277, + "grad_norm": 0.8671875, + "learning_rate": 8.61460113198417e-05, + "loss": 0.8581, + "step": 35665 + }, + { + "epoch": 0.944421709427222, + "grad_norm": 0.7734375, + "learning_rate": 8.614145198836504e-05, + "loss": 0.7194, + "step": 35666 + }, + { + "epoch": 0.9444481890355164, + "grad_norm": 0.734375, + "learning_rate": 8.61368926862608e-05, + "loss": 0.6923, + "step": 35667 + }, + { + "epoch": 0.9444746686438108, + "grad_norm": 0.8515625, + "learning_rate": 8.613233341353861e-05, + "loss": 0.749, + "step": 35668 + }, + { + "epoch": 0.9445011482521052, + "grad_norm": 0.80078125, + "learning_rate": 8.612777417020815e-05, + "loss": 0.7644, + "step": 35669 + }, + { + "epoch": 0.9445276278603995, + "grad_norm": 0.7578125, + "learning_rate": 8.612321495627902e-05, + "loss": 0.8014, + "step": 35670 + }, + { + "epoch": 0.9445541074686938, + "grad_norm": 0.71875, + "learning_rate": 8.611865577176101e-05, + "loss": 0.6701, + "step": 35671 + }, + { + "epoch": 0.9445805870769882, + "grad_norm": 0.73828125, + "learning_rate": 8.61140966166637e-05, + "loss": 0.6801, + "step": 35672 + }, + { + "epoch": 0.9446070666852826, + "grad_norm": 0.8046875, + "learning_rate": 8.610953749099678e-05, + "loss": 0.8354, + "step": 35673 + }, + { + "epoch": 0.944633546293577, + "grad_norm": 0.71484375, + "learning_rate": 8.610497839476989e-05, + "loss": 0.6962, + "step": 35674 + }, + { + "epoch": 0.9446600259018714, + "grad_norm": 0.78125, + "learning_rate": 8.610041932799269e-05, + "loss": 0.7207, + "step": 35675 + }, + { + "epoch": 0.9446865055101658, + "grad_norm": 0.734375, + "learning_rate": 8.609586029067485e-05, + "loss": 0.8289, + "step": 35676 + }, + { + "epoch": 0.9447129851184601, + "grad_norm": 0.76171875, + "learning_rate": 8.609130128282607e-05, + "loss": 0.7644, + "step": 35677 + }, + { + "epoch": 0.9447394647267545, + "grad_norm": 0.8125, + "learning_rate": 8.608674230445597e-05, + "loss": 0.7592, + "step": 35678 + }, + { + "epoch": 0.9447659443350489, + "grad_norm": 0.75390625, + "learning_rate": 8.608218335557423e-05, + "loss": 0.8326, + "step": 35679 + }, + { + "epoch": 0.9447924239433433, + "grad_norm": 0.703125, + "learning_rate": 8.60776244361905e-05, + "loss": 0.7258, + "step": 35680 + }, + { + "epoch": 0.9448189035516377, + "grad_norm": 1.125, + "learning_rate": 8.607306554631444e-05, + "loss": 0.8737, + "step": 35681 + }, + { + "epoch": 0.944845383159932, + "grad_norm": 0.74609375, + "learning_rate": 8.606850668595576e-05, + "loss": 0.7702, + "step": 35682 + }, + { + "epoch": 0.9448718627682264, + "grad_norm": 0.66015625, + "learning_rate": 8.606394785512407e-05, + "loss": 0.6356, + "step": 35683 + }, + { + "epoch": 0.9448983423765208, + "grad_norm": 0.8359375, + "learning_rate": 8.605938905382903e-05, + "loss": 0.8004, + "step": 35684 + }, + { + "epoch": 0.9449248219848152, + "grad_norm": 0.76953125, + "learning_rate": 8.605483028208028e-05, + "loss": 0.7596, + "step": 35685 + }, + { + "epoch": 0.9449513015931095, + "grad_norm": 0.79296875, + "learning_rate": 8.605027153988756e-05, + "loss": 0.7526, + "step": 35686 + }, + { + "epoch": 0.9449777812014039, + "grad_norm": 0.78125, + "learning_rate": 8.604571282726051e-05, + "loss": 0.8088, + "step": 35687 + }, + { + "epoch": 0.9450042608096982, + "grad_norm": 0.76171875, + "learning_rate": 8.604115414420876e-05, + "loss": 0.7802, + "step": 35688 + }, + { + "epoch": 0.9450307404179926, + "grad_norm": 0.7421875, + "learning_rate": 8.603659549074199e-05, + "loss": 0.8047, + "step": 35689 + }, + { + "epoch": 0.945057220026287, + "grad_norm": 0.74609375, + "learning_rate": 8.603203686686982e-05, + "loss": 0.8067, + "step": 35690 + }, + { + "epoch": 0.9450836996345814, + "grad_norm": 0.80078125, + "learning_rate": 8.602747827260198e-05, + "loss": 0.7554, + "step": 35691 + }, + { + "epoch": 0.9451101792428758, + "grad_norm": 0.796875, + "learning_rate": 8.602291970794812e-05, + "loss": 0.7839, + "step": 35692 + }, + { + "epoch": 0.9451366588511702, + "grad_norm": 0.75390625, + "learning_rate": 8.601836117291786e-05, + "loss": 0.7845, + "step": 35693 + }, + { + "epoch": 0.9451631384594645, + "grad_norm": 0.77734375, + "learning_rate": 8.60138026675209e-05, + "loss": 0.7974, + "step": 35694 + }, + { + "epoch": 0.9451896180677589, + "grad_norm": 0.79296875, + "learning_rate": 8.600924419176683e-05, + "loss": 0.8133, + "step": 35695 + }, + { + "epoch": 0.9452160976760533, + "grad_norm": 0.78515625, + "learning_rate": 8.600468574566542e-05, + "loss": 0.6575, + "step": 35696 + }, + { + "epoch": 0.9452425772843477, + "grad_norm": 0.7734375, + "learning_rate": 8.600012732922628e-05, + "loss": 0.8644, + "step": 35697 + }, + { + "epoch": 0.9452690568926421, + "grad_norm": 0.71484375, + "learning_rate": 8.599556894245907e-05, + "loss": 0.6674, + "step": 35698 + }, + { + "epoch": 0.9452955365009365, + "grad_norm": 0.7734375, + "learning_rate": 8.599101058537345e-05, + "loss": 0.826, + "step": 35699 + }, + { + "epoch": 0.9453220161092308, + "grad_norm": 0.84765625, + "learning_rate": 8.598645225797906e-05, + "loss": 0.802, + "step": 35700 + }, + { + "epoch": 0.9453484957175252, + "grad_norm": 0.77734375, + "learning_rate": 8.59818939602856e-05, + "loss": 0.718, + "step": 35701 + }, + { + "epoch": 0.9453749753258195, + "grad_norm": 0.8125, + "learning_rate": 8.597733569230272e-05, + "loss": 0.7508, + "step": 35702 + }, + { + "epoch": 0.9454014549341139, + "grad_norm": 0.86328125, + "learning_rate": 8.597277745404008e-05, + "loss": 0.8701, + "step": 35703 + }, + { + "epoch": 0.9454279345424083, + "grad_norm": 0.9296875, + "learning_rate": 8.596821924550733e-05, + "loss": 0.6944, + "step": 35704 + }, + { + "epoch": 0.9454544141507026, + "grad_norm": 0.8046875, + "learning_rate": 8.596366106671414e-05, + "loss": 0.7614, + "step": 35705 + }, + { + "epoch": 0.945480893758997, + "grad_norm": 0.7734375, + "learning_rate": 8.595910291767012e-05, + "loss": 0.9029, + "step": 35706 + }, + { + "epoch": 0.9455073733672914, + "grad_norm": 0.85546875, + "learning_rate": 8.595454479838503e-05, + "loss": 0.8625, + "step": 35707 + }, + { + "epoch": 0.9455338529755858, + "grad_norm": 0.77734375, + "learning_rate": 8.594998670886847e-05, + "loss": 0.8419, + "step": 35708 + }, + { + "epoch": 0.9455603325838802, + "grad_norm": 0.71484375, + "learning_rate": 8.594542864913011e-05, + "loss": 0.819, + "step": 35709 + }, + { + "epoch": 0.9455868121921746, + "grad_norm": 0.81640625, + "learning_rate": 8.594087061917961e-05, + "loss": 0.7232, + "step": 35710 + }, + { + "epoch": 0.9456132918004689, + "grad_norm": 0.765625, + "learning_rate": 8.59363126190266e-05, + "loss": 0.773, + "step": 35711 + }, + { + "epoch": 0.9456397714087633, + "grad_norm": 0.765625, + "learning_rate": 8.593175464868081e-05, + "loss": 0.7263, + "step": 35712 + }, + { + "epoch": 0.9456662510170577, + "grad_norm": 0.81640625, + "learning_rate": 8.592719670815186e-05, + "loss": 0.9862, + "step": 35713 + }, + { + "epoch": 0.9456927306253521, + "grad_norm": 0.73046875, + "learning_rate": 8.592263879744941e-05, + "loss": 0.7866, + "step": 35714 + }, + { + "epoch": 0.9457192102336465, + "grad_norm": 0.8125, + "learning_rate": 8.591808091658313e-05, + "loss": 0.9219, + "step": 35715 + }, + { + "epoch": 0.9457456898419409, + "grad_norm": 0.7578125, + "learning_rate": 8.591352306556261e-05, + "loss": 0.6819, + "step": 35716 + }, + { + "epoch": 0.9457721694502352, + "grad_norm": 1.015625, + "learning_rate": 8.590896524439764e-05, + "loss": 0.8355, + "step": 35717 + }, + { + "epoch": 0.9457986490585296, + "grad_norm": 0.7421875, + "learning_rate": 8.59044074530978e-05, + "loss": 0.6914, + "step": 35718 + }, + { + "epoch": 0.9458251286668239, + "grad_norm": 0.75390625, + "learning_rate": 8.589984969167276e-05, + "loss": 0.7713, + "step": 35719 + }, + { + "epoch": 0.9458516082751183, + "grad_norm": 0.7421875, + "learning_rate": 8.58952919601322e-05, + "loss": 0.7916, + "step": 35720 + }, + { + "epoch": 0.9458780878834127, + "grad_norm": 0.80859375, + "learning_rate": 8.589073425848572e-05, + "loss": 0.7784, + "step": 35721 + }, + { + "epoch": 0.945904567491707, + "grad_norm": 0.84765625, + "learning_rate": 8.588617658674306e-05, + "loss": 0.8331, + "step": 35722 + }, + { + "epoch": 0.9459310471000014, + "grad_norm": 0.69921875, + "learning_rate": 8.588161894491384e-05, + "loss": 0.7623, + "step": 35723 + }, + { + "epoch": 0.9459575267082958, + "grad_norm": 0.8046875, + "learning_rate": 8.58770613330077e-05, + "loss": 0.7348, + "step": 35724 + }, + { + "epoch": 0.9459840063165902, + "grad_norm": 0.8359375, + "learning_rate": 8.587250375103433e-05, + "loss": 0.7745, + "step": 35725 + }, + { + "epoch": 0.9460104859248846, + "grad_norm": 0.796875, + "learning_rate": 8.586794619900335e-05, + "loss": 0.9393, + "step": 35726 + }, + { + "epoch": 0.946036965533179, + "grad_norm": 0.85546875, + "learning_rate": 8.58633886769245e-05, + "loss": 0.7009, + "step": 35727 + }, + { + "epoch": 0.9460634451414733, + "grad_norm": 0.73828125, + "learning_rate": 8.585883118480737e-05, + "loss": 0.7925, + "step": 35728 + }, + { + "epoch": 0.9460899247497677, + "grad_norm": 0.75, + "learning_rate": 8.585427372266166e-05, + "loss": 0.6733, + "step": 35729 + }, + { + "epoch": 0.9461164043580621, + "grad_norm": 0.76953125, + "learning_rate": 8.584971629049698e-05, + "loss": 0.7781, + "step": 35730 + }, + { + "epoch": 0.9461428839663565, + "grad_norm": 0.71484375, + "learning_rate": 8.5845158888323e-05, + "loss": 0.7693, + "step": 35731 + }, + { + "epoch": 0.9461693635746509, + "grad_norm": 0.796875, + "learning_rate": 8.584060151614944e-05, + "loss": 0.7738, + "step": 35732 + }, + { + "epoch": 0.9461958431829453, + "grad_norm": 0.828125, + "learning_rate": 8.583604417398589e-05, + "loss": 0.7679, + "step": 35733 + }, + { + "epoch": 0.9462223227912396, + "grad_norm": 0.703125, + "learning_rate": 8.583148686184206e-05, + "loss": 0.6889, + "step": 35734 + }, + { + "epoch": 0.9462488023995339, + "grad_norm": 0.765625, + "learning_rate": 8.582692957972758e-05, + "loss": 0.8823, + "step": 35735 + }, + { + "epoch": 0.9462752820078283, + "grad_norm": 0.828125, + "learning_rate": 8.582237232765206e-05, + "loss": 0.7621, + "step": 35736 + }, + { + "epoch": 0.9463017616161227, + "grad_norm": 0.70703125, + "learning_rate": 8.581781510562526e-05, + "loss": 0.732, + "step": 35737 + }, + { + "epoch": 0.946328241224417, + "grad_norm": 0.8359375, + "learning_rate": 8.581325791365678e-05, + "loss": 0.8806, + "step": 35738 + }, + { + "epoch": 0.9463547208327114, + "grad_norm": 0.79296875, + "learning_rate": 8.58087007517563e-05, + "loss": 0.8019, + "step": 35739 + }, + { + "epoch": 0.9463812004410058, + "grad_norm": 0.76171875, + "learning_rate": 8.580414361993346e-05, + "loss": 0.7883, + "step": 35740 + }, + { + "epoch": 0.9464076800493002, + "grad_norm": 0.90625, + "learning_rate": 8.579958651819791e-05, + "loss": 0.8735, + "step": 35741 + }, + { + "epoch": 0.9464341596575946, + "grad_norm": 0.73046875, + "learning_rate": 8.579502944655935e-05, + "loss": 0.7033, + "step": 35742 + }, + { + "epoch": 0.946460639265889, + "grad_norm": 0.68359375, + "learning_rate": 8.579047240502739e-05, + "loss": 0.8351, + "step": 35743 + }, + { + "epoch": 0.9464871188741834, + "grad_norm": 0.78515625, + "learning_rate": 8.578591539361173e-05, + "loss": 0.7552, + "step": 35744 + }, + { + "epoch": 0.9465135984824777, + "grad_norm": 0.78515625, + "learning_rate": 8.578135841232201e-05, + "loss": 0.7972, + "step": 35745 + }, + { + "epoch": 0.9465400780907721, + "grad_norm": 0.75, + "learning_rate": 8.577680146116787e-05, + "loss": 0.7306, + "step": 35746 + }, + { + "epoch": 0.9465665576990665, + "grad_norm": 0.87109375, + "learning_rate": 8.577224454015901e-05, + "loss": 0.8133, + "step": 35747 + }, + { + "epoch": 0.9465930373073609, + "grad_norm": 0.78125, + "learning_rate": 8.576768764930505e-05, + "loss": 0.8528, + "step": 35748 + }, + { + "epoch": 0.9466195169156553, + "grad_norm": 0.76171875, + "learning_rate": 8.576313078861568e-05, + "loss": 0.8019, + "step": 35749 + }, + { + "epoch": 0.9466459965239497, + "grad_norm": 0.80078125, + "learning_rate": 8.575857395810053e-05, + "loss": 0.7759, + "step": 35750 + }, + { + "epoch": 0.9466724761322439, + "grad_norm": 0.75, + "learning_rate": 8.575401715776923e-05, + "loss": 0.7842, + "step": 35751 + }, + { + "epoch": 0.9466989557405383, + "grad_norm": 0.76953125, + "learning_rate": 8.574946038763152e-05, + "loss": 0.736, + "step": 35752 + }, + { + "epoch": 0.9467254353488327, + "grad_norm": 0.74609375, + "learning_rate": 8.5744903647697e-05, + "loss": 0.6704, + "step": 35753 + }, + { + "epoch": 0.9467519149571271, + "grad_norm": 0.7421875, + "learning_rate": 8.574034693797537e-05, + "loss": 0.7834, + "step": 35754 + }, + { + "epoch": 0.9467783945654215, + "grad_norm": 1.15625, + "learning_rate": 8.573579025847624e-05, + "loss": 0.8038, + "step": 35755 + }, + { + "epoch": 0.9468048741737158, + "grad_norm": 0.7578125, + "learning_rate": 8.573123360920924e-05, + "loss": 0.8385, + "step": 35756 + }, + { + "epoch": 0.9468313537820102, + "grad_norm": 0.75390625, + "learning_rate": 8.572667699018413e-05, + "loss": 0.7851, + "step": 35757 + }, + { + "epoch": 0.9468578333903046, + "grad_norm": 0.71484375, + "learning_rate": 8.572212040141051e-05, + "loss": 0.6526, + "step": 35758 + }, + { + "epoch": 0.946884312998599, + "grad_norm": 0.92578125, + "learning_rate": 8.571756384289804e-05, + "loss": 0.8172, + "step": 35759 + }, + { + "epoch": 0.9469107926068934, + "grad_norm": 0.78125, + "learning_rate": 8.571300731465637e-05, + "loss": 0.7547, + "step": 35760 + }, + { + "epoch": 0.9469372722151878, + "grad_norm": 0.7578125, + "learning_rate": 8.570845081669515e-05, + "loss": 0.6922, + "step": 35761 + }, + { + "epoch": 0.9469637518234821, + "grad_norm": 0.80859375, + "learning_rate": 8.570389434902405e-05, + "loss": 0.8297, + "step": 35762 + }, + { + "epoch": 0.9469902314317765, + "grad_norm": 0.8359375, + "learning_rate": 8.569933791165274e-05, + "loss": 0.8772, + "step": 35763 + }, + { + "epoch": 0.9470167110400709, + "grad_norm": 0.84765625, + "learning_rate": 8.569478150459088e-05, + "loss": 0.8075, + "step": 35764 + }, + { + "epoch": 0.9470431906483653, + "grad_norm": 0.69140625, + "learning_rate": 8.569022512784811e-05, + "loss": 0.6676, + "step": 35765 + }, + { + "epoch": 0.9470696702566597, + "grad_norm": 0.8359375, + "learning_rate": 8.568566878143407e-05, + "loss": 0.9195, + "step": 35766 + }, + { + "epoch": 0.947096149864954, + "grad_norm": 0.78515625, + "learning_rate": 8.568111246535846e-05, + "loss": 0.7476, + "step": 35767 + }, + { + "epoch": 0.9471226294732483, + "grad_norm": 0.81640625, + "learning_rate": 8.567655617963091e-05, + "loss": 0.7715, + "step": 35768 + }, + { + "epoch": 0.9471491090815427, + "grad_norm": 0.71875, + "learning_rate": 8.567199992426107e-05, + "loss": 0.7304, + "step": 35769 + }, + { + "epoch": 0.9471755886898371, + "grad_norm": 0.8203125, + "learning_rate": 8.566744369925861e-05, + "loss": 0.8087, + "step": 35770 + }, + { + "epoch": 0.9472020682981315, + "grad_norm": 0.75390625, + "learning_rate": 8.566288750463316e-05, + "loss": 0.8011, + "step": 35771 + }, + { + "epoch": 0.9472285479064259, + "grad_norm": 0.77734375, + "learning_rate": 8.565833134039442e-05, + "loss": 0.7521, + "step": 35772 + }, + { + "epoch": 0.9472550275147202, + "grad_norm": 0.77734375, + "learning_rate": 8.565377520655204e-05, + "loss": 0.8366, + "step": 35773 + }, + { + "epoch": 0.9472815071230146, + "grad_norm": 0.79296875, + "learning_rate": 8.564921910311568e-05, + "loss": 0.7078, + "step": 35774 + }, + { + "epoch": 0.947307986731309, + "grad_norm": 0.80078125, + "learning_rate": 8.564466303009495e-05, + "loss": 0.7976, + "step": 35775 + }, + { + "epoch": 0.9473344663396034, + "grad_norm": 0.859375, + "learning_rate": 8.564010698749951e-05, + "loss": 0.7788, + "step": 35776 + }, + { + "epoch": 0.9473609459478978, + "grad_norm": 0.828125, + "learning_rate": 8.563555097533907e-05, + "loss": 0.8778, + "step": 35777 + }, + { + "epoch": 0.9473874255561922, + "grad_norm": 0.73046875, + "learning_rate": 8.563099499362327e-05, + "loss": 0.6685, + "step": 35778 + }, + { + "epoch": 0.9474139051644865, + "grad_norm": 0.76953125, + "learning_rate": 8.562643904236176e-05, + "loss": 0.8645, + "step": 35779 + }, + { + "epoch": 0.9474403847727809, + "grad_norm": 0.859375, + "learning_rate": 8.562188312156418e-05, + "loss": 0.8947, + "step": 35780 + }, + { + "epoch": 0.9474668643810753, + "grad_norm": 0.8828125, + "learning_rate": 8.561732723124017e-05, + "loss": 0.7558, + "step": 35781 + }, + { + "epoch": 0.9474933439893697, + "grad_norm": 0.7265625, + "learning_rate": 8.561277137139943e-05, + "loss": 0.7291, + "step": 35782 + }, + { + "epoch": 0.9475198235976641, + "grad_norm": 0.78515625, + "learning_rate": 8.560821554205161e-05, + "loss": 0.7779, + "step": 35783 + }, + { + "epoch": 0.9475463032059583, + "grad_norm": 0.796875, + "learning_rate": 8.560365974320637e-05, + "loss": 0.7649, + "step": 35784 + }, + { + "epoch": 0.9475727828142527, + "grad_norm": 0.74609375, + "learning_rate": 8.559910397487333e-05, + "loss": 0.8016, + "step": 35785 + }, + { + "epoch": 0.9475992624225471, + "grad_norm": 0.67578125, + "learning_rate": 8.559454823706216e-05, + "loss": 0.7162, + "step": 35786 + }, + { + "epoch": 0.9476257420308415, + "grad_norm": 0.76953125, + "learning_rate": 8.558999252978252e-05, + "loss": 0.7577, + "step": 35787 + }, + { + "epoch": 0.9476522216391359, + "grad_norm": 0.7265625, + "learning_rate": 8.55854368530441e-05, + "loss": 0.7225, + "step": 35788 + }, + { + "epoch": 0.9476787012474303, + "grad_norm": 0.80859375, + "learning_rate": 8.55808812068565e-05, + "loss": 0.7514, + "step": 35789 + }, + { + "epoch": 0.9477051808557246, + "grad_norm": 0.7421875, + "learning_rate": 8.55763255912294e-05, + "loss": 0.7545, + "step": 35790 + }, + { + "epoch": 0.947731660464019, + "grad_norm": 0.75390625, + "learning_rate": 8.557177000617241e-05, + "loss": 0.7645, + "step": 35791 + }, + { + "epoch": 0.9477581400723134, + "grad_norm": 0.7890625, + "learning_rate": 8.556721445169528e-05, + "loss": 0.7026, + "step": 35792 + }, + { + "epoch": 0.9477846196806078, + "grad_norm": 0.8125, + "learning_rate": 8.556265892780761e-05, + "loss": 0.8273, + "step": 35793 + }, + { + "epoch": 0.9478110992889022, + "grad_norm": 0.78125, + "learning_rate": 8.555810343451907e-05, + "loss": 0.7838, + "step": 35794 + }, + { + "epoch": 0.9478375788971966, + "grad_norm": 0.74609375, + "learning_rate": 8.555354797183929e-05, + "loss": 0.6832, + "step": 35795 + }, + { + "epoch": 0.9478640585054909, + "grad_norm": 0.83203125, + "learning_rate": 8.55489925397779e-05, + "loss": 0.7801, + "step": 35796 + }, + { + "epoch": 0.9478905381137853, + "grad_norm": 0.765625, + "learning_rate": 8.554443713834464e-05, + "loss": 0.7457, + "step": 35797 + }, + { + "epoch": 0.9479170177220797, + "grad_norm": 0.80859375, + "learning_rate": 8.55398817675491e-05, + "loss": 0.8188, + "step": 35798 + }, + { + "epoch": 0.9479434973303741, + "grad_norm": 0.7578125, + "learning_rate": 8.553532642740098e-05, + "loss": 0.7162, + "step": 35799 + }, + { + "epoch": 0.9479699769386684, + "grad_norm": 0.7734375, + "learning_rate": 8.553077111790988e-05, + "loss": 0.767, + "step": 35800 + }, + { + "epoch": 0.9479964565469627, + "grad_norm": 0.75390625, + "learning_rate": 8.552621583908548e-05, + "loss": 0.7668, + "step": 35801 + }, + { + "epoch": 0.9480229361552571, + "grad_norm": 0.7734375, + "learning_rate": 8.552166059093746e-05, + "loss": 0.7943, + "step": 35802 + }, + { + "epoch": 0.9480494157635515, + "grad_norm": 0.765625, + "learning_rate": 8.551710537347545e-05, + "loss": 0.847, + "step": 35803 + }, + { + "epoch": 0.9480758953718459, + "grad_norm": 0.8125, + "learning_rate": 8.551255018670912e-05, + "loss": 0.7273, + "step": 35804 + }, + { + "epoch": 0.9481023749801403, + "grad_norm": 0.76953125, + "learning_rate": 8.550799503064809e-05, + "loss": 0.8998, + "step": 35805 + }, + { + "epoch": 0.9481288545884347, + "grad_norm": 0.7890625, + "learning_rate": 8.550343990530203e-05, + "loss": 0.7973, + "step": 35806 + }, + { + "epoch": 0.948155334196729, + "grad_norm": 0.75, + "learning_rate": 8.549888481068063e-05, + "loss": 0.852, + "step": 35807 + }, + { + "epoch": 0.9481818138050234, + "grad_norm": 0.81640625, + "learning_rate": 8.54943297467935e-05, + "loss": 0.717, + "step": 35808 + }, + { + "epoch": 0.9482082934133178, + "grad_norm": 0.8046875, + "learning_rate": 8.548977471365032e-05, + "loss": 0.7097, + "step": 35809 + }, + { + "epoch": 0.9482347730216122, + "grad_norm": 0.7421875, + "learning_rate": 8.548521971126069e-05, + "loss": 0.6832, + "step": 35810 + }, + { + "epoch": 0.9482612526299066, + "grad_norm": 0.7421875, + "learning_rate": 8.548066473963432e-05, + "loss": 0.7666, + "step": 35811 + }, + { + "epoch": 0.948287732238201, + "grad_norm": 0.76953125, + "learning_rate": 8.547610979878087e-05, + "loss": 0.7138, + "step": 35812 + }, + { + "epoch": 0.9483142118464953, + "grad_norm": 0.84765625, + "learning_rate": 8.547155488871e-05, + "loss": 0.8121, + "step": 35813 + }, + { + "epoch": 0.9483406914547897, + "grad_norm": 0.765625, + "learning_rate": 8.546700000943131e-05, + "loss": 0.8202, + "step": 35814 + }, + { + "epoch": 0.9483671710630841, + "grad_norm": 0.7421875, + "learning_rate": 8.54624451609545e-05, + "loss": 0.7049, + "step": 35815 + }, + { + "epoch": 0.9483936506713785, + "grad_norm": 0.79296875, + "learning_rate": 8.545789034328914e-05, + "loss": 0.8093, + "step": 35816 + }, + { + "epoch": 0.9484201302796728, + "grad_norm": 0.91015625, + "learning_rate": 8.545333555644501e-05, + "loss": 0.711, + "step": 35817 + }, + { + "epoch": 0.9484466098879671, + "grad_norm": 0.87890625, + "learning_rate": 8.54487808004317e-05, + "loss": 0.9337, + "step": 35818 + }, + { + "epoch": 0.9484730894962615, + "grad_norm": 0.765625, + "learning_rate": 8.544422607525889e-05, + "loss": 0.8095, + "step": 35819 + }, + { + "epoch": 0.9484995691045559, + "grad_norm": 0.71875, + "learning_rate": 8.543967138093618e-05, + "loss": 0.719, + "step": 35820 + }, + { + "epoch": 0.9485260487128503, + "grad_norm": 0.76171875, + "learning_rate": 8.543511671747322e-05, + "loss": 0.8661, + "step": 35821 + }, + { + "epoch": 0.9485525283211447, + "grad_norm": 0.84375, + "learning_rate": 8.543056208487975e-05, + "loss": 0.6524, + "step": 35822 + }, + { + "epoch": 0.948579007929439, + "grad_norm": 0.76171875, + "learning_rate": 8.542600748316536e-05, + "loss": 0.7675, + "step": 35823 + }, + { + "epoch": 0.9486054875377334, + "grad_norm": 0.78125, + "learning_rate": 8.542145291233972e-05, + "loss": 0.7344, + "step": 35824 + }, + { + "epoch": 0.9486319671460278, + "grad_norm": 0.87109375, + "learning_rate": 8.541689837241247e-05, + "loss": 0.7254, + "step": 35825 + }, + { + "epoch": 0.9486584467543222, + "grad_norm": 0.7265625, + "learning_rate": 8.541234386339326e-05, + "loss": 0.7994, + "step": 35826 + }, + { + "epoch": 0.9486849263626166, + "grad_norm": 0.83203125, + "learning_rate": 8.540778938529177e-05, + "loss": 0.7826, + "step": 35827 + }, + { + "epoch": 0.948711405970911, + "grad_norm": 0.81640625, + "learning_rate": 8.540323493811763e-05, + "loss": 0.8192, + "step": 35828 + }, + { + "epoch": 0.9487378855792054, + "grad_norm": 0.7578125, + "learning_rate": 8.539868052188049e-05, + "loss": 0.7849, + "step": 35829 + }, + { + "epoch": 0.9487643651874997, + "grad_norm": 0.75390625, + "learning_rate": 8.539412613659003e-05, + "loss": 0.7856, + "step": 35830 + }, + { + "epoch": 0.9487908447957941, + "grad_norm": 0.77734375, + "learning_rate": 8.538957178225585e-05, + "loss": 0.833, + "step": 35831 + }, + { + "epoch": 0.9488173244040885, + "grad_norm": 0.859375, + "learning_rate": 8.538501745888767e-05, + "loss": 0.9296, + "step": 35832 + }, + { + "epoch": 0.9488438040123828, + "grad_norm": 0.7265625, + "learning_rate": 8.53804631664951e-05, + "loss": 0.7172, + "step": 35833 + }, + { + "epoch": 0.9488702836206772, + "grad_norm": 0.75390625, + "learning_rate": 8.537590890508782e-05, + "loss": 0.7592, + "step": 35834 + }, + { + "epoch": 0.9488967632289715, + "grad_norm": 0.78515625, + "learning_rate": 8.537135467467546e-05, + "loss": 0.8309, + "step": 35835 + }, + { + "epoch": 0.9489232428372659, + "grad_norm": 0.82421875, + "learning_rate": 8.536680047526762e-05, + "loss": 0.7973, + "step": 35836 + }, + { + "epoch": 0.9489497224455603, + "grad_norm": 0.7421875, + "learning_rate": 8.536224630687407e-05, + "loss": 0.8655, + "step": 35837 + }, + { + "epoch": 0.9489762020538547, + "grad_norm": 0.7734375, + "learning_rate": 8.53576921695044e-05, + "loss": 0.8394, + "step": 35838 + }, + { + "epoch": 0.9490026816621491, + "grad_norm": 0.87109375, + "learning_rate": 8.535313806316827e-05, + "loss": 0.8909, + "step": 35839 + }, + { + "epoch": 0.9490291612704435, + "grad_norm": 0.734375, + "learning_rate": 8.534858398787531e-05, + "loss": 0.7346, + "step": 35840 + }, + { + "epoch": 0.9490556408787378, + "grad_norm": 0.875, + "learning_rate": 8.534402994363515e-05, + "loss": 0.8382, + "step": 35841 + }, + { + "epoch": 0.9490821204870322, + "grad_norm": 1.203125, + "learning_rate": 8.533947593045754e-05, + "loss": 0.7426, + "step": 35842 + }, + { + "epoch": 0.9491086000953266, + "grad_norm": 0.76171875, + "learning_rate": 8.533492194835206e-05, + "loss": 0.7932, + "step": 35843 + }, + { + "epoch": 0.949135079703621, + "grad_norm": 0.80078125, + "learning_rate": 8.533036799732837e-05, + "loss": 0.7021, + "step": 35844 + }, + { + "epoch": 0.9491615593119154, + "grad_norm": 0.78515625, + "learning_rate": 8.532581407739612e-05, + "loss": 0.8433, + "step": 35845 + }, + { + "epoch": 0.9491880389202098, + "grad_norm": 0.734375, + "learning_rate": 8.5321260188565e-05, + "loss": 0.8287, + "step": 35846 + }, + { + "epoch": 0.9492145185285041, + "grad_norm": 0.7890625, + "learning_rate": 8.531670633084457e-05, + "loss": 0.8587, + "step": 35847 + }, + { + "epoch": 0.9492409981367985, + "grad_norm": 0.79296875, + "learning_rate": 8.531215250424457e-05, + "loss": 0.7165, + "step": 35848 + }, + { + "epoch": 0.9492674777450929, + "grad_norm": 0.8125, + "learning_rate": 8.530759870877464e-05, + "loss": 0.8272, + "step": 35849 + }, + { + "epoch": 0.9492939573533872, + "grad_norm": 0.703125, + "learning_rate": 8.530304494444441e-05, + "loss": 0.6918, + "step": 35850 + }, + { + "epoch": 0.9493204369616816, + "grad_norm": 0.796875, + "learning_rate": 8.529849121126353e-05, + "loss": 0.8485, + "step": 35851 + }, + { + "epoch": 0.9493469165699759, + "grad_norm": 0.8046875, + "learning_rate": 8.529393750924164e-05, + "loss": 0.821, + "step": 35852 + }, + { + "epoch": 0.9493733961782703, + "grad_norm": 0.79296875, + "learning_rate": 8.528938383838843e-05, + "loss": 0.8277, + "step": 35853 + }, + { + "epoch": 0.9493998757865647, + "grad_norm": 0.98046875, + "learning_rate": 8.528483019871355e-05, + "loss": 0.8642, + "step": 35854 + }, + { + "epoch": 0.9494263553948591, + "grad_norm": 0.74609375, + "learning_rate": 8.52802765902266e-05, + "loss": 0.8881, + "step": 35855 + }, + { + "epoch": 0.9494528350031535, + "grad_norm": 0.77734375, + "learning_rate": 8.527572301293728e-05, + "loss": 0.8063, + "step": 35856 + }, + { + "epoch": 0.9494793146114479, + "grad_norm": 0.828125, + "learning_rate": 8.527116946685519e-05, + "loss": 0.7911, + "step": 35857 + }, + { + "epoch": 0.9495057942197422, + "grad_norm": 0.74609375, + "learning_rate": 8.526661595199004e-05, + "loss": 0.703, + "step": 35858 + }, + { + "epoch": 0.9495322738280366, + "grad_norm": 0.8828125, + "learning_rate": 8.526206246835145e-05, + "loss": 0.8093, + "step": 35859 + }, + { + "epoch": 0.949558753436331, + "grad_norm": 0.8359375, + "learning_rate": 8.525750901594909e-05, + "loss": 0.7995, + "step": 35860 + }, + { + "epoch": 0.9495852330446254, + "grad_norm": 0.84765625, + "learning_rate": 8.525295559479259e-05, + "loss": 0.7694, + "step": 35861 + }, + { + "epoch": 0.9496117126529198, + "grad_norm": 1.203125, + "learning_rate": 8.524840220489157e-05, + "loss": 0.8271, + "step": 35862 + }, + { + "epoch": 0.9496381922612142, + "grad_norm": 0.765625, + "learning_rate": 8.524384884625576e-05, + "loss": 0.7006, + "step": 35863 + }, + { + "epoch": 0.9496646718695085, + "grad_norm": 0.8125, + "learning_rate": 8.523929551889477e-05, + "loss": 0.8423, + "step": 35864 + }, + { + "epoch": 0.9496911514778029, + "grad_norm": 0.7734375, + "learning_rate": 8.523474222281824e-05, + "loss": 0.826, + "step": 35865 + }, + { + "epoch": 0.9497176310860972, + "grad_norm": 0.8359375, + "learning_rate": 8.523018895803585e-05, + "loss": 0.7492, + "step": 35866 + }, + { + "epoch": 0.9497441106943916, + "grad_norm": 0.78125, + "learning_rate": 8.522563572455717e-05, + "loss": 0.7902, + "step": 35867 + }, + { + "epoch": 0.949770590302686, + "grad_norm": 0.85546875, + "learning_rate": 8.522108252239196e-05, + "loss": 0.9243, + "step": 35868 + }, + { + "epoch": 0.9497970699109803, + "grad_norm": 0.76953125, + "learning_rate": 8.521652935154981e-05, + "loss": 0.855, + "step": 35869 + }, + { + "epoch": 0.9498235495192747, + "grad_norm": 0.8046875, + "learning_rate": 8.52119762120404e-05, + "loss": 0.8328, + "step": 35870 + }, + { + "epoch": 0.9498500291275691, + "grad_norm": 0.76953125, + "learning_rate": 8.520742310387335e-05, + "loss": 0.8399, + "step": 35871 + }, + { + "epoch": 0.9498765087358635, + "grad_norm": 0.74609375, + "learning_rate": 8.520287002705829e-05, + "loss": 0.8227, + "step": 35872 + }, + { + "epoch": 0.9499029883441579, + "grad_norm": 0.75, + "learning_rate": 8.519831698160494e-05, + "loss": 0.8109, + "step": 35873 + }, + { + "epoch": 0.9499294679524523, + "grad_norm": 0.8046875, + "learning_rate": 8.51937639675229e-05, + "loss": 0.7161, + "step": 35874 + }, + { + "epoch": 0.9499559475607466, + "grad_norm": 0.73046875, + "learning_rate": 8.518921098482184e-05, + "loss": 0.8445, + "step": 35875 + }, + { + "epoch": 0.949982427169041, + "grad_norm": 0.7734375, + "learning_rate": 8.51846580335114e-05, + "loss": 0.8181, + "step": 35876 + }, + { + "epoch": 0.9500089067773354, + "grad_norm": 0.78125, + "learning_rate": 8.518010511360118e-05, + "loss": 0.7234, + "step": 35877 + }, + { + "epoch": 0.9500353863856298, + "grad_norm": 0.70703125, + "learning_rate": 8.517555222510093e-05, + "loss": 0.7501, + "step": 35878 + }, + { + "epoch": 0.9500618659939242, + "grad_norm": 0.796875, + "learning_rate": 8.517099936802025e-05, + "loss": 0.9232, + "step": 35879 + }, + { + "epoch": 0.9500883456022186, + "grad_norm": 0.80078125, + "learning_rate": 8.516644654236879e-05, + "loss": 0.7665, + "step": 35880 + }, + { + "epoch": 0.9501148252105129, + "grad_norm": 0.82421875, + "learning_rate": 8.516189374815619e-05, + "loss": 0.8155, + "step": 35881 + }, + { + "epoch": 0.9501413048188072, + "grad_norm": 0.734375, + "learning_rate": 8.515734098539207e-05, + "loss": 0.8918, + "step": 35882 + }, + { + "epoch": 0.9501677844271016, + "grad_norm": 0.734375, + "learning_rate": 8.515278825408616e-05, + "loss": 0.7561, + "step": 35883 + }, + { + "epoch": 0.950194264035396, + "grad_norm": 0.7578125, + "learning_rate": 8.514823555424807e-05, + "loss": 0.8606, + "step": 35884 + }, + { + "epoch": 0.9502207436436904, + "grad_norm": 0.8828125, + "learning_rate": 8.514368288588744e-05, + "loss": 0.8084, + "step": 35885 + }, + { + "epoch": 0.9502472232519847, + "grad_norm": 0.70703125, + "learning_rate": 8.513913024901393e-05, + "loss": 0.7695, + "step": 35886 + }, + { + "epoch": 0.9502737028602791, + "grad_norm": 6.21875, + "learning_rate": 8.513457764363713e-05, + "loss": 0.7656, + "step": 35887 + }, + { + "epoch": 0.9503001824685735, + "grad_norm": 0.83984375, + "learning_rate": 8.513002506976681e-05, + "loss": 0.8167, + "step": 35888 + }, + { + "epoch": 0.9503266620768679, + "grad_norm": 0.74609375, + "learning_rate": 8.512547252741253e-05, + "loss": 0.7926, + "step": 35889 + }, + { + "epoch": 0.9503531416851623, + "grad_norm": 0.77734375, + "learning_rate": 8.512092001658396e-05, + "loss": 0.9479, + "step": 35890 + }, + { + "epoch": 0.9503796212934567, + "grad_norm": 0.76171875, + "learning_rate": 8.511636753729077e-05, + "loss": 0.7054, + "step": 35891 + }, + { + "epoch": 0.950406100901751, + "grad_norm": 0.703125, + "learning_rate": 8.511181508954255e-05, + "loss": 0.7467, + "step": 35892 + }, + { + "epoch": 0.9504325805100454, + "grad_norm": 0.7421875, + "learning_rate": 8.510726267334901e-05, + "loss": 0.8303, + "step": 35893 + }, + { + "epoch": 0.9504590601183398, + "grad_norm": 1.75, + "learning_rate": 8.510271028871978e-05, + "loss": 0.859, + "step": 35894 + }, + { + "epoch": 0.9504855397266342, + "grad_norm": 0.765625, + "learning_rate": 8.509815793566448e-05, + "loss": 0.7557, + "step": 35895 + }, + { + "epoch": 0.9505120193349286, + "grad_norm": 0.87109375, + "learning_rate": 8.509360561419281e-05, + "loss": 0.9015, + "step": 35896 + }, + { + "epoch": 0.950538498943223, + "grad_norm": 0.8671875, + "learning_rate": 8.508905332431436e-05, + "loss": 0.8421, + "step": 35897 + }, + { + "epoch": 0.9505649785515173, + "grad_norm": 0.89453125, + "learning_rate": 8.508450106603884e-05, + "loss": 0.9164, + "step": 35898 + }, + { + "epoch": 0.9505914581598116, + "grad_norm": 0.74609375, + "learning_rate": 8.507994883937586e-05, + "loss": 0.706, + "step": 35899 + }, + { + "epoch": 0.950617937768106, + "grad_norm": 0.72265625, + "learning_rate": 8.507539664433509e-05, + "loss": 0.8288, + "step": 35900 + }, + { + "epoch": 0.9506444173764004, + "grad_norm": 0.8203125, + "learning_rate": 8.507084448092614e-05, + "loss": 0.89, + "step": 35901 + }, + { + "epoch": 0.9506708969846948, + "grad_norm": 0.76953125, + "learning_rate": 8.506629234915865e-05, + "loss": 0.6538, + "step": 35902 + }, + { + "epoch": 0.9506973765929891, + "grad_norm": 0.9140625, + "learning_rate": 8.506174024904234e-05, + "loss": 0.8346, + "step": 35903 + }, + { + "epoch": 0.9507238562012835, + "grad_norm": 0.7421875, + "learning_rate": 8.505718818058682e-05, + "loss": 0.8123, + "step": 35904 + }, + { + "epoch": 0.9507503358095779, + "grad_norm": 0.71875, + "learning_rate": 8.505263614380175e-05, + "loss": 0.6982, + "step": 35905 + }, + { + "epoch": 0.9507768154178723, + "grad_norm": 0.70703125, + "learning_rate": 8.504808413869673e-05, + "loss": 0.9082, + "step": 35906 + }, + { + "epoch": 0.9508032950261667, + "grad_norm": 1.4375, + "learning_rate": 8.504353216528142e-05, + "loss": 0.8188, + "step": 35907 + }, + { + "epoch": 0.9508297746344611, + "grad_norm": 0.83203125, + "learning_rate": 8.503898022356553e-05, + "loss": 0.8187, + "step": 35908 + }, + { + "epoch": 0.9508562542427554, + "grad_norm": 0.875, + "learning_rate": 8.503442831355868e-05, + "loss": 0.8769, + "step": 35909 + }, + { + "epoch": 0.9508827338510498, + "grad_norm": 0.859375, + "learning_rate": 8.502987643527048e-05, + "loss": 0.8201, + "step": 35910 + }, + { + "epoch": 0.9509092134593442, + "grad_norm": 0.734375, + "learning_rate": 8.502532458871063e-05, + "loss": 0.83, + "step": 35911 + }, + { + "epoch": 0.9509356930676386, + "grad_norm": 0.828125, + "learning_rate": 8.502077277388869e-05, + "loss": 0.7617, + "step": 35912 + }, + { + "epoch": 0.950962172675933, + "grad_norm": 0.6953125, + "learning_rate": 8.501622099081441e-05, + "loss": 0.7578, + "step": 35913 + }, + { + "epoch": 0.9509886522842274, + "grad_norm": 0.7421875, + "learning_rate": 8.501166923949737e-05, + "loss": 0.8005, + "step": 35914 + }, + { + "epoch": 0.9510151318925216, + "grad_norm": 0.79296875, + "learning_rate": 8.500711751994726e-05, + "loss": 0.876, + "step": 35915 + }, + { + "epoch": 0.951041611500816, + "grad_norm": 0.94921875, + "learning_rate": 8.50025658321737e-05, + "loss": 0.8914, + "step": 35916 + }, + { + "epoch": 0.9510680911091104, + "grad_norm": 0.703125, + "learning_rate": 8.499801417618634e-05, + "loss": 0.8578, + "step": 35917 + }, + { + "epoch": 0.9510945707174048, + "grad_norm": 0.796875, + "learning_rate": 8.499346255199484e-05, + "loss": 0.7754, + "step": 35918 + }, + { + "epoch": 0.9511210503256992, + "grad_norm": 0.8046875, + "learning_rate": 8.498891095960883e-05, + "loss": 0.7272, + "step": 35919 + }, + { + "epoch": 0.9511475299339935, + "grad_norm": 0.8125, + "learning_rate": 8.498435939903798e-05, + "loss": 0.7603, + "step": 35920 + }, + { + "epoch": 0.9511740095422879, + "grad_norm": 0.7734375, + "learning_rate": 8.497980787029192e-05, + "loss": 0.8201, + "step": 35921 + }, + { + "epoch": 0.9512004891505823, + "grad_norm": 0.80859375, + "learning_rate": 8.497525637338025e-05, + "loss": 0.8659, + "step": 35922 + }, + { + "epoch": 0.9512269687588767, + "grad_norm": 0.8515625, + "learning_rate": 8.497070490831272e-05, + "loss": 0.7656, + "step": 35923 + }, + { + "epoch": 0.9512534483671711, + "grad_norm": 0.88671875, + "learning_rate": 8.496615347509891e-05, + "loss": 0.9062, + "step": 35924 + }, + { + "epoch": 0.9512799279754655, + "grad_norm": 0.82421875, + "learning_rate": 8.496160207374848e-05, + "loss": 0.6853, + "step": 35925 + }, + { + "epoch": 0.9513064075837598, + "grad_norm": 0.984375, + "learning_rate": 8.495705070427108e-05, + "loss": 0.9433, + "step": 35926 + }, + { + "epoch": 0.9513328871920542, + "grad_norm": 0.80078125, + "learning_rate": 8.49524993666763e-05, + "loss": 0.952, + "step": 35927 + }, + { + "epoch": 0.9513593668003486, + "grad_norm": 0.84765625, + "learning_rate": 8.49479480609739e-05, + "loss": 0.9692, + "step": 35928 + }, + { + "epoch": 0.951385846408643, + "grad_norm": 0.80078125, + "learning_rate": 8.494339678717345e-05, + "loss": 0.8975, + "step": 35929 + }, + { + "epoch": 0.9514123260169374, + "grad_norm": 0.83203125, + "learning_rate": 8.49388455452846e-05, + "loss": 0.8599, + "step": 35930 + }, + { + "epoch": 0.9514388056252316, + "grad_norm": 0.84375, + "learning_rate": 8.493429433531702e-05, + "loss": 0.8131, + "step": 35931 + }, + { + "epoch": 0.951465285233526, + "grad_norm": 0.75, + "learning_rate": 8.492974315728032e-05, + "loss": 0.7625, + "step": 35932 + }, + { + "epoch": 0.9514917648418204, + "grad_norm": 0.75390625, + "learning_rate": 8.492519201118416e-05, + "loss": 0.6621, + "step": 35933 + }, + { + "epoch": 0.9515182444501148, + "grad_norm": 1.21875, + "learning_rate": 8.492064089703822e-05, + "loss": 0.8465, + "step": 35934 + }, + { + "epoch": 0.9515447240584092, + "grad_norm": 0.72265625, + "learning_rate": 8.491608981485211e-05, + "loss": 0.7274, + "step": 35935 + }, + { + "epoch": 0.9515712036667036, + "grad_norm": 0.85546875, + "learning_rate": 8.491153876463551e-05, + "loss": 0.776, + "step": 35936 + }, + { + "epoch": 0.9515976832749979, + "grad_norm": 0.8828125, + "learning_rate": 8.490698774639801e-05, + "loss": 0.8205, + "step": 35937 + }, + { + "epoch": 0.9516241628832923, + "grad_norm": 0.83984375, + "learning_rate": 8.49024367601493e-05, + "loss": 0.75, + "step": 35938 + }, + { + "epoch": 0.9516506424915867, + "grad_norm": 0.80078125, + "learning_rate": 8.489788580589902e-05, + "loss": 0.7976, + "step": 35939 + }, + { + "epoch": 0.9516771220998811, + "grad_norm": 0.7890625, + "learning_rate": 8.489333488365679e-05, + "loss": 0.7983, + "step": 35940 + }, + { + "epoch": 0.9517036017081755, + "grad_norm": 0.765625, + "learning_rate": 8.48887839934323e-05, + "loss": 0.7428, + "step": 35941 + }, + { + "epoch": 0.9517300813164699, + "grad_norm": 0.78515625, + "learning_rate": 8.48842331352351e-05, + "loss": 0.7522, + "step": 35942 + }, + { + "epoch": 0.9517565609247642, + "grad_norm": 0.7890625, + "learning_rate": 8.487968230907496e-05, + "loss": 0.7602, + "step": 35943 + }, + { + "epoch": 0.9517830405330586, + "grad_norm": 0.91796875, + "learning_rate": 8.487513151496147e-05, + "loss": 0.8875, + "step": 35944 + }, + { + "epoch": 0.951809520141353, + "grad_norm": 0.83203125, + "learning_rate": 8.487058075290427e-05, + "loss": 0.8308, + "step": 35945 + }, + { + "epoch": 0.9518359997496474, + "grad_norm": 0.734375, + "learning_rate": 8.486603002291301e-05, + "loss": 0.7423, + "step": 35946 + }, + { + "epoch": 0.9518624793579418, + "grad_norm": 0.8046875, + "learning_rate": 8.48614793249973e-05, + "loss": 0.8042, + "step": 35947 + }, + { + "epoch": 0.951888958966236, + "grad_norm": 0.7734375, + "learning_rate": 8.485692865916686e-05, + "loss": 0.7645, + "step": 35948 + }, + { + "epoch": 0.9519154385745304, + "grad_norm": 0.76171875, + "learning_rate": 8.485237802543128e-05, + "loss": 0.8225, + "step": 35949 + }, + { + "epoch": 0.9519419181828248, + "grad_norm": 0.83984375, + "learning_rate": 8.484782742380023e-05, + "loss": 0.8368, + "step": 35950 + }, + { + "epoch": 0.9519683977911192, + "grad_norm": 0.77734375, + "learning_rate": 8.484327685428336e-05, + "loss": 0.7794, + "step": 35951 + }, + { + "epoch": 0.9519948773994136, + "grad_norm": 0.80078125, + "learning_rate": 8.483872631689022e-05, + "loss": 0.7523, + "step": 35952 + }, + { + "epoch": 0.952021357007708, + "grad_norm": 0.7578125, + "learning_rate": 8.48341758116306e-05, + "loss": 0.7106, + "step": 35953 + }, + { + "epoch": 0.9520478366160023, + "grad_norm": 0.89453125, + "learning_rate": 8.482962533851407e-05, + "loss": 0.6897, + "step": 35954 + }, + { + "epoch": 0.9520743162242967, + "grad_norm": 0.8203125, + "learning_rate": 8.482507489755028e-05, + "loss": 0.8339, + "step": 35955 + }, + { + "epoch": 0.9521007958325911, + "grad_norm": 0.81640625, + "learning_rate": 8.482052448874888e-05, + "loss": 0.7256, + "step": 35956 + }, + { + "epoch": 0.9521272754408855, + "grad_norm": 0.7109375, + "learning_rate": 8.481597411211949e-05, + "loss": 0.7943, + "step": 35957 + }, + { + "epoch": 0.9521537550491799, + "grad_norm": 0.73828125, + "learning_rate": 8.481142376767179e-05, + "loss": 0.8078, + "step": 35958 + }, + { + "epoch": 0.9521802346574743, + "grad_norm": 1.1171875, + "learning_rate": 8.480687345541542e-05, + "loss": 0.8462, + "step": 35959 + }, + { + "epoch": 0.9522067142657686, + "grad_norm": 0.80078125, + "learning_rate": 8.480232317536001e-05, + "loss": 0.7192, + "step": 35960 + }, + { + "epoch": 0.952233193874063, + "grad_norm": 0.7421875, + "learning_rate": 8.479777292751521e-05, + "loss": 0.8414, + "step": 35961 + }, + { + "epoch": 0.9522596734823574, + "grad_norm": 0.734375, + "learning_rate": 8.479322271189061e-05, + "loss": 0.7184, + "step": 35962 + }, + { + "epoch": 0.9522861530906518, + "grad_norm": 0.796875, + "learning_rate": 8.478867252849595e-05, + "loss": 0.8647, + "step": 35963 + }, + { + "epoch": 0.9523126326989461, + "grad_norm": 0.77734375, + "learning_rate": 8.478412237734083e-05, + "loss": 0.7785, + "step": 35964 + }, + { + "epoch": 0.9523391123072404, + "grad_norm": 0.71875, + "learning_rate": 8.477957225843489e-05, + "loss": 0.7807, + "step": 35965 + }, + { + "epoch": 0.9523655919155348, + "grad_norm": 0.95703125, + "learning_rate": 8.477502217178779e-05, + "loss": 0.7325, + "step": 35966 + }, + { + "epoch": 0.9523920715238292, + "grad_norm": 0.765625, + "learning_rate": 8.47704721174091e-05, + "loss": 0.7288, + "step": 35967 + }, + { + "epoch": 0.9524185511321236, + "grad_norm": 0.79296875, + "learning_rate": 8.476592209530858e-05, + "loss": 0.7672, + "step": 35968 + }, + { + "epoch": 0.952445030740418, + "grad_norm": 0.8671875, + "learning_rate": 8.47613721054958e-05, + "loss": 0.8251, + "step": 35969 + }, + { + "epoch": 0.9524715103487124, + "grad_norm": 0.76171875, + "learning_rate": 8.475682214798045e-05, + "loss": 0.7054, + "step": 35970 + }, + { + "epoch": 0.9524979899570067, + "grad_norm": 0.80078125, + "learning_rate": 8.47522722227721e-05, + "loss": 0.7404, + "step": 35971 + }, + { + "epoch": 0.9525244695653011, + "grad_norm": 0.73828125, + "learning_rate": 8.474772232988045e-05, + "loss": 0.6727, + "step": 35972 + }, + { + "epoch": 0.9525509491735955, + "grad_norm": 0.75390625, + "learning_rate": 8.474317246931514e-05, + "loss": 0.7584, + "step": 35973 + }, + { + "epoch": 0.9525774287818899, + "grad_norm": 0.75390625, + "learning_rate": 8.473862264108581e-05, + "loss": 0.8784, + "step": 35974 + }, + { + "epoch": 0.9526039083901843, + "grad_norm": 0.859375, + "learning_rate": 8.47340728452021e-05, + "loss": 0.8592, + "step": 35975 + }, + { + "epoch": 0.9526303879984787, + "grad_norm": 1.8046875, + "learning_rate": 8.472952308167363e-05, + "loss": 0.8237, + "step": 35976 + }, + { + "epoch": 0.952656867606773, + "grad_norm": 0.81640625, + "learning_rate": 8.472497335051007e-05, + "loss": 0.8437, + "step": 35977 + }, + { + "epoch": 0.9526833472150674, + "grad_norm": 0.7890625, + "learning_rate": 8.472042365172106e-05, + "loss": 0.777, + "step": 35978 + }, + { + "epoch": 0.9527098268233618, + "grad_norm": 0.7109375, + "learning_rate": 8.471587398531624e-05, + "loss": 0.7014, + "step": 35979 + }, + { + "epoch": 0.9527363064316561, + "grad_norm": 0.7421875, + "learning_rate": 8.471132435130525e-05, + "loss": 0.859, + "step": 35980 + }, + { + "epoch": 0.9527627860399505, + "grad_norm": 0.7578125, + "learning_rate": 8.470677474969773e-05, + "loss": 0.735, + "step": 35981 + }, + { + "epoch": 0.9527892656482448, + "grad_norm": 0.7734375, + "learning_rate": 8.470222518050332e-05, + "loss": 0.7241, + "step": 35982 + }, + { + "epoch": 0.9528157452565392, + "grad_norm": 0.7578125, + "learning_rate": 8.469767564373168e-05, + "loss": 0.7079, + "step": 35983 + }, + { + "epoch": 0.9528422248648336, + "grad_norm": 0.79296875, + "learning_rate": 8.469312613939244e-05, + "loss": 0.822, + "step": 35984 + }, + { + "epoch": 0.952868704473128, + "grad_norm": 0.81640625, + "learning_rate": 8.468857666749526e-05, + "loss": 0.6851, + "step": 35985 + }, + { + "epoch": 0.9528951840814224, + "grad_norm": 0.80859375, + "learning_rate": 8.468402722804976e-05, + "loss": 0.894, + "step": 35986 + }, + { + "epoch": 0.9529216636897168, + "grad_norm": 0.734375, + "learning_rate": 8.467947782106557e-05, + "loss": 0.8209, + "step": 35987 + }, + { + "epoch": 0.9529481432980111, + "grad_norm": 0.82421875, + "learning_rate": 8.467492844655234e-05, + "loss": 0.8343, + "step": 35988 + }, + { + "epoch": 0.9529746229063055, + "grad_norm": 0.8671875, + "learning_rate": 8.467037910451975e-05, + "loss": 0.6474, + "step": 35989 + }, + { + "epoch": 0.9530011025145999, + "grad_norm": 0.73046875, + "learning_rate": 8.466582979497742e-05, + "loss": 0.7378, + "step": 35990 + }, + { + "epoch": 0.9530275821228943, + "grad_norm": 0.7734375, + "learning_rate": 8.466128051793498e-05, + "loss": 0.7921, + "step": 35991 + }, + { + "epoch": 0.9530540617311887, + "grad_norm": 0.83984375, + "learning_rate": 8.465673127340207e-05, + "loss": 0.7594, + "step": 35992 + }, + { + "epoch": 0.9530805413394831, + "grad_norm": 0.74609375, + "learning_rate": 8.465218206138832e-05, + "loss": 0.7895, + "step": 35993 + }, + { + "epoch": 0.9531070209477774, + "grad_norm": 0.84375, + "learning_rate": 8.464763288190342e-05, + "loss": 0.9008, + "step": 35994 + }, + { + "epoch": 0.9531335005560718, + "grad_norm": 0.734375, + "learning_rate": 8.4643083734957e-05, + "loss": 0.7352, + "step": 35995 + }, + { + "epoch": 0.9531599801643662, + "grad_norm": 0.71875, + "learning_rate": 8.463853462055869e-05, + "loss": 0.7825, + "step": 35996 + }, + { + "epoch": 0.9531864597726605, + "grad_norm": 0.78125, + "learning_rate": 8.463398553871811e-05, + "loss": 0.8441, + "step": 35997 + }, + { + "epoch": 0.9532129393809549, + "grad_norm": 0.84765625, + "learning_rate": 8.462943648944491e-05, + "loss": 0.7937, + "step": 35998 + }, + { + "epoch": 0.9532394189892492, + "grad_norm": 0.8046875, + "learning_rate": 8.462488747274873e-05, + "loss": 0.788, + "step": 35999 + }, + { + "epoch": 0.9532658985975436, + "grad_norm": 0.80078125, + "learning_rate": 8.462033848863924e-05, + "loss": 0.7795, + "step": 36000 + }, + { + "epoch": 0.9532658985975436, + "eval_loss": 0.7839899659156799, + "eval_runtime": 281.3245, + "eval_samples_per_second": 35.546, + "eval_steps_per_second": 0.743, + "step": 36000 + }, + { + "epoch": 0.953292378205838, + "grad_norm": 0.76171875, + "learning_rate": 8.461578953712608e-05, + "loss": 0.7456, + "step": 36001 + }, + { + "epoch": 0.9533188578141324, + "grad_norm": 0.8046875, + "learning_rate": 8.461124061821887e-05, + "loss": 0.8013, + "step": 36002 + }, + { + "epoch": 0.9533453374224268, + "grad_norm": 0.8359375, + "learning_rate": 8.460669173192723e-05, + "loss": 0.7346, + "step": 36003 + }, + { + "epoch": 0.9533718170307212, + "grad_norm": 0.76953125, + "learning_rate": 8.460214287826085e-05, + "loss": 0.7343, + "step": 36004 + }, + { + "epoch": 0.9533982966390155, + "grad_norm": 0.78125, + "learning_rate": 8.459759405722934e-05, + "loss": 0.8038, + "step": 36005 + }, + { + "epoch": 0.9534247762473099, + "grad_norm": 0.6953125, + "learning_rate": 8.459304526884237e-05, + "loss": 0.6792, + "step": 36006 + }, + { + "epoch": 0.9534512558556043, + "grad_norm": 0.7421875, + "learning_rate": 8.458849651310954e-05, + "loss": 0.8136, + "step": 36007 + }, + { + "epoch": 0.9534777354638987, + "grad_norm": 0.94140625, + "learning_rate": 8.458394779004047e-05, + "loss": 0.7481, + "step": 36008 + }, + { + "epoch": 0.9535042150721931, + "grad_norm": 0.77734375, + "learning_rate": 8.45793990996449e-05, + "loss": 0.7701, + "step": 36009 + }, + { + "epoch": 0.9535306946804875, + "grad_norm": 0.80859375, + "learning_rate": 8.457485044193239e-05, + "loss": 0.8313, + "step": 36010 + }, + { + "epoch": 0.9535571742887818, + "grad_norm": 0.83984375, + "learning_rate": 8.45703018169126e-05, + "loss": 0.7842, + "step": 36011 + }, + { + "epoch": 0.9535836538970762, + "grad_norm": 0.79296875, + "learning_rate": 8.456575322459522e-05, + "loss": 0.7933, + "step": 36012 + }, + { + "epoch": 0.9536101335053705, + "grad_norm": 0.859375, + "learning_rate": 8.456120466498975e-05, + "loss": 0.8357, + "step": 36013 + }, + { + "epoch": 0.9536366131136649, + "grad_norm": 0.80859375, + "learning_rate": 8.4556656138106e-05, + "loss": 0.6872, + "step": 36014 + }, + { + "epoch": 0.9536630927219593, + "grad_norm": 0.7734375, + "learning_rate": 8.455210764395351e-05, + "loss": 0.8285, + "step": 36015 + }, + { + "epoch": 0.9536895723302536, + "grad_norm": 0.78515625, + "learning_rate": 8.454755918254196e-05, + "loss": 0.7719, + "step": 36016 + }, + { + "epoch": 0.953716051938548, + "grad_norm": 0.70703125, + "learning_rate": 8.454301075388097e-05, + "loss": 0.7694, + "step": 36017 + }, + { + "epoch": 0.9537425315468424, + "grad_norm": 0.7890625, + "learning_rate": 8.453846235798015e-05, + "loss": 0.6899, + "step": 36018 + }, + { + "epoch": 0.9537690111551368, + "grad_norm": 0.7578125, + "learning_rate": 8.45339139948492e-05, + "loss": 0.7909, + "step": 36019 + }, + { + "epoch": 0.9537954907634312, + "grad_norm": 0.76171875, + "learning_rate": 8.452936566449774e-05, + "loss": 0.6972, + "step": 36020 + }, + { + "epoch": 0.9538219703717256, + "grad_norm": 0.80078125, + "learning_rate": 8.452481736693541e-05, + "loss": 0.7866, + "step": 36021 + }, + { + "epoch": 0.9538484499800199, + "grad_norm": 0.796875, + "learning_rate": 8.452026910217186e-05, + "loss": 0.7358, + "step": 36022 + }, + { + "epoch": 0.9538749295883143, + "grad_norm": 0.8359375, + "learning_rate": 8.451572087021667e-05, + "loss": 0.9316, + "step": 36023 + }, + { + "epoch": 0.9539014091966087, + "grad_norm": 0.73046875, + "learning_rate": 8.451117267107955e-05, + "loss": 0.7682, + "step": 36024 + }, + { + "epoch": 0.9539278888049031, + "grad_norm": 0.84765625, + "learning_rate": 8.450662450477013e-05, + "loss": 0.885, + "step": 36025 + }, + { + "epoch": 0.9539543684131975, + "grad_norm": 0.80078125, + "learning_rate": 8.450207637129802e-05, + "loss": 0.7253, + "step": 36026 + }, + { + "epoch": 0.9539808480214919, + "grad_norm": 0.80078125, + "learning_rate": 8.449752827067288e-05, + "loss": 0.7455, + "step": 36027 + }, + { + "epoch": 0.9540073276297862, + "grad_norm": 0.7890625, + "learning_rate": 8.44929802029043e-05, + "loss": 0.8337, + "step": 36028 + }, + { + "epoch": 0.9540338072380805, + "grad_norm": 0.81640625, + "learning_rate": 8.4488432168002e-05, + "loss": 0.737, + "step": 36029 + }, + { + "epoch": 0.9540602868463749, + "grad_norm": 0.859375, + "learning_rate": 8.448388416597558e-05, + "loss": 0.8974, + "step": 36030 + }, + { + "epoch": 0.9540867664546693, + "grad_norm": 0.77734375, + "learning_rate": 8.447933619683468e-05, + "loss": 0.7518, + "step": 36031 + }, + { + "epoch": 0.9541132460629637, + "grad_norm": 0.91015625, + "learning_rate": 8.447478826058894e-05, + "loss": 0.8114, + "step": 36032 + }, + { + "epoch": 0.954139725671258, + "grad_norm": 0.76171875, + "learning_rate": 8.447024035724798e-05, + "loss": 0.7825, + "step": 36033 + }, + { + "epoch": 0.9541662052795524, + "grad_norm": 0.81640625, + "learning_rate": 8.446569248682146e-05, + "loss": 0.8704, + "step": 36034 + }, + { + "epoch": 0.9541926848878468, + "grad_norm": 0.75390625, + "learning_rate": 8.446114464931904e-05, + "loss": 0.7013, + "step": 36035 + }, + { + "epoch": 0.9542191644961412, + "grad_norm": 0.7265625, + "learning_rate": 8.445659684475033e-05, + "loss": 0.8375, + "step": 36036 + }, + { + "epoch": 0.9542456441044356, + "grad_norm": 0.81640625, + "learning_rate": 8.445204907312498e-05, + "loss": 0.7829, + "step": 36037 + }, + { + "epoch": 0.95427212371273, + "grad_norm": 0.8046875, + "learning_rate": 8.444750133445259e-05, + "loss": 0.7158, + "step": 36038 + }, + { + "epoch": 0.9542986033210243, + "grad_norm": 0.85546875, + "learning_rate": 8.444295362874286e-05, + "loss": 0.7862, + "step": 36039 + }, + { + "epoch": 0.9543250829293187, + "grad_norm": 0.79296875, + "learning_rate": 8.44384059560054e-05, + "loss": 0.8626, + "step": 36040 + }, + { + "epoch": 0.9543515625376131, + "grad_norm": 0.71484375, + "learning_rate": 8.443385831624986e-05, + "loss": 0.719, + "step": 36041 + }, + { + "epoch": 0.9543780421459075, + "grad_norm": 0.76171875, + "learning_rate": 8.442931070948585e-05, + "loss": 0.7307, + "step": 36042 + }, + { + "epoch": 0.9544045217542019, + "grad_norm": 0.84765625, + "learning_rate": 8.442476313572303e-05, + "loss": 0.8526, + "step": 36043 + }, + { + "epoch": 0.9544310013624963, + "grad_norm": 0.84765625, + "learning_rate": 8.442021559497103e-05, + "loss": 0.7382, + "step": 36044 + }, + { + "epoch": 0.9544574809707906, + "grad_norm": 0.71875, + "learning_rate": 8.441566808723951e-05, + "loss": 0.7124, + "step": 36045 + }, + { + "epoch": 0.9544839605790849, + "grad_norm": 0.75, + "learning_rate": 8.441112061253808e-05, + "loss": 0.7767, + "step": 36046 + }, + { + "epoch": 0.9545104401873793, + "grad_norm": 0.70703125, + "learning_rate": 8.44065731708764e-05, + "loss": 0.7095, + "step": 36047 + }, + { + "epoch": 0.9545369197956737, + "grad_norm": 0.8125, + "learning_rate": 8.440202576226404e-05, + "loss": 0.7664, + "step": 36048 + }, + { + "epoch": 0.9545633994039681, + "grad_norm": 0.80859375, + "learning_rate": 8.439747838671075e-05, + "loss": 0.705, + "step": 36049 + }, + { + "epoch": 0.9545898790122624, + "grad_norm": 0.8359375, + "learning_rate": 8.439293104422609e-05, + "loss": 0.8848, + "step": 36050 + }, + { + "epoch": 0.9546163586205568, + "grad_norm": 0.80078125, + "learning_rate": 8.438838373481975e-05, + "loss": 0.7956, + "step": 36051 + }, + { + "epoch": 0.9546428382288512, + "grad_norm": 1.1015625, + "learning_rate": 8.438383645850134e-05, + "loss": 0.8482, + "step": 36052 + }, + { + "epoch": 0.9546693178371456, + "grad_norm": 0.72265625, + "learning_rate": 8.437928921528043e-05, + "loss": 0.7504, + "step": 36053 + }, + { + "epoch": 0.95469579744544, + "grad_norm": 1.0390625, + "learning_rate": 8.437474200516677e-05, + "loss": 0.7688, + "step": 36054 + }, + { + "epoch": 0.9547222770537344, + "grad_norm": 0.7578125, + "learning_rate": 8.437019482816996e-05, + "loss": 0.8193, + "step": 36055 + }, + { + "epoch": 0.9547487566620287, + "grad_norm": 0.76171875, + "learning_rate": 8.436564768429964e-05, + "loss": 0.7561, + "step": 36056 + }, + { + "epoch": 0.9547752362703231, + "grad_norm": 0.70703125, + "learning_rate": 8.436110057356541e-05, + "loss": 0.7515, + "step": 36057 + }, + { + "epoch": 0.9548017158786175, + "grad_norm": 0.75, + "learning_rate": 8.435655349597689e-05, + "loss": 0.6805, + "step": 36058 + }, + { + "epoch": 0.9548281954869119, + "grad_norm": 0.75390625, + "learning_rate": 8.435200645154383e-05, + "loss": 0.6133, + "step": 36059 + }, + { + "epoch": 0.9548546750952063, + "grad_norm": 0.80859375, + "learning_rate": 8.434745944027578e-05, + "loss": 0.6639, + "step": 36060 + }, + { + "epoch": 0.9548811547035007, + "grad_norm": 0.8203125, + "learning_rate": 8.43429124621824e-05, + "loss": 0.8387, + "step": 36061 + }, + { + "epoch": 0.9549076343117949, + "grad_norm": 0.8046875, + "learning_rate": 8.433836551727333e-05, + "loss": 0.7792, + "step": 36062 + }, + { + "epoch": 0.9549341139200893, + "grad_norm": 0.6875, + "learning_rate": 8.433381860555816e-05, + "loss": 0.5624, + "step": 36063 + }, + { + "epoch": 0.9549605935283837, + "grad_norm": 0.734375, + "learning_rate": 8.43292717270466e-05, + "loss": 0.6897, + "step": 36064 + }, + { + "epoch": 0.9549870731366781, + "grad_norm": 0.84375, + "learning_rate": 8.432472488174826e-05, + "loss": 0.7766, + "step": 36065 + }, + { + "epoch": 0.9550135527449725, + "grad_norm": 0.71875, + "learning_rate": 8.432017806967273e-05, + "loss": 0.8052, + "step": 36066 + }, + { + "epoch": 0.9550400323532668, + "grad_norm": 0.87109375, + "learning_rate": 8.43156312908297e-05, + "loss": 0.7818, + "step": 36067 + }, + { + "epoch": 0.9550665119615612, + "grad_norm": 0.7734375, + "learning_rate": 8.431108454522881e-05, + "loss": 0.9002, + "step": 36068 + }, + { + "epoch": 0.9550929915698556, + "grad_norm": 0.85546875, + "learning_rate": 8.430653783287968e-05, + "loss": 0.7342, + "step": 36069 + }, + { + "epoch": 0.95511947117815, + "grad_norm": 0.7578125, + "learning_rate": 8.430199115379194e-05, + "loss": 0.7019, + "step": 36070 + }, + { + "epoch": 0.9551459507864444, + "grad_norm": 0.75390625, + "learning_rate": 8.429744450797524e-05, + "loss": 0.8337, + "step": 36071 + }, + { + "epoch": 0.9551724303947388, + "grad_norm": 0.8125, + "learning_rate": 8.429289789543919e-05, + "loss": 0.9238, + "step": 36072 + }, + { + "epoch": 0.9551989100030331, + "grad_norm": 0.734375, + "learning_rate": 8.428835131619341e-05, + "loss": 0.7213, + "step": 36073 + }, + { + "epoch": 0.9552253896113275, + "grad_norm": 0.73828125, + "learning_rate": 8.428380477024763e-05, + "loss": 0.7336, + "step": 36074 + }, + { + "epoch": 0.9552518692196219, + "grad_norm": 0.76171875, + "learning_rate": 8.427925825761142e-05, + "loss": 0.7773, + "step": 36075 + }, + { + "epoch": 0.9552783488279163, + "grad_norm": 0.78515625, + "learning_rate": 8.427471177829443e-05, + "loss": 0.8327, + "step": 36076 + }, + { + "epoch": 0.9553048284362107, + "grad_norm": 0.75, + "learning_rate": 8.427016533230627e-05, + "loss": 0.8754, + "step": 36077 + }, + { + "epoch": 0.955331308044505, + "grad_norm": 0.8359375, + "learning_rate": 8.426561891965658e-05, + "loss": 0.8728, + "step": 36078 + }, + { + "epoch": 0.9553577876527993, + "grad_norm": 0.7734375, + "learning_rate": 8.426107254035503e-05, + "loss": 0.7824, + "step": 36079 + }, + { + "epoch": 0.9553842672610937, + "grad_norm": 0.828125, + "learning_rate": 8.425652619441125e-05, + "loss": 0.8514, + "step": 36080 + }, + { + "epoch": 0.9554107468693881, + "grad_norm": 0.85546875, + "learning_rate": 8.425197988183487e-05, + "loss": 0.827, + "step": 36081 + }, + { + "epoch": 0.9554372264776825, + "grad_norm": 0.89453125, + "learning_rate": 8.42474336026355e-05, + "loss": 0.7823, + "step": 36082 + }, + { + "epoch": 0.9554637060859769, + "grad_norm": 0.81640625, + "learning_rate": 8.424288735682277e-05, + "loss": 0.8172, + "step": 36083 + }, + { + "epoch": 0.9554901856942712, + "grad_norm": 0.75, + "learning_rate": 8.423834114440639e-05, + "loss": 0.812, + "step": 36084 + }, + { + "epoch": 0.9555166653025656, + "grad_norm": 0.7578125, + "learning_rate": 8.42337949653959e-05, + "loss": 0.7438, + "step": 36085 + }, + { + "epoch": 0.95554314491086, + "grad_norm": 0.796875, + "learning_rate": 8.4229248819801e-05, + "loss": 0.8432, + "step": 36086 + }, + { + "epoch": 0.9555696245191544, + "grad_norm": 0.82421875, + "learning_rate": 8.422470270763133e-05, + "loss": 0.6591, + "step": 36087 + }, + { + "epoch": 0.9555961041274488, + "grad_norm": 0.8828125, + "learning_rate": 8.422015662889645e-05, + "loss": 0.7848, + "step": 36088 + }, + { + "epoch": 0.9556225837357432, + "grad_norm": 0.78125, + "learning_rate": 8.421561058360607e-05, + "loss": 0.8541, + "step": 36089 + }, + { + "epoch": 0.9556490633440375, + "grad_norm": 0.69921875, + "learning_rate": 8.421106457176982e-05, + "loss": 0.7997, + "step": 36090 + }, + { + "epoch": 0.9556755429523319, + "grad_norm": 0.72265625, + "learning_rate": 8.420651859339731e-05, + "loss": 0.7851, + "step": 36091 + }, + { + "epoch": 0.9557020225606263, + "grad_norm": 0.828125, + "learning_rate": 8.420197264849818e-05, + "loss": 0.7573, + "step": 36092 + }, + { + "epoch": 0.9557285021689207, + "grad_norm": 0.859375, + "learning_rate": 8.419742673708203e-05, + "loss": 0.723, + "step": 36093 + }, + { + "epoch": 0.9557549817772151, + "grad_norm": 0.81640625, + "learning_rate": 8.419288085915855e-05, + "loss": 0.7489, + "step": 36094 + }, + { + "epoch": 0.9557814613855093, + "grad_norm": 0.86328125, + "learning_rate": 8.418833501473737e-05, + "loss": 0.6797, + "step": 36095 + }, + { + "epoch": 0.9558079409938037, + "grad_norm": 0.83203125, + "learning_rate": 8.41837892038281e-05, + "loss": 0.8359, + "step": 36096 + }, + { + "epoch": 0.9558344206020981, + "grad_norm": 0.70703125, + "learning_rate": 8.417924342644042e-05, + "loss": 0.6679, + "step": 36097 + }, + { + "epoch": 0.9558609002103925, + "grad_norm": 0.7421875, + "learning_rate": 8.417469768258385e-05, + "loss": 0.7418, + "step": 36098 + }, + { + "epoch": 0.9558873798186869, + "grad_norm": 0.71875, + "learning_rate": 8.417015197226816e-05, + "loss": 0.7641, + "step": 36099 + }, + { + "epoch": 0.9559138594269813, + "grad_norm": 0.78515625, + "learning_rate": 8.416560629550293e-05, + "loss": 0.7373, + "step": 36100 + }, + { + "epoch": 0.9559403390352756, + "grad_norm": 1.65625, + "learning_rate": 8.416106065229778e-05, + "loss": 0.7558, + "step": 36101 + }, + { + "epoch": 0.95596681864357, + "grad_norm": 0.7890625, + "learning_rate": 8.415651504266238e-05, + "loss": 0.7854, + "step": 36102 + }, + { + "epoch": 0.9559932982518644, + "grad_norm": 0.78125, + "learning_rate": 8.415196946660628e-05, + "loss": 0.8035, + "step": 36103 + }, + { + "epoch": 0.9560197778601588, + "grad_norm": 0.7734375, + "learning_rate": 8.41474239241392e-05, + "loss": 0.7753, + "step": 36104 + }, + { + "epoch": 0.9560462574684532, + "grad_norm": 0.83984375, + "learning_rate": 8.414287841527079e-05, + "loss": 0.8659, + "step": 36105 + }, + { + "epoch": 0.9560727370767476, + "grad_norm": 0.8515625, + "learning_rate": 8.413833294001062e-05, + "loss": 0.9053, + "step": 36106 + }, + { + "epoch": 0.9560992166850419, + "grad_norm": 0.69921875, + "learning_rate": 8.413378749836836e-05, + "loss": 0.6801, + "step": 36107 + }, + { + "epoch": 0.9561256962933363, + "grad_norm": 0.73046875, + "learning_rate": 8.412924209035359e-05, + "loss": 0.7038, + "step": 36108 + }, + { + "epoch": 0.9561521759016307, + "grad_norm": 0.7890625, + "learning_rate": 8.412469671597602e-05, + "loss": 0.7934, + "step": 36109 + }, + { + "epoch": 0.9561786555099251, + "grad_norm": 0.78125, + "learning_rate": 8.412015137524524e-05, + "loss": 0.8182, + "step": 36110 + }, + { + "epoch": 0.9562051351182194, + "grad_norm": 0.7265625, + "learning_rate": 8.41156060681709e-05, + "loss": 0.6867, + "step": 36111 + }, + { + "epoch": 0.9562316147265137, + "grad_norm": 0.8046875, + "learning_rate": 8.411106079476261e-05, + "loss": 0.8893, + "step": 36112 + }, + { + "epoch": 0.9562580943348081, + "grad_norm": 0.7578125, + "learning_rate": 8.410651555502998e-05, + "loss": 0.7384, + "step": 36113 + }, + { + "epoch": 0.9562845739431025, + "grad_norm": 0.8046875, + "learning_rate": 8.410197034898274e-05, + "loss": 0.9014, + "step": 36114 + }, + { + "epoch": 0.9563110535513969, + "grad_norm": 1.25, + "learning_rate": 8.409742517663045e-05, + "loss": 0.8485, + "step": 36115 + }, + { + "epoch": 0.9563375331596913, + "grad_norm": 0.8203125, + "learning_rate": 8.409288003798277e-05, + "loss": 0.9422, + "step": 36116 + }, + { + "epoch": 0.9563640127679857, + "grad_norm": 0.76171875, + "learning_rate": 8.408833493304931e-05, + "loss": 0.7552, + "step": 36117 + }, + { + "epoch": 0.95639049237628, + "grad_norm": 0.71875, + "learning_rate": 8.408378986183969e-05, + "loss": 0.7424, + "step": 36118 + }, + { + "epoch": 0.9564169719845744, + "grad_norm": 0.69140625, + "learning_rate": 8.407924482436357e-05, + "loss": 0.647, + "step": 36119 + }, + { + "epoch": 0.9564434515928688, + "grad_norm": 0.76171875, + "learning_rate": 8.407469982063063e-05, + "loss": 0.8773, + "step": 36120 + }, + { + "epoch": 0.9564699312011632, + "grad_norm": 0.8828125, + "learning_rate": 8.407015485065042e-05, + "loss": 0.6544, + "step": 36121 + }, + { + "epoch": 0.9564964108094576, + "grad_norm": 0.99609375, + "learning_rate": 8.406560991443261e-05, + "loss": 0.7701, + "step": 36122 + }, + { + "epoch": 0.956522890417752, + "grad_norm": 0.88671875, + "learning_rate": 8.406106501198681e-05, + "loss": 0.8006, + "step": 36123 + }, + { + "epoch": 0.9565493700260463, + "grad_norm": 0.7421875, + "learning_rate": 8.40565201433227e-05, + "loss": 0.7584, + "step": 36124 + }, + { + "epoch": 0.9565758496343407, + "grad_norm": 0.7421875, + "learning_rate": 8.405197530844987e-05, + "loss": 0.8291, + "step": 36125 + }, + { + "epoch": 0.9566023292426351, + "grad_norm": 0.828125, + "learning_rate": 8.404743050737797e-05, + "loss": 0.856, + "step": 36126 + }, + { + "epoch": 0.9566288088509294, + "grad_norm": 0.73828125, + "learning_rate": 8.404288574011665e-05, + "loss": 0.8263, + "step": 36127 + }, + { + "epoch": 0.9566552884592238, + "grad_norm": 0.77734375, + "learning_rate": 8.403834100667549e-05, + "loss": 0.6409, + "step": 36128 + }, + { + "epoch": 0.9566817680675181, + "grad_norm": 0.82421875, + "learning_rate": 8.403379630706417e-05, + "loss": 0.7076, + "step": 36129 + }, + { + "epoch": 0.9567082476758125, + "grad_norm": 0.75390625, + "learning_rate": 8.402925164129232e-05, + "loss": 0.7268, + "step": 36130 + }, + { + "epoch": 0.9567347272841069, + "grad_norm": 0.703125, + "learning_rate": 8.402470700936954e-05, + "loss": 0.7763, + "step": 36131 + }, + { + "epoch": 0.9567612068924013, + "grad_norm": 0.8515625, + "learning_rate": 8.402016241130549e-05, + "loss": 0.921, + "step": 36132 + }, + { + "epoch": 0.9567876865006957, + "grad_norm": 0.7109375, + "learning_rate": 8.401561784710978e-05, + "loss": 0.8449, + "step": 36133 + }, + { + "epoch": 0.9568141661089901, + "grad_norm": 0.80859375, + "learning_rate": 8.401107331679203e-05, + "loss": 0.783, + "step": 36134 + }, + { + "epoch": 0.9568406457172844, + "grad_norm": 0.8203125, + "learning_rate": 8.400652882036194e-05, + "loss": 0.8066, + "step": 36135 + }, + { + "epoch": 0.9568671253255788, + "grad_norm": 0.7890625, + "learning_rate": 8.400198435782909e-05, + "loss": 0.9421, + "step": 36136 + }, + { + "epoch": 0.9568936049338732, + "grad_norm": 0.7734375, + "learning_rate": 8.399743992920311e-05, + "loss": 0.7462, + "step": 36137 + }, + { + "epoch": 0.9569200845421676, + "grad_norm": 0.67578125, + "learning_rate": 8.399289553449365e-05, + "loss": 0.6853, + "step": 36138 + }, + { + "epoch": 0.956946564150462, + "grad_norm": 0.7265625, + "learning_rate": 8.39883511737103e-05, + "loss": 0.6992, + "step": 36139 + }, + { + "epoch": 0.9569730437587564, + "grad_norm": 0.76953125, + "learning_rate": 8.398380684686276e-05, + "loss": 0.7597, + "step": 36140 + }, + { + "epoch": 0.9569995233670507, + "grad_norm": 0.77734375, + "learning_rate": 8.397926255396062e-05, + "loss": 0.7885, + "step": 36141 + }, + { + "epoch": 0.9570260029753451, + "grad_norm": 0.79296875, + "learning_rate": 8.397471829501352e-05, + "loss": 0.8028, + "step": 36142 + }, + { + "epoch": 0.9570524825836395, + "grad_norm": 0.8515625, + "learning_rate": 8.397017407003109e-05, + "loss": 0.7462, + "step": 36143 + }, + { + "epoch": 0.9570789621919338, + "grad_norm": 0.8125, + "learning_rate": 8.396562987902292e-05, + "loss": 0.6564, + "step": 36144 + }, + { + "epoch": 0.9571054418002282, + "grad_norm": 0.734375, + "learning_rate": 8.396108572199872e-05, + "loss": 0.7119, + "step": 36145 + }, + { + "epoch": 0.9571319214085225, + "grad_norm": 0.77734375, + "learning_rate": 8.395654159896809e-05, + "loss": 0.9428, + "step": 36146 + }, + { + "epoch": 0.9571584010168169, + "grad_norm": 0.73046875, + "learning_rate": 8.395199750994065e-05, + "loss": 0.7337, + "step": 36147 + }, + { + "epoch": 0.9571848806251113, + "grad_norm": 0.77734375, + "learning_rate": 8.394745345492603e-05, + "loss": 0.7508, + "step": 36148 + }, + { + "epoch": 0.9572113602334057, + "grad_norm": 1.03125, + "learning_rate": 8.394290943393383e-05, + "loss": 0.7594, + "step": 36149 + }, + { + "epoch": 0.9572378398417001, + "grad_norm": 0.765625, + "learning_rate": 8.393836544697376e-05, + "loss": 0.7893, + "step": 36150 + }, + { + "epoch": 0.9572643194499945, + "grad_norm": 0.7421875, + "learning_rate": 8.39338214940554e-05, + "loss": 0.7551, + "step": 36151 + }, + { + "epoch": 0.9572907990582888, + "grad_norm": 0.74609375, + "learning_rate": 8.392927757518835e-05, + "loss": 0.7151, + "step": 36152 + }, + { + "epoch": 0.9573172786665832, + "grad_norm": 0.80078125, + "learning_rate": 8.392473369038231e-05, + "loss": 0.8276, + "step": 36153 + }, + { + "epoch": 0.9573437582748776, + "grad_norm": 0.7421875, + "learning_rate": 8.392018983964687e-05, + "loss": 0.7498, + "step": 36154 + }, + { + "epoch": 0.957370237883172, + "grad_norm": 0.72265625, + "learning_rate": 8.391564602299168e-05, + "loss": 0.7766, + "step": 36155 + }, + { + "epoch": 0.9573967174914664, + "grad_norm": 0.83203125, + "learning_rate": 8.391110224042635e-05, + "loss": 0.7886, + "step": 36156 + }, + { + "epoch": 0.9574231970997608, + "grad_norm": 0.82421875, + "learning_rate": 8.390655849196053e-05, + "loss": 0.7616, + "step": 36157 + }, + { + "epoch": 0.9574496767080551, + "grad_norm": 0.8203125, + "learning_rate": 8.390201477760385e-05, + "loss": 0.8149, + "step": 36158 + }, + { + "epoch": 0.9574761563163495, + "grad_norm": 0.83984375, + "learning_rate": 8.389747109736587e-05, + "loss": 0.8821, + "step": 36159 + }, + { + "epoch": 0.9575026359246438, + "grad_norm": 0.81640625, + "learning_rate": 8.389292745125633e-05, + "loss": 0.7276, + "step": 36160 + }, + { + "epoch": 0.9575291155329382, + "grad_norm": 0.83203125, + "learning_rate": 8.388838383928479e-05, + "loss": 0.7911, + "step": 36161 + }, + { + "epoch": 0.9575555951412326, + "grad_norm": 0.78125, + "learning_rate": 8.388384026146092e-05, + "loss": 0.6861, + "step": 36162 + }, + { + "epoch": 0.957582074749527, + "grad_norm": 0.78125, + "learning_rate": 8.387929671779432e-05, + "loss": 0.8836, + "step": 36163 + }, + { + "epoch": 0.9576085543578213, + "grad_norm": 1.1875, + "learning_rate": 8.38747532082946e-05, + "loss": 0.8129, + "step": 36164 + }, + { + "epoch": 0.9576350339661157, + "grad_norm": 0.7265625, + "learning_rate": 8.387020973297144e-05, + "loss": 0.7451, + "step": 36165 + }, + { + "epoch": 0.9576615135744101, + "grad_norm": 0.82421875, + "learning_rate": 8.386566629183446e-05, + "loss": 0.7779, + "step": 36166 + }, + { + "epoch": 0.9576879931827045, + "grad_norm": 0.859375, + "learning_rate": 8.386112288489329e-05, + "loss": 0.8073, + "step": 36167 + }, + { + "epoch": 0.9577144727909989, + "grad_norm": 0.83203125, + "learning_rate": 8.385657951215753e-05, + "loss": 0.7857, + "step": 36168 + }, + { + "epoch": 0.9577409523992932, + "grad_norm": 0.76953125, + "learning_rate": 8.385203617363682e-05, + "loss": 0.9691, + "step": 36169 + }, + { + "epoch": 0.9577674320075876, + "grad_norm": 0.765625, + "learning_rate": 8.384749286934078e-05, + "loss": 0.7476, + "step": 36170 + }, + { + "epoch": 0.957793911615882, + "grad_norm": 0.7265625, + "learning_rate": 8.384294959927908e-05, + "loss": 0.7648, + "step": 36171 + }, + { + "epoch": 0.9578203912241764, + "grad_norm": 0.7734375, + "learning_rate": 8.383840636346135e-05, + "loss": 0.7908, + "step": 36172 + }, + { + "epoch": 0.9578468708324708, + "grad_norm": 0.7578125, + "learning_rate": 8.383386316189716e-05, + "loss": 0.8067, + "step": 36173 + }, + { + "epoch": 0.9578733504407652, + "grad_norm": 0.78125, + "learning_rate": 8.382931999459617e-05, + "loss": 0.7315, + "step": 36174 + }, + { + "epoch": 0.9578998300490595, + "grad_norm": 0.7734375, + "learning_rate": 8.382477686156802e-05, + "loss": 0.7618, + "step": 36175 + }, + { + "epoch": 0.9579263096573538, + "grad_norm": 0.82421875, + "learning_rate": 8.382023376282234e-05, + "loss": 0.7876, + "step": 36176 + }, + { + "epoch": 0.9579527892656482, + "grad_norm": 0.7734375, + "learning_rate": 8.381569069836875e-05, + "loss": 0.6955, + "step": 36177 + }, + { + "epoch": 0.9579792688739426, + "grad_norm": 0.7734375, + "learning_rate": 8.381114766821689e-05, + "loss": 0.8207, + "step": 36178 + }, + { + "epoch": 0.958005748482237, + "grad_norm": 0.7578125, + "learning_rate": 8.380660467237631e-05, + "loss": 0.7825, + "step": 36179 + }, + { + "epoch": 0.9580322280905313, + "grad_norm": 0.76171875, + "learning_rate": 8.380206171085676e-05, + "loss": 0.7449, + "step": 36180 + }, + { + "epoch": 0.9580587076988257, + "grad_norm": 0.78515625, + "learning_rate": 8.379751878366782e-05, + "loss": 0.7199, + "step": 36181 + }, + { + "epoch": 0.9580851873071201, + "grad_norm": 0.7734375, + "learning_rate": 8.379297589081911e-05, + "loss": 0.8224, + "step": 36182 + }, + { + "epoch": 0.9581116669154145, + "grad_norm": 0.83203125, + "learning_rate": 8.378843303232025e-05, + "loss": 0.7906, + "step": 36183 + }, + { + "epoch": 0.9581381465237089, + "grad_norm": 0.7421875, + "learning_rate": 8.378389020818086e-05, + "loss": 0.7014, + "step": 36184 + }, + { + "epoch": 0.9581646261320033, + "grad_norm": 0.84765625, + "learning_rate": 8.377934741841061e-05, + "loss": 0.7476, + "step": 36185 + }, + { + "epoch": 0.9581911057402976, + "grad_norm": 0.82421875, + "learning_rate": 8.377480466301912e-05, + "loss": 0.8635, + "step": 36186 + }, + { + "epoch": 0.958217585348592, + "grad_norm": 0.76171875, + "learning_rate": 8.377026194201599e-05, + "loss": 0.7664, + "step": 36187 + }, + { + "epoch": 0.9582440649568864, + "grad_norm": 0.78125, + "learning_rate": 8.376571925541088e-05, + "loss": 0.7158, + "step": 36188 + }, + { + "epoch": 0.9582705445651808, + "grad_norm": 0.85546875, + "learning_rate": 8.376117660321335e-05, + "loss": 0.7861, + "step": 36189 + }, + { + "epoch": 0.9582970241734752, + "grad_norm": 0.8671875, + "learning_rate": 8.375663398543311e-05, + "loss": 0.8262, + "step": 36190 + }, + { + "epoch": 0.9583235037817696, + "grad_norm": 0.8515625, + "learning_rate": 8.375209140207976e-05, + "loss": 0.825, + "step": 36191 + }, + { + "epoch": 0.9583499833900639, + "grad_norm": 0.8359375, + "learning_rate": 8.374754885316293e-05, + "loss": 0.9309, + "step": 36192 + }, + { + "epoch": 0.9583764629983582, + "grad_norm": 0.8046875, + "learning_rate": 8.374300633869223e-05, + "loss": 0.7545, + "step": 36193 + }, + { + "epoch": 0.9584029426066526, + "grad_norm": 0.8046875, + "learning_rate": 8.37384638586773e-05, + "loss": 0.865, + "step": 36194 + }, + { + "epoch": 0.958429422214947, + "grad_norm": 0.83984375, + "learning_rate": 8.373392141312777e-05, + "loss": 0.9215, + "step": 36195 + }, + { + "epoch": 0.9584559018232414, + "grad_norm": 0.65625, + "learning_rate": 8.372937900205326e-05, + "loss": 0.6383, + "step": 36196 + }, + { + "epoch": 0.9584823814315357, + "grad_norm": 0.77734375, + "learning_rate": 8.372483662546342e-05, + "loss": 0.8055, + "step": 36197 + }, + { + "epoch": 0.9585088610398301, + "grad_norm": 0.78515625, + "learning_rate": 8.372029428336783e-05, + "loss": 0.8129, + "step": 36198 + }, + { + "epoch": 0.9585353406481245, + "grad_norm": 1.234375, + "learning_rate": 8.371575197577613e-05, + "loss": 0.7719, + "step": 36199 + }, + { + "epoch": 0.9585618202564189, + "grad_norm": 0.77734375, + "learning_rate": 8.3711209702698e-05, + "loss": 0.7804, + "step": 36200 + }, + { + "epoch": 0.9585882998647133, + "grad_norm": 0.8671875, + "learning_rate": 8.370666746414301e-05, + "loss": 0.7452, + "step": 36201 + }, + { + "epoch": 0.9586147794730077, + "grad_norm": 0.80859375, + "learning_rate": 8.370212526012083e-05, + "loss": 0.815, + "step": 36202 + }, + { + "epoch": 0.958641259081302, + "grad_norm": 0.765625, + "learning_rate": 8.369758309064106e-05, + "loss": 0.8536, + "step": 36203 + }, + { + "epoch": 0.9586677386895964, + "grad_norm": 0.78515625, + "learning_rate": 8.369304095571328e-05, + "loss": 0.7559, + "step": 36204 + }, + { + "epoch": 0.9586942182978908, + "grad_norm": 0.76171875, + "learning_rate": 8.368849885534722e-05, + "loss": 0.849, + "step": 36205 + }, + { + "epoch": 0.9587206979061852, + "grad_norm": 0.90234375, + "learning_rate": 8.368395678955246e-05, + "loss": 0.8459, + "step": 36206 + }, + { + "epoch": 0.9587471775144796, + "grad_norm": 0.82421875, + "learning_rate": 8.36794147583386e-05, + "loss": 0.8031, + "step": 36207 + }, + { + "epoch": 0.958773657122774, + "grad_norm": 0.8671875, + "learning_rate": 8.36748727617153e-05, + "loss": 0.8295, + "step": 36208 + }, + { + "epoch": 0.9588001367310682, + "grad_norm": 0.79296875, + "learning_rate": 8.367033079969213e-05, + "loss": 0.7297, + "step": 36209 + }, + { + "epoch": 0.9588266163393626, + "grad_norm": 0.81640625, + "learning_rate": 8.36657888722788e-05, + "loss": 0.7621, + "step": 36210 + }, + { + "epoch": 0.958853095947657, + "grad_norm": 0.78125, + "learning_rate": 8.36612469794849e-05, + "loss": 0.6782, + "step": 36211 + }, + { + "epoch": 0.9588795755559514, + "grad_norm": 0.6875, + "learning_rate": 8.365670512132006e-05, + "loss": 0.6636, + "step": 36212 + }, + { + "epoch": 0.9589060551642458, + "grad_norm": 0.81640625, + "learning_rate": 8.365216329779388e-05, + "loss": 0.774, + "step": 36213 + }, + { + "epoch": 0.9589325347725401, + "grad_norm": 0.7734375, + "learning_rate": 8.364762150891601e-05, + "loss": 0.8896, + "step": 36214 + }, + { + "epoch": 0.9589590143808345, + "grad_norm": 0.78515625, + "learning_rate": 8.364307975469608e-05, + "loss": 0.8534, + "step": 36215 + }, + { + "epoch": 0.9589854939891289, + "grad_norm": 0.75390625, + "learning_rate": 8.36385380351437e-05, + "loss": 0.8054, + "step": 36216 + }, + { + "epoch": 0.9590119735974233, + "grad_norm": 0.7578125, + "learning_rate": 8.363399635026852e-05, + "loss": 0.7695, + "step": 36217 + }, + { + "epoch": 0.9590384532057177, + "grad_norm": 0.8671875, + "learning_rate": 8.362945470008013e-05, + "loss": 0.7562, + "step": 36218 + }, + { + "epoch": 0.9590649328140121, + "grad_norm": 0.78515625, + "learning_rate": 8.362491308458815e-05, + "loss": 0.7765, + "step": 36219 + }, + { + "epoch": 0.9590914124223064, + "grad_norm": 0.75390625, + "learning_rate": 8.362037150380227e-05, + "loss": 0.7359, + "step": 36220 + }, + { + "epoch": 0.9591178920306008, + "grad_norm": 0.8359375, + "learning_rate": 8.361582995773206e-05, + "loss": 0.7114, + "step": 36221 + }, + { + "epoch": 0.9591443716388952, + "grad_norm": 0.7421875, + "learning_rate": 8.361128844638717e-05, + "loss": 0.7515, + "step": 36222 + }, + { + "epoch": 0.9591708512471896, + "grad_norm": 1.4921875, + "learning_rate": 8.360674696977722e-05, + "loss": 0.7417, + "step": 36223 + }, + { + "epoch": 0.959197330855484, + "grad_norm": 0.734375, + "learning_rate": 8.36022055279118e-05, + "loss": 0.6886, + "step": 36224 + }, + { + "epoch": 0.9592238104637782, + "grad_norm": 0.75390625, + "learning_rate": 8.35976641208006e-05, + "loss": 0.791, + "step": 36225 + }, + { + "epoch": 0.9592502900720726, + "grad_norm": 2.015625, + "learning_rate": 8.359312274845322e-05, + "loss": 0.7781, + "step": 36226 + }, + { + "epoch": 0.959276769680367, + "grad_norm": 0.74609375, + "learning_rate": 8.358858141087928e-05, + "loss": 0.761, + "step": 36227 + }, + { + "epoch": 0.9593032492886614, + "grad_norm": 0.96875, + "learning_rate": 8.358404010808839e-05, + "loss": 0.7138, + "step": 36228 + }, + { + "epoch": 0.9593297288969558, + "grad_norm": 0.8046875, + "learning_rate": 8.357949884009016e-05, + "loss": 0.8339, + "step": 36229 + }, + { + "epoch": 0.9593562085052502, + "grad_norm": 0.984375, + "learning_rate": 8.357495760689426e-05, + "loss": 0.891, + "step": 36230 + }, + { + "epoch": 0.9593826881135445, + "grad_norm": 0.765625, + "learning_rate": 8.357041640851034e-05, + "loss": 0.8016, + "step": 36231 + }, + { + "epoch": 0.9594091677218389, + "grad_norm": 0.76171875, + "learning_rate": 8.356587524494796e-05, + "loss": 0.7877, + "step": 36232 + }, + { + "epoch": 0.9594356473301333, + "grad_norm": 0.80078125, + "learning_rate": 8.356133411621676e-05, + "loss": 0.8733, + "step": 36233 + }, + { + "epoch": 0.9594621269384277, + "grad_norm": 0.796875, + "learning_rate": 8.355679302232637e-05, + "loss": 0.8489, + "step": 36234 + }, + { + "epoch": 0.9594886065467221, + "grad_norm": 0.71484375, + "learning_rate": 8.355225196328642e-05, + "loss": 0.7181, + "step": 36235 + }, + { + "epoch": 0.9595150861550165, + "grad_norm": 0.796875, + "learning_rate": 8.354771093910655e-05, + "loss": 0.7626, + "step": 36236 + }, + { + "epoch": 0.9595415657633108, + "grad_norm": 0.73828125, + "learning_rate": 8.354316994979634e-05, + "loss": 0.733, + "step": 36237 + }, + { + "epoch": 0.9595680453716052, + "grad_norm": 0.80859375, + "learning_rate": 8.353862899536545e-05, + "loss": 0.7455, + "step": 36238 + }, + { + "epoch": 0.9595945249798996, + "grad_norm": 0.7734375, + "learning_rate": 8.353408807582348e-05, + "loss": 0.8783, + "step": 36239 + }, + { + "epoch": 0.959621004588194, + "grad_norm": 0.734375, + "learning_rate": 8.35295471911801e-05, + "loss": 0.7062, + "step": 36240 + }, + { + "epoch": 0.9596474841964884, + "grad_norm": 0.77734375, + "learning_rate": 8.352500634144489e-05, + "loss": 0.7576, + "step": 36241 + }, + { + "epoch": 0.9596739638047826, + "grad_norm": 0.8125, + "learning_rate": 8.35204655266275e-05, + "loss": 0.8796, + "step": 36242 + }, + { + "epoch": 0.959700443413077, + "grad_norm": 0.86328125, + "learning_rate": 8.351592474673753e-05, + "loss": 0.8506, + "step": 36243 + }, + { + "epoch": 0.9597269230213714, + "grad_norm": 0.75390625, + "learning_rate": 8.351138400178457e-05, + "loss": 0.8088, + "step": 36244 + }, + { + "epoch": 0.9597534026296658, + "grad_norm": 0.796875, + "learning_rate": 8.350684329177836e-05, + "loss": 0.7515, + "step": 36245 + }, + { + "epoch": 0.9597798822379602, + "grad_norm": 0.7890625, + "learning_rate": 8.350230261672842e-05, + "loss": 0.8588, + "step": 36246 + }, + { + "epoch": 0.9598063618462546, + "grad_norm": 0.8203125, + "learning_rate": 8.349776197664442e-05, + "loss": 0.7461, + "step": 36247 + }, + { + "epoch": 0.959832841454549, + "grad_norm": 0.7890625, + "learning_rate": 8.349322137153597e-05, + "loss": 0.838, + "step": 36248 + }, + { + "epoch": 0.9598593210628433, + "grad_norm": 0.8203125, + "learning_rate": 8.348868080141265e-05, + "loss": 0.8082, + "step": 36249 + }, + { + "epoch": 0.9598858006711377, + "grad_norm": 0.78125, + "learning_rate": 8.348414026628417e-05, + "loss": 0.901, + "step": 36250 + }, + { + "epoch": 0.9599122802794321, + "grad_norm": 0.80859375, + "learning_rate": 8.347959976616011e-05, + "loss": 0.8144, + "step": 36251 + }, + { + "epoch": 0.9599387598877265, + "grad_norm": 0.890625, + "learning_rate": 8.347505930105011e-05, + "loss": 0.7835, + "step": 36252 + }, + { + "epoch": 0.9599652394960209, + "grad_norm": 0.796875, + "learning_rate": 8.347051887096376e-05, + "loss": 0.767, + "step": 36253 + }, + { + "epoch": 0.9599917191043152, + "grad_norm": 0.71484375, + "learning_rate": 8.346597847591068e-05, + "loss": 0.8108, + "step": 36254 + }, + { + "epoch": 0.9600181987126096, + "grad_norm": 0.796875, + "learning_rate": 8.346143811590054e-05, + "loss": 0.7152, + "step": 36255 + }, + { + "epoch": 0.960044678320904, + "grad_norm": 0.82421875, + "learning_rate": 8.34568977909429e-05, + "loss": 0.8033, + "step": 36256 + }, + { + "epoch": 0.9600711579291984, + "grad_norm": 0.796875, + "learning_rate": 8.345235750104746e-05, + "loss": 0.7803, + "step": 36257 + }, + { + "epoch": 0.9600976375374927, + "grad_norm": 0.85546875, + "learning_rate": 8.34478172462238e-05, + "loss": 0.871, + "step": 36258 + }, + { + "epoch": 0.960124117145787, + "grad_norm": 0.765625, + "learning_rate": 8.344327702648151e-05, + "loss": 0.8793, + "step": 36259 + }, + { + "epoch": 0.9601505967540814, + "grad_norm": 0.82421875, + "learning_rate": 8.343873684183028e-05, + "loss": 0.7947, + "step": 36260 + }, + { + "epoch": 0.9601770763623758, + "grad_norm": 0.9296875, + "learning_rate": 8.343419669227969e-05, + "loss": 0.8956, + "step": 36261 + }, + { + "epoch": 0.9602035559706702, + "grad_norm": 0.828125, + "learning_rate": 8.342965657783938e-05, + "loss": 0.8612, + "step": 36262 + }, + { + "epoch": 0.9602300355789646, + "grad_norm": 0.72265625, + "learning_rate": 8.342511649851898e-05, + "loss": 0.7626, + "step": 36263 + }, + { + "epoch": 0.960256515187259, + "grad_norm": 0.7421875, + "learning_rate": 8.342057645432805e-05, + "loss": 0.7991, + "step": 36264 + }, + { + "epoch": 0.9602829947955533, + "grad_norm": 0.79296875, + "learning_rate": 8.34160364452763e-05, + "loss": 0.7363, + "step": 36265 + }, + { + "epoch": 0.9603094744038477, + "grad_norm": 0.84375, + "learning_rate": 8.341149647137331e-05, + "loss": 0.9166, + "step": 36266 + }, + { + "epoch": 0.9603359540121421, + "grad_norm": 0.94921875, + "learning_rate": 8.34069565326287e-05, + "loss": 0.7342, + "step": 36267 + }, + { + "epoch": 0.9603624336204365, + "grad_norm": 0.8515625, + "learning_rate": 8.34024166290521e-05, + "loss": 0.8456, + "step": 36268 + }, + { + "epoch": 0.9603889132287309, + "grad_norm": 0.75390625, + "learning_rate": 8.33978767606531e-05, + "loss": 0.8264, + "step": 36269 + }, + { + "epoch": 0.9604153928370253, + "grad_norm": 0.75390625, + "learning_rate": 8.339333692744138e-05, + "loss": 0.7838, + "step": 36270 + }, + { + "epoch": 0.9604418724453196, + "grad_norm": 0.77734375, + "learning_rate": 8.338879712942653e-05, + "loss": 0.7502, + "step": 36271 + }, + { + "epoch": 0.960468352053614, + "grad_norm": 0.76171875, + "learning_rate": 8.338425736661818e-05, + "loss": 0.7282, + "step": 36272 + }, + { + "epoch": 0.9604948316619084, + "grad_norm": 0.81640625, + "learning_rate": 8.337971763902594e-05, + "loss": 0.8566, + "step": 36273 + }, + { + "epoch": 0.9605213112702027, + "grad_norm": 0.71875, + "learning_rate": 8.337517794665945e-05, + "loss": 0.7133, + "step": 36274 + }, + { + "epoch": 0.9605477908784971, + "grad_norm": 0.73828125, + "learning_rate": 8.337063828952828e-05, + "loss": 0.7049, + "step": 36275 + }, + { + "epoch": 0.9605742704867914, + "grad_norm": 0.75, + "learning_rate": 8.336609866764212e-05, + "loss": 0.7846, + "step": 36276 + }, + { + "epoch": 0.9606007500950858, + "grad_norm": 0.7421875, + "learning_rate": 8.336155908101058e-05, + "loss": 0.8151, + "step": 36277 + }, + { + "epoch": 0.9606272297033802, + "grad_norm": 0.796875, + "learning_rate": 8.335701952964325e-05, + "loss": 0.7159, + "step": 36278 + }, + { + "epoch": 0.9606537093116746, + "grad_norm": 0.80078125, + "learning_rate": 8.335248001354978e-05, + "loss": 0.7528, + "step": 36279 + }, + { + "epoch": 0.960680188919969, + "grad_norm": 0.7734375, + "learning_rate": 8.334794053273974e-05, + "loss": 0.8619, + "step": 36280 + }, + { + "epoch": 0.9607066685282634, + "grad_norm": 0.671875, + "learning_rate": 8.334340108722282e-05, + "loss": 0.7076, + "step": 36281 + }, + { + "epoch": 0.9607331481365577, + "grad_norm": 0.84375, + "learning_rate": 8.33388616770086e-05, + "loss": 0.856, + "step": 36282 + }, + { + "epoch": 0.9607596277448521, + "grad_norm": 0.77734375, + "learning_rate": 8.333432230210673e-05, + "loss": 0.7692, + "step": 36283 + }, + { + "epoch": 0.9607861073531465, + "grad_norm": 0.81640625, + "learning_rate": 8.332978296252679e-05, + "loss": 0.7277, + "step": 36284 + }, + { + "epoch": 0.9608125869614409, + "grad_norm": 0.7578125, + "learning_rate": 8.332524365827839e-05, + "loss": 0.7717, + "step": 36285 + }, + { + "epoch": 0.9608390665697353, + "grad_norm": 0.77734375, + "learning_rate": 8.332070438937123e-05, + "loss": 0.8136, + "step": 36286 + }, + { + "epoch": 0.9608655461780297, + "grad_norm": 1.34375, + "learning_rate": 8.331616515581487e-05, + "loss": 0.8001, + "step": 36287 + }, + { + "epoch": 0.960892025786324, + "grad_norm": 0.84375, + "learning_rate": 8.331162595761895e-05, + "loss": 0.7621, + "step": 36288 + }, + { + "epoch": 0.9609185053946184, + "grad_norm": 1.65625, + "learning_rate": 8.33070867947931e-05, + "loss": 0.7515, + "step": 36289 + }, + { + "epoch": 0.9609449850029128, + "grad_norm": 0.859375, + "learning_rate": 8.330254766734685e-05, + "loss": 0.8069, + "step": 36290 + }, + { + "epoch": 0.9609714646112071, + "grad_norm": 0.80859375, + "learning_rate": 8.329800857528996e-05, + "loss": 0.7681, + "step": 36291 + }, + { + "epoch": 0.9609979442195015, + "grad_norm": 0.84375, + "learning_rate": 8.329346951863199e-05, + "loss": 0.7838, + "step": 36292 + }, + { + "epoch": 0.9610244238277958, + "grad_norm": 0.80859375, + "learning_rate": 8.328893049738255e-05, + "loss": 0.7918, + "step": 36293 + }, + { + "epoch": 0.9610509034360902, + "grad_norm": 0.75390625, + "learning_rate": 8.328439151155126e-05, + "loss": 0.6833, + "step": 36294 + }, + { + "epoch": 0.9610773830443846, + "grad_norm": 0.80078125, + "learning_rate": 8.327985256114772e-05, + "loss": 0.6994, + "step": 36295 + }, + { + "epoch": 0.961103862652679, + "grad_norm": 0.81640625, + "learning_rate": 8.327531364618161e-05, + "loss": 0.8997, + "step": 36296 + }, + { + "epoch": 0.9611303422609734, + "grad_norm": 0.7109375, + "learning_rate": 8.327077476666252e-05, + "loss": 0.8317, + "step": 36297 + }, + { + "epoch": 0.9611568218692678, + "grad_norm": 0.73828125, + "learning_rate": 8.326623592260006e-05, + "loss": 0.7413, + "step": 36298 + }, + { + "epoch": 0.9611833014775621, + "grad_norm": 0.81640625, + "learning_rate": 8.326169711400385e-05, + "loss": 0.7669, + "step": 36299 + }, + { + "epoch": 0.9612097810858565, + "grad_norm": 0.88671875, + "learning_rate": 8.325715834088352e-05, + "loss": 0.7881, + "step": 36300 + }, + { + "epoch": 0.9612362606941509, + "grad_norm": 0.765625, + "learning_rate": 8.325261960324868e-05, + "loss": 0.7974, + "step": 36301 + }, + { + "epoch": 0.9612627403024453, + "grad_norm": 0.7890625, + "learning_rate": 8.324808090110897e-05, + "loss": 0.7262, + "step": 36302 + }, + { + "epoch": 0.9612892199107397, + "grad_norm": 0.74609375, + "learning_rate": 8.3243542234474e-05, + "loss": 0.7745, + "step": 36303 + }, + { + "epoch": 0.9613156995190341, + "grad_norm": 0.7578125, + "learning_rate": 8.323900360335335e-05, + "loss": 0.7209, + "step": 36304 + }, + { + "epoch": 0.9613421791273284, + "grad_norm": 0.7109375, + "learning_rate": 8.323446500775668e-05, + "loss": 0.7166, + "step": 36305 + }, + { + "epoch": 0.9613686587356228, + "grad_norm": 0.828125, + "learning_rate": 8.322992644769362e-05, + "loss": 0.7897, + "step": 36306 + }, + { + "epoch": 0.9613951383439171, + "grad_norm": 0.73828125, + "learning_rate": 8.322538792317378e-05, + "loss": 0.7003, + "step": 36307 + }, + { + "epoch": 0.9614216179522115, + "grad_norm": 0.78515625, + "learning_rate": 8.322084943420678e-05, + "loss": 0.9194, + "step": 36308 + }, + { + "epoch": 0.9614480975605059, + "grad_norm": 0.7578125, + "learning_rate": 8.321631098080221e-05, + "loss": 0.708, + "step": 36309 + }, + { + "epoch": 0.9614745771688002, + "grad_norm": 0.78125, + "learning_rate": 8.321177256296968e-05, + "loss": 0.743, + "step": 36310 + }, + { + "epoch": 0.9615010567770946, + "grad_norm": 0.80078125, + "learning_rate": 8.320723418071888e-05, + "loss": 0.8079, + "step": 36311 + }, + { + "epoch": 0.961527536385389, + "grad_norm": 0.80078125, + "learning_rate": 8.320269583405938e-05, + "loss": 0.7195, + "step": 36312 + }, + { + "epoch": 0.9615540159936834, + "grad_norm": 0.78515625, + "learning_rate": 8.319815752300081e-05, + "loss": 0.8221, + "step": 36313 + }, + { + "epoch": 0.9615804956019778, + "grad_norm": 0.7890625, + "learning_rate": 8.319361924755277e-05, + "loss": 0.8067, + "step": 36314 + }, + { + "epoch": 0.9616069752102722, + "grad_norm": 0.80078125, + "learning_rate": 8.318908100772488e-05, + "loss": 0.7652, + "step": 36315 + }, + { + "epoch": 0.9616334548185665, + "grad_norm": 0.7265625, + "learning_rate": 8.31845428035268e-05, + "loss": 0.8488, + "step": 36316 + }, + { + "epoch": 0.9616599344268609, + "grad_norm": 0.75390625, + "learning_rate": 8.318000463496812e-05, + "loss": 0.7855, + "step": 36317 + }, + { + "epoch": 0.9616864140351553, + "grad_norm": 0.8515625, + "learning_rate": 8.317546650205846e-05, + "loss": 0.7801, + "step": 36318 + }, + { + "epoch": 0.9617128936434497, + "grad_norm": 0.83984375, + "learning_rate": 8.317092840480744e-05, + "loss": 0.8452, + "step": 36319 + }, + { + "epoch": 0.9617393732517441, + "grad_norm": 0.79296875, + "learning_rate": 8.316639034322464e-05, + "loss": 0.7654, + "step": 36320 + }, + { + "epoch": 0.9617658528600385, + "grad_norm": 0.75390625, + "learning_rate": 8.316185231731976e-05, + "loss": 0.8033, + "step": 36321 + }, + { + "epoch": 0.9617923324683328, + "grad_norm": 0.73828125, + "learning_rate": 8.315731432710235e-05, + "loss": 0.7621, + "step": 36322 + }, + { + "epoch": 0.9618188120766271, + "grad_norm": 0.69921875, + "learning_rate": 8.315277637258205e-05, + "loss": 0.7536, + "step": 36323 + }, + { + "epoch": 0.9618452916849215, + "grad_norm": 0.8515625, + "learning_rate": 8.314823845376847e-05, + "loss": 0.788, + "step": 36324 + }, + { + "epoch": 0.9618717712932159, + "grad_norm": 0.859375, + "learning_rate": 8.314370057067123e-05, + "loss": 0.9108, + "step": 36325 + }, + { + "epoch": 0.9618982509015103, + "grad_norm": 0.6875, + "learning_rate": 8.313916272329998e-05, + "loss": 0.6818, + "step": 36326 + }, + { + "epoch": 0.9619247305098046, + "grad_norm": 0.7890625, + "learning_rate": 8.31346249116643e-05, + "loss": 0.8229, + "step": 36327 + }, + { + "epoch": 0.961951210118099, + "grad_norm": 0.796875, + "learning_rate": 8.313008713577381e-05, + "loss": 0.7554, + "step": 36328 + }, + { + "epoch": 0.9619776897263934, + "grad_norm": 1.078125, + "learning_rate": 8.312554939563816e-05, + "loss": 0.7785, + "step": 36329 + }, + { + "epoch": 0.9620041693346878, + "grad_norm": 0.78515625, + "learning_rate": 8.31210116912669e-05, + "loss": 0.826, + "step": 36330 + }, + { + "epoch": 0.9620306489429822, + "grad_norm": 0.90234375, + "learning_rate": 8.31164740226697e-05, + "loss": 0.7719, + "step": 36331 + }, + { + "epoch": 0.9620571285512766, + "grad_norm": 0.75, + "learning_rate": 8.311193638985621e-05, + "loss": 0.7729, + "step": 36332 + }, + { + "epoch": 0.962083608159571, + "grad_norm": 0.78515625, + "learning_rate": 8.310739879283598e-05, + "loss": 0.7323, + "step": 36333 + }, + { + "epoch": 0.9621100877678653, + "grad_norm": 0.85546875, + "learning_rate": 8.310286123161866e-05, + "loss": 0.8459, + "step": 36334 + }, + { + "epoch": 0.9621365673761597, + "grad_norm": 0.75390625, + "learning_rate": 8.309832370621382e-05, + "loss": 0.7627, + "step": 36335 + }, + { + "epoch": 0.9621630469844541, + "grad_norm": 0.7734375, + "learning_rate": 8.309378621663116e-05, + "loss": 0.8284, + "step": 36336 + }, + { + "epoch": 0.9621895265927485, + "grad_norm": 0.84765625, + "learning_rate": 8.308924876288024e-05, + "loss": 0.6882, + "step": 36337 + }, + { + "epoch": 0.9622160062010429, + "grad_norm": 0.80859375, + "learning_rate": 8.30847113449707e-05, + "loss": 0.8387, + "step": 36338 + }, + { + "epoch": 0.9622424858093372, + "grad_norm": 0.9375, + "learning_rate": 8.308017396291216e-05, + "loss": 0.9008, + "step": 36339 + }, + { + "epoch": 0.9622689654176315, + "grad_norm": 0.7578125, + "learning_rate": 8.307563661671418e-05, + "loss": 0.777, + "step": 36340 + }, + { + "epoch": 0.9622954450259259, + "grad_norm": 0.75390625, + "learning_rate": 8.307109930638643e-05, + "loss": 0.7778, + "step": 36341 + }, + { + "epoch": 0.9623219246342203, + "grad_norm": 0.734375, + "learning_rate": 8.306656203193855e-05, + "loss": 0.7277, + "step": 36342 + }, + { + "epoch": 0.9623484042425147, + "grad_norm": 0.734375, + "learning_rate": 8.306202479338012e-05, + "loss": 0.7982, + "step": 36343 + }, + { + "epoch": 0.962374883850809, + "grad_norm": 0.796875, + "learning_rate": 8.305748759072074e-05, + "loss": 0.7501, + "step": 36344 + }, + { + "epoch": 0.9624013634591034, + "grad_norm": 0.796875, + "learning_rate": 8.305295042397004e-05, + "loss": 0.871, + "step": 36345 + }, + { + "epoch": 0.9624278430673978, + "grad_norm": 0.77734375, + "learning_rate": 8.304841329313768e-05, + "loss": 0.682, + "step": 36346 + }, + { + "epoch": 0.9624543226756922, + "grad_norm": 0.7578125, + "learning_rate": 8.304387619823322e-05, + "loss": 0.6906, + "step": 36347 + }, + { + "epoch": 0.9624808022839866, + "grad_norm": 0.828125, + "learning_rate": 8.303933913926629e-05, + "loss": 0.8272, + "step": 36348 + }, + { + "epoch": 0.962507281892281, + "grad_norm": 0.76953125, + "learning_rate": 8.303480211624653e-05, + "loss": 0.7639, + "step": 36349 + }, + { + "epoch": 0.9625337615005753, + "grad_norm": 0.74609375, + "learning_rate": 8.303026512918348e-05, + "loss": 0.6638, + "step": 36350 + }, + { + "epoch": 0.9625602411088697, + "grad_norm": 0.6640625, + "learning_rate": 8.302572817808686e-05, + "loss": 0.6484, + "step": 36351 + }, + { + "epoch": 0.9625867207171641, + "grad_norm": 0.83203125, + "learning_rate": 8.302119126296623e-05, + "loss": 0.8172, + "step": 36352 + }, + { + "epoch": 0.9626132003254585, + "grad_norm": 0.80859375, + "learning_rate": 8.301665438383124e-05, + "loss": 0.783, + "step": 36353 + }, + { + "epoch": 0.9626396799337529, + "grad_norm": 0.78515625, + "learning_rate": 8.301211754069146e-05, + "loss": 0.9315, + "step": 36354 + }, + { + "epoch": 0.9626661595420473, + "grad_norm": 0.7578125, + "learning_rate": 8.300758073355647e-05, + "loss": 0.7991, + "step": 36355 + }, + { + "epoch": 0.9626926391503415, + "grad_norm": 0.7421875, + "learning_rate": 8.3003043962436e-05, + "loss": 0.8024, + "step": 36356 + }, + { + "epoch": 0.9627191187586359, + "grad_norm": 0.71484375, + "learning_rate": 8.299850722733963e-05, + "loss": 0.7618, + "step": 36357 + }, + { + "epoch": 0.9627455983669303, + "grad_norm": 0.796875, + "learning_rate": 8.299397052827692e-05, + "loss": 0.7872, + "step": 36358 + }, + { + "epoch": 0.9627720779752247, + "grad_norm": 0.8203125, + "learning_rate": 8.298943386525752e-05, + "loss": 0.7232, + "step": 36359 + }, + { + "epoch": 0.9627985575835191, + "grad_norm": 0.734375, + "learning_rate": 8.298489723829102e-05, + "loss": 0.7957, + "step": 36360 + }, + { + "epoch": 0.9628250371918134, + "grad_norm": 0.86328125, + "learning_rate": 8.298036064738708e-05, + "loss": 0.8265, + "step": 36361 + }, + { + "epoch": 0.9628515168001078, + "grad_norm": 0.73828125, + "learning_rate": 8.297582409255532e-05, + "loss": 0.8212, + "step": 36362 + }, + { + "epoch": 0.9628779964084022, + "grad_norm": 0.79296875, + "learning_rate": 8.29712875738053e-05, + "loss": 0.922, + "step": 36363 + }, + { + "epoch": 0.9629044760166966, + "grad_norm": 0.87890625, + "learning_rate": 8.296675109114667e-05, + "loss": 0.6936, + "step": 36364 + }, + { + "epoch": 0.962930955624991, + "grad_norm": 0.78515625, + "learning_rate": 8.296221464458902e-05, + "loss": 0.7148, + "step": 36365 + }, + { + "epoch": 0.9629574352332854, + "grad_norm": 0.81640625, + "learning_rate": 8.2957678234142e-05, + "loss": 0.7404, + "step": 36366 + }, + { + "epoch": 0.9629839148415797, + "grad_norm": 0.91015625, + "learning_rate": 8.295314185981521e-05, + "loss": 0.7318, + "step": 36367 + }, + { + "epoch": 0.9630103944498741, + "grad_norm": 0.8203125, + "learning_rate": 8.294860552161826e-05, + "loss": 0.773, + "step": 36368 + }, + { + "epoch": 0.9630368740581685, + "grad_norm": 0.765625, + "learning_rate": 8.294406921956077e-05, + "loss": 0.7079, + "step": 36369 + }, + { + "epoch": 0.9630633536664629, + "grad_norm": 0.7421875, + "learning_rate": 8.293953295365229e-05, + "loss": 0.7407, + "step": 36370 + }, + { + "epoch": 0.9630898332747573, + "grad_norm": 0.74609375, + "learning_rate": 8.293499672390255e-05, + "loss": 0.7221, + "step": 36371 + }, + { + "epoch": 0.9631163128830517, + "grad_norm": 0.83984375, + "learning_rate": 8.293046053032112e-05, + "loss": 0.7026, + "step": 36372 + }, + { + "epoch": 0.9631427924913459, + "grad_norm": 0.88671875, + "learning_rate": 8.29259243729176e-05, + "loss": 0.7911, + "step": 36373 + }, + { + "epoch": 0.9631692720996403, + "grad_norm": 0.8359375, + "learning_rate": 8.29213882517016e-05, + "loss": 0.7668, + "step": 36374 + }, + { + "epoch": 0.9631957517079347, + "grad_norm": 0.79296875, + "learning_rate": 8.29168521666827e-05, + "loss": 0.836, + "step": 36375 + }, + { + "epoch": 0.9632222313162291, + "grad_norm": 0.7890625, + "learning_rate": 8.291231611787059e-05, + "loss": 0.8296, + "step": 36376 + }, + { + "epoch": 0.9632487109245235, + "grad_norm": 0.8203125, + "learning_rate": 8.290778010527486e-05, + "loss": 0.7653, + "step": 36377 + }, + { + "epoch": 0.9632751905328178, + "grad_norm": 0.82421875, + "learning_rate": 8.290324412890511e-05, + "loss": 0.8094, + "step": 36378 + }, + { + "epoch": 0.9633016701411122, + "grad_norm": 0.83984375, + "learning_rate": 8.289870818877096e-05, + "loss": 0.8988, + "step": 36379 + }, + { + "epoch": 0.9633281497494066, + "grad_norm": 0.78515625, + "learning_rate": 8.289417228488197e-05, + "loss": 0.7571, + "step": 36380 + }, + { + "epoch": 0.963354629357701, + "grad_norm": 0.83984375, + "learning_rate": 8.288963641724785e-05, + "loss": 0.8646, + "step": 36381 + }, + { + "epoch": 0.9633811089659954, + "grad_norm": 0.78125, + "learning_rate": 8.288510058587817e-05, + "loss": 0.7071, + "step": 36382 + }, + { + "epoch": 0.9634075885742898, + "grad_norm": 0.75, + "learning_rate": 8.288056479078255e-05, + "loss": 0.7499, + "step": 36383 + }, + { + "epoch": 0.9634340681825841, + "grad_norm": 0.765625, + "learning_rate": 8.287602903197058e-05, + "loss": 0.7414, + "step": 36384 + }, + { + "epoch": 0.9634605477908785, + "grad_norm": 0.80859375, + "learning_rate": 8.287149330945187e-05, + "loss": 0.7582, + "step": 36385 + }, + { + "epoch": 0.9634870273991729, + "grad_norm": 0.83984375, + "learning_rate": 8.286695762323607e-05, + "loss": 0.8904, + "step": 36386 + }, + { + "epoch": 0.9635135070074673, + "grad_norm": 0.78515625, + "learning_rate": 8.286242197333279e-05, + "loss": 0.7184, + "step": 36387 + }, + { + "epoch": 0.9635399866157617, + "grad_norm": 0.8671875, + "learning_rate": 8.28578863597516e-05, + "loss": 0.7333, + "step": 36388 + }, + { + "epoch": 0.963566466224056, + "grad_norm": 0.8125, + "learning_rate": 8.285335078250215e-05, + "loss": 0.8821, + "step": 36389 + }, + { + "epoch": 0.9635929458323503, + "grad_norm": 0.70703125, + "learning_rate": 8.284881524159402e-05, + "loss": 0.6864, + "step": 36390 + }, + { + "epoch": 0.9636194254406447, + "grad_norm": 0.83984375, + "learning_rate": 8.284427973703686e-05, + "loss": 0.7939, + "step": 36391 + }, + { + "epoch": 0.9636459050489391, + "grad_norm": 0.81640625, + "learning_rate": 8.28397442688403e-05, + "loss": 0.7158, + "step": 36392 + }, + { + "epoch": 0.9636723846572335, + "grad_norm": 0.8359375, + "learning_rate": 8.283520883701389e-05, + "loss": 0.9584, + "step": 36393 + }, + { + "epoch": 0.9636988642655279, + "grad_norm": 0.80859375, + "learning_rate": 8.283067344156729e-05, + "loss": 0.8499, + "step": 36394 + }, + { + "epoch": 0.9637253438738222, + "grad_norm": 0.83984375, + "learning_rate": 8.282613808251005e-05, + "loss": 0.7062, + "step": 36395 + }, + { + "epoch": 0.9637518234821166, + "grad_norm": 0.8125, + "learning_rate": 8.282160275985186e-05, + "loss": 0.7077, + "step": 36396 + }, + { + "epoch": 0.963778303090411, + "grad_norm": 0.796875, + "learning_rate": 8.281706747360231e-05, + "loss": 0.8904, + "step": 36397 + }, + { + "epoch": 0.9638047826987054, + "grad_norm": 0.82421875, + "learning_rate": 8.281253222377103e-05, + "loss": 0.8667, + "step": 36398 + }, + { + "epoch": 0.9638312623069998, + "grad_norm": 0.76171875, + "learning_rate": 8.280799701036757e-05, + "loss": 0.8107, + "step": 36399 + }, + { + "epoch": 0.9638577419152942, + "grad_norm": 0.7890625, + "learning_rate": 8.280346183340156e-05, + "loss": 0.7844, + "step": 36400 + }, + { + "epoch": 0.9638842215235885, + "grad_norm": 0.7421875, + "learning_rate": 8.279892669288266e-05, + "loss": 0.848, + "step": 36401 + }, + { + "epoch": 0.9639107011318829, + "grad_norm": 0.78515625, + "learning_rate": 8.279439158882043e-05, + "loss": 0.7481, + "step": 36402 + }, + { + "epoch": 0.9639371807401773, + "grad_norm": 0.78125, + "learning_rate": 8.278985652122453e-05, + "loss": 0.7611, + "step": 36403 + }, + { + "epoch": 0.9639636603484717, + "grad_norm": 0.796875, + "learning_rate": 8.278532149010454e-05, + "loss": 0.8931, + "step": 36404 + }, + { + "epoch": 0.963990139956766, + "grad_norm": 0.83984375, + "learning_rate": 8.278078649547006e-05, + "loss": 0.7655, + "step": 36405 + }, + { + "epoch": 0.9640166195650604, + "grad_norm": 0.8203125, + "learning_rate": 8.277625153733073e-05, + "loss": 0.8177, + "step": 36406 + }, + { + "epoch": 0.9640430991733547, + "grad_norm": 0.77734375, + "learning_rate": 8.277171661569616e-05, + "loss": 0.7126, + "step": 36407 + }, + { + "epoch": 0.9640695787816491, + "grad_norm": 0.8359375, + "learning_rate": 8.276718173057593e-05, + "loss": 0.753, + "step": 36408 + }, + { + "epoch": 0.9640960583899435, + "grad_norm": 0.71484375, + "learning_rate": 8.276264688197968e-05, + "loss": 0.744, + "step": 36409 + }, + { + "epoch": 0.9641225379982379, + "grad_norm": 0.79296875, + "learning_rate": 8.275811206991701e-05, + "loss": 0.7847, + "step": 36410 + }, + { + "epoch": 0.9641490176065323, + "grad_norm": 0.76953125, + "learning_rate": 8.275357729439756e-05, + "loss": 0.6929, + "step": 36411 + }, + { + "epoch": 0.9641754972148266, + "grad_norm": 0.80859375, + "learning_rate": 8.27490425554309e-05, + "loss": 0.688, + "step": 36412 + }, + { + "epoch": 0.964201976823121, + "grad_norm": 0.671875, + "learning_rate": 8.274450785302667e-05, + "loss": 0.6489, + "step": 36413 + }, + { + "epoch": 0.9642284564314154, + "grad_norm": 0.765625, + "learning_rate": 8.273997318719447e-05, + "loss": 0.6705, + "step": 36414 + }, + { + "epoch": 0.9642549360397098, + "grad_norm": 0.80859375, + "learning_rate": 8.273543855794391e-05, + "loss": 0.828, + "step": 36415 + }, + { + "epoch": 0.9642814156480042, + "grad_norm": 0.84375, + "learning_rate": 8.273090396528456e-05, + "loss": 0.9863, + "step": 36416 + }, + { + "epoch": 0.9643078952562986, + "grad_norm": 0.83984375, + "learning_rate": 8.27263694092261e-05, + "loss": 0.8783, + "step": 36417 + }, + { + "epoch": 0.964334374864593, + "grad_norm": 1.015625, + "learning_rate": 8.272183488977813e-05, + "loss": 0.7684, + "step": 36418 + }, + { + "epoch": 0.9643608544728873, + "grad_norm": 0.77734375, + "learning_rate": 8.271730040695023e-05, + "loss": 0.8293, + "step": 36419 + }, + { + "epoch": 0.9643873340811817, + "grad_norm": 0.77734375, + "learning_rate": 8.271276596075204e-05, + "loss": 0.8381, + "step": 36420 + }, + { + "epoch": 0.9644138136894761, + "grad_norm": 0.765625, + "learning_rate": 8.270823155119309e-05, + "loss": 0.7897, + "step": 36421 + }, + { + "epoch": 0.9644402932977704, + "grad_norm": 0.796875, + "learning_rate": 8.270369717828312e-05, + "loss": 0.8848, + "step": 36422 + }, + { + "epoch": 0.9644667729060648, + "grad_norm": 0.73046875, + "learning_rate": 8.269916284203166e-05, + "loss": 0.7098, + "step": 36423 + }, + { + "epoch": 0.9644932525143591, + "grad_norm": 0.71875, + "learning_rate": 8.269462854244833e-05, + "loss": 0.7173, + "step": 36424 + }, + { + "epoch": 0.9645197321226535, + "grad_norm": 0.7734375, + "learning_rate": 8.269009427954277e-05, + "loss": 0.7546, + "step": 36425 + }, + { + "epoch": 0.9645462117309479, + "grad_norm": 0.8359375, + "learning_rate": 8.268556005332453e-05, + "loss": 0.7504, + "step": 36426 + }, + { + "epoch": 0.9645726913392423, + "grad_norm": 0.69921875, + "learning_rate": 8.268102586380325e-05, + "loss": 0.7552, + "step": 36427 + }, + { + "epoch": 0.9645991709475367, + "grad_norm": 0.76171875, + "learning_rate": 8.267649171098858e-05, + "loss": 0.7529, + "step": 36428 + }, + { + "epoch": 0.964625650555831, + "grad_norm": 0.79296875, + "learning_rate": 8.267195759489008e-05, + "loss": 0.7549, + "step": 36429 + }, + { + "epoch": 0.9646521301641254, + "grad_norm": 0.75, + "learning_rate": 8.26674235155174e-05, + "loss": 0.7234, + "step": 36430 + }, + { + "epoch": 0.9646786097724198, + "grad_norm": 0.72265625, + "learning_rate": 8.266288947288008e-05, + "loss": 0.6127, + "step": 36431 + }, + { + "epoch": 0.9647050893807142, + "grad_norm": 0.734375, + "learning_rate": 8.26583554669878e-05, + "loss": 0.7668, + "step": 36432 + }, + { + "epoch": 0.9647315689890086, + "grad_norm": 0.81640625, + "learning_rate": 8.265382149785016e-05, + "loss": 0.8523, + "step": 36433 + }, + { + "epoch": 0.964758048597303, + "grad_norm": 0.71484375, + "learning_rate": 8.264928756547674e-05, + "loss": 0.6607, + "step": 36434 + }, + { + "epoch": 0.9647845282055973, + "grad_norm": 0.73828125, + "learning_rate": 8.264475366987716e-05, + "loss": 0.7668, + "step": 36435 + }, + { + "epoch": 0.9648110078138917, + "grad_norm": 0.7734375, + "learning_rate": 8.2640219811061e-05, + "loss": 0.8703, + "step": 36436 + }, + { + "epoch": 0.9648374874221861, + "grad_norm": 0.80859375, + "learning_rate": 8.263568598903795e-05, + "loss": 0.797, + "step": 36437 + }, + { + "epoch": 0.9648639670304804, + "grad_norm": 1.59375, + "learning_rate": 8.263115220381756e-05, + "loss": 0.755, + "step": 36438 + }, + { + "epoch": 0.9648904466387748, + "grad_norm": 0.8046875, + "learning_rate": 8.262661845540945e-05, + "loss": 0.764, + "step": 36439 + }, + { + "epoch": 0.9649169262470692, + "grad_norm": 0.7578125, + "learning_rate": 8.262208474382323e-05, + "loss": 0.8264, + "step": 36440 + }, + { + "epoch": 0.9649434058553635, + "grad_norm": 0.7734375, + "learning_rate": 8.261755106906847e-05, + "loss": 0.717, + "step": 36441 + }, + { + "epoch": 0.9649698854636579, + "grad_norm": 0.79296875, + "learning_rate": 8.261301743115484e-05, + "loss": 0.8903, + "step": 36442 + }, + { + "epoch": 0.9649963650719523, + "grad_norm": 0.80078125, + "learning_rate": 8.260848383009195e-05, + "loss": 0.8349, + "step": 36443 + }, + { + "epoch": 0.9650228446802467, + "grad_norm": 0.81640625, + "learning_rate": 8.260395026588938e-05, + "loss": 0.8391, + "step": 36444 + }, + { + "epoch": 0.9650493242885411, + "grad_norm": 0.86328125, + "learning_rate": 8.259941673855674e-05, + "loss": 0.7912, + "step": 36445 + }, + { + "epoch": 0.9650758038968354, + "grad_norm": 0.71484375, + "learning_rate": 8.259488324810359e-05, + "loss": 0.6654, + "step": 36446 + }, + { + "epoch": 0.9651022835051298, + "grad_norm": 0.9453125, + "learning_rate": 8.259034979453964e-05, + "loss": 0.7916, + "step": 36447 + }, + { + "epoch": 0.9651287631134242, + "grad_norm": 0.70703125, + "learning_rate": 8.258581637787446e-05, + "loss": 0.8259, + "step": 36448 + }, + { + "epoch": 0.9651552427217186, + "grad_norm": 0.90234375, + "learning_rate": 8.258128299811764e-05, + "loss": 0.9429, + "step": 36449 + }, + { + "epoch": 0.965181722330013, + "grad_norm": 0.7890625, + "learning_rate": 8.257674965527879e-05, + "loss": 0.7729, + "step": 36450 + }, + { + "epoch": 0.9652082019383074, + "grad_norm": 0.7578125, + "learning_rate": 8.25722163493675e-05, + "loss": 0.7102, + "step": 36451 + }, + { + "epoch": 0.9652346815466017, + "grad_norm": 0.82421875, + "learning_rate": 8.256768308039342e-05, + "loss": 0.729, + "step": 36452 + }, + { + "epoch": 0.9652611611548961, + "grad_norm": 0.76171875, + "learning_rate": 8.256314984836615e-05, + "loss": 0.7842, + "step": 36453 + }, + { + "epoch": 0.9652876407631904, + "grad_norm": 0.78125, + "learning_rate": 8.255861665329529e-05, + "loss": 0.7884, + "step": 36454 + }, + { + "epoch": 0.9653141203714848, + "grad_norm": 0.80859375, + "learning_rate": 8.255408349519044e-05, + "loss": 0.8011, + "step": 36455 + }, + { + "epoch": 0.9653405999797792, + "grad_norm": 0.7578125, + "learning_rate": 8.254955037406116e-05, + "loss": 0.8093, + "step": 36456 + }, + { + "epoch": 0.9653670795880736, + "grad_norm": 0.74609375, + "learning_rate": 8.254501728991715e-05, + "loss": 0.9383, + "step": 36457 + }, + { + "epoch": 0.9653935591963679, + "grad_norm": 0.78515625, + "learning_rate": 8.2540484242768e-05, + "loss": 0.7538, + "step": 36458 + }, + { + "epoch": 0.9654200388046623, + "grad_norm": 0.7734375, + "learning_rate": 8.253595123262329e-05, + "loss": 0.7739, + "step": 36459 + }, + { + "epoch": 0.9654465184129567, + "grad_norm": 0.7578125, + "learning_rate": 8.253141825949262e-05, + "loss": 0.7653, + "step": 36460 + }, + { + "epoch": 0.9654729980212511, + "grad_norm": 0.70703125, + "learning_rate": 8.252688532338558e-05, + "loss": 0.7385, + "step": 36461 + }, + { + "epoch": 0.9654994776295455, + "grad_norm": 0.79296875, + "learning_rate": 8.252235242431185e-05, + "loss": 0.8061, + "step": 36462 + }, + { + "epoch": 0.9655259572378398, + "grad_norm": 0.7734375, + "learning_rate": 8.251781956228099e-05, + "loss": 0.7575, + "step": 36463 + }, + { + "epoch": 0.9655524368461342, + "grad_norm": 0.76171875, + "learning_rate": 8.251328673730262e-05, + "loss": 0.7981, + "step": 36464 + }, + { + "epoch": 0.9655789164544286, + "grad_norm": 0.75390625, + "learning_rate": 8.250875394938632e-05, + "loss": 0.8717, + "step": 36465 + }, + { + "epoch": 0.965605396062723, + "grad_norm": 0.75, + "learning_rate": 8.25042211985417e-05, + "loss": 0.7718, + "step": 36466 + }, + { + "epoch": 0.9656318756710174, + "grad_norm": 0.72265625, + "learning_rate": 8.249968848477841e-05, + "loss": 0.6699, + "step": 36467 + }, + { + "epoch": 0.9656583552793118, + "grad_norm": 0.74609375, + "learning_rate": 8.249515580810603e-05, + "loss": 0.8308, + "step": 36468 + }, + { + "epoch": 0.9656848348876061, + "grad_norm": 0.75390625, + "learning_rate": 8.249062316853418e-05, + "loss": 0.7069, + "step": 36469 + }, + { + "epoch": 0.9657113144959005, + "grad_norm": 0.68359375, + "learning_rate": 8.248609056607243e-05, + "loss": 0.779, + "step": 36470 + }, + { + "epoch": 0.9657377941041948, + "grad_norm": 0.80078125, + "learning_rate": 8.24815580007304e-05, + "loss": 0.8123, + "step": 36471 + }, + { + "epoch": 0.9657642737124892, + "grad_norm": 0.84765625, + "learning_rate": 8.247702547251773e-05, + "loss": 0.7997, + "step": 36472 + }, + { + "epoch": 0.9657907533207836, + "grad_norm": 0.78125, + "learning_rate": 8.247249298144402e-05, + "loss": 0.6871, + "step": 36473 + }, + { + "epoch": 0.965817232929078, + "grad_norm": 0.84375, + "learning_rate": 8.246796052751883e-05, + "loss": 0.84, + "step": 36474 + }, + { + "epoch": 0.9658437125373723, + "grad_norm": 0.7265625, + "learning_rate": 8.246342811075178e-05, + "loss": 0.7352, + "step": 36475 + }, + { + "epoch": 0.9658701921456667, + "grad_norm": 0.83984375, + "learning_rate": 8.24588957311525e-05, + "loss": 0.8702, + "step": 36476 + }, + { + "epoch": 0.9658966717539611, + "grad_norm": 0.83984375, + "learning_rate": 8.245436338873061e-05, + "loss": 0.8025, + "step": 36477 + }, + { + "epoch": 0.9659231513622555, + "grad_norm": 0.83984375, + "learning_rate": 8.244983108349569e-05, + "loss": 0.7442, + "step": 36478 + }, + { + "epoch": 0.9659496309705499, + "grad_norm": 0.75390625, + "learning_rate": 8.244529881545735e-05, + "loss": 0.8013, + "step": 36479 + }, + { + "epoch": 0.9659761105788442, + "grad_norm": 0.7421875, + "learning_rate": 8.244076658462521e-05, + "loss": 0.675, + "step": 36480 + }, + { + "epoch": 0.9660025901871386, + "grad_norm": 0.78515625, + "learning_rate": 8.24362343910088e-05, + "loss": 0.7834, + "step": 36481 + }, + { + "epoch": 0.966029069795433, + "grad_norm": 0.7890625, + "learning_rate": 8.243170223461782e-05, + "loss": 0.8797, + "step": 36482 + }, + { + "epoch": 0.9660555494037274, + "grad_norm": 0.73046875, + "learning_rate": 8.242717011546187e-05, + "loss": 0.7363, + "step": 36483 + }, + { + "epoch": 0.9660820290120218, + "grad_norm": 0.77734375, + "learning_rate": 8.242263803355052e-05, + "loss": 0.7521, + "step": 36484 + }, + { + "epoch": 0.9661085086203162, + "grad_norm": 0.8125, + "learning_rate": 8.241810598889338e-05, + "loss": 0.8308, + "step": 36485 + }, + { + "epoch": 0.9661349882286105, + "grad_norm": 0.75, + "learning_rate": 8.241357398150001e-05, + "loss": 0.7418, + "step": 36486 + }, + { + "epoch": 0.9661614678369048, + "grad_norm": 0.76171875, + "learning_rate": 8.240904201138012e-05, + "loss": 0.6975, + "step": 36487 + }, + { + "epoch": 0.9661879474451992, + "grad_norm": 0.80859375, + "learning_rate": 8.240451007854327e-05, + "loss": 0.9387, + "step": 36488 + }, + { + "epoch": 0.9662144270534936, + "grad_norm": 0.85546875, + "learning_rate": 8.239997818299903e-05, + "loss": 0.9544, + "step": 36489 + }, + { + "epoch": 0.966240906661788, + "grad_norm": 0.8359375, + "learning_rate": 8.239544632475705e-05, + "loss": 0.7631, + "step": 36490 + }, + { + "epoch": 0.9662673862700824, + "grad_norm": 0.796875, + "learning_rate": 8.239091450382688e-05, + "loss": 0.8072, + "step": 36491 + }, + { + "epoch": 0.9662938658783767, + "grad_norm": 0.7265625, + "learning_rate": 8.238638272021818e-05, + "loss": 0.762, + "step": 36492 + }, + { + "epoch": 0.9663203454866711, + "grad_norm": 0.80078125, + "learning_rate": 8.238185097394055e-05, + "loss": 0.7384, + "step": 36493 + }, + { + "epoch": 0.9663468250949655, + "grad_norm": 0.7734375, + "learning_rate": 8.237731926500354e-05, + "loss": 0.7915, + "step": 36494 + }, + { + "epoch": 0.9663733047032599, + "grad_norm": 0.875, + "learning_rate": 8.237278759341683e-05, + "loss": 0.9618, + "step": 36495 + }, + { + "epoch": 0.9663997843115543, + "grad_norm": 0.7421875, + "learning_rate": 8.236825595918995e-05, + "loss": 0.8799, + "step": 36496 + }, + { + "epoch": 0.9664262639198486, + "grad_norm": 0.76171875, + "learning_rate": 8.236372436233258e-05, + "loss": 0.8054, + "step": 36497 + }, + { + "epoch": 0.966452743528143, + "grad_norm": 0.8359375, + "learning_rate": 8.235919280285429e-05, + "loss": 0.7635, + "step": 36498 + }, + { + "epoch": 0.9664792231364374, + "grad_norm": 0.78515625, + "learning_rate": 8.235466128076468e-05, + "loss": 0.7983, + "step": 36499 + }, + { + "epoch": 0.9665057027447318, + "grad_norm": 0.80078125, + "learning_rate": 8.235012979607337e-05, + "loss": 0.8247, + "step": 36500 + }, + { + "epoch": 0.9665321823530262, + "grad_norm": 0.73046875, + "learning_rate": 8.234559834878989e-05, + "loss": 0.7343, + "step": 36501 + }, + { + "epoch": 0.9665586619613206, + "grad_norm": 0.91796875, + "learning_rate": 8.234106693892394e-05, + "loss": 0.8949, + "step": 36502 + }, + { + "epoch": 0.9665851415696148, + "grad_norm": 0.75390625, + "learning_rate": 8.23365355664851e-05, + "loss": 0.9719, + "step": 36503 + }, + { + "epoch": 0.9666116211779092, + "grad_norm": 0.69140625, + "learning_rate": 8.233200423148297e-05, + "loss": 0.7153, + "step": 36504 + }, + { + "epoch": 0.9666381007862036, + "grad_norm": 0.79296875, + "learning_rate": 8.232747293392715e-05, + "loss": 0.7686, + "step": 36505 + }, + { + "epoch": 0.966664580394498, + "grad_norm": 0.7890625, + "learning_rate": 8.232294167382719e-05, + "loss": 0.9335, + "step": 36506 + }, + { + "epoch": 0.9666910600027924, + "grad_norm": 0.75390625, + "learning_rate": 8.231841045119279e-05, + "loss": 0.8762, + "step": 36507 + }, + { + "epoch": 0.9667175396110868, + "grad_norm": 0.765625, + "learning_rate": 8.23138792660335e-05, + "loss": 0.8111, + "step": 36508 + }, + { + "epoch": 0.9667440192193811, + "grad_norm": 0.79296875, + "learning_rate": 8.230934811835894e-05, + "loss": 0.8005, + "step": 36509 + }, + { + "epoch": 0.9667704988276755, + "grad_norm": 0.875, + "learning_rate": 8.23048170081787e-05, + "loss": 0.8281, + "step": 36510 + }, + { + "epoch": 0.9667969784359699, + "grad_norm": 0.76953125, + "learning_rate": 8.230028593550238e-05, + "loss": 0.8061, + "step": 36511 + }, + { + "epoch": 0.9668234580442643, + "grad_norm": 0.734375, + "learning_rate": 8.229575490033958e-05, + "loss": 0.7237, + "step": 36512 + }, + { + "epoch": 0.9668499376525587, + "grad_norm": 0.79296875, + "learning_rate": 8.229122390269994e-05, + "loss": 0.8147, + "step": 36513 + }, + { + "epoch": 0.966876417260853, + "grad_norm": 0.8203125, + "learning_rate": 8.228669294259304e-05, + "loss": 0.7801, + "step": 36514 + }, + { + "epoch": 0.9669028968691474, + "grad_norm": 0.77734375, + "learning_rate": 8.228216202002848e-05, + "loss": 0.74, + "step": 36515 + }, + { + "epoch": 0.9669293764774418, + "grad_norm": 0.796875, + "learning_rate": 8.227763113501584e-05, + "loss": 0.7837, + "step": 36516 + }, + { + "epoch": 0.9669558560857362, + "grad_norm": 0.7265625, + "learning_rate": 8.227310028756478e-05, + "loss": 0.8155, + "step": 36517 + }, + { + "epoch": 0.9669823356940306, + "grad_norm": 0.7734375, + "learning_rate": 8.226856947768486e-05, + "loss": 0.7774, + "step": 36518 + }, + { + "epoch": 0.967008815302325, + "grad_norm": 0.7578125, + "learning_rate": 8.226403870538571e-05, + "loss": 0.8248, + "step": 36519 + }, + { + "epoch": 0.9670352949106192, + "grad_norm": 0.8203125, + "learning_rate": 8.22595079706769e-05, + "loss": 0.8479, + "step": 36520 + }, + { + "epoch": 0.9670617745189136, + "grad_norm": 0.73828125, + "learning_rate": 8.225497727356801e-05, + "loss": 0.7669, + "step": 36521 + }, + { + "epoch": 0.967088254127208, + "grad_norm": 0.79296875, + "learning_rate": 8.225044661406872e-05, + "loss": 0.7716, + "step": 36522 + }, + { + "epoch": 0.9671147337355024, + "grad_norm": 0.7578125, + "learning_rate": 8.22459159921886e-05, + "loss": 0.8008, + "step": 36523 + }, + { + "epoch": 0.9671412133437968, + "grad_norm": 0.86328125, + "learning_rate": 8.224138540793724e-05, + "loss": 0.7829, + "step": 36524 + }, + { + "epoch": 0.9671676929520912, + "grad_norm": 0.87890625, + "learning_rate": 8.223685486132427e-05, + "loss": 0.7891, + "step": 36525 + }, + { + "epoch": 0.9671941725603855, + "grad_norm": 0.7734375, + "learning_rate": 8.22323243523592e-05, + "loss": 0.7367, + "step": 36526 + }, + { + "epoch": 0.9672206521686799, + "grad_norm": 0.859375, + "learning_rate": 8.222779388105177e-05, + "loss": 0.9234, + "step": 36527 + }, + { + "epoch": 0.9672471317769743, + "grad_norm": 0.85546875, + "learning_rate": 8.22232634474115e-05, + "loss": 0.8409, + "step": 36528 + }, + { + "epoch": 0.9672736113852687, + "grad_norm": 0.76953125, + "learning_rate": 8.2218733051448e-05, + "loss": 0.7908, + "step": 36529 + }, + { + "epoch": 0.9673000909935631, + "grad_norm": 0.94140625, + "learning_rate": 8.221420269317088e-05, + "loss": 0.8587, + "step": 36530 + }, + { + "epoch": 0.9673265706018574, + "grad_norm": 1.0546875, + "learning_rate": 8.220967237258972e-05, + "loss": 0.7281, + "step": 36531 + }, + { + "epoch": 0.9673530502101518, + "grad_norm": 0.79296875, + "learning_rate": 8.220514208971417e-05, + "loss": 0.7966, + "step": 36532 + }, + { + "epoch": 0.9673795298184462, + "grad_norm": 0.765625, + "learning_rate": 8.220061184455381e-05, + "loss": 0.7659, + "step": 36533 + }, + { + "epoch": 0.9674060094267406, + "grad_norm": 0.77734375, + "learning_rate": 8.219608163711823e-05, + "loss": 0.7611, + "step": 36534 + }, + { + "epoch": 0.967432489035035, + "grad_norm": 0.71875, + "learning_rate": 8.219155146741703e-05, + "loss": 0.7793, + "step": 36535 + }, + { + "epoch": 0.9674589686433293, + "grad_norm": 0.88671875, + "learning_rate": 8.218702133545981e-05, + "loss": 0.819, + "step": 36536 + }, + { + "epoch": 0.9674854482516236, + "grad_norm": 0.73046875, + "learning_rate": 8.21824912412562e-05, + "loss": 0.7529, + "step": 36537 + }, + { + "epoch": 0.967511927859918, + "grad_norm": 0.83984375, + "learning_rate": 8.217796118481578e-05, + "loss": 0.8868, + "step": 36538 + }, + { + "epoch": 0.9675384074682124, + "grad_norm": 0.765625, + "learning_rate": 8.217343116614816e-05, + "loss": 0.8471, + "step": 36539 + }, + { + "epoch": 0.9675648870765068, + "grad_norm": 0.7421875, + "learning_rate": 8.216890118526292e-05, + "loss": 0.6661, + "step": 36540 + }, + { + "epoch": 0.9675913666848012, + "grad_norm": 0.78125, + "learning_rate": 8.216437124216964e-05, + "loss": 0.7233, + "step": 36541 + }, + { + "epoch": 0.9676178462930956, + "grad_norm": 0.76171875, + "learning_rate": 8.2159841336878e-05, + "loss": 0.7279, + "step": 36542 + }, + { + "epoch": 0.9676443259013899, + "grad_norm": 0.75390625, + "learning_rate": 8.215531146939754e-05, + "loss": 0.8053, + "step": 36543 + }, + { + "epoch": 0.9676708055096843, + "grad_norm": 0.890625, + "learning_rate": 8.21507816397379e-05, + "loss": 0.8029, + "step": 36544 + }, + { + "epoch": 0.9676972851179787, + "grad_norm": 0.7578125, + "learning_rate": 8.214625184790865e-05, + "loss": 0.7078, + "step": 36545 + }, + { + "epoch": 0.9677237647262731, + "grad_norm": 0.71875, + "learning_rate": 8.214172209391935e-05, + "loss": 0.7124, + "step": 36546 + }, + { + "epoch": 0.9677502443345675, + "grad_norm": 0.8125, + "learning_rate": 8.21371923777797e-05, + "loss": 0.8178, + "step": 36547 + }, + { + "epoch": 0.9677767239428618, + "grad_norm": 0.7890625, + "learning_rate": 8.213266269949925e-05, + "loss": 0.7284, + "step": 36548 + }, + { + "epoch": 0.9678032035511562, + "grad_norm": 0.78515625, + "learning_rate": 8.212813305908759e-05, + "loss": 0.844, + "step": 36549 + }, + { + "epoch": 0.9678296831594506, + "grad_norm": 0.81640625, + "learning_rate": 8.212360345655434e-05, + "loss": 0.7963, + "step": 36550 + }, + { + "epoch": 0.967856162767745, + "grad_norm": 0.80859375, + "learning_rate": 8.211907389190903e-05, + "loss": 0.7208, + "step": 36551 + }, + { + "epoch": 0.9678826423760393, + "grad_norm": 0.8203125, + "learning_rate": 8.21145443651614e-05, + "loss": 0.8659, + "step": 36552 + }, + { + "epoch": 0.9679091219843337, + "grad_norm": 0.890625, + "learning_rate": 8.211001487632094e-05, + "loss": 0.8791, + "step": 36553 + }, + { + "epoch": 0.967935601592628, + "grad_norm": 0.86328125, + "learning_rate": 8.21054854253973e-05, + "loss": 0.7267, + "step": 36554 + }, + { + "epoch": 0.9679620812009224, + "grad_norm": 0.734375, + "learning_rate": 8.210095601240004e-05, + "loss": 0.6642, + "step": 36555 + }, + { + "epoch": 0.9679885608092168, + "grad_norm": 0.80078125, + "learning_rate": 8.209642663733879e-05, + "loss": 0.8736, + "step": 36556 + }, + { + "epoch": 0.9680150404175112, + "grad_norm": 0.76171875, + "learning_rate": 8.209189730022312e-05, + "loss": 0.7474, + "step": 36557 + }, + { + "epoch": 0.9680415200258056, + "grad_norm": 0.796875, + "learning_rate": 8.208736800106268e-05, + "loss": 0.8353, + "step": 36558 + }, + { + "epoch": 0.9680679996341, + "grad_norm": 0.91796875, + "learning_rate": 8.208283873986703e-05, + "loss": 0.7156, + "step": 36559 + }, + { + "epoch": 0.9680944792423943, + "grad_norm": 0.75390625, + "learning_rate": 8.207830951664576e-05, + "loss": 0.8185, + "step": 36560 + }, + { + "epoch": 0.9681209588506887, + "grad_norm": 0.73828125, + "learning_rate": 8.207378033140851e-05, + "loss": 0.7862, + "step": 36561 + }, + { + "epoch": 0.9681474384589831, + "grad_norm": 0.8984375, + "learning_rate": 8.206925118416483e-05, + "loss": 0.8811, + "step": 36562 + }, + { + "epoch": 0.9681739180672775, + "grad_norm": 0.73046875, + "learning_rate": 8.206472207492437e-05, + "loss": 0.7216, + "step": 36563 + }, + { + "epoch": 0.9682003976755719, + "grad_norm": 0.71875, + "learning_rate": 8.206019300369672e-05, + "loss": 0.6698, + "step": 36564 + }, + { + "epoch": 0.9682268772838662, + "grad_norm": 0.75, + "learning_rate": 8.205566397049146e-05, + "loss": 0.7731, + "step": 36565 + }, + { + "epoch": 0.9682533568921606, + "grad_norm": 0.796875, + "learning_rate": 8.205113497531819e-05, + "loss": 0.7592, + "step": 36566 + }, + { + "epoch": 0.968279836500455, + "grad_norm": 1.25, + "learning_rate": 8.204660601818647e-05, + "loss": 0.813, + "step": 36567 + }, + { + "epoch": 0.9683063161087494, + "grad_norm": 0.76953125, + "learning_rate": 8.204207709910599e-05, + "loss": 0.7665, + "step": 36568 + }, + { + "epoch": 0.9683327957170437, + "grad_norm": 0.80078125, + "learning_rate": 8.203754821808628e-05, + "loss": 0.8358, + "step": 36569 + }, + { + "epoch": 0.968359275325338, + "grad_norm": 0.78125, + "learning_rate": 8.2033019375137e-05, + "loss": 0.7857, + "step": 36570 + }, + { + "epoch": 0.9683857549336324, + "grad_norm": 0.671875, + "learning_rate": 8.202849057026768e-05, + "loss": 0.6436, + "step": 36571 + }, + { + "epoch": 0.9684122345419268, + "grad_norm": 0.70703125, + "learning_rate": 8.202396180348791e-05, + "loss": 0.7678, + "step": 36572 + }, + { + "epoch": 0.9684387141502212, + "grad_norm": 0.75, + "learning_rate": 8.201943307480737e-05, + "loss": 0.8124, + "step": 36573 + }, + { + "epoch": 0.9684651937585156, + "grad_norm": 0.94140625, + "learning_rate": 8.201490438423562e-05, + "loss": 0.8026, + "step": 36574 + }, + { + "epoch": 0.96849167336681, + "grad_norm": 0.92578125, + "learning_rate": 8.201037573178224e-05, + "loss": 0.7768, + "step": 36575 + }, + { + "epoch": 0.9685181529751044, + "grad_norm": 0.67578125, + "learning_rate": 8.200584711745684e-05, + "loss": 0.6879, + "step": 36576 + }, + { + "epoch": 0.9685446325833987, + "grad_norm": 0.81640625, + "learning_rate": 8.200131854126901e-05, + "loss": 0.8447, + "step": 36577 + }, + { + "epoch": 0.9685711121916931, + "grad_norm": 0.78125, + "learning_rate": 8.199679000322837e-05, + "loss": 0.7483, + "step": 36578 + }, + { + "epoch": 0.9685975917999875, + "grad_norm": 0.78125, + "learning_rate": 8.199226150334449e-05, + "loss": 0.6764, + "step": 36579 + }, + { + "epoch": 0.9686240714082819, + "grad_norm": 0.8046875, + "learning_rate": 8.198773304162699e-05, + "loss": 0.6802, + "step": 36580 + }, + { + "epoch": 0.9686505510165763, + "grad_norm": 0.7265625, + "learning_rate": 8.198320461808547e-05, + "loss": 0.774, + "step": 36581 + }, + { + "epoch": 0.9686770306248706, + "grad_norm": 0.7109375, + "learning_rate": 8.19786762327295e-05, + "loss": 0.8196, + "step": 36582 + }, + { + "epoch": 0.968703510233165, + "grad_norm": 0.79296875, + "learning_rate": 8.197414788556873e-05, + "loss": 0.7442, + "step": 36583 + }, + { + "epoch": 0.9687299898414594, + "grad_norm": 0.78515625, + "learning_rate": 8.196961957661269e-05, + "loss": 0.6801, + "step": 36584 + }, + { + "epoch": 0.9687564694497537, + "grad_norm": 0.8125, + "learning_rate": 8.196509130587102e-05, + "loss": 0.9157, + "step": 36585 + }, + { + "epoch": 0.9687829490580481, + "grad_norm": 0.765625, + "learning_rate": 8.196056307335332e-05, + "loss": 0.7661, + "step": 36586 + }, + { + "epoch": 0.9688094286663425, + "grad_norm": 0.80859375, + "learning_rate": 8.195603487906913e-05, + "loss": 0.7663, + "step": 36587 + }, + { + "epoch": 0.9688359082746368, + "grad_norm": 0.8203125, + "learning_rate": 8.195150672302812e-05, + "loss": 0.7398, + "step": 36588 + }, + { + "epoch": 0.9688623878829312, + "grad_norm": 0.77734375, + "learning_rate": 8.194697860523987e-05, + "loss": 0.8634, + "step": 36589 + }, + { + "epoch": 0.9688888674912256, + "grad_norm": 0.734375, + "learning_rate": 8.1942450525714e-05, + "loss": 0.8147, + "step": 36590 + }, + { + "epoch": 0.96891534709952, + "grad_norm": 0.640625, + "learning_rate": 8.193792248446003e-05, + "loss": 0.6871, + "step": 36591 + }, + { + "epoch": 0.9689418267078144, + "grad_norm": 0.8125, + "learning_rate": 8.193339448148757e-05, + "loss": 0.797, + "step": 36592 + }, + { + "epoch": 0.9689683063161088, + "grad_norm": 0.75, + "learning_rate": 8.192886651680628e-05, + "loss": 0.833, + "step": 36593 + }, + { + "epoch": 0.9689947859244031, + "grad_norm": 0.734375, + "learning_rate": 8.192433859042574e-05, + "loss": 0.7892, + "step": 36594 + }, + { + "epoch": 0.9690212655326975, + "grad_norm": 0.8125, + "learning_rate": 8.191981070235552e-05, + "loss": 0.7597, + "step": 36595 + }, + { + "epoch": 0.9690477451409919, + "grad_norm": 0.75, + "learning_rate": 8.191528285260525e-05, + "loss": 0.759, + "step": 36596 + }, + { + "epoch": 0.9690742247492863, + "grad_norm": 0.74609375, + "learning_rate": 8.191075504118446e-05, + "loss": 0.8251, + "step": 36597 + }, + { + "epoch": 0.9691007043575807, + "grad_norm": 0.71484375, + "learning_rate": 8.190622726810281e-05, + "loss": 0.6376, + "step": 36598 + }, + { + "epoch": 0.969127183965875, + "grad_norm": 0.87109375, + "learning_rate": 8.190169953336988e-05, + "loss": 0.9124, + "step": 36599 + }, + { + "epoch": 0.9691536635741694, + "grad_norm": 0.796875, + "learning_rate": 8.189717183699528e-05, + "loss": 0.7647, + "step": 36600 + }, + { + "epoch": 0.9691801431824637, + "grad_norm": 0.8359375, + "learning_rate": 8.189264417898857e-05, + "loss": 0.7506, + "step": 36601 + }, + { + "epoch": 0.9692066227907581, + "grad_norm": 0.76171875, + "learning_rate": 8.188811655935936e-05, + "loss": 0.7759, + "step": 36602 + }, + { + "epoch": 0.9692331023990525, + "grad_norm": 0.7578125, + "learning_rate": 8.188358897811727e-05, + "loss": 0.8151, + "step": 36603 + }, + { + "epoch": 0.9692595820073469, + "grad_norm": 0.828125, + "learning_rate": 8.187906143527187e-05, + "loss": 0.8372, + "step": 36604 + }, + { + "epoch": 0.9692860616156412, + "grad_norm": 0.76953125, + "learning_rate": 8.187453393083277e-05, + "loss": 0.7191, + "step": 36605 + }, + { + "epoch": 0.9693125412239356, + "grad_norm": 0.72265625, + "learning_rate": 8.187000646480956e-05, + "loss": 0.7383, + "step": 36606 + }, + { + "epoch": 0.96933902083223, + "grad_norm": 0.796875, + "learning_rate": 8.18654790372118e-05, + "loss": 0.8016, + "step": 36607 + }, + { + "epoch": 0.9693655004405244, + "grad_norm": 0.82421875, + "learning_rate": 8.186095164804915e-05, + "loss": 0.8334, + "step": 36608 + }, + { + "epoch": 0.9693919800488188, + "grad_norm": 0.7890625, + "learning_rate": 8.185642429733118e-05, + "loss": 0.7929, + "step": 36609 + }, + { + "epoch": 0.9694184596571132, + "grad_norm": 0.8203125, + "learning_rate": 8.185189698506748e-05, + "loss": 0.7604, + "step": 36610 + }, + { + "epoch": 0.9694449392654075, + "grad_norm": 0.79296875, + "learning_rate": 8.184736971126765e-05, + "loss": 0.7488, + "step": 36611 + }, + { + "epoch": 0.9694714188737019, + "grad_norm": 0.7109375, + "learning_rate": 8.184284247594124e-05, + "loss": 0.6857, + "step": 36612 + }, + { + "epoch": 0.9694978984819963, + "grad_norm": 0.67578125, + "learning_rate": 8.183831527909794e-05, + "loss": 0.7655, + "step": 36613 + }, + { + "epoch": 0.9695243780902907, + "grad_norm": 0.765625, + "learning_rate": 8.183378812074727e-05, + "loss": 0.8114, + "step": 36614 + }, + { + "epoch": 0.9695508576985851, + "grad_norm": 0.87890625, + "learning_rate": 8.182926100089886e-05, + "loss": 0.8746, + "step": 36615 + }, + { + "epoch": 0.9695773373068794, + "grad_norm": 0.73828125, + "learning_rate": 8.182473391956228e-05, + "loss": 0.678, + "step": 36616 + }, + { + "epoch": 0.9696038169151738, + "grad_norm": 0.796875, + "learning_rate": 8.182020687674712e-05, + "loss": 0.8066, + "step": 36617 + }, + { + "epoch": 0.9696302965234681, + "grad_norm": 0.75390625, + "learning_rate": 8.181567987246303e-05, + "loss": 0.6989, + "step": 36618 + }, + { + "epoch": 0.9696567761317625, + "grad_norm": 0.78515625, + "learning_rate": 8.181115290671956e-05, + "loss": 0.7372, + "step": 36619 + }, + { + "epoch": 0.9696832557400569, + "grad_norm": 0.75390625, + "learning_rate": 8.180662597952631e-05, + "loss": 0.8374, + "step": 36620 + }, + { + "epoch": 0.9697097353483513, + "grad_norm": 0.8125, + "learning_rate": 8.180209909089287e-05, + "loss": 0.8527, + "step": 36621 + }, + { + "epoch": 0.9697362149566456, + "grad_norm": 0.796875, + "learning_rate": 8.179757224082884e-05, + "loss": 0.808, + "step": 36622 + }, + { + "epoch": 0.96976269456494, + "grad_norm": 0.84375, + "learning_rate": 8.179304542934382e-05, + "loss": 0.7662, + "step": 36623 + }, + { + "epoch": 0.9697891741732344, + "grad_norm": 0.81640625, + "learning_rate": 8.17885186564474e-05, + "loss": 0.6976, + "step": 36624 + }, + { + "epoch": 0.9698156537815288, + "grad_norm": 0.77734375, + "learning_rate": 8.178399192214917e-05, + "loss": 0.7003, + "step": 36625 + }, + { + "epoch": 0.9698421333898232, + "grad_norm": 0.7578125, + "learning_rate": 8.177946522645873e-05, + "loss": 0.7842, + "step": 36626 + }, + { + "epoch": 0.9698686129981176, + "grad_norm": 0.7578125, + "learning_rate": 8.177493856938566e-05, + "loss": 0.763, + "step": 36627 + }, + { + "epoch": 0.9698950926064119, + "grad_norm": 0.7734375, + "learning_rate": 8.177041195093957e-05, + "loss": 0.7521, + "step": 36628 + }, + { + "epoch": 0.9699215722147063, + "grad_norm": 0.87890625, + "learning_rate": 8.176588537113006e-05, + "loss": 0.8775, + "step": 36629 + }, + { + "epoch": 0.9699480518230007, + "grad_norm": 0.86328125, + "learning_rate": 8.176135882996672e-05, + "loss": 0.8362, + "step": 36630 + }, + { + "epoch": 0.9699745314312951, + "grad_norm": 0.76171875, + "learning_rate": 8.175683232745913e-05, + "loss": 0.7381, + "step": 36631 + }, + { + "epoch": 0.9700010110395895, + "grad_norm": 0.8515625, + "learning_rate": 8.175230586361685e-05, + "loss": 0.9037, + "step": 36632 + }, + { + "epoch": 0.9700274906478838, + "grad_norm": 0.7890625, + "learning_rate": 8.174777943844956e-05, + "loss": 0.775, + "step": 36633 + }, + { + "epoch": 0.9700539702561781, + "grad_norm": 1.3828125, + "learning_rate": 8.174325305196681e-05, + "loss": 0.7184, + "step": 36634 + }, + { + "epoch": 0.9700804498644725, + "grad_norm": 0.7421875, + "learning_rate": 8.173872670417819e-05, + "loss": 0.8918, + "step": 36635 + }, + { + "epoch": 0.9701069294727669, + "grad_norm": 0.78515625, + "learning_rate": 8.17342003950933e-05, + "loss": 0.7433, + "step": 36636 + }, + { + "epoch": 0.9701334090810613, + "grad_norm": 0.765625, + "learning_rate": 8.172967412472168e-05, + "loss": 0.8219, + "step": 36637 + }, + { + "epoch": 0.9701598886893557, + "grad_norm": 0.765625, + "learning_rate": 8.172514789307301e-05, + "loss": 0.9802, + "step": 36638 + }, + { + "epoch": 0.97018636829765, + "grad_norm": 0.7890625, + "learning_rate": 8.172062170015685e-05, + "loss": 0.7188, + "step": 36639 + }, + { + "epoch": 0.9702128479059444, + "grad_norm": 0.77734375, + "learning_rate": 8.171609554598279e-05, + "loss": 0.8869, + "step": 36640 + }, + { + "epoch": 0.9702393275142388, + "grad_norm": 0.828125, + "learning_rate": 8.171156943056042e-05, + "loss": 0.7376, + "step": 36641 + }, + { + "epoch": 0.9702658071225332, + "grad_norm": 0.84765625, + "learning_rate": 8.170704335389931e-05, + "loss": 0.8481, + "step": 36642 + }, + { + "epoch": 0.9702922867308276, + "grad_norm": 0.8359375, + "learning_rate": 8.170251731600909e-05, + "loss": 0.7278, + "step": 36643 + }, + { + "epoch": 0.970318766339122, + "grad_norm": 0.75, + "learning_rate": 8.169799131689935e-05, + "loss": 0.7916, + "step": 36644 + }, + { + "epoch": 0.9703452459474163, + "grad_norm": 0.8515625, + "learning_rate": 8.169346535657967e-05, + "loss": 0.831, + "step": 36645 + }, + { + "epoch": 0.9703717255557107, + "grad_norm": 0.79296875, + "learning_rate": 8.168893943505962e-05, + "loss": 0.7388, + "step": 36646 + }, + { + "epoch": 0.9703982051640051, + "grad_norm": 0.76953125, + "learning_rate": 8.168441355234882e-05, + "loss": 0.7275, + "step": 36647 + }, + { + "epoch": 0.9704246847722995, + "grad_norm": 0.8046875, + "learning_rate": 8.167988770845687e-05, + "loss": 0.7643, + "step": 36648 + }, + { + "epoch": 0.9704511643805939, + "grad_norm": 0.8515625, + "learning_rate": 8.167536190339335e-05, + "loss": 0.748, + "step": 36649 + }, + { + "epoch": 0.9704776439888881, + "grad_norm": 0.82421875, + "learning_rate": 8.167083613716787e-05, + "loss": 0.6797, + "step": 36650 + }, + { + "epoch": 0.9705041235971825, + "grad_norm": 0.80859375, + "learning_rate": 8.166631040978998e-05, + "loss": 0.8407, + "step": 36651 + }, + { + "epoch": 0.9705306032054769, + "grad_norm": 0.77734375, + "learning_rate": 8.166178472126928e-05, + "loss": 0.7373, + "step": 36652 + }, + { + "epoch": 0.9705570828137713, + "grad_norm": 0.8359375, + "learning_rate": 8.16572590716154e-05, + "loss": 0.8258, + "step": 36653 + }, + { + "epoch": 0.9705835624220657, + "grad_norm": 0.83203125, + "learning_rate": 8.165273346083793e-05, + "loss": 0.8016, + "step": 36654 + }, + { + "epoch": 0.97061004203036, + "grad_norm": 0.76953125, + "learning_rate": 8.164820788894643e-05, + "loss": 0.7491, + "step": 36655 + }, + { + "epoch": 0.9706365216386544, + "grad_norm": 0.7578125, + "learning_rate": 8.164368235595052e-05, + "loss": 0.7549, + "step": 36656 + }, + { + "epoch": 0.9706630012469488, + "grad_norm": 0.80859375, + "learning_rate": 8.16391568618597e-05, + "loss": 0.7923, + "step": 36657 + }, + { + "epoch": 0.9706894808552432, + "grad_norm": 0.7734375, + "learning_rate": 8.163463140668372e-05, + "loss": 0.7743, + "step": 36658 + }, + { + "epoch": 0.9707159604635376, + "grad_norm": 0.7890625, + "learning_rate": 8.163010599043206e-05, + "loss": 0.7064, + "step": 36659 + }, + { + "epoch": 0.970742440071832, + "grad_norm": 0.796875, + "learning_rate": 8.162558061311436e-05, + "loss": 0.7583, + "step": 36660 + }, + { + "epoch": 0.9707689196801264, + "grad_norm": 0.8125, + "learning_rate": 8.162105527474017e-05, + "loss": 0.7971, + "step": 36661 + }, + { + "epoch": 0.9707953992884207, + "grad_norm": 0.796875, + "learning_rate": 8.16165299753191e-05, + "loss": 0.6931, + "step": 36662 + }, + { + "epoch": 0.9708218788967151, + "grad_norm": 0.80078125, + "learning_rate": 8.161200471486076e-05, + "loss": 0.6656, + "step": 36663 + }, + { + "epoch": 0.9708483585050095, + "grad_norm": 0.7578125, + "learning_rate": 8.16074794933747e-05, + "loss": 0.744, + "step": 36664 + }, + { + "epoch": 0.9708748381133039, + "grad_norm": 0.734375, + "learning_rate": 8.160295431087057e-05, + "loss": 0.8073, + "step": 36665 + }, + { + "epoch": 0.9709013177215983, + "grad_norm": 0.75390625, + "learning_rate": 8.159842916735792e-05, + "loss": 0.7379, + "step": 36666 + }, + { + "epoch": 0.9709277973298925, + "grad_norm": 0.88671875, + "learning_rate": 8.159390406284633e-05, + "loss": 0.7711, + "step": 36667 + }, + { + "epoch": 0.9709542769381869, + "grad_norm": 0.77734375, + "learning_rate": 8.158937899734542e-05, + "loss": 0.7589, + "step": 36668 + }, + { + "epoch": 0.9709807565464813, + "grad_norm": 0.828125, + "learning_rate": 8.158485397086478e-05, + "loss": 0.8413, + "step": 36669 + }, + { + "epoch": 0.9710072361547757, + "grad_norm": 0.78515625, + "learning_rate": 8.158032898341397e-05, + "loss": 0.8033, + "step": 36670 + }, + { + "epoch": 0.9710337157630701, + "grad_norm": 0.796875, + "learning_rate": 8.157580403500262e-05, + "loss": 0.7912, + "step": 36671 + }, + { + "epoch": 0.9710601953713645, + "grad_norm": 0.8203125, + "learning_rate": 8.157127912564024e-05, + "loss": 0.6775, + "step": 36672 + }, + { + "epoch": 0.9710866749796588, + "grad_norm": 0.796875, + "learning_rate": 8.156675425533654e-05, + "loss": 0.7228, + "step": 36673 + }, + { + "epoch": 0.9711131545879532, + "grad_norm": 0.71484375, + "learning_rate": 8.156222942410104e-05, + "loss": 0.8574, + "step": 36674 + }, + { + "epoch": 0.9711396341962476, + "grad_norm": 0.76171875, + "learning_rate": 8.155770463194335e-05, + "loss": 0.8235, + "step": 36675 + }, + { + "epoch": 0.971166113804542, + "grad_norm": 0.77734375, + "learning_rate": 8.155317987887304e-05, + "loss": 0.8609, + "step": 36676 + }, + { + "epoch": 0.9711925934128364, + "grad_norm": 0.83984375, + "learning_rate": 8.154865516489968e-05, + "loss": 0.9349, + "step": 36677 + }, + { + "epoch": 0.9712190730211308, + "grad_norm": 0.80078125, + "learning_rate": 8.154413049003293e-05, + "loss": 0.8021, + "step": 36678 + }, + { + "epoch": 0.9712455526294251, + "grad_norm": 0.80859375, + "learning_rate": 8.153960585428233e-05, + "loss": 0.778, + "step": 36679 + }, + { + "epoch": 0.9712720322377195, + "grad_norm": 0.81640625, + "learning_rate": 8.15350812576575e-05, + "loss": 0.851, + "step": 36680 + }, + { + "epoch": 0.9712985118460139, + "grad_norm": 0.796875, + "learning_rate": 8.153055670016799e-05, + "loss": 0.7867, + "step": 36681 + }, + { + "epoch": 0.9713249914543083, + "grad_norm": 0.77734375, + "learning_rate": 8.15260321818234e-05, + "loss": 0.808, + "step": 36682 + }, + { + "epoch": 0.9713514710626026, + "grad_norm": 0.85546875, + "learning_rate": 8.152150770263332e-05, + "loss": 0.8016, + "step": 36683 + }, + { + "epoch": 0.9713779506708969, + "grad_norm": 0.8125, + "learning_rate": 8.151698326260737e-05, + "loss": 0.7899, + "step": 36684 + }, + { + "epoch": 0.9714044302791913, + "grad_norm": 0.76171875, + "learning_rate": 8.151245886175513e-05, + "loss": 0.7745, + "step": 36685 + }, + { + "epoch": 0.9714309098874857, + "grad_norm": 0.78125, + "learning_rate": 8.150793450008616e-05, + "loss": 0.6893, + "step": 36686 + }, + { + "epoch": 0.9714573894957801, + "grad_norm": 0.8671875, + "learning_rate": 8.150341017761005e-05, + "loss": 0.8168, + "step": 36687 + }, + { + "epoch": 0.9714838691040745, + "grad_norm": 0.7421875, + "learning_rate": 8.149888589433644e-05, + "loss": 0.7424, + "step": 36688 + }, + { + "epoch": 0.9715103487123689, + "grad_norm": 0.734375, + "learning_rate": 8.149436165027486e-05, + "loss": 0.7084, + "step": 36689 + }, + { + "epoch": 0.9715368283206632, + "grad_norm": 0.83984375, + "learning_rate": 8.148983744543494e-05, + "loss": 0.7678, + "step": 36690 + }, + { + "epoch": 0.9715633079289576, + "grad_norm": 0.80078125, + "learning_rate": 8.148531327982626e-05, + "loss": 0.8632, + "step": 36691 + }, + { + "epoch": 0.971589787537252, + "grad_norm": 0.80078125, + "learning_rate": 8.148078915345833e-05, + "loss": 0.6968, + "step": 36692 + }, + { + "epoch": 0.9716162671455464, + "grad_norm": 0.79296875, + "learning_rate": 8.147626506634087e-05, + "loss": 0.7679, + "step": 36693 + }, + { + "epoch": 0.9716427467538408, + "grad_norm": 0.80859375, + "learning_rate": 8.147174101848341e-05, + "loss": 0.773, + "step": 36694 + }, + { + "epoch": 0.9716692263621352, + "grad_norm": 0.8046875, + "learning_rate": 8.146721700989554e-05, + "loss": 0.7155, + "step": 36695 + }, + { + "epoch": 0.9716957059704295, + "grad_norm": 0.7265625, + "learning_rate": 8.146269304058682e-05, + "loss": 0.8113, + "step": 36696 + }, + { + "epoch": 0.9717221855787239, + "grad_norm": 0.82421875, + "learning_rate": 8.145816911056684e-05, + "loss": 0.7138, + "step": 36697 + }, + { + "epoch": 0.9717486651870183, + "grad_norm": 0.83203125, + "learning_rate": 8.145364521984524e-05, + "loss": 0.8021, + "step": 36698 + }, + { + "epoch": 0.9717751447953126, + "grad_norm": 0.77734375, + "learning_rate": 8.14491213684316e-05, + "loss": 0.7712, + "step": 36699 + }, + { + "epoch": 0.971801624403607, + "grad_norm": 0.71875, + "learning_rate": 8.144459755633547e-05, + "loss": 0.7903, + "step": 36700 + }, + { + "epoch": 0.9718281040119013, + "grad_norm": 1.3828125, + "learning_rate": 8.144007378356646e-05, + "loss": 0.7924, + "step": 36701 + }, + { + "epoch": 0.9718545836201957, + "grad_norm": 0.8359375, + "learning_rate": 8.143555005013415e-05, + "loss": 0.8888, + "step": 36702 + }, + { + "epoch": 0.9718810632284901, + "grad_norm": 0.76171875, + "learning_rate": 8.14310263560481e-05, + "loss": 0.7137, + "step": 36703 + }, + { + "epoch": 0.9719075428367845, + "grad_norm": 0.73046875, + "learning_rate": 8.142650270131795e-05, + "loss": 0.6633, + "step": 36704 + }, + { + "epoch": 0.9719340224450789, + "grad_norm": 0.78515625, + "learning_rate": 8.142197908595327e-05, + "loss": 0.7869, + "step": 36705 + }, + { + "epoch": 0.9719605020533733, + "grad_norm": 0.796875, + "learning_rate": 8.141745550996365e-05, + "loss": 0.8275, + "step": 36706 + }, + { + "epoch": 0.9719869816616676, + "grad_norm": 0.7890625, + "learning_rate": 8.141293197335867e-05, + "loss": 0.6859, + "step": 36707 + }, + { + "epoch": 0.972013461269962, + "grad_norm": 0.76171875, + "learning_rate": 8.14084084761479e-05, + "loss": 0.7612, + "step": 36708 + }, + { + "epoch": 0.9720399408782564, + "grad_norm": 0.73046875, + "learning_rate": 8.140388501834097e-05, + "loss": 0.7345, + "step": 36709 + }, + { + "epoch": 0.9720664204865508, + "grad_norm": 0.703125, + "learning_rate": 8.139936159994744e-05, + "loss": 0.6848, + "step": 36710 + }, + { + "epoch": 0.9720929000948452, + "grad_norm": 0.7265625, + "learning_rate": 8.139483822097688e-05, + "loss": 0.7009, + "step": 36711 + }, + { + "epoch": 0.9721193797031396, + "grad_norm": 0.77734375, + "learning_rate": 8.139031488143892e-05, + "loss": 0.8063, + "step": 36712 + }, + { + "epoch": 0.9721458593114339, + "grad_norm": 0.8046875, + "learning_rate": 8.138579158134307e-05, + "loss": 0.9303, + "step": 36713 + }, + { + "epoch": 0.9721723389197283, + "grad_norm": 0.92578125, + "learning_rate": 8.138126832069903e-05, + "loss": 0.7706, + "step": 36714 + }, + { + "epoch": 0.9721988185280227, + "grad_norm": 0.83203125, + "learning_rate": 8.137674509951632e-05, + "loss": 0.7308, + "step": 36715 + }, + { + "epoch": 0.972225298136317, + "grad_norm": 0.796875, + "learning_rate": 8.137222191780452e-05, + "loss": 0.7395, + "step": 36716 + }, + { + "epoch": 0.9722517777446114, + "grad_norm": 0.75390625, + "learning_rate": 8.136769877557325e-05, + "loss": 0.7359, + "step": 36717 + }, + { + "epoch": 0.9722782573529057, + "grad_norm": 0.77734375, + "learning_rate": 8.136317567283202e-05, + "loss": 0.8216, + "step": 36718 + }, + { + "epoch": 0.9723047369612001, + "grad_norm": 0.78125, + "learning_rate": 8.135865260959053e-05, + "loss": 0.7896, + "step": 36719 + }, + { + "epoch": 0.9723312165694945, + "grad_norm": 0.79296875, + "learning_rate": 8.13541295858583e-05, + "loss": 0.8207, + "step": 36720 + }, + { + "epoch": 0.9723576961777889, + "grad_norm": 0.71875, + "learning_rate": 8.134960660164493e-05, + "loss": 0.8572, + "step": 36721 + }, + { + "epoch": 0.9723841757860833, + "grad_norm": 0.7890625, + "learning_rate": 8.134508365696001e-05, + "loss": 0.7814, + "step": 36722 + }, + { + "epoch": 0.9724106553943777, + "grad_norm": 0.73046875, + "learning_rate": 8.134056075181307e-05, + "loss": 0.6372, + "step": 36723 + }, + { + "epoch": 0.972437135002672, + "grad_norm": 0.703125, + "learning_rate": 8.133603788621377e-05, + "loss": 0.7285, + "step": 36724 + }, + { + "epoch": 0.9724636146109664, + "grad_norm": 0.78515625, + "learning_rate": 8.133151506017169e-05, + "loss": 0.8838, + "step": 36725 + }, + { + "epoch": 0.9724900942192608, + "grad_norm": 0.6640625, + "learning_rate": 8.13269922736964e-05, + "loss": 0.6685, + "step": 36726 + }, + { + "epoch": 0.9725165738275552, + "grad_norm": 0.7421875, + "learning_rate": 8.132246952679748e-05, + "loss": 0.7654, + "step": 36727 + }, + { + "epoch": 0.9725430534358496, + "grad_norm": 0.8359375, + "learning_rate": 8.131794681948449e-05, + "loss": 0.8673, + "step": 36728 + }, + { + "epoch": 0.972569533044144, + "grad_norm": 0.73828125, + "learning_rate": 8.131342415176707e-05, + "loss": 0.7301, + "step": 36729 + }, + { + "epoch": 0.9725960126524383, + "grad_norm": 0.7734375, + "learning_rate": 8.130890152365477e-05, + "loss": 0.904, + "step": 36730 + }, + { + "epoch": 0.9726224922607327, + "grad_norm": 0.76953125, + "learning_rate": 8.130437893515717e-05, + "loss": 0.7391, + "step": 36731 + }, + { + "epoch": 0.972648971869027, + "grad_norm": 0.76953125, + "learning_rate": 8.12998563862839e-05, + "loss": 0.7863, + "step": 36732 + }, + { + "epoch": 0.9726754514773214, + "grad_norm": 0.76171875, + "learning_rate": 8.129533387704449e-05, + "loss": 0.7309, + "step": 36733 + }, + { + "epoch": 0.9727019310856158, + "grad_norm": 0.8203125, + "learning_rate": 8.129081140744857e-05, + "loss": 0.8202, + "step": 36734 + }, + { + "epoch": 0.9727284106939101, + "grad_norm": 0.76953125, + "learning_rate": 8.128628897750571e-05, + "loss": 0.8135, + "step": 36735 + }, + { + "epoch": 0.9727548903022045, + "grad_norm": 0.83203125, + "learning_rate": 8.128176658722548e-05, + "loss": 0.7632, + "step": 36736 + }, + { + "epoch": 0.9727813699104989, + "grad_norm": 0.734375, + "learning_rate": 8.127724423661749e-05, + "loss": 0.6223, + "step": 36737 + }, + { + "epoch": 0.9728078495187933, + "grad_norm": 0.890625, + "learning_rate": 8.127272192569126e-05, + "loss": 0.86, + "step": 36738 + }, + { + "epoch": 0.9728343291270877, + "grad_norm": 0.7734375, + "learning_rate": 8.126819965445647e-05, + "loss": 0.7911, + "step": 36739 + }, + { + "epoch": 0.972860808735382, + "grad_norm": 0.84765625, + "learning_rate": 8.126367742292266e-05, + "loss": 0.7625, + "step": 36740 + }, + { + "epoch": 0.9728872883436764, + "grad_norm": 0.828125, + "learning_rate": 8.125915523109942e-05, + "loss": 0.8244, + "step": 36741 + }, + { + "epoch": 0.9729137679519708, + "grad_norm": 0.73046875, + "learning_rate": 8.125463307899631e-05, + "loss": 0.7395, + "step": 36742 + }, + { + "epoch": 0.9729402475602652, + "grad_norm": 0.76171875, + "learning_rate": 8.12501109666229e-05, + "loss": 0.625, + "step": 36743 + }, + { + "epoch": 0.9729667271685596, + "grad_norm": 0.72265625, + "learning_rate": 8.124558889398886e-05, + "loss": 0.7368, + "step": 36744 + }, + { + "epoch": 0.972993206776854, + "grad_norm": 0.73828125, + "learning_rate": 8.124106686110372e-05, + "loss": 0.762, + "step": 36745 + }, + { + "epoch": 0.9730196863851484, + "grad_norm": 0.796875, + "learning_rate": 8.123654486797706e-05, + "loss": 0.7777, + "step": 36746 + }, + { + "epoch": 0.9730461659934427, + "grad_norm": 0.84765625, + "learning_rate": 8.123202291461848e-05, + "loss": 0.9708, + "step": 36747 + }, + { + "epoch": 0.973072645601737, + "grad_norm": 0.703125, + "learning_rate": 8.122750100103752e-05, + "loss": 0.7628, + "step": 36748 + }, + { + "epoch": 0.9730991252100314, + "grad_norm": 0.96484375, + "learning_rate": 8.122297912724383e-05, + "loss": 0.7848, + "step": 36749 + }, + { + "epoch": 0.9731256048183258, + "grad_norm": 0.75, + "learning_rate": 8.121845729324694e-05, + "loss": 0.7835, + "step": 36750 + }, + { + "epoch": 0.9731520844266202, + "grad_norm": 0.74609375, + "learning_rate": 8.121393549905649e-05, + "loss": 0.8254, + "step": 36751 + }, + { + "epoch": 0.9731785640349145, + "grad_norm": 0.8359375, + "learning_rate": 8.1209413744682e-05, + "loss": 0.8701, + "step": 36752 + }, + { + "epoch": 0.9732050436432089, + "grad_norm": 0.69140625, + "learning_rate": 8.120489203013309e-05, + "loss": 0.6923, + "step": 36753 + }, + { + "epoch": 0.9732315232515033, + "grad_norm": 0.828125, + "learning_rate": 8.120037035541934e-05, + "loss": 0.7484, + "step": 36754 + }, + { + "epoch": 0.9732580028597977, + "grad_norm": 0.8203125, + "learning_rate": 8.119584872055034e-05, + "loss": 0.7604, + "step": 36755 + }, + { + "epoch": 0.9732844824680921, + "grad_norm": 0.80078125, + "learning_rate": 8.119132712553566e-05, + "loss": 0.9112, + "step": 36756 + }, + { + "epoch": 0.9733109620763865, + "grad_norm": 0.7109375, + "learning_rate": 8.11868055703849e-05, + "loss": 0.8685, + "step": 36757 + }, + { + "epoch": 0.9733374416846808, + "grad_norm": 1.078125, + "learning_rate": 8.118228405510756e-05, + "loss": 0.8005, + "step": 36758 + }, + { + "epoch": 0.9733639212929752, + "grad_norm": 0.77734375, + "learning_rate": 8.117776257971335e-05, + "loss": 0.7427, + "step": 36759 + }, + { + "epoch": 0.9733904009012696, + "grad_norm": 0.79296875, + "learning_rate": 8.117324114421181e-05, + "loss": 0.8151, + "step": 36760 + }, + { + "epoch": 0.973416880509564, + "grad_norm": 0.8359375, + "learning_rate": 8.116871974861248e-05, + "loss": 0.8502, + "step": 36761 + }, + { + "epoch": 0.9734433601178584, + "grad_norm": 0.90234375, + "learning_rate": 8.116419839292501e-05, + "loss": 0.8332, + "step": 36762 + }, + { + "epoch": 0.9734698397261528, + "grad_norm": 0.79296875, + "learning_rate": 8.115967707715888e-05, + "loss": 0.8097, + "step": 36763 + }, + { + "epoch": 0.9734963193344471, + "grad_norm": 0.80078125, + "learning_rate": 8.115515580132378e-05, + "loss": 0.8205, + "step": 36764 + }, + { + "epoch": 0.9735227989427414, + "grad_norm": 0.90625, + "learning_rate": 8.115063456542925e-05, + "loss": 0.9028, + "step": 36765 + }, + { + "epoch": 0.9735492785510358, + "grad_norm": 0.8125, + "learning_rate": 8.114611336948487e-05, + "loss": 0.9323, + "step": 36766 + }, + { + "epoch": 0.9735757581593302, + "grad_norm": 0.7421875, + "learning_rate": 8.114159221350025e-05, + "loss": 0.7624, + "step": 36767 + }, + { + "epoch": 0.9736022377676246, + "grad_norm": 0.7265625, + "learning_rate": 8.113707109748488e-05, + "loss": 0.8079, + "step": 36768 + }, + { + "epoch": 0.9736287173759189, + "grad_norm": 0.76171875, + "learning_rate": 8.113255002144845e-05, + "loss": 0.8248, + "step": 36769 + }, + { + "epoch": 0.9736551969842133, + "grad_norm": 0.7734375, + "learning_rate": 8.11280289854005e-05, + "loss": 0.7191, + "step": 36770 + }, + { + "epoch": 0.9736816765925077, + "grad_norm": 0.7578125, + "learning_rate": 8.112350798935063e-05, + "loss": 0.7222, + "step": 36771 + }, + { + "epoch": 0.9737081562008021, + "grad_norm": 0.83984375, + "learning_rate": 8.11189870333084e-05, + "loss": 0.6617, + "step": 36772 + }, + { + "epoch": 0.9737346358090965, + "grad_norm": 0.71875, + "learning_rate": 8.111446611728337e-05, + "loss": 0.6649, + "step": 36773 + }, + { + "epoch": 0.9737611154173909, + "grad_norm": 0.703125, + "learning_rate": 8.110994524128519e-05, + "loss": 0.7724, + "step": 36774 + }, + { + "epoch": 0.9737875950256852, + "grad_norm": 0.734375, + "learning_rate": 8.110542440532339e-05, + "loss": 0.8487, + "step": 36775 + }, + { + "epoch": 0.9738140746339796, + "grad_norm": 0.8125, + "learning_rate": 8.110090360940755e-05, + "loss": 0.7364, + "step": 36776 + }, + { + "epoch": 0.973840554242274, + "grad_norm": 0.7734375, + "learning_rate": 8.109638285354728e-05, + "loss": 0.7506, + "step": 36777 + }, + { + "epoch": 0.9738670338505684, + "grad_norm": 0.96875, + "learning_rate": 8.10918621377521e-05, + "loss": 0.8681, + "step": 36778 + }, + { + "epoch": 0.9738935134588628, + "grad_norm": 0.7421875, + "learning_rate": 8.108734146203167e-05, + "loss": 0.6947, + "step": 36779 + }, + { + "epoch": 0.9739199930671572, + "grad_norm": 0.921875, + "learning_rate": 8.108282082639556e-05, + "loss": 0.9459, + "step": 36780 + }, + { + "epoch": 0.9739464726754514, + "grad_norm": 0.7734375, + "learning_rate": 8.10783002308533e-05, + "loss": 0.6844, + "step": 36781 + }, + { + "epoch": 0.9739729522837458, + "grad_norm": 0.79296875, + "learning_rate": 8.107377967541452e-05, + "loss": 0.8636, + "step": 36782 + }, + { + "epoch": 0.9739994318920402, + "grad_norm": 0.7265625, + "learning_rate": 8.106925916008875e-05, + "loss": 0.8261, + "step": 36783 + }, + { + "epoch": 0.9740259115003346, + "grad_norm": 0.83984375, + "learning_rate": 8.106473868488563e-05, + "loss": 0.737, + "step": 36784 + }, + { + "epoch": 0.974052391108629, + "grad_norm": 0.796875, + "learning_rate": 8.106021824981472e-05, + "loss": 0.8528, + "step": 36785 + }, + { + "epoch": 0.9740788707169233, + "grad_norm": 0.83984375, + "learning_rate": 8.10556978548856e-05, + "loss": 0.6989, + "step": 36786 + }, + { + "epoch": 0.9741053503252177, + "grad_norm": 0.77734375, + "learning_rate": 8.105117750010783e-05, + "loss": 0.8343, + "step": 36787 + }, + { + "epoch": 0.9741318299335121, + "grad_norm": 0.78515625, + "learning_rate": 8.104665718549099e-05, + "loss": 0.7166, + "step": 36788 + }, + { + "epoch": 0.9741583095418065, + "grad_norm": 0.75, + "learning_rate": 8.104213691104469e-05, + "loss": 0.7815, + "step": 36789 + }, + { + "epoch": 0.9741847891501009, + "grad_norm": 0.8046875, + "learning_rate": 8.103761667677852e-05, + "loss": 0.7238, + "step": 36790 + }, + { + "epoch": 0.9742112687583953, + "grad_norm": 0.7421875, + "learning_rate": 8.103309648270202e-05, + "loss": 0.829, + "step": 36791 + }, + { + "epoch": 0.9742377483666896, + "grad_norm": 0.82421875, + "learning_rate": 8.102857632882479e-05, + "loss": 0.7033, + "step": 36792 + }, + { + "epoch": 0.974264227974984, + "grad_norm": 0.87890625, + "learning_rate": 8.102405621515639e-05, + "loss": 0.7971, + "step": 36793 + }, + { + "epoch": 0.9742907075832784, + "grad_norm": 0.78515625, + "learning_rate": 8.101953614170645e-05, + "loss": 0.8012, + "step": 36794 + }, + { + "epoch": 0.9743171871915728, + "grad_norm": 0.78515625, + "learning_rate": 8.10150161084845e-05, + "loss": 0.8832, + "step": 36795 + }, + { + "epoch": 0.9743436667998672, + "grad_norm": 0.79296875, + "learning_rate": 8.101049611550016e-05, + "loss": 0.7178, + "step": 36796 + }, + { + "epoch": 0.9743701464081614, + "grad_norm": 0.70703125, + "learning_rate": 8.100597616276297e-05, + "loss": 0.7662, + "step": 36797 + }, + { + "epoch": 0.9743966260164558, + "grad_norm": 0.76171875, + "learning_rate": 8.100145625028249e-05, + "loss": 0.8434, + "step": 36798 + }, + { + "epoch": 0.9744231056247502, + "grad_norm": 0.76953125, + "learning_rate": 8.099693637806838e-05, + "loss": 0.8316, + "step": 36799 + }, + { + "epoch": 0.9744495852330446, + "grad_norm": 0.6953125, + "learning_rate": 8.099241654613018e-05, + "loss": 0.571, + "step": 36800 + }, + { + "epoch": 0.974476064841339, + "grad_norm": 0.79296875, + "learning_rate": 8.098789675447747e-05, + "loss": 0.8692, + "step": 36801 + }, + { + "epoch": 0.9745025444496334, + "grad_norm": 0.8203125, + "learning_rate": 8.098337700311982e-05, + "loss": 0.6822, + "step": 36802 + }, + { + "epoch": 0.9745290240579277, + "grad_norm": 0.77734375, + "learning_rate": 8.097885729206677e-05, + "loss": 0.7259, + "step": 36803 + }, + { + "epoch": 0.9745555036662221, + "grad_norm": 0.859375, + "learning_rate": 8.097433762132799e-05, + "loss": 0.8048, + "step": 36804 + }, + { + "epoch": 0.9745819832745165, + "grad_norm": 0.78515625, + "learning_rate": 8.0969817990913e-05, + "loss": 0.7825, + "step": 36805 + }, + { + "epoch": 0.9746084628828109, + "grad_norm": 0.7890625, + "learning_rate": 8.096529840083142e-05, + "loss": 0.7624, + "step": 36806 + }, + { + "epoch": 0.9746349424911053, + "grad_norm": 0.80078125, + "learning_rate": 8.096077885109279e-05, + "loss": 0.8524, + "step": 36807 + }, + { + "epoch": 0.9746614220993997, + "grad_norm": 0.80859375, + "learning_rate": 8.095625934170667e-05, + "loss": 0.8978, + "step": 36808 + }, + { + "epoch": 0.974687901707694, + "grad_norm": 1.390625, + "learning_rate": 8.09517398726827e-05, + "loss": 0.7864, + "step": 36809 + }, + { + "epoch": 0.9747143813159884, + "grad_norm": 0.84375, + "learning_rate": 8.094722044403043e-05, + "loss": 1.0688, + "step": 36810 + }, + { + "epoch": 0.9747408609242828, + "grad_norm": 0.69921875, + "learning_rate": 8.094270105575942e-05, + "loss": 0.6999, + "step": 36811 + }, + { + "epoch": 0.9747673405325772, + "grad_norm": 0.765625, + "learning_rate": 8.09381817078793e-05, + "loss": 0.6736, + "step": 36812 + }, + { + "epoch": 0.9747938201408716, + "grad_norm": 0.8671875, + "learning_rate": 8.093366240039957e-05, + "loss": 0.9078, + "step": 36813 + }, + { + "epoch": 0.9748202997491658, + "grad_norm": 0.703125, + "learning_rate": 8.092914313332988e-05, + "loss": 0.7105, + "step": 36814 + }, + { + "epoch": 0.9748467793574602, + "grad_norm": 0.76171875, + "learning_rate": 8.092462390667979e-05, + "loss": 0.8297, + "step": 36815 + }, + { + "epoch": 0.9748732589657546, + "grad_norm": 0.87109375, + "learning_rate": 8.092010472045886e-05, + "loss": 0.7792, + "step": 36816 + }, + { + "epoch": 0.974899738574049, + "grad_norm": 0.77734375, + "learning_rate": 8.091558557467665e-05, + "loss": 0.7607, + "step": 36817 + }, + { + "epoch": 0.9749262181823434, + "grad_norm": 0.7421875, + "learning_rate": 8.091106646934276e-05, + "loss": 0.8726, + "step": 36818 + }, + { + "epoch": 0.9749526977906378, + "grad_norm": 0.80859375, + "learning_rate": 8.090654740446681e-05, + "loss": 0.8092, + "step": 36819 + }, + { + "epoch": 0.9749791773989321, + "grad_norm": 0.74609375, + "learning_rate": 8.090202838005835e-05, + "loss": 0.7204, + "step": 36820 + }, + { + "epoch": 0.9750056570072265, + "grad_norm": 0.75390625, + "learning_rate": 8.089750939612693e-05, + "loss": 0.6198, + "step": 36821 + }, + { + "epoch": 0.9750321366155209, + "grad_norm": 0.77734375, + "learning_rate": 8.089299045268216e-05, + "loss": 0.7392, + "step": 36822 + }, + { + "epoch": 0.9750586162238153, + "grad_norm": 0.7265625, + "learning_rate": 8.088847154973356e-05, + "loss": 0.7724, + "step": 36823 + }, + { + "epoch": 0.9750850958321097, + "grad_norm": 0.7421875, + "learning_rate": 8.088395268729078e-05, + "loss": 0.7959, + "step": 36824 + }, + { + "epoch": 0.975111575440404, + "grad_norm": 0.87890625, + "learning_rate": 8.087943386536339e-05, + "loss": 0.837, + "step": 36825 + }, + { + "epoch": 0.9751380550486984, + "grad_norm": 0.7734375, + "learning_rate": 8.087491508396094e-05, + "loss": 0.656, + "step": 36826 + }, + { + "epoch": 0.9751645346569928, + "grad_norm": 0.765625, + "learning_rate": 8.087039634309302e-05, + "loss": 0.8508, + "step": 36827 + }, + { + "epoch": 0.9751910142652872, + "grad_norm": 0.859375, + "learning_rate": 8.086587764276913e-05, + "loss": 0.8347, + "step": 36828 + }, + { + "epoch": 0.9752174938735816, + "grad_norm": 0.75, + "learning_rate": 8.0861358982999e-05, + "loss": 0.7528, + "step": 36829 + }, + { + "epoch": 0.9752439734818759, + "grad_norm": 0.75, + "learning_rate": 8.08568403637921e-05, + "loss": 0.7736, + "step": 36830 + }, + { + "epoch": 0.9752704530901702, + "grad_norm": 0.7890625, + "learning_rate": 8.085232178515805e-05, + "loss": 0.8319, + "step": 36831 + }, + { + "epoch": 0.9752969326984646, + "grad_norm": 0.79296875, + "learning_rate": 8.08478032471064e-05, + "loss": 0.7788, + "step": 36832 + }, + { + "epoch": 0.975323412306759, + "grad_norm": 0.859375, + "learning_rate": 8.084328474964671e-05, + "loss": 0.8125, + "step": 36833 + }, + { + "epoch": 0.9753498919150534, + "grad_norm": 0.83203125, + "learning_rate": 8.083876629278861e-05, + "loss": 0.8324, + "step": 36834 + }, + { + "epoch": 0.9753763715233478, + "grad_norm": 0.84765625, + "learning_rate": 8.083424787654163e-05, + "loss": 0.7631, + "step": 36835 + }, + { + "epoch": 0.9754028511316422, + "grad_norm": 0.7109375, + "learning_rate": 8.082972950091539e-05, + "loss": 0.7569, + "step": 36836 + }, + { + "epoch": 0.9754293307399365, + "grad_norm": 0.7890625, + "learning_rate": 8.082521116591944e-05, + "loss": 0.7229, + "step": 36837 + }, + { + "epoch": 0.9754558103482309, + "grad_norm": 0.77734375, + "learning_rate": 8.082069287156333e-05, + "loss": 0.7025, + "step": 36838 + }, + { + "epoch": 0.9754822899565253, + "grad_norm": 0.77734375, + "learning_rate": 8.08161746178567e-05, + "loss": 0.8182, + "step": 36839 + }, + { + "epoch": 0.9755087695648197, + "grad_norm": 0.80859375, + "learning_rate": 8.081165640480908e-05, + "loss": 0.8158, + "step": 36840 + }, + { + "epoch": 0.9755352491731141, + "grad_norm": 0.80078125, + "learning_rate": 8.080713823243007e-05, + "loss": 0.734, + "step": 36841 + }, + { + "epoch": 0.9755617287814085, + "grad_norm": 0.90625, + "learning_rate": 8.080262010072922e-05, + "loss": 0.7382, + "step": 36842 + }, + { + "epoch": 0.9755882083897028, + "grad_norm": 0.76171875, + "learning_rate": 8.079810200971613e-05, + "loss": 0.7179, + "step": 36843 + }, + { + "epoch": 0.9756146879979972, + "grad_norm": 0.796875, + "learning_rate": 8.079358395940031e-05, + "loss": 0.7107, + "step": 36844 + }, + { + "epoch": 0.9756411676062916, + "grad_norm": 0.734375, + "learning_rate": 8.078906594979145e-05, + "loss": 0.7377, + "step": 36845 + }, + { + "epoch": 0.9756676472145859, + "grad_norm": 0.7734375, + "learning_rate": 8.078454798089905e-05, + "loss": 0.7007, + "step": 36846 + }, + { + "epoch": 0.9756941268228803, + "grad_norm": 0.8125, + "learning_rate": 8.078003005273272e-05, + "loss": 0.8251, + "step": 36847 + }, + { + "epoch": 0.9757206064311746, + "grad_norm": 0.7421875, + "learning_rate": 8.077551216530199e-05, + "loss": 0.7136, + "step": 36848 + }, + { + "epoch": 0.975747086039469, + "grad_norm": 0.77734375, + "learning_rate": 8.077099431861644e-05, + "loss": 0.6973, + "step": 36849 + }, + { + "epoch": 0.9757735656477634, + "grad_norm": 0.82421875, + "learning_rate": 8.076647651268572e-05, + "loss": 0.6843, + "step": 36850 + }, + { + "epoch": 0.9758000452560578, + "grad_norm": 0.76171875, + "learning_rate": 8.076195874751934e-05, + "loss": 0.876, + "step": 36851 + }, + { + "epoch": 0.9758265248643522, + "grad_norm": 0.83203125, + "learning_rate": 8.075744102312689e-05, + "loss": 0.9243, + "step": 36852 + }, + { + "epoch": 0.9758530044726466, + "grad_norm": 0.796875, + "learning_rate": 8.075292333951795e-05, + "loss": 0.7392, + "step": 36853 + }, + { + "epoch": 0.9758794840809409, + "grad_norm": 0.91796875, + "learning_rate": 8.074840569670203e-05, + "loss": 0.776, + "step": 36854 + }, + { + "epoch": 0.9759059636892353, + "grad_norm": 0.78515625, + "learning_rate": 8.074388809468882e-05, + "loss": 0.7004, + "step": 36855 + }, + { + "epoch": 0.9759324432975297, + "grad_norm": 0.80078125, + "learning_rate": 8.073937053348784e-05, + "loss": 0.8449, + "step": 36856 + }, + { + "epoch": 0.9759589229058241, + "grad_norm": 0.859375, + "learning_rate": 8.073485301310865e-05, + "loss": 0.8403, + "step": 36857 + }, + { + "epoch": 0.9759854025141185, + "grad_norm": 0.8359375, + "learning_rate": 8.073033553356086e-05, + "loss": 0.9243, + "step": 36858 + }, + { + "epoch": 0.9760118821224129, + "grad_norm": 0.80859375, + "learning_rate": 8.072581809485398e-05, + "loss": 0.7405, + "step": 36859 + }, + { + "epoch": 0.9760383617307072, + "grad_norm": 0.796875, + "learning_rate": 8.072130069699767e-05, + "loss": 0.8274, + "step": 36860 + }, + { + "epoch": 0.9760648413390016, + "grad_norm": 0.80078125, + "learning_rate": 8.071678334000144e-05, + "loss": 0.9327, + "step": 36861 + }, + { + "epoch": 0.976091320947296, + "grad_norm": 0.7578125, + "learning_rate": 8.071226602387491e-05, + "loss": 0.8154, + "step": 36862 + }, + { + "epoch": 0.9761178005555903, + "grad_norm": 0.8515625, + "learning_rate": 8.07077487486276e-05, + "loss": 0.7886, + "step": 36863 + }, + { + "epoch": 0.9761442801638847, + "grad_norm": 0.84765625, + "learning_rate": 8.070323151426909e-05, + "loss": 0.8355, + "step": 36864 + }, + { + "epoch": 0.976170759772179, + "grad_norm": 0.79296875, + "learning_rate": 8.069871432080902e-05, + "loss": 0.7591, + "step": 36865 + }, + { + "epoch": 0.9761972393804734, + "grad_norm": 0.7734375, + "learning_rate": 8.069419716825691e-05, + "loss": 0.8271, + "step": 36866 + }, + { + "epoch": 0.9762237189887678, + "grad_norm": 0.7578125, + "learning_rate": 8.068968005662237e-05, + "loss": 0.7179, + "step": 36867 + }, + { + "epoch": 0.9762501985970622, + "grad_norm": 0.8125, + "learning_rate": 8.068516298591492e-05, + "loss": 0.7589, + "step": 36868 + }, + { + "epoch": 0.9762766782053566, + "grad_norm": 0.77734375, + "learning_rate": 8.068064595614413e-05, + "loss": 0.7984, + "step": 36869 + }, + { + "epoch": 0.976303157813651, + "grad_norm": 0.828125, + "learning_rate": 8.067612896731965e-05, + "loss": 0.7329, + "step": 36870 + }, + { + "epoch": 0.9763296374219453, + "grad_norm": 0.7734375, + "learning_rate": 8.067161201945102e-05, + "loss": 0.8685, + "step": 36871 + }, + { + "epoch": 0.9763561170302397, + "grad_norm": 0.828125, + "learning_rate": 8.06670951125478e-05, + "loss": 0.702, + "step": 36872 + }, + { + "epoch": 0.9763825966385341, + "grad_norm": 0.7734375, + "learning_rate": 8.066257824661955e-05, + "loss": 0.7299, + "step": 36873 + }, + { + "epoch": 0.9764090762468285, + "grad_norm": 0.78515625, + "learning_rate": 8.065806142167584e-05, + "loss": 0.7861, + "step": 36874 + }, + { + "epoch": 0.9764355558551229, + "grad_norm": 0.87890625, + "learning_rate": 8.065354463772629e-05, + "loss": 0.8365, + "step": 36875 + }, + { + "epoch": 0.9764620354634173, + "grad_norm": 0.7109375, + "learning_rate": 8.064902789478046e-05, + "loss": 0.8379, + "step": 36876 + }, + { + "epoch": 0.9764885150717116, + "grad_norm": 0.74609375, + "learning_rate": 8.06445111928479e-05, + "loss": 0.6954, + "step": 36877 + }, + { + "epoch": 0.976514994680006, + "grad_norm": 0.8046875, + "learning_rate": 8.063999453193819e-05, + "loss": 0.8656, + "step": 36878 + }, + { + "epoch": 0.9765414742883003, + "grad_norm": 1.546875, + "learning_rate": 8.063547791206088e-05, + "loss": 0.6871, + "step": 36879 + }, + { + "epoch": 0.9765679538965947, + "grad_norm": 0.796875, + "learning_rate": 8.063096133322561e-05, + "loss": 0.771, + "step": 36880 + }, + { + "epoch": 0.9765944335048891, + "grad_norm": 0.7421875, + "learning_rate": 8.062644479544189e-05, + "loss": 0.7901, + "step": 36881 + }, + { + "epoch": 0.9766209131131834, + "grad_norm": 0.7734375, + "learning_rate": 8.062192829871933e-05, + "loss": 0.8721, + "step": 36882 + }, + { + "epoch": 0.9766473927214778, + "grad_norm": 0.828125, + "learning_rate": 8.061741184306747e-05, + "loss": 0.9451, + "step": 36883 + }, + { + "epoch": 0.9766738723297722, + "grad_norm": 0.85546875, + "learning_rate": 8.061289542849586e-05, + "loss": 0.6739, + "step": 36884 + }, + { + "epoch": 0.9767003519380666, + "grad_norm": 0.76953125, + "learning_rate": 8.060837905501415e-05, + "loss": 0.7765, + "step": 36885 + }, + { + "epoch": 0.976726831546361, + "grad_norm": 0.68359375, + "learning_rate": 8.060386272263187e-05, + "loss": 0.8033, + "step": 36886 + }, + { + "epoch": 0.9767533111546554, + "grad_norm": 0.81640625, + "learning_rate": 8.059934643135861e-05, + "loss": 0.8605, + "step": 36887 + }, + { + "epoch": 0.9767797907629497, + "grad_norm": 0.80078125, + "learning_rate": 8.059483018120391e-05, + "loss": 0.7421, + "step": 36888 + }, + { + "epoch": 0.9768062703712441, + "grad_norm": 0.75, + "learning_rate": 8.059031397217732e-05, + "loss": 0.7326, + "step": 36889 + }, + { + "epoch": 0.9768327499795385, + "grad_norm": 0.8203125, + "learning_rate": 8.05857978042885e-05, + "loss": 0.7422, + "step": 36890 + }, + { + "epoch": 0.9768592295878329, + "grad_norm": 0.85546875, + "learning_rate": 8.058128167754698e-05, + "loss": 0.7699, + "step": 36891 + }, + { + "epoch": 0.9768857091961273, + "grad_norm": 0.80078125, + "learning_rate": 8.05767655919623e-05, + "loss": 0.6911, + "step": 36892 + }, + { + "epoch": 0.9769121888044217, + "grad_norm": 0.796875, + "learning_rate": 8.057224954754406e-05, + "loss": 0.8131, + "step": 36893 + }, + { + "epoch": 0.976938668412716, + "grad_norm": 0.79296875, + "learning_rate": 8.056773354430179e-05, + "loss": 0.7889, + "step": 36894 + }, + { + "epoch": 0.9769651480210103, + "grad_norm": 0.8125, + "learning_rate": 8.056321758224514e-05, + "loss": 0.748, + "step": 36895 + }, + { + "epoch": 0.9769916276293047, + "grad_norm": 0.80078125, + "learning_rate": 8.055870166138364e-05, + "loss": 0.7852, + "step": 36896 + }, + { + "epoch": 0.9770181072375991, + "grad_norm": 0.7890625, + "learning_rate": 8.055418578172686e-05, + "loss": 0.7674, + "step": 36897 + }, + { + "epoch": 0.9770445868458935, + "grad_norm": 0.77734375, + "learning_rate": 8.054966994328436e-05, + "loss": 0.7992, + "step": 36898 + }, + { + "epoch": 0.9770710664541878, + "grad_norm": 0.71875, + "learning_rate": 8.054515414606573e-05, + "loss": 0.6932, + "step": 36899 + }, + { + "epoch": 0.9770975460624822, + "grad_norm": 0.72265625, + "learning_rate": 8.054063839008054e-05, + "loss": 0.7768, + "step": 36900 + }, + { + "epoch": 0.9771240256707766, + "grad_norm": 0.76171875, + "learning_rate": 8.053612267533834e-05, + "loss": 0.8192, + "step": 36901 + }, + { + "epoch": 0.977150505279071, + "grad_norm": 1.140625, + "learning_rate": 8.053160700184872e-05, + "loss": 0.7947, + "step": 36902 + }, + { + "epoch": 0.9771769848873654, + "grad_norm": 0.8203125, + "learning_rate": 8.052709136962124e-05, + "loss": 0.748, + "step": 36903 + }, + { + "epoch": 0.9772034644956598, + "grad_norm": 0.7578125, + "learning_rate": 8.052257577866548e-05, + "loss": 0.8544, + "step": 36904 + }, + { + "epoch": 0.9772299441039541, + "grad_norm": 0.73828125, + "learning_rate": 8.051806022899102e-05, + "loss": 0.7392, + "step": 36905 + }, + { + "epoch": 0.9772564237122485, + "grad_norm": 0.796875, + "learning_rate": 8.051354472060741e-05, + "loss": 0.7198, + "step": 36906 + }, + { + "epoch": 0.9772829033205429, + "grad_norm": 0.76171875, + "learning_rate": 8.050902925352424e-05, + "loss": 0.6941, + "step": 36907 + }, + { + "epoch": 0.9773093829288373, + "grad_norm": 0.75, + "learning_rate": 8.050451382775106e-05, + "loss": 0.8632, + "step": 36908 + }, + { + "epoch": 0.9773358625371317, + "grad_norm": 0.703125, + "learning_rate": 8.04999984432974e-05, + "loss": 0.6695, + "step": 36909 + }, + { + "epoch": 0.977362342145426, + "grad_norm": 0.90625, + "learning_rate": 8.049548310017294e-05, + "loss": 0.8647, + "step": 36910 + }, + { + "epoch": 0.9773888217537204, + "grad_norm": 0.8203125, + "learning_rate": 8.049096779838719e-05, + "loss": 0.8748, + "step": 36911 + }, + { + "epoch": 0.9774153013620147, + "grad_norm": 0.9296875, + "learning_rate": 8.048645253794971e-05, + "loss": 0.7622, + "step": 36912 + }, + { + "epoch": 0.9774417809703091, + "grad_norm": 0.7578125, + "learning_rate": 8.048193731887007e-05, + "loss": 0.7174, + "step": 36913 + }, + { + "epoch": 0.9774682605786035, + "grad_norm": 0.8203125, + "learning_rate": 8.047742214115781e-05, + "loss": 0.7308, + "step": 36914 + }, + { + "epoch": 0.9774947401868979, + "grad_norm": 0.796875, + "learning_rate": 8.047290700482258e-05, + "loss": 0.9016, + "step": 36915 + }, + { + "epoch": 0.9775212197951922, + "grad_norm": 0.8203125, + "learning_rate": 8.046839190987391e-05, + "loss": 0.768, + "step": 36916 + }, + { + "epoch": 0.9775476994034866, + "grad_norm": 0.79296875, + "learning_rate": 8.046387685632138e-05, + "loss": 0.8045, + "step": 36917 + }, + { + "epoch": 0.977574179011781, + "grad_norm": 0.74609375, + "learning_rate": 8.045936184417453e-05, + "loss": 0.7563, + "step": 36918 + }, + { + "epoch": 0.9776006586200754, + "grad_norm": 0.84765625, + "learning_rate": 8.045484687344293e-05, + "loss": 0.8175, + "step": 36919 + }, + { + "epoch": 0.9776271382283698, + "grad_norm": 0.7578125, + "learning_rate": 8.045033194413618e-05, + "loss": 0.825, + "step": 36920 + }, + { + "epoch": 0.9776536178366642, + "grad_norm": 0.83203125, + "learning_rate": 8.044581705626381e-05, + "loss": 0.709, + "step": 36921 + }, + { + "epoch": 0.9776800974449585, + "grad_norm": 0.8359375, + "learning_rate": 8.044130220983544e-05, + "loss": 0.8472, + "step": 36922 + }, + { + "epoch": 0.9777065770532529, + "grad_norm": 0.8125, + "learning_rate": 8.043678740486062e-05, + "loss": 0.7979, + "step": 36923 + }, + { + "epoch": 0.9777330566615473, + "grad_norm": 0.8046875, + "learning_rate": 8.043227264134889e-05, + "loss": 0.8773, + "step": 36924 + }, + { + "epoch": 0.9777595362698417, + "grad_norm": 0.8046875, + "learning_rate": 8.042775791930984e-05, + "loss": 0.7806, + "step": 36925 + }, + { + "epoch": 0.9777860158781361, + "grad_norm": 0.76953125, + "learning_rate": 8.042324323875306e-05, + "loss": 0.8073, + "step": 36926 + }, + { + "epoch": 0.9778124954864305, + "grad_norm": 0.75390625, + "learning_rate": 8.041872859968808e-05, + "loss": 0.7325, + "step": 36927 + }, + { + "epoch": 0.9778389750947247, + "grad_norm": 0.7734375, + "learning_rate": 8.04142140021245e-05, + "loss": 0.709, + "step": 36928 + }, + { + "epoch": 0.9778654547030191, + "grad_norm": 0.78125, + "learning_rate": 8.040969944607183e-05, + "loss": 0.7776, + "step": 36929 + }, + { + "epoch": 0.9778919343113135, + "grad_norm": 0.8671875, + "learning_rate": 8.040518493153973e-05, + "loss": 0.8543, + "step": 36930 + }, + { + "epoch": 0.9779184139196079, + "grad_norm": 0.75, + "learning_rate": 8.04006704585377e-05, + "loss": 0.7364, + "step": 36931 + }, + { + "epoch": 0.9779448935279023, + "grad_norm": 0.77734375, + "learning_rate": 8.039615602707534e-05, + "loss": 0.7306, + "step": 36932 + }, + { + "epoch": 0.9779713731361966, + "grad_norm": 0.80078125, + "learning_rate": 8.039164163716221e-05, + "loss": 0.8009, + "step": 36933 + }, + { + "epoch": 0.977997852744491, + "grad_norm": 0.83203125, + "learning_rate": 8.038712728880783e-05, + "loss": 0.7607, + "step": 36934 + }, + { + "epoch": 0.9780243323527854, + "grad_norm": 0.73046875, + "learning_rate": 8.038261298202185e-05, + "loss": 0.686, + "step": 36935 + }, + { + "epoch": 0.9780508119610798, + "grad_norm": 0.78515625, + "learning_rate": 8.037809871681381e-05, + "loss": 0.8944, + "step": 36936 + }, + { + "epoch": 0.9780772915693742, + "grad_norm": 0.8359375, + "learning_rate": 8.037358449319326e-05, + "loss": 0.9358, + "step": 36937 + }, + { + "epoch": 0.9781037711776686, + "grad_norm": 0.76171875, + "learning_rate": 8.036907031116978e-05, + "loss": 0.698, + "step": 36938 + }, + { + "epoch": 0.9781302507859629, + "grad_norm": 0.74609375, + "learning_rate": 8.036455617075289e-05, + "loss": 0.8, + "step": 36939 + }, + { + "epoch": 0.9781567303942573, + "grad_norm": 0.828125, + "learning_rate": 8.036004207195224e-05, + "loss": 0.776, + "step": 36940 + }, + { + "epoch": 0.9781832100025517, + "grad_norm": 0.890625, + "learning_rate": 8.035552801477737e-05, + "loss": 0.9424, + "step": 36941 + }, + { + "epoch": 0.9782096896108461, + "grad_norm": 0.80078125, + "learning_rate": 8.035101399923782e-05, + "loss": 0.7523, + "step": 36942 + }, + { + "epoch": 0.9782361692191405, + "grad_norm": 0.765625, + "learning_rate": 8.034650002534319e-05, + "loss": 0.7411, + "step": 36943 + }, + { + "epoch": 0.9782626488274349, + "grad_norm": 0.82421875, + "learning_rate": 8.034198609310299e-05, + "loss": 0.8988, + "step": 36944 + }, + { + "epoch": 0.9782891284357291, + "grad_norm": 0.8203125, + "learning_rate": 8.033747220252686e-05, + "loss": 0.731, + "step": 36945 + }, + { + "epoch": 0.9783156080440235, + "grad_norm": 0.7890625, + "learning_rate": 8.033295835362433e-05, + "loss": 0.7669, + "step": 36946 + }, + { + "epoch": 0.9783420876523179, + "grad_norm": 0.81640625, + "learning_rate": 8.032844454640498e-05, + "loss": 0.7004, + "step": 36947 + }, + { + "epoch": 0.9783685672606123, + "grad_norm": 0.84375, + "learning_rate": 8.032393078087835e-05, + "loss": 0.8073, + "step": 36948 + }, + { + "epoch": 0.9783950468689067, + "grad_norm": 0.7265625, + "learning_rate": 8.031941705705399e-05, + "loss": 0.6866, + "step": 36949 + }, + { + "epoch": 0.978421526477201, + "grad_norm": 0.79296875, + "learning_rate": 8.031490337494155e-05, + "loss": 0.8612, + "step": 36950 + }, + { + "epoch": 0.9784480060854954, + "grad_norm": 0.75390625, + "learning_rate": 8.031038973455054e-05, + "loss": 0.8109, + "step": 36951 + }, + { + "epoch": 0.9784744856937898, + "grad_norm": 0.83984375, + "learning_rate": 8.030587613589053e-05, + "loss": 0.7485, + "step": 36952 + }, + { + "epoch": 0.9785009653020842, + "grad_norm": 0.78125, + "learning_rate": 8.03013625789711e-05, + "loss": 0.8507, + "step": 36953 + }, + { + "epoch": 0.9785274449103786, + "grad_norm": 0.73046875, + "learning_rate": 8.029684906380175e-05, + "loss": 0.7196, + "step": 36954 + }, + { + "epoch": 0.978553924518673, + "grad_norm": 0.7578125, + "learning_rate": 8.029233559039216e-05, + "loss": 0.782, + "step": 36955 + }, + { + "epoch": 0.9785804041269673, + "grad_norm": 0.78125, + "learning_rate": 8.028782215875183e-05, + "loss": 0.8256, + "step": 36956 + }, + { + "epoch": 0.9786068837352617, + "grad_norm": 0.82421875, + "learning_rate": 8.028330876889033e-05, + "loss": 0.7244, + "step": 36957 + }, + { + "epoch": 0.9786333633435561, + "grad_norm": 0.78125, + "learning_rate": 8.027879542081724e-05, + "loss": 0.8189, + "step": 36958 + }, + { + "epoch": 0.9786598429518505, + "grad_norm": 0.8359375, + "learning_rate": 8.027428211454207e-05, + "loss": 0.8099, + "step": 36959 + }, + { + "epoch": 0.9786863225601449, + "grad_norm": 0.78125, + "learning_rate": 8.026976885007447e-05, + "loss": 0.6897, + "step": 36960 + }, + { + "epoch": 0.9787128021684391, + "grad_norm": 0.7578125, + "learning_rate": 8.026525562742398e-05, + "loss": 0.7686, + "step": 36961 + }, + { + "epoch": 0.9787392817767335, + "grad_norm": 0.70703125, + "learning_rate": 8.026074244660014e-05, + "loss": 0.7055, + "step": 36962 + }, + { + "epoch": 0.9787657613850279, + "grad_norm": 0.7734375, + "learning_rate": 8.025622930761252e-05, + "loss": 0.8619, + "step": 36963 + }, + { + "epoch": 0.9787922409933223, + "grad_norm": 0.84375, + "learning_rate": 8.025171621047068e-05, + "loss": 0.7776, + "step": 36964 + }, + { + "epoch": 0.9788187206016167, + "grad_norm": 0.8046875, + "learning_rate": 8.024720315518422e-05, + "loss": 0.868, + "step": 36965 + }, + { + "epoch": 0.9788452002099111, + "grad_norm": 0.8046875, + "learning_rate": 8.024269014176269e-05, + "loss": 0.8919, + "step": 36966 + }, + { + "epoch": 0.9788716798182054, + "grad_norm": 0.86328125, + "learning_rate": 8.023817717021565e-05, + "loss": 0.8223, + "step": 36967 + }, + { + "epoch": 0.9788981594264998, + "grad_norm": 0.71875, + "learning_rate": 8.023366424055266e-05, + "loss": 0.8038, + "step": 36968 + }, + { + "epoch": 0.9789246390347942, + "grad_norm": 0.73828125, + "learning_rate": 8.022915135278325e-05, + "loss": 0.6862, + "step": 36969 + }, + { + "epoch": 0.9789511186430886, + "grad_norm": 0.828125, + "learning_rate": 8.022463850691705e-05, + "loss": 0.7944, + "step": 36970 + }, + { + "epoch": 0.978977598251383, + "grad_norm": 0.78125, + "learning_rate": 8.022012570296362e-05, + "loss": 0.7947, + "step": 36971 + }, + { + "epoch": 0.9790040778596774, + "grad_norm": 0.8203125, + "learning_rate": 8.02156129409325e-05, + "loss": 0.9012, + "step": 36972 + }, + { + "epoch": 0.9790305574679717, + "grad_norm": 0.76953125, + "learning_rate": 8.021110022083324e-05, + "loss": 0.7634, + "step": 36973 + }, + { + "epoch": 0.9790570370762661, + "grad_norm": 0.74609375, + "learning_rate": 8.02065875426754e-05, + "loss": 0.8781, + "step": 36974 + }, + { + "epoch": 0.9790835166845605, + "grad_norm": 0.7265625, + "learning_rate": 8.020207490646859e-05, + "loss": 0.6901, + "step": 36975 + }, + { + "epoch": 0.9791099962928549, + "grad_norm": 0.7421875, + "learning_rate": 8.019756231222236e-05, + "loss": 0.8666, + "step": 36976 + }, + { + "epoch": 0.9791364759011492, + "grad_norm": 0.83203125, + "learning_rate": 8.019304975994627e-05, + "loss": 0.7164, + "step": 36977 + }, + { + "epoch": 0.9791629555094435, + "grad_norm": 0.7578125, + "learning_rate": 8.018853724964988e-05, + "loss": 0.8115, + "step": 36978 + }, + { + "epoch": 0.9791894351177379, + "grad_norm": 0.80078125, + "learning_rate": 8.01840247813427e-05, + "loss": 0.7615, + "step": 36979 + }, + { + "epoch": 0.9792159147260323, + "grad_norm": 0.78515625, + "learning_rate": 8.01795123550344e-05, + "loss": 0.7826, + "step": 36980 + }, + { + "epoch": 0.9792423943343267, + "grad_norm": 0.74609375, + "learning_rate": 8.017499997073448e-05, + "loss": 0.7655, + "step": 36981 + }, + { + "epoch": 0.9792688739426211, + "grad_norm": 0.79296875, + "learning_rate": 8.017048762845252e-05, + "loss": 0.71, + "step": 36982 + }, + { + "epoch": 0.9792953535509155, + "grad_norm": 0.77734375, + "learning_rate": 8.016597532819808e-05, + "loss": 0.8344, + "step": 36983 + }, + { + "epoch": 0.9793218331592098, + "grad_norm": 0.74609375, + "learning_rate": 8.016146306998071e-05, + "loss": 0.6568, + "step": 36984 + }, + { + "epoch": 0.9793483127675042, + "grad_norm": 0.828125, + "learning_rate": 8.015695085380998e-05, + "loss": 0.8858, + "step": 36985 + }, + { + "epoch": 0.9793747923757986, + "grad_norm": 0.8203125, + "learning_rate": 8.015243867969546e-05, + "loss": 0.8526, + "step": 36986 + }, + { + "epoch": 0.979401271984093, + "grad_norm": 0.828125, + "learning_rate": 8.014792654764673e-05, + "loss": 0.885, + "step": 36987 + }, + { + "epoch": 0.9794277515923874, + "grad_norm": 0.7890625, + "learning_rate": 8.014341445767331e-05, + "loss": 0.7877, + "step": 36988 + }, + { + "epoch": 0.9794542312006818, + "grad_norm": 0.79296875, + "learning_rate": 8.013890240978481e-05, + "loss": 0.8476, + "step": 36989 + }, + { + "epoch": 0.9794807108089761, + "grad_norm": 0.7421875, + "learning_rate": 8.013439040399074e-05, + "loss": 0.7275, + "step": 36990 + }, + { + "epoch": 0.9795071904172705, + "grad_norm": 0.7421875, + "learning_rate": 8.012987844030072e-05, + "loss": 0.7574, + "step": 36991 + }, + { + "epoch": 0.9795336700255649, + "grad_norm": 0.82421875, + "learning_rate": 8.012536651872429e-05, + "loss": 0.7813, + "step": 36992 + }, + { + "epoch": 0.9795601496338593, + "grad_norm": 0.8046875, + "learning_rate": 8.012085463927101e-05, + "loss": 0.7644, + "step": 36993 + }, + { + "epoch": 0.9795866292421536, + "grad_norm": 0.734375, + "learning_rate": 8.011634280195044e-05, + "loss": 0.7076, + "step": 36994 + }, + { + "epoch": 0.979613108850448, + "grad_norm": 0.83984375, + "learning_rate": 8.011183100677211e-05, + "loss": 0.8135, + "step": 36995 + }, + { + "epoch": 0.9796395884587423, + "grad_norm": 0.71875, + "learning_rate": 8.010731925374564e-05, + "loss": 0.7606, + "step": 36996 + }, + { + "epoch": 0.9796660680670367, + "grad_norm": 0.76171875, + "learning_rate": 8.01028075428806e-05, + "loss": 0.8408, + "step": 36997 + }, + { + "epoch": 0.9796925476753311, + "grad_norm": 0.7265625, + "learning_rate": 8.00982958741865e-05, + "loss": 0.7485, + "step": 36998 + }, + { + "epoch": 0.9797190272836255, + "grad_norm": 0.76953125, + "learning_rate": 8.009378424767293e-05, + "loss": 0.8173, + "step": 36999 + }, + { + "epoch": 0.9797455068919199, + "grad_norm": 0.76171875, + "learning_rate": 8.00892726633494e-05, + "loss": 0.7883, + "step": 37000 + }, + { + "epoch": 0.9797455068919199, + "eval_loss": 0.7816874980926514, + "eval_runtime": 280.8839, + "eval_samples_per_second": 35.602, + "eval_steps_per_second": 0.744, + "step": 37000 + }, + { + "epoch": 0.9797719865002142, + "grad_norm": 0.69921875, + "learning_rate": 8.008476112122558e-05, + "loss": 0.7045, + "step": 37001 + }, + { + "epoch": 0.9797984661085086, + "grad_norm": 0.859375, + "learning_rate": 8.008024962131095e-05, + "loss": 0.819, + "step": 37002 + }, + { + "epoch": 0.979824945716803, + "grad_norm": 0.74609375, + "learning_rate": 8.00757381636151e-05, + "loss": 0.7293, + "step": 37003 + }, + { + "epoch": 0.9798514253250974, + "grad_norm": 0.8359375, + "learning_rate": 8.007122674814757e-05, + "loss": 0.8056, + "step": 37004 + }, + { + "epoch": 0.9798779049333918, + "grad_norm": 0.76171875, + "learning_rate": 8.006671537491793e-05, + "loss": 0.8215, + "step": 37005 + }, + { + "epoch": 0.9799043845416862, + "grad_norm": 0.80078125, + "learning_rate": 8.006220404393574e-05, + "loss": 0.8893, + "step": 37006 + }, + { + "epoch": 0.9799308641499805, + "grad_norm": 0.71875, + "learning_rate": 8.00576927552106e-05, + "loss": 0.7647, + "step": 37007 + }, + { + "epoch": 0.9799573437582749, + "grad_norm": 0.7734375, + "learning_rate": 8.005318150875203e-05, + "loss": 0.7585, + "step": 37008 + }, + { + "epoch": 0.9799838233665693, + "grad_norm": 0.796875, + "learning_rate": 8.004867030456961e-05, + "loss": 0.7356, + "step": 37009 + }, + { + "epoch": 0.9800103029748636, + "grad_norm": 0.69921875, + "learning_rate": 8.004415914267287e-05, + "loss": 0.6443, + "step": 37010 + }, + { + "epoch": 0.980036782583158, + "grad_norm": 0.8359375, + "learning_rate": 8.003964802307141e-05, + "loss": 0.8185, + "step": 37011 + }, + { + "epoch": 0.9800632621914523, + "grad_norm": 0.8046875, + "learning_rate": 8.003513694577478e-05, + "loss": 0.8209, + "step": 37012 + }, + { + "epoch": 0.9800897417997467, + "grad_norm": 0.8046875, + "learning_rate": 8.003062591079253e-05, + "loss": 0.8585, + "step": 37013 + }, + { + "epoch": 0.9801162214080411, + "grad_norm": 0.796875, + "learning_rate": 8.002611491813424e-05, + "loss": 0.7129, + "step": 37014 + }, + { + "epoch": 0.9801427010163355, + "grad_norm": 0.78125, + "learning_rate": 8.00216039678094e-05, + "loss": 0.7461, + "step": 37015 + }, + { + "epoch": 0.9801691806246299, + "grad_norm": 0.7421875, + "learning_rate": 8.001709305982767e-05, + "loss": 0.6702, + "step": 37016 + }, + { + "epoch": 0.9801956602329243, + "grad_norm": 0.7109375, + "learning_rate": 8.001258219419858e-05, + "loss": 0.7889, + "step": 37017 + }, + { + "epoch": 0.9802221398412186, + "grad_norm": 0.71875, + "learning_rate": 8.000807137093168e-05, + "loss": 0.8085, + "step": 37018 + }, + { + "epoch": 0.980248619449513, + "grad_norm": 0.77734375, + "learning_rate": 8.000356059003654e-05, + "loss": 0.6943, + "step": 37019 + }, + { + "epoch": 0.9802750990578074, + "grad_norm": 0.8515625, + "learning_rate": 7.999904985152264e-05, + "loss": 0.8448, + "step": 37020 + }, + { + "epoch": 0.9803015786661018, + "grad_norm": 0.83984375, + "learning_rate": 7.999453915539967e-05, + "loss": 0.7905, + "step": 37021 + }, + { + "epoch": 0.9803280582743962, + "grad_norm": 0.82421875, + "learning_rate": 7.999002850167713e-05, + "loss": 0.8636, + "step": 37022 + }, + { + "epoch": 0.9803545378826906, + "grad_norm": 0.76953125, + "learning_rate": 7.998551789036459e-05, + "loss": 0.7672, + "step": 37023 + }, + { + "epoch": 0.9803810174909849, + "grad_norm": 0.8515625, + "learning_rate": 7.998100732147157e-05, + "loss": 0.8006, + "step": 37024 + }, + { + "epoch": 0.9804074970992793, + "grad_norm": 0.78515625, + "learning_rate": 7.997649679500765e-05, + "loss": 0.7395, + "step": 37025 + }, + { + "epoch": 0.9804339767075736, + "grad_norm": 0.69140625, + "learning_rate": 7.997198631098243e-05, + "loss": 0.5975, + "step": 37026 + }, + { + "epoch": 0.980460456315868, + "grad_norm": 0.796875, + "learning_rate": 7.996747586940543e-05, + "loss": 0.8392, + "step": 37027 + }, + { + "epoch": 0.9804869359241624, + "grad_norm": 0.734375, + "learning_rate": 7.996296547028624e-05, + "loss": 0.7497, + "step": 37028 + }, + { + "epoch": 0.9805134155324567, + "grad_norm": 0.7890625, + "learning_rate": 7.995845511363438e-05, + "loss": 0.7992, + "step": 37029 + }, + { + "epoch": 0.9805398951407511, + "grad_norm": 0.765625, + "learning_rate": 7.995394479945944e-05, + "loss": 0.7501, + "step": 37030 + }, + { + "epoch": 0.9805663747490455, + "grad_norm": 0.73046875, + "learning_rate": 7.994943452777096e-05, + "loss": 0.8105, + "step": 37031 + }, + { + "epoch": 0.9805928543573399, + "grad_norm": 0.7578125, + "learning_rate": 7.994492429857852e-05, + "loss": 0.7298, + "step": 37032 + }, + { + "epoch": 0.9806193339656343, + "grad_norm": 0.73828125, + "learning_rate": 7.994041411189167e-05, + "loss": 0.7013, + "step": 37033 + }, + { + "epoch": 0.9806458135739287, + "grad_norm": 0.8203125, + "learning_rate": 7.993590396771995e-05, + "loss": 0.7937, + "step": 37034 + }, + { + "epoch": 0.980672293182223, + "grad_norm": 0.796875, + "learning_rate": 7.993139386607289e-05, + "loss": 0.7432, + "step": 37035 + }, + { + "epoch": 0.9806987727905174, + "grad_norm": 0.74609375, + "learning_rate": 7.992688380696015e-05, + "loss": 0.7344, + "step": 37036 + }, + { + "epoch": 0.9807252523988118, + "grad_norm": 0.91796875, + "learning_rate": 7.992237379039123e-05, + "loss": 0.8844, + "step": 37037 + }, + { + "epoch": 0.9807517320071062, + "grad_norm": 0.75, + "learning_rate": 7.991786381637569e-05, + "loss": 0.7443, + "step": 37038 + }, + { + "epoch": 0.9807782116154006, + "grad_norm": 0.7578125, + "learning_rate": 7.99133538849231e-05, + "loss": 0.8208, + "step": 37039 + }, + { + "epoch": 0.980804691223695, + "grad_norm": 0.7734375, + "learning_rate": 7.990884399604294e-05, + "loss": 0.7828, + "step": 37040 + }, + { + "epoch": 0.9808311708319893, + "grad_norm": 0.7734375, + "learning_rate": 7.99043341497449e-05, + "loss": 0.731, + "step": 37041 + }, + { + "epoch": 0.9808576504402837, + "grad_norm": 0.83203125, + "learning_rate": 7.989982434603847e-05, + "loss": 0.9046, + "step": 37042 + }, + { + "epoch": 0.980884130048578, + "grad_norm": 0.72265625, + "learning_rate": 7.989531458493321e-05, + "loss": 0.6739, + "step": 37043 + }, + { + "epoch": 0.9809106096568724, + "grad_norm": 0.83984375, + "learning_rate": 7.98908048664387e-05, + "loss": 0.7912, + "step": 37044 + }, + { + "epoch": 0.9809370892651668, + "grad_norm": 0.72265625, + "learning_rate": 7.98862951905644e-05, + "loss": 0.7986, + "step": 37045 + }, + { + "epoch": 0.9809635688734611, + "grad_norm": 0.828125, + "learning_rate": 7.988178555732002e-05, + "loss": 0.8014, + "step": 37046 + }, + { + "epoch": 0.9809900484817555, + "grad_norm": 0.80078125, + "learning_rate": 7.987727596671504e-05, + "loss": 0.8434, + "step": 37047 + }, + { + "epoch": 0.9810165280900499, + "grad_norm": 0.7109375, + "learning_rate": 7.987276641875902e-05, + "loss": 0.7688, + "step": 37048 + }, + { + "epoch": 0.9810430076983443, + "grad_norm": 0.81640625, + "learning_rate": 7.986825691346152e-05, + "loss": 0.8244, + "step": 37049 + }, + { + "epoch": 0.9810694873066387, + "grad_norm": 0.79296875, + "learning_rate": 7.986374745083208e-05, + "loss": 0.8849, + "step": 37050 + }, + { + "epoch": 0.9810959669149331, + "grad_norm": 0.7890625, + "learning_rate": 7.985923803088031e-05, + "loss": 0.7763, + "step": 37051 + }, + { + "epoch": 0.9811224465232274, + "grad_norm": 0.86328125, + "learning_rate": 7.985472865361571e-05, + "loss": 0.8073, + "step": 37052 + }, + { + "epoch": 0.9811489261315218, + "grad_norm": 0.7734375, + "learning_rate": 7.985021931904788e-05, + "loss": 0.7835, + "step": 37053 + }, + { + "epoch": 0.9811754057398162, + "grad_norm": 0.7890625, + "learning_rate": 7.984571002718634e-05, + "loss": 0.7169, + "step": 37054 + }, + { + "epoch": 0.9812018853481106, + "grad_norm": 0.875, + "learning_rate": 7.984120077804065e-05, + "loss": 0.9122, + "step": 37055 + }, + { + "epoch": 0.981228364956405, + "grad_norm": 0.734375, + "learning_rate": 7.98366915716204e-05, + "loss": 0.7668, + "step": 37056 + }, + { + "epoch": 0.9812548445646994, + "grad_norm": 0.76953125, + "learning_rate": 7.983218240793514e-05, + "loss": 0.7302, + "step": 37057 + }, + { + "epoch": 0.9812813241729937, + "grad_norm": 0.7265625, + "learning_rate": 7.982767328699441e-05, + "loss": 0.7291, + "step": 37058 + }, + { + "epoch": 0.981307803781288, + "grad_norm": 0.73828125, + "learning_rate": 7.982316420880779e-05, + "loss": 0.6469, + "step": 37059 + }, + { + "epoch": 0.9813342833895824, + "grad_norm": 0.80859375, + "learning_rate": 7.981865517338476e-05, + "loss": 0.7948, + "step": 37060 + }, + { + "epoch": 0.9813607629978768, + "grad_norm": 0.7734375, + "learning_rate": 7.981414618073499e-05, + "loss": 0.806, + "step": 37061 + }, + { + "epoch": 0.9813872426061712, + "grad_norm": 0.8515625, + "learning_rate": 7.980963723086799e-05, + "loss": 0.8978, + "step": 37062 + }, + { + "epoch": 0.9814137222144655, + "grad_norm": 0.78125, + "learning_rate": 7.980512832379328e-05, + "loss": 0.8657, + "step": 37063 + }, + { + "epoch": 0.9814402018227599, + "grad_norm": 0.81640625, + "learning_rate": 7.980061945952048e-05, + "loss": 0.7889, + "step": 37064 + }, + { + "epoch": 0.9814666814310543, + "grad_norm": 0.75390625, + "learning_rate": 7.979611063805905e-05, + "loss": 0.8137, + "step": 37065 + }, + { + "epoch": 0.9814931610393487, + "grad_norm": 0.83203125, + "learning_rate": 7.979160185941866e-05, + "loss": 0.8226, + "step": 37066 + }, + { + "epoch": 0.9815196406476431, + "grad_norm": 0.72265625, + "learning_rate": 7.978709312360881e-05, + "loss": 0.7399, + "step": 37067 + }, + { + "epoch": 0.9815461202559375, + "grad_norm": 0.828125, + "learning_rate": 7.978258443063906e-05, + "loss": 0.8756, + "step": 37068 + }, + { + "epoch": 0.9815725998642318, + "grad_norm": 0.83984375, + "learning_rate": 7.977807578051897e-05, + "loss": 0.7709, + "step": 37069 + }, + { + "epoch": 0.9815990794725262, + "grad_norm": 0.78125, + "learning_rate": 7.977356717325808e-05, + "loss": 0.7235, + "step": 37070 + }, + { + "epoch": 0.9816255590808206, + "grad_norm": 0.9296875, + "learning_rate": 7.976905860886597e-05, + "loss": 0.8517, + "step": 37071 + }, + { + "epoch": 0.981652038689115, + "grad_norm": 0.81640625, + "learning_rate": 7.976455008735218e-05, + "loss": 0.8119, + "step": 37072 + }, + { + "epoch": 0.9816785182974094, + "grad_norm": 0.77734375, + "learning_rate": 7.976004160872625e-05, + "loss": 0.7297, + "step": 37073 + }, + { + "epoch": 0.9817049979057038, + "grad_norm": 0.81640625, + "learning_rate": 7.975553317299777e-05, + "loss": 0.7166, + "step": 37074 + }, + { + "epoch": 0.981731477513998, + "grad_norm": 0.8359375, + "learning_rate": 7.975102478017627e-05, + "loss": 0.8268, + "step": 37075 + }, + { + "epoch": 0.9817579571222924, + "grad_norm": 0.75, + "learning_rate": 7.974651643027134e-05, + "loss": 0.7433, + "step": 37076 + }, + { + "epoch": 0.9817844367305868, + "grad_norm": 0.8125, + "learning_rate": 7.974200812329251e-05, + "loss": 0.7099, + "step": 37077 + }, + { + "epoch": 0.9818109163388812, + "grad_norm": 0.796875, + "learning_rate": 7.973749985924933e-05, + "loss": 0.7037, + "step": 37078 + }, + { + "epoch": 0.9818373959471756, + "grad_norm": 0.76171875, + "learning_rate": 7.973299163815137e-05, + "loss": 0.7034, + "step": 37079 + }, + { + "epoch": 0.98186387555547, + "grad_norm": 0.78515625, + "learning_rate": 7.972848346000812e-05, + "loss": 0.8052, + "step": 37080 + }, + { + "epoch": 0.9818903551637643, + "grad_norm": 0.79296875, + "learning_rate": 7.972397532482924e-05, + "loss": 0.6763, + "step": 37081 + }, + { + "epoch": 0.9819168347720587, + "grad_norm": 0.76953125, + "learning_rate": 7.971946723262424e-05, + "loss": 0.6533, + "step": 37082 + }, + { + "epoch": 0.9819433143803531, + "grad_norm": 0.76953125, + "learning_rate": 7.971495918340266e-05, + "loss": 0.7355, + "step": 37083 + }, + { + "epoch": 0.9819697939886475, + "grad_norm": 0.8515625, + "learning_rate": 7.971045117717407e-05, + "loss": 0.8559, + "step": 37084 + }, + { + "epoch": 0.9819962735969419, + "grad_norm": 0.75, + "learning_rate": 7.970594321394797e-05, + "loss": 0.6582, + "step": 37085 + }, + { + "epoch": 0.9820227532052362, + "grad_norm": 0.83203125, + "learning_rate": 7.970143529373402e-05, + "loss": 0.7472, + "step": 37086 + }, + { + "epoch": 0.9820492328135306, + "grad_norm": 0.765625, + "learning_rate": 7.96969274165417e-05, + "loss": 0.7868, + "step": 37087 + }, + { + "epoch": 0.982075712421825, + "grad_norm": 0.828125, + "learning_rate": 7.96924195823806e-05, + "loss": 0.7918, + "step": 37088 + }, + { + "epoch": 0.9821021920301194, + "grad_norm": 0.8359375, + "learning_rate": 7.968791179126024e-05, + "loss": 0.8814, + "step": 37089 + }, + { + "epoch": 0.9821286716384138, + "grad_norm": 0.76953125, + "learning_rate": 7.968340404319017e-05, + "loss": 0.6651, + "step": 37090 + }, + { + "epoch": 0.9821551512467082, + "grad_norm": 0.89453125, + "learning_rate": 7.967889633818e-05, + "loss": 0.8336, + "step": 37091 + }, + { + "epoch": 0.9821816308550024, + "grad_norm": 0.79296875, + "learning_rate": 7.967438867623921e-05, + "loss": 0.7511, + "step": 37092 + }, + { + "epoch": 0.9822081104632968, + "grad_norm": 0.734375, + "learning_rate": 7.966988105737743e-05, + "loss": 0.5527, + "step": 37093 + }, + { + "epoch": 0.9822345900715912, + "grad_norm": 0.8203125, + "learning_rate": 7.966537348160415e-05, + "loss": 0.7182, + "step": 37094 + }, + { + "epoch": 0.9822610696798856, + "grad_norm": 0.70703125, + "learning_rate": 7.966086594892894e-05, + "loss": 0.7128, + "step": 37095 + }, + { + "epoch": 0.98228754928818, + "grad_norm": 0.8125, + "learning_rate": 7.96563584593614e-05, + "loss": 0.8658, + "step": 37096 + }, + { + "epoch": 0.9823140288964743, + "grad_norm": 0.79296875, + "learning_rate": 7.965185101291103e-05, + "loss": 0.6338, + "step": 37097 + }, + { + "epoch": 0.9823405085047687, + "grad_norm": 0.7734375, + "learning_rate": 7.964734360958739e-05, + "loss": 0.7789, + "step": 37098 + }, + { + "epoch": 0.9823669881130631, + "grad_norm": 0.80078125, + "learning_rate": 7.964283624940006e-05, + "loss": 0.8225, + "step": 37099 + }, + { + "epoch": 0.9823934677213575, + "grad_norm": 0.8125, + "learning_rate": 7.963832893235852e-05, + "loss": 0.6875, + "step": 37100 + }, + { + "epoch": 0.9824199473296519, + "grad_norm": 0.77734375, + "learning_rate": 7.963382165847242e-05, + "loss": 0.7181, + "step": 37101 + }, + { + "epoch": 0.9824464269379463, + "grad_norm": 0.73828125, + "learning_rate": 7.962931442775128e-05, + "loss": 0.6923, + "step": 37102 + }, + { + "epoch": 0.9824729065462406, + "grad_norm": 0.73046875, + "learning_rate": 7.962480724020465e-05, + "loss": 0.7814, + "step": 37103 + }, + { + "epoch": 0.982499386154535, + "grad_norm": 0.8515625, + "learning_rate": 7.962030009584204e-05, + "loss": 0.8263, + "step": 37104 + }, + { + "epoch": 0.9825258657628294, + "grad_norm": 0.82421875, + "learning_rate": 7.961579299467304e-05, + "loss": 0.8102, + "step": 37105 + }, + { + "epoch": 0.9825523453711238, + "grad_norm": 0.7734375, + "learning_rate": 7.961128593670722e-05, + "loss": 0.776, + "step": 37106 + }, + { + "epoch": 0.9825788249794182, + "grad_norm": 0.70703125, + "learning_rate": 7.960677892195413e-05, + "loss": 0.6339, + "step": 37107 + }, + { + "epoch": 0.9826053045877124, + "grad_norm": 0.7578125, + "learning_rate": 7.96022719504233e-05, + "loss": 0.7345, + "step": 37108 + }, + { + "epoch": 0.9826317841960068, + "grad_norm": 0.84375, + "learning_rate": 7.959776502212428e-05, + "loss": 0.8788, + "step": 37109 + }, + { + "epoch": 0.9826582638043012, + "grad_norm": 0.796875, + "learning_rate": 7.95932581370666e-05, + "loss": 0.8022, + "step": 37110 + }, + { + "epoch": 0.9826847434125956, + "grad_norm": 2.34375, + "learning_rate": 7.958875129525986e-05, + "loss": 0.6148, + "step": 37111 + }, + { + "epoch": 0.98271122302089, + "grad_norm": 0.81640625, + "learning_rate": 7.958424449671362e-05, + "loss": 0.7441, + "step": 37112 + }, + { + "epoch": 0.9827377026291844, + "grad_norm": 0.7890625, + "learning_rate": 7.957973774143742e-05, + "loss": 0.715, + "step": 37113 + }, + { + "epoch": 0.9827641822374787, + "grad_norm": 0.796875, + "learning_rate": 7.957523102944077e-05, + "loss": 0.9448, + "step": 37114 + }, + { + "epoch": 0.9827906618457731, + "grad_norm": 0.84375, + "learning_rate": 7.957072436073325e-05, + "loss": 0.823, + "step": 37115 + }, + { + "epoch": 0.9828171414540675, + "grad_norm": 0.76953125, + "learning_rate": 7.956621773532442e-05, + "loss": 0.7406, + "step": 37116 + }, + { + "epoch": 0.9828436210623619, + "grad_norm": 0.69921875, + "learning_rate": 7.956171115322381e-05, + "loss": 0.7285, + "step": 37117 + }, + { + "epoch": 0.9828701006706563, + "grad_norm": 0.859375, + "learning_rate": 7.9557204614441e-05, + "loss": 0.9091, + "step": 37118 + }, + { + "epoch": 0.9828965802789507, + "grad_norm": 0.77734375, + "learning_rate": 7.955269811898554e-05, + "loss": 0.8597, + "step": 37119 + }, + { + "epoch": 0.982923059887245, + "grad_norm": 0.77734375, + "learning_rate": 7.954819166686691e-05, + "loss": 0.8308, + "step": 37120 + }, + { + "epoch": 0.9829495394955394, + "grad_norm": 0.6953125, + "learning_rate": 7.954368525809477e-05, + "loss": 0.8112, + "step": 37121 + }, + { + "epoch": 0.9829760191038338, + "grad_norm": 0.81640625, + "learning_rate": 7.953917889267861e-05, + "loss": 0.9193, + "step": 37122 + }, + { + "epoch": 0.9830024987121282, + "grad_norm": 0.765625, + "learning_rate": 7.953467257062799e-05, + "loss": 0.7558, + "step": 37123 + }, + { + "epoch": 0.9830289783204225, + "grad_norm": 0.79296875, + "learning_rate": 7.953016629195246e-05, + "loss": 0.6789, + "step": 37124 + }, + { + "epoch": 0.9830554579287168, + "grad_norm": 0.72265625, + "learning_rate": 7.952566005666159e-05, + "loss": 0.7292, + "step": 37125 + }, + { + "epoch": 0.9830819375370112, + "grad_norm": 0.75, + "learning_rate": 7.952115386476486e-05, + "loss": 0.6784, + "step": 37126 + }, + { + "epoch": 0.9831084171453056, + "grad_norm": 0.796875, + "learning_rate": 7.951664771627192e-05, + "loss": 0.7426, + "step": 37127 + }, + { + "epoch": 0.9831348967536, + "grad_norm": 0.73046875, + "learning_rate": 7.951214161119228e-05, + "loss": 0.6629, + "step": 37128 + }, + { + "epoch": 0.9831613763618944, + "grad_norm": 0.828125, + "learning_rate": 7.950763554953547e-05, + "loss": 0.733, + "step": 37129 + }, + { + "epoch": 0.9831878559701888, + "grad_norm": 0.78125, + "learning_rate": 7.950312953131105e-05, + "loss": 0.6583, + "step": 37130 + }, + { + "epoch": 0.9832143355784831, + "grad_norm": 0.80078125, + "learning_rate": 7.949862355652855e-05, + "loss": 0.8077, + "step": 37131 + }, + { + "epoch": 0.9832408151867775, + "grad_norm": 0.7734375, + "learning_rate": 7.949411762519757e-05, + "loss": 0.8642, + "step": 37132 + }, + { + "epoch": 0.9832672947950719, + "grad_norm": 0.84765625, + "learning_rate": 7.948961173732765e-05, + "loss": 0.7279, + "step": 37133 + }, + { + "epoch": 0.9832937744033663, + "grad_norm": 0.8046875, + "learning_rate": 7.948510589292833e-05, + "loss": 0.8468, + "step": 37134 + }, + { + "epoch": 0.9833202540116607, + "grad_norm": 0.765625, + "learning_rate": 7.948060009200916e-05, + "loss": 0.8768, + "step": 37135 + }, + { + "epoch": 0.9833467336199551, + "grad_norm": 0.76171875, + "learning_rate": 7.947609433457963e-05, + "loss": 0.7573, + "step": 37136 + }, + { + "epoch": 0.9833732132282494, + "grad_norm": 0.74609375, + "learning_rate": 7.94715886206494e-05, + "loss": 0.6833, + "step": 37137 + }, + { + "epoch": 0.9833996928365438, + "grad_norm": 0.765625, + "learning_rate": 7.946708295022795e-05, + "loss": 0.8403, + "step": 37138 + }, + { + "epoch": 0.9834261724448382, + "grad_norm": 0.86328125, + "learning_rate": 7.946257732332485e-05, + "loss": 0.7931, + "step": 37139 + }, + { + "epoch": 0.9834526520531326, + "grad_norm": 0.8359375, + "learning_rate": 7.945807173994962e-05, + "loss": 0.7579, + "step": 37140 + }, + { + "epoch": 0.9834791316614269, + "grad_norm": 0.76953125, + "learning_rate": 7.945356620011183e-05, + "loss": 0.8058, + "step": 37141 + }, + { + "epoch": 0.9835056112697212, + "grad_norm": 0.80859375, + "learning_rate": 7.944906070382106e-05, + "loss": 0.8084, + "step": 37142 + }, + { + "epoch": 0.9835320908780156, + "grad_norm": 0.8203125, + "learning_rate": 7.944455525108682e-05, + "loss": 0.7017, + "step": 37143 + }, + { + "epoch": 0.98355857048631, + "grad_norm": 0.85546875, + "learning_rate": 7.944004984191868e-05, + "loss": 0.8081, + "step": 37144 + }, + { + "epoch": 0.9835850500946044, + "grad_norm": 0.85546875, + "learning_rate": 7.943554447632617e-05, + "loss": 0.8434, + "step": 37145 + }, + { + "epoch": 0.9836115297028988, + "grad_norm": 0.80078125, + "learning_rate": 7.943103915431882e-05, + "loss": 0.6887, + "step": 37146 + }, + { + "epoch": 0.9836380093111932, + "grad_norm": 0.76953125, + "learning_rate": 7.942653387590623e-05, + "loss": 0.8784, + "step": 37147 + }, + { + "epoch": 0.9836644889194875, + "grad_norm": 0.73828125, + "learning_rate": 7.942202864109794e-05, + "loss": 0.7909, + "step": 37148 + }, + { + "epoch": 0.9836909685277819, + "grad_norm": 0.77734375, + "learning_rate": 7.941752344990349e-05, + "loss": 0.7716, + "step": 37149 + }, + { + "epoch": 0.9837174481360763, + "grad_norm": 0.75, + "learning_rate": 7.941301830233241e-05, + "loss": 0.7263, + "step": 37150 + }, + { + "epoch": 0.9837439277443707, + "grad_norm": 0.7890625, + "learning_rate": 7.940851319839422e-05, + "loss": 0.7106, + "step": 37151 + }, + { + "epoch": 0.9837704073526651, + "grad_norm": 0.921875, + "learning_rate": 7.940400813809855e-05, + "loss": 0.7081, + "step": 37152 + }, + { + "epoch": 0.9837968869609595, + "grad_norm": 0.98046875, + "learning_rate": 7.939950312145492e-05, + "loss": 0.8301, + "step": 37153 + }, + { + "epoch": 0.9838233665692538, + "grad_norm": 0.67578125, + "learning_rate": 7.939499814847286e-05, + "loss": 0.686, + "step": 37154 + }, + { + "epoch": 0.9838498461775482, + "grad_norm": 0.7890625, + "learning_rate": 7.939049321916191e-05, + "loss": 0.7766, + "step": 37155 + }, + { + "epoch": 0.9838763257858426, + "grad_norm": 0.9140625, + "learning_rate": 7.938598833353162e-05, + "loss": 0.893, + "step": 37156 + }, + { + "epoch": 0.9839028053941369, + "grad_norm": 0.796875, + "learning_rate": 7.938148349159158e-05, + "loss": 0.7905, + "step": 37157 + }, + { + "epoch": 0.9839292850024313, + "grad_norm": 0.75, + "learning_rate": 7.937697869335129e-05, + "loss": 0.6793, + "step": 37158 + }, + { + "epoch": 0.9839557646107256, + "grad_norm": 0.84375, + "learning_rate": 7.93724739388203e-05, + "loss": 0.7965, + "step": 37159 + }, + { + "epoch": 0.98398224421902, + "grad_norm": 0.80078125, + "learning_rate": 7.93679692280082e-05, + "loss": 0.7921, + "step": 37160 + }, + { + "epoch": 0.9840087238273144, + "grad_norm": 0.78515625, + "learning_rate": 7.936346456092448e-05, + "loss": 0.8276, + "step": 37161 + }, + { + "epoch": 0.9840352034356088, + "grad_norm": 0.78515625, + "learning_rate": 7.935895993757875e-05, + "loss": 0.7366, + "step": 37162 + }, + { + "epoch": 0.9840616830439032, + "grad_norm": 0.77734375, + "learning_rate": 7.935445535798052e-05, + "loss": 0.7722, + "step": 37163 + }, + { + "epoch": 0.9840881626521976, + "grad_norm": 1.1875, + "learning_rate": 7.934995082213934e-05, + "loss": 0.6926, + "step": 37164 + }, + { + "epoch": 0.984114642260492, + "grad_norm": 0.70703125, + "learning_rate": 7.934544633006475e-05, + "loss": 0.632, + "step": 37165 + }, + { + "epoch": 0.9841411218687863, + "grad_norm": 0.81640625, + "learning_rate": 7.934094188176629e-05, + "loss": 0.7598, + "step": 37166 + }, + { + "epoch": 0.9841676014770807, + "grad_norm": 0.8359375, + "learning_rate": 7.933643747725355e-05, + "loss": 0.8248, + "step": 37167 + }, + { + "epoch": 0.9841940810853751, + "grad_norm": 0.73828125, + "learning_rate": 7.933193311653604e-05, + "loss": 0.7043, + "step": 37168 + }, + { + "epoch": 0.9842205606936695, + "grad_norm": 0.8515625, + "learning_rate": 7.932742879962331e-05, + "loss": 0.7874, + "step": 37169 + }, + { + "epoch": 0.9842470403019639, + "grad_norm": 0.79296875, + "learning_rate": 7.932292452652494e-05, + "loss": 0.8504, + "step": 37170 + }, + { + "epoch": 0.9842735199102582, + "grad_norm": 0.984375, + "learning_rate": 7.93184202972504e-05, + "loss": 0.9006, + "step": 37171 + }, + { + "epoch": 0.9842999995185526, + "grad_norm": 0.82421875, + "learning_rate": 7.931391611180931e-05, + "loss": 0.7129, + "step": 37172 + }, + { + "epoch": 0.9843264791268469, + "grad_norm": 0.8203125, + "learning_rate": 7.930941197021121e-05, + "loss": 0.6754, + "step": 37173 + }, + { + "epoch": 0.9843529587351413, + "grad_norm": 0.875, + "learning_rate": 7.930490787246563e-05, + "loss": 0.8153, + "step": 37174 + }, + { + "epoch": 0.9843794383434357, + "grad_norm": 0.7265625, + "learning_rate": 7.93004038185821e-05, + "loss": 0.8134, + "step": 37175 + }, + { + "epoch": 0.98440591795173, + "grad_norm": 0.828125, + "learning_rate": 7.929589980857018e-05, + "loss": 0.6777, + "step": 37176 + }, + { + "epoch": 0.9844323975600244, + "grad_norm": 0.76953125, + "learning_rate": 7.92913958424394e-05, + "loss": 0.8001, + "step": 37177 + }, + { + "epoch": 0.9844588771683188, + "grad_norm": 0.79296875, + "learning_rate": 7.928689192019935e-05, + "loss": 0.8523, + "step": 37178 + }, + { + "epoch": 0.9844853567766132, + "grad_norm": 0.75390625, + "learning_rate": 7.928238804185954e-05, + "loss": 0.7843, + "step": 37179 + }, + { + "epoch": 0.9845118363849076, + "grad_norm": 0.8046875, + "learning_rate": 7.927788420742954e-05, + "loss": 0.9015, + "step": 37180 + }, + { + "epoch": 0.984538315993202, + "grad_norm": 0.7578125, + "learning_rate": 7.927338041691886e-05, + "loss": 0.7911, + "step": 37181 + }, + { + "epoch": 0.9845647956014963, + "grad_norm": 0.7109375, + "learning_rate": 7.926887667033706e-05, + "loss": 0.7286, + "step": 37182 + }, + { + "epoch": 0.9845912752097907, + "grad_norm": 0.765625, + "learning_rate": 7.926437296769371e-05, + "loss": 0.7395, + "step": 37183 + }, + { + "epoch": 0.9846177548180851, + "grad_norm": 0.70703125, + "learning_rate": 7.925986930899834e-05, + "loss": 0.771, + "step": 37184 + }, + { + "epoch": 0.9846442344263795, + "grad_norm": 0.91015625, + "learning_rate": 7.925536569426049e-05, + "loss": 0.731, + "step": 37185 + }, + { + "epoch": 0.9846707140346739, + "grad_norm": 0.78515625, + "learning_rate": 7.925086212348967e-05, + "loss": 0.8381, + "step": 37186 + }, + { + "epoch": 0.9846971936429683, + "grad_norm": 0.87109375, + "learning_rate": 7.924635859669549e-05, + "loss": 0.7635, + "step": 37187 + }, + { + "epoch": 0.9847236732512626, + "grad_norm": 0.8046875, + "learning_rate": 7.924185511388746e-05, + "loss": 0.8257, + "step": 37188 + }, + { + "epoch": 0.984750152859557, + "grad_norm": 0.73828125, + "learning_rate": 7.923735167507516e-05, + "loss": 0.7513, + "step": 37189 + }, + { + "epoch": 0.9847766324678513, + "grad_norm": 0.765625, + "learning_rate": 7.92328482802681e-05, + "loss": 0.8313, + "step": 37190 + }, + { + "epoch": 0.9848031120761457, + "grad_norm": 0.84375, + "learning_rate": 7.922834492947576e-05, + "loss": 0.9197, + "step": 37191 + }, + { + "epoch": 0.9848295916844401, + "grad_norm": 0.79296875, + "learning_rate": 7.922384162270782e-05, + "loss": 0.7742, + "step": 37192 + }, + { + "epoch": 0.9848560712927344, + "grad_norm": 0.77734375, + "learning_rate": 7.921933835997376e-05, + "loss": 0.8859, + "step": 37193 + }, + { + "epoch": 0.9848825509010288, + "grad_norm": 0.74609375, + "learning_rate": 7.921483514128312e-05, + "loss": 0.7324, + "step": 37194 + }, + { + "epoch": 0.9849090305093232, + "grad_norm": 0.74609375, + "learning_rate": 7.921033196664545e-05, + "loss": 0.7646, + "step": 37195 + }, + { + "epoch": 0.9849355101176176, + "grad_norm": 0.80078125, + "learning_rate": 7.920582883607025e-05, + "loss": 0.6652, + "step": 37196 + }, + { + "epoch": 0.984961989725912, + "grad_norm": 0.9296875, + "learning_rate": 7.920132574956716e-05, + "loss": 0.8126, + "step": 37197 + }, + { + "epoch": 0.9849884693342064, + "grad_norm": 0.8046875, + "learning_rate": 7.919682270714566e-05, + "loss": 0.9109, + "step": 37198 + }, + { + "epoch": 0.9850149489425007, + "grad_norm": 0.953125, + "learning_rate": 7.91923197088153e-05, + "loss": 0.8215, + "step": 37199 + }, + { + "epoch": 0.9850414285507951, + "grad_norm": 0.75, + "learning_rate": 7.918781675458565e-05, + "loss": 0.72, + "step": 37200 + }, + { + "epoch": 0.9850679081590895, + "grad_norm": 0.8125, + "learning_rate": 7.91833138444662e-05, + "loss": 0.7408, + "step": 37201 + }, + { + "epoch": 0.9850943877673839, + "grad_norm": 0.76953125, + "learning_rate": 7.917881097846653e-05, + "loss": 0.7836, + "step": 37202 + }, + { + "epoch": 0.9851208673756783, + "grad_norm": 0.7265625, + "learning_rate": 7.91743081565962e-05, + "loss": 0.7871, + "step": 37203 + }, + { + "epoch": 0.9851473469839727, + "grad_norm": 0.80859375, + "learning_rate": 7.916980537886473e-05, + "loss": 0.736, + "step": 37204 + }, + { + "epoch": 0.985173826592267, + "grad_norm": 0.765625, + "learning_rate": 7.916530264528166e-05, + "loss": 0.8755, + "step": 37205 + }, + { + "epoch": 0.9852003062005613, + "grad_norm": 0.78125, + "learning_rate": 7.91607999558565e-05, + "loss": 0.5911, + "step": 37206 + }, + { + "epoch": 0.9852267858088557, + "grad_norm": 0.79296875, + "learning_rate": 7.915629731059888e-05, + "loss": 0.7854, + "step": 37207 + }, + { + "epoch": 0.9852532654171501, + "grad_norm": 0.80078125, + "learning_rate": 7.91517947095183e-05, + "loss": 0.8084, + "step": 37208 + }, + { + "epoch": 0.9852797450254445, + "grad_norm": 0.7578125, + "learning_rate": 7.91472921526243e-05, + "loss": 0.6612, + "step": 37209 + }, + { + "epoch": 0.9853062246337388, + "grad_norm": 0.8359375, + "learning_rate": 7.914278963992641e-05, + "loss": 0.7109, + "step": 37210 + }, + { + "epoch": 0.9853327042420332, + "grad_norm": 0.8046875, + "learning_rate": 7.913828717143416e-05, + "loss": 0.6974, + "step": 37211 + }, + { + "epoch": 0.9853591838503276, + "grad_norm": 0.79296875, + "learning_rate": 7.913378474715715e-05, + "loss": 0.7863, + "step": 37212 + }, + { + "epoch": 0.985385663458622, + "grad_norm": 0.79296875, + "learning_rate": 7.91292823671049e-05, + "loss": 0.6629, + "step": 37213 + }, + { + "epoch": 0.9854121430669164, + "grad_norm": 0.95703125, + "learning_rate": 7.912478003128693e-05, + "loss": 0.9347, + "step": 37214 + }, + { + "epoch": 0.9854386226752108, + "grad_norm": 0.88671875, + "learning_rate": 7.91202777397128e-05, + "loss": 0.8927, + "step": 37215 + }, + { + "epoch": 0.9854651022835051, + "grad_norm": 0.91015625, + "learning_rate": 7.911577549239201e-05, + "loss": 0.6598, + "step": 37216 + }, + { + "epoch": 0.9854915818917995, + "grad_norm": 0.81640625, + "learning_rate": 7.911127328933418e-05, + "loss": 0.917, + "step": 37217 + }, + { + "epoch": 0.9855180615000939, + "grad_norm": 0.88671875, + "learning_rate": 7.91067711305488e-05, + "loss": 0.8067, + "step": 37218 + }, + { + "epoch": 0.9855445411083883, + "grad_norm": 0.7421875, + "learning_rate": 7.910226901604543e-05, + "loss": 0.7993, + "step": 37219 + }, + { + "epoch": 0.9855710207166827, + "grad_norm": 0.78515625, + "learning_rate": 7.909776694583362e-05, + "loss": 0.7646, + "step": 37220 + }, + { + "epoch": 0.9855975003249771, + "grad_norm": 0.75390625, + "learning_rate": 7.909326491992287e-05, + "loss": 0.6937, + "step": 37221 + }, + { + "epoch": 0.9856239799332713, + "grad_norm": 0.75390625, + "learning_rate": 7.908876293832277e-05, + "loss": 0.8402, + "step": 37222 + }, + { + "epoch": 0.9856504595415657, + "grad_norm": 0.79296875, + "learning_rate": 7.908426100104284e-05, + "loss": 0.7956, + "step": 37223 + }, + { + "epoch": 0.9856769391498601, + "grad_norm": 0.73828125, + "learning_rate": 7.907975910809263e-05, + "loss": 0.7733, + "step": 37224 + }, + { + "epoch": 0.9857034187581545, + "grad_norm": 0.7578125, + "learning_rate": 7.907525725948165e-05, + "loss": 0.7206, + "step": 37225 + }, + { + "epoch": 0.9857298983664489, + "grad_norm": 0.734375, + "learning_rate": 7.907075545521947e-05, + "loss": 0.7595, + "step": 37226 + }, + { + "epoch": 0.9857563779747432, + "grad_norm": 0.83203125, + "learning_rate": 7.906625369531564e-05, + "loss": 0.8227, + "step": 37227 + }, + { + "epoch": 0.9857828575830376, + "grad_norm": 0.84375, + "learning_rate": 7.90617519797797e-05, + "loss": 0.7678, + "step": 37228 + }, + { + "epoch": 0.985809337191332, + "grad_norm": 0.765625, + "learning_rate": 7.905725030862117e-05, + "loss": 0.7578, + "step": 37229 + }, + { + "epoch": 0.9858358167996264, + "grad_norm": 0.77734375, + "learning_rate": 7.90527486818496e-05, + "loss": 0.793, + "step": 37230 + }, + { + "epoch": 0.9858622964079208, + "grad_norm": 0.7890625, + "learning_rate": 7.904824709947449e-05, + "loss": 0.8281, + "step": 37231 + }, + { + "epoch": 0.9858887760162152, + "grad_norm": 0.76171875, + "learning_rate": 7.904374556150547e-05, + "loss": 0.8604, + "step": 37232 + }, + { + "epoch": 0.9859152556245095, + "grad_norm": 0.8515625, + "learning_rate": 7.903924406795202e-05, + "loss": 0.7719, + "step": 37233 + }, + { + "epoch": 0.9859417352328039, + "grad_norm": 0.7734375, + "learning_rate": 7.903474261882371e-05, + "loss": 0.8126, + "step": 37234 + }, + { + "epoch": 0.9859682148410983, + "grad_norm": 0.74609375, + "learning_rate": 7.903024121413006e-05, + "loss": 0.7178, + "step": 37235 + }, + { + "epoch": 0.9859946944493927, + "grad_norm": 0.78515625, + "learning_rate": 7.902573985388058e-05, + "loss": 0.8169, + "step": 37236 + }, + { + "epoch": 0.9860211740576871, + "grad_norm": 0.81640625, + "learning_rate": 7.902123853808487e-05, + "loss": 0.7385, + "step": 37237 + }, + { + "epoch": 0.9860476536659815, + "grad_norm": 0.83984375, + "learning_rate": 7.901673726675247e-05, + "loss": 0.8092, + "step": 37238 + }, + { + "epoch": 0.9860741332742757, + "grad_norm": 0.76953125, + "learning_rate": 7.901223603989287e-05, + "loss": 0.7826, + "step": 37239 + }, + { + "epoch": 0.9861006128825701, + "grad_norm": 0.7890625, + "learning_rate": 7.900773485751566e-05, + "loss": 0.8103, + "step": 37240 + }, + { + "epoch": 0.9861270924908645, + "grad_norm": 0.78125, + "learning_rate": 7.900323371963032e-05, + "loss": 0.7108, + "step": 37241 + }, + { + "epoch": 0.9861535720991589, + "grad_norm": 0.77734375, + "learning_rate": 7.899873262624646e-05, + "loss": 0.7277, + "step": 37242 + }, + { + "epoch": 0.9861800517074533, + "grad_norm": 0.85546875, + "learning_rate": 7.899423157737357e-05, + "loss": 0.7117, + "step": 37243 + }, + { + "epoch": 0.9862065313157476, + "grad_norm": 0.80859375, + "learning_rate": 7.898973057302119e-05, + "loss": 0.784, + "step": 37244 + }, + { + "epoch": 0.986233010924042, + "grad_norm": 0.71875, + "learning_rate": 7.89852296131989e-05, + "loss": 0.7246, + "step": 37245 + }, + { + "epoch": 0.9862594905323364, + "grad_norm": 0.8046875, + "learning_rate": 7.89807286979162e-05, + "loss": 0.9269, + "step": 37246 + }, + { + "epoch": 0.9862859701406308, + "grad_norm": 0.78515625, + "learning_rate": 7.897622782718265e-05, + "loss": 0.8048, + "step": 37247 + }, + { + "epoch": 0.9863124497489252, + "grad_norm": 0.80078125, + "learning_rate": 7.897172700100779e-05, + "loss": 0.8411, + "step": 37248 + }, + { + "epoch": 0.9863389293572196, + "grad_norm": 0.78125, + "learning_rate": 7.896722621940115e-05, + "loss": 0.7507, + "step": 37249 + }, + { + "epoch": 0.986365408965514, + "grad_norm": 0.7578125, + "learning_rate": 7.896272548237226e-05, + "loss": 0.8153, + "step": 37250 + }, + { + "epoch": 0.9863918885738083, + "grad_norm": 0.796875, + "learning_rate": 7.895822478993066e-05, + "loss": 0.7727, + "step": 37251 + }, + { + "epoch": 0.9864183681821027, + "grad_norm": 0.84765625, + "learning_rate": 7.895372414208592e-05, + "loss": 0.8363, + "step": 37252 + }, + { + "epoch": 0.9864448477903971, + "grad_norm": 0.8203125, + "learning_rate": 7.894922353884757e-05, + "loss": 0.7447, + "step": 37253 + }, + { + "epoch": 0.9864713273986915, + "grad_norm": 0.90625, + "learning_rate": 7.894472298022512e-05, + "loss": 0.8122, + "step": 37254 + }, + { + "epoch": 0.9864978070069857, + "grad_norm": 0.77734375, + "learning_rate": 7.894022246622814e-05, + "loss": 0.8741, + "step": 37255 + }, + { + "epoch": 0.9865242866152801, + "grad_norm": 0.796875, + "learning_rate": 7.893572199686612e-05, + "loss": 0.7199, + "step": 37256 + }, + { + "epoch": 0.9865507662235745, + "grad_norm": 0.78125, + "learning_rate": 7.893122157214866e-05, + "loss": 0.8093, + "step": 37257 + }, + { + "epoch": 0.9865772458318689, + "grad_norm": 0.7265625, + "learning_rate": 7.892672119208528e-05, + "loss": 0.7625, + "step": 37258 + }, + { + "epoch": 0.9866037254401633, + "grad_norm": 0.76953125, + "learning_rate": 7.892222085668551e-05, + "loss": 0.7041, + "step": 37259 + }, + { + "epoch": 0.9866302050484577, + "grad_norm": 0.796875, + "learning_rate": 7.891772056595889e-05, + "loss": 0.7271, + "step": 37260 + }, + { + "epoch": 0.986656684656752, + "grad_norm": 0.74609375, + "learning_rate": 7.891322031991492e-05, + "loss": 0.7488, + "step": 37261 + }, + { + "epoch": 0.9866831642650464, + "grad_norm": 0.7734375, + "learning_rate": 7.89087201185632e-05, + "loss": 0.8329, + "step": 37262 + }, + { + "epoch": 0.9867096438733408, + "grad_norm": 0.8046875, + "learning_rate": 7.890421996191324e-05, + "loss": 0.8083, + "step": 37263 + }, + { + "epoch": 0.9867361234816352, + "grad_norm": 0.875, + "learning_rate": 7.889971984997458e-05, + "loss": 0.7663, + "step": 37264 + }, + { + "epoch": 0.9867626030899296, + "grad_norm": 0.75390625, + "learning_rate": 7.889521978275677e-05, + "loss": 0.7568, + "step": 37265 + }, + { + "epoch": 0.986789082698224, + "grad_norm": 0.80078125, + "learning_rate": 7.889071976026931e-05, + "loss": 0.8798, + "step": 37266 + }, + { + "epoch": 0.9868155623065183, + "grad_norm": 0.8125, + "learning_rate": 7.888621978252179e-05, + "loss": 0.8083, + "step": 37267 + }, + { + "epoch": 0.9868420419148127, + "grad_norm": 0.76171875, + "learning_rate": 7.88817198495237e-05, + "loss": 0.7549, + "step": 37268 + }, + { + "epoch": 0.9868685215231071, + "grad_norm": 0.765625, + "learning_rate": 7.887721996128462e-05, + "loss": 0.8315, + "step": 37269 + }, + { + "epoch": 0.9868950011314015, + "grad_norm": 0.76171875, + "learning_rate": 7.887272011781405e-05, + "loss": 0.785, + "step": 37270 + }, + { + "epoch": 0.9869214807396958, + "grad_norm": 0.7421875, + "learning_rate": 7.886822031912155e-05, + "loss": 0.7191, + "step": 37271 + }, + { + "epoch": 0.9869479603479901, + "grad_norm": 0.79296875, + "learning_rate": 7.886372056521662e-05, + "loss": 0.7944, + "step": 37272 + }, + { + "epoch": 0.9869744399562845, + "grad_norm": 0.75, + "learning_rate": 7.885922085610884e-05, + "loss": 0.734, + "step": 37273 + }, + { + "epoch": 0.9870009195645789, + "grad_norm": 0.875, + "learning_rate": 7.885472119180776e-05, + "loss": 0.7459, + "step": 37274 + }, + { + "epoch": 0.9870273991728733, + "grad_norm": 0.83203125, + "learning_rate": 7.885022157232287e-05, + "loss": 0.8654, + "step": 37275 + }, + { + "epoch": 0.9870538787811677, + "grad_norm": 0.84375, + "learning_rate": 7.884572199766373e-05, + "loss": 0.8877, + "step": 37276 + }, + { + "epoch": 0.9870803583894621, + "grad_norm": 0.8046875, + "learning_rate": 7.884122246783984e-05, + "loss": 0.8711, + "step": 37277 + }, + { + "epoch": 0.9871068379977564, + "grad_norm": 0.7109375, + "learning_rate": 7.883672298286081e-05, + "loss": 0.7432, + "step": 37278 + }, + { + "epoch": 0.9871333176060508, + "grad_norm": 0.7578125, + "learning_rate": 7.883222354273614e-05, + "loss": 0.9213, + "step": 37279 + }, + { + "epoch": 0.9871597972143452, + "grad_norm": 0.7265625, + "learning_rate": 7.882772414747535e-05, + "loss": 0.8372, + "step": 37280 + }, + { + "epoch": 0.9871862768226396, + "grad_norm": 0.8515625, + "learning_rate": 7.8823224797088e-05, + "loss": 0.7706, + "step": 37281 + }, + { + "epoch": 0.987212756430934, + "grad_norm": 0.87109375, + "learning_rate": 7.881872549158356e-05, + "loss": 0.8132, + "step": 37282 + }, + { + "epoch": 0.9872392360392284, + "grad_norm": 0.76171875, + "learning_rate": 7.881422623097167e-05, + "loss": 0.8483, + "step": 37283 + }, + { + "epoch": 0.9872657156475227, + "grad_norm": 0.75, + "learning_rate": 7.880972701526182e-05, + "loss": 0.821, + "step": 37284 + }, + { + "epoch": 0.9872921952558171, + "grad_norm": 0.8203125, + "learning_rate": 7.880522784446354e-05, + "loss": 0.8037, + "step": 37285 + }, + { + "epoch": 0.9873186748641115, + "grad_norm": 0.76953125, + "learning_rate": 7.880072871858636e-05, + "loss": 0.8375, + "step": 37286 + }, + { + "epoch": 0.9873451544724059, + "grad_norm": 0.7890625, + "learning_rate": 7.87962296376398e-05, + "loss": 0.7609, + "step": 37287 + }, + { + "epoch": 0.9873716340807002, + "grad_norm": 0.8046875, + "learning_rate": 7.879173060163346e-05, + "loss": 0.6633, + "step": 37288 + }, + { + "epoch": 0.9873981136889945, + "grad_norm": 0.765625, + "learning_rate": 7.878723161057682e-05, + "loss": 0.792, + "step": 37289 + }, + { + "epoch": 0.9874245932972889, + "grad_norm": 0.77734375, + "learning_rate": 7.878273266447944e-05, + "loss": 0.7658, + "step": 37290 + }, + { + "epoch": 0.9874510729055833, + "grad_norm": 0.79296875, + "learning_rate": 7.877823376335084e-05, + "loss": 0.8647, + "step": 37291 + }, + { + "epoch": 0.9874775525138777, + "grad_norm": 0.78515625, + "learning_rate": 7.877373490720051e-05, + "loss": 0.7639, + "step": 37292 + }, + { + "epoch": 0.9875040321221721, + "grad_norm": 0.85546875, + "learning_rate": 7.87692360960381e-05, + "loss": 0.9981, + "step": 37293 + }, + { + "epoch": 0.9875305117304665, + "grad_norm": 0.8125, + "learning_rate": 7.876473732987307e-05, + "loss": 0.7087, + "step": 37294 + }, + { + "epoch": 0.9875569913387608, + "grad_norm": 0.765625, + "learning_rate": 7.876023860871495e-05, + "loss": 0.7499, + "step": 37295 + }, + { + "epoch": 0.9875834709470552, + "grad_norm": 0.87890625, + "learning_rate": 7.87557399325733e-05, + "loss": 0.7993, + "step": 37296 + }, + { + "epoch": 0.9876099505553496, + "grad_norm": 0.81640625, + "learning_rate": 7.875124130145761e-05, + "loss": 0.7643, + "step": 37297 + }, + { + "epoch": 0.987636430163644, + "grad_norm": 0.73046875, + "learning_rate": 7.874674271537748e-05, + "loss": 0.7813, + "step": 37298 + }, + { + "epoch": 0.9876629097719384, + "grad_norm": 0.89453125, + "learning_rate": 7.874224417434242e-05, + "loss": 0.8987, + "step": 37299 + }, + { + "epoch": 0.9876893893802328, + "grad_norm": 0.71484375, + "learning_rate": 7.873774567836197e-05, + "loss": 0.7339, + "step": 37300 + }, + { + "epoch": 0.9877158689885271, + "grad_norm": 0.76953125, + "learning_rate": 7.873324722744563e-05, + "loss": 0.7616, + "step": 37301 + }, + { + "epoch": 0.9877423485968215, + "grad_norm": 0.8359375, + "learning_rate": 7.872874882160293e-05, + "loss": 0.7573, + "step": 37302 + }, + { + "epoch": 0.9877688282051159, + "grad_norm": 0.71875, + "learning_rate": 7.872425046084346e-05, + "loss": 0.688, + "step": 37303 + }, + { + "epoch": 0.9877953078134102, + "grad_norm": 0.8046875, + "learning_rate": 7.871975214517675e-05, + "loss": 0.825, + "step": 37304 + }, + { + "epoch": 0.9878217874217046, + "grad_norm": 0.828125, + "learning_rate": 7.87152538746123e-05, + "loss": 0.8277, + "step": 37305 + }, + { + "epoch": 0.987848267029999, + "grad_norm": 0.79296875, + "learning_rate": 7.871075564915964e-05, + "loss": 0.7515, + "step": 37306 + }, + { + "epoch": 0.9878747466382933, + "grad_norm": 0.8046875, + "learning_rate": 7.87062574688283e-05, + "loss": 0.6799, + "step": 37307 + }, + { + "epoch": 0.9879012262465877, + "grad_norm": 0.72265625, + "learning_rate": 7.870175933362784e-05, + "loss": 0.7358, + "step": 37308 + }, + { + "epoch": 0.9879277058548821, + "grad_norm": 0.8046875, + "learning_rate": 7.869726124356782e-05, + "loss": 0.7956, + "step": 37309 + }, + { + "epoch": 0.9879541854631765, + "grad_norm": 0.76953125, + "learning_rate": 7.86927631986577e-05, + "loss": 0.6724, + "step": 37310 + }, + { + "epoch": 0.9879806650714709, + "grad_norm": 0.76171875, + "learning_rate": 7.868826519890704e-05, + "loss": 0.7958, + "step": 37311 + }, + { + "epoch": 0.9880071446797652, + "grad_norm": 0.75, + "learning_rate": 7.86837672443254e-05, + "loss": 0.7531, + "step": 37312 + }, + { + "epoch": 0.9880336242880596, + "grad_norm": 0.87890625, + "learning_rate": 7.86792693349223e-05, + "loss": 0.7016, + "step": 37313 + }, + { + "epoch": 0.988060103896354, + "grad_norm": 0.75390625, + "learning_rate": 7.867477147070728e-05, + "loss": 0.8106, + "step": 37314 + }, + { + "epoch": 0.9880865835046484, + "grad_norm": 0.87109375, + "learning_rate": 7.867027365168985e-05, + "loss": 0.7591, + "step": 37315 + }, + { + "epoch": 0.9881130631129428, + "grad_norm": 0.82421875, + "learning_rate": 7.866577587787957e-05, + "loss": 0.8795, + "step": 37316 + }, + { + "epoch": 0.9881395427212372, + "grad_norm": 0.76171875, + "learning_rate": 7.866127814928591e-05, + "loss": 0.8317, + "step": 37317 + }, + { + "epoch": 0.9881660223295315, + "grad_norm": 0.81640625, + "learning_rate": 7.865678046591848e-05, + "loss": 0.7185, + "step": 37318 + }, + { + "epoch": 0.9881925019378259, + "grad_norm": 0.921875, + "learning_rate": 7.865228282778682e-05, + "loss": 1.0652, + "step": 37319 + }, + { + "epoch": 0.9882189815461202, + "grad_norm": 0.70703125, + "learning_rate": 7.86477852349004e-05, + "loss": 0.6485, + "step": 37320 + }, + { + "epoch": 0.9882454611544146, + "grad_norm": 0.76953125, + "learning_rate": 7.864328768726878e-05, + "loss": 0.8163, + "step": 37321 + }, + { + "epoch": 0.988271940762709, + "grad_norm": 0.765625, + "learning_rate": 7.863879018490145e-05, + "loss": 0.7047, + "step": 37322 + }, + { + "epoch": 0.9882984203710033, + "grad_norm": 0.75, + "learning_rate": 7.863429272780804e-05, + "loss": 0.7724, + "step": 37323 + }, + { + "epoch": 0.9883248999792977, + "grad_norm": 0.71875, + "learning_rate": 7.862979531599802e-05, + "loss": 0.7244, + "step": 37324 + }, + { + "epoch": 0.9883513795875921, + "grad_norm": 0.80078125, + "learning_rate": 7.86252979494809e-05, + "loss": 0.9501, + "step": 37325 + }, + { + "epoch": 0.9883778591958865, + "grad_norm": 0.78515625, + "learning_rate": 7.862080062826627e-05, + "loss": 0.7889, + "step": 37326 + }, + { + "epoch": 0.9884043388041809, + "grad_norm": 0.80859375, + "learning_rate": 7.86163033523636e-05, + "loss": 0.7263, + "step": 37327 + }, + { + "epoch": 0.9884308184124753, + "grad_norm": 0.796875, + "learning_rate": 7.861180612178247e-05, + "loss": 0.7708, + "step": 37328 + }, + { + "epoch": 0.9884572980207696, + "grad_norm": 0.76953125, + "learning_rate": 7.860730893653238e-05, + "loss": 0.787, + "step": 37329 + }, + { + "epoch": 0.988483777629064, + "grad_norm": 0.80859375, + "learning_rate": 7.860281179662289e-05, + "loss": 0.8682, + "step": 37330 + }, + { + "epoch": 0.9885102572373584, + "grad_norm": 0.8125, + "learning_rate": 7.859831470206353e-05, + "loss": 0.7567, + "step": 37331 + }, + { + "epoch": 0.9885367368456528, + "grad_norm": 0.7890625, + "learning_rate": 7.859381765286378e-05, + "loss": 0.8087, + "step": 37332 + }, + { + "epoch": 0.9885632164539472, + "grad_norm": 0.7734375, + "learning_rate": 7.858932064903323e-05, + "loss": 0.7391, + "step": 37333 + }, + { + "epoch": 0.9885896960622416, + "grad_norm": 0.78515625, + "learning_rate": 7.85848236905814e-05, + "loss": 0.7848, + "step": 37334 + }, + { + "epoch": 0.988616175670536, + "grad_norm": 0.7734375, + "learning_rate": 7.858032677751783e-05, + "loss": 0.7431, + "step": 37335 + }, + { + "epoch": 0.9886426552788303, + "grad_norm": 0.7109375, + "learning_rate": 7.857582990985202e-05, + "loss": 0.6949, + "step": 37336 + }, + { + "epoch": 0.9886691348871246, + "grad_norm": 0.6953125, + "learning_rate": 7.857133308759345e-05, + "loss": 0.7675, + "step": 37337 + }, + { + "epoch": 0.988695614495419, + "grad_norm": 0.81640625, + "learning_rate": 7.856683631075179e-05, + "loss": 0.7404, + "step": 37338 + }, + { + "epoch": 0.9887220941037134, + "grad_norm": 0.80078125, + "learning_rate": 7.856233957933648e-05, + "loss": 0.832, + "step": 37339 + }, + { + "epoch": 0.9887485737120078, + "grad_norm": 0.859375, + "learning_rate": 7.855784289335705e-05, + "loss": 0.767, + "step": 37340 + }, + { + "epoch": 0.9887750533203021, + "grad_norm": 0.796875, + "learning_rate": 7.855334625282308e-05, + "loss": 0.8204, + "step": 37341 + }, + { + "epoch": 0.9888015329285965, + "grad_norm": 0.84375, + "learning_rate": 7.8548849657744e-05, + "loss": 0.7697, + "step": 37342 + }, + { + "epoch": 0.9888280125368909, + "grad_norm": 0.8046875, + "learning_rate": 7.854435310812946e-05, + "loss": 0.7453, + "step": 37343 + }, + { + "epoch": 0.9888544921451853, + "grad_norm": 0.76953125, + "learning_rate": 7.853985660398894e-05, + "loss": 0.735, + "step": 37344 + }, + { + "epoch": 0.9888809717534797, + "grad_norm": 0.85546875, + "learning_rate": 7.853536014533197e-05, + "loss": 0.9167, + "step": 37345 + }, + { + "epoch": 0.988907451361774, + "grad_norm": 0.7890625, + "learning_rate": 7.853086373216806e-05, + "loss": 0.8174, + "step": 37346 + }, + { + "epoch": 0.9889339309700684, + "grad_norm": 0.79296875, + "learning_rate": 7.852636736450673e-05, + "loss": 0.9566, + "step": 37347 + }, + { + "epoch": 0.9889604105783628, + "grad_norm": 0.80859375, + "learning_rate": 7.852187104235756e-05, + "loss": 0.8185, + "step": 37348 + }, + { + "epoch": 0.9889868901866572, + "grad_norm": 0.80859375, + "learning_rate": 7.851737476573007e-05, + "loss": 0.9501, + "step": 37349 + }, + { + "epoch": 0.9890133697949516, + "grad_norm": 0.71875, + "learning_rate": 7.851287853463378e-05, + "loss": 0.786, + "step": 37350 + }, + { + "epoch": 0.989039849403246, + "grad_norm": 0.71875, + "learning_rate": 7.850838234907821e-05, + "loss": 0.8279, + "step": 37351 + }, + { + "epoch": 0.9890663290115403, + "grad_norm": 0.73828125, + "learning_rate": 7.850388620907287e-05, + "loss": 0.8141, + "step": 37352 + }, + { + "epoch": 0.9890928086198346, + "grad_norm": 0.7734375, + "learning_rate": 7.849939011462735e-05, + "loss": 0.8207, + "step": 37353 + }, + { + "epoch": 0.989119288228129, + "grad_norm": 0.79296875, + "learning_rate": 7.849489406575113e-05, + "loss": 0.8878, + "step": 37354 + }, + { + "epoch": 0.9891457678364234, + "grad_norm": 0.83203125, + "learning_rate": 7.849039806245376e-05, + "loss": 0.7553, + "step": 37355 + }, + { + "epoch": 0.9891722474447178, + "grad_norm": 0.9140625, + "learning_rate": 7.848590210474476e-05, + "loss": 0.97, + "step": 37356 + }, + { + "epoch": 0.9891987270530122, + "grad_norm": 0.8203125, + "learning_rate": 7.848140619263362e-05, + "loss": 0.7697, + "step": 37357 + }, + { + "epoch": 0.9892252066613065, + "grad_norm": 0.765625, + "learning_rate": 7.847691032612995e-05, + "loss": 0.754, + "step": 37358 + }, + { + "epoch": 0.9892516862696009, + "grad_norm": 0.75, + "learning_rate": 7.847241450524324e-05, + "loss": 0.7224, + "step": 37359 + }, + { + "epoch": 0.9892781658778953, + "grad_norm": 0.828125, + "learning_rate": 7.846791872998302e-05, + "loss": 0.8129, + "step": 37360 + }, + { + "epoch": 0.9893046454861897, + "grad_norm": 0.7734375, + "learning_rate": 7.84634230003588e-05, + "loss": 0.6441, + "step": 37361 + }, + { + "epoch": 0.9893311250944841, + "grad_norm": 0.72265625, + "learning_rate": 7.845892731638011e-05, + "loss": 0.6913, + "step": 37362 + }, + { + "epoch": 0.9893576047027784, + "grad_norm": 0.75, + "learning_rate": 7.84544316780565e-05, + "loss": 0.7645, + "step": 37363 + }, + { + "epoch": 0.9893840843110728, + "grad_norm": 0.74609375, + "learning_rate": 7.844993608539752e-05, + "loss": 0.8001, + "step": 37364 + }, + { + "epoch": 0.9894105639193672, + "grad_norm": 0.75, + "learning_rate": 7.844544053841265e-05, + "loss": 0.7641, + "step": 37365 + }, + { + "epoch": 0.9894370435276616, + "grad_norm": 0.8515625, + "learning_rate": 7.844094503711145e-05, + "loss": 0.8121, + "step": 37366 + }, + { + "epoch": 0.989463523135956, + "grad_norm": 0.75, + "learning_rate": 7.843644958150338e-05, + "loss": 0.8459, + "step": 37367 + }, + { + "epoch": 0.9894900027442504, + "grad_norm": 0.7421875, + "learning_rate": 7.843195417159808e-05, + "loss": 0.8056, + "step": 37368 + }, + { + "epoch": 0.9895164823525446, + "grad_norm": 0.765625, + "learning_rate": 7.8427458807405e-05, + "loss": 0.8123, + "step": 37369 + }, + { + "epoch": 0.989542961960839, + "grad_norm": 0.76953125, + "learning_rate": 7.842296348893371e-05, + "loss": 0.7156, + "step": 37370 + }, + { + "epoch": 0.9895694415691334, + "grad_norm": 0.7578125, + "learning_rate": 7.84184682161937e-05, + "loss": 0.7588, + "step": 37371 + }, + { + "epoch": 0.9895959211774278, + "grad_norm": 0.78125, + "learning_rate": 7.84139729891945e-05, + "loss": 0.6658, + "step": 37372 + }, + { + "epoch": 0.9896224007857222, + "grad_norm": 0.72265625, + "learning_rate": 7.840947780794566e-05, + "loss": 0.7589, + "step": 37373 + }, + { + "epoch": 0.9896488803940166, + "grad_norm": 0.82421875, + "learning_rate": 7.840498267245672e-05, + "loss": 0.8112, + "step": 37374 + }, + { + "epoch": 0.9896753600023109, + "grad_norm": 0.73828125, + "learning_rate": 7.840048758273717e-05, + "loss": 0.6768, + "step": 37375 + }, + { + "epoch": 0.9897018396106053, + "grad_norm": 0.7734375, + "learning_rate": 7.839599253879655e-05, + "loss": 0.6659, + "step": 37376 + }, + { + "epoch": 0.9897283192188997, + "grad_norm": 0.86328125, + "learning_rate": 7.839149754064435e-05, + "loss": 0.8182, + "step": 37377 + }, + { + "epoch": 0.9897547988271941, + "grad_norm": 0.7578125, + "learning_rate": 7.838700258829017e-05, + "loss": 0.8208, + "step": 37378 + }, + { + "epoch": 0.9897812784354885, + "grad_norm": 0.80859375, + "learning_rate": 7.838250768174351e-05, + "loss": 0.8557, + "step": 37379 + }, + { + "epoch": 0.9898077580437828, + "grad_norm": 0.765625, + "learning_rate": 7.837801282101389e-05, + "loss": 0.7112, + "step": 37380 + }, + { + "epoch": 0.9898342376520772, + "grad_norm": 0.80078125, + "learning_rate": 7.837351800611084e-05, + "loss": 0.7807, + "step": 37381 + }, + { + "epoch": 0.9898607172603716, + "grad_norm": 0.95703125, + "learning_rate": 7.836902323704383e-05, + "loss": 0.6774, + "step": 37382 + }, + { + "epoch": 0.989887196868666, + "grad_norm": 0.76953125, + "learning_rate": 7.836452851382249e-05, + "loss": 0.6814, + "step": 37383 + }, + { + "epoch": 0.9899136764769604, + "grad_norm": 0.8203125, + "learning_rate": 7.836003383645629e-05, + "loss": 0.8369, + "step": 37384 + }, + { + "epoch": 0.9899401560852548, + "grad_norm": 0.80078125, + "learning_rate": 7.835553920495478e-05, + "loss": 0.8158, + "step": 37385 + }, + { + "epoch": 0.989966635693549, + "grad_norm": 0.8046875, + "learning_rate": 7.835104461932745e-05, + "loss": 0.8015, + "step": 37386 + }, + { + "epoch": 0.9899931153018434, + "grad_norm": 0.75, + "learning_rate": 7.83465500795838e-05, + "loss": 0.7502, + "step": 37387 + }, + { + "epoch": 0.9900195949101378, + "grad_norm": 0.8203125, + "learning_rate": 7.834205558573345e-05, + "loss": 0.927, + "step": 37388 + }, + { + "epoch": 0.9900460745184322, + "grad_norm": 0.73046875, + "learning_rate": 7.833756113778586e-05, + "loss": 0.7297, + "step": 37389 + }, + { + "epoch": 0.9900725541267266, + "grad_norm": 0.76171875, + "learning_rate": 7.833306673575058e-05, + "loss": 0.7433, + "step": 37390 + }, + { + "epoch": 0.990099033735021, + "grad_norm": 0.84375, + "learning_rate": 7.832857237963714e-05, + "loss": 0.8504, + "step": 37391 + }, + { + "epoch": 0.9901255133433153, + "grad_norm": 0.82421875, + "learning_rate": 7.832407806945501e-05, + "loss": 0.8354, + "step": 37392 + }, + { + "epoch": 0.9901519929516097, + "grad_norm": 0.76953125, + "learning_rate": 7.831958380521379e-05, + "loss": 0.7354, + "step": 37393 + }, + { + "epoch": 0.9901784725599041, + "grad_norm": 0.75, + "learning_rate": 7.831508958692297e-05, + "loss": 0.7462, + "step": 37394 + }, + { + "epoch": 0.9902049521681985, + "grad_norm": 0.80859375, + "learning_rate": 7.831059541459208e-05, + "loss": 0.8837, + "step": 37395 + }, + { + "epoch": 0.9902314317764929, + "grad_norm": 0.8046875, + "learning_rate": 7.830610128823062e-05, + "loss": 0.8658, + "step": 37396 + }, + { + "epoch": 0.9902579113847872, + "grad_norm": 0.73828125, + "learning_rate": 7.830160720784814e-05, + "loss": 0.6587, + "step": 37397 + }, + { + "epoch": 0.9902843909930816, + "grad_norm": 0.73828125, + "learning_rate": 7.829711317345418e-05, + "loss": 0.6744, + "step": 37398 + }, + { + "epoch": 0.990310870601376, + "grad_norm": 0.796875, + "learning_rate": 7.829261918505825e-05, + "loss": 0.7331, + "step": 37399 + }, + { + "epoch": 0.9903373502096704, + "grad_norm": 0.7890625, + "learning_rate": 7.828812524266988e-05, + "loss": 0.7587, + "step": 37400 + }, + { + "epoch": 0.9903638298179648, + "grad_norm": 0.828125, + "learning_rate": 7.828363134629859e-05, + "loss": 0.7428, + "step": 37401 + }, + { + "epoch": 0.990390309426259, + "grad_norm": 0.78125, + "learning_rate": 7.827913749595383e-05, + "loss": 0.7669, + "step": 37402 + }, + { + "epoch": 0.9904167890345534, + "grad_norm": 0.8125, + "learning_rate": 7.827464369164526e-05, + "loss": 0.7487, + "step": 37403 + }, + { + "epoch": 0.9904432686428478, + "grad_norm": 0.7421875, + "learning_rate": 7.827014993338235e-05, + "loss": 0.7696, + "step": 37404 + }, + { + "epoch": 0.9904697482511422, + "grad_norm": 0.76171875, + "learning_rate": 7.82656562211746e-05, + "loss": 0.7224, + "step": 37405 + }, + { + "epoch": 0.9904962278594366, + "grad_norm": 0.75390625, + "learning_rate": 7.826116255503157e-05, + "loss": 0.7761, + "step": 37406 + }, + { + "epoch": 0.990522707467731, + "grad_norm": 0.859375, + "learning_rate": 7.825666893496271e-05, + "loss": 0.8584, + "step": 37407 + }, + { + "epoch": 0.9905491870760254, + "grad_norm": 0.78515625, + "learning_rate": 7.825217536097763e-05, + "loss": 0.7746, + "step": 37408 + }, + { + "epoch": 0.9905756666843197, + "grad_norm": 0.82421875, + "learning_rate": 7.824768183308583e-05, + "loss": 0.8808, + "step": 37409 + }, + { + "epoch": 0.9906021462926141, + "grad_norm": 0.71484375, + "learning_rate": 7.824318835129683e-05, + "loss": 0.7395, + "step": 37410 + }, + { + "epoch": 0.9906286259009085, + "grad_norm": 0.81640625, + "learning_rate": 7.823869491562013e-05, + "loss": 0.788, + "step": 37411 + }, + { + "epoch": 0.9906551055092029, + "grad_norm": 0.79296875, + "learning_rate": 7.823420152606528e-05, + "loss": 0.7953, + "step": 37412 + }, + { + "epoch": 0.9906815851174973, + "grad_norm": 0.80078125, + "learning_rate": 7.822970818264179e-05, + "loss": 0.8676, + "step": 37413 + }, + { + "epoch": 0.9907080647257916, + "grad_norm": 0.76171875, + "learning_rate": 7.822521488535919e-05, + "loss": 0.819, + "step": 37414 + }, + { + "epoch": 0.990734544334086, + "grad_norm": 0.74609375, + "learning_rate": 7.8220721634227e-05, + "loss": 0.881, + "step": 37415 + }, + { + "epoch": 0.9907610239423804, + "grad_norm": 0.79296875, + "learning_rate": 7.821622842925476e-05, + "loss": 0.7208, + "step": 37416 + }, + { + "epoch": 0.9907875035506748, + "grad_norm": 0.8046875, + "learning_rate": 7.821173527045197e-05, + "loss": 0.8064, + "step": 37417 + }, + { + "epoch": 0.9908139831589691, + "grad_norm": 0.76171875, + "learning_rate": 7.820724215782814e-05, + "loss": 0.9241, + "step": 37418 + }, + { + "epoch": 0.9908404627672635, + "grad_norm": 0.79296875, + "learning_rate": 7.820274909139285e-05, + "loss": 0.8077, + "step": 37419 + }, + { + "epoch": 0.9908669423755578, + "grad_norm": 0.72265625, + "learning_rate": 7.819825607115558e-05, + "loss": 0.7657, + "step": 37420 + }, + { + "epoch": 0.9908934219838522, + "grad_norm": 0.82421875, + "learning_rate": 7.819376309712585e-05, + "loss": 0.7877, + "step": 37421 + }, + { + "epoch": 0.9909199015921466, + "grad_norm": 0.74609375, + "learning_rate": 7.818927016931321e-05, + "loss": 0.847, + "step": 37422 + }, + { + "epoch": 0.990946381200441, + "grad_norm": 0.70703125, + "learning_rate": 7.818477728772711e-05, + "loss": 0.7823, + "step": 37423 + }, + { + "epoch": 0.9909728608087354, + "grad_norm": 0.75390625, + "learning_rate": 7.818028445237718e-05, + "loss": 0.7633, + "step": 37424 + }, + { + "epoch": 0.9909993404170298, + "grad_norm": 1.0625, + "learning_rate": 7.817579166327289e-05, + "loss": 0.7094, + "step": 37425 + }, + { + "epoch": 0.9910258200253241, + "grad_norm": 0.80859375, + "learning_rate": 7.817129892042377e-05, + "loss": 0.7734, + "step": 37426 + }, + { + "epoch": 0.9910522996336185, + "grad_norm": 0.7578125, + "learning_rate": 7.81668062238393e-05, + "loss": 0.6656, + "step": 37427 + }, + { + "epoch": 0.9910787792419129, + "grad_norm": 0.83203125, + "learning_rate": 7.816231357352903e-05, + "loss": 0.8249, + "step": 37428 + }, + { + "epoch": 0.9911052588502073, + "grad_norm": 0.734375, + "learning_rate": 7.815782096950251e-05, + "loss": 0.706, + "step": 37429 + }, + { + "epoch": 0.9911317384585017, + "grad_norm": 0.765625, + "learning_rate": 7.815332841176925e-05, + "loss": 0.7211, + "step": 37430 + }, + { + "epoch": 0.991158218066796, + "grad_norm": 1.046875, + "learning_rate": 7.814883590033876e-05, + "loss": 0.7573, + "step": 37431 + }, + { + "epoch": 0.9911846976750904, + "grad_norm": 0.77734375, + "learning_rate": 7.814434343522056e-05, + "loss": 0.7657, + "step": 37432 + }, + { + "epoch": 0.9912111772833848, + "grad_norm": 0.78515625, + "learning_rate": 7.813985101642414e-05, + "loss": 0.8846, + "step": 37433 + }, + { + "epoch": 0.9912376568916792, + "grad_norm": 0.85546875, + "learning_rate": 7.813535864395909e-05, + "loss": 0.7189, + "step": 37434 + }, + { + "epoch": 0.9912641364999735, + "grad_norm": 0.74609375, + "learning_rate": 7.813086631783489e-05, + "loss": 0.8868, + "step": 37435 + }, + { + "epoch": 0.9912906161082679, + "grad_norm": 0.765625, + "learning_rate": 7.81263740380611e-05, + "loss": 0.7439, + "step": 37436 + }, + { + "epoch": 0.9913170957165622, + "grad_norm": 0.8203125, + "learning_rate": 7.812188180464717e-05, + "loss": 0.6603, + "step": 37437 + }, + { + "epoch": 0.9913435753248566, + "grad_norm": 0.8515625, + "learning_rate": 7.811738961760266e-05, + "loss": 0.8966, + "step": 37438 + }, + { + "epoch": 0.991370054933151, + "grad_norm": 0.71484375, + "learning_rate": 7.811289747693711e-05, + "loss": 0.9026, + "step": 37439 + }, + { + "epoch": 0.9913965345414454, + "grad_norm": 0.75, + "learning_rate": 7.810840538266004e-05, + "loss": 0.7341, + "step": 37440 + }, + { + "epoch": 0.9914230141497398, + "grad_norm": 0.78515625, + "learning_rate": 7.810391333478093e-05, + "loss": 0.7323, + "step": 37441 + }, + { + "epoch": 0.9914494937580342, + "grad_norm": 0.82421875, + "learning_rate": 7.809942133330933e-05, + "loss": 0.8188, + "step": 37442 + }, + { + "epoch": 0.9914759733663285, + "grad_norm": 0.796875, + "learning_rate": 7.809492937825472e-05, + "loss": 0.7499, + "step": 37443 + }, + { + "epoch": 0.9915024529746229, + "grad_norm": 0.828125, + "learning_rate": 7.809043746962669e-05, + "loss": 0.8338, + "step": 37444 + }, + { + "epoch": 0.9915289325829173, + "grad_norm": 0.8203125, + "learning_rate": 7.808594560743473e-05, + "loss": 0.8275, + "step": 37445 + }, + { + "epoch": 0.9915554121912117, + "grad_norm": 0.7734375, + "learning_rate": 7.808145379168835e-05, + "loss": 0.8799, + "step": 37446 + }, + { + "epoch": 0.9915818917995061, + "grad_norm": 0.76953125, + "learning_rate": 7.807696202239708e-05, + "loss": 0.8212, + "step": 37447 + }, + { + "epoch": 0.9916083714078004, + "grad_norm": 0.75, + "learning_rate": 7.80724702995704e-05, + "loss": 0.7351, + "step": 37448 + }, + { + "epoch": 0.9916348510160948, + "grad_norm": 0.76953125, + "learning_rate": 7.80679786232179e-05, + "loss": 0.7685, + "step": 37449 + }, + { + "epoch": 0.9916613306243892, + "grad_norm": 0.828125, + "learning_rate": 7.806348699334906e-05, + "loss": 0.8333, + "step": 37450 + }, + { + "epoch": 0.9916878102326835, + "grad_norm": 0.734375, + "learning_rate": 7.805899540997342e-05, + "loss": 0.8241, + "step": 37451 + }, + { + "epoch": 0.9917142898409779, + "grad_norm": 0.75390625, + "learning_rate": 7.805450387310047e-05, + "loss": 0.7896, + "step": 37452 + }, + { + "epoch": 0.9917407694492723, + "grad_norm": 0.78125, + "learning_rate": 7.805001238273972e-05, + "loss": 0.9228, + "step": 37453 + }, + { + "epoch": 0.9917672490575666, + "grad_norm": 0.828125, + "learning_rate": 7.804552093890074e-05, + "loss": 0.8625, + "step": 37454 + }, + { + "epoch": 0.991793728665861, + "grad_norm": 0.82421875, + "learning_rate": 7.804102954159304e-05, + "loss": 0.7303, + "step": 37455 + }, + { + "epoch": 0.9918202082741554, + "grad_norm": 0.82421875, + "learning_rate": 7.80365381908261e-05, + "loss": 0.6602, + "step": 37456 + }, + { + "epoch": 0.9918466878824498, + "grad_norm": 0.84765625, + "learning_rate": 7.803204688660948e-05, + "loss": 0.9361, + "step": 37457 + }, + { + "epoch": 0.9918731674907442, + "grad_norm": 0.81640625, + "learning_rate": 7.802755562895265e-05, + "loss": 0.7114, + "step": 37458 + }, + { + "epoch": 0.9918996470990386, + "grad_norm": 0.8125, + "learning_rate": 7.80230644178652e-05, + "loss": 0.8074, + "step": 37459 + }, + { + "epoch": 0.9919261267073329, + "grad_norm": 0.71484375, + "learning_rate": 7.801857325335659e-05, + "loss": 0.7028, + "step": 37460 + }, + { + "epoch": 0.9919526063156273, + "grad_norm": 0.9453125, + "learning_rate": 7.801408213543637e-05, + "loss": 0.806, + "step": 37461 + }, + { + "epoch": 0.9919790859239217, + "grad_norm": 0.7734375, + "learning_rate": 7.800959106411403e-05, + "loss": 0.8169, + "step": 37462 + }, + { + "epoch": 0.9920055655322161, + "grad_norm": 1.0078125, + "learning_rate": 7.800510003939907e-05, + "loss": 0.83, + "step": 37463 + }, + { + "epoch": 0.9920320451405105, + "grad_norm": 0.81640625, + "learning_rate": 7.80006090613011e-05, + "loss": 0.8218, + "step": 37464 + }, + { + "epoch": 0.9920585247488048, + "grad_norm": 0.765625, + "learning_rate": 7.799611812982956e-05, + "loss": 0.7978, + "step": 37465 + }, + { + "epoch": 0.9920850043570992, + "grad_norm": 0.77734375, + "learning_rate": 7.799162724499399e-05, + "loss": 0.8479, + "step": 37466 + }, + { + "epoch": 0.9921114839653936, + "grad_norm": 0.7421875, + "learning_rate": 7.798713640680391e-05, + "loss": 0.8048, + "step": 37467 + }, + { + "epoch": 0.9921379635736879, + "grad_norm": 0.81640625, + "learning_rate": 7.79826456152688e-05, + "loss": 0.8652, + "step": 37468 + }, + { + "epoch": 0.9921644431819823, + "grad_norm": 0.75390625, + "learning_rate": 7.797815487039826e-05, + "loss": 0.7945, + "step": 37469 + }, + { + "epoch": 0.9921909227902767, + "grad_norm": 0.79296875, + "learning_rate": 7.797366417220177e-05, + "loss": 0.786, + "step": 37470 + }, + { + "epoch": 0.992217402398571, + "grad_norm": 0.75, + "learning_rate": 7.796917352068882e-05, + "loss": 0.8126, + "step": 37471 + }, + { + "epoch": 0.9922438820068654, + "grad_norm": 0.84765625, + "learning_rate": 7.796468291586896e-05, + "loss": 0.7043, + "step": 37472 + }, + { + "epoch": 0.9922703616151598, + "grad_norm": 0.796875, + "learning_rate": 7.796019235775165e-05, + "loss": 0.7434, + "step": 37473 + }, + { + "epoch": 0.9922968412234542, + "grad_norm": 0.75390625, + "learning_rate": 7.79557018463465e-05, + "loss": 0.7643, + "step": 37474 + }, + { + "epoch": 0.9923233208317486, + "grad_norm": 0.82421875, + "learning_rate": 7.795121138166297e-05, + "loss": 0.727, + "step": 37475 + }, + { + "epoch": 0.992349800440043, + "grad_norm": 0.7265625, + "learning_rate": 7.794672096371058e-05, + "loss": 0.6291, + "step": 37476 + }, + { + "epoch": 0.9923762800483373, + "grad_norm": 0.81640625, + "learning_rate": 7.794223059249888e-05, + "loss": 0.6667, + "step": 37477 + }, + { + "epoch": 0.9924027596566317, + "grad_norm": 0.78515625, + "learning_rate": 7.793774026803734e-05, + "loss": 0.7231, + "step": 37478 + }, + { + "epoch": 0.9924292392649261, + "grad_norm": 0.78515625, + "learning_rate": 7.79332499903355e-05, + "loss": 0.8344, + "step": 37479 + }, + { + "epoch": 0.9924557188732205, + "grad_norm": 0.84375, + "learning_rate": 7.792875975940287e-05, + "loss": 0.8016, + "step": 37480 + }, + { + "epoch": 0.9924821984815149, + "grad_norm": 0.90625, + "learning_rate": 7.792426957524899e-05, + "loss": 0.7848, + "step": 37481 + }, + { + "epoch": 0.9925086780898092, + "grad_norm": 0.88671875, + "learning_rate": 7.791977943788332e-05, + "loss": 0.7963, + "step": 37482 + }, + { + "epoch": 0.9925351576981036, + "grad_norm": 0.859375, + "learning_rate": 7.791528934731542e-05, + "loss": 0.7859, + "step": 37483 + }, + { + "epoch": 0.9925616373063979, + "grad_norm": 0.76953125, + "learning_rate": 7.791079930355484e-05, + "loss": 0.8118, + "step": 37484 + }, + { + "epoch": 0.9925881169146923, + "grad_norm": 0.7421875, + "learning_rate": 7.790630930661104e-05, + "loss": 0.7293, + "step": 37485 + }, + { + "epoch": 0.9926145965229867, + "grad_norm": 0.83984375, + "learning_rate": 7.790181935649357e-05, + "loss": 0.7634, + "step": 37486 + }, + { + "epoch": 0.992641076131281, + "grad_norm": 0.80859375, + "learning_rate": 7.789732945321192e-05, + "loss": 0.7817, + "step": 37487 + }, + { + "epoch": 0.9926675557395754, + "grad_norm": 0.7265625, + "learning_rate": 7.789283959677559e-05, + "loss": 0.784, + "step": 37488 + }, + { + "epoch": 0.9926940353478698, + "grad_norm": 0.7578125, + "learning_rate": 7.788834978719415e-05, + "loss": 0.8548, + "step": 37489 + }, + { + "epoch": 0.9927205149561642, + "grad_norm": 0.85546875, + "learning_rate": 7.788386002447709e-05, + "loss": 0.7914, + "step": 37490 + }, + { + "epoch": 0.9927469945644586, + "grad_norm": 0.828125, + "learning_rate": 7.787937030863393e-05, + "loss": 0.8735, + "step": 37491 + }, + { + "epoch": 0.992773474172753, + "grad_norm": 0.984375, + "learning_rate": 7.787488063967416e-05, + "loss": 0.8217, + "step": 37492 + }, + { + "epoch": 0.9927999537810474, + "grad_norm": 0.79296875, + "learning_rate": 7.787039101760729e-05, + "loss": 0.8389, + "step": 37493 + }, + { + "epoch": 0.9928264333893417, + "grad_norm": 0.8515625, + "learning_rate": 7.786590144244291e-05, + "loss": 0.8029, + "step": 37494 + }, + { + "epoch": 0.9928529129976361, + "grad_norm": 0.76953125, + "learning_rate": 7.786141191419048e-05, + "loss": 0.6965, + "step": 37495 + }, + { + "epoch": 0.9928793926059305, + "grad_norm": 0.8359375, + "learning_rate": 7.785692243285953e-05, + "loss": 0.7969, + "step": 37496 + }, + { + "epoch": 0.9929058722142249, + "grad_norm": 0.81640625, + "learning_rate": 7.785243299845954e-05, + "loss": 0.8762, + "step": 37497 + }, + { + "epoch": 0.9929323518225193, + "grad_norm": 0.7890625, + "learning_rate": 7.784794361100006e-05, + "loss": 0.86, + "step": 37498 + }, + { + "epoch": 0.9929588314308136, + "grad_norm": 0.76953125, + "learning_rate": 7.784345427049062e-05, + "loss": 0.6533, + "step": 37499 + }, + { + "epoch": 0.9929853110391079, + "grad_norm": 0.765625, + "learning_rate": 7.783896497694068e-05, + "loss": 0.8047, + "step": 37500 + }, + { + "epoch": 0.9930117906474023, + "grad_norm": 0.76171875, + "learning_rate": 7.78344757303598e-05, + "loss": 0.7002, + "step": 37501 + }, + { + "epoch": 0.9930382702556967, + "grad_norm": 0.6953125, + "learning_rate": 7.782998653075749e-05, + "loss": 0.6674, + "step": 37502 + }, + { + "epoch": 0.9930647498639911, + "grad_norm": 0.77734375, + "learning_rate": 7.782549737814324e-05, + "loss": 0.7874, + "step": 37503 + }, + { + "epoch": 0.9930912294722855, + "grad_norm": 0.796875, + "learning_rate": 7.78210082725266e-05, + "loss": 0.906, + "step": 37504 + }, + { + "epoch": 0.9931177090805798, + "grad_norm": 0.83203125, + "learning_rate": 7.781651921391706e-05, + "loss": 0.7696, + "step": 37505 + }, + { + "epoch": 0.9931441886888742, + "grad_norm": 0.79296875, + "learning_rate": 7.781203020232416e-05, + "loss": 0.9235, + "step": 37506 + }, + { + "epoch": 0.9931706682971686, + "grad_norm": 0.90625, + "learning_rate": 7.780754123775735e-05, + "loss": 0.7887, + "step": 37507 + }, + { + "epoch": 0.993197147905463, + "grad_norm": 0.74609375, + "learning_rate": 7.780305232022618e-05, + "loss": 0.7946, + "step": 37508 + }, + { + "epoch": 0.9932236275137574, + "grad_norm": 0.80078125, + "learning_rate": 7.779856344974021e-05, + "loss": 0.7708, + "step": 37509 + }, + { + "epoch": 0.9932501071220518, + "grad_norm": 0.765625, + "learning_rate": 7.779407462630891e-05, + "loss": 0.7985, + "step": 37510 + }, + { + "epoch": 0.9932765867303461, + "grad_norm": 0.80078125, + "learning_rate": 7.77895858499418e-05, + "loss": 0.7838, + "step": 37511 + }, + { + "epoch": 0.9933030663386405, + "grad_norm": 0.7578125, + "learning_rate": 7.77850971206484e-05, + "loss": 0.6761, + "step": 37512 + }, + { + "epoch": 0.9933295459469349, + "grad_norm": 0.78515625, + "learning_rate": 7.778060843843816e-05, + "loss": 0.7212, + "step": 37513 + }, + { + "epoch": 0.9933560255552293, + "grad_norm": 0.9140625, + "learning_rate": 7.777611980332072e-05, + "loss": 0.7535, + "step": 37514 + }, + { + "epoch": 0.9933825051635237, + "grad_norm": 0.71484375, + "learning_rate": 7.77716312153055e-05, + "loss": 0.7618, + "step": 37515 + }, + { + "epoch": 0.993408984771818, + "grad_norm": 0.8125, + "learning_rate": 7.776714267440204e-05, + "loss": 0.7294, + "step": 37516 + }, + { + "epoch": 0.9934354643801123, + "grad_norm": 0.6953125, + "learning_rate": 7.776265418061985e-05, + "loss": 0.8137, + "step": 37517 + }, + { + "epoch": 0.9934619439884067, + "grad_norm": 0.79296875, + "learning_rate": 7.775816573396843e-05, + "loss": 0.7795, + "step": 37518 + }, + { + "epoch": 0.9934884235967011, + "grad_norm": 0.82421875, + "learning_rate": 7.775367733445729e-05, + "loss": 0.8554, + "step": 37519 + }, + { + "epoch": 0.9935149032049955, + "grad_norm": 0.7578125, + "learning_rate": 7.7749188982096e-05, + "loss": 0.7122, + "step": 37520 + }, + { + "epoch": 0.9935413828132899, + "grad_norm": 0.75390625, + "learning_rate": 7.774470067689403e-05, + "loss": 0.8633, + "step": 37521 + }, + { + "epoch": 0.9935678624215842, + "grad_norm": 0.7578125, + "learning_rate": 7.774021241886089e-05, + "loss": 0.7309, + "step": 37522 + }, + { + "epoch": 0.9935943420298786, + "grad_norm": 0.8359375, + "learning_rate": 7.773572420800608e-05, + "loss": 0.7434, + "step": 37523 + }, + { + "epoch": 0.993620821638173, + "grad_norm": 0.82421875, + "learning_rate": 7.773123604433914e-05, + "loss": 0.7782, + "step": 37524 + }, + { + "epoch": 0.9936473012464674, + "grad_norm": 0.75, + "learning_rate": 7.772674792786957e-05, + "loss": 0.7975, + "step": 37525 + }, + { + "epoch": 0.9936737808547618, + "grad_norm": 0.83984375, + "learning_rate": 7.772225985860691e-05, + "loss": 0.8678, + "step": 37526 + }, + { + "epoch": 0.9937002604630562, + "grad_norm": 0.8125, + "learning_rate": 7.771777183656062e-05, + "loss": 0.8538, + "step": 37527 + }, + { + "epoch": 0.9937267400713505, + "grad_norm": 0.796875, + "learning_rate": 7.771328386174022e-05, + "loss": 0.7443, + "step": 37528 + }, + { + "epoch": 0.9937532196796449, + "grad_norm": 0.69921875, + "learning_rate": 7.770879593415526e-05, + "loss": 0.694, + "step": 37529 + }, + { + "epoch": 0.9937796992879393, + "grad_norm": 0.86328125, + "learning_rate": 7.770430805381526e-05, + "loss": 0.7311, + "step": 37530 + }, + { + "epoch": 0.9938061788962337, + "grad_norm": 0.796875, + "learning_rate": 7.769982022072968e-05, + "loss": 0.7463, + "step": 37531 + }, + { + "epoch": 0.9938326585045281, + "grad_norm": 0.73828125, + "learning_rate": 7.769533243490806e-05, + "loss": 0.7454, + "step": 37532 + }, + { + "epoch": 0.9938591381128223, + "grad_norm": 0.734375, + "learning_rate": 7.769084469635988e-05, + "loss": 0.7296, + "step": 37533 + }, + { + "epoch": 0.9938856177211167, + "grad_norm": 0.71484375, + "learning_rate": 7.768635700509472e-05, + "loss": 0.7563, + "step": 37534 + }, + { + "epoch": 0.9939120973294111, + "grad_norm": 0.78125, + "learning_rate": 7.768186936112202e-05, + "loss": 0.8075, + "step": 37535 + }, + { + "epoch": 0.9939385769377055, + "grad_norm": 0.78125, + "learning_rate": 7.767738176445136e-05, + "loss": 0.7074, + "step": 37536 + }, + { + "epoch": 0.9939650565459999, + "grad_norm": 0.71484375, + "learning_rate": 7.76728942150922e-05, + "loss": 0.7347, + "step": 37537 + }, + { + "epoch": 0.9939915361542943, + "grad_norm": 0.70703125, + "learning_rate": 7.766840671305403e-05, + "loss": 0.617, + "step": 37538 + }, + { + "epoch": 0.9940180157625886, + "grad_norm": 0.89453125, + "learning_rate": 7.766391925834642e-05, + "loss": 0.8272, + "step": 37539 + }, + { + "epoch": 0.994044495370883, + "grad_norm": 0.87109375, + "learning_rate": 7.765943185097886e-05, + "loss": 0.7222, + "step": 37540 + }, + { + "epoch": 0.9940709749791774, + "grad_norm": 0.7578125, + "learning_rate": 7.765494449096086e-05, + "loss": 0.7807, + "step": 37541 + }, + { + "epoch": 0.9940974545874718, + "grad_norm": 0.79296875, + "learning_rate": 7.765045717830193e-05, + "loss": 0.6803, + "step": 37542 + }, + { + "epoch": 0.9941239341957662, + "grad_norm": 0.84765625, + "learning_rate": 7.764596991301157e-05, + "loss": 0.681, + "step": 37543 + }, + { + "epoch": 0.9941504138040606, + "grad_norm": 0.77734375, + "learning_rate": 7.764148269509932e-05, + "loss": 0.7923, + "step": 37544 + }, + { + "epoch": 0.9941768934123549, + "grad_norm": 0.7578125, + "learning_rate": 7.763699552457464e-05, + "loss": 0.6822, + "step": 37545 + }, + { + "epoch": 0.9942033730206493, + "grad_norm": 0.8203125, + "learning_rate": 7.763250840144709e-05, + "loss": 0.8646, + "step": 37546 + }, + { + "epoch": 0.9942298526289437, + "grad_norm": 0.8125, + "learning_rate": 7.762802132572616e-05, + "loss": 0.7805, + "step": 37547 + }, + { + "epoch": 0.9942563322372381, + "grad_norm": 0.6640625, + "learning_rate": 7.762353429742131e-05, + "loss": 0.741, + "step": 37548 + }, + { + "epoch": 0.9942828118455324, + "grad_norm": 0.83984375, + "learning_rate": 7.761904731654215e-05, + "loss": 0.8557, + "step": 37549 + }, + { + "epoch": 0.9943092914538267, + "grad_norm": 1.3125, + "learning_rate": 7.761456038309814e-05, + "loss": 0.7435, + "step": 37550 + }, + { + "epoch": 0.9943357710621211, + "grad_norm": 0.80078125, + "learning_rate": 7.761007349709879e-05, + "loss": 0.6388, + "step": 37551 + }, + { + "epoch": 0.9943622506704155, + "grad_norm": 0.82421875, + "learning_rate": 7.760558665855362e-05, + "loss": 0.7125, + "step": 37552 + }, + { + "epoch": 0.9943887302787099, + "grad_norm": 0.7421875, + "learning_rate": 7.76010998674721e-05, + "loss": 0.7288, + "step": 37553 + }, + { + "epoch": 0.9944152098870043, + "grad_norm": 0.796875, + "learning_rate": 7.759661312386375e-05, + "loss": 0.8346, + "step": 37554 + }, + { + "epoch": 0.9944416894952987, + "grad_norm": 0.8203125, + "learning_rate": 7.759212642773814e-05, + "loss": 0.7073, + "step": 37555 + }, + { + "epoch": 0.994468169103593, + "grad_norm": 0.796875, + "learning_rate": 7.758763977910472e-05, + "loss": 0.8524, + "step": 37556 + }, + { + "epoch": 0.9944946487118874, + "grad_norm": 0.7890625, + "learning_rate": 7.758315317797304e-05, + "loss": 0.8567, + "step": 37557 + }, + { + "epoch": 0.9945211283201818, + "grad_norm": 0.9453125, + "learning_rate": 7.757866662435257e-05, + "loss": 0.7505, + "step": 37558 + }, + { + "epoch": 0.9945476079284762, + "grad_norm": 0.78515625, + "learning_rate": 7.757418011825279e-05, + "loss": 0.7273, + "step": 37559 + }, + { + "epoch": 0.9945740875367706, + "grad_norm": 0.76171875, + "learning_rate": 7.75696936596833e-05, + "loss": 0.8456, + "step": 37560 + }, + { + "epoch": 0.994600567145065, + "grad_norm": 0.8125, + "learning_rate": 7.756520724865356e-05, + "loss": 0.8743, + "step": 37561 + }, + { + "epoch": 0.9946270467533593, + "grad_norm": 0.8125, + "learning_rate": 7.756072088517308e-05, + "loss": 0.8295, + "step": 37562 + }, + { + "epoch": 0.9946535263616537, + "grad_norm": 0.82421875, + "learning_rate": 7.755623456925138e-05, + "loss": 0.7908, + "step": 37563 + }, + { + "epoch": 0.9946800059699481, + "grad_norm": 0.78515625, + "learning_rate": 7.755174830089793e-05, + "loss": 0.7492, + "step": 37564 + }, + { + "epoch": 0.9947064855782425, + "grad_norm": 0.76171875, + "learning_rate": 7.754726208012229e-05, + "loss": 0.7698, + "step": 37565 + }, + { + "epoch": 0.9947329651865368, + "grad_norm": 0.82421875, + "learning_rate": 7.754277590693394e-05, + "loss": 0.8022, + "step": 37566 + }, + { + "epoch": 0.9947594447948311, + "grad_norm": 0.76953125, + "learning_rate": 7.753828978134238e-05, + "loss": 0.8658, + "step": 37567 + }, + { + "epoch": 0.9947859244031255, + "grad_norm": 0.7890625, + "learning_rate": 7.753380370335714e-05, + "loss": 0.8678, + "step": 37568 + }, + { + "epoch": 0.9948124040114199, + "grad_norm": 0.76171875, + "learning_rate": 7.75293176729877e-05, + "loss": 0.7137, + "step": 37569 + }, + { + "epoch": 0.9948388836197143, + "grad_norm": 0.7421875, + "learning_rate": 7.752483169024361e-05, + "loss": 0.6987, + "step": 37570 + }, + { + "epoch": 0.9948653632280087, + "grad_norm": 0.71484375, + "learning_rate": 7.752034575513437e-05, + "loss": 0.7274, + "step": 37571 + }, + { + "epoch": 0.994891842836303, + "grad_norm": 0.7890625, + "learning_rate": 7.751585986766945e-05, + "loss": 0.7259, + "step": 37572 + }, + { + "epoch": 0.9949183224445974, + "grad_norm": 0.88671875, + "learning_rate": 7.751137402785839e-05, + "loss": 0.7161, + "step": 37573 + }, + { + "epoch": 0.9949448020528918, + "grad_norm": 0.85546875, + "learning_rate": 7.750688823571064e-05, + "loss": 0.8738, + "step": 37574 + }, + { + "epoch": 0.9949712816611862, + "grad_norm": 0.765625, + "learning_rate": 7.75024024912358e-05, + "loss": 0.8663, + "step": 37575 + }, + { + "epoch": 0.9949977612694806, + "grad_norm": 0.765625, + "learning_rate": 7.749791679444333e-05, + "loss": 0.7717, + "step": 37576 + }, + { + "epoch": 0.995024240877775, + "grad_norm": 0.78125, + "learning_rate": 7.749343114534276e-05, + "loss": 0.7312, + "step": 37577 + }, + { + "epoch": 0.9950507204860694, + "grad_norm": 0.79296875, + "learning_rate": 7.748894554394354e-05, + "loss": 0.7998, + "step": 37578 + }, + { + "epoch": 0.9950772000943637, + "grad_norm": 0.88671875, + "learning_rate": 7.74844599902552e-05, + "loss": 0.729, + "step": 37579 + }, + { + "epoch": 0.9951036797026581, + "grad_norm": 0.8515625, + "learning_rate": 7.747997448428729e-05, + "loss": 0.8104, + "step": 37580 + }, + { + "epoch": 0.9951301593109525, + "grad_norm": 0.78515625, + "learning_rate": 7.747548902604928e-05, + "loss": 0.724, + "step": 37581 + }, + { + "epoch": 0.9951566389192468, + "grad_norm": 0.74609375, + "learning_rate": 7.74710036155507e-05, + "loss": 0.7383, + "step": 37582 + }, + { + "epoch": 0.9951831185275412, + "grad_norm": 0.78125, + "learning_rate": 7.746651825280103e-05, + "loss": 0.7666, + "step": 37583 + }, + { + "epoch": 0.9952095981358355, + "grad_norm": 0.8671875, + "learning_rate": 7.746203293780977e-05, + "loss": 0.7894, + "step": 37584 + }, + { + "epoch": 0.9952360777441299, + "grad_norm": 0.69140625, + "learning_rate": 7.745754767058647e-05, + "loss": 0.7331, + "step": 37585 + }, + { + "epoch": 0.9952625573524243, + "grad_norm": 0.76171875, + "learning_rate": 7.745306245114056e-05, + "loss": 0.8687, + "step": 37586 + }, + { + "epoch": 0.9952890369607187, + "grad_norm": 0.78515625, + "learning_rate": 7.744857727948164e-05, + "loss": 0.7798, + "step": 37587 + }, + { + "epoch": 0.9953155165690131, + "grad_norm": 0.76953125, + "learning_rate": 7.744409215561918e-05, + "loss": 0.8701, + "step": 37588 + }, + { + "epoch": 0.9953419961773075, + "grad_norm": 0.7109375, + "learning_rate": 7.743960707956263e-05, + "loss": 0.758, + "step": 37589 + }, + { + "epoch": 0.9953684757856018, + "grad_norm": 0.7890625, + "learning_rate": 7.74351220513216e-05, + "loss": 0.6697, + "step": 37590 + }, + { + "epoch": 0.9953949553938962, + "grad_norm": 0.87109375, + "learning_rate": 7.74306370709055e-05, + "loss": 0.7535, + "step": 37591 + }, + { + "epoch": 0.9954214350021906, + "grad_norm": 0.765625, + "learning_rate": 7.74261521383239e-05, + "loss": 0.8001, + "step": 37592 + }, + { + "epoch": 0.995447914610485, + "grad_norm": 0.7265625, + "learning_rate": 7.742166725358628e-05, + "loss": 0.8601, + "step": 37593 + }, + { + "epoch": 0.9954743942187794, + "grad_norm": 0.8359375, + "learning_rate": 7.741718241670209e-05, + "loss": 0.9143, + "step": 37594 + }, + { + "epoch": 0.9955008738270738, + "grad_norm": 0.73828125, + "learning_rate": 7.741269762768093e-05, + "loss": 0.7674, + "step": 37595 + }, + { + "epoch": 0.9955273534353681, + "grad_norm": 0.88671875, + "learning_rate": 7.740821288653228e-05, + "loss": 0.8546, + "step": 37596 + }, + { + "epoch": 0.9955538330436625, + "grad_norm": 0.7421875, + "learning_rate": 7.740372819326563e-05, + "loss": 0.6777, + "step": 37597 + }, + { + "epoch": 0.9955803126519568, + "grad_norm": 0.8046875, + "learning_rate": 7.739924354789048e-05, + "loss": 0.914, + "step": 37598 + }, + { + "epoch": 0.9956067922602512, + "grad_norm": 0.71484375, + "learning_rate": 7.739475895041632e-05, + "loss": 0.7801, + "step": 37599 + }, + { + "epoch": 0.9956332718685456, + "grad_norm": 0.75390625, + "learning_rate": 7.73902744008527e-05, + "loss": 0.8, + "step": 37600 + }, + { + "epoch": 0.9956597514768399, + "grad_norm": 0.83203125, + "learning_rate": 7.738578989920909e-05, + "loss": 0.791, + "step": 37601 + }, + { + "epoch": 0.9956862310851343, + "grad_norm": 0.85546875, + "learning_rate": 7.738130544549502e-05, + "loss": 0.8068, + "step": 37602 + }, + { + "epoch": 0.9957127106934287, + "grad_norm": 0.75, + "learning_rate": 7.737682103971999e-05, + "loss": 0.7226, + "step": 37603 + }, + { + "epoch": 0.9957391903017231, + "grad_norm": 0.73828125, + "learning_rate": 7.737233668189343e-05, + "loss": 0.7496, + "step": 37604 + }, + { + "epoch": 0.9957656699100175, + "grad_norm": 0.7421875, + "learning_rate": 7.736785237202498e-05, + "loss": 0.8071, + "step": 37605 + }, + { + "epoch": 0.9957921495183119, + "grad_norm": 0.80078125, + "learning_rate": 7.736336811012405e-05, + "loss": 0.8213, + "step": 37606 + }, + { + "epoch": 0.9958186291266062, + "grad_norm": 0.8046875, + "learning_rate": 7.735888389620016e-05, + "loss": 0.7343, + "step": 37607 + }, + { + "epoch": 0.9958451087349006, + "grad_norm": 0.8984375, + "learning_rate": 7.735439973026284e-05, + "loss": 0.8687, + "step": 37608 + }, + { + "epoch": 0.995871588343195, + "grad_norm": 0.74609375, + "learning_rate": 7.734991561232155e-05, + "loss": 0.8252, + "step": 37609 + }, + { + "epoch": 0.9958980679514894, + "grad_norm": 0.81640625, + "learning_rate": 7.734543154238585e-05, + "loss": 0.7609, + "step": 37610 + }, + { + "epoch": 0.9959245475597838, + "grad_norm": 0.828125, + "learning_rate": 7.73409475204652e-05, + "loss": 0.8701, + "step": 37611 + }, + { + "epoch": 0.9959510271680782, + "grad_norm": 0.76953125, + "learning_rate": 7.733646354656911e-05, + "loss": 0.7608, + "step": 37612 + }, + { + "epoch": 0.9959775067763725, + "grad_norm": 0.7734375, + "learning_rate": 7.733197962070711e-05, + "loss": 0.8397, + "step": 37613 + }, + { + "epoch": 0.9960039863846669, + "grad_norm": 0.765625, + "learning_rate": 7.732749574288862e-05, + "loss": 0.7306, + "step": 37614 + }, + { + "epoch": 0.9960304659929612, + "grad_norm": 0.796875, + "learning_rate": 7.732301191312327e-05, + "loss": 0.7318, + "step": 37615 + }, + { + "epoch": 0.9960569456012556, + "grad_norm": 0.84765625, + "learning_rate": 7.731852813142048e-05, + "loss": 0.9463, + "step": 37616 + }, + { + "epoch": 0.99608342520955, + "grad_norm": 0.77734375, + "learning_rate": 7.731404439778979e-05, + "loss": 0.7661, + "step": 37617 + }, + { + "epoch": 0.9961099048178443, + "grad_norm": 0.7578125, + "learning_rate": 7.730956071224068e-05, + "loss": 0.705, + "step": 37618 + }, + { + "epoch": 0.9961363844261387, + "grad_norm": 0.9609375, + "learning_rate": 7.730507707478261e-05, + "loss": 0.719, + "step": 37619 + }, + { + "epoch": 0.9961628640344331, + "grad_norm": 0.78515625, + "learning_rate": 7.730059348542519e-05, + "loss": 0.7985, + "step": 37620 + }, + { + "epoch": 0.9961893436427275, + "grad_norm": 0.88671875, + "learning_rate": 7.729610994417785e-05, + "loss": 0.8381, + "step": 37621 + }, + { + "epoch": 0.9962158232510219, + "grad_norm": 0.75390625, + "learning_rate": 7.729162645105011e-05, + "loss": 0.791, + "step": 37622 + }, + { + "epoch": 0.9962423028593163, + "grad_norm": 1.0078125, + "learning_rate": 7.728714300605148e-05, + "loss": 0.7872, + "step": 37623 + }, + { + "epoch": 0.9962687824676106, + "grad_norm": 0.82421875, + "learning_rate": 7.72826596091914e-05, + "loss": 0.7139, + "step": 37624 + }, + { + "epoch": 0.996295262075905, + "grad_norm": 0.77734375, + "learning_rate": 7.727817626047947e-05, + "loss": 0.7615, + "step": 37625 + }, + { + "epoch": 0.9963217416841994, + "grad_norm": 0.83203125, + "learning_rate": 7.727369295992515e-05, + "loss": 0.8466, + "step": 37626 + }, + { + "epoch": 0.9963482212924938, + "grad_norm": 0.83203125, + "learning_rate": 7.726920970753792e-05, + "loss": 0.8772, + "step": 37627 + }, + { + "epoch": 0.9963747009007882, + "grad_norm": 0.828125, + "learning_rate": 7.726472650332732e-05, + "loss": 0.7708, + "step": 37628 + }, + { + "epoch": 0.9964011805090826, + "grad_norm": 0.7890625, + "learning_rate": 7.726024334730281e-05, + "loss": 0.7187, + "step": 37629 + }, + { + "epoch": 0.9964276601173769, + "grad_norm": 0.8515625, + "learning_rate": 7.725576023947393e-05, + "loss": 0.9438, + "step": 37630 + }, + { + "epoch": 0.9964541397256712, + "grad_norm": 0.78125, + "learning_rate": 7.725127717985017e-05, + "loss": 0.8697, + "step": 37631 + }, + { + "epoch": 0.9964806193339656, + "grad_norm": 0.77734375, + "learning_rate": 7.724679416844102e-05, + "loss": 0.8013, + "step": 37632 + }, + { + "epoch": 0.99650709894226, + "grad_norm": 0.76171875, + "learning_rate": 7.7242311205256e-05, + "loss": 0.8202, + "step": 37633 + }, + { + "epoch": 0.9965335785505544, + "grad_norm": 0.7890625, + "learning_rate": 7.723782829030456e-05, + "loss": 0.8537, + "step": 37634 + }, + { + "epoch": 0.9965600581588487, + "grad_norm": 0.80078125, + "learning_rate": 7.723334542359627e-05, + "loss": 0.8326, + "step": 37635 + }, + { + "epoch": 0.9965865377671431, + "grad_norm": 0.7890625, + "learning_rate": 7.722886260514062e-05, + "loss": 0.8258, + "step": 37636 + }, + { + "epoch": 0.9966130173754375, + "grad_norm": 0.82421875, + "learning_rate": 7.722437983494708e-05, + "loss": 0.7711, + "step": 37637 + }, + { + "epoch": 0.9966394969837319, + "grad_norm": 0.80859375, + "learning_rate": 7.721989711302518e-05, + "loss": 0.7714, + "step": 37638 + }, + { + "epoch": 0.9966659765920263, + "grad_norm": 0.85546875, + "learning_rate": 7.721541443938435e-05, + "loss": 0.7901, + "step": 37639 + }, + { + "epoch": 0.9966924562003207, + "grad_norm": 0.76953125, + "learning_rate": 7.721093181403418e-05, + "loss": 0.8906, + "step": 37640 + }, + { + "epoch": 0.996718935808615, + "grad_norm": 0.73828125, + "learning_rate": 7.720644923698415e-05, + "loss": 0.7662, + "step": 37641 + }, + { + "epoch": 0.9967454154169094, + "grad_norm": 0.7578125, + "learning_rate": 7.720196670824375e-05, + "loss": 0.6873, + "step": 37642 + }, + { + "epoch": 0.9967718950252038, + "grad_norm": 0.78515625, + "learning_rate": 7.71974842278225e-05, + "loss": 0.7639, + "step": 37643 + }, + { + "epoch": 0.9967983746334982, + "grad_norm": 0.7890625, + "learning_rate": 7.71930017957298e-05, + "loss": 0.8121, + "step": 37644 + }, + { + "epoch": 0.9968248542417926, + "grad_norm": 0.80078125, + "learning_rate": 7.71885194119753e-05, + "loss": 0.8095, + "step": 37645 + }, + { + "epoch": 0.996851333850087, + "grad_norm": 0.87109375, + "learning_rate": 7.71840370765684e-05, + "loss": 0.7619, + "step": 37646 + }, + { + "epoch": 0.9968778134583812, + "grad_norm": 0.7734375, + "learning_rate": 7.717955478951863e-05, + "loss": 0.7381, + "step": 37647 + }, + { + "epoch": 0.9969042930666756, + "grad_norm": 0.70703125, + "learning_rate": 7.717507255083552e-05, + "loss": 0.6976, + "step": 37648 + }, + { + "epoch": 0.99693077267497, + "grad_norm": 0.765625, + "learning_rate": 7.71705903605285e-05, + "loss": 0.7346, + "step": 37649 + }, + { + "epoch": 0.9969572522832644, + "grad_norm": 0.80078125, + "learning_rate": 7.716610821860713e-05, + "loss": 0.8884, + "step": 37650 + }, + { + "epoch": 0.9969837318915588, + "grad_norm": 0.84765625, + "learning_rate": 7.716162612508089e-05, + "loss": 0.7013, + "step": 37651 + }, + { + "epoch": 0.9970102114998531, + "grad_norm": 0.78515625, + "learning_rate": 7.715714407995927e-05, + "loss": 0.7706, + "step": 37652 + }, + { + "epoch": 0.9970366911081475, + "grad_norm": 0.79296875, + "learning_rate": 7.715266208325177e-05, + "loss": 0.7916, + "step": 37653 + }, + { + "epoch": 0.9970631707164419, + "grad_norm": 0.76171875, + "learning_rate": 7.71481801349679e-05, + "loss": 0.794, + "step": 37654 + }, + { + "epoch": 0.9970896503247363, + "grad_norm": 0.74609375, + "learning_rate": 7.714369823511716e-05, + "loss": 0.7532, + "step": 37655 + }, + { + "epoch": 0.9971161299330307, + "grad_norm": 0.80078125, + "learning_rate": 7.713921638370905e-05, + "loss": 0.9179, + "step": 37656 + }, + { + "epoch": 0.997142609541325, + "grad_norm": 0.84375, + "learning_rate": 7.713473458075306e-05, + "loss": 0.8384, + "step": 37657 + }, + { + "epoch": 0.9971690891496194, + "grad_norm": 0.7421875, + "learning_rate": 7.713025282625872e-05, + "loss": 0.7392, + "step": 37658 + }, + { + "epoch": 0.9971955687579138, + "grad_norm": 0.81640625, + "learning_rate": 7.712577112023543e-05, + "loss": 0.8002, + "step": 37659 + }, + { + "epoch": 0.9972220483662082, + "grad_norm": 0.78515625, + "learning_rate": 7.712128946269281e-05, + "loss": 0.7411, + "step": 37660 + }, + { + "epoch": 0.9972485279745026, + "grad_norm": 0.80859375, + "learning_rate": 7.711680785364032e-05, + "loss": 0.8139, + "step": 37661 + }, + { + "epoch": 0.997275007582797, + "grad_norm": 0.8046875, + "learning_rate": 7.711232629308744e-05, + "loss": 0.8859, + "step": 37662 + }, + { + "epoch": 0.9973014871910914, + "grad_norm": 0.78125, + "learning_rate": 7.710784478104367e-05, + "loss": 0.7968, + "step": 37663 + }, + { + "epoch": 0.9973279667993856, + "grad_norm": 0.73828125, + "learning_rate": 7.710336331751848e-05, + "loss": 0.8646, + "step": 37664 + }, + { + "epoch": 0.99735444640768, + "grad_norm": 0.7421875, + "learning_rate": 7.709888190252144e-05, + "loss": 0.6562, + "step": 37665 + }, + { + "epoch": 0.9973809260159744, + "grad_norm": 0.7578125, + "learning_rate": 7.709440053606203e-05, + "loss": 0.8248, + "step": 37666 + }, + { + "epoch": 0.9974074056242688, + "grad_norm": 0.8828125, + "learning_rate": 7.708991921814971e-05, + "loss": 0.7795, + "step": 37667 + }, + { + "epoch": 0.9974338852325632, + "grad_norm": 0.82421875, + "learning_rate": 7.7085437948794e-05, + "loss": 0.7571, + "step": 37668 + }, + { + "epoch": 0.9974603648408575, + "grad_norm": 0.77734375, + "learning_rate": 7.708095672800437e-05, + "loss": 0.7826, + "step": 37669 + }, + { + "epoch": 0.9974868444491519, + "grad_norm": 0.734375, + "learning_rate": 7.707647555579038e-05, + "loss": 0.6782, + "step": 37670 + }, + { + "epoch": 0.9975133240574463, + "grad_norm": 0.74609375, + "learning_rate": 7.707199443216145e-05, + "loss": 0.8293, + "step": 37671 + }, + { + "epoch": 0.9975398036657407, + "grad_norm": 0.78125, + "learning_rate": 7.706751335712714e-05, + "loss": 0.8664, + "step": 37672 + }, + { + "epoch": 0.9975662832740351, + "grad_norm": 0.77734375, + "learning_rate": 7.706303233069692e-05, + "loss": 0.6835, + "step": 37673 + }, + { + "epoch": 0.9975927628823295, + "grad_norm": 0.7421875, + "learning_rate": 7.70585513528803e-05, + "loss": 0.7025, + "step": 37674 + }, + { + "epoch": 0.9976192424906238, + "grad_norm": 6.03125, + "learning_rate": 7.705407042368675e-05, + "loss": 0.7934, + "step": 37675 + }, + { + "epoch": 0.9976457220989182, + "grad_norm": 0.80078125, + "learning_rate": 7.704958954312581e-05, + "loss": 0.7656, + "step": 37676 + }, + { + "epoch": 0.9976722017072126, + "grad_norm": 0.84765625, + "learning_rate": 7.704510871120695e-05, + "loss": 0.8597, + "step": 37677 + }, + { + "epoch": 0.997698681315507, + "grad_norm": 0.81640625, + "learning_rate": 7.704062792793967e-05, + "loss": 0.6929, + "step": 37678 + }, + { + "epoch": 0.9977251609238014, + "grad_norm": 0.84375, + "learning_rate": 7.703614719333344e-05, + "loss": 0.7905, + "step": 37679 + }, + { + "epoch": 0.9977516405320956, + "grad_norm": 0.83984375, + "learning_rate": 7.703166650739779e-05, + "loss": 0.6665, + "step": 37680 + }, + { + "epoch": 0.99777812014039, + "grad_norm": 0.6953125, + "learning_rate": 7.702718587014224e-05, + "loss": 0.7891, + "step": 37681 + }, + { + "epoch": 0.9978045997486844, + "grad_norm": 0.80859375, + "learning_rate": 7.702270528157625e-05, + "loss": 0.7359, + "step": 37682 + }, + { + "epoch": 0.9978310793569788, + "grad_norm": 0.7734375, + "learning_rate": 7.70182247417093e-05, + "loss": 0.7335, + "step": 37683 + }, + { + "epoch": 0.9978575589652732, + "grad_norm": 0.7890625, + "learning_rate": 7.701374425055091e-05, + "loss": 0.8848, + "step": 37684 + }, + { + "epoch": 0.9978840385735676, + "grad_norm": 0.8359375, + "learning_rate": 7.700926380811058e-05, + "loss": 0.7827, + "step": 37685 + }, + { + "epoch": 0.9979105181818619, + "grad_norm": 0.81640625, + "learning_rate": 7.700478341439782e-05, + "loss": 0.7397, + "step": 37686 + }, + { + "epoch": 0.9979369977901563, + "grad_norm": 1.1015625, + "learning_rate": 7.70003030694221e-05, + "loss": 0.8682, + "step": 37687 + }, + { + "epoch": 0.9979634773984507, + "grad_norm": 1.3828125, + "learning_rate": 7.699582277319292e-05, + "loss": 0.8779, + "step": 37688 + }, + { + "epoch": 0.9979899570067451, + "grad_norm": 0.8828125, + "learning_rate": 7.699134252571976e-05, + "loss": 0.8014, + "step": 37689 + }, + { + "epoch": 0.9980164366150395, + "grad_norm": 0.81640625, + "learning_rate": 7.698686232701215e-05, + "loss": 0.8, + "step": 37690 + }, + { + "epoch": 0.9980429162233339, + "grad_norm": 0.8359375, + "learning_rate": 7.698238217707958e-05, + "loss": 0.9196, + "step": 37691 + }, + { + "epoch": 0.9980693958316282, + "grad_norm": 0.77734375, + "learning_rate": 7.697790207593152e-05, + "loss": 0.7764, + "step": 37692 + }, + { + "epoch": 0.9980958754399226, + "grad_norm": 0.72265625, + "learning_rate": 7.69734220235775e-05, + "loss": 0.6807, + "step": 37693 + }, + { + "epoch": 0.998122355048217, + "grad_norm": 0.8515625, + "learning_rate": 7.696894202002699e-05, + "loss": 0.8906, + "step": 37694 + }, + { + "epoch": 0.9981488346565114, + "grad_norm": 0.81640625, + "learning_rate": 7.696446206528946e-05, + "loss": 0.8696, + "step": 37695 + }, + { + "epoch": 0.9981753142648057, + "grad_norm": 0.82421875, + "learning_rate": 7.695998215937448e-05, + "loss": 0.921, + "step": 37696 + }, + { + "epoch": 0.9982017938731, + "grad_norm": 0.8203125, + "learning_rate": 7.695550230229149e-05, + "loss": 0.8944, + "step": 37697 + }, + { + "epoch": 0.9982282734813944, + "grad_norm": 0.82421875, + "learning_rate": 7.695102249404999e-05, + "loss": 0.7531, + "step": 37698 + }, + { + "epoch": 0.9982547530896888, + "grad_norm": 0.75, + "learning_rate": 7.694654273465949e-05, + "loss": 0.7071, + "step": 37699 + }, + { + "epoch": 0.9982812326979832, + "grad_norm": 0.7890625, + "learning_rate": 7.694206302412942e-05, + "loss": 0.7739, + "step": 37700 + }, + { + "epoch": 0.9983077123062776, + "grad_norm": 0.73046875, + "learning_rate": 7.693758336246937e-05, + "loss": 0.7412, + "step": 37701 + }, + { + "epoch": 0.998334191914572, + "grad_norm": 0.765625, + "learning_rate": 7.69331037496888e-05, + "loss": 0.6809, + "step": 37702 + }, + { + "epoch": 0.9983606715228663, + "grad_norm": 0.8046875, + "learning_rate": 7.69286241857972e-05, + "loss": 0.7564, + "step": 37703 + }, + { + "epoch": 0.9983871511311607, + "grad_norm": 0.78515625, + "learning_rate": 7.692414467080407e-05, + "loss": 0.7197, + "step": 37704 + }, + { + "epoch": 0.9984136307394551, + "grad_norm": 0.84765625, + "learning_rate": 7.691966520471885e-05, + "loss": 0.8655, + "step": 37705 + }, + { + "epoch": 0.9984401103477495, + "grad_norm": 0.828125, + "learning_rate": 7.691518578755112e-05, + "loss": 0.8258, + "step": 37706 + }, + { + "epoch": 0.9984665899560439, + "grad_norm": 1.453125, + "learning_rate": 7.691070641931033e-05, + "loss": 0.7362, + "step": 37707 + }, + { + "epoch": 0.9984930695643383, + "grad_norm": 0.75, + "learning_rate": 7.6906227100006e-05, + "loss": 0.7852, + "step": 37708 + }, + { + "epoch": 0.9985195491726326, + "grad_norm": 0.7734375, + "learning_rate": 7.690174782964756e-05, + "loss": 0.7265, + "step": 37709 + }, + { + "epoch": 0.998546028780927, + "grad_norm": 0.85546875, + "learning_rate": 7.689726860824453e-05, + "loss": 0.862, + "step": 37710 + }, + { + "epoch": 0.9985725083892214, + "grad_norm": 0.70703125, + "learning_rate": 7.689278943580646e-05, + "loss": 0.8022, + "step": 37711 + }, + { + "epoch": 0.9985989879975158, + "grad_norm": 0.79296875, + "learning_rate": 7.68883103123428e-05, + "loss": 0.7964, + "step": 37712 + }, + { + "epoch": 0.9986254676058101, + "grad_norm": 0.80859375, + "learning_rate": 7.688383123786305e-05, + "loss": 0.7423, + "step": 37713 + }, + { + "epoch": 0.9986519472141044, + "grad_norm": 0.72265625, + "learning_rate": 7.68793522123767e-05, + "loss": 0.7045, + "step": 37714 + }, + { + "epoch": 0.9986784268223988, + "grad_norm": 0.7734375, + "learning_rate": 7.68748732358932e-05, + "loss": 0.8098, + "step": 37715 + }, + { + "epoch": 0.9987049064306932, + "grad_norm": 0.859375, + "learning_rate": 7.687039430842212e-05, + "loss": 0.7934, + "step": 37716 + }, + { + "epoch": 0.9987313860389876, + "grad_norm": 0.7265625, + "learning_rate": 7.686591542997292e-05, + "loss": 0.7439, + "step": 37717 + }, + { + "epoch": 0.998757865647282, + "grad_norm": 0.76953125, + "learning_rate": 7.686143660055509e-05, + "loss": 0.6693, + "step": 37718 + }, + { + "epoch": 0.9987843452555764, + "grad_norm": 0.828125, + "learning_rate": 7.685695782017812e-05, + "loss": 0.7707, + "step": 37719 + }, + { + "epoch": 0.9988108248638707, + "grad_norm": 0.8671875, + "learning_rate": 7.685247908885146e-05, + "loss": 0.8705, + "step": 37720 + }, + { + "epoch": 0.9988373044721651, + "grad_norm": 0.75, + "learning_rate": 7.684800040658468e-05, + "loss": 0.6799, + "step": 37721 + }, + { + "epoch": 0.9988637840804595, + "grad_norm": 0.82421875, + "learning_rate": 7.684352177338726e-05, + "loss": 0.773, + "step": 37722 + }, + { + "epoch": 0.9988902636887539, + "grad_norm": 0.7578125, + "learning_rate": 7.683904318926866e-05, + "loss": 0.7833, + "step": 37723 + }, + { + "epoch": 0.9989167432970483, + "grad_norm": 0.8046875, + "learning_rate": 7.68345646542384e-05, + "loss": 0.8052, + "step": 37724 + }, + { + "epoch": 0.9989432229053427, + "grad_norm": 0.73828125, + "learning_rate": 7.68300861683059e-05, + "loss": 0.771, + "step": 37725 + }, + { + "epoch": 0.998969702513637, + "grad_norm": 0.78515625, + "learning_rate": 7.682560773148075e-05, + "loss": 0.8149, + "step": 37726 + }, + { + "epoch": 0.9989961821219314, + "grad_norm": 0.72265625, + "learning_rate": 7.68211293437724e-05, + "loss": 0.6675, + "step": 37727 + }, + { + "epoch": 0.9990226617302258, + "grad_norm": 0.7890625, + "learning_rate": 7.681665100519034e-05, + "loss": 0.7651, + "step": 37728 + }, + { + "epoch": 0.9990491413385201, + "grad_norm": 0.703125, + "learning_rate": 7.681217271574407e-05, + "loss": 0.8266, + "step": 37729 + }, + { + "epoch": 0.9990756209468145, + "grad_norm": 0.796875, + "learning_rate": 7.680769447544303e-05, + "loss": 0.7295, + "step": 37730 + }, + { + "epoch": 0.9991021005551088, + "grad_norm": 0.765625, + "learning_rate": 7.680321628429679e-05, + "loss": 0.8975, + "step": 37731 + }, + { + "epoch": 0.9991285801634032, + "grad_norm": 0.77734375, + "learning_rate": 7.679873814231481e-05, + "loss": 0.809, + "step": 37732 + }, + { + "epoch": 0.9991550597716976, + "grad_norm": 0.765625, + "learning_rate": 7.67942600495066e-05, + "loss": 0.7446, + "step": 37733 + }, + { + "epoch": 0.999181539379992, + "grad_norm": 0.8203125, + "learning_rate": 7.67897820058816e-05, + "loss": 0.7862, + "step": 37734 + }, + { + "epoch": 0.9992080189882864, + "grad_norm": 0.75390625, + "learning_rate": 7.678530401144932e-05, + "loss": 0.8964, + "step": 37735 + }, + { + "epoch": 0.9992344985965808, + "grad_norm": 0.9453125, + "learning_rate": 7.678082606621929e-05, + "loss": 0.7678, + "step": 37736 + }, + { + "epoch": 0.9992609782048751, + "grad_norm": 0.7890625, + "learning_rate": 7.677634817020096e-05, + "loss": 0.7941, + "step": 37737 + }, + { + "epoch": 0.9992874578131695, + "grad_norm": 0.70703125, + "learning_rate": 7.677187032340381e-05, + "loss": 0.7564, + "step": 37738 + }, + { + "epoch": 0.9993139374214639, + "grad_norm": 0.71484375, + "learning_rate": 7.676739252583739e-05, + "loss": 0.695, + "step": 37739 + }, + { + "epoch": 0.9993404170297583, + "grad_norm": 0.79296875, + "learning_rate": 7.676291477751111e-05, + "loss": 0.708, + "step": 37740 + }, + { + "epoch": 0.9993668966380527, + "grad_norm": 0.76171875, + "learning_rate": 7.675843707843455e-05, + "loss": 0.7906, + "step": 37741 + }, + { + "epoch": 0.999393376246347, + "grad_norm": 0.82421875, + "learning_rate": 7.675395942861715e-05, + "loss": 0.7871, + "step": 37742 + }, + { + "epoch": 0.9994198558546414, + "grad_norm": 0.7421875, + "learning_rate": 7.674948182806839e-05, + "loss": 0.6801, + "step": 37743 + }, + { + "epoch": 0.9994463354629358, + "grad_norm": 0.80859375, + "learning_rate": 7.674500427679779e-05, + "loss": 0.8638, + "step": 37744 + }, + { + "epoch": 0.9994728150712301, + "grad_norm": 0.71484375, + "learning_rate": 7.674052677481477e-05, + "loss": 0.7462, + "step": 37745 + }, + { + "epoch": 0.9994992946795245, + "grad_norm": 0.84375, + "learning_rate": 7.673604932212892e-05, + "loss": 0.9259, + "step": 37746 + }, + { + "epoch": 0.9995257742878189, + "grad_norm": 0.74609375, + "learning_rate": 7.673157191874968e-05, + "loss": 0.8082, + "step": 37747 + }, + { + "epoch": 0.9995522538961132, + "grad_norm": 0.80078125, + "learning_rate": 7.672709456468655e-05, + "loss": 0.763, + "step": 37748 + }, + { + "epoch": 0.9995787335044076, + "grad_norm": 0.828125, + "learning_rate": 7.672261725994902e-05, + "loss": 0.8387, + "step": 37749 + }, + { + "epoch": 0.999605213112702, + "grad_norm": 0.8359375, + "learning_rate": 7.671814000454651e-05, + "loss": 0.8578, + "step": 37750 + }, + { + "epoch": 0.9996316927209964, + "grad_norm": 0.90625, + "learning_rate": 7.671366279848863e-05, + "loss": 0.867, + "step": 37751 + }, + { + "epoch": 0.9996581723292908, + "grad_norm": 0.78125, + "learning_rate": 7.67091856417848e-05, + "loss": 0.6541, + "step": 37752 + }, + { + "epoch": 0.9996846519375852, + "grad_norm": 0.82421875, + "learning_rate": 7.670470853444452e-05, + "loss": 0.7566, + "step": 37753 + }, + { + "epoch": 0.9997111315458795, + "grad_norm": 0.79296875, + "learning_rate": 7.670023147647729e-05, + "loss": 0.7997, + "step": 37754 + }, + { + "epoch": 0.9997376111541739, + "grad_norm": 0.8515625, + "learning_rate": 7.669575446789255e-05, + "loss": 0.7718, + "step": 37755 + }, + { + "epoch": 0.9997640907624683, + "grad_norm": 0.796875, + "learning_rate": 7.669127750869987e-05, + "loss": 0.8023, + "step": 37756 + }, + { + "epoch": 0.9997905703707627, + "grad_norm": 0.81640625, + "learning_rate": 7.668680059890866e-05, + "loss": 0.7616, + "step": 37757 + }, + { + "epoch": 0.9998170499790571, + "grad_norm": 0.78515625, + "learning_rate": 7.668232373852847e-05, + "loss": 0.6713, + "step": 37758 + }, + { + "epoch": 0.9998435295873515, + "grad_norm": 0.8828125, + "learning_rate": 7.667784692756875e-05, + "loss": 0.7796, + "step": 37759 + }, + { + "epoch": 0.9998700091956458, + "grad_norm": 0.9140625, + "learning_rate": 7.667337016603899e-05, + "loss": 0.8406, + "step": 37760 + }, + { + "epoch": 0.9998964888039402, + "grad_norm": 0.86328125, + "learning_rate": 7.666889345394872e-05, + "loss": 0.7383, + "step": 37761 + }, + { + "epoch": 0.9999229684122345, + "grad_norm": 0.7265625, + "learning_rate": 7.666441679130737e-05, + "loss": 0.7733, + "step": 37762 + }, + { + "epoch": 0.9999494480205289, + "grad_norm": 0.765625, + "learning_rate": 7.665994017812447e-05, + "loss": 0.8468, + "step": 37763 + }, + { + "epoch": 0.9999759276288233, + "grad_norm": 0.80078125, + "learning_rate": 7.66554636144095e-05, + "loss": 0.7439, + "step": 37764 + }, + { + "epoch": 0.9999759276288233, + "step": 37764, + "total_flos": 1.0820185937641524e+20, + "train_loss": 0.07852082892805701, + "train_runtime": 12569.6366, + "train_samples_per_second": 396.589, + "train_steps_per_second": 3.004 + }, + { + "epoch": 0.00015765184739301172, + "grad_norm": 1.5234375, + "learning_rate": 7.733197962070711e-05, + "loss": 2.0038, + "step": 1 + }, + { + "epoch": 0.00031530369478602343, + "grad_norm": 7.21875, + "learning_rate": 7.732749574288862e-05, + "loss": 2.1873, + "step": 2 + }, + { + "epoch": 0.0004729555421790352, + "grad_norm": 2.109375, + "learning_rate": 7.732301191312327e-05, + "loss": 1.8105, + "step": 3 + }, + { + "epoch": 0.0006306073895720469, + "grad_norm": 1.7578125, + "learning_rate": 7.731852813142048e-05, + "loss": 1.6517, + "step": 4 + }, + { + "epoch": 0.0007882592369650586, + "grad_norm": 2.265625, + "learning_rate": 7.731404439778979e-05, + "loss": 1.6581, + "step": 5 + }, + { + "epoch": 0.0009459110843580704, + "grad_norm": 1.4453125, + "learning_rate": 7.730956071224068e-05, + "loss": 1.586, + "step": 6 + }, + { + "epoch": 0.001103562931751082, + "grad_norm": 1.4453125, + "learning_rate": 7.730507707478261e-05, + "loss": 1.6468, + "step": 7 + }, + { + "epoch": 0.0012612147791440937, + "grad_norm": 1.109375, + "learning_rate": 7.730059348542519e-05, + "loss": 1.4984, + "step": 8 + }, + { + "epoch": 0.0014188666265371056, + "grad_norm": 1.3203125, + "learning_rate": 7.729610994417785e-05, + "loss": 1.7313, + "step": 9 + }, + { + "epoch": 0.0015765184739301172, + "grad_norm": 1.140625, + "learning_rate": 7.729162645105011e-05, + "loss": 1.8015, + "step": 10 + }, + { + "epoch": 0.001734170321323129, + "grad_norm": 1.4453125, + "learning_rate": 7.728714300605148e-05, + "loss": 1.6603, + "step": 11 + }, + { + "epoch": 0.0018918221687161407, + "grad_norm": 1.15625, + "learning_rate": 7.72826596091914e-05, + "loss": 1.4449, + "step": 12 + }, + { + "epoch": 0.0020494740161091526, + "grad_norm": 1.1875, + "learning_rate": 7.727817626047947e-05, + "loss": 1.5075, + "step": 13 + }, + { + "epoch": 0.002207125863502164, + "grad_norm": 1.2890625, + "learning_rate": 7.727369295992515e-05, + "loss": 1.6191, + "step": 14 + }, + { + "epoch": 0.002364777710895176, + "grad_norm": 1.2578125, + "learning_rate": 7.726920970753792e-05, + "loss": 1.8938, + "step": 15 + }, + { + "epoch": 0.0025224295582881875, + "grad_norm": 1.2109375, + "learning_rate": 7.726472650332732e-05, + "loss": 1.4453, + "step": 16 + }, + { + "epoch": 0.002680081405681199, + "grad_norm": 1.2265625, + "learning_rate": 7.726024334730281e-05, + "loss": 1.805, + "step": 17 + }, + { + "epoch": 0.002837733253074211, + "grad_norm": 1.2578125, + "learning_rate": 7.725576023947393e-05, + "loss": 1.5799, + "step": 18 + }, + { + "epoch": 0.002995385100467223, + "grad_norm": 1.1015625, + "learning_rate": 7.725127717985017e-05, + "loss": 1.5906, + "step": 19 + }, + { + "epoch": 0.0031530369478602344, + "grad_norm": 1.2265625, + "learning_rate": 7.724679416844102e-05, + "loss": 1.7925, + "step": 20 + }, + { + "epoch": 0.003310688795253246, + "grad_norm": 1.1640625, + "learning_rate": 7.7242311205256e-05, + "loss": 1.7829, + "step": 21 + }, + { + "epoch": 0.003468340642646258, + "grad_norm": 1.125, + "learning_rate": 7.723782829030456e-05, + "loss": 1.4992, + "step": 22 + }, + { + "epoch": 0.0036259924900392698, + "grad_norm": 1.171875, + "learning_rate": 7.723334542359627e-05, + "loss": 1.5473, + "step": 23 + }, + { + "epoch": 0.0037836443374322814, + "grad_norm": 1.21875, + "learning_rate": 7.722886260514062e-05, + "loss": 1.5765, + "step": 24 + }, + { + "epoch": 0.0039412961848252935, + "grad_norm": 1.1015625, + "learning_rate": 7.722437983494708e-05, + "loss": 1.484, + "step": 25 + }, + { + "epoch": 0.004098948032218305, + "grad_norm": 1.2265625, + "learning_rate": 7.721989711302518e-05, + "loss": 1.5618, + "step": 26 + }, + { + "epoch": 0.004256599879611317, + "grad_norm": 1.1484375, + "learning_rate": 7.721541443938435e-05, + "loss": 1.4351, + "step": 27 + }, + { + "epoch": 0.004414251727004328, + "grad_norm": 1.109375, + "learning_rate": 7.721093181403418e-05, + "loss": 1.4263, + "step": 28 + }, + { + "epoch": 0.00457190357439734, + "grad_norm": 1.0625, + "learning_rate": 7.720644923698415e-05, + "loss": 1.2921, + "step": 29 + }, + { + "epoch": 0.004729555421790352, + "grad_norm": 1.1875, + "learning_rate": 7.720196670824375e-05, + "loss": 1.6779, + "step": 30 + }, + { + "epoch": 0.004887207269183363, + "grad_norm": 1.0078125, + "learning_rate": 7.71974842278225e-05, + "loss": 1.4489, + "step": 31 + }, + { + "epoch": 0.005044859116576375, + "grad_norm": 1.03125, + "learning_rate": 7.71930017957298e-05, + "loss": 1.4959, + "step": 32 + }, + { + "epoch": 0.0052025109639693866, + "grad_norm": 0.96875, + "learning_rate": 7.71885194119753e-05, + "loss": 1.3712, + "step": 33 + }, + { + "epoch": 0.005360162811362398, + "grad_norm": 1.1171875, + "learning_rate": 7.71840370765684e-05, + "loss": 1.6498, + "step": 34 + }, + { + "epoch": 0.005517814658755411, + "grad_norm": 1.0546875, + "learning_rate": 7.717955478951863e-05, + "loss": 1.6476, + "step": 35 + }, + { + "epoch": 0.005675466506148422, + "grad_norm": 1.15625, + "learning_rate": 7.717507255083552e-05, + "loss": 1.5398, + "step": 36 + }, + { + "epoch": 0.005833118353541434, + "grad_norm": 1.078125, + "learning_rate": 7.71705903605285e-05, + "loss": 1.5979, + "step": 37 + }, + { + "epoch": 0.005990770200934446, + "grad_norm": 1.328125, + "learning_rate": 7.716610821860713e-05, + "loss": 1.5245, + "step": 38 + }, + { + "epoch": 0.006148422048327457, + "grad_norm": 1.0078125, + "learning_rate": 7.716162612508089e-05, + "loss": 1.2426, + "step": 39 + }, + { + "epoch": 0.006306073895720469, + "grad_norm": 1.0625, + "learning_rate": 7.715714407995927e-05, + "loss": 1.3583, + "step": 40 + }, + { + "epoch": 0.0064637257431134805, + "grad_norm": 1.0390625, + "learning_rate": 7.715266208325177e-05, + "loss": 1.6029, + "step": 41 + }, + { + "epoch": 0.006621377590506492, + "grad_norm": 1.1328125, + "learning_rate": 7.71481801349679e-05, + "loss": 1.7693, + "step": 42 + }, + { + "epoch": 0.006779029437899504, + "grad_norm": 1.140625, + "learning_rate": 7.714369823511716e-05, + "loss": 1.6219, + "step": 43 + }, + { + "epoch": 0.006936681285292516, + "grad_norm": 0.9765625, + "learning_rate": 7.713921638370905e-05, + "loss": 1.255, + "step": 44 + }, + { + "epoch": 0.007094333132685528, + "grad_norm": 0.98828125, + "learning_rate": 7.713473458075306e-05, + "loss": 1.646, + "step": 45 + }, + { + "epoch": 0.0072519849800785395, + "grad_norm": 1.1015625, + "learning_rate": 7.713025282625872e-05, + "loss": 1.4255, + "step": 46 + }, + { + "epoch": 0.007409636827471551, + "grad_norm": 1.0234375, + "learning_rate": 7.712577112023543e-05, + "loss": 1.4606, + "step": 47 + }, + { + "epoch": 0.007567288674864563, + "grad_norm": 1.046875, + "learning_rate": 7.712128946269281e-05, + "loss": 1.3715, + "step": 48 + }, + { + "epoch": 0.0077249405222575744, + "grad_norm": 1.03125, + "learning_rate": 7.711680785364032e-05, + "loss": 1.3441, + "step": 49 + }, + { + "epoch": 0.007882592369650587, + "grad_norm": 1.078125, + "learning_rate": 7.711232629308744e-05, + "loss": 1.4596, + "step": 50 + }, + { + "epoch": 0.008040244217043599, + "grad_norm": 1.0859375, + "learning_rate": 7.710784478104367e-05, + "loss": 1.3161, + "step": 51 + }, + { + "epoch": 0.00819789606443661, + "grad_norm": 2.59375, + "learning_rate": 7.710336331751848e-05, + "loss": 1.5608, + "step": 52 + }, + { + "epoch": 0.008355547911829622, + "grad_norm": 0.96484375, + "learning_rate": 7.709888190252144e-05, + "loss": 1.3919, + "step": 53 + }, + { + "epoch": 0.008513199759222633, + "grad_norm": 1.171875, + "learning_rate": 7.709440053606203e-05, + "loss": 1.6252, + "step": 54 + }, + { + "epoch": 0.008670851606615645, + "grad_norm": 1.1796875, + "learning_rate": 7.708991921814971e-05, + "loss": 1.5166, + "step": 55 + }, + { + "epoch": 0.008828503454008657, + "grad_norm": 1.0625, + "learning_rate": 7.7085437948794e-05, + "loss": 1.3341, + "step": 56 + }, + { + "epoch": 0.008986155301401668, + "grad_norm": 1.046875, + "learning_rate": 7.708095672800437e-05, + "loss": 1.6018, + "step": 57 + }, + { + "epoch": 0.00914380714879468, + "grad_norm": 1.125, + "learning_rate": 7.707647555579038e-05, + "loss": 1.4213, + "step": 58 + }, + { + "epoch": 0.009301458996187692, + "grad_norm": 1.2265625, + "learning_rate": 7.707199443216145e-05, + "loss": 1.5771, + "step": 59 + }, + { + "epoch": 0.009459110843580703, + "grad_norm": 1.0546875, + "learning_rate": 7.706751335712714e-05, + "loss": 1.3836, + "step": 60 + }, + { + "epoch": 0.009616762690973715, + "grad_norm": 0.9453125, + "learning_rate": 7.706303233069692e-05, + "loss": 1.4195, + "step": 61 + }, + { + "epoch": 0.009774414538366727, + "grad_norm": 1.1171875, + "learning_rate": 7.70585513528803e-05, + "loss": 1.4836, + "step": 62 + }, + { + "epoch": 0.009932066385759738, + "grad_norm": 1.0390625, + "learning_rate": 7.705407042368675e-05, + "loss": 1.5355, + "step": 63 + }, + { + "epoch": 0.01008971823315275, + "grad_norm": 1.1328125, + "learning_rate": 7.704958954312581e-05, + "loss": 1.5794, + "step": 64 + }, + { + "epoch": 0.010247370080545761, + "grad_norm": 0.96484375, + "learning_rate": 7.704510871120695e-05, + "loss": 1.3337, + "step": 65 + }, + { + "epoch": 0.010405021927938773, + "grad_norm": 1.1953125, + "learning_rate": 7.704062792793967e-05, + "loss": 1.7244, + "step": 66 + }, + { + "epoch": 0.010562673775331785, + "grad_norm": 1.0234375, + "learning_rate": 7.703614719333344e-05, + "loss": 1.3335, + "step": 67 + }, + { + "epoch": 0.010720325622724796, + "grad_norm": 1.0625, + "learning_rate": 7.703166650739779e-05, + "loss": 1.3979, + "step": 68 + }, + { + "epoch": 0.01087797747011781, + "grad_norm": 1.1875, + "learning_rate": 7.702718587014224e-05, + "loss": 1.3189, + "step": 69 + }, + { + "epoch": 0.011035629317510821, + "grad_norm": 1.140625, + "learning_rate": 7.702270528157625e-05, + "loss": 1.323, + "step": 70 + }, + { + "epoch": 0.011193281164903833, + "grad_norm": 1.0234375, + "learning_rate": 7.70182247417093e-05, + "loss": 1.2675, + "step": 71 + }, + { + "epoch": 0.011350933012296845, + "grad_norm": 1.09375, + "learning_rate": 7.701374425055091e-05, + "loss": 1.4025, + "step": 72 + }, + { + "epoch": 0.011508584859689856, + "grad_norm": 1.0390625, + "learning_rate": 7.700926380811058e-05, + "loss": 1.4356, + "step": 73 + }, + { + "epoch": 0.011666236707082868, + "grad_norm": 1.03125, + "learning_rate": 7.700478341439782e-05, + "loss": 1.2854, + "step": 74 + }, + { + "epoch": 0.01182388855447588, + "grad_norm": 1.0546875, + "learning_rate": 7.70003030694221e-05, + "loss": 1.4212, + "step": 75 + }, + { + "epoch": 0.011981540401868891, + "grad_norm": 1.1796875, + "learning_rate": 7.699582277319292e-05, + "loss": 1.6788, + "step": 76 + }, + { + "epoch": 0.012139192249261903, + "grad_norm": 1.03125, + "learning_rate": 7.699134252571976e-05, + "loss": 1.5051, + "step": 77 + }, + { + "epoch": 0.012296844096654914, + "grad_norm": 1.109375, + "learning_rate": 7.698686232701215e-05, + "loss": 1.4897, + "step": 78 + }, + { + "epoch": 0.012454495944047926, + "grad_norm": 1.140625, + "learning_rate": 7.698238217707958e-05, + "loss": 1.6293, + "step": 79 + }, + { + "epoch": 0.012612147791440938, + "grad_norm": 1.171875, + "learning_rate": 7.697790207593152e-05, + "loss": 1.6356, + "step": 80 + }, + { + "epoch": 0.01276979963883395, + "grad_norm": 1.0859375, + "learning_rate": 7.69734220235775e-05, + "loss": 1.6491, + "step": 81 + }, + { + "epoch": 0.012927451486226961, + "grad_norm": 1.0234375, + "learning_rate": 7.696894202002699e-05, + "loss": 1.4218, + "step": 82 + }, + { + "epoch": 0.013085103333619973, + "grad_norm": 1.1484375, + "learning_rate": 7.696446206528946e-05, + "loss": 1.5862, + "step": 83 + }, + { + "epoch": 0.013242755181012984, + "grad_norm": 1.1875, + "learning_rate": 7.695998215937448e-05, + "loss": 1.5699, + "step": 84 + }, + { + "epoch": 0.013400407028405996, + "grad_norm": 1.1171875, + "learning_rate": 7.695550230229149e-05, + "loss": 1.2374, + "step": 85 + }, + { + "epoch": 0.013558058875799008, + "grad_norm": 1.1875, + "learning_rate": 7.695102249404999e-05, + "loss": 1.4611, + "step": 86 + }, + { + "epoch": 0.01371571072319202, + "grad_norm": 0.98046875, + "learning_rate": 7.694654273465949e-05, + "loss": 1.215, + "step": 87 + }, + { + "epoch": 0.013873362570585033, + "grad_norm": 1.0625, + "learning_rate": 7.694206302412942e-05, + "loss": 1.5837, + "step": 88 + }, + { + "epoch": 0.014031014417978044, + "grad_norm": 1.0859375, + "learning_rate": 7.693758336246937e-05, + "loss": 1.4436, + "step": 89 + }, + { + "epoch": 0.014188666265371056, + "grad_norm": 1.15625, + "learning_rate": 7.69331037496888e-05, + "loss": 1.6648, + "step": 90 + }, + { + "epoch": 0.014346318112764067, + "grad_norm": 1.046875, + "learning_rate": 7.69286241857972e-05, + "loss": 1.3123, + "step": 91 + }, + { + "epoch": 0.014503969960157079, + "grad_norm": 0.96875, + "learning_rate": 7.692414467080407e-05, + "loss": 1.3214, + "step": 92 + }, + { + "epoch": 0.01466162180755009, + "grad_norm": 1.046875, + "learning_rate": 7.691966520471885e-05, + "loss": 1.4412, + "step": 93 + }, + { + "epoch": 0.014819273654943102, + "grad_norm": 1.1015625, + "learning_rate": 7.691518578755112e-05, + "loss": 1.4515, + "step": 94 + }, + { + "epoch": 0.014976925502336114, + "grad_norm": 1.0859375, + "learning_rate": 7.691070641931033e-05, + "loss": 1.5151, + "step": 95 + }, + { + "epoch": 0.015134577349729126, + "grad_norm": 1.15625, + "learning_rate": 7.6906227100006e-05, + "loss": 1.5282, + "step": 96 + }, + { + "epoch": 0.015292229197122137, + "grad_norm": 1.1015625, + "learning_rate": 7.690174782964756e-05, + "loss": 1.3737, + "step": 97 + }, + { + "epoch": 0.015449881044515149, + "grad_norm": 1.046875, + "learning_rate": 7.689726860824453e-05, + "loss": 1.4524, + "step": 98 + }, + { + "epoch": 0.01560753289190816, + "grad_norm": 0.98046875, + "learning_rate": 7.689278943580646e-05, + "loss": 1.5046, + "step": 99 + }, + { + "epoch": 0.015765184739301174, + "grad_norm": 1.15625, + "learning_rate": 7.68883103123428e-05, + "loss": 1.5331, + "step": 100 + }, + { + "epoch": 0.015922836586694186, + "grad_norm": 0.94140625, + "learning_rate": 7.688383123786305e-05, + "loss": 1.2469, + "step": 101 + }, + { + "epoch": 0.016080488434087197, + "grad_norm": 1.1484375, + "learning_rate": 7.68793522123767e-05, + "loss": 1.5894, + "step": 102 + }, + { + "epoch": 0.01623814028148021, + "grad_norm": 1.0078125, + "learning_rate": 7.68748732358932e-05, + "loss": 1.1792, + "step": 103 + }, + { + "epoch": 0.01639579212887322, + "grad_norm": 1.1328125, + "learning_rate": 7.687039430842212e-05, + "loss": 1.4037, + "step": 104 + }, + { + "epoch": 0.016553443976266232, + "grad_norm": 1.046875, + "learning_rate": 7.686591542997292e-05, + "loss": 1.5406, + "step": 105 + }, + { + "epoch": 0.016711095823659244, + "grad_norm": 0.9921875, + "learning_rate": 7.686143660055509e-05, + "loss": 1.3978, + "step": 106 + }, + { + "epoch": 0.016868747671052255, + "grad_norm": 1.125, + "learning_rate": 7.685695782017812e-05, + "loss": 1.3442, + "step": 107 + }, + { + "epoch": 0.017026399518445267, + "grad_norm": 1.0234375, + "learning_rate": 7.685247908885146e-05, + "loss": 1.3122, + "step": 108 + }, + { + "epoch": 0.01718405136583828, + "grad_norm": 2.109375, + "learning_rate": 7.684800040658468e-05, + "loss": 1.1921, + "step": 109 + }, + { + "epoch": 0.01734170321323129, + "grad_norm": 1.0703125, + "learning_rate": 7.684352177338726e-05, + "loss": 1.3773, + "step": 110 + }, + { + "epoch": 0.017499355060624302, + "grad_norm": 1.0546875, + "learning_rate": 7.683904318926866e-05, + "loss": 1.1859, + "step": 111 + }, + { + "epoch": 0.017657006908017314, + "grad_norm": 1.0625, + "learning_rate": 7.68345646542384e-05, + "loss": 1.4292, + "step": 112 + }, + { + "epoch": 0.017814658755410325, + "grad_norm": 1.0625, + "learning_rate": 7.68300861683059e-05, + "loss": 1.2496, + "step": 113 + }, + { + "epoch": 0.017972310602803337, + "grad_norm": 0.96875, + "learning_rate": 7.682560773148075e-05, + "loss": 1.3713, + "step": 114 + }, + { + "epoch": 0.01812996245019635, + "grad_norm": 0.94921875, + "learning_rate": 7.68211293437724e-05, + "loss": 1.2039, + "step": 115 + }, + { + "epoch": 0.01828761429758936, + "grad_norm": 1.1015625, + "learning_rate": 7.681665100519034e-05, + "loss": 1.5079, + "step": 116 + }, + { + "epoch": 0.01844526614498237, + "grad_norm": 1.1640625, + "learning_rate": 7.681217271574407e-05, + "loss": 1.5297, + "step": 117 + }, + { + "epoch": 0.018602917992375383, + "grad_norm": 1.1328125, + "learning_rate": 7.680769447544303e-05, + "loss": 1.2977, + "step": 118 + }, + { + "epoch": 0.018760569839768395, + "grad_norm": 1.125, + "learning_rate": 7.680321628429679e-05, + "loss": 1.3847, + "step": 119 + }, + { + "epoch": 0.018918221687161407, + "grad_norm": 1.0859375, + "learning_rate": 7.679873814231481e-05, + "loss": 1.7289, + "step": 120 + }, + { + "epoch": 0.019075873534554418, + "grad_norm": 1.015625, + "learning_rate": 7.67942600495066e-05, + "loss": 1.3275, + "step": 121 + }, + { + "epoch": 0.01923352538194743, + "grad_norm": 1.03125, + "learning_rate": 7.67897820058816e-05, + "loss": 1.2296, + "step": 122 + }, + { + "epoch": 0.01939117722934044, + "grad_norm": 0.87109375, + "learning_rate": 7.678530401144932e-05, + "loss": 1.0845, + "step": 123 + }, + { + "epoch": 0.019548829076733453, + "grad_norm": 0.9921875, + "learning_rate": 7.678082606621929e-05, + "loss": 1.2004, + "step": 124 + }, + { + "epoch": 0.019706480924126465, + "grad_norm": 1.28125, + "learning_rate": 7.677634817020096e-05, + "loss": 1.2791, + "step": 125 + }, + { + "epoch": 0.019864132771519476, + "grad_norm": 0.9921875, + "learning_rate": 7.677187032340381e-05, + "loss": 1.2597, + "step": 126 + }, + { + "epoch": 0.020021784618912488, + "grad_norm": 0.9765625, + "learning_rate": 7.676739252583739e-05, + "loss": 1.1538, + "step": 127 + }, + { + "epoch": 0.0201794364663055, + "grad_norm": 0.92578125, + "learning_rate": 7.676291477751111e-05, + "loss": 1.2538, + "step": 128 + }, + { + "epoch": 0.02033708831369851, + "grad_norm": 1.1328125, + "learning_rate": 7.675843707843455e-05, + "loss": 1.2804, + "step": 129 + }, + { + "epoch": 0.020494740161091523, + "grad_norm": 1.0546875, + "learning_rate": 7.675395942861715e-05, + "loss": 1.1411, + "step": 130 + }, + { + "epoch": 0.020652392008484535, + "grad_norm": 1.0703125, + "learning_rate": 7.674948182806839e-05, + "loss": 1.1425, + "step": 131 + }, + { + "epoch": 0.020810043855877546, + "grad_norm": 1.0234375, + "learning_rate": 7.674500427679779e-05, + "loss": 1.595, + "step": 132 + }, + { + "epoch": 0.020967695703270558, + "grad_norm": 1.1171875, + "learning_rate": 7.674052677481477e-05, + "loss": 1.6207, + "step": 133 + }, + { + "epoch": 0.02112534755066357, + "grad_norm": 0.99609375, + "learning_rate": 7.673604932212892e-05, + "loss": 1.1974, + "step": 134 + }, + { + "epoch": 0.02128299939805658, + "grad_norm": 1.046875, + "learning_rate": 7.673157191874968e-05, + "loss": 1.4377, + "step": 135 + }, + { + "epoch": 0.021440651245449593, + "grad_norm": 1.0703125, + "learning_rate": 7.672709456468655e-05, + "loss": 1.4708, + "step": 136 + }, + { + "epoch": 0.021598303092842608, + "grad_norm": 1.1015625, + "learning_rate": 7.672261725994902e-05, + "loss": 1.2266, + "step": 137 + }, + { + "epoch": 0.02175595494023562, + "grad_norm": 1.015625, + "learning_rate": 7.671814000454651e-05, + "loss": 1.2505, + "step": 138 + }, + { + "epoch": 0.02191360678762863, + "grad_norm": 1.1015625, + "learning_rate": 7.671366279848863e-05, + "loss": 1.3418, + "step": 139 + }, + { + "epoch": 0.022071258635021643, + "grad_norm": 1.0078125, + "learning_rate": 7.67091856417848e-05, + "loss": 1.2083, + "step": 140 + }, + { + "epoch": 0.022228910482414654, + "grad_norm": 1.109375, + "learning_rate": 7.670470853444452e-05, + "loss": 1.4812, + "step": 141 + }, + { + "epoch": 0.022386562329807666, + "grad_norm": 1.0546875, + "learning_rate": 7.670023147647729e-05, + "loss": 1.4736, + "step": 142 + }, + { + "epoch": 0.022544214177200678, + "grad_norm": 0.97265625, + "learning_rate": 7.669575446789255e-05, + "loss": 1.2786, + "step": 143 + }, + { + "epoch": 0.02270186602459369, + "grad_norm": 1.109375, + "learning_rate": 7.669127750869987e-05, + "loss": 1.3641, + "step": 144 + }, + { + "epoch": 0.0228595178719867, + "grad_norm": 1.1015625, + "learning_rate": 7.668680059890866e-05, + "loss": 1.2386, + "step": 145 + }, + { + "epoch": 0.023017169719379713, + "grad_norm": 1.0859375, + "learning_rate": 7.668232373852847e-05, + "loss": 1.5634, + "step": 146 + }, + { + "epoch": 0.023174821566772724, + "grad_norm": 0.94921875, + "learning_rate": 7.667784692756875e-05, + "loss": 1.3462, + "step": 147 + }, + { + "epoch": 0.023332473414165736, + "grad_norm": 1.03125, + "learning_rate": 7.667337016603899e-05, + "loss": 1.2795, + "step": 148 + }, + { + "epoch": 0.023490125261558747, + "grad_norm": 1.046875, + "learning_rate": 7.666889345394872e-05, + "loss": 1.3205, + "step": 149 + }, + { + "epoch": 0.02364777710895176, + "grad_norm": 0.9765625, + "learning_rate": 7.666441679130737e-05, + "loss": 1.2658, + "step": 150 + }, + { + "epoch": 0.02380542895634477, + "grad_norm": 1.0625, + "learning_rate": 7.665994017812447e-05, + "loss": 1.3257, + "step": 151 + }, + { + "epoch": 0.023963080803737782, + "grad_norm": 1.25, + "learning_rate": 7.66554636144095e-05, + "loss": 1.5763, + "step": 152 + }, + { + "epoch": 0.024120732651130794, + "grad_norm": 0.875, + "learning_rate": 7.665098710017188e-05, + "loss": 1.1683, + "step": 153 + }, + { + "epoch": 0.024278384498523806, + "grad_norm": 0.9375, + "learning_rate": 7.66465106354212e-05, + "loss": 1.3102, + "step": 154 + }, + { + "epoch": 0.024436036345916817, + "grad_norm": 1.046875, + "learning_rate": 7.664203422016693e-05, + "loss": 1.3299, + "step": 155 + }, + { + "epoch": 0.02459368819330983, + "grad_norm": 1.0546875, + "learning_rate": 7.663755785441852e-05, + "loss": 1.4491, + "step": 156 + }, + { + "epoch": 0.02475134004070284, + "grad_norm": 1.1953125, + "learning_rate": 7.663308153818547e-05, + "loss": 1.4689, + "step": 157 + }, + { + "epoch": 0.024908991888095852, + "grad_norm": 1.1953125, + "learning_rate": 7.662860527147721e-05, + "loss": 1.4654, + "step": 158 + }, + { + "epoch": 0.025066643735488864, + "grad_norm": 1.1796875, + "learning_rate": 7.662412905430337e-05, + "loss": 1.4283, + "step": 159 + }, + { + "epoch": 0.025224295582881875, + "grad_norm": 1.0, + "learning_rate": 7.66196528866733e-05, + "loss": 1.2221, + "step": 160 + }, + { + "epoch": 0.025381947430274887, + "grad_norm": 1.0625, + "learning_rate": 7.661517676859657e-05, + "loss": 1.3822, + "step": 161 + }, + { + "epoch": 0.0255395992776679, + "grad_norm": 1.1328125, + "learning_rate": 7.661070070008263e-05, + "loss": 1.3225, + "step": 162 + }, + { + "epoch": 0.02569725112506091, + "grad_norm": 0.98828125, + "learning_rate": 7.660622468114094e-05, + "loss": 1.1407, + "step": 163 + }, + { + "epoch": 0.025854902972453922, + "grad_norm": 1.0234375, + "learning_rate": 7.660174871178106e-05, + "loss": 1.5181, + "step": 164 + }, + { + "epoch": 0.026012554819846934, + "grad_norm": 0.96484375, + "learning_rate": 7.659727279201243e-05, + "loss": 1.2308, + "step": 165 + }, + { + "epoch": 0.026170206667239945, + "grad_norm": 1.1171875, + "learning_rate": 7.659279692184453e-05, + "loss": 1.3064, + "step": 166 + }, + { + "epoch": 0.026327858514632957, + "grad_norm": 1.1484375, + "learning_rate": 7.658832110128688e-05, + "loss": 1.4267, + "step": 167 + }, + { + "epoch": 0.02648551036202597, + "grad_norm": 1.0, + "learning_rate": 7.658384533034893e-05, + "loss": 1.2501, + "step": 168 + }, + { + "epoch": 0.02664316220941898, + "grad_norm": 1.0625, + "learning_rate": 7.657936960904018e-05, + "loss": 1.2482, + "step": 169 + }, + { + "epoch": 0.026800814056811992, + "grad_norm": 0.96484375, + "learning_rate": 7.657489393737012e-05, + "loss": 1.3088, + "step": 170 + }, + { + "epoch": 0.026958465904205003, + "grad_norm": 1.015625, + "learning_rate": 7.657041831534824e-05, + "loss": 1.3537, + "step": 171 + }, + { + "epoch": 0.027116117751598015, + "grad_norm": 0.98046875, + "learning_rate": 7.656594274298402e-05, + "loss": 1.287, + "step": 172 + }, + { + "epoch": 0.027273769598991027, + "grad_norm": 0.99609375, + "learning_rate": 7.65614672202869e-05, + "loss": 1.3102, + "step": 173 + }, + { + "epoch": 0.02743142144638404, + "grad_norm": 0.984375, + "learning_rate": 7.655699174726645e-05, + "loss": 1.2733, + "step": 174 + }, + { + "epoch": 0.02758907329377705, + "grad_norm": 1.078125, + "learning_rate": 7.655251632393213e-05, + "loss": 1.4304, + "step": 175 + }, + { + "epoch": 0.027746725141170065, + "grad_norm": 1.0546875, + "learning_rate": 7.65480409502934e-05, + "loss": 1.276, + "step": 176 + }, + { + "epoch": 0.027904376988563077, + "grad_norm": 1.0625, + "learning_rate": 7.654356562635976e-05, + "loss": 1.3163, + "step": 177 + }, + { + "epoch": 0.02806202883595609, + "grad_norm": 0.953125, + "learning_rate": 7.653909035214065e-05, + "loss": 1.4239, + "step": 178 + }, + { + "epoch": 0.0282196806833491, + "grad_norm": 1.0546875, + "learning_rate": 7.653461512764563e-05, + "loss": 1.3669, + "step": 179 + }, + { + "epoch": 0.02837733253074211, + "grad_norm": 1.1640625, + "learning_rate": 7.653013995288416e-05, + "loss": 1.263, + "step": 180 + }, + { + "epoch": 0.028534984378135123, + "grad_norm": 0.96484375, + "learning_rate": 7.652566482786572e-05, + "loss": 1.249, + "step": 181 + }, + { + "epoch": 0.028692636225528135, + "grad_norm": 0.9375, + "learning_rate": 7.652118975259979e-05, + "loss": 1.2723, + "step": 182 + }, + { + "epoch": 0.028850288072921147, + "grad_norm": 1.0078125, + "learning_rate": 7.651671472709581e-05, + "loss": 1.3792, + "step": 183 + }, + { + "epoch": 0.029007939920314158, + "grad_norm": 1.0390625, + "learning_rate": 7.651223975136336e-05, + "loss": 1.6424, + "step": 184 + }, + { + "epoch": 0.02916559176770717, + "grad_norm": 1.25, + "learning_rate": 7.650776482541187e-05, + "loss": 1.2534, + "step": 185 + }, + { + "epoch": 0.02932324361510018, + "grad_norm": 1.046875, + "learning_rate": 7.650328994925083e-05, + "loss": 1.211, + "step": 186 + }, + { + "epoch": 0.029480895462493193, + "grad_norm": 1.984375, + "learning_rate": 7.649881512288972e-05, + "loss": 1.4544, + "step": 187 + }, + { + "epoch": 0.029638547309886205, + "grad_norm": 0.98046875, + "learning_rate": 7.649434034633802e-05, + "loss": 1.3, + "step": 188 + }, + { + "epoch": 0.029796199157279216, + "grad_norm": 1.09375, + "learning_rate": 7.648986561960523e-05, + "loss": 1.4345, + "step": 189 + }, + { + "epoch": 0.029953851004672228, + "grad_norm": 0.94921875, + "learning_rate": 7.648539094270083e-05, + "loss": 1.0856, + "step": 190 + }, + { + "epoch": 0.03011150285206524, + "grad_norm": 0.97265625, + "learning_rate": 7.64809163156343e-05, + "loss": 1.2131, + "step": 191 + }, + { + "epoch": 0.03026915469945825, + "grad_norm": 0.9921875, + "learning_rate": 7.647644173841512e-05, + "loss": 1.6161, + "step": 192 + }, + { + "epoch": 0.030426806546851263, + "grad_norm": 1.015625, + "learning_rate": 7.647196721105274e-05, + "loss": 1.4315, + "step": 193 + }, + { + "epoch": 0.030584458394244275, + "grad_norm": 0.95703125, + "learning_rate": 7.646749273355674e-05, + "loss": 1.4361, + "step": 194 + }, + { + "epoch": 0.030742110241637286, + "grad_norm": 1.0859375, + "learning_rate": 7.646301830593652e-05, + "loss": 1.3294, + "step": 195 + }, + { + "epoch": 0.030899762089030298, + "grad_norm": 0.96875, + "learning_rate": 7.645854392820158e-05, + "loss": 1.4119, + "step": 196 + }, + { + "epoch": 0.03105741393642331, + "grad_norm": 1.09375, + "learning_rate": 7.645406960036143e-05, + "loss": 1.2016, + "step": 197 + }, + { + "epoch": 0.03121506578381632, + "grad_norm": 0.98828125, + "learning_rate": 7.644959532242548e-05, + "loss": 1.0864, + "step": 198 + }, + { + "epoch": 0.031372717631209336, + "grad_norm": 1.171875, + "learning_rate": 7.644512109440332e-05, + "loss": 1.4124, + "step": 199 + }, + { + "epoch": 0.03153036947860235, + "grad_norm": 1.0546875, + "learning_rate": 7.644064691630437e-05, + "loss": 1.1773, + "step": 200 + }, + { + "epoch": 0.03168802132599536, + "grad_norm": 1.9296875, + "learning_rate": 7.643617278813812e-05, + "loss": 1.2825, + "step": 201 + }, + { + "epoch": 0.03184567317338837, + "grad_norm": 1.0078125, + "learning_rate": 7.643169870991408e-05, + "loss": 1.398, + "step": 202 + }, + { + "epoch": 0.03200332502078138, + "grad_norm": 0.98828125, + "learning_rate": 7.642722468164164e-05, + "loss": 1.2979, + "step": 203 + }, + { + "epoch": 0.032160976868174394, + "grad_norm": 1.0546875, + "learning_rate": 7.64227507033304e-05, + "loss": 1.3942, + "step": 204 + }, + { + "epoch": 0.032318628715567406, + "grad_norm": 0.921875, + "learning_rate": 7.64182767749898e-05, + "loss": 1.2042, + "step": 205 + }, + { + "epoch": 0.03247628056296042, + "grad_norm": 0.9765625, + "learning_rate": 7.64138028966293e-05, + "loss": 1.519, + "step": 206 + }, + { + "epoch": 0.03263393241035343, + "grad_norm": 1.140625, + "learning_rate": 7.64093290682584e-05, + "loss": 1.4504, + "step": 207 + }, + { + "epoch": 0.03279158425774644, + "grad_norm": 1.1015625, + "learning_rate": 7.640485528988656e-05, + "loss": 1.6083, + "step": 208 + }, + { + "epoch": 0.03294923610513945, + "grad_norm": 1.0703125, + "learning_rate": 7.640038156152329e-05, + "loss": 1.4028, + "step": 209 + }, + { + "epoch": 0.033106887952532464, + "grad_norm": 1.0, + "learning_rate": 7.639590788317809e-05, + "loss": 1.1125, + "step": 210 + }, + { + "epoch": 0.033264539799925476, + "grad_norm": 0.97265625, + "learning_rate": 7.639143425486039e-05, + "loss": 1.1355, + "step": 211 + }, + { + "epoch": 0.03342219164731849, + "grad_norm": 1.046875, + "learning_rate": 7.638696067657969e-05, + "loss": 1.2931, + "step": 212 + }, + { + "epoch": 0.0335798434947115, + "grad_norm": 0.94921875, + "learning_rate": 7.638248714834545e-05, + "loss": 1.2178, + "step": 213 + }, + { + "epoch": 0.03373749534210451, + "grad_norm": 1.0625, + "learning_rate": 7.637801367016723e-05, + "loss": 1.6365, + "step": 214 + }, + { + "epoch": 0.03389514718949752, + "grad_norm": 0.90625, + "learning_rate": 7.637354024205446e-05, + "loss": 1.1059, + "step": 215 + }, + { + "epoch": 0.034052799036890534, + "grad_norm": 1.046875, + "learning_rate": 7.63690668640166e-05, + "loss": 1.2119, + "step": 216 + }, + { + "epoch": 0.034210450884283546, + "grad_norm": 0.98046875, + "learning_rate": 7.636459353606317e-05, + "loss": 1.4586, + "step": 217 + }, + { + "epoch": 0.03436810273167656, + "grad_norm": 1.015625, + "learning_rate": 7.636012025820358e-05, + "loss": 1.0742, + "step": 218 + }, + { + "epoch": 0.03452575457906957, + "grad_norm": 1.1875, + "learning_rate": 7.63556470304474e-05, + "loss": 1.3508, + "step": 219 + }, + { + "epoch": 0.03468340642646258, + "grad_norm": 1.140625, + "learning_rate": 7.635117385280409e-05, + "loss": 1.2963, + "step": 220 + }, + { + "epoch": 0.03484105827385559, + "grad_norm": 1.0703125, + "learning_rate": 7.634670072528311e-05, + "loss": 1.323, + "step": 221 + }, + { + "epoch": 0.034998710121248604, + "grad_norm": 1.0703125, + "learning_rate": 7.634222764789394e-05, + "loss": 1.1162, + "step": 222 + }, + { + "epoch": 0.035156361968641615, + "grad_norm": 1.1953125, + "learning_rate": 7.633775462064602e-05, + "loss": 1.2893, + "step": 223 + }, + { + "epoch": 0.03531401381603463, + "grad_norm": 0.9453125, + "learning_rate": 7.633328164354894e-05, + "loss": 1.2837, + "step": 224 + }, + { + "epoch": 0.03547166566342764, + "grad_norm": 1.0625, + "learning_rate": 7.632880871661212e-05, + "loss": 1.3051, + "step": 225 + }, + { + "epoch": 0.03562931751082065, + "grad_norm": 1.1328125, + "learning_rate": 7.632433583984501e-05, + "loss": 1.6281, + "step": 226 + }, + { + "epoch": 0.03578696935821366, + "grad_norm": 1.0625, + "learning_rate": 7.631986301325713e-05, + "loss": 1.2134, + "step": 227 + }, + { + "epoch": 0.035944621205606674, + "grad_norm": 1.015625, + "learning_rate": 7.631539023685794e-05, + "loss": 1.3279, + "step": 228 + }, + { + "epoch": 0.036102273052999685, + "grad_norm": 1.0234375, + "learning_rate": 7.631091751065691e-05, + "loss": 1.1386, + "step": 229 + }, + { + "epoch": 0.0362599249003927, + "grad_norm": 1.09375, + "learning_rate": 7.630644483466354e-05, + "loss": 1.2387, + "step": 230 + }, + { + "epoch": 0.03641757674778571, + "grad_norm": 1.078125, + "learning_rate": 7.630197220888733e-05, + "loss": 1.2119, + "step": 231 + }, + { + "epoch": 0.03657522859517872, + "grad_norm": 1.1953125, + "learning_rate": 7.629749963333772e-05, + "loss": 1.4202, + "step": 232 + }, + { + "epoch": 0.03673288044257173, + "grad_norm": 0.97265625, + "learning_rate": 7.629302710802422e-05, + "loss": 1.2874, + "step": 233 + }, + { + "epoch": 0.03689053228996474, + "grad_norm": 0.9765625, + "learning_rate": 7.628855463295626e-05, + "loss": 1.1393, + "step": 234 + }, + { + "epoch": 0.037048184137357755, + "grad_norm": 1.1484375, + "learning_rate": 7.628408220814339e-05, + "loss": 1.5517, + "step": 235 + }, + { + "epoch": 0.03720583598475077, + "grad_norm": 1.0703125, + "learning_rate": 7.627960983359505e-05, + "loss": 1.2827, + "step": 236 + }, + { + "epoch": 0.03736348783214378, + "grad_norm": 1.453125, + "learning_rate": 7.627513750932071e-05, + "loss": 1.3977, + "step": 237 + }, + { + "epoch": 0.03752113967953679, + "grad_norm": 0.953125, + "learning_rate": 7.627066523532986e-05, + "loss": 1.0914, + "step": 238 + }, + { + "epoch": 0.0376787915269298, + "grad_norm": 0.90625, + "learning_rate": 7.626619301163195e-05, + "loss": 0.9954, + "step": 239 + }, + { + "epoch": 0.03783644337432281, + "grad_norm": 1.0390625, + "learning_rate": 7.626172083823652e-05, + "loss": 1.1874, + "step": 240 + }, + { + "epoch": 0.037994095221715825, + "grad_norm": 1.015625, + "learning_rate": 7.625724871515302e-05, + "loss": 1.1796, + "step": 241 + }, + { + "epoch": 0.038151747069108836, + "grad_norm": 1.1328125, + "learning_rate": 7.625277664239092e-05, + "loss": 1.4228, + "step": 242 + }, + { + "epoch": 0.03830939891650185, + "grad_norm": 1.1796875, + "learning_rate": 7.62483046199597e-05, + "loss": 1.4132, + "step": 243 + }, + { + "epoch": 0.03846705076389486, + "grad_norm": 0.9296875, + "learning_rate": 7.624383264786881e-05, + "loss": 1.1635, + "step": 244 + }, + { + "epoch": 0.03862470261128787, + "grad_norm": 1.046875, + "learning_rate": 7.623936072612779e-05, + "loss": 1.4281, + "step": 245 + }, + { + "epoch": 0.03878235445868088, + "grad_norm": 1.078125, + "learning_rate": 7.623488885474609e-05, + "loss": 1.3828, + "step": 246 + }, + { + "epoch": 0.038940006306073895, + "grad_norm": 1.2578125, + "learning_rate": 7.623041703373318e-05, + "loss": 1.1873, + "step": 247 + }, + { + "epoch": 0.039097658153466906, + "grad_norm": 1.109375, + "learning_rate": 7.622594526309856e-05, + "loss": 1.4153, + "step": 248 + }, + { + "epoch": 0.03925531000085992, + "grad_norm": 0.98828125, + "learning_rate": 7.622147354285163e-05, + "loss": 1.2454, + "step": 249 + }, + { + "epoch": 0.03941296184825293, + "grad_norm": 1.21875, + "learning_rate": 7.621700187300198e-05, + "loss": 1.4777, + "step": 250 + }, + { + "epoch": 0.03957061369564594, + "grad_norm": 1.0703125, + "learning_rate": 7.621253025355902e-05, + "loss": 1.2219, + "step": 251 + }, + { + "epoch": 0.03972826554303895, + "grad_norm": 1.0234375, + "learning_rate": 7.620805868453226e-05, + "loss": 1.4007, + "step": 252 + }, + { + "epoch": 0.039885917390431964, + "grad_norm": 1.109375, + "learning_rate": 7.620358716593114e-05, + "loss": 1.4307, + "step": 253 + }, + { + "epoch": 0.040043569237824976, + "grad_norm": 1.0, + "learning_rate": 7.619911569776515e-05, + "loss": 1.233, + "step": 254 + }, + { + "epoch": 0.04020122108521799, + "grad_norm": 1.328125, + "learning_rate": 7.619464428004381e-05, + "loss": 1.6911, + "step": 255 + }, + { + "epoch": 0.040358872932611, + "grad_norm": 0.97265625, + "learning_rate": 7.619017291277653e-05, + "loss": 1.237, + "step": 256 + }, + { + "epoch": 0.04051652478000401, + "grad_norm": 1.21875, + "learning_rate": 7.618570159597284e-05, + "loss": 1.4241, + "step": 257 + }, + { + "epoch": 0.04067417662739702, + "grad_norm": 1.0, + "learning_rate": 7.618123032964218e-05, + "loss": 1.3114, + "step": 258 + }, + { + "epoch": 0.040831828474790034, + "grad_norm": 0.984375, + "learning_rate": 7.617675911379401e-05, + "loss": 1.4388, + "step": 259 + }, + { + "epoch": 0.040989480322183046, + "grad_norm": 1.203125, + "learning_rate": 7.617228794843787e-05, + "loss": 1.4383, + "step": 260 + }, + { + "epoch": 0.04114713216957606, + "grad_norm": 0.87109375, + "learning_rate": 7.616781683358321e-05, + "loss": 0.8977, + "step": 261 + }, + { + "epoch": 0.04130478401696907, + "grad_norm": 1.0234375, + "learning_rate": 7.61633457692395e-05, + "loss": 1.1832, + "step": 262 + }, + { + "epoch": 0.04146243586436208, + "grad_norm": 1.1171875, + "learning_rate": 7.615887475541623e-05, + "loss": 1.437, + "step": 263 + }, + { + "epoch": 0.04162008771175509, + "grad_norm": 0.953125, + "learning_rate": 7.61544037921228e-05, + "loss": 1.2289, + "step": 264 + }, + { + "epoch": 0.041777739559148104, + "grad_norm": 1.0625, + "learning_rate": 7.614993287936878e-05, + "loss": 1.1745, + "step": 265 + }, + { + "epoch": 0.041935391406541116, + "grad_norm": 1.03125, + "learning_rate": 7.614546201716363e-05, + "loss": 1.2957, + "step": 266 + }, + { + "epoch": 0.04209304325393413, + "grad_norm": 0.9609375, + "learning_rate": 7.614099120551681e-05, + "loss": 1.3913, + "step": 267 + }, + { + "epoch": 0.04225069510132714, + "grad_norm": 1.0703125, + "learning_rate": 7.61365204444378e-05, + "loss": 1.3907, + "step": 268 + }, + { + "epoch": 0.04240834694872015, + "grad_norm": 1.1171875, + "learning_rate": 7.613204973393601e-05, + "loss": 1.2667, + "step": 269 + }, + { + "epoch": 0.04256599879611316, + "grad_norm": 0.99609375, + "learning_rate": 7.612757907402103e-05, + "loss": 1.2855, + "step": 270 + }, + { + "epoch": 0.042723650643506174, + "grad_norm": 1.0234375, + "learning_rate": 7.61231084647023e-05, + "loss": 1.26, + "step": 271 + }, + { + "epoch": 0.042881302490899185, + "grad_norm": 1.1484375, + "learning_rate": 7.611863790598925e-05, + "loss": 1.4559, + "step": 272 + }, + { + "epoch": 0.0430389543382922, + "grad_norm": 1.0234375, + "learning_rate": 7.61141673978914e-05, + "loss": 1.2738, + "step": 273 + }, + { + "epoch": 0.043196606185685216, + "grad_norm": 0.9609375, + "learning_rate": 7.610969694041819e-05, + "loss": 1.1966, + "step": 274 + }, + { + "epoch": 0.04335425803307823, + "grad_norm": 1.0078125, + "learning_rate": 7.610522653357912e-05, + "loss": 1.2606, + "step": 275 + }, + { + "epoch": 0.04351190988047124, + "grad_norm": 1.0859375, + "learning_rate": 7.610075617738364e-05, + "loss": 1.3679, + "step": 276 + }, + { + "epoch": 0.04366956172786425, + "grad_norm": 1.0859375, + "learning_rate": 7.609628587184127e-05, + "loss": 1.2724, + "step": 277 + }, + { + "epoch": 0.04382721357525726, + "grad_norm": 0.98828125, + "learning_rate": 7.609181561696142e-05, + "loss": 1.3065, + "step": 278 + }, + { + "epoch": 0.043984865422650274, + "grad_norm": 1.1015625, + "learning_rate": 7.608734541275361e-05, + "loss": 1.4963, + "step": 279 + }, + { + "epoch": 0.044142517270043286, + "grad_norm": 1.03125, + "learning_rate": 7.608287525922731e-05, + "loss": 1.2244, + "step": 280 + }, + { + "epoch": 0.0443001691174363, + "grad_norm": 0.9609375, + "learning_rate": 7.607840515639201e-05, + "loss": 1.0684, + "step": 281 + }, + { + "epoch": 0.04445782096482931, + "grad_norm": 1.1171875, + "learning_rate": 7.607393510425714e-05, + "loss": 1.2171, + "step": 282 + }, + { + "epoch": 0.04461547281222232, + "grad_norm": 1.078125, + "learning_rate": 7.606946510283222e-05, + "loss": 1.4809, + "step": 283 + }, + { + "epoch": 0.04477312465961533, + "grad_norm": 0.9375, + "learning_rate": 7.606499515212663e-05, + "loss": 1.138, + "step": 284 + }, + { + "epoch": 0.044930776507008344, + "grad_norm": 1.03125, + "learning_rate": 7.606052525214999e-05, + "loss": 1.2274, + "step": 285 + }, + { + "epoch": 0.045088428354401355, + "grad_norm": 0.98046875, + "learning_rate": 7.605605540291167e-05, + "loss": 1.2865, + "step": 286 + }, + { + "epoch": 0.04524608020179437, + "grad_norm": 0.9921875, + "learning_rate": 7.605158560442119e-05, + "loss": 1.0963, + "step": 287 + }, + { + "epoch": 0.04540373204918738, + "grad_norm": 1.03125, + "learning_rate": 7.6047115856688e-05, + "loss": 1.4318, + "step": 288 + }, + { + "epoch": 0.04556138389658039, + "grad_norm": 0.96484375, + "learning_rate": 7.604264615972154e-05, + "loss": 1.1505, + "step": 289 + }, + { + "epoch": 0.0457190357439734, + "grad_norm": 1.1484375, + "learning_rate": 7.603817651353135e-05, + "loss": 1.2147, + "step": 290 + }, + { + "epoch": 0.045876687591366413, + "grad_norm": 1.0, + "learning_rate": 7.60337069181269e-05, + "loss": 1.2802, + "step": 291 + }, + { + "epoch": 0.046034339438759425, + "grad_norm": 1.03125, + "learning_rate": 7.602923737351762e-05, + "loss": 1.2362, + "step": 292 + }, + { + "epoch": 0.04619199128615244, + "grad_norm": 1.03125, + "learning_rate": 7.602476787971301e-05, + "loss": 1.1614, + "step": 293 + }, + { + "epoch": 0.04634964313354545, + "grad_norm": 1.0546875, + "learning_rate": 7.60202984367225e-05, + "loss": 1.1971, + "step": 294 + }, + { + "epoch": 0.04650729498093846, + "grad_norm": 1.0234375, + "learning_rate": 7.601582904455563e-05, + "loss": 1.3712, + "step": 295 + }, + { + "epoch": 0.04666494682833147, + "grad_norm": 1.046875, + "learning_rate": 7.601135970322184e-05, + "loss": 1.251, + "step": 296 + }, + { + "epoch": 0.04682259867572448, + "grad_norm": 1.046875, + "learning_rate": 7.600689041273058e-05, + "loss": 1.249, + "step": 297 + }, + { + "epoch": 0.046980250523117495, + "grad_norm": 1.2109375, + "learning_rate": 7.600242117309135e-05, + "loss": 1.4933, + "step": 298 + }, + { + "epoch": 0.04713790237051051, + "grad_norm": 1.015625, + "learning_rate": 7.599795198431362e-05, + "loss": 1.4416, + "step": 299 + }, + { + "epoch": 0.04729555421790352, + "grad_norm": 1.078125, + "learning_rate": 7.599348284640686e-05, + "loss": 1.3171, + "step": 300 + }, + { + "epoch": 0.04745320606529653, + "grad_norm": 2.9375, + "learning_rate": 7.598901375938055e-05, + "loss": 1.4638, + "step": 301 + }, + { + "epoch": 0.04761085791268954, + "grad_norm": 1.1171875, + "learning_rate": 7.598454472324416e-05, + "loss": 1.3142, + "step": 302 + }, + { + "epoch": 0.04776850976008255, + "grad_norm": 0.984375, + "learning_rate": 7.598007573800713e-05, + "loss": 1.4089, + "step": 303 + }, + { + "epoch": 0.047926161607475565, + "grad_norm": 0.98828125, + "learning_rate": 7.597560680367895e-05, + "loss": 1.4437, + "step": 304 + }, + { + "epoch": 0.048083813454868576, + "grad_norm": 1.0390625, + "learning_rate": 7.597113792026913e-05, + "loss": 1.2965, + "step": 305 + }, + { + "epoch": 0.04824146530226159, + "grad_norm": 1.015625, + "learning_rate": 7.596666908778709e-05, + "loss": 1.4765, + "step": 306 + }, + { + "epoch": 0.0483991171496546, + "grad_norm": 1.1484375, + "learning_rate": 7.596220030624234e-05, + "loss": 1.4994, + "step": 307 + }, + { + "epoch": 0.04855676899704761, + "grad_norm": 1.03125, + "learning_rate": 7.595773157564432e-05, + "loss": 1.3982, + "step": 308 + }, + { + "epoch": 0.04871442084444062, + "grad_norm": 0.93359375, + "learning_rate": 7.595326289600248e-05, + "loss": 1.1308, + "step": 309 + }, + { + "epoch": 0.048872072691833635, + "grad_norm": 1.15625, + "learning_rate": 7.594879426732636e-05, + "loss": 1.3513, + "step": 310 + }, + { + "epoch": 0.049029724539226646, + "grad_norm": 1.0390625, + "learning_rate": 7.594432568962539e-05, + "loss": 1.3763, + "step": 311 + }, + { + "epoch": 0.04918737638661966, + "grad_norm": 1.140625, + "learning_rate": 7.593985716290907e-05, + "loss": 1.3615, + "step": 312 + }, + { + "epoch": 0.04934502823401267, + "grad_norm": 0.99609375, + "learning_rate": 7.593538868718683e-05, + "loss": 1.2998, + "step": 313 + }, + { + "epoch": 0.04950268008140568, + "grad_norm": 1.0078125, + "learning_rate": 7.593092026246814e-05, + "loss": 1.2916, + "step": 314 + }, + { + "epoch": 0.04966033192879869, + "grad_norm": 1.15625, + "learning_rate": 7.592645188876251e-05, + "loss": 1.3156, + "step": 315 + }, + { + "epoch": 0.049817983776191704, + "grad_norm": 1.0546875, + "learning_rate": 7.592198356607937e-05, + "loss": 1.2252, + "step": 316 + }, + { + "epoch": 0.049975635623584716, + "grad_norm": 1.0078125, + "learning_rate": 7.591751529442823e-05, + "loss": 1.392, + "step": 317 + }, + { + "epoch": 0.05013328747097773, + "grad_norm": 0.97265625, + "learning_rate": 7.591304707381855e-05, + "loss": 0.9968, + "step": 318 + }, + { + "epoch": 0.05029093931837074, + "grad_norm": 1.078125, + "learning_rate": 7.590857890425975e-05, + "loss": 1.2946, + "step": 319 + }, + { + "epoch": 0.05044859116576375, + "grad_norm": 0.9765625, + "learning_rate": 7.590411078576137e-05, + "loss": 1.1921, + "step": 320 + }, + { + "epoch": 0.05060624301315676, + "grad_norm": 1.140625, + "learning_rate": 7.589964271833286e-05, + "loss": 1.2742, + "step": 321 + }, + { + "epoch": 0.050763894860549774, + "grad_norm": 1.1640625, + "learning_rate": 7.589517470198366e-05, + "loss": 1.4172, + "step": 322 + }, + { + "epoch": 0.050921546707942786, + "grad_norm": 1.0078125, + "learning_rate": 7.589070673672327e-05, + "loss": 1.405, + "step": 323 + }, + { + "epoch": 0.0510791985553358, + "grad_norm": 1.046875, + "learning_rate": 7.58862388225611e-05, + "loss": 1.2666, + "step": 324 + }, + { + "epoch": 0.05123685040272881, + "grad_norm": 1.0078125, + "learning_rate": 7.588177095950673e-05, + "loss": 1.4633, + "step": 325 + }, + { + "epoch": 0.05139450225012182, + "grad_norm": 1.09375, + "learning_rate": 7.587730314756954e-05, + "loss": 1.2416, + "step": 326 + }, + { + "epoch": 0.05155215409751483, + "grad_norm": 1.0, + "learning_rate": 7.587283538675904e-05, + "loss": 1.4502, + "step": 327 + }, + { + "epoch": 0.051709805944907844, + "grad_norm": 1.046875, + "learning_rate": 7.58683676770847e-05, + "loss": 1.3921, + "step": 328 + }, + { + "epoch": 0.051867457792300856, + "grad_norm": 1.3046875, + "learning_rate": 7.586390001855591e-05, + "loss": 1.236, + "step": 329 + }, + { + "epoch": 0.05202510963969387, + "grad_norm": 0.9140625, + "learning_rate": 7.585943241118227e-05, + "loss": 1.1263, + "step": 330 + }, + { + "epoch": 0.05218276148708688, + "grad_norm": 0.9140625, + "learning_rate": 7.585496485497316e-05, + "loss": 1.1593, + "step": 331 + }, + { + "epoch": 0.05234041333447989, + "grad_norm": 1.0234375, + "learning_rate": 7.585049734993809e-05, + "loss": 1.3364, + "step": 332 + }, + { + "epoch": 0.0524980651818729, + "grad_norm": 1.046875, + "learning_rate": 7.584602989608651e-05, + "loss": 1.1631, + "step": 333 + }, + { + "epoch": 0.052655717029265914, + "grad_norm": 0.95703125, + "learning_rate": 7.584156249342783e-05, + "loss": 1.1316, + "step": 334 + }, + { + "epoch": 0.052813368876658925, + "grad_norm": 1.3046875, + "learning_rate": 7.583709514197162e-05, + "loss": 1.2093, + "step": 335 + }, + { + "epoch": 0.05297102072405194, + "grad_norm": 0.921875, + "learning_rate": 7.583262784172733e-05, + "loss": 1.2034, + "step": 336 + }, + { + "epoch": 0.05312867257144495, + "grad_norm": 0.94921875, + "learning_rate": 7.582816059270438e-05, + "loss": 1.0787, + "step": 337 + }, + { + "epoch": 0.05328632441883796, + "grad_norm": 0.984375, + "learning_rate": 7.582369339491227e-05, + "loss": 1.3247, + "step": 338 + }, + { + "epoch": 0.05344397626623097, + "grad_norm": 1.1328125, + "learning_rate": 7.581922624836045e-05, + "loss": 1.4561, + "step": 339 + }, + { + "epoch": 0.053601628113623984, + "grad_norm": 0.9375, + "learning_rate": 7.58147591530584e-05, + "loss": 1.226, + "step": 340 + }, + { + "epoch": 0.053759279961016995, + "grad_norm": 1.046875, + "learning_rate": 7.58102921090156e-05, + "loss": 1.3249, + "step": 341 + }, + { + "epoch": 0.05391693180841001, + "grad_norm": 0.9375, + "learning_rate": 7.58058251162415e-05, + "loss": 1.2785, + "step": 342 + }, + { + "epoch": 0.05407458365580302, + "grad_norm": 1.078125, + "learning_rate": 7.580135817474557e-05, + "loss": 1.3761, + "step": 343 + }, + { + "epoch": 0.05423223550319603, + "grad_norm": 1.125, + "learning_rate": 7.579689128453723e-05, + "loss": 1.3079, + "step": 344 + }, + { + "epoch": 0.05438988735058904, + "grad_norm": 1.1875, + "learning_rate": 7.579242444562605e-05, + "loss": 1.3772, + "step": 345 + }, + { + "epoch": 0.05454753919798205, + "grad_norm": 0.953125, + "learning_rate": 7.578795765802143e-05, + "loss": 1.0293, + "step": 346 + }, + { + "epoch": 0.054705191045375065, + "grad_norm": 0.9453125, + "learning_rate": 7.578349092173286e-05, + "loss": 0.9691, + "step": 347 + }, + { + "epoch": 0.05486284289276808, + "grad_norm": 1.0, + "learning_rate": 7.577902423676978e-05, + "loss": 1.2734, + "step": 348 + }, + { + "epoch": 0.05502049474016109, + "grad_norm": 0.96484375, + "learning_rate": 7.577455760314165e-05, + "loss": 1.3084, + "step": 349 + }, + { + "epoch": 0.0551781465875541, + "grad_norm": 1.0625, + "learning_rate": 7.577009102085801e-05, + "loss": 1.2957, + "step": 350 + }, + { + "epoch": 0.05533579843494711, + "grad_norm": 1.015625, + "learning_rate": 7.576562448992825e-05, + "loss": 1.1018, + "step": 351 + }, + { + "epoch": 0.05549345028234013, + "grad_norm": 1.0078125, + "learning_rate": 7.576115801036187e-05, + "loss": 1.1478, + "step": 352 + }, + { + "epoch": 0.05565110212973314, + "grad_norm": 0.93359375, + "learning_rate": 7.575669158216835e-05, + "loss": 1.2483, + "step": 353 + }, + { + "epoch": 0.05580875397712615, + "grad_norm": 1.5703125, + "learning_rate": 7.575222520535709e-05, + "loss": 1.1609, + "step": 354 + }, + { + "epoch": 0.055966405824519165, + "grad_norm": 1.046875, + "learning_rate": 7.574775887993764e-05, + "loss": 1.1153, + "step": 355 + }, + { + "epoch": 0.05612405767191218, + "grad_norm": 0.96875, + "learning_rate": 7.574329260591942e-05, + "loss": 1.2138, + "step": 356 + }, + { + "epoch": 0.05628170951930519, + "grad_norm": 0.91015625, + "learning_rate": 7.573882638331192e-05, + "loss": 1.2048, + "step": 357 + }, + { + "epoch": 0.0564393613666982, + "grad_norm": 0.8671875, + "learning_rate": 7.573436021212456e-05, + "loss": 1.2139, + "step": 358 + }, + { + "epoch": 0.05659701321409121, + "grad_norm": 1.1171875, + "learning_rate": 7.572989409236684e-05, + "loss": 1.3339, + "step": 359 + }, + { + "epoch": 0.05675466506148422, + "grad_norm": 0.94140625, + "learning_rate": 7.572542802404825e-05, + "loss": 1.181, + "step": 360 + }, + { + "epoch": 0.056912316908877235, + "grad_norm": 1.1875, + "learning_rate": 7.572096200717821e-05, + "loss": 1.365, + "step": 361 + }, + { + "epoch": 0.057069968756270247, + "grad_norm": 1.078125, + "learning_rate": 7.571649604176622e-05, + "loss": 1.3342, + "step": 362 + }, + { + "epoch": 0.05722762060366326, + "grad_norm": 1.0, + "learning_rate": 7.571203012782172e-05, + "loss": 1.1962, + "step": 363 + }, + { + "epoch": 0.05738527245105627, + "grad_norm": 0.8984375, + "learning_rate": 7.570756426535414e-05, + "loss": 1.0502, + "step": 364 + }, + { + "epoch": 0.05754292429844928, + "grad_norm": 2.125, + "learning_rate": 7.570309845437302e-05, + "loss": 1.2771, + "step": 365 + }, + { + "epoch": 0.05770057614584229, + "grad_norm": 1.0234375, + "learning_rate": 7.569863269488782e-05, + "loss": 1.2067, + "step": 366 + }, + { + "epoch": 0.057858227993235305, + "grad_norm": 0.92578125, + "learning_rate": 7.569416698690797e-05, + "loss": 1.224, + "step": 367 + }, + { + "epoch": 0.058015879840628316, + "grad_norm": 0.97265625, + "learning_rate": 7.568970133044293e-05, + "loss": 1.3076, + "step": 368 + }, + { + "epoch": 0.05817353168802133, + "grad_norm": 1.0078125, + "learning_rate": 7.568523572550219e-05, + "loss": 1.2475, + "step": 369 + }, + { + "epoch": 0.05833118353541434, + "grad_norm": 1.1484375, + "learning_rate": 7.568077017209516e-05, + "loss": 1.4433, + "step": 370 + }, + { + "epoch": 0.05848883538280735, + "grad_norm": 1.03125, + "learning_rate": 7.567630467023138e-05, + "loss": 1.4507, + "step": 371 + }, + { + "epoch": 0.05864648723020036, + "grad_norm": 0.9921875, + "learning_rate": 7.567183921992031e-05, + "loss": 1.2626, + "step": 372 + }, + { + "epoch": 0.058804139077593374, + "grad_norm": 1.328125, + "learning_rate": 7.566737382117136e-05, + "loss": 1.2898, + "step": 373 + }, + { + "epoch": 0.058961790924986386, + "grad_norm": 1.125, + "learning_rate": 7.566290847399403e-05, + "loss": 1.3465, + "step": 374 + }, + { + "epoch": 0.0591194427723794, + "grad_norm": 2.15625, + "learning_rate": 7.565844317839772e-05, + "loss": 1.4528, + "step": 375 + }, + { + "epoch": 0.05927709461977241, + "grad_norm": 1.0234375, + "learning_rate": 7.565397793439201e-05, + "loss": 1.2723, + "step": 376 + }, + { + "epoch": 0.05943474646716542, + "grad_norm": 0.9921875, + "learning_rate": 7.564951274198629e-05, + "loss": 1.2087, + "step": 377 + }, + { + "epoch": 0.05959239831455843, + "grad_norm": 0.94921875, + "learning_rate": 7.564504760119005e-05, + "loss": 1.1051, + "step": 378 + }, + { + "epoch": 0.059750050161951444, + "grad_norm": 0.953125, + "learning_rate": 7.564058251201272e-05, + "loss": 1.3148, + "step": 379 + }, + { + "epoch": 0.059907702009344456, + "grad_norm": 1.0, + "learning_rate": 7.563611747446377e-05, + "loss": 1.2887, + "step": 380 + }, + { + "epoch": 0.06006535385673747, + "grad_norm": 0.8984375, + "learning_rate": 7.56316524885527e-05, + "loss": 1.0629, + "step": 381 + }, + { + "epoch": 0.06022300570413048, + "grad_norm": 0.8984375, + "learning_rate": 7.562718755428893e-05, + "loss": 1.3577, + "step": 382 + }, + { + "epoch": 0.06038065755152349, + "grad_norm": 1.0859375, + "learning_rate": 7.562272267168195e-05, + "loss": 1.1758, + "step": 383 + }, + { + "epoch": 0.0605383093989165, + "grad_norm": 1.078125, + "learning_rate": 7.561825784074122e-05, + "loss": 1.1331, + "step": 384 + }, + { + "epoch": 0.060695961246309514, + "grad_norm": 1.15625, + "learning_rate": 7.561379306147619e-05, + "loss": 1.6068, + "step": 385 + }, + { + "epoch": 0.060853613093702526, + "grad_norm": 1.1328125, + "learning_rate": 7.560932833389634e-05, + "loss": 1.2707, + "step": 386 + }, + { + "epoch": 0.06101126494109554, + "grad_norm": 1.0703125, + "learning_rate": 7.560486365801113e-05, + "loss": 1.3707, + "step": 387 + }, + { + "epoch": 0.06116891678848855, + "grad_norm": 1.125, + "learning_rate": 7.560039903383002e-05, + "loss": 1.0712, + "step": 388 + }, + { + "epoch": 0.06132656863588156, + "grad_norm": 0.95703125, + "learning_rate": 7.559593446136248e-05, + "loss": 1.1705, + "step": 389 + }, + { + "epoch": 0.06148422048327457, + "grad_norm": 0.96875, + "learning_rate": 7.559146994061792e-05, + "loss": 1.2098, + "step": 390 + }, + { + "epoch": 0.061641872330667584, + "grad_norm": 2.265625, + "learning_rate": 7.558700547160587e-05, + "loss": 1.0231, + "step": 391 + }, + { + "epoch": 0.061799524178060596, + "grad_norm": 0.9921875, + "learning_rate": 7.558254105433577e-05, + "loss": 1.2373, + "step": 392 + }, + { + "epoch": 0.06195717602545361, + "grad_norm": 1.0703125, + "learning_rate": 7.557807668881707e-05, + "loss": 1.2103, + "step": 393 + }, + { + "epoch": 0.06211482787284662, + "grad_norm": 1.109375, + "learning_rate": 7.557361237505926e-05, + "loss": 1.5356, + "step": 394 + }, + { + "epoch": 0.06227247972023963, + "grad_norm": 0.8984375, + "learning_rate": 7.556914811307171e-05, + "loss": 1.2645, + "step": 395 + }, + { + "epoch": 0.06243013156763264, + "grad_norm": 1.1953125, + "learning_rate": 7.556468390286402e-05, + "loss": 1.5393, + "step": 396 + }, + { + "epoch": 0.06258778341502566, + "grad_norm": 1.03125, + "learning_rate": 7.55602197444456e-05, + "loss": 1.3203, + "step": 397 + }, + { + "epoch": 0.06274543526241867, + "grad_norm": 1.0, + "learning_rate": 7.555575563782587e-05, + "loss": 1.1366, + "step": 398 + }, + { + "epoch": 0.06290308710981168, + "grad_norm": 1.1640625, + "learning_rate": 7.555129158301432e-05, + "loss": 1.2643, + "step": 399 + }, + { + "epoch": 0.0630607389572047, + "grad_norm": 1.0078125, + "learning_rate": 7.554682758002041e-05, + "loss": 1.3032, + "step": 400 + }, + { + "epoch": 0.06321839080459771, + "grad_norm": 1.15625, + "learning_rate": 7.554236362885358e-05, + "loss": 1.3687, + "step": 401 + }, + { + "epoch": 0.06337604265199072, + "grad_norm": 1.0234375, + "learning_rate": 7.553789972952335e-05, + "loss": 1.2373, + "step": 402 + }, + { + "epoch": 0.06353369449938373, + "grad_norm": 0.98828125, + "learning_rate": 7.553343588203915e-05, + "loss": 1.301, + "step": 403 + }, + { + "epoch": 0.06369134634677674, + "grad_norm": 0.94921875, + "learning_rate": 7.552897208641042e-05, + "loss": 1.0514, + "step": 404 + }, + { + "epoch": 0.06384899819416975, + "grad_norm": 0.95703125, + "learning_rate": 7.55245083426466e-05, + "loss": 1.2411, + "step": 405 + }, + { + "epoch": 0.06400665004156277, + "grad_norm": 1.0546875, + "learning_rate": 7.552004465075722e-05, + "loss": 1.1168, + "step": 406 + }, + { + "epoch": 0.06416430188895578, + "grad_norm": 1.0, + "learning_rate": 7.551558101075171e-05, + "loss": 1.1903, + "step": 407 + }, + { + "epoch": 0.06432195373634879, + "grad_norm": 1.015625, + "learning_rate": 7.551111742263953e-05, + "loss": 1.3756, + "step": 408 + }, + { + "epoch": 0.0644796055837418, + "grad_norm": 1.0625, + "learning_rate": 7.550665388643013e-05, + "loss": 1.1984, + "step": 409 + }, + { + "epoch": 0.06463725743113481, + "grad_norm": 1.0546875, + "learning_rate": 7.550219040213293e-05, + "loss": 1.1656, + "step": 410 + }, + { + "epoch": 0.06479490927852782, + "grad_norm": 1.0859375, + "learning_rate": 7.549772696975749e-05, + "loss": 1.2355, + "step": 411 + }, + { + "epoch": 0.06495256112592084, + "grad_norm": 1.0625, + "learning_rate": 7.549326358931321e-05, + "loss": 1.3744, + "step": 412 + }, + { + "epoch": 0.06511021297331385, + "grad_norm": 1.03125, + "learning_rate": 7.548880026080956e-05, + "loss": 1.1551, + "step": 413 + }, + { + "epoch": 0.06526786482070686, + "grad_norm": 0.96484375, + "learning_rate": 7.548433698425598e-05, + "loss": 0.9827, + "step": 414 + }, + { + "epoch": 0.06542551666809987, + "grad_norm": 1.03125, + "learning_rate": 7.547987375966194e-05, + "loss": 1.2771, + "step": 415 + }, + { + "epoch": 0.06558316851549288, + "grad_norm": 0.97265625, + "learning_rate": 7.547541058703692e-05, + "loss": 1.0584, + "step": 416 + }, + { + "epoch": 0.0657408203628859, + "grad_norm": 0.94140625, + "learning_rate": 7.547094746639037e-05, + "loss": 1.2938, + "step": 417 + }, + { + "epoch": 0.0658984722102789, + "grad_norm": 0.9453125, + "learning_rate": 7.546648439773175e-05, + "loss": 1.1955, + "step": 418 + }, + { + "epoch": 0.06605612405767192, + "grad_norm": 1.03125, + "learning_rate": 7.546202138107052e-05, + "loss": 1.2908, + "step": 419 + }, + { + "epoch": 0.06621377590506493, + "grad_norm": 1.1015625, + "learning_rate": 7.545755841641607e-05, + "loss": 1.3614, + "step": 420 + }, + { + "epoch": 0.06637142775245794, + "grad_norm": 1.078125, + "learning_rate": 7.545309550377799e-05, + "loss": 1.1311, + "step": 421 + }, + { + "epoch": 0.06652907959985095, + "grad_norm": 1.0625, + "learning_rate": 7.544863264316566e-05, + "loss": 1.3877, + "step": 422 + }, + { + "epoch": 0.06668673144724396, + "grad_norm": 0.99609375, + "learning_rate": 7.544416983458855e-05, + "loss": 1.2271, + "step": 423 + }, + { + "epoch": 0.06684438329463697, + "grad_norm": 1.0234375, + "learning_rate": 7.54397070780561e-05, + "loss": 1.3025, + "step": 424 + }, + { + "epoch": 0.06700203514202999, + "grad_norm": 0.890625, + "learning_rate": 7.54352443735778e-05, + "loss": 0.9175, + "step": 425 + }, + { + "epoch": 0.067159686989423, + "grad_norm": 0.9609375, + "learning_rate": 7.543078172116309e-05, + "loss": 1.3899, + "step": 426 + }, + { + "epoch": 0.06731733883681601, + "grad_norm": 0.98828125, + "learning_rate": 7.542631912082145e-05, + "loss": 1.2967, + "step": 427 + }, + { + "epoch": 0.06747499068420902, + "grad_norm": 0.9453125, + "learning_rate": 7.54218565725623e-05, + "loss": 1.2848, + "step": 428 + }, + { + "epoch": 0.06763264253160203, + "grad_norm": 1.125, + "learning_rate": 7.541739407639516e-05, + "loss": 1.3587, + "step": 429 + }, + { + "epoch": 0.06779029437899504, + "grad_norm": 1.125, + "learning_rate": 7.541293163232938e-05, + "loss": 1.3439, + "step": 430 + }, + { + "epoch": 0.06794794622638806, + "grad_norm": 1.015625, + "learning_rate": 7.540846924037452e-05, + "loss": 1.2732, + "step": 431 + }, + { + "epoch": 0.06810559807378107, + "grad_norm": 0.87109375, + "learning_rate": 7.540400690054002e-05, + "loss": 1.112, + "step": 432 + }, + { + "epoch": 0.06826324992117408, + "grad_norm": 1.0078125, + "learning_rate": 7.539954461283531e-05, + "loss": 1.2712, + "step": 433 + }, + { + "epoch": 0.06842090176856709, + "grad_norm": 1.125, + "learning_rate": 7.539508237726986e-05, + "loss": 1.3191, + "step": 434 + }, + { + "epoch": 0.0685785536159601, + "grad_norm": 1.0234375, + "learning_rate": 7.53906201938531e-05, + "loss": 1.14, + "step": 435 + }, + { + "epoch": 0.06873620546335311, + "grad_norm": 0.84765625, + "learning_rate": 7.538615806259455e-05, + "loss": 0.9618, + "step": 436 + }, + { + "epoch": 0.06889385731074613, + "grad_norm": 1.28125, + "learning_rate": 7.538169598350362e-05, + "loss": 1.4627, + "step": 437 + }, + { + "epoch": 0.06905150915813914, + "grad_norm": 1.109375, + "learning_rate": 7.537723395658979e-05, + "loss": 1.1721, + "step": 438 + }, + { + "epoch": 0.06920916100553215, + "grad_norm": 1.015625, + "learning_rate": 7.53727719818625e-05, + "loss": 1.2041, + "step": 439 + }, + { + "epoch": 0.06936681285292516, + "grad_norm": 1.0546875, + "learning_rate": 7.536831005933116e-05, + "loss": 1.144, + "step": 440 + }, + { + "epoch": 0.06952446470031817, + "grad_norm": 1.03125, + "learning_rate": 7.536384818900534e-05, + "loss": 1.3357, + "step": 441 + }, + { + "epoch": 0.06968211654771118, + "grad_norm": 0.96875, + "learning_rate": 7.535938637089442e-05, + "loss": 1.1101, + "step": 442 + }, + { + "epoch": 0.0698397683951042, + "grad_norm": 0.9453125, + "learning_rate": 7.535492460500788e-05, + "loss": 1.4905, + "step": 443 + }, + { + "epoch": 0.06999742024249721, + "grad_norm": 0.9921875, + "learning_rate": 7.535046289135517e-05, + "loss": 1.2318, + "step": 444 + }, + { + "epoch": 0.07015507208989022, + "grad_norm": 1.015625, + "learning_rate": 7.534600122994572e-05, + "loss": 1.258, + "step": 445 + }, + { + "epoch": 0.07031272393728323, + "grad_norm": 1.0078125, + "learning_rate": 7.534153962078903e-05, + "loss": 1.2017, + "step": 446 + }, + { + "epoch": 0.07047037578467624, + "grad_norm": 0.94921875, + "learning_rate": 7.533707806389456e-05, + "loss": 1.1496, + "step": 447 + }, + { + "epoch": 0.07062802763206925, + "grad_norm": 0.92578125, + "learning_rate": 7.533261655927171e-05, + "loss": 1.1001, + "step": 448 + }, + { + "epoch": 0.07078567947946227, + "grad_norm": 1.0234375, + "learning_rate": 7.532815510692997e-05, + "loss": 1.3017, + "step": 449 + }, + { + "epoch": 0.07094333132685528, + "grad_norm": 1.3671875, + "learning_rate": 7.532369370687879e-05, + "loss": 1.4135, + "step": 450 + }, + { + "epoch": 0.07110098317424829, + "grad_norm": 1.078125, + "learning_rate": 7.531923235912764e-05, + "loss": 1.1465, + "step": 451 + }, + { + "epoch": 0.0712586350216413, + "grad_norm": 1.1328125, + "learning_rate": 7.531477106368597e-05, + "loss": 1.1826, + "step": 452 + }, + { + "epoch": 0.07141628686903431, + "grad_norm": 1.078125, + "learning_rate": 7.531030982056324e-05, + "loss": 1.2396, + "step": 453 + }, + { + "epoch": 0.07157393871642732, + "grad_norm": 0.9296875, + "learning_rate": 7.53058486297689e-05, + "loss": 1.2488, + "step": 454 + }, + { + "epoch": 0.07173159056382034, + "grad_norm": 1.0390625, + "learning_rate": 7.530138749131236e-05, + "loss": 1.3044, + "step": 455 + }, + { + "epoch": 0.07188924241121335, + "grad_norm": 0.98828125, + "learning_rate": 7.529692640520315e-05, + "loss": 1.3514, + "step": 456 + }, + { + "epoch": 0.07204689425860636, + "grad_norm": 0.984375, + "learning_rate": 7.52924653714507e-05, + "loss": 1.2322, + "step": 457 + }, + { + "epoch": 0.07220454610599937, + "grad_norm": 0.91796875, + "learning_rate": 7.528800439006444e-05, + "loss": 1.0827, + "step": 458 + }, + { + "epoch": 0.07236219795339238, + "grad_norm": 1.015625, + "learning_rate": 7.528354346105387e-05, + "loss": 1.2809, + "step": 459 + }, + { + "epoch": 0.0725198498007854, + "grad_norm": 1.0234375, + "learning_rate": 7.527908258442836e-05, + "loss": 1.2996, + "step": 460 + }, + { + "epoch": 0.0726775016481784, + "grad_norm": 0.96875, + "learning_rate": 7.527462176019746e-05, + "loss": 1.1359, + "step": 461 + }, + { + "epoch": 0.07283515349557142, + "grad_norm": 0.9609375, + "learning_rate": 7.52701609883706e-05, + "loss": 1.3207, + "step": 462 + }, + { + "epoch": 0.07299280534296443, + "grad_norm": 1.765625, + "learning_rate": 7.52657002689572e-05, + "loss": 1.4157, + "step": 463 + }, + { + "epoch": 0.07315045719035744, + "grad_norm": 1.046875, + "learning_rate": 7.526123960196676e-05, + "loss": 1.1813, + "step": 464 + }, + { + "epoch": 0.07330810903775045, + "grad_norm": 0.890625, + "learning_rate": 7.525677898740868e-05, + "loss": 1.3201, + "step": 465 + }, + { + "epoch": 0.07346576088514346, + "grad_norm": 0.87109375, + "learning_rate": 7.525231842529244e-05, + "loss": 1.2264, + "step": 466 + }, + { + "epoch": 0.07362341273253648, + "grad_norm": 1.0, + "learning_rate": 7.524785791562752e-05, + "loss": 1.181, + "step": 467 + }, + { + "epoch": 0.07378106457992949, + "grad_norm": 1.046875, + "learning_rate": 7.524339745842332e-05, + "loss": 1.3327, + "step": 468 + }, + { + "epoch": 0.0739387164273225, + "grad_norm": 0.9765625, + "learning_rate": 7.523893705368935e-05, + "loss": 1.3032, + "step": 469 + }, + { + "epoch": 0.07409636827471551, + "grad_norm": 0.98828125, + "learning_rate": 7.523447670143502e-05, + "loss": 1.1823, + "step": 470 + }, + { + "epoch": 0.07425402012210852, + "grad_norm": 0.94921875, + "learning_rate": 7.523001640166983e-05, + "loss": 1.234, + "step": 471 + }, + { + "epoch": 0.07441167196950153, + "grad_norm": 0.95703125, + "learning_rate": 7.522555615440318e-05, + "loss": 1.256, + "step": 472 + }, + { + "epoch": 0.07456932381689454, + "grad_norm": 1.078125, + "learning_rate": 7.522109595964456e-05, + "loss": 1.3037, + "step": 473 + }, + { + "epoch": 0.07472697566428756, + "grad_norm": 0.9453125, + "learning_rate": 7.521663581740341e-05, + "loss": 1.2517, + "step": 474 + }, + { + "epoch": 0.07488462751168057, + "grad_norm": 0.93359375, + "learning_rate": 7.521217572768914e-05, + "loss": 1.2364, + "step": 475 + }, + { + "epoch": 0.07504227935907358, + "grad_norm": 1.0625, + "learning_rate": 7.520771569051129e-05, + "loss": 1.325, + "step": 476 + }, + { + "epoch": 0.07519993120646659, + "grad_norm": 1.015625, + "learning_rate": 7.520325570587927e-05, + "loss": 1.1998, + "step": 477 + }, + { + "epoch": 0.0753575830538596, + "grad_norm": 1.0390625, + "learning_rate": 7.519879577380253e-05, + "loss": 1.2116, + "step": 478 + }, + { + "epoch": 0.07551523490125261, + "grad_norm": 0.98046875, + "learning_rate": 7.519433589429053e-05, + "loss": 1.2443, + "step": 479 + }, + { + "epoch": 0.07567288674864563, + "grad_norm": 1.0234375, + "learning_rate": 7.518987606735267e-05, + "loss": 1.1512, + "step": 480 + }, + { + "epoch": 0.07583053859603864, + "grad_norm": 0.88671875, + "learning_rate": 7.51854162929985e-05, + "loss": 1.0509, + "step": 481 + }, + { + "epoch": 0.07598819044343165, + "grad_norm": 1.0859375, + "learning_rate": 7.518095657123742e-05, + "loss": 1.4288, + "step": 482 + }, + { + "epoch": 0.07614584229082466, + "grad_norm": 1.1171875, + "learning_rate": 7.517649690207888e-05, + "loss": 1.6021, + "step": 483 + }, + { + "epoch": 0.07630349413821767, + "grad_norm": 0.98828125, + "learning_rate": 7.517203728553233e-05, + "loss": 1.1761, + "step": 484 + }, + { + "epoch": 0.07646114598561068, + "grad_norm": 1.0234375, + "learning_rate": 7.51675777216072e-05, + "loss": 1.043, + "step": 485 + }, + { + "epoch": 0.0766187978330037, + "grad_norm": 1.0078125, + "learning_rate": 7.516311821031298e-05, + "loss": 1.1508, + "step": 486 + }, + { + "epoch": 0.07677644968039671, + "grad_norm": 1.0625, + "learning_rate": 7.515865875165915e-05, + "loss": 1.1291, + "step": 487 + }, + { + "epoch": 0.07693410152778972, + "grad_norm": 1.0234375, + "learning_rate": 7.51541993456551e-05, + "loss": 1.2817, + "step": 488 + }, + { + "epoch": 0.07709175337518273, + "grad_norm": 0.94140625, + "learning_rate": 7.514973999231032e-05, + "loss": 1.1202, + "step": 489 + }, + { + "epoch": 0.07724940522257574, + "grad_norm": 1.03125, + "learning_rate": 7.514528069163423e-05, + "loss": 1.319, + "step": 490 + }, + { + "epoch": 0.07740705706996875, + "grad_norm": 1.0390625, + "learning_rate": 7.514082144363629e-05, + "loss": 1.3601, + "step": 491 + }, + { + "epoch": 0.07756470891736177, + "grad_norm": 0.98828125, + "learning_rate": 7.513636224832597e-05, + "loss": 1.2541, + "step": 492 + }, + { + "epoch": 0.07772236076475478, + "grad_norm": 1.078125, + "learning_rate": 7.513190310571271e-05, + "loss": 1.4006, + "step": 493 + }, + { + "epoch": 0.07788001261214779, + "grad_norm": 0.87890625, + "learning_rate": 7.512744401580598e-05, + "loss": 1.0685, + "step": 494 + }, + { + "epoch": 0.0780376644595408, + "grad_norm": 0.94921875, + "learning_rate": 7.512298497861514e-05, + "loss": 1.4167, + "step": 495 + }, + { + "epoch": 0.07819531630693381, + "grad_norm": 0.984375, + "learning_rate": 7.511852599414977e-05, + "loss": 1.0967, + "step": 496 + }, + { + "epoch": 0.07835296815432682, + "grad_norm": 1.1015625, + "learning_rate": 7.511406706241924e-05, + "loss": 1.4225, + "step": 497 + }, + { + "epoch": 0.07851062000171984, + "grad_norm": 1.40625, + "learning_rate": 7.510960818343305e-05, + "loss": 1.3862, + "step": 498 + }, + { + "epoch": 0.07866827184911285, + "grad_norm": 1.0859375, + "learning_rate": 7.51051493572006e-05, + "loss": 1.119, + "step": 499 + }, + { + "epoch": 0.07882592369650586, + "grad_norm": 1.140625, + "learning_rate": 7.510069058373135e-05, + "loss": 1.2952, + "step": 500 + }, + { + "epoch": 0.07898357554389887, + "grad_norm": 1.078125, + "learning_rate": 7.509623186303478e-05, + "loss": 1.5034, + "step": 501 + }, + { + "epoch": 0.07914122739129188, + "grad_norm": 0.94921875, + "learning_rate": 7.509177319512033e-05, + "loss": 1.0635, + "step": 502 + }, + { + "epoch": 0.0792988792386849, + "grad_norm": 0.99609375, + "learning_rate": 7.508731457999745e-05, + "loss": 1.1524, + "step": 503 + }, + { + "epoch": 0.0794565310860779, + "grad_norm": 0.97265625, + "learning_rate": 7.508285601767558e-05, + "loss": 1.2952, + "step": 504 + }, + { + "epoch": 0.07961418293347092, + "grad_norm": 0.97265625, + "learning_rate": 7.507839750816412e-05, + "loss": 1.374, + "step": 505 + }, + { + "epoch": 0.07977183478086393, + "grad_norm": 1.1015625, + "learning_rate": 7.507393905147261e-05, + "loss": 1.2584, + "step": 506 + }, + { + "epoch": 0.07992948662825694, + "grad_norm": 0.97265625, + "learning_rate": 7.506948064761048e-05, + "loss": 1.0858, + "step": 507 + }, + { + "epoch": 0.08008713847564995, + "grad_norm": 1.03125, + "learning_rate": 7.506502229658716e-05, + "loss": 1.4086, + "step": 508 + }, + { + "epoch": 0.08024479032304296, + "grad_norm": 1.0703125, + "learning_rate": 7.506056399841209e-05, + "loss": 1.4715, + "step": 509 + }, + { + "epoch": 0.08040244217043598, + "grad_norm": 1.03125, + "learning_rate": 7.505610575309472e-05, + "loss": 1.3469, + "step": 510 + }, + { + "epoch": 0.08056009401782899, + "grad_norm": 0.984375, + "learning_rate": 7.505164756064451e-05, + "loss": 1.1889, + "step": 511 + }, + { + "epoch": 0.080717745865222, + "grad_norm": 1.078125, + "learning_rate": 7.504718942107091e-05, + "loss": 1.5048, + "step": 512 + }, + { + "epoch": 0.08087539771261501, + "grad_norm": 0.890625, + "learning_rate": 7.504273133438337e-05, + "loss": 1.322, + "step": 513 + }, + { + "epoch": 0.08103304956000802, + "grad_norm": 1.0234375, + "learning_rate": 7.503827330059134e-05, + "loss": 1.2944, + "step": 514 + }, + { + "epoch": 0.08119070140740103, + "grad_norm": 1.078125, + "learning_rate": 7.503381531970427e-05, + "loss": 1.2741, + "step": 515 + }, + { + "epoch": 0.08134835325479405, + "grad_norm": 1.015625, + "learning_rate": 7.502935739173154e-05, + "loss": 1.4, + "step": 516 + }, + { + "epoch": 0.08150600510218706, + "grad_norm": 1.015625, + "learning_rate": 7.502489951668272e-05, + "loss": 1.3554, + "step": 517 + }, + { + "epoch": 0.08166365694958007, + "grad_norm": 1.0078125, + "learning_rate": 7.502044169456719e-05, + "loss": 1.2887, + "step": 518 + }, + { + "epoch": 0.08182130879697308, + "grad_norm": 1.03125, + "learning_rate": 7.50159839253944e-05, + "loss": 1.1278, + "step": 519 + }, + { + "epoch": 0.08197896064436609, + "grad_norm": 1.03125, + "learning_rate": 7.501152620917383e-05, + "loss": 1.2579, + "step": 520 + }, + { + "epoch": 0.0821366124917591, + "grad_norm": 1.0078125, + "learning_rate": 7.500706854591483e-05, + "loss": 1.3041, + "step": 521 + }, + { + "epoch": 0.08229426433915212, + "grad_norm": 0.82421875, + "learning_rate": 7.500261093562697e-05, + "loss": 0.9188, + "step": 522 + }, + { + "epoch": 0.08245191618654513, + "grad_norm": 1.0078125, + "learning_rate": 7.499815337831966e-05, + "loss": 1.1688, + "step": 523 + }, + { + "epoch": 0.08260956803393814, + "grad_norm": 0.9921875, + "learning_rate": 7.49936958740023e-05, + "loss": 1.103, + "step": 524 + }, + { + "epoch": 0.08276721988133115, + "grad_norm": 1.046875, + "learning_rate": 7.498923842268441e-05, + "loss": 1.1761, + "step": 525 + }, + { + "epoch": 0.08292487172872416, + "grad_norm": 1.0078125, + "learning_rate": 7.498478102437534e-05, + "loss": 1.1771, + "step": 526 + }, + { + "epoch": 0.08308252357611717, + "grad_norm": 0.9375, + "learning_rate": 7.498032367908465e-05, + "loss": 1.1592, + "step": 527 + }, + { + "epoch": 0.08324017542351018, + "grad_norm": 1.0078125, + "learning_rate": 7.497586638682172e-05, + "loss": 1.4756, + "step": 528 + }, + { + "epoch": 0.0833978272709032, + "grad_norm": 0.9921875, + "learning_rate": 7.497140914759602e-05, + "loss": 1.3079, + "step": 529 + }, + { + "epoch": 0.08355547911829621, + "grad_norm": 1.0703125, + "learning_rate": 7.496695196141699e-05, + "loss": 1.2843, + "step": 530 + }, + { + "epoch": 0.08371313096568922, + "grad_norm": 0.921875, + "learning_rate": 7.496249482829405e-05, + "loss": 1.2504, + "step": 531 + }, + { + "epoch": 0.08387078281308223, + "grad_norm": 1.046875, + "learning_rate": 7.495803774823669e-05, + "loss": 1.2643, + "step": 532 + }, + { + "epoch": 0.08402843466047524, + "grad_norm": 1.0078125, + "learning_rate": 7.495358072125434e-05, + "loss": 0.9953, + "step": 533 + }, + { + "epoch": 0.08418608650786825, + "grad_norm": 0.984375, + "learning_rate": 7.494912374735643e-05, + "loss": 0.9915, + "step": 534 + }, + { + "epoch": 0.08434373835526127, + "grad_norm": 0.859375, + "learning_rate": 7.494466682655241e-05, + "loss": 1.065, + "step": 535 + }, + { + "epoch": 0.08450139020265428, + "grad_norm": 0.86328125, + "learning_rate": 7.494020995885175e-05, + "loss": 1.1784, + "step": 536 + }, + { + "epoch": 0.08465904205004729, + "grad_norm": 1.0078125, + "learning_rate": 7.493575314426389e-05, + "loss": 1.0711, + "step": 537 + }, + { + "epoch": 0.0848166938974403, + "grad_norm": 1.0546875, + "learning_rate": 7.493129638279827e-05, + "loss": 1.2725, + "step": 538 + }, + { + "epoch": 0.08497434574483331, + "grad_norm": 0.9375, + "learning_rate": 7.492683967446434e-05, + "loss": 1.114, + "step": 539 + }, + { + "epoch": 0.08513199759222632, + "grad_norm": 0.98046875, + "learning_rate": 7.492238301927153e-05, + "loss": 1.2806, + "step": 540 + }, + { + "epoch": 0.08528964943961934, + "grad_norm": 1.125, + "learning_rate": 7.491792641722925e-05, + "loss": 1.1968, + "step": 541 + }, + { + "epoch": 0.08544730128701235, + "grad_norm": 0.984375, + "learning_rate": 7.491346986834705e-05, + "loss": 1.2989, + "step": 542 + }, + { + "epoch": 0.08560495313440536, + "grad_norm": 1.0390625, + "learning_rate": 7.490901337263432e-05, + "loss": 1.4634, + "step": 543 + }, + { + "epoch": 0.08576260498179837, + "grad_norm": 1.09375, + "learning_rate": 7.490455693010048e-05, + "loss": 1.2581, + "step": 544 + }, + { + "epoch": 0.08592025682919138, + "grad_norm": 0.99609375, + "learning_rate": 7.490010054075502e-05, + "loss": 1.2082, + "step": 545 + }, + { + "epoch": 0.0860779086765844, + "grad_norm": 1.0859375, + "learning_rate": 7.489564420460731e-05, + "loss": 1.4202, + "step": 546 + }, + { + "epoch": 0.0862355605239774, + "grad_norm": 0.98828125, + "learning_rate": 7.489118792166687e-05, + "loss": 1.2088, + "step": 547 + }, + { + "epoch": 0.08639321237137043, + "grad_norm": 1.0234375, + "learning_rate": 7.488673169194316e-05, + "loss": 1.2706, + "step": 548 + }, + { + "epoch": 0.08655086421876344, + "grad_norm": 1.03125, + "learning_rate": 7.488227551544556e-05, + "loss": 1.1202, + "step": 549 + }, + { + "epoch": 0.08670851606615645, + "grad_norm": 1.0546875, + "learning_rate": 7.487781939218355e-05, + "loss": 1.5892, + "step": 550 + }, + { + "epoch": 0.08686616791354947, + "grad_norm": 0.9921875, + "learning_rate": 7.487336332216654e-05, + "loss": 1.1621, + "step": 551 + }, + { + "epoch": 0.08702381976094248, + "grad_norm": 1.078125, + "learning_rate": 7.486890730540403e-05, + "loss": 1.1864, + "step": 552 + }, + { + "epoch": 0.08718147160833549, + "grad_norm": 0.890625, + "learning_rate": 7.486445134190541e-05, + "loss": 1.156, + "step": 553 + }, + { + "epoch": 0.0873391234557285, + "grad_norm": 0.88671875, + "learning_rate": 7.485999543168017e-05, + "loss": 1.0731, + "step": 554 + }, + { + "epoch": 0.08749677530312151, + "grad_norm": 1.140625, + "learning_rate": 7.485553957473771e-05, + "loss": 1.4862, + "step": 555 + }, + { + "epoch": 0.08765442715051452, + "grad_norm": 1.015625, + "learning_rate": 7.48510837710875e-05, + "loss": 1.1578, + "step": 556 + }, + { + "epoch": 0.08781207899790754, + "grad_norm": 0.99609375, + "learning_rate": 7.4846628020739e-05, + "loss": 1.3176, + "step": 557 + }, + { + "epoch": 0.08796973084530055, + "grad_norm": 0.90625, + "learning_rate": 7.484217232370163e-05, + "loss": 1.1076, + "step": 558 + }, + { + "epoch": 0.08812738269269356, + "grad_norm": 0.9140625, + "learning_rate": 7.483771667998485e-05, + "loss": 1.0937, + "step": 559 + }, + { + "epoch": 0.08828503454008657, + "grad_norm": 1.03125, + "learning_rate": 7.483326108959807e-05, + "loss": 1.1822, + "step": 560 + }, + { + "epoch": 0.08844268638747958, + "grad_norm": 1.078125, + "learning_rate": 7.482880555255073e-05, + "loss": 1.2227, + "step": 561 + }, + { + "epoch": 0.0886003382348726, + "grad_norm": 1.0234375, + "learning_rate": 7.482435006885234e-05, + "loss": 1.2702, + "step": 562 + }, + { + "epoch": 0.0887579900822656, + "grad_norm": 0.98828125, + "learning_rate": 7.481989463851228e-05, + "loss": 1.1449, + "step": 563 + }, + { + "epoch": 0.08891564192965862, + "grad_norm": 1.0390625, + "learning_rate": 7.481543926154005e-05, + "loss": 1.3391, + "step": 564 + }, + { + "epoch": 0.08907329377705163, + "grad_norm": 0.9296875, + "learning_rate": 7.481098393794502e-05, + "loss": 1.2093, + "step": 565 + }, + { + "epoch": 0.08923094562444464, + "grad_norm": 0.99609375, + "learning_rate": 7.480652866773665e-05, + "loss": 1.3015, + "step": 566 + }, + { + "epoch": 0.08938859747183765, + "grad_norm": 0.9453125, + "learning_rate": 7.480207345092443e-05, + "loss": 1.1732, + "step": 567 + }, + { + "epoch": 0.08954624931923066, + "grad_norm": 0.9921875, + "learning_rate": 7.479761828751779e-05, + "loss": 1.1943, + "step": 568 + }, + { + "epoch": 0.08970390116662368, + "grad_norm": 0.92578125, + "learning_rate": 7.479316317752615e-05, + "loss": 1.0889, + "step": 569 + }, + { + "epoch": 0.08986155301401669, + "grad_norm": 0.9296875, + "learning_rate": 7.478870812095895e-05, + "loss": 1.3222, + "step": 570 + }, + { + "epoch": 0.0900192048614097, + "grad_norm": 1.0234375, + "learning_rate": 7.478425311782564e-05, + "loss": 1.2801, + "step": 571 + }, + { + "epoch": 0.09017685670880271, + "grad_norm": 0.87890625, + "learning_rate": 7.477979816813565e-05, + "loss": 0.9339, + "step": 572 + }, + { + "epoch": 0.09033450855619572, + "grad_norm": 0.94921875, + "learning_rate": 7.477534327189847e-05, + "loss": 1.2102, + "step": 573 + }, + { + "epoch": 0.09049216040358873, + "grad_norm": 0.8515625, + "learning_rate": 7.47708884291235e-05, + "loss": 1.1488, + "step": 574 + }, + { + "epoch": 0.09064981225098175, + "grad_norm": 1.0078125, + "learning_rate": 7.476643363982019e-05, + "loss": 1.1672, + "step": 575 + }, + { + "epoch": 0.09080746409837476, + "grad_norm": 1.0, + "learning_rate": 7.476197890399797e-05, + "loss": 1.1126, + "step": 576 + }, + { + "epoch": 0.09096511594576777, + "grad_norm": 0.9921875, + "learning_rate": 7.47575242216663e-05, + "loss": 1.2595, + "step": 577 + }, + { + "epoch": 0.09112276779316078, + "grad_norm": 1.015625, + "learning_rate": 7.475306959283461e-05, + "loss": 1.4458, + "step": 578 + }, + { + "epoch": 0.09128041964055379, + "grad_norm": 0.96484375, + "learning_rate": 7.474861501751236e-05, + "loss": 1.3733, + "step": 579 + }, + { + "epoch": 0.0914380714879468, + "grad_norm": 0.98828125, + "learning_rate": 7.474416049570898e-05, + "loss": 1.2634, + "step": 580 + }, + { + "epoch": 0.09159572333533982, + "grad_norm": 0.91015625, + "learning_rate": 7.473970602743385e-05, + "loss": 1.1116, + "step": 581 + }, + { + "epoch": 0.09175337518273283, + "grad_norm": 0.9140625, + "learning_rate": 7.473525161269653e-05, + "loss": 1.2457, + "step": 582 + }, + { + "epoch": 0.09191102703012584, + "grad_norm": 1.09375, + "learning_rate": 7.47307972515064e-05, + "loss": 1.0115, + "step": 583 + }, + { + "epoch": 0.09206867887751885, + "grad_norm": 1.109375, + "learning_rate": 7.472634294387289e-05, + "loss": 1.1516, + "step": 584 + }, + { + "epoch": 0.09222633072491186, + "grad_norm": 1.046875, + "learning_rate": 7.472188868980545e-05, + "loss": 1.3839, + "step": 585 + }, + { + "epoch": 0.09238398257230487, + "grad_norm": 0.96484375, + "learning_rate": 7.471743448931347e-05, + "loss": 1.1989, + "step": 586 + }, + { + "epoch": 0.09254163441969789, + "grad_norm": 1.1015625, + "learning_rate": 7.471298034240651e-05, + "loss": 1.3703, + "step": 587 + }, + { + "epoch": 0.0926992862670909, + "grad_norm": 1.0546875, + "learning_rate": 7.470852624909393e-05, + "loss": 1.2874, + "step": 588 + }, + { + "epoch": 0.09285693811448391, + "grad_norm": 0.94921875, + "learning_rate": 7.47040722093852e-05, + "loss": 1.1757, + "step": 589 + }, + { + "epoch": 0.09301458996187692, + "grad_norm": 0.8046875, + "learning_rate": 7.469961822328972e-05, + "loss": 0.9003, + "step": 590 + }, + { + "epoch": 0.09317224180926993, + "grad_norm": 1.0703125, + "learning_rate": 7.469516429081692e-05, + "loss": 1.4065, + "step": 591 + }, + { + "epoch": 0.09332989365666294, + "grad_norm": 0.9375, + "learning_rate": 7.469071041197631e-05, + "loss": 1.1492, + "step": 592 + }, + { + "epoch": 0.09348754550405595, + "grad_norm": 0.91015625, + "learning_rate": 7.46862565867773e-05, + "loss": 1.1621, + "step": 593 + }, + { + "epoch": 0.09364519735144897, + "grad_norm": 0.9921875, + "learning_rate": 7.468180281522932e-05, + "loss": 1.1522, + "step": 594 + }, + { + "epoch": 0.09380284919884198, + "grad_norm": 0.92578125, + "learning_rate": 7.46773490973418e-05, + "loss": 1.2551, + "step": 595 + }, + { + "epoch": 0.09396050104623499, + "grad_norm": 1.1484375, + "learning_rate": 7.467289543312419e-05, + "loss": 1.2196, + "step": 596 + }, + { + "epoch": 0.094118152893628, + "grad_norm": 1.1015625, + "learning_rate": 7.466844182258595e-05, + "loss": 1.2214, + "step": 597 + }, + { + "epoch": 0.09427580474102101, + "grad_norm": 0.96484375, + "learning_rate": 7.466398826573648e-05, + "loss": 1.3309, + "step": 598 + }, + { + "epoch": 0.09443345658841402, + "grad_norm": 0.96484375, + "learning_rate": 7.465953476258525e-05, + "loss": 1.1182, + "step": 599 + }, + { + "epoch": 0.09459110843580704, + "grad_norm": 0.98828125, + "learning_rate": 7.46550813131417e-05, + "loss": 1.2693, + "step": 600 + }, + { + "epoch": 0.09474876028320005, + "grad_norm": 0.93359375, + "learning_rate": 7.465062791741519e-05, + "loss": 1.038, + "step": 601 + }, + { + "epoch": 0.09490641213059306, + "grad_norm": 0.95703125, + "learning_rate": 7.464617457541527e-05, + "loss": 1.0602, + "step": 602 + }, + { + "epoch": 0.09506406397798607, + "grad_norm": 1.015625, + "learning_rate": 7.464172128715135e-05, + "loss": 1.126, + "step": 603 + }, + { + "epoch": 0.09522171582537908, + "grad_norm": 0.984375, + "learning_rate": 7.463726805263285e-05, + "loss": 1.2795, + "step": 604 + }, + { + "epoch": 0.0953793676727721, + "grad_norm": 1.0703125, + "learning_rate": 7.46328148718692e-05, + "loss": 1.4452, + "step": 605 + }, + { + "epoch": 0.0955370195201651, + "grad_norm": 1.0078125, + "learning_rate": 7.462836174486982e-05, + "loss": 1.1347, + "step": 606 + }, + { + "epoch": 0.09569467136755812, + "grad_norm": 1.0546875, + "learning_rate": 7.462390867164422e-05, + "loss": 1.3051, + "step": 607 + }, + { + "epoch": 0.09585232321495113, + "grad_norm": 1.1015625, + "learning_rate": 7.461945565220179e-05, + "loss": 1.339, + "step": 608 + }, + { + "epoch": 0.09600997506234414, + "grad_norm": 1.0, + "learning_rate": 7.461500268655197e-05, + "loss": 1.2509, + "step": 609 + }, + { + "epoch": 0.09616762690973715, + "grad_norm": 1.0625, + "learning_rate": 7.461054977470419e-05, + "loss": 1.4698, + "step": 610 + }, + { + "epoch": 0.09632527875713016, + "grad_norm": 0.859375, + "learning_rate": 7.460609691666787e-05, + "loss": 1.1182, + "step": 611 + }, + { + "epoch": 0.09648293060452318, + "grad_norm": 0.9453125, + "learning_rate": 7.460164411245252e-05, + "loss": 1.0848, + "step": 612 + }, + { + "epoch": 0.09664058245191619, + "grad_norm": 1.0625, + "learning_rate": 7.459719136206753e-05, + "loss": 1.1935, + "step": 613 + }, + { + "epoch": 0.0967982342993092, + "grad_norm": 0.98828125, + "learning_rate": 7.459273866552234e-05, + "loss": 1.1229, + "step": 614 + }, + { + "epoch": 0.09695588614670221, + "grad_norm": 0.93359375, + "learning_rate": 7.458828602282639e-05, + "loss": 1.2689, + "step": 615 + }, + { + "epoch": 0.09711353799409522, + "grad_norm": 0.984375, + "learning_rate": 7.458383343398909e-05, + "loss": 1.3101, + "step": 616 + }, + { + "epoch": 0.09727118984148823, + "grad_norm": 1.09375, + "learning_rate": 7.457938089901992e-05, + "loss": 1.3334, + "step": 617 + }, + { + "epoch": 0.09742884168888125, + "grad_norm": 1.140625, + "learning_rate": 7.457492841792832e-05, + "loss": 1.3037, + "step": 618 + }, + { + "epoch": 0.09758649353627426, + "grad_norm": 0.96875, + "learning_rate": 7.457047599072368e-05, + "loss": 1.1142, + "step": 619 + }, + { + "epoch": 0.09774414538366727, + "grad_norm": 0.99609375, + "learning_rate": 7.456602361741547e-05, + "loss": 1.1003, + "step": 620 + }, + { + "epoch": 0.09790179723106028, + "grad_norm": 1.015625, + "learning_rate": 7.45615712980131e-05, + "loss": 1.4077, + "step": 621 + }, + { + "epoch": 0.09805944907845329, + "grad_norm": 0.85546875, + "learning_rate": 7.455711903252605e-05, + "loss": 0.9765, + "step": 622 + }, + { + "epoch": 0.0982171009258463, + "grad_norm": 1.0234375, + "learning_rate": 7.455266682096374e-05, + "loss": 1.2055, + "step": 623 + }, + { + "epoch": 0.09837475277323932, + "grad_norm": 1.1328125, + "learning_rate": 7.454821466333557e-05, + "loss": 1.1113, + "step": 624 + }, + { + "epoch": 0.09853240462063233, + "grad_norm": 0.9296875, + "learning_rate": 7.454376255965102e-05, + "loss": 1.0605, + "step": 625 + }, + { + "epoch": 0.09869005646802534, + "grad_norm": 1.0234375, + "learning_rate": 7.453931050991948e-05, + "loss": 1.1806, + "step": 626 + }, + { + "epoch": 0.09884770831541835, + "grad_norm": 0.98046875, + "learning_rate": 7.453485851415045e-05, + "loss": 1.1077, + "step": 627 + }, + { + "epoch": 0.09900536016281136, + "grad_norm": 0.96484375, + "learning_rate": 7.453040657235333e-05, + "loss": 1.4821, + "step": 628 + }, + { + "epoch": 0.09916301201020437, + "grad_norm": 0.859375, + "learning_rate": 7.452595468453756e-05, + "loss": 1.0574, + "step": 629 + }, + { + "epoch": 0.09932066385759739, + "grad_norm": 0.9296875, + "learning_rate": 7.452150285071258e-05, + "loss": 1.1567, + "step": 630 + }, + { + "epoch": 0.0994783157049904, + "grad_norm": 0.97265625, + "learning_rate": 7.451705107088777e-05, + "loss": 1.3568, + "step": 631 + }, + { + "epoch": 0.09963596755238341, + "grad_norm": 0.88671875, + "learning_rate": 7.451259934507266e-05, + "loss": 1.2442, + "step": 632 + }, + { + "epoch": 0.09979361939977642, + "grad_norm": 0.96875, + "learning_rate": 7.450814767327663e-05, + "loss": 1.0539, + "step": 633 + }, + { + "epoch": 0.09995127124716943, + "grad_norm": 0.88671875, + "learning_rate": 7.450369605550913e-05, + "loss": 1.1088, + "step": 634 + }, + { + "epoch": 0.10010892309456244, + "grad_norm": 1.0078125, + "learning_rate": 7.449924449177958e-05, + "loss": 1.0329, + "step": 635 + }, + { + "epoch": 0.10026657494195546, + "grad_norm": 1.171875, + "learning_rate": 7.449479298209741e-05, + "loss": 1.3813, + "step": 636 + }, + { + "epoch": 0.10042422678934847, + "grad_norm": 0.98046875, + "learning_rate": 7.44903415264721e-05, + "loss": 1.2364, + "step": 637 + }, + { + "epoch": 0.10058187863674148, + "grad_norm": 1.0, + "learning_rate": 7.448589012491303e-05, + "loss": 1.3175, + "step": 638 + }, + { + "epoch": 0.10073953048413449, + "grad_norm": 0.9609375, + "learning_rate": 7.448143877742965e-05, + "loss": 1.0257, + "step": 639 + }, + { + "epoch": 0.1008971823315275, + "grad_norm": 1.0, + "learning_rate": 7.447698748403142e-05, + "loss": 1.3068, + "step": 640 + }, + { + "epoch": 0.10105483417892051, + "grad_norm": 0.9609375, + "learning_rate": 7.447253624472774e-05, + "loss": 1.3199, + "step": 641 + }, + { + "epoch": 0.10121248602631353, + "grad_norm": 0.87890625, + "learning_rate": 7.446808505952807e-05, + "loss": 1.1259, + "step": 642 + }, + { + "epoch": 0.10137013787370654, + "grad_norm": 0.90625, + "learning_rate": 7.446363392844184e-05, + "loss": 1.1717, + "step": 643 + }, + { + "epoch": 0.10152778972109955, + "grad_norm": 1.0, + "learning_rate": 7.445918285147849e-05, + "loss": 1.4615, + "step": 644 + }, + { + "epoch": 0.10168544156849256, + "grad_norm": 1.03125, + "learning_rate": 7.445473182864744e-05, + "loss": 1.403, + "step": 645 + }, + { + "epoch": 0.10184309341588557, + "grad_norm": 1.0703125, + "learning_rate": 7.445028085995806e-05, + "loss": 1.1854, + "step": 646 + }, + { + "epoch": 0.10200074526327858, + "grad_norm": 0.91015625, + "learning_rate": 7.444582994541992e-05, + "loss": 1.0989, + "step": 647 + }, + { + "epoch": 0.1021583971106716, + "grad_norm": 1.0390625, + "learning_rate": 7.444137908504236e-05, + "loss": 1.4107, + "step": 648 + }, + { + "epoch": 0.1023160489580646, + "grad_norm": 0.99609375, + "learning_rate": 7.443692827883487e-05, + "loss": 1.2837, + "step": 649 + }, + { + "epoch": 0.10247370080545762, + "grad_norm": 0.9296875, + "learning_rate": 7.443247752680682e-05, + "loss": 1.1837, + "step": 650 + }, + { + "epoch": 0.10263135265285063, + "grad_norm": 0.95703125, + "learning_rate": 7.442802682896768e-05, + "loss": 1.2619, + "step": 651 + }, + { + "epoch": 0.10278900450024364, + "grad_norm": 0.9453125, + "learning_rate": 7.442357618532683e-05, + "loss": 1.335, + "step": 652 + }, + { + "epoch": 0.10294665634763665, + "grad_norm": 1.046875, + "learning_rate": 7.441912559589379e-05, + "loss": 1.0828, + "step": 653 + }, + { + "epoch": 0.10310430819502966, + "grad_norm": 0.99609375, + "learning_rate": 7.441467506067795e-05, + "loss": 1.2063, + "step": 654 + }, + { + "epoch": 0.10326196004242268, + "grad_norm": 1.0234375, + "learning_rate": 7.441022457968875e-05, + "loss": 1.2698, + "step": 655 + }, + { + "epoch": 0.10341961188981569, + "grad_norm": 0.97265625, + "learning_rate": 7.440577415293562e-05, + "loss": 1.1495, + "step": 656 + }, + { + "epoch": 0.1035772637372087, + "grad_norm": 0.90234375, + "learning_rate": 7.440132378042793e-05, + "loss": 1.1802, + "step": 657 + }, + { + "epoch": 0.10373491558460171, + "grad_norm": 1.03125, + "learning_rate": 7.439687346217523e-05, + "loss": 1.2465, + "step": 658 + }, + { + "epoch": 0.10389256743199472, + "grad_norm": 1.0, + "learning_rate": 7.439242319818689e-05, + "loss": 1.1662, + "step": 659 + }, + { + "epoch": 0.10405021927938773, + "grad_norm": 0.984375, + "learning_rate": 7.438797298847234e-05, + "loss": 1.1692, + "step": 660 + }, + { + "epoch": 0.10420787112678075, + "grad_norm": 0.91015625, + "learning_rate": 7.438352283304102e-05, + "loss": 1.0785, + "step": 661 + }, + { + "epoch": 0.10436552297417376, + "grad_norm": 0.92578125, + "learning_rate": 7.437907273190234e-05, + "loss": 0.9894, + "step": 662 + }, + { + "epoch": 0.10452317482156677, + "grad_norm": 1.0, + "learning_rate": 7.437462268506576e-05, + "loss": 1.2955, + "step": 663 + }, + { + "epoch": 0.10468082666895978, + "grad_norm": 1.0390625, + "learning_rate": 7.437017269254072e-05, + "loss": 1.5138, + "step": 664 + }, + { + "epoch": 0.10483847851635279, + "grad_norm": 0.96875, + "learning_rate": 7.436572275433661e-05, + "loss": 1.2694, + "step": 665 + }, + { + "epoch": 0.1049961303637458, + "grad_norm": 0.90234375, + "learning_rate": 7.436127287046293e-05, + "loss": 1.1391, + "step": 666 + }, + { + "epoch": 0.10515378221113882, + "grad_norm": 0.97265625, + "learning_rate": 7.4356823040929e-05, + "loss": 0.9918, + "step": 667 + }, + { + "epoch": 0.10531143405853183, + "grad_norm": 0.89453125, + "learning_rate": 7.435237326574435e-05, + "loss": 0.9228, + "step": 668 + }, + { + "epoch": 0.10546908590592484, + "grad_norm": 0.96484375, + "learning_rate": 7.434792354491837e-05, + "loss": 1.1432, + "step": 669 + }, + { + "epoch": 0.10562673775331785, + "grad_norm": 0.87109375, + "learning_rate": 7.434347387846052e-05, + "loss": 1.178, + "step": 670 + }, + { + "epoch": 0.10578438960071086, + "grad_norm": 0.98828125, + "learning_rate": 7.433902426638021e-05, + "loss": 1.3491, + "step": 671 + }, + { + "epoch": 0.10594204144810387, + "grad_norm": 0.97265625, + "learning_rate": 7.433457470868683e-05, + "loss": 1.0879, + "step": 672 + }, + { + "epoch": 0.10609969329549689, + "grad_norm": 0.95703125, + "learning_rate": 7.43301252053899e-05, + "loss": 1.1116, + "step": 673 + }, + { + "epoch": 0.1062573451428899, + "grad_norm": 0.953125, + "learning_rate": 7.432567575649878e-05, + "loss": 1.2212, + "step": 674 + }, + { + "epoch": 0.10641499699028291, + "grad_norm": 0.9765625, + "learning_rate": 7.432122636202294e-05, + "loss": 1.0424, + "step": 675 + }, + { + "epoch": 0.10657264883767592, + "grad_norm": 0.9375, + "learning_rate": 7.431677702197179e-05, + "loss": 1.3942, + "step": 676 + }, + { + "epoch": 0.10673030068506893, + "grad_norm": 1.15625, + "learning_rate": 7.43123277363547e-05, + "loss": 1.2196, + "step": 677 + }, + { + "epoch": 0.10688795253246194, + "grad_norm": 1.0390625, + "learning_rate": 7.430787850518124e-05, + "loss": 1.3524, + "step": 678 + }, + { + "epoch": 0.10704560437985496, + "grad_norm": 1.0078125, + "learning_rate": 7.430342932846073e-05, + "loss": 1.1995, + "step": 679 + }, + { + "epoch": 0.10720325622724797, + "grad_norm": 0.96484375, + "learning_rate": 7.429898020620266e-05, + "loss": 1.2108, + "step": 680 + }, + { + "epoch": 0.10736090807464098, + "grad_norm": 0.9453125, + "learning_rate": 7.429453113841643e-05, + "loss": 1.3323, + "step": 681 + }, + { + "epoch": 0.10751855992203399, + "grad_norm": 0.9609375, + "learning_rate": 7.429008212511143e-05, + "loss": 1.1557, + "step": 682 + }, + { + "epoch": 0.107676211769427, + "grad_norm": 0.88671875, + "learning_rate": 7.428563316629718e-05, + "loss": 1.1836, + "step": 683 + }, + { + "epoch": 0.10783386361682001, + "grad_norm": 0.9296875, + "learning_rate": 7.428118426198303e-05, + "loss": 1.2797, + "step": 684 + }, + { + "epoch": 0.10799151546421303, + "grad_norm": 0.85546875, + "learning_rate": 7.427673541217847e-05, + "loss": 0.9795, + "step": 685 + }, + { + "epoch": 0.10814916731160604, + "grad_norm": 0.93359375, + "learning_rate": 7.427228661689288e-05, + "loss": 1.197, + "step": 686 + }, + { + "epoch": 0.10830681915899905, + "grad_norm": 0.99609375, + "learning_rate": 7.426783787613567e-05, + "loss": 1.2451, + "step": 687 + }, + { + "epoch": 0.10846447100639206, + "grad_norm": 0.90234375, + "learning_rate": 7.426338918991635e-05, + "loss": 1.1404, + "step": 688 + }, + { + "epoch": 0.10862212285378507, + "grad_norm": 0.9375, + "learning_rate": 7.425894055824432e-05, + "loss": 1.2287, + "step": 689 + }, + { + "epoch": 0.10877977470117808, + "grad_norm": 0.96484375, + "learning_rate": 7.425449198112897e-05, + "loss": 1.201, + "step": 690 + }, + { + "epoch": 0.1089374265485711, + "grad_norm": 1.046875, + "learning_rate": 7.425004345857975e-05, + "loss": 1.6085, + "step": 691 + }, + { + "epoch": 0.1090950783959641, + "grad_norm": 0.9921875, + "learning_rate": 7.424559499060606e-05, + "loss": 1.1462, + "step": 692 + }, + { + "epoch": 0.10925273024335712, + "grad_norm": 1.046875, + "learning_rate": 7.424114657721741e-05, + "loss": 1.2834, + "step": 693 + }, + { + "epoch": 0.10941038209075013, + "grad_norm": 1.15625, + "learning_rate": 7.423669821842318e-05, + "loss": 1.1607, + "step": 694 + }, + { + "epoch": 0.10956803393814314, + "grad_norm": 0.98828125, + "learning_rate": 7.423224991423279e-05, + "loss": 1.134, + "step": 695 + }, + { + "epoch": 0.10972568578553615, + "grad_norm": 1.0078125, + "learning_rate": 7.422780166465566e-05, + "loss": 1.1405, + "step": 696 + }, + { + "epoch": 0.10988333763292917, + "grad_norm": 0.94921875, + "learning_rate": 7.422335346970119e-05, + "loss": 1.1037, + "step": 697 + }, + { + "epoch": 0.11004098948032218, + "grad_norm": 1.1328125, + "learning_rate": 7.421890532937891e-05, + "loss": 1.3617, + "step": 698 + }, + { + "epoch": 0.11019864132771519, + "grad_norm": 1.03125, + "learning_rate": 7.421445724369818e-05, + "loss": 1.3546, + "step": 699 + }, + { + "epoch": 0.1103562931751082, + "grad_norm": 0.94140625, + "learning_rate": 7.421000921266843e-05, + "loss": 1.1407, + "step": 700 + }, + { + "epoch": 0.11051394502250121, + "grad_norm": 1.015625, + "learning_rate": 7.420556123629909e-05, + "loss": 1.129, + "step": 701 + }, + { + "epoch": 0.11067159686989422, + "grad_norm": 1.5078125, + "learning_rate": 7.420111331459957e-05, + "loss": 1.1317, + "step": 702 + }, + { + "epoch": 0.11082924871728723, + "grad_norm": 1.0390625, + "learning_rate": 7.419666544757934e-05, + "loss": 1.2639, + "step": 703 + }, + { + "epoch": 0.11098690056468026, + "grad_norm": 0.97265625, + "learning_rate": 7.419221763524779e-05, + "loss": 1.1172, + "step": 704 + }, + { + "epoch": 0.11114455241207327, + "grad_norm": 1.0, + "learning_rate": 7.418776987761438e-05, + "loss": 1.1648, + "step": 705 + }, + { + "epoch": 0.11130220425946628, + "grad_norm": 0.93359375, + "learning_rate": 7.418332217468849e-05, + "loss": 1.209, + "step": 706 + }, + { + "epoch": 0.1114598561068593, + "grad_norm": 0.9296875, + "learning_rate": 7.417887452647956e-05, + "loss": 1.0799, + "step": 707 + }, + { + "epoch": 0.1116175079542523, + "grad_norm": 0.95703125, + "learning_rate": 7.417442693299705e-05, + "loss": 1.0243, + "step": 708 + }, + { + "epoch": 0.11177515980164532, + "grad_norm": 0.95703125, + "learning_rate": 7.416997939425038e-05, + "loss": 1.3149, + "step": 709 + }, + { + "epoch": 0.11193281164903833, + "grad_norm": 0.828125, + "learning_rate": 7.416553191024895e-05, + "loss": 0.962, + "step": 710 + }, + { + "epoch": 0.11209046349643134, + "grad_norm": 0.9140625, + "learning_rate": 7.41610844810022e-05, + "loss": 1.1477, + "step": 711 + }, + { + "epoch": 0.11224811534382435, + "grad_norm": 0.984375, + "learning_rate": 7.415663710651952e-05, + "loss": 1.1915, + "step": 712 + }, + { + "epoch": 0.11240576719121737, + "grad_norm": 0.9375, + "learning_rate": 7.415218978681042e-05, + "loss": 0.9805, + "step": 713 + }, + { + "epoch": 0.11256341903861038, + "grad_norm": 0.9765625, + "learning_rate": 7.414774252188425e-05, + "loss": 1.1969, + "step": 714 + }, + { + "epoch": 0.11272107088600339, + "grad_norm": 1.015625, + "learning_rate": 7.414329531175049e-05, + "loss": 1.3074, + "step": 715 + }, + { + "epoch": 0.1128787227333964, + "grad_norm": 0.96484375, + "learning_rate": 7.413884815641853e-05, + "loss": 1.3352, + "step": 716 + }, + { + "epoch": 0.11303637458078941, + "grad_norm": 0.8984375, + "learning_rate": 7.413440105589775e-05, + "loss": 1.1258, + "step": 717 + }, + { + "epoch": 0.11319402642818242, + "grad_norm": 1.0234375, + "learning_rate": 7.412995401019768e-05, + "loss": 1.1097, + "step": 718 + }, + { + "epoch": 0.11335167827557543, + "grad_norm": 1.09375, + "learning_rate": 7.41255070193277e-05, + "loss": 1.358, + "step": 719 + }, + { + "epoch": 0.11350933012296845, + "grad_norm": 0.83984375, + "learning_rate": 7.412106008329721e-05, + "loss": 0.8733, + "step": 720 + }, + { + "epoch": 0.11366698197036146, + "grad_norm": 1.0, + "learning_rate": 7.411661320211566e-05, + "loss": 1.1374, + "step": 721 + }, + { + "epoch": 0.11382463381775447, + "grad_norm": 1.0078125, + "learning_rate": 7.411216637579245e-05, + "loss": 1.2428, + "step": 722 + }, + { + "epoch": 0.11398228566514748, + "grad_norm": 1.0625, + "learning_rate": 7.410771960433706e-05, + "loss": 1.0459, + "step": 723 + }, + { + "epoch": 0.11413993751254049, + "grad_norm": 0.96484375, + "learning_rate": 7.410327288775885e-05, + "loss": 1.2101, + "step": 724 + }, + { + "epoch": 0.1142975893599335, + "grad_norm": 0.93359375, + "learning_rate": 7.409882622606728e-05, + "loss": 1.3412, + "step": 725 + }, + { + "epoch": 0.11445524120732652, + "grad_norm": 0.94921875, + "learning_rate": 7.409437961927177e-05, + "loss": 1.1829, + "step": 726 + }, + { + "epoch": 0.11461289305471953, + "grad_norm": 1.0625, + "learning_rate": 7.408993306738173e-05, + "loss": 1.3572, + "step": 727 + }, + { + "epoch": 0.11477054490211254, + "grad_norm": 0.96484375, + "learning_rate": 7.408548657040661e-05, + "loss": 1.1652, + "step": 728 + }, + { + "epoch": 0.11492819674950555, + "grad_norm": 0.99609375, + "learning_rate": 7.408104012835582e-05, + "loss": 1.3395, + "step": 729 + }, + { + "epoch": 0.11508584859689856, + "grad_norm": 0.9140625, + "learning_rate": 7.407659374123879e-05, + "loss": 1.0933, + "step": 730 + }, + { + "epoch": 0.11524350044429157, + "grad_norm": 0.98046875, + "learning_rate": 7.407214740906494e-05, + "loss": 1.1906, + "step": 731 + }, + { + "epoch": 0.11540115229168459, + "grad_norm": 0.92578125, + "learning_rate": 7.406770113184364e-05, + "loss": 0.9344, + "step": 732 + }, + { + "epoch": 0.1155588041390776, + "grad_norm": 0.8828125, + "learning_rate": 7.40632549095844e-05, + "loss": 1.1063, + "step": 733 + }, + { + "epoch": 0.11571645598647061, + "grad_norm": 0.96875, + "learning_rate": 7.405880874229663e-05, + "loss": 1.3994, + "step": 734 + }, + { + "epoch": 0.11587410783386362, + "grad_norm": 0.9453125, + "learning_rate": 7.405436262998972e-05, + "loss": 1.1545, + "step": 735 + }, + { + "epoch": 0.11603175968125663, + "grad_norm": 1.125, + "learning_rate": 7.404991657267312e-05, + "loss": 1.2821, + "step": 736 + }, + { + "epoch": 0.11618941152864964, + "grad_norm": 0.95703125, + "learning_rate": 7.404547057035617e-05, + "loss": 1.1387, + "step": 737 + }, + { + "epoch": 0.11634706337604266, + "grad_norm": 0.9140625, + "learning_rate": 7.404102462304842e-05, + "loss": 1.0894, + "step": 738 + }, + { + "epoch": 0.11650471522343567, + "grad_norm": 0.9375, + "learning_rate": 7.403657873075925e-05, + "loss": 1.0531, + "step": 739 + }, + { + "epoch": 0.11666236707082868, + "grad_norm": 0.9375, + "learning_rate": 7.403213289349804e-05, + "loss": 1.2914, + "step": 740 + }, + { + "epoch": 0.11682001891822169, + "grad_norm": 1.0390625, + "learning_rate": 7.402768711127425e-05, + "loss": 1.0987, + "step": 741 + }, + { + "epoch": 0.1169776707656147, + "grad_norm": 0.921875, + "learning_rate": 7.402324138409727e-05, + "loss": 1.2507, + "step": 742 + }, + { + "epoch": 0.11713532261300771, + "grad_norm": 0.99609375, + "learning_rate": 7.401879571197656e-05, + "loss": 1.2409, + "step": 743 + }, + { + "epoch": 0.11729297446040073, + "grad_norm": 0.94140625, + "learning_rate": 7.401435009492153e-05, + "loss": 1.0694, + "step": 744 + }, + { + "epoch": 0.11745062630779374, + "grad_norm": 0.921875, + "learning_rate": 7.40099045329416e-05, + "loss": 1.0904, + "step": 745 + }, + { + "epoch": 0.11760827815518675, + "grad_norm": 1.40625, + "learning_rate": 7.400545902604621e-05, + "loss": 1.3173, + "step": 746 + }, + { + "epoch": 0.11776593000257976, + "grad_norm": 0.8828125, + "learning_rate": 7.400101357424471e-05, + "loss": 1.0317, + "step": 747 + }, + { + "epoch": 0.11792358184997277, + "grad_norm": 0.95703125, + "learning_rate": 7.399656817754664e-05, + "loss": 1.2205, + "step": 748 + }, + { + "epoch": 0.11808123369736578, + "grad_norm": 0.90234375, + "learning_rate": 7.399212283596132e-05, + "loss": 1.5768, + "step": 749 + }, + { + "epoch": 0.1182388855447588, + "grad_norm": 1.0, + "learning_rate": 7.398767754949822e-05, + "loss": 1.1895, + "step": 750 + }, + { + "epoch": 0.11839653739215181, + "grad_norm": 0.94921875, + "learning_rate": 7.398323231816677e-05, + "loss": 1.3032, + "step": 751 + }, + { + "epoch": 0.11855418923954482, + "grad_norm": 0.90234375, + "learning_rate": 7.39787871419763e-05, + "loss": 1.2666, + "step": 752 + }, + { + "epoch": 0.11871184108693783, + "grad_norm": 0.953125, + "learning_rate": 7.397434202093636e-05, + "loss": 1.1103, + "step": 753 + }, + { + "epoch": 0.11886949293433084, + "grad_norm": 0.875, + "learning_rate": 7.396989695505631e-05, + "loss": 1.1344, + "step": 754 + }, + { + "epoch": 0.11902714478172385, + "grad_norm": 0.859375, + "learning_rate": 7.396545194434558e-05, + "loss": 1.2312, + "step": 755 + }, + { + "epoch": 0.11918479662911687, + "grad_norm": 1.015625, + "learning_rate": 7.39610069888136e-05, + "loss": 1.3046, + "step": 756 + }, + { + "epoch": 0.11934244847650988, + "grad_norm": 0.90625, + "learning_rate": 7.395656208846972e-05, + "loss": 1.077, + "step": 757 + }, + { + "epoch": 0.11950010032390289, + "grad_norm": 1.0078125, + "learning_rate": 7.395211724332345e-05, + "loss": 1.3239, + "step": 758 + }, + { + "epoch": 0.1196577521712959, + "grad_norm": 0.9296875, + "learning_rate": 7.39476724533842e-05, + "loss": 1.1966, + "step": 759 + }, + { + "epoch": 0.11981540401868891, + "grad_norm": 0.8984375, + "learning_rate": 7.394322771866136e-05, + "loss": 1.1077, + "step": 760 + }, + { + "epoch": 0.11997305586608192, + "grad_norm": 0.9375, + "learning_rate": 7.393878303916435e-05, + "loss": 1.2681, + "step": 761 + }, + { + "epoch": 0.12013070771347494, + "grad_norm": 0.99609375, + "learning_rate": 7.393433841490257e-05, + "loss": 1.3597, + "step": 762 + }, + { + "epoch": 0.12028835956086795, + "grad_norm": 0.98828125, + "learning_rate": 7.392989384588551e-05, + "loss": 1.0681, + "step": 763 + }, + { + "epoch": 0.12044601140826096, + "grad_norm": 1.125, + "learning_rate": 7.392544933212256e-05, + "loss": 1.2165, + "step": 764 + }, + { + "epoch": 0.12060366325565397, + "grad_norm": 0.9375, + "learning_rate": 7.39210048736231e-05, + "loss": 1.1721, + "step": 765 + }, + { + "epoch": 0.12076131510304698, + "grad_norm": 0.9453125, + "learning_rate": 7.391656047039662e-05, + "loss": 1.3108, + "step": 766 + }, + { + "epoch": 0.12091896695044, + "grad_norm": 0.93359375, + "learning_rate": 7.391211612245244e-05, + "loss": 1.0176, + "step": 767 + }, + { + "epoch": 0.121076618797833, + "grad_norm": 0.9453125, + "learning_rate": 7.390767182980008e-05, + "loss": 1.2099, + "step": 768 + }, + { + "epoch": 0.12123427064522602, + "grad_norm": 0.96875, + "learning_rate": 7.390322759244892e-05, + "loss": 1.0398, + "step": 769 + }, + { + "epoch": 0.12139192249261903, + "grad_norm": 1.046875, + "learning_rate": 7.389878341040837e-05, + "loss": 1.5509, + "step": 770 + }, + { + "epoch": 0.12154957434001204, + "grad_norm": 0.97265625, + "learning_rate": 7.389433928368785e-05, + "loss": 1.2606, + "step": 771 + }, + { + "epoch": 0.12170722618740505, + "grad_norm": 0.95703125, + "learning_rate": 7.388989521229674e-05, + "loss": 1.1484, + "step": 772 + }, + { + "epoch": 0.12186487803479806, + "grad_norm": 1.0078125, + "learning_rate": 7.388545119624457e-05, + "loss": 1.3385, + "step": 773 + }, + { + "epoch": 0.12202252988219107, + "grad_norm": 1.0625, + "learning_rate": 7.388100723554068e-05, + "loss": 1.0916, + "step": 774 + }, + { + "epoch": 0.12218018172958409, + "grad_norm": 0.9453125, + "learning_rate": 7.38765633301945e-05, + "loss": 1.1116, + "step": 775 + }, + { + "epoch": 0.1223378335769771, + "grad_norm": 1.0234375, + "learning_rate": 7.387211948021546e-05, + "loss": 1.1095, + "step": 776 + }, + { + "epoch": 0.12249548542437011, + "grad_norm": 1.109375, + "learning_rate": 7.386767568561292e-05, + "loss": 1.4139, + "step": 777 + }, + { + "epoch": 0.12265313727176312, + "grad_norm": 1.1015625, + "learning_rate": 7.38632319463964e-05, + "loss": 1.3321, + "step": 778 + }, + { + "epoch": 0.12281078911915613, + "grad_norm": 0.984375, + "learning_rate": 7.385878826257526e-05, + "loss": 1.2532, + "step": 779 + }, + { + "epoch": 0.12296844096654914, + "grad_norm": 1.0, + "learning_rate": 7.385434463415893e-05, + "loss": 1.0896, + "step": 780 + }, + { + "epoch": 0.12312609281394216, + "grad_norm": 0.9765625, + "learning_rate": 7.384990106115682e-05, + "loss": 1.1889, + "step": 781 + }, + { + "epoch": 0.12328374466133517, + "grad_norm": 1.03125, + "learning_rate": 7.384545754357828e-05, + "loss": 1.2576, + "step": 782 + }, + { + "epoch": 0.12344139650872818, + "grad_norm": 0.89453125, + "learning_rate": 7.384101408143286e-05, + "loss": 1.027, + "step": 783 + }, + { + "epoch": 0.12359904835612119, + "grad_norm": 0.921875, + "learning_rate": 7.383657067472993e-05, + "loss": 1.2874, + "step": 784 + }, + { + "epoch": 0.1237567002035142, + "grad_norm": 0.86328125, + "learning_rate": 7.383212732347889e-05, + "loss": 0.9958, + "step": 785 + }, + { + "epoch": 0.12391435205090721, + "grad_norm": 0.94921875, + "learning_rate": 7.382768402768914e-05, + "loss": 1.23, + "step": 786 + }, + { + "epoch": 0.12407200389830023, + "grad_norm": 0.94140625, + "learning_rate": 7.38232407873701e-05, + "loss": 1.2853, + "step": 787 + }, + { + "epoch": 0.12422965574569324, + "grad_norm": 1.03125, + "learning_rate": 7.381879760253124e-05, + "loss": 1.2576, + "step": 788 + }, + { + "epoch": 0.12438730759308625, + "grad_norm": 0.97265625, + "learning_rate": 7.381435447318195e-05, + "loss": 1.0389, + "step": 789 + }, + { + "epoch": 0.12454495944047926, + "grad_norm": 1.0234375, + "learning_rate": 7.380991139933163e-05, + "loss": 1.0695, + "step": 790 + }, + { + "epoch": 0.12470261128787227, + "grad_norm": 1.0234375, + "learning_rate": 7.380546838098966e-05, + "loss": 1.1153, + "step": 791 + }, + { + "epoch": 0.12486026313526528, + "grad_norm": 1.0, + "learning_rate": 7.380102541816554e-05, + "loss": 1.258, + "step": 792 + }, + { + "epoch": 0.1250179149826583, + "grad_norm": 0.96875, + "learning_rate": 7.379658251086864e-05, + "loss": 1.1759, + "step": 793 + }, + { + "epoch": 0.12517556683005132, + "grad_norm": 1.0390625, + "learning_rate": 7.37921396591084e-05, + "loss": 1.2502, + "step": 794 + }, + { + "epoch": 0.12533321867744432, + "grad_norm": 0.90234375, + "learning_rate": 7.378769686289422e-05, + "loss": 1.1113, + "step": 795 + }, + { + "epoch": 0.12549087052483734, + "grad_norm": 0.9140625, + "learning_rate": 7.378325412223552e-05, + "loss": 1.1515, + "step": 796 + }, + { + "epoch": 0.12564852237223034, + "grad_norm": 0.98828125, + "learning_rate": 7.377881143714172e-05, + "loss": 1.1773, + "step": 797 + }, + { + "epoch": 0.12580617421962337, + "grad_norm": 0.9140625, + "learning_rate": 7.377436880762218e-05, + "loss": 1.32, + "step": 798 + }, + { + "epoch": 0.12596382606701637, + "grad_norm": 1.03125, + "learning_rate": 7.37699262336864e-05, + "loss": 1.3543, + "step": 799 + }, + { + "epoch": 0.1261214779144094, + "grad_norm": 1.015625, + "learning_rate": 7.376548371534378e-05, + "loss": 1.2574, + "step": 800 + }, + { + "epoch": 0.1262791297618024, + "grad_norm": 0.9296875, + "learning_rate": 7.37610412526037e-05, + "loss": 1.3045, + "step": 801 + }, + { + "epoch": 0.12643678160919541, + "grad_norm": 1.0078125, + "learning_rate": 7.375659884547561e-05, + "loss": 1.2639, + "step": 802 + }, + { + "epoch": 0.1265944334565884, + "grad_norm": 0.9296875, + "learning_rate": 7.375215649396885e-05, + "loss": 0.9907, + "step": 803 + }, + { + "epoch": 0.12675208530398144, + "grad_norm": 0.94921875, + "learning_rate": 7.374771419809296e-05, + "loss": 1.1223, + "step": 804 + }, + { + "epoch": 0.12690973715137444, + "grad_norm": 0.953125, + "learning_rate": 7.374327195785726e-05, + "loss": 1.1565, + "step": 805 + }, + { + "epoch": 0.12706738899876746, + "grad_norm": 0.9453125, + "learning_rate": 7.373882977327122e-05, + "loss": 1.1606, + "step": 806 + }, + { + "epoch": 0.12722504084616046, + "grad_norm": 0.921875, + "learning_rate": 7.373438764434422e-05, + "loss": 1.1745, + "step": 807 + }, + { + "epoch": 0.12738269269355348, + "grad_norm": 0.9609375, + "learning_rate": 7.372994557108566e-05, + "loss": 1.3641, + "step": 808 + }, + { + "epoch": 0.12754034454094648, + "grad_norm": 0.90625, + "learning_rate": 7.372550355350501e-05, + "loss": 1.1421, + "step": 809 + }, + { + "epoch": 0.1276979963883395, + "grad_norm": 1.0, + "learning_rate": 7.37210615916116e-05, + "loss": 1.1201, + "step": 810 + }, + { + "epoch": 0.1278556482357325, + "grad_norm": 0.98828125, + "learning_rate": 7.371661968541496e-05, + "loss": 1.1053, + "step": 811 + }, + { + "epoch": 0.12801330008312553, + "grad_norm": 0.98828125, + "learning_rate": 7.371217783492443e-05, + "loss": 1.2309, + "step": 812 + }, + { + "epoch": 0.12817095193051853, + "grad_norm": 0.953125, + "learning_rate": 7.37077360401494e-05, + "loss": 1.1134, + "step": 813 + }, + { + "epoch": 0.12832860377791155, + "grad_norm": 0.8828125, + "learning_rate": 7.370329430109936e-05, + "loss": 1.1089, + "step": 814 + }, + { + "epoch": 0.12848625562530455, + "grad_norm": 1.1328125, + "learning_rate": 7.369885261778368e-05, + "loss": 1.2232, + "step": 815 + }, + { + "epoch": 0.12864390747269758, + "grad_norm": 0.859375, + "learning_rate": 7.369441099021177e-05, + "loss": 0.9038, + "step": 816 + }, + { + "epoch": 0.12880155932009058, + "grad_norm": 0.9453125, + "learning_rate": 7.368996941839306e-05, + "loss": 1.2109, + "step": 817 + }, + { + "epoch": 0.1289592111674836, + "grad_norm": 0.890625, + "learning_rate": 7.368552790233691e-05, + "loss": 1.0869, + "step": 818 + }, + { + "epoch": 0.1291168630148766, + "grad_norm": 0.8984375, + "learning_rate": 7.368108644205283e-05, + "loss": 1.0288, + "step": 819 + }, + { + "epoch": 0.12927451486226962, + "grad_norm": 0.796875, + "learning_rate": 7.367664503755018e-05, + "loss": 0.9274, + "step": 820 + }, + { + "epoch": 0.12943216670966262, + "grad_norm": 0.9296875, + "learning_rate": 7.367220368883836e-05, + "loss": 1.0057, + "step": 821 + }, + { + "epoch": 0.12958981855705565, + "grad_norm": 0.921875, + "learning_rate": 7.366776239592681e-05, + "loss": 1.2045, + "step": 822 + }, + { + "epoch": 0.12974747040444864, + "grad_norm": 0.93359375, + "learning_rate": 7.36633211588249e-05, + "loss": 0.8991, + "step": 823 + }, + { + "epoch": 0.12990512225184167, + "grad_norm": 1.078125, + "learning_rate": 7.36588799775421e-05, + "loss": 1.0336, + "step": 824 + }, + { + "epoch": 0.13006277409923467, + "grad_norm": 0.984375, + "learning_rate": 7.365443885208783e-05, + "loss": 1.1097, + "step": 825 + }, + { + "epoch": 0.1302204259466277, + "grad_norm": 0.93359375, + "learning_rate": 7.364999778247143e-05, + "loss": 0.9801, + "step": 826 + }, + { + "epoch": 0.1303780777940207, + "grad_norm": 0.99609375, + "learning_rate": 7.364555676870238e-05, + "loss": 1.2915, + "step": 827 + }, + { + "epoch": 0.13053572964141372, + "grad_norm": 0.95703125, + "learning_rate": 7.364111581079002e-05, + "loss": 1.1857, + "step": 828 + }, + { + "epoch": 0.13069338148880671, + "grad_norm": 0.921875, + "learning_rate": 7.363667490874384e-05, + "loss": 1.0863, + "step": 829 + }, + { + "epoch": 0.13085103333619974, + "grad_norm": 1.015625, + "learning_rate": 7.363223406257324e-05, + "loss": 1.261, + "step": 830 + }, + { + "epoch": 0.13100868518359274, + "grad_norm": 1.0234375, + "learning_rate": 7.36277932722876e-05, + "loss": 1.2139, + "step": 831 + }, + { + "epoch": 0.13116633703098576, + "grad_norm": 0.9609375, + "learning_rate": 7.362335253789635e-05, + "loss": 1.1666, + "step": 832 + }, + { + "epoch": 0.13132398887837876, + "grad_norm": 0.984375, + "learning_rate": 7.361891185940886e-05, + "loss": 1.4789, + "step": 833 + }, + { + "epoch": 0.1314816407257718, + "grad_norm": 1.0625, + "learning_rate": 7.361447123683463e-05, + "loss": 1.2802, + "step": 834 + }, + { + "epoch": 0.13163929257316478, + "grad_norm": 0.97265625, + "learning_rate": 7.3610030670183e-05, + "loss": 1.2085, + "step": 835 + }, + { + "epoch": 0.1317969444205578, + "grad_norm": 0.9375, + "learning_rate": 7.36055901594634e-05, + "loss": 1.1116, + "step": 836 + }, + { + "epoch": 0.1319545962679508, + "grad_norm": 1.046875, + "learning_rate": 7.360114970468523e-05, + "loss": 1.1272, + "step": 837 + }, + { + "epoch": 0.13211224811534383, + "grad_norm": 0.93359375, + "learning_rate": 7.35967093058579e-05, + "loss": 1.2472, + "step": 838 + }, + { + "epoch": 0.13226989996273683, + "grad_norm": 0.8671875, + "learning_rate": 7.359226896299086e-05, + "loss": 1.2304, + "step": 839 + }, + { + "epoch": 0.13242755181012986, + "grad_norm": 0.94921875, + "learning_rate": 7.358782867609351e-05, + "loss": 1.0428, + "step": 840 + }, + { + "epoch": 0.13258520365752285, + "grad_norm": 0.953125, + "learning_rate": 7.358338844517523e-05, + "loss": 1.1529, + "step": 841 + }, + { + "epoch": 0.13274285550491588, + "grad_norm": 0.97265625, + "learning_rate": 7.357894827024547e-05, + "loss": 1.1142, + "step": 842 + }, + { + "epoch": 0.13290050735230888, + "grad_norm": 0.9296875, + "learning_rate": 7.357450815131356e-05, + "loss": 1.1024, + "step": 843 + }, + { + "epoch": 0.1330581591997019, + "grad_norm": 0.93359375, + "learning_rate": 7.357006808838903e-05, + "loss": 0.9091, + "step": 844 + }, + { + "epoch": 0.1332158110470949, + "grad_norm": 1.0703125, + "learning_rate": 7.35656280814812e-05, + "loss": 1.1265, + "step": 845 + }, + { + "epoch": 0.13337346289448793, + "grad_norm": 0.9609375, + "learning_rate": 7.356118813059953e-05, + "loss": 1.1058, + "step": 846 + }, + { + "epoch": 0.13353111474188092, + "grad_norm": 1.0859375, + "learning_rate": 7.35567482357534e-05, + "loss": 1.2292, + "step": 847 + }, + { + "epoch": 0.13368876658927395, + "grad_norm": 0.94140625, + "learning_rate": 7.35523083969522e-05, + "loss": 1.1149, + "step": 848 + }, + { + "epoch": 0.13384641843666695, + "grad_norm": 0.9609375, + "learning_rate": 7.354786861420542e-05, + "loss": 1.2834, + "step": 849 + }, + { + "epoch": 0.13400407028405997, + "grad_norm": 0.92578125, + "learning_rate": 7.35434288875224e-05, + "loss": 1.0439, + "step": 850 + }, + { + "epoch": 0.13416172213145297, + "grad_norm": 0.859375, + "learning_rate": 7.353898921691258e-05, + "loss": 1.0132, + "step": 851 + }, + { + "epoch": 0.134319373978846, + "grad_norm": 0.95703125, + "learning_rate": 7.353454960238536e-05, + "loss": 1.2369, + "step": 852 + }, + { + "epoch": 0.134477025826239, + "grad_norm": 0.87109375, + "learning_rate": 7.353011004395013e-05, + "loss": 1.0607, + "step": 853 + }, + { + "epoch": 0.13463467767363202, + "grad_norm": 0.96484375, + "learning_rate": 7.352567054161634e-05, + "loss": 1.3504, + "step": 854 + }, + { + "epoch": 0.13479232952102502, + "grad_norm": 0.94140625, + "learning_rate": 7.352123109539337e-05, + "loss": 1.2534, + "step": 855 + }, + { + "epoch": 0.13494998136841804, + "grad_norm": 0.984375, + "learning_rate": 7.351679170529065e-05, + "loss": 0.9253, + "step": 856 + }, + { + "epoch": 0.13510763321581104, + "grad_norm": 1.0078125, + "learning_rate": 7.351235237131757e-05, + "loss": 1.2869, + "step": 857 + }, + { + "epoch": 0.13526528506320407, + "grad_norm": 1.0234375, + "learning_rate": 7.35079130934835e-05, + "loss": 1.0661, + "step": 858 + }, + { + "epoch": 0.13542293691059706, + "grad_norm": 1.0234375, + "learning_rate": 7.350347387179794e-05, + "loss": 1.1316, + "step": 859 + }, + { + "epoch": 0.1355805887579901, + "grad_norm": 1.046875, + "learning_rate": 7.349903470627026e-05, + "loss": 1.3311, + "step": 860 + }, + { + "epoch": 0.1357382406053831, + "grad_norm": 1.0546875, + "learning_rate": 7.349459559690984e-05, + "loss": 1.224, + "step": 861 + }, + { + "epoch": 0.1358958924527761, + "grad_norm": 1.0234375, + "learning_rate": 7.349015654372613e-05, + "loss": 1.1074, + "step": 862 + }, + { + "epoch": 0.1360535443001691, + "grad_norm": 0.89453125, + "learning_rate": 7.348571754672846e-05, + "loss": 1.2115, + "step": 863 + }, + { + "epoch": 0.13621119614756214, + "grad_norm": 0.76171875, + "learning_rate": 7.348127860592636e-05, + "loss": 0.926, + "step": 864 + }, + { + "epoch": 0.13636884799495513, + "grad_norm": 0.91796875, + "learning_rate": 7.347683972132915e-05, + "loss": 1.2034, + "step": 865 + }, + { + "epoch": 0.13652649984234816, + "grad_norm": 1.0234375, + "learning_rate": 7.347240089294627e-05, + "loss": 1.2762, + "step": 866 + }, + { + "epoch": 0.13668415168974116, + "grad_norm": 1.03125, + "learning_rate": 7.346796212078713e-05, + "loss": 1.0057, + "step": 867 + }, + { + "epoch": 0.13684180353713418, + "grad_norm": 0.953125, + "learning_rate": 7.346352340486108e-05, + "loss": 0.9425, + "step": 868 + }, + { + "epoch": 0.13699945538452718, + "grad_norm": 1.0625, + "learning_rate": 7.345908474517762e-05, + "loss": 1.2193, + "step": 869 + }, + { + "epoch": 0.1371571072319202, + "grad_norm": 0.9453125, + "learning_rate": 7.345464614174611e-05, + "loss": 1.0303, + "step": 870 + }, + { + "epoch": 0.1373147590793132, + "grad_norm": 0.9765625, + "learning_rate": 7.345020759457595e-05, + "loss": 1.1257, + "step": 871 + }, + { + "epoch": 0.13747241092670623, + "grad_norm": 0.9765625, + "learning_rate": 7.344576910367658e-05, + "loss": 1.1779, + "step": 872 + }, + { + "epoch": 0.13763006277409923, + "grad_norm": 0.9375, + "learning_rate": 7.344133066905734e-05, + "loss": 1.1765, + "step": 873 + }, + { + "epoch": 0.13778771462149225, + "grad_norm": 0.8828125, + "learning_rate": 7.343689229072771e-05, + "loss": 1.1481, + "step": 874 + }, + { + "epoch": 0.13794536646888525, + "grad_norm": 0.90625, + "learning_rate": 7.343245396869706e-05, + "loss": 1.1828, + "step": 875 + }, + { + "epoch": 0.13810301831627828, + "grad_norm": 0.98828125, + "learning_rate": 7.342801570297483e-05, + "loss": 1.3029, + "step": 876 + }, + { + "epoch": 0.13826067016367127, + "grad_norm": 1.0234375, + "learning_rate": 7.342357749357036e-05, + "loss": 1.2341, + "step": 877 + }, + { + "epoch": 0.1384183220110643, + "grad_norm": 0.89453125, + "learning_rate": 7.341913934049312e-05, + "loss": 1.1462, + "step": 878 + }, + { + "epoch": 0.1385759738584573, + "grad_norm": 1.015625, + "learning_rate": 7.341470124375249e-05, + "loss": 1.0431, + "step": 879 + }, + { + "epoch": 0.13873362570585032, + "grad_norm": 1.0625, + "learning_rate": 7.34102632033579e-05, + "loss": 1.2274, + "step": 880 + }, + { + "epoch": 0.13889127755324332, + "grad_norm": 0.96875, + "learning_rate": 7.340582521931873e-05, + "loss": 1.2855, + "step": 881 + }, + { + "epoch": 0.13904892940063635, + "grad_norm": 1.0234375, + "learning_rate": 7.340138729164439e-05, + "loss": 1.2785, + "step": 882 + }, + { + "epoch": 0.13920658124802934, + "grad_norm": 0.921875, + "learning_rate": 7.339694942034425e-05, + "loss": 1.0836, + "step": 883 + }, + { + "epoch": 0.13936423309542237, + "grad_norm": 0.99609375, + "learning_rate": 7.33925116054278e-05, + "loss": 1.2375, + "step": 884 + }, + { + "epoch": 0.13952188494281537, + "grad_norm": 1.0, + "learning_rate": 7.33880738469044e-05, + "loss": 1.3075, + "step": 885 + }, + { + "epoch": 0.1396795367902084, + "grad_norm": 0.91796875, + "learning_rate": 7.338363614478347e-05, + "loss": 1.3019, + "step": 886 + }, + { + "epoch": 0.1398371886376014, + "grad_norm": 0.9375, + "learning_rate": 7.337919849907437e-05, + "loss": 1.0888, + "step": 887 + }, + { + "epoch": 0.13999484048499442, + "grad_norm": 0.8671875, + "learning_rate": 7.337476090978652e-05, + "loss": 1.1154, + "step": 888 + }, + { + "epoch": 0.1401524923323874, + "grad_norm": 1.0, + "learning_rate": 7.337032337692937e-05, + "loss": 1.1271, + "step": 889 + }, + { + "epoch": 0.14031014417978044, + "grad_norm": 0.9453125, + "learning_rate": 7.336588590051232e-05, + "loss": 1.3659, + "step": 890 + }, + { + "epoch": 0.14046779602717344, + "grad_norm": 1.0078125, + "learning_rate": 7.336144848054474e-05, + "loss": 1.2111, + "step": 891 + }, + { + "epoch": 0.14062544787456646, + "grad_norm": 1.03125, + "learning_rate": 7.335701111703604e-05, + "loss": 1.2921, + "step": 892 + }, + { + "epoch": 0.14078309972195946, + "grad_norm": 1.0234375, + "learning_rate": 7.335257380999561e-05, + "loss": 1.4295, + "step": 893 + }, + { + "epoch": 0.14094075156935248, + "grad_norm": 0.98828125, + "learning_rate": 7.334813655943291e-05, + "loss": 1.2903, + "step": 894 + }, + { + "epoch": 0.14109840341674548, + "grad_norm": 0.9609375, + "learning_rate": 7.334369936535727e-05, + "loss": 1.099, + "step": 895 + }, + { + "epoch": 0.1412560552641385, + "grad_norm": 0.98828125, + "learning_rate": 7.333926222777818e-05, + "loss": 1.3001, + "step": 896 + }, + { + "epoch": 0.1414137071115315, + "grad_norm": 0.90234375, + "learning_rate": 7.333482514670499e-05, + "loss": 1.0216, + "step": 897 + }, + { + "epoch": 0.14157135895892453, + "grad_norm": 0.95703125, + "learning_rate": 7.33303881221471e-05, + "loss": 1.1811, + "step": 898 + }, + { + "epoch": 0.14172901080631753, + "grad_norm": 1.0, + "learning_rate": 7.332595115411393e-05, + "loss": 1.1414, + "step": 899 + }, + { + "epoch": 0.14188666265371055, + "grad_norm": 0.91015625, + "learning_rate": 7.33215142426149e-05, + "loss": 1.1882, + "step": 900 + }, + { + "epoch": 0.14204431450110355, + "grad_norm": 0.9453125, + "learning_rate": 7.331707738765938e-05, + "loss": 1.1021, + "step": 901 + }, + { + "epoch": 0.14220196634849658, + "grad_norm": 0.85546875, + "learning_rate": 7.33126405892568e-05, + "loss": 1.0459, + "step": 902 + }, + { + "epoch": 0.14235961819588958, + "grad_norm": 0.9375, + "learning_rate": 7.330820384741652e-05, + "loss": 1.2087, + "step": 903 + }, + { + "epoch": 0.1425172700432826, + "grad_norm": 0.91796875, + "learning_rate": 7.3303767162148e-05, + "loss": 1.037, + "step": 904 + }, + { + "epoch": 0.1426749218906756, + "grad_norm": 0.96875, + "learning_rate": 7.329933053346063e-05, + "loss": 1.1291, + "step": 905 + }, + { + "epoch": 0.14283257373806862, + "grad_norm": 0.9609375, + "learning_rate": 7.32948939613638e-05, + "loss": 1.446, + "step": 906 + }, + { + "epoch": 0.14299022558546162, + "grad_norm": 1.0, + "learning_rate": 7.329045744586692e-05, + "loss": 1.2247, + "step": 907 + }, + { + "epoch": 0.14314787743285465, + "grad_norm": 1.1015625, + "learning_rate": 7.328602098697934e-05, + "loss": 1.2402, + "step": 908 + }, + { + "epoch": 0.14330552928024765, + "grad_norm": 1.1640625, + "learning_rate": 7.328158458471057e-05, + "loss": 1.1348, + "step": 909 + }, + { + "epoch": 0.14346318112764067, + "grad_norm": 0.984375, + "learning_rate": 7.327714823906994e-05, + "loss": 1.3082, + "step": 910 + }, + { + "epoch": 0.14362083297503367, + "grad_norm": 0.94921875, + "learning_rate": 7.327271195006687e-05, + "loss": 1.0162, + "step": 911 + }, + { + "epoch": 0.1437784848224267, + "grad_norm": 1.1015625, + "learning_rate": 7.326827571771075e-05, + "loss": 1.0617, + "step": 912 + }, + { + "epoch": 0.1439361366698197, + "grad_norm": 1.0546875, + "learning_rate": 7.326383954201098e-05, + "loss": 1.2287, + "step": 913 + }, + { + "epoch": 0.14409378851721272, + "grad_norm": 0.97265625, + "learning_rate": 7.325940342297697e-05, + "loss": 0.976, + "step": 914 + }, + { + "epoch": 0.14425144036460572, + "grad_norm": 0.875, + "learning_rate": 7.325496736061815e-05, + "loss": 0.9393, + "step": 915 + }, + { + "epoch": 0.14440909221199874, + "grad_norm": 1.0234375, + "learning_rate": 7.32505313549439e-05, + "loss": 1.1421, + "step": 916 + }, + { + "epoch": 0.14456674405939174, + "grad_norm": 1.0, + "learning_rate": 7.324609540596362e-05, + "loss": 1.1644, + "step": 917 + }, + { + "epoch": 0.14472439590678476, + "grad_norm": 1.015625, + "learning_rate": 7.324165951368669e-05, + "loss": 1.1705, + "step": 918 + }, + { + "epoch": 0.14488204775417776, + "grad_norm": 0.921875, + "learning_rate": 7.323722367812255e-05, + "loss": 1.1147, + "step": 919 + }, + { + "epoch": 0.1450396996015708, + "grad_norm": 1.0, + "learning_rate": 7.323278789928059e-05, + "loss": 1.2157, + "step": 920 + }, + { + "epoch": 0.14519735144896379, + "grad_norm": 0.89453125, + "learning_rate": 7.32283521771702e-05, + "loss": 1.0341, + "step": 921 + }, + { + "epoch": 0.1453550032963568, + "grad_norm": 0.95703125, + "learning_rate": 7.322391651180078e-05, + "loss": 1.148, + "step": 922 + }, + { + "epoch": 0.1455126551437498, + "grad_norm": 1.0390625, + "learning_rate": 7.321948090318172e-05, + "loss": 1.003, + "step": 923 + }, + { + "epoch": 0.14567030699114283, + "grad_norm": 1.015625, + "learning_rate": 7.321504535132247e-05, + "loss": 1.148, + "step": 924 + }, + { + "epoch": 0.14582795883853583, + "grad_norm": 0.92578125, + "learning_rate": 7.32106098562324e-05, + "loss": 1.0988, + "step": 925 + }, + { + "epoch": 0.14598561068592886, + "grad_norm": 0.87890625, + "learning_rate": 7.32061744179209e-05, + "loss": 0.9523, + "step": 926 + }, + { + "epoch": 0.14614326253332185, + "grad_norm": 0.90625, + "learning_rate": 7.320173903639739e-05, + "loss": 1.4244, + "step": 927 + }, + { + "epoch": 0.14630091438071488, + "grad_norm": 1.0, + "learning_rate": 7.319730371167122e-05, + "loss": 1.2422, + "step": 928 + }, + { + "epoch": 0.14645856622810788, + "grad_norm": 0.86328125, + "learning_rate": 7.319286844375189e-05, + "loss": 1.1872, + "step": 929 + }, + { + "epoch": 0.1466162180755009, + "grad_norm": 0.953125, + "learning_rate": 7.318843323264872e-05, + "loss": 1.2095, + "step": 930 + }, + { + "epoch": 0.1467738699228939, + "grad_norm": 1.09375, + "learning_rate": 7.318399807837115e-05, + "loss": 1.2441, + "step": 931 + }, + { + "epoch": 0.14693152177028693, + "grad_norm": 1.046875, + "learning_rate": 7.317956298092856e-05, + "loss": 1.4817, + "step": 932 + }, + { + "epoch": 0.14708917361767992, + "grad_norm": 0.97265625, + "learning_rate": 7.31751279403303e-05, + "loss": 1.0741, + "step": 933 + }, + { + "epoch": 0.14724682546507295, + "grad_norm": 1.125, + "learning_rate": 7.317069295658588e-05, + "loss": 1.2691, + "step": 934 + }, + { + "epoch": 0.14740447731246595, + "grad_norm": 0.94921875, + "learning_rate": 7.316625802970463e-05, + "loss": 1.2311, + "step": 935 + }, + { + "epoch": 0.14756212915985897, + "grad_norm": 0.95703125, + "learning_rate": 7.316182315969598e-05, + "loss": 1.0153, + "step": 936 + }, + { + "epoch": 0.14771978100725197, + "grad_norm": 0.984375, + "learning_rate": 7.315738834656929e-05, + "loss": 1.0934, + "step": 937 + }, + { + "epoch": 0.147877432854645, + "grad_norm": 0.90625, + "learning_rate": 7.315295359033398e-05, + "loss": 1.0864, + "step": 938 + }, + { + "epoch": 0.14803508470203802, + "grad_norm": 0.96875, + "learning_rate": 7.314851889099945e-05, + "loss": 1.1927, + "step": 939 + }, + { + "epoch": 0.14819273654943102, + "grad_norm": 0.96484375, + "learning_rate": 7.31440842485751e-05, + "loss": 1.1883, + "step": 940 + }, + { + "epoch": 0.14835038839682405, + "grad_norm": 0.97265625, + "learning_rate": 7.313964966307035e-05, + "loss": 1.2068, + "step": 941 + }, + { + "epoch": 0.14850804024421704, + "grad_norm": 0.99609375, + "learning_rate": 7.313521513449457e-05, + "loss": 1.1585, + "step": 942 + }, + { + "epoch": 0.14866569209161007, + "grad_norm": 1.078125, + "learning_rate": 7.313078066285715e-05, + "loss": 1.3315, + "step": 943 + }, + { + "epoch": 0.14882334393900307, + "grad_norm": 0.96875, + "learning_rate": 7.312634624816748e-05, + "loss": 1.1245, + "step": 944 + }, + { + "epoch": 0.1489809957863961, + "grad_norm": 0.89453125, + "learning_rate": 7.312191189043502e-05, + "loss": 1.1142, + "step": 945 + }, + { + "epoch": 0.1491386476337891, + "grad_norm": 1.03125, + "learning_rate": 7.311747758966913e-05, + "loss": 1.2184, + "step": 946 + }, + { + "epoch": 0.14929629948118212, + "grad_norm": 0.90234375, + "learning_rate": 7.311304334587923e-05, + "loss": 1.0056, + "step": 947 + }, + { + "epoch": 0.1494539513285751, + "grad_norm": 0.83984375, + "learning_rate": 7.310860915907467e-05, + "loss": 1.0754, + "step": 948 + }, + { + "epoch": 0.14961160317596814, + "grad_norm": 0.94140625, + "learning_rate": 7.310417502926484e-05, + "loss": 1.1499, + "step": 949 + }, + { + "epoch": 0.14976925502336114, + "grad_norm": 0.87109375, + "learning_rate": 7.309974095645922e-05, + "loss": 1.1762, + "step": 950 + }, + { + "epoch": 0.14992690687075416, + "grad_norm": 0.96875, + "learning_rate": 7.309530694066717e-05, + "loss": 1.1175, + "step": 951 + }, + { + "epoch": 0.15008455871814716, + "grad_norm": 0.875, + "learning_rate": 7.309087298189807e-05, + "loss": 1.0166, + "step": 952 + }, + { + "epoch": 0.15024221056554019, + "grad_norm": 0.91015625, + "learning_rate": 7.308643908016132e-05, + "loss": 0.9621, + "step": 953 + }, + { + "epoch": 0.15039986241293318, + "grad_norm": 0.95703125, + "learning_rate": 7.308200523546629e-05, + "loss": 1.2469, + "step": 954 + }, + { + "epoch": 0.1505575142603262, + "grad_norm": 0.94140625, + "learning_rate": 7.307757144782246e-05, + "loss": 1.0857, + "step": 955 + }, + { + "epoch": 0.1507151661077192, + "grad_norm": 0.91796875, + "learning_rate": 7.307313771723917e-05, + "loss": 1.0383, + "step": 956 + }, + { + "epoch": 0.15087281795511223, + "grad_norm": 1.078125, + "learning_rate": 7.306870404372581e-05, + "loss": 1.2949, + "step": 957 + }, + { + "epoch": 0.15103046980250523, + "grad_norm": 1.015625, + "learning_rate": 7.306427042729181e-05, + "loss": 1.3026, + "step": 958 + }, + { + "epoch": 0.15118812164989825, + "grad_norm": 0.9453125, + "learning_rate": 7.305983686794653e-05, + "loss": 1.0829, + "step": 959 + }, + { + "epoch": 0.15134577349729125, + "grad_norm": 0.93359375, + "learning_rate": 7.30554033656994e-05, + "loss": 1.3281, + "step": 960 + }, + { + "epoch": 0.15150342534468428, + "grad_norm": 0.96875, + "learning_rate": 7.30509699205598e-05, + "loss": 1.2363, + "step": 961 + }, + { + "epoch": 0.15166107719207728, + "grad_norm": 0.890625, + "learning_rate": 7.304653653253713e-05, + "loss": 0.9763, + "step": 962 + }, + { + "epoch": 0.1518187290394703, + "grad_norm": 0.94921875, + "learning_rate": 7.304210320164078e-05, + "loss": 1.1151, + "step": 963 + }, + { + "epoch": 0.1519763808868633, + "grad_norm": 0.90625, + "learning_rate": 7.303766992788015e-05, + "loss": 1.1778, + "step": 964 + }, + { + "epoch": 0.15213403273425632, + "grad_norm": 1.203125, + "learning_rate": 7.303323671126465e-05, + "loss": 1.4283, + "step": 965 + }, + { + "epoch": 0.15229168458164932, + "grad_norm": 0.921875, + "learning_rate": 7.302880355180366e-05, + "loss": 1.1114, + "step": 966 + }, + { + "epoch": 0.15244933642904235, + "grad_norm": 0.984375, + "learning_rate": 7.302437044950658e-05, + "loss": 1.2378, + "step": 967 + }, + { + "epoch": 0.15260698827643535, + "grad_norm": 0.90234375, + "learning_rate": 7.30199374043828e-05, + "loss": 1.0008, + "step": 968 + }, + { + "epoch": 0.15276464012382837, + "grad_norm": 0.93359375, + "learning_rate": 7.30155044164417e-05, + "loss": 1.2024, + "step": 969 + }, + { + "epoch": 0.15292229197122137, + "grad_norm": 1.03125, + "learning_rate": 7.301107148569271e-05, + "loss": 1.2336, + "step": 970 + }, + { + "epoch": 0.1530799438186144, + "grad_norm": 0.86328125, + "learning_rate": 7.300663861214523e-05, + "loss": 1.0731, + "step": 971 + }, + { + "epoch": 0.1532375956660074, + "grad_norm": 0.8984375, + "learning_rate": 7.300220579580863e-05, + "loss": 0.9677, + "step": 972 + }, + { + "epoch": 0.15339524751340042, + "grad_norm": 0.90234375, + "learning_rate": 7.299777303669231e-05, + "loss": 0.9836, + "step": 973 + }, + { + "epoch": 0.15355289936079342, + "grad_norm": 0.99609375, + "learning_rate": 7.299334033480562e-05, + "loss": 1.0679, + "step": 974 + }, + { + "epoch": 0.15371055120818644, + "grad_norm": 0.90234375, + "learning_rate": 7.298890769015808e-05, + "loss": 1.0393, + "step": 975 + }, + { + "epoch": 0.15386820305557944, + "grad_norm": 1.234375, + "learning_rate": 7.298447510275896e-05, + "loss": 1.0612, + "step": 976 + }, + { + "epoch": 0.15402585490297246, + "grad_norm": 0.98828125, + "learning_rate": 7.298004257261772e-05, + "loss": 1.0934, + "step": 977 + }, + { + "epoch": 0.15418350675036546, + "grad_norm": 0.8125, + "learning_rate": 7.297561009974374e-05, + "loss": 0.937, + "step": 978 + }, + { + "epoch": 0.1543411585977585, + "grad_norm": 0.94921875, + "learning_rate": 7.297117768414637e-05, + "loss": 1.1493, + "step": 979 + }, + { + "epoch": 0.15449881044515149, + "grad_norm": 0.8515625, + "learning_rate": 7.296674532583509e-05, + "loss": 1.1116, + "step": 980 + }, + { + "epoch": 0.1546564622925445, + "grad_norm": 0.96484375, + "learning_rate": 7.296231302481921e-05, + "loss": 1.3317, + "step": 981 + }, + { + "epoch": 0.1548141141399375, + "grad_norm": 0.90234375, + "learning_rate": 7.295788078110819e-05, + "loss": 1.1275, + "step": 982 + }, + { + "epoch": 0.15497176598733053, + "grad_norm": 0.8515625, + "learning_rate": 7.295344859471138e-05, + "loss": 0.7711, + "step": 983 + }, + { + "epoch": 0.15512941783472353, + "grad_norm": 1.0859375, + "learning_rate": 7.294901646563819e-05, + "loss": 1.3431, + "step": 984 + }, + { + "epoch": 0.15528706968211656, + "grad_norm": 0.953125, + "learning_rate": 7.294458439389803e-05, + "loss": 1.1839, + "step": 985 + }, + { + "epoch": 0.15544472152950956, + "grad_norm": 1.0390625, + "learning_rate": 7.294015237950027e-05, + "loss": 1.4353, + "step": 986 + }, + { + "epoch": 0.15560237337690258, + "grad_norm": 0.96484375, + "learning_rate": 7.293572042245431e-05, + "loss": 1.0651, + "step": 987 + }, + { + "epoch": 0.15576002522429558, + "grad_norm": 0.90625, + "learning_rate": 7.293128852276956e-05, + "loss": 1.0434, + "step": 988 + }, + { + "epoch": 0.1559176770716886, + "grad_norm": 1.046875, + "learning_rate": 7.292685668045533e-05, + "loss": 1.1297, + "step": 989 + }, + { + "epoch": 0.1560753289190816, + "grad_norm": 1.03125, + "learning_rate": 7.292242489552114e-05, + "loss": 1.1229, + "step": 990 + }, + { + "epoch": 0.15623298076647463, + "grad_norm": 0.9140625, + "learning_rate": 7.291799316797632e-05, + "loss": 1.1803, + "step": 991 + }, + { + "epoch": 0.15639063261386763, + "grad_norm": 0.921875, + "learning_rate": 7.291356149783026e-05, + "loss": 1.0939, + "step": 992 + }, + { + "epoch": 0.15654828446126065, + "grad_norm": 0.87109375, + "learning_rate": 7.290912988509236e-05, + "loss": 0.9719, + "step": 993 + }, + { + "epoch": 0.15670593630865365, + "grad_norm": 0.92578125, + "learning_rate": 7.290469832977198e-05, + "loss": 1.0, + "step": 994 + }, + { + "epoch": 0.15686358815604667, + "grad_norm": 0.90234375, + "learning_rate": 7.290026683187857e-05, + "loss": 0.9822, + "step": 995 + }, + { + "epoch": 0.15702124000343967, + "grad_norm": 0.984375, + "learning_rate": 7.289583539142151e-05, + "loss": 1.1368, + "step": 996 + }, + { + "epoch": 0.1571788918508327, + "grad_norm": 0.953125, + "learning_rate": 7.289140400841017e-05, + "loss": 1.2877, + "step": 997 + }, + { + "epoch": 0.1573365436982257, + "grad_norm": 1.0078125, + "learning_rate": 7.288697268285393e-05, + "loss": 1.3098, + "step": 998 + }, + { + "epoch": 0.15749419554561872, + "grad_norm": 0.90234375, + "learning_rate": 7.288254141476218e-05, + "loss": 1.2059, + "step": 999 + }, + { + "epoch": 0.15765184739301172, + "grad_norm": 1.015625, + "learning_rate": 7.287811020414438e-05, + "loss": 1.0577, + "step": 1000 + }, + { + "epoch": 0.15765184739301172, + "eval_loss": 1.137479543685913, + "eval_runtime": 296.6003, + "eval_samples_per_second": 33.715, + "eval_steps_per_second": 0.705, + "step": 1000 + }, + { + "epoch": 0.15780949924040474, + "grad_norm": 0.8125, + "learning_rate": 7.287367905100987e-05, + "loss": 1.0921, + "step": 1001 + }, + { + "epoch": 0.15796715108779774, + "grad_norm": 1.015625, + "learning_rate": 7.286924795536805e-05, + "loss": 1.0942, + "step": 1002 + }, + { + "epoch": 0.15812480293519077, + "grad_norm": 0.85546875, + "learning_rate": 7.286481691722831e-05, + "loss": 1.0385, + "step": 1003 + }, + { + "epoch": 0.15828245478258376, + "grad_norm": 0.95703125, + "learning_rate": 7.286038593660001e-05, + "loss": 0.9699, + "step": 1004 + }, + { + "epoch": 0.1584401066299768, + "grad_norm": 1.09375, + "learning_rate": 7.285595501349258e-05, + "loss": 1.0774, + "step": 1005 + }, + { + "epoch": 0.1585977584773698, + "grad_norm": 0.97265625, + "learning_rate": 7.285152414791543e-05, + "loss": 1.3203, + "step": 1006 + }, + { + "epoch": 0.1587554103247628, + "grad_norm": 0.9921875, + "learning_rate": 7.284709333987789e-05, + "loss": 1.1586, + "step": 1007 + }, + { + "epoch": 0.1589130621721558, + "grad_norm": 0.9765625, + "learning_rate": 7.28426625893894e-05, + "loss": 1.1286, + "step": 1008 + }, + { + "epoch": 0.15907071401954884, + "grad_norm": 1.0, + "learning_rate": 7.28382318964593e-05, + "loss": 1.1665, + "step": 1009 + }, + { + "epoch": 0.15922836586694183, + "grad_norm": 1.0546875, + "learning_rate": 7.283380126109706e-05, + "loss": 1.1352, + "step": 1010 + }, + { + "epoch": 0.15938601771433486, + "grad_norm": 1.015625, + "learning_rate": 7.282937068331201e-05, + "loss": 1.214, + "step": 1011 + }, + { + "epoch": 0.15954366956172786, + "grad_norm": 0.91015625, + "learning_rate": 7.282494016311356e-05, + "loss": 1.226, + "step": 1012 + }, + { + "epoch": 0.15970132140912088, + "grad_norm": 1.078125, + "learning_rate": 7.282050970051111e-05, + "loss": 1.2391, + "step": 1013 + }, + { + "epoch": 0.15985897325651388, + "grad_norm": 1.03125, + "learning_rate": 7.281607929551398e-05, + "loss": 1.3136, + "step": 1014 + }, + { + "epoch": 0.1600166251039069, + "grad_norm": 0.8984375, + "learning_rate": 7.281164894813166e-05, + "loss": 0.9134, + "step": 1015 + }, + { + "epoch": 0.1601742769512999, + "grad_norm": 1.09375, + "learning_rate": 7.28072186583735e-05, + "loss": 1.2413, + "step": 1016 + }, + { + "epoch": 0.16033192879869293, + "grad_norm": 1.0546875, + "learning_rate": 7.280278842624887e-05, + "loss": 1.3154, + "step": 1017 + }, + { + "epoch": 0.16048958064608593, + "grad_norm": 1.0234375, + "learning_rate": 7.279835825176719e-05, + "loss": 1.1872, + "step": 1018 + }, + { + "epoch": 0.16064723249347895, + "grad_norm": 1.0234375, + "learning_rate": 7.279392813493781e-05, + "loss": 1.0962, + "step": 1019 + }, + { + "epoch": 0.16080488434087195, + "grad_norm": 1.015625, + "learning_rate": 7.278949807577016e-05, + "loss": 1.0495, + "step": 1020 + }, + { + "epoch": 0.16096253618826498, + "grad_norm": 1.0234375, + "learning_rate": 7.278506807427364e-05, + "loss": 1.1386, + "step": 1021 + }, + { + "epoch": 0.16112018803565797, + "grad_norm": 0.8984375, + "learning_rate": 7.278063813045758e-05, + "loss": 1.3057, + "step": 1022 + }, + { + "epoch": 0.161277839883051, + "grad_norm": 0.89453125, + "learning_rate": 7.277620824433143e-05, + "loss": 1.5924, + "step": 1023 + }, + { + "epoch": 0.161435491730444, + "grad_norm": 1.015625, + "learning_rate": 7.27717784159045e-05, + "loss": 1.1775, + "step": 1024 + }, + { + "epoch": 0.16159314357783702, + "grad_norm": 0.86328125, + "learning_rate": 7.276734864518626e-05, + "loss": 1.2048, + "step": 1025 + }, + { + "epoch": 0.16175079542523002, + "grad_norm": 0.9765625, + "learning_rate": 7.276291893218607e-05, + "loss": 1.0168, + "step": 1026 + }, + { + "epoch": 0.16190844727262305, + "grad_norm": 1.015625, + "learning_rate": 7.275848927691333e-05, + "loss": 1.4225, + "step": 1027 + }, + { + "epoch": 0.16206609912001604, + "grad_norm": 0.90625, + "learning_rate": 7.275405967937741e-05, + "loss": 1.051, + "step": 1028 + }, + { + "epoch": 0.16222375096740907, + "grad_norm": 0.89453125, + "learning_rate": 7.274963013958765e-05, + "loss": 1.2224, + "step": 1029 + }, + { + "epoch": 0.16238140281480207, + "grad_norm": 0.94140625, + "learning_rate": 7.274520065755354e-05, + "loss": 1.0428, + "step": 1030 + }, + { + "epoch": 0.1625390546621951, + "grad_norm": 1.0625, + "learning_rate": 7.274077123328443e-05, + "loss": 1.3585, + "step": 1031 + }, + { + "epoch": 0.1626967065095881, + "grad_norm": 0.96484375, + "learning_rate": 7.273634186678968e-05, + "loss": 1.1712, + "step": 1032 + }, + { + "epoch": 0.16285435835698112, + "grad_norm": 0.95703125, + "learning_rate": 7.27319125580787e-05, + "loss": 1.1267, + "step": 1033 + }, + { + "epoch": 0.1630120102043741, + "grad_norm": 0.94140625, + "learning_rate": 7.272748330716084e-05, + "loss": 1.265, + "step": 1034 + }, + { + "epoch": 0.16316966205176714, + "grad_norm": 1.0, + "learning_rate": 7.272305411404554e-05, + "loss": 1.1168, + "step": 1035 + }, + { + "epoch": 0.16332731389916014, + "grad_norm": 0.875, + "learning_rate": 7.271862497874218e-05, + "loss": 1.0935, + "step": 1036 + }, + { + "epoch": 0.16348496574655316, + "grad_norm": 0.97265625, + "learning_rate": 7.271419590126015e-05, + "loss": 1.4416, + "step": 1037 + }, + { + "epoch": 0.16364261759394616, + "grad_norm": 0.90625, + "learning_rate": 7.27097668816088e-05, + "loss": 0.9698, + "step": 1038 + }, + { + "epoch": 0.16380026944133919, + "grad_norm": 1.0390625, + "learning_rate": 7.270533791979752e-05, + "loss": 1.1774, + "step": 1039 + }, + { + "epoch": 0.16395792128873218, + "grad_norm": 0.8828125, + "learning_rate": 7.270090901583574e-05, + "loss": 1.0788, + "step": 1040 + }, + { + "epoch": 0.1641155731361252, + "grad_norm": 0.9453125, + "learning_rate": 7.269648016973283e-05, + "loss": 1.1205, + "step": 1041 + }, + { + "epoch": 0.1642732249835182, + "grad_norm": 0.99609375, + "learning_rate": 7.269205138149818e-05, + "loss": 1.097, + "step": 1042 + }, + { + "epoch": 0.16443087683091123, + "grad_norm": 0.92578125, + "learning_rate": 7.268762265114114e-05, + "loss": 1.0903, + "step": 1043 + }, + { + "epoch": 0.16458852867830423, + "grad_norm": 0.96875, + "learning_rate": 7.268319397867112e-05, + "loss": 1.2038, + "step": 1044 + }, + { + "epoch": 0.16474618052569726, + "grad_norm": 0.95703125, + "learning_rate": 7.267876536409752e-05, + "loss": 1.2062, + "step": 1045 + }, + { + "epoch": 0.16490383237309025, + "grad_norm": 1.0, + "learning_rate": 7.267433680742973e-05, + "loss": 1.156, + "step": 1046 + }, + { + "epoch": 0.16506148422048328, + "grad_norm": 1.046875, + "learning_rate": 7.266990830867708e-05, + "loss": 1.3046, + "step": 1047 + }, + { + "epoch": 0.16521913606787628, + "grad_norm": 0.9296875, + "learning_rate": 7.266547986784904e-05, + "loss": 1.0482, + "step": 1048 + }, + { + "epoch": 0.1653767879152693, + "grad_norm": 0.90234375, + "learning_rate": 7.266105148495492e-05, + "loss": 1.1674, + "step": 1049 + }, + { + "epoch": 0.1655344397626623, + "grad_norm": 0.95703125, + "learning_rate": 7.265662316000415e-05, + "loss": 1.2059, + "step": 1050 + }, + { + "epoch": 0.16569209161005533, + "grad_norm": 0.96484375, + "learning_rate": 7.265219489300612e-05, + "loss": 1.4553, + "step": 1051 + }, + { + "epoch": 0.16584974345744832, + "grad_norm": 0.87109375, + "learning_rate": 7.26477666839702e-05, + "loss": 1.0725, + "step": 1052 + }, + { + "epoch": 0.16600739530484135, + "grad_norm": 1.0390625, + "learning_rate": 7.264333853290577e-05, + "loss": 1.2494, + "step": 1053 + }, + { + "epoch": 0.16616504715223435, + "grad_norm": 0.98046875, + "learning_rate": 7.263891043982218e-05, + "loss": 1.1896, + "step": 1054 + }, + { + "epoch": 0.16632269899962737, + "grad_norm": 0.96484375, + "learning_rate": 7.26344824047289e-05, + "loss": 1.047, + "step": 1055 + }, + { + "epoch": 0.16648035084702037, + "grad_norm": 0.9921875, + "learning_rate": 7.263005442763526e-05, + "loss": 1.2455, + "step": 1056 + }, + { + "epoch": 0.1666380026944134, + "grad_norm": 1.0078125, + "learning_rate": 7.262562650855068e-05, + "loss": 1.0922, + "step": 1057 + }, + { + "epoch": 0.1667956545418064, + "grad_norm": 0.9765625, + "learning_rate": 7.262119864748448e-05, + "loss": 0.9815, + "step": 1058 + }, + { + "epoch": 0.16695330638919942, + "grad_norm": 0.95703125, + "learning_rate": 7.261677084444606e-05, + "loss": 1.1978, + "step": 1059 + }, + { + "epoch": 0.16711095823659242, + "grad_norm": 0.9765625, + "learning_rate": 7.26123430994449e-05, + "loss": 1.1045, + "step": 1060 + }, + { + "epoch": 0.16726861008398544, + "grad_norm": 0.87109375, + "learning_rate": 7.260791541249028e-05, + "loss": 1.0743, + "step": 1061 + }, + { + "epoch": 0.16742626193137844, + "grad_norm": 1.046875, + "learning_rate": 7.260348778359163e-05, + "loss": 1.1904, + "step": 1062 + }, + { + "epoch": 0.16758391377877146, + "grad_norm": 0.92578125, + "learning_rate": 7.259906021275831e-05, + "loss": 1.0195, + "step": 1063 + }, + { + "epoch": 0.16774156562616446, + "grad_norm": 1.125, + "learning_rate": 7.259463269999971e-05, + "loss": 1.0266, + "step": 1064 + }, + { + "epoch": 0.1678992174735575, + "grad_norm": 0.953125, + "learning_rate": 7.259020524532523e-05, + "loss": 1.3445, + "step": 1065 + }, + { + "epoch": 0.16805686932095049, + "grad_norm": 0.97265625, + "learning_rate": 7.258577784874423e-05, + "loss": 1.3835, + "step": 1066 + }, + { + "epoch": 0.1682145211683435, + "grad_norm": 0.96484375, + "learning_rate": 7.258135051026612e-05, + "loss": 1.1523, + "step": 1067 + }, + { + "epoch": 0.1683721730157365, + "grad_norm": 1.0859375, + "learning_rate": 7.257692322990028e-05, + "loss": 1.2187, + "step": 1068 + }, + { + "epoch": 0.16852982486312953, + "grad_norm": 1.03125, + "learning_rate": 7.257249600765606e-05, + "loss": 1.096, + "step": 1069 + }, + { + "epoch": 0.16868747671052253, + "grad_norm": 0.98046875, + "learning_rate": 7.256806884354289e-05, + "loss": 1.1944, + "step": 1070 + }, + { + "epoch": 0.16884512855791556, + "grad_norm": 0.8515625, + "learning_rate": 7.256364173757011e-05, + "loss": 1.1216, + "step": 1071 + }, + { + "epoch": 0.16900278040530856, + "grad_norm": 0.90625, + "learning_rate": 7.255921468974713e-05, + "loss": 1.1106, + "step": 1072 + }, + { + "epoch": 0.16916043225270158, + "grad_norm": 0.953125, + "learning_rate": 7.255478770008335e-05, + "loss": 1.2976, + "step": 1073 + }, + { + "epoch": 0.16931808410009458, + "grad_norm": 1.0234375, + "learning_rate": 7.255036076858806e-05, + "loss": 1.067, + "step": 1074 + }, + { + "epoch": 0.1694757359474876, + "grad_norm": 1.0078125, + "learning_rate": 7.254593389527079e-05, + "loss": 1.1403, + "step": 1075 + }, + { + "epoch": 0.1696333877948806, + "grad_norm": 0.88671875, + "learning_rate": 7.25415070801408e-05, + "loss": 0.9739, + "step": 1076 + }, + { + "epoch": 0.16979103964227363, + "grad_norm": 1.984375, + "learning_rate": 7.253708032320754e-05, + "loss": 1.2047, + "step": 1077 + }, + { + "epoch": 0.16994869148966663, + "grad_norm": 0.99609375, + "learning_rate": 7.253265362448036e-05, + "loss": 1.307, + "step": 1078 + }, + { + "epoch": 0.17010634333705965, + "grad_norm": 0.91015625, + "learning_rate": 7.252822698396866e-05, + "loss": 1.0295, + "step": 1079 + }, + { + "epoch": 0.17026399518445265, + "grad_norm": 0.89453125, + "learning_rate": 7.252380040168177e-05, + "loss": 1.2777, + "step": 1080 + }, + { + "epoch": 0.17042164703184567, + "grad_norm": 0.94921875, + "learning_rate": 7.251937387762913e-05, + "loss": 1.3088, + "step": 1081 + }, + { + "epoch": 0.17057929887923867, + "grad_norm": 0.9296875, + "learning_rate": 7.251494741182014e-05, + "loss": 1.3712, + "step": 1082 + }, + { + "epoch": 0.1707369507266317, + "grad_norm": 1.0625, + "learning_rate": 7.251052100426413e-05, + "loss": 1.1478, + "step": 1083 + }, + { + "epoch": 0.1708946025740247, + "grad_norm": 0.98046875, + "learning_rate": 7.250609465497051e-05, + "loss": 1.2346, + "step": 1084 + }, + { + "epoch": 0.17105225442141772, + "grad_norm": 0.9375, + "learning_rate": 7.250166836394861e-05, + "loss": 0.996, + "step": 1085 + }, + { + "epoch": 0.17120990626881072, + "grad_norm": 0.9921875, + "learning_rate": 7.249724213120787e-05, + "loss": 1.2197, + "step": 1086 + }, + { + "epoch": 0.17136755811620374, + "grad_norm": 1.0703125, + "learning_rate": 7.249281595675768e-05, + "loss": 1.2911, + "step": 1087 + }, + { + "epoch": 0.17152520996359674, + "grad_norm": 1.0078125, + "learning_rate": 7.24883898406074e-05, + "loss": 1.1625, + "step": 1088 + }, + { + "epoch": 0.17168286181098977, + "grad_norm": 0.95703125, + "learning_rate": 7.248396378276638e-05, + "loss": 1.1839, + "step": 1089 + }, + { + "epoch": 0.17184051365838277, + "grad_norm": 0.9765625, + "learning_rate": 7.247953778324401e-05, + "loss": 1.0299, + "step": 1090 + }, + { + "epoch": 0.1719981655057758, + "grad_norm": 0.80859375, + "learning_rate": 7.247511184204971e-05, + "loss": 0.9532, + "step": 1091 + }, + { + "epoch": 0.1721558173531688, + "grad_norm": 0.953125, + "learning_rate": 7.247068595919285e-05, + "loss": 1.1673, + "step": 1092 + }, + { + "epoch": 0.17231346920056181, + "grad_norm": 1.0078125, + "learning_rate": 7.246626013468279e-05, + "loss": 1.292, + "step": 1093 + }, + { + "epoch": 0.1724711210479548, + "grad_norm": 0.98828125, + "learning_rate": 7.246183436852891e-05, + "loss": 1.3648, + "step": 1094 + }, + { + "epoch": 0.17262877289534784, + "grad_norm": 0.86328125, + "learning_rate": 7.245740866074058e-05, + "loss": 1.0402, + "step": 1095 + }, + { + "epoch": 0.17278642474274086, + "grad_norm": 0.9921875, + "learning_rate": 7.245298301132721e-05, + "loss": 1.4757, + "step": 1096 + }, + { + "epoch": 0.17294407659013386, + "grad_norm": 0.90234375, + "learning_rate": 7.244855742029819e-05, + "loss": 1.3013, + "step": 1097 + }, + { + "epoch": 0.17310172843752689, + "grad_norm": 0.91015625, + "learning_rate": 7.244413188766286e-05, + "loss": 1.0135, + "step": 1098 + }, + { + "epoch": 0.17325938028491988, + "grad_norm": 1.09375, + "learning_rate": 7.243970641343063e-05, + "loss": 1.1352, + "step": 1099 + }, + { + "epoch": 0.1734170321323129, + "grad_norm": 0.87109375, + "learning_rate": 7.243528099761082e-05, + "loss": 1.1853, + "step": 1100 + }, + { + "epoch": 0.1735746839797059, + "grad_norm": 0.9609375, + "learning_rate": 7.243085564021291e-05, + "loss": 0.8484, + "step": 1101 + }, + { + "epoch": 0.17373233582709893, + "grad_norm": 1.046875, + "learning_rate": 7.242643034124621e-05, + "loss": 1.1056, + "step": 1102 + }, + { + "epoch": 0.17388998767449193, + "grad_norm": 1.1171875, + "learning_rate": 7.242200510072012e-05, + "loss": 1.217, + "step": 1103 + }, + { + "epoch": 0.17404763952188496, + "grad_norm": 0.98828125, + "learning_rate": 7.241757991864401e-05, + "loss": 1.4531, + "step": 1104 + }, + { + "epoch": 0.17420529136927795, + "grad_norm": 0.9453125, + "learning_rate": 7.241315479502722e-05, + "loss": 0.9688, + "step": 1105 + }, + { + "epoch": 0.17436294321667098, + "grad_norm": 0.9765625, + "learning_rate": 7.240872972987923e-05, + "loss": 1.0284, + "step": 1106 + }, + { + "epoch": 0.17452059506406398, + "grad_norm": 0.984375, + "learning_rate": 7.240430472320933e-05, + "loss": 1.0915, + "step": 1107 + }, + { + "epoch": 0.174678246911457, + "grad_norm": 0.96875, + "learning_rate": 7.239987977502695e-05, + "loss": 1.1627, + "step": 1108 + }, + { + "epoch": 0.17483589875885, + "grad_norm": 0.97265625, + "learning_rate": 7.239545488534146e-05, + "loss": 1.0793, + "step": 1109 + }, + { + "epoch": 0.17499355060624303, + "grad_norm": 0.8984375, + "learning_rate": 7.23910300541622e-05, + "loss": 1.1779, + "step": 1110 + }, + { + "epoch": 0.17515120245363602, + "grad_norm": 0.83203125, + "learning_rate": 7.238660528149857e-05, + "loss": 0.9639, + "step": 1111 + }, + { + "epoch": 0.17530885430102905, + "grad_norm": 0.921875, + "learning_rate": 7.238218056735997e-05, + "loss": 1.2617, + "step": 1112 + }, + { + "epoch": 0.17546650614842205, + "grad_norm": 0.921875, + "learning_rate": 7.237775591175574e-05, + "loss": 1.0418, + "step": 1113 + }, + { + "epoch": 0.17562415799581507, + "grad_norm": 0.796875, + "learning_rate": 7.237333131469528e-05, + "loss": 1.0377, + "step": 1114 + }, + { + "epoch": 0.17578180984320807, + "grad_norm": 0.91015625, + "learning_rate": 7.236890677618796e-05, + "loss": 1.144, + "step": 1115 + }, + { + "epoch": 0.1759394616906011, + "grad_norm": 1.0859375, + "learning_rate": 7.236448229624317e-05, + "loss": 1.1276, + "step": 1116 + }, + { + "epoch": 0.1760971135379941, + "grad_norm": 0.94921875, + "learning_rate": 7.23600578748703e-05, + "loss": 0.9916, + "step": 1117 + }, + { + "epoch": 0.17625476538538712, + "grad_norm": 0.94140625, + "learning_rate": 7.23556335120787e-05, + "loss": 1.2311, + "step": 1118 + }, + { + "epoch": 0.17641241723278012, + "grad_norm": 0.94921875, + "learning_rate": 7.235120920787776e-05, + "loss": 1.0904, + "step": 1119 + }, + { + "epoch": 0.17657006908017314, + "grad_norm": 1.046875, + "learning_rate": 7.234678496227681e-05, + "loss": 1.1721, + "step": 1120 + }, + { + "epoch": 0.17672772092756614, + "grad_norm": 1.015625, + "learning_rate": 7.234236077528529e-05, + "loss": 1.1207, + "step": 1121 + }, + { + "epoch": 0.17688537277495917, + "grad_norm": 0.99609375, + "learning_rate": 7.233793664691257e-05, + "loss": 1.1241, + "step": 1122 + }, + { + "epoch": 0.17704302462235216, + "grad_norm": 0.91015625, + "learning_rate": 7.233351257716801e-05, + "loss": 1.1628, + "step": 1123 + }, + { + "epoch": 0.1772006764697452, + "grad_norm": 0.90234375, + "learning_rate": 7.2329088566061e-05, + "loss": 1.0504, + "step": 1124 + }, + { + "epoch": 0.1773583283171382, + "grad_norm": 1.0703125, + "learning_rate": 7.232466461360084e-05, + "loss": 1.1426, + "step": 1125 + }, + { + "epoch": 0.1775159801645312, + "grad_norm": 0.97265625, + "learning_rate": 7.232024071979704e-05, + "loss": 1.2198, + "step": 1126 + }, + { + "epoch": 0.1776736320119242, + "grad_norm": 0.97265625, + "learning_rate": 7.231581688465889e-05, + "loss": 1.3697, + "step": 1127 + }, + { + "epoch": 0.17783128385931724, + "grad_norm": 0.98828125, + "learning_rate": 7.23113931081958e-05, + "loss": 1.145, + "step": 1128 + }, + { + "epoch": 0.17798893570671023, + "grad_norm": 0.92578125, + "learning_rate": 7.23069693904171e-05, + "loss": 1.058, + "step": 1129 + }, + { + "epoch": 0.17814658755410326, + "grad_norm": 0.9765625, + "learning_rate": 7.23025457313322e-05, + "loss": 1.108, + "step": 1130 + }, + { + "epoch": 0.17830423940149626, + "grad_norm": 0.95703125, + "learning_rate": 7.229812213095049e-05, + "loss": 1.0761, + "step": 1131 + }, + { + "epoch": 0.17846189124888928, + "grad_norm": 1.0390625, + "learning_rate": 7.229369858928132e-05, + "loss": 1.1769, + "step": 1132 + }, + { + "epoch": 0.17861954309628228, + "grad_norm": 1.015625, + "learning_rate": 7.228927510633405e-05, + "loss": 1.1977, + "step": 1133 + }, + { + "epoch": 0.1787771949436753, + "grad_norm": 0.93359375, + "learning_rate": 7.22848516821181e-05, + "loss": 0.9682, + "step": 1134 + }, + { + "epoch": 0.1789348467910683, + "grad_norm": 1.140625, + "learning_rate": 7.22804283166428e-05, + "loss": 1.321, + "step": 1135 + }, + { + "epoch": 0.17909249863846133, + "grad_norm": 0.86328125, + "learning_rate": 7.227600500991758e-05, + "loss": 0.9646, + "step": 1136 + }, + { + "epoch": 0.17925015048585433, + "grad_norm": 0.8671875, + "learning_rate": 7.227158176195176e-05, + "loss": 1.1262, + "step": 1137 + }, + { + "epoch": 0.17940780233324735, + "grad_norm": 0.9765625, + "learning_rate": 7.226715857275477e-05, + "loss": 1.053, + "step": 1138 + }, + { + "epoch": 0.17956545418064035, + "grad_norm": 1.125, + "learning_rate": 7.226273544233593e-05, + "loss": 1.2967, + "step": 1139 + }, + { + "epoch": 0.17972310602803337, + "grad_norm": 0.890625, + "learning_rate": 7.225831237070459e-05, + "loss": 0.987, + "step": 1140 + }, + { + "epoch": 0.17988075787542637, + "grad_norm": 0.890625, + "learning_rate": 7.225388935787023e-05, + "loss": 0.947, + "step": 1141 + }, + { + "epoch": 0.1800384097228194, + "grad_norm": 0.9140625, + "learning_rate": 7.224946640384216e-05, + "loss": 1.0412, + "step": 1142 + }, + { + "epoch": 0.1801960615702124, + "grad_norm": 0.9453125, + "learning_rate": 7.224504350862976e-05, + "loss": 1.1626, + "step": 1143 + }, + { + "epoch": 0.18035371341760542, + "grad_norm": 1.0078125, + "learning_rate": 7.22406206722424e-05, + "loss": 1.1337, + "step": 1144 + }, + { + "epoch": 0.18051136526499842, + "grad_norm": 0.9296875, + "learning_rate": 7.223619789468942e-05, + "loss": 1.0724, + "step": 1145 + }, + { + "epoch": 0.18066901711239144, + "grad_norm": 0.93359375, + "learning_rate": 7.223177517598027e-05, + "loss": 1.1174, + "step": 1146 + }, + { + "epoch": 0.18082666895978444, + "grad_norm": 0.96875, + "learning_rate": 7.22273525161243e-05, + "loss": 1.1967, + "step": 1147 + }, + { + "epoch": 0.18098432080717747, + "grad_norm": 1.09375, + "learning_rate": 7.222292991513085e-05, + "loss": 1.2996, + "step": 1148 + }, + { + "epoch": 0.18114197265457047, + "grad_norm": 1.046875, + "learning_rate": 7.221850737300932e-05, + "loss": 1.1696, + "step": 1149 + }, + { + "epoch": 0.1812996245019635, + "grad_norm": 1.03125, + "learning_rate": 7.221408488976904e-05, + "loss": 1.1977, + "step": 1150 + }, + { + "epoch": 0.1814572763493565, + "grad_norm": 0.9140625, + "learning_rate": 7.220966246541946e-05, + "loss": 0.9373, + "step": 1151 + }, + { + "epoch": 0.18161492819674951, + "grad_norm": 0.97265625, + "learning_rate": 7.220524009996989e-05, + "loss": 1.2118, + "step": 1152 + }, + { + "epoch": 0.1817725800441425, + "grad_norm": 0.96484375, + "learning_rate": 7.220081779342973e-05, + "loss": 1.1216, + "step": 1153 + }, + { + "epoch": 0.18193023189153554, + "grad_norm": 0.9453125, + "learning_rate": 7.219639554580836e-05, + "loss": 1.1091, + "step": 1154 + }, + { + "epoch": 0.18208788373892854, + "grad_norm": 0.91015625, + "learning_rate": 7.219197335711512e-05, + "loss": 1.0109, + "step": 1155 + }, + { + "epoch": 0.18224553558632156, + "grad_norm": 0.87890625, + "learning_rate": 7.218755122735943e-05, + "loss": 1.0602, + "step": 1156 + }, + { + "epoch": 0.18240318743371456, + "grad_norm": 0.8046875, + "learning_rate": 7.21831291565506e-05, + "loss": 0.9741, + "step": 1157 + }, + { + "epoch": 0.18256083928110758, + "grad_norm": 0.9453125, + "learning_rate": 7.217870714469808e-05, + "loss": 1.1194, + "step": 1158 + }, + { + "epoch": 0.18271849112850058, + "grad_norm": 1.0234375, + "learning_rate": 7.217428519181118e-05, + "loss": 1.416, + "step": 1159 + }, + { + "epoch": 0.1828761429758936, + "grad_norm": 0.94921875, + "learning_rate": 7.216986329789925e-05, + "loss": 1.0331, + "step": 1160 + }, + { + "epoch": 0.1830337948232866, + "grad_norm": 0.8125, + "learning_rate": 7.216544146297177e-05, + "loss": 1.0492, + "step": 1161 + }, + { + "epoch": 0.18319144667067963, + "grad_norm": 0.859375, + "learning_rate": 7.216101968703801e-05, + "loss": 1.0186, + "step": 1162 + }, + { + "epoch": 0.18334909851807263, + "grad_norm": 1.015625, + "learning_rate": 7.215659797010741e-05, + "loss": 1.1807, + "step": 1163 + }, + { + "epoch": 0.18350675036546565, + "grad_norm": 0.97265625, + "learning_rate": 7.215217631218929e-05, + "loss": 1.1883, + "step": 1164 + }, + { + "epoch": 0.18366440221285865, + "grad_norm": 0.9140625, + "learning_rate": 7.2147754713293e-05, + "loss": 1.0969, + "step": 1165 + }, + { + "epoch": 0.18382205406025168, + "grad_norm": 0.9609375, + "learning_rate": 7.214333317342799e-05, + "loss": 1.0287, + "step": 1166 + }, + { + "epoch": 0.18397970590764468, + "grad_norm": 0.98828125, + "learning_rate": 7.21389116926036e-05, + "loss": 1.0993, + "step": 1167 + }, + { + "epoch": 0.1841373577550377, + "grad_norm": 1.0625, + "learning_rate": 7.21344902708292e-05, + "loss": 1.1584, + "step": 1168 + }, + { + "epoch": 0.1842950096024307, + "grad_norm": 0.9296875, + "learning_rate": 7.213006890811415e-05, + "loss": 1.0319, + "step": 1169 + }, + { + "epoch": 0.18445266144982372, + "grad_norm": 0.85546875, + "learning_rate": 7.212564760446778e-05, + "loss": 0.9684, + "step": 1170 + }, + { + "epoch": 0.18461031329721672, + "grad_norm": 0.984375, + "learning_rate": 7.212122635989956e-05, + "loss": 1.1257, + "step": 1171 + }, + { + "epoch": 0.18476796514460975, + "grad_norm": 0.8984375, + "learning_rate": 7.211680517441878e-05, + "loss": 1.0532, + "step": 1172 + }, + { + "epoch": 0.18492561699200274, + "grad_norm": 0.94140625, + "learning_rate": 7.211238404803486e-05, + "loss": 1.1364, + "step": 1173 + }, + { + "epoch": 0.18508326883939577, + "grad_norm": 1.015625, + "learning_rate": 7.210796298075715e-05, + "loss": 1.1933, + "step": 1174 + }, + { + "epoch": 0.18524092068678877, + "grad_norm": 0.8203125, + "learning_rate": 7.210354197259499e-05, + "loss": 1.0786, + "step": 1175 + }, + { + "epoch": 0.1853985725341818, + "grad_norm": 0.92578125, + "learning_rate": 7.209912102355781e-05, + "loss": 1.167, + "step": 1176 + }, + { + "epoch": 0.1855562243815748, + "grad_norm": 1.0078125, + "learning_rate": 7.209470013365494e-05, + "loss": 1.2949, + "step": 1177 + }, + { + "epoch": 0.18571387622896782, + "grad_norm": 0.9296875, + "learning_rate": 7.209027930289575e-05, + "loss": 1.1454, + "step": 1178 + }, + { + "epoch": 0.18587152807636081, + "grad_norm": 0.9609375, + "learning_rate": 7.208585853128962e-05, + "loss": 1.192, + "step": 1179 + }, + { + "epoch": 0.18602917992375384, + "grad_norm": 0.95703125, + "learning_rate": 7.208143781884589e-05, + "loss": 1.1754, + "step": 1180 + }, + { + "epoch": 0.18618683177114684, + "grad_norm": 0.921875, + "learning_rate": 7.207701716557398e-05, + "loss": 0.8474, + "step": 1181 + }, + { + "epoch": 0.18634448361853986, + "grad_norm": 0.85546875, + "learning_rate": 7.207259657148324e-05, + "loss": 0.9793, + "step": 1182 + }, + { + "epoch": 0.18650213546593286, + "grad_norm": 1.2578125, + "learning_rate": 7.206817603658304e-05, + "loss": 1.1666, + "step": 1183 + }, + { + "epoch": 0.1866597873133259, + "grad_norm": 0.890625, + "learning_rate": 7.206375556088276e-05, + "loss": 1.1172, + "step": 1184 + }, + { + "epoch": 0.18681743916071888, + "grad_norm": 1.015625, + "learning_rate": 7.205933514439169e-05, + "loss": 1.0309, + "step": 1185 + }, + { + "epoch": 0.1869750910081119, + "grad_norm": 0.9453125, + "learning_rate": 7.20549147871193e-05, + "loss": 0.9983, + "step": 1186 + }, + { + "epoch": 0.1871327428555049, + "grad_norm": 0.9453125, + "learning_rate": 7.205049448907492e-05, + "loss": 1.1514, + "step": 1187 + }, + { + "epoch": 0.18729039470289793, + "grad_norm": 0.8203125, + "learning_rate": 7.204607425026794e-05, + "loss": 0.9441, + "step": 1188 + }, + { + "epoch": 0.18744804655029093, + "grad_norm": 0.9375, + "learning_rate": 7.204165407070771e-05, + "loss": 1.0981, + "step": 1189 + }, + { + "epoch": 0.18760569839768396, + "grad_norm": 0.90234375, + "learning_rate": 7.203723395040354e-05, + "loss": 1.0558, + "step": 1190 + }, + { + "epoch": 0.18776335024507695, + "grad_norm": 1.0625, + "learning_rate": 7.20328138893649e-05, + "loss": 1.4519, + "step": 1191 + }, + { + "epoch": 0.18792100209246998, + "grad_norm": 1.0234375, + "learning_rate": 7.202839388760109e-05, + "loss": 1.4529, + "step": 1192 + }, + { + "epoch": 0.18807865393986298, + "grad_norm": 0.90625, + "learning_rate": 7.202397394512153e-05, + "loss": 0.9502, + "step": 1193 + }, + { + "epoch": 0.188236305787256, + "grad_norm": 0.8828125, + "learning_rate": 7.201955406193556e-05, + "loss": 0.9474, + "step": 1194 + }, + { + "epoch": 0.188393957634649, + "grad_norm": 1.046875, + "learning_rate": 7.201513423805251e-05, + "loss": 1.2527, + "step": 1195 + }, + { + "epoch": 0.18855160948204203, + "grad_norm": 0.93359375, + "learning_rate": 7.201071447348181e-05, + "loss": 1.0572, + "step": 1196 + }, + { + "epoch": 0.18870926132943502, + "grad_norm": 1.0078125, + "learning_rate": 7.20062947682328e-05, + "loss": 1.154, + "step": 1197 + }, + { + "epoch": 0.18886691317682805, + "grad_norm": 0.84375, + "learning_rate": 7.200187512231484e-05, + "loss": 0.9587, + "step": 1198 + }, + { + "epoch": 0.18902456502422105, + "grad_norm": 0.98828125, + "learning_rate": 7.199745553573733e-05, + "loss": 1.0023, + "step": 1199 + }, + { + "epoch": 0.18918221687161407, + "grad_norm": 0.97265625, + "learning_rate": 7.199303600850956e-05, + "loss": 1.2248, + "step": 1200 + }, + { + "epoch": 0.18933986871900707, + "grad_norm": 0.98828125, + "learning_rate": 7.198861654064097e-05, + "loss": 1.1428, + "step": 1201 + }, + { + "epoch": 0.1894975205664001, + "grad_norm": 0.9296875, + "learning_rate": 7.198419713214093e-05, + "loss": 1.2118, + "step": 1202 + }, + { + "epoch": 0.1896551724137931, + "grad_norm": 0.9375, + "learning_rate": 7.197977778301876e-05, + "loss": 1.0747, + "step": 1203 + }, + { + "epoch": 0.18981282426118612, + "grad_norm": 0.90234375, + "learning_rate": 7.197535849328387e-05, + "loss": 1.013, + "step": 1204 + }, + { + "epoch": 0.18997047610857912, + "grad_norm": 0.9296875, + "learning_rate": 7.197093926294555e-05, + "loss": 1.0271, + "step": 1205 + }, + { + "epoch": 0.19012812795597214, + "grad_norm": 0.92578125, + "learning_rate": 7.196652009201327e-05, + "loss": 1.0971, + "step": 1206 + }, + { + "epoch": 0.19028577980336514, + "grad_norm": 0.86328125, + "learning_rate": 7.196210098049636e-05, + "loss": 1.1681, + "step": 1207 + }, + { + "epoch": 0.19044343165075817, + "grad_norm": 0.9296875, + "learning_rate": 7.195768192840417e-05, + "loss": 1.1135, + "step": 1208 + }, + { + "epoch": 0.19060108349815116, + "grad_norm": 1.0078125, + "learning_rate": 7.195326293574607e-05, + "loss": 1.1038, + "step": 1209 + }, + { + "epoch": 0.1907587353455442, + "grad_norm": 1.09375, + "learning_rate": 7.194884400253137e-05, + "loss": 1.2848, + "step": 1210 + }, + { + "epoch": 0.1909163871929372, + "grad_norm": 1.0390625, + "learning_rate": 7.194442512876954e-05, + "loss": 1.205, + "step": 1211 + }, + { + "epoch": 0.1910740390403302, + "grad_norm": 0.90234375, + "learning_rate": 7.194000631446991e-05, + "loss": 1.0155, + "step": 1212 + }, + { + "epoch": 0.1912316908877232, + "grad_norm": 0.8828125, + "learning_rate": 7.193558755964183e-05, + "loss": 1.2083, + "step": 1213 + }, + { + "epoch": 0.19138934273511624, + "grad_norm": 0.9453125, + "learning_rate": 7.193116886429466e-05, + "loss": 1.2266, + "step": 1214 + }, + { + "epoch": 0.19154699458250923, + "grad_norm": 0.921875, + "learning_rate": 7.192675022843774e-05, + "loss": 0.9782, + "step": 1215 + }, + { + "epoch": 0.19170464642990226, + "grad_norm": 0.875, + "learning_rate": 7.192233165208051e-05, + "loss": 1.2995, + "step": 1216 + }, + { + "epoch": 0.19186229827729526, + "grad_norm": 1.0234375, + "learning_rate": 7.191791313523229e-05, + "loss": 1.3943, + "step": 1217 + }, + { + "epoch": 0.19201995012468828, + "grad_norm": 0.92578125, + "learning_rate": 7.191349467790241e-05, + "loss": 1.1526, + "step": 1218 + }, + { + "epoch": 0.19217760197208128, + "grad_norm": 0.94140625, + "learning_rate": 7.190907628010031e-05, + "loss": 0.9935, + "step": 1219 + }, + { + "epoch": 0.1923352538194743, + "grad_norm": 1.0390625, + "learning_rate": 7.190465794183531e-05, + "loss": 1.1115, + "step": 1220 + }, + { + "epoch": 0.1924929056668673, + "grad_norm": 0.9453125, + "learning_rate": 7.190023966311676e-05, + "loss": 1.176, + "step": 1221 + }, + { + "epoch": 0.19265055751426033, + "grad_norm": 1.078125, + "learning_rate": 7.189582144395408e-05, + "loss": 1.0479, + "step": 1222 + }, + { + "epoch": 0.19280820936165333, + "grad_norm": 1.0078125, + "learning_rate": 7.18914032843566e-05, + "loss": 1.1806, + "step": 1223 + }, + { + "epoch": 0.19296586120904635, + "grad_norm": 0.93359375, + "learning_rate": 7.188698518433367e-05, + "loss": 0.9646, + "step": 1224 + }, + { + "epoch": 0.19312351305643935, + "grad_norm": 0.94140625, + "learning_rate": 7.188256714389467e-05, + "loss": 1.0893, + "step": 1225 + }, + { + "epoch": 0.19328116490383238, + "grad_norm": 0.8515625, + "learning_rate": 7.187814916304893e-05, + "loss": 0.9054, + "step": 1226 + }, + { + "epoch": 0.19343881675122537, + "grad_norm": 0.828125, + "learning_rate": 7.187373124180587e-05, + "loss": 0.9952, + "step": 1227 + }, + { + "epoch": 0.1935964685986184, + "grad_norm": 0.9296875, + "learning_rate": 7.186931338017484e-05, + "loss": 1.1739, + "step": 1228 + }, + { + "epoch": 0.1937541204460114, + "grad_norm": 0.98828125, + "learning_rate": 7.18648955781652e-05, + "loss": 1.3445, + "step": 1229 + }, + { + "epoch": 0.19391177229340442, + "grad_norm": 0.99609375, + "learning_rate": 7.18604778357863e-05, + "loss": 1.1383, + "step": 1230 + }, + { + "epoch": 0.19406942414079742, + "grad_norm": 0.84375, + "learning_rate": 7.185606015304747e-05, + "loss": 0.8533, + "step": 1231 + }, + { + "epoch": 0.19422707598819045, + "grad_norm": 1.03125, + "learning_rate": 7.185164252995814e-05, + "loss": 1.1167, + "step": 1232 + }, + { + "epoch": 0.19438472783558344, + "grad_norm": 0.97265625, + "learning_rate": 7.184722496652764e-05, + "loss": 1.1353, + "step": 1233 + }, + { + "epoch": 0.19454237968297647, + "grad_norm": 0.9140625, + "learning_rate": 7.184280746276537e-05, + "loss": 1.1203, + "step": 1234 + }, + { + "epoch": 0.19470003153036947, + "grad_norm": 0.90625, + "learning_rate": 7.183839001868064e-05, + "loss": 0.9602, + "step": 1235 + }, + { + "epoch": 0.1948576833777625, + "grad_norm": 0.83984375, + "learning_rate": 7.183397263428281e-05, + "loss": 1.0512, + "step": 1236 + }, + { + "epoch": 0.1950153352251555, + "grad_norm": 0.9609375, + "learning_rate": 7.182955530958127e-05, + "loss": 1.2018, + "step": 1237 + }, + { + "epoch": 0.19517298707254851, + "grad_norm": 0.91015625, + "learning_rate": 7.182513804458539e-05, + "loss": 1.0852, + "step": 1238 + }, + { + "epoch": 0.1953306389199415, + "grad_norm": 1.0, + "learning_rate": 7.182072083930453e-05, + "loss": 1.1473, + "step": 1239 + }, + { + "epoch": 0.19548829076733454, + "grad_norm": 1.0390625, + "learning_rate": 7.181630369374804e-05, + "loss": 1.0743, + "step": 1240 + }, + { + "epoch": 0.19564594261472754, + "grad_norm": 1.015625, + "learning_rate": 7.181188660792525e-05, + "loss": 1.13, + "step": 1241 + }, + { + "epoch": 0.19580359446212056, + "grad_norm": 0.8515625, + "learning_rate": 7.180746958184559e-05, + "loss": 0.9628, + "step": 1242 + }, + { + "epoch": 0.19596124630951356, + "grad_norm": 0.97265625, + "learning_rate": 7.180305261551839e-05, + "loss": 1.1122, + "step": 1243 + }, + { + "epoch": 0.19611889815690658, + "grad_norm": 0.87890625, + "learning_rate": 7.1798635708953e-05, + "loss": 0.9681, + "step": 1244 + }, + { + "epoch": 0.19627655000429958, + "grad_norm": 1.0078125, + "learning_rate": 7.17942188621588e-05, + "loss": 1.0186, + "step": 1245 + }, + { + "epoch": 0.1964342018516926, + "grad_norm": 0.90625, + "learning_rate": 7.178980207514507e-05, + "loss": 1.2402, + "step": 1246 + }, + { + "epoch": 0.1965918536990856, + "grad_norm": 0.98046875, + "learning_rate": 7.178538534792132e-05, + "loss": 1.0631, + "step": 1247 + }, + { + "epoch": 0.19674950554647863, + "grad_norm": 0.90625, + "learning_rate": 7.178096868049681e-05, + "loss": 1.0162, + "step": 1248 + }, + { + "epoch": 0.19690715739387163, + "grad_norm": 0.96875, + "learning_rate": 7.177655207288094e-05, + "loss": 1.3358, + "step": 1249 + }, + { + "epoch": 0.19706480924126465, + "grad_norm": 0.91015625, + "learning_rate": 7.177213552508303e-05, + "loss": 1.1617, + "step": 1250 + }, + { + "epoch": 0.19722246108865765, + "grad_norm": 0.8671875, + "learning_rate": 7.176771903711246e-05, + "loss": 0.9839, + "step": 1251 + }, + { + "epoch": 0.19738011293605068, + "grad_norm": 1.0234375, + "learning_rate": 7.176330260897862e-05, + "loss": 1.2375, + "step": 1252 + }, + { + "epoch": 0.1975377647834437, + "grad_norm": 0.95703125, + "learning_rate": 7.175888624069083e-05, + "loss": 1.1342, + "step": 1253 + }, + { + "epoch": 0.1976954166308367, + "grad_norm": 0.8359375, + "learning_rate": 7.175446993225848e-05, + "loss": 1.0215, + "step": 1254 + }, + { + "epoch": 0.19785306847822973, + "grad_norm": 0.88671875, + "learning_rate": 7.175005368369092e-05, + "loss": 1.0518, + "step": 1255 + }, + { + "epoch": 0.19801072032562272, + "grad_norm": 0.953125, + "learning_rate": 7.174563749499745e-05, + "loss": 1.0539, + "step": 1256 + }, + { + "epoch": 0.19816837217301575, + "grad_norm": 0.9453125, + "learning_rate": 7.174122136618755e-05, + "loss": 1.1794, + "step": 1257 + }, + { + "epoch": 0.19832602402040875, + "grad_norm": 1.03125, + "learning_rate": 7.17368052972705e-05, + "loss": 0.9212, + "step": 1258 + }, + { + "epoch": 0.19848367586780177, + "grad_norm": 0.94140625, + "learning_rate": 7.173238928825568e-05, + "loss": 1.1611, + "step": 1259 + }, + { + "epoch": 0.19864132771519477, + "grad_norm": 1.2109375, + "learning_rate": 7.172797333915244e-05, + "loss": 1.164, + "step": 1260 + }, + { + "epoch": 0.1987989795625878, + "grad_norm": 0.9296875, + "learning_rate": 7.172355744997014e-05, + "loss": 1.2847, + "step": 1261 + }, + { + "epoch": 0.1989566314099808, + "grad_norm": 1.0234375, + "learning_rate": 7.171914162071816e-05, + "loss": 1.2166, + "step": 1262 + }, + { + "epoch": 0.19911428325737382, + "grad_norm": 0.90625, + "learning_rate": 7.171472585140583e-05, + "loss": 1.0452, + "step": 1263 + }, + { + "epoch": 0.19927193510476682, + "grad_norm": 0.88671875, + "learning_rate": 7.171031014204253e-05, + "loss": 1.0085, + "step": 1264 + }, + { + "epoch": 0.19942958695215984, + "grad_norm": 0.9609375, + "learning_rate": 7.17058944926376e-05, + "loss": 1.3308, + "step": 1265 + }, + { + "epoch": 0.19958723879955284, + "grad_norm": 0.85546875, + "learning_rate": 7.170147890320038e-05, + "loss": 1.0107, + "step": 1266 + }, + { + "epoch": 0.19974489064694587, + "grad_norm": 0.984375, + "learning_rate": 7.16970633737403e-05, + "loss": 1.1441, + "step": 1267 + }, + { + "epoch": 0.19990254249433886, + "grad_norm": 0.92578125, + "learning_rate": 7.169264790426669e-05, + "loss": 1.1446, + "step": 1268 + }, + { + "epoch": 0.2000601943417319, + "grad_norm": 0.890625, + "learning_rate": 7.168823249478886e-05, + "loss": 1.1628, + "step": 1269 + }, + { + "epoch": 0.2002178461891249, + "grad_norm": 0.96484375, + "learning_rate": 7.168381714531623e-05, + "loss": 1.1789, + "step": 1270 + }, + { + "epoch": 0.2003754980365179, + "grad_norm": 1.015625, + "learning_rate": 7.167940185585808e-05, + "loss": 1.3069, + "step": 1271 + }, + { + "epoch": 0.2005331498839109, + "grad_norm": 0.9296875, + "learning_rate": 7.167498662642386e-05, + "loss": 1.2018, + "step": 1272 + }, + { + "epoch": 0.20069080173130394, + "grad_norm": 0.85546875, + "learning_rate": 7.16705714570229e-05, + "loss": 1.0384, + "step": 1273 + }, + { + "epoch": 0.20084845357869693, + "grad_norm": 0.91015625, + "learning_rate": 7.166615634766451e-05, + "loss": 1.0935, + "step": 1274 + }, + { + "epoch": 0.20100610542608996, + "grad_norm": 0.90234375, + "learning_rate": 7.166174129835812e-05, + "loss": 1.073, + "step": 1275 + }, + { + "epoch": 0.20116375727348296, + "grad_norm": 0.8984375, + "learning_rate": 7.1657326309113e-05, + "loss": 1.1069, + "step": 1276 + }, + { + "epoch": 0.20132140912087598, + "grad_norm": 0.921875, + "learning_rate": 7.165291137993858e-05, + "loss": 1.2457, + "step": 1277 + }, + { + "epoch": 0.20147906096826898, + "grad_norm": 0.85546875, + "learning_rate": 7.164849651084421e-05, + "loss": 1.1289, + "step": 1278 + }, + { + "epoch": 0.201636712815662, + "grad_norm": 0.8515625, + "learning_rate": 7.164408170183923e-05, + "loss": 0.9065, + "step": 1279 + }, + { + "epoch": 0.201794364663055, + "grad_norm": 0.9375, + "learning_rate": 7.1639666952933e-05, + "loss": 1.2445, + "step": 1280 + }, + { + "epoch": 0.20195201651044803, + "grad_norm": 0.85546875, + "learning_rate": 7.163525226413483e-05, + "loss": 0.938, + "step": 1281 + }, + { + "epoch": 0.20210966835784103, + "grad_norm": 0.9296875, + "learning_rate": 7.163083763545418e-05, + "loss": 1.0402, + "step": 1282 + }, + { + "epoch": 0.20226732020523405, + "grad_norm": 1.015625, + "learning_rate": 7.162642306690033e-05, + "loss": 1.1654, + "step": 1283 + }, + { + "epoch": 0.20242497205262705, + "grad_norm": 0.96484375, + "learning_rate": 7.162200855848264e-05, + "loss": 1.2361, + "step": 1284 + }, + { + "epoch": 0.20258262390002008, + "grad_norm": 0.9765625, + "learning_rate": 7.161759411021048e-05, + "loss": 1.1056, + "step": 1285 + }, + { + "epoch": 0.20274027574741307, + "grad_norm": 1.0078125, + "learning_rate": 7.16131797220932e-05, + "loss": 0.9872, + "step": 1286 + }, + { + "epoch": 0.2028979275948061, + "grad_norm": 0.9765625, + "learning_rate": 7.160876539414018e-05, + "loss": 1.2429, + "step": 1287 + }, + { + "epoch": 0.2030555794421991, + "grad_norm": 0.94140625, + "learning_rate": 7.160435112636078e-05, + "loss": 0.9785, + "step": 1288 + }, + { + "epoch": 0.20321323128959212, + "grad_norm": 1.3828125, + "learning_rate": 7.159993691876431e-05, + "loss": 1.296, + "step": 1289 + }, + { + "epoch": 0.20337088313698512, + "grad_norm": 0.9375, + "learning_rate": 7.159552277136016e-05, + "loss": 1.0469, + "step": 1290 + }, + { + "epoch": 0.20352853498437815, + "grad_norm": 0.9609375, + "learning_rate": 7.159110868415763e-05, + "loss": 1.2138, + "step": 1291 + }, + { + "epoch": 0.20368618683177114, + "grad_norm": 0.875, + "learning_rate": 7.158669465716617e-05, + "loss": 1.1343, + "step": 1292 + }, + { + "epoch": 0.20384383867916417, + "grad_norm": 1.0546875, + "learning_rate": 7.158228069039507e-05, + "loss": 1.0967, + "step": 1293 + }, + { + "epoch": 0.20400149052655717, + "grad_norm": 1.109375, + "learning_rate": 7.157786678385371e-05, + "loss": 1.2791, + "step": 1294 + }, + { + "epoch": 0.2041591423739502, + "grad_norm": 1.03125, + "learning_rate": 7.157345293755147e-05, + "loss": 1.3085, + "step": 1295 + }, + { + "epoch": 0.2043167942213432, + "grad_norm": 0.86328125, + "learning_rate": 7.15690391514976e-05, + "loss": 0.8861, + "step": 1296 + }, + { + "epoch": 0.20447444606873622, + "grad_norm": 0.9609375, + "learning_rate": 7.156462542570157e-05, + "loss": 1.1473, + "step": 1297 + }, + { + "epoch": 0.2046320979161292, + "grad_norm": 0.93359375, + "learning_rate": 7.15602117601727e-05, + "loss": 1.0772, + "step": 1298 + }, + { + "epoch": 0.20478974976352224, + "grad_norm": 0.89453125, + "learning_rate": 7.15557981549203e-05, + "loss": 1.0796, + "step": 1299 + }, + { + "epoch": 0.20494740161091524, + "grad_norm": 0.98828125, + "learning_rate": 7.15513846099538e-05, + "loss": 1.2913, + "step": 1300 + }, + { + "epoch": 0.20510505345830826, + "grad_norm": 0.95703125, + "learning_rate": 7.154697112528248e-05, + "loss": 1.536, + "step": 1301 + }, + { + "epoch": 0.20526270530570126, + "grad_norm": 0.89453125, + "learning_rate": 7.154255770091575e-05, + "loss": 0.998, + "step": 1302 + }, + { + "epoch": 0.20542035715309429, + "grad_norm": 0.875, + "learning_rate": 7.153814433686294e-05, + "loss": 1.1016, + "step": 1303 + }, + { + "epoch": 0.20557800900048728, + "grad_norm": 0.98046875, + "learning_rate": 7.15337310331334e-05, + "loss": 1.2312, + "step": 1304 + }, + { + "epoch": 0.2057356608478803, + "grad_norm": 1.0703125, + "learning_rate": 7.152931778973649e-05, + "loss": 1.0117, + "step": 1305 + }, + { + "epoch": 0.2058933126952733, + "grad_norm": 0.96484375, + "learning_rate": 7.152490460668156e-05, + "loss": 0.9469, + "step": 1306 + }, + { + "epoch": 0.20605096454266633, + "grad_norm": 0.91796875, + "learning_rate": 7.152049148397797e-05, + "loss": 1.236, + "step": 1307 + }, + { + "epoch": 0.20620861639005933, + "grad_norm": 0.87890625, + "learning_rate": 7.151607842163508e-05, + "loss": 1.1139, + "step": 1308 + }, + { + "epoch": 0.20636626823745235, + "grad_norm": 1.0859375, + "learning_rate": 7.151166541966223e-05, + "loss": 1.22, + "step": 1309 + }, + { + "epoch": 0.20652392008484535, + "grad_norm": 0.83203125, + "learning_rate": 7.150725247806877e-05, + "loss": 1.1642, + "step": 1310 + }, + { + "epoch": 0.20668157193223838, + "grad_norm": 1.046875, + "learning_rate": 7.150283959686403e-05, + "loss": 1.1524, + "step": 1311 + }, + { + "epoch": 0.20683922377963138, + "grad_norm": 1.03125, + "learning_rate": 7.149842677605743e-05, + "loss": 1.2956, + "step": 1312 + }, + { + "epoch": 0.2069968756270244, + "grad_norm": 0.96484375, + "learning_rate": 7.14940140156583e-05, + "loss": 0.9648, + "step": 1313 + }, + { + "epoch": 0.2071545274744174, + "grad_norm": 0.93359375, + "learning_rate": 7.148960131567597e-05, + "loss": 1.0749, + "step": 1314 + }, + { + "epoch": 0.20731217932181042, + "grad_norm": 1.0, + "learning_rate": 7.14851886761198e-05, + "loss": 1.2087, + "step": 1315 + }, + { + "epoch": 0.20746983116920342, + "grad_norm": 1.0, + "learning_rate": 7.148077609699909e-05, + "loss": 1.1315, + "step": 1316 + }, + { + "epoch": 0.20762748301659645, + "grad_norm": 0.87109375, + "learning_rate": 7.14763635783233e-05, + "loss": 0.9368, + "step": 1317 + }, + { + "epoch": 0.20778513486398945, + "grad_norm": 1.015625, + "learning_rate": 7.147195112010171e-05, + "loss": 1.1761, + "step": 1318 + }, + { + "epoch": 0.20794278671138247, + "grad_norm": 0.984375, + "learning_rate": 7.146753872234371e-05, + "loss": 1.1666, + "step": 1319 + }, + { + "epoch": 0.20810043855877547, + "grad_norm": 1.0078125, + "learning_rate": 7.14631263850586e-05, + "loss": 1.1527, + "step": 1320 + }, + { + "epoch": 0.2082580904061685, + "grad_norm": 0.84375, + "learning_rate": 7.145871410825577e-05, + "loss": 0.9459, + "step": 1321 + }, + { + "epoch": 0.2084157422535615, + "grad_norm": 0.99609375, + "learning_rate": 7.145430189194456e-05, + "loss": 1.2383, + "step": 1322 + }, + { + "epoch": 0.20857339410095452, + "grad_norm": 1.046875, + "learning_rate": 7.144988973613435e-05, + "loss": 1.3612, + "step": 1323 + }, + { + "epoch": 0.20873104594834752, + "grad_norm": 1.015625, + "learning_rate": 7.144547764083445e-05, + "loss": 1.2297, + "step": 1324 + }, + { + "epoch": 0.20888869779574054, + "grad_norm": 1.203125, + "learning_rate": 7.144106560605425e-05, + "loss": 1.3405, + "step": 1325 + }, + { + "epoch": 0.20904634964313354, + "grad_norm": 0.83984375, + "learning_rate": 7.143665363180305e-05, + "loss": 1.0089, + "step": 1326 + }, + { + "epoch": 0.20920400149052656, + "grad_norm": 0.90234375, + "learning_rate": 7.143224171809025e-05, + "loss": 0.9975, + "step": 1327 + }, + { + "epoch": 0.20936165333791956, + "grad_norm": 1.0078125, + "learning_rate": 7.142782986492517e-05, + "loss": 1.0678, + "step": 1328 + }, + { + "epoch": 0.2095193051853126, + "grad_norm": 1.40625, + "learning_rate": 7.142341807231719e-05, + "loss": 1.2242, + "step": 1329 + }, + { + "epoch": 0.20967695703270559, + "grad_norm": 0.92578125, + "learning_rate": 7.141900634027562e-05, + "loss": 1.1118, + "step": 1330 + }, + { + "epoch": 0.2098346088800986, + "grad_norm": 1.015625, + "learning_rate": 7.14145946688098e-05, + "loss": 1.1714, + "step": 1331 + }, + { + "epoch": 0.2099922607274916, + "grad_norm": 0.953125, + "learning_rate": 7.141018305792916e-05, + "loss": 1.093, + "step": 1332 + }, + { + "epoch": 0.21014991257488463, + "grad_norm": 0.91015625, + "learning_rate": 7.140577150764301e-05, + "loss": 1.0177, + "step": 1333 + }, + { + "epoch": 0.21030756442227763, + "grad_norm": 0.89453125, + "learning_rate": 7.140136001796068e-05, + "loss": 1.0075, + "step": 1334 + }, + { + "epoch": 0.21046521626967066, + "grad_norm": 0.9140625, + "learning_rate": 7.139694858889152e-05, + "loss": 1.0699, + "step": 1335 + }, + { + "epoch": 0.21062286811706366, + "grad_norm": 0.93359375, + "learning_rate": 7.139253722044487e-05, + "loss": 1.0777, + "step": 1336 + }, + { + "epoch": 0.21078051996445668, + "grad_norm": 1.0390625, + "learning_rate": 7.138812591263015e-05, + "loss": 1.2028, + "step": 1337 + }, + { + "epoch": 0.21093817181184968, + "grad_norm": 1.015625, + "learning_rate": 7.138371466545665e-05, + "loss": 1.0713, + "step": 1338 + }, + { + "epoch": 0.2110958236592427, + "grad_norm": 0.9375, + "learning_rate": 7.137930347893374e-05, + "loss": 1.0506, + "step": 1339 + }, + { + "epoch": 0.2112534755066357, + "grad_norm": 0.984375, + "learning_rate": 7.137489235307075e-05, + "loss": 1.0686, + "step": 1340 + }, + { + "epoch": 0.21141112735402873, + "grad_norm": 0.921875, + "learning_rate": 7.1370481287877e-05, + "loss": 1.1614, + "step": 1341 + }, + { + "epoch": 0.21156877920142173, + "grad_norm": 0.953125, + "learning_rate": 7.13660702833619e-05, + "loss": 1.1643, + "step": 1342 + }, + { + "epoch": 0.21172643104881475, + "grad_norm": 0.8984375, + "learning_rate": 7.136165933953482e-05, + "loss": 1.0648, + "step": 1343 + }, + { + "epoch": 0.21188408289620775, + "grad_norm": 0.81640625, + "learning_rate": 7.135724845640503e-05, + "loss": 0.9797, + "step": 1344 + }, + { + "epoch": 0.21204173474360077, + "grad_norm": 0.97265625, + "learning_rate": 7.135283763398194e-05, + "loss": 1.2917, + "step": 1345 + }, + { + "epoch": 0.21219938659099377, + "grad_norm": 0.9609375, + "learning_rate": 7.134842687227484e-05, + "loss": 1.0777, + "step": 1346 + }, + { + "epoch": 0.2123570384383868, + "grad_norm": 0.9140625, + "learning_rate": 7.134401617129313e-05, + "loss": 1.1141, + "step": 1347 + }, + { + "epoch": 0.2125146902857798, + "grad_norm": 1.0390625, + "learning_rate": 7.133960553104615e-05, + "loss": 1.2634, + "step": 1348 + }, + { + "epoch": 0.21267234213317282, + "grad_norm": 0.92578125, + "learning_rate": 7.133519495154324e-05, + "loss": 1.1475, + "step": 1349 + }, + { + "epoch": 0.21282999398056582, + "grad_norm": 0.81640625, + "learning_rate": 7.133078443279373e-05, + "loss": 1.137, + "step": 1350 + }, + { + "epoch": 0.21298764582795884, + "grad_norm": 1.0546875, + "learning_rate": 7.132637397480695e-05, + "loss": 1.1164, + "step": 1351 + }, + { + "epoch": 0.21314529767535184, + "grad_norm": 1.1015625, + "learning_rate": 7.132196357759233e-05, + "loss": 1.1759, + "step": 1352 + }, + { + "epoch": 0.21330294952274487, + "grad_norm": 0.9609375, + "learning_rate": 7.131755324115916e-05, + "loss": 1.0756, + "step": 1353 + }, + { + "epoch": 0.21346060137013786, + "grad_norm": 0.9921875, + "learning_rate": 7.13131429655168e-05, + "loss": 1.1905, + "step": 1354 + }, + { + "epoch": 0.2136182532175309, + "grad_norm": 0.97265625, + "learning_rate": 7.130873275067459e-05, + "loss": 1.2061, + "step": 1355 + }, + { + "epoch": 0.2137759050649239, + "grad_norm": 0.8515625, + "learning_rate": 7.130432259664184e-05, + "loss": 0.9919, + "step": 1356 + }, + { + "epoch": 0.2139335569123169, + "grad_norm": 0.953125, + "learning_rate": 7.129991250342799e-05, + "loss": 1.1732, + "step": 1357 + }, + { + "epoch": 0.2140912087597099, + "grad_norm": 0.953125, + "learning_rate": 7.129550247104233e-05, + "loss": 1.0984, + "step": 1358 + }, + { + "epoch": 0.21424886060710294, + "grad_norm": 1.75, + "learning_rate": 7.12910924994942e-05, + "loss": 0.9716, + "step": 1359 + }, + { + "epoch": 0.21440651245449593, + "grad_norm": 0.93359375, + "learning_rate": 7.128668258879296e-05, + "loss": 1.0291, + "step": 1360 + }, + { + "epoch": 0.21456416430188896, + "grad_norm": 1.0390625, + "learning_rate": 7.128227273894798e-05, + "loss": 1.2808, + "step": 1361 + }, + { + "epoch": 0.21472181614928196, + "grad_norm": 0.98828125, + "learning_rate": 7.127786294996852e-05, + "loss": 1.4473, + "step": 1362 + }, + { + "epoch": 0.21487946799667498, + "grad_norm": 0.984375, + "learning_rate": 7.127345322186403e-05, + "loss": 1.139, + "step": 1363 + }, + { + "epoch": 0.21503711984406798, + "grad_norm": 0.85546875, + "learning_rate": 7.126904355464382e-05, + "loss": 1.0423, + "step": 1364 + }, + { + "epoch": 0.215194771691461, + "grad_norm": 0.88671875, + "learning_rate": 7.126463394831722e-05, + "loss": 1.0168, + "step": 1365 + }, + { + "epoch": 0.215352423538854, + "grad_norm": 0.9140625, + "learning_rate": 7.12602244028936e-05, + "loss": 1.052, + "step": 1366 + }, + { + "epoch": 0.21551007538624703, + "grad_norm": 1.0078125, + "learning_rate": 7.125581491838224e-05, + "loss": 1.2174, + "step": 1367 + }, + { + "epoch": 0.21566772723364003, + "grad_norm": 0.859375, + "learning_rate": 7.125140549479258e-05, + "loss": 0.9669, + "step": 1368 + }, + { + "epoch": 0.21582537908103305, + "grad_norm": 1.171875, + "learning_rate": 7.124699613213393e-05, + "loss": 1.1151, + "step": 1369 + }, + { + "epoch": 0.21598303092842605, + "grad_norm": 0.9609375, + "learning_rate": 7.124258683041562e-05, + "loss": 1.1197, + "step": 1370 + }, + { + "epoch": 0.21614068277581908, + "grad_norm": 0.9453125, + "learning_rate": 7.123817758964699e-05, + "loss": 1.0519, + "step": 1371 + }, + { + "epoch": 0.21629833462321207, + "grad_norm": 0.93359375, + "learning_rate": 7.12337684098374e-05, + "loss": 1.1533, + "step": 1372 + }, + { + "epoch": 0.2164559864706051, + "grad_norm": 1.0078125, + "learning_rate": 7.122935929099621e-05, + "loss": 1.1204, + "step": 1373 + }, + { + "epoch": 0.2166136383179981, + "grad_norm": 0.85546875, + "learning_rate": 7.122495023313273e-05, + "loss": 0.8712, + "step": 1374 + }, + { + "epoch": 0.21677129016539112, + "grad_norm": 0.9140625, + "learning_rate": 7.122054123625635e-05, + "loss": 1.0061, + "step": 1375 + }, + { + "epoch": 0.21692894201278412, + "grad_norm": 1.1796875, + "learning_rate": 7.121613230037639e-05, + "loss": 1.438, + "step": 1376 + }, + { + "epoch": 0.21708659386017715, + "grad_norm": 0.9453125, + "learning_rate": 7.121172342550213e-05, + "loss": 1.2149, + "step": 1377 + }, + { + "epoch": 0.21724424570757014, + "grad_norm": 0.97265625, + "learning_rate": 7.120731461164302e-05, + "loss": 1.1075, + "step": 1378 + }, + { + "epoch": 0.21740189755496317, + "grad_norm": 0.9296875, + "learning_rate": 7.120290585880837e-05, + "loss": 1.4147, + "step": 1379 + }, + { + "epoch": 0.21755954940235617, + "grad_norm": 0.98046875, + "learning_rate": 7.119849716700752e-05, + "loss": 1.0446, + "step": 1380 + }, + { + "epoch": 0.2177172012497492, + "grad_norm": 0.890625, + "learning_rate": 7.119408853624981e-05, + "loss": 0.8742, + "step": 1381 + }, + { + "epoch": 0.2178748530971422, + "grad_norm": 1.109375, + "learning_rate": 7.118967996654452e-05, + "loss": 1.2054, + "step": 1382 + }, + { + "epoch": 0.21803250494453522, + "grad_norm": 0.78125, + "learning_rate": 7.118527145790111e-05, + "loss": 0.9694, + "step": 1383 + }, + { + "epoch": 0.2181901567919282, + "grad_norm": 0.94140625, + "learning_rate": 7.118086301032887e-05, + "loss": 1.0086, + "step": 1384 + }, + { + "epoch": 0.21834780863932124, + "grad_norm": 0.9375, + "learning_rate": 7.117645462383715e-05, + "loss": 1.3487, + "step": 1385 + }, + { + "epoch": 0.21850546048671424, + "grad_norm": 0.97265625, + "learning_rate": 7.117204629843528e-05, + "loss": 0.8584, + "step": 1386 + }, + { + "epoch": 0.21866311233410726, + "grad_norm": 1.0625, + "learning_rate": 7.116763803413259e-05, + "loss": 1.0109, + "step": 1387 + }, + { + "epoch": 0.21882076418150026, + "grad_norm": 1.078125, + "learning_rate": 7.116322983093846e-05, + "loss": 1.1321, + "step": 1388 + }, + { + "epoch": 0.21897841602889329, + "grad_norm": 0.9453125, + "learning_rate": 7.115882168886221e-05, + "loss": 1.1851, + "step": 1389 + }, + { + "epoch": 0.21913606787628628, + "grad_norm": 0.921875, + "learning_rate": 7.11544136079132e-05, + "loss": 1.0456, + "step": 1390 + }, + { + "epoch": 0.2192937197236793, + "grad_norm": 1.015625, + "learning_rate": 7.115000558810076e-05, + "loss": 1.1628, + "step": 1391 + }, + { + "epoch": 0.2194513715710723, + "grad_norm": 1.1640625, + "learning_rate": 7.114559762943422e-05, + "loss": 1.1918, + "step": 1392 + }, + { + "epoch": 0.21960902341846533, + "grad_norm": 1.0, + "learning_rate": 7.114118973192294e-05, + "loss": 1.1242, + "step": 1393 + }, + { + "epoch": 0.21976667526585833, + "grad_norm": 0.91796875, + "learning_rate": 7.113678189557627e-05, + "loss": 1.1061, + "step": 1394 + }, + { + "epoch": 0.21992432711325136, + "grad_norm": 1.03125, + "learning_rate": 7.113237412040354e-05, + "loss": 1.1322, + "step": 1395 + }, + { + "epoch": 0.22008197896064435, + "grad_norm": 0.98828125, + "learning_rate": 7.112796640641411e-05, + "loss": 1.0374, + "step": 1396 + }, + { + "epoch": 0.22023963080803738, + "grad_norm": 1.0546875, + "learning_rate": 7.112355875361724e-05, + "loss": 1.098, + "step": 1397 + }, + { + "epoch": 0.22039728265543038, + "grad_norm": 0.9921875, + "learning_rate": 7.111915116202239e-05, + "loss": 1.013, + "step": 1398 + }, + { + "epoch": 0.2205549345028234, + "grad_norm": 0.96875, + "learning_rate": 7.111474363163885e-05, + "loss": 1.2095, + "step": 1399 + }, + { + "epoch": 0.2207125863502164, + "grad_norm": 1.0390625, + "learning_rate": 7.111033616247595e-05, + "loss": 1.2058, + "step": 1400 + }, + { + "epoch": 0.22087023819760943, + "grad_norm": 0.9609375, + "learning_rate": 7.110592875454304e-05, + "loss": 1.361, + "step": 1401 + }, + { + "epoch": 0.22102789004500242, + "grad_norm": 0.8828125, + "learning_rate": 7.110152140784944e-05, + "loss": 1.3022, + "step": 1402 + }, + { + "epoch": 0.22118554189239545, + "grad_norm": 0.87109375, + "learning_rate": 7.109711412240453e-05, + "loss": 0.9766, + "step": 1403 + }, + { + "epoch": 0.22134319373978845, + "grad_norm": 1.03125, + "learning_rate": 7.109270689821766e-05, + "loss": 1.1768, + "step": 1404 + }, + { + "epoch": 0.22150084558718147, + "grad_norm": 1.140625, + "learning_rate": 7.108829973529814e-05, + "loss": 1.2395, + "step": 1405 + }, + { + "epoch": 0.22165849743457447, + "grad_norm": 0.94140625, + "learning_rate": 7.10838926336553e-05, + "loss": 0.9981, + "step": 1406 + }, + { + "epoch": 0.2218161492819675, + "grad_norm": 0.8671875, + "learning_rate": 7.107948559329848e-05, + "loss": 1.196, + "step": 1407 + }, + { + "epoch": 0.22197380112936052, + "grad_norm": 1.109375, + "learning_rate": 7.107507861423706e-05, + "loss": 1.3497, + "step": 1408 + }, + { + "epoch": 0.22213145297675352, + "grad_norm": 1.1015625, + "learning_rate": 7.107067169648035e-05, + "loss": 1.2475, + "step": 1409 + }, + { + "epoch": 0.22228910482414654, + "grad_norm": 1.0390625, + "learning_rate": 7.10662648400377e-05, + "loss": 1.0338, + "step": 1410 + }, + { + "epoch": 0.22244675667153954, + "grad_norm": 1.09375, + "learning_rate": 7.106185804491847e-05, + "loss": 1.2372, + "step": 1411 + }, + { + "epoch": 0.22260440851893257, + "grad_norm": 1.046875, + "learning_rate": 7.105745131113193e-05, + "loss": 1.4149, + "step": 1412 + }, + { + "epoch": 0.22276206036632556, + "grad_norm": 0.86328125, + "learning_rate": 7.105304463868752e-05, + "loss": 1.0651, + "step": 1413 + }, + { + "epoch": 0.2229197122137186, + "grad_norm": 0.90234375, + "learning_rate": 7.104863802759449e-05, + "loss": 0.9611, + "step": 1414 + }, + { + "epoch": 0.2230773640611116, + "grad_norm": 1.0234375, + "learning_rate": 7.104423147786226e-05, + "loss": 1.0465, + "step": 1415 + }, + { + "epoch": 0.2232350159085046, + "grad_norm": 1.0078125, + "learning_rate": 7.103982498950009e-05, + "loss": 1.0557, + "step": 1416 + }, + { + "epoch": 0.2233926677558976, + "grad_norm": 0.92578125, + "learning_rate": 7.103541856251731e-05, + "loss": 1.1902, + "step": 1417 + }, + { + "epoch": 0.22355031960329064, + "grad_norm": 0.9375, + "learning_rate": 7.103101219692338e-05, + "loss": 0.9738, + "step": 1418 + }, + { + "epoch": 0.22370797145068363, + "grad_norm": 0.8984375, + "learning_rate": 7.102660589272754e-05, + "loss": 1.0793, + "step": 1419 + }, + { + "epoch": 0.22386562329807666, + "grad_norm": 0.96484375, + "learning_rate": 7.102219964993917e-05, + "loss": 1.1476, + "step": 1420 + }, + { + "epoch": 0.22402327514546966, + "grad_norm": 0.86328125, + "learning_rate": 7.101779346856758e-05, + "loss": 0.9305, + "step": 1421 + }, + { + "epoch": 0.22418092699286268, + "grad_norm": 1.0390625, + "learning_rate": 7.101338734862208e-05, + "loss": 1.1201, + "step": 1422 + }, + { + "epoch": 0.22433857884025568, + "grad_norm": 0.90625, + "learning_rate": 7.100898129011208e-05, + "loss": 1.3119, + "step": 1423 + }, + { + "epoch": 0.2244962306876487, + "grad_norm": 1.015625, + "learning_rate": 7.100457529304691e-05, + "loss": 1.1416, + "step": 1424 + }, + { + "epoch": 0.2246538825350417, + "grad_norm": 0.9296875, + "learning_rate": 7.100016935743588e-05, + "loss": 1.1263, + "step": 1425 + }, + { + "epoch": 0.22481153438243473, + "grad_norm": 1.0078125, + "learning_rate": 7.099576348328832e-05, + "loss": 1.0232, + "step": 1426 + }, + { + "epoch": 0.22496918622982773, + "grad_norm": 1.015625, + "learning_rate": 7.099135767061354e-05, + "loss": 1.3169, + "step": 1427 + }, + { + "epoch": 0.22512683807722075, + "grad_norm": 1.0078125, + "learning_rate": 7.098695191942097e-05, + "loss": 1.2153, + "step": 1428 + }, + { + "epoch": 0.22528448992461375, + "grad_norm": 1.046875, + "learning_rate": 7.09825462297199e-05, + "loss": 1.1726, + "step": 1429 + }, + { + "epoch": 0.22544214177200678, + "grad_norm": 0.97265625, + "learning_rate": 7.097814060151965e-05, + "loss": 1.1591, + "step": 1430 + }, + { + "epoch": 0.22559979361939977, + "grad_norm": 0.9140625, + "learning_rate": 7.097373503482958e-05, + "loss": 1.1141, + "step": 1431 + }, + { + "epoch": 0.2257574454667928, + "grad_norm": 1.015625, + "learning_rate": 7.0969329529659e-05, + "loss": 1.1505, + "step": 1432 + }, + { + "epoch": 0.2259150973141858, + "grad_norm": 0.98828125, + "learning_rate": 7.09649240860173e-05, + "loss": 1.296, + "step": 1433 + }, + { + "epoch": 0.22607274916157882, + "grad_norm": 0.96484375, + "learning_rate": 7.096051870391376e-05, + "loss": 1.0001, + "step": 1434 + }, + { + "epoch": 0.22623040100897182, + "grad_norm": 0.95703125, + "learning_rate": 7.095611338335776e-05, + "loss": 1.1247, + "step": 1435 + }, + { + "epoch": 0.22638805285636485, + "grad_norm": 0.90234375, + "learning_rate": 7.095170812435861e-05, + "loss": 0.9733, + "step": 1436 + }, + { + "epoch": 0.22654570470375784, + "grad_norm": 0.95703125, + "learning_rate": 7.09473029269256e-05, + "loss": 1.0558, + "step": 1437 + }, + { + "epoch": 0.22670335655115087, + "grad_norm": 0.890625, + "learning_rate": 7.094289779106819e-05, + "loss": 1.037, + "step": 1438 + }, + { + "epoch": 0.22686100839854387, + "grad_norm": 1.0390625, + "learning_rate": 7.093849271679562e-05, + "loss": 1.0728, + "step": 1439 + }, + { + "epoch": 0.2270186602459369, + "grad_norm": 0.94140625, + "learning_rate": 7.093408770411727e-05, + "loss": 1.103, + "step": 1440 + }, + { + "epoch": 0.2271763120933299, + "grad_norm": 0.94140625, + "learning_rate": 7.092968275304245e-05, + "loss": 1.1255, + "step": 1441 + }, + { + "epoch": 0.22733396394072292, + "grad_norm": 1.0546875, + "learning_rate": 7.092527786358047e-05, + "loss": 1.0659, + "step": 1442 + }, + { + "epoch": 0.22749161578811591, + "grad_norm": 0.96875, + "learning_rate": 7.092087303574075e-05, + "loss": 1.1646, + "step": 1443 + }, + { + "epoch": 0.22764926763550894, + "grad_norm": 0.9609375, + "learning_rate": 7.091646826953258e-05, + "loss": 1.2669, + "step": 1444 + }, + { + "epoch": 0.22780691948290194, + "grad_norm": 0.93359375, + "learning_rate": 7.091206356496526e-05, + "loss": 1.1622, + "step": 1445 + }, + { + "epoch": 0.22796457133029496, + "grad_norm": 0.90625, + "learning_rate": 7.090765892204821e-05, + "loss": 0.9643, + "step": 1446 + }, + { + "epoch": 0.22812222317768796, + "grad_norm": 0.96875, + "learning_rate": 7.090325434079064e-05, + "loss": 1.1371, + "step": 1447 + }, + { + "epoch": 0.22827987502508099, + "grad_norm": 0.8671875, + "learning_rate": 7.0898849821202e-05, + "loss": 1.0543, + "step": 1448 + }, + { + "epoch": 0.22843752687247398, + "grad_norm": 1.109375, + "learning_rate": 7.089444536329159e-05, + "loss": 1.2672, + "step": 1449 + }, + { + "epoch": 0.228595178719867, + "grad_norm": 0.8359375, + "learning_rate": 7.089004096706873e-05, + "loss": 1.0028, + "step": 1450 + }, + { + "epoch": 0.22875283056726, + "grad_norm": 1.0078125, + "learning_rate": 7.088563663254278e-05, + "loss": 1.1057, + "step": 1451 + }, + { + "epoch": 0.22891048241465303, + "grad_norm": 1.015625, + "learning_rate": 7.088123235972304e-05, + "loss": 1.1907, + "step": 1452 + }, + { + "epoch": 0.22906813426204603, + "grad_norm": 0.91796875, + "learning_rate": 7.087682814861887e-05, + "loss": 1.253, + "step": 1453 + }, + { + "epoch": 0.22922578610943906, + "grad_norm": 0.96875, + "learning_rate": 7.087242399923962e-05, + "loss": 1.0156, + "step": 1454 + }, + { + "epoch": 0.22938343795683205, + "grad_norm": 0.90625, + "learning_rate": 7.086801991159458e-05, + "loss": 0.9941, + "step": 1455 + }, + { + "epoch": 0.22954108980422508, + "grad_norm": 0.96484375, + "learning_rate": 7.086361588569309e-05, + "loss": 1.1429, + "step": 1456 + }, + { + "epoch": 0.22969874165161808, + "grad_norm": 1.3359375, + "learning_rate": 7.08592119215445e-05, + "loss": 1.159, + "step": 1457 + }, + { + "epoch": 0.2298563934990111, + "grad_norm": 1.1796875, + "learning_rate": 7.085480801915818e-05, + "loss": 1.1011, + "step": 1458 + }, + { + "epoch": 0.2300140453464041, + "grad_norm": 1.0546875, + "learning_rate": 7.08504041785434e-05, + "loss": 1.2951, + "step": 1459 + }, + { + "epoch": 0.23017169719379713, + "grad_norm": 0.8828125, + "learning_rate": 7.084600039970955e-05, + "loss": 1.028, + "step": 1460 + }, + { + "epoch": 0.23032934904119012, + "grad_norm": 0.92578125, + "learning_rate": 7.084159668266592e-05, + "loss": 0.9699, + "step": 1461 + }, + { + "epoch": 0.23048700088858315, + "grad_norm": 0.9921875, + "learning_rate": 7.083719302742183e-05, + "loss": 1.0949, + "step": 1462 + }, + { + "epoch": 0.23064465273597615, + "grad_norm": 1.5859375, + "learning_rate": 7.083278943398668e-05, + "loss": 1.2474, + "step": 1463 + }, + { + "epoch": 0.23080230458336917, + "grad_norm": 0.9375, + "learning_rate": 7.082838590236976e-05, + "loss": 0.9996, + "step": 1464 + }, + { + "epoch": 0.23095995643076217, + "grad_norm": 0.96484375, + "learning_rate": 7.082398243258042e-05, + "loss": 1.127, + "step": 1465 + }, + { + "epoch": 0.2311176082781552, + "grad_norm": 1.0546875, + "learning_rate": 7.081957902462797e-05, + "loss": 1.2925, + "step": 1466 + }, + { + "epoch": 0.2312752601255482, + "grad_norm": 0.98046875, + "learning_rate": 7.081517567852172e-05, + "loss": 1.0787, + "step": 1467 + }, + { + "epoch": 0.23143291197294122, + "grad_norm": 1.0, + "learning_rate": 7.081077239427107e-05, + "loss": 0.9864, + "step": 1468 + }, + { + "epoch": 0.23159056382033422, + "grad_norm": 1.03125, + "learning_rate": 7.080636917188532e-05, + "loss": 1.1679, + "step": 1469 + }, + { + "epoch": 0.23174821566772724, + "grad_norm": 0.953125, + "learning_rate": 7.080196601137381e-05, + "loss": 0.9883, + "step": 1470 + }, + { + "epoch": 0.23190586751512024, + "grad_norm": 0.93359375, + "learning_rate": 7.079756291274587e-05, + "loss": 1.1307, + "step": 1471 + }, + { + "epoch": 0.23206351936251327, + "grad_norm": 0.98046875, + "learning_rate": 7.07931598760108e-05, + "loss": 1.2378, + "step": 1472 + }, + { + "epoch": 0.23222117120990626, + "grad_norm": 1.0078125, + "learning_rate": 7.078875690117797e-05, + "loss": 1.2546, + "step": 1473 + }, + { + "epoch": 0.2323788230572993, + "grad_norm": 0.9921875, + "learning_rate": 7.078435398825671e-05, + "loss": 1.1365, + "step": 1474 + }, + { + "epoch": 0.2325364749046923, + "grad_norm": 0.9765625, + "learning_rate": 7.077995113725631e-05, + "loss": 1.1301, + "step": 1475 + }, + { + "epoch": 0.2326941267520853, + "grad_norm": 0.9375, + "learning_rate": 7.077554834818617e-05, + "loss": 0.978, + "step": 1476 + }, + { + "epoch": 0.2328517785994783, + "grad_norm": 1.0078125, + "learning_rate": 7.077114562105556e-05, + "loss": 1.2358, + "step": 1477 + }, + { + "epoch": 0.23300943044687134, + "grad_norm": 0.8125, + "learning_rate": 7.076674295587384e-05, + "loss": 0.9415, + "step": 1478 + }, + { + "epoch": 0.23316708229426433, + "grad_norm": 0.9296875, + "learning_rate": 7.076234035265034e-05, + "loss": 1.0254, + "step": 1479 + }, + { + "epoch": 0.23332473414165736, + "grad_norm": 0.98828125, + "learning_rate": 7.075793781139442e-05, + "loss": 1.31, + "step": 1480 + }, + { + "epoch": 0.23348238598905036, + "grad_norm": 1.0078125, + "learning_rate": 7.075353533211535e-05, + "loss": 1.305, + "step": 1481 + }, + { + "epoch": 0.23364003783644338, + "grad_norm": 0.95703125, + "learning_rate": 7.074913291482246e-05, + "loss": 0.9372, + "step": 1482 + }, + { + "epoch": 0.23379768968383638, + "grad_norm": 0.96875, + "learning_rate": 7.074473055952515e-05, + "loss": 1.1031, + "step": 1483 + }, + { + "epoch": 0.2339553415312294, + "grad_norm": 1.03125, + "learning_rate": 7.074032826623271e-05, + "loss": 1.4406, + "step": 1484 + }, + { + "epoch": 0.2341129933786224, + "grad_norm": 0.97265625, + "learning_rate": 7.073592603495447e-05, + "loss": 1.1805, + "step": 1485 + }, + { + "epoch": 0.23427064522601543, + "grad_norm": 0.921875, + "learning_rate": 7.073152386569976e-05, + "loss": 0.9847, + "step": 1486 + }, + { + "epoch": 0.23442829707340843, + "grad_norm": 0.984375, + "learning_rate": 7.072712175847787e-05, + "loss": 0.9604, + "step": 1487 + }, + { + "epoch": 0.23458594892080145, + "grad_norm": 1.1640625, + "learning_rate": 7.072271971329823e-05, + "loss": 1.0034, + "step": 1488 + }, + { + "epoch": 0.23474360076819445, + "grad_norm": 0.78125, + "learning_rate": 7.071831773017009e-05, + "loss": 0.9594, + "step": 1489 + }, + { + "epoch": 0.23490125261558747, + "grad_norm": 0.921875, + "learning_rate": 7.071391580910281e-05, + "loss": 1.1016, + "step": 1490 + }, + { + "epoch": 0.23505890446298047, + "grad_norm": 0.96484375, + "learning_rate": 7.070951395010572e-05, + "loss": 1.108, + "step": 1491 + }, + { + "epoch": 0.2352165563103735, + "grad_norm": 0.99609375, + "learning_rate": 7.070511215318811e-05, + "loss": 1.004, + "step": 1492 + }, + { + "epoch": 0.2353742081577665, + "grad_norm": 0.89453125, + "learning_rate": 7.070071041835935e-05, + "loss": 1.1389, + "step": 1493 + }, + { + "epoch": 0.23553186000515952, + "grad_norm": 0.93359375, + "learning_rate": 7.069630874562877e-05, + "loss": 1.0214, + "step": 1494 + }, + { + "epoch": 0.23568951185255252, + "grad_norm": 0.9140625, + "learning_rate": 7.06919071350057e-05, + "loss": 1.138, + "step": 1495 + }, + { + "epoch": 0.23584716369994554, + "grad_norm": 0.98828125, + "learning_rate": 7.068750558649945e-05, + "loss": 1.3268, + "step": 1496 + }, + { + "epoch": 0.23600481554733854, + "grad_norm": 0.86328125, + "learning_rate": 7.068310410011934e-05, + "loss": 0.9273, + "step": 1497 + }, + { + "epoch": 0.23616246739473157, + "grad_norm": 1.7421875, + "learning_rate": 7.067870267587472e-05, + "loss": 1.3118, + "step": 1498 + }, + { + "epoch": 0.23632011924212457, + "grad_norm": 0.91015625, + "learning_rate": 7.067430131377494e-05, + "loss": 1.1032, + "step": 1499 + }, + { + "epoch": 0.2364777710895176, + "grad_norm": 0.953125, + "learning_rate": 7.066990001382928e-05, + "loss": 1.144, + "step": 1500 + }, + { + "epoch": 0.2366354229369106, + "grad_norm": 1.0, + "learning_rate": 7.066549877604712e-05, + "loss": 1.2402, + "step": 1501 + }, + { + "epoch": 0.23679307478430361, + "grad_norm": 1.265625, + "learning_rate": 7.066109760043768e-05, + "loss": 0.9757, + "step": 1502 + }, + { + "epoch": 0.2369507266316966, + "grad_norm": 0.80078125, + "learning_rate": 7.065669648701044e-05, + "loss": 0.9279, + "step": 1503 + }, + { + "epoch": 0.23710837847908964, + "grad_norm": 1.0546875, + "learning_rate": 7.065229543577463e-05, + "loss": 1.1724, + "step": 1504 + }, + { + "epoch": 0.23726603032648264, + "grad_norm": 1.0234375, + "learning_rate": 7.06478944467396e-05, + "loss": 1.3237, + "step": 1505 + }, + { + "epoch": 0.23742368217387566, + "grad_norm": 1.046875, + "learning_rate": 7.064349351991471e-05, + "loss": 1.1968, + "step": 1506 + }, + { + "epoch": 0.23758133402126866, + "grad_norm": 0.8984375, + "learning_rate": 7.063909265530923e-05, + "loss": 1.1441, + "step": 1507 + }, + { + "epoch": 0.23773898586866168, + "grad_norm": 1.0703125, + "learning_rate": 7.063469185293248e-05, + "loss": 1.1328, + "step": 1508 + }, + { + "epoch": 0.23789663771605468, + "grad_norm": 0.94921875, + "learning_rate": 7.063029111279387e-05, + "loss": 1.0879, + "step": 1509 + }, + { + "epoch": 0.2380542895634477, + "grad_norm": 0.7890625, + "learning_rate": 7.062589043490267e-05, + "loss": 0.8795, + "step": 1510 + }, + { + "epoch": 0.2382119414108407, + "grad_norm": 0.98828125, + "learning_rate": 7.062148981926822e-05, + "loss": 1.3077, + "step": 1511 + }, + { + "epoch": 0.23836959325823373, + "grad_norm": 1.7421875, + "learning_rate": 7.061708926589985e-05, + "loss": 1.1431, + "step": 1512 + }, + { + "epoch": 0.23852724510562673, + "grad_norm": 1.0859375, + "learning_rate": 7.061268877480684e-05, + "loss": 1.0349, + "step": 1513 + }, + { + "epoch": 0.23868489695301975, + "grad_norm": 0.9453125, + "learning_rate": 7.060828834599858e-05, + "loss": 1.0356, + "step": 1514 + }, + { + "epoch": 0.23884254880041275, + "grad_norm": 1.0234375, + "learning_rate": 7.060388797948438e-05, + "loss": 1.0894, + "step": 1515 + }, + { + "epoch": 0.23900020064780578, + "grad_norm": 0.89453125, + "learning_rate": 7.059948767527358e-05, + "loss": 0.9824, + "step": 1516 + }, + { + "epoch": 0.23915785249519877, + "grad_norm": 1.3515625, + "learning_rate": 7.059508743337547e-05, + "loss": 0.9462, + "step": 1517 + }, + { + "epoch": 0.2393155043425918, + "grad_norm": 1.0390625, + "learning_rate": 7.059068725379936e-05, + "loss": 1.3671, + "step": 1518 + }, + { + "epoch": 0.2394731561899848, + "grad_norm": 0.98828125, + "learning_rate": 7.058628713655464e-05, + "loss": 1.109, + "step": 1519 + }, + { + "epoch": 0.23963080803737782, + "grad_norm": 0.9375, + "learning_rate": 7.058188708165062e-05, + "loss": 1.0372, + "step": 1520 + }, + { + "epoch": 0.23978845988477082, + "grad_norm": 0.98046875, + "learning_rate": 7.057748708909657e-05, + "loss": 1.2781, + "step": 1521 + }, + { + "epoch": 0.23994611173216385, + "grad_norm": 1.1484375, + "learning_rate": 7.057308715890187e-05, + "loss": 1.0074, + "step": 1522 + }, + { + "epoch": 0.24010376357955684, + "grad_norm": 0.90625, + "learning_rate": 7.05686872910758e-05, + "loss": 0.9642, + "step": 1523 + }, + { + "epoch": 0.24026141542694987, + "grad_norm": 1.046875, + "learning_rate": 7.056428748562776e-05, + "loss": 1.1751, + "step": 1524 + }, + { + "epoch": 0.24041906727434287, + "grad_norm": 0.97265625, + "learning_rate": 7.055988774256701e-05, + "loss": 1.114, + "step": 1525 + }, + { + "epoch": 0.2405767191217359, + "grad_norm": 0.9296875, + "learning_rate": 7.055548806190291e-05, + "loss": 1.2158, + "step": 1526 + }, + { + "epoch": 0.2407343709691289, + "grad_norm": 0.98828125, + "learning_rate": 7.055108844364476e-05, + "loss": 1.1238, + "step": 1527 + }, + { + "epoch": 0.24089202281652192, + "grad_norm": 0.890625, + "learning_rate": 7.054668888780186e-05, + "loss": 1.1261, + "step": 1528 + }, + { + "epoch": 0.24104967466391491, + "grad_norm": 0.95703125, + "learning_rate": 7.054228939438361e-05, + "loss": 1.0729, + "step": 1529 + }, + { + "epoch": 0.24120732651130794, + "grad_norm": 0.9921875, + "learning_rate": 7.05378899633993e-05, + "loss": 1.1375, + "step": 1530 + }, + { + "epoch": 0.24136497835870094, + "grad_norm": 1.0234375, + "learning_rate": 7.053349059485823e-05, + "loss": 1.2235, + "step": 1531 + }, + { + "epoch": 0.24152263020609396, + "grad_norm": 0.92578125, + "learning_rate": 7.052909128876976e-05, + "loss": 1.2142, + "step": 1532 + }, + { + "epoch": 0.24168028205348696, + "grad_norm": 0.8671875, + "learning_rate": 7.052469204514315e-05, + "loss": 1.1479, + "step": 1533 + }, + { + "epoch": 0.24183793390088, + "grad_norm": 0.98046875, + "learning_rate": 7.05202928639878e-05, + "loss": 1.0628, + "step": 1534 + }, + { + "epoch": 0.24199558574827298, + "grad_norm": 1.0703125, + "learning_rate": 7.051589374531303e-05, + "loss": 1.2245, + "step": 1535 + }, + { + "epoch": 0.242153237595666, + "grad_norm": 1.03125, + "learning_rate": 7.051149468912812e-05, + "loss": 1.1376, + "step": 1536 + }, + { + "epoch": 0.242310889443059, + "grad_norm": 0.8828125, + "learning_rate": 7.050709569544241e-05, + "loss": 0.8926, + "step": 1537 + }, + { + "epoch": 0.24246854129045203, + "grad_norm": 1.0703125, + "learning_rate": 7.05026967642652e-05, + "loss": 1.2994, + "step": 1538 + }, + { + "epoch": 0.24262619313784503, + "grad_norm": 1.046875, + "learning_rate": 7.049829789560586e-05, + "loss": 1.3028, + "step": 1539 + }, + { + "epoch": 0.24278384498523806, + "grad_norm": 1.0546875, + "learning_rate": 7.049389908947372e-05, + "loss": 1.1088, + "step": 1540 + }, + { + "epoch": 0.24294149683263105, + "grad_norm": 0.91015625, + "learning_rate": 7.048950034587805e-05, + "loss": 1.1674, + "step": 1541 + }, + { + "epoch": 0.24309914868002408, + "grad_norm": 0.81640625, + "learning_rate": 7.048510166482818e-05, + "loss": 0.9752, + "step": 1542 + }, + { + "epoch": 0.24325680052741708, + "grad_norm": 1.0703125, + "learning_rate": 7.048070304633345e-05, + "loss": 1.2117, + "step": 1543 + }, + { + "epoch": 0.2434144523748101, + "grad_norm": 0.8203125, + "learning_rate": 7.047630449040321e-05, + "loss": 0.9562, + "step": 1544 + }, + { + "epoch": 0.2435721042222031, + "grad_norm": 0.91015625, + "learning_rate": 7.047190599704674e-05, + "loss": 1.0712, + "step": 1545 + }, + { + "epoch": 0.24372975606959613, + "grad_norm": 0.99609375, + "learning_rate": 7.046750756627338e-05, + "loss": 1.1116, + "step": 1546 + }, + { + "epoch": 0.24388740791698912, + "grad_norm": 0.96875, + "learning_rate": 7.046310919809247e-05, + "loss": 1.0757, + "step": 1547 + }, + { + "epoch": 0.24404505976438215, + "grad_norm": 1.0078125, + "learning_rate": 7.045871089251325e-05, + "loss": 1.0926, + "step": 1548 + }, + { + "epoch": 0.24420271161177515, + "grad_norm": 1.046875, + "learning_rate": 7.045431264954516e-05, + "loss": 1.0518, + "step": 1549 + }, + { + "epoch": 0.24436036345916817, + "grad_norm": 0.91796875, + "learning_rate": 7.044991446919744e-05, + "loss": 1.1166, + "step": 1550 + }, + { + "epoch": 0.24451801530656117, + "grad_norm": 0.98828125, + "learning_rate": 7.044551635147947e-05, + "loss": 1.2052, + "step": 1551 + }, + { + "epoch": 0.2446756671539542, + "grad_norm": 1.0703125, + "learning_rate": 7.044111829640052e-05, + "loss": 1.1918, + "step": 1552 + }, + { + "epoch": 0.2448333190013472, + "grad_norm": 0.8359375, + "learning_rate": 7.043672030396989e-05, + "loss": 0.8933, + "step": 1553 + }, + { + "epoch": 0.24499097084874022, + "grad_norm": 0.9296875, + "learning_rate": 7.043232237419699e-05, + "loss": 1.0459, + "step": 1554 + }, + { + "epoch": 0.24514862269613322, + "grad_norm": 1.015625, + "learning_rate": 7.042792450709108e-05, + "loss": 1.2032, + "step": 1555 + }, + { + "epoch": 0.24530627454352624, + "grad_norm": 0.9296875, + "learning_rate": 7.04235267026615e-05, + "loss": 1.0806, + "step": 1556 + }, + { + "epoch": 0.24546392639091924, + "grad_norm": 0.95703125, + "learning_rate": 7.041912896091757e-05, + "loss": 1.0679, + "step": 1557 + }, + { + "epoch": 0.24562157823831227, + "grad_norm": 1.0390625, + "learning_rate": 7.041473128186858e-05, + "loss": 1.0776, + "step": 1558 + }, + { + "epoch": 0.24577923008570526, + "grad_norm": 1.1171875, + "learning_rate": 7.041033366552389e-05, + "loss": 1.0946, + "step": 1559 + }, + { + "epoch": 0.2459368819330983, + "grad_norm": 0.88671875, + "learning_rate": 7.04059361118928e-05, + "loss": 0.9819, + "step": 1560 + }, + { + "epoch": 0.2460945337804913, + "grad_norm": 0.8359375, + "learning_rate": 7.040153862098465e-05, + "loss": 1.0056, + "step": 1561 + }, + { + "epoch": 0.2462521856278843, + "grad_norm": 0.94140625, + "learning_rate": 7.039714119280876e-05, + "loss": 1.0712, + "step": 1562 + }, + { + "epoch": 0.2464098374752773, + "grad_norm": 1.0390625, + "learning_rate": 7.03927438273744e-05, + "loss": 1.0964, + "step": 1563 + }, + { + "epoch": 0.24656748932267034, + "grad_norm": 1.03125, + "learning_rate": 7.038834652469094e-05, + "loss": 1.0572, + "step": 1564 + }, + { + "epoch": 0.24672514117006336, + "grad_norm": 0.9453125, + "learning_rate": 7.03839492847677e-05, + "loss": 1.1886, + "step": 1565 + }, + { + "epoch": 0.24688279301745636, + "grad_norm": 0.9296875, + "learning_rate": 7.0379552107614e-05, + "loss": 1.0563, + "step": 1566 + }, + { + "epoch": 0.24704044486484938, + "grad_norm": 1.015625, + "learning_rate": 7.037515499323913e-05, + "loss": 1.186, + "step": 1567 + }, + { + "epoch": 0.24719809671224238, + "grad_norm": 0.90625, + "learning_rate": 7.037075794165237e-05, + "loss": 1.0678, + "step": 1568 + }, + { + "epoch": 0.2473557485596354, + "grad_norm": 1.0, + "learning_rate": 7.036636095286316e-05, + "loss": 1.117, + "step": 1569 + }, + { + "epoch": 0.2475134004070284, + "grad_norm": 0.90625, + "learning_rate": 7.036196402688076e-05, + "loss": 1.1243, + "step": 1570 + }, + { + "epoch": 0.24767105225442143, + "grad_norm": 1.09375, + "learning_rate": 7.035756716371446e-05, + "loss": 1.042, + "step": 1571 + }, + { + "epoch": 0.24782870410181443, + "grad_norm": 1.375, + "learning_rate": 7.03531703633736e-05, + "loss": 1.0608, + "step": 1572 + }, + { + "epoch": 0.24798635594920745, + "grad_norm": 0.8984375, + "learning_rate": 7.03487736258675e-05, + "loss": 1.1136, + "step": 1573 + }, + { + "epoch": 0.24814400779660045, + "grad_norm": 0.9609375, + "learning_rate": 7.034437695120548e-05, + "loss": 1.0519, + "step": 1574 + }, + { + "epoch": 0.24830165964399348, + "grad_norm": 0.9375, + "learning_rate": 7.033998033939687e-05, + "loss": 1.17, + "step": 1575 + }, + { + "epoch": 0.24845931149138648, + "grad_norm": 0.93359375, + "learning_rate": 7.033558379045098e-05, + "loss": 0.9513, + "step": 1576 + }, + { + "epoch": 0.2486169633387795, + "grad_norm": 0.96484375, + "learning_rate": 7.033118730437713e-05, + "loss": 0.9956, + "step": 1577 + }, + { + "epoch": 0.2487746151861725, + "grad_norm": 0.96484375, + "learning_rate": 7.03267908811846e-05, + "loss": 1.1155, + "step": 1578 + }, + { + "epoch": 0.24893226703356552, + "grad_norm": 0.80859375, + "learning_rate": 7.032239452088274e-05, + "loss": 0.9468, + "step": 1579 + }, + { + "epoch": 0.24908991888095852, + "grad_norm": 0.90234375, + "learning_rate": 7.03179982234809e-05, + "loss": 0.9682, + "step": 1580 + }, + { + "epoch": 0.24924757072835155, + "grad_norm": 0.921875, + "learning_rate": 7.031360198898835e-05, + "loss": 1.0741, + "step": 1581 + }, + { + "epoch": 0.24940522257574455, + "grad_norm": 0.98046875, + "learning_rate": 7.030920581741444e-05, + "loss": 1.07, + "step": 1582 + }, + { + "epoch": 0.24956287442313757, + "grad_norm": 0.97265625, + "learning_rate": 7.030480970876846e-05, + "loss": 0.909, + "step": 1583 + }, + { + "epoch": 0.24972052627053057, + "grad_norm": 0.9921875, + "learning_rate": 7.030041366305973e-05, + "loss": 1.2718, + "step": 1584 + }, + { + "epoch": 0.2498781781179236, + "grad_norm": 1.5234375, + "learning_rate": 7.02960176802976e-05, + "loss": 0.9924, + "step": 1585 + }, + { + "epoch": 0.2500358299653166, + "grad_norm": 0.84375, + "learning_rate": 7.029162176049134e-05, + "loss": 1.1488, + "step": 1586 + }, + { + "epoch": 0.2501934818127096, + "grad_norm": 0.92578125, + "learning_rate": 7.02872259036503e-05, + "loss": 1.1985, + "step": 1587 + }, + { + "epoch": 0.25035113366010264, + "grad_norm": 1.0546875, + "learning_rate": 7.028283010978376e-05, + "loss": 1.1041, + "step": 1588 + }, + { + "epoch": 0.2505087855074956, + "grad_norm": 1.0234375, + "learning_rate": 7.027843437890109e-05, + "loss": 0.9797, + "step": 1589 + }, + { + "epoch": 0.25066643735488864, + "grad_norm": 0.96875, + "learning_rate": 7.027403871101157e-05, + "loss": 1.0096, + "step": 1590 + }, + { + "epoch": 0.25082408920228166, + "grad_norm": 0.89453125, + "learning_rate": 7.026964310612453e-05, + "loss": 1.1291, + "step": 1591 + }, + { + "epoch": 0.2509817410496747, + "grad_norm": 0.84765625, + "learning_rate": 7.02652475642493e-05, + "loss": 0.9851, + "step": 1592 + }, + { + "epoch": 0.25113939289706766, + "grad_norm": 1.0390625, + "learning_rate": 7.026085208539513e-05, + "loss": 1.0247, + "step": 1593 + }, + { + "epoch": 0.2512970447444607, + "grad_norm": 0.91015625, + "learning_rate": 7.025645666957141e-05, + "loss": 1.1735, + "step": 1594 + }, + { + "epoch": 0.2514546965918537, + "grad_norm": 0.95703125, + "learning_rate": 7.025206131678745e-05, + "loss": 1.1215, + "step": 1595 + }, + { + "epoch": 0.25161234843924674, + "grad_norm": 0.9609375, + "learning_rate": 7.024766602705254e-05, + "loss": 1.0568, + "step": 1596 + }, + { + "epoch": 0.2517700002866397, + "grad_norm": 1.078125, + "learning_rate": 7.024327080037599e-05, + "loss": 1.3967, + "step": 1597 + }, + { + "epoch": 0.25192765213403273, + "grad_norm": 0.97265625, + "learning_rate": 7.02388756367671e-05, + "loss": 1.0681, + "step": 1598 + }, + { + "epoch": 0.25208530398142576, + "grad_norm": 0.99609375, + "learning_rate": 7.023448053623525e-05, + "loss": 1.0843, + "step": 1599 + }, + { + "epoch": 0.2522429558288188, + "grad_norm": 0.921875, + "learning_rate": 7.023008549878971e-05, + "loss": 1.0127, + "step": 1600 + }, + { + "epoch": 0.25240060767621175, + "grad_norm": 1.1171875, + "learning_rate": 7.022569052443982e-05, + "loss": 1.1032, + "step": 1601 + }, + { + "epoch": 0.2525582595236048, + "grad_norm": 0.94921875, + "learning_rate": 7.022129561319486e-05, + "loss": 1.2235, + "step": 1602 + }, + { + "epoch": 0.2527159113709978, + "grad_norm": 1.96875, + "learning_rate": 7.021690076506413e-05, + "loss": 1.1295, + "step": 1603 + }, + { + "epoch": 0.25287356321839083, + "grad_norm": 0.8984375, + "learning_rate": 7.021250598005702e-05, + "loss": 1.021, + "step": 1604 + }, + { + "epoch": 0.2530312150657838, + "grad_norm": 0.99609375, + "learning_rate": 7.020811125818279e-05, + "loss": 1.2344, + "step": 1605 + }, + { + "epoch": 0.2531888669131768, + "grad_norm": 0.9375, + "learning_rate": 7.020371659945078e-05, + "loss": 1.1612, + "step": 1606 + }, + { + "epoch": 0.25334651876056985, + "grad_norm": 1.03125, + "learning_rate": 7.019932200387027e-05, + "loss": 1.1417, + "step": 1607 + }, + { + "epoch": 0.2535041706079629, + "grad_norm": 1.0625, + "learning_rate": 7.019492747145055e-05, + "loss": 1.2071, + "step": 1608 + }, + { + "epoch": 0.25366182245535585, + "grad_norm": 0.921875, + "learning_rate": 7.019053300220104e-05, + "loss": 1.0778, + "step": 1609 + }, + { + "epoch": 0.25381947430274887, + "grad_norm": 0.921875, + "learning_rate": 7.018613859613097e-05, + "loss": 1.1426, + "step": 1610 + }, + { + "epoch": 0.2539771261501419, + "grad_norm": 0.890625, + "learning_rate": 7.01817442532497e-05, + "loss": 1.1963, + "step": 1611 + }, + { + "epoch": 0.2541347779975349, + "grad_norm": 0.94921875, + "learning_rate": 7.017734997356651e-05, + "loss": 1.1617, + "step": 1612 + }, + { + "epoch": 0.2542924298449279, + "grad_norm": 1.0078125, + "learning_rate": 7.017295575709066e-05, + "loss": 1.0453, + "step": 1613 + }, + { + "epoch": 0.2544500816923209, + "grad_norm": 0.85546875, + "learning_rate": 7.016856160383158e-05, + "loss": 1.0537, + "step": 1614 + }, + { + "epoch": 0.25460773353971394, + "grad_norm": 0.8359375, + "learning_rate": 7.016416751379854e-05, + "loss": 0.8849, + "step": 1615 + }, + { + "epoch": 0.25476538538710697, + "grad_norm": 0.90234375, + "learning_rate": 7.015977348700084e-05, + "loss": 1.0803, + "step": 1616 + }, + { + "epoch": 0.25492303723449994, + "grad_norm": 1.015625, + "learning_rate": 7.015537952344778e-05, + "loss": 1.0938, + "step": 1617 + }, + { + "epoch": 0.25508068908189296, + "grad_norm": 0.88671875, + "learning_rate": 7.015098562314866e-05, + "loss": 0.8956, + "step": 1618 + }, + { + "epoch": 0.255238340929286, + "grad_norm": 0.96875, + "learning_rate": 7.014659178611285e-05, + "loss": 0.9498, + "step": 1619 + }, + { + "epoch": 0.255395992776679, + "grad_norm": 1.0078125, + "learning_rate": 7.014219801234963e-05, + "loss": 1.3111, + "step": 1620 + }, + { + "epoch": 0.255553644624072, + "grad_norm": 0.92578125, + "learning_rate": 7.013780430186832e-05, + "loss": 1.4495, + "step": 1621 + }, + { + "epoch": 0.255711296471465, + "grad_norm": 0.96484375, + "learning_rate": 7.013341065467823e-05, + "loss": 1.1225, + "step": 1622 + }, + { + "epoch": 0.25586894831885804, + "grad_norm": 1.921875, + "learning_rate": 7.012901707078865e-05, + "loss": 1.1353, + "step": 1623 + }, + { + "epoch": 0.25602660016625106, + "grad_norm": 0.8984375, + "learning_rate": 7.012462355020893e-05, + "loss": 0.9615, + "step": 1624 + }, + { + "epoch": 0.25618425201364403, + "grad_norm": 1.171875, + "learning_rate": 7.012023009294836e-05, + "loss": 1.2999, + "step": 1625 + }, + { + "epoch": 0.25634190386103706, + "grad_norm": 0.921875, + "learning_rate": 7.011583669901625e-05, + "loss": 1.1103, + "step": 1626 + }, + { + "epoch": 0.2564995557084301, + "grad_norm": 0.9453125, + "learning_rate": 7.01114433684219e-05, + "loss": 1.0375, + "step": 1627 + }, + { + "epoch": 0.2566572075558231, + "grad_norm": 0.92578125, + "learning_rate": 7.010705010117464e-05, + "loss": 1.0802, + "step": 1628 + }, + { + "epoch": 0.2568148594032161, + "grad_norm": 0.859375, + "learning_rate": 7.01026568972838e-05, + "loss": 0.9149, + "step": 1629 + }, + { + "epoch": 0.2569725112506091, + "grad_norm": 0.89453125, + "learning_rate": 7.009826375675868e-05, + "loss": 1.1372, + "step": 1630 + }, + { + "epoch": 0.25713016309800213, + "grad_norm": 0.98046875, + "learning_rate": 7.009387067960855e-05, + "loss": 1.3063, + "step": 1631 + }, + { + "epoch": 0.25728781494539515, + "grad_norm": 0.9453125, + "learning_rate": 7.008947766584278e-05, + "loss": 1.2457, + "step": 1632 + }, + { + "epoch": 0.2574454667927881, + "grad_norm": 0.96484375, + "learning_rate": 7.008508471547059e-05, + "loss": 1.0863, + "step": 1633 + }, + { + "epoch": 0.25760311864018115, + "grad_norm": 0.984375, + "learning_rate": 7.008069182850141e-05, + "loss": 1.3042, + "step": 1634 + }, + { + "epoch": 0.2577607704875742, + "grad_norm": 0.8828125, + "learning_rate": 7.00762990049445e-05, + "loss": 0.8884, + "step": 1635 + }, + { + "epoch": 0.2579184223349672, + "grad_norm": 0.94921875, + "learning_rate": 7.007190624480915e-05, + "loss": 1.2682, + "step": 1636 + }, + { + "epoch": 0.25807607418236017, + "grad_norm": 1.0078125, + "learning_rate": 7.006751354810468e-05, + "loss": 1.0564, + "step": 1637 + }, + { + "epoch": 0.2582337260297532, + "grad_norm": 1.015625, + "learning_rate": 7.006312091484038e-05, + "loss": 1.2266, + "step": 1638 + }, + { + "epoch": 0.2583913778771462, + "grad_norm": 0.94921875, + "learning_rate": 7.005872834502562e-05, + "loss": 1.0456, + "step": 1639 + }, + { + "epoch": 0.25854902972453925, + "grad_norm": 0.9765625, + "learning_rate": 7.005433583866966e-05, + "loss": 1.0957, + "step": 1640 + }, + { + "epoch": 0.2587066815719322, + "grad_norm": 0.9609375, + "learning_rate": 7.004994339578184e-05, + "loss": 0.8837, + "step": 1641 + }, + { + "epoch": 0.25886433341932524, + "grad_norm": 0.8125, + "learning_rate": 7.004555101637144e-05, + "loss": 0.964, + "step": 1642 + }, + { + "epoch": 0.25902198526671827, + "grad_norm": 1.0078125, + "learning_rate": 7.004115870044776e-05, + "loss": 1.0471, + "step": 1643 + }, + { + "epoch": 0.2591796371141113, + "grad_norm": 0.9921875, + "learning_rate": 7.003676644802017e-05, + "loss": 1.0464, + "step": 1644 + }, + { + "epoch": 0.25933728896150426, + "grad_norm": 1.0234375, + "learning_rate": 7.003237425909794e-05, + "loss": 1.153, + "step": 1645 + }, + { + "epoch": 0.2594949408088973, + "grad_norm": 0.9140625, + "learning_rate": 7.002798213369035e-05, + "loss": 0.9293, + "step": 1646 + }, + { + "epoch": 0.2596525926562903, + "grad_norm": 0.8671875, + "learning_rate": 7.002359007180675e-05, + "loss": 1.0631, + "step": 1647 + }, + { + "epoch": 0.25981024450368334, + "grad_norm": 1.0234375, + "learning_rate": 7.001919807345645e-05, + "loss": 1.1788, + "step": 1648 + }, + { + "epoch": 0.2599678963510763, + "grad_norm": 1.0625, + "learning_rate": 7.001480613864872e-05, + "loss": 1.2348, + "step": 1649 + }, + { + "epoch": 0.26012554819846934, + "grad_norm": 0.9453125, + "learning_rate": 7.001041426739292e-05, + "loss": 1.2481, + "step": 1650 + }, + { + "epoch": 0.26028320004586236, + "grad_norm": 0.84765625, + "learning_rate": 7.000602245969833e-05, + "loss": 0.9925, + "step": 1651 + }, + { + "epoch": 0.2604408518932554, + "grad_norm": 0.90625, + "learning_rate": 7.000163071557426e-05, + "loss": 0.9351, + "step": 1652 + }, + { + "epoch": 0.26059850374064836, + "grad_norm": 2.28125, + "learning_rate": 6.999723903503002e-05, + "loss": 1.1705, + "step": 1653 + }, + { + "epoch": 0.2607561555880414, + "grad_norm": 1.015625, + "learning_rate": 6.999284741807488e-05, + "loss": 1.2346, + "step": 1654 + }, + { + "epoch": 0.2609138074354344, + "grad_norm": 0.94140625, + "learning_rate": 6.998845586471823e-05, + "loss": 1.1277, + "step": 1655 + }, + { + "epoch": 0.26107145928282743, + "grad_norm": 1.046875, + "learning_rate": 6.998406437496932e-05, + "loss": 1.3113, + "step": 1656 + }, + { + "epoch": 0.2612291111302204, + "grad_norm": 0.92578125, + "learning_rate": 6.997967294883747e-05, + "loss": 1.0489, + "step": 1657 + }, + { + "epoch": 0.26138676297761343, + "grad_norm": 1.078125, + "learning_rate": 6.9975281586332e-05, + "loss": 0.9751, + "step": 1658 + }, + { + "epoch": 0.26154441482500645, + "grad_norm": 0.9375, + "learning_rate": 6.997089028746216e-05, + "loss": 0.9875, + "step": 1659 + }, + { + "epoch": 0.2617020666723995, + "grad_norm": 0.80859375, + "learning_rate": 6.996649905223733e-05, + "loss": 0.7756, + "step": 1660 + }, + { + "epoch": 0.26185971851979245, + "grad_norm": 1.84375, + "learning_rate": 6.99621078806668e-05, + "loss": 1.2189, + "step": 1661 + }, + { + "epoch": 0.2620173703671855, + "grad_norm": 0.93359375, + "learning_rate": 6.995771677275986e-05, + "loss": 1.0652, + "step": 1662 + }, + { + "epoch": 0.2621750222145785, + "grad_norm": 0.87890625, + "learning_rate": 6.995332572852583e-05, + "loss": 1.1876, + "step": 1663 + }, + { + "epoch": 0.2623326740619715, + "grad_norm": 0.83203125, + "learning_rate": 6.994893474797396e-05, + "loss": 0.9399, + "step": 1664 + }, + { + "epoch": 0.2624903259093645, + "grad_norm": 0.875, + "learning_rate": 6.994454383111365e-05, + "loss": 1.1369, + "step": 1665 + }, + { + "epoch": 0.2626479777567575, + "grad_norm": 0.9296875, + "learning_rate": 6.994015297795415e-05, + "loss": 0.9936, + "step": 1666 + }, + { + "epoch": 0.26280562960415055, + "grad_norm": 0.96875, + "learning_rate": 6.993576218850479e-05, + "loss": 1.0441, + "step": 1667 + }, + { + "epoch": 0.2629632814515436, + "grad_norm": 1.0, + "learning_rate": 6.993137146277487e-05, + "loss": 1.0963, + "step": 1668 + }, + { + "epoch": 0.26312093329893654, + "grad_norm": 1.7421875, + "learning_rate": 6.992698080077367e-05, + "loss": 1.2195, + "step": 1669 + }, + { + "epoch": 0.26327858514632957, + "grad_norm": 0.84375, + "learning_rate": 6.992259020251052e-05, + "loss": 0.9091, + "step": 1670 + }, + { + "epoch": 0.2634362369937226, + "grad_norm": 0.85546875, + "learning_rate": 6.991819966799473e-05, + "loss": 0.8029, + "step": 1671 + }, + { + "epoch": 0.2635938888411156, + "grad_norm": 0.9140625, + "learning_rate": 6.991380919723559e-05, + "loss": 0.9312, + "step": 1672 + }, + { + "epoch": 0.2637515406885086, + "grad_norm": 1.0078125, + "learning_rate": 6.990941879024242e-05, + "loss": 1.1331, + "step": 1673 + }, + { + "epoch": 0.2639091925359016, + "grad_norm": 0.94140625, + "learning_rate": 6.990502844702447e-05, + "loss": 1.0675, + "step": 1674 + }, + { + "epoch": 0.26406684438329464, + "grad_norm": 1.0078125, + "learning_rate": 6.990063816759114e-05, + "loss": 1.2706, + "step": 1675 + }, + { + "epoch": 0.26422449623068767, + "grad_norm": 0.95703125, + "learning_rate": 6.989624795195166e-05, + "loss": 1.063, + "step": 1676 + }, + { + "epoch": 0.26438214807808064, + "grad_norm": 0.91796875, + "learning_rate": 6.98918578001154e-05, + "loss": 1.0199, + "step": 1677 + }, + { + "epoch": 0.26453979992547366, + "grad_norm": 0.87109375, + "learning_rate": 6.988746771209161e-05, + "loss": 0.9699, + "step": 1678 + }, + { + "epoch": 0.2646974517728667, + "grad_norm": 0.88671875, + "learning_rate": 6.988307768788956e-05, + "loss": 1.2557, + "step": 1679 + }, + { + "epoch": 0.2648551036202597, + "grad_norm": 0.97265625, + "learning_rate": 6.987868772751866e-05, + "loss": 1.5269, + "step": 1680 + }, + { + "epoch": 0.2650127554676527, + "grad_norm": 0.94140625, + "learning_rate": 6.987429783098816e-05, + "loss": 0.8982, + "step": 1681 + }, + { + "epoch": 0.2651704073150457, + "grad_norm": 0.99609375, + "learning_rate": 6.986990799830737e-05, + "loss": 1.0767, + "step": 1682 + }, + { + "epoch": 0.26532805916243873, + "grad_norm": 0.90234375, + "learning_rate": 6.986551822948557e-05, + "loss": 0.8804, + "step": 1683 + }, + { + "epoch": 0.26548571100983176, + "grad_norm": 0.81640625, + "learning_rate": 6.986112852453204e-05, + "loss": 0.9225, + "step": 1684 + }, + { + "epoch": 0.26564336285722473, + "grad_norm": 1.03125, + "learning_rate": 6.985673888345619e-05, + "loss": 1.1509, + "step": 1685 + }, + { + "epoch": 0.26580101470461776, + "grad_norm": 0.921875, + "learning_rate": 6.985234930626723e-05, + "loss": 1.1187, + "step": 1686 + }, + { + "epoch": 0.2659586665520108, + "grad_norm": 0.8359375, + "learning_rate": 6.984795979297452e-05, + "loss": 0.9949, + "step": 1687 + }, + { + "epoch": 0.2661163183994038, + "grad_norm": 0.98828125, + "learning_rate": 6.98435703435873e-05, + "loss": 1.1532, + "step": 1688 + }, + { + "epoch": 0.2662739702467968, + "grad_norm": 0.84375, + "learning_rate": 6.983918095811493e-05, + "loss": 0.9482, + "step": 1689 + }, + { + "epoch": 0.2664316220941898, + "grad_norm": 0.8515625, + "learning_rate": 6.983479163656669e-05, + "loss": 1.1119, + "step": 1690 + }, + { + "epoch": 0.2665892739415828, + "grad_norm": 1.2265625, + "learning_rate": 6.983040237895188e-05, + "loss": 1.2842, + "step": 1691 + }, + { + "epoch": 0.26674692578897585, + "grad_norm": 0.875, + "learning_rate": 6.982601318527982e-05, + "loss": 0.9848, + "step": 1692 + }, + { + "epoch": 0.2669045776363688, + "grad_norm": 0.8515625, + "learning_rate": 6.98216240555598e-05, + "loss": 1.0954, + "step": 1693 + }, + { + "epoch": 0.26706222948376185, + "grad_norm": 0.91015625, + "learning_rate": 6.981723498980107e-05, + "loss": 1.1908, + "step": 1694 + }, + { + "epoch": 0.2672198813311549, + "grad_norm": 0.9609375, + "learning_rate": 6.981284598801303e-05, + "loss": 1.0665, + "step": 1695 + }, + { + "epoch": 0.2673775331785479, + "grad_norm": 0.95703125, + "learning_rate": 6.980845705020495e-05, + "loss": 1.1982, + "step": 1696 + }, + { + "epoch": 0.26753518502594087, + "grad_norm": 0.90234375, + "learning_rate": 6.980406817638611e-05, + "loss": 1.3166, + "step": 1697 + }, + { + "epoch": 0.2676928368733339, + "grad_norm": 0.8984375, + "learning_rate": 6.979967936656582e-05, + "loss": 0.9869, + "step": 1698 + }, + { + "epoch": 0.2678504887207269, + "grad_norm": 0.96484375, + "learning_rate": 6.979529062075335e-05, + "loss": 1.0196, + "step": 1699 + }, + { + "epoch": 0.26800814056811995, + "grad_norm": 0.97265625, + "learning_rate": 6.979090193895807e-05, + "loss": 1.1166, + "step": 1700 + }, + { + "epoch": 0.2681657924155129, + "grad_norm": 0.87109375, + "learning_rate": 6.978651332118925e-05, + "loss": 0.9091, + "step": 1701 + }, + { + "epoch": 0.26832344426290594, + "grad_norm": 1.0234375, + "learning_rate": 6.978212476745619e-05, + "loss": 1.1728, + "step": 1702 + }, + { + "epoch": 0.26848109611029897, + "grad_norm": 0.9921875, + "learning_rate": 6.977773627776818e-05, + "loss": 1.1748, + "step": 1703 + }, + { + "epoch": 0.268638747957692, + "grad_norm": 0.89453125, + "learning_rate": 6.977334785213449e-05, + "loss": 1.1023, + "step": 1704 + }, + { + "epoch": 0.26879639980508496, + "grad_norm": 1.015625, + "learning_rate": 6.976895949056453e-05, + "loss": 1.1224, + "step": 1705 + }, + { + "epoch": 0.268954051652478, + "grad_norm": 0.90234375, + "learning_rate": 6.97645711930675e-05, + "loss": 0.9684, + "step": 1706 + }, + { + "epoch": 0.269111703499871, + "grad_norm": 0.96875, + "learning_rate": 6.976018295965274e-05, + "loss": 1.0414, + "step": 1707 + }, + { + "epoch": 0.26926935534726404, + "grad_norm": 0.84765625, + "learning_rate": 6.975579479032955e-05, + "loss": 0.8566, + "step": 1708 + }, + { + "epoch": 0.269427007194657, + "grad_norm": 0.95703125, + "learning_rate": 6.97514066851072e-05, + "loss": 1.1728, + "step": 1709 + }, + { + "epoch": 0.26958465904205003, + "grad_norm": 0.92578125, + "learning_rate": 6.974701864399505e-05, + "loss": 1.0108, + "step": 1710 + }, + { + "epoch": 0.26974231088944306, + "grad_norm": 1.0625, + "learning_rate": 6.974263066700234e-05, + "loss": 1.156, + "step": 1711 + }, + { + "epoch": 0.2698999627368361, + "grad_norm": 0.92578125, + "learning_rate": 6.973824275413838e-05, + "loss": 1.1015, + "step": 1712 + }, + { + "epoch": 0.27005761458422906, + "grad_norm": 0.796875, + "learning_rate": 6.973385490541251e-05, + "loss": 0.9523, + "step": 1713 + }, + { + "epoch": 0.2702152664316221, + "grad_norm": 0.82421875, + "learning_rate": 6.9729467120834e-05, + "loss": 0.9399, + "step": 1714 + }, + { + "epoch": 0.2703729182790151, + "grad_norm": 0.921875, + "learning_rate": 6.972507940041215e-05, + "loss": 1.2716, + "step": 1715 + }, + { + "epoch": 0.27053057012640813, + "grad_norm": 0.921875, + "learning_rate": 6.972069174415628e-05, + "loss": 1.1105, + "step": 1716 + }, + { + "epoch": 0.2706882219738011, + "grad_norm": 0.984375, + "learning_rate": 6.971630415207566e-05, + "loss": 1.2103, + "step": 1717 + }, + { + "epoch": 0.2708458738211941, + "grad_norm": 1.0234375, + "learning_rate": 6.971191662417962e-05, + "loss": 1.243, + "step": 1718 + }, + { + "epoch": 0.27100352566858715, + "grad_norm": 0.8984375, + "learning_rate": 6.970752916047739e-05, + "loss": 1.0349, + "step": 1719 + }, + { + "epoch": 0.2711611775159802, + "grad_norm": 0.96875, + "learning_rate": 6.970314176097836e-05, + "loss": 1.0999, + "step": 1720 + }, + { + "epoch": 0.2713188293633732, + "grad_norm": 0.91796875, + "learning_rate": 6.969875442569178e-05, + "loss": 0.9182, + "step": 1721 + }, + { + "epoch": 0.2714764812107662, + "grad_norm": 1.3671875, + "learning_rate": 6.969436715462697e-05, + "loss": 1.1229, + "step": 1722 + }, + { + "epoch": 0.2716341330581592, + "grad_norm": 0.9765625, + "learning_rate": 6.968997994779322e-05, + "loss": 1.2143, + "step": 1723 + }, + { + "epoch": 0.2717917849055522, + "grad_norm": 0.94140625, + "learning_rate": 6.968559280519978e-05, + "loss": 1.1386, + "step": 1724 + }, + { + "epoch": 0.27194943675294525, + "grad_norm": 1.0390625, + "learning_rate": 6.968120572685604e-05, + "loss": 1.0527, + "step": 1725 + }, + { + "epoch": 0.2721070886003382, + "grad_norm": 1.125, + "learning_rate": 6.967681871277123e-05, + "loss": 1.3038, + "step": 1726 + }, + { + "epoch": 0.27226474044773125, + "grad_norm": 0.90234375, + "learning_rate": 6.967243176295469e-05, + "loss": 1.0945, + "step": 1727 + }, + { + "epoch": 0.27242239229512427, + "grad_norm": 1.0546875, + "learning_rate": 6.966804487741569e-05, + "loss": 1.1197, + "step": 1728 + }, + { + "epoch": 0.2725800441425173, + "grad_norm": 0.99609375, + "learning_rate": 6.966365805616352e-05, + "loss": 1.0122, + "step": 1729 + }, + { + "epoch": 0.27273769598991027, + "grad_norm": 1.0, + "learning_rate": 6.965927129920751e-05, + "loss": 1.1578, + "step": 1730 + }, + { + "epoch": 0.2728953478373033, + "grad_norm": 0.94921875, + "learning_rate": 6.965488460655692e-05, + "loss": 1.0093, + "step": 1731 + }, + { + "epoch": 0.2730529996846963, + "grad_norm": 0.94921875, + "learning_rate": 6.965049797822109e-05, + "loss": 1.1155, + "step": 1732 + }, + { + "epoch": 0.27321065153208934, + "grad_norm": 0.9296875, + "learning_rate": 6.96461114142093e-05, + "loss": 1.2728, + "step": 1733 + }, + { + "epoch": 0.2733683033794823, + "grad_norm": 0.87890625, + "learning_rate": 6.964172491453081e-05, + "loss": 1.176, + "step": 1734 + }, + { + "epoch": 0.27352595522687534, + "grad_norm": 0.9609375, + "learning_rate": 6.963733847919496e-05, + "loss": 1.1942, + "step": 1735 + }, + { + "epoch": 0.27368360707426836, + "grad_norm": 0.9140625, + "learning_rate": 6.963295210821105e-05, + "loss": 0.9473, + "step": 1736 + }, + { + "epoch": 0.2738412589216614, + "grad_norm": 1.0546875, + "learning_rate": 6.962856580158837e-05, + "loss": 1.1784, + "step": 1737 + }, + { + "epoch": 0.27399891076905436, + "grad_norm": 0.9453125, + "learning_rate": 6.96241795593362e-05, + "loss": 1.0329, + "step": 1738 + }, + { + "epoch": 0.2741565626164474, + "grad_norm": 1.0234375, + "learning_rate": 6.961979338146381e-05, + "loss": 1.215, + "step": 1739 + }, + { + "epoch": 0.2743142144638404, + "grad_norm": 0.90234375, + "learning_rate": 6.961540726798056e-05, + "loss": 0.8672, + "step": 1740 + }, + { + "epoch": 0.27447186631123344, + "grad_norm": 1.0078125, + "learning_rate": 6.961102121889572e-05, + "loss": 1.1347, + "step": 1741 + }, + { + "epoch": 0.2746295181586264, + "grad_norm": 0.9453125, + "learning_rate": 6.96066352342186e-05, + "loss": 1.1355, + "step": 1742 + }, + { + "epoch": 0.27478717000601943, + "grad_norm": 0.83203125, + "learning_rate": 6.960224931395846e-05, + "loss": 0.9219, + "step": 1743 + }, + { + "epoch": 0.27494482185341246, + "grad_norm": 1.0078125, + "learning_rate": 6.959786345812459e-05, + "loss": 1.187, + "step": 1744 + }, + { + "epoch": 0.2751024737008055, + "grad_norm": 0.94140625, + "learning_rate": 6.959347766672633e-05, + "loss": 1.3087, + "step": 1745 + }, + { + "epoch": 0.27526012554819845, + "grad_norm": 0.90625, + "learning_rate": 6.958909193977297e-05, + "loss": 1.0897, + "step": 1746 + }, + { + "epoch": 0.2754177773955915, + "grad_norm": 1.0625, + "learning_rate": 6.95847062772738e-05, + "loss": 0.9982, + "step": 1747 + }, + { + "epoch": 0.2755754292429845, + "grad_norm": 0.83203125, + "learning_rate": 6.95803206792381e-05, + "loss": 1.1167, + "step": 1748 + }, + { + "epoch": 0.27573308109037753, + "grad_norm": 0.91015625, + "learning_rate": 6.957593514567514e-05, + "loss": 0.9965, + "step": 1749 + }, + { + "epoch": 0.2758907329377705, + "grad_norm": 0.99609375, + "learning_rate": 6.957154967659426e-05, + "loss": 1.2468, + "step": 1750 + }, + { + "epoch": 0.2760483847851635, + "grad_norm": 0.9765625, + "learning_rate": 6.956716427200476e-05, + "loss": 0.9735, + "step": 1751 + }, + { + "epoch": 0.27620603663255655, + "grad_norm": 0.9453125, + "learning_rate": 6.956277893191591e-05, + "loss": 0.9906, + "step": 1752 + }, + { + "epoch": 0.2763636884799496, + "grad_norm": 0.921875, + "learning_rate": 6.955839365633701e-05, + "loss": 1.0699, + "step": 1753 + }, + { + "epoch": 0.27652134032734255, + "grad_norm": 0.9765625, + "learning_rate": 6.955400844527735e-05, + "loss": 1.1844, + "step": 1754 + }, + { + "epoch": 0.27667899217473557, + "grad_norm": 0.94140625, + "learning_rate": 6.954962329874623e-05, + "loss": 1.3394, + "step": 1755 + }, + { + "epoch": 0.2768366440221286, + "grad_norm": 0.89453125, + "learning_rate": 6.954523821675294e-05, + "loss": 1.0751, + "step": 1756 + }, + { + "epoch": 0.2769942958695216, + "grad_norm": 0.921875, + "learning_rate": 6.954085319930679e-05, + "loss": 1.0276, + "step": 1757 + }, + { + "epoch": 0.2771519477169146, + "grad_norm": 0.9453125, + "learning_rate": 6.953646824641707e-05, + "loss": 1.215, + "step": 1758 + }, + { + "epoch": 0.2773095995643076, + "grad_norm": 0.92578125, + "learning_rate": 6.953208335809301e-05, + "loss": 1.047, + "step": 1759 + }, + { + "epoch": 0.27746725141170064, + "grad_norm": 0.81640625, + "learning_rate": 6.9527698534344e-05, + "loss": 0.881, + "step": 1760 + }, + { + "epoch": 0.27762490325909367, + "grad_norm": 1.015625, + "learning_rate": 6.952331377517929e-05, + "loss": 1.2199, + "step": 1761 + }, + { + "epoch": 0.27778255510648664, + "grad_norm": 0.9375, + "learning_rate": 6.951892908060818e-05, + "loss": 1.1025, + "step": 1762 + }, + { + "epoch": 0.27794020695387966, + "grad_norm": 0.9453125, + "learning_rate": 6.951454445063994e-05, + "loss": 1.0837, + "step": 1763 + }, + { + "epoch": 0.2780978588012727, + "grad_norm": 0.93359375, + "learning_rate": 6.951015988528385e-05, + "loss": 1.1537, + "step": 1764 + }, + { + "epoch": 0.2782555106486657, + "grad_norm": 0.9765625, + "learning_rate": 6.950577538454927e-05, + "loss": 1.1484, + "step": 1765 + }, + { + "epoch": 0.2784131624960587, + "grad_norm": 0.8828125, + "learning_rate": 6.950139094844548e-05, + "loss": 1.0682, + "step": 1766 + }, + { + "epoch": 0.2785708143434517, + "grad_norm": 0.8671875, + "learning_rate": 6.949700657698173e-05, + "loss": 1.1406, + "step": 1767 + }, + { + "epoch": 0.27872846619084474, + "grad_norm": 1.046875, + "learning_rate": 6.949262227016732e-05, + "loss": 1.0977, + "step": 1768 + }, + { + "epoch": 0.27888611803823776, + "grad_norm": 1.0, + "learning_rate": 6.948823802801154e-05, + "loss": 1.2812, + "step": 1769 + }, + { + "epoch": 0.27904376988563073, + "grad_norm": 0.98046875, + "learning_rate": 6.948385385052372e-05, + "loss": 1.2174, + "step": 1770 + }, + { + "epoch": 0.27920142173302376, + "grad_norm": 0.9765625, + "learning_rate": 6.947946973771313e-05, + "loss": 1.0728, + "step": 1771 + }, + { + "epoch": 0.2793590735804168, + "grad_norm": 0.89453125, + "learning_rate": 6.947508568958905e-05, + "loss": 1.0217, + "step": 1772 + }, + { + "epoch": 0.2795167254278098, + "grad_norm": 0.890625, + "learning_rate": 6.947070170616079e-05, + "loss": 1.0975, + "step": 1773 + }, + { + "epoch": 0.2796743772752028, + "grad_norm": 1.171875, + "learning_rate": 6.946631778743762e-05, + "loss": 1.2402, + "step": 1774 + }, + { + "epoch": 0.2798320291225958, + "grad_norm": 1.0546875, + "learning_rate": 6.946193393342886e-05, + "loss": 1.1615, + "step": 1775 + }, + { + "epoch": 0.27998968096998883, + "grad_norm": 0.9609375, + "learning_rate": 6.945755014414376e-05, + "loss": 1.0538, + "step": 1776 + }, + { + "epoch": 0.28014733281738186, + "grad_norm": 0.8671875, + "learning_rate": 6.945316641959168e-05, + "loss": 0.8767, + "step": 1777 + }, + { + "epoch": 0.2803049846647748, + "grad_norm": 1.0, + "learning_rate": 6.944878275978184e-05, + "loss": 1.2505, + "step": 1778 + }, + { + "epoch": 0.28046263651216785, + "grad_norm": 0.96875, + "learning_rate": 6.944439916472351e-05, + "loss": 0.9423, + "step": 1779 + }, + { + "epoch": 0.2806202883595609, + "grad_norm": 0.9375, + "learning_rate": 6.94400156344261e-05, + "loss": 0.9866, + "step": 1780 + }, + { + "epoch": 0.2807779402069539, + "grad_norm": 0.83984375, + "learning_rate": 6.943563216889881e-05, + "loss": 1.1577, + "step": 1781 + }, + { + "epoch": 0.28093559205434687, + "grad_norm": 0.87109375, + "learning_rate": 6.943124876815097e-05, + "loss": 0.9237, + "step": 1782 + }, + { + "epoch": 0.2810932439017399, + "grad_norm": 0.95703125, + "learning_rate": 6.942686543219183e-05, + "loss": 1.0231, + "step": 1783 + }, + { + "epoch": 0.2812508957491329, + "grad_norm": 0.91015625, + "learning_rate": 6.942248216103067e-05, + "loss": 0.9635, + "step": 1784 + }, + { + "epoch": 0.28140854759652595, + "grad_norm": 0.96484375, + "learning_rate": 6.941809895467684e-05, + "loss": 1.0322, + "step": 1785 + }, + { + "epoch": 0.2815661994439189, + "grad_norm": 0.8984375, + "learning_rate": 6.941371581313962e-05, + "loss": 0.9097, + "step": 1786 + }, + { + "epoch": 0.28172385129131194, + "grad_norm": 0.97265625, + "learning_rate": 6.940933273642827e-05, + "loss": 0.8335, + "step": 1787 + }, + { + "epoch": 0.28188150313870497, + "grad_norm": 0.99609375, + "learning_rate": 6.94049497245521e-05, + "loss": 1.1426, + "step": 1788 + }, + { + "epoch": 0.282039154986098, + "grad_norm": 0.9921875, + "learning_rate": 6.940056677752038e-05, + "loss": 1.3404, + "step": 1789 + }, + { + "epoch": 0.28219680683349097, + "grad_norm": 0.95703125, + "learning_rate": 6.939618389534237e-05, + "loss": 1.2091, + "step": 1790 + }, + { + "epoch": 0.282354458680884, + "grad_norm": 0.97265625, + "learning_rate": 6.939180107802743e-05, + "loss": 1.1574, + "step": 1791 + }, + { + "epoch": 0.282512110528277, + "grad_norm": 0.9765625, + "learning_rate": 6.938741832558484e-05, + "loss": 1.2753, + "step": 1792 + }, + { + "epoch": 0.28266976237567004, + "grad_norm": 1.0546875, + "learning_rate": 6.938303563802386e-05, + "loss": 0.9984, + "step": 1793 + }, + { + "epoch": 0.282827414223063, + "grad_norm": 1.0859375, + "learning_rate": 6.937865301535377e-05, + "loss": 1.0268, + "step": 1794 + }, + { + "epoch": 0.28298506607045604, + "grad_norm": 0.9296875, + "learning_rate": 6.937427045758386e-05, + "loss": 1.0478, + "step": 1795 + }, + { + "epoch": 0.28314271791784906, + "grad_norm": 0.94921875, + "learning_rate": 6.936988796472349e-05, + "loss": 1.223, + "step": 1796 + }, + { + "epoch": 0.2833003697652421, + "grad_norm": 0.95703125, + "learning_rate": 6.936550553678185e-05, + "loss": 1.0839, + "step": 1797 + }, + { + "epoch": 0.28345802161263506, + "grad_norm": 0.9609375, + "learning_rate": 6.936112317376827e-05, + "loss": 0.911, + "step": 1798 + }, + { + "epoch": 0.2836156734600281, + "grad_norm": 0.83203125, + "learning_rate": 6.935674087569205e-05, + "loss": 1.1088, + "step": 1799 + }, + { + "epoch": 0.2837733253074211, + "grad_norm": 1.03125, + "learning_rate": 6.935235864256245e-05, + "loss": 0.9608, + "step": 1800 + }, + { + "epoch": 0.28393097715481413, + "grad_norm": 0.8984375, + "learning_rate": 6.934797647438877e-05, + "loss": 1.0391, + "step": 1801 + }, + { + "epoch": 0.2840886290022071, + "grad_norm": 0.94140625, + "learning_rate": 6.934359437118034e-05, + "loss": 1.0048, + "step": 1802 + }, + { + "epoch": 0.28424628084960013, + "grad_norm": 0.98046875, + "learning_rate": 6.933921233294639e-05, + "loss": 1.0308, + "step": 1803 + }, + { + "epoch": 0.28440393269699316, + "grad_norm": 0.90234375, + "learning_rate": 6.933483035969623e-05, + "loss": 1.079, + "step": 1804 + }, + { + "epoch": 0.2845615845443862, + "grad_norm": 0.90625, + "learning_rate": 6.93304484514391e-05, + "loss": 1.0794, + "step": 1805 + }, + { + "epoch": 0.28471923639177915, + "grad_norm": 0.9140625, + "learning_rate": 6.932606660818437e-05, + "loss": 1.1625, + "step": 1806 + }, + { + "epoch": 0.2848768882391722, + "grad_norm": 0.828125, + "learning_rate": 6.93216848299413e-05, + "loss": 0.8867, + "step": 1807 + }, + { + "epoch": 0.2850345400865652, + "grad_norm": 0.93359375, + "learning_rate": 6.931730311671916e-05, + "loss": 0.863, + "step": 1808 + }, + { + "epoch": 0.28519219193395823, + "grad_norm": 1.0, + "learning_rate": 6.931292146852723e-05, + "loss": 1.119, + "step": 1809 + }, + { + "epoch": 0.2853498437813512, + "grad_norm": 0.890625, + "learning_rate": 6.930853988537479e-05, + "loss": 1.2126, + "step": 1810 + }, + { + "epoch": 0.2855074956287442, + "grad_norm": 0.9453125, + "learning_rate": 6.930415836727117e-05, + "loss": 1.1326, + "step": 1811 + }, + { + "epoch": 0.28566514747613725, + "grad_norm": 0.95703125, + "learning_rate": 6.929977691422565e-05, + "loss": 1.0235, + "step": 1812 + }, + { + "epoch": 0.2858227993235303, + "grad_norm": 0.94140625, + "learning_rate": 6.929539552624749e-05, + "loss": 1.1517, + "step": 1813 + }, + { + "epoch": 0.28598045117092324, + "grad_norm": 0.94921875, + "learning_rate": 6.929101420334598e-05, + "loss": 1.034, + "step": 1814 + }, + { + "epoch": 0.28613810301831627, + "grad_norm": 0.90625, + "learning_rate": 6.928663294553038e-05, + "loss": 1.1265, + "step": 1815 + }, + { + "epoch": 0.2862957548657093, + "grad_norm": 0.921875, + "learning_rate": 6.928225175281005e-05, + "loss": 1.1049, + "step": 1816 + }, + { + "epoch": 0.2864534067131023, + "grad_norm": 0.890625, + "learning_rate": 6.927787062519418e-05, + "loss": 1.1225, + "step": 1817 + }, + { + "epoch": 0.2866110585604953, + "grad_norm": 0.96484375, + "learning_rate": 6.927348956269216e-05, + "loss": 1.1945, + "step": 1818 + }, + { + "epoch": 0.2867687104078883, + "grad_norm": 1.0, + "learning_rate": 6.92691085653132e-05, + "loss": 1.0273, + "step": 1819 + }, + { + "epoch": 0.28692636225528134, + "grad_norm": 1.0625, + "learning_rate": 6.92647276330666e-05, + "loss": 1.1564, + "step": 1820 + }, + { + "epoch": 0.28708401410267437, + "grad_norm": 0.93359375, + "learning_rate": 6.926034676596167e-05, + "loss": 1.2002, + "step": 1821 + }, + { + "epoch": 0.28724166595006734, + "grad_norm": 0.85546875, + "learning_rate": 6.925596596400768e-05, + "loss": 0.9643, + "step": 1822 + }, + { + "epoch": 0.28739931779746036, + "grad_norm": 0.828125, + "learning_rate": 6.925158522721392e-05, + "loss": 0.7791, + "step": 1823 + }, + { + "epoch": 0.2875569696448534, + "grad_norm": 0.94921875, + "learning_rate": 6.924720455558964e-05, + "loss": 1.0785, + "step": 1824 + }, + { + "epoch": 0.2877146214922464, + "grad_norm": 1.046875, + "learning_rate": 6.924282394914413e-05, + "loss": 1.1811, + "step": 1825 + }, + { + "epoch": 0.2878722733396394, + "grad_norm": 0.9296875, + "learning_rate": 6.923844340788675e-05, + "loss": 1.073, + "step": 1826 + }, + { + "epoch": 0.2880299251870324, + "grad_norm": 0.84375, + "learning_rate": 6.923406293182671e-05, + "loss": 0.9031, + "step": 1827 + }, + { + "epoch": 0.28818757703442544, + "grad_norm": 0.9375, + "learning_rate": 6.922968252097332e-05, + "loss": 1.13, + "step": 1828 + }, + { + "epoch": 0.28834522888181846, + "grad_norm": 0.9375, + "learning_rate": 6.922530217533586e-05, + "loss": 1.2399, + "step": 1829 + }, + { + "epoch": 0.28850288072921143, + "grad_norm": 1.078125, + "learning_rate": 6.922092189492358e-05, + "loss": 1.1195, + "step": 1830 + }, + { + "epoch": 0.28866053257660446, + "grad_norm": 0.921875, + "learning_rate": 6.921654167974583e-05, + "loss": 1.0458, + "step": 1831 + }, + { + "epoch": 0.2888181844239975, + "grad_norm": 0.95703125, + "learning_rate": 6.921216152981185e-05, + "loss": 1.2888, + "step": 1832 + }, + { + "epoch": 0.2889758362713905, + "grad_norm": 0.96875, + "learning_rate": 6.920778144513097e-05, + "loss": 1.1291, + "step": 1833 + }, + { + "epoch": 0.2891334881187835, + "grad_norm": 0.96875, + "learning_rate": 6.92034014257124e-05, + "loss": 1.1549, + "step": 1834 + }, + { + "epoch": 0.2892911399661765, + "grad_norm": 0.984375, + "learning_rate": 6.919902147156542e-05, + "loss": 1.0321, + "step": 1835 + }, + { + "epoch": 0.28944879181356953, + "grad_norm": 0.98828125, + "learning_rate": 6.919464158269942e-05, + "loss": 1.0902, + "step": 1836 + }, + { + "epoch": 0.28960644366096255, + "grad_norm": 0.9296875, + "learning_rate": 6.919026175912359e-05, + "loss": 1.0786, + "step": 1837 + }, + { + "epoch": 0.2897640955083555, + "grad_norm": 0.89453125, + "learning_rate": 6.918588200084726e-05, + "loss": 0.9172, + "step": 1838 + }, + { + "epoch": 0.28992174735574855, + "grad_norm": 1.0, + "learning_rate": 6.918150230787969e-05, + "loss": 1.0693, + "step": 1839 + }, + { + "epoch": 0.2900793992031416, + "grad_norm": 0.94921875, + "learning_rate": 6.917712268023013e-05, + "loss": 0.9375, + "step": 1840 + }, + { + "epoch": 0.2902370510505346, + "grad_norm": 0.93359375, + "learning_rate": 6.917274311790793e-05, + "loss": 1.3096, + "step": 1841 + }, + { + "epoch": 0.29039470289792757, + "grad_norm": 0.88671875, + "learning_rate": 6.916836362092234e-05, + "loss": 1.0281, + "step": 1842 + }, + { + "epoch": 0.2905523547453206, + "grad_norm": 0.9296875, + "learning_rate": 6.916398418928263e-05, + "loss": 1.1903, + "step": 1843 + }, + { + "epoch": 0.2907100065927136, + "grad_norm": 0.76953125, + "learning_rate": 6.91596048229981e-05, + "loss": 0.8723, + "step": 1844 + }, + { + "epoch": 0.29086765844010665, + "grad_norm": 0.83203125, + "learning_rate": 6.915522552207798e-05, + "loss": 0.9742, + "step": 1845 + }, + { + "epoch": 0.2910253102874996, + "grad_norm": 0.87109375, + "learning_rate": 6.915084628653163e-05, + "loss": 1.0318, + "step": 1846 + }, + { + "epoch": 0.29118296213489264, + "grad_norm": 1.03125, + "learning_rate": 6.914646711636832e-05, + "loss": 1.2363, + "step": 1847 + }, + { + "epoch": 0.29134061398228567, + "grad_norm": 0.921875, + "learning_rate": 6.91420880115973e-05, + "loss": 1.2061, + "step": 1848 + }, + { + "epoch": 0.2914982658296787, + "grad_norm": 0.8984375, + "learning_rate": 6.913770897222786e-05, + "loss": 1.3845, + "step": 1849 + }, + { + "epoch": 0.29165591767707166, + "grad_norm": 1.0546875, + "learning_rate": 6.913332999826922e-05, + "loss": 1.1576, + "step": 1850 + }, + { + "epoch": 0.2918135695244647, + "grad_norm": 0.875, + "learning_rate": 6.91289510897308e-05, + "loss": 1.0151, + "step": 1851 + }, + { + "epoch": 0.2919712213718577, + "grad_norm": 0.98828125, + "learning_rate": 6.912457224662179e-05, + "loss": 1.235, + "step": 1852 + }, + { + "epoch": 0.29212887321925074, + "grad_norm": 0.921875, + "learning_rate": 6.912019346895146e-05, + "loss": 1.0265, + "step": 1853 + }, + { + "epoch": 0.2922865250666437, + "grad_norm": 0.94140625, + "learning_rate": 6.911581475672914e-05, + "loss": 1.0102, + "step": 1854 + }, + { + "epoch": 0.29244417691403674, + "grad_norm": 0.890625, + "learning_rate": 6.911143610996404e-05, + "loss": 1.1134, + "step": 1855 + }, + { + "epoch": 0.29260182876142976, + "grad_norm": 0.8515625, + "learning_rate": 6.910705752866553e-05, + "loss": 1.2059, + "step": 1856 + }, + { + "epoch": 0.2927594806088228, + "grad_norm": 0.90234375, + "learning_rate": 6.910267901284284e-05, + "loss": 1.0385, + "step": 1857 + }, + { + "epoch": 0.29291713245621576, + "grad_norm": 0.91015625, + "learning_rate": 6.909830056250527e-05, + "loss": 1.0966, + "step": 1858 + }, + { + "epoch": 0.2930747843036088, + "grad_norm": 0.9375, + "learning_rate": 6.909392217766207e-05, + "loss": 1.1934, + "step": 1859 + }, + { + "epoch": 0.2932324361510018, + "grad_norm": 0.89453125, + "learning_rate": 6.908954385832251e-05, + "loss": 1.0348, + "step": 1860 + }, + { + "epoch": 0.29339008799839483, + "grad_norm": 0.91796875, + "learning_rate": 6.908516560449594e-05, + "loss": 1.1135, + "step": 1861 + }, + { + "epoch": 0.2935477398457878, + "grad_norm": 0.95703125, + "learning_rate": 6.908078741619157e-05, + "loss": 1.128, + "step": 1862 + }, + { + "epoch": 0.29370539169318083, + "grad_norm": 0.86328125, + "learning_rate": 6.907640929341872e-05, + "loss": 1.0865, + "step": 1863 + }, + { + "epoch": 0.29386304354057385, + "grad_norm": 0.94140625, + "learning_rate": 6.907203123618664e-05, + "loss": 1.1875, + "step": 1864 + }, + { + "epoch": 0.2940206953879669, + "grad_norm": 0.921875, + "learning_rate": 6.90676532445046e-05, + "loss": 1.278, + "step": 1865 + }, + { + "epoch": 0.29417834723535985, + "grad_norm": 0.9140625, + "learning_rate": 6.906327531838193e-05, + "loss": 1.0806, + "step": 1866 + }, + { + "epoch": 0.2943359990827529, + "grad_norm": 0.8984375, + "learning_rate": 6.905889745782788e-05, + "loss": 1.2227, + "step": 1867 + }, + { + "epoch": 0.2944936509301459, + "grad_norm": 0.84765625, + "learning_rate": 6.905451966285171e-05, + "loss": 0.9036, + "step": 1868 + }, + { + "epoch": 0.2946513027775389, + "grad_norm": 0.9296875, + "learning_rate": 6.905014193346274e-05, + "loss": 1.0541, + "step": 1869 + }, + { + "epoch": 0.2948089546249319, + "grad_norm": 0.94921875, + "learning_rate": 6.904576426967017e-05, + "loss": 1.2212, + "step": 1870 + }, + { + "epoch": 0.2949666064723249, + "grad_norm": 0.890625, + "learning_rate": 6.904138667148338e-05, + "loss": 0.9853, + "step": 1871 + }, + { + "epoch": 0.29512425831971795, + "grad_norm": 0.921875, + "learning_rate": 6.903700913891162e-05, + "loss": 1.1984, + "step": 1872 + }, + { + "epoch": 0.295281910167111, + "grad_norm": 0.9140625, + "learning_rate": 6.903263167196412e-05, + "loss": 1.0842, + "step": 1873 + }, + { + "epoch": 0.29543956201450394, + "grad_norm": 0.984375, + "learning_rate": 6.902825427065021e-05, + "loss": 1.1143, + "step": 1874 + }, + { + "epoch": 0.29559721386189697, + "grad_norm": 0.97265625, + "learning_rate": 6.90238769349791e-05, + "loss": 1.2871, + "step": 1875 + }, + { + "epoch": 0.29575486570929, + "grad_norm": 1.0390625, + "learning_rate": 6.901949966496013e-05, + "loss": 0.898, + "step": 1876 + }, + { + "epoch": 0.295912517556683, + "grad_norm": 0.859375, + "learning_rate": 6.901512246060259e-05, + "loss": 0.8928, + "step": 1877 + }, + { + "epoch": 0.29607016940407604, + "grad_norm": 1.046875, + "learning_rate": 6.901074532191572e-05, + "loss": 1.037, + "step": 1878 + }, + { + "epoch": 0.296227821251469, + "grad_norm": 1.0703125, + "learning_rate": 6.900636824890878e-05, + "loss": 1.2106, + "step": 1879 + }, + { + "epoch": 0.29638547309886204, + "grad_norm": 0.8828125, + "learning_rate": 6.900199124159109e-05, + "loss": 0.9175, + "step": 1880 + }, + { + "epoch": 0.29654312494625507, + "grad_norm": 0.92578125, + "learning_rate": 6.89976142999719e-05, + "loss": 1.0531, + "step": 1881 + }, + { + "epoch": 0.2967007767936481, + "grad_norm": 0.94921875, + "learning_rate": 6.89932374240605e-05, + "loss": 0.9883, + "step": 1882 + }, + { + "epoch": 0.29685842864104106, + "grad_norm": 0.921875, + "learning_rate": 6.898886061386614e-05, + "loss": 1.1274, + "step": 1883 + }, + { + "epoch": 0.2970160804884341, + "grad_norm": 1.0234375, + "learning_rate": 6.898448386939814e-05, + "loss": 1.3654, + "step": 1884 + }, + { + "epoch": 0.2971737323358271, + "grad_norm": 0.890625, + "learning_rate": 6.898010719066572e-05, + "loss": 1.0427, + "step": 1885 + }, + { + "epoch": 0.29733138418322014, + "grad_norm": 1.0390625, + "learning_rate": 6.897573057767824e-05, + "loss": 1.1164, + "step": 1886 + }, + { + "epoch": 0.2974890360306131, + "grad_norm": 0.9609375, + "learning_rate": 6.897135403044491e-05, + "loss": 1.1587, + "step": 1887 + }, + { + "epoch": 0.29764668787800613, + "grad_norm": 0.96875, + "learning_rate": 6.896697754897501e-05, + "loss": 1.0289, + "step": 1888 + }, + { + "epoch": 0.29780433972539916, + "grad_norm": 1.0078125, + "learning_rate": 6.896260113327783e-05, + "loss": 1.2689, + "step": 1889 + }, + { + "epoch": 0.2979619915727922, + "grad_norm": 0.91015625, + "learning_rate": 6.895822478336262e-05, + "loss": 1.0849, + "step": 1890 + }, + { + "epoch": 0.29811964342018515, + "grad_norm": 0.984375, + "learning_rate": 6.895384849923871e-05, + "loss": 1.1242, + "step": 1891 + }, + { + "epoch": 0.2982772952675782, + "grad_norm": 1.015625, + "learning_rate": 6.894947228091535e-05, + "loss": 0.9605, + "step": 1892 + }, + { + "epoch": 0.2984349471149712, + "grad_norm": 1.0078125, + "learning_rate": 6.894509612840179e-05, + "loss": 0.9764, + "step": 1893 + }, + { + "epoch": 0.29859259896236423, + "grad_norm": 0.91015625, + "learning_rate": 6.894072004170734e-05, + "loss": 1.1986, + "step": 1894 + }, + { + "epoch": 0.2987502508097572, + "grad_norm": 1.0078125, + "learning_rate": 6.893634402084121e-05, + "loss": 1.2609, + "step": 1895 + }, + { + "epoch": 0.2989079026571502, + "grad_norm": 1.0234375, + "learning_rate": 6.893196806581277e-05, + "loss": 1.1271, + "step": 1896 + }, + { + "epoch": 0.29906555450454325, + "grad_norm": 0.95703125, + "learning_rate": 6.892759217663124e-05, + "loss": 0.9491, + "step": 1897 + }, + { + "epoch": 0.2992232063519363, + "grad_norm": 1.0546875, + "learning_rate": 6.892321635330592e-05, + "loss": 1.2664, + "step": 1898 + }, + { + "epoch": 0.29938085819932925, + "grad_norm": 0.93359375, + "learning_rate": 6.891884059584604e-05, + "loss": 1.0487, + "step": 1899 + }, + { + "epoch": 0.2995385100467223, + "grad_norm": 0.921875, + "learning_rate": 6.89144649042609e-05, + "loss": 1.2452, + "step": 1900 + }, + { + "epoch": 0.2996961618941153, + "grad_norm": 0.91796875, + "learning_rate": 6.89100892785598e-05, + "loss": 1.1048, + "step": 1901 + }, + { + "epoch": 0.2998538137415083, + "grad_norm": 0.953125, + "learning_rate": 6.890571371875194e-05, + "loss": 0.8733, + "step": 1902 + }, + { + "epoch": 0.3000114655889013, + "grad_norm": 1.046875, + "learning_rate": 6.890133822484669e-05, + "loss": 1.0949, + "step": 1903 + }, + { + "epoch": 0.3001691174362943, + "grad_norm": 0.90625, + "learning_rate": 6.889696279685327e-05, + "loss": 0.9121, + "step": 1904 + }, + { + "epoch": 0.30032676928368734, + "grad_norm": 0.9453125, + "learning_rate": 6.889258743478093e-05, + "loss": 1.0087, + "step": 1905 + }, + { + "epoch": 0.30048442113108037, + "grad_norm": 0.953125, + "learning_rate": 6.888821213863901e-05, + "loss": 0.9906, + "step": 1906 + }, + { + "epoch": 0.30064207297847334, + "grad_norm": 0.94140625, + "learning_rate": 6.888383690843673e-05, + "loss": 1.1579, + "step": 1907 + }, + { + "epoch": 0.30079972482586637, + "grad_norm": 0.85546875, + "learning_rate": 6.887946174418338e-05, + "loss": 0.8905, + "step": 1908 + }, + { + "epoch": 0.3009573766732594, + "grad_norm": 0.9140625, + "learning_rate": 6.887508664588824e-05, + "loss": 1.1476, + "step": 1909 + }, + { + "epoch": 0.3011150285206524, + "grad_norm": 0.84765625, + "learning_rate": 6.887071161356054e-05, + "loss": 1.203, + "step": 1910 + }, + { + "epoch": 0.3012726803680454, + "grad_norm": 0.94921875, + "learning_rate": 6.886633664720961e-05, + "loss": 0.8998, + "step": 1911 + }, + { + "epoch": 0.3014303322154384, + "grad_norm": 1.015625, + "learning_rate": 6.886196174684471e-05, + "loss": 1.2258, + "step": 1912 + }, + { + "epoch": 0.30158798406283144, + "grad_norm": 0.9765625, + "learning_rate": 6.88575869124751e-05, + "loss": 1.158, + "step": 1913 + }, + { + "epoch": 0.30174563591022446, + "grad_norm": 0.91015625, + "learning_rate": 6.885321214411007e-05, + "loss": 0.9877, + "step": 1914 + }, + { + "epoch": 0.30190328775761743, + "grad_norm": 0.9609375, + "learning_rate": 6.88488374417588e-05, + "loss": 1.1588, + "step": 1915 + }, + { + "epoch": 0.30206093960501046, + "grad_norm": 0.89453125, + "learning_rate": 6.88444628054307e-05, + "loss": 1.0643, + "step": 1916 + }, + { + "epoch": 0.3022185914524035, + "grad_norm": 0.875, + "learning_rate": 6.884008823513499e-05, + "loss": 0.9943, + "step": 1917 + }, + { + "epoch": 0.3023762432997965, + "grad_norm": 0.96484375, + "learning_rate": 6.883571373088093e-05, + "loss": 1.3065, + "step": 1918 + }, + { + "epoch": 0.3025338951471895, + "grad_norm": 1.03125, + "learning_rate": 6.88313392926778e-05, + "loss": 1.0518, + "step": 1919 + }, + { + "epoch": 0.3026915469945825, + "grad_norm": 1.4296875, + "learning_rate": 6.882696492053483e-05, + "loss": 1.1562, + "step": 1920 + }, + { + "epoch": 0.30284919884197553, + "grad_norm": 1.015625, + "learning_rate": 6.882259061446131e-05, + "loss": 1.2035, + "step": 1921 + }, + { + "epoch": 0.30300685068936856, + "grad_norm": 1.1015625, + "learning_rate": 6.881821637446657e-05, + "loss": 1.0504, + "step": 1922 + }, + { + "epoch": 0.3031645025367615, + "grad_norm": 0.95703125, + "learning_rate": 6.881384220055984e-05, + "loss": 1.3212, + "step": 1923 + }, + { + "epoch": 0.30332215438415455, + "grad_norm": 1.015625, + "learning_rate": 6.880946809275038e-05, + "loss": 1.0566, + "step": 1924 + }, + { + "epoch": 0.3034798062315476, + "grad_norm": 1.046875, + "learning_rate": 6.880509405104744e-05, + "loss": 1.1852, + "step": 1925 + }, + { + "epoch": 0.3036374580789406, + "grad_norm": 1.0078125, + "learning_rate": 6.880072007546036e-05, + "loss": 1.4554, + "step": 1926 + }, + { + "epoch": 0.3037951099263336, + "grad_norm": 0.96484375, + "learning_rate": 6.879634616599837e-05, + "loss": 1.0309, + "step": 1927 + }, + { + "epoch": 0.3039527617737266, + "grad_norm": 0.9375, + "learning_rate": 6.879197232267073e-05, + "loss": 0.8777, + "step": 1928 + }, + { + "epoch": 0.3041104136211196, + "grad_norm": 0.89453125, + "learning_rate": 6.878759854548672e-05, + "loss": 1.0103, + "step": 1929 + }, + { + "epoch": 0.30426806546851265, + "grad_norm": 0.9765625, + "learning_rate": 6.878322483445561e-05, + "loss": 1.096, + "step": 1930 + }, + { + "epoch": 0.3044257173159056, + "grad_norm": 0.92578125, + "learning_rate": 6.877885118958664e-05, + "loss": 1.0015, + "step": 1931 + }, + { + "epoch": 0.30458336916329865, + "grad_norm": 0.96484375, + "learning_rate": 6.877447761088915e-05, + "loss": 1.0426, + "step": 1932 + }, + { + "epoch": 0.30474102101069167, + "grad_norm": 0.86328125, + "learning_rate": 6.877010409837237e-05, + "loss": 0.9628, + "step": 1933 + }, + { + "epoch": 0.3048986728580847, + "grad_norm": 1.0234375, + "learning_rate": 6.876573065204556e-05, + "loss": 0.9831, + "step": 1934 + }, + { + "epoch": 0.30505632470547767, + "grad_norm": 1.03125, + "learning_rate": 6.8761357271918e-05, + "loss": 1.0138, + "step": 1935 + }, + { + "epoch": 0.3052139765528707, + "grad_norm": 0.87890625, + "learning_rate": 6.875698395799892e-05, + "loss": 0.7691, + "step": 1936 + }, + { + "epoch": 0.3053716284002637, + "grad_norm": 1.09375, + "learning_rate": 6.875261071029768e-05, + "loss": 1.0916, + "step": 1937 + }, + { + "epoch": 0.30552928024765674, + "grad_norm": 0.8984375, + "learning_rate": 6.874823752882347e-05, + "loss": 1.0076, + "step": 1938 + }, + { + "epoch": 0.3056869320950497, + "grad_norm": 0.83984375, + "learning_rate": 6.874386441358561e-05, + "loss": 1.0452, + "step": 1939 + }, + { + "epoch": 0.30584458394244274, + "grad_norm": 0.93359375, + "learning_rate": 6.873949136459332e-05, + "loss": 1.068, + "step": 1940 + }, + { + "epoch": 0.30600223578983576, + "grad_norm": 0.98046875, + "learning_rate": 6.873511838185587e-05, + "loss": 1.2288, + "step": 1941 + }, + { + "epoch": 0.3061598876372288, + "grad_norm": 1.046875, + "learning_rate": 6.873074546538258e-05, + "loss": 0.9895, + "step": 1942 + }, + { + "epoch": 0.30631753948462176, + "grad_norm": 0.875, + "learning_rate": 6.872637261518269e-05, + "loss": 1.0229, + "step": 1943 + }, + { + "epoch": 0.3064751913320148, + "grad_norm": 0.93359375, + "learning_rate": 6.872199983126546e-05, + "loss": 1.0553, + "step": 1944 + }, + { + "epoch": 0.3066328431794078, + "grad_norm": 0.83203125, + "learning_rate": 6.871762711364018e-05, + "loss": 0.9682, + "step": 1945 + }, + { + "epoch": 0.30679049502680084, + "grad_norm": 0.8984375, + "learning_rate": 6.871325446231606e-05, + "loss": 0.958, + "step": 1946 + }, + { + "epoch": 0.3069481468741938, + "grad_norm": 0.98046875, + "learning_rate": 6.870888187730245e-05, + "loss": 1.0723, + "step": 1947 + }, + { + "epoch": 0.30710579872158683, + "grad_norm": 0.88671875, + "learning_rate": 6.870450935860857e-05, + "loss": 1.0502, + "step": 1948 + }, + { + "epoch": 0.30726345056897986, + "grad_norm": 1.046875, + "learning_rate": 6.870013690624368e-05, + "loss": 1.2192, + "step": 1949 + }, + { + "epoch": 0.3074211024163729, + "grad_norm": 0.9296875, + "learning_rate": 6.869576452021705e-05, + "loss": 1.1482, + "step": 1950 + }, + { + "epoch": 0.30757875426376585, + "grad_norm": 0.9375, + "learning_rate": 6.869139220053795e-05, + "loss": 1.1024, + "step": 1951 + }, + { + "epoch": 0.3077364061111589, + "grad_norm": 0.80859375, + "learning_rate": 6.868701994721569e-05, + "loss": 0.7835, + "step": 1952 + }, + { + "epoch": 0.3078940579585519, + "grad_norm": 0.85546875, + "learning_rate": 6.86826477602595e-05, + "loss": 1.0139, + "step": 1953 + }, + { + "epoch": 0.30805170980594493, + "grad_norm": 1.0234375, + "learning_rate": 6.867827563967864e-05, + "loss": 1.332, + "step": 1954 + }, + { + "epoch": 0.3082093616533379, + "grad_norm": 0.9375, + "learning_rate": 6.867390358548238e-05, + "loss": 1.1438, + "step": 1955 + }, + { + "epoch": 0.3083670135007309, + "grad_norm": 0.87890625, + "learning_rate": 6.866953159767997e-05, + "loss": 1.1559, + "step": 1956 + }, + { + "epoch": 0.30852466534812395, + "grad_norm": 0.9765625, + "learning_rate": 6.866515967628071e-05, + "loss": 1.098, + "step": 1957 + }, + { + "epoch": 0.308682317195517, + "grad_norm": 0.91015625, + "learning_rate": 6.866078782129388e-05, + "loss": 0.8567, + "step": 1958 + }, + { + "epoch": 0.30883996904290995, + "grad_norm": 0.9296875, + "learning_rate": 6.86564160327287e-05, + "loss": 0.9513, + "step": 1959 + }, + { + "epoch": 0.30899762089030297, + "grad_norm": 0.8359375, + "learning_rate": 6.865204431059447e-05, + "loss": 0.9079, + "step": 1960 + }, + { + "epoch": 0.309155272737696, + "grad_norm": 0.9921875, + "learning_rate": 6.864767265490039e-05, + "loss": 1.0906, + "step": 1961 + }, + { + "epoch": 0.309312924585089, + "grad_norm": 0.953125, + "learning_rate": 6.864330106565582e-05, + "loss": 0.9395, + "step": 1962 + }, + { + "epoch": 0.309470576432482, + "grad_norm": 0.8515625, + "learning_rate": 6.863892954286997e-05, + "loss": 0.8805, + "step": 1963 + }, + { + "epoch": 0.309628228279875, + "grad_norm": 0.83984375, + "learning_rate": 6.863455808655213e-05, + "loss": 0.8618, + "step": 1964 + }, + { + "epoch": 0.30978588012726804, + "grad_norm": 1.0703125, + "learning_rate": 6.863018669671156e-05, + "loss": 1.3156, + "step": 1965 + }, + { + "epoch": 0.30994353197466107, + "grad_norm": 0.88671875, + "learning_rate": 6.862581537335746e-05, + "loss": 0.8891, + "step": 1966 + }, + { + "epoch": 0.31010118382205404, + "grad_norm": 0.90625, + "learning_rate": 6.86214441164992e-05, + "loss": 1.0067, + "step": 1967 + }, + { + "epoch": 0.31025883566944706, + "grad_norm": 0.9296875, + "learning_rate": 6.861707292614598e-05, + "loss": 1.046, + "step": 1968 + }, + { + "epoch": 0.3104164875168401, + "grad_norm": 0.9765625, + "learning_rate": 6.861270180230708e-05, + "loss": 0.9657, + "step": 1969 + }, + { + "epoch": 0.3105741393642331, + "grad_norm": 1.0078125, + "learning_rate": 6.860833074499178e-05, + "loss": 1.1181, + "step": 1970 + }, + { + "epoch": 0.3107317912116261, + "grad_norm": 0.92578125, + "learning_rate": 6.86039597542093e-05, + "loss": 1.0731, + "step": 1971 + }, + { + "epoch": 0.3108894430590191, + "grad_norm": 0.93359375, + "learning_rate": 6.859958882996895e-05, + "loss": 1.0175, + "step": 1972 + }, + { + "epoch": 0.31104709490641214, + "grad_norm": 0.8828125, + "learning_rate": 6.859521797227999e-05, + "loss": 1.0549, + "step": 1973 + }, + { + "epoch": 0.31120474675380516, + "grad_norm": 0.87890625, + "learning_rate": 6.859084718115165e-05, + "loss": 1.081, + "step": 1974 + }, + { + "epoch": 0.31136239860119813, + "grad_norm": 0.9921875, + "learning_rate": 6.858647645659324e-05, + "loss": 1.1961, + "step": 1975 + }, + { + "epoch": 0.31152005044859116, + "grad_norm": 0.98046875, + "learning_rate": 6.858210579861394e-05, + "loss": 1.077, + "step": 1976 + }, + { + "epoch": 0.3116777022959842, + "grad_norm": 0.9296875, + "learning_rate": 6.857773520722311e-05, + "loss": 1.0883, + "step": 1977 + }, + { + "epoch": 0.3118353541433772, + "grad_norm": 0.9453125, + "learning_rate": 6.857336468242999e-05, + "loss": 1.0424, + "step": 1978 + }, + { + "epoch": 0.3119930059907702, + "grad_norm": 0.90625, + "learning_rate": 6.856899422424381e-05, + "loss": 1.0863, + "step": 1979 + }, + { + "epoch": 0.3121506578381632, + "grad_norm": 0.9921875, + "learning_rate": 6.856462383267386e-05, + "loss": 1.1165, + "step": 1980 + }, + { + "epoch": 0.31230830968555623, + "grad_norm": 0.84765625, + "learning_rate": 6.856025350772934e-05, + "loss": 1.0884, + "step": 1981 + }, + { + "epoch": 0.31246596153294925, + "grad_norm": 0.90625, + "learning_rate": 6.855588324941962e-05, + "loss": 1.0194, + "step": 1982 + }, + { + "epoch": 0.3126236133803422, + "grad_norm": 1.015625, + "learning_rate": 6.855151305775392e-05, + "loss": 1.3551, + "step": 1983 + }, + { + "epoch": 0.31278126522773525, + "grad_norm": 0.9609375, + "learning_rate": 6.854714293274147e-05, + "loss": 1.1895, + "step": 1984 + }, + { + "epoch": 0.3129389170751283, + "grad_norm": 0.94921875, + "learning_rate": 6.854277287439154e-05, + "loss": 0.9744, + "step": 1985 + }, + { + "epoch": 0.3130965689225213, + "grad_norm": 0.94921875, + "learning_rate": 6.853840288271341e-05, + "loss": 1.0628, + "step": 1986 + }, + { + "epoch": 0.31325422076991427, + "grad_norm": 1.2265625, + "learning_rate": 6.853403295771633e-05, + "loss": 1.2318, + "step": 1987 + }, + { + "epoch": 0.3134118726173073, + "grad_norm": 0.859375, + "learning_rate": 6.852966309940959e-05, + "loss": 0.9688, + "step": 1988 + }, + { + "epoch": 0.3135695244647003, + "grad_norm": 0.90625, + "learning_rate": 6.852529330780243e-05, + "loss": 1.0642, + "step": 1989 + }, + { + "epoch": 0.31372717631209335, + "grad_norm": 0.94140625, + "learning_rate": 6.852092358290411e-05, + "loss": 1.1573, + "step": 1990 + }, + { + "epoch": 0.3138848281594863, + "grad_norm": 1.03125, + "learning_rate": 6.851655392472387e-05, + "loss": 1.1883, + "step": 1991 + }, + { + "epoch": 0.31404248000687934, + "grad_norm": 0.92578125, + "learning_rate": 6.851218433327103e-05, + "loss": 1.1077, + "step": 1992 + }, + { + "epoch": 0.31420013185427237, + "grad_norm": 0.91796875, + "learning_rate": 6.850781480855479e-05, + "loss": 0.9263, + "step": 1993 + }, + { + "epoch": 0.3143577837016654, + "grad_norm": 0.9765625, + "learning_rate": 6.850344535058446e-05, + "loss": 1.0613, + "step": 1994 + }, + { + "epoch": 0.31451543554905836, + "grad_norm": 1.203125, + "learning_rate": 6.849907595936927e-05, + "loss": 1.0678, + "step": 1995 + }, + { + "epoch": 0.3146730873964514, + "grad_norm": 0.87109375, + "learning_rate": 6.849470663491844e-05, + "loss": 1.1007, + "step": 1996 + }, + { + "epoch": 0.3148307392438444, + "grad_norm": 0.890625, + "learning_rate": 6.849033737724131e-05, + "loss": 1.1009, + "step": 1997 + }, + { + "epoch": 0.31498839109123744, + "grad_norm": 0.9765625, + "learning_rate": 6.848596818634714e-05, + "loss": 1.1363, + "step": 1998 + }, + { + "epoch": 0.3151460429386304, + "grad_norm": 0.8671875, + "learning_rate": 6.848159906224513e-05, + "loss": 0.9082, + "step": 1999 + }, + { + "epoch": 0.31530369478602344, + "grad_norm": 0.8671875, + "learning_rate": 6.847723000494459e-05, + "loss": 0.8839, + "step": 2000 + }, + { + "epoch": 0.31530369478602344, + "eval_loss": 1.0622248649597168, + "eval_runtime": 306.9382, + "eval_samples_per_second": 32.58, + "eval_steps_per_second": 0.681, + "step": 2000 + }, + { + "epoch": 0.31546134663341646, + "grad_norm": 0.91015625, + "learning_rate": 6.84728610144547e-05, + "loss": 1.0609, + "step": 2001 + }, + { + "epoch": 0.3156189984808095, + "grad_norm": 1.015625, + "learning_rate": 6.846849209078484e-05, + "loss": 1.095, + "step": 2002 + }, + { + "epoch": 0.31577665032820246, + "grad_norm": 1.0078125, + "learning_rate": 6.84641232339442e-05, + "loss": 1.0071, + "step": 2003 + }, + { + "epoch": 0.3159343021755955, + "grad_norm": 0.9453125, + "learning_rate": 6.845975444394204e-05, + "loss": 0.9972, + "step": 2004 + }, + { + "epoch": 0.3160919540229885, + "grad_norm": 0.9375, + "learning_rate": 6.845538572078763e-05, + "loss": 1.2231, + "step": 2005 + }, + { + "epoch": 0.31624960587038153, + "grad_norm": 0.921875, + "learning_rate": 6.84510170644902e-05, + "loss": 1.046, + "step": 2006 + }, + { + "epoch": 0.3164072577177745, + "grad_norm": 0.921875, + "learning_rate": 6.844664847505907e-05, + "loss": 1.0046, + "step": 2007 + }, + { + "epoch": 0.31656490956516753, + "grad_norm": 0.87109375, + "learning_rate": 6.844227995250345e-05, + "loss": 1.3371, + "step": 2008 + }, + { + "epoch": 0.31672256141256055, + "grad_norm": 0.9765625, + "learning_rate": 6.843791149683262e-05, + "loss": 1.0634, + "step": 2009 + }, + { + "epoch": 0.3168802132599536, + "grad_norm": 0.89453125, + "learning_rate": 6.843354310805586e-05, + "loss": 0.8842, + "step": 2010 + }, + { + "epoch": 0.31703786510734655, + "grad_norm": 0.98828125, + "learning_rate": 6.842917478618235e-05, + "loss": 1.149, + "step": 2011 + }, + { + "epoch": 0.3171955169547396, + "grad_norm": 0.94921875, + "learning_rate": 6.842480653122143e-05, + "loss": 0.9179, + "step": 2012 + }, + { + "epoch": 0.3173531688021326, + "grad_norm": 0.98828125, + "learning_rate": 6.842043834318232e-05, + "loss": 1.0081, + "step": 2013 + }, + { + "epoch": 0.3175108206495256, + "grad_norm": 1.0390625, + "learning_rate": 6.841607022207431e-05, + "loss": 1.212, + "step": 2014 + }, + { + "epoch": 0.3176684724969186, + "grad_norm": 0.84375, + "learning_rate": 6.84117021679066e-05, + "loss": 0.861, + "step": 2015 + }, + { + "epoch": 0.3178261243443116, + "grad_norm": 1.0234375, + "learning_rate": 6.840733418068845e-05, + "loss": 0.9907, + "step": 2016 + }, + { + "epoch": 0.31798377619170465, + "grad_norm": 0.9375, + "learning_rate": 6.840296626042921e-05, + "loss": 0.9483, + "step": 2017 + }, + { + "epoch": 0.3181414280390977, + "grad_norm": 0.9609375, + "learning_rate": 6.839859840713807e-05, + "loss": 1.0886, + "step": 2018 + }, + { + "epoch": 0.31829907988649064, + "grad_norm": 0.890625, + "learning_rate": 6.839423062082429e-05, + "loss": 0.9851, + "step": 2019 + }, + { + "epoch": 0.31845673173388367, + "grad_norm": 1.03125, + "learning_rate": 6.838986290149712e-05, + "loss": 1.2503, + "step": 2020 + }, + { + "epoch": 0.3186143835812767, + "grad_norm": 1.0078125, + "learning_rate": 6.83854952491658e-05, + "loss": 1.3227, + "step": 2021 + }, + { + "epoch": 0.3187720354286697, + "grad_norm": 0.94921875, + "learning_rate": 6.838112766383966e-05, + "loss": 1.0298, + "step": 2022 + }, + { + "epoch": 0.3189296872760627, + "grad_norm": 0.9140625, + "learning_rate": 6.837676014552791e-05, + "loss": 1.1044, + "step": 2023 + }, + { + "epoch": 0.3190873391234557, + "grad_norm": 0.9921875, + "learning_rate": 6.837239269423981e-05, + "loss": 1.1684, + "step": 2024 + }, + { + "epoch": 0.31924499097084874, + "grad_norm": 1.0625, + "learning_rate": 6.836802530998461e-05, + "loss": 1.1514, + "step": 2025 + }, + { + "epoch": 0.31940264281824177, + "grad_norm": 0.87890625, + "learning_rate": 6.836365799277154e-05, + "loss": 0.9073, + "step": 2026 + }, + { + "epoch": 0.31956029466563474, + "grad_norm": 0.98828125, + "learning_rate": 6.835929074260993e-05, + "loss": 1.2934, + "step": 2027 + }, + { + "epoch": 0.31971794651302776, + "grad_norm": 1.03125, + "learning_rate": 6.835492355950898e-05, + "loss": 1.0152, + "step": 2028 + }, + { + "epoch": 0.3198755983604208, + "grad_norm": 0.91015625, + "learning_rate": 6.835055644347797e-05, + "loss": 1.1495, + "step": 2029 + }, + { + "epoch": 0.3200332502078138, + "grad_norm": 0.87890625, + "learning_rate": 6.834618939452614e-05, + "loss": 0.8969, + "step": 2030 + }, + { + "epoch": 0.3201909020552068, + "grad_norm": 0.90625, + "learning_rate": 6.834182241266275e-05, + "loss": 1.1123, + "step": 2031 + }, + { + "epoch": 0.3203485539025998, + "grad_norm": 0.94140625, + "learning_rate": 6.833745549789707e-05, + "loss": 0.9982, + "step": 2032 + }, + { + "epoch": 0.32050620574999283, + "grad_norm": 0.99609375, + "learning_rate": 6.833308865023834e-05, + "loss": 1.314, + "step": 2033 + }, + { + "epoch": 0.32066385759738586, + "grad_norm": 0.86328125, + "learning_rate": 6.832872186969583e-05, + "loss": 1.0377, + "step": 2034 + }, + { + "epoch": 0.3208215094447789, + "grad_norm": 0.890625, + "learning_rate": 6.832435515627877e-05, + "loss": 1.0519, + "step": 2035 + }, + { + "epoch": 0.32097916129217186, + "grad_norm": 0.921875, + "learning_rate": 6.831998850999639e-05, + "loss": 1.1053, + "step": 2036 + }, + { + "epoch": 0.3211368131395649, + "grad_norm": 1.03125, + "learning_rate": 6.831562193085802e-05, + "loss": 1.1427, + "step": 2037 + }, + { + "epoch": 0.3212944649869579, + "grad_norm": 0.98046875, + "learning_rate": 6.831125541887289e-05, + "loss": 1.1727, + "step": 2038 + }, + { + "epoch": 0.32145211683435093, + "grad_norm": 1.015625, + "learning_rate": 6.830688897405024e-05, + "loss": 1.328, + "step": 2039 + }, + { + "epoch": 0.3216097686817439, + "grad_norm": 0.86328125, + "learning_rate": 6.830252259639931e-05, + "loss": 1.0804, + "step": 2040 + }, + { + "epoch": 0.3217674205291369, + "grad_norm": 0.9375, + "learning_rate": 6.829815628592935e-05, + "loss": 0.9526, + "step": 2041 + }, + { + "epoch": 0.32192507237652995, + "grad_norm": 0.91015625, + "learning_rate": 6.829379004264966e-05, + "loss": 1.0284, + "step": 2042 + }, + { + "epoch": 0.322082724223923, + "grad_norm": 1.046875, + "learning_rate": 6.828942386656948e-05, + "loss": 1.1929, + "step": 2043 + }, + { + "epoch": 0.32224037607131595, + "grad_norm": 0.921875, + "learning_rate": 6.828505775769807e-05, + "loss": 1.2235, + "step": 2044 + }, + { + "epoch": 0.322398027918709, + "grad_norm": 1.015625, + "learning_rate": 6.828069171604465e-05, + "loss": 1.0808, + "step": 2045 + }, + { + "epoch": 0.322555679766102, + "grad_norm": 1.0390625, + "learning_rate": 6.827632574161845e-05, + "loss": 1.2171, + "step": 2046 + }, + { + "epoch": 0.322713331613495, + "grad_norm": 0.90234375, + "learning_rate": 6.827195983442881e-05, + "loss": 1.0112, + "step": 2047 + }, + { + "epoch": 0.322870983460888, + "grad_norm": 0.91796875, + "learning_rate": 6.826759399448493e-05, + "loss": 0.9924, + "step": 2048 + }, + { + "epoch": 0.323028635308281, + "grad_norm": 0.89453125, + "learning_rate": 6.826322822179608e-05, + "loss": 1.1004, + "step": 2049 + }, + { + "epoch": 0.32318628715567405, + "grad_norm": 0.93359375, + "learning_rate": 6.82588625163715e-05, + "loss": 1.2539, + "step": 2050 + }, + { + "epoch": 0.32334393900306707, + "grad_norm": 0.984375, + "learning_rate": 6.825449687822043e-05, + "loss": 0.8724, + "step": 2051 + }, + { + "epoch": 0.32350159085046004, + "grad_norm": 0.98828125, + "learning_rate": 6.825013130735216e-05, + "loss": 1.0367, + "step": 2052 + }, + { + "epoch": 0.32365924269785307, + "grad_norm": 0.87890625, + "learning_rate": 6.824576580377591e-05, + "loss": 1.0989, + "step": 2053 + }, + { + "epoch": 0.3238168945452461, + "grad_norm": 0.83203125, + "learning_rate": 6.824140036750092e-05, + "loss": 0.8809, + "step": 2054 + }, + { + "epoch": 0.3239745463926391, + "grad_norm": 0.953125, + "learning_rate": 6.82370349985365e-05, + "loss": 0.9756, + "step": 2055 + }, + { + "epoch": 0.3241321982400321, + "grad_norm": 0.9765625, + "learning_rate": 6.823266969689186e-05, + "loss": 0.9237, + "step": 2056 + }, + { + "epoch": 0.3242898500874251, + "grad_norm": 1.0390625, + "learning_rate": 6.822830446257627e-05, + "loss": 0.9121, + "step": 2057 + }, + { + "epoch": 0.32444750193481814, + "grad_norm": 1.3671875, + "learning_rate": 6.822393929559897e-05, + "loss": 0.9538, + "step": 2058 + }, + { + "epoch": 0.32460515378221116, + "grad_norm": 0.98046875, + "learning_rate": 6.821957419596922e-05, + "loss": 1.1098, + "step": 2059 + }, + { + "epoch": 0.32476280562960413, + "grad_norm": 0.9296875, + "learning_rate": 6.821520916369626e-05, + "loss": 0.9749, + "step": 2060 + }, + { + "epoch": 0.32492045747699716, + "grad_norm": 0.9140625, + "learning_rate": 6.821084419878931e-05, + "loss": 1.1818, + "step": 2061 + }, + { + "epoch": 0.3250781093243902, + "grad_norm": 0.8984375, + "learning_rate": 6.82064793012577e-05, + "loss": 0.9469, + "step": 2062 + }, + { + "epoch": 0.3252357611717832, + "grad_norm": 0.95703125, + "learning_rate": 6.820211447111062e-05, + "loss": 1.0894, + "step": 2063 + }, + { + "epoch": 0.3253934130191762, + "grad_norm": 0.98828125, + "learning_rate": 6.819774970835738e-05, + "loss": 1.1263, + "step": 2064 + }, + { + "epoch": 0.3255510648665692, + "grad_norm": 1.6953125, + "learning_rate": 6.819338501300716e-05, + "loss": 1.1258, + "step": 2065 + }, + { + "epoch": 0.32570871671396223, + "grad_norm": 0.9921875, + "learning_rate": 6.818902038506921e-05, + "loss": 1.016, + "step": 2066 + }, + { + "epoch": 0.32586636856135526, + "grad_norm": 1.046875, + "learning_rate": 6.818465582455285e-05, + "loss": 1.0847, + "step": 2067 + }, + { + "epoch": 0.3260240204087482, + "grad_norm": 0.94140625, + "learning_rate": 6.81802913314673e-05, + "loss": 1.001, + "step": 2068 + }, + { + "epoch": 0.32618167225614125, + "grad_norm": 0.93359375, + "learning_rate": 6.817592690582182e-05, + "loss": 1.2238, + "step": 2069 + }, + { + "epoch": 0.3263393241035343, + "grad_norm": 0.94140625, + "learning_rate": 6.817156254762562e-05, + "loss": 1.1811, + "step": 2070 + }, + { + "epoch": 0.3264969759509273, + "grad_norm": 0.89453125, + "learning_rate": 6.816719825688796e-05, + "loss": 1.0741, + "step": 2071 + }, + { + "epoch": 0.3266546277983203, + "grad_norm": 0.97265625, + "learning_rate": 6.816283403361812e-05, + "loss": 1.0666, + "step": 2072 + }, + { + "epoch": 0.3268122796457133, + "grad_norm": 0.9765625, + "learning_rate": 6.815846987782532e-05, + "loss": 1.073, + "step": 2073 + }, + { + "epoch": 0.3269699314931063, + "grad_norm": 0.9453125, + "learning_rate": 6.815410578951884e-05, + "loss": 1.0709, + "step": 2074 + }, + { + "epoch": 0.32712758334049935, + "grad_norm": 0.8671875, + "learning_rate": 6.814974176870791e-05, + "loss": 0.9503, + "step": 2075 + }, + { + "epoch": 0.3272852351878923, + "grad_norm": 1.125, + "learning_rate": 6.81453778154018e-05, + "loss": 1.2879, + "step": 2076 + }, + { + "epoch": 0.32744288703528535, + "grad_norm": 0.86328125, + "learning_rate": 6.81410139296097e-05, + "loss": 1.1819, + "step": 2077 + }, + { + "epoch": 0.32760053888267837, + "grad_norm": 0.9453125, + "learning_rate": 6.813665011134093e-05, + "loss": 1.0551, + "step": 2078 + }, + { + "epoch": 0.3277581907300714, + "grad_norm": 0.80078125, + "learning_rate": 6.813228636060471e-05, + "loss": 0.9619, + "step": 2079 + }, + { + "epoch": 0.32791584257746437, + "grad_norm": 1.0546875, + "learning_rate": 6.812792267741029e-05, + "loss": 1.0038, + "step": 2080 + }, + { + "epoch": 0.3280734944248574, + "grad_norm": 0.8671875, + "learning_rate": 6.81235590617669e-05, + "loss": 0.916, + "step": 2081 + }, + { + "epoch": 0.3282311462722504, + "grad_norm": 1.0234375, + "learning_rate": 6.811919551368378e-05, + "loss": 1.184, + "step": 2082 + }, + { + "epoch": 0.32838879811964344, + "grad_norm": 0.90625, + "learning_rate": 6.811483203317023e-05, + "loss": 0.9072, + "step": 2083 + }, + { + "epoch": 0.3285464499670364, + "grad_norm": 0.98828125, + "learning_rate": 6.811046862023548e-05, + "loss": 1.1323, + "step": 2084 + }, + { + "epoch": 0.32870410181442944, + "grad_norm": 0.875, + "learning_rate": 6.810610527488877e-05, + "loss": 1.0646, + "step": 2085 + }, + { + "epoch": 0.32886175366182246, + "grad_norm": 0.734375, + "learning_rate": 6.810174199713934e-05, + "loss": 0.8824, + "step": 2086 + }, + { + "epoch": 0.3290194055092155, + "grad_norm": 0.9765625, + "learning_rate": 6.80973787869964e-05, + "loss": 1.1224, + "step": 2087 + }, + { + "epoch": 0.32917705735660846, + "grad_norm": 1.0703125, + "learning_rate": 6.809301564446931e-05, + "loss": 1.2173, + "step": 2088 + }, + { + "epoch": 0.3293347092040015, + "grad_norm": 0.90234375, + "learning_rate": 6.808865256956722e-05, + "loss": 1.1498, + "step": 2089 + }, + { + "epoch": 0.3294923610513945, + "grad_norm": 0.95703125, + "learning_rate": 6.808428956229942e-05, + "loss": 1.0883, + "step": 2090 + }, + { + "epoch": 0.32965001289878754, + "grad_norm": 1.0703125, + "learning_rate": 6.807992662267514e-05, + "loss": 1.2694, + "step": 2091 + }, + { + "epoch": 0.3298076647461805, + "grad_norm": 0.9140625, + "learning_rate": 6.807556375070359e-05, + "loss": 1.1311, + "step": 2092 + }, + { + "epoch": 0.32996531659357353, + "grad_norm": 1.0546875, + "learning_rate": 6.807120094639409e-05, + "loss": 1.2841, + "step": 2093 + }, + { + "epoch": 0.33012296844096656, + "grad_norm": 0.8828125, + "learning_rate": 6.806683820975587e-05, + "loss": 1.0515, + "step": 2094 + }, + { + "epoch": 0.3302806202883596, + "grad_norm": 0.984375, + "learning_rate": 6.806247554079816e-05, + "loss": 1.1433, + "step": 2095 + }, + { + "epoch": 0.33043827213575255, + "grad_norm": 0.953125, + "learning_rate": 6.805811293953021e-05, + "loss": 1.0956, + "step": 2096 + }, + { + "epoch": 0.3305959239831456, + "grad_norm": 0.97265625, + "learning_rate": 6.805375040596123e-05, + "loss": 1.0345, + "step": 2097 + }, + { + "epoch": 0.3307535758305386, + "grad_norm": 1.1015625, + "learning_rate": 6.804938794010053e-05, + "loss": 1.0658, + "step": 2098 + }, + { + "epoch": 0.33091122767793163, + "grad_norm": 0.89453125, + "learning_rate": 6.804502554195733e-05, + "loss": 0.926, + "step": 2099 + }, + { + "epoch": 0.3310688795253246, + "grad_norm": 0.90234375, + "learning_rate": 6.804066321154085e-05, + "loss": 0.9631, + "step": 2100 + }, + { + "epoch": 0.3312265313727176, + "grad_norm": 0.890625, + "learning_rate": 6.803630094886039e-05, + "loss": 1.0872, + "step": 2101 + }, + { + "epoch": 0.33138418322011065, + "grad_norm": 0.9453125, + "learning_rate": 6.803193875392511e-05, + "loss": 1.2202, + "step": 2102 + }, + { + "epoch": 0.3315418350675037, + "grad_norm": 0.9921875, + "learning_rate": 6.802757662674434e-05, + "loss": 1.2617, + "step": 2103 + }, + { + "epoch": 0.33169948691489665, + "grad_norm": 0.96875, + "learning_rate": 6.80232145673273e-05, + "loss": 0.868, + "step": 2104 + }, + { + "epoch": 0.33185713876228967, + "grad_norm": 0.890625, + "learning_rate": 6.801885257568323e-05, + "loss": 0.9644, + "step": 2105 + }, + { + "epoch": 0.3320147906096827, + "grad_norm": 1.0859375, + "learning_rate": 6.801449065182137e-05, + "loss": 0.9373, + "step": 2106 + }, + { + "epoch": 0.3321724424570757, + "grad_norm": 1.0, + "learning_rate": 6.801012879575093e-05, + "loss": 1.266, + "step": 2107 + }, + { + "epoch": 0.3323300943044687, + "grad_norm": 0.9765625, + "learning_rate": 6.800576700748122e-05, + "loss": 1.1639, + "step": 2108 + }, + { + "epoch": 0.3324877461518617, + "grad_norm": 0.921875, + "learning_rate": 6.800140528702147e-05, + "loss": 1.0931, + "step": 2109 + }, + { + "epoch": 0.33264539799925474, + "grad_norm": 0.8515625, + "learning_rate": 6.799704363438093e-05, + "loss": 1.0479, + "step": 2110 + }, + { + "epoch": 0.33280304984664777, + "grad_norm": 1.0, + "learning_rate": 6.799268204956881e-05, + "loss": 1.1818, + "step": 2111 + }, + { + "epoch": 0.33296070169404074, + "grad_norm": 0.89453125, + "learning_rate": 6.798832053259434e-05, + "loss": 1.1938, + "step": 2112 + }, + { + "epoch": 0.33311835354143376, + "grad_norm": 0.984375, + "learning_rate": 6.798395908346682e-05, + "loss": 1.1217, + "step": 2113 + }, + { + "epoch": 0.3332760053888268, + "grad_norm": 0.97265625, + "learning_rate": 6.797959770219548e-05, + "loss": 1.0753, + "step": 2114 + }, + { + "epoch": 0.3334336572362198, + "grad_norm": 0.92578125, + "learning_rate": 6.797523638878955e-05, + "loss": 1.256, + "step": 2115 + }, + { + "epoch": 0.3335913090836128, + "grad_norm": 0.8984375, + "learning_rate": 6.797087514325828e-05, + "loss": 1.0689, + "step": 2116 + }, + { + "epoch": 0.3337489609310058, + "grad_norm": 0.98046875, + "learning_rate": 6.796651396561088e-05, + "loss": 1.145, + "step": 2117 + }, + { + "epoch": 0.33390661277839884, + "grad_norm": 1.0, + "learning_rate": 6.796215285585666e-05, + "loss": 1.0995, + "step": 2118 + }, + { + "epoch": 0.33406426462579186, + "grad_norm": 0.91015625, + "learning_rate": 6.79577918140048e-05, + "loss": 0.9946, + "step": 2119 + }, + { + "epoch": 0.33422191647318483, + "grad_norm": 1.03125, + "learning_rate": 6.795343084006458e-05, + "loss": 1.0797, + "step": 2120 + }, + { + "epoch": 0.33437956832057786, + "grad_norm": 0.953125, + "learning_rate": 6.794906993404522e-05, + "loss": 1.0831, + "step": 2121 + }, + { + "epoch": 0.3345372201679709, + "grad_norm": 1.078125, + "learning_rate": 6.794470909595596e-05, + "loss": 1.303, + "step": 2122 + }, + { + "epoch": 0.3346948720153639, + "grad_norm": 0.94140625, + "learning_rate": 6.79403483258061e-05, + "loss": 1.174, + "step": 2123 + }, + { + "epoch": 0.3348525238627569, + "grad_norm": 1.1171875, + "learning_rate": 6.793598762360481e-05, + "loss": 1.1146, + "step": 2124 + }, + { + "epoch": 0.3350101757101499, + "grad_norm": 0.99609375, + "learning_rate": 6.793162698936137e-05, + "loss": 1.005, + "step": 2125 + }, + { + "epoch": 0.33516782755754293, + "grad_norm": 0.97265625, + "learning_rate": 6.792726642308503e-05, + "loss": 1.1307, + "step": 2126 + }, + { + "epoch": 0.33532547940493596, + "grad_norm": 0.890625, + "learning_rate": 6.792290592478497e-05, + "loss": 0.9617, + "step": 2127 + }, + { + "epoch": 0.3354831312523289, + "grad_norm": 0.90625, + "learning_rate": 6.79185454944705e-05, + "loss": 0.9597, + "step": 2128 + }, + { + "epoch": 0.33564078309972195, + "grad_norm": 0.87109375, + "learning_rate": 6.791418513215086e-05, + "loss": 0.9049, + "step": 2129 + }, + { + "epoch": 0.335798434947115, + "grad_norm": 1.046875, + "learning_rate": 6.790982483783526e-05, + "loss": 1.1412, + "step": 2130 + }, + { + "epoch": 0.335956086794508, + "grad_norm": 0.84375, + "learning_rate": 6.790546461153296e-05, + "loss": 1.0486, + "step": 2131 + }, + { + "epoch": 0.33611373864190097, + "grad_norm": 1.0078125, + "learning_rate": 6.790110445325313e-05, + "loss": 0.9885, + "step": 2132 + }, + { + "epoch": 0.336271390489294, + "grad_norm": 0.87109375, + "learning_rate": 6.789674436300516e-05, + "loss": 1.0136, + "step": 2133 + }, + { + "epoch": 0.336429042336687, + "grad_norm": 0.90234375, + "learning_rate": 6.789238434079817e-05, + "loss": 0.8374, + "step": 2134 + }, + { + "epoch": 0.33658669418408005, + "grad_norm": 0.93359375, + "learning_rate": 6.788802438664144e-05, + "loss": 0.9635, + "step": 2135 + }, + { + "epoch": 0.336744346031473, + "grad_norm": 0.8203125, + "learning_rate": 6.78836645005442e-05, + "loss": 0.9721, + "step": 2136 + }, + { + "epoch": 0.33690199787886604, + "grad_norm": 1.0078125, + "learning_rate": 6.787930468251569e-05, + "loss": 1.1027, + "step": 2137 + }, + { + "epoch": 0.33705964972625907, + "grad_norm": 1.0078125, + "learning_rate": 6.787494493256519e-05, + "loss": 1.1275, + "step": 2138 + }, + { + "epoch": 0.3372173015736521, + "grad_norm": 0.91796875, + "learning_rate": 6.787058525070189e-05, + "loss": 0.9446, + "step": 2139 + }, + { + "epoch": 0.33737495342104507, + "grad_norm": 0.9140625, + "learning_rate": 6.786622563693503e-05, + "loss": 1.0358, + "step": 2140 + }, + { + "epoch": 0.3375326052684381, + "grad_norm": 0.94921875, + "learning_rate": 6.786186609127389e-05, + "loss": 1.1396, + "step": 2141 + }, + { + "epoch": 0.3376902571158311, + "grad_norm": 0.8828125, + "learning_rate": 6.785750661372766e-05, + "loss": 1.0144, + "step": 2142 + }, + { + "epoch": 0.33784790896322414, + "grad_norm": 0.94921875, + "learning_rate": 6.785314720430565e-05, + "loss": 1.1476, + "step": 2143 + }, + { + "epoch": 0.3380055608106171, + "grad_norm": 0.94140625, + "learning_rate": 6.784878786301703e-05, + "loss": 1.0698, + "step": 2144 + }, + { + "epoch": 0.33816321265801014, + "grad_norm": 0.953125, + "learning_rate": 6.784442858987108e-05, + "loss": 1.1043, + "step": 2145 + }, + { + "epoch": 0.33832086450540316, + "grad_norm": 0.9609375, + "learning_rate": 6.784006938487702e-05, + "loss": 1.008, + "step": 2146 + }, + { + "epoch": 0.3384785163527962, + "grad_norm": 0.85546875, + "learning_rate": 6.783571024804407e-05, + "loss": 0.8262, + "step": 2147 + }, + { + "epoch": 0.33863616820018916, + "grad_norm": 0.95703125, + "learning_rate": 6.783135117938152e-05, + "loss": 1.201, + "step": 2148 + }, + { + "epoch": 0.3387938200475822, + "grad_norm": 1.015625, + "learning_rate": 6.782699217889858e-05, + "loss": 1.0254, + "step": 2149 + }, + { + "epoch": 0.3389514718949752, + "grad_norm": 0.96484375, + "learning_rate": 6.782263324660449e-05, + "loss": 1.1882, + "step": 2150 + }, + { + "epoch": 0.33910912374236823, + "grad_norm": 1.078125, + "learning_rate": 6.78182743825085e-05, + "loss": 1.0499, + "step": 2151 + }, + { + "epoch": 0.3392667755897612, + "grad_norm": 1.0078125, + "learning_rate": 6.781391558661981e-05, + "loss": 1.0338, + "step": 2152 + }, + { + "epoch": 0.33942442743715423, + "grad_norm": 0.91015625, + "learning_rate": 6.78095568589477e-05, + "loss": 1.0153, + "step": 2153 + }, + { + "epoch": 0.33958207928454726, + "grad_norm": 1.0625, + "learning_rate": 6.780519819950141e-05, + "loss": 1.0766, + "step": 2154 + }, + { + "epoch": 0.3397397311319403, + "grad_norm": 0.9765625, + "learning_rate": 6.780083960829015e-05, + "loss": 1.1782, + "step": 2155 + }, + { + "epoch": 0.33989738297933325, + "grad_norm": 1.0390625, + "learning_rate": 6.779648108532319e-05, + "loss": 1.3827, + "step": 2156 + }, + { + "epoch": 0.3400550348267263, + "grad_norm": 1.0546875, + "learning_rate": 6.77921226306097e-05, + "loss": 1.3931, + "step": 2157 + }, + { + "epoch": 0.3402126866741193, + "grad_norm": 0.921875, + "learning_rate": 6.778776424415899e-05, + "loss": 1.1675, + "step": 2158 + }, + { + "epoch": 0.34037033852151233, + "grad_norm": 0.93359375, + "learning_rate": 6.778340592598029e-05, + "loss": 1.1134, + "step": 2159 + }, + { + "epoch": 0.3405279903689053, + "grad_norm": 0.828125, + "learning_rate": 6.777904767608281e-05, + "loss": 0.8249, + "step": 2160 + }, + { + "epoch": 0.3406856422162983, + "grad_norm": 0.95703125, + "learning_rate": 6.777468949447579e-05, + "loss": 1.0454, + "step": 2161 + }, + { + "epoch": 0.34084329406369135, + "grad_norm": 0.93359375, + "learning_rate": 6.777033138116846e-05, + "loss": 1.249, + "step": 2162 + }, + { + "epoch": 0.3410009459110844, + "grad_norm": 0.8359375, + "learning_rate": 6.77659733361701e-05, + "loss": 0.9667, + "step": 2163 + }, + { + "epoch": 0.34115859775847734, + "grad_norm": 0.96484375, + "learning_rate": 6.77616153594899e-05, + "loss": 1.1271, + "step": 2164 + }, + { + "epoch": 0.34131624960587037, + "grad_norm": 1.0703125, + "learning_rate": 6.775725745113713e-05, + "loss": 1.1888, + "step": 2165 + }, + { + "epoch": 0.3414739014532634, + "grad_norm": 0.9609375, + "learning_rate": 6.775289961112101e-05, + "loss": 0.9442, + "step": 2166 + }, + { + "epoch": 0.3416315533006564, + "grad_norm": 0.984375, + "learning_rate": 6.774854183945072e-05, + "loss": 1.026, + "step": 2167 + }, + { + "epoch": 0.3417892051480494, + "grad_norm": 1.046875, + "learning_rate": 6.774418413613561e-05, + "loss": 1.2214, + "step": 2168 + }, + { + "epoch": 0.3419468569954424, + "grad_norm": 1.0390625, + "learning_rate": 6.773982650118484e-05, + "loss": 1.1054, + "step": 2169 + }, + { + "epoch": 0.34210450884283544, + "grad_norm": 0.92578125, + "learning_rate": 6.773546893460769e-05, + "loss": 1.2413, + "step": 2170 + }, + { + "epoch": 0.34226216069022847, + "grad_norm": 0.86328125, + "learning_rate": 6.773111143641335e-05, + "loss": 1.0844, + "step": 2171 + }, + { + "epoch": 0.34241981253762144, + "grad_norm": 0.8984375, + "learning_rate": 6.772675400661104e-05, + "loss": 0.8381, + "step": 2172 + }, + { + "epoch": 0.34257746438501446, + "grad_norm": 0.8359375, + "learning_rate": 6.772239664521007e-05, + "loss": 1.0863, + "step": 2173 + }, + { + "epoch": 0.3427351162324075, + "grad_norm": 0.94921875, + "learning_rate": 6.771803935221963e-05, + "loss": 0.9231, + "step": 2174 + }, + { + "epoch": 0.3428927680798005, + "grad_norm": 1.0, + "learning_rate": 6.771368212764897e-05, + "loss": 0.9976, + "step": 2175 + }, + { + "epoch": 0.3430504199271935, + "grad_norm": 0.890625, + "learning_rate": 6.77093249715073e-05, + "loss": 1.0573, + "step": 2176 + }, + { + "epoch": 0.3432080717745865, + "grad_norm": 0.90234375, + "learning_rate": 6.770496788380385e-05, + "loss": 1.1379, + "step": 2177 + }, + { + "epoch": 0.34336572362197954, + "grad_norm": 0.91015625, + "learning_rate": 6.77006108645479e-05, + "loss": 1.0191, + "step": 2178 + }, + { + "epoch": 0.34352337546937256, + "grad_norm": 0.92578125, + "learning_rate": 6.769625391374867e-05, + "loss": 0.9361, + "step": 2179 + }, + { + "epoch": 0.34368102731676553, + "grad_norm": 0.8515625, + "learning_rate": 6.769189703141536e-05, + "loss": 1.1088, + "step": 2180 + }, + { + "epoch": 0.34383867916415856, + "grad_norm": 1.1015625, + "learning_rate": 6.768754021755726e-05, + "loss": 1.3022, + "step": 2181 + }, + { + "epoch": 0.3439963310115516, + "grad_norm": 0.953125, + "learning_rate": 6.768318347218353e-05, + "loss": 1.0419, + "step": 2182 + }, + { + "epoch": 0.3441539828589446, + "grad_norm": 1.03125, + "learning_rate": 6.767882679530347e-05, + "loss": 0.9761, + "step": 2183 + }, + { + "epoch": 0.3443116347063376, + "grad_norm": 0.99609375, + "learning_rate": 6.76744701869263e-05, + "loss": 1.0723, + "step": 2184 + }, + { + "epoch": 0.3444692865537306, + "grad_norm": 1.03125, + "learning_rate": 6.767011364706122e-05, + "loss": 1.1259, + "step": 2185 + }, + { + "epoch": 0.34462693840112363, + "grad_norm": 0.97265625, + "learning_rate": 6.766575717571751e-05, + "loss": 1.0985, + "step": 2186 + }, + { + "epoch": 0.34478459024851665, + "grad_norm": 0.97265625, + "learning_rate": 6.766140077290433e-05, + "loss": 1.1436, + "step": 2187 + }, + { + "epoch": 0.3449422420959096, + "grad_norm": 1.0078125, + "learning_rate": 6.7657044438631e-05, + "loss": 1.1508, + "step": 2188 + }, + { + "epoch": 0.34509989394330265, + "grad_norm": 0.9765625, + "learning_rate": 6.765268817290672e-05, + "loss": 1.0807, + "step": 2189 + }, + { + "epoch": 0.3452575457906957, + "grad_norm": 0.8203125, + "learning_rate": 6.764833197574072e-05, + "loss": 1.0054, + "step": 2190 + }, + { + "epoch": 0.3454151976380887, + "grad_norm": 0.91796875, + "learning_rate": 6.764397584714223e-05, + "loss": 1.0618, + "step": 2191 + }, + { + "epoch": 0.3455728494854817, + "grad_norm": 1.09375, + "learning_rate": 6.763961978712044e-05, + "loss": 1.1288, + "step": 2192 + }, + { + "epoch": 0.3457305013328747, + "grad_norm": 0.9453125, + "learning_rate": 6.763526379568466e-05, + "loss": 1.0887, + "step": 2193 + }, + { + "epoch": 0.3458881531802677, + "grad_norm": 1.0546875, + "learning_rate": 6.76309078728441e-05, + "loss": 1.3282, + "step": 2194 + }, + { + "epoch": 0.34604580502766075, + "grad_norm": 0.95703125, + "learning_rate": 6.762655201860798e-05, + "loss": 1.1897, + "step": 2195 + }, + { + "epoch": 0.34620345687505377, + "grad_norm": 0.96484375, + "learning_rate": 6.762219623298553e-05, + "loss": 1.3365, + "step": 2196 + }, + { + "epoch": 0.34636110872244674, + "grad_norm": 0.91015625, + "learning_rate": 6.761784051598594e-05, + "loss": 1.0132, + "step": 2197 + }, + { + "epoch": 0.34651876056983977, + "grad_norm": 0.82421875, + "learning_rate": 6.761348486761854e-05, + "loss": 1.0249, + "step": 2198 + }, + { + "epoch": 0.3466764124172328, + "grad_norm": 0.9453125, + "learning_rate": 6.76091292878925e-05, + "loss": 1.1909, + "step": 2199 + }, + { + "epoch": 0.3468340642646258, + "grad_norm": 0.97265625, + "learning_rate": 6.760477377681705e-05, + "loss": 1.0123, + "step": 2200 + }, + { + "epoch": 0.3469917161120188, + "grad_norm": 0.84765625, + "learning_rate": 6.760041833440144e-05, + "loss": 1.1197, + "step": 2201 + }, + { + "epoch": 0.3471493679594118, + "grad_norm": 0.8046875, + "learning_rate": 6.759606296065486e-05, + "loss": 0.9041, + "step": 2202 + }, + { + "epoch": 0.34730701980680484, + "grad_norm": 0.96484375, + "learning_rate": 6.75917076555866e-05, + "loss": 1.0531, + "step": 2203 + }, + { + "epoch": 0.34746467165419787, + "grad_norm": 0.83203125, + "learning_rate": 6.758735241920588e-05, + "loss": 0.7879, + "step": 2204 + }, + { + "epoch": 0.34762232350159084, + "grad_norm": 0.94140625, + "learning_rate": 6.758299725152189e-05, + "loss": 1.0953, + "step": 2205 + }, + { + "epoch": 0.34777997534898386, + "grad_norm": 0.98046875, + "learning_rate": 6.75786421525439e-05, + "loss": 1.0906, + "step": 2206 + }, + { + "epoch": 0.3479376271963769, + "grad_norm": 0.9296875, + "learning_rate": 6.757428712228107e-05, + "loss": 0.9932, + "step": 2207 + }, + { + "epoch": 0.3480952790437699, + "grad_norm": 0.890625, + "learning_rate": 6.756993216074275e-05, + "loss": 0.9871, + "step": 2208 + }, + { + "epoch": 0.3482529308911629, + "grad_norm": 0.9296875, + "learning_rate": 6.756557726793808e-05, + "loss": 1.07, + "step": 2209 + }, + { + "epoch": 0.3484105827385559, + "grad_norm": 1.03125, + "learning_rate": 6.756122244387632e-05, + "loss": 0.9949, + "step": 2210 + }, + { + "epoch": 0.34856823458594893, + "grad_norm": 1.09375, + "learning_rate": 6.75568676885667e-05, + "loss": 1.1843, + "step": 2211 + }, + { + "epoch": 0.34872588643334196, + "grad_norm": 1.0078125, + "learning_rate": 6.75525130020184e-05, + "loss": 1.0812, + "step": 2212 + }, + { + "epoch": 0.34888353828073493, + "grad_norm": 0.94140625, + "learning_rate": 6.754815838424074e-05, + "loss": 1.1916, + "step": 2213 + }, + { + "epoch": 0.34904119012812795, + "grad_norm": 1.015625, + "learning_rate": 6.75438038352429e-05, + "loss": 1.0848, + "step": 2214 + }, + { + "epoch": 0.349198841975521, + "grad_norm": 0.98046875, + "learning_rate": 6.753944935503412e-05, + "loss": 1.2739, + "step": 2215 + }, + { + "epoch": 0.349356493822914, + "grad_norm": 0.890625, + "learning_rate": 6.75350949436236e-05, + "loss": 1.1133, + "step": 2216 + }, + { + "epoch": 0.349514145670307, + "grad_norm": 0.92578125, + "learning_rate": 6.75307406010206e-05, + "loss": 1.0498, + "step": 2217 + }, + { + "epoch": 0.3496717975177, + "grad_norm": 0.98828125, + "learning_rate": 6.752638632723431e-05, + "loss": 0.9045, + "step": 2218 + }, + { + "epoch": 0.349829449365093, + "grad_norm": 0.93359375, + "learning_rate": 6.752203212227402e-05, + "loss": 1.0925, + "step": 2219 + }, + { + "epoch": 0.34998710121248605, + "grad_norm": 0.89453125, + "learning_rate": 6.751767798614894e-05, + "loss": 0.9421, + "step": 2220 + }, + { + "epoch": 0.350144753059879, + "grad_norm": 0.9921875, + "learning_rate": 6.751332391886827e-05, + "loss": 1.1928, + "step": 2221 + }, + { + "epoch": 0.35030240490727205, + "grad_norm": 0.96484375, + "learning_rate": 6.750896992044126e-05, + "loss": 1.1387, + "step": 2222 + }, + { + "epoch": 0.3504600567546651, + "grad_norm": 1.0390625, + "learning_rate": 6.75046159908771e-05, + "loss": 1.0339, + "step": 2223 + }, + { + "epoch": 0.3506177086020581, + "grad_norm": 0.8671875, + "learning_rate": 6.750026213018509e-05, + "loss": 0.9718, + "step": 2224 + }, + { + "epoch": 0.35077536044945107, + "grad_norm": 0.96875, + "learning_rate": 6.749590833837438e-05, + "loss": 1.1879, + "step": 2225 + }, + { + "epoch": 0.3509330122968441, + "grad_norm": 0.8359375, + "learning_rate": 6.749155461545427e-05, + "loss": 0.9013, + "step": 2226 + }, + { + "epoch": 0.3510906641442371, + "grad_norm": 0.90625, + "learning_rate": 6.748720096143392e-05, + "loss": 1.1077, + "step": 2227 + }, + { + "epoch": 0.35124831599163014, + "grad_norm": 0.91015625, + "learning_rate": 6.74828473763226e-05, + "loss": 0.9664, + "step": 2228 + }, + { + "epoch": 0.3514059678390231, + "grad_norm": 1.0390625, + "learning_rate": 6.747849386012954e-05, + "loss": 1.1624, + "step": 2229 + }, + { + "epoch": 0.35156361968641614, + "grad_norm": 0.98828125, + "learning_rate": 6.747414041286395e-05, + "loss": 1.1076, + "step": 2230 + }, + { + "epoch": 0.35172127153380917, + "grad_norm": 1.03125, + "learning_rate": 6.746978703453506e-05, + "loss": 0.9155, + "step": 2231 + }, + { + "epoch": 0.3518789233812022, + "grad_norm": 0.92578125, + "learning_rate": 6.74654337251521e-05, + "loss": 1.1402, + "step": 2232 + }, + { + "epoch": 0.35203657522859516, + "grad_norm": 0.90625, + "learning_rate": 6.746108048472424e-05, + "loss": 1.0794, + "step": 2233 + }, + { + "epoch": 0.3521942270759882, + "grad_norm": 0.9296875, + "learning_rate": 6.745672731326082e-05, + "loss": 0.8526, + "step": 2234 + }, + { + "epoch": 0.3523518789233812, + "grad_norm": 0.99609375, + "learning_rate": 6.7452374210771e-05, + "loss": 1.03, + "step": 2235 + }, + { + "epoch": 0.35250953077077424, + "grad_norm": 0.86328125, + "learning_rate": 6.744802117726401e-05, + "loss": 0.9623, + "step": 2236 + }, + { + "epoch": 0.3526671826181672, + "grad_norm": 1.3828125, + "learning_rate": 6.744366821274907e-05, + "loss": 1.0031, + "step": 2237 + }, + { + "epoch": 0.35282483446556023, + "grad_norm": 0.89453125, + "learning_rate": 6.743931531723539e-05, + "loss": 0.9254, + "step": 2238 + }, + { + "epoch": 0.35298248631295326, + "grad_norm": 0.921875, + "learning_rate": 6.743496249073225e-05, + "loss": 1.2715, + "step": 2239 + }, + { + "epoch": 0.3531401381603463, + "grad_norm": 0.93359375, + "learning_rate": 6.743060973324885e-05, + "loss": 1.109, + "step": 2240 + }, + { + "epoch": 0.35329779000773925, + "grad_norm": 1.0078125, + "learning_rate": 6.74262570447944e-05, + "loss": 1.1213, + "step": 2241 + }, + { + "epoch": 0.3534554418551323, + "grad_norm": 0.859375, + "learning_rate": 6.742190442537814e-05, + "loss": 0.9054, + "step": 2242 + }, + { + "epoch": 0.3536130937025253, + "grad_norm": 0.9296875, + "learning_rate": 6.741755187500928e-05, + "loss": 1.0821, + "step": 2243 + }, + { + "epoch": 0.35377074554991833, + "grad_norm": 0.97265625, + "learning_rate": 6.741319939369706e-05, + "loss": 1.0868, + "step": 2244 + }, + { + "epoch": 0.3539283973973113, + "grad_norm": 0.99609375, + "learning_rate": 6.740884698145072e-05, + "loss": 0.8927, + "step": 2245 + }, + { + "epoch": 0.3540860492447043, + "grad_norm": 1.0703125, + "learning_rate": 6.740449463827947e-05, + "loss": 1.2288, + "step": 2246 + }, + { + "epoch": 0.35424370109209735, + "grad_norm": 0.85546875, + "learning_rate": 6.740014236419252e-05, + "loss": 1.0025, + "step": 2247 + }, + { + "epoch": 0.3544013529394904, + "grad_norm": 0.8828125, + "learning_rate": 6.739579015919909e-05, + "loss": 0.9829, + "step": 2248 + }, + { + "epoch": 0.35455900478688335, + "grad_norm": 0.8125, + "learning_rate": 6.739143802330843e-05, + "loss": 0.8915, + "step": 2249 + }, + { + "epoch": 0.3547166566342764, + "grad_norm": 0.9765625, + "learning_rate": 6.738708595652976e-05, + "loss": 1.0, + "step": 2250 + }, + { + "epoch": 0.3548743084816694, + "grad_norm": 0.88671875, + "learning_rate": 6.738273395887229e-05, + "loss": 1.0099, + "step": 2251 + }, + { + "epoch": 0.3550319603290624, + "grad_norm": 0.828125, + "learning_rate": 6.737838203034526e-05, + "loss": 1.0821, + "step": 2252 + }, + { + "epoch": 0.3551896121764554, + "grad_norm": 0.97265625, + "learning_rate": 6.737403017095785e-05, + "loss": 1.0546, + "step": 2253 + }, + { + "epoch": 0.3553472640238484, + "grad_norm": 0.921875, + "learning_rate": 6.736967838071935e-05, + "loss": 1.2094, + "step": 2254 + }, + { + "epoch": 0.35550491587124144, + "grad_norm": 1.1015625, + "learning_rate": 6.736532665963894e-05, + "loss": 0.9994, + "step": 2255 + }, + { + "epoch": 0.35566256771863447, + "grad_norm": 1.0546875, + "learning_rate": 6.736097500772587e-05, + "loss": 1.0833, + "step": 2256 + }, + { + "epoch": 0.35582021956602744, + "grad_norm": 1.03125, + "learning_rate": 6.735662342498935e-05, + "loss": 1.1034, + "step": 2257 + }, + { + "epoch": 0.35597787141342047, + "grad_norm": 0.9453125, + "learning_rate": 6.735227191143856e-05, + "loss": 1.0425, + "step": 2258 + }, + { + "epoch": 0.3561355232608135, + "grad_norm": 0.94921875, + "learning_rate": 6.73479204670828e-05, + "loss": 1.065, + "step": 2259 + }, + { + "epoch": 0.3562931751082065, + "grad_norm": 1.0859375, + "learning_rate": 6.734356909193125e-05, + "loss": 1.4038, + "step": 2260 + }, + { + "epoch": 0.3564508269555995, + "grad_norm": 0.99609375, + "learning_rate": 6.733921778599315e-05, + "loss": 1.0652, + "step": 2261 + }, + { + "epoch": 0.3566084788029925, + "grad_norm": 0.96875, + "learning_rate": 6.73348665492777e-05, + "loss": 1.0583, + "step": 2262 + }, + { + "epoch": 0.35676613065038554, + "grad_norm": 0.91015625, + "learning_rate": 6.73305153817941e-05, + "loss": 1.0196, + "step": 2263 + }, + { + "epoch": 0.35692378249777856, + "grad_norm": 0.91796875, + "learning_rate": 6.732616428355165e-05, + "loss": 0.9423, + "step": 2264 + }, + { + "epoch": 0.35708143434517153, + "grad_norm": 0.87890625, + "learning_rate": 6.732181325455951e-05, + "loss": 1.0701, + "step": 2265 + }, + { + "epoch": 0.35723908619256456, + "grad_norm": 0.9375, + "learning_rate": 6.731746229482694e-05, + "loss": 0.9669, + "step": 2266 + }, + { + "epoch": 0.3573967380399576, + "grad_norm": 0.86328125, + "learning_rate": 6.731311140436312e-05, + "loss": 1.0199, + "step": 2267 + }, + { + "epoch": 0.3575543898873506, + "grad_norm": 0.83984375, + "learning_rate": 6.730876058317729e-05, + "loss": 0.9333, + "step": 2268 + }, + { + "epoch": 0.3577120417347436, + "grad_norm": 0.90625, + "learning_rate": 6.730440983127869e-05, + "loss": 0.8572, + "step": 2269 + }, + { + "epoch": 0.3578696935821366, + "grad_norm": 0.8828125, + "learning_rate": 6.730005914867653e-05, + "loss": 0.8859, + "step": 2270 + }, + { + "epoch": 0.35802734542952963, + "grad_norm": 0.828125, + "learning_rate": 6.729570853538001e-05, + "loss": 1.0453, + "step": 2271 + }, + { + "epoch": 0.35818499727692266, + "grad_norm": 1.015625, + "learning_rate": 6.729135799139836e-05, + "loss": 1.4755, + "step": 2272 + }, + { + "epoch": 0.3583426491243156, + "grad_norm": 0.875, + "learning_rate": 6.728700751674079e-05, + "loss": 1.1053, + "step": 2273 + }, + { + "epoch": 0.35850030097170865, + "grad_norm": 0.92578125, + "learning_rate": 6.728265711141657e-05, + "loss": 1.1808, + "step": 2274 + }, + { + "epoch": 0.3586579528191017, + "grad_norm": 0.86328125, + "learning_rate": 6.727830677543489e-05, + "loss": 0.8523, + "step": 2275 + }, + { + "epoch": 0.3588156046664947, + "grad_norm": 0.87109375, + "learning_rate": 6.727395650880496e-05, + "loss": 1.0509, + "step": 2276 + }, + { + "epoch": 0.3589732565138877, + "grad_norm": 0.92578125, + "learning_rate": 6.726960631153602e-05, + "loss": 1.0758, + "step": 2277 + }, + { + "epoch": 0.3591309083612807, + "grad_norm": 1.0390625, + "learning_rate": 6.726525618363722e-05, + "loss": 1.1373, + "step": 2278 + }, + { + "epoch": 0.3592885602086737, + "grad_norm": 0.828125, + "learning_rate": 6.72609061251179e-05, + "loss": 0.8261, + "step": 2279 + }, + { + "epoch": 0.35944621205606675, + "grad_norm": 0.9296875, + "learning_rate": 6.72565561359872e-05, + "loss": 1.0646, + "step": 2280 + }, + { + "epoch": 0.3596038639034597, + "grad_norm": 0.92578125, + "learning_rate": 6.725220621625437e-05, + "loss": 1.0613, + "step": 2281 + }, + { + "epoch": 0.35976151575085275, + "grad_norm": 0.9453125, + "learning_rate": 6.724785636592861e-05, + "loss": 1.0387, + "step": 2282 + }, + { + "epoch": 0.35991916759824577, + "grad_norm": 0.92578125, + "learning_rate": 6.724350658501912e-05, + "loss": 0.9041, + "step": 2283 + }, + { + "epoch": 0.3600768194456388, + "grad_norm": 0.93359375, + "learning_rate": 6.723915687353517e-05, + "loss": 1.1569, + "step": 2284 + }, + { + "epoch": 0.36023447129303177, + "grad_norm": 0.99609375, + "learning_rate": 6.723480723148596e-05, + "loss": 1.1649, + "step": 2285 + }, + { + "epoch": 0.3603921231404248, + "grad_norm": 1.1171875, + "learning_rate": 6.723045765888071e-05, + "loss": 1.0411, + "step": 2286 + }, + { + "epoch": 0.3605497749878178, + "grad_norm": 0.9609375, + "learning_rate": 6.722610815572863e-05, + "loss": 0.9436, + "step": 2287 + }, + { + "epoch": 0.36070742683521084, + "grad_norm": 0.9765625, + "learning_rate": 6.72217587220389e-05, + "loss": 1.1826, + "step": 2288 + }, + { + "epoch": 0.3608650786826038, + "grad_norm": 0.74609375, + "learning_rate": 6.721740935782081e-05, + "loss": 0.7414, + "step": 2289 + }, + { + "epoch": 0.36102273052999684, + "grad_norm": 1.0078125, + "learning_rate": 6.721306006308357e-05, + "loss": 1.1439, + "step": 2290 + }, + { + "epoch": 0.36118038237738986, + "grad_norm": 0.96875, + "learning_rate": 6.720871083783636e-05, + "loss": 0.9869, + "step": 2291 + }, + { + "epoch": 0.3613380342247829, + "grad_norm": 0.796875, + "learning_rate": 6.720436168208839e-05, + "loss": 1.1139, + "step": 2292 + }, + { + "epoch": 0.36149568607217586, + "grad_norm": 0.91796875, + "learning_rate": 6.720001259584889e-05, + "loss": 1.1633, + "step": 2293 + }, + { + "epoch": 0.3616533379195689, + "grad_norm": 0.93359375, + "learning_rate": 6.719566357912712e-05, + "loss": 1.1726, + "step": 2294 + }, + { + "epoch": 0.3618109897669619, + "grad_norm": 0.8671875, + "learning_rate": 6.719131463193227e-05, + "loss": 0.9629, + "step": 2295 + }, + { + "epoch": 0.36196864161435494, + "grad_norm": 1.015625, + "learning_rate": 6.718696575427354e-05, + "loss": 0.9552, + "step": 2296 + }, + { + "epoch": 0.3621262934617479, + "grad_norm": 1.015625, + "learning_rate": 6.718261694616018e-05, + "loss": 1.2482, + "step": 2297 + }, + { + "epoch": 0.36228394530914093, + "grad_norm": 0.94140625, + "learning_rate": 6.717826820760134e-05, + "loss": 1.0721, + "step": 2298 + }, + { + "epoch": 0.36244159715653396, + "grad_norm": 0.9453125, + "learning_rate": 6.717391953860633e-05, + "loss": 0.9532, + "step": 2299 + }, + { + "epoch": 0.362599249003927, + "grad_norm": 0.87890625, + "learning_rate": 6.71695709391843e-05, + "loss": 0.9664, + "step": 2300 + }, + { + "epoch": 0.36275690085131995, + "grad_norm": 0.98828125, + "learning_rate": 6.71652224093445e-05, + "loss": 1.2529, + "step": 2301 + }, + { + "epoch": 0.362914552698713, + "grad_norm": 1.0078125, + "learning_rate": 6.716087394909613e-05, + "loss": 1.0475, + "step": 2302 + }, + { + "epoch": 0.363072204546106, + "grad_norm": 0.984375, + "learning_rate": 6.715652555844838e-05, + "loss": 1.207, + "step": 2303 + }, + { + "epoch": 0.36322985639349903, + "grad_norm": 0.8515625, + "learning_rate": 6.715217723741054e-05, + "loss": 1.0334, + "step": 2304 + }, + { + "epoch": 0.363387508240892, + "grad_norm": 0.95703125, + "learning_rate": 6.714782898599176e-05, + "loss": 1.1563, + "step": 2305 + }, + { + "epoch": 0.363545160088285, + "grad_norm": 0.859375, + "learning_rate": 6.714348080420131e-05, + "loss": 1.0027, + "step": 2306 + }, + { + "epoch": 0.36370281193567805, + "grad_norm": 0.91015625, + "learning_rate": 6.713913269204836e-05, + "loss": 1.192, + "step": 2307 + }, + { + "epoch": 0.3638604637830711, + "grad_norm": 0.7734375, + "learning_rate": 6.713478464954212e-05, + "loss": 0.9898, + "step": 2308 + }, + { + "epoch": 0.36401811563046405, + "grad_norm": 0.875, + "learning_rate": 6.713043667669184e-05, + "loss": 1.0811, + "step": 2309 + }, + { + "epoch": 0.36417576747785707, + "grad_norm": 0.9765625, + "learning_rate": 6.712608877350673e-05, + "loss": 0.9769, + "step": 2310 + }, + { + "epoch": 0.3643334193252501, + "grad_norm": 0.93359375, + "learning_rate": 6.712174093999597e-05, + "loss": 0.924, + "step": 2311 + }, + { + "epoch": 0.3644910711726431, + "grad_norm": 0.74609375, + "learning_rate": 6.711739317616882e-05, + "loss": 0.9438, + "step": 2312 + }, + { + "epoch": 0.3646487230200361, + "grad_norm": 1.046875, + "learning_rate": 6.711304548203446e-05, + "loss": 1.321, + "step": 2313 + }, + { + "epoch": 0.3648063748674291, + "grad_norm": 0.92578125, + "learning_rate": 6.710869785760214e-05, + "loss": 1.1431, + "step": 2314 + }, + { + "epoch": 0.36496402671482214, + "grad_norm": 0.875, + "learning_rate": 6.710435030288105e-05, + "loss": 1.0735, + "step": 2315 + }, + { + "epoch": 0.36512167856221517, + "grad_norm": 0.8671875, + "learning_rate": 6.710000281788041e-05, + "loss": 0.8637, + "step": 2316 + }, + { + "epoch": 0.36527933040960814, + "grad_norm": 1.0078125, + "learning_rate": 6.709565540260944e-05, + "loss": 1.105, + "step": 2317 + }, + { + "epoch": 0.36543698225700116, + "grad_norm": 0.97265625, + "learning_rate": 6.709130805707731e-05, + "loss": 1.1873, + "step": 2318 + }, + { + "epoch": 0.3655946341043942, + "grad_norm": 1.0, + "learning_rate": 6.70869607812933e-05, + "loss": 1.0989, + "step": 2319 + }, + { + "epoch": 0.3657522859517872, + "grad_norm": 1.0078125, + "learning_rate": 6.70826135752666e-05, + "loss": 1.0525, + "step": 2320 + }, + { + "epoch": 0.3659099377991802, + "grad_norm": 0.87890625, + "learning_rate": 6.707826643900641e-05, + "loss": 1.0107, + "step": 2321 + }, + { + "epoch": 0.3660675896465732, + "grad_norm": 0.890625, + "learning_rate": 6.707391937252196e-05, + "loss": 1.1362, + "step": 2322 + }, + { + "epoch": 0.36622524149396624, + "grad_norm": 0.94921875, + "learning_rate": 6.706957237582243e-05, + "loss": 1.0402, + "step": 2323 + }, + { + "epoch": 0.36638289334135926, + "grad_norm": 0.9296875, + "learning_rate": 6.706522544891709e-05, + "loss": 0.9442, + "step": 2324 + }, + { + "epoch": 0.36654054518875223, + "grad_norm": 0.9765625, + "learning_rate": 6.706087859181512e-05, + "loss": 1.2302, + "step": 2325 + }, + { + "epoch": 0.36669819703614526, + "grad_norm": 0.953125, + "learning_rate": 6.705653180452574e-05, + "loss": 1.0468, + "step": 2326 + }, + { + "epoch": 0.3668558488835383, + "grad_norm": 1.0859375, + "learning_rate": 6.705218508705814e-05, + "loss": 1.2013, + "step": 2327 + }, + { + "epoch": 0.3670135007309313, + "grad_norm": 0.9609375, + "learning_rate": 6.704783843942155e-05, + "loss": 1.0828, + "step": 2328 + }, + { + "epoch": 0.3671711525783243, + "grad_norm": 1.1171875, + "learning_rate": 6.704349186162517e-05, + "loss": 1.1, + "step": 2329 + }, + { + "epoch": 0.3673288044257173, + "grad_norm": 0.90625, + "learning_rate": 6.703914535367825e-05, + "loss": 0.9595, + "step": 2330 + }, + { + "epoch": 0.36748645627311033, + "grad_norm": 0.98046875, + "learning_rate": 6.703479891558998e-05, + "loss": 1.1449, + "step": 2331 + }, + { + "epoch": 0.36764410812050335, + "grad_norm": 0.96875, + "learning_rate": 6.703045254736957e-05, + "loss": 1.056, + "step": 2332 + }, + { + "epoch": 0.3678017599678963, + "grad_norm": 1.1015625, + "learning_rate": 6.702610624902621e-05, + "loss": 1.1571, + "step": 2333 + }, + { + "epoch": 0.36795941181528935, + "grad_norm": 0.94921875, + "learning_rate": 6.702176002056915e-05, + "loss": 1.0523, + "step": 2334 + }, + { + "epoch": 0.3681170636626824, + "grad_norm": 0.83984375, + "learning_rate": 6.70174138620076e-05, + "loss": 0.9799, + "step": 2335 + }, + { + "epoch": 0.3682747155100754, + "grad_norm": 1.0234375, + "learning_rate": 6.701306777335072e-05, + "loss": 1.0972, + "step": 2336 + }, + { + "epoch": 0.36843236735746837, + "grad_norm": 0.8984375, + "learning_rate": 6.70087217546078e-05, + "loss": 1.2335, + "step": 2337 + }, + { + "epoch": 0.3685900192048614, + "grad_norm": 0.98046875, + "learning_rate": 6.700437580578793e-05, + "loss": 1.1157, + "step": 2338 + }, + { + "epoch": 0.3687476710522544, + "grad_norm": 0.9296875, + "learning_rate": 6.700002992690046e-05, + "loss": 1.0694, + "step": 2339 + }, + { + "epoch": 0.36890532289964745, + "grad_norm": 0.8984375, + "learning_rate": 6.699568411795455e-05, + "loss": 0.9397, + "step": 2340 + }, + { + "epoch": 0.3690629747470404, + "grad_norm": 1.03125, + "learning_rate": 6.699133837895938e-05, + "loss": 1.1, + "step": 2341 + }, + { + "epoch": 0.36922062659443344, + "grad_norm": 0.91015625, + "learning_rate": 6.69869927099242e-05, + "loss": 1.0253, + "step": 2342 + }, + { + "epoch": 0.36937827844182647, + "grad_norm": 0.82421875, + "learning_rate": 6.698264711085816e-05, + "loss": 0.8466, + "step": 2343 + }, + { + "epoch": 0.3695359302892195, + "grad_norm": 0.99609375, + "learning_rate": 6.697830158177054e-05, + "loss": 1.078, + "step": 2344 + }, + { + "epoch": 0.36969358213661246, + "grad_norm": 0.8984375, + "learning_rate": 6.697395612267052e-05, + "loss": 0.9842, + "step": 2345 + }, + { + "epoch": 0.3698512339840055, + "grad_norm": 0.91796875, + "learning_rate": 6.696961073356733e-05, + "loss": 1.0337, + "step": 2346 + }, + { + "epoch": 0.3700088858313985, + "grad_norm": 0.90625, + "learning_rate": 6.696526541447015e-05, + "loss": 1.1329, + "step": 2347 + }, + { + "epoch": 0.37016653767879154, + "grad_norm": 0.99609375, + "learning_rate": 6.696092016538818e-05, + "loss": 0.9966, + "step": 2348 + }, + { + "epoch": 0.37032418952618457, + "grad_norm": 0.78125, + "learning_rate": 6.695657498633067e-05, + "loss": 0.978, + "step": 2349 + }, + { + "epoch": 0.37048184137357754, + "grad_norm": 0.97265625, + "learning_rate": 6.695222987730681e-05, + "loss": 1.1548, + "step": 2350 + }, + { + "epoch": 0.37063949322097056, + "grad_norm": 1.0078125, + "learning_rate": 6.694788483832581e-05, + "loss": 1.0842, + "step": 2351 + }, + { + "epoch": 0.3707971450683636, + "grad_norm": 1.0546875, + "learning_rate": 6.69435398693969e-05, + "loss": 1.223, + "step": 2352 + }, + { + "epoch": 0.3709547969157566, + "grad_norm": 0.85546875, + "learning_rate": 6.693919497052923e-05, + "loss": 1.249, + "step": 2353 + }, + { + "epoch": 0.3711124487631496, + "grad_norm": 0.8984375, + "learning_rate": 6.693485014173207e-05, + "loss": 1.15, + "step": 2354 + }, + { + "epoch": 0.3712701006105426, + "grad_norm": 0.87890625, + "learning_rate": 6.693050538301461e-05, + "loss": 0.9304, + "step": 2355 + }, + { + "epoch": 0.37142775245793563, + "grad_norm": 0.87109375, + "learning_rate": 6.692616069438604e-05, + "loss": 1.0236, + "step": 2356 + }, + { + "epoch": 0.37158540430532866, + "grad_norm": 1.015625, + "learning_rate": 6.692181607585559e-05, + "loss": 1.1574, + "step": 2357 + }, + { + "epoch": 0.37174305615272163, + "grad_norm": 0.8828125, + "learning_rate": 6.691747152743247e-05, + "loss": 1.0662, + "step": 2358 + }, + { + "epoch": 0.37190070800011465, + "grad_norm": 0.97265625, + "learning_rate": 6.691312704912582e-05, + "loss": 1.0855, + "step": 2359 + }, + { + "epoch": 0.3720583598475077, + "grad_norm": 0.94140625, + "learning_rate": 6.690878264094496e-05, + "loss": 0.8673, + "step": 2360 + }, + { + "epoch": 0.3722160116949007, + "grad_norm": 0.92578125, + "learning_rate": 6.690443830289904e-05, + "loss": 1.0486, + "step": 2361 + }, + { + "epoch": 0.3723736635422937, + "grad_norm": 0.91015625, + "learning_rate": 6.690009403499728e-05, + "loss": 1.0667, + "step": 2362 + }, + { + "epoch": 0.3725313153896867, + "grad_norm": 0.89453125, + "learning_rate": 6.689574983724887e-05, + "loss": 0.9216, + "step": 2363 + }, + { + "epoch": 0.3726889672370797, + "grad_norm": 0.90625, + "learning_rate": 6.689140570966298e-05, + "loss": 1.0748, + "step": 2364 + }, + { + "epoch": 0.37284661908447275, + "grad_norm": 0.9375, + "learning_rate": 6.688706165224891e-05, + "loss": 1.2729, + "step": 2365 + }, + { + "epoch": 0.3730042709318657, + "grad_norm": 0.82421875, + "learning_rate": 6.688271766501581e-05, + "loss": 1.0533, + "step": 2366 + }, + { + "epoch": 0.37316192277925875, + "grad_norm": 0.8671875, + "learning_rate": 6.687837374797291e-05, + "loss": 1.0134, + "step": 2367 + }, + { + "epoch": 0.3733195746266518, + "grad_norm": 0.9375, + "learning_rate": 6.68740299011294e-05, + "loss": 1.068, + "step": 2368 + }, + { + "epoch": 0.3734772264740448, + "grad_norm": 0.98828125, + "learning_rate": 6.686968612449445e-05, + "loss": 1.0722, + "step": 2369 + }, + { + "epoch": 0.37363487832143777, + "grad_norm": 0.94140625, + "learning_rate": 6.686534241807734e-05, + "loss": 1.0215, + "step": 2370 + }, + { + "epoch": 0.3737925301688308, + "grad_norm": 0.90625, + "learning_rate": 6.686099878188725e-05, + "loss": 1.0173, + "step": 2371 + }, + { + "epoch": 0.3739501820162238, + "grad_norm": 0.81640625, + "learning_rate": 6.685665521593339e-05, + "loss": 0.7034, + "step": 2372 + }, + { + "epoch": 0.37410783386361685, + "grad_norm": 0.8828125, + "learning_rate": 6.685231172022494e-05, + "loss": 1.063, + "step": 2373 + }, + { + "epoch": 0.3742654857110098, + "grad_norm": 0.89453125, + "learning_rate": 6.68479682947711e-05, + "loss": 0.9328, + "step": 2374 + }, + { + "epoch": 0.37442313755840284, + "grad_norm": 0.8828125, + "learning_rate": 6.684362493958112e-05, + "loss": 0.9069, + "step": 2375 + }, + { + "epoch": 0.37458078940579587, + "grad_norm": 0.9375, + "learning_rate": 6.683928165466418e-05, + "loss": 1.0216, + "step": 2376 + }, + { + "epoch": 0.3747384412531889, + "grad_norm": 1.109375, + "learning_rate": 6.683493844002949e-05, + "loss": 1.0683, + "step": 2377 + }, + { + "epoch": 0.37489609310058186, + "grad_norm": 0.921875, + "learning_rate": 6.683059529568622e-05, + "loss": 0.9968, + "step": 2378 + }, + { + "epoch": 0.3750537449479749, + "grad_norm": 0.92578125, + "learning_rate": 6.682625222164362e-05, + "loss": 1.1002, + "step": 2379 + }, + { + "epoch": 0.3752113967953679, + "grad_norm": 0.91796875, + "learning_rate": 6.682190921791091e-05, + "loss": 1.2517, + "step": 2380 + }, + { + "epoch": 0.37536904864276094, + "grad_norm": 0.9140625, + "learning_rate": 6.681756628449727e-05, + "loss": 1.1317, + "step": 2381 + }, + { + "epoch": 0.3755267004901539, + "grad_norm": 0.875, + "learning_rate": 6.681322342141188e-05, + "loss": 0.9104, + "step": 2382 + }, + { + "epoch": 0.37568435233754693, + "grad_norm": 0.9140625, + "learning_rate": 6.680888062866397e-05, + "loss": 1.2512, + "step": 2383 + }, + { + "epoch": 0.37584200418493996, + "grad_norm": 0.86328125, + "learning_rate": 6.680453790626271e-05, + "loss": 1.1783, + "step": 2384 + }, + { + "epoch": 0.375999656032333, + "grad_norm": 1.0234375, + "learning_rate": 6.680019525421737e-05, + "loss": 1.2468, + "step": 2385 + }, + { + "epoch": 0.37615730787972596, + "grad_norm": 1.140625, + "learning_rate": 6.679585267253712e-05, + "loss": 1.0994, + "step": 2386 + }, + { + "epoch": 0.376314959727119, + "grad_norm": 0.921875, + "learning_rate": 6.679151016123116e-05, + "loss": 1.1314, + "step": 2387 + }, + { + "epoch": 0.376472611574512, + "grad_norm": 1.0078125, + "learning_rate": 6.67871677203087e-05, + "loss": 1.1308, + "step": 2388 + }, + { + "epoch": 0.37663026342190503, + "grad_norm": 0.8984375, + "learning_rate": 6.67828253497789e-05, + "loss": 1.0111, + "step": 2389 + }, + { + "epoch": 0.376787915269298, + "grad_norm": 0.9453125, + "learning_rate": 6.677848304965104e-05, + "loss": 0.974, + "step": 2390 + }, + { + "epoch": 0.376945567116691, + "grad_norm": 0.83984375, + "learning_rate": 6.67741408199343e-05, + "loss": 0.9218, + "step": 2391 + }, + { + "epoch": 0.37710321896408405, + "grad_norm": 0.8359375, + "learning_rate": 6.676979866063784e-05, + "loss": 0.9949, + "step": 2392 + }, + { + "epoch": 0.3772608708114771, + "grad_norm": 0.93359375, + "learning_rate": 6.676545657177091e-05, + "loss": 1.2279, + "step": 2393 + }, + { + "epoch": 0.37741852265887005, + "grad_norm": 0.9375, + "learning_rate": 6.676111455334269e-05, + "loss": 1.0802, + "step": 2394 + }, + { + "epoch": 0.3775761745062631, + "grad_norm": 1.015625, + "learning_rate": 6.675677260536237e-05, + "loss": 1.1283, + "step": 2395 + }, + { + "epoch": 0.3777338263536561, + "grad_norm": 1.046875, + "learning_rate": 6.675243072783919e-05, + "loss": 1.1182, + "step": 2396 + }, + { + "epoch": 0.3778914782010491, + "grad_norm": 0.94140625, + "learning_rate": 6.674808892078232e-05, + "loss": 1.0163, + "step": 2397 + }, + { + "epoch": 0.3780491300484421, + "grad_norm": 0.859375, + "learning_rate": 6.6743747184201e-05, + "loss": 0.9343, + "step": 2398 + }, + { + "epoch": 0.3782067818958351, + "grad_norm": 0.96484375, + "learning_rate": 6.673940551810438e-05, + "loss": 0.9897, + "step": 2399 + }, + { + "epoch": 0.37836443374322815, + "grad_norm": 1.0546875, + "learning_rate": 6.673506392250171e-05, + "loss": 1.0946, + "step": 2400 + }, + { + "epoch": 0.37852208559062117, + "grad_norm": 0.9375, + "learning_rate": 6.673072239740216e-05, + "loss": 1.0103, + "step": 2401 + }, + { + "epoch": 0.37867973743801414, + "grad_norm": 0.890625, + "learning_rate": 6.672638094281494e-05, + "loss": 1.0068, + "step": 2402 + }, + { + "epoch": 0.37883738928540717, + "grad_norm": 1.03125, + "learning_rate": 6.672203955874928e-05, + "loss": 1.213, + "step": 2403 + }, + { + "epoch": 0.3789950411328002, + "grad_norm": 1.0546875, + "learning_rate": 6.671769824521427e-05, + "loss": 1.0481, + "step": 2404 + }, + { + "epoch": 0.3791526929801932, + "grad_norm": 0.8671875, + "learning_rate": 6.671335700221927e-05, + "loss": 0.9311, + "step": 2405 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 1.0234375, + "learning_rate": 6.670901582977342e-05, + "loss": 1.2619, + "step": 2406 + }, + { + "epoch": 0.3794679966749792, + "grad_norm": 0.9765625, + "learning_rate": 6.670467472788589e-05, + "loss": 1.101, + "step": 2407 + }, + { + "epoch": 0.37962564852237224, + "grad_norm": 0.8515625, + "learning_rate": 6.670033369656589e-05, + "loss": 1.0371, + "step": 2408 + }, + { + "epoch": 0.37978330036976526, + "grad_norm": 0.95703125, + "learning_rate": 6.66959927358226e-05, + "loss": 1.1082, + "step": 2409 + }, + { + "epoch": 0.37994095221715823, + "grad_norm": 0.90234375, + "learning_rate": 6.669165184566528e-05, + "loss": 1.0961, + "step": 2410 + }, + { + "epoch": 0.38009860406455126, + "grad_norm": 0.99609375, + "learning_rate": 6.66873110261031e-05, + "loss": 1.0553, + "step": 2411 + }, + { + "epoch": 0.3802562559119443, + "grad_norm": 0.953125, + "learning_rate": 6.668297027714527e-05, + "loss": 0.8685, + "step": 2412 + }, + { + "epoch": 0.3804139077593373, + "grad_norm": 0.984375, + "learning_rate": 6.667862959880098e-05, + "loss": 0.9549, + "step": 2413 + }, + { + "epoch": 0.3805715596067303, + "grad_norm": 1.0078125, + "learning_rate": 6.667428899107942e-05, + "loss": 1.2299, + "step": 2414 + }, + { + "epoch": 0.3807292114541233, + "grad_norm": 0.91015625, + "learning_rate": 6.666994845398978e-05, + "loss": 0.9016, + "step": 2415 + }, + { + "epoch": 0.38088686330151633, + "grad_norm": 0.9453125, + "learning_rate": 6.666560798754131e-05, + "loss": 1.2052, + "step": 2416 + }, + { + "epoch": 0.38104451514890936, + "grad_norm": 0.96484375, + "learning_rate": 6.666126759174319e-05, + "loss": 1.1808, + "step": 2417 + }, + { + "epoch": 0.3812021669963023, + "grad_norm": 0.96875, + "learning_rate": 6.665692726660457e-05, + "loss": 1.1925, + "step": 2418 + }, + { + "epoch": 0.38135981884369535, + "grad_norm": 0.94921875, + "learning_rate": 6.66525870121347e-05, + "loss": 1.05, + "step": 2419 + }, + { + "epoch": 0.3815174706910884, + "grad_norm": 0.96484375, + "learning_rate": 6.664824682834277e-05, + "loss": 1.0083, + "step": 2420 + }, + { + "epoch": 0.3816751225384814, + "grad_norm": 0.9296875, + "learning_rate": 6.664390671523799e-05, + "loss": 1.1218, + "step": 2421 + }, + { + "epoch": 0.3818327743858744, + "grad_norm": 0.89453125, + "learning_rate": 6.663956667282953e-05, + "loss": 1.1959, + "step": 2422 + }, + { + "epoch": 0.3819904262332674, + "grad_norm": 1.234375, + "learning_rate": 6.663522670112661e-05, + "loss": 1.0978, + "step": 2423 + }, + { + "epoch": 0.3821480780806604, + "grad_norm": 0.859375, + "learning_rate": 6.663088680013837e-05, + "loss": 1.0639, + "step": 2424 + }, + { + "epoch": 0.38230572992805345, + "grad_norm": 0.921875, + "learning_rate": 6.662654696987409e-05, + "loss": 0.8288, + "step": 2425 + }, + { + "epoch": 0.3824633817754464, + "grad_norm": 0.9296875, + "learning_rate": 6.662220721034296e-05, + "loss": 0.864, + "step": 2426 + }, + { + "epoch": 0.38262103362283945, + "grad_norm": 0.86328125, + "learning_rate": 6.661786752155415e-05, + "loss": 0.9253, + "step": 2427 + }, + { + "epoch": 0.38277868547023247, + "grad_norm": 1.0078125, + "learning_rate": 6.661352790351685e-05, + "loss": 1.0914, + "step": 2428 + }, + { + "epoch": 0.3829363373176255, + "grad_norm": 1.03125, + "learning_rate": 6.660918835624023e-05, + "loss": 1.1623, + "step": 2429 + }, + { + "epoch": 0.38309398916501847, + "grad_norm": 0.953125, + "learning_rate": 6.660484887973357e-05, + "loss": 1.0313, + "step": 2430 + }, + { + "epoch": 0.3832516410124115, + "grad_norm": 0.9296875, + "learning_rate": 6.660050947400603e-05, + "loss": 1.1663, + "step": 2431 + }, + { + "epoch": 0.3834092928598045, + "grad_norm": 0.9140625, + "learning_rate": 6.65961701390668e-05, + "loss": 0.9767, + "step": 2432 + }, + { + "epoch": 0.38356694470719754, + "grad_norm": 0.89453125, + "learning_rate": 6.659183087492507e-05, + "loss": 1.1642, + "step": 2433 + }, + { + "epoch": 0.3837245965545905, + "grad_norm": 0.94140625, + "learning_rate": 6.658749168159e-05, + "loss": 1.0965, + "step": 2434 + }, + { + "epoch": 0.38388224840198354, + "grad_norm": 0.90625, + "learning_rate": 6.65831525590709e-05, + "loss": 1.0116, + "step": 2435 + }, + { + "epoch": 0.38403990024937656, + "grad_norm": 0.91796875, + "learning_rate": 6.657881350737687e-05, + "loss": 1.2105, + "step": 2436 + }, + { + "epoch": 0.3841975520967696, + "grad_norm": 0.90625, + "learning_rate": 6.657447452651715e-05, + "loss": 1.0967, + "step": 2437 + }, + { + "epoch": 0.38435520394416256, + "grad_norm": 0.9375, + "learning_rate": 6.657013561650092e-05, + "loss": 1.1027, + "step": 2438 + }, + { + "epoch": 0.3845128557915556, + "grad_norm": 0.94140625, + "learning_rate": 6.656579677733734e-05, + "loss": 1.0223, + "step": 2439 + }, + { + "epoch": 0.3846705076389486, + "grad_norm": 0.98828125, + "learning_rate": 6.656145800903569e-05, + "loss": 1.0253, + "step": 2440 + }, + { + "epoch": 0.38482815948634164, + "grad_norm": 0.99609375, + "learning_rate": 6.655711931160509e-05, + "loss": 1.1558, + "step": 2441 + }, + { + "epoch": 0.3849858113337346, + "grad_norm": 0.99609375, + "learning_rate": 6.655278068505478e-05, + "loss": 1.2671, + "step": 2442 + }, + { + "epoch": 0.38514346318112763, + "grad_norm": 0.875, + "learning_rate": 6.654844212939393e-05, + "loss": 1.1614, + "step": 2443 + }, + { + "epoch": 0.38530111502852066, + "grad_norm": 0.98828125, + "learning_rate": 6.65441036446317e-05, + "loss": 1.0869, + "step": 2444 + }, + { + "epoch": 0.3854587668759137, + "grad_norm": 0.859375, + "learning_rate": 6.653976523077739e-05, + "loss": 0.9522, + "step": 2445 + }, + { + "epoch": 0.38561641872330665, + "grad_norm": 0.953125, + "learning_rate": 6.653542688784014e-05, + "loss": 1.0814, + "step": 2446 + }, + { + "epoch": 0.3857740705706997, + "grad_norm": 0.91796875, + "learning_rate": 6.653108861582912e-05, + "loss": 1.165, + "step": 2447 + }, + { + "epoch": 0.3859317224180927, + "grad_norm": 0.97265625, + "learning_rate": 6.652675041475355e-05, + "loss": 1.0485, + "step": 2448 + }, + { + "epoch": 0.38608937426548573, + "grad_norm": 0.88671875, + "learning_rate": 6.652241228462258e-05, + "loss": 1.0481, + "step": 2449 + }, + { + "epoch": 0.3862470261128787, + "grad_norm": 0.89453125, + "learning_rate": 6.651807422544548e-05, + "loss": 0.841, + "step": 2450 + }, + { + "epoch": 0.3864046779602717, + "grad_norm": 0.8203125, + "learning_rate": 6.651373623723142e-05, + "loss": 0.7862, + "step": 2451 + }, + { + "epoch": 0.38656232980766475, + "grad_norm": 0.99609375, + "learning_rate": 6.650939831998956e-05, + "loss": 1.0859, + "step": 2452 + }, + { + "epoch": 0.3867199816550578, + "grad_norm": 0.87890625, + "learning_rate": 6.650506047372915e-05, + "loss": 0.8709, + "step": 2453 + }, + { + "epoch": 0.38687763350245075, + "grad_norm": 0.91015625, + "learning_rate": 6.650072269845928e-05, + "loss": 0.8493, + "step": 2454 + }, + { + "epoch": 0.38703528534984377, + "grad_norm": 0.9140625, + "learning_rate": 6.649638499418926e-05, + "loss": 0.9746, + "step": 2455 + }, + { + "epoch": 0.3871929371972368, + "grad_norm": 0.875, + "learning_rate": 6.649204736092825e-05, + "loss": 0.9267, + "step": 2456 + }, + { + "epoch": 0.3873505890446298, + "grad_norm": 0.890625, + "learning_rate": 6.648770979868542e-05, + "loss": 0.9865, + "step": 2457 + }, + { + "epoch": 0.3875082408920228, + "grad_norm": 0.921875, + "learning_rate": 6.648337230746998e-05, + "loss": 0.8834, + "step": 2458 + }, + { + "epoch": 0.3876658927394158, + "grad_norm": 0.8515625, + "learning_rate": 6.647903488729108e-05, + "loss": 1.0119, + "step": 2459 + }, + { + "epoch": 0.38782354458680884, + "grad_norm": 0.94140625, + "learning_rate": 6.647469753815799e-05, + "loss": 0.8676, + "step": 2460 + }, + { + "epoch": 0.38798119643420187, + "grad_norm": 0.8828125, + "learning_rate": 6.647036026007984e-05, + "loss": 1.0225, + "step": 2461 + }, + { + "epoch": 0.38813884828159484, + "grad_norm": 0.90625, + "learning_rate": 6.646602305306587e-05, + "loss": 0.9215, + "step": 2462 + }, + { + "epoch": 0.38829650012898786, + "grad_norm": 0.83984375, + "learning_rate": 6.646168591712521e-05, + "loss": 0.8573, + "step": 2463 + }, + { + "epoch": 0.3884541519763809, + "grad_norm": 0.9609375, + "learning_rate": 6.645734885226709e-05, + "loss": 1.016, + "step": 2464 + }, + { + "epoch": 0.3886118038237739, + "grad_norm": 0.96875, + "learning_rate": 6.645301185850072e-05, + "loss": 1.0788, + "step": 2465 + }, + { + "epoch": 0.3887694556711669, + "grad_norm": 0.875, + "learning_rate": 6.644867493583529e-05, + "loss": 1.1469, + "step": 2466 + }, + { + "epoch": 0.3889271075185599, + "grad_norm": 0.92578125, + "learning_rate": 6.644433808427996e-05, + "loss": 0.9306, + "step": 2467 + }, + { + "epoch": 0.38908475936595294, + "grad_norm": 0.859375, + "learning_rate": 6.644000130384396e-05, + "loss": 0.8761, + "step": 2468 + }, + { + "epoch": 0.38924241121334596, + "grad_norm": 0.875, + "learning_rate": 6.64356645945364e-05, + "loss": 1.0406, + "step": 2469 + }, + { + "epoch": 0.38940006306073893, + "grad_norm": 0.921875, + "learning_rate": 6.643132795636657e-05, + "loss": 1.0771, + "step": 2470 + }, + { + "epoch": 0.38955771490813196, + "grad_norm": 0.9296875, + "learning_rate": 6.642699138934363e-05, + "loss": 1.0681, + "step": 2471 + }, + { + "epoch": 0.389715366755525, + "grad_norm": 0.98046875, + "learning_rate": 6.642265489347675e-05, + "loss": 1.099, + "step": 2472 + }, + { + "epoch": 0.389873018602918, + "grad_norm": 0.9921875, + "learning_rate": 6.641831846877514e-05, + "loss": 0.9692, + "step": 2473 + }, + { + "epoch": 0.390030670450311, + "grad_norm": 0.9765625, + "learning_rate": 6.641398211524795e-05, + "loss": 1.0655, + "step": 2474 + }, + { + "epoch": 0.390188322297704, + "grad_norm": 0.96875, + "learning_rate": 6.640964583290444e-05, + "loss": 1.0029, + "step": 2475 + }, + { + "epoch": 0.39034597414509703, + "grad_norm": 1.0, + "learning_rate": 6.640530962175377e-05, + "loss": 0.9785, + "step": 2476 + }, + { + "epoch": 0.39050362599249006, + "grad_norm": 0.89453125, + "learning_rate": 6.640097348180512e-05, + "loss": 1.0944, + "step": 2477 + }, + { + "epoch": 0.390661277839883, + "grad_norm": 0.9453125, + "learning_rate": 6.63966374130677e-05, + "loss": 1.0633, + "step": 2478 + }, + { + "epoch": 0.39081892968727605, + "grad_norm": 1.0078125, + "learning_rate": 6.639230141555065e-05, + "loss": 1.241, + "step": 2479 + }, + { + "epoch": 0.3909765815346691, + "grad_norm": 0.9296875, + "learning_rate": 6.638796548926323e-05, + "loss": 0.8372, + "step": 2480 + }, + { + "epoch": 0.3911342333820621, + "grad_norm": 0.99609375, + "learning_rate": 6.638362963421459e-05, + "loss": 1.2442, + "step": 2481 + }, + { + "epoch": 0.39129188522945507, + "grad_norm": 0.94140625, + "learning_rate": 6.637929385041389e-05, + "loss": 0.8778, + "step": 2482 + }, + { + "epoch": 0.3914495370768481, + "grad_norm": 0.9140625, + "learning_rate": 6.63749581378704e-05, + "loss": 1.0154, + "step": 2483 + }, + { + "epoch": 0.3916071889242411, + "grad_norm": 0.97265625, + "learning_rate": 6.637062249659323e-05, + "loss": 1.1208, + "step": 2484 + }, + { + "epoch": 0.39176484077163415, + "grad_norm": 0.9453125, + "learning_rate": 6.636628692659162e-05, + "loss": 1.0515, + "step": 2485 + }, + { + "epoch": 0.3919224926190271, + "grad_norm": 0.83984375, + "learning_rate": 6.636195142787475e-05, + "loss": 0.8469, + "step": 2486 + }, + { + "epoch": 0.39208014446642014, + "grad_norm": 0.984375, + "learning_rate": 6.635761600045181e-05, + "loss": 1.192, + "step": 2487 + }, + { + "epoch": 0.39223779631381317, + "grad_norm": 0.93359375, + "learning_rate": 6.635328064433197e-05, + "loss": 0.9389, + "step": 2488 + }, + { + "epoch": 0.3923954481612062, + "grad_norm": 0.94140625, + "learning_rate": 6.634894535952438e-05, + "loss": 1.1853, + "step": 2489 + }, + { + "epoch": 0.39255310000859917, + "grad_norm": 0.921875, + "learning_rate": 6.634461014603832e-05, + "loss": 0.9414, + "step": 2490 + }, + { + "epoch": 0.3927107518559922, + "grad_norm": 0.93359375, + "learning_rate": 6.634027500388295e-05, + "loss": 1.0023, + "step": 2491 + }, + { + "epoch": 0.3928684037033852, + "grad_norm": 1.0234375, + "learning_rate": 6.633593993306744e-05, + "loss": 1.1227, + "step": 2492 + }, + { + "epoch": 0.39302605555077824, + "grad_norm": 1.0, + "learning_rate": 6.633160493360097e-05, + "loss": 1.0954, + "step": 2493 + }, + { + "epoch": 0.3931837073981712, + "grad_norm": 0.89453125, + "learning_rate": 6.63272700054927e-05, + "loss": 1.0853, + "step": 2494 + }, + { + "epoch": 0.39334135924556424, + "grad_norm": 0.97265625, + "learning_rate": 6.632293514875191e-05, + "loss": 1.0914, + "step": 2495 + }, + { + "epoch": 0.39349901109295726, + "grad_norm": 0.87109375, + "learning_rate": 6.631860036338773e-05, + "loss": 1.01, + "step": 2496 + }, + { + "epoch": 0.3936566629403503, + "grad_norm": 0.86328125, + "learning_rate": 6.631426564940935e-05, + "loss": 1.0759, + "step": 2497 + }, + { + "epoch": 0.39381431478774326, + "grad_norm": 0.89453125, + "learning_rate": 6.630993100682595e-05, + "loss": 1.0284, + "step": 2498 + }, + { + "epoch": 0.3939719666351363, + "grad_norm": 0.98046875, + "learning_rate": 6.630559643564673e-05, + "loss": 1.1537, + "step": 2499 + }, + { + "epoch": 0.3941296184825293, + "grad_norm": 0.78515625, + "learning_rate": 6.630126193588082e-05, + "loss": 0.9391, + "step": 2500 + }, + { + "epoch": 0.39428727032992233, + "grad_norm": 0.8203125, + "learning_rate": 6.629692750753752e-05, + "loss": 0.9273, + "step": 2501 + }, + { + "epoch": 0.3944449221773153, + "grad_norm": 0.95703125, + "learning_rate": 6.629259315062594e-05, + "loss": 1.0176, + "step": 2502 + }, + { + "epoch": 0.39460257402470833, + "grad_norm": 0.8984375, + "learning_rate": 6.62882588651553e-05, + "loss": 1.0719, + "step": 2503 + }, + { + "epoch": 0.39476022587210136, + "grad_norm": 0.8671875, + "learning_rate": 6.628392465113475e-05, + "loss": 1.1594, + "step": 2504 + }, + { + "epoch": 0.3949178777194944, + "grad_norm": 0.96875, + "learning_rate": 6.627959050857348e-05, + "loss": 1.1225, + "step": 2505 + }, + { + "epoch": 0.3950755295668874, + "grad_norm": 0.98046875, + "learning_rate": 6.627525643748073e-05, + "loss": 1.0193, + "step": 2506 + }, + { + "epoch": 0.3952331814142804, + "grad_norm": 1.0390625, + "learning_rate": 6.627092243786562e-05, + "loss": 1.2507, + "step": 2507 + }, + { + "epoch": 0.3953908332616734, + "grad_norm": 0.94921875, + "learning_rate": 6.626658850973737e-05, + "loss": 0.9266, + "step": 2508 + }, + { + "epoch": 0.39554848510906643, + "grad_norm": 1.0078125, + "learning_rate": 6.626225465310517e-05, + "loss": 1.252, + "step": 2509 + }, + { + "epoch": 0.39570613695645945, + "grad_norm": 0.9296875, + "learning_rate": 6.625792086797814e-05, + "loss": 1.0576, + "step": 2510 + }, + { + "epoch": 0.3958637888038524, + "grad_norm": 0.953125, + "learning_rate": 6.625358715436556e-05, + "loss": 1.1978, + "step": 2511 + }, + { + "epoch": 0.39602144065124545, + "grad_norm": 1.0390625, + "learning_rate": 6.624925351227658e-05, + "loss": 1.1091, + "step": 2512 + }, + { + "epoch": 0.3961790924986385, + "grad_norm": 0.84375, + "learning_rate": 6.624491994172037e-05, + "loss": 0.9117, + "step": 2513 + }, + { + "epoch": 0.3963367443460315, + "grad_norm": 1.03125, + "learning_rate": 6.624058644270613e-05, + "loss": 1.0061, + "step": 2514 + }, + { + "epoch": 0.39649439619342447, + "grad_norm": 0.98828125, + "learning_rate": 6.623625301524299e-05, + "loss": 1.0533, + "step": 2515 + }, + { + "epoch": 0.3966520480408175, + "grad_norm": 0.93359375, + "learning_rate": 6.623191965934021e-05, + "loss": 1.208, + "step": 2516 + }, + { + "epoch": 0.3968096998882105, + "grad_norm": 1.0078125, + "learning_rate": 6.622758637500697e-05, + "loss": 1.1789, + "step": 2517 + }, + { + "epoch": 0.39696735173560355, + "grad_norm": 0.953125, + "learning_rate": 6.622325316225243e-05, + "loss": 1.2051, + "step": 2518 + }, + { + "epoch": 0.3971250035829965, + "grad_norm": 0.8359375, + "learning_rate": 6.621892002108576e-05, + "loss": 0.9699, + "step": 2519 + }, + { + "epoch": 0.39728265543038954, + "grad_norm": 0.8828125, + "learning_rate": 6.621458695151612e-05, + "loss": 0.8761, + "step": 2520 + }, + { + "epoch": 0.39744030727778257, + "grad_norm": 0.9609375, + "learning_rate": 6.621025395355279e-05, + "loss": 0.9464, + "step": 2521 + }, + { + "epoch": 0.3975979591251756, + "grad_norm": 0.8671875, + "learning_rate": 6.620592102720487e-05, + "loss": 0.9853, + "step": 2522 + }, + { + "epoch": 0.39775561097256856, + "grad_norm": 0.90625, + "learning_rate": 6.620158817248157e-05, + "loss": 0.9904, + "step": 2523 + }, + { + "epoch": 0.3979132628199616, + "grad_norm": 1.0, + "learning_rate": 6.619725538939209e-05, + "loss": 1.1307, + "step": 2524 + }, + { + "epoch": 0.3980709146673546, + "grad_norm": 0.99609375, + "learning_rate": 6.619292267794557e-05, + "loss": 1.3308, + "step": 2525 + }, + { + "epoch": 0.39822856651474764, + "grad_norm": 1.0703125, + "learning_rate": 6.618859003815123e-05, + "loss": 1.0709, + "step": 2526 + }, + { + "epoch": 0.3983862183621406, + "grad_norm": 0.953125, + "learning_rate": 6.618425747001824e-05, + "loss": 1.3499, + "step": 2527 + }, + { + "epoch": 0.39854387020953363, + "grad_norm": 0.94140625, + "learning_rate": 6.61799249735558e-05, + "loss": 1.0349, + "step": 2528 + }, + { + "epoch": 0.39870152205692666, + "grad_norm": 0.9140625, + "learning_rate": 6.617559254877305e-05, + "loss": 0.984, + "step": 2529 + }, + { + "epoch": 0.3988591739043197, + "grad_norm": 1.03125, + "learning_rate": 6.617126019567918e-05, + "loss": 1.1902, + "step": 2530 + }, + { + "epoch": 0.39901682575171266, + "grad_norm": 0.84765625, + "learning_rate": 6.616692791428343e-05, + "loss": 0.8985, + "step": 2531 + }, + { + "epoch": 0.3991744775991057, + "grad_norm": 0.98828125, + "learning_rate": 6.616259570459493e-05, + "loss": 1.1675, + "step": 2532 + }, + { + "epoch": 0.3993321294464987, + "grad_norm": 1.078125, + "learning_rate": 6.615826356662289e-05, + "loss": 1.184, + "step": 2533 + }, + { + "epoch": 0.39948978129389173, + "grad_norm": 0.890625, + "learning_rate": 6.615393150037646e-05, + "loss": 0.9945, + "step": 2534 + }, + { + "epoch": 0.3996474331412847, + "grad_norm": 0.9296875, + "learning_rate": 6.61495995058648e-05, + "loss": 1.0541, + "step": 2535 + }, + { + "epoch": 0.39980508498867773, + "grad_norm": 0.9609375, + "learning_rate": 6.614526758309718e-05, + "loss": 1.035, + "step": 2536 + }, + { + "epoch": 0.39996273683607075, + "grad_norm": 0.94921875, + "learning_rate": 6.614093573208272e-05, + "loss": 1.2051, + "step": 2537 + }, + { + "epoch": 0.4001203886834638, + "grad_norm": 0.96875, + "learning_rate": 6.613660395283061e-05, + "loss": 1.2529, + "step": 2538 + }, + { + "epoch": 0.40027804053085675, + "grad_norm": 0.83203125, + "learning_rate": 6.613227224535004e-05, + "loss": 1.0717, + "step": 2539 + }, + { + "epoch": 0.4004356923782498, + "grad_norm": 0.97265625, + "learning_rate": 6.612794060965014e-05, + "loss": 0.9188, + "step": 2540 + }, + { + "epoch": 0.4005933442256428, + "grad_norm": 0.8515625, + "learning_rate": 6.612360904574018e-05, + "loss": 1.0133, + "step": 2541 + }, + { + "epoch": 0.4007509960730358, + "grad_norm": 0.98046875, + "learning_rate": 6.611927755362929e-05, + "loss": 1.0587, + "step": 2542 + }, + { + "epoch": 0.4009086479204288, + "grad_norm": 1.015625, + "learning_rate": 6.611494613332664e-05, + "loss": 1.2033, + "step": 2543 + }, + { + "epoch": 0.4010662997678218, + "grad_norm": 0.921875, + "learning_rate": 6.611061478484145e-05, + "loss": 0.9407, + "step": 2544 + }, + { + "epoch": 0.40122395161521485, + "grad_norm": 0.921875, + "learning_rate": 6.610628350818284e-05, + "loss": 0.9071, + "step": 2545 + }, + { + "epoch": 0.40138160346260787, + "grad_norm": 1.015625, + "learning_rate": 6.610195230336005e-05, + "loss": 1.115, + "step": 2546 + }, + { + "epoch": 0.40153925531000084, + "grad_norm": 0.96875, + "learning_rate": 6.609762117038224e-05, + "loss": 1.4135, + "step": 2547 + }, + { + "epoch": 0.40169690715739387, + "grad_norm": 0.98828125, + "learning_rate": 6.609329010925855e-05, + "loss": 0.9778, + "step": 2548 + }, + { + "epoch": 0.4018545590047869, + "grad_norm": 0.828125, + "learning_rate": 6.608895911999822e-05, + "loss": 0.8862, + "step": 2549 + }, + { + "epoch": 0.4020122108521799, + "grad_norm": 1.0390625, + "learning_rate": 6.608462820261039e-05, + "loss": 1.1875, + "step": 2550 + }, + { + "epoch": 0.4021698626995729, + "grad_norm": 0.96875, + "learning_rate": 6.608029735710426e-05, + "loss": 1.059, + "step": 2551 + }, + { + "epoch": 0.4023275145469659, + "grad_norm": 0.95703125, + "learning_rate": 6.6075966583489e-05, + "loss": 1.2092, + "step": 2552 + }, + { + "epoch": 0.40248516639435894, + "grad_norm": 0.9453125, + "learning_rate": 6.60716358817738e-05, + "loss": 1.1373, + "step": 2553 + }, + { + "epoch": 0.40264281824175197, + "grad_norm": 0.9609375, + "learning_rate": 6.606730525196783e-05, + "loss": 1.0164, + "step": 2554 + }, + { + "epoch": 0.40280047008914494, + "grad_norm": 0.953125, + "learning_rate": 6.606297469408023e-05, + "loss": 0.9494, + "step": 2555 + }, + { + "epoch": 0.40295812193653796, + "grad_norm": 0.89453125, + "learning_rate": 6.605864420812024e-05, + "loss": 1.0553, + "step": 2556 + }, + { + "epoch": 0.403115773783931, + "grad_norm": 1.015625, + "learning_rate": 6.605431379409702e-05, + "loss": 1.0976, + "step": 2557 + }, + { + "epoch": 0.403273425631324, + "grad_norm": 0.9765625, + "learning_rate": 6.604998345201974e-05, + "loss": 1.2923, + "step": 2558 + }, + { + "epoch": 0.403431077478717, + "grad_norm": 0.8671875, + "learning_rate": 6.604565318189758e-05, + "loss": 0.9522, + "step": 2559 + }, + { + "epoch": 0.40358872932611, + "grad_norm": 0.90234375, + "learning_rate": 6.604132298373969e-05, + "loss": 0.9164, + "step": 2560 + }, + { + "epoch": 0.40374638117350303, + "grad_norm": 0.890625, + "learning_rate": 6.603699285755529e-05, + "loss": 0.8581, + "step": 2561 + }, + { + "epoch": 0.40390403302089606, + "grad_norm": 0.9921875, + "learning_rate": 6.603266280335356e-05, + "loss": 1.3284, + "step": 2562 + }, + { + "epoch": 0.40406168486828903, + "grad_norm": 0.94921875, + "learning_rate": 6.602833282114366e-05, + "loss": 1.0006, + "step": 2563 + }, + { + "epoch": 0.40421933671568205, + "grad_norm": 0.84375, + "learning_rate": 6.602400291093476e-05, + "loss": 1.0529, + "step": 2564 + }, + { + "epoch": 0.4043769885630751, + "grad_norm": 0.90234375, + "learning_rate": 6.601967307273603e-05, + "loss": 1.1403, + "step": 2565 + }, + { + "epoch": 0.4045346404104681, + "grad_norm": 1.046875, + "learning_rate": 6.601534330655668e-05, + "loss": 1.3225, + "step": 2566 + }, + { + "epoch": 0.4046922922578611, + "grad_norm": 0.90625, + "learning_rate": 6.601101361240584e-05, + "loss": 0.8643, + "step": 2567 + }, + { + "epoch": 0.4048499441052541, + "grad_norm": 0.95703125, + "learning_rate": 6.600668399029275e-05, + "loss": 0.9923, + "step": 2568 + }, + { + "epoch": 0.4050075959526471, + "grad_norm": 0.9453125, + "learning_rate": 6.600235444022653e-05, + "loss": 0.9787, + "step": 2569 + }, + { + "epoch": 0.40516524780004015, + "grad_norm": 0.875, + "learning_rate": 6.599802496221636e-05, + "loss": 1.1355, + "step": 2570 + }, + { + "epoch": 0.4053228996474331, + "grad_norm": 0.921875, + "learning_rate": 6.599369555627146e-05, + "loss": 1.0048, + "step": 2571 + }, + { + "epoch": 0.40548055149482615, + "grad_norm": 0.95703125, + "learning_rate": 6.598936622240098e-05, + "loss": 1.144, + "step": 2572 + }, + { + "epoch": 0.4056382033422192, + "grad_norm": 0.94140625, + "learning_rate": 6.59850369606141e-05, + "loss": 0.9273, + "step": 2573 + }, + { + "epoch": 0.4057958551896122, + "grad_norm": 1.15625, + "learning_rate": 6.598070777091999e-05, + "loss": 0.8808, + "step": 2574 + }, + { + "epoch": 0.40595350703700517, + "grad_norm": 0.9296875, + "learning_rate": 6.597637865332776e-05, + "loss": 1.0257, + "step": 2575 + }, + { + "epoch": 0.4061111588843982, + "grad_norm": 0.921875, + "learning_rate": 6.59720496078467e-05, + "loss": 1.0233, + "step": 2576 + }, + { + "epoch": 0.4062688107317912, + "grad_norm": 0.81640625, + "learning_rate": 6.596772063448595e-05, + "loss": 0.9426, + "step": 2577 + }, + { + "epoch": 0.40642646257918424, + "grad_norm": 0.94921875, + "learning_rate": 6.596339173325467e-05, + "loss": 0.958, + "step": 2578 + }, + { + "epoch": 0.4065841144265772, + "grad_norm": 0.9609375, + "learning_rate": 6.595906290416203e-05, + "loss": 1.1968, + "step": 2579 + }, + { + "epoch": 0.40674176627397024, + "grad_norm": 1.03125, + "learning_rate": 6.595473414721718e-05, + "loss": 1.1921, + "step": 2580 + }, + { + "epoch": 0.40689941812136327, + "grad_norm": 0.984375, + "learning_rate": 6.595040546242934e-05, + "loss": 1.1397, + "step": 2581 + }, + { + "epoch": 0.4070570699687563, + "grad_norm": 0.90234375, + "learning_rate": 6.59460768498077e-05, + "loss": 1.1991, + "step": 2582 + }, + { + "epoch": 0.40721472181614926, + "grad_norm": 0.9609375, + "learning_rate": 6.594174830936139e-05, + "loss": 0.9516, + "step": 2583 + }, + { + "epoch": 0.4073723736635423, + "grad_norm": 0.9609375, + "learning_rate": 6.593741984109959e-05, + "loss": 1.1191, + "step": 2584 + }, + { + "epoch": 0.4075300255109353, + "grad_norm": 1.0234375, + "learning_rate": 6.593309144503147e-05, + "loss": 1.1492, + "step": 2585 + }, + { + "epoch": 0.40768767735832834, + "grad_norm": 0.90234375, + "learning_rate": 6.59287631211662e-05, + "loss": 1.0137, + "step": 2586 + }, + { + "epoch": 0.4078453292057213, + "grad_norm": 0.88671875, + "learning_rate": 6.592443486951301e-05, + "loss": 0.9528, + "step": 2587 + }, + { + "epoch": 0.40800298105311433, + "grad_norm": 0.91015625, + "learning_rate": 6.592010669008102e-05, + "loss": 0.975, + "step": 2588 + }, + { + "epoch": 0.40816063290050736, + "grad_norm": 0.94921875, + "learning_rate": 6.591577858287942e-05, + "loss": 1.0767, + "step": 2589 + }, + { + "epoch": 0.4083182847479004, + "grad_norm": 0.87890625, + "learning_rate": 6.591145054791737e-05, + "loss": 0.9555, + "step": 2590 + }, + { + "epoch": 0.40847593659529335, + "grad_norm": 0.91796875, + "learning_rate": 6.590712258520406e-05, + "loss": 0.9409, + "step": 2591 + }, + { + "epoch": 0.4086335884426864, + "grad_norm": 0.82421875, + "learning_rate": 6.590279469474867e-05, + "loss": 0.8437, + "step": 2592 + }, + { + "epoch": 0.4087912402900794, + "grad_norm": 1.03125, + "learning_rate": 6.589846687656033e-05, + "loss": 1.2578, + "step": 2593 + }, + { + "epoch": 0.40894889213747243, + "grad_norm": 1.0546875, + "learning_rate": 6.589413913064826e-05, + "loss": 1.1839, + "step": 2594 + }, + { + "epoch": 0.4091065439848654, + "grad_norm": 0.90234375, + "learning_rate": 6.588981145702158e-05, + "loss": 1.0763, + "step": 2595 + }, + { + "epoch": 0.4092641958322584, + "grad_norm": 0.91015625, + "learning_rate": 6.588548385568952e-05, + "loss": 1.0296, + "step": 2596 + }, + { + "epoch": 0.40942184767965145, + "grad_norm": 1.0390625, + "learning_rate": 6.588115632666123e-05, + "loss": 1.2016, + "step": 2597 + }, + { + "epoch": 0.4095794995270445, + "grad_norm": 0.91015625, + "learning_rate": 6.587682886994588e-05, + "loss": 1.0526, + "step": 2598 + }, + { + "epoch": 0.40973715137443745, + "grad_norm": 1.0546875, + "learning_rate": 6.587250148555264e-05, + "loss": 1.1871, + "step": 2599 + }, + { + "epoch": 0.4098948032218305, + "grad_norm": 0.94921875, + "learning_rate": 6.586817417349065e-05, + "loss": 1.1411, + "step": 2600 + }, + { + "epoch": 0.4100524550692235, + "grad_norm": 1.0, + "learning_rate": 6.586384693376915e-05, + "loss": 1.0411, + "step": 2601 + }, + { + "epoch": 0.4102101069166165, + "grad_norm": 0.8828125, + "learning_rate": 6.585951976639726e-05, + "loss": 0.9494, + "step": 2602 + }, + { + "epoch": 0.4103677587640095, + "grad_norm": 0.98046875, + "learning_rate": 6.585519267138418e-05, + "loss": 1.0176, + "step": 2603 + }, + { + "epoch": 0.4105254106114025, + "grad_norm": 1.0, + "learning_rate": 6.585086564873908e-05, + "loss": 1.0628, + "step": 2604 + }, + { + "epoch": 0.41068306245879554, + "grad_norm": 0.90234375, + "learning_rate": 6.584653869847106e-05, + "loss": 1.2245, + "step": 2605 + }, + { + "epoch": 0.41084071430618857, + "grad_norm": 0.8203125, + "learning_rate": 6.584221182058939e-05, + "loss": 0.9844, + "step": 2606 + }, + { + "epoch": 0.41099836615358154, + "grad_norm": 0.8984375, + "learning_rate": 6.58378850151032e-05, + "loss": 0.9701, + "step": 2607 + }, + { + "epoch": 0.41115601800097457, + "grad_norm": 0.91796875, + "learning_rate": 6.583355828202166e-05, + "loss": 1.0095, + "step": 2608 + }, + { + "epoch": 0.4113136698483676, + "grad_norm": 0.8515625, + "learning_rate": 6.582923162135394e-05, + "loss": 0.985, + "step": 2609 + }, + { + "epoch": 0.4114713216957606, + "grad_norm": 0.91796875, + "learning_rate": 6.58249050331092e-05, + "loss": 0.9441, + "step": 2610 + }, + { + "epoch": 0.4116289735431536, + "grad_norm": 0.921875, + "learning_rate": 6.582057851729664e-05, + "loss": 1.13, + "step": 2611 + }, + { + "epoch": 0.4117866253905466, + "grad_norm": 0.98828125, + "learning_rate": 6.581625207392541e-05, + "loss": 1.015, + "step": 2612 + }, + { + "epoch": 0.41194427723793964, + "grad_norm": 0.9609375, + "learning_rate": 6.581192570300467e-05, + "loss": 1.143, + "step": 2613 + }, + { + "epoch": 0.41210192908533266, + "grad_norm": 0.91796875, + "learning_rate": 6.580759940454361e-05, + "loss": 0.981, + "step": 2614 + }, + { + "epoch": 0.41225958093272563, + "grad_norm": 0.96484375, + "learning_rate": 6.580327317855133e-05, + "loss": 1.0447, + "step": 2615 + }, + { + "epoch": 0.41241723278011866, + "grad_norm": 0.93359375, + "learning_rate": 6.579894702503712e-05, + "loss": 0.9664, + "step": 2616 + }, + { + "epoch": 0.4125748846275117, + "grad_norm": 0.88671875, + "learning_rate": 6.579462094401008e-05, + "loss": 1.0837, + "step": 2617 + }, + { + "epoch": 0.4127325364749047, + "grad_norm": 0.9609375, + "learning_rate": 6.579029493547939e-05, + "loss": 1.3176, + "step": 2618 + }, + { + "epoch": 0.4128901883222977, + "grad_norm": 0.890625, + "learning_rate": 6.57859689994542e-05, + "loss": 0.9244, + "step": 2619 + }, + { + "epoch": 0.4130478401696907, + "grad_norm": 0.9609375, + "learning_rate": 6.578164313594366e-05, + "loss": 1.1278, + "step": 2620 + }, + { + "epoch": 0.41320549201708373, + "grad_norm": 1.0, + "learning_rate": 6.577731734495703e-05, + "loss": 1.2114, + "step": 2621 + }, + { + "epoch": 0.41336314386447676, + "grad_norm": 1.0234375, + "learning_rate": 6.577299162650338e-05, + "loss": 1.0157, + "step": 2622 + }, + { + "epoch": 0.4135207957118697, + "grad_norm": 0.92578125, + "learning_rate": 6.576866598059195e-05, + "loss": 0.9643, + "step": 2623 + }, + { + "epoch": 0.41367844755926275, + "grad_norm": 1.03125, + "learning_rate": 6.576434040723188e-05, + "loss": 1.1188, + "step": 2624 + }, + { + "epoch": 0.4138360994066558, + "grad_norm": 0.921875, + "learning_rate": 6.576001490643227e-05, + "loss": 0.9879, + "step": 2625 + }, + { + "epoch": 0.4139937512540488, + "grad_norm": 0.8984375, + "learning_rate": 6.575568947820239e-05, + "loss": 0.8778, + "step": 2626 + }, + { + "epoch": 0.4141514031014418, + "grad_norm": 0.9140625, + "learning_rate": 6.575136412255138e-05, + "loss": 0.9882, + "step": 2627 + }, + { + "epoch": 0.4143090549488348, + "grad_norm": 0.91015625, + "learning_rate": 6.574703883948841e-05, + "loss": 1.0088, + "step": 2628 + }, + { + "epoch": 0.4144667067962278, + "grad_norm": 0.9375, + "learning_rate": 6.574271362902262e-05, + "loss": 1.1282, + "step": 2629 + }, + { + "epoch": 0.41462435864362085, + "grad_norm": 0.94921875, + "learning_rate": 6.573838849116316e-05, + "loss": 1.0887, + "step": 2630 + }, + { + "epoch": 0.4147820104910138, + "grad_norm": 0.9765625, + "learning_rate": 6.573406342591927e-05, + "loss": 1.1364, + "step": 2631 + }, + { + "epoch": 0.41493966233840684, + "grad_norm": 0.98828125, + "learning_rate": 6.572973843330006e-05, + "loss": 0.9321, + "step": 2632 + }, + { + "epoch": 0.41509731418579987, + "grad_norm": 0.953125, + "learning_rate": 6.572541351331472e-05, + "loss": 1.0673, + "step": 2633 + }, + { + "epoch": 0.4152549660331929, + "grad_norm": 0.953125, + "learning_rate": 6.572108866597238e-05, + "loss": 1.2719, + "step": 2634 + }, + { + "epoch": 0.41541261788058587, + "grad_norm": 0.93359375, + "learning_rate": 6.571676389128223e-05, + "loss": 1.1433, + "step": 2635 + }, + { + "epoch": 0.4155702697279789, + "grad_norm": 1.0546875, + "learning_rate": 6.571243918925344e-05, + "loss": 1.2131, + "step": 2636 + }, + { + "epoch": 0.4157279215753719, + "grad_norm": 1.0078125, + "learning_rate": 6.570811455989521e-05, + "loss": 1.0439, + "step": 2637 + }, + { + "epoch": 0.41588557342276494, + "grad_norm": 0.9453125, + "learning_rate": 6.570379000321667e-05, + "loss": 1.0461, + "step": 2638 + }, + { + "epoch": 0.4160432252701579, + "grad_norm": 1.0625, + "learning_rate": 6.569946551922696e-05, + "loss": 1.0368, + "step": 2639 + }, + { + "epoch": 0.41620087711755094, + "grad_norm": 0.8515625, + "learning_rate": 6.56951411079353e-05, + "loss": 1.1239, + "step": 2640 + }, + { + "epoch": 0.41635852896494396, + "grad_norm": 0.91015625, + "learning_rate": 6.569081676935078e-05, + "loss": 1.0256, + "step": 2641 + }, + { + "epoch": 0.416516180812337, + "grad_norm": 1.0546875, + "learning_rate": 6.568649250348264e-05, + "loss": 1.2058, + "step": 2642 + }, + { + "epoch": 0.41667383265972996, + "grad_norm": 0.98828125, + "learning_rate": 6.568216831034002e-05, + "loss": 1.0007, + "step": 2643 + }, + { + "epoch": 0.416831484507123, + "grad_norm": 0.96484375, + "learning_rate": 6.56778441899321e-05, + "loss": 1.2472, + "step": 2644 + }, + { + "epoch": 0.416989136354516, + "grad_norm": 1.046875, + "learning_rate": 6.567352014226802e-05, + "loss": 1.0037, + "step": 2645 + }, + { + "epoch": 0.41714678820190904, + "grad_norm": 0.921875, + "learning_rate": 6.566919616735689e-05, + "loss": 1.0668, + "step": 2646 + }, + { + "epoch": 0.417304440049302, + "grad_norm": 0.90234375, + "learning_rate": 6.566487226520801e-05, + "loss": 1.0446, + "step": 2647 + }, + { + "epoch": 0.41746209189669503, + "grad_norm": 0.94921875, + "learning_rate": 6.566054843583045e-05, + "loss": 1.1741, + "step": 2648 + }, + { + "epoch": 0.41761974374408806, + "grad_norm": 0.91796875, + "learning_rate": 6.56562246792334e-05, + "loss": 0.8666, + "step": 2649 + }, + { + "epoch": 0.4177773955914811, + "grad_norm": 0.921875, + "learning_rate": 6.565190099542603e-05, + "loss": 1.1364, + "step": 2650 + }, + { + "epoch": 0.41793504743887405, + "grad_norm": 0.96875, + "learning_rate": 6.564757738441745e-05, + "loss": 0.973, + "step": 2651 + }, + { + "epoch": 0.4180926992862671, + "grad_norm": 1.0234375, + "learning_rate": 6.564325384621688e-05, + "loss": 1.3055, + "step": 2652 + }, + { + "epoch": 0.4182503511336601, + "grad_norm": 0.9140625, + "learning_rate": 6.563893038083349e-05, + "loss": 0.9315, + "step": 2653 + }, + { + "epoch": 0.41840800298105313, + "grad_norm": 0.86328125, + "learning_rate": 6.563460698827642e-05, + "loss": 0.8043, + "step": 2654 + }, + { + "epoch": 0.4185656548284461, + "grad_norm": 1.0703125, + "learning_rate": 6.563028366855484e-05, + "loss": 1.2749, + "step": 2655 + }, + { + "epoch": 0.4187233066758391, + "grad_norm": 0.89453125, + "learning_rate": 6.562596042167788e-05, + "loss": 0.706, + "step": 2656 + }, + { + "epoch": 0.41888095852323215, + "grad_norm": 0.90625, + "learning_rate": 6.562163724765476e-05, + "loss": 1.0255, + "step": 2657 + }, + { + "epoch": 0.4190386103706252, + "grad_norm": 0.8671875, + "learning_rate": 6.561731414649462e-05, + "loss": 0.966, + "step": 2658 + }, + { + "epoch": 0.41919626221801815, + "grad_norm": 1.28125, + "learning_rate": 6.561299111820661e-05, + "loss": 1.0845, + "step": 2659 + }, + { + "epoch": 0.41935391406541117, + "grad_norm": 0.91796875, + "learning_rate": 6.560866816279991e-05, + "loss": 1.0453, + "step": 2660 + }, + { + "epoch": 0.4195115659128042, + "grad_norm": 1.0234375, + "learning_rate": 6.56043452802836e-05, + "loss": 1.0875, + "step": 2661 + }, + { + "epoch": 0.4196692177601972, + "grad_norm": 0.9140625, + "learning_rate": 6.560002247066699e-05, + "loss": 0.9738, + "step": 2662 + }, + { + "epoch": 0.41982686960759025, + "grad_norm": 0.93359375, + "learning_rate": 6.559569973395915e-05, + "loss": 0.7607, + "step": 2663 + }, + { + "epoch": 0.4199845214549832, + "grad_norm": 1.0390625, + "learning_rate": 6.559137707016927e-05, + "loss": 1.1456, + "step": 2664 + }, + { + "epoch": 0.42014217330237624, + "grad_norm": 0.92578125, + "learning_rate": 6.558705447930649e-05, + "loss": 0.9813, + "step": 2665 + }, + { + "epoch": 0.42029982514976927, + "grad_norm": 0.88671875, + "learning_rate": 6.558273196137995e-05, + "loss": 0.9075, + "step": 2666 + }, + { + "epoch": 0.4204574769971623, + "grad_norm": 0.97265625, + "learning_rate": 6.557840951639886e-05, + "loss": 1.2153, + "step": 2667 + }, + { + "epoch": 0.42061512884455526, + "grad_norm": 1.0234375, + "learning_rate": 6.557408714437239e-05, + "loss": 1.0649, + "step": 2668 + }, + { + "epoch": 0.4207727806919483, + "grad_norm": 0.921875, + "learning_rate": 6.556976484530965e-05, + "loss": 1.0956, + "step": 2669 + }, + { + "epoch": 0.4209304325393413, + "grad_norm": 0.984375, + "learning_rate": 6.556544261921984e-05, + "loss": 1.1292, + "step": 2670 + }, + { + "epoch": 0.42108808438673434, + "grad_norm": 0.92578125, + "learning_rate": 6.556112046611207e-05, + "loss": 0.8227, + "step": 2671 + }, + { + "epoch": 0.4212457362341273, + "grad_norm": 0.90234375, + "learning_rate": 6.555679838599556e-05, + "loss": 0.9641, + "step": 2672 + }, + { + "epoch": 0.42140338808152034, + "grad_norm": 0.8671875, + "learning_rate": 6.555247637887947e-05, + "loss": 1.036, + "step": 2673 + }, + { + "epoch": 0.42156103992891336, + "grad_norm": 0.99609375, + "learning_rate": 6.554815444477292e-05, + "loss": 0.976, + "step": 2674 + }, + { + "epoch": 0.4217186917763064, + "grad_norm": 1.09375, + "learning_rate": 6.554383258368509e-05, + "loss": 1.1241, + "step": 2675 + }, + { + "epoch": 0.42187634362369936, + "grad_norm": 0.984375, + "learning_rate": 6.553951079562513e-05, + "loss": 1.2606, + "step": 2676 + }, + { + "epoch": 0.4220339954710924, + "grad_norm": 1.1640625, + "learning_rate": 6.553518908060221e-05, + "loss": 1.1383, + "step": 2677 + }, + { + "epoch": 0.4221916473184854, + "grad_norm": 0.87890625, + "learning_rate": 6.55308674386255e-05, + "loss": 0.9199, + "step": 2678 + }, + { + "epoch": 0.42234929916587843, + "grad_norm": 0.984375, + "learning_rate": 6.552654586970414e-05, + "loss": 1.3144, + "step": 2679 + }, + { + "epoch": 0.4225069510132714, + "grad_norm": 0.95703125, + "learning_rate": 6.55222243738473e-05, + "loss": 0.9537, + "step": 2680 + }, + { + "epoch": 0.42266460286066443, + "grad_norm": 1.015625, + "learning_rate": 6.551790295106407e-05, + "loss": 1.0827, + "step": 2681 + }, + { + "epoch": 0.42282225470805745, + "grad_norm": 0.95703125, + "learning_rate": 6.551358160136376e-05, + "loss": 1.2236, + "step": 2682 + }, + { + "epoch": 0.4229799065554505, + "grad_norm": 1.140625, + "learning_rate": 6.550926032475542e-05, + "loss": 1.0299, + "step": 2683 + }, + { + "epoch": 0.42313755840284345, + "grad_norm": 0.89453125, + "learning_rate": 6.550493912124822e-05, + "loss": 1.0157, + "step": 2684 + }, + { + "epoch": 0.4232952102502365, + "grad_norm": 0.875, + "learning_rate": 6.550061799085134e-05, + "loss": 0.9454, + "step": 2685 + }, + { + "epoch": 0.4234528620976295, + "grad_norm": 1.0, + "learning_rate": 6.549629693357389e-05, + "loss": 1.1689, + "step": 2686 + }, + { + "epoch": 0.4236105139450225, + "grad_norm": 1.0703125, + "learning_rate": 6.54919759494251e-05, + "loss": 1.1926, + "step": 2687 + }, + { + "epoch": 0.4237681657924155, + "grad_norm": 0.875, + "learning_rate": 6.54876550384141e-05, + "loss": 0.9646, + "step": 2688 + }, + { + "epoch": 0.4239258176398085, + "grad_norm": 1.1484375, + "learning_rate": 6.548333420055004e-05, + "loss": 1.0622, + "step": 2689 + }, + { + "epoch": 0.42408346948720155, + "grad_norm": 0.94140625, + "learning_rate": 6.547901343584208e-05, + "loss": 1.1203, + "step": 2690 + }, + { + "epoch": 0.4242411213345946, + "grad_norm": 1.0234375, + "learning_rate": 6.547469274429933e-05, + "loss": 1.0942, + "step": 2691 + }, + { + "epoch": 0.42439877318198754, + "grad_norm": 0.8984375, + "learning_rate": 6.547037212593106e-05, + "loss": 1.0022, + "step": 2692 + }, + { + "epoch": 0.42455642502938057, + "grad_norm": 1.2265625, + "learning_rate": 6.546605158074635e-05, + "loss": 0.8668, + "step": 2693 + }, + { + "epoch": 0.4247140768767736, + "grad_norm": 0.8359375, + "learning_rate": 6.546173110875435e-05, + "loss": 0.8737, + "step": 2694 + }, + { + "epoch": 0.4248717287241666, + "grad_norm": 0.92578125, + "learning_rate": 6.545741070996427e-05, + "loss": 0.9713, + "step": 2695 + }, + { + "epoch": 0.4250293805715596, + "grad_norm": 0.97265625, + "learning_rate": 6.545309038438518e-05, + "loss": 1.008, + "step": 2696 + }, + { + "epoch": 0.4251870324189526, + "grad_norm": 0.91015625, + "learning_rate": 6.544877013202633e-05, + "loss": 0.9426, + "step": 2697 + }, + { + "epoch": 0.42534468426634564, + "grad_norm": 0.9375, + "learning_rate": 6.544444995289685e-05, + "loss": 0.9433, + "step": 2698 + }, + { + "epoch": 0.42550233611373867, + "grad_norm": 0.8671875, + "learning_rate": 6.544012984700587e-05, + "loss": 1.0172, + "step": 2699 + }, + { + "epoch": 0.42565998796113164, + "grad_norm": 0.85546875, + "learning_rate": 6.543580981436255e-05, + "loss": 1.0212, + "step": 2700 + }, + { + "epoch": 0.42581763980852466, + "grad_norm": 0.90234375, + "learning_rate": 6.543148985497603e-05, + "loss": 0.9989, + "step": 2701 + }, + { + "epoch": 0.4259752916559177, + "grad_norm": 1.03125, + "learning_rate": 6.542716996885551e-05, + "loss": 1.0019, + "step": 2702 + }, + { + "epoch": 0.4261329435033107, + "grad_norm": 0.921875, + "learning_rate": 6.542285015601015e-05, + "loss": 0.9482, + "step": 2703 + }, + { + "epoch": 0.4262905953507037, + "grad_norm": 0.90625, + "learning_rate": 6.541853041644907e-05, + "loss": 0.9904, + "step": 2704 + }, + { + "epoch": 0.4264482471980967, + "grad_norm": 0.9375, + "learning_rate": 6.541421075018143e-05, + "loss": 1.2548, + "step": 2705 + }, + { + "epoch": 0.42660589904548973, + "grad_norm": 1.015625, + "learning_rate": 6.540989115721638e-05, + "loss": 1.0241, + "step": 2706 + }, + { + "epoch": 0.42676355089288276, + "grad_norm": 0.92578125, + "learning_rate": 6.54055716375631e-05, + "loss": 1.0165, + "step": 2707 + }, + { + "epoch": 0.42692120274027573, + "grad_norm": 0.84765625, + "learning_rate": 6.540125219123077e-05, + "loss": 0.8942, + "step": 2708 + }, + { + "epoch": 0.42707885458766875, + "grad_norm": 0.9609375, + "learning_rate": 6.539693281822847e-05, + "loss": 0.9734, + "step": 2709 + }, + { + "epoch": 0.4272365064350618, + "grad_norm": 1.078125, + "learning_rate": 6.539261351856542e-05, + "loss": 1.144, + "step": 2710 + }, + { + "epoch": 0.4273941582824548, + "grad_norm": 1.046875, + "learning_rate": 6.538829429225069e-05, + "loss": 1.1552, + "step": 2711 + }, + { + "epoch": 0.4275518101298478, + "grad_norm": 1.03125, + "learning_rate": 6.538397513929354e-05, + "loss": 1.2133, + "step": 2712 + }, + { + "epoch": 0.4277094619772408, + "grad_norm": 0.8359375, + "learning_rate": 6.537965605970308e-05, + "loss": 0.945, + "step": 2713 + }, + { + "epoch": 0.4278671138246338, + "grad_norm": 0.82421875, + "learning_rate": 6.537533705348845e-05, + "loss": 0.7975, + "step": 2714 + }, + { + "epoch": 0.42802476567202685, + "grad_norm": 0.890625, + "learning_rate": 6.537101812065882e-05, + "loss": 0.9211, + "step": 2715 + }, + { + "epoch": 0.4281824175194198, + "grad_norm": 1.03125, + "learning_rate": 6.536669926122332e-05, + "loss": 1.177, + "step": 2716 + }, + { + "epoch": 0.42834006936681285, + "grad_norm": 0.9921875, + "learning_rate": 6.536238047519114e-05, + "loss": 0.9751, + "step": 2717 + }, + { + "epoch": 0.4284977212142059, + "grad_norm": 0.9453125, + "learning_rate": 6.535806176257141e-05, + "loss": 1.0925, + "step": 2718 + }, + { + "epoch": 0.4286553730615989, + "grad_norm": 0.87109375, + "learning_rate": 6.535374312337328e-05, + "loss": 1.1275, + "step": 2719 + }, + { + "epoch": 0.42881302490899187, + "grad_norm": 0.90625, + "learning_rate": 6.534942455760591e-05, + "loss": 0.9658, + "step": 2720 + }, + { + "epoch": 0.4289706767563849, + "grad_norm": 0.83984375, + "learning_rate": 6.534510606527845e-05, + "loss": 0.9199, + "step": 2721 + }, + { + "epoch": 0.4291283286037779, + "grad_norm": 0.87890625, + "learning_rate": 6.534078764640008e-05, + "loss": 0.989, + "step": 2722 + }, + { + "epoch": 0.42928598045117095, + "grad_norm": 0.8984375, + "learning_rate": 6.533646930097991e-05, + "loss": 0.9411, + "step": 2723 + }, + { + "epoch": 0.4294436322985639, + "grad_norm": 0.90234375, + "learning_rate": 6.533215102902714e-05, + "loss": 0.9289, + "step": 2724 + }, + { + "epoch": 0.42960128414595694, + "grad_norm": 1.0078125, + "learning_rate": 6.532783283055087e-05, + "loss": 1.047, + "step": 2725 + }, + { + "epoch": 0.42975893599334997, + "grad_norm": 0.96875, + "learning_rate": 6.532351470556023e-05, + "loss": 1.1762, + "step": 2726 + }, + { + "epoch": 0.429916587840743, + "grad_norm": 0.890625, + "learning_rate": 6.53191966540645e-05, + "loss": 0.9962, + "step": 2727 + }, + { + "epoch": 0.43007423968813596, + "grad_norm": 1.0390625, + "learning_rate": 6.531487867607272e-05, + "loss": 1.0826, + "step": 2728 + }, + { + "epoch": 0.430231891535529, + "grad_norm": 0.87890625, + "learning_rate": 6.531056077159407e-05, + "loss": 1.0216, + "step": 2729 + }, + { + "epoch": 0.430389543382922, + "grad_norm": 0.94921875, + "learning_rate": 6.530624294063771e-05, + "loss": 1.1872, + "step": 2730 + }, + { + "epoch": 0.43054719523031504, + "grad_norm": 0.94140625, + "learning_rate": 6.530192518321276e-05, + "loss": 1.0959, + "step": 2731 + }, + { + "epoch": 0.430704847077708, + "grad_norm": 0.9921875, + "learning_rate": 6.529760749932841e-05, + "loss": 0.9862, + "step": 2732 + }, + { + "epoch": 0.43086249892510103, + "grad_norm": 0.953125, + "learning_rate": 6.529328988899382e-05, + "loss": 0.9641, + "step": 2733 + }, + { + "epoch": 0.43102015077249406, + "grad_norm": 0.91015625, + "learning_rate": 6.52889723522181e-05, + "loss": 1.0431, + "step": 2734 + }, + { + "epoch": 0.4311778026198871, + "grad_norm": 0.9453125, + "learning_rate": 6.528465488901043e-05, + "loss": 0.9527, + "step": 2735 + }, + { + "epoch": 0.43133545446728006, + "grad_norm": 0.9765625, + "learning_rate": 6.528033749937992e-05, + "loss": 1.0643, + "step": 2736 + }, + { + "epoch": 0.4314931063146731, + "grad_norm": 0.93359375, + "learning_rate": 6.527602018333579e-05, + "loss": 1.1445, + "step": 2737 + }, + { + "epoch": 0.4316507581620661, + "grad_norm": 0.9375, + "learning_rate": 6.527170294088712e-05, + "loss": 1.0928, + "step": 2738 + }, + { + "epoch": 0.43180841000945913, + "grad_norm": 0.84375, + "learning_rate": 6.52673857720431e-05, + "loss": 0.9932, + "step": 2739 + }, + { + "epoch": 0.4319660618568521, + "grad_norm": 1.046875, + "learning_rate": 6.526306867681288e-05, + "loss": 1.2806, + "step": 2740 + }, + { + "epoch": 0.4321237137042451, + "grad_norm": 0.83984375, + "learning_rate": 6.525875165520558e-05, + "loss": 1.0832, + "step": 2741 + }, + { + "epoch": 0.43228136555163815, + "grad_norm": 0.828125, + "learning_rate": 6.525443470723039e-05, + "loss": 0.9927, + "step": 2742 + }, + { + "epoch": 0.4324390173990312, + "grad_norm": 0.93359375, + "learning_rate": 6.525011783289644e-05, + "loss": 0.9146, + "step": 2743 + }, + { + "epoch": 0.43259666924642415, + "grad_norm": 0.953125, + "learning_rate": 6.524580103221287e-05, + "loss": 0.9657, + "step": 2744 + }, + { + "epoch": 0.4327543210938172, + "grad_norm": 0.86328125, + "learning_rate": 6.524148430518886e-05, + "loss": 1.032, + "step": 2745 + }, + { + "epoch": 0.4329119729412102, + "grad_norm": 0.87109375, + "learning_rate": 6.523716765183347e-05, + "loss": 1.0179, + "step": 2746 + }, + { + "epoch": 0.4330696247886032, + "grad_norm": 0.8671875, + "learning_rate": 6.523285107215597e-05, + "loss": 0.9907, + "step": 2747 + }, + { + "epoch": 0.4332272766359962, + "grad_norm": 0.91796875, + "learning_rate": 6.522853456616547e-05, + "loss": 1.0947, + "step": 2748 + }, + { + "epoch": 0.4333849284833892, + "grad_norm": 0.92578125, + "learning_rate": 6.522421813387107e-05, + "loss": 1.0683, + "step": 2749 + }, + { + "epoch": 0.43354258033078225, + "grad_norm": 0.953125, + "learning_rate": 6.521990177528198e-05, + "loss": 1.229, + "step": 2750 + }, + { + "epoch": 0.43370023217817527, + "grad_norm": 0.9921875, + "learning_rate": 6.521558549040727e-05, + "loss": 1.0778, + "step": 2751 + }, + { + "epoch": 0.43385788402556824, + "grad_norm": 0.93359375, + "learning_rate": 6.521126927925618e-05, + "loss": 0.9319, + "step": 2752 + }, + { + "epoch": 0.43401553587296127, + "grad_norm": 0.8359375, + "learning_rate": 6.520695314183782e-05, + "loss": 1.0405, + "step": 2753 + }, + { + "epoch": 0.4341731877203543, + "grad_norm": 0.8203125, + "learning_rate": 6.520263707816132e-05, + "loss": 0.8712, + "step": 2754 + }, + { + "epoch": 0.4343308395677473, + "grad_norm": 1.0546875, + "learning_rate": 6.519832108823586e-05, + "loss": 1.0385, + "step": 2755 + }, + { + "epoch": 0.4344884914151403, + "grad_norm": 0.95703125, + "learning_rate": 6.519400517207052e-05, + "loss": 1.1215, + "step": 2756 + }, + { + "epoch": 0.4346461432625333, + "grad_norm": 0.86328125, + "learning_rate": 6.518968932967453e-05, + "loss": 0.9479, + "step": 2757 + }, + { + "epoch": 0.43480379510992634, + "grad_norm": 0.8984375, + "learning_rate": 6.518537356105701e-05, + "loss": 1.1548, + "step": 2758 + }, + { + "epoch": 0.43496144695731936, + "grad_norm": 0.8828125, + "learning_rate": 6.518105786622711e-05, + "loss": 0.9939, + "step": 2759 + }, + { + "epoch": 0.43511909880471233, + "grad_norm": 0.9453125, + "learning_rate": 6.517674224519397e-05, + "loss": 1.0875, + "step": 2760 + }, + { + "epoch": 0.43527675065210536, + "grad_norm": 1.3984375, + "learning_rate": 6.517242669796669e-05, + "loss": 0.9756, + "step": 2761 + }, + { + "epoch": 0.4354344024994984, + "grad_norm": 1.0234375, + "learning_rate": 6.516811122455449e-05, + "loss": 1.1052, + "step": 2762 + }, + { + "epoch": 0.4355920543468914, + "grad_norm": 0.87890625, + "learning_rate": 6.51637958249665e-05, + "loss": 0.9943, + "step": 2763 + }, + { + "epoch": 0.4357497061942844, + "grad_norm": 1.0078125, + "learning_rate": 6.515948049921186e-05, + "loss": 1.2119, + "step": 2764 + }, + { + "epoch": 0.4359073580416774, + "grad_norm": 0.8828125, + "learning_rate": 6.515516524729967e-05, + "loss": 1.002, + "step": 2765 + }, + { + "epoch": 0.43606500988907043, + "grad_norm": 1.140625, + "learning_rate": 6.51508500692391e-05, + "loss": 1.2949, + "step": 2766 + }, + { + "epoch": 0.43622266173646346, + "grad_norm": 0.9609375, + "learning_rate": 6.514653496503938e-05, + "loss": 1.1826, + "step": 2767 + }, + { + "epoch": 0.4363803135838564, + "grad_norm": 0.98828125, + "learning_rate": 6.514221993470954e-05, + "loss": 1.0555, + "step": 2768 + }, + { + "epoch": 0.43653796543124945, + "grad_norm": 1.0, + "learning_rate": 6.51379049782588e-05, + "loss": 1.0748, + "step": 2769 + }, + { + "epoch": 0.4366956172786425, + "grad_norm": 0.890625, + "learning_rate": 6.513359009569627e-05, + "loss": 0.9438, + "step": 2770 + }, + { + "epoch": 0.4368532691260355, + "grad_norm": 0.87109375, + "learning_rate": 6.512927528703105e-05, + "loss": 1.0476, + "step": 2771 + }, + { + "epoch": 0.4370109209734285, + "grad_norm": 1.0234375, + "learning_rate": 6.51249605522724e-05, + "loss": 1.0635, + "step": 2772 + }, + { + "epoch": 0.4371685728208215, + "grad_norm": 0.9296875, + "learning_rate": 6.512064589142938e-05, + "loss": 1.0634, + "step": 2773 + }, + { + "epoch": 0.4373262246682145, + "grad_norm": 0.98828125, + "learning_rate": 6.511633130451117e-05, + "loss": 1.0078, + "step": 2774 + }, + { + "epoch": 0.43748387651560755, + "grad_norm": 0.87109375, + "learning_rate": 6.511201679152692e-05, + "loss": 1.1566, + "step": 2775 + }, + { + "epoch": 0.4376415283630005, + "grad_norm": 0.98046875, + "learning_rate": 6.51077023524857e-05, + "loss": 1.2044, + "step": 2776 + }, + { + "epoch": 0.43779918021039355, + "grad_norm": 0.9140625, + "learning_rate": 6.510338798739674e-05, + "loss": 0.9469, + "step": 2777 + }, + { + "epoch": 0.43795683205778657, + "grad_norm": 0.859375, + "learning_rate": 6.509907369626917e-05, + "loss": 0.868, + "step": 2778 + }, + { + "epoch": 0.4381144839051796, + "grad_norm": 0.84765625, + "learning_rate": 6.509475947911212e-05, + "loss": 1.0416, + "step": 2779 + }, + { + "epoch": 0.43827213575257257, + "grad_norm": 1.0234375, + "learning_rate": 6.509044533593473e-05, + "loss": 1.046, + "step": 2780 + }, + { + "epoch": 0.4384297875999656, + "grad_norm": 0.97265625, + "learning_rate": 6.508613126674612e-05, + "loss": 1.0648, + "step": 2781 + }, + { + "epoch": 0.4385874394473586, + "grad_norm": 1.0078125, + "learning_rate": 6.508181727155547e-05, + "loss": 1.0563, + "step": 2782 + }, + { + "epoch": 0.43874509129475164, + "grad_norm": 0.98046875, + "learning_rate": 6.507750335037192e-05, + "loss": 1.0095, + "step": 2783 + }, + { + "epoch": 0.4389027431421446, + "grad_norm": 0.96484375, + "learning_rate": 6.507318950320461e-05, + "loss": 0.9736, + "step": 2784 + }, + { + "epoch": 0.43906039498953764, + "grad_norm": 0.90625, + "learning_rate": 6.506887573006268e-05, + "loss": 0.9974, + "step": 2785 + }, + { + "epoch": 0.43921804683693066, + "grad_norm": 1.0390625, + "learning_rate": 6.506456203095526e-05, + "loss": 1.293, + "step": 2786 + }, + { + "epoch": 0.4393756986843237, + "grad_norm": 1.0, + "learning_rate": 6.506024840589148e-05, + "loss": 0.974, + "step": 2787 + }, + { + "epoch": 0.43953335053171666, + "grad_norm": 0.94140625, + "learning_rate": 6.505593485488054e-05, + "loss": 1.0034, + "step": 2788 + }, + { + "epoch": 0.4396910023791097, + "grad_norm": 0.94140625, + "learning_rate": 6.505162137793154e-05, + "loss": 1.068, + "step": 2789 + }, + { + "epoch": 0.4398486542265027, + "grad_norm": 0.9765625, + "learning_rate": 6.504730797505365e-05, + "loss": 1.0178, + "step": 2790 + }, + { + "epoch": 0.44000630607389574, + "grad_norm": 0.93359375, + "learning_rate": 6.504299464625599e-05, + "loss": 1.1401, + "step": 2791 + }, + { + "epoch": 0.4401639579212887, + "grad_norm": 1.0234375, + "learning_rate": 6.503868139154763e-05, + "loss": 1.0917, + "step": 2792 + }, + { + "epoch": 0.44032160976868173, + "grad_norm": 0.88671875, + "learning_rate": 6.503436821093787e-05, + "loss": 1.0827, + "step": 2793 + }, + { + "epoch": 0.44047926161607476, + "grad_norm": 1.0390625, + "learning_rate": 6.503005510443574e-05, + "loss": 1.3003, + "step": 2794 + }, + { + "epoch": 0.4406369134634678, + "grad_norm": 0.984375, + "learning_rate": 6.502574207205042e-05, + "loss": 0.9289, + "step": 2795 + }, + { + "epoch": 0.44079456531086075, + "grad_norm": 1.375, + "learning_rate": 6.502142911379105e-05, + "loss": 1.1396, + "step": 2796 + }, + { + "epoch": 0.4409522171582538, + "grad_norm": 0.9296875, + "learning_rate": 6.501711622966671e-05, + "loss": 1.1392, + "step": 2797 + }, + { + "epoch": 0.4411098690056468, + "grad_norm": 1.109375, + "learning_rate": 6.501280341968663e-05, + "loss": 1.0291, + "step": 2798 + }, + { + "epoch": 0.44126752085303983, + "grad_norm": 1.03125, + "learning_rate": 6.500849068385993e-05, + "loss": 1.4314, + "step": 2799 + }, + { + "epoch": 0.4414251727004328, + "grad_norm": 1.453125, + "learning_rate": 6.500417802219572e-05, + "loss": 1.2191, + "step": 2800 + }, + { + "epoch": 0.4415828245478258, + "grad_norm": 0.9140625, + "learning_rate": 6.499986543470315e-05, + "loss": 1.1965, + "step": 2801 + }, + { + "epoch": 0.44174047639521885, + "grad_norm": 0.890625, + "learning_rate": 6.499555292139134e-05, + "loss": 0.9441, + "step": 2802 + }, + { + "epoch": 0.4418981282426119, + "grad_norm": 0.90234375, + "learning_rate": 6.49912404822695e-05, + "loss": 1.0731, + "step": 2803 + }, + { + "epoch": 0.44205578009000485, + "grad_norm": 0.90625, + "learning_rate": 6.498692811734671e-05, + "loss": 1.0864, + "step": 2804 + }, + { + "epoch": 0.44221343193739787, + "grad_norm": 0.99609375, + "learning_rate": 6.49826158266321e-05, + "loss": 0.9313, + "step": 2805 + }, + { + "epoch": 0.4423710837847909, + "grad_norm": 0.828125, + "learning_rate": 6.497830361013486e-05, + "loss": 0.8939, + "step": 2806 + }, + { + "epoch": 0.4425287356321839, + "grad_norm": 0.96484375, + "learning_rate": 6.497399146786408e-05, + "loss": 1.3015, + "step": 2807 + }, + { + "epoch": 0.4426863874795769, + "grad_norm": 0.88671875, + "learning_rate": 6.496967939982895e-05, + "loss": 0.9593, + "step": 2808 + }, + { + "epoch": 0.4428440393269699, + "grad_norm": 0.9140625, + "learning_rate": 6.496536740603858e-05, + "loss": 0.8728, + "step": 2809 + }, + { + "epoch": 0.44300169117436294, + "grad_norm": 0.8828125, + "learning_rate": 6.496105548650212e-05, + "loss": 1.1447, + "step": 2810 + }, + { + "epoch": 0.44315934302175597, + "grad_norm": 0.98828125, + "learning_rate": 6.495674364122868e-05, + "loss": 1.0463, + "step": 2811 + }, + { + "epoch": 0.44331699486914894, + "grad_norm": 0.9609375, + "learning_rate": 6.495243187022739e-05, + "loss": 1.1476, + "step": 2812 + }, + { + "epoch": 0.44347464671654196, + "grad_norm": 0.9296875, + "learning_rate": 6.494812017350746e-05, + "loss": 1.0201, + "step": 2813 + }, + { + "epoch": 0.443632298563935, + "grad_norm": 0.94921875, + "learning_rate": 6.494380855107798e-05, + "loss": 1.014, + "step": 2814 + }, + { + "epoch": 0.443789950411328, + "grad_norm": 1.046875, + "learning_rate": 6.49394970029481e-05, + "loss": 1.0446, + "step": 2815 + }, + { + "epoch": 0.44394760225872104, + "grad_norm": 0.79296875, + "learning_rate": 6.493518552912695e-05, + "loss": 0.8293, + "step": 2816 + }, + { + "epoch": 0.444105254106114, + "grad_norm": 1.03125, + "learning_rate": 6.493087412962361e-05, + "loss": 0.9589, + "step": 2817 + }, + { + "epoch": 0.44426290595350704, + "grad_norm": 0.98828125, + "learning_rate": 6.492656280444735e-05, + "loss": 1.2074, + "step": 2818 + }, + { + "epoch": 0.44442055780090006, + "grad_norm": 0.93359375, + "learning_rate": 6.492225155360722e-05, + "loss": 0.9702, + "step": 2819 + }, + { + "epoch": 0.4445782096482931, + "grad_norm": 0.94921875, + "learning_rate": 6.491794037711238e-05, + "loss": 1.2989, + "step": 2820 + }, + { + "epoch": 0.44473586149568606, + "grad_norm": 0.90234375, + "learning_rate": 6.491362927497195e-05, + "loss": 1.0855, + "step": 2821 + }, + { + "epoch": 0.4448935133430791, + "grad_norm": 0.9140625, + "learning_rate": 6.490931824719506e-05, + "loss": 0.8969, + "step": 2822 + }, + { + "epoch": 0.4450511651904721, + "grad_norm": 0.875, + "learning_rate": 6.490500729379087e-05, + "loss": 0.9227, + "step": 2823 + }, + { + "epoch": 0.44520881703786513, + "grad_norm": 0.94140625, + "learning_rate": 6.490069641476852e-05, + "loss": 0.995, + "step": 2824 + }, + { + "epoch": 0.4453664688852581, + "grad_norm": 0.88671875, + "learning_rate": 6.489638561013716e-05, + "loss": 1.1086, + "step": 2825 + }, + { + "epoch": 0.44552412073265113, + "grad_norm": 1.0546875, + "learning_rate": 6.489207487990588e-05, + "loss": 1.2279, + "step": 2826 + }, + { + "epoch": 0.44568177258004416, + "grad_norm": 0.89453125, + "learning_rate": 6.488776422408384e-05, + "loss": 1.1683, + "step": 2827 + }, + { + "epoch": 0.4458394244274372, + "grad_norm": 0.97265625, + "learning_rate": 6.488345364268018e-05, + "loss": 1.0655, + "step": 2828 + }, + { + "epoch": 0.44599707627483015, + "grad_norm": 0.80859375, + "learning_rate": 6.487914313570404e-05, + "loss": 0.9703, + "step": 2829 + }, + { + "epoch": 0.4461547281222232, + "grad_norm": 0.9140625, + "learning_rate": 6.487483270316455e-05, + "loss": 1.2179, + "step": 2830 + }, + { + "epoch": 0.4463123799696162, + "grad_norm": 1.2109375, + "learning_rate": 6.487052234507085e-05, + "loss": 1.1804, + "step": 2831 + }, + { + "epoch": 0.4464700318170092, + "grad_norm": 0.87109375, + "learning_rate": 6.486621206143203e-05, + "loss": 0.9828, + "step": 2832 + }, + { + "epoch": 0.4466276836644022, + "grad_norm": 0.94140625, + "learning_rate": 6.486190185225729e-05, + "loss": 1.1227, + "step": 2833 + }, + { + "epoch": 0.4467853355117952, + "grad_norm": 1.0234375, + "learning_rate": 6.485759171755574e-05, + "loss": 0.986, + "step": 2834 + }, + { + "epoch": 0.44694298735918825, + "grad_norm": 1.0078125, + "learning_rate": 6.485328165733653e-05, + "loss": 1.1596, + "step": 2835 + }, + { + "epoch": 0.4471006392065813, + "grad_norm": 1.09375, + "learning_rate": 6.484897167160877e-05, + "loss": 1.1091, + "step": 2836 + }, + { + "epoch": 0.44725829105397424, + "grad_norm": 0.9609375, + "learning_rate": 6.484466176038158e-05, + "loss": 0.8134, + "step": 2837 + }, + { + "epoch": 0.44741594290136727, + "grad_norm": 0.92578125, + "learning_rate": 6.484035192366414e-05, + "loss": 0.9533, + "step": 2838 + }, + { + "epoch": 0.4475735947487603, + "grad_norm": 0.984375, + "learning_rate": 6.483604216146558e-05, + "loss": 1.2262, + "step": 2839 + }, + { + "epoch": 0.4477312465961533, + "grad_norm": 0.91796875, + "learning_rate": 6.483173247379501e-05, + "loss": 1.0133, + "step": 2840 + }, + { + "epoch": 0.4478888984435463, + "grad_norm": 0.859375, + "learning_rate": 6.482742286066159e-05, + "loss": 0.9566, + "step": 2841 + }, + { + "epoch": 0.4480465502909393, + "grad_norm": 0.953125, + "learning_rate": 6.482311332207438e-05, + "loss": 1.1468, + "step": 2842 + }, + { + "epoch": 0.44820420213833234, + "grad_norm": 0.94140625, + "learning_rate": 6.48188038580426e-05, + "loss": 0.8488, + "step": 2843 + }, + { + "epoch": 0.44836185398572537, + "grad_norm": 0.9140625, + "learning_rate": 6.481449446857539e-05, + "loss": 1.0653, + "step": 2844 + }, + { + "epoch": 0.44851950583311834, + "grad_norm": 0.91796875, + "learning_rate": 6.481018515368183e-05, + "loss": 1.068, + "step": 2845 + }, + { + "epoch": 0.44867715768051136, + "grad_norm": 0.87890625, + "learning_rate": 6.480587591337106e-05, + "loss": 0.8972, + "step": 2846 + }, + { + "epoch": 0.4488348095279044, + "grad_norm": 0.98828125, + "learning_rate": 6.48015667476522e-05, + "loss": 1.1933, + "step": 2847 + }, + { + "epoch": 0.4489924613752974, + "grad_norm": 0.86328125, + "learning_rate": 6.479725765653445e-05, + "loss": 0.9931, + "step": 2848 + }, + { + "epoch": 0.4491501132226904, + "grad_norm": 1.0234375, + "learning_rate": 6.47929486400269e-05, + "loss": 1.1808, + "step": 2849 + }, + { + "epoch": 0.4493077650700834, + "grad_norm": 0.94921875, + "learning_rate": 6.478863969813866e-05, + "loss": 1.1713, + "step": 2850 + }, + { + "epoch": 0.44946541691747643, + "grad_norm": 0.9609375, + "learning_rate": 6.478433083087891e-05, + "loss": 1.1561, + "step": 2851 + }, + { + "epoch": 0.44962306876486946, + "grad_norm": 0.9765625, + "learning_rate": 6.47800220382567e-05, + "loss": 1.2913, + "step": 2852 + }, + { + "epoch": 0.44978072061226243, + "grad_norm": 0.9609375, + "learning_rate": 6.477571332028128e-05, + "loss": 1.1878, + "step": 2853 + }, + { + "epoch": 0.44993837245965546, + "grad_norm": 0.953125, + "learning_rate": 6.47714046769617e-05, + "loss": 1.2379, + "step": 2854 + }, + { + "epoch": 0.4500960243070485, + "grad_norm": 1.0234375, + "learning_rate": 6.476709610830712e-05, + "loss": 1.2034, + "step": 2855 + }, + { + "epoch": 0.4502536761544415, + "grad_norm": 0.9296875, + "learning_rate": 6.476278761432666e-05, + "loss": 1.1278, + "step": 2856 + }, + { + "epoch": 0.4504113280018345, + "grad_norm": 0.8984375, + "learning_rate": 6.475847919502945e-05, + "loss": 1.0027, + "step": 2857 + }, + { + "epoch": 0.4505689798492275, + "grad_norm": 1.375, + "learning_rate": 6.475417085042464e-05, + "loss": 1.1341, + "step": 2858 + }, + { + "epoch": 0.4507266316966205, + "grad_norm": 0.9765625, + "learning_rate": 6.474986258052135e-05, + "loss": 0.981, + "step": 2859 + }, + { + "epoch": 0.45088428354401355, + "grad_norm": 1.0234375, + "learning_rate": 6.474555438532872e-05, + "loss": 1.0832, + "step": 2860 + }, + { + "epoch": 0.4510419353914065, + "grad_norm": 1.015625, + "learning_rate": 6.474124626485587e-05, + "loss": 1.1032, + "step": 2861 + }, + { + "epoch": 0.45119958723879955, + "grad_norm": 1.09375, + "learning_rate": 6.473693821911188e-05, + "loss": 0.8015, + "step": 2862 + }, + { + "epoch": 0.4513572390861926, + "grad_norm": 1.1171875, + "learning_rate": 6.4732630248106e-05, + "loss": 1.2049, + "step": 2863 + }, + { + "epoch": 0.4515148909335856, + "grad_norm": 1.015625, + "learning_rate": 6.472832235184728e-05, + "loss": 1.131, + "step": 2864 + }, + { + "epoch": 0.45167254278097857, + "grad_norm": 0.95703125, + "learning_rate": 6.472401453034486e-05, + "loss": 1.1863, + "step": 2865 + }, + { + "epoch": 0.4518301946283716, + "grad_norm": 1.0234375, + "learning_rate": 6.471970678360789e-05, + "loss": 1.001, + "step": 2866 + }, + { + "epoch": 0.4519878464757646, + "grad_norm": 0.9140625, + "learning_rate": 6.471539911164546e-05, + "loss": 0.8913, + "step": 2867 + }, + { + "epoch": 0.45214549832315765, + "grad_norm": 0.86328125, + "learning_rate": 6.471109151446674e-05, + "loss": 0.8759, + "step": 2868 + }, + { + "epoch": 0.4523031501705506, + "grad_norm": 0.93359375, + "learning_rate": 6.470678399208085e-05, + "loss": 1.093, + "step": 2869 + }, + { + "epoch": 0.45246080201794364, + "grad_norm": 0.87890625, + "learning_rate": 6.47024765444969e-05, + "loss": 0.9012, + "step": 2870 + }, + { + "epoch": 0.45261845386533667, + "grad_norm": 0.92578125, + "learning_rate": 6.469816917172404e-05, + "loss": 1.0347, + "step": 2871 + }, + { + "epoch": 0.4527761057127297, + "grad_norm": 0.96484375, + "learning_rate": 6.469386187377135e-05, + "loss": 1.1308, + "step": 2872 + }, + { + "epoch": 0.45293375756012266, + "grad_norm": 0.9296875, + "learning_rate": 6.468955465064806e-05, + "loss": 1.1969, + "step": 2873 + }, + { + "epoch": 0.4530914094075157, + "grad_norm": 1.015625, + "learning_rate": 6.468524750236322e-05, + "loss": 1.1529, + "step": 2874 + }, + { + "epoch": 0.4532490612549087, + "grad_norm": 0.97265625, + "learning_rate": 6.4680940428926e-05, + "loss": 1.0287, + "step": 2875 + }, + { + "epoch": 0.45340671310230174, + "grad_norm": 0.83203125, + "learning_rate": 6.467663343034549e-05, + "loss": 0.9452, + "step": 2876 + }, + { + "epoch": 0.4535643649496947, + "grad_norm": 1.015625, + "learning_rate": 6.46723265066308e-05, + "loss": 1.0053, + "step": 2877 + }, + { + "epoch": 0.45372201679708773, + "grad_norm": 1.0078125, + "learning_rate": 6.466801965779114e-05, + "loss": 1.1034, + "step": 2878 + }, + { + "epoch": 0.45387966864448076, + "grad_norm": 0.87890625, + "learning_rate": 6.46637128838356e-05, + "loss": 0.9602, + "step": 2879 + }, + { + "epoch": 0.4540373204918738, + "grad_norm": 0.94140625, + "learning_rate": 6.465940618477328e-05, + "loss": 1.2403, + "step": 2880 + }, + { + "epoch": 0.45419497233926676, + "grad_norm": 1.140625, + "learning_rate": 6.465509956061336e-05, + "loss": 1.0088, + "step": 2881 + }, + { + "epoch": 0.4543526241866598, + "grad_norm": 1.0234375, + "learning_rate": 6.465079301136485e-05, + "loss": 1.1048, + "step": 2882 + }, + { + "epoch": 0.4545102760340528, + "grad_norm": 0.9453125, + "learning_rate": 6.464648653703705e-05, + "loss": 0.9967, + "step": 2883 + }, + { + "epoch": 0.45466792788144583, + "grad_norm": 0.96875, + "learning_rate": 6.464218013763896e-05, + "loss": 1.1412, + "step": 2884 + }, + { + "epoch": 0.4548255797288388, + "grad_norm": 1.015625, + "learning_rate": 6.463787381317978e-05, + "loss": 1.0688, + "step": 2885 + }, + { + "epoch": 0.45498323157623183, + "grad_norm": 1.015625, + "learning_rate": 6.463356756366859e-05, + "loss": 1.2648, + "step": 2886 + }, + { + "epoch": 0.45514088342362485, + "grad_norm": 0.90625, + "learning_rate": 6.462926138911452e-05, + "loss": 0.9267, + "step": 2887 + }, + { + "epoch": 0.4552985352710179, + "grad_norm": 1.0078125, + "learning_rate": 6.462495528952672e-05, + "loss": 1.1445, + "step": 2888 + }, + { + "epoch": 0.45545618711841085, + "grad_norm": 0.94140625, + "learning_rate": 6.462064926491431e-05, + "loss": 1.1423, + "step": 2889 + }, + { + "epoch": 0.4556138389658039, + "grad_norm": 1.0625, + "learning_rate": 6.46163433152864e-05, + "loss": 1.2558, + "step": 2890 + }, + { + "epoch": 0.4557714908131969, + "grad_norm": 0.8984375, + "learning_rate": 6.461203744065214e-05, + "loss": 0.9796, + "step": 2891 + }, + { + "epoch": 0.4559291426605899, + "grad_norm": 0.90625, + "learning_rate": 6.460773164102062e-05, + "loss": 0.977, + "step": 2892 + }, + { + "epoch": 0.4560867945079829, + "grad_norm": 0.9296875, + "learning_rate": 6.460342591640102e-05, + "loss": 1.1335, + "step": 2893 + }, + { + "epoch": 0.4562444463553759, + "grad_norm": 0.98046875, + "learning_rate": 6.459912026680243e-05, + "loss": 1.1063, + "step": 2894 + }, + { + "epoch": 0.45640209820276895, + "grad_norm": 0.9375, + "learning_rate": 6.459481469223397e-05, + "loss": 1.1357, + "step": 2895 + }, + { + "epoch": 0.45655975005016197, + "grad_norm": 0.9609375, + "learning_rate": 6.45905091927048e-05, + "loss": 1.0765, + "step": 2896 + }, + { + "epoch": 0.45671740189755494, + "grad_norm": 0.94921875, + "learning_rate": 6.458620376822395e-05, + "loss": 1.1401, + "step": 2897 + }, + { + "epoch": 0.45687505374494797, + "grad_norm": 1.0078125, + "learning_rate": 6.45818984188007e-05, + "loss": 1.1662, + "step": 2898 + }, + { + "epoch": 0.457032705592341, + "grad_norm": 0.99609375, + "learning_rate": 6.457759314444404e-05, + "loss": 1.1503, + "step": 2899 + }, + { + "epoch": 0.457190357439734, + "grad_norm": 0.9375, + "learning_rate": 6.45732879451632e-05, + "loss": 0.9518, + "step": 2900 + }, + { + "epoch": 0.457348009287127, + "grad_norm": 1.0, + "learning_rate": 6.456898282096721e-05, + "loss": 1.1939, + "step": 2901 + }, + { + "epoch": 0.45750566113452, + "grad_norm": 0.98828125, + "learning_rate": 6.456467777186522e-05, + "loss": 0.9387, + "step": 2902 + }, + { + "epoch": 0.45766331298191304, + "grad_norm": 1.046875, + "learning_rate": 6.45603727978664e-05, + "loss": 1.1041, + "step": 2903 + }, + { + "epoch": 0.45782096482930607, + "grad_norm": 0.9140625, + "learning_rate": 6.455606789897984e-05, + "loss": 1.0833, + "step": 2904 + }, + { + "epoch": 0.45797861667669904, + "grad_norm": 0.91796875, + "learning_rate": 6.455176307521468e-05, + "loss": 1.149, + "step": 2905 + }, + { + "epoch": 0.45813626852409206, + "grad_norm": 0.88671875, + "learning_rate": 6.454745832658002e-05, + "loss": 0.9623, + "step": 2906 + }, + { + "epoch": 0.4582939203714851, + "grad_norm": 0.8671875, + "learning_rate": 6.454315365308499e-05, + "loss": 0.9078, + "step": 2907 + }, + { + "epoch": 0.4584515722188781, + "grad_norm": 0.9140625, + "learning_rate": 6.453884905473873e-05, + "loss": 1.0158, + "step": 2908 + }, + { + "epoch": 0.4586092240662711, + "grad_norm": 0.8828125, + "learning_rate": 6.453454453155033e-05, + "loss": 0.9926, + "step": 2909 + }, + { + "epoch": 0.4587668759136641, + "grad_norm": 0.99609375, + "learning_rate": 6.453024008352897e-05, + "loss": 0.9995, + "step": 2910 + }, + { + "epoch": 0.45892452776105713, + "grad_norm": 0.953125, + "learning_rate": 6.452593571068372e-05, + "loss": 1.1068, + "step": 2911 + }, + { + "epoch": 0.45908217960845016, + "grad_norm": 0.83203125, + "learning_rate": 6.452163141302371e-05, + "loss": 1.0467, + "step": 2912 + }, + { + "epoch": 0.45923983145584313, + "grad_norm": 0.87890625, + "learning_rate": 6.45173271905581e-05, + "loss": 1.0606, + "step": 2913 + }, + { + "epoch": 0.45939748330323615, + "grad_norm": 0.9609375, + "learning_rate": 6.451302304329597e-05, + "loss": 1.0208, + "step": 2914 + }, + { + "epoch": 0.4595551351506292, + "grad_norm": 0.98828125, + "learning_rate": 6.450871897124647e-05, + "loss": 1.0334, + "step": 2915 + }, + { + "epoch": 0.4597127869980222, + "grad_norm": 0.9921875, + "learning_rate": 6.450441497441872e-05, + "loss": 1.0089, + "step": 2916 + }, + { + "epoch": 0.4598704388454152, + "grad_norm": 1.015625, + "learning_rate": 6.450011105282179e-05, + "loss": 1.2276, + "step": 2917 + }, + { + "epoch": 0.4600280906928082, + "grad_norm": 0.8359375, + "learning_rate": 6.449580720646488e-05, + "loss": 0.868, + "step": 2918 + }, + { + "epoch": 0.4601857425402012, + "grad_norm": 0.94140625, + "learning_rate": 6.449150343535709e-05, + "loss": 1.007, + "step": 2919 + }, + { + "epoch": 0.46034339438759425, + "grad_norm": 0.93359375, + "learning_rate": 6.44871997395075e-05, + "loss": 0.975, + "step": 2920 + }, + { + "epoch": 0.4605010462349872, + "grad_norm": 0.890625, + "learning_rate": 6.448289611892529e-05, + "loss": 0.9442, + "step": 2921 + }, + { + "epoch": 0.46065869808238025, + "grad_norm": 1.0, + "learning_rate": 6.447859257361949e-05, + "loss": 1.046, + "step": 2922 + }, + { + "epoch": 0.4608163499297733, + "grad_norm": 0.9765625, + "learning_rate": 6.447428910359933e-05, + "loss": 0.9668, + "step": 2923 + }, + { + "epoch": 0.4609740017771663, + "grad_norm": 0.9921875, + "learning_rate": 6.446998570887388e-05, + "loss": 0.9756, + "step": 2924 + }, + { + "epoch": 0.46113165362455927, + "grad_norm": 0.98828125, + "learning_rate": 6.446568238945228e-05, + "loss": 1.0839, + "step": 2925 + }, + { + "epoch": 0.4612893054719523, + "grad_norm": 0.91796875, + "learning_rate": 6.446137914534363e-05, + "loss": 1.183, + "step": 2926 + }, + { + "epoch": 0.4614469573193453, + "grad_norm": 0.96875, + "learning_rate": 6.445707597655703e-05, + "loss": 1.183, + "step": 2927 + }, + { + "epoch": 0.46160460916673834, + "grad_norm": 0.9921875, + "learning_rate": 6.445277288310162e-05, + "loss": 0.9142, + "step": 2928 + }, + { + "epoch": 0.4617622610141313, + "grad_norm": 4.625, + "learning_rate": 6.444846986498656e-05, + "loss": 0.9203, + "step": 2929 + }, + { + "epoch": 0.46191991286152434, + "grad_norm": 0.89453125, + "learning_rate": 6.444416692222093e-05, + "loss": 1.0077, + "step": 2930 + }, + { + "epoch": 0.46207756470891737, + "grad_norm": 0.98828125, + "learning_rate": 6.443986405481385e-05, + "loss": 1.3544, + "step": 2931 + }, + { + "epoch": 0.4622352165563104, + "grad_norm": 0.921875, + "learning_rate": 6.443556126277445e-05, + "loss": 0.9479, + "step": 2932 + }, + { + "epoch": 0.46239286840370336, + "grad_norm": 1.0, + "learning_rate": 6.443125854611183e-05, + "loss": 1.1212, + "step": 2933 + }, + { + "epoch": 0.4625505202510964, + "grad_norm": 0.90625, + "learning_rate": 6.442695590483514e-05, + "loss": 1.1315, + "step": 2934 + }, + { + "epoch": 0.4627081720984894, + "grad_norm": 1.2109375, + "learning_rate": 6.442265333895348e-05, + "loss": 0.83, + "step": 2935 + }, + { + "epoch": 0.46286582394588244, + "grad_norm": 0.9609375, + "learning_rate": 6.441835084847598e-05, + "loss": 0.9132, + "step": 2936 + }, + { + "epoch": 0.4630234757932754, + "grad_norm": 0.84375, + "learning_rate": 6.441404843341174e-05, + "loss": 0.8552, + "step": 2937 + }, + { + "epoch": 0.46318112764066843, + "grad_norm": 0.859375, + "learning_rate": 6.440974609376987e-05, + "loss": 1.0363, + "step": 2938 + }, + { + "epoch": 0.46333877948806146, + "grad_norm": 0.92578125, + "learning_rate": 6.440544382955953e-05, + "loss": 1.1962, + "step": 2939 + }, + { + "epoch": 0.4634964313354545, + "grad_norm": 1.0703125, + "learning_rate": 6.440114164078983e-05, + "loss": 1.0169, + "step": 2940 + }, + { + "epoch": 0.46365408318284745, + "grad_norm": 0.97265625, + "learning_rate": 6.439683952746987e-05, + "loss": 1.16, + "step": 2941 + }, + { + "epoch": 0.4638117350302405, + "grad_norm": 0.9140625, + "learning_rate": 6.439253748960877e-05, + "loss": 1.0198, + "step": 2942 + }, + { + "epoch": 0.4639693868776335, + "grad_norm": 0.93359375, + "learning_rate": 6.438823552721562e-05, + "loss": 1.0605, + "step": 2943 + }, + { + "epoch": 0.46412703872502653, + "grad_norm": 0.91796875, + "learning_rate": 6.43839336402996e-05, + "loss": 1.2189, + "step": 2944 + }, + { + "epoch": 0.4642846905724195, + "grad_norm": 0.9921875, + "learning_rate": 6.43796318288698e-05, + "loss": 0.9212, + "step": 2945 + }, + { + "epoch": 0.4644423424198125, + "grad_norm": 0.83203125, + "learning_rate": 6.437533009293535e-05, + "loss": 0.8551, + "step": 2946 + }, + { + "epoch": 0.46459999426720555, + "grad_norm": 0.984375, + "learning_rate": 6.437102843250533e-05, + "loss": 1.1616, + "step": 2947 + }, + { + "epoch": 0.4647576461145986, + "grad_norm": 0.87890625, + "learning_rate": 6.436672684758884e-05, + "loss": 0.9441, + "step": 2948 + }, + { + "epoch": 0.46491529796199155, + "grad_norm": 0.9609375, + "learning_rate": 6.436242533819509e-05, + "loss": 1.2233, + "step": 2949 + }, + { + "epoch": 0.4650729498093846, + "grad_norm": 0.87890625, + "learning_rate": 6.435812390433313e-05, + "loss": 1.0567, + "step": 2950 + }, + { + "epoch": 0.4652306016567776, + "grad_norm": 0.8984375, + "learning_rate": 6.43538225460121e-05, + "loss": 1.1503, + "step": 2951 + }, + { + "epoch": 0.4653882535041706, + "grad_norm": 0.9140625, + "learning_rate": 6.43495212632411e-05, + "loss": 1.123, + "step": 2952 + }, + { + "epoch": 0.4655459053515636, + "grad_norm": 0.84375, + "learning_rate": 6.434522005602923e-05, + "loss": 0.9633, + "step": 2953 + }, + { + "epoch": 0.4657035571989566, + "grad_norm": 0.8828125, + "learning_rate": 6.434091892438565e-05, + "loss": 0.9783, + "step": 2954 + }, + { + "epoch": 0.46586120904634964, + "grad_norm": 0.90234375, + "learning_rate": 6.433661786831946e-05, + "loss": 1.0, + "step": 2955 + }, + { + "epoch": 0.46601886089374267, + "grad_norm": 0.875, + "learning_rate": 6.433231688783976e-05, + "loss": 0.9738, + "step": 2956 + }, + { + "epoch": 0.46617651274113564, + "grad_norm": 0.93359375, + "learning_rate": 6.432801598295565e-05, + "loss": 1.2092, + "step": 2957 + }, + { + "epoch": 0.46633416458852867, + "grad_norm": 0.9609375, + "learning_rate": 6.432371515367628e-05, + "loss": 0.9751, + "step": 2958 + }, + { + "epoch": 0.4664918164359217, + "grad_norm": 0.84375, + "learning_rate": 6.431941440001079e-05, + "loss": 0.9563, + "step": 2959 + }, + { + "epoch": 0.4666494682833147, + "grad_norm": 1.03125, + "learning_rate": 6.431511372196825e-05, + "loss": 1.0504, + "step": 2960 + }, + { + "epoch": 0.4668071201307077, + "grad_norm": 0.90234375, + "learning_rate": 6.43108131195578e-05, + "loss": 1.017, + "step": 2961 + }, + { + "epoch": 0.4669647719781007, + "grad_norm": 0.9609375, + "learning_rate": 6.430651259278852e-05, + "loss": 1.0393, + "step": 2962 + }, + { + "epoch": 0.46712242382549374, + "grad_norm": 0.9296875, + "learning_rate": 6.430221214166951e-05, + "loss": 1.0031, + "step": 2963 + }, + { + "epoch": 0.46728007567288676, + "grad_norm": 0.9609375, + "learning_rate": 6.429791176620996e-05, + "loss": 1.0212, + "step": 2964 + }, + { + "epoch": 0.46743772752027973, + "grad_norm": 0.9609375, + "learning_rate": 6.429361146641895e-05, + "loss": 1.0614, + "step": 2965 + }, + { + "epoch": 0.46759537936767276, + "grad_norm": 0.90625, + "learning_rate": 6.428931124230559e-05, + "loss": 1.0419, + "step": 2966 + }, + { + "epoch": 0.4677530312150658, + "grad_norm": 0.90625, + "learning_rate": 6.4285011093879e-05, + "loss": 1.0604, + "step": 2967 + }, + { + "epoch": 0.4679106830624588, + "grad_norm": 0.94921875, + "learning_rate": 6.428071102114824e-05, + "loss": 1.2436, + "step": 2968 + }, + { + "epoch": 0.4680683349098518, + "grad_norm": 0.8984375, + "learning_rate": 6.427641102412251e-05, + "loss": 1.1099, + "step": 2969 + }, + { + "epoch": 0.4682259867572448, + "grad_norm": 0.96875, + "learning_rate": 6.42721111028109e-05, + "loss": 1.0098, + "step": 2970 + }, + { + "epoch": 0.46838363860463783, + "grad_norm": 0.90625, + "learning_rate": 6.42678112572225e-05, + "loss": 0.965, + "step": 2971 + }, + { + "epoch": 0.46854129045203086, + "grad_norm": 1.078125, + "learning_rate": 6.426351148736642e-05, + "loss": 1.099, + "step": 2972 + }, + { + "epoch": 0.4686989422994239, + "grad_norm": 1.078125, + "learning_rate": 6.425921179325177e-05, + "loss": 1.2015, + "step": 2973 + }, + { + "epoch": 0.46885659414681685, + "grad_norm": 1.046875, + "learning_rate": 6.42549121748877e-05, + "loss": 1.0897, + "step": 2974 + }, + { + "epoch": 0.4690142459942099, + "grad_norm": 0.97265625, + "learning_rate": 6.425061263228331e-05, + "loss": 1.0414, + "step": 2975 + }, + { + "epoch": 0.4691718978416029, + "grad_norm": 1.03125, + "learning_rate": 6.424631316544767e-05, + "loss": 1.0062, + "step": 2976 + }, + { + "epoch": 0.46932954968899593, + "grad_norm": 0.98046875, + "learning_rate": 6.424201377438995e-05, + "loss": 0.9038, + "step": 2977 + }, + { + "epoch": 0.4694872015363889, + "grad_norm": 0.890625, + "learning_rate": 6.423771445911921e-05, + "loss": 1.0877, + "step": 2978 + }, + { + "epoch": 0.4696448533837819, + "grad_norm": 0.96875, + "learning_rate": 6.423341521964463e-05, + "loss": 0.9739, + "step": 2979 + }, + { + "epoch": 0.46980250523117495, + "grad_norm": 1.015625, + "learning_rate": 6.422911605597527e-05, + "loss": 1.1937, + "step": 2980 + }, + { + "epoch": 0.469960157078568, + "grad_norm": 0.875, + "learning_rate": 6.422481696812026e-05, + "loss": 0.9589, + "step": 2981 + }, + { + "epoch": 0.47011780892596094, + "grad_norm": 0.91796875, + "learning_rate": 6.42205179560887e-05, + "loss": 1.0437, + "step": 2982 + }, + { + "epoch": 0.47027546077335397, + "grad_norm": 0.94140625, + "learning_rate": 6.421621901988965e-05, + "loss": 1.0172, + "step": 2983 + }, + { + "epoch": 0.470433112620747, + "grad_norm": 0.86328125, + "learning_rate": 6.421192015953235e-05, + "loss": 0.9801, + "step": 2984 + }, + { + "epoch": 0.47059076446814, + "grad_norm": 0.90625, + "learning_rate": 6.420762137502583e-05, + "loss": 0.9537, + "step": 2985 + }, + { + "epoch": 0.470748416315533, + "grad_norm": 0.8359375, + "learning_rate": 6.420332266637919e-05, + "loss": 0.8769, + "step": 2986 + }, + { + "epoch": 0.470906068162926, + "grad_norm": 0.9453125, + "learning_rate": 6.419902403360158e-05, + "loss": 1.0042, + "step": 2987 + }, + { + "epoch": 0.47106372001031904, + "grad_norm": 1.125, + "learning_rate": 6.419472547670206e-05, + "loss": 1.0405, + "step": 2988 + }, + { + "epoch": 0.47122137185771207, + "grad_norm": 0.9375, + "learning_rate": 6.41904269956898e-05, + "loss": 0.9437, + "step": 2989 + }, + { + "epoch": 0.47137902370510504, + "grad_norm": 0.91796875, + "learning_rate": 6.41861285905739e-05, + "loss": 1.0336, + "step": 2990 + }, + { + "epoch": 0.47153667555249806, + "grad_norm": 1.0, + "learning_rate": 6.418183026136342e-05, + "loss": 1.06, + "step": 2991 + }, + { + "epoch": 0.4716943273998911, + "grad_norm": 0.9296875, + "learning_rate": 6.417753200806753e-05, + "loss": 1.0348, + "step": 2992 + }, + { + "epoch": 0.4718519792472841, + "grad_norm": 0.84765625, + "learning_rate": 6.41732338306953e-05, + "loss": 0.8807, + "step": 2993 + }, + { + "epoch": 0.4720096310946771, + "grad_norm": 0.99609375, + "learning_rate": 6.416893572925583e-05, + "loss": 1.0623, + "step": 2994 + }, + { + "epoch": 0.4721672829420701, + "grad_norm": 0.8828125, + "learning_rate": 6.416463770375827e-05, + "loss": 1.1014, + "step": 2995 + }, + { + "epoch": 0.47232493478946314, + "grad_norm": 0.86328125, + "learning_rate": 6.416033975421173e-05, + "loss": 1.0485, + "step": 2996 + }, + { + "epoch": 0.47248258663685616, + "grad_norm": 0.97265625, + "learning_rate": 6.41560418806253e-05, + "loss": 1.0475, + "step": 2997 + }, + { + "epoch": 0.47264023848424913, + "grad_norm": 0.9765625, + "learning_rate": 6.415174408300806e-05, + "loss": 1.0459, + "step": 2998 + }, + { + "epoch": 0.47279789033164216, + "grad_norm": 0.9375, + "learning_rate": 6.414744636136918e-05, + "loss": 1.3581, + "step": 2999 + }, + { + "epoch": 0.4729555421790352, + "grad_norm": 0.99609375, + "learning_rate": 6.414314871571773e-05, + "loss": 1.0912, + "step": 3000 + }, + { + "epoch": 0.4729555421790352, + "eval_loss": 1.0201112031936646, + "eval_runtime": 310.4524, + "eval_samples_per_second": 32.211, + "eval_steps_per_second": 0.673, + "step": 3000 + }, + { + "epoch": 0.4731131940264282, + "grad_norm": 0.86328125, + "learning_rate": 6.413885114606284e-05, + "loss": 0.8915, + "step": 3001 + }, + { + "epoch": 0.4732708458738212, + "grad_norm": 1.03125, + "learning_rate": 6.413455365241358e-05, + "loss": 1.2258, + "step": 3002 + }, + { + "epoch": 0.4734284977212142, + "grad_norm": 0.92578125, + "learning_rate": 6.413025623477907e-05, + "loss": 0.8646, + "step": 3003 + }, + { + "epoch": 0.47358614956860723, + "grad_norm": 2.15625, + "learning_rate": 6.412595889316845e-05, + "loss": 0.9748, + "step": 3004 + }, + { + "epoch": 0.47374380141600025, + "grad_norm": 0.9765625, + "learning_rate": 6.412166162759084e-05, + "loss": 1.1281, + "step": 3005 + }, + { + "epoch": 0.4739014532633932, + "grad_norm": 0.9375, + "learning_rate": 6.411736443805529e-05, + "loss": 1.0596, + "step": 3006 + }, + { + "epoch": 0.47405910511078625, + "grad_norm": 0.8671875, + "learning_rate": 6.411306732457094e-05, + "loss": 0.8141, + "step": 3007 + }, + { + "epoch": 0.4742167569581793, + "grad_norm": 1.0234375, + "learning_rate": 6.410877028714686e-05, + "loss": 1.0134, + "step": 3008 + }, + { + "epoch": 0.4743744088055723, + "grad_norm": 1.0078125, + "learning_rate": 6.410447332579222e-05, + "loss": 0.951, + "step": 3009 + }, + { + "epoch": 0.47453206065296527, + "grad_norm": 0.890625, + "learning_rate": 6.410017644051611e-05, + "loss": 1.0156, + "step": 3010 + }, + { + "epoch": 0.4746897125003583, + "grad_norm": 0.9140625, + "learning_rate": 6.409587963132762e-05, + "loss": 0.9838, + "step": 3011 + }, + { + "epoch": 0.4748473643477513, + "grad_norm": 1.09375, + "learning_rate": 6.409158289823584e-05, + "loss": 1.0861, + "step": 3012 + }, + { + "epoch": 0.47500501619514435, + "grad_norm": 0.88671875, + "learning_rate": 6.408728624124988e-05, + "loss": 0.7852, + "step": 3013 + }, + { + "epoch": 0.4751626680425373, + "grad_norm": 0.95703125, + "learning_rate": 6.408298966037892e-05, + "loss": 1.1155, + "step": 3014 + }, + { + "epoch": 0.47532031988993034, + "grad_norm": 0.9453125, + "learning_rate": 6.407869315563198e-05, + "loss": 0.9624, + "step": 3015 + }, + { + "epoch": 0.47547797173732337, + "grad_norm": 0.96484375, + "learning_rate": 6.40743967270182e-05, + "loss": 0.947, + "step": 3016 + }, + { + "epoch": 0.4756356235847164, + "grad_norm": 0.98046875, + "learning_rate": 6.407010037454669e-05, + "loss": 0.9774, + "step": 3017 + }, + { + "epoch": 0.47579327543210936, + "grad_norm": 0.91015625, + "learning_rate": 6.406580409822654e-05, + "loss": 0.943, + "step": 3018 + }, + { + "epoch": 0.4759509272795024, + "grad_norm": 0.875, + "learning_rate": 6.406150789806686e-05, + "loss": 1.1236, + "step": 3019 + }, + { + "epoch": 0.4761085791268954, + "grad_norm": 0.9921875, + "learning_rate": 6.405721177407678e-05, + "loss": 1.0658, + "step": 3020 + }, + { + "epoch": 0.47626623097428844, + "grad_norm": 1.0234375, + "learning_rate": 6.405291572626537e-05, + "loss": 1.0106, + "step": 3021 + }, + { + "epoch": 0.4764238828216814, + "grad_norm": 0.85546875, + "learning_rate": 6.404861975464177e-05, + "loss": 1.025, + "step": 3022 + }, + { + "epoch": 0.47658153466907444, + "grad_norm": 0.8359375, + "learning_rate": 6.404432385921501e-05, + "loss": 0.9433, + "step": 3023 + }, + { + "epoch": 0.47673918651646746, + "grad_norm": 0.86328125, + "learning_rate": 6.40400280399943e-05, + "loss": 1.0483, + "step": 3024 + }, + { + "epoch": 0.4768968383638605, + "grad_norm": 0.85546875, + "learning_rate": 6.403573229698868e-05, + "loss": 0.8012, + "step": 3025 + }, + { + "epoch": 0.47705449021125346, + "grad_norm": 1.0, + "learning_rate": 6.403143663020727e-05, + "loss": 0.891, + "step": 3026 + }, + { + "epoch": 0.4772121420586465, + "grad_norm": 0.859375, + "learning_rate": 6.40271410396592e-05, + "loss": 0.8501, + "step": 3027 + }, + { + "epoch": 0.4773697939060395, + "grad_norm": 1.203125, + "learning_rate": 6.402284552535349e-05, + "loss": 1.321, + "step": 3028 + }, + { + "epoch": 0.47752744575343253, + "grad_norm": 0.8515625, + "learning_rate": 6.401855008729934e-05, + "loss": 0.863, + "step": 3029 + }, + { + "epoch": 0.4776850976008255, + "grad_norm": 0.984375, + "learning_rate": 6.401425472550581e-05, + "loss": 0.9992, + "step": 3030 + }, + { + "epoch": 0.47784274944821853, + "grad_norm": 0.91796875, + "learning_rate": 6.400995943998204e-05, + "loss": 1.0827, + "step": 3031 + }, + { + "epoch": 0.47800040129561155, + "grad_norm": 0.91796875, + "learning_rate": 6.400566423073709e-05, + "loss": 0.9695, + "step": 3032 + }, + { + "epoch": 0.4781580531430046, + "grad_norm": 0.92578125, + "learning_rate": 6.400136909778002e-05, + "loss": 1.0187, + "step": 3033 + }, + { + "epoch": 0.47831570499039755, + "grad_norm": 0.8984375, + "learning_rate": 6.399707404112005e-05, + "loss": 1.1181, + "step": 3034 + }, + { + "epoch": 0.4784733568377906, + "grad_norm": 0.8125, + "learning_rate": 6.399277906076622e-05, + "loss": 0.9862, + "step": 3035 + }, + { + "epoch": 0.4786310086851836, + "grad_norm": 0.90625, + "learning_rate": 6.398848415672762e-05, + "loss": 0.938, + "step": 3036 + }, + { + "epoch": 0.4787886605325766, + "grad_norm": 0.9921875, + "learning_rate": 6.39841893290134e-05, + "loss": 1.0769, + "step": 3037 + }, + { + "epoch": 0.4789463123799696, + "grad_norm": 1.0, + "learning_rate": 6.397989457763258e-05, + "loss": 0.8636, + "step": 3038 + }, + { + "epoch": 0.4791039642273626, + "grad_norm": 1.515625, + "learning_rate": 6.397559990259437e-05, + "loss": 1.0157, + "step": 3039 + }, + { + "epoch": 0.47926161607475565, + "grad_norm": 0.83984375, + "learning_rate": 6.397130530390778e-05, + "loss": 1.0076, + "step": 3040 + }, + { + "epoch": 0.4794192679221487, + "grad_norm": 0.89453125, + "learning_rate": 6.396701078158197e-05, + "loss": 0.9069, + "step": 3041 + }, + { + "epoch": 0.47957691976954164, + "grad_norm": 1.0546875, + "learning_rate": 6.396271633562601e-05, + "loss": 1.2792, + "step": 3042 + }, + { + "epoch": 0.47973457161693467, + "grad_norm": 0.92578125, + "learning_rate": 6.395842196604899e-05, + "loss": 1.0496, + "step": 3043 + }, + { + "epoch": 0.4798922234643277, + "grad_norm": 0.8828125, + "learning_rate": 6.395412767286007e-05, + "loss": 0.8954, + "step": 3044 + }, + { + "epoch": 0.4800498753117207, + "grad_norm": 0.96875, + "learning_rate": 6.394983345606831e-05, + "loss": 1.0327, + "step": 3045 + }, + { + "epoch": 0.4802075271591137, + "grad_norm": 0.890625, + "learning_rate": 6.394553931568281e-05, + "loss": 0.9867, + "step": 3046 + }, + { + "epoch": 0.4803651790065067, + "grad_norm": 1.0078125, + "learning_rate": 6.394124525171269e-05, + "loss": 0.9998, + "step": 3047 + }, + { + "epoch": 0.48052283085389974, + "grad_norm": 1.015625, + "learning_rate": 6.393695126416698e-05, + "loss": 1.0541, + "step": 3048 + }, + { + "epoch": 0.48068048270129277, + "grad_norm": 0.99609375, + "learning_rate": 6.393265735305492e-05, + "loss": 1.1514, + "step": 3049 + }, + { + "epoch": 0.48083813454868574, + "grad_norm": 0.98046875, + "learning_rate": 6.39283635183855e-05, + "loss": 1.24, + "step": 3050 + }, + { + "epoch": 0.48099578639607876, + "grad_norm": 0.88671875, + "learning_rate": 6.392406976016786e-05, + "loss": 1.048, + "step": 3051 + }, + { + "epoch": 0.4811534382434718, + "grad_norm": 0.953125, + "learning_rate": 6.391977607841109e-05, + "loss": 0.9697, + "step": 3052 + }, + { + "epoch": 0.4813110900908648, + "grad_norm": 0.84765625, + "learning_rate": 6.391548247312425e-05, + "loss": 0.8896, + "step": 3053 + }, + { + "epoch": 0.4814687419382578, + "grad_norm": 0.90625, + "learning_rate": 6.391118894431654e-05, + "loss": 0.8735, + "step": 3054 + }, + { + "epoch": 0.4816263937856508, + "grad_norm": 0.94921875, + "learning_rate": 6.390689549199698e-05, + "loss": 1.181, + "step": 3055 + }, + { + "epoch": 0.48178404563304383, + "grad_norm": 0.94140625, + "learning_rate": 6.390260211617471e-05, + "loss": 0.9737, + "step": 3056 + }, + { + "epoch": 0.48194169748043686, + "grad_norm": 0.85546875, + "learning_rate": 6.38983088168588e-05, + "loss": 0.9113, + "step": 3057 + }, + { + "epoch": 0.48209934932782983, + "grad_norm": 1.0234375, + "learning_rate": 6.389401559405835e-05, + "loss": 1.1394, + "step": 3058 + }, + { + "epoch": 0.48225700117522285, + "grad_norm": 1.0078125, + "learning_rate": 6.388972244778248e-05, + "loss": 1.1372, + "step": 3059 + }, + { + "epoch": 0.4824146530226159, + "grad_norm": 0.89453125, + "learning_rate": 6.388542937804027e-05, + "loss": 0.8104, + "step": 3060 + }, + { + "epoch": 0.4825723048700089, + "grad_norm": 0.921875, + "learning_rate": 6.388113638484083e-05, + "loss": 1.2575, + "step": 3061 + }, + { + "epoch": 0.4827299567174019, + "grad_norm": 1.0078125, + "learning_rate": 6.387684346819328e-05, + "loss": 1.1679, + "step": 3062 + }, + { + "epoch": 0.4828876085647949, + "grad_norm": 1.0234375, + "learning_rate": 6.387255062810666e-05, + "loss": 1.2936, + "step": 3063 + }, + { + "epoch": 0.4830452604121879, + "grad_norm": 0.98828125, + "learning_rate": 6.386825786459012e-05, + "loss": 0.9851, + "step": 3064 + }, + { + "epoch": 0.48320291225958095, + "grad_norm": 0.91015625, + "learning_rate": 6.386396517765275e-05, + "loss": 0.9444, + "step": 3065 + }, + { + "epoch": 0.4833605641069739, + "grad_norm": 0.91796875, + "learning_rate": 6.385967256730363e-05, + "loss": 0.9007, + "step": 3066 + }, + { + "epoch": 0.48351821595436695, + "grad_norm": 0.81640625, + "learning_rate": 6.385538003355189e-05, + "loss": 0.8841, + "step": 3067 + }, + { + "epoch": 0.48367586780176, + "grad_norm": 0.8828125, + "learning_rate": 6.38510875764066e-05, + "loss": 0.9777, + "step": 3068 + }, + { + "epoch": 0.483833519649153, + "grad_norm": 0.890625, + "learning_rate": 6.384679519587682e-05, + "loss": 0.8578, + "step": 3069 + }, + { + "epoch": 0.48399117149654597, + "grad_norm": 0.96484375, + "learning_rate": 6.384250289197172e-05, + "loss": 1.1878, + "step": 3070 + }, + { + "epoch": 0.484148823343939, + "grad_norm": 1.046875, + "learning_rate": 6.383821066470039e-05, + "loss": 1.186, + "step": 3071 + }, + { + "epoch": 0.484306475191332, + "grad_norm": 0.96484375, + "learning_rate": 6.383391851407189e-05, + "loss": 0.8684, + "step": 3072 + }, + { + "epoch": 0.48446412703872505, + "grad_norm": 1.0, + "learning_rate": 6.382962644009534e-05, + "loss": 1.0652, + "step": 3073 + }, + { + "epoch": 0.484621778886118, + "grad_norm": 0.99609375, + "learning_rate": 6.382533444277978e-05, + "loss": 1.184, + "step": 3074 + }, + { + "epoch": 0.48477943073351104, + "grad_norm": 0.8828125, + "learning_rate": 6.38210425221344e-05, + "loss": 0.8562, + "step": 3075 + }, + { + "epoch": 0.48493708258090407, + "grad_norm": 1.015625, + "learning_rate": 6.381675067816825e-05, + "loss": 1.1279, + "step": 3076 + }, + { + "epoch": 0.4850947344282971, + "grad_norm": 0.8359375, + "learning_rate": 6.381245891089045e-05, + "loss": 0.9765, + "step": 3077 + }, + { + "epoch": 0.48525238627569006, + "grad_norm": 0.80078125, + "learning_rate": 6.380816722031005e-05, + "loss": 0.8693, + "step": 3078 + }, + { + "epoch": 0.4854100381230831, + "grad_norm": 1.0390625, + "learning_rate": 6.380387560643616e-05, + "loss": 0.9756, + "step": 3079 + }, + { + "epoch": 0.4855676899704761, + "grad_norm": 0.90625, + "learning_rate": 6.379958406927789e-05, + "loss": 1.0338, + "step": 3080 + }, + { + "epoch": 0.48572534181786914, + "grad_norm": 0.828125, + "learning_rate": 6.379529260884434e-05, + "loss": 0.8861, + "step": 3081 + }, + { + "epoch": 0.4858829936652621, + "grad_norm": 0.8828125, + "learning_rate": 6.37910012251446e-05, + "loss": 0.9717, + "step": 3082 + }, + { + "epoch": 0.48604064551265513, + "grad_norm": 0.87109375, + "learning_rate": 6.378670991818778e-05, + "loss": 0.8443, + "step": 3083 + }, + { + "epoch": 0.48619829736004816, + "grad_norm": 0.88671875, + "learning_rate": 6.378241868798293e-05, + "loss": 0.9172, + "step": 3084 + }, + { + "epoch": 0.4863559492074412, + "grad_norm": 0.9140625, + "learning_rate": 6.377812753453919e-05, + "loss": 0.9223, + "step": 3085 + }, + { + "epoch": 0.48651360105483415, + "grad_norm": 0.8828125, + "learning_rate": 6.377383645786563e-05, + "loss": 0.9662, + "step": 3086 + }, + { + "epoch": 0.4866712529022272, + "grad_norm": 0.87890625, + "learning_rate": 6.376954545797138e-05, + "loss": 1.0614, + "step": 3087 + }, + { + "epoch": 0.4868289047496202, + "grad_norm": 1.0390625, + "learning_rate": 6.376525453486549e-05, + "loss": 1.2085, + "step": 3088 + }, + { + "epoch": 0.48698655659701323, + "grad_norm": 0.91796875, + "learning_rate": 6.376096368855701e-05, + "loss": 0.993, + "step": 3089 + }, + { + "epoch": 0.4871442084444062, + "grad_norm": 0.9609375, + "learning_rate": 6.375667291905519e-05, + "loss": 0.9354, + "step": 3090 + }, + { + "epoch": 0.4873018602917992, + "grad_norm": 1.046875, + "learning_rate": 6.375238222636899e-05, + "loss": 1.0115, + "step": 3091 + }, + { + "epoch": 0.48745951213919225, + "grad_norm": 0.8828125, + "learning_rate": 6.374809161050754e-05, + "loss": 0.799, + "step": 3092 + }, + { + "epoch": 0.4876171639865853, + "grad_norm": 0.99609375, + "learning_rate": 6.374380107147996e-05, + "loss": 0.9673, + "step": 3093 + }, + { + "epoch": 0.48777481583397825, + "grad_norm": 0.92578125, + "learning_rate": 6.373951060929526e-05, + "loss": 1.1461, + "step": 3094 + }, + { + "epoch": 0.4879324676813713, + "grad_norm": 1.078125, + "learning_rate": 6.373522022396265e-05, + "loss": 1.2105, + "step": 3095 + }, + { + "epoch": 0.4880901195287643, + "grad_norm": 1.0234375, + "learning_rate": 6.373092991549117e-05, + "loss": 1.0989, + "step": 3096 + }, + { + "epoch": 0.4882477713761573, + "grad_norm": 0.9375, + "learning_rate": 6.37266396838899e-05, + "loss": 0.9336, + "step": 3097 + }, + { + "epoch": 0.4884054232235503, + "grad_norm": 0.94140625, + "learning_rate": 6.372234952916796e-05, + "loss": 1.1615, + "step": 3098 + }, + { + "epoch": 0.4885630750709433, + "grad_norm": 0.890625, + "learning_rate": 6.371805945133437e-05, + "loss": 0.9691, + "step": 3099 + }, + { + "epoch": 0.48872072691833635, + "grad_norm": 1.0234375, + "learning_rate": 6.371376945039833e-05, + "loss": 1.2314, + "step": 3100 + }, + { + "epoch": 0.48887837876572937, + "grad_norm": 0.91796875, + "learning_rate": 6.370947952636887e-05, + "loss": 1.0524, + "step": 3101 + }, + { + "epoch": 0.48903603061312234, + "grad_norm": 0.9375, + "learning_rate": 6.37051896792551e-05, + "loss": 0.9398, + "step": 3102 + }, + { + "epoch": 0.48919368246051537, + "grad_norm": 1.0078125, + "learning_rate": 6.370089990906612e-05, + "loss": 1.0843, + "step": 3103 + }, + { + "epoch": 0.4893513343079084, + "grad_norm": 0.8828125, + "learning_rate": 6.369661021581097e-05, + "loss": 0.9807, + "step": 3104 + }, + { + "epoch": 0.4895089861553014, + "grad_norm": 0.8515625, + "learning_rate": 6.369232059949881e-05, + "loss": 0.9204, + "step": 3105 + }, + { + "epoch": 0.4896666380026944, + "grad_norm": 0.890625, + "learning_rate": 6.368803106013869e-05, + "loss": 0.9076, + "step": 3106 + }, + { + "epoch": 0.4898242898500874, + "grad_norm": 0.81640625, + "learning_rate": 6.368374159773971e-05, + "loss": 1.0334, + "step": 3107 + }, + { + "epoch": 0.48998194169748044, + "grad_norm": 0.8515625, + "learning_rate": 6.367945221231097e-05, + "loss": 0.9416, + "step": 3108 + }, + { + "epoch": 0.49013959354487346, + "grad_norm": 0.91015625, + "learning_rate": 6.367516290386153e-05, + "loss": 1.0069, + "step": 3109 + }, + { + "epoch": 0.49029724539226643, + "grad_norm": 0.97265625, + "learning_rate": 6.367087367240053e-05, + "loss": 1.1338, + "step": 3110 + }, + { + "epoch": 0.49045489723965946, + "grad_norm": 0.984375, + "learning_rate": 6.366658451793703e-05, + "loss": 1.1915, + "step": 3111 + }, + { + "epoch": 0.4906125490870525, + "grad_norm": 0.94140625, + "learning_rate": 6.366229544048015e-05, + "loss": 0.9578, + "step": 3112 + }, + { + "epoch": 0.4907702009344455, + "grad_norm": 0.88671875, + "learning_rate": 6.365800644003894e-05, + "loss": 1.0418, + "step": 3113 + }, + { + "epoch": 0.4909278527818385, + "grad_norm": 0.859375, + "learning_rate": 6.365371751662247e-05, + "loss": 1.0268, + "step": 3114 + }, + { + "epoch": 0.4910855046292315, + "grad_norm": 0.8359375, + "learning_rate": 6.364942867023992e-05, + "loss": 0.856, + "step": 3115 + }, + { + "epoch": 0.49124315647662453, + "grad_norm": 0.95703125, + "learning_rate": 6.364513990090032e-05, + "loss": 1.1017, + "step": 3116 + }, + { + "epoch": 0.49140080832401756, + "grad_norm": 0.88671875, + "learning_rate": 6.364085120861276e-05, + "loss": 1.0334, + "step": 3117 + }, + { + "epoch": 0.4915584601714105, + "grad_norm": 0.875, + "learning_rate": 6.363656259338635e-05, + "loss": 1.0768, + "step": 3118 + }, + { + "epoch": 0.49171611201880355, + "grad_norm": 0.9609375, + "learning_rate": 6.363227405523013e-05, + "loss": 0.941, + "step": 3119 + }, + { + "epoch": 0.4918737638661966, + "grad_norm": 0.9453125, + "learning_rate": 6.362798559415327e-05, + "loss": 0.9969, + "step": 3120 + }, + { + "epoch": 0.4920314157135896, + "grad_norm": 0.95703125, + "learning_rate": 6.362369721016479e-05, + "loss": 1.0028, + "step": 3121 + }, + { + "epoch": 0.4921890675609826, + "grad_norm": 1.015625, + "learning_rate": 6.361940890327382e-05, + "loss": 1.1523, + "step": 3122 + }, + { + "epoch": 0.4923467194083756, + "grad_norm": 0.9140625, + "learning_rate": 6.361512067348944e-05, + "loss": 1.0733, + "step": 3123 + }, + { + "epoch": 0.4925043712557686, + "grad_norm": 0.87109375, + "learning_rate": 6.36108325208207e-05, + "loss": 0.997, + "step": 3124 + }, + { + "epoch": 0.49266202310316165, + "grad_norm": 0.859375, + "learning_rate": 6.360654444527675e-05, + "loss": 1.0732, + "step": 3125 + }, + { + "epoch": 0.4928196749505546, + "grad_norm": 0.97265625, + "learning_rate": 6.360225644686664e-05, + "loss": 1.1142, + "step": 3126 + }, + { + "epoch": 0.49297732679794765, + "grad_norm": 0.9296875, + "learning_rate": 6.359796852559948e-05, + "loss": 1.1833, + "step": 3127 + }, + { + "epoch": 0.49313497864534067, + "grad_norm": 0.9140625, + "learning_rate": 6.359368068148431e-05, + "loss": 1.0652, + "step": 3128 + }, + { + "epoch": 0.4932926304927337, + "grad_norm": 0.94140625, + "learning_rate": 6.358939291453026e-05, + "loss": 1.2144, + "step": 3129 + }, + { + "epoch": 0.4934502823401267, + "grad_norm": 0.984375, + "learning_rate": 6.358510522474643e-05, + "loss": 0.9077, + "step": 3130 + }, + { + "epoch": 0.4936079341875197, + "grad_norm": 0.87890625, + "learning_rate": 6.358081761214189e-05, + "loss": 0.8758, + "step": 3131 + }, + { + "epoch": 0.4937655860349127, + "grad_norm": 0.89453125, + "learning_rate": 6.357653007672572e-05, + "loss": 0.8917, + "step": 3132 + }, + { + "epoch": 0.49392323788230574, + "grad_norm": 0.8671875, + "learning_rate": 6.357224261850701e-05, + "loss": 0.9171, + "step": 3133 + }, + { + "epoch": 0.49408088972969877, + "grad_norm": 0.90625, + "learning_rate": 6.356795523749483e-05, + "loss": 0.945, + "step": 3134 + }, + { + "epoch": 0.49423854157709174, + "grad_norm": 0.94140625, + "learning_rate": 6.35636679336983e-05, + "loss": 1.0642, + "step": 3135 + }, + { + "epoch": 0.49439619342448476, + "grad_norm": 1.0390625, + "learning_rate": 6.355938070712651e-05, + "loss": 1.1653, + "step": 3136 + }, + { + "epoch": 0.4945538452718778, + "grad_norm": 0.83203125, + "learning_rate": 6.355509355778852e-05, + "loss": 0.867, + "step": 3137 + }, + { + "epoch": 0.4947114971192708, + "grad_norm": 0.90234375, + "learning_rate": 6.355080648569345e-05, + "loss": 0.9392, + "step": 3138 + }, + { + "epoch": 0.4948691489666638, + "grad_norm": 0.84765625, + "learning_rate": 6.354651949085028e-05, + "loss": 1.0152, + "step": 3139 + }, + { + "epoch": 0.4950268008140568, + "grad_norm": 0.99609375, + "learning_rate": 6.354223257326826e-05, + "loss": 1.2065, + "step": 3140 + }, + { + "epoch": 0.49518445266144984, + "grad_norm": 0.94921875, + "learning_rate": 6.353794573295637e-05, + "loss": 1.0764, + "step": 3141 + }, + { + "epoch": 0.49534210450884286, + "grad_norm": 1.015625, + "learning_rate": 6.353365896992374e-05, + "loss": 1.126, + "step": 3142 + }, + { + "epoch": 0.49549975635623583, + "grad_norm": 0.9765625, + "learning_rate": 6.352937228417942e-05, + "loss": 0.957, + "step": 3143 + }, + { + "epoch": 0.49565740820362886, + "grad_norm": 0.984375, + "learning_rate": 6.352508567573247e-05, + "loss": 1.0135, + "step": 3144 + }, + { + "epoch": 0.4958150600510219, + "grad_norm": 0.921875, + "learning_rate": 6.352079914459208e-05, + "loss": 1.0165, + "step": 3145 + }, + { + "epoch": 0.4959727118984149, + "grad_norm": 1.0078125, + "learning_rate": 6.351651269076723e-05, + "loss": 1.1789, + "step": 3146 + }, + { + "epoch": 0.4961303637458079, + "grad_norm": 0.84765625, + "learning_rate": 6.351222631426705e-05, + "loss": 1.0894, + "step": 3147 + }, + { + "epoch": 0.4962880155932009, + "grad_norm": 0.9765625, + "learning_rate": 6.350794001510066e-05, + "loss": 1.1899, + "step": 3148 + }, + { + "epoch": 0.49644566744059393, + "grad_norm": 0.8828125, + "learning_rate": 6.350365379327705e-05, + "loss": 1.0332, + "step": 3149 + }, + { + "epoch": 0.49660331928798696, + "grad_norm": 0.91796875, + "learning_rate": 6.34993676488054e-05, + "loss": 1.0062, + "step": 3150 + }, + { + "epoch": 0.4967609711353799, + "grad_norm": 0.9453125, + "learning_rate": 6.349508158169474e-05, + "loss": 0.9593, + "step": 3151 + }, + { + "epoch": 0.49691862298277295, + "grad_norm": 1.0625, + "learning_rate": 6.349079559195416e-05, + "loss": 1.2109, + "step": 3152 + }, + { + "epoch": 0.497076274830166, + "grad_norm": 0.875, + "learning_rate": 6.348650967959278e-05, + "loss": 1.1193, + "step": 3153 + }, + { + "epoch": 0.497233926677559, + "grad_norm": 1.03125, + "learning_rate": 6.34822238446196e-05, + "loss": 1.2981, + "step": 3154 + }, + { + "epoch": 0.49739157852495197, + "grad_norm": 0.9140625, + "learning_rate": 6.347793808704381e-05, + "loss": 1.0825, + "step": 3155 + }, + { + "epoch": 0.497549230372345, + "grad_norm": 0.93359375, + "learning_rate": 6.347365240687443e-05, + "loss": 0.8452, + "step": 3156 + }, + { + "epoch": 0.497706882219738, + "grad_norm": 0.91015625, + "learning_rate": 6.346936680412055e-05, + "loss": 1.1153, + "step": 3157 + }, + { + "epoch": 0.49786453406713105, + "grad_norm": 0.9921875, + "learning_rate": 6.346508127879128e-05, + "loss": 1.0513, + "step": 3158 + }, + { + "epoch": 0.498022185914524, + "grad_norm": 0.94140625, + "learning_rate": 6.346079583089563e-05, + "loss": 1.2105, + "step": 3159 + }, + { + "epoch": 0.49817983776191704, + "grad_norm": 0.97265625, + "learning_rate": 6.345651046044277e-05, + "loss": 0.9945, + "step": 3160 + }, + { + "epoch": 0.49833748960931007, + "grad_norm": 0.9921875, + "learning_rate": 6.345222516744175e-05, + "loss": 1.0579, + "step": 3161 + }, + { + "epoch": 0.4984951414567031, + "grad_norm": 0.9296875, + "learning_rate": 6.344793995190166e-05, + "loss": 0.9442, + "step": 3162 + }, + { + "epoch": 0.49865279330409606, + "grad_norm": 0.9765625, + "learning_rate": 6.344365481383157e-05, + "loss": 1.0004, + "step": 3163 + }, + { + "epoch": 0.4988104451514891, + "grad_norm": 0.94921875, + "learning_rate": 6.343936975324054e-05, + "loss": 1.0813, + "step": 3164 + }, + { + "epoch": 0.4989680969988821, + "grad_norm": 0.91015625, + "learning_rate": 6.343508477013766e-05, + "loss": 0.9525, + "step": 3165 + }, + { + "epoch": 0.49912574884627514, + "grad_norm": 1.046875, + "learning_rate": 6.343079986453207e-05, + "loss": 1.2379, + "step": 3166 + }, + { + "epoch": 0.4992834006936681, + "grad_norm": 0.82421875, + "learning_rate": 6.34265150364328e-05, + "loss": 0.8228, + "step": 3167 + }, + { + "epoch": 0.49944105254106114, + "grad_norm": 1.03125, + "learning_rate": 6.342223028584895e-05, + "loss": 1.0939, + "step": 3168 + }, + { + "epoch": 0.49959870438845416, + "grad_norm": 0.90625, + "learning_rate": 6.341794561278956e-05, + "loss": 0.9936, + "step": 3169 + }, + { + "epoch": 0.4997563562358472, + "grad_norm": 0.9609375, + "learning_rate": 6.341366101726376e-05, + "loss": 0.9175, + "step": 3170 + }, + { + "epoch": 0.49991400808324016, + "grad_norm": 1.0078125, + "learning_rate": 6.340937649928063e-05, + "loss": 1.1426, + "step": 3171 + }, + { + "epoch": 0.5000716599306332, + "grad_norm": 0.84765625, + "learning_rate": 6.340509205884924e-05, + "loss": 0.8701, + "step": 3172 + }, + { + "epoch": 0.5002293117780262, + "grad_norm": 0.8984375, + "learning_rate": 6.340080769597864e-05, + "loss": 0.9066, + "step": 3173 + }, + { + "epoch": 0.5003869636254192, + "grad_norm": 0.98828125, + "learning_rate": 6.339652341067792e-05, + "loss": 1.1343, + "step": 3174 + }, + { + "epoch": 0.5005446154728123, + "grad_norm": 0.91015625, + "learning_rate": 6.339223920295621e-05, + "loss": 0.9691, + "step": 3175 + }, + { + "epoch": 0.5007022673202053, + "grad_norm": 1.1640625, + "learning_rate": 6.338795507282255e-05, + "loss": 1.0484, + "step": 3176 + }, + { + "epoch": 0.5008599191675982, + "grad_norm": 0.89453125, + "learning_rate": 6.338367102028603e-05, + "loss": 1.1239, + "step": 3177 + }, + { + "epoch": 0.5010175710149912, + "grad_norm": 1.046875, + "learning_rate": 6.337938704535573e-05, + "loss": 1.151, + "step": 3178 + }, + { + "epoch": 0.5011752228623843, + "grad_norm": 0.98828125, + "learning_rate": 6.337510314804067e-05, + "loss": 1.1216, + "step": 3179 + }, + { + "epoch": 0.5013328747097773, + "grad_norm": 1.0234375, + "learning_rate": 6.337081932835005e-05, + "loss": 0.803, + "step": 3180 + }, + { + "epoch": 0.5014905265571703, + "grad_norm": 0.96875, + "learning_rate": 6.336653558629286e-05, + "loss": 0.8989, + "step": 3181 + }, + { + "epoch": 0.5016481784045633, + "grad_norm": 0.90234375, + "learning_rate": 6.336225192187824e-05, + "loss": 1.0633, + "step": 3182 + }, + { + "epoch": 0.5018058302519564, + "grad_norm": 1.0703125, + "learning_rate": 6.33579683351152e-05, + "loss": 1.2872, + "step": 3183 + }, + { + "epoch": 0.5019634820993494, + "grad_norm": 0.80859375, + "learning_rate": 6.335368482601282e-05, + "loss": 1.002, + "step": 3184 + }, + { + "epoch": 0.5021211339467423, + "grad_norm": 0.82421875, + "learning_rate": 6.334940139458026e-05, + "loss": 1.0318, + "step": 3185 + }, + { + "epoch": 0.5022787857941353, + "grad_norm": 0.8984375, + "learning_rate": 6.334511804082653e-05, + "loss": 1.037, + "step": 3186 + }, + { + "epoch": 0.5024364376415283, + "grad_norm": 0.94921875, + "learning_rate": 6.334083476476073e-05, + "loss": 1.2291, + "step": 3187 + }, + { + "epoch": 0.5025940894889214, + "grad_norm": 0.94921875, + "learning_rate": 6.333655156639196e-05, + "loss": 1.0167, + "step": 3188 + }, + { + "epoch": 0.5027517413363144, + "grad_norm": 0.88671875, + "learning_rate": 6.333226844572924e-05, + "loss": 1.0555, + "step": 3189 + }, + { + "epoch": 0.5029093931837074, + "grad_norm": 0.953125, + "learning_rate": 6.332798540278168e-05, + "loss": 1.0258, + "step": 3190 + }, + { + "epoch": 0.5030670450311004, + "grad_norm": 1.0078125, + "learning_rate": 6.332370243755839e-05, + "loss": 1.055, + "step": 3191 + }, + { + "epoch": 0.5032246968784935, + "grad_norm": 0.96875, + "learning_rate": 6.331941955006839e-05, + "loss": 0.9814, + "step": 3192 + }, + { + "epoch": 0.5033823487258864, + "grad_norm": 1.015625, + "learning_rate": 6.331513674032081e-05, + "loss": 1.1327, + "step": 3193 + }, + { + "epoch": 0.5035400005732794, + "grad_norm": 0.8671875, + "learning_rate": 6.331085400832464e-05, + "loss": 1.0885, + "step": 3194 + }, + { + "epoch": 0.5036976524206724, + "grad_norm": 0.9921875, + "learning_rate": 6.330657135408906e-05, + "loss": 1.0215, + "step": 3195 + }, + { + "epoch": 0.5038553042680655, + "grad_norm": 0.82421875, + "learning_rate": 6.330228877762312e-05, + "loss": 1.0863, + "step": 3196 + }, + { + "epoch": 0.5040129561154585, + "grad_norm": 0.93359375, + "learning_rate": 6.329800627893587e-05, + "loss": 1.0148, + "step": 3197 + }, + { + "epoch": 0.5041706079628515, + "grad_norm": 0.85546875, + "learning_rate": 6.32937238580364e-05, + "loss": 0.9876, + "step": 3198 + }, + { + "epoch": 0.5043282598102445, + "grad_norm": 0.98828125, + "learning_rate": 6.328944151493374e-05, + "loss": 1.3381, + "step": 3199 + }, + { + "epoch": 0.5044859116576376, + "grad_norm": 0.8125, + "learning_rate": 6.328515924963707e-05, + "loss": 0.7538, + "step": 3200 + }, + { + "epoch": 0.5046435635050305, + "grad_norm": 0.9375, + "learning_rate": 6.328087706215537e-05, + "loss": 1.1717, + "step": 3201 + }, + { + "epoch": 0.5048012153524235, + "grad_norm": 0.90234375, + "learning_rate": 6.327659495249779e-05, + "loss": 1.1193, + "step": 3202 + }, + { + "epoch": 0.5049588671998165, + "grad_norm": 0.91015625, + "learning_rate": 6.327231292067335e-05, + "loss": 1.1128, + "step": 3203 + }, + { + "epoch": 0.5051165190472096, + "grad_norm": 1.0234375, + "learning_rate": 6.326803096669109e-05, + "loss": 0.9594, + "step": 3204 + }, + { + "epoch": 0.5052741708946026, + "grad_norm": 0.9140625, + "learning_rate": 6.32637490905602e-05, + "loss": 0.9936, + "step": 3205 + }, + { + "epoch": 0.5054318227419956, + "grad_norm": 0.953125, + "learning_rate": 6.325946729228969e-05, + "loss": 1.0825, + "step": 3206 + }, + { + "epoch": 0.5055894745893886, + "grad_norm": 0.859375, + "learning_rate": 6.325518557188863e-05, + "loss": 1.0765, + "step": 3207 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 0.8359375, + "learning_rate": 6.325090392936612e-05, + "loss": 0.807, + "step": 3208 + }, + { + "epoch": 0.5059047782841746, + "grad_norm": 0.91015625, + "learning_rate": 6.32466223647312e-05, + "loss": 1.1411, + "step": 3209 + }, + { + "epoch": 0.5060624301315676, + "grad_norm": 1.046875, + "learning_rate": 6.324234087799296e-05, + "loss": 1.1344, + "step": 3210 + }, + { + "epoch": 0.5062200819789606, + "grad_norm": 0.90625, + "learning_rate": 6.323805946916048e-05, + "loss": 0.781, + "step": 3211 + }, + { + "epoch": 0.5063777338263536, + "grad_norm": 1.0234375, + "learning_rate": 6.323377813824286e-05, + "loss": 1.1569, + "step": 3212 + }, + { + "epoch": 0.5065353856737467, + "grad_norm": 0.8984375, + "learning_rate": 6.322949688524909e-05, + "loss": 1.0285, + "step": 3213 + }, + { + "epoch": 0.5066930375211397, + "grad_norm": 0.859375, + "learning_rate": 6.322521571018835e-05, + "loss": 0.935, + "step": 3214 + }, + { + "epoch": 0.5068506893685327, + "grad_norm": 0.90625, + "learning_rate": 6.322093461306963e-05, + "loss": 0.8615, + "step": 3215 + }, + { + "epoch": 0.5070083412159258, + "grad_norm": 0.921875, + "learning_rate": 6.321665359390205e-05, + "loss": 1.0648, + "step": 3216 + }, + { + "epoch": 0.5071659930633187, + "grad_norm": 0.9453125, + "learning_rate": 6.321237265269469e-05, + "loss": 1.0917, + "step": 3217 + }, + { + "epoch": 0.5073236449107117, + "grad_norm": 0.87109375, + "learning_rate": 6.320809178945658e-05, + "loss": 0.988, + "step": 3218 + }, + { + "epoch": 0.5074812967581047, + "grad_norm": 0.9765625, + "learning_rate": 6.320381100419683e-05, + "loss": 0.9283, + "step": 3219 + }, + { + "epoch": 0.5076389486054977, + "grad_norm": 0.9296875, + "learning_rate": 6.319953029692446e-05, + "loss": 1.157, + "step": 3220 + }, + { + "epoch": 0.5077966004528908, + "grad_norm": 0.85546875, + "learning_rate": 6.319524966764861e-05, + "loss": 0.9046, + "step": 3221 + }, + { + "epoch": 0.5079542523002838, + "grad_norm": 0.9765625, + "learning_rate": 6.319096911637834e-05, + "loss": 1.0192, + "step": 3222 + }, + { + "epoch": 0.5081119041476768, + "grad_norm": 0.98046875, + "learning_rate": 6.31866886431227e-05, + "loss": 0.9162, + "step": 3223 + }, + { + "epoch": 0.5082695559950698, + "grad_norm": 1.0078125, + "learning_rate": 6.318240824789077e-05, + "loss": 1.1665, + "step": 3224 + }, + { + "epoch": 0.5084272078424628, + "grad_norm": 0.90234375, + "learning_rate": 6.317812793069158e-05, + "loss": 0.9449, + "step": 3225 + }, + { + "epoch": 0.5085848596898558, + "grad_norm": 0.98046875, + "learning_rate": 6.317384769153429e-05, + "loss": 1.052, + "step": 3226 + }, + { + "epoch": 0.5087425115372488, + "grad_norm": 0.9140625, + "learning_rate": 6.31695675304279e-05, + "loss": 0.8936, + "step": 3227 + }, + { + "epoch": 0.5089001633846418, + "grad_norm": 0.99609375, + "learning_rate": 6.316528744738155e-05, + "loss": 0.9335, + "step": 3228 + }, + { + "epoch": 0.5090578152320349, + "grad_norm": 0.98828125, + "learning_rate": 6.316100744240422e-05, + "loss": 0.8978, + "step": 3229 + }, + { + "epoch": 0.5092154670794279, + "grad_norm": 0.94140625, + "learning_rate": 6.315672751550505e-05, + "loss": 1.0539, + "step": 3230 + }, + { + "epoch": 0.5093731189268209, + "grad_norm": 0.96875, + "learning_rate": 6.31524476666931e-05, + "loss": 0.9447, + "step": 3231 + }, + { + "epoch": 0.5095307707742139, + "grad_norm": 0.89453125, + "learning_rate": 6.31481678959774e-05, + "loss": 1.0343, + "step": 3232 + }, + { + "epoch": 0.5096884226216069, + "grad_norm": 0.97265625, + "learning_rate": 6.314388820336707e-05, + "loss": 1.096, + "step": 3233 + }, + { + "epoch": 0.5098460744689999, + "grad_norm": 0.8828125, + "learning_rate": 6.313960858887119e-05, + "loss": 0.955, + "step": 3234 + }, + { + "epoch": 0.5100037263163929, + "grad_norm": 0.90625, + "learning_rate": 6.313532905249875e-05, + "loss": 0.9424, + "step": 3235 + }, + { + "epoch": 0.5101613781637859, + "grad_norm": 0.953125, + "learning_rate": 6.31310495942589e-05, + "loss": 1.1538, + "step": 3236 + }, + { + "epoch": 0.510319030011179, + "grad_norm": 0.87890625, + "learning_rate": 6.312677021416068e-05, + "loss": 0.9694, + "step": 3237 + }, + { + "epoch": 0.510476681858572, + "grad_norm": 1.0078125, + "learning_rate": 6.312249091221318e-05, + "loss": 1.0882, + "step": 3238 + }, + { + "epoch": 0.510634333705965, + "grad_norm": 0.9140625, + "learning_rate": 6.311821168842544e-05, + "loss": 0.9332, + "step": 3239 + }, + { + "epoch": 0.510791985553358, + "grad_norm": 0.85546875, + "learning_rate": 6.311393254280652e-05, + "loss": 1.1887, + "step": 3240 + }, + { + "epoch": 0.510949637400751, + "grad_norm": 1.03125, + "learning_rate": 6.310965347536553e-05, + "loss": 1.0398, + "step": 3241 + }, + { + "epoch": 0.511107289248144, + "grad_norm": 0.8125, + "learning_rate": 6.310537448611153e-05, + "loss": 0.8946, + "step": 3242 + }, + { + "epoch": 0.511264941095537, + "grad_norm": 0.953125, + "learning_rate": 6.310109557505357e-05, + "loss": 1.0631, + "step": 3243 + }, + { + "epoch": 0.51142259294293, + "grad_norm": 0.8359375, + "learning_rate": 6.309681674220073e-05, + "loss": 0.9902, + "step": 3244 + }, + { + "epoch": 0.511580244790323, + "grad_norm": 0.921875, + "learning_rate": 6.309253798756205e-05, + "loss": 1.0334, + "step": 3245 + }, + { + "epoch": 0.5117378966377161, + "grad_norm": 1.0390625, + "learning_rate": 6.308825931114666e-05, + "loss": 1.1892, + "step": 3246 + }, + { + "epoch": 0.5118955484851091, + "grad_norm": 1.015625, + "learning_rate": 6.30839807129636e-05, + "loss": 0.9506, + "step": 3247 + }, + { + "epoch": 0.5120532003325021, + "grad_norm": 1.0390625, + "learning_rate": 6.307970219302193e-05, + "loss": 1.118, + "step": 3248 + }, + { + "epoch": 0.512210852179895, + "grad_norm": 1.1171875, + "learning_rate": 6.307542375133071e-05, + "loss": 1.2192, + "step": 3249 + }, + { + "epoch": 0.5123685040272881, + "grad_norm": 1.0078125, + "learning_rate": 6.3071145387899e-05, + "loss": 1.0798, + "step": 3250 + }, + { + "epoch": 0.5125261558746811, + "grad_norm": 1.0390625, + "learning_rate": 6.306686710273591e-05, + "loss": 1.021, + "step": 3251 + }, + { + "epoch": 0.5126838077220741, + "grad_norm": 0.8984375, + "learning_rate": 6.306258889585049e-05, + "loss": 0.9509, + "step": 3252 + }, + { + "epoch": 0.5128414595694671, + "grad_norm": 0.8984375, + "learning_rate": 6.30583107672518e-05, + "loss": 0.8355, + "step": 3253 + }, + { + "epoch": 0.5129991114168602, + "grad_norm": 1.078125, + "learning_rate": 6.30540327169489e-05, + "loss": 0.9464, + "step": 3254 + }, + { + "epoch": 0.5131567632642532, + "grad_norm": 0.93359375, + "learning_rate": 6.304975474495085e-05, + "loss": 0.9805, + "step": 3255 + }, + { + "epoch": 0.5133144151116462, + "grad_norm": 1.0078125, + "learning_rate": 6.304547685126676e-05, + "loss": 0.9806, + "step": 3256 + }, + { + "epoch": 0.5134720669590391, + "grad_norm": 0.82421875, + "learning_rate": 6.304119903590567e-05, + "loss": 0.9074, + "step": 3257 + }, + { + "epoch": 0.5136297188064322, + "grad_norm": 0.890625, + "learning_rate": 6.303692129887665e-05, + "loss": 1.1021, + "step": 3258 + }, + { + "epoch": 0.5137873706538252, + "grad_norm": 1.0234375, + "learning_rate": 6.303264364018874e-05, + "loss": 1.0947, + "step": 3259 + }, + { + "epoch": 0.5139450225012182, + "grad_norm": 0.96484375, + "learning_rate": 6.302836605985102e-05, + "loss": 1.0885, + "step": 3260 + }, + { + "epoch": 0.5141026743486112, + "grad_norm": 0.88671875, + "learning_rate": 6.302408855787258e-05, + "loss": 1.0311, + "step": 3261 + }, + { + "epoch": 0.5142603261960043, + "grad_norm": 0.9375, + "learning_rate": 6.30198111342625e-05, + "loss": 1.0976, + "step": 3262 + }, + { + "epoch": 0.5144179780433973, + "grad_norm": 1.0234375, + "learning_rate": 6.301553378902979e-05, + "loss": 1.0993, + "step": 3263 + }, + { + "epoch": 0.5145756298907903, + "grad_norm": 0.96484375, + "learning_rate": 6.301125652218355e-05, + "loss": 0.9576, + "step": 3264 + }, + { + "epoch": 0.5147332817381832, + "grad_norm": 0.95703125, + "learning_rate": 6.30069793337328e-05, + "loss": 1.0853, + "step": 3265 + }, + { + "epoch": 0.5148909335855762, + "grad_norm": 0.92578125, + "learning_rate": 6.300270222368667e-05, + "loss": 1.0693, + "step": 3266 + }, + { + "epoch": 0.5150485854329693, + "grad_norm": 0.9296875, + "learning_rate": 6.299842519205421e-05, + "loss": 0.938, + "step": 3267 + }, + { + "epoch": 0.5152062372803623, + "grad_norm": 0.8359375, + "learning_rate": 6.299414823884447e-05, + "loss": 1.0012, + "step": 3268 + }, + { + "epoch": 0.5153638891277553, + "grad_norm": 1.015625, + "learning_rate": 6.298987136406653e-05, + "loss": 1.1214, + "step": 3269 + }, + { + "epoch": 0.5155215409751484, + "grad_norm": 0.984375, + "learning_rate": 6.29855945677294e-05, + "loss": 1.1659, + "step": 3270 + }, + { + "epoch": 0.5156791928225414, + "grad_norm": 0.9375, + "learning_rate": 6.29813178498422e-05, + "loss": 0.8866, + "step": 3271 + }, + { + "epoch": 0.5158368446699344, + "grad_norm": 0.984375, + "learning_rate": 6.2977041210414e-05, + "loss": 1.1123, + "step": 3272 + }, + { + "epoch": 0.5159944965173273, + "grad_norm": 0.8984375, + "learning_rate": 6.297276464945386e-05, + "loss": 1.0722, + "step": 3273 + }, + { + "epoch": 0.5161521483647203, + "grad_norm": 0.9375, + "learning_rate": 6.296848816697079e-05, + "loss": 0.9293, + "step": 3274 + }, + { + "epoch": 0.5163098002121134, + "grad_norm": 0.9140625, + "learning_rate": 6.29642117629739e-05, + "loss": 0.9768, + "step": 3275 + }, + { + "epoch": 0.5164674520595064, + "grad_norm": 1.0703125, + "learning_rate": 6.295993543747228e-05, + "loss": 1.0379, + "step": 3276 + }, + { + "epoch": 0.5166251039068994, + "grad_norm": 0.90234375, + "learning_rate": 6.295565919047492e-05, + "loss": 1.0485, + "step": 3277 + }, + { + "epoch": 0.5167827557542924, + "grad_norm": 0.890625, + "learning_rate": 6.295138302199096e-05, + "loss": 0.9777, + "step": 3278 + }, + { + "epoch": 0.5169404076016855, + "grad_norm": 2.953125, + "learning_rate": 6.294710693202941e-05, + "loss": 0.819, + "step": 3279 + }, + { + "epoch": 0.5170980594490785, + "grad_norm": 1.0625, + "learning_rate": 6.294283092059929e-05, + "loss": 1.0518, + "step": 3280 + }, + { + "epoch": 0.5172557112964714, + "grad_norm": 0.9375, + "learning_rate": 6.29385549877098e-05, + "loss": 1.0184, + "step": 3281 + }, + { + "epoch": 0.5174133631438644, + "grad_norm": 0.95703125, + "learning_rate": 6.29342791333699e-05, + "loss": 0.87, + "step": 3282 + }, + { + "epoch": 0.5175710149912575, + "grad_norm": 0.98828125, + "learning_rate": 6.293000335758867e-05, + "loss": 1.154, + "step": 3283 + }, + { + "epoch": 0.5177286668386505, + "grad_norm": 0.984375, + "learning_rate": 6.29257276603752e-05, + "loss": 1.227, + "step": 3284 + }, + { + "epoch": 0.5178863186860435, + "grad_norm": 0.90625, + "learning_rate": 6.292145204173848e-05, + "loss": 1.1327, + "step": 3285 + }, + { + "epoch": 0.5180439705334365, + "grad_norm": 0.8984375, + "learning_rate": 6.291717650168766e-05, + "loss": 0.7133, + "step": 3286 + }, + { + "epoch": 0.5182016223808296, + "grad_norm": 0.97265625, + "learning_rate": 6.291290104023178e-05, + "loss": 0.9067, + "step": 3287 + }, + { + "epoch": 0.5183592742282226, + "grad_norm": 0.984375, + "learning_rate": 6.290862565737987e-05, + "loss": 1.1938, + "step": 3288 + }, + { + "epoch": 0.5185169260756156, + "grad_norm": 1.015625, + "learning_rate": 6.290435035314102e-05, + "loss": 1.078, + "step": 3289 + }, + { + "epoch": 0.5186745779230085, + "grad_norm": 0.890625, + "learning_rate": 6.290007512752423e-05, + "loss": 0.9553, + "step": 3290 + }, + { + "epoch": 0.5188322297704016, + "grad_norm": 0.94140625, + "learning_rate": 6.289579998053867e-05, + "loss": 1.1297, + "step": 3291 + }, + { + "epoch": 0.5189898816177946, + "grad_norm": 0.8828125, + "learning_rate": 6.289152491219332e-05, + "loss": 0.8165, + "step": 3292 + }, + { + "epoch": 0.5191475334651876, + "grad_norm": 0.90625, + "learning_rate": 6.288724992249726e-05, + "loss": 0.9558, + "step": 3293 + }, + { + "epoch": 0.5193051853125806, + "grad_norm": 0.9921875, + "learning_rate": 6.288297501145956e-05, + "loss": 1.1013, + "step": 3294 + }, + { + "epoch": 0.5194628371599737, + "grad_norm": 0.94921875, + "learning_rate": 6.287870017908926e-05, + "loss": 0.9093, + "step": 3295 + }, + { + "epoch": 0.5196204890073667, + "grad_norm": 0.91796875, + "learning_rate": 6.287442542539544e-05, + "loss": 0.974, + "step": 3296 + }, + { + "epoch": 0.5197781408547597, + "grad_norm": 0.95703125, + "learning_rate": 6.287015075038716e-05, + "loss": 0.9947, + "step": 3297 + }, + { + "epoch": 0.5199357927021526, + "grad_norm": 0.97265625, + "learning_rate": 6.286587615407348e-05, + "loss": 1.206, + "step": 3298 + }, + { + "epoch": 0.5200934445495456, + "grad_norm": 0.9140625, + "learning_rate": 6.286160163646342e-05, + "loss": 0.9339, + "step": 3299 + }, + { + "epoch": 0.5202510963969387, + "grad_norm": 0.99609375, + "learning_rate": 6.285732719756608e-05, + "loss": 1.0755, + "step": 3300 + }, + { + "epoch": 0.5204087482443317, + "grad_norm": 0.93359375, + "learning_rate": 6.285305283739054e-05, + "loss": 1.0263, + "step": 3301 + }, + { + "epoch": 0.5205664000917247, + "grad_norm": 1.0078125, + "learning_rate": 6.284877855594582e-05, + "loss": 1.0925, + "step": 3302 + }, + { + "epoch": 0.5207240519391177, + "grad_norm": 1.0703125, + "learning_rate": 6.284450435324098e-05, + "loss": 0.9795, + "step": 3303 + }, + { + "epoch": 0.5208817037865108, + "grad_norm": 0.8046875, + "learning_rate": 6.284023022928511e-05, + "loss": 0.7722, + "step": 3304 + }, + { + "epoch": 0.5210393556339038, + "grad_norm": 0.84375, + "learning_rate": 6.28359561840872e-05, + "loss": 1.0487, + "step": 3305 + }, + { + "epoch": 0.5211970074812967, + "grad_norm": 0.92578125, + "learning_rate": 6.283168221765639e-05, + "loss": 1.0511, + "step": 3306 + }, + { + "epoch": 0.5213546593286897, + "grad_norm": 0.91015625, + "learning_rate": 6.282740833000171e-05, + "loss": 0.9363, + "step": 3307 + }, + { + "epoch": 0.5215123111760828, + "grad_norm": 0.92578125, + "learning_rate": 6.282313452113222e-05, + "loss": 1.0918, + "step": 3308 + }, + { + "epoch": 0.5216699630234758, + "grad_norm": 1.0234375, + "learning_rate": 6.281886079105697e-05, + "loss": 0.9448, + "step": 3309 + }, + { + "epoch": 0.5218276148708688, + "grad_norm": 0.88671875, + "learning_rate": 6.281458713978496e-05, + "loss": 0.9482, + "step": 3310 + }, + { + "epoch": 0.5219852667182618, + "grad_norm": 0.984375, + "learning_rate": 6.281031356732536e-05, + "loss": 0.9017, + "step": 3311 + }, + { + "epoch": 0.5221429185656549, + "grad_norm": 0.8359375, + "learning_rate": 6.280604007368719e-05, + "loss": 1.0652, + "step": 3312 + }, + { + "epoch": 0.5223005704130479, + "grad_norm": 0.9765625, + "learning_rate": 6.280176665887949e-05, + "loss": 0.9913, + "step": 3313 + }, + { + "epoch": 0.5224582222604408, + "grad_norm": 0.91796875, + "learning_rate": 6.279749332291129e-05, + "loss": 1.0119, + "step": 3314 + }, + { + "epoch": 0.5226158741078338, + "grad_norm": 0.953125, + "learning_rate": 6.279322006579169e-05, + "loss": 1.0175, + "step": 3315 + }, + { + "epoch": 0.5227735259552269, + "grad_norm": 0.8984375, + "learning_rate": 6.278894688752972e-05, + "loss": 0.9892, + "step": 3316 + }, + { + "epoch": 0.5229311778026199, + "grad_norm": 0.80078125, + "learning_rate": 6.278467378813446e-05, + "loss": 0.8812, + "step": 3317 + }, + { + "epoch": 0.5230888296500129, + "grad_norm": 0.91015625, + "learning_rate": 6.278040076761497e-05, + "loss": 1.0253, + "step": 3318 + }, + { + "epoch": 0.5232464814974059, + "grad_norm": 0.921875, + "learning_rate": 6.277612782598028e-05, + "loss": 1.1658, + "step": 3319 + }, + { + "epoch": 0.523404133344799, + "grad_norm": 0.9296875, + "learning_rate": 6.277185496323945e-05, + "loss": 1.0372, + "step": 3320 + }, + { + "epoch": 0.523561785192192, + "grad_norm": 0.890625, + "learning_rate": 6.276758217940157e-05, + "loss": 1.145, + "step": 3321 + }, + { + "epoch": 0.5237194370395849, + "grad_norm": 0.93359375, + "learning_rate": 6.276330947447566e-05, + "loss": 0.9584, + "step": 3322 + }, + { + "epoch": 0.5238770888869779, + "grad_norm": 0.96484375, + "learning_rate": 6.27590368484708e-05, + "loss": 1.0584, + "step": 3323 + }, + { + "epoch": 0.524034740734371, + "grad_norm": 0.82421875, + "learning_rate": 6.275476430139602e-05, + "loss": 0.9147, + "step": 3324 + }, + { + "epoch": 0.524192392581764, + "grad_norm": 0.91796875, + "learning_rate": 6.275049183326036e-05, + "loss": 0.9428, + "step": 3325 + }, + { + "epoch": 0.524350044429157, + "grad_norm": 0.92578125, + "learning_rate": 6.274621944407292e-05, + "loss": 1.014, + "step": 3326 + }, + { + "epoch": 0.52450769627655, + "grad_norm": 0.90625, + "learning_rate": 6.274194713384276e-05, + "loss": 0.9501, + "step": 3327 + }, + { + "epoch": 0.524665348123943, + "grad_norm": 0.84765625, + "learning_rate": 6.273767490257891e-05, + "loss": 0.9977, + "step": 3328 + }, + { + "epoch": 0.5248229999713361, + "grad_norm": 0.8828125, + "learning_rate": 6.273340275029043e-05, + "loss": 0.9036, + "step": 3329 + }, + { + "epoch": 0.524980651818729, + "grad_norm": 0.828125, + "learning_rate": 6.272913067698634e-05, + "loss": 0.9698, + "step": 3330 + }, + { + "epoch": 0.525138303666122, + "grad_norm": 1.0, + "learning_rate": 6.272485868267575e-05, + "loss": 1.4471, + "step": 3331 + }, + { + "epoch": 0.525295955513515, + "grad_norm": 1.0390625, + "learning_rate": 6.27205867673677e-05, + "loss": 1.0074, + "step": 3332 + }, + { + "epoch": 0.5254536073609081, + "grad_norm": 1.125, + "learning_rate": 6.271631493107124e-05, + "loss": 0.9946, + "step": 3333 + }, + { + "epoch": 0.5256112592083011, + "grad_norm": 0.8046875, + "learning_rate": 6.271204317379541e-05, + "loss": 0.7963, + "step": 3334 + }, + { + "epoch": 0.5257689110556941, + "grad_norm": 1.0390625, + "learning_rate": 6.270777149554926e-05, + "loss": 1.0995, + "step": 3335 + }, + { + "epoch": 0.5259265629030871, + "grad_norm": 1.046875, + "learning_rate": 6.270349989634185e-05, + "loss": 1.2185, + "step": 3336 + }, + { + "epoch": 0.5260842147504802, + "grad_norm": 1.03125, + "learning_rate": 6.269922837618227e-05, + "loss": 0.8999, + "step": 3337 + }, + { + "epoch": 0.5262418665978731, + "grad_norm": 0.91015625, + "learning_rate": 6.269495693507954e-05, + "loss": 0.8993, + "step": 3338 + }, + { + "epoch": 0.5263995184452661, + "grad_norm": 0.90625, + "learning_rate": 6.269068557304271e-05, + "loss": 1.0625, + "step": 3339 + }, + { + "epoch": 0.5265571702926591, + "grad_norm": 0.8515625, + "learning_rate": 6.268641429008081e-05, + "loss": 0.911, + "step": 3340 + }, + { + "epoch": 0.5267148221400522, + "grad_norm": 0.98828125, + "learning_rate": 6.268214308620297e-05, + "loss": 1.1234, + "step": 3341 + }, + { + "epoch": 0.5268724739874452, + "grad_norm": 0.8828125, + "learning_rate": 6.267787196141817e-05, + "loss": 1.036, + "step": 3342 + }, + { + "epoch": 0.5270301258348382, + "grad_norm": 0.91015625, + "learning_rate": 6.267360091573551e-05, + "loss": 1.0629, + "step": 3343 + }, + { + "epoch": 0.5271877776822312, + "grad_norm": 0.96875, + "learning_rate": 6.2669329949164e-05, + "loss": 0.9262, + "step": 3344 + }, + { + "epoch": 0.5273454295296243, + "grad_norm": 1.1328125, + "learning_rate": 6.266505906171268e-05, + "loss": 1.0977, + "step": 3345 + }, + { + "epoch": 0.5275030813770172, + "grad_norm": 0.93359375, + "learning_rate": 6.266078825339067e-05, + "loss": 0.9758, + "step": 3346 + }, + { + "epoch": 0.5276607332244102, + "grad_norm": 0.91796875, + "learning_rate": 6.265651752420699e-05, + "loss": 1.0942, + "step": 3347 + }, + { + "epoch": 0.5278183850718032, + "grad_norm": 0.87109375, + "learning_rate": 6.265224687417068e-05, + "loss": 1.3608, + "step": 3348 + }, + { + "epoch": 0.5279760369191963, + "grad_norm": 0.953125, + "learning_rate": 6.26479763032908e-05, + "loss": 0.9409, + "step": 3349 + }, + { + "epoch": 0.5281336887665893, + "grad_norm": 0.89453125, + "learning_rate": 6.264370581157637e-05, + "loss": 1.2185, + "step": 3350 + }, + { + "epoch": 0.5282913406139823, + "grad_norm": 0.94140625, + "learning_rate": 6.26394353990365e-05, + "loss": 0.9889, + "step": 3351 + }, + { + "epoch": 0.5284489924613753, + "grad_norm": 0.94140625, + "learning_rate": 6.263516506568021e-05, + "loss": 1.0061, + "step": 3352 + }, + { + "epoch": 0.5286066443087684, + "grad_norm": 0.92578125, + "learning_rate": 6.263089481151656e-05, + "loss": 1.0299, + "step": 3353 + }, + { + "epoch": 0.5287642961561613, + "grad_norm": 0.8828125, + "learning_rate": 6.262662463655458e-05, + "loss": 0.9581, + "step": 3354 + }, + { + "epoch": 0.5289219480035543, + "grad_norm": 1.03125, + "learning_rate": 6.262235454080334e-05, + "loss": 0.7999, + "step": 3355 + }, + { + "epoch": 0.5290795998509473, + "grad_norm": 0.8515625, + "learning_rate": 6.261808452427185e-05, + "loss": 0.9769, + "step": 3356 + }, + { + "epoch": 0.5292372516983403, + "grad_norm": 0.95703125, + "learning_rate": 6.261381458696923e-05, + "loss": 0.9866, + "step": 3357 + }, + { + "epoch": 0.5293949035457334, + "grad_norm": 0.96484375, + "learning_rate": 6.260954472890448e-05, + "loss": 1.0054, + "step": 3358 + }, + { + "epoch": 0.5295525553931264, + "grad_norm": 0.94140625, + "learning_rate": 6.260527495008668e-05, + "loss": 0.966, + "step": 3359 + }, + { + "epoch": 0.5297102072405194, + "grad_norm": 0.8984375, + "learning_rate": 6.260100525052486e-05, + "loss": 0.9001, + "step": 3360 + }, + { + "epoch": 0.5298678590879125, + "grad_norm": 0.90625, + "learning_rate": 6.259673563022803e-05, + "loss": 0.9025, + "step": 3361 + }, + { + "epoch": 0.5300255109353054, + "grad_norm": 0.93359375, + "learning_rate": 6.259246608920533e-05, + "loss": 0.9716, + "step": 3362 + }, + { + "epoch": 0.5301831627826984, + "grad_norm": 0.94921875, + "learning_rate": 6.258819662746574e-05, + "loss": 0.8808, + "step": 3363 + }, + { + "epoch": 0.5303408146300914, + "grad_norm": 0.953125, + "learning_rate": 6.258392724501834e-05, + "loss": 1.1066, + "step": 3364 + }, + { + "epoch": 0.5304984664774844, + "grad_norm": 1.0078125, + "learning_rate": 6.257965794187217e-05, + "loss": 1.0312, + "step": 3365 + }, + { + "epoch": 0.5306561183248775, + "grad_norm": 0.86328125, + "learning_rate": 6.257538871803622e-05, + "loss": 0.8986, + "step": 3366 + }, + { + "epoch": 0.5308137701722705, + "grad_norm": 0.96484375, + "learning_rate": 6.257111957351965e-05, + "loss": 1.0827, + "step": 3367 + }, + { + "epoch": 0.5309714220196635, + "grad_norm": 1.046875, + "learning_rate": 6.256685050833144e-05, + "loss": 1.257, + "step": 3368 + }, + { + "epoch": 0.5311290738670565, + "grad_norm": 1.4375, + "learning_rate": 6.256258152248067e-05, + "loss": 0.9571, + "step": 3369 + }, + { + "epoch": 0.5312867257144495, + "grad_norm": 0.8515625, + "learning_rate": 6.255831261597635e-05, + "loss": 1.1417, + "step": 3370 + }, + { + "epoch": 0.5314443775618425, + "grad_norm": 0.89453125, + "learning_rate": 6.255404378882752e-05, + "loss": 0.9161, + "step": 3371 + }, + { + "epoch": 0.5316020294092355, + "grad_norm": 0.94140625, + "learning_rate": 6.254977504104328e-05, + "loss": 1.0195, + "step": 3372 + }, + { + "epoch": 0.5317596812566285, + "grad_norm": 0.8828125, + "learning_rate": 6.254550637263266e-05, + "loss": 1.0788, + "step": 3373 + }, + { + "epoch": 0.5319173331040216, + "grad_norm": 0.828125, + "learning_rate": 6.25412377836047e-05, + "loss": 1.0279, + "step": 3374 + }, + { + "epoch": 0.5320749849514146, + "grad_norm": 1.0703125, + "learning_rate": 6.253696927396843e-05, + "loss": 1.0844, + "step": 3375 + }, + { + "epoch": 0.5322326367988076, + "grad_norm": 0.87890625, + "learning_rate": 6.253270084373288e-05, + "loss": 0.8595, + "step": 3376 + }, + { + "epoch": 0.5323902886462006, + "grad_norm": 0.91796875, + "learning_rate": 6.252843249290719e-05, + "loss": 1.11, + "step": 3377 + }, + { + "epoch": 0.5325479404935936, + "grad_norm": 0.88671875, + "learning_rate": 6.252416422150032e-05, + "loss": 1.0162, + "step": 3378 + }, + { + "epoch": 0.5327055923409866, + "grad_norm": 0.87109375, + "learning_rate": 6.251989602952135e-05, + "loss": 0.9173, + "step": 3379 + }, + { + "epoch": 0.5328632441883796, + "grad_norm": 0.9765625, + "learning_rate": 6.251562791697932e-05, + "loss": 0.9907, + "step": 3380 + }, + { + "epoch": 0.5330208960357726, + "grad_norm": 0.953125, + "learning_rate": 6.251135988388326e-05, + "loss": 0.8467, + "step": 3381 + }, + { + "epoch": 0.5331785478831657, + "grad_norm": 1.0390625, + "learning_rate": 6.250709193024224e-05, + "loss": 1.0027, + "step": 3382 + }, + { + "epoch": 0.5333361997305587, + "grad_norm": 0.9609375, + "learning_rate": 6.25028240560653e-05, + "loss": 0.9265, + "step": 3383 + }, + { + "epoch": 0.5334938515779517, + "grad_norm": 0.9453125, + "learning_rate": 6.249855626136145e-05, + "loss": 1.0365, + "step": 3384 + }, + { + "epoch": 0.5336515034253447, + "grad_norm": 0.96875, + "learning_rate": 6.249428854613981e-05, + "loss": 1.1204, + "step": 3385 + }, + { + "epoch": 0.5338091552727376, + "grad_norm": 0.875, + "learning_rate": 6.249002091040934e-05, + "loss": 0.8398, + "step": 3386 + }, + { + "epoch": 0.5339668071201307, + "grad_norm": 0.85546875, + "learning_rate": 6.248575335417915e-05, + "loss": 1.0203, + "step": 3387 + }, + { + "epoch": 0.5341244589675237, + "grad_norm": 1.0625, + "learning_rate": 6.248148587745828e-05, + "loss": 0.9752, + "step": 3388 + }, + { + "epoch": 0.5342821108149167, + "grad_norm": 0.8671875, + "learning_rate": 6.247721848025574e-05, + "loss": 1.0393, + "step": 3389 + }, + { + "epoch": 0.5344397626623097, + "grad_norm": 0.9296875, + "learning_rate": 6.247295116258059e-05, + "loss": 0.8796, + "step": 3390 + }, + { + "epoch": 0.5345974145097028, + "grad_norm": 0.98828125, + "learning_rate": 6.246868392444185e-05, + "loss": 0.9378, + "step": 3391 + }, + { + "epoch": 0.5347550663570958, + "grad_norm": 1.0703125, + "learning_rate": 6.24644167658486e-05, + "loss": 1.3404, + "step": 3392 + }, + { + "epoch": 0.5349127182044888, + "grad_norm": 0.85546875, + "learning_rate": 6.24601496868099e-05, + "loss": 0.9214, + "step": 3393 + }, + { + "epoch": 0.5350703700518817, + "grad_norm": 0.80078125, + "learning_rate": 6.245588268733475e-05, + "loss": 0.8723, + "step": 3394 + }, + { + "epoch": 0.5352280218992748, + "grad_norm": 0.9296875, + "learning_rate": 6.245161576743223e-05, + "loss": 1.0703, + "step": 3395 + }, + { + "epoch": 0.5353856737466678, + "grad_norm": 0.953125, + "learning_rate": 6.24473489271113e-05, + "loss": 1.0099, + "step": 3396 + }, + { + "epoch": 0.5355433255940608, + "grad_norm": 0.8984375, + "learning_rate": 6.244308216638113e-05, + "loss": 0.9258, + "step": 3397 + }, + { + "epoch": 0.5357009774414538, + "grad_norm": 1.046875, + "learning_rate": 6.243881548525068e-05, + "loss": 1.4259, + "step": 3398 + }, + { + "epoch": 0.5358586292888469, + "grad_norm": 0.875, + "learning_rate": 6.243454888372904e-05, + "loss": 1.046, + "step": 3399 + }, + { + "epoch": 0.5360162811362399, + "grad_norm": 0.93359375, + "learning_rate": 6.243028236182522e-05, + "loss": 0.9435, + "step": 3400 + }, + { + "epoch": 0.5361739329836329, + "grad_norm": 0.921875, + "learning_rate": 6.242601591954822e-05, + "loss": 1.0578, + "step": 3401 + }, + { + "epoch": 0.5363315848310258, + "grad_norm": 0.890625, + "learning_rate": 6.242174955690719e-05, + "loss": 1.1444, + "step": 3402 + }, + { + "epoch": 0.5364892366784189, + "grad_norm": 0.92578125, + "learning_rate": 6.241748327391107e-05, + "loss": 0.9384, + "step": 3403 + }, + { + "epoch": 0.5366468885258119, + "grad_norm": 0.90234375, + "learning_rate": 6.241321707056897e-05, + "loss": 1.1036, + "step": 3404 + }, + { + "epoch": 0.5368045403732049, + "grad_norm": 0.90625, + "learning_rate": 6.240895094688991e-05, + "loss": 1.0915, + "step": 3405 + }, + { + "epoch": 0.5369621922205979, + "grad_norm": 0.88671875, + "learning_rate": 6.240468490288291e-05, + "loss": 1.0356, + "step": 3406 + }, + { + "epoch": 0.537119844067991, + "grad_norm": 0.96875, + "learning_rate": 6.240041893855706e-05, + "loss": 1.0159, + "step": 3407 + }, + { + "epoch": 0.537277495915384, + "grad_norm": 0.90625, + "learning_rate": 6.239615305392136e-05, + "loss": 1.0554, + "step": 3408 + }, + { + "epoch": 0.537435147762777, + "grad_norm": 0.9296875, + "learning_rate": 6.239188724898486e-05, + "loss": 1.2192, + "step": 3409 + }, + { + "epoch": 0.5375927996101699, + "grad_norm": 1.03125, + "learning_rate": 6.238762152375661e-05, + "loss": 0.8354, + "step": 3410 + }, + { + "epoch": 0.537750451457563, + "grad_norm": 0.95703125, + "learning_rate": 6.238335587824562e-05, + "loss": 1.0254, + "step": 3411 + }, + { + "epoch": 0.537908103304956, + "grad_norm": 1.0625, + "learning_rate": 6.237909031246098e-05, + "loss": 1.0759, + "step": 3412 + }, + { + "epoch": 0.538065755152349, + "grad_norm": 0.87890625, + "learning_rate": 6.237482482641173e-05, + "loss": 0.964, + "step": 3413 + }, + { + "epoch": 0.538223406999742, + "grad_norm": 1.0546875, + "learning_rate": 6.237055942010686e-05, + "loss": 0.8919, + "step": 3414 + }, + { + "epoch": 0.538381058847135, + "grad_norm": 0.953125, + "learning_rate": 6.236629409355545e-05, + "loss": 1.0071, + "step": 3415 + }, + { + "epoch": 0.5385387106945281, + "grad_norm": 0.88671875, + "learning_rate": 6.23620288467665e-05, + "loss": 1.0342, + "step": 3416 + }, + { + "epoch": 0.5386963625419211, + "grad_norm": 0.87890625, + "learning_rate": 6.23577636797491e-05, + "loss": 0.9522, + "step": 3417 + }, + { + "epoch": 0.538854014389314, + "grad_norm": 0.92578125, + "learning_rate": 6.235349859251229e-05, + "loss": 1.0359, + "step": 3418 + }, + { + "epoch": 0.539011666236707, + "grad_norm": 1.1484375, + "learning_rate": 6.234923358506508e-05, + "loss": 0.9768, + "step": 3419 + }, + { + "epoch": 0.5391693180841001, + "grad_norm": 0.92578125, + "learning_rate": 6.23449686574165e-05, + "loss": 1.1031, + "step": 3420 + }, + { + "epoch": 0.5393269699314931, + "grad_norm": 0.9296875, + "learning_rate": 6.234070380957559e-05, + "loss": 1.064, + "step": 3421 + }, + { + "epoch": 0.5394846217788861, + "grad_norm": 0.99609375, + "learning_rate": 6.233643904155144e-05, + "loss": 1.1468, + "step": 3422 + }, + { + "epoch": 0.5396422736262791, + "grad_norm": 0.98046875, + "learning_rate": 6.233217435335304e-05, + "loss": 0.8327, + "step": 3423 + }, + { + "epoch": 0.5397999254736722, + "grad_norm": 0.9921875, + "learning_rate": 6.232790974498945e-05, + "loss": 1.0967, + "step": 3424 + }, + { + "epoch": 0.5399575773210652, + "grad_norm": 0.87890625, + "learning_rate": 6.232364521646972e-05, + "loss": 0.8968, + "step": 3425 + }, + { + "epoch": 0.5401152291684581, + "grad_norm": 0.9375, + "learning_rate": 6.231938076780284e-05, + "loss": 1.1833, + "step": 3426 + }, + { + "epoch": 0.5402728810158511, + "grad_norm": 0.91796875, + "learning_rate": 6.231511639899789e-05, + "loss": 0.9542, + "step": 3427 + }, + { + "epoch": 0.5404305328632442, + "grad_norm": 0.98046875, + "learning_rate": 6.23108521100639e-05, + "loss": 1.0067, + "step": 3428 + }, + { + "epoch": 0.5405881847106372, + "grad_norm": 0.87109375, + "learning_rate": 6.230658790100991e-05, + "loss": 1.0509, + "step": 3429 + }, + { + "epoch": 0.5407458365580302, + "grad_norm": 1.0078125, + "learning_rate": 6.230232377184495e-05, + "loss": 1.2403, + "step": 3430 + }, + { + "epoch": 0.5409034884054232, + "grad_norm": 0.88671875, + "learning_rate": 6.229805972257802e-05, + "loss": 0.8219, + "step": 3431 + }, + { + "epoch": 0.5410611402528163, + "grad_norm": 1.0234375, + "learning_rate": 6.229379575321824e-05, + "loss": 1.2755, + "step": 3432 + }, + { + "epoch": 0.5412187921002093, + "grad_norm": 1.0078125, + "learning_rate": 6.228953186377459e-05, + "loss": 1.1359, + "step": 3433 + }, + { + "epoch": 0.5413764439476022, + "grad_norm": 0.95703125, + "learning_rate": 6.228526805425614e-05, + "loss": 1.1323, + "step": 3434 + }, + { + "epoch": 0.5415340957949952, + "grad_norm": 1.015625, + "learning_rate": 6.22810043246719e-05, + "loss": 0.9417, + "step": 3435 + }, + { + "epoch": 0.5416917476423883, + "grad_norm": 0.83203125, + "learning_rate": 6.227674067503088e-05, + "loss": 0.9541, + "step": 3436 + }, + { + "epoch": 0.5418493994897813, + "grad_norm": 0.875, + "learning_rate": 6.227247710534219e-05, + "loss": 0.9886, + "step": 3437 + }, + { + "epoch": 0.5420070513371743, + "grad_norm": 0.828125, + "learning_rate": 6.226821361561483e-05, + "loss": 0.9319, + "step": 3438 + }, + { + "epoch": 0.5421647031845673, + "grad_norm": 0.84375, + "learning_rate": 6.226395020585783e-05, + "loss": 0.8324, + "step": 3439 + }, + { + "epoch": 0.5423223550319604, + "grad_norm": 0.86328125, + "learning_rate": 6.225968687608024e-05, + "loss": 1.0523, + "step": 3440 + }, + { + "epoch": 0.5424800068793534, + "grad_norm": 0.828125, + "learning_rate": 6.225542362629103e-05, + "loss": 0.8469, + "step": 3441 + }, + { + "epoch": 0.5426376587267464, + "grad_norm": 0.953125, + "learning_rate": 6.225116045649935e-05, + "loss": 1.047, + "step": 3442 + }, + { + "epoch": 0.5427953105741393, + "grad_norm": 0.91015625, + "learning_rate": 6.224689736671417e-05, + "loss": 0.939, + "step": 3443 + }, + { + "epoch": 0.5429529624215323, + "grad_norm": 0.984375, + "learning_rate": 6.224263435694453e-05, + "loss": 1.0781, + "step": 3444 + }, + { + "epoch": 0.5431106142689254, + "grad_norm": 0.85546875, + "learning_rate": 6.223837142719946e-05, + "loss": 0.9463, + "step": 3445 + }, + { + "epoch": 0.5432682661163184, + "grad_norm": 1.1640625, + "learning_rate": 6.2234108577488e-05, + "loss": 1.2032, + "step": 3446 + }, + { + "epoch": 0.5434259179637114, + "grad_norm": 1.0546875, + "learning_rate": 6.222984580781921e-05, + "loss": 1.1017, + "step": 3447 + }, + { + "epoch": 0.5435835698111045, + "grad_norm": 1.0703125, + "learning_rate": 6.222558311820209e-05, + "loss": 0.9118, + "step": 3448 + }, + { + "epoch": 0.5437412216584975, + "grad_norm": 0.97265625, + "learning_rate": 6.222132050864569e-05, + "loss": 1.061, + "step": 3449 + }, + { + "epoch": 0.5438988735058905, + "grad_norm": 0.90625, + "learning_rate": 6.221705797915905e-05, + "loss": 0.9684, + "step": 3450 + }, + { + "epoch": 0.5440565253532834, + "grad_norm": 1.125, + "learning_rate": 6.221279552975114e-05, + "loss": 1.0248, + "step": 3451 + }, + { + "epoch": 0.5442141772006764, + "grad_norm": 0.87109375, + "learning_rate": 6.220853316043111e-05, + "loss": 1.0001, + "step": 3452 + }, + { + "epoch": 0.5443718290480695, + "grad_norm": 0.91796875, + "learning_rate": 6.220427087120793e-05, + "loss": 1.0313, + "step": 3453 + }, + { + "epoch": 0.5445294808954625, + "grad_norm": 0.82421875, + "learning_rate": 6.220000866209064e-05, + "loss": 0.8019, + "step": 3454 + }, + { + "epoch": 0.5446871327428555, + "grad_norm": 1.0625, + "learning_rate": 6.219574653308826e-05, + "loss": 1.1443, + "step": 3455 + }, + { + "epoch": 0.5448447845902485, + "grad_norm": 1.1796875, + "learning_rate": 6.219148448420979e-05, + "loss": 1.2077, + "step": 3456 + }, + { + "epoch": 0.5450024364376416, + "grad_norm": 0.9296875, + "learning_rate": 6.218722251546437e-05, + "loss": 1.1358, + "step": 3457 + }, + { + "epoch": 0.5451600882850346, + "grad_norm": 0.9765625, + "learning_rate": 6.218296062686094e-05, + "loss": 1.1086, + "step": 3458 + }, + { + "epoch": 0.5453177401324275, + "grad_norm": 0.9140625, + "learning_rate": 6.217869881840859e-05, + "loss": 0.8284, + "step": 3459 + }, + { + "epoch": 0.5454753919798205, + "grad_norm": 0.93359375, + "learning_rate": 6.217443709011632e-05, + "loss": 0.839, + "step": 3460 + }, + { + "epoch": 0.5456330438272136, + "grad_norm": 0.9296875, + "learning_rate": 6.217017544199312e-05, + "loss": 1.0064, + "step": 3461 + }, + { + "epoch": 0.5457906956746066, + "grad_norm": 0.9609375, + "learning_rate": 6.21659138740481e-05, + "loss": 1.2099, + "step": 3462 + }, + { + "epoch": 0.5459483475219996, + "grad_norm": 0.94140625, + "learning_rate": 6.21616523862903e-05, + "loss": 0.9807, + "step": 3463 + }, + { + "epoch": 0.5461059993693926, + "grad_norm": 1.0234375, + "learning_rate": 6.215739097872868e-05, + "loss": 1.1242, + "step": 3464 + }, + { + "epoch": 0.5462636512167857, + "grad_norm": 0.9453125, + "learning_rate": 6.215312965137232e-05, + "loss": 0.9632, + "step": 3465 + }, + { + "epoch": 0.5464213030641787, + "grad_norm": 0.87890625, + "learning_rate": 6.214886840423021e-05, + "loss": 1.0211, + "step": 3466 + }, + { + "epoch": 0.5465789549115716, + "grad_norm": 0.89453125, + "learning_rate": 6.214460723731145e-05, + "loss": 1.0288, + "step": 3467 + }, + { + "epoch": 0.5467366067589646, + "grad_norm": 1.0, + "learning_rate": 6.214034615062503e-05, + "loss": 1.1139, + "step": 3468 + }, + { + "epoch": 0.5468942586063577, + "grad_norm": 0.87890625, + "learning_rate": 6.213608514417997e-05, + "loss": 1.2055, + "step": 3469 + }, + { + "epoch": 0.5470519104537507, + "grad_norm": 0.8984375, + "learning_rate": 6.213182421798529e-05, + "loss": 1.017, + "step": 3470 + }, + { + "epoch": 0.5472095623011437, + "grad_norm": 0.9375, + "learning_rate": 6.212756337205006e-05, + "loss": 0.9165, + "step": 3471 + }, + { + "epoch": 0.5473672141485367, + "grad_norm": 1.15625, + "learning_rate": 6.21233026063833e-05, + "loss": 0.991, + "step": 3472 + }, + { + "epoch": 0.5475248659959298, + "grad_norm": 1.09375, + "learning_rate": 6.211904192099404e-05, + "loss": 1.0471, + "step": 3473 + }, + { + "epoch": 0.5476825178433228, + "grad_norm": 0.921875, + "learning_rate": 6.211478131589131e-05, + "loss": 1.1033, + "step": 3474 + }, + { + "epoch": 0.5478401696907157, + "grad_norm": 0.96875, + "learning_rate": 6.211052079108413e-05, + "loss": 1.0447, + "step": 3475 + }, + { + "epoch": 0.5479978215381087, + "grad_norm": 1.015625, + "learning_rate": 6.21062603465815e-05, + "loss": 1.0452, + "step": 3476 + }, + { + "epoch": 0.5481554733855017, + "grad_norm": 0.92578125, + "learning_rate": 6.210199998239251e-05, + "loss": 0.9041, + "step": 3477 + }, + { + "epoch": 0.5483131252328948, + "grad_norm": 0.90625, + "learning_rate": 6.209773969852618e-05, + "loss": 0.983, + "step": 3478 + }, + { + "epoch": 0.5484707770802878, + "grad_norm": 0.9375, + "learning_rate": 6.209347949499151e-05, + "loss": 1.0631, + "step": 3479 + }, + { + "epoch": 0.5486284289276808, + "grad_norm": 1.0703125, + "learning_rate": 6.208921937179756e-05, + "loss": 1.0733, + "step": 3480 + }, + { + "epoch": 0.5487860807750738, + "grad_norm": 0.9296875, + "learning_rate": 6.208495932895331e-05, + "loss": 1.0309, + "step": 3481 + }, + { + "epoch": 0.5489437326224669, + "grad_norm": 0.91015625, + "learning_rate": 6.208069936646784e-05, + "loss": 0.8806, + "step": 3482 + }, + { + "epoch": 0.5491013844698598, + "grad_norm": 1.0, + "learning_rate": 6.207643948435018e-05, + "loss": 0.9734, + "step": 3483 + }, + { + "epoch": 0.5492590363172528, + "grad_norm": 0.80078125, + "learning_rate": 6.207217968260932e-05, + "loss": 0.918, + "step": 3484 + }, + { + "epoch": 0.5494166881646458, + "grad_norm": 1.28125, + "learning_rate": 6.206791996125431e-05, + "loss": 1.1494, + "step": 3485 + }, + { + "epoch": 0.5495743400120389, + "grad_norm": 0.9765625, + "learning_rate": 6.206366032029416e-05, + "loss": 1.0591, + "step": 3486 + }, + { + "epoch": 0.5497319918594319, + "grad_norm": 0.9140625, + "learning_rate": 6.205940075973795e-05, + "loss": 1.0137, + "step": 3487 + }, + { + "epoch": 0.5498896437068249, + "grad_norm": 0.9375, + "learning_rate": 6.205514127959462e-05, + "loss": 1.0162, + "step": 3488 + }, + { + "epoch": 0.5500472955542179, + "grad_norm": 0.90234375, + "learning_rate": 6.205088187987329e-05, + "loss": 1.2119, + "step": 3489 + }, + { + "epoch": 0.550204947401611, + "grad_norm": 1.0703125, + "learning_rate": 6.204662256058294e-05, + "loss": 1.0059, + "step": 3490 + }, + { + "epoch": 0.5503625992490039, + "grad_norm": 0.94140625, + "learning_rate": 6.204236332173259e-05, + "loss": 1.1431, + "step": 3491 + }, + { + "epoch": 0.5505202510963969, + "grad_norm": 0.96484375, + "learning_rate": 6.20381041633313e-05, + "loss": 0.9722, + "step": 3492 + }, + { + "epoch": 0.5506779029437899, + "grad_norm": 0.953125, + "learning_rate": 6.203384508538808e-05, + "loss": 1.1598, + "step": 3493 + }, + { + "epoch": 0.550835554791183, + "grad_norm": 0.86328125, + "learning_rate": 6.202958608791195e-05, + "loss": 0.9417, + "step": 3494 + }, + { + "epoch": 0.550993206638576, + "grad_norm": 0.92578125, + "learning_rate": 6.202532717091193e-05, + "loss": 0.8229, + "step": 3495 + }, + { + "epoch": 0.551150858485969, + "grad_norm": 1.1484375, + "learning_rate": 6.20210683343971e-05, + "loss": 1.1489, + "step": 3496 + }, + { + "epoch": 0.551308510333362, + "grad_norm": 0.875, + "learning_rate": 6.201680957837637e-05, + "loss": 1.0794, + "step": 3497 + }, + { + "epoch": 0.5514661621807551, + "grad_norm": 0.9296875, + "learning_rate": 6.201255090285888e-05, + "loss": 0.912, + "step": 3498 + }, + { + "epoch": 0.551623814028148, + "grad_norm": 0.96484375, + "learning_rate": 6.200829230785362e-05, + "loss": 0.8833, + "step": 3499 + }, + { + "epoch": 0.551781465875541, + "grad_norm": 0.8828125, + "learning_rate": 6.200403379336962e-05, + "loss": 1.0313, + "step": 3500 + }, + { + "epoch": 0.551939117722934, + "grad_norm": 1.234375, + "learning_rate": 6.19997753594159e-05, + "loss": 1.0761, + "step": 3501 + }, + { + "epoch": 0.552096769570327, + "grad_norm": 1.0078125, + "learning_rate": 6.199551700600144e-05, + "loss": 1.2859, + "step": 3502 + }, + { + "epoch": 0.5522544214177201, + "grad_norm": 1.0625, + "learning_rate": 6.199125873313535e-05, + "loss": 1.1384, + "step": 3503 + }, + { + "epoch": 0.5524120732651131, + "grad_norm": 0.890625, + "learning_rate": 6.198700054082662e-05, + "loss": 1.0558, + "step": 3504 + }, + { + "epoch": 0.5525697251125061, + "grad_norm": 0.9296875, + "learning_rate": 6.198274242908427e-05, + "loss": 0.8794, + "step": 3505 + }, + { + "epoch": 0.5527273769598992, + "grad_norm": 0.9140625, + "learning_rate": 6.197848439791731e-05, + "loss": 0.9681, + "step": 3506 + }, + { + "epoch": 0.5528850288072921, + "grad_norm": 0.9296875, + "learning_rate": 6.197422644733475e-05, + "loss": 1.014, + "step": 3507 + }, + { + "epoch": 0.5530426806546851, + "grad_norm": 0.9140625, + "learning_rate": 6.196996857734568e-05, + "loss": 0.9072, + "step": 3508 + }, + { + "epoch": 0.5532003325020781, + "grad_norm": 0.90625, + "learning_rate": 6.19657107879591e-05, + "loss": 0.8791, + "step": 3509 + }, + { + "epoch": 0.5533579843494711, + "grad_norm": 1.3203125, + "learning_rate": 6.196145307918399e-05, + "loss": 1.0669, + "step": 3510 + }, + { + "epoch": 0.5535156361968642, + "grad_norm": 0.94140625, + "learning_rate": 6.195719545102944e-05, + "loss": 0.9948, + "step": 3511 + }, + { + "epoch": 0.5536732880442572, + "grad_norm": 0.97265625, + "learning_rate": 6.19529379035044e-05, + "loss": 0.9606, + "step": 3512 + }, + { + "epoch": 0.5538309398916502, + "grad_norm": 1.0234375, + "learning_rate": 6.194868043661796e-05, + "loss": 0.9529, + "step": 3513 + }, + { + "epoch": 0.5539885917390432, + "grad_norm": 0.9921875, + "learning_rate": 6.194442305037913e-05, + "loss": 1.0794, + "step": 3514 + }, + { + "epoch": 0.5541462435864362, + "grad_norm": 1.25, + "learning_rate": 6.194016574479691e-05, + "loss": 0.9702, + "step": 3515 + }, + { + "epoch": 0.5543038954338292, + "grad_norm": 0.91796875, + "learning_rate": 6.193590851988032e-05, + "loss": 0.958, + "step": 3516 + }, + { + "epoch": 0.5544615472812222, + "grad_norm": 0.953125, + "learning_rate": 6.193165137563836e-05, + "loss": 1.3084, + "step": 3517 + }, + { + "epoch": 0.5546191991286152, + "grad_norm": 0.87109375, + "learning_rate": 6.192739431208014e-05, + "loss": 1.0632, + "step": 3518 + }, + { + "epoch": 0.5547768509760083, + "grad_norm": 0.97265625, + "learning_rate": 6.192313732921465e-05, + "loss": 1.1959, + "step": 3519 + }, + { + "epoch": 0.5549345028234013, + "grad_norm": 1.0078125, + "learning_rate": 6.191888042705086e-05, + "loss": 1.0572, + "step": 3520 + }, + { + "epoch": 0.5550921546707943, + "grad_norm": 0.90234375, + "learning_rate": 6.191462360559783e-05, + "loss": 1.1542, + "step": 3521 + }, + { + "epoch": 0.5552498065181873, + "grad_norm": 0.97265625, + "learning_rate": 6.191036686486456e-05, + "loss": 1.0064, + "step": 3522 + }, + { + "epoch": 0.5554074583655803, + "grad_norm": 0.98828125, + "learning_rate": 6.190611020486012e-05, + "loss": 1.0162, + "step": 3523 + }, + { + "epoch": 0.5555651102129733, + "grad_norm": 1.078125, + "learning_rate": 6.190185362559349e-05, + "loss": 0.8536, + "step": 3524 + }, + { + "epoch": 0.5557227620603663, + "grad_norm": 0.8984375, + "learning_rate": 6.189759712707372e-05, + "loss": 0.9489, + "step": 3525 + }, + { + "epoch": 0.5558804139077593, + "grad_norm": 0.953125, + "learning_rate": 6.189334070930982e-05, + "loss": 0.9904, + "step": 3526 + }, + { + "epoch": 0.5560380657551524, + "grad_norm": 0.9609375, + "learning_rate": 6.188908437231074e-05, + "loss": 1.0053, + "step": 3527 + }, + { + "epoch": 0.5561957176025454, + "grad_norm": 1.0625, + "learning_rate": 6.188482811608563e-05, + "loss": 1.2527, + "step": 3528 + }, + { + "epoch": 0.5563533694499384, + "grad_norm": 1.03125, + "learning_rate": 6.188057194064343e-05, + "loss": 0.9917, + "step": 3529 + }, + { + "epoch": 0.5565110212973314, + "grad_norm": 1.0234375, + "learning_rate": 6.187631584599319e-05, + "loss": 1.1718, + "step": 3530 + }, + { + "epoch": 0.5566686731447243, + "grad_norm": 0.953125, + "learning_rate": 6.187205983214393e-05, + "loss": 1.0718, + "step": 3531 + }, + { + "epoch": 0.5568263249921174, + "grad_norm": 0.98046875, + "learning_rate": 6.186780389910463e-05, + "loss": 0.9906, + "step": 3532 + }, + { + "epoch": 0.5569839768395104, + "grad_norm": 1.0859375, + "learning_rate": 6.186354804688435e-05, + "loss": 1.0674, + "step": 3533 + }, + { + "epoch": 0.5571416286869034, + "grad_norm": 0.97265625, + "learning_rate": 6.18592922754921e-05, + "loss": 0.9837, + "step": 3534 + }, + { + "epoch": 0.5572992805342964, + "grad_norm": 0.96875, + "learning_rate": 6.185503658493693e-05, + "loss": 0.9634, + "step": 3535 + }, + { + "epoch": 0.5574569323816895, + "grad_norm": 1.171875, + "learning_rate": 6.185078097522779e-05, + "loss": 0.9674, + "step": 3536 + }, + { + "epoch": 0.5576145842290825, + "grad_norm": 1.015625, + "learning_rate": 6.184652544637373e-05, + "loss": 1.2012, + "step": 3537 + }, + { + "epoch": 0.5577722360764755, + "grad_norm": 1.046875, + "learning_rate": 6.184226999838381e-05, + "loss": 1.0553, + "step": 3538 + }, + { + "epoch": 0.5579298879238684, + "grad_norm": 0.9296875, + "learning_rate": 6.183801463126701e-05, + "loss": 0.892, + "step": 3539 + }, + { + "epoch": 0.5580875397712615, + "grad_norm": 0.98046875, + "learning_rate": 6.183375934503234e-05, + "loss": 0.97, + "step": 3540 + }, + { + "epoch": 0.5582451916186545, + "grad_norm": 1.03125, + "learning_rate": 6.182950413968886e-05, + "loss": 1.1748, + "step": 3541 + }, + { + "epoch": 0.5584028434660475, + "grad_norm": 0.96875, + "learning_rate": 6.182524901524553e-05, + "loss": 1.0689, + "step": 3542 + }, + { + "epoch": 0.5585604953134405, + "grad_norm": 0.921875, + "learning_rate": 6.182099397171144e-05, + "loss": 0.9223, + "step": 3543 + }, + { + "epoch": 0.5587181471608336, + "grad_norm": 0.87890625, + "learning_rate": 6.181673900909556e-05, + "loss": 0.9212, + "step": 3544 + }, + { + "epoch": 0.5588757990082266, + "grad_norm": 1.0546875, + "learning_rate": 6.181248412740692e-05, + "loss": 1.1926, + "step": 3545 + }, + { + "epoch": 0.5590334508556196, + "grad_norm": 0.95703125, + "learning_rate": 6.180822932665454e-05, + "loss": 1.1333, + "step": 3546 + }, + { + "epoch": 0.5591911027030125, + "grad_norm": 0.90625, + "learning_rate": 6.180397460684739e-05, + "loss": 0.9353, + "step": 3547 + }, + { + "epoch": 0.5593487545504056, + "grad_norm": 0.859375, + "learning_rate": 6.179971996799459e-05, + "loss": 1.0062, + "step": 3548 + }, + { + "epoch": 0.5595064063977986, + "grad_norm": 0.9765625, + "learning_rate": 6.179546541010508e-05, + "loss": 0.9992, + "step": 3549 + }, + { + "epoch": 0.5596640582451916, + "grad_norm": 0.984375, + "learning_rate": 6.179121093318791e-05, + "loss": 1.2038, + "step": 3550 + }, + { + "epoch": 0.5598217100925846, + "grad_norm": 0.9765625, + "learning_rate": 6.178695653725208e-05, + "loss": 1.1454, + "step": 3551 + }, + { + "epoch": 0.5599793619399777, + "grad_norm": 0.90625, + "learning_rate": 6.17827022223066e-05, + "loss": 0.9238, + "step": 3552 + }, + { + "epoch": 0.5601370137873707, + "grad_norm": 0.9453125, + "learning_rate": 6.177844798836051e-05, + "loss": 1.0459, + "step": 3553 + }, + { + "epoch": 0.5602946656347637, + "grad_norm": 0.95703125, + "learning_rate": 6.17741938354228e-05, + "loss": 1.0438, + "step": 3554 + }, + { + "epoch": 0.5604523174821566, + "grad_norm": 1.0, + "learning_rate": 6.176993976350251e-05, + "loss": 1.1238, + "step": 3555 + }, + { + "epoch": 0.5606099693295497, + "grad_norm": 1.1484375, + "learning_rate": 6.176568577260866e-05, + "loss": 1.1855, + "step": 3556 + }, + { + "epoch": 0.5607676211769427, + "grad_norm": 0.90234375, + "learning_rate": 6.176143186275021e-05, + "loss": 0.9099, + "step": 3557 + }, + { + "epoch": 0.5609252730243357, + "grad_norm": 0.95703125, + "learning_rate": 6.175717803393627e-05, + "loss": 0.9274, + "step": 3558 + }, + { + "epoch": 0.5610829248717287, + "grad_norm": 0.8984375, + "learning_rate": 6.175292428617578e-05, + "loss": 0.9743, + "step": 3559 + }, + { + "epoch": 0.5612405767191218, + "grad_norm": 0.9609375, + "learning_rate": 6.17486706194778e-05, + "loss": 0.9465, + "step": 3560 + }, + { + "epoch": 0.5613982285665148, + "grad_norm": 0.96875, + "learning_rate": 6.174441703385132e-05, + "loss": 0.9705, + "step": 3561 + }, + { + "epoch": 0.5615558804139078, + "grad_norm": 0.87890625, + "learning_rate": 6.174016352930532e-05, + "loss": 0.9486, + "step": 3562 + }, + { + "epoch": 0.5617135322613007, + "grad_norm": 0.86328125, + "learning_rate": 6.173591010584891e-05, + "loss": 0.9749, + "step": 3563 + }, + { + "epoch": 0.5618711841086937, + "grad_norm": 0.9375, + "learning_rate": 6.173165676349103e-05, + "loss": 0.9463, + "step": 3564 + }, + { + "epoch": 0.5620288359560868, + "grad_norm": 0.88671875, + "learning_rate": 6.172740350224074e-05, + "loss": 1.0372, + "step": 3565 + }, + { + "epoch": 0.5621864878034798, + "grad_norm": 0.91015625, + "learning_rate": 6.172315032210701e-05, + "loss": 0.9502, + "step": 3566 + }, + { + "epoch": 0.5623441396508728, + "grad_norm": 0.87109375, + "learning_rate": 6.171889722309884e-05, + "loss": 0.8261, + "step": 3567 + }, + { + "epoch": 0.5625017914982658, + "grad_norm": 1.109375, + "learning_rate": 6.171464420522531e-05, + "loss": 1.0187, + "step": 3568 + }, + { + "epoch": 0.5626594433456589, + "grad_norm": 0.953125, + "learning_rate": 6.171039126849543e-05, + "loss": 1.0639, + "step": 3569 + }, + { + "epoch": 0.5628170951930519, + "grad_norm": 0.94140625, + "learning_rate": 6.170613841291817e-05, + "loss": 1.1333, + "step": 3570 + }, + { + "epoch": 0.5629747470404448, + "grad_norm": 0.9375, + "learning_rate": 6.170188563850256e-05, + "loss": 0.7898, + "step": 3571 + }, + { + "epoch": 0.5631323988878378, + "grad_norm": 1.1328125, + "learning_rate": 6.169763294525758e-05, + "loss": 1.1087, + "step": 3572 + }, + { + "epoch": 0.5632900507352309, + "grad_norm": 0.98828125, + "learning_rate": 6.169338033319232e-05, + "loss": 1.0119, + "step": 3573 + }, + { + "epoch": 0.5634477025826239, + "grad_norm": 0.94921875, + "learning_rate": 6.168912780231573e-05, + "loss": 0.9388, + "step": 3574 + }, + { + "epoch": 0.5636053544300169, + "grad_norm": 0.94921875, + "learning_rate": 6.168487535263685e-05, + "loss": 0.8846, + "step": 3575 + }, + { + "epoch": 0.5637630062774099, + "grad_norm": 1.078125, + "learning_rate": 6.168062298416471e-05, + "loss": 1.2134, + "step": 3576 + }, + { + "epoch": 0.563920658124803, + "grad_norm": 0.9140625, + "learning_rate": 6.167637069690825e-05, + "loss": 0.9596, + "step": 3577 + }, + { + "epoch": 0.564078309972196, + "grad_norm": 0.921875, + "learning_rate": 6.167211849087658e-05, + "loss": 1.0673, + "step": 3578 + }, + { + "epoch": 0.5642359618195889, + "grad_norm": 1.1796875, + "learning_rate": 6.166786636607864e-05, + "loss": 1.1317, + "step": 3579 + }, + { + "epoch": 0.5643936136669819, + "grad_norm": 0.96875, + "learning_rate": 6.166361432252348e-05, + "loss": 1.0661, + "step": 3580 + }, + { + "epoch": 0.564551265514375, + "grad_norm": 0.921875, + "learning_rate": 6.165936236022009e-05, + "loss": 0.9367, + "step": 3581 + }, + { + "epoch": 0.564708917361768, + "grad_norm": 0.8046875, + "learning_rate": 6.165511047917747e-05, + "loss": 1.0096, + "step": 3582 + }, + { + "epoch": 0.564866569209161, + "grad_norm": 0.953125, + "learning_rate": 6.165085867940467e-05, + "loss": 1.0306, + "step": 3583 + }, + { + "epoch": 0.565024221056554, + "grad_norm": 1.0625, + "learning_rate": 6.164660696091069e-05, + "loss": 1.3307, + "step": 3584 + }, + { + "epoch": 0.5651818729039471, + "grad_norm": 0.91796875, + "learning_rate": 6.164235532370453e-05, + "loss": 1.1037, + "step": 3585 + }, + { + "epoch": 0.5653395247513401, + "grad_norm": 0.88671875, + "learning_rate": 6.163810376779521e-05, + "loss": 0.9014, + "step": 3586 + }, + { + "epoch": 0.565497176598733, + "grad_norm": 0.99609375, + "learning_rate": 6.16338522931917e-05, + "loss": 1.1368, + "step": 3587 + }, + { + "epoch": 0.565654828446126, + "grad_norm": 0.87890625, + "learning_rate": 6.162960089990308e-05, + "loss": 0.9397, + "step": 3588 + }, + { + "epoch": 0.565812480293519, + "grad_norm": 0.84765625, + "learning_rate": 6.162534958793833e-05, + "loss": 0.9237, + "step": 3589 + }, + { + "epoch": 0.5659701321409121, + "grad_norm": 0.859375, + "learning_rate": 6.162109835730647e-05, + "loss": 0.8484, + "step": 3590 + }, + { + "epoch": 0.5661277839883051, + "grad_norm": 1.0546875, + "learning_rate": 6.161684720801648e-05, + "loss": 0.9828, + "step": 3591 + }, + { + "epoch": 0.5662854358356981, + "grad_norm": 0.953125, + "learning_rate": 6.161259614007736e-05, + "loss": 0.9851, + "step": 3592 + }, + { + "epoch": 0.5664430876830912, + "grad_norm": 0.875, + "learning_rate": 6.160834515349819e-05, + "loss": 0.7556, + "step": 3593 + }, + { + "epoch": 0.5666007395304842, + "grad_norm": 1.9921875, + "learning_rate": 6.160409424828793e-05, + "loss": 1.1532, + "step": 3594 + }, + { + "epoch": 0.5667583913778771, + "grad_norm": 0.84765625, + "learning_rate": 6.159984342445559e-05, + "loss": 0.9824, + "step": 3595 + }, + { + "epoch": 0.5669160432252701, + "grad_norm": 1.0078125, + "learning_rate": 6.159559268201021e-05, + "loss": 1.0979, + "step": 3596 + }, + { + "epoch": 0.5670736950726631, + "grad_norm": 0.953125, + "learning_rate": 6.159134202096073e-05, + "loss": 1.1249, + "step": 3597 + }, + { + "epoch": 0.5672313469200562, + "grad_norm": 0.96484375, + "learning_rate": 6.158709144131625e-05, + "loss": 1.0888, + "step": 3598 + }, + { + "epoch": 0.5673889987674492, + "grad_norm": 0.953125, + "learning_rate": 6.158284094308572e-05, + "loss": 1.054, + "step": 3599 + }, + { + "epoch": 0.5675466506148422, + "grad_norm": 0.8828125, + "learning_rate": 6.157859052627817e-05, + "loss": 1.0251, + "step": 3600 + }, + { + "epoch": 0.5677043024622352, + "grad_norm": 0.84375, + "learning_rate": 6.15743401909026e-05, + "loss": 1.1205, + "step": 3601 + }, + { + "epoch": 0.5678619543096283, + "grad_norm": 0.8671875, + "learning_rate": 6.157008993696798e-05, + "loss": 0.902, + "step": 3602 + }, + { + "epoch": 0.5680196061570213, + "grad_norm": 0.9921875, + "learning_rate": 6.156583976448339e-05, + "loss": 0.9983, + "step": 3603 + }, + { + "epoch": 0.5681772580044142, + "grad_norm": 0.890625, + "learning_rate": 6.156158967345781e-05, + "loss": 0.9108, + "step": 3604 + }, + { + "epoch": 0.5683349098518072, + "grad_norm": 0.9375, + "learning_rate": 6.155733966390024e-05, + "loss": 0.982, + "step": 3605 + }, + { + "epoch": 0.5684925616992003, + "grad_norm": 0.90625, + "learning_rate": 6.15530897358197e-05, + "loss": 1.0778, + "step": 3606 + }, + { + "epoch": 0.5686502135465933, + "grad_norm": 1.3828125, + "learning_rate": 6.154883988922515e-05, + "loss": 1.152, + "step": 3607 + }, + { + "epoch": 0.5688078653939863, + "grad_norm": 0.9609375, + "learning_rate": 6.154459012412565e-05, + "loss": 0.9303, + "step": 3608 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 1.0234375, + "learning_rate": 6.154034044053023e-05, + "loss": 1.1761, + "step": 3609 + }, + { + "epoch": 0.5691231690887724, + "grad_norm": 0.90625, + "learning_rate": 6.153609083844784e-05, + "loss": 0.9178, + "step": 3610 + }, + { + "epoch": 0.5692808209361654, + "grad_norm": 0.91015625, + "learning_rate": 6.153184131788751e-05, + "loss": 0.9623, + "step": 3611 + }, + { + "epoch": 0.5694384727835583, + "grad_norm": 0.87890625, + "learning_rate": 6.15275918788582e-05, + "loss": 0.9303, + "step": 3612 + }, + { + "epoch": 0.5695961246309513, + "grad_norm": 0.98828125, + "learning_rate": 6.152334252136902e-05, + "loss": 1.0123, + "step": 3613 + }, + { + "epoch": 0.5697537764783444, + "grad_norm": 0.92578125, + "learning_rate": 6.151909324542888e-05, + "loss": 1.0151, + "step": 3614 + }, + { + "epoch": 0.5699114283257374, + "grad_norm": 0.80078125, + "learning_rate": 6.151484405104685e-05, + "loss": 0.8561, + "step": 3615 + }, + { + "epoch": 0.5700690801731304, + "grad_norm": 0.98828125, + "learning_rate": 6.15105949382319e-05, + "loss": 1.0678, + "step": 3616 + }, + { + "epoch": 0.5702267320205234, + "grad_norm": 0.9375, + "learning_rate": 6.150634590699302e-05, + "loss": 0.9229, + "step": 3617 + }, + { + "epoch": 0.5703843838679165, + "grad_norm": 0.92578125, + "learning_rate": 6.150209695733927e-05, + "loss": 0.9033, + "step": 3618 + }, + { + "epoch": 0.5705420357153095, + "grad_norm": 0.98046875, + "learning_rate": 6.149784808927963e-05, + "loss": 0.8425, + "step": 3619 + }, + { + "epoch": 0.5706996875627024, + "grad_norm": 1.0078125, + "learning_rate": 6.149359930282308e-05, + "loss": 1.0524, + "step": 3620 + }, + { + "epoch": 0.5708573394100954, + "grad_norm": 0.953125, + "learning_rate": 6.148935059797866e-05, + "loss": 1.017, + "step": 3621 + }, + { + "epoch": 0.5710149912574884, + "grad_norm": 1.0078125, + "learning_rate": 6.148510197475533e-05, + "loss": 0.9759, + "step": 3622 + }, + { + "epoch": 0.5711726431048815, + "grad_norm": 0.859375, + "learning_rate": 6.148085343316214e-05, + "loss": 0.7434, + "step": 3623 + }, + { + "epoch": 0.5713302949522745, + "grad_norm": 0.9921875, + "learning_rate": 6.147660497320809e-05, + "loss": 0.9951, + "step": 3624 + }, + { + "epoch": 0.5714879467996675, + "grad_norm": 1.0703125, + "learning_rate": 6.147235659490216e-05, + "loss": 1.0435, + "step": 3625 + }, + { + "epoch": 0.5716455986470605, + "grad_norm": 1.0234375, + "learning_rate": 6.14681082982534e-05, + "loss": 1.143, + "step": 3626 + }, + { + "epoch": 0.5718032504944536, + "grad_norm": 0.90234375, + "learning_rate": 6.146386008327072e-05, + "loss": 0.9175, + "step": 3627 + }, + { + "epoch": 0.5719609023418465, + "grad_norm": 0.83984375, + "learning_rate": 6.145961194996323e-05, + "loss": 1.0816, + "step": 3628 + }, + { + "epoch": 0.5721185541892395, + "grad_norm": 1.0703125, + "learning_rate": 6.145536389833989e-05, + "loss": 1.14, + "step": 3629 + }, + { + "epoch": 0.5722762060366325, + "grad_norm": 0.9453125, + "learning_rate": 6.145111592840971e-05, + "loss": 0.9537, + "step": 3630 + }, + { + "epoch": 0.5724338578840256, + "grad_norm": 1.0234375, + "learning_rate": 6.144686804018167e-05, + "loss": 1.1233, + "step": 3631 + }, + { + "epoch": 0.5725915097314186, + "grad_norm": 2.0625, + "learning_rate": 6.144262023366476e-05, + "loss": 1.021, + "step": 3632 + }, + { + "epoch": 0.5727491615788116, + "grad_norm": 0.99609375, + "learning_rate": 6.143837250886806e-05, + "loss": 1.053, + "step": 3633 + }, + { + "epoch": 0.5729068134262046, + "grad_norm": 0.9921875, + "learning_rate": 6.143412486580051e-05, + "loss": 1.0127, + "step": 3634 + }, + { + "epoch": 0.5730644652735977, + "grad_norm": 0.984375, + "learning_rate": 6.142987730447115e-05, + "loss": 0.9543, + "step": 3635 + }, + { + "epoch": 0.5732221171209906, + "grad_norm": 0.8984375, + "learning_rate": 6.142562982488893e-05, + "loss": 1.1389, + "step": 3636 + }, + { + "epoch": 0.5733797689683836, + "grad_norm": 0.98828125, + "learning_rate": 6.14213824270629e-05, + "loss": 1.2776, + "step": 3637 + }, + { + "epoch": 0.5735374208157766, + "grad_norm": 0.94921875, + "learning_rate": 6.141713511100203e-05, + "loss": 1.0018, + "step": 3638 + }, + { + "epoch": 0.5736950726631697, + "grad_norm": 0.83203125, + "learning_rate": 6.141288787671535e-05, + "loss": 0.7984, + "step": 3639 + }, + { + "epoch": 0.5738527245105627, + "grad_norm": 0.94140625, + "learning_rate": 6.140864072421184e-05, + "loss": 1.2511, + "step": 3640 + }, + { + "epoch": 0.5740103763579557, + "grad_norm": 0.9453125, + "learning_rate": 6.14043936535005e-05, + "loss": 0.9615, + "step": 3641 + }, + { + "epoch": 0.5741680282053487, + "grad_norm": 1.03125, + "learning_rate": 6.140014666459036e-05, + "loss": 0.94, + "step": 3642 + }, + { + "epoch": 0.5743256800527418, + "grad_norm": 0.89453125, + "learning_rate": 6.139589975749039e-05, + "loss": 1.075, + "step": 3643 + }, + { + "epoch": 0.5744833319001347, + "grad_norm": 0.921875, + "learning_rate": 6.139165293220961e-05, + "loss": 1.1126, + "step": 3644 + }, + { + "epoch": 0.5746409837475277, + "grad_norm": 1.0, + "learning_rate": 6.138740618875703e-05, + "loss": 1.0052, + "step": 3645 + }, + { + "epoch": 0.5747986355949207, + "grad_norm": 0.87890625, + "learning_rate": 6.138315952714162e-05, + "loss": 0.8576, + "step": 3646 + }, + { + "epoch": 0.5749562874423138, + "grad_norm": 0.859375, + "learning_rate": 6.137891294737241e-05, + "loss": 0.8909, + "step": 3647 + }, + { + "epoch": 0.5751139392897068, + "grad_norm": 0.90625, + "learning_rate": 6.137466644945834e-05, + "loss": 1.1745, + "step": 3648 + }, + { + "epoch": 0.5752715911370998, + "grad_norm": 0.95703125, + "learning_rate": 6.137042003340849e-05, + "loss": 1.0669, + "step": 3649 + }, + { + "epoch": 0.5754292429844928, + "grad_norm": 0.98046875, + "learning_rate": 6.136617369923185e-05, + "loss": 0.8868, + "step": 3650 + }, + { + "epoch": 0.5755868948318859, + "grad_norm": 0.9453125, + "learning_rate": 6.136192744693738e-05, + "loss": 1.0726, + "step": 3651 + }, + { + "epoch": 0.5757445466792788, + "grad_norm": 1.390625, + "learning_rate": 6.135768127653409e-05, + "loss": 0.9905, + "step": 3652 + }, + { + "epoch": 0.5759021985266718, + "grad_norm": 0.91015625, + "learning_rate": 6.135343518803096e-05, + "loss": 0.953, + "step": 3653 + }, + { + "epoch": 0.5760598503740648, + "grad_norm": 1.03125, + "learning_rate": 6.134918918143703e-05, + "loss": 0.9553, + "step": 3654 + }, + { + "epoch": 0.5762175022214578, + "grad_norm": 0.95703125, + "learning_rate": 6.134494325676133e-05, + "loss": 1.14, + "step": 3655 + }, + { + "epoch": 0.5763751540688509, + "grad_norm": 1.0, + "learning_rate": 6.134069741401278e-05, + "loss": 1.1095, + "step": 3656 + }, + { + "epoch": 0.5765328059162439, + "grad_norm": 0.87890625, + "learning_rate": 6.13364516532004e-05, + "loss": 0.8708, + "step": 3657 + }, + { + "epoch": 0.5766904577636369, + "grad_norm": 0.90234375, + "learning_rate": 6.13322059743332e-05, + "loss": 1.0143, + "step": 3658 + }, + { + "epoch": 0.57684810961103, + "grad_norm": 0.9140625, + "learning_rate": 6.132796037742019e-05, + "loss": 0.96, + "step": 3659 + }, + { + "epoch": 0.5770057614584229, + "grad_norm": 0.86328125, + "learning_rate": 6.132371486247036e-05, + "loss": 1.009, + "step": 3660 + }, + { + "epoch": 0.5771634133058159, + "grad_norm": 0.9375, + "learning_rate": 6.131946942949271e-05, + "loss": 0.9901, + "step": 3661 + }, + { + "epoch": 0.5773210651532089, + "grad_norm": 0.88671875, + "learning_rate": 6.131522407849624e-05, + "loss": 0.9055, + "step": 3662 + }, + { + "epoch": 0.5774787170006019, + "grad_norm": 0.94140625, + "learning_rate": 6.131097880948992e-05, + "loss": 1.1075, + "step": 3663 + }, + { + "epoch": 0.577636368847995, + "grad_norm": 0.91796875, + "learning_rate": 6.130673362248278e-05, + "loss": 0.9076, + "step": 3664 + }, + { + "epoch": 0.577794020695388, + "grad_norm": 1.015625, + "learning_rate": 6.130248851748382e-05, + "loss": 1.0603, + "step": 3665 + }, + { + "epoch": 0.577951672542781, + "grad_norm": 1.0390625, + "learning_rate": 6.129824349450202e-05, + "loss": 1.2511, + "step": 3666 + }, + { + "epoch": 0.578109324390174, + "grad_norm": 1.0234375, + "learning_rate": 6.129399855354637e-05, + "loss": 1.145, + "step": 3667 + }, + { + "epoch": 0.578266976237567, + "grad_norm": 1.0234375, + "learning_rate": 6.128975369462584e-05, + "loss": 1.1687, + "step": 3668 + }, + { + "epoch": 0.57842462808496, + "grad_norm": 0.9765625, + "learning_rate": 6.128550891774952e-05, + "loss": 0.9894, + "step": 3669 + }, + { + "epoch": 0.578582279932353, + "grad_norm": 1.03125, + "learning_rate": 6.128126422292633e-05, + "loss": 1.1256, + "step": 3670 + }, + { + "epoch": 0.578739931779746, + "grad_norm": 0.99609375, + "learning_rate": 6.12770196101653e-05, + "loss": 1.0242, + "step": 3671 + }, + { + "epoch": 0.5788975836271391, + "grad_norm": 0.89453125, + "learning_rate": 6.12727750794754e-05, + "loss": 0.8864, + "step": 3672 + }, + { + "epoch": 0.5790552354745321, + "grad_norm": 0.89453125, + "learning_rate": 6.126853063086562e-05, + "loss": 1.1632, + "step": 3673 + }, + { + "epoch": 0.5792128873219251, + "grad_norm": 0.98828125, + "learning_rate": 6.126428626434501e-05, + "loss": 1.0877, + "step": 3674 + }, + { + "epoch": 0.5793705391693181, + "grad_norm": 0.84765625, + "learning_rate": 6.126004197992253e-05, + "loss": 0.8311, + "step": 3675 + }, + { + "epoch": 0.579528191016711, + "grad_norm": 0.8984375, + "learning_rate": 6.125579777760717e-05, + "loss": 1.0003, + "step": 3676 + }, + { + "epoch": 0.5796858428641041, + "grad_norm": 0.859375, + "learning_rate": 6.125155365740794e-05, + "loss": 0.8458, + "step": 3677 + }, + { + "epoch": 0.5798434947114971, + "grad_norm": 0.9453125, + "learning_rate": 6.124730961933378e-05, + "loss": 0.967, + "step": 3678 + }, + { + "epoch": 0.5800011465588901, + "grad_norm": 0.94921875, + "learning_rate": 6.124306566339377e-05, + "loss": 1.2209, + "step": 3679 + }, + { + "epoch": 0.5801587984062831, + "grad_norm": 0.94921875, + "learning_rate": 6.123882178959687e-05, + "loss": 1.1856, + "step": 3680 + }, + { + "epoch": 0.5803164502536762, + "grad_norm": 0.93359375, + "learning_rate": 6.123457799795208e-05, + "loss": 1.0488, + "step": 3681 + }, + { + "epoch": 0.5804741021010692, + "grad_norm": 0.8828125, + "learning_rate": 6.123033428846839e-05, + "loss": 0.8963, + "step": 3682 + }, + { + "epoch": 0.5806317539484622, + "grad_norm": 0.8203125, + "learning_rate": 6.122609066115476e-05, + "loss": 0.9018, + "step": 3683 + }, + { + "epoch": 0.5807894057958551, + "grad_norm": 0.91796875, + "learning_rate": 6.122184711602024e-05, + "loss": 0.8157, + "step": 3684 + }, + { + "epoch": 0.5809470576432482, + "grad_norm": 0.96484375, + "learning_rate": 6.121760365307378e-05, + "loss": 0.9985, + "step": 3685 + }, + { + "epoch": 0.5811047094906412, + "grad_norm": 0.93359375, + "learning_rate": 6.121336027232441e-05, + "loss": 0.8355, + "step": 3686 + }, + { + "epoch": 0.5812623613380342, + "grad_norm": 0.90234375, + "learning_rate": 6.120911697378111e-05, + "loss": 0.9189, + "step": 3687 + }, + { + "epoch": 0.5814200131854272, + "grad_norm": 0.92578125, + "learning_rate": 6.120487375745282e-05, + "loss": 1.108, + "step": 3688 + }, + { + "epoch": 0.5815776650328203, + "grad_norm": 0.9453125, + "learning_rate": 6.120063062334862e-05, + "loss": 0.9857, + "step": 3689 + }, + { + "epoch": 0.5817353168802133, + "grad_norm": 0.984375, + "learning_rate": 6.119638757147748e-05, + "loss": 1.2278, + "step": 3690 + }, + { + "epoch": 0.5818929687276063, + "grad_norm": 0.89453125, + "learning_rate": 6.119214460184836e-05, + "loss": 0.9237, + "step": 3691 + }, + { + "epoch": 0.5820506205749992, + "grad_norm": 1.015625, + "learning_rate": 6.118790171447029e-05, + "loss": 1.0169, + "step": 3692 + }, + { + "epoch": 0.5822082724223923, + "grad_norm": 0.93359375, + "learning_rate": 6.11836589093522e-05, + "loss": 1.0088, + "step": 3693 + }, + { + "epoch": 0.5823659242697853, + "grad_norm": 0.9453125, + "learning_rate": 6.117941618650315e-05, + "loss": 1.0157, + "step": 3694 + }, + { + "epoch": 0.5825235761171783, + "grad_norm": 0.9609375, + "learning_rate": 6.117517354593212e-05, + "loss": 1.2598, + "step": 3695 + }, + { + "epoch": 0.5826812279645713, + "grad_norm": 0.8671875, + "learning_rate": 6.117093098764808e-05, + "loss": 0.8387, + "step": 3696 + }, + { + "epoch": 0.5828388798119644, + "grad_norm": 1.015625, + "learning_rate": 6.116668851166006e-05, + "loss": 0.9466, + "step": 3697 + }, + { + "epoch": 0.5829965316593574, + "grad_norm": 0.921875, + "learning_rate": 6.116244611797697e-05, + "loss": 1.2616, + "step": 3698 + }, + { + "epoch": 0.5831541835067504, + "grad_norm": 0.98828125, + "learning_rate": 6.115820380660789e-05, + "loss": 1.0047, + "step": 3699 + }, + { + "epoch": 0.5833118353541433, + "grad_norm": 0.921875, + "learning_rate": 6.115396157756177e-05, + "loss": 0.9286, + "step": 3700 + }, + { + "epoch": 0.5834694872015364, + "grad_norm": 0.92578125, + "learning_rate": 6.114971943084761e-05, + "loss": 1.0702, + "step": 3701 + }, + { + "epoch": 0.5836271390489294, + "grad_norm": 0.8984375, + "learning_rate": 6.114547736647442e-05, + "loss": 0.9522, + "step": 3702 + }, + { + "epoch": 0.5837847908963224, + "grad_norm": 1.0859375, + "learning_rate": 6.114123538445114e-05, + "loss": 1.1444, + "step": 3703 + }, + { + "epoch": 0.5839424427437154, + "grad_norm": 1.15625, + "learning_rate": 6.11369934847868e-05, + "loss": 1.0301, + "step": 3704 + }, + { + "epoch": 0.5841000945911085, + "grad_norm": 0.94921875, + "learning_rate": 6.11327516674904e-05, + "loss": 1.0001, + "step": 3705 + }, + { + "epoch": 0.5842577464385015, + "grad_norm": 0.8671875, + "learning_rate": 6.112850993257089e-05, + "loss": 0.9038, + "step": 3706 + }, + { + "epoch": 0.5844153982858945, + "grad_norm": 0.90625, + "learning_rate": 6.112426828003728e-05, + "loss": 1.1516, + "step": 3707 + }, + { + "epoch": 0.5845730501332874, + "grad_norm": 0.90234375, + "learning_rate": 6.112002670989853e-05, + "loss": 0.9596, + "step": 3708 + }, + { + "epoch": 0.5847307019806804, + "grad_norm": 0.90234375, + "learning_rate": 6.111578522216371e-05, + "loss": 0.9554, + "step": 3709 + }, + { + "epoch": 0.5848883538280735, + "grad_norm": 0.9921875, + "learning_rate": 6.111154381684176e-05, + "loss": 0.8495, + "step": 3710 + }, + { + "epoch": 0.5850460056754665, + "grad_norm": 1.3515625, + "learning_rate": 6.110730249394165e-05, + "loss": 0.9454, + "step": 3711 + }, + { + "epoch": 0.5852036575228595, + "grad_norm": 0.9765625, + "learning_rate": 6.110306125347242e-05, + "loss": 0.9966, + "step": 3712 + }, + { + "epoch": 0.5853613093702525, + "grad_norm": 0.95703125, + "learning_rate": 6.109882009544296e-05, + "loss": 1.0292, + "step": 3713 + }, + { + "epoch": 0.5855189612176456, + "grad_norm": 1.0625, + "learning_rate": 6.109457901986238e-05, + "loss": 1.1162, + "step": 3714 + }, + { + "epoch": 0.5856766130650386, + "grad_norm": 0.97265625, + "learning_rate": 6.109033802673963e-05, + "loss": 1.1341, + "step": 3715 + }, + { + "epoch": 0.5858342649124315, + "grad_norm": 0.83984375, + "learning_rate": 6.108609711608367e-05, + "loss": 0.8791, + "step": 3716 + }, + { + "epoch": 0.5859919167598245, + "grad_norm": 1.046875, + "learning_rate": 6.108185628790349e-05, + "loss": 1.0924, + "step": 3717 + }, + { + "epoch": 0.5861495686072176, + "grad_norm": 0.92578125, + "learning_rate": 6.107761554220806e-05, + "loss": 1.0934, + "step": 3718 + }, + { + "epoch": 0.5863072204546106, + "grad_norm": 1.015625, + "learning_rate": 6.107337487900643e-05, + "loss": 1.1894, + "step": 3719 + }, + { + "epoch": 0.5864648723020036, + "grad_norm": 1.0234375, + "learning_rate": 6.106913429830759e-05, + "loss": 0.8627, + "step": 3720 + }, + { + "epoch": 0.5866225241493966, + "grad_norm": 0.83984375, + "learning_rate": 6.106489380012047e-05, + "loss": 0.908, + "step": 3721 + }, + { + "epoch": 0.5867801759967897, + "grad_norm": 0.98828125, + "learning_rate": 6.106065338445409e-05, + "loss": 1.2072, + "step": 3722 + }, + { + "epoch": 0.5869378278441827, + "grad_norm": 0.859375, + "learning_rate": 6.105641305131741e-05, + "loss": 0.9015, + "step": 3723 + }, + { + "epoch": 0.5870954796915756, + "grad_norm": 0.9609375, + "learning_rate": 6.105217280071944e-05, + "loss": 1.0727, + "step": 3724 + }, + { + "epoch": 0.5872531315389686, + "grad_norm": 0.88671875, + "learning_rate": 6.104793263266919e-05, + "loss": 0.8315, + "step": 3725 + }, + { + "epoch": 0.5874107833863617, + "grad_norm": 1.046875, + "learning_rate": 6.104369254717558e-05, + "loss": 0.914, + "step": 3726 + }, + { + "epoch": 0.5875684352337547, + "grad_norm": 0.9453125, + "learning_rate": 6.103945254424766e-05, + "loss": 0.9096, + "step": 3727 + }, + { + "epoch": 0.5877260870811477, + "grad_norm": 1.03125, + "learning_rate": 6.103521262389438e-05, + "loss": 1.1415, + "step": 3728 + }, + { + "epoch": 0.5878837389285407, + "grad_norm": 0.9453125, + "learning_rate": 6.103097278612476e-05, + "loss": 1.0662, + "step": 3729 + }, + { + "epoch": 0.5880413907759338, + "grad_norm": 0.828125, + "learning_rate": 6.102673303094777e-05, + "loss": 0.8099, + "step": 3730 + }, + { + "epoch": 0.5881990426233268, + "grad_norm": 0.98828125, + "learning_rate": 6.102249335837238e-05, + "loss": 1.0218, + "step": 3731 + }, + { + "epoch": 0.5883566944707197, + "grad_norm": 0.953125, + "learning_rate": 6.10182537684076e-05, + "loss": 1.1913, + "step": 3732 + }, + { + "epoch": 0.5885143463181127, + "grad_norm": 0.87109375, + "learning_rate": 6.1014014261062345e-05, + "loss": 0.9416, + "step": 3733 + }, + { + "epoch": 0.5886719981655057, + "grad_norm": 0.87890625, + "learning_rate": 6.100977483634571e-05, + "loss": 0.9575, + "step": 3734 + }, + { + "epoch": 0.5888296500128988, + "grad_norm": 0.8984375, + "learning_rate": 6.100553549426662e-05, + "loss": 1.0487, + "step": 3735 + }, + { + "epoch": 0.5889873018602918, + "grad_norm": 0.97265625, + "learning_rate": 6.100129623483408e-05, + "loss": 0.9781, + "step": 3736 + }, + { + "epoch": 0.5891449537076848, + "grad_norm": 0.9453125, + "learning_rate": 6.099705705805706e-05, + "loss": 1.1241, + "step": 3737 + }, + { + "epoch": 0.5893026055550779, + "grad_norm": 0.9453125, + "learning_rate": 6.099281796394452e-05, + "loss": 1.1786, + "step": 3738 + }, + { + "epoch": 0.5894602574024709, + "grad_norm": 1.0625, + "learning_rate": 6.098857895250549e-05, + "loss": 1.1393, + "step": 3739 + }, + { + "epoch": 0.5896179092498638, + "grad_norm": 0.9609375, + "learning_rate": 6.0984340023748955e-05, + "loss": 1.1505, + "step": 3740 + }, + { + "epoch": 0.5897755610972568, + "grad_norm": 1.0, + "learning_rate": 6.098010117768387e-05, + "loss": 1.1143, + "step": 3741 + }, + { + "epoch": 0.5899332129446498, + "grad_norm": 0.92578125, + "learning_rate": 6.097586241431923e-05, + "loss": 0.9673, + "step": 3742 + }, + { + "epoch": 0.5900908647920429, + "grad_norm": 1.203125, + "learning_rate": 6.0971623733664005e-05, + "loss": 0.8645, + "step": 3743 + }, + { + "epoch": 0.5902485166394359, + "grad_norm": 0.89453125, + "learning_rate": 6.096738513572722e-05, + "loss": 0.8388, + "step": 3744 + }, + { + "epoch": 0.5904061684868289, + "grad_norm": 0.8671875, + "learning_rate": 6.096314662051779e-05, + "loss": 0.8632, + "step": 3745 + }, + { + "epoch": 0.590563820334222, + "grad_norm": 0.97265625, + "learning_rate": 6.095890818804477e-05, + "loss": 1.002, + "step": 3746 + }, + { + "epoch": 0.590721472181615, + "grad_norm": 0.91015625, + "learning_rate": 6.0954669838317123e-05, + "loss": 0.8754, + "step": 3747 + }, + { + "epoch": 0.5908791240290079, + "grad_norm": 0.875, + "learning_rate": 6.095043157134379e-05, + "loss": 0.9226, + "step": 3748 + }, + { + "epoch": 0.5910367758764009, + "grad_norm": 0.9296875, + "learning_rate": 6.094619338713381e-05, + "loss": 1.0216, + "step": 3749 + }, + { + "epoch": 0.5911944277237939, + "grad_norm": 0.93359375, + "learning_rate": 6.094195528569614e-05, + "loss": 0.9461, + "step": 3750 + }, + { + "epoch": 0.591352079571187, + "grad_norm": 1.0, + "learning_rate": 6.0937717267039765e-05, + "loss": 0.8667, + "step": 3751 + }, + { + "epoch": 0.59150973141858, + "grad_norm": 1.0234375, + "learning_rate": 6.093347933117366e-05, + "loss": 1.2498, + "step": 3752 + }, + { + "epoch": 0.591667383265973, + "grad_norm": 0.96875, + "learning_rate": 6.092924147810677e-05, + "loss": 1.0493, + "step": 3753 + }, + { + "epoch": 0.591825035113366, + "grad_norm": 0.96484375, + "learning_rate": 6.092500370784817e-05, + "loss": 0.9877, + "step": 3754 + }, + { + "epoch": 0.5919826869607591, + "grad_norm": 1.0078125, + "learning_rate": 6.092076602040678e-05, + "loss": 1.1773, + "step": 3755 + }, + { + "epoch": 0.5921403388081521, + "grad_norm": 1.1484375, + "learning_rate": 6.091652841579161e-05, + "loss": 1.0858, + "step": 3756 + }, + { + "epoch": 0.592297990655545, + "grad_norm": 0.88671875, + "learning_rate": 6.0912290894011606e-05, + "loss": 0.9462, + "step": 3757 + }, + { + "epoch": 0.592455642502938, + "grad_norm": 0.94140625, + "learning_rate": 6.0908053455075734e-05, + "loss": 1.0217, + "step": 3758 + }, + { + "epoch": 0.592613294350331, + "grad_norm": 0.984375, + "learning_rate": 6.090381609899305e-05, + "loss": 0.9896, + "step": 3759 + }, + { + "epoch": 0.5927709461977241, + "grad_norm": 1.0078125, + "learning_rate": 6.08995788257725e-05, + "loss": 1.0398, + "step": 3760 + }, + { + "epoch": 0.5929285980451171, + "grad_norm": 0.984375, + "learning_rate": 6.089534163542304e-05, + "loss": 1.3516, + "step": 3761 + }, + { + "epoch": 0.5930862498925101, + "grad_norm": 0.90625, + "learning_rate": 6.089110452795368e-05, + "loss": 1.114, + "step": 3762 + }, + { + "epoch": 0.5932439017399032, + "grad_norm": 0.890625, + "learning_rate": 6.088686750337335e-05, + "loss": 0.9312, + "step": 3763 + }, + { + "epoch": 0.5934015535872962, + "grad_norm": 0.96484375, + "learning_rate": 6.08826305616911e-05, + "loss": 0.8496, + "step": 3764 + }, + { + "epoch": 0.5935592054346891, + "grad_norm": 0.92578125, + "learning_rate": 6.087839370291588e-05, + "loss": 1.192, + "step": 3765 + }, + { + "epoch": 0.5937168572820821, + "grad_norm": 1.0234375, + "learning_rate": 6.0874156927056683e-05, + "loss": 1.2035, + "step": 3766 + }, + { + "epoch": 0.5938745091294751, + "grad_norm": 0.953125, + "learning_rate": 6.0869920234122454e-05, + "loss": 1.0935, + "step": 3767 + }, + { + "epoch": 0.5940321609768682, + "grad_norm": 0.8515625, + "learning_rate": 6.0865683624122196e-05, + "loss": 0.9468, + "step": 3768 + }, + { + "epoch": 0.5941898128242612, + "grad_norm": 0.890625, + "learning_rate": 6.086144709706488e-05, + "loss": 0.9503, + "step": 3769 + }, + { + "epoch": 0.5943474646716542, + "grad_norm": 1.0703125, + "learning_rate": 6.085721065295951e-05, + "loss": 1.2029, + "step": 3770 + }, + { + "epoch": 0.5945051165190472, + "grad_norm": 0.9609375, + "learning_rate": 6.085297429181504e-05, + "loss": 0.9799, + "step": 3771 + }, + { + "epoch": 0.5946627683664403, + "grad_norm": 0.921875, + "learning_rate": 6.084873801364044e-05, + "loss": 1.0189, + "step": 3772 + }, + { + "epoch": 0.5948204202138332, + "grad_norm": 0.8515625, + "learning_rate": 6.084450181844468e-05, + "loss": 0.7438, + "step": 3773 + }, + { + "epoch": 0.5949780720612262, + "grad_norm": 0.96875, + "learning_rate": 6.084026570623678e-05, + "loss": 1.1908, + "step": 3774 + }, + { + "epoch": 0.5951357239086192, + "grad_norm": 0.94921875, + "learning_rate": 6.083602967702571e-05, + "loss": 0.9484, + "step": 3775 + }, + { + "epoch": 0.5952933757560123, + "grad_norm": 0.9453125, + "learning_rate": 6.0831793730820444e-05, + "loss": 1.0996, + "step": 3776 + }, + { + "epoch": 0.5954510276034053, + "grad_norm": 0.83984375, + "learning_rate": 6.0827557867629945e-05, + "loss": 0.9528, + "step": 3777 + }, + { + "epoch": 0.5956086794507983, + "grad_norm": 1.0546875, + "learning_rate": 6.08233220874632e-05, + "loss": 1.0698, + "step": 3778 + }, + { + "epoch": 0.5957663312981913, + "grad_norm": 1.015625, + "learning_rate": 6.081908639032914e-05, + "loss": 1.3273, + "step": 3779 + }, + { + "epoch": 0.5959239831455844, + "grad_norm": 1.640625, + "learning_rate": 6.081485077623683e-05, + "loss": 1.2733, + "step": 3780 + }, + { + "epoch": 0.5960816349929773, + "grad_norm": 0.95703125, + "learning_rate": 6.08106152451952e-05, + "loss": 0.9948, + "step": 3781 + }, + { + "epoch": 0.5962392868403703, + "grad_norm": 5.15625, + "learning_rate": 6.0806379797213245e-05, + "loss": 1.0552, + "step": 3782 + }, + { + "epoch": 0.5963969386877633, + "grad_norm": 1.0625, + "learning_rate": 6.08021444322999e-05, + "loss": 0.9877, + "step": 3783 + }, + { + "epoch": 0.5965545905351564, + "grad_norm": 0.953125, + "learning_rate": 6.079790915046415e-05, + "loss": 1.0554, + "step": 3784 + }, + { + "epoch": 0.5967122423825494, + "grad_norm": 0.9375, + "learning_rate": 6.079367395171504e-05, + "loss": 1.0957, + "step": 3785 + }, + { + "epoch": 0.5968698942299424, + "grad_norm": 0.86328125, + "learning_rate": 6.0789438836061474e-05, + "loss": 0.8926, + "step": 3786 + }, + { + "epoch": 0.5970275460773354, + "grad_norm": 0.9296875, + "learning_rate": 6.078520380351247e-05, + "loss": 1.1871, + "step": 3787 + }, + { + "epoch": 0.5971851979247285, + "grad_norm": 0.8671875, + "learning_rate": 6.078096885407698e-05, + "loss": 0.987, + "step": 3788 + }, + { + "epoch": 0.5973428497721214, + "grad_norm": 0.8671875, + "learning_rate": 6.0776733987763955e-05, + "loss": 0.926, + "step": 3789 + }, + { + "epoch": 0.5975005016195144, + "grad_norm": 0.92578125, + "learning_rate": 6.0772499204582437e-05, + "loss": 0.9101, + "step": 3790 + }, + { + "epoch": 0.5976581534669074, + "grad_norm": 1.0625, + "learning_rate": 6.076826450454135e-05, + "loss": 1.0006, + "step": 3791 + }, + { + "epoch": 0.5978158053143005, + "grad_norm": 0.9296875, + "learning_rate": 6.076402988764969e-05, + "loss": 0.9464, + "step": 3792 + }, + { + "epoch": 0.5979734571616935, + "grad_norm": 0.90625, + "learning_rate": 6.075979535391641e-05, + "loss": 0.9077, + "step": 3793 + }, + { + "epoch": 0.5981311090090865, + "grad_norm": 0.953125, + "learning_rate": 6.075556090335048e-05, + "loss": 0.9244, + "step": 3794 + }, + { + "epoch": 0.5982887608564795, + "grad_norm": 0.9453125, + "learning_rate": 6.075132653596094e-05, + "loss": 0.9194, + "step": 3795 + }, + { + "epoch": 0.5984464127038726, + "grad_norm": 0.8984375, + "learning_rate": 6.074709225175672e-05, + "loss": 0.9811, + "step": 3796 + }, + { + "epoch": 0.5986040645512655, + "grad_norm": 1.0390625, + "learning_rate": 6.0742858050746776e-05, + "loss": 0.8866, + "step": 3797 + }, + { + "epoch": 0.5987617163986585, + "grad_norm": 0.9296875, + "learning_rate": 6.073862393294012e-05, + "loss": 1.0535, + "step": 3798 + }, + { + "epoch": 0.5989193682460515, + "grad_norm": 0.80078125, + "learning_rate": 6.073438989834566e-05, + "loss": 0.9325, + "step": 3799 + }, + { + "epoch": 0.5990770200934445, + "grad_norm": 0.921875, + "learning_rate": 6.073015594697245e-05, + "loss": 0.9716, + "step": 3800 + }, + { + "epoch": 0.5992346719408376, + "grad_norm": 0.99609375, + "learning_rate": 6.0725922078829434e-05, + "loss": 1.1171, + "step": 3801 + }, + { + "epoch": 0.5993923237882306, + "grad_norm": 0.90625, + "learning_rate": 6.0721688293925575e-05, + "loss": 1.0306, + "step": 3802 + }, + { + "epoch": 0.5995499756356236, + "grad_norm": 0.9609375, + "learning_rate": 6.071745459226986e-05, + "loss": 1.0906, + "step": 3803 + }, + { + "epoch": 0.5997076274830166, + "grad_norm": 0.9375, + "learning_rate": 6.071322097387122e-05, + "loss": 1.0168, + "step": 3804 + }, + { + "epoch": 0.5998652793304096, + "grad_norm": 0.953125, + "learning_rate": 6.0708987438738695e-05, + "loss": 1.0013, + "step": 3805 + }, + { + "epoch": 0.6000229311778026, + "grad_norm": 0.91796875, + "learning_rate": 6.0704753986881224e-05, + "loss": 1.0368, + "step": 3806 + }, + { + "epoch": 0.6001805830251956, + "grad_norm": 0.93359375, + "learning_rate": 6.0700520618307774e-05, + "loss": 0.9536, + "step": 3807 + }, + { + "epoch": 0.6003382348725886, + "grad_norm": 0.87890625, + "learning_rate": 6.069628733302735e-05, + "loss": 0.8368, + "step": 3808 + }, + { + "epoch": 0.6004958867199817, + "grad_norm": 1.03125, + "learning_rate": 6.069205413104886e-05, + "loss": 0.8933, + "step": 3809 + }, + { + "epoch": 0.6006535385673747, + "grad_norm": 0.9375, + "learning_rate": 6.068782101238133e-05, + "loss": 0.8778, + "step": 3810 + }, + { + "epoch": 0.6008111904147677, + "grad_norm": 1.4765625, + "learning_rate": 6.068358797703373e-05, + "loss": 1.0116, + "step": 3811 + }, + { + "epoch": 0.6009688422621607, + "grad_norm": 0.828125, + "learning_rate": 6.067935502501498e-05, + "loss": 0.9347, + "step": 3812 + }, + { + "epoch": 0.6011264941095537, + "grad_norm": 0.84765625, + "learning_rate": 6.067512215633413e-05, + "loss": 0.8701, + "step": 3813 + }, + { + "epoch": 0.6012841459569467, + "grad_norm": 0.91796875, + "learning_rate": 6.067088937100007e-05, + "loss": 1.0277, + "step": 3814 + }, + { + "epoch": 0.6014417978043397, + "grad_norm": 0.96484375, + "learning_rate": 6.066665666902185e-05, + "loss": 0.972, + "step": 3815 + }, + { + "epoch": 0.6015994496517327, + "grad_norm": 0.91796875, + "learning_rate": 6.06624240504084e-05, + "loss": 0.9593, + "step": 3816 + }, + { + "epoch": 0.6017571014991258, + "grad_norm": 1.0, + "learning_rate": 6.065819151516868e-05, + "loss": 0.9156, + "step": 3817 + }, + { + "epoch": 0.6019147533465188, + "grad_norm": 1.09375, + "learning_rate": 6.065395906331168e-05, + "loss": 1.1907, + "step": 3818 + }, + { + "epoch": 0.6020724051939118, + "grad_norm": 0.98046875, + "learning_rate": 6.064972669484634e-05, + "loss": 1.0359, + "step": 3819 + }, + { + "epoch": 0.6022300570413048, + "grad_norm": 0.94921875, + "learning_rate": 6.064549440978169e-05, + "loss": 1.0657, + "step": 3820 + }, + { + "epoch": 0.6023877088886977, + "grad_norm": 0.86328125, + "learning_rate": 6.064126220812665e-05, + "loss": 0.7633, + "step": 3821 + }, + { + "epoch": 0.6025453607360908, + "grad_norm": 0.89453125, + "learning_rate": 6.063703008989021e-05, + "loss": 0.9008, + "step": 3822 + }, + { + "epoch": 0.6027030125834838, + "grad_norm": 0.93359375, + "learning_rate": 6.0632798055081354e-05, + "loss": 0.9777, + "step": 3823 + }, + { + "epoch": 0.6028606644308768, + "grad_norm": 0.92578125, + "learning_rate": 6.062856610370897e-05, + "loss": 1.0232, + "step": 3824 + }, + { + "epoch": 0.6030183162782699, + "grad_norm": 1.125, + "learning_rate": 6.0624334235782146e-05, + "loss": 1.3447, + "step": 3825 + }, + { + "epoch": 0.6031759681256629, + "grad_norm": 0.91796875, + "learning_rate": 6.062010245130978e-05, + "loss": 0.9635, + "step": 3826 + }, + { + "epoch": 0.6033336199730559, + "grad_norm": 0.90625, + "learning_rate": 6.0615870750300865e-05, + "loss": 0.8296, + "step": 3827 + }, + { + "epoch": 0.6034912718204489, + "grad_norm": 1.046875, + "learning_rate": 6.061163913276435e-05, + "loss": 1.0955, + "step": 3828 + }, + { + "epoch": 0.6036489236678418, + "grad_norm": 1.0234375, + "learning_rate": 6.060740759870922e-05, + "loss": 0.9466, + "step": 3829 + }, + { + "epoch": 0.6038065755152349, + "grad_norm": 0.9140625, + "learning_rate": 6.060317614814441e-05, + "loss": 0.8579, + "step": 3830 + }, + { + "epoch": 0.6039642273626279, + "grad_norm": 0.8359375, + "learning_rate": 6.059894478107895e-05, + "loss": 0.9249, + "step": 3831 + }, + { + "epoch": 0.6041218792100209, + "grad_norm": 0.9296875, + "learning_rate": 6.059471349752177e-05, + "loss": 0.9536, + "step": 3832 + }, + { + "epoch": 0.6042795310574139, + "grad_norm": 0.984375, + "learning_rate": 6.059048229748185e-05, + "loss": 1.0153, + "step": 3833 + }, + { + "epoch": 0.604437182904807, + "grad_norm": 1.015625, + "learning_rate": 6.058625118096812e-05, + "loss": 1.1531, + "step": 3834 + }, + { + "epoch": 0.6045948347522, + "grad_norm": 0.8671875, + "learning_rate": 6.058202014798962e-05, + "loss": 1.0149, + "step": 3835 + }, + { + "epoch": 0.604752486599593, + "grad_norm": 1.0390625, + "learning_rate": 6.057778919855525e-05, + "loss": 1.2301, + "step": 3836 + }, + { + "epoch": 0.6049101384469859, + "grad_norm": 0.88671875, + "learning_rate": 6.057355833267401e-05, + "loss": 0.9064, + "step": 3837 + }, + { + "epoch": 0.605067790294379, + "grad_norm": 0.9453125, + "learning_rate": 6.056932755035486e-05, + "loss": 0.8718, + "step": 3838 + }, + { + "epoch": 0.605225442141772, + "grad_norm": 0.9765625, + "learning_rate": 6.056509685160672e-05, + "loss": 0.8612, + "step": 3839 + }, + { + "epoch": 0.605383093989165, + "grad_norm": 0.9453125, + "learning_rate": 6.0560866236438654e-05, + "loss": 1.0919, + "step": 3840 + }, + { + "epoch": 0.605540745836558, + "grad_norm": 0.89453125, + "learning_rate": 6.055663570485956e-05, + "loss": 0.9215, + "step": 3841 + }, + { + "epoch": 0.6056983976839511, + "grad_norm": 0.91015625, + "learning_rate": 6.055240525687844e-05, + "loss": 0.9596, + "step": 3842 + }, + { + "epoch": 0.6058560495313441, + "grad_norm": 0.9609375, + "learning_rate": 6.054817489250425e-05, + "loss": 0.9909, + "step": 3843 + }, + { + "epoch": 0.6060137013787371, + "grad_norm": 0.84375, + "learning_rate": 6.054394461174589e-05, + "loss": 0.8108, + "step": 3844 + }, + { + "epoch": 0.60617135322613, + "grad_norm": 1.0078125, + "learning_rate": 6.0539714414612434e-05, + "loss": 1.0393, + "step": 3845 + }, + { + "epoch": 0.606329005073523, + "grad_norm": 0.984375, + "learning_rate": 6.05354843011128e-05, + "loss": 1.1833, + "step": 3846 + }, + { + "epoch": 0.6064866569209161, + "grad_norm": 1.0546875, + "learning_rate": 6.053125427125595e-05, + "loss": 0.9821, + "step": 3847 + }, + { + "epoch": 0.6066443087683091, + "grad_norm": 0.9921875, + "learning_rate": 6.052702432505084e-05, + "loss": 0.8377, + "step": 3848 + }, + { + "epoch": 0.6068019606157021, + "grad_norm": 0.96484375, + "learning_rate": 6.0522794462506416e-05, + "loss": 1.1503, + "step": 3849 + }, + { + "epoch": 0.6069596124630952, + "grad_norm": 0.95703125, + "learning_rate": 6.051856468363171e-05, + "loss": 1.3072, + "step": 3850 + }, + { + "epoch": 0.6071172643104882, + "grad_norm": 1.0234375, + "learning_rate": 6.051433498843564e-05, + "loss": 1.1149, + "step": 3851 + }, + { + "epoch": 0.6072749161578812, + "grad_norm": 0.89453125, + "learning_rate": 6.0510105376927184e-05, + "loss": 0.9371, + "step": 3852 + }, + { + "epoch": 0.6074325680052741, + "grad_norm": 0.97265625, + "learning_rate": 6.050587584911531e-05, + "loss": 1.0806, + "step": 3853 + }, + { + "epoch": 0.6075902198526671, + "grad_norm": 1.0234375, + "learning_rate": 6.050164640500895e-05, + "loss": 0.8806, + "step": 3854 + }, + { + "epoch": 0.6077478717000602, + "grad_norm": 0.8671875, + "learning_rate": 6.049741704461712e-05, + "loss": 0.9011, + "step": 3855 + }, + { + "epoch": 0.6079055235474532, + "grad_norm": 0.9375, + "learning_rate": 6.049318776794876e-05, + "loss": 1.0227, + "step": 3856 + }, + { + "epoch": 0.6080631753948462, + "grad_norm": 0.953125, + "learning_rate": 6.0488958575012824e-05, + "loss": 0.9647, + "step": 3857 + }, + { + "epoch": 0.6082208272422392, + "grad_norm": 1.0234375, + "learning_rate": 6.048472946581827e-05, + "loss": 1.2149, + "step": 3858 + }, + { + "epoch": 0.6083784790896323, + "grad_norm": 0.90234375, + "learning_rate": 6.048050044037405e-05, + "loss": 0.8682, + "step": 3859 + }, + { + "epoch": 0.6085361309370253, + "grad_norm": 0.8984375, + "learning_rate": 6.0476271498689174e-05, + "loss": 0.8054, + "step": 3860 + }, + { + "epoch": 0.6086937827844182, + "grad_norm": 0.90234375, + "learning_rate": 6.04720426407726e-05, + "loss": 0.9682, + "step": 3861 + }, + { + "epoch": 0.6088514346318112, + "grad_norm": 0.91796875, + "learning_rate": 6.046781386663326e-05, + "loss": 0.8595, + "step": 3862 + }, + { + "epoch": 0.6090090864792043, + "grad_norm": 0.984375, + "learning_rate": 6.0463585176280134e-05, + "loss": 1.2023, + "step": 3863 + }, + { + "epoch": 0.6091667383265973, + "grad_norm": 0.90625, + "learning_rate": 6.045935656972214e-05, + "loss": 0.9886, + "step": 3864 + }, + { + "epoch": 0.6093243901739903, + "grad_norm": 0.90625, + "learning_rate": 6.0455128046968316e-05, + "loss": 0.8924, + "step": 3865 + }, + { + "epoch": 0.6094820420213833, + "grad_norm": 1.046875, + "learning_rate": 6.045089960802759e-05, + "loss": 1.0013, + "step": 3866 + }, + { + "epoch": 0.6096396938687764, + "grad_norm": 0.87890625, + "learning_rate": 6.0446671252908926e-05, + "loss": 0.8369, + "step": 3867 + }, + { + "epoch": 0.6097973457161694, + "grad_norm": 0.96875, + "learning_rate": 6.044244298162127e-05, + "loss": 0.9433, + "step": 3868 + }, + { + "epoch": 0.6099549975635623, + "grad_norm": 1.015625, + "learning_rate": 6.043821479417356e-05, + "loss": 0.9105, + "step": 3869 + }, + { + "epoch": 0.6101126494109553, + "grad_norm": 0.84765625, + "learning_rate": 6.043398669057483e-05, + "loss": 0.9041, + "step": 3870 + }, + { + "epoch": 0.6102703012583484, + "grad_norm": 0.94140625, + "learning_rate": 6.0429758670834015e-05, + "loss": 1.0843, + "step": 3871 + }, + { + "epoch": 0.6104279531057414, + "grad_norm": 1.0, + "learning_rate": 6.0425530734960054e-05, + "loss": 1.0184, + "step": 3872 + }, + { + "epoch": 0.6105856049531344, + "grad_norm": 1.03125, + "learning_rate": 6.0421302882961926e-05, + "loss": 1.018, + "step": 3873 + }, + { + "epoch": 0.6107432568005274, + "grad_norm": 0.95703125, + "learning_rate": 6.041707511484855e-05, + "loss": 0.9192, + "step": 3874 + }, + { + "epoch": 0.6109009086479205, + "grad_norm": 0.8671875, + "learning_rate": 6.041284743062896e-05, + "loss": 1.0443, + "step": 3875 + }, + { + "epoch": 0.6110585604953135, + "grad_norm": 1.03125, + "learning_rate": 6.040861983031207e-05, + "loss": 0.8376, + "step": 3876 + }, + { + "epoch": 0.6112162123427064, + "grad_norm": 0.921875, + "learning_rate": 6.0404392313906845e-05, + "loss": 0.9152, + "step": 3877 + }, + { + "epoch": 0.6113738641900994, + "grad_norm": 0.93359375, + "learning_rate": 6.040016488142224e-05, + "loss": 1.1457, + "step": 3878 + }, + { + "epoch": 0.6115315160374925, + "grad_norm": 0.8984375, + "learning_rate": 6.039593753286721e-05, + "loss": 0.938, + "step": 3879 + }, + { + "epoch": 0.6116891678848855, + "grad_norm": 0.9921875, + "learning_rate": 6.039171026825075e-05, + "loss": 0.9104, + "step": 3880 + }, + { + "epoch": 0.6118468197322785, + "grad_norm": 0.94921875, + "learning_rate": 6.038748308758179e-05, + "loss": 0.9276, + "step": 3881 + }, + { + "epoch": 0.6120044715796715, + "grad_norm": 1.0078125, + "learning_rate": 6.038325599086929e-05, + "loss": 1.1049, + "step": 3882 + }, + { + "epoch": 0.6121621234270646, + "grad_norm": 0.93359375, + "learning_rate": 6.037902897812222e-05, + "loss": 1.0009, + "step": 3883 + }, + { + "epoch": 0.6123197752744576, + "grad_norm": 0.91796875, + "learning_rate": 6.037480204934951e-05, + "loss": 0.8953, + "step": 3884 + }, + { + "epoch": 0.6124774271218505, + "grad_norm": 1.3203125, + "learning_rate": 6.0370575204560164e-05, + "loss": 0.9871, + "step": 3885 + }, + { + "epoch": 0.6126350789692435, + "grad_norm": 0.83203125, + "learning_rate": 6.036634844376312e-05, + "loss": 0.8286, + "step": 3886 + }, + { + "epoch": 0.6127927308166365, + "grad_norm": 0.8984375, + "learning_rate": 6.036212176696734e-05, + "loss": 1.0444, + "step": 3887 + }, + { + "epoch": 0.6129503826640296, + "grad_norm": 0.84765625, + "learning_rate": 6.0357895174181775e-05, + "loss": 0.8363, + "step": 3888 + }, + { + "epoch": 0.6131080345114226, + "grad_norm": 0.97265625, + "learning_rate": 6.035366866541535e-05, + "loss": 0.9531, + "step": 3889 + }, + { + "epoch": 0.6132656863588156, + "grad_norm": 0.85546875, + "learning_rate": 6.034944224067709e-05, + "loss": 0.9766, + "step": 3890 + }, + { + "epoch": 0.6134233382062086, + "grad_norm": 1.03125, + "learning_rate": 6.034521589997593e-05, + "loss": 1.2584, + "step": 3891 + }, + { + "epoch": 0.6135809900536017, + "grad_norm": 0.96875, + "learning_rate": 6.034098964332082e-05, + "loss": 1.0185, + "step": 3892 + }, + { + "epoch": 0.6137386419009946, + "grad_norm": 0.87109375, + "learning_rate": 6.0336763470720705e-05, + "loss": 1.1305, + "step": 3893 + }, + { + "epoch": 0.6138962937483876, + "grad_norm": 1.0546875, + "learning_rate": 6.033253738218454e-05, + "loss": 1.0223, + "step": 3894 + }, + { + "epoch": 0.6140539455957806, + "grad_norm": 0.91015625, + "learning_rate": 6.0328311377721305e-05, + "loss": 0.9278, + "step": 3895 + }, + { + "epoch": 0.6142115974431737, + "grad_norm": 1.0234375, + "learning_rate": 6.032408545733996e-05, + "loss": 1.2693, + "step": 3896 + }, + { + "epoch": 0.6143692492905667, + "grad_norm": 0.91015625, + "learning_rate": 6.031985962104942e-05, + "loss": 0.9212, + "step": 3897 + }, + { + "epoch": 0.6145269011379597, + "grad_norm": 0.90234375, + "learning_rate": 6.03156338688587e-05, + "loss": 0.9277, + "step": 3898 + }, + { + "epoch": 0.6146845529853527, + "grad_norm": 2.203125, + "learning_rate": 6.03114082007767e-05, + "loss": 1.018, + "step": 3899 + }, + { + "epoch": 0.6148422048327458, + "grad_norm": 1.0703125, + "learning_rate": 6.030718261681241e-05, + "loss": 0.9203, + "step": 3900 + }, + { + "epoch": 0.6149998566801387, + "grad_norm": 1.0390625, + "learning_rate": 6.030295711697478e-05, + "loss": 1.0157, + "step": 3901 + }, + { + "epoch": 0.6151575085275317, + "grad_norm": 0.9921875, + "learning_rate": 6.0298731701272784e-05, + "loss": 1.0371, + "step": 3902 + }, + { + "epoch": 0.6153151603749247, + "grad_norm": 0.921875, + "learning_rate": 6.029450636971534e-05, + "loss": 1.0708, + "step": 3903 + }, + { + "epoch": 0.6154728122223178, + "grad_norm": 0.875, + "learning_rate": 6.0290281122311384e-05, + "loss": 0.9155, + "step": 3904 + }, + { + "epoch": 0.6156304640697108, + "grad_norm": 0.97265625, + "learning_rate": 6.028605595906994e-05, + "loss": 1.1189, + "step": 3905 + }, + { + "epoch": 0.6157881159171038, + "grad_norm": 0.94140625, + "learning_rate": 6.028183087999994e-05, + "loss": 0.9114, + "step": 3906 + }, + { + "epoch": 0.6159457677644968, + "grad_norm": 0.953125, + "learning_rate": 6.027760588511033e-05, + "loss": 1.2237, + "step": 3907 + }, + { + "epoch": 0.6161034196118899, + "grad_norm": 0.96484375, + "learning_rate": 6.027338097441006e-05, + "loss": 0.9169, + "step": 3908 + }, + { + "epoch": 0.6162610714592828, + "grad_norm": 0.84765625, + "learning_rate": 6.026915614790805e-05, + "loss": 1.0648, + "step": 3909 + }, + { + "epoch": 0.6164187233066758, + "grad_norm": 1.0390625, + "learning_rate": 6.026493140561333e-05, + "loss": 0.9408, + "step": 3910 + }, + { + "epoch": 0.6165763751540688, + "grad_norm": 0.8203125, + "learning_rate": 6.026070674753481e-05, + "loss": 0.9578, + "step": 3911 + }, + { + "epoch": 0.6167340270014618, + "grad_norm": 0.96484375, + "learning_rate": 6.0256482173681476e-05, + "loss": 0.9915, + "step": 3912 + }, + { + "epoch": 0.6168916788488549, + "grad_norm": 1.1015625, + "learning_rate": 6.025225768406223e-05, + "loss": 1.1405, + "step": 3913 + }, + { + "epoch": 0.6170493306962479, + "grad_norm": 1.0078125, + "learning_rate": 6.0248033278686044e-05, + "loss": 1.1939, + "step": 3914 + }, + { + "epoch": 0.6172069825436409, + "grad_norm": 0.96484375, + "learning_rate": 6.02438089575619e-05, + "loss": 0.9426, + "step": 3915 + }, + { + "epoch": 0.617364634391034, + "grad_norm": 1.109375, + "learning_rate": 6.02395847206987e-05, + "loss": 1.07, + "step": 3916 + }, + { + "epoch": 0.617522286238427, + "grad_norm": 0.93359375, + "learning_rate": 6.023536056810546e-05, + "loss": 0.8044, + "step": 3917 + }, + { + "epoch": 0.6176799380858199, + "grad_norm": 0.875, + "learning_rate": 6.0231136499791084e-05, + "loss": 0.9561, + "step": 3918 + }, + { + "epoch": 0.6178375899332129, + "grad_norm": 1.0, + "learning_rate": 6.022691251576453e-05, + "loss": 1.084, + "step": 3919 + }, + { + "epoch": 0.6179952417806059, + "grad_norm": 1.0859375, + "learning_rate": 6.022268861603479e-05, + "loss": 1.2374, + "step": 3920 + }, + { + "epoch": 0.618152893627999, + "grad_norm": 0.8515625, + "learning_rate": 6.0218464800610786e-05, + "loss": 0.9819, + "step": 3921 + }, + { + "epoch": 0.618310545475392, + "grad_norm": 0.96875, + "learning_rate": 6.021424106950146e-05, + "loss": 0.9702, + "step": 3922 + }, + { + "epoch": 0.618468197322785, + "grad_norm": 0.98046875, + "learning_rate": 6.021001742271579e-05, + "loss": 1.0531, + "step": 3923 + }, + { + "epoch": 0.618625849170178, + "grad_norm": 1.0390625, + "learning_rate": 6.020579386026271e-05, + "loss": 1.0509, + "step": 3924 + }, + { + "epoch": 0.6187835010175711, + "grad_norm": 0.9921875, + "learning_rate": 6.0201570382151126e-05, + "loss": 0.8691, + "step": 3925 + }, + { + "epoch": 0.618941152864964, + "grad_norm": 0.96484375, + "learning_rate": 6.019734698839009e-05, + "loss": 1.0757, + "step": 3926 + }, + { + "epoch": 0.619098804712357, + "grad_norm": 1.015625, + "learning_rate": 6.01931236789885e-05, + "loss": 1.1138, + "step": 3927 + }, + { + "epoch": 0.61925645655975, + "grad_norm": 0.93359375, + "learning_rate": 6.018890045395531e-05, + "loss": 1.0332, + "step": 3928 + }, + { + "epoch": 0.6194141084071431, + "grad_norm": 1.046875, + "learning_rate": 6.0184677313299466e-05, + "loss": 1.2614, + "step": 3929 + }, + { + "epoch": 0.6195717602545361, + "grad_norm": 1.0390625, + "learning_rate": 6.018045425702988e-05, + "loss": 0.9975, + "step": 3930 + }, + { + "epoch": 0.6197294121019291, + "grad_norm": 0.95703125, + "learning_rate": 6.017623128515559e-05, + "loss": 0.8931, + "step": 3931 + }, + { + "epoch": 0.6198870639493221, + "grad_norm": 0.91015625, + "learning_rate": 6.017200839768551e-05, + "loss": 0.981, + "step": 3932 + }, + { + "epoch": 0.6200447157967152, + "grad_norm": 0.89453125, + "learning_rate": 6.016778559462857e-05, + "loss": 1.123, + "step": 3933 + }, + { + "epoch": 0.6202023676441081, + "grad_norm": 1.0078125, + "learning_rate": 6.0163562875993726e-05, + "loss": 1.2774, + "step": 3934 + }, + { + "epoch": 0.6203600194915011, + "grad_norm": 1.1796875, + "learning_rate": 6.0159340241789906e-05, + "loss": 1.15, + "step": 3935 + }, + { + "epoch": 0.6205176713388941, + "grad_norm": 0.91015625, + "learning_rate": 6.015511769202612e-05, + "loss": 0.8823, + "step": 3936 + }, + { + "epoch": 0.6206753231862872, + "grad_norm": 0.8515625, + "learning_rate": 6.015089522671129e-05, + "loss": 0.7702, + "step": 3937 + }, + { + "epoch": 0.6208329750336802, + "grad_norm": 0.96484375, + "learning_rate": 6.014667284585436e-05, + "loss": 1.0726, + "step": 3938 + }, + { + "epoch": 0.6209906268810732, + "grad_norm": 0.92578125, + "learning_rate": 6.014245054946428e-05, + "loss": 0.9264, + "step": 3939 + }, + { + "epoch": 0.6211482787284662, + "grad_norm": 0.91015625, + "learning_rate": 6.0138228337549964e-05, + "loss": 1.0241, + "step": 3940 + }, + { + "epoch": 0.6213059305758593, + "grad_norm": 0.96875, + "learning_rate": 6.013400621012043e-05, + "loss": 0.8098, + "step": 3941 + }, + { + "epoch": 0.6214635824232522, + "grad_norm": 0.92578125, + "learning_rate": 6.0129784167184576e-05, + "loss": 0.9866, + "step": 3942 + }, + { + "epoch": 0.6216212342706452, + "grad_norm": 0.93359375, + "learning_rate": 6.012556220875139e-05, + "loss": 1.1052, + "step": 3943 + }, + { + "epoch": 0.6217788861180382, + "grad_norm": 0.9375, + "learning_rate": 6.0121340334829765e-05, + "loss": 1.0257, + "step": 3944 + }, + { + "epoch": 0.6219365379654312, + "grad_norm": 0.96875, + "learning_rate": 6.0117118545428676e-05, + "loss": 1.1729, + "step": 3945 + }, + { + "epoch": 0.6220941898128243, + "grad_norm": 0.87109375, + "learning_rate": 6.011289684055709e-05, + "loss": 0.8727, + "step": 3946 + }, + { + "epoch": 0.6222518416602173, + "grad_norm": 0.90234375, + "learning_rate": 6.010867522022394e-05, + "loss": 1.0254, + "step": 3947 + }, + { + "epoch": 0.6224094935076103, + "grad_norm": 0.859375, + "learning_rate": 6.0104453684438175e-05, + "loss": 1.02, + "step": 3948 + }, + { + "epoch": 0.6225671453550033, + "grad_norm": 0.92578125, + "learning_rate": 6.010023223320874e-05, + "loss": 0.966, + "step": 3949 + }, + { + "epoch": 0.6227247972023963, + "grad_norm": 1.2109375, + "learning_rate": 6.009601086654455e-05, + "loss": 1.4213, + "step": 3950 + }, + { + "epoch": 0.6228824490497893, + "grad_norm": 0.90234375, + "learning_rate": 6.0091789584454613e-05, + "loss": 0.9118, + "step": 3951 + }, + { + "epoch": 0.6230401008971823, + "grad_norm": 0.96875, + "learning_rate": 6.0087568386947855e-05, + "loss": 0.9218, + "step": 3952 + }, + { + "epoch": 0.6231977527445753, + "grad_norm": 1.1484375, + "learning_rate": 6.008334727403322e-05, + "loss": 1.0216, + "step": 3953 + }, + { + "epoch": 0.6233554045919684, + "grad_norm": 0.921875, + "learning_rate": 6.007912624571963e-05, + "loss": 0.9363, + "step": 3954 + }, + { + "epoch": 0.6235130564393614, + "grad_norm": 0.9375, + "learning_rate": 6.0074905302016025e-05, + "loss": 0.9949, + "step": 3955 + }, + { + "epoch": 0.6236707082867544, + "grad_norm": 0.8203125, + "learning_rate": 6.0070684442931414e-05, + "loss": 0.7989, + "step": 3956 + }, + { + "epoch": 0.6238283601341474, + "grad_norm": 0.85546875, + "learning_rate": 6.0066463668474706e-05, + "loss": 0.8889, + "step": 3957 + }, + { + "epoch": 0.6239860119815404, + "grad_norm": 0.921875, + "learning_rate": 6.006224297865484e-05, + "loss": 1.094, + "step": 3958 + }, + { + "epoch": 0.6241436638289334, + "grad_norm": 0.828125, + "learning_rate": 6.0058022373480774e-05, + "loss": 0.7891, + "step": 3959 + }, + { + "epoch": 0.6243013156763264, + "grad_norm": 0.8359375, + "learning_rate": 6.005380185296142e-05, + "loss": 1.0674, + "step": 3960 + }, + { + "epoch": 0.6244589675237194, + "grad_norm": 0.94921875, + "learning_rate": 6.004958141710577e-05, + "loss": 1.0569, + "step": 3961 + }, + { + "epoch": 0.6246166193711125, + "grad_norm": 0.953125, + "learning_rate": 6.004536106592276e-05, + "loss": 0.8703, + "step": 3962 + }, + { + "epoch": 0.6247742712185055, + "grad_norm": 0.90625, + "learning_rate": 6.004114079942131e-05, + "loss": 1.0172, + "step": 3963 + }, + { + "epoch": 0.6249319230658985, + "grad_norm": 0.87890625, + "learning_rate": 6.003692061761037e-05, + "loss": 0.7993, + "step": 3964 + }, + { + "epoch": 0.6250895749132915, + "grad_norm": 1.0625, + "learning_rate": 6.003270052049887e-05, + "loss": 0.958, + "step": 3965 + }, + { + "epoch": 0.6252472267606844, + "grad_norm": 1.0546875, + "learning_rate": 6.002848050809582e-05, + "loss": 1.1425, + "step": 3966 + }, + { + "epoch": 0.6254048786080775, + "grad_norm": 0.94921875, + "learning_rate": 6.002426058041011e-05, + "loss": 0.9984, + "step": 3967 + }, + { + "epoch": 0.6255625304554705, + "grad_norm": 0.98828125, + "learning_rate": 6.00200407374507e-05, + "loss": 1.15, + "step": 3968 + }, + { + "epoch": 0.6257201823028635, + "grad_norm": 1.0, + "learning_rate": 6.001582097922653e-05, + "loss": 1.0224, + "step": 3969 + }, + { + "epoch": 0.6258778341502566, + "grad_norm": 1.0390625, + "learning_rate": 6.00116013057465e-05, + "loss": 1.1592, + "step": 3970 + }, + { + "epoch": 0.6260354859976496, + "grad_norm": 0.87890625, + "learning_rate": 6.000738171701963e-05, + "loss": 0.9591, + "step": 3971 + }, + { + "epoch": 0.6261931378450426, + "grad_norm": 0.94140625, + "learning_rate": 6.000316221305484e-05, + "loss": 1.1682, + "step": 3972 + }, + { + "epoch": 0.6263507896924356, + "grad_norm": 0.80859375, + "learning_rate": 5.9998942793861044e-05, + "loss": 0.8825, + "step": 3973 + }, + { + "epoch": 0.6265084415398285, + "grad_norm": 0.92578125, + "learning_rate": 5.999472345944721e-05, + "loss": 1.0027, + "step": 3974 + }, + { + "epoch": 0.6266660933872216, + "grad_norm": 1.0, + "learning_rate": 5.999050420982224e-05, + "loss": 1.0554, + "step": 3975 + }, + { + "epoch": 0.6268237452346146, + "grad_norm": 1.0234375, + "learning_rate": 5.998628504499515e-05, + "loss": 0.8671, + "step": 3976 + }, + { + "epoch": 0.6269813970820076, + "grad_norm": 0.8359375, + "learning_rate": 5.9982065964974845e-05, + "loss": 0.9443, + "step": 3977 + }, + { + "epoch": 0.6271390489294006, + "grad_norm": 0.9140625, + "learning_rate": 5.997784696977024e-05, + "loss": 1.0919, + "step": 3978 + }, + { + "epoch": 0.6272967007767937, + "grad_norm": 0.9921875, + "learning_rate": 5.997362805939033e-05, + "loss": 1.088, + "step": 3979 + }, + { + "epoch": 0.6274543526241867, + "grad_norm": 0.90234375, + "learning_rate": 5.9969409233843984e-05, + "loss": 0.8665, + "step": 3980 + }, + { + "epoch": 0.6276120044715797, + "grad_norm": 0.94140625, + "learning_rate": 5.996519049314022e-05, + "loss": 1.0393, + "step": 3981 + }, + { + "epoch": 0.6277696563189726, + "grad_norm": 0.9375, + "learning_rate": 5.9960971837287926e-05, + "loss": 0.9798, + "step": 3982 + }, + { + "epoch": 0.6279273081663657, + "grad_norm": 1.0390625, + "learning_rate": 5.9956753266296086e-05, + "loss": 1.0209, + "step": 3983 + }, + { + "epoch": 0.6280849600137587, + "grad_norm": 0.95703125, + "learning_rate": 5.9952534780173606e-05, + "loss": 0.9548, + "step": 3984 + }, + { + "epoch": 0.6282426118611517, + "grad_norm": 0.91015625, + "learning_rate": 5.994831637892944e-05, + "loss": 0.8393, + "step": 3985 + }, + { + "epoch": 0.6284002637085447, + "grad_norm": 0.9453125, + "learning_rate": 5.994409806257253e-05, + "loss": 1.1392, + "step": 3986 + }, + { + "epoch": 0.6285579155559378, + "grad_norm": 0.90234375, + "learning_rate": 5.9939879831111826e-05, + "loss": 0.9246, + "step": 3987 + }, + { + "epoch": 0.6287155674033308, + "grad_norm": 1.109375, + "learning_rate": 5.993566168455624e-05, + "loss": 1.0332, + "step": 3988 + }, + { + "epoch": 0.6288732192507238, + "grad_norm": 0.9765625, + "learning_rate": 5.993144362291474e-05, + "loss": 0.9965, + "step": 3989 + }, + { + "epoch": 0.6290308710981167, + "grad_norm": 0.890625, + "learning_rate": 5.992722564619622e-05, + "loss": 0.9224, + "step": 3990 + }, + { + "epoch": 0.6291885229455098, + "grad_norm": 1.0078125, + "learning_rate": 5.9923007754409686e-05, + "loss": 1.0478, + "step": 3991 + }, + { + "epoch": 0.6293461747929028, + "grad_norm": 1.0078125, + "learning_rate": 5.991878994756406e-05, + "loss": 0.9672, + "step": 3992 + }, + { + "epoch": 0.6295038266402958, + "grad_norm": 1.03125, + "learning_rate": 5.991457222566825e-05, + "loss": 0.8852, + "step": 3993 + }, + { + "epoch": 0.6296614784876888, + "grad_norm": 1.046875, + "learning_rate": 5.991035458873122e-05, + "loss": 1.0203, + "step": 3994 + }, + { + "epoch": 0.6298191303350819, + "grad_norm": 0.90234375, + "learning_rate": 5.990613703676187e-05, + "loss": 0.9439, + "step": 3995 + }, + { + "epoch": 0.6299767821824749, + "grad_norm": 0.90234375, + "learning_rate": 5.99019195697692e-05, + "loss": 0.8503, + "step": 3996 + }, + { + "epoch": 0.6301344340298679, + "grad_norm": 0.96484375, + "learning_rate": 5.9897702187762125e-05, + "loss": 1.176, + "step": 3997 + }, + { + "epoch": 0.6302920858772608, + "grad_norm": 1.0625, + "learning_rate": 5.9893484890749586e-05, + "loss": 0.9993, + "step": 3998 + }, + { + "epoch": 0.6304497377246538, + "grad_norm": 1.0234375, + "learning_rate": 5.98892676787405e-05, + "loss": 0.9875, + "step": 3999 + }, + { + "epoch": 0.6306073895720469, + "grad_norm": 0.9375, + "learning_rate": 5.98850505517438e-05, + "loss": 1.0499, + "step": 4000 + }, + { + "epoch": 0.6306073895720469, + "eval_loss": 0.9967675805091858, + "eval_runtime": 308.9704, + "eval_samples_per_second": 32.366, + "eval_steps_per_second": 0.676, + "step": 4000 + }, + { + "epoch": 0.6307650414194399, + "grad_norm": 0.9609375, + "learning_rate": 5.988083350976845e-05, + "loss": 1.0275, + "step": 4001 + }, + { + "epoch": 0.6309226932668329, + "grad_norm": 0.92578125, + "learning_rate": 5.98766165528234e-05, + "loss": 1.063, + "step": 4002 + }, + { + "epoch": 0.631080345114226, + "grad_norm": 0.90234375, + "learning_rate": 5.987239968091756e-05, + "loss": 0.9901, + "step": 4003 + }, + { + "epoch": 0.631237996961619, + "grad_norm": 0.98046875, + "learning_rate": 5.9868182894059886e-05, + "loss": 0.9539, + "step": 4004 + }, + { + "epoch": 0.631395648809012, + "grad_norm": 0.90234375, + "learning_rate": 5.986396619225927e-05, + "loss": 1.0427, + "step": 4005 + }, + { + "epoch": 0.6315533006564049, + "grad_norm": 0.98828125, + "learning_rate": 5.9859749575524714e-05, + "loss": 1.0904, + "step": 4006 + }, + { + "epoch": 0.6317109525037979, + "grad_norm": 0.94140625, + "learning_rate": 5.985553304386512e-05, + "loss": 0.8973, + "step": 4007 + }, + { + "epoch": 0.631868604351191, + "grad_norm": 0.859375, + "learning_rate": 5.985131659728943e-05, + "loss": 0.8621, + "step": 4008 + }, + { + "epoch": 0.632026256198584, + "grad_norm": 1.0078125, + "learning_rate": 5.984710023580658e-05, + "loss": 1.037, + "step": 4009 + }, + { + "epoch": 0.632183908045977, + "grad_norm": 1.046875, + "learning_rate": 5.984288395942547e-05, + "loss": 1.1111, + "step": 4010 + }, + { + "epoch": 0.63234155989337, + "grad_norm": 0.9921875, + "learning_rate": 5.983866776815511e-05, + "loss": 1.0566, + "step": 4011 + }, + { + "epoch": 0.6324992117407631, + "grad_norm": 0.98046875, + "learning_rate": 5.9834451662004384e-05, + "loss": 1.0283, + "step": 4012 + }, + { + "epoch": 0.6326568635881561, + "grad_norm": 0.83203125, + "learning_rate": 5.983023564098227e-05, + "loss": 0.9869, + "step": 4013 + }, + { + "epoch": 0.632814515435549, + "grad_norm": 0.84765625, + "learning_rate": 5.982601970509766e-05, + "loss": 0.8327, + "step": 4014 + }, + { + "epoch": 0.632972167282942, + "grad_norm": 0.97265625, + "learning_rate": 5.982180385435946e-05, + "loss": 1.019, + "step": 4015 + }, + { + "epoch": 0.6331298191303351, + "grad_norm": 0.921875, + "learning_rate": 5.98175880887767e-05, + "loss": 0.9727, + "step": 4016 + }, + { + "epoch": 0.6332874709777281, + "grad_norm": 0.95703125, + "learning_rate": 5.981337240835826e-05, + "loss": 1.0572, + "step": 4017 + }, + { + "epoch": 0.6334451228251211, + "grad_norm": 0.87890625, + "learning_rate": 5.9809156813113075e-05, + "loss": 1.0114, + "step": 4018 + }, + { + "epoch": 0.6336027746725141, + "grad_norm": 1.0078125, + "learning_rate": 5.98049413030501e-05, + "loss": 0.9712, + "step": 4019 + }, + { + "epoch": 0.6337604265199072, + "grad_norm": 0.9453125, + "learning_rate": 5.98007258781782e-05, + "loss": 1.0276, + "step": 4020 + }, + { + "epoch": 0.6339180783673002, + "grad_norm": 1.3515625, + "learning_rate": 5.979651053850641e-05, + "loss": 1.2735, + "step": 4021 + }, + { + "epoch": 0.6340757302146931, + "grad_norm": 0.93359375, + "learning_rate": 5.979229528404362e-05, + "loss": 1.1118, + "step": 4022 + }, + { + "epoch": 0.6342333820620861, + "grad_norm": 0.94140625, + "learning_rate": 5.9788080114798765e-05, + "loss": 1.0267, + "step": 4023 + }, + { + "epoch": 0.6343910339094792, + "grad_norm": 0.96484375, + "learning_rate": 5.9783865030780764e-05, + "loss": 1.0634, + "step": 4024 + }, + { + "epoch": 0.6345486857568722, + "grad_norm": 0.87890625, + "learning_rate": 5.977965003199855e-05, + "loss": 1.0741, + "step": 4025 + }, + { + "epoch": 0.6347063376042652, + "grad_norm": 0.85546875, + "learning_rate": 5.977543511846109e-05, + "loss": 1.0265, + "step": 4026 + }, + { + "epoch": 0.6348639894516582, + "grad_norm": 1.203125, + "learning_rate": 5.977122029017729e-05, + "loss": 1.4199, + "step": 4027 + }, + { + "epoch": 0.6350216412990513, + "grad_norm": 0.82421875, + "learning_rate": 5.976700554715608e-05, + "loss": 0.8776, + "step": 4028 + }, + { + "epoch": 0.6351792931464443, + "grad_norm": 0.84765625, + "learning_rate": 5.976279088940643e-05, + "loss": 0.9833, + "step": 4029 + }, + { + "epoch": 0.6353369449938372, + "grad_norm": 1.0078125, + "learning_rate": 5.9758576316937186e-05, + "loss": 0.9782, + "step": 4030 + }, + { + "epoch": 0.6354945968412302, + "grad_norm": 0.88671875, + "learning_rate": 5.975436182975739e-05, + "loss": 0.9362, + "step": 4031 + }, + { + "epoch": 0.6356522486886232, + "grad_norm": 1.015625, + "learning_rate": 5.975014742787591e-05, + "loss": 1.1218, + "step": 4032 + }, + { + "epoch": 0.6358099005360163, + "grad_norm": 1.078125, + "learning_rate": 5.97459331113017e-05, + "loss": 1.0512, + "step": 4033 + }, + { + "epoch": 0.6359675523834093, + "grad_norm": 0.95703125, + "learning_rate": 5.9741718880043685e-05, + "loss": 1.0428, + "step": 4034 + }, + { + "epoch": 0.6361252042308023, + "grad_norm": 0.9296875, + "learning_rate": 5.973750473411075e-05, + "loss": 0.87, + "step": 4035 + }, + { + "epoch": 0.6362828560781953, + "grad_norm": 0.8359375, + "learning_rate": 5.9733290673511925e-05, + "loss": 0.9724, + "step": 4036 + }, + { + "epoch": 0.6364405079255884, + "grad_norm": 0.9609375, + "learning_rate": 5.972907669825607e-05, + "loss": 0.8214, + "step": 4037 + }, + { + "epoch": 0.6365981597729813, + "grad_norm": 0.96875, + "learning_rate": 5.972486280835214e-05, + "loss": 1.0852, + "step": 4038 + }, + { + "epoch": 0.6367558116203743, + "grad_norm": 1.2890625, + "learning_rate": 5.972064900380907e-05, + "loss": 1.1032, + "step": 4039 + }, + { + "epoch": 0.6369134634677673, + "grad_norm": 0.7734375, + "learning_rate": 5.971643528463575e-05, + "loss": 0.8922, + "step": 4040 + }, + { + "epoch": 0.6370711153151604, + "grad_norm": 0.98046875, + "learning_rate": 5.971222165084116e-05, + "loss": 0.7775, + "step": 4041 + }, + { + "epoch": 0.6372287671625534, + "grad_norm": 0.96484375, + "learning_rate": 5.970800810243423e-05, + "loss": 0.9469, + "step": 4042 + }, + { + "epoch": 0.6373864190099464, + "grad_norm": 0.984375, + "learning_rate": 5.970379463942387e-05, + "loss": 1.0401, + "step": 4043 + }, + { + "epoch": 0.6375440708573394, + "grad_norm": 0.9375, + "learning_rate": 5.969958126181902e-05, + "loss": 1.0152, + "step": 4044 + }, + { + "epoch": 0.6377017227047325, + "grad_norm": 0.921875, + "learning_rate": 5.969536796962858e-05, + "loss": 0.9256, + "step": 4045 + }, + { + "epoch": 0.6378593745521254, + "grad_norm": 0.9296875, + "learning_rate": 5.969115476286151e-05, + "loss": 1.1293, + "step": 4046 + }, + { + "epoch": 0.6380170263995184, + "grad_norm": 0.98046875, + "learning_rate": 5.968694164152675e-05, + "loss": 1.2149, + "step": 4047 + }, + { + "epoch": 0.6381746782469114, + "grad_norm": 1.171875, + "learning_rate": 5.96827286056332e-05, + "loss": 1.0217, + "step": 4048 + }, + { + "epoch": 0.6383323300943045, + "grad_norm": 0.93359375, + "learning_rate": 5.9678515655189806e-05, + "loss": 1.0468, + "step": 4049 + }, + { + "epoch": 0.6384899819416975, + "grad_norm": 0.93359375, + "learning_rate": 5.967430279020547e-05, + "loss": 0.9772, + "step": 4050 + }, + { + "epoch": 0.6386476337890905, + "grad_norm": 0.88671875, + "learning_rate": 5.967009001068917e-05, + "loss": 0.9117, + "step": 4051 + }, + { + "epoch": 0.6388052856364835, + "grad_norm": 0.90234375, + "learning_rate": 5.966587731664981e-05, + "loss": 1.0482, + "step": 4052 + }, + { + "epoch": 0.6389629374838766, + "grad_norm": 0.9453125, + "learning_rate": 5.966166470809631e-05, + "loss": 0.9544, + "step": 4053 + }, + { + "epoch": 0.6391205893312695, + "grad_norm": 0.96484375, + "learning_rate": 5.965745218503762e-05, + "loss": 1.1745, + "step": 4054 + }, + { + "epoch": 0.6392782411786625, + "grad_norm": 0.921875, + "learning_rate": 5.96532397474826e-05, + "loss": 0.8989, + "step": 4055 + }, + { + "epoch": 0.6394358930260555, + "grad_norm": 1.0234375, + "learning_rate": 5.964902739544029e-05, + "loss": 0.9273, + "step": 4056 + }, + { + "epoch": 0.6395935448734485, + "grad_norm": 0.87890625, + "learning_rate": 5.964481512891955e-05, + "loss": 1.074, + "step": 4057 + }, + { + "epoch": 0.6397511967208416, + "grad_norm": 0.93359375, + "learning_rate": 5.964060294792933e-05, + "loss": 1.0031, + "step": 4058 + }, + { + "epoch": 0.6399088485682346, + "grad_norm": 1.0703125, + "learning_rate": 5.963639085247852e-05, + "loss": 0.898, + "step": 4059 + }, + { + "epoch": 0.6400665004156276, + "grad_norm": 1.0546875, + "learning_rate": 5.963217884257606e-05, + "loss": 1.1751, + "step": 4060 + }, + { + "epoch": 0.6402241522630207, + "grad_norm": 0.97265625, + "learning_rate": 5.962796691823092e-05, + "loss": 1.065, + "step": 4061 + }, + { + "epoch": 0.6403818041104136, + "grad_norm": 1.0, + "learning_rate": 5.962375507945198e-05, + "loss": 1.0746, + "step": 4062 + }, + { + "epoch": 0.6405394559578066, + "grad_norm": 0.8359375, + "learning_rate": 5.9619543326248196e-05, + "loss": 1.0416, + "step": 4063 + }, + { + "epoch": 0.6406971078051996, + "grad_norm": 0.90234375, + "learning_rate": 5.961533165862848e-05, + "loss": 0.9557, + "step": 4064 + }, + { + "epoch": 0.6408547596525926, + "grad_norm": 0.8046875, + "learning_rate": 5.961112007660177e-05, + "loss": 0.8417, + "step": 4065 + }, + { + "epoch": 0.6410124114999857, + "grad_norm": 0.921875, + "learning_rate": 5.960690858017695e-05, + "loss": 1.0058, + "step": 4066 + }, + { + "epoch": 0.6411700633473787, + "grad_norm": 1.0, + "learning_rate": 5.9602697169363e-05, + "loss": 1.0221, + "step": 4067 + }, + { + "epoch": 0.6413277151947717, + "grad_norm": 1.03125, + "learning_rate": 5.9598485844168805e-05, + "loss": 0.9815, + "step": 4068 + }, + { + "epoch": 0.6414853670421647, + "grad_norm": 0.94140625, + "learning_rate": 5.959427460460334e-05, + "loss": 0.808, + "step": 4069 + }, + { + "epoch": 0.6416430188895578, + "grad_norm": 0.9765625, + "learning_rate": 5.959006345067548e-05, + "loss": 1.0907, + "step": 4070 + }, + { + "epoch": 0.6418006707369507, + "grad_norm": 0.984375, + "learning_rate": 5.958585238239418e-05, + "loss": 1.153, + "step": 4071 + }, + { + "epoch": 0.6419583225843437, + "grad_norm": 0.828125, + "learning_rate": 5.958164139976835e-05, + "loss": 0.9838, + "step": 4072 + }, + { + "epoch": 0.6421159744317367, + "grad_norm": 0.98828125, + "learning_rate": 5.957743050280693e-05, + "loss": 1.2096, + "step": 4073 + }, + { + "epoch": 0.6422736262791298, + "grad_norm": 0.91796875, + "learning_rate": 5.9573219691518836e-05, + "loss": 1.0842, + "step": 4074 + }, + { + "epoch": 0.6424312781265228, + "grad_norm": 0.94921875, + "learning_rate": 5.9569008965913e-05, + "loss": 1.0536, + "step": 4075 + }, + { + "epoch": 0.6425889299739158, + "grad_norm": 1.0078125, + "learning_rate": 5.9564798325998286e-05, + "loss": 0.8884, + "step": 4076 + }, + { + "epoch": 0.6427465818213088, + "grad_norm": 0.80078125, + "learning_rate": 5.956058777178372e-05, + "loss": 0.8034, + "step": 4077 + }, + { + "epoch": 0.6429042336687019, + "grad_norm": 0.89453125, + "learning_rate": 5.955637730327816e-05, + "loss": 1.0051, + "step": 4078 + }, + { + "epoch": 0.6430618855160948, + "grad_norm": 0.95703125, + "learning_rate": 5.955216692049057e-05, + "loss": 0.9429, + "step": 4079 + }, + { + "epoch": 0.6432195373634878, + "grad_norm": 0.90625, + "learning_rate": 5.954795662342985e-05, + "loss": 1.0221, + "step": 4080 + }, + { + "epoch": 0.6433771892108808, + "grad_norm": 0.96875, + "learning_rate": 5.954374641210486e-05, + "loss": 0.9418, + "step": 4081 + }, + { + "epoch": 0.6435348410582739, + "grad_norm": 0.87109375, + "learning_rate": 5.953953628652464e-05, + "loss": 0.951, + "step": 4082 + }, + { + "epoch": 0.6436924929056669, + "grad_norm": 0.94921875, + "learning_rate": 5.9535326246698065e-05, + "loss": 1.1242, + "step": 4083 + }, + { + "epoch": 0.6438501447530599, + "grad_norm": 0.93359375, + "learning_rate": 5.953111629263405e-05, + "loss": 1.083, + "step": 4084 + }, + { + "epoch": 0.6440077966004529, + "grad_norm": 0.90625, + "learning_rate": 5.952690642434151e-05, + "loss": 0.9453, + "step": 4085 + }, + { + "epoch": 0.644165448447846, + "grad_norm": 0.859375, + "learning_rate": 5.9522696641829344e-05, + "loss": 0.8229, + "step": 4086 + }, + { + "epoch": 0.6443231002952389, + "grad_norm": 1.0, + "learning_rate": 5.951848694510655e-05, + "loss": 1.0224, + "step": 4087 + }, + { + "epoch": 0.6444807521426319, + "grad_norm": 0.98046875, + "learning_rate": 5.951427733418202e-05, + "loss": 1.2275, + "step": 4088 + }, + { + "epoch": 0.6446384039900249, + "grad_norm": 0.953125, + "learning_rate": 5.951006780906466e-05, + "loss": 1.051, + "step": 4089 + }, + { + "epoch": 0.644796055837418, + "grad_norm": 1.109375, + "learning_rate": 5.950585836976339e-05, + "loss": 0.8892, + "step": 4090 + }, + { + "epoch": 0.644953707684811, + "grad_norm": 0.9453125, + "learning_rate": 5.950164901628713e-05, + "loss": 0.9013, + "step": 4091 + }, + { + "epoch": 0.645111359532204, + "grad_norm": 0.91796875, + "learning_rate": 5.949743974864482e-05, + "loss": 0.9337, + "step": 4092 + }, + { + "epoch": 0.645269011379597, + "grad_norm": 0.98046875, + "learning_rate": 5.949323056684537e-05, + "loss": 1.0974, + "step": 4093 + }, + { + "epoch": 0.64542666322699, + "grad_norm": 0.890625, + "learning_rate": 5.948902147089772e-05, + "loss": 0.8279, + "step": 4094 + }, + { + "epoch": 0.645584315074383, + "grad_norm": 0.9296875, + "learning_rate": 5.9484812460810754e-05, + "loss": 0.8438, + "step": 4095 + }, + { + "epoch": 0.645741966921776, + "grad_norm": 0.9609375, + "learning_rate": 5.948060353659338e-05, + "loss": 1.1735, + "step": 4096 + }, + { + "epoch": 0.645899618769169, + "grad_norm": 0.9921875, + "learning_rate": 5.947639469825459e-05, + "loss": 1.1641, + "step": 4097 + }, + { + "epoch": 0.646057270616562, + "grad_norm": 0.984375, + "learning_rate": 5.947218594580326e-05, + "loss": 0.9519, + "step": 4098 + }, + { + "epoch": 0.6462149224639551, + "grad_norm": 0.92578125, + "learning_rate": 5.9467977279248336e-05, + "loss": 0.9406, + "step": 4099 + }, + { + "epoch": 0.6463725743113481, + "grad_norm": 1.03125, + "learning_rate": 5.946376869859869e-05, + "loss": 1.1576, + "step": 4100 + }, + { + "epoch": 0.6465302261587411, + "grad_norm": 0.859375, + "learning_rate": 5.945956020386324e-05, + "loss": 0.8623, + "step": 4101 + }, + { + "epoch": 0.6466878780061341, + "grad_norm": 0.96875, + "learning_rate": 5.9455351795050975e-05, + "loss": 1.0275, + "step": 4102 + }, + { + "epoch": 0.6468455298535271, + "grad_norm": 0.8984375, + "learning_rate": 5.945114347217078e-05, + "loss": 0.923, + "step": 4103 + }, + { + "epoch": 0.6470031817009201, + "grad_norm": 1.0078125, + "learning_rate": 5.944693523523157e-05, + "loss": 1.1688, + "step": 4104 + }, + { + "epoch": 0.6471608335483131, + "grad_norm": 0.859375, + "learning_rate": 5.944272708424226e-05, + "loss": 0.9876, + "step": 4105 + }, + { + "epoch": 0.6473184853957061, + "grad_norm": 0.99609375, + "learning_rate": 5.943851901921171e-05, + "loss": 1.0504, + "step": 4106 + }, + { + "epoch": 0.6474761372430992, + "grad_norm": 0.92578125, + "learning_rate": 5.943431104014897e-05, + "loss": 1.22, + "step": 4107 + }, + { + "epoch": 0.6476337890904922, + "grad_norm": 0.9375, + "learning_rate": 5.943010314706288e-05, + "loss": 1.0781, + "step": 4108 + }, + { + "epoch": 0.6477914409378852, + "grad_norm": 0.9765625, + "learning_rate": 5.942589533996237e-05, + "loss": 1.1588, + "step": 4109 + }, + { + "epoch": 0.6479490927852782, + "grad_norm": 0.9140625, + "learning_rate": 5.942168761885635e-05, + "loss": 0.7994, + "step": 4110 + }, + { + "epoch": 0.6481067446326712, + "grad_norm": 0.98828125, + "learning_rate": 5.941747998375372e-05, + "loss": 0.9258, + "step": 4111 + }, + { + "epoch": 0.6482643964800642, + "grad_norm": 0.9921875, + "learning_rate": 5.941327243466345e-05, + "loss": 0.975, + "step": 4112 + }, + { + "epoch": 0.6484220483274572, + "grad_norm": 1.71875, + "learning_rate": 5.940906497159442e-05, + "loss": 1.055, + "step": 4113 + }, + { + "epoch": 0.6485797001748502, + "grad_norm": 0.94921875, + "learning_rate": 5.940485759455556e-05, + "loss": 0.8806, + "step": 4114 + }, + { + "epoch": 0.6487373520222433, + "grad_norm": 0.96484375, + "learning_rate": 5.94006503035558e-05, + "loss": 1.0791, + "step": 4115 + }, + { + "epoch": 0.6488950038696363, + "grad_norm": 0.97265625, + "learning_rate": 5.939644309860398e-05, + "loss": 0.8534, + "step": 4116 + }, + { + "epoch": 0.6490526557170293, + "grad_norm": 0.9140625, + "learning_rate": 5.939223597970913e-05, + "loss": 0.8935, + "step": 4117 + }, + { + "epoch": 0.6492103075644223, + "grad_norm": 0.91796875, + "learning_rate": 5.938802894688011e-05, + "loss": 0.9772, + "step": 4118 + }, + { + "epoch": 0.6493679594118152, + "grad_norm": 0.9765625, + "learning_rate": 5.938382200012584e-05, + "loss": 1.1068, + "step": 4119 + }, + { + "epoch": 0.6495256112592083, + "grad_norm": 0.9609375, + "learning_rate": 5.9379615139455244e-05, + "loss": 0.975, + "step": 4120 + }, + { + "epoch": 0.6496832631066013, + "grad_norm": 0.984375, + "learning_rate": 5.9375408364877185e-05, + "loss": 1.0221, + "step": 4121 + }, + { + "epoch": 0.6498409149539943, + "grad_norm": 0.8828125, + "learning_rate": 5.937120167640067e-05, + "loss": 0.8946, + "step": 4122 + }, + { + "epoch": 0.6499985668013873, + "grad_norm": 0.8828125, + "learning_rate": 5.936699507403458e-05, + "loss": 0.9541, + "step": 4123 + }, + { + "epoch": 0.6501562186487804, + "grad_norm": 0.890625, + "learning_rate": 5.9362788557787805e-05, + "loss": 0.9226, + "step": 4124 + }, + { + "epoch": 0.6503138704961734, + "grad_norm": 0.97265625, + "learning_rate": 5.935858212766928e-05, + "loss": 0.9563, + "step": 4125 + }, + { + "epoch": 0.6504715223435664, + "grad_norm": 0.87890625, + "learning_rate": 5.935437578368787e-05, + "loss": 0.8202, + "step": 4126 + }, + { + "epoch": 0.6506291741909593, + "grad_norm": 1.0078125, + "learning_rate": 5.93501695258526e-05, + "loss": 1.003, + "step": 4127 + }, + { + "epoch": 0.6507868260383524, + "grad_norm": 0.875, + "learning_rate": 5.934596335417231e-05, + "loss": 0.9557, + "step": 4128 + }, + { + "epoch": 0.6509444778857454, + "grad_norm": 0.8984375, + "learning_rate": 5.934175726865593e-05, + "loss": 0.9531, + "step": 4129 + }, + { + "epoch": 0.6511021297331384, + "grad_norm": 0.83203125, + "learning_rate": 5.9337551269312376e-05, + "loss": 1.057, + "step": 4130 + }, + { + "epoch": 0.6512597815805314, + "grad_norm": 0.8984375, + "learning_rate": 5.933334535615051e-05, + "loss": 1.0511, + "step": 4131 + }, + { + "epoch": 0.6514174334279245, + "grad_norm": 0.99609375, + "learning_rate": 5.9329139529179356e-05, + "loss": 1.1209, + "step": 4132 + }, + { + "epoch": 0.6515750852753175, + "grad_norm": 0.87890625, + "learning_rate": 5.932493378840773e-05, + "loss": 1.023, + "step": 4133 + }, + { + "epoch": 0.6517327371227105, + "grad_norm": 0.91015625, + "learning_rate": 5.932072813384461e-05, + "loss": 1.0014, + "step": 4134 + }, + { + "epoch": 0.6518903889701034, + "grad_norm": 0.890625, + "learning_rate": 5.931652256549885e-05, + "loss": 0.9755, + "step": 4135 + }, + { + "epoch": 0.6520480408174965, + "grad_norm": 0.9375, + "learning_rate": 5.931231708337939e-05, + "loss": 0.928, + "step": 4136 + }, + { + "epoch": 0.6522056926648895, + "grad_norm": 1.0234375, + "learning_rate": 5.930811168749517e-05, + "loss": 1.1616, + "step": 4137 + }, + { + "epoch": 0.6523633445122825, + "grad_norm": 0.91796875, + "learning_rate": 5.930390637785508e-05, + "loss": 1.1084, + "step": 4138 + }, + { + "epoch": 0.6525209963596755, + "grad_norm": 0.8828125, + "learning_rate": 5.929970115446803e-05, + "loss": 0.9586, + "step": 4139 + }, + { + "epoch": 0.6526786482070686, + "grad_norm": 1.1015625, + "learning_rate": 5.9295496017342956e-05, + "loss": 1.2771, + "step": 4140 + }, + { + "epoch": 0.6528363000544616, + "grad_norm": 0.9765625, + "learning_rate": 5.92912909664887e-05, + "loss": 1.1735, + "step": 4141 + }, + { + "epoch": 0.6529939519018546, + "grad_norm": 0.90625, + "learning_rate": 5.928708600191426e-05, + "loss": 0.8799, + "step": 4142 + }, + { + "epoch": 0.6531516037492475, + "grad_norm": 1.0625, + "learning_rate": 5.9282881123628534e-05, + "loss": 1.056, + "step": 4143 + }, + { + "epoch": 0.6533092555966405, + "grad_norm": 0.9296875, + "learning_rate": 5.9278676331640395e-05, + "loss": 0.9063, + "step": 4144 + }, + { + "epoch": 0.6534669074440336, + "grad_norm": 0.87109375, + "learning_rate": 5.927447162595878e-05, + "loss": 1.0187, + "step": 4145 + }, + { + "epoch": 0.6536245592914266, + "grad_norm": 1.0, + "learning_rate": 5.927026700659255e-05, + "loss": 0.9661, + "step": 4146 + }, + { + "epoch": 0.6537822111388196, + "grad_norm": 0.8203125, + "learning_rate": 5.926606247355071e-05, + "loss": 0.9684, + "step": 4147 + }, + { + "epoch": 0.6539398629862126, + "grad_norm": 0.9921875, + "learning_rate": 5.926185802684212e-05, + "loss": 1.0104, + "step": 4148 + }, + { + "epoch": 0.6540975148336057, + "grad_norm": 0.99609375, + "learning_rate": 5.925765366647569e-05, + "loss": 1.0866, + "step": 4149 + }, + { + "epoch": 0.6542551666809987, + "grad_norm": 0.86328125, + "learning_rate": 5.9253449392460334e-05, + "loss": 0.867, + "step": 4150 + }, + { + "epoch": 0.6544128185283916, + "grad_norm": 0.9296875, + "learning_rate": 5.924924520480495e-05, + "loss": 0.8596, + "step": 4151 + }, + { + "epoch": 0.6545704703757846, + "grad_norm": 0.8671875, + "learning_rate": 5.924504110351847e-05, + "loss": 0.8642, + "step": 4152 + }, + { + "epoch": 0.6547281222231777, + "grad_norm": 0.94140625, + "learning_rate": 5.924083708860979e-05, + "loss": 0.9368, + "step": 4153 + }, + { + "epoch": 0.6548857740705707, + "grad_norm": 1.0859375, + "learning_rate": 5.923663316008783e-05, + "loss": 1.1633, + "step": 4154 + }, + { + "epoch": 0.6550434259179637, + "grad_norm": 0.9453125, + "learning_rate": 5.9232429317961516e-05, + "loss": 0.9412, + "step": 4155 + }, + { + "epoch": 0.6552010777653567, + "grad_norm": 0.92578125, + "learning_rate": 5.9228225562239706e-05, + "loss": 0.9711, + "step": 4156 + }, + { + "epoch": 0.6553587296127498, + "grad_norm": 1.0859375, + "learning_rate": 5.922402189293136e-05, + "loss": 1.1707, + "step": 4157 + }, + { + "epoch": 0.6555163814601428, + "grad_norm": 0.8359375, + "learning_rate": 5.921981831004537e-05, + "loss": 0.8617, + "step": 4158 + }, + { + "epoch": 0.6556740333075357, + "grad_norm": 0.875, + "learning_rate": 5.921561481359064e-05, + "loss": 1.2367, + "step": 4159 + }, + { + "epoch": 0.6558316851549287, + "grad_norm": 0.9453125, + "learning_rate": 5.921141140357609e-05, + "loss": 0.9554, + "step": 4160 + }, + { + "epoch": 0.6559893370023218, + "grad_norm": 1.015625, + "learning_rate": 5.920720808001059e-05, + "loss": 0.8776, + "step": 4161 + }, + { + "epoch": 0.6561469888497148, + "grad_norm": 0.93359375, + "learning_rate": 5.920300484290311e-05, + "loss": 0.835, + "step": 4162 + }, + { + "epoch": 0.6563046406971078, + "grad_norm": 1.0234375, + "learning_rate": 5.919880169226253e-05, + "loss": 1.2206, + "step": 4163 + }, + { + "epoch": 0.6564622925445008, + "grad_norm": 0.91796875, + "learning_rate": 5.919459862809775e-05, + "loss": 0.9594, + "step": 4164 + }, + { + "epoch": 0.6566199443918939, + "grad_norm": 0.9453125, + "learning_rate": 5.9190395650417696e-05, + "loss": 0.9661, + "step": 4165 + }, + { + "epoch": 0.6567775962392869, + "grad_norm": 0.94140625, + "learning_rate": 5.918619275923123e-05, + "loss": 1.111, + "step": 4166 + }, + { + "epoch": 0.6569352480866798, + "grad_norm": 0.91796875, + "learning_rate": 5.9181989954547334e-05, + "loss": 0.9967, + "step": 4167 + }, + { + "epoch": 0.6570928999340728, + "grad_norm": 1.0390625, + "learning_rate": 5.917778723637486e-05, + "loss": 1.1511, + "step": 4168 + }, + { + "epoch": 0.6572505517814659, + "grad_norm": 0.94140625, + "learning_rate": 5.9173584604722755e-05, + "loss": 0.8744, + "step": 4169 + }, + { + "epoch": 0.6574082036288589, + "grad_norm": 0.9453125, + "learning_rate": 5.9169382059599895e-05, + "loss": 0.9771, + "step": 4170 + }, + { + "epoch": 0.6575658554762519, + "grad_norm": 1.140625, + "learning_rate": 5.916517960101517e-05, + "loss": 1.1534, + "step": 4171 + }, + { + "epoch": 0.6577235073236449, + "grad_norm": 0.92578125, + "learning_rate": 5.9160977228977524e-05, + "loss": 1.0715, + "step": 4172 + }, + { + "epoch": 0.657881159171038, + "grad_norm": 1.046875, + "learning_rate": 5.915677494349586e-05, + "loss": 1.0406, + "step": 4173 + }, + { + "epoch": 0.658038811018431, + "grad_norm": 1.03125, + "learning_rate": 5.9152572744579085e-05, + "loss": 0.971, + "step": 4174 + }, + { + "epoch": 0.6581964628658239, + "grad_norm": 1.1015625, + "learning_rate": 5.914837063223611e-05, + "loss": 1.065, + "step": 4175 + }, + { + "epoch": 0.6583541147132169, + "grad_norm": 0.96484375, + "learning_rate": 5.914416860647578e-05, + "loss": 1.0131, + "step": 4176 + }, + { + "epoch": 0.65851176656061, + "grad_norm": 0.8671875, + "learning_rate": 5.913996666730709e-05, + "loss": 0.8314, + "step": 4177 + }, + { + "epoch": 0.658669418408003, + "grad_norm": 0.98046875, + "learning_rate": 5.91357648147389e-05, + "loss": 1.1401, + "step": 4178 + }, + { + "epoch": 0.658827070255396, + "grad_norm": 0.8828125, + "learning_rate": 5.913156304878011e-05, + "loss": 0.8032, + "step": 4179 + }, + { + "epoch": 0.658984722102789, + "grad_norm": 0.94921875, + "learning_rate": 5.912736136943966e-05, + "loss": 0.9449, + "step": 4180 + }, + { + "epoch": 0.659142373950182, + "grad_norm": 0.90234375, + "learning_rate": 5.912315977672638e-05, + "loss": 0.8813, + "step": 4181 + }, + { + "epoch": 0.6593000257975751, + "grad_norm": 1.0, + "learning_rate": 5.911895827064926e-05, + "loss": 0.9157, + "step": 4182 + }, + { + "epoch": 0.659457677644968, + "grad_norm": 0.921875, + "learning_rate": 5.9114756851217166e-05, + "loss": 1.0107, + "step": 4183 + }, + { + "epoch": 0.659615329492361, + "grad_norm": 0.875, + "learning_rate": 5.9110555518439024e-05, + "loss": 0.7929, + "step": 4184 + }, + { + "epoch": 0.659772981339754, + "grad_norm": 1.0078125, + "learning_rate": 5.9106354272323716e-05, + "loss": 1.0335, + "step": 4185 + }, + { + "epoch": 0.6599306331871471, + "grad_norm": 1.1328125, + "learning_rate": 5.910215311288012e-05, + "loss": 1.3282, + "step": 4186 + }, + { + "epoch": 0.6600882850345401, + "grad_norm": 0.81640625, + "learning_rate": 5.90979520401172e-05, + "loss": 0.7694, + "step": 4187 + }, + { + "epoch": 0.6602459368819331, + "grad_norm": 1.203125, + "learning_rate": 5.9093751054043845e-05, + "loss": 1.2487, + "step": 4188 + }, + { + "epoch": 0.6604035887293261, + "grad_norm": 0.984375, + "learning_rate": 5.9089550154668935e-05, + "loss": 1.0161, + "step": 4189 + }, + { + "epoch": 0.6605612405767192, + "grad_norm": 0.91015625, + "learning_rate": 5.9085349342001386e-05, + "loss": 0.9972, + "step": 4190 + }, + { + "epoch": 0.6607188924241121, + "grad_norm": 1.328125, + "learning_rate": 5.9081148616050075e-05, + "loss": 0.9767, + "step": 4191 + }, + { + "epoch": 0.6608765442715051, + "grad_norm": 0.9609375, + "learning_rate": 5.907694797682395e-05, + "loss": 1.094, + "step": 4192 + }, + { + "epoch": 0.6610341961188981, + "grad_norm": 0.91015625, + "learning_rate": 5.907274742433191e-05, + "loss": 0.9247, + "step": 4193 + }, + { + "epoch": 0.6611918479662912, + "grad_norm": 0.890625, + "learning_rate": 5.906854695858284e-05, + "loss": 0.9399, + "step": 4194 + }, + { + "epoch": 0.6613494998136842, + "grad_norm": 0.9921875, + "learning_rate": 5.906434657958564e-05, + "loss": 1.0467, + "step": 4195 + }, + { + "epoch": 0.6615071516610772, + "grad_norm": 0.89453125, + "learning_rate": 5.906014628734919e-05, + "loss": 1.1172, + "step": 4196 + }, + { + "epoch": 0.6616648035084702, + "grad_norm": 0.984375, + "learning_rate": 5.905594608188246e-05, + "loss": 0.9914, + "step": 4197 + }, + { + "epoch": 0.6618224553558633, + "grad_norm": 1.046875, + "learning_rate": 5.9051745963194306e-05, + "loss": 0.8661, + "step": 4198 + }, + { + "epoch": 0.6619801072032562, + "grad_norm": 1.9609375, + "learning_rate": 5.9047545931293626e-05, + "loss": 0.9653, + "step": 4199 + }, + { + "epoch": 0.6621377590506492, + "grad_norm": 1.0, + "learning_rate": 5.904334598618935e-05, + "loss": 0.9585, + "step": 4200 + }, + { + "epoch": 0.6622954108980422, + "grad_norm": 0.81640625, + "learning_rate": 5.90391461278903e-05, + "loss": 0.8253, + "step": 4201 + }, + { + "epoch": 0.6624530627454353, + "grad_norm": 0.96484375, + "learning_rate": 5.9034946356405496e-05, + "loss": 0.9384, + "step": 4202 + }, + { + "epoch": 0.6626107145928283, + "grad_norm": 0.9609375, + "learning_rate": 5.903074667174378e-05, + "loss": 0.9379, + "step": 4203 + }, + { + "epoch": 0.6627683664402213, + "grad_norm": 0.9765625, + "learning_rate": 5.902654707391404e-05, + "loss": 1.0177, + "step": 4204 + }, + { + "epoch": 0.6629260182876143, + "grad_norm": 0.92578125, + "learning_rate": 5.9022347562925196e-05, + "loss": 1.0832, + "step": 4205 + }, + { + "epoch": 0.6630836701350074, + "grad_norm": 1.0390625, + "learning_rate": 5.9018148138786146e-05, + "loss": 1.0953, + "step": 4206 + }, + { + "epoch": 0.6632413219824003, + "grad_norm": 0.9140625, + "learning_rate": 5.9013948801505745e-05, + "loss": 1.0732, + "step": 4207 + }, + { + "epoch": 0.6633989738297933, + "grad_norm": 0.93359375, + "learning_rate": 5.9009749551093e-05, + "loss": 1.1342, + "step": 4208 + }, + { + "epoch": 0.6635566256771863, + "grad_norm": 0.98046875, + "learning_rate": 5.9005550387556726e-05, + "loss": 1.0695, + "step": 4209 + }, + { + "epoch": 0.6637142775245793, + "grad_norm": 0.9453125, + "learning_rate": 5.900135131090585e-05, + "loss": 0.7862, + "step": 4210 + }, + { + "epoch": 0.6638719293719724, + "grad_norm": 0.8984375, + "learning_rate": 5.899715232114926e-05, + "loss": 0.7955, + "step": 4211 + }, + { + "epoch": 0.6640295812193654, + "grad_norm": 0.8984375, + "learning_rate": 5.8992953418295824e-05, + "loss": 0.9941, + "step": 4212 + }, + { + "epoch": 0.6641872330667584, + "grad_norm": 0.9140625, + "learning_rate": 5.898875460235453e-05, + "loss": 0.9664, + "step": 4213 + }, + { + "epoch": 0.6643448849141514, + "grad_norm": 0.9140625, + "learning_rate": 5.898455587333422e-05, + "loss": 0.9722, + "step": 4214 + }, + { + "epoch": 0.6645025367615444, + "grad_norm": 1.0234375, + "learning_rate": 5.89803572312438e-05, + "loss": 1.1612, + "step": 4215 + }, + { + "epoch": 0.6646601886089374, + "grad_norm": 0.85546875, + "learning_rate": 5.897615867609216e-05, + "loss": 0.9339, + "step": 4216 + }, + { + "epoch": 0.6648178404563304, + "grad_norm": 0.96484375, + "learning_rate": 5.89719602078882e-05, + "loss": 1.0697, + "step": 4217 + }, + { + "epoch": 0.6649754923037234, + "grad_norm": 0.94921875, + "learning_rate": 5.896776182664084e-05, + "loss": 0.8771, + "step": 4218 + }, + { + "epoch": 0.6651331441511165, + "grad_norm": 0.90234375, + "learning_rate": 5.8963563532358965e-05, + "loss": 1.1059, + "step": 4219 + }, + { + "epoch": 0.6652907959985095, + "grad_norm": 0.9921875, + "learning_rate": 5.895936532505145e-05, + "loss": 0.9927, + "step": 4220 + }, + { + "epoch": 0.6654484478459025, + "grad_norm": 0.9609375, + "learning_rate": 5.895516720472724e-05, + "loss": 1.1861, + "step": 4221 + }, + { + "epoch": 0.6656060996932955, + "grad_norm": 0.921875, + "learning_rate": 5.895096917139518e-05, + "loss": 0.9352, + "step": 4222 + }, + { + "epoch": 0.6657637515406885, + "grad_norm": 0.8203125, + "learning_rate": 5.894677122506421e-05, + "loss": 0.8584, + "step": 4223 + }, + { + "epoch": 0.6659214033880815, + "grad_norm": 1.2109375, + "learning_rate": 5.894257336574322e-05, + "loss": 1.0601, + "step": 4224 + }, + { + "epoch": 0.6660790552354745, + "grad_norm": 1.09375, + "learning_rate": 5.89383755934411e-05, + "loss": 0.9244, + "step": 4225 + }, + { + "epoch": 0.6662367070828675, + "grad_norm": 0.88671875, + "learning_rate": 5.893417790816674e-05, + "loss": 1.0607, + "step": 4226 + }, + { + "epoch": 0.6663943589302606, + "grad_norm": 1.046875, + "learning_rate": 5.8929980309929e-05, + "loss": 1.0881, + "step": 4227 + }, + { + "epoch": 0.6665520107776536, + "grad_norm": 0.890625, + "learning_rate": 5.892578279873687e-05, + "loss": 0.8997, + "step": 4228 + }, + { + "epoch": 0.6667096626250466, + "grad_norm": 0.8828125, + "learning_rate": 5.89215853745992e-05, + "loss": 0.896, + "step": 4229 + }, + { + "epoch": 0.6668673144724396, + "grad_norm": 0.8828125, + "learning_rate": 5.891738803752487e-05, + "loss": 0.805, + "step": 4230 + }, + { + "epoch": 0.6670249663198327, + "grad_norm": 0.8828125, + "learning_rate": 5.891319078752279e-05, + "loss": 0.9534, + "step": 4231 + }, + { + "epoch": 0.6671826181672256, + "grad_norm": 1.0390625, + "learning_rate": 5.8908993624601806e-05, + "loss": 0.9815, + "step": 4232 + }, + { + "epoch": 0.6673402700146186, + "grad_norm": 1.0625, + "learning_rate": 5.8904796548770916e-05, + "loss": 1.2012, + "step": 4233 + }, + { + "epoch": 0.6674979218620116, + "grad_norm": 0.9921875, + "learning_rate": 5.890059956003895e-05, + "loss": 0.9238, + "step": 4234 + }, + { + "epoch": 0.6676555737094046, + "grad_norm": 1.140625, + "learning_rate": 5.889640265841482e-05, + "loss": 1.0521, + "step": 4235 + }, + { + "epoch": 0.6678132255567977, + "grad_norm": 1.140625, + "learning_rate": 5.889220584390741e-05, + "loss": 0.9985, + "step": 4236 + }, + { + "epoch": 0.6679708774041907, + "grad_norm": 0.921875, + "learning_rate": 5.888800911652561e-05, + "loss": 1.1496, + "step": 4237 + }, + { + "epoch": 0.6681285292515837, + "grad_norm": 0.97265625, + "learning_rate": 5.888381247627833e-05, + "loss": 1.1432, + "step": 4238 + }, + { + "epoch": 0.6682861810989768, + "grad_norm": 0.9453125, + "learning_rate": 5.887961592317446e-05, + "loss": 1.0047, + "step": 4239 + }, + { + "epoch": 0.6684438329463697, + "grad_norm": 0.9140625, + "learning_rate": 5.8875419457222904e-05, + "loss": 1.1259, + "step": 4240 + }, + { + "epoch": 0.6686014847937627, + "grad_norm": 0.98046875, + "learning_rate": 5.8871223078432536e-05, + "loss": 0.8408, + "step": 4241 + }, + { + "epoch": 0.6687591366411557, + "grad_norm": 0.953125, + "learning_rate": 5.886702678681224e-05, + "loss": 0.9974, + "step": 4242 + }, + { + "epoch": 0.6689167884885487, + "grad_norm": 0.97265625, + "learning_rate": 5.886283058237097e-05, + "loss": 0.8743, + "step": 4243 + }, + { + "epoch": 0.6690744403359418, + "grad_norm": 0.89453125, + "learning_rate": 5.8858634465117554e-05, + "loss": 1.0363, + "step": 4244 + }, + { + "epoch": 0.6692320921833348, + "grad_norm": 0.8984375, + "learning_rate": 5.885443843506093e-05, + "loss": 0.9773, + "step": 4245 + }, + { + "epoch": 0.6693897440307278, + "grad_norm": 0.90625, + "learning_rate": 5.885024249220996e-05, + "loss": 0.895, + "step": 4246 + }, + { + "epoch": 0.6695473958781208, + "grad_norm": 0.9375, + "learning_rate": 5.8846046636573515e-05, + "loss": 1.0035, + "step": 4247 + }, + { + "epoch": 0.6697050477255138, + "grad_norm": 1.015625, + "learning_rate": 5.8841850868160564e-05, + "loss": 1.1711, + "step": 4248 + }, + { + "epoch": 0.6698626995729068, + "grad_norm": 0.91015625, + "learning_rate": 5.883765518697996e-05, + "loss": 1.2258, + "step": 4249 + }, + { + "epoch": 0.6700203514202998, + "grad_norm": 0.9765625, + "learning_rate": 5.8833459593040583e-05, + "loss": 1.2968, + "step": 4250 + }, + { + "epoch": 0.6701780032676928, + "grad_norm": 0.92578125, + "learning_rate": 5.882926408635134e-05, + "loss": 0.8819, + "step": 4251 + }, + { + "epoch": 0.6703356551150859, + "grad_norm": 0.98046875, + "learning_rate": 5.882506866692108e-05, + "loss": 1.1756, + "step": 4252 + }, + { + "epoch": 0.6704933069624789, + "grad_norm": 0.921875, + "learning_rate": 5.882087333475876e-05, + "loss": 0.8979, + "step": 4253 + }, + { + "epoch": 0.6706509588098719, + "grad_norm": 1.9140625, + "learning_rate": 5.881667808987327e-05, + "loss": 1.0591, + "step": 4254 + }, + { + "epoch": 0.6708086106572649, + "grad_norm": 0.96875, + "learning_rate": 5.881248293227346e-05, + "loss": 1.0853, + "step": 4255 + }, + { + "epoch": 0.6709662625046579, + "grad_norm": 0.83203125, + "learning_rate": 5.880828786196825e-05, + "loss": 0.7879, + "step": 4256 + }, + { + "epoch": 0.6711239143520509, + "grad_norm": 0.984375, + "learning_rate": 5.880409287896647e-05, + "loss": 1.0554, + "step": 4257 + }, + { + "epoch": 0.6712815661994439, + "grad_norm": 0.890625, + "learning_rate": 5.87998979832771e-05, + "loss": 0.9114, + "step": 4258 + }, + { + "epoch": 0.6714392180468369, + "grad_norm": 0.9453125, + "learning_rate": 5.879570317490899e-05, + "loss": 0.9613, + "step": 4259 + }, + { + "epoch": 0.67159686989423, + "grad_norm": 0.94921875, + "learning_rate": 5.879150845387105e-05, + "loss": 0.907, + "step": 4260 + }, + { + "epoch": 0.671754521741623, + "grad_norm": 0.9140625, + "learning_rate": 5.878731382017213e-05, + "loss": 0.9004, + "step": 4261 + }, + { + "epoch": 0.671912173589016, + "grad_norm": 0.875, + "learning_rate": 5.878311927382113e-05, + "loss": 0.9622, + "step": 4262 + }, + { + "epoch": 0.672069825436409, + "grad_norm": 2.34375, + "learning_rate": 5.877892481482697e-05, + "loss": 0.8504, + "step": 4263 + }, + { + "epoch": 0.6722274772838019, + "grad_norm": 0.953125, + "learning_rate": 5.877473044319853e-05, + "loss": 0.9961, + "step": 4264 + }, + { + "epoch": 0.672385129131195, + "grad_norm": 1.4921875, + "learning_rate": 5.877053615894469e-05, + "loss": 1.1064, + "step": 4265 + }, + { + "epoch": 0.672542780978588, + "grad_norm": 0.9296875, + "learning_rate": 5.876634196207433e-05, + "loss": 1.0089, + "step": 4266 + }, + { + "epoch": 0.672700432825981, + "grad_norm": 1.140625, + "learning_rate": 5.876214785259633e-05, + "loss": 1.0729, + "step": 4267 + }, + { + "epoch": 0.672858084673374, + "grad_norm": 0.9375, + "learning_rate": 5.875795383051963e-05, + "loss": 0.9183, + "step": 4268 + }, + { + "epoch": 0.6730157365207671, + "grad_norm": 0.9921875, + "learning_rate": 5.87537598958531e-05, + "loss": 1.1391, + "step": 4269 + }, + { + "epoch": 0.6731733883681601, + "grad_norm": 1.0390625, + "learning_rate": 5.874956604860562e-05, + "loss": 1.2014, + "step": 4270 + }, + { + "epoch": 0.6733310402155531, + "grad_norm": 0.93359375, + "learning_rate": 5.874537228878606e-05, + "loss": 1.0237, + "step": 4271 + }, + { + "epoch": 0.673488692062946, + "grad_norm": 0.875, + "learning_rate": 5.874117861640329e-05, + "loss": 0.8601, + "step": 4272 + }, + { + "epoch": 0.6736463439103391, + "grad_norm": 0.96875, + "learning_rate": 5.8736985031466274e-05, + "loss": 0.986, + "step": 4273 + }, + { + "epoch": 0.6738039957577321, + "grad_norm": 0.96875, + "learning_rate": 5.873279153398388e-05, + "loss": 1.1771, + "step": 4274 + }, + { + "epoch": 0.6739616476051251, + "grad_norm": 0.98828125, + "learning_rate": 5.872859812396495e-05, + "loss": 1.1899, + "step": 4275 + }, + { + "epoch": 0.6741192994525181, + "grad_norm": 1.125, + "learning_rate": 5.872440480141841e-05, + "loss": 1.0007, + "step": 4276 + }, + { + "epoch": 0.6742769512999112, + "grad_norm": 0.92578125, + "learning_rate": 5.872021156635309e-05, + "loss": 1.0224, + "step": 4277 + }, + { + "epoch": 0.6744346031473042, + "grad_norm": 1.0, + "learning_rate": 5.871601841877796e-05, + "loss": 0.9422, + "step": 4278 + }, + { + "epoch": 0.6745922549946972, + "grad_norm": 0.86328125, + "learning_rate": 5.8711825358701876e-05, + "loss": 1.1526, + "step": 4279 + }, + { + "epoch": 0.6747499068420901, + "grad_norm": 0.86328125, + "learning_rate": 5.8707632386133706e-05, + "loss": 0.8807, + "step": 4280 + }, + { + "epoch": 0.6749075586894832, + "grad_norm": 0.96875, + "learning_rate": 5.870343950108237e-05, + "loss": 1.2016, + "step": 4281 + }, + { + "epoch": 0.6750652105368762, + "grad_norm": 0.8828125, + "learning_rate": 5.869924670355669e-05, + "loss": 1.0572, + "step": 4282 + }, + { + "epoch": 0.6752228623842692, + "grad_norm": 0.92578125, + "learning_rate": 5.869505399356563e-05, + "loss": 0.9693, + "step": 4283 + }, + { + "epoch": 0.6753805142316622, + "grad_norm": 0.99609375, + "learning_rate": 5.869086137111803e-05, + "loss": 1.0617, + "step": 4284 + }, + { + "epoch": 0.6755381660790553, + "grad_norm": 0.99609375, + "learning_rate": 5.86866688362228e-05, + "loss": 1.0246, + "step": 4285 + }, + { + "epoch": 0.6756958179264483, + "grad_norm": 0.8828125, + "learning_rate": 5.868247638888882e-05, + "loss": 1.1124, + "step": 4286 + }, + { + "epoch": 0.6758534697738413, + "grad_norm": 1.0, + "learning_rate": 5.8678284029124916e-05, + "loss": 1.1563, + "step": 4287 + }, + { + "epoch": 0.6760111216212342, + "grad_norm": 1.0234375, + "learning_rate": 5.867409175694007e-05, + "loss": 0.8567, + "step": 4288 + }, + { + "epoch": 0.6761687734686272, + "grad_norm": 0.8515625, + "learning_rate": 5.866989957234313e-05, + "loss": 1.0346, + "step": 4289 + }, + { + "epoch": 0.6763264253160203, + "grad_norm": 1.046875, + "learning_rate": 5.866570747534298e-05, + "loss": 1.0391, + "step": 4290 + }, + { + "epoch": 0.6764840771634133, + "grad_norm": 1.1484375, + "learning_rate": 5.866151546594849e-05, + "loss": 0.9463, + "step": 4291 + }, + { + "epoch": 0.6766417290108063, + "grad_norm": 0.93359375, + "learning_rate": 5.865732354416852e-05, + "loss": 1.0399, + "step": 4292 + }, + { + "epoch": 0.6767993808581994, + "grad_norm": 1.0, + "learning_rate": 5.865313171001203e-05, + "loss": 1.1003, + "step": 4293 + }, + { + "epoch": 0.6769570327055924, + "grad_norm": 0.89453125, + "learning_rate": 5.864893996348786e-05, + "loss": 0.9609, + "step": 4294 + }, + { + "epoch": 0.6771146845529854, + "grad_norm": 1.1875, + "learning_rate": 5.864474830460489e-05, + "loss": 1.1013, + "step": 4295 + }, + { + "epoch": 0.6772723364003783, + "grad_norm": 0.90625, + "learning_rate": 5.864055673337203e-05, + "loss": 1.0646, + "step": 4296 + }, + { + "epoch": 0.6774299882477713, + "grad_norm": 0.8984375, + "learning_rate": 5.86363652497981e-05, + "loss": 0.9352, + "step": 4297 + }, + { + "epoch": 0.6775876400951644, + "grad_norm": 0.94140625, + "learning_rate": 5.863217385389207e-05, + "loss": 0.8736, + "step": 4298 + }, + { + "epoch": 0.6777452919425574, + "grad_norm": 0.77734375, + "learning_rate": 5.862798254566277e-05, + "loss": 0.9367, + "step": 4299 + }, + { + "epoch": 0.6779029437899504, + "grad_norm": 1.109375, + "learning_rate": 5.862379132511912e-05, + "loss": 1.1506, + "step": 4300 + }, + { + "epoch": 0.6780605956373434, + "grad_norm": 0.98046875, + "learning_rate": 5.8619600192269964e-05, + "loss": 1.0278, + "step": 4301 + }, + { + "epoch": 0.6782182474847365, + "grad_norm": 0.85546875, + "learning_rate": 5.861540914712418e-05, + "loss": 0.8443, + "step": 4302 + }, + { + "epoch": 0.6783758993321295, + "grad_norm": 1.078125, + "learning_rate": 5.86112181896907e-05, + "loss": 1.0845, + "step": 4303 + }, + { + "epoch": 0.6785335511795224, + "grad_norm": 0.97265625, + "learning_rate": 5.860702731997836e-05, + "loss": 0.893, + "step": 4304 + }, + { + "epoch": 0.6786912030269154, + "grad_norm": 0.89453125, + "learning_rate": 5.860283653799607e-05, + "loss": 0.8737, + "step": 4305 + }, + { + "epoch": 0.6788488548743085, + "grad_norm": 1.2890625, + "learning_rate": 5.8598645843752695e-05, + "loss": 0.8927, + "step": 4306 + }, + { + "epoch": 0.6790065067217015, + "grad_norm": 0.9453125, + "learning_rate": 5.8594455237257106e-05, + "loss": 0.7611, + "step": 4307 + }, + { + "epoch": 0.6791641585690945, + "grad_norm": 1.1484375, + "learning_rate": 5.859026471851823e-05, + "loss": 1.2162, + "step": 4308 + }, + { + "epoch": 0.6793218104164875, + "grad_norm": 0.96484375, + "learning_rate": 5.8586074287544924e-05, + "loss": 1.0337, + "step": 4309 + }, + { + "epoch": 0.6794794622638806, + "grad_norm": 0.96484375, + "learning_rate": 5.858188394434605e-05, + "loss": 1.0937, + "step": 4310 + }, + { + "epoch": 0.6796371141112736, + "grad_norm": 0.9375, + "learning_rate": 5.857769368893052e-05, + "loss": 0.8648, + "step": 4311 + }, + { + "epoch": 0.6797947659586665, + "grad_norm": 0.81640625, + "learning_rate": 5.857350352130716e-05, + "loss": 0.9456, + "step": 4312 + }, + { + "epoch": 0.6799524178060595, + "grad_norm": 0.9921875, + "learning_rate": 5.856931344148493e-05, + "loss": 1.0389, + "step": 4313 + }, + { + "epoch": 0.6801100696534526, + "grad_norm": 1.0625, + "learning_rate": 5.856512344947267e-05, + "loss": 1.1071, + "step": 4314 + }, + { + "epoch": 0.6802677215008456, + "grad_norm": 0.8671875, + "learning_rate": 5.856093354527926e-05, + "loss": 0.9892, + "step": 4315 + }, + { + "epoch": 0.6804253733482386, + "grad_norm": 1.015625, + "learning_rate": 5.8556743728913585e-05, + "loss": 0.9803, + "step": 4316 + }, + { + "epoch": 0.6805830251956316, + "grad_norm": 0.99609375, + "learning_rate": 5.8552554000384485e-05, + "loss": 1.1223, + "step": 4317 + }, + { + "epoch": 0.6807406770430247, + "grad_norm": 0.89453125, + "learning_rate": 5.854836435970092e-05, + "loss": 1.0529, + "step": 4318 + }, + { + "epoch": 0.6808983288904177, + "grad_norm": 0.984375, + "learning_rate": 5.854417480687173e-05, + "loss": 0.7834, + "step": 4319 + }, + { + "epoch": 0.6810559807378106, + "grad_norm": 1.0703125, + "learning_rate": 5.853998534190579e-05, + "loss": 1.0463, + "step": 4320 + }, + { + "epoch": 0.6812136325852036, + "grad_norm": 1.015625, + "learning_rate": 5.853579596481198e-05, + "loss": 1.0442, + "step": 4321 + }, + { + "epoch": 0.6813712844325966, + "grad_norm": 0.9140625, + "learning_rate": 5.853160667559915e-05, + "loss": 1.057, + "step": 4322 + }, + { + "epoch": 0.6815289362799897, + "grad_norm": 1.09375, + "learning_rate": 5.8527417474276235e-05, + "loss": 1.0733, + "step": 4323 + }, + { + "epoch": 0.6816865881273827, + "grad_norm": 0.9296875, + "learning_rate": 5.852322836085207e-05, + "loss": 0.8256, + "step": 4324 + }, + { + "epoch": 0.6818442399747757, + "grad_norm": 0.93359375, + "learning_rate": 5.851903933533558e-05, + "loss": 0.9742, + "step": 4325 + }, + { + "epoch": 0.6820018918221687, + "grad_norm": 0.96875, + "learning_rate": 5.851485039773561e-05, + "loss": 0.8921, + "step": 4326 + }, + { + "epoch": 0.6821595436695618, + "grad_norm": 1.0703125, + "learning_rate": 5.8510661548061015e-05, + "loss": 1.1335, + "step": 4327 + }, + { + "epoch": 0.6823171955169547, + "grad_norm": 1.0234375, + "learning_rate": 5.850647278632073e-05, + "loss": 1.1335, + "step": 4328 + }, + { + "epoch": 0.6824748473643477, + "grad_norm": 0.9296875, + "learning_rate": 5.850228411252361e-05, + "loss": 1.0729, + "step": 4329 + }, + { + "epoch": 0.6826324992117407, + "grad_norm": 0.9296875, + "learning_rate": 5.849809552667851e-05, + "loss": 1.0121, + "step": 4330 + }, + { + "epoch": 0.6827901510591338, + "grad_norm": 0.953125, + "learning_rate": 5.8493907028794334e-05, + "loss": 0.8595, + "step": 4331 + }, + { + "epoch": 0.6829478029065268, + "grad_norm": 0.94921875, + "learning_rate": 5.8489718618879906e-05, + "loss": 0.9107, + "step": 4332 + }, + { + "epoch": 0.6831054547539198, + "grad_norm": 0.89453125, + "learning_rate": 5.848553029694419e-05, + "loss": 1.049, + "step": 4333 + }, + { + "epoch": 0.6832631066013128, + "grad_norm": 0.921875, + "learning_rate": 5.848134206299602e-05, + "loss": 0.9846, + "step": 4334 + }, + { + "epoch": 0.6834207584487059, + "grad_norm": 0.9140625, + "learning_rate": 5.847715391704427e-05, + "loss": 0.8768, + "step": 4335 + }, + { + "epoch": 0.6835784102960988, + "grad_norm": 0.890625, + "learning_rate": 5.8472965859097825e-05, + "loss": 0.9036, + "step": 4336 + }, + { + "epoch": 0.6837360621434918, + "grad_norm": 1.0703125, + "learning_rate": 5.846877788916551e-05, + "loss": 0.9874, + "step": 4337 + }, + { + "epoch": 0.6838937139908848, + "grad_norm": 0.9453125, + "learning_rate": 5.846459000725629e-05, + "loss": 1.1366, + "step": 4338 + }, + { + "epoch": 0.6840513658382779, + "grad_norm": 0.96484375, + "learning_rate": 5.8460402213378996e-05, + "loss": 0.8511, + "step": 4339 + }, + { + "epoch": 0.6842090176856709, + "grad_norm": 0.94140625, + "learning_rate": 5.8456214507542504e-05, + "loss": 1.2788, + "step": 4340 + }, + { + "epoch": 0.6843666695330639, + "grad_norm": 0.8828125, + "learning_rate": 5.8452026889755697e-05, + "loss": 0.9004, + "step": 4341 + }, + { + "epoch": 0.6845243213804569, + "grad_norm": 1.1015625, + "learning_rate": 5.844783936002741e-05, + "loss": 1.0071, + "step": 4342 + }, + { + "epoch": 0.68468197322785, + "grad_norm": 0.86328125, + "learning_rate": 5.844365191836656e-05, + "loss": 0.9768, + "step": 4343 + }, + { + "epoch": 0.6848396250752429, + "grad_norm": 0.90234375, + "learning_rate": 5.8439464564782043e-05, + "loss": 0.9202, + "step": 4344 + }, + { + "epoch": 0.6849972769226359, + "grad_norm": 0.8125, + "learning_rate": 5.8435277299282697e-05, + "loss": 1.0251, + "step": 4345 + }, + { + "epoch": 0.6851549287700289, + "grad_norm": 0.91015625, + "learning_rate": 5.8431090121877405e-05, + "loss": 0.9603, + "step": 4346 + }, + { + "epoch": 0.685312580617422, + "grad_norm": 1.015625, + "learning_rate": 5.842690303257504e-05, + "loss": 0.8703, + "step": 4347 + }, + { + "epoch": 0.685470232464815, + "grad_norm": 1.171875, + "learning_rate": 5.8422716031384464e-05, + "loss": 1.0084, + "step": 4348 + }, + { + "epoch": 0.685627884312208, + "grad_norm": 0.94140625, + "learning_rate": 5.841852911831458e-05, + "loss": 0.7919, + "step": 4349 + }, + { + "epoch": 0.685785536159601, + "grad_norm": 0.94921875, + "learning_rate": 5.841434229337425e-05, + "loss": 1.0641, + "step": 4350 + }, + { + "epoch": 0.685943188006994, + "grad_norm": 0.94921875, + "learning_rate": 5.841015555657234e-05, + "loss": 0.9804, + "step": 4351 + }, + { + "epoch": 0.686100839854387, + "grad_norm": 1.0, + "learning_rate": 5.840596890791773e-05, + "loss": 1.0713, + "step": 4352 + }, + { + "epoch": 0.68625849170178, + "grad_norm": 0.94921875, + "learning_rate": 5.8401782347419255e-05, + "loss": 1.1287, + "step": 4353 + }, + { + "epoch": 0.686416143549173, + "grad_norm": 0.91796875, + "learning_rate": 5.839759587508586e-05, + "loss": 1.0508, + "step": 4354 + }, + { + "epoch": 0.686573795396566, + "grad_norm": 0.92578125, + "learning_rate": 5.8393409490926385e-05, + "loss": 0.991, + "step": 4355 + }, + { + "epoch": 0.6867314472439591, + "grad_norm": 0.98046875, + "learning_rate": 5.8389223194949704e-05, + "loss": 0.9483, + "step": 4356 + }, + { + "epoch": 0.6868890990913521, + "grad_norm": 0.8515625, + "learning_rate": 5.8385036987164665e-05, + "loss": 1.0166, + "step": 4357 + }, + { + "epoch": 0.6870467509387451, + "grad_norm": 0.9609375, + "learning_rate": 5.838085086758017e-05, + "loss": 0.9482, + "step": 4358 + }, + { + "epoch": 0.6872044027861381, + "grad_norm": 0.99609375, + "learning_rate": 5.8376664836205096e-05, + "loss": 0.853, + "step": 4359 + }, + { + "epoch": 0.6873620546335311, + "grad_norm": 0.984375, + "learning_rate": 5.837247889304829e-05, + "loss": 0.9209, + "step": 4360 + }, + { + "epoch": 0.6875197064809241, + "grad_norm": 0.9609375, + "learning_rate": 5.836829303811865e-05, + "loss": 0.9603, + "step": 4361 + }, + { + "epoch": 0.6876773583283171, + "grad_norm": 0.91015625, + "learning_rate": 5.8364107271425025e-05, + "loss": 0.8887, + "step": 4362 + }, + { + "epoch": 0.6878350101757101, + "grad_norm": 1.1015625, + "learning_rate": 5.835992159297625e-05, + "loss": 1.1894, + "step": 4363 + }, + { + "epoch": 0.6879926620231032, + "grad_norm": 0.85546875, + "learning_rate": 5.835573600278129e-05, + "loss": 0.9182, + "step": 4364 + }, + { + "epoch": 0.6881503138704962, + "grad_norm": 0.9765625, + "learning_rate": 5.835155050084897e-05, + "loss": 1.0942, + "step": 4365 + }, + { + "epoch": 0.6883079657178892, + "grad_norm": 0.9609375, + "learning_rate": 5.8347365087188145e-05, + "loss": 1.0286, + "step": 4366 + }, + { + "epoch": 0.6884656175652822, + "grad_norm": 0.90625, + "learning_rate": 5.834317976180772e-05, + "loss": 0.8342, + "step": 4367 + }, + { + "epoch": 0.6886232694126752, + "grad_norm": 1.1484375, + "learning_rate": 5.833899452471648e-05, + "loss": 0.8904, + "step": 4368 + }, + { + "epoch": 0.6887809212600682, + "grad_norm": 1.1171875, + "learning_rate": 5.8334809375923414e-05, + "loss": 0.9624, + "step": 4369 + }, + { + "epoch": 0.6889385731074612, + "grad_norm": 0.9375, + "learning_rate": 5.8330624315437346e-05, + "loss": 1.0367, + "step": 4370 + }, + { + "epoch": 0.6890962249548542, + "grad_norm": 1.09375, + "learning_rate": 5.832643934326711e-05, + "loss": 1.0257, + "step": 4371 + }, + { + "epoch": 0.6892538768022473, + "grad_norm": 0.89453125, + "learning_rate": 5.8322254459421634e-05, + "loss": 1.0738, + "step": 4372 + }, + { + "epoch": 0.6894115286496403, + "grad_norm": 0.875, + "learning_rate": 5.831806966390969e-05, + "loss": 1.1193, + "step": 4373 + }, + { + "epoch": 0.6895691804970333, + "grad_norm": 0.95703125, + "learning_rate": 5.831388495674028e-05, + "loss": 1.15, + "step": 4374 + }, + { + "epoch": 0.6897268323444263, + "grad_norm": 0.8515625, + "learning_rate": 5.8309700337922204e-05, + "loss": 0.8735, + "step": 4375 + }, + { + "epoch": 0.6898844841918192, + "grad_norm": 0.97265625, + "learning_rate": 5.8305515807464325e-05, + "loss": 0.9704, + "step": 4376 + }, + { + "epoch": 0.6900421360392123, + "grad_norm": 0.93359375, + "learning_rate": 5.8301331365375524e-05, + "loss": 0.9566, + "step": 4377 + }, + { + "epoch": 0.6901997878866053, + "grad_norm": 0.96484375, + "learning_rate": 5.829714701166462e-05, + "loss": 1.1779, + "step": 4378 + }, + { + "epoch": 0.6903574397339983, + "grad_norm": 1.453125, + "learning_rate": 5.8292962746340586e-05, + "loss": 1.2191, + "step": 4379 + }, + { + "epoch": 0.6905150915813913, + "grad_norm": 0.93359375, + "learning_rate": 5.828877856941223e-05, + "loss": 0.9928, + "step": 4380 + }, + { + "epoch": 0.6906727434287844, + "grad_norm": 0.8671875, + "learning_rate": 5.828459448088841e-05, + "loss": 0.7851, + "step": 4381 + }, + { + "epoch": 0.6908303952761774, + "grad_norm": 0.9609375, + "learning_rate": 5.828041048077803e-05, + "loss": 1.1876, + "step": 4382 + }, + { + "epoch": 0.6909880471235704, + "grad_norm": 0.94921875, + "learning_rate": 5.827622656908989e-05, + "loss": 0.9875, + "step": 4383 + }, + { + "epoch": 0.6911456989709635, + "grad_norm": 0.9453125, + "learning_rate": 5.8272042745832936e-05, + "loss": 1.0881, + "step": 4384 + }, + { + "epoch": 0.6913033508183564, + "grad_norm": 0.87109375, + "learning_rate": 5.826785901101599e-05, + "loss": 0.968, + "step": 4385 + }, + { + "epoch": 0.6914610026657494, + "grad_norm": 0.9453125, + "learning_rate": 5.826367536464794e-05, + "loss": 1.035, + "step": 4386 + }, + { + "epoch": 0.6916186545131424, + "grad_norm": 0.859375, + "learning_rate": 5.8259491806737654e-05, + "loss": 0.8168, + "step": 4387 + }, + { + "epoch": 0.6917763063605354, + "grad_norm": 0.94921875, + "learning_rate": 5.825530833729395e-05, + "loss": 0.9897, + "step": 4388 + }, + { + "epoch": 0.6919339582079285, + "grad_norm": 0.984375, + "learning_rate": 5.8251124956325765e-05, + "loss": 1.0609, + "step": 4389 + }, + { + "epoch": 0.6920916100553215, + "grad_norm": 0.984375, + "learning_rate": 5.824694166384194e-05, + "loss": 0.9525, + "step": 4390 + }, + { + "epoch": 0.6922492619027145, + "grad_norm": 0.92578125, + "learning_rate": 5.824275845985133e-05, + "loss": 1.0863, + "step": 4391 + }, + { + "epoch": 0.6924069137501075, + "grad_norm": 1.109375, + "learning_rate": 5.823857534436281e-05, + "loss": 1.1252, + "step": 4392 + }, + { + "epoch": 0.6925645655975005, + "grad_norm": 0.91015625, + "learning_rate": 5.82343923173852e-05, + "loss": 0.8341, + "step": 4393 + }, + { + "epoch": 0.6927222174448935, + "grad_norm": 1.015625, + "learning_rate": 5.823020937892745e-05, + "loss": 1.0191, + "step": 4394 + }, + { + "epoch": 0.6928798692922865, + "grad_norm": 1.15625, + "learning_rate": 5.822602652899839e-05, + "loss": 0.9242, + "step": 4395 + }, + { + "epoch": 0.6930375211396795, + "grad_norm": 1.015625, + "learning_rate": 5.822184376760687e-05, + "loss": 1.0908, + "step": 4396 + }, + { + "epoch": 0.6931951729870726, + "grad_norm": 1.0546875, + "learning_rate": 5.821766109476177e-05, + "loss": 1.2476, + "step": 4397 + }, + { + "epoch": 0.6933528248344656, + "grad_norm": 0.95703125, + "learning_rate": 5.821347851047194e-05, + "loss": 0.923, + "step": 4398 + }, + { + "epoch": 0.6935104766818586, + "grad_norm": 0.8671875, + "learning_rate": 5.8209296014746275e-05, + "loss": 0.8926, + "step": 4399 + }, + { + "epoch": 0.6936681285292516, + "grad_norm": 1.03125, + "learning_rate": 5.820511360759355e-05, + "loss": 0.9136, + "step": 4400 + }, + { + "epoch": 0.6938257803766446, + "grad_norm": 0.984375, + "learning_rate": 5.820093128902275e-05, + "loss": 1.1609, + "step": 4401 + }, + { + "epoch": 0.6939834322240376, + "grad_norm": 1.0078125, + "learning_rate": 5.81967490590427e-05, + "loss": 0.9315, + "step": 4402 + }, + { + "epoch": 0.6941410840714306, + "grad_norm": 0.89453125, + "learning_rate": 5.819256691766224e-05, + "loss": 0.91, + "step": 4403 + }, + { + "epoch": 0.6942987359188236, + "grad_norm": 0.89453125, + "learning_rate": 5.818838486489024e-05, + "loss": 0.9092, + "step": 4404 + }, + { + "epoch": 0.6944563877662167, + "grad_norm": 0.80078125, + "learning_rate": 5.8184202900735585e-05, + "loss": 0.8103, + "step": 4405 + }, + { + "epoch": 0.6946140396136097, + "grad_norm": 0.87890625, + "learning_rate": 5.81800210252071e-05, + "loss": 1.1155, + "step": 4406 + }, + { + "epoch": 0.6947716914610027, + "grad_norm": 0.88671875, + "learning_rate": 5.817583923831369e-05, + "loss": 0.9026, + "step": 4407 + }, + { + "epoch": 0.6949293433083957, + "grad_norm": 0.9375, + "learning_rate": 5.817165754006415e-05, + "loss": 0.9241, + "step": 4408 + }, + { + "epoch": 0.6950869951557886, + "grad_norm": 0.98046875, + "learning_rate": 5.816747593046743e-05, + "loss": 0.9623, + "step": 4409 + }, + { + "epoch": 0.6952446470031817, + "grad_norm": 0.92578125, + "learning_rate": 5.816329440953234e-05, + "loss": 1.0573, + "step": 4410 + }, + { + "epoch": 0.6954022988505747, + "grad_norm": 0.9453125, + "learning_rate": 5.815911297726777e-05, + "loss": 1.1209, + "step": 4411 + }, + { + "epoch": 0.6955599506979677, + "grad_norm": 1.34375, + "learning_rate": 5.815493163368257e-05, + "loss": 1.0183, + "step": 4412 + }, + { + "epoch": 0.6957176025453607, + "grad_norm": 1.109375, + "learning_rate": 5.815075037878557e-05, + "loss": 0.9832, + "step": 4413 + }, + { + "epoch": 0.6958752543927538, + "grad_norm": 1.0546875, + "learning_rate": 5.814656921258568e-05, + "loss": 1.1672, + "step": 4414 + }, + { + "epoch": 0.6960329062401468, + "grad_norm": 0.86328125, + "learning_rate": 5.814238813509176e-05, + "loss": 0.9455, + "step": 4415 + }, + { + "epoch": 0.6961905580875398, + "grad_norm": 1.015625, + "learning_rate": 5.8138207146312654e-05, + "loss": 1.0353, + "step": 4416 + }, + { + "epoch": 0.6963482099349327, + "grad_norm": 1.0, + "learning_rate": 5.8134026246257225e-05, + "loss": 0.974, + "step": 4417 + }, + { + "epoch": 0.6965058617823258, + "grad_norm": 0.8671875, + "learning_rate": 5.812984543493429e-05, + "loss": 1.0643, + "step": 4418 + }, + { + "epoch": 0.6966635136297188, + "grad_norm": 0.8515625, + "learning_rate": 5.812566471235279e-05, + "loss": 0.8551, + "step": 4419 + }, + { + "epoch": 0.6968211654771118, + "grad_norm": 0.96875, + "learning_rate": 5.812148407852156e-05, + "loss": 1.0857, + "step": 4420 + }, + { + "epoch": 0.6969788173245048, + "grad_norm": 0.99609375, + "learning_rate": 5.811730353344945e-05, + "loss": 0.9829, + "step": 4421 + }, + { + "epoch": 0.6971364691718979, + "grad_norm": 0.8671875, + "learning_rate": 5.8113123077145315e-05, + "loss": 0.9791, + "step": 4422 + }, + { + "epoch": 0.6972941210192909, + "grad_norm": 0.9921875, + "learning_rate": 5.8108942709617996e-05, + "loss": 1.0099, + "step": 4423 + }, + { + "epoch": 0.6974517728666839, + "grad_norm": 1.015625, + "learning_rate": 5.81047624308764e-05, + "loss": 0.9585, + "step": 4424 + }, + { + "epoch": 0.6976094247140768, + "grad_norm": 0.90234375, + "learning_rate": 5.810058224092938e-05, + "loss": 0.9146, + "step": 4425 + }, + { + "epoch": 0.6977670765614699, + "grad_norm": 1.0, + "learning_rate": 5.809640213978578e-05, + "loss": 1.0991, + "step": 4426 + }, + { + "epoch": 0.6979247284088629, + "grad_norm": 0.98046875, + "learning_rate": 5.809222212745444e-05, + "loss": 1.0558, + "step": 4427 + }, + { + "epoch": 0.6980823802562559, + "grad_norm": 0.91015625, + "learning_rate": 5.8088042203944214e-05, + "loss": 0.9057, + "step": 4428 + }, + { + "epoch": 0.6982400321036489, + "grad_norm": 0.99609375, + "learning_rate": 5.8083862369264045e-05, + "loss": 1.0565, + "step": 4429 + }, + { + "epoch": 0.698397683951042, + "grad_norm": 1.1875, + "learning_rate": 5.807968262342272e-05, + "loss": 1.2033, + "step": 4430 + }, + { + "epoch": 0.698555335798435, + "grad_norm": 0.98828125, + "learning_rate": 5.8075502966429094e-05, + "loss": 1.136, + "step": 4431 + }, + { + "epoch": 0.698712987645828, + "grad_norm": 0.890625, + "learning_rate": 5.807132339829207e-05, + "loss": 0.8736, + "step": 4432 + }, + { + "epoch": 0.6988706394932209, + "grad_norm": 1.0234375, + "learning_rate": 5.806714391902042e-05, + "loss": 1.0263, + "step": 4433 + }, + { + "epoch": 0.699028291340614, + "grad_norm": 0.90625, + "learning_rate": 5.8062964528623096e-05, + "loss": 1.0787, + "step": 4434 + }, + { + "epoch": 0.699185943188007, + "grad_norm": 0.89453125, + "learning_rate": 5.805878522710895e-05, + "loss": 1.0348, + "step": 4435 + }, + { + "epoch": 0.6993435950354, + "grad_norm": 0.9140625, + "learning_rate": 5.805460601448679e-05, + "loss": 0.9765, + "step": 4436 + }, + { + "epoch": 0.699501246882793, + "grad_norm": 1.0234375, + "learning_rate": 5.8050426890765496e-05, + "loss": 0.9284, + "step": 4437 + }, + { + "epoch": 0.699658898730186, + "grad_norm": 0.94140625, + "learning_rate": 5.8046247855953893e-05, + "loss": 1.0038, + "step": 4438 + }, + { + "epoch": 0.6998165505775791, + "grad_norm": 1.125, + "learning_rate": 5.804206891006091e-05, + "loss": 1.0067, + "step": 4439 + }, + { + "epoch": 0.6999742024249721, + "grad_norm": 0.91015625, + "learning_rate": 5.803789005309537e-05, + "loss": 0.7741, + "step": 4440 + }, + { + "epoch": 0.700131854272365, + "grad_norm": 0.921875, + "learning_rate": 5.803371128506611e-05, + "loss": 0.898, + "step": 4441 + }, + { + "epoch": 0.700289506119758, + "grad_norm": 0.859375, + "learning_rate": 5.8029532605982e-05, + "loss": 0.7008, + "step": 4442 + }, + { + "epoch": 0.7004471579671511, + "grad_norm": 0.85546875, + "learning_rate": 5.80253540158519e-05, + "loss": 0.873, + "step": 4443 + }, + { + "epoch": 0.7006048098145441, + "grad_norm": 1.0546875, + "learning_rate": 5.802117551468467e-05, + "loss": 1.1366, + "step": 4444 + }, + { + "epoch": 0.7007624616619371, + "grad_norm": 0.9921875, + "learning_rate": 5.8016997102489146e-05, + "loss": 1.0757, + "step": 4445 + }, + { + "epoch": 0.7009201135093301, + "grad_norm": 0.96484375, + "learning_rate": 5.8012818779274205e-05, + "loss": 1.1899, + "step": 4446 + }, + { + "epoch": 0.7010777653567232, + "grad_norm": 0.8515625, + "learning_rate": 5.800864054504869e-05, + "loss": 0.9863, + "step": 4447 + }, + { + "epoch": 0.7012354172041162, + "grad_norm": 0.8671875, + "learning_rate": 5.8004462399821424e-05, + "loss": 1.0852, + "step": 4448 + }, + { + "epoch": 0.7013930690515091, + "grad_norm": 0.9453125, + "learning_rate": 5.800028434360134e-05, + "loss": 1.0996, + "step": 4449 + }, + { + "epoch": 0.7015507208989021, + "grad_norm": 1.0, + "learning_rate": 5.799610637639725e-05, + "loss": 0.9754, + "step": 4450 + }, + { + "epoch": 0.7017083727462952, + "grad_norm": 0.89453125, + "learning_rate": 5.7991928498218016e-05, + "loss": 0.9668, + "step": 4451 + }, + { + "epoch": 0.7018660245936882, + "grad_norm": 0.97265625, + "learning_rate": 5.7987750709072494e-05, + "loss": 0.9257, + "step": 4452 + }, + { + "epoch": 0.7020236764410812, + "grad_norm": 0.984375, + "learning_rate": 5.798357300896948e-05, + "loss": 0.9411, + "step": 4453 + }, + { + "epoch": 0.7021813282884742, + "grad_norm": 0.9375, + "learning_rate": 5.7979395397917934e-05, + "loss": 0.8162, + "step": 4454 + }, + { + "epoch": 0.7023389801358673, + "grad_norm": 1.0546875, + "learning_rate": 5.797521787592666e-05, + "loss": 0.9148, + "step": 4455 + }, + { + "epoch": 0.7024966319832603, + "grad_norm": 1.40625, + "learning_rate": 5.79710404430045e-05, + "loss": 1.0743, + "step": 4456 + }, + { + "epoch": 0.7026542838306532, + "grad_norm": 0.91796875, + "learning_rate": 5.7966863099160305e-05, + "loss": 1.0067, + "step": 4457 + }, + { + "epoch": 0.7028119356780462, + "grad_norm": 0.96484375, + "learning_rate": 5.796268584440292e-05, + "loss": 0.923, + "step": 4458 + }, + { + "epoch": 0.7029695875254393, + "grad_norm": 0.84765625, + "learning_rate": 5.795850867874126e-05, + "loss": 0.8637, + "step": 4459 + }, + { + "epoch": 0.7031272393728323, + "grad_norm": 2.125, + "learning_rate": 5.795433160218413e-05, + "loss": 1.0024, + "step": 4460 + }, + { + "epoch": 0.7032848912202253, + "grad_norm": 0.921875, + "learning_rate": 5.7950154614740396e-05, + "loss": 1.0329, + "step": 4461 + }, + { + "epoch": 0.7034425430676183, + "grad_norm": 1.4296875, + "learning_rate": 5.7945977716418896e-05, + "loss": 0.9838, + "step": 4462 + }, + { + "epoch": 0.7036001949150114, + "grad_norm": 1.046875, + "learning_rate": 5.794180090722846e-05, + "loss": 1.1006, + "step": 4463 + }, + { + "epoch": 0.7037578467624044, + "grad_norm": 1.1640625, + "learning_rate": 5.793762418717801e-05, + "loss": 1.2801, + "step": 4464 + }, + { + "epoch": 0.7039154986097973, + "grad_norm": 0.984375, + "learning_rate": 5.7933447556276364e-05, + "loss": 1.1131, + "step": 4465 + }, + { + "epoch": 0.7040731504571903, + "grad_norm": 0.97265625, + "learning_rate": 5.7929271014532383e-05, + "loss": 1.0401, + "step": 4466 + }, + { + "epoch": 0.7042308023045833, + "grad_norm": 1.0, + "learning_rate": 5.79250945619549e-05, + "loss": 0.8546, + "step": 4467 + }, + { + "epoch": 0.7043884541519764, + "grad_norm": 0.93359375, + "learning_rate": 5.792091819855272e-05, + "loss": 0.8764, + "step": 4468 + }, + { + "epoch": 0.7045461059993694, + "grad_norm": 0.99609375, + "learning_rate": 5.7916741924334807e-05, + "loss": 1.0916, + "step": 4469 + }, + { + "epoch": 0.7047037578467624, + "grad_norm": 0.94140625, + "learning_rate": 5.791256573930994e-05, + "loss": 0.9512, + "step": 4470 + }, + { + "epoch": 0.7048614096941554, + "grad_norm": 1.0546875, + "learning_rate": 5.790838964348699e-05, + "loss": 1.1549, + "step": 4471 + }, + { + "epoch": 0.7050190615415485, + "grad_norm": 0.91796875, + "learning_rate": 5.7904213636874806e-05, + "loss": 1.0431, + "step": 4472 + }, + { + "epoch": 0.7051767133889414, + "grad_norm": 0.9609375, + "learning_rate": 5.790003771948219e-05, + "loss": 1.1709, + "step": 4473 + }, + { + "epoch": 0.7053343652363344, + "grad_norm": 1.0859375, + "learning_rate": 5.789586189131808e-05, + "loss": 1.0286, + "step": 4474 + }, + { + "epoch": 0.7054920170837274, + "grad_norm": 0.8828125, + "learning_rate": 5.789168615239129e-05, + "loss": 1.0234, + "step": 4475 + }, + { + "epoch": 0.7056496689311205, + "grad_norm": 0.953125, + "learning_rate": 5.788751050271065e-05, + "loss": 0.9849, + "step": 4476 + }, + { + "epoch": 0.7058073207785135, + "grad_norm": 1.0625, + "learning_rate": 5.7883334942285036e-05, + "loss": 1.0829, + "step": 4477 + }, + { + "epoch": 0.7059649726259065, + "grad_norm": 0.83984375, + "learning_rate": 5.7879159471123236e-05, + "loss": 0.9591, + "step": 4478 + }, + { + "epoch": 0.7061226244732995, + "grad_norm": 0.890625, + "learning_rate": 5.7874984089234196e-05, + "loss": 0.8608, + "step": 4479 + }, + { + "epoch": 0.7062802763206926, + "grad_norm": 0.92578125, + "learning_rate": 5.787080879662673e-05, + "loss": 1.0391, + "step": 4480 + }, + { + "epoch": 0.7064379281680855, + "grad_norm": 0.85546875, + "learning_rate": 5.786663359330966e-05, + "loss": 0.9326, + "step": 4481 + }, + { + "epoch": 0.7065955800154785, + "grad_norm": 0.9609375, + "learning_rate": 5.786245847929186e-05, + "loss": 1.1351, + "step": 4482 + }, + { + "epoch": 0.7067532318628715, + "grad_norm": 0.8984375, + "learning_rate": 5.7858283454582174e-05, + "loss": 0.7805, + "step": 4483 + }, + { + "epoch": 0.7069108837102646, + "grad_norm": 0.97265625, + "learning_rate": 5.785410851918944e-05, + "loss": 1.1022, + "step": 4484 + }, + { + "epoch": 0.7070685355576576, + "grad_norm": 0.828125, + "learning_rate": 5.784993367312253e-05, + "loss": 0.8071, + "step": 4485 + }, + { + "epoch": 0.7072261874050506, + "grad_norm": 0.84375, + "learning_rate": 5.7845758916390213e-05, + "loss": 0.6792, + "step": 4486 + }, + { + "epoch": 0.7073838392524436, + "grad_norm": 0.8359375, + "learning_rate": 5.784158424900146e-05, + "loss": 0.8592, + "step": 4487 + }, + { + "epoch": 0.7075414910998367, + "grad_norm": 0.9765625, + "learning_rate": 5.783740967096505e-05, + "loss": 1.0713, + "step": 4488 + }, + { + "epoch": 0.7076991429472296, + "grad_norm": 0.91015625, + "learning_rate": 5.7833235182289835e-05, + "loss": 0.9192, + "step": 4489 + }, + { + "epoch": 0.7078567947946226, + "grad_norm": 0.9140625, + "learning_rate": 5.782906078298467e-05, + "loss": 1.0175, + "step": 4490 + }, + { + "epoch": 0.7080144466420156, + "grad_norm": 0.91796875, + "learning_rate": 5.782488647305841e-05, + "loss": 0.8847, + "step": 4491 + }, + { + "epoch": 0.7081720984894087, + "grad_norm": 0.98046875, + "learning_rate": 5.782071225251988e-05, + "loss": 1.0023, + "step": 4492 + }, + { + "epoch": 0.7083297503368017, + "grad_norm": 0.89453125, + "learning_rate": 5.781653812137795e-05, + "loss": 1.1209, + "step": 4493 + }, + { + "epoch": 0.7084874021841947, + "grad_norm": 0.890625, + "learning_rate": 5.781236407964141e-05, + "loss": 0.8303, + "step": 4494 + }, + { + "epoch": 0.7086450540315877, + "grad_norm": 0.9609375, + "learning_rate": 5.7808190127319195e-05, + "loss": 0.9324, + "step": 4495 + }, + { + "epoch": 0.7088027058789808, + "grad_norm": 1.0078125, + "learning_rate": 5.780401626442011e-05, + "loss": 1.0799, + "step": 4496 + }, + { + "epoch": 0.7089603577263737, + "grad_norm": 0.8515625, + "learning_rate": 5.7799842490953005e-05, + "loss": 0.8729, + "step": 4497 + }, + { + "epoch": 0.7091180095737667, + "grad_norm": 1.0, + "learning_rate": 5.779566880692673e-05, + "loss": 0.9757, + "step": 4498 + }, + { + "epoch": 0.7092756614211597, + "grad_norm": 0.83203125, + "learning_rate": 5.779149521235005e-05, + "loss": 0.8458, + "step": 4499 + }, + { + "epoch": 0.7094333132685527, + "grad_norm": 1.03125, + "learning_rate": 5.7787321707231954e-05, + "loss": 0.8481, + "step": 4500 + }, + { + "epoch": 0.7095909651159458, + "grad_norm": 1.078125, + "learning_rate": 5.77831482915812e-05, + "loss": 1.0312, + "step": 4501 + }, + { + "epoch": 0.7097486169633388, + "grad_norm": 0.95703125, + "learning_rate": 5.7778974965406675e-05, + "loss": 1.0573, + "step": 4502 + }, + { + "epoch": 0.7099062688107318, + "grad_norm": 0.890625, + "learning_rate": 5.777480172871717e-05, + "loss": 1.0107, + "step": 4503 + }, + { + "epoch": 0.7100639206581248, + "grad_norm": 0.91796875, + "learning_rate": 5.7770628581521534e-05, + "loss": 0.9872, + "step": 4504 + }, + { + "epoch": 0.7102215725055178, + "grad_norm": 0.76953125, + "learning_rate": 5.776645552382867e-05, + "loss": 0.9871, + "step": 4505 + }, + { + "epoch": 0.7103792243529108, + "grad_norm": 0.875, + "learning_rate": 5.77622825556474e-05, + "loss": 1.0843, + "step": 4506 + }, + { + "epoch": 0.7105368762003038, + "grad_norm": 0.85546875, + "learning_rate": 5.7758109676986546e-05, + "loss": 1.0402, + "step": 4507 + }, + { + "epoch": 0.7106945280476968, + "grad_norm": 1.125, + "learning_rate": 5.7753936887854976e-05, + "loss": 0.8896, + "step": 4508 + }, + { + "epoch": 0.7108521798950899, + "grad_norm": 0.890625, + "learning_rate": 5.774976418826147e-05, + "loss": 1.0048, + "step": 4509 + }, + { + "epoch": 0.7110098317424829, + "grad_norm": 0.90234375, + "learning_rate": 5.774559157821497e-05, + "loss": 0.9749, + "step": 4510 + }, + { + "epoch": 0.7111674835898759, + "grad_norm": 0.9140625, + "learning_rate": 5.774141905772428e-05, + "loss": 1.0288, + "step": 4511 + }, + { + "epoch": 0.7113251354372689, + "grad_norm": 0.93359375, + "learning_rate": 5.773724662679823e-05, + "loss": 0.9577, + "step": 4512 + }, + { + "epoch": 0.7114827872846619, + "grad_norm": 0.94921875, + "learning_rate": 5.7733074285445665e-05, + "loss": 1.0781, + "step": 4513 + }, + { + "epoch": 0.7116404391320549, + "grad_norm": 0.94921875, + "learning_rate": 5.77289020336754e-05, + "loss": 0.914, + "step": 4514 + }, + { + "epoch": 0.7117980909794479, + "grad_norm": 0.9609375, + "learning_rate": 5.7724729871496356e-05, + "loss": 1.079, + "step": 4515 + }, + { + "epoch": 0.7119557428268409, + "grad_norm": 0.984375, + "learning_rate": 5.7720557798917315e-05, + "loss": 1.0542, + "step": 4516 + }, + { + "epoch": 0.712113394674234, + "grad_norm": 1.0625, + "learning_rate": 5.771638581594714e-05, + "loss": 0.8942, + "step": 4517 + }, + { + "epoch": 0.712271046521627, + "grad_norm": 1.0625, + "learning_rate": 5.771221392259467e-05, + "loss": 1.23, + "step": 4518 + }, + { + "epoch": 0.71242869836902, + "grad_norm": 0.80859375, + "learning_rate": 5.770804211886871e-05, + "loss": 0.938, + "step": 4519 + }, + { + "epoch": 0.712586350216413, + "grad_norm": 0.95703125, + "learning_rate": 5.770387040477818e-05, + "loss": 1.0381, + "step": 4520 + }, + { + "epoch": 0.712744002063806, + "grad_norm": 1.0234375, + "learning_rate": 5.769969878033188e-05, + "loss": 1.0366, + "step": 4521 + }, + { + "epoch": 0.712901653911199, + "grad_norm": 0.97265625, + "learning_rate": 5.769552724553864e-05, + "loss": 1.0303, + "step": 4522 + }, + { + "epoch": 0.713059305758592, + "grad_norm": 0.8671875, + "learning_rate": 5.769135580040732e-05, + "loss": 0.9038, + "step": 4523 + }, + { + "epoch": 0.713216957605985, + "grad_norm": 0.984375, + "learning_rate": 5.768718444494671e-05, + "loss": 1.1799, + "step": 4524 + }, + { + "epoch": 0.713374609453378, + "grad_norm": 0.9453125, + "learning_rate": 5.7683013179165736e-05, + "loss": 1.1596, + "step": 4525 + }, + { + "epoch": 0.7135322613007711, + "grad_norm": 0.93359375, + "learning_rate": 5.767884200307319e-05, + "loss": 1.0714, + "step": 4526 + }, + { + "epoch": 0.7136899131481641, + "grad_norm": 0.90625, + "learning_rate": 5.767467091667793e-05, + "loss": 0.8077, + "step": 4527 + }, + { + "epoch": 0.7138475649955571, + "grad_norm": 0.94921875, + "learning_rate": 5.7670499919988776e-05, + "loss": 0.855, + "step": 4528 + }, + { + "epoch": 0.71400521684295, + "grad_norm": 1.3203125, + "learning_rate": 5.7666329013014586e-05, + "loss": 1.1338, + "step": 4529 + }, + { + "epoch": 0.7141628686903431, + "grad_norm": 0.97265625, + "learning_rate": 5.7662158195764185e-05, + "loss": 1.0226, + "step": 4530 + }, + { + "epoch": 0.7143205205377361, + "grad_norm": 0.92578125, + "learning_rate": 5.765798746824642e-05, + "loss": 1.0009, + "step": 4531 + }, + { + "epoch": 0.7144781723851291, + "grad_norm": 0.9453125, + "learning_rate": 5.765381683047013e-05, + "loss": 0.9105, + "step": 4532 + }, + { + "epoch": 0.7146358242325221, + "grad_norm": 0.9609375, + "learning_rate": 5.7649646282444156e-05, + "loss": 1.0727, + "step": 4533 + }, + { + "epoch": 0.7147934760799152, + "grad_norm": 0.98046875, + "learning_rate": 5.7645475824177295e-05, + "loss": 1.0921, + "step": 4534 + }, + { + "epoch": 0.7149511279273082, + "grad_norm": 0.9296875, + "learning_rate": 5.764130545567847e-05, + "loss": 0.8955, + "step": 4535 + }, + { + "epoch": 0.7151087797747012, + "grad_norm": 1.0234375, + "learning_rate": 5.763713517695649e-05, + "loss": 1.1819, + "step": 4536 + }, + { + "epoch": 0.7152664316220942, + "grad_norm": 3.96875, + "learning_rate": 5.763296498802017e-05, + "loss": 1.0377, + "step": 4537 + }, + { + "epoch": 0.7154240834694872, + "grad_norm": 1.015625, + "learning_rate": 5.762879488887836e-05, + "loss": 1.1568, + "step": 4538 + }, + { + "epoch": 0.7155817353168802, + "grad_norm": 0.88671875, + "learning_rate": 5.762462487953986e-05, + "loss": 0.7946, + "step": 4539 + }, + { + "epoch": 0.7157393871642732, + "grad_norm": 0.9453125, + "learning_rate": 5.762045496001357e-05, + "loss": 1.043, + "step": 4540 + }, + { + "epoch": 0.7158970390116662, + "grad_norm": 0.9609375, + "learning_rate": 5.761628513030831e-05, + "loss": 0.9246, + "step": 4541 + }, + { + "epoch": 0.7160546908590593, + "grad_norm": 0.9296875, + "learning_rate": 5.7612115390432916e-05, + "loss": 1.0121, + "step": 4542 + }, + { + "epoch": 0.7162123427064523, + "grad_norm": 1.015625, + "learning_rate": 5.7607945740396227e-05, + "loss": 1.2838, + "step": 4543 + }, + { + "epoch": 0.7163699945538453, + "grad_norm": 0.87890625, + "learning_rate": 5.7603776180207025e-05, + "loss": 0.8676, + "step": 4544 + }, + { + "epoch": 0.7165276464012383, + "grad_norm": 1.0234375, + "learning_rate": 5.759960670987423e-05, + "loss": 1.1053, + "step": 4545 + }, + { + "epoch": 0.7166852982486313, + "grad_norm": 1.046875, + "learning_rate": 5.759543732940666e-05, + "loss": 0.9991, + "step": 4546 + }, + { + "epoch": 0.7168429500960243, + "grad_norm": 0.8828125, + "learning_rate": 5.759126803881313e-05, + "loss": 1.0022, + "step": 4547 + }, + { + "epoch": 0.7170006019434173, + "grad_norm": 0.8515625, + "learning_rate": 5.758709883810248e-05, + "loss": 0.8361, + "step": 4548 + }, + { + "epoch": 0.7171582537908103, + "grad_norm": 0.921875, + "learning_rate": 5.758292972728352e-05, + "loss": 1.028, + "step": 4549 + }, + { + "epoch": 0.7173159056382034, + "grad_norm": 0.89453125, + "learning_rate": 5.7578760706365144e-05, + "loss": 1.1563, + "step": 4550 + }, + { + "epoch": 0.7174735574855964, + "grad_norm": 1.03125, + "learning_rate": 5.757459177535616e-05, + "loss": 1.2181, + "step": 4551 + }, + { + "epoch": 0.7176312093329894, + "grad_norm": 1.0, + "learning_rate": 5.757042293426541e-05, + "loss": 1.1177, + "step": 4552 + }, + { + "epoch": 0.7177888611803824, + "grad_norm": 0.9296875, + "learning_rate": 5.756625418310173e-05, + "loss": 0.9362, + "step": 4553 + }, + { + "epoch": 0.7179465130277753, + "grad_norm": 1.0234375, + "learning_rate": 5.75620855218739e-05, + "loss": 1.0421, + "step": 4554 + }, + { + "epoch": 0.7181041648751684, + "grad_norm": 0.83203125, + "learning_rate": 5.755791695059084e-05, + "loss": 0.7127, + "step": 4555 + }, + { + "epoch": 0.7182618167225614, + "grad_norm": 0.89453125, + "learning_rate": 5.755374846926136e-05, + "loss": 0.9927, + "step": 4556 + }, + { + "epoch": 0.7184194685699544, + "grad_norm": 0.99609375, + "learning_rate": 5.754958007789427e-05, + "loss": 1.0711, + "step": 4557 + }, + { + "epoch": 0.7185771204173474, + "grad_norm": 1.0, + "learning_rate": 5.7545411776498437e-05, + "loss": 1.0965, + "step": 4558 + }, + { + "epoch": 0.7187347722647405, + "grad_norm": 0.859375, + "learning_rate": 5.7541243565082616e-05, + "loss": 0.9999, + "step": 4559 + }, + { + "epoch": 0.7188924241121335, + "grad_norm": 0.87890625, + "learning_rate": 5.7537075443655766e-05, + "loss": 1.0846, + "step": 4560 + }, + { + "epoch": 0.7190500759595265, + "grad_norm": 0.86328125, + "learning_rate": 5.7532907412226634e-05, + "loss": 0.9009, + "step": 4561 + }, + { + "epoch": 0.7192077278069194, + "grad_norm": 0.82421875, + "learning_rate": 5.7528739470804084e-05, + "loss": 0.8899, + "step": 4562 + }, + { + "epoch": 0.7193653796543125, + "grad_norm": 0.90625, + "learning_rate": 5.752457161939695e-05, + "loss": 1.052, + "step": 4563 + }, + { + "epoch": 0.7195230315017055, + "grad_norm": 0.90234375, + "learning_rate": 5.7520403858014025e-05, + "loss": 0.9828, + "step": 4564 + }, + { + "epoch": 0.7196806833490985, + "grad_norm": 1.0390625, + "learning_rate": 5.7516236186664196e-05, + "loss": 1.0232, + "step": 4565 + }, + { + "epoch": 0.7198383351964915, + "grad_norm": 1.015625, + "learning_rate": 5.7512068605356294e-05, + "loss": 1.124, + "step": 4566 + }, + { + "epoch": 0.7199959870438846, + "grad_norm": 0.91015625, + "learning_rate": 5.750790111409913e-05, + "loss": 0.857, + "step": 4567 + }, + { + "epoch": 0.7201536388912776, + "grad_norm": 1.0625, + "learning_rate": 5.750373371290154e-05, + "loss": 1.1734, + "step": 4568 + }, + { + "epoch": 0.7203112907386706, + "grad_norm": 1.6328125, + "learning_rate": 5.749956640177235e-05, + "loss": 1.0413, + "step": 4569 + }, + { + "epoch": 0.7204689425860635, + "grad_norm": 0.859375, + "learning_rate": 5.749539918072041e-05, + "loss": 0.8861, + "step": 4570 + }, + { + "epoch": 0.7206265944334566, + "grad_norm": 0.84375, + "learning_rate": 5.7491232049754495e-05, + "loss": 0.8735, + "step": 4571 + }, + { + "epoch": 0.7207842462808496, + "grad_norm": 1.0, + "learning_rate": 5.7487065008883524e-05, + "loss": 1.1797, + "step": 4572 + }, + { + "epoch": 0.7209418981282426, + "grad_norm": 0.9453125, + "learning_rate": 5.74828980581163e-05, + "loss": 1.0538, + "step": 4573 + }, + { + "epoch": 0.7210995499756356, + "grad_norm": 1.03125, + "learning_rate": 5.7478731197461636e-05, + "loss": 0.973, + "step": 4574 + }, + { + "epoch": 0.7212572018230287, + "grad_norm": 0.953125, + "learning_rate": 5.747456442692836e-05, + "loss": 1.0639, + "step": 4575 + }, + { + "epoch": 0.7214148536704217, + "grad_norm": 0.8515625, + "learning_rate": 5.747039774652533e-05, + "loss": 0.884, + "step": 4576 + }, + { + "epoch": 0.7215725055178147, + "grad_norm": 0.875, + "learning_rate": 5.746623115626135e-05, + "loss": 0.7935, + "step": 4577 + }, + { + "epoch": 0.7217301573652076, + "grad_norm": 1.0546875, + "learning_rate": 5.746206465614528e-05, + "loss": 1.2527, + "step": 4578 + }, + { + "epoch": 0.7218878092126007, + "grad_norm": 0.9765625, + "learning_rate": 5.745789824618587e-05, + "loss": 1.0558, + "step": 4579 + }, + { + "epoch": 0.7220454610599937, + "grad_norm": 1.0625, + "learning_rate": 5.745373192639206e-05, + "loss": 1.1334, + "step": 4580 + }, + { + "epoch": 0.7222031129073867, + "grad_norm": 0.99609375, + "learning_rate": 5.744956569677265e-05, + "loss": 0.9989, + "step": 4581 + }, + { + "epoch": 0.7223607647547797, + "grad_norm": 0.96875, + "learning_rate": 5.744539955733643e-05, + "loss": 0.9758, + "step": 4582 + }, + { + "epoch": 0.7225184166021728, + "grad_norm": 0.8984375, + "learning_rate": 5.744123350809225e-05, + "loss": 0.8247, + "step": 4583 + }, + { + "epoch": 0.7226760684495658, + "grad_norm": 1.0625, + "learning_rate": 5.743706754904892e-05, + "loss": 1.0697, + "step": 4584 + }, + { + "epoch": 0.7228337202969588, + "grad_norm": 1.1171875, + "learning_rate": 5.743290168021534e-05, + "loss": 0.9729, + "step": 4585 + }, + { + "epoch": 0.7229913721443517, + "grad_norm": 1.1171875, + "learning_rate": 5.742873590160027e-05, + "loss": 1.3939, + "step": 4586 + }, + { + "epoch": 0.7231490239917447, + "grad_norm": 0.9296875, + "learning_rate": 5.742457021321257e-05, + "loss": 0.9684, + "step": 4587 + }, + { + "epoch": 0.7233066758391378, + "grad_norm": 0.91015625, + "learning_rate": 5.742040461506106e-05, + "loss": 1.0538, + "step": 4588 + }, + { + "epoch": 0.7234643276865308, + "grad_norm": 0.92578125, + "learning_rate": 5.7416239107154534e-05, + "loss": 1.0785, + "step": 4589 + }, + { + "epoch": 0.7236219795339238, + "grad_norm": 0.921875, + "learning_rate": 5.74120736895019e-05, + "loss": 1.0088, + "step": 4590 + }, + { + "epoch": 0.7237796313813168, + "grad_norm": 1.1328125, + "learning_rate": 5.7407908362111916e-05, + "loss": 0.9529, + "step": 4591 + }, + { + "epoch": 0.7239372832287099, + "grad_norm": 0.96875, + "learning_rate": 5.740374312499346e-05, + "loss": 0.8724, + "step": 4592 + }, + { + "epoch": 0.7240949350761029, + "grad_norm": 0.92578125, + "learning_rate": 5.739957797815533e-05, + "loss": 1.1785, + "step": 4593 + }, + { + "epoch": 0.7242525869234958, + "grad_norm": 0.9765625, + "learning_rate": 5.739541292160632e-05, + "loss": 0.9545, + "step": 4594 + }, + { + "epoch": 0.7244102387708888, + "grad_norm": 0.98046875, + "learning_rate": 5.739124795535533e-05, + "loss": 0.8773, + "step": 4595 + }, + { + "epoch": 0.7245678906182819, + "grad_norm": 0.8671875, + "learning_rate": 5.738708307941117e-05, + "loss": 0.7808, + "step": 4596 + }, + { + "epoch": 0.7247255424656749, + "grad_norm": 0.921875, + "learning_rate": 5.738291829378264e-05, + "loss": 0.8546, + "step": 4597 + }, + { + "epoch": 0.7248831943130679, + "grad_norm": 1.0703125, + "learning_rate": 5.7378753598478574e-05, + "loss": 1.3119, + "step": 4598 + }, + { + "epoch": 0.7250408461604609, + "grad_norm": 0.9296875, + "learning_rate": 5.737458899350776e-05, + "loss": 0.9611, + "step": 4599 + }, + { + "epoch": 0.725198498007854, + "grad_norm": 1.0703125, + "learning_rate": 5.7370424478879124e-05, + "loss": 1.0643, + "step": 4600 + }, + { + "epoch": 0.725356149855247, + "grad_norm": 1.6328125, + "learning_rate": 5.736626005460144e-05, + "loss": 1.1612, + "step": 4601 + }, + { + "epoch": 0.7255138017026399, + "grad_norm": 0.9140625, + "learning_rate": 5.736209572068352e-05, + "loss": 0.9703, + "step": 4602 + }, + { + "epoch": 0.7256714535500329, + "grad_norm": 1.015625, + "learning_rate": 5.7357931477134194e-05, + "loss": 1.0533, + "step": 4603 + }, + { + "epoch": 0.725829105397426, + "grad_norm": 0.80078125, + "learning_rate": 5.735376732396227e-05, + "loss": 0.7787, + "step": 4604 + }, + { + "epoch": 0.725986757244819, + "grad_norm": 1.0234375, + "learning_rate": 5.734960326117662e-05, + "loss": 1.1392, + "step": 4605 + }, + { + "epoch": 0.726144409092212, + "grad_norm": 0.94140625, + "learning_rate": 5.734543928878606e-05, + "loss": 1.0177, + "step": 4606 + }, + { + "epoch": 0.726302060939605, + "grad_norm": 0.92578125, + "learning_rate": 5.734127540679941e-05, + "loss": 0.9528, + "step": 4607 + }, + { + "epoch": 0.7264597127869981, + "grad_norm": 0.9140625, + "learning_rate": 5.733711161522548e-05, + "loss": 0.9613, + "step": 4608 + }, + { + "epoch": 0.7266173646343911, + "grad_norm": 0.9921875, + "learning_rate": 5.733294791407307e-05, + "loss": 1.0726, + "step": 4609 + }, + { + "epoch": 0.726775016481784, + "grad_norm": 0.90625, + "learning_rate": 5.732878430335107e-05, + "loss": 0.8793, + "step": 4610 + }, + { + "epoch": 0.726932668329177, + "grad_norm": 0.90234375, + "learning_rate": 5.732462078306828e-05, + "loss": 0.929, + "step": 4611 + }, + { + "epoch": 0.72709032017657, + "grad_norm": 1.15625, + "learning_rate": 5.732045735323351e-05, + "loss": 1.063, + "step": 4612 + }, + { + "epoch": 0.7272479720239631, + "grad_norm": 0.96484375, + "learning_rate": 5.73162940138556e-05, + "loss": 0.9351, + "step": 4613 + }, + { + "epoch": 0.7274056238713561, + "grad_norm": 1.0078125, + "learning_rate": 5.7312130764943364e-05, + "loss": 0.9894, + "step": 4614 + }, + { + "epoch": 0.7275632757187491, + "grad_norm": 1.0546875, + "learning_rate": 5.730796760650562e-05, + "loss": 0.971, + "step": 4615 + }, + { + "epoch": 0.7277209275661422, + "grad_norm": 1.03125, + "learning_rate": 5.73038045385512e-05, + "loss": 0.9732, + "step": 4616 + }, + { + "epoch": 0.7278785794135352, + "grad_norm": 1.046875, + "learning_rate": 5.729964156108894e-05, + "loss": 1.0791, + "step": 4617 + }, + { + "epoch": 0.7280362312609281, + "grad_norm": 1.0703125, + "learning_rate": 5.7295478674127624e-05, + "loss": 0.8884, + "step": 4618 + }, + { + "epoch": 0.7281938831083211, + "grad_norm": 0.96875, + "learning_rate": 5.7291315877676064e-05, + "loss": 1.1057, + "step": 4619 + }, + { + "epoch": 0.7283515349557141, + "grad_norm": 1.0859375, + "learning_rate": 5.728715317174318e-05, + "loss": 1.0346, + "step": 4620 + }, + { + "epoch": 0.7285091868031072, + "grad_norm": 1.03125, + "learning_rate": 5.7282990556337724e-05, + "loss": 1.2148, + "step": 4621 + }, + { + "epoch": 0.7286668386505002, + "grad_norm": 0.99609375, + "learning_rate": 5.7278828031468515e-05, + "loss": 1.0916, + "step": 4622 + }, + { + "epoch": 0.7288244904978932, + "grad_norm": 1.0625, + "learning_rate": 5.7274665597144404e-05, + "loss": 1.1413, + "step": 4623 + }, + { + "epoch": 0.7289821423452862, + "grad_norm": 0.984375, + "learning_rate": 5.727050325337415e-05, + "loss": 1.1136, + "step": 4624 + }, + { + "epoch": 0.7291397941926793, + "grad_norm": 1.0, + "learning_rate": 5.726634100016666e-05, + "loss": 1.0647, + "step": 4625 + }, + { + "epoch": 0.7292974460400722, + "grad_norm": 0.9375, + "learning_rate": 5.7262178837530723e-05, + "loss": 1.0225, + "step": 4626 + }, + { + "epoch": 0.7294550978874652, + "grad_norm": 0.93359375, + "learning_rate": 5.725801676547515e-05, + "loss": 0.9935, + "step": 4627 + }, + { + "epoch": 0.7296127497348582, + "grad_norm": 0.9140625, + "learning_rate": 5.7253854784008775e-05, + "loss": 0.9398, + "step": 4628 + }, + { + "epoch": 0.7297704015822513, + "grad_norm": 1.015625, + "learning_rate": 5.724969289314036e-05, + "loss": 1.0707, + "step": 4629 + }, + { + "epoch": 0.7299280534296443, + "grad_norm": 1.046875, + "learning_rate": 5.724553109287882e-05, + "loss": 1.0626, + "step": 4630 + }, + { + "epoch": 0.7300857052770373, + "grad_norm": 0.921875, + "learning_rate": 5.724136938323295e-05, + "loss": 1.1977, + "step": 4631 + }, + { + "epoch": 0.7302433571244303, + "grad_norm": 0.92578125, + "learning_rate": 5.723720776421153e-05, + "loss": 1.0472, + "step": 4632 + }, + { + "epoch": 0.7304010089718234, + "grad_norm": 1.0859375, + "learning_rate": 5.723304623582342e-05, + "loss": 1.0177, + "step": 4633 + }, + { + "epoch": 0.7305586608192163, + "grad_norm": 1.125, + "learning_rate": 5.722888479807742e-05, + "loss": 0.9893, + "step": 4634 + }, + { + "epoch": 0.7307163126666093, + "grad_norm": 0.78515625, + "learning_rate": 5.722472345098231e-05, + "loss": 0.8246, + "step": 4635 + }, + { + "epoch": 0.7308739645140023, + "grad_norm": 0.84765625, + "learning_rate": 5.7220562194547e-05, + "loss": 0.907, + "step": 4636 + }, + { + "epoch": 0.7310316163613954, + "grad_norm": 0.83984375, + "learning_rate": 5.721640102878026e-05, + "loss": 0.9116, + "step": 4637 + }, + { + "epoch": 0.7311892682087884, + "grad_norm": 0.93359375, + "learning_rate": 5.721223995369092e-05, + "loss": 0.8619, + "step": 4638 + }, + { + "epoch": 0.7313469200561814, + "grad_norm": 1.0546875, + "learning_rate": 5.720807896928778e-05, + "loss": 1.0181, + "step": 4639 + }, + { + "epoch": 0.7315045719035744, + "grad_norm": 1.03125, + "learning_rate": 5.720391807557963e-05, + "loss": 1.183, + "step": 4640 + }, + { + "epoch": 0.7316622237509675, + "grad_norm": 1.0234375, + "learning_rate": 5.719975727257538e-05, + "loss": 1.2148, + "step": 4641 + }, + { + "epoch": 0.7318198755983604, + "grad_norm": 0.98046875, + "learning_rate": 5.7195596560283794e-05, + "loss": 0.868, + "step": 4642 + }, + { + "epoch": 0.7319775274457534, + "grad_norm": 0.87890625, + "learning_rate": 5.71914359387137e-05, + "loss": 0.8763, + "step": 4643 + }, + { + "epoch": 0.7321351792931464, + "grad_norm": 1.109375, + "learning_rate": 5.7187275407873895e-05, + "loss": 0.9841, + "step": 4644 + }, + { + "epoch": 0.7322928311405394, + "grad_norm": 0.98828125, + "learning_rate": 5.7183114967773174e-05, + "loss": 0.9438, + "step": 4645 + }, + { + "epoch": 0.7324504829879325, + "grad_norm": 1.0078125, + "learning_rate": 5.7178954618420446e-05, + "loss": 0.832, + "step": 4646 + }, + { + "epoch": 0.7326081348353255, + "grad_norm": 0.96875, + "learning_rate": 5.717479435982448e-05, + "loss": 0.9947, + "step": 4647 + }, + { + "epoch": 0.7327657866827185, + "grad_norm": 0.890625, + "learning_rate": 5.7170634191994076e-05, + "loss": 1.0779, + "step": 4648 + }, + { + "epoch": 0.7329234385301115, + "grad_norm": 1.015625, + "learning_rate": 5.716647411493807e-05, + "loss": 1.1037, + "step": 4649 + }, + { + "epoch": 0.7330810903775045, + "grad_norm": 1.0390625, + "learning_rate": 5.716231412866524e-05, + "loss": 1.0519, + "step": 4650 + }, + { + "epoch": 0.7332387422248975, + "grad_norm": 0.86328125, + "learning_rate": 5.715815423318447e-05, + "loss": 0.8023, + "step": 4651 + }, + { + "epoch": 0.7333963940722905, + "grad_norm": 0.9140625, + "learning_rate": 5.715399442850454e-05, + "loss": 1.0471, + "step": 4652 + }, + { + "epoch": 0.7335540459196835, + "grad_norm": 0.91796875, + "learning_rate": 5.714983471463428e-05, + "loss": 0.9293, + "step": 4653 + }, + { + "epoch": 0.7337116977670766, + "grad_norm": 0.7734375, + "learning_rate": 5.7145675091582485e-05, + "loss": 0.7455, + "step": 4654 + }, + { + "epoch": 0.7338693496144696, + "grad_norm": 1.015625, + "learning_rate": 5.714151555935798e-05, + "loss": 1.1761, + "step": 4655 + }, + { + "epoch": 0.7340270014618626, + "grad_norm": 0.9453125, + "learning_rate": 5.71373561179696e-05, + "loss": 0.9818, + "step": 4656 + }, + { + "epoch": 0.7341846533092556, + "grad_norm": 0.97265625, + "learning_rate": 5.713319676742609e-05, + "loss": 1.157, + "step": 4657 + }, + { + "epoch": 0.7343423051566486, + "grad_norm": 0.9609375, + "learning_rate": 5.712903750773637e-05, + "loss": 0.91, + "step": 4658 + }, + { + "epoch": 0.7344999570040416, + "grad_norm": 0.95703125, + "learning_rate": 5.712487833890919e-05, + "loss": 0.9129, + "step": 4659 + }, + { + "epoch": 0.7346576088514346, + "grad_norm": 0.96875, + "learning_rate": 5.712071926095339e-05, + "loss": 0.9001, + "step": 4660 + }, + { + "epoch": 0.7348152606988276, + "grad_norm": 1.03125, + "learning_rate": 5.7116560273877775e-05, + "loss": 0.7676, + "step": 4661 + }, + { + "epoch": 0.7349729125462207, + "grad_norm": 0.984375, + "learning_rate": 5.711240137769116e-05, + "loss": 1.1658, + "step": 4662 + }, + { + "epoch": 0.7351305643936137, + "grad_norm": 1.1328125, + "learning_rate": 5.7108242572402346e-05, + "loss": 0.9517, + "step": 4663 + }, + { + "epoch": 0.7352882162410067, + "grad_norm": 1.0078125, + "learning_rate": 5.710408385802017e-05, + "loss": 1.0213, + "step": 4664 + }, + { + "epoch": 0.7354458680883997, + "grad_norm": 1.0703125, + "learning_rate": 5.709992523455341e-05, + "loss": 1.0087, + "step": 4665 + }, + { + "epoch": 0.7356035199357926, + "grad_norm": 0.8359375, + "learning_rate": 5.7095766702010935e-05, + "loss": 0.7306, + "step": 4666 + }, + { + "epoch": 0.7357611717831857, + "grad_norm": 0.94921875, + "learning_rate": 5.709160826040152e-05, + "loss": 0.9112, + "step": 4667 + }, + { + "epoch": 0.7359188236305787, + "grad_norm": 0.84765625, + "learning_rate": 5.7087449909734005e-05, + "loss": 0.7613, + "step": 4668 + }, + { + "epoch": 0.7360764754779717, + "grad_norm": 0.94921875, + "learning_rate": 5.708329165001717e-05, + "loss": 0.9827, + "step": 4669 + }, + { + "epoch": 0.7362341273253648, + "grad_norm": 1.0234375, + "learning_rate": 5.707913348125982e-05, + "loss": 0.9918, + "step": 4670 + }, + { + "epoch": 0.7363917791727578, + "grad_norm": 0.9453125, + "learning_rate": 5.707497540347082e-05, + "loss": 0.9083, + "step": 4671 + }, + { + "epoch": 0.7365494310201508, + "grad_norm": 1.2265625, + "learning_rate": 5.707081741665896e-05, + "loss": 1.0363, + "step": 4672 + }, + { + "epoch": 0.7367070828675438, + "grad_norm": 1.0078125, + "learning_rate": 5.706665952083305e-05, + "loss": 0.9994, + "step": 4673 + }, + { + "epoch": 0.7368647347149367, + "grad_norm": 0.921875, + "learning_rate": 5.706250171600192e-05, + "loss": 0.9276, + "step": 4674 + }, + { + "epoch": 0.7370223865623298, + "grad_norm": 0.94140625, + "learning_rate": 5.70583440021743e-05, + "loss": 0.8835, + "step": 4675 + }, + { + "epoch": 0.7371800384097228, + "grad_norm": 0.9296875, + "learning_rate": 5.705418637935911e-05, + "loss": 1.1272, + "step": 4676 + }, + { + "epoch": 0.7373376902571158, + "grad_norm": 0.96484375, + "learning_rate": 5.7050028847565115e-05, + "loss": 0.8556, + "step": 4677 + }, + { + "epoch": 0.7374953421045088, + "grad_norm": 0.87109375, + "learning_rate": 5.704587140680114e-05, + "loss": 0.9852, + "step": 4678 + }, + { + "epoch": 0.7376529939519019, + "grad_norm": 0.9296875, + "learning_rate": 5.704171405707598e-05, + "loss": 1.0377, + "step": 4679 + }, + { + "epoch": 0.7378106457992949, + "grad_norm": 0.96484375, + "learning_rate": 5.703755679839842e-05, + "loss": 0.9971, + "step": 4680 + }, + { + "epoch": 0.7379682976466879, + "grad_norm": 0.91796875, + "learning_rate": 5.703339963077733e-05, + "loss": 1.0554, + "step": 4681 + }, + { + "epoch": 0.7381259494940808, + "grad_norm": 0.890625, + "learning_rate": 5.70292425542215e-05, + "loss": 0.9929, + "step": 4682 + }, + { + "epoch": 0.7382836013414739, + "grad_norm": 0.84375, + "learning_rate": 5.702508556873973e-05, + "loss": 0.8728, + "step": 4683 + }, + { + "epoch": 0.7384412531888669, + "grad_norm": 0.84765625, + "learning_rate": 5.702092867434084e-05, + "loss": 0.8815, + "step": 4684 + }, + { + "epoch": 0.7385989050362599, + "grad_norm": 0.88671875, + "learning_rate": 5.701677187103358e-05, + "loss": 0.967, + "step": 4685 + }, + { + "epoch": 0.7387565568836529, + "grad_norm": 0.890625, + "learning_rate": 5.7012615158826875e-05, + "loss": 1.0705, + "step": 4686 + }, + { + "epoch": 0.738914208731046, + "grad_norm": 0.98046875, + "learning_rate": 5.700845853772947e-05, + "loss": 0.9555, + "step": 4687 + }, + { + "epoch": 0.739071860578439, + "grad_norm": 0.94921875, + "learning_rate": 5.700430200775018e-05, + "loss": 0.994, + "step": 4688 + }, + { + "epoch": 0.739229512425832, + "grad_norm": 0.80859375, + "learning_rate": 5.70001455688978e-05, + "loss": 0.8522, + "step": 4689 + }, + { + "epoch": 0.7393871642732249, + "grad_norm": 0.82421875, + "learning_rate": 5.6995989221181124e-05, + "loss": 0.8654, + "step": 4690 + }, + { + "epoch": 0.739544816120618, + "grad_norm": 0.87890625, + "learning_rate": 5.6991832964609035e-05, + "loss": 0.8621, + "step": 4691 + }, + { + "epoch": 0.739702467968011, + "grad_norm": 0.953125, + "learning_rate": 5.6987676799190295e-05, + "loss": 0.9141, + "step": 4692 + }, + { + "epoch": 0.739860119815404, + "grad_norm": 0.98046875, + "learning_rate": 5.698352072493371e-05, + "loss": 0.8904, + "step": 4693 + }, + { + "epoch": 0.740017771662797, + "grad_norm": 0.9453125, + "learning_rate": 5.6979364741848105e-05, + "loss": 1.0103, + "step": 4694 + }, + { + "epoch": 0.7401754235101901, + "grad_norm": 0.98046875, + "learning_rate": 5.697520884994223e-05, + "loss": 0.902, + "step": 4695 + }, + { + "epoch": 0.7403330753575831, + "grad_norm": 0.9375, + "learning_rate": 5.6971053049224987e-05, + "loss": 1.0204, + "step": 4696 + }, + { + "epoch": 0.7404907272049761, + "grad_norm": 0.89453125, + "learning_rate": 5.696689733970513e-05, + "loss": 0.9853, + "step": 4697 + }, + { + "epoch": 0.7406483790523691, + "grad_norm": 0.91796875, + "learning_rate": 5.6962741721391474e-05, + "loss": 1.0159, + "step": 4698 + }, + { + "epoch": 0.740806030899762, + "grad_norm": 1.0234375, + "learning_rate": 5.695858619429284e-05, + "loss": 0.9521, + "step": 4699 + }, + { + "epoch": 0.7409636827471551, + "grad_norm": 1.0859375, + "learning_rate": 5.695443075841801e-05, + "loss": 0.9416, + "step": 4700 + }, + { + "epoch": 0.7411213345945481, + "grad_norm": 1.0546875, + "learning_rate": 5.69502754137758e-05, + "loss": 0.971, + "step": 4701 + }, + { + "epoch": 0.7412789864419411, + "grad_norm": 1.21875, + "learning_rate": 5.694612016037504e-05, + "loss": 1.0025, + "step": 4702 + }, + { + "epoch": 0.7414366382893341, + "grad_norm": 1.0078125, + "learning_rate": 5.694196499822449e-05, + "loss": 0.9848, + "step": 4703 + }, + { + "epoch": 0.7415942901367272, + "grad_norm": 0.921875, + "learning_rate": 5.6937809927333006e-05, + "loss": 0.9557, + "step": 4704 + }, + { + "epoch": 0.7417519419841202, + "grad_norm": 0.859375, + "learning_rate": 5.6933654947709326e-05, + "loss": 0.8627, + "step": 4705 + }, + { + "epoch": 0.7419095938315132, + "grad_norm": 1.2109375, + "learning_rate": 5.692950005936234e-05, + "loss": 1.1389, + "step": 4706 + }, + { + "epoch": 0.7420672456789061, + "grad_norm": 0.87109375, + "learning_rate": 5.692534526230082e-05, + "loss": 0.946, + "step": 4707 + }, + { + "epoch": 0.7422248975262992, + "grad_norm": 0.953125, + "learning_rate": 5.692119055653358e-05, + "loss": 0.9587, + "step": 4708 + }, + { + "epoch": 0.7423825493736922, + "grad_norm": 1.0078125, + "learning_rate": 5.6917035942069386e-05, + "loss": 0.9721, + "step": 4709 + }, + { + "epoch": 0.7425402012210852, + "grad_norm": 0.9296875, + "learning_rate": 5.6912881418917054e-05, + "loss": 0.9636, + "step": 4710 + }, + { + "epoch": 0.7426978530684782, + "grad_norm": 0.89453125, + "learning_rate": 5.6908726987085444e-05, + "loss": 0.8739, + "step": 4711 + }, + { + "epoch": 0.7428555049158713, + "grad_norm": 0.90625, + "learning_rate": 5.690457264658331e-05, + "loss": 0.9868, + "step": 4712 + }, + { + "epoch": 0.7430131567632643, + "grad_norm": 1.0859375, + "learning_rate": 5.690041839741949e-05, + "loss": 0.9563, + "step": 4713 + }, + { + "epoch": 0.7431708086106573, + "grad_norm": 0.9140625, + "learning_rate": 5.689626423960277e-05, + "loss": 0.8799, + "step": 4714 + }, + { + "epoch": 0.7433284604580502, + "grad_norm": 0.94921875, + "learning_rate": 5.68921101731419e-05, + "loss": 1.0575, + "step": 4715 + }, + { + "epoch": 0.7434861123054433, + "grad_norm": 0.91796875, + "learning_rate": 5.688795619804579e-05, + "loss": 1.0697, + "step": 4716 + }, + { + "epoch": 0.7436437641528363, + "grad_norm": 0.9609375, + "learning_rate": 5.688380231432318e-05, + "loss": 1.1015, + "step": 4717 + }, + { + "epoch": 0.7438014160002293, + "grad_norm": 0.921875, + "learning_rate": 5.6879648521982896e-05, + "loss": 1.0351, + "step": 4718 + }, + { + "epoch": 0.7439590678476223, + "grad_norm": 1.03125, + "learning_rate": 5.6875494821033737e-05, + "loss": 1.0856, + "step": 4719 + }, + { + "epoch": 0.7441167196950154, + "grad_norm": 1.109375, + "learning_rate": 5.687134121148445e-05, + "loss": 1.0032, + "step": 4720 + }, + { + "epoch": 0.7442743715424084, + "grad_norm": 0.87109375, + "learning_rate": 5.6867187693343936e-05, + "loss": 0.842, + "step": 4721 + }, + { + "epoch": 0.7444320233898014, + "grad_norm": 1.671875, + "learning_rate": 5.6863034266620954e-05, + "loss": 0.7749, + "step": 4722 + }, + { + "epoch": 0.7445896752371943, + "grad_norm": 1.0390625, + "learning_rate": 5.68588809313243e-05, + "loss": 1.245, + "step": 4723 + }, + { + "epoch": 0.7447473270845874, + "grad_norm": 1.015625, + "learning_rate": 5.6854727687462786e-05, + "loss": 1.0012, + "step": 4724 + }, + { + "epoch": 0.7449049789319804, + "grad_norm": 1.03125, + "learning_rate": 5.685057453504516e-05, + "loss": 1.0731, + "step": 4725 + }, + { + "epoch": 0.7450626307793734, + "grad_norm": 1.0078125, + "learning_rate": 5.684642147408034e-05, + "loss": 0.8519, + "step": 4726 + }, + { + "epoch": 0.7452202826267664, + "grad_norm": 0.87890625, + "learning_rate": 5.6842268504577034e-05, + "loss": 0.9437, + "step": 4727 + }, + { + "epoch": 0.7453779344741595, + "grad_norm": 0.91015625, + "learning_rate": 5.6838115626544086e-05, + "loss": 0.919, + "step": 4728 + }, + { + "epoch": 0.7455355863215525, + "grad_norm": 0.98046875, + "learning_rate": 5.6833962839990296e-05, + "loss": 1.1763, + "step": 4729 + }, + { + "epoch": 0.7456932381689455, + "grad_norm": 0.9609375, + "learning_rate": 5.68298101449244e-05, + "loss": 1.0592, + "step": 4730 + }, + { + "epoch": 0.7458508900163384, + "grad_norm": 0.953125, + "learning_rate": 5.682565754135531e-05, + "loss": 1.104, + "step": 4731 + }, + { + "epoch": 0.7460085418637314, + "grad_norm": 1.0, + "learning_rate": 5.682150502929175e-05, + "loss": 1.0742, + "step": 4732 + }, + { + "epoch": 0.7461661937111245, + "grad_norm": 1.1796875, + "learning_rate": 5.6817352608742546e-05, + "loss": 0.9803, + "step": 4733 + }, + { + "epoch": 0.7463238455585175, + "grad_norm": 0.8671875, + "learning_rate": 5.6813200279716503e-05, + "loss": 0.9692, + "step": 4734 + }, + { + "epoch": 0.7464814974059105, + "grad_norm": 0.79296875, + "learning_rate": 5.680904804222238e-05, + "loss": 0.7481, + "step": 4735 + }, + { + "epoch": 0.7466391492533035, + "grad_norm": 0.9765625, + "learning_rate": 5.6804895896269036e-05, + "loss": 1.149, + "step": 4736 + }, + { + "epoch": 0.7467968011006966, + "grad_norm": 1.0546875, + "learning_rate": 5.680074384186526e-05, + "loss": 1.2308, + "step": 4737 + }, + { + "epoch": 0.7469544529480896, + "grad_norm": 0.94921875, + "learning_rate": 5.6796591879019825e-05, + "loss": 0.938, + "step": 4738 + }, + { + "epoch": 0.7471121047954825, + "grad_norm": 0.90234375, + "learning_rate": 5.6792440007741554e-05, + "loss": 0.9796, + "step": 4739 + }, + { + "epoch": 0.7472697566428755, + "grad_norm": 0.8984375, + "learning_rate": 5.6788288228039234e-05, + "loss": 1.0304, + "step": 4740 + }, + { + "epoch": 0.7474274084902686, + "grad_norm": 0.88671875, + "learning_rate": 5.678413653992166e-05, + "loss": 0.8708, + "step": 4741 + }, + { + "epoch": 0.7475850603376616, + "grad_norm": 1.0625, + "learning_rate": 5.67799849433976e-05, + "loss": 0.9206, + "step": 4742 + }, + { + "epoch": 0.7477427121850546, + "grad_norm": 0.96875, + "learning_rate": 5.6775833438475946e-05, + "loss": 1.2325, + "step": 4743 + }, + { + "epoch": 0.7479003640324476, + "grad_norm": 0.9453125, + "learning_rate": 5.677168202516543e-05, + "loss": 1.0454, + "step": 4744 + }, + { + "epoch": 0.7480580158798407, + "grad_norm": 0.93359375, + "learning_rate": 5.6767530703474866e-05, + "loss": 1.0683, + "step": 4745 + }, + { + "epoch": 0.7482156677272337, + "grad_norm": 0.8984375, + "learning_rate": 5.6763379473413045e-05, + "loss": 0.9489, + "step": 4746 + }, + { + "epoch": 0.7483733195746266, + "grad_norm": 0.93359375, + "learning_rate": 5.6759228334988764e-05, + "loss": 1.0223, + "step": 4747 + }, + { + "epoch": 0.7485309714220196, + "grad_norm": 0.88671875, + "learning_rate": 5.6755077288210846e-05, + "loss": 0.9622, + "step": 4748 + }, + { + "epoch": 0.7486886232694127, + "grad_norm": 0.91015625, + "learning_rate": 5.675092633308806e-05, + "loss": 1.1073, + "step": 4749 + }, + { + "epoch": 0.7488462751168057, + "grad_norm": 0.90234375, + "learning_rate": 5.674677546962918e-05, + "loss": 1.1217, + "step": 4750 + }, + { + "epoch": 0.7490039269641987, + "grad_norm": 0.88671875, + "learning_rate": 5.674262469784306e-05, + "loss": 0.9063, + "step": 4751 + }, + { + "epoch": 0.7491615788115917, + "grad_norm": 1.0625, + "learning_rate": 5.67384740177385e-05, + "loss": 0.9443, + "step": 4752 + }, + { + "epoch": 0.7493192306589848, + "grad_norm": 0.91015625, + "learning_rate": 5.673432342932425e-05, + "loss": 0.869, + "step": 4753 + }, + { + "epoch": 0.7494768825063778, + "grad_norm": 0.8984375, + "learning_rate": 5.673017293260914e-05, + "loss": 0.9956, + "step": 4754 + }, + { + "epoch": 0.7496345343537707, + "grad_norm": 1.0078125, + "learning_rate": 5.672602252760191e-05, + "loss": 1.0084, + "step": 4755 + }, + { + "epoch": 0.7497921862011637, + "grad_norm": 0.94140625, + "learning_rate": 5.6721872214311445e-05, + "loss": 0.9403, + "step": 4756 + }, + { + "epoch": 0.7499498380485567, + "grad_norm": 0.90625, + "learning_rate": 5.67177219927465e-05, + "loss": 0.9248, + "step": 4757 + }, + { + "epoch": 0.7501074898959498, + "grad_norm": 1.0078125, + "learning_rate": 5.671357186291586e-05, + "loss": 0.9568, + "step": 4758 + }, + { + "epoch": 0.7502651417433428, + "grad_norm": 0.91796875, + "learning_rate": 5.6709421824828345e-05, + "loss": 0.9258, + "step": 4759 + }, + { + "epoch": 0.7504227935907358, + "grad_norm": 0.90234375, + "learning_rate": 5.670527187849268e-05, + "loss": 0.9776, + "step": 4760 + }, + { + "epoch": 0.7505804454381289, + "grad_norm": 0.83984375, + "learning_rate": 5.670112202391776e-05, + "loss": 1.0012, + "step": 4761 + }, + { + "epoch": 0.7507380972855219, + "grad_norm": 0.9296875, + "learning_rate": 5.6696972261112346e-05, + "loss": 1.1628, + "step": 4762 + }, + { + "epoch": 0.7508957491329148, + "grad_norm": 0.99609375, + "learning_rate": 5.669282259008521e-05, + "loss": 1.0154, + "step": 4763 + }, + { + "epoch": 0.7510534009803078, + "grad_norm": 0.9609375, + "learning_rate": 5.6688673010845184e-05, + "loss": 1.1107, + "step": 4764 + }, + { + "epoch": 0.7512110528277008, + "grad_norm": 1.0234375, + "learning_rate": 5.668452352340099e-05, + "loss": 1.017, + "step": 4765 + }, + { + "epoch": 0.7513687046750939, + "grad_norm": 0.91015625, + "learning_rate": 5.6680374127761506e-05, + "loss": 0.9244, + "step": 4766 + }, + { + "epoch": 0.7515263565224869, + "grad_norm": 1.28125, + "learning_rate": 5.667622482393549e-05, + "loss": 1.0216, + "step": 4767 + }, + { + "epoch": 0.7516840083698799, + "grad_norm": 0.9921875, + "learning_rate": 5.667207561193175e-05, + "loss": 1.1673, + "step": 4768 + }, + { + "epoch": 0.751841660217273, + "grad_norm": 0.83203125, + "learning_rate": 5.666792649175906e-05, + "loss": 1.019, + "step": 4769 + }, + { + "epoch": 0.751999312064666, + "grad_norm": 0.85546875, + "learning_rate": 5.6663777463426194e-05, + "loss": 0.8572, + "step": 4770 + }, + { + "epoch": 0.7521569639120589, + "grad_norm": 0.81640625, + "learning_rate": 5.6659628526942e-05, + "loss": 0.8351, + "step": 4771 + }, + { + "epoch": 0.7523146157594519, + "grad_norm": 0.8984375, + "learning_rate": 5.665547968231526e-05, + "loss": 1.1263, + "step": 4772 + }, + { + "epoch": 0.7524722676068449, + "grad_norm": 0.8671875, + "learning_rate": 5.6651330929554745e-05, + "loss": 0.8592, + "step": 4773 + }, + { + "epoch": 0.752629919454238, + "grad_norm": 1.078125, + "learning_rate": 5.664718226866926e-05, + "loss": 0.9418, + "step": 4774 + }, + { + "epoch": 0.752787571301631, + "grad_norm": 0.86328125, + "learning_rate": 5.664303369966759e-05, + "loss": 0.9023, + "step": 4775 + }, + { + "epoch": 0.752945223149024, + "grad_norm": 1.0859375, + "learning_rate": 5.66388852225585e-05, + "loss": 1.1438, + "step": 4776 + }, + { + "epoch": 0.753102874996417, + "grad_norm": 0.93359375, + "learning_rate": 5.663473683735085e-05, + "loss": 1.002, + "step": 4777 + }, + { + "epoch": 0.7532605268438101, + "grad_norm": 0.93359375, + "learning_rate": 5.6630588544053396e-05, + "loss": 1.0069, + "step": 4778 + }, + { + "epoch": 0.753418178691203, + "grad_norm": 0.9453125, + "learning_rate": 5.6626440342674924e-05, + "loss": 0.8408, + "step": 4779 + }, + { + "epoch": 0.753575830538596, + "grad_norm": 1.078125, + "learning_rate": 5.662229223322425e-05, + "loss": 1.183, + "step": 4780 + }, + { + "epoch": 0.753733482385989, + "grad_norm": 1.1328125, + "learning_rate": 5.66181442157101e-05, + "loss": 1.1869, + "step": 4781 + }, + { + "epoch": 0.753891134233382, + "grad_norm": 0.921875, + "learning_rate": 5.6613996290141344e-05, + "loss": 0.8942, + "step": 4782 + }, + { + "epoch": 0.7540487860807751, + "grad_norm": 0.9609375, + "learning_rate": 5.660984845652675e-05, + "loss": 0.8832, + "step": 4783 + }, + { + "epoch": 0.7542064379281681, + "grad_norm": 0.98828125, + "learning_rate": 5.660570071487511e-05, + "loss": 1.1572, + "step": 4784 + }, + { + "epoch": 0.7543640897755611, + "grad_norm": 0.86328125, + "learning_rate": 5.660155306519519e-05, + "loss": 0.9075, + "step": 4785 + }, + { + "epoch": 0.7545217416229542, + "grad_norm": 1.609375, + "learning_rate": 5.6597405507495814e-05, + "loss": 0.9227, + "step": 4786 + }, + { + "epoch": 0.7546793934703471, + "grad_norm": 0.91015625, + "learning_rate": 5.6593258041785746e-05, + "loss": 1.1666, + "step": 4787 + }, + { + "epoch": 0.7548370453177401, + "grad_norm": 0.94921875, + "learning_rate": 5.6589110668073795e-05, + "loss": 1.0032, + "step": 4788 + }, + { + "epoch": 0.7549946971651331, + "grad_norm": 0.9375, + "learning_rate": 5.658496338636874e-05, + "loss": 0.9914, + "step": 4789 + }, + { + "epoch": 0.7551523490125261, + "grad_norm": 1.1640625, + "learning_rate": 5.658081619667933e-05, + "loss": 0.9509, + "step": 4790 + }, + { + "epoch": 0.7553100008599192, + "grad_norm": 0.859375, + "learning_rate": 5.657666909901444e-05, + "loss": 0.8592, + "step": 4791 + }, + { + "epoch": 0.7554676527073122, + "grad_norm": 0.9921875, + "learning_rate": 5.6572522093382815e-05, + "loss": 1.1908, + "step": 4792 + }, + { + "epoch": 0.7556253045547052, + "grad_norm": 0.921875, + "learning_rate": 5.656837517979326e-05, + "loss": 1.0601, + "step": 4793 + }, + { + "epoch": 0.7557829564020982, + "grad_norm": 0.96484375, + "learning_rate": 5.6564228358254536e-05, + "loss": 0.8806, + "step": 4794 + }, + { + "epoch": 0.7559406082494912, + "grad_norm": 0.953125, + "learning_rate": 5.656008162877545e-05, + "loss": 1.0074, + "step": 4795 + }, + { + "epoch": 0.7560982600968842, + "grad_norm": 0.89453125, + "learning_rate": 5.655593499136476e-05, + "loss": 0.9661, + "step": 4796 + }, + { + "epoch": 0.7562559119442772, + "grad_norm": 0.80078125, + "learning_rate": 5.6551788446031304e-05, + "loss": 0.7427, + "step": 4797 + }, + { + "epoch": 0.7564135637916702, + "grad_norm": 1.0546875, + "learning_rate": 5.6547641992783864e-05, + "loss": 0.8946, + "step": 4798 + }, + { + "epoch": 0.7565712156390633, + "grad_norm": 0.984375, + "learning_rate": 5.6543495631631206e-05, + "loss": 1.28, + "step": 4799 + }, + { + "epoch": 0.7567288674864563, + "grad_norm": 0.84765625, + "learning_rate": 5.6539349362582126e-05, + "loss": 1.0501, + "step": 4800 + }, + { + "epoch": 0.7568865193338493, + "grad_norm": 0.98828125, + "learning_rate": 5.653520318564538e-05, + "loss": 1.0795, + "step": 4801 + }, + { + "epoch": 0.7570441711812423, + "grad_norm": 0.9140625, + "learning_rate": 5.6531057100829807e-05, + "loss": 0.9369, + "step": 4802 + }, + { + "epoch": 0.7572018230286353, + "grad_norm": 0.875, + "learning_rate": 5.6526911108144185e-05, + "loss": 0.8539, + "step": 4803 + }, + { + "epoch": 0.7573594748760283, + "grad_norm": 1.015625, + "learning_rate": 5.65227652075973e-05, + "loss": 1.0506, + "step": 4804 + }, + { + "epoch": 0.7575171267234213, + "grad_norm": 0.87109375, + "learning_rate": 5.651861939919792e-05, + "loss": 1.0172, + "step": 4805 + }, + { + "epoch": 0.7576747785708143, + "grad_norm": 0.99609375, + "learning_rate": 5.651447368295479e-05, + "loss": 1.0218, + "step": 4806 + }, + { + "epoch": 0.7578324304182074, + "grad_norm": 0.91796875, + "learning_rate": 5.65103280588768e-05, + "loss": 0.9427, + "step": 4807 + }, + { + "epoch": 0.7579900822656004, + "grad_norm": 0.9296875, + "learning_rate": 5.650618252697269e-05, + "loss": 0.9399, + "step": 4808 + }, + { + "epoch": 0.7581477341129934, + "grad_norm": 0.88671875, + "learning_rate": 5.6502037087251234e-05, + "loss": 0.8983, + "step": 4809 + }, + { + "epoch": 0.7583053859603864, + "grad_norm": 1.015625, + "learning_rate": 5.649789173972122e-05, + "loss": 1.0003, + "step": 4810 + }, + { + "epoch": 0.7584630378077793, + "grad_norm": 0.8828125, + "learning_rate": 5.649374648439139e-05, + "loss": 1.1657, + "step": 4811 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 1.1796875, + "learning_rate": 5.648960132127064e-05, + "loss": 0.9845, + "step": 4812 + }, + { + "epoch": 0.7587783415025654, + "grad_norm": 3.25, + "learning_rate": 5.648545625036767e-05, + "loss": 1.044, + "step": 4813 + }, + { + "epoch": 0.7589359933499584, + "grad_norm": 1.140625, + "learning_rate": 5.648131127169131e-05, + "loss": 0.9187, + "step": 4814 + }, + { + "epoch": 0.7590936451973515, + "grad_norm": 0.9453125, + "learning_rate": 5.647716638525031e-05, + "loss": 1.0713, + "step": 4815 + }, + { + "epoch": 0.7592512970447445, + "grad_norm": 0.921875, + "learning_rate": 5.6473021591053435e-05, + "loss": 0.9249, + "step": 4816 + }, + { + "epoch": 0.7594089488921375, + "grad_norm": 0.9140625, + "learning_rate": 5.646887688910954e-05, + "loss": 0.8056, + "step": 4817 + }, + { + "epoch": 0.7595666007395305, + "grad_norm": 0.953125, + "learning_rate": 5.646473227942737e-05, + "loss": 1.2723, + "step": 4818 + }, + { + "epoch": 0.7597242525869234, + "grad_norm": 0.91015625, + "learning_rate": 5.6460587762015705e-05, + "loss": 0.9943, + "step": 4819 + }, + { + "epoch": 0.7598819044343165, + "grad_norm": 1.015625, + "learning_rate": 5.645644333688335e-05, + "loss": 0.968, + "step": 4820 + }, + { + "epoch": 0.7600395562817095, + "grad_norm": 0.90625, + "learning_rate": 5.6452299004039025e-05, + "loss": 0.9611, + "step": 4821 + }, + { + "epoch": 0.7601972081291025, + "grad_norm": 1.125, + "learning_rate": 5.644815476349161e-05, + "loss": 1.3164, + "step": 4822 + }, + { + "epoch": 0.7603548599764955, + "grad_norm": 1.015625, + "learning_rate": 5.644401061524983e-05, + "loss": 1.1447, + "step": 4823 + }, + { + "epoch": 0.7605125118238886, + "grad_norm": 1.015625, + "learning_rate": 5.6439866559322494e-05, + "loss": 1.0321, + "step": 4824 + }, + { + "epoch": 0.7606701636712816, + "grad_norm": 0.90234375, + "learning_rate": 5.6435722595718366e-05, + "loss": 0.9234, + "step": 4825 + }, + { + "epoch": 0.7608278155186746, + "grad_norm": 0.99609375, + "learning_rate": 5.643157872444622e-05, + "loss": 0.8942, + "step": 4826 + }, + { + "epoch": 0.7609854673660675, + "grad_norm": 1.0078125, + "learning_rate": 5.642743494551488e-05, + "loss": 1.1226, + "step": 4827 + }, + { + "epoch": 0.7611431192134606, + "grad_norm": 0.9140625, + "learning_rate": 5.642329125893304e-05, + "loss": 0.9367, + "step": 4828 + }, + { + "epoch": 0.7613007710608536, + "grad_norm": 1.0234375, + "learning_rate": 5.641914766470959e-05, + "loss": 0.9956, + "step": 4829 + }, + { + "epoch": 0.7614584229082466, + "grad_norm": 1.0234375, + "learning_rate": 5.641500416285326e-05, + "loss": 0.8958, + "step": 4830 + }, + { + "epoch": 0.7616160747556396, + "grad_norm": 1.0234375, + "learning_rate": 5.641086075337285e-05, + "loss": 0.9933, + "step": 4831 + }, + { + "epoch": 0.7617737266030327, + "grad_norm": 0.94921875, + "learning_rate": 5.640671743627712e-05, + "loss": 0.8069, + "step": 4832 + }, + { + "epoch": 0.7619313784504257, + "grad_norm": 0.90625, + "learning_rate": 5.640257421157487e-05, + "loss": 1.0776, + "step": 4833 + }, + { + "epoch": 0.7620890302978187, + "grad_norm": 0.92578125, + "learning_rate": 5.639843107927485e-05, + "loss": 0.968, + "step": 4834 + }, + { + "epoch": 0.7622466821452116, + "grad_norm": 1.03125, + "learning_rate": 5.639428803938589e-05, + "loss": 1.1492, + "step": 4835 + }, + { + "epoch": 0.7624043339926047, + "grad_norm": 0.9375, + "learning_rate": 5.639014509191669e-05, + "loss": 0.8815, + "step": 4836 + }, + { + "epoch": 0.7625619858399977, + "grad_norm": 0.98828125, + "learning_rate": 5.638600223687612e-05, + "loss": 0.9549, + "step": 4837 + }, + { + "epoch": 0.7627196376873907, + "grad_norm": 1.6953125, + "learning_rate": 5.6381859474272935e-05, + "loss": 1.0474, + "step": 4838 + }, + { + "epoch": 0.7628772895347837, + "grad_norm": 0.86328125, + "learning_rate": 5.63777168041159e-05, + "loss": 0.858, + "step": 4839 + }, + { + "epoch": 0.7630349413821768, + "grad_norm": 0.9921875, + "learning_rate": 5.6373574226413805e-05, + "loss": 1.1111, + "step": 4840 + }, + { + "epoch": 0.7631925932295698, + "grad_norm": 0.9296875, + "learning_rate": 5.636943174117539e-05, + "loss": 1.0617, + "step": 4841 + }, + { + "epoch": 0.7633502450769628, + "grad_norm": 1.0390625, + "learning_rate": 5.63652893484095e-05, + "loss": 0.9691, + "step": 4842 + }, + { + "epoch": 0.7635078969243557, + "grad_norm": 0.9765625, + "learning_rate": 5.6361147048124895e-05, + "loss": 0.8672, + "step": 4843 + }, + { + "epoch": 0.7636655487717487, + "grad_norm": 1.3125, + "learning_rate": 5.635700484033034e-05, + "loss": 0.8289, + "step": 4844 + }, + { + "epoch": 0.7638232006191418, + "grad_norm": 1.0, + "learning_rate": 5.635286272503462e-05, + "loss": 1.026, + "step": 4845 + }, + { + "epoch": 0.7639808524665348, + "grad_norm": 0.8828125, + "learning_rate": 5.6348720702246474e-05, + "loss": 0.9966, + "step": 4846 + }, + { + "epoch": 0.7641385043139278, + "grad_norm": 0.9296875, + "learning_rate": 5.634457877197475e-05, + "loss": 1.0871, + "step": 4847 + }, + { + "epoch": 0.7642961561613208, + "grad_norm": 0.9140625, + "learning_rate": 5.634043693422821e-05, + "loss": 1.0065, + "step": 4848 + }, + { + "epoch": 0.7644538080087139, + "grad_norm": 1.0, + "learning_rate": 5.633629518901561e-05, + "loss": 0.9696, + "step": 4849 + }, + { + "epoch": 0.7646114598561069, + "grad_norm": 0.93359375, + "learning_rate": 5.6332153536345735e-05, + "loss": 0.8475, + "step": 4850 + }, + { + "epoch": 0.7647691117034999, + "grad_norm": 1.0078125, + "learning_rate": 5.632801197622732e-05, + "loss": 1.0268, + "step": 4851 + }, + { + "epoch": 0.7649267635508928, + "grad_norm": 0.96875, + "learning_rate": 5.6323870508669254e-05, + "loss": 1.1305, + "step": 4852 + }, + { + "epoch": 0.7650844153982859, + "grad_norm": 1.015625, + "learning_rate": 5.631972913368022e-05, + "loss": 0.9686, + "step": 4853 + }, + { + "epoch": 0.7652420672456789, + "grad_norm": 0.96875, + "learning_rate": 5.631558785126904e-05, + "loss": 1.1917, + "step": 4854 + }, + { + "epoch": 0.7653997190930719, + "grad_norm": 0.78515625, + "learning_rate": 5.6311446661444454e-05, + "loss": 0.8221, + "step": 4855 + }, + { + "epoch": 0.7655573709404649, + "grad_norm": 0.890625, + "learning_rate": 5.630730556421524e-05, + "loss": 0.8045, + "step": 4856 + }, + { + "epoch": 0.765715022787858, + "grad_norm": 0.89453125, + "learning_rate": 5.630316455959024e-05, + "loss": 0.9608, + "step": 4857 + }, + { + "epoch": 0.765872674635251, + "grad_norm": 0.9453125, + "learning_rate": 5.629902364757816e-05, + "loss": 0.9387, + "step": 4858 + }, + { + "epoch": 0.766030326482644, + "grad_norm": 0.921875, + "learning_rate": 5.6294882828187826e-05, + "loss": 1.0019, + "step": 4859 + }, + { + "epoch": 0.7661879783300369, + "grad_norm": 1.0625, + "learning_rate": 5.6290742101427975e-05, + "loss": 1.0928, + "step": 4860 + }, + { + "epoch": 0.76634563017743, + "grad_norm": 0.921875, + "learning_rate": 5.628660146730735e-05, + "loss": 1.106, + "step": 4861 + }, + { + "epoch": 0.766503282024823, + "grad_norm": 0.9375, + "learning_rate": 5.6282460925834834e-05, + "loss": 0.8749, + "step": 4862 + }, + { + "epoch": 0.766660933872216, + "grad_norm": 0.96875, + "learning_rate": 5.627832047701913e-05, + "loss": 0.7704, + "step": 4863 + }, + { + "epoch": 0.766818585719609, + "grad_norm": 1.0, + "learning_rate": 5.6274180120869036e-05, + "loss": 0.8896, + "step": 4864 + }, + { + "epoch": 0.7669762375670021, + "grad_norm": 0.953125, + "learning_rate": 5.627003985739332e-05, + "loss": 0.8467, + "step": 4865 + }, + { + "epoch": 0.7671338894143951, + "grad_norm": 0.90625, + "learning_rate": 5.6265899686600696e-05, + "loss": 0.8383, + "step": 4866 + }, + { + "epoch": 0.7672915412617881, + "grad_norm": 0.96875, + "learning_rate": 5.6261759608500065e-05, + "loss": 0.9786, + "step": 4867 + }, + { + "epoch": 0.767449193109181, + "grad_norm": 0.953125, + "learning_rate": 5.625761962310011e-05, + "loss": 1.0446, + "step": 4868 + }, + { + "epoch": 0.767606844956574, + "grad_norm": 1.140625, + "learning_rate": 5.6253479730409645e-05, + "loss": 1.0828, + "step": 4869 + }, + { + "epoch": 0.7677644968039671, + "grad_norm": 0.8984375, + "learning_rate": 5.624933993043742e-05, + "loss": 0.8906, + "step": 4870 + }, + { + "epoch": 0.7679221486513601, + "grad_norm": 0.96484375, + "learning_rate": 5.624520022319222e-05, + "loss": 0.9426, + "step": 4871 + }, + { + "epoch": 0.7680798004987531, + "grad_norm": 0.984375, + "learning_rate": 5.6241060608682826e-05, + "loss": 1.0208, + "step": 4872 + }, + { + "epoch": 0.7682374523461462, + "grad_norm": 0.9765625, + "learning_rate": 5.6236921086918e-05, + "loss": 1.0319, + "step": 4873 + }, + { + "epoch": 0.7683951041935392, + "grad_norm": 0.9375, + "learning_rate": 5.6232781657906505e-05, + "loss": 1.0559, + "step": 4874 + }, + { + "epoch": 0.7685527560409322, + "grad_norm": 1.0234375, + "learning_rate": 5.6228642321657144e-05, + "loss": 1.1998, + "step": 4875 + }, + { + "epoch": 0.7687104078883251, + "grad_norm": 0.91796875, + "learning_rate": 5.622450307817864e-05, + "loss": 0.9915, + "step": 4876 + }, + { + "epoch": 0.7688680597357181, + "grad_norm": 0.91796875, + "learning_rate": 5.622036392747983e-05, + "loss": 0.9104, + "step": 4877 + }, + { + "epoch": 0.7690257115831112, + "grad_norm": 0.890625, + "learning_rate": 5.6216224869569436e-05, + "loss": 0.8299, + "step": 4878 + }, + { + "epoch": 0.7691833634305042, + "grad_norm": 0.984375, + "learning_rate": 5.621208590445628e-05, + "loss": 1.1187, + "step": 4879 + }, + { + "epoch": 0.7693410152778972, + "grad_norm": 0.8984375, + "learning_rate": 5.620794703214909e-05, + "loss": 0.8825, + "step": 4880 + }, + { + "epoch": 0.7694986671252902, + "grad_norm": 1.046875, + "learning_rate": 5.620380825265661e-05, + "loss": 1.1337, + "step": 4881 + }, + { + "epoch": 0.7696563189726833, + "grad_norm": 0.96484375, + "learning_rate": 5.61996695659877e-05, + "loss": 1.1088, + "step": 4882 + }, + { + "epoch": 0.7698139708200763, + "grad_norm": 0.81640625, + "learning_rate": 5.619553097215109e-05, + "loss": 0.8583, + "step": 4883 + }, + { + "epoch": 0.7699716226674692, + "grad_norm": 1.1484375, + "learning_rate": 5.6191392471155555e-05, + "loss": 1.2999, + "step": 4884 + }, + { + "epoch": 0.7701292745148622, + "grad_norm": 0.92578125, + "learning_rate": 5.618725406300985e-05, + "loss": 1.0181, + "step": 4885 + }, + { + "epoch": 0.7702869263622553, + "grad_norm": 0.94921875, + "learning_rate": 5.618311574772273e-05, + "loss": 1.0428, + "step": 4886 + }, + { + "epoch": 0.7704445782096483, + "grad_norm": 0.92578125, + "learning_rate": 5.617897752530301e-05, + "loss": 0.9231, + "step": 4887 + }, + { + "epoch": 0.7706022300570413, + "grad_norm": 0.921875, + "learning_rate": 5.617483939575945e-05, + "loss": 0.8501, + "step": 4888 + }, + { + "epoch": 0.7707598819044343, + "grad_norm": 0.95703125, + "learning_rate": 5.617070135910082e-05, + "loss": 1.1794, + "step": 4889 + }, + { + "epoch": 0.7709175337518274, + "grad_norm": 1.0234375, + "learning_rate": 5.616656341533588e-05, + "loss": 1.1176, + "step": 4890 + }, + { + "epoch": 0.7710751855992204, + "grad_norm": 1.0078125, + "learning_rate": 5.616242556447335e-05, + "loss": 1.0158, + "step": 4891 + }, + { + "epoch": 0.7712328374466133, + "grad_norm": 0.91796875, + "learning_rate": 5.615828780652211e-05, + "loss": 0.8889, + "step": 4892 + }, + { + "epoch": 0.7713904892940063, + "grad_norm": 0.96875, + "learning_rate": 5.615415014149087e-05, + "loss": 1.1174, + "step": 4893 + }, + { + "epoch": 0.7715481411413994, + "grad_norm": 1.0078125, + "learning_rate": 5.61500125693884e-05, + "loss": 1.179, + "step": 4894 + }, + { + "epoch": 0.7717057929887924, + "grad_norm": 0.9140625, + "learning_rate": 5.6145875090223475e-05, + "loss": 0.7847, + "step": 4895 + }, + { + "epoch": 0.7718634448361854, + "grad_norm": 1.0, + "learning_rate": 5.614173770400481e-05, + "loss": 0.8788, + "step": 4896 + }, + { + "epoch": 0.7720210966835784, + "grad_norm": 0.90625, + "learning_rate": 5.6137600410741275e-05, + "loss": 1.0177, + "step": 4897 + }, + { + "epoch": 0.7721787485309715, + "grad_norm": 1.0078125, + "learning_rate": 5.6133463210441595e-05, + "loss": 1.0472, + "step": 4898 + }, + { + "epoch": 0.7723364003783645, + "grad_norm": 1.0078125, + "learning_rate": 5.612932610311452e-05, + "loss": 1.0524, + "step": 4899 + }, + { + "epoch": 0.7724940522257574, + "grad_norm": 0.953125, + "learning_rate": 5.612518908876885e-05, + "loss": 1.021, + "step": 4900 + }, + { + "epoch": 0.7726517040731504, + "grad_norm": 0.93359375, + "learning_rate": 5.612105216741328e-05, + "loss": 0.968, + "step": 4901 + }, + { + "epoch": 0.7728093559205435, + "grad_norm": 0.9609375, + "learning_rate": 5.611691533905666e-05, + "loss": 0.7917, + "step": 4902 + }, + { + "epoch": 0.7729670077679365, + "grad_norm": 0.84375, + "learning_rate": 5.6112778603707746e-05, + "loss": 0.8711, + "step": 4903 + }, + { + "epoch": 0.7731246596153295, + "grad_norm": 0.9375, + "learning_rate": 5.6108641961375286e-05, + "loss": 1.0208, + "step": 4904 + }, + { + "epoch": 0.7732823114627225, + "grad_norm": 0.984375, + "learning_rate": 5.6104505412068056e-05, + "loss": 0.951, + "step": 4905 + }, + { + "epoch": 0.7734399633101156, + "grad_norm": 1.0703125, + "learning_rate": 5.610036895579477e-05, + "loss": 1.1127, + "step": 4906 + }, + { + "epoch": 0.7735976151575086, + "grad_norm": 0.9375, + "learning_rate": 5.609623259256428e-05, + "loss": 0.9038, + "step": 4907 + }, + { + "epoch": 0.7737552670049015, + "grad_norm": 0.89453125, + "learning_rate": 5.609209632238532e-05, + "loss": 0.8875, + "step": 4908 + }, + { + "epoch": 0.7739129188522945, + "grad_norm": 0.94921875, + "learning_rate": 5.608796014526665e-05, + "loss": 1.1249, + "step": 4909 + }, + { + "epoch": 0.7740705706996875, + "grad_norm": 0.9375, + "learning_rate": 5.6083824061217035e-05, + "loss": 1.2101, + "step": 4910 + }, + { + "epoch": 0.7742282225470806, + "grad_norm": 0.84765625, + "learning_rate": 5.607968807024526e-05, + "loss": 1.0282, + "step": 4911 + }, + { + "epoch": 0.7743858743944736, + "grad_norm": 0.97265625, + "learning_rate": 5.607555217236007e-05, + "loss": 1.083, + "step": 4912 + }, + { + "epoch": 0.7745435262418666, + "grad_norm": 1.0234375, + "learning_rate": 5.607141636757018e-05, + "loss": 1.1474, + "step": 4913 + }, + { + "epoch": 0.7747011780892596, + "grad_norm": 0.81640625, + "learning_rate": 5.606728065588447e-05, + "loss": 0.9038, + "step": 4914 + }, + { + "epoch": 0.7748588299366527, + "grad_norm": 1.078125, + "learning_rate": 5.606314503731165e-05, + "loss": 1.0259, + "step": 4915 + }, + { + "epoch": 0.7750164817840456, + "grad_norm": 1.328125, + "learning_rate": 5.605900951186047e-05, + "loss": 0.8264, + "step": 4916 + }, + { + "epoch": 0.7751741336314386, + "grad_norm": 0.98046875, + "learning_rate": 5.605487407953972e-05, + "loss": 0.9976, + "step": 4917 + }, + { + "epoch": 0.7753317854788316, + "grad_norm": 0.94140625, + "learning_rate": 5.6050738740358146e-05, + "loss": 1.0209, + "step": 4918 + }, + { + "epoch": 0.7754894373262247, + "grad_norm": 0.94921875, + "learning_rate": 5.604660349432451e-05, + "loss": 0.8686, + "step": 4919 + }, + { + "epoch": 0.7756470891736177, + "grad_norm": 0.87109375, + "learning_rate": 5.60424683414476e-05, + "loss": 0.9371, + "step": 4920 + }, + { + "epoch": 0.7758047410210107, + "grad_norm": 0.88671875, + "learning_rate": 5.603833328173617e-05, + "loss": 0.9635, + "step": 4921 + }, + { + "epoch": 0.7759623928684037, + "grad_norm": 0.9375, + "learning_rate": 5.6034198315198924e-05, + "loss": 1.0, + "step": 4922 + }, + { + "epoch": 0.7761200447157968, + "grad_norm": 0.984375, + "learning_rate": 5.6030063441844725e-05, + "loss": 0.9096, + "step": 4923 + }, + { + "epoch": 0.7762776965631897, + "grad_norm": 0.98046875, + "learning_rate": 5.60259286616823e-05, + "loss": 1.0806, + "step": 4924 + }, + { + "epoch": 0.7764353484105827, + "grad_norm": 1.0390625, + "learning_rate": 5.6021793974720407e-05, + "loss": 1.194, + "step": 4925 + }, + { + "epoch": 0.7765930002579757, + "grad_norm": 1.0390625, + "learning_rate": 5.6017659380967805e-05, + "loss": 0.9434, + "step": 4926 + }, + { + "epoch": 0.7767506521053688, + "grad_norm": 0.96484375, + "learning_rate": 5.601352488043321e-05, + "loss": 0.8821, + "step": 4927 + }, + { + "epoch": 0.7769083039527618, + "grad_norm": 1.1328125, + "learning_rate": 5.600939047312548e-05, + "loss": 1.1288, + "step": 4928 + }, + { + "epoch": 0.7770659558001548, + "grad_norm": 0.97265625, + "learning_rate": 5.600525615905334e-05, + "loss": 1.0655, + "step": 4929 + }, + { + "epoch": 0.7772236076475478, + "grad_norm": 1.0078125, + "learning_rate": 5.600112193822554e-05, + "loss": 1.1527, + "step": 4930 + }, + { + "epoch": 0.7773812594949409, + "grad_norm": 0.9453125, + "learning_rate": 5.599698781065086e-05, + "loss": 0.9636, + "step": 4931 + }, + { + "epoch": 0.7775389113423338, + "grad_norm": 0.84375, + "learning_rate": 5.599285377633798e-05, + "loss": 1.0691, + "step": 4932 + }, + { + "epoch": 0.7776965631897268, + "grad_norm": 0.921875, + "learning_rate": 5.598871983529579e-05, + "loss": 0.9235, + "step": 4933 + }, + { + "epoch": 0.7778542150371198, + "grad_norm": 0.953125, + "learning_rate": 5.5984585987533e-05, + "loss": 1.0348, + "step": 4934 + }, + { + "epoch": 0.7780118668845128, + "grad_norm": 0.796875, + "learning_rate": 5.598045223305836e-05, + "loss": 0.777, + "step": 4935 + }, + { + "epoch": 0.7781695187319059, + "grad_norm": 1.0078125, + "learning_rate": 5.597631857188063e-05, + "loss": 1.1555, + "step": 4936 + }, + { + "epoch": 0.7783271705792989, + "grad_norm": 0.9609375, + "learning_rate": 5.597218500400854e-05, + "loss": 0.9298, + "step": 4937 + }, + { + "epoch": 0.7784848224266919, + "grad_norm": 0.9296875, + "learning_rate": 5.5968051529450916e-05, + "loss": 1.0523, + "step": 4938 + }, + { + "epoch": 0.778642474274085, + "grad_norm": 0.93359375, + "learning_rate": 5.59639181482165e-05, + "loss": 0.9194, + "step": 4939 + }, + { + "epoch": 0.7788001261214779, + "grad_norm": 1.046875, + "learning_rate": 5.595978486031403e-05, + "loss": 1.1318, + "step": 4940 + }, + { + "epoch": 0.7789577779688709, + "grad_norm": 0.80078125, + "learning_rate": 5.5955651665752294e-05, + "loss": 0.8782, + "step": 4941 + }, + { + "epoch": 0.7791154298162639, + "grad_norm": 0.828125, + "learning_rate": 5.5951518564539996e-05, + "loss": 0.789, + "step": 4942 + }, + { + "epoch": 0.7792730816636569, + "grad_norm": 1.1171875, + "learning_rate": 5.594738555668597e-05, + "loss": 1.1789, + "step": 4943 + }, + { + "epoch": 0.77943073351105, + "grad_norm": 0.89453125, + "learning_rate": 5.594325264219895e-05, + "loss": 0.8158, + "step": 4944 + }, + { + "epoch": 0.779588385358443, + "grad_norm": 1.0234375, + "learning_rate": 5.5939119821087684e-05, + "loss": 1.2868, + "step": 4945 + }, + { + "epoch": 0.779746037205836, + "grad_norm": 0.8828125, + "learning_rate": 5.593498709336094e-05, + "loss": 0.9817, + "step": 4946 + }, + { + "epoch": 0.779903689053229, + "grad_norm": 0.91796875, + "learning_rate": 5.5930854459027426e-05, + "loss": 0.9308, + "step": 4947 + }, + { + "epoch": 0.780061340900622, + "grad_norm": 0.90234375, + "learning_rate": 5.592672191809598e-05, + "loss": 1.0024, + "step": 4948 + }, + { + "epoch": 0.780218992748015, + "grad_norm": 0.9140625, + "learning_rate": 5.5922589470575336e-05, + "loss": 0.9408, + "step": 4949 + }, + { + "epoch": 0.780376644595408, + "grad_norm": 0.81640625, + "learning_rate": 5.591845711647425e-05, + "loss": 0.7219, + "step": 4950 + }, + { + "epoch": 0.780534296442801, + "grad_norm": 0.94140625, + "learning_rate": 5.591432485580147e-05, + "loss": 1.0164, + "step": 4951 + }, + { + "epoch": 0.7806919482901941, + "grad_norm": 0.88671875, + "learning_rate": 5.5910192688565724e-05, + "loss": 1.0595, + "step": 4952 + }, + { + "epoch": 0.7808496001375871, + "grad_norm": 0.96484375, + "learning_rate": 5.5906060614775836e-05, + "loss": 1.1242, + "step": 4953 + }, + { + "epoch": 0.7810072519849801, + "grad_norm": 0.83984375, + "learning_rate": 5.590192863444052e-05, + "loss": 0.9602, + "step": 4954 + }, + { + "epoch": 0.7811649038323731, + "grad_norm": 0.890625, + "learning_rate": 5.5897796747568566e-05, + "loss": 0.8565, + "step": 4955 + }, + { + "epoch": 0.781322555679766, + "grad_norm": 0.9296875, + "learning_rate": 5.5893664954168715e-05, + "loss": 0.9337, + "step": 4956 + }, + { + "epoch": 0.7814802075271591, + "grad_norm": 0.8203125, + "learning_rate": 5.58895332542497e-05, + "loss": 0.8481, + "step": 4957 + }, + { + "epoch": 0.7816378593745521, + "grad_norm": 0.9375, + "learning_rate": 5.588540164782031e-05, + "loss": 0.8779, + "step": 4958 + }, + { + "epoch": 0.7817955112219451, + "grad_norm": 1.1015625, + "learning_rate": 5.5881270134889286e-05, + "loss": 1.2009, + "step": 4959 + }, + { + "epoch": 0.7819531630693382, + "grad_norm": 0.8984375, + "learning_rate": 5.587713871546539e-05, + "loss": 0.849, + "step": 4960 + }, + { + "epoch": 0.7821108149167312, + "grad_norm": 1.0234375, + "learning_rate": 5.587300738955734e-05, + "loss": 1.2691, + "step": 4961 + }, + { + "epoch": 0.7822684667641242, + "grad_norm": 0.94140625, + "learning_rate": 5.586887615717397e-05, + "loss": 0.9872, + "step": 4962 + }, + { + "epoch": 0.7824261186115172, + "grad_norm": 0.86328125, + "learning_rate": 5.5864745018323996e-05, + "loss": 0.9269, + "step": 4963 + }, + { + "epoch": 0.7825837704589101, + "grad_norm": 1.0546875, + "learning_rate": 5.586061397301616e-05, + "loss": 1.119, + "step": 4964 + }, + { + "epoch": 0.7827414223063032, + "grad_norm": 0.92578125, + "learning_rate": 5.585648302125923e-05, + "loss": 0.9326, + "step": 4965 + }, + { + "epoch": 0.7828990741536962, + "grad_norm": 0.9375, + "learning_rate": 5.585235216306197e-05, + "loss": 1.0823, + "step": 4966 + }, + { + "epoch": 0.7830567260010892, + "grad_norm": 0.9765625, + "learning_rate": 5.5848221398433086e-05, + "loss": 0.8879, + "step": 4967 + }, + { + "epoch": 0.7832143778484822, + "grad_norm": 2.09375, + "learning_rate": 5.5844090727381414e-05, + "loss": 1.2079, + "step": 4968 + }, + { + "epoch": 0.7833720296958753, + "grad_norm": 0.9765625, + "learning_rate": 5.583996014991566e-05, + "loss": 0.7936, + "step": 4969 + }, + { + "epoch": 0.7835296815432683, + "grad_norm": 0.90625, + "learning_rate": 5.583582966604459e-05, + "loss": 1.0289, + "step": 4970 + }, + { + "epoch": 0.7836873333906613, + "grad_norm": 1.15625, + "learning_rate": 5.583169927577696e-05, + "loss": 0.9569, + "step": 4971 + }, + { + "epoch": 0.7838449852380542, + "grad_norm": 0.8359375, + "learning_rate": 5.582756897912147e-05, + "loss": 0.9837, + "step": 4972 + }, + { + "epoch": 0.7840026370854473, + "grad_norm": 0.8203125, + "learning_rate": 5.582343877608697e-05, + "loss": 0.9335, + "step": 4973 + }, + { + "epoch": 0.7841602889328403, + "grad_norm": 0.8984375, + "learning_rate": 5.581930866668217e-05, + "loss": 0.9462, + "step": 4974 + }, + { + "epoch": 0.7843179407802333, + "grad_norm": 0.890625, + "learning_rate": 5.581517865091581e-05, + "loss": 0.851, + "step": 4975 + }, + { + "epoch": 0.7844755926276263, + "grad_norm": 0.97265625, + "learning_rate": 5.581104872879666e-05, + "loss": 1.0618, + "step": 4976 + }, + { + "epoch": 0.7846332444750194, + "grad_norm": 0.890625, + "learning_rate": 5.5806918900333416e-05, + "loss": 0.8612, + "step": 4977 + }, + { + "epoch": 0.7847908963224124, + "grad_norm": 0.8984375, + "learning_rate": 5.580278916553493e-05, + "loss": 0.7828, + "step": 4978 + }, + { + "epoch": 0.7849485481698054, + "grad_norm": 0.94140625, + "learning_rate": 5.57986595244099e-05, + "loss": 0.9476, + "step": 4979 + }, + { + "epoch": 0.7851062000171983, + "grad_norm": 0.89453125, + "learning_rate": 5.5794529976967105e-05, + "loss": 1.0268, + "step": 4980 + }, + { + "epoch": 0.7852638518645914, + "grad_norm": 0.9140625, + "learning_rate": 5.579040052321525e-05, + "loss": 1.0485, + "step": 4981 + }, + { + "epoch": 0.7854215037119844, + "grad_norm": 1.0078125, + "learning_rate": 5.578627116316308e-05, + "loss": 0.9143, + "step": 4982 + }, + { + "epoch": 0.7855791555593774, + "grad_norm": 0.9296875, + "learning_rate": 5.578214189681943e-05, + "loss": 1.0022, + "step": 4983 + }, + { + "epoch": 0.7857368074067704, + "grad_norm": 0.984375, + "learning_rate": 5.577801272419301e-05, + "loss": 0.9785, + "step": 4984 + }, + { + "epoch": 0.7858944592541635, + "grad_norm": 0.98046875, + "learning_rate": 5.577388364529255e-05, + "loss": 0.952, + "step": 4985 + }, + { + "epoch": 0.7860521111015565, + "grad_norm": 0.85546875, + "learning_rate": 5.5769754660126816e-05, + "loss": 0.9624, + "step": 4986 + }, + { + "epoch": 0.7862097629489495, + "grad_norm": 0.90234375, + "learning_rate": 5.576562576870452e-05, + "loss": 0.9868, + "step": 4987 + }, + { + "epoch": 0.7863674147963424, + "grad_norm": 1.0, + "learning_rate": 5.57614969710345e-05, + "loss": 1.182, + "step": 4988 + }, + { + "epoch": 0.7865250666437354, + "grad_norm": 0.9921875, + "learning_rate": 5.5757368267125455e-05, + "loss": 1.1271, + "step": 4989 + }, + { + "epoch": 0.7866827184911285, + "grad_norm": 1.0546875, + "learning_rate": 5.575323965698614e-05, + "loss": 1.0158, + "step": 4990 + }, + { + "epoch": 0.7868403703385215, + "grad_norm": 1.015625, + "learning_rate": 5.574911114062531e-05, + "loss": 0.8596, + "step": 4991 + }, + { + "epoch": 0.7869980221859145, + "grad_norm": 0.92578125, + "learning_rate": 5.574498271805166e-05, + "loss": 0.9557, + "step": 4992 + }, + { + "epoch": 0.7871556740333076, + "grad_norm": 0.9296875, + "learning_rate": 5.5740854389274036e-05, + "loss": 0.8968, + "step": 4993 + }, + { + "epoch": 0.7873133258807006, + "grad_norm": 0.8984375, + "learning_rate": 5.573672615430112e-05, + "loss": 0.9367, + "step": 4994 + }, + { + "epoch": 0.7874709777280936, + "grad_norm": 0.91015625, + "learning_rate": 5.57325980131417e-05, + "loss": 0.9134, + "step": 4995 + }, + { + "epoch": 0.7876286295754865, + "grad_norm": 0.91796875, + "learning_rate": 5.572846996580452e-05, + "loss": 0.9269, + "step": 4996 + }, + { + "epoch": 0.7877862814228795, + "grad_norm": 0.91796875, + "learning_rate": 5.57243420122983e-05, + "loss": 0.9997, + "step": 4997 + }, + { + "epoch": 0.7879439332702726, + "grad_norm": 0.83984375, + "learning_rate": 5.572021415263182e-05, + "loss": 0.792, + "step": 4998 + }, + { + "epoch": 0.7881015851176656, + "grad_norm": 0.91015625, + "learning_rate": 5.571608638681376e-05, + "loss": 1.0963, + "step": 4999 + }, + { + "epoch": 0.7882592369650586, + "grad_norm": 1.0234375, + "learning_rate": 5.571195871485297e-05, + "loss": 0.9355, + "step": 5000 + }, + { + "epoch": 0.7882592369650586, + "eval_loss": 0.973717987537384, + "eval_runtime": 307.9479, + "eval_samples_per_second": 32.473, + "eval_steps_per_second": 0.679, + "step": 5000 + }, + { + "epoch": 0.7884168888124516, + "grad_norm": 1.078125, + "learning_rate": 5.570783113675816e-05, + "loss": 1.369, + "step": 5001 + }, + { + "epoch": 0.7885745406598447, + "grad_norm": 0.94921875, + "learning_rate": 5.570370365253806e-05, + "loss": 1.1005, + "step": 5002 + }, + { + "epoch": 0.7887321925072377, + "grad_norm": 1.0234375, + "learning_rate": 5.569957626220143e-05, + "loss": 1.1115, + "step": 5003 + }, + { + "epoch": 0.7888898443546306, + "grad_norm": 1.3046875, + "learning_rate": 5.569544896575702e-05, + "loss": 1.0446, + "step": 5004 + }, + { + "epoch": 0.7890474962020236, + "grad_norm": 0.89453125, + "learning_rate": 5.5691321763213566e-05, + "loss": 1.0526, + "step": 5005 + }, + { + "epoch": 0.7892051480494167, + "grad_norm": 0.93359375, + "learning_rate": 5.568719465457983e-05, + "loss": 1.1241, + "step": 5006 + }, + { + "epoch": 0.7893627998968097, + "grad_norm": 1.046875, + "learning_rate": 5.5683067639864516e-05, + "loss": 1.2479, + "step": 5007 + }, + { + "epoch": 0.7895204517442027, + "grad_norm": 1.2890625, + "learning_rate": 5.5678940719076446e-05, + "loss": 1.0674, + "step": 5008 + }, + { + "epoch": 0.7896781035915957, + "grad_norm": 1.03125, + "learning_rate": 5.567481389222432e-05, + "loss": 0.9963, + "step": 5009 + }, + { + "epoch": 0.7898357554389888, + "grad_norm": 1.171875, + "learning_rate": 5.5670687159316896e-05, + "loss": 1.0227, + "step": 5010 + }, + { + "epoch": 0.7899934072863818, + "grad_norm": 0.8984375, + "learning_rate": 5.566656052036292e-05, + "loss": 0.9234, + "step": 5011 + }, + { + "epoch": 0.7901510591337748, + "grad_norm": 0.99609375, + "learning_rate": 5.56624339753711e-05, + "loss": 0.9602, + "step": 5012 + }, + { + "epoch": 0.7903087109811677, + "grad_norm": 0.921875, + "learning_rate": 5.565830752435024e-05, + "loss": 0.8803, + "step": 5013 + }, + { + "epoch": 0.7904663628285608, + "grad_norm": 0.95703125, + "learning_rate": 5.565418116730907e-05, + "loss": 1.0225, + "step": 5014 + }, + { + "epoch": 0.7906240146759538, + "grad_norm": 0.890625, + "learning_rate": 5.565005490425633e-05, + "loss": 0.9334, + "step": 5015 + }, + { + "epoch": 0.7907816665233468, + "grad_norm": 1.0234375, + "learning_rate": 5.564592873520076e-05, + "loss": 1.0181, + "step": 5016 + }, + { + "epoch": 0.7909393183707398, + "grad_norm": 1.046875, + "learning_rate": 5.564180266015108e-05, + "loss": 1.0257, + "step": 5017 + }, + { + "epoch": 0.7910969702181329, + "grad_norm": 1.0546875, + "learning_rate": 5.5637676679116104e-05, + "loss": 0.9985, + "step": 5018 + }, + { + "epoch": 0.7912546220655259, + "grad_norm": 0.88671875, + "learning_rate": 5.5633550792104526e-05, + "loss": 0.8488, + "step": 5019 + }, + { + "epoch": 0.7914122739129189, + "grad_norm": 1.0859375, + "learning_rate": 5.562942499912509e-05, + "loss": 1.0944, + "step": 5020 + }, + { + "epoch": 0.7915699257603118, + "grad_norm": 1.0703125, + "learning_rate": 5.5625299300186574e-05, + "loss": 0.9547, + "step": 5021 + }, + { + "epoch": 0.7917275776077048, + "grad_norm": 1.2265625, + "learning_rate": 5.562117369529765e-05, + "loss": 1.0213, + "step": 5022 + }, + { + "epoch": 0.7918852294550979, + "grad_norm": 1.03125, + "learning_rate": 5.561704818446716e-05, + "loss": 1.1166, + "step": 5023 + }, + { + "epoch": 0.7920428813024909, + "grad_norm": 0.98828125, + "learning_rate": 5.561292276770379e-05, + "loss": 1.2056, + "step": 5024 + }, + { + "epoch": 0.7922005331498839, + "grad_norm": 0.890625, + "learning_rate": 5.5608797445016305e-05, + "loss": 0.9612, + "step": 5025 + }, + { + "epoch": 0.792358184997277, + "grad_norm": 0.9609375, + "learning_rate": 5.560467221641341e-05, + "loss": 0.9725, + "step": 5026 + }, + { + "epoch": 0.79251583684467, + "grad_norm": 0.9375, + "learning_rate": 5.5600547081903864e-05, + "loss": 0.8572, + "step": 5027 + }, + { + "epoch": 0.792673488692063, + "grad_norm": 0.91796875, + "learning_rate": 5.5596422041496444e-05, + "loss": 0.9926, + "step": 5028 + }, + { + "epoch": 0.7928311405394559, + "grad_norm": 0.9453125, + "learning_rate": 5.559229709519989e-05, + "loss": 1.094, + "step": 5029 + }, + { + "epoch": 0.7929887923868489, + "grad_norm": 0.93359375, + "learning_rate": 5.5588172243022895e-05, + "loss": 0.9249, + "step": 5030 + }, + { + "epoch": 0.793146444234242, + "grad_norm": 0.90625, + "learning_rate": 5.558404748497425e-05, + "loss": 1.0177, + "step": 5031 + }, + { + "epoch": 0.793304096081635, + "grad_norm": 0.92578125, + "learning_rate": 5.557992282106263e-05, + "loss": 0.9821, + "step": 5032 + }, + { + "epoch": 0.793461747929028, + "grad_norm": 1.359375, + "learning_rate": 5.557579825129686e-05, + "loss": 1.2193, + "step": 5033 + }, + { + "epoch": 0.793619399776421, + "grad_norm": 1.015625, + "learning_rate": 5.557167377568566e-05, + "loss": 1.1078, + "step": 5034 + }, + { + "epoch": 0.7937770516238141, + "grad_norm": 0.87890625, + "learning_rate": 5.556754939423775e-05, + "loss": 0.8695, + "step": 5035 + }, + { + "epoch": 0.7939347034712071, + "grad_norm": 1.0078125, + "learning_rate": 5.556342510696188e-05, + "loss": 0.981, + "step": 5036 + }, + { + "epoch": 0.7940923553186, + "grad_norm": 0.8359375, + "learning_rate": 5.555930091386675e-05, + "loss": 0.7826, + "step": 5037 + }, + { + "epoch": 0.794250007165993, + "grad_norm": 1.0078125, + "learning_rate": 5.5555176814961184e-05, + "loss": 1.0468, + "step": 5038 + }, + { + "epoch": 0.7944076590133861, + "grad_norm": 0.8125, + "learning_rate": 5.555105281025388e-05, + "loss": 0.8758, + "step": 5039 + }, + { + "epoch": 0.7945653108607791, + "grad_norm": 0.96484375, + "learning_rate": 5.554692889975357e-05, + "loss": 1.1049, + "step": 5040 + }, + { + "epoch": 0.7947229627081721, + "grad_norm": 0.95703125, + "learning_rate": 5.554280508346902e-05, + "loss": 1.0157, + "step": 5041 + }, + { + "epoch": 0.7948806145555651, + "grad_norm": 0.76953125, + "learning_rate": 5.553868136140894e-05, + "loss": 0.8076, + "step": 5042 + }, + { + "epoch": 0.7950382664029582, + "grad_norm": 2.671875, + "learning_rate": 5.553455773358209e-05, + "loss": 1.0057, + "step": 5043 + }, + { + "epoch": 0.7951959182503512, + "grad_norm": 0.94921875, + "learning_rate": 5.5530434199997204e-05, + "loss": 1.045, + "step": 5044 + }, + { + "epoch": 0.7953535700977441, + "grad_norm": 0.94140625, + "learning_rate": 5.5526310760663015e-05, + "loss": 0.7979, + "step": 5045 + }, + { + "epoch": 0.7955112219451371, + "grad_norm": 1.078125, + "learning_rate": 5.552218741558828e-05, + "loss": 0.9919, + "step": 5046 + }, + { + "epoch": 0.7956688737925302, + "grad_norm": 1.0390625, + "learning_rate": 5.551806416478167e-05, + "loss": 0.9217, + "step": 5047 + }, + { + "epoch": 0.7958265256399232, + "grad_norm": 0.9453125, + "learning_rate": 5.5513941008252035e-05, + "loss": 0.9083, + "step": 5048 + }, + { + "epoch": 0.7959841774873162, + "grad_norm": 0.99609375, + "learning_rate": 5.5509817946008044e-05, + "loss": 0.9528, + "step": 5049 + }, + { + "epoch": 0.7961418293347092, + "grad_norm": 1.125, + "learning_rate": 5.5505694978058464e-05, + "loss": 1.0934, + "step": 5050 + }, + { + "epoch": 0.7962994811821023, + "grad_norm": 0.98828125, + "learning_rate": 5.5501572104412024e-05, + "loss": 0.9185, + "step": 5051 + }, + { + "epoch": 0.7964571330294953, + "grad_norm": 0.8984375, + "learning_rate": 5.549744932507741e-05, + "loss": 0.8803, + "step": 5052 + }, + { + "epoch": 0.7966147848768882, + "grad_norm": 0.8984375, + "learning_rate": 5.549332664006346e-05, + "loss": 0.979, + "step": 5053 + }, + { + "epoch": 0.7967724367242812, + "grad_norm": 0.95703125, + "learning_rate": 5.548920404937885e-05, + "loss": 1.0268, + "step": 5054 + }, + { + "epoch": 0.7969300885716742, + "grad_norm": 0.859375, + "learning_rate": 5.5485081553032314e-05, + "loss": 0.9669, + "step": 5055 + }, + { + "epoch": 0.7970877404190673, + "grad_norm": 0.98046875, + "learning_rate": 5.5480959151032616e-05, + "loss": 1.1204, + "step": 5056 + }, + { + "epoch": 0.7972453922664603, + "grad_norm": 1.0625, + "learning_rate": 5.547683684338849e-05, + "loss": 1.1368, + "step": 5057 + }, + { + "epoch": 0.7974030441138533, + "grad_norm": 0.9921875, + "learning_rate": 5.5472714630108616e-05, + "loss": 1.1169, + "step": 5058 + }, + { + "epoch": 0.7975606959612463, + "grad_norm": 0.8984375, + "learning_rate": 5.546859251120182e-05, + "loss": 1.0148, + "step": 5059 + }, + { + "epoch": 0.7977183478086394, + "grad_norm": 1.0390625, + "learning_rate": 5.54644704866768e-05, + "loss": 0.8953, + "step": 5060 + }, + { + "epoch": 0.7978759996560323, + "grad_norm": 1.046875, + "learning_rate": 5.546034855654227e-05, + "loss": 1.1756, + "step": 5061 + }, + { + "epoch": 0.7980336515034253, + "grad_norm": 0.859375, + "learning_rate": 5.545622672080702e-05, + "loss": 1.1079, + "step": 5062 + }, + { + "epoch": 0.7981913033508183, + "grad_norm": 0.91015625, + "learning_rate": 5.545210497947968e-05, + "loss": 0.9998, + "step": 5063 + }, + { + "epoch": 0.7983489551982114, + "grad_norm": 0.94921875, + "learning_rate": 5.544798333256911e-05, + "loss": 1.0053, + "step": 5064 + }, + { + "epoch": 0.7985066070456044, + "grad_norm": 0.88671875, + "learning_rate": 5.5443861780083984e-05, + "loss": 0.9897, + "step": 5065 + }, + { + "epoch": 0.7986642588929974, + "grad_norm": 1.0, + "learning_rate": 5.543974032203305e-05, + "loss": 0.9277, + "step": 5066 + }, + { + "epoch": 0.7988219107403904, + "grad_norm": 0.97265625, + "learning_rate": 5.543561895842505e-05, + "loss": 1.081, + "step": 5067 + }, + { + "epoch": 0.7989795625877835, + "grad_norm": 0.9453125, + "learning_rate": 5.543149768926866e-05, + "loss": 1.061, + "step": 5068 + }, + { + "epoch": 0.7991372144351764, + "grad_norm": 0.98828125, + "learning_rate": 5.5427376514572703e-05, + "loss": 0.9441, + "step": 5069 + }, + { + "epoch": 0.7992948662825694, + "grad_norm": 0.91015625, + "learning_rate": 5.5423255434345877e-05, + "loss": 1.0224, + "step": 5070 + }, + { + "epoch": 0.7994525181299624, + "grad_norm": 1.0703125, + "learning_rate": 5.541913444859692e-05, + "loss": 1.0454, + "step": 5071 + }, + { + "epoch": 0.7996101699773555, + "grad_norm": 0.96484375, + "learning_rate": 5.541501355733454e-05, + "loss": 0.9178, + "step": 5072 + }, + { + "epoch": 0.7997678218247485, + "grad_norm": 0.890625, + "learning_rate": 5.5410892760567465e-05, + "loss": 0.7967, + "step": 5073 + }, + { + "epoch": 0.7999254736721415, + "grad_norm": 0.875, + "learning_rate": 5.540677205830449e-05, + "loss": 0.8773, + "step": 5074 + }, + { + "epoch": 0.8000831255195345, + "grad_norm": 1.0625, + "learning_rate": 5.5402651450554324e-05, + "loss": 1.0382, + "step": 5075 + }, + { + "epoch": 0.8002407773669276, + "grad_norm": 1.1015625, + "learning_rate": 5.5398530937325674e-05, + "loss": 1.1118, + "step": 5076 + }, + { + "epoch": 0.8003984292143205, + "grad_norm": 1.1953125, + "learning_rate": 5.539441051862728e-05, + "loss": 0.724, + "step": 5077 + }, + { + "epoch": 0.8005560810617135, + "grad_norm": 0.8671875, + "learning_rate": 5.5390290194467865e-05, + "loss": 1.0443, + "step": 5078 + }, + { + "epoch": 0.8007137329091065, + "grad_norm": 0.85546875, + "learning_rate": 5.5386169964856216e-05, + "loss": 0.9296, + "step": 5079 + }, + { + "epoch": 0.8008713847564995, + "grad_norm": 1.0234375, + "learning_rate": 5.538204982980102e-05, + "loss": 1.0788, + "step": 5080 + }, + { + "epoch": 0.8010290366038926, + "grad_norm": 0.9140625, + "learning_rate": 5.537792978931102e-05, + "loss": 0.9391, + "step": 5081 + }, + { + "epoch": 0.8011866884512856, + "grad_norm": 0.875, + "learning_rate": 5.5373809843394954e-05, + "loss": 0.7482, + "step": 5082 + }, + { + "epoch": 0.8013443402986786, + "grad_norm": 1.3046875, + "learning_rate": 5.5369689992061545e-05, + "loss": 1.1189, + "step": 5083 + }, + { + "epoch": 0.8015019921460717, + "grad_norm": 1.046875, + "learning_rate": 5.53655702353195e-05, + "loss": 1.209, + "step": 5084 + }, + { + "epoch": 0.8016596439934646, + "grad_norm": 0.9375, + "learning_rate": 5.5361450573177585e-05, + "loss": 0.9709, + "step": 5085 + }, + { + "epoch": 0.8018172958408576, + "grad_norm": 1.0234375, + "learning_rate": 5.535733100564455e-05, + "loss": 1.0494, + "step": 5086 + }, + { + "epoch": 0.8019749476882506, + "grad_norm": 1.0234375, + "learning_rate": 5.53532115327291e-05, + "loss": 1.1126, + "step": 5087 + }, + { + "epoch": 0.8021325995356436, + "grad_norm": 0.90625, + "learning_rate": 5.534909215443995e-05, + "loss": 1.0134, + "step": 5088 + }, + { + "epoch": 0.8022902513830367, + "grad_norm": 0.984375, + "learning_rate": 5.534497287078586e-05, + "loss": 0.9051, + "step": 5089 + }, + { + "epoch": 0.8024479032304297, + "grad_norm": 0.82421875, + "learning_rate": 5.534085368177555e-05, + "loss": 0.8708, + "step": 5090 + }, + { + "epoch": 0.8026055550778227, + "grad_norm": 0.9453125, + "learning_rate": 5.533673458741775e-05, + "loss": 1.0625, + "step": 5091 + }, + { + "epoch": 0.8027632069252157, + "grad_norm": 0.96875, + "learning_rate": 5.5332615587721183e-05, + "loss": 1.0924, + "step": 5092 + }, + { + "epoch": 0.8029208587726087, + "grad_norm": 0.9453125, + "learning_rate": 5.532849668269453e-05, + "loss": 0.951, + "step": 5093 + }, + { + "epoch": 0.8030785106200017, + "grad_norm": 0.9453125, + "learning_rate": 5.5324377872346634e-05, + "loss": 0.9661, + "step": 5094 + }, + { + "epoch": 0.8032361624673947, + "grad_norm": 0.9609375, + "learning_rate": 5.532025915668616e-05, + "loss": 0.8835, + "step": 5095 + }, + { + "epoch": 0.8033938143147877, + "grad_norm": 0.96875, + "learning_rate": 5.531614053572184e-05, + "loss": 1.0814, + "step": 5096 + }, + { + "epoch": 0.8035514661621808, + "grad_norm": 0.88671875, + "learning_rate": 5.531202200946242e-05, + "loss": 0.8403, + "step": 5097 + }, + { + "epoch": 0.8037091180095738, + "grad_norm": 0.8984375, + "learning_rate": 5.5307903577916554e-05, + "loss": 0.9275, + "step": 5098 + }, + { + "epoch": 0.8038667698569668, + "grad_norm": 0.875, + "learning_rate": 5.530378524109308e-05, + "loss": 0.8785, + "step": 5099 + }, + { + "epoch": 0.8040244217043598, + "grad_norm": 0.96484375, + "learning_rate": 5.529966699900069e-05, + "loss": 0.9336, + "step": 5100 + }, + { + "epoch": 0.8041820735517528, + "grad_norm": 0.96875, + "learning_rate": 5.529554885164808e-05, + "loss": 1.042, + "step": 5101 + }, + { + "epoch": 0.8043397253991458, + "grad_norm": 0.87109375, + "learning_rate": 5.529143079904401e-05, + "loss": 0.9835, + "step": 5102 + }, + { + "epoch": 0.8044973772465388, + "grad_norm": 1.6328125, + "learning_rate": 5.5287312841197156e-05, + "loss": 1.022, + "step": 5103 + }, + { + "epoch": 0.8046550290939318, + "grad_norm": 0.921875, + "learning_rate": 5.528319497811633e-05, + "loss": 0.9245, + "step": 5104 + }, + { + "epoch": 0.8048126809413249, + "grad_norm": 0.98046875, + "learning_rate": 5.527907720981022e-05, + "loss": 1.1255, + "step": 5105 + }, + { + "epoch": 0.8049703327887179, + "grad_norm": 1.0234375, + "learning_rate": 5.527495953628754e-05, + "loss": 1.1628, + "step": 5106 + }, + { + "epoch": 0.8051279846361109, + "grad_norm": 0.91796875, + "learning_rate": 5.527084195755703e-05, + "loss": 1.0737, + "step": 5107 + }, + { + "epoch": 0.8052856364835039, + "grad_norm": 0.96484375, + "learning_rate": 5.526672447362737e-05, + "loss": 1.0728, + "step": 5108 + }, + { + "epoch": 0.8054432883308968, + "grad_norm": 0.83984375, + "learning_rate": 5.526260708450738e-05, + "loss": 0.8846, + "step": 5109 + }, + { + "epoch": 0.8056009401782899, + "grad_norm": 0.890625, + "learning_rate": 5.525848979020573e-05, + "loss": 0.9312, + "step": 5110 + }, + { + "epoch": 0.8057585920256829, + "grad_norm": 0.9375, + "learning_rate": 5.525437259073115e-05, + "loss": 0.8961, + "step": 5111 + }, + { + "epoch": 0.8059162438730759, + "grad_norm": 0.93359375, + "learning_rate": 5.525025548609237e-05, + "loss": 1.0776, + "step": 5112 + }, + { + "epoch": 0.806073895720469, + "grad_norm": 0.8671875, + "learning_rate": 5.5246138476298095e-05, + "loss": 1.0368, + "step": 5113 + }, + { + "epoch": 0.806231547567862, + "grad_norm": 0.9453125, + "learning_rate": 5.5242021561357094e-05, + "loss": 1.089, + "step": 5114 + }, + { + "epoch": 0.806389199415255, + "grad_norm": 0.84375, + "learning_rate": 5.523790474127808e-05, + "loss": 1.052, + "step": 5115 + }, + { + "epoch": 0.806546851262648, + "grad_norm": 0.94921875, + "learning_rate": 5.523378801606975e-05, + "loss": 1.0138, + "step": 5116 + }, + { + "epoch": 0.8067045031100409, + "grad_norm": 0.96875, + "learning_rate": 5.522967138574087e-05, + "loss": 1.1764, + "step": 5117 + }, + { + "epoch": 0.806862154957434, + "grad_norm": 0.98046875, + "learning_rate": 5.5225554850300085e-05, + "loss": 0.9761, + "step": 5118 + }, + { + "epoch": 0.807019806804827, + "grad_norm": 0.9375, + "learning_rate": 5.5221438409756224e-05, + "loss": 0.8085, + "step": 5119 + }, + { + "epoch": 0.80717745865222, + "grad_norm": 0.94921875, + "learning_rate": 5.521732206411797e-05, + "loss": 1.0472, + "step": 5120 + }, + { + "epoch": 0.807335110499613, + "grad_norm": 0.9140625, + "learning_rate": 5.521320581339404e-05, + "loss": 0.8863, + "step": 5121 + }, + { + "epoch": 0.8074927623470061, + "grad_norm": 1.0234375, + "learning_rate": 5.520908965759315e-05, + "loss": 0.9264, + "step": 5122 + }, + { + "epoch": 0.8076504141943991, + "grad_norm": 0.91015625, + "learning_rate": 5.520497359672401e-05, + "loss": 0.9371, + "step": 5123 + }, + { + "epoch": 0.8078080660417921, + "grad_norm": 0.9453125, + "learning_rate": 5.52008576307954e-05, + "loss": 1.1294, + "step": 5124 + }, + { + "epoch": 0.807965717889185, + "grad_norm": 0.90625, + "learning_rate": 5.5196741759816016e-05, + "loss": 0.9846, + "step": 5125 + }, + { + "epoch": 0.8081233697365781, + "grad_norm": 0.9609375, + "learning_rate": 5.519262598379457e-05, + "loss": 0.9696, + "step": 5126 + }, + { + "epoch": 0.8082810215839711, + "grad_norm": 1.09375, + "learning_rate": 5.518851030273979e-05, + "loss": 0.8826, + "step": 5127 + }, + { + "epoch": 0.8084386734313641, + "grad_norm": 0.9765625, + "learning_rate": 5.518439471666042e-05, + "loss": 0.935, + "step": 5128 + }, + { + "epoch": 0.8085963252787571, + "grad_norm": 0.8125, + "learning_rate": 5.5180279225565145e-05, + "loss": 0.7327, + "step": 5129 + }, + { + "epoch": 0.8087539771261502, + "grad_norm": 0.90234375, + "learning_rate": 5.517616382946271e-05, + "loss": 0.8761, + "step": 5130 + }, + { + "epoch": 0.8089116289735432, + "grad_norm": 1.171875, + "learning_rate": 5.517204852836183e-05, + "loss": 1.187, + "step": 5131 + }, + { + "epoch": 0.8090692808209362, + "grad_norm": 0.9609375, + "learning_rate": 5.5167933322271196e-05, + "loss": 0.8544, + "step": 5132 + }, + { + "epoch": 0.8092269326683291, + "grad_norm": 1.0390625, + "learning_rate": 5.51638182111996e-05, + "loss": 0.9201, + "step": 5133 + }, + { + "epoch": 0.8093845845157221, + "grad_norm": 0.984375, + "learning_rate": 5.515970319515574e-05, + "loss": 0.9751, + "step": 5134 + }, + { + "epoch": 0.8095422363631152, + "grad_norm": 0.99609375, + "learning_rate": 5.515558827414833e-05, + "loss": 0.9072, + "step": 5135 + }, + { + "epoch": 0.8096998882105082, + "grad_norm": 0.9609375, + "learning_rate": 5.515147344818607e-05, + "loss": 1.0498, + "step": 5136 + }, + { + "epoch": 0.8098575400579012, + "grad_norm": 0.8828125, + "learning_rate": 5.514735871727771e-05, + "loss": 0.8049, + "step": 5137 + }, + { + "epoch": 0.8100151919052943, + "grad_norm": 1.109375, + "learning_rate": 5.514324408143191e-05, + "loss": 0.8281, + "step": 5138 + }, + { + "epoch": 0.8101728437526873, + "grad_norm": 0.984375, + "learning_rate": 5.5139129540657486e-05, + "loss": 0.903, + "step": 5139 + }, + { + "epoch": 0.8103304956000803, + "grad_norm": 0.98046875, + "learning_rate": 5.5135015094963114e-05, + "loss": 0.8336, + "step": 5140 + }, + { + "epoch": 0.8104881474474732, + "grad_norm": 0.87109375, + "learning_rate": 5.5130900744357515e-05, + "loss": 0.8002, + "step": 5141 + }, + { + "epoch": 0.8106457992948662, + "grad_norm": 1.0390625, + "learning_rate": 5.51267864888494e-05, + "loss": 1.1768, + "step": 5142 + }, + { + "epoch": 0.8108034511422593, + "grad_norm": 0.91796875, + "learning_rate": 5.512267232844746e-05, + "loss": 0.7953, + "step": 5143 + }, + { + "epoch": 0.8109611029896523, + "grad_norm": 0.9609375, + "learning_rate": 5.51185582631605e-05, + "loss": 1.1085, + "step": 5144 + }, + { + "epoch": 0.8111187548370453, + "grad_norm": 0.95703125, + "learning_rate": 5.511444429299717e-05, + "loss": 1.0579, + "step": 5145 + }, + { + "epoch": 0.8112764066844383, + "grad_norm": 1.078125, + "learning_rate": 5.511033041796623e-05, + "loss": 1.0626, + "step": 5146 + }, + { + "epoch": 0.8114340585318314, + "grad_norm": 1.109375, + "learning_rate": 5.510621663807637e-05, + "loss": 0.9026, + "step": 5147 + }, + { + "epoch": 0.8115917103792244, + "grad_norm": 0.80078125, + "learning_rate": 5.5102102953336274e-05, + "loss": 0.7934, + "step": 5148 + }, + { + "epoch": 0.8117493622266173, + "grad_norm": 0.87109375, + "learning_rate": 5.509798936375475e-05, + "loss": 0.9132, + "step": 5149 + }, + { + "epoch": 0.8119070140740103, + "grad_norm": 1.0390625, + "learning_rate": 5.509387586934047e-05, + "loss": 1.2006, + "step": 5150 + }, + { + "epoch": 0.8120646659214034, + "grad_norm": 0.93359375, + "learning_rate": 5.508976247010216e-05, + "loss": 1.0395, + "step": 5151 + }, + { + "epoch": 0.8122223177687964, + "grad_norm": 0.91796875, + "learning_rate": 5.5085649166048525e-05, + "loss": 0.8543, + "step": 5152 + }, + { + "epoch": 0.8123799696161894, + "grad_norm": 0.97265625, + "learning_rate": 5.508153595718825e-05, + "loss": 0.9978, + "step": 5153 + }, + { + "epoch": 0.8125376214635824, + "grad_norm": 1.0703125, + "learning_rate": 5.507742284353012e-05, + "loss": 1.3401, + "step": 5154 + }, + { + "epoch": 0.8126952733109755, + "grad_norm": 0.9140625, + "learning_rate": 5.507330982508284e-05, + "loss": 0.7885, + "step": 5155 + }, + { + "epoch": 0.8128529251583685, + "grad_norm": 0.94921875, + "learning_rate": 5.50691969018551e-05, + "loss": 0.9366, + "step": 5156 + }, + { + "epoch": 0.8130105770057614, + "grad_norm": 0.99609375, + "learning_rate": 5.506508407385565e-05, + "loss": 1.0174, + "step": 5157 + }, + { + "epoch": 0.8131682288531544, + "grad_norm": 0.98828125, + "learning_rate": 5.5060971341093135e-05, + "loss": 0.989, + "step": 5158 + }, + { + "epoch": 0.8133258807005475, + "grad_norm": 0.921875, + "learning_rate": 5.5056858703576354e-05, + "loss": 0.9446, + "step": 5159 + }, + { + "epoch": 0.8134835325479405, + "grad_norm": 0.88671875, + "learning_rate": 5.5052746161314e-05, + "loss": 0.8718, + "step": 5160 + }, + { + "epoch": 0.8136411843953335, + "grad_norm": 0.88671875, + "learning_rate": 5.504863371431478e-05, + "loss": 0.8263, + "step": 5161 + }, + { + "epoch": 0.8137988362427265, + "grad_norm": 0.99609375, + "learning_rate": 5.504452136258742e-05, + "loss": 1.1316, + "step": 5162 + }, + { + "epoch": 0.8139564880901196, + "grad_norm": 1.046875, + "learning_rate": 5.504040910614057e-05, + "loss": 1.2804, + "step": 5163 + }, + { + "epoch": 0.8141141399375126, + "grad_norm": 1.015625, + "learning_rate": 5.5036296944983054e-05, + "loss": 0.8445, + "step": 5164 + }, + { + "epoch": 0.8142717917849056, + "grad_norm": 0.80078125, + "learning_rate": 5.503218487912354e-05, + "loss": 0.8056, + "step": 5165 + }, + { + "epoch": 0.8144294436322985, + "grad_norm": 0.91796875, + "learning_rate": 5.5028072908570724e-05, + "loss": 1.0973, + "step": 5166 + }, + { + "epoch": 0.8145870954796915, + "grad_norm": 0.8671875, + "learning_rate": 5.502396103333336e-05, + "loss": 0.7199, + "step": 5167 + }, + { + "epoch": 0.8147447473270846, + "grad_norm": 0.92578125, + "learning_rate": 5.501984925342012e-05, + "loss": 0.831, + "step": 5168 + }, + { + "epoch": 0.8149023991744776, + "grad_norm": 1.0078125, + "learning_rate": 5.501573756883971e-05, + "loss": 0.951, + "step": 5169 + }, + { + "epoch": 0.8150600510218706, + "grad_norm": 0.87890625, + "learning_rate": 5.5011625979600904e-05, + "loss": 0.98, + "step": 5170 + }, + { + "epoch": 0.8152177028692636, + "grad_norm": 0.9453125, + "learning_rate": 5.50075144857124e-05, + "loss": 0.8224, + "step": 5171 + }, + { + "epoch": 0.8153753547166567, + "grad_norm": 1.1953125, + "learning_rate": 5.500340308718288e-05, + "loss": 1.0025, + "step": 5172 + }, + { + "epoch": 0.8155330065640497, + "grad_norm": 0.94921875, + "learning_rate": 5.499929178402108e-05, + "loss": 1.0702, + "step": 5173 + }, + { + "epoch": 0.8156906584114426, + "grad_norm": 0.81640625, + "learning_rate": 5.499518057623572e-05, + "loss": 0.8925, + "step": 5174 + }, + { + "epoch": 0.8158483102588356, + "grad_norm": 0.8984375, + "learning_rate": 5.49910694638355e-05, + "loss": 0.9471, + "step": 5175 + }, + { + "epoch": 0.8160059621062287, + "grad_norm": 0.8046875, + "learning_rate": 5.498695844682913e-05, + "loss": 0.9644, + "step": 5176 + }, + { + "epoch": 0.8161636139536217, + "grad_norm": 0.96484375, + "learning_rate": 5.4982847525225324e-05, + "loss": 1.0802, + "step": 5177 + }, + { + "epoch": 0.8163212658010147, + "grad_norm": 1.015625, + "learning_rate": 5.4978736699032765e-05, + "loss": 0.976, + "step": 5178 + }, + { + "epoch": 0.8164789176484077, + "grad_norm": 0.9296875, + "learning_rate": 5.497462596826024e-05, + "loss": 1.138, + "step": 5179 + }, + { + "epoch": 0.8166365694958008, + "grad_norm": 0.9921875, + "learning_rate": 5.497051533291642e-05, + "loss": 0.8956, + "step": 5180 + }, + { + "epoch": 0.8167942213431938, + "grad_norm": 0.9375, + "learning_rate": 5.4966404793010027e-05, + "loss": 0.9929, + "step": 5181 + }, + { + "epoch": 0.8169518731905867, + "grad_norm": 0.90234375, + "learning_rate": 5.496229434854976e-05, + "loss": 0.9973, + "step": 5182 + }, + { + "epoch": 0.8171095250379797, + "grad_norm": 0.95703125, + "learning_rate": 5.495818399954429e-05, + "loss": 1.0594, + "step": 5183 + }, + { + "epoch": 0.8172671768853728, + "grad_norm": 1.0859375, + "learning_rate": 5.4954073746002424e-05, + "loss": 1.0803, + "step": 5184 + }, + { + "epoch": 0.8174248287327658, + "grad_norm": 0.9921875, + "learning_rate": 5.4949963587932805e-05, + "loss": 1.1302, + "step": 5185 + }, + { + "epoch": 0.8175824805801588, + "grad_norm": 0.90234375, + "learning_rate": 5.494585352534418e-05, + "loss": 0.976, + "step": 5186 + }, + { + "epoch": 0.8177401324275518, + "grad_norm": 1.203125, + "learning_rate": 5.494174355824524e-05, + "loss": 0.992, + "step": 5187 + }, + { + "epoch": 0.8178977842749449, + "grad_norm": 0.9453125, + "learning_rate": 5.493763368664466e-05, + "loss": 0.9713, + "step": 5188 + }, + { + "epoch": 0.8180554361223379, + "grad_norm": 0.96875, + "learning_rate": 5.4933523910551245e-05, + "loss": 0.9347, + "step": 5189 + }, + { + "epoch": 0.8182130879697308, + "grad_norm": 0.953125, + "learning_rate": 5.492941422997362e-05, + "loss": 1.0517, + "step": 5190 + }, + { + "epoch": 0.8183707398171238, + "grad_norm": 0.90625, + "learning_rate": 5.492530464492055e-05, + "loss": 0.8317, + "step": 5191 + }, + { + "epoch": 0.8185283916645169, + "grad_norm": 1.0, + "learning_rate": 5.492119515540071e-05, + "loss": 1.1276, + "step": 5192 + }, + { + "epoch": 0.8186860435119099, + "grad_norm": 0.98828125, + "learning_rate": 5.491708576142278e-05, + "loss": 1.1118, + "step": 5193 + }, + { + "epoch": 0.8188436953593029, + "grad_norm": 1.0234375, + "learning_rate": 5.4912976462995556e-05, + "loss": 0.9278, + "step": 5194 + }, + { + "epoch": 0.8190013472066959, + "grad_norm": 0.91015625, + "learning_rate": 5.4908867260127696e-05, + "loss": 0.8707, + "step": 5195 + }, + { + "epoch": 0.819158999054089, + "grad_norm": 1.171875, + "learning_rate": 5.490475815282792e-05, + "loss": 1.0641, + "step": 5196 + }, + { + "epoch": 0.819316650901482, + "grad_norm": 0.953125, + "learning_rate": 5.490064914110491e-05, + "loss": 0.9458, + "step": 5197 + }, + { + "epoch": 0.8194743027488749, + "grad_norm": 1.1328125, + "learning_rate": 5.489654022496737e-05, + "loss": 0.8619, + "step": 5198 + }, + { + "epoch": 0.8196319545962679, + "grad_norm": 0.86328125, + "learning_rate": 5.489243140442408e-05, + "loss": 1.0698, + "step": 5199 + }, + { + "epoch": 0.819789606443661, + "grad_norm": 0.921875, + "learning_rate": 5.488832267948372e-05, + "loss": 0.9618, + "step": 5200 + }, + { + "epoch": 0.819947258291054, + "grad_norm": 0.96875, + "learning_rate": 5.488421405015496e-05, + "loss": 1.058, + "step": 5201 + }, + { + "epoch": 0.820104910138447, + "grad_norm": 0.98046875, + "learning_rate": 5.488010551644652e-05, + "loss": 0.9512, + "step": 5202 + }, + { + "epoch": 0.82026256198584, + "grad_norm": 0.94140625, + "learning_rate": 5.4875997078367135e-05, + "loss": 0.8277, + "step": 5203 + }, + { + "epoch": 0.820420213833233, + "grad_norm": 0.9453125, + "learning_rate": 5.4871888735925446e-05, + "loss": 0.9278, + "step": 5204 + }, + { + "epoch": 0.8205778656806261, + "grad_norm": 1.015625, + "learning_rate": 5.4867780489130263e-05, + "loss": 0.7493, + "step": 5205 + }, + { + "epoch": 0.820735517528019, + "grad_norm": 0.890625, + "learning_rate": 5.486367233799023e-05, + "loss": 0.8391, + "step": 5206 + }, + { + "epoch": 0.820893169375412, + "grad_norm": 0.8515625, + "learning_rate": 5.4859564282514064e-05, + "loss": 0.8964, + "step": 5207 + }, + { + "epoch": 0.821050821222805, + "grad_norm": 0.94921875, + "learning_rate": 5.485545632271046e-05, + "loss": 1.177, + "step": 5208 + }, + { + "epoch": 0.8212084730701981, + "grad_norm": 0.91015625, + "learning_rate": 5.4851348458588116e-05, + "loss": 0.8275, + "step": 5209 + }, + { + "epoch": 0.8213661249175911, + "grad_norm": 0.9296875, + "learning_rate": 5.4847240690155786e-05, + "loss": 1.0174, + "step": 5210 + }, + { + "epoch": 0.8215237767649841, + "grad_norm": 0.93359375, + "learning_rate": 5.484313301742215e-05, + "loss": 0.8695, + "step": 5211 + }, + { + "epoch": 0.8216814286123771, + "grad_norm": 1.015625, + "learning_rate": 5.483902544039592e-05, + "loss": 0.9129, + "step": 5212 + }, + { + "epoch": 0.8218390804597702, + "grad_norm": 0.859375, + "learning_rate": 5.483491795908578e-05, + "loss": 0.9097, + "step": 5213 + }, + { + "epoch": 0.8219967323071631, + "grad_norm": 1.03125, + "learning_rate": 5.483081057350046e-05, + "loss": 0.9051, + "step": 5214 + }, + { + "epoch": 0.8221543841545561, + "grad_norm": 0.921875, + "learning_rate": 5.482670328364865e-05, + "loss": 0.9964, + "step": 5215 + }, + { + "epoch": 0.8223120360019491, + "grad_norm": 1.0, + "learning_rate": 5.482259608953908e-05, + "loss": 1.2773, + "step": 5216 + }, + { + "epoch": 0.8224696878493422, + "grad_norm": 0.98046875, + "learning_rate": 5.481848899118041e-05, + "loss": 0.8949, + "step": 5217 + }, + { + "epoch": 0.8226273396967352, + "grad_norm": 0.95703125, + "learning_rate": 5.4814381988581344e-05, + "loss": 0.9458, + "step": 5218 + }, + { + "epoch": 0.8227849915441282, + "grad_norm": 0.96875, + "learning_rate": 5.481027508175065e-05, + "loss": 0.7866, + "step": 5219 + }, + { + "epoch": 0.8229426433915212, + "grad_norm": 0.9453125, + "learning_rate": 5.480616827069699e-05, + "loss": 0.8623, + "step": 5220 + }, + { + "epoch": 0.8231002952389143, + "grad_norm": 0.9375, + "learning_rate": 5.4802061555429084e-05, + "loss": 1.0991, + "step": 5221 + }, + { + "epoch": 0.8232579470863072, + "grad_norm": 1.0, + "learning_rate": 5.479795493595561e-05, + "loss": 0.8485, + "step": 5222 + }, + { + "epoch": 0.8234155989337002, + "grad_norm": 0.97265625, + "learning_rate": 5.47938484122853e-05, + "loss": 1.1294, + "step": 5223 + }, + { + "epoch": 0.8235732507810932, + "grad_norm": 0.96484375, + "learning_rate": 5.478974198442679e-05, + "loss": 1.0742, + "step": 5224 + }, + { + "epoch": 0.8237309026284863, + "grad_norm": 0.99609375, + "learning_rate": 5.47856356523889e-05, + "loss": 0.991, + "step": 5225 + }, + { + "epoch": 0.8238885544758793, + "grad_norm": 0.9140625, + "learning_rate": 5.478152941618024e-05, + "loss": 0.8446, + "step": 5226 + }, + { + "epoch": 0.8240462063232723, + "grad_norm": 0.9296875, + "learning_rate": 5.477742327580957e-05, + "loss": 0.9164, + "step": 5227 + }, + { + "epoch": 0.8242038581706653, + "grad_norm": 0.8828125, + "learning_rate": 5.477331723128556e-05, + "loss": 1.1309, + "step": 5228 + }, + { + "epoch": 0.8243615100180584, + "grad_norm": 1.0859375, + "learning_rate": 5.4769211282616874e-05, + "loss": 0.9729, + "step": 5229 + }, + { + "epoch": 0.8245191618654513, + "grad_norm": 1.0703125, + "learning_rate": 5.47651054298123e-05, + "loss": 0.946, + "step": 5230 + }, + { + "epoch": 0.8246768137128443, + "grad_norm": 0.921875, + "learning_rate": 5.476099967288051e-05, + "loss": 0.9177, + "step": 5231 + }, + { + "epoch": 0.8248344655602373, + "grad_norm": 1.1796875, + "learning_rate": 5.4756894011830185e-05, + "loss": 1.0947, + "step": 5232 + }, + { + "epoch": 0.8249921174076303, + "grad_norm": 0.8984375, + "learning_rate": 5.475278844667005e-05, + "loss": 0.9768, + "step": 5233 + }, + { + "epoch": 0.8251497692550234, + "grad_norm": 1.3203125, + "learning_rate": 5.474868297740874e-05, + "loss": 0.961, + "step": 5234 + }, + { + "epoch": 0.8253074211024164, + "grad_norm": 0.90234375, + "learning_rate": 5.474457760405505e-05, + "loss": 1.1096, + "step": 5235 + }, + { + "epoch": 0.8254650729498094, + "grad_norm": 0.83203125, + "learning_rate": 5.4740472326617645e-05, + "loss": 0.8823, + "step": 5236 + }, + { + "epoch": 0.8256227247972024, + "grad_norm": 0.8984375, + "learning_rate": 5.473636714510522e-05, + "loss": 0.8365, + "step": 5237 + }, + { + "epoch": 0.8257803766445954, + "grad_norm": 0.95703125, + "learning_rate": 5.473226205952649e-05, + "loss": 1.2306, + "step": 5238 + }, + { + "epoch": 0.8259380284919884, + "grad_norm": 0.8671875, + "learning_rate": 5.472815706989008e-05, + "loss": 0.9294, + "step": 5239 + }, + { + "epoch": 0.8260956803393814, + "grad_norm": 1.03125, + "learning_rate": 5.472405217620481e-05, + "loss": 1.0705, + "step": 5240 + }, + { + "epoch": 0.8262533321867744, + "grad_norm": 0.9296875, + "learning_rate": 5.471994737847932e-05, + "loss": 0.9364, + "step": 5241 + }, + { + "epoch": 0.8264109840341675, + "grad_norm": 0.99609375, + "learning_rate": 5.471584267672232e-05, + "loss": 0.9921, + "step": 5242 + }, + { + "epoch": 0.8265686358815605, + "grad_norm": 0.96875, + "learning_rate": 5.471173807094249e-05, + "loss": 1.0356, + "step": 5243 + }, + { + "epoch": 0.8267262877289535, + "grad_norm": 0.94921875, + "learning_rate": 5.470763356114851e-05, + "loss": 1.0048, + "step": 5244 + }, + { + "epoch": 0.8268839395763465, + "grad_norm": 0.953125, + "learning_rate": 5.4703529147349155e-05, + "loss": 0.9975, + "step": 5245 + }, + { + "epoch": 0.8270415914237395, + "grad_norm": 1.0859375, + "learning_rate": 5.469942482955307e-05, + "loss": 1.0741, + "step": 5246 + }, + { + "epoch": 0.8271992432711325, + "grad_norm": 1.0234375, + "learning_rate": 5.4695320607768975e-05, + "loss": 0.9203, + "step": 5247 + }, + { + "epoch": 0.8273568951185255, + "grad_norm": 0.88671875, + "learning_rate": 5.469121648200555e-05, + "loss": 0.8414, + "step": 5248 + }, + { + "epoch": 0.8275145469659185, + "grad_norm": 0.9453125, + "learning_rate": 5.468711245227147e-05, + "loss": 0.8854, + "step": 5249 + }, + { + "epoch": 0.8276721988133116, + "grad_norm": 0.94140625, + "learning_rate": 5.4683008518575505e-05, + "loss": 0.9234, + "step": 5250 + }, + { + "epoch": 0.8278298506607046, + "grad_norm": 0.86328125, + "learning_rate": 5.467890468092631e-05, + "loss": 0.917, + "step": 5251 + }, + { + "epoch": 0.8279875025080976, + "grad_norm": 0.96875, + "learning_rate": 5.467480093933258e-05, + "loss": 1.0449, + "step": 5252 + }, + { + "epoch": 0.8281451543554906, + "grad_norm": 0.9140625, + "learning_rate": 5.467069729380303e-05, + "loss": 0.8753, + "step": 5253 + }, + { + "epoch": 0.8283028062028835, + "grad_norm": 0.89453125, + "learning_rate": 5.466659374434634e-05, + "loss": 0.9171, + "step": 5254 + }, + { + "epoch": 0.8284604580502766, + "grad_norm": 0.93359375, + "learning_rate": 5.466249029097117e-05, + "loss": 0.9256, + "step": 5255 + }, + { + "epoch": 0.8286181098976696, + "grad_norm": 0.94140625, + "learning_rate": 5.4658386933686315e-05, + "loss": 0.8906, + "step": 5256 + }, + { + "epoch": 0.8287757617450626, + "grad_norm": 0.92578125, + "learning_rate": 5.465428367250041e-05, + "loss": 1.063, + "step": 5257 + }, + { + "epoch": 0.8289334135924556, + "grad_norm": 1.0703125, + "learning_rate": 5.465018050742215e-05, + "loss": 1.0541, + "step": 5258 + }, + { + "epoch": 0.8290910654398487, + "grad_norm": 0.953125, + "learning_rate": 5.4646077438460244e-05, + "loss": 1.1291, + "step": 5259 + }, + { + "epoch": 0.8292487172872417, + "grad_norm": 0.95703125, + "learning_rate": 5.464197446562339e-05, + "loss": 1.0995, + "step": 5260 + }, + { + "epoch": 0.8294063691346347, + "grad_norm": 0.93359375, + "learning_rate": 5.463787158892028e-05, + "loss": 0.9444, + "step": 5261 + }, + { + "epoch": 0.8295640209820276, + "grad_norm": 0.9375, + "learning_rate": 5.4633768808359596e-05, + "loss": 1.1147, + "step": 5262 + }, + { + "epoch": 0.8297216728294207, + "grad_norm": 0.87890625, + "learning_rate": 5.462966612395006e-05, + "loss": 0.6974, + "step": 5263 + }, + { + "epoch": 0.8298793246768137, + "grad_norm": 1.0625, + "learning_rate": 5.462556353570031e-05, + "loss": 1.0401, + "step": 5264 + }, + { + "epoch": 0.8300369765242067, + "grad_norm": 0.96875, + "learning_rate": 5.462146104361911e-05, + "loss": 1.0604, + "step": 5265 + }, + { + "epoch": 0.8301946283715997, + "grad_norm": 0.89453125, + "learning_rate": 5.461735864771515e-05, + "loss": 0.999, + "step": 5266 + }, + { + "epoch": 0.8303522802189928, + "grad_norm": 1.3359375, + "learning_rate": 5.46132563479971e-05, + "loss": 0.8615, + "step": 5267 + }, + { + "epoch": 0.8305099320663858, + "grad_norm": 0.8515625, + "learning_rate": 5.460915414447365e-05, + "loss": 0.9262, + "step": 5268 + }, + { + "epoch": 0.8306675839137788, + "grad_norm": 0.875, + "learning_rate": 5.460505203715347e-05, + "loss": 1.0039, + "step": 5269 + }, + { + "epoch": 0.8308252357611717, + "grad_norm": 1.0625, + "learning_rate": 5.4600950026045326e-05, + "loss": 1.0897, + "step": 5270 + }, + { + "epoch": 0.8309828876085648, + "grad_norm": 0.87890625, + "learning_rate": 5.4596848111157885e-05, + "loss": 0.9269, + "step": 5271 + }, + { + "epoch": 0.8311405394559578, + "grad_norm": 1.0234375, + "learning_rate": 5.459274629249982e-05, + "loss": 1.1924, + "step": 5272 + }, + { + "epoch": 0.8312981913033508, + "grad_norm": 0.953125, + "learning_rate": 5.458864457007982e-05, + "loss": 0.8545, + "step": 5273 + }, + { + "epoch": 0.8314558431507438, + "grad_norm": 0.84375, + "learning_rate": 5.458454294390657e-05, + "loss": 0.8142, + "step": 5274 + }, + { + "epoch": 0.8316134949981369, + "grad_norm": 0.9765625, + "learning_rate": 5.458044141398881e-05, + "loss": 0.9757, + "step": 5275 + }, + { + "epoch": 0.8317711468455299, + "grad_norm": 0.85546875, + "learning_rate": 5.457633998033522e-05, + "loss": 0.9675, + "step": 5276 + }, + { + "epoch": 0.8319287986929229, + "grad_norm": 0.9296875, + "learning_rate": 5.457223864295449e-05, + "loss": 1.0007, + "step": 5277 + }, + { + "epoch": 0.8320864505403158, + "grad_norm": 0.8828125, + "learning_rate": 5.456813740185529e-05, + "loss": 0.9493, + "step": 5278 + }, + { + "epoch": 0.8322441023877089, + "grad_norm": 0.9921875, + "learning_rate": 5.456403625704629e-05, + "loss": 1.0495, + "step": 5279 + }, + { + "epoch": 0.8324017542351019, + "grad_norm": 1.0, + "learning_rate": 5.455993520853626e-05, + "loss": 1.0945, + "step": 5280 + }, + { + "epoch": 0.8325594060824949, + "grad_norm": 0.984375, + "learning_rate": 5.455583425633384e-05, + "loss": 1.0039, + "step": 5281 + }, + { + "epoch": 0.8327170579298879, + "grad_norm": 0.87890625, + "learning_rate": 5.4551733400447747e-05, + "loss": 0.8577, + "step": 5282 + }, + { + "epoch": 0.832874709777281, + "grad_norm": 0.96484375, + "learning_rate": 5.454763264088665e-05, + "loss": 1.0563, + "step": 5283 + }, + { + "epoch": 0.833032361624674, + "grad_norm": 0.8984375, + "learning_rate": 5.45435319776592e-05, + "loss": 1.074, + "step": 5284 + }, + { + "epoch": 0.833190013472067, + "grad_norm": 0.8828125, + "learning_rate": 5.453943141077418e-05, + "loss": 0.9303, + "step": 5285 + }, + { + "epoch": 0.8333476653194599, + "grad_norm": 1.0546875, + "learning_rate": 5.453533094024024e-05, + "loss": 1.083, + "step": 5286 + }, + { + "epoch": 0.8335053171668529, + "grad_norm": 0.890625, + "learning_rate": 5.4531230566066074e-05, + "loss": 0.913, + "step": 5287 + }, + { + "epoch": 0.833662969014246, + "grad_norm": 0.9375, + "learning_rate": 5.452713028826035e-05, + "loss": 0.962, + "step": 5288 + }, + { + "epoch": 0.833820620861639, + "grad_norm": 0.953125, + "learning_rate": 5.4523030106831754e-05, + "loss": 0.8838, + "step": 5289 + }, + { + "epoch": 0.833978272709032, + "grad_norm": 0.98046875, + "learning_rate": 5.451893002178903e-05, + "loss": 0.9337, + "step": 5290 + }, + { + "epoch": 0.834135924556425, + "grad_norm": 1.03125, + "learning_rate": 5.451483003314082e-05, + "loss": 0.822, + "step": 5291 + }, + { + "epoch": 0.8342935764038181, + "grad_norm": 0.78515625, + "learning_rate": 5.4510730140895835e-05, + "loss": 0.8248, + "step": 5292 + }, + { + "epoch": 0.8344512282512111, + "grad_norm": 0.90234375, + "learning_rate": 5.450663034506276e-05, + "loss": 0.9307, + "step": 5293 + }, + { + "epoch": 0.834608880098604, + "grad_norm": 0.9609375, + "learning_rate": 5.450253064565025e-05, + "loss": 1.0506, + "step": 5294 + }, + { + "epoch": 0.834766531945997, + "grad_norm": 1.203125, + "learning_rate": 5.449843104266705e-05, + "loss": 0.9977, + "step": 5295 + }, + { + "epoch": 0.8349241837933901, + "grad_norm": 1.0234375, + "learning_rate": 5.449433153612184e-05, + "loss": 1.0601, + "step": 5296 + }, + { + "epoch": 0.8350818356407831, + "grad_norm": 1.1015625, + "learning_rate": 5.4490232126023286e-05, + "loss": 0.9959, + "step": 5297 + }, + { + "epoch": 0.8352394874881761, + "grad_norm": 1.0625, + "learning_rate": 5.4486132812380085e-05, + "loss": 1.0214, + "step": 5298 + }, + { + "epoch": 0.8353971393355691, + "grad_norm": 0.91015625, + "learning_rate": 5.448203359520092e-05, + "loss": 0.9181, + "step": 5299 + }, + { + "epoch": 0.8355547911829622, + "grad_norm": 0.9140625, + "learning_rate": 5.447793447449448e-05, + "loss": 0.8187, + "step": 5300 + }, + { + "epoch": 0.8357124430303552, + "grad_norm": 0.98046875, + "learning_rate": 5.447383545026947e-05, + "loss": 0.9523, + "step": 5301 + }, + { + "epoch": 0.8358700948777481, + "grad_norm": 1.0234375, + "learning_rate": 5.4469736522534554e-05, + "loss": 1.06, + "step": 5302 + }, + { + "epoch": 0.8360277467251411, + "grad_norm": 0.85546875, + "learning_rate": 5.446563769129839e-05, + "loss": 0.8695, + "step": 5303 + }, + { + "epoch": 0.8361853985725342, + "grad_norm": 0.984375, + "learning_rate": 5.4461538956569734e-05, + "loss": 1.0142, + "step": 5304 + }, + { + "epoch": 0.8363430504199272, + "grad_norm": 0.9921875, + "learning_rate": 5.4457440318357266e-05, + "loss": 1.1545, + "step": 5305 + }, + { + "epoch": 0.8365007022673202, + "grad_norm": 0.984375, + "learning_rate": 5.445334177666963e-05, + "loss": 1.0802, + "step": 5306 + }, + { + "epoch": 0.8366583541147132, + "grad_norm": 0.91796875, + "learning_rate": 5.444924333151554e-05, + "loss": 0.8082, + "step": 5307 + }, + { + "epoch": 0.8368160059621063, + "grad_norm": 0.94921875, + "learning_rate": 5.444514498290367e-05, + "loss": 0.8939, + "step": 5308 + }, + { + "epoch": 0.8369736578094993, + "grad_norm": 0.90625, + "learning_rate": 5.4441046730842695e-05, + "loss": 1.0111, + "step": 5309 + }, + { + "epoch": 0.8371313096568922, + "grad_norm": 0.859375, + "learning_rate": 5.443694857534134e-05, + "loss": 0.8811, + "step": 5310 + }, + { + "epoch": 0.8372889615042852, + "grad_norm": 1.046875, + "learning_rate": 5.443285051640826e-05, + "loss": 1.0414, + "step": 5311 + }, + { + "epoch": 0.8374466133516782, + "grad_norm": 0.984375, + "learning_rate": 5.442875255405215e-05, + "loss": 1.2941, + "step": 5312 + }, + { + "epoch": 0.8376042651990713, + "grad_norm": 0.83984375, + "learning_rate": 5.442465468828171e-05, + "loss": 0.8372, + "step": 5313 + }, + { + "epoch": 0.8377619170464643, + "grad_norm": 0.83984375, + "learning_rate": 5.442055691910557e-05, + "loss": 0.8019, + "step": 5314 + }, + { + "epoch": 0.8379195688938573, + "grad_norm": 0.94140625, + "learning_rate": 5.441645924653247e-05, + "loss": 1.1077, + "step": 5315 + }, + { + "epoch": 0.8380772207412504, + "grad_norm": 1.0390625, + "learning_rate": 5.4412361670571096e-05, + "loss": 1.1403, + "step": 5316 + }, + { + "epoch": 0.8382348725886434, + "grad_norm": 1.15625, + "learning_rate": 5.4408264191230116e-05, + "loss": 1.1413, + "step": 5317 + }, + { + "epoch": 0.8383925244360363, + "grad_norm": 0.91796875, + "learning_rate": 5.440416680851821e-05, + "loss": 1.013, + "step": 5318 + }, + { + "epoch": 0.8385501762834293, + "grad_norm": 0.93359375, + "learning_rate": 5.440006952244403e-05, + "loss": 0.9658, + "step": 5319 + }, + { + "epoch": 0.8387078281308223, + "grad_norm": 0.94140625, + "learning_rate": 5.4395972333016345e-05, + "loss": 0.9723, + "step": 5320 + }, + { + "epoch": 0.8388654799782154, + "grad_norm": 0.96875, + "learning_rate": 5.4391875240243774e-05, + "loss": 1.1316, + "step": 5321 + }, + { + "epoch": 0.8390231318256084, + "grad_norm": 0.953125, + "learning_rate": 5.438777824413502e-05, + "loss": 1.0209, + "step": 5322 + }, + { + "epoch": 0.8391807836730014, + "grad_norm": 0.9296875, + "learning_rate": 5.4383681344698764e-05, + "loss": 1.1083, + "step": 5323 + }, + { + "epoch": 0.8393384355203944, + "grad_norm": 0.97265625, + "learning_rate": 5.437958454194365e-05, + "loss": 1.0236, + "step": 5324 + }, + { + "epoch": 0.8394960873677875, + "grad_norm": 1.1171875, + "learning_rate": 5.4375487835878444e-05, + "loss": 1.1842, + "step": 5325 + }, + { + "epoch": 0.8396537392151805, + "grad_norm": 0.9921875, + "learning_rate": 5.4371391226511783e-05, + "loss": 0.9599, + "step": 5326 + }, + { + "epoch": 0.8398113910625734, + "grad_norm": 0.97265625, + "learning_rate": 5.436729471385234e-05, + "loss": 0.962, + "step": 5327 + }, + { + "epoch": 0.8399690429099664, + "grad_norm": 0.99609375, + "learning_rate": 5.4363198297908815e-05, + "loss": 1.1417, + "step": 5328 + }, + { + "epoch": 0.8401266947573595, + "grad_norm": 0.91796875, + "learning_rate": 5.435910197868984e-05, + "loss": 0.9005, + "step": 5329 + }, + { + "epoch": 0.8402843466047525, + "grad_norm": 0.79296875, + "learning_rate": 5.435500575620418e-05, + "loss": 0.9509, + "step": 5330 + }, + { + "epoch": 0.8404419984521455, + "grad_norm": 0.953125, + "learning_rate": 5.435090963046048e-05, + "loss": 0.8581, + "step": 5331 + }, + { + "epoch": 0.8405996502995385, + "grad_norm": 0.9140625, + "learning_rate": 5.4346813601467405e-05, + "loss": 1.0648, + "step": 5332 + }, + { + "epoch": 0.8407573021469316, + "grad_norm": 1.1484375, + "learning_rate": 5.434271766923367e-05, + "loss": 1.1964, + "step": 5333 + }, + { + "epoch": 0.8409149539943246, + "grad_norm": 1.6640625, + "learning_rate": 5.4338621833767875e-05, + "loss": 1.1655, + "step": 5334 + }, + { + "epoch": 0.8410726058417175, + "grad_norm": 0.9375, + "learning_rate": 5.433452609507881e-05, + "loss": 0.8811, + "step": 5335 + }, + { + "epoch": 0.8412302576891105, + "grad_norm": 0.83984375, + "learning_rate": 5.433043045317512e-05, + "loss": 0.8548, + "step": 5336 + }, + { + "epoch": 0.8413879095365036, + "grad_norm": 0.84765625, + "learning_rate": 5.432633490806545e-05, + "loss": 0.7957, + "step": 5337 + }, + { + "epoch": 0.8415455613838966, + "grad_norm": 1.34375, + "learning_rate": 5.432223945975852e-05, + "loss": 1.1343, + "step": 5338 + }, + { + "epoch": 0.8417032132312896, + "grad_norm": 0.90625, + "learning_rate": 5.431814410826298e-05, + "loss": 0.9082, + "step": 5339 + }, + { + "epoch": 0.8418608650786826, + "grad_norm": 0.90625, + "learning_rate": 5.4314048853587485e-05, + "loss": 1.0591, + "step": 5340 + }, + { + "epoch": 0.8420185169260757, + "grad_norm": 0.79296875, + "learning_rate": 5.430995369574079e-05, + "loss": 0.7362, + "step": 5341 + }, + { + "epoch": 0.8421761687734687, + "grad_norm": 0.89453125, + "learning_rate": 5.430585863473154e-05, + "loss": 0.8145, + "step": 5342 + }, + { + "epoch": 0.8423338206208616, + "grad_norm": 0.9609375, + "learning_rate": 5.430176367056842e-05, + "loss": 0.9817, + "step": 5343 + }, + { + "epoch": 0.8424914724682546, + "grad_norm": 0.98828125, + "learning_rate": 5.429766880326009e-05, + "loss": 0.9073, + "step": 5344 + }, + { + "epoch": 0.8426491243156476, + "grad_norm": 1.0234375, + "learning_rate": 5.429357403281524e-05, + "loss": 1.0548, + "step": 5345 + }, + { + "epoch": 0.8428067761630407, + "grad_norm": 1.1015625, + "learning_rate": 5.4289479359242555e-05, + "loss": 1.2482, + "step": 5346 + }, + { + "epoch": 0.8429644280104337, + "grad_norm": 0.87109375, + "learning_rate": 5.42853847825507e-05, + "loss": 1.0408, + "step": 5347 + }, + { + "epoch": 0.8431220798578267, + "grad_norm": 1.53125, + "learning_rate": 5.4281290302748357e-05, + "loss": 1.0326, + "step": 5348 + }, + { + "epoch": 0.8432797317052197, + "grad_norm": 0.96484375, + "learning_rate": 5.427719591984421e-05, + "loss": 0.9507, + "step": 5349 + }, + { + "epoch": 0.8434373835526128, + "grad_norm": 0.8671875, + "learning_rate": 5.427310163384689e-05, + "loss": 0.8769, + "step": 5350 + }, + { + "epoch": 0.8435950354000057, + "grad_norm": 1.0390625, + "learning_rate": 5.426900744476515e-05, + "loss": 0.9115, + "step": 5351 + }, + { + "epoch": 0.8437526872473987, + "grad_norm": 0.93359375, + "learning_rate": 5.426491335260764e-05, + "loss": 1.0168, + "step": 5352 + }, + { + "epoch": 0.8439103390947917, + "grad_norm": 1.546875, + "learning_rate": 5.426081935738303e-05, + "loss": 0.7667, + "step": 5353 + }, + { + "epoch": 0.8440679909421848, + "grad_norm": 0.85546875, + "learning_rate": 5.425672545910001e-05, + "loss": 0.8063, + "step": 5354 + }, + { + "epoch": 0.8442256427895778, + "grad_norm": 0.94921875, + "learning_rate": 5.42526316577672e-05, + "loss": 0.7669, + "step": 5355 + }, + { + "epoch": 0.8443832946369708, + "grad_norm": 0.96875, + "learning_rate": 5.424853795339335e-05, + "loss": 0.9334, + "step": 5356 + }, + { + "epoch": 0.8445409464843638, + "grad_norm": 1.0390625, + "learning_rate": 5.424444434598712e-05, + "loss": 0.9616, + "step": 5357 + }, + { + "epoch": 0.8446985983317569, + "grad_norm": 0.921875, + "learning_rate": 5.424035083555718e-05, + "loss": 1.1178, + "step": 5358 + }, + { + "epoch": 0.8448562501791498, + "grad_norm": 0.87890625, + "learning_rate": 5.4236257422112205e-05, + "loss": 0.9606, + "step": 5359 + }, + { + "epoch": 0.8450139020265428, + "grad_norm": 0.9765625, + "learning_rate": 5.423216410566081e-05, + "loss": 0.8693, + "step": 5360 + }, + { + "epoch": 0.8451715538739358, + "grad_norm": 0.87109375, + "learning_rate": 5.422807088621178e-05, + "loss": 0.8453, + "step": 5361 + }, + { + "epoch": 0.8453292057213289, + "grad_norm": 0.90625, + "learning_rate": 5.422397776377373e-05, + "loss": 1.0678, + "step": 5362 + }, + { + "epoch": 0.8454868575687219, + "grad_norm": 0.93359375, + "learning_rate": 5.4219884738355356e-05, + "loss": 0.8089, + "step": 5363 + }, + { + "epoch": 0.8456445094161149, + "grad_norm": 1.109375, + "learning_rate": 5.42157918099653e-05, + "loss": 1.0632, + "step": 5364 + }, + { + "epoch": 0.8458021612635079, + "grad_norm": 0.9921875, + "learning_rate": 5.421169897861223e-05, + "loss": 0.935, + "step": 5365 + }, + { + "epoch": 0.845959813110901, + "grad_norm": 0.93359375, + "learning_rate": 5.420760624430488e-05, + "loss": 0.9009, + "step": 5366 + }, + { + "epoch": 0.8461174649582939, + "grad_norm": 1.2421875, + "learning_rate": 5.420351360705189e-05, + "loss": 1.066, + "step": 5367 + }, + { + "epoch": 0.8462751168056869, + "grad_norm": 0.921875, + "learning_rate": 5.4199421066861934e-05, + "loss": 0.8118, + "step": 5368 + }, + { + "epoch": 0.8464327686530799, + "grad_norm": 0.98828125, + "learning_rate": 5.41953286237437e-05, + "loss": 1.0136, + "step": 5369 + }, + { + "epoch": 0.846590420500473, + "grad_norm": 0.88671875, + "learning_rate": 5.419123627770579e-05, + "loss": 0.8225, + "step": 5370 + }, + { + "epoch": 0.846748072347866, + "grad_norm": 0.90625, + "learning_rate": 5.4187144028756994e-05, + "loss": 0.9703, + "step": 5371 + }, + { + "epoch": 0.846905724195259, + "grad_norm": 0.828125, + "learning_rate": 5.418305187690592e-05, + "loss": 0.8827, + "step": 5372 + }, + { + "epoch": 0.847063376042652, + "grad_norm": 0.87890625, + "learning_rate": 5.417895982216126e-05, + "loss": 0.8912, + "step": 5373 + }, + { + "epoch": 0.847221027890045, + "grad_norm": 1.046875, + "learning_rate": 5.417486786453165e-05, + "loss": 1.0141, + "step": 5374 + }, + { + "epoch": 0.847378679737438, + "grad_norm": 1.0234375, + "learning_rate": 5.417077600402577e-05, + "loss": 0.9664, + "step": 5375 + }, + { + "epoch": 0.847536331584831, + "grad_norm": 0.94140625, + "learning_rate": 5.416668424065234e-05, + "loss": 1.0336, + "step": 5376 + }, + { + "epoch": 0.847693983432224, + "grad_norm": 0.9375, + "learning_rate": 5.4162592574420015e-05, + "loss": 1.0457, + "step": 5377 + }, + { + "epoch": 0.847851635279617, + "grad_norm": 1.03125, + "learning_rate": 5.415850100533745e-05, + "loss": 0.957, + "step": 5378 + }, + { + "epoch": 0.8480092871270101, + "grad_norm": 0.99609375, + "learning_rate": 5.415440953341331e-05, + "loss": 0.9653, + "step": 5379 + }, + { + "epoch": 0.8481669389744031, + "grad_norm": 1.515625, + "learning_rate": 5.4150318158656255e-05, + "loss": 0.9962, + "step": 5380 + }, + { + "epoch": 0.8483245908217961, + "grad_norm": 0.8515625, + "learning_rate": 5.414622688107501e-05, + "loss": 0.9141, + "step": 5381 + }, + { + "epoch": 0.8484822426691891, + "grad_norm": 0.9609375, + "learning_rate": 5.414213570067822e-05, + "loss": 0.9514, + "step": 5382 + }, + { + "epoch": 0.8486398945165821, + "grad_norm": 1.234375, + "learning_rate": 5.413804461747456e-05, + "loss": 0.9546, + "step": 5383 + }, + { + "epoch": 0.8487975463639751, + "grad_norm": 1.09375, + "learning_rate": 5.4133953631472676e-05, + "loss": 1.0411, + "step": 5384 + }, + { + "epoch": 0.8489551982113681, + "grad_norm": 0.9765625, + "learning_rate": 5.412986274268126e-05, + "loss": 0.9021, + "step": 5385 + }, + { + "epoch": 0.8491128500587611, + "grad_norm": 0.9296875, + "learning_rate": 5.4125771951108993e-05, + "loss": 1.0593, + "step": 5386 + }, + { + "epoch": 0.8492705019061542, + "grad_norm": 0.9375, + "learning_rate": 5.4121681256764514e-05, + "loss": 1.0389, + "step": 5387 + }, + { + "epoch": 0.8494281537535472, + "grad_norm": 0.89453125, + "learning_rate": 5.411759065965651e-05, + "loss": 0.9675, + "step": 5388 + }, + { + "epoch": 0.8495858056009402, + "grad_norm": 1.4296875, + "learning_rate": 5.4113500159793615e-05, + "loss": 1.1298, + "step": 5389 + }, + { + "epoch": 0.8497434574483332, + "grad_norm": 0.9296875, + "learning_rate": 5.410940975718458e-05, + "loss": 0.9422, + "step": 5390 + }, + { + "epoch": 0.8499011092957262, + "grad_norm": 0.95703125, + "learning_rate": 5.410531945183802e-05, + "loss": 0.9882, + "step": 5391 + }, + { + "epoch": 0.8500587611431192, + "grad_norm": 0.9921875, + "learning_rate": 5.410122924376262e-05, + "loss": 1.0268, + "step": 5392 + }, + { + "epoch": 0.8502164129905122, + "grad_norm": 1.03125, + "learning_rate": 5.4097139132967036e-05, + "loss": 1.1408, + "step": 5393 + }, + { + "epoch": 0.8503740648379052, + "grad_norm": 0.99609375, + "learning_rate": 5.4093049119459936e-05, + "loss": 1.2685, + "step": 5394 + }, + { + "epoch": 0.8505317166852983, + "grad_norm": 0.96484375, + "learning_rate": 5.4088959203249965e-05, + "loss": 0.9417, + "step": 5395 + }, + { + "epoch": 0.8506893685326913, + "grad_norm": 0.9453125, + "learning_rate": 5.408486938434585e-05, + "loss": 0.973, + "step": 5396 + }, + { + "epoch": 0.8508470203800843, + "grad_norm": 0.984375, + "learning_rate": 5.408077966275624e-05, + "loss": 0.9648, + "step": 5397 + }, + { + "epoch": 0.8510046722274773, + "grad_norm": 0.953125, + "learning_rate": 5.407669003848978e-05, + "loss": 0.875, + "step": 5398 + }, + { + "epoch": 0.8511623240748702, + "grad_norm": 0.984375, + "learning_rate": 5.407260051155516e-05, + "loss": 0.8967, + "step": 5399 + }, + { + "epoch": 0.8513199759222633, + "grad_norm": 0.9375, + "learning_rate": 5.4068511081961004e-05, + "loss": 0.8516, + "step": 5400 + }, + { + "epoch": 0.8514776277696563, + "grad_norm": 0.84765625, + "learning_rate": 5.4064421749716046e-05, + "loss": 0.8777, + "step": 5401 + }, + { + "epoch": 0.8516352796170493, + "grad_norm": 0.98046875, + "learning_rate": 5.406033251482892e-05, + "loss": 0.9872, + "step": 5402 + }, + { + "epoch": 0.8517929314644423, + "grad_norm": 0.796875, + "learning_rate": 5.40562433773083e-05, + "loss": 0.683, + "step": 5403 + }, + { + "epoch": 0.8519505833118354, + "grad_norm": 0.95703125, + "learning_rate": 5.405215433716284e-05, + "loss": 1.1625, + "step": 5404 + }, + { + "epoch": 0.8521082351592284, + "grad_norm": 0.90625, + "learning_rate": 5.404806539440117e-05, + "loss": 0.893, + "step": 5405 + }, + { + "epoch": 0.8522658870066214, + "grad_norm": 0.9765625, + "learning_rate": 5.404397654903204e-05, + "loss": 1.0531, + "step": 5406 + }, + { + "epoch": 0.8524235388540143, + "grad_norm": 1.0390625, + "learning_rate": 5.4039887801064085e-05, + "loss": 1.1981, + "step": 5407 + }, + { + "epoch": 0.8525811907014074, + "grad_norm": 0.87890625, + "learning_rate": 5.4035799150505936e-05, + "loss": 1.0637, + "step": 5408 + }, + { + "epoch": 0.8527388425488004, + "grad_norm": 0.86328125, + "learning_rate": 5.4031710597366314e-05, + "loss": 0.8279, + "step": 5409 + }, + { + "epoch": 0.8528964943961934, + "grad_norm": 1.046875, + "learning_rate": 5.4027622141653797e-05, + "loss": 1.037, + "step": 5410 + }, + { + "epoch": 0.8530541462435864, + "grad_norm": 0.91015625, + "learning_rate": 5.4023533783377146e-05, + "loss": 0.9613, + "step": 5411 + }, + { + "epoch": 0.8532117980909795, + "grad_norm": 1.09375, + "learning_rate": 5.4019445522544996e-05, + "loss": 1.1839, + "step": 5412 + }, + { + "epoch": 0.8533694499383725, + "grad_norm": 0.9296875, + "learning_rate": 5.4015357359166005e-05, + "loss": 1.0147, + "step": 5413 + }, + { + "epoch": 0.8535271017857655, + "grad_norm": 0.89453125, + "learning_rate": 5.401126929324882e-05, + "loss": 0.9077, + "step": 5414 + }, + { + "epoch": 0.8536847536331584, + "grad_norm": 0.953125, + "learning_rate": 5.400718132480209e-05, + "loss": 0.9579, + "step": 5415 + }, + { + "epoch": 0.8538424054805515, + "grad_norm": 1.0234375, + "learning_rate": 5.400309345383457e-05, + "loss": 0.9847, + "step": 5416 + }, + { + "epoch": 0.8540000573279445, + "grad_norm": 0.97265625, + "learning_rate": 5.399900568035483e-05, + "loss": 0.9473, + "step": 5417 + }, + { + "epoch": 0.8541577091753375, + "grad_norm": 0.89453125, + "learning_rate": 5.3994918004371594e-05, + "loss": 0.8808, + "step": 5418 + }, + { + "epoch": 0.8543153610227305, + "grad_norm": 0.9921875, + "learning_rate": 5.399083042589348e-05, + "loss": 1.0036, + "step": 5419 + }, + { + "epoch": 0.8544730128701236, + "grad_norm": 0.97265625, + "learning_rate": 5.398674294492915e-05, + "loss": 0.9734, + "step": 5420 + }, + { + "epoch": 0.8546306647175166, + "grad_norm": 0.8125, + "learning_rate": 5.3982655561487317e-05, + "loss": 0.9005, + "step": 5421 + }, + { + "epoch": 0.8547883165649096, + "grad_norm": 1.140625, + "learning_rate": 5.397856827557661e-05, + "loss": 0.9832, + "step": 5422 + }, + { + "epoch": 0.8549459684123025, + "grad_norm": 0.95703125, + "learning_rate": 5.3974481087205706e-05, + "loss": 0.8583, + "step": 5423 + }, + { + "epoch": 0.8551036202596956, + "grad_norm": 0.82421875, + "learning_rate": 5.397039399638326e-05, + "loss": 0.9437, + "step": 5424 + }, + { + "epoch": 0.8552612721070886, + "grad_norm": 1.046875, + "learning_rate": 5.396630700311793e-05, + "loss": 0.8679, + "step": 5425 + }, + { + "epoch": 0.8554189239544816, + "grad_norm": 0.99609375, + "learning_rate": 5.396222010741834e-05, + "loss": 0.9481, + "step": 5426 + }, + { + "epoch": 0.8555765758018746, + "grad_norm": 1.1484375, + "learning_rate": 5.3958133309293245e-05, + "loss": 0.8645, + "step": 5427 + }, + { + "epoch": 0.8557342276492677, + "grad_norm": 1.1171875, + "learning_rate": 5.395404660875124e-05, + "loss": 1.032, + "step": 5428 + }, + { + "epoch": 0.8558918794966607, + "grad_norm": 1.0546875, + "learning_rate": 5.3949960005801004e-05, + "loss": 1.2569, + "step": 5429 + }, + { + "epoch": 0.8560495313440537, + "grad_norm": 0.953125, + "learning_rate": 5.3945873500451196e-05, + "loss": 0.7883, + "step": 5430 + }, + { + "epoch": 0.8562071831914466, + "grad_norm": 1.0390625, + "learning_rate": 5.394178709271047e-05, + "loss": 0.996, + "step": 5431 + }, + { + "epoch": 0.8563648350388396, + "grad_norm": 1.0859375, + "learning_rate": 5.3937700782587506e-05, + "loss": 1.0072, + "step": 5432 + }, + { + "epoch": 0.8565224868862327, + "grad_norm": 0.87109375, + "learning_rate": 5.393361457009095e-05, + "loss": 0.8809, + "step": 5433 + }, + { + "epoch": 0.8566801387336257, + "grad_norm": 0.99609375, + "learning_rate": 5.392952845522947e-05, + "loss": 0.9547, + "step": 5434 + }, + { + "epoch": 0.8568377905810187, + "grad_norm": 1.109375, + "learning_rate": 5.392544243801167e-05, + "loss": 0.9758, + "step": 5435 + }, + { + "epoch": 0.8569954424284117, + "grad_norm": 0.890625, + "learning_rate": 5.392135651844631e-05, + "loss": 1.0272, + "step": 5436 + }, + { + "epoch": 0.8571530942758048, + "grad_norm": 0.89453125, + "learning_rate": 5.3917270696542e-05, + "loss": 0.9655, + "step": 5437 + }, + { + "epoch": 0.8573107461231978, + "grad_norm": 0.953125, + "learning_rate": 5.391318497230739e-05, + "loss": 0.8843, + "step": 5438 + }, + { + "epoch": 0.8574683979705907, + "grad_norm": 0.83203125, + "learning_rate": 5.390909934575116e-05, + "loss": 0.8495, + "step": 5439 + }, + { + "epoch": 0.8576260498179837, + "grad_norm": 0.8984375, + "learning_rate": 5.390501381688191e-05, + "loss": 0.9226, + "step": 5440 + }, + { + "epoch": 0.8577837016653768, + "grad_norm": 0.95703125, + "learning_rate": 5.390092838570841e-05, + "loss": 0.9759, + "step": 5441 + }, + { + "epoch": 0.8579413535127698, + "grad_norm": 0.91796875, + "learning_rate": 5.389684305223923e-05, + "loss": 0.9135, + "step": 5442 + }, + { + "epoch": 0.8580990053601628, + "grad_norm": 1.015625, + "learning_rate": 5.3892757816483073e-05, + "loss": 1.0694, + "step": 5443 + }, + { + "epoch": 0.8582566572075558, + "grad_norm": 0.9609375, + "learning_rate": 5.388867267844857e-05, + "loss": 0.7533, + "step": 5444 + }, + { + "epoch": 0.8584143090549489, + "grad_norm": 0.98828125, + "learning_rate": 5.388458763814437e-05, + "loss": 0.8991, + "step": 5445 + }, + { + "epoch": 0.8585719609023419, + "grad_norm": 0.9765625, + "learning_rate": 5.388050269557917e-05, + "loss": 0.9416, + "step": 5446 + }, + { + "epoch": 0.8587296127497348, + "grad_norm": 6.03125, + "learning_rate": 5.387641785076162e-05, + "loss": 0.9521, + "step": 5447 + }, + { + "epoch": 0.8588872645971278, + "grad_norm": 1.0859375, + "learning_rate": 5.387233310370036e-05, + "loss": 1.2063, + "step": 5448 + }, + { + "epoch": 0.8590449164445209, + "grad_norm": 0.91796875, + "learning_rate": 5.386824845440406e-05, + "loss": 0.7649, + "step": 5449 + }, + { + "epoch": 0.8592025682919139, + "grad_norm": 0.90625, + "learning_rate": 5.3864163902881316e-05, + "loss": 0.9348, + "step": 5450 + }, + { + "epoch": 0.8593602201393069, + "grad_norm": 0.828125, + "learning_rate": 5.3860079449140886e-05, + "loss": 0.8151, + "step": 5451 + }, + { + "epoch": 0.8595178719866999, + "grad_norm": 0.84375, + "learning_rate": 5.385599509319139e-05, + "loss": 0.9416, + "step": 5452 + }, + { + "epoch": 0.859675523834093, + "grad_norm": 1.2265625, + "learning_rate": 5.385191083504146e-05, + "loss": 0.8906, + "step": 5453 + }, + { + "epoch": 0.859833175681486, + "grad_norm": 0.90625, + "learning_rate": 5.3847826674699785e-05, + "loss": 0.9712, + "step": 5454 + }, + { + "epoch": 0.8599908275288789, + "grad_norm": 1.0703125, + "learning_rate": 5.384374261217495e-05, + "loss": 0.884, + "step": 5455 + }, + { + "epoch": 0.8601484793762719, + "grad_norm": 1.0, + "learning_rate": 5.383965864747571e-05, + "loss": 0.9084, + "step": 5456 + }, + { + "epoch": 0.860306131223665, + "grad_norm": 1.03125, + "learning_rate": 5.3835574780610676e-05, + "loss": 1.1974, + "step": 5457 + }, + { + "epoch": 0.860463783071058, + "grad_norm": 1.078125, + "learning_rate": 5.383149101158851e-05, + "loss": 1.1331, + "step": 5458 + }, + { + "epoch": 0.860621434918451, + "grad_norm": 0.8984375, + "learning_rate": 5.3827407340417844e-05, + "loss": 0.8904, + "step": 5459 + }, + { + "epoch": 0.860779086765844, + "grad_norm": 0.9609375, + "learning_rate": 5.382332376710731e-05, + "loss": 1.0053, + "step": 5460 + }, + { + "epoch": 0.860936738613237, + "grad_norm": 1.0234375, + "learning_rate": 5.381924029166565e-05, + "loss": 1.0447, + "step": 5461 + }, + { + "epoch": 0.8610943904606301, + "grad_norm": 1.046875, + "learning_rate": 5.381515691410147e-05, + "loss": 0.9282, + "step": 5462 + }, + { + "epoch": 0.861252042308023, + "grad_norm": 1.0078125, + "learning_rate": 5.381107363442341e-05, + "loss": 0.8708, + "step": 5463 + }, + { + "epoch": 0.861409694155416, + "grad_norm": 0.95703125, + "learning_rate": 5.380699045264017e-05, + "loss": 1.1283, + "step": 5464 + }, + { + "epoch": 0.861567346002809, + "grad_norm": 0.96484375, + "learning_rate": 5.380290736876031e-05, + "loss": 1.1498, + "step": 5465 + }, + { + "epoch": 0.8617249978502021, + "grad_norm": 0.95703125, + "learning_rate": 5.37988243827926e-05, + "loss": 0.9288, + "step": 5466 + }, + { + "epoch": 0.8618826496975951, + "grad_norm": 0.89453125, + "learning_rate": 5.3794741494745636e-05, + "loss": 0.9818, + "step": 5467 + }, + { + "epoch": 0.8620403015449881, + "grad_norm": 0.94921875, + "learning_rate": 5.379065870462807e-05, + "loss": 0.8926, + "step": 5468 + }, + { + "epoch": 0.8621979533923811, + "grad_norm": 1.0078125, + "learning_rate": 5.3786576012448564e-05, + "loss": 0.9919, + "step": 5469 + }, + { + "epoch": 0.8623556052397742, + "grad_norm": 0.96875, + "learning_rate": 5.378249341821577e-05, + "loss": 0.881, + "step": 5470 + }, + { + "epoch": 0.8625132570871671, + "grad_norm": 1.015625, + "learning_rate": 5.3778410921938335e-05, + "loss": 1.1738, + "step": 5471 + }, + { + "epoch": 0.8626709089345601, + "grad_norm": 1.0390625, + "learning_rate": 5.377432852362493e-05, + "loss": 0.8708, + "step": 5472 + }, + { + "epoch": 0.8628285607819531, + "grad_norm": 1.0078125, + "learning_rate": 5.377024622328418e-05, + "loss": 0.934, + "step": 5473 + }, + { + "epoch": 0.8629862126293462, + "grad_norm": 1.984375, + "learning_rate": 5.376616402092472e-05, + "loss": 0.8773, + "step": 5474 + }, + { + "epoch": 0.8631438644767392, + "grad_norm": 0.94140625, + "learning_rate": 5.3762081916555277e-05, + "loss": 1.0426, + "step": 5475 + }, + { + "epoch": 0.8633015163241322, + "grad_norm": 0.94140625, + "learning_rate": 5.3757999910184444e-05, + "loss": 0.8709, + "step": 5476 + }, + { + "epoch": 0.8634591681715252, + "grad_norm": 0.890625, + "learning_rate": 5.3753918001820894e-05, + "loss": 0.972, + "step": 5477 + }, + { + "epoch": 0.8636168200189183, + "grad_norm": 0.96484375, + "learning_rate": 5.374983619147327e-05, + "loss": 1.005, + "step": 5478 + }, + { + "epoch": 0.8637744718663113, + "grad_norm": 1.484375, + "learning_rate": 5.3745754479150225e-05, + "loss": 0.9892, + "step": 5479 + }, + { + "epoch": 0.8639321237137042, + "grad_norm": 0.984375, + "learning_rate": 5.374167286486037e-05, + "loss": 0.8773, + "step": 5480 + }, + { + "epoch": 0.8640897755610972, + "grad_norm": 0.9453125, + "learning_rate": 5.3737591348612436e-05, + "loss": 0.8845, + "step": 5481 + }, + { + "epoch": 0.8642474274084903, + "grad_norm": 0.98046875, + "learning_rate": 5.373350993041504e-05, + "loss": 0.8877, + "step": 5482 + }, + { + "epoch": 0.8644050792558833, + "grad_norm": 0.96484375, + "learning_rate": 5.3729428610276814e-05, + "loss": 1.0384, + "step": 5483 + }, + { + "epoch": 0.8645627311032763, + "grad_norm": 1.03125, + "learning_rate": 5.372534738820643e-05, + "loss": 0.7953, + "step": 5484 + }, + { + "epoch": 0.8647203829506693, + "grad_norm": 0.94921875, + "learning_rate": 5.372126626421251e-05, + "loss": 0.9685, + "step": 5485 + }, + { + "epoch": 0.8648780347980624, + "grad_norm": 0.984375, + "learning_rate": 5.3717185238303694e-05, + "loss": 0.9756, + "step": 5486 + }, + { + "epoch": 0.8650356866454554, + "grad_norm": 1.0, + "learning_rate": 5.371310431048869e-05, + "loss": 0.9024, + "step": 5487 + }, + { + "epoch": 0.8651933384928483, + "grad_norm": 0.89453125, + "learning_rate": 5.370902348077613e-05, + "loss": 0.9132, + "step": 5488 + }, + { + "epoch": 0.8653509903402413, + "grad_norm": 0.97265625, + "learning_rate": 5.370494274917464e-05, + "loss": 0.9129, + "step": 5489 + }, + { + "epoch": 0.8655086421876343, + "grad_norm": 0.83203125, + "learning_rate": 5.370086211569287e-05, + "loss": 0.9744, + "step": 5490 + }, + { + "epoch": 0.8656662940350274, + "grad_norm": 0.94921875, + "learning_rate": 5.369678158033945e-05, + "loss": 0.9591, + "step": 5491 + }, + { + "epoch": 0.8658239458824204, + "grad_norm": 0.9375, + "learning_rate": 5.3692701143123094e-05, + "loss": 1.0054, + "step": 5492 + }, + { + "epoch": 0.8659815977298134, + "grad_norm": 1.09375, + "learning_rate": 5.36886208040524e-05, + "loss": 1.0017, + "step": 5493 + }, + { + "epoch": 0.8661392495772064, + "grad_norm": 1.0078125, + "learning_rate": 5.368454056313603e-05, + "loss": 1.1823, + "step": 5494 + }, + { + "epoch": 0.8662969014245995, + "grad_norm": 0.85546875, + "learning_rate": 5.368046042038264e-05, + "loss": 0.9306, + "step": 5495 + }, + { + "epoch": 0.8664545532719924, + "grad_norm": 0.90234375, + "learning_rate": 5.367638037580083e-05, + "loss": 1.0665, + "step": 5496 + }, + { + "epoch": 0.8666122051193854, + "grad_norm": 0.94140625, + "learning_rate": 5.3672300429399305e-05, + "loss": 0.9194, + "step": 5497 + }, + { + "epoch": 0.8667698569667784, + "grad_norm": 0.92578125, + "learning_rate": 5.36682205811867e-05, + "loss": 1.0626, + "step": 5498 + }, + { + "epoch": 0.8669275088141715, + "grad_norm": 1.0703125, + "learning_rate": 5.366414083117165e-05, + "loss": 1.0507, + "step": 5499 + }, + { + "epoch": 0.8670851606615645, + "grad_norm": 0.859375, + "learning_rate": 5.366006117936281e-05, + "loss": 0.9181, + "step": 5500 + }, + { + "epoch": 0.8672428125089575, + "grad_norm": 0.93359375, + "learning_rate": 5.365598162576878e-05, + "loss": 0.9944, + "step": 5501 + }, + { + "epoch": 0.8674004643563505, + "grad_norm": 0.87890625, + "learning_rate": 5.365190217039828e-05, + "loss": 1.0055, + "step": 5502 + }, + { + "epoch": 0.8675581162037436, + "grad_norm": 0.99609375, + "learning_rate": 5.364782281325994e-05, + "loss": 0.9273, + "step": 5503 + }, + { + "epoch": 0.8677157680511365, + "grad_norm": 1.1484375, + "learning_rate": 5.364374355436239e-05, + "loss": 1.1744, + "step": 5504 + }, + { + "epoch": 0.8678734198985295, + "grad_norm": 0.98828125, + "learning_rate": 5.363966439371426e-05, + "loss": 0.9707, + "step": 5505 + }, + { + "epoch": 0.8680310717459225, + "grad_norm": 1.125, + "learning_rate": 5.363558533132418e-05, + "loss": 0.924, + "step": 5506 + }, + { + "epoch": 0.8681887235933156, + "grad_norm": 0.9609375, + "learning_rate": 5.363150636720087e-05, + "loss": 1.1148, + "step": 5507 + }, + { + "epoch": 0.8683463754407086, + "grad_norm": 0.890625, + "learning_rate": 5.362742750135292e-05, + "loss": 0.8912, + "step": 5508 + }, + { + "epoch": 0.8685040272881016, + "grad_norm": 0.90625, + "learning_rate": 5.3623348733789e-05, + "loss": 0.9579, + "step": 5509 + }, + { + "epoch": 0.8686616791354946, + "grad_norm": 0.99609375, + "learning_rate": 5.361927006451775e-05, + "loss": 0.8604, + "step": 5510 + }, + { + "epoch": 0.8688193309828877, + "grad_norm": 0.92578125, + "learning_rate": 5.3615191493547745e-05, + "loss": 0.9789, + "step": 5511 + }, + { + "epoch": 0.8689769828302806, + "grad_norm": 0.953125, + "learning_rate": 5.361111302088774e-05, + "loss": 0.9296, + "step": 5512 + }, + { + "epoch": 0.8691346346776736, + "grad_norm": 5.84375, + "learning_rate": 5.360703464654633e-05, + "loss": 0.99, + "step": 5513 + }, + { + "epoch": 0.8692922865250666, + "grad_norm": 1.0546875, + "learning_rate": 5.360295637053215e-05, + "loss": 1.1978, + "step": 5514 + }, + { + "epoch": 0.8694499383724597, + "grad_norm": 0.90234375, + "learning_rate": 5.359887819285386e-05, + "loss": 0.9493, + "step": 5515 + }, + { + "epoch": 0.8696075902198527, + "grad_norm": 0.98046875, + "learning_rate": 5.359480011352009e-05, + "loss": 1.0329, + "step": 5516 + }, + { + "epoch": 0.8697652420672457, + "grad_norm": 0.9765625, + "learning_rate": 5.35907221325395e-05, + "loss": 1.0045, + "step": 5517 + }, + { + "epoch": 0.8699228939146387, + "grad_norm": 0.9140625, + "learning_rate": 5.358664424992072e-05, + "loss": 0.7608, + "step": 5518 + }, + { + "epoch": 0.8700805457620318, + "grad_norm": 1.0078125, + "learning_rate": 5.358256646567239e-05, + "loss": 1.1914, + "step": 5519 + }, + { + "epoch": 0.8702381976094247, + "grad_norm": 1.0078125, + "learning_rate": 5.357848877980315e-05, + "loss": 1.0989, + "step": 5520 + }, + { + "epoch": 0.8703958494568177, + "grad_norm": 0.98828125, + "learning_rate": 5.357441119232162e-05, + "loss": 1.1018, + "step": 5521 + }, + { + "epoch": 0.8705535013042107, + "grad_norm": 1.0703125, + "learning_rate": 5.3570333703236495e-05, + "loss": 1.17, + "step": 5522 + }, + { + "epoch": 0.8707111531516037, + "grad_norm": 0.98046875, + "learning_rate": 5.356625631255642e-05, + "loss": 0.7659, + "step": 5523 + }, + { + "epoch": 0.8708688049989968, + "grad_norm": 0.88671875, + "learning_rate": 5.356217902028999e-05, + "loss": 0.9319, + "step": 5524 + }, + { + "epoch": 0.8710264568463898, + "grad_norm": 0.953125, + "learning_rate": 5.355810182644587e-05, + "loss": 0.9368, + "step": 5525 + }, + { + "epoch": 0.8711841086937828, + "grad_norm": 1.0546875, + "learning_rate": 5.355402473103268e-05, + "loss": 1.1292, + "step": 5526 + }, + { + "epoch": 0.8713417605411758, + "grad_norm": 1.0078125, + "learning_rate": 5.3549947734059104e-05, + "loss": 1.105, + "step": 5527 + }, + { + "epoch": 0.8714994123885688, + "grad_norm": 0.94921875, + "learning_rate": 5.354587083553375e-05, + "loss": 0.9808, + "step": 5528 + }, + { + "epoch": 0.8716570642359618, + "grad_norm": 0.8984375, + "learning_rate": 5.3541794035465276e-05, + "loss": 0.787, + "step": 5529 + }, + { + "epoch": 0.8718147160833548, + "grad_norm": 1.078125, + "learning_rate": 5.353771733386231e-05, + "loss": 0.7993, + "step": 5530 + }, + { + "epoch": 0.8719723679307478, + "grad_norm": 0.98046875, + "learning_rate": 5.353364073073347e-05, + "loss": 1.0618, + "step": 5531 + }, + { + "epoch": 0.8721300197781409, + "grad_norm": 1.2109375, + "learning_rate": 5.352956422608746e-05, + "loss": 0.7805, + "step": 5532 + }, + { + "epoch": 0.8722876716255339, + "grad_norm": 0.97265625, + "learning_rate": 5.352548781993287e-05, + "loss": 0.8289, + "step": 5533 + }, + { + "epoch": 0.8724453234729269, + "grad_norm": 1.0625, + "learning_rate": 5.3521411512278366e-05, + "loss": 0.9012, + "step": 5534 + }, + { + "epoch": 0.8726029753203199, + "grad_norm": 0.90234375, + "learning_rate": 5.351733530313255e-05, + "loss": 0.7937, + "step": 5535 + }, + { + "epoch": 0.8727606271677129, + "grad_norm": 1.0234375, + "learning_rate": 5.3513259192504076e-05, + "loss": 0.9976, + "step": 5536 + }, + { + "epoch": 0.8729182790151059, + "grad_norm": 1.1484375, + "learning_rate": 5.350918318040161e-05, + "loss": 1.0026, + "step": 5537 + }, + { + "epoch": 0.8730759308624989, + "grad_norm": 0.95703125, + "learning_rate": 5.350510726683378e-05, + "loss": 0.7239, + "step": 5538 + }, + { + "epoch": 0.8732335827098919, + "grad_norm": 0.9375, + "learning_rate": 5.350103145180922e-05, + "loss": 1.1158, + "step": 5539 + }, + { + "epoch": 0.873391234557285, + "grad_norm": 1.25, + "learning_rate": 5.349695573533655e-05, + "loss": 0.9755, + "step": 5540 + }, + { + "epoch": 0.873548886404678, + "grad_norm": 0.99609375, + "learning_rate": 5.3492880117424404e-05, + "loss": 0.9594, + "step": 5541 + }, + { + "epoch": 0.873706538252071, + "grad_norm": 0.9453125, + "learning_rate": 5.348880459808148e-05, + "loss": 0.9597, + "step": 5542 + }, + { + "epoch": 0.873864190099464, + "grad_norm": 1.0, + "learning_rate": 5.348472917731637e-05, + "loss": 0.9553, + "step": 5543 + }, + { + "epoch": 0.874021841946857, + "grad_norm": 0.9921875, + "learning_rate": 5.348065385513772e-05, + "loss": 1.2525, + "step": 5544 + }, + { + "epoch": 0.87417949379425, + "grad_norm": 0.86328125, + "learning_rate": 5.347657863155415e-05, + "loss": 1.0777, + "step": 5545 + }, + { + "epoch": 0.874337145641643, + "grad_norm": 0.9765625, + "learning_rate": 5.347250350657429e-05, + "loss": 1.1205, + "step": 5546 + }, + { + "epoch": 0.874494797489036, + "grad_norm": 0.96875, + "learning_rate": 5.346842848020683e-05, + "loss": 1.1123, + "step": 5547 + }, + { + "epoch": 0.874652449336429, + "grad_norm": 0.91796875, + "learning_rate": 5.34643535524604e-05, + "loss": 0.9374, + "step": 5548 + }, + { + "epoch": 0.8748101011838221, + "grad_norm": 1.0390625, + "learning_rate": 5.346027872334358e-05, + "loss": 1.1903, + "step": 5549 + }, + { + "epoch": 0.8749677530312151, + "grad_norm": 0.93359375, + "learning_rate": 5.3456203992865065e-05, + "loss": 1.1511, + "step": 5550 + }, + { + "epoch": 0.8751254048786081, + "grad_norm": 0.98828125, + "learning_rate": 5.345212936103341e-05, + "loss": 0.8491, + "step": 5551 + }, + { + "epoch": 0.875283056726001, + "grad_norm": 1.1015625, + "learning_rate": 5.344805482785735e-05, + "loss": 0.8454, + "step": 5552 + }, + { + "epoch": 0.8754407085733941, + "grad_norm": 0.796875, + "learning_rate": 5.344398039334548e-05, + "loss": 0.9668, + "step": 5553 + }, + { + "epoch": 0.8755983604207871, + "grad_norm": 1.0078125, + "learning_rate": 5.343990605750643e-05, + "loss": 1.0905, + "step": 5554 + }, + { + "epoch": 0.8757560122681801, + "grad_norm": 0.84375, + "learning_rate": 5.3435831820348833e-05, + "loss": 0.7291, + "step": 5555 + }, + { + "epoch": 0.8759136641155731, + "grad_norm": 0.9765625, + "learning_rate": 5.343175768188133e-05, + "loss": 0.855, + "step": 5556 + }, + { + "epoch": 0.8760713159629662, + "grad_norm": 0.85546875, + "learning_rate": 5.342768364211257e-05, + "loss": 1.0429, + "step": 5557 + }, + { + "epoch": 0.8762289678103592, + "grad_norm": 0.89453125, + "learning_rate": 5.342360970105116e-05, + "loss": 0.9714, + "step": 5558 + }, + { + "epoch": 0.8763866196577522, + "grad_norm": 1.03125, + "learning_rate": 5.341953585870574e-05, + "loss": 1.0793, + "step": 5559 + }, + { + "epoch": 0.8765442715051451, + "grad_norm": 0.94140625, + "learning_rate": 5.341546211508492e-05, + "loss": 1.0013, + "step": 5560 + }, + { + "epoch": 0.8767019233525382, + "grad_norm": 0.96484375, + "learning_rate": 5.3411388470197397e-05, + "loss": 1.0199, + "step": 5561 + }, + { + "epoch": 0.8768595751999312, + "grad_norm": 0.90625, + "learning_rate": 5.340731492405179e-05, + "loss": 0.9532, + "step": 5562 + }, + { + "epoch": 0.8770172270473242, + "grad_norm": 1.046875, + "learning_rate": 5.340324147665671e-05, + "loss": 1.0215, + "step": 5563 + }, + { + "epoch": 0.8771748788947172, + "grad_norm": 0.96484375, + "learning_rate": 5.339916812802079e-05, + "loss": 0.9403, + "step": 5564 + }, + { + "epoch": 0.8773325307421103, + "grad_norm": 0.98828125, + "learning_rate": 5.339509487815268e-05, + "loss": 1.3008, + "step": 5565 + }, + { + "epoch": 0.8774901825895033, + "grad_norm": 0.85546875, + "learning_rate": 5.3391021727060944e-05, + "loss": 0.8369, + "step": 5566 + }, + { + "epoch": 0.8776478344368963, + "grad_norm": 1.125, + "learning_rate": 5.3386948674754333e-05, + "loss": 1.1578, + "step": 5567 + }, + { + "epoch": 0.8778054862842892, + "grad_norm": 0.98046875, + "learning_rate": 5.338287572124141e-05, + "loss": 0.9204, + "step": 5568 + }, + { + "epoch": 0.8779631381316823, + "grad_norm": 0.91015625, + "learning_rate": 5.337880286653082e-05, + "loss": 0.8878, + "step": 5569 + }, + { + "epoch": 0.8781207899790753, + "grad_norm": 0.8359375, + "learning_rate": 5.337473011063119e-05, + "loss": 0.7978, + "step": 5570 + }, + { + "epoch": 0.8782784418264683, + "grad_norm": 0.9921875, + "learning_rate": 5.337065745355112e-05, + "loss": 0.867, + "step": 5571 + }, + { + "epoch": 0.8784360936738613, + "grad_norm": 0.890625, + "learning_rate": 5.336658489529931e-05, + "loss": 1.0389, + "step": 5572 + }, + { + "epoch": 0.8785937455212544, + "grad_norm": 0.94140625, + "learning_rate": 5.336251243588436e-05, + "loss": 0.905, + "step": 5573 + }, + { + "epoch": 0.8787513973686474, + "grad_norm": 1.015625, + "learning_rate": 5.33584400753149e-05, + "loss": 0.9505, + "step": 5574 + }, + { + "epoch": 0.8789090492160404, + "grad_norm": 0.94921875, + "learning_rate": 5.335436781359956e-05, + "loss": 0.9754, + "step": 5575 + }, + { + "epoch": 0.8790667010634333, + "grad_norm": 0.9375, + "learning_rate": 5.335029565074694e-05, + "loss": 0.978, + "step": 5576 + }, + { + "epoch": 0.8792243529108263, + "grad_norm": 0.98046875, + "learning_rate": 5.3346223586765734e-05, + "loss": 1.0564, + "step": 5577 + }, + { + "epoch": 0.8793820047582194, + "grad_norm": 0.99609375, + "learning_rate": 5.3342151621664536e-05, + "loss": 1.0063, + "step": 5578 + }, + { + "epoch": 0.8795396566056124, + "grad_norm": 1.1015625, + "learning_rate": 5.3338079755451975e-05, + "loss": 0.9175, + "step": 5579 + }, + { + "epoch": 0.8796973084530054, + "grad_norm": 0.91796875, + "learning_rate": 5.3334007988136704e-05, + "loss": 0.8044, + "step": 5580 + }, + { + "epoch": 0.8798549603003984, + "grad_norm": 1.046875, + "learning_rate": 5.3329936319727295e-05, + "loss": 1.1958, + "step": 5581 + }, + { + "epoch": 0.8800126121477915, + "grad_norm": 0.8671875, + "learning_rate": 5.332586475023245e-05, + "loss": 0.6708, + "step": 5582 + }, + { + "epoch": 0.8801702639951845, + "grad_norm": 0.90234375, + "learning_rate": 5.332179327966076e-05, + "loss": 0.8885, + "step": 5583 + }, + { + "epoch": 0.8803279158425774, + "grad_norm": 0.95703125, + "learning_rate": 5.331772190802087e-05, + "loss": 0.93, + "step": 5584 + }, + { + "epoch": 0.8804855676899704, + "grad_norm": 0.9375, + "learning_rate": 5.331365063532141e-05, + "loss": 0.897, + "step": 5585 + }, + { + "epoch": 0.8806432195373635, + "grad_norm": 0.96484375, + "learning_rate": 5.3309579461570945e-05, + "loss": 0.952, + "step": 5586 + }, + { + "epoch": 0.8808008713847565, + "grad_norm": 1.1328125, + "learning_rate": 5.3305508386778203e-05, + "loss": 1.0395, + "step": 5587 + }, + { + "epoch": 0.8809585232321495, + "grad_norm": 0.94921875, + "learning_rate": 5.330143741095177e-05, + "loss": 0.9916, + "step": 5588 + }, + { + "epoch": 0.8811161750795425, + "grad_norm": 0.99609375, + "learning_rate": 5.3297366534100266e-05, + "loss": 1.2572, + "step": 5589 + }, + { + "epoch": 0.8812738269269356, + "grad_norm": 1.2578125, + "learning_rate": 5.329329575623232e-05, + "loss": 0.9552, + "step": 5590 + }, + { + "epoch": 0.8814314787743286, + "grad_norm": 0.99609375, + "learning_rate": 5.328922507735653e-05, + "loss": 1.0277, + "step": 5591 + }, + { + "epoch": 0.8815891306217215, + "grad_norm": 0.8125, + "learning_rate": 5.3285154497481596e-05, + "loss": 0.7809, + "step": 5592 + }, + { + "epoch": 0.8817467824691145, + "grad_norm": 1.046875, + "learning_rate": 5.328108401661611e-05, + "loss": 1.1159, + "step": 5593 + }, + { + "epoch": 0.8819044343165076, + "grad_norm": 0.921875, + "learning_rate": 5.3277013634768694e-05, + "loss": 0.795, + "step": 5594 + }, + { + "epoch": 0.8820620861639006, + "grad_norm": 1.0, + "learning_rate": 5.327294335194798e-05, + "loss": 1.1674, + "step": 5595 + }, + { + "epoch": 0.8822197380112936, + "grad_norm": 0.9375, + "learning_rate": 5.326887316816258e-05, + "loss": 1.0644, + "step": 5596 + }, + { + "epoch": 0.8823773898586866, + "grad_norm": 1.1171875, + "learning_rate": 5.32648030834211e-05, + "loss": 0.7833, + "step": 5597 + }, + { + "epoch": 0.8825350417060797, + "grad_norm": 0.99609375, + "learning_rate": 5.3260733097732254e-05, + "loss": 1.1752, + "step": 5598 + }, + { + "epoch": 0.8826926935534727, + "grad_norm": 0.98828125, + "learning_rate": 5.3256663211104585e-05, + "loss": 0.9997, + "step": 5599 + }, + { + "epoch": 0.8828503454008656, + "grad_norm": 1.0390625, + "learning_rate": 5.325259342354676e-05, + "loss": 1.0539, + "step": 5600 + }, + { + "epoch": 0.8830079972482586, + "grad_norm": 0.88671875, + "learning_rate": 5.324852373506739e-05, + "loss": 0.8949, + "step": 5601 + }, + { + "epoch": 0.8831656490956517, + "grad_norm": 1.0390625, + "learning_rate": 5.3244454145675114e-05, + "loss": 0.9282, + "step": 5602 + }, + { + "epoch": 0.8833233009430447, + "grad_norm": 1.0, + "learning_rate": 5.3240384655378525e-05, + "loss": 1.0816, + "step": 5603 + }, + { + "epoch": 0.8834809527904377, + "grad_norm": 1.21875, + "learning_rate": 5.323631526418629e-05, + "loss": 1.367, + "step": 5604 + }, + { + "epoch": 0.8836386046378307, + "grad_norm": 1.234375, + "learning_rate": 5.323224597210699e-05, + "loss": 0.8751, + "step": 5605 + }, + { + "epoch": 0.8837962564852238, + "grad_norm": 1.0390625, + "learning_rate": 5.322817677914924e-05, + "loss": 1.0953, + "step": 5606 + }, + { + "epoch": 0.8839539083326168, + "grad_norm": 0.953125, + "learning_rate": 5.322410768532174e-05, + "loss": 1.0726, + "step": 5607 + }, + { + "epoch": 0.8841115601800097, + "grad_norm": 0.921875, + "learning_rate": 5.322003869063307e-05, + "loss": 0.8766, + "step": 5608 + }, + { + "epoch": 0.8842692120274027, + "grad_norm": 0.96484375, + "learning_rate": 5.3215969795091845e-05, + "loss": 1.0606, + "step": 5609 + }, + { + "epoch": 0.8844268638747957, + "grad_norm": 0.89453125, + "learning_rate": 5.321190099870671e-05, + "loss": 1.0368, + "step": 5610 + }, + { + "epoch": 0.8845845157221888, + "grad_norm": 0.87890625, + "learning_rate": 5.320783230148623e-05, + "loss": 0.9647, + "step": 5611 + }, + { + "epoch": 0.8847421675695818, + "grad_norm": 0.89453125, + "learning_rate": 5.320376370343911e-05, + "loss": 1.018, + "step": 5612 + }, + { + "epoch": 0.8848998194169748, + "grad_norm": 1.3359375, + "learning_rate": 5.319969520457395e-05, + "loss": 0.8532, + "step": 5613 + }, + { + "epoch": 0.8850574712643678, + "grad_norm": 1.1875, + "learning_rate": 5.3195626804899354e-05, + "loss": 0.9745, + "step": 5614 + }, + { + "epoch": 0.8852151231117609, + "grad_norm": 1.140625, + "learning_rate": 5.319155850442394e-05, + "loss": 1.1008, + "step": 5615 + }, + { + "epoch": 0.8853727749591538, + "grad_norm": 0.9765625, + "learning_rate": 5.318749030315633e-05, + "loss": 1.0226, + "step": 5616 + }, + { + "epoch": 0.8855304268065468, + "grad_norm": 0.9140625, + "learning_rate": 5.318342220110518e-05, + "loss": 0.8461, + "step": 5617 + }, + { + "epoch": 0.8856880786539398, + "grad_norm": 0.9609375, + "learning_rate": 5.3179354198279085e-05, + "loss": 0.9375, + "step": 5618 + }, + { + "epoch": 0.8858457305013329, + "grad_norm": 2.359375, + "learning_rate": 5.3175286294686686e-05, + "loss": 0.7864, + "step": 5619 + }, + { + "epoch": 0.8860033823487259, + "grad_norm": 0.96484375, + "learning_rate": 5.317121849033659e-05, + "loss": 1.008, + "step": 5620 + }, + { + "epoch": 0.8861610341961189, + "grad_norm": 1.09375, + "learning_rate": 5.316715078523737e-05, + "loss": 1.1926, + "step": 5621 + }, + { + "epoch": 0.8863186860435119, + "grad_norm": 0.97265625, + "learning_rate": 5.3163083179397756e-05, + "loss": 0.8189, + "step": 5622 + }, + { + "epoch": 0.886476337890905, + "grad_norm": 1.0, + "learning_rate": 5.3159015672826295e-05, + "loss": 0.9734, + "step": 5623 + }, + { + "epoch": 0.8866339897382979, + "grad_norm": 0.97265625, + "learning_rate": 5.315494826553162e-05, + "loss": 1.1089, + "step": 5624 + }, + { + "epoch": 0.8867916415856909, + "grad_norm": 1.03125, + "learning_rate": 5.315088095752236e-05, + "loss": 0.9951, + "step": 5625 + }, + { + "epoch": 0.8869492934330839, + "grad_norm": 1.3125, + "learning_rate": 5.3146813748807144e-05, + "loss": 0.9189, + "step": 5626 + }, + { + "epoch": 0.887106945280477, + "grad_norm": 1.5078125, + "learning_rate": 5.314274663939452e-05, + "loss": 0.7972, + "step": 5627 + }, + { + "epoch": 0.88726459712787, + "grad_norm": 0.9453125, + "learning_rate": 5.313867962929321e-05, + "loss": 0.9639, + "step": 5628 + }, + { + "epoch": 0.887422248975263, + "grad_norm": 1.0859375, + "learning_rate": 5.313461271851179e-05, + "loss": 1.0188, + "step": 5629 + }, + { + "epoch": 0.887579900822656, + "grad_norm": 1.2890625, + "learning_rate": 5.3130545907058884e-05, + "loss": 0.9206, + "step": 5630 + }, + { + "epoch": 0.8877375526700491, + "grad_norm": 0.92578125, + "learning_rate": 5.3126479194943104e-05, + "loss": 0.9683, + "step": 5631 + }, + { + "epoch": 0.8878952045174421, + "grad_norm": 1.140625, + "learning_rate": 5.3122412582173034e-05, + "loss": 0.9772, + "step": 5632 + }, + { + "epoch": 0.888052856364835, + "grad_norm": 0.92578125, + "learning_rate": 5.3118346068757366e-05, + "loss": 0.828, + "step": 5633 + }, + { + "epoch": 0.888210508212228, + "grad_norm": 0.96484375, + "learning_rate": 5.311427965470468e-05, + "loss": 0.9667, + "step": 5634 + }, + { + "epoch": 0.888368160059621, + "grad_norm": 1.078125, + "learning_rate": 5.311021334002362e-05, + "loss": 0.9997, + "step": 5635 + }, + { + "epoch": 0.8885258119070141, + "grad_norm": 1.109375, + "learning_rate": 5.310614712472276e-05, + "loss": 1.0826, + "step": 5636 + }, + { + "epoch": 0.8886834637544071, + "grad_norm": 1.0234375, + "learning_rate": 5.310208100881071e-05, + "loss": 1.0051, + "step": 5637 + }, + { + "epoch": 0.8888411156018001, + "grad_norm": 1.1640625, + "learning_rate": 5.309801499229616e-05, + "loss": 1.0067, + "step": 5638 + }, + { + "epoch": 0.8889987674491932, + "grad_norm": 0.94140625, + "learning_rate": 5.3093949075187676e-05, + "loss": 0.7348, + "step": 5639 + }, + { + "epoch": 0.8891564192965862, + "grad_norm": 0.99609375, + "learning_rate": 5.30898832574939e-05, + "loss": 1.0236, + "step": 5640 + }, + { + "epoch": 0.8893140711439791, + "grad_norm": 0.97265625, + "learning_rate": 5.3085817539223437e-05, + "loss": 1.0125, + "step": 5641 + }, + { + "epoch": 0.8894717229913721, + "grad_norm": 0.8828125, + "learning_rate": 5.308175192038489e-05, + "loss": 0.9826, + "step": 5642 + }, + { + "epoch": 0.8896293748387651, + "grad_norm": 0.953125, + "learning_rate": 5.307768640098689e-05, + "loss": 0.9488, + "step": 5643 + }, + { + "epoch": 0.8897870266861582, + "grad_norm": 0.96875, + "learning_rate": 5.307362098103805e-05, + "loss": 1.1617, + "step": 5644 + }, + { + "epoch": 0.8899446785335512, + "grad_norm": 1.0, + "learning_rate": 5.306955566054696e-05, + "loss": 1.0921, + "step": 5645 + }, + { + "epoch": 0.8901023303809442, + "grad_norm": 0.9296875, + "learning_rate": 5.3065490439522294e-05, + "loss": 1.0238, + "step": 5646 + }, + { + "epoch": 0.8902599822283372, + "grad_norm": 1.2109375, + "learning_rate": 5.3061425317972646e-05, + "loss": 1.0684, + "step": 5647 + }, + { + "epoch": 0.8904176340757303, + "grad_norm": 0.9375, + "learning_rate": 5.3057360295906625e-05, + "loss": 1.0786, + "step": 5648 + }, + { + "epoch": 0.8905752859231232, + "grad_norm": 1.015625, + "learning_rate": 5.3053295373332836e-05, + "loss": 1.2517, + "step": 5649 + }, + { + "epoch": 0.8907329377705162, + "grad_norm": 1.6953125, + "learning_rate": 5.30492305502599e-05, + "loss": 0.8766, + "step": 5650 + }, + { + "epoch": 0.8908905896179092, + "grad_norm": 0.83203125, + "learning_rate": 5.3045165826696454e-05, + "loss": 0.6458, + "step": 5651 + }, + { + "epoch": 0.8910482414653023, + "grad_norm": 1.0625, + "learning_rate": 5.3041101202651046e-05, + "loss": 1.0941, + "step": 5652 + }, + { + "epoch": 0.8912058933126953, + "grad_norm": 0.984375, + "learning_rate": 5.303703667813238e-05, + "loss": 1.0064, + "step": 5653 + }, + { + "epoch": 0.8913635451600883, + "grad_norm": 1.0859375, + "learning_rate": 5.3032972253149026e-05, + "loss": 1.0211, + "step": 5654 + }, + { + "epoch": 0.8915211970074813, + "grad_norm": 0.99609375, + "learning_rate": 5.3028907927709605e-05, + "loss": 1.1466, + "step": 5655 + }, + { + "epoch": 0.8916788488548744, + "grad_norm": 1.0234375, + "learning_rate": 5.3024843701822733e-05, + "loss": 0.9122, + "step": 5656 + }, + { + "epoch": 0.8918365007022673, + "grad_norm": 1.3828125, + "learning_rate": 5.302077957549698e-05, + "loss": 0.8689, + "step": 5657 + }, + { + "epoch": 0.8919941525496603, + "grad_norm": 1.125, + "learning_rate": 5.3016715548741036e-05, + "loss": 1.0993, + "step": 5658 + }, + { + "epoch": 0.8921518043970533, + "grad_norm": 0.90625, + "learning_rate": 5.301265162156348e-05, + "loss": 0.9303, + "step": 5659 + }, + { + "epoch": 0.8923094562444464, + "grad_norm": 0.93359375, + "learning_rate": 5.300858779397293e-05, + "loss": 0.9972, + "step": 5660 + }, + { + "epoch": 0.8924671080918394, + "grad_norm": 1.125, + "learning_rate": 5.300452406597798e-05, + "loss": 1.0082, + "step": 5661 + }, + { + "epoch": 0.8926247599392324, + "grad_norm": 0.8828125, + "learning_rate": 5.300046043758722e-05, + "loss": 0.8564, + "step": 5662 + }, + { + "epoch": 0.8927824117866254, + "grad_norm": 0.921875, + "learning_rate": 5.299639690880933e-05, + "loss": 1.0099, + "step": 5663 + }, + { + "epoch": 0.8929400636340185, + "grad_norm": 0.9921875, + "learning_rate": 5.299233347965291e-05, + "loss": 1.0481, + "step": 5664 + }, + { + "epoch": 0.8930977154814114, + "grad_norm": 1.0234375, + "learning_rate": 5.298827015012653e-05, + "loss": 0.9609, + "step": 5665 + }, + { + "epoch": 0.8932553673288044, + "grad_norm": 0.984375, + "learning_rate": 5.2984206920238844e-05, + "loss": 0.9816, + "step": 5666 + }, + { + "epoch": 0.8934130191761974, + "grad_norm": 1.0625, + "learning_rate": 5.298014378999838e-05, + "loss": 1.0295, + "step": 5667 + }, + { + "epoch": 0.8935706710235904, + "grad_norm": 0.87890625, + "learning_rate": 5.297608075941387e-05, + "loss": 0.8711, + "step": 5668 + }, + { + "epoch": 0.8937283228709835, + "grad_norm": 0.85546875, + "learning_rate": 5.297201782849388e-05, + "loss": 1.0331, + "step": 5669 + }, + { + "epoch": 0.8938859747183765, + "grad_norm": 0.98046875, + "learning_rate": 5.2967954997246996e-05, + "loss": 1.0351, + "step": 5670 + }, + { + "epoch": 0.8940436265657695, + "grad_norm": 0.94140625, + "learning_rate": 5.296389226568184e-05, + "loss": 1.0203, + "step": 5671 + }, + { + "epoch": 0.8942012784131625, + "grad_norm": 0.9296875, + "learning_rate": 5.295982963380699e-05, + "loss": 1.1639, + "step": 5672 + }, + { + "epoch": 0.8943589302605555, + "grad_norm": 0.96875, + "learning_rate": 5.2955767101631135e-05, + "loss": 0.9663, + "step": 5673 + }, + { + "epoch": 0.8945165821079485, + "grad_norm": 0.859375, + "learning_rate": 5.295170466916284e-05, + "loss": 0.8075, + "step": 5674 + }, + { + "epoch": 0.8946742339553415, + "grad_norm": 0.96875, + "learning_rate": 5.294764233641072e-05, + "loss": 0.9778, + "step": 5675 + }, + { + "epoch": 0.8948318858027345, + "grad_norm": 0.875, + "learning_rate": 5.2943580103383384e-05, + "loss": 1.019, + "step": 5676 + }, + { + "epoch": 0.8949895376501276, + "grad_norm": 0.87890625, + "learning_rate": 5.29395179700894e-05, + "loss": 0.873, + "step": 5677 + }, + { + "epoch": 0.8951471894975206, + "grad_norm": 1.5703125, + "learning_rate": 5.293545593653746e-05, + "loss": 0.7727, + "step": 5678 + }, + { + "epoch": 0.8953048413449136, + "grad_norm": 0.97265625, + "learning_rate": 5.293139400273612e-05, + "loss": 0.9591, + "step": 5679 + }, + { + "epoch": 0.8954624931923066, + "grad_norm": 1.0078125, + "learning_rate": 5.292733216869401e-05, + "loss": 0.8596, + "step": 5680 + }, + { + "epoch": 0.8956201450396996, + "grad_norm": 0.86328125, + "learning_rate": 5.292327043441971e-05, + "loss": 1.0043, + "step": 5681 + }, + { + "epoch": 0.8957777968870926, + "grad_norm": 0.8984375, + "learning_rate": 5.291920879992184e-05, + "loss": 0.9112, + "step": 5682 + }, + { + "epoch": 0.8959354487344856, + "grad_norm": 1.171875, + "learning_rate": 5.291514726520903e-05, + "loss": 0.9408, + "step": 5683 + }, + { + "epoch": 0.8960931005818786, + "grad_norm": 1.0078125, + "learning_rate": 5.2911085830289885e-05, + "loss": 1.0698, + "step": 5684 + }, + { + "epoch": 0.8962507524292717, + "grad_norm": 0.9453125, + "learning_rate": 5.2907024495172994e-05, + "loss": 1.0327, + "step": 5685 + }, + { + "epoch": 0.8964084042766647, + "grad_norm": 0.91796875, + "learning_rate": 5.290296325986699e-05, + "loss": 0.9329, + "step": 5686 + }, + { + "epoch": 0.8965660561240577, + "grad_norm": 0.87109375, + "learning_rate": 5.289890212438045e-05, + "loss": 0.8695, + "step": 5687 + }, + { + "epoch": 0.8967237079714507, + "grad_norm": 0.9609375, + "learning_rate": 5.2894841088722005e-05, + "loss": 0.7918, + "step": 5688 + }, + { + "epoch": 0.8968813598188436, + "grad_norm": 1.0078125, + "learning_rate": 5.289078015290023e-05, + "loss": 0.8638, + "step": 5689 + }, + { + "epoch": 0.8970390116662367, + "grad_norm": 0.98828125, + "learning_rate": 5.288671931692377e-05, + "loss": 0.9629, + "step": 5690 + }, + { + "epoch": 0.8971966635136297, + "grad_norm": 0.9375, + "learning_rate": 5.2882658580801215e-05, + "loss": 0.9847, + "step": 5691 + }, + { + "epoch": 0.8973543153610227, + "grad_norm": 0.87890625, + "learning_rate": 5.287859794454113e-05, + "loss": 0.8517, + "step": 5692 + }, + { + "epoch": 0.8975119672084158, + "grad_norm": 1.0546875, + "learning_rate": 5.28745374081522e-05, + "loss": 1.1116, + "step": 5693 + }, + { + "epoch": 0.8976696190558088, + "grad_norm": 1.0390625, + "learning_rate": 5.2870476971643e-05, + "loss": 0.9708, + "step": 5694 + }, + { + "epoch": 0.8978272709032018, + "grad_norm": 0.96875, + "learning_rate": 5.286641663502214e-05, + "loss": 1.0561, + "step": 5695 + }, + { + "epoch": 0.8979849227505948, + "grad_norm": 1.015625, + "learning_rate": 5.28623563982982e-05, + "loss": 1.0881, + "step": 5696 + }, + { + "epoch": 0.8981425745979877, + "grad_norm": 0.92578125, + "learning_rate": 5.2858296261479766e-05, + "loss": 1.3231, + "step": 5697 + }, + { + "epoch": 0.8983002264453808, + "grad_norm": 1.09375, + "learning_rate": 5.2854236224575524e-05, + "loss": 1.0912, + "step": 5698 + }, + { + "epoch": 0.8984578782927738, + "grad_norm": 0.98828125, + "learning_rate": 5.285017628759403e-05, + "loss": 1.1144, + "step": 5699 + }, + { + "epoch": 0.8986155301401668, + "grad_norm": 0.9453125, + "learning_rate": 5.284611645054389e-05, + "loss": 0.8805, + "step": 5700 + }, + { + "epoch": 0.8987731819875598, + "grad_norm": 1.328125, + "learning_rate": 5.2842056713433706e-05, + "loss": 0.9397, + "step": 5701 + }, + { + "epoch": 0.8989308338349529, + "grad_norm": 1.5, + "learning_rate": 5.283799707627206e-05, + "loss": 1.2732, + "step": 5702 + }, + { + "epoch": 0.8990884856823459, + "grad_norm": 0.9296875, + "learning_rate": 5.2833937539067614e-05, + "loss": 0.9459, + "step": 5703 + }, + { + "epoch": 0.8992461375297389, + "grad_norm": 0.91015625, + "learning_rate": 5.2829878101828946e-05, + "loss": 0.9582, + "step": 5704 + }, + { + "epoch": 0.8994037893771318, + "grad_norm": 0.92578125, + "learning_rate": 5.282581876456465e-05, + "loss": 1.0042, + "step": 5705 + }, + { + "epoch": 0.8995614412245249, + "grad_norm": 0.9140625, + "learning_rate": 5.282175952728334e-05, + "loss": 0.7642, + "step": 5706 + }, + { + "epoch": 0.8997190930719179, + "grad_norm": 0.953125, + "learning_rate": 5.281770038999356e-05, + "loss": 0.9775, + "step": 5707 + }, + { + "epoch": 0.8998767449193109, + "grad_norm": 0.94140625, + "learning_rate": 5.2813641352704036e-05, + "loss": 1.102, + "step": 5708 + }, + { + "epoch": 0.9000343967667039, + "grad_norm": 0.91015625, + "learning_rate": 5.2809582415423285e-05, + "loss": 0.8249, + "step": 5709 + }, + { + "epoch": 0.900192048614097, + "grad_norm": 0.96484375, + "learning_rate": 5.280552357815992e-05, + "loss": 1.1963, + "step": 5710 + }, + { + "epoch": 0.90034970046149, + "grad_norm": 2.328125, + "learning_rate": 5.280146484092257e-05, + "loss": 0.8222, + "step": 5711 + }, + { + "epoch": 0.900507352308883, + "grad_norm": 0.98828125, + "learning_rate": 5.279740620371976e-05, + "loss": 1.0591, + "step": 5712 + }, + { + "epoch": 0.9006650041562759, + "grad_norm": 0.98828125, + "learning_rate": 5.279334766656019e-05, + "loss": 1.0861, + "step": 5713 + }, + { + "epoch": 0.900822656003669, + "grad_norm": 0.89453125, + "learning_rate": 5.278928922945243e-05, + "loss": 0.9344, + "step": 5714 + }, + { + "epoch": 0.900980307851062, + "grad_norm": 0.90625, + "learning_rate": 5.278523089240506e-05, + "loss": 0.8548, + "step": 5715 + }, + { + "epoch": 0.901137959698455, + "grad_norm": 0.953125, + "learning_rate": 5.27811726554267e-05, + "loss": 1.1744, + "step": 5716 + }, + { + "epoch": 0.901295611545848, + "grad_norm": 1.0234375, + "learning_rate": 5.2777114518525915e-05, + "loss": 1.1203, + "step": 5717 + }, + { + "epoch": 0.901453263393241, + "grad_norm": 0.984375, + "learning_rate": 5.277305648171138e-05, + "loss": 1.0783, + "step": 5718 + }, + { + "epoch": 0.9016109152406341, + "grad_norm": 0.984375, + "learning_rate": 5.276899854499164e-05, + "loss": 0.8651, + "step": 5719 + }, + { + "epoch": 0.9017685670880271, + "grad_norm": 0.89453125, + "learning_rate": 5.276494070837531e-05, + "loss": 0.8279, + "step": 5720 + }, + { + "epoch": 0.90192621893542, + "grad_norm": 0.96484375, + "learning_rate": 5.276088297187098e-05, + "loss": 1.0883, + "step": 5721 + }, + { + "epoch": 0.902083870782813, + "grad_norm": 0.8203125, + "learning_rate": 5.275682533548724e-05, + "loss": 0.7935, + "step": 5722 + }, + { + "epoch": 0.9022415226302061, + "grad_norm": 1.046875, + "learning_rate": 5.275276779923273e-05, + "loss": 1.085, + "step": 5723 + }, + { + "epoch": 0.9023991744775991, + "grad_norm": 0.94921875, + "learning_rate": 5.274871036311604e-05, + "loss": 0.9311, + "step": 5724 + }, + { + "epoch": 0.9025568263249921, + "grad_norm": 0.91015625, + "learning_rate": 5.2744653027145754e-05, + "loss": 0.9166, + "step": 5725 + }, + { + "epoch": 0.9027144781723851, + "grad_norm": 1.0234375, + "learning_rate": 5.274059579133047e-05, + "loss": 1.0862, + "step": 5726 + }, + { + "epoch": 0.9028721300197782, + "grad_norm": 0.88671875, + "learning_rate": 5.27365386556788e-05, + "loss": 0.9073, + "step": 5727 + }, + { + "epoch": 0.9030297818671712, + "grad_norm": 0.921875, + "learning_rate": 5.2732481620199325e-05, + "loss": 0.9277, + "step": 5728 + }, + { + "epoch": 0.9031874337145641, + "grad_norm": 0.94140625, + "learning_rate": 5.272842468490067e-05, + "loss": 0.9981, + "step": 5729 + }, + { + "epoch": 0.9033450855619571, + "grad_norm": 0.9140625, + "learning_rate": 5.272436784979136e-05, + "loss": 0.8866, + "step": 5730 + }, + { + "epoch": 0.9035027374093502, + "grad_norm": 0.9296875, + "learning_rate": 5.272031111488009e-05, + "loss": 0.9711, + "step": 5731 + }, + { + "epoch": 0.9036603892567432, + "grad_norm": 0.9765625, + "learning_rate": 5.271625448017543e-05, + "loss": 0.9463, + "step": 5732 + }, + { + "epoch": 0.9038180411041362, + "grad_norm": 0.9296875, + "learning_rate": 5.271219794568596e-05, + "loss": 0.7304, + "step": 5733 + }, + { + "epoch": 0.9039756929515292, + "grad_norm": 1.0703125, + "learning_rate": 5.2708141511420275e-05, + "loss": 1.0453, + "step": 5734 + }, + { + "epoch": 0.9041333447989223, + "grad_norm": 0.921875, + "learning_rate": 5.2704085177386996e-05, + "loss": 0.9152, + "step": 5735 + }, + { + "epoch": 0.9042909966463153, + "grad_norm": 0.9765625, + "learning_rate": 5.270002894359469e-05, + "loss": 1.0095, + "step": 5736 + }, + { + "epoch": 0.9044486484937082, + "grad_norm": 0.98828125, + "learning_rate": 5.2695972810051945e-05, + "loss": 1.0509, + "step": 5737 + }, + { + "epoch": 0.9046063003411012, + "grad_norm": 0.95703125, + "learning_rate": 5.269191677676742e-05, + "loss": 0.9205, + "step": 5738 + }, + { + "epoch": 0.9047639521884943, + "grad_norm": 0.92578125, + "learning_rate": 5.2687860843749656e-05, + "loss": 1.138, + "step": 5739 + }, + { + "epoch": 0.9049216040358873, + "grad_norm": 1.03125, + "learning_rate": 5.2683805011007294e-05, + "loss": 1.2047, + "step": 5740 + }, + { + "epoch": 0.9050792558832803, + "grad_norm": 1.0625, + "learning_rate": 5.267974927854888e-05, + "loss": 1.0533, + "step": 5741 + }, + { + "epoch": 0.9052369077306733, + "grad_norm": 0.9921875, + "learning_rate": 5.267569364638301e-05, + "loss": 0.8819, + "step": 5742 + }, + { + "epoch": 0.9053945595780664, + "grad_norm": 1.046875, + "learning_rate": 5.2671638114518315e-05, + "loss": 0.857, + "step": 5743 + }, + { + "epoch": 0.9055522114254594, + "grad_norm": 0.953125, + "learning_rate": 5.266758268296341e-05, + "loss": 0.946, + "step": 5744 + }, + { + "epoch": 0.9057098632728523, + "grad_norm": 0.91015625, + "learning_rate": 5.2663527351726835e-05, + "loss": 0.8963, + "step": 5745 + }, + { + "epoch": 0.9058675151202453, + "grad_norm": 1.0546875, + "learning_rate": 5.265947212081722e-05, + "loss": 0.9654, + "step": 5746 + }, + { + "epoch": 0.9060251669676384, + "grad_norm": 0.953125, + "learning_rate": 5.26554169902431e-05, + "loss": 0.8807, + "step": 5747 + }, + { + "epoch": 0.9061828188150314, + "grad_norm": 1.0390625, + "learning_rate": 5.2651361960013165e-05, + "loss": 0.9528, + "step": 5748 + }, + { + "epoch": 0.9063404706624244, + "grad_norm": 0.91796875, + "learning_rate": 5.264730703013596e-05, + "loss": 0.9488, + "step": 5749 + }, + { + "epoch": 0.9064981225098174, + "grad_norm": 1.0390625, + "learning_rate": 5.2643252200620086e-05, + "loss": 1.0399, + "step": 5750 + }, + { + "epoch": 0.9066557743572105, + "grad_norm": 0.98828125, + "learning_rate": 5.2639197471474125e-05, + "loss": 1.0373, + "step": 5751 + }, + { + "epoch": 0.9068134262046035, + "grad_norm": 0.96484375, + "learning_rate": 5.2635142842706644e-05, + "loss": 0.9129, + "step": 5752 + }, + { + "epoch": 0.9069710780519964, + "grad_norm": 0.84765625, + "learning_rate": 5.26310883143263e-05, + "loss": 0.7311, + "step": 5753 + }, + { + "epoch": 0.9071287298993894, + "grad_norm": 0.8359375, + "learning_rate": 5.262703388634167e-05, + "loss": 0.7276, + "step": 5754 + }, + { + "epoch": 0.9072863817467824, + "grad_norm": 0.82421875, + "learning_rate": 5.262297955876132e-05, + "loss": 0.9429, + "step": 5755 + }, + { + "epoch": 0.9074440335941755, + "grad_norm": 1.015625, + "learning_rate": 5.261892533159387e-05, + "loss": 1.1244, + "step": 5756 + }, + { + "epoch": 0.9076016854415685, + "grad_norm": 0.96484375, + "learning_rate": 5.261487120484785e-05, + "loss": 1.0323, + "step": 5757 + }, + { + "epoch": 0.9077593372889615, + "grad_norm": 1.0625, + "learning_rate": 5.2610817178531954e-05, + "loss": 1.089, + "step": 5758 + }, + { + "epoch": 0.9079169891363545, + "grad_norm": 0.90234375, + "learning_rate": 5.2606763252654713e-05, + "loss": 1.1197, + "step": 5759 + }, + { + "epoch": 0.9080746409837476, + "grad_norm": 0.9296875, + "learning_rate": 5.2602709427224725e-05, + "loss": 1.0397, + "step": 5760 + }, + { + "epoch": 0.9082322928311405, + "grad_norm": 0.9765625, + "learning_rate": 5.25986557022506e-05, + "loss": 0.8965, + "step": 5761 + }, + { + "epoch": 0.9083899446785335, + "grad_norm": 1.015625, + "learning_rate": 5.259460207774085e-05, + "loss": 0.9575, + "step": 5762 + }, + { + "epoch": 0.9085475965259265, + "grad_norm": 1.109375, + "learning_rate": 5.25905485537042e-05, + "loss": 0.8594, + "step": 5763 + }, + { + "epoch": 0.9087052483733196, + "grad_norm": 0.91015625, + "learning_rate": 5.258649513014915e-05, + "loss": 0.969, + "step": 5764 + }, + { + "epoch": 0.9088629002207126, + "grad_norm": 1.046875, + "learning_rate": 5.2582441807084324e-05, + "loss": 1.2605, + "step": 5765 + }, + { + "epoch": 0.9090205520681056, + "grad_norm": 0.9296875, + "learning_rate": 5.25783885845183e-05, + "loss": 0.9492, + "step": 5766 + }, + { + "epoch": 0.9091782039154986, + "grad_norm": 0.91796875, + "learning_rate": 5.2574335462459665e-05, + "loss": 0.8452, + "step": 5767 + }, + { + "epoch": 0.9093358557628917, + "grad_norm": 0.94921875, + "learning_rate": 5.2570282440916995e-05, + "loss": 0.8894, + "step": 5768 + }, + { + "epoch": 0.9094935076102846, + "grad_norm": 0.97265625, + "learning_rate": 5.256622951989893e-05, + "loss": 0.9056, + "step": 5769 + }, + { + "epoch": 0.9096511594576776, + "grad_norm": 0.94140625, + "learning_rate": 5.2562176699414014e-05, + "loss": 0.8731, + "step": 5770 + }, + { + "epoch": 0.9098088113050706, + "grad_norm": 2.390625, + "learning_rate": 5.255812397947086e-05, + "loss": 1.0679, + "step": 5771 + }, + { + "epoch": 0.9099664631524637, + "grad_norm": 0.9921875, + "learning_rate": 5.255407136007806e-05, + "loss": 1.0462, + "step": 5772 + }, + { + "epoch": 0.9101241149998567, + "grad_norm": 0.87890625, + "learning_rate": 5.2550018841244195e-05, + "loss": 1.013, + "step": 5773 + }, + { + "epoch": 0.9102817668472497, + "grad_norm": 1.0234375, + "learning_rate": 5.254596642297786e-05, + "loss": 1.1781, + "step": 5774 + }, + { + "epoch": 0.9104394186946427, + "grad_norm": 0.9609375, + "learning_rate": 5.2541914105287616e-05, + "loss": 0.9664, + "step": 5775 + }, + { + "epoch": 0.9105970705420358, + "grad_norm": 0.97265625, + "learning_rate": 5.253786188818208e-05, + "loss": 1.0172, + "step": 5776 + }, + { + "epoch": 0.9107547223894287, + "grad_norm": 0.96484375, + "learning_rate": 5.253380977166984e-05, + "loss": 0.8114, + "step": 5777 + }, + { + "epoch": 0.9109123742368217, + "grad_norm": 0.984375, + "learning_rate": 5.252975775575944e-05, + "loss": 1.0404, + "step": 5778 + }, + { + "epoch": 0.9110700260842147, + "grad_norm": 0.890625, + "learning_rate": 5.252570584045953e-05, + "loss": 0.9843, + "step": 5779 + }, + { + "epoch": 0.9112276779316077, + "grad_norm": 0.9296875, + "learning_rate": 5.2521654025778685e-05, + "loss": 1.0182, + "step": 5780 + }, + { + "epoch": 0.9113853297790008, + "grad_norm": 0.78515625, + "learning_rate": 5.2517602311725464e-05, + "loss": 0.8779, + "step": 5781 + }, + { + "epoch": 0.9115429816263938, + "grad_norm": 1.015625, + "learning_rate": 5.25135506983085e-05, + "loss": 1.0808, + "step": 5782 + }, + { + "epoch": 0.9117006334737868, + "grad_norm": 0.96484375, + "learning_rate": 5.250949918553629e-05, + "loss": 0.9974, + "step": 5783 + }, + { + "epoch": 0.9118582853211799, + "grad_norm": 0.9921875, + "learning_rate": 5.2505447773417526e-05, + "loss": 1.0804, + "step": 5784 + }, + { + "epoch": 0.9120159371685728, + "grad_norm": 0.94921875, + "learning_rate": 5.250139646196075e-05, + "loss": 1.1376, + "step": 5785 + }, + { + "epoch": 0.9121735890159658, + "grad_norm": 0.9296875, + "learning_rate": 5.2497345251174555e-05, + "loss": 0.8969, + "step": 5786 + }, + { + "epoch": 0.9123312408633588, + "grad_norm": 0.91796875, + "learning_rate": 5.2493294141067514e-05, + "loss": 0.962, + "step": 5787 + }, + { + "epoch": 0.9124888927107518, + "grad_norm": 0.83984375, + "learning_rate": 5.248924313164818e-05, + "loss": 0.8887, + "step": 5788 + }, + { + "epoch": 0.9126465445581449, + "grad_norm": 1.140625, + "learning_rate": 5.248519222292523e-05, + "loss": 1.2643, + "step": 5789 + }, + { + "epoch": 0.9128041964055379, + "grad_norm": 0.98828125, + "learning_rate": 5.2481141414907184e-05, + "loss": 0.8951, + "step": 5790 + }, + { + "epoch": 0.9129618482529309, + "grad_norm": 0.9453125, + "learning_rate": 5.247709070760265e-05, + "loss": 1.0097, + "step": 5791 + }, + { + "epoch": 0.9131195001003239, + "grad_norm": 0.91015625, + "learning_rate": 5.247304010102021e-05, + "loss": 0.8005, + "step": 5792 + }, + { + "epoch": 0.913277151947717, + "grad_norm": 1.765625, + "learning_rate": 5.2468989595168404e-05, + "loss": 1.0551, + "step": 5793 + }, + { + "epoch": 0.9134348037951099, + "grad_norm": 1.09375, + "learning_rate": 5.24649391900559e-05, + "loss": 1.1272, + "step": 5794 + }, + { + "epoch": 0.9135924556425029, + "grad_norm": 0.921875, + "learning_rate": 5.246088888569123e-05, + "loss": 0.9478, + "step": 5795 + }, + { + "epoch": 0.9137501074898959, + "grad_norm": 0.98828125, + "learning_rate": 5.2456838682083e-05, + "loss": 0.9822, + "step": 5796 + }, + { + "epoch": 0.913907759337289, + "grad_norm": 1.4921875, + "learning_rate": 5.2452788579239775e-05, + "loss": 0.9619, + "step": 5797 + }, + { + "epoch": 0.914065411184682, + "grad_norm": 1.0703125, + "learning_rate": 5.244873857717011e-05, + "loss": 0.9809, + "step": 5798 + }, + { + "epoch": 0.914223063032075, + "grad_norm": 0.96875, + "learning_rate": 5.244468867588267e-05, + "loss": 0.9612, + "step": 5799 + }, + { + "epoch": 0.914380714879468, + "grad_norm": 1.015625, + "learning_rate": 5.2440638875386e-05, + "loss": 0.9462, + "step": 5800 + }, + { + "epoch": 0.9145383667268611, + "grad_norm": 0.9609375, + "learning_rate": 5.2436589175688655e-05, + "loss": 0.9039, + "step": 5801 + }, + { + "epoch": 0.914696018574254, + "grad_norm": 0.85546875, + "learning_rate": 5.243253957679926e-05, + "loss": 0.8172, + "step": 5802 + }, + { + "epoch": 0.914853670421647, + "grad_norm": 0.93359375, + "learning_rate": 5.242849007872632e-05, + "loss": 0.9576, + "step": 5803 + }, + { + "epoch": 0.91501132226904, + "grad_norm": 0.98046875, + "learning_rate": 5.242444068147854e-05, + "loss": 1.0571, + "step": 5804 + }, + { + "epoch": 0.915168974116433, + "grad_norm": 1.015625, + "learning_rate": 5.242039138506443e-05, + "loss": 1.0112, + "step": 5805 + }, + { + "epoch": 0.9153266259638261, + "grad_norm": 1.03125, + "learning_rate": 5.241634218949256e-05, + "loss": 1.1044, + "step": 5806 + }, + { + "epoch": 0.9154842778112191, + "grad_norm": 1.0, + "learning_rate": 5.2412293094771556e-05, + "loss": 1.0583, + "step": 5807 + }, + { + "epoch": 0.9156419296586121, + "grad_norm": 1.015625, + "learning_rate": 5.240824410090994e-05, + "loss": 0.946, + "step": 5808 + }, + { + "epoch": 0.9157995815060052, + "grad_norm": 0.9765625, + "learning_rate": 5.240419520791635e-05, + "loss": 1.1492, + "step": 5809 + }, + { + "epoch": 0.9159572333533981, + "grad_norm": 0.90234375, + "learning_rate": 5.2400146415799366e-05, + "loss": 1.1111, + "step": 5810 + }, + { + "epoch": 0.9161148852007911, + "grad_norm": 1.03125, + "learning_rate": 5.239609772456755e-05, + "loss": 1.0249, + "step": 5811 + }, + { + "epoch": 0.9162725370481841, + "grad_norm": 0.91796875, + "learning_rate": 5.239204913422947e-05, + "loss": 0.9796, + "step": 5812 + }, + { + "epoch": 0.9164301888955771, + "grad_norm": 0.8125, + "learning_rate": 5.2388000644793735e-05, + "loss": 0.8498, + "step": 5813 + }, + { + "epoch": 0.9165878407429702, + "grad_norm": 0.9921875, + "learning_rate": 5.238395225626891e-05, + "loss": 1.0396, + "step": 5814 + }, + { + "epoch": 0.9167454925903632, + "grad_norm": 0.94921875, + "learning_rate": 5.237990396866357e-05, + "loss": 0.985, + "step": 5815 + }, + { + "epoch": 0.9169031444377562, + "grad_norm": 1.0234375, + "learning_rate": 5.237585578198626e-05, + "loss": 1.0859, + "step": 5816 + }, + { + "epoch": 0.9170607962851492, + "grad_norm": 0.89453125, + "learning_rate": 5.2371807696245644e-05, + "loss": 1.1208, + "step": 5817 + }, + { + "epoch": 0.9172184481325422, + "grad_norm": 1.0390625, + "learning_rate": 5.236775971145026e-05, + "loss": 1.0906, + "step": 5818 + }, + { + "epoch": 0.9173760999799352, + "grad_norm": 0.8828125, + "learning_rate": 5.2363711827608684e-05, + "loss": 0.7741, + "step": 5819 + }, + { + "epoch": 0.9175337518273282, + "grad_norm": 1.015625, + "learning_rate": 5.235966404472951e-05, + "loss": 0.9896, + "step": 5820 + }, + { + "epoch": 0.9176914036747212, + "grad_norm": 0.91015625, + "learning_rate": 5.235561636282129e-05, + "loss": 0.8187, + "step": 5821 + }, + { + "epoch": 0.9178490555221143, + "grad_norm": 1.0703125, + "learning_rate": 5.2351568781892625e-05, + "loss": 1.0774, + "step": 5822 + }, + { + "epoch": 0.9180067073695073, + "grad_norm": 1.03125, + "learning_rate": 5.234752130195205e-05, + "loss": 1.1198, + "step": 5823 + }, + { + "epoch": 0.9181643592169003, + "grad_norm": 0.91015625, + "learning_rate": 5.234347392300822e-05, + "loss": 0.7836, + "step": 5824 + }, + { + "epoch": 0.9183220110642933, + "grad_norm": 1.125, + "learning_rate": 5.2339426645069664e-05, + "loss": 1.1228, + "step": 5825 + }, + { + "epoch": 0.9184796629116863, + "grad_norm": 1.1484375, + "learning_rate": 5.2335379468144976e-05, + "loss": 0.6992, + "step": 5826 + }, + { + "epoch": 0.9186373147590793, + "grad_norm": 1.015625, + "learning_rate": 5.233133239224273e-05, + "loss": 1.0232, + "step": 5827 + }, + { + "epoch": 0.9187949666064723, + "grad_norm": 1.2421875, + "learning_rate": 5.2327285417371455e-05, + "loss": 0.8242, + "step": 5828 + }, + { + "epoch": 0.9189526184538653, + "grad_norm": 0.95703125, + "learning_rate": 5.232323854353981e-05, + "loss": 0.9238, + "step": 5829 + }, + { + "epoch": 0.9191102703012584, + "grad_norm": 0.9921875, + "learning_rate": 5.231919177075634e-05, + "loss": 0.9631, + "step": 5830 + }, + { + "epoch": 0.9192679221486514, + "grad_norm": 0.91796875, + "learning_rate": 5.231514509902962e-05, + "loss": 0.9566, + "step": 5831 + }, + { + "epoch": 0.9194255739960444, + "grad_norm": 0.8671875, + "learning_rate": 5.231109852836822e-05, + "loss": 0.8354, + "step": 5832 + }, + { + "epoch": 0.9195832258434374, + "grad_norm": 0.9765625, + "learning_rate": 5.230705205878068e-05, + "loss": 1.2012, + "step": 5833 + }, + { + "epoch": 0.9197408776908303, + "grad_norm": 0.94140625, + "learning_rate": 5.230300569027566e-05, + "loss": 1.105, + "step": 5834 + }, + { + "epoch": 0.9198985295382234, + "grad_norm": 0.8046875, + "learning_rate": 5.229895942286169e-05, + "loss": 0.7986, + "step": 5835 + }, + { + "epoch": 0.9200561813856164, + "grad_norm": 0.99609375, + "learning_rate": 5.229491325654736e-05, + "loss": 1.1218, + "step": 5836 + }, + { + "epoch": 0.9202138332330094, + "grad_norm": 0.89453125, + "learning_rate": 5.229086719134122e-05, + "loss": 0.9815, + "step": 5837 + }, + { + "epoch": 0.9203714850804025, + "grad_norm": 1.0390625, + "learning_rate": 5.228682122725184e-05, + "loss": 1.1044, + "step": 5838 + }, + { + "epoch": 0.9205291369277955, + "grad_norm": 0.81640625, + "learning_rate": 5.228277536428783e-05, + "loss": 0.9445, + "step": 5839 + }, + { + "epoch": 0.9206867887751885, + "grad_norm": 0.98046875, + "learning_rate": 5.2278729602457765e-05, + "loss": 1.0341, + "step": 5840 + }, + { + "epoch": 0.9208444406225815, + "grad_norm": 0.9140625, + "learning_rate": 5.227468394177021e-05, + "loss": 1.0504, + "step": 5841 + }, + { + "epoch": 0.9210020924699744, + "grad_norm": 0.82421875, + "learning_rate": 5.227063838223372e-05, + "loss": 0.8376, + "step": 5842 + }, + { + "epoch": 0.9211597443173675, + "grad_norm": 1.03125, + "learning_rate": 5.226659292385685e-05, + "loss": 1.01, + "step": 5843 + }, + { + "epoch": 0.9213173961647605, + "grad_norm": 1.0546875, + "learning_rate": 5.2262547566648255e-05, + "loss": 1.1597, + "step": 5844 + }, + { + "epoch": 0.9214750480121535, + "grad_norm": 1.03125, + "learning_rate": 5.2258502310616455e-05, + "loss": 1.0583, + "step": 5845 + }, + { + "epoch": 0.9216326998595465, + "grad_norm": 1.0859375, + "learning_rate": 5.225445715577003e-05, + "loss": 0.9475, + "step": 5846 + }, + { + "epoch": 0.9217903517069396, + "grad_norm": 0.96875, + "learning_rate": 5.2250412102117564e-05, + "loss": 0.8149, + "step": 5847 + }, + { + "epoch": 0.9219480035543326, + "grad_norm": 1.0625, + "learning_rate": 5.224636714966756e-05, + "loss": 1.1223, + "step": 5848 + }, + { + "epoch": 0.9221056554017256, + "grad_norm": 1.0234375, + "learning_rate": 5.2242322298428714e-05, + "loss": 0.973, + "step": 5849 + }, + { + "epoch": 0.9222633072491185, + "grad_norm": 0.89453125, + "learning_rate": 5.223827754840952e-05, + "loss": 0.9615, + "step": 5850 + }, + { + "epoch": 0.9224209590965116, + "grad_norm": 0.98828125, + "learning_rate": 5.223423289961857e-05, + "loss": 0.9123, + "step": 5851 + }, + { + "epoch": 0.9225786109439046, + "grad_norm": 1.0703125, + "learning_rate": 5.2230188352064434e-05, + "loss": 1.1603, + "step": 5852 + }, + { + "epoch": 0.9227362627912976, + "grad_norm": 0.98828125, + "learning_rate": 5.222614390575564e-05, + "loss": 1.1328, + "step": 5853 + }, + { + "epoch": 0.9228939146386906, + "grad_norm": 0.89453125, + "learning_rate": 5.222209956070085e-05, + "loss": 0.8645, + "step": 5854 + }, + { + "epoch": 0.9230515664860837, + "grad_norm": 0.92578125, + "learning_rate": 5.22180553169086e-05, + "loss": 0.7007, + "step": 5855 + }, + { + "epoch": 0.9232092183334767, + "grad_norm": 0.9609375, + "learning_rate": 5.2214011174387437e-05, + "loss": 0.8474, + "step": 5856 + }, + { + "epoch": 0.9233668701808697, + "grad_norm": 1.46875, + "learning_rate": 5.220996713314594e-05, + "loss": 1.0671, + "step": 5857 + }, + { + "epoch": 0.9235245220282626, + "grad_norm": 1.0546875, + "learning_rate": 5.2205923193192695e-05, + "loss": 1.0246, + "step": 5858 + }, + { + "epoch": 0.9236821738756557, + "grad_norm": 0.96875, + "learning_rate": 5.220187935453627e-05, + "loss": 1.0155, + "step": 5859 + }, + { + "epoch": 0.9238398257230487, + "grad_norm": 1.34375, + "learning_rate": 5.219783561718521e-05, + "loss": 0.8627, + "step": 5860 + }, + { + "epoch": 0.9239974775704417, + "grad_norm": 1.09375, + "learning_rate": 5.219379198114812e-05, + "loss": 0.9377, + "step": 5861 + }, + { + "epoch": 0.9241551294178347, + "grad_norm": 0.92578125, + "learning_rate": 5.2189748446433564e-05, + "loss": 0.9327, + "step": 5862 + }, + { + "epoch": 0.9243127812652278, + "grad_norm": 0.88671875, + "learning_rate": 5.2185705013050045e-05, + "loss": 0.9565, + "step": 5863 + }, + { + "epoch": 0.9244704331126208, + "grad_norm": 1.0390625, + "learning_rate": 5.2181661681006245e-05, + "loss": 0.9692, + "step": 5864 + }, + { + "epoch": 0.9246280849600138, + "grad_norm": 1.0625, + "learning_rate": 5.217761845031066e-05, + "loss": 0.9802, + "step": 5865 + }, + { + "epoch": 0.9247857368074067, + "grad_norm": 0.8984375, + "learning_rate": 5.217357532097189e-05, + "loss": 0.8779, + "step": 5866 + }, + { + "epoch": 0.9249433886547997, + "grad_norm": 0.8671875, + "learning_rate": 5.2169532292998505e-05, + "loss": 0.8657, + "step": 5867 + }, + { + "epoch": 0.9251010405021928, + "grad_norm": 0.89453125, + "learning_rate": 5.216548936639901e-05, + "loss": 0.9938, + "step": 5868 + }, + { + "epoch": 0.9252586923495858, + "grad_norm": 0.97265625, + "learning_rate": 5.2161446541182067e-05, + "loss": 0.8194, + "step": 5869 + }, + { + "epoch": 0.9254163441969788, + "grad_norm": 0.9765625, + "learning_rate": 5.215740381735621e-05, + "loss": 0.894, + "step": 5870 + }, + { + "epoch": 0.9255739960443718, + "grad_norm": 0.98046875, + "learning_rate": 5.215336119493e-05, + "loss": 0.9336, + "step": 5871 + }, + { + "epoch": 0.9257316478917649, + "grad_norm": 0.86328125, + "learning_rate": 5.2149318673912e-05, + "loss": 0.8645, + "step": 5872 + }, + { + "epoch": 0.9258892997391579, + "grad_norm": 0.9765625, + "learning_rate": 5.214527625431075e-05, + "loss": 0.9279, + "step": 5873 + }, + { + "epoch": 0.9260469515865508, + "grad_norm": 1.0625, + "learning_rate": 5.214123393613489e-05, + "loss": 0.8389, + "step": 5874 + }, + { + "epoch": 0.9262046034339438, + "grad_norm": 1.28125, + "learning_rate": 5.213719171939295e-05, + "loss": 1.2833, + "step": 5875 + }, + { + "epoch": 0.9263622552813369, + "grad_norm": 0.92578125, + "learning_rate": 5.21331496040935e-05, + "loss": 0.9594, + "step": 5876 + }, + { + "epoch": 0.9265199071287299, + "grad_norm": 1.0234375, + "learning_rate": 5.21291075902451e-05, + "loss": 1.077, + "step": 5877 + }, + { + "epoch": 0.9266775589761229, + "grad_norm": 0.96875, + "learning_rate": 5.212506567785627e-05, + "loss": 0.8524, + "step": 5878 + }, + { + "epoch": 0.9268352108235159, + "grad_norm": 0.88671875, + "learning_rate": 5.212102386693567e-05, + "loss": 0.7821, + "step": 5879 + }, + { + "epoch": 0.926992862670909, + "grad_norm": 1.015625, + "learning_rate": 5.211698215749183e-05, + "loss": 0.9558, + "step": 5880 + }, + { + "epoch": 0.927150514518302, + "grad_norm": 0.89453125, + "learning_rate": 5.211294054953331e-05, + "loss": 0.9101, + "step": 5881 + }, + { + "epoch": 0.9273081663656949, + "grad_norm": 0.984375, + "learning_rate": 5.210889904306868e-05, + "loss": 1.0595, + "step": 5882 + }, + { + "epoch": 0.9274658182130879, + "grad_norm": 0.98046875, + "learning_rate": 5.2104857638106456e-05, + "loss": 0.9815, + "step": 5883 + }, + { + "epoch": 0.927623470060481, + "grad_norm": 1.0390625, + "learning_rate": 5.210081633465529e-05, + "loss": 1.0831, + "step": 5884 + }, + { + "epoch": 0.927781121907874, + "grad_norm": 0.8828125, + "learning_rate": 5.20967751327237e-05, + "loss": 1.1076, + "step": 5885 + }, + { + "epoch": 0.927938773755267, + "grad_norm": 0.9765625, + "learning_rate": 5.2092734032320266e-05, + "loss": 0.9915, + "step": 5886 + }, + { + "epoch": 0.92809642560266, + "grad_norm": 1.0390625, + "learning_rate": 5.208869303345354e-05, + "loss": 1.1267, + "step": 5887 + }, + { + "epoch": 0.9282540774500531, + "grad_norm": 0.95703125, + "learning_rate": 5.2084652136132054e-05, + "loss": 0.8463, + "step": 5888 + }, + { + "epoch": 0.9284117292974461, + "grad_norm": 0.8984375, + "learning_rate": 5.2080611340364436e-05, + "loss": 0.9818, + "step": 5889 + }, + { + "epoch": 0.928569381144839, + "grad_norm": 1.234375, + "learning_rate": 5.2076570646159226e-05, + "loss": 1.1022, + "step": 5890 + }, + { + "epoch": 0.928727032992232, + "grad_norm": 0.99609375, + "learning_rate": 5.207253005352499e-05, + "loss": 1.0835, + "step": 5891 + }, + { + "epoch": 0.928884684839625, + "grad_norm": 0.9765625, + "learning_rate": 5.206848956247029e-05, + "loss": 0.9519, + "step": 5892 + }, + { + "epoch": 0.9290423366870181, + "grad_norm": 0.9375, + "learning_rate": 5.206444917300365e-05, + "loss": 0.988, + "step": 5893 + }, + { + "epoch": 0.9291999885344111, + "grad_norm": 0.875, + "learning_rate": 5.2060408885133705e-05, + "loss": 0.9596, + "step": 5894 + }, + { + "epoch": 0.9293576403818041, + "grad_norm": 1.015625, + "learning_rate": 5.2056368698868986e-05, + "loss": 1.039, + "step": 5895 + }, + { + "epoch": 0.9295152922291972, + "grad_norm": 0.9453125, + "learning_rate": 5.205232861421805e-05, + "loss": 0.9141, + "step": 5896 + }, + { + "epoch": 0.9296729440765902, + "grad_norm": 1.6328125, + "learning_rate": 5.204828863118946e-05, + "loss": 1.0132, + "step": 5897 + }, + { + "epoch": 0.9298305959239831, + "grad_norm": 0.953125, + "learning_rate": 5.204424874979179e-05, + "loss": 0.8834, + "step": 5898 + }, + { + "epoch": 0.9299882477713761, + "grad_norm": 0.94140625, + "learning_rate": 5.204020897003359e-05, + "loss": 0.8777, + "step": 5899 + }, + { + "epoch": 0.9301458996187691, + "grad_norm": 0.8984375, + "learning_rate": 5.203616929192341e-05, + "loss": 0.8955, + "step": 5900 + }, + { + "epoch": 0.9303035514661622, + "grad_norm": 1.03125, + "learning_rate": 5.203212971546981e-05, + "loss": 0.9617, + "step": 5901 + }, + { + "epoch": 0.9304612033135552, + "grad_norm": 0.890625, + "learning_rate": 5.202809024068141e-05, + "loss": 0.8958, + "step": 5902 + }, + { + "epoch": 0.9306188551609482, + "grad_norm": 0.98046875, + "learning_rate": 5.202405086756672e-05, + "loss": 0.9283, + "step": 5903 + }, + { + "epoch": 0.9307765070083412, + "grad_norm": 0.83984375, + "learning_rate": 5.202001159613431e-05, + "loss": 0.7725, + "step": 5904 + }, + { + "epoch": 0.9309341588557343, + "grad_norm": 0.9296875, + "learning_rate": 5.201597242639275e-05, + "loss": 0.8305, + "step": 5905 + }, + { + "epoch": 0.9310918107031272, + "grad_norm": 0.9765625, + "learning_rate": 5.2011933358350596e-05, + "loss": 0.9733, + "step": 5906 + }, + { + "epoch": 0.9312494625505202, + "grad_norm": 0.86328125, + "learning_rate": 5.2007894392016386e-05, + "loss": 0.7721, + "step": 5907 + }, + { + "epoch": 0.9314071143979132, + "grad_norm": 1.078125, + "learning_rate": 5.2003855527398684e-05, + "loss": 0.9208, + "step": 5908 + }, + { + "epoch": 0.9315647662453063, + "grad_norm": 1.0390625, + "learning_rate": 5.1999816764506094e-05, + "loss": 0.9847, + "step": 5909 + }, + { + "epoch": 0.9317224180926993, + "grad_norm": 1.03125, + "learning_rate": 5.199577810334716e-05, + "loss": 0.8112, + "step": 5910 + }, + { + "epoch": 0.9318800699400923, + "grad_norm": 0.93359375, + "learning_rate": 5.199173954393042e-05, + "loss": 0.8262, + "step": 5911 + }, + { + "epoch": 0.9320377217874853, + "grad_norm": 1.0078125, + "learning_rate": 5.198770108626445e-05, + "loss": 1.0527, + "step": 5912 + }, + { + "epoch": 0.9321953736348784, + "grad_norm": 0.984375, + "learning_rate": 5.198366273035781e-05, + "loss": 1.2363, + "step": 5913 + }, + { + "epoch": 0.9323530254822713, + "grad_norm": 1.546875, + "learning_rate": 5.1979624476219004e-05, + "loss": 0.871, + "step": 5914 + }, + { + "epoch": 0.9325106773296643, + "grad_norm": 1.0859375, + "learning_rate": 5.197558632385667e-05, + "loss": 0.8931, + "step": 5915 + }, + { + "epoch": 0.9326683291770573, + "grad_norm": 1.0546875, + "learning_rate": 5.1971548273279344e-05, + "loss": 0.8731, + "step": 5916 + }, + { + "epoch": 0.9328259810244504, + "grad_norm": 0.9296875, + "learning_rate": 5.1967510324495585e-05, + "loss": 1.0519, + "step": 5917 + }, + { + "epoch": 0.9329836328718434, + "grad_norm": 0.96875, + "learning_rate": 5.1963472477513945e-05, + "loss": 1.0993, + "step": 5918 + }, + { + "epoch": 0.9331412847192364, + "grad_norm": 0.9765625, + "learning_rate": 5.195943473234293e-05, + "loss": 0.961, + "step": 5919 + }, + { + "epoch": 0.9332989365666294, + "grad_norm": 0.87890625, + "learning_rate": 5.195539708899118e-05, + "loss": 0.9082, + "step": 5920 + }, + { + "epoch": 0.9334565884140225, + "grad_norm": 1.015625, + "learning_rate": 5.1951359547467216e-05, + "loss": 0.8568, + "step": 5921 + }, + { + "epoch": 0.9336142402614154, + "grad_norm": 1.0, + "learning_rate": 5.194732210777962e-05, + "loss": 1.0848, + "step": 5922 + }, + { + "epoch": 0.9337718921088084, + "grad_norm": 1.0, + "learning_rate": 5.194328476993692e-05, + "loss": 0.9581, + "step": 5923 + }, + { + "epoch": 0.9339295439562014, + "grad_norm": 0.953125, + "learning_rate": 5.193924753394763e-05, + "loss": 0.9395, + "step": 5924 + }, + { + "epoch": 0.9340871958035945, + "grad_norm": 0.92578125, + "learning_rate": 5.193521039982041e-05, + "loss": 0.8488, + "step": 5925 + }, + { + "epoch": 0.9342448476509875, + "grad_norm": 0.98828125, + "learning_rate": 5.193117336756377e-05, + "loss": 1.0533, + "step": 5926 + }, + { + "epoch": 0.9344024994983805, + "grad_norm": 0.9375, + "learning_rate": 5.1927136437186255e-05, + "loss": 1.03, + "step": 5927 + }, + { + "epoch": 0.9345601513457735, + "grad_norm": 0.9375, + "learning_rate": 5.192309960869641e-05, + "loss": 0.9089, + "step": 5928 + }, + { + "epoch": 0.9347178031931666, + "grad_norm": 0.95703125, + "learning_rate": 5.1919062882102786e-05, + "loss": 0.9257, + "step": 5929 + }, + { + "epoch": 0.9348754550405595, + "grad_norm": 1.0625, + "learning_rate": 5.1915026257414e-05, + "loss": 1.1853, + "step": 5930 + }, + { + "epoch": 0.9350331068879525, + "grad_norm": 0.88671875, + "learning_rate": 5.191098973463855e-05, + "loss": 0.7492, + "step": 5931 + }, + { + "epoch": 0.9351907587353455, + "grad_norm": 0.9453125, + "learning_rate": 5.190695331378504e-05, + "loss": 0.9495, + "step": 5932 + }, + { + "epoch": 0.9353484105827385, + "grad_norm": 0.9609375, + "learning_rate": 5.190291699486196e-05, + "loss": 1.0073, + "step": 5933 + }, + { + "epoch": 0.9355060624301316, + "grad_norm": 1.0234375, + "learning_rate": 5.189888077787788e-05, + "loss": 0.9552, + "step": 5934 + }, + { + "epoch": 0.9356637142775246, + "grad_norm": 0.984375, + "learning_rate": 5.18948446628414e-05, + "loss": 0.9881, + "step": 5935 + }, + { + "epoch": 0.9358213661249176, + "grad_norm": 0.92578125, + "learning_rate": 5.189080864976106e-05, + "loss": 0.9923, + "step": 5936 + }, + { + "epoch": 0.9359790179723106, + "grad_norm": 0.91015625, + "learning_rate": 5.1886772738645394e-05, + "loss": 0.9373, + "step": 5937 + }, + { + "epoch": 0.9361366698197036, + "grad_norm": 0.96875, + "learning_rate": 5.1882736929502964e-05, + "loss": 1.0164, + "step": 5938 + }, + { + "epoch": 0.9362943216670966, + "grad_norm": 1.03125, + "learning_rate": 5.187870122234228e-05, + "loss": 0.9574, + "step": 5939 + }, + { + "epoch": 0.9364519735144896, + "grad_norm": 0.99609375, + "learning_rate": 5.187466561717198e-05, + "loss": 1.0283, + "step": 5940 + }, + { + "epoch": 0.9366096253618826, + "grad_norm": 0.87890625, + "learning_rate": 5.187063011400057e-05, + "loss": 1.1919, + "step": 5941 + }, + { + "epoch": 0.9367672772092757, + "grad_norm": 0.953125, + "learning_rate": 5.1866594712836615e-05, + "loss": 0.9824, + "step": 5942 + }, + { + "epoch": 0.9369249290566687, + "grad_norm": 0.9609375, + "learning_rate": 5.1862559413688664e-05, + "loss": 0.8855, + "step": 5943 + }, + { + "epoch": 0.9370825809040617, + "grad_norm": 0.9453125, + "learning_rate": 5.185852421656526e-05, + "loss": 0.9611, + "step": 5944 + }, + { + "epoch": 0.9372402327514547, + "grad_norm": 0.875, + "learning_rate": 5.1854489121474966e-05, + "loss": 0.9198, + "step": 5945 + }, + { + "epoch": 0.9373978845988478, + "grad_norm": 0.90234375, + "learning_rate": 5.185045412842633e-05, + "loss": 0.9571, + "step": 5946 + }, + { + "epoch": 0.9375555364462407, + "grad_norm": 1.046875, + "learning_rate": 5.18464192374279e-05, + "loss": 1.1453, + "step": 5947 + }, + { + "epoch": 0.9377131882936337, + "grad_norm": 0.88671875, + "learning_rate": 5.1842384448488245e-05, + "loss": 0.9111, + "step": 5948 + }, + { + "epoch": 0.9378708401410267, + "grad_norm": 0.9921875, + "learning_rate": 5.1838349761615857e-05, + "loss": 1.0392, + "step": 5949 + }, + { + "epoch": 0.9380284919884198, + "grad_norm": 0.91796875, + "learning_rate": 5.183431517681937e-05, + "loss": 0.879, + "step": 5950 + }, + { + "epoch": 0.9381861438358128, + "grad_norm": 1.109375, + "learning_rate": 5.1830280694107304e-05, + "loss": 1.0958, + "step": 5951 + }, + { + "epoch": 0.9383437956832058, + "grad_norm": 0.984375, + "learning_rate": 5.18262463134882e-05, + "loss": 1.0192, + "step": 5952 + }, + { + "epoch": 0.9385014475305988, + "grad_norm": 0.94140625, + "learning_rate": 5.1822212034970615e-05, + "loss": 0.8064, + "step": 5953 + }, + { + "epoch": 0.9386590993779919, + "grad_norm": 0.93359375, + "learning_rate": 5.1818177858563066e-05, + "loss": 0.942, + "step": 5954 + }, + { + "epoch": 0.9388167512253848, + "grad_norm": 0.95703125, + "learning_rate": 5.181414378427416e-05, + "loss": 1.0061, + "step": 5955 + }, + { + "epoch": 0.9389744030727778, + "grad_norm": 0.98046875, + "learning_rate": 5.181010981211243e-05, + "loss": 1.0374, + "step": 5956 + }, + { + "epoch": 0.9391320549201708, + "grad_norm": 0.8828125, + "learning_rate": 5.180607594208642e-05, + "loss": 0.9151, + "step": 5957 + }, + { + "epoch": 0.9392897067675638, + "grad_norm": 0.9140625, + "learning_rate": 5.180204217420468e-05, + "loss": 0.9625, + "step": 5958 + }, + { + "epoch": 0.9394473586149569, + "grad_norm": 0.89453125, + "learning_rate": 5.179800850847572e-05, + "loss": 0.9019, + "step": 5959 + }, + { + "epoch": 0.9396050104623499, + "grad_norm": 0.91796875, + "learning_rate": 5.179397494490814e-05, + "loss": 0.9473, + "step": 5960 + }, + { + "epoch": 0.9397626623097429, + "grad_norm": 0.8671875, + "learning_rate": 5.17899414835105e-05, + "loss": 0.9781, + "step": 5961 + }, + { + "epoch": 0.939920314157136, + "grad_norm": 0.9140625, + "learning_rate": 5.1785908124291314e-05, + "loss": 0.8964, + "step": 5962 + }, + { + "epoch": 0.9400779660045289, + "grad_norm": 0.8671875, + "learning_rate": 5.178187486725914e-05, + "loss": 0.7881, + "step": 5963 + }, + { + "epoch": 0.9402356178519219, + "grad_norm": 0.8984375, + "learning_rate": 5.177784171242248e-05, + "loss": 1.0487, + "step": 5964 + }, + { + "epoch": 0.9403932696993149, + "grad_norm": 0.9609375, + "learning_rate": 5.177380865978998e-05, + "loss": 1.0007, + "step": 5965 + }, + { + "epoch": 0.9405509215467079, + "grad_norm": 0.88671875, + "learning_rate": 5.1769775709370136e-05, + "loss": 0.876, + "step": 5966 + }, + { + "epoch": 0.940708573394101, + "grad_norm": 0.984375, + "learning_rate": 5.17657428611715e-05, + "loss": 0.9164, + "step": 5967 + }, + { + "epoch": 0.940866225241494, + "grad_norm": 1.0703125, + "learning_rate": 5.1761710115202597e-05, + "loss": 1.0681, + "step": 5968 + }, + { + "epoch": 0.941023877088887, + "grad_norm": 1.0234375, + "learning_rate": 5.175767747147197e-05, + "loss": 1.197, + "step": 5969 + }, + { + "epoch": 0.94118152893628, + "grad_norm": 0.9296875, + "learning_rate": 5.175364492998822e-05, + "loss": 0.8356, + "step": 5970 + }, + { + "epoch": 0.941339180783673, + "grad_norm": 0.89453125, + "learning_rate": 5.174961249075986e-05, + "loss": 0.8718, + "step": 5971 + }, + { + "epoch": 0.941496832631066, + "grad_norm": 0.93359375, + "learning_rate": 5.174558015379545e-05, + "loss": 0.9766, + "step": 5972 + }, + { + "epoch": 0.941654484478459, + "grad_norm": 0.921875, + "learning_rate": 5.1741547919103506e-05, + "loss": 0.9965, + "step": 5973 + }, + { + "epoch": 0.941812136325852, + "grad_norm": 0.96484375, + "learning_rate": 5.173751578669257e-05, + "loss": 1.0399, + "step": 5974 + }, + { + "epoch": 0.9419697881732451, + "grad_norm": 0.84765625, + "learning_rate": 5.173348375657123e-05, + "loss": 0.7095, + "step": 5975 + }, + { + "epoch": 0.9421274400206381, + "grad_norm": 0.97265625, + "learning_rate": 5.172945182874803e-05, + "loss": 1.1426, + "step": 5976 + }, + { + "epoch": 0.9422850918680311, + "grad_norm": 0.96875, + "learning_rate": 5.17254200032315e-05, + "loss": 0.9876, + "step": 5977 + }, + { + "epoch": 0.9424427437154241, + "grad_norm": 1.0078125, + "learning_rate": 5.172138828003017e-05, + "loss": 0.8665, + "step": 5978 + }, + { + "epoch": 0.942600395562817, + "grad_norm": 1.078125, + "learning_rate": 5.171735665915257e-05, + "loss": 1.2725, + "step": 5979 + }, + { + "epoch": 0.9427580474102101, + "grad_norm": 0.90234375, + "learning_rate": 5.171332514060731e-05, + "loss": 0.8259, + "step": 5980 + }, + { + "epoch": 0.9429156992576031, + "grad_norm": 1.015625, + "learning_rate": 5.1709293724402896e-05, + "loss": 0.9257, + "step": 5981 + }, + { + "epoch": 0.9430733511049961, + "grad_norm": 0.98828125, + "learning_rate": 5.1705262410547875e-05, + "loss": 0.842, + "step": 5982 + }, + { + "epoch": 0.9432310029523892, + "grad_norm": 0.953125, + "learning_rate": 5.170123119905078e-05, + "loss": 0.786, + "step": 5983 + }, + { + "epoch": 0.9433886547997822, + "grad_norm": 0.8984375, + "learning_rate": 5.1697200089920174e-05, + "loss": 0.9072, + "step": 5984 + }, + { + "epoch": 0.9435463066471752, + "grad_norm": 1.0234375, + "learning_rate": 5.169316908316459e-05, + "loss": 1.1225, + "step": 5985 + }, + { + "epoch": 0.9437039584945682, + "grad_norm": 0.8203125, + "learning_rate": 5.168913817879257e-05, + "loss": 0.8003, + "step": 5986 + }, + { + "epoch": 0.9438616103419611, + "grad_norm": 0.9765625, + "learning_rate": 5.1685107376812625e-05, + "loss": 1.0501, + "step": 5987 + }, + { + "epoch": 0.9440192621893542, + "grad_norm": 1.296875, + "learning_rate": 5.168107667723338e-05, + "loss": 1.0591, + "step": 5988 + }, + { + "epoch": 0.9441769140367472, + "grad_norm": 0.92578125, + "learning_rate": 5.1677046080063315e-05, + "loss": 0.7935, + "step": 5989 + }, + { + "epoch": 0.9443345658841402, + "grad_norm": 1.0078125, + "learning_rate": 5.1673015585311e-05, + "loss": 0.9711, + "step": 5990 + }, + { + "epoch": 0.9444922177315332, + "grad_norm": 1.0625, + "learning_rate": 5.1668985192984966e-05, + "loss": 1.0681, + "step": 5991 + }, + { + "epoch": 0.9446498695789263, + "grad_norm": 0.88671875, + "learning_rate": 5.166495490309376e-05, + "loss": 0.8749, + "step": 5992 + }, + { + "epoch": 0.9448075214263193, + "grad_norm": 0.9921875, + "learning_rate": 5.16609247156459e-05, + "loss": 1.0906, + "step": 5993 + }, + { + "epoch": 0.9449651732737123, + "grad_norm": 0.88671875, + "learning_rate": 5.1656894630649924e-05, + "loss": 0.892, + "step": 5994 + }, + { + "epoch": 0.9451228251211052, + "grad_norm": 0.84765625, + "learning_rate": 5.165286464811443e-05, + "loss": 0.9334, + "step": 5995 + }, + { + "epoch": 0.9452804769684983, + "grad_norm": 0.83203125, + "learning_rate": 5.1648834768047923e-05, + "loss": 0.8715, + "step": 5996 + }, + { + "epoch": 0.9454381288158913, + "grad_norm": 0.953125, + "learning_rate": 5.1644804990458964e-05, + "loss": 0.7605, + "step": 5997 + }, + { + "epoch": 0.9455957806632843, + "grad_norm": 0.94921875, + "learning_rate": 5.164077531535605e-05, + "loss": 0.9868, + "step": 5998 + }, + { + "epoch": 0.9457534325106773, + "grad_norm": 0.96875, + "learning_rate": 5.1636745742747724e-05, + "loss": 0.8976, + "step": 5999 + }, + { + "epoch": 0.9459110843580704, + "grad_norm": 0.96484375, + "learning_rate": 5.163271627264259e-05, + "loss": 0.9973, + "step": 6000 + }, + { + "epoch": 0.9459110843580704, + "eval_loss": 0.959323525428772, + "eval_runtime": 309.4169, + "eval_samples_per_second": 32.319, + "eval_steps_per_second": 0.675, + "step": 6000 + }, + { + "epoch": 0.9460687362054634, + "grad_norm": 0.875, + "learning_rate": 5.162868690504916e-05, + "loss": 0.8932, + "step": 6001 + }, + { + "epoch": 0.9462263880528564, + "grad_norm": 0.9921875, + "learning_rate": 5.162465763997595e-05, + "loss": 0.932, + "step": 6002 + }, + { + "epoch": 0.9463840399002493, + "grad_norm": 1.0078125, + "learning_rate": 5.162062847743152e-05, + "loss": 1.0426, + "step": 6003 + }, + { + "epoch": 0.9465416917476424, + "grad_norm": 0.94140625, + "learning_rate": 5.161659941742436e-05, + "loss": 1.0108, + "step": 6004 + }, + { + "epoch": 0.9466993435950354, + "grad_norm": 0.9609375, + "learning_rate": 5.161257045996308e-05, + "loss": 0.8938, + "step": 6005 + }, + { + "epoch": 0.9468569954424284, + "grad_norm": 0.98046875, + "learning_rate": 5.160854160505622e-05, + "loss": 1.0057, + "step": 6006 + }, + { + "epoch": 0.9470146472898214, + "grad_norm": 1.015625, + "learning_rate": 5.160451285271226e-05, + "loss": 1.1199, + "step": 6007 + }, + { + "epoch": 0.9471722991372145, + "grad_norm": 1.1875, + "learning_rate": 5.160048420293978e-05, + "loss": 0.8967, + "step": 6008 + }, + { + "epoch": 0.9473299509846075, + "grad_norm": 1.0234375, + "learning_rate": 5.159645565574727e-05, + "loss": 0.8131, + "step": 6009 + }, + { + "epoch": 0.9474876028320005, + "grad_norm": 1.09375, + "learning_rate": 5.159242721114334e-05, + "loss": 0.9427, + "step": 6010 + }, + { + "epoch": 0.9476452546793934, + "grad_norm": 0.96875, + "learning_rate": 5.158839886913651e-05, + "loss": 1.0102, + "step": 6011 + }, + { + "epoch": 0.9478029065267864, + "grad_norm": 1.0546875, + "learning_rate": 5.158437062973529e-05, + "loss": 1.1484, + "step": 6012 + }, + { + "epoch": 0.9479605583741795, + "grad_norm": 0.99609375, + "learning_rate": 5.158034249294823e-05, + "loss": 0.967, + "step": 6013 + }, + { + "epoch": 0.9481182102215725, + "grad_norm": 1.1796875, + "learning_rate": 5.1576314458783816e-05, + "loss": 0.977, + "step": 6014 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 0.9453125, + "learning_rate": 5.157228652725069e-05, + "loss": 1.1578, + "step": 6015 + }, + { + "epoch": 0.9484335139163586, + "grad_norm": 1.0390625, + "learning_rate": 5.156825869835733e-05, + "loss": 1.125, + "step": 6016 + }, + { + "epoch": 0.9485911657637516, + "grad_norm": 0.90625, + "learning_rate": 5.156423097211227e-05, + "loss": 0.9617, + "step": 6017 + }, + { + "epoch": 0.9487488176111446, + "grad_norm": 1.0859375, + "learning_rate": 5.156020334852407e-05, + "loss": 1.3923, + "step": 6018 + }, + { + "epoch": 0.9489064694585375, + "grad_norm": 0.984375, + "learning_rate": 5.1556175827601196e-05, + "loss": 0.9741, + "step": 6019 + }, + { + "epoch": 0.9490641213059305, + "grad_norm": 0.93359375, + "learning_rate": 5.155214840935228e-05, + "loss": 0.8716, + "step": 6020 + }, + { + "epoch": 0.9492217731533236, + "grad_norm": 1.125, + "learning_rate": 5.1548121093785825e-05, + "loss": 1.0271, + "step": 6021 + }, + { + "epoch": 0.9493794250007166, + "grad_norm": 0.99609375, + "learning_rate": 5.154409388091035e-05, + "loss": 1.0327, + "step": 6022 + }, + { + "epoch": 0.9495370768481096, + "grad_norm": 0.9140625, + "learning_rate": 5.1540066770734406e-05, + "loss": 0.9956, + "step": 6023 + }, + { + "epoch": 0.9496947286955026, + "grad_norm": 0.9921875, + "learning_rate": 5.1536039763266474e-05, + "loss": 0.8596, + "step": 6024 + }, + { + "epoch": 0.9498523805428957, + "grad_norm": 0.90234375, + "learning_rate": 5.153201285851519e-05, + "loss": 1.0377, + "step": 6025 + }, + { + "epoch": 0.9500100323902887, + "grad_norm": 0.93359375, + "learning_rate": 5.152798605648901e-05, + "loss": 0.8718, + "step": 6026 + }, + { + "epoch": 0.9501676842376816, + "grad_norm": 0.89453125, + "learning_rate": 5.1523959357196516e-05, + "loss": 1.0539, + "step": 6027 + }, + { + "epoch": 0.9503253360850746, + "grad_norm": 0.93359375, + "learning_rate": 5.1519932760646194e-05, + "loss": 0.9756, + "step": 6028 + }, + { + "epoch": 0.9504829879324677, + "grad_norm": 0.75390625, + "learning_rate": 5.1515906266846626e-05, + "loss": 0.7756, + "step": 6029 + }, + { + "epoch": 0.9506406397798607, + "grad_norm": 0.87890625, + "learning_rate": 5.151187987580631e-05, + "loss": 0.8525, + "step": 6030 + }, + { + "epoch": 0.9507982916272537, + "grad_norm": 0.890625, + "learning_rate": 5.1507853587533806e-05, + "loss": 0.9808, + "step": 6031 + }, + { + "epoch": 0.9509559434746467, + "grad_norm": 0.91796875, + "learning_rate": 5.150382740203763e-05, + "loss": 0.9077, + "step": 6032 + }, + { + "epoch": 0.9511135953220398, + "grad_norm": 0.8984375, + "learning_rate": 5.149980131932631e-05, + "loss": 0.8928, + "step": 6033 + }, + { + "epoch": 0.9512712471694328, + "grad_norm": 1.046875, + "learning_rate": 5.149577533940836e-05, + "loss": 0.8483, + "step": 6034 + }, + { + "epoch": 0.9514288990168257, + "grad_norm": 0.7890625, + "learning_rate": 5.149174946229238e-05, + "loss": 0.8436, + "step": 6035 + }, + { + "epoch": 0.9515865508642187, + "grad_norm": 0.94140625, + "learning_rate": 5.1487723687986866e-05, + "loss": 0.9597, + "step": 6036 + }, + { + "epoch": 0.9517442027116118, + "grad_norm": 0.9921875, + "learning_rate": 5.148369801650035e-05, + "loss": 1.1048, + "step": 6037 + }, + { + "epoch": 0.9519018545590048, + "grad_norm": 0.88671875, + "learning_rate": 5.1479672447841354e-05, + "loss": 0.9719, + "step": 6038 + }, + { + "epoch": 0.9520595064063978, + "grad_norm": 1.0078125, + "learning_rate": 5.147564698201838e-05, + "loss": 0.7895, + "step": 6039 + }, + { + "epoch": 0.9522171582537908, + "grad_norm": 1.0, + "learning_rate": 5.1471621619040044e-05, + "loss": 1.1075, + "step": 6040 + }, + { + "epoch": 0.9523748101011839, + "grad_norm": 0.94140625, + "learning_rate": 5.146759635891483e-05, + "loss": 0.9394, + "step": 6041 + }, + { + "epoch": 0.9525324619485769, + "grad_norm": 0.96484375, + "learning_rate": 5.1463571201651264e-05, + "loss": 0.8723, + "step": 6042 + }, + { + "epoch": 0.9526901137959698, + "grad_norm": 0.9453125, + "learning_rate": 5.14595461472579e-05, + "loss": 0.8326, + "step": 6043 + }, + { + "epoch": 0.9528477656433628, + "grad_norm": 0.92578125, + "learning_rate": 5.145552119574321e-05, + "loss": 1.0257, + "step": 6044 + }, + { + "epoch": 0.9530054174907558, + "grad_norm": 1.1328125, + "learning_rate": 5.145149634711579e-05, + "loss": 1.1076, + "step": 6045 + }, + { + "epoch": 0.9531630693381489, + "grad_norm": 1.03125, + "learning_rate": 5.144747160138417e-05, + "loss": 1.1927, + "step": 6046 + }, + { + "epoch": 0.9533207211855419, + "grad_norm": 1.5390625, + "learning_rate": 5.144344695855685e-05, + "loss": 1.0061, + "step": 6047 + }, + { + "epoch": 0.9534783730329349, + "grad_norm": 1.015625, + "learning_rate": 5.143942241864237e-05, + "loss": 0.9752, + "step": 6048 + }, + { + "epoch": 0.953636024880328, + "grad_norm": 0.875, + "learning_rate": 5.143539798164923e-05, + "loss": 0.8646, + "step": 6049 + }, + { + "epoch": 0.953793676727721, + "grad_norm": 1.0859375, + "learning_rate": 5.143137364758601e-05, + "loss": 0.999, + "step": 6050 + }, + { + "epoch": 0.9539513285751139, + "grad_norm": 0.9765625, + "learning_rate": 5.142734941646123e-05, + "loss": 0.89, + "step": 6051 + }, + { + "epoch": 0.9541089804225069, + "grad_norm": 0.96875, + "learning_rate": 5.1423325288283396e-05, + "loss": 0.9009, + "step": 6052 + }, + { + "epoch": 0.9542666322698999, + "grad_norm": 0.97265625, + "learning_rate": 5.141930126306105e-05, + "loss": 0.9557, + "step": 6053 + }, + { + "epoch": 0.954424284117293, + "grad_norm": 1.234375, + "learning_rate": 5.141527734080272e-05, + "loss": 0.7477, + "step": 6054 + }, + { + "epoch": 0.954581935964686, + "grad_norm": 0.9296875, + "learning_rate": 5.14112535215169e-05, + "loss": 1.0028, + "step": 6055 + }, + { + "epoch": 0.954739587812079, + "grad_norm": 1.0546875, + "learning_rate": 5.140722980521218e-05, + "loss": 1.1223, + "step": 6056 + }, + { + "epoch": 0.954897239659472, + "grad_norm": 0.859375, + "learning_rate": 5.140320619189707e-05, + "loss": 0.8888, + "step": 6057 + }, + { + "epoch": 0.9550548915068651, + "grad_norm": 1.0546875, + "learning_rate": 5.139918268158008e-05, + "loss": 1.1492, + "step": 6058 + }, + { + "epoch": 0.955212543354258, + "grad_norm": 0.8671875, + "learning_rate": 5.139515927426974e-05, + "loss": 0.8697, + "step": 6059 + }, + { + "epoch": 0.955370195201651, + "grad_norm": 0.9375, + "learning_rate": 5.139113596997456e-05, + "loss": 0.9028, + "step": 6060 + }, + { + "epoch": 0.955527847049044, + "grad_norm": 0.89453125, + "learning_rate": 5.13871127687031e-05, + "loss": 0.8188, + "step": 6061 + }, + { + "epoch": 0.9556854988964371, + "grad_norm": 0.90234375, + "learning_rate": 5.1383089670463904e-05, + "loss": 0.9077, + "step": 6062 + }, + { + "epoch": 0.9558431507438301, + "grad_norm": 0.9296875, + "learning_rate": 5.137906667526545e-05, + "loss": 0.8774, + "step": 6063 + }, + { + "epoch": 0.9560008025912231, + "grad_norm": 1.0234375, + "learning_rate": 5.1375043783116295e-05, + "loss": 0.9426, + "step": 6064 + }, + { + "epoch": 0.9561584544386161, + "grad_norm": 0.87890625, + "learning_rate": 5.137102099402491e-05, + "loss": 1.0485, + "step": 6065 + }, + { + "epoch": 0.9563161062860092, + "grad_norm": 0.90625, + "learning_rate": 5.1366998307999915e-05, + "loss": 0.9104, + "step": 6066 + }, + { + "epoch": 0.9564737581334021, + "grad_norm": 0.87109375, + "learning_rate": 5.1362975725049775e-05, + "loss": 0.7291, + "step": 6067 + }, + { + "epoch": 0.9566314099807951, + "grad_norm": 1.0234375, + "learning_rate": 5.135895324518305e-05, + "loss": 1.1182, + "step": 6068 + }, + { + "epoch": 0.9567890618281881, + "grad_norm": 1.09375, + "learning_rate": 5.1354930868408224e-05, + "loss": 1.1858, + "step": 6069 + }, + { + "epoch": 0.9569467136755812, + "grad_norm": 0.98046875, + "learning_rate": 5.1350908594733835e-05, + "loss": 0.8538, + "step": 6070 + }, + { + "epoch": 0.9571043655229742, + "grad_norm": 0.890625, + "learning_rate": 5.134688642416844e-05, + "loss": 0.9835, + "step": 6071 + }, + { + "epoch": 0.9572620173703672, + "grad_norm": 1.015625, + "learning_rate": 5.1342864356720486e-05, + "loss": 1.1581, + "step": 6072 + }, + { + "epoch": 0.9574196692177602, + "grad_norm": 2.609375, + "learning_rate": 5.133884239239859e-05, + "loss": 1.1422, + "step": 6073 + }, + { + "epoch": 0.9575773210651533, + "grad_norm": 0.921875, + "learning_rate": 5.1334820531211234e-05, + "loss": 0.9639, + "step": 6074 + }, + { + "epoch": 0.9577349729125462, + "grad_norm": 0.921875, + "learning_rate": 5.1330798773166946e-05, + "loss": 0.8985, + "step": 6075 + }, + { + "epoch": 0.9578926247599392, + "grad_norm": 0.984375, + "learning_rate": 5.132677711827425e-05, + "loss": 0.8634, + "step": 6076 + }, + { + "epoch": 0.9580502766073322, + "grad_norm": 0.9921875, + "learning_rate": 5.132275556654166e-05, + "loss": 0.9618, + "step": 6077 + }, + { + "epoch": 0.9582079284547252, + "grad_norm": 0.859375, + "learning_rate": 5.131873411797772e-05, + "loss": 0.9962, + "step": 6078 + }, + { + "epoch": 0.9583655803021183, + "grad_norm": 0.94140625, + "learning_rate": 5.131471277259092e-05, + "loss": 0.9375, + "step": 6079 + }, + { + "epoch": 0.9585232321495113, + "grad_norm": 0.95703125, + "learning_rate": 5.131069153038979e-05, + "loss": 0.9513, + "step": 6080 + }, + { + "epoch": 0.9586808839969043, + "grad_norm": 0.88671875, + "learning_rate": 5.130667039138288e-05, + "loss": 0.8425, + "step": 6081 + }, + { + "epoch": 0.9588385358442973, + "grad_norm": 1.0078125, + "learning_rate": 5.130264935557871e-05, + "loss": 0.9104, + "step": 6082 + }, + { + "epoch": 0.9589961876916903, + "grad_norm": 1.0078125, + "learning_rate": 5.1298628422985804e-05, + "loss": 1.2201, + "step": 6083 + }, + { + "epoch": 0.9591538395390833, + "grad_norm": 0.921875, + "learning_rate": 5.129460759361265e-05, + "loss": 1.0961, + "step": 6084 + }, + { + "epoch": 0.9593114913864763, + "grad_norm": 0.99609375, + "learning_rate": 5.1290586867467775e-05, + "loss": 0.8757, + "step": 6085 + }, + { + "epoch": 0.9594691432338693, + "grad_norm": 0.91796875, + "learning_rate": 5.128656624455974e-05, + "loss": 0.8503, + "step": 6086 + }, + { + "epoch": 0.9596267950812624, + "grad_norm": 0.99609375, + "learning_rate": 5.128254572489705e-05, + "loss": 0.9588, + "step": 6087 + }, + { + "epoch": 0.9597844469286554, + "grad_norm": 0.87890625, + "learning_rate": 5.127852530848821e-05, + "loss": 0.8888, + "step": 6088 + }, + { + "epoch": 0.9599420987760484, + "grad_norm": 0.9765625, + "learning_rate": 5.1274504995341765e-05, + "loss": 0.8162, + "step": 6089 + }, + { + "epoch": 0.9600997506234414, + "grad_norm": 1.0703125, + "learning_rate": 5.127048478546617e-05, + "loss": 0.9641, + "step": 6090 + }, + { + "epoch": 0.9602574024708344, + "grad_norm": 0.9921875, + "learning_rate": 5.126646467887004e-05, + "loss": 1.0423, + "step": 6091 + }, + { + "epoch": 0.9604150543182274, + "grad_norm": 0.98828125, + "learning_rate": 5.1262444675561846e-05, + "loss": 0.9312, + "step": 6092 + }, + { + "epoch": 0.9605727061656204, + "grad_norm": 0.890625, + "learning_rate": 5.125842477555014e-05, + "loss": 0.7581, + "step": 6093 + }, + { + "epoch": 0.9607303580130134, + "grad_norm": 0.9921875, + "learning_rate": 5.1254404978843396e-05, + "loss": 0.994, + "step": 6094 + }, + { + "epoch": 0.9608880098604065, + "grad_norm": 1.015625, + "learning_rate": 5.125038528545012e-05, + "loss": 1.0063, + "step": 6095 + }, + { + "epoch": 0.9610456617077995, + "grad_norm": 1.0625, + "learning_rate": 5.12463656953789e-05, + "loss": 0.9376, + "step": 6096 + }, + { + "epoch": 0.9612033135551925, + "grad_norm": 1.0, + "learning_rate": 5.1242346208638236e-05, + "loss": 0.9376, + "step": 6097 + }, + { + "epoch": 0.9613609654025855, + "grad_norm": 1.0078125, + "learning_rate": 5.123832682523661e-05, + "loss": 0.9298, + "step": 6098 + }, + { + "epoch": 0.9615186172499784, + "grad_norm": 0.97265625, + "learning_rate": 5.123430754518258e-05, + "loss": 1.0575, + "step": 6099 + }, + { + "epoch": 0.9616762690973715, + "grad_norm": 0.984375, + "learning_rate": 5.1230288368484604e-05, + "loss": 1.0552, + "step": 6100 + }, + { + "epoch": 0.9618339209447645, + "grad_norm": 1.40625, + "learning_rate": 5.122626929515128e-05, + "loss": 1.1064, + "step": 6101 + }, + { + "epoch": 0.9619915727921575, + "grad_norm": 0.9609375, + "learning_rate": 5.12222503251911e-05, + "loss": 1.0491, + "step": 6102 + }, + { + "epoch": 0.9621492246395505, + "grad_norm": 0.94921875, + "learning_rate": 5.121823145861257e-05, + "loss": 0.8215, + "step": 6103 + }, + { + "epoch": 0.9623068764869436, + "grad_norm": 1.171875, + "learning_rate": 5.12142126954242e-05, + "loss": 1.0392, + "step": 6104 + }, + { + "epoch": 0.9624645283343366, + "grad_norm": 0.94921875, + "learning_rate": 5.1210194035634496e-05, + "loss": 0.7783, + "step": 6105 + }, + { + "epoch": 0.9626221801817296, + "grad_norm": 0.9296875, + "learning_rate": 5.120617547925202e-05, + "loss": 0.8902, + "step": 6106 + }, + { + "epoch": 0.9627798320291227, + "grad_norm": 0.95703125, + "learning_rate": 5.120215702628527e-05, + "loss": 1.0132, + "step": 6107 + }, + { + "epoch": 0.9629374838765156, + "grad_norm": 0.8984375, + "learning_rate": 5.119813867674276e-05, + "loss": 0.9804, + "step": 6108 + }, + { + "epoch": 0.9630951357239086, + "grad_norm": 0.99609375, + "learning_rate": 5.119412043063301e-05, + "loss": 1.3122, + "step": 6109 + }, + { + "epoch": 0.9632527875713016, + "grad_norm": 0.9375, + "learning_rate": 5.119010228796448e-05, + "loss": 0.8689, + "step": 6110 + }, + { + "epoch": 0.9634104394186946, + "grad_norm": 0.82421875, + "learning_rate": 5.118608424874579e-05, + "loss": 0.9412, + "step": 6111 + }, + { + "epoch": 0.9635680912660877, + "grad_norm": 1.0, + "learning_rate": 5.118206631298541e-05, + "loss": 1.2311, + "step": 6112 + }, + { + "epoch": 0.9637257431134807, + "grad_norm": 1.765625, + "learning_rate": 5.117804848069183e-05, + "loss": 1.0583, + "step": 6113 + }, + { + "epoch": 0.9638833949608737, + "grad_norm": 1.4921875, + "learning_rate": 5.1174030751873604e-05, + "loss": 1.1644, + "step": 6114 + }, + { + "epoch": 0.9640410468082667, + "grad_norm": 1.0859375, + "learning_rate": 5.1170013126539216e-05, + "loss": 1.0145, + "step": 6115 + }, + { + "epoch": 0.9641986986556597, + "grad_norm": 0.96484375, + "learning_rate": 5.1165995604697205e-05, + "loss": 1.0502, + "step": 6116 + }, + { + "epoch": 0.9643563505030527, + "grad_norm": 0.95703125, + "learning_rate": 5.1161978186356067e-05, + "loss": 1.0838, + "step": 6117 + }, + { + "epoch": 0.9645140023504457, + "grad_norm": 0.97265625, + "learning_rate": 5.115796087152433e-05, + "loss": 1.1146, + "step": 6118 + }, + { + "epoch": 0.9646716541978387, + "grad_norm": 0.96484375, + "learning_rate": 5.11539436602105e-05, + "loss": 0.8063, + "step": 6119 + }, + { + "epoch": 0.9648293060452318, + "grad_norm": 0.94921875, + "learning_rate": 5.114992655242306e-05, + "loss": 1.0924, + "step": 6120 + }, + { + "epoch": 0.9649869578926248, + "grad_norm": 0.9765625, + "learning_rate": 5.11459095481706e-05, + "loss": 0.8917, + "step": 6121 + }, + { + "epoch": 0.9651446097400178, + "grad_norm": 1.0078125, + "learning_rate": 5.114189264746159e-05, + "loss": 0.8554, + "step": 6122 + }, + { + "epoch": 0.9653022615874108, + "grad_norm": 0.83203125, + "learning_rate": 5.113787585030454e-05, + "loss": 0.8014, + "step": 6123 + }, + { + "epoch": 0.9654599134348038, + "grad_norm": 0.9140625, + "learning_rate": 5.113385915670796e-05, + "loss": 0.8642, + "step": 6124 + }, + { + "epoch": 0.9656175652821968, + "grad_norm": 1.078125, + "learning_rate": 5.112984256668035e-05, + "loss": 0.972, + "step": 6125 + }, + { + "epoch": 0.9657752171295898, + "grad_norm": 1.0859375, + "learning_rate": 5.1125826080230285e-05, + "loss": 0.9072, + "step": 6126 + }, + { + "epoch": 0.9659328689769828, + "grad_norm": 1.0, + "learning_rate": 5.112180969736623e-05, + "loss": 0.9733, + "step": 6127 + }, + { + "epoch": 0.9660905208243759, + "grad_norm": 0.890625, + "learning_rate": 5.1117793418096704e-05, + "loss": 0.9915, + "step": 6128 + }, + { + "epoch": 0.9662481726717689, + "grad_norm": 1.0078125, + "learning_rate": 5.111377724243023e-05, + "loss": 1.1649, + "step": 6129 + }, + { + "epoch": 0.9664058245191619, + "grad_norm": 0.99609375, + "learning_rate": 5.110976117037527e-05, + "loss": 1.0184, + "step": 6130 + }, + { + "epoch": 0.9665634763665549, + "grad_norm": 0.93359375, + "learning_rate": 5.110574520194041e-05, + "loss": 0.9566, + "step": 6131 + }, + { + "epoch": 0.9667211282139478, + "grad_norm": 1.0078125, + "learning_rate": 5.110172933713413e-05, + "loss": 0.8872, + "step": 6132 + }, + { + "epoch": 0.9668787800613409, + "grad_norm": 0.91015625, + "learning_rate": 5.109771357596495e-05, + "loss": 0.9252, + "step": 6133 + }, + { + "epoch": 0.9670364319087339, + "grad_norm": 0.97265625, + "learning_rate": 5.109369791844136e-05, + "loss": 1.0443, + "step": 6134 + }, + { + "epoch": 0.9671940837561269, + "grad_norm": 0.84375, + "learning_rate": 5.108968236457185e-05, + "loss": 0.8595, + "step": 6135 + }, + { + "epoch": 0.96735173560352, + "grad_norm": 1.9765625, + "learning_rate": 5.1085666914364983e-05, + "loss": 1.0237, + "step": 6136 + }, + { + "epoch": 0.967509387450913, + "grad_norm": 0.953125, + "learning_rate": 5.108165156782927e-05, + "loss": 1.0559, + "step": 6137 + }, + { + "epoch": 0.967667039298306, + "grad_norm": 1.0703125, + "learning_rate": 5.10776363249732e-05, + "loss": 1.3043, + "step": 6138 + }, + { + "epoch": 0.967824691145699, + "grad_norm": 0.90234375, + "learning_rate": 5.107362118580528e-05, + "loss": 0.9638, + "step": 6139 + }, + { + "epoch": 0.9679823429930919, + "grad_norm": 1.015625, + "learning_rate": 5.106960615033397e-05, + "loss": 0.8888, + "step": 6140 + }, + { + "epoch": 0.968139994840485, + "grad_norm": 0.9609375, + "learning_rate": 5.106559121856789e-05, + "loss": 0.9563, + "step": 6141 + }, + { + "epoch": 0.968297646687878, + "grad_norm": 0.8203125, + "learning_rate": 5.1061576390515474e-05, + "loss": 0.767, + "step": 6142 + }, + { + "epoch": 0.968455298535271, + "grad_norm": 1.25, + "learning_rate": 5.105756166618527e-05, + "loss": 1.1356, + "step": 6143 + }, + { + "epoch": 0.968612950382664, + "grad_norm": 1.0703125, + "learning_rate": 5.105354704558576e-05, + "loss": 1.0198, + "step": 6144 + }, + { + "epoch": 0.9687706022300571, + "grad_norm": 0.9609375, + "learning_rate": 5.104953252872542e-05, + "loss": 1.1681, + "step": 6145 + }, + { + "epoch": 0.9689282540774501, + "grad_norm": 1.03125, + "learning_rate": 5.1045518115612835e-05, + "loss": 1.0892, + "step": 6146 + }, + { + "epoch": 0.9690859059248431, + "grad_norm": 1.21875, + "learning_rate": 5.1041503806256474e-05, + "loss": 1.1484, + "step": 6147 + }, + { + "epoch": 0.969243557772236, + "grad_norm": 0.8828125, + "learning_rate": 5.103748960066485e-05, + "loss": 0.9237, + "step": 6148 + }, + { + "epoch": 0.9694012096196291, + "grad_norm": 0.94140625, + "learning_rate": 5.103347549884647e-05, + "loss": 1.0899, + "step": 6149 + }, + { + "epoch": 0.9695588614670221, + "grad_norm": 0.921875, + "learning_rate": 5.1029461500809805e-05, + "loss": 1.0778, + "step": 6150 + }, + { + "epoch": 0.9697165133144151, + "grad_norm": 0.93359375, + "learning_rate": 5.102544760656343e-05, + "loss": 1.0235, + "step": 6151 + }, + { + "epoch": 0.9698741651618081, + "grad_norm": 0.8828125, + "learning_rate": 5.1021433816115814e-05, + "loss": 0.812, + "step": 6152 + }, + { + "epoch": 0.9700318170092012, + "grad_norm": 0.9765625, + "learning_rate": 5.1017420129475476e-05, + "loss": 1.0329, + "step": 6153 + }, + { + "epoch": 0.9701894688565942, + "grad_norm": 0.89453125, + "learning_rate": 5.101340654665092e-05, + "loss": 0.9798, + "step": 6154 + }, + { + "epoch": 0.9703471207039872, + "grad_norm": 0.9765625, + "learning_rate": 5.1009393067650645e-05, + "loss": 0.9775, + "step": 6155 + }, + { + "epoch": 0.9705047725513801, + "grad_norm": 0.87890625, + "learning_rate": 5.100537969248316e-05, + "loss": 0.766, + "step": 6156 + }, + { + "epoch": 0.9706624243987731, + "grad_norm": 1.375, + "learning_rate": 5.100136642115697e-05, + "loss": 1.1291, + "step": 6157 + }, + { + "epoch": 0.9708200762461662, + "grad_norm": 0.95703125, + "learning_rate": 5.099735325368056e-05, + "loss": 0.9208, + "step": 6158 + }, + { + "epoch": 0.9709777280935592, + "grad_norm": 0.9921875, + "learning_rate": 5.099334019006248e-05, + "loss": 1.126, + "step": 6159 + }, + { + "epoch": 0.9711353799409522, + "grad_norm": 5.125, + "learning_rate": 5.098932723031122e-05, + "loss": 1.0685, + "step": 6160 + }, + { + "epoch": 0.9712930317883453, + "grad_norm": 1.0390625, + "learning_rate": 5.098531437443528e-05, + "loss": 0.992, + "step": 6161 + }, + { + "epoch": 0.9714506836357383, + "grad_norm": 0.8671875, + "learning_rate": 5.0981301622443166e-05, + "loss": 0.7634, + "step": 6162 + }, + { + "epoch": 0.9716083354831313, + "grad_norm": 1.6484375, + "learning_rate": 5.097728897434337e-05, + "loss": 1.0612, + "step": 6163 + }, + { + "epoch": 0.9717659873305242, + "grad_norm": 1.1484375, + "learning_rate": 5.097327643014442e-05, + "loss": 1.0621, + "step": 6164 + }, + { + "epoch": 0.9719236391779172, + "grad_norm": 0.87890625, + "learning_rate": 5.0969263989854776e-05, + "loss": 0.8118, + "step": 6165 + }, + { + "epoch": 0.9720812910253103, + "grad_norm": 1.0703125, + "learning_rate": 5.0965251653483e-05, + "loss": 1.0723, + "step": 6166 + }, + { + "epoch": 0.9722389428727033, + "grad_norm": 0.9765625, + "learning_rate": 5.096123942103758e-05, + "loss": 0.9283, + "step": 6167 + }, + { + "epoch": 0.9723965947200963, + "grad_norm": 1.078125, + "learning_rate": 5.0957227292527e-05, + "loss": 1.0944, + "step": 6168 + }, + { + "epoch": 0.9725542465674893, + "grad_norm": 1.3828125, + "learning_rate": 5.0953215267959774e-05, + "loss": 0.7535, + "step": 6169 + }, + { + "epoch": 0.9727118984148824, + "grad_norm": 0.86328125, + "learning_rate": 5.094920334734438e-05, + "loss": 0.9668, + "step": 6170 + }, + { + "epoch": 0.9728695502622754, + "grad_norm": 1.0, + "learning_rate": 5.0945191530689374e-05, + "loss": 1.0312, + "step": 6171 + }, + { + "epoch": 0.9730272021096683, + "grad_norm": 0.921875, + "learning_rate": 5.094117981800324e-05, + "loss": 0.9909, + "step": 6172 + }, + { + "epoch": 0.9731848539570613, + "grad_norm": 1.03125, + "learning_rate": 5.093716820929446e-05, + "loss": 0.7702, + "step": 6173 + }, + { + "epoch": 0.9733425058044544, + "grad_norm": 0.91796875, + "learning_rate": 5.093315670457155e-05, + "loss": 0.9903, + "step": 6174 + }, + { + "epoch": 0.9735001576518474, + "grad_norm": 0.87109375, + "learning_rate": 5.092914530384296e-05, + "loss": 0.8161, + "step": 6175 + }, + { + "epoch": 0.9736578094992404, + "grad_norm": 1.015625, + "learning_rate": 5.092513400711729e-05, + "loss": 0.9295, + "step": 6176 + }, + { + "epoch": 0.9738154613466334, + "grad_norm": 1.1328125, + "learning_rate": 5.0921122814403e-05, + "loss": 0.9594, + "step": 6177 + }, + { + "epoch": 0.9739731131940265, + "grad_norm": 4.59375, + "learning_rate": 5.091711172570859e-05, + "loss": 0.9644, + "step": 6178 + }, + { + "epoch": 0.9741307650414195, + "grad_norm": 0.8828125, + "learning_rate": 5.091310074104254e-05, + "loss": 0.7723, + "step": 6179 + }, + { + "epoch": 0.9742884168888124, + "grad_norm": 0.828125, + "learning_rate": 5.090908986041334e-05, + "loss": 0.7609, + "step": 6180 + }, + { + "epoch": 0.9744460687362054, + "grad_norm": 0.94140625, + "learning_rate": 5.0905079083829554e-05, + "loss": 1.0714, + "step": 6181 + }, + { + "epoch": 0.9746037205835985, + "grad_norm": 1.0703125, + "learning_rate": 5.090106841129965e-05, + "loss": 1.035, + "step": 6182 + }, + { + "epoch": 0.9747613724309915, + "grad_norm": 1.0390625, + "learning_rate": 5.089705784283212e-05, + "loss": 0.9035, + "step": 6183 + }, + { + "epoch": 0.9749190242783845, + "grad_norm": 0.87890625, + "learning_rate": 5.089304737843547e-05, + "loss": 0.8264, + "step": 6184 + }, + { + "epoch": 0.9750766761257775, + "grad_norm": 0.90625, + "learning_rate": 5.088903701811816e-05, + "loss": 0.8915, + "step": 6185 + }, + { + "epoch": 0.9752343279731706, + "grad_norm": 1.125, + "learning_rate": 5.088502676188878e-05, + "loss": 1.0113, + "step": 6186 + }, + { + "epoch": 0.9753919798205636, + "grad_norm": 0.9140625, + "learning_rate": 5.088101660975575e-05, + "loss": 0.984, + "step": 6187 + }, + { + "epoch": 0.9755496316679565, + "grad_norm": 0.9453125, + "learning_rate": 5.087700656172762e-05, + "loss": 0.8693, + "step": 6188 + }, + { + "epoch": 0.9757072835153495, + "grad_norm": 0.921875, + "learning_rate": 5.087299661781286e-05, + "loss": 0.8011, + "step": 6189 + }, + { + "epoch": 0.9758649353627425, + "grad_norm": 0.9375, + "learning_rate": 5.086898677801995e-05, + "loss": 1.038, + "step": 6190 + }, + { + "epoch": 0.9760225872101356, + "grad_norm": 0.95703125, + "learning_rate": 5.086497704235743e-05, + "loss": 1.0494, + "step": 6191 + }, + { + "epoch": 0.9761802390575286, + "grad_norm": 0.875, + "learning_rate": 5.086096741083379e-05, + "loss": 0.8913, + "step": 6192 + }, + { + "epoch": 0.9763378909049216, + "grad_norm": 1.0625, + "learning_rate": 5.085695788345753e-05, + "loss": 0.9412, + "step": 6193 + }, + { + "epoch": 0.9764955427523146, + "grad_norm": 0.94921875, + "learning_rate": 5.0852948460237134e-05, + "loss": 0.8567, + "step": 6194 + }, + { + "epoch": 0.9766531945997077, + "grad_norm": 1.0546875, + "learning_rate": 5.084893914118111e-05, + "loss": 1.1136, + "step": 6195 + }, + { + "epoch": 0.9768108464471006, + "grad_norm": 1.03125, + "learning_rate": 5.08449299262979e-05, + "loss": 0.8274, + "step": 6196 + }, + { + "epoch": 0.9769684982944936, + "grad_norm": 1.0078125, + "learning_rate": 5.08409208155961e-05, + "loss": 0.9139, + "step": 6197 + }, + { + "epoch": 0.9771261501418866, + "grad_norm": 1.078125, + "learning_rate": 5.083691180908416e-05, + "loss": 0.936, + "step": 6198 + }, + { + "epoch": 0.9772838019892797, + "grad_norm": 2.96875, + "learning_rate": 5.083290290677056e-05, + "loss": 1.1384, + "step": 6199 + }, + { + "epoch": 0.9774414538366727, + "grad_norm": 1.5390625, + "learning_rate": 5.0828894108663825e-05, + "loss": 0.9161, + "step": 6200 + }, + { + "epoch": 0.9775991056840657, + "grad_norm": 1.03125, + "learning_rate": 5.082488541477244e-05, + "loss": 0.9197, + "step": 6201 + }, + { + "epoch": 0.9777567575314587, + "grad_norm": 0.85546875, + "learning_rate": 5.0820876825104905e-05, + "loss": 0.7796, + "step": 6202 + }, + { + "epoch": 0.9779144093788518, + "grad_norm": 0.98046875, + "learning_rate": 5.0816868339669696e-05, + "loss": 1.2388, + "step": 6203 + }, + { + "epoch": 0.9780720612262447, + "grad_norm": 0.9296875, + "learning_rate": 5.0812859958475335e-05, + "loss": 0.8058, + "step": 6204 + }, + { + "epoch": 0.9782297130736377, + "grad_norm": 0.921875, + "learning_rate": 5.080885168153029e-05, + "loss": 0.8904, + "step": 6205 + }, + { + "epoch": 0.9783873649210307, + "grad_norm": 0.8828125, + "learning_rate": 5.0804843508843045e-05, + "loss": 0.9233, + "step": 6206 + }, + { + "epoch": 0.9785450167684238, + "grad_norm": 1.21875, + "learning_rate": 5.080083544042216e-05, + "loss": 0.895, + "step": 6207 + }, + { + "epoch": 0.9787026686158168, + "grad_norm": 1.046875, + "learning_rate": 5.079682747627609e-05, + "loss": 0.9837, + "step": 6208 + }, + { + "epoch": 0.9788603204632098, + "grad_norm": 1.0390625, + "learning_rate": 5.079281961641333e-05, + "loss": 0.958, + "step": 6209 + }, + { + "epoch": 0.9790179723106028, + "grad_norm": 1.09375, + "learning_rate": 5.078881186084239e-05, + "loss": 0.8657, + "step": 6210 + }, + { + "epoch": 0.9791756241579959, + "grad_norm": 1.0, + "learning_rate": 5.07848042095717e-05, + "loss": 0.9161, + "step": 6211 + }, + { + "epoch": 0.9793332760053888, + "grad_norm": 1.0234375, + "learning_rate": 5.078079666260984e-05, + "loss": 0.8087, + "step": 6212 + }, + { + "epoch": 0.9794909278527818, + "grad_norm": 0.98046875, + "learning_rate": 5.077678921996527e-05, + "loss": 1.0254, + "step": 6213 + }, + { + "epoch": 0.9796485797001748, + "grad_norm": 1.515625, + "learning_rate": 5.077278188164649e-05, + "loss": 0.9125, + "step": 6214 + }, + { + "epoch": 0.9798062315475679, + "grad_norm": 0.8984375, + "learning_rate": 5.0768774647661974e-05, + "loss": 0.8898, + "step": 6215 + }, + { + "epoch": 0.9799638833949609, + "grad_norm": 0.8828125, + "learning_rate": 5.076476751802019e-05, + "loss": 0.9189, + "step": 6216 + }, + { + "epoch": 0.9801215352423539, + "grad_norm": 0.921875, + "learning_rate": 5.076076049272971e-05, + "loss": 0.9931, + "step": 6217 + }, + { + "epoch": 0.9802791870897469, + "grad_norm": 0.9921875, + "learning_rate": 5.075675357179899e-05, + "loss": 0.8888, + "step": 6218 + }, + { + "epoch": 0.98043683893714, + "grad_norm": 0.95703125, + "learning_rate": 5.07527467552365e-05, + "loss": 1.127, + "step": 6219 + }, + { + "epoch": 0.9805944907845329, + "grad_norm": 0.9765625, + "learning_rate": 5.074874004305077e-05, + "loss": 1.1695, + "step": 6220 + }, + { + "epoch": 0.9807521426319259, + "grad_norm": 0.9296875, + "learning_rate": 5.074473343525022e-05, + "loss": 1.1037, + "step": 6221 + }, + { + "epoch": 0.9809097944793189, + "grad_norm": 0.91796875, + "learning_rate": 5.074072693184342e-05, + "loss": 0.9835, + "step": 6222 + }, + { + "epoch": 0.981067446326712, + "grad_norm": 2.40625, + "learning_rate": 5.073672053283884e-05, + "loss": 0.9306, + "step": 6223 + }, + { + "epoch": 0.981225098174105, + "grad_norm": 1.0546875, + "learning_rate": 5.073271423824497e-05, + "loss": 1.0804, + "step": 6224 + }, + { + "epoch": 0.981382750021498, + "grad_norm": 0.828125, + "learning_rate": 5.072870804807031e-05, + "loss": 0.9123, + "step": 6225 + }, + { + "epoch": 0.981540401868891, + "grad_norm": 0.953125, + "learning_rate": 5.0724701962323274e-05, + "loss": 0.9612, + "step": 6226 + }, + { + "epoch": 0.981698053716284, + "grad_norm": 0.953125, + "learning_rate": 5.0720695981012455e-05, + "loss": 1.0878, + "step": 6227 + }, + { + "epoch": 0.981855705563677, + "grad_norm": 0.875, + "learning_rate": 5.071669010414633e-05, + "loss": 0.827, + "step": 6228 + }, + { + "epoch": 0.98201335741107, + "grad_norm": 0.8671875, + "learning_rate": 5.071268433173333e-05, + "loss": 0.8911, + "step": 6229 + }, + { + "epoch": 0.982171009258463, + "grad_norm": 0.921875, + "learning_rate": 5.0708678663781995e-05, + "loss": 0.9276, + "step": 6230 + }, + { + "epoch": 0.982328661105856, + "grad_norm": 0.9453125, + "learning_rate": 5.070467310030076e-05, + "loss": 0.9871, + "step": 6231 + }, + { + "epoch": 0.9824863129532491, + "grad_norm": 1.0234375, + "learning_rate": 5.0700667641298196e-05, + "loss": 0.9789, + "step": 6232 + }, + { + "epoch": 0.9826439648006421, + "grad_norm": 0.98828125, + "learning_rate": 5.069666228678274e-05, + "loss": 0.9062, + "step": 6233 + }, + { + "epoch": 0.9828016166480351, + "grad_norm": 0.96484375, + "learning_rate": 5.069265703676289e-05, + "loss": 0.8797, + "step": 6234 + }, + { + "epoch": 0.9829592684954281, + "grad_norm": 0.98046875, + "learning_rate": 5.068865189124714e-05, + "loss": 0.943, + "step": 6235 + }, + { + "epoch": 0.983116920342821, + "grad_norm": 0.8671875, + "learning_rate": 5.0684646850243946e-05, + "loss": 1.0768, + "step": 6236 + }, + { + "epoch": 0.9832745721902141, + "grad_norm": 0.953125, + "learning_rate": 5.068064191376185e-05, + "loss": 0.8451, + "step": 6237 + }, + { + "epoch": 0.9834322240376071, + "grad_norm": 1.0078125, + "learning_rate": 5.067663708180932e-05, + "loss": 0.9286, + "step": 6238 + }, + { + "epoch": 0.9835898758850001, + "grad_norm": 0.87109375, + "learning_rate": 5.067263235439483e-05, + "loss": 0.8852, + "step": 6239 + }, + { + "epoch": 0.9837475277323932, + "grad_norm": 0.91015625, + "learning_rate": 5.066862773152687e-05, + "loss": 1.0036, + "step": 6240 + }, + { + "epoch": 0.9839051795797862, + "grad_norm": 0.90625, + "learning_rate": 5.0664623213213947e-05, + "loss": 1.022, + "step": 6241 + }, + { + "epoch": 0.9840628314271792, + "grad_norm": 0.828125, + "learning_rate": 5.066061879946453e-05, + "loss": 0.7765, + "step": 6242 + }, + { + "epoch": 0.9842204832745722, + "grad_norm": 0.95703125, + "learning_rate": 5.065661449028709e-05, + "loss": 0.9773, + "step": 6243 + }, + { + "epoch": 0.9843781351219651, + "grad_norm": 1.03125, + "learning_rate": 5.065261028569015e-05, + "loss": 1.1496, + "step": 6244 + }, + { + "epoch": 0.9845357869693582, + "grad_norm": 0.90625, + "learning_rate": 5.064860618568219e-05, + "loss": 0.8877, + "step": 6245 + }, + { + "epoch": 0.9846934388167512, + "grad_norm": 0.92578125, + "learning_rate": 5.064460219027169e-05, + "loss": 0.9299, + "step": 6246 + }, + { + "epoch": 0.9848510906641442, + "grad_norm": 0.92578125, + "learning_rate": 5.064059829946715e-05, + "loss": 0.9905, + "step": 6247 + }, + { + "epoch": 0.9850087425115372, + "grad_norm": 1.0390625, + "learning_rate": 5.0636594513277015e-05, + "loss": 1.1631, + "step": 6248 + }, + { + "epoch": 0.9851663943589303, + "grad_norm": 0.96484375, + "learning_rate": 5.0632590831709816e-05, + "loss": 0.9508, + "step": 6249 + }, + { + "epoch": 0.9853240462063233, + "grad_norm": 0.9453125, + "learning_rate": 5.0628587254773995e-05, + "loss": 1.0704, + "step": 6250 + }, + { + "epoch": 0.9854816980537163, + "grad_norm": 1.0, + "learning_rate": 5.062458378247804e-05, + "loss": 0.9032, + "step": 6251 + }, + { + "epoch": 0.9856393499011092, + "grad_norm": 1.0390625, + "learning_rate": 5.062058041483049e-05, + "loss": 1.0423, + "step": 6252 + }, + { + "epoch": 0.9857970017485023, + "grad_norm": 1.03125, + "learning_rate": 5.061657715183981e-05, + "loss": 0.9898, + "step": 6253 + }, + { + "epoch": 0.9859546535958953, + "grad_norm": 0.9921875, + "learning_rate": 5.0612573993514465e-05, + "loss": 1.1372, + "step": 6254 + }, + { + "epoch": 0.9861123054432883, + "grad_norm": 1.078125, + "learning_rate": 5.0608570939862933e-05, + "loss": 0.97, + "step": 6255 + }, + { + "epoch": 0.9862699572906813, + "grad_norm": 0.95703125, + "learning_rate": 5.060456799089369e-05, + "loss": 0.9411, + "step": 6256 + }, + { + "epoch": 0.9864276091380744, + "grad_norm": 1.0078125, + "learning_rate": 5.060056514661527e-05, + "loss": 1.067, + "step": 6257 + }, + { + "epoch": 0.9865852609854674, + "grad_norm": 0.93359375, + "learning_rate": 5.059656240703614e-05, + "loss": 0.9598, + "step": 6258 + }, + { + "epoch": 0.9867429128328604, + "grad_norm": 0.921875, + "learning_rate": 5.059255977216477e-05, + "loss": 0.912, + "step": 6259 + }, + { + "epoch": 0.9869005646802534, + "grad_norm": 0.953125, + "learning_rate": 5.058855724200964e-05, + "loss": 1.0736, + "step": 6260 + }, + { + "epoch": 0.9870582165276464, + "grad_norm": 1.0078125, + "learning_rate": 5.05845548165792e-05, + "loss": 1.1007, + "step": 6261 + }, + { + "epoch": 0.9872158683750394, + "grad_norm": 0.97265625, + "learning_rate": 5.0580552495882005e-05, + "loss": 0.9513, + "step": 6262 + }, + { + "epoch": 0.9873735202224324, + "grad_norm": 1.0546875, + "learning_rate": 5.0576550279926516e-05, + "loss": 0.9707, + "step": 6263 + }, + { + "epoch": 0.9875311720698254, + "grad_norm": 0.91015625, + "learning_rate": 5.05725481687212e-05, + "loss": 0.7737, + "step": 6264 + }, + { + "epoch": 0.9876888239172185, + "grad_norm": 0.9296875, + "learning_rate": 5.0568546162274546e-05, + "loss": 0.851, + "step": 6265 + }, + { + "epoch": 0.9878464757646115, + "grad_norm": 0.94140625, + "learning_rate": 5.056454426059498e-05, + "loss": 1.0445, + "step": 6266 + }, + { + "epoch": 0.9880041276120045, + "grad_norm": 0.99609375, + "learning_rate": 5.0560542463691083e-05, + "loss": 1.0755, + "step": 6267 + }, + { + "epoch": 0.9881617794593975, + "grad_norm": 1.5234375, + "learning_rate": 5.05565407715713e-05, + "loss": 1.0833, + "step": 6268 + }, + { + "epoch": 0.9883194313067905, + "grad_norm": 0.875, + "learning_rate": 5.0552539184244094e-05, + "loss": 1.0062, + "step": 6269 + }, + { + "epoch": 0.9884770831541835, + "grad_norm": 0.921875, + "learning_rate": 5.054853770171797e-05, + "loss": 0.9839, + "step": 6270 + }, + { + "epoch": 0.9886347350015765, + "grad_norm": 0.87109375, + "learning_rate": 5.054453632400133e-05, + "loss": 1.0254, + "step": 6271 + }, + { + "epoch": 0.9887923868489695, + "grad_norm": 0.8671875, + "learning_rate": 5.0540535051102776e-05, + "loss": 0.9012, + "step": 6272 + }, + { + "epoch": 0.9889500386963626, + "grad_norm": 0.953125, + "learning_rate": 5.0536533883030726e-05, + "loss": 0.9597, + "step": 6273 + }, + { + "epoch": 0.9891076905437556, + "grad_norm": 0.984375, + "learning_rate": 5.053253281979367e-05, + "loss": 1.0043, + "step": 6274 + }, + { + "epoch": 0.9892653423911486, + "grad_norm": 0.80859375, + "learning_rate": 5.052853186140008e-05, + "loss": 0.8044, + "step": 6275 + }, + { + "epoch": 0.9894229942385416, + "grad_norm": 0.9921875, + "learning_rate": 5.052453100785839e-05, + "loss": 0.9659, + "step": 6276 + }, + { + "epoch": 0.9895806460859345, + "grad_norm": 0.96875, + "learning_rate": 5.052053025917717e-05, + "loss": 0.9449, + "step": 6277 + }, + { + "epoch": 0.9897382979333276, + "grad_norm": 0.90625, + "learning_rate": 5.051652961536486e-05, + "loss": 0.9016, + "step": 6278 + }, + { + "epoch": 0.9898959497807206, + "grad_norm": 0.8359375, + "learning_rate": 5.051252907642994e-05, + "loss": 0.8199, + "step": 6279 + }, + { + "epoch": 0.9900536016281136, + "grad_norm": 1.0, + "learning_rate": 5.050852864238089e-05, + "loss": 0.7977, + "step": 6280 + }, + { + "epoch": 0.9902112534755066, + "grad_norm": 0.94921875, + "learning_rate": 5.0504528313226144e-05, + "loss": 1.1122, + "step": 6281 + }, + { + "epoch": 0.9903689053228997, + "grad_norm": 0.89453125, + "learning_rate": 5.050052808897425e-05, + "loss": 0.9225, + "step": 6282 + }, + { + "epoch": 0.9905265571702927, + "grad_norm": 0.94140625, + "learning_rate": 5.049652796963367e-05, + "loss": 0.9358, + "step": 6283 + }, + { + "epoch": 0.9906842090176857, + "grad_norm": 0.9375, + "learning_rate": 5.049252795521286e-05, + "loss": 1.2093, + "step": 6284 + }, + { + "epoch": 0.9908418608650786, + "grad_norm": 0.90625, + "learning_rate": 5.048852804572032e-05, + "loss": 0.9311, + "step": 6285 + }, + { + "epoch": 0.9909995127124717, + "grad_norm": 0.9296875, + "learning_rate": 5.0484528241164494e-05, + "loss": 0.9212, + "step": 6286 + }, + { + "epoch": 0.9911571645598647, + "grad_norm": 0.97265625, + "learning_rate": 5.04805285415539e-05, + "loss": 1.022, + "step": 6287 + }, + { + "epoch": 0.9913148164072577, + "grad_norm": 0.921875, + "learning_rate": 5.047652894689699e-05, + "loss": 0.8836, + "step": 6288 + }, + { + "epoch": 0.9914724682546507, + "grad_norm": 0.90234375, + "learning_rate": 5.047252945720224e-05, + "loss": 0.8671, + "step": 6289 + }, + { + "epoch": 0.9916301201020438, + "grad_norm": 0.9609375, + "learning_rate": 5.046853007247814e-05, + "loss": 1.1066, + "step": 6290 + }, + { + "epoch": 0.9917877719494368, + "grad_norm": 0.9765625, + "learning_rate": 5.046453079273312e-05, + "loss": 0.9417, + "step": 6291 + }, + { + "epoch": 0.9919454237968298, + "grad_norm": 0.9140625, + "learning_rate": 5.046053161797574e-05, + "loss": 0.9379, + "step": 6292 + }, + { + "epoch": 0.9921030756442227, + "grad_norm": 1.0703125, + "learning_rate": 5.0456532548214416e-05, + "loss": 0.9187, + "step": 6293 + }, + { + "epoch": 0.9922607274916158, + "grad_norm": 0.93359375, + "learning_rate": 5.045253358345765e-05, + "loss": 0.9066, + "step": 6294 + }, + { + "epoch": 0.9924183793390088, + "grad_norm": 0.83203125, + "learning_rate": 5.044853472371391e-05, + "loss": 0.7859, + "step": 6295 + }, + { + "epoch": 0.9925760311864018, + "grad_norm": 1.1015625, + "learning_rate": 5.044453596899162e-05, + "loss": 1.0803, + "step": 6296 + }, + { + "epoch": 0.9927336830337948, + "grad_norm": 0.86328125, + "learning_rate": 5.044053731929934e-05, + "loss": 0.931, + "step": 6297 + }, + { + "epoch": 0.9928913348811879, + "grad_norm": 1.1484375, + "learning_rate": 5.0436538774645526e-05, + "loss": 1.2049, + "step": 6298 + }, + { + "epoch": 0.9930489867285809, + "grad_norm": 0.84375, + "learning_rate": 5.043254033503862e-05, + "loss": 0.9402, + "step": 6299 + }, + { + "epoch": 0.9932066385759739, + "grad_norm": 1.1953125, + "learning_rate": 5.042854200048712e-05, + "loss": 0.9887, + "step": 6300 + }, + { + "epoch": 0.9933642904233668, + "grad_norm": 0.9296875, + "learning_rate": 5.042454377099946e-05, + "loss": 0.8942, + "step": 6301 + }, + { + "epoch": 0.9935219422707599, + "grad_norm": 1.1015625, + "learning_rate": 5.042054564658416e-05, + "loss": 0.9965, + "step": 6302 + }, + { + "epoch": 0.9936795941181529, + "grad_norm": 1.0234375, + "learning_rate": 5.04165476272497e-05, + "loss": 0.9882, + "step": 6303 + }, + { + "epoch": 0.9938372459655459, + "grad_norm": 0.95703125, + "learning_rate": 5.041254971300453e-05, + "loss": 0.9204, + "step": 6304 + }, + { + "epoch": 0.9939948978129389, + "grad_norm": 0.9765625, + "learning_rate": 5.040855190385712e-05, + "loss": 0.9309, + "step": 6305 + }, + { + "epoch": 0.994152549660332, + "grad_norm": 0.9609375, + "learning_rate": 5.0404554199815915e-05, + "loss": 1.1481, + "step": 6306 + }, + { + "epoch": 0.994310201507725, + "grad_norm": 0.90234375, + "learning_rate": 5.040055660088946e-05, + "loss": 0.8913, + "step": 6307 + }, + { + "epoch": 0.994467853355118, + "grad_norm": 0.9609375, + "learning_rate": 5.039655910708618e-05, + "loss": 0.9688, + "step": 6308 + }, + { + "epoch": 0.9946255052025109, + "grad_norm": 0.9375, + "learning_rate": 5.0392561718414575e-05, + "loss": 0.8557, + "step": 6309 + }, + { + "epoch": 0.9947831570499039, + "grad_norm": 0.9140625, + "learning_rate": 5.03885644348831e-05, + "loss": 0.8381, + "step": 6310 + }, + { + "epoch": 0.994940808897297, + "grad_norm": 0.95703125, + "learning_rate": 5.038456725650018e-05, + "loss": 0.9648, + "step": 6311 + }, + { + "epoch": 0.99509846074469, + "grad_norm": 1.0234375, + "learning_rate": 5.038057018327438e-05, + "loss": 1.0214, + "step": 6312 + }, + { + "epoch": 0.995256112592083, + "grad_norm": 0.8828125, + "learning_rate": 5.037657321521412e-05, + "loss": 0.8468, + "step": 6313 + }, + { + "epoch": 0.995413764439476, + "grad_norm": 0.9375, + "learning_rate": 5.037257635232788e-05, + "loss": 1.1382, + "step": 6314 + }, + { + "epoch": 0.9955714162868691, + "grad_norm": 1.140625, + "learning_rate": 5.036857959462412e-05, + "loss": 0.9959, + "step": 6315 + }, + { + "epoch": 0.9957290681342621, + "grad_norm": 1.015625, + "learning_rate": 5.0364582942111284e-05, + "loss": 0.8813, + "step": 6316 + }, + { + "epoch": 0.995886719981655, + "grad_norm": 0.96875, + "learning_rate": 5.036058639479792e-05, + "loss": 1.0607, + "step": 6317 + }, + { + "epoch": 0.996044371829048, + "grad_norm": 0.94140625, + "learning_rate": 5.0356589952692455e-05, + "loss": 0.993, + "step": 6318 + }, + { + "epoch": 0.9962020236764411, + "grad_norm": 0.93359375, + "learning_rate": 5.035259361580336e-05, + "loss": 0.9779, + "step": 6319 + }, + { + "epoch": 0.9963596755238341, + "grad_norm": 1.0859375, + "learning_rate": 5.03485973841391e-05, + "loss": 1.0452, + "step": 6320 + }, + { + "epoch": 0.9965173273712271, + "grad_norm": 0.90625, + "learning_rate": 5.0344601257708116e-05, + "loss": 0.8008, + "step": 6321 + }, + { + "epoch": 0.9966749792186201, + "grad_norm": 1.046875, + "learning_rate": 5.0340605236518945e-05, + "loss": 0.9249, + "step": 6322 + }, + { + "epoch": 0.9968326310660132, + "grad_norm": 0.9375, + "learning_rate": 5.033660932058002e-05, + "loss": 0.8058, + "step": 6323 + }, + { + "epoch": 0.9969902829134062, + "grad_norm": 0.9375, + "learning_rate": 5.0332613509899816e-05, + "loss": 1.0034, + "step": 6324 + }, + { + "epoch": 0.9971479347607991, + "grad_norm": 0.99609375, + "learning_rate": 5.032861780448681e-05, + "loss": 0.8987, + "step": 6325 + }, + { + "epoch": 0.9973055866081921, + "grad_norm": 0.94921875, + "learning_rate": 5.032462220434946e-05, + "loss": 1.0116, + "step": 6326 + }, + { + "epoch": 0.9974632384555852, + "grad_norm": 1.0078125, + "learning_rate": 5.032062670949622e-05, + "loss": 1.1243, + "step": 6327 + }, + { + "epoch": 0.9976208903029782, + "grad_norm": 0.92578125, + "learning_rate": 5.031663131993558e-05, + "loss": 0.8762, + "step": 6328 + }, + { + "epoch": 0.9977785421503712, + "grad_norm": 0.93359375, + "learning_rate": 5.031263603567595e-05, + "loss": 1.0111, + "step": 6329 + }, + { + "epoch": 0.9979361939977642, + "grad_norm": 0.80859375, + "learning_rate": 5.030864085672591e-05, + "loss": 0.7475, + "step": 6330 + }, + { + "epoch": 0.9980938458451573, + "grad_norm": 0.8671875, + "learning_rate": 5.030464578309384e-05, + "loss": 0.9049, + "step": 6331 + }, + { + "epoch": 0.9982514976925503, + "grad_norm": 1.046875, + "learning_rate": 5.0300650814788254e-05, + "loss": 1.1051, + "step": 6332 + }, + { + "epoch": 0.9984091495399432, + "grad_norm": 0.921875, + "learning_rate": 5.029665595181758e-05, + "loss": 0.901, + "step": 6333 + }, + { + "epoch": 0.9985668013873362, + "grad_norm": 1.59375, + "learning_rate": 5.029266119419031e-05, + "loss": 0.9752, + "step": 6334 + }, + { + "epoch": 0.9987244532347292, + "grad_norm": 0.95703125, + "learning_rate": 5.028866654191491e-05, + "loss": 0.8623, + "step": 6335 + }, + { + "epoch": 0.9988821050821223, + "grad_norm": 0.9296875, + "learning_rate": 5.02846719949998e-05, + "loss": 0.886, + "step": 6336 + }, + { + "epoch": 0.9990397569295153, + "grad_norm": 1.015625, + "learning_rate": 5.0280677553453515e-05, + "loss": 0.9857, + "step": 6337 + }, + { + "epoch": 0.9991974087769083, + "grad_norm": 0.8984375, + "learning_rate": 5.027668321728449e-05, + "loss": 0.8594, + "step": 6338 + }, + { + "epoch": 0.9993550606243014, + "grad_norm": 0.94140625, + "learning_rate": 5.02726889865012e-05, + "loss": 1.0154, + "step": 6339 + }, + { + "epoch": 0.9995127124716944, + "grad_norm": 0.9375, + "learning_rate": 5.026869486111211e-05, + "loss": 0.8742, + "step": 6340 + }, + { + "epoch": 0.9996703643190873, + "grad_norm": 1.0546875, + "learning_rate": 5.0264700841125665e-05, + "loss": 1.0499, + "step": 6341 + }, + { + "epoch": 0.9998280161664803, + "grad_norm": 1.1015625, + "learning_rate": 5.0260706926550314e-05, + "loss": 0.9543, + "step": 6342 + }, + { + "epoch": 0.9999856680138733, + "grad_norm": 1.0546875, + "learning_rate": 5.025671311739459e-05, + "loss": 0.9777, + "step": 6343 + }, + { + "epoch": 1.0, + "grad_norm": 3.640625, + "learning_rate": 5.0252719413666916e-05, + "loss": 0.7707, + "step": 6344 + }, + { + "epoch": 4.384321665723373e-05, + "grad_norm": 0.97265625, + "learning_rate": 5.024872581537576e-05, + "loss": 1.0584, + "step": 1 + }, + { + "epoch": 8.768643331446746e-05, + "grad_norm": 0.95703125, + "learning_rate": 5.02447323225296e-05, + "loss": 1.0001, + "step": 2 + }, + { + "epoch": 0.00013152964997170118, + "grad_norm": 1.0078125, + "learning_rate": 5.0240738935136834e-05, + "loss": 0.9125, + "step": 3 + }, + { + "epoch": 0.00017537286662893493, + "grad_norm": 1.0234375, + "learning_rate": 5.023674565320602e-05, + "loss": 0.9166, + "step": 4 + }, + { + "epoch": 0.00021921608328616865, + "grad_norm": 0.86328125, + "learning_rate": 5.023275247674557e-05, + "loss": 0.9278, + "step": 5 + }, + { + "epoch": 0.00026305929994340237, + "grad_norm": 1.046875, + "learning_rate": 5.022875940576397e-05, + "loss": 0.8569, + "step": 6 + }, + { + "epoch": 0.00030690251660063614, + "grad_norm": 1.109375, + "learning_rate": 5.0224766440269655e-05, + "loss": 0.7878, + "step": 7 + }, + { + "epoch": 0.00035074573325786986, + "grad_norm": 0.8359375, + "learning_rate": 5.0220773580271086e-05, + "loss": 0.7722, + "step": 8 + }, + { + "epoch": 0.0003945889499151036, + "grad_norm": 0.921875, + "learning_rate": 5.0216780825776766e-05, + "loss": 0.7898, + "step": 9 + }, + { + "epoch": 0.0004384321665723373, + "grad_norm": 0.88671875, + "learning_rate": 5.021278817679513e-05, + "loss": 0.7402, + "step": 10 + }, + { + "epoch": 0.00048227538322957107, + "grad_norm": 0.9296875, + "learning_rate": 5.020879563333465e-05, + "loss": 0.8207, + "step": 11 + }, + { + "epoch": 0.0005261185998868047, + "grad_norm": 1.03125, + "learning_rate": 5.020480319540377e-05, + "loss": 1.06, + "step": 12 + }, + { + "epoch": 0.0005699618165440386, + "grad_norm": 1.1328125, + "learning_rate": 5.020081086301094e-05, + "loss": 1.0929, + "step": 13 + }, + { + "epoch": 0.0006138050332012723, + "grad_norm": 0.9765625, + "learning_rate": 5.019681863616468e-05, + "loss": 0.9009, + "step": 14 + }, + { + "epoch": 0.000657648249858506, + "grad_norm": 0.86328125, + "learning_rate": 5.019282651487341e-05, + "loss": 0.8358, + "step": 15 + }, + { + "epoch": 0.0007014914665157397, + "grad_norm": 0.88671875, + "learning_rate": 5.018883449914561e-05, + "loss": 0.8383, + "step": 16 + }, + { + "epoch": 0.0007453346831729734, + "grad_norm": 0.8671875, + "learning_rate": 5.018484258898972e-05, + "loss": 0.971, + "step": 17 + }, + { + "epoch": 0.0007891778998302072, + "grad_norm": 0.875, + "learning_rate": 5.018085078441417e-05, + "loss": 0.8209, + "step": 18 + }, + { + "epoch": 0.0008330211164874409, + "grad_norm": 0.84375, + "learning_rate": 5.01768590854275e-05, + "loss": 0.978, + "step": 19 + }, + { + "epoch": 0.0008768643331446746, + "grad_norm": 0.86328125, + "learning_rate": 5.017286749203813e-05, + "loss": 0.9122, + "step": 20 + }, + { + "epoch": 0.0009207075498019084, + "grad_norm": 0.73046875, + "learning_rate": 5.0168876004254505e-05, + "loss": 0.7657, + "step": 21 + }, + { + "epoch": 0.0009645507664591421, + "grad_norm": 0.953125, + "learning_rate": 5.016488462208512e-05, + "loss": 0.9234, + "step": 22 + }, + { + "epoch": 0.0010083939831163759, + "grad_norm": 0.73828125, + "learning_rate": 5.0160893345538373e-05, + "loss": 0.81, + "step": 23 + }, + { + "epoch": 0.0010522371997736095, + "grad_norm": 0.8046875, + "learning_rate": 5.015690217462279e-05, + "loss": 0.7965, + "step": 24 + }, + { + "epoch": 0.0010960804164308433, + "grad_norm": 0.76953125, + "learning_rate": 5.015291110934681e-05, + "loss": 0.8597, + "step": 25 + }, + { + "epoch": 0.0011399236330880771, + "grad_norm": 0.9140625, + "learning_rate": 5.014892014971888e-05, + "loss": 0.9638, + "step": 26 + }, + { + "epoch": 0.0011837668497453107, + "grad_norm": 0.96875, + "learning_rate": 5.014492929574748e-05, + "loss": 0.9593, + "step": 27 + }, + { + "epoch": 0.0012276100664025446, + "grad_norm": 1.046875, + "learning_rate": 5.014093854744104e-05, + "loss": 0.7753, + "step": 28 + }, + { + "epoch": 0.0012714532830597782, + "grad_norm": 0.84375, + "learning_rate": 5.013694790480803e-05, + "loss": 0.7393, + "step": 29 + }, + { + "epoch": 0.001315296499717012, + "grad_norm": 0.82421875, + "learning_rate": 5.0132957367856914e-05, + "loss": 0.9677, + "step": 30 + }, + { + "epoch": 0.0013591397163742456, + "grad_norm": 0.9609375, + "learning_rate": 5.0128966936596145e-05, + "loss": 0.888, + "step": 31 + }, + { + "epoch": 0.0014029829330314794, + "grad_norm": 0.9453125, + "learning_rate": 5.012497661103417e-05, + "loss": 0.9888, + "step": 32 + }, + { + "epoch": 0.0014468261496887133, + "grad_norm": 0.96875, + "learning_rate": 5.012098639117944e-05, + "loss": 0.9464, + "step": 33 + }, + { + "epoch": 0.0014906693663459469, + "grad_norm": 0.80859375, + "learning_rate": 5.011699627704045e-05, + "loss": 0.8721, + "step": 34 + }, + { + "epoch": 0.0015345125830031807, + "grad_norm": 0.84375, + "learning_rate": 5.0113006268625626e-05, + "loss": 0.8189, + "step": 35 + }, + { + "epoch": 0.0015783557996604143, + "grad_norm": 0.9296875, + "learning_rate": 5.010901636594345e-05, + "loss": 0.8813, + "step": 36 + }, + { + "epoch": 0.0016221990163176481, + "grad_norm": 0.89453125, + "learning_rate": 5.010502656900236e-05, + "loss": 0.9055, + "step": 37 + }, + { + "epoch": 0.0016660422329748817, + "grad_norm": 0.91015625, + "learning_rate": 5.010103687781078e-05, + "loss": 0.8445, + "step": 38 + }, + { + "epoch": 0.0017098854496321156, + "grad_norm": 0.828125, + "learning_rate": 5.0097047292377234e-05, + "loss": 0.8103, + "step": 39 + }, + { + "epoch": 0.0017537286662893492, + "grad_norm": 0.8671875, + "learning_rate": 5.009305781271014e-05, + "loss": 0.9634, + "step": 40 + }, + { + "epoch": 0.001797571882946583, + "grad_norm": 0.890625, + "learning_rate": 5.0089068438817955e-05, + "loss": 0.9088, + "step": 41 + }, + { + "epoch": 0.0018414150996038168, + "grad_norm": 0.83984375, + "learning_rate": 5.0085079170709145e-05, + "loss": 0.8586, + "step": 42 + }, + { + "epoch": 0.0018852583162610505, + "grad_norm": 0.83203125, + "learning_rate": 5.0081090008392114e-05, + "loss": 0.7916, + "step": 43 + }, + { + "epoch": 0.0019291015329182843, + "grad_norm": 0.828125, + "learning_rate": 5.00771009518754e-05, + "loss": 0.8336, + "step": 44 + }, + { + "epoch": 0.001972944749575518, + "grad_norm": 0.82421875, + "learning_rate": 5.007311200116742e-05, + "loss": 0.8392, + "step": 45 + }, + { + "epoch": 0.0020167879662327517, + "grad_norm": 0.96484375, + "learning_rate": 5.006912315627662e-05, + "loss": 0.8085, + "step": 46 + }, + { + "epoch": 0.0020606311828899855, + "grad_norm": 0.94921875, + "learning_rate": 5.0065134417211454e-05, + "loss": 0.835, + "step": 47 + }, + { + "epoch": 0.002104474399547219, + "grad_norm": 0.94921875, + "learning_rate": 5.0061145783980354e-05, + "loss": 0.8486, + "step": 48 + }, + { + "epoch": 0.0021483176162044528, + "grad_norm": 0.84765625, + "learning_rate": 5.0057157256591834e-05, + "loss": 0.769, + "step": 49 + }, + { + "epoch": 0.0021921608328616866, + "grad_norm": 0.82421875, + "learning_rate": 5.005316883505432e-05, + "loss": 0.9368, + "step": 50 + }, + { + "epoch": 0.0022360040495189204, + "grad_norm": 0.82421875, + "learning_rate": 5.0049180519376257e-05, + "loss": 0.9236, + "step": 51 + }, + { + "epoch": 0.0022798472661761542, + "grad_norm": 0.78515625, + "learning_rate": 5.0045192309566105e-05, + "loss": 0.784, + "step": 52 + }, + { + "epoch": 0.0023236904828333876, + "grad_norm": 0.85546875, + "learning_rate": 5.0041204205632266e-05, + "loss": 0.8094, + "step": 53 + }, + { + "epoch": 0.0023675336994906215, + "grad_norm": 0.9296875, + "learning_rate": 5.003721620758328e-05, + "loss": 0.9394, + "step": 54 + }, + { + "epoch": 0.0024113769161478553, + "grad_norm": 0.7734375, + "learning_rate": 5.003322831542756e-05, + "loss": 0.8254, + "step": 55 + }, + { + "epoch": 0.002455220132805089, + "grad_norm": 0.87109375, + "learning_rate": 5.002924052917355e-05, + "loss": 0.877, + "step": 56 + }, + { + "epoch": 0.002499063349462323, + "grad_norm": 0.87890625, + "learning_rate": 5.002525284882972e-05, + "loss": 0.9679, + "step": 57 + }, + { + "epoch": 0.0025429065661195563, + "grad_norm": 0.8515625, + "learning_rate": 5.0021265274404474e-05, + "loss": 0.9491, + "step": 58 + }, + { + "epoch": 0.00258674978277679, + "grad_norm": 0.828125, + "learning_rate": 5.001727780590632e-05, + "loss": 0.8285, + "step": 59 + }, + { + "epoch": 0.002630592999434024, + "grad_norm": 0.95703125, + "learning_rate": 5.001329044334371e-05, + "loss": 0.8668, + "step": 60 + }, + { + "epoch": 0.002674436216091258, + "grad_norm": 0.84375, + "learning_rate": 5.0009303186725056e-05, + "loss": 0.8354, + "step": 61 + }, + { + "epoch": 0.002718279432748491, + "grad_norm": 0.87109375, + "learning_rate": 5.000531603605883e-05, + "loss": 0.8973, + "step": 62 + }, + { + "epoch": 0.002762122649405725, + "grad_norm": 0.83203125, + "learning_rate": 5.0001328991353445e-05, + "loss": 0.8307, + "step": 63 + }, + { + "epoch": 0.002805965866062959, + "grad_norm": 0.94921875, + "learning_rate": 4.999734205261743e-05, + "loss": 0.9271, + "step": 64 + }, + { + "epoch": 0.0028498090827201927, + "grad_norm": 0.81640625, + "learning_rate": 4.999335521985917e-05, + "loss": 0.8197, + "step": 65 + }, + { + "epoch": 0.0028936522993774265, + "grad_norm": 0.78515625, + "learning_rate": 4.998936849308715e-05, + "loss": 0.773, + "step": 66 + }, + { + "epoch": 0.00293749551603466, + "grad_norm": 0.84765625, + "learning_rate": 4.998538187230981e-05, + "loss": 0.939, + "step": 67 + }, + { + "epoch": 0.0029813387326918937, + "grad_norm": 0.87890625, + "learning_rate": 4.998139535753559e-05, + "loss": 0.8179, + "step": 68 + }, + { + "epoch": 0.0030251819493491276, + "grad_norm": 0.9296875, + "learning_rate": 4.997740894877294e-05, + "loss": 0.8116, + "step": 69 + }, + { + "epoch": 0.0030690251660063614, + "grad_norm": 0.7734375, + "learning_rate": 4.997342264603028e-05, + "loss": 0.9225, + "step": 70 + }, + { + "epoch": 0.003112868382663595, + "grad_norm": 0.8984375, + "learning_rate": 4.996943644931612e-05, + "loss": 0.9243, + "step": 71 + }, + { + "epoch": 0.0031567115993208286, + "grad_norm": 0.8671875, + "learning_rate": 4.996545035863889e-05, + "loss": 0.7815, + "step": 72 + }, + { + "epoch": 0.0032005548159780624, + "grad_norm": 0.859375, + "learning_rate": 4.996146437400702e-05, + "loss": 1.0366, + "step": 73 + }, + { + "epoch": 0.0032443980326352963, + "grad_norm": 0.87890625, + "learning_rate": 4.9957478495428965e-05, + "loss": 0.8199, + "step": 74 + }, + { + "epoch": 0.00328824124929253, + "grad_norm": 0.8984375, + "learning_rate": 4.9953492722913185e-05, + "loss": 0.8172, + "step": 75 + }, + { + "epoch": 0.0033320844659497635, + "grad_norm": 1.0546875, + "learning_rate": 4.9949507056468115e-05, + "loss": 0.9314, + "step": 76 + }, + { + "epoch": 0.0033759276826069973, + "grad_norm": 0.91796875, + "learning_rate": 4.99455214961022e-05, + "loss": 0.9103, + "step": 77 + }, + { + "epoch": 0.003419770899264231, + "grad_norm": 0.859375, + "learning_rate": 4.9941536041823846e-05, + "loss": 0.8825, + "step": 78 + }, + { + "epoch": 0.003463614115921465, + "grad_norm": 0.93359375, + "learning_rate": 4.993755069364159e-05, + "loss": 0.7796, + "step": 79 + }, + { + "epoch": 0.0035074573325786984, + "grad_norm": 0.84375, + "learning_rate": 4.993356545156383e-05, + "loss": 0.8363, + "step": 80 + }, + { + "epoch": 0.003551300549235932, + "grad_norm": 0.859375, + "learning_rate": 4.9929580315599024e-05, + "loss": 0.8812, + "step": 81 + }, + { + "epoch": 0.003595143765893166, + "grad_norm": 0.78125, + "learning_rate": 4.992559528575561e-05, + "loss": 0.8534, + "step": 82 + }, + { + "epoch": 0.0036389869825504, + "grad_norm": 0.953125, + "learning_rate": 4.9921610362041994e-05, + "loss": 0.9928, + "step": 83 + }, + { + "epoch": 0.0036828301992076337, + "grad_norm": 0.80859375, + "learning_rate": 4.991762554446669e-05, + "loss": 0.9253, + "step": 84 + }, + { + "epoch": 0.003726673415864867, + "grad_norm": 0.8671875, + "learning_rate": 4.991364083303813e-05, + "loss": 0.9005, + "step": 85 + }, + { + "epoch": 0.003770516632522101, + "grad_norm": 0.86328125, + "learning_rate": 4.990965622776475e-05, + "loss": 0.8404, + "step": 86 + }, + { + "epoch": 0.0038143598491793347, + "grad_norm": 1.34375, + "learning_rate": 4.9905671728654976e-05, + "loss": 0.9964, + "step": 87 + }, + { + "epoch": 0.0038582030658365686, + "grad_norm": 0.78515625, + "learning_rate": 4.990168733571724e-05, + "loss": 0.8115, + "step": 88 + }, + { + "epoch": 0.003902046282493802, + "grad_norm": 0.82421875, + "learning_rate": 4.9897703048960046e-05, + "loss": 0.776, + "step": 89 + }, + { + "epoch": 0.003945889499151036, + "grad_norm": 0.76953125, + "learning_rate": 4.98937188683918e-05, + "loss": 0.7806, + "step": 90 + }, + { + "epoch": 0.00398973271580827, + "grad_norm": 0.78125, + "learning_rate": 4.988973479402097e-05, + "loss": 0.7176, + "step": 91 + }, + { + "epoch": 0.004033575932465503, + "grad_norm": 1.0625, + "learning_rate": 4.988575082585597e-05, + "loss": 0.8331, + "step": 92 + }, + { + "epoch": 0.004077419149122737, + "grad_norm": 1.0078125, + "learning_rate": 4.988176696390523e-05, + "loss": 0.9226, + "step": 93 + }, + { + "epoch": 0.004121262365779971, + "grad_norm": 0.94140625, + "learning_rate": 4.987778320817725e-05, + "loss": 0.9496, + "step": 94 + }, + { + "epoch": 0.004165105582437205, + "grad_norm": 0.78515625, + "learning_rate": 4.987379955868044e-05, + "loss": 0.8732, + "step": 95 + }, + { + "epoch": 0.004208948799094438, + "grad_norm": 0.82421875, + "learning_rate": 4.9869816015423256e-05, + "loss": 0.9693, + "step": 96 + }, + { + "epoch": 0.004252792015751672, + "grad_norm": 0.796875, + "learning_rate": 4.986583257841413e-05, + "loss": 0.7694, + "step": 97 + }, + { + "epoch": 0.0042966352324089055, + "grad_norm": 0.84765625, + "learning_rate": 4.986184924766146e-05, + "loss": 0.8459, + "step": 98 + }, + { + "epoch": 0.004340478449066139, + "grad_norm": 0.82421875, + "learning_rate": 4.985786602317378e-05, + "loss": 0.8346, + "step": 99 + }, + { + "epoch": 0.004384321665723373, + "grad_norm": 0.96875, + "learning_rate": 4.9853882904959496e-05, + "loss": 0.9138, + "step": 100 + }, + { + "epoch": 0.004428164882380607, + "grad_norm": 0.7890625, + "learning_rate": 4.984989989302702e-05, + "loss": 0.8055, + "step": 101 + }, + { + "epoch": 0.004472008099037841, + "grad_norm": 0.8984375, + "learning_rate": 4.984591698738483e-05, + "loss": 0.846, + "step": 102 + }, + { + "epoch": 0.004515851315695075, + "grad_norm": 0.9140625, + "learning_rate": 4.984193418804131e-05, + "loss": 0.9141, + "step": 103 + }, + { + "epoch": 0.0045596945323523085, + "grad_norm": 0.75, + "learning_rate": 4.9837951495004976e-05, + "loss": 0.7612, + "step": 104 + }, + { + "epoch": 0.0046035377490095414, + "grad_norm": 0.828125, + "learning_rate": 4.983396890828425e-05, + "loss": 0.8877, + "step": 105 + }, + { + "epoch": 0.004647380965666775, + "grad_norm": 0.828125, + "learning_rate": 4.982998642788754e-05, + "loss": 0.8928, + "step": 106 + }, + { + "epoch": 0.004691224182324009, + "grad_norm": 0.84375, + "learning_rate": 4.9826004053823324e-05, + "loss": 0.9693, + "step": 107 + }, + { + "epoch": 0.004735067398981243, + "grad_norm": 0.859375, + "learning_rate": 4.982202178609998e-05, + "loss": 0.8087, + "step": 108 + }, + { + "epoch": 0.004778910615638477, + "grad_norm": 0.77734375, + "learning_rate": 4.981803962472603e-05, + "loss": 0.882, + "step": 109 + }, + { + "epoch": 0.004822753832295711, + "grad_norm": 0.75390625, + "learning_rate": 4.9814057569709885e-05, + "loss": 0.7753, + "step": 110 + }, + { + "epoch": 0.004866597048952944, + "grad_norm": 0.85546875, + "learning_rate": 4.981007562105996e-05, + "loss": 0.9255, + "step": 111 + }, + { + "epoch": 0.004910440265610178, + "grad_norm": 0.88671875, + "learning_rate": 4.980609377878472e-05, + "loss": 0.8581, + "step": 112 + }, + { + "epoch": 0.004954283482267412, + "grad_norm": 0.87109375, + "learning_rate": 4.98021120428926e-05, + "loss": 0.9629, + "step": 113 + }, + { + "epoch": 0.004998126698924646, + "grad_norm": 0.859375, + "learning_rate": 4.979813041339203e-05, + "loss": 0.9996, + "step": 114 + }, + { + "epoch": 0.005041969915581879, + "grad_norm": 0.75390625, + "learning_rate": 4.979414889029146e-05, + "loss": 0.7842, + "step": 115 + }, + { + "epoch": 0.005085813132239113, + "grad_norm": 0.85546875, + "learning_rate": 4.9790167473599315e-05, + "loss": 0.8796, + "step": 116 + }, + { + "epoch": 0.0051296563488963465, + "grad_norm": 0.79296875, + "learning_rate": 4.9786186163324035e-05, + "loss": 0.8441, + "step": 117 + }, + { + "epoch": 0.00517349956555358, + "grad_norm": 0.90625, + "learning_rate": 4.9782204959474033e-05, + "loss": 0.9248, + "step": 118 + }, + { + "epoch": 0.005217342782210814, + "grad_norm": 0.83203125, + "learning_rate": 4.977822386205781e-05, + "loss": 0.9112, + "step": 119 + }, + { + "epoch": 0.005261185998868048, + "grad_norm": 0.81640625, + "learning_rate": 4.9774242871083774e-05, + "loss": 0.7753, + "step": 120 + }, + { + "epoch": 0.005305029215525282, + "grad_norm": 0.796875, + "learning_rate": 4.977026198656036e-05, + "loss": 0.9881, + "step": 121 + }, + { + "epoch": 0.005348872432182516, + "grad_norm": 0.8046875, + "learning_rate": 4.9766281208496e-05, + "loss": 0.8306, + "step": 122 + }, + { + "epoch": 0.0053927156488397495, + "grad_norm": 1.078125, + "learning_rate": 4.97623005368991e-05, + "loss": 0.8605, + "step": 123 + }, + { + "epoch": 0.005436558865496982, + "grad_norm": 0.88671875, + "learning_rate": 4.975831997177818e-05, + "loss": 1.0703, + "step": 124 + }, + { + "epoch": 0.005480402082154216, + "grad_norm": 0.8125, + "learning_rate": 4.975433951314161e-05, + "loss": 0.8439, + "step": 125 + }, + { + "epoch": 0.00552424529881145, + "grad_norm": 0.8671875, + "learning_rate": 4.975035916099786e-05, + "loss": 0.8515, + "step": 126 + }, + { + "epoch": 0.005568088515468684, + "grad_norm": 0.8515625, + "learning_rate": 4.974637891535535e-05, + "loss": 0.8655, + "step": 127 + }, + { + "epoch": 0.005611931732125918, + "grad_norm": 0.80859375, + "learning_rate": 4.974239877622248e-05, + "loss": 0.9295, + "step": 128 + }, + { + "epoch": 0.005655774948783152, + "grad_norm": 0.796875, + "learning_rate": 4.9738418743607765e-05, + "loss": 0.7708, + "step": 129 + }, + { + "epoch": 0.005699618165440385, + "grad_norm": 0.78515625, + "learning_rate": 4.973443881751959e-05, + "loss": 0.7583, + "step": 130 + }, + { + "epoch": 0.005743461382097619, + "grad_norm": 0.81640625, + "learning_rate": 4.97304589979664e-05, + "loss": 0.9152, + "step": 131 + }, + { + "epoch": 0.005787304598754853, + "grad_norm": 0.94140625, + "learning_rate": 4.972647928495663e-05, + "loss": 1.0113, + "step": 132 + }, + { + "epoch": 0.005831147815412086, + "grad_norm": 0.91796875, + "learning_rate": 4.972249967849868e-05, + "loss": 0.9016, + "step": 133 + }, + { + "epoch": 0.00587499103206932, + "grad_norm": 0.921875, + "learning_rate": 4.971852017860105e-05, + "loss": 0.9161, + "step": 134 + }, + { + "epoch": 0.005918834248726554, + "grad_norm": 0.8515625, + "learning_rate": 4.9714540785272146e-05, + "loss": 0.8866, + "step": 135 + }, + { + "epoch": 0.0059626774653837875, + "grad_norm": 0.8359375, + "learning_rate": 4.971056149852039e-05, + "loss": 0.8917, + "step": 136 + }, + { + "epoch": 0.006006520682041021, + "grad_norm": 0.85546875, + "learning_rate": 4.970658231835424e-05, + "loss": 0.8551, + "step": 137 + }, + { + "epoch": 0.006050363898698255, + "grad_norm": 0.91796875, + "learning_rate": 4.970260324478211e-05, + "loss": 0.9945, + "step": 138 + }, + { + "epoch": 0.006094207115355489, + "grad_norm": 0.85546875, + "learning_rate": 4.969862427781239e-05, + "loss": 0.8729, + "step": 139 + }, + { + "epoch": 0.006138050332012723, + "grad_norm": 0.85546875, + "learning_rate": 4.96946454174536e-05, + "loss": 0.9312, + "step": 140 + }, + { + "epoch": 0.006181893548669957, + "grad_norm": 0.84765625, + "learning_rate": 4.969066666371415e-05, + "loss": 0.8313, + "step": 141 + }, + { + "epoch": 0.00622573676532719, + "grad_norm": 0.85546875, + "learning_rate": 4.9686688016602436e-05, + "loss": 0.9059, + "step": 142 + }, + { + "epoch": 0.006269579981984423, + "grad_norm": 0.90234375, + "learning_rate": 4.968270947612692e-05, + "loss": 0.9432, + "step": 143 + }, + { + "epoch": 0.006313423198641657, + "grad_norm": 0.96484375, + "learning_rate": 4.967873104229599e-05, + "loss": 0.915, + "step": 144 + }, + { + "epoch": 0.006357266415298891, + "grad_norm": 0.85546875, + "learning_rate": 4.9674752715118146e-05, + "loss": 0.9155, + "step": 145 + }, + { + "epoch": 0.006401109631956125, + "grad_norm": 0.79296875, + "learning_rate": 4.967077449460179e-05, + "loss": 0.7594, + "step": 146 + }, + { + "epoch": 0.006444952848613359, + "grad_norm": 0.765625, + "learning_rate": 4.9666796380755356e-05, + "loss": 0.8237, + "step": 147 + }, + { + "epoch": 0.0064887960652705926, + "grad_norm": 0.8671875, + "learning_rate": 4.966281837358726e-05, + "loss": 0.7984, + "step": 148 + }, + { + "epoch": 0.006532639281927826, + "grad_norm": 0.94921875, + "learning_rate": 4.9658840473105905e-05, + "loss": 0.9267, + "step": 149 + }, + { + "epoch": 0.00657648249858506, + "grad_norm": 0.79296875, + "learning_rate": 4.9654862679319814e-05, + "loss": 0.8037, + "step": 150 + }, + { + "epoch": 0.006620325715242293, + "grad_norm": 0.75, + "learning_rate": 4.9650884992237344e-05, + "loss": 0.7768, + "step": 151 + }, + { + "epoch": 0.006664168931899527, + "grad_norm": 0.83203125, + "learning_rate": 4.964690741186696e-05, + "loss": 0.9107, + "step": 152 + }, + { + "epoch": 0.006708012148556761, + "grad_norm": 0.8125, + "learning_rate": 4.964292993821708e-05, + "loss": 0.7079, + "step": 153 + }, + { + "epoch": 0.006751855365213995, + "grad_norm": 0.8359375, + "learning_rate": 4.9638952571296115e-05, + "loss": 0.8807, + "step": 154 + }, + { + "epoch": 0.0067956985818712285, + "grad_norm": 0.93359375, + "learning_rate": 4.9634975311112484e-05, + "loss": 0.9414, + "step": 155 + }, + { + "epoch": 0.006839541798528462, + "grad_norm": 0.8515625, + "learning_rate": 4.963099815767468e-05, + "loss": 0.7123, + "step": 156 + }, + { + "epoch": 0.006883385015185696, + "grad_norm": 0.87890625, + "learning_rate": 4.96270211109911e-05, + "loss": 0.8115, + "step": 157 + }, + { + "epoch": 0.00692722823184293, + "grad_norm": 0.859375, + "learning_rate": 4.9623044171070165e-05, + "loss": 0.9378, + "step": 158 + }, + { + "epoch": 0.006971071448500164, + "grad_norm": 0.86328125, + "learning_rate": 4.961906733792031e-05, + "loss": 0.8421, + "step": 159 + }, + { + "epoch": 0.007014914665157397, + "grad_norm": 0.796875, + "learning_rate": 4.9615090611549966e-05, + "loss": 0.8475, + "step": 160 + }, + { + "epoch": 0.007058757881814631, + "grad_norm": 0.796875, + "learning_rate": 4.961111399196755e-05, + "loss": 0.8359, + "step": 161 + }, + { + "epoch": 0.007102601098471864, + "grad_norm": 0.96484375, + "learning_rate": 4.9607137479181496e-05, + "loss": 0.7138, + "step": 162 + }, + { + "epoch": 0.007146444315129098, + "grad_norm": 0.87890625, + "learning_rate": 4.960316107320024e-05, + "loss": 1.0373, + "step": 163 + }, + { + "epoch": 0.007190287531786332, + "grad_norm": 0.9140625, + "learning_rate": 4.959918477403217e-05, + "loss": 0.9913, + "step": 164 + }, + { + "epoch": 0.007234130748443566, + "grad_norm": 0.80859375, + "learning_rate": 4.959520858168578e-05, + "loss": 0.8247, + "step": 165 + }, + { + "epoch": 0.0072779739651008, + "grad_norm": 0.8828125, + "learning_rate": 4.959123249616946e-05, + "loss": 0.9366, + "step": 166 + }, + { + "epoch": 0.0073218171817580335, + "grad_norm": 0.78125, + "learning_rate": 4.9587256517491654e-05, + "loss": 0.82, + "step": 167 + }, + { + "epoch": 0.007365660398415267, + "grad_norm": 0.80078125, + "learning_rate": 4.9583280645660766e-05, + "loss": 0.9535, + "step": 168 + }, + { + "epoch": 0.0074095036150725, + "grad_norm": 0.83203125, + "learning_rate": 4.957930488068519e-05, + "loss": 0.8166, + "step": 169 + }, + { + "epoch": 0.007453346831729734, + "grad_norm": 0.80078125, + "learning_rate": 4.9575329222573444e-05, + "loss": 0.8775, + "step": 170 + }, + { + "epoch": 0.007497190048386968, + "grad_norm": 0.84375, + "learning_rate": 4.957135367133391e-05, + "loss": 0.8665, + "step": 171 + }, + { + "epoch": 0.007541033265044202, + "grad_norm": 0.97265625, + "learning_rate": 4.956737822697499e-05, + "loss": 0.9813, + "step": 172 + }, + { + "epoch": 0.007584876481701436, + "grad_norm": 0.87109375, + "learning_rate": 4.956340288950515e-05, + "loss": 0.8716, + "step": 173 + }, + { + "epoch": 0.0076287196983586695, + "grad_norm": 0.85546875, + "learning_rate": 4.9559427658932746e-05, + "loss": 0.9009, + "step": 174 + }, + { + "epoch": 0.007672562915015903, + "grad_norm": 0.76953125, + "learning_rate": 4.955545253526629e-05, + "loss": 0.8169, + "step": 175 + }, + { + "epoch": 0.007716406131673137, + "grad_norm": 0.8203125, + "learning_rate": 4.955147751851418e-05, + "loss": 0.9127, + "step": 176 + }, + { + "epoch": 0.007760249348330371, + "grad_norm": 0.8515625, + "learning_rate": 4.954750260868481e-05, + "loss": 0.7492, + "step": 177 + }, + { + "epoch": 0.007804092564987604, + "grad_norm": 0.8046875, + "learning_rate": 4.9543527805786635e-05, + "loss": 0.7702, + "step": 178 + }, + { + "epoch": 0.007847935781644838, + "grad_norm": 0.8203125, + "learning_rate": 4.953955310982803e-05, + "loss": 0.9048, + "step": 179 + }, + { + "epoch": 0.007891778998302072, + "grad_norm": 0.859375, + "learning_rate": 4.953557852081751e-05, + "loss": 0.8198, + "step": 180 + }, + { + "epoch": 0.007935622214959305, + "grad_norm": 0.8515625, + "learning_rate": 4.953160403876342e-05, + "loss": 0.8676, + "step": 181 + }, + { + "epoch": 0.00797946543161654, + "grad_norm": 0.82421875, + "learning_rate": 4.952762966367424e-05, + "loss": 0.8211, + "step": 182 + }, + { + "epoch": 0.008023308648273773, + "grad_norm": 0.90625, + "learning_rate": 4.952365539555835e-05, + "loss": 0.9046, + "step": 183 + }, + { + "epoch": 0.008067151864931007, + "grad_norm": 0.87109375, + "learning_rate": 4.951968123442414e-05, + "loss": 0.9406, + "step": 184 + }, + { + "epoch": 0.00811099508158824, + "grad_norm": 0.78125, + "learning_rate": 4.9515707180280124e-05, + "loss": 0.9604, + "step": 185 + }, + { + "epoch": 0.008154838298245475, + "grad_norm": 0.8984375, + "learning_rate": 4.9511733233134685e-05, + "loss": 0.8354, + "step": 186 + }, + { + "epoch": 0.008198681514902708, + "grad_norm": 0.80078125, + "learning_rate": 4.9507759392996245e-05, + "loss": 0.7426, + "step": 187 + }, + { + "epoch": 0.008242524731559942, + "grad_norm": 0.85546875, + "learning_rate": 4.9503785659873216e-05, + "loss": 0.8947, + "step": 188 + }, + { + "epoch": 0.008286367948217176, + "grad_norm": 0.84765625, + "learning_rate": 4.9499812033773985e-05, + "loss": 0.9796, + "step": 189 + }, + { + "epoch": 0.00833021116487441, + "grad_norm": 0.86328125, + "learning_rate": 4.9495838514707064e-05, + "loss": 0.8942, + "step": 190 + }, + { + "epoch": 0.008374054381531644, + "grad_norm": 0.8125, + "learning_rate": 4.949186510268082e-05, + "loss": 1.0091, + "step": 191 + }, + { + "epoch": 0.008417897598188876, + "grad_norm": 0.90234375, + "learning_rate": 4.9487891797703686e-05, + "loss": 0.8141, + "step": 192 + }, + { + "epoch": 0.00846174081484611, + "grad_norm": 0.90234375, + "learning_rate": 4.9483918599784076e-05, + "loss": 0.8139, + "step": 193 + }, + { + "epoch": 0.008505584031503343, + "grad_norm": 0.828125, + "learning_rate": 4.947994550893038e-05, + "loss": 0.832, + "step": 194 + }, + { + "epoch": 0.008549427248160577, + "grad_norm": 0.86328125, + "learning_rate": 4.9475972525151085e-05, + "loss": 0.8613, + "step": 195 + }, + { + "epoch": 0.008593270464817811, + "grad_norm": 0.8984375, + "learning_rate": 4.947199964845457e-05, + "loss": 0.8824, + "step": 196 + }, + { + "epoch": 0.008637113681475045, + "grad_norm": 0.828125, + "learning_rate": 4.946802687884927e-05, + "loss": 0.7715, + "step": 197 + }, + { + "epoch": 0.008680956898132279, + "grad_norm": 0.98046875, + "learning_rate": 4.9464054216343606e-05, + "loss": 0.8956, + "step": 198 + }, + { + "epoch": 0.008724800114789513, + "grad_norm": 0.78125, + "learning_rate": 4.946008166094599e-05, + "loss": 0.7596, + "step": 199 + }, + { + "epoch": 0.008768643331446746, + "grad_norm": 0.86328125, + "learning_rate": 4.9456109212664833e-05, + "loss": 0.9681, + "step": 200 + }, + { + "epoch": 0.00881248654810398, + "grad_norm": 0.83984375, + "learning_rate": 4.945213687150857e-05, + "loss": 0.8719, + "step": 201 + }, + { + "epoch": 0.008856329764761214, + "grad_norm": 0.8359375, + "learning_rate": 4.9448164637485616e-05, + "loss": 0.7695, + "step": 202 + }, + { + "epoch": 0.008900172981418448, + "grad_norm": 0.91015625, + "learning_rate": 4.9444192510604384e-05, + "loss": 0.9245, + "step": 203 + }, + { + "epoch": 0.008944016198075682, + "grad_norm": 0.8359375, + "learning_rate": 4.9440220490873255e-05, + "loss": 0.8294, + "step": 204 + }, + { + "epoch": 0.008987859414732915, + "grad_norm": 0.84765625, + "learning_rate": 4.9436248578300724e-05, + "loss": 0.7191, + "step": 205 + }, + { + "epoch": 0.00903170263139015, + "grad_norm": 0.87890625, + "learning_rate": 4.943227677289518e-05, + "loss": 0.8751, + "step": 206 + }, + { + "epoch": 0.009075545848047383, + "grad_norm": 0.78125, + "learning_rate": 4.9428305074665036e-05, + "loss": 0.7951, + "step": 207 + }, + { + "epoch": 0.009119389064704617, + "grad_norm": 0.9140625, + "learning_rate": 4.942433348361871e-05, + "loss": 0.8357, + "step": 208 + }, + { + "epoch": 0.00916323228136185, + "grad_norm": 1.1328125, + "learning_rate": 4.942036199976459e-05, + "loss": 1.0444, + "step": 209 + }, + { + "epoch": 0.009207075498019083, + "grad_norm": 0.89453125, + "learning_rate": 4.9416390623111144e-05, + "loss": 0.9875, + "step": 210 + }, + { + "epoch": 0.009250918714676317, + "grad_norm": 0.84375, + "learning_rate": 4.941241935366676e-05, + "loss": 0.7802, + "step": 211 + }, + { + "epoch": 0.00929476193133355, + "grad_norm": 0.9375, + "learning_rate": 4.940844819143988e-05, + "loss": 1.0067, + "step": 212 + }, + { + "epoch": 0.009338605147990784, + "grad_norm": 0.8984375, + "learning_rate": 4.9404477136438896e-05, + "loss": 0.8602, + "step": 213 + }, + { + "epoch": 0.009382448364648018, + "grad_norm": 0.89453125, + "learning_rate": 4.940050618867219e-05, + "loss": 0.9289, + "step": 214 + }, + { + "epoch": 0.009426291581305252, + "grad_norm": 0.81640625, + "learning_rate": 4.939653534814826e-05, + "loss": 0.8571, + "step": 215 + }, + { + "epoch": 0.009470134797962486, + "grad_norm": 0.86328125, + "learning_rate": 4.939256461487548e-05, + "loss": 0.8362, + "step": 216 + }, + { + "epoch": 0.00951397801461972, + "grad_norm": 0.90234375, + "learning_rate": 4.938859398886226e-05, + "loss": 0.794, + "step": 217 + }, + { + "epoch": 0.009557821231276954, + "grad_norm": 0.88671875, + "learning_rate": 4.938462347011703e-05, + "loss": 0.843, + "step": 218 + }, + { + "epoch": 0.009601664447934187, + "grad_norm": 0.83984375, + "learning_rate": 4.9380653058648166e-05, + "loss": 0.9118, + "step": 219 + }, + { + "epoch": 0.009645507664591421, + "grad_norm": 0.89453125, + "learning_rate": 4.937668275446414e-05, + "loss": 0.8475, + "step": 220 + }, + { + "epoch": 0.009689350881248655, + "grad_norm": 0.84375, + "learning_rate": 4.937271255757335e-05, + "loss": 0.823, + "step": 221 + }, + { + "epoch": 0.009733194097905889, + "grad_norm": 0.84375, + "learning_rate": 4.9368742467984194e-05, + "loss": 0.8775, + "step": 222 + }, + { + "epoch": 0.009777037314563123, + "grad_norm": 0.8828125, + "learning_rate": 4.93647724857051e-05, + "loss": 0.8699, + "step": 223 + }, + { + "epoch": 0.009820880531220356, + "grad_norm": 0.76953125, + "learning_rate": 4.936080261074444e-05, + "loss": 0.9074, + "step": 224 + }, + { + "epoch": 0.00986472374787759, + "grad_norm": 0.7734375, + "learning_rate": 4.93568328431107e-05, + "loss": 0.7563, + "step": 225 + }, + { + "epoch": 0.009908566964534824, + "grad_norm": 0.93359375, + "learning_rate": 4.935286318281226e-05, + "loss": 0.7951, + "step": 226 + }, + { + "epoch": 0.009952410181192058, + "grad_norm": 0.81640625, + "learning_rate": 4.934889362985753e-05, + "loss": 0.9238, + "step": 227 + }, + { + "epoch": 0.009996253397849292, + "grad_norm": 0.81640625, + "learning_rate": 4.9344924184254916e-05, + "loss": 0.7984, + "step": 228 + }, + { + "epoch": 0.010040096614506524, + "grad_norm": 0.7734375, + "learning_rate": 4.934095484601281e-05, + "loss": 0.8418, + "step": 229 + }, + { + "epoch": 0.010083939831163758, + "grad_norm": 0.82421875, + "learning_rate": 4.93369856151397e-05, + "loss": 0.9034, + "step": 230 + }, + { + "epoch": 0.010127783047820992, + "grad_norm": 0.76953125, + "learning_rate": 4.9333016491643944e-05, + "loss": 0.8955, + "step": 231 + }, + { + "epoch": 0.010171626264478225, + "grad_norm": 0.828125, + "learning_rate": 4.932904747553396e-05, + "loss": 0.8477, + "step": 232 + }, + { + "epoch": 0.01021546948113546, + "grad_norm": 0.82421875, + "learning_rate": 4.932507856681817e-05, + "loss": 0.8272, + "step": 233 + }, + { + "epoch": 0.010259312697792693, + "grad_norm": 0.81640625, + "learning_rate": 4.9321109765504944e-05, + "loss": 0.9652, + "step": 234 + }, + { + "epoch": 0.010303155914449927, + "grad_norm": 0.80859375, + "learning_rate": 4.9317141071602766e-05, + "loss": 0.8422, + "step": 235 + }, + { + "epoch": 0.01034699913110716, + "grad_norm": 0.88671875, + "learning_rate": 4.9313172485120005e-05, + "loss": 0.7664, + "step": 236 + }, + { + "epoch": 0.010390842347764394, + "grad_norm": 0.74609375, + "learning_rate": 4.930920400606508e-05, + "loss": 0.8047, + "step": 237 + }, + { + "epoch": 0.010434685564421628, + "grad_norm": 0.84765625, + "learning_rate": 4.930523563444641e-05, + "loss": 0.8528, + "step": 238 + }, + { + "epoch": 0.010478528781078862, + "grad_norm": 0.84765625, + "learning_rate": 4.930126737027239e-05, + "loss": 0.8917, + "step": 239 + }, + { + "epoch": 0.010522371997736096, + "grad_norm": 0.83203125, + "learning_rate": 4.929729921355143e-05, + "loss": 0.9132, + "step": 240 + }, + { + "epoch": 0.01056621521439333, + "grad_norm": 0.86328125, + "learning_rate": 4.929333116429191e-05, + "loss": 0.9906, + "step": 241 + }, + { + "epoch": 0.010610058431050564, + "grad_norm": 0.765625, + "learning_rate": 4.9289363222502316e-05, + "loss": 0.8452, + "step": 242 + }, + { + "epoch": 0.010653901647707797, + "grad_norm": 0.8203125, + "learning_rate": 4.928539538819101e-05, + "loss": 0.8529, + "step": 243 + }, + { + "epoch": 0.010697744864365031, + "grad_norm": 0.80078125, + "learning_rate": 4.928142766136642e-05, + "loss": 0.9488, + "step": 244 + }, + { + "epoch": 0.010741588081022265, + "grad_norm": 0.8046875, + "learning_rate": 4.927746004203695e-05, + "loss": 0.8252, + "step": 245 + }, + { + "epoch": 0.010785431297679499, + "grad_norm": 0.8359375, + "learning_rate": 4.927349253021099e-05, + "loss": 0.9818, + "step": 246 + }, + { + "epoch": 0.010829274514336731, + "grad_norm": 0.81640625, + "learning_rate": 4.9269525125896975e-05, + "loss": 0.8125, + "step": 247 + }, + { + "epoch": 0.010873117730993965, + "grad_norm": 1.15625, + "learning_rate": 4.926555782910329e-05, + "loss": 0.7566, + "step": 248 + }, + { + "epoch": 0.010916960947651199, + "grad_norm": 0.80078125, + "learning_rate": 4.926159063983833e-05, + "loss": 0.8182, + "step": 249 + }, + { + "epoch": 0.010960804164308433, + "grad_norm": 0.8515625, + "learning_rate": 4.9257623558110555e-05, + "loss": 0.8388, + "step": 250 + }, + { + "epoch": 0.011004647380965666, + "grad_norm": 0.8671875, + "learning_rate": 4.925365658392835e-05, + "loss": 0.8688, + "step": 251 + }, + { + "epoch": 0.0110484905976229, + "grad_norm": 0.84765625, + "learning_rate": 4.924968971730013e-05, + "loss": 0.8038, + "step": 252 + }, + { + "epoch": 0.011092333814280134, + "grad_norm": 0.8828125, + "learning_rate": 4.924572295823429e-05, + "loss": 0.8992, + "step": 253 + }, + { + "epoch": 0.011136177030937368, + "grad_norm": 0.83203125, + "learning_rate": 4.9241756306739185e-05, + "loss": 0.9462, + "step": 254 + }, + { + "epoch": 0.011180020247594602, + "grad_norm": 0.76953125, + "learning_rate": 4.9237789762823325e-05, + "loss": 0.8978, + "step": 255 + }, + { + "epoch": 0.011223863464251835, + "grad_norm": 0.91796875, + "learning_rate": 4.923382332649506e-05, + "loss": 0.8479, + "step": 256 + }, + { + "epoch": 0.01126770668090907, + "grad_norm": 1.1015625, + "learning_rate": 4.922985699776283e-05, + "loss": 0.9143, + "step": 257 + }, + { + "epoch": 0.011311549897566303, + "grad_norm": 0.79296875, + "learning_rate": 4.922589077663499e-05, + "loss": 0.7431, + "step": 258 + }, + { + "epoch": 0.011355393114223537, + "grad_norm": 0.80859375, + "learning_rate": 4.9221924663119946e-05, + "loss": 0.8086, + "step": 259 + }, + { + "epoch": 0.01139923633088077, + "grad_norm": 0.8125, + "learning_rate": 4.921795865722616e-05, + "loss": 0.8557, + "step": 260 + }, + { + "epoch": 0.011443079547538005, + "grad_norm": 0.80078125, + "learning_rate": 4.921399275896201e-05, + "loss": 0.7435, + "step": 261 + }, + { + "epoch": 0.011486922764195238, + "grad_norm": 0.796875, + "learning_rate": 4.921002696833591e-05, + "loss": 0.7888, + "step": 262 + }, + { + "epoch": 0.011530765980852472, + "grad_norm": 0.84375, + "learning_rate": 4.920606128535624e-05, + "loss": 0.7889, + "step": 263 + }, + { + "epoch": 0.011574609197509706, + "grad_norm": 1.21875, + "learning_rate": 4.920209571003139e-05, + "loss": 0.9391, + "step": 264 + }, + { + "epoch": 0.011618452414166938, + "grad_norm": 0.95703125, + "learning_rate": 4.919813024236983e-05, + "loss": 0.9565, + "step": 265 + }, + { + "epoch": 0.011662295630824172, + "grad_norm": 0.7734375, + "learning_rate": 4.9194164882379936e-05, + "loss": 0.7463, + "step": 266 + }, + { + "epoch": 0.011706138847481406, + "grad_norm": 0.921875, + "learning_rate": 4.91901996300701e-05, + "loss": 0.9365, + "step": 267 + }, + { + "epoch": 0.01174998206413864, + "grad_norm": 1.1328125, + "learning_rate": 4.918623448544874e-05, + "loss": 0.7788, + "step": 268 + }, + { + "epoch": 0.011793825280795874, + "grad_norm": 0.90625, + "learning_rate": 4.91822694485242e-05, + "loss": 0.8689, + "step": 269 + }, + { + "epoch": 0.011837668497453107, + "grad_norm": 0.87109375, + "learning_rate": 4.9178304519304984e-05, + "loss": 0.8732, + "step": 270 + }, + { + "epoch": 0.011881511714110341, + "grad_norm": 0.82421875, + "learning_rate": 4.9174339697799445e-05, + "loss": 0.8271, + "step": 271 + }, + { + "epoch": 0.011925354930767575, + "grad_norm": 0.7734375, + "learning_rate": 4.917037498401598e-05, + "loss": 0.7494, + "step": 272 + }, + { + "epoch": 0.011969198147424809, + "grad_norm": 0.80078125, + "learning_rate": 4.9166410377963e-05, + "loss": 0.7875, + "step": 273 + }, + { + "epoch": 0.012013041364082043, + "grad_norm": 0.80078125, + "learning_rate": 4.916244587964888e-05, + "loss": 0.8375, + "step": 274 + }, + { + "epoch": 0.012056884580739276, + "grad_norm": 0.828125, + "learning_rate": 4.9158481489082084e-05, + "loss": 0.8705, + "step": 275 + }, + { + "epoch": 0.01210072779739651, + "grad_norm": 0.83984375, + "learning_rate": 4.915451720627098e-05, + "loss": 1.045, + "step": 276 + }, + { + "epoch": 0.012144571014053744, + "grad_norm": 0.83203125, + "learning_rate": 4.915055303122397e-05, + "loss": 0.864, + "step": 277 + }, + { + "epoch": 0.012188414230710978, + "grad_norm": 0.87109375, + "learning_rate": 4.9146588963949456e-05, + "loss": 0.8254, + "step": 278 + }, + { + "epoch": 0.012232257447368212, + "grad_norm": 0.87109375, + "learning_rate": 4.914262500445584e-05, + "loss": 0.8439, + "step": 279 + }, + { + "epoch": 0.012276100664025446, + "grad_norm": 0.79296875, + "learning_rate": 4.913866115275149e-05, + "loss": 0.8367, + "step": 280 + }, + { + "epoch": 0.01231994388068268, + "grad_norm": 0.80078125, + "learning_rate": 4.9134697408844874e-05, + "loss": 0.8986, + "step": 281 + }, + { + "epoch": 0.012363787097339913, + "grad_norm": 0.71484375, + "learning_rate": 4.913073377274437e-05, + "loss": 0.8905, + "step": 282 + }, + { + "epoch": 0.012407630313997145, + "grad_norm": 0.84765625, + "learning_rate": 4.912677024445834e-05, + "loss": 0.8692, + "step": 283 + }, + { + "epoch": 0.01245147353065438, + "grad_norm": 0.92578125, + "learning_rate": 4.9122806823995236e-05, + "loss": 0.8898, + "step": 284 + }, + { + "epoch": 0.012495316747311613, + "grad_norm": 0.921875, + "learning_rate": 4.911884351136343e-05, + "loss": 0.8803, + "step": 285 + }, + { + "epoch": 0.012539159963968847, + "grad_norm": 0.90625, + "learning_rate": 4.911488030657133e-05, + "loss": 0.8027, + "step": 286 + }, + { + "epoch": 0.01258300318062608, + "grad_norm": 0.76171875, + "learning_rate": 4.911091720962733e-05, + "loss": 0.9842, + "step": 287 + }, + { + "epoch": 0.012626846397283314, + "grad_norm": 0.84375, + "learning_rate": 4.910695422053984e-05, + "loss": 0.8083, + "step": 288 + }, + { + "epoch": 0.012670689613940548, + "grad_norm": 0.84765625, + "learning_rate": 4.910299133931719e-05, + "loss": 0.8819, + "step": 289 + }, + { + "epoch": 0.012714532830597782, + "grad_norm": 0.8671875, + "learning_rate": 4.9099028565967895e-05, + "loss": 0.8287, + "step": 290 + }, + { + "epoch": 0.012758376047255016, + "grad_norm": 0.83203125, + "learning_rate": 4.9095065900500304e-05, + "loss": 0.85, + "step": 291 + }, + { + "epoch": 0.01280221926391225, + "grad_norm": 0.7890625, + "learning_rate": 4.90911033429228e-05, + "loss": 0.796, + "step": 292 + }, + { + "epoch": 0.012846062480569484, + "grad_norm": 0.8828125, + "learning_rate": 4.908714089324381e-05, + "loss": 0.9037, + "step": 293 + }, + { + "epoch": 0.012889905697226717, + "grad_norm": 0.80859375, + "learning_rate": 4.90831785514717e-05, + "loss": 0.7762, + "step": 294 + }, + { + "epoch": 0.012933748913883951, + "grad_norm": 0.93359375, + "learning_rate": 4.907921631761485e-05, + "loss": 0.8844, + "step": 295 + }, + { + "epoch": 0.012977592130541185, + "grad_norm": 0.9296875, + "learning_rate": 4.907525419168173e-05, + "loss": 0.7813, + "step": 296 + }, + { + "epoch": 0.013021435347198419, + "grad_norm": 0.84375, + "learning_rate": 4.9071292173680694e-05, + "loss": 0.9135, + "step": 297 + }, + { + "epoch": 0.013065278563855653, + "grad_norm": 0.78125, + "learning_rate": 4.9067330263620145e-05, + "loss": 0.9442, + "step": 298 + }, + { + "epoch": 0.013109121780512887, + "grad_norm": 0.88671875, + "learning_rate": 4.906336846150848e-05, + "loss": 0.8806, + "step": 299 + }, + { + "epoch": 0.01315296499717012, + "grad_norm": 0.84375, + "learning_rate": 4.905940676735405e-05, + "loss": 0.8596, + "step": 300 + }, + { + "epoch": 0.013196808213827354, + "grad_norm": 0.75, + "learning_rate": 4.905544518116534e-05, + "loss": 0.8673, + "step": 301 + }, + { + "epoch": 0.013240651430484586, + "grad_norm": 0.734375, + "learning_rate": 4.9051483702950694e-05, + "loss": 0.8887, + "step": 302 + }, + { + "epoch": 0.01328449464714182, + "grad_norm": 0.8125, + "learning_rate": 4.9047522332718534e-05, + "loss": 0.8749, + "step": 303 + }, + { + "epoch": 0.013328337863799054, + "grad_norm": 0.84765625, + "learning_rate": 4.904356107047722e-05, + "loss": 0.951, + "step": 304 + }, + { + "epoch": 0.013372181080456288, + "grad_norm": 0.80078125, + "learning_rate": 4.9039599916235134e-05, + "loss": 0.8814, + "step": 305 + }, + { + "epoch": 0.013416024297113522, + "grad_norm": 1.0, + "learning_rate": 4.9035638870000746e-05, + "loss": 0.9642, + "step": 306 + }, + { + "epoch": 0.013459867513770755, + "grad_norm": 0.8359375, + "learning_rate": 4.903167793178239e-05, + "loss": 0.8942, + "step": 307 + }, + { + "epoch": 0.01350371073042799, + "grad_norm": 0.7265625, + "learning_rate": 4.90277171015885e-05, + "loss": 0.8147, + "step": 308 + }, + { + "epoch": 0.013547553947085223, + "grad_norm": 0.80078125, + "learning_rate": 4.902375637942744e-05, + "loss": 0.7401, + "step": 309 + }, + { + "epoch": 0.013591397163742457, + "grad_norm": 0.79296875, + "learning_rate": 4.9019795765307584e-05, + "loss": 0.8283, + "step": 310 + }, + { + "epoch": 0.01363524038039969, + "grad_norm": 1.1171875, + "learning_rate": 4.901583525923738e-05, + "loss": 0.8187, + "step": 311 + }, + { + "epoch": 0.013679083597056925, + "grad_norm": 0.94921875, + "learning_rate": 4.901187486122523e-05, + "loss": 1.0098, + "step": 312 + }, + { + "epoch": 0.013722926813714158, + "grad_norm": 0.80859375, + "learning_rate": 4.900791457127947e-05, + "loss": 0.761, + "step": 313 + }, + { + "epoch": 0.013766770030371392, + "grad_norm": 0.87109375, + "learning_rate": 4.9003954389408525e-05, + "loss": 0.9011, + "step": 314 + }, + { + "epoch": 0.013810613247028626, + "grad_norm": 0.8359375, + "learning_rate": 4.899999431562075e-05, + "loss": 0.9243, + "step": 315 + }, + { + "epoch": 0.01385445646368586, + "grad_norm": 0.77734375, + "learning_rate": 4.89960343499246e-05, + "loss": 0.7063, + "step": 316 + }, + { + "epoch": 0.013898299680343094, + "grad_norm": 0.95703125, + "learning_rate": 4.899207449232845e-05, + "loss": 0.7827, + "step": 317 + }, + { + "epoch": 0.013942142897000328, + "grad_norm": 1.125, + "learning_rate": 4.898811474284068e-05, + "loss": 0.9359, + "step": 318 + }, + { + "epoch": 0.013985986113657561, + "grad_norm": 0.76171875, + "learning_rate": 4.8984155101469697e-05, + "loss": 0.9044, + "step": 319 + }, + { + "epoch": 0.014029829330314793, + "grad_norm": 0.82421875, + "learning_rate": 4.898019556822383e-05, + "loss": 0.9531, + "step": 320 + }, + { + "epoch": 0.014073672546972027, + "grad_norm": 0.78515625, + "learning_rate": 4.897623614311156e-05, + "loss": 0.8352, + "step": 321 + }, + { + "epoch": 0.014117515763629261, + "grad_norm": 0.8671875, + "learning_rate": 4.897227682614124e-05, + "loss": 0.8428, + "step": 322 + }, + { + "epoch": 0.014161358980286495, + "grad_norm": 0.84375, + "learning_rate": 4.896831761732127e-05, + "loss": 0.7798, + "step": 323 + }, + { + "epoch": 0.014205202196943729, + "grad_norm": 0.8359375, + "learning_rate": 4.896435851666004e-05, + "loss": 0.9632, + "step": 324 + }, + { + "epoch": 0.014249045413600963, + "grad_norm": 0.7578125, + "learning_rate": 4.896039952416591e-05, + "loss": 0.809, + "step": 325 + }, + { + "epoch": 0.014292888630258196, + "grad_norm": 0.87109375, + "learning_rate": 4.895644063984728e-05, + "loss": 0.9257, + "step": 326 + }, + { + "epoch": 0.01433673184691543, + "grad_norm": 0.79296875, + "learning_rate": 4.895248186371258e-05, + "loss": 0.7274, + "step": 327 + }, + { + "epoch": 0.014380575063572664, + "grad_norm": 0.796875, + "learning_rate": 4.894852319577019e-05, + "loss": 0.7734, + "step": 328 + }, + { + "epoch": 0.014424418280229898, + "grad_norm": 0.8671875, + "learning_rate": 4.894456463602849e-05, + "loss": 0.8839, + "step": 329 + }, + { + "epoch": 0.014468261496887132, + "grad_norm": 0.83984375, + "learning_rate": 4.894060618449585e-05, + "loss": 0.7676, + "step": 330 + }, + { + "epoch": 0.014512104713544366, + "grad_norm": 0.90234375, + "learning_rate": 4.893664784118069e-05, + "loss": 0.778, + "step": 331 + }, + { + "epoch": 0.0145559479302016, + "grad_norm": 0.8515625, + "learning_rate": 4.8932689606091386e-05, + "loss": 1.0188, + "step": 332 + }, + { + "epoch": 0.014599791146858833, + "grad_norm": 0.9375, + "learning_rate": 4.892873147923632e-05, + "loss": 0.8581, + "step": 333 + }, + { + "epoch": 0.014643634363516067, + "grad_norm": 0.84765625, + "learning_rate": 4.89247734606239e-05, + "loss": 0.9083, + "step": 334 + }, + { + "epoch": 0.014687477580173301, + "grad_norm": 0.90625, + "learning_rate": 4.892081555026245e-05, + "loss": 0.9071, + "step": 335 + }, + { + "epoch": 0.014731320796830535, + "grad_norm": 1.0546875, + "learning_rate": 4.891685774816046e-05, + "loss": 0.7674, + "step": 336 + }, + { + "epoch": 0.014775164013487769, + "grad_norm": 0.921875, + "learning_rate": 4.891290005432626e-05, + "loss": 1.0123, + "step": 337 + }, + { + "epoch": 0.014819007230145, + "grad_norm": 0.82421875, + "learning_rate": 4.890894246876826e-05, + "loss": 0.843, + "step": 338 + }, + { + "epoch": 0.014862850446802234, + "grad_norm": 0.93359375, + "learning_rate": 4.8904984991494826e-05, + "loss": 0.8486, + "step": 339 + }, + { + "epoch": 0.014906693663459468, + "grad_norm": 0.8671875, + "learning_rate": 4.890102762251433e-05, + "loss": 0.8797, + "step": 340 + }, + { + "epoch": 0.014950536880116702, + "grad_norm": 0.93359375, + "learning_rate": 4.889707036183522e-05, + "loss": 0.8694, + "step": 341 + }, + { + "epoch": 0.014994380096773936, + "grad_norm": 0.94921875, + "learning_rate": 4.8893113209465844e-05, + "loss": 0.7968, + "step": 342 + }, + { + "epoch": 0.01503822331343117, + "grad_norm": 0.97265625, + "learning_rate": 4.88891561654146e-05, + "loss": 0.8781, + "step": 343 + }, + { + "epoch": 0.015082066530088404, + "grad_norm": 0.859375, + "learning_rate": 4.888519922968986e-05, + "loss": 0.8264, + "step": 344 + }, + { + "epoch": 0.015125909746745637, + "grad_norm": 0.84375, + "learning_rate": 4.888124240229999e-05, + "loss": 0.8713, + "step": 345 + }, + { + "epoch": 0.015169752963402871, + "grad_norm": 0.89453125, + "learning_rate": 4.887728568325343e-05, + "loss": 0.9656, + "step": 346 + }, + { + "epoch": 0.015213596180060105, + "grad_norm": 0.86328125, + "learning_rate": 4.887332907255855e-05, + "loss": 0.8017, + "step": 347 + }, + { + "epoch": 0.015257439396717339, + "grad_norm": 0.87109375, + "learning_rate": 4.8869372570223725e-05, + "loss": 0.8521, + "step": 348 + }, + { + "epoch": 0.015301282613374573, + "grad_norm": 0.79296875, + "learning_rate": 4.8865416176257336e-05, + "loss": 0.7503, + "step": 349 + }, + { + "epoch": 0.015345125830031807, + "grad_norm": 1.0234375, + "learning_rate": 4.886145989066775e-05, + "loss": 0.8797, + "step": 350 + }, + { + "epoch": 0.01538896904668904, + "grad_norm": 0.80859375, + "learning_rate": 4.8857503713463406e-05, + "loss": 0.852, + "step": 351 + }, + { + "epoch": 0.015432812263346274, + "grad_norm": 0.8671875, + "learning_rate": 4.8853547644652664e-05, + "loss": 0.7603, + "step": 352 + }, + { + "epoch": 0.015476655480003508, + "grad_norm": 0.8515625, + "learning_rate": 4.8849591684243904e-05, + "loss": 0.8404, + "step": 353 + }, + { + "epoch": 0.015520498696660742, + "grad_norm": 0.859375, + "learning_rate": 4.884563583224551e-05, + "loss": 0.7463, + "step": 354 + }, + { + "epoch": 0.015564341913317976, + "grad_norm": 0.765625, + "learning_rate": 4.884168008866582e-05, + "loss": 0.7974, + "step": 355 + }, + { + "epoch": 0.015608185129975208, + "grad_norm": 0.90234375, + "learning_rate": 4.883772445351331e-05, + "loss": 0.8703, + "step": 356 + }, + { + "epoch": 0.015652028346632443, + "grad_norm": 0.85546875, + "learning_rate": 4.883376892679632e-05, + "loss": 0.8195, + "step": 357 + }, + { + "epoch": 0.015695871563289675, + "grad_norm": 0.9140625, + "learning_rate": 4.882981350852322e-05, + "loss": 1.0289, + "step": 358 + }, + { + "epoch": 0.01573971477994691, + "grad_norm": 0.8203125, + "learning_rate": 4.8825858198702425e-05, + "loss": 0.8706, + "step": 359 + }, + { + "epoch": 0.015783557996604143, + "grad_norm": 0.796875, + "learning_rate": 4.882190299734225e-05, + "loss": 0.7424, + "step": 360 + }, + { + "epoch": 0.01582740121326138, + "grad_norm": 0.87109375, + "learning_rate": 4.881794790445118e-05, + "loss": 0.8289, + "step": 361 + }, + { + "epoch": 0.01587124442991861, + "grad_norm": 0.859375, + "learning_rate": 4.881399292003752e-05, + "loss": 0.8104, + "step": 362 + }, + { + "epoch": 0.015915087646575846, + "grad_norm": 0.875, + "learning_rate": 4.8810038044109694e-05, + "loss": 0.941, + "step": 363 + }, + { + "epoch": 0.01595893086323308, + "grad_norm": 0.80859375, + "learning_rate": 4.880608327667605e-05, + "loss": 0.8197, + "step": 364 + }, + { + "epoch": 0.016002774079890314, + "grad_norm": 0.85546875, + "learning_rate": 4.880212861774497e-05, + "loss": 0.8605, + "step": 365 + }, + { + "epoch": 0.016046617296547546, + "grad_norm": 0.76171875, + "learning_rate": 4.8798174067324874e-05, + "loss": 0.7394, + "step": 366 + }, + { + "epoch": 0.016090460513204778, + "grad_norm": 0.89453125, + "learning_rate": 4.879421962542412e-05, + "loss": 0.7741, + "step": 367 + }, + { + "epoch": 0.016134303729862014, + "grad_norm": 1.421875, + "learning_rate": 4.8790265292051096e-05, + "loss": 0.8532, + "step": 368 + }, + { + "epoch": 0.016178146946519246, + "grad_norm": 0.7421875, + "learning_rate": 4.8786311067214186e-05, + "loss": 0.7703, + "step": 369 + }, + { + "epoch": 0.01622199016317648, + "grad_norm": 0.95703125, + "learning_rate": 4.878235695092175e-05, + "loss": 0.8496, + "step": 370 + }, + { + "epoch": 0.016265833379833713, + "grad_norm": 0.90234375, + "learning_rate": 4.877840294318219e-05, + "loss": 0.8791, + "step": 371 + }, + { + "epoch": 0.01630967659649095, + "grad_norm": 0.8984375, + "learning_rate": 4.877444904400387e-05, + "loss": 1.009, + "step": 372 + }, + { + "epoch": 0.01635351981314818, + "grad_norm": 0.8359375, + "learning_rate": 4.8770495253395174e-05, + "loss": 0.9077, + "step": 373 + }, + { + "epoch": 0.016397363029805417, + "grad_norm": 0.77734375, + "learning_rate": 4.8766541571364496e-05, + "loss": 0.8158, + "step": 374 + }, + { + "epoch": 0.01644120624646265, + "grad_norm": 0.8046875, + "learning_rate": 4.876258799792016e-05, + "loss": 0.814, + "step": 375 + }, + { + "epoch": 0.016485049463119884, + "grad_norm": 0.90234375, + "learning_rate": 4.8758634533070635e-05, + "loss": 0.8713, + "step": 376 + }, + { + "epoch": 0.016528892679777116, + "grad_norm": 0.91015625, + "learning_rate": 4.8754681176824244e-05, + "loss": 0.8241, + "step": 377 + }, + { + "epoch": 0.016572735896434352, + "grad_norm": 0.859375, + "learning_rate": 4.875072792918939e-05, + "loss": 0.8058, + "step": 378 + }, + { + "epoch": 0.016616579113091584, + "grad_norm": 1.1640625, + "learning_rate": 4.874677479017443e-05, + "loss": 0.7905, + "step": 379 + }, + { + "epoch": 0.01666042232974882, + "grad_norm": 0.84765625, + "learning_rate": 4.8742821759787714e-05, + "loss": 0.9253, + "step": 380 + }, + { + "epoch": 0.016704265546406052, + "grad_norm": 0.83203125, + "learning_rate": 4.87388688380377e-05, + "loss": 0.8338, + "step": 381 + }, + { + "epoch": 0.016748108763063287, + "grad_norm": 0.79296875, + "learning_rate": 4.873491602493272e-05, + "loss": 0.77, + "step": 382 + }, + { + "epoch": 0.01679195197972052, + "grad_norm": 0.80078125, + "learning_rate": 4.873096332048116e-05, + "loss": 0.7683, + "step": 383 + }, + { + "epoch": 0.01683579519637775, + "grad_norm": 0.828125, + "learning_rate": 4.872701072469139e-05, + "loss": 0.7457, + "step": 384 + }, + { + "epoch": 0.016879638413034987, + "grad_norm": 0.8515625, + "learning_rate": 4.872305823757175e-05, + "loss": 0.9194, + "step": 385 + }, + { + "epoch": 0.01692348162969222, + "grad_norm": 0.77734375, + "learning_rate": 4.87191058591307e-05, + "loss": 0.8988, + "step": 386 + }, + { + "epoch": 0.016967324846349455, + "grad_norm": 0.890625, + "learning_rate": 4.871515358937657e-05, + "loss": 0.8844, + "step": 387 + }, + { + "epoch": 0.017011168063006687, + "grad_norm": 0.84375, + "learning_rate": 4.8711201428317746e-05, + "loss": 0.8642, + "step": 388 + }, + { + "epoch": 0.017055011279663922, + "grad_norm": 1.0390625, + "learning_rate": 4.8707249375962595e-05, + "loss": 1.0417, + "step": 389 + }, + { + "epoch": 0.017098854496321154, + "grad_norm": 0.91015625, + "learning_rate": 4.8703297432319453e-05, + "loss": 0.9457, + "step": 390 + }, + { + "epoch": 0.01714269771297839, + "grad_norm": 0.8828125, + "learning_rate": 4.8699345597396786e-05, + "loss": 0.8286, + "step": 391 + }, + { + "epoch": 0.017186540929635622, + "grad_norm": 0.8125, + "learning_rate": 4.869539387120292e-05, + "loss": 0.805, + "step": 392 + }, + { + "epoch": 0.017230384146292858, + "grad_norm": 0.828125, + "learning_rate": 4.869144225374623e-05, + "loss": 0.9601, + "step": 393 + }, + { + "epoch": 0.01727422736295009, + "grad_norm": 0.85546875, + "learning_rate": 4.86874907450351e-05, + "loss": 0.8979, + "step": 394 + }, + { + "epoch": 0.017318070579607325, + "grad_norm": 0.91015625, + "learning_rate": 4.8683539345077864e-05, + "loss": 0.776, + "step": 395 + }, + { + "epoch": 0.017361913796264557, + "grad_norm": 0.93359375, + "learning_rate": 4.867958805388297e-05, + "loss": 0.8924, + "step": 396 + }, + { + "epoch": 0.017405757012921793, + "grad_norm": 0.84375, + "learning_rate": 4.8675636871458766e-05, + "loss": 0.8558, + "step": 397 + }, + { + "epoch": 0.017449600229579025, + "grad_norm": 0.890625, + "learning_rate": 4.867168579781361e-05, + "loss": 0.9742, + "step": 398 + }, + { + "epoch": 0.01749344344623626, + "grad_norm": 0.8125, + "learning_rate": 4.866773483295588e-05, + "loss": 0.781, + "step": 399 + }, + { + "epoch": 0.017537286662893493, + "grad_norm": 0.83203125, + "learning_rate": 4.8663783976893906e-05, + "loss": 0.8143, + "step": 400 + }, + { + "epoch": 0.01758112987955073, + "grad_norm": 0.95703125, + "learning_rate": 4.8659833229636156e-05, + "loss": 0.8317, + "step": 401 + }, + { + "epoch": 0.01762497309620796, + "grad_norm": 0.8984375, + "learning_rate": 4.865588259119095e-05, + "loss": 0.8986, + "step": 402 + }, + { + "epoch": 0.017668816312865192, + "grad_norm": 0.859375, + "learning_rate": 4.865193206156667e-05, + "loss": 0.8488, + "step": 403 + }, + { + "epoch": 0.017712659529522428, + "grad_norm": 1.3359375, + "learning_rate": 4.864798164077168e-05, + "loss": 0.9383, + "step": 404 + }, + { + "epoch": 0.01775650274617966, + "grad_norm": 0.83203125, + "learning_rate": 4.864403132881432e-05, + "loss": 0.7669, + "step": 405 + }, + { + "epoch": 0.017800345962836896, + "grad_norm": 0.82421875, + "learning_rate": 4.8640081125703054e-05, + "loss": 0.9284, + "step": 406 + }, + { + "epoch": 0.017844189179494128, + "grad_norm": 0.8671875, + "learning_rate": 4.8636131031446184e-05, + "loss": 0.9174, + "step": 407 + }, + { + "epoch": 0.017888032396151363, + "grad_norm": 0.765625, + "learning_rate": 4.86321810460521e-05, + "loss": 0.834, + "step": 408 + }, + { + "epoch": 0.017931875612808595, + "grad_norm": 0.87109375, + "learning_rate": 4.862823116952917e-05, + "loss": 1.0201, + "step": 409 + }, + { + "epoch": 0.01797571882946583, + "grad_norm": 0.83203125, + "learning_rate": 4.8624281401885776e-05, + "loss": 0.8336, + "step": 410 + }, + { + "epoch": 0.018019562046123063, + "grad_norm": 0.8828125, + "learning_rate": 4.862033174313028e-05, + "loss": 0.8119, + "step": 411 + }, + { + "epoch": 0.0180634052627803, + "grad_norm": 0.89453125, + "learning_rate": 4.861638219327101e-05, + "loss": 0.8798, + "step": 412 + }, + { + "epoch": 0.01810724847943753, + "grad_norm": 0.78125, + "learning_rate": 4.861243275231642e-05, + "loss": 0.8028, + "step": 413 + }, + { + "epoch": 0.018151091696094766, + "grad_norm": 0.8828125, + "learning_rate": 4.860848342027484e-05, + "loss": 0.9029, + "step": 414 + }, + { + "epoch": 0.018194934912752, + "grad_norm": 0.77734375, + "learning_rate": 4.860453419715465e-05, + "loss": 0.7523, + "step": 415 + }, + { + "epoch": 0.018238778129409234, + "grad_norm": 0.8359375, + "learning_rate": 4.860058508296421e-05, + "loss": 0.8171, + "step": 416 + }, + { + "epoch": 0.018282621346066466, + "grad_norm": 0.796875, + "learning_rate": 4.8596636077711885e-05, + "loss": 0.7536, + "step": 417 + }, + { + "epoch": 0.0183264645627237, + "grad_norm": 0.8359375, + "learning_rate": 4.859268718140606e-05, + "loss": 0.8907, + "step": 418 + }, + { + "epoch": 0.018370307779380934, + "grad_norm": 0.94140625, + "learning_rate": 4.85887383940551e-05, + "loss": 0.9629, + "step": 419 + }, + { + "epoch": 0.018414150996038166, + "grad_norm": 0.9375, + "learning_rate": 4.858478971566736e-05, + "loss": 0.8217, + "step": 420 + }, + { + "epoch": 0.0184579942126954, + "grad_norm": 0.85546875, + "learning_rate": 4.858084114625119e-05, + "loss": 0.9297, + "step": 421 + }, + { + "epoch": 0.018501837429352633, + "grad_norm": 1.296875, + "learning_rate": 4.857689268581501e-05, + "loss": 0.886, + "step": 422 + }, + { + "epoch": 0.01854568064600987, + "grad_norm": 0.77734375, + "learning_rate": 4.857294433436719e-05, + "loss": 0.6712, + "step": 423 + }, + { + "epoch": 0.0185895238626671, + "grad_norm": 0.7734375, + "learning_rate": 4.8568996091916056e-05, + "loss": 0.7573, + "step": 424 + }, + { + "epoch": 0.018633367079324337, + "grad_norm": 0.75390625, + "learning_rate": 4.856504795847e-05, + "loss": 0.8098, + "step": 425 + }, + { + "epoch": 0.01867721029598157, + "grad_norm": 0.80859375, + "learning_rate": 4.856109993403735e-05, + "loss": 0.836, + "step": 426 + }, + { + "epoch": 0.018721053512638804, + "grad_norm": 0.76953125, + "learning_rate": 4.855715201862655e-05, + "loss": 0.8002, + "step": 427 + }, + { + "epoch": 0.018764896729296036, + "grad_norm": 0.8515625, + "learning_rate": 4.8553204212245917e-05, + "loss": 0.8441, + "step": 428 + }, + { + "epoch": 0.018808739945953272, + "grad_norm": 0.8125, + "learning_rate": 4.8549256514903826e-05, + "loss": 0.82, + "step": 429 + }, + { + "epoch": 0.018852583162610504, + "grad_norm": 1.0390625, + "learning_rate": 4.8545308926608656e-05, + "loss": 0.8263, + "step": 430 + }, + { + "epoch": 0.01889642637926774, + "grad_norm": 0.828125, + "learning_rate": 4.8541361447368714e-05, + "loss": 0.9201, + "step": 431 + }, + { + "epoch": 0.01894026959592497, + "grad_norm": 0.71484375, + "learning_rate": 4.853741407719247e-05, + "loss": 0.8131, + "step": 432 + }, + { + "epoch": 0.018984112812582207, + "grad_norm": 0.984375, + "learning_rate": 4.8533466816088224e-05, + "loss": 0.8001, + "step": 433 + }, + { + "epoch": 0.01902795602923944, + "grad_norm": 0.8828125, + "learning_rate": 4.8529519664064346e-05, + "loss": 0.9737, + "step": 434 + }, + { + "epoch": 0.019071799245896675, + "grad_norm": 0.7890625, + "learning_rate": 4.8525572621129225e-05, + "loss": 0.7274, + "step": 435 + }, + { + "epoch": 0.019115642462553907, + "grad_norm": 1.0078125, + "learning_rate": 4.852162568729116e-05, + "loss": 0.9375, + "step": 436 + }, + { + "epoch": 0.019159485679211143, + "grad_norm": 1.09375, + "learning_rate": 4.851767886255861e-05, + "loss": 0.8482, + "step": 437 + }, + { + "epoch": 0.019203328895868375, + "grad_norm": 0.9140625, + "learning_rate": 4.851373214693989e-05, + "loss": 0.8939, + "step": 438 + }, + { + "epoch": 0.019247172112525607, + "grad_norm": 1.09375, + "learning_rate": 4.850978554044339e-05, + "loss": 0.8342, + "step": 439 + }, + { + "epoch": 0.019291015329182842, + "grad_norm": 1.15625, + "learning_rate": 4.850583904307744e-05, + "loss": 0.9333, + "step": 440 + }, + { + "epoch": 0.019334858545840074, + "grad_norm": 0.90234375, + "learning_rate": 4.850189265485039e-05, + "loss": 0.7383, + "step": 441 + }, + { + "epoch": 0.01937870176249731, + "grad_norm": 0.85546875, + "learning_rate": 4.8497946375770664e-05, + "loss": 0.8116, + "step": 442 + }, + { + "epoch": 0.019422544979154542, + "grad_norm": 0.9140625, + "learning_rate": 4.8494000205846604e-05, + "loss": 0.9949, + "step": 443 + }, + { + "epoch": 0.019466388195811778, + "grad_norm": 0.7890625, + "learning_rate": 4.849005414508657e-05, + "loss": 0.8835, + "step": 444 + }, + { + "epoch": 0.01951023141246901, + "grad_norm": 0.84375, + "learning_rate": 4.8486108193498915e-05, + "loss": 0.8915, + "step": 445 + }, + { + "epoch": 0.019554074629126245, + "grad_norm": 0.87890625, + "learning_rate": 4.8482162351091976e-05, + "loss": 0.8655, + "step": 446 + }, + { + "epoch": 0.019597917845783477, + "grad_norm": 0.84375, + "learning_rate": 4.8478216617874184e-05, + "loss": 0.787, + "step": 447 + }, + { + "epoch": 0.019641761062440713, + "grad_norm": 0.90625, + "learning_rate": 4.847427099385387e-05, + "loss": 0.8651, + "step": 448 + }, + { + "epoch": 0.019685604279097945, + "grad_norm": 0.83203125, + "learning_rate": 4.847032547903939e-05, + "loss": 0.9338, + "step": 449 + }, + { + "epoch": 0.01972944749575518, + "grad_norm": 0.81640625, + "learning_rate": 4.8466380073439125e-05, + "loss": 0.9468, + "step": 450 + }, + { + "epoch": 0.019773290712412413, + "grad_norm": 0.88671875, + "learning_rate": 4.846243477706136e-05, + "loss": 0.8263, + "step": 451 + }, + { + "epoch": 0.01981713392906965, + "grad_norm": 0.79296875, + "learning_rate": 4.845848958991457e-05, + "loss": 0.8715, + "step": 452 + }, + { + "epoch": 0.01986097714572688, + "grad_norm": 0.78515625, + "learning_rate": 4.845454451200706e-05, + "loss": 0.7744, + "step": 453 + }, + { + "epoch": 0.019904820362384116, + "grad_norm": 0.8671875, + "learning_rate": 4.845059954334721e-05, + "loss": 0.8411, + "step": 454 + }, + { + "epoch": 0.019948663579041348, + "grad_norm": 0.87109375, + "learning_rate": 4.8446654683943346e-05, + "loss": 1.0177, + "step": 455 + }, + { + "epoch": 0.019992506795698584, + "grad_norm": 1.0859375, + "learning_rate": 4.8442709933803864e-05, + "loss": 0.8628, + "step": 456 + }, + { + "epoch": 0.020036350012355816, + "grad_norm": 0.7734375, + "learning_rate": 4.8438765292937114e-05, + "loss": 0.8, + "step": 457 + }, + { + "epoch": 0.020080193229013048, + "grad_norm": 0.890625, + "learning_rate": 4.843482076135145e-05, + "loss": 0.8294, + "step": 458 + }, + { + "epoch": 0.020124036445670283, + "grad_norm": 0.9921875, + "learning_rate": 4.843087633905524e-05, + "loss": 0.8318, + "step": 459 + }, + { + "epoch": 0.020167879662327515, + "grad_norm": 0.859375, + "learning_rate": 4.842693202605679e-05, + "loss": 0.9173, + "step": 460 + }, + { + "epoch": 0.02021172287898475, + "grad_norm": 0.80859375, + "learning_rate": 4.842298782236456e-05, + "loss": 0.8477, + "step": 461 + }, + { + "epoch": 0.020255566095641983, + "grad_norm": 0.87890625, + "learning_rate": 4.841904372798686e-05, + "loss": 0.854, + "step": 462 + }, + { + "epoch": 0.02029940931229922, + "grad_norm": 0.8203125, + "learning_rate": 4.841509974293204e-05, + "loss": 0.8894, + "step": 463 + }, + { + "epoch": 0.02034325252895645, + "grad_norm": 0.8828125, + "learning_rate": 4.841115586720847e-05, + "loss": 0.8728, + "step": 464 + }, + { + "epoch": 0.020387095745613686, + "grad_norm": 0.84375, + "learning_rate": 4.8407212100824506e-05, + "loss": 0.9047, + "step": 465 + }, + { + "epoch": 0.02043093896227092, + "grad_norm": 0.83984375, + "learning_rate": 4.8403268443788476e-05, + "loss": 1.0778, + "step": 466 + }, + { + "epoch": 0.020474782178928154, + "grad_norm": 0.8359375, + "learning_rate": 4.83993248961088e-05, + "loss": 0.7296, + "step": 467 + }, + { + "epoch": 0.020518625395585386, + "grad_norm": 0.83203125, + "learning_rate": 4.839538145779381e-05, + "loss": 0.8719, + "step": 468 + }, + { + "epoch": 0.02056246861224262, + "grad_norm": 1.234375, + "learning_rate": 4.839143812885186e-05, + "loss": 0.8927, + "step": 469 + }, + { + "epoch": 0.020606311828899854, + "grad_norm": 0.78515625, + "learning_rate": 4.83874949092913e-05, + "loss": 0.8786, + "step": 470 + }, + { + "epoch": 0.02065015504555709, + "grad_norm": 0.80859375, + "learning_rate": 4.838355179912046e-05, + "loss": 0.8724, + "step": 471 + }, + { + "epoch": 0.02069399826221432, + "grad_norm": 0.80078125, + "learning_rate": 4.837960879834777e-05, + "loss": 0.8923, + "step": 472 + }, + { + "epoch": 0.020737841478871557, + "grad_norm": 0.74609375, + "learning_rate": 4.837566590698155e-05, + "loss": 0.7265, + "step": 473 + }, + { + "epoch": 0.02078168469552879, + "grad_norm": 0.828125, + "learning_rate": 4.837172312503015e-05, + "loss": 0.7073, + "step": 474 + }, + { + "epoch": 0.02082552791218602, + "grad_norm": 0.77734375, + "learning_rate": 4.836778045250194e-05, + "loss": 0.7592, + "step": 475 + }, + { + "epoch": 0.020869371128843257, + "grad_norm": 1.234375, + "learning_rate": 4.836383788940523e-05, + "loss": 0.7035, + "step": 476 + }, + { + "epoch": 0.02091321434550049, + "grad_norm": 0.85546875, + "learning_rate": 4.8359895435748447e-05, + "loss": 1.0148, + "step": 477 + }, + { + "epoch": 0.020957057562157724, + "grad_norm": 0.74609375, + "learning_rate": 4.8355953091539915e-05, + "loss": 0.7656, + "step": 478 + }, + { + "epoch": 0.021000900778814956, + "grad_norm": 0.84765625, + "learning_rate": 4.8352010856787997e-05, + "loss": 0.7657, + "step": 479 + }, + { + "epoch": 0.021044743995472192, + "grad_norm": 0.98828125, + "learning_rate": 4.8348068731501026e-05, + "loss": 0.8261, + "step": 480 + }, + { + "epoch": 0.021088587212129424, + "grad_norm": 0.80859375, + "learning_rate": 4.8344126715687344e-05, + "loss": 0.7439, + "step": 481 + }, + { + "epoch": 0.02113243042878666, + "grad_norm": 0.9765625, + "learning_rate": 4.834018480935537e-05, + "loss": 0.7135, + "step": 482 + }, + { + "epoch": 0.02117627364544389, + "grad_norm": 0.859375, + "learning_rate": 4.833624301251342e-05, + "loss": 0.6938, + "step": 483 + }, + { + "epoch": 0.021220116862101127, + "grad_norm": 0.85546875, + "learning_rate": 4.833230132516984e-05, + "loss": 0.859, + "step": 484 + }, + { + "epoch": 0.02126396007875836, + "grad_norm": 0.81640625, + "learning_rate": 4.8328359747333006e-05, + "loss": 0.833, + "step": 485 + }, + { + "epoch": 0.021307803295415595, + "grad_norm": 0.95703125, + "learning_rate": 4.832441827901122e-05, + "loss": 0.9659, + "step": 486 + }, + { + "epoch": 0.021351646512072827, + "grad_norm": 0.7734375, + "learning_rate": 4.832047692021291e-05, + "loss": 0.8758, + "step": 487 + }, + { + "epoch": 0.021395489728730063, + "grad_norm": 0.75390625, + "learning_rate": 4.831653567094639e-05, + "loss": 0.8145, + "step": 488 + }, + { + "epoch": 0.021439332945387295, + "grad_norm": 0.81640625, + "learning_rate": 4.831259453122003e-05, + "loss": 0.8236, + "step": 489 + }, + { + "epoch": 0.02148317616204453, + "grad_norm": 0.75390625, + "learning_rate": 4.8308653501042166e-05, + "loss": 0.7014, + "step": 490 + }, + { + "epoch": 0.021527019378701762, + "grad_norm": 0.7734375, + "learning_rate": 4.830471258042113e-05, + "loss": 0.8488, + "step": 491 + }, + { + "epoch": 0.021570862595358998, + "grad_norm": 0.87109375, + "learning_rate": 4.830077176936533e-05, + "loss": 0.9364, + "step": 492 + }, + { + "epoch": 0.02161470581201623, + "grad_norm": 0.734375, + "learning_rate": 4.8296831067883083e-05, + "loss": 0.7448, + "step": 493 + }, + { + "epoch": 0.021658549028673462, + "grad_norm": 0.7734375, + "learning_rate": 4.829289047598276e-05, + "loss": 0.8521, + "step": 494 + }, + { + "epoch": 0.021702392245330698, + "grad_norm": 0.8046875, + "learning_rate": 4.8288949993672685e-05, + "loss": 0.7794, + "step": 495 + }, + { + "epoch": 0.02174623546198793, + "grad_norm": 1.09375, + "learning_rate": 4.828500962096123e-05, + "loss": 0.7881, + "step": 496 + }, + { + "epoch": 0.021790078678645165, + "grad_norm": 0.82421875, + "learning_rate": 4.828106935785671e-05, + "loss": 0.8048, + "step": 497 + }, + { + "epoch": 0.021833921895302397, + "grad_norm": 0.9296875, + "learning_rate": 4.827712920436754e-05, + "loss": 0.8486, + "step": 498 + }, + { + "epoch": 0.021877765111959633, + "grad_norm": 0.82421875, + "learning_rate": 4.8273189160502044e-05, + "loss": 0.9004, + "step": 499 + }, + { + "epoch": 0.021921608328616865, + "grad_norm": 0.8515625, + "learning_rate": 4.826924922626855e-05, + "loss": 0.7962, + "step": 500 + }, + { + "epoch": 0.0219654515452741, + "grad_norm": 0.87890625, + "learning_rate": 4.826530940167544e-05, + "loss": 0.9006, + "step": 501 + }, + { + "epoch": 0.022009294761931333, + "grad_norm": 0.91015625, + "learning_rate": 4.826136968673105e-05, + "loss": 0.8367, + "step": 502 + }, + { + "epoch": 0.022053137978588568, + "grad_norm": 0.859375, + "learning_rate": 4.8257430081443734e-05, + "loss": 0.8891, + "step": 503 + }, + { + "epoch": 0.0220969811952458, + "grad_norm": 0.9609375, + "learning_rate": 4.8253490585821824e-05, + "loss": 0.9094, + "step": 504 + }, + { + "epoch": 0.022140824411903036, + "grad_norm": 1.1015625, + "learning_rate": 4.8249551199873686e-05, + "loss": 0.8307, + "step": 505 + }, + { + "epoch": 0.022184667628560268, + "grad_norm": 0.875, + "learning_rate": 4.8245611923607634e-05, + "loss": 0.9352, + "step": 506 + }, + { + "epoch": 0.022228510845217504, + "grad_norm": 0.83203125, + "learning_rate": 4.8241672757032084e-05, + "loss": 0.8543, + "step": 507 + }, + { + "epoch": 0.022272354061874736, + "grad_norm": 0.8046875, + "learning_rate": 4.8237733700155365e-05, + "loss": 0.8169, + "step": 508 + }, + { + "epoch": 0.02231619727853197, + "grad_norm": 0.90234375, + "learning_rate": 4.82337947529858e-05, + "loss": 0.8906, + "step": 509 + }, + { + "epoch": 0.022360040495189203, + "grad_norm": 0.77734375, + "learning_rate": 4.8229855915531743e-05, + "loss": 0.6769, + "step": 510 + }, + { + "epoch": 0.02240388371184644, + "grad_norm": 0.81640625, + "learning_rate": 4.8225917187801504e-05, + "loss": 0.8071, + "step": 511 + }, + { + "epoch": 0.02244772692850367, + "grad_norm": 0.8984375, + "learning_rate": 4.822197856980353e-05, + "loss": 0.825, + "step": 512 + }, + { + "epoch": 0.022491570145160903, + "grad_norm": 1.0234375, + "learning_rate": 4.82180400615461e-05, + "loss": 0.8478, + "step": 513 + }, + { + "epoch": 0.02253541336181814, + "grad_norm": 0.89453125, + "learning_rate": 4.821410166303757e-05, + "loss": 0.8509, + "step": 514 + }, + { + "epoch": 0.02257925657847537, + "grad_norm": 0.81640625, + "learning_rate": 4.82101633742863e-05, + "loss": 0.9218, + "step": 515 + }, + { + "epoch": 0.022623099795132606, + "grad_norm": 0.93359375, + "learning_rate": 4.820622519530059e-05, + "loss": 0.857, + "step": 516 + }, + { + "epoch": 0.02266694301178984, + "grad_norm": 0.93359375, + "learning_rate": 4.8202287126088866e-05, + "loss": 0.9243, + "step": 517 + }, + { + "epoch": 0.022710786228447074, + "grad_norm": 0.80078125, + "learning_rate": 4.819834916665943e-05, + "loss": 0.9337, + "step": 518 + }, + { + "epoch": 0.022754629445104306, + "grad_norm": 0.83203125, + "learning_rate": 4.819441131702064e-05, + "loss": 0.8485, + "step": 519 + }, + { + "epoch": 0.02279847266176154, + "grad_norm": 0.87890625, + "learning_rate": 4.8190473577180825e-05, + "loss": 0.7802, + "step": 520 + }, + { + "epoch": 0.022842315878418774, + "grad_norm": 0.84765625, + "learning_rate": 4.8186535947148304e-05, + "loss": 0.887, + "step": 521 + }, + { + "epoch": 0.02288615909507601, + "grad_norm": 0.890625, + "learning_rate": 4.818259842693149e-05, + "loss": 0.8787, + "step": 522 + }, + { + "epoch": 0.02293000231173324, + "grad_norm": 0.8984375, + "learning_rate": 4.8178661016538705e-05, + "loss": 0.9227, + "step": 523 + }, + { + "epoch": 0.022973845528390477, + "grad_norm": 0.85546875, + "learning_rate": 4.817472371597829e-05, + "loss": 0.8421, + "step": 524 + }, + { + "epoch": 0.02301768874504771, + "grad_norm": 0.73828125, + "learning_rate": 4.817078652525858e-05, + "loss": 0.8669, + "step": 525 + }, + { + "epoch": 0.023061531961704945, + "grad_norm": 1.0234375, + "learning_rate": 4.816684944438788e-05, + "loss": 0.9446, + "step": 526 + }, + { + "epoch": 0.023105375178362177, + "grad_norm": 0.91796875, + "learning_rate": 4.8162912473374634e-05, + "loss": 0.8515, + "step": 527 + }, + { + "epoch": 0.023149218395019412, + "grad_norm": 0.8046875, + "learning_rate": 4.815897561222712e-05, + "loss": 0.799, + "step": 528 + }, + { + "epoch": 0.023193061611676644, + "grad_norm": 1.1953125, + "learning_rate": 4.81550388609537e-05, + "loss": 0.9423, + "step": 529 + }, + { + "epoch": 0.023236904828333876, + "grad_norm": 0.9921875, + "learning_rate": 4.8151102219562716e-05, + "loss": 0.9355, + "step": 530 + }, + { + "epoch": 0.023280748044991112, + "grad_norm": 0.76171875, + "learning_rate": 4.814716568806246e-05, + "loss": 0.7694, + "step": 531 + }, + { + "epoch": 0.023324591261648344, + "grad_norm": 1.0546875, + "learning_rate": 4.814322926646136e-05, + "loss": 0.853, + "step": 532 + }, + { + "epoch": 0.02336843447830558, + "grad_norm": 0.796875, + "learning_rate": 4.813929295476773e-05, + "loss": 0.7173, + "step": 533 + }, + { + "epoch": 0.02341227769496281, + "grad_norm": 0.84765625, + "learning_rate": 4.81353567529899e-05, + "loss": 0.8364, + "step": 534 + }, + { + "epoch": 0.023456120911620047, + "grad_norm": 0.85546875, + "learning_rate": 4.813142066113622e-05, + "loss": 0.7981, + "step": 535 + }, + { + "epoch": 0.02349996412827728, + "grad_norm": 1.0078125, + "learning_rate": 4.8127484679214984e-05, + "loss": 0.7887, + "step": 536 + }, + { + "epoch": 0.023543807344934515, + "grad_norm": 0.7578125, + "learning_rate": 4.812354880723462e-05, + "loss": 0.9615, + "step": 537 + }, + { + "epoch": 0.023587650561591747, + "grad_norm": 0.9375, + "learning_rate": 4.811961304520344e-05, + "loss": 0.7969, + "step": 538 + }, + { + "epoch": 0.023631493778248983, + "grad_norm": 0.84375, + "learning_rate": 4.8115677393129764e-05, + "loss": 0.6854, + "step": 539 + }, + { + "epoch": 0.023675336994906215, + "grad_norm": 0.72265625, + "learning_rate": 4.811174185102194e-05, + "loss": 0.826, + "step": 540 + }, + { + "epoch": 0.02371918021156345, + "grad_norm": 0.80859375, + "learning_rate": 4.810780641888832e-05, + "loss": 0.8337, + "step": 541 + }, + { + "epoch": 0.023763023428220682, + "grad_norm": 0.86328125, + "learning_rate": 4.810387109673724e-05, + "loss": 0.8958, + "step": 542 + }, + { + "epoch": 0.023806866644877918, + "grad_norm": 0.9453125, + "learning_rate": 4.809993588457703e-05, + "loss": 0.8713, + "step": 543 + }, + { + "epoch": 0.02385070986153515, + "grad_norm": 0.8828125, + "learning_rate": 4.809600078241605e-05, + "loss": 0.7897, + "step": 544 + }, + { + "epoch": 0.023894553078192386, + "grad_norm": 0.8359375, + "learning_rate": 4.809206579026263e-05, + "loss": 0.7614, + "step": 545 + }, + { + "epoch": 0.023938396294849618, + "grad_norm": 0.765625, + "learning_rate": 4.808813090812506e-05, + "loss": 0.7784, + "step": 546 + }, + { + "epoch": 0.023982239511506853, + "grad_norm": 0.8359375, + "learning_rate": 4.8084196136011785e-05, + "loss": 0.8463, + "step": 547 + }, + { + "epoch": 0.024026082728164085, + "grad_norm": 0.7734375, + "learning_rate": 4.8080261473931076e-05, + "loss": 0.8731, + "step": 548 + }, + { + "epoch": 0.024069925944821317, + "grad_norm": 1.03125, + "learning_rate": 4.8076326921891277e-05, + "loss": 0.9147, + "step": 549 + }, + { + "epoch": 0.024113769161478553, + "grad_norm": 0.8828125, + "learning_rate": 4.807239247990075e-05, + "loss": 0.9259, + "step": 550 + }, + { + "epoch": 0.024157612378135785, + "grad_norm": 0.8359375, + "learning_rate": 4.806845814796778e-05, + "loss": 0.8488, + "step": 551 + }, + { + "epoch": 0.02420145559479302, + "grad_norm": 0.79296875, + "learning_rate": 4.806452392610078e-05, + "loss": 0.7041, + "step": 552 + }, + { + "epoch": 0.024245298811450253, + "grad_norm": 0.87890625, + "learning_rate": 4.8060589814308055e-05, + "loss": 0.9651, + "step": 553 + }, + { + "epoch": 0.024289142028107488, + "grad_norm": 1.0703125, + "learning_rate": 4.805665581259794e-05, + "loss": 0.9021, + "step": 554 + }, + { + "epoch": 0.02433298524476472, + "grad_norm": 0.875, + "learning_rate": 4.8052721920978773e-05, + "loss": 0.7331, + "step": 555 + }, + { + "epoch": 0.024376828461421956, + "grad_norm": 0.82421875, + "learning_rate": 4.804878813945886e-05, + "loss": 0.788, + "step": 556 + }, + { + "epoch": 0.024420671678079188, + "grad_norm": 0.8125, + "learning_rate": 4.8044854468046594e-05, + "loss": 0.8114, + "step": 557 + }, + { + "epoch": 0.024464514894736424, + "grad_norm": 0.76953125, + "learning_rate": 4.80409209067503e-05, + "loss": 0.9015, + "step": 558 + }, + { + "epoch": 0.024508358111393656, + "grad_norm": 0.8515625, + "learning_rate": 4.803698745557831e-05, + "loss": 0.915, + "step": 559 + }, + { + "epoch": 0.02455220132805089, + "grad_norm": 0.71875, + "learning_rate": 4.8033054114538946e-05, + "loss": 0.6753, + "step": 560 + }, + { + "epoch": 0.024596044544708123, + "grad_norm": 0.8359375, + "learning_rate": 4.802912088364056e-05, + "loss": 0.8059, + "step": 561 + }, + { + "epoch": 0.02463988776136536, + "grad_norm": 0.91796875, + "learning_rate": 4.802518776289143e-05, + "loss": 0.876, + "step": 562 + }, + { + "epoch": 0.02468373097802259, + "grad_norm": 0.8359375, + "learning_rate": 4.802125475229999e-05, + "loss": 0.7578, + "step": 563 + }, + { + "epoch": 0.024727574194679827, + "grad_norm": 0.78515625, + "learning_rate": 4.801732185187453e-05, + "loss": 1.001, + "step": 564 + }, + { + "epoch": 0.02477141741133706, + "grad_norm": 0.8359375, + "learning_rate": 4.8013389061623395e-05, + "loss": 0.9294, + "step": 565 + }, + { + "epoch": 0.02481526062799429, + "grad_norm": 0.81640625, + "learning_rate": 4.8009456381554896e-05, + "loss": 0.8932, + "step": 566 + }, + { + "epoch": 0.024859103844651526, + "grad_norm": 0.78125, + "learning_rate": 4.8005523811677354e-05, + "loss": 0.7996, + "step": 567 + }, + { + "epoch": 0.02490294706130876, + "grad_norm": 0.78515625, + "learning_rate": 4.8001591351999164e-05, + "loss": 0.9986, + "step": 568 + }, + { + "epoch": 0.024946790277965994, + "grad_norm": 0.9921875, + "learning_rate": 4.799765900252864e-05, + "loss": 0.8076, + "step": 569 + }, + { + "epoch": 0.024990633494623226, + "grad_norm": 0.91796875, + "learning_rate": 4.799372676327409e-05, + "loss": 0.9951, + "step": 570 + }, + { + "epoch": 0.02503447671128046, + "grad_norm": 0.88671875, + "learning_rate": 4.798979463424387e-05, + "loss": 0.7882, + "step": 571 + }, + { + "epoch": 0.025078319927937694, + "grad_norm": 0.73046875, + "learning_rate": 4.798586261544627e-05, + "loss": 0.6953, + "step": 572 + }, + { + "epoch": 0.02512216314459493, + "grad_norm": 0.8828125, + "learning_rate": 4.79819307068897e-05, + "loss": 0.8346, + "step": 573 + }, + { + "epoch": 0.02516600636125216, + "grad_norm": 0.80859375, + "learning_rate": 4.7977998908582456e-05, + "loss": 0.8285, + "step": 574 + }, + { + "epoch": 0.025209849577909397, + "grad_norm": 0.859375, + "learning_rate": 4.797406722053287e-05, + "loss": 0.8435, + "step": 575 + }, + { + "epoch": 0.02525369279456663, + "grad_norm": 0.86328125, + "learning_rate": 4.797013564274927e-05, + "loss": 0.784, + "step": 576 + }, + { + "epoch": 0.025297536011223865, + "grad_norm": 0.84765625, + "learning_rate": 4.796620417523996e-05, + "loss": 0.8603, + "step": 577 + }, + { + "epoch": 0.025341379227881097, + "grad_norm": 0.83203125, + "learning_rate": 4.7962272818013344e-05, + "loss": 0.8041, + "step": 578 + }, + { + "epoch": 0.025385222444538332, + "grad_norm": 0.8203125, + "learning_rate": 4.7958341571077714e-05, + "loss": 0.7641, + "step": 579 + }, + { + "epoch": 0.025429065661195564, + "grad_norm": 0.84765625, + "learning_rate": 4.795441043444141e-05, + "loss": 0.9052, + "step": 580 + }, + { + "epoch": 0.0254729088778528, + "grad_norm": 0.83984375, + "learning_rate": 4.795047940811275e-05, + "loss": 0.8473, + "step": 581 + }, + { + "epoch": 0.025516752094510032, + "grad_norm": 0.8125, + "learning_rate": 4.7946548492100085e-05, + "loss": 0.7956, + "step": 582 + }, + { + "epoch": 0.025560595311167267, + "grad_norm": 0.85546875, + "learning_rate": 4.7942617686411686e-05, + "loss": 0.7792, + "step": 583 + }, + { + "epoch": 0.0256044385278245, + "grad_norm": 0.82421875, + "learning_rate": 4.793868699105597e-05, + "loss": 0.7918, + "step": 584 + }, + { + "epoch": 0.02564828174448173, + "grad_norm": 0.84375, + "learning_rate": 4.7934756406041235e-05, + "loss": 0.8912, + "step": 585 + }, + { + "epoch": 0.025692124961138967, + "grad_norm": 0.97265625, + "learning_rate": 4.793082593137581e-05, + "loss": 0.7834, + "step": 586 + }, + { + "epoch": 0.0257359681777962, + "grad_norm": 0.88671875, + "learning_rate": 4.792689556706803e-05, + "loss": 0.8634, + "step": 587 + }, + { + "epoch": 0.025779811394453435, + "grad_norm": 0.921875, + "learning_rate": 4.792296531312621e-05, + "loss": 0.8586, + "step": 588 + }, + { + "epoch": 0.025823654611110667, + "grad_norm": 0.796875, + "learning_rate": 4.791903516955869e-05, + "loss": 0.8747, + "step": 589 + }, + { + "epoch": 0.025867497827767903, + "grad_norm": 0.88671875, + "learning_rate": 4.7915105136373794e-05, + "loss": 0.9171, + "step": 590 + }, + { + "epoch": 0.025911341044425135, + "grad_norm": 0.828125, + "learning_rate": 4.791117521357986e-05, + "loss": 0.8935, + "step": 591 + }, + { + "epoch": 0.02595518426108237, + "grad_norm": 0.76953125, + "learning_rate": 4.790724540118517e-05, + "loss": 0.7524, + "step": 592 + }, + { + "epoch": 0.025999027477739602, + "grad_norm": 0.703125, + "learning_rate": 4.7903315699198146e-05, + "loss": 0.6761, + "step": 593 + }, + { + "epoch": 0.026042870694396838, + "grad_norm": 0.8046875, + "learning_rate": 4.789938610762706e-05, + "loss": 0.8565, + "step": 594 + }, + { + "epoch": 0.02608671391105407, + "grad_norm": 0.86328125, + "learning_rate": 4.7895456626480254e-05, + "loss": 0.8783, + "step": 595 + }, + { + "epoch": 0.026130557127711306, + "grad_norm": 0.8515625, + "learning_rate": 4.789152725576603e-05, + "loss": 0.7662, + "step": 596 + }, + { + "epoch": 0.026174400344368538, + "grad_norm": 0.7890625, + "learning_rate": 4.7887597995492714e-05, + "loss": 0.8263, + "step": 597 + }, + { + "epoch": 0.026218243561025773, + "grad_norm": 0.8515625, + "learning_rate": 4.788366884566869e-05, + "loss": 0.7823, + "step": 598 + }, + { + "epoch": 0.026262086777683005, + "grad_norm": 0.90234375, + "learning_rate": 4.787973980630225e-05, + "loss": 0.8524, + "step": 599 + }, + { + "epoch": 0.02630592999434024, + "grad_norm": 0.7890625, + "learning_rate": 4.787581087740173e-05, + "loss": 0.8488, + "step": 600 + }, + { + "epoch": 0.026349773210997473, + "grad_norm": 0.77734375, + "learning_rate": 4.787188205897544e-05, + "loss": 0.6966, + "step": 601 + }, + { + "epoch": 0.02639361642765471, + "grad_norm": 0.796875, + "learning_rate": 4.786795335103168e-05, + "loss": 0.753, + "step": 602 + }, + { + "epoch": 0.02643745964431194, + "grad_norm": 0.91015625, + "learning_rate": 4.7864024753578854e-05, + "loss": 0.9302, + "step": 603 + }, + { + "epoch": 0.026481302860969173, + "grad_norm": 0.90234375, + "learning_rate": 4.7860096266625245e-05, + "loss": 0.9041, + "step": 604 + }, + { + "epoch": 0.026525146077626408, + "grad_norm": 0.89453125, + "learning_rate": 4.785616789017917e-05, + "loss": 0.9188, + "step": 605 + }, + { + "epoch": 0.02656898929428364, + "grad_norm": 0.90234375, + "learning_rate": 4.785223962424899e-05, + "loss": 0.7525, + "step": 606 + }, + { + "epoch": 0.026612832510940876, + "grad_norm": 1.2890625, + "learning_rate": 4.784831146884295e-05, + "loss": 0.9107, + "step": 607 + }, + { + "epoch": 0.026656675727598108, + "grad_norm": 0.90234375, + "learning_rate": 4.7844383423969486e-05, + "loss": 0.9015, + "step": 608 + }, + { + "epoch": 0.026700518944255344, + "grad_norm": 0.92578125, + "learning_rate": 4.784045548963686e-05, + "loss": 0.8011, + "step": 609 + }, + { + "epoch": 0.026744362160912576, + "grad_norm": 0.90234375, + "learning_rate": 4.7836527665853415e-05, + "loss": 0.7529, + "step": 610 + }, + { + "epoch": 0.02678820537756981, + "grad_norm": 0.91796875, + "learning_rate": 4.783259995262745e-05, + "loss": 0.8279, + "step": 611 + }, + { + "epoch": 0.026832048594227043, + "grad_norm": 1.171875, + "learning_rate": 4.78286723499673e-05, + "loss": 0.9113, + "step": 612 + }, + { + "epoch": 0.02687589181088428, + "grad_norm": 0.87890625, + "learning_rate": 4.782474485788131e-05, + "loss": 0.7899, + "step": 613 + }, + { + "epoch": 0.02691973502754151, + "grad_norm": 0.74609375, + "learning_rate": 4.78208174763778e-05, + "loss": 0.7538, + "step": 614 + }, + { + "epoch": 0.026963578244198746, + "grad_norm": 0.859375, + "learning_rate": 4.781689020546508e-05, + "loss": 0.7242, + "step": 615 + }, + { + "epoch": 0.02700742146085598, + "grad_norm": 0.90234375, + "learning_rate": 4.7812963045151494e-05, + "loss": 0.8517, + "step": 616 + }, + { + "epoch": 0.027051264677513214, + "grad_norm": 0.8203125, + "learning_rate": 4.7809035995445295e-05, + "loss": 0.8654, + "step": 617 + }, + { + "epoch": 0.027095107894170446, + "grad_norm": 0.96875, + "learning_rate": 4.78051090563549e-05, + "loss": 0.8258, + "step": 618 + }, + { + "epoch": 0.027138951110827682, + "grad_norm": 0.984375, + "learning_rate": 4.780118222788861e-05, + "loss": 0.8622, + "step": 619 + }, + { + "epoch": 0.027182794327484914, + "grad_norm": 0.81640625, + "learning_rate": 4.779725551005472e-05, + "loss": 0.8956, + "step": 620 + }, + { + "epoch": 0.027226637544142146, + "grad_norm": 0.87109375, + "learning_rate": 4.779332890286156e-05, + "loss": 0.8583, + "step": 621 + }, + { + "epoch": 0.02727048076079938, + "grad_norm": 0.7890625, + "learning_rate": 4.778940240631742e-05, + "loss": 0.9039, + "step": 622 + }, + { + "epoch": 0.027314323977456614, + "grad_norm": 0.77734375, + "learning_rate": 4.7785476020430685e-05, + "loss": 0.7984, + "step": 623 + }, + { + "epoch": 0.02735816719411385, + "grad_norm": 0.828125, + "learning_rate": 4.7781549745209664e-05, + "loss": 0.803, + "step": 624 + }, + { + "epoch": 0.02740201041077108, + "grad_norm": 1.1171875, + "learning_rate": 4.777762358066266e-05, + "loss": 0.8319, + "step": 625 + }, + { + "epoch": 0.027445853627428317, + "grad_norm": 0.81640625, + "learning_rate": 4.7773697526798e-05, + "loss": 0.8626, + "step": 626 + }, + { + "epoch": 0.02748969684408555, + "grad_norm": 0.875, + "learning_rate": 4.7769771583624e-05, + "loss": 0.9091, + "step": 627 + }, + { + "epoch": 0.027533540060742785, + "grad_norm": 0.84765625, + "learning_rate": 4.7765845751148986e-05, + "loss": 0.7989, + "step": 628 + }, + { + "epoch": 0.027577383277400017, + "grad_norm": 0.86328125, + "learning_rate": 4.776192002938127e-05, + "loss": 0.8376, + "step": 629 + }, + { + "epoch": 0.027621226494057252, + "grad_norm": 0.83203125, + "learning_rate": 4.775799441832918e-05, + "loss": 0.8567, + "step": 630 + }, + { + "epoch": 0.027665069710714484, + "grad_norm": 0.90234375, + "learning_rate": 4.7754068918001e-05, + "loss": 0.8572, + "step": 631 + }, + { + "epoch": 0.02770891292737172, + "grad_norm": 0.8125, + "learning_rate": 4.7750143528405126e-05, + "loss": 0.8056, + "step": 632 + }, + { + "epoch": 0.027752756144028952, + "grad_norm": 0.77734375, + "learning_rate": 4.7746218249549834e-05, + "loss": 0.8574, + "step": 633 + }, + { + "epoch": 0.027796599360686187, + "grad_norm": 1.0546875, + "learning_rate": 4.774229308144344e-05, + "loss": 0.8485, + "step": 634 + }, + { + "epoch": 0.02784044257734342, + "grad_norm": 0.84375, + "learning_rate": 4.773836802409427e-05, + "loss": 0.7581, + "step": 635 + }, + { + "epoch": 0.027884285794000655, + "grad_norm": 0.90625, + "learning_rate": 4.773444307751065e-05, + "loss": 0.9065, + "step": 636 + }, + { + "epoch": 0.027928129010657887, + "grad_norm": 0.859375, + "learning_rate": 4.773051824170085e-05, + "loss": 0.7655, + "step": 637 + }, + { + "epoch": 0.027971972227315123, + "grad_norm": 0.828125, + "learning_rate": 4.772659351667326e-05, + "loss": 0.809, + "step": 638 + }, + { + "epoch": 0.028015815443972355, + "grad_norm": 0.7890625, + "learning_rate": 4.7722668902436175e-05, + "loss": 0.6503, + "step": 639 + }, + { + "epoch": 0.028059658660629587, + "grad_norm": 0.77734375, + "learning_rate": 4.7718744398997893e-05, + "loss": 0.8085, + "step": 640 + }, + { + "epoch": 0.028103501877286823, + "grad_norm": 0.83203125, + "learning_rate": 4.7714820006366754e-05, + "loss": 0.8358, + "step": 641 + }, + { + "epoch": 0.028147345093944055, + "grad_norm": 0.7578125, + "learning_rate": 4.771089572455103e-05, + "loss": 0.8141, + "step": 642 + }, + { + "epoch": 0.02819118831060129, + "grad_norm": 0.8515625, + "learning_rate": 4.770697155355911e-05, + "loss": 0.8292, + "step": 643 + }, + { + "epoch": 0.028235031527258522, + "grad_norm": 0.734375, + "learning_rate": 4.7703047493399276e-05, + "loss": 0.7758, + "step": 644 + }, + { + "epoch": 0.028278874743915758, + "grad_norm": 0.828125, + "learning_rate": 4.769912354407984e-05, + "loss": 0.8289, + "step": 645 + }, + { + "epoch": 0.02832271796057299, + "grad_norm": 1.046875, + "learning_rate": 4.7695199705609115e-05, + "loss": 0.8944, + "step": 646 + }, + { + "epoch": 0.028366561177230225, + "grad_norm": 0.8203125, + "learning_rate": 4.7691275977995395e-05, + "loss": 0.7019, + "step": 647 + }, + { + "epoch": 0.028410404393887458, + "grad_norm": 0.87890625, + "learning_rate": 4.768735236124707e-05, + "loss": 0.8262, + "step": 648 + }, + { + "epoch": 0.028454247610544693, + "grad_norm": 0.78515625, + "learning_rate": 4.76834288553724e-05, + "loss": 0.8246, + "step": 649 + }, + { + "epoch": 0.028498090827201925, + "grad_norm": 0.7265625, + "learning_rate": 4.7679505460379724e-05, + "loss": 0.7194, + "step": 650 + }, + { + "epoch": 0.02854193404385916, + "grad_norm": 0.86328125, + "learning_rate": 4.767558217627733e-05, + "loss": 0.8668, + "step": 651 + }, + { + "epoch": 0.028585777260516393, + "grad_norm": 0.84765625, + "learning_rate": 4.767165900307353e-05, + "loss": 0.9918, + "step": 652 + }, + { + "epoch": 0.02862962047717363, + "grad_norm": 0.8359375, + "learning_rate": 4.766773594077668e-05, + "loss": 0.8602, + "step": 653 + }, + { + "epoch": 0.02867346369383086, + "grad_norm": 0.8359375, + "learning_rate": 4.7663812989395085e-05, + "loss": 0.9848, + "step": 654 + }, + { + "epoch": 0.028717306910488096, + "grad_norm": 0.828125, + "learning_rate": 4.7659890148937036e-05, + "loss": 0.8453, + "step": 655 + }, + { + "epoch": 0.028761150127145328, + "grad_norm": 0.8359375, + "learning_rate": 4.765596741941086e-05, + "loss": 0.7982, + "step": 656 + }, + { + "epoch": 0.028804993343802564, + "grad_norm": 0.765625, + "learning_rate": 4.765204480082484e-05, + "loss": 0.7055, + "step": 657 + }, + { + "epoch": 0.028848836560459796, + "grad_norm": 0.890625, + "learning_rate": 4.7648122293187345e-05, + "loss": 0.818, + "step": 658 + }, + { + "epoch": 0.028892679777117028, + "grad_norm": 0.796875, + "learning_rate": 4.764419989650667e-05, + "loss": 0.7841, + "step": 659 + }, + { + "epoch": 0.028936522993774264, + "grad_norm": 0.796875, + "learning_rate": 4.764027761079112e-05, + "loss": 0.86, + "step": 660 + }, + { + "epoch": 0.028980366210431496, + "grad_norm": 0.95703125, + "learning_rate": 4.7636355436049004e-05, + "loss": 0.8789, + "step": 661 + }, + { + "epoch": 0.02902420942708873, + "grad_norm": 0.85546875, + "learning_rate": 4.763243337228861e-05, + "loss": 0.8111, + "step": 662 + }, + { + "epoch": 0.029068052643745963, + "grad_norm": 0.890625, + "learning_rate": 4.762851141951832e-05, + "loss": 0.8154, + "step": 663 + }, + { + "epoch": 0.0291118958604032, + "grad_norm": 0.8125, + "learning_rate": 4.7624589577746396e-05, + "loss": 0.8313, + "step": 664 + }, + { + "epoch": 0.02915573907706043, + "grad_norm": 0.83203125, + "learning_rate": 4.762066784698116e-05, + "loss": 0.8571, + "step": 665 + }, + { + "epoch": 0.029199582293717666, + "grad_norm": 0.8515625, + "learning_rate": 4.7616746227230934e-05, + "loss": 0.7795, + "step": 666 + }, + { + "epoch": 0.0292434255103749, + "grad_norm": 0.80078125, + "learning_rate": 4.761282471850401e-05, + "loss": 0.8826, + "step": 667 + }, + { + "epoch": 0.029287268727032134, + "grad_norm": 0.9296875, + "learning_rate": 4.7608903320808686e-05, + "loss": 0.8016, + "step": 668 + }, + { + "epoch": 0.029331111943689366, + "grad_norm": 0.890625, + "learning_rate": 4.7604982034153324e-05, + "loss": 0.7778, + "step": 669 + }, + { + "epoch": 0.029374955160346602, + "grad_norm": 0.80078125, + "learning_rate": 4.760106085854621e-05, + "loss": 0.7773, + "step": 670 + }, + { + "epoch": 0.029418798377003834, + "grad_norm": 0.8515625, + "learning_rate": 4.7597139793995646e-05, + "loss": 0.7151, + "step": 671 + }, + { + "epoch": 0.02946264159366107, + "grad_norm": 0.70703125, + "learning_rate": 4.7593218840509966e-05, + "loss": 0.8104, + "step": 672 + }, + { + "epoch": 0.0295064848103183, + "grad_norm": 0.77734375, + "learning_rate": 4.7589297998097445e-05, + "loss": 0.8522, + "step": 673 + }, + { + "epoch": 0.029550328026975537, + "grad_norm": 0.828125, + "learning_rate": 4.758537726676643e-05, + "loss": 0.8953, + "step": 674 + }, + { + "epoch": 0.02959417124363277, + "grad_norm": 0.859375, + "learning_rate": 4.75814566465252e-05, + "loss": 0.8277, + "step": 675 + }, + { + "epoch": 0.02963801446029, + "grad_norm": 0.796875, + "learning_rate": 4.757753613738207e-05, + "loss": 0.7697, + "step": 676 + }, + { + "epoch": 0.029681857676947237, + "grad_norm": 0.91015625, + "learning_rate": 4.7573615739345335e-05, + "loss": 0.8374, + "step": 677 + }, + { + "epoch": 0.02972570089360447, + "grad_norm": 0.83984375, + "learning_rate": 4.756969545242335e-05, + "loss": 0.8614, + "step": 678 + }, + { + "epoch": 0.029769544110261704, + "grad_norm": 0.859375, + "learning_rate": 4.75657752766244e-05, + "loss": 0.815, + "step": 679 + }, + { + "epoch": 0.029813387326918937, + "grad_norm": 1.109375, + "learning_rate": 4.7561855211956795e-05, + "loss": 0.8955, + "step": 680 + }, + { + "epoch": 0.029857230543576172, + "grad_norm": 0.90234375, + "learning_rate": 4.755793525842884e-05, + "loss": 0.8466, + "step": 681 + }, + { + "epoch": 0.029901073760233404, + "grad_norm": 0.91015625, + "learning_rate": 4.7554015416048814e-05, + "loss": 0.8773, + "step": 682 + }, + { + "epoch": 0.02994491697689064, + "grad_norm": 0.8828125, + "learning_rate": 4.755009568482508e-05, + "loss": 0.8959, + "step": 683 + }, + { + "epoch": 0.029988760193547872, + "grad_norm": 0.828125, + "learning_rate": 4.7546176064765925e-05, + "loss": 0.8446, + "step": 684 + }, + { + "epoch": 0.030032603410205107, + "grad_norm": 0.81640625, + "learning_rate": 4.754225655587965e-05, + "loss": 0.7959, + "step": 685 + }, + { + "epoch": 0.03007644662686234, + "grad_norm": 0.77734375, + "learning_rate": 4.753833715817457e-05, + "loss": 0.7847, + "step": 686 + }, + { + "epoch": 0.030120289843519575, + "grad_norm": 0.8359375, + "learning_rate": 4.753441787165894e-05, + "loss": 0.8411, + "step": 687 + }, + { + "epoch": 0.030164133060176807, + "grad_norm": 0.7890625, + "learning_rate": 4.7530498696341154e-05, + "loss": 0.8054, + "step": 688 + }, + { + "epoch": 0.030207976276834043, + "grad_norm": 0.80859375, + "learning_rate": 4.752657963222949e-05, + "loss": 0.8435, + "step": 689 + }, + { + "epoch": 0.030251819493491275, + "grad_norm": 0.890625, + "learning_rate": 4.752266067933222e-05, + "loss": 1.0359, + "step": 690 + }, + { + "epoch": 0.03029566271014851, + "grad_norm": 0.83203125, + "learning_rate": 4.751874183765768e-05, + "loss": 0.7988, + "step": 691 + }, + { + "epoch": 0.030339505926805743, + "grad_norm": 0.88671875, + "learning_rate": 4.751482310721412e-05, + "loss": 0.7836, + "step": 692 + }, + { + "epoch": 0.030383349143462978, + "grad_norm": 0.7734375, + "learning_rate": 4.7510904488009945e-05, + "loss": 0.7849, + "step": 693 + }, + { + "epoch": 0.03042719236012021, + "grad_norm": 0.91796875, + "learning_rate": 4.750698598005341e-05, + "loss": 0.8821, + "step": 694 + }, + { + "epoch": 0.030471035576777442, + "grad_norm": 0.734375, + "learning_rate": 4.75030675833528e-05, + "loss": 0.672, + "step": 695 + }, + { + "epoch": 0.030514878793434678, + "grad_norm": 0.79296875, + "learning_rate": 4.749914929791646e-05, + "loss": 0.7745, + "step": 696 + }, + { + "epoch": 0.03055872201009191, + "grad_norm": 0.8046875, + "learning_rate": 4.7495231123752615e-05, + "loss": 0.7778, + "step": 697 + }, + { + "epoch": 0.030602565226749145, + "grad_norm": 0.875, + "learning_rate": 4.749131306086967e-05, + "loss": 0.7072, + "step": 698 + }, + { + "epoch": 0.030646408443406378, + "grad_norm": 0.8671875, + "learning_rate": 4.748739510927589e-05, + "loss": 0.9029, + "step": 699 + }, + { + "epoch": 0.030690251660063613, + "grad_norm": 0.78125, + "learning_rate": 4.748347726897957e-05, + "loss": 0.8251, + "step": 700 + }, + { + "epoch": 0.030734094876720845, + "grad_norm": 0.8359375, + "learning_rate": 4.747955953998901e-05, + "loss": 0.8614, + "step": 701 + }, + { + "epoch": 0.03077793809337808, + "grad_norm": 0.9296875, + "learning_rate": 4.7475641922312494e-05, + "loss": 0.7224, + "step": 702 + }, + { + "epoch": 0.030821781310035313, + "grad_norm": 0.96875, + "learning_rate": 4.74717244159584e-05, + "loss": 0.7608, + "step": 703 + }, + { + "epoch": 0.03086562452669255, + "grad_norm": 0.89453125, + "learning_rate": 4.7467807020934974e-05, + "loss": 0.8879, + "step": 704 + }, + { + "epoch": 0.03090946774334978, + "grad_norm": 0.921875, + "learning_rate": 4.746388973725052e-05, + "loss": 0.9805, + "step": 705 + }, + { + "epoch": 0.030953310960007016, + "grad_norm": 0.83984375, + "learning_rate": 4.745997256491337e-05, + "loss": 0.8575, + "step": 706 + }, + { + "epoch": 0.030997154176664248, + "grad_norm": 0.75, + "learning_rate": 4.745605550393179e-05, + "loss": 0.7212, + "step": 707 + }, + { + "epoch": 0.031040997393321484, + "grad_norm": 0.86328125, + "learning_rate": 4.7452138554314065e-05, + "loss": 0.7996, + "step": 708 + }, + { + "epoch": 0.031084840609978716, + "grad_norm": 0.890625, + "learning_rate": 4.744822171606855e-05, + "loss": 0.9469, + "step": 709 + }, + { + "epoch": 0.03112868382663595, + "grad_norm": 0.8046875, + "learning_rate": 4.7444304989203545e-05, + "loss": 0.9093, + "step": 710 + }, + { + "epoch": 0.031172527043293183, + "grad_norm": 0.88671875, + "learning_rate": 4.744038837372733e-05, + "loss": 0.8165, + "step": 711 + }, + { + "epoch": 0.031216370259950416, + "grad_norm": 0.7421875, + "learning_rate": 4.7436471869648205e-05, + "loss": 0.9398, + "step": 712 + }, + { + "epoch": 0.03126021347660765, + "grad_norm": 0.796875, + "learning_rate": 4.743255547697447e-05, + "loss": 0.7487, + "step": 713 + }, + { + "epoch": 0.03130405669326489, + "grad_norm": 0.796875, + "learning_rate": 4.742863919571443e-05, + "loss": 0.8652, + "step": 714 + }, + { + "epoch": 0.031347899909922115, + "grad_norm": 0.91015625, + "learning_rate": 4.7424723025876385e-05, + "loss": 0.781, + "step": 715 + }, + { + "epoch": 0.03139174312657935, + "grad_norm": 0.88671875, + "learning_rate": 4.742080696746859e-05, + "loss": 0.7513, + "step": 716 + }, + { + "epoch": 0.031435586343236586, + "grad_norm": 0.8671875, + "learning_rate": 4.741689102049944e-05, + "loss": 0.8041, + "step": 717 + }, + { + "epoch": 0.03147942955989382, + "grad_norm": 0.8046875, + "learning_rate": 4.741297518497718e-05, + "loss": 0.7829, + "step": 718 + }, + { + "epoch": 0.03152327277655105, + "grad_norm": 0.953125, + "learning_rate": 4.740905946091011e-05, + "loss": 0.864, + "step": 719 + }, + { + "epoch": 0.031567115993208286, + "grad_norm": 0.85546875, + "learning_rate": 4.7405143848306535e-05, + "loss": 0.7503, + "step": 720 + }, + { + "epoch": 0.03161095920986552, + "grad_norm": 0.859375, + "learning_rate": 4.740122834717474e-05, + "loss": 0.7768, + "step": 721 + }, + { + "epoch": 0.03165480242652276, + "grad_norm": 0.8203125, + "learning_rate": 4.7397312957523056e-05, + "loss": 0.8008, + "step": 722 + }, + { + "epoch": 0.031698645643179986, + "grad_norm": 0.83984375, + "learning_rate": 4.739339767935971e-05, + "loss": 0.8481, + "step": 723 + }, + { + "epoch": 0.03174248885983722, + "grad_norm": 0.828125, + "learning_rate": 4.7389482512693087e-05, + "loss": 0.8452, + "step": 724 + }, + { + "epoch": 0.03178633207649446, + "grad_norm": 0.84765625, + "learning_rate": 4.738556745753145e-05, + "loss": 0.8725, + "step": 725 + }, + { + "epoch": 0.03183017529315169, + "grad_norm": 0.9375, + "learning_rate": 4.73816525138831e-05, + "loss": 0.8429, + "step": 726 + }, + { + "epoch": 0.03187401850980892, + "grad_norm": 0.74609375, + "learning_rate": 4.737773768175633e-05, + "loss": 0.9292, + "step": 727 + }, + { + "epoch": 0.03191786172646616, + "grad_norm": 0.84765625, + "learning_rate": 4.73738229611594e-05, + "loss": 0.793, + "step": 728 + }, + { + "epoch": 0.03196170494312339, + "grad_norm": 0.92578125, + "learning_rate": 4.736990835210068e-05, + "loss": 0.8233, + "step": 729 + }, + { + "epoch": 0.03200554815978063, + "grad_norm": 1.0234375, + "learning_rate": 4.736599385458843e-05, + "loss": 0.8394, + "step": 730 + }, + { + "epoch": 0.03204939137643786, + "grad_norm": 0.76171875, + "learning_rate": 4.736207946863095e-05, + "loss": 0.8784, + "step": 731 + }, + { + "epoch": 0.03209323459309509, + "grad_norm": 0.81640625, + "learning_rate": 4.735816519423653e-05, + "loss": 0.8092, + "step": 732 + }, + { + "epoch": 0.03213707780975233, + "grad_norm": 0.8984375, + "learning_rate": 4.735425103141343e-05, + "loss": 0.8561, + "step": 733 + }, + { + "epoch": 0.032180921026409556, + "grad_norm": 0.88671875, + "learning_rate": 4.735033698017003e-05, + "loss": 0.87, + "step": 734 + }, + { + "epoch": 0.03222476424306679, + "grad_norm": 0.7734375, + "learning_rate": 4.7346423040514576e-05, + "loss": 0.8069, + "step": 735 + }, + { + "epoch": 0.03226860745972403, + "grad_norm": 0.83203125, + "learning_rate": 4.734250921245538e-05, + "loss": 0.7707, + "step": 736 + }, + { + "epoch": 0.03231245067638126, + "grad_norm": 0.84765625, + "learning_rate": 4.7338595496000716e-05, + "loss": 0.8856, + "step": 737 + }, + { + "epoch": 0.03235629389303849, + "grad_norm": 0.859375, + "learning_rate": 4.733468189115885e-05, + "loss": 0.8596, + "step": 738 + }, + { + "epoch": 0.03240013710969573, + "grad_norm": 0.7421875, + "learning_rate": 4.733076839793816e-05, + "loss": 0.7178, + "step": 739 + }, + { + "epoch": 0.03244398032635296, + "grad_norm": 0.82421875, + "learning_rate": 4.7326855016346895e-05, + "loss": 0.8122, + "step": 740 + }, + { + "epoch": 0.0324878235430102, + "grad_norm": 0.9296875, + "learning_rate": 4.7322941746393344e-05, + "loss": 0.8348, + "step": 741 + }, + { + "epoch": 0.03253166675966743, + "grad_norm": 0.7890625, + "learning_rate": 4.731902858808581e-05, + "loss": 0.839, + "step": 742 + }, + { + "epoch": 0.03257550997632466, + "grad_norm": 0.88671875, + "learning_rate": 4.7315115541432545e-05, + "loss": 0.8711, + "step": 743 + }, + { + "epoch": 0.0326193531929819, + "grad_norm": 0.81640625, + "learning_rate": 4.731120260644192e-05, + "loss": 0.7727, + "step": 744 + }, + { + "epoch": 0.032663196409639134, + "grad_norm": 0.87890625, + "learning_rate": 4.730728978312219e-05, + "loss": 0.9262, + "step": 745 + }, + { + "epoch": 0.03270703962629636, + "grad_norm": 1.109375, + "learning_rate": 4.730337707148165e-05, + "loss": 0.9279, + "step": 746 + }, + { + "epoch": 0.0327508828429536, + "grad_norm": 1.203125, + "learning_rate": 4.729946447152859e-05, + "loss": 0.9276, + "step": 747 + }, + { + "epoch": 0.03279472605961083, + "grad_norm": 0.828125, + "learning_rate": 4.729555198327126e-05, + "loss": 0.8873, + "step": 748 + }, + { + "epoch": 0.03283856927626806, + "grad_norm": 0.8359375, + "learning_rate": 4.729163960671804e-05, + "loss": 0.7831, + "step": 749 + }, + { + "epoch": 0.0328824124929253, + "grad_norm": 0.92578125, + "learning_rate": 4.728772734187717e-05, + "loss": 0.9304, + "step": 750 + }, + { + "epoch": 0.03292625570958253, + "grad_norm": 0.80859375, + "learning_rate": 4.728381518875695e-05, + "loss": 0.8165, + "step": 751 + }, + { + "epoch": 0.03297009892623977, + "grad_norm": 0.80859375, + "learning_rate": 4.727990314736568e-05, + "loss": 0.8079, + "step": 752 + }, + { + "epoch": 0.033013942142897, + "grad_norm": 0.76171875, + "learning_rate": 4.727599121771163e-05, + "loss": 0.7802, + "step": 753 + }, + { + "epoch": 0.03305778535955423, + "grad_norm": 0.85546875, + "learning_rate": 4.727207939980307e-05, + "loss": 0.7885, + "step": 754 + }, + { + "epoch": 0.03310162857621147, + "grad_norm": 0.921875, + "learning_rate": 4.726816769364837e-05, + "loss": 0.9366, + "step": 755 + }, + { + "epoch": 0.033145471792868704, + "grad_norm": 0.9453125, + "learning_rate": 4.726425609925577e-05, + "loss": 0.8879, + "step": 756 + }, + { + "epoch": 0.03318931500952593, + "grad_norm": 0.875, + "learning_rate": 4.7260344616633554e-05, + "loss": 0.777, + "step": 757 + }, + { + "epoch": 0.03323315822618317, + "grad_norm": 0.81640625, + "learning_rate": 4.7256433245790036e-05, + "loss": 0.9467, + "step": 758 + }, + { + "epoch": 0.033277001442840404, + "grad_norm": 0.8125, + "learning_rate": 4.725252198673348e-05, + "loss": 0.8548, + "step": 759 + }, + { + "epoch": 0.03332084465949764, + "grad_norm": 0.88671875, + "learning_rate": 4.7248610839472197e-05, + "loss": 0.8474, + "step": 760 + }, + { + "epoch": 0.03336468787615487, + "grad_norm": 0.94921875, + "learning_rate": 4.724469980401447e-05, + "loss": 0.8174, + "step": 761 + }, + { + "epoch": 0.033408531092812103, + "grad_norm": 0.91796875, + "learning_rate": 4.7240788880368583e-05, + "loss": 0.8732, + "step": 762 + }, + { + "epoch": 0.03345237430946934, + "grad_norm": 1.0625, + "learning_rate": 4.723687806854279e-05, + "loss": 1.0193, + "step": 763 + }, + { + "epoch": 0.033496217526126575, + "grad_norm": 0.83984375, + "learning_rate": 4.7232967368545455e-05, + "loss": 0.8612, + "step": 764 + }, + { + "epoch": 0.0335400607427838, + "grad_norm": 0.8515625, + "learning_rate": 4.722905678038484e-05, + "loss": 0.8594, + "step": 765 + }, + { + "epoch": 0.03358390395944104, + "grad_norm": 0.87890625, + "learning_rate": 4.722514630406921e-05, + "loss": 0.8739, + "step": 766 + }, + { + "epoch": 0.033627747176098274, + "grad_norm": 0.75390625, + "learning_rate": 4.722123593960687e-05, + "loss": 0.7648, + "step": 767 + }, + { + "epoch": 0.0336715903927555, + "grad_norm": 0.78515625, + "learning_rate": 4.721732568700606e-05, + "loss": 0.7604, + "step": 768 + }, + { + "epoch": 0.03371543360941274, + "grad_norm": 0.77734375, + "learning_rate": 4.721341554627516e-05, + "loss": 0.8262, + "step": 769 + }, + { + "epoch": 0.033759276826069974, + "grad_norm": 0.93359375, + "learning_rate": 4.720950551742241e-05, + "loss": 0.811, + "step": 770 + }, + { + "epoch": 0.03380312004272721, + "grad_norm": 0.83984375, + "learning_rate": 4.720559560045609e-05, + "loss": 0.8664, + "step": 771 + }, + { + "epoch": 0.03384696325938444, + "grad_norm": 0.90234375, + "learning_rate": 4.72016857953845e-05, + "loss": 0.8078, + "step": 772 + }, + { + "epoch": 0.033890806476041674, + "grad_norm": 0.875, + "learning_rate": 4.719777610221586e-05, + "loss": 0.9663, + "step": 773 + }, + { + "epoch": 0.03393464969269891, + "grad_norm": 0.86328125, + "learning_rate": 4.719386652095858e-05, + "loss": 0.9045, + "step": 774 + }, + { + "epoch": 0.033978492909356145, + "grad_norm": 0.93359375, + "learning_rate": 4.718995705162087e-05, + "loss": 0.8283, + "step": 775 + }, + { + "epoch": 0.034022336126013374, + "grad_norm": 0.84765625, + "learning_rate": 4.718604769421102e-05, + "loss": 0.8934, + "step": 776 + }, + { + "epoch": 0.03406617934267061, + "grad_norm": 0.80078125, + "learning_rate": 4.7182138448737334e-05, + "loss": 0.868, + "step": 777 + }, + { + "epoch": 0.034110022559327845, + "grad_norm": 0.8671875, + "learning_rate": 4.7178229315208045e-05, + "loss": 0.7925, + "step": 778 + }, + { + "epoch": 0.03415386577598508, + "grad_norm": 0.79296875, + "learning_rate": 4.7174320293631515e-05, + "loss": 0.7447, + "step": 779 + }, + { + "epoch": 0.03419770899264231, + "grad_norm": 0.875, + "learning_rate": 4.7170411384016e-05, + "loss": 0.855, + "step": 780 + }, + { + "epoch": 0.034241552209299544, + "grad_norm": 0.91015625, + "learning_rate": 4.7166502586369774e-05, + "loss": 0.839, + "step": 781 + }, + { + "epoch": 0.03428539542595678, + "grad_norm": 0.87109375, + "learning_rate": 4.7162593900701126e-05, + "loss": 0.8769, + "step": 782 + }, + { + "epoch": 0.034329238642614016, + "grad_norm": 0.81640625, + "learning_rate": 4.7158685327018305e-05, + "loss": 0.8242, + "step": 783 + }, + { + "epoch": 0.034373081859271244, + "grad_norm": 0.8515625, + "learning_rate": 4.7154776865329664e-05, + "loss": 0.8347, + "step": 784 + }, + { + "epoch": 0.03441692507592848, + "grad_norm": 0.8515625, + "learning_rate": 4.715086851564345e-05, + "loss": 0.8775, + "step": 785 + }, + { + "epoch": 0.034460768292585715, + "grad_norm": 0.75, + "learning_rate": 4.7146960277967955e-05, + "loss": 0.8233, + "step": 786 + }, + { + "epoch": 0.034504611509242944, + "grad_norm": 0.8515625, + "learning_rate": 4.714305215231146e-05, + "loss": 0.9547, + "step": 787 + }, + { + "epoch": 0.03454845472590018, + "grad_norm": 0.83203125, + "learning_rate": 4.71391441386822e-05, + "loss": 0.8473, + "step": 788 + }, + { + "epoch": 0.034592297942557415, + "grad_norm": 0.90234375, + "learning_rate": 4.713523623708854e-05, + "loss": 0.8454, + "step": 789 + }, + { + "epoch": 0.03463614115921465, + "grad_norm": 0.76953125, + "learning_rate": 4.713132844753874e-05, + "loss": 0.7942, + "step": 790 + }, + { + "epoch": 0.03467998437587188, + "grad_norm": 0.859375, + "learning_rate": 4.7127420770041054e-05, + "loss": 0.8809, + "step": 791 + }, + { + "epoch": 0.034723827592529115, + "grad_norm": 0.8203125, + "learning_rate": 4.7123513204603776e-05, + "loss": 0.7815, + "step": 792 + }, + { + "epoch": 0.03476767080918635, + "grad_norm": 0.86328125, + "learning_rate": 4.711960575123515e-05, + "loss": 0.8628, + "step": 793 + }, + { + "epoch": 0.034811514025843586, + "grad_norm": 0.9140625, + "learning_rate": 4.7115698409943545e-05, + "loss": 0.7713, + "step": 794 + }, + { + "epoch": 0.034855357242500815, + "grad_norm": 0.80078125, + "learning_rate": 4.7111791180737194e-05, + "loss": 0.8158, + "step": 795 + }, + { + "epoch": 0.03489920045915805, + "grad_norm": 1.0546875, + "learning_rate": 4.710788406362437e-05, + "loss": 0.9086, + "step": 796 + }, + { + "epoch": 0.034943043675815286, + "grad_norm": 0.8046875, + "learning_rate": 4.710397705861337e-05, + "loss": 0.7294, + "step": 797 + }, + { + "epoch": 0.03498688689247252, + "grad_norm": 0.93359375, + "learning_rate": 4.710007016571246e-05, + "loss": 0.9265, + "step": 798 + }, + { + "epoch": 0.03503073010912975, + "grad_norm": 0.93359375, + "learning_rate": 4.709616338492994e-05, + "loss": 0.8503, + "step": 799 + }, + { + "epoch": 0.035074573325786985, + "grad_norm": 1.703125, + "learning_rate": 4.709225671627406e-05, + "loss": 0.8986, + "step": 800 + }, + { + "epoch": 0.03511841654244422, + "grad_norm": 0.81640625, + "learning_rate": 4.7088350159753136e-05, + "loss": 0.7499, + "step": 801 + }, + { + "epoch": 0.03516225975910146, + "grad_norm": 0.828125, + "learning_rate": 4.708444371537538e-05, + "loss": 0.8526, + "step": 802 + }, + { + "epoch": 0.035206102975758685, + "grad_norm": 0.859375, + "learning_rate": 4.708053738314917e-05, + "loss": 0.983, + "step": 803 + }, + { + "epoch": 0.03524994619241592, + "grad_norm": 0.76171875, + "learning_rate": 4.707663116308273e-05, + "loss": 0.7617, + "step": 804 + }, + { + "epoch": 0.035293789409073156, + "grad_norm": 0.7734375, + "learning_rate": 4.707272505518434e-05, + "loss": 0.8013, + "step": 805 + }, + { + "epoch": 0.035337632625730385, + "grad_norm": 0.81640625, + "learning_rate": 4.7068819059462287e-05, + "loss": 0.8657, + "step": 806 + }, + { + "epoch": 0.03538147584238762, + "grad_norm": 0.84375, + "learning_rate": 4.706491317592485e-05, + "loss": 0.9333, + "step": 807 + }, + { + "epoch": 0.035425319059044856, + "grad_norm": 0.84765625, + "learning_rate": 4.706100740458026e-05, + "loss": 0.8138, + "step": 808 + }, + { + "epoch": 0.03546916227570209, + "grad_norm": 0.765625, + "learning_rate": 4.7057101745436885e-05, + "loss": 0.8388, + "step": 809 + }, + { + "epoch": 0.03551300549235932, + "grad_norm": 0.859375, + "learning_rate": 4.7053196198502945e-05, + "loss": 0.9826, + "step": 810 + }, + { + "epoch": 0.035556848709016556, + "grad_norm": 0.86328125, + "learning_rate": 4.704929076378675e-05, + "loss": 0.7396, + "step": 811 + }, + { + "epoch": 0.03560069192567379, + "grad_norm": 0.890625, + "learning_rate": 4.704538544129654e-05, + "loss": 0.7871, + "step": 812 + }, + { + "epoch": 0.03564453514233103, + "grad_norm": 0.83203125, + "learning_rate": 4.704148023104057e-05, + "loss": 0.878, + "step": 813 + }, + { + "epoch": 0.035688378358988256, + "grad_norm": 0.8515625, + "learning_rate": 4.70375751330272e-05, + "loss": 0.9219, + "step": 814 + }, + { + "epoch": 0.03573222157564549, + "grad_norm": 0.875, + "learning_rate": 4.703367014726466e-05, + "loss": 0.892, + "step": 815 + }, + { + "epoch": 0.03577606479230273, + "grad_norm": 0.81640625, + "learning_rate": 4.702976527376123e-05, + "loss": 0.8088, + "step": 816 + }, + { + "epoch": 0.03581990800895996, + "grad_norm": 0.80859375, + "learning_rate": 4.702586051252518e-05, + "loss": 0.7908, + "step": 817 + }, + { + "epoch": 0.03586375122561719, + "grad_norm": 0.7890625, + "learning_rate": 4.702195586356476e-05, + "loss": 0.7534, + "step": 818 + }, + { + "epoch": 0.035907594442274426, + "grad_norm": 0.86328125, + "learning_rate": 4.701805132688831e-05, + "loss": 0.9596, + "step": 819 + }, + { + "epoch": 0.03595143765893166, + "grad_norm": 0.921875, + "learning_rate": 4.701414690250408e-05, + "loss": 0.6849, + "step": 820 + }, + { + "epoch": 0.0359952808755889, + "grad_norm": 0.828125, + "learning_rate": 4.7010242590420326e-05, + "loss": 0.7761, + "step": 821 + }, + { + "epoch": 0.036039124092246126, + "grad_norm": 1.03125, + "learning_rate": 4.7006338390645336e-05, + "loss": 1.0531, + "step": 822 + }, + { + "epoch": 0.03608296730890336, + "grad_norm": 1.203125, + "learning_rate": 4.700243430318735e-05, + "loss": 0.834, + "step": 823 + }, + { + "epoch": 0.0361268105255606, + "grad_norm": 0.81640625, + "learning_rate": 4.699853032805471e-05, + "loss": 1.032, + "step": 824 + }, + { + "epoch": 0.036170653742217826, + "grad_norm": 0.83203125, + "learning_rate": 4.699462646525565e-05, + "loss": 0.7754, + "step": 825 + }, + { + "epoch": 0.03621449695887506, + "grad_norm": 0.84765625, + "learning_rate": 4.6990722714798464e-05, + "loss": 0.8472, + "step": 826 + }, + { + "epoch": 0.0362583401755323, + "grad_norm": 0.8671875, + "learning_rate": 4.69868190766914e-05, + "loss": 0.9419, + "step": 827 + }, + { + "epoch": 0.03630218339218953, + "grad_norm": 0.81640625, + "learning_rate": 4.698291555094271e-05, + "loss": 0.8693, + "step": 828 + }, + { + "epoch": 0.03634602660884676, + "grad_norm": 0.875, + "learning_rate": 4.697901213756073e-05, + "loss": 1.0556, + "step": 829 + }, + { + "epoch": 0.036389869825504, + "grad_norm": 0.9140625, + "learning_rate": 4.697510883655372e-05, + "loss": 0.7706, + "step": 830 + }, + { + "epoch": 0.03643371304216123, + "grad_norm": 0.765625, + "learning_rate": 4.6971205647929913e-05, + "loss": 0.7527, + "step": 831 + }, + { + "epoch": 0.03647755625881847, + "grad_norm": 0.8203125, + "learning_rate": 4.696730257169762e-05, + "loss": 0.813, + "step": 832 + }, + { + "epoch": 0.0365213994754757, + "grad_norm": 0.84375, + "learning_rate": 4.696339960786505e-05, + "loss": 0.7886, + "step": 833 + }, + { + "epoch": 0.03656524269213293, + "grad_norm": 0.7734375, + "learning_rate": 4.695949675644058e-05, + "loss": 0.7574, + "step": 834 + }, + { + "epoch": 0.03660908590879017, + "grad_norm": 0.79296875, + "learning_rate": 4.695559401743241e-05, + "loss": 0.8773, + "step": 835 + }, + { + "epoch": 0.0366529291254474, + "grad_norm": 0.87890625, + "learning_rate": 4.695169139084883e-05, + "loss": 0.8787, + "step": 836 + }, + { + "epoch": 0.03669677234210463, + "grad_norm": 0.7421875, + "learning_rate": 4.6947788876698106e-05, + "loss": 0.7954, + "step": 837 + }, + { + "epoch": 0.03674061555876187, + "grad_norm": 0.7734375, + "learning_rate": 4.6943886474988516e-05, + "loss": 0.8524, + "step": 838 + }, + { + "epoch": 0.0367844587754191, + "grad_norm": 0.86328125, + "learning_rate": 4.693998418572828e-05, + "loss": 0.7208, + "step": 839 + }, + { + "epoch": 0.03682830199207633, + "grad_norm": 0.84375, + "learning_rate": 4.6936082008925766e-05, + "loss": 0.9724, + "step": 840 + }, + { + "epoch": 0.03687214520873357, + "grad_norm": 0.84765625, + "learning_rate": 4.693217994458918e-05, + "loss": 0.8662, + "step": 841 + }, + { + "epoch": 0.0369159884253908, + "grad_norm": 0.796875, + "learning_rate": 4.692827799272681e-05, + "loss": 0.7121, + "step": 842 + }, + { + "epoch": 0.03695983164204804, + "grad_norm": 0.765625, + "learning_rate": 4.6924376153346914e-05, + "loss": 0.8666, + "step": 843 + }, + { + "epoch": 0.03700367485870527, + "grad_norm": 0.8359375, + "learning_rate": 4.692047442645779e-05, + "loss": 0.7397, + "step": 844 + }, + { + "epoch": 0.0370475180753625, + "grad_norm": 0.796875, + "learning_rate": 4.691657281206767e-05, + "loss": 1.0833, + "step": 845 + }, + { + "epoch": 0.03709136129201974, + "grad_norm": 0.828125, + "learning_rate": 4.6912671310184844e-05, + "loss": 0.7767, + "step": 846 + }, + { + "epoch": 0.037135204508676974, + "grad_norm": 0.84375, + "learning_rate": 4.6908769920817576e-05, + "loss": 0.7881, + "step": 847 + }, + { + "epoch": 0.0371790477253342, + "grad_norm": 0.8046875, + "learning_rate": 4.690486864397413e-05, + "loss": 0.8244, + "step": 848 + }, + { + "epoch": 0.03722289094199144, + "grad_norm": 0.765625, + "learning_rate": 4.690096747966275e-05, + "loss": 0.7373, + "step": 849 + }, + { + "epoch": 0.03726673415864867, + "grad_norm": 0.8515625, + "learning_rate": 4.689706642789177e-05, + "loss": 0.749, + "step": 850 + }, + { + "epoch": 0.03731057737530591, + "grad_norm": 0.96484375, + "learning_rate": 4.689316548866942e-05, + "loss": 0.7239, + "step": 851 + }, + { + "epoch": 0.03735442059196314, + "grad_norm": 0.85546875, + "learning_rate": 4.6889264662003974e-05, + "loss": 0.783, + "step": 852 + }, + { + "epoch": 0.03739826380862037, + "grad_norm": 0.82421875, + "learning_rate": 4.688536394790369e-05, + "loss": 0.7884, + "step": 853 + }, + { + "epoch": 0.03744210702527761, + "grad_norm": 0.78515625, + "learning_rate": 4.688146334637682e-05, + "loss": 0.8463, + "step": 854 + }, + { + "epoch": 0.037485950241934844, + "grad_norm": 0.91015625, + "learning_rate": 4.6877562857431675e-05, + "loss": 0.8341, + "step": 855 + }, + { + "epoch": 0.03752979345859207, + "grad_norm": 0.76171875, + "learning_rate": 4.68736624810765e-05, + "loss": 0.7789, + "step": 856 + }, + { + "epoch": 0.03757363667524931, + "grad_norm": 0.7265625, + "learning_rate": 4.686976221731957e-05, + "loss": 0.7288, + "step": 857 + }, + { + "epoch": 0.037617479891906544, + "grad_norm": 0.81640625, + "learning_rate": 4.6865862066169134e-05, + "loss": 0.87, + "step": 858 + }, + { + "epoch": 0.03766132310856377, + "grad_norm": 0.77734375, + "learning_rate": 4.686196202763342e-05, + "loss": 0.7538, + "step": 859 + }, + { + "epoch": 0.03770516632522101, + "grad_norm": 0.8125, + "learning_rate": 4.685806210172079e-05, + "loss": 0.9011, + "step": 860 + }, + { + "epoch": 0.037749009541878244, + "grad_norm": 0.87890625, + "learning_rate": 4.6854162288439464e-05, + "loss": 0.9053, + "step": 861 + }, + { + "epoch": 0.03779285275853548, + "grad_norm": 0.87890625, + "learning_rate": 4.68502625877977e-05, + "loss": 0.839, + "step": 862 + }, + { + "epoch": 0.03783669597519271, + "grad_norm": 0.7421875, + "learning_rate": 4.6846362999803764e-05, + "loss": 0.7591, + "step": 863 + }, + { + "epoch": 0.03788053919184994, + "grad_norm": 0.80078125, + "learning_rate": 4.6842463524465884e-05, + "loss": 0.7944, + "step": 864 + }, + { + "epoch": 0.03792438240850718, + "grad_norm": 0.828125, + "learning_rate": 4.68385641617924e-05, + "loss": 0.8077, + "step": 865 + }, + { + "epoch": 0.037968225625164415, + "grad_norm": 0.8203125, + "learning_rate": 4.6834664911791546e-05, + "loss": 0.7948, + "step": 866 + }, + { + "epoch": 0.03801206884182164, + "grad_norm": 0.80078125, + "learning_rate": 4.6830765774471575e-05, + "loss": 0.8885, + "step": 867 + }, + { + "epoch": 0.03805591205847888, + "grad_norm": 0.91796875, + "learning_rate": 4.682686674984077e-05, + "loss": 0.8867, + "step": 868 + }, + { + "epoch": 0.038099755275136114, + "grad_norm": 0.8125, + "learning_rate": 4.6822967837907324e-05, + "loss": 0.8257, + "step": 869 + }, + { + "epoch": 0.03814359849179335, + "grad_norm": 0.8671875, + "learning_rate": 4.6819069038679606e-05, + "loss": 0.8094, + "step": 870 + }, + { + "epoch": 0.03818744170845058, + "grad_norm": 0.8359375, + "learning_rate": 4.681517035216584e-05, + "loss": 0.8325, + "step": 871 + }, + { + "epoch": 0.038231284925107814, + "grad_norm": 0.83203125, + "learning_rate": 4.681127177837428e-05, + "loss": 0.9088, + "step": 872 + }, + { + "epoch": 0.03827512814176505, + "grad_norm": 0.82421875, + "learning_rate": 4.680737331731317e-05, + "loss": 0.8961, + "step": 873 + }, + { + "epoch": 0.038318971358422285, + "grad_norm": 0.73828125, + "learning_rate": 4.6803474968990766e-05, + "loss": 0.8235, + "step": 874 + }, + { + "epoch": 0.038362814575079514, + "grad_norm": 0.77734375, + "learning_rate": 4.679957673341541e-05, + "loss": 0.7752, + "step": 875 + }, + { + "epoch": 0.03840665779173675, + "grad_norm": 0.8984375, + "learning_rate": 4.6795678610595284e-05, + "loss": 0.7996, + "step": 876 + }, + { + "epoch": 0.038450501008393985, + "grad_norm": 0.78515625, + "learning_rate": 4.6791780600538696e-05, + "loss": 0.8057, + "step": 877 + }, + { + "epoch": 0.038494344225051214, + "grad_norm": 0.87890625, + "learning_rate": 4.678788270325387e-05, + "loss": 0.8253, + "step": 878 + }, + { + "epoch": 0.03853818744170845, + "grad_norm": 0.82421875, + "learning_rate": 4.678398491874906e-05, + "loss": 1.0357, + "step": 879 + }, + { + "epoch": 0.038582030658365685, + "grad_norm": 0.83203125, + "learning_rate": 4.6780087247032586e-05, + "loss": 0.821, + "step": 880 + }, + { + "epoch": 0.03862587387502292, + "grad_norm": 0.8515625, + "learning_rate": 4.677618968811267e-05, + "loss": 0.9646, + "step": 881 + }, + { + "epoch": 0.03866971709168015, + "grad_norm": 0.90625, + "learning_rate": 4.677229224199758e-05, + "loss": 0.9656, + "step": 882 + }, + { + "epoch": 0.038713560308337384, + "grad_norm": 0.8359375, + "learning_rate": 4.676839490869559e-05, + "loss": 0.7413, + "step": 883 + }, + { + "epoch": 0.03875740352499462, + "grad_norm": 0.8671875, + "learning_rate": 4.676449768821493e-05, + "loss": 0.8075, + "step": 884 + }, + { + "epoch": 0.038801246741651856, + "grad_norm": 0.7890625, + "learning_rate": 4.676060058056387e-05, + "loss": 0.9354, + "step": 885 + }, + { + "epoch": 0.038845089958309084, + "grad_norm": 0.83984375, + "learning_rate": 4.675670358575068e-05, + "loss": 0.8162, + "step": 886 + }, + { + "epoch": 0.03888893317496632, + "grad_norm": 0.8515625, + "learning_rate": 4.675280670378358e-05, + "loss": 0.8087, + "step": 887 + }, + { + "epoch": 0.038932776391623555, + "grad_norm": 0.7890625, + "learning_rate": 4.6748909934670906e-05, + "loss": 0.8237, + "step": 888 + }, + { + "epoch": 0.03897661960828079, + "grad_norm": 0.80078125, + "learning_rate": 4.674501327842086e-05, + "loss": 0.85, + "step": 889 + }, + { + "epoch": 0.03902046282493802, + "grad_norm": 0.8046875, + "learning_rate": 4.674111673504171e-05, + "loss": 0.7885, + "step": 890 + }, + { + "epoch": 0.039064306041595255, + "grad_norm": 0.83984375, + "learning_rate": 4.673722030454174e-05, + "loss": 0.7516, + "step": 891 + }, + { + "epoch": 0.03910814925825249, + "grad_norm": 0.78125, + "learning_rate": 4.6733323986929164e-05, + "loss": 0.8087, + "step": 892 + }, + { + "epoch": 0.039151992474909726, + "grad_norm": 0.83203125, + "learning_rate": 4.6729427782212274e-05, + "loss": 0.8857, + "step": 893 + }, + { + "epoch": 0.039195835691566955, + "grad_norm": 0.80859375, + "learning_rate": 4.672553169039928e-05, + "loss": 0.7438, + "step": 894 + }, + { + "epoch": 0.03923967890822419, + "grad_norm": 0.8046875, + "learning_rate": 4.672163571149853e-05, + "loss": 1.0137, + "step": 895 + }, + { + "epoch": 0.039283522124881426, + "grad_norm": 0.8828125, + "learning_rate": 4.67177398455182e-05, + "loss": 0.845, + "step": 896 + }, + { + "epoch": 0.039327365341538655, + "grad_norm": 0.8984375, + "learning_rate": 4.671384409246658e-05, + "loss": 0.8739, + "step": 897 + }, + { + "epoch": 0.03937120855819589, + "grad_norm": 0.828125, + "learning_rate": 4.670994845235194e-05, + "loss": 0.7791, + "step": 898 + }, + { + "epoch": 0.039415051774853126, + "grad_norm": 0.81640625, + "learning_rate": 4.670605292518246e-05, + "loss": 0.8193, + "step": 899 + }, + { + "epoch": 0.03945889499151036, + "grad_norm": 0.7578125, + "learning_rate": 4.6702157510966504e-05, + "loss": 0.7832, + "step": 900 + }, + { + "epoch": 0.03950273820816759, + "grad_norm": 0.875, + "learning_rate": 4.669826220971228e-05, + "loss": 0.7849, + "step": 901 + }, + { + "epoch": 0.039546581424824825, + "grad_norm": 0.83203125, + "learning_rate": 4.6694367021428044e-05, + "loss": 0.8514, + "step": 902 + }, + { + "epoch": 0.03959042464148206, + "grad_norm": 0.8203125, + "learning_rate": 4.6690471946122036e-05, + "loss": 0.7541, + "step": 903 + }, + { + "epoch": 0.0396342678581393, + "grad_norm": 0.828125, + "learning_rate": 4.66865769838025e-05, + "loss": 0.8809, + "step": 904 + }, + { + "epoch": 0.039678111074796525, + "grad_norm": 0.77734375, + "learning_rate": 4.668268213447775e-05, + "loss": 0.7546, + "step": 905 + }, + { + "epoch": 0.03972195429145376, + "grad_norm": 0.83984375, + "learning_rate": 4.667878739815601e-05, + "loss": 0.8383, + "step": 906 + }, + { + "epoch": 0.039765797508110996, + "grad_norm": 0.86328125, + "learning_rate": 4.6674892774845534e-05, + "loss": 0.7255, + "step": 907 + }, + { + "epoch": 0.03980964072476823, + "grad_norm": 0.859375, + "learning_rate": 4.6670998264554575e-05, + "loss": 0.6888, + "step": 908 + }, + { + "epoch": 0.03985348394142546, + "grad_norm": 1.0390625, + "learning_rate": 4.6667103867291354e-05, + "loss": 0.7105, + "step": 909 + }, + { + "epoch": 0.039897327158082696, + "grad_norm": 0.7734375, + "learning_rate": 4.6663209583064184e-05, + "loss": 0.7589, + "step": 910 + }, + { + "epoch": 0.03994117037473993, + "grad_norm": 0.82421875, + "learning_rate": 4.66593154118813e-05, + "loss": 0.778, + "step": 911 + }, + { + "epoch": 0.03998501359139717, + "grad_norm": 0.890625, + "learning_rate": 4.665542135375094e-05, + "loss": 0.7405, + "step": 912 + }, + { + "epoch": 0.040028856808054396, + "grad_norm": 0.83203125, + "learning_rate": 4.6651527408681375e-05, + "loss": 0.8319, + "step": 913 + }, + { + "epoch": 0.04007270002471163, + "grad_norm": 0.80859375, + "learning_rate": 4.664763357668081e-05, + "loss": 0.7368, + "step": 914 + }, + { + "epoch": 0.04011654324136887, + "grad_norm": 0.80859375, + "learning_rate": 4.6643739857757565e-05, + "loss": 0.9123, + "step": 915 + }, + { + "epoch": 0.040160386458026096, + "grad_norm": 0.828125, + "learning_rate": 4.663984625191987e-05, + "loss": 0.747, + "step": 916 + }, + { + "epoch": 0.04020422967468333, + "grad_norm": 0.99609375, + "learning_rate": 4.6635952759175973e-05, + "loss": 1.0047, + "step": 917 + }, + { + "epoch": 0.04024807289134057, + "grad_norm": 0.80078125, + "learning_rate": 4.663205937953412e-05, + "loss": 0.774, + "step": 918 + }, + { + "epoch": 0.0402919161079978, + "grad_norm": 0.80078125, + "learning_rate": 4.6628166113002525e-05, + "loss": 0.7734, + "step": 919 + }, + { + "epoch": 0.04033575932465503, + "grad_norm": 0.97265625, + "learning_rate": 4.6624272959589535e-05, + "loss": 0.87, + "step": 920 + }, + { + "epoch": 0.040379602541312266, + "grad_norm": 0.83203125, + "learning_rate": 4.6620379919303324e-05, + "loss": 0.9368, + "step": 921 + }, + { + "epoch": 0.0404234457579695, + "grad_norm": 0.890625, + "learning_rate": 4.661648699215219e-05, + "loss": 0.9472, + "step": 922 + }, + { + "epoch": 0.04046728897462674, + "grad_norm": 0.8671875, + "learning_rate": 4.661259417814434e-05, + "loss": 0.8893, + "step": 923 + }, + { + "epoch": 0.040511132191283966, + "grad_norm": 0.8359375, + "learning_rate": 4.6608701477288066e-05, + "loss": 0.8196, + "step": 924 + }, + { + "epoch": 0.0405549754079412, + "grad_norm": 0.91015625, + "learning_rate": 4.660480888959154e-05, + "loss": 0.905, + "step": 925 + }, + { + "epoch": 0.04059881862459844, + "grad_norm": 0.76953125, + "learning_rate": 4.6600916415063126e-05, + "loss": 0.7488, + "step": 926 + }, + { + "epoch": 0.04064266184125567, + "grad_norm": 0.83984375, + "learning_rate": 4.6597024053711e-05, + "loss": 0.821, + "step": 927 + }, + { + "epoch": 0.0406865050579129, + "grad_norm": 0.8046875, + "learning_rate": 4.659313180554344e-05, + "loss": 1.0071, + "step": 928 + }, + { + "epoch": 0.04073034827457014, + "grad_norm": 0.83984375, + "learning_rate": 4.658923967056869e-05, + "loss": 0.8563, + "step": 929 + }, + { + "epoch": 0.04077419149122737, + "grad_norm": 0.8203125, + "learning_rate": 4.6585347648794974e-05, + "loss": 0.792, + "step": 930 + }, + { + "epoch": 0.0408180347078846, + "grad_norm": 0.75, + "learning_rate": 4.6581455740230574e-05, + "loss": 0.7986, + "step": 931 + }, + { + "epoch": 0.04086187792454184, + "grad_norm": 0.91015625, + "learning_rate": 4.657756394488373e-05, + "loss": 0.9138, + "step": 932 + }, + { + "epoch": 0.04090572114119907, + "grad_norm": 0.84765625, + "learning_rate": 4.657367226276268e-05, + "loss": 0.7711, + "step": 933 + }, + { + "epoch": 0.04094956435785631, + "grad_norm": 0.87890625, + "learning_rate": 4.656978069387563e-05, + "loss": 0.7888, + "step": 934 + }, + { + "epoch": 0.040993407574513537, + "grad_norm": 1.078125, + "learning_rate": 4.656588923823093e-05, + "loss": 0.9499, + "step": 935 + }, + { + "epoch": 0.04103725079117077, + "grad_norm": 0.9140625, + "learning_rate": 4.656199789583676e-05, + "loss": 1.0489, + "step": 936 + }, + { + "epoch": 0.04108109400782801, + "grad_norm": 0.9453125, + "learning_rate": 4.6558106666701395e-05, + "loss": 0.9127, + "step": 937 + }, + { + "epoch": 0.04112493722448524, + "grad_norm": 0.859375, + "learning_rate": 4.655421555083306e-05, + "loss": 0.8221, + "step": 938 + }, + { + "epoch": 0.04116878044114247, + "grad_norm": 0.84375, + "learning_rate": 4.6550324548239974e-05, + "loss": 0.8431, + "step": 939 + }, + { + "epoch": 0.04121262365779971, + "grad_norm": 0.90625, + "learning_rate": 4.654643365893045e-05, + "loss": 0.8338, + "step": 940 + }, + { + "epoch": 0.04125646687445694, + "grad_norm": 0.8203125, + "learning_rate": 4.654254288291272e-05, + "loss": 0.8262, + "step": 941 + }, + { + "epoch": 0.04130031009111418, + "grad_norm": 0.85546875, + "learning_rate": 4.6538652220195014e-05, + "loss": 0.947, + "step": 942 + }, + { + "epoch": 0.04134415330777141, + "grad_norm": 0.8359375, + "learning_rate": 4.6534761670785564e-05, + "loss": 0.8573, + "step": 943 + }, + { + "epoch": 0.04138799652442864, + "grad_norm": 0.89453125, + "learning_rate": 4.6530871234692595e-05, + "loss": 0.9128, + "step": 944 + }, + { + "epoch": 0.04143183974108588, + "grad_norm": 0.91015625, + "learning_rate": 4.6526980911924435e-05, + "loss": 0.872, + "step": 945 + }, + { + "epoch": 0.041475682957743114, + "grad_norm": 0.76953125, + "learning_rate": 4.652309070248928e-05, + "loss": 0.8877, + "step": 946 + }, + { + "epoch": 0.04151952617440034, + "grad_norm": 0.82421875, + "learning_rate": 4.6519200606395386e-05, + "loss": 0.7595, + "step": 947 + }, + { + "epoch": 0.04156336939105758, + "grad_norm": 0.85546875, + "learning_rate": 4.651531062365098e-05, + "loss": 0.8726, + "step": 948 + }, + { + "epoch": 0.041607212607714814, + "grad_norm": 0.86328125, + "learning_rate": 4.651142075426428e-05, + "loss": 0.8928, + "step": 949 + }, + { + "epoch": 0.04165105582437204, + "grad_norm": 0.78515625, + "learning_rate": 4.65075309982436e-05, + "loss": 0.8522, + "step": 950 + }, + { + "epoch": 0.04169489904102928, + "grad_norm": 0.75390625, + "learning_rate": 4.650364135559715e-05, + "loss": 0.776, + "step": 951 + }, + { + "epoch": 0.04173874225768651, + "grad_norm": 0.875, + "learning_rate": 4.649975182633318e-05, + "loss": 0.8382, + "step": 952 + }, + { + "epoch": 0.04178258547434375, + "grad_norm": 0.69140625, + "learning_rate": 4.649586241045992e-05, + "loss": 0.7519, + "step": 953 + }, + { + "epoch": 0.04182642869100098, + "grad_norm": 0.79296875, + "learning_rate": 4.6491973107985595e-05, + "loss": 0.7154, + "step": 954 + }, + { + "epoch": 0.04187027190765821, + "grad_norm": 0.7734375, + "learning_rate": 4.648808391891851e-05, + "loss": 0.8479, + "step": 955 + }, + { + "epoch": 0.04191411512431545, + "grad_norm": 0.78125, + "learning_rate": 4.648419484326687e-05, + "loss": 0.8351, + "step": 956 + }, + { + "epoch": 0.041957958340972684, + "grad_norm": 1.0703125, + "learning_rate": 4.6480305881038925e-05, + "loss": 0.8151, + "step": 957 + }, + { + "epoch": 0.04200180155762991, + "grad_norm": 0.83984375, + "learning_rate": 4.64764170322429e-05, + "loss": 0.8459, + "step": 958 + }, + { + "epoch": 0.04204564477428715, + "grad_norm": 0.8046875, + "learning_rate": 4.6472528296887016e-05, + "loss": 0.8176, + "step": 959 + }, + { + "epoch": 0.042089487990944384, + "grad_norm": 0.87890625, + "learning_rate": 4.6468639674979594e-05, + "loss": 0.9259, + "step": 960 + }, + { + "epoch": 0.04213333120760162, + "grad_norm": 0.8203125, + "learning_rate": 4.646475116652883e-05, + "loss": 0.8431, + "step": 961 + }, + { + "epoch": 0.04217717442425885, + "grad_norm": 0.859375, + "learning_rate": 4.646086277154297e-05, + "loss": 0.8584, + "step": 962 + }, + { + "epoch": 0.042221017640916084, + "grad_norm": 0.9296875, + "learning_rate": 4.645697449003024e-05, + "loss": 0.9001, + "step": 963 + }, + { + "epoch": 0.04226486085757332, + "grad_norm": 0.86328125, + "learning_rate": 4.645308632199885e-05, + "loss": 0.8394, + "step": 964 + }, + { + "epoch": 0.042308704074230555, + "grad_norm": 0.8671875, + "learning_rate": 4.644919826745714e-05, + "loss": 1.0058, + "step": 965 + }, + { + "epoch": 0.04235254729088778, + "grad_norm": 0.78125, + "learning_rate": 4.644531032641328e-05, + "loss": 0.8179, + "step": 966 + }, + { + "epoch": 0.04239639050754502, + "grad_norm": 0.8046875, + "learning_rate": 4.644142249887552e-05, + "loss": 0.7913, + "step": 967 + }, + { + "epoch": 0.042440233724202255, + "grad_norm": 0.8828125, + "learning_rate": 4.643753478485211e-05, + "loss": 0.7546, + "step": 968 + }, + { + "epoch": 0.04248407694085948, + "grad_norm": 0.82421875, + "learning_rate": 4.643364718435129e-05, + "loss": 0.8855, + "step": 969 + }, + { + "epoch": 0.04252792015751672, + "grad_norm": 0.8828125, + "learning_rate": 4.642975969738128e-05, + "loss": 0.8658, + "step": 970 + }, + { + "epoch": 0.042571763374173954, + "grad_norm": 0.81640625, + "learning_rate": 4.642587232395035e-05, + "loss": 0.8082, + "step": 971 + }, + { + "epoch": 0.04261560659083119, + "grad_norm": 0.84765625, + "learning_rate": 4.642198506406671e-05, + "loss": 0.8532, + "step": 972 + }, + { + "epoch": 0.04265944980748842, + "grad_norm": 0.76953125, + "learning_rate": 4.641809791773857e-05, + "loss": 0.7739, + "step": 973 + }, + { + "epoch": 0.042703293024145654, + "grad_norm": 0.83984375, + "learning_rate": 4.641421088497425e-05, + "loss": 0.9204, + "step": 974 + }, + { + "epoch": 0.04274713624080289, + "grad_norm": 0.98046875, + "learning_rate": 4.641032396578195e-05, + "loss": 0.9235, + "step": 975 + }, + { + "epoch": 0.042790979457460125, + "grad_norm": 0.703125, + "learning_rate": 4.64064371601699e-05, + "loss": 0.6859, + "step": 976 + }, + { + "epoch": 0.042834822674117354, + "grad_norm": 1.046875, + "learning_rate": 4.640255046814635e-05, + "loss": 0.784, + "step": 977 + }, + { + "epoch": 0.04287866589077459, + "grad_norm": 0.82421875, + "learning_rate": 4.639866388971952e-05, + "loss": 0.837, + "step": 978 + }, + { + "epoch": 0.042922509107431825, + "grad_norm": 0.87109375, + "learning_rate": 4.639477742489763e-05, + "loss": 0.9581, + "step": 979 + }, + { + "epoch": 0.04296635232408906, + "grad_norm": 0.7890625, + "learning_rate": 4.639089107368898e-05, + "loss": 0.7713, + "step": 980 + }, + { + "epoch": 0.04301019554074629, + "grad_norm": 0.86328125, + "learning_rate": 4.638700483610177e-05, + "loss": 0.8536, + "step": 981 + }, + { + "epoch": 0.043054038757403525, + "grad_norm": 0.88671875, + "learning_rate": 4.638311871214425e-05, + "loss": 0.9021, + "step": 982 + }, + { + "epoch": 0.04309788197406076, + "grad_norm": 0.88671875, + "learning_rate": 4.6379232701824624e-05, + "loss": 0.8959, + "step": 983 + }, + { + "epoch": 0.043141725190717996, + "grad_norm": 0.84375, + "learning_rate": 4.6375346805151134e-05, + "loss": 0.8732, + "step": 984 + }, + { + "epoch": 0.043185568407375224, + "grad_norm": 0.94921875, + "learning_rate": 4.637146102213206e-05, + "loss": 0.9525, + "step": 985 + }, + { + "epoch": 0.04322941162403246, + "grad_norm": 0.86328125, + "learning_rate": 4.636757535277561e-05, + "loss": 0.7648, + "step": 986 + }, + { + "epoch": 0.043273254840689696, + "grad_norm": 0.81640625, + "learning_rate": 4.636368979709e-05, + "loss": 0.9577, + "step": 987 + }, + { + "epoch": 0.043317098057346924, + "grad_norm": 0.8515625, + "learning_rate": 4.635980435508352e-05, + "loss": 0.9109, + "step": 988 + }, + { + "epoch": 0.04336094127400416, + "grad_norm": 0.84765625, + "learning_rate": 4.635591902676434e-05, + "loss": 0.8395, + "step": 989 + }, + { + "epoch": 0.043404784490661395, + "grad_norm": 0.88671875, + "learning_rate": 4.6352033812140695e-05, + "loss": 0.8334, + "step": 990 + }, + { + "epoch": 0.04344862770731863, + "grad_norm": 0.8203125, + "learning_rate": 4.6348148711220886e-05, + "loss": 0.81, + "step": 991 + }, + { + "epoch": 0.04349247092397586, + "grad_norm": 0.94140625, + "learning_rate": 4.63442637240131e-05, + "loss": 1.0106, + "step": 992 + }, + { + "epoch": 0.043536314140633095, + "grad_norm": 0.90234375, + "learning_rate": 4.6340378850525604e-05, + "loss": 0.8081, + "step": 993 + }, + { + "epoch": 0.04358015735729033, + "grad_norm": 0.8671875, + "learning_rate": 4.633649409076659e-05, + "loss": 0.8272, + "step": 994 + }, + { + "epoch": 0.043624000573947566, + "grad_norm": 0.84765625, + "learning_rate": 4.633260944474427e-05, + "loss": 0.7197, + "step": 995 + }, + { + "epoch": 0.043667843790604795, + "grad_norm": 0.7734375, + "learning_rate": 4.6328724912466955e-05, + "loss": 0.7874, + "step": 996 + }, + { + "epoch": 0.04371168700726203, + "grad_norm": 0.8359375, + "learning_rate": 4.6324840493942855e-05, + "loss": 0.8228, + "step": 997 + }, + { + "epoch": 0.043755530223919266, + "grad_norm": 0.81640625, + "learning_rate": 4.632095618918017e-05, + "loss": 0.8872, + "step": 998 + }, + { + "epoch": 0.0437993734405765, + "grad_norm": 0.7734375, + "learning_rate": 4.631707199818717e-05, + "loss": 0.9569, + "step": 999 + }, + { + "epoch": 0.04384321665723373, + "grad_norm": 0.77734375, + "learning_rate": 4.631318792097201e-05, + "loss": 0.9468, + "step": 1000 + }, + { + "epoch": 0.04384321665723373, + "eval_loss": 0.824898362159729, + "eval_runtime": 275.0862, + "eval_samples_per_second": 36.352, + "eval_steps_per_second": 0.76, + "step": 1000 + }, + { + "epoch": 0.043887059873890966, + "grad_norm": 0.88671875, + "learning_rate": 4.630930395754303e-05, + "loss": 0.9607, + "step": 1001 + }, + { + "epoch": 0.0439309030905482, + "grad_norm": 0.8359375, + "learning_rate": 4.6305420107908406e-05, + "loss": 0.8991, + "step": 1002 + }, + { + "epoch": 0.04397474630720544, + "grad_norm": 0.8984375, + "learning_rate": 4.630153637207637e-05, + "loss": 0.7477, + "step": 1003 + }, + { + "epoch": 0.044018589523862665, + "grad_norm": 0.86328125, + "learning_rate": 4.629765275005517e-05, + "loss": 0.7383, + "step": 1004 + }, + { + "epoch": 0.0440624327405199, + "grad_norm": 0.78515625, + "learning_rate": 4.629376924185297e-05, + "loss": 0.817, + "step": 1005 + }, + { + "epoch": 0.044106275957177137, + "grad_norm": 0.828125, + "learning_rate": 4.628988584747811e-05, + "loss": 0.8018, + "step": 1006 + }, + { + "epoch": 0.044150119173834365, + "grad_norm": 0.8046875, + "learning_rate": 4.628600256693876e-05, + "loss": 0.8451, + "step": 1007 + }, + { + "epoch": 0.0441939623904916, + "grad_norm": 0.84765625, + "learning_rate": 4.628211940024316e-05, + "loss": 0.8048, + "step": 1008 + }, + { + "epoch": 0.044237805607148836, + "grad_norm": 0.92578125, + "learning_rate": 4.627823634739954e-05, + "loss": 0.9239, + "step": 1009 + }, + { + "epoch": 0.04428164882380607, + "grad_norm": 0.8125, + "learning_rate": 4.627435340841608e-05, + "loss": 0.8787, + "step": 1010 + }, + { + "epoch": 0.0443254920404633, + "grad_norm": 0.84375, + "learning_rate": 4.627047058330111e-05, + "loss": 0.9069, + "step": 1011 + }, + { + "epoch": 0.044369335257120536, + "grad_norm": 0.89453125, + "learning_rate": 4.62665878720628e-05, + "loss": 0.8426, + "step": 1012 + }, + { + "epoch": 0.04441317847377777, + "grad_norm": 0.78125, + "learning_rate": 4.626270527470938e-05, + "loss": 0.7745, + "step": 1013 + }, + { + "epoch": 0.04445702169043501, + "grad_norm": 0.84375, + "learning_rate": 4.625882279124909e-05, + "loss": 0.7803, + "step": 1014 + }, + { + "epoch": 0.044500864907092236, + "grad_norm": 0.8984375, + "learning_rate": 4.6254940421690155e-05, + "loss": 0.9732, + "step": 1015 + }, + { + "epoch": 0.04454470812374947, + "grad_norm": 0.8828125, + "learning_rate": 4.625105816604079e-05, + "loss": 0.8628, + "step": 1016 + }, + { + "epoch": 0.04458855134040671, + "grad_norm": 0.79296875, + "learning_rate": 4.6247176024309244e-05, + "loss": 0.7283, + "step": 1017 + }, + { + "epoch": 0.04463239455706394, + "grad_norm": 0.84765625, + "learning_rate": 4.624329399650375e-05, + "loss": 1.0226, + "step": 1018 + }, + { + "epoch": 0.04467623777372117, + "grad_norm": 0.90234375, + "learning_rate": 4.6239412082632505e-05, + "loss": 0.8751, + "step": 1019 + }, + { + "epoch": 0.04472008099037841, + "grad_norm": 0.85546875, + "learning_rate": 4.623553028270371e-05, + "loss": 0.8147, + "step": 1020 + }, + { + "epoch": 0.04476392420703564, + "grad_norm": 0.75, + "learning_rate": 4.623164859672568e-05, + "loss": 0.761, + "step": 1021 + }, + { + "epoch": 0.04480776742369288, + "grad_norm": 0.8671875, + "learning_rate": 4.6227767024706604e-05, + "loss": 0.7966, + "step": 1022 + }, + { + "epoch": 0.044851610640350106, + "grad_norm": 0.9140625, + "learning_rate": 4.62238855666547e-05, + "loss": 1.0731, + "step": 1023 + }, + { + "epoch": 0.04489545385700734, + "grad_norm": 0.80078125, + "learning_rate": 4.6220004222578194e-05, + "loss": 0.8947, + "step": 1024 + }, + { + "epoch": 0.04493929707366458, + "grad_norm": 0.953125, + "learning_rate": 4.6216122992485275e-05, + "loss": 0.9233, + "step": 1025 + }, + { + "epoch": 0.044983140290321806, + "grad_norm": 0.80078125, + "learning_rate": 4.6212241876384255e-05, + "loss": 0.9662, + "step": 1026 + }, + { + "epoch": 0.04502698350697904, + "grad_norm": 0.97265625, + "learning_rate": 4.6208360874283305e-05, + "loss": 0.8091, + "step": 1027 + }, + { + "epoch": 0.04507082672363628, + "grad_norm": 0.87890625, + "learning_rate": 4.6204479986190666e-05, + "loss": 0.8551, + "step": 1028 + }, + { + "epoch": 0.04511466994029351, + "grad_norm": 0.7734375, + "learning_rate": 4.6200599212114557e-05, + "loss": 0.9995, + "step": 1029 + }, + { + "epoch": 0.04515851315695074, + "grad_norm": 0.9453125, + "learning_rate": 4.6196718552063156e-05, + "loss": 0.9397, + "step": 1030 + }, + { + "epoch": 0.04520235637360798, + "grad_norm": 0.77734375, + "learning_rate": 4.619283800604478e-05, + "loss": 0.8135, + "step": 1031 + }, + { + "epoch": 0.04524619959026521, + "grad_norm": 0.859375, + "learning_rate": 4.618895757406761e-05, + "loss": 1.0393, + "step": 1032 + }, + { + "epoch": 0.04529004280692245, + "grad_norm": 0.83984375, + "learning_rate": 4.6185077256139866e-05, + "loss": 0.8058, + "step": 1033 + }, + { + "epoch": 0.04533388602357968, + "grad_norm": 0.7265625, + "learning_rate": 4.6181197052269776e-05, + "loss": 0.7691, + "step": 1034 + }, + { + "epoch": 0.04537772924023691, + "grad_norm": 0.9765625, + "learning_rate": 4.617731696246552e-05, + "loss": 0.8019, + "step": 1035 + }, + { + "epoch": 0.04542157245689415, + "grad_norm": 0.875, + "learning_rate": 4.6173436986735405e-05, + "loss": 0.8305, + "step": 1036 + }, + { + "epoch": 0.04546541567355138, + "grad_norm": 1.0390625, + "learning_rate": 4.6169557125087625e-05, + "loss": 0.9168, + "step": 1037 + }, + { + "epoch": 0.04550925889020861, + "grad_norm": 0.83984375, + "learning_rate": 4.616567737753038e-05, + "loss": 0.9096, + "step": 1038 + }, + { + "epoch": 0.04555310210686585, + "grad_norm": 0.8125, + "learning_rate": 4.61617977440719e-05, + "loss": 0.7926, + "step": 1039 + }, + { + "epoch": 0.04559694532352308, + "grad_norm": 0.82421875, + "learning_rate": 4.6157918224720385e-05, + "loss": 0.6909, + "step": 1040 + }, + { + "epoch": 0.04564078854018031, + "grad_norm": 0.90625, + "learning_rate": 4.6154038819484125e-05, + "loss": 0.9262, + "step": 1041 + }, + { + "epoch": 0.04568463175683755, + "grad_norm": 0.76171875, + "learning_rate": 4.61501595283713e-05, + "loss": 0.8302, + "step": 1042 + }, + { + "epoch": 0.04572847497349478, + "grad_norm": 0.875, + "learning_rate": 4.614628035139013e-05, + "loss": 0.8904, + "step": 1043 + }, + { + "epoch": 0.04577231819015202, + "grad_norm": 0.7265625, + "learning_rate": 4.614240128854883e-05, + "loss": 0.7823, + "step": 1044 + }, + { + "epoch": 0.04581616140680925, + "grad_norm": 0.8828125, + "learning_rate": 4.6138522339855604e-05, + "loss": 0.8831, + "step": 1045 + }, + { + "epoch": 0.04586000462346648, + "grad_norm": 0.87890625, + "learning_rate": 4.613464350531874e-05, + "loss": 0.7933, + "step": 1046 + }, + { + "epoch": 0.04590384784012372, + "grad_norm": 0.90234375, + "learning_rate": 4.6130764784946425e-05, + "loss": 0.8098, + "step": 1047 + }, + { + "epoch": 0.045947691056780954, + "grad_norm": 0.90234375, + "learning_rate": 4.6126886178746865e-05, + "loss": 0.7094, + "step": 1048 + }, + { + "epoch": 0.04599153427343818, + "grad_norm": 0.8515625, + "learning_rate": 4.61230076867283e-05, + "loss": 0.792, + "step": 1049 + }, + { + "epoch": 0.04603537749009542, + "grad_norm": 0.88671875, + "learning_rate": 4.61191293088989e-05, + "loss": 0.887, + "step": 1050 + }, + { + "epoch": 0.046079220706752654, + "grad_norm": 0.9609375, + "learning_rate": 4.611525104526696e-05, + "loss": 0.8725, + "step": 1051 + }, + { + "epoch": 0.04612306392340989, + "grad_norm": 0.796875, + "learning_rate": 4.611137289584066e-05, + "loss": 0.763, + "step": 1052 + }, + { + "epoch": 0.04616690714006712, + "grad_norm": 0.78125, + "learning_rate": 4.610749486062824e-05, + "loss": 0.8638, + "step": 1053 + }, + { + "epoch": 0.04621075035672435, + "grad_norm": 0.78515625, + "learning_rate": 4.6103616939637894e-05, + "loss": 0.7458, + "step": 1054 + }, + { + "epoch": 0.04625459357338159, + "grad_norm": 0.83984375, + "learning_rate": 4.609973913287786e-05, + "loss": 0.7032, + "step": 1055 + }, + { + "epoch": 0.046298436790038824, + "grad_norm": 0.84765625, + "learning_rate": 4.6095861440356336e-05, + "loss": 0.8017, + "step": 1056 + }, + { + "epoch": 0.04634228000669605, + "grad_norm": 0.83203125, + "learning_rate": 4.6091983862081564e-05, + "loss": 0.8471, + "step": 1057 + }, + { + "epoch": 0.04638612322335329, + "grad_norm": 1.03125, + "learning_rate": 4.6088106398061704e-05, + "loss": 0.8841, + "step": 1058 + }, + { + "epoch": 0.046429966440010524, + "grad_norm": 0.73828125, + "learning_rate": 4.608422904830507e-05, + "loss": 0.7957, + "step": 1059 + }, + { + "epoch": 0.04647380965666775, + "grad_norm": 0.89453125, + "learning_rate": 4.608035181281982e-05, + "loss": 0.6762, + "step": 1060 + }, + { + "epoch": 0.04651765287332499, + "grad_norm": 0.79296875, + "learning_rate": 4.607647469161419e-05, + "loss": 0.7609, + "step": 1061 + }, + { + "epoch": 0.046561496089982224, + "grad_norm": 0.921875, + "learning_rate": 4.607259768469638e-05, + "loss": 0.8598, + "step": 1062 + }, + { + "epoch": 0.04660533930663946, + "grad_norm": 0.74609375, + "learning_rate": 4.606872079207463e-05, + "loss": 0.8928, + "step": 1063 + }, + { + "epoch": 0.04664918252329669, + "grad_norm": 0.9140625, + "learning_rate": 4.606484401375714e-05, + "loss": 0.8429, + "step": 1064 + }, + { + "epoch": 0.046693025739953924, + "grad_norm": 0.7890625, + "learning_rate": 4.606096734975208e-05, + "loss": 0.7748, + "step": 1065 + }, + { + "epoch": 0.04673686895661116, + "grad_norm": 0.7890625, + "learning_rate": 4.605709080006777e-05, + "loss": 0.7011, + "step": 1066 + }, + { + "epoch": 0.046780712173268395, + "grad_norm": 0.88671875, + "learning_rate": 4.605321436471236e-05, + "loss": 0.8804, + "step": 1067 + }, + { + "epoch": 0.04682455538992562, + "grad_norm": 0.85546875, + "learning_rate": 4.604933804369409e-05, + "loss": 0.8158, + "step": 1068 + }, + { + "epoch": 0.04686839860658286, + "grad_norm": 0.7421875, + "learning_rate": 4.6045461837021165e-05, + "loss": 0.7575, + "step": 1069 + }, + { + "epoch": 0.046912241823240095, + "grad_norm": 0.83984375, + "learning_rate": 4.6041585744701754e-05, + "loss": 0.7643, + "step": 1070 + }, + { + "epoch": 0.04695608503989733, + "grad_norm": 0.90625, + "learning_rate": 4.603770976674415e-05, + "loss": 0.8647, + "step": 1071 + }, + { + "epoch": 0.04699992825655456, + "grad_norm": 0.83203125, + "learning_rate": 4.6033833903156554e-05, + "loss": 0.8291, + "step": 1072 + }, + { + "epoch": 0.047043771473211794, + "grad_norm": 1.046875, + "learning_rate": 4.602995815394715e-05, + "loss": 0.7712, + "step": 1073 + }, + { + "epoch": 0.04708761468986903, + "grad_norm": 0.87109375, + "learning_rate": 4.602608251912417e-05, + "loss": 0.957, + "step": 1074 + }, + { + "epoch": 0.047131457906526265, + "grad_norm": 0.90234375, + "learning_rate": 4.602220699869578e-05, + "loss": 0.8799, + "step": 1075 + }, + { + "epoch": 0.047175301123183494, + "grad_norm": 0.84375, + "learning_rate": 4.6018331592670286e-05, + "loss": 0.8285, + "step": 1076 + }, + { + "epoch": 0.04721914433984073, + "grad_norm": 0.81640625, + "learning_rate": 4.601445630105584e-05, + "loss": 0.8469, + "step": 1077 + }, + { + "epoch": 0.047262987556497965, + "grad_norm": 0.859375, + "learning_rate": 4.601058112386068e-05, + "loss": 0.9721, + "step": 1078 + }, + { + "epoch": 0.047306830773155194, + "grad_norm": 0.78125, + "learning_rate": 4.600670606109299e-05, + "loss": 0.7206, + "step": 1079 + }, + { + "epoch": 0.04735067398981243, + "grad_norm": 0.859375, + "learning_rate": 4.600283111276097e-05, + "loss": 0.8832, + "step": 1080 + }, + { + "epoch": 0.047394517206469665, + "grad_norm": 1.0234375, + "learning_rate": 4.5998956278872903e-05, + "loss": 0.7888, + "step": 1081 + }, + { + "epoch": 0.0474383604231269, + "grad_norm": 0.8671875, + "learning_rate": 4.599508155943697e-05, + "loss": 1.0278, + "step": 1082 + }, + { + "epoch": 0.04748220363978413, + "grad_norm": 0.8671875, + "learning_rate": 4.599120695446136e-05, + "loss": 0.9286, + "step": 1083 + }, + { + "epoch": 0.047526046856441365, + "grad_norm": 0.859375, + "learning_rate": 4.5987332463954315e-05, + "loss": 0.8182, + "step": 1084 + }, + { + "epoch": 0.0475698900730986, + "grad_norm": 0.8203125, + "learning_rate": 4.598345808792398e-05, + "loss": 0.7936, + "step": 1085 + }, + { + "epoch": 0.047613733289755836, + "grad_norm": 0.75390625, + "learning_rate": 4.597958382637866e-05, + "loss": 0.7836, + "step": 1086 + }, + { + "epoch": 0.047657576506413064, + "grad_norm": 0.83984375, + "learning_rate": 4.597570967932653e-05, + "loss": 0.7276, + "step": 1087 + }, + { + "epoch": 0.0477014197230703, + "grad_norm": 0.83203125, + "learning_rate": 4.597183564677579e-05, + "loss": 0.754, + "step": 1088 + }, + { + "epoch": 0.047745262939727535, + "grad_norm": 0.8046875, + "learning_rate": 4.5967961728734656e-05, + "loss": 0.9552, + "step": 1089 + }, + { + "epoch": 0.04778910615638477, + "grad_norm": 0.78515625, + "learning_rate": 4.596408792521131e-05, + "loss": 0.8752, + "step": 1090 + }, + { + "epoch": 0.047832949373042, + "grad_norm": 0.79296875, + "learning_rate": 4.5960214236214015e-05, + "loss": 0.7695, + "step": 1091 + }, + { + "epoch": 0.047876792589699235, + "grad_norm": 0.796875, + "learning_rate": 4.595634066175096e-05, + "loss": 0.82, + "step": 1092 + }, + { + "epoch": 0.04792063580635647, + "grad_norm": 0.76953125, + "learning_rate": 4.595246720183036e-05, + "loss": 0.8457, + "step": 1093 + }, + { + "epoch": 0.047964479023013706, + "grad_norm": 0.76171875, + "learning_rate": 4.594859385646042e-05, + "loss": 0.7993, + "step": 1094 + }, + { + "epoch": 0.048008322239670935, + "grad_norm": 1.015625, + "learning_rate": 4.5944720625649294e-05, + "loss": 0.8387, + "step": 1095 + }, + { + "epoch": 0.04805216545632817, + "grad_norm": 0.83203125, + "learning_rate": 4.59408475094053e-05, + "loss": 0.8708, + "step": 1096 + }, + { + "epoch": 0.048096008672985406, + "grad_norm": 0.80078125, + "learning_rate": 4.5936974507736566e-05, + "loss": 0.815, + "step": 1097 + }, + { + "epoch": 0.048139851889642635, + "grad_norm": 0.86328125, + "learning_rate": 4.593310162065134e-05, + "loss": 0.8596, + "step": 1098 + }, + { + "epoch": 0.04818369510629987, + "grad_norm": 0.765625, + "learning_rate": 4.592922884815781e-05, + "loss": 0.8311, + "step": 1099 + }, + { + "epoch": 0.048227538322957106, + "grad_norm": 0.7734375, + "learning_rate": 4.592535619026419e-05, + "loss": 0.8844, + "step": 1100 + }, + { + "epoch": 0.04827138153961434, + "grad_norm": 0.9140625, + "learning_rate": 4.592148364697868e-05, + "loss": 0.86, + "step": 1101 + }, + { + "epoch": 0.04831522475627157, + "grad_norm": 0.8359375, + "learning_rate": 4.591761121830951e-05, + "loss": 0.7207, + "step": 1102 + }, + { + "epoch": 0.048359067972928806, + "grad_norm": 0.74609375, + "learning_rate": 4.591373890426486e-05, + "loss": 0.8873, + "step": 1103 + }, + { + "epoch": 0.04840291118958604, + "grad_norm": 0.7265625, + "learning_rate": 4.5909866704852944e-05, + "loss": 0.7572, + "step": 1104 + }, + { + "epoch": 0.04844675440624328, + "grad_norm": 0.8203125, + "learning_rate": 4.590599462008195e-05, + "loss": 0.9982, + "step": 1105 + }, + { + "epoch": 0.048490597622900505, + "grad_norm": 0.80078125, + "learning_rate": 4.5902122649960136e-05, + "loss": 0.7975, + "step": 1106 + }, + { + "epoch": 0.04853444083955774, + "grad_norm": 0.80859375, + "learning_rate": 4.589825079449569e-05, + "loss": 0.7698, + "step": 1107 + }, + { + "epoch": 0.048578284056214976, + "grad_norm": 0.79296875, + "learning_rate": 4.58943790536968e-05, + "loss": 0.897, + "step": 1108 + }, + { + "epoch": 0.04862212727287221, + "grad_norm": 0.83203125, + "learning_rate": 4.5890507427571684e-05, + "loss": 0.8796, + "step": 1109 + }, + { + "epoch": 0.04866597048952944, + "grad_norm": 0.80078125, + "learning_rate": 4.5886635916128506e-05, + "loss": 0.8038, + "step": 1110 + }, + { + "epoch": 0.048709813706186676, + "grad_norm": 0.74609375, + "learning_rate": 4.5882764519375554e-05, + "loss": 0.8994, + "step": 1111 + }, + { + "epoch": 0.04875365692284391, + "grad_norm": 0.8359375, + "learning_rate": 4.5878893237320986e-05, + "loss": 0.768, + "step": 1112 + }, + { + "epoch": 0.04879750013950115, + "grad_norm": 0.84375, + "learning_rate": 4.5875022069973e-05, + "loss": 0.8768, + "step": 1113 + }, + { + "epoch": 0.048841343356158376, + "grad_norm": 0.921875, + "learning_rate": 4.587115101733983e-05, + "loss": 0.949, + "step": 1114 + }, + { + "epoch": 0.04888518657281561, + "grad_norm": 0.88671875, + "learning_rate": 4.586728007942962e-05, + "loss": 0.9717, + "step": 1115 + }, + { + "epoch": 0.04892902978947285, + "grad_norm": 0.87890625, + "learning_rate": 4.5863409256250645e-05, + "loss": 0.8997, + "step": 1116 + }, + { + "epoch": 0.048972873006130076, + "grad_norm": 0.98046875, + "learning_rate": 4.585953854781108e-05, + "loss": 0.8483, + "step": 1117 + }, + { + "epoch": 0.04901671622278731, + "grad_norm": 0.80859375, + "learning_rate": 4.5855667954119133e-05, + "loss": 0.7998, + "step": 1118 + }, + { + "epoch": 0.04906055943944455, + "grad_norm": 0.8828125, + "learning_rate": 4.5851797475183e-05, + "loss": 0.9554, + "step": 1119 + }, + { + "epoch": 0.04910440265610178, + "grad_norm": 0.83203125, + "learning_rate": 4.5847927111010856e-05, + "loss": 0.7895, + "step": 1120 + }, + { + "epoch": 0.04914824587275901, + "grad_norm": 0.76953125, + "learning_rate": 4.5844056861610964e-05, + "loss": 0.7728, + "step": 1121 + }, + { + "epoch": 0.04919208908941625, + "grad_norm": 0.83984375, + "learning_rate": 4.584018672699151e-05, + "loss": 0.779, + "step": 1122 + }, + { + "epoch": 0.04923593230607348, + "grad_norm": 0.90234375, + "learning_rate": 4.5836316707160666e-05, + "loss": 0.834, + "step": 1123 + }, + { + "epoch": 0.04927977552273072, + "grad_norm": 0.859375, + "learning_rate": 4.583244680212667e-05, + "loss": 0.9463, + "step": 1124 + }, + { + "epoch": 0.049323618739387946, + "grad_norm": 0.7890625, + "learning_rate": 4.5828577011897656e-05, + "loss": 0.8716, + "step": 1125 + }, + { + "epoch": 0.04936746195604518, + "grad_norm": 0.88671875, + "learning_rate": 4.582470733648193e-05, + "loss": 0.8129, + "step": 1126 + }, + { + "epoch": 0.04941130517270242, + "grad_norm": 0.8515625, + "learning_rate": 4.582083777588763e-05, + "loss": 0.9867, + "step": 1127 + }, + { + "epoch": 0.04945514838935965, + "grad_norm": 0.82421875, + "learning_rate": 4.5816968330122964e-05, + "loss": 0.7837, + "step": 1128 + }, + { + "epoch": 0.04949899160601688, + "grad_norm": 0.76953125, + "learning_rate": 4.5813098999196146e-05, + "loss": 0.7415, + "step": 1129 + }, + { + "epoch": 0.04954283482267412, + "grad_norm": 0.70703125, + "learning_rate": 4.580922978311536e-05, + "loss": 0.7356, + "step": 1130 + }, + { + "epoch": 0.04958667803933135, + "grad_norm": 0.87890625, + "learning_rate": 4.580536068188879e-05, + "loss": 0.8908, + "step": 1131 + }, + { + "epoch": 0.04963052125598858, + "grad_norm": 0.76953125, + "learning_rate": 4.580149169552468e-05, + "loss": 0.8895, + "step": 1132 + }, + { + "epoch": 0.04967436447264582, + "grad_norm": 0.984375, + "learning_rate": 4.5797622824031216e-05, + "loss": 0.8188, + "step": 1133 + }, + { + "epoch": 0.04971820768930305, + "grad_norm": 0.765625, + "learning_rate": 4.5793754067416595e-05, + "loss": 0.8485, + "step": 1134 + }, + { + "epoch": 0.04976205090596029, + "grad_norm": 0.8359375, + "learning_rate": 4.578988542568902e-05, + "loss": 0.9724, + "step": 1135 + }, + { + "epoch": 0.04980589412261752, + "grad_norm": 0.91796875, + "learning_rate": 4.578601689885664e-05, + "loss": 1.081, + "step": 1136 + }, + { + "epoch": 0.04984973733927475, + "grad_norm": 0.83203125, + "learning_rate": 4.578214848692773e-05, + "loss": 0.7413, + "step": 1137 + }, + { + "epoch": 0.04989358055593199, + "grad_norm": 0.87109375, + "learning_rate": 4.577828018991047e-05, + "loss": 0.8483, + "step": 1138 + }, + { + "epoch": 0.04993742377258922, + "grad_norm": 0.7890625, + "learning_rate": 4.577441200781304e-05, + "loss": 0.8057, + "step": 1139 + }, + { + "epoch": 0.04998126698924645, + "grad_norm": 0.89453125, + "learning_rate": 4.577054394064364e-05, + "loss": 0.8105, + "step": 1140 + }, + { + "epoch": 0.05002511020590369, + "grad_norm": 0.8515625, + "learning_rate": 4.576667598841048e-05, + "loss": 0.9416, + "step": 1141 + }, + { + "epoch": 0.05006895342256092, + "grad_norm": 0.8203125, + "learning_rate": 4.576280815112175e-05, + "loss": 0.7359, + "step": 1142 + }, + { + "epoch": 0.05011279663921816, + "grad_norm": 0.81640625, + "learning_rate": 4.5758940428785645e-05, + "loss": 0.6923, + "step": 1143 + }, + { + "epoch": 0.05015663985587539, + "grad_norm": 0.84765625, + "learning_rate": 4.5755072821410324e-05, + "loss": 0.8923, + "step": 1144 + }, + { + "epoch": 0.05020048307253262, + "grad_norm": 0.80078125, + "learning_rate": 4.575120532900407e-05, + "loss": 0.8991, + "step": 1145 + }, + { + "epoch": 0.05024432628918986, + "grad_norm": 0.88671875, + "learning_rate": 4.574733795157503e-05, + "loss": 0.8252, + "step": 1146 + }, + { + "epoch": 0.050288169505847094, + "grad_norm": 0.82421875, + "learning_rate": 4.574347068913141e-05, + "loss": 0.6902, + "step": 1147 + }, + { + "epoch": 0.05033201272250432, + "grad_norm": 0.796875, + "learning_rate": 4.573960354168141e-05, + "loss": 0.7897, + "step": 1148 + }, + { + "epoch": 0.05037585593916156, + "grad_norm": 0.7734375, + "learning_rate": 4.573573650923321e-05, + "loss": 0.8198, + "step": 1149 + }, + { + "epoch": 0.050419699155818794, + "grad_norm": 0.8984375, + "learning_rate": 4.573186959179501e-05, + "loss": 0.9041, + "step": 1150 + }, + { + "epoch": 0.05046354237247602, + "grad_norm": 0.8828125, + "learning_rate": 4.572800278937498e-05, + "loss": 0.7838, + "step": 1151 + }, + { + "epoch": 0.05050738558913326, + "grad_norm": 0.82421875, + "learning_rate": 4.572413610198139e-05, + "loss": 0.895, + "step": 1152 + }, + { + "epoch": 0.050551228805790493, + "grad_norm": 0.7421875, + "learning_rate": 4.572026952962237e-05, + "loss": 0.8161, + "step": 1153 + }, + { + "epoch": 0.05059507202244773, + "grad_norm": 1.2890625, + "learning_rate": 4.571640307230616e-05, + "loss": 0.7727, + "step": 1154 + }, + { + "epoch": 0.05063891523910496, + "grad_norm": 0.78125, + "learning_rate": 4.5712536730040924e-05, + "loss": 0.8586, + "step": 1155 + }, + { + "epoch": 0.05068275845576219, + "grad_norm": 0.85546875, + "learning_rate": 4.570867050283482e-05, + "loss": 0.8559, + "step": 1156 + }, + { + "epoch": 0.05072660167241943, + "grad_norm": 0.90625, + "learning_rate": 4.570480439069612e-05, + "loss": 0.8958, + "step": 1157 + }, + { + "epoch": 0.050770444889076664, + "grad_norm": 0.81640625, + "learning_rate": 4.5700938393632994e-05, + "loss": 0.8177, + "step": 1158 + }, + { + "epoch": 0.05081428810573389, + "grad_norm": 0.7734375, + "learning_rate": 4.5697072511653616e-05, + "loss": 0.8167, + "step": 1159 + }, + { + "epoch": 0.05085813132239113, + "grad_norm": 0.8515625, + "learning_rate": 4.56932067447662e-05, + "loss": 0.8528, + "step": 1160 + }, + { + "epoch": 0.050901974539048364, + "grad_norm": 0.984375, + "learning_rate": 4.568934109297889e-05, + "loss": 0.8604, + "step": 1161 + }, + { + "epoch": 0.0509458177557056, + "grad_norm": 0.74609375, + "learning_rate": 4.568547555629994e-05, + "loss": 0.8288, + "step": 1162 + }, + { + "epoch": 0.05098966097236283, + "grad_norm": 0.734375, + "learning_rate": 4.568161013473753e-05, + "loss": 0.7333, + "step": 1163 + }, + { + "epoch": 0.051033504189020064, + "grad_norm": 0.859375, + "learning_rate": 4.567774482829985e-05, + "loss": 0.8911, + "step": 1164 + }, + { + "epoch": 0.0510773474056773, + "grad_norm": 0.81640625, + "learning_rate": 4.5673879636995074e-05, + "loss": 0.7628, + "step": 1165 + }, + { + "epoch": 0.051121190622334535, + "grad_norm": 1.0390625, + "learning_rate": 4.5670014560831376e-05, + "loss": 0.9663, + "step": 1166 + }, + { + "epoch": 0.051165033838991764, + "grad_norm": 0.85546875, + "learning_rate": 4.566614959981701e-05, + "loss": 0.8366, + "step": 1167 + }, + { + "epoch": 0.051208877055649, + "grad_norm": 0.97265625, + "learning_rate": 4.566228475396013e-05, + "loss": 0.7582, + "step": 1168 + }, + { + "epoch": 0.051252720272306235, + "grad_norm": 0.79296875, + "learning_rate": 4.5658420023268935e-05, + "loss": 0.7674, + "step": 1169 + }, + { + "epoch": 0.05129656348896346, + "grad_norm": 0.875, + "learning_rate": 4.565455540775161e-05, + "loss": 0.8406, + "step": 1170 + }, + { + "epoch": 0.0513404067056207, + "grad_norm": 0.80078125, + "learning_rate": 4.565069090741631e-05, + "loss": 0.8153, + "step": 1171 + }, + { + "epoch": 0.051384249922277934, + "grad_norm": 2.28125, + "learning_rate": 4.5646826522271316e-05, + "loss": 0.8501, + "step": 1172 + }, + { + "epoch": 0.05142809313893517, + "grad_norm": 0.79296875, + "learning_rate": 4.564296225232475e-05, + "loss": 0.7756, + "step": 1173 + }, + { + "epoch": 0.0514719363555924, + "grad_norm": 0.86328125, + "learning_rate": 4.563909809758483e-05, + "loss": 0.8962, + "step": 1174 + }, + { + "epoch": 0.051515779572249634, + "grad_norm": 0.78125, + "learning_rate": 4.563523405805973e-05, + "loss": 0.7654, + "step": 1175 + }, + { + "epoch": 0.05155962278890687, + "grad_norm": 0.96484375, + "learning_rate": 4.56313701337576e-05, + "loss": 0.7774, + "step": 1176 + }, + { + "epoch": 0.051603466005564105, + "grad_norm": 0.890625, + "learning_rate": 4.562750632468672e-05, + "loss": 0.8476, + "step": 1177 + }, + { + "epoch": 0.051647309222221334, + "grad_norm": 0.90625, + "learning_rate": 4.5623642630855234e-05, + "loss": 0.9772, + "step": 1178 + }, + { + "epoch": 0.05169115243887857, + "grad_norm": 0.71484375, + "learning_rate": 4.561977905227133e-05, + "loss": 0.8019, + "step": 1179 + }, + { + "epoch": 0.051734995655535805, + "grad_norm": 0.85546875, + "learning_rate": 4.561591558894318e-05, + "loss": 0.9235, + "step": 1180 + }, + { + "epoch": 0.05177883887219304, + "grad_norm": 0.82421875, + "learning_rate": 4.5612052240878964e-05, + "loss": 0.8249, + "step": 1181 + }, + { + "epoch": 0.05182268208885027, + "grad_norm": 0.8046875, + "learning_rate": 4.560818900808693e-05, + "loss": 0.8916, + "step": 1182 + }, + { + "epoch": 0.051866525305507505, + "grad_norm": 0.875, + "learning_rate": 4.560432589057524e-05, + "loss": 0.8751, + "step": 1183 + }, + { + "epoch": 0.05191036852216474, + "grad_norm": 0.8828125, + "learning_rate": 4.5600462888352056e-05, + "loss": 0.7766, + "step": 1184 + }, + { + "epoch": 0.051954211738821976, + "grad_norm": 0.84765625, + "learning_rate": 4.5596600001425594e-05, + "loss": 0.7033, + "step": 1185 + }, + { + "epoch": 0.051998054955479205, + "grad_norm": 0.82421875, + "learning_rate": 4.559273722980402e-05, + "loss": 0.8538, + "step": 1186 + }, + { + "epoch": 0.05204189817213644, + "grad_norm": 0.83203125, + "learning_rate": 4.558887457349553e-05, + "loss": 0.7781, + "step": 1187 + }, + { + "epoch": 0.052085741388793676, + "grad_norm": 0.83203125, + "learning_rate": 4.558501203250831e-05, + "loss": 0.886, + "step": 1188 + }, + { + "epoch": 0.052129584605450904, + "grad_norm": 0.81640625, + "learning_rate": 4.558114960685056e-05, + "loss": 0.8886, + "step": 1189 + }, + { + "epoch": 0.05217342782210814, + "grad_norm": 0.79296875, + "learning_rate": 4.557728729653044e-05, + "loss": 0.7082, + "step": 1190 + }, + { + "epoch": 0.052217271038765375, + "grad_norm": 0.7734375, + "learning_rate": 4.557342510155611e-05, + "loss": 0.7879, + "step": 1191 + }, + { + "epoch": 0.05226111425542261, + "grad_norm": 0.90234375, + "learning_rate": 4.556956302193583e-05, + "loss": 0.8523, + "step": 1192 + }, + { + "epoch": 0.05230495747207984, + "grad_norm": 0.85546875, + "learning_rate": 4.5565701057677766e-05, + "loss": 0.8141, + "step": 1193 + }, + { + "epoch": 0.052348800688737075, + "grad_norm": 0.7890625, + "learning_rate": 4.556183920879008e-05, + "loss": 0.9231, + "step": 1194 + }, + { + "epoch": 0.05239264390539431, + "grad_norm": 0.890625, + "learning_rate": 4.555797747528096e-05, + "loss": 0.843, + "step": 1195 + }, + { + "epoch": 0.052436487122051546, + "grad_norm": 0.87109375, + "learning_rate": 4.555411585715856e-05, + "loss": 0.7924, + "step": 1196 + }, + { + "epoch": 0.052480330338708775, + "grad_norm": 0.90625, + "learning_rate": 4.555025435443113e-05, + "loss": 0.864, + "step": 1197 + }, + { + "epoch": 0.05252417355536601, + "grad_norm": 0.8359375, + "learning_rate": 4.554639296710683e-05, + "loss": 0.7809, + "step": 1198 + }, + { + "epoch": 0.052568016772023246, + "grad_norm": 0.828125, + "learning_rate": 4.5542531695193846e-05, + "loss": 0.6889, + "step": 1199 + }, + { + "epoch": 0.05261185998868048, + "grad_norm": 0.9765625, + "learning_rate": 4.553867053870035e-05, + "loss": 0.8929, + "step": 1200 + }, + { + "epoch": 0.05265570320533771, + "grad_norm": 0.828125, + "learning_rate": 4.553480949763448e-05, + "loss": 0.7413, + "step": 1201 + }, + { + "epoch": 0.052699546421994946, + "grad_norm": 0.8125, + "learning_rate": 4.553094857200452e-05, + "loss": 0.9329, + "step": 1202 + }, + { + "epoch": 0.05274338963865218, + "grad_norm": 0.83984375, + "learning_rate": 4.5527087761818584e-05, + "loss": 0.8303, + "step": 1203 + }, + { + "epoch": 0.05278723285530942, + "grad_norm": 0.79296875, + "learning_rate": 4.5523227067084886e-05, + "loss": 0.8289, + "step": 1204 + }, + { + "epoch": 0.052831076071966646, + "grad_norm": 0.83203125, + "learning_rate": 4.551936648781159e-05, + "loss": 0.9212, + "step": 1205 + }, + { + "epoch": 0.05287491928862388, + "grad_norm": 0.84765625, + "learning_rate": 4.551550602400685e-05, + "loss": 0.8174, + "step": 1206 + }, + { + "epoch": 0.05291876250528112, + "grad_norm": 0.8828125, + "learning_rate": 4.551164567567892e-05, + "loss": 0.8661, + "step": 1207 + }, + { + "epoch": 0.052962605721938345, + "grad_norm": 0.859375, + "learning_rate": 4.5507785442835924e-05, + "loss": 0.9062, + "step": 1208 + }, + { + "epoch": 0.05300644893859558, + "grad_norm": 0.8515625, + "learning_rate": 4.5503925325486076e-05, + "loss": 0.7938, + "step": 1209 + }, + { + "epoch": 0.053050292155252816, + "grad_norm": 0.9765625, + "learning_rate": 4.550006532363754e-05, + "loss": 0.7767, + "step": 1210 + }, + { + "epoch": 0.05309413537191005, + "grad_norm": 0.78515625, + "learning_rate": 4.5496205437298466e-05, + "loss": 0.8682, + "step": 1211 + }, + { + "epoch": 0.05313797858856728, + "grad_norm": 0.87109375, + "learning_rate": 4.549234566647711e-05, + "loss": 0.9065, + "step": 1212 + }, + { + "epoch": 0.053181821805224516, + "grad_norm": 0.83984375, + "learning_rate": 4.5488486011181606e-05, + "loss": 0.7861, + "step": 1213 + }, + { + "epoch": 0.05322566502188175, + "grad_norm": 1.0703125, + "learning_rate": 4.5484626471420143e-05, + "loss": 0.7622, + "step": 1214 + }, + { + "epoch": 0.05326950823853899, + "grad_norm": 0.7421875, + "learning_rate": 4.54807670472009e-05, + "loss": 0.694, + "step": 1215 + }, + { + "epoch": 0.053313351455196216, + "grad_norm": 0.76171875, + "learning_rate": 4.5476907738532015e-05, + "loss": 0.6978, + "step": 1216 + }, + { + "epoch": 0.05335719467185345, + "grad_norm": 0.82421875, + "learning_rate": 4.5473048545421734e-05, + "loss": 0.8662, + "step": 1217 + }, + { + "epoch": 0.05340103788851069, + "grad_norm": 0.828125, + "learning_rate": 4.546918946787822e-05, + "loss": 0.7203, + "step": 1218 + }, + { + "epoch": 0.05344488110516792, + "grad_norm": 0.84375, + "learning_rate": 4.5465330505909654e-05, + "loss": 0.7626, + "step": 1219 + }, + { + "epoch": 0.05348872432182515, + "grad_norm": 0.7109375, + "learning_rate": 4.546147165952419e-05, + "loss": 0.7111, + "step": 1220 + }, + { + "epoch": 0.05353256753848239, + "grad_norm": 0.83984375, + "learning_rate": 4.545761292872999e-05, + "loss": 0.7834, + "step": 1221 + }, + { + "epoch": 0.05357641075513962, + "grad_norm": 0.86328125, + "learning_rate": 4.545375431353529e-05, + "loss": 0.8242, + "step": 1222 + }, + { + "epoch": 0.05362025397179685, + "grad_norm": 0.95703125, + "learning_rate": 4.544989581394824e-05, + "loss": 0.8792, + "step": 1223 + }, + { + "epoch": 0.05366409718845409, + "grad_norm": 0.9140625, + "learning_rate": 4.544603742997702e-05, + "loss": 0.8882, + "step": 1224 + }, + { + "epoch": 0.05370794040511132, + "grad_norm": 0.8671875, + "learning_rate": 4.54421791616298e-05, + "loss": 0.8178, + "step": 1225 + }, + { + "epoch": 0.05375178362176856, + "grad_norm": 0.90234375, + "learning_rate": 4.543832100891478e-05, + "loss": 0.9092, + "step": 1226 + }, + { + "epoch": 0.053795626838425786, + "grad_norm": 0.69140625, + "learning_rate": 4.543446297184011e-05, + "loss": 0.7293, + "step": 1227 + }, + { + "epoch": 0.05383947005508302, + "grad_norm": 0.828125, + "learning_rate": 4.543060505041398e-05, + "loss": 0.7611, + "step": 1228 + }, + { + "epoch": 0.05388331327174026, + "grad_norm": 0.77734375, + "learning_rate": 4.542674724464452e-05, + "loss": 0.8311, + "step": 1229 + }, + { + "epoch": 0.05392715648839749, + "grad_norm": 0.8046875, + "learning_rate": 4.542288955453998e-05, + "loss": 0.8571, + "step": 1230 + }, + { + "epoch": 0.05397099970505472, + "grad_norm": 0.85546875, + "learning_rate": 4.5419031980108516e-05, + "loss": 0.8474, + "step": 1231 + }, + { + "epoch": 0.05401484292171196, + "grad_norm": 0.9375, + "learning_rate": 4.541517452135831e-05, + "loss": 0.9589, + "step": 1232 + }, + { + "epoch": 0.05405868613836919, + "grad_norm": 1.0390625, + "learning_rate": 4.54113171782975e-05, + "loss": 0.7971, + "step": 1233 + }, + { + "epoch": 0.05410252935502643, + "grad_norm": 0.82421875, + "learning_rate": 4.540745995093428e-05, + "loss": 0.7597, + "step": 1234 + }, + { + "epoch": 0.05414637257168366, + "grad_norm": 0.8359375, + "learning_rate": 4.540360283927685e-05, + "loss": 0.9197, + "step": 1235 + }, + { + "epoch": 0.05419021578834089, + "grad_norm": 0.875, + "learning_rate": 4.5399745843333305e-05, + "loss": 0.807, + "step": 1236 + }, + { + "epoch": 0.05423405900499813, + "grad_norm": 0.83203125, + "learning_rate": 4.539588896311192e-05, + "loss": 0.7816, + "step": 1237 + }, + { + "epoch": 0.054277902221655364, + "grad_norm": 0.9375, + "learning_rate": 4.539203219862084e-05, + "loss": 0.8935, + "step": 1238 + }, + { + "epoch": 0.05432174543831259, + "grad_norm": 0.7578125, + "learning_rate": 4.5388175549868214e-05, + "loss": 0.895, + "step": 1239 + }, + { + "epoch": 0.05436558865496983, + "grad_norm": 0.8828125, + "learning_rate": 4.5384319016862234e-05, + "loss": 1.0534, + "step": 1240 + }, + { + "epoch": 0.05440943187162706, + "grad_norm": 0.765625, + "learning_rate": 4.538046259961103e-05, + "loss": 0.8316, + "step": 1241 + }, + { + "epoch": 0.05445327508828429, + "grad_norm": 1.5390625, + "learning_rate": 4.5376606298122846e-05, + "loss": 0.8445, + "step": 1242 + }, + { + "epoch": 0.05449711830494153, + "grad_norm": 0.8828125, + "learning_rate": 4.537275011240583e-05, + "loss": 0.82, + "step": 1243 + }, + { + "epoch": 0.05454096152159876, + "grad_norm": 0.984375, + "learning_rate": 4.5368894042468134e-05, + "loss": 0.8935, + "step": 1244 + }, + { + "epoch": 0.054584804738256, + "grad_norm": 0.74609375, + "learning_rate": 4.536503808831796e-05, + "loss": 0.8391, + "step": 1245 + }, + { + "epoch": 0.05462864795491323, + "grad_norm": 0.90234375, + "learning_rate": 4.536118224996342e-05, + "loss": 0.9314, + "step": 1246 + }, + { + "epoch": 0.05467249117157046, + "grad_norm": 1.03125, + "learning_rate": 4.535732652741277e-05, + "loss": 0.7785, + "step": 1247 + }, + { + "epoch": 0.0547163343882277, + "grad_norm": 0.8359375, + "learning_rate": 4.5353470920674144e-05, + "loss": 0.8557, + "step": 1248 + }, + { + "epoch": 0.054760177604884934, + "grad_norm": 0.8984375, + "learning_rate": 4.534961542975572e-05, + "loss": 0.8892, + "step": 1249 + }, + { + "epoch": 0.05480402082154216, + "grad_norm": 0.828125, + "learning_rate": 4.5345760054665656e-05, + "loss": 0.7968, + "step": 1250 + }, + { + "epoch": 0.0548478640381994, + "grad_norm": 0.93359375, + "learning_rate": 4.534190479541208e-05, + "loss": 0.8161, + "step": 1251 + }, + { + "epoch": 0.054891707254856634, + "grad_norm": 0.9296875, + "learning_rate": 4.5338049652003265e-05, + "loss": 0.9087, + "step": 1252 + }, + { + "epoch": 0.05493555047151387, + "grad_norm": 0.83984375, + "learning_rate": 4.533419462444733e-05, + "loss": 0.8589, + "step": 1253 + }, + { + "epoch": 0.0549793936881711, + "grad_norm": 0.796875, + "learning_rate": 4.5330339712752443e-05, + "loss": 0.786, + "step": 1254 + }, + { + "epoch": 0.05502323690482833, + "grad_norm": 0.84375, + "learning_rate": 4.532648491692677e-05, + "loss": 0.9127, + "step": 1255 + }, + { + "epoch": 0.05506708012148557, + "grad_norm": 0.78125, + "learning_rate": 4.532263023697846e-05, + "loss": 0.8288, + "step": 1256 + }, + { + "epoch": 0.055110923338142805, + "grad_norm": 0.8359375, + "learning_rate": 4.531877567291574e-05, + "loss": 1.0053, + "step": 1257 + }, + { + "epoch": 0.05515476655480003, + "grad_norm": 0.82421875, + "learning_rate": 4.5314921224746755e-05, + "loss": 0.8715, + "step": 1258 + }, + { + "epoch": 0.05519860977145727, + "grad_norm": 0.7578125, + "learning_rate": 4.531106689247966e-05, + "loss": 0.701, + "step": 1259 + }, + { + "epoch": 0.055242452988114504, + "grad_norm": 0.88671875, + "learning_rate": 4.5307212676122635e-05, + "loss": 0.9003, + "step": 1260 + }, + { + "epoch": 0.05528629620477173, + "grad_norm": 0.8984375, + "learning_rate": 4.530335857568382e-05, + "loss": 0.7927, + "step": 1261 + }, + { + "epoch": 0.05533013942142897, + "grad_norm": 0.97265625, + "learning_rate": 4.529950459117144e-05, + "loss": 0.8661, + "step": 1262 + }, + { + "epoch": 0.055373982638086204, + "grad_norm": 1.671875, + "learning_rate": 4.529565072259363e-05, + "loss": 0.7163, + "step": 1263 + }, + { + "epoch": 0.05541782585474344, + "grad_norm": 0.828125, + "learning_rate": 4.5291796969958564e-05, + "loss": 0.82, + "step": 1264 + }, + { + "epoch": 0.05546166907140067, + "grad_norm": 0.80859375, + "learning_rate": 4.5287943333274405e-05, + "loss": 0.8059, + "step": 1265 + }, + { + "epoch": 0.055505512288057904, + "grad_norm": 0.859375, + "learning_rate": 4.528408981254929e-05, + "loss": 0.8823, + "step": 1266 + }, + { + "epoch": 0.05554935550471514, + "grad_norm": 0.89453125, + "learning_rate": 4.5280236407791455e-05, + "loss": 0.8517, + "step": 1267 + }, + { + "epoch": 0.055593198721372375, + "grad_norm": 0.8203125, + "learning_rate": 4.5276383119009036e-05, + "loss": 0.7236, + "step": 1268 + }, + { + "epoch": 0.055637041938029604, + "grad_norm": 0.828125, + "learning_rate": 4.527252994621018e-05, + "loss": 0.8101, + "step": 1269 + }, + { + "epoch": 0.05568088515468684, + "grad_norm": 0.87109375, + "learning_rate": 4.5268676889403084e-05, + "loss": 0.8446, + "step": 1270 + }, + { + "epoch": 0.055724728371344075, + "grad_norm": 1.734375, + "learning_rate": 4.5264823948595894e-05, + "loss": 0.9017, + "step": 1271 + }, + { + "epoch": 0.05576857158800131, + "grad_norm": 0.78515625, + "learning_rate": 4.526097112379678e-05, + "loss": 0.7934, + "step": 1272 + }, + { + "epoch": 0.05581241480465854, + "grad_norm": 0.83203125, + "learning_rate": 4.525711841501391e-05, + "loss": 0.8864, + "step": 1273 + }, + { + "epoch": 0.055856258021315774, + "grad_norm": 0.7734375, + "learning_rate": 4.525326582225545e-05, + "loss": 0.8173, + "step": 1274 + }, + { + "epoch": 0.05590010123797301, + "grad_norm": 1.03125, + "learning_rate": 4.524941334552957e-05, + "loss": 0.9113, + "step": 1275 + }, + { + "epoch": 0.055943944454630246, + "grad_norm": 0.74609375, + "learning_rate": 4.524556098484442e-05, + "loss": 0.7203, + "step": 1276 + }, + { + "epoch": 0.055987787671287474, + "grad_norm": 0.86328125, + "learning_rate": 4.524170874020813e-05, + "loss": 0.8829, + "step": 1277 + }, + { + "epoch": 0.05603163088794471, + "grad_norm": 0.90234375, + "learning_rate": 4.523785661162896e-05, + "loss": 0.8904, + "step": 1278 + }, + { + "epoch": 0.056075474104601945, + "grad_norm": 0.8359375, + "learning_rate": 4.523400459911502e-05, + "loss": 0.876, + "step": 1279 + }, + { + "epoch": 0.056119317321259174, + "grad_norm": 0.953125, + "learning_rate": 4.523015270267448e-05, + "loss": 0.9165, + "step": 1280 + }, + { + "epoch": 0.05616316053791641, + "grad_norm": 0.859375, + "learning_rate": 4.522630092231549e-05, + "loss": 0.8526, + "step": 1281 + }, + { + "epoch": 0.056207003754573645, + "grad_norm": 0.8984375, + "learning_rate": 4.5222449258046185e-05, + "loss": 0.9152, + "step": 1282 + }, + { + "epoch": 0.05625084697123088, + "grad_norm": 0.82421875, + "learning_rate": 4.521859770987482e-05, + "loss": 0.8357, + "step": 1283 + }, + { + "epoch": 0.05629469018788811, + "grad_norm": 0.76953125, + "learning_rate": 4.521474627780949e-05, + "loss": 0.7129, + "step": 1284 + }, + { + "epoch": 0.056338533404545345, + "grad_norm": 0.86328125, + "learning_rate": 4.521089496185839e-05, + "loss": 0.8133, + "step": 1285 + }, + { + "epoch": 0.05638237662120258, + "grad_norm": 0.97265625, + "learning_rate": 4.5207043762029654e-05, + "loss": 0.9086, + "step": 1286 + }, + { + "epoch": 0.056426219837859816, + "grad_norm": 0.79296875, + "learning_rate": 4.520319267833143e-05, + "loss": 0.8429, + "step": 1287 + }, + { + "epoch": 0.056470063054517045, + "grad_norm": 0.89453125, + "learning_rate": 4.5199341710771934e-05, + "loss": 0.831, + "step": 1288 + }, + { + "epoch": 0.05651390627117428, + "grad_norm": 0.8046875, + "learning_rate": 4.5195490859359313e-05, + "loss": 0.7688, + "step": 1289 + }, + { + "epoch": 0.056557749487831516, + "grad_norm": 0.8515625, + "learning_rate": 4.519164012410171e-05, + "loss": 0.8008, + "step": 1290 + }, + { + "epoch": 0.05660159270448875, + "grad_norm": 0.7578125, + "learning_rate": 4.518778950500728e-05, + "loss": 0.7794, + "step": 1291 + }, + { + "epoch": 0.05664543592114598, + "grad_norm": 0.78515625, + "learning_rate": 4.5183939002084175e-05, + "loss": 0.7733, + "step": 1292 + }, + { + "epoch": 0.056689279137803215, + "grad_norm": 0.8828125, + "learning_rate": 4.518008861534062e-05, + "loss": 0.8153, + "step": 1293 + }, + { + "epoch": 0.05673312235446045, + "grad_norm": 0.78515625, + "learning_rate": 4.517623834478473e-05, + "loss": 0.8143, + "step": 1294 + }, + { + "epoch": 0.05677696557111769, + "grad_norm": 0.88671875, + "learning_rate": 4.5172388190424664e-05, + "loss": 0.8371, + "step": 1295 + }, + { + "epoch": 0.056820808787774915, + "grad_norm": 0.97265625, + "learning_rate": 4.516853815226859e-05, + "loss": 0.7863, + "step": 1296 + }, + { + "epoch": 0.05686465200443215, + "grad_norm": 0.89453125, + "learning_rate": 4.516468823032463e-05, + "loss": 0.8136, + "step": 1297 + }, + { + "epoch": 0.056908495221089386, + "grad_norm": 0.75390625, + "learning_rate": 4.5160838424601e-05, + "loss": 0.7596, + "step": 1298 + }, + { + "epoch": 0.056952338437746615, + "grad_norm": 0.84375, + "learning_rate": 4.5156988735105867e-05, + "loss": 0.829, + "step": 1299 + }, + { + "epoch": 0.05699618165440385, + "grad_norm": 0.859375, + "learning_rate": 4.5153139161847335e-05, + "loss": 0.8074, + "step": 1300 + }, + { + "epoch": 0.057040024871061086, + "grad_norm": 0.9296875, + "learning_rate": 4.5149289704833606e-05, + "loss": 0.8375, + "step": 1301 + }, + { + "epoch": 0.05708386808771832, + "grad_norm": 0.8359375, + "learning_rate": 4.514544036407278e-05, + "loss": 0.8993, + "step": 1302 + }, + { + "epoch": 0.05712771130437555, + "grad_norm": 0.765625, + "learning_rate": 4.5141591139573103e-05, + "loss": 0.7812, + "step": 1303 + }, + { + "epoch": 0.057171554521032786, + "grad_norm": 0.859375, + "learning_rate": 4.513774203134268e-05, + "loss": 0.8414, + "step": 1304 + }, + { + "epoch": 0.05721539773769002, + "grad_norm": 0.85546875, + "learning_rate": 4.5133893039389674e-05, + "loss": 0.8004, + "step": 1305 + }, + { + "epoch": 0.05725924095434726, + "grad_norm": 1.234375, + "learning_rate": 4.513004416372225e-05, + "loss": 0.8648, + "step": 1306 + }, + { + "epoch": 0.057303084171004486, + "grad_norm": 0.74609375, + "learning_rate": 4.5126195404348526e-05, + "loss": 0.6663, + "step": 1307 + }, + { + "epoch": 0.05734692738766172, + "grad_norm": 0.77734375, + "learning_rate": 4.512234676127672e-05, + "loss": 0.8432, + "step": 1308 + }, + { + "epoch": 0.05739077060431896, + "grad_norm": 0.8125, + "learning_rate": 4.511849823451498e-05, + "loss": 0.7584, + "step": 1309 + }, + { + "epoch": 0.05743461382097619, + "grad_norm": 0.82421875, + "learning_rate": 4.511464982407144e-05, + "loss": 0.7356, + "step": 1310 + }, + { + "epoch": 0.05747845703763342, + "grad_norm": 0.90234375, + "learning_rate": 4.511080152995426e-05, + "loss": 0.8614, + "step": 1311 + }, + { + "epoch": 0.057522300254290656, + "grad_norm": 0.8046875, + "learning_rate": 4.51069533521716e-05, + "loss": 0.8342, + "step": 1312 + }, + { + "epoch": 0.05756614347094789, + "grad_norm": 0.828125, + "learning_rate": 4.510310529073162e-05, + "loss": 0.726, + "step": 1313 + }, + { + "epoch": 0.05760998668760513, + "grad_norm": 0.953125, + "learning_rate": 4.5099257345642466e-05, + "loss": 0.9086, + "step": 1314 + }, + { + "epoch": 0.057653829904262356, + "grad_norm": 0.87109375, + "learning_rate": 4.509540951691226e-05, + "loss": 0.9207, + "step": 1315 + }, + { + "epoch": 0.05769767312091959, + "grad_norm": 0.83984375, + "learning_rate": 4.509156180454923e-05, + "loss": 0.8398, + "step": 1316 + }, + { + "epoch": 0.05774151633757683, + "grad_norm": 0.8828125, + "learning_rate": 4.508771420856149e-05, + "loss": 0.8774, + "step": 1317 + }, + { + "epoch": 0.057785359554234056, + "grad_norm": 0.84765625, + "learning_rate": 4.5083866728957216e-05, + "loss": 0.8029, + "step": 1318 + }, + { + "epoch": 0.05782920277089129, + "grad_norm": 0.8828125, + "learning_rate": 4.508001936574454e-05, + "loss": 0.856, + "step": 1319 + }, + { + "epoch": 0.05787304598754853, + "grad_norm": 0.86328125, + "learning_rate": 4.507617211893162e-05, + "loss": 0.864, + "step": 1320 + }, + { + "epoch": 0.05791688920420576, + "grad_norm": 0.8359375, + "learning_rate": 4.507232498852663e-05, + "loss": 0.7398, + "step": 1321 + }, + { + "epoch": 0.05796073242086299, + "grad_norm": 0.94140625, + "learning_rate": 4.506847797453766e-05, + "loss": 0.7979, + "step": 1322 + }, + { + "epoch": 0.05800457563752023, + "grad_norm": 0.87109375, + "learning_rate": 4.5064631076972954e-05, + "loss": 0.7656, + "step": 1323 + }, + { + "epoch": 0.05804841885417746, + "grad_norm": 0.87890625, + "learning_rate": 4.5060784295840606e-05, + "loss": 0.7778, + "step": 1324 + }, + { + "epoch": 0.0580922620708347, + "grad_norm": 0.85546875, + "learning_rate": 4.50569376311488e-05, + "loss": 0.8707, + "step": 1325 + }, + { + "epoch": 0.058136105287491927, + "grad_norm": 0.8359375, + "learning_rate": 4.505309108290567e-05, + "loss": 0.7962, + "step": 1326 + }, + { + "epoch": 0.05817994850414916, + "grad_norm": 0.82421875, + "learning_rate": 4.504924465111934e-05, + "loss": 0.8878, + "step": 1327 + }, + { + "epoch": 0.0582237917208064, + "grad_norm": 0.8203125, + "learning_rate": 4.504539833579803e-05, + "loss": 0.8568, + "step": 1328 + }, + { + "epoch": 0.05826763493746363, + "grad_norm": 0.84765625, + "learning_rate": 4.504155213694985e-05, + "loss": 0.8046, + "step": 1329 + }, + { + "epoch": 0.05831147815412086, + "grad_norm": 0.76171875, + "learning_rate": 4.503770605458297e-05, + "loss": 0.7749, + "step": 1330 + }, + { + "epoch": 0.0583553213707781, + "grad_norm": 0.80078125, + "learning_rate": 4.5033860088705525e-05, + "loss": 0.7608, + "step": 1331 + }, + { + "epoch": 0.05839916458743533, + "grad_norm": 0.859375, + "learning_rate": 4.5030014239325635e-05, + "loss": 0.6881, + "step": 1332 + }, + { + "epoch": 0.05844300780409256, + "grad_norm": 1.0078125, + "learning_rate": 4.502616850645153e-05, + "loss": 0.8779, + "step": 1333 + }, + { + "epoch": 0.0584868510207498, + "grad_norm": 0.765625, + "learning_rate": 4.502232289009131e-05, + "loss": 0.7628, + "step": 1334 + }, + { + "epoch": 0.05853069423740703, + "grad_norm": 0.94921875, + "learning_rate": 4.501847739025313e-05, + "loss": 1.1057, + "step": 1335 + }, + { + "epoch": 0.05857453745406427, + "grad_norm": 0.8046875, + "learning_rate": 4.501463200694516e-05, + "loss": 0.7822, + "step": 1336 + }, + { + "epoch": 0.0586183806707215, + "grad_norm": 0.77734375, + "learning_rate": 4.501078674017549e-05, + "loss": 0.7888, + "step": 1337 + }, + { + "epoch": 0.05866222388737873, + "grad_norm": 0.7578125, + "learning_rate": 4.500694158995236e-05, + "loss": 0.81, + "step": 1338 + }, + { + "epoch": 0.05870606710403597, + "grad_norm": 0.8671875, + "learning_rate": 4.5003096556283865e-05, + "loss": 1.0387, + "step": 1339 + }, + { + "epoch": 0.058749910320693204, + "grad_norm": 0.9765625, + "learning_rate": 4.4999251639178164e-05, + "loss": 0.7772, + "step": 1340 + }, + { + "epoch": 0.05879375353735043, + "grad_norm": 0.828125, + "learning_rate": 4.499540683864341e-05, + "loss": 0.7191, + "step": 1341 + }, + { + "epoch": 0.05883759675400767, + "grad_norm": 0.8046875, + "learning_rate": 4.499156215468769e-05, + "loss": 0.6756, + "step": 1342 + }, + { + "epoch": 0.0588814399706649, + "grad_norm": 0.8359375, + "learning_rate": 4.4987717587319265e-05, + "loss": 0.7051, + "step": 1343 + }, + { + "epoch": 0.05892528318732214, + "grad_norm": 0.7578125, + "learning_rate": 4.498387313654623e-05, + "loss": 0.9116, + "step": 1344 + }, + { + "epoch": 0.05896912640397937, + "grad_norm": 0.84765625, + "learning_rate": 4.498002880237673e-05, + "loss": 0.9039, + "step": 1345 + }, + { + "epoch": 0.0590129696206366, + "grad_norm": 0.7890625, + "learning_rate": 4.497618458481892e-05, + "loss": 0.8209, + "step": 1346 + }, + { + "epoch": 0.05905681283729384, + "grad_norm": 0.84765625, + "learning_rate": 4.497234048388089e-05, + "loss": 0.6481, + "step": 1347 + }, + { + "epoch": 0.059100656053951074, + "grad_norm": 0.84375, + "learning_rate": 4.4968496499570875e-05, + "loss": 0.8318, + "step": 1348 + }, + { + "epoch": 0.0591444992706083, + "grad_norm": 0.8203125, + "learning_rate": 4.4964652631896996e-05, + "loss": 0.8187, + "step": 1349 + }, + { + "epoch": 0.05918834248726554, + "grad_norm": 0.7890625, + "learning_rate": 4.496080888086739e-05, + "loss": 0.7317, + "step": 1350 + }, + { + "epoch": 0.059232185703922774, + "grad_norm": 0.86328125, + "learning_rate": 4.49569652464902e-05, + "loss": 0.7526, + "step": 1351 + }, + { + "epoch": 0.05927602892058, + "grad_norm": 0.88671875, + "learning_rate": 4.495312172877354e-05, + "loss": 0.8202, + "step": 1352 + }, + { + "epoch": 0.05931987213723724, + "grad_norm": 0.78125, + "learning_rate": 4.494927832772563e-05, + "loss": 0.7726, + "step": 1353 + }, + { + "epoch": 0.059363715353894474, + "grad_norm": 0.875, + "learning_rate": 4.4945435043354576e-05, + "loss": 0.8943, + "step": 1354 + }, + { + "epoch": 0.05940755857055171, + "grad_norm": 0.90234375, + "learning_rate": 4.4941591875668535e-05, + "loss": 0.7154, + "step": 1355 + }, + { + "epoch": 0.05945140178720894, + "grad_norm": 0.79296875, + "learning_rate": 4.493774882467563e-05, + "loss": 0.792, + "step": 1356 + }, + { + "epoch": 0.05949524500386617, + "grad_norm": 0.83203125, + "learning_rate": 4.493390589038403e-05, + "loss": 0.7984, + "step": 1357 + }, + { + "epoch": 0.05953908822052341, + "grad_norm": 0.83984375, + "learning_rate": 4.493006307280186e-05, + "loss": 0.7622, + "step": 1358 + }, + { + "epoch": 0.059582931437180645, + "grad_norm": 0.890625, + "learning_rate": 4.4926220371937275e-05, + "loss": 0.9342, + "step": 1359 + }, + { + "epoch": 0.05962677465383787, + "grad_norm": 0.8203125, + "learning_rate": 4.492237778779843e-05, + "loss": 0.7771, + "step": 1360 + }, + { + "epoch": 0.05967061787049511, + "grad_norm": 0.93359375, + "learning_rate": 4.491853532039344e-05, + "loss": 0.8758, + "step": 1361 + }, + { + "epoch": 0.059714461087152344, + "grad_norm": 0.796875, + "learning_rate": 4.4914692969730444e-05, + "loss": 0.8508, + "step": 1362 + }, + { + "epoch": 0.05975830430380958, + "grad_norm": 0.859375, + "learning_rate": 4.491085073581763e-05, + "loss": 0.7555, + "step": 1363 + }, + { + "epoch": 0.05980214752046681, + "grad_norm": 1.9296875, + "learning_rate": 4.4907008618663125e-05, + "loss": 0.7399, + "step": 1364 + }, + { + "epoch": 0.059845990737124044, + "grad_norm": 0.80859375, + "learning_rate": 4.490316661827507e-05, + "loss": 0.8649, + "step": 1365 + }, + { + "epoch": 0.05988983395378128, + "grad_norm": 0.8828125, + "learning_rate": 4.489932473466161e-05, + "loss": 0.9026, + "step": 1366 + }, + { + "epoch": 0.059933677170438515, + "grad_norm": 0.8046875, + "learning_rate": 4.489548296783084e-05, + "loss": 0.7374, + "step": 1367 + }, + { + "epoch": 0.059977520387095744, + "grad_norm": 0.86328125, + "learning_rate": 4.489164131779097e-05, + "loss": 0.7927, + "step": 1368 + }, + { + "epoch": 0.06002136360375298, + "grad_norm": 0.73046875, + "learning_rate": 4.488779978455012e-05, + "loss": 0.7455, + "step": 1369 + }, + { + "epoch": 0.060065206820410215, + "grad_norm": 0.8203125, + "learning_rate": 4.4883958368116444e-05, + "loss": 0.8803, + "step": 1370 + }, + { + "epoch": 0.060109050037067444, + "grad_norm": 0.765625, + "learning_rate": 4.4880117068498054e-05, + "loss": 0.6578, + "step": 1371 + }, + { + "epoch": 0.06015289325372468, + "grad_norm": 0.74609375, + "learning_rate": 4.4876275885703065e-05, + "loss": 0.8027, + "step": 1372 + }, + { + "epoch": 0.060196736470381915, + "grad_norm": 0.875, + "learning_rate": 4.48724348197397e-05, + "loss": 0.8163, + "step": 1373 + }, + { + "epoch": 0.06024057968703915, + "grad_norm": 0.91015625, + "learning_rate": 4.486859387061607e-05, + "loss": 0.8611, + "step": 1374 + }, + { + "epoch": 0.06028442290369638, + "grad_norm": 0.77734375, + "learning_rate": 4.486475303834029e-05, + "loss": 0.7866, + "step": 1375 + }, + { + "epoch": 0.060328266120353614, + "grad_norm": 0.875, + "learning_rate": 4.4860912322920526e-05, + "loss": 0.8379, + "step": 1376 + }, + { + "epoch": 0.06037210933701085, + "grad_norm": 1.03125, + "learning_rate": 4.485707172436485e-05, + "loss": 0.8745, + "step": 1377 + }, + { + "epoch": 0.060415952553668086, + "grad_norm": 0.8046875, + "learning_rate": 4.485323124268152e-05, + "loss": 0.9047, + "step": 1378 + }, + { + "epoch": 0.060459795770325314, + "grad_norm": 0.796875, + "learning_rate": 4.484939087787859e-05, + "loss": 0.7964, + "step": 1379 + }, + { + "epoch": 0.06050363898698255, + "grad_norm": 0.875, + "learning_rate": 4.484555062996424e-05, + "loss": 0.8966, + "step": 1380 + }, + { + "epoch": 0.060547482203639785, + "grad_norm": 0.88671875, + "learning_rate": 4.484171049894659e-05, + "loss": 0.8464, + "step": 1381 + }, + { + "epoch": 0.06059132542029702, + "grad_norm": 0.78515625, + "learning_rate": 4.483787048483374e-05, + "loss": 0.7827, + "step": 1382 + }, + { + "epoch": 0.06063516863695425, + "grad_norm": 0.83203125, + "learning_rate": 4.4834030587633925e-05, + "loss": 0.9001, + "step": 1383 + }, + { + "epoch": 0.060679011853611485, + "grad_norm": 0.80859375, + "learning_rate": 4.483019080735521e-05, + "loss": 0.8431, + "step": 1384 + }, + { + "epoch": 0.06072285507026872, + "grad_norm": 0.8984375, + "learning_rate": 4.482635114400576e-05, + "loss": 0.7456, + "step": 1385 + }, + { + "epoch": 0.060766698286925956, + "grad_norm": 0.80859375, + "learning_rate": 4.4822511597593694e-05, + "loss": 0.8726, + "step": 1386 + }, + { + "epoch": 0.060810541503583185, + "grad_norm": 0.8671875, + "learning_rate": 4.481867216812714e-05, + "loss": 0.7606, + "step": 1387 + }, + { + "epoch": 0.06085438472024042, + "grad_norm": 0.83203125, + "learning_rate": 4.481483285561428e-05, + "loss": 0.8194, + "step": 1388 + }, + { + "epoch": 0.060898227936897656, + "grad_norm": 0.796875, + "learning_rate": 4.481099366006324e-05, + "loss": 0.8354, + "step": 1389 + }, + { + "epoch": 0.060942071153554885, + "grad_norm": 0.90234375, + "learning_rate": 4.4807154581482136e-05, + "loss": 0.864, + "step": 1390 + }, + { + "epoch": 0.06098591437021212, + "grad_norm": 0.89453125, + "learning_rate": 4.4803315619879114e-05, + "loss": 0.7969, + "step": 1391 + }, + { + "epoch": 0.061029757586869356, + "grad_norm": 0.8046875, + "learning_rate": 4.4799476775262285e-05, + "loss": 0.7718, + "step": 1392 + }, + { + "epoch": 0.06107360080352659, + "grad_norm": 0.9140625, + "learning_rate": 4.479563804763983e-05, + "loss": 0.7477, + "step": 1393 + }, + { + "epoch": 0.06111744402018382, + "grad_norm": 0.921875, + "learning_rate": 4.479179943701988e-05, + "loss": 0.8662, + "step": 1394 + }, + { + "epoch": 0.061161287236841055, + "grad_norm": 0.796875, + "learning_rate": 4.4787960943410534e-05, + "loss": 0.8544, + "step": 1395 + }, + { + "epoch": 0.06120513045349829, + "grad_norm": 0.83984375, + "learning_rate": 4.4784122566819965e-05, + "loss": 0.6983, + "step": 1396 + }, + { + "epoch": 0.061248973670155527, + "grad_norm": 0.8203125, + "learning_rate": 4.4780284307256294e-05, + "loss": 0.864, + "step": 1397 + }, + { + "epoch": 0.061292816886812755, + "grad_norm": 0.8828125, + "learning_rate": 4.477644616472765e-05, + "loss": 0.8223, + "step": 1398 + }, + { + "epoch": 0.06133666010346999, + "grad_norm": 0.79296875, + "learning_rate": 4.477260813924217e-05, + "loss": 0.7149, + "step": 1399 + }, + { + "epoch": 0.061380503320127226, + "grad_norm": 0.8984375, + "learning_rate": 4.4768770230807956e-05, + "loss": 0.7896, + "step": 1400 + }, + { + "epoch": 0.06142434653678446, + "grad_norm": 0.8203125, + "learning_rate": 4.47649324394332e-05, + "loss": 0.8308, + "step": 1401 + }, + { + "epoch": 0.06146818975344169, + "grad_norm": 0.7890625, + "learning_rate": 4.476109476512603e-05, + "loss": 0.7775, + "step": 1402 + }, + { + "epoch": 0.061512032970098926, + "grad_norm": 0.78515625, + "learning_rate": 4.4757257207894555e-05, + "loss": 0.7952, + "step": 1403 + }, + { + "epoch": 0.06155587618675616, + "grad_norm": 0.80078125, + "learning_rate": 4.475341976774692e-05, + "loss": 0.934, + "step": 1404 + }, + { + "epoch": 0.0615997194034134, + "grad_norm": 0.87890625, + "learning_rate": 4.4749582444691254e-05, + "loss": 0.819, + "step": 1405 + }, + { + "epoch": 0.061643562620070626, + "grad_norm": 0.7890625, + "learning_rate": 4.474574523873569e-05, + "loss": 0.9156, + "step": 1406 + }, + { + "epoch": 0.06168740583672786, + "grad_norm": 0.921875, + "learning_rate": 4.474190814988832e-05, + "loss": 0.832, + "step": 1407 + }, + { + "epoch": 0.0617312490533851, + "grad_norm": 0.82421875, + "learning_rate": 4.473807117815736e-05, + "loss": 0.7748, + "step": 1408 + }, + { + "epoch": 0.061775092270042326, + "grad_norm": 0.84375, + "learning_rate": 4.4734234323550894e-05, + "loss": 0.8149, + "step": 1409 + }, + { + "epoch": 0.06181893548669956, + "grad_norm": 0.86328125, + "learning_rate": 4.473039758607706e-05, + "loss": 0.8421, + "step": 1410 + }, + { + "epoch": 0.0618627787033568, + "grad_norm": 0.828125, + "learning_rate": 4.472656096574399e-05, + "loss": 0.7541, + "step": 1411 + }, + { + "epoch": 0.06190662192001403, + "grad_norm": 0.90234375, + "learning_rate": 4.472272446255977e-05, + "loss": 0.8275, + "step": 1412 + }, + { + "epoch": 0.06195046513667126, + "grad_norm": 0.76171875, + "learning_rate": 4.471888807653263e-05, + "loss": 0.7473, + "step": 1413 + }, + { + "epoch": 0.061994308353328496, + "grad_norm": 0.8359375, + "learning_rate": 4.4715051807670636e-05, + "loss": 0.7766, + "step": 1414 + }, + { + "epoch": 0.06203815156998573, + "grad_norm": 0.78125, + "learning_rate": 4.471121565598193e-05, + "loss": 0.9826, + "step": 1415 + }, + { + "epoch": 0.06208199478664297, + "grad_norm": 0.86328125, + "learning_rate": 4.470737962147464e-05, + "loss": 0.8388, + "step": 1416 + }, + { + "epoch": 0.062125838003300196, + "grad_norm": 0.98046875, + "learning_rate": 4.4703543704156915e-05, + "loss": 0.925, + "step": 1417 + }, + { + "epoch": 0.06216968121995743, + "grad_norm": 0.8125, + "learning_rate": 4.4699707904036815e-05, + "loss": 0.7989, + "step": 1418 + }, + { + "epoch": 0.06221352443661467, + "grad_norm": 0.8046875, + "learning_rate": 4.4695872221122556e-05, + "loss": 0.7546, + "step": 1419 + }, + { + "epoch": 0.0622573676532719, + "grad_norm": 0.7734375, + "learning_rate": 4.469203665542225e-05, + "loss": 0.8132, + "step": 1420 + }, + { + "epoch": 0.06230121086992913, + "grad_norm": 0.89453125, + "learning_rate": 4.4688201206944e-05, + "loss": 0.8493, + "step": 1421 + }, + { + "epoch": 0.06234505408658637, + "grad_norm": 0.8515625, + "learning_rate": 4.468436587569595e-05, + "loss": 0.7028, + "step": 1422 + }, + { + "epoch": 0.0623888973032436, + "grad_norm": 0.80078125, + "learning_rate": 4.4680530661686196e-05, + "loss": 0.8625, + "step": 1423 + }, + { + "epoch": 0.06243274051990083, + "grad_norm": 0.765625, + "learning_rate": 4.4676695564922924e-05, + "loss": 0.766, + "step": 1424 + }, + { + "epoch": 0.06247658373655807, + "grad_norm": 0.7890625, + "learning_rate": 4.467286058541423e-05, + "loss": 0.8309, + "step": 1425 + }, + { + "epoch": 0.0625204269532153, + "grad_norm": 0.9609375, + "learning_rate": 4.4669025723168254e-05, + "loss": 0.7538, + "step": 1426 + }, + { + "epoch": 0.06256427016987254, + "grad_norm": 0.82421875, + "learning_rate": 4.4665190978193106e-05, + "loss": 0.8481, + "step": 1427 + }, + { + "epoch": 0.06260811338652977, + "grad_norm": 0.8203125, + "learning_rate": 4.46613563504969e-05, + "loss": 0.8185, + "step": 1428 + }, + { + "epoch": 0.06265195660318701, + "grad_norm": 0.796875, + "learning_rate": 4.4657521840087814e-05, + "loss": 0.8431, + "step": 1429 + }, + { + "epoch": 0.06269579981984423, + "grad_norm": 0.83984375, + "learning_rate": 4.465368744697395e-05, + "loss": 0.909, + "step": 1430 + }, + { + "epoch": 0.06273964303650147, + "grad_norm": 0.84375, + "learning_rate": 4.464985317116342e-05, + "loss": 0.8068, + "step": 1431 + }, + { + "epoch": 0.0627834862531587, + "grad_norm": 0.81640625, + "learning_rate": 4.4646019012664374e-05, + "loss": 0.8181, + "step": 1432 + }, + { + "epoch": 0.06282732946981594, + "grad_norm": 0.7578125, + "learning_rate": 4.464218497148489e-05, + "loss": 0.8189, + "step": 1433 + }, + { + "epoch": 0.06287117268647317, + "grad_norm": 0.7734375, + "learning_rate": 4.4638351047633164e-05, + "loss": 0.8556, + "step": 1434 + }, + { + "epoch": 0.06291501590313041, + "grad_norm": 0.8046875, + "learning_rate": 4.4634517241117294e-05, + "loss": 0.8853, + "step": 1435 + }, + { + "epoch": 0.06295885911978764, + "grad_norm": 0.9609375, + "learning_rate": 4.46306835519454e-05, + "loss": 0.872, + "step": 1436 + }, + { + "epoch": 0.06300270233644488, + "grad_norm": 0.87890625, + "learning_rate": 4.462684998012559e-05, + "loss": 0.8863, + "step": 1437 + }, + { + "epoch": 0.0630465455531021, + "grad_norm": 0.9921875, + "learning_rate": 4.4623016525666e-05, + "loss": 0.8491, + "step": 1438 + }, + { + "epoch": 0.06309038876975934, + "grad_norm": 0.83203125, + "learning_rate": 4.461918318857478e-05, + "loss": 0.8402, + "step": 1439 + }, + { + "epoch": 0.06313423198641657, + "grad_norm": 0.88671875, + "learning_rate": 4.461534996886003e-05, + "loss": 0.8205, + "step": 1440 + }, + { + "epoch": 0.06317807520307381, + "grad_norm": 1.0390625, + "learning_rate": 4.461151686652989e-05, + "loss": 1.0176, + "step": 1441 + }, + { + "epoch": 0.06322191841973104, + "grad_norm": 0.81640625, + "learning_rate": 4.4607683881592464e-05, + "loss": 0.8514, + "step": 1442 + }, + { + "epoch": 0.06326576163638828, + "grad_norm": 0.9765625, + "learning_rate": 4.4603851014055896e-05, + "loss": 0.7586, + "step": 1443 + }, + { + "epoch": 0.06330960485304551, + "grad_norm": 1.03125, + "learning_rate": 4.46000182639283e-05, + "loss": 0.9516, + "step": 1444 + }, + { + "epoch": 0.06335344806970274, + "grad_norm": 0.80859375, + "learning_rate": 4.45961856312178e-05, + "loss": 0.893, + "step": 1445 + }, + { + "epoch": 0.06339729128635997, + "grad_norm": 0.8828125, + "learning_rate": 4.459235311593252e-05, + "loss": 0.8323, + "step": 1446 + }, + { + "epoch": 0.06344113450301721, + "grad_norm": 0.765625, + "learning_rate": 4.458852071808056e-05, + "loss": 0.8232, + "step": 1447 + }, + { + "epoch": 0.06348497771967444, + "grad_norm": 0.8984375, + "learning_rate": 4.458468843767005e-05, + "loss": 0.794, + "step": 1448 + }, + { + "epoch": 0.06352882093633168, + "grad_norm": 0.8359375, + "learning_rate": 4.458085627470915e-05, + "loss": 0.9106, + "step": 1449 + }, + { + "epoch": 0.06357266415298891, + "grad_norm": 0.828125, + "learning_rate": 4.4577024229205966e-05, + "loss": 0.8594, + "step": 1450 + }, + { + "epoch": 0.06361650736964615, + "grad_norm": 0.828125, + "learning_rate": 4.4573192301168595e-05, + "loss": 0.7285, + "step": 1451 + }, + { + "epoch": 0.06366035058630339, + "grad_norm": 0.83984375, + "learning_rate": 4.4569360490605194e-05, + "loss": 0.7169, + "step": 1452 + }, + { + "epoch": 0.06370419380296061, + "grad_norm": 0.9375, + "learning_rate": 4.456552879752381e-05, + "loss": 0.7523, + "step": 1453 + }, + { + "epoch": 0.06374803701961784, + "grad_norm": 0.92578125, + "learning_rate": 4.456169722193268e-05, + "loss": 0.8251, + "step": 1454 + }, + { + "epoch": 0.06379188023627508, + "grad_norm": 0.84375, + "learning_rate": 4.4557865763839845e-05, + "loss": 0.8145, + "step": 1455 + }, + { + "epoch": 0.06383572345293231, + "grad_norm": 0.76171875, + "learning_rate": 4.455403442325345e-05, + "loss": 0.7364, + "step": 1456 + }, + { + "epoch": 0.06387956666958955, + "grad_norm": 0.7890625, + "learning_rate": 4.455020320018161e-05, + "loss": 0.6739, + "step": 1457 + }, + { + "epoch": 0.06392340988624678, + "grad_norm": 0.79296875, + "learning_rate": 4.4546372094632406e-05, + "loss": 0.7388, + "step": 1458 + }, + { + "epoch": 0.06396725310290402, + "grad_norm": 0.7265625, + "learning_rate": 4.4542541106614046e-05, + "loss": 0.7532, + "step": 1459 + }, + { + "epoch": 0.06401109631956126, + "grad_norm": 0.82421875, + "learning_rate": 4.4538710236134586e-05, + "loss": 0.8571, + "step": 1460 + }, + { + "epoch": 0.06405493953621848, + "grad_norm": 0.80859375, + "learning_rate": 4.4534879483202164e-05, + "loss": 0.8284, + "step": 1461 + }, + { + "epoch": 0.06409878275287571, + "grad_norm": 0.79296875, + "learning_rate": 4.45310488478249e-05, + "loss": 0.7293, + "step": 1462 + }, + { + "epoch": 0.06414262596953295, + "grad_norm": 0.8203125, + "learning_rate": 4.452721833001086e-05, + "loss": 0.8462, + "step": 1463 + }, + { + "epoch": 0.06418646918619018, + "grad_norm": 0.84375, + "learning_rate": 4.4523387929768245e-05, + "loss": 0.8647, + "step": 1464 + }, + { + "epoch": 0.06423031240284742, + "grad_norm": 0.83203125, + "learning_rate": 4.451955764710515e-05, + "loss": 0.8909, + "step": 1465 + }, + { + "epoch": 0.06427415561950466, + "grad_norm": 1.0703125, + "learning_rate": 4.451572748202968e-05, + "loss": 0.9, + "step": 1466 + }, + { + "epoch": 0.06431799883616189, + "grad_norm": 0.76953125, + "learning_rate": 4.451189743454994e-05, + "loss": 0.8337, + "step": 1467 + }, + { + "epoch": 0.06436184205281911, + "grad_norm": 0.6953125, + "learning_rate": 4.4508067504674046e-05, + "loss": 0.7945, + "step": 1468 + }, + { + "epoch": 0.06440568526947635, + "grad_norm": 0.75390625, + "learning_rate": 4.4504237692410154e-05, + "loss": 0.7559, + "step": 1469 + }, + { + "epoch": 0.06444952848613358, + "grad_norm": 1.0078125, + "learning_rate": 4.450040799776636e-05, + "loss": 0.8132, + "step": 1470 + }, + { + "epoch": 0.06449337170279082, + "grad_norm": 0.953125, + "learning_rate": 4.449657842075079e-05, + "loss": 0.8612, + "step": 1471 + }, + { + "epoch": 0.06453721491944805, + "grad_norm": 0.78125, + "learning_rate": 4.449274896137153e-05, + "loss": 0.7617, + "step": 1472 + }, + { + "epoch": 0.06458105813610529, + "grad_norm": 0.9140625, + "learning_rate": 4.448891961963669e-05, + "loss": 0.7806, + "step": 1473 + }, + { + "epoch": 0.06462490135276253, + "grad_norm": 0.80859375, + "learning_rate": 4.448509039555445e-05, + "loss": 0.7614, + "step": 1474 + }, + { + "epoch": 0.06466874456941976, + "grad_norm": 0.81640625, + "learning_rate": 4.448126128913289e-05, + "loss": 0.8291, + "step": 1475 + }, + { + "epoch": 0.06471258778607698, + "grad_norm": 0.89453125, + "learning_rate": 4.447743230038012e-05, + "loss": 1.0048, + "step": 1476 + }, + { + "epoch": 0.06475643100273422, + "grad_norm": 0.87890625, + "learning_rate": 4.447360342930426e-05, + "loss": 0.7991, + "step": 1477 + }, + { + "epoch": 0.06480027421939145, + "grad_norm": 0.85546875, + "learning_rate": 4.446977467591339e-05, + "loss": 0.9897, + "step": 1478 + }, + { + "epoch": 0.06484411743604869, + "grad_norm": 0.84375, + "learning_rate": 4.446594604021569e-05, + "loss": 0.9184, + "step": 1479 + }, + { + "epoch": 0.06488796065270593, + "grad_norm": 0.83984375, + "learning_rate": 4.446211752221924e-05, + "loss": 0.7864, + "step": 1480 + }, + { + "epoch": 0.06493180386936316, + "grad_norm": 0.79296875, + "learning_rate": 4.445828912193216e-05, + "loss": 0.7619, + "step": 1481 + }, + { + "epoch": 0.0649756470860204, + "grad_norm": 0.76953125, + "learning_rate": 4.4454460839362566e-05, + "loss": 0.8931, + "step": 1482 + }, + { + "epoch": 0.06501949030267762, + "grad_norm": 0.83203125, + "learning_rate": 4.4450632674518565e-05, + "loss": 0.8982, + "step": 1483 + }, + { + "epoch": 0.06506333351933485, + "grad_norm": 0.8671875, + "learning_rate": 4.4446804627408275e-05, + "loss": 0.8356, + "step": 1484 + }, + { + "epoch": 0.06510717673599209, + "grad_norm": 0.84375, + "learning_rate": 4.444297669803981e-05, + "loss": 0.7593, + "step": 1485 + }, + { + "epoch": 0.06515101995264932, + "grad_norm": 0.7734375, + "learning_rate": 4.443914888642123e-05, + "loss": 0.834, + "step": 1486 + }, + { + "epoch": 0.06519486316930656, + "grad_norm": 0.82421875, + "learning_rate": 4.443532119256075e-05, + "loss": 0.65, + "step": 1487 + }, + { + "epoch": 0.0652387063859638, + "grad_norm": 0.7734375, + "learning_rate": 4.443149361646642e-05, + "loss": 0.8518, + "step": 1488 + }, + { + "epoch": 0.06528254960262103, + "grad_norm": 0.859375, + "learning_rate": 4.442766615814637e-05, + "loss": 0.9117, + "step": 1489 + }, + { + "epoch": 0.06532639281927827, + "grad_norm": 0.80859375, + "learning_rate": 4.442383881760871e-05, + "loss": 0.8082, + "step": 1490 + }, + { + "epoch": 0.06537023603593549, + "grad_norm": 0.80078125, + "learning_rate": 4.442001159486153e-05, + "loss": 0.8167, + "step": 1491 + }, + { + "epoch": 0.06541407925259272, + "grad_norm": 0.91015625, + "learning_rate": 4.4416184489912974e-05, + "loss": 0.8364, + "step": 1492 + }, + { + "epoch": 0.06545792246924996, + "grad_norm": 0.875, + "learning_rate": 4.4412357502771094e-05, + "loss": 0.7852, + "step": 1493 + }, + { + "epoch": 0.0655017656859072, + "grad_norm": 0.8359375, + "learning_rate": 4.440853063344409e-05, + "loss": 0.8321, + "step": 1494 + }, + { + "epoch": 0.06554560890256443, + "grad_norm": 0.734375, + "learning_rate": 4.4404703881940015e-05, + "loss": 0.7854, + "step": 1495 + }, + { + "epoch": 0.06558945211922167, + "grad_norm": 0.87109375, + "learning_rate": 4.4400877248267004e-05, + "loss": 0.8472, + "step": 1496 + }, + { + "epoch": 0.0656332953358789, + "grad_norm": 0.8671875, + "learning_rate": 4.439705073243314e-05, + "loss": 0.7658, + "step": 1497 + }, + { + "epoch": 0.06567713855253612, + "grad_norm": 0.87890625, + "learning_rate": 4.439322433444653e-05, + "loss": 0.9581, + "step": 1498 + }, + { + "epoch": 0.06572098176919336, + "grad_norm": 0.76171875, + "learning_rate": 4.438939805431534e-05, + "loss": 0.7696, + "step": 1499 + }, + { + "epoch": 0.0657648249858506, + "grad_norm": 0.83984375, + "learning_rate": 4.438557189204763e-05, + "loss": 0.7519, + "step": 1500 + }, + { + "epoch": 0.06580866820250783, + "grad_norm": 0.90234375, + "learning_rate": 4.4381745847651524e-05, + "loss": 0.911, + "step": 1501 + }, + { + "epoch": 0.06585251141916507, + "grad_norm": 0.8515625, + "learning_rate": 4.4377919921135136e-05, + "loss": 0.8256, + "step": 1502 + }, + { + "epoch": 0.0658963546358223, + "grad_norm": 0.80859375, + "learning_rate": 4.4374094112506524e-05, + "loss": 0.7822, + "step": 1503 + }, + { + "epoch": 0.06594019785247954, + "grad_norm": 0.74609375, + "learning_rate": 4.437026842177387e-05, + "loss": 0.905, + "step": 1504 + }, + { + "epoch": 0.06598404106913677, + "grad_norm": 0.8203125, + "learning_rate": 4.436644284894527e-05, + "loss": 0.8956, + "step": 1505 + }, + { + "epoch": 0.066027884285794, + "grad_norm": 0.84375, + "learning_rate": 4.4362617394028805e-05, + "loss": 0.8578, + "step": 1506 + }, + { + "epoch": 0.06607172750245123, + "grad_norm": 0.74609375, + "learning_rate": 4.4358792057032596e-05, + "loss": 0.8343, + "step": 1507 + }, + { + "epoch": 0.06611557071910847, + "grad_norm": 0.859375, + "learning_rate": 4.4354966837964706e-05, + "loss": 0.8189, + "step": 1508 + }, + { + "epoch": 0.0661594139357657, + "grad_norm": 0.7734375, + "learning_rate": 4.435114173683331e-05, + "loss": 0.78, + "step": 1509 + }, + { + "epoch": 0.06620325715242294, + "grad_norm": 0.82421875, + "learning_rate": 4.4347316753646506e-05, + "loss": 0.7942, + "step": 1510 + }, + { + "epoch": 0.06624710036908017, + "grad_norm": 0.78125, + "learning_rate": 4.434349188841238e-05, + "loss": 0.8709, + "step": 1511 + }, + { + "epoch": 0.06629094358573741, + "grad_norm": 0.875, + "learning_rate": 4.433966714113904e-05, + "loss": 0.7951, + "step": 1512 + }, + { + "epoch": 0.06633478680239464, + "grad_norm": 0.7734375, + "learning_rate": 4.4335842511834544e-05, + "loss": 0.732, + "step": 1513 + }, + { + "epoch": 0.06637863001905187, + "grad_norm": 0.88671875, + "learning_rate": 4.4332018000507106e-05, + "loss": 1.0315, + "step": 1514 + }, + { + "epoch": 0.0664224732357091, + "grad_norm": 0.9453125, + "learning_rate": 4.432819360716476e-05, + "loss": 0.7893, + "step": 1515 + }, + { + "epoch": 0.06646631645236634, + "grad_norm": 1.0, + "learning_rate": 4.432436933181563e-05, + "loss": 0.9359, + "step": 1516 + }, + { + "epoch": 0.06651015966902357, + "grad_norm": 0.82421875, + "learning_rate": 4.432054517446782e-05, + "loss": 0.7117, + "step": 1517 + }, + { + "epoch": 0.06655400288568081, + "grad_norm": 0.8671875, + "learning_rate": 4.431672113512939e-05, + "loss": 0.8147, + "step": 1518 + }, + { + "epoch": 0.06659784610233804, + "grad_norm": 0.77734375, + "learning_rate": 4.431289721380854e-05, + "loss": 0.7967, + "step": 1519 + }, + { + "epoch": 0.06664168931899528, + "grad_norm": 0.828125, + "learning_rate": 4.4309073410513303e-05, + "loss": 0.9509, + "step": 1520 + }, + { + "epoch": 0.0666855325356525, + "grad_norm": 0.97265625, + "learning_rate": 4.4305249725251806e-05, + "loss": 0.7877, + "step": 1521 + }, + { + "epoch": 0.06672937575230974, + "grad_norm": 0.796875, + "learning_rate": 4.430142615803214e-05, + "loss": 0.761, + "step": 1522 + }, + { + "epoch": 0.06677321896896697, + "grad_norm": 0.81640625, + "learning_rate": 4.42976027088624e-05, + "loss": 0.8734, + "step": 1523 + }, + { + "epoch": 0.06681706218562421, + "grad_norm": 0.91796875, + "learning_rate": 4.429377937775073e-05, + "loss": 0.9378, + "step": 1524 + }, + { + "epoch": 0.06686090540228144, + "grad_norm": 0.80859375, + "learning_rate": 4.4289956164705215e-05, + "loss": 0.8144, + "step": 1525 + }, + { + "epoch": 0.06690474861893868, + "grad_norm": 0.83203125, + "learning_rate": 4.4286133069733945e-05, + "loss": 0.9615, + "step": 1526 + }, + { + "epoch": 0.06694859183559591, + "grad_norm": 0.79296875, + "learning_rate": 4.4282310092845045e-05, + "loss": 0.895, + "step": 1527 + }, + { + "epoch": 0.06699243505225315, + "grad_norm": 0.8125, + "learning_rate": 4.4278487234046605e-05, + "loss": 0.7857, + "step": 1528 + }, + { + "epoch": 0.06703627826891037, + "grad_norm": 0.8125, + "learning_rate": 4.4274664493346716e-05, + "loss": 0.6563, + "step": 1529 + }, + { + "epoch": 0.0670801214855676, + "grad_norm": 0.8984375, + "learning_rate": 4.42708418707535e-05, + "loss": 0.8174, + "step": 1530 + }, + { + "epoch": 0.06712396470222484, + "grad_norm": 0.77734375, + "learning_rate": 4.4267019366275044e-05, + "loss": 0.6624, + "step": 1531 + }, + { + "epoch": 0.06716780791888208, + "grad_norm": 0.9609375, + "learning_rate": 4.426319697991945e-05, + "loss": 0.8155, + "step": 1532 + }, + { + "epoch": 0.06721165113553931, + "grad_norm": 0.75390625, + "learning_rate": 4.42593747116948e-05, + "loss": 0.8102, + "step": 1533 + }, + { + "epoch": 0.06725549435219655, + "grad_norm": 0.87109375, + "learning_rate": 4.425555256160925e-05, + "loss": 0.7508, + "step": 1534 + }, + { + "epoch": 0.06729933756885378, + "grad_norm": 0.828125, + "learning_rate": 4.425173052967087e-05, + "loss": 0.7831, + "step": 1535 + }, + { + "epoch": 0.067343180785511, + "grad_norm": 0.82421875, + "learning_rate": 4.4247908615887755e-05, + "loss": 0.8332, + "step": 1536 + }, + { + "epoch": 0.06738702400216824, + "grad_norm": 0.8515625, + "learning_rate": 4.4244086820268014e-05, + "loss": 0.7962, + "step": 1537 + }, + { + "epoch": 0.06743086721882548, + "grad_norm": 0.74609375, + "learning_rate": 4.424026514281972e-05, + "loss": 0.7688, + "step": 1538 + }, + { + "epoch": 0.06747471043548271, + "grad_norm": 0.83203125, + "learning_rate": 4.4236443583551016e-05, + "loss": 0.8646, + "step": 1539 + }, + { + "epoch": 0.06751855365213995, + "grad_norm": 0.83203125, + "learning_rate": 4.423262214246998e-05, + "loss": 0.7909, + "step": 1540 + }, + { + "epoch": 0.06756239686879718, + "grad_norm": 0.83203125, + "learning_rate": 4.422880081958473e-05, + "loss": 0.8648, + "step": 1541 + }, + { + "epoch": 0.06760624008545442, + "grad_norm": 0.78125, + "learning_rate": 4.422497961490334e-05, + "loss": 0.831, + "step": 1542 + }, + { + "epoch": 0.06765008330211165, + "grad_norm": 0.84765625, + "learning_rate": 4.422115852843388e-05, + "loss": 0.7818, + "step": 1543 + }, + { + "epoch": 0.06769392651876888, + "grad_norm": 0.83203125, + "learning_rate": 4.421733756018454e-05, + "loss": 0.7561, + "step": 1544 + }, + { + "epoch": 0.06773776973542611, + "grad_norm": 0.890625, + "learning_rate": 4.4213516710163336e-05, + "loss": 0.8715, + "step": 1545 + }, + { + "epoch": 0.06778161295208335, + "grad_norm": 0.80859375, + "learning_rate": 4.420969597837842e-05, + "loss": 0.9509, + "step": 1546 + }, + { + "epoch": 0.06782545616874058, + "grad_norm": 0.890625, + "learning_rate": 4.420587536483785e-05, + "loss": 0.7463, + "step": 1547 + }, + { + "epoch": 0.06786929938539782, + "grad_norm": 0.75390625, + "learning_rate": 4.42020548695497e-05, + "loss": 0.7181, + "step": 1548 + }, + { + "epoch": 0.06791314260205505, + "grad_norm": 0.78515625, + "learning_rate": 4.4198234492522153e-05, + "loss": 0.8182, + "step": 1549 + }, + { + "epoch": 0.06795698581871229, + "grad_norm": 0.890625, + "learning_rate": 4.419441423376325e-05, + "loss": 0.9673, + "step": 1550 + }, + { + "epoch": 0.06800082903536953, + "grad_norm": 0.85546875, + "learning_rate": 4.41905940932811e-05, + "loss": 0.6783, + "step": 1551 + }, + { + "epoch": 0.06804467225202675, + "grad_norm": 0.9921875, + "learning_rate": 4.41867740710838e-05, + "loss": 0.868, + "step": 1552 + }, + { + "epoch": 0.06808851546868398, + "grad_norm": 0.77734375, + "learning_rate": 4.41829541671794e-05, + "loss": 0.762, + "step": 1553 + }, + { + "epoch": 0.06813235868534122, + "grad_norm": 0.7578125, + "learning_rate": 4.417913438157607e-05, + "loss": 0.8244, + "step": 1554 + }, + { + "epoch": 0.06817620190199845, + "grad_norm": 0.8203125, + "learning_rate": 4.417531471428189e-05, + "loss": 0.7779, + "step": 1555 + }, + { + "epoch": 0.06822004511865569, + "grad_norm": 0.7890625, + "learning_rate": 4.417149516530492e-05, + "loss": 0.862, + "step": 1556 + }, + { + "epoch": 0.06826388833531292, + "grad_norm": 0.8671875, + "learning_rate": 4.416767573465329e-05, + "loss": 0.8686, + "step": 1557 + }, + { + "epoch": 0.06830773155197016, + "grad_norm": 0.84765625, + "learning_rate": 4.416385642233508e-05, + "loss": 0.7607, + "step": 1558 + }, + { + "epoch": 0.06835157476862738, + "grad_norm": 0.95703125, + "learning_rate": 4.416003722835834e-05, + "loss": 0.8648, + "step": 1559 + }, + { + "epoch": 0.06839541798528462, + "grad_norm": 0.875, + "learning_rate": 4.415621815273125e-05, + "loss": 0.8492, + "step": 1560 + }, + { + "epoch": 0.06843926120194185, + "grad_norm": 0.7890625, + "learning_rate": 4.4152399195461856e-05, + "loss": 0.8257, + "step": 1561 + }, + { + "epoch": 0.06848310441859909, + "grad_norm": 0.87890625, + "learning_rate": 4.414858035655828e-05, + "loss": 0.7447, + "step": 1562 + }, + { + "epoch": 0.06852694763525632, + "grad_norm": 0.8203125, + "learning_rate": 4.4144761636028574e-05, + "loss": 0.7927, + "step": 1563 + }, + { + "epoch": 0.06857079085191356, + "grad_norm": 0.91015625, + "learning_rate": 4.4140943033880824e-05, + "loss": 0.7782, + "step": 1564 + }, + { + "epoch": 0.0686146340685708, + "grad_norm": 0.890625, + "learning_rate": 4.4137124550123186e-05, + "loss": 0.805, + "step": 1565 + }, + { + "epoch": 0.06865847728522803, + "grad_norm": 0.8515625, + "learning_rate": 4.413330618476373e-05, + "loss": 0.909, + "step": 1566 + }, + { + "epoch": 0.06870232050188525, + "grad_norm": 0.86328125, + "learning_rate": 4.4129487937810535e-05, + "loss": 0.8367, + "step": 1567 + }, + { + "epoch": 0.06874616371854249, + "grad_norm": 0.85546875, + "learning_rate": 4.412566980927169e-05, + "loss": 0.7399, + "step": 1568 + }, + { + "epoch": 0.06879000693519972, + "grad_norm": 0.9453125, + "learning_rate": 4.41218517991553e-05, + "loss": 0.9047, + "step": 1569 + }, + { + "epoch": 0.06883385015185696, + "grad_norm": 0.890625, + "learning_rate": 4.411803390746945e-05, + "loss": 0.7383, + "step": 1570 + }, + { + "epoch": 0.0688776933685142, + "grad_norm": 0.84765625, + "learning_rate": 4.41142161342222e-05, + "loss": 0.8872, + "step": 1571 + }, + { + "epoch": 0.06892153658517143, + "grad_norm": 0.86328125, + "learning_rate": 4.41103984794217e-05, + "loss": 0.9206, + "step": 1572 + }, + { + "epoch": 0.06896537980182867, + "grad_norm": 0.796875, + "learning_rate": 4.410658094307603e-05, + "loss": 0.7165, + "step": 1573 + }, + { + "epoch": 0.06900922301848589, + "grad_norm": 0.80859375, + "learning_rate": 4.410276352519326e-05, + "loss": 0.8149, + "step": 1574 + }, + { + "epoch": 0.06905306623514312, + "grad_norm": 0.8515625, + "learning_rate": 4.409894622578148e-05, + "loss": 0.7954, + "step": 1575 + }, + { + "epoch": 0.06909690945180036, + "grad_norm": 0.80859375, + "learning_rate": 4.4095129044848806e-05, + "loss": 0.8631, + "step": 1576 + }, + { + "epoch": 0.0691407526684576, + "grad_norm": 0.83984375, + "learning_rate": 4.409131198240329e-05, + "loss": 0.8851, + "step": 1577 + }, + { + "epoch": 0.06918459588511483, + "grad_norm": 1.5859375, + "learning_rate": 4.4087495038453053e-05, + "loss": 0.7362, + "step": 1578 + }, + { + "epoch": 0.06922843910177207, + "grad_norm": 0.78125, + "learning_rate": 4.408367821300614e-05, + "loss": 0.8118, + "step": 1579 + }, + { + "epoch": 0.0692722823184293, + "grad_norm": 0.80078125, + "learning_rate": 4.407986150607071e-05, + "loss": 0.7361, + "step": 1580 + }, + { + "epoch": 0.06931612553508654, + "grad_norm": 0.8046875, + "learning_rate": 4.4076044917654824e-05, + "loss": 0.9647, + "step": 1581 + }, + { + "epoch": 0.06935996875174376, + "grad_norm": 0.86328125, + "learning_rate": 4.407222844776655e-05, + "loss": 0.8732, + "step": 1582 + }, + { + "epoch": 0.069403811968401, + "grad_norm": 0.90625, + "learning_rate": 4.4068412096414006e-05, + "loss": 0.8862, + "step": 1583 + }, + { + "epoch": 0.06944765518505823, + "grad_norm": 0.7265625, + "learning_rate": 4.406459586360522e-05, + "loss": 0.8348, + "step": 1584 + }, + { + "epoch": 0.06949149840171547, + "grad_norm": 0.8203125, + "learning_rate": 4.406077974934836e-05, + "loss": 0.8256, + "step": 1585 + }, + { + "epoch": 0.0695353416183727, + "grad_norm": 0.859375, + "learning_rate": 4.405696375365148e-05, + "loss": 0.8424, + "step": 1586 + }, + { + "epoch": 0.06957918483502994, + "grad_norm": 0.82421875, + "learning_rate": 4.405314787652268e-05, + "loss": 0.8695, + "step": 1587 + }, + { + "epoch": 0.06962302805168717, + "grad_norm": 0.87890625, + "learning_rate": 4.404933211797002e-05, + "loss": 0.8515, + "step": 1588 + }, + { + "epoch": 0.0696668712683444, + "grad_norm": 0.77734375, + "learning_rate": 4.404551647800158e-05, + "loss": 0.9008, + "step": 1589 + }, + { + "epoch": 0.06971071448500163, + "grad_norm": 0.78125, + "learning_rate": 4.40417009566255e-05, + "loss": 0.8156, + "step": 1590 + }, + { + "epoch": 0.06975455770165886, + "grad_norm": 0.78125, + "learning_rate": 4.403788555384983e-05, + "loss": 0.7609, + "step": 1591 + }, + { + "epoch": 0.0697984009183161, + "grad_norm": 0.77734375, + "learning_rate": 4.403407026968267e-05, + "loss": 0.8117, + "step": 1592 + }, + { + "epoch": 0.06984224413497334, + "grad_norm": 0.90625, + "learning_rate": 4.4030255104132104e-05, + "loss": 0.9061, + "step": 1593 + }, + { + "epoch": 0.06988608735163057, + "grad_norm": 0.88671875, + "learning_rate": 4.4026440057206164e-05, + "loss": 0.8289, + "step": 1594 + }, + { + "epoch": 0.06992993056828781, + "grad_norm": 0.78125, + "learning_rate": 4.402262512891303e-05, + "loss": 0.7658, + "step": 1595 + }, + { + "epoch": 0.06997377378494504, + "grad_norm": 0.84765625, + "learning_rate": 4.401881031926074e-05, + "loss": 0.8713, + "step": 1596 + }, + { + "epoch": 0.07001761700160226, + "grad_norm": 0.8203125, + "learning_rate": 4.4014995628257386e-05, + "loss": 0.8053, + "step": 1597 + }, + { + "epoch": 0.0700614602182595, + "grad_norm": 0.9609375, + "learning_rate": 4.401118105591104e-05, + "loss": 0.8922, + "step": 1598 + }, + { + "epoch": 0.07010530343491674, + "grad_norm": 0.80859375, + "learning_rate": 4.400736660222977e-05, + "loss": 0.7853, + "step": 1599 + }, + { + "epoch": 0.07014914665157397, + "grad_norm": 0.8125, + "learning_rate": 4.400355226722171e-05, + "loss": 0.7968, + "step": 1600 + }, + { + "epoch": 0.0701929898682312, + "grad_norm": 0.83203125, + "learning_rate": 4.399973805089493e-05, + "loss": 0.7356, + "step": 1601 + }, + { + "epoch": 0.07023683308488844, + "grad_norm": 0.91015625, + "learning_rate": 4.399592395325751e-05, + "loss": 0.8903, + "step": 1602 + }, + { + "epoch": 0.07028067630154568, + "grad_norm": 0.8828125, + "learning_rate": 4.3992109974317516e-05, + "loss": 0.8815, + "step": 1603 + }, + { + "epoch": 0.07032451951820291, + "grad_norm": 0.73046875, + "learning_rate": 4.3988296114083015e-05, + "loss": 0.7622, + "step": 1604 + }, + { + "epoch": 0.07036836273486013, + "grad_norm": 0.8046875, + "learning_rate": 4.398448237256215e-05, + "loss": 0.8541, + "step": 1605 + }, + { + "epoch": 0.07041220595151737, + "grad_norm": 0.921875, + "learning_rate": 4.398066874976298e-05, + "loss": 0.8559, + "step": 1606 + }, + { + "epoch": 0.0704560491681746, + "grad_norm": 0.90625, + "learning_rate": 4.3976855245693574e-05, + "loss": 1.0012, + "step": 1607 + }, + { + "epoch": 0.07049989238483184, + "grad_norm": 0.8671875, + "learning_rate": 4.397304186036202e-05, + "loss": 0.9074, + "step": 1608 + }, + { + "epoch": 0.07054373560148908, + "grad_norm": 0.99609375, + "learning_rate": 4.3969228593776366e-05, + "loss": 0.8286, + "step": 1609 + }, + { + "epoch": 0.07058757881814631, + "grad_norm": 0.76171875, + "learning_rate": 4.3965415445944766e-05, + "loss": 0.8889, + "step": 1610 + }, + { + "epoch": 0.07063142203480355, + "grad_norm": 0.8359375, + "learning_rate": 4.396160241687527e-05, + "loss": 0.7575, + "step": 1611 + }, + { + "epoch": 0.07067526525146077, + "grad_norm": 0.87890625, + "learning_rate": 4.395778950657595e-05, + "loss": 0.824, + "step": 1612 + }, + { + "epoch": 0.070719108468118, + "grad_norm": 0.98828125, + "learning_rate": 4.395397671505489e-05, + "loss": 0.9172, + "step": 1613 + }, + { + "epoch": 0.07076295168477524, + "grad_norm": 0.88671875, + "learning_rate": 4.395016404232017e-05, + "loss": 0.8614, + "step": 1614 + }, + { + "epoch": 0.07080679490143248, + "grad_norm": 0.7890625, + "learning_rate": 4.394635148837989e-05, + "loss": 0.877, + "step": 1615 + }, + { + "epoch": 0.07085063811808971, + "grad_norm": 0.875, + "learning_rate": 4.394253905324211e-05, + "loss": 0.8853, + "step": 1616 + }, + { + "epoch": 0.07089448133474695, + "grad_norm": 0.8671875, + "learning_rate": 4.39387267369149e-05, + "loss": 0.7944, + "step": 1617 + }, + { + "epoch": 0.07093832455140418, + "grad_norm": 0.78515625, + "learning_rate": 4.393491453940636e-05, + "loss": 0.8034, + "step": 1618 + }, + { + "epoch": 0.07098216776806142, + "grad_norm": 1.3515625, + "learning_rate": 4.3931102460724526e-05, + "loss": 0.8183, + "step": 1619 + }, + { + "epoch": 0.07102601098471864, + "grad_norm": 0.83203125, + "learning_rate": 4.392729050087756e-05, + "loss": 0.8387, + "step": 1620 + }, + { + "epoch": 0.07106985420137588, + "grad_norm": 0.80078125, + "learning_rate": 4.3923478659873496e-05, + "loss": 0.7193, + "step": 1621 + }, + { + "epoch": 0.07111369741803311, + "grad_norm": 0.82421875, + "learning_rate": 4.391966693772041e-05, + "loss": 0.8199, + "step": 1622 + }, + { + "epoch": 0.07115754063469035, + "grad_norm": 0.875, + "learning_rate": 4.391585533442638e-05, + "loss": 0.8758, + "step": 1623 + }, + { + "epoch": 0.07120138385134758, + "grad_norm": 0.78125, + "learning_rate": 4.391204384999946e-05, + "loss": 0.7164, + "step": 1624 + }, + { + "epoch": 0.07124522706800482, + "grad_norm": 0.87890625, + "learning_rate": 4.39082324844478e-05, + "loss": 0.8128, + "step": 1625 + }, + { + "epoch": 0.07128907028466205, + "grad_norm": 0.73828125, + "learning_rate": 4.3904421237779424e-05, + "loss": 0.8867, + "step": 1626 + }, + { + "epoch": 0.07133291350131928, + "grad_norm": 0.9375, + "learning_rate": 4.3900610110002426e-05, + "loss": 0.8988, + "step": 1627 + }, + { + "epoch": 0.07137675671797651, + "grad_norm": 0.75390625, + "learning_rate": 4.389679910112487e-05, + "loss": 0.7936, + "step": 1628 + }, + { + "epoch": 0.07142059993463375, + "grad_norm": 1.1015625, + "learning_rate": 4.389298821115482e-05, + "loss": 0.7892, + "step": 1629 + }, + { + "epoch": 0.07146444315129098, + "grad_norm": 0.875, + "learning_rate": 4.388917744010039e-05, + "loss": 0.8344, + "step": 1630 + }, + { + "epoch": 0.07150828636794822, + "grad_norm": 0.81640625, + "learning_rate": 4.3885366787969664e-05, + "loss": 0.9877, + "step": 1631 + }, + { + "epoch": 0.07155212958460545, + "grad_norm": 0.8125, + "learning_rate": 4.388155625477068e-05, + "loss": 0.7853, + "step": 1632 + }, + { + "epoch": 0.07159597280126269, + "grad_norm": 0.80859375, + "learning_rate": 4.3877745840511544e-05, + "loss": 0.8647, + "step": 1633 + }, + { + "epoch": 0.07163981601791992, + "grad_norm": 0.8203125, + "learning_rate": 4.387393554520026e-05, + "loss": 0.8228, + "step": 1634 + }, + { + "epoch": 0.07168365923457715, + "grad_norm": 0.87890625, + "learning_rate": 4.387012536884502e-05, + "loss": 0.8759, + "step": 1635 + }, + { + "epoch": 0.07172750245123438, + "grad_norm": 0.94921875, + "learning_rate": 4.386631531145383e-05, + "loss": 0.8305, + "step": 1636 + }, + { + "epoch": 0.07177134566789162, + "grad_norm": 0.8125, + "learning_rate": 4.3862505373034776e-05, + "loss": 0.8675, + "step": 1637 + }, + { + "epoch": 0.07181518888454885, + "grad_norm": 0.80078125, + "learning_rate": 4.385869555359593e-05, + "loss": 0.7643, + "step": 1638 + }, + { + "epoch": 0.07185903210120609, + "grad_norm": 0.8515625, + "learning_rate": 4.3854885853145345e-05, + "loss": 0.7546, + "step": 1639 + }, + { + "epoch": 0.07190287531786332, + "grad_norm": 0.87109375, + "learning_rate": 4.385107627169115e-05, + "loss": 0.7175, + "step": 1640 + }, + { + "epoch": 0.07194671853452056, + "grad_norm": 0.8515625, + "learning_rate": 4.3847266809241396e-05, + "loss": 0.8162, + "step": 1641 + }, + { + "epoch": 0.0719905617511778, + "grad_norm": 0.9296875, + "learning_rate": 4.3843457465804136e-05, + "loss": 0.8584, + "step": 1642 + }, + { + "epoch": 0.07203440496783502, + "grad_norm": 0.828125, + "learning_rate": 4.383964824138747e-05, + "loss": 0.7971, + "step": 1643 + }, + { + "epoch": 0.07207824818449225, + "grad_norm": 0.80859375, + "learning_rate": 4.3835839135999425e-05, + "loss": 0.809, + "step": 1644 + }, + { + "epoch": 0.07212209140114949, + "grad_norm": 0.765625, + "learning_rate": 4.383203014964813e-05, + "loss": 0.8058, + "step": 1645 + }, + { + "epoch": 0.07216593461780672, + "grad_norm": 0.8203125, + "learning_rate": 4.3828221282341654e-05, + "loss": 0.7726, + "step": 1646 + }, + { + "epoch": 0.07220977783446396, + "grad_norm": 0.76953125, + "learning_rate": 4.3824412534088046e-05, + "loss": 0.8217, + "step": 1647 + }, + { + "epoch": 0.0722536210511212, + "grad_norm": 0.875, + "learning_rate": 4.3820603904895374e-05, + "loss": 0.8525, + "step": 1648 + }, + { + "epoch": 0.07229746426777843, + "grad_norm": 0.8046875, + "learning_rate": 4.38167953947717e-05, + "loss": 0.8825, + "step": 1649 + }, + { + "epoch": 0.07234130748443565, + "grad_norm": 0.7734375, + "learning_rate": 4.381298700372515e-05, + "loss": 0.8417, + "step": 1650 + }, + { + "epoch": 0.07238515070109289, + "grad_norm": 0.81640625, + "learning_rate": 4.380917873176376e-05, + "loss": 0.8172, + "step": 1651 + }, + { + "epoch": 0.07242899391775012, + "grad_norm": 0.875, + "learning_rate": 4.38053705788956e-05, + "loss": 0.9377, + "step": 1652 + }, + { + "epoch": 0.07247283713440736, + "grad_norm": 0.80078125, + "learning_rate": 4.380156254512875e-05, + "loss": 0.8184, + "step": 1653 + }, + { + "epoch": 0.0725166803510646, + "grad_norm": 0.87109375, + "learning_rate": 4.379775463047128e-05, + "loss": 0.8365, + "step": 1654 + }, + { + "epoch": 0.07256052356772183, + "grad_norm": 1.171875, + "learning_rate": 4.379394683493125e-05, + "loss": 0.7186, + "step": 1655 + }, + { + "epoch": 0.07260436678437907, + "grad_norm": 0.77734375, + "learning_rate": 4.37901391585167e-05, + "loss": 0.7949, + "step": 1656 + }, + { + "epoch": 0.0726482100010363, + "grad_norm": 0.8046875, + "learning_rate": 4.378633160123577e-05, + "loss": 0.8146, + "step": 1657 + }, + { + "epoch": 0.07269205321769352, + "grad_norm": 0.84375, + "learning_rate": 4.3782524163096515e-05, + "loss": 0.7718, + "step": 1658 + }, + { + "epoch": 0.07273589643435076, + "grad_norm": 0.94140625, + "learning_rate": 4.377871684410697e-05, + "loss": 0.939, + "step": 1659 + }, + { + "epoch": 0.072779739651008, + "grad_norm": 0.88671875, + "learning_rate": 4.377490964427523e-05, + "loss": 0.7441, + "step": 1660 + }, + { + "epoch": 0.07282358286766523, + "grad_norm": 0.8203125, + "learning_rate": 4.3771102563609344e-05, + "loss": 0.8278, + "step": 1661 + }, + { + "epoch": 0.07286742608432246, + "grad_norm": 0.796875, + "learning_rate": 4.3767295602117406e-05, + "loss": 0.7009, + "step": 1662 + }, + { + "epoch": 0.0729112693009797, + "grad_norm": 0.7578125, + "learning_rate": 4.376348875980747e-05, + "loss": 0.8301, + "step": 1663 + }, + { + "epoch": 0.07295511251763694, + "grad_norm": 0.84375, + "learning_rate": 4.3759682036687556e-05, + "loss": 0.8051, + "step": 1664 + }, + { + "epoch": 0.07299895573429416, + "grad_norm": 0.8671875, + "learning_rate": 4.375587543276583e-05, + "loss": 0.8341, + "step": 1665 + }, + { + "epoch": 0.0730427989509514, + "grad_norm": 0.83203125, + "learning_rate": 4.375206894805031e-05, + "loss": 0.818, + "step": 1666 + }, + { + "epoch": 0.07308664216760863, + "grad_norm": 0.859375, + "learning_rate": 4.374826258254906e-05, + "loss": 0.8654, + "step": 1667 + }, + { + "epoch": 0.07313048538426586, + "grad_norm": 0.8984375, + "learning_rate": 4.3744456336270156e-05, + "loss": 0.8304, + "step": 1668 + }, + { + "epoch": 0.0731743286009231, + "grad_norm": 0.87109375, + "learning_rate": 4.374065020922162e-05, + "loss": 0.9295, + "step": 1669 + }, + { + "epoch": 0.07321817181758034, + "grad_norm": 0.87109375, + "learning_rate": 4.37368442014116e-05, + "loss": 0.7926, + "step": 1670 + }, + { + "epoch": 0.07326201503423757, + "grad_norm": 0.8203125, + "learning_rate": 4.3733038312848116e-05, + "loss": 0.8586, + "step": 1671 + }, + { + "epoch": 0.0733058582508948, + "grad_norm": 0.85546875, + "learning_rate": 4.3729232543539256e-05, + "loss": 0.8622, + "step": 1672 + }, + { + "epoch": 0.07334970146755203, + "grad_norm": 0.98046875, + "learning_rate": 4.372542689349306e-05, + "loss": 0.8833, + "step": 1673 + }, + { + "epoch": 0.07339354468420926, + "grad_norm": 0.77734375, + "learning_rate": 4.372162136271757e-05, + "loss": 0.7282, + "step": 1674 + }, + { + "epoch": 0.0734373879008665, + "grad_norm": 1.4375, + "learning_rate": 4.371781595122093e-05, + "loss": 0.8399, + "step": 1675 + }, + { + "epoch": 0.07348123111752373, + "grad_norm": 0.91015625, + "learning_rate": 4.371401065901115e-05, + "loss": 0.8426, + "step": 1676 + }, + { + "epoch": 0.07352507433418097, + "grad_norm": 0.84375, + "learning_rate": 4.3710205486096326e-05, + "loss": 0.8012, + "step": 1677 + }, + { + "epoch": 0.0735689175508382, + "grad_norm": 0.80078125, + "learning_rate": 4.3706400432484485e-05, + "loss": 0.8814, + "step": 1678 + }, + { + "epoch": 0.07361276076749544, + "grad_norm": 0.87890625, + "learning_rate": 4.3702595498183685e-05, + "loss": 0.7854, + "step": 1679 + }, + { + "epoch": 0.07365660398415266, + "grad_norm": 0.8359375, + "learning_rate": 4.369879068320205e-05, + "loss": 0.7385, + "step": 1680 + }, + { + "epoch": 0.0737004472008099, + "grad_norm": 0.8203125, + "learning_rate": 4.369498598754761e-05, + "loss": 0.8237, + "step": 1681 + }, + { + "epoch": 0.07374429041746713, + "grad_norm": 1.359375, + "learning_rate": 4.369118141122842e-05, + "loss": 0.7678, + "step": 1682 + }, + { + "epoch": 0.07378813363412437, + "grad_norm": 0.87109375, + "learning_rate": 4.3687376954252565e-05, + "loss": 0.8684, + "step": 1683 + }, + { + "epoch": 0.0738319768507816, + "grad_norm": 0.9140625, + "learning_rate": 4.368357261662805e-05, + "loss": 0.7175, + "step": 1684 + }, + { + "epoch": 0.07387582006743884, + "grad_norm": 0.890625, + "learning_rate": 4.367976839836303e-05, + "loss": 0.8807, + "step": 1685 + }, + { + "epoch": 0.07391966328409608, + "grad_norm": 0.828125, + "learning_rate": 4.367596429946552e-05, + "loss": 0.7837, + "step": 1686 + }, + { + "epoch": 0.07396350650075331, + "grad_norm": 0.8203125, + "learning_rate": 4.367216031994359e-05, + "loss": 0.7658, + "step": 1687 + }, + { + "epoch": 0.07400734971741053, + "grad_norm": 0.953125, + "learning_rate": 4.366835645980528e-05, + "loss": 0.9278, + "step": 1688 + }, + { + "epoch": 0.07405119293406777, + "grad_norm": 0.86328125, + "learning_rate": 4.366455271905864e-05, + "loss": 0.8371, + "step": 1689 + }, + { + "epoch": 0.074095036150725, + "grad_norm": 0.83203125, + "learning_rate": 4.3660749097711804e-05, + "loss": 0.8539, + "step": 1690 + }, + { + "epoch": 0.07413887936738224, + "grad_norm": 0.80859375, + "learning_rate": 4.365694559577278e-05, + "loss": 0.8422, + "step": 1691 + }, + { + "epoch": 0.07418272258403948, + "grad_norm": 0.8125, + "learning_rate": 4.3653142213249644e-05, + "loss": 0.7968, + "step": 1692 + }, + { + "epoch": 0.07422656580069671, + "grad_norm": 0.7734375, + "learning_rate": 4.364933895015045e-05, + "loss": 0.7286, + "step": 1693 + }, + { + "epoch": 0.07427040901735395, + "grad_norm": 0.8515625, + "learning_rate": 4.364553580648323e-05, + "loss": 0.8421, + "step": 1694 + }, + { + "epoch": 0.07431425223401118, + "grad_norm": 0.86328125, + "learning_rate": 4.364173278225611e-05, + "loss": 0.8064, + "step": 1695 + }, + { + "epoch": 0.0743580954506684, + "grad_norm": 0.8359375, + "learning_rate": 4.363792987747711e-05, + "loss": 0.8586, + "step": 1696 + }, + { + "epoch": 0.07440193866732564, + "grad_norm": 1.03125, + "learning_rate": 4.363412709215431e-05, + "loss": 0.7798, + "step": 1697 + }, + { + "epoch": 0.07444578188398288, + "grad_norm": 0.84375, + "learning_rate": 4.3630324426295745e-05, + "loss": 0.9021, + "step": 1698 + }, + { + "epoch": 0.07448962510064011, + "grad_norm": 0.84765625, + "learning_rate": 4.3626521879909485e-05, + "loss": 0.8399, + "step": 1699 + }, + { + "epoch": 0.07453346831729735, + "grad_norm": 0.9140625, + "learning_rate": 4.362271945300359e-05, + "loss": 1.0688, + "step": 1700 + }, + { + "epoch": 0.07457731153395458, + "grad_norm": 0.875, + "learning_rate": 4.361891714558612e-05, + "loss": 0.7574, + "step": 1701 + }, + { + "epoch": 0.07462115475061182, + "grad_norm": 0.75, + "learning_rate": 4.361511495766514e-05, + "loss": 0.801, + "step": 1702 + }, + { + "epoch": 0.07466499796726904, + "grad_norm": 0.765625, + "learning_rate": 4.361131288924869e-05, + "loss": 0.7058, + "step": 1703 + }, + { + "epoch": 0.07470884118392628, + "grad_norm": 0.82421875, + "learning_rate": 4.360751094034484e-05, + "loss": 0.8405, + "step": 1704 + }, + { + "epoch": 0.07475268440058351, + "grad_norm": 1.7265625, + "learning_rate": 4.360370911096162e-05, + "loss": 0.7743, + "step": 1705 + }, + { + "epoch": 0.07479652761724075, + "grad_norm": 0.86328125, + "learning_rate": 4.359990740110714e-05, + "loss": 0.8664, + "step": 1706 + }, + { + "epoch": 0.07484037083389798, + "grad_norm": 0.8359375, + "learning_rate": 4.3596105810789445e-05, + "loss": 0.7252, + "step": 1707 + }, + { + "epoch": 0.07488421405055522, + "grad_norm": 0.90625, + "learning_rate": 4.359230434001658e-05, + "loss": 0.7552, + "step": 1708 + }, + { + "epoch": 0.07492805726721245, + "grad_norm": 0.8125, + "learning_rate": 4.3588502988796586e-05, + "loss": 0.773, + "step": 1709 + }, + { + "epoch": 0.07497190048386969, + "grad_norm": 0.74609375, + "learning_rate": 4.3584701757137516e-05, + "loss": 0.7211, + "step": 1710 + }, + { + "epoch": 0.07501574370052691, + "grad_norm": 0.89453125, + "learning_rate": 4.358090064504747e-05, + "loss": 0.8742, + "step": 1711 + }, + { + "epoch": 0.07505958691718415, + "grad_norm": 0.90625, + "learning_rate": 4.357709965253448e-05, + "loss": 0.8185, + "step": 1712 + }, + { + "epoch": 0.07510343013384138, + "grad_norm": 0.80078125, + "learning_rate": 4.357329877960662e-05, + "loss": 0.7982, + "step": 1713 + }, + { + "epoch": 0.07514727335049862, + "grad_norm": 0.80078125, + "learning_rate": 4.356949802627192e-05, + "loss": 0.8351, + "step": 1714 + }, + { + "epoch": 0.07519111656715585, + "grad_norm": 0.78515625, + "learning_rate": 4.35656973925384e-05, + "loss": 0.9703, + "step": 1715 + }, + { + "epoch": 0.07523495978381309, + "grad_norm": 0.86328125, + "learning_rate": 4.3561896878414186e-05, + "loss": 0.8326, + "step": 1716 + }, + { + "epoch": 0.07527880300047032, + "grad_norm": 1.1640625, + "learning_rate": 4.3558096483907315e-05, + "loss": 0.7554, + "step": 1717 + }, + { + "epoch": 0.07532264621712755, + "grad_norm": 0.78125, + "learning_rate": 4.3554296209025844e-05, + "loss": 0.7196, + "step": 1718 + }, + { + "epoch": 0.07536648943378478, + "grad_norm": 0.7734375, + "learning_rate": 4.355049605377781e-05, + "loss": 0.7242, + "step": 1719 + }, + { + "epoch": 0.07541033265044202, + "grad_norm": 0.77734375, + "learning_rate": 4.354669601817123e-05, + "loss": 0.5938, + "step": 1720 + }, + { + "epoch": 0.07545417586709925, + "grad_norm": 0.91015625, + "learning_rate": 4.354289610221424e-05, + "loss": 0.7982, + "step": 1721 + }, + { + "epoch": 0.07549801908375649, + "grad_norm": 0.78515625, + "learning_rate": 4.353909630591486e-05, + "loss": 0.8411, + "step": 1722 + }, + { + "epoch": 0.07554186230041372, + "grad_norm": 0.78515625, + "learning_rate": 4.353529662928113e-05, + "loss": 0.8512, + "step": 1723 + }, + { + "epoch": 0.07558570551707096, + "grad_norm": 0.88671875, + "learning_rate": 4.3531497072321116e-05, + "loss": 0.939, + "step": 1724 + }, + { + "epoch": 0.0756295487337282, + "grad_norm": 0.79296875, + "learning_rate": 4.352769763504283e-05, + "loss": 0.7884, + "step": 1725 + }, + { + "epoch": 0.07567339195038542, + "grad_norm": 0.84765625, + "learning_rate": 4.35238983174544e-05, + "loss": 0.9339, + "step": 1726 + }, + { + "epoch": 0.07571723516704265, + "grad_norm": 0.828125, + "learning_rate": 4.352009911956384e-05, + "loss": 0.7689, + "step": 1727 + }, + { + "epoch": 0.07576107838369989, + "grad_norm": 0.79296875, + "learning_rate": 4.35163000413792e-05, + "loss": 0.8729, + "step": 1728 + }, + { + "epoch": 0.07580492160035712, + "grad_norm": 0.8984375, + "learning_rate": 4.351250108290853e-05, + "loss": 0.8011, + "step": 1729 + }, + { + "epoch": 0.07584876481701436, + "grad_norm": 0.9375, + "learning_rate": 4.350870224415985e-05, + "loss": 0.8782, + "step": 1730 + }, + { + "epoch": 0.0758926080336716, + "grad_norm": 0.8125, + "learning_rate": 4.350490352514128e-05, + "loss": 0.9458, + "step": 1731 + }, + { + "epoch": 0.07593645125032883, + "grad_norm": 0.8515625, + "learning_rate": 4.350110492586085e-05, + "loss": 0.8517, + "step": 1732 + }, + { + "epoch": 0.07598029446698606, + "grad_norm": 0.75390625, + "learning_rate": 4.349730644632658e-05, + "loss": 0.7099, + "step": 1733 + }, + { + "epoch": 0.07602413768364329, + "grad_norm": 0.87890625, + "learning_rate": 4.3493508086546555e-05, + "loss": 0.7631, + "step": 1734 + }, + { + "epoch": 0.07606798090030052, + "grad_norm": 0.83203125, + "learning_rate": 4.348970984652877e-05, + "loss": 0.8417, + "step": 1735 + }, + { + "epoch": 0.07611182411695776, + "grad_norm": 0.7890625, + "learning_rate": 4.3485911726281346e-05, + "loss": 0.8466, + "step": 1736 + }, + { + "epoch": 0.076155667333615, + "grad_norm": 0.83203125, + "learning_rate": 4.348211372581231e-05, + "loss": 0.7781, + "step": 1737 + }, + { + "epoch": 0.07619951055027223, + "grad_norm": 1.046875, + "learning_rate": 4.34783158451297e-05, + "loss": 0.834, + "step": 1738 + }, + { + "epoch": 0.07624335376692946, + "grad_norm": 0.78125, + "learning_rate": 4.347451808424157e-05, + "loss": 0.9869, + "step": 1739 + }, + { + "epoch": 0.0762871969835867, + "grad_norm": 0.8203125, + "learning_rate": 4.347072044315597e-05, + "loss": 0.7226, + "step": 1740 + }, + { + "epoch": 0.07633104020024392, + "grad_norm": 0.83203125, + "learning_rate": 4.3466922921880935e-05, + "loss": 0.8523, + "step": 1741 + }, + { + "epoch": 0.07637488341690116, + "grad_norm": 0.90625, + "learning_rate": 4.34631255204245e-05, + "loss": 0.7418, + "step": 1742 + }, + { + "epoch": 0.07641872663355839, + "grad_norm": 0.80078125, + "learning_rate": 4.345932823879477e-05, + "loss": 0.6869, + "step": 1743 + }, + { + "epoch": 0.07646256985021563, + "grad_norm": 0.8046875, + "learning_rate": 4.345553107699977e-05, + "loss": 0.8514, + "step": 1744 + }, + { + "epoch": 0.07650641306687286, + "grad_norm": 0.8515625, + "learning_rate": 4.345173403504753e-05, + "loss": 0.8141, + "step": 1745 + }, + { + "epoch": 0.0765502562835301, + "grad_norm": 0.828125, + "learning_rate": 4.344793711294611e-05, + "loss": 0.8919, + "step": 1746 + }, + { + "epoch": 0.07659409950018733, + "grad_norm": 0.83203125, + "learning_rate": 4.344414031070356e-05, + "loss": 0.884, + "step": 1747 + }, + { + "epoch": 0.07663794271684457, + "grad_norm": 0.97265625, + "learning_rate": 4.344034362832793e-05, + "loss": 0.8149, + "step": 1748 + }, + { + "epoch": 0.07668178593350179, + "grad_norm": 0.83203125, + "learning_rate": 4.343654706582724e-05, + "loss": 0.7848, + "step": 1749 + }, + { + "epoch": 0.07672562915015903, + "grad_norm": 0.87109375, + "learning_rate": 4.343275062320953e-05, + "loss": 0.8393, + "step": 1750 + }, + { + "epoch": 0.07676947236681626, + "grad_norm": 0.953125, + "learning_rate": 4.3428954300482906e-05, + "loss": 0.9169, + "step": 1751 + }, + { + "epoch": 0.0768133155834735, + "grad_norm": 0.828125, + "learning_rate": 4.342515809765538e-05, + "loss": 0.7723, + "step": 1752 + }, + { + "epoch": 0.07685715880013073, + "grad_norm": 0.79296875, + "learning_rate": 4.3421362014735e-05, + "loss": 0.8094, + "step": 1753 + }, + { + "epoch": 0.07690100201678797, + "grad_norm": 0.82421875, + "learning_rate": 4.341756605172981e-05, + "loss": 0.8896, + "step": 1754 + }, + { + "epoch": 0.0769448452334452, + "grad_norm": 0.87890625, + "learning_rate": 4.341377020864781e-05, + "loss": 0.8721, + "step": 1755 + }, + { + "epoch": 0.07698868845010243, + "grad_norm": 0.75390625, + "learning_rate": 4.340997448549713e-05, + "loss": 0.7941, + "step": 1756 + }, + { + "epoch": 0.07703253166675966, + "grad_norm": 0.73828125, + "learning_rate": 4.340617888228577e-05, + "loss": 0.7412, + "step": 1757 + }, + { + "epoch": 0.0770763748834169, + "grad_norm": 0.81640625, + "learning_rate": 4.3402383399021795e-05, + "loss": 0.7761, + "step": 1758 + }, + { + "epoch": 0.07712021810007413, + "grad_norm": 2.0, + "learning_rate": 4.3398588035713215e-05, + "loss": 0.834, + "step": 1759 + }, + { + "epoch": 0.07716406131673137, + "grad_norm": 0.8046875, + "learning_rate": 4.339479279236807e-05, + "loss": 0.8895, + "step": 1760 + }, + { + "epoch": 0.0772079045333886, + "grad_norm": 0.7734375, + "learning_rate": 4.339099766899446e-05, + "loss": 0.7907, + "step": 1761 + }, + { + "epoch": 0.07725174775004584, + "grad_norm": 0.79296875, + "learning_rate": 4.338720266560039e-05, + "loss": 0.8352, + "step": 1762 + }, + { + "epoch": 0.07729559096670308, + "grad_norm": 0.85546875, + "learning_rate": 4.3383407782193905e-05, + "loss": 0.8552, + "step": 1763 + }, + { + "epoch": 0.0773394341833603, + "grad_norm": 0.7421875, + "learning_rate": 4.337961301878306e-05, + "loss": 0.738, + "step": 1764 + }, + { + "epoch": 0.07738327740001753, + "grad_norm": 1.03125, + "learning_rate": 4.3375818375375846e-05, + "loss": 0.8148, + "step": 1765 + }, + { + "epoch": 0.07742712061667477, + "grad_norm": 0.88671875, + "learning_rate": 4.337202385198039e-05, + "loss": 0.9363, + "step": 1766 + }, + { + "epoch": 0.077470963833332, + "grad_norm": 0.76171875, + "learning_rate": 4.336822944860469e-05, + "loss": 0.742, + "step": 1767 + }, + { + "epoch": 0.07751480704998924, + "grad_norm": 0.87109375, + "learning_rate": 4.336443516525679e-05, + "loss": 0.8762, + "step": 1768 + }, + { + "epoch": 0.07755865026664648, + "grad_norm": 1.046875, + "learning_rate": 4.336064100194473e-05, + "loss": 0.8264, + "step": 1769 + }, + { + "epoch": 0.07760249348330371, + "grad_norm": 0.8203125, + "learning_rate": 4.335684695867652e-05, + "loss": 0.8553, + "step": 1770 + }, + { + "epoch": 0.07764633669996093, + "grad_norm": 0.79296875, + "learning_rate": 4.335305303546028e-05, + "loss": 0.826, + "step": 1771 + }, + { + "epoch": 0.07769017991661817, + "grad_norm": 0.78125, + "learning_rate": 4.334925923230399e-05, + "loss": 0.7751, + "step": 1772 + }, + { + "epoch": 0.0777340231332754, + "grad_norm": 0.83984375, + "learning_rate": 4.334546554921573e-05, + "loss": 0.7981, + "step": 1773 + }, + { + "epoch": 0.07777786634993264, + "grad_norm": 0.8515625, + "learning_rate": 4.33416719862035e-05, + "loss": 0.8977, + "step": 1774 + }, + { + "epoch": 0.07782170956658988, + "grad_norm": 0.75390625, + "learning_rate": 4.333787854327532e-05, + "loss": 0.85, + "step": 1775 + }, + { + "epoch": 0.07786555278324711, + "grad_norm": 0.84765625, + "learning_rate": 4.3334085220439316e-05, + "loss": 0.9356, + "step": 1776 + }, + { + "epoch": 0.07790939599990435, + "grad_norm": 0.79296875, + "learning_rate": 4.333029201770347e-05, + "loss": 0.8145, + "step": 1777 + }, + { + "epoch": 0.07795323921656158, + "grad_norm": 0.86328125, + "learning_rate": 4.332649893507583e-05, + "loss": 0.8208, + "step": 1778 + }, + { + "epoch": 0.0779970824332188, + "grad_norm": 0.8828125, + "learning_rate": 4.332270597256445e-05, + "loss": 0.9383, + "step": 1779 + }, + { + "epoch": 0.07804092564987604, + "grad_norm": 0.84765625, + "learning_rate": 4.331891313017731e-05, + "loss": 0.962, + "step": 1780 + }, + { + "epoch": 0.07808476886653327, + "grad_norm": 0.88671875, + "learning_rate": 4.3315120407922525e-05, + "loss": 0.8013, + "step": 1781 + }, + { + "epoch": 0.07812861208319051, + "grad_norm": 0.74609375, + "learning_rate": 4.331132780580811e-05, + "loss": 0.7906, + "step": 1782 + }, + { + "epoch": 0.07817245529984775, + "grad_norm": 0.875, + "learning_rate": 4.330753532384208e-05, + "loss": 0.7289, + "step": 1783 + }, + { + "epoch": 0.07821629851650498, + "grad_norm": 0.95703125, + "learning_rate": 4.3303742962032515e-05, + "loss": 1.0153, + "step": 1784 + }, + { + "epoch": 0.07826014173316222, + "grad_norm": 0.84765625, + "learning_rate": 4.3299950720387405e-05, + "loss": 0.9166, + "step": 1785 + }, + { + "epoch": 0.07830398494981945, + "grad_norm": 0.84375, + "learning_rate": 4.329615859891482e-05, + "loss": 0.7704, + "step": 1786 + }, + { + "epoch": 0.07834782816647667, + "grad_norm": 0.82421875, + "learning_rate": 4.329236659762278e-05, + "loss": 0.7244, + "step": 1787 + }, + { + "epoch": 0.07839167138313391, + "grad_norm": 0.94140625, + "learning_rate": 4.3288574716519325e-05, + "loss": 0.77, + "step": 1788 + }, + { + "epoch": 0.07843551459979115, + "grad_norm": 0.828125, + "learning_rate": 4.32847829556125e-05, + "loss": 0.9248, + "step": 1789 + }, + { + "epoch": 0.07847935781644838, + "grad_norm": 0.90234375, + "learning_rate": 4.32809913149103e-05, + "loss": 0.7406, + "step": 1790 + }, + { + "epoch": 0.07852320103310562, + "grad_norm": 0.87890625, + "learning_rate": 4.327719979442083e-05, + "loss": 0.8077, + "step": 1791 + }, + { + "epoch": 0.07856704424976285, + "grad_norm": 0.83203125, + "learning_rate": 4.327340839415209e-05, + "loss": 0.9434, + "step": 1792 + }, + { + "epoch": 0.07861088746642009, + "grad_norm": 0.8203125, + "learning_rate": 4.3269617114112126e-05, + "loss": 0.8456, + "step": 1793 + }, + { + "epoch": 0.07865473068307731, + "grad_norm": 0.875, + "learning_rate": 4.3265825954308955e-05, + "loss": 0.8662, + "step": 1794 + }, + { + "epoch": 0.07869857389973454, + "grad_norm": 0.87109375, + "learning_rate": 4.32620349147506e-05, + "loss": 0.7614, + "step": 1795 + }, + { + "epoch": 0.07874241711639178, + "grad_norm": 0.78515625, + "learning_rate": 4.325824399544516e-05, + "loss": 0.8938, + "step": 1796 + }, + { + "epoch": 0.07878626033304902, + "grad_norm": 0.80859375, + "learning_rate": 4.325445319640061e-05, + "loss": 0.7825, + "step": 1797 + }, + { + "epoch": 0.07883010354970625, + "grad_norm": 0.859375, + "learning_rate": 4.325066251762502e-05, + "loss": 0.7781, + "step": 1798 + }, + { + "epoch": 0.07887394676636349, + "grad_norm": 0.7734375, + "learning_rate": 4.324687195912639e-05, + "loss": 0.7955, + "step": 1799 + }, + { + "epoch": 0.07891778998302072, + "grad_norm": 0.78515625, + "learning_rate": 4.324308152091274e-05, + "loss": 0.7987, + "step": 1800 + }, + { + "epoch": 0.07896163319967796, + "grad_norm": 0.78515625, + "learning_rate": 4.3239291202992184e-05, + "loss": 0.726, + "step": 1801 + }, + { + "epoch": 0.07900547641633518, + "grad_norm": 0.88671875, + "learning_rate": 4.323550100537269e-05, + "loss": 0.8217, + "step": 1802 + }, + { + "epoch": 0.07904931963299242, + "grad_norm": 0.87109375, + "learning_rate": 4.323171092806232e-05, + "loss": 0.8105, + "step": 1803 + }, + { + "epoch": 0.07909316284964965, + "grad_norm": 0.828125, + "learning_rate": 4.322792097106908e-05, + "loss": 0.8565, + "step": 1804 + }, + { + "epoch": 0.07913700606630689, + "grad_norm": 0.87890625, + "learning_rate": 4.322413113440099e-05, + "loss": 0.8459, + "step": 1805 + }, + { + "epoch": 0.07918084928296412, + "grad_norm": 0.84375, + "learning_rate": 4.3220341418066144e-05, + "loss": 0.8744, + "step": 1806 + }, + { + "epoch": 0.07922469249962136, + "grad_norm": 0.890625, + "learning_rate": 4.3216551822072534e-05, + "loss": 0.9505, + "step": 1807 + }, + { + "epoch": 0.0792685357162786, + "grad_norm": 0.74609375, + "learning_rate": 4.3212762346428205e-05, + "loss": 0.699, + "step": 1808 + }, + { + "epoch": 0.07931237893293581, + "grad_norm": 0.78515625, + "learning_rate": 4.320897299114117e-05, + "loss": 0.8259, + "step": 1809 + }, + { + "epoch": 0.07935622214959305, + "grad_norm": 0.89453125, + "learning_rate": 4.3205183756219446e-05, + "loss": 0.8974, + "step": 1810 + }, + { + "epoch": 0.07940006536625029, + "grad_norm": 0.8515625, + "learning_rate": 4.3201394641671123e-05, + "loss": 0.8737, + "step": 1811 + }, + { + "epoch": 0.07944390858290752, + "grad_norm": 0.88671875, + "learning_rate": 4.3197605647504194e-05, + "loss": 1.0931, + "step": 1812 + }, + { + "epoch": 0.07948775179956476, + "grad_norm": 0.875, + "learning_rate": 4.3193816773726694e-05, + "loss": 0.7881, + "step": 1813 + }, + { + "epoch": 0.07953159501622199, + "grad_norm": 0.89453125, + "learning_rate": 4.3190028020346654e-05, + "loss": 0.801, + "step": 1814 + }, + { + "epoch": 0.07957543823287923, + "grad_norm": 0.91015625, + "learning_rate": 4.318623938737206e-05, + "loss": 0.878, + "step": 1815 + }, + { + "epoch": 0.07961928144953646, + "grad_norm": 0.76171875, + "learning_rate": 4.318245087481103e-05, + "loss": 0.793, + "step": 1816 + }, + { + "epoch": 0.07966312466619369, + "grad_norm": 0.81640625, + "learning_rate": 4.317866248267154e-05, + "loss": 0.8508, + "step": 1817 + }, + { + "epoch": 0.07970696788285092, + "grad_norm": 0.89453125, + "learning_rate": 4.317487421096163e-05, + "loss": 0.976, + "step": 1818 + }, + { + "epoch": 0.07975081109950816, + "grad_norm": 0.828125, + "learning_rate": 4.317108605968933e-05, + "loss": 0.8247, + "step": 1819 + }, + { + "epoch": 0.07979465431616539, + "grad_norm": 0.83203125, + "learning_rate": 4.3167298028862624e-05, + "loss": 0.847, + "step": 1820 + }, + { + "epoch": 0.07983849753282263, + "grad_norm": 0.83203125, + "learning_rate": 4.316351011848961e-05, + "loss": 0.8069, + "step": 1821 + }, + { + "epoch": 0.07988234074947986, + "grad_norm": 0.86328125, + "learning_rate": 4.31597223285783e-05, + "loss": 0.884, + "step": 1822 + }, + { + "epoch": 0.0799261839661371, + "grad_norm": 0.7890625, + "learning_rate": 4.315593465913671e-05, + "loss": 0.7151, + "step": 1823 + }, + { + "epoch": 0.07997002718279433, + "grad_norm": 0.8359375, + "learning_rate": 4.3152147110172855e-05, + "loss": 0.8387, + "step": 1824 + }, + { + "epoch": 0.08001387039945156, + "grad_norm": 0.84375, + "learning_rate": 4.314835968169478e-05, + "loss": 0.8423, + "step": 1825 + }, + { + "epoch": 0.08005771361610879, + "grad_norm": 0.87890625, + "learning_rate": 4.314457237371051e-05, + "loss": 0.8303, + "step": 1826 + }, + { + "epoch": 0.08010155683276603, + "grad_norm": 0.78125, + "learning_rate": 4.314078518622804e-05, + "loss": 0.8709, + "step": 1827 + }, + { + "epoch": 0.08014540004942326, + "grad_norm": 0.78125, + "learning_rate": 4.3136998119255454e-05, + "loss": 0.8206, + "step": 1828 + }, + { + "epoch": 0.0801892432660805, + "grad_norm": 0.9609375, + "learning_rate": 4.3133211172800744e-05, + "loss": 0.9009, + "step": 1829 + }, + { + "epoch": 0.08023308648273773, + "grad_norm": 0.8828125, + "learning_rate": 4.312942434687196e-05, + "loss": 0.736, + "step": 1830 + }, + { + "epoch": 0.08027692969939497, + "grad_norm": 0.890625, + "learning_rate": 4.31256376414771e-05, + "loss": 0.9512, + "step": 1831 + }, + { + "epoch": 0.08032077291605219, + "grad_norm": 0.76953125, + "learning_rate": 4.31218510566242e-05, + "loss": 0.7228, + "step": 1832 + }, + { + "epoch": 0.08036461613270943, + "grad_norm": 0.76953125, + "learning_rate": 4.311806459232128e-05, + "loss": 0.7857, + "step": 1833 + }, + { + "epoch": 0.08040845934936666, + "grad_norm": 0.77734375, + "learning_rate": 4.311427824857639e-05, + "loss": 0.943, + "step": 1834 + }, + { + "epoch": 0.0804523025660239, + "grad_norm": 0.83984375, + "learning_rate": 4.311049202539749e-05, + "loss": 0.878, + "step": 1835 + }, + { + "epoch": 0.08049614578268113, + "grad_norm": 0.765625, + "learning_rate": 4.310670592279268e-05, + "loss": 0.7661, + "step": 1836 + }, + { + "epoch": 0.08053998899933837, + "grad_norm": 0.75, + "learning_rate": 4.310291994076996e-05, + "loss": 0.7538, + "step": 1837 + }, + { + "epoch": 0.0805838322159956, + "grad_norm": 0.81640625, + "learning_rate": 4.309913407933734e-05, + "loss": 0.9319, + "step": 1838 + }, + { + "epoch": 0.08062767543265284, + "grad_norm": 0.8125, + "learning_rate": 4.309534833850286e-05, + "loss": 0.9217, + "step": 1839 + }, + { + "epoch": 0.08067151864931006, + "grad_norm": 0.7890625, + "learning_rate": 4.30915627182745e-05, + "loss": 0.9549, + "step": 1840 + }, + { + "epoch": 0.0807153618659673, + "grad_norm": 0.8203125, + "learning_rate": 4.3087777218660364e-05, + "loss": 0.815, + "step": 1841 + }, + { + "epoch": 0.08075920508262453, + "grad_norm": 0.78125, + "learning_rate": 4.308399183966843e-05, + "loss": 0.7697, + "step": 1842 + }, + { + "epoch": 0.08080304829928177, + "grad_norm": 0.8515625, + "learning_rate": 4.3080206581306715e-05, + "loss": 0.8313, + "step": 1843 + }, + { + "epoch": 0.080846891515939, + "grad_norm": 0.94140625, + "learning_rate": 4.307642144358325e-05, + "loss": 0.8819, + "step": 1844 + }, + { + "epoch": 0.08089073473259624, + "grad_norm": 0.8046875, + "learning_rate": 4.307263642650605e-05, + "loss": 0.8339, + "step": 1845 + }, + { + "epoch": 0.08093457794925348, + "grad_norm": 0.796875, + "learning_rate": 4.306885153008312e-05, + "loss": 0.8294, + "step": 1846 + }, + { + "epoch": 0.0809784211659107, + "grad_norm": 0.76171875, + "learning_rate": 4.306506675432254e-05, + "loss": 0.9287, + "step": 1847 + }, + { + "epoch": 0.08102226438256793, + "grad_norm": 0.76171875, + "learning_rate": 4.306128209923228e-05, + "loss": 0.8079, + "step": 1848 + }, + { + "epoch": 0.08106610759922517, + "grad_norm": 0.78125, + "learning_rate": 4.305749756482039e-05, + "loss": 0.7482, + "step": 1849 + }, + { + "epoch": 0.0811099508158824, + "grad_norm": 0.83984375, + "learning_rate": 4.305371315109489e-05, + "loss": 0.7977, + "step": 1850 + }, + { + "epoch": 0.08115379403253964, + "grad_norm": 0.90234375, + "learning_rate": 4.3049928858063735e-05, + "loss": 0.7898, + "step": 1851 + }, + { + "epoch": 0.08119763724919687, + "grad_norm": 0.77734375, + "learning_rate": 4.3046144685735046e-05, + "loss": 0.7471, + "step": 1852 + }, + { + "epoch": 0.08124148046585411, + "grad_norm": 0.76953125, + "learning_rate": 4.304236063411679e-05, + "loss": 0.7802, + "step": 1853 + }, + { + "epoch": 0.08128532368251135, + "grad_norm": 0.79296875, + "learning_rate": 4.303857670321702e-05, + "loss": 0.765, + "step": 1854 + }, + { + "epoch": 0.08132916689916857, + "grad_norm": 0.81640625, + "learning_rate": 4.3034792893043704e-05, + "loss": 0.9269, + "step": 1855 + }, + { + "epoch": 0.0813730101158258, + "grad_norm": 0.87109375, + "learning_rate": 4.303100920360487e-05, + "loss": 0.9397, + "step": 1856 + }, + { + "epoch": 0.08141685333248304, + "grad_norm": 0.80078125, + "learning_rate": 4.302722563490858e-05, + "loss": 0.8252, + "step": 1857 + }, + { + "epoch": 0.08146069654914027, + "grad_norm": 0.75, + "learning_rate": 4.3023442186962825e-05, + "loss": 0.652, + "step": 1858 + }, + { + "epoch": 0.08150453976579751, + "grad_norm": 0.75, + "learning_rate": 4.301965885977563e-05, + "loss": 0.8202, + "step": 1859 + }, + { + "epoch": 0.08154838298245475, + "grad_norm": 0.8125, + "learning_rate": 4.301587565335503e-05, + "loss": 0.8885, + "step": 1860 + }, + { + "epoch": 0.08159222619911198, + "grad_norm": 0.85546875, + "learning_rate": 4.301209256770897e-05, + "loss": 0.9328, + "step": 1861 + }, + { + "epoch": 0.0816360694157692, + "grad_norm": 0.90234375, + "learning_rate": 4.3008309602845564e-05, + "loss": 0.8459, + "step": 1862 + }, + { + "epoch": 0.08167991263242644, + "grad_norm": 0.79296875, + "learning_rate": 4.300452675877279e-05, + "loss": 0.8239, + "step": 1863 + }, + { + "epoch": 0.08172375584908367, + "grad_norm": 0.86328125, + "learning_rate": 4.3000744035498666e-05, + "loss": 1.0151, + "step": 1864 + }, + { + "epoch": 0.08176759906574091, + "grad_norm": 0.80078125, + "learning_rate": 4.29969614330312e-05, + "loss": 0.9201, + "step": 1865 + }, + { + "epoch": 0.08181144228239814, + "grad_norm": 0.8671875, + "learning_rate": 4.2993178951378374e-05, + "loss": 0.8629, + "step": 1866 + }, + { + "epoch": 0.08185528549905538, + "grad_norm": 0.78125, + "learning_rate": 4.298939659054829e-05, + "loss": 0.8031, + "step": 1867 + }, + { + "epoch": 0.08189912871571262, + "grad_norm": 0.875, + "learning_rate": 4.298561435054893e-05, + "loss": 0.8796, + "step": 1868 + }, + { + "epoch": 0.08194297193236985, + "grad_norm": 0.80859375, + "learning_rate": 4.2981832231388296e-05, + "loss": 0.695, + "step": 1869 + }, + { + "epoch": 0.08198681514902707, + "grad_norm": 0.8046875, + "learning_rate": 4.297805023307441e-05, + "loss": 0.9063, + "step": 1870 + }, + { + "epoch": 0.08203065836568431, + "grad_norm": 0.87890625, + "learning_rate": 4.297426835561529e-05, + "loss": 0.7053, + "step": 1871 + }, + { + "epoch": 0.08207450158234154, + "grad_norm": 0.78515625, + "learning_rate": 4.2970486599018935e-05, + "loss": 0.757, + "step": 1872 + }, + { + "epoch": 0.08211834479899878, + "grad_norm": 0.80859375, + "learning_rate": 4.2966704963293384e-05, + "loss": 0.7794, + "step": 1873 + }, + { + "epoch": 0.08216218801565602, + "grad_norm": 0.88671875, + "learning_rate": 4.296292344844664e-05, + "loss": 0.87, + "step": 1874 + }, + { + "epoch": 0.08220603123231325, + "grad_norm": 0.82421875, + "learning_rate": 4.2959142054486724e-05, + "loss": 0.8696, + "step": 1875 + }, + { + "epoch": 0.08224987444897049, + "grad_norm": 0.8046875, + "learning_rate": 4.295536078142161e-05, + "loss": 0.7627, + "step": 1876 + }, + { + "epoch": 0.08229371766562772, + "grad_norm": 0.890625, + "learning_rate": 4.2951579629259374e-05, + "loss": 0.7983, + "step": 1877 + }, + { + "epoch": 0.08233756088228494, + "grad_norm": 0.828125, + "learning_rate": 4.2947798598008016e-05, + "loss": 0.8516, + "step": 1878 + }, + { + "epoch": 0.08238140409894218, + "grad_norm": 0.84765625, + "learning_rate": 4.294401768767553e-05, + "loss": 0.7417, + "step": 1879 + }, + { + "epoch": 0.08242524731559941, + "grad_norm": 0.859375, + "learning_rate": 4.294023689826995e-05, + "loss": 0.8304, + "step": 1880 + }, + { + "epoch": 0.08246909053225665, + "grad_norm": 0.84765625, + "learning_rate": 4.293645622979923e-05, + "loss": 0.8748, + "step": 1881 + }, + { + "epoch": 0.08251293374891389, + "grad_norm": 0.8671875, + "learning_rate": 4.293267568227146e-05, + "loss": 0.794, + "step": 1882 + }, + { + "epoch": 0.08255677696557112, + "grad_norm": 0.79296875, + "learning_rate": 4.292889525569464e-05, + "loss": 0.7083, + "step": 1883 + }, + { + "epoch": 0.08260062018222836, + "grad_norm": 0.81640625, + "learning_rate": 4.292511495007675e-05, + "loss": 0.701, + "step": 1884 + }, + { + "epoch": 0.08264446339888558, + "grad_norm": 0.76953125, + "learning_rate": 4.2921334765425814e-05, + "loss": 0.852, + "step": 1885 + }, + { + "epoch": 0.08268830661554281, + "grad_norm": 0.796875, + "learning_rate": 4.291755470174982e-05, + "loss": 0.8169, + "step": 1886 + }, + { + "epoch": 0.08273214983220005, + "grad_norm": 0.80078125, + "learning_rate": 4.291377475905684e-05, + "loss": 0.808, + "step": 1887 + }, + { + "epoch": 0.08277599304885729, + "grad_norm": 0.88671875, + "learning_rate": 4.2909994937354846e-05, + "loss": 0.7349, + "step": 1888 + }, + { + "epoch": 0.08281983626551452, + "grad_norm": 0.87109375, + "learning_rate": 4.290621523665186e-05, + "loss": 0.7739, + "step": 1889 + }, + { + "epoch": 0.08286367948217176, + "grad_norm": 0.84375, + "learning_rate": 4.290243565695589e-05, + "loss": 0.8516, + "step": 1890 + }, + { + "epoch": 0.08290752269882899, + "grad_norm": 0.91796875, + "learning_rate": 4.28986561982749e-05, + "loss": 0.8149, + "step": 1891 + }, + { + "epoch": 0.08295136591548623, + "grad_norm": 0.796875, + "learning_rate": 4.289487686061699e-05, + "loss": 0.8292, + "step": 1892 + }, + { + "epoch": 0.08299520913214345, + "grad_norm": 0.83203125, + "learning_rate": 4.289109764399011e-05, + "loss": 0.8692, + "step": 1893 + }, + { + "epoch": 0.08303905234880068, + "grad_norm": 0.8984375, + "learning_rate": 4.28873185484023e-05, + "loss": 0.7519, + "step": 1894 + }, + { + "epoch": 0.08308289556545792, + "grad_norm": 0.83203125, + "learning_rate": 4.288353957386154e-05, + "loss": 0.9545, + "step": 1895 + }, + { + "epoch": 0.08312673878211516, + "grad_norm": 0.859375, + "learning_rate": 4.2879760720375826e-05, + "loss": 0.9213, + "step": 1896 + }, + { + "epoch": 0.08317058199877239, + "grad_norm": 0.90234375, + "learning_rate": 4.287598198795323e-05, + "loss": 0.753, + "step": 1897 + }, + { + "epoch": 0.08321442521542963, + "grad_norm": 0.81640625, + "learning_rate": 4.2872203376601725e-05, + "loss": 0.763, + "step": 1898 + }, + { + "epoch": 0.08325826843208686, + "grad_norm": 0.875, + "learning_rate": 4.2868424886329316e-05, + "loss": 0.8551, + "step": 1899 + }, + { + "epoch": 0.08330211164874408, + "grad_norm": 0.81640625, + "learning_rate": 4.286464651714402e-05, + "loss": 0.7311, + "step": 1900 + }, + { + "epoch": 0.08334595486540132, + "grad_norm": 0.81640625, + "learning_rate": 4.28608682690538e-05, + "loss": 0.8493, + "step": 1901 + }, + { + "epoch": 0.08338979808205856, + "grad_norm": 0.8046875, + "learning_rate": 4.2857090142066744e-05, + "loss": 0.7985, + "step": 1902 + }, + { + "epoch": 0.08343364129871579, + "grad_norm": 0.85546875, + "learning_rate": 4.2853312136190814e-05, + "loss": 0.868, + "step": 1903 + }, + { + "epoch": 0.08347748451537303, + "grad_norm": 0.87109375, + "learning_rate": 4.284953425143403e-05, + "loss": 0.8965, + "step": 1904 + }, + { + "epoch": 0.08352132773203026, + "grad_norm": 0.94921875, + "learning_rate": 4.284575648780439e-05, + "loss": 0.8513, + "step": 1905 + }, + { + "epoch": 0.0835651709486875, + "grad_norm": 0.73046875, + "learning_rate": 4.284197884530986e-05, + "loss": 0.6729, + "step": 1906 + }, + { + "epoch": 0.08360901416534473, + "grad_norm": 0.796875, + "learning_rate": 4.283820132395853e-05, + "loss": 0.8428, + "step": 1907 + }, + { + "epoch": 0.08365285738200195, + "grad_norm": 0.78515625, + "learning_rate": 4.283442392375836e-05, + "loss": 0.8432, + "step": 1908 + }, + { + "epoch": 0.08369670059865919, + "grad_norm": 0.8125, + "learning_rate": 4.283064664471737e-05, + "loss": 0.8432, + "step": 1909 + }, + { + "epoch": 0.08374054381531643, + "grad_norm": 0.79296875, + "learning_rate": 4.282686948684355e-05, + "loss": 0.7744, + "step": 1910 + }, + { + "epoch": 0.08378438703197366, + "grad_norm": 0.83984375, + "learning_rate": 4.282309245014492e-05, + "loss": 0.8757, + "step": 1911 + }, + { + "epoch": 0.0838282302486309, + "grad_norm": 0.92578125, + "learning_rate": 4.281931553462947e-05, + "loss": 0.963, + "step": 1912 + }, + { + "epoch": 0.08387207346528813, + "grad_norm": 0.79296875, + "learning_rate": 4.281553874030517e-05, + "loss": 0.9407, + "step": 1913 + }, + { + "epoch": 0.08391591668194537, + "grad_norm": 0.890625, + "learning_rate": 4.281176206718012e-05, + "loss": 0.9325, + "step": 1914 + }, + { + "epoch": 0.0839597598986026, + "grad_norm": 0.75, + "learning_rate": 4.2807985515262264e-05, + "loss": 0.7047, + "step": 1915 + }, + { + "epoch": 0.08400360311525983, + "grad_norm": 0.75390625, + "learning_rate": 4.28042090845596e-05, + "loss": 0.7737, + "step": 1916 + }, + { + "epoch": 0.08404744633191706, + "grad_norm": 0.8828125, + "learning_rate": 4.280043277508016e-05, + "loss": 0.776, + "step": 1917 + }, + { + "epoch": 0.0840912895485743, + "grad_norm": 0.87109375, + "learning_rate": 4.279665658683193e-05, + "loss": 0.8891, + "step": 1918 + }, + { + "epoch": 0.08413513276523153, + "grad_norm": 0.77734375, + "learning_rate": 4.279288051982292e-05, + "loss": 0.7231, + "step": 1919 + }, + { + "epoch": 0.08417897598188877, + "grad_norm": 0.8515625, + "learning_rate": 4.2789104574061134e-05, + "loss": 0.8774, + "step": 1920 + }, + { + "epoch": 0.084222819198546, + "grad_norm": 0.78125, + "learning_rate": 4.278532874955452e-05, + "loss": 0.8403, + "step": 1921 + }, + { + "epoch": 0.08426666241520324, + "grad_norm": 0.78515625, + "learning_rate": 4.278155304631118e-05, + "loss": 0.7951, + "step": 1922 + }, + { + "epoch": 0.08431050563186046, + "grad_norm": 0.7421875, + "learning_rate": 4.277777746433906e-05, + "loss": 0.7446, + "step": 1923 + }, + { + "epoch": 0.0843543488485177, + "grad_norm": 0.90234375, + "learning_rate": 4.277400200364617e-05, + "loss": 0.7913, + "step": 1924 + }, + { + "epoch": 0.08439819206517493, + "grad_norm": 0.859375, + "learning_rate": 4.277022666424052e-05, + "loss": 0.7593, + "step": 1925 + }, + { + "epoch": 0.08444203528183217, + "grad_norm": 0.7734375, + "learning_rate": 4.276645144613006e-05, + "loss": 0.8708, + "step": 1926 + }, + { + "epoch": 0.0844858784984894, + "grad_norm": 0.78515625, + "learning_rate": 4.276267634932288e-05, + "loss": 0.8014, + "step": 1927 + }, + { + "epoch": 0.08452972171514664, + "grad_norm": 0.78515625, + "learning_rate": 4.275890137382693e-05, + "loss": 0.8851, + "step": 1928 + }, + { + "epoch": 0.08457356493180387, + "grad_norm": 0.828125, + "learning_rate": 4.2755126519650225e-05, + "loss": 0.695, + "step": 1929 + }, + { + "epoch": 0.08461740814846111, + "grad_norm": 0.8671875, + "learning_rate": 4.2751351786800744e-05, + "loss": 0.909, + "step": 1930 + }, + { + "epoch": 0.08466125136511833, + "grad_norm": 0.8828125, + "learning_rate": 4.274757717528647e-05, + "loss": 0.8367, + "step": 1931 + }, + { + "epoch": 0.08470509458177557, + "grad_norm": 0.77734375, + "learning_rate": 4.2743802685115465e-05, + "loss": 0.8287, + "step": 1932 + }, + { + "epoch": 0.0847489377984328, + "grad_norm": 0.78125, + "learning_rate": 4.274002831629571e-05, + "loss": 0.8126, + "step": 1933 + }, + { + "epoch": 0.08479278101509004, + "grad_norm": 0.85546875, + "learning_rate": 4.273625406883518e-05, + "loss": 0.8979, + "step": 1934 + }, + { + "epoch": 0.08483662423174727, + "grad_norm": 0.69921875, + "learning_rate": 4.273247994274189e-05, + "loss": 0.7978, + "step": 1935 + }, + { + "epoch": 0.08488046744840451, + "grad_norm": 0.73828125, + "learning_rate": 4.2728705938023805e-05, + "loss": 0.6352, + "step": 1936 + }, + { + "epoch": 0.08492431066506174, + "grad_norm": 0.890625, + "learning_rate": 4.272493205468898e-05, + "loss": 0.836, + "step": 1937 + }, + { + "epoch": 0.08496815388171897, + "grad_norm": 0.7890625, + "learning_rate": 4.27211582927454e-05, + "loss": 0.8787, + "step": 1938 + }, + { + "epoch": 0.0850119970983762, + "grad_norm": 0.78125, + "learning_rate": 4.271738465220104e-05, + "loss": 0.7463, + "step": 1939 + }, + { + "epoch": 0.08505584031503344, + "grad_norm": 0.9453125, + "learning_rate": 4.2713611133063916e-05, + "loss": 0.8402, + "step": 1940 + }, + { + "epoch": 0.08509968353169067, + "grad_norm": 0.8125, + "learning_rate": 4.270983773534199e-05, + "loss": 0.8515, + "step": 1941 + }, + { + "epoch": 0.08514352674834791, + "grad_norm": 0.86328125, + "learning_rate": 4.27060644590433e-05, + "loss": 0.8063, + "step": 1942 + }, + { + "epoch": 0.08518736996500514, + "grad_norm": 0.76953125, + "learning_rate": 4.2702291304175855e-05, + "loss": 0.9181, + "step": 1943 + }, + { + "epoch": 0.08523121318166238, + "grad_norm": 0.84765625, + "learning_rate": 4.2698518270747624e-05, + "loss": 0.8177, + "step": 1944 + }, + { + "epoch": 0.08527505639831962, + "grad_norm": 0.83203125, + "learning_rate": 4.26947453587666e-05, + "loss": 0.884, + "step": 1945 + }, + { + "epoch": 0.08531889961497684, + "grad_norm": 0.8203125, + "learning_rate": 4.269097256824075e-05, + "loss": 0.8086, + "step": 1946 + }, + { + "epoch": 0.08536274283163407, + "grad_norm": 0.75390625, + "learning_rate": 4.2687199899178156e-05, + "loss": 0.7935, + "step": 1947 + }, + { + "epoch": 0.08540658604829131, + "grad_norm": 0.7734375, + "learning_rate": 4.268342735158676e-05, + "loss": 0.8311, + "step": 1948 + }, + { + "epoch": 0.08545042926494854, + "grad_norm": 0.8046875, + "learning_rate": 4.2679654925474566e-05, + "loss": 0.9365, + "step": 1949 + }, + { + "epoch": 0.08549427248160578, + "grad_norm": 0.73046875, + "learning_rate": 4.267588262084956e-05, + "loss": 0.734, + "step": 1950 + }, + { + "epoch": 0.08553811569826301, + "grad_norm": 0.80078125, + "learning_rate": 4.2672110437719713e-05, + "loss": 0.8397, + "step": 1951 + }, + { + "epoch": 0.08558195891492025, + "grad_norm": 0.7890625, + "learning_rate": 4.2668338376093095e-05, + "loss": 0.9353, + "step": 1952 + }, + { + "epoch": 0.08562580213157747, + "grad_norm": 0.82421875, + "learning_rate": 4.266456643597765e-05, + "loss": 0.7483, + "step": 1953 + }, + { + "epoch": 0.08566964534823471, + "grad_norm": 0.77734375, + "learning_rate": 4.266079461738137e-05, + "loss": 0.8328, + "step": 1954 + }, + { + "epoch": 0.08571348856489194, + "grad_norm": 0.94140625, + "learning_rate": 4.265702292031227e-05, + "loss": 0.7937, + "step": 1955 + }, + { + "epoch": 0.08575733178154918, + "grad_norm": 0.80859375, + "learning_rate": 4.265325134477833e-05, + "loss": 0.7677, + "step": 1956 + }, + { + "epoch": 0.08580117499820641, + "grad_norm": 0.78515625, + "learning_rate": 4.264947989078755e-05, + "loss": 0.7254, + "step": 1957 + }, + { + "epoch": 0.08584501821486365, + "grad_norm": 0.75390625, + "learning_rate": 4.2645708558347916e-05, + "loss": 0.7827, + "step": 1958 + }, + { + "epoch": 0.08588886143152089, + "grad_norm": 0.86328125, + "learning_rate": 4.264193734746742e-05, + "loss": 0.8397, + "step": 1959 + }, + { + "epoch": 0.08593270464817812, + "grad_norm": 0.86328125, + "learning_rate": 4.263816625815407e-05, + "loss": 0.8604, + "step": 1960 + }, + { + "epoch": 0.08597654786483534, + "grad_norm": 0.99609375, + "learning_rate": 4.2634395290415796e-05, + "loss": 0.8478, + "step": 1961 + }, + { + "epoch": 0.08602039108149258, + "grad_norm": 0.90625, + "learning_rate": 4.2630624444260694e-05, + "loss": 0.91, + "step": 1962 + }, + { + "epoch": 0.08606423429814981, + "grad_norm": 0.875, + "learning_rate": 4.262685371969669e-05, + "loss": 0.7638, + "step": 1963 + }, + { + "epoch": 0.08610807751480705, + "grad_norm": 0.99609375, + "learning_rate": 4.26230831167318e-05, + "loss": 0.9496, + "step": 1964 + }, + { + "epoch": 0.08615192073146428, + "grad_norm": 0.80078125, + "learning_rate": 4.261931263537401e-05, + "loss": 0.8424, + "step": 1965 + }, + { + "epoch": 0.08619576394812152, + "grad_norm": 0.796875, + "learning_rate": 4.261554227563126e-05, + "loss": 0.7102, + "step": 1966 + }, + { + "epoch": 0.08623960716477876, + "grad_norm": 0.7421875, + "learning_rate": 4.2611772037511634e-05, + "loss": 0.8517, + "step": 1967 + }, + { + "epoch": 0.08628345038143599, + "grad_norm": 0.78125, + "learning_rate": 4.260800192102307e-05, + "loss": 0.9103, + "step": 1968 + }, + { + "epoch": 0.08632729359809321, + "grad_norm": 0.859375, + "learning_rate": 4.2604231926173566e-05, + "loss": 0.8352, + "step": 1969 + }, + { + "epoch": 0.08637113681475045, + "grad_norm": 0.78125, + "learning_rate": 4.260046205297111e-05, + "loss": 0.8142, + "step": 1970 + }, + { + "epoch": 0.08641498003140768, + "grad_norm": 0.828125, + "learning_rate": 4.259669230142367e-05, + "loss": 0.8898, + "step": 1971 + }, + { + "epoch": 0.08645882324806492, + "grad_norm": 0.890625, + "learning_rate": 4.2592922671539285e-05, + "loss": 0.7685, + "step": 1972 + }, + { + "epoch": 0.08650266646472216, + "grad_norm": 0.7890625, + "learning_rate": 4.258915316332592e-05, + "loss": 0.7165, + "step": 1973 + }, + { + "epoch": 0.08654650968137939, + "grad_norm": 0.8515625, + "learning_rate": 4.258538377679157e-05, + "loss": 0.9405, + "step": 1974 + }, + { + "epoch": 0.08659035289803663, + "grad_norm": 0.87109375, + "learning_rate": 4.258161451194421e-05, + "loss": 0.8359, + "step": 1975 + }, + { + "epoch": 0.08663419611469385, + "grad_norm": 0.80078125, + "learning_rate": 4.25778453687918e-05, + "loss": 0.8612, + "step": 1976 + }, + { + "epoch": 0.08667803933135108, + "grad_norm": 0.96875, + "learning_rate": 4.2574076347342405e-05, + "loss": 0.8685, + "step": 1977 + }, + { + "epoch": 0.08672188254800832, + "grad_norm": 0.859375, + "learning_rate": 4.257030744760398e-05, + "loss": 0.9014, + "step": 1978 + }, + { + "epoch": 0.08676572576466555, + "grad_norm": 0.87109375, + "learning_rate": 4.256653866958449e-05, + "loss": 0.8977, + "step": 1979 + }, + { + "epoch": 0.08680956898132279, + "grad_norm": 0.8203125, + "learning_rate": 4.256277001329194e-05, + "loss": 0.8475, + "step": 1980 + }, + { + "epoch": 0.08685341219798003, + "grad_norm": 0.83984375, + "learning_rate": 4.255900147873429e-05, + "loss": 0.8459, + "step": 1981 + }, + { + "epoch": 0.08689725541463726, + "grad_norm": 0.84375, + "learning_rate": 4.2555233065919586e-05, + "loss": 1.0043, + "step": 1982 + }, + { + "epoch": 0.0869410986312945, + "grad_norm": 0.796875, + "learning_rate": 4.2551464774855775e-05, + "loss": 0.8118, + "step": 1983 + }, + { + "epoch": 0.08698494184795172, + "grad_norm": 0.8046875, + "learning_rate": 4.2547696605550856e-05, + "loss": 0.7342, + "step": 1984 + }, + { + "epoch": 0.08702878506460895, + "grad_norm": 0.86328125, + "learning_rate": 4.254392855801281e-05, + "loss": 0.9226, + "step": 1985 + }, + { + "epoch": 0.08707262828126619, + "grad_norm": 0.84375, + "learning_rate": 4.2540160632249624e-05, + "loss": 0.8822, + "step": 1986 + }, + { + "epoch": 0.08711647149792343, + "grad_norm": 0.7578125, + "learning_rate": 4.2536392828269256e-05, + "loss": 0.7333, + "step": 1987 + }, + { + "epoch": 0.08716031471458066, + "grad_norm": 0.80859375, + "learning_rate": 4.2532625146079744e-05, + "loss": 0.8148, + "step": 1988 + }, + { + "epoch": 0.0872041579312379, + "grad_norm": 0.8046875, + "learning_rate": 4.252885758568905e-05, + "loss": 0.9315, + "step": 1989 + }, + { + "epoch": 0.08724800114789513, + "grad_norm": 0.84375, + "learning_rate": 4.252509014710515e-05, + "loss": 0.8498, + "step": 1990 + }, + { + "epoch": 0.08729184436455235, + "grad_norm": 0.99609375, + "learning_rate": 4.252132283033605e-05, + "loss": 0.9667, + "step": 1991 + }, + { + "epoch": 0.08733568758120959, + "grad_norm": 0.80859375, + "learning_rate": 4.251755563538968e-05, + "loss": 0.6828, + "step": 1992 + }, + { + "epoch": 0.08737953079786683, + "grad_norm": 0.91015625, + "learning_rate": 4.2513788562274104e-05, + "loss": 0.8152, + "step": 1993 + }, + { + "epoch": 0.08742337401452406, + "grad_norm": 0.796875, + "learning_rate": 4.2510021610997264e-05, + "loss": 0.8058, + "step": 1994 + }, + { + "epoch": 0.0874672172311813, + "grad_norm": 0.80859375, + "learning_rate": 4.2506254781567144e-05, + "loss": 0.8581, + "step": 1995 + }, + { + "epoch": 0.08751106044783853, + "grad_norm": 0.8046875, + "learning_rate": 4.250248807399173e-05, + "loss": 0.7963, + "step": 1996 + }, + { + "epoch": 0.08755490366449577, + "grad_norm": 0.7421875, + "learning_rate": 4.249872148827901e-05, + "loss": 0.8815, + "step": 1997 + }, + { + "epoch": 0.087598746881153, + "grad_norm": 0.7109375, + "learning_rate": 4.249495502443693e-05, + "loss": 0.7704, + "step": 1998 + }, + { + "epoch": 0.08764259009781022, + "grad_norm": 0.81640625, + "learning_rate": 4.249118868247354e-05, + "loss": 0.6675, + "step": 1999 + }, + { + "epoch": 0.08768643331446746, + "grad_norm": 0.84765625, + "learning_rate": 4.2487422462396786e-05, + "loss": 0.7956, + "step": 2000 + }, + { + "epoch": 0.08768643331446746, + "eval_loss": 0.8137744069099426, + "eval_runtime": 267.3558, + "eval_samples_per_second": 37.403, + "eval_steps_per_second": 0.782, + "step": 2000 + }, + { + "epoch": 0.0877302765311247, + "grad_norm": 0.828125, + "learning_rate": 4.2483656364214654e-05, + "loss": 0.8151, + "step": 2001 + }, + { + "epoch": 0.08777411974778193, + "grad_norm": 0.84765625, + "learning_rate": 4.2479890387935126e-05, + "loss": 0.823, + "step": 2002 + }, + { + "epoch": 0.08781796296443917, + "grad_norm": 0.828125, + "learning_rate": 4.247612453356618e-05, + "loss": 0.7173, + "step": 2003 + }, + { + "epoch": 0.0878618061810964, + "grad_norm": 0.90234375, + "learning_rate": 4.247235880111581e-05, + "loss": 0.8067, + "step": 2004 + }, + { + "epoch": 0.08790564939775364, + "grad_norm": 0.78125, + "learning_rate": 4.246859319059198e-05, + "loss": 0.7786, + "step": 2005 + }, + { + "epoch": 0.08794949261441087, + "grad_norm": 0.8046875, + "learning_rate": 4.246482770200267e-05, + "loss": 0.8176, + "step": 2006 + }, + { + "epoch": 0.0879933358310681, + "grad_norm": 0.7890625, + "learning_rate": 4.246106233535585e-05, + "loss": 0.769, + "step": 2007 + }, + { + "epoch": 0.08803717904772533, + "grad_norm": 0.7734375, + "learning_rate": 4.2457297090659545e-05, + "loss": 0.7275, + "step": 2008 + }, + { + "epoch": 0.08808102226438257, + "grad_norm": 0.85546875, + "learning_rate": 4.2453531967921714e-05, + "loss": 0.9555, + "step": 2009 + }, + { + "epoch": 0.0881248654810398, + "grad_norm": 0.79296875, + "learning_rate": 4.2449766967150325e-05, + "loss": 0.9358, + "step": 2010 + }, + { + "epoch": 0.08816870869769704, + "grad_norm": 1.5, + "learning_rate": 4.2446002088353375e-05, + "loss": 0.8104, + "step": 2011 + }, + { + "epoch": 0.08821255191435427, + "grad_norm": 0.80078125, + "learning_rate": 4.244223733153879e-05, + "loss": 0.6746, + "step": 2012 + }, + { + "epoch": 0.08825639513101151, + "grad_norm": 0.90625, + "learning_rate": 4.243847269671464e-05, + "loss": 0.7984, + "step": 2013 + }, + { + "epoch": 0.08830023834766873, + "grad_norm": 0.78125, + "learning_rate": 4.243470818388885e-05, + "loss": 0.7789, + "step": 2014 + }, + { + "epoch": 0.08834408156432597, + "grad_norm": 0.9453125, + "learning_rate": 4.24309437930694e-05, + "loss": 0.8835, + "step": 2015 + }, + { + "epoch": 0.0883879247809832, + "grad_norm": 0.76953125, + "learning_rate": 4.242717952426428e-05, + "loss": 0.715, + "step": 2016 + }, + { + "epoch": 0.08843176799764044, + "grad_norm": 0.890625, + "learning_rate": 4.242341537748144e-05, + "loss": 0.8203, + "step": 2017 + }, + { + "epoch": 0.08847561121429767, + "grad_norm": 0.76953125, + "learning_rate": 4.2419651352728896e-05, + "loss": 0.7483, + "step": 2018 + }, + { + "epoch": 0.08851945443095491, + "grad_norm": 0.828125, + "learning_rate": 4.241588745001461e-05, + "loss": 0.8339, + "step": 2019 + }, + { + "epoch": 0.08856329764761214, + "grad_norm": 0.83203125, + "learning_rate": 4.241212366934657e-05, + "loss": 0.8199, + "step": 2020 + }, + { + "epoch": 0.08860714086426938, + "grad_norm": 0.796875, + "learning_rate": 4.240836001073274e-05, + "loss": 0.9453, + "step": 2021 + }, + { + "epoch": 0.0886509840809266, + "grad_norm": 0.84765625, + "learning_rate": 4.240459647418106e-05, + "loss": 0.8213, + "step": 2022 + }, + { + "epoch": 0.08869482729758384, + "grad_norm": 0.91796875, + "learning_rate": 4.2400833059699586e-05, + "loss": 0.8764, + "step": 2023 + }, + { + "epoch": 0.08873867051424107, + "grad_norm": 0.796875, + "learning_rate": 4.239706976729625e-05, + "loss": 0.9055, + "step": 2024 + }, + { + "epoch": 0.08878251373089831, + "grad_norm": 0.796875, + "learning_rate": 4.239330659697903e-05, + "loss": 0.8017, + "step": 2025 + }, + { + "epoch": 0.08882635694755554, + "grad_norm": 0.94140625, + "learning_rate": 4.238954354875591e-05, + "loss": 0.8229, + "step": 2026 + }, + { + "epoch": 0.08887020016421278, + "grad_norm": 0.91015625, + "learning_rate": 4.238578062263482e-05, + "loss": 0.7726, + "step": 2027 + }, + { + "epoch": 0.08891404338087001, + "grad_norm": 0.828125, + "learning_rate": 4.238201781862381e-05, + "loss": 0.8706, + "step": 2028 + }, + { + "epoch": 0.08895788659752724, + "grad_norm": 0.828125, + "learning_rate": 4.2378255136730814e-05, + "loss": 0.7696, + "step": 2029 + }, + { + "epoch": 0.08900172981418447, + "grad_norm": 0.73046875, + "learning_rate": 4.2374492576963834e-05, + "loss": 0.8752, + "step": 2030 + }, + { + "epoch": 0.08904557303084171, + "grad_norm": 0.82421875, + "learning_rate": 4.237073013933081e-05, + "loss": 0.78, + "step": 2031 + }, + { + "epoch": 0.08908941624749894, + "grad_norm": 0.80859375, + "learning_rate": 4.236696782383969e-05, + "loss": 0.7819, + "step": 2032 + }, + { + "epoch": 0.08913325946415618, + "grad_norm": 0.765625, + "learning_rate": 4.2363205630498524e-05, + "loss": 0.7811, + "step": 2033 + }, + { + "epoch": 0.08917710268081341, + "grad_norm": 0.94140625, + "learning_rate": 4.2359443559315246e-05, + "loss": 0.8143, + "step": 2034 + }, + { + "epoch": 0.08922094589747065, + "grad_norm": 0.7890625, + "learning_rate": 4.235568161029785e-05, + "loss": 0.8157, + "step": 2035 + }, + { + "epoch": 0.08926478911412788, + "grad_norm": 0.921875, + "learning_rate": 4.235191978345428e-05, + "loss": 0.7758, + "step": 2036 + }, + { + "epoch": 0.0893086323307851, + "grad_norm": 0.7578125, + "learning_rate": 4.2348158078792476e-05, + "loss": 0.7386, + "step": 2037 + }, + { + "epoch": 0.08935247554744234, + "grad_norm": 0.796875, + "learning_rate": 4.2344396496320504e-05, + "loss": 0.6887, + "step": 2038 + }, + { + "epoch": 0.08939631876409958, + "grad_norm": 0.8203125, + "learning_rate": 4.2340635036046295e-05, + "loss": 0.7854, + "step": 2039 + }, + { + "epoch": 0.08944016198075681, + "grad_norm": 0.88671875, + "learning_rate": 4.23368736979778e-05, + "loss": 0.7836, + "step": 2040 + }, + { + "epoch": 0.08948400519741405, + "grad_norm": 0.8359375, + "learning_rate": 4.2333112482123004e-05, + "loss": 0.8116, + "step": 2041 + }, + { + "epoch": 0.08952784841407128, + "grad_norm": 0.85546875, + "learning_rate": 4.2329351388489894e-05, + "loss": 0.8983, + "step": 2042 + }, + { + "epoch": 0.08957169163072852, + "grad_norm": 0.87109375, + "learning_rate": 4.232559041708642e-05, + "loss": 0.9842, + "step": 2043 + }, + { + "epoch": 0.08961553484738576, + "grad_norm": 0.73828125, + "learning_rate": 4.232182956792056e-05, + "loss": 0.6964, + "step": 2044 + }, + { + "epoch": 0.08965937806404298, + "grad_norm": 0.859375, + "learning_rate": 4.2318068841000294e-05, + "loss": 0.8046, + "step": 2045 + }, + { + "epoch": 0.08970322128070021, + "grad_norm": 0.81640625, + "learning_rate": 4.231430823633353e-05, + "loss": 0.869, + "step": 2046 + }, + { + "epoch": 0.08974706449735745, + "grad_norm": 0.7265625, + "learning_rate": 4.231054775392834e-05, + "loss": 0.7579, + "step": 2047 + }, + { + "epoch": 0.08979090771401468, + "grad_norm": 0.75, + "learning_rate": 4.230678739379265e-05, + "loss": 0.8578, + "step": 2048 + }, + { + "epoch": 0.08983475093067192, + "grad_norm": 0.76953125, + "learning_rate": 4.2303027155934414e-05, + "loss": 0.7229, + "step": 2049 + }, + { + "epoch": 0.08987859414732915, + "grad_norm": 0.74609375, + "learning_rate": 4.229926704036162e-05, + "loss": 0.7527, + "step": 2050 + }, + { + "epoch": 0.08992243736398639, + "grad_norm": 0.8359375, + "learning_rate": 4.229550704708224e-05, + "loss": 0.8803, + "step": 2051 + }, + { + "epoch": 0.08996628058064361, + "grad_norm": 0.890625, + "learning_rate": 4.229174717610419e-05, + "loss": 0.9462, + "step": 2052 + }, + { + "epoch": 0.09001012379730085, + "grad_norm": 0.7734375, + "learning_rate": 4.228798742743553e-05, + "loss": 0.7024, + "step": 2053 + }, + { + "epoch": 0.09005396701395808, + "grad_norm": 0.87890625, + "learning_rate": 4.2284227801084166e-05, + "loss": 0.8247, + "step": 2054 + }, + { + "epoch": 0.09009781023061532, + "grad_norm": 0.921875, + "learning_rate": 4.2280468297058095e-05, + "loss": 0.7669, + "step": 2055 + }, + { + "epoch": 0.09014165344727255, + "grad_norm": 0.7734375, + "learning_rate": 4.2276708915365274e-05, + "loss": 0.7756, + "step": 2056 + }, + { + "epoch": 0.09018549666392979, + "grad_norm": 0.76171875, + "learning_rate": 4.2272949656013615e-05, + "loss": 0.7992, + "step": 2057 + }, + { + "epoch": 0.09022933988058703, + "grad_norm": 0.87890625, + "learning_rate": 4.226919051901119e-05, + "loss": 0.7158, + "step": 2058 + }, + { + "epoch": 0.09027318309724426, + "grad_norm": 0.75, + "learning_rate": 4.226543150436591e-05, + "loss": 0.7633, + "step": 2059 + }, + { + "epoch": 0.09031702631390148, + "grad_norm": 0.89453125, + "learning_rate": 4.226167261208576e-05, + "loss": 0.8446, + "step": 2060 + }, + { + "epoch": 0.09036086953055872, + "grad_norm": 0.83203125, + "learning_rate": 4.2257913842178686e-05, + "loss": 0.6426, + "step": 2061 + }, + { + "epoch": 0.09040471274721595, + "grad_norm": 0.90625, + "learning_rate": 4.225415519465262e-05, + "loss": 0.8773, + "step": 2062 + }, + { + "epoch": 0.09044855596387319, + "grad_norm": 0.87109375, + "learning_rate": 4.225039666951561e-05, + "loss": 0.8205, + "step": 2063 + }, + { + "epoch": 0.09049239918053043, + "grad_norm": 0.76953125, + "learning_rate": 4.2246638266775595e-05, + "loss": 0.748, + "step": 2064 + }, + { + "epoch": 0.09053624239718766, + "grad_norm": 0.87890625, + "learning_rate": 4.224287998644052e-05, + "loss": 0.8625, + "step": 2065 + }, + { + "epoch": 0.0905800856138449, + "grad_norm": 0.78125, + "learning_rate": 4.223912182851837e-05, + "loss": 0.7378, + "step": 2066 + }, + { + "epoch": 0.09062392883050212, + "grad_norm": 0.91015625, + "learning_rate": 4.2235363793017044e-05, + "loss": 0.8113, + "step": 2067 + }, + { + "epoch": 0.09066777204715935, + "grad_norm": 0.84765625, + "learning_rate": 4.223160587994461e-05, + "loss": 0.7443, + "step": 2068 + }, + { + "epoch": 0.09071161526381659, + "grad_norm": 0.828125, + "learning_rate": 4.2227848089309e-05, + "loss": 0.8168, + "step": 2069 + }, + { + "epoch": 0.09075545848047382, + "grad_norm": 0.90234375, + "learning_rate": 4.2224090421118144e-05, + "loss": 0.8898, + "step": 2070 + }, + { + "epoch": 0.09079930169713106, + "grad_norm": 0.84765625, + "learning_rate": 4.222033287538003e-05, + "loss": 0.8267, + "step": 2071 + }, + { + "epoch": 0.0908431449137883, + "grad_norm": 0.8203125, + "learning_rate": 4.221657545210258e-05, + "loss": 0.8386, + "step": 2072 + }, + { + "epoch": 0.09088698813044553, + "grad_norm": 0.83984375, + "learning_rate": 4.221281815129384e-05, + "loss": 0.8474, + "step": 2073 + }, + { + "epoch": 0.09093083134710277, + "grad_norm": 0.80859375, + "learning_rate": 4.220906097296172e-05, + "loss": 0.7596, + "step": 2074 + }, + { + "epoch": 0.09097467456375999, + "grad_norm": 0.79296875, + "learning_rate": 4.2205303917114206e-05, + "loss": 0.8184, + "step": 2075 + }, + { + "epoch": 0.09101851778041722, + "grad_norm": 0.8046875, + "learning_rate": 4.2201546983759235e-05, + "loss": 0.7321, + "step": 2076 + }, + { + "epoch": 0.09106236099707446, + "grad_norm": 0.83203125, + "learning_rate": 4.219779017290475e-05, + "loss": 0.7722, + "step": 2077 + }, + { + "epoch": 0.0911062042137317, + "grad_norm": 0.8046875, + "learning_rate": 4.219403348455877e-05, + "loss": 0.8611, + "step": 2078 + }, + { + "epoch": 0.09115004743038893, + "grad_norm": 0.796875, + "learning_rate": 4.219027691872924e-05, + "loss": 0.821, + "step": 2079 + }, + { + "epoch": 0.09119389064704617, + "grad_norm": 0.86328125, + "learning_rate": 4.218652047542412e-05, + "loss": 0.7778, + "step": 2080 + }, + { + "epoch": 0.0912377338637034, + "grad_norm": 0.91796875, + "learning_rate": 4.218276415465136e-05, + "loss": 0.794, + "step": 2081 + }, + { + "epoch": 0.09128157708036062, + "grad_norm": 0.83203125, + "learning_rate": 4.217900795641893e-05, + "loss": 0.883, + "step": 2082 + }, + { + "epoch": 0.09132542029701786, + "grad_norm": 0.72265625, + "learning_rate": 4.217525188073478e-05, + "loss": 0.7854, + "step": 2083 + }, + { + "epoch": 0.0913692635136751, + "grad_norm": 0.7421875, + "learning_rate": 4.2171495927606855e-05, + "loss": 0.7586, + "step": 2084 + }, + { + "epoch": 0.09141310673033233, + "grad_norm": 1.453125, + "learning_rate": 4.2167740097043164e-05, + "loss": 0.9488, + "step": 2085 + }, + { + "epoch": 0.09145694994698957, + "grad_norm": 0.92578125, + "learning_rate": 4.2163984389051646e-05, + "loss": 0.7724, + "step": 2086 + }, + { + "epoch": 0.0915007931636468, + "grad_norm": 1.109375, + "learning_rate": 4.2160228803640256e-05, + "loss": 0.8047, + "step": 2087 + }, + { + "epoch": 0.09154463638030404, + "grad_norm": 0.80078125, + "learning_rate": 4.215647334081695e-05, + "loss": 0.7572, + "step": 2088 + }, + { + "epoch": 0.09158847959696127, + "grad_norm": 0.84375, + "learning_rate": 4.215271800058971e-05, + "loss": 0.7979, + "step": 2089 + }, + { + "epoch": 0.0916323228136185, + "grad_norm": 0.7265625, + "learning_rate": 4.214896278296646e-05, + "loss": 0.7947, + "step": 2090 + }, + { + "epoch": 0.09167616603027573, + "grad_norm": 0.859375, + "learning_rate": 4.214520768795518e-05, + "loss": 0.8635, + "step": 2091 + }, + { + "epoch": 0.09172000924693297, + "grad_norm": 0.84765625, + "learning_rate": 4.214145271556379e-05, + "loss": 0.8672, + "step": 2092 + }, + { + "epoch": 0.0917638524635902, + "grad_norm": 0.87109375, + "learning_rate": 4.213769786580033e-05, + "loss": 0.9508, + "step": 2093 + }, + { + "epoch": 0.09180769568024744, + "grad_norm": 0.8203125, + "learning_rate": 4.213394313867271e-05, + "loss": 0.7132, + "step": 2094 + }, + { + "epoch": 0.09185153889690467, + "grad_norm": 0.80859375, + "learning_rate": 4.2130188534188876e-05, + "loss": 0.8821, + "step": 2095 + }, + { + "epoch": 0.09189538211356191, + "grad_norm": 1.2421875, + "learning_rate": 4.212643405235681e-05, + "loss": 0.8816, + "step": 2096 + }, + { + "epoch": 0.09193922533021914, + "grad_norm": 0.85546875, + "learning_rate": 4.212267969318442e-05, + "loss": 0.7851, + "step": 2097 + }, + { + "epoch": 0.09198306854687636, + "grad_norm": 0.8671875, + "learning_rate": 4.2118925456679734e-05, + "loss": 1.0051, + "step": 2098 + }, + { + "epoch": 0.0920269117635336, + "grad_norm": 0.9375, + "learning_rate": 4.211517134285069e-05, + "loss": 0.9494, + "step": 2099 + }, + { + "epoch": 0.09207075498019084, + "grad_norm": 0.7890625, + "learning_rate": 4.211141735170523e-05, + "loss": 0.7364, + "step": 2100 + }, + { + "epoch": 0.09211459819684807, + "grad_norm": 0.80859375, + "learning_rate": 4.2107663483251306e-05, + "loss": 0.7945, + "step": 2101 + }, + { + "epoch": 0.09215844141350531, + "grad_norm": 0.81640625, + "learning_rate": 4.2103909737496835e-05, + "loss": 0.8594, + "step": 2102 + }, + { + "epoch": 0.09220228463016254, + "grad_norm": 0.9765625, + "learning_rate": 4.210015611444987e-05, + "loss": 0.909, + "step": 2103 + }, + { + "epoch": 0.09224612784681978, + "grad_norm": 0.8046875, + "learning_rate": 4.209640261411831e-05, + "loss": 0.8083, + "step": 2104 + }, + { + "epoch": 0.092289971063477, + "grad_norm": 0.81640625, + "learning_rate": 4.2092649236510117e-05, + "loss": 0.8897, + "step": 2105 + }, + { + "epoch": 0.09233381428013424, + "grad_norm": 0.8046875, + "learning_rate": 4.2088895981633234e-05, + "loss": 0.7101, + "step": 2106 + }, + { + "epoch": 0.09237765749679147, + "grad_norm": 0.83203125, + "learning_rate": 4.208514284949561e-05, + "loss": 0.9057, + "step": 2107 + }, + { + "epoch": 0.0924215007134487, + "grad_norm": 0.80859375, + "learning_rate": 4.208138984010524e-05, + "loss": 0.7606, + "step": 2108 + }, + { + "epoch": 0.09246534393010594, + "grad_norm": 3.03125, + "learning_rate": 4.207763695347005e-05, + "loss": 0.7455, + "step": 2109 + }, + { + "epoch": 0.09250918714676318, + "grad_norm": 0.97265625, + "learning_rate": 4.207388418959799e-05, + "loss": 0.8767, + "step": 2110 + }, + { + "epoch": 0.09255303036342041, + "grad_norm": 0.78515625, + "learning_rate": 4.207013154849704e-05, + "loss": 0.6774, + "step": 2111 + }, + { + "epoch": 0.09259687358007765, + "grad_norm": 0.8203125, + "learning_rate": 4.2066379030175085e-05, + "loss": 0.8635, + "step": 2112 + }, + { + "epoch": 0.09264071679673487, + "grad_norm": 0.8828125, + "learning_rate": 4.2062626634640165e-05, + "loss": 0.7655, + "step": 2113 + }, + { + "epoch": 0.0926845600133921, + "grad_norm": 0.87890625, + "learning_rate": 4.2058874361900214e-05, + "loss": 0.8632, + "step": 2114 + }, + { + "epoch": 0.09272840323004934, + "grad_norm": 0.9140625, + "learning_rate": 4.205512221196315e-05, + "loss": 0.8809, + "step": 2115 + }, + { + "epoch": 0.09277224644670658, + "grad_norm": 0.87890625, + "learning_rate": 4.205137018483695e-05, + "loss": 0.8343, + "step": 2116 + }, + { + "epoch": 0.09281608966336381, + "grad_norm": 0.7578125, + "learning_rate": 4.204761828052952e-05, + "loss": 0.8159, + "step": 2117 + }, + { + "epoch": 0.09285993288002105, + "grad_norm": 0.83203125, + "learning_rate": 4.204386649904888e-05, + "loss": 0.7577, + "step": 2118 + }, + { + "epoch": 0.09290377609667828, + "grad_norm": 0.77734375, + "learning_rate": 4.2040114840402965e-05, + "loss": 0.8009, + "step": 2119 + }, + { + "epoch": 0.0929476193133355, + "grad_norm": 0.8125, + "learning_rate": 4.2036363304599704e-05, + "loss": 0.8158, + "step": 2120 + }, + { + "epoch": 0.09299146252999274, + "grad_norm": 0.86328125, + "learning_rate": 4.2032611891647065e-05, + "loss": 0.8286, + "step": 2121 + }, + { + "epoch": 0.09303530574664998, + "grad_norm": 0.84375, + "learning_rate": 4.202886060155296e-05, + "loss": 0.8931, + "step": 2122 + }, + { + "epoch": 0.09307914896330721, + "grad_norm": 0.79296875, + "learning_rate": 4.2025109434325397e-05, + "loss": 0.8499, + "step": 2123 + }, + { + "epoch": 0.09312299217996445, + "grad_norm": 0.8359375, + "learning_rate": 4.20213583899723e-05, + "loss": 0.9146, + "step": 2124 + }, + { + "epoch": 0.09316683539662168, + "grad_norm": 0.90234375, + "learning_rate": 4.2017607468501616e-05, + "loss": 0.8545, + "step": 2125 + }, + { + "epoch": 0.09321067861327892, + "grad_norm": 0.89453125, + "learning_rate": 4.201385666992131e-05, + "loss": 0.9342, + "step": 2126 + }, + { + "epoch": 0.09325452182993615, + "grad_norm": 0.734375, + "learning_rate": 4.201010599423931e-05, + "loss": 0.8799, + "step": 2127 + }, + { + "epoch": 0.09329836504659338, + "grad_norm": 0.79296875, + "learning_rate": 4.20063554414636e-05, + "loss": 0.8277, + "step": 2128 + }, + { + "epoch": 0.09334220826325061, + "grad_norm": 0.77734375, + "learning_rate": 4.200260501160208e-05, + "loss": 0.7825, + "step": 2129 + }, + { + "epoch": 0.09338605147990785, + "grad_norm": 0.953125, + "learning_rate": 4.1998854704662724e-05, + "loss": 0.8696, + "step": 2130 + }, + { + "epoch": 0.09342989469656508, + "grad_norm": 0.85546875, + "learning_rate": 4.199510452065349e-05, + "loss": 0.765, + "step": 2131 + }, + { + "epoch": 0.09347373791322232, + "grad_norm": 0.87109375, + "learning_rate": 4.1991354459582276e-05, + "loss": 0.7552, + "step": 2132 + }, + { + "epoch": 0.09351758112987955, + "grad_norm": 0.83203125, + "learning_rate": 4.198760452145711e-05, + "loss": 0.8369, + "step": 2133 + }, + { + "epoch": 0.09356142434653679, + "grad_norm": 0.8125, + "learning_rate": 4.1983854706285904e-05, + "loss": 0.7574, + "step": 2134 + }, + { + "epoch": 0.09360526756319403, + "grad_norm": 1.0859375, + "learning_rate": 4.19801050140766e-05, + "loss": 0.7447, + "step": 2135 + }, + { + "epoch": 0.09364911077985125, + "grad_norm": 0.8671875, + "learning_rate": 4.1976355444837144e-05, + "loss": 0.9103, + "step": 2136 + }, + { + "epoch": 0.09369295399650848, + "grad_norm": 0.9140625, + "learning_rate": 4.197260599857549e-05, + "loss": 0.881, + "step": 2137 + }, + { + "epoch": 0.09373679721316572, + "grad_norm": 0.85546875, + "learning_rate": 4.196885667529954e-05, + "loss": 1.0173, + "step": 2138 + }, + { + "epoch": 0.09378064042982295, + "grad_norm": 0.72265625, + "learning_rate": 4.196510747501732e-05, + "loss": 0.7783, + "step": 2139 + }, + { + "epoch": 0.09382448364648019, + "grad_norm": 0.77734375, + "learning_rate": 4.196135839773673e-05, + "loss": 0.8686, + "step": 2140 + }, + { + "epoch": 0.09386832686313742, + "grad_norm": 0.94921875, + "learning_rate": 4.195760944346573e-05, + "loss": 0.8783, + "step": 2141 + }, + { + "epoch": 0.09391217007979466, + "grad_norm": 0.93359375, + "learning_rate": 4.195386061221226e-05, + "loss": 0.8478, + "step": 2142 + }, + { + "epoch": 0.09395601329645188, + "grad_norm": 0.83203125, + "learning_rate": 4.1950111903984224e-05, + "loss": 0.8223, + "step": 2143 + }, + { + "epoch": 0.09399985651310912, + "grad_norm": 0.8203125, + "learning_rate": 4.194636331878965e-05, + "loss": 0.8687, + "step": 2144 + }, + { + "epoch": 0.09404369972976635, + "grad_norm": 0.8359375, + "learning_rate": 4.194261485663644e-05, + "loss": 0.9098, + "step": 2145 + }, + { + "epoch": 0.09408754294642359, + "grad_norm": 0.890625, + "learning_rate": 4.1938866517532536e-05, + "loss": 0.6735, + "step": 2146 + }, + { + "epoch": 0.09413138616308082, + "grad_norm": 0.90234375, + "learning_rate": 4.193511830148589e-05, + "loss": 0.8796, + "step": 2147 + }, + { + "epoch": 0.09417522937973806, + "grad_norm": 0.77734375, + "learning_rate": 4.1931370208504405e-05, + "loss": 0.7683, + "step": 2148 + }, + { + "epoch": 0.0942190725963953, + "grad_norm": 0.8359375, + "learning_rate": 4.19276222385961e-05, + "loss": 0.9416, + "step": 2149 + }, + { + "epoch": 0.09426291581305253, + "grad_norm": 0.73828125, + "learning_rate": 4.192387439176887e-05, + "loss": 0.7553, + "step": 2150 + }, + { + "epoch": 0.09430675902970975, + "grad_norm": 0.7890625, + "learning_rate": 4.192012666803068e-05, + "loss": 0.8237, + "step": 2151 + }, + { + "epoch": 0.09435060224636699, + "grad_norm": 0.83203125, + "learning_rate": 4.191637906738945e-05, + "loss": 0.8579, + "step": 2152 + }, + { + "epoch": 0.09439444546302422, + "grad_norm": 0.9140625, + "learning_rate": 4.191263158985311e-05, + "loss": 0.9593, + "step": 2153 + }, + { + "epoch": 0.09443828867968146, + "grad_norm": 0.91015625, + "learning_rate": 4.1908884235429654e-05, + "loss": 0.841, + "step": 2154 + }, + { + "epoch": 0.0944821318963387, + "grad_norm": 0.85546875, + "learning_rate": 4.1905137004127016e-05, + "loss": 0.7557, + "step": 2155 + }, + { + "epoch": 0.09452597511299593, + "grad_norm": 0.79296875, + "learning_rate": 4.19013898959531e-05, + "loss": 0.9299, + "step": 2156 + }, + { + "epoch": 0.09456981832965317, + "grad_norm": 0.94140625, + "learning_rate": 4.189764291091588e-05, + "loss": 0.9817, + "step": 2157 + }, + { + "epoch": 0.09461366154631039, + "grad_norm": 0.9453125, + "learning_rate": 4.189389604902324e-05, + "loss": 1.0029, + "step": 2158 + }, + { + "epoch": 0.09465750476296762, + "grad_norm": 0.9296875, + "learning_rate": 4.18901493102832e-05, + "loss": 0.8297, + "step": 2159 + }, + { + "epoch": 0.09470134797962486, + "grad_norm": 0.88671875, + "learning_rate": 4.188640269470367e-05, + "loss": 0.9129, + "step": 2160 + }, + { + "epoch": 0.0947451911962821, + "grad_norm": 0.84375, + "learning_rate": 4.1882656202292594e-05, + "loss": 0.7918, + "step": 2161 + }, + { + "epoch": 0.09478903441293933, + "grad_norm": 0.87890625, + "learning_rate": 4.18789098330579e-05, + "loss": 0.9054, + "step": 2162 + }, + { + "epoch": 0.09483287762959657, + "grad_norm": 0.8515625, + "learning_rate": 4.187516358700751e-05, + "loss": 0.8886, + "step": 2163 + }, + { + "epoch": 0.0948767208462538, + "grad_norm": 0.859375, + "learning_rate": 4.187141746414941e-05, + "loss": 0.8827, + "step": 2164 + }, + { + "epoch": 0.09492056406291104, + "grad_norm": 0.84375, + "learning_rate": 4.186767146449152e-05, + "loss": 0.8092, + "step": 2165 + }, + { + "epoch": 0.09496440727956826, + "grad_norm": 0.8359375, + "learning_rate": 4.186392558804179e-05, + "loss": 0.8544, + "step": 2166 + }, + { + "epoch": 0.0950082504962255, + "grad_norm": 0.7578125, + "learning_rate": 4.186017983480813e-05, + "loss": 0.6354, + "step": 2167 + }, + { + "epoch": 0.09505209371288273, + "grad_norm": 0.7734375, + "learning_rate": 4.1856434204798514e-05, + "loss": 0.783, + "step": 2168 + }, + { + "epoch": 0.09509593692953996, + "grad_norm": 0.85546875, + "learning_rate": 4.1852688698020804e-05, + "loss": 1.0166, + "step": 2169 + }, + { + "epoch": 0.0951397801461972, + "grad_norm": 0.8125, + "learning_rate": 4.184894331448305e-05, + "loss": 0.893, + "step": 2170 + }, + { + "epoch": 0.09518362336285444, + "grad_norm": 0.81640625, + "learning_rate": 4.184519805419313e-05, + "loss": 0.8162, + "step": 2171 + }, + { + "epoch": 0.09522746657951167, + "grad_norm": 1.0703125, + "learning_rate": 4.1841452917159e-05, + "loss": 0.7295, + "step": 2172 + }, + { + "epoch": 0.0952713097961689, + "grad_norm": 0.8046875, + "learning_rate": 4.183770790338857e-05, + "loss": 0.798, + "step": 2173 + }, + { + "epoch": 0.09531515301282613, + "grad_norm": 0.83984375, + "learning_rate": 4.18339630128898e-05, + "loss": 0.9089, + "step": 2174 + }, + { + "epoch": 0.09535899622948336, + "grad_norm": 0.81640625, + "learning_rate": 4.183021824567062e-05, + "loss": 0.7511, + "step": 2175 + }, + { + "epoch": 0.0954028394461406, + "grad_norm": 0.80859375, + "learning_rate": 4.182647360173897e-05, + "loss": 0.7414, + "step": 2176 + }, + { + "epoch": 0.09544668266279784, + "grad_norm": 0.7734375, + "learning_rate": 4.182272908110278e-05, + "loss": 0.8344, + "step": 2177 + }, + { + "epoch": 0.09549052587945507, + "grad_norm": 0.7734375, + "learning_rate": 4.181898468376996e-05, + "loss": 0.8402, + "step": 2178 + }, + { + "epoch": 0.0955343690961123, + "grad_norm": 0.87890625, + "learning_rate": 4.181524040974851e-05, + "loss": 0.7851, + "step": 2179 + }, + { + "epoch": 0.09557821231276954, + "grad_norm": 0.8125, + "learning_rate": 4.181149625904633e-05, + "loss": 0.9016, + "step": 2180 + }, + { + "epoch": 0.09562205552942676, + "grad_norm": 0.8203125, + "learning_rate": 4.180775223167136e-05, + "loss": 0.7628, + "step": 2181 + }, + { + "epoch": 0.095665898746084, + "grad_norm": 0.7734375, + "learning_rate": 4.180400832763154e-05, + "loss": 0.8016, + "step": 2182 + }, + { + "epoch": 0.09570974196274123, + "grad_norm": 0.84765625, + "learning_rate": 4.180026454693475e-05, + "loss": 0.8932, + "step": 2183 + }, + { + "epoch": 0.09575358517939847, + "grad_norm": 0.7578125, + "learning_rate": 4.179652088958902e-05, + "loss": 0.7513, + "step": 2184 + }, + { + "epoch": 0.0957974283960557, + "grad_norm": 0.87109375, + "learning_rate": 4.1792777355602234e-05, + "loss": 0.8151, + "step": 2185 + }, + { + "epoch": 0.09584127161271294, + "grad_norm": 0.75390625, + "learning_rate": 4.1789033944982326e-05, + "loss": 0.8411, + "step": 2186 + }, + { + "epoch": 0.09588511482937018, + "grad_norm": 0.7734375, + "learning_rate": 4.178529065773723e-05, + "loss": 0.8979, + "step": 2187 + }, + { + "epoch": 0.09592895804602741, + "grad_norm": 0.9375, + "learning_rate": 4.1781547493874854e-05, + "loss": 0.8665, + "step": 2188 + }, + { + "epoch": 0.09597280126268463, + "grad_norm": 0.95703125, + "learning_rate": 4.1777804453403194e-05, + "loss": 0.8911, + "step": 2189 + }, + { + "epoch": 0.09601664447934187, + "grad_norm": 0.9140625, + "learning_rate": 4.1774061536330146e-05, + "loss": 0.8286, + "step": 2190 + }, + { + "epoch": 0.0960604876959991, + "grad_norm": 0.828125, + "learning_rate": 4.177031874266365e-05, + "loss": 0.8352, + "step": 2191 + }, + { + "epoch": 0.09610433091265634, + "grad_norm": 0.8359375, + "learning_rate": 4.1766576072411636e-05, + "loss": 0.8208, + "step": 2192 + }, + { + "epoch": 0.09614817412931358, + "grad_norm": 0.80859375, + "learning_rate": 4.176283352558199e-05, + "loss": 0.81, + "step": 2193 + }, + { + "epoch": 0.09619201734597081, + "grad_norm": 0.7265625, + "learning_rate": 4.1759091102182734e-05, + "loss": 0.7217, + "step": 2194 + }, + { + "epoch": 0.09623586056262805, + "grad_norm": 0.76171875, + "learning_rate": 4.175534880222176e-05, + "loss": 0.8342, + "step": 2195 + }, + { + "epoch": 0.09627970377928527, + "grad_norm": 0.8515625, + "learning_rate": 4.1751606625707e-05, + "loss": 0.9094, + "step": 2196 + }, + { + "epoch": 0.0963235469959425, + "grad_norm": 0.828125, + "learning_rate": 4.1747864572646364e-05, + "loss": 0.7747, + "step": 2197 + }, + { + "epoch": 0.09636739021259974, + "grad_norm": 0.82421875, + "learning_rate": 4.174412264304777e-05, + "loss": 0.7795, + "step": 2198 + }, + { + "epoch": 0.09641123342925698, + "grad_norm": 0.76953125, + "learning_rate": 4.1740380836919214e-05, + "loss": 0.6505, + "step": 2199 + }, + { + "epoch": 0.09645507664591421, + "grad_norm": 0.78515625, + "learning_rate": 4.1736639154268596e-05, + "loss": 0.7508, + "step": 2200 + }, + { + "epoch": 0.09649891986257145, + "grad_norm": 0.80859375, + "learning_rate": 4.1732897595103834e-05, + "loss": 0.8305, + "step": 2201 + }, + { + "epoch": 0.09654276307922868, + "grad_norm": 0.8046875, + "learning_rate": 4.172915615943287e-05, + "loss": 0.7514, + "step": 2202 + }, + { + "epoch": 0.09658660629588592, + "grad_norm": 0.80859375, + "learning_rate": 4.172541484726359e-05, + "loss": 0.9243, + "step": 2203 + }, + { + "epoch": 0.09663044951254314, + "grad_norm": 0.8046875, + "learning_rate": 4.1721673658604e-05, + "loss": 0.9464, + "step": 2204 + }, + { + "epoch": 0.09667429272920038, + "grad_norm": 0.734375, + "learning_rate": 4.1717932593461996e-05, + "loss": 0.8232, + "step": 2205 + }, + { + "epoch": 0.09671813594585761, + "grad_norm": 0.78515625, + "learning_rate": 4.171419165184549e-05, + "loss": 0.8756, + "step": 2206 + }, + { + "epoch": 0.09676197916251485, + "grad_norm": 0.80859375, + "learning_rate": 4.171045083376244e-05, + "loss": 0.7036, + "step": 2207 + }, + { + "epoch": 0.09680582237917208, + "grad_norm": 1.0703125, + "learning_rate": 4.1706710139220706e-05, + "loss": 0.7792, + "step": 2208 + }, + { + "epoch": 0.09684966559582932, + "grad_norm": 0.87109375, + "learning_rate": 4.170296956822831e-05, + "loss": 0.7384, + "step": 2209 + }, + { + "epoch": 0.09689350881248655, + "grad_norm": 0.8203125, + "learning_rate": 4.169922912079314e-05, + "loss": 0.7367, + "step": 2210 + }, + { + "epoch": 0.09693735202914378, + "grad_norm": 0.8515625, + "learning_rate": 4.169548879692312e-05, + "loss": 0.8528, + "step": 2211 + }, + { + "epoch": 0.09698119524580101, + "grad_norm": 0.8125, + "learning_rate": 4.169174859662618e-05, + "loss": 0.8664, + "step": 2212 + }, + { + "epoch": 0.09702503846245825, + "grad_norm": 0.7421875, + "learning_rate": 4.1688008519910246e-05, + "loss": 0.7544, + "step": 2213 + }, + { + "epoch": 0.09706888167911548, + "grad_norm": 0.78515625, + "learning_rate": 4.168426856678325e-05, + "loss": 0.8262, + "step": 2214 + }, + { + "epoch": 0.09711272489577272, + "grad_norm": 0.91015625, + "learning_rate": 4.16805287372531e-05, + "loss": 0.8872, + "step": 2215 + }, + { + "epoch": 0.09715656811242995, + "grad_norm": 0.84765625, + "learning_rate": 4.167678903132775e-05, + "loss": 0.8795, + "step": 2216 + }, + { + "epoch": 0.09720041132908719, + "grad_norm": 0.75, + "learning_rate": 4.1673049449015065e-05, + "loss": 0.7449, + "step": 2217 + }, + { + "epoch": 0.09724425454574442, + "grad_norm": 0.78515625, + "learning_rate": 4.166930999032306e-05, + "loss": 0.8178, + "step": 2218 + }, + { + "epoch": 0.09728809776240165, + "grad_norm": 0.8515625, + "learning_rate": 4.166557065525962e-05, + "loss": 0.8663, + "step": 2219 + }, + { + "epoch": 0.09733194097905888, + "grad_norm": 0.71875, + "learning_rate": 4.166183144383267e-05, + "loss": 0.7728, + "step": 2220 + }, + { + "epoch": 0.09737578419571612, + "grad_norm": 0.83203125, + "learning_rate": 4.165809235605013e-05, + "loss": 0.7816, + "step": 2221 + }, + { + "epoch": 0.09741962741237335, + "grad_norm": 0.921875, + "learning_rate": 4.165435339191992e-05, + "loss": 0.864, + "step": 2222 + }, + { + "epoch": 0.09746347062903059, + "grad_norm": 0.765625, + "learning_rate": 4.1650614551449954e-05, + "loss": 0.8403, + "step": 2223 + }, + { + "epoch": 0.09750731384568782, + "grad_norm": 0.828125, + "learning_rate": 4.16468758346482e-05, + "loss": 0.7451, + "step": 2224 + }, + { + "epoch": 0.09755115706234506, + "grad_norm": 0.859375, + "learning_rate": 4.164313724152256e-05, + "loss": 0.945, + "step": 2225 + }, + { + "epoch": 0.0975950002790023, + "grad_norm": 0.8046875, + "learning_rate": 4.163939877208095e-05, + "loss": 0.7681, + "step": 2226 + }, + { + "epoch": 0.09763884349565952, + "grad_norm": 0.8984375, + "learning_rate": 4.1635660426331316e-05, + "loss": 0.8268, + "step": 2227 + }, + { + "epoch": 0.09768268671231675, + "grad_norm": 0.9453125, + "learning_rate": 4.16319222042815e-05, + "loss": 0.7856, + "step": 2228 + }, + { + "epoch": 0.09772652992897399, + "grad_norm": 0.7890625, + "learning_rate": 4.162818410593955e-05, + "loss": 0.7938, + "step": 2229 + }, + { + "epoch": 0.09777037314563122, + "grad_norm": 0.88671875, + "learning_rate": 4.162444613131332e-05, + "loss": 0.7898, + "step": 2230 + }, + { + "epoch": 0.09781421636228846, + "grad_norm": 0.80078125, + "learning_rate": 4.162070828041073e-05, + "loss": 0.7514, + "step": 2231 + }, + { + "epoch": 0.0978580595789457, + "grad_norm": 0.83203125, + "learning_rate": 4.161697055323973e-05, + "loss": 0.856, + "step": 2232 + }, + { + "epoch": 0.09790190279560293, + "grad_norm": 0.796875, + "learning_rate": 4.1613232949808175e-05, + "loss": 0.7844, + "step": 2233 + }, + { + "epoch": 0.09794574601226015, + "grad_norm": 0.90625, + "learning_rate": 4.160949547012407e-05, + "loss": 0.887, + "step": 2234 + }, + { + "epoch": 0.09798958922891739, + "grad_norm": 0.85546875, + "learning_rate": 4.1605758114195314e-05, + "loss": 0.8807, + "step": 2235 + }, + { + "epoch": 0.09803343244557462, + "grad_norm": 0.8046875, + "learning_rate": 4.16020208820298e-05, + "loss": 0.8299, + "step": 2236 + }, + { + "epoch": 0.09807727566223186, + "grad_norm": 0.78515625, + "learning_rate": 4.159828377363548e-05, + "loss": 0.7361, + "step": 2237 + }, + { + "epoch": 0.0981211188788891, + "grad_norm": 0.8515625, + "learning_rate": 4.159454678902022e-05, + "loss": 0.8247, + "step": 2238 + }, + { + "epoch": 0.09816496209554633, + "grad_norm": 0.84765625, + "learning_rate": 4.159080992819201e-05, + "loss": 0.7996, + "step": 2239 + }, + { + "epoch": 0.09820880531220356, + "grad_norm": 0.82421875, + "learning_rate": 4.158707319115874e-05, + "loss": 0.9706, + "step": 2240 + }, + { + "epoch": 0.0982526485288608, + "grad_norm": 0.875, + "learning_rate": 4.1583336577928334e-05, + "loss": 0.8648, + "step": 2241 + }, + { + "epoch": 0.09829649174551802, + "grad_norm": 0.8515625, + "learning_rate": 4.15796000885087e-05, + "loss": 0.922, + "step": 2242 + }, + { + "epoch": 0.09834033496217526, + "grad_norm": 0.796875, + "learning_rate": 4.157586372290774e-05, + "loss": 0.8257, + "step": 2243 + }, + { + "epoch": 0.0983841781788325, + "grad_norm": 0.86328125, + "learning_rate": 4.1572127481133425e-05, + "loss": 0.779, + "step": 2244 + }, + { + "epoch": 0.09842802139548973, + "grad_norm": 0.75, + "learning_rate": 4.156839136319365e-05, + "loss": 0.834, + "step": 2245 + }, + { + "epoch": 0.09847186461214696, + "grad_norm": 0.82421875, + "learning_rate": 4.156465536909633e-05, + "loss": 0.8302, + "step": 2246 + }, + { + "epoch": 0.0985157078288042, + "grad_norm": 0.828125, + "learning_rate": 4.156091949884937e-05, + "loss": 0.7279, + "step": 2247 + }, + { + "epoch": 0.09855955104546144, + "grad_norm": 0.75390625, + "learning_rate": 4.1557183752460685e-05, + "loss": 0.7812, + "step": 2248 + }, + { + "epoch": 0.09860339426211866, + "grad_norm": 0.84765625, + "learning_rate": 4.1553448129938234e-05, + "loss": 0.7765, + "step": 2249 + }, + { + "epoch": 0.09864723747877589, + "grad_norm": 0.86328125, + "learning_rate": 4.154971263128991e-05, + "loss": 0.8116, + "step": 2250 + }, + { + "epoch": 0.09869108069543313, + "grad_norm": 0.953125, + "learning_rate": 4.1545977256523625e-05, + "loss": 0.8743, + "step": 2251 + }, + { + "epoch": 0.09873492391209036, + "grad_norm": 0.78515625, + "learning_rate": 4.15422420056473e-05, + "loss": 0.7828, + "step": 2252 + }, + { + "epoch": 0.0987787671287476, + "grad_norm": 0.95703125, + "learning_rate": 4.153850687866886e-05, + "loss": 0.8002, + "step": 2253 + }, + { + "epoch": 0.09882261034540483, + "grad_norm": 0.81640625, + "learning_rate": 4.1534771875596214e-05, + "loss": 0.8415, + "step": 2254 + }, + { + "epoch": 0.09886645356206207, + "grad_norm": 0.796875, + "learning_rate": 4.153103699643723e-05, + "loss": 0.8735, + "step": 2255 + }, + { + "epoch": 0.0989102967787193, + "grad_norm": 0.99609375, + "learning_rate": 4.152730224119992e-05, + "loss": 0.8044, + "step": 2256 + }, + { + "epoch": 0.09895413999537653, + "grad_norm": 0.875, + "learning_rate": 4.152356760989214e-05, + "loss": 0.8305, + "step": 2257 + }, + { + "epoch": 0.09899798321203376, + "grad_norm": 0.921875, + "learning_rate": 4.151983310252183e-05, + "loss": 0.8205, + "step": 2258 + }, + { + "epoch": 0.099041826428691, + "grad_norm": 0.890625, + "learning_rate": 4.151609871909688e-05, + "loss": 0.8949, + "step": 2259 + }, + { + "epoch": 0.09908566964534823, + "grad_norm": 1.046875, + "learning_rate": 4.151236445962522e-05, + "loss": 0.8322, + "step": 2260 + }, + { + "epoch": 0.09912951286200547, + "grad_norm": 0.8984375, + "learning_rate": 4.150863032411476e-05, + "loss": 0.8995, + "step": 2261 + }, + { + "epoch": 0.0991733560786627, + "grad_norm": 0.9140625, + "learning_rate": 4.150489631257342e-05, + "loss": 0.8162, + "step": 2262 + }, + { + "epoch": 0.09921719929531994, + "grad_norm": 0.87890625, + "learning_rate": 4.150116242500905e-05, + "loss": 0.7565, + "step": 2263 + }, + { + "epoch": 0.09926104251197716, + "grad_norm": 0.92578125, + "learning_rate": 4.149742866142968e-05, + "loss": 0.768, + "step": 2264 + }, + { + "epoch": 0.0993048857286344, + "grad_norm": 0.859375, + "learning_rate": 4.149369502184316e-05, + "loss": 0.8652, + "step": 2265 + }, + { + "epoch": 0.09934872894529163, + "grad_norm": 0.90625, + "learning_rate": 4.1489961506257414e-05, + "loss": 0.7901, + "step": 2266 + }, + { + "epoch": 0.09939257216194887, + "grad_norm": 0.8828125, + "learning_rate": 4.1486228114680346e-05, + "loss": 0.8283, + "step": 2267 + }, + { + "epoch": 0.0994364153786061, + "grad_norm": 0.81640625, + "learning_rate": 4.148249484711988e-05, + "loss": 0.8064, + "step": 2268 + }, + { + "epoch": 0.09948025859526334, + "grad_norm": 1.09375, + "learning_rate": 4.147876170358388e-05, + "loss": 0.9679, + "step": 2269 + }, + { + "epoch": 0.09952410181192058, + "grad_norm": 0.81640625, + "learning_rate": 4.147502868408033e-05, + "loss": 0.8764, + "step": 2270 + }, + { + "epoch": 0.09956794502857781, + "grad_norm": 0.91796875, + "learning_rate": 4.1471295788617114e-05, + "loss": 0.808, + "step": 2271 + }, + { + "epoch": 0.09961178824523503, + "grad_norm": 0.83203125, + "learning_rate": 4.146756301720215e-05, + "loss": 0.8599, + "step": 2272 + }, + { + "epoch": 0.09965563146189227, + "grad_norm": 0.8359375, + "learning_rate": 4.146383036984334e-05, + "loss": 0.7613, + "step": 2273 + }, + { + "epoch": 0.0996994746785495, + "grad_norm": 0.7890625, + "learning_rate": 4.146009784654855e-05, + "loss": 0.7563, + "step": 2274 + }, + { + "epoch": 0.09974331789520674, + "grad_norm": 0.82421875, + "learning_rate": 4.145636544732577e-05, + "loss": 0.8473, + "step": 2275 + }, + { + "epoch": 0.09978716111186398, + "grad_norm": 0.89453125, + "learning_rate": 4.1452633172182886e-05, + "loss": 0.8122, + "step": 2276 + }, + { + "epoch": 0.09983100432852121, + "grad_norm": 0.81640625, + "learning_rate": 4.144890102112779e-05, + "loss": 0.7837, + "step": 2277 + }, + { + "epoch": 0.09987484754517845, + "grad_norm": 0.89453125, + "learning_rate": 4.144516899416842e-05, + "loss": 0.9087, + "step": 2278 + }, + { + "epoch": 0.09991869076183568, + "grad_norm": 0.91015625, + "learning_rate": 4.144143709131262e-05, + "loss": 0.9386, + "step": 2279 + }, + { + "epoch": 0.0999625339784929, + "grad_norm": 0.74609375, + "learning_rate": 4.143770531256839e-05, + "loss": 0.9155, + "step": 2280 + }, + { + "epoch": 0.10000637719515014, + "grad_norm": 0.80859375, + "learning_rate": 4.143397365794358e-05, + "loss": 0.8863, + "step": 2281 + }, + { + "epoch": 0.10005022041180738, + "grad_norm": 0.78515625, + "learning_rate": 4.1430242127446126e-05, + "loss": 0.8397, + "step": 2282 + }, + { + "epoch": 0.10009406362846461, + "grad_norm": 0.83984375, + "learning_rate": 4.142651072108393e-05, + "loss": 0.7411, + "step": 2283 + }, + { + "epoch": 0.10013790684512185, + "grad_norm": 0.8046875, + "learning_rate": 4.1422779438864855e-05, + "loss": 0.7344, + "step": 2284 + }, + { + "epoch": 0.10018175006177908, + "grad_norm": 0.83984375, + "learning_rate": 4.141904828079689e-05, + "loss": 0.8117, + "step": 2285 + }, + { + "epoch": 0.10022559327843632, + "grad_norm": 0.828125, + "learning_rate": 4.1415317246887894e-05, + "loss": 0.9255, + "step": 2286 + }, + { + "epoch": 0.10026943649509354, + "grad_norm": 0.78515625, + "learning_rate": 4.14115863371458e-05, + "loss": 0.7496, + "step": 2287 + }, + { + "epoch": 0.10031327971175077, + "grad_norm": 0.74609375, + "learning_rate": 4.140785555157849e-05, + "loss": 0.8323, + "step": 2288 + }, + { + "epoch": 0.10035712292840801, + "grad_norm": 0.890625, + "learning_rate": 4.1404124890193854e-05, + "loss": 0.8763, + "step": 2289 + }, + { + "epoch": 0.10040096614506525, + "grad_norm": 0.79296875, + "learning_rate": 4.140039435299985e-05, + "loss": 0.8408, + "step": 2290 + }, + { + "epoch": 0.10044480936172248, + "grad_norm": 0.84375, + "learning_rate": 4.139666394000436e-05, + "loss": 0.8635, + "step": 2291 + }, + { + "epoch": 0.10048865257837972, + "grad_norm": 0.80078125, + "learning_rate": 4.1392933651215306e-05, + "loss": 0.7646, + "step": 2292 + }, + { + "epoch": 0.10053249579503695, + "grad_norm": 0.83984375, + "learning_rate": 4.138920348664057e-05, + "loss": 0.8436, + "step": 2293 + }, + { + "epoch": 0.10057633901169419, + "grad_norm": 0.82421875, + "learning_rate": 4.138547344628804e-05, + "loss": 0.7946, + "step": 2294 + }, + { + "epoch": 0.10062018222835141, + "grad_norm": 0.78125, + "learning_rate": 4.1381743530165675e-05, + "loss": 0.7815, + "step": 2295 + }, + { + "epoch": 0.10066402544500865, + "grad_norm": 0.8203125, + "learning_rate": 4.137801373828135e-05, + "loss": 0.8521, + "step": 2296 + }, + { + "epoch": 0.10070786866166588, + "grad_norm": 0.88671875, + "learning_rate": 4.1374284070642986e-05, + "loss": 0.7907, + "step": 2297 + }, + { + "epoch": 0.10075171187832312, + "grad_norm": 0.8984375, + "learning_rate": 4.137055452725847e-05, + "loss": 0.8372, + "step": 2298 + }, + { + "epoch": 0.10079555509498035, + "grad_norm": 0.80859375, + "learning_rate": 4.136682510813571e-05, + "loss": 0.8389, + "step": 2299 + }, + { + "epoch": 0.10083939831163759, + "grad_norm": 0.78515625, + "learning_rate": 4.136309581328262e-05, + "loss": 0.7947, + "step": 2300 + }, + { + "epoch": 0.10088324152829482, + "grad_norm": 0.82421875, + "learning_rate": 4.135936664270709e-05, + "loss": 0.8702, + "step": 2301 + }, + { + "epoch": 0.10092708474495204, + "grad_norm": 0.76953125, + "learning_rate": 4.135563759641704e-05, + "loss": 0.8517, + "step": 2302 + }, + { + "epoch": 0.10097092796160928, + "grad_norm": 0.859375, + "learning_rate": 4.135190867442031e-05, + "loss": 0.9232, + "step": 2303 + }, + { + "epoch": 0.10101477117826652, + "grad_norm": 0.921875, + "learning_rate": 4.1348179876724916e-05, + "loss": 0.832, + "step": 2304 + }, + { + "epoch": 0.10105861439492375, + "grad_norm": 0.77734375, + "learning_rate": 4.1344451203338685e-05, + "loss": 0.8797, + "step": 2305 + }, + { + "epoch": 0.10110245761158099, + "grad_norm": 0.7734375, + "learning_rate": 4.134072265426955e-05, + "loss": 0.8198, + "step": 2306 + }, + { + "epoch": 0.10114630082823822, + "grad_norm": 0.83203125, + "learning_rate": 4.1336994229525394e-05, + "loss": 0.9005, + "step": 2307 + }, + { + "epoch": 0.10119014404489546, + "grad_norm": 0.93359375, + "learning_rate": 4.133326592911413e-05, + "loss": 0.7736, + "step": 2308 + }, + { + "epoch": 0.1012339872615527, + "grad_norm": 0.828125, + "learning_rate": 4.132953775304361e-05, + "loss": 0.7258, + "step": 2309 + }, + { + "epoch": 0.10127783047820992, + "grad_norm": 0.83984375, + "learning_rate": 4.132580970132183e-05, + "loss": 0.8264, + "step": 2310 + }, + { + "epoch": 0.10132167369486715, + "grad_norm": 0.85546875, + "learning_rate": 4.132208177395662e-05, + "loss": 0.7544, + "step": 2311 + }, + { + "epoch": 0.10136551691152439, + "grad_norm": 0.97265625, + "learning_rate": 4.1318353970955926e-05, + "loss": 0.8401, + "step": 2312 + }, + { + "epoch": 0.10140936012818162, + "grad_norm": 0.83984375, + "learning_rate": 4.131462629232762e-05, + "loss": 0.8379, + "step": 2313 + }, + { + "epoch": 0.10145320334483886, + "grad_norm": 0.77734375, + "learning_rate": 4.131089873807957e-05, + "loss": 0.8587, + "step": 2314 + }, + { + "epoch": 0.1014970465614961, + "grad_norm": 0.7265625, + "learning_rate": 4.130717130821976e-05, + "loss": 0.8018, + "step": 2315 + }, + { + "epoch": 0.10154088977815333, + "grad_norm": 0.75, + "learning_rate": 4.1303444002756044e-05, + "loss": 0.7587, + "step": 2316 + }, + { + "epoch": 0.10158473299481056, + "grad_norm": 1.109375, + "learning_rate": 4.129971682169632e-05, + "loss": 0.8318, + "step": 2317 + }, + { + "epoch": 0.10162857621146779, + "grad_norm": 0.80078125, + "learning_rate": 4.1295989765048495e-05, + "loss": 0.8016, + "step": 2318 + }, + { + "epoch": 0.10167241942812502, + "grad_norm": 0.859375, + "learning_rate": 4.129226283282042e-05, + "loss": 0.9056, + "step": 2319 + }, + { + "epoch": 0.10171626264478226, + "grad_norm": 0.7578125, + "learning_rate": 4.1288536025020074e-05, + "loss": 0.8414, + "step": 2320 + }, + { + "epoch": 0.10176010586143949, + "grad_norm": 0.75, + "learning_rate": 4.1284809341655326e-05, + "loss": 0.7676, + "step": 2321 + }, + { + "epoch": 0.10180394907809673, + "grad_norm": 0.81640625, + "learning_rate": 4.128108278273407e-05, + "loss": 0.7321, + "step": 2322 + }, + { + "epoch": 0.10184779229475396, + "grad_norm": 0.79296875, + "learning_rate": 4.1277356348264195e-05, + "loss": 0.8357, + "step": 2323 + }, + { + "epoch": 0.1018916355114112, + "grad_norm": 0.8125, + "learning_rate": 4.1273630038253586e-05, + "loss": 0.7846, + "step": 2324 + }, + { + "epoch": 0.10193547872806842, + "grad_norm": 0.8671875, + "learning_rate": 4.126990385271018e-05, + "loss": 0.9239, + "step": 2325 + }, + { + "epoch": 0.10197932194472566, + "grad_norm": 0.80078125, + "learning_rate": 4.126617779164187e-05, + "loss": 0.7841, + "step": 2326 + }, + { + "epoch": 0.10202316516138289, + "grad_norm": 0.69921875, + "learning_rate": 4.126245185505653e-05, + "loss": 0.6501, + "step": 2327 + }, + { + "epoch": 0.10206700837804013, + "grad_norm": 0.8125, + "learning_rate": 4.125872604296207e-05, + "loss": 0.7523, + "step": 2328 + }, + { + "epoch": 0.10211085159469736, + "grad_norm": 0.84375, + "learning_rate": 4.125500035536635e-05, + "loss": 0.7294, + "step": 2329 + }, + { + "epoch": 0.1021546948113546, + "grad_norm": 0.78125, + "learning_rate": 4.125127479227733e-05, + "loss": 0.9567, + "step": 2330 + }, + { + "epoch": 0.10219853802801183, + "grad_norm": 0.78125, + "learning_rate": 4.1247549353702875e-05, + "loss": 0.7842, + "step": 2331 + }, + { + "epoch": 0.10224238124466907, + "grad_norm": 0.84375, + "learning_rate": 4.124382403965088e-05, + "loss": 0.751, + "step": 2332 + }, + { + "epoch": 0.10228622446132629, + "grad_norm": 0.78125, + "learning_rate": 4.124009885012924e-05, + "loss": 0.8113, + "step": 2333 + }, + { + "epoch": 0.10233006767798353, + "grad_norm": 0.80078125, + "learning_rate": 4.123637378514582e-05, + "loss": 0.8346, + "step": 2334 + }, + { + "epoch": 0.10237391089464076, + "grad_norm": 0.75390625, + "learning_rate": 4.1232648844708574e-05, + "loss": 0.7833, + "step": 2335 + }, + { + "epoch": 0.102417754111298, + "grad_norm": 0.7890625, + "learning_rate": 4.122892402882539e-05, + "loss": 0.7526, + "step": 2336 + }, + { + "epoch": 0.10246159732795523, + "grad_norm": 0.83984375, + "learning_rate": 4.122519933750413e-05, + "loss": 0.824, + "step": 2337 + }, + { + "epoch": 0.10250544054461247, + "grad_norm": 0.8984375, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8083, + "step": 2338 + }, + { + "epoch": 0.1025492837612697, + "grad_norm": 0.82421875, + "learning_rate": 4.1217750328578995e-05, + "loss": 0.7948, + "step": 2339 + }, + { + "epoch": 0.10259312697792693, + "grad_norm": 0.78125, + "learning_rate": 4.121402601099087e-05, + "loss": 0.758, + "step": 2340 + }, + { + "epoch": 0.10263697019458416, + "grad_norm": 0.8203125, + "learning_rate": 4.12103018179963e-05, + "loss": 0.9317, + "step": 2341 + }, + { + "epoch": 0.1026808134112414, + "grad_norm": 0.7734375, + "learning_rate": 4.120657774960313e-05, + "loss": 0.7564, + "step": 2342 + }, + { + "epoch": 0.10272465662789863, + "grad_norm": 0.8671875, + "learning_rate": 4.1202853805819254e-05, + "loss": 0.9203, + "step": 2343 + }, + { + "epoch": 0.10276849984455587, + "grad_norm": 0.78515625, + "learning_rate": 4.1199129986652575e-05, + "loss": 0.7309, + "step": 2344 + }, + { + "epoch": 0.1028123430612131, + "grad_norm": 0.82421875, + "learning_rate": 4.119540629211097e-05, + "loss": 0.6878, + "step": 2345 + }, + { + "epoch": 0.10285618627787034, + "grad_norm": 0.83203125, + "learning_rate": 4.119168272220235e-05, + "loss": 0.7919, + "step": 2346 + }, + { + "epoch": 0.10290002949452758, + "grad_norm": 0.75390625, + "learning_rate": 4.11879592769346e-05, + "loss": 0.8081, + "step": 2347 + }, + { + "epoch": 0.1029438727111848, + "grad_norm": 0.796875, + "learning_rate": 4.118423595631561e-05, + "loss": 0.748, + "step": 2348 + }, + { + "epoch": 0.10298771592784203, + "grad_norm": 0.93359375, + "learning_rate": 4.1180512760353216e-05, + "loss": 0.8306, + "step": 2349 + }, + { + "epoch": 0.10303155914449927, + "grad_norm": 0.828125, + "learning_rate": 4.117678968905542e-05, + "loss": 0.8109, + "step": 2350 + }, + { + "epoch": 0.1030754023611565, + "grad_norm": 0.85546875, + "learning_rate": 4.1173066742430044e-05, + "loss": 0.8426, + "step": 2351 + }, + { + "epoch": 0.10311924557781374, + "grad_norm": 0.8984375, + "learning_rate": 4.116934392048499e-05, + "loss": 0.7172, + "step": 2352 + }, + { + "epoch": 0.10316308879447098, + "grad_norm": 0.78125, + "learning_rate": 4.1165621223228156e-05, + "loss": 0.8018, + "step": 2353 + }, + { + "epoch": 0.10320693201112821, + "grad_norm": 0.91015625, + "learning_rate": 4.1161898650667384e-05, + "loss": 0.7655, + "step": 2354 + }, + { + "epoch": 0.10325077522778543, + "grad_norm": 0.81640625, + "learning_rate": 4.115817620281065e-05, + "loss": 0.8041, + "step": 2355 + }, + { + "epoch": 0.10329461844444267, + "grad_norm": 0.8828125, + "learning_rate": 4.115445387966579e-05, + "loss": 0.7794, + "step": 2356 + }, + { + "epoch": 0.1033384616610999, + "grad_norm": 1.828125, + "learning_rate": 4.115073168124069e-05, + "loss": 0.8109, + "step": 2357 + }, + { + "epoch": 0.10338230487775714, + "grad_norm": 0.8359375, + "learning_rate": 4.1147009607543274e-05, + "loss": 0.8795, + "step": 2358 + }, + { + "epoch": 0.10342614809441437, + "grad_norm": 0.7890625, + "learning_rate": 4.114328765858135e-05, + "loss": 0.8159, + "step": 2359 + }, + { + "epoch": 0.10346999131107161, + "grad_norm": 0.84765625, + "learning_rate": 4.113956583436291e-05, + "loss": 0.8462, + "step": 2360 + }, + { + "epoch": 0.10351383452772885, + "grad_norm": 0.8046875, + "learning_rate": 4.1135844134895805e-05, + "loss": 0.9103, + "step": 2361 + }, + { + "epoch": 0.10355767774438608, + "grad_norm": 0.765625, + "learning_rate": 4.11321225601879e-05, + "loss": 0.8685, + "step": 2362 + }, + { + "epoch": 0.1036015209610433, + "grad_norm": 0.875, + "learning_rate": 4.1128401110247105e-05, + "loss": 0.8727, + "step": 2363 + }, + { + "epoch": 0.10364536417770054, + "grad_norm": 0.77734375, + "learning_rate": 4.1124679785081254e-05, + "loss": 0.9091, + "step": 2364 + }, + { + "epoch": 0.10368920739435777, + "grad_norm": 0.9140625, + "learning_rate": 4.112095858469832e-05, + "loss": 0.892, + "step": 2365 + }, + { + "epoch": 0.10373305061101501, + "grad_norm": 0.80859375, + "learning_rate": 4.111723750910615e-05, + "loss": 0.9788, + "step": 2366 + }, + { + "epoch": 0.10377689382767225, + "grad_norm": 0.828125, + "learning_rate": 4.1113516558312635e-05, + "loss": 0.8061, + "step": 2367 + }, + { + "epoch": 0.10382073704432948, + "grad_norm": 0.78515625, + "learning_rate": 4.110979573232565e-05, + "loss": 0.8829, + "step": 2368 + }, + { + "epoch": 0.10386458026098672, + "grad_norm": 0.81640625, + "learning_rate": 4.1106075031153046e-05, + "loss": 0.7691, + "step": 2369 + }, + { + "epoch": 0.10390842347764395, + "grad_norm": 1.0078125, + "learning_rate": 4.1102354454802797e-05, + "loss": 0.8184, + "step": 2370 + }, + { + "epoch": 0.10395226669430117, + "grad_norm": 0.7890625, + "learning_rate": 4.109863400328273e-05, + "loss": 0.7778, + "step": 2371 + }, + { + "epoch": 0.10399610991095841, + "grad_norm": 0.828125, + "learning_rate": 4.1094913676600744e-05, + "loss": 0.7305, + "step": 2372 + }, + { + "epoch": 0.10403995312761564, + "grad_norm": 0.78125, + "learning_rate": 4.109119347476472e-05, + "loss": 0.757, + "step": 2373 + }, + { + "epoch": 0.10408379634427288, + "grad_norm": 0.9140625, + "learning_rate": 4.108747339778252e-05, + "loss": 0.9152, + "step": 2374 + }, + { + "epoch": 0.10412763956093012, + "grad_norm": 0.83203125, + "learning_rate": 4.1083753445662075e-05, + "loss": 0.8698, + "step": 2375 + }, + { + "epoch": 0.10417148277758735, + "grad_norm": 0.796875, + "learning_rate": 4.1080033618411264e-05, + "loss": 0.8003, + "step": 2376 + }, + { + "epoch": 0.10421532599424459, + "grad_norm": 0.86328125, + "learning_rate": 4.107631391603794e-05, + "loss": 0.8718, + "step": 2377 + }, + { + "epoch": 0.10425916921090181, + "grad_norm": 0.75390625, + "learning_rate": 4.1072594338549996e-05, + "loss": 0.7128, + "step": 2378 + }, + { + "epoch": 0.10430301242755904, + "grad_norm": 0.75, + "learning_rate": 4.10688748859553e-05, + "loss": 0.6882, + "step": 2379 + }, + { + "epoch": 0.10434685564421628, + "grad_norm": 0.8984375, + "learning_rate": 4.106515555826178e-05, + "loss": 0.8492, + "step": 2380 + }, + { + "epoch": 0.10439069886087352, + "grad_norm": 0.76171875, + "learning_rate": 4.10614363554773e-05, + "loss": 0.7037, + "step": 2381 + }, + { + "epoch": 0.10443454207753075, + "grad_norm": 0.93359375, + "learning_rate": 4.1057717277609734e-05, + "loss": 0.9293, + "step": 2382 + }, + { + "epoch": 0.10447838529418799, + "grad_norm": 0.75, + "learning_rate": 4.105399832466697e-05, + "loss": 0.8702, + "step": 2383 + }, + { + "epoch": 0.10452222851084522, + "grad_norm": 0.8046875, + "learning_rate": 4.1050279496656886e-05, + "loss": 0.8728, + "step": 2384 + }, + { + "epoch": 0.10456607172750246, + "grad_norm": 1.0078125, + "learning_rate": 4.104656079358736e-05, + "loss": 0.9113, + "step": 2385 + }, + { + "epoch": 0.10460991494415968, + "grad_norm": 0.86328125, + "learning_rate": 4.1042842215466284e-05, + "loss": 0.7849, + "step": 2386 + }, + { + "epoch": 0.10465375816081691, + "grad_norm": 0.86328125, + "learning_rate": 4.103912376230153e-05, + "loss": 0.793, + "step": 2387 + }, + { + "epoch": 0.10469760137747415, + "grad_norm": 0.75, + "learning_rate": 4.103540543410095e-05, + "loss": 0.7966, + "step": 2388 + }, + { + "epoch": 0.10474144459413139, + "grad_norm": 0.85546875, + "learning_rate": 4.10316872308725e-05, + "loss": 0.8649, + "step": 2389 + }, + { + "epoch": 0.10478528781078862, + "grad_norm": 0.79296875, + "learning_rate": 4.102796915262401e-05, + "loss": 0.8874, + "step": 2390 + }, + { + "epoch": 0.10482913102744586, + "grad_norm": 1.03125, + "learning_rate": 4.102425119936337e-05, + "loss": 0.8291, + "step": 2391 + }, + { + "epoch": 0.10487297424410309, + "grad_norm": 0.80078125, + "learning_rate": 4.102053337109846e-05, + "loss": 0.8257, + "step": 2392 + }, + { + "epoch": 0.10491681746076031, + "grad_norm": 0.953125, + "learning_rate": 4.101681566783716e-05, + "loss": 0.8598, + "step": 2393 + }, + { + "epoch": 0.10496066067741755, + "grad_norm": 0.82421875, + "learning_rate": 4.101309808958731e-05, + "loss": 0.7072, + "step": 2394 + }, + { + "epoch": 0.10500450389407479, + "grad_norm": 0.7734375, + "learning_rate": 4.1009380636356866e-05, + "loss": 0.7685, + "step": 2395 + }, + { + "epoch": 0.10504834711073202, + "grad_norm": 0.81640625, + "learning_rate": 4.100566330815366e-05, + "loss": 0.7631, + "step": 2396 + }, + { + "epoch": 0.10509219032738926, + "grad_norm": 0.84375, + "learning_rate": 4.100194610498559e-05, + "loss": 0.7785, + "step": 2397 + }, + { + "epoch": 0.10513603354404649, + "grad_norm": 0.76171875, + "learning_rate": 4.0998229026860515e-05, + "loss": 0.7899, + "step": 2398 + }, + { + "epoch": 0.10517987676070373, + "grad_norm": 0.875, + "learning_rate": 4.099451207378628e-05, + "loss": 0.904, + "step": 2399 + }, + { + "epoch": 0.10522371997736096, + "grad_norm": 0.90625, + "learning_rate": 4.099079524577085e-05, + "loss": 0.9077, + "step": 2400 + }, + { + "epoch": 0.10526756319401818, + "grad_norm": 0.7265625, + "learning_rate": 4.098707854282205e-05, + "loss": 0.7446, + "step": 2401 + }, + { + "epoch": 0.10531140641067542, + "grad_norm": 0.8515625, + "learning_rate": 4.098336196494778e-05, + "loss": 0.8684, + "step": 2402 + }, + { + "epoch": 0.10535524962733266, + "grad_norm": 0.75, + "learning_rate": 4.097964551215588e-05, + "loss": 0.7346, + "step": 2403 + }, + { + "epoch": 0.10539909284398989, + "grad_norm": 0.859375, + "learning_rate": 4.097592918445423e-05, + "loss": 0.7118, + "step": 2404 + }, + { + "epoch": 0.10544293606064713, + "grad_norm": 0.703125, + "learning_rate": 4.097221298185074e-05, + "loss": 0.6787, + "step": 2405 + }, + { + "epoch": 0.10548677927730436, + "grad_norm": 0.86328125, + "learning_rate": 4.0968496904353295e-05, + "loss": 0.8686, + "step": 2406 + }, + { + "epoch": 0.1055306224939616, + "grad_norm": 0.79296875, + "learning_rate": 4.096478095196973e-05, + "loss": 0.8059, + "step": 2407 + }, + { + "epoch": 0.10557446571061883, + "grad_norm": 0.85546875, + "learning_rate": 4.096106512470794e-05, + "loss": 0.7445, + "step": 2408 + }, + { + "epoch": 0.10561830892727606, + "grad_norm": 0.87109375, + "learning_rate": 4.095734942257576e-05, + "loss": 0.764, + "step": 2409 + }, + { + "epoch": 0.10566215214393329, + "grad_norm": 0.80078125, + "learning_rate": 4.095363384558115e-05, + "loss": 0.9505, + "step": 2410 + }, + { + "epoch": 0.10570599536059053, + "grad_norm": 0.765625, + "learning_rate": 4.094991839373194e-05, + "loss": 0.7709, + "step": 2411 + }, + { + "epoch": 0.10574983857724776, + "grad_norm": 0.87109375, + "learning_rate": 4.094620306703598e-05, + "loss": 0.8991, + "step": 2412 + }, + { + "epoch": 0.105793681793905, + "grad_norm": 0.8984375, + "learning_rate": 4.094248786550119e-05, + "loss": 0.8065, + "step": 2413 + }, + { + "epoch": 0.10583752501056223, + "grad_norm": 0.796875, + "learning_rate": 4.0938772789135405e-05, + "loss": 0.7246, + "step": 2414 + }, + { + "epoch": 0.10588136822721947, + "grad_norm": 1.1328125, + "learning_rate": 4.093505783794649e-05, + "loss": 0.7365, + "step": 2415 + }, + { + "epoch": 0.10592521144387669, + "grad_norm": 1.0, + "learning_rate": 4.093134301194237e-05, + "loss": 0.7862, + "step": 2416 + }, + { + "epoch": 0.10596905466053393, + "grad_norm": 0.7890625, + "learning_rate": 4.09276283111309e-05, + "loss": 0.8802, + "step": 2417 + }, + { + "epoch": 0.10601289787719116, + "grad_norm": 0.859375, + "learning_rate": 4.092391373551995e-05, + "loss": 0.8609, + "step": 2418 + }, + { + "epoch": 0.1060567410938484, + "grad_norm": 0.8359375, + "learning_rate": 4.092019928511738e-05, + "loss": 1.032, + "step": 2419 + }, + { + "epoch": 0.10610058431050563, + "grad_norm": 0.76953125, + "learning_rate": 4.091648495993103e-05, + "loss": 0.8259, + "step": 2420 + }, + { + "epoch": 0.10614442752716287, + "grad_norm": 0.8828125, + "learning_rate": 4.091277075996887e-05, + "loss": 0.8854, + "step": 2421 + }, + { + "epoch": 0.1061882707438201, + "grad_norm": 0.8203125, + "learning_rate": 4.09090566852387e-05, + "loss": 0.8198, + "step": 2422 + }, + { + "epoch": 0.10623211396047734, + "grad_norm": 0.8125, + "learning_rate": 4.090534273574841e-05, + "loss": 0.862, + "step": 2423 + }, + { + "epoch": 0.10627595717713456, + "grad_norm": 0.89453125, + "learning_rate": 4.090162891150586e-05, + "loss": 0.9577, + "step": 2424 + }, + { + "epoch": 0.1063198003937918, + "grad_norm": 0.84375, + "learning_rate": 4.08979152125189e-05, + "loss": 0.8275, + "step": 2425 + }, + { + "epoch": 0.10636364361044903, + "grad_norm": 0.77734375, + "learning_rate": 4.089420163879547e-05, + "loss": 0.7628, + "step": 2426 + }, + { + "epoch": 0.10640748682710627, + "grad_norm": 0.8046875, + "learning_rate": 4.089048819034339e-05, + "loss": 0.7636, + "step": 2427 + }, + { + "epoch": 0.1064513300437635, + "grad_norm": 0.76953125, + "learning_rate": 4.088677486717055e-05, + "loss": 0.7873, + "step": 2428 + }, + { + "epoch": 0.10649517326042074, + "grad_norm": 0.7890625, + "learning_rate": 4.088306166928482e-05, + "loss": 0.7949, + "step": 2429 + }, + { + "epoch": 0.10653901647707797, + "grad_norm": 0.83203125, + "learning_rate": 4.087934859669406e-05, + "loss": 0.9301, + "step": 2430 + }, + { + "epoch": 0.1065828596937352, + "grad_norm": 0.81640625, + "learning_rate": 4.087563564940613e-05, + "loss": 0.8406, + "step": 2431 + }, + { + "epoch": 0.10662670291039243, + "grad_norm": 0.9140625, + "learning_rate": 4.087192282742891e-05, + "loss": 0.969, + "step": 2432 + }, + { + "epoch": 0.10667054612704967, + "grad_norm": 0.80078125, + "learning_rate": 4.0868210130770276e-05, + "loss": 0.7501, + "step": 2433 + }, + { + "epoch": 0.1067143893437069, + "grad_norm": 0.80859375, + "learning_rate": 4.0864497559438096e-05, + "loss": 0.7579, + "step": 2434 + }, + { + "epoch": 0.10675823256036414, + "grad_norm": 0.8203125, + "learning_rate": 4.0860785113440194e-05, + "loss": 0.7813, + "step": 2435 + }, + { + "epoch": 0.10680207577702137, + "grad_norm": 0.87890625, + "learning_rate": 4.0857072792784514e-05, + "loss": 0.8551, + "step": 2436 + }, + { + "epoch": 0.10684591899367861, + "grad_norm": 0.78125, + "learning_rate": 4.0853360597478875e-05, + "loss": 0.7635, + "step": 2437 + }, + { + "epoch": 0.10688976221033585, + "grad_norm": 0.80078125, + "learning_rate": 4.084964852753117e-05, + "loss": 0.7455, + "step": 2438 + }, + { + "epoch": 0.10693360542699307, + "grad_norm": 0.88671875, + "learning_rate": 4.084593658294925e-05, + "loss": 0.9888, + "step": 2439 + }, + { + "epoch": 0.1069774486436503, + "grad_norm": 0.73828125, + "learning_rate": 4.084222476374096e-05, + "loss": 0.7078, + "step": 2440 + }, + { + "epoch": 0.10702129186030754, + "grad_norm": 0.81640625, + "learning_rate": 4.083851306991422e-05, + "loss": 0.7729, + "step": 2441 + }, + { + "epoch": 0.10706513507696477, + "grad_norm": 0.89453125, + "learning_rate": 4.083480150147687e-05, + "loss": 0.8201, + "step": 2442 + }, + { + "epoch": 0.10710897829362201, + "grad_norm": 0.83984375, + "learning_rate": 4.0831090058436785e-05, + "loss": 0.7541, + "step": 2443 + }, + { + "epoch": 0.10715282151027924, + "grad_norm": 0.828125, + "learning_rate": 4.082737874080182e-05, + "loss": 0.8279, + "step": 2444 + }, + { + "epoch": 0.10719666472693648, + "grad_norm": 0.81640625, + "learning_rate": 4.0823667548579794e-05, + "loss": 0.8276, + "step": 2445 + }, + { + "epoch": 0.1072405079435937, + "grad_norm": 1.0390625, + "learning_rate": 4.0819956481778664e-05, + "loss": 0.8684, + "step": 2446 + }, + { + "epoch": 0.10728435116025094, + "grad_norm": 0.82421875, + "learning_rate": 4.081624554040626e-05, + "loss": 0.8446, + "step": 2447 + }, + { + "epoch": 0.10732819437690817, + "grad_norm": 0.890625, + "learning_rate": 4.081253472447044e-05, + "loss": 0.9204, + "step": 2448 + }, + { + "epoch": 0.10737203759356541, + "grad_norm": 0.8671875, + "learning_rate": 4.0808824033979066e-05, + "loss": 0.7135, + "step": 2449 + }, + { + "epoch": 0.10741588081022264, + "grad_norm": 0.7890625, + "learning_rate": 4.080511346893998e-05, + "loss": 0.8773, + "step": 2450 + }, + { + "epoch": 0.10745972402687988, + "grad_norm": 0.72265625, + "learning_rate": 4.0801403029361096e-05, + "loss": 0.675, + "step": 2451 + }, + { + "epoch": 0.10750356724353712, + "grad_norm": 0.859375, + "learning_rate": 4.0797692715250256e-05, + "loss": 1.0203, + "step": 2452 + }, + { + "epoch": 0.10754741046019435, + "grad_norm": 0.8046875, + "learning_rate": 4.0793982526615325e-05, + "loss": 0.8767, + "step": 2453 + }, + { + "epoch": 0.10759125367685157, + "grad_norm": 0.84765625, + "learning_rate": 4.079027246346417e-05, + "loss": 0.7784, + "step": 2454 + }, + { + "epoch": 0.10763509689350881, + "grad_norm": 0.953125, + "learning_rate": 4.078656252580461e-05, + "loss": 0.8648, + "step": 2455 + }, + { + "epoch": 0.10767894011016604, + "grad_norm": 0.8671875, + "learning_rate": 4.0782852713644585e-05, + "loss": 0.8441, + "step": 2456 + }, + { + "epoch": 0.10772278332682328, + "grad_norm": 0.8828125, + "learning_rate": 4.0779143026991906e-05, + "loss": 0.7581, + "step": 2457 + }, + { + "epoch": 0.10776662654348051, + "grad_norm": 0.83984375, + "learning_rate": 4.077543346585446e-05, + "loss": 0.7686, + "step": 2458 + }, + { + "epoch": 0.10781046976013775, + "grad_norm": 0.86328125, + "learning_rate": 4.0771724030240086e-05, + "loss": 0.776, + "step": 2459 + }, + { + "epoch": 0.10785431297679499, + "grad_norm": 0.8828125, + "learning_rate": 4.076801472015663e-05, + "loss": 0.7603, + "step": 2460 + }, + { + "epoch": 0.10789815619345222, + "grad_norm": 0.83984375, + "learning_rate": 4.076430553561203e-05, + "loss": 0.8774, + "step": 2461 + }, + { + "epoch": 0.10794199941010944, + "grad_norm": 0.80859375, + "learning_rate": 4.0760596476614075e-05, + "loss": 0.8253, + "step": 2462 + }, + { + "epoch": 0.10798584262676668, + "grad_norm": 0.74609375, + "learning_rate": 4.0756887543170666e-05, + "loss": 0.7407, + "step": 2463 + }, + { + "epoch": 0.10802968584342391, + "grad_norm": 0.859375, + "learning_rate": 4.0753178735289645e-05, + "loss": 0.8064, + "step": 2464 + }, + { + "epoch": 0.10807352906008115, + "grad_norm": 0.8828125, + "learning_rate": 4.074947005297883e-05, + "loss": 0.8405, + "step": 2465 + }, + { + "epoch": 0.10811737227673839, + "grad_norm": 0.81640625, + "learning_rate": 4.074576149624617e-05, + "loss": 0.8381, + "step": 2466 + }, + { + "epoch": 0.10816121549339562, + "grad_norm": 0.77734375, + "learning_rate": 4.0742053065099485e-05, + "loss": 0.7561, + "step": 2467 + }, + { + "epoch": 0.10820505871005286, + "grad_norm": 0.875, + "learning_rate": 4.0738344759546624e-05, + "loss": 0.8854, + "step": 2468 + }, + { + "epoch": 0.10824890192671008, + "grad_norm": 0.796875, + "learning_rate": 4.0734636579595453e-05, + "loss": 0.8196, + "step": 2469 + }, + { + "epoch": 0.10829274514336731, + "grad_norm": 1.1015625, + "learning_rate": 4.073092852525384e-05, + "loss": 0.833, + "step": 2470 + }, + { + "epoch": 0.10833658836002455, + "grad_norm": 0.73046875, + "learning_rate": 4.072722059652963e-05, + "loss": 0.7102, + "step": 2471 + }, + { + "epoch": 0.10838043157668178, + "grad_norm": 0.8359375, + "learning_rate": 4.0723512793430687e-05, + "loss": 0.7797, + "step": 2472 + }, + { + "epoch": 0.10842427479333902, + "grad_norm": 0.8046875, + "learning_rate": 4.0719805115964885e-05, + "loss": 0.8407, + "step": 2473 + }, + { + "epoch": 0.10846811800999626, + "grad_norm": 0.875, + "learning_rate": 4.071609756414001e-05, + "loss": 0.8778, + "step": 2474 + }, + { + "epoch": 0.10851196122665349, + "grad_norm": 0.83984375, + "learning_rate": 4.071239013796403e-05, + "loss": 0.9308, + "step": 2475 + }, + { + "epoch": 0.10855580444331073, + "grad_norm": 0.8203125, + "learning_rate": 4.070868283744475e-05, + "loss": 0.7517, + "step": 2476 + }, + { + "epoch": 0.10859964765996795, + "grad_norm": 0.73828125, + "learning_rate": 4.0704975662590014e-05, + "loss": 0.7517, + "step": 2477 + }, + { + "epoch": 0.10864349087662518, + "grad_norm": 0.79296875, + "learning_rate": 4.07012686134077e-05, + "loss": 0.7749, + "step": 2478 + }, + { + "epoch": 0.10868733409328242, + "grad_norm": 0.8984375, + "learning_rate": 4.069756168990566e-05, + "loss": 0.9334, + "step": 2479 + }, + { + "epoch": 0.10873117730993966, + "grad_norm": 1.2578125, + "learning_rate": 4.069385489209172e-05, + "loss": 1.0065, + "step": 2480 + }, + { + "epoch": 0.10877502052659689, + "grad_norm": 0.81640625, + "learning_rate": 4.069014821997379e-05, + "loss": 0.8308, + "step": 2481 + }, + { + "epoch": 0.10881886374325413, + "grad_norm": 0.87109375, + "learning_rate": 4.068644167355971e-05, + "loss": 0.9292, + "step": 2482 + }, + { + "epoch": 0.10886270695991136, + "grad_norm": 0.80859375, + "learning_rate": 4.068273525285732e-05, + "loss": 0.8636, + "step": 2483 + }, + { + "epoch": 0.10890655017656858, + "grad_norm": 0.75, + "learning_rate": 4.067902895787448e-05, + "loss": 0.8187, + "step": 2484 + }, + { + "epoch": 0.10895039339322582, + "grad_norm": 0.76171875, + "learning_rate": 4.067532278861901e-05, + "loss": 0.7753, + "step": 2485 + }, + { + "epoch": 0.10899423660988306, + "grad_norm": 0.8671875, + "learning_rate": 4.0671616745098836e-05, + "loss": 0.9302, + "step": 2486 + }, + { + "epoch": 0.10903807982654029, + "grad_norm": 0.78515625, + "learning_rate": 4.06679108273218e-05, + "loss": 0.7239, + "step": 2487 + }, + { + "epoch": 0.10908192304319753, + "grad_norm": 0.80859375, + "learning_rate": 4.066420503529571e-05, + "loss": 0.7217, + "step": 2488 + }, + { + "epoch": 0.10912576625985476, + "grad_norm": 0.796875, + "learning_rate": 4.066049936902846e-05, + "loss": 0.8517, + "step": 2489 + }, + { + "epoch": 0.109169609476512, + "grad_norm": 0.859375, + "learning_rate": 4.065679382852785e-05, + "loss": 0.8648, + "step": 2490 + }, + { + "epoch": 0.10921345269316923, + "grad_norm": 0.87109375, + "learning_rate": 4.065308841380182e-05, + "loss": 0.7435, + "step": 2491 + }, + { + "epoch": 0.10925729590982645, + "grad_norm": 0.75, + "learning_rate": 4.064938312485816e-05, + "loss": 0.9357, + "step": 2492 + }, + { + "epoch": 0.10930113912648369, + "grad_norm": 0.91796875, + "learning_rate": 4.0645677961704755e-05, + "loss": 0.8079, + "step": 2493 + }, + { + "epoch": 0.10934498234314093, + "grad_norm": 0.875, + "learning_rate": 4.0641972924349435e-05, + "loss": 0.8139, + "step": 2494 + }, + { + "epoch": 0.10938882555979816, + "grad_norm": 0.8828125, + "learning_rate": 4.063826801280003e-05, + "loss": 0.7839, + "step": 2495 + }, + { + "epoch": 0.1094326687764554, + "grad_norm": 0.73828125, + "learning_rate": 4.0634563227064445e-05, + "loss": 0.7808, + "step": 2496 + }, + { + "epoch": 0.10947651199311263, + "grad_norm": 1.0859375, + "learning_rate": 4.063085856715052e-05, + "loss": 0.7921, + "step": 2497 + }, + { + "epoch": 0.10952035520976987, + "grad_norm": 0.8203125, + "learning_rate": 4.06271540330661e-05, + "loss": 0.7279, + "step": 2498 + }, + { + "epoch": 0.1095641984264271, + "grad_norm": 0.953125, + "learning_rate": 4.062344962481903e-05, + "loss": 0.69, + "step": 2499 + }, + { + "epoch": 0.10960804164308433, + "grad_norm": 0.85546875, + "learning_rate": 4.061974534241714e-05, + "loss": 1.0217, + "step": 2500 + }, + { + "epoch": 0.10965188485974156, + "grad_norm": 0.81640625, + "learning_rate": 4.061604118586833e-05, + "loss": 0.7581, + "step": 2501 + }, + { + "epoch": 0.1096957280763988, + "grad_norm": 0.76171875, + "learning_rate": 4.0612337155180426e-05, + "loss": 0.7527, + "step": 2502 + }, + { + "epoch": 0.10973957129305603, + "grad_norm": 0.8046875, + "learning_rate": 4.0608633250361275e-05, + "loss": 0.8027, + "step": 2503 + }, + { + "epoch": 0.10978341450971327, + "grad_norm": 0.7890625, + "learning_rate": 4.060492947141874e-05, + "loss": 0.785, + "step": 2504 + }, + { + "epoch": 0.1098272577263705, + "grad_norm": 0.77734375, + "learning_rate": 4.060122581836062e-05, + "loss": 0.7968, + "step": 2505 + }, + { + "epoch": 0.10987110094302774, + "grad_norm": 0.8671875, + "learning_rate": 4.059752229119484e-05, + "loss": 0.8141, + "step": 2506 + }, + { + "epoch": 0.10991494415968496, + "grad_norm": 0.78515625, + "learning_rate": 4.059381888992922e-05, + "loss": 0.7511, + "step": 2507 + }, + { + "epoch": 0.1099587873763422, + "grad_norm": 1.0390625, + "learning_rate": 4.059011561457161e-05, + "loss": 0.7074, + "step": 2508 + }, + { + "epoch": 0.11000263059299943, + "grad_norm": 0.74609375, + "learning_rate": 4.0586412465129855e-05, + "loss": 0.8034, + "step": 2509 + }, + { + "epoch": 0.11004647380965667, + "grad_norm": 0.81640625, + "learning_rate": 4.05827094416118e-05, + "loss": 0.8901, + "step": 2510 + }, + { + "epoch": 0.1100903170263139, + "grad_norm": 0.765625, + "learning_rate": 4.0579006544025256e-05, + "loss": 0.7689, + "step": 2511 + }, + { + "epoch": 0.11013416024297114, + "grad_norm": 0.8203125, + "learning_rate": 4.0575303772378146e-05, + "loss": 0.7753, + "step": 2512 + }, + { + "epoch": 0.11017800345962837, + "grad_norm": 0.76171875, + "learning_rate": 4.0571601126678296e-05, + "loss": 0.8131, + "step": 2513 + }, + { + "epoch": 0.11022184667628561, + "grad_norm": 0.83984375, + "learning_rate": 4.056789860693353e-05, + "loss": 0.8155, + "step": 2514 + }, + { + "epoch": 0.11026568989294283, + "grad_norm": 0.7890625, + "learning_rate": 4.0564196213151704e-05, + "loss": 0.8604, + "step": 2515 + }, + { + "epoch": 0.11030953310960007, + "grad_norm": 0.79296875, + "learning_rate": 4.0560493945340675e-05, + "loss": 0.7332, + "step": 2516 + }, + { + "epoch": 0.1103533763262573, + "grad_norm": 0.78125, + "learning_rate": 4.0556791803508274e-05, + "loss": 0.8861, + "step": 2517 + }, + { + "epoch": 0.11039721954291454, + "grad_norm": 0.8359375, + "learning_rate": 4.055308978766237e-05, + "loss": 0.7974, + "step": 2518 + }, + { + "epoch": 0.11044106275957177, + "grad_norm": 0.93359375, + "learning_rate": 4.0549387897810776e-05, + "loss": 0.8358, + "step": 2519 + }, + { + "epoch": 0.11048490597622901, + "grad_norm": 0.74609375, + "learning_rate": 4.0545686133961326e-05, + "loss": 0.7546, + "step": 2520 + }, + { + "epoch": 0.11052874919288624, + "grad_norm": 0.8515625, + "learning_rate": 4.054198449612193e-05, + "loss": 0.8113, + "step": 2521 + }, + { + "epoch": 0.11057259240954347, + "grad_norm": 0.71484375, + "learning_rate": 4.053828298430041e-05, + "loss": 0.7413, + "step": 2522 + }, + { + "epoch": 0.1106164356262007, + "grad_norm": 0.73828125, + "learning_rate": 4.0534581598504595e-05, + "loss": 0.7223, + "step": 2523 + }, + { + "epoch": 0.11066027884285794, + "grad_norm": 0.75390625, + "learning_rate": 4.053088033874234e-05, + "loss": 0.6382, + "step": 2524 + }, + { + "epoch": 0.11070412205951517, + "grad_norm": 0.734375, + "learning_rate": 4.0527179205021435e-05, + "loss": 0.6796, + "step": 2525 + }, + { + "epoch": 0.11074796527617241, + "grad_norm": 0.78515625, + "learning_rate": 4.052347819734982e-05, + "loss": 0.9279, + "step": 2526 + }, + { + "epoch": 0.11079180849282964, + "grad_norm": 0.79296875, + "learning_rate": 4.0519777315735294e-05, + "loss": 0.9045, + "step": 2527 + }, + { + "epoch": 0.11083565170948688, + "grad_norm": 0.80859375, + "learning_rate": 4.05160765601857e-05, + "loss": 0.9074, + "step": 2528 + }, + { + "epoch": 0.11087949492614411, + "grad_norm": 0.81640625, + "learning_rate": 4.051237593070888e-05, + "loss": 0.7648, + "step": 2529 + }, + { + "epoch": 0.11092333814280134, + "grad_norm": 0.8515625, + "learning_rate": 4.050867542731264e-05, + "loss": 0.8532, + "step": 2530 + }, + { + "epoch": 0.11096718135945857, + "grad_norm": 0.85546875, + "learning_rate": 4.05049750500049e-05, + "loss": 0.7256, + "step": 2531 + }, + { + "epoch": 0.11101102457611581, + "grad_norm": 0.8125, + "learning_rate": 4.0501274798793466e-05, + "loss": 0.8057, + "step": 2532 + }, + { + "epoch": 0.11105486779277304, + "grad_norm": 0.78125, + "learning_rate": 4.0497574673686176e-05, + "loss": 0.7963, + "step": 2533 + }, + { + "epoch": 0.11109871100943028, + "grad_norm": 1.1796875, + "learning_rate": 4.049387467469088e-05, + "loss": 0.953, + "step": 2534 + }, + { + "epoch": 0.11114255422608751, + "grad_norm": 0.84375, + "learning_rate": 4.0490174801815384e-05, + "loss": 0.8029, + "step": 2535 + }, + { + "epoch": 0.11118639744274475, + "grad_norm": 0.7578125, + "learning_rate": 4.048647505506759e-05, + "loss": 0.7228, + "step": 2536 + }, + { + "epoch": 0.11123024065940199, + "grad_norm": 0.7265625, + "learning_rate": 4.048277543445531e-05, + "loss": 0.8357, + "step": 2537 + }, + { + "epoch": 0.11127408387605921, + "grad_norm": 0.84375, + "learning_rate": 4.0479075939986385e-05, + "loss": 0.7637, + "step": 2538 + }, + { + "epoch": 0.11131792709271644, + "grad_norm": 0.86328125, + "learning_rate": 4.0475376571668665e-05, + "loss": 0.7657, + "step": 2539 + }, + { + "epoch": 0.11136177030937368, + "grad_norm": 0.8828125, + "learning_rate": 4.0471677329509936e-05, + "loss": 0.9056, + "step": 2540 + }, + { + "epoch": 0.11140561352603091, + "grad_norm": 0.7421875, + "learning_rate": 4.0467978213518133e-05, + "loss": 0.8373, + "step": 2541 + }, + { + "epoch": 0.11144945674268815, + "grad_norm": 0.87109375, + "learning_rate": 4.046427922370103e-05, + "loss": 0.8158, + "step": 2542 + }, + { + "epoch": 0.11149329995934538, + "grad_norm": 0.87890625, + "learning_rate": 4.046058036006649e-05, + "loss": 0.8202, + "step": 2543 + }, + { + "epoch": 0.11153714317600262, + "grad_norm": 0.7890625, + "learning_rate": 4.045688162262235e-05, + "loss": 0.772, + "step": 2544 + }, + { + "epoch": 0.11158098639265984, + "grad_norm": 0.8671875, + "learning_rate": 4.045318301137642e-05, + "loss": 0.8868, + "step": 2545 + }, + { + "epoch": 0.11162482960931708, + "grad_norm": 0.80078125, + "learning_rate": 4.044948452633659e-05, + "loss": 0.7393, + "step": 2546 + }, + { + "epoch": 0.11166867282597431, + "grad_norm": 0.81640625, + "learning_rate": 4.0445786167510665e-05, + "loss": 0.6986, + "step": 2547 + }, + { + "epoch": 0.11171251604263155, + "grad_norm": 0.796875, + "learning_rate": 4.044208793490651e-05, + "loss": 0.7417, + "step": 2548 + }, + { + "epoch": 0.11175635925928878, + "grad_norm": 0.92578125, + "learning_rate": 4.0438389828531944e-05, + "loss": 0.8855, + "step": 2549 + }, + { + "epoch": 0.11180020247594602, + "grad_norm": 0.90234375, + "learning_rate": 4.0434691848394756e-05, + "loss": 0.7684, + "step": 2550 + }, + { + "epoch": 0.11184404569260326, + "grad_norm": 0.78515625, + "learning_rate": 4.0430993994502875e-05, + "loss": 0.8182, + "step": 2551 + }, + { + "epoch": 0.11188788890926049, + "grad_norm": 0.7890625, + "learning_rate": 4.04272962668641e-05, + "loss": 0.8709, + "step": 2552 + }, + { + "epoch": 0.11193173212591771, + "grad_norm": 0.80859375, + "learning_rate": 4.042359866548627e-05, + "loss": 1.0293, + "step": 2553 + }, + { + "epoch": 0.11197557534257495, + "grad_norm": 0.78515625, + "learning_rate": 4.041990119037722e-05, + "loss": 0.7778, + "step": 2554 + }, + { + "epoch": 0.11201941855923218, + "grad_norm": 0.875, + "learning_rate": 4.041620384154477e-05, + "loss": 0.8494, + "step": 2555 + }, + { + "epoch": 0.11206326177588942, + "grad_norm": 0.81640625, + "learning_rate": 4.041250661899678e-05, + "loss": 0.7382, + "step": 2556 + }, + { + "epoch": 0.11210710499254666, + "grad_norm": 0.83203125, + "learning_rate": 4.0408809522741074e-05, + "loss": 0.7451, + "step": 2557 + }, + { + "epoch": 0.11215094820920389, + "grad_norm": 0.80859375, + "learning_rate": 4.0405112552785495e-05, + "loss": 0.8125, + "step": 2558 + }, + { + "epoch": 0.11219479142586113, + "grad_norm": 0.7109375, + "learning_rate": 4.040141570913784e-05, + "loss": 0.6415, + "step": 2559 + }, + { + "epoch": 0.11223863464251835, + "grad_norm": 0.8515625, + "learning_rate": 4.0397718991806e-05, + "loss": 0.9219, + "step": 2560 + }, + { + "epoch": 0.11228247785917558, + "grad_norm": 0.89453125, + "learning_rate": 4.039402240079779e-05, + "loss": 0.8881, + "step": 2561 + }, + { + "epoch": 0.11232632107583282, + "grad_norm": 0.92578125, + "learning_rate": 4.0390325936121054e-05, + "loss": 0.9581, + "step": 2562 + }, + { + "epoch": 0.11237016429249005, + "grad_norm": 0.7265625, + "learning_rate": 4.03866295977836e-05, + "loss": 0.7074, + "step": 2563 + }, + { + "epoch": 0.11241400750914729, + "grad_norm": 0.9609375, + "learning_rate": 4.0382933385793285e-05, + "loss": 0.9325, + "step": 2564 + }, + { + "epoch": 0.11245785072580453, + "grad_norm": 0.8828125, + "learning_rate": 4.0379237300157945e-05, + "loss": 0.8207, + "step": 2565 + }, + { + "epoch": 0.11250169394246176, + "grad_norm": 0.87109375, + "learning_rate": 4.037554134088536e-05, + "loss": 0.8563, + "step": 2566 + }, + { + "epoch": 0.112545537159119, + "grad_norm": 0.75390625, + "learning_rate": 4.037184550798344e-05, + "loss": 0.632, + "step": 2567 + }, + { + "epoch": 0.11258938037577622, + "grad_norm": 0.99609375, + "learning_rate": 4.0368149801459984e-05, + "loss": 0.78, + "step": 2568 + }, + { + "epoch": 0.11263322359243345, + "grad_norm": 0.875, + "learning_rate": 4.0364454221322835e-05, + "loss": 0.8548, + "step": 2569 + }, + { + "epoch": 0.11267706680909069, + "grad_norm": 0.83203125, + "learning_rate": 4.036075876757981e-05, + "loss": 0.9648, + "step": 2570 + }, + { + "epoch": 0.11272091002574793, + "grad_norm": 0.7578125, + "learning_rate": 4.0357063440238706e-05, + "loss": 0.7011, + "step": 2571 + }, + { + "epoch": 0.11276475324240516, + "grad_norm": 0.9140625, + "learning_rate": 4.0353368239307446e-05, + "loss": 0.8611, + "step": 2572 + }, + { + "epoch": 0.1128085964590624, + "grad_norm": 0.828125, + "learning_rate": 4.03496731647938e-05, + "loss": 0.786, + "step": 2573 + }, + { + "epoch": 0.11285243967571963, + "grad_norm": 0.77734375, + "learning_rate": 4.0345978216705624e-05, + "loss": 0.7278, + "step": 2574 + }, + { + "epoch": 0.11289628289237685, + "grad_norm": 0.76953125, + "learning_rate": 4.034228339505073e-05, + "loss": 0.8419, + "step": 2575 + }, + { + "epoch": 0.11294012610903409, + "grad_norm": 0.7578125, + "learning_rate": 4.033858869983692e-05, + "loss": 0.8239, + "step": 2576 + }, + { + "epoch": 0.11298396932569132, + "grad_norm": 0.89453125, + "learning_rate": 4.03348941310721e-05, + "loss": 0.7695, + "step": 2577 + }, + { + "epoch": 0.11302781254234856, + "grad_norm": 0.88671875, + "learning_rate": 4.0331199688764063e-05, + "loss": 0.7786, + "step": 2578 + }, + { + "epoch": 0.1130716557590058, + "grad_norm": 0.87109375, + "learning_rate": 4.032750537292064e-05, + "loss": 0.7386, + "step": 2579 + }, + { + "epoch": 0.11311549897566303, + "grad_norm": 0.796875, + "learning_rate": 4.032381118354965e-05, + "loss": 0.7967, + "step": 2580 + }, + { + "epoch": 0.11315934219232027, + "grad_norm": 0.75, + "learning_rate": 4.032011712065891e-05, + "loss": 0.7314, + "step": 2581 + }, + { + "epoch": 0.1132031854089775, + "grad_norm": 0.8359375, + "learning_rate": 4.031642318425629e-05, + "loss": 0.8802, + "step": 2582 + }, + { + "epoch": 0.11324702862563472, + "grad_norm": 0.7734375, + "learning_rate": 4.031272937434961e-05, + "loss": 0.6969, + "step": 2583 + }, + { + "epoch": 0.11329087184229196, + "grad_norm": 0.828125, + "learning_rate": 4.030903569094668e-05, + "loss": 0.8554, + "step": 2584 + }, + { + "epoch": 0.1133347150589492, + "grad_norm": 1.0390625, + "learning_rate": 4.030534213405535e-05, + "loss": 0.7689, + "step": 2585 + }, + { + "epoch": 0.11337855827560643, + "grad_norm": 0.8125, + "learning_rate": 4.0301648703683394e-05, + "loss": 0.8233, + "step": 2586 + }, + { + "epoch": 0.11342240149226367, + "grad_norm": 0.9453125, + "learning_rate": 4.029795539983871e-05, + "loss": 0.8847, + "step": 2587 + }, + { + "epoch": 0.1134662447089209, + "grad_norm": 0.8359375, + "learning_rate": 4.02942622225291e-05, + "loss": 0.7634, + "step": 2588 + }, + { + "epoch": 0.11351008792557814, + "grad_norm": 0.67578125, + "learning_rate": 4.02905691717624e-05, + "loss": 0.783, + "step": 2589 + }, + { + "epoch": 0.11355393114223537, + "grad_norm": 1.453125, + "learning_rate": 4.028687624754641e-05, + "loss": 0.748, + "step": 2590 + }, + { + "epoch": 0.1135977743588926, + "grad_norm": 0.8515625, + "learning_rate": 4.028318344988894e-05, + "loss": 0.9103, + "step": 2591 + }, + { + "epoch": 0.11364161757554983, + "grad_norm": 1.0703125, + "learning_rate": 4.027949077879789e-05, + "loss": 0.9471, + "step": 2592 + }, + { + "epoch": 0.11368546079220707, + "grad_norm": 0.84765625, + "learning_rate": 4.0275798234281046e-05, + "loss": 0.9488, + "step": 2593 + }, + { + "epoch": 0.1137293040088643, + "grad_norm": 0.87109375, + "learning_rate": 4.027210581634623e-05, + "loss": 0.776, + "step": 2594 + }, + { + "epoch": 0.11377314722552154, + "grad_norm": 0.8046875, + "learning_rate": 4.026841352500128e-05, + "loss": 0.9404, + "step": 2595 + }, + { + "epoch": 0.11381699044217877, + "grad_norm": 0.8125, + "learning_rate": 4.0264721360253966e-05, + "loss": 0.843, + "step": 2596 + }, + { + "epoch": 0.11386083365883601, + "grad_norm": 0.85546875, + "learning_rate": 4.026102932211221e-05, + "loss": 0.804, + "step": 2597 + }, + { + "epoch": 0.11390467687549323, + "grad_norm": 0.8671875, + "learning_rate": 4.025733741058377e-05, + "loss": 0.7563, + "step": 2598 + }, + { + "epoch": 0.11394852009215047, + "grad_norm": 0.78125, + "learning_rate": 4.0253645625676506e-05, + "loss": 0.8333, + "step": 2599 + }, + { + "epoch": 0.1139923633088077, + "grad_norm": 0.80859375, + "learning_rate": 4.0249953967398214e-05, + "loss": 0.6974, + "step": 2600 + }, + { + "epoch": 0.11403620652546494, + "grad_norm": 0.890625, + "learning_rate": 4.024626243575674e-05, + "loss": 0.8052, + "step": 2601 + }, + { + "epoch": 0.11408004974212217, + "grad_norm": 0.7578125, + "learning_rate": 4.024257103075989e-05, + "loss": 0.7028, + "step": 2602 + }, + { + "epoch": 0.11412389295877941, + "grad_norm": 0.7890625, + "learning_rate": 4.023887975241549e-05, + "loss": 0.802, + "step": 2603 + }, + { + "epoch": 0.11416773617543664, + "grad_norm": 0.8515625, + "learning_rate": 4.0235188600731376e-05, + "loss": 0.7821, + "step": 2604 + }, + { + "epoch": 0.11421157939209388, + "grad_norm": 0.890625, + "learning_rate": 4.023149757571537e-05, + "loss": 0.8671, + "step": 2605 + }, + { + "epoch": 0.1142554226087511, + "grad_norm": 0.80859375, + "learning_rate": 4.022780667737525e-05, + "loss": 0.8579, + "step": 2606 + }, + { + "epoch": 0.11429926582540834, + "grad_norm": 0.92578125, + "learning_rate": 4.022411590571891e-05, + "loss": 0.8249, + "step": 2607 + }, + { + "epoch": 0.11434310904206557, + "grad_norm": 0.8046875, + "learning_rate": 4.0220425260754126e-05, + "loss": 0.8709, + "step": 2608 + }, + { + "epoch": 0.11438695225872281, + "grad_norm": 0.8515625, + "learning_rate": 4.021673474248875e-05, + "loss": 0.8297, + "step": 2609 + }, + { + "epoch": 0.11443079547538004, + "grad_norm": 0.81640625, + "learning_rate": 4.0213044350930585e-05, + "loss": 0.7231, + "step": 2610 + }, + { + "epoch": 0.11447463869203728, + "grad_norm": 0.76953125, + "learning_rate": 4.020935408608742e-05, + "loss": 0.7703, + "step": 2611 + }, + { + "epoch": 0.11451848190869451, + "grad_norm": 0.84375, + "learning_rate": 4.020566394796714e-05, + "loss": 0.79, + "step": 2612 + }, + { + "epoch": 0.11456232512535174, + "grad_norm": 0.84765625, + "learning_rate": 4.020197393657754e-05, + "loss": 0.8294, + "step": 2613 + }, + { + "epoch": 0.11460616834200897, + "grad_norm": 0.8125, + "learning_rate": 4.019828405192645e-05, + "loss": 0.8044, + "step": 2614 + }, + { + "epoch": 0.1146500115586662, + "grad_norm": 0.7578125, + "learning_rate": 4.019459429402167e-05, + "loss": 0.7085, + "step": 2615 + }, + { + "epoch": 0.11469385477532344, + "grad_norm": 0.78125, + "learning_rate": 4.019090466287099e-05, + "loss": 0.7616, + "step": 2616 + }, + { + "epoch": 0.11473769799198068, + "grad_norm": 0.81640625, + "learning_rate": 4.0187215158482305e-05, + "loss": 0.7381, + "step": 2617 + }, + { + "epoch": 0.11478154120863791, + "grad_norm": 0.87890625, + "learning_rate": 4.0183525780863405e-05, + "loss": 0.8727, + "step": 2618 + }, + { + "epoch": 0.11482538442529515, + "grad_norm": 0.78515625, + "learning_rate": 4.017983653002211e-05, + "loss": 0.7102, + "step": 2619 + }, + { + "epoch": 0.11486922764195238, + "grad_norm": 0.77734375, + "learning_rate": 4.0176147405966214e-05, + "loss": 0.77, + "step": 2620 + }, + { + "epoch": 0.1149130708586096, + "grad_norm": 0.81640625, + "learning_rate": 4.017245840870353e-05, + "loss": 0.8178, + "step": 2621 + }, + { + "epoch": 0.11495691407526684, + "grad_norm": 0.83984375, + "learning_rate": 4.016876953824193e-05, + "loss": 0.814, + "step": 2622 + }, + { + "epoch": 0.11500075729192408, + "grad_norm": 0.7734375, + "learning_rate": 4.016508079458922e-05, + "loss": 0.7932, + "step": 2623 + }, + { + "epoch": 0.11504460050858131, + "grad_norm": 0.76171875, + "learning_rate": 4.0161392177753174e-05, + "loss": 0.7462, + "step": 2624 + }, + { + "epoch": 0.11508844372523855, + "grad_norm": 0.81640625, + "learning_rate": 4.015770368774166e-05, + "loss": 0.804, + "step": 2625 + }, + { + "epoch": 0.11513228694189578, + "grad_norm": 0.81640625, + "learning_rate": 4.015401532456242e-05, + "loss": 0.8488, + "step": 2626 + }, + { + "epoch": 0.11517613015855302, + "grad_norm": 0.89453125, + "learning_rate": 4.015032708822337e-05, + "loss": 0.8441, + "step": 2627 + }, + { + "epoch": 0.11521997337521026, + "grad_norm": 0.890625, + "learning_rate": 4.0146638978732276e-05, + "loss": 0.8852, + "step": 2628 + }, + { + "epoch": 0.11526381659186748, + "grad_norm": 0.83984375, + "learning_rate": 4.014295099609697e-05, + "loss": 0.8921, + "step": 2629 + }, + { + "epoch": 0.11530765980852471, + "grad_norm": 0.84765625, + "learning_rate": 4.013926314032525e-05, + "loss": 0.8699, + "step": 2630 + }, + { + "epoch": 0.11535150302518195, + "grad_norm": 0.890625, + "learning_rate": 4.013557541142491e-05, + "loss": 0.7685, + "step": 2631 + }, + { + "epoch": 0.11539534624183918, + "grad_norm": 0.87890625, + "learning_rate": 4.0131887809403826e-05, + "loss": 0.7924, + "step": 2632 + }, + { + "epoch": 0.11543918945849642, + "grad_norm": 1.234375, + "learning_rate": 4.012820033426978e-05, + "loss": 0.7991, + "step": 2633 + }, + { + "epoch": 0.11548303267515365, + "grad_norm": 0.875, + "learning_rate": 4.012451298603059e-05, + "loss": 0.8209, + "step": 2634 + }, + { + "epoch": 0.11552687589181089, + "grad_norm": 0.8359375, + "learning_rate": 4.012082576469408e-05, + "loss": 0.7836, + "step": 2635 + }, + { + "epoch": 0.11557071910846811, + "grad_norm": 0.8984375, + "learning_rate": 4.0117138670268027e-05, + "loss": 0.8681, + "step": 2636 + }, + { + "epoch": 0.11561456232512535, + "grad_norm": 0.7890625, + "learning_rate": 4.0113451702760306e-05, + "loss": 0.752, + "step": 2637 + }, + { + "epoch": 0.11565840554178258, + "grad_norm": 0.88671875, + "learning_rate": 4.01097648621787e-05, + "loss": 0.8106, + "step": 2638 + }, + { + "epoch": 0.11570224875843982, + "grad_norm": 0.9375, + "learning_rate": 4.010607814853102e-05, + "loss": 0.725, + "step": 2639 + }, + { + "epoch": 0.11574609197509705, + "grad_norm": 0.78125, + "learning_rate": 4.0102391561825095e-05, + "loss": 0.7145, + "step": 2640 + }, + { + "epoch": 0.11578993519175429, + "grad_norm": 0.8203125, + "learning_rate": 4.0098705102068713e-05, + "loss": 0.9136, + "step": 2641 + }, + { + "epoch": 0.11583377840841153, + "grad_norm": 0.78125, + "learning_rate": 4.009501876926971e-05, + "loss": 0.8597, + "step": 2642 + }, + { + "epoch": 0.11587762162506876, + "grad_norm": 0.8125, + "learning_rate": 4.009133256343589e-05, + "loss": 0.7248, + "step": 2643 + }, + { + "epoch": 0.11592146484172598, + "grad_norm": 0.83203125, + "learning_rate": 4.008764648457507e-05, + "loss": 0.8211, + "step": 2644 + }, + { + "epoch": 0.11596530805838322, + "grad_norm": 0.7734375, + "learning_rate": 4.008396053269502e-05, + "loss": 0.7657, + "step": 2645 + }, + { + "epoch": 0.11600915127504045, + "grad_norm": 0.76171875, + "learning_rate": 4.008027470780362e-05, + "loss": 0.7761, + "step": 2646 + }, + { + "epoch": 0.11605299449169769, + "grad_norm": 0.875, + "learning_rate": 4.007658900990866e-05, + "loss": 0.8507, + "step": 2647 + }, + { + "epoch": 0.11609683770835492, + "grad_norm": 0.8125, + "learning_rate": 4.007290343901795e-05, + "loss": 0.803, + "step": 2648 + }, + { + "epoch": 0.11614068092501216, + "grad_norm": 0.8125, + "learning_rate": 4.0069217995139286e-05, + "loss": 0.8243, + "step": 2649 + }, + { + "epoch": 0.1161845241416694, + "grad_norm": 0.75390625, + "learning_rate": 4.0065532678280496e-05, + "loss": 0.6211, + "step": 2650 + }, + { + "epoch": 0.11622836735832662, + "grad_norm": 0.7734375, + "learning_rate": 4.006184748844935e-05, + "loss": 0.8406, + "step": 2651 + }, + { + "epoch": 0.11627221057498385, + "grad_norm": 0.81640625, + "learning_rate": 4.005816242565371e-05, + "loss": 0.8185, + "step": 2652 + }, + { + "epoch": 0.11631605379164109, + "grad_norm": 0.76171875, + "learning_rate": 4.0054477489901384e-05, + "loss": 0.7647, + "step": 2653 + }, + { + "epoch": 0.11635989700829832, + "grad_norm": 0.75, + "learning_rate": 4.0050792681200164e-05, + "loss": 0.8668, + "step": 2654 + }, + { + "epoch": 0.11640374022495556, + "grad_norm": 0.8203125, + "learning_rate": 4.004710799955786e-05, + "loss": 0.7377, + "step": 2655 + }, + { + "epoch": 0.1164475834416128, + "grad_norm": 0.96484375, + "learning_rate": 4.004342344498225e-05, + "loss": 0.8741, + "step": 2656 + }, + { + "epoch": 0.11649142665827003, + "grad_norm": 0.78515625, + "learning_rate": 4.003973901748122e-05, + "loss": 0.7533, + "step": 2657 + }, + { + "epoch": 0.11653526987492727, + "grad_norm": 0.8828125, + "learning_rate": 4.003605471706253e-05, + "loss": 0.7668, + "step": 2658 + }, + { + "epoch": 0.11657911309158449, + "grad_norm": 0.765625, + "learning_rate": 4.003237054373399e-05, + "loss": 0.7418, + "step": 2659 + }, + { + "epoch": 0.11662295630824172, + "grad_norm": 0.796875, + "learning_rate": 4.002868649750342e-05, + "loss": 0.8943, + "step": 2660 + }, + { + "epoch": 0.11666679952489896, + "grad_norm": 0.7734375, + "learning_rate": 4.002500257837858e-05, + "loss": 0.8599, + "step": 2661 + }, + { + "epoch": 0.1167106427415562, + "grad_norm": 0.7421875, + "learning_rate": 4.0021318786367355e-05, + "loss": 0.6628, + "step": 2662 + }, + { + "epoch": 0.11675448595821343, + "grad_norm": 0.8125, + "learning_rate": 4.0017635121477525e-05, + "loss": 0.8164, + "step": 2663 + }, + { + "epoch": 0.11679832917487067, + "grad_norm": 0.83984375, + "learning_rate": 4.001395158371688e-05, + "loss": 0.8093, + "step": 2664 + }, + { + "epoch": 0.1168421723915279, + "grad_norm": 0.92578125, + "learning_rate": 4.0010268173093234e-05, + "loss": 0.8154, + "step": 2665 + }, + { + "epoch": 0.11688601560818512, + "grad_norm": 0.84375, + "learning_rate": 4.000658488961437e-05, + "loss": 0.7943, + "step": 2666 + }, + { + "epoch": 0.11692985882484236, + "grad_norm": 0.81640625, + "learning_rate": 4.000290173328816e-05, + "loss": 0.8178, + "step": 2667 + }, + { + "epoch": 0.1169737020414996, + "grad_norm": 0.81640625, + "learning_rate": 3.999921870412235e-05, + "loss": 0.7962, + "step": 2668 + }, + { + "epoch": 0.11701754525815683, + "grad_norm": 0.7734375, + "learning_rate": 3.999553580212478e-05, + "loss": 0.8249, + "step": 2669 + }, + { + "epoch": 0.11706138847481407, + "grad_norm": 0.8359375, + "learning_rate": 3.999185302730324e-05, + "loss": 0.7292, + "step": 2670 + }, + { + "epoch": 0.1171052316914713, + "grad_norm": 0.765625, + "learning_rate": 3.99881703796655e-05, + "loss": 0.763, + "step": 2671 + }, + { + "epoch": 0.11714907490812854, + "grad_norm": 0.875, + "learning_rate": 3.998448785921944e-05, + "loss": 0.8595, + "step": 2672 + }, + { + "epoch": 0.11719291812478577, + "grad_norm": 0.796875, + "learning_rate": 3.998080546597282e-05, + "loss": 0.8117, + "step": 2673 + }, + { + "epoch": 0.117236761341443, + "grad_norm": 0.88671875, + "learning_rate": 3.997712319993348e-05, + "loss": 0.9969, + "step": 2674 + }, + { + "epoch": 0.11728060455810023, + "grad_norm": 0.796875, + "learning_rate": 3.9973441061109176e-05, + "loss": 0.7981, + "step": 2675 + }, + { + "epoch": 0.11732444777475746, + "grad_norm": 0.8984375, + "learning_rate": 3.996975904950769e-05, + "loss": 0.8365, + "step": 2676 + }, + { + "epoch": 0.1173682909914147, + "grad_norm": 0.8203125, + "learning_rate": 3.9966077165136914e-05, + "loss": 0.7795, + "step": 2677 + }, + { + "epoch": 0.11741213420807194, + "grad_norm": 0.76171875, + "learning_rate": 3.9962395408004605e-05, + "loss": 0.7934, + "step": 2678 + }, + { + "epoch": 0.11745597742472917, + "grad_norm": 0.78125, + "learning_rate": 3.995871377811856e-05, + "loss": 0.8574, + "step": 2679 + }, + { + "epoch": 0.11749982064138641, + "grad_norm": 0.80859375, + "learning_rate": 3.9955032275486604e-05, + "loss": 0.8256, + "step": 2680 + }, + { + "epoch": 0.11754366385804364, + "grad_norm": 0.75390625, + "learning_rate": 3.995135090011651e-05, + "loss": 0.6386, + "step": 2681 + }, + { + "epoch": 0.11758750707470086, + "grad_norm": 0.7890625, + "learning_rate": 3.994766965201607e-05, + "loss": 0.8239, + "step": 2682 + }, + { + "epoch": 0.1176313502913581, + "grad_norm": 0.82421875, + "learning_rate": 3.994398853119314e-05, + "loss": 0.839, + "step": 2683 + }, + { + "epoch": 0.11767519350801534, + "grad_norm": 0.73828125, + "learning_rate": 3.9940307537655505e-05, + "loss": 0.7379, + "step": 2684 + }, + { + "epoch": 0.11771903672467257, + "grad_norm": 0.85546875, + "learning_rate": 3.993662667141095e-05, + "loss": 0.9116, + "step": 2685 + }, + { + "epoch": 0.1177628799413298, + "grad_norm": 0.79296875, + "learning_rate": 3.993294593246728e-05, + "loss": 0.7125, + "step": 2686 + }, + { + "epoch": 0.11780672315798704, + "grad_norm": 1.0859375, + "learning_rate": 3.99292653208323e-05, + "loss": 0.823, + "step": 2687 + }, + { + "epoch": 0.11785056637464428, + "grad_norm": 0.82421875, + "learning_rate": 3.992558483651381e-05, + "loss": 0.897, + "step": 2688 + }, + { + "epoch": 0.1178944095913015, + "grad_norm": 0.90234375, + "learning_rate": 3.99219044795196e-05, + "loss": 0.8367, + "step": 2689 + }, + { + "epoch": 0.11793825280795874, + "grad_norm": 1.40625, + "learning_rate": 3.9918224249857486e-05, + "loss": 0.8155, + "step": 2690 + }, + { + "epoch": 0.11798209602461597, + "grad_norm": 0.85546875, + "learning_rate": 3.991454414753523e-05, + "loss": 0.94, + "step": 2691 + }, + { + "epoch": 0.1180259392412732, + "grad_norm": 0.8046875, + "learning_rate": 3.991086417256069e-05, + "loss": 0.8386, + "step": 2692 + }, + { + "epoch": 0.11806978245793044, + "grad_norm": 0.83984375, + "learning_rate": 3.990718432494164e-05, + "loss": 0.8019, + "step": 2693 + }, + { + "epoch": 0.11811362567458768, + "grad_norm": 0.890625, + "learning_rate": 3.9903504604685883e-05, + "loss": 0.8028, + "step": 2694 + }, + { + "epoch": 0.11815746889124491, + "grad_norm": 0.8046875, + "learning_rate": 3.9899825011801204e-05, + "loss": 0.7845, + "step": 2695 + }, + { + "epoch": 0.11820131210790215, + "grad_norm": 0.84375, + "learning_rate": 3.9896145546295426e-05, + "loss": 0.8765, + "step": 2696 + }, + { + "epoch": 0.11824515532455937, + "grad_norm": 0.828125, + "learning_rate": 3.989246620817628e-05, + "loss": 0.7561, + "step": 2697 + }, + { + "epoch": 0.1182889985412166, + "grad_norm": 0.7578125, + "learning_rate": 3.9888786997451665e-05, + "loss": 0.7505, + "step": 2698 + }, + { + "epoch": 0.11833284175787384, + "grad_norm": 0.8515625, + "learning_rate": 3.988510791412933e-05, + "loss": 0.7739, + "step": 2699 + }, + { + "epoch": 0.11837668497453108, + "grad_norm": 0.79296875, + "learning_rate": 3.988142895821707e-05, + "loss": 0.9292, + "step": 2700 + }, + { + "epoch": 0.11842052819118831, + "grad_norm": 0.8046875, + "learning_rate": 3.987775012972268e-05, + "loss": 0.7519, + "step": 2701 + }, + { + "epoch": 0.11846437140784555, + "grad_norm": 0.84765625, + "learning_rate": 3.987407142865394e-05, + "loss": 0.8383, + "step": 2702 + }, + { + "epoch": 0.11850821462450278, + "grad_norm": 0.828125, + "learning_rate": 3.987039285501869e-05, + "loss": 0.9042, + "step": 2703 + }, + { + "epoch": 0.11855205784116, + "grad_norm": 0.8359375, + "learning_rate": 3.98667144088247e-05, + "loss": 0.7429, + "step": 2704 + }, + { + "epoch": 0.11859590105781724, + "grad_norm": 0.75, + "learning_rate": 3.9863036090079785e-05, + "loss": 0.7096, + "step": 2705 + }, + { + "epoch": 0.11863974427447448, + "grad_norm": 0.7890625, + "learning_rate": 3.9859357898791725e-05, + "loss": 0.7176, + "step": 2706 + }, + { + "epoch": 0.11868358749113171, + "grad_norm": 0.8671875, + "learning_rate": 3.9855679834968285e-05, + "loss": 0.722, + "step": 2707 + }, + { + "epoch": 0.11872743070778895, + "grad_norm": 1.1171875, + "learning_rate": 3.985200189861733e-05, + "loss": 0.7175, + "step": 2708 + }, + { + "epoch": 0.11877127392444618, + "grad_norm": 0.80859375, + "learning_rate": 3.984832408974661e-05, + "loss": 0.6558, + "step": 2709 + }, + { + "epoch": 0.11881511714110342, + "grad_norm": 0.77734375, + "learning_rate": 3.984464640836394e-05, + "loss": 0.8046, + "step": 2710 + }, + { + "epoch": 0.11885896035776065, + "grad_norm": 0.78515625, + "learning_rate": 3.98409688544771e-05, + "loss": 0.8327, + "step": 2711 + }, + { + "epoch": 0.11890280357441788, + "grad_norm": 4.65625, + "learning_rate": 3.983729142809385e-05, + "loss": 0.9329, + "step": 2712 + }, + { + "epoch": 0.11894664679107511, + "grad_norm": 0.9375, + "learning_rate": 3.983361412922206e-05, + "loss": 0.9665, + "step": 2713 + }, + { + "epoch": 0.11899049000773235, + "grad_norm": 0.78515625, + "learning_rate": 3.982993695786949e-05, + "loss": 0.8561, + "step": 2714 + }, + { + "epoch": 0.11903433322438958, + "grad_norm": 0.81640625, + "learning_rate": 3.9826259914043916e-05, + "loss": 0.8494, + "step": 2715 + }, + { + "epoch": 0.11907817644104682, + "grad_norm": 0.7578125, + "learning_rate": 3.9822582997753154e-05, + "loss": 0.7349, + "step": 2716 + }, + { + "epoch": 0.11912201965770405, + "grad_norm": 0.81640625, + "learning_rate": 3.9818906209004955e-05, + "loss": 0.8761, + "step": 2717 + }, + { + "epoch": 0.11916586287436129, + "grad_norm": 0.7734375, + "learning_rate": 3.981522954780718e-05, + "loss": 0.766, + "step": 2718 + }, + { + "epoch": 0.11920970609101852, + "grad_norm": 0.74609375, + "learning_rate": 3.9811553014167576e-05, + "loss": 0.7631, + "step": 2719 + }, + { + "epoch": 0.11925354930767575, + "grad_norm": 0.734375, + "learning_rate": 3.9807876608093955e-05, + "loss": 0.7463, + "step": 2720 + }, + { + "epoch": 0.11929739252433298, + "grad_norm": 0.78125, + "learning_rate": 3.980420032959409e-05, + "loss": 0.8861, + "step": 2721 + }, + { + "epoch": 0.11934123574099022, + "grad_norm": 0.82421875, + "learning_rate": 3.980052417867576e-05, + "loss": 0.7985, + "step": 2722 + }, + { + "epoch": 0.11938507895764745, + "grad_norm": 0.80078125, + "learning_rate": 3.97968481553468e-05, + "loss": 0.8234, + "step": 2723 + }, + { + "epoch": 0.11942892217430469, + "grad_norm": 1.09375, + "learning_rate": 3.979317225961499e-05, + "loss": 0.8723, + "step": 2724 + }, + { + "epoch": 0.11947276539096192, + "grad_norm": 0.8125, + "learning_rate": 3.97894964914881e-05, + "loss": 0.8419, + "step": 2725 + }, + { + "epoch": 0.11951660860761916, + "grad_norm": 0.87109375, + "learning_rate": 3.978582085097393e-05, + "loss": 0.9186, + "step": 2726 + }, + { + "epoch": 0.11956045182427638, + "grad_norm": 0.890625, + "learning_rate": 3.978214533808028e-05, + "loss": 0.8852, + "step": 2727 + }, + { + "epoch": 0.11960429504093362, + "grad_norm": 0.86328125, + "learning_rate": 3.9778469952814935e-05, + "loss": 0.8774, + "step": 2728 + }, + { + "epoch": 0.11964813825759085, + "grad_norm": 0.7734375, + "learning_rate": 3.977479469518567e-05, + "loss": 0.8538, + "step": 2729 + }, + { + "epoch": 0.11969198147424809, + "grad_norm": 0.8125, + "learning_rate": 3.977111956520025e-05, + "loss": 0.8521, + "step": 2730 + }, + { + "epoch": 0.11973582469090532, + "grad_norm": 0.77734375, + "learning_rate": 3.976744456286654e-05, + "loss": 0.7605, + "step": 2731 + }, + { + "epoch": 0.11977966790756256, + "grad_norm": 0.8046875, + "learning_rate": 3.976376968819229e-05, + "loss": 0.9239, + "step": 2732 + }, + { + "epoch": 0.1198235111242198, + "grad_norm": 0.828125, + "learning_rate": 3.976009494118528e-05, + "loss": 0.8986, + "step": 2733 + }, + { + "epoch": 0.11986735434087703, + "grad_norm": 0.80859375, + "learning_rate": 3.9756420321853315e-05, + "loss": 0.7427, + "step": 2734 + }, + { + "epoch": 0.11991119755753425, + "grad_norm": 0.75390625, + "learning_rate": 3.9752745830204164e-05, + "loss": 0.832, + "step": 2735 + }, + { + "epoch": 0.11995504077419149, + "grad_norm": 0.8828125, + "learning_rate": 3.9749071466245636e-05, + "loss": 0.8331, + "step": 2736 + }, + { + "epoch": 0.11999888399084872, + "grad_norm": 0.71875, + "learning_rate": 3.9745397229985457e-05, + "loss": 0.7183, + "step": 2737 + }, + { + "epoch": 0.12004272720750596, + "grad_norm": 0.7890625, + "learning_rate": 3.9741723121431506e-05, + "loss": 0.7795, + "step": 2738 + }, + { + "epoch": 0.1200865704241632, + "grad_norm": 0.82421875, + "learning_rate": 3.973804914059154e-05, + "loss": 0.7902, + "step": 2739 + }, + { + "epoch": 0.12013041364082043, + "grad_norm": 0.83203125, + "learning_rate": 3.973437528747332e-05, + "loss": 0.782, + "step": 2740 + }, + { + "epoch": 0.12017425685747767, + "grad_norm": 0.8515625, + "learning_rate": 3.973070156208465e-05, + "loss": 0.7374, + "step": 2741 + }, + { + "epoch": 0.12021810007413489, + "grad_norm": 0.7421875, + "learning_rate": 3.972702796443328e-05, + "loss": 0.8727, + "step": 2742 + }, + { + "epoch": 0.12026194329079212, + "grad_norm": 0.8984375, + "learning_rate": 3.972335449452707e-05, + "loss": 0.9825, + "step": 2743 + }, + { + "epoch": 0.12030578650744936, + "grad_norm": 0.94921875, + "learning_rate": 3.971968115237375e-05, + "loss": 0.8108, + "step": 2744 + }, + { + "epoch": 0.1203496297241066, + "grad_norm": 0.875, + "learning_rate": 3.9716007937981134e-05, + "loss": 0.8966, + "step": 2745 + }, + { + "epoch": 0.12039347294076383, + "grad_norm": 0.953125, + "learning_rate": 3.971233485135699e-05, + "loss": 0.9047, + "step": 2746 + }, + { + "epoch": 0.12043731615742106, + "grad_norm": 0.8046875, + "learning_rate": 3.970866189250907e-05, + "loss": 0.9121, + "step": 2747 + }, + { + "epoch": 0.1204811593740783, + "grad_norm": 0.83984375, + "learning_rate": 3.970498906144523e-05, + "loss": 0.7955, + "step": 2748 + }, + { + "epoch": 0.12052500259073554, + "grad_norm": 0.9609375, + "learning_rate": 3.9701316358173224e-05, + "loss": 0.8752, + "step": 2749 + }, + { + "epoch": 0.12056884580739276, + "grad_norm": 0.875, + "learning_rate": 3.969764378270083e-05, + "loss": 0.8925, + "step": 2750 + }, + { + "epoch": 0.12061268902405, + "grad_norm": 0.8125, + "learning_rate": 3.969397133503583e-05, + "loss": 0.8837, + "step": 2751 + }, + { + "epoch": 0.12065653224070723, + "grad_norm": 0.796875, + "learning_rate": 3.969029901518597e-05, + "loss": 0.7153, + "step": 2752 + }, + { + "epoch": 0.12070037545736446, + "grad_norm": 0.76953125, + "learning_rate": 3.9686626823159125e-05, + "loss": 0.8274, + "step": 2753 + }, + { + "epoch": 0.1207442186740217, + "grad_norm": 0.796875, + "learning_rate": 3.968295475896301e-05, + "loss": 0.7146, + "step": 2754 + }, + { + "epoch": 0.12078806189067894, + "grad_norm": 0.82421875, + "learning_rate": 3.9679282822605434e-05, + "loss": 0.7417, + "step": 2755 + }, + { + "epoch": 0.12083190510733617, + "grad_norm": 0.90625, + "learning_rate": 3.9675611014094174e-05, + "loss": 0.8635, + "step": 2756 + }, + { + "epoch": 0.12087574832399339, + "grad_norm": 0.90234375, + "learning_rate": 3.967193933343697e-05, + "loss": 0.9173, + "step": 2757 + }, + { + "epoch": 0.12091959154065063, + "grad_norm": 0.82421875, + "learning_rate": 3.9668267780641675e-05, + "loss": 0.8899, + "step": 2758 + }, + { + "epoch": 0.12096343475730786, + "grad_norm": 0.85546875, + "learning_rate": 3.9664596355716035e-05, + "loss": 0.8165, + "step": 2759 + }, + { + "epoch": 0.1210072779739651, + "grad_norm": 0.87109375, + "learning_rate": 3.966092505866784e-05, + "loss": 0.8743, + "step": 2760 + }, + { + "epoch": 0.12105112119062233, + "grad_norm": 0.80859375, + "learning_rate": 3.965725388950486e-05, + "loss": 0.8369, + "step": 2761 + }, + { + "epoch": 0.12109496440727957, + "grad_norm": 0.92578125, + "learning_rate": 3.9653582848234846e-05, + "loss": 0.6851, + "step": 2762 + }, + { + "epoch": 0.1211388076239368, + "grad_norm": 0.80859375, + "learning_rate": 3.964991193486566e-05, + "loss": 0.7634, + "step": 2763 + }, + { + "epoch": 0.12118265084059404, + "grad_norm": 0.78125, + "learning_rate": 3.964624114940503e-05, + "loss": 0.8747, + "step": 2764 + }, + { + "epoch": 0.12122649405725126, + "grad_norm": 0.8046875, + "learning_rate": 3.964257049186074e-05, + "loss": 0.7041, + "step": 2765 + }, + { + "epoch": 0.1212703372739085, + "grad_norm": 0.9375, + "learning_rate": 3.963889996224057e-05, + "loss": 0.8328, + "step": 2766 + }, + { + "epoch": 0.12131418049056573, + "grad_norm": 0.92578125, + "learning_rate": 3.963522956055228e-05, + "loss": 0.9409, + "step": 2767 + }, + { + "epoch": 0.12135802370722297, + "grad_norm": 0.78125, + "learning_rate": 3.9631559286803685e-05, + "loss": 0.7272, + "step": 2768 + }, + { + "epoch": 0.1214018669238802, + "grad_norm": 0.82421875, + "learning_rate": 3.962788914100257e-05, + "loss": 0.8174, + "step": 2769 + }, + { + "epoch": 0.12144571014053744, + "grad_norm": 0.78515625, + "learning_rate": 3.962421912315668e-05, + "loss": 0.7102, + "step": 2770 + }, + { + "epoch": 0.12148955335719468, + "grad_norm": 0.7890625, + "learning_rate": 3.962054923327381e-05, + "loss": 0.7966, + "step": 2771 + }, + { + "epoch": 0.12153339657385191, + "grad_norm": 0.828125, + "learning_rate": 3.961687947136173e-05, + "loss": 0.8656, + "step": 2772 + }, + { + "epoch": 0.12157723979050913, + "grad_norm": 0.8828125, + "learning_rate": 3.961320983742823e-05, + "loss": 0.8814, + "step": 2773 + }, + { + "epoch": 0.12162108300716637, + "grad_norm": 0.79296875, + "learning_rate": 3.960954033148109e-05, + "loss": 0.8598, + "step": 2774 + }, + { + "epoch": 0.1216649262238236, + "grad_norm": 0.8515625, + "learning_rate": 3.960587095352806e-05, + "loss": 0.7735, + "step": 2775 + }, + { + "epoch": 0.12170876944048084, + "grad_norm": 0.83984375, + "learning_rate": 3.9602201703576934e-05, + "loss": 0.913, + "step": 2776 + }, + { + "epoch": 0.12175261265713808, + "grad_norm": 0.81640625, + "learning_rate": 3.959853258163546e-05, + "loss": 0.844, + "step": 2777 + }, + { + "epoch": 0.12179645587379531, + "grad_norm": 0.83984375, + "learning_rate": 3.959486358771148e-05, + "loss": 0.8463, + "step": 2778 + }, + { + "epoch": 0.12184029909045255, + "grad_norm": 0.8203125, + "learning_rate": 3.959119472181274e-05, + "loss": 0.6804, + "step": 2779 + }, + { + "epoch": 0.12188414230710977, + "grad_norm": 0.83203125, + "learning_rate": 3.9587525983947e-05, + "loss": 0.7673, + "step": 2780 + }, + { + "epoch": 0.121927985523767, + "grad_norm": 0.8046875, + "learning_rate": 3.958385737412205e-05, + "loss": 0.6806, + "step": 2781 + }, + { + "epoch": 0.12197182874042424, + "grad_norm": 0.79296875, + "learning_rate": 3.958018889234562e-05, + "loss": 0.8099, + "step": 2782 + }, + { + "epoch": 0.12201567195708148, + "grad_norm": 0.86328125, + "learning_rate": 3.957652053862557e-05, + "loss": 0.8513, + "step": 2783 + }, + { + "epoch": 0.12205951517373871, + "grad_norm": 1.3671875, + "learning_rate": 3.9572852312969625e-05, + "loss": 0.7201, + "step": 2784 + }, + { + "epoch": 0.12210335839039595, + "grad_norm": 0.828125, + "learning_rate": 3.956918421538557e-05, + "loss": 0.8387, + "step": 2785 + }, + { + "epoch": 0.12214720160705318, + "grad_norm": 0.79296875, + "learning_rate": 3.956551624588117e-05, + "loss": 0.7802, + "step": 2786 + }, + { + "epoch": 0.12219104482371042, + "grad_norm": 0.8203125, + "learning_rate": 3.956184840446418e-05, + "loss": 0.783, + "step": 2787 + }, + { + "epoch": 0.12223488804036764, + "grad_norm": 0.8125, + "learning_rate": 3.9558180691142424e-05, + "loss": 0.8003, + "step": 2788 + }, + { + "epoch": 0.12227873125702488, + "grad_norm": 0.83203125, + "learning_rate": 3.955451310592364e-05, + "loss": 0.851, + "step": 2789 + }, + { + "epoch": 0.12232257447368211, + "grad_norm": 0.84375, + "learning_rate": 3.955084564881563e-05, + "loss": 0.834, + "step": 2790 + }, + { + "epoch": 0.12236641769033935, + "grad_norm": 0.78515625, + "learning_rate": 3.954717831982614e-05, + "loss": 0.6217, + "step": 2791 + }, + { + "epoch": 0.12241026090699658, + "grad_norm": 0.79296875, + "learning_rate": 3.9543511118962906e-05, + "loss": 0.743, + "step": 2792 + }, + { + "epoch": 0.12245410412365382, + "grad_norm": 0.76953125, + "learning_rate": 3.9539844046233787e-05, + "loss": 0.7196, + "step": 2793 + }, + { + "epoch": 0.12249794734031105, + "grad_norm": 1.0234375, + "learning_rate": 3.953617710164651e-05, + "loss": 0.8497, + "step": 2794 + }, + { + "epoch": 0.12254179055696827, + "grad_norm": 0.734375, + "learning_rate": 3.953251028520886e-05, + "loss": 0.7341, + "step": 2795 + }, + { + "epoch": 0.12258563377362551, + "grad_norm": 0.828125, + "learning_rate": 3.952884359692859e-05, + "loss": 0.7668, + "step": 2796 + }, + { + "epoch": 0.12262947699028275, + "grad_norm": 0.8125, + "learning_rate": 3.9525177036813444e-05, + "loss": 0.9557, + "step": 2797 + }, + { + "epoch": 0.12267332020693998, + "grad_norm": 0.76953125, + "learning_rate": 3.952151060487126e-05, + "loss": 0.8188, + "step": 2798 + }, + { + "epoch": 0.12271716342359722, + "grad_norm": 0.81640625, + "learning_rate": 3.951784430110979e-05, + "loss": 0.8055, + "step": 2799 + }, + { + "epoch": 0.12276100664025445, + "grad_norm": 0.73828125, + "learning_rate": 3.951417812553678e-05, + "loss": 0.8166, + "step": 2800 + }, + { + "epoch": 0.12280484985691169, + "grad_norm": 0.81640625, + "learning_rate": 3.951051207816002e-05, + "loss": 0.8183, + "step": 2801 + }, + { + "epoch": 0.12284869307356892, + "grad_norm": 0.90625, + "learning_rate": 3.950684615898723e-05, + "loss": 0.8196, + "step": 2802 + }, + { + "epoch": 0.12289253629022615, + "grad_norm": 0.81640625, + "learning_rate": 3.950318036802626e-05, + "loss": 0.7379, + "step": 2803 + }, + { + "epoch": 0.12293637950688338, + "grad_norm": 0.9140625, + "learning_rate": 3.9499514705284845e-05, + "loss": 0.9285, + "step": 2804 + }, + { + "epoch": 0.12298022272354062, + "grad_norm": 0.78515625, + "learning_rate": 3.949584917077075e-05, + "loss": 0.855, + "step": 2805 + }, + { + "epoch": 0.12302406594019785, + "grad_norm": 0.828125, + "learning_rate": 3.949218376449174e-05, + "loss": 0.901, + "step": 2806 + }, + { + "epoch": 0.12306790915685509, + "grad_norm": 0.8046875, + "learning_rate": 3.9488518486455564e-05, + "loss": 0.8009, + "step": 2807 + }, + { + "epoch": 0.12311175237351232, + "grad_norm": 0.8203125, + "learning_rate": 3.948485333667003e-05, + "loss": 0.8263, + "step": 2808 + }, + { + "epoch": 0.12315559559016956, + "grad_norm": 0.859375, + "learning_rate": 3.94811883151429e-05, + "loss": 0.8525, + "step": 2809 + }, + { + "epoch": 0.1231994388068268, + "grad_norm": 0.79296875, + "learning_rate": 3.9477523421881936e-05, + "loss": 0.9164, + "step": 2810 + }, + { + "epoch": 0.12324328202348402, + "grad_norm": 0.81640625, + "learning_rate": 3.94738586568949e-05, + "loss": 0.8031, + "step": 2811 + }, + { + "epoch": 0.12328712524014125, + "grad_norm": 0.85546875, + "learning_rate": 3.947019402018956e-05, + "loss": 0.7309, + "step": 2812 + }, + { + "epoch": 0.12333096845679849, + "grad_norm": 0.859375, + "learning_rate": 3.9466529511773687e-05, + "loss": 0.8099, + "step": 2813 + }, + { + "epoch": 0.12337481167345572, + "grad_norm": 0.8515625, + "learning_rate": 3.9462865131655045e-05, + "loss": 0.9141, + "step": 2814 + }, + { + "epoch": 0.12341865489011296, + "grad_norm": 0.7734375, + "learning_rate": 3.9459200879841396e-05, + "loss": 0.6132, + "step": 2815 + }, + { + "epoch": 0.1234624981067702, + "grad_norm": 0.86328125, + "learning_rate": 3.9455536756340486e-05, + "loss": 0.8299, + "step": 2816 + }, + { + "epoch": 0.12350634132342743, + "grad_norm": 0.84375, + "learning_rate": 3.945187276116013e-05, + "loss": 0.9427, + "step": 2817 + }, + { + "epoch": 0.12355018454008465, + "grad_norm": 0.8984375, + "learning_rate": 3.944820889430806e-05, + "loss": 0.8893, + "step": 2818 + }, + { + "epoch": 0.12359402775674189, + "grad_norm": 0.7890625, + "learning_rate": 3.944454515579207e-05, + "loss": 0.8165, + "step": 2819 + }, + { + "epoch": 0.12363787097339912, + "grad_norm": 0.87109375, + "learning_rate": 3.944088154561989e-05, + "loss": 0.837, + "step": 2820 + }, + { + "epoch": 0.12368171419005636, + "grad_norm": 0.80078125, + "learning_rate": 3.9437218063799306e-05, + "loss": 0.8316, + "step": 2821 + }, + { + "epoch": 0.1237255574067136, + "grad_norm": 0.7890625, + "learning_rate": 3.943355471033804e-05, + "loss": 0.888, + "step": 2822 + }, + { + "epoch": 0.12376940062337083, + "grad_norm": 0.83984375, + "learning_rate": 3.942989148524393e-05, + "loss": 0.7541, + "step": 2823 + }, + { + "epoch": 0.12381324384002806, + "grad_norm": 0.91015625, + "learning_rate": 3.94262283885247e-05, + "loss": 0.9773, + "step": 2824 + }, + { + "epoch": 0.1238570870566853, + "grad_norm": 0.84765625, + "learning_rate": 3.942256542018813e-05, + "loss": 0.7498, + "step": 2825 + }, + { + "epoch": 0.12390093027334252, + "grad_norm": 0.79296875, + "learning_rate": 3.941890258024196e-05, + "loss": 0.7859, + "step": 2826 + }, + { + "epoch": 0.12394477348999976, + "grad_norm": 0.7890625, + "learning_rate": 3.941523986869392e-05, + "loss": 0.8247, + "step": 2827 + }, + { + "epoch": 0.12398861670665699, + "grad_norm": 0.83203125, + "learning_rate": 3.941157728555186e-05, + "loss": 0.7599, + "step": 2828 + }, + { + "epoch": 0.12403245992331423, + "grad_norm": 0.83984375, + "learning_rate": 3.94079148308235e-05, + "loss": 0.7405, + "step": 2829 + }, + { + "epoch": 0.12407630313997146, + "grad_norm": 0.828125, + "learning_rate": 3.94042525045166e-05, + "loss": 0.8486, + "step": 2830 + }, + { + "epoch": 0.1241201463566287, + "grad_norm": 0.79296875, + "learning_rate": 3.940059030663892e-05, + "loss": 0.8302, + "step": 2831 + }, + { + "epoch": 0.12416398957328593, + "grad_norm": 0.78515625, + "learning_rate": 3.939692823719819e-05, + "loss": 0.8748, + "step": 2832 + }, + { + "epoch": 0.12420783278994316, + "grad_norm": 0.984375, + "learning_rate": 3.9393266296202246e-05, + "loss": 0.8797, + "step": 2833 + }, + { + "epoch": 0.12425167600660039, + "grad_norm": 0.796875, + "learning_rate": 3.938960448365881e-05, + "loss": 0.8307, + "step": 2834 + }, + { + "epoch": 0.12429551922325763, + "grad_norm": 0.765625, + "learning_rate": 3.9385942799575636e-05, + "loss": 0.8283, + "step": 2835 + }, + { + "epoch": 0.12433936243991486, + "grad_norm": 0.84765625, + "learning_rate": 3.9382281243960504e-05, + "loss": 0.688, + "step": 2836 + }, + { + "epoch": 0.1243832056565721, + "grad_norm": 0.859375, + "learning_rate": 3.937861981682115e-05, + "loss": 0.8341, + "step": 2837 + }, + { + "epoch": 0.12442704887322933, + "grad_norm": 0.82421875, + "learning_rate": 3.937495851816532e-05, + "loss": 0.7819, + "step": 2838 + }, + { + "epoch": 0.12447089208988657, + "grad_norm": 0.9375, + "learning_rate": 3.9371297348000836e-05, + "loss": 0.8335, + "step": 2839 + }, + { + "epoch": 0.1245147353065438, + "grad_norm": 0.875, + "learning_rate": 3.936763630633542e-05, + "loss": 0.9129, + "step": 2840 + }, + { + "epoch": 0.12455857852320103, + "grad_norm": 0.82421875, + "learning_rate": 3.936397539317682e-05, + "loss": 0.7276, + "step": 2841 + }, + { + "epoch": 0.12460242173985826, + "grad_norm": 0.89453125, + "learning_rate": 3.9360314608532825e-05, + "loss": 0.855, + "step": 2842 + }, + { + "epoch": 0.1246462649565155, + "grad_norm": 0.85546875, + "learning_rate": 3.935665395241114e-05, + "loss": 0.7919, + "step": 2843 + }, + { + "epoch": 0.12469010817317273, + "grad_norm": 0.94921875, + "learning_rate": 3.9352993424819585e-05, + "loss": 0.7093, + "step": 2844 + }, + { + "epoch": 0.12473395138982997, + "grad_norm": 0.875, + "learning_rate": 3.934933302576591e-05, + "loss": 0.8713, + "step": 2845 + }, + { + "epoch": 0.1247777946064872, + "grad_norm": 0.859375, + "learning_rate": 3.9345672755257844e-05, + "loss": 0.8075, + "step": 2846 + }, + { + "epoch": 0.12482163782314444, + "grad_norm": 0.83984375, + "learning_rate": 3.934201261330316e-05, + "loss": 0.7994, + "step": 2847 + }, + { + "epoch": 0.12486548103980166, + "grad_norm": 0.7890625, + "learning_rate": 3.933835259990958e-05, + "loss": 0.6917, + "step": 2848 + }, + { + "epoch": 0.1249093242564589, + "grad_norm": 0.91796875, + "learning_rate": 3.933469271508493e-05, + "loss": 0.9122, + "step": 2849 + }, + { + "epoch": 0.12495316747311613, + "grad_norm": 0.765625, + "learning_rate": 3.933103295883693e-05, + "loss": 0.7664, + "step": 2850 + }, + { + "epoch": 0.12499701068977337, + "grad_norm": 0.91015625, + "learning_rate": 3.932737333117335e-05, + "loss": 0.7838, + "step": 2851 + }, + { + "epoch": 0.1250408539064306, + "grad_norm": 0.84375, + "learning_rate": 3.932371383210192e-05, + "loss": 0.8708, + "step": 2852 + }, + { + "epoch": 0.12508469712308784, + "grad_norm": 0.9140625, + "learning_rate": 3.932005446163037e-05, + "loss": 0.718, + "step": 2853 + }, + { + "epoch": 0.12512854033974508, + "grad_norm": 0.875, + "learning_rate": 3.931639521976654e-05, + "loss": 0.7975, + "step": 2854 + }, + { + "epoch": 0.1251723835564023, + "grad_norm": 0.84765625, + "learning_rate": 3.9312736106518146e-05, + "loss": 0.9699, + "step": 2855 + }, + { + "epoch": 0.12521622677305955, + "grad_norm": 0.84765625, + "learning_rate": 3.930907712189294e-05, + "loss": 0.7762, + "step": 2856 + }, + { + "epoch": 0.12526006998971678, + "grad_norm": 0.96484375, + "learning_rate": 3.9305418265898666e-05, + "loss": 0.8342, + "step": 2857 + }, + { + "epoch": 0.12530391320637402, + "grad_norm": 0.8671875, + "learning_rate": 3.930175953854309e-05, + "loss": 0.8609, + "step": 2858 + }, + { + "epoch": 0.12534775642303125, + "grad_norm": 0.859375, + "learning_rate": 3.929810093983398e-05, + "loss": 0.838, + "step": 2859 + }, + { + "epoch": 0.12539159963968846, + "grad_norm": 0.82421875, + "learning_rate": 3.929444246977906e-05, + "loss": 0.7761, + "step": 2860 + }, + { + "epoch": 0.1254354428563457, + "grad_norm": 0.83203125, + "learning_rate": 3.92907841283861e-05, + "loss": 0.8649, + "step": 2861 + }, + { + "epoch": 0.12547928607300293, + "grad_norm": 0.8203125, + "learning_rate": 3.928712591566286e-05, + "loss": 0.8268, + "step": 2862 + }, + { + "epoch": 0.12552312928966017, + "grad_norm": 0.83203125, + "learning_rate": 3.928346783161705e-05, + "loss": 0.6987, + "step": 2863 + }, + { + "epoch": 0.1255669725063174, + "grad_norm": 0.82421875, + "learning_rate": 3.927980987625649e-05, + "loss": 0.8579, + "step": 2864 + }, + { + "epoch": 0.12561081572297464, + "grad_norm": 0.83984375, + "learning_rate": 3.927615204958891e-05, + "loss": 0.935, + "step": 2865 + }, + { + "epoch": 0.12565465893963187, + "grad_norm": 0.796875, + "learning_rate": 3.927249435162205e-05, + "loss": 0.8828, + "step": 2866 + }, + { + "epoch": 0.1256985021562891, + "grad_norm": 0.890625, + "learning_rate": 3.9268836782363673e-05, + "loss": 0.8842, + "step": 2867 + }, + { + "epoch": 0.12574234537294635, + "grad_norm": 0.92578125, + "learning_rate": 3.926517934182148e-05, + "loss": 0.9658, + "step": 2868 + }, + { + "epoch": 0.12578618858960358, + "grad_norm": 0.8046875, + "learning_rate": 3.9261522030003307e-05, + "loss": 0.7256, + "step": 2869 + }, + { + "epoch": 0.12583003180626082, + "grad_norm": 0.78515625, + "learning_rate": 3.925786484691687e-05, + "loss": 0.7634, + "step": 2870 + }, + { + "epoch": 0.12587387502291805, + "grad_norm": 1.0234375, + "learning_rate": 3.925420779256992e-05, + "loss": 0.9344, + "step": 2871 + }, + { + "epoch": 0.1259177182395753, + "grad_norm": 0.859375, + "learning_rate": 3.9250550866970195e-05, + "loss": 0.9935, + "step": 2872 + }, + { + "epoch": 0.12596156145623252, + "grad_norm": 0.82421875, + "learning_rate": 3.924689407012542e-05, + "loss": 0.7678, + "step": 2873 + }, + { + "epoch": 0.12600540467288976, + "grad_norm": 0.80859375, + "learning_rate": 3.924323740204342e-05, + "loss": 0.7483, + "step": 2874 + }, + { + "epoch": 0.12604924788954697, + "grad_norm": 0.7734375, + "learning_rate": 3.9239580862731905e-05, + "loss": 0.6258, + "step": 2875 + }, + { + "epoch": 0.1260930911062042, + "grad_norm": 1.046875, + "learning_rate": 3.923592445219862e-05, + "loss": 0.8051, + "step": 2876 + }, + { + "epoch": 0.12613693432286144, + "grad_norm": 0.859375, + "learning_rate": 3.923226817045133e-05, + "loss": 0.8557, + "step": 2877 + }, + { + "epoch": 0.12618077753951867, + "grad_norm": 0.73828125, + "learning_rate": 3.9228612017497734e-05, + "loss": 0.7481, + "step": 2878 + }, + { + "epoch": 0.1262246207561759, + "grad_norm": 0.7734375, + "learning_rate": 3.922495599334566e-05, + "loss": 0.7378, + "step": 2879 + }, + { + "epoch": 0.12626846397283314, + "grad_norm": 0.80859375, + "learning_rate": 3.922130009800281e-05, + "loss": 0.737, + "step": 2880 + }, + { + "epoch": 0.12631230718949038, + "grad_norm": 0.89453125, + "learning_rate": 3.9217644331476954e-05, + "loss": 0.833, + "step": 2881 + }, + { + "epoch": 0.12635615040614762, + "grad_norm": 0.83203125, + "learning_rate": 3.9213988693775815e-05, + "loss": 0.7789, + "step": 2882 + }, + { + "epoch": 0.12639999362280485, + "grad_norm": 0.86328125, + "learning_rate": 3.921033318490712e-05, + "loss": 0.9139, + "step": 2883 + }, + { + "epoch": 0.1264438368394621, + "grad_norm": 0.86328125, + "learning_rate": 3.920667780487869e-05, + "loss": 0.8423, + "step": 2884 + }, + { + "epoch": 0.12648768005611932, + "grad_norm": 0.8359375, + "learning_rate": 3.920302255369822e-05, + "loss": 0.8969, + "step": 2885 + }, + { + "epoch": 0.12653152327277656, + "grad_norm": 0.78125, + "learning_rate": 3.919936743137348e-05, + "loss": 0.8714, + "step": 2886 + }, + { + "epoch": 0.1265753664894338, + "grad_norm": 0.81640625, + "learning_rate": 3.91957124379122e-05, + "loss": 0.8011, + "step": 2887 + }, + { + "epoch": 0.12661920970609103, + "grad_norm": 0.89453125, + "learning_rate": 3.919205757332209e-05, + "loss": 0.914, + "step": 2888 + }, + { + "epoch": 0.12666305292274826, + "grad_norm": 0.84765625, + "learning_rate": 3.918840283761099e-05, + "loss": 0.879, + "step": 2889 + }, + { + "epoch": 0.12670689613940547, + "grad_norm": 0.78515625, + "learning_rate": 3.9184748230786584e-05, + "loss": 0.7758, + "step": 2890 + }, + { + "epoch": 0.1267507393560627, + "grad_norm": 0.7890625, + "learning_rate": 3.918109375285664e-05, + "loss": 0.8756, + "step": 2891 + }, + { + "epoch": 0.12679458257271994, + "grad_norm": 0.97265625, + "learning_rate": 3.9177439403828875e-05, + "loss": 0.9007, + "step": 2892 + }, + { + "epoch": 0.12683842578937718, + "grad_norm": 0.77734375, + "learning_rate": 3.917378518371102e-05, + "loss": 0.8041, + "step": 2893 + }, + { + "epoch": 0.12688226900603441, + "grad_norm": 0.78125, + "learning_rate": 3.917013109251089e-05, + "loss": 0.9319, + "step": 2894 + }, + { + "epoch": 0.12692611222269165, + "grad_norm": 0.8203125, + "learning_rate": 3.916647713023619e-05, + "loss": 0.7708, + "step": 2895 + }, + { + "epoch": 0.12696995543934889, + "grad_norm": 0.71875, + "learning_rate": 3.916282329689466e-05, + "loss": 0.7975, + "step": 2896 + }, + { + "epoch": 0.12701379865600612, + "grad_norm": 0.90234375, + "learning_rate": 3.915916959249405e-05, + "loss": 0.7369, + "step": 2897 + }, + { + "epoch": 0.12705764187266336, + "grad_norm": 0.93359375, + "learning_rate": 3.9155516017042105e-05, + "loss": 0.85, + "step": 2898 + }, + { + "epoch": 0.1271014850893206, + "grad_norm": 0.85546875, + "learning_rate": 3.915186257054656e-05, + "loss": 0.8626, + "step": 2899 + }, + { + "epoch": 0.12714532830597783, + "grad_norm": 0.92578125, + "learning_rate": 3.9148209253015176e-05, + "loss": 0.7789, + "step": 2900 + }, + { + "epoch": 0.12718917152263506, + "grad_norm": 0.765625, + "learning_rate": 3.914455606445564e-05, + "loss": 0.8441, + "step": 2901 + }, + { + "epoch": 0.1272330147392923, + "grad_norm": 0.953125, + "learning_rate": 3.914090300487577e-05, + "loss": 0.6963, + "step": 2902 + }, + { + "epoch": 0.12727685795594953, + "grad_norm": 0.88671875, + "learning_rate": 3.913725007428329e-05, + "loss": 0.9219, + "step": 2903 + }, + { + "epoch": 0.12732070117260677, + "grad_norm": 0.734375, + "learning_rate": 3.9133597272685926e-05, + "loss": 0.8226, + "step": 2904 + }, + { + "epoch": 0.12736454438926398, + "grad_norm": 0.875, + "learning_rate": 3.9129944600091415e-05, + "loss": 0.8622, + "step": 2905 + }, + { + "epoch": 0.12740838760592121, + "grad_norm": 0.78125, + "learning_rate": 3.9126292056507516e-05, + "loss": 0.8436, + "step": 2906 + }, + { + "epoch": 0.12745223082257845, + "grad_norm": 0.80078125, + "learning_rate": 3.912263964194196e-05, + "loss": 0.8067, + "step": 2907 + }, + { + "epoch": 0.12749607403923569, + "grad_norm": 0.88671875, + "learning_rate": 3.911898735640246e-05, + "loss": 0.7563, + "step": 2908 + }, + { + "epoch": 0.12753991725589292, + "grad_norm": 0.828125, + "learning_rate": 3.911533519989682e-05, + "loss": 0.8358, + "step": 2909 + }, + { + "epoch": 0.12758376047255016, + "grad_norm": 0.78125, + "learning_rate": 3.9111683172432734e-05, + "loss": 0.782, + "step": 2910 + }, + { + "epoch": 0.1276276036892074, + "grad_norm": 0.83203125, + "learning_rate": 3.910803127401797e-05, + "loss": 0.8297, + "step": 2911 + }, + { + "epoch": 0.12767144690586463, + "grad_norm": 0.81640625, + "learning_rate": 3.910437950466025e-05, + "loss": 0.7547, + "step": 2912 + }, + { + "epoch": 0.12771529012252186, + "grad_norm": 0.8515625, + "learning_rate": 3.9100727864367284e-05, + "loss": 0.6874, + "step": 2913 + }, + { + "epoch": 0.1277591333391791, + "grad_norm": 0.87890625, + "learning_rate": 3.909707635314686e-05, + "loss": 0.8466, + "step": 2914 + }, + { + "epoch": 0.12780297655583633, + "grad_norm": 0.8671875, + "learning_rate": 3.909342497100672e-05, + "loss": 0.7831, + "step": 2915 + }, + { + "epoch": 0.12784681977249357, + "grad_norm": 0.84765625, + "learning_rate": 3.908977371795458e-05, + "loss": 0.8349, + "step": 2916 + }, + { + "epoch": 0.1278906629891508, + "grad_norm": 0.84765625, + "learning_rate": 3.9086122593998184e-05, + "loss": 0.7128, + "step": 2917 + }, + { + "epoch": 0.12793450620580804, + "grad_norm": 0.8359375, + "learning_rate": 3.908247159914524e-05, + "loss": 0.9244, + "step": 2918 + }, + { + "epoch": 0.12797834942246528, + "grad_norm": 0.8203125, + "learning_rate": 3.907882073340354e-05, + "loss": 0.9311, + "step": 2919 + }, + { + "epoch": 0.1280221926391225, + "grad_norm": 0.95703125, + "learning_rate": 3.9075169996780813e-05, + "loss": 0.9349, + "step": 2920 + }, + { + "epoch": 0.12806603585577972, + "grad_norm": 0.79296875, + "learning_rate": 3.9071519389284774e-05, + "loss": 0.7786, + "step": 2921 + }, + { + "epoch": 0.12810987907243696, + "grad_norm": 0.7421875, + "learning_rate": 3.906786891092316e-05, + "loss": 0.8532, + "step": 2922 + }, + { + "epoch": 0.1281537222890942, + "grad_norm": 0.86328125, + "learning_rate": 3.906421856170368e-05, + "loss": 0.8864, + "step": 2923 + }, + { + "epoch": 0.12819756550575143, + "grad_norm": 0.859375, + "learning_rate": 3.9060568341634154e-05, + "loss": 0.841, + "step": 2924 + }, + { + "epoch": 0.12824140872240866, + "grad_norm": 0.828125, + "learning_rate": 3.905691825072225e-05, + "loss": 0.7853, + "step": 2925 + }, + { + "epoch": 0.1282852519390659, + "grad_norm": 0.9140625, + "learning_rate": 3.905326828897574e-05, + "loss": 0.7305, + "step": 2926 + }, + { + "epoch": 0.12832909515572313, + "grad_norm": 0.828125, + "learning_rate": 3.904961845640234e-05, + "loss": 0.7129, + "step": 2927 + }, + { + "epoch": 0.12837293837238037, + "grad_norm": 0.77734375, + "learning_rate": 3.9045968753009753e-05, + "loss": 0.8834, + "step": 2928 + }, + { + "epoch": 0.1284167815890376, + "grad_norm": 0.8125, + "learning_rate": 3.9042319178805785e-05, + "loss": 0.8175, + "step": 2929 + }, + { + "epoch": 0.12846062480569484, + "grad_norm": 0.83984375, + "learning_rate": 3.903866973379814e-05, + "loss": 0.829, + "step": 2930 + }, + { + "epoch": 0.12850446802235208, + "grad_norm": 0.98046875, + "learning_rate": 3.903502041799455e-05, + "loss": 0.867, + "step": 2931 + }, + { + "epoch": 0.1285483112390093, + "grad_norm": 0.75, + "learning_rate": 3.9031371231402755e-05, + "loss": 0.7769, + "step": 2932 + }, + { + "epoch": 0.12859215445566655, + "grad_norm": 0.85546875, + "learning_rate": 3.902772217403045e-05, + "loss": 0.8775, + "step": 2933 + }, + { + "epoch": 0.12863599767232378, + "grad_norm": 0.7734375, + "learning_rate": 3.902407324588543e-05, + "loss": 0.7065, + "step": 2934 + }, + { + "epoch": 0.12867984088898102, + "grad_norm": 0.875, + "learning_rate": 3.90204244469754e-05, + "loss": 0.7227, + "step": 2935 + }, + { + "epoch": 0.12872368410563823, + "grad_norm": 0.80078125, + "learning_rate": 3.90167757773081e-05, + "loss": 0.8345, + "step": 2936 + }, + { + "epoch": 0.12876752732229546, + "grad_norm": 0.84375, + "learning_rate": 3.901312723689126e-05, + "loss": 0.8057, + "step": 2937 + }, + { + "epoch": 0.1288113705389527, + "grad_norm": 0.7890625, + "learning_rate": 3.900947882573257e-05, + "loss": 0.8045, + "step": 2938 + }, + { + "epoch": 0.12885521375560993, + "grad_norm": 0.81640625, + "learning_rate": 3.9005830543839835e-05, + "loss": 0.8189, + "step": 2939 + }, + { + "epoch": 0.12889905697226717, + "grad_norm": 0.734375, + "learning_rate": 3.900218239122076e-05, + "loss": 0.8215, + "step": 2940 + }, + { + "epoch": 0.1289429001889244, + "grad_norm": 0.83984375, + "learning_rate": 3.8998534367883074e-05, + "loss": 0.802, + "step": 2941 + }, + { + "epoch": 0.12898674340558164, + "grad_norm": 0.734375, + "learning_rate": 3.899488647383451e-05, + "loss": 0.7277, + "step": 2942 + }, + { + "epoch": 0.12903058662223887, + "grad_norm": 0.8828125, + "learning_rate": 3.89912387090828e-05, + "loss": 0.7923, + "step": 2943 + }, + { + "epoch": 0.1290744298388961, + "grad_norm": 0.78515625, + "learning_rate": 3.8987591073635665e-05, + "loss": 0.728, + "step": 2944 + }, + { + "epoch": 0.12911827305555335, + "grad_norm": 0.765625, + "learning_rate": 3.898394356750085e-05, + "loss": 0.8575, + "step": 2945 + }, + { + "epoch": 0.12916211627221058, + "grad_norm": 0.77734375, + "learning_rate": 3.898029619068607e-05, + "loss": 0.7874, + "step": 2946 + }, + { + "epoch": 0.12920595948886782, + "grad_norm": 0.8125, + "learning_rate": 3.897664894319908e-05, + "loss": 0.8825, + "step": 2947 + }, + { + "epoch": 0.12924980270552505, + "grad_norm": 0.7734375, + "learning_rate": 3.8973001825047554e-05, + "loss": 0.8319, + "step": 2948 + }, + { + "epoch": 0.1292936459221823, + "grad_norm": 1.1640625, + "learning_rate": 3.896935483623929e-05, + "loss": 0.8478, + "step": 2949 + }, + { + "epoch": 0.12933748913883952, + "grad_norm": 0.828125, + "learning_rate": 3.8965707976781994e-05, + "loss": 0.7946, + "step": 2950 + }, + { + "epoch": 0.12938133235549673, + "grad_norm": 0.96875, + "learning_rate": 3.896206124668339e-05, + "loss": 0.8271, + "step": 2951 + }, + { + "epoch": 0.12942517557215397, + "grad_norm": 0.734375, + "learning_rate": 3.89584146459512e-05, + "loss": 0.7678, + "step": 2952 + }, + { + "epoch": 0.1294690187888112, + "grad_norm": 0.9296875, + "learning_rate": 3.8954768174593136e-05, + "loss": 0.8416, + "step": 2953 + }, + { + "epoch": 0.12951286200546844, + "grad_norm": 0.8203125, + "learning_rate": 3.8951121832616986e-05, + "loss": 0.9049, + "step": 2954 + }, + { + "epoch": 0.12955670522212567, + "grad_norm": 0.88671875, + "learning_rate": 3.8947475620030446e-05, + "loss": 0.7315, + "step": 2955 + }, + { + "epoch": 0.1296005484387829, + "grad_norm": 0.83203125, + "learning_rate": 3.8943829536841246e-05, + "loss": 0.8102, + "step": 2956 + }, + { + "epoch": 0.12964439165544014, + "grad_norm": 1.0234375, + "learning_rate": 3.89401835830571e-05, + "loss": 0.8387, + "step": 2957 + }, + { + "epoch": 0.12968823487209738, + "grad_norm": 0.7734375, + "learning_rate": 3.893653775868572e-05, + "loss": 0.762, + "step": 2958 + }, + { + "epoch": 0.12973207808875462, + "grad_norm": 0.86328125, + "learning_rate": 3.8932892063734885e-05, + "loss": 0.7246, + "step": 2959 + }, + { + "epoch": 0.12977592130541185, + "grad_norm": 0.90234375, + "learning_rate": 3.892924649821229e-05, + "loss": 0.9479, + "step": 2960 + }, + { + "epoch": 0.1298197645220691, + "grad_norm": 0.84765625, + "learning_rate": 3.8925601062125674e-05, + "loss": 0.9853, + "step": 2961 + }, + { + "epoch": 0.12986360773872632, + "grad_norm": 0.8515625, + "learning_rate": 3.8921955755482754e-05, + "loss": 0.7697, + "step": 2962 + }, + { + "epoch": 0.12990745095538356, + "grad_norm": 0.85546875, + "learning_rate": 3.891831057829123e-05, + "loss": 0.9277, + "step": 2963 + }, + { + "epoch": 0.1299512941720408, + "grad_norm": 0.69921875, + "learning_rate": 3.891466553055888e-05, + "loss": 0.6479, + "step": 2964 + }, + { + "epoch": 0.12999513738869803, + "grad_norm": 0.76953125, + "learning_rate": 3.89110206122934e-05, + "loss": 0.842, + "step": 2965 + }, + { + "epoch": 0.13003898060535524, + "grad_norm": 0.859375, + "learning_rate": 3.890737582350253e-05, + "loss": 0.7702, + "step": 2966 + }, + { + "epoch": 0.13008282382201247, + "grad_norm": 0.83984375, + "learning_rate": 3.890373116419399e-05, + "loss": 0.815, + "step": 2967 + }, + { + "epoch": 0.1301266670386697, + "grad_norm": 0.8984375, + "learning_rate": 3.890008663437545e-05, + "loss": 0.7374, + "step": 2968 + }, + { + "epoch": 0.13017051025532694, + "grad_norm": 0.77734375, + "learning_rate": 3.889644223405473e-05, + "loss": 0.6909, + "step": 2969 + }, + { + "epoch": 0.13021435347198418, + "grad_norm": 0.80859375, + "learning_rate": 3.889279796323951e-05, + "loss": 0.7937, + "step": 2970 + }, + { + "epoch": 0.13025819668864141, + "grad_norm": 0.76171875, + "learning_rate": 3.888915382193751e-05, + "loss": 0.7357, + "step": 2971 + }, + { + "epoch": 0.13030203990529865, + "grad_norm": 0.91015625, + "learning_rate": 3.888550981015645e-05, + "loss": 0.7413, + "step": 2972 + }, + { + "epoch": 0.13034588312195589, + "grad_norm": 0.8203125, + "learning_rate": 3.888186592790402e-05, + "loss": 0.7759, + "step": 2973 + }, + { + "epoch": 0.13038972633861312, + "grad_norm": 0.8125, + "learning_rate": 3.8878222175188026e-05, + "loss": 0.6739, + "step": 2974 + }, + { + "epoch": 0.13043356955527036, + "grad_norm": 0.7578125, + "learning_rate": 3.887457855201615e-05, + "loss": 0.7807, + "step": 2975 + }, + { + "epoch": 0.1304774127719276, + "grad_norm": 0.82421875, + "learning_rate": 3.887093505839611e-05, + "loss": 0.7385, + "step": 2976 + }, + { + "epoch": 0.13052125598858483, + "grad_norm": 0.76953125, + "learning_rate": 3.8867291694335616e-05, + "loss": 0.8456, + "step": 2977 + }, + { + "epoch": 0.13056509920524206, + "grad_norm": 0.8671875, + "learning_rate": 3.886364845984242e-05, + "loss": 0.713, + "step": 2978 + }, + { + "epoch": 0.1306089424218993, + "grad_norm": 0.828125, + "learning_rate": 3.886000535492418e-05, + "loss": 0.9713, + "step": 2979 + }, + { + "epoch": 0.13065278563855653, + "grad_norm": 0.90625, + "learning_rate": 3.885636237958871e-05, + "loss": 0.943, + "step": 2980 + }, + { + "epoch": 0.13069662885521374, + "grad_norm": 0.86328125, + "learning_rate": 3.8852719533843674e-05, + "loss": 0.7347, + "step": 2981 + }, + { + "epoch": 0.13074047207187098, + "grad_norm": 0.73828125, + "learning_rate": 3.88490768176968e-05, + "loss": 0.769, + "step": 2982 + }, + { + "epoch": 0.1307843152885282, + "grad_norm": 0.76953125, + "learning_rate": 3.884543423115582e-05, + "loss": 0.7831, + "step": 2983 + }, + { + "epoch": 0.13082815850518545, + "grad_norm": 0.89453125, + "learning_rate": 3.884179177422845e-05, + "loss": 0.9215, + "step": 2984 + }, + { + "epoch": 0.13087200172184268, + "grad_norm": 0.8828125, + "learning_rate": 3.8838149446922404e-05, + "loss": 0.9948, + "step": 2985 + }, + { + "epoch": 0.13091584493849992, + "grad_norm": 0.98046875, + "learning_rate": 3.883450724924537e-05, + "loss": 0.8683, + "step": 2986 + }, + { + "epoch": 0.13095968815515716, + "grad_norm": 0.9140625, + "learning_rate": 3.8830865181205134e-05, + "loss": 0.8141, + "step": 2987 + }, + { + "epoch": 0.1310035313718144, + "grad_norm": 0.76171875, + "learning_rate": 3.882722324280938e-05, + "loss": 0.7579, + "step": 2988 + }, + { + "epoch": 0.13104737458847163, + "grad_norm": 0.82421875, + "learning_rate": 3.8823581434065835e-05, + "loss": 0.7368, + "step": 2989 + }, + { + "epoch": 0.13109121780512886, + "grad_norm": 0.8203125, + "learning_rate": 3.8819939754982215e-05, + "loss": 0.8657, + "step": 2990 + }, + { + "epoch": 0.1311350610217861, + "grad_norm": 1.0390625, + "learning_rate": 3.881629820556624e-05, + "loss": 0.8639, + "step": 2991 + }, + { + "epoch": 0.13117890423844333, + "grad_norm": 0.78515625, + "learning_rate": 3.881265678582561e-05, + "loss": 0.7934, + "step": 2992 + }, + { + "epoch": 0.13122274745510057, + "grad_norm": 0.78125, + "learning_rate": 3.880901549576807e-05, + "loss": 0.7686, + "step": 2993 + }, + { + "epoch": 0.1312665906717578, + "grad_norm": 0.87109375, + "learning_rate": 3.880537433540128e-05, + "loss": 0.773, + "step": 2994 + }, + { + "epoch": 0.13131043388841504, + "grad_norm": 0.828125, + "learning_rate": 3.8801733304733034e-05, + "loss": 0.9453, + "step": 2995 + }, + { + "epoch": 0.13135427710507225, + "grad_norm": 1.015625, + "learning_rate": 3.8798092403771026e-05, + "loss": 0.9452, + "step": 2996 + }, + { + "epoch": 0.13139812032172948, + "grad_norm": 0.8046875, + "learning_rate": 3.879445163252295e-05, + "loss": 0.6549, + "step": 2997 + }, + { + "epoch": 0.13144196353838672, + "grad_norm": 0.75, + "learning_rate": 3.879081099099655e-05, + "loss": 0.7809, + "step": 2998 + }, + { + "epoch": 0.13148580675504395, + "grad_norm": 0.890625, + "learning_rate": 3.878717047919949e-05, + "loss": 0.8246, + "step": 2999 + }, + { + "epoch": 0.1315296499717012, + "grad_norm": 0.828125, + "learning_rate": 3.8783530097139555e-05, + "loss": 0.8616, + "step": 3000 + }, + { + "epoch": 0.1315296499717012, + "eval_loss": 0.8106476664543152, + "eval_runtime": 296.8262, + "eval_samples_per_second": 33.69, + "eval_steps_per_second": 0.704, + "step": 3000 + }, + { + "epoch": 0.13157349318835843, + "grad_norm": 0.9296875, + "learning_rate": 3.877988984482442e-05, + "loss": 0.8124, + "step": 3001 + }, + { + "epoch": 0.13161733640501566, + "grad_norm": 0.921875, + "learning_rate": 3.877624972226183e-05, + "loss": 0.8727, + "step": 3002 + }, + { + "epoch": 0.1316611796216729, + "grad_norm": 0.8203125, + "learning_rate": 3.877260972945946e-05, + "loss": 0.8577, + "step": 3003 + }, + { + "epoch": 0.13170502283833013, + "grad_norm": 0.8359375, + "learning_rate": 3.876896986642502e-05, + "loss": 0.8054, + "step": 3004 + }, + { + "epoch": 0.13174886605498737, + "grad_norm": 0.7890625, + "learning_rate": 3.876533013316628e-05, + "loss": 0.7546, + "step": 3005 + }, + { + "epoch": 0.1317927092716446, + "grad_norm": 0.89453125, + "learning_rate": 3.876169052969092e-05, + "loss": 0.9434, + "step": 3006 + }, + { + "epoch": 0.13183655248830184, + "grad_norm": 0.8125, + "learning_rate": 3.875805105600666e-05, + "loss": 0.8218, + "step": 3007 + }, + { + "epoch": 0.13188039570495907, + "grad_norm": 0.8359375, + "learning_rate": 3.8754411712121205e-05, + "loss": 0.7923, + "step": 3008 + }, + { + "epoch": 0.1319242389216163, + "grad_norm": 0.78515625, + "learning_rate": 3.8750772498042246e-05, + "loss": 0.6966, + "step": 3009 + }, + { + "epoch": 0.13196808213827355, + "grad_norm": 0.80078125, + "learning_rate": 3.874713341377756e-05, + "loss": 0.8941, + "step": 3010 + }, + { + "epoch": 0.13201192535493078, + "grad_norm": 0.89453125, + "learning_rate": 3.8743494459334817e-05, + "loss": 0.8799, + "step": 3011 + }, + { + "epoch": 0.132055768571588, + "grad_norm": 0.82421875, + "learning_rate": 3.8739855634721735e-05, + "loss": 0.7944, + "step": 3012 + }, + { + "epoch": 0.13209961178824522, + "grad_norm": 0.828125, + "learning_rate": 3.873621693994604e-05, + "loss": 0.8274, + "step": 3013 + }, + { + "epoch": 0.13214345500490246, + "grad_norm": 0.8203125, + "learning_rate": 3.873257837501538e-05, + "loss": 0.7808, + "step": 3014 + }, + { + "epoch": 0.1321872982215597, + "grad_norm": 0.86328125, + "learning_rate": 3.872893993993757e-05, + "loss": 0.8087, + "step": 3015 + }, + { + "epoch": 0.13223114143821693, + "grad_norm": 0.83203125, + "learning_rate": 3.872530163472026e-05, + "loss": 0.8006, + "step": 3016 + }, + { + "epoch": 0.13227498465487417, + "grad_norm": 0.7578125, + "learning_rate": 3.872166345937117e-05, + "loss": 0.8158, + "step": 3017 + }, + { + "epoch": 0.1323188278715314, + "grad_norm": 0.87109375, + "learning_rate": 3.871802541389802e-05, + "loss": 0.9089, + "step": 3018 + }, + { + "epoch": 0.13236267108818864, + "grad_norm": 0.81640625, + "learning_rate": 3.871438749830847e-05, + "loss": 0.7571, + "step": 3019 + }, + { + "epoch": 0.13240651430484587, + "grad_norm": 0.8359375, + "learning_rate": 3.8710749712610305e-05, + "loss": 0.7813, + "step": 3020 + }, + { + "epoch": 0.1324503575215031, + "grad_norm": 0.83984375, + "learning_rate": 3.87071120568112e-05, + "loss": 0.8684, + "step": 3021 + }, + { + "epoch": 0.13249420073816034, + "grad_norm": 0.84765625, + "learning_rate": 3.8703474530918884e-05, + "loss": 0.924, + "step": 3022 + }, + { + "epoch": 0.13253804395481758, + "grad_norm": 0.83984375, + "learning_rate": 3.869983713494104e-05, + "loss": 0.8484, + "step": 3023 + }, + { + "epoch": 0.13258188717147482, + "grad_norm": 0.8203125, + "learning_rate": 3.8696199868885355e-05, + "loss": 0.7448, + "step": 3024 + }, + { + "epoch": 0.13262573038813205, + "grad_norm": 0.76171875, + "learning_rate": 3.8692562732759596e-05, + "loss": 0.8017, + "step": 3025 + }, + { + "epoch": 0.1326695736047893, + "grad_norm": 0.84765625, + "learning_rate": 3.8688925726571455e-05, + "loss": 0.7991, + "step": 3026 + }, + { + "epoch": 0.1327134168214465, + "grad_norm": 0.82421875, + "learning_rate": 3.868528885032864e-05, + "loss": 0.8141, + "step": 3027 + }, + { + "epoch": 0.13275726003810373, + "grad_norm": 0.9921875, + "learning_rate": 3.8681652104038836e-05, + "loss": 0.7669, + "step": 3028 + }, + { + "epoch": 0.13280110325476097, + "grad_norm": 0.87109375, + "learning_rate": 3.867801548770977e-05, + "loss": 0.8971, + "step": 3029 + }, + { + "epoch": 0.1328449464714182, + "grad_norm": 0.82421875, + "learning_rate": 3.867437900134915e-05, + "loss": 0.7386, + "step": 3030 + }, + { + "epoch": 0.13288878968807544, + "grad_norm": 0.8125, + "learning_rate": 3.8670742644964676e-05, + "loss": 0.812, + "step": 3031 + }, + { + "epoch": 0.13293263290473267, + "grad_norm": 0.84375, + "learning_rate": 3.8667106418564046e-05, + "loss": 0.772, + "step": 3032 + }, + { + "epoch": 0.1329764761213899, + "grad_norm": 0.8671875, + "learning_rate": 3.866347032215499e-05, + "loss": 0.786, + "step": 3033 + }, + { + "epoch": 0.13302031933804714, + "grad_norm": 0.859375, + "learning_rate": 3.865983435574517e-05, + "loss": 0.8039, + "step": 3034 + }, + { + "epoch": 0.13306416255470438, + "grad_norm": 0.77734375, + "learning_rate": 3.865619851934236e-05, + "loss": 0.7096, + "step": 3035 + }, + { + "epoch": 0.13310800577136161, + "grad_norm": 0.86328125, + "learning_rate": 3.8652562812954216e-05, + "loss": 0.7356, + "step": 3036 + }, + { + "epoch": 0.13315184898801885, + "grad_norm": 0.7890625, + "learning_rate": 3.8648927236588475e-05, + "loss": 0.7525, + "step": 3037 + }, + { + "epoch": 0.13319569220467609, + "grad_norm": 0.84765625, + "learning_rate": 3.864529179025282e-05, + "loss": 0.9958, + "step": 3038 + }, + { + "epoch": 0.13323953542133332, + "grad_norm": 0.796875, + "learning_rate": 3.8641656473954934e-05, + "loss": 0.9099, + "step": 3039 + }, + { + "epoch": 0.13328337863799056, + "grad_norm": 0.80078125, + "learning_rate": 3.8638021287702576e-05, + "loss": 1.0377, + "step": 3040 + }, + { + "epoch": 0.1333272218546478, + "grad_norm": 0.80859375, + "learning_rate": 3.863438623150343e-05, + "loss": 0.8266, + "step": 3041 + }, + { + "epoch": 0.133371065071305, + "grad_norm": 0.88671875, + "learning_rate": 3.8630751305365185e-05, + "loss": 0.8987, + "step": 3042 + }, + { + "epoch": 0.13341490828796224, + "grad_norm": 0.8515625, + "learning_rate": 3.8627116509295567e-05, + "loss": 0.772, + "step": 3043 + }, + { + "epoch": 0.13345875150461947, + "grad_norm": 0.734375, + "learning_rate": 3.862348184330222e-05, + "loss": 0.7225, + "step": 3044 + }, + { + "epoch": 0.1335025947212767, + "grad_norm": 0.91796875, + "learning_rate": 3.861984730739294e-05, + "loss": 0.8761, + "step": 3045 + }, + { + "epoch": 0.13354643793793394, + "grad_norm": 0.87890625, + "learning_rate": 3.861621290157539e-05, + "loss": 0.8453, + "step": 3046 + }, + { + "epoch": 0.13359028115459118, + "grad_norm": 0.8828125, + "learning_rate": 3.8612578625857266e-05, + "loss": 0.7879, + "step": 3047 + }, + { + "epoch": 0.13363412437124841, + "grad_norm": 0.83984375, + "learning_rate": 3.860894448024627e-05, + "loss": 0.9437, + "step": 3048 + }, + { + "epoch": 0.13367796758790565, + "grad_norm": 0.796875, + "learning_rate": 3.8605310464750066e-05, + "loss": 0.6918, + "step": 3049 + }, + { + "epoch": 0.13372181080456289, + "grad_norm": 0.875, + "learning_rate": 3.860167657937643e-05, + "loss": 0.8219, + "step": 3050 + }, + { + "epoch": 0.13376565402122012, + "grad_norm": 0.82421875, + "learning_rate": 3.859804282413304e-05, + "loss": 0.7607, + "step": 3051 + }, + { + "epoch": 0.13380949723787736, + "grad_norm": 0.81640625, + "learning_rate": 3.859440919902759e-05, + "loss": 0.8505, + "step": 3052 + }, + { + "epoch": 0.1338533404545346, + "grad_norm": 0.82421875, + "learning_rate": 3.859077570406778e-05, + "loss": 0.8544, + "step": 3053 + }, + { + "epoch": 0.13389718367119183, + "grad_norm": 0.85546875, + "learning_rate": 3.858714233926127e-05, + "loss": 0.8386, + "step": 3054 + }, + { + "epoch": 0.13394102688784906, + "grad_norm": 0.71484375, + "learning_rate": 3.8583509104615845e-05, + "loss": 0.7236, + "step": 3055 + }, + { + "epoch": 0.1339848701045063, + "grad_norm": 0.7890625, + "learning_rate": 3.857987600013916e-05, + "loss": 0.7957, + "step": 3056 + }, + { + "epoch": 0.1340287133211635, + "grad_norm": 0.80078125, + "learning_rate": 3.857624302583891e-05, + "loss": 0.6707, + "step": 3057 + }, + { + "epoch": 0.13407255653782074, + "grad_norm": 0.7734375, + "learning_rate": 3.85726101817228e-05, + "loss": 0.8932, + "step": 3058 + }, + { + "epoch": 0.13411639975447798, + "grad_norm": 0.91015625, + "learning_rate": 3.856897746779851e-05, + "loss": 0.7406, + "step": 3059 + }, + { + "epoch": 0.1341602429711352, + "grad_norm": 0.78515625, + "learning_rate": 3.856534488407378e-05, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.13420408618779245, + "grad_norm": 0.83984375, + "learning_rate": 3.856171243055631e-05, + "loss": 0.8105, + "step": 3061 + }, + { + "epoch": 0.13424792940444968, + "grad_norm": 0.74609375, + "learning_rate": 3.8558080107253756e-05, + "loss": 0.7948, + "step": 3062 + }, + { + "epoch": 0.13429177262110692, + "grad_norm": 0.82421875, + "learning_rate": 3.8554447914173854e-05, + "loss": 0.8779, + "step": 3063 + }, + { + "epoch": 0.13433561583776416, + "grad_norm": 0.7421875, + "learning_rate": 3.855081585132425e-05, + "loss": 0.8087, + "step": 3064 + }, + { + "epoch": 0.1343794590544214, + "grad_norm": 0.83203125, + "learning_rate": 3.854718391871271e-05, + "loss": 0.7777, + "step": 3065 + }, + { + "epoch": 0.13442330227107863, + "grad_norm": 0.796875, + "learning_rate": 3.854355211634692e-05, + "loss": 0.8216, + "step": 3066 + }, + { + "epoch": 0.13446714548773586, + "grad_norm": 0.8515625, + "learning_rate": 3.8539920444234535e-05, + "loss": 0.8664, + "step": 3067 + }, + { + "epoch": 0.1345109887043931, + "grad_norm": 0.81640625, + "learning_rate": 3.853628890238329e-05, + "loss": 0.8337, + "step": 3068 + }, + { + "epoch": 0.13455483192105033, + "grad_norm": 0.984375, + "learning_rate": 3.853265749080086e-05, + "loss": 0.806, + "step": 3069 + }, + { + "epoch": 0.13459867513770757, + "grad_norm": 0.8203125, + "learning_rate": 3.8529026209494945e-05, + "loss": 0.704, + "step": 3070 + }, + { + "epoch": 0.1346425183543648, + "grad_norm": 0.76953125, + "learning_rate": 3.8525395058473254e-05, + "loss": 0.6888, + "step": 3071 + }, + { + "epoch": 0.134686361571022, + "grad_norm": 0.7421875, + "learning_rate": 3.852176403774344e-05, + "loss": 0.8319, + "step": 3072 + }, + { + "epoch": 0.13473020478767925, + "grad_norm": 0.828125, + "learning_rate": 3.851813314731326e-05, + "loss": 1.0195, + "step": 3073 + }, + { + "epoch": 0.13477404800433648, + "grad_norm": 0.78515625, + "learning_rate": 3.8514502387190386e-05, + "loss": 0.7612, + "step": 3074 + }, + { + "epoch": 0.13481789122099372, + "grad_norm": 0.89453125, + "learning_rate": 3.851087175738252e-05, + "loss": 0.8532, + "step": 3075 + }, + { + "epoch": 0.13486173443765095, + "grad_norm": 0.83203125, + "learning_rate": 3.850724125789734e-05, + "loss": 0.9818, + "step": 3076 + }, + { + "epoch": 0.1349055776543082, + "grad_norm": 0.82421875, + "learning_rate": 3.8503610888742546e-05, + "loss": 0.7848, + "step": 3077 + }, + { + "epoch": 0.13494942087096543, + "grad_norm": 0.86328125, + "learning_rate": 3.849998064992584e-05, + "loss": 0.8393, + "step": 3078 + }, + { + "epoch": 0.13499326408762266, + "grad_norm": 0.80859375, + "learning_rate": 3.8496350541454864e-05, + "loss": 0.8762, + "step": 3079 + }, + { + "epoch": 0.1350371073042799, + "grad_norm": 0.890625, + "learning_rate": 3.8492720563337394e-05, + "loss": 0.9191, + "step": 3080 + }, + { + "epoch": 0.13508095052093713, + "grad_norm": 0.890625, + "learning_rate": 3.848909071558109e-05, + "loss": 0.8616, + "step": 3081 + }, + { + "epoch": 0.13512479373759437, + "grad_norm": 0.80078125, + "learning_rate": 3.848546099819365e-05, + "loss": 0.8573, + "step": 3082 + }, + { + "epoch": 0.1351686369542516, + "grad_norm": 0.78515625, + "learning_rate": 3.8481831411182745e-05, + "loss": 0.8984, + "step": 3083 + }, + { + "epoch": 0.13521248017090884, + "grad_norm": 0.8671875, + "learning_rate": 3.847820195455606e-05, + "loss": 0.7224, + "step": 3084 + }, + { + "epoch": 0.13525632338756607, + "grad_norm": 0.9453125, + "learning_rate": 3.847457262832134e-05, + "loss": 0.8584, + "step": 3085 + }, + { + "epoch": 0.1353001666042233, + "grad_norm": 0.796875, + "learning_rate": 3.8470943432486236e-05, + "loss": 0.7343, + "step": 3086 + }, + { + "epoch": 0.13534400982088052, + "grad_norm": 0.84765625, + "learning_rate": 3.846731436705846e-05, + "loss": 0.7853, + "step": 3087 + }, + { + "epoch": 0.13538785303753775, + "grad_norm": 0.92578125, + "learning_rate": 3.8463685432045704e-05, + "loss": 0.8415, + "step": 3088 + }, + { + "epoch": 0.135431696254195, + "grad_norm": 0.828125, + "learning_rate": 3.84600566274556e-05, + "loss": 0.8161, + "step": 3089 + }, + { + "epoch": 0.13547553947085222, + "grad_norm": 0.8203125, + "learning_rate": 3.845642795329593e-05, + "loss": 0.842, + "step": 3090 + }, + { + "epoch": 0.13551938268750946, + "grad_norm": 0.9765625, + "learning_rate": 3.8452799409574334e-05, + "loss": 0.8686, + "step": 3091 + }, + { + "epoch": 0.1355632259041667, + "grad_norm": 0.83984375, + "learning_rate": 3.8449170996298525e-05, + "loss": 0.8275, + "step": 3092 + }, + { + "epoch": 0.13560706912082393, + "grad_norm": 0.77734375, + "learning_rate": 3.844554271347618e-05, + "loss": 0.8152, + "step": 3093 + }, + { + "epoch": 0.13565091233748117, + "grad_norm": 0.72265625, + "learning_rate": 3.844191456111494e-05, + "loss": 0.8845, + "step": 3094 + }, + { + "epoch": 0.1356947555541384, + "grad_norm": 0.8359375, + "learning_rate": 3.843828653922259e-05, + "loss": 0.7965, + "step": 3095 + }, + { + "epoch": 0.13573859877079564, + "grad_norm": 0.8203125, + "learning_rate": 3.843465864780678e-05, + "loss": 0.8743, + "step": 3096 + }, + { + "epoch": 0.13578244198745287, + "grad_norm": 0.76171875, + "learning_rate": 3.8431030886875175e-05, + "loss": 0.6307, + "step": 3097 + }, + { + "epoch": 0.1358262852041101, + "grad_norm": 0.90625, + "learning_rate": 3.842740325643549e-05, + "loss": 0.7625, + "step": 3098 + }, + { + "epoch": 0.13587012842076734, + "grad_norm": 0.81640625, + "learning_rate": 3.842377575649536e-05, + "loss": 0.791, + "step": 3099 + }, + { + "epoch": 0.13591397163742458, + "grad_norm": 0.78515625, + "learning_rate": 3.8420148387062574e-05, + "loss": 0.809, + "step": 3100 + }, + { + "epoch": 0.13595781485408182, + "grad_norm": 0.8125, + "learning_rate": 3.8416521148144746e-05, + "loss": 0.9386, + "step": 3101 + }, + { + "epoch": 0.13600165807073905, + "grad_norm": 0.8671875, + "learning_rate": 3.841289403974958e-05, + "loss": 0.9474, + "step": 3102 + }, + { + "epoch": 0.13604550128739626, + "grad_norm": 0.765625, + "learning_rate": 3.840926706188477e-05, + "loss": 0.748, + "step": 3103 + }, + { + "epoch": 0.1360893445040535, + "grad_norm": 0.796875, + "learning_rate": 3.840564021455797e-05, + "loss": 0.706, + "step": 3104 + }, + { + "epoch": 0.13613318772071073, + "grad_norm": 0.8828125, + "learning_rate": 3.840201349777692e-05, + "loss": 0.8163, + "step": 3105 + }, + { + "epoch": 0.13617703093736797, + "grad_norm": 0.875, + "learning_rate": 3.839838691154929e-05, + "loss": 0.9005, + "step": 3106 + }, + { + "epoch": 0.1362208741540252, + "grad_norm": 0.85546875, + "learning_rate": 3.839476045588275e-05, + "loss": 0.8241, + "step": 3107 + }, + { + "epoch": 0.13626471737068244, + "grad_norm": 0.84765625, + "learning_rate": 3.8391134130785e-05, + "loss": 0.9084, + "step": 3108 + }, + { + "epoch": 0.13630856058733967, + "grad_norm": 0.76953125, + "learning_rate": 3.838750793626368e-05, + "loss": 0.7753, + "step": 3109 + }, + { + "epoch": 0.1363524038039969, + "grad_norm": 0.734375, + "learning_rate": 3.8383881872326555e-05, + "loss": 0.7511, + "step": 3110 + }, + { + "epoch": 0.13639624702065414, + "grad_norm": 0.82421875, + "learning_rate": 3.8380255938981266e-05, + "loss": 0.8093, + "step": 3111 + }, + { + "epoch": 0.13644009023731138, + "grad_norm": 0.8125, + "learning_rate": 3.83766301362355e-05, + "loss": 0.7985, + "step": 3112 + }, + { + "epoch": 0.13648393345396861, + "grad_norm": 0.80859375, + "learning_rate": 3.837300446409695e-05, + "loss": 0.8296, + "step": 3113 + }, + { + "epoch": 0.13652777667062585, + "grad_norm": 0.87109375, + "learning_rate": 3.8369378922573296e-05, + "loss": 0.8306, + "step": 3114 + }, + { + "epoch": 0.13657161988728309, + "grad_norm": 0.86328125, + "learning_rate": 3.836575351167222e-05, + "loss": 0.8656, + "step": 3115 + }, + { + "epoch": 0.13661546310394032, + "grad_norm": 0.7421875, + "learning_rate": 3.8362128231401405e-05, + "loss": 0.6779, + "step": 3116 + }, + { + "epoch": 0.13665930632059756, + "grad_norm": 0.8671875, + "learning_rate": 3.8358503081768526e-05, + "loss": 0.6722, + "step": 3117 + }, + { + "epoch": 0.13670314953725476, + "grad_norm": 0.8203125, + "learning_rate": 3.83548780627813e-05, + "loss": 0.8028, + "step": 3118 + }, + { + "epoch": 0.136746992753912, + "grad_norm": 0.82421875, + "learning_rate": 3.8351253174447335e-05, + "loss": 0.7893, + "step": 3119 + }, + { + "epoch": 0.13679083597056924, + "grad_norm": 0.81640625, + "learning_rate": 3.83476284167744e-05, + "loss": 0.7456, + "step": 3120 + }, + { + "epoch": 0.13683467918722647, + "grad_norm": 1.0859375, + "learning_rate": 3.8344003789770144e-05, + "loss": 0.8461, + "step": 3121 + }, + { + "epoch": 0.1368785224038837, + "grad_norm": 0.76171875, + "learning_rate": 3.834037929344225e-05, + "loss": 0.7738, + "step": 3122 + }, + { + "epoch": 0.13692236562054094, + "grad_norm": 0.76171875, + "learning_rate": 3.83367549277984e-05, + "loss": 0.785, + "step": 3123 + }, + { + "epoch": 0.13696620883719818, + "grad_norm": 0.921875, + "learning_rate": 3.833313069284626e-05, + "loss": 0.8372, + "step": 3124 + }, + { + "epoch": 0.1370100520538554, + "grad_norm": 0.8359375, + "learning_rate": 3.8329506588593514e-05, + "loss": 0.8084, + "step": 3125 + }, + { + "epoch": 0.13705389527051265, + "grad_norm": 0.7890625, + "learning_rate": 3.832588261504787e-05, + "loss": 0.9983, + "step": 3126 + }, + { + "epoch": 0.13709773848716988, + "grad_norm": 0.875, + "learning_rate": 3.8322258772217e-05, + "loss": 0.8823, + "step": 3127 + }, + { + "epoch": 0.13714158170382712, + "grad_norm": 0.734375, + "learning_rate": 3.8318635060108573e-05, + "loss": 0.7809, + "step": 3128 + }, + { + "epoch": 0.13718542492048436, + "grad_norm": 0.8515625, + "learning_rate": 3.831501147873028e-05, + "loss": 0.7948, + "step": 3129 + }, + { + "epoch": 0.1372292681371416, + "grad_norm": 0.7734375, + "learning_rate": 3.8311388028089755e-05, + "loss": 0.9298, + "step": 3130 + }, + { + "epoch": 0.13727311135379883, + "grad_norm": 0.8828125, + "learning_rate": 3.830776470819475e-05, + "loss": 1.0276, + "step": 3131 + }, + { + "epoch": 0.13731695457045606, + "grad_norm": 0.80078125, + "learning_rate": 3.830414151905292e-05, + "loss": 0.8748, + "step": 3132 + }, + { + "epoch": 0.13736079778711327, + "grad_norm": 0.8125, + "learning_rate": 3.830051846067192e-05, + "loss": 0.7741, + "step": 3133 + }, + { + "epoch": 0.1374046410037705, + "grad_norm": 0.80078125, + "learning_rate": 3.829689553305945e-05, + "loss": 0.8645, + "step": 3134 + }, + { + "epoch": 0.13744848422042774, + "grad_norm": 0.77734375, + "learning_rate": 3.829327273622316e-05, + "loss": 0.7785, + "step": 3135 + }, + { + "epoch": 0.13749232743708498, + "grad_norm": 0.82421875, + "learning_rate": 3.828965007017078e-05, + "loss": 0.737, + "step": 3136 + }, + { + "epoch": 0.1375361706537422, + "grad_norm": 0.73828125, + "learning_rate": 3.828602753490995e-05, + "loss": 0.7816, + "step": 3137 + }, + { + "epoch": 0.13758001387039945, + "grad_norm": 0.84375, + "learning_rate": 3.828240513044837e-05, + "loss": 0.9262, + "step": 3138 + }, + { + "epoch": 0.13762385708705668, + "grad_norm": 0.83203125, + "learning_rate": 3.82787828567937e-05, + "loss": 0.8661, + "step": 3139 + }, + { + "epoch": 0.13766770030371392, + "grad_norm": 0.78515625, + "learning_rate": 3.827516071395358e-05, + "loss": 0.8765, + "step": 3140 + }, + { + "epoch": 0.13771154352037115, + "grad_norm": 0.84375, + "learning_rate": 3.827153870193578e-05, + "loss": 0.7978, + "step": 3141 + }, + { + "epoch": 0.1377553867370284, + "grad_norm": 0.80078125, + "learning_rate": 3.826791682074792e-05, + "loss": 0.7507, + "step": 3142 + }, + { + "epoch": 0.13779922995368563, + "grad_norm": 0.90234375, + "learning_rate": 3.826429507039767e-05, + "loss": 0.8688, + "step": 3143 + }, + { + "epoch": 0.13784307317034286, + "grad_norm": 0.84765625, + "learning_rate": 3.8260673450892725e-05, + "loss": 0.8784, + "step": 3144 + }, + { + "epoch": 0.1378869163870001, + "grad_norm": 0.890625, + "learning_rate": 3.8257051962240734e-05, + "loss": 0.903, + "step": 3145 + }, + { + "epoch": 0.13793075960365733, + "grad_norm": 0.765625, + "learning_rate": 3.8253430604449404e-05, + "loss": 0.8415, + "step": 3146 + }, + { + "epoch": 0.13797460282031457, + "grad_norm": 0.81640625, + "learning_rate": 3.824980937752641e-05, + "loss": 0.8902, + "step": 3147 + }, + { + "epoch": 0.13801844603697178, + "grad_norm": 0.8203125, + "learning_rate": 3.824618828147941e-05, + "loss": 0.7713, + "step": 3148 + }, + { + "epoch": 0.138062289253629, + "grad_norm": 0.81640625, + "learning_rate": 3.8242567316316094e-05, + "loss": 0.8074, + "step": 3149 + }, + { + "epoch": 0.13810613247028625, + "grad_norm": 1.1953125, + "learning_rate": 3.823894648204408e-05, + "loss": 0.8646, + "step": 3150 + }, + { + "epoch": 0.13814997568694348, + "grad_norm": 0.828125, + "learning_rate": 3.8235325778671125e-05, + "loss": 0.7868, + "step": 3151 + }, + { + "epoch": 0.13819381890360072, + "grad_norm": 0.82421875, + "learning_rate": 3.823170520620488e-05, + "loss": 0.9043, + "step": 3152 + }, + { + "epoch": 0.13823766212025795, + "grad_norm": 0.8359375, + "learning_rate": 3.822808476465299e-05, + "loss": 0.9168, + "step": 3153 + }, + { + "epoch": 0.1382815053369152, + "grad_norm": 0.8515625, + "learning_rate": 3.822446445402316e-05, + "loss": 0.7353, + "step": 3154 + }, + { + "epoch": 0.13832534855357242, + "grad_norm": 0.89453125, + "learning_rate": 3.822084427432303e-05, + "loss": 0.9761, + "step": 3155 + }, + { + "epoch": 0.13836919177022966, + "grad_norm": 0.76953125, + "learning_rate": 3.82172242255603e-05, + "loss": 0.6936, + "step": 3156 + }, + { + "epoch": 0.1384130349868869, + "grad_norm": 0.78125, + "learning_rate": 3.821360430774259e-05, + "loss": 0.7551, + "step": 3157 + }, + { + "epoch": 0.13845687820354413, + "grad_norm": 0.75390625, + "learning_rate": 3.820998452087765e-05, + "loss": 0.6394, + "step": 3158 + }, + { + "epoch": 0.13850072142020137, + "grad_norm": 0.8203125, + "learning_rate": 3.820636486497312e-05, + "loss": 0.7648, + "step": 3159 + }, + { + "epoch": 0.1385445646368586, + "grad_norm": 0.84375, + "learning_rate": 3.820274534003666e-05, + "loss": 0.8284, + "step": 3160 + }, + { + "epoch": 0.13858840785351584, + "grad_norm": 1.078125, + "learning_rate": 3.819912594607597e-05, + "loss": 0.8485, + "step": 3161 + }, + { + "epoch": 0.13863225107017307, + "grad_norm": 0.86328125, + "learning_rate": 3.819550668309868e-05, + "loss": 0.7755, + "step": 3162 + }, + { + "epoch": 0.13867609428683028, + "grad_norm": 0.8828125, + "learning_rate": 3.819188755111248e-05, + "loss": 0.8353, + "step": 3163 + }, + { + "epoch": 0.13871993750348752, + "grad_norm": 0.8125, + "learning_rate": 3.818826855012505e-05, + "loss": 0.6494, + "step": 3164 + }, + { + "epoch": 0.13876378072014475, + "grad_norm": 0.87109375, + "learning_rate": 3.818464968014401e-05, + "loss": 0.8688, + "step": 3165 + }, + { + "epoch": 0.138807623936802, + "grad_norm": 0.80859375, + "learning_rate": 3.8181030941177123e-05, + "loss": 0.85, + "step": 3166 + }, + { + "epoch": 0.13885146715345922, + "grad_norm": 0.8359375, + "learning_rate": 3.817741233323199e-05, + "loss": 0.8273, + "step": 3167 + }, + { + "epoch": 0.13889531037011646, + "grad_norm": 0.8515625, + "learning_rate": 3.81737938563163e-05, + "loss": 0.8502, + "step": 3168 + }, + { + "epoch": 0.1389391535867737, + "grad_norm": 0.8359375, + "learning_rate": 3.817017551043772e-05, + "loss": 0.8882, + "step": 3169 + }, + { + "epoch": 0.13898299680343093, + "grad_norm": 0.85546875, + "learning_rate": 3.81665572956039e-05, + "loss": 0.8873, + "step": 3170 + }, + { + "epoch": 0.13902684002008817, + "grad_norm": 0.8515625, + "learning_rate": 3.8162939211822544e-05, + "loss": 0.7976, + "step": 3171 + }, + { + "epoch": 0.1390706832367454, + "grad_norm": 0.8125, + "learning_rate": 3.815932125910131e-05, + "loss": 0.8521, + "step": 3172 + }, + { + "epoch": 0.13911452645340264, + "grad_norm": 0.8203125, + "learning_rate": 3.815570343744787e-05, + "loss": 0.9299, + "step": 3173 + }, + { + "epoch": 0.13915836967005987, + "grad_norm": 0.84375, + "learning_rate": 3.815208574686986e-05, + "loss": 0.8418, + "step": 3174 + }, + { + "epoch": 0.1392022128867171, + "grad_norm": 0.8828125, + "learning_rate": 3.814846818737495e-05, + "loss": 0.7782, + "step": 3175 + }, + { + "epoch": 0.13924605610337434, + "grad_norm": 0.86328125, + "learning_rate": 3.814485075897086e-05, + "loss": 0.8841, + "step": 3176 + }, + { + "epoch": 0.13928989932003158, + "grad_norm": 0.796875, + "learning_rate": 3.8141233461665215e-05, + "loss": 0.7624, + "step": 3177 + }, + { + "epoch": 0.1393337425366888, + "grad_norm": 0.8125, + "learning_rate": 3.813761629546571e-05, + "loss": 0.7709, + "step": 3178 + }, + { + "epoch": 0.13937758575334602, + "grad_norm": 0.9609375, + "learning_rate": 3.813399926037997e-05, + "loss": 0.9722, + "step": 3179 + }, + { + "epoch": 0.13942142897000326, + "grad_norm": 0.9765625, + "learning_rate": 3.813038235641565e-05, + "loss": 0.8615, + "step": 3180 + }, + { + "epoch": 0.1394652721866605, + "grad_norm": 0.7890625, + "learning_rate": 3.812676558358049e-05, + "loss": 0.8871, + "step": 3181 + }, + { + "epoch": 0.13950911540331773, + "grad_norm": 0.87890625, + "learning_rate": 3.812314894188212e-05, + "loss": 0.7999, + "step": 3182 + }, + { + "epoch": 0.13955295861997496, + "grad_norm": 0.8515625, + "learning_rate": 3.8119532431328185e-05, + "loss": 0.8088, + "step": 3183 + }, + { + "epoch": 0.1395968018366322, + "grad_norm": 0.8671875, + "learning_rate": 3.811591605192637e-05, + "loss": 0.8689, + "step": 3184 + }, + { + "epoch": 0.13964064505328944, + "grad_norm": 0.84375, + "learning_rate": 3.81122998036843e-05, + "loss": 0.7518, + "step": 3185 + }, + { + "epoch": 0.13968448826994667, + "grad_norm": 0.79296875, + "learning_rate": 3.81086836866097e-05, + "loss": 0.7989, + "step": 3186 + }, + { + "epoch": 0.1397283314866039, + "grad_norm": 0.9375, + "learning_rate": 3.810506770071022e-05, + "loss": 0.8587, + "step": 3187 + }, + { + "epoch": 0.13977217470326114, + "grad_norm": 0.77734375, + "learning_rate": 3.810145184599351e-05, + "loss": 0.7097, + "step": 3188 + }, + { + "epoch": 0.13981601791991838, + "grad_norm": 0.86328125, + "learning_rate": 3.809783612246722e-05, + "loss": 0.7977, + "step": 3189 + }, + { + "epoch": 0.13985986113657561, + "grad_norm": 0.84765625, + "learning_rate": 3.8094220530139016e-05, + "loss": 0.9148, + "step": 3190 + }, + { + "epoch": 0.13990370435323285, + "grad_norm": 0.7421875, + "learning_rate": 3.8090605069016595e-05, + "loss": 0.8846, + "step": 3191 + }, + { + "epoch": 0.13994754756989009, + "grad_norm": 0.9609375, + "learning_rate": 3.8086989739107593e-05, + "loss": 0.935, + "step": 3192 + }, + { + "epoch": 0.13999139078654732, + "grad_norm": 0.78125, + "learning_rate": 3.808337454041969e-05, + "loss": 0.87, + "step": 3193 + }, + { + "epoch": 0.14003523400320453, + "grad_norm": 0.79296875, + "learning_rate": 3.807975947296053e-05, + "loss": 0.8131, + "step": 3194 + }, + { + "epoch": 0.14007907721986176, + "grad_norm": 0.8359375, + "learning_rate": 3.807614453673775e-05, + "loss": 0.8391, + "step": 3195 + }, + { + "epoch": 0.140122920436519, + "grad_norm": 0.75, + "learning_rate": 3.8072529731759066e-05, + "loss": 0.6637, + "step": 3196 + }, + { + "epoch": 0.14016676365317624, + "grad_norm": 0.91796875, + "learning_rate": 3.8068915058032116e-05, + "loss": 0.9455, + "step": 3197 + }, + { + "epoch": 0.14021060686983347, + "grad_norm": 0.80859375, + "learning_rate": 3.806530051556457e-05, + "loss": 0.7939, + "step": 3198 + }, + { + "epoch": 0.1402544500864907, + "grad_norm": 0.91015625, + "learning_rate": 3.806168610436408e-05, + "loss": 0.8904, + "step": 3199 + }, + { + "epoch": 0.14029829330314794, + "grad_norm": 0.78125, + "learning_rate": 3.8058071824438304e-05, + "loss": 0.9943, + "step": 3200 + }, + { + "epoch": 0.14034213651980518, + "grad_norm": 0.859375, + "learning_rate": 3.80544576757949e-05, + "loss": 0.743, + "step": 3201 + }, + { + "epoch": 0.1403859797364624, + "grad_norm": 0.78125, + "learning_rate": 3.805084365844154e-05, + "loss": 0.671, + "step": 3202 + }, + { + "epoch": 0.14042982295311965, + "grad_norm": 0.78515625, + "learning_rate": 3.804722977238586e-05, + "loss": 0.827, + "step": 3203 + }, + { + "epoch": 0.14047366616977688, + "grad_norm": 0.8203125, + "learning_rate": 3.804361601763555e-05, + "loss": 0.8165, + "step": 3204 + }, + { + "epoch": 0.14051750938643412, + "grad_norm": 0.89453125, + "learning_rate": 3.804000239419822e-05, + "loss": 0.8607, + "step": 3205 + }, + { + "epoch": 0.14056135260309136, + "grad_norm": 0.82421875, + "learning_rate": 3.8036388902081586e-05, + "loss": 0.78, + "step": 3206 + }, + { + "epoch": 0.1406051958197486, + "grad_norm": 0.9140625, + "learning_rate": 3.80327755412933e-05, + "loss": 0.8194, + "step": 3207 + }, + { + "epoch": 0.14064903903640583, + "grad_norm": 0.86328125, + "learning_rate": 3.8029162311840995e-05, + "loss": 0.9343, + "step": 3208 + }, + { + "epoch": 0.14069288225306303, + "grad_norm": 0.875, + "learning_rate": 3.8025549213732336e-05, + "loss": 0.9176, + "step": 3209 + }, + { + "epoch": 0.14073672546972027, + "grad_norm": 0.82421875, + "learning_rate": 3.8021936246974954e-05, + "loss": 0.8732, + "step": 3210 + }, + { + "epoch": 0.1407805686863775, + "grad_norm": 0.78515625, + "learning_rate": 3.801832341157657e-05, + "loss": 0.7967, + "step": 3211 + }, + { + "epoch": 0.14082441190303474, + "grad_norm": 0.96484375, + "learning_rate": 3.801471070754481e-05, + "loss": 0.8893, + "step": 3212 + }, + { + "epoch": 0.14086825511969198, + "grad_norm": 0.92578125, + "learning_rate": 3.801109813488732e-05, + "loss": 0.722, + "step": 3213 + }, + { + "epoch": 0.1409120983363492, + "grad_norm": 0.7890625, + "learning_rate": 3.8007485693611764e-05, + "loss": 0.752, + "step": 3214 + }, + { + "epoch": 0.14095594155300645, + "grad_norm": 0.7734375, + "learning_rate": 3.8003873383725776e-05, + "loss": 0.7967, + "step": 3215 + }, + { + "epoch": 0.14099978476966368, + "grad_norm": 0.859375, + "learning_rate": 3.8000261205237055e-05, + "loss": 0.9157, + "step": 3216 + }, + { + "epoch": 0.14104362798632092, + "grad_norm": 0.7890625, + "learning_rate": 3.7996649158153244e-05, + "loss": 0.6824, + "step": 3217 + }, + { + "epoch": 0.14108747120297815, + "grad_norm": 0.8359375, + "learning_rate": 3.799303724248199e-05, + "loss": 0.7369, + "step": 3218 + }, + { + "epoch": 0.1411313144196354, + "grad_norm": 0.8125, + "learning_rate": 3.7989425458230954e-05, + "loss": 0.7806, + "step": 3219 + }, + { + "epoch": 0.14117515763629263, + "grad_norm": 0.75390625, + "learning_rate": 3.798581380540774e-05, + "loss": 0.8044, + "step": 3220 + }, + { + "epoch": 0.14121900085294986, + "grad_norm": 0.8359375, + "learning_rate": 3.79822022840201e-05, + "loss": 0.8542, + "step": 3221 + }, + { + "epoch": 0.1412628440696071, + "grad_norm": 0.8359375, + "learning_rate": 3.797859089407563e-05, + "loss": 0.7297, + "step": 3222 + }, + { + "epoch": 0.14130668728626433, + "grad_norm": 0.81640625, + "learning_rate": 3.7974979635581984e-05, + "loss": 0.8753, + "step": 3223 + }, + { + "epoch": 0.14135053050292154, + "grad_norm": 0.796875, + "learning_rate": 3.797136850854683e-05, + "loss": 0.8145, + "step": 3224 + }, + { + "epoch": 0.14139437371957878, + "grad_norm": 0.8046875, + "learning_rate": 3.7967757512977784e-05, + "loss": 0.7005, + "step": 3225 + }, + { + "epoch": 0.141438216936236, + "grad_norm": 0.80078125, + "learning_rate": 3.7964146648882556e-05, + "loss": 0.8891, + "step": 3226 + }, + { + "epoch": 0.14148206015289325, + "grad_norm": 0.8203125, + "learning_rate": 3.796053591626878e-05, + "loss": 0.748, + "step": 3227 + }, + { + "epoch": 0.14152590336955048, + "grad_norm": 0.91015625, + "learning_rate": 3.79569253151441e-05, + "loss": 0.8495, + "step": 3228 + }, + { + "epoch": 0.14156974658620772, + "grad_norm": 0.78515625, + "learning_rate": 3.7953314845516175e-05, + "loss": 0.789, + "step": 3229 + }, + { + "epoch": 0.14161358980286495, + "grad_norm": 0.8125, + "learning_rate": 3.7949704507392615e-05, + "loss": 0.7764, + "step": 3230 + }, + { + "epoch": 0.1416574330195222, + "grad_norm": 0.96875, + "learning_rate": 3.7946094300781145e-05, + "loss": 0.7873, + "step": 3231 + }, + { + "epoch": 0.14170127623617942, + "grad_norm": 0.875, + "learning_rate": 3.794248422568938e-05, + "loss": 0.7471, + "step": 3232 + }, + { + "epoch": 0.14174511945283666, + "grad_norm": 0.8203125, + "learning_rate": 3.793887428212497e-05, + "loss": 0.8158, + "step": 3233 + }, + { + "epoch": 0.1417889626694939, + "grad_norm": 0.796875, + "learning_rate": 3.793526447009558e-05, + "loss": 0.7563, + "step": 3234 + }, + { + "epoch": 0.14183280588615113, + "grad_norm": 0.80859375, + "learning_rate": 3.79316547896088e-05, + "loss": 0.8571, + "step": 3235 + }, + { + "epoch": 0.14187664910280837, + "grad_norm": 0.85546875, + "learning_rate": 3.7928045240672374e-05, + "loss": 0.8243, + "step": 3236 + }, + { + "epoch": 0.1419204923194656, + "grad_norm": 0.82421875, + "learning_rate": 3.79244358232939e-05, + "loss": 0.8113, + "step": 3237 + }, + { + "epoch": 0.14196433553612284, + "grad_norm": 0.8828125, + "learning_rate": 3.7920826537481046e-05, + "loss": 0.7879, + "step": 3238 + }, + { + "epoch": 0.14200817875278005, + "grad_norm": 0.765625, + "learning_rate": 3.791721738324144e-05, + "loss": 0.906, + "step": 3239 + }, + { + "epoch": 0.14205202196943728, + "grad_norm": 0.77734375, + "learning_rate": 3.791360836058275e-05, + "loss": 0.7609, + "step": 3240 + }, + { + "epoch": 0.14209586518609452, + "grad_norm": 0.79296875, + "learning_rate": 3.7909999469512624e-05, + "loss": 0.8623, + "step": 3241 + }, + { + "epoch": 0.14213970840275175, + "grad_norm": 0.74609375, + "learning_rate": 3.7906390710038696e-05, + "loss": 0.8095, + "step": 3242 + }, + { + "epoch": 0.142183551619409, + "grad_norm": 0.80859375, + "learning_rate": 3.7902782082168596e-05, + "loss": 0.791, + "step": 3243 + }, + { + "epoch": 0.14222739483606622, + "grad_norm": 0.8515625, + "learning_rate": 3.789917358591003e-05, + "loss": 0.8671, + "step": 3244 + }, + { + "epoch": 0.14227123805272346, + "grad_norm": 0.94140625, + "learning_rate": 3.789556522127062e-05, + "loss": 0.8733, + "step": 3245 + }, + { + "epoch": 0.1423150812693807, + "grad_norm": 0.796875, + "learning_rate": 3.7891956988258014e-05, + "loss": 0.7831, + "step": 3246 + }, + { + "epoch": 0.14235892448603793, + "grad_norm": 0.81640625, + "learning_rate": 3.788834888687984e-05, + "loss": 0.7943, + "step": 3247 + }, + { + "epoch": 0.14240276770269517, + "grad_norm": 0.85546875, + "learning_rate": 3.788474091714378e-05, + "loss": 0.7992, + "step": 3248 + }, + { + "epoch": 0.1424466109193524, + "grad_norm": 0.91796875, + "learning_rate": 3.788113307905745e-05, + "loss": 0.9539, + "step": 3249 + }, + { + "epoch": 0.14249045413600964, + "grad_norm": 0.8046875, + "learning_rate": 3.7877525372628475e-05, + "loss": 0.7603, + "step": 3250 + }, + { + "epoch": 0.14253429735266687, + "grad_norm": 0.75390625, + "learning_rate": 3.7873917797864565e-05, + "loss": 0.7054, + "step": 3251 + }, + { + "epoch": 0.1425781405693241, + "grad_norm": 0.8046875, + "learning_rate": 3.787031035477334e-05, + "loss": 0.79, + "step": 3252 + }, + { + "epoch": 0.14262198378598134, + "grad_norm": 0.9140625, + "learning_rate": 3.786670304336244e-05, + "loss": 0.7614, + "step": 3253 + }, + { + "epoch": 0.14266582700263855, + "grad_norm": 0.76953125, + "learning_rate": 3.786309586363951e-05, + "loss": 0.814, + "step": 3254 + }, + { + "epoch": 0.1427096702192958, + "grad_norm": 0.8046875, + "learning_rate": 3.7859488815612165e-05, + "loss": 0.8472, + "step": 3255 + }, + { + "epoch": 0.14275351343595302, + "grad_norm": 0.83203125, + "learning_rate": 3.7855881899288117e-05, + "loss": 0.9488, + "step": 3256 + }, + { + "epoch": 0.14279735665261026, + "grad_norm": 0.8125, + "learning_rate": 3.785227511467496e-05, + "loss": 0.7308, + "step": 3257 + }, + { + "epoch": 0.1428411998692675, + "grad_norm": 0.91015625, + "learning_rate": 3.784866846178037e-05, + "loss": 0.8514, + "step": 3258 + }, + { + "epoch": 0.14288504308592473, + "grad_norm": 0.78515625, + "learning_rate": 3.784506194061197e-05, + "loss": 0.6934, + "step": 3259 + }, + { + "epoch": 0.14292888630258196, + "grad_norm": 0.82421875, + "learning_rate": 3.784145555117737e-05, + "loss": 0.8814, + "step": 3260 + }, + { + "epoch": 0.1429727295192392, + "grad_norm": 1.125, + "learning_rate": 3.78378492934843e-05, + "loss": 0.8879, + "step": 3261 + }, + { + "epoch": 0.14301657273589644, + "grad_norm": 0.8203125, + "learning_rate": 3.783424316754035e-05, + "loss": 0.9097, + "step": 3262 + }, + { + "epoch": 0.14306041595255367, + "grad_norm": 0.7890625, + "learning_rate": 3.7830637173353164e-05, + "loss": 0.6794, + "step": 3263 + }, + { + "epoch": 0.1431042591692109, + "grad_norm": 0.828125, + "learning_rate": 3.782703131093039e-05, + "loss": 0.7677, + "step": 3264 + }, + { + "epoch": 0.14314810238586814, + "grad_norm": 0.80859375, + "learning_rate": 3.782342558027967e-05, + "loss": 0.8528, + "step": 3265 + }, + { + "epoch": 0.14319194560252538, + "grad_norm": 0.796875, + "learning_rate": 3.781981998140861e-05, + "loss": 0.7784, + "step": 3266 + }, + { + "epoch": 0.1432357888191826, + "grad_norm": 0.828125, + "learning_rate": 3.781621451432492e-05, + "loss": 0.7287, + "step": 3267 + }, + { + "epoch": 0.14327963203583985, + "grad_norm": 0.80078125, + "learning_rate": 3.781260917903622e-05, + "loss": 0.998, + "step": 3268 + }, + { + "epoch": 0.14332347525249706, + "grad_norm": 0.796875, + "learning_rate": 3.780900397555013e-05, + "loss": 0.749, + "step": 3269 + }, + { + "epoch": 0.1433673184691543, + "grad_norm": 0.73828125, + "learning_rate": 3.78053989038743e-05, + "loss": 0.8143, + "step": 3270 + }, + { + "epoch": 0.14341116168581153, + "grad_norm": 0.8203125, + "learning_rate": 3.780179396401634e-05, + "loss": 0.857, + "step": 3271 + }, + { + "epoch": 0.14345500490246876, + "grad_norm": 0.828125, + "learning_rate": 3.779818915598397e-05, + "loss": 0.8621, + "step": 3272 + }, + { + "epoch": 0.143498848119126, + "grad_norm": 0.84765625, + "learning_rate": 3.7794584479784766e-05, + "loss": 0.8132, + "step": 3273 + }, + { + "epoch": 0.14354269133578323, + "grad_norm": 0.85546875, + "learning_rate": 3.77909799354264e-05, + "loss": 0.8431, + "step": 3274 + }, + { + "epoch": 0.14358653455244047, + "grad_norm": 0.8671875, + "learning_rate": 3.7787375522916476e-05, + "loss": 0.8363, + "step": 3275 + }, + { + "epoch": 0.1436303777690977, + "grad_norm": 0.76171875, + "learning_rate": 3.7783771242262634e-05, + "loss": 0.7612, + "step": 3276 + }, + { + "epoch": 0.14367422098575494, + "grad_norm": 0.84375, + "learning_rate": 3.7780167093472563e-05, + "loss": 0.8472, + "step": 3277 + }, + { + "epoch": 0.14371806420241218, + "grad_norm": 0.8203125, + "learning_rate": 3.7776563076553874e-05, + "loss": 0.7852, + "step": 3278 + }, + { + "epoch": 0.1437619074190694, + "grad_norm": 0.79296875, + "learning_rate": 3.7772959191514204e-05, + "loss": 0.8632, + "step": 3279 + }, + { + "epoch": 0.14380575063572665, + "grad_norm": 0.83203125, + "learning_rate": 3.776935543836119e-05, + "loss": 0.7169, + "step": 3280 + }, + { + "epoch": 0.14384959385238388, + "grad_norm": 0.8203125, + "learning_rate": 3.7765751817102424e-05, + "loss": 0.8035, + "step": 3281 + }, + { + "epoch": 0.14389343706904112, + "grad_norm": 0.78515625, + "learning_rate": 3.7762148327745637e-05, + "loss": 0.7022, + "step": 3282 + }, + { + "epoch": 0.14393728028569835, + "grad_norm": 0.91796875, + "learning_rate": 3.7758544970298414e-05, + "loss": 0.7588, + "step": 3283 + }, + { + "epoch": 0.1439811235023556, + "grad_norm": 0.81640625, + "learning_rate": 3.77549417447684e-05, + "loss": 0.791, + "step": 3284 + }, + { + "epoch": 0.1440249667190128, + "grad_norm": 0.95703125, + "learning_rate": 3.775133865116323e-05, + "loss": 0.9642, + "step": 3285 + }, + { + "epoch": 0.14406880993567003, + "grad_norm": 0.81640625, + "learning_rate": 3.774773568949054e-05, + "loss": 0.7758, + "step": 3286 + }, + { + "epoch": 0.14411265315232727, + "grad_norm": 0.859375, + "learning_rate": 3.7744132859757955e-05, + "loss": 0.7665, + "step": 3287 + }, + { + "epoch": 0.1441564963689845, + "grad_norm": 0.82421875, + "learning_rate": 3.7740530161973134e-05, + "loss": 0.8031, + "step": 3288 + }, + { + "epoch": 0.14420033958564174, + "grad_norm": 0.85546875, + "learning_rate": 3.77369275961437e-05, + "loss": 0.8385, + "step": 3289 + }, + { + "epoch": 0.14424418280229898, + "grad_norm": 0.83984375, + "learning_rate": 3.773332516227729e-05, + "loss": 0.803, + "step": 3290 + }, + { + "epoch": 0.1442880260189562, + "grad_norm": 0.84765625, + "learning_rate": 3.77297228603815e-05, + "loss": 0.8579, + "step": 3291 + }, + { + "epoch": 0.14433186923561345, + "grad_norm": 0.87109375, + "learning_rate": 3.772612069046403e-05, + "loss": 0.8234, + "step": 3292 + }, + { + "epoch": 0.14437571245227068, + "grad_norm": 0.8046875, + "learning_rate": 3.7722518652532505e-05, + "loss": 0.8253, + "step": 3293 + }, + { + "epoch": 0.14441955566892792, + "grad_norm": 0.80859375, + "learning_rate": 3.7718916746594524e-05, + "loss": 0.8431, + "step": 3294 + }, + { + "epoch": 0.14446339888558515, + "grad_norm": 0.88671875, + "learning_rate": 3.771531497265776e-05, + "loss": 0.8638, + "step": 3295 + }, + { + "epoch": 0.1445072421022424, + "grad_norm": 0.765625, + "learning_rate": 3.771171333072977e-05, + "loss": 0.7859, + "step": 3296 + }, + { + "epoch": 0.14455108531889962, + "grad_norm": 0.84765625, + "learning_rate": 3.770811182081829e-05, + "loss": 0.8027, + "step": 3297 + }, + { + "epoch": 0.14459492853555686, + "grad_norm": 0.921875, + "learning_rate": 3.770451044293091e-05, + "loss": 0.8044, + "step": 3298 + }, + { + "epoch": 0.1446387717522141, + "grad_norm": 0.8046875, + "learning_rate": 3.7700909197075254e-05, + "loss": 0.8856, + "step": 3299 + }, + { + "epoch": 0.1446826149688713, + "grad_norm": 0.796875, + "learning_rate": 3.769730808325895e-05, + "loss": 0.8994, + "step": 3300 + }, + { + "epoch": 0.14472645818552854, + "grad_norm": 0.80078125, + "learning_rate": 3.769370710148962e-05, + "loss": 0.8763, + "step": 3301 + }, + { + "epoch": 0.14477030140218577, + "grad_norm": 0.796875, + "learning_rate": 3.769010625177495e-05, + "loss": 0.8489, + "step": 3302 + }, + { + "epoch": 0.144814144618843, + "grad_norm": 0.765625, + "learning_rate": 3.768650553412253e-05, + "loss": 0.7402, + "step": 3303 + }, + { + "epoch": 0.14485798783550025, + "grad_norm": 0.89453125, + "learning_rate": 3.768290494854001e-05, + "loss": 0.8623, + "step": 3304 + }, + { + "epoch": 0.14490183105215748, + "grad_norm": 0.77734375, + "learning_rate": 3.7679304495035004e-05, + "loss": 0.7693, + "step": 3305 + }, + { + "epoch": 0.14494567426881472, + "grad_norm": 0.82421875, + "learning_rate": 3.7675704173615124e-05, + "loss": 0.8531, + "step": 3306 + }, + { + "epoch": 0.14498951748547195, + "grad_norm": 0.8671875, + "learning_rate": 3.767210398428805e-05, + "loss": 0.8417, + "step": 3307 + }, + { + "epoch": 0.1450333607021292, + "grad_norm": 0.8828125, + "learning_rate": 3.7668503927061394e-05, + "loss": 0.7395, + "step": 3308 + }, + { + "epoch": 0.14507720391878642, + "grad_norm": 0.765625, + "learning_rate": 3.766490400194278e-05, + "loss": 0.753, + "step": 3309 + }, + { + "epoch": 0.14512104713544366, + "grad_norm": 0.75, + "learning_rate": 3.766130420893984e-05, + "loss": 0.7669, + "step": 3310 + }, + { + "epoch": 0.1451648903521009, + "grad_norm": 0.703125, + "learning_rate": 3.765770454806017e-05, + "loss": 0.7003, + "step": 3311 + }, + { + "epoch": 0.14520873356875813, + "grad_norm": 0.75, + "learning_rate": 3.7654105019311466e-05, + "loss": 0.8547, + "step": 3312 + }, + { + "epoch": 0.14525257678541537, + "grad_norm": 0.83203125, + "learning_rate": 3.7650505622701315e-05, + "loss": 0.8343, + "step": 3313 + }, + { + "epoch": 0.1452964200020726, + "grad_norm": 0.82421875, + "learning_rate": 3.764690635823737e-05, + "loss": 0.7268, + "step": 3314 + }, + { + "epoch": 0.1453402632187298, + "grad_norm": 0.77734375, + "learning_rate": 3.7643307225927226e-05, + "loss": 0.8228, + "step": 3315 + }, + { + "epoch": 0.14538410643538704, + "grad_norm": 0.90625, + "learning_rate": 3.76397082257785e-05, + "loss": 0.9406, + "step": 3316 + }, + { + "epoch": 0.14542794965204428, + "grad_norm": 0.80859375, + "learning_rate": 3.763610935779889e-05, + "loss": 0.7845, + "step": 3317 + }, + { + "epoch": 0.14547179286870152, + "grad_norm": 0.83984375, + "learning_rate": 3.7632510621995975e-05, + "loss": 0.8549, + "step": 3318 + }, + { + "epoch": 0.14551563608535875, + "grad_norm": 0.80078125, + "learning_rate": 3.762891201837739e-05, + "loss": 0.6675, + "step": 3319 + }, + { + "epoch": 0.145559479302016, + "grad_norm": 0.7890625, + "learning_rate": 3.762531354695076e-05, + "loss": 0.703, + "step": 3320 + }, + { + "epoch": 0.14560332251867322, + "grad_norm": 0.8125, + "learning_rate": 3.762171520772367e-05, + "loss": 0.8715, + "step": 3321 + }, + { + "epoch": 0.14564716573533046, + "grad_norm": 0.875, + "learning_rate": 3.761811700070385e-05, + "loss": 0.8408, + "step": 3322 + }, + { + "epoch": 0.1456910089519877, + "grad_norm": 0.78125, + "learning_rate": 3.7614518925898836e-05, + "loss": 0.7795, + "step": 3323 + }, + { + "epoch": 0.14573485216864493, + "grad_norm": 0.84375, + "learning_rate": 3.76109209833163e-05, + "loss": 0.7748, + "step": 3324 + }, + { + "epoch": 0.14577869538530216, + "grad_norm": 0.83984375, + "learning_rate": 3.760732317296385e-05, + "loss": 0.7181, + "step": 3325 + }, + { + "epoch": 0.1458225386019594, + "grad_norm": 0.859375, + "learning_rate": 3.7603725494849105e-05, + "loss": 0.9035, + "step": 3326 + }, + { + "epoch": 0.14586638181861664, + "grad_norm": 0.83203125, + "learning_rate": 3.7600127948979705e-05, + "loss": 0.9527, + "step": 3327 + }, + { + "epoch": 0.14591022503527387, + "grad_norm": 0.76953125, + "learning_rate": 3.759653053536322e-05, + "loss": 0.8297, + "step": 3328 + }, + { + "epoch": 0.1459540682519311, + "grad_norm": 0.74609375, + "learning_rate": 3.759293325400737e-05, + "loss": 0.8228, + "step": 3329 + }, + { + "epoch": 0.14599791146858832, + "grad_norm": 0.78515625, + "learning_rate": 3.758933610491972e-05, + "loss": 0.7769, + "step": 3330 + }, + { + "epoch": 0.14604175468524555, + "grad_norm": 0.7890625, + "learning_rate": 3.758573908810792e-05, + "loss": 0.7188, + "step": 3331 + }, + { + "epoch": 0.1460855979019028, + "grad_norm": 0.7890625, + "learning_rate": 3.758214220357956e-05, + "loss": 0.8005, + "step": 3332 + }, + { + "epoch": 0.14612944111856002, + "grad_norm": 0.79296875, + "learning_rate": 3.7578545451342295e-05, + "loss": 0.8127, + "step": 3333 + }, + { + "epoch": 0.14617328433521726, + "grad_norm": 0.796875, + "learning_rate": 3.7574948831403736e-05, + "loss": 0.7714, + "step": 3334 + }, + { + "epoch": 0.1462171275518745, + "grad_norm": 0.78515625, + "learning_rate": 3.757135234377149e-05, + "loss": 0.7925, + "step": 3335 + }, + { + "epoch": 0.14626097076853173, + "grad_norm": 0.8671875, + "learning_rate": 3.756775598845318e-05, + "loss": 0.8523, + "step": 3336 + }, + { + "epoch": 0.14630481398518896, + "grad_norm": 0.89453125, + "learning_rate": 3.756415976545646e-05, + "loss": 0.8159, + "step": 3337 + }, + { + "epoch": 0.1463486572018462, + "grad_norm": 0.8203125, + "learning_rate": 3.756056367478895e-05, + "loss": 0.7371, + "step": 3338 + }, + { + "epoch": 0.14639250041850344, + "grad_norm": 0.8828125, + "learning_rate": 3.755696771645825e-05, + "loss": 0.824, + "step": 3339 + }, + { + "epoch": 0.14643634363516067, + "grad_norm": 0.82421875, + "learning_rate": 3.755337189047198e-05, + "loss": 0.8921, + "step": 3340 + }, + { + "epoch": 0.1464801868518179, + "grad_norm": 0.921875, + "learning_rate": 3.7549776196837736e-05, + "loss": 0.9609, + "step": 3341 + }, + { + "epoch": 0.14652403006847514, + "grad_norm": 0.81640625, + "learning_rate": 3.754618063556322e-05, + "loss": 0.8374, + "step": 3342 + }, + { + "epoch": 0.14656787328513238, + "grad_norm": 0.765625, + "learning_rate": 3.754258520665599e-05, + "loss": 0.8455, + "step": 3343 + }, + { + "epoch": 0.1466117165017896, + "grad_norm": 0.70703125, + "learning_rate": 3.753898991012369e-05, + "loss": 0.7027, + "step": 3344 + }, + { + "epoch": 0.14665555971844682, + "grad_norm": 0.81640625, + "learning_rate": 3.753539474597393e-05, + "loss": 0.7128, + "step": 3345 + }, + { + "epoch": 0.14669940293510406, + "grad_norm": 0.8515625, + "learning_rate": 3.753179971421429e-05, + "loss": 0.7951, + "step": 3346 + }, + { + "epoch": 0.1467432461517613, + "grad_norm": 0.83984375, + "learning_rate": 3.7528204814852473e-05, + "loss": 0.8928, + "step": 3347 + }, + { + "epoch": 0.14678708936841853, + "grad_norm": 0.77734375, + "learning_rate": 3.7524610047896044e-05, + "loss": 0.7222, + "step": 3348 + }, + { + "epoch": 0.14683093258507576, + "grad_norm": 0.8125, + "learning_rate": 3.7521015413352644e-05, + "loss": 0.797, + "step": 3349 + }, + { + "epoch": 0.146874775801733, + "grad_norm": 0.8671875, + "learning_rate": 3.7517420911229875e-05, + "loss": 0.8598, + "step": 3350 + }, + { + "epoch": 0.14691861901839023, + "grad_norm": 0.859375, + "learning_rate": 3.751382654153533e-05, + "loss": 0.9034, + "step": 3351 + }, + { + "epoch": 0.14696246223504747, + "grad_norm": 0.8125, + "learning_rate": 3.7510232304276684e-05, + "loss": 0.7403, + "step": 3352 + }, + { + "epoch": 0.1470063054517047, + "grad_norm": 0.7421875, + "learning_rate": 3.750663819946154e-05, + "loss": 0.7857, + "step": 3353 + }, + { + "epoch": 0.14705014866836194, + "grad_norm": 0.87890625, + "learning_rate": 3.750304422709749e-05, + "loss": 0.9466, + "step": 3354 + }, + { + "epoch": 0.14709399188501918, + "grad_norm": 0.859375, + "learning_rate": 3.749945038719218e-05, + "loss": 0.824, + "step": 3355 + }, + { + "epoch": 0.1471378351016764, + "grad_norm": 0.7890625, + "learning_rate": 3.749585667975317e-05, + "loss": 0.8446, + "step": 3356 + }, + { + "epoch": 0.14718167831833365, + "grad_norm": 0.81640625, + "learning_rate": 3.749226310478816e-05, + "loss": 0.7009, + "step": 3357 + }, + { + "epoch": 0.14722552153499088, + "grad_norm": 0.82421875, + "learning_rate": 3.748866966230472e-05, + "loss": 0.9158, + "step": 3358 + }, + { + "epoch": 0.14726936475164812, + "grad_norm": 0.78515625, + "learning_rate": 3.748507635231048e-05, + "loss": 0.739, + "step": 3359 + }, + { + "epoch": 0.14731320796830533, + "grad_norm": 1.2734375, + "learning_rate": 3.7481483174813035e-05, + "loss": 1.0197, + "step": 3360 + }, + { + "epoch": 0.14735705118496256, + "grad_norm": 0.8671875, + "learning_rate": 3.747789012981999e-05, + "loss": 0.7301, + "step": 3361 + }, + { + "epoch": 0.1474008944016198, + "grad_norm": 0.8125, + "learning_rate": 3.747429721733902e-05, + "loss": 0.7679, + "step": 3362 + }, + { + "epoch": 0.14744473761827703, + "grad_norm": 0.859375, + "learning_rate": 3.747070443737769e-05, + "loss": 0.832, + "step": 3363 + }, + { + "epoch": 0.14748858083493427, + "grad_norm": 0.796875, + "learning_rate": 3.7467111789943634e-05, + "loss": 0.7799, + "step": 3364 + }, + { + "epoch": 0.1475324240515915, + "grad_norm": 0.828125, + "learning_rate": 3.7463519275044456e-05, + "loss": 0.8065, + "step": 3365 + }, + { + "epoch": 0.14757626726824874, + "grad_norm": 0.7421875, + "learning_rate": 3.745992689268775e-05, + "loss": 0.9456, + "step": 3366 + }, + { + "epoch": 0.14762011048490598, + "grad_norm": 0.8359375, + "learning_rate": 3.745633464288118e-05, + "loss": 0.7345, + "step": 3367 + }, + { + "epoch": 0.1476639537015632, + "grad_norm": 0.84765625, + "learning_rate": 3.7452742525632336e-05, + "loss": 0.7843, + "step": 3368 + }, + { + "epoch": 0.14770779691822045, + "grad_norm": 0.99609375, + "learning_rate": 3.744915054094883e-05, + "loss": 0.9025, + "step": 3369 + }, + { + "epoch": 0.14775164013487768, + "grad_norm": 0.7734375, + "learning_rate": 3.744555868883828e-05, + "loss": 0.8425, + "step": 3370 + }, + { + "epoch": 0.14779548335153492, + "grad_norm": 0.80859375, + "learning_rate": 3.7441966969308284e-05, + "loss": 0.9224, + "step": 3371 + }, + { + "epoch": 0.14783932656819215, + "grad_norm": 0.75390625, + "learning_rate": 3.743837538236646e-05, + "loss": 0.7085, + "step": 3372 + }, + { + "epoch": 0.1478831697848494, + "grad_norm": 0.93359375, + "learning_rate": 3.7434783928020433e-05, + "loss": 1.0076, + "step": 3373 + }, + { + "epoch": 0.14792701300150662, + "grad_norm": 0.83203125, + "learning_rate": 3.743119260627781e-05, + "loss": 0.8498, + "step": 3374 + }, + { + "epoch": 0.14797085621816386, + "grad_norm": 0.82421875, + "learning_rate": 3.7427601417146186e-05, + "loss": 0.7761, + "step": 3375 + }, + { + "epoch": 0.14801469943482107, + "grad_norm": 0.94921875, + "learning_rate": 3.7424010360633154e-05, + "loss": 0.9118, + "step": 3376 + }, + { + "epoch": 0.1480585426514783, + "grad_norm": 0.85546875, + "learning_rate": 3.7420419436746393e-05, + "loss": 0.8917, + "step": 3377 + }, + { + "epoch": 0.14810238586813554, + "grad_norm": 0.8515625, + "learning_rate": 3.741682864549347e-05, + "loss": 0.7954, + "step": 3378 + }, + { + "epoch": 0.14814622908479277, + "grad_norm": 0.77734375, + "learning_rate": 3.741323798688201e-05, + "loss": 0.7492, + "step": 3379 + }, + { + "epoch": 0.14819007230145, + "grad_norm": 0.8046875, + "learning_rate": 3.74096474609196e-05, + "loss": 0.7829, + "step": 3380 + }, + { + "epoch": 0.14823391551810725, + "grad_norm": 0.8046875, + "learning_rate": 3.740605706761384e-05, + "loss": 0.712, + "step": 3381 + }, + { + "epoch": 0.14827775873476448, + "grad_norm": 0.765625, + "learning_rate": 3.74024668069724e-05, + "loss": 0.8612, + "step": 3382 + }, + { + "epoch": 0.14832160195142172, + "grad_norm": 0.7578125, + "learning_rate": 3.739887667900286e-05, + "loss": 0.7442, + "step": 3383 + }, + { + "epoch": 0.14836544516807895, + "grad_norm": 0.8359375, + "learning_rate": 3.739528668371282e-05, + "loss": 0.75, + "step": 3384 + }, + { + "epoch": 0.1484092883847362, + "grad_norm": 0.85546875, + "learning_rate": 3.7391696821109875e-05, + "loss": 0.7979, + "step": 3385 + }, + { + "epoch": 0.14845313160139342, + "grad_norm": 0.828125, + "learning_rate": 3.738810709120163e-05, + "loss": 0.7536, + "step": 3386 + }, + { + "epoch": 0.14849697481805066, + "grad_norm": 0.8046875, + "learning_rate": 3.7384517493995755e-05, + "loss": 0.8706, + "step": 3387 + }, + { + "epoch": 0.1485408180347079, + "grad_norm": 0.8203125, + "learning_rate": 3.7380928029499796e-05, + "loss": 0.8358, + "step": 3388 + }, + { + "epoch": 0.14858466125136513, + "grad_norm": 0.90625, + "learning_rate": 3.73773386977214e-05, + "loss": 0.9202, + "step": 3389 + }, + { + "epoch": 0.14862850446802237, + "grad_norm": 0.80078125, + "learning_rate": 3.737374949866814e-05, + "loss": 0.8376, + "step": 3390 + }, + { + "epoch": 0.14867234768467957, + "grad_norm": 0.82421875, + "learning_rate": 3.737016043234761e-05, + "loss": 0.831, + "step": 3391 + }, + { + "epoch": 0.1487161909013368, + "grad_norm": 0.796875, + "learning_rate": 3.736657149876749e-05, + "loss": 0.9397, + "step": 3392 + }, + { + "epoch": 0.14876003411799404, + "grad_norm": 0.94140625, + "learning_rate": 3.736298269793532e-05, + "loss": 0.9355, + "step": 3393 + }, + { + "epoch": 0.14880387733465128, + "grad_norm": 0.80859375, + "learning_rate": 3.735939402985875e-05, + "loss": 0.8562, + "step": 3394 + }, + { + "epoch": 0.14884772055130852, + "grad_norm": 0.94140625, + "learning_rate": 3.735580549454535e-05, + "loss": 0.8104, + "step": 3395 + }, + { + "epoch": 0.14889156376796575, + "grad_norm": 0.8046875, + "learning_rate": 3.7352217092002704e-05, + "loss": 0.6885, + "step": 3396 + }, + { + "epoch": 0.148935406984623, + "grad_norm": 0.8125, + "learning_rate": 3.73486288222385e-05, + "loss": 0.8061, + "step": 3397 + }, + { + "epoch": 0.14897925020128022, + "grad_norm": 0.79296875, + "learning_rate": 3.734504068526029e-05, + "loss": 0.7754, + "step": 3398 + }, + { + "epoch": 0.14902309341793746, + "grad_norm": 0.77734375, + "learning_rate": 3.734145268107568e-05, + "loss": 0.6591, + "step": 3399 + }, + { + "epoch": 0.1490669366345947, + "grad_norm": 0.859375, + "learning_rate": 3.733786480969229e-05, + "loss": 0.8612, + "step": 3400 + }, + { + "epoch": 0.14911077985125193, + "grad_norm": 0.77734375, + "learning_rate": 3.7334277071117675e-05, + "loss": 0.7053, + "step": 3401 + }, + { + "epoch": 0.14915462306790916, + "grad_norm": 0.82421875, + "learning_rate": 3.73306894653595e-05, + "loss": 0.8391, + "step": 3402 + }, + { + "epoch": 0.1491984662845664, + "grad_norm": 0.8671875, + "learning_rate": 3.732710199242536e-05, + "loss": 0.8395, + "step": 3403 + }, + { + "epoch": 0.14924230950122364, + "grad_norm": 0.81640625, + "learning_rate": 3.732351465232284e-05, + "loss": 0.9342, + "step": 3404 + }, + { + "epoch": 0.14928615271788087, + "grad_norm": 0.7578125, + "learning_rate": 3.731992744505955e-05, + "loss": 0.7577, + "step": 3405 + }, + { + "epoch": 0.14932999593453808, + "grad_norm": 0.890625, + "learning_rate": 3.73163403706431e-05, + "loss": 0.8588, + "step": 3406 + }, + { + "epoch": 0.14937383915119531, + "grad_norm": 0.8515625, + "learning_rate": 3.731275342908104e-05, + "loss": 0.8948, + "step": 3407 + }, + { + "epoch": 0.14941768236785255, + "grad_norm": 0.8203125, + "learning_rate": 3.730916662038105e-05, + "loss": 0.7805, + "step": 3408 + }, + { + "epoch": 0.14946152558450979, + "grad_norm": 0.7421875, + "learning_rate": 3.7305579944550705e-05, + "loss": 0.6961, + "step": 3409 + }, + { + "epoch": 0.14950536880116702, + "grad_norm": 0.8359375, + "learning_rate": 3.730199340159759e-05, + "loss": 0.7992, + "step": 3410 + }, + { + "epoch": 0.14954921201782426, + "grad_norm": 0.80859375, + "learning_rate": 3.729840699152932e-05, + "loss": 0.7964, + "step": 3411 + }, + { + "epoch": 0.1495930552344815, + "grad_norm": 0.82421875, + "learning_rate": 3.72948207143535e-05, + "loss": 0.7792, + "step": 3412 + }, + { + "epoch": 0.14963689845113873, + "grad_norm": 0.80078125, + "learning_rate": 3.7291234570077716e-05, + "loss": 0.7581, + "step": 3413 + }, + { + "epoch": 0.14968074166779596, + "grad_norm": 0.90625, + "learning_rate": 3.728764855870954e-05, + "loss": 0.7538, + "step": 3414 + }, + { + "epoch": 0.1497245848844532, + "grad_norm": 0.8984375, + "learning_rate": 3.7284062680256646e-05, + "loss": 0.8689, + "step": 3415 + }, + { + "epoch": 0.14976842810111043, + "grad_norm": 0.984375, + "learning_rate": 3.728047693472659e-05, + "loss": 0.9184, + "step": 3416 + }, + { + "epoch": 0.14981227131776767, + "grad_norm": 0.890625, + "learning_rate": 3.727689132212699e-05, + "loss": 0.7764, + "step": 3417 + }, + { + "epoch": 0.1498561145344249, + "grad_norm": 0.8359375, + "learning_rate": 3.7273305842465435e-05, + "loss": 0.8131, + "step": 3418 + }, + { + "epoch": 0.14989995775108214, + "grad_norm": 0.81640625, + "learning_rate": 3.7269720495749513e-05, + "loss": 0.9139, + "step": 3419 + }, + { + "epoch": 0.14994380096773938, + "grad_norm": 0.8515625, + "learning_rate": 3.726613528198684e-05, + "loss": 0.8563, + "step": 3420 + }, + { + "epoch": 0.14998764418439658, + "grad_norm": 0.86328125, + "learning_rate": 3.7262550201185e-05, + "loss": 0.8681, + "step": 3421 + }, + { + "epoch": 0.15003148740105382, + "grad_norm": 0.703125, + "learning_rate": 3.725896525335156e-05, + "loss": 0.7231, + "step": 3422 + }, + { + "epoch": 0.15007533061771106, + "grad_norm": 0.85546875, + "learning_rate": 3.7255380438494206e-05, + "loss": 0.7122, + "step": 3423 + }, + { + "epoch": 0.1501191738343683, + "grad_norm": 0.83984375, + "learning_rate": 3.7251795756620475e-05, + "loss": 0.829, + "step": 3424 + }, + { + "epoch": 0.15016301705102553, + "grad_norm": 0.8046875, + "learning_rate": 3.724821120773799e-05, + "loss": 0.806, + "step": 3425 + }, + { + "epoch": 0.15020686026768276, + "grad_norm": 0.8515625, + "learning_rate": 3.724462679185433e-05, + "loss": 0.7114, + "step": 3426 + }, + { + "epoch": 0.15025070348434, + "grad_norm": 0.83984375, + "learning_rate": 3.7241042508977056e-05, + "loss": 0.7778, + "step": 3427 + }, + { + "epoch": 0.15029454670099723, + "grad_norm": 0.828125, + "learning_rate": 3.7237458359113844e-05, + "loss": 0.854, + "step": 3428 + }, + { + "epoch": 0.15033838991765447, + "grad_norm": 0.89453125, + "learning_rate": 3.723387434227224e-05, + "loss": 0.9329, + "step": 3429 + }, + { + "epoch": 0.1503822331343117, + "grad_norm": 0.80859375, + "learning_rate": 3.723029045845987e-05, + "loss": 0.7629, + "step": 3430 + }, + { + "epoch": 0.15042607635096894, + "grad_norm": 0.828125, + "learning_rate": 3.72267067076843e-05, + "loss": 0.8371, + "step": 3431 + }, + { + "epoch": 0.15046991956762618, + "grad_norm": 0.78515625, + "learning_rate": 3.7223123089953106e-05, + "loss": 0.7632, + "step": 3432 + }, + { + "epoch": 0.1505137627842834, + "grad_norm": 0.82421875, + "learning_rate": 3.7219539605273954e-05, + "loss": 0.9709, + "step": 3433 + }, + { + "epoch": 0.15055760600094065, + "grad_norm": 0.84765625, + "learning_rate": 3.7215956253654394e-05, + "loss": 0.7729, + "step": 3434 + }, + { + "epoch": 0.15060144921759788, + "grad_norm": 0.73046875, + "learning_rate": 3.7212373035102024e-05, + "loss": 0.7629, + "step": 3435 + }, + { + "epoch": 0.1506452924342551, + "grad_norm": 0.77734375, + "learning_rate": 3.7208789949624445e-05, + "loss": 0.7395, + "step": 3436 + }, + { + "epoch": 0.15068913565091233, + "grad_norm": 0.8359375, + "learning_rate": 3.720520699722921e-05, + "loss": 0.7768, + "step": 3437 + }, + { + "epoch": 0.15073297886756956, + "grad_norm": 0.8046875, + "learning_rate": 3.7201624177923986e-05, + "loss": 0.8277, + "step": 3438 + }, + { + "epoch": 0.1507768220842268, + "grad_norm": 0.7734375, + "learning_rate": 3.719804149171633e-05, + "loss": 0.8124, + "step": 3439 + }, + { + "epoch": 0.15082066530088403, + "grad_norm": 0.8671875, + "learning_rate": 3.719445893861383e-05, + "loss": 0.9228, + "step": 3440 + }, + { + "epoch": 0.15086450851754127, + "grad_norm": 0.80859375, + "learning_rate": 3.7190876518624095e-05, + "loss": 0.6798, + "step": 3441 + }, + { + "epoch": 0.1509083517341985, + "grad_norm": 0.8203125, + "learning_rate": 3.7187294231754664e-05, + "loss": 0.83, + "step": 3442 + }, + { + "epoch": 0.15095219495085574, + "grad_norm": 0.79296875, + "learning_rate": 3.718371207801321e-05, + "loss": 0.7762, + "step": 3443 + }, + { + "epoch": 0.15099603816751297, + "grad_norm": 0.90234375, + "learning_rate": 3.7180130057407294e-05, + "loss": 0.7779, + "step": 3444 + }, + { + "epoch": 0.1510398813841702, + "grad_norm": 0.9140625, + "learning_rate": 3.717654816994449e-05, + "loss": 0.8155, + "step": 3445 + }, + { + "epoch": 0.15108372460082745, + "grad_norm": 0.9140625, + "learning_rate": 3.7172966415632404e-05, + "loss": 0.749, + "step": 3446 + }, + { + "epoch": 0.15112756781748468, + "grad_norm": 0.8046875, + "learning_rate": 3.716938479447859e-05, + "loss": 0.7411, + "step": 3447 + }, + { + "epoch": 0.15117141103414192, + "grad_norm": 0.8203125, + "learning_rate": 3.716580330649071e-05, + "loss": 0.7953, + "step": 3448 + }, + { + "epoch": 0.15121525425079915, + "grad_norm": 0.76953125, + "learning_rate": 3.7162221951676325e-05, + "loss": 0.7003, + "step": 3449 + }, + { + "epoch": 0.1512590974674564, + "grad_norm": 0.84375, + "learning_rate": 3.7158640730043016e-05, + "loss": 0.6843, + "step": 3450 + }, + { + "epoch": 0.1513029406841136, + "grad_norm": 0.8125, + "learning_rate": 3.715505964159837e-05, + "loss": 0.794, + "step": 3451 + }, + { + "epoch": 0.15134678390077083, + "grad_norm": 0.73828125, + "learning_rate": 3.715147868634995e-05, + "loss": 0.6398, + "step": 3452 + }, + { + "epoch": 0.15139062711742807, + "grad_norm": 0.83984375, + "learning_rate": 3.7147897864305415e-05, + "loss": 0.8641, + "step": 3453 + }, + { + "epoch": 0.1514344703340853, + "grad_norm": 0.78515625, + "learning_rate": 3.7144317175472324e-05, + "loss": 0.7535, + "step": 3454 + }, + { + "epoch": 0.15147831355074254, + "grad_norm": 0.7890625, + "learning_rate": 3.714073661985825e-05, + "loss": 0.7911, + "step": 3455 + }, + { + "epoch": 0.15152215676739977, + "grad_norm": 0.8671875, + "learning_rate": 3.7137156197470804e-05, + "loss": 0.9806, + "step": 3456 + }, + { + "epoch": 0.151565999984057, + "grad_norm": 0.93359375, + "learning_rate": 3.7133575908317554e-05, + "loss": 0.8751, + "step": 3457 + }, + { + "epoch": 0.15160984320071424, + "grad_norm": 0.8125, + "learning_rate": 3.712999575240609e-05, + "loss": 0.8787, + "step": 3458 + }, + { + "epoch": 0.15165368641737148, + "grad_norm": 0.76171875, + "learning_rate": 3.7126415729744016e-05, + "loss": 0.8098, + "step": 3459 + }, + { + "epoch": 0.15169752963402872, + "grad_norm": 0.76953125, + "learning_rate": 3.712283584033891e-05, + "loss": 0.7346, + "step": 3460 + }, + { + "epoch": 0.15174137285068595, + "grad_norm": 0.80859375, + "learning_rate": 3.711925608419835e-05, + "loss": 0.8434, + "step": 3461 + }, + { + "epoch": 0.1517852160673432, + "grad_norm": 0.9609375, + "learning_rate": 3.7115676461329904e-05, + "loss": 0.7765, + "step": 3462 + }, + { + "epoch": 0.15182905928400042, + "grad_norm": 0.79296875, + "learning_rate": 3.711209697174123e-05, + "loss": 0.8767, + "step": 3463 + }, + { + "epoch": 0.15187290250065766, + "grad_norm": 0.78515625, + "learning_rate": 3.710851761543986e-05, + "loss": 0.7562, + "step": 3464 + }, + { + "epoch": 0.1519167457173149, + "grad_norm": 0.95703125, + "learning_rate": 3.710493839243341e-05, + "loss": 0.8355, + "step": 3465 + }, + { + "epoch": 0.15196058893397213, + "grad_norm": 0.859375, + "learning_rate": 3.710135930272943e-05, + "loss": 0.859, + "step": 3466 + }, + { + "epoch": 0.15200443215062934, + "grad_norm": 0.79296875, + "learning_rate": 3.70977803463355e-05, + "loss": 0.7938, + "step": 3467 + }, + { + "epoch": 0.15204827536728657, + "grad_norm": 0.859375, + "learning_rate": 3.7094201523259256e-05, + "loss": 0.8299, + "step": 3468 + }, + { + "epoch": 0.1520921185839438, + "grad_norm": 0.8359375, + "learning_rate": 3.709062283350826e-05, + "loss": 0.8904, + "step": 3469 + }, + { + "epoch": 0.15213596180060104, + "grad_norm": 0.81640625, + "learning_rate": 3.708704427709009e-05, + "loss": 0.8176, + "step": 3470 + }, + { + "epoch": 0.15217980501725828, + "grad_norm": 0.83984375, + "learning_rate": 3.708346585401233e-05, + "loss": 0.7666, + "step": 3471 + }, + { + "epoch": 0.15222364823391552, + "grad_norm": 1.0234375, + "learning_rate": 3.707988756428254e-05, + "loss": 0.9057, + "step": 3472 + }, + { + "epoch": 0.15226749145057275, + "grad_norm": 0.859375, + "learning_rate": 3.7076309407908374e-05, + "loss": 0.916, + "step": 3473 + }, + { + "epoch": 0.15231133466723, + "grad_norm": 0.7890625, + "learning_rate": 3.707273138489736e-05, + "loss": 0.766, + "step": 3474 + }, + { + "epoch": 0.15235517788388722, + "grad_norm": 1.328125, + "learning_rate": 3.70691534952571e-05, + "loss": 0.8705, + "step": 3475 + }, + { + "epoch": 0.15239902110054446, + "grad_norm": 0.78515625, + "learning_rate": 3.706557573899519e-05, + "loss": 0.7761, + "step": 3476 + }, + { + "epoch": 0.1524428643172017, + "grad_norm": 0.765625, + "learning_rate": 3.706199811611914e-05, + "loss": 0.7146, + "step": 3477 + }, + { + "epoch": 0.15248670753385893, + "grad_norm": 0.8515625, + "learning_rate": 3.705842062663661e-05, + "loss": 0.8516, + "step": 3478 + }, + { + "epoch": 0.15253055075051616, + "grad_norm": 0.87109375, + "learning_rate": 3.7054843270555185e-05, + "loss": 0.9491, + "step": 3479 + }, + { + "epoch": 0.1525743939671734, + "grad_norm": 0.9140625, + "learning_rate": 3.7051266047882416e-05, + "loss": 0.8796, + "step": 3480 + }, + { + "epoch": 0.15261823718383064, + "grad_norm": 0.8125, + "learning_rate": 3.704768895862588e-05, + "loss": 0.7414, + "step": 3481 + }, + { + "epoch": 0.15266208040048784, + "grad_norm": 0.83984375, + "learning_rate": 3.704411200279314e-05, + "loss": 0.7574, + "step": 3482 + }, + { + "epoch": 0.15270592361714508, + "grad_norm": 0.90234375, + "learning_rate": 3.704053518039184e-05, + "loss": 0.9041, + "step": 3483 + }, + { + "epoch": 0.15274976683380231, + "grad_norm": 0.875, + "learning_rate": 3.703695849142954e-05, + "loss": 0.9638, + "step": 3484 + }, + { + "epoch": 0.15279361005045955, + "grad_norm": 0.76171875, + "learning_rate": 3.703338193591379e-05, + "loss": 0.8276, + "step": 3485 + }, + { + "epoch": 0.15283745326711679, + "grad_norm": 0.8671875, + "learning_rate": 3.70298055138522e-05, + "loss": 0.7311, + "step": 3486 + }, + { + "epoch": 0.15288129648377402, + "grad_norm": 0.875, + "learning_rate": 3.70262292252523e-05, + "loss": 0.8034, + "step": 3487 + }, + { + "epoch": 0.15292513970043126, + "grad_norm": 0.8125, + "learning_rate": 3.7022653070121736e-05, + "loss": 0.9333, + "step": 3488 + }, + { + "epoch": 0.1529689829170885, + "grad_norm": 0.82421875, + "learning_rate": 3.7019077048468075e-05, + "loss": 0.7477, + "step": 3489 + }, + { + "epoch": 0.15301282613374573, + "grad_norm": 0.8203125, + "learning_rate": 3.701550116029887e-05, + "loss": 0.8083, + "step": 3490 + }, + { + "epoch": 0.15305666935040296, + "grad_norm": 0.8359375, + "learning_rate": 3.701192540562171e-05, + "loss": 0.8175, + "step": 3491 + }, + { + "epoch": 0.1531005125670602, + "grad_norm": 0.80859375, + "learning_rate": 3.700834978444414e-05, + "loss": 0.7548, + "step": 3492 + }, + { + "epoch": 0.15314435578371743, + "grad_norm": 0.82421875, + "learning_rate": 3.7004774296773806e-05, + "loss": 0.785, + "step": 3493 + }, + { + "epoch": 0.15318819900037467, + "grad_norm": 0.765625, + "learning_rate": 3.7001198942618246e-05, + "loss": 0.7462, + "step": 3494 + }, + { + "epoch": 0.1532320422170319, + "grad_norm": 0.8515625, + "learning_rate": 3.6997623721985056e-05, + "loss": 0.8551, + "step": 3495 + }, + { + "epoch": 0.15327588543368914, + "grad_norm": 0.77734375, + "learning_rate": 3.699404863488179e-05, + "loss": 0.7043, + "step": 3496 + }, + { + "epoch": 0.15331972865034635, + "grad_norm": 0.84375, + "learning_rate": 3.699047368131604e-05, + "loss": 0.8569, + "step": 3497 + }, + { + "epoch": 0.15336357186700358, + "grad_norm": 0.83203125, + "learning_rate": 3.698689886129538e-05, + "loss": 0.896, + "step": 3498 + }, + { + "epoch": 0.15340741508366082, + "grad_norm": 1.03125, + "learning_rate": 3.698332417482736e-05, + "loss": 0.913, + "step": 3499 + }, + { + "epoch": 0.15345125830031806, + "grad_norm": 0.828125, + "learning_rate": 3.697974962191961e-05, + "loss": 0.9201, + "step": 3500 + }, + { + "epoch": 0.1534951015169753, + "grad_norm": 0.82421875, + "learning_rate": 3.697617520257968e-05, + "loss": 0.7961, + "step": 3501 + }, + { + "epoch": 0.15353894473363253, + "grad_norm": 0.81640625, + "learning_rate": 3.6972600916815146e-05, + "loss": 0.7776, + "step": 3502 + }, + { + "epoch": 0.15358278795028976, + "grad_norm": 0.97265625, + "learning_rate": 3.6969026764633585e-05, + "loss": 0.8997, + "step": 3503 + }, + { + "epoch": 0.153626631166947, + "grad_norm": 0.8359375, + "learning_rate": 3.696545274604256e-05, + "loss": 0.8905, + "step": 3504 + }, + { + "epoch": 0.15367047438360423, + "grad_norm": 0.75, + "learning_rate": 3.696187886104966e-05, + "loss": 0.8161, + "step": 3505 + }, + { + "epoch": 0.15371431760026147, + "grad_norm": 0.9765625, + "learning_rate": 3.695830510966245e-05, + "loss": 0.8369, + "step": 3506 + }, + { + "epoch": 0.1537581608169187, + "grad_norm": 0.87109375, + "learning_rate": 3.6954731491888484e-05, + "loss": 0.964, + "step": 3507 + }, + { + "epoch": 0.15380200403357594, + "grad_norm": 0.8203125, + "learning_rate": 3.695115800773539e-05, + "loss": 0.8276, + "step": 3508 + }, + { + "epoch": 0.15384584725023318, + "grad_norm": 0.83203125, + "learning_rate": 3.694758465721071e-05, + "loss": 0.7393, + "step": 3509 + }, + { + "epoch": 0.1538896904668904, + "grad_norm": 0.7734375, + "learning_rate": 3.694401144032202e-05, + "loss": 0.8683, + "step": 3510 + }, + { + "epoch": 0.15393353368354765, + "grad_norm": 0.85546875, + "learning_rate": 3.6940438357076904e-05, + "loss": 0.8194, + "step": 3511 + }, + { + "epoch": 0.15397737690020485, + "grad_norm": 0.95703125, + "learning_rate": 3.693686540748288e-05, + "loss": 0.8239, + "step": 3512 + }, + { + "epoch": 0.1540212201168621, + "grad_norm": 0.7578125, + "learning_rate": 3.6933292591547596e-05, + "loss": 0.6751, + "step": 3513 + }, + { + "epoch": 0.15406506333351933, + "grad_norm": 0.73828125, + "learning_rate": 3.6929719909278605e-05, + "loss": 0.7652, + "step": 3514 + }, + { + "epoch": 0.15410890655017656, + "grad_norm": 0.86328125, + "learning_rate": 3.692614736068346e-05, + "loss": 0.7108, + "step": 3515 + }, + { + "epoch": 0.1541527497668338, + "grad_norm": 0.78515625, + "learning_rate": 3.692257494576975e-05, + "loss": 0.7946, + "step": 3516 + }, + { + "epoch": 0.15419659298349103, + "grad_norm": 0.7734375, + "learning_rate": 3.6919002664544987e-05, + "loss": 0.8246, + "step": 3517 + }, + { + "epoch": 0.15424043620014827, + "grad_norm": 0.83984375, + "learning_rate": 3.691543051701683e-05, + "loss": 0.9075, + "step": 3518 + }, + { + "epoch": 0.1542842794168055, + "grad_norm": 0.78125, + "learning_rate": 3.691185850319283e-05, + "loss": 0.7613, + "step": 3519 + }, + { + "epoch": 0.15432812263346274, + "grad_norm": 0.92578125, + "learning_rate": 3.690828662308052e-05, + "loss": 0.7938, + "step": 3520 + }, + { + "epoch": 0.15437196585011997, + "grad_norm": 0.8046875, + "learning_rate": 3.690471487668751e-05, + "loss": 0.8555, + "step": 3521 + }, + { + "epoch": 0.1544158090667772, + "grad_norm": 0.87109375, + "learning_rate": 3.69011432640213e-05, + "loss": 0.8056, + "step": 3522 + }, + { + "epoch": 0.15445965228343445, + "grad_norm": 0.71484375, + "learning_rate": 3.689757178508955e-05, + "loss": 0.7386, + "step": 3523 + }, + { + "epoch": 0.15450349550009168, + "grad_norm": 0.84375, + "learning_rate": 3.689400043989979e-05, + "loss": 0.9594, + "step": 3524 + }, + { + "epoch": 0.15454733871674892, + "grad_norm": 0.79296875, + "learning_rate": 3.6890429228459586e-05, + "loss": 0.8642, + "step": 3525 + }, + { + "epoch": 0.15459118193340615, + "grad_norm": 0.859375, + "learning_rate": 3.688685815077651e-05, + "loss": 0.9066, + "step": 3526 + }, + { + "epoch": 0.15463502515006336, + "grad_norm": 0.8828125, + "learning_rate": 3.6883287206858094e-05, + "loss": 0.8405, + "step": 3527 + }, + { + "epoch": 0.1546788683667206, + "grad_norm": 0.91796875, + "learning_rate": 3.6879716396711996e-05, + "loss": 0.7751, + "step": 3528 + }, + { + "epoch": 0.15472271158337783, + "grad_norm": 0.7890625, + "learning_rate": 3.687614572034571e-05, + "loss": 0.7567, + "step": 3529 + }, + { + "epoch": 0.15476655480003507, + "grad_norm": 0.8515625, + "learning_rate": 3.687257517776683e-05, + "loss": 0.7956, + "step": 3530 + }, + { + "epoch": 0.1548103980166923, + "grad_norm": 0.87109375, + "learning_rate": 3.686900476898293e-05, + "loss": 0.863, + "step": 3531 + }, + { + "epoch": 0.15485424123334954, + "grad_norm": 0.79296875, + "learning_rate": 3.6865434494001515e-05, + "loss": 0.7118, + "step": 3532 + }, + { + "epoch": 0.15489808445000677, + "grad_norm": 0.80078125, + "learning_rate": 3.686186435283024e-05, + "loss": 0.7119, + "step": 3533 + }, + { + "epoch": 0.154941927666664, + "grad_norm": 0.79296875, + "learning_rate": 3.6858294345476654e-05, + "loss": 0.8732, + "step": 3534 + }, + { + "epoch": 0.15498577088332124, + "grad_norm": 0.82421875, + "learning_rate": 3.685472447194829e-05, + "loss": 0.8483, + "step": 3535 + }, + { + "epoch": 0.15502961409997848, + "grad_norm": 0.80078125, + "learning_rate": 3.6851154732252723e-05, + "loss": 0.8155, + "step": 3536 + }, + { + "epoch": 0.15507345731663572, + "grad_norm": 0.82421875, + "learning_rate": 3.68475851263975e-05, + "loss": 0.7443, + "step": 3537 + }, + { + "epoch": 0.15511730053329295, + "grad_norm": 0.85546875, + "learning_rate": 3.684401565439024e-05, + "loss": 0.9014, + "step": 3538 + }, + { + "epoch": 0.1551611437499502, + "grad_norm": 0.87109375, + "learning_rate": 3.6840446316238477e-05, + "loss": 0.7847, + "step": 3539 + }, + { + "epoch": 0.15520498696660742, + "grad_norm": 0.85546875, + "learning_rate": 3.683687711194977e-05, + "loss": 0.8259, + "step": 3540 + }, + { + "epoch": 0.15524883018326466, + "grad_norm": 0.80078125, + "learning_rate": 3.683330804153171e-05, + "loss": 0.7675, + "step": 3541 + }, + { + "epoch": 0.15529267339992187, + "grad_norm": 0.79296875, + "learning_rate": 3.682973910499183e-05, + "loss": 0.8743, + "step": 3542 + }, + { + "epoch": 0.1553365166165791, + "grad_norm": 0.8125, + "learning_rate": 3.682617030233771e-05, + "loss": 0.7484, + "step": 3543 + }, + { + "epoch": 0.15538035983323634, + "grad_norm": 0.8125, + "learning_rate": 3.682260163357692e-05, + "loss": 0.7429, + "step": 3544 + }, + { + "epoch": 0.15542420304989357, + "grad_norm": 0.83203125, + "learning_rate": 3.681903309871701e-05, + "loss": 0.9007, + "step": 3545 + }, + { + "epoch": 0.1554680462665508, + "grad_norm": 0.828125, + "learning_rate": 3.6815464697765544e-05, + "loss": 0.7607, + "step": 3546 + }, + { + "epoch": 0.15551188948320804, + "grad_norm": 0.796875, + "learning_rate": 3.6811896430730065e-05, + "loss": 0.9296, + "step": 3547 + }, + { + "epoch": 0.15555573269986528, + "grad_norm": 0.73046875, + "learning_rate": 3.680832829761819e-05, + "loss": 0.8175, + "step": 3548 + }, + { + "epoch": 0.15559957591652251, + "grad_norm": 0.87890625, + "learning_rate": 3.6804760298437444e-05, + "loss": 0.8138, + "step": 3549 + }, + { + "epoch": 0.15564341913317975, + "grad_norm": 0.828125, + "learning_rate": 3.68011924331954e-05, + "loss": 0.8077, + "step": 3550 + }, + { + "epoch": 0.15568726234983699, + "grad_norm": 0.80859375, + "learning_rate": 3.679762470189961e-05, + "loss": 0.8433, + "step": 3551 + }, + { + "epoch": 0.15573110556649422, + "grad_norm": 0.80859375, + "learning_rate": 3.6794057104557655e-05, + "loss": 0.7572, + "step": 3552 + }, + { + "epoch": 0.15577494878315146, + "grad_norm": 0.8203125, + "learning_rate": 3.679048964117704e-05, + "loss": 0.7779, + "step": 3553 + }, + { + "epoch": 0.1558187919998087, + "grad_norm": 0.83984375, + "learning_rate": 3.6786922311765416e-05, + "loss": 0.9579, + "step": 3554 + }, + { + "epoch": 0.15586263521646593, + "grad_norm": 0.85546875, + "learning_rate": 3.678335511633027e-05, + "loss": 0.871, + "step": 3555 + }, + { + "epoch": 0.15590647843312316, + "grad_norm": 0.83984375, + "learning_rate": 3.6779788054879214e-05, + "loss": 0.8957, + "step": 3556 + }, + { + "epoch": 0.1559503216497804, + "grad_norm": 0.83203125, + "learning_rate": 3.677622112741978e-05, + "loss": 0.7464, + "step": 3557 + }, + { + "epoch": 0.1559941648664376, + "grad_norm": 0.875, + "learning_rate": 3.67726543339595e-05, + "loss": 0.7822, + "step": 3558 + }, + { + "epoch": 0.15603800808309484, + "grad_norm": 0.88671875, + "learning_rate": 3.676908767450599e-05, + "loss": 0.8764, + "step": 3559 + }, + { + "epoch": 0.15608185129975208, + "grad_norm": 0.9921875, + "learning_rate": 3.6765521149066784e-05, + "loss": 0.8707, + "step": 3560 + }, + { + "epoch": 0.1561256945164093, + "grad_norm": 0.7890625, + "learning_rate": 3.6761954757649456e-05, + "loss": 0.9054, + "step": 3561 + }, + { + "epoch": 0.15616953773306655, + "grad_norm": 0.87890625, + "learning_rate": 3.675838850026153e-05, + "loss": 0.789, + "step": 3562 + }, + { + "epoch": 0.15621338094972378, + "grad_norm": 0.81640625, + "learning_rate": 3.675482237691057e-05, + "loss": 0.8662, + "step": 3563 + }, + { + "epoch": 0.15625722416638102, + "grad_norm": 0.76953125, + "learning_rate": 3.675125638760417e-05, + "loss": 0.8377, + "step": 3564 + }, + { + "epoch": 0.15630106738303826, + "grad_norm": 0.83203125, + "learning_rate": 3.674769053234987e-05, + "loss": 0.9134, + "step": 3565 + }, + { + "epoch": 0.1563449105996955, + "grad_norm": 0.8359375, + "learning_rate": 3.6744124811155225e-05, + "loss": 0.888, + "step": 3566 + }, + { + "epoch": 0.15638875381635273, + "grad_norm": 0.96875, + "learning_rate": 3.674055922402779e-05, + "loss": 0.7713, + "step": 3567 + }, + { + "epoch": 0.15643259703300996, + "grad_norm": 0.875, + "learning_rate": 3.673699377097509e-05, + "loss": 0.7931, + "step": 3568 + }, + { + "epoch": 0.1564764402496672, + "grad_norm": 0.84765625, + "learning_rate": 3.673342845200476e-05, + "loss": 0.935, + "step": 3569 + }, + { + "epoch": 0.15652028346632443, + "grad_norm": 0.828125, + "learning_rate": 3.6729863267124296e-05, + "loss": 0.8677, + "step": 3570 + }, + { + "epoch": 0.15656412668298167, + "grad_norm": 0.8203125, + "learning_rate": 3.6726298216341284e-05, + "loss": 0.8367, + "step": 3571 + }, + { + "epoch": 0.1566079698996389, + "grad_norm": 0.79296875, + "learning_rate": 3.672273329966326e-05, + "loss": 0.7197, + "step": 3572 + }, + { + "epoch": 0.1566518131162961, + "grad_norm": 0.86328125, + "learning_rate": 3.6719168517097756e-05, + "loss": 0.7919, + "step": 3573 + }, + { + "epoch": 0.15669565633295335, + "grad_norm": 0.7578125, + "learning_rate": 3.671560386865239e-05, + "loss": 0.8963, + "step": 3574 + }, + { + "epoch": 0.15673949954961058, + "grad_norm": 0.796875, + "learning_rate": 3.671203935433468e-05, + "loss": 0.9031, + "step": 3575 + }, + { + "epoch": 0.15678334276626782, + "grad_norm": 0.76953125, + "learning_rate": 3.6708474974152194e-05, + "loss": 0.7892, + "step": 3576 + }, + { + "epoch": 0.15682718598292505, + "grad_norm": 0.93359375, + "learning_rate": 3.670491072811247e-05, + "loss": 0.8787, + "step": 3577 + }, + { + "epoch": 0.1568710291995823, + "grad_norm": 0.859375, + "learning_rate": 3.670134661622304e-05, + "loss": 0.8386, + "step": 3578 + }, + { + "epoch": 0.15691487241623953, + "grad_norm": 0.73828125, + "learning_rate": 3.6697782638491515e-05, + "loss": 0.7916, + "step": 3579 + }, + { + "epoch": 0.15695871563289676, + "grad_norm": 0.8828125, + "learning_rate": 3.6694218794925426e-05, + "loss": 0.8866, + "step": 3580 + }, + { + "epoch": 0.157002558849554, + "grad_norm": 0.8359375, + "learning_rate": 3.6690655085532324e-05, + "loss": 0.8861, + "step": 3581 + }, + { + "epoch": 0.15704640206621123, + "grad_norm": 0.8984375, + "learning_rate": 3.668709151031976e-05, + "loss": 0.8387, + "step": 3582 + }, + { + "epoch": 0.15709024528286847, + "grad_norm": 0.7734375, + "learning_rate": 3.668352806929528e-05, + "loss": 0.8492, + "step": 3583 + }, + { + "epoch": 0.1571340884995257, + "grad_norm": 0.85546875, + "learning_rate": 3.6679964762466454e-05, + "loss": 1.0405, + "step": 3584 + }, + { + "epoch": 0.15717793171618294, + "grad_norm": 0.91796875, + "learning_rate": 3.667640158984078e-05, + "loss": 0.8416, + "step": 3585 + }, + { + "epoch": 0.15722177493284017, + "grad_norm": 0.828125, + "learning_rate": 3.667283855142589e-05, + "loss": 0.7872, + "step": 3586 + }, + { + "epoch": 0.1572656181494974, + "grad_norm": 0.83203125, + "learning_rate": 3.666927564722931e-05, + "loss": 0.8123, + "step": 3587 + }, + { + "epoch": 0.15730946136615462, + "grad_norm": 0.91796875, + "learning_rate": 3.666571287725856e-05, + "loss": 0.7897, + "step": 3588 + }, + { + "epoch": 0.15735330458281185, + "grad_norm": 0.75390625, + "learning_rate": 3.666215024152123e-05, + "loss": 0.8016, + "step": 3589 + }, + { + "epoch": 0.1573971477994691, + "grad_norm": 0.76953125, + "learning_rate": 3.665858774002484e-05, + "loss": 0.797, + "step": 3590 + }, + { + "epoch": 0.15744099101612632, + "grad_norm": 0.8671875, + "learning_rate": 3.6655025372776966e-05, + "loss": 0.7987, + "step": 3591 + }, + { + "epoch": 0.15748483423278356, + "grad_norm": 0.88671875, + "learning_rate": 3.6651463139785134e-05, + "loss": 0.8588, + "step": 3592 + }, + { + "epoch": 0.1575286774494408, + "grad_norm": 0.765625, + "learning_rate": 3.6647901041056874e-05, + "loss": 0.865, + "step": 3593 + }, + { + "epoch": 0.15757252066609803, + "grad_norm": 0.84375, + "learning_rate": 3.6644339076599796e-05, + "loss": 0.8803, + "step": 3594 + }, + { + "epoch": 0.15761636388275527, + "grad_norm": 0.80078125, + "learning_rate": 3.6640777246421434e-05, + "loss": 0.7906, + "step": 3595 + }, + { + "epoch": 0.1576602070994125, + "grad_norm": 0.8359375, + "learning_rate": 3.6637215550529304e-05, + "loss": 0.7715, + "step": 3596 + }, + { + "epoch": 0.15770405031606974, + "grad_norm": 0.9140625, + "learning_rate": 3.663365398893098e-05, + "loss": 0.8991, + "step": 3597 + }, + { + "epoch": 0.15774789353272697, + "grad_norm": 0.73828125, + "learning_rate": 3.6630092561633976e-05, + "loss": 0.7504, + "step": 3598 + }, + { + "epoch": 0.1577917367493842, + "grad_norm": 0.8125, + "learning_rate": 3.66265312686459e-05, + "loss": 0.8039, + "step": 3599 + }, + { + "epoch": 0.15783557996604144, + "grad_norm": 0.80859375, + "learning_rate": 3.6622970109974275e-05, + "loss": 0.8288, + "step": 3600 + }, + { + "epoch": 0.15787942318269868, + "grad_norm": 0.86328125, + "learning_rate": 3.6619409085626644e-05, + "loss": 0.827, + "step": 3601 + }, + { + "epoch": 0.15792326639935592, + "grad_norm": 0.7734375, + "learning_rate": 3.661584819561054e-05, + "loss": 0.7899, + "step": 3602 + }, + { + "epoch": 0.15796710961601312, + "grad_norm": 0.8046875, + "learning_rate": 3.661228743993349e-05, + "loss": 0.8006, + "step": 3603 + }, + { + "epoch": 0.15801095283267036, + "grad_norm": 0.8828125, + "learning_rate": 3.6608726818603125e-05, + "loss": 0.783, + "step": 3604 + }, + { + "epoch": 0.1580547960493276, + "grad_norm": 0.765625, + "learning_rate": 3.660516633162692e-05, + "loss": 0.8815, + "step": 3605 + }, + { + "epoch": 0.15809863926598483, + "grad_norm": 0.96484375, + "learning_rate": 3.6601605979012457e-05, + "loss": 0.8352, + "step": 3606 + }, + { + "epoch": 0.15814248248264207, + "grad_norm": 0.90625, + "learning_rate": 3.659804576076725e-05, + "loss": 0.8762, + "step": 3607 + }, + { + "epoch": 0.1581863256992993, + "grad_norm": 0.85546875, + "learning_rate": 3.659448567689884e-05, + "loss": 0.8628, + "step": 3608 + }, + { + "epoch": 0.15823016891595654, + "grad_norm": 0.87109375, + "learning_rate": 3.659092572741483e-05, + "loss": 0.8491, + "step": 3609 + }, + { + "epoch": 0.15827401213261377, + "grad_norm": 0.86328125, + "learning_rate": 3.658736591232271e-05, + "loss": 0.8282, + "step": 3610 + }, + { + "epoch": 0.158317855349271, + "grad_norm": 0.828125, + "learning_rate": 3.658380623163006e-05, + "loss": 0.8361, + "step": 3611 + }, + { + "epoch": 0.15836169856592824, + "grad_norm": 0.8125, + "learning_rate": 3.65802466853444e-05, + "loss": 0.8929, + "step": 3612 + }, + { + "epoch": 0.15840554178258548, + "grad_norm": 0.85546875, + "learning_rate": 3.657668727347326e-05, + "loss": 0.809, + "step": 3613 + }, + { + "epoch": 0.15844938499924272, + "grad_norm": 0.81640625, + "learning_rate": 3.6573127996024224e-05, + "loss": 0.8711, + "step": 3614 + }, + { + "epoch": 0.15849322821589995, + "grad_norm": 0.8359375, + "learning_rate": 3.656956885300483e-05, + "loss": 0.9257, + "step": 3615 + }, + { + "epoch": 0.1585370714325572, + "grad_norm": 0.8046875, + "learning_rate": 3.65660098444226e-05, + "loss": 0.8343, + "step": 3616 + }, + { + "epoch": 0.15858091464921442, + "grad_norm": 0.80859375, + "learning_rate": 3.65624509702851e-05, + "loss": 0.8098, + "step": 3617 + }, + { + "epoch": 0.15862475786587163, + "grad_norm": 0.8203125, + "learning_rate": 3.655889223059982e-05, + "loss": 0.9036, + "step": 3618 + }, + { + "epoch": 0.15866860108252887, + "grad_norm": 0.8046875, + "learning_rate": 3.655533362537437e-05, + "loss": 0.8426, + "step": 3619 + }, + { + "epoch": 0.1587124442991861, + "grad_norm": 0.7890625, + "learning_rate": 3.6551775154616265e-05, + "loss": 0.7419, + "step": 3620 + }, + { + "epoch": 0.15875628751584334, + "grad_norm": 0.89453125, + "learning_rate": 3.6548216818333046e-05, + "loss": 0.8091, + "step": 3621 + }, + { + "epoch": 0.15880013073250057, + "grad_norm": 0.80078125, + "learning_rate": 3.654465861653226e-05, + "loss": 0.9077, + "step": 3622 + }, + { + "epoch": 0.1588439739491578, + "grad_norm": 0.79296875, + "learning_rate": 3.6541100549221396e-05, + "loss": 0.8512, + "step": 3623 + }, + { + "epoch": 0.15888781716581504, + "grad_norm": 0.84375, + "learning_rate": 3.653754261640809e-05, + "loss": 0.8893, + "step": 3624 + }, + { + "epoch": 0.15893166038247228, + "grad_norm": 0.76171875, + "learning_rate": 3.653398481809983e-05, + "loss": 0.6561, + "step": 3625 + }, + { + "epoch": 0.15897550359912951, + "grad_norm": 0.78125, + "learning_rate": 3.653042715430417e-05, + "loss": 0.8366, + "step": 3626 + }, + { + "epoch": 0.15901934681578675, + "grad_norm": 0.80078125, + "learning_rate": 3.652686962502863e-05, + "loss": 0.7616, + "step": 3627 + }, + { + "epoch": 0.15906319003244399, + "grad_norm": 0.8515625, + "learning_rate": 3.652331223028077e-05, + "loss": 0.8241, + "step": 3628 + }, + { + "epoch": 0.15910703324910122, + "grad_norm": 0.8125, + "learning_rate": 3.651975497006811e-05, + "loss": 0.8923, + "step": 3629 + }, + { + "epoch": 0.15915087646575846, + "grad_norm": 0.82421875, + "learning_rate": 3.6516197844398214e-05, + "loss": 0.9293, + "step": 3630 + }, + { + "epoch": 0.1591947196824157, + "grad_norm": 0.8203125, + "learning_rate": 3.65126408532786e-05, + "loss": 0.8089, + "step": 3631 + }, + { + "epoch": 0.15923856289907293, + "grad_norm": 0.7109375, + "learning_rate": 3.650908399671682e-05, + "loss": 0.7866, + "step": 3632 + }, + { + "epoch": 0.15928240611573014, + "grad_norm": 0.79296875, + "learning_rate": 3.6505527274720376e-05, + "loss": 0.8314, + "step": 3633 + }, + { + "epoch": 0.15932624933238737, + "grad_norm": 0.8203125, + "learning_rate": 3.650197068729686e-05, + "loss": 0.8213, + "step": 3634 + }, + { + "epoch": 0.1593700925490446, + "grad_norm": 0.86328125, + "learning_rate": 3.6498414234453795e-05, + "loss": 0.742, + "step": 3635 + }, + { + "epoch": 0.15941393576570184, + "grad_norm": 0.796875, + "learning_rate": 3.649485791619871e-05, + "loss": 0.8357, + "step": 3636 + }, + { + "epoch": 0.15945777898235908, + "grad_norm": 0.89453125, + "learning_rate": 3.649130173253915e-05, + "loss": 0.7567, + "step": 3637 + }, + { + "epoch": 0.1595016221990163, + "grad_norm": 0.8203125, + "learning_rate": 3.648774568348261e-05, + "loss": 0.9693, + "step": 3638 + }, + { + "epoch": 0.15954546541567355, + "grad_norm": 0.85546875, + "learning_rate": 3.648418976903669e-05, + "loss": 0.9418, + "step": 3639 + }, + { + "epoch": 0.15958930863233078, + "grad_norm": 0.73828125, + "learning_rate": 3.64806339892089e-05, + "loss": 0.8159, + "step": 3640 + }, + { + "epoch": 0.15963315184898802, + "grad_norm": 0.82421875, + "learning_rate": 3.647707834400679e-05, + "loss": 0.7906, + "step": 3641 + }, + { + "epoch": 0.15967699506564526, + "grad_norm": 0.859375, + "learning_rate": 3.647352283343787e-05, + "loss": 0.8571, + "step": 3642 + }, + { + "epoch": 0.1597208382823025, + "grad_norm": 0.74609375, + "learning_rate": 3.6469967457509645e-05, + "loss": 0.8599, + "step": 3643 + }, + { + "epoch": 0.15976468149895973, + "grad_norm": 0.90625, + "learning_rate": 3.646641221622974e-05, + "loss": 0.892, + "step": 3644 + }, + { + "epoch": 0.15980852471561696, + "grad_norm": 0.859375, + "learning_rate": 3.646285710960564e-05, + "loss": 0.7963, + "step": 3645 + }, + { + "epoch": 0.1598523679322742, + "grad_norm": 0.78125, + "learning_rate": 3.645930213764488e-05, + "loss": 0.715, + "step": 3646 + }, + { + "epoch": 0.15989621114893143, + "grad_norm": 0.8046875, + "learning_rate": 3.6455747300355e-05, + "loss": 0.843, + "step": 3647 + }, + { + "epoch": 0.15994005436558867, + "grad_norm": 0.90234375, + "learning_rate": 3.6452192597743495e-05, + "loss": 0.8305, + "step": 3648 + }, + { + "epoch": 0.15998389758224588, + "grad_norm": 0.921875, + "learning_rate": 3.644863802981797e-05, + "loss": 0.7295, + "step": 3649 + }, + { + "epoch": 0.1600277407989031, + "grad_norm": 0.859375, + "learning_rate": 3.6445083596585926e-05, + "loss": 0.8039, + "step": 3650 + }, + { + "epoch": 0.16007158401556035, + "grad_norm": 0.75, + "learning_rate": 3.64415292980549e-05, + "loss": 0.7503, + "step": 3651 + }, + { + "epoch": 0.16011542723221758, + "grad_norm": 0.7578125, + "learning_rate": 3.643797513423242e-05, + "loss": 0.6567, + "step": 3652 + }, + { + "epoch": 0.16015927044887482, + "grad_norm": 0.7109375, + "learning_rate": 3.643442110512597e-05, + "loss": 0.7191, + "step": 3653 + }, + { + "epoch": 0.16020311366553205, + "grad_norm": 0.88671875, + "learning_rate": 3.6430867210743166e-05, + "loss": 0.8798, + "step": 3654 + }, + { + "epoch": 0.1602469568821893, + "grad_norm": 0.74609375, + "learning_rate": 3.642731345109153e-05, + "loss": 0.7856, + "step": 3655 + }, + { + "epoch": 0.16029080009884653, + "grad_norm": 0.84375, + "learning_rate": 3.642375982617855e-05, + "loss": 0.7627, + "step": 3656 + }, + { + "epoch": 0.16033464331550376, + "grad_norm": 0.8828125, + "learning_rate": 3.6420206336011774e-05, + "loss": 0.6737, + "step": 3657 + }, + { + "epoch": 0.160378486532161, + "grad_norm": 0.86328125, + "learning_rate": 3.641665298059871e-05, + "loss": 0.7742, + "step": 3658 + }, + { + "epoch": 0.16042232974881823, + "grad_norm": 0.7890625, + "learning_rate": 3.6413099759946945e-05, + "loss": 0.8097, + "step": 3659 + }, + { + "epoch": 0.16046617296547547, + "grad_norm": 0.82421875, + "learning_rate": 3.6409546674063985e-05, + "loss": 0.9026, + "step": 3660 + }, + { + "epoch": 0.1605100161821327, + "grad_norm": 0.76953125, + "learning_rate": 3.6405993722957356e-05, + "loss": 0.8327, + "step": 3661 + }, + { + "epoch": 0.16055385939878994, + "grad_norm": 0.875, + "learning_rate": 3.6402440906634595e-05, + "loss": 0.7672, + "step": 3662 + }, + { + "epoch": 0.16059770261544717, + "grad_norm": 0.796875, + "learning_rate": 3.6398888225103175e-05, + "loss": 0.7456, + "step": 3663 + }, + { + "epoch": 0.16064154583210438, + "grad_norm": 0.87109375, + "learning_rate": 3.6395335678370726e-05, + "loss": 0.7006, + "step": 3664 + }, + { + "epoch": 0.16068538904876162, + "grad_norm": 0.828125, + "learning_rate": 3.639178326644472e-05, + "loss": 0.7989, + "step": 3665 + }, + { + "epoch": 0.16072923226541885, + "grad_norm": 0.8671875, + "learning_rate": 3.6388230989332694e-05, + "loss": 0.7692, + "step": 3666 + }, + { + "epoch": 0.1607730754820761, + "grad_norm": 0.97265625, + "learning_rate": 3.6384678847042174e-05, + "loss": 0.7978, + "step": 3667 + }, + { + "epoch": 0.16081691869873332, + "grad_norm": 0.765625, + "learning_rate": 3.638112683958069e-05, + "loss": 0.7422, + "step": 3668 + }, + { + "epoch": 0.16086076191539056, + "grad_norm": 0.80859375, + "learning_rate": 3.637757496695579e-05, + "loss": 0.9195, + "step": 3669 + }, + { + "epoch": 0.1609046051320478, + "grad_norm": 0.86328125, + "learning_rate": 3.6374023229174926e-05, + "loss": 0.8297, + "step": 3670 + }, + { + "epoch": 0.16094844834870503, + "grad_norm": 0.7734375, + "learning_rate": 3.6370471626245726e-05, + "loss": 0.8345, + "step": 3671 + }, + { + "epoch": 0.16099229156536227, + "grad_norm": 0.86328125, + "learning_rate": 3.636692015817568e-05, + "loss": 0.8308, + "step": 3672 + }, + { + "epoch": 0.1610361347820195, + "grad_norm": 0.7890625, + "learning_rate": 3.63633688249723e-05, + "loss": 0.7655, + "step": 3673 + }, + { + "epoch": 0.16107997799867674, + "grad_norm": 0.890625, + "learning_rate": 3.635981762664312e-05, + "loss": 0.7861, + "step": 3674 + }, + { + "epoch": 0.16112382121533397, + "grad_norm": 0.82421875, + "learning_rate": 3.6356266563195686e-05, + "loss": 0.8034, + "step": 3675 + }, + { + "epoch": 0.1611676644319912, + "grad_norm": 0.8046875, + "learning_rate": 3.6352715634637494e-05, + "loss": 0.8085, + "step": 3676 + }, + { + "epoch": 0.16121150764864844, + "grad_norm": 0.7890625, + "learning_rate": 3.634916484097609e-05, + "loss": 0.89, + "step": 3677 + }, + { + "epoch": 0.16125535086530568, + "grad_norm": 0.8203125, + "learning_rate": 3.634561418221896e-05, + "loss": 0.8953, + "step": 3678 + }, + { + "epoch": 0.1612991940819629, + "grad_norm": 0.80859375, + "learning_rate": 3.634206365837368e-05, + "loss": 0.7912, + "step": 3679 + }, + { + "epoch": 0.16134303729862012, + "grad_norm": 0.9375, + "learning_rate": 3.6338513269447774e-05, + "loss": 0.8688, + "step": 3680 + }, + { + "epoch": 0.16138688051527736, + "grad_norm": 0.80859375, + "learning_rate": 3.6334963015448745e-05, + "loss": 0.7609, + "step": 3681 + }, + { + "epoch": 0.1614307237319346, + "grad_norm": 0.80859375, + "learning_rate": 3.633141289638412e-05, + "loss": 0.7917, + "step": 3682 + }, + { + "epoch": 0.16147456694859183, + "grad_norm": 0.90625, + "learning_rate": 3.63278629122614e-05, + "loss": 0.8936, + "step": 3683 + }, + { + "epoch": 0.16151841016524907, + "grad_norm": 0.75390625, + "learning_rate": 3.632431306308817e-05, + "loss": 0.6776, + "step": 3684 + }, + { + "epoch": 0.1615622533819063, + "grad_norm": 0.8046875, + "learning_rate": 3.632076334887191e-05, + "loss": 0.7871, + "step": 3685 + }, + { + "epoch": 0.16160609659856354, + "grad_norm": 0.78515625, + "learning_rate": 3.631721376962016e-05, + "loss": 0.8382, + "step": 3686 + }, + { + "epoch": 0.16164993981522077, + "grad_norm": 0.8203125, + "learning_rate": 3.631366432534042e-05, + "loss": 0.905, + "step": 3687 + }, + { + "epoch": 0.161693783031878, + "grad_norm": 0.8515625, + "learning_rate": 3.631011501604021e-05, + "loss": 0.8638, + "step": 3688 + }, + { + "epoch": 0.16173762624853524, + "grad_norm": 0.83203125, + "learning_rate": 3.6306565841727106e-05, + "loss": 0.9078, + "step": 3689 + }, + { + "epoch": 0.16178146946519248, + "grad_norm": 0.81640625, + "learning_rate": 3.6303016802408594e-05, + "loss": 0.8858, + "step": 3690 + }, + { + "epoch": 0.16182531268184971, + "grad_norm": 0.80078125, + "learning_rate": 3.6299467898092185e-05, + "loss": 0.7621, + "step": 3691 + }, + { + "epoch": 0.16186915589850695, + "grad_norm": 0.80859375, + "learning_rate": 3.629591912878543e-05, + "loss": 0.7984, + "step": 3692 + }, + { + "epoch": 0.16191299911516419, + "grad_norm": 0.8828125, + "learning_rate": 3.629237049449582e-05, + "loss": 0.8709, + "step": 3693 + }, + { + "epoch": 0.1619568423318214, + "grad_norm": 0.8359375, + "learning_rate": 3.6288821995230856e-05, + "loss": 0.8838, + "step": 3694 + }, + { + "epoch": 0.16200068554847863, + "grad_norm": 0.796875, + "learning_rate": 3.628527363099814e-05, + "loss": 0.8248, + "step": 3695 + }, + { + "epoch": 0.16204452876513586, + "grad_norm": 0.7734375, + "learning_rate": 3.628172540180513e-05, + "loss": 0.7367, + "step": 3696 + }, + { + "epoch": 0.1620883719817931, + "grad_norm": 0.8515625, + "learning_rate": 3.6278177307659366e-05, + "loss": 0.9747, + "step": 3697 + }, + { + "epoch": 0.16213221519845034, + "grad_norm": 0.83203125, + "learning_rate": 3.627462934856837e-05, + "loss": 0.8098, + "step": 3698 + }, + { + "epoch": 0.16217605841510757, + "grad_norm": 0.90234375, + "learning_rate": 3.627108152453961e-05, + "loss": 0.7317, + "step": 3699 + }, + { + "epoch": 0.1622199016317648, + "grad_norm": 0.81640625, + "learning_rate": 3.626753383558069e-05, + "loss": 0.7753, + "step": 3700 + }, + { + "epoch": 0.16226374484842204, + "grad_norm": 0.80859375, + "learning_rate": 3.626398628169908e-05, + "loss": 0.9656, + "step": 3701 + }, + { + "epoch": 0.16230758806507928, + "grad_norm": 0.87890625, + "learning_rate": 3.6260438862902326e-05, + "loss": 0.8454, + "step": 3702 + }, + { + "epoch": 0.1623514312817365, + "grad_norm": 0.8671875, + "learning_rate": 3.6256891579197914e-05, + "loss": 0.8604, + "step": 3703 + }, + { + "epoch": 0.16239527449839375, + "grad_norm": 0.9296875, + "learning_rate": 3.625334443059335e-05, + "loss": 0.9905, + "step": 3704 + }, + { + "epoch": 0.16243911771505098, + "grad_norm": 0.89453125, + "learning_rate": 3.624979741709621e-05, + "loss": 0.8583, + "step": 3705 + }, + { + "epoch": 0.16248296093170822, + "grad_norm": 0.828125, + "learning_rate": 3.624625053871398e-05, + "loss": 0.8521, + "step": 3706 + }, + { + "epoch": 0.16252680414836546, + "grad_norm": 0.83984375, + "learning_rate": 3.624270379545417e-05, + "loss": 0.8579, + "step": 3707 + }, + { + "epoch": 0.1625706473650227, + "grad_norm": 0.8203125, + "learning_rate": 3.623915718732431e-05, + "loss": 0.8841, + "step": 3708 + }, + { + "epoch": 0.1626144905816799, + "grad_norm": 0.75, + "learning_rate": 3.623561071433189e-05, + "loss": 0.7167, + "step": 3709 + }, + { + "epoch": 0.16265833379833713, + "grad_norm": 0.796875, + "learning_rate": 3.623206437648448e-05, + "loss": 0.8276, + "step": 3710 + }, + { + "epoch": 0.16270217701499437, + "grad_norm": 0.796875, + "learning_rate": 3.622851817378955e-05, + "loss": 0.7609, + "step": 3711 + }, + { + "epoch": 0.1627460202316516, + "grad_norm": 0.8359375, + "learning_rate": 3.622497210625464e-05, + "loss": 0.8762, + "step": 3712 + }, + { + "epoch": 0.16278986344830884, + "grad_norm": 0.7734375, + "learning_rate": 3.622142617388726e-05, + "loss": 0.7016, + "step": 3713 + }, + { + "epoch": 0.16283370666496608, + "grad_norm": 0.80859375, + "learning_rate": 3.6217880376694914e-05, + "loss": 0.7959, + "step": 3714 + }, + { + "epoch": 0.1628775498816233, + "grad_norm": 0.6796875, + "learning_rate": 3.621433471468513e-05, + "loss": 0.6281, + "step": 3715 + }, + { + "epoch": 0.16292139309828055, + "grad_norm": 0.88671875, + "learning_rate": 3.6210789187865415e-05, + "loss": 0.7326, + "step": 3716 + }, + { + "epoch": 0.16296523631493778, + "grad_norm": 0.74609375, + "learning_rate": 3.620724379624329e-05, + "loss": 0.8745, + "step": 3717 + }, + { + "epoch": 0.16300907953159502, + "grad_norm": 0.83203125, + "learning_rate": 3.620369853982623e-05, + "loss": 0.9764, + "step": 3718 + }, + { + "epoch": 0.16305292274825225, + "grad_norm": 0.80859375, + "learning_rate": 3.620015341862182e-05, + "loss": 0.7863, + "step": 3719 + }, + { + "epoch": 0.1630967659649095, + "grad_norm": 0.7734375, + "learning_rate": 3.6196608432637535e-05, + "loss": 0.6907, + "step": 3720 + }, + { + "epoch": 0.16314060918156673, + "grad_norm": 0.7578125, + "learning_rate": 3.6193063581880894e-05, + "loss": 0.7969, + "step": 3721 + }, + { + "epoch": 0.16318445239822396, + "grad_norm": 0.7734375, + "learning_rate": 3.6189518866359406e-05, + "loss": 0.7548, + "step": 3722 + }, + { + "epoch": 0.1632282956148812, + "grad_norm": 0.7890625, + "learning_rate": 3.6185974286080585e-05, + "loss": 0.7892, + "step": 3723 + }, + { + "epoch": 0.1632721388315384, + "grad_norm": 0.80859375, + "learning_rate": 3.618242984105191e-05, + "loss": 0.6971, + "step": 3724 + }, + { + "epoch": 0.16331598204819564, + "grad_norm": 0.80859375, + "learning_rate": 3.617888553128096e-05, + "loss": 0.9602, + "step": 3725 + }, + { + "epoch": 0.16335982526485288, + "grad_norm": 0.87109375, + "learning_rate": 3.6175341356775215e-05, + "loss": 0.8744, + "step": 3726 + }, + { + "epoch": 0.1634036684815101, + "grad_norm": 0.83984375, + "learning_rate": 3.6171797317542186e-05, + "loss": 0.8361, + "step": 3727 + }, + { + "epoch": 0.16344751169816735, + "grad_norm": 0.7578125, + "learning_rate": 3.616825341358939e-05, + "loss": 0.8523, + "step": 3728 + }, + { + "epoch": 0.16349135491482458, + "grad_norm": 0.91015625, + "learning_rate": 3.61647096449243e-05, + "loss": 0.9063, + "step": 3729 + }, + { + "epoch": 0.16353519813148182, + "grad_norm": 0.84375, + "learning_rate": 3.616116601155448e-05, + "loss": 0.73, + "step": 3730 + }, + { + "epoch": 0.16357904134813905, + "grad_norm": 0.84765625, + "learning_rate": 3.6157622513487433e-05, + "loss": 0.8294, + "step": 3731 + }, + { + "epoch": 0.1636228845647963, + "grad_norm": 0.953125, + "learning_rate": 3.615407915073065e-05, + "loss": 0.9362, + "step": 3732 + }, + { + "epoch": 0.16366672778145352, + "grad_norm": 0.82421875, + "learning_rate": 3.6150535923291637e-05, + "loss": 0.7036, + "step": 3733 + }, + { + "epoch": 0.16371057099811076, + "grad_norm": 0.765625, + "learning_rate": 3.6146992831177896e-05, + "loss": 0.6971, + "step": 3734 + }, + { + "epoch": 0.163754414214768, + "grad_norm": 0.859375, + "learning_rate": 3.614344987439698e-05, + "loss": 0.7966, + "step": 3735 + }, + { + "epoch": 0.16379825743142523, + "grad_norm": 0.82421875, + "learning_rate": 3.613990705295638e-05, + "loss": 0.7299, + "step": 3736 + }, + { + "epoch": 0.16384210064808247, + "grad_norm": 0.828125, + "learning_rate": 3.6136364366863586e-05, + "loss": 0.7542, + "step": 3737 + }, + { + "epoch": 0.1638859438647397, + "grad_norm": 0.8828125, + "learning_rate": 3.613282181612613e-05, + "loss": 0.7433, + "step": 3738 + }, + { + "epoch": 0.16392978708139694, + "grad_norm": 0.91015625, + "learning_rate": 3.6129279400751456e-05, + "loss": 0.9232, + "step": 3739 + }, + { + "epoch": 0.16397363029805415, + "grad_norm": 0.8515625, + "learning_rate": 3.612573712074717e-05, + "loss": 0.8647, + "step": 3740 + }, + { + "epoch": 0.16401747351471138, + "grad_norm": 0.796875, + "learning_rate": 3.6122194976120736e-05, + "loss": 0.8756, + "step": 3741 + }, + { + "epoch": 0.16406131673136862, + "grad_norm": 0.765625, + "learning_rate": 3.611865296687965e-05, + "loss": 0.8369, + "step": 3742 + }, + { + "epoch": 0.16410515994802585, + "grad_norm": 0.7890625, + "learning_rate": 3.611511109303144e-05, + "loss": 0.7784, + "step": 3743 + }, + { + "epoch": 0.1641490031646831, + "grad_norm": 0.828125, + "learning_rate": 3.6111569354583554e-05, + "loss": 0.8583, + "step": 3744 + }, + { + "epoch": 0.16419284638134032, + "grad_norm": 0.8125, + "learning_rate": 3.6108027751543584e-05, + "loss": 0.8107, + "step": 3745 + }, + { + "epoch": 0.16423668959799756, + "grad_norm": 0.82421875, + "learning_rate": 3.6104486283918994e-05, + "loss": 0.9314, + "step": 3746 + }, + { + "epoch": 0.1642805328146548, + "grad_norm": 0.890625, + "learning_rate": 3.6100944951717294e-05, + "loss": 0.903, + "step": 3747 + }, + { + "epoch": 0.16432437603131203, + "grad_norm": 0.890625, + "learning_rate": 3.609740375494599e-05, + "loss": 0.7802, + "step": 3748 + }, + { + "epoch": 0.16436821924796927, + "grad_norm": 0.84765625, + "learning_rate": 3.609386269361256e-05, + "loss": 0.7503, + "step": 3749 + }, + { + "epoch": 0.1644120624646265, + "grad_norm": 0.890625, + "learning_rate": 3.6090321767724554e-05, + "loss": 0.8564, + "step": 3750 + }, + { + "epoch": 0.16445590568128374, + "grad_norm": 0.84375, + "learning_rate": 3.608678097728947e-05, + "loss": 0.9658, + "step": 3751 + }, + { + "epoch": 0.16449974889794097, + "grad_norm": 0.80859375, + "learning_rate": 3.6083240322314796e-05, + "loss": 0.7905, + "step": 3752 + }, + { + "epoch": 0.1645435921145982, + "grad_norm": 0.77734375, + "learning_rate": 3.607969980280804e-05, + "loss": 0.7373, + "step": 3753 + }, + { + "epoch": 0.16458743533125544, + "grad_norm": 0.7890625, + "learning_rate": 3.6076159418776714e-05, + "loss": 0.6802, + "step": 3754 + }, + { + "epoch": 0.16463127854791265, + "grad_norm": 0.84375, + "learning_rate": 3.6072619170228275e-05, + "loss": 0.7424, + "step": 3755 + }, + { + "epoch": 0.1646751217645699, + "grad_norm": 0.81640625, + "learning_rate": 3.60690790571703e-05, + "loss": 0.8414, + "step": 3756 + }, + { + "epoch": 0.16471896498122712, + "grad_norm": 0.859375, + "learning_rate": 3.606553907961026e-05, + "loss": 0.7919, + "step": 3757 + }, + { + "epoch": 0.16476280819788436, + "grad_norm": 0.83203125, + "learning_rate": 3.6061999237555654e-05, + "loss": 0.8704, + "step": 3758 + }, + { + "epoch": 0.1648066514145416, + "grad_norm": 0.84765625, + "learning_rate": 3.6058459531013975e-05, + "loss": 0.9193, + "step": 3759 + }, + { + "epoch": 0.16485049463119883, + "grad_norm": 0.890625, + "learning_rate": 3.605491995999275e-05, + "loss": 0.8251, + "step": 3760 + }, + { + "epoch": 0.16489433784785607, + "grad_norm": 0.7890625, + "learning_rate": 3.605138052449947e-05, + "loss": 0.7784, + "step": 3761 + }, + { + "epoch": 0.1649381810645133, + "grad_norm": 0.7734375, + "learning_rate": 3.604784122454162e-05, + "loss": 0.7811, + "step": 3762 + }, + { + "epoch": 0.16498202428117054, + "grad_norm": 0.91796875, + "learning_rate": 3.6044302060126724e-05, + "loss": 0.9147, + "step": 3763 + }, + { + "epoch": 0.16502586749782777, + "grad_norm": 0.91796875, + "learning_rate": 3.604076303126224e-05, + "loss": 0.8774, + "step": 3764 + }, + { + "epoch": 0.165069710714485, + "grad_norm": 0.77734375, + "learning_rate": 3.603722413795574e-05, + "loss": 0.9401, + "step": 3765 + }, + { + "epoch": 0.16511355393114224, + "grad_norm": 0.7734375, + "learning_rate": 3.603368538021468e-05, + "loss": 0.8796, + "step": 3766 + }, + { + "epoch": 0.16515739714779948, + "grad_norm": 0.84375, + "learning_rate": 3.603014675804657e-05, + "loss": 0.8595, + "step": 3767 + }, + { + "epoch": 0.16520124036445671, + "grad_norm": 0.87109375, + "learning_rate": 3.6026608271458915e-05, + "loss": 0.8485, + "step": 3768 + }, + { + "epoch": 0.16524508358111395, + "grad_norm": 0.82421875, + "learning_rate": 3.602306992045916e-05, + "loss": 0.823, + "step": 3769 + }, + { + "epoch": 0.16528892679777116, + "grad_norm": 0.875, + "learning_rate": 3.60195317050549e-05, + "loss": 0.7638, + "step": 3770 + }, + { + "epoch": 0.1653327700144284, + "grad_norm": 0.8515625, + "learning_rate": 3.6015993625253575e-05, + "loss": 0.8585, + "step": 3771 + }, + { + "epoch": 0.16537661323108563, + "grad_norm": 0.83984375, + "learning_rate": 3.6012455681062694e-05, + "loss": 0.8948, + "step": 3772 + }, + { + "epoch": 0.16542045644774286, + "grad_norm": 0.78515625, + "learning_rate": 3.600891787248977e-05, + "loss": 0.6896, + "step": 3773 + }, + { + "epoch": 0.1654642996644001, + "grad_norm": 0.859375, + "learning_rate": 3.600538019954224e-05, + "loss": 0.7399, + "step": 3774 + }, + { + "epoch": 0.16550814288105734, + "grad_norm": 0.7890625, + "learning_rate": 3.600184266222768e-05, + "loss": 0.9541, + "step": 3775 + }, + { + "epoch": 0.16555198609771457, + "grad_norm": 0.87890625, + "learning_rate": 3.5998305260553566e-05, + "loss": 0.8796, + "step": 3776 + }, + { + "epoch": 0.1655958293143718, + "grad_norm": 0.78515625, + "learning_rate": 3.5994767994527377e-05, + "loss": 0.7471, + "step": 3777 + }, + { + "epoch": 0.16563967253102904, + "grad_norm": 0.83203125, + "learning_rate": 3.5991230864156624e-05, + "loss": 0.94, + "step": 3778 + }, + { + "epoch": 0.16568351574768628, + "grad_norm": 0.80859375, + "learning_rate": 3.598769386944877e-05, + "loss": 0.8837, + "step": 3779 + }, + { + "epoch": 0.1657273589643435, + "grad_norm": 0.78515625, + "learning_rate": 3.5984157010411366e-05, + "loss": 0.7473, + "step": 3780 + }, + { + "epoch": 0.16577120218100075, + "grad_norm": 0.8359375, + "learning_rate": 3.5980620287051884e-05, + "loss": 0.9228, + "step": 3781 + }, + { + "epoch": 0.16581504539765798, + "grad_norm": 0.828125, + "learning_rate": 3.597708369937782e-05, + "loss": 0.9179, + "step": 3782 + }, + { + "epoch": 0.16585888861431522, + "grad_norm": 0.84765625, + "learning_rate": 3.5973547247396666e-05, + "loss": 0.7744, + "step": 3783 + }, + { + "epoch": 0.16590273183097246, + "grad_norm": 0.828125, + "learning_rate": 3.5970010931115885e-05, + "loss": 0.8078, + "step": 3784 + }, + { + "epoch": 0.16594657504762966, + "grad_norm": 0.8359375, + "learning_rate": 3.5966474750543044e-05, + "loss": 0.812, + "step": 3785 + }, + { + "epoch": 0.1659904182642869, + "grad_norm": 0.875, + "learning_rate": 3.596293870568559e-05, + "loss": 0.7824, + "step": 3786 + }, + { + "epoch": 0.16603426148094413, + "grad_norm": 0.75390625, + "learning_rate": 3.5959402796551036e-05, + "loss": 0.7849, + "step": 3787 + }, + { + "epoch": 0.16607810469760137, + "grad_norm": 0.77734375, + "learning_rate": 3.595586702314686e-05, + "loss": 0.7737, + "step": 3788 + }, + { + "epoch": 0.1661219479142586, + "grad_norm": 0.84765625, + "learning_rate": 3.595233138548054e-05, + "loss": 0.8147, + "step": 3789 + }, + { + "epoch": 0.16616579113091584, + "grad_norm": 0.90625, + "learning_rate": 3.5948795883559614e-05, + "loss": 0.8441, + "step": 3790 + }, + { + "epoch": 0.16620963434757308, + "grad_norm": 0.9140625, + "learning_rate": 3.594526051739156e-05, + "loss": 0.9523, + "step": 3791 + }, + { + "epoch": 0.1662534775642303, + "grad_norm": 0.82421875, + "learning_rate": 3.5941725286983873e-05, + "loss": 0.773, + "step": 3792 + }, + { + "epoch": 0.16629732078088755, + "grad_norm": 0.734375, + "learning_rate": 3.593819019234403e-05, + "loss": 0.8164, + "step": 3793 + }, + { + "epoch": 0.16634116399754478, + "grad_norm": 0.82421875, + "learning_rate": 3.59346552334795e-05, + "loss": 0.7859, + "step": 3794 + }, + { + "epoch": 0.16638500721420202, + "grad_norm": 0.91796875, + "learning_rate": 3.5931120410397844e-05, + "loss": 0.9074, + "step": 3795 + }, + { + "epoch": 0.16642885043085925, + "grad_norm": 0.76171875, + "learning_rate": 3.592758572310651e-05, + "loss": 0.7716, + "step": 3796 + }, + { + "epoch": 0.1664726936475165, + "grad_norm": 0.79296875, + "learning_rate": 3.592405117161299e-05, + "loss": 0.7346, + "step": 3797 + }, + { + "epoch": 0.16651653686417373, + "grad_norm": 0.90625, + "learning_rate": 3.5920516755924796e-05, + "loss": 0.8415, + "step": 3798 + }, + { + "epoch": 0.16656038008083096, + "grad_norm": 0.81640625, + "learning_rate": 3.59169824760494e-05, + "loss": 0.8014, + "step": 3799 + }, + { + "epoch": 0.16660422329748817, + "grad_norm": 0.80078125, + "learning_rate": 3.591344833199429e-05, + "loss": 0.7582, + "step": 3800 + }, + { + "epoch": 0.1666480665141454, + "grad_norm": 0.77734375, + "learning_rate": 3.590991432376697e-05, + "loss": 0.8113, + "step": 3801 + }, + { + "epoch": 0.16669190973080264, + "grad_norm": 0.8046875, + "learning_rate": 3.590638045137491e-05, + "loss": 0.7742, + "step": 3802 + }, + { + "epoch": 0.16673575294745988, + "grad_norm": 0.828125, + "learning_rate": 3.590284671482562e-05, + "loss": 0.7631, + "step": 3803 + }, + { + "epoch": 0.1667795961641171, + "grad_norm": 0.75390625, + "learning_rate": 3.5899313114126555e-05, + "loss": 0.8693, + "step": 3804 + }, + { + "epoch": 0.16682343938077435, + "grad_norm": 0.921875, + "learning_rate": 3.5895779649285264e-05, + "loss": 0.9456, + "step": 3805 + }, + { + "epoch": 0.16686728259743158, + "grad_norm": 0.9375, + "learning_rate": 3.589224632030921e-05, + "loss": 0.8462, + "step": 3806 + }, + { + "epoch": 0.16691112581408882, + "grad_norm": 0.78125, + "learning_rate": 3.5888713127205866e-05, + "loss": 0.7728, + "step": 3807 + }, + { + "epoch": 0.16695496903074605, + "grad_norm": 0.85546875, + "learning_rate": 3.588518006998273e-05, + "loss": 0.7893, + "step": 3808 + }, + { + "epoch": 0.1669988122474033, + "grad_norm": 0.84375, + "learning_rate": 3.5881647148647256e-05, + "loss": 0.7647, + "step": 3809 + }, + { + "epoch": 0.16704265546406052, + "grad_norm": 0.8203125, + "learning_rate": 3.5878114363207006e-05, + "loss": 0.9928, + "step": 3810 + }, + { + "epoch": 0.16708649868071776, + "grad_norm": 0.828125, + "learning_rate": 3.587458171366943e-05, + "loss": 0.7751, + "step": 3811 + }, + { + "epoch": 0.167130341897375, + "grad_norm": 0.76171875, + "learning_rate": 3.5871049200042006e-05, + "loss": 0.8289, + "step": 3812 + }, + { + "epoch": 0.16717418511403223, + "grad_norm": 0.9296875, + "learning_rate": 3.5867516822332225e-05, + "loss": 0.852, + "step": 3813 + }, + { + "epoch": 0.16721802833068947, + "grad_norm": 0.84765625, + "learning_rate": 3.586398458054755e-05, + "loss": 0.8734, + "step": 3814 + }, + { + "epoch": 0.16726187154734667, + "grad_norm": 0.87890625, + "learning_rate": 3.586045247469553e-05, + "loss": 0.8047, + "step": 3815 + }, + { + "epoch": 0.1673057147640039, + "grad_norm": 0.8359375, + "learning_rate": 3.58569205047836e-05, + "loss": 0.7899, + "step": 3816 + }, + { + "epoch": 0.16734955798066115, + "grad_norm": 0.85546875, + "learning_rate": 3.5853388670819274e-05, + "loss": 0.932, + "step": 3817 + }, + { + "epoch": 0.16739340119731838, + "grad_norm": 0.79296875, + "learning_rate": 3.584985697281001e-05, + "loss": 0.8566, + "step": 3818 + }, + { + "epoch": 0.16743724441397562, + "grad_norm": 0.765625, + "learning_rate": 3.584632541076329e-05, + "loss": 0.8284, + "step": 3819 + }, + { + "epoch": 0.16748108763063285, + "grad_norm": 0.84375, + "learning_rate": 3.584279398468665e-05, + "loss": 0.9722, + "step": 3820 + }, + { + "epoch": 0.1675249308472901, + "grad_norm": 0.84375, + "learning_rate": 3.5839262694587525e-05, + "loss": 0.7082, + "step": 3821 + }, + { + "epoch": 0.16756877406394732, + "grad_norm": 0.8515625, + "learning_rate": 3.583573154047344e-05, + "loss": 0.6237, + "step": 3822 + }, + { + "epoch": 0.16761261728060456, + "grad_norm": 0.76171875, + "learning_rate": 3.5832200522351834e-05, + "loss": 0.8183, + "step": 3823 + }, + { + "epoch": 0.1676564604972618, + "grad_norm": 0.83984375, + "learning_rate": 3.5828669640230185e-05, + "loss": 0.8064, + "step": 3824 + }, + { + "epoch": 0.16770030371391903, + "grad_norm": 0.7265625, + "learning_rate": 3.582513889411604e-05, + "loss": 0.6246, + "step": 3825 + }, + { + "epoch": 0.16774414693057627, + "grad_norm": 0.8671875, + "learning_rate": 3.582160828401684e-05, + "loss": 0.754, + "step": 3826 + }, + { + "epoch": 0.1677879901472335, + "grad_norm": 0.92578125, + "learning_rate": 3.5818077809940066e-05, + "loss": 0.8111, + "step": 3827 + }, + { + "epoch": 0.16783183336389074, + "grad_norm": 0.84375, + "learning_rate": 3.5814547471893225e-05, + "loss": 0.7293, + "step": 3828 + }, + { + "epoch": 0.16787567658054797, + "grad_norm": 0.796875, + "learning_rate": 3.581101726988373e-05, + "loss": 0.6978, + "step": 3829 + }, + { + "epoch": 0.1679195197972052, + "grad_norm": 0.87890625, + "learning_rate": 3.5807487203919154e-05, + "loss": 0.8819, + "step": 3830 + }, + { + "epoch": 0.16796336301386242, + "grad_norm": 0.77734375, + "learning_rate": 3.580395727400695e-05, + "loss": 0.8413, + "step": 3831 + }, + { + "epoch": 0.16800720623051965, + "grad_norm": 0.859375, + "learning_rate": 3.580042748015458e-05, + "loss": 0.7861, + "step": 3832 + }, + { + "epoch": 0.1680510494471769, + "grad_norm": 0.80859375, + "learning_rate": 3.579689782236954e-05, + "loss": 0.81, + "step": 3833 + }, + { + "epoch": 0.16809489266383412, + "grad_norm": 0.765625, + "learning_rate": 3.57933683006593e-05, + "loss": 0.6745, + "step": 3834 + }, + { + "epoch": 0.16813873588049136, + "grad_norm": 0.7890625, + "learning_rate": 3.5789838915031314e-05, + "loss": 0.7739, + "step": 3835 + }, + { + "epoch": 0.1681825790971486, + "grad_norm": 0.796875, + "learning_rate": 3.578630966549314e-05, + "loss": 0.813, + "step": 3836 + }, + { + "epoch": 0.16822642231380583, + "grad_norm": 0.83203125, + "learning_rate": 3.578278055205219e-05, + "loss": 0.8411, + "step": 3837 + }, + { + "epoch": 0.16827026553046306, + "grad_norm": 0.8828125, + "learning_rate": 3.577925157471598e-05, + "loss": 0.8394, + "step": 3838 + }, + { + "epoch": 0.1683141087471203, + "grad_norm": 0.7734375, + "learning_rate": 3.5775722733491966e-05, + "loss": 0.877, + "step": 3839 + }, + { + "epoch": 0.16835795196377754, + "grad_norm": 0.828125, + "learning_rate": 3.577219402838765e-05, + "loss": 0.8283, + "step": 3840 + }, + { + "epoch": 0.16840179518043477, + "grad_norm": 0.828125, + "learning_rate": 3.5768665459410457e-05, + "loss": 0.8653, + "step": 3841 + }, + { + "epoch": 0.168445638397092, + "grad_norm": 0.89453125, + "learning_rate": 3.576513702656794e-05, + "loss": 0.8386, + "step": 3842 + }, + { + "epoch": 0.16848948161374924, + "grad_norm": 0.84375, + "learning_rate": 3.5761608729867544e-05, + "loss": 0.7721, + "step": 3843 + }, + { + "epoch": 0.16853332483040648, + "grad_norm": 0.76171875, + "learning_rate": 3.575808056931674e-05, + "loss": 0.7643, + "step": 3844 + }, + { + "epoch": 0.1685771680470637, + "grad_norm": 0.81640625, + "learning_rate": 3.575455254492303e-05, + "loss": 0.8361, + "step": 3845 + }, + { + "epoch": 0.16862101126372092, + "grad_norm": 0.88671875, + "learning_rate": 3.575102465669385e-05, + "loss": 0.8771, + "step": 3846 + }, + { + "epoch": 0.16866485448037816, + "grad_norm": 0.82421875, + "learning_rate": 3.574749690463672e-05, + "loss": 0.7487, + "step": 3847 + }, + { + "epoch": 0.1687086976970354, + "grad_norm": 0.80859375, + "learning_rate": 3.574396928875909e-05, + "loss": 0.7242, + "step": 3848 + }, + { + "epoch": 0.16875254091369263, + "grad_norm": 0.8125, + "learning_rate": 3.574044180906845e-05, + "loss": 0.8285, + "step": 3849 + }, + { + "epoch": 0.16879638413034986, + "grad_norm": 0.8125, + "learning_rate": 3.5736914465572234e-05, + "loss": 0.7411, + "step": 3850 + }, + { + "epoch": 0.1688402273470071, + "grad_norm": 0.8046875, + "learning_rate": 3.573338725827798e-05, + "loss": 0.8292, + "step": 3851 + }, + { + "epoch": 0.16888407056366433, + "grad_norm": 0.78125, + "learning_rate": 3.572986018719315e-05, + "loss": 0.7476, + "step": 3852 + }, + { + "epoch": 0.16892791378032157, + "grad_norm": 0.7890625, + "learning_rate": 3.572633325232521e-05, + "loss": 0.6384, + "step": 3853 + }, + { + "epoch": 0.1689717569969788, + "grad_norm": 0.81640625, + "learning_rate": 3.572280645368162e-05, + "loss": 0.7665, + "step": 3854 + }, + { + "epoch": 0.16901560021363604, + "grad_norm": 0.765625, + "learning_rate": 3.5719279791269846e-05, + "loss": 0.7106, + "step": 3855 + }, + { + "epoch": 0.16905944343029328, + "grad_norm": 0.859375, + "learning_rate": 3.5715753265097405e-05, + "loss": 0.8745, + "step": 3856 + }, + { + "epoch": 0.1691032866469505, + "grad_norm": 0.8671875, + "learning_rate": 3.571222687517176e-05, + "loss": 0.832, + "step": 3857 + }, + { + "epoch": 0.16914712986360775, + "grad_norm": 0.80859375, + "learning_rate": 3.570870062150038e-05, + "loss": 0.7108, + "step": 3858 + }, + { + "epoch": 0.16919097308026498, + "grad_norm": 0.734375, + "learning_rate": 3.570517450409072e-05, + "loss": 0.7243, + "step": 3859 + }, + { + "epoch": 0.16923481629692222, + "grad_norm": 0.82421875, + "learning_rate": 3.570164852295025e-05, + "loss": 0.8121, + "step": 3860 + }, + { + "epoch": 0.16927865951357943, + "grad_norm": 0.80859375, + "learning_rate": 3.569812267808649e-05, + "loss": 0.8863, + "step": 3861 + }, + { + "epoch": 0.16932250273023666, + "grad_norm": 0.84765625, + "learning_rate": 3.5694596969506875e-05, + "loss": 0.7864, + "step": 3862 + }, + { + "epoch": 0.1693663459468939, + "grad_norm": 0.81640625, + "learning_rate": 3.569107139721889e-05, + "loss": 0.7943, + "step": 3863 + }, + { + "epoch": 0.16941018916355113, + "grad_norm": 0.96875, + "learning_rate": 3.568754596123001e-05, + "loss": 0.923, + "step": 3864 + }, + { + "epoch": 0.16945403238020837, + "grad_norm": 0.796875, + "learning_rate": 3.5684020661547654e-05, + "loss": 0.8068, + "step": 3865 + }, + { + "epoch": 0.1694978755968656, + "grad_norm": 0.85546875, + "learning_rate": 3.568049549817939e-05, + "loss": 0.8877, + "step": 3866 + }, + { + "epoch": 0.16954171881352284, + "grad_norm": 0.80078125, + "learning_rate": 3.567697047113263e-05, + "loss": 0.7247, + "step": 3867 + }, + { + "epoch": 0.16958556203018008, + "grad_norm": 0.80859375, + "learning_rate": 3.567344558041485e-05, + "loss": 0.7482, + "step": 3868 + }, + { + "epoch": 0.1696294052468373, + "grad_norm": 0.83984375, + "learning_rate": 3.566992082603354e-05, + "loss": 0.8191, + "step": 3869 + }, + { + "epoch": 0.16967324846349455, + "grad_norm": 0.8984375, + "learning_rate": 3.566639620799611e-05, + "loss": 1.0053, + "step": 3870 + }, + { + "epoch": 0.16971709168015178, + "grad_norm": 1.0234375, + "learning_rate": 3.566287172631011e-05, + "loss": 0.7574, + "step": 3871 + }, + { + "epoch": 0.16976093489680902, + "grad_norm": 0.83984375, + "learning_rate": 3.5659347380982986e-05, + "loss": 0.7676, + "step": 3872 + }, + { + "epoch": 0.16980477811346625, + "grad_norm": 0.859375, + "learning_rate": 3.565582317202219e-05, + "loss": 0.7925, + "step": 3873 + }, + { + "epoch": 0.1698486213301235, + "grad_norm": 0.796875, + "learning_rate": 3.565229909943519e-05, + "loss": 0.8624, + "step": 3874 + }, + { + "epoch": 0.16989246454678072, + "grad_norm": 0.99609375, + "learning_rate": 3.564877516322945e-05, + "loss": 1.1006, + "step": 3875 + }, + { + "epoch": 0.16993630776343793, + "grad_norm": 0.70703125, + "learning_rate": 3.564525136341248e-05, + "loss": 0.7022, + "step": 3876 + }, + { + "epoch": 0.16998015098009517, + "grad_norm": 0.81640625, + "learning_rate": 3.564172769999171e-05, + "loss": 0.9472, + "step": 3877 + }, + { + "epoch": 0.1700239941967524, + "grad_norm": 0.828125, + "learning_rate": 3.563820417297463e-05, + "loss": 0.7678, + "step": 3878 + }, + { + "epoch": 0.17006783741340964, + "grad_norm": 0.94921875, + "learning_rate": 3.56346807823687e-05, + "loss": 0.7386, + "step": 3879 + }, + { + "epoch": 0.17011168063006687, + "grad_norm": 0.76171875, + "learning_rate": 3.563115752818135e-05, + "loss": 0.8479, + "step": 3880 + }, + { + "epoch": 0.1701555238467241, + "grad_norm": 0.796875, + "learning_rate": 3.562763441042011e-05, + "loss": 0.7283, + "step": 3881 + }, + { + "epoch": 0.17019936706338135, + "grad_norm": 0.8203125, + "learning_rate": 3.5624111429092424e-05, + "loss": 0.7943, + "step": 3882 + }, + { + "epoch": 0.17024321028003858, + "grad_norm": 0.80859375, + "learning_rate": 3.562058858420575e-05, + "loss": 0.791, + "step": 3883 + }, + { + "epoch": 0.17028705349669582, + "grad_norm": 0.86328125, + "learning_rate": 3.561706587576756e-05, + "loss": 0.9216, + "step": 3884 + }, + { + "epoch": 0.17033089671335305, + "grad_norm": 0.921875, + "learning_rate": 3.561354330378532e-05, + "loss": 0.8367, + "step": 3885 + }, + { + "epoch": 0.1703747399300103, + "grad_norm": 0.7734375, + "learning_rate": 3.5610020868266506e-05, + "loss": 0.7818, + "step": 3886 + }, + { + "epoch": 0.17041858314666752, + "grad_norm": 0.80078125, + "learning_rate": 3.560649856921855e-05, + "loss": 0.7612, + "step": 3887 + }, + { + "epoch": 0.17046242636332476, + "grad_norm": 0.78125, + "learning_rate": 3.5602976406648956e-05, + "loss": 0.7275, + "step": 3888 + }, + { + "epoch": 0.170506269579982, + "grad_norm": 0.99609375, + "learning_rate": 3.559945438056513e-05, + "loss": 0.9648, + "step": 3889 + }, + { + "epoch": 0.17055011279663923, + "grad_norm": 0.9140625, + "learning_rate": 3.559593249097463e-05, + "loss": 0.8957, + "step": 3890 + }, + { + "epoch": 0.17059395601329644, + "grad_norm": 0.83984375, + "learning_rate": 3.559241073788485e-05, + "loss": 0.8626, + "step": 3891 + }, + { + "epoch": 0.17063779922995367, + "grad_norm": 0.91796875, + "learning_rate": 3.558888912130328e-05, + "loss": 0.8924, + "step": 3892 + }, + { + "epoch": 0.1706816424466109, + "grad_norm": 0.86328125, + "learning_rate": 3.558536764123738e-05, + "loss": 0.8011, + "step": 3893 + }, + { + "epoch": 0.17072548566326815, + "grad_norm": 0.85546875, + "learning_rate": 3.558184629769461e-05, + "loss": 0.8931, + "step": 3894 + }, + { + "epoch": 0.17076932887992538, + "grad_norm": 0.83203125, + "learning_rate": 3.5578325090682405e-05, + "loss": 0.7877, + "step": 3895 + }, + { + "epoch": 0.17081317209658262, + "grad_norm": 0.77734375, + "learning_rate": 3.557480402020828e-05, + "loss": 0.7446, + "step": 3896 + }, + { + "epoch": 0.17085701531323985, + "grad_norm": 0.87109375, + "learning_rate": 3.5571283086279694e-05, + "loss": 0.9187, + "step": 3897 + }, + { + "epoch": 0.1709008585298971, + "grad_norm": 0.828125, + "learning_rate": 3.5567762288904085e-05, + "loss": 0.6594, + "step": 3898 + }, + { + "epoch": 0.17094470174655432, + "grad_norm": 0.8203125, + "learning_rate": 3.556424162808891e-05, + "loss": 0.7776, + "step": 3899 + }, + { + "epoch": 0.17098854496321156, + "grad_norm": 0.83984375, + "learning_rate": 3.556072110384162e-05, + "loss": 0.8009, + "step": 3900 + }, + { + "epoch": 0.1710323881798688, + "grad_norm": 0.8828125, + "learning_rate": 3.5557200716169724e-05, + "loss": 0.9161, + "step": 3901 + }, + { + "epoch": 0.17107623139652603, + "grad_norm": 0.88671875, + "learning_rate": 3.555368046508066e-05, + "loss": 1.0037, + "step": 3902 + }, + { + "epoch": 0.17112007461318327, + "grad_norm": 0.76171875, + "learning_rate": 3.5550160350581896e-05, + "loss": 0.7871, + "step": 3903 + }, + { + "epoch": 0.1711639178298405, + "grad_norm": 0.7734375, + "learning_rate": 3.5546640372680875e-05, + "loss": 0.7646, + "step": 3904 + }, + { + "epoch": 0.17120776104649774, + "grad_norm": 0.8359375, + "learning_rate": 3.554312053138503e-05, + "loss": 0.7491, + "step": 3905 + }, + { + "epoch": 0.17125160426315494, + "grad_norm": 0.80859375, + "learning_rate": 3.5539600826701894e-05, + "loss": 0.7784, + "step": 3906 + }, + { + "epoch": 0.17129544747981218, + "grad_norm": 0.81640625, + "learning_rate": 3.553608125863889e-05, + "loss": 0.7895, + "step": 3907 + }, + { + "epoch": 0.17133929069646942, + "grad_norm": 0.7421875, + "learning_rate": 3.553256182720347e-05, + "loss": 0.8027, + "step": 3908 + }, + { + "epoch": 0.17138313391312665, + "grad_norm": 0.80859375, + "learning_rate": 3.5529042532403114e-05, + "loss": 0.7421, + "step": 3909 + }, + { + "epoch": 0.1714269771297839, + "grad_norm": 0.84765625, + "learning_rate": 3.552552337424522e-05, + "loss": 0.7515, + "step": 3910 + }, + { + "epoch": 0.17147082034644112, + "grad_norm": 0.7734375, + "learning_rate": 3.552200435273734e-05, + "loss": 0.8502, + "step": 3911 + }, + { + "epoch": 0.17151466356309836, + "grad_norm": 0.77734375, + "learning_rate": 3.551848546788687e-05, + "loss": 0.7668, + "step": 3912 + }, + { + "epoch": 0.1715585067797556, + "grad_norm": 0.85546875, + "learning_rate": 3.5514966719701305e-05, + "loss": 0.7947, + "step": 3913 + }, + { + "epoch": 0.17160234999641283, + "grad_norm": 0.8359375, + "learning_rate": 3.5511448108188074e-05, + "loss": 0.8324, + "step": 3914 + }, + { + "epoch": 0.17164619321307006, + "grad_norm": 0.8984375, + "learning_rate": 3.55079296333546e-05, + "loss": 0.7993, + "step": 3915 + }, + { + "epoch": 0.1716900364297273, + "grad_norm": 0.84375, + "learning_rate": 3.5504411295208426e-05, + "loss": 0.8724, + "step": 3916 + }, + { + "epoch": 0.17173387964638454, + "grad_norm": 0.83203125, + "learning_rate": 3.550089309375696e-05, + "loss": 0.9319, + "step": 3917 + }, + { + "epoch": 0.17177772286304177, + "grad_norm": 0.81640625, + "learning_rate": 3.5497375029007675e-05, + "loss": 0.9181, + "step": 3918 + }, + { + "epoch": 0.171821566079699, + "grad_norm": 0.84765625, + "learning_rate": 3.5493857100968006e-05, + "loss": 0.8741, + "step": 3919 + }, + { + "epoch": 0.17186540929635624, + "grad_norm": 0.83203125, + "learning_rate": 3.549033930964539e-05, + "loss": 0.7657, + "step": 3920 + }, + { + "epoch": 0.17190925251301348, + "grad_norm": 0.7734375, + "learning_rate": 3.5486821655047344e-05, + "loss": 0.7251, + "step": 3921 + }, + { + "epoch": 0.17195309572967069, + "grad_norm": 0.80859375, + "learning_rate": 3.548330413718129e-05, + "loss": 0.7313, + "step": 3922 + }, + { + "epoch": 0.17199693894632792, + "grad_norm": 0.875, + "learning_rate": 3.54797867560547e-05, + "loss": 0.9797, + "step": 3923 + }, + { + "epoch": 0.17204078216298516, + "grad_norm": 0.82421875, + "learning_rate": 3.5476269511675e-05, + "loss": 0.7853, + "step": 3924 + }, + { + "epoch": 0.1720846253796424, + "grad_norm": 0.8046875, + "learning_rate": 3.5472752404049656e-05, + "loss": 0.7547, + "step": 3925 + }, + { + "epoch": 0.17212846859629963, + "grad_norm": 0.81640625, + "learning_rate": 3.54692354331861e-05, + "loss": 0.9019, + "step": 3926 + }, + { + "epoch": 0.17217231181295686, + "grad_norm": 0.86328125, + "learning_rate": 3.5465718599091846e-05, + "loss": 0.8958, + "step": 3927 + }, + { + "epoch": 0.1722161550296141, + "grad_norm": 0.875, + "learning_rate": 3.5462201901774305e-05, + "loss": 0.7502, + "step": 3928 + }, + { + "epoch": 0.17225999824627133, + "grad_norm": 0.83984375, + "learning_rate": 3.545868534124095e-05, + "loss": 0.7701, + "step": 3929 + }, + { + "epoch": 0.17230384146292857, + "grad_norm": 0.7578125, + "learning_rate": 3.5455168917499215e-05, + "loss": 0.7216, + "step": 3930 + }, + { + "epoch": 0.1723476846795858, + "grad_norm": 0.7890625, + "learning_rate": 3.545165263055657e-05, + "loss": 0.7354, + "step": 3931 + }, + { + "epoch": 0.17239152789624304, + "grad_norm": 0.78515625, + "learning_rate": 3.5448136480420434e-05, + "loss": 0.7589, + "step": 3932 + }, + { + "epoch": 0.17243537111290028, + "grad_norm": 0.796875, + "learning_rate": 3.54446204670983e-05, + "loss": 0.8154, + "step": 3933 + }, + { + "epoch": 0.1724792143295575, + "grad_norm": 0.890625, + "learning_rate": 3.5441104590597605e-05, + "loss": 0.8815, + "step": 3934 + }, + { + "epoch": 0.17252305754621475, + "grad_norm": 0.76953125, + "learning_rate": 3.543758885092576e-05, + "loss": 0.8131, + "step": 3935 + }, + { + "epoch": 0.17256690076287198, + "grad_norm": 0.91015625, + "learning_rate": 3.543407324809029e-05, + "loss": 0.8821, + "step": 3936 + }, + { + "epoch": 0.1726107439795292, + "grad_norm": 0.80859375, + "learning_rate": 3.5430557782098604e-05, + "loss": 0.7131, + "step": 3937 + }, + { + "epoch": 0.17265458719618643, + "grad_norm": 0.9296875, + "learning_rate": 3.542704245295817e-05, + "loss": 0.7501, + "step": 3938 + }, + { + "epoch": 0.17269843041284366, + "grad_norm": 1.03125, + "learning_rate": 3.542352726067643e-05, + "loss": 0.8033, + "step": 3939 + }, + { + "epoch": 0.1727422736295009, + "grad_norm": 0.81640625, + "learning_rate": 3.54200122052608e-05, + "loss": 0.756, + "step": 3940 + }, + { + "epoch": 0.17278611684615813, + "grad_norm": 0.8359375, + "learning_rate": 3.541649728671878e-05, + "loss": 0.8335, + "step": 3941 + }, + { + "epoch": 0.17282996006281537, + "grad_norm": 0.8359375, + "learning_rate": 3.541298250505782e-05, + "loss": 0.8044, + "step": 3942 + }, + { + "epoch": 0.1728738032794726, + "grad_norm": 0.91015625, + "learning_rate": 3.540946786028535e-05, + "loss": 0.8985, + "step": 3943 + }, + { + "epoch": 0.17291764649612984, + "grad_norm": 0.9609375, + "learning_rate": 3.5405953352408814e-05, + "loss": 0.7756, + "step": 3944 + }, + { + "epoch": 0.17296148971278708, + "grad_norm": 0.84375, + "learning_rate": 3.540243898143564e-05, + "loss": 0.862, + "step": 3945 + }, + { + "epoch": 0.1730053329294443, + "grad_norm": 0.875, + "learning_rate": 3.539892474737333e-05, + "loss": 0.8202, + "step": 3946 + }, + { + "epoch": 0.17304917614610155, + "grad_norm": 0.89453125, + "learning_rate": 3.539541065022932e-05, + "loss": 0.9316, + "step": 3947 + }, + { + "epoch": 0.17309301936275878, + "grad_norm": 0.75390625, + "learning_rate": 3.539189669001104e-05, + "loss": 0.8538, + "step": 3948 + }, + { + "epoch": 0.17313686257941602, + "grad_norm": 0.75, + "learning_rate": 3.5388382866725934e-05, + "loss": 0.7513, + "step": 3949 + }, + { + "epoch": 0.17318070579607325, + "grad_norm": 1.0703125, + "learning_rate": 3.538486918038143e-05, + "loss": 0.8444, + "step": 3950 + }, + { + "epoch": 0.1732245490127305, + "grad_norm": 0.85546875, + "learning_rate": 3.5381355630985034e-05, + "loss": 0.8119, + "step": 3951 + }, + { + "epoch": 0.1732683922293877, + "grad_norm": 0.890625, + "learning_rate": 3.5377842218544164e-05, + "loss": 0.8773, + "step": 3952 + }, + { + "epoch": 0.17331223544604493, + "grad_norm": 0.78515625, + "learning_rate": 3.537432894306626e-05, + "loss": 0.814, + "step": 3953 + }, + { + "epoch": 0.17335607866270217, + "grad_norm": 0.7734375, + "learning_rate": 3.5370815804558776e-05, + "loss": 0.8468, + "step": 3954 + }, + { + "epoch": 0.1733999218793594, + "grad_norm": 0.87109375, + "learning_rate": 3.536730280302911e-05, + "loss": 0.8558, + "step": 3955 + }, + { + "epoch": 0.17344376509601664, + "grad_norm": 0.81640625, + "learning_rate": 3.53637899384848e-05, + "loss": 0.7074, + "step": 3956 + }, + { + "epoch": 0.17348760831267387, + "grad_norm": 0.81640625, + "learning_rate": 3.536027721093323e-05, + "loss": 0.8637, + "step": 3957 + }, + { + "epoch": 0.1735314515293311, + "grad_norm": 0.75, + "learning_rate": 3.535676462038187e-05, + "loss": 0.8844, + "step": 3958 + }, + { + "epoch": 0.17357529474598835, + "grad_norm": 0.91796875, + "learning_rate": 3.535325216683815e-05, + "loss": 0.6986, + "step": 3959 + }, + { + "epoch": 0.17361913796264558, + "grad_norm": 0.80078125, + "learning_rate": 3.5349739850309486e-05, + "loss": 0.7911, + "step": 3960 + }, + { + "epoch": 0.17366298117930282, + "grad_norm": 0.80859375, + "learning_rate": 3.534622767080337e-05, + "loss": 0.9161, + "step": 3961 + }, + { + "epoch": 0.17370682439596005, + "grad_norm": 0.7890625, + "learning_rate": 3.534271562832724e-05, + "loss": 0.752, + "step": 3962 + }, + { + "epoch": 0.1737506676126173, + "grad_norm": 0.78515625, + "learning_rate": 3.533920372288852e-05, + "loss": 0.8556, + "step": 3963 + }, + { + "epoch": 0.17379451082927452, + "grad_norm": 0.82421875, + "learning_rate": 3.533569195449468e-05, + "loss": 0.8358, + "step": 3964 + }, + { + "epoch": 0.17383835404593176, + "grad_norm": 0.87109375, + "learning_rate": 3.53321803231531e-05, + "loss": 1.011, + "step": 3965 + }, + { + "epoch": 0.173882197262589, + "grad_norm": 0.83203125, + "learning_rate": 3.5328668828871306e-05, + "loss": 0.8758, + "step": 3966 + }, + { + "epoch": 0.1739260404792462, + "grad_norm": 0.82421875, + "learning_rate": 3.53251574716567e-05, + "loss": 0.8646, + "step": 3967 + }, + { + "epoch": 0.17396988369590344, + "grad_norm": 0.80859375, + "learning_rate": 3.5321646251516727e-05, + "loss": 0.7888, + "step": 3968 + }, + { + "epoch": 0.17401372691256067, + "grad_norm": 0.8671875, + "learning_rate": 3.531813516845882e-05, + "loss": 0.8706, + "step": 3969 + }, + { + "epoch": 0.1740575701292179, + "grad_norm": 1.21875, + "learning_rate": 3.531462422249043e-05, + "loss": 0.7303, + "step": 3970 + }, + { + "epoch": 0.17410141334587514, + "grad_norm": 0.7890625, + "learning_rate": 3.531111341361901e-05, + "loss": 0.8614, + "step": 3971 + }, + { + "epoch": 0.17414525656253238, + "grad_norm": 0.77734375, + "learning_rate": 3.530760274185197e-05, + "loss": 0.7444, + "step": 3972 + }, + { + "epoch": 0.17418909977918962, + "grad_norm": 0.80859375, + "learning_rate": 3.530409220719678e-05, + "loss": 0.818, + "step": 3973 + }, + { + "epoch": 0.17423294299584685, + "grad_norm": 0.7421875, + "learning_rate": 3.530058180966087e-05, + "loss": 0.7022, + "step": 3974 + }, + { + "epoch": 0.1742767862125041, + "grad_norm": 0.87109375, + "learning_rate": 3.5297071549251635e-05, + "loss": 0.8508, + "step": 3975 + }, + { + "epoch": 0.17432062942916132, + "grad_norm": 0.83984375, + "learning_rate": 3.529356142597661e-05, + "loss": 0.8557, + "step": 3976 + }, + { + "epoch": 0.17436447264581856, + "grad_norm": 0.8671875, + "learning_rate": 3.5290051439843165e-05, + "loss": 0.8401, + "step": 3977 + }, + { + "epoch": 0.1744083158624758, + "grad_norm": 0.77734375, + "learning_rate": 3.528654159085877e-05, + "loss": 0.7816, + "step": 3978 + }, + { + "epoch": 0.17445215907913303, + "grad_norm": 0.84765625, + "learning_rate": 3.5283031879030835e-05, + "loss": 0.7893, + "step": 3979 + }, + { + "epoch": 0.17449600229579026, + "grad_norm": 0.8515625, + "learning_rate": 3.527952230436683e-05, + "loss": 0.7345, + "step": 3980 + }, + { + "epoch": 0.1745398455124475, + "grad_norm": 0.75390625, + "learning_rate": 3.5276012866874144e-05, + "loss": 0.7949, + "step": 3981 + }, + { + "epoch": 0.1745836887291047, + "grad_norm": 0.8515625, + "learning_rate": 3.527250356656027e-05, + "loss": 0.8653, + "step": 3982 + }, + { + "epoch": 0.17462753194576194, + "grad_norm": 0.84765625, + "learning_rate": 3.526899440343263e-05, + "loss": 0.7847, + "step": 3983 + }, + { + "epoch": 0.17467137516241918, + "grad_norm": 0.76953125, + "learning_rate": 3.5265485377498666e-05, + "loss": 0.7654, + "step": 3984 + }, + { + "epoch": 0.17471521837907641, + "grad_norm": 0.76171875, + "learning_rate": 3.52619764887658e-05, + "loss": 0.7073, + "step": 3985 + }, + { + "epoch": 0.17475906159573365, + "grad_norm": 0.859375, + "learning_rate": 3.525846773724144e-05, + "loss": 0.7529, + "step": 3986 + }, + { + "epoch": 0.17480290481239089, + "grad_norm": 0.7734375, + "learning_rate": 3.52549591229331e-05, + "loss": 0.8363, + "step": 3987 + }, + { + "epoch": 0.17484674802904812, + "grad_norm": 0.81640625, + "learning_rate": 3.525145064584815e-05, + "loss": 0.7988, + "step": 3988 + }, + { + "epoch": 0.17489059124570536, + "grad_norm": 0.76171875, + "learning_rate": 3.524794230599406e-05, + "loss": 0.7262, + "step": 3989 + }, + { + "epoch": 0.1749344344623626, + "grad_norm": 0.8671875, + "learning_rate": 3.524443410337827e-05, + "loss": 0.6951, + "step": 3990 + }, + { + "epoch": 0.17497827767901983, + "grad_norm": 0.87109375, + "learning_rate": 3.524092603800816e-05, + "loss": 0.7389, + "step": 3991 + }, + { + "epoch": 0.17502212089567706, + "grad_norm": 0.80078125, + "learning_rate": 3.523741810989123e-05, + "loss": 0.7826, + "step": 3992 + }, + { + "epoch": 0.1750659641123343, + "grad_norm": 0.875, + "learning_rate": 3.52339103190349e-05, + "loss": 0.8631, + "step": 3993 + }, + { + "epoch": 0.17510980732899153, + "grad_norm": 0.82421875, + "learning_rate": 3.523040266544659e-05, + "loss": 0.7577, + "step": 3994 + }, + { + "epoch": 0.17515365054564877, + "grad_norm": 0.78125, + "learning_rate": 3.522689514913374e-05, + "loss": 0.7737, + "step": 3995 + }, + { + "epoch": 0.175197493762306, + "grad_norm": 0.75390625, + "learning_rate": 3.522338777010376e-05, + "loss": 0.8777, + "step": 3996 + }, + { + "epoch": 0.17524133697896324, + "grad_norm": 0.77734375, + "learning_rate": 3.5219880528364126e-05, + "loss": 0.8359, + "step": 3997 + }, + { + "epoch": 0.17528518019562045, + "grad_norm": 0.765625, + "learning_rate": 3.521637342392225e-05, + "loss": 0.7779, + "step": 3998 + }, + { + "epoch": 0.17532902341227768, + "grad_norm": 0.9296875, + "learning_rate": 3.5212866456785587e-05, + "loss": 0.8002, + "step": 3999 + }, + { + "epoch": 0.17537286662893492, + "grad_norm": 0.7421875, + "learning_rate": 3.520935962696153e-05, + "loss": 0.7856, + "step": 4000 + }, + { + "epoch": 0.17537286662893492, + "eval_loss": 0.804158091545105, + "eval_runtime": 294.0775, + "eval_samples_per_second": 34.005, + "eval_steps_per_second": 0.711, + "step": 4000 + }, + { + "epoch": 0.17541670984559216, + "grad_norm": 0.68359375, + "learning_rate": 3.520585293445752e-05, + "loss": 0.7417, + "step": 4001 + }, + { + "epoch": 0.1754605530622494, + "grad_norm": 0.8203125, + "learning_rate": 3.520234637928102e-05, + "loss": 0.8393, + "step": 4002 + }, + { + "epoch": 0.17550439627890663, + "grad_norm": 0.83203125, + "learning_rate": 3.5198839961439444e-05, + "loss": 0.7764, + "step": 4003 + }, + { + "epoch": 0.17554823949556386, + "grad_norm": 0.859375, + "learning_rate": 3.5195333680940224e-05, + "loss": 0.741, + "step": 4004 + }, + { + "epoch": 0.1755920827122211, + "grad_norm": 0.7421875, + "learning_rate": 3.519182753779079e-05, + "loss": 0.5566, + "step": 4005 + }, + { + "epoch": 0.17563592592887833, + "grad_norm": 0.86328125, + "learning_rate": 3.5188321531998545e-05, + "loss": 0.8487, + "step": 4006 + }, + { + "epoch": 0.17567976914553557, + "grad_norm": 0.9609375, + "learning_rate": 3.518481566357098e-05, + "loss": 0.8922, + "step": 4007 + }, + { + "epoch": 0.1757236123621928, + "grad_norm": 0.78515625, + "learning_rate": 3.518130993251549e-05, + "loss": 0.7862, + "step": 4008 + }, + { + "epoch": 0.17576745557885004, + "grad_norm": 0.8125, + "learning_rate": 3.51778043388395e-05, + "loss": 0.8998, + "step": 4009 + }, + { + "epoch": 0.17581129879550728, + "grad_norm": 0.7890625, + "learning_rate": 3.517429888255047e-05, + "loss": 0.7713, + "step": 4010 + }, + { + "epoch": 0.1758551420121645, + "grad_norm": 0.87109375, + "learning_rate": 3.517079356365579e-05, + "loss": 0.8879, + "step": 4011 + }, + { + "epoch": 0.17589898522882175, + "grad_norm": 0.87890625, + "learning_rate": 3.516728838216289e-05, + "loss": 0.7926, + "step": 4012 + }, + { + "epoch": 0.17594282844547895, + "grad_norm": 0.85546875, + "learning_rate": 3.516378333807924e-05, + "loss": 0.8494, + "step": 4013 + }, + { + "epoch": 0.1759866716621362, + "grad_norm": 0.90625, + "learning_rate": 3.516027843141224e-05, + "loss": 0.8232, + "step": 4014 + }, + { + "epoch": 0.17603051487879343, + "grad_norm": 0.8203125, + "learning_rate": 3.515677366216933e-05, + "loss": 0.7265, + "step": 4015 + }, + { + "epoch": 0.17607435809545066, + "grad_norm": 0.84375, + "learning_rate": 3.5153269030357935e-05, + "loss": 0.778, + "step": 4016 + }, + { + "epoch": 0.1761182013121079, + "grad_norm": 2.8125, + "learning_rate": 3.5149764535985476e-05, + "loss": 0.76, + "step": 4017 + }, + { + "epoch": 0.17616204452876513, + "grad_norm": 0.8515625, + "learning_rate": 3.514626017905939e-05, + "loss": 0.8951, + "step": 4018 + }, + { + "epoch": 0.17620588774542237, + "grad_norm": 0.84765625, + "learning_rate": 3.5142755959587094e-05, + "loss": 0.8081, + "step": 4019 + }, + { + "epoch": 0.1762497309620796, + "grad_norm": 0.953125, + "learning_rate": 3.513925187757602e-05, + "loss": 0.7772, + "step": 4020 + }, + { + "epoch": 0.17629357417873684, + "grad_norm": 0.8046875, + "learning_rate": 3.5135747933033556e-05, + "loss": 0.6822, + "step": 4021 + }, + { + "epoch": 0.17633741739539407, + "grad_norm": 0.796875, + "learning_rate": 3.5132244125967205e-05, + "loss": 0.7888, + "step": 4022 + }, + { + "epoch": 0.1763812606120513, + "grad_norm": 0.76953125, + "learning_rate": 3.512874045638436e-05, + "loss": 0.7292, + "step": 4023 + }, + { + "epoch": 0.17642510382870855, + "grad_norm": 0.859375, + "learning_rate": 3.5125236924292434e-05, + "loss": 0.8758, + "step": 4024 + }, + { + "epoch": 0.17646894704536578, + "grad_norm": 0.86328125, + "learning_rate": 3.512173352969887e-05, + "loss": 0.8405, + "step": 4025 + }, + { + "epoch": 0.17651279026202302, + "grad_norm": 0.90234375, + "learning_rate": 3.511823027261103e-05, + "loss": 0.9625, + "step": 4026 + }, + { + "epoch": 0.17655663347868025, + "grad_norm": 0.97265625, + "learning_rate": 3.511472715303644e-05, + "loss": 0.7438, + "step": 4027 + }, + { + "epoch": 0.17660047669533746, + "grad_norm": 0.828125, + "learning_rate": 3.511122417098248e-05, + "loss": 0.7666, + "step": 4028 + }, + { + "epoch": 0.1766443199119947, + "grad_norm": 0.7265625, + "learning_rate": 3.510772132645657e-05, + "loss": 0.7121, + "step": 4029 + }, + { + "epoch": 0.17668816312865193, + "grad_norm": 0.77734375, + "learning_rate": 3.510421861946613e-05, + "loss": 0.7661, + "step": 4030 + }, + { + "epoch": 0.17673200634530917, + "grad_norm": 0.88671875, + "learning_rate": 3.5100716050018554e-05, + "loss": 0.9686, + "step": 4031 + }, + { + "epoch": 0.1767758495619664, + "grad_norm": 0.8359375, + "learning_rate": 3.509721361812135e-05, + "loss": 0.7528, + "step": 4032 + }, + { + "epoch": 0.17681969277862364, + "grad_norm": 0.7734375, + "learning_rate": 3.509371132378188e-05, + "loss": 0.7474, + "step": 4033 + }, + { + "epoch": 0.17686353599528087, + "grad_norm": 0.8671875, + "learning_rate": 3.5090209167007585e-05, + "loss": 0.8243, + "step": 4034 + }, + { + "epoch": 0.1769073792119381, + "grad_norm": 0.9453125, + "learning_rate": 3.508670714780588e-05, + "loss": 0.9186, + "step": 4035 + }, + { + "epoch": 0.17695122242859535, + "grad_norm": 0.80859375, + "learning_rate": 3.508320526618414e-05, + "loss": 0.8123, + "step": 4036 + }, + { + "epoch": 0.17699506564525258, + "grad_norm": 0.8359375, + "learning_rate": 3.5079703522149886e-05, + "loss": 0.8254, + "step": 4037 + }, + { + "epoch": 0.17703890886190982, + "grad_norm": 0.81640625, + "learning_rate": 3.5076201915710484e-05, + "loss": 0.7569, + "step": 4038 + }, + { + "epoch": 0.17708275207856705, + "grad_norm": 0.81640625, + "learning_rate": 3.507270044687336e-05, + "loss": 0.8168, + "step": 4039 + }, + { + "epoch": 0.1771265952952243, + "grad_norm": 0.8046875, + "learning_rate": 3.506919911564595e-05, + "loss": 0.7757, + "step": 4040 + }, + { + "epoch": 0.17717043851188152, + "grad_norm": 0.7890625, + "learning_rate": 3.506569792203561e-05, + "loss": 0.7869, + "step": 4041 + }, + { + "epoch": 0.17721428172853876, + "grad_norm": 0.8203125, + "learning_rate": 3.5062196866049854e-05, + "loss": 0.8106, + "step": 4042 + }, + { + "epoch": 0.17725812494519597, + "grad_norm": 0.83984375, + "learning_rate": 3.5058695947696054e-05, + "loss": 0.8401, + "step": 4043 + }, + { + "epoch": 0.1773019681618532, + "grad_norm": 0.83203125, + "learning_rate": 3.5055195166981645e-05, + "loss": 0.9368, + "step": 4044 + }, + { + "epoch": 0.17734581137851044, + "grad_norm": 0.890625, + "learning_rate": 3.5051694523914034e-05, + "loss": 0.8722, + "step": 4045 + }, + { + "epoch": 0.17738965459516767, + "grad_norm": 0.8203125, + "learning_rate": 3.50481940185006e-05, + "loss": 0.9111, + "step": 4046 + }, + { + "epoch": 0.1774334978118249, + "grad_norm": 0.91015625, + "learning_rate": 3.504469365074886e-05, + "loss": 0.7882, + "step": 4047 + }, + { + "epoch": 0.17747734102848214, + "grad_norm": 0.85546875, + "learning_rate": 3.504119342066616e-05, + "loss": 0.8814, + "step": 4048 + }, + { + "epoch": 0.17752118424513938, + "grad_norm": 0.80859375, + "learning_rate": 3.503769332825996e-05, + "loss": 0.7463, + "step": 4049 + }, + { + "epoch": 0.17756502746179662, + "grad_norm": 1.0390625, + "learning_rate": 3.503419337353764e-05, + "loss": 0.837, + "step": 4050 + }, + { + "epoch": 0.17760887067845385, + "grad_norm": 0.92578125, + "learning_rate": 3.50306935565066e-05, + "loss": 0.9039, + "step": 4051 + }, + { + "epoch": 0.1776527138951111, + "grad_norm": 0.76953125, + "learning_rate": 3.5027193877174327e-05, + "loss": 0.9057, + "step": 4052 + }, + { + "epoch": 0.17769655711176832, + "grad_norm": 0.8046875, + "learning_rate": 3.5023694335548204e-05, + "loss": 0.8609, + "step": 4053 + }, + { + "epoch": 0.17774040032842556, + "grad_norm": 0.8828125, + "learning_rate": 3.502019493163564e-05, + "loss": 0.7628, + "step": 4054 + }, + { + "epoch": 0.1777842435450828, + "grad_norm": 0.83984375, + "learning_rate": 3.5016695665444065e-05, + "loss": 0.8391, + "step": 4055 + }, + { + "epoch": 0.17782808676174003, + "grad_norm": 0.80078125, + "learning_rate": 3.5013196536980895e-05, + "loss": 0.837, + "step": 4056 + }, + { + "epoch": 0.17787192997839726, + "grad_norm": 0.77734375, + "learning_rate": 3.5009697546253536e-05, + "loss": 0.7604, + "step": 4057 + }, + { + "epoch": 0.17791577319505447, + "grad_norm": 0.84375, + "learning_rate": 3.500619869326941e-05, + "loss": 0.9101, + "step": 4058 + }, + { + "epoch": 0.1779596164117117, + "grad_norm": 0.8046875, + "learning_rate": 3.5002699978035924e-05, + "loss": 0.8396, + "step": 4059 + }, + { + "epoch": 0.17800345962836894, + "grad_norm": 0.87890625, + "learning_rate": 3.4999201400560476e-05, + "loss": 0.8306, + "step": 4060 + }, + { + "epoch": 0.17804730284502618, + "grad_norm": 1.0234375, + "learning_rate": 3.4995702960850526e-05, + "loss": 0.8155, + "step": 4061 + }, + { + "epoch": 0.17809114606168341, + "grad_norm": 0.828125, + "learning_rate": 3.499220465891349e-05, + "loss": 0.888, + "step": 4062 + }, + { + "epoch": 0.17813498927834065, + "grad_norm": 0.75, + "learning_rate": 3.498870649475674e-05, + "loss": 0.6611, + "step": 4063 + }, + { + "epoch": 0.17817883249499789, + "grad_norm": 0.80078125, + "learning_rate": 3.498520846838772e-05, + "loss": 0.6207, + "step": 4064 + }, + { + "epoch": 0.17822267571165512, + "grad_norm": 1.0078125, + "learning_rate": 3.498171057981383e-05, + "loss": 0.9137, + "step": 4065 + }, + { + "epoch": 0.17826651892831236, + "grad_norm": 0.87890625, + "learning_rate": 3.497821282904246e-05, + "loss": 0.8185, + "step": 4066 + }, + { + "epoch": 0.1783103621449696, + "grad_norm": 0.7265625, + "learning_rate": 3.4974715216081076e-05, + "loss": 0.7669, + "step": 4067 + }, + { + "epoch": 0.17835420536162683, + "grad_norm": 0.859375, + "learning_rate": 3.497121774093708e-05, + "loss": 0.7556, + "step": 4068 + }, + { + "epoch": 0.17839804857828406, + "grad_norm": 0.796875, + "learning_rate": 3.4967720403617856e-05, + "loss": 0.7084, + "step": 4069 + }, + { + "epoch": 0.1784418917949413, + "grad_norm": 0.81640625, + "learning_rate": 3.496422320413084e-05, + "loss": 0.8268, + "step": 4070 + }, + { + "epoch": 0.17848573501159853, + "grad_norm": 0.875, + "learning_rate": 3.49607261424834e-05, + "loss": 0.8458, + "step": 4071 + }, + { + "epoch": 0.17852957822825577, + "grad_norm": 0.83203125, + "learning_rate": 3.495722921868302e-05, + "loss": 0.7918, + "step": 4072 + }, + { + "epoch": 0.17857342144491298, + "grad_norm": 0.81640625, + "learning_rate": 3.4953732432737076e-05, + "loss": 0.8378, + "step": 4073 + }, + { + "epoch": 0.1786172646615702, + "grad_norm": 0.75390625, + "learning_rate": 3.4950235784652974e-05, + "loss": 0.7215, + "step": 4074 + }, + { + "epoch": 0.17866110787822745, + "grad_norm": 0.84765625, + "learning_rate": 3.4946739274438135e-05, + "loss": 0.8057, + "step": 4075 + }, + { + "epoch": 0.17870495109488468, + "grad_norm": 0.75390625, + "learning_rate": 3.4943242902099924e-05, + "loss": 0.7066, + "step": 4076 + }, + { + "epoch": 0.17874879431154192, + "grad_norm": 0.86328125, + "learning_rate": 3.493974666764582e-05, + "loss": 0.7313, + "step": 4077 + }, + { + "epoch": 0.17879263752819916, + "grad_norm": 0.828125, + "learning_rate": 3.4936250571083216e-05, + "loss": 0.6779, + "step": 4078 + }, + { + "epoch": 0.1788364807448564, + "grad_norm": 0.83984375, + "learning_rate": 3.4932754612419505e-05, + "loss": 0.7503, + "step": 4079 + }, + { + "epoch": 0.17888032396151363, + "grad_norm": 0.8828125, + "learning_rate": 3.49292587916621e-05, + "loss": 0.7633, + "step": 4080 + }, + { + "epoch": 0.17892416717817086, + "grad_norm": 0.87890625, + "learning_rate": 3.492576310881838e-05, + "loss": 0.813, + "step": 4081 + }, + { + "epoch": 0.1789680103948281, + "grad_norm": 0.77734375, + "learning_rate": 3.492226756389582e-05, + "loss": 0.7968, + "step": 4082 + }, + { + "epoch": 0.17901185361148533, + "grad_norm": 0.70703125, + "learning_rate": 3.491877215690179e-05, + "loss": 0.6875, + "step": 4083 + }, + { + "epoch": 0.17905569682814257, + "grad_norm": 0.765625, + "learning_rate": 3.491527688784371e-05, + "loss": 0.7323, + "step": 4084 + }, + { + "epoch": 0.1790995400447998, + "grad_norm": 0.94140625, + "learning_rate": 3.491178175672898e-05, + "loss": 0.817, + "step": 4085 + }, + { + "epoch": 0.17914338326145704, + "grad_norm": 0.74609375, + "learning_rate": 3.4908286763564966e-05, + "loss": 0.6284, + "step": 4086 + }, + { + "epoch": 0.17918722647811428, + "grad_norm": 0.7421875, + "learning_rate": 3.4904791908359156e-05, + "loss": 0.738, + "step": 4087 + }, + { + "epoch": 0.1792310696947715, + "grad_norm": 0.765625, + "learning_rate": 3.490129719111892e-05, + "loss": 0.8755, + "step": 4088 + }, + { + "epoch": 0.17927491291142872, + "grad_norm": 0.8203125, + "learning_rate": 3.489780261185166e-05, + "loss": 0.7868, + "step": 4089 + }, + { + "epoch": 0.17931875612808595, + "grad_norm": 0.86328125, + "learning_rate": 3.489430817056479e-05, + "loss": 0.8235, + "step": 4090 + }, + { + "epoch": 0.1793625993447432, + "grad_norm": 0.86328125, + "learning_rate": 3.489081386726568e-05, + "loss": 0.8719, + "step": 4091 + }, + { + "epoch": 0.17940644256140043, + "grad_norm": 0.80078125, + "learning_rate": 3.4887319701961796e-05, + "loss": 0.7279, + "step": 4092 + }, + { + "epoch": 0.17945028577805766, + "grad_norm": 0.80859375, + "learning_rate": 3.488382567466053e-05, + "loss": 0.8017, + "step": 4093 + }, + { + "epoch": 0.1794941289947149, + "grad_norm": 0.8984375, + "learning_rate": 3.488033178536926e-05, + "loss": 0.86, + "step": 4094 + }, + { + "epoch": 0.17953797221137213, + "grad_norm": 1.71875, + "learning_rate": 3.48768380340954e-05, + "loss": 0.7856, + "step": 4095 + }, + { + "epoch": 0.17958181542802937, + "grad_norm": 0.8515625, + "learning_rate": 3.487334442084638e-05, + "loss": 0.8333, + "step": 4096 + }, + { + "epoch": 0.1796256586446866, + "grad_norm": 0.80078125, + "learning_rate": 3.4869850945629533e-05, + "loss": 0.8258, + "step": 4097 + }, + { + "epoch": 0.17966950186134384, + "grad_norm": 0.80859375, + "learning_rate": 3.486635760845236e-05, + "loss": 0.7597, + "step": 4098 + }, + { + "epoch": 0.17971334507800107, + "grad_norm": 0.796875, + "learning_rate": 3.486286440932222e-05, + "loss": 0.8698, + "step": 4099 + }, + { + "epoch": 0.1797571882946583, + "grad_norm": 0.77734375, + "learning_rate": 3.48593713482465e-05, + "loss": 0.8294, + "step": 4100 + }, + { + "epoch": 0.17980103151131555, + "grad_norm": 0.86328125, + "learning_rate": 3.485587842523262e-05, + "loss": 0.8781, + "step": 4101 + }, + { + "epoch": 0.17984487472797278, + "grad_norm": 0.84375, + "learning_rate": 3.4852385640288e-05, + "loss": 0.8443, + "step": 4102 + }, + { + "epoch": 0.17988871794463002, + "grad_norm": 0.81640625, + "learning_rate": 3.484889299342001e-05, + "loss": 0.801, + "step": 4103 + }, + { + "epoch": 0.17993256116128722, + "grad_norm": 0.8046875, + "learning_rate": 3.484540048463607e-05, + "loss": 0.8896, + "step": 4104 + }, + { + "epoch": 0.17997640437794446, + "grad_norm": 0.80078125, + "learning_rate": 3.484190811394359e-05, + "loss": 0.6939, + "step": 4105 + }, + { + "epoch": 0.1800202475946017, + "grad_norm": 0.87109375, + "learning_rate": 3.4838415881349914e-05, + "loss": 0.9043, + "step": 4106 + }, + { + "epoch": 0.18006409081125893, + "grad_norm": 0.84765625, + "learning_rate": 3.4834923786862536e-05, + "loss": 0.8486, + "step": 4107 + }, + { + "epoch": 0.18010793402791617, + "grad_norm": 0.84765625, + "learning_rate": 3.4831431830488806e-05, + "loss": 0.9503, + "step": 4108 + }, + { + "epoch": 0.1801517772445734, + "grad_norm": 0.8671875, + "learning_rate": 3.482794001223614e-05, + "loss": 0.9117, + "step": 4109 + }, + { + "epoch": 0.18019562046123064, + "grad_norm": 0.87109375, + "learning_rate": 3.482444833211191e-05, + "loss": 0.8776, + "step": 4110 + }, + { + "epoch": 0.18023946367788787, + "grad_norm": 0.80078125, + "learning_rate": 3.4820956790123525e-05, + "loss": 0.7347, + "step": 4111 + }, + { + "epoch": 0.1802833068945451, + "grad_norm": 0.76953125, + "learning_rate": 3.481746538627841e-05, + "loss": 0.8106, + "step": 4112 + }, + { + "epoch": 0.18032715011120234, + "grad_norm": 0.75, + "learning_rate": 3.481397412058397e-05, + "loss": 0.8185, + "step": 4113 + }, + { + "epoch": 0.18037099332785958, + "grad_norm": 0.8203125, + "learning_rate": 3.4810482993047576e-05, + "loss": 1.0105, + "step": 4114 + }, + { + "epoch": 0.18041483654451682, + "grad_norm": 0.83203125, + "learning_rate": 3.4806992003676643e-05, + "loss": 0.7841, + "step": 4115 + }, + { + "epoch": 0.18045867976117405, + "grad_norm": 0.8359375, + "learning_rate": 3.480350115247856e-05, + "loss": 0.825, + "step": 4116 + }, + { + "epoch": 0.1805025229778313, + "grad_norm": 0.83203125, + "learning_rate": 3.4800010439460696e-05, + "loss": 0.8283, + "step": 4117 + }, + { + "epoch": 0.18054636619448852, + "grad_norm": 0.71484375, + "learning_rate": 3.4796519864630514e-05, + "loss": 0.6238, + "step": 4118 + }, + { + "epoch": 0.18059020941114573, + "grad_norm": 0.78515625, + "learning_rate": 3.47930294279954e-05, + "loss": 0.8125, + "step": 4119 + }, + { + "epoch": 0.18063405262780297, + "grad_norm": 0.8359375, + "learning_rate": 3.478953912956271e-05, + "loss": 0.7909, + "step": 4120 + }, + { + "epoch": 0.1806778958444602, + "grad_norm": 0.80859375, + "learning_rate": 3.478604896933987e-05, + "loss": 0.8055, + "step": 4121 + }, + { + "epoch": 0.18072173906111744, + "grad_norm": 0.8203125, + "learning_rate": 3.4782558947334245e-05, + "loss": 0.7322, + "step": 4122 + }, + { + "epoch": 0.18076558227777467, + "grad_norm": 0.79296875, + "learning_rate": 3.4779069063553284e-05, + "loss": 0.8162, + "step": 4123 + }, + { + "epoch": 0.1808094254944319, + "grad_norm": 0.796875, + "learning_rate": 3.477557931800436e-05, + "loss": 0.8161, + "step": 4124 + }, + { + "epoch": 0.18085326871108914, + "grad_norm": 0.92578125, + "learning_rate": 3.477208971069488e-05, + "loss": 0.848, + "step": 4125 + }, + { + "epoch": 0.18089711192774638, + "grad_norm": 0.78515625, + "learning_rate": 3.476860024163221e-05, + "loss": 0.7934, + "step": 4126 + }, + { + "epoch": 0.18094095514440361, + "grad_norm": 0.86328125, + "learning_rate": 3.476511091082374e-05, + "loss": 0.8436, + "step": 4127 + }, + { + "epoch": 0.18098479836106085, + "grad_norm": 0.9140625, + "learning_rate": 3.476162171827691e-05, + "loss": 0.9059, + "step": 4128 + }, + { + "epoch": 0.18102864157771809, + "grad_norm": 0.890625, + "learning_rate": 3.475813266399911e-05, + "loss": 0.8116, + "step": 4129 + }, + { + "epoch": 0.18107248479437532, + "grad_norm": 0.7890625, + "learning_rate": 3.475464374799772e-05, + "loss": 0.8995, + "step": 4130 + }, + { + "epoch": 0.18111632801103256, + "grad_norm": 0.81640625, + "learning_rate": 3.475115497028012e-05, + "loss": 0.828, + "step": 4131 + }, + { + "epoch": 0.1811601712276898, + "grad_norm": 0.82421875, + "learning_rate": 3.474766633085369e-05, + "loss": 0.8599, + "step": 4132 + }, + { + "epoch": 0.18120401444434703, + "grad_norm": 0.8203125, + "learning_rate": 3.474417782972589e-05, + "loss": 0.7343, + "step": 4133 + }, + { + "epoch": 0.18124785766100424, + "grad_norm": 0.8359375, + "learning_rate": 3.4740689466904063e-05, + "loss": 0.7454, + "step": 4134 + }, + { + "epoch": 0.18129170087766147, + "grad_norm": 0.85546875, + "learning_rate": 3.473720124239562e-05, + "loss": 0.8216, + "step": 4135 + }, + { + "epoch": 0.1813355440943187, + "grad_norm": 0.80078125, + "learning_rate": 3.473371315620796e-05, + "loss": 0.8103, + "step": 4136 + }, + { + "epoch": 0.18137938731097594, + "grad_norm": 0.734375, + "learning_rate": 3.4730225208348424e-05, + "loss": 0.7647, + "step": 4137 + }, + { + "epoch": 0.18142323052763318, + "grad_norm": 0.78515625, + "learning_rate": 3.472673739882447e-05, + "loss": 0.7747, + "step": 4138 + }, + { + "epoch": 0.1814670737442904, + "grad_norm": 0.7421875, + "learning_rate": 3.472324972764347e-05, + "loss": 0.6724, + "step": 4139 + }, + { + "epoch": 0.18151091696094765, + "grad_norm": 0.9140625, + "learning_rate": 3.471976219481281e-05, + "loss": 0.8657, + "step": 4140 + }, + { + "epoch": 0.18155476017760488, + "grad_norm": 0.9296875, + "learning_rate": 3.471627480033989e-05, + "loss": 0.8363, + "step": 4141 + }, + { + "epoch": 0.18159860339426212, + "grad_norm": 0.7421875, + "learning_rate": 3.471278754423208e-05, + "loss": 0.8154, + "step": 4142 + }, + { + "epoch": 0.18164244661091936, + "grad_norm": 0.83984375, + "learning_rate": 3.4709300426496806e-05, + "loss": 0.7705, + "step": 4143 + }, + { + "epoch": 0.1816862898275766, + "grad_norm": 0.8671875, + "learning_rate": 3.470581344714142e-05, + "loss": 0.7644, + "step": 4144 + }, + { + "epoch": 0.18173013304423383, + "grad_norm": 0.80859375, + "learning_rate": 3.470232660617334e-05, + "loss": 0.7371, + "step": 4145 + }, + { + "epoch": 0.18177397626089106, + "grad_norm": 1.0859375, + "learning_rate": 3.469883990359991e-05, + "loss": 0.8332, + "step": 4146 + }, + { + "epoch": 0.1818178194775483, + "grad_norm": 0.828125, + "learning_rate": 3.469535333942859e-05, + "loss": 0.8709, + "step": 4147 + }, + { + "epoch": 0.18186166269420553, + "grad_norm": 0.80859375, + "learning_rate": 3.469186691366674e-05, + "loss": 0.7774, + "step": 4148 + }, + { + "epoch": 0.18190550591086274, + "grad_norm": 0.859375, + "learning_rate": 3.468838062632174e-05, + "loss": 0.8675, + "step": 4149 + }, + { + "epoch": 0.18194934912751998, + "grad_norm": 0.828125, + "learning_rate": 3.4684894477400984e-05, + "loss": 0.8139, + "step": 4150 + }, + { + "epoch": 0.1819931923441772, + "grad_norm": 0.76171875, + "learning_rate": 3.468140846691186e-05, + "loss": 0.7342, + "step": 4151 + }, + { + "epoch": 0.18203703556083445, + "grad_norm": 0.78125, + "learning_rate": 3.467792259486172e-05, + "loss": 0.8484, + "step": 4152 + }, + { + "epoch": 0.18208087877749168, + "grad_norm": 0.80078125, + "learning_rate": 3.467443686125803e-05, + "loss": 0.7017, + "step": 4153 + }, + { + "epoch": 0.18212472199414892, + "grad_norm": 1.015625, + "learning_rate": 3.467095126610814e-05, + "loss": 0.8213, + "step": 4154 + }, + { + "epoch": 0.18216856521080615, + "grad_norm": 0.87890625, + "learning_rate": 3.466746580941943e-05, + "loss": 0.8208, + "step": 4155 + }, + { + "epoch": 0.1822124084274634, + "grad_norm": 0.81640625, + "learning_rate": 3.4663980491199296e-05, + "loss": 0.8553, + "step": 4156 + }, + { + "epoch": 0.18225625164412063, + "grad_norm": 0.84375, + "learning_rate": 3.466049531145509e-05, + "loss": 0.91, + "step": 4157 + }, + { + "epoch": 0.18230009486077786, + "grad_norm": 0.8359375, + "learning_rate": 3.465701027019426e-05, + "loss": 0.8579, + "step": 4158 + }, + { + "epoch": 0.1823439380774351, + "grad_norm": 0.8359375, + "learning_rate": 3.4653525367424154e-05, + "loss": 0.8227, + "step": 4159 + }, + { + "epoch": 0.18238778129409233, + "grad_norm": 0.7890625, + "learning_rate": 3.465004060315217e-05, + "loss": 0.7374, + "step": 4160 + }, + { + "epoch": 0.18243162451074957, + "grad_norm": 0.84375, + "learning_rate": 3.4646555977385685e-05, + "loss": 0.8995, + "step": 4161 + }, + { + "epoch": 0.1824754677274068, + "grad_norm": 0.77734375, + "learning_rate": 3.4643071490132065e-05, + "loss": 0.7335, + "step": 4162 + }, + { + "epoch": 0.18251931094406404, + "grad_norm": 0.8203125, + "learning_rate": 3.463958714139876e-05, + "loss": 0.9437, + "step": 4163 + }, + { + "epoch": 0.18256315416072125, + "grad_norm": 0.828125, + "learning_rate": 3.4636102931193095e-05, + "loss": 0.7958, + "step": 4164 + }, + { + "epoch": 0.18260699737737848, + "grad_norm": 0.79296875, + "learning_rate": 3.463261885952248e-05, + "loss": 0.7955, + "step": 4165 + }, + { + "epoch": 0.18265084059403572, + "grad_norm": 0.78515625, + "learning_rate": 3.46291349263943e-05, + "loss": 0.7376, + "step": 4166 + }, + { + "epoch": 0.18269468381069295, + "grad_norm": 0.77734375, + "learning_rate": 3.462565113181589e-05, + "loss": 0.8776, + "step": 4167 + }, + { + "epoch": 0.1827385270273502, + "grad_norm": 0.83203125, + "learning_rate": 3.462216747579472e-05, + "loss": 0.9142, + "step": 4168 + }, + { + "epoch": 0.18278237024400742, + "grad_norm": 0.80859375, + "learning_rate": 3.461868395833812e-05, + "loss": 0.8694, + "step": 4169 + }, + { + "epoch": 0.18282621346066466, + "grad_norm": 0.8671875, + "learning_rate": 3.461520057945349e-05, + "loss": 0.8736, + "step": 4170 + }, + { + "epoch": 0.1828700566773219, + "grad_norm": 0.83203125, + "learning_rate": 3.4611717339148197e-05, + "loss": 0.7792, + "step": 4171 + }, + { + "epoch": 0.18291389989397913, + "grad_norm": 0.88671875, + "learning_rate": 3.46082342374296e-05, + "loss": 0.8872, + "step": 4172 + }, + { + "epoch": 0.18295774311063637, + "grad_norm": 0.80859375, + "learning_rate": 3.4604751274305146e-05, + "loss": 0.83, + "step": 4173 + }, + { + "epoch": 0.1830015863272936, + "grad_norm": 0.7890625, + "learning_rate": 3.460126844978219e-05, + "loss": 0.7325, + "step": 4174 + }, + { + "epoch": 0.18304542954395084, + "grad_norm": 0.87109375, + "learning_rate": 3.4597785763868096e-05, + "loss": 0.7949, + "step": 4175 + }, + { + "epoch": 0.18308927276060807, + "grad_norm": 0.7421875, + "learning_rate": 3.459430321657027e-05, + "loss": 0.6974, + "step": 4176 + }, + { + "epoch": 0.1831331159772653, + "grad_norm": 0.76171875, + "learning_rate": 3.459082080789604e-05, + "loss": 0.7405, + "step": 4177 + }, + { + "epoch": 0.18317695919392254, + "grad_norm": 0.8515625, + "learning_rate": 3.458733853785287e-05, + "loss": 0.9467, + "step": 4178 + }, + { + "epoch": 0.18322080241057978, + "grad_norm": 0.81640625, + "learning_rate": 3.458385640644808e-05, + "loss": 0.723, + "step": 4179 + }, + { + "epoch": 0.183264645627237, + "grad_norm": 0.85546875, + "learning_rate": 3.4580374413689076e-05, + "loss": 0.9218, + "step": 4180 + }, + { + "epoch": 0.18330848884389422, + "grad_norm": 0.75390625, + "learning_rate": 3.457689255958323e-05, + "loss": 0.7853, + "step": 4181 + }, + { + "epoch": 0.18335233206055146, + "grad_norm": 0.81640625, + "learning_rate": 3.457341084413792e-05, + "loss": 0.7758, + "step": 4182 + }, + { + "epoch": 0.1833961752772087, + "grad_norm": 0.7890625, + "learning_rate": 3.456992926736049e-05, + "loss": 0.8554, + "step": 4183 + }, + { + "epoch": 0.18344001849386593, + "grad_norm": 0.78125, + "learning_rate": 3.4566447829258385e-05, + "loss": 0.7293, + "step": 4184 + }, + { + "epoch": 0.18348386171052317, + "grad_norm": 0.80859375, + "learning_rate": 3.456296652983896e-05, + "loss": 0.8318, + "step": 4185 + }, + { + "epoch": 0.1835277049271804, + "grad_norm": 0.9296875, + "learning_rate": 3.455948536910959e-05, + "loss": 0.8746, + "step": 4186 + }, + { + "epoch": 0.18357154814383764, + "grad_norm": 0.86328125, + "learning_rate": 3.455600434707764e-05, + "loss": 0.8619, + "step": 4187 + }, + { + "epoch": 0.18361539136049487, + "grad_norm": 0.78515625, + "learning_rate": 3.4552523463750506e-05, + "loss": 0.774, + "step": 4188 + }, + { + "epoch": 0.1836592345771521, + "grad_norm": 0.71875, + "learning_rate": 3.454904271913556e-05, + "loss": 0.595, + "step": 4189 + }, + { + "epoch": 0.18370307779380934, + "grad_norm": 0.76953125, + "learning_rate": 3.454556211324018e-05, + "loss": 0.8022, + "step": 4190 + }, + { + "epoch": 0.18374692101046658, + "grad_norm": 0.7734375, + "learning_rate": 3.4542081646071735e-05, + "loss": 0.7176, + "step": 4191 + }, + { + "epoch": 0.18379076422712382, + "grad_norm": 0.92578125, + "learning_rate": 3.453860131763756e-05, + "loss": 0.8525, + "step": 4192 + }, + { + "epoch": 0.18383460744378105, + "grad_norm": 0.84375, + "learning_rate": 3.453512112794514e-05, + "loss": 0.8051, + "step": 4193 + }, + { + "epoch": 0.1838784506604383, + "grad_norm": 0.9296875, + "learning_rate": 3.4531641077001765e-05, + "loss": 0.8255, + "step": 4194 + }, + { + "epoch": 0.1839222938770955, + "grad_norm": 0.765625, + "learning_rate": 3.452816116481484e-05, + "loss": 0.6813, + "step": 4195 + }, + { + "epoch": 0.18396613709375273, + "grad_norm": 0.83984375, + "learning_rate": 3.452468139139173e-05, + "loss": 0.8645, + "step": 4196 + }, + { + "epoch": 0.18400998031040997, + "grad_norm": 0.86328125, + "learning_rate": 3.45212017567398e-05, + "loss": 0.7747, + "step": 4197 + }, + { + "epoch": 0.1840538235270672, + "grad_norm": 0.81640625, + "learning_rate": 3.451772226086646e-05, + "loss": 0.8823, + "step": 4198 + }, + { + "epoch": 0.18409766674372444, + "grad_norm": 0.8203125, + "learning_rate": 3.451424290377907e-05, + "loss": 0.8707, + "step": 4199 + }, + { + "epoch": 0.18414150996038167, + "grad_norm": 0.9140625, + "learning_rate": 3.4510763685484994e-05, + "loss": 0.8816, + "step": 4200 + }, + { + "epoch": 0.1841853531770389, + "grad_norm": 0.8125, + "learning_rate": 3.450728460599162e-05, + "loss": 0.6922, + "step": 4201 + }, + { + "epoch": 0.18422919639369614, + "grad_norm": 0.859375, + "learning_rate": 3.4503805665306264e-05, + "loss": 0.8719, + "step": 4202 + }, + { + "epoch": 0.18427303961035338, + "grad_norm": 0.8828125, + "learning_rate": 3.4500326863436394e-05, + "loss": 0.7679, + "step": 4203 + }, + { + "epoch": 0.18431688282701061, + "grad_norm": 0.8984375, + "learning_rate": 3.449684820038934e-05, + "loss": 0.8469, + "step": 4204 + }, + { + "epoch": 0.18436072604366785, + "grad_norm": 0.76171875, + "learning_rate": 3.449336967617246e-05, + "loss": 0.7553, + "step": 4205 + }, + { + "epoch": 0.18440456926032509, + "grad_norm": 0.90234375, + "learning_rate": 3.448989129079315e-05, + "loss": 0.7778, + "step": 4206 + }, + { + "epoch": 0.18444841247698232, + "grad_norm": 0.85546875, + "learning_rate": 3.4486413044258737e-05, + "loss": 0.9524, + "step": 4207 + }, + { + "epoch": 0.18449225569363956, + "grad_norm": 0.85546875, + "learning_rate": 3.448293493657665e-05, + "loss": 0.8878, + "step": 4208 + }, + { + "epoch": 0.1845360989102968, + "grad_norm": 0.8125, + "learning_rate": 3.447945696775425e-05, + "loss": 0.7621, + "step": 4209 + }, + { + "epoch": 0.184579942126954, + "grad_norm": 0.75390625, + "learning_rate": 3.447597913779889e-05, + "loss": 0.7866, + "step": 4210 + }, + { + "epoch": 0.18462378534361124, + "grad_norm": 0.9375, + "learning_rate": 3.447250144671794e-05, + "loss": 0.8551, + "step": 4211 + }, + { + "epoch": 0.18466762856026847, + "grad_norm": 0.8359375, + "learning_rate": 3.446902389451876e-05, + "loss": 0.8327, + "step": 4212 + }, + { + "epoch": 0.1847114717769257, + "grad_norm": 0.78125, + "learning_rate": 3.4465546481208765e-05, + "loss": 0.8941, + "step": 4213 + }, + { + "epoch": 0.18475531499358294, + "grad_norm": 0.8203125, + "learning_rate": 3.44620692067953e-05, + "loss": 0.7538, + "step": 4214 + }, + { + "epoch": 0.18479915821024018, + "grad_norm": 0.8359375, + "learning_rate": 3.4458592071285724e-05, + "loss": 0.7424, + "step": 4215 + }, + { + "epoch": 0.1848430014268974, + "grad_norm": 0.84765625, + "learning_rate": 3.4455115074687424e-05, + "loss": 0.8489, + "step": 4216 + }, + { + "epoch": 0.18488684464355465, + "grad_norm": 0.8359375, + "learning_rate": 3.445163821700773e-05, + "loss": 0.8434, + "step": 4217 + }, + { + "epoch": 0.18493068786021188, + "grad_norm": 0.8671875, + "learning_rate": 3.444816149825407e-05, + "loss": 0.8729, + "step": 4218 + }, + { + "epoch": 0.18497453107686912, + "grad_norm": 0.76953125, + "learning_rate": 3.444468491843379e-05, + "loss": 0.7079, + "step": 4219 + }, + { + "epoch": 0.18501837429352636, + "grad_norm": 0.83203125, + "learning_rate": 3.444120847755424e-05, + "loss": 0.7428, + "step": 4220 + }, + { + "epoch": 0.1850622175101836, + "grad_norm": 0.8515625, + "learning_rate": 3.443773217562281e-05, + "loss": 0.8226, + "step": 4221 + }, + { + "epoch": 0.18510606072684083, + "grad_norm": 0.8828125, + "learning_rate": 3.4434256012646835e-05, + "loss": 0.8555, + "step": 4222 + }, + { + "epoch": 0.18514990394349806, + "grad_norm": 0.75, + "learning_rate": 3.4430779988633734e-05, + "loss": 0.5772, + "step": 4223 + }, + { + "epoch": 0.1851937471601553, + "grad_norm": 0.7890625, + "learning_rate": 3.442730410359084e-05, + "loss": 0.8139, + "step": 4224 + }, + { + "epoch": 0.1852375903768125, + "grad_norm": 0.89453125, + "learning_rate": 3.4423828357525537e-05, + "loss": 0.7789, + "step": 4225 + }, + { + "epoch": 0.18528143359346974, + "grad_norm": 0.8671875, + "learning_rate": 3.442035275044517e-05, + "loss": 0.7704, + "step": 4226 + }, + { + "epoch": 0.18532527681012698, + "grad_norm": 0.734375, + "learning_rate": 3.441687728235713e-05, + "loss": 0.6919, + "step": 4227 + }, + { + "epoch": 0.1853691200267842, + "grad_norm": 0.78515625, + "learning_rate": 3.441340195326876e-05, + "loss": 0.7449, + "step": 4228 + }, + { + "epoch": 0.18541296324344145, + "grad_norm": 0.80078125, + "learning_rate": 3.440992676318745e-05, + "loss": 0.6726, + "step": 4229 + }, + { + "epoch": 0.18545680646009868, + "grad_norm": 0.7734375, + "learning_rate": 3.440645171212054e-05, + "loss": 0.8633, + "step": 4230 + }, + { + "epoch": 0.18550064967675592, + "grad_norm": 0.859375, + "learning_rate": 3.440297680007537e-05, + "loss": 0.8013, + "step": 4231 + }, + { + "epoch": 0.18554449289341315, + "grad_norm": 0.7265625, + "learning_rate": 3.439950202705938e-05, + "loss": 0.7005, + "step": 4232 + }, + { + "epoch": 0.1855883361100704, + "grad_norm": 0.7734375, + "learning_rate": 3.43960273930799e-05, + "loss": 0.8055, + "step": 4233 + }, + { + "epoch": 0.18563217932672763, + "grad_norm": 0.83203125, + "learning_rate": 3.439255289814428e-05, + "loss": 1.0025, + "step": 4234 + }, + { + "epoch": 0.18567602254338486, + "grad_norm": 0.81640625, + "learning_rate": 3.43890785422599e-05, + "loss": 0.7537, + "step": 4235 + }, + { + "epoch": 0.1857198657600421, + "grad_norm": 0.8203125, + "learning_rate": 3.438560432543412e-05, + "loss": 0.9864, + "step": 4236 + }, + { + "epoch": 0.18576370897669933, + "grad_norm": 0.85546875, + "learning_rate": 3.438213024767427e-05, + "loss": 0.8076, + "step": 4237 + }, + { + "epoch": 0.18580755219335657, + "grad_norm": 0.8125, + "learning_rate": 3.437865630898777e-05, + "loss": 0.7493, + "step": 4238 + }, + { + "epoch": 0.1858513954100138, + "grad_norm": 0.73046875, + "learning_rate": 3.4375182509381965e-05, + "loss": 0.6987, + "step": 4239 + }, + { + "epoch": 0.185895238626671, + "grad_norm": 0.72265625, + "learning_rate": 3.43717088488642e-05, + "loss": 0.682, + "step": 4240 + }, + { + "epoch": 0.18593908184332825, + "grad_norm": 0.92578125, + "learning_rate": 3.436823532744185e-05, + "loss": 0.79, + "step": 4241 + }, + { + "epoch": 0.18598292505998548, + "grad_norm": 0.875, + "learning_rate": 3.436476194512225e-05, + "loss": 0.7701, + "step": 4242 + }, + { + "epoch": 0.18602676827664272, + "grad_norm": 0.84765625, + "learning_rate": 3.436128870191281e-05, + "loss": 0.8862, + "step": 4243 + }, + { + "epoch": 0.18607061149329995, + "grad_norm": 0.79296875, + "learning_rate": 3.4357815597820864e-05, + "loss": 0.9073, + "step": 4244 + }, + { + "epoch": 0.1861144547099572, + "grad_norm": 0.77734375, + "learning_rate": 3.435434263285379e-05, + "loss": 0.7247, + "step": 4245 + }, + { + "epoch": 0.18615829792661442, + "grad_norm": 0.921875, + "learning_rate": 3.435086980701891e-05, + "loss": 0.7688, + "step": 4246 + }, + { + "epoch": 0.18620214114327166, + "grad_norm": 0.72265625, + "learning_rate": 3.43473971203236e-05, + "loss": 0.8785, + "step": 4247 + }, + { + "epoch": 0.1862459843599289, + "grad_norm": 0.8828125, + "learning_rate": 3.4343924572775253e-05, + "loss": 0.881, + "step": 4248 + }, + { + "epoch": 0.18628982757658613, + "grad_norm": 0.828125, + "learning_rate": 3.434045216438121e-05, + "loss": 0.756, + "step": 4249 + }, + { + "epoch": 0.18633367079324337, + "grad_norm": 0.8828125, + "learning_rate": 3.433697989514881e-05, + "loss": 0.86, + "step": 4250 + }, + { + "epoch": 0.1863775140099006, + "grad_norm": 0.80078125, + "learning_rate": 3.433350776508545e-05, + "loss": 0.8837, + "step": 4251 + }, + { + "epoch": 0.18642135722655784, + "grad_norm": 0.8125, + "learning_rate": 3.433003577419842e-05, + "loss": 0.8497, + "step": 4252 + }, + { + "epoch": 0.18646520044321507, + "grad_norm": 0.8671875, + "learning_rate": 3.4326563922495156e-05, + "loss": 0.882, + "step": 4253 + }, + { + "epoch": 0.1865090436598723, + "grad_norm": 0.8828125, + "learning_rate": 3.4323092209983e-05, + "loss": 0.7976, + "step": 4254 + }, + { + "epoch": 0.18655288687652952, + "grad_norm": 0.87109375, + "learning_rate": 3.431962063666929e-05, + "loss": 0.7784, + "step": 4255 + }, + { + "epoch": 0.18659673009318675, + "grad_norm": 0.83984375, + "learning_rate": 3.4316149202561386e-05, + "loss": 0.8039, + "step": 4256 + }, + { + "epoch": 0.186640573309844, + "grad_norm": 0.80078125, + "learning_rate": 3.431267790766661e-05, + "loss": 0.8824, + "step": 4257 + }, + { + "epoch": 0.18668441652650122, + "grad_norm": 0.84375, + "learning_rate": 3.4309206751992405e-05, + "loss": 0.8635, + "step": 4258 + }, + { + "epoch": 0.18672825974315846, + "grad_norm": 0.9765625, + "learning_rate": 3.430573573554607e-05, + "loss": 0.7363, + "step": 4259 + }, + { + "epoch": 0.1867721029598157, + "grad_norm": 0.93359375, + "learning_rate": 3.430226485833499e-05, + "loss": 0.9648, + "step": 4260 + }, + { + "epoch": 0.18681594617647293, + "grad_norm": 0.80859375, + "learning_rate": 3.429879412036649e-05, + "loss": 0.835, + "step": 4261 + }, + { + "epoch": 0.18685978939313017, + "grad_norm": 0.76953125, + "learning_rate": 3.429532352164795e-05, + "loss": 0.7683, + "step": 4262 + }, + { + "epoch": 0.1869036326097874, + "grad_norm": 0.78125, + "learning_rate": 3.429185306218667e-05, + "loss": 0.8654, + "step": 4263 + }, + { + "epoch": 0.18694747582644464, + "grad_norm": 0.83984375, + "learning_rate": 3.42883827419901e-05, + "loss": 0.8346, + "step": 4264 + }, + { + "epoch": 0.18699131904310187, + "grad_norm": 0.85546875, + "learning_rate": 3.428491256106554e-05, + "loss": 0.9271, + "step": 4265 + }, + { + "epoch": 0.1870351622597591, + "grad_norm": 0.86328125, + "learning_rate": 3.428144251942035e-05, + "loss": 0.7427, + "step": 4266 + }, + { + "epoch": 0.18707900547641634, + "grad_norm": 0.8125, + "learning_rate": 3.427797261706189e-05, + "loss": 0.7216, + "step": 4267 + }, + { + "epoch": 0.18712284869307358, + "grad_norm": 0.80859375, + "learning_rate": 3.427450285399748e-05, + "loss": 0.8187, + "step": 4268 + }, + { + "epoch": 0.18716669190973081, + "grad_norm": 0.8515625, + "learning_rate": 3.427103323023453e-05, + "loss": 0.9746, + "step": 4269 + }, + { + "epoch": 0.18721053512638805, + "grad_norm": 0.81640625, + "learning_rate": 3.426756374578037e-05, + "loss": 0.7543, + "step": 4270 + }, + { + "epoch": 0.18725437834304526, + "grad_norm": 0.8515625, + "learning_rate": 3.4264094400642355e-05, + "loss": 0.9181, + "step": 4271 + }, + { + "epoch": 0.1872982215597025, + "grad_norm": 0.87109375, + "learning_rate": 3.4260625194827834e-05, + "loss": 0.9943, + "step": 4272 + }, + { + "epoch": 0.18734206477635973, + "grad_norm": 0.88671875, + "learning_rate": 3.425715612834416e-05, + "loss": 0.8556, + "step": 4273 + }, + { + "epoch": 0.18738590799301696, + "grad_norm": 0.85546875, + "learning_rate": 3.425368720119868e-05, + "loss": 0.858, + "step": 4274 + }, + { + "epoch": 0.1874297512096742, + "grad_norm": 1.453125, + "learning_rate": 3.425021841339876e-05, + "loss": 0.8668, + "step": 4275 + }, + { + "epoch": 0.18747359442633144, + "grad_norm": 0.80859375, + "learning_rate": 3.4246749764951734e-05, + "loss": 0.86, + "step": 4276 + }, + { + "epoch": 0.18751743764298867, + "grad_norm": 0.85546875, + "learning_rate": 3.4243281255864976e-05, + "loss": 0.8021, + "step": 4277 + }, + { + "epoch": 0.1875612808596459, + "grad_norm": 0.8359375, + "learning_rate": 3.423981288614578e-05, + "loss": 0.91, + "step": 4278 + }, + { + "epoch": 0.18760512407630314, + "grad_norm": 0.8046875, + "learning_rate": 3.423634465580159e-05, + "loss": 0.8305, + "step": 4279 + }, + { + "epoch": 0.18764896729296038, + "grad_norm": 1.0546875, + "learning_rate": 3.42328765648397e-05, + "loss": 0.865, + "step": 4280 + }, + { + "epoch": 0.1876928105096176, + "grad_norm": 0.78125, + "learning_rate": 3.422940861326747e-05, + "loss": 0.7579, + "step": 4281 + }, + { + "epoch": 0.18773665372627485, + "grad_norm": 0.796875, + "learning_rate": 3.422594080109225e-05, + "loss": 0.8992, + "step": 4282 + }, + { + "epoch": 0.18778049694293208, + "grad_norm": 0.8046875, + "learning_rate": 3.422247312832135e-05, + "loss": 0.9271, + "step": 4283 + }, + { + "epoch": 0.18782434015958932, + "grad_norm": 0.765625, + "learning_rate": 3.4219005594962196e-05, + "loss": 0.6688, + "step": 4284 + }, + { + "epoch": 0.18786818337624656, + "grad_norm": 0.79296875, + "learning_rate": 3.42155382010221e-05, + "loss": 0.7062, + "step": 4285 + }, + { + "epoch": 0.18791202659290376, + "grad_norm": 0.8359375, + "learning_rate": 3.421207094650841e-05, + "loss": 0.7772, + "step": 4286 + }, + { + "epoch": 0.187955869809561, + "grad_norm": 0.82421875, + "learning_rate": 3.420860383142848e-05, + "loss": 0.828, + "step": 4287 + }, + { + "epoch": 0.18799971302621823, + "grad_norm": 0.765625, + "learning_rate": 3.420513685578961e-05, + "loss": 0.8905, + "step": 4288 + }, + { + "epoch": 0.18804355624287547, + "grad_norm": 0.80859375, + "learning_rate": 3.420167001959923e-05, + "loss": 0.7497, + "step": 4289 + }, + { + "epoch": 0.1880873994595327, + "grad_norm": 0.78125, + "learning_rate": 3.4198203322864655e-05, + "loss": 0.8077, + "step": 4290 + }, + { + "epoch": 0.18813124267618994, + "grad_norm": 0.890625, + "learning_rate": 3.419473676559323e-05, + "loss": 0.8445, + "step": 4291 + }, + { + "epoch": 0.18817508589284718, + "grad_norm": 0.85546875, + "learning_rate": 3.419127034779228e-05, + "loss": 0.7738, + "step": 4292 + }, + { + "epoch": 0.1882189291095044, + "grad_norm": 0.80859375, + "learning_rate": 3.418780406946915e-05, + "loss": 0.7466, + "step": 4293 + }, + { + "epoch": 0.18826277232616165, + "grad_norm": 0.93359375, + "learning_rate": 3.418433793063125e-05, + "loss": 0.7754, + "step": 4294 + }, + { + "epoch": 0.18830661554281888, + "grad_norm": 0.89453125, + "learning_rate": 3.4180871931285875e-05, + "loss": 0.9188, + "step": 4295 + }, + { + "epoch": 0.18835045875947612, + "grad_norm": 0.8125, + "learning_rate": 3.417740607144038e-05, + "loss": 0.806, + "step": 4296 + }, + { + "epoch": 0.18839430197613335, + "grad_norm": 0.7890625, + "learning_rate": 3.417394035110211e-05, + "loss": 0.6982, + "step": 4297 + }, + { + "epoch": 0.1884381451927906, + "grad_norm": 0.73046875, + "learning_rate": 3.417047477027837e-05, + "loss": 0.6906, + "step": 4298 + }, + { + "epoch": 0.18848198840944783, + "grad_norm": 0.85546875, + "learning_rate": 3.416700932897658e-05, + "loss": 0.9595, + "step": 4299 + }, + { + "epoch": 0.18852583162610506, + "grad_norm": 0.76953125, + "learning_rate": 3.416354402720406e-05, + "loss": 0.7966, + "step": 4300 + }, + { + "epoch": 0.18856967484276227, + "grad_norm": 0.828125, + "learning_rate": 3.416007886496815e-05, + "loss": 0.9162, + "step": 4301 + }, + { + "epoch": 0.1886135180594195, + "grad_norm": 1.2265625, + "learning_rate": 3.4156613842276186e-05, + "loss": 0.7663, + "step": 4302 + }, + { + "epoch": 0.18865736127607674, + "grad_norm": 0.76953125, + "learning_rate": 3.415314895913548e-05, + "loss": 0.8842, + "step": 4303 + }, + { + "epoch": 0.18870120449273398, + "grad_norm": 0.828125, + "learning_rate": 3.4149684215553445e-05, + "loss": 0.8789, + "step": 4304 + }, + { + "epoch": 0.1887450477093912, + "grad_norm": 0.8359375, + "learning_rate": 3.414621961153739e-05, + "loss": 0.8407, + "step": 4305 + }, + { + "epoch": 0.18878889092604845, + "grad_norm": 0.859375, + "learning_rate": 3.414275514709465e-05, + "loss": 0.7351, + "step": 4306 + }, + { + "epoch": 0.18883273414270568, + "grad_norm": 0.90234375, + "learning_rate": 3.413929082223258e-05, + "loss": 0.95, + "step": 4307 + }, + { + "epoch": 0.18887657735936292, + "grad_norm": 0.859375, + "learning_rate": 3.41358266369585e-05, + "loss": 0.8253, + "step": 4308 + }, + { + "epoch": 0.18892042057602015, + "grad_norm": 0.78515625, + "learning_rate": 3.413236259127979e-05, + "loss": 0.7218, + "step": 4309 + }, + { + "epoch": 0.1889642637926774, + "grad_norm": 0.80859375, + "learning_rate": 3.412889868520378e-05, + "loss": 0.7518, + "step": 4310 + }, + { + "epoch": 0.18900810700933462, + "grad_norm": 0.75390625, + "learning_rate": 3.41254349187378e-05, + "loss": 0.7478, + "step": 4311 + }, + { + "epoch": 0.18905195022599186, + "grad_norm": 0.87890625, + "learning_rate": 3.41219712918892e-05, + "loss": 0.9225, + "step": 4312 + }, + { + "epoch": 0.1890957934426491, + "grad_norm": 0.7734375, + "learning_rate": 3.411850780466531e-05, + "loss": 0.882, + "step": 4313 + }, + { + "epoch": 0.18913963665930633, + "grad_norm": 0.77734375, + "learning_rate": 3.4115044457073474e-05, + "loss": 0.7164, + "step": 4314 + }, + { + "epoch": 0.18918347987596357, + "grad_norm": 0.77734375, + "learning_rate": 3.411158124912104e-05, + "loss": 0.6602, + "step": 4315 + }, + { + "epoch": 0.18922732309262077, + "grad_norm": 0.828125, + "learning_rate": 3.410811818081532e-05, + "loss": 0.79, + "step": 4316 + }, + { + "epoch": 0.189271166309278, + "grad_norm": 0.82421875, + "learning_rate": 3.4104655252163696e-05, + "loss": 0.7023, + "step": 4317 + }, + { + "epoch": 0.18931500952593525, + "grad_norm": 0.80859375, + "learning_rate": 3.4101192463173495e-05, + "loss": 0.8871, + "step": 4318 + }, + { + "epoch": 0.18935885274259248, + "grad_norm": 0.8125, + "learning_rate": 3.4097729813852054e-05, + "loss": 0.8043, + "step": 4319 + }, + { + "epoch": 0.18940269595924972, + "grad_norm": 1.0546875, + "learning_rate": 3.409426730420669e-05, + "loss": 0.7359, + "step": 4320 + }, + { + "epoch": 0.18944653917590695, + "grad_norm": 1.65625, + "learning_rate": 3.409080493424478e-05, + "loss": 0.723, + "step": 4321 + }, + { + "epoch": 0.1894903823925642, + "grad_norm": 0.80078125, + "learning_rate": 3.408734270397363e-05, + "loss": 0.8924, + "step": 4322 + }, + { + "epoch": 0.18953422560922142, + "grad_norm": 1.953125, + "learning_rate": 3.4083880613400565e-05, + "loss": 0.8092, + "step": 4323 + }, + { + "epoch": 0.18957806882587866, + "grad_norm": 0.8046875, + "learning_rate": 3.408041866253298e-05, + "loss": 0.829, + "step": 4324 + }, + { + "epoch": 0.1896219120425359, + "grad_norm": 0.921875, + "learning_rate": 3.407695685137817e-05, + "loss": 0.9874, + "step": 4325 + }, + { + "epoch": 0.18966575525919313, + "grad_norm": 0.890625, + "learning_rate": 3.4073495179943494e-05, + "loss": 0.8312, + "step": 4326 + }, + { + "epoch": 0.18970959847585037, + "grad_norm": 0.91796875, + "learning_rate": 3.4070033648236274e-05, + "loss": 0.7563, + "step": 4327 + }, + { + "epoch": 0.1897534416925076, + "grad_norm": 0.82421875, + "learning_rate": 3.4066572256263815e-05, + "loss": 0.8294, + "step": 4328 + }, + { + "epoch": 0.18979728490916484, + "grad_norm": 0.77734375, + "learning_rate": 3.406311100403352e-05, + "loss": 0.7635, + "step": 4329 + }, + { + "epoch": 0.18984112812582207, + "grad_norm": 0.8046875, + "learning_rate": 3.405964989155269e-05, + "loss": 0.7449, + "step": 4330 + }, + { + "epoch": 0.18988497134247928, + "grad_norm": 0.87890625, + "learning_rate": 3.405618891882867e-05, + "loss": 0.7956, + "step": 4331 + }, + { + "epoch": 0.18992881455913652, + "grad_norm": 0.90625, + "learning_rate": 3.405272808586878e-05, + "loss": 0.8223, + "step": 4332 + }, + { + "epoch": 0.18997265777579375, + "grad_norm": 0.8515625, + "learning_rate": 3.404926739268033e-05, + "loss": 0.9047, + "step": 4333 + }, + { + "epoch": 0.190016500992451, + "grad_norm": 0.7265625, + "learning_rate": 3.404580683927072e-05, + "loss": 0.6859, + "step": 4334 + }, + { + "epoch": 0.19006034420910822, + "grad_norm": 0.8984375, + "learning_rate": 3.4042346425647266e-05, + "loss": 0.9289, + "step": 4335 + }, + { + "epoch": 0.19010418742576546, + "grad_norm": 0.87890625, + "learning_rate": 3.4038886151817285e-05, + "loss": 0.9394, + "step": 4336 + }, + { + "epoch": 0.1901480306424227, + "grad_norm": 0.765625, + "learning_rate": 3.4035426017788106e-05, + "loss": 0.7082, + "step": 4337 + }, + { + "epoch": 0.19019187385907993, + "grad_norm": 0.8125, + "learning_rate": 3.403196602356704e-05, + "loss": 0.6528, + "step": 4338 + }, + { + "epoch": 0.19023571707573717, + "grad_norm": 0.90234375, + "learning_rate": 3.402850616916149e-05, + "loss": 0.7878, + "step": 4339 + }, + { + "epoch": 0.1902795602923944, + "grad_norm": 0.83984375, + "learning_rate": 3.402504645457874e-05, + "loss": 0.8705, + "step": 4340 + }, + { + "epoch": 0.19032340350905164, + "grad_norm": 0.80859375, + "learning_rate": 3.402158687982615e-05, + "loss": 0.7627, + "step": 4341 + }, + { + "epoch": 0.19036724672570887, + "grad_norm": 0.85546875, + "learning_rate": 3.401812744491102e-05, + "loss": 1.0481, + "step": 4342 + }, + { + "epoch": 0.1904110899423661, + "grad_norm": 0.88671875, + "learning_rate": 3.401466814984067e-05, + "loss": 0.8861, + "step": 4343 + }, + { + "epoch": 0.19045493315902334, + "grad_norm": 0.7734375, + "learning_rate": 3.401120899462249e-05, + "loss": 0.8309, + "step": 4344 + }, + { + "epoch": 0.19049877637568058, + "grad_norm": 0.84375, + "learning_rate": 3.400774997926379e-05, + "loss": 0.7891, + "step": 4345 + }, + { + "epoch": 0.1905426195923378, + "grad_norm": 0.78515625, + "learning_rate": 3.400429110377188e-05, + "loss": 0.723, + "step": 4346 + }, + { + "epoch": 0.19058646280899502, + "grad_norm": 0.828125, + "learning_rate": 3.400083236815411e-05, + "loss": 0.9573, + "step": 4347 + }, + { + "epoch": 0.19063030602565226, + "grad_norm": 0.83984375, + "learning_rate": 3.3997373772417775e-05, + "loss": 0.8284, + "step": 4348 + }, + { + "epoch": 0.1906741492423095, + "grad_norm": 0.80859375, + "learning_rate": 3.399391531657026e-05, + "loss": 0.7888, + "step": 4349 + }, + { + "epoch": 0.19071799245896673, + "grad_norm": 0.828125, + "learning_rate": 3.399045700061887e-05, + "loss": 0.8912, + "step": 4350 + }, + { + "epoch": 0.19076183567562396, + "grad_norm": 0.76953125, + "learning_rate": 3.398699882457093e-05, + "loss": 0.7611, + "step": 4351 + }, + { + "epoch": 0.1908056788922812, + "grad_norm": 0.8359375, + "learning_rate": 3.398354078843378e-05, + "loss": 0.8066, + "step": 4352 + }, + { + "epoch": 0.19084952210893844, + "grad_norm": 0.7890625, + "learning_rate": 3.398008289221473e-05, + "loss": 0.8401, + "step": 4353 + }, + { + "epoch": 0.19089336532559567, + "grad_norm": 0.83203125, + "learning_rate": 3.397662513592109e-05, + "loss": 0.8038, + "step": 4354 + }, + { + "epoch": 0.1909372085422529, + "grad_norm": 0.84375, + "learning_rate": 3.3973167519560264e-05, + "loss": 0.824, + "step": 4355 + }, + { + "epoch": 0.19098105175891014, + "grad_norm": 0.765625, + "learning_rate": 3.3969710043139525e-05, + "loss": 0.7357, + "step": 4356 + }, + { + "epoch": 0.19102489497556738, + "grad_norm": 0.8125, + "learning_rate": 3.396625270666622e-05, + "loss": 0.8323, + "step": 4357 + }, + { + "epoch": 0.1910687381922246, + "grad_norm": 0.87109375, + "learning_rate": 3.396279551014766e-05, + "loss": 0.7974, + "step": 4358 + }, + { + "epoch": 0.19111258140888185, + "grad_norm": 0.890625, + "learning_rate": 3.395933845359118e-05, + "loss": 0.8317, + "step": 4359 + }, + { + "epoch": 0.19115642462553908, + "grad_norm": 0.84375, + "learning_rate": 3.395588153700412e-05, + "loss": 0.8095, + "step": 4360 + }, + { + "epoch": 0.19120026784219632, + "grad_norm": 0.77734375, + "learning_rate": 3.3952424760393785e-05, + "loss": 0.7285, + "step": 4361 + }, + { + "epoch": 0.19124411105885353, + "grad_norm": 0.82421875, + "learning_rate": 3.394896812376751e-05, + "loss": 0.9059, + "step": 4362 + }, + { + "epoch": 0.19128795427551076, + "grad_norm": 0.75390625, + "learning_rate": 3.394551162713259e-05, + "loss": 0.7327, + "step": 4363 + }, + { + "epoch": 0.191331797492168, + "grad_norm": 0.78125, + "learning_rate": 3.394205527049642e-05, + "loss": 0.7368, + "step": 4364 + }, + { + "epoch": 0.19137564070882523, + "grad_norm": 0.84765625, + "learning_rate": 3.3938599053866286e-05, + "loss": 0.7393, + "step": 4365 + }, + { + "epoch": 0.19141948392548247, + "grad_norm": 0.88671875, + "learning_rate": 3.393514297724951e-05, + "loss": 0.727, + "step": 4366 + }, + { + "epoch": 0.1914633271421397, + "grad_norm": 0.82421875, + "learning_rate": 3.3931687040653424e-05, + "loss": 0.7058, + "step": 4367 + }, + { + "epoch": 0.19150717035879694, + "grad_norm": 0.77734375, + "learning_rate": 3.392823124408532e-05, + "loss": 0.7453, + "step": 4368 + }, + { + "epoch": 0.19155101357545418, + "grad_norm": 0.765625, + "learning_rate": 3.392477558755258e-05, + "loss": 0.719, + "step": 4369 + }, + { + "epoch": 0.1915948567921114, + "grad_norm": 0.84765625, + "learning_rate": 3.392132007106251e-05, + "loss": 0.8889, + "step": 4370 + }, + { + "epoch": 0.19163870000876865, + "grad_norm": 0.8515625, + "learning_rate": 3.3917864694622424e-05, + "loss": 0.8591, + "step": 4371 + }, + { + "epoch": 0.19168254322542588, + "grad_norm": 0.796875, + "learning_rate": 3.391440945823965e-05, + "loss": 0.7578, + "step": 4372 + }, + { + "epoch": 0.19172638644208312, + "grad_norm": 0.8359375, + "learning_rate": 3.3910954361921466e-05, + "loss": 0.7581, + "step": 4373 + }, + { + "epoch": 0.19177022965874035, + "grad_norm": 0.890625, + "learning_rate": 3.390749940567528e-05, + "loss": 0.8724, + "step": 4374 + }, + { + "epoch": 0.1918140728753976, + "grad_norm": 0.83984375, + "learning_rate": 3.390404458950837e-05, + "loss": 0.8064, + "step": 4375 + }, + { + "epoch": 0.19185791609205483, + "grad_norm": 0.9375, + "learning_rate": 3.390058991342805e-05, + "loss": 0.953, + "step": 4376 + }, + { + "epoch": 0.19190175930871203, + "grad_norm": 0.8359375, + "learning_rate": 3.3897135377441656e-05, + "loss": 0.7841, + "step": 4377 + }, + { + "epoch": 0.19194560252536927, + "grad_norm": 0.8046875, + "learning_rate": 3.389368098155647e-05, + "loss": 0.8045, + "step": 4378 + }, + { + "epoch": 0.1919894457420265, + "grad_norm": 0.83984375, + "learning_rate": 3.389022672577988e-05, + "loss": 0.8692, + "step": 4379 + }, + { + "epoch": 0.19203328895868374, + "grad_norm": 1.046875, + "learning_rate": 3.3886772610119186e-05, + "loss": 0.888, + "step": 4380 + }, + { + "epoch": 0.19207713217534098, + "grad_norm": 0.81640625, + "learning_rate": 3.388331863458169e-05, + "loss": 0.7592, + "step": 4381 + }, + { + "epoch": 0.1921209753919982, + "grad_norm": 0.828125, + "learning_rate": 3.3879864799174734e-05, + "loss": 0.8329, + "step": 4382 + }, + { + "epoch": 0.19216481860865545, + "grad_norm": 0.85546875, + "learning_rate": 3.387641110390558e-05, + "loss": 0.8631, + "step": 4383 + }, + { + "epoch": 0.19220866182531268, + "grad_norm": 0.80859375, + "learning_rate": 3.387295754878162e-05, + "loss": 0.9041, + "step": 4384 + }, + { + "epoch": 0.19225250504196992, + "grad_norm": 0.79296875, + "learning_rate": 3.386950413381016e-05, + "loss": 0.8859, + "step": 4385 + }, + { + "epoch": 0.19229634825862715, + "grad_norm": 0.84765625, + "learning_rate": 3.386605085899851e-05, + "loss": 0.8544, + "step": 4386 + }, + { + "epoch": 0.1923401914752844, + "grad_norm": 0.82421875, + "learning_rate": 3.3862597724353976e-05, + "loss": 0.764, + "step": 4387 + }, + { + "epoch": 0.19238403469194162, + "grad_norm": 0.859375, + "learning_rate": 3.385914472988385e-05, + "loss": 0.8804, + "step": 4388 + }, + { + "epoch": 0.19242787790859886, + "grad_norm": 0.82421875, + "learning_rate": 3.385569187559553e-05, + "loss": 0.8032, + "step": 4389 + }, + { + "epoch": 0.1924717211252561, + "grad_norm": 0.91015625, + "learning_rate": 3.385223916149628e-05, + "loss": 0.715, + "step": 4390 + }, + { + "epoch": 0.19251556434191333, + "grad_norm": 0.83984375, + "learning_rate": 3.3848786587593425e-05, + "loss": 0.8085, + "step": 4391 + }, + { + "epoch": 0.19255940755857054, + "grad_norm": 0.9296875, + "learning_rate": 3.3845334153894305e-05, + "loss": 0.8783, + "step": 4392 + }, + { + "epoch": 0.19260325077522777, + "grad_norm": 0.828125, + "learning_rate": 3.384188186040617e-05, + "loss": 0.9009, + "step": 4393 + }, + { + "epoch": 0.192647093991885, + "grad_norm": 0.80859375, + "learning_rate": 3.383842970713642e-05, + "loss": 0.833, + "step": 4394 + }, + { + "epoch": 0.19269093720854225, + "grad_norm": 0.79296875, + "learning_rate": 3.383497769409234e-05, + "loss": 0.8148, + "step": 4395 + }, + { + "epoch": 0.19273478042519948, + "grad_norm": 0.765625, + "learning_rate": 3.383152582128124e-05, + "loss": 0.8583, + "step": 4396 + }, + { + "epoch": 0.19277862364185672, + "grad_norm": 0.7890625, + "learning_rate": 3.3828074088710436e-05, + "loss": 0.8393, + "step": 4397 + }, + { + "epoch": 0.19282246685851395, + "grad_norm": 0.796875, + "learning_rate": 3.382462249638725e-05, + "loss": 0.8483, + "step": 4398 + }, + { + "epoch": 0.1928663100751712, + "grad_norm": 0.75390625, + "learning_rate": 3.382117104431899e-05, + "loss": 0.7247, + "step": 4399 + }, + { + "epoch": 0.19291015329182842, + "grad_norm": 0.79296875, + "learning_rate": 3.381771973251299e-05, + "loss": 0.7176, + "step": 4400 + }, + { + "epoch": 0.19295399650848566, + "grad_norm": 0.83203125, + "learning_rate": 3.3814268560976536e-05, + "loss": 0.8643, + "step": 4401 + }, + { + "epoch": 0.1929978397251429, + "grad_norm": 0.74609375, + "learning_rate": 3.381081752971692e-05, + "loss": 0.8238, + "step": 4402 + }, + { + "epoch": 0.19304168294180013, + "grad_norm": 0.78515625, + "learning_rate": 3.3807366638741546e-05, + "loss": 0.9327, + "step": 4403 + }, + { + "epoch": 0.19308552615845737, + "grad_norm": 0.8203125, + "learning_rate": 3.380391588805766e-05, + "loss": 0.8396, + "step": 4404 + }, + { + "epoch": 0.1931293693751146, + "grad_norm": 0.83984375, + "learning_rate": 3.3800465277672586e-05, + "loss": 0.8227, + "step": 4405 + }, + { + "epoch": 0.19317321259177184, + "grad_norm": 0.84765625, + "learning_rate": 3.379701480759365e-05, + "loss": 0.8379, + "step": 4406 + }, + { + "epoch": 0.19321705580842904, + "grad_norm": 0.7578125, + "learning_rate": 3.379356447782817e-05, + "loss": 0.7158, + "step": 4407 + }, + { + "epoch": 0.19326089902508628, + "grad_norm": 0.796875, + "learning_rate": 3.3790114288383424e-05, + "loss": 0.8571, + "step": 4408 + }, + { + "epoch": 0.19330474224174352, + "grad_norm": 0.9453125, + "learning_rate": 3.378666423926672e-05, + "loss": 0.9118, + "step": 4409 + }, + { + "epoch": 0.19334858545840075, + "grad_norm": 0.75, + "learning_rate": 3.378321433048543e-05, + "loss": 0.8174, + "step": 4410 + }, + { + "epoch": 0.193392428675058, + "grad_norm": 0.80078125, + "learning_rate": 3.3779764562046835e-05, + "loss": 0.8038, + "step": 4411 + }, + { + "epoch": 0.19343627189171522, + "grad_norm": 0.890625, + "learning_rate": 3.377631493395824e-05, + "loss": 0.9305, + "step": 4412 + }, + { + "epoch": 0.19348011510837246, + "grad_norm": 0.87109375, + "learning_rate": 3.377286544622697e-05, + "loss": 0.8814, + "step": 4413 + }, + { + "epoch": 0.1935239583250297, + "grad_norm": 0.85546875, + "learning_rate": 3.3769416098860284e-05, + "loss": 0.8711, + "step": 4414 + }, + { + "epoch": 0.19356780154168693, + "grad_norm": 0.8828125, + "learning_rate": 3.3765966891865576e-05, + "loss": 0.8798, + "step": 4415 + }, + { + "epoch": 0.19361164475834416, + "grad_norm": 0.84375, + "learning_rate": 3.3762517825250106e-05, + "loss": 0.8455, + "step": 4416 + }, + { + "epoch": 0.1936554879750014, + "grad_norm": 1.53125, + "learning_rate": 3.375906889902119e-05, + "loss": 0.8109, + "step": 4417 + }, + { + "epoch": 0.19369933119165864, + "grad_norm": 0.85546875, + "learning_rate": 3.375562011318616e-05, + "loss": 0.88, + "step": 4418 + }, + { + "epoch": 0.19374317440831587, + "grad_norm": 0.8671875, + "learning_rate": 3.375217146775226e-05, + "loss": 0.7843, + "step": 4419 + }, + { + "epoch": 0.1937870176249731, + "grad_norm": 0.83203125, + "learning_rate": 3.3748722962726885e-05, + "loss": 0.7964, + "step": 4420 + }, + { + "epoch": 0.19383086084163034, + "grad_norm": 0.890625, + "learning_rate": 3.3745274598117305e-05, + "loss": 0.7078, + "step": 4421 + }, + { + "epoch": 0.19387470405828755, + "grad_norm": 0.83203125, + "learning_rate": 3.374182637393082e-05, + "loss": 0.8444, + "step": 4422 + }, + { + "epoch": 0.19391854727494479, + "grad_norm": 0.8125, + "learning_rate": 3.373837829017475e-05, + "loss": 0.8729, + "step": 4423 + }, + { + "epoch": 0.19396239049160202, + "grad_norm": 0.8359375, + "learning_rate": 3.373493034685638e-05, + "loss": 0.8141, + "step": 4424 + }, + { + "epoch": 0.19400623370825926, + "grad_norm": 0.7421875, + "learning_rate": 3.373148254398305e-05, + "loss": 0.6877, + "step": 4425 + }, + { + "epoch": 0.1940500769249165, + "grad_norm": 0.84765625, + "learning_rate": 3.372803488156206e-05, + "loss": 0.8899, + "step": 4426 + }, + { + "epoch": 0.19409392014157373, + "grad_norm": 0.7734375, + "learning_rate": 3.3724587359600726e-05, + "loss": 0.8512, + "step": 4427 + }, + { + "epoch": 0.19413776335823096, + "grad_norm": 0.81640625, + "learning_rate": 3.3721139978106334e-05, + "loss": 0.7981, + "step": 4428 + }, + { + "epoch": 0.1941816065748882, + "grad_norm": 0.87109375, + "learning_rate": 3.371769273708616e-05, + "loss": 0.8074, + "step": 4429 + }, + { + "epoch": 0.19422544979154543, + "grad_norm": 0.9296875, + "learning_rate": 3.3714245636547584e-05, + "loss": 0.8597, + "step": 4430 + }, + { + "epoch": 0.19426929300820267, + "grad_norm": 0.82421875, + "learning_rate": 3.3710798676497877e-05, + "loss": 0.7589, + "step": 4431 + }, + { + "epoch": 0.1943131362248599, + "grad_norm": 0.82421875, + "learning_rate": 3.370735185694434e-05, + "loss": 0.8245, + "step": 4432 + }, + { + "epoch": 0.19435697944151714, + "grad_norm": 0.77734375, + "learning_rate": 3.370390517789428e-05, + "loss": 0.7472, + "step": 4433 + }, + { + "epoch": 0.19440082265817438, + "grad_norm": 0.8203125, + "learning_rate": 3.370045863935497e-05, + "loss": 0.8617, + "step": 4434 + }, + { + "epoch": 0.1944446658748316, + "grad_norm": 0.8203125, + "learning_rate": 3.3697012241333794e-05, + "loss": 0.8079, + "step": 4435 + }, + { + "epoch": 0.19448850909148885, + "grad_norm": 0.87890625, + "learning_rate": 3.3693565983837994e-05, + "loss": 0.7004, + "step": 4436 + }, + { + "epoch": 0.19453235230814606, + "grad_norm": 0.7421875, + "learning_rate": 3.36901198668749e-05, + "loss": 0.7444, + "step": 4437 + }, + { + "epoch": 0.1945761955248033, + "grad_norm": 0.8125, + "learning_rate": 3.368667389045181e-05, + "loss": 0.7573, + "step": 4438 + }, + { + "epoch": 0.19462003874146053, + "grad_norm": 0.91015625, + "learning_rate": 3.3683228054575986e-05, + "loss": 0.8611, + "step": 4439 + }, + { + "epoch": 0.19466388195811776, + "grad_norm": 0.8359375, + "learning_rate": 3.367978235925481e-05, + "loss": 0.8173, + "step": 4440 + }, + { + "epoch": 0.194707725174775, + "grad_norm": 0.8203125, + "learning_rate": 3.3676336804495535e-05, + "loss": 0.829, + "step": 4441 + }, + { + "epoch": 0.19475156839143223, + "grad_norm": 0.9140625, + "learning_rate": 3.367289139030547e-05, + "loss": 0.9767, + "step": 4442 + }, + { + "epoch": 0.19479541160808947, + "grad_norm": 0.82421875, + "learning_rate": 3.366944611669193e-05, + "loss": 0.8063, + "step": 4443 + }, + { + "epoch": 0.1948392548247467, + "grad_norm": 0.875, + "learning_rate": 3.366600098366221e-05, + "loss": 0.7653, + "step": 4444 + }, + { + "epoch": 0.19488309804140394, + "grad_norm": 0.796875, + "learning_rate": 3.36625559912236e-05, + "loss": 0.7633, + "step": 4445 + }, + { + "epoch": 0.19492694125806118, + "grad_norm": 0.86328125, + "learning_rate": 3.365911113938341e-05, + "loss": 0.9954, + "step": 4446 + }, + { + "epoch": 0.1949707844747184, + "grad_norm": 0.8828125, + "learning_rate": 3.3655666428148955e-05, + "loss": 0.8599, + "step": 4447 + }, + { + "epoch": 0.19501462769137565, + "grad_norm": 0.84375, + "learning_rate": 3.365222185752751e-05, + "loss": 0.8325, + "step": 4448 + }, + { + "epoch": 0.19505847090803288, + "grad_norm": 0.77734375, + "learning_rate": 3.364877742752637e-05, + "loss": 0.7113, + "step": 4449 + }, + { + "epoch": 0.19510231412469012, + "grad_norm": 0.7734375, + "learning_rate": 3.364533313815287e-05, + "loss": 0.7944, + "step": 4450 + }, + { + "epoch": 0.19514615734134735, + "grad_norm": 0.796875, + "learning_rate": 3.364188898941431e-05, + "loss": 0.7946, + "step": 4451 + }, + { + "epoch": 0.1951900005580046, + "grad_norm": 0.7578125, + "learning_rate": 3.363844498131796e-05, + "loss": 0.815, + "step": 4452 + }, + { + "epoch": 0.1952338437746618, + "grad_norm": 0.84375, + "learning_rate": 3.3635001113871134e-05, + "loss": 0.7466, + "step": 4453 + }, + { + "epoch": 0.19527768699131903, + "grad_norm": 0.91015625, + "learning_rate": 3.36315573870811e-05, + "loss": 0.9056, + "step": 4454 + }, + { + "epoch": 0.19532153020797627, + "grad_norm": 0.77734375, + "learning_rate": 3.362811380095522e-05, + "loss": 0.751, + "step": 4455 + }, + { + "epoch": 0.1953653734246335, + "grad_norm": 0.74609375, + "learning_rate": 3.3624670355500766e-05, + "loss": 0.6749, + "step": 4456 + }, + { + "epoch": 0.19540921664129074, + "grad_norm": 0.8671875, + "learning_rate": 3.3621227050725025e-05, + "loss": 0.6792, + "step": 4457 + }, + { + "epoch": 0.19545305985794797, + "grad_norm": 0.83984375, + "learning_rate": 3.361778388663529e-05, + "loss": 0.8953, + "step": 4458 + }, + { + "epoch": 0.1954969030746052, + "grad_norm": 0.84765625, + "learning_rate": 3.3614340863238846e-05, + "loss": 0.9155, + "step": 4459 + }, + { + "epoch": 0.19554074629126245, + "grad_norm": 0.85546875, + "learning_rate": 3.3610897980543045e-05, + "loss": 0.7915, + "step": 4460 + }, + { + "epoch": 0.19558458950791968, + "grad_norm": 0.82421875, + "learning_rate": 3.3607455238555144e-05, + "loss": 0.9302, + "step": 4461 + }, + { + "epoch": 0.19562843272457692, + "grad_norm": 0.859375, + "learning_rate": 3.360401263728245e-05, + "loss": 0.8418, + "step": 4462 + }, + { + "epoch": 0.19567227594123415, + "grad_norm": 0.8515625, + "learning_rate": 3.360057017673225e-05, + "loss": 0.842, + "step": 4463 + }, + { + "epoch": 0.1957161191578914, + "grad_norm": 0.8046875, + "learning_rate": 3.359712785691182e-05, + "loss": 0.9003, + "step": 4464 + }, + { + "epoch": 0.19575996237454862, + "grad_norm": 0.84375, + "learning_rate": 3.359368567782851e-05, + "loss": 0.6695, + "step": 4465 + }, + { + "epoch": 0.19580380559120586, + "grad_norm": 0.7578125, + "learning_rate": 3.3590243639489594e-05, + "loss": 0.7752, + "step": 4466 + }, + { + "epoch": 0.1958476488078631, + "grad_norm": 0.88671875, + "learning_rate": 3.358680174190236e-05, + "loss": 0.9352, + "step": 4467 + }, + { + "epoch": 0.1958914920245203, + "grad_norm": 0.79296875, + "learning_rate": 3.3583359985074105e-05, + "loss": 0.7728, + "step": 4468 + }, + { + "epoch": 0.19593533524117754, + "grad_norm": 0.87890625, + "learning_rate": 3.357991836901209e-05, + "loss": 0.8463, + "step": 4469 + }, + { + "epoch": 0.19597917845783477, + "grad_norm": 0.703125, + "learning_rate": 3.3576476893723674e-05, + "loss": 0.7726, + "step": 4470 + }, + { + "epoch": 0.196023021674492, + "grad_norm": 0.87890625, + "learning_rate": 3.357303555921611e-05, + "loss": 0.8101, + "step": 4471 + }, + { + "epoch": 0.19606686489114925, + "grad_norm": 0.84375, + "learning_rate": 3.3569594365496704e-05, + "loss": 0.8663, + "step": 4472 + }, + { + "epoch": 0.19611070810780648, + "grad_norm": 0.78515625, + "learning_rate": 3.356615331257276e-05, + "loss": 0.7488, + "step": 4473 + }, + { + "epoch": 0.19615455132446372, + "grad_norm": 0.81640625, + "learning_rate": 3.3562712400451505e-05, + "loss": 0.7288, + "step": 4474 + }, + { + "epoch": 0.19619839454112095, + "grad_norm": 0.875, + "learning_rate": 3.355927162914032e-05, + "loss": 0.9281, + "step": 4475 + }, + { + "epoch": 0.1962422377577782, + "grad_norm": 0.75390625, + "learning_rate": 3.355583099864647e-05, + "loss": 0.7567, + "step": 4476 + }, + { + "epoch": 0.19628608097443542, + "grad_norm": 0.82421875, + "learning_rate": 3.355239050897724e-05, + "loss": 0.8224, + "step": 4477 + }, + { + "epoch": 0.19632992419109266, + "grad_norm": 0.78125, + "learning_rate": 3.35489501601399e-05, + "loss": 0.8553, + "step": 4478 + }, + { + "epoch": 0.1963737674077499, + "grad_norm": 0.8203125, + "learning_rate": 3.354550995214175e-05, + "loss": 0.7709, + "step": 4479 + }, + { + "epoch": 0.19641761062440713, + "grad_norm": 0.80078125, + "learning_rate": 3.354206988499013e-05, + "loss": 0.8461, + "step": 4480 + }, + { + "epoch": 0.19646145384106437, + "grad_norm": 0.75, + "learning_rate": 3.353862995869227e-05, + "loss": 0.7382, + "step": 4481 + }, + { + "epoch": 0.1965052970577216, + "grad_norm": 0.8515625, + "learning_rate": 3.3535190173255503e-05, + "loss": 0.9407, + "step": 4482 + }, + { + "epoch": 0.1965491402743788, + "grad_norm": 0.76171875, + "learning_rate": 3.35317505286871e-05, + "loss": 0.8343, + "step": 4483 + }, + { + "epoch": 0.19659298349103604, + "grad_norm": 0.8984375, + "learning_rate": 3.352831102499434e-05, + "loss": 0.8391, + "step": 4484 + }, + { + "epoch": 0.19663682670769328, + "grad_norm": 0.8359375, + "learning_rate": 3.352487166218454e-05, + "loss": 0.7867, + "step": 4485 + }, + { + "epoch": 0.19668066992435052, + "grad_norm": 0.7890625, + "learning_rate": 3.3521432440264966e-05, + "loss": 0.7242, + "step": 4486 + }, + { + "epoch": 0.19672451314100775, + "grad_norm": 0.875, + "learning_rate": 3.3517993359242896e-05, + "loss": 0.8273, + "step": 4487 + }, + { + "epoch": 0.196768356357665, + "grad_norm": 0.796875, + "learning_rate": 3.351455441912565e-05, + "loss": 0.7637, + "step": 4488 + }, + { + "epoch": 0.19681219957432222, + "grad_norm": 0.7578125, + "learning_rate": 3.351111561992053e-05, + "loss": 0.757, + "step": 4489 + }, + { + "epoch": 0.19685604279097946, + "grad_norm": 0.85546875, + "learning_rate": 3.3507676961634796e-05, + "loss": 0.878, + "step": 4490 + }, + { + "epoch": 0.1968998860076367, + "grad_norm": 0.84765625, + "learning_rate": 3.350423844427573e-05, + "loss": 0.8112, + "step": 4491 + }, + { + "epoch": 0.19694372922429393, + "grad_norm": 0.7734375, + "learning_rate": 3.3500800067850635e-05, + "loss": 0.844, + "step": 4492 + }, + { + "epoch": 0.19698757244095116, + "grad_norm": 0.7890625, + "learning_rate": 3.349736183236679e-05, + "loss": 0.7486, + "step": 4493 + }, + { + "epoch": 0.1970314156576084, + "grad_norm": 0.87109375, + "learning_rate": 3.349392373783146e-05, + "loss": 0.9041, + "step": 4494 + }, + { + "epoch": 0.19707525887426564, + "grad_norm": 0.91015625, + "learning_rate": 3.3490485784251976e-05, + "loss": 0.812, + "step": 4495 + }, + { + "epoch": 0.19711910209092287, + "grad_norm": 0.85546875, + "learning_rate": 3.348704797163561e-05, + "loss": 0.9058, + "step": 4496 + }, + { + "epoch": 0.1971629453075801, + "grad_norm": 0.8203125, + "learning_rate": 3.3483610299989654e-05, + "loss": 0.9043, + "step": 4497 + }, + { + "epoch": 0.19720678852423731, + "grad_norm": 0.8046875, + "learning_rate": 3.348017276932138e-05, + "loss": 0.7965, + "step": 4498 + }, + { + "epoch": 0.19725063174089455, + "grad_norm": 0.79296875, + "learning_rate": 3.347673537963803e-05, + "loss": 0.852, + "step": 4499 + }, + { + "epoch": 0.19729447495755179, + "grad_norm": 0.78125, + "learning_rate": 3.3473298130946996e-05, + "loss": 0.7706, + "step": 4500 + }, + { + "epoch": 0.19733831817420902, + "grad_norm": 0.75390625, + "learning_rate": 3.346986102325549e-05, + "loss": 0.7637, + "step": 4501 + }, + { + "epoch": 0.19738216139086626, + "grad_norm": 0.859375, + "learning_rate": 3.346642405657081e-05, + "loss": 0.8762, + "step": 4502 + }, + { + "epoch": 0.1974260046075235, + "grad_norm": 0.73828125, + "learning_rate": 3.3462987230900245e-05, + "loss": 0.7942, + "step": 4503 + }, + { + "epoch": 0.19746984782418073, + "grad_norm": 0.82421875, + "learning_rate": 3.3459550546251037e-05, + "loss": 0.7951, + "step": 4504 + }, + { + "epoch": 0.19751369104083796, + "grad_norm": 0.80859375, + "learning_rate": 3.345611400263056e-05, + "loss": 0.7911, + "step": 4505 + }, + { + "epoch": 0.1975575342574952, + "grad_norm": 0.7734375, + "learning_rate": 3.3452677600046024e-05, + "loss": 0.8174, + "step": 4506 + }, + { + "epoch": 0.19760137747415243, + "grad_norm": 0.83203125, + "learning_rate": 3.344924133850475e-05, + "loss": 0.7988, + "step": 4507 + }, + { + "epoch": 0.19764522069080967, + "grad_norm": 0.81640625, + "learning_rate": 3.344580521801399e-05, + "loss": 0.7985, + "step": 4508 + }, + { + "epoch": 0.1976890639074669, + "grad_norm": 0.80078125, + "learning_rate": 3.344236923858102e-05, + "loss": 0.7182, + "step": 4509 + }, + { + "epoch": 0.19773290712412414, + "grad_norm": 0.78125, + "learning_rate": 3.343893340021317e-05, + "loss": 0.7752, + "step": 4510 + }, + { + "epoch": 0.19777675034078138, + "grad_norm": 0.77734375, + "learning_rate": 3.34354977029177e-05, + "loss": 0.7318, + "step": 4511 + }, + { + "epoch": 0.1978205935574386, + "grad_norm": 0.85546875, + "learning_rate": 3.343206214670188e-05, + "loss": 0.7858, + "step": 4512 + }, + { + "epoch": 0.19786443677409582, + "grad_norm": 0.91796875, + "learning_rate": 3.342862673157301e-05, + "loss": 0.9669, + "step": 4513 + }, + { + "epoch": 0.19790827999075306, + "grad_norm": 0.828125, + "learning_rate": 3.342519145753832e-05, + "loss": 0.8753, + "step": 4514 + }, + { + "epoch": 0.1979521232074103, + "grad_norm": 0.84375, + "learning_rate": 3.342175632460517e-05, + "loss": 0.8107, + "step": 4515 + }, + { + "epoch": 0.19799596642406753, + "grad_norm": 0.75390625, + "learning_rate": 3.34183213327808e-05, + "loss": 0.7463, + "step": 4516 + }, + { + "epoch": 0.19803980964072476, + "grad_norm": 0.92578125, + "learning_rate": 3.341488648207249e-05, + "loss": 0.7951, + "step": 4517 + }, + { + "epoch": 0.198083652857382, + "grad_norm": 0.8046875, + "learning_rate": 3.341145177248753e-05, + "loss": 0.8121, + "step": 4518 + }, + { + "epoch": 0.19812749607403923, + "grad_norm": 1.078125, + "learning_rate": 3.3408017204033157e-05, + "loss": 0.8147, + "step": 4519 + }, + { + "epoch": 0.19817133929069647, + "grad_norm": 0.953125, + "learning_rate": 3.3404582776716717e-05, + "loss": 0.9703, + "step": 4520 + }, + { + "epoch": 0.1982151825073537, + "grad_norm": 0.80859375, + "learning_rate": 3.3401148490545456e-05, + "loss": 0.7809, + "step": 4521 + }, + { + "epoch": 0.19825902572401094, + "grad_norm": 0.953125, + "learning_rate": 3.339771434552665e-05, + "loss": 0.8568, + "step": 4522 + }, + { + "epoch": 0.19830286894066818, + "grad_norm": 0.77734375, + "learning_rate": 3.3394280341667583e-05, + "loss": 0.7448, + "step": 4523 + }, + { + "epoch": 0.1983467121573254, + "grad_norm": 0.87890625, + "learning_rate": 3.3390846478975534e-05, + "loss": 0.8376, + "step": 4524 + }, + { + "epoch": 0.19839055537398265, + "grad_norm": 0.7734375, + "learning_rate": 3.338741275745775e-05, + "loss": 0.8553, + "step": 4525 + }, + { + "epoch": 0.19843439859063988, + "grad_norm": 0.7578125, + "learning_rate": 3.338397917712157e-05, + "loss": 0.7554, + "step": 4526 + }, + { + "epoch": 0.19847824180729712, + "grad_norm": 0.74609375, + "learning_rate": 3.3380545737974236e-05, + "loss": 0.7958, + "step": 4527 + }, + { + "epoch": 0.19852208502395433, + "grad_norm": 0.8359375, + "learning_rate": 3.337711244002303e-05, + "loss": 0.796, + "step": 4528 + }, + { + "epoch": 0.19856592824061156, + "grad_norm": 0.87109375, + "learning_rate": 3.337367928327522e-05, + "loss": 0.899, + "step": 4529 + }, + { + "epoch": 0.1986097714572688, + "grad_norm": 0.87890625, + "learning_rate": 3.337024626773809e-05, + "loss": 0.8459, + "step": 4530 + }, + { + "epoch": 0.19865361467392603, + "grad_norm": 0.765625, + "learning_rate": 3.3366813393418916e-05, + "loss": 0.7312, + "step": 4531 + }, + { + "epoch": 0.19869745789058327, + "grad_norm": 0.87890625, + "learning_rate": 3.336338066032497e-05, + "loss": 0.8133, + "step": 4532 + }, + { + "epoch": 0.1987413011072405, + "grad_norm": 0.83984375, + "learning_rate": 3.335994806846353e-05, + "loss": 0.8458, + "step": 4533 + }, + { + "epoch": 0.19878514432389774, + "grad_norm": 0.8359375, + "learning_rate": 3.335651561784184e-05, + "loss": 0.8418, + "step": 4534 + }, + { + "epoch": 0.19882898754055497, + "grad_norm": 0.81640625, + "learning_rate": 3.335308330846724e-05, + "loss": 0.8915, + "step": 4535 + }, + { + "epoch": 0.1988728307572122, + "grad_norm": 0.81640625, + "learning_rate": 3.334965114034697e-05, + "loss": 0.7984, + "step": 4536 + }, + { + "epoch": 0.19891667397386945, + "grad_norm": 0.8515625, + "learning_rate": 3.334621911348831e-05, + "loss": 0.7482, + "step": 4537 + }, + { + "epoch": 0.19896051719052668, + "grad_norm": 0.7578125, + "learning_rate": 3.334278722789852e-05, + "loss": 0.7956, + "step": 4538 + }, + { + "epoch": 0.19900436040718392, + "grad_norm": 0.83984375, + "learning_rate": 3.333935548358486e-05, + "loss": 0.6645, + "step": 4539 + }, + { + "epoch": 0.19904820362384115, + "grad_norm": 0.84375, + "learning_rate": 3.3335923880554645e-05, + "loss": 0.8171, + "step": 4540 + }, + { + "epoch": 0.1990920468404984, + "grad_norm": 0.78125, + "learning_rate": 3.333249241881514e-05, + "loss": 0.7322, + "step": 4541 + }, + { + "epoch": 0.19913589005715562, + "grad_norm": 0.84375, + "learning_rate": 3.3329061098373606e-05, + "loss": 0.7773, + "step": 4542 + }, + { + "epoch": 0.19917973327381286, + "grad_norm": 0.88671875, + "learning_rate": 3.332562991923731e-05, + "loss": 0.7832, + "step": 4543 + }, + { + "epoch": 0.19922357649047007, + "grad_norm": 0.79296875, + "learning_rate": 3.332219888141353e-05, + "loss": 0.8886, + "step": 4544 + }, + { + "epoch": 0.1992674197071273, + "grad_norm": 0.796875, + "learning_rate": 3.3318767984909514e-05, + "loss": 0.7812, + "step": 4545 + }, + { + "epoch": 0.19931126292378454, + "grad_norm": 0.74609375, + "learning_rate": 3.331533722973258e-05, + "loss": 0.7135, + "step": 4546 + }, + { + "epoch": 0.19935510614044177, + "grad_norm": 0.74609375, + "learning_rate": 3.331190661588999e-05, + "loss": 0.6689, + "step": 4547 + }, + { + "epoch": 0.199398949357099, + "grad_norm": 0.828125, + "learning_rate": 3.330847614338899e-05, + "loss": 0.8117, + "step": 4548 + }, + { + "epoch": 0.19944279257375624, + "grad_norm": 0.90234375, + "learning_rate": 3.330504581223687e-05, + "loss": 0.8125, + "step": 4549 + }, + { + "epoch": 0.19948663579041348, + "grad_norm": 0.8125, + "learning_rate": 3.330161562244086e-05, + "loss": 0.8935, + "step": 4550 + }, + { + "epoch": 0.19953047900707072, + "grad_norm": 0.80859375, + "learning_rate": 3.329818557400829e-05, + "loss": 0.8622, + "step": 4551 + }, + { + "epoch": 0.19957432222372795, + "grad_norm": 0.82421875, + "learning_rate": 3.3294755666946406e-05, + "loss": 0.8877, + "step": 4552 + }, + { + "epoch": 0.1996181654403852, + "grad_norm": 0.85546875, + "learning_rate": 3.329132590126248e-05, + "loss": 0.848, + "step": 4553 + }, + { + "epoch": 0.19966200865704242, + "grad_norm": 0.8359375, + "learning_rate": 3.328789627696378e-05, + "loss": 0.8882, + "step": 4554 + }, + { + "epoch": 0.19970585187369966, + "grad_norm": 0.8828125, + "learning_rate": 3.3284466794057525e-05, + "loss": 0.916, + "step": 4555 + }, + { + "epoch": 0.1997496950903569, + "grad_norm": 0.86328125, + "learning_rate": 3.3281037452551065e-05, + "loss": 0.8891, + "step": 4556 + }, + { + "epoch": 0.19979353830701413, + "grad_norm": 0.828125, + "learning_rate": 3.327760825245164e-05, + "loss": 0.8333, + "step": 4557 + }, + { + "epoch": 0.19983738152367136, + "grad_norm": 0.89453125, + "learning_rate": 3.327417919376651e-05, + "loss": 0.8563, + "step": 4558 + }, + { + "epoch": 0.19988122474032857, + "grad_norm": 0.88671875, + "learning_rate": 3.327075027650294e-05, + "loss": 0.7711, + "step": 4559 + }, + { + "epoch": 0.1999250679569858, + "grad_norm": 0.92578125, + "learning_rate": 3.326732150066817e-05, + "loss": 1.0407, + "step": 4560 + }, + { + "epoch": 0.19996891117364304, + "grad_norm": 0.74609375, + "learning_rate": 3.3263892866269516e-05, + "loss": 0.8376, + "step": 4561 + }, + { + "epoch": 0.20001275439030028, + "grad_norm": 0.875, + "learning_rate": 3.3260464373314246e-05, + "loss": 0.8663, + "step": 4562 + }, + { + "epoch": 0.20005659760695751, + "grad_norm": 0.79296875, + "learning_rate": 3.3257036021809596e-05, + "loss": 0.8435, + "step": 4563 + }, + { + "epoch": 0.20010044082361475, + "grad_norm": 0.9921875, + "learning_rate": 3.3253607811762857e-05, + "loss": 0.7376, + "step": 4564 + }, + { + "epoch": 0.20014428404027199, + "grad_norm": 0.75390625, + "learning_rate": 3.325017974318123e-05, + "loss": 0.8161, + "step": 4565 + }, + { + "epoch": 0.20018812725692922, + "grad_norm": 0.76953125, + "learning_rate": 3.324675181607208e-05, + "loss": 0.8347, + "step": 4566 + }, + { + "epoch": 0.20023197047358646, + "grad_norm": 0.7578125, + "learning_rate": 3.3243324030442615e-05, + "loss": 0.8717, + "step": 4567 + }, + { + "epoch": 0.2002758136902437, + "grad_norm": 0.7265625, + "learning_rate": 3.3239896386300105e-05, + "loss": 0.725, + "step": 4568 + }, + { + "epoch": 0.20031965690690093, + "grad_norm": 0.83203125, + "learning_rate": 3.323646888365183e-05, + "loss": 0.7573, + "step": 4569 + }, + { + "epoch": 0.20036350012355816, + "grad_norm": 0.80859375, + "learning_rate": 3.323304152250504e-05, + "loss": 0.8044, + "step": 4570 + }, + { + "epoch": 0.2004073433402154, + "grad_norm": 0.83984375, + "learning_rate": 3.3229614302866994e-05, + "loss": 0.8857, + "step": 4571 + }, + { + "epoch": 0.20045118655687263, + "grad_norm": 0.85546875, + "learning_rate": 3.3226187224744975e-05, + "loss": 0.9015, + "step": 4572 + }, + { + "epoch": 0.20049502977352987, + "grad_norm": 0.76953125, + "learning_rate": 3.3222760288146196e-05, + "loss": 0.701, + "step": 4573 + }, + { + "epoch": 0.20053887299018708, + "grad_norm": 0.8125, + "learning_rate": 3.3219333493077996e-05, + "loss": 0.8561, + "step": 4574 + }, + { + "epoch": 0.2005827162068443, + "grad_norm": 0.80078125, + "learning_rate": 3.321590683954761e-05, + "loss": 0.8709, + "step": 4575 + }, + { + "epoch": 0.20062655942350155, + "grad_norm": 0.796875, + "learning_rate": 3.321248032756228e-05, + "loss": 0.8571, + "step": 4576 + }, + { + "epoch": 0.20067040264015878, + "grad_norm": 0.9296875, + "learning_rate": 3.320905395712928e-05, + "loss": 0.8306, + "step": 4577 + }, + { + "epoch": 0.20071424585681602, + "grad_norm": 0.83984375, + "learning_rate": 3.3205627728255875e-05, + "loss": 0.8695, + "step": 4578 + }, + { + "epoch": 0.20075808907347326, + "grad_norm": 0.7578125, + "learning_rate": 3.3202201640949324e-05, + "loss": 0.719, + "step": 4579 + }, + { + "epoch": 0.2008019322901305, + "grad_norm": 0.72265625, + "learning_rate": 3.319877569521686e-05, + "loss": 0.8482, + "step": 4580 + }, + { + "epoch": 0.20084577550678773, + "grad_norm": 0.78515625, + "learning_rate": 3.31953498910658e-05, + "loss": 0.7883, + "step": 4581 + }, + { + "epoch": 0.20088961872344496, + "grad_norm": 0.85546875, + "learning_rate": 3.319192422850338e-05, + "loss": 0.7496, + "step": 4582 + }, + { + "epoch": 0.2009334619401022, + "grad_norm": 0.8515625, + "learning_rate": 3.3188498707536863e-05, + "loss": 0.7896, + "step": 4583 + }, + { + "epoch": 0.20097730515675943, + "grad_norm": 0.828125, + "learning_rate": 3.31850733281735e-05, + "loss": 0.9032, + "step": 4584 + }, + { + "epoch": 0.20102114837341667, + "grad_norm": 0.77734375, + "learning_rate": 3.3181648090420515e-05, + "loss": 0.7398, + "step": 4585 + }, + { + "epoch": 0.2010649915900739, + "grad_norm": 0.78515625, + "learning_rate": 3.317822299428525e-05, + "loss": 0.8456, + "step": 4586 + }, + { + "epoch": 0.20110883480673114, + "grad_norm": 0.84375, + "learning_rate": 3.3174798039774926e-05, + "loss": 0.793, + "step": 4587 + }, + { + "epoch": 0.20115267802338838, + "grad_norm": 0.84375, + "learning_rate": 3.317137322689679e-05, + "loss": 0.7327, + "step": 4588 + }, + { + "epoch": 0.20119652124004558, + "grad_norm": 0.76171875, + "learning_rate": 3.31679485556581e-05, + "loss": 0.6804, + "step": 4589 + }, + { + "epoch": 0.20124036445670282, + "grad_norm": 0.796875, + "learning_rate": 3.31645240260661e-05, + "loss": 0.7848, + "step": 4590 + }, + { + "epoch": 0.20128420767336005, + "grad_norm": 0.84375, + "learning_rate": 3.3161099638128114e-05, + "loss": 0.8306, + "step": 4591 + }, + { + "epoch": 0.2013280508900173, + "grad_norm": 0.8671875, + "learning_rate": 3.315767539185134e-05, + "loss": 0.8612, + "step": 4592 + }, + { + "epoch": 0.20137189410667453, + "grad_norm": 0.796875, + "learning_rate": 3.315425128724306e-05, + "loss": 0.875, + "step": 4593 + }, + { + "epoch": 0.20141573732333176, + "grad_norm": 0.84375, + "learning_rate": 3.3150827324310526e-05, + "loss": 0.8612, + "step": 4594 + }, + { + "epoch": 0.201459580539989, + "grad_norm": 0.80859375, + "learning_rate": 3.314740350306096e-05, + "loss": 0.8, + "step": 4595 + }, + { + "epoch": 0.20150342375664623, + "grad_norm": 1.4453125, + "learning_rate": 3.314397982350168e-05, + "loss": 0.8073, + "step": 4596 + }, + { + "epoch": 0.20154726697330347, + "grad_norm": 0.890625, + "learning_rate": 3.314055628563991e-05, + "loss": 0.8767, + "step": 4597 + }, + { + "epoch": 0.2015911101899607, + "grad_norm": 0.9140625, + "learning_rate": 3.313713288948291e-05, + "loss": 0.6687, + "step": 4598 + }, + { + "epoch": 0.20163495340661794, + "grad_norm": 0.76953125, + "learning_rate": 3.313370963503794e-05, + "loss": 0.8948, + "step": 4599 + }, + { + "epoch": 0.20167879662327517, + "grad_norm": 0.8203125, + "learning_rate": 3.313028652231223e-05, + "loss": 0.6796, + "step": 4600 + }, + { + "epoch": 0.2017226398399324, + "grad_norm": 0.796875, + "learning_rate": 3.312686355131307e-05, + "loss": 0.7824, + "step": 4601 + }, + { + "epoch": 0.20176648305658965, + "grad_norm": 0.83203125, + "learning_rate": 3.3123440722047705e-05, + "loss": 0.838, + "step": 4602 + }, + { + "epoch": 0.20181032627324688, + "grad_norm": 0.875, + "learning_rate": 3.312001803452339e-05, + "loss": 0.8254, + "step": 4603 + }, + { + "epoch": 0.2018541694899041, + "grad_norm": 0.79296875, + "learning_rate": 3.311659548874737e-05, + "loss": 0.8378, + "step": 4604 + }, + { + "epoch": 0.20189801270656133, + "grad_norm": 0.72265625, + "learning_rate": 3.311317308472687e-05, + "loss": 0.7372, + "step": 4605 + }, + { + "epoch": 0.20194185592321856, + "grad_norm": 0.75390625, + "learning_rate": 3.310975082246921e-05, + "loss": 0.7546, + "step": 4606 + }, + { + "epoch": 0.2019856991398758, + "grad_norm": 0.8125, + "learning_rate": 3.3106328701981606e-05, + "loss": 0.7381, + "step": 4607 + }, + { + "epoch": 0.20202954235653303, + "grad_norm": 0.8671875, + "learning_rate": 3.310290672327132e-05, + "loss": 0.9875, + "step": 4608 + }, + { + "epoch": 0.20207338557319027, + "grad_norm": 0.81640625, + "learning_rate": 3.30994848863456e-05, + "loss": 0.876, + "step": 4609 + }, + { + "epoch": 0.2021172287898475, + "grad_norm": 0.8359375, + "learning_rate": 3.309606319121166e-05, + "loss": 0.7073, + "step": 4610 + }, + { + "epoch": 0.20216107200650474, + "grad_norm": 0.921875, + "learning_rate": 3.309264163787682e-05, + "loss": 0.9929, + "step": 4611 + }, + { + "epoch": 0.20220491522316197, + "grad_norm": 0.828125, + "learning_rate": 3.3089220226348315e-05, + "loss": 0.7792, + "step": 4612 + }, + { + "epoch": 0.2022487584398192, + "grad_norm": 0.8046875, + "learning_rate": 3.3085798956633375e-05, + "loss": 0.8932, + "step": 4613 + }, + { + "epoch": 0.20229260165647645, + "grad_norm": 0.79296875, + "learning_rate": 3.308237782873926e-05, + "loss": 0.7409, + "step": 4614 + }, + { + "epoch": 0.20233644487313368, + "grad_norm": 0.8046875, + "learning_rate": 3.307895684267323e-05, + "loss": 0.8465, + "step": 4615 + }, + { + "epoch": 0.20238028808979092, + "grad_norm": 0.796875, + "learning_rate": 3.307553599844251e-05, + "loss": 0.7435, + "step": 4616 + }, + { + "epoch": 0.20242413130644815, + "grad_norm": 0.8125, + "learning_rate": 3.307211529605437e-05, + "loss": 0.7719, + "step": 4617 + }, + { + "epoch": 0.2024679745231054, + "grad_norm": 0.9296875, + "learning_rate": 3.306869473551606e-05, + "loss": 0.8454, + "step": 4618 + }, + { + "epoch": 0.2025118177397626, + "grad_norm": 0.83203125, + "learning_rate": 3.306527431683483e-05, + "loss": 0.8708, + "step": 4619 + }, + { + "epoch": 0.20255566095641983, + "grad_norm": 0.8359375, + "learning_rate": 3.306185404001788e-05, + "loss": 0.8526, + "step": 4620 + }, + { + "epoch": 0.20259950417307707, + "grad_norm": 0.7109375, + "learning_rate": 3.305843390507255e-05, + "loss": 0.7789, + "step": 4621 + }, + { + "epoch": 0.2026433473897343, + "grad_norm": 0.7421875, + "learning_rate": 3.3055013912006036e-05, + "loss": 0.7702, + "step": 4622 + }, + { + "epoch": 0.20268719060639154, + "grad_norm": 0.79296875, + "learning_rate": 3.3051594060825596e-05, + "loss": 0.8227, + "step": 4623 + }, + { + "epoch": 0.20273103382304877, + "grad_norm": 0.75, + "learning_rate": 3.3048174351538475e-05, + "loss": 0.7611, + "step": 4624 + }, + { + "epoch": 0.202774877039706, + "grad_norm": 0.8203125, + "learning_rate": 3.304475478415189e-05, + "loss": 0.7824, + "step": 4625 + }, + { + "epoch": 0.20281872025636324, + "grad_norm": 0.78515625, + "learning_rate": 3.3041335358673155e-05, + "loss": 0.7112, + "step": 4626 + }, + { + "epoch": 0.20286256347302048, + "grad_norm": 0.796875, + "learning_rate": 3.3037916075109476e-05, + "loss": 0.8115, + "step": 4627 + }, + { + "epoch": 0.20290640668967772, + "grad_norm": 0.8046875, + "learning_rate": 3.3034496933468105e-05, + "loss": 0.7178, + "step": 4628 + }, + { + "epoch": 0.20295024990633495, + "grad_norm": 0.81640625, + "learning_rate": 3.3031077933756294e-05, + "loss": 0.8855, + "step": 4629 + }, + { + "epoch": 0.2029940931229922, + "grad_norm": 0.8359375, + "learning_rate": 3.302765907598126e-05, + "loss": 0.9084, + "step": 4630 + }, + { + "epoch": 0.20303793633964942, + "grad_norm": 0.7890625, + "learning_rate": 3.30242403601503e-05, + "loss": 0.7741, + "step": 4631 + }, + { + "epoch": 0.20308177955630666, + "grad_norm": 0.99609375, + "learning_rate": 3.302082178627064e-05, + "loss": 0.8643, + "step": 4632 + }, + { + "epoch": 0.2031256227729639, + "grad_norm": 0.81640625, + "learning_rate": 3.3017403354349506e-05, + "loss": 0.7666, + "step": 4633 + }, + { + "epoch": 0.20316946598962113, + "grad_norm": 0.82421875, + "learning_rate": 3.3013985064394174e-05, + "loss": 0.8138, + "step": 4634 + }, + { + "epoch": 0.20321330920627834, + "grad_norm": 0.82421875, + "learning_rate": 3.3010566916411836e-05, + "loss": 0.716, + "step": 4635 + }, + { + "epoch": 0.20325715242293557, + "grad_norm": 0.82421875, + "learning_rate": 3.3007148910409805e-05, + "loss": 0.7602, + "step": 4636 + }, + { + "epoch": 0.2033009956395928, + "grad_norm": 0.76953125, + "learning_rate": 3.300373104639528e-05, + "loss": 0.6972, + "step": 4637 + }, + { + "epoch": 0.20334483885625004, + "grad_norm": 0.87109375, + "learning_rate": 3.300031332437553e-05, + "loss": 0.8919, + "step": 4638 + }, + { + "epoch": 0.20338868207290728, + "grad_norm": 0.875, + "learning_rate": 3.2996895744357784e-05, + "loss": 0.8676, + "step": 4639 + }, + { + "epoch": 0.20343252528956451, + "grad_norm": 0.84375, + "learning_rate": 3.2993478306349254e-05, + "loss": 0.8583, + "step": 4640 + }, + { + "epoch": 0.20347636850622175, + "grad_norm": 0.81640625, + "learning_rate": 3.299006101035724e-05, + "loss": 0.7964, + "step": 4641 + }, + { + "epoch": 0.20352021172287899, + "grad_norm": 1.046875, + "learning_rate": 3.298664385638898e-05, + "loss": 0.8943, + "step": 4642 + }, + { + "epoch": 0.20356405493953622, + "grad_norm": 0.8359375, + "learning_rate": 3.298322684445169e-05, + "loss": 0.8736, + "step": 4643 + }, + { + "epoch": 0.20360789815619346, + "grad_norm": 0.765625, + "learning_rate": 3.297980997455262e-05, + "loss": 0.793, + "step": 4644 + }, + { + "epoch": 0.2036517413728507, + "grad_norm": 0.859375, + "learning_rate": 3.2976393246698975e-05, + "loss": 0.8111, + "step": 4645 + }, + { + "epoch": 0.20369558458950793, + "grad_norm": 0.90625, + "learning_rate": 3.297297666089807e-05, + "loss": 0.9211, + "step": 4646 + }, + { + "epoch": 0.20373942780616516, + "grad_norm": 0.765625, + "learning_rate": 3.296956021715712e-05, + "loss": 0.8157, + "step": 4647 + }, + { + "epoch": 0.2037832710228224, + "grad_norm": 0.765625, + "learning_rate": 3.296614391548334e-05, + "loss": 0.6714, + "step": 4648 + }, + { + "epoch": 0.20382711423947963, + "grad_norm": 1.078125, + "learning_rate": 3.296272775588399e-05, + "loss": 0.7555, + "step": 4649 + }, + { + "epoch": 0.20387095745613684, + "grad_norm": 0.83984375, + "learning_rate": 3.295931173836628e-05, + "loss": 0.8502, + "step": 4650 + }, + { + "epoch": 0.20391480067279408, + "grad_norm": 0.76953125, + "learning_rate": 3.29558958629375e-05, + "loss": 0.8376, + "step": 4651 + }, + { + "epoch": 0.2039586438894513, + "grad_norm": 0.92578125, + "learning_rate": 3.2952480129604876e-05, + "loss": 0.759, + "step": 4652 + }, + { + "epoch": 0.20400248710610855, + "grad_norm": 0.91796875, + "learning_rate": 3.294906453837564e-05, + "loss": 0.911, + "step": 4653 + }, + { + "epoch": 0.20404633032276578, + "grad_norm": 0.84765625, + "learning_rate": 3.2945649089257024e-05, + "loss": 0.7025, + "step": 4654 + }, + { + "epoch": 0.20409017353942302, + "grad_norm": 0.75390625, + "learning_rate": 3.2942233782256273e-05, + "loss": 0.7705, + "step": 4655 + }, + { + "epoch": 0.20413401675608026, + "grad_norm": 0.9140625, + "learning_rate": 3.293881861738062e-05, + "loss": 0.8149, + "step": 4656 + }, + { + "epoch": 0.2041778599727375, + "grad_norm": 0.8203125, + "learning_rate": 3.2935403594637313e-05, + "loss": 0.7737, + "step": 4657 + }, + { + "epoch": 0.20422170318939473, + "grad_norm": 0.828125, + "learning_rate": 3.293198871403356e-05, + "loss": 0.8341, + "step": 4658 + }, + { + "epoch": 0.20426554640605196, + "grad_norm": 1.15625, + "learning_rate": 3.2928573975576636e-05, + "loss": 0.7601, + "step": 4659 + }, + { + "epoch": 0.2043093896227092, + "grad_norm": 0.828125, + "learning_rate": 3.2925159379273784e-05, + "loss": 0.9344, + "step": 4660 + }, + { + "epoch": 0.20435323283936643, + "grad_norm": 0.9453125, + "learning_rate": 3.292174492513221e-05, + "loss": 0.9343, + "step": 4661 + }, + { + "epoch": 0.20439707605602367, + "grad_norm": 0.84765625, + "learning_rate": 3.2918330613159177e-05, + "loss": 0.8985, + "step": 4662 + }, + { + "epoch": 0.2044409192726809, + "grad_norm": 0.8203125, + "learning_rate": 3.2914916443361896e-05, + "loss": 0.8544, + "step": 4663 + }, + { + "epoch": 0.20448476248933814, + "grad_norm": 0.78515625, + "learning_rate": 3.291150241574762e-05, + "loss": 0.8104, + "step": 4664 + }, + { + "epoch": 0.20452860570599535, + "grad_norm": 0.88671875, + "learning_rate": 3.2908088530323545e-05, + "loss": 0.8306, + "step": 4665 + }, + { + "epoch": 0.20457244892265258, + "grad_norm": 0.7890625, + "learning_rate": 3.2904674787096976e-05, + "loss": 0.7265, + "step": 4666 + }, + { + "epoch": 0.20461629213930982, + "grad_norm": 0.8203125, + "learning_rate": 3.2901261186075117e-05, + "loss": 0.7815, + "step": 4667 + }, + { + "epoch": 0.20466013535596705, + "grad_norm": 0.84375, + "learning_rate": 3.2897847727265196e-05, + "loss": 0.7964, + "step": 4668 + }, + { + "epoch": 0.2047039785726243, + "grad_norm": 0.85546875, + "learning_rate": 3.289443441067446e-05, + "loss": 0.8196, + "step": 4669 + }, + { + "epoch": 0.20474782178928153, + "grad_norm": 0.83203125, + "learning_rate": 3.2891021236310095e-05, + "loss": 0.8471, + "step": 4670 + }, + { + "epoch": 0.20479166500593876, + "grad_norm": 0.81640625, + "learning_rate": 3.28876082041794e-05, + "loss": 0.7788, + "step": 4671 + }, + { + "epoch": 0.204835508222596, + "grad_norm": 0.91015625, + "learning_rate": 3.288419531428959e-05, + "loss": 0.7632, + "step": 4672 + }, + { + "epoch": 0.20487935143925323, + "grad_norm": 0.8515625, + "learning_rate": 3.2880782566647894e-05, + "loss": 0.8279, + "step": 4673 + }, + { + "epoch": 0.20492319465591047, + "grad_norm": 0.7890625, + "learning_rate": 3.287736996126154e-05, + "loss": 0.784, + "step": 4674 + }, + { + "epoch": 0.2049670378725677, + "grad_norm": 0.92578125, + "learning_rate": 3.287395749813773e-05, + "loss": 0.7979, + "step": 4675 + }, + { + "epoch": 0.20501088108922494, + "grad_norm": 0.82421875, + "learning_rate": 3.287054517728377e-05, + "loss": 0.7818, + "step": 4676 + }, + { + "epoch": 0.20505472430588217, + "grad_norm": 0.92578125, + "learning_rate": 3.286713299870685e-05, + "loss": 0.7496, + "step": 4677 + }, + { + "epoch": 0.2050985675225394, + "grad_norm": 0.76953125, + "learning_rate": 3.28637209624142e-05, + "loss": 0.7126, + "step": 4678 + }, + { + "epoch": 0.20514241073919665, + "grad_norm": 0.8359375, + "learning_rate": 3.286030906841306e-05, + "loss": 0.7019, + "step": 4679 + }, + { + "epoch": 0.20518625395585385, + "grad_norm": 0.8671875, + "learning_rate": 3.285689731671062e-05, + "loss": 0.87, + "step": 4680 + }, + { + "epoch": 0.2052300971725111, + "grad_norm": 0.8359375, + "learning_rate": 3.285348570731419e-05, + "loss": 0.9448, + "step": 4681 + }, + { + "epoch": 0.20527394038916832, + "grad_norm": 0.8125, + "learning_rate": 3.285007424023095e-05, + "loss": 0.7184, + "step": 4682 + }, + { + "epoch": 0.20531778360582556, + "grad_norm": 0.83203125, + "learning_rate": 3.284666291546815e-05, + "loss": 0.8145, + "step": 4683 + }, + { + "epoch": 0.2053616268224828, + "grad_norm": 0.72265625, + "learning_rate": 3.2843251733032996e-05, + "loss": 0.6664, + "step": 4684 + }, + { + "epoch": 0.20540547003914003, + "grad_norm": 0.85546875, + "learning_rate": 3.2839840692932744e-05, + "loss": 0.9066, + "step": 4685 + }, + { + "epoch": 0.20544931325579727, + "grad_norm": 0.8671875, + "learning_rate": 3.283642979517457e-05, + "loss": 0.937, + "step": 4686 + }, + { + "epoch": 0.2054931564724545, + "grad_norm": 0.83984375, + "learning_rate": 3.283301903976578e-05, + "loss": 0.8773, + "step": 4687 + }, + { + "epoch": 0.20553699968911174, + "grad_norm": 0.85546875, + "learning_rate": 3.2829608426713566e-05, + "loss": 0.9601, + "step": 4688 + }, + { + "epoch": 0.20558084290576897, + "grad_norm": 0.82421875, + "learning_rate": 3.282619795602516e-05, + "loss": 0.8464, + "step": 4689 + }, + { + "epoch": 0.2056246861224262, + "grad_norm": 0.734375, + "learning_rate": 3.2822787627707784e-05, + "loss": 0.6838, + "step": 4690 + }, + { + "epoch": 0.20566852933908344, + "grad_norm": 1.015625, + "learning_rate": 3.281937744176864e-05, + "loss": 0.8226, + "step": 4691 + }, + { + "epoch": 0.20571237255574068, + "grad_norm": 0.8125, + "learning_rate": 3.281596739821502e-05, + "loss": 0.7589, + "step": 4692 + }, + { + "epoch": 0.20575621577239792, + "grad_norm": 0.83203125, + "learning_rate": 3.2812557497054106e-05, + "loss": 0.8046, + "step": 4693 + }, + { + "epoch": 0.20580005898905515, + "grad_norm": 0.8359375, + "learning_rate": 3.2809147738293154e-05, + "loss": 0.731, + "step": 4694 + }, + { + "epoch": 0.20584390220571236, + "grad_norm": 0.81640625, + "learning_rate": 3.2805738121939365e-05, + "loss": 0.7802, + "step": 4695 + }, + { + "epoch": 0.2058877454223696, + "grad_norm": 0.7421875, + "learning_rate": 3.2802328647999936e-05, + "loss": 0.7687, + "step": 4696 + }, + { + "epoch": 0.20593158863902683, + "grad_norm": 0.84375, + "learning_rate": 3.2798919316482166e-05, + "loss": 0.7423, + "step": 4697 + }, + { + "epoch": 0.20597543185568407, + "grad_norm": 0.984375, + "learning_rate": 3.279551012739325e-05, + "loss": 0.8728, + "step": 4698 + }, + { + "epoch": 0.2060192750723413, + "grad_norm": 0.77734375, + "learning_rate": 3.2792101080740404e-05, + "loss": 0.7654, + "step": 4699 + }, + { + "epoch": 0.20606311828899854, + "grad_norm": 0.83984375, + "learning_rate": 3.278869217653085e-05, + "loss": 0.8011, + "step": 4700 + }, + { + "epoch": 0.20610696150565577, + "grad_norm": 0.828125, + "learning_rate": 3.278528341477184e-05, + "loss": 0.8733, + "step": 4701 + }, + { + "epoch": 0.206150804722313, + "grad_norm": 0.77734375, + "learning_rate": 3.278187479547057e-05, + "loss": 0.7407, + "step": 4702 + }, + { + "epoch": 0.20619464793897024, + "grad_norm": 0.8671875, + "learning_rate": 3.2778466318634284e-05, + "loss": 0.8858, + "step": 4703 + }, + { + "epoch": 0.20623849115562748, + "grad_norm": 0.78125, + "learning_rate": 3.277505798427019e-05, + "loss": 0.7265, + "step": 4704 + }, + { + "epoch": 0.20628233437228471, + "grad_norm": 0.84375, + "learning_rate": 3.277164979238551e-05, + "loss": 0.7928, + "step": 4705 + }, + { + "epoch": 0.20632617758894195, + "grad_norm": 0.82421875, + "learning_rate": 3.2768241742987446e-05, + "loss": 0.8615, + "step": 4706 + }, + { + "epoch": 0.20637002080559919, + "grad_norm": 0.796875, + "learning_rate": 3.276483383608328e-05, + "loss": 0.7801, + "step": 4707 + }, + { + "epoch": 0.20641386402225642, + "grad_norm": 0.8046875, + "learning_rate": 3.276142607168022e-05, + "loss": 0.7968, + "step": 4708 + }, + { + "epoch": 0.20645770723891366, + "grad_norm": 0.8359375, + "learning_rate": 3.275801844978547e-05, + "loss": 0.7962, + "step": 4709 + }, + { + "epoch": 0.20650155045557086, + "grad_norm": 0.86328125, + "learning_rate": 3.275461097040625e-05, + "loss": 0.8111, + "step": 4710 + }, + { + "epoch": 0.2065453936722281, + "grad_norm": 0.84765625, + "learning_rate": 3.275120363354975e-05, + "loss": 0.9146, + "step": 4711 + }, + { + "epoch": 0.20658923688888534, + "grad_norm": 0.80859375, + "learning_rate": 3.274779643922328e-05, + "loss": 0.788, + "step": 4712 + }, + { + "epoch": 0.20663308010554257, + "grad_norm": 0.87109375, + "learning_rate": 3.2744389387434e-05, + "loss": 0.7366, + "step": 4713 + }, + { + "epoch": 0.2066769233221998, + "grad_norm": 0.9375, + "learning_rate": 3.274098247818915e-05, + "loss": 0.8609, + "step": 4714 + }, + { + "epoch": 0.20672076653885704, + "grad_norm": 0.81640625, + "learning_rate": 3.273757571149594e-05, + "loss": 0.7547, + "step": 4715 + }, + { + "epoch": 0.20676460975551428, + "grad_norm": 0.8984375, + "learning_rate": 3.273416908736155e-05, + "loss": 0.8862, + "step": 4716 + }, + { + "epoch": 0.2068084529721715, + "grad_norm": 0.80859375, + "learning_rate": 3.27307626057933e-05, + "loss": 0.7906, + "step": 4717 + }, + { + "epoch": 0.20685229618882875, + "grad_norm": 0.9140625, + "learning_rate": 3.272735626679834e-05, + "loss": 0.9497, + "step": 4718 + }, + { + "epoch": 0.20689613940548598, + "grad_norm": 0.80859375, + "learning_rate": 3.27239500703839e-05, + "loss": 0.9751, + "step": 4719 + }, + { + "epoch": 0.20693998262214322, + "grad_norm": 0.7734375, + "learning_rate": 3.272054401655721e-05, + "loss": 0.8174, + "step": 4720 + }, + { + "epoch": 0.20698382583880046, + "grad_norm": 0.7890625, + "learning_rate": 3.271713810532544e-05, + "loss": 0.7817, + "step": 4721 + }, + { + "epoch": 0.2070276690554577, + "grad_norm": 1.234375, + "learning_rate": 3.271373233669589e-05, + "loss": 0.982, + "step": 4722 + }, + { + "epoch": 0.20707151227211493, + "grad_norm": 0.89453125, + "learning_rate": 3.271032671067573e-05, + "loss": 0.8443, + "step": 4723 + }, + { + "epoch": 0.20711535548877216, + "grad_norm": 0.80859375, + "learning_rate": 3.27069212272722e-05, + "loss": 0.8725, + "step": 4724 + }, + { + "epoch": 0.2071591987054294, + "grad_norm": 0.76171875, + "learning_rate": 3.2703515886492496e-05, + "loss": 0.8021, + "step": 4725 + }, + { + "epoch": 0.2072030419220866, + "grad_norm": 0.8046875, + "learning_rate": 3.270011068834381e-05, + "loss": 0.8764, + "step": 4726 + }, + { + "epoch": 0.20724688513874384, + "grad_norm": 0.84375, + "learning_rate": 3.269670563283342e-05, + "loss": 0.8708, + "step": 4727 + }, + { + "epoch": 0.20729072835540108, + "grad_norm": 0.80078125, + "learning_rate": 3.269330071996853e-05, + "loss": 0.7696, + "step": 4728 + }, + { + "epoch": 0.2073345715720583, + "grad_norm": 0.90625, + "learning_rate": 3.268989594975633e-05, + "loss": 0.7317, + "step": 4729 + }, + { + "epoch": 0.20737841478871555, + "grad_norm": 0.90234375, + "learning_rate": 3.268649132220406e-05, + "loss": 0.8479, + "step": 4730 + }, + { + "epoch": 0.20742225800537278, + "grad_norm": 0.7265625, + "learning_rate": 3.268308683731889e-05, + "loss": 0.7408, + "step": 4731 + }, + { + "epoch": 0.20746610122203002, + "grad_norm": 0.8203125, + "learning_rate": 3.267968249510809e-05, + "loss": 0.864, + "step": 4732 + }, + { + "epoch": 0.20750994443868725, + "grad_norm": 0.81640625, + "learning_rate": 3.2676278295578866e-05, + "loss": 0.871, + "step": 4733 + }, + { + "epoch": 0.2075537876553445, + "grad_norm": 0.89453125, + "learning_rate": 3.267287423873842e-05, + "loss": 0.9574, + "step": 4734 + }, + { + "epoch": 0.20759763087200173, + "grad_norm": 0.8125, + "learning_rate": 3.266947032459396e-05, + "loss": 0.9104, + "step": 4735 + }, + { + "epoch": 0.20764147408865896, + "grad_norm": 0.79296875, + "learning_rate": 3.266606655315269e-05, + "loss": 0.7023, + "step": 4736 + }, + { + "epoch": 0.2076853173053162, + "grad_norm": 0.8671875, + "learning_rate": 3.266266292442186e-05, + "loss": 0.8963, + "step": 4737 + }, + { + "epoch": 0.20772916052197343, + "grad_norm": 0.8515625, + "learning_rate": 3.2659259438408684e-05, + "loss": 0.6561, + "step": 4738 + }, + { + "epoch": 0.20777300373863067, + "grad_norm": 0.86328125, + "learning_rate": 3.265585609512035e-05, + "loss": 0.8406, + "step": 4739 + }, + { + "epoch": 0.2078168469552879, + "grad_norm": 0.8515625, + "learning_rate": 3.265245289456408e-05, + "loss": 0.8481, + "step": 4740 + }, + { + "epoch": 0.2078606901719451, + "grad_norm": 0.90234375, + "learning_rate": 3.2649049836747093e-05, + "loss": 0.7782, + "step": 4741 + }, + { + "epoch": 0.20790453338860235, + "grad_norm": 0.7578125, + "learning_rate": 3.2645646921676584e-05, + "loss": 0.822, + "step": 4742 + }, + { + "epoch": 0.20794837660525958, + "grad_norm": 0.88671875, + "learning_rate": 3.264224414935978e-05, + "loss": 0.8591, + "step": 4743 + }, + { + "epoch": 0.20799221982191682, + "grad_norm": 0.8359375, + "learning_rate": 3.263884151980387e-05, + "loss": 0.9194, + "step": 4744 + }, + { + "epoch": 0.20803606303857405, + "grad_norm": 0.9140625, + "learning_rate": 3.26354390330161e-05, + "loss": 0.8164, + "step": 4745 + }, + { + "epoch": 0.2080799062552313, + "grad_norm": 0.8203125, + "learning_rate": 3.263203668900368e-05, + "loss": 0.7635, + "step": 4746 + }, + { + "epoch": 0.20812374947188853, + "grad_norm": 0.7734375, + "learning_rate": 3.2628634487773803e-05, + "loss": 0.889, + "step": 4747 + }, + { + "epoch": 0.20816759268854576, + "grad_norm": 0.88671875, + "learning_rate": 3.2625232429333674e-05, + "loss": 0.8277, + "step": 4748 + }, + { + "epoch": 0.208211435905203, + "grad_norm": 0.9140625, + "learning_rate": 3.262183051369053e-05, + "loss": 0.9294, + "step": 4749 + }, + { + "epoch": 0.20825527912186023, + "grad_norm": 0.84375, + "learning_rate": 3.2618428740851556e-05, + "loss": 0.8554, + "step": 4750 + }, + { + "epoch": 0.20829912233851747, + "grad_norm": 0.8828125, + "learning_rate": 3.261502711082394e-05, + "loss": 0.7512, + "step": 4751 + }, + { + "epoch": 0.2083429655551747, + "grad_norm": 0.83984375, + "learning_rate": 3.261162562361496e-05, + "loss": 0.9299, + "step": 4752 + }, + { + "epoch": 0.20838680877183194, + "grad_norm": 0.7890625, + "learning_rate": 3.260822427923178e-05, + "loss": 0.7765, + "step": 4753 + }, + { + "epoch": 0.20843065198848917, + "grad_norm": 0.75, + "learning_rate": 3.260482307768162e-05, + "loss": 0.7742, + "step": 4754 + }, + { + "epoch": 0.2084744952051464, + "grad_norm": 0.90625, + "learning_rate": 3.2601422018971684e-05, + "loss": 0.8645, + "step": 4755 + }, + { + "epoch": 0.20851833842180362, + "grad_norm": 0.74609375, + "learning_rate": 3.259802110310916e-05, + "loss": 0.7096, + "step": 4756 + }, + { + "epoch": 0.20856218163846085, + "grad_norm": 0.8671875, + "learning_rate": 3.259462033010129e-05, + "loss": 0.8633, + "step": 4757 + }, + { + "epoch": 0.2086060248551181, + "grad_norm": 0.84765625, + "learning_rate": 3.259121969995528e-05, + "loss": 0.914, + "step": 4758 + }, + { + "epoch": 0.20864986807177532, + "grad_norm": 0.82421875, + "learning_rate": 3.258781921267833e-05, + "loss": 0.8378, + "step": 4759 + }, + { + "epoch": 0.20869371128843256, + "grad_norm": 0.765625, + "learning_rate": 3.258441886827765e-05, + "loss": 0.7277, + "step": 4760 + }, + { + "epoch": 0.2087375545050898, + "grad_norm": 0.82421875, + "learning_rate": 3.258101866676039e-05, + "loss": 0.7625, + "step": 4761 + }, + { + "epoch": 0.20878139772174703, + "grad_norm": 0.84765625, + "learning_rate": 3.257761860813385e-05, + "loss": 0.7439, + "step": 4762 + }, + { + "epoch": 0.20882524093840427, + "grad_norm": 0.8671875, + "learning_rate": 3.25742186924052e-05, + "loss": 0.8188, + "step": 4763 + }, + { + "epoch": 0.2088690841550615, + "grad_norm": 0.89453125, + "learning_rate": 3.257081891958162e-05, + "loss": 0.8916, + "step": 4764 + }, + { + "epoch": 0.20891292737171874, + "grad_norm": 0.83203125, + "learning_rate": 3.2567419289670355e-05, + "loss": 0.8554, + "step": 4765 + }, + { + "epoch": 0.20895677058837597, + "grad_norm": 0.8046875, + "learning_rate": 3.2564019802678555e-05, + "loss": 0.8363, + "step": 4766 + }, + { + "epoch": 0.2090006138050332, + "grad_norm": 0.82421875, + "learning_rate": 3.256062045861349e-05, + "loss": 0.7839, + "step": 4767 + }, + { + "epoch": 0.20904445702169044, + "grad_norm": 1.0390625, + "learning_rate": 3.255722125748233e-05, + "loss": 0.972, + "step": 4768 + }, + { + "epoch": 0.20908830023834768, + "grad_norm": 0.8359375, + "learning_rate": 3.25538221992923e-05, + "loss": 0.8676, + "step": 4769 + }, + { + "epoch": 0.20913214345500492, + "grad_norm": 0.9453125, + "learning_rate": 3.255042328405057e-05, + "loss": 0.808, + "step": 4770 + }, + { + "epoch": 0.20917598667166212, + "grad_norm": 0.83203125, + "learning_rate": 3.2547024511764334e-05, + "loss": 0.7068, + "step": 4771 + }, + { + "epoch": 0.20921982988831936, + "grad_norm": 0.828125, + "learning_rate": 3.254362588244086e-05, + "loss": 0.7325, + "step": 4772 + }, + { + "epoch": 0.2092636731049766, + "grad_norm": 0.84765625, + "learning_rate": 3.254022739608732e-05, + "loss": 0.8818, + "step": 4773 + }, + { + "epoch": 0.20930751632163383, + "grad_norm": 0.7421875, + "learning_rate": 3.253682905271092e-05, + "loss": 0.6749, + "step": 4774 + }, + { + "epoch": 0.20935135953829107, + "grad_norm": 0.75, + "learning_rate": 3.253343085231884e-05, + "loss": 0.7585, + "step": 4775 + }, + { + "epoch": 0.2093952027549483, + "grad_norm": 0.859375, + "learning_rate": 3.2530032794918265e-05, + "loss": 0.8574, + "step": 4776 + }, + { + "epoch": 0.20943904597160554, + "grad_norm": 0.8125, + "learning_rate": 3.252663488051646e-05, + "loss": 0.8913, + "step": 4777 + }, + { + "epoch": 0.20948288918826277, + "grad_norm": 0.796875, + "learning_rate": 3.252323710912061e-05, + "loss": 0.8236, + "step": 4778 + }, + { + "epoch": 0.20952673240492, + "grad_norm": 0.8828125, + "learning_rate": 3.251983948073789e-05, + "loss": 0.9271, + "step": 4779 + }, + { + "epoch": 0.20957057562157724, + "grad_norm": 0.8671875, + "learning_rate": 3.251644199537551e-05, + "loss": 0.8836, + "step": 4780 + }, + { + "epoch": 0.20961441883823448, + "grad_norm": 1.1328125, + "learning_rate": 3.251304465304065e-05, + "loss": 0.7834, + "step": 4781 + }, + { + "epoch": 0.20965826205489171, + "grad_norm": 0.84375, + "learning_rate": 3.250964745374057e-05, + "loss": 0.9737, + "step": 4782 + }, + { + "epoch": 0.20970210527154895, + "grad_norm": 0.8828125, + "learning_rate": 3.2506250397482415e-05, + "loss": 0.8856, + "step": 4783 + }, + { + "epoch": 0.20974594848820619, + "grad_norm": 0.9375, + "learning_rate": 3.2502853484273423e-05, + "loss": 0.8878, + "step": 4784 + }, + { + "epoch": 0.20978979170486342, + "grad_norm": 0.8359375, + "learning_rate": 3.249945671412077e-05, + "loss": 0.9177, + "step": 4785 + }, + { + "epoch": 0.20983363492152063, + "grad_norm": 0.8203125, + "learning_rate": 3.2496060087031666e-05, + "loss": 0.945, + "step": 4786 + }, + { + "epoch": 0.20987747813817786, + "grad_norm": 0.85546875, + "learning_rate": 3.249266360301329e-05, + "loss": 0.8134, + "step": 4787 + }, + { + "epoch": 0.2099213213548351, + "grad_norm": 0.78515625, + "learning_rate": 3.248926726207288e-05, + "loss": 0.8142, + "step": 4788 + }, + { + "epoch": 0.20996516457149234, + "grad_norm": 0.765625, + "learning_rate": 3.248587106421759e-05, + "loss": 0.6959, + "step": 4789 + }, + { + "epoch": 0.21000900778814957, + "grad_norm": 0.75390625, + "learning_rate": 3.248247500945464e-05, + "loss": 0.7946, + "step": 4790 + }, + { + "epoch": 0.2100528510048068, + "grad_norm": 0.7578125, + "learning_rate": 3.247907909779119e-05, + "loss": 0.725, + "step": 4791 + }, + { + "epoch": 0.21009669422146404, + "grad_norm": 0.8046875, + "learning_rate": 3.247568332923451e-05, + "loss": 0.8355, + "step": 4792 + }, + { + "epoch": 0.21014053743812128, + "grad_norm": 0.8125, + "learning_rate": 3.2472287703791756e-05, + "loss": 0.8039, + "step": 4793 + }, + { + "epoch": 0.2101843806547785, + "grad_norm": 0.85546875, + "learning_rate": 3.2468892221470135e-05, + "loss": 0.8541, + "step": 4794 + }, + { + "epoch": 0.21022822387143575, + "grad_norm": 0.734375, + "learning_rate": 3.246549688227683e-05, + "loss": 0.6747, + "step": 4795 + }, + { + "epoch": 0.21027206708809298, + "grad_norm": 0.77734375, + "learning_rate": 3.246210168621902e-05, + "loss": 0.6767, + "step": 4796 + }, + { + "epoch": 0.21031591030475022, + "grad_norm": 0.88671875, + "learning_rate": 3.245870663330395e-05, + "loss": 0.9188, + "step": 4797 + }, + { + "epoch": 0.21035975352140746, + "grad_norm": 0.78515625, + "learning_rate": 3.2455311723538794e-05, + "loss": 0.7925, + "step": 4798 + }, + { + "epoch": 0.2104035967380647, + "grad_norm": 0.7734375, + "learning_rate": 3.245191695693074e-05, + "loss": 0.7647, + "step": 4799 + }, + { + "epoch": 0.21044743995472193, + "grad_norm": 0.890625, + "learning_rate": 3.244852233348699e-05, + "loss": 0.853, + "step": 4800 + }, + { + "epoch": 0.21049128317137913, + "grad_norm": 0.87890625, + "learning_rate": 3.244512785321471e-05, + "loss": 0.7687, + "step": 4801 + }, + { + "epoch": 0.21053512638803637, + "grad_norm": 0.82421875, + "learning_rate": 3.2441733516121144e-05, + "loss": 0.6396, + "step": 4802 + }, + { + "epoch": 0.2105789696046936, + "grad_norm": 0.79296875, + "learning_rate": 3.2438339322213465e-05, + "loss": 0.7039, + "step": 4803 + }, + { + "epoch": 0.21062281282135084, + "grad_norm": 0.875, + "learning_rate": 3.243494527149886e-05, + "loss": 0.9172, + "step": 4804 + }, + { + "epoch": 0.21066665603800808, + "grad_norm": 0.73828125, + "learning_rate": 3.2431551363984534e-05, + "loss": 0.6894, + "step": 4805 + }, + { + "epoch": 0.2107104992546653, + "grad_norm": 0.8046875, + "learning_rate": 3.2428157599677634e-05, + "loss": 0.7101, + "step": 4806 + }, + { + "epoch": 0.21075434247132255, + "grad_norm": 0.85546875, + "learning_rate": 3.2424763978585425e-05, + "loss": 0.9645, + "step": 4807 + }, + { + "epoch": 0.21079818568797978, + "grad_norm": 1.2734375, + "learning_rate": 3.2421370500715076e-05, + "loss": 0.8045, + "step": 4808 + }, + { + "epoch": 0.21084202890463702, + "grad_norm": 0.80078125, + "learning_rate": 3.2417977166073756e-05, + "loss": 0.8605, + "step": 4809 + }, + { + "epoch": 0.21088587212129425, + "grad_norm": 0.82421875, + "learning_rate": 3.2414583974668676e-05, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.2109297153379515, + "grad_norm": 0.8515625, + "learning_rate": 3.2411190926507e-05, + "loss": 0.7086, + "step": 4811 + }, + { + "epoch": 0.21097355855460873, + "grad_norm": 0.9375, + "learning_rate": 3.240779802159596e-05, + "loss": 1.0161, + "step": 4812 + }, + { + "epoch": 0.21101740177126596, + "grad_norm": 0.87109375, + "learning_rate": 3.240440525994273e-05, + "loss": 0.9119, + "step": 4813 + }, + { + "epoch": 0.2110612449879232, + "grad_norm": 0.8046875, + "learning_rate": 3.240101264155451e-05, + "loss": 0.8226, + "step": 4814 + }, + { + "epoch": 0.21110508820458043, + "grad_norm": 0.85546875, + "learning_rate": 3.239762016643847e-05, + "loss": 0.8216, + "step": 4815 + }, + { + "epoch": 0.21114893142123767, + "grad_norm": 0.83984375, + "learning_rate": 3.239422783460177e-05, + "loss": 0.8892, + "step": 4816 + }, + { + "epoch": 0.21119277463789488, + "grad_norm": 1.328125, + "learning_rate": 3.2390835646051684e-05, + "loss": 0.8937, + "step": 4817 + }, + { + "epoch": 0.2112366178545521, + "grad_norm": 0.89453125, + "learning_rate": 3.2387443600795366e-05, + "loss": 0.9256, + "step": 4818 + }, + { + "epoch": 0.21128046107120935, + "grad_norm": 0.7890625, + "learning_rate": 3.238405169883998e-05, + "loss": 0.7258, + "step": 4819 + }, + { + "epoch": 0.21132430428786658, + "grad_norm": 0.890625, + "learning_rate": 3.238065994019275e-05, + "loss": 0.8381, + "step": 4820 + }, + { + "epoch": 0.21136814750452382, + "grad_norm": 0.8046875, + "learning_rate": 3.2377268324860796e-05, + "loss": 0.798, + "step": 4821 + }, + { + "epoch": 0.21141199072118105, + "grad_norm": 0.83203125, + "learning_rate": 3.237387685285139e-05, + "loss": 0.8486, + "step": 4822 + }, + { + "epoch": 0.2114558339378383, + "grad_norm": 0.79296875, + "learning_rate": 3.23704855241717e-05, + "loss": 0.8317, + "step": 4823 + }, + { + "epoch": 0.21149967715449552, + "grad_norm": 0.76171875, + "learning_rate": 3.2367094338828894e-05, + "loss": 0.8006, + "step": 4824 + }, + { + "epoch": 0.21154352037115276, + "grad_norm": 0.765625, + "learning_rate": 3.236370329683016e-05, + "loss": 0.7628, + "step": 4825 + }, + { + "epoch": 0.21158736358781, + "grad_norm": 0.84765625, + "learning_rate": 3.23603123981827e-05, + "loss": 0.8143, + "step": 4826 + }, + { + "epoch": 0.21163120680446723, + "grad_norm": 0.72265625, + "learning_rate": 3.235692164289369e-05, + "loss": 0.6908, + "step": 4827 + }, + { + "epoch": 0.21167505002112447, + "grad_norm": 0.78125, + "learning_rate": 3.235353103097031e-05, + "loss": 0.8393, + "step": 4828 + }, + { + "epoch": 0.2117188932377817, + "grad_norm": 0.78125, + "learning_rate": 3.235014056241973e-05, + "loss": 0.7568, + "step": 4829 + }, + { + "epoch": 0.21176273645443894, + "grad_norm": 0.81640625, + "learning_rate": 3.234675023724919e-05, + "loss": 0.7623, + "step": 4830 + }, + { + "epoch": 0.21180657967109617, + "grad_norm": 0.796875, + "learning_rate": 3.234336005546584e-05, + "loss": 0.7888, + "step": 4831 + }, + { + "epoch": 0.21185042288775338, + "grad_norm": 0.8203125, + "learning_rate": 3.233997001707688e-05, + "loss": 0.7422, + "step": 4832 + }, + { + "epoch": 0.21189426610441062, + "grad_norm": 0.83203125, + "learning_rate": 3.233658012208949e-05, + "loss": 0.8693, + "step": 4833 + }, + { + "epoch": 0.21193810932106785, + "grad_norm": 0.8125, + "learning_rate": 3.233319037051085e-05, + "loss": 0.811, + "step": 4834 + }, + { + "epoch": 0.2119819525377251, + "grad_norm": 0.81640625, + "learning_rate": 3.232980076234814e-05, + "loss": 0.8611, + "step": 4835 + }, + { + "epoch": 0.21202579575438232, + "grad_norm": 0.73828125, + "learning_rate": 3.232641129760855e-05, + "loss": 0.8368, + "step": 4836 + }, + { + "epoch": 0.21206963897103956, + "grad_norm": 0.87890625, + "learning_rate": 3.232302197629923e-05, + "loss": 0.838, + "step": 4837 + }, + { + "epoch": 0.2121134821876968, + "grad_norm": 0.8359375, + "learning_rate": 3.231963279842742e-05, + "loss": 0.8555, + "step": 4838 + }, + { + "epoch": 0.21215732540435403, + "grad_norm": 0.80078125, + "learning_rate": 3.23162437640003e-05, + "loss": 0.8245, + "step": 4839 + }, + { + "epoch": 0.21220116862101127, + "grad_norm": 0.80078125, + "learning_rate": 3.231285487302502e-05, + "loss": 0.813, + "step": 4840 + }, + { + "epoch": 0.2122450118376685, + "grad_norm": 0.77734375, + "learning_rate": 3.230946612550878e-05, + "loss": 0.7883, + "step": 4841 + }, + { + "epoch": 0.21228885505432574, + "grad_norm": 0.890625, + "learning_rate": 3.230607752145872e-05, + "loss": 0.81, + "step": 4842 + }, + { + "epoch": 0.21233269827098297, + "grad_norm": 0.76953125, + "learning_rate": 3.2302689060882094e-05, + "loss": 0.7936, + "step": 4843 + }, + { + "epoch": 0.2123765414876402, + "grad_norm": 0.8515625, + "learning_rate": 3.229930074378605e-05, + "loss": 0.9099, + "step": 4844 + }, + { + "epoch": 0.21242038470429744, + "grad_norm": 0.8046875, + "learning_rate": 3.2295912570177765e-05, + "loss": 0.8366, + "step": 4845 + }, + { + "epoch": 0.21246422792095468, + "grad_norm": 0.81640625, + "learning_rate": 3.229252454006443e-05, + "loss": 0.7604, + "step": 4846 + }, + { + "epoch": 0.2125080711376119, + "grad_norm": 0.90625, + "learning_rate": 3.228913665345319e-05, + "loss": 0.949, + "step": 4847 + }, + { + "epoch": 0.21255191435426912, + "grad_norm": 0.79296875, + "learning_rate": 3.2285748910351275e-05, + "loss": 0.8384, + "step": 4848 + }, + { + "epoch": 0.21259575757092636, + "grad_norm": 0.8125, + "learning_rate": 3.228236131076585e-05, + "loss": 0.8358, + "step": 4849 + }, + { + "epoch": 0.2126396007875836, + "grad_norm": 0.8203125, + "learning_rate": 3.22789738547041e-05, + "loss": 0.7419, + "step": 4850 + }, + { + "epoch": 0.21268344400424083, + "grad_norm": 0.68359375, + "learning_rate": 3.2275586542173174e-05, + "loss": 0.7158, + "step": 4851 + }, + { + "epoch": 0.21272728722089806, + "grad_norm": 0.79296875, + "learning_rate": 3.2272199373180256e-05, + "loss": 0.8515, + "step": 4852 + }, + { + "epoch": 0.2127711304375553, + "grad_norm": 0.828125, + "learning_rate": 3.226881234773257e-05, + "loss": 0.7895, + "step": 4853 + }, + { + "epoch": 0.21281497365421254, + "grad_norm": 0.8125, + "learning_rate": 3.2265425465837255e-05, + "loss": 0.8258, + "step": 4854 + }, + { + "epoch": 0.21285881687086977, + "grad_norm": 0.83203125, + "learning_rate": 3.226203872750151e-05, + "loss": 0.9047, + "step": 4855 + }, + { + "epoch": 0.212902660087527, + "grad_norm": 0.83984375, + "learning_rate": 3.2258652132732494e-05, + "loss": 0.8854, + "step": 4856 + }, + { + "epoch": 0.21294650330418424, + "grad_norm": 0.8359375, + "learning_rate": 3.225526568153736e-05, + "loss": 0.7978, + "step": 4857 + }, + { + "epoch": 0.21299034652084148, + "grad_norm": 0.7890625, + "learning_rate": 3.2251879373923356e-05, + "loss": 0.8111, + "step": 4858 + }, + { + "epoch": 0.2130341897374987, + "grad_norm": 0.8671875, + "learning_rate": 3.2248493209897626e-05, + "loss": 0.8345, + "step": 4859 + }, + { + "epoch": 0.21307803295415595, + "grad_norm": 0.7734375, + "learning_rate": 3.224510718946734e-05, + "loss": 0.8016, + "step": 4860 + }, + { + "epoch": 0.21312187617081318, + "grad_norm": 0.984375, + "learning_rate": 3.224172131263966e-05, + "loss": 0.9499, + "step": 4861 + }, + { + "epoch": 0.2131657193874704, + "grad_norm": 0.90234375, + "learning_rate": 3.2238335579421766e-05, + "loss": 0.8547, + "step": 4862 + }, + { + "epoch": 0.21320956260412763, + "grad_norm": 0.7734375, + "learning_rate": 3.2234949989820875e-05, + "loss": 0.7836, + "step": 4863 + }, + { + "epoch": 0.21325340582078486, + "grad_norm": 0.76953125, + "learning_rate": 3.223156454384413e-05, + "loss": 0.781, + "step": 4864 + }, + { + "epoch": 0.2132972490374421, + "grad_norm": 0.69140625, + "learning_rate": 3.2228179241498716e-05, + "loss": 0.7675, + "step": 4865 + }, + { + "epoch": 0.21334109225409933, + "grad_norm": 0.8984375, + "learning_rate": 3.22247940827918e-05, + "loss": 0.8444, + "step": 4866 + }, + { + "epoch": 0.21338493547075657, + "grad_norm": 0.7890625, + "learning_rate": 3.2221409067730524e-05, + "loss": 0.7897, + "step": 4867 + }, + { + "epoch": 0.2134287786874138, + "grad_norm": 0.83984375, + "learning_rate": 3.2218024196322126e-05, + "loss": 0.8633, + "step": 4868 + }, + { + "epoch": 0.21347262190407104, + "grad_norm": 0.78125, + "learning_rate": 3.221463946857376e-05, + "loss": 0.727, + "step": 4869 + }, + { + "epoch": 0.21351646512072828, + "grad_norm": 0.828125, + "learning_rate": 3.221125488449258e-05, + "loss": 0.7346, + "step": 4870 + }, + { + "epoch": 0.2135603083373855, + "grad_norm": 0.87890625, + "learning_rate": 3.2207870444085786e-05, + "loss": 0.8091, + "step": 4871 + }, + { + "epoch": 0.21360415155404275, + "grad_norm": 0.89453125, + "learning_rate": 3.2204486147360526e-05, + "loss": 0.8639, + "step": 4872 + }, + { + "epoch": 0.21364799477069998, + "grad_norm": 0.75, + "learning_rate": 3.220110199432399e-05, + "loss": 0.7854, + "step": 4873 + }, + { + "epoch": 0.21369183798735722, + "grad_norm": 0.80078125, + "learning_rate": 3.219771798498333e-05, + "loss": 0.8816, + "step": 4874 + }, + { + "epoch": 0.21373568120401445, + "grad_norm": 0.7421875, + "learning_rate": 3.219433411934575e-05, + "loss": 0.7503, + "step": 4875 + }, + { + "epoch": 0.2137795244206717, + "grad_norm": 0.80078125, + "learning_rate": 3.2190950397418386e-05, + "loss": 0.7713, + "step": 4876 + }, + { + "epoch": 0.2138233676373289, + "grad_norm": 0.84765625, + "learning_rate": 3.2187566819208406e-05, + "loss": 0.734, + "step": 4877 + }, + { + "epoch": 0.21386721085398613, + "grad_norm": 0.80078125, + "learning_rate": 3.218418338472302e-05, + "loss": 0.7725, + "step": 4878 + }, + { + "epoch": 0.21391105407064337, + "grad_norm": 0.8203125, + "learning_rate": 3.21808000939694e-05, + "loss": 0.8365, + "step": 4879 + }, + { + "epoch": 0.2139548972873006, + "grad_norm": 0.86328125, + "learning_rate": 3.2177416946954685e-05, + "loss": 0.7617, + "step": 4880 + }, + { + "epoch": 0.21399874050395784, + "grad_norm": 0.921875, + "learning_rate": 3.2174033943686065e-05, + "loss": 0.8311, + "step": 4881 + }, + { + "epoch": 0.21404258372061508, + "grad_norm": 0.875, + "learning_rate": 3.2170651084170666e-05, + "loss": 0.8935, + "step": 4882 + }, + { + "epoch": 0.2140864269372723, + "grad_norm": 1.0546875, + "learning_rate": 3.2167268368415725e-05, + "loss": 0.7871, + "step": 4883 + }, + { + "epoch": 0.21413027015392955, + "grad_norm": 0.83203125, + "learning_rate": 3.2163885796428385e-05, + "loss": 0.7688, + "step": 4884 + }, + { + "epoch": 0.21417411337058678, + "grad_norm": 0.75390625, + "learning_rate": 3.216050336821582e-05, + "loss": 0.7415, + "step": 4885 + }, + { + "epoch": 0.21421795658724402, + "grad_norm": 0.77734375, + "learning_rate": 3.215712108378518e-05, + "loss": 0.8083, + "step": 4886 + }, + { + "epoch": 0.21426179980390125, + "grad_norm": 0.86328125, + "learning_rate": 3.215373894314362e-05, + "loss": 0.9009, + "step": 4887 + }, + { + "epoch": 0.2143056430205585, + "grad_norm": 0.81640625, + "learning_rate": 3.2150356946298354e-05, + "loss": 0.8441, + "step": 4888 + }, + { + "epoch": 0.21434948623721573, + "grad_norm": 0.8203125, + "learning_rate": 3.214697509325654e-05, + "loss": 0.8678, + "step": 4889 + }, + { + "epoch": 0.21439332945387296, + "grad_norm": 0.8359375, + "learning_rate": 3.214359338402533e-05, + "loss": 0.7104, + "step": 4890 + }, + { + "epoch": 0.2144371726705302, + "grad_norm": 0.7890625, + "learning_rate": 3.2140211818611886e-05, + "loss": 0.8435, + "step": 4891 + }, + { + "epoch": 0.2144810158871874, + "grad_norm": 0.7890625, + "learning_rate": 3.2136830397023366e-05, + "loss": 0.9381, + "step": 4892 + }, + { + "epoch": 0.21452485910384464, + "grad_norm": 0.79296875, + "learning_rate": 3.2133449119266976e-05, + "loss": 0.7691, + "step": 4893 + }, + { + "epoch": 0.21456870232050188, + "grad_norm": 0.79296875, + "learning_rate": 3.2130067985349866e-05, + "loss": 0.7519, + "step": 4894 + }, + { + "epoch": 0.2146125455371591, + "grad_norm": 0.8203125, + "learning_rate": 3.21266869952792e-05, + "loss": 0.9113, + "step": 4895 + }, + { + "epoch": 0.21465638875381635, + "grad_norm": 0.8515625, + "learning_rate": 3.212330614906214e-05, + "loss": 0.7966, + "step": 4896 + }, + { + "epoch": 0.21470023197047358, + "grad_norm": 0.8515625, + "learning_rate": 3.211992544670582e-05, + "loss": 0.7676, + "step": 4897 + }, + { + "epoch": 0.21474407518713082, + "grad_norm": 0.8046875, + "learning_rate": 3.211654488821747e-05, + "loss": 0.7492, + "step": 4898 + }, + { + "epoch": 0.21478791840378805, + "grad_norm": 0.8125, + "learning_rate": 3.211316447360422e-05, + "loss": 0.7714, + "step": 4899 + }, + { + "epoch": 0.2148317616204453, + "grad_norm": 0.86328125, + "learning_rate": 3.210978420287324e-05, + "loss": 0.7599, + "step": 4900 + }, + { + "epoch": 0.21487560483710252, + "grad_norm": 0.82421875, + "learning_rate": 3.2106404076031696e-05, + "loss": 0.7233, + "step": 4901 + }, + { + "epoch": 0.21491944805375976, + "grad_norm": 0.7421875, + "learning_rate": 3.210302409308672e-05, + "loss": 0.8131, + "step": 4902 + }, + { + "epoch": 0.214963291270417, + "grad_norm": 0.84765625, + "learning_rate": 3.2099644254045524e-05, + "loss": 0.7838, + "step": 4903 + }, + { + "epoch": 0.21500713448707423, + "grad_norm": 0.69921875, + "learning_rate": 3.2096264558915254e-05, + "loss": 0.7437, + "step": 4904 + }, + { + "epoch": 0.21505097770373147, + "grad_norm": 0.89453125, + "learning_rate": 3.209288500770308e-05, + "loss": 0.8067, + "step": 4905 + }, + { + "epoch": 0.2150948209203887, + "grad_norm": 0.8125, + "learning_rate": 3.208950560041614e-05, + "loss": 0.8924, + "step": 4906 + }, + { + "epoch": 0.21513866413704594, + "grad_norm": 0.796875, + "learning_rate": 3.2086126337061585e-05, + "loss": 0.8021, + "step": 4907 + }, + { + "epoch": 0.21518250735370315, + "grad_norm": 1.3671875, + "learning_rate": 3.208274721764664e-05, + "loss": 0.7814, + "step": 4908 + }, + { + "epoch": 0.21522635057036038, + "grad_norm": 0.76953125, + "learning_rate": 3.207936824217843e-05, + "loss": 0.827, + "step": 4909 + }, + { + "epoch": 0.21527019378701762, + "grad_norm": 0.84375, + "learning_rate": 3.207598941066411e-05, + "loss": 0.8858, + "step": 4910 + }, + { + "epoch": 0.21531403700367485, + "grad_norm": 0.78125, + "learning_rate": 3.207261072311085e-05, + "loss": 0.7401, + "step": 4911 + }, + { + "epoch": 0.2153578802203321, + "grad_norm": 0.81640625, + "learning_rate": 3.206923217952581e-05, + "loss": 0.8339, + "step": 4912 + }, + { + "epoch": 0.21540172343698932, + "grad_norm": 0.8203125, + "learning_rate": 3.2065853779916156e-05, + "loss": 0.6695, + "step": 4913 + }, + { + "epoch": 0.21544556665364656, + "grad_norm": 0.82421875, + "learning_rate": 3.206247552428904e-05, + "loss": 0.7851, + "step": 4914 + }, + { + "epoch": 0.2154894098703038, + "grad_norm": 0.84375, + "learning_rate": 3.205909741265159e-05, + "loss": 0.8764, + "step": 4915 + }, + { + "epoch": 0.21553325308696103, + "grad_norm": 0.8359375, + "learning_rate": 3.205571944501102e-05, + "loss": 0.8922, + "step": 4916 + }, + { + "epoch": 0.21557709630361827, + "grad_norm": 0.796875, + "learning_rate": 3.205234162137448e-05, + "loss": 0.8066, + "step": 4917 + }, + { + "epoch": 0.2156209395202755, + "grad_norm": 0.8046875, + "learning_rate": 3.204896394174912e-05, + "loss": 0.7914, + "step": 4918 + }, + { + "epoch": 0.21566478273693274, + "grad_norm": 0.8046875, + "learning_rate": 3.204558640614209e-05, + "loss": 0.7492, + "step": 4919 + }, + { + "epoch": 0.21570862595358997, + "grad_norm": 0.88671875, + "learning_rate": 3.2042209014560565e-05, + "loss": 0.9134, + "step": 4920 + }, + { + "epoch": 0.2157524691702472, + "grad_norm": 0.828125, + "learning_rate": 3.203883176701169e-05, + "loss": 0.8288, + "step": 4921 + }, + { + "epoch": 0.21579631238690444, + "grad_norm": 0.87890625, + "learning_rate": 3.2035454663502594e-05, + "loss": 0.897, + "step": 4922 + }, + { + "epoch": 0.21584015560356165, + "grad_norm": 0.859375, + "learning_rate": 3.2032077704040496e-05, + "loss": 0.8354, + "step": 4923 + }, + { + "epoch": 0.2158839988202189, + "grad_norm": 0.78515625, + "learning_rate": 3.202870088863253e-05, + "loss": 0.7964, + "step": 4924 + }, + { + "epoch": 0.21592784203687612, + "grad_norm": 0.75, + "learning_rate": 3.2025324217285844e-05, + "loss": 0.7162, + "step": 4925 + }, + { + "epoch": 0.21597168525353336, + "grad_norm": 0.7734375, + "learning_rate": 3.20219476900076e-05, + "loss": 0.7661, + "step": 4926 + }, + { + "epoch": 0.2160155284701906, + "grad_norm": 0.87109375, + "learning_rate": 3.201857130680492e-05, + "loss": 0.8721, + "step": 4927 + }, + { + "epoch": 0.21605937168684783, + "grad_norm": 0.85546875, + "learning_rate": 3.2015195067685025e-05, + "loss": 0.764, + "step": 4928 + }, + { + "epoch": 0.21610321490350506, + "grad_norm": 0.77734375, + "learning_rate": 3.201181897265504e-05, + "loss": 0.7091, + "step": 4929 + }, + { + "epoch": 0.2161470581201623, + "grad_norm": 0.86328125, + "learning_rate": 3.2008443021722114e-05, + "loss": 0.8283, + "step": 4930 + }, + { + "epoch": 0.21619090133681954, + "grad_norm": 0.7578125, + "learning_rate": 3.20050672148934e-05, + "loss": 0.8542, + "step": 4931 + }, + { + "epoch": 0.21623474455347677, + "grad_norm": 0.80859375, + "learning_rate": 3.200169155217604e-05, + "loss": 0.8041, + "step": 4932 + }, + { + "epoch": 0.216278587770134, + "grad_norm": 0.86328125, + "learning_rate": 3.199831603357724e-05, + "loss": 0.8703, + "step": 4933 + }, + { + "epoch": 0.21632243098679124, + "grad_norm": 0.83984375, + "learning_rate": 3.199494065910411e-05, + "loss": 0.794, + "step": 4934 + }, + { + "epoch": 0.21636627420344848, + "grad_norm": 0.8359375, + "learning_rate": 3.199156542876383e-05, + "loss": 0.7825, + "step": 4935 + }, + { + "epoch": 0.2164101174201057, + "grad_norm": 0.90234375, + "learning_rate": 3.198819034256354e-05, + "loss": 0.8847, + "step": 4936 + }, + { + "epoch": 0.21645396063676295, + "grad_norm": 0.9609375, + "learning_rate": 3.198481540051035e-05, + "loss": 0.6839, + "step": 4937 + }, + { + "epoch": 0.21649780385342016, + "grad_norm": 0.90625, + "learning_rate": 3.198144060261149e-05, + "loss": 0.7573, + "step": 4938 + }, + { + "epoch": 0.2165416470700774, + "grad_norm": 0.890625, + "learning_rate": 3.197806594887409e-05, + "loss": 1.0468, + "step": 4939 + }, + { + "epoch": 0.21658549028673463, + "grad_norm": 0.828125, + "learning_rate": 3.197469143930528e-05, + "loss": 0.9636, + "step": 4940 + }, + { + "epoch": 0.21662933350339186, + "grad_norm": 0.89453125, + "learning_rate": 3.1971317073912234e-05, + "loss": 0.9187, + "step": 4941 + }, + { + "epoch": 0.2166731767200491, + "grad_norm": 0.8671875, + "learning_rate": 3.196794285270205e-05, + "loss": 0.7319, + "step": 4942 + }, + { + "epoch": 0.21671701993670633, + "grad_norm": 0.84375, + "learning_rate": 3.1964568775681956e-05, + "loss": 0.8843, + "step": 4943 + }, + { + "epoch": 0.21676086315336357, + "grad_norm": 0.765625, + "learning_rate": 3.196119484285908e-05, + "loss": 0.8649, + "step": 4944 + }, + { + "epoch": 0.2168047063700208, + "grad_norm": 0.82421875, + "learning_rate": 3.195782105424055e-05, + "loss": 0.8545, + "step": 4945 + }, + { + "epoch": 0.21684854958667804, + "grad_norm": 0.78515625, + "learning_rate": 3.1954447409833546e-05, + "loss": 0.8462, + "step": 4946 + }, + { + "epoch": 0.21689239280333528, + "grad_norm": 0.8046875, + "learning_rate": 3.195107390964515e-05, + "loss": 0.7357, + "step": 4947 + }, + { + "epoch": 0.2169362360199925, + "grad_norm": 0.90234375, + "learning_rate": 3.19477005536826e-05, + "loss": 0.9094, + "step": 4948 + }, + { + "epoch": 0.21698007923664975, + "grad_norm": 0.7265625, + "learning_rate": 3.194432734195302e-05, + "loss": 0.6969, + "step": 4949 + }, + { + "epoch": 0.21702392245330698, + "grad_norm": 0.83984375, + "learning_rate": 3.194095427446354e-05, + "loss": 0.8643, + "step": 4950 + }, + { + "epoch": 0.21706776566996422, + "grad_norm": 0.734375, + "learning_rate": 3.1937581351221316e-05, + "loss": 0.7242, + "step": 4951 + }, + { + "epoch": 0.21711160888662145, + "grad_norm": 0.8203125, + "learning_rate": 3.193420857223347e-05, + "loss": 0.8743, + "step": 4952 + }, + { + "epoch": 0.21715545210327866, + "grad_norm": 0.78125, + "learning_rate": 3.193083593750721e-05, + "loss": 0.6879, + "step": 4953 + }, + { + "epoch": 0.2171992953199359, + "grad_norm": 0.87109375, + "learning_rate": 3.192746344704965e-05, + "loss": 0.8092, + "step": 4954 + }, + { + "epoch": 0.21724313853659313, + "grad_norm": 0.796875, + "learning_rate": 3.1924091100867946e-05, + "loss": 0.9007, + "step": 4955 + }, + { + "epoch": 0.21728698175325037, + "grad_norm": 0.85546875, + "learning_rate": 3.192071889896924e-05, + "loss": 0.7398, + "step": 4956 + }, + { + "epoch": 0.2173308249699076, + "grad_norm": 0.8203125, + "learning_rate": 3.1917346841360665e-05, + "loss": 0.8413, + "step": 4957 + }, + { + "epoch": 0.21737466818656484, + "grad_norm": 0.7890625, + "learning_rate": 3.1913974928049394e-05, + "loss": 0.7836, + "step": 4958 + }, + { + "epoch": 0.21741851140322208, + "grad_norm": 0.78125, + "learning_rate": 3.191060315904256e-05, + "loss": 0.7507, + "step": 4959 + }, + { + "epoch": 0.2174623546198793, + "grad_norm": 0.875, + "learning_rate": 3.1907231534347316e-05, + "loss": 0.7654, + "step": 4960 + }, + { + "epoch": 0.21750619783653655, + "grad_norm": 1.015625, + "learning_rate": 3.190386005397079e-05, + "loss": 0.8097, + "step": 4961 + }, + { + "epoch": 0.21755004105319378, + "grad_norm": 0.97265625, + "learning_rate": 3.190048871792012e-05, + "loss": 0.8356, + "step": 4962 + }, + { + "epoch": 0.21759388426985102, + "grad_norm": 0.828125, + "learning_rate": 3.189711752620249e-05, + "loss": 0.7744, + "step": 4963 + }, + { + "epoch": 0.21763772748650825, + "grad_norm": 0.92578125, + "learning_rate": 3.189374647882504e-05, + "loss": 0.7938, + "step": 4964 + }, + { + "epoch": 0.2176815707031655, + "grad_norm": 0.73828125, + "learning_rate": 3.18903755757949e-05, + "loss": 0.7687, + "step": 4965 + }, + { + "epoch": 0.21772541391982272, + "grad_norm": 0.89453125, + "learning_rate": 3.188700481711921e-05, + "loss": 0.9334, + "step": 4966 + }, + { + "epoch": 0.21776925713647996, + "grad_norm": 0.70703125, + "learning_rate": 3.188363420280509e-05, + "loss": 0.6962, + "step": 4967 + }, + { + "epoch": 0.21781310035313717, + "grad_norm": 0.8203125, + "learning_rate": 3.188026373285975e-05, + "loss": 0.6986, + "step": 4968 + }, + { + "epoch": 0.2178569435697944, + "grad_norm": 0.8203125, + "learning_rate": 3.18768934072903e-05, + "loss": 0.8297, + "step": 4969 + }, + { + "epoch": 0.21790078678645164, + "grad_norm": 0.76953125, + "learning_rate": 3.187352322610387e-05, + "loss": 0.7086, + "step": 4970 + }, + { + "epoch": 0.21794463000310887, + "grad_norm": 1.2265625, + "learning_rate": 3.1870153189307625e-05, + "loss": 0.8975, + "step": 4971 + }, + { + "epoch": 0.2179884732197661, + "grad_norm": 0.78125, + "learning_rate": 3.186678329690869e-05, + "loss": 0.8523, + "step": 4972 + }, + { + "epoch": 0.21803231643642335, + "grad_norm": 0.83984375, + "learning_rate": 3.186341354891418e-05, + "loss": 0.7462, + "step": 4973 + }, + { + "epoch": 0.21807615965308058, + "grad_norm": 0.765625, + "learning_rate": 3.186004394533131e-05, + "loss": 0.6756, + "step": 4974 + }, + { + "epoch": 0.21812000286973782, + "grad_norm": 0.78125, + "learning_rate": 3.185667448616717e-05, + "loss": 0.7246, + "step": 4975 + }, + { + "epoch": 0.21816384608639505, + "grad_norm": 1.03125, + "learning_rate": 3.185330517142893e-05, + "loss": 0.7977, + "step": 4976 + }, + { + "epoch": 0.2182076893030523, + "grad_norm": 0.80859375, + "learning_rate": 3.18499360011237e-05, + "loss": 0.704, + "step": 4977 + }, + { + "epoch": 0.21825153251970952, + "grad_norm": 0.953125, + "learning_rate": 3.1846566975258605e-05, + "loss": 0.8796, + "step": 4978 + }, + { + "epoch": 0.21829537573636676, + "grad_norm": 0.765625, + "learning_rate": 3.184319809384085e-05, + "loss": 0.789, + "step": 4979 + }, + { + "epoch": 0.218339218953024, + "grad_norm": 0.74609375, + "learning_rate": 3.183982935687755e-05, + "loss": 0.712, + "step": 4980 + }, + { + "epoch": 0.21838306216968123, + "grad_norm": 0.75, + "learning_rate": 3.1836460764375816e-05, + "loss": 0.7405, + "step": 4981 + }, + { + "epoch": 0.21842690538633847, + "grad_norm": 0.8671875, + "learning_rate": 3.183309231634283e-05, + "loss": 0.8119, + "step": 4982 + }, + { + "epoch": 0.21847074860299567, + "grad_norm": 0.80859375, + "learning_rate": 3.182972401278567e-05, + "loss": 0.8154, + "step": 4983 + }, + { + "epoch": 0.2185145918196529, + "grad_norm": 0.81640625, + "learning_rate": 3.182635585371153e-05, + "loss": 0.7595, + "step": 4984 + }, + { + "epoch": 0.21855843503631014, + "grad_norm": 0.7734375, + "learning_rate": 3.182298783912755e-05, + "loss": 0.7187, + "step": 4985 + }, + { + "epoch": 0.21860227825296738, + "grad_norm": 0.8671875, + "learning_rate": 3.181961996904085e-05, + "loss": 0.7594, + "step": 4986 + }, + { + "epoch": 0.21864612146962462, + "grad_norm": 0.8671875, + "learning_rate": 3.181625224345856e-05, + "loss": 0.8075, + "step": 4987 + }, + { + "epoch": 0.21868996468628185, + "grad_norm": 0.8359375, + "learning_rate": 3.1812884662387786e-05, + "loss": 0.8658, + "step": 4988 + }, + { + "epoch": 0.2187338079029391, + "grad_norm": 0.7734375, + "learning_rate": 3.180951722583575e-05, + "loss": 0.8111, + "step": 4989 + }, + { + "epoch": 0.21877765111959632, + "grad_norm": 0.84765625, + "learning_rate": 3.180614993380955e-05, + "loss": 0.7962, + "step": 4990 + }, + { + "epoch": 0.21882149433625356, + "grad_norm": 0.8125, + "learning_rate": 3.180278278631631e-05, + "loss": 0.864, + "step": 4991 + }, + { + "epoch": 0.2188653375529108, + "grad_norm": 0.8203125, + "learning_rate": 3.1799415783363175e-05, + "loss": 0.8265, + "step": 4992 + }, + { + "epoch": 0.21890918076956803, + "grad_norm": 0.86328125, + "learning_rate": 3.1796048924957244e-05, + "loss": 0.8413, + "step": 4993 + }, + { + "epoch": 0.21895302398622526, + "grad_norm": 0.8203125, + "learning_rate": 3.1792682211105725e-05, + "loss": 0.824, + "step": 4994 + }, + { + "epoch": 0.2189968672028825, + "grad_norm": 0.76953125, + "learning_rate": 3.178931564181572e-05, + "loss": 0.8222, + "step": 4995 + }, + { + "epoch": 0.21904071041953974, + "grad_norm": 0.8515625, + "learning_rate": 3.178594921709436e-05, + "loss": 0.7475, + "step": 4996 + }, + { + "epoch": 0.21908455363619697, + "grad_norm": 0.87109375, + "learning_rate": 3.178258293694878e-05, + "loss": 0.7142, + "step": 4997 + }, + { + "epoch": 0.2191283968528542, + "grad_norm": 1.1171875, + "learning_rate": 3.177921680138611e-05, + "loss": 0.8744, + "step": 4998 + }, + { + "epoch": 0.21917224006951141, + "grad_norm": 1.1640625, + "learning_rate": 3.17758508104135e-05, + "loss": 0.8037, + "step": 4999 + }, + { + "epoch": 0.21921608328616865, + "grad_norm": 0.83984375, + "learning_rate": 3.177248496403804e-05, + "loss": 0.7991, + "step": 5000 + }, + { + "epoch": 0.21921608328616865, + "eval_loss": 0.8009320497512817, + "eval_runtime": 304.8085, + "eval_samples_per_second": 32.807, + "eval_steps_per_second": 0.686, + "step": 5000 + }, + { + "epoch": 0.21925992650282589, + "grad_norm": 0.8046875, + "learning_rate": 3.176911926226693e-05, + "loss": 0.8011, + "step": 5001 + }, + { + "epoch": 0.21930376971948312, + "grad_norm": 0.828125, + "learning_rate": 3.176575370510727e-05, + "loss": 0.7755, + "step": 5002 + }, + { + "epoch": 0.21934761293614036, + "grad_norm": 0.90625, + "learning_rate": 3.17623882925662e-05, + "loss": 0.88, + "step": 5003 + }, + { + "epoch": 0.2193914561527976, + "grad_norm": 0.90234375, + "learning_rate": 3.175902302465085e-05, + "loss": 0.8548, + "step": 5004 + }, + { + "epoch": 0.21943529936945483, + "grad_norm": 0.80859375, + "learning_rate": 3.175565790136834e-05, + "loss": 0.8823, + "step": 5005 + }, + { + "epoch": 0.21947914258611206, + "grad_norm": 0.8671875, + "learning_rate": 3.1752292922725826e-05, + "loss": 0.906, + "step": 5006 + }, + { + "epoch": 0.2195229858027693, + "grad_norm": 0.8828125, + "learning_rate": 3.1748928088730415e-05, + "loss": 0.8637, + "step": 5007 + }, + { + "epoch": 0.21956682901942653, + "grad_norm": 0.9375, + "learning_rate": 3.174556339938922e-05, + "loss": 0.8252, + "step": 5008 + }, + { + "epoch": 0.21961067223608377, + "grad_norm": 0.90625, + "learning_rate": 3.1742198854709434e-05, + "loss": 0.9246, + "step": 5009 + }, + { + "epoch": 0.219654515452741, + "grad_norm": 0.8984375, + "learning_rate": 3.173883445469815e-05, + "loss": 0.8451, + "step": 5010 + }, + { + "epoch": 0.21969835866939824, + "grad_norm": 0.85546875, + "learning_rate": 3.173547019936252e-05, + "loss": 0.6724, + "step": 5011 + }, + { + "epoch": 0.21974220188605548, + "grad_norm": 0.796875, + "learning_rate": 3.173210608870964e-05, + "loss": 0.6935, + "step": 5012 + }, + { + "epoch": 0.2197860451027127, + "grad_norm": 0.8203125, + "learning_rate": 3.172874212274665e-05, + "loss": 0.7319, + "step": 5013 + }, + { + "epoch": 0.21982988831936992, + "grad_norm": 0.90234375, + "learning_rate": 3.172537830148071e-05, + "loss": 0.8122, + "step": 5014 + }, + { + "epoch": 0.21987373153602716, + "grad_norm": 0.8046875, + "learning_rate": 3.172201462491893e-05, + "loss": 0.8252, + "step": 5015 + }, + { + "epoch": 0.2199175747526844, + "grad_norm": 0.8828125, + "learning_rate": 3.171865109306843e-05, + "loss": 0.8896, + "step": 5016 + }, + { + "epoch": 0.21996141796934163, + "grad_norm": 0.80859375, + "learning_rate": 3.171528770593635e-05, + "loss": 0.8486, + "step": 5017 + }, + { + "epoch": 0.22000526118599886, + "grad_norm": 0.8671875, + "learning_rate": 3.1711924463529784e-05, + "loss": 0.7665, + "step": 5018 + }, + { + "epoch": 0.2200491044026561, + "grad_norm": 0.96484375, + "learning_rate": 3.170856136585593e-05, + "loss": 0.8329, + "step": 5019 + }, + { + "epoch": 0.22009294761931333, + "grad_norm": 0.99609375, + "learning_rate": 3.1705198412921864e-05, + "loss": 0.7131, + "step": 5020 + }, + { + "epoch": 0.22013679083597057, + "grad_norm": 0.796875, + "learning_rate": 3.170183560473473e-05, + "loss": 0.8031, + "step": 5021 + }, + { + "epoch": 0.2201806340526278, + "grad_norm": 0.82421875, + "learning_rate": 3.169847294130166e-05, + "loss": 0.7632, + "step": 5022 + }, + { + "epoch": 0.22022447726928504, + "grad_norm": 0.890625, + "learning_rate": 3.1695110422629727e-05, + "loss": 0.9261, + "step": 5023 + }, + { + "epoch": 0.22026832048594228, + "grad_norm": 0.8125, + "learning_rate": 3.1691748048726144e-05, + "loss": 0.8748, + "step": 5024 + }, + { + "epoch": 0.2203121637025995, + "grad_norm": 0.78515625, + "learning_rate": 3.1688385819597996e-05, + "loss": 0.7118, + "step": 5025 + }, + { + "epoch": 0.22035600691925675, + "grad_norm": 0.8203125, + "learning_rate": 3.16850237352524e-05, + "loss": 0.8157, + "step": 5026 + }, + { + "epoch": 0.22039985013591398, + "grad_norm": 0.9609375, + "learning_rate": 3.1681661795696515e-05, + "loss": 0.7603, + "step": 5027 + }, + { + "epoch": 0.22044369335257122, + "grad_norm": 0.84375, + "learning_rate": 3.1678300000937386e-05, + "loss": 0.8174, + "step": 5028 + }, + { + "epoch": 0.22048753656922843, + "grad_norm": 0.75390625, + "learning_rate": 3.1674938350982245e-05, + "loss": 0.6775, + "step": 5029 + }, + { + "epoch": 0.22053137978588566, + "grad_norm": 0.8359375, + "learning_rate": 3.167157684583816e-05, + "loss": 0.6779, + "step": 5030 + }, + { + "epoch": 0.2205752230025429, + "grad_norm": 0.890625, + "learning_rate": 3.166821548551226e-05, + "loss": 0.83, + "step": 5031 + }, + { + "epoch": 0.22061906621920013, + "grad_norm": 0.8046875, + "learning_rate": 3.166485427001168e-05, + "loss": 0.7422, + "step": 5032 + }, + { + "epoch": 0.22066290943585737, + "grad_norm": 0.88671875, + "learning_rate": 3.1661493199343494e-05, + "loss": 0.9577, + "step": 5033 + }, + { + "epoch": 0.2207067526525146, + "grad_norm": 0.77734375, + "learning_rate": 3.1658132273514896e-05, + "loss": 0.7367, + "step": 5034 + }, + { + "epoch": 0.22075059586917184, + "grad_norm": 0.765625, + "learning_rate": 3.1654771492532984e-05, + "loss": 0.7724, + "step": 5035 + }, + { + "epoch": 0.22079443908582908, + "grad_norm": 0.8203125, + "learning_rate": 3.1651410856404886e-05, + "loss": 0.7591, + "step": 5036 + }, + { + "epoch": 0.2208382823024863, + "grad_norm": 0.8125, + "learning_rate": 3.164805036513772e-05, + "loss": 0.7719, + "step": 5037 + }, + { + "epoch": 0.22088212551914355, + "grad_norm": 0.76953125, + "learning_rate": 3.1644690018738556e-05, + "loss": 0.7096, + "step": 5038 + }, + { + "epoch": 0.22092596873580078, + "grad_norm": 0.8515625, + "learning_rate": 3.164132981721461e-05, + "loss": 0.7844, + "step": 5039 + }, + { + "epoch": 0.22096981195245802, + "grad_norm": 0.76171875, + "learning_rate": 3.163796976057294e-05, + "loss": 0.7111, + "step": 5040 + }, + { + "epoch": 0.22101365516911525, + "grad_norm": 0.859375, + "learning_rate": 3.16346098488207e-05, + "loss": 0.7649, + "step": 5041 + }, + { + "epoch": 0.2210574983857725, + "grad_norm": 0.75, + "learning_rate": 3.163125008196499e-05, + "loss": 0.7002, + "step": 5042 + }, + { + "epoch": 0.22110134160242972, + "grad_norm": 0.78515625, + "learning_rate": 3.162789046001294e-05, + "loss": 0.7854, + "step": 5043 + }, + { + "epoch": 0.22114518481908693, + "grad_norm": 0.7734375, + "learning_rate": 3.1624530982971677e-05, + "loss": 0.7075, + "step": 5044 + }, + { + "epoch": 0.22118902803574417, + "grad_norm": 0.78125, + "learning_rate": 3.16211716508483e-05, + "loss": 0.8859, + "step": 5045 + }, + { + "epoch": 0.2212328712524014, + "grad_norm": 0.84765625, + "learning_rate": 3.161781246364995e-05, + "loss": 0.8499, + "step": 5046 + }, + { + "epoch": 0.22127671446905864, + "grad_norm": 0.83984375, + "learning_rate": 3.161445342138374e-05, + "loss": 0.8172, + "step": 5047 + }, + { + "epoch": 0.22132055768571587, + "grad_norm": 0.875, + "learning_rate": 3.161109452405674e-05, + "loss": 0.8312, + "step": 5048 + }, + { + "epoch": 0.2213644009023731, + "grad_norm": 0.76953125, + "learning_rate": 3.160773577167616e-05, + "loss": 0.7498, + "step": 5049 + }, + { + "epoch": 0.22140824411903035, + "grad_norm": 0.90234375, + "learning_rate": 3.1604377164249076e-05, + "loss": 0.8887, + "step": 5050 + }, + { + "epoch": 0.22145208733568758, + "grad_norm": 1.03125, + "learning_rate": 3.1601018701782606e-05, + "loss": 0.9085, + "step": 5051 + }, + { + "epoch": 0.22149593055234482, + "grad_norm": 0.82421875, + "learning_rate": 3.1597660384283854e-05, + "loss": 0.7499, + "step": 5052 + }, + { + "epoch": 0.22153977376900205, + "grad_norm": 2.328125, + "learning_rate": 3.1594302211759926e-05, + "loss": 0.917, + "step": 5053 + }, + { + "epoch": 0.2215836169856593, + "grad_norm": 0.78515625, + "learning_rate": 3.159094418421801e-05, + "loss": 0.7132, + "step": 5054 + }, + { + "epoch": 0.22162746020231652, + "grad_norm": 0.9140625, + "learning_rate": 3.1587586301665164e-05, + "loss": 0.7181, + "step": 5055 + }, + { + "epoch": 0.22167130341897376, + "grad_norm": 0.80078125, + "learning_rate": 3.158422856410852e-05, + "loss": 0.7312, + "step": 5056 + }, + { + "epoch": 0.221715146635631, + "grad_norm": 0.81640625, + "learning_rate": 3.15808709715552e-05, + "loss": 0.7967, + "step": 5057 + }, + { + "epoch": 0.22175898985228823, + "grad_norm": 0.8515625, + "learning_rate": 3.157751352401227e-05, + "loss": 0.9043, + "step": 5058 + }, + { + "epoch": 0.22180283306894544, + "grad_norm": 0.765625, + "learning_rate": 3.1574156221486925e-05, + "loss": 0.7096, + "step": 5059 + }, + { + "epoch": 0.22184667628560267, + "grad_norm": 0.8046875, + "learning_rate": 3.1570799063986255e-05, + "loss": 0.7687, + "step": 5060 + }, + { + "epoch": 0.2218905195022599, + "grad_norm": 0.859375, + "learning_rate": 3.156744205151736e-05, + "loss": 0.8133, + "step": 5061 + }, + { + "epoch": 0.22193436271891714, + "grad_norm": 0.921875, + "learning_rate": 3.1564085184087355e-05, + "loss": 0.8475, + "step": 5062 + }, + { + "epoch": 0.22197820593557438, + "grad_norm": 0.72265625, + "learning_rate": 3.156072846170333e-05, + "loss": 0.7124, + "step": 5063 + }, + { + "epoch": 0.22202204915223162, + "grad_norm": 0.734375, + "learning_rate": 3.1557371884372454e-05, + "loss": 0.7983, + "step": 5064 + }, + { + "epoch": 0.22206589236888885, + "grad_norm": 0.81640625, + "learning_rate": 3.155401545210183e-05, + "loss": 0.6795, + "step": 5065 + }, + { + "epoch": 0.2221097355855461, + "grad_norm": 0.87109375, + "learning_rate": 3.1550659164898555e-05, + "loss": 0.8432, + "step": 5066 + }, + { + "epoch": 0.22215357880220332, + "grad_norm": 0.96484375, + "learning_rate": 3.154730302276973e-05, + "loss": 0.9081, + "step": 5067 + }, + { + "epoch": 0.22219742201886056, + "grad_norm": 1.2421875, + "learning_rate": 3.1543947025722476e-05, + "loss": 0.8378, + "step": 5068 + }, + { + "epoch": 0.2222412652355178, + "grad_norm": 0.8671875, + "learning_rate": 3.154059117376393e-05, + "loss": 0.7755, + "step": 5069 + }, + { + "epoch": 0.22228510845217503, + "grad_norm": 0.86328125, + "learning_rate": 3.153723546690119e-05, + "loss": 0.781, + "step": 5070 + }, + { + "epoch": 0.22232895166883226, + "grad_norm": 0.78515625, + "learning_rate": 3.1533879905141375e-05, + "loss": 0.7538, + "step": 5071 + }, + { + "epoch": 0.2223727948854895, + "grad_norm": 0.80859375, + "learning_rate": 3.1530524488491576e-05, + "loss": 0.7158, + "step": 5072 + }, + { + "epoch": 0.22241663810214674, + "grad_norm": 0.80859375, + "learning_rate": 3.1527169216958884e-05, + "loss": 0.8112, + "step": 5073 + }, + { + "epoch": 0.22246048131880397, + "grad_norm": 0.87109375, + "learning_rate": 3.152381409055049e-05, + "loss": 0.8722, + "step": 5074 + }, + { + "epoch": 0.22250432453546118, + "grad_norm": 0.8203125, + "learning_rate": 3.152045910927345e-05, + "loss": 0.8491, + "step": 5075 + }, + { + "epoch": 0.22254816775211841, + "grad_norm": 0.8046875, + "learning_rate": 3.1517104273134875e-05, + "loss": 0.6988, + "step": 5076 + }, + { + "epoch": 0.22259201096877565, + "grad_norm": 0.84765625, + "learning_rate": 3.1513749582141896e-05, + "loss": 0.863, + "step": 5077 + }, + { + "epoch": 0.22263585418543289, + "grad_norm": 0.765625, + "learning_rate": 3.1510395036301564e-05, + "loss": 0.8229, + "step": 5078 + }, + { + "epoch": 0.22267969740209012, + "grad_norm": 0.79296875, + "learning_rate": 3.150704063562109e-05, + "loss": 0.7378, + "step": 5079 + }, + { + "epoch": 0.22272354061874736, + "grad_norm": 0.84765625, + "learning_rate": 3.150368638010751e-05, + "loss": 0.7159, + "step": 5080 + }, + { + "epoch": 0.2227673838354046, + "grad_norm": 0.96484375, + "learning_rate": 3.1500332269767954e-05, + "loss": 0.8332, + "step": 5081 + }, + { + "epoch": 0.22281122705206183, + "grad_norm": 0.8359375, + "learning_rate": 3.149697830460953e-05, + "loss": 0.7545, + "step": 5082 + }, + { + "epoch": 0.22285507026871906, + "grad_norm": 0.83984375, + "learning_rate": 3.149362448463934e-05, + "loss": 0.7651, + "step": 5083 + }, + { + "epoch": 0.2228989134853763, + "grad_norm": 0.94140625, + "learning_rate": 3.149027080986451e-05, + "loss": 0.7714, + "step": 5084 + }, + { + "epoch": 0.22294275670203353, + "grad_norm": 0.87890625, + "learning_rate": 3.148691728029209e-05, + "loss": 0.8493, + "step": 5085 + }, + { + "epoch": 0.22298659991869077, + "grad_norm": 0.83203125, + "learning_rate": 3.148356389592927e-05, + "loss": 0.8548, + "step": 5086 + }, + { + "epoch": 0.223030443135348, + "grad_norm": 0.796875, + "learning_rate": 3.1480210656783125e-05, + "loss": 0.8281, + "step": 5087 + }, + { + "epoch": 0.22307428635200524, + "grad_norm": 0.8125, + "learning_rate": 3.147685756286075e-05, + "loss": 0.8982, + "step": 5088 + }, + { + "epoch": 0.22311812956866248, + "grad_norm": 0.78515625, + "learning_rate": 3.147350461416927e-05, + "loss": 0.912, + "step": 5089 + }, + { + "epoch": 0.22316197278531968, + "grad_norm": 0.76953125, + "learning_rate": 3.147015181071577e-05, + "loss": 0.7517, + "step": 5090 + }, + { + "epoch": 0.22320581600197692, + "grad_norm": 0.77734375, + "learning_rate": 3.1466799152507366e-05, + "loss": 0.7541, + "step": 5091 + }, + { + "epoch": 0.22324965921863416, + "grad_norm": 0.82421875, + "learning_rate": 3.146344663955116e-05, + "loss": 0.8278, + "step": 5092 + }, + { + "epoch": 0.2232935024352914, + "grad_norm": 0.8828125, + "learning_rate": 3.146009427185423e-05, + "loss": 0.8807, + "step": 5093 + }, + { + "epoch": 0.22333734565194863, + "grad_norm": 0.765625, + "learning_rate": 3.145674204942376e-05, + "loss": 0.7435, + "step": 5094 + }, + { + "epoch": 0.22338118886860586, + "grad_norm": 0.87109375, + "learning_rate": 3.14533899722668e-05, + "loss": 0.7796, + "step": 5095 + }, + { + "epoch": 0.2234250320852631, + "grad_norm": 0.91015625, + "learning_rate": 3.1450038040390464e-05, + "loss": 0.8334, + "step": 5096 + }, + { + "epoch": 0.22346887530192033, + "grad_norm": 0.83203125, + "learning_rate": 3.144668625380185e-05, + "loss": 0.7871, + "step": 5097 + }, + { + "epoch": 0.22351271851857757, + "grad_norm": 0.828125, + "learning_rate": 3.144333461250803e-05, + "loss": 0.7659, + "step": 5098 + }, + { + "epoch": 0.2235565617352348, + "grad_norm": 0.81640625, + "learning_rate": 3.143998311651618e-05, + "loss": 0.769, + "step": 5099 + }, + { + "epoch": 0.22360040495189204, + "grad_norm": 0.734375, + "learning_rate": 3.143663176583337e-05, + "loss": 0.7534, + "step": 5100 + }, + { + "epoch": 0.22364424816854928, + "grad_norm": 0.86328125, + "learning_rate": 3.143328056046669e-05, + "loss": 0.8475, + "step": 5101 + }, + { + "epoch": 0.2236880913852065, + "grad_norm": 0.85546875, + "learning_rate": 3.142992950042326e-05, + "loss": 0.9182, + "step": 5102 + }, + { + "epoch": 0.22373193460186375, + "grad_norm": 0.8359375, + "learning_rate": 3.1426578585710145e-05, + "loss": 0.8831, + "step": 5103 + }, + { + "epoch": 0.22377577781852098, + "grad_norm": 0.859375, + "learning_rate": 3.142322781633451e-05, + "loss": 0.8576, + "step": 5104 + }, + { + "epoch": 0.2238196210351782, + "grad_norm": 0.859375, + "learning_rate": 3.141987719230341e-05, + "loss": 0.8093, + "step": 5105 + }, + { + "epoch": 0.22386346425183543, + "grad_norm": 0.859375, + "learning_rate": 3.141652671362398e-05, + "loss": 0.7518, + "step": 5106 + }, + { + "epoch": 0.22390730746849266, + "grad_norm": 0.7109375, + "learning_rate": 3.1413176380303286e-05, + "loss": 0.6166, + "step": 5107 + }, + { + "epoch": 0.2239511506851499, + "grad_norm": 0.87890625, + "learning_rate": 3.1409826192348426e-05, + "loss": 0.753, + "step": 5108 + }, + { + "epoch": 0.22399499390180713, + "grad_norm": 0.89453125, + "learning_rate": 3.140647614976655e-05, + "loss": 0.7387, + "step": 5109 + }, + { + "epoch": 0.22403883711846437, + "grad_norm": 0.859375, + "learning_rate": 3.1403126252564715e-05, + "loss": 0.798, + "step": 5110 + }, + { + "epoch": 0.2240826803351216, + "grad_norm": 0.83984375, + "learning_rate": 3.139977650075005e-05, + "loss": 0.8162, + "step": 5111 + }, + { + "epoch": 0.22412652355177884, + "grad_norm": 0.8046875, + "learning_rate": 3.139642689432962e-05, + "loss": 0.7836, + "step": 5112 + }, + { + "epoch": 0.22417036676843607, + "grad_norm": 0.83984375, + "learning_rate": 3.1393077433310546e-05, + "loss": 0.8271, + "step": 5113 + }, + { + "epoch": 0.2242142099850933, + "grad_norm": 0.87109375, + "learning_rate": 3.13897281176999e-05, + "loss": 0.6844, + "step": 5114 + }, + { + "epoch": 0.22425805320175055, + "grad_norm": 0.7578125, + "learning_rate": 3.1386378947504826e-05, + "loss": 0.6875, + "step": 5115 + }, + { + "epoch": 0.22430189641840778, + "grad_norm": 0.8046875, + "learning_rate": 3.138302992273241e-05, + "loss": 0.8487, + "step": 5116 + }, + { + "epoch": 0.22434573963506502, + "grad_norm": 0.87109375, + "learning_rate": 3.137968104338973e-05, + "loss": 0.8607, + "step": 5117 + }, + { + "epoch": 0.22438958285172225, + "grad_norm": 0.80078125, + "learning_rate": 3.1376332309483905e-05, + "loss": 0.7231, + "step": 5118 + }, + { + "epoch": 0.2244334260683795, + "grad_norm": 0.85546875, + "learning_rate": 3.1372983721021976e-05, + "loss": 0.8143, + "step": 5119 + }, + { + "epoch": 0.2244772692850367, + "grad_norm": 0.84375, + "learning_rate": 3.136963527801112e-05, + "loss": 0.7227, + "step": 5120 + }, + { + "epoch": 0.22452111250169393, + "grad_norm": 0.828125, + "learning_rate": 3.136628698045842e-05, + "loss": 0.7973, + "step": 5121 + }, + { + "epoch": 0.22456495571835117, + "grad_norm": 0.78515625, + "learning_rate": 3.136293882837092e-05, + "loss": 0.8367, + "step": 5122 + }, + { + "epoch": 0.2246087989350084, + "grad_norm": 0.78515625, + "learning_rate": 3.1359590821755766e-05, + "loss": 0.8698, + "step": 5123 + }, + { + "epoch": 0.22465264215166564, + "grad_norm": 0.796875, + "learning_rate": 3.135624296062001e-05, + "loss": 0.8273, + "step": 5124 + }, + { + "epoch": 0.22469648536832287, + "grad_norm": 0.7734375, + "learning_rate": 3.1352895244970794e-05, + "loss": 0.7454, + "step": 5125 + }, + { + "epoch": 0.2247403285849801, + "grad_norm": 0.8828125, + "learning_rate": 3.1349547674815195e-05, + "loss": 0.9776, + "step": 5126 + }, + { + "epoch": 0.22478417180163734, + "grad_norm": 0.90625, + "learning_rate": 3.1346200250160307e-05, + "loss": 0.9115, + "step": 5127 + }, + { + "epoch": 0.22482801501829458, + "grad_norm": 0.78515625, + "learning_rate": 3.1342852971013224e-05, + "loss": 0.762, + "step": 5128 + }, + { + "epoch": 0.22487185823495182, + "grad_norm": 0.79296875, + "learning_rate": 3.1339505837381035e-05, + "loss": 0.7957, + "step": 5129 + }, + { + "epoch": 0.22491570145160905, + "grad_norm": 0.9609375, + "learning_rate": 3.133615884927085e-05, + "loss": 0.8515, + "step": 5130 + }, + { + "epoch": 0.2249595446682663, + "grad_norm": 0.8515625, + "learning_rate": 3.1332812006689736e-05, + "loss": 0.8974, + "step": 5131 + }, + { + "epoch": 0.22500338788492352, + "grad_norm": 0.8671875, + "learning_rate": 3.132946530964481e-05, + "loss": 0.7325, + "step": 5132 + }, + { + "epoch": 0.22504723110158076, + "grad_norm": 0.828125, + "learning_rate": 3.132611875814316e-05, + "loss": 0.9107, + "step": 5133 + }, + { + "epoch": 0.225091074318238, + "grad_norm": 0.80859375, + "learning_rate": 3.132277235219184e-05, + "loss": 0.8033, + "step": 5134 + }, + { + "epoch": 0.2251349175348952, + "grad_norm": 0.79296875, + "learning_rate": 3.1319426091798024e-05, + "loss": 0.7134, + "step": 5135 + }, + { + "epoch": 0.22517876075155244, + "grad_norm": 0.88671875, + "learning_rate": 3.1316079976968736e-05, + "loss": 0.9195, + "step": 5136 + }, + { + "epoch": 0.22522260396820967, + "grad_norm": 0.82421875, + "learning_rate": 3.1312734007711106e-05, + "loss": 0.8402, + "step": 5137 + }, + { + "epoch": 0.2252664471848669, + "grad_norm": 0.75390625, + "learning_rate": 3.130938818403221e-05, + "loss": 0.7102, + "step": 5138 + }, + { + "epoch": 0.22531029040152414, + "grad_norm": 0.86328125, + "learning_rate": 3.1306042505939105e-05, + "loss": 0.8257, + "step": 5139 + }, + { + "epoch": 0.22535413361818138, + "grad_norm": 0.8125, + "learning_rate": 3.130269697343895e-05, + "loss": 0.7672, + "step": 5140 + }, + { + "epoch": 0.22539797683483861, + "grad_norm": 0.79296875, + "learning_rate": 3.129935158653879e-05, + "loss": 0.8166, + "step": 5141 + }, + { + "epoch": 0.22544182005149585, + "grad_norm": 0.8046875, + "learning_rate": 3.129600634524574e-05, + "loss": 0.8508, + "step": 5142 + }, + { + "epoch": 0.22548566326815309, + "grad_norm": 0.9921875, + "learning_rate": 3.129266124956688e-05, + "loss": 0.8438, + "step": 5143 + }, + { + "epoch": 0.22552950648481032, + "grad_norm": 0.875, + "learning_rate": 3.1289316299509254e-05, + "loss": 0.9326, + "step": 5144 + }, + { + "epoch": 0.22557334970146756, + "grad_norm": 0.828125, + "learning_rate": 3.1285971495080037e-05, + "loss": 0.9169, + "step": 5145 + }, + { + "epoch": 0.2256171929181248, + "grad_norm": 0.875, + "learning_rate": 3.1282626836286276e-05, + "loss": 0.6661, + "step": 5146 + }, + { + "epoch": 0.22566103613478203, + "grad_norm": 0.75390625, + "learning_rate": 3.127928232313505e-05, + "loss": 0.6637, + "step": 5147 + }, + { + "epoch": 0.22570487935143926, + "grad_norm": 0.83203125, + "learning_rate": 3.127593795563346e-05, + "loss": 0.9782, + "step": 5148 + }, + { + "epoch": 0.2257487225680965, + "grad_norm": 0.890625, + "learning_rate": 3.127259373378857e-05, + "loss": 0.7386, + "step": 5149 + }, + { + "epoch": 0.2257925657847537, + "grad_norm": 0.80078125, + "learning_rate": 3.1269249657607516e-05, + "loss": 0.7518, + "step": 5150 + }, + { + "epoch": 0.22583640900141094, + "grad_norm": 0.828125, + "learning_rate": 3.126590572709735e-05, + "loss": 0.7613, + "step": 5151 + }, + { + "epoch": 0.22588025221806818, + "grad_norm": 0.89453125, + "learning_rate": 3.126256194226518e-05, + "loss": 0.8197, + "step": 5152 + }, + { + "epoch": 0.2259240954347254, + "grad_norm": 0.77734375, + "learning_rate": 3.1259218303118075e-05, + "loss": 0.8065, + "step": 5153 + }, + { + "epoch": 0.22596793865138265, + "grad_norm": 0.8671875, + "learning_rate": 3.1255874809663096e-05, + "loss": 0.9097, + "step": 5154 + }, + { + "epoch": 0.22601178186803988, + "grad_norm": 0.85546875, + "learning_rate": 3.12525314619074e-05, + "loss": 0.7479, + "step": 5155 + }, + { + "epoch": 0.22605562508469712, + "grad_norm": 0.7578125, + "learning_rate": 3.124918825985802e-05, + "loss": 0.7142, + "step": 5156 + }, + { + "epoch": 0.22609946830135436, + "grad_norm": 1.015625, + "learning_rate": 3.124584520352206e-05, + "loss": 0.7621, + "step": 5157 + }, + { + "epoch": 0.2261433115180116, + "grad_norm": 1.015625, + "learning_rate": 3.1242502292906605e-05, + "loss": 0.8331, + "step": 5158 + }, + { + "epoch": 0.22618715473466883, + "grad_norm": 0.8828125, + "learning_rate": 3.1239159528018715e-05, + "loss": 0.8039, + "step": 5159 + }, + { + "epoch": 0.22623099795132606, + "grad_norm": 0.921875, + "learning_rate": 3.123581690886551e-05, + "loss": 0.8474, + "step": 5160 + }, + { + "epoch": 0.2262748411679833, + "grad_norm": 0.76953125, + "learning_rate": 3.123247443545407e-05, + "loss": 0.6792, + "step": 5161 + }, + { + "epoch": 0.22631868438464053, + "grad_norm": 0.80078125, + "learning_rate": 3.122913210779147e-05, + "loss": 0.7311, + "step": 5162 + }, + { + "epoch": 0.22636252760129777, + "grad_norm": 0.8671875, + "learning_rate": 3.1225789925884794e-05, + "loss": 0.8095, + "step": 5163 + }, + { + "epoch": 0.226406370817955, + "grad_norm": 0.84765625, + "learning_rate": 3.122244788974109e-05, + "loss": 0.7667, + "step": 5164 + }, + { + "epoch": 0.22645021403461224, + "grad_norm": 0.82421875, + "learning_rate": 3.12191059993675e-05, + "loss": 0.7772, + "step": 5165 + }, + { + "epoch": 0.22649405725126945, + "grad_norm": 0.7578125, + "learning_rate": 3.121576425477111e-05, + "loss": 0.8297, + "step": 5166 + }, + { + "epoch": 0.22653790046792668, + "grad_norm": 0.79296875, + "learning_rate": 3.121242265595895e-05, + "loss": 0.7606, + "step": 5167 + }, + { + "epoch": 0.22658174368458392, + "grad_norm": 0.9296875, + "learning_rate": 3.120908120293814e-05, + "loss": 0.9436, + "step": 5168 + }, + { + "epoch": 0.22662558690124116, + "grad_norm": 0.89453125, + "learning_rate": 3.1205739895715744e-05, + "loss": 0.9296, + "step": 5169 + }, + { + "epoch": 0.2266694301178984, + "grad_norm": 0.8984375, + "learning_rate": 3.1202398734298865e-05, + "loss": 0.797, + "step": 5170 + }, + { + "epoch": 0.22671327333455563, + "grad_norm": 0.80859375, + "learning_rate": 3.1199057718694527e-05, + "loss": 0.7785, + "step": 5171 + }, + { + "epoch": 0.22675711655121286, + "grad_norm": 0.796875, + "learning_rate": 3.1195716848909885e-05, + "loss": 0.777, + "step": 5172 + }, + { + "epoch": 0.2268009597678701, + "grad_norm": 0.85546875, + "learning_rate": 3.119237612495198e-05, + "loss": 0.8569, + "step": 5173 + }, + { + "epoch": 0.22684480298452733, + "grad_norm": 0.9375, + "learning_rate": 3.1189035546827914e-05, + "loss": 0.9029, + "step": 5174 + }, + { + "epoch": 0.22688864620118457, + "grad_norm": 0.84375, + "learning_rate": 3.1185695114544755e-05, + "loss": 0.7714, + "step": 5175 + }, + { + "epoch": 0.2269324894178418, + "grad_norm": 0.83203125, + "learning_rate": 3.118235482810957e-05, + "loss": 1.0125, + "step": 5176 + }, + { + "epoch": 0.22697633263449904, + "grad_norm": 0.79296875, + "learning_rate": 3.117901468752946e-05, + "loss": 0.7276, + "step": 5177 + }, + { + "epoch": 0.22702017585115628, + "grad_norm": 0.74609375, + "learning_rate": 3.11756746928115e-05, + "loss": 0.7443, + "step": 5178 + }, + { + "epoch": 0.2270640190678135, + "grad_norm": 0.9375, + "learning_rate": 3.117233484396272e-05, + "loss": 0.7693, + "step": 5179 + }, + { + "epoch": 0.22710786228447075, + "grad_norm": 0.859375, + "learning_rate": 3.1168995140990286e-05, + "loss": 0.8413, + "step": 5180 + }, + { + "epoch": 0.22715170550112795, + "grad_norm": 0.77734375, + "learning_rate": 3.116565558390122e-05, + "loss": 0.7908, + "step": 5181 + }, + { + "epoch": 0.2271955487177852, + "grad_norm": 0.921875, + "learning_rate": 3.116231617270262e-05, + "loss": 0.784, + "step": 5182 + }, + { + "epoch": 0.22723939193444243, + "grad_norm": 0.82421875, + "learning_rate": 3.1158976907401546e-05, + "loss": 0.86, + "step": 5183 + }, + { + "epoch": 0.22728323515109966, + "grad_norm": 0.79296875, + "learning_rate": 3.115563778800506e-05, + "loss": 0.6692, + "step": 5184 + }, + { + "epoch": 0.2273270783677569, + "grad_norm": 0.8515625, + "learning_rate": 3.115229881452031e-05, + "loss": 0.8637, + "step": 5185 + }, + { + "epoch": 0.22737092158441413, + "grad_norm": 0.78125, + "learning_rate": 3.114895998695432e-05, + "loss": 0.7147, + "step": 5186 + }, + { + "epoch": 0.22741476480107137, + "grad_norm": 0.80859375, + "learning_rate": 3.1145621305314164e-05, + "loss": 0.8455, + "step": 5187 + }, + { + "epoch": 0.2274586080177286, + "grad_norm": 0.84375, + "learning_rate": 3.114228276960693e-05, + "loss": 0.9073, + "step": 5188 + }, + { + "epoch": 0.22750245123438584, + "grad_norm": 0.78515625, + "learning_rate": 3.1138944379839675e-05, + "loss": 0.816, + "step": 5189 + }, + { + "epoch": 0.22754629445104307, + "grad_norm": 0.859375, + "learning_rate": 3.1135606136019516e-05, + "loss": 0.8456, + "step": 5190 + }, + { + "epoch": 0.2275901376677003, + "grad_norm": 0.79296875, + "learning_rate": 3.113226803815349e-05, + "loss": 0.8223, + "step": 5191 + }, + { + "epoch": 0.22763398088435755, + "grad_norm": 0.8671875, + "learning_rate": 3.11289300862487e-05, + "loss": 0.8562, + "step": 5192 + }, + { + "epoch": 0.22767782410101478, + "grad_norm": 0.8203125, + "learning_rate": 3.11255922803122e-05, + "loss": 0.8937, + "step": 5193 + }, + { + "epoch": 0.22772166731767202, + "grad_norm": 0.69140625, + "learning_rate": 3.1122254620351046e-05, + "loss": 0.7779, + "step": 5194 + }, + { + "epoch": 0.22776551053432925, + "grad_norm": 0.82421875, + "learning_rate": 3.111891710637236e-05, + "loss": 0.8069, + "step": 5195 + }, + { + "epoch": 0.22780935375098646, + "grad_norm": 0.7734375, + "learning_rate": 3.111557973838319e-05, + "loss": 0.7175, + "step": 5196 + }, + { + "epoch": 0.2278531969676437, + "grad_norm": 0.8046875, + "learning_rate": 3.111224251639062e-05, + "loss": 0.7422, + "step": 5197 + }, + { + "epoch": 0.22789704018430093, + "grad_norm": 0.80078125, + "learning_rate": 3.110890544040171e-05, + "loss": 0.7032, + "step": 5198 + }, + { + "epoch": 0.22794088340095817, + "grad_norm": 0.8125, + "learning_rate": 3.110556851042351e-05, + "loss": 0.8039, + "step": 5199 + }, + { + "epoch": 0.2279847266176154, + "grad_norm": 0.84375, + "learning_rate": 3.110223172646315e-05, + "loss": 0.8218, + "step": 5200 + }, + { + "epoch": 0.22802856983427264, + "grad_norm": 0.8125, + "learning_rate": 3.109889508852767e-05, + "loss": 0.8163, + "step": 5201 + }, + { + "epoch": 0.22807241305092987, + "grad_norm": 0.96484375, + "learning_rate": 3.109555859662414e-05, + "loss": 0.8349, + "step": 5202 + }, + { + "epoch": 0.2281162562675871, + "grad_norm": 0.734375, + "learning_rate": 3.109222225075964e-05, + "loss": 0.7316, + "step": 5203 + }, + { + "epoch": 0.22816009948424434, + "grad_norm": 0.890625, + "learning_rate": 3.1088886050941213e-05, + "loss": 0.84, + "step": 5204 + }, + { + "epoch": 0.22820394270090158, + "grad_norm": 0.88671875, + "learning_rate": 3.108554999717597e-05, + "loss": 0.9279, + "step": 5205 + }, + { + "epoch": 0.22824778591755882, + "grad_norm": 0.8359375, + "learning_rate": 3.1082214089470976e-05, + "loss": 0.7768, + "step": 5206 + }, + { + "epoch": 0.22829162913421605, + "grad_norm": 0.796875, + "learning_rate": 3.1078878327833285e-05, + "loss": 0.8185, + "step": 5207 + }, + { + "epoch": 0.2283354723508733, + "grad_norm": 0.94921875, + "learning_rate": 3.107554271226997e-05, + "loss": 0.739, + "step": 5208 + }, + { + "epoch": 0.22837931556753052, + "grad_norm": 0.828125, + "learning_rate": 3.107220724278808e-05, + "loss": 0.8165, + "step": 5209 + }, + { + "epoch": 0.22842315878418776, + "grad_norm": 0.90625, + "learning_rate": 3.1068871919394725e-05, + "loss": 0.7394, + "step": 5210 + }, + { + "epoch": 0.22846700200084497, + "grad_norm": 0.765625, + "learning_rate": 3.106553674209696e-05, + "loss": 0.8011, + "step": 5211 + }, + { + "epoch": 0.2285108452175022, + "grad_norm": 1.1328125, + "learning_rate": 3.106220171090185e-05, + "loss": 0.8311, + "step": 5212 + }, + { + "epoch": 0.22855468843415944, + "grad_norm": 0.7578125, + "learning_rate": 3.105886682581647e-05, + "loss": 0.6633, + "step": 5213 + }, + { + "epoch": 0.22859853165081667, + "grad_norm": 0.828125, + "learning_rate": 3.105553208684787e-05, + "loss": 0.8214, + "step": 5214 + }, + { + "epoch": 0.2286423748674739, + "grad_norm": 0.953125, + "learning_rate": 3.1052197494003146e-05, + "loss": 0.8143, + "step": 5215 + }, + { + "epoch": 0.22868621808413114, + "grad_norm": 0.8125, + "learning_rate": 3.104886304728933e-05, + "loss": 0.7315, + "step": 5216 + }, + { + "epoch": 0.22873006130078838, + "grad_norm": 0.76953125, + "learning_rate": 3.104552874671351e-05, + "loss": 0.6999, + "step": 5217 + }, + { + "epoch": 0.22877390451744561, + "grad_norm": 0.80859375, + "learning_rate": 3.104219459228277e-05, + "loss": 0.7536, + "step": 5218 + }, + { + "epoch": 0.22881774773410285, + "grad_norm": 1.3046875, + "learning_rate": 3.103886058400411e-05, + "loss": 0.8652, + "step": 5219 + }, + { + "epoch": 0.22886159095076009, + "grad_norm": 0.76953125, + "learning_rate": 3.103552672188468e-05, + "loss": 0.8647, + "step": 5220 + }, + { + "epoch": 0.22890543416741732, + "grad_norm": 0.8046875, + "learning_rate": 3.10321930059315e-05, + "loss": 0.8836, + "step": 5221 + }, + { + "epoch": 0.22894927738407456, + "grad_norm": 0.83984375, + "learning_rate": 3.1028859436151655e-05, + "loss": 1.0156, + "step": 5222 + }, + { + "epoch": 0.2289931206007318, + "grad_norm": 0.7890625, + "learning_rate": 3.1025526012552196e-05, + "loss": 0.7947, + "step": 5223 + }, + { + "epoch": 0.22903696381738903, + "grad_norm": 0.8203125, + "learning_rate": 3.102219273514017e-05, + "loss": 0.736, + "step": 5224 + }, + { + "epoch": 0.22908080703404626, + "grad_norm": 0.8359375, + "learning_rate": 3.101885960392268e-05, + "loss": 0.7119, + "step": 5225 + }, + { + "epoch": 0.22912465025070347, + "grad_norm": 0.81640625, + "learning_rate": 3.1015526618906774e-05, + "loss": 0.9305, + "step": 5226 + }, + { + "epoch": 0.2291684934673607, + "grad_norm": 0.87109375, + "learning_rate": 3.101219378009953e-05, + "loss": 0.91, + "step": 5227 + }, + { + "epoch": 0.22921233668401794, + "grad_norm": 0.765625, + "learning_rate": 3.100886108750799e-05, + "loss": 0.7758, + "step": 5228 + }, + { + "epoch": 0.22925617990067518, + "grad_norm": 0.83984375, + "learning_rate": 3.100552854113919e-05, + "loss": 0.6994, + "step": 5229 + }, + { + "epoch": 0.2293000231173324, + "grad_norm": 0.8125, + "learning_rate": 3.100219614100027e-05, + "loss": 0.832, + "step": 5230 + }, + { + "epoch": 0.22934386633398965, + "grad_norm": 0.8125, + "learning_rate": 3.0998863887098254e-05, + "loss": 0.8466, + "step": 5231 + }, + { + "epoch": 0.22938770955064688, + "grad_norm": 0.80859375, + "learning_rate": 3.099553177944019e-05, + "loss": 0.8159, + "step": 5232 + }, + { + "epoch": 0.22943155276730412, + "grad_norm": 0.71875, + "learning_rate": 3.099219981803316e-05, + "loss": 0.8322, + "step": 5233 + }, + { + "epoch": 0.22947539598396136, + "grad_norm": 0.83203125, + "learning_rate": 3.0988868002884184e-05, + "loss": 0.8337, + "step": 5234 + }, + { + "epoch": 0.2295192392006186, + "grad_norm": 0.91015625, + "learning_rate": 3.098553633400039e-05, + "loss": 0.8528, + "step": 5235 + }, + { + "epoch": 0.22956308241727583, + "grad_norm": 0.85546875, + "learning_rate": 3.0982204811388805e-05, + "loss": 0.7631, + "step": 5236 + }, + { + "epoch": 0.22960692563393306, + "grad_norm": 0.78125, + "learning_rate": 3.09788734350565e-05, + "loss": 0.7802, + "step": 5237 + }, + { + "epoch": 0.2296507688505903, + "grad_norm": 0.859375, + "learning_rate": 3.097554220501052e-05, + "loss": 0.7572, + "step": 5238 + }, + { + "epoch": 0.22969461206724753, + "grad_norm": 0.78125, + "learning_rate": 3.09722111212579e-05, + "loss": 0.8219, + "step": 5239 + }, + { + "epoch": 0.22973845528390477, + "grad_norm": 0.82421875, + "learning_rate": 3.096888018380577e-05, + "loss": 0.8158, + "step": 5240 + }, + { + "epoch": 0.22978229850056198, + "grad_norm": 0.79296875, + "learning_rate": 3.096554939266115e-05, + "loss": 0.7961, + "step": 5241 + }, + { + "epoch": 0.2298261417172192, + "grad_norm": 0.87109375, + "learning_rate": 3.096221874783111e-05, + "loss": 0.9455, + "step": 5242 + }, + { + "epoch": 0.22986998493387645, + "grad_norm": 1.0, + "learning_rate": 3.0958888249322684e-05, + "loss": 0.7773, + "step": 5243 + }, + { + "epoch": 0.22991382815053368, + "grad_norm": 0.83203125, + "learning_rate": 3.095555789714293e-05, + "loss": 0.7668, + "step": 5244 + }, + { + "epoch": 0.22995767136719092, + "grad_norm": 0.76171875, + "learning_rate": 3.095222769129895e-05, + "loss": 0.7214, + "step": 5245 + }, + { + "epoch": 0.23000151458384815, + "grad_norm": 0.828125, + "learning_rate": 3.094889763179778e-05, + "loss": 0.8247, + "step": 5246 + }, + { + "epoch": 0.2300453578005054, + "grad_norm": 0.81640625, + "learning_rate": 3.0945567718646477e-05, + "loss": 0.8677, + "step": 5247 + }, + { + "epoch": 0.23008920101716263, + "grad_norm": 0.84765625, + "learning_rate": 3.094223795185209e-05, + "loss": 0.8624, + "step": 5248 + }, + { + "epoch": 0.23013304423381986, + "grad_norm": 0.8203125, + "learning_rate": 3.0938908331421656e-05, + "loss": 0.8778, + "step": 5249 + }, + { + "epoch": 0.2301768874504771, + "grad_norm": 0.83203125, + "learning_rate": 3.093557885736229e-05, + "loss": 0.8168, + "step": 5250 + }, + { + "epoch": 0.23022073066713433, + "grad_norm": 0.87109375, + "learning_rate": 3.093224952968101e-05, + "loss": 0.8139, + "step": 5251 + }, + { + "epoch": 0.23026457388379157, + "grad_norm": 0.8359375, + "learning_rate": 3.092892034838489e-05, + "loss": 0.7975, + "step": 5252 + }, + { + "epoch": 0.2303084171004488, + "grad_norm": 0.76953125, + "learning_rate": 3.0925591313480974e-05, + "loss": 0.7516, + "step": 5253 + }, + { + "epoch": 0.23035226031710604, + "grad_norm": 0.78125, + "learning_rate": 3.0922262424976314e-05, + "loss": 0.7377, + "step": 5254 + }, + { + "epoch": 0.23039610353376327, + "grad_norm": 0.859375, + "learning_rate": 3.091893368287797e-05, + "loss": 0.8981, + "step": 5255 + }, + { + "epoch": 0.2304399467504205, + "grad_norm": 0.89453125, + "learning_rate": 3.091560508719297e-05, + "loss": 0.7843, + "step": 5256 + }, + { + "epoch": 0.23048378996707772, + "grad_norm": 0.86328125, + "learning_rate": 3.091227663792843e-05, + "loss": 0.7219, + "step": 5257 + }, + { + "epoch": 0.23052763318373495, + "grad_norm": 0.828125, + "learning_rate": 3.090894833509136e-05, + "loss": 0.7652, + "step": 5258 + }, + { + "epoch": 0.2305714764003922, + "grad_norm": 0.81640625, + "learning_rate": 3.090562017868883e-05, + "loss": 0.8426, + "step": 5259 + }, + { + "epoch": 0.23061531961704942, + "grad_norm": 0.8359375, + "learning_rate": 3.0902292168727894e-05, + "loss": 0.7839, + "step": 5260 + }, + { + "epoch": 0.23065916283370666, + "grad_norm": 0.83203125, + "learning_rate": 3.0898964305215595e-05, + "loss": 0.7875, + "step": 5261 + }, + { + "epoch": 0.2307030060503639, + "grad_norm": 0.8359375, + "learning_rate": 3.089563658815899e-05, + "loss": 0.8505, + "step": 5262 + }, + { + "epoch": 0.23074684926702113, + "grad_norm": 0.76953125, + "learning_rate": 3.0892309017565146e-05, + "loss": 0.8302, + "step": 5263 + }, + { + "epoch": 0.23079069248367837, + "grad_norm": 0.71484375, + "learning_rate": 3.088898159344109e-05, + "loss": 0.659, + "step": 5264 + }, + { + "epoch": 0.2308345357003356, + "grad_norm": 0.80859375, + "learning_rate": 3.088565431579387e-05, + "loss": 0.9316, + "step": 5265 + }, + { + "epoch": 0.23087837891699284, + "grad_norm": 0.8984375, + "learning_rate": 3.0882327184630565e-05, + "loss": 0.8267, + "step": 5266 + }, + { + "epoch": 0.23092222213365007, + "grad_norm": 0.81640625, + "learning_rate": 3.087900019995823e-05, + "loss": 0.7382, + "step": 5267 + }, + { + "epoch": 0.2309660653503073, + "grad_norm": 0.8203125, + "learning_rate": 3.0875673361783905e-05, + "loss": 0.8981, + "step": 5268 + }, + { + "epoch": 0.23100990856696454, + "grad_norm": 0.80859375, + "learning_rate": 3.087234667011464e-05, + "loss": 0.7267, + "step": 5269 + }, + { + "epoch": 0.23105375178362178, + "grad_norm": 0.86328125, + "learning_rate": 3.086902012495745e-05, + "loss": 0.8354, + "step": 5270 + }, + { + "epoch": 0.23109759500027902, + "grad_norm": 0.82421875, + "learning_rate": 3.086569372631944e-05, + "loss": 0.7886, + "step": 5271 + }, + { + "epoch": 0.23114143821693622, + "grad_norm": 0.8515625, + "learning_rate": 3.086236747420766e-05, + "loss": 0.7636, + "step": 5272 + }, + { + "epoch": 0.23118528143359346, + "grad_norm": 0.79296875, + "learning_rate": 3.085904136862914e-05, + "loss": 0.6507, + "step": 5273 + }, + { + "epoch": 0.2312291246502507, + "grad_norm": 0.80859375, + "learning_rate": 3.085571540959091e-05, + "loss": 0.8641, + "step": 5274 + }, + { + "epoch": 0.23127296786690793, + "grad_norm": 0.8046875, + "learning_rate": 3.0852389597100026e-05, + "loss": 0.8091, + "step": 5275 + }, + { + "epoch": 0.23131681108356517, + "grad_norm": 0.89453125, + "learning_rate": 3.084906393116358e-05, + "loss": 0.7754, + "step": 5276 + }, + { + "epoch": 0.2313606543002224, + "grad_norm": 0.7109375, + "learning_rate": 3.084573841178858e-05, + "loss": 0.7383, + "step": 5277 + }, + { + "epoch": 0.23140449751687964, + "grad_norm": 0.8046875, + "learning_rate": 3.08424130389821e-05, + "loss": 0.7933, + "step": 5278 + }, + { + "epoch": 0.23144834073353687, + "grad_norm": 16.5, + "learning_rate": 3.083908781275116e-05, + "loss": 0.8047, + "step": 5279 + }, + { + "epoch": 0.2314921839501941, + "grad_norm": 0.828125, + "learning_rate": 3.083576273310279e-05, + "loss": 0.8106, + "step": 5280 + }, + { + "epoch": 0.23153602716685134, + "grad_norm": 0.875, + "learning_rate": 3.08324378000441e-05, + "loss": 0.88, + "step": 5281 + }, + { + "epoch": 0.23157987038350858, + "grad_norm": 0.80859375, + "learning_rate": 3.0829113013582114e-05, + "loss": 0.7375, + "step": 5282 + }, + { + "epoch": 0.23162371360016581, + "grad_norm": 0.78125, + "learning_rate": 3.0825788373723865e-05, + "loss": 0.8901, + "step": 5283 + }, + { + "epoch": 0.23166755681682305, + "grad_norm": 0.87890625, + "learning_rate": 3.0822463880476394e-05, + "loss": 0.8133, + "step": 5284 + }, + { + "epoch": 0.23171140003348029, + "grad_norm": 0.81640625, + "learning_rate": 3.081913953384673e-05, + "loss": 0.8858, + "step": 5285 + }, + { + "epoch": 0.23175524325013752, + "grad_norm": 0.859375, + "learning_rate": 3.081581533384198e-05, + "loss": 0.7775, + "step": 5286 + }, + { + "epoch": 0.23179908646679473, + "grad_norm": 0.9609375, + "learning_rate": 3.0812491280469155e-05, + "loss": 0.7541, + "step": 5287 + }, + { + "epoch": 0.23184292968345196, + "grad_norm": 0.84375, + "learning_rate": 3.08091673737353e-05, + "loss": 0.7168, + "step": 5288 + }, + { + "epoch": 0.2318867729001092, + "grad_norm": 0.8359375, + "learning_rate": 3.080584361364746e-05, + "loss": 0.7667, + "step": 5289 + }, + { + "epoch": 0.23193061611676644, + "grad_norm": 0.9140625, + "learning_rate": 3.080252000021264e-05, + "loss": 0.8284, + "step": 5290 + }, + { + "epoch": 0.23197445933342367, + "grad_norm": 0.875, + "learning_rate": 3.079919653343797e-05, + "loss": 0.8318, + "step": 5291 + }, + { + "epoch": 0.2320183025500809, + "grad_norm": 0.9140625, + "learning_rate": 3.079587321333044e-05, + "loss": 1.0043, + "step": 5292 + }, + { + "epoch": 0.23206214576673814, + "grad_norm": 0.82421875, + "learning_rate": 3.079255003989709e-05, + "loss": 0.8884, + "step": 5293 + }, + { + "epoch": 0.23210598898339538, + "grad_norm": 0.828125, + "learning_rate": 3.0789227013144996e-05, + "loss": 0.7822, + "step": 5294 + }, + { + "epoch": 0.2321498322000526, + "grad_norm": 0.8203125, + "learning_rate": 3.078590413308113e-05, + "loss": 0.7114, + "step": 5295 + }, + { + "epoch": 0.23219367541670985, + "grad_norm": 0.72265625, + "learning_rate": 3.0782581399712626e-05, + "loss": 0.6723, + "step": 5296 + }, + { + "epoch": 0.23223751863336708, + "grad_norm": 0.9140625, + "learning_rate": 3.0779258813046486e-05, + "loss": 0.7854, + "step": 5297 + }, + { + "epoch": 0.23228136185002432, + "grad_norm": 0.83203125, + "learning_rate": 3.077593637308974e-05, + "loss": 0.8124, + "step": 5298 + }, + { + "epoch": 0.23232520506668156, + "grad_norm": 0.8828125, + "learning_rate": 3.0772614079849446e-05, + "loss": 0.9135, + "step": 5299 + }, + { + "epoch": 0.2323690482833388, + "grad_norm": 0.84765625, + "learning_rate": 3.076929193333265e-05, + "loss": 0.7309, + "step": 5300 + }, + { + "epoch": 0.23241289149999603, + "grad_norm": 0.8046875, + "learning_rate": 3.076596993354637e-05, + "loss": 0.7974, + "step": 5301 + }, + { + "epoch": 0.23245673471665323, + "grad_norm": 0.8984375, + "learning_rate": 3.0762648080497655e-05, + "loss": 0.9109, + "step": 5302 + }, + { + "epoch": 0.23250057793331047, + "grad_norm": 0.84375, + "learning_rate": 3.075932637419356e-05, + "loss": 0.8975, + "step": 5303 + }, + { + "epoch": 0.2325444211499677, + "grad_norm": 0.8671875, + "learning_rate": 3.075600481464111e-05, + "loss": 0.7702, + "step": 5304 + }, + { + "epoch": 0.23258826436662494, + "grad_norm": 0.90234375, + "learning_rate": 3.0752683401847326e-05, + "loss": 0.9374, + "step": 5305 + }, + { + "epoch": 0.23263210758328218, + "grad_norm": 0.76953125, + "learning_rate": 3.0749362135819304e-05, + "loss": 0.7358, + "step": 5306 + }, + { + "epoch": 0.2326759507999394, + "grad_norm": 0.875, + "learning_rate": 3.074604101656404e-05, + "loss": 0.8295, + "step": 5307 + }, + { + "epoch": 0.23271979401659665, + "grad_norm": 0.75, + "learning_rate": 3.074272004408859e-05, + "loss": 0.7137, + "step": 5308 + }, + { + "epoch": 0.23276363723325388, + "grad_norm": 0.828125, + "learning_rate": 3.073939921839999e-05, + "loss": 0.7728, + "step": 5309 + }, + { + "epoch": 0.23280748044991112, + "grad_norm": 0.9296875, + "learning_rate": 3.0736078539505234e-05, + "loss": 0.8897, + "step": 5310 + }, + { + "epoch": 0.23285132366656836, + "grad_norm": 0.89453125, + "learning_rate": 3.073275800741143e-05, + "loss": 0.7658, + "step": 5311 + }, + { + "epoch": 0.2328951668832256, + "grad_norm": 0.9296875, + "learning_rate": 3.07294376221256e-05, + "loss": 0.8235, + "step": 5312 + }, + { + "epoch": 0.23293901009988283, + "grad_norm": 0.8125, + "learning_rate": 3.0726117383654754e-05, + "loss": 0.7746, + "step": 5313 + }, + { + "epoch": 0.23298285331654006, + "grad_norm": 0.859375, + "learning_rate": 3.072279729200596e-05, + "loss": 0.8168, + "step": 5314 + }, + { + "epoch": 0.2330266965331973, + "grad_norm": 0.8828125, + "learning_rate": 3.071947734718619e-05, + "loss": 0.8284, + "step": 5315 + }, + { + "epoch": 0.23307053974985453, + "grad_norm": 0.80859375, + "learning_rate": 3.071615754920256e-05, + "loss": 0.7292, + "step": 5316 + }, + { + "epoch": 0.23311438296651174, + "grad_norm": 0.84375, + "learning_rate": 3.071283789806208e-05, + "loss": 0.8217, + "step": 5317 + }, + { + "epoch": 0.23315822618316898, + "grad_norm": 0.91796875, + "learning_rate": 3.070951839377178e-05, + "loss": 0.8686, + "step": 5318 + }, + { + "epoch": 0.2332020693998262, + "grad_norm": 0.80078125, + "learning_rate": 3.070619903633869e-05, + "loss": 0.9203, + "step": 5319 + }, + { + "epoch": 0.23324591261648345, + "grad_norm": 0.91015625, + "learning_rate": 3.0702879825769814e-05, + "loss": 0.8993, + "step": 5320 + }, + { + "epoch": 0.23328975583314068, + "grad_norm": 0.8359375, + "learning_rate": 3.069956076207227e-05, + "loss": 0.8449, + "step": 5321 + }, + { + "epoch": 0.23333359904979792, + "grad_norm": 0.8828125, + "learning_rate": 3.069624184525304e-05, + "loss": 0.9233, + "step": 5322 + }, + { + "epoch": 0.23337744226645515, + "grad_norm": 0.81640625, + "learning_rate": 3.069292307531916e-05, + "loss": 0.7609, + "step": 5323 + }, + { + "epoch": 0.2334212854831124, + "grad_norm": 0.90234375, + "learning_rate": 3.0689604452277677e-05, + "loss": 0.7876, + "step": 5324 + }, + { + "epoch": 0.23346512869976963, + "grad_norm": 0.76953125, + "learning_rate": 3.068628597613558e-05, + "loss": 0.7204, + "step": 5325 + }, + { + "epoch": 0.23350897191642686, + "grad_norm": 0.86328125, + "learning_rate": 3.068296764689996e-05, + "loss": 0.7575, + "step": 5326 + }, + { + "epoch": 0.2335528151330841, + "grad_norm": 0.8359375, + "learning_rate": 3.067964946457784e-05, + "loss": 0.6952, + "step": 5327 + }, + { + "epoch": 0.23359665834974133, + "grad_norm": 0.8203125, + "learning_rate": 3.0676331429176234e-05, + "loss": 0.7917, + "step": 5328 + }, + { + "epoch": 0.23364050156639857, + "grad_norm": 0.78515625, + "learning_rate": 3.0673013540702186e-05, + "loss": 0.755, + "step": 5329 + }, + { + "epoch": 0.2336843447830558, + "grad_norm": 0.80859375, + "learning_rate": 3.066969579916269e-05, + "loss": 0.8873, + "step": 5330 + }, + { + "epoch": 0.23372818799971304, + "grad_norm": 0.8203125, + "learning_rate": 3.066637820456484e-05, + "loss": 0.801, + "step": 5331 + }, + { + "epoch": 0.23377203121637025, + "grad_norm": 0.8125, + "learning_rate": 3.066306075691564e-05, + "loss": 0.8218, + "step": 5332 + }, + { + "epoch": 0.23381587443302748, + "grad_norm": 0.90625, + "learning_rate": 3.065974345622212e-05, + "loss": 0.9119, + "step": 5333 + }, + { + "epoch": 0.23385971764968472, + "grad_norm": 0.83984375, + "learning_rate": 3.065642630249131e-05, + "loss": 0.8891, + "step": 5334 + }, + { + "epoch": 0.23390356086634195, + "grad_norm": 0.86328125, + "learning_rate": 3.065310929573022e-05, + "loss": 0.8504, + "step": 5335 + }, + { + "epoch": 0.2339474040829992, + "grad_norm": 0.83203125, + "learning_rate": 3.064979243594592e-05, + "loss": 0.8789, + "step": 5336 + }, + { + "epoch": 0.23399124729965642, + "grad_norm": 0.8203125, + "learning_rate": 3.064647572314542e-05, + "loss": 0.9114, + "step": 5337 + }, + { + "epoch": 0.23403509051631366, + "grad_norm": 0.87109375, + "learning_rate": 3.064315915733575e-05, + "loss": 0.8143, + "step": 5338 + }, + { + "epoch": 0.2340789337329709, + "grad_norm": 0.8671875, + "learning_rate": 3.0639842738523947e-05, + "loss": 0.7194, + "step": 5339 + }, + { + "epoch": 0.23412277694962813, + "grad_norm": 0.8046875, + "learning_rate": 3.063652646671704e-05, + "loss": 0.7707, + "step": 5340 + }, + { + "epoch": 0.23416662016628537, + "grad_norm": 0.828125, + "learning_rate": 3.063321034192204e-05, + "loss": 0.7843, + "step": 5341 + }, + { + "epoch": 0.2342104633829426, + "grad_norm": 0.80859375, + "learning_rate": 3.062989436414595e-05, + "loss": 0.9002, + "step": 5342 + }, + { + "epoch": 0.23425430659959984, + "grad_norm": 0.796875, + "learning_rate": 3.062657853339588e-05, + "loss": 0.6504, + "step": 5343 + }, + { + "epoch": 0.23429814981625707, + "grad_norm": 0.76171875, + "learning_rate": 3.0623262849678805e-05, + "loss": 0.7017, + "step": 5344 + }, + { + "epoch": 0.2343419930329143, + "grad_norm": 0.8515625, + "learning_rate": 3.061994731300175e-05, + "loss": 0.7815, + "step": 5345 + }, + { + "epoch": 0.23438583624957154, + "grad_norm": 0.7734375, + "learning_rate": 3.061663192337175e-05, + "loss": 0.8808, + "step": 5346 + }, + { + "epoch": 0.23442967946622878, + "grad_norm": 0.8125, + "learning_rate": 3.0613316680795844e-05, + "loss": 0.7691, + "step": 5347 + }, + { + "epoch": 0.234473522682886, + "grad_norm": 0.83203125, + "learning_rate": 3.0610001585281047e-05, + "loss": 0.9008, + "step": 5348 + }, + { + "epoch": 0.23451736589954322, + "grad_norm": 0.796875, + "learning_rate": 3.0606686636834384e-05, + "loss": 0.7931, + "step": 5349 + }, + { + "epoch": 0.23456120911620046, + "grad_norm": 0.96875, + "learning_rate": 3.060337183546284e-05, + "loss": 0.9518, + "step": 5350 + }, + { + "epoch": 0.2346050523328577, + "grad_norm": 0.83984375, + "learning_rate": 3.060005718117354e-05, + "loss": 0.6906, + "step": 5351 + }, + { + "epoch": 0.23464889554951493, + "grad_norm": 0.84765625, + "learning_rate": 3.059674267397342e-05, + "loss": 0.8778, + "step": 5352 + }, + { + "epoch": 0.23469273876617217, + "grad_norm": 0.82421875, + "learning_rate": 3.0593428313869564e-05, + "loss": 0.9273, + "step": 5353 + }, + { + "epoch": 0.2347365819828294, + "grad_norm": 0.8359375, + "learning_rate": 3.059011410086895e-05, + "loss": 0.5959, + "step": 5354 + }, + { + "epoch": 0.23478042519948664, + "grad_norm": 0.8125, + "learning_rate": 3.05868000349786e-05, + "loss": 0.8607, + "step": 5355 + }, + { + "epoch": 0.23482426841614387, + "grad_norm": 0.78125, + "learning_rate": 3.058348611620559e-05, + "loss": 0.8983, + "step": 5356 + }, + { + "epoch": 0.2348681116328011, + "grad_norm": 0.765625, + "learning_rate": 3.0580172344556915e-05, + "loss": 0.7524, + "step": 5357 + }, + { + "epoch": 0.23491195484945834, + "grad_norm": 0.8125, + "learning_rate": 3.05768587200396e-05, + "loss": 0.7687, + "step": 5358 + }, + { + "epoch": 0.23495579806611558, + "grad_norm": 0.78515625, + "learning_rate": 3.057354524266066e-05, + "loss": 0.7383, + "step": 5359 + }, + { + "epoch": 0.23499964128277281, + "grad_norm": 0.77734375, + "learning_rate": 3.0570231912427095e-05, + "loss": 0.7872, + "step": 5360 + }, + { + "epoch": 0.23504348449943005, + "grad_norm": 0.88671875, + "learning_rate": 3.0566918729345976e-05, + "loss": 0.8538, + "step": 5361 + }, + { + "epoch": 0.23508732771608729, + "grad_norm": 0.828125, + "learning_rate": 3.0563605693424315e-05, + "loss": 0.8584, + "step": 5362 + }, + { + "epoch": 0.2351311709327445, + "grad_norm": 0.8828125, + "learning_rate": 3.056029280466912e-05, + "loss": 0.8578, + "step": 5363 + }, + { + "epoch": 0.23517501414940173, + "grad_norm": 0.77734375, + "learning_rate": 3.055698006308742e-05, + "loss": 0.7993, + "step": 5364 + }, + { + "epoch": 0.23521885736605896, + "grad_norm": 0.80078125, + "learning_rate": 3.055366746868619e-05, + "loss": 0.8387, + "step": 5365 + }, + { + "epoch": 0.2352627005827162, + "grad_norm": 1.1328125, + "learning_rate": 3.0550355021472524e-05, + "loss": 0.9297, + "step": 5366 + }, + { + "epoch": 0.23530654379937344, + "grad_norm": 0.859375, + "learning_rate": 3.0547042721453414e-05, + "loss": 0.8261, + "step": 5367 + }, + { + "epoch": 0.23535038701603067, + "grad_norm": 0.79296875, + "learning_rate": 3.054373056863589e-05, + "loss": 0.8579, + "step": 5368 + }, + { + "epoch": 0.2353942302326879, + "grad_norm": 0.859375, + "learning_rate": 3.054041856302694e-05, + "loss": 0.9427, + "step": 5369 + }, + { + "epoch": 0.23543807344934514, + "grad_norm": 0.8203125, + "learning_rate": 3.0537106704633576e-05, + "loss": 0.8562, + "step": 5370 + }, + { + "epoch": 0.23548191666600238, + "grad_norm": 0.85546875, + "learning_rate": 3.0533794993462886e-05, + "loss": 0.8366, + "step": 5371 + }, + { + "epoch": 0.2355257598826596, + "grad_norm": 0.83984375, + "learning_rate": 3.053048342952184e-05, + "loss": 0.8949, + "step": 5372 + }, + { + "epoch": 0.23556960309931685, + "grad_norm": 0.71484375, + "learning_rate": 3.052717201281746e-05, + "loss": 0.7137, + "step": 5373 + }, + { + "epoch": 0.23561344631597408, + "grad_norm": 0.890625, + "learning_rate": 3.052386074335677e-05, + "loss": 0.8763, + "step": 5374 + }, + { + "epoch": 0.23565728953263132, + "grad_norm": 0.87109375, + "learning_rate": 3.052054962114676e-05, + "loss": 0.8061, + "step": 5375 + }, + { + "epoch": 0.23570113274928856, + "grad_norm": 0.828125, + "learning_rate": 3.05172386461945e-05, + "loss": 0.933, + "step": 5376 + }, + { + "epoch": 0.2357449759659458, + "grad_norm": 0.77734375, + "learning_rate": 3.051392781850698e-05, + "loss": 0.7063, + "step": 5377 + }, + { + "epoch": 0.235788819182603, + "grad_norm": 0.78125, + "learning_rate": 3.0510617138091224e-05, + "loss": 0.6828, + "step": 5378 + }, + { + "epoch": 0.23583266239926023, + "grad_norm": 0.7578125, + "learning_rate": 3.0507306604954233e-05, + "loss": 0.7552, + "step": 5379 + }, + { + "epoch": 0.23587650561591747, + "grad_norm": 0.82421875, + "learning_rate": 3.0503996219103016e-05, + "loss": 0.8921, + "step": 5380 + }, + { + "epoch": 0.2359203488325747, + "grad_norm": 0.82421875, + "learning_rate": 3.0500685980544628e-05, + "loss": 0.724, + "step": 5381 + }, + { + "epoch": 0.23596419204923194, + "grad_norm": 0.82421875, + "learning_rate": 3.0497375889286073e-05, + "loss": 0.7803, + "step": 5382 + }, + { + "epoch": 0.23600803526588918, + "grad_norm": 0.87109375, + "learning_rate": 3.049406594533435e-05, + "loss": 0.827, + "step": 5383 + }, + { + "epoch": 0.2360518784825464, + "grad_norm": 0.72265625, + "learning_rate": 3.0490756148696474e-05, + "loss": 0.8137, + "step": 5384 + }, + { + "epoch": 0.23609572169920365, + "grad_norm": 0.80078125, + "learning_rate": 3.048744649937948e-05, + "loss": 0.8313, + "step": 5385 + }, + { + "epoch": 0.23613956491586088, + "grad_norm": 0.83984375, + "learning_rate": 3.0484136997390366e-05, + "loss": 0.8857, + "step": 5386 + }, + { + "epoch": 0.23618340813251812, + "grad_norm": 0.98828125, + "learning_rate": 3.0480827642736154e-05, + "loss": 0.9493, + "step": 5387 + }, + { + "epoch": 0.23622725134917535, + "grad_norm": 0.74609375, + "learning_rate": 3.0477518435423846e-05, + "loss": 0.6741, + "step": 5388 + }, + { + "epoch": 0.2362710945658326, + "grad_norm": 0.7734375, + "learning_rate": 3.047420937546047e-05, + "loss": 0.7057, + "step": 5389 + }, + { + "epoch": 0.23631493778248983, + "grad_norm": 0.87109375, + "learning_rate": 3.0470900462853002e-05, + "loss": 0.7788, + "step": 5390 + }, + { + "epoch": 0.23635878099914706, + "grad_norm": 0.8515625, + "learning_rate": 3.046759169760852e-05, + "loss": 0.9394, + "step": 5391 + }, + { + "epoch": 0.2364026242158043, + "grad_norm": 0.73046875, + "learning_rate": 3.0464283079734002e-05, + "loss": 0.7924, + "step": 5392 + }, + { + "epoch": 0.2364464674324615, + "grad_norm": 0.9453125, + "learning_rate": 3.0460974609236458e-05, + "loss": 0.8135, + "step": 5393 + }, + { + "epoch": 0.23649031064911874, + "grad_norm": 0.859375, + "learning_rate": 3.04576662861229e-05, + "loss": 0.9246, + "step": 5394 + }, + { + "epoch": 0.23653415386577598, + "grad_norm": 0.8671875, + "learning_rate": 3.045435811040035e-05, + "loss": 0.791, + "step": 5395 + }, + { + "epoch": 0.2365779970824332, + "grad_norm": 0.8828125, + "learning_rate": 3.0451050082075782e-05, + "loss": 0.8652, + "step": 5396 + }, + { + "epoch": 0.23662184029909045, + "grad_norm": 0.87109375, + "learning_rate": 3.044774220115627e-05, + "loss": 0.7053, + "step": 5397 + }, + { + "epoch": 0.23666568351574768, + "grad_norm": 0.85546875, + "learning_rate": 3.0444434467648787e-05, + "loss": 0.7971, + "step": 5398 + }, + { + "epoch": 0.23670952673240492, + "grad_norm": 0.7890625, + "learning_rate": 3.0441126881560355e-05, + "loss": 0.8588, + "step": 5399 + }, + { + "epoch": 0.23675336994906215, + "grad_norm": 0.86328125, + "learning_rate": 3.0437819442897974e-05, + "loss": 0.821, + "step": 5400 + }, + { + "epoch": 0.2367972131657194, + "grad_norm": 0.86328125, + "learning_rate": 3.0434512151668636e-05, + "loss": 0.8015, + "step": 5401 + }, + { + "epoch": 0.23684105638237662, + "grad_norm": 0.80859375, + "learning_rate": 3.0431205007879393e-05, + "loss": 0.811, + "step": 5402 + }, + { + "epoch": 0.23688489959903386, + "grad_norm": 0.921875, + "learning_rate": 3.0427898011537248e-05, + "loss": 0.7811, + "step": 5403 + }, + { + "epoch": 0.2369287428156911, + "grad_norm": 0.8125, + "learning_rate": 3.0424591162649186e-05, + "loss": 0.7878, + "step": 5404 + }, + { + "epoch": 0.23697258603234833, + "grad_norm": 0.8046875, + "learning_rate": 3.0421284461222233e-05, + "loss": 0.8972, + "step": 5405 + }, + { + "epoch": 0.23701642924900557, + "grad_norm": 0.84765625, + "learning_rate": 3.0417977907263352e-05, + "loss": 0.882, + "step": 5406 + }, + { + "epoch": 0.2370602724656628, + "grad_norm": 0.8046875, + "learning_rate": 3.0414671500779623e-05, + "loss": 0.928, + "step": 5407 + }, + { + "epoch": 0.23710411568232, + "grad_norm": 0.7734375, + "learning_rate": 3.0411365241778022e-05, + "loss": 0.7947, + "step": 5408 + }, + { + "epoch": 0.23714795889897725, + "grad_norm": 0.890625, + "learning_rate": 3.0408059130265555e-05, + "loss": 0.9074, + "step": 5409 + }, + { + "epoch": 0.23719180211563448, + "grad_norm": 0.859375, + "learning_rate": 3.0404753166249232e-05, + "loss": 0.8893, + "step": 5410 + }, + { + "epoch": 0.23723564533229172, + "grad_norm": 0.80859375, + "learning_rate": 3.0401447349736023e-05, + "loss": 0.7268, + "step": 5411 + }, + { + "epoch": 0.23727948854894895, + "grad_norm": 0.76953125, + "learning_rate": 3.0398141680732995e-05, + "loss": 0.7487, + "step": 5412 + }, + { + "epoch": 0.2373233317656062, + "grad_norm": 0.82421875, + "learning_rate": 3.039483615924713e-05, + "loss": 0.8839, + "step": 5413 + }, + { + "epoch": 0.23736717498226342, + "grad_norm": 0.890625, + "learning_rate": 3.039153078528544e-05, + "loss": 0.7217, + "step": 5414 + }, + { + "epoch": 0.23741101819892066, + "grad_norm": 0.77734375, + "learning_rate": 3.038822555885491e-05, + "loss": 0.8096, + "step": 5415 + }, + { + "epoch": 0.2374548614155779, + "grad_norm": 0.78515625, + "learning_rate": 3.038492047996253e-05, + "loss": 0.7877, + "step": 5416 + }, + { + "epoch": 0.23749870463223513, + "grad_norm": 0.90625, + "learning_rate": 3.038161554861536e-05, + "loss": 0.8185, + "step": 5417 + }, + { + "epoch": 0.23754254784889237, + "grad_norm": 0.72265625, + "learning_rate": 3.0378310764820384e-05, + "loss": 0.6987, + "step": 5418 + }, + { + "epoch": 0.2375863910655496, + "grad_norm": 0.8359375, + "learning_rate": 3.0375006128584595e-05, + "loss": 0.9077, + "step": 5419 + }, + { + "epoch": 0.23763023428220684, + "grad_norm": 0.8515625, + "learning_rate": 3.0371701639915006e-05, + "loss": 0.8008, + "step": 5420 + }, + { + "epoch": 0.23767407749886407, + "grad_norm": 0.796875, + "learning_rate": 3.0368397298818574e-05, + "loss": 0.8306, + "step": 5421 + }, + { + "epoch": 0.2377179207155213, + "grad_norm": 0.859375, + "learning_rate": 3.036509310530239e-05, + "loss": 0.8284, + "step": 5422 + }, + { + "epoch": 0.23776176393217852, + "grad_norm": 0.83984375, + "learning_rate": 3.0361789059373402e-05, + "loss": 0.7768, + "step": 5423 + }, + { + "epoch": 0.23780560714883575, + "grad_norm": 0.8203125, + "learning_rate": 3.035848516103862e-05, + "loss": 0.8364, + "step": 5424 + }, + { + "epoch": 0.237849450365493, + "grad_norm": 0.83203125, + "learning_rate": 3.0355181410305045e-05, + "loss": 0.8376, + "step": 5425 + }, + { + "epoch": 0.23789329358215022, + "grad_norm": 0.78125, + "learning_rate": 3.0351877807179685e-05, + "loss": 0.8734, + "step": 5426 + }, + { + "epoch": 0.23793713679880746, + "grad_norm": 0.9296875, + "learning_rate": 3.034857435166951e-05, + "loss": 0.7605, + "step": 5427 + }, + { + "epoch": 0.2379809800154647, + "grad_norm": 0.8203125, + "learning_rate": 3.0345271043781588e-05, + "loss": 0.6659, + "step": 5428 + }, + { + "epoch": 0.23802482323212193, + "grad_norm": 0.8515625, + "learning_rate": 3.034196788352287e-05, + "loss": 0.865, + "step": 5429 + }, + { + "epoch": 0.23806866644877916, + "grad_norm": 0.87890625, + "learning_rate": 3.033866487090038e-05, + "loss": 0.8734, + "step": 5430 + }, + { + "epoch": 0.2381125096654364, + "grad_norm": 0.92578125, + "learning_rate": 3.033536200592111e-05, + "loss": 0.9917, + "step": 5431 + }, + { + "epoch": 0.23815635288209364, + "grad_norm": 0.86328125, + "learning_rate": 3.033205928859205e-05, + "loss": 0.7571, + "step": 5432 + }, + { + "epoch": 0.23820019609875087, + "grad_norm": 0.87109375, + "learning_rate": 3.0328756718920217e-05, + "loss": 0.7834, + "step": 5433 + }, + { + "epoch": 0.2382440393154081, + "grad_norm": 0.8984375, + "learning_rate": 3.032545429691259e-05, + "loss": 0.8357, + "step": 5434 + }, + { + "epoch": 0.23828788253206534, + "grad_norm": 0.84765625, + "learning_rate": 3.032215202257619e-05, + "loss": 0.842, + "step": 5435 + }, + { + "epoch": 0.23833172574872258, + "grad_norm": 0.80078125, + "learning_rate": 3.0318849895917966e-05, + "loss": 0.8017, + "step": 5436 + }, + { + "epoch": 0.2383755689653798, + "grad_norm": 0.9375, + "learning_rate": 3.0315547916944997e-05, + "loss": 0.8983, + "step": 5437 + }, + { + "epoch": 0.23841941218203705, + "grad_norm": 0.75390625, + "learning_rate": 3.031224608566423e-05, + "loss": 0.7886, + "step": 5438 + }, + { + "epoch": 0.23846325539869426, + "grad_norm": 0.8125, + "learning_rate": 3.0308944402082685e-05, + "loss": 0.7818, + "step": 5439 + }, + { + "epoch": 0.2385070986153515, + "grad_norm": 0.84375, + "learning_rate": 3.0305642866207352e-05, + "loss": 0.7358, + "step": 5440 + }, + { + "epoch": 0.23855094183200873, + "grad_norm": 0.8203125, + "learning_rate": 3.0302341478045184e-05, + "loss": 0.7667, + "step": 5441 + }, + { + "epoch": 0.23859478504866596, + "grad_norm": 1.28125, + "learning_rate": 3.029904023760326e-05, + "loss": 0.8673, + "step": 5442 + }, + { + "epoch": 0.2386386282653232, + "grad_norm": 0.92578125, + "learning_rate": 3.0295739144888525e-05, + "loss": 0.8807, + "step": 5443 + }, + { + "epoch": 0.23868247148198043, + "grad_norm": 0.77734375, + "learning_rate": 3.029243819990799e-05, + "loss": 0.8385, + "step": 5444 + }, + { + "epoch": 0.23872631469863767, + "grad_norm": 0.8125, + "learning_rate": 3.0289137402668654e-05, + "loss": 0.8401, + "step": 5445 + }, + { + "epoch": 0.2387701579152949, + "grad_norm": 0.92578125, + "learning_rate": 3.0285836753177478e-05, + "loss": 0.6882, + "step": 5446 + }, + { + "epoch": 0.23881400113195214, + "grad_norm": 0.84765625, + "learning_rate": 3.0282536251441505e-05, + "loss": 0.832, + "step": 5447 + }, + { + "epoch": 0.23885784434860938, + "grad_norm": 0.80859375, + "learning_rate": 3.0279235897467717e-05, + "loss": 0.8429, + "step": 5448 + }, + { + "epoch": 0.2389016875652666, + "grad_norm": 0.8359375, + "learning_rate": 3.0275935691263112e-05, + "loss": 0.8163, + "step": 5449 + }, + { + "epoch": 0.23894553078192385, + "grad_norm": 0.8828125, + "learning_rate": 3.0272635632834667e-05, + "loss": 0.8353, + "step": 5450 + }, + { + "epoch": 0.23898937399858108, + "grad_norm": 0.87890625, + "learning_rate": 3.026933572218935e-05, + "loss": 0.7941, + "step": 5451 + }, + { + "epoch": 0.23903321721523832, + "grad_norm": 0.8828125, + "learning_rate": 3.026603595933424e-05, + "loss": 0.8283, + "step": 5452 + }, + { + "epoch": 0.23907706043189556, + "grad_norm": 0.91015625, + "learning_rate": 3.0262736344276266e-05, + "loss": 0.8518, + "step": 5453 + }, + { + "epoch": 0.23912090364855276, + "grad_norm": 0.76953125, + "learning_rate": 3.025943687702244e-05, + "loss": 0.7775, + "step": 5454 + }, + { + "epoch": 0.23916474686521, + "grad_norm": 0.8125, + "learning_rate": 3.0256137557579744e-05, + "loss": 0.8157, + "step": 5455 + }, + { + "epoch": 0.23920859008186723, + "grad_norm": 0.77734375, + "learning_rate": 3.025283838595515e-05, + "loss": 0.8298, + "step": 5456 + }, + { + "epoch": 0.23925243329852447, + "grad_norm": 0.81640625, + "learning_rate": 3.0249539362155722e-05, + "loss": 0.7711, + "step": 5457 + }, + { + "epoch": 0.2392962765151817, + "grad_norm": 0.6796875, + "learning_rate": 3.0246240486188392e-05, + "loss": 0.7467, + "step": 5458 + }, + { + "epoch": 0.23934011973183894, + "grad_norm": 1.0078125, + "learning_rate": 3.0242941758060183e-05, + "loss": 0.8681, + "step": 5459 + }, + { + "epoch": 0.23938396294849618, + "grad_norm": 0.90234375, + "learning_rate": 3.0239643177778065e-05, + "loss": 0.8217, + "step": 5460 + }, + { + "epoch": 0.2394278061651534, + "grad_norm": 0.77734375, + "learning_rate": 3.0236344745348998e-05, + "loss": 0.6561, + "step": 5461 + }, + { + "epoch": 0.23947164938181065, + "grad_norm": 1.125, + "learning_rate": 3.0233046460780058e-05, + "loss": 0.9082, + "step": 5462 + }, + { + "epoch": 0.23951549259846788, + "grad_norm": 0.828125, + "learning_rate": 3.0229748324078178e-05, + "loss": 0.8746, + "step": 5463 + }, + { + "epoch": 0.23955933581512512, + "grad_norm": 0.796875, + "learning_rate": 3.0226450335250354e-05, + "loss": 0.7465, + "step": 5464 + }, + { + "epoch": 0.23960317903178235, + "grad_norm": 0.84765625, + "learning_rate": 3.0223152494303587e-05, + "loss": 0.8505, + "step": 5465 + }, + { + "epoch": 0.2396470222484396, + "grad_norm": 0.95703125, + "learning_rate": 3.021985480124483e-05, + "loss": 0.8586, + "step": 5466 + }, + { + "epoch": 0.23969086546509683, + "grad_norm": 0.78125, + "learning_rate": 3.0216557256081123e-05, + "loss": 0.6775, + "step": 5467 + }, + { + "epoch": 0.23973470868175406, + "grad_norm": 0.78515625, + "learning_rate": 3.0213259858819453e-05, + "loss": 0.845, + "step": 5468 + }, + { + "epoch": 0.23977855189841127, + "grad_norm": 0.828125, + "learning_rate": 3.020996260946678e-05, + "loss": 0.9027, + "step": 5469 + }, + { + "epoch": 0.2398223951150685, + "grad_norm": 0.7734375, + "learning_rate": 3.0206665508030095e-05, + "loss": 0.724, + "step": 5470 + }, + { + "epoch": 0.23986623833172574, + "grad_norm": 0.875, + "learning_rate": 3.02033685545164e-05, + "loss": 0.9814, + "step": 5471 + }, + { + "epoch": 0.23991008154838298, + "grad_norm": 0.828125, + "learning_rate": 3.0200071748932678e-05, + "loss": 0.8219, + "step": 5472 + }, + { + "epoch": 0.2399539247650402, + "grad_norm": 0.8125, + "learning_rate": 3.019677509128591e-05, + "loss": 0.7908, + "step": 5473 + }, + { + "epoch": 0.23999776798169745, + "grad_norm": 0.828125, + "learning_rate": 3.019347858158309e-05, + "loss": 0.7244, + "step": 5474 + }, + { + "epoch": 0.24004161119835468, + "grad_norm": 0.86328125, + "learning_rate": 3.0190182219831208e-05, + "loss": 0.8568, + "step": 5475 + }, + { + "epoch": 0.24008545441501192, + "grad_norm": 0.78515625, + "learning_rate": 3.0186886006037206e-05, + "loss": 0.7057, + "step": 5476 + }, + { + "epoch": 0.24012929763166915, + "grad_norm": 0.8984375, + "learning_rate": 3.0183589940208133e-05, + "loss": 0.8909, + "step": 5477 + }, + { + "epoch": 0.2401731408483264, + "grad_norm": 0.87109375, + "learning_rate": 3.0180294022350963e-05, + "loss": 0.7956, + "step": 5478 + }, + { + "epoch": 0.24021698406498362, + "grad_norm": 0.91796875, + "learning_rate": 3.0176998252472665e-05, + "loss": 0.7267, + "step": 5479 + }, + { + "epoch": 0.24026082728164086, + "grad_norm": 0.8515625, + "learning_rate": 3.0173702630580235e-05, + "loss": 0.8675, + "step": 5480 + }, + { + "epoch": 0.2403046704982981, + "grad_norm": 0.8046875, + "learning_rate": 3.0170407156680615e-05, + "loss": 0.8087, + "step": 5481 + }, + { + "epoch": 0.24034851371495533, + "grad_norm": 0.80078125, + "learning_rate": 3.0167111830780846e-05, + "loss": 0.7533, + "step": 5482 + }, + { + "epoch": 0.24039235693161257, + "grad_norm": 1.1953125, + "learning_rate": 3.01638166528879e-05, + "loss": 0.8573, + "step": 5483 + }, + { + "epoch": 0.24043620014826977, + "grad_norm": 0.90625, + "learning_rate": 3.0160521623008763e-05, + "loss": 0.7714, + "step": 5484 + }, + { + "epoch": 0.240480043364927, + "grad_norm": 0.86328125, + "learning_rate": 3.01572267411504e-05, + "loss": 0.9052, + "step": 5485 + }, + { + "epoch": 0.24052388658158425, + "grad_norm": 0.8125, + "learning_rate": 3.0153932007319764e-05, + "loss": 0.7258, + "step": 5486 + }, + { + "epoch": 0.24056772979824148, + "grad_norm": 0.953125, + "learning_rate": 3.015063742152392e-05, + "loss": 0.8215, + "step": 5487 + }, + { + "epoch": 0.24061157301489872, + "grad_norm": 0.8125, + "learning_rate": 3.01473429837698e-05, + "loss": 0.778, + "step": 5488 + }, + { + "epoch": 0.24065541623155595, + "grad_norm": 0.76171875, + "learning_rate": 3.0144048694064397e-05, + "loss": 0.8342, + "step": 5489 + }, + { + "epoch": 0.2406992594482132, + "grad_norm": 0.88671875, + "learning_rate": 3.0140754552414697e-05, + "loss": 0.8958, + "step": 5490 + }, + { + "epoch": 0.24074310266487042, + "grad_norm": 1.1171875, + "learning_rate": 3.0137460558827634e-05, + "loss": 0.774, + "step": 5491 + }, + { + "epoch": 0.24078694588152766, + "grad_norm": 0.76953125, + "learning_rate": 3.0134166713310264e-05, + "loss": 0.7356, + "step": 5492 + }, + { + "epoch": 0.2408307890981849, + "grad_norm": 1.0390625, + "learning_rate": 3.013087301586953e-05, + "loss": 0.8728, + "step": 5493 + }, + { + "epoch": 0.24087463231484213, + "grad_norm": 0.875, + "learning_rate": 3.0127579466512424e-05, + "loss": 0.8503, + "step": 5494 + }, + { + "epoch": 0.24091847553149937, + "grad_norm": 0.83203125, + "learning_rate": 3.0124286065245922e-05, + "loss": 0.8234, + "step": 5495 + }, + { + "epoch": 0.2409623187481566, + "grad_norm": 1.78125, + "learning_rate": 3.0120992812076965e-05, + "loss": 0.8263, + "step": 5496 + }, + { + "epoch": 0.24100616196481384, + "grad_norm": 1.1015625, + "learning_rate": 3.0117699707012603e-05, + "loss": 0.8463, + "step": 5497 + }, + { + "epoch": 0.24105000518147107, + "grad_norm": 0.79296875, + "learning_rate": 3.0114406750059774e-05, + "loss": 0.8234, + "step": 5498 + }, + { + "epoch": 0.24109384839812828, + "grad_norm": 0.84375, + "learning_rate": 3.0111113941225477e-05, + "loss": 0.872, + "step": 5499 + }, + { + "epoch": 0.24113769161478552, + "grad_norm": 0.890625, + "learning_rate": 3.010782128051668e-05, + "loss": 0.893, + "step": 5500 + }, + { + "epoch": 0.24118153483144275, + "grad_norm": 0.7578125, + "learning_rate": 3.010452876794032e-05, + "loss": 0.7912, + "step": 5501 + }, + { + "epoch": 0.2412253780481, + "grad_norm": 0.90234375, + "learning_rate": 3.010123640350345e-05, + "loss": 0.7638, + "step": 5502 + }, + { + "epoch": 0.24126922126475722, + "grad_norm": 0.859375, + "learning_rate": 3.0097944187213013e-05, + "loss": 0.8116, + "step": 5503 + }, + { + "epoch": 0.24131306448141446, + "grad_norm": 0.7734375, + "learning_rate": 3.0094652119075994e-05, + "loss": 0.8407, + "step": 5504 + }, + { + "epoch": 0.2413569076980717, + "grad_norm": 0.86328125, + "learning_rate": 3.0091360199099362e-05, + "loss": 0.777, + "step": 5505 + }, + { + "epoch": 0.24140075091472893, + "grad_norm": 0.8359375, + "learning_rate": 3.0088068427290063e-05, + "loss": 0.7913, + "step": 5506 + }, + { + "epoch": 0.24144459413138616, + "grad_norm": 0.8984375, + "learning_rate": 3.0084776803655145e-05, + "loss": 0.7484, + "step": 5507 + }, + { + "epoch": 0.2414884373480434, + "grad_norm": 0.80859375, + "learning_rate": 3.0081485328201542e-05, + "loss": 0.8681, + "step": 5508 + }, + { + "epoch": 0.24153228056470064, + "grad_norm": 0.8515625, + "learning_rate": 3.0078194000936232e-05, + "loss": 0.8096, + "step": 5509 + }, + { + "epoch": 0.24157612378135787, + "grad_norm": 0.91015625, + "learning_rate": 3.007490282186619e-05, + "loss": 0.9051, + "step": 5510 + }, + { + "epoch": 0.2416199669980151, + "grad_norm": 0.7421875, + "learning_rate": 3.00716117909984e-05, + "loss": 0.8037, + "step": 5511 + }, + { + "epoch": 0.24166381021467234, + "grad_norm": 0.8125, + "learning_rate": 3.006832090833983e-05, + "loss": 0.765, + "step": 5512 + }, + { + "epoch": 0.24170765343132958, + "grad_norm": 0.80078125, + "learning_rate": 3.0065030173897436e-05, + "loss": 0.8335, + "step": 5513 + }, + { + "epoch": 0.24175149664798679, + "grad_norm": 0.9375, + "learning_rate": 3.006173958767823e-05, + "loss": 0.9047, + "step": 5514 + }, + { + "epoch": 0.24179533986464402, + "grad_norm": 0.76953125, + "learning_rate": 3.005844914968917e-05, + "loss": 0.7815, + "step": 5515 + }, + { + "epoch": 0.24183918308130126, + "grad_norm": 0.80078125, + "learning_rate": 3.005515885993724e-05, + "loss": 0.8062, + "step": 5516 + }, + { + "epoch": 0.2418830262979585, + "grad_norm": 0.796875, + "learning_rate": 3.00518687184294e-05, + "loss": 0.8949, + "step": 5517 + }, + { + "epoch": 0.24192686951461573, + "grad_norm": 0.8515625, + "learning_rate": 3.0048578725172616e-05, + "loss": 0.9244, + "step": 5518 + }, + { + "epoch": 0.24197071273127296, + "grad_norm": 0.9453125, + "learning_rate": 3.0045288880173883e-05, + "loss": 0.9384, + "step": 5519 + }, + { + "epoch": 0.2420145559479302, + "grad_norm": 0.74609375, + "learning_rate": 3.0041999183440162e-05, + "loss": 0.8204, + "step": 5520 + }, + { + "epoch": 0.24205839916458743, + "grad_norm": 0.7734375, + "learning_rate": 3.003870963497838e-05, + "loss": 0.8234, + "step": 5521 + }, + { + "epoch": 0.24210224238124467, + "grad_norm": 0.87890625, + "learning_rate": 3.0035420234795597e-05, + "loss": 0.706, + "step": 5522 + }, + { + "epoch": 0.2421460855979019, + "grad_norm": 0.83203125, + "learning_rate": 3.0032130982898742e-05, + "loss": 0.9044, + "step": 5523 + }, + { + "epoch": 0.24218992881455914, + "grad_norm": 0.80078125, + "learning_rate": 3.0028841879294777e-05, + "loss": 0.9011, + "step": 5524 + }, + { + "epoch": 0.24223377203121638, + "grad_norm": 0.8046875, + "learning_rate": 3.0025552923990696e-05, + "loss": 0.73, + "step": 5525 + }, + { + "epoch": 0.2422776152478736, + "grad_norm": 0.9609375, + "learning_rate": 3.0022264116993414e-05, + "loss": 0.7817, + "step": 5526 + }, + { + "epoch": 0.24232145846453085, + "grad_norm": 0.90625, + "learning_rate": 3.0018975458309985e-05, + "loss": 0.7896, + "step": 5527 + }, + { + "epoch": 0.24236530168118808, + "grad_norm": 0.81640625, + "learning_rate": 3.001568694794733e-05, + "loss": 0.8647, + "step": 5528 + }, + { + "epoch": 0.24240914489784532, + "grad_norm": 0.83984375, + "learning_rate": 3.0012398585912428e-05, + "loss": 0.697, + "step": 5529 + }, + { + "epoch": 0.24245298811450253, + "grad_norm": 0.92578125, + "learning_rate": 3.0009110372212245e-05, + "loss": 0.7664, + "step": 5530 + }, + { + "epoch": 0.24249683133115976, + "grad_norm": 0.734375, + "learning_rate": 3.0005822306853715e-05, + "loss": 0.8688, + "step": 5531 + }, + { + "epoch": 0.242540674547817, + "grad_norm": 0.85546875, + "learning_rate": 3.0002534389843884e-05, + "loss": 0.8179, + "step": 5532 + }, + { + "epoch": 0.24258451776447423, + "grad_norm": 0.88671875, + "learning_rate": 2.999924662118968e-05, + "loss": 0.8012, + "step": 5533 + }, + { + "epoch": 0.24262836098113147, + "grad_norm": 0.8125, + "learning_rate": 2.999595900089808e-05, + "loss": 0.8173, + "step": 5534 + }, + { + "epoch": 0.2426722041977887, + "grad_norm": 0.80078125, + "learning_rate": 2.9992671528976025e-05, + "loss": 0.8093, + "step": 5535 + }, + { + "epoch": 0.24271604741444594, + "grad_norm": 0.84375, + "learning_rate": 2.998938420543048e-05, + "loss": 0.6687, + "step": 5536 + }, + { + "epoch": 0.24275989063110318, + "grad_norm": 0.8359375, + "learning_rate": 2.9986097030268467e-05, + "loss": 0.7254, + "step": 5537 + }, + { + "epoch": 0.2428037338477604, + "grad_norm": 0.81640625, + "learning_rate": 2.998281000349692e-05, + "loss": 0.7665, + "step": 5538 + }, + { + "epoch": 0.24284757706441765, + "grad_norm": 0.84765625, + "learning_rate": 2.99795231251228e-05, + "loss": 0.7478, + "step": 5539 + }, + { + "epoch": 0.24289142028107488, + "grad_norm": 0.80859375, + "learning_rate": 2.997623639515308e-05, + "loss": 0.9144, + "step": 5540 + }, + { + "epoch": 0.24293526349773212, + "grad_norm": 0.9296875, + "learning_rate": 2.997294981359472e-05, + "loss": 0.8546, + "step": 5541 + }, + { + "epoch": 0.24297910671438935, + "grad_norm": 0.828125, + "learning_rate": 2.996966338045467e-05, + "loss": 0.8848, + "step": 5542 + }, + { + "epoch": 0.2430229499310466, + "grad_norm": 0.890625, + "learning_rate": 2.996637709573994e-05, + "loss": 0.7858, + "step": 5543 + }, + { + "epoch": 0.24306679314770382, + "grad_norm": 0.78515625, + "learning_rate": 2.9963090959457473e-05, + "loss": 0.8067, + "step": 5544 + }, + { + "epoch": 0.24311063636436103, + "grad_norm": 0.8671875, + "learning_rate": 2.9959804971614226e-05, + "loss": 0.8318, + "step": 5545 + }, + { + "epoch": 0.24315447958101827, + "grad_norm": 0.7890625, + "learning_rate": 2.9956519132217176e-05, + "loss": 0.7419, + "step": 5546 + }, + { + "epoch": 0.2431983227976755, + "grad_norm": 0.87109375, + "learning_rate": 2.9953233441273242e-05, + "loss": 0.827, + "step": 5547 + }, + { + "epoch": 0.24324216601433274, + "grad_norm": 0.8203125, + "learning_rate": 2.9949947898789467e-05, + "loss": 0.7361, + "step": 5548 + }, + { + "epoch": 0.24328600923098997, + "grad_norm": 0.84375, + "learning_rate": 2.994666250477277e-05, + "loss": 0.7017, + "step": 5549 + }, + { + "epoch": 0.2433298524476472, + "grad_norm": 0.87890625, + "learning_rate": 2.994337725923011e-05, + "loss": 0.7755, + "step": 5550 + }, + { + "epoch": 0.24337369566430445, + "grad_norm": 0.80078125, + "learning_rate": 2.9940092162168466e-05, + "loss": 0.815, + "step": 5551 + }, + { + "epoch": 0.24341753888096168, + "grad_norm": 0.8515625, + "learning_rate": 2.9936807213594764e-05, + "loss": 0.9429, + "step": 5552 + }, + { + "epoch": 0.24346138209761892, + "grad_norm": 0.84765625, + "learning_rate": 2.9933522413516014e-05, + "loss": 0.8546, + "step": 5553 + }, + { + "epoch": 0.24350522531427615, + "grad_norm": 0.79296875, + "learning_rate": 2.993023776193916e-05, + "loss": 0.882, + "step": 5554 + }, + { + "epoch": 0.2435490685309334, + "grad_norm": 0.796875, + "learning_rate": 2.992695325887117e-05, + "loss": 0.8217, + "step": 5555 + }, + { + "epoch": 0.24359291174759062, + "grad_norm": 0.77734375, + "learning_rate": 2.9923668904319004e-05, + "loss": 0.704, + "step": 5556 + }, + { + "epoch": 0.24363675496424786, + "grad_norm": 0.78515625, + "learning_rate": 2.9920384698289607e-05, + "loss": 0.7582, + "step": 5557 + }, + { + "epoch": 0.2436805981809051, + "grad_norm": 1.0390625, + "learning_rate": 2.991710064078994e-05, + "loss": 0.8182, + "step": 5558 + }, + { + "epoch": 0.24372444139756233, + "grad_norm": 0.81640625, + "learning_rate": 2.991381673182698e-05, + "loss": 0.7094, + "step": 5559 + }, + { + "epoch": 0.24376828461421954, + "grad_norm": 0.90234375, + "learning_rate": 2.991053297140769e-05, + "loss": 0.7929, + "step": 5560 + }, + { + "epoch": 0.24381212783087677, + "grad_norm": 0.90625, + "learning_rate": 2.990724935953897e-05, + "loss": 0.7429, + "step": 5561 + }, + { + "epoch": 0.243855971047534, + "grad_norm": 0.8203125, + "learning_rate": 2.990396589622787e-05, + "loss": 0.7299, + "step": 5562 + }, + { + "epoch": 0.24389981426419124, + "grad_norm": 0.7734375, + "learning_rate": 2.990068258148131e-05, + "loss": 0.8075, + "step": 5563 + }, + { + "epoch": 0.24394365748084848, + "grad_norm": 0.86328125, + "learning_rate": 2.9897399415306237e-05, + "loss": 0.7496, + "step": 5564 + }, + { + "epoch": 0.24398750069750572, + "grad_norm": 0.8125, + "learning_rate": 2.9894116397709625e-05, + "loss": 0.8171, + "step": 5565 + }, + { + "epoch": 0.24403134391416295, + "grad_norm": 0.72265625, + "learning_rate": 2.9890833528698435e-05, + "loss": 0.7985, + "step": 5566 + }, + { + "epoch": 0.2440751871308202, + "grad_norm": 0.8203125, + "learning_rate": 2.9887550808279575e-05, + "loss": 0.7555, + "step": 5567 + }, + { + "epoch": 0.24411903034747742, + "grad_norm": 0.88671875, + "learning_rate": 2.988426823646008e-05, + "loss": 0.897, + "step": 5568 + }, + { + "epoch": 0.24416287356413466, + "grad_norm": 0.75390625, + "learning_rate": 2.9880985813246866e-05, + "loss": 0.7052, + "step": 5569 + }, + { + "epoch": 0.2442067167807919, + "grad_norm": 0.78515625, + "learning_rate": 2.9877703538646894e-05, + "loss": 0.8263, + "step": 5570 + }, + { + "epoch": 0.24425055999744913, + "grad_norm": 2.015625, + "learning_rate": 2.987442141266712e-05, + "loss": 0.8005, + "step": 5571 + }, + { + "epoch": 0.24429440321410636, + "grad_norm": 0.90234375, + "learning_rate": 2.9871139435314476e-05, + "loss": 0.9388, + "step": 5572 + }, + { + "epoch": 0.2443382464307636, + "grad_norm": 0.74609375, + "learning_rate": 2.9867857606595972e-05, + "loss": 0.7439, + "step": 5573 + }, + { + "epoch": 0.24438208964742084, + "grad_norm": 0.86328125, + "learning_rate": 2.9864575926518533e-05, + "loss": 0.9728, + "step": 5574 + }, + { + "epoch": 0.24442593286407804, + "grad_norm": 0.80859375, + "learning_rate": 2.9861294395089125e-05, + "loss": 0.8667, + "step": 5575 + }, + { + "epoch": 0.24446977608073528, + "grad_norm": 0.84375, + "learning_rate": 2.985801301231469e-05, + "loss": 0.8706, + "step": 5576 + }, + { + "epoch": 0.24451361929739251, + "grad_norm": 0.87890625, + "learning_rate": 2.985473177820215e-05, + "loss": 0.8411, + "step": 5577 + }, + { + "epoch": 0.24455746251404975, + "grad_norm": 0.921875, + "learning_rate": 2.985145069275853e-05, + "loss": 0.776, + "step": 5578 + }, + { + "epoch": 0.24460130573070699, + "grad_norm": 0.80859375, + "learning_rate": 2.984816975599075e-05, + "loss": 0.7121, + "step": 5579 + }, + { + "epoch": 0.24464514894736422, + "grad_norm": 0.8046875, + "learning_rate": 2.9844888967905772e-05, + "loss": 0.7588, + "step": 5580 + }, + { + "epoch": 0.24468899216402146, + "grad_norm": 0.82421875, + "learning_rate": 2.984160832851054e-05, + "loss": 0.7469, + "step": 5581 + }, + { + "epoch": 0.2447328353806787, + "grad_norm": 0.79296875, + "learning_rate": 2.9838327837811976e-05, + "loss": 0.8441, + "step": 5582 + }, + { + "epoch": 0.24477667859733593, + "grad_norm": 0.875, + "learning_rate": 2.9835047495817092e-05, + "loss": 0.9546, + "step": 5583 + }, + { + "epoch": 0.24482052181399316, + "grad_norm": 0.84375, + "learning_rate": 2.983176730253282e-05, + "loss": 0.8198, + "step": 5584 + }, + { + "epoch": 0.2448643650306504, + "grad_norm": 0.859375, + "learning_rate": 2.982848725796611e-05, + "loss": 0.9843, + "step": 5585 + }, + { + "epoch": 0.24490820824730763, + "grad_norm": 0.828125, + "learning_rate": 2.9825207362123908e-05, + "loss": 0.8067, + "step": 5586 + }, + { + "epoch": 0.24495205146396487, + "grad_norm": 0.7421875, + "learning_rate": 2.9821927615013134e-05, + "loss": 0.8145, + "step": 5587 + }, + { + "epoch": 0.2449958946806221, + "grad_norm": 0.79296875, + "learning_rate": 2.9818648016640803e-05, + "loss": 0.7951, + "step": 5588 + }, + { + "epoch": 0.24503973789727934, + "grad_norm": 0.77734375, + "learning_rate": 2.9815368567013845e-05, + "loss": 0.717, + "step": 5589 + }, + { + "epoch": 0.24508358111393655, + "grad_norm": 0.8125, + "learning_rate": 2.9812089266139187e-05, + "loss": 0.802, + "step": 5590 + }, + { + "epoch": 0.24512742433059378, + "grad_norm": 0.921875, + "learning_rate": 2.980881011402381e-05, + "loss": 0.9317, + "step": 5591 + }, + { + "epoch": 0.24517126754725102, + "grad_norm": 0.89453125, + "learning_rate": 2.980553111067461e-05, + "loss": 0.8827, + "step": 5592 + }, + { + "epoch": 0.24521511076390826, + "grad_norm": 0.8828125, + "learning_rate": 2.9802252256098605e-05, + "loss": 0.7895, + "step": 5593 + }, + { + "epoch": 0.2452589539805655, + "grad_norm": 0.8359375, + "learning_rate": 2.979897355030271e-05, + "loss": 0.832, + "step": 5594 + }, + { + "epoch": 0.24530279719722273, + "grad_norm": 0.8203125, + "learning_rate": 2.9795694993293887e-05, + "loss": 0.8635, + "step": 5595 + }, + { + "epoch": 0.24534664041387996, + "grad_norm": 0.73828125, + "learning_rate": 2.9792416585079064e-05, + "loss": 0.6665, + "step": 5596 + }, + { + "epoch": 0.2453904836305372, + "grad_norm": 0.77734375, + "learning_rate": 2.978913832566521e-05, + "loss": 0.7814, + "step": 5597 + }, + { + "epoch": 0.24543432684719443, + "grad_norm": 0.7890625, + "learning_rate": 2.9785860215059235e-05, + "loss": 0.9622, + "step": 5598 + }, + { + "epoch": 0.24547817006385167, + "grad_norm": 0.83203125, + "learning_rate": 2.9782582253268143e-05, + "loss": 0.8642, + "step": 5599 + }, + { + "epoch": 0.2455220132805089, + "grad_norm": 0.80078125, + "learning_rate": 2.977930444029886e-05, + "loss": 0.7631, + "step": 5600 + }, + { + "epoch": 0.24556585649716614, + "grad_norm": 0.8671875, + "learning_rate": 2.9776026776158318e-05, + "loss": 0.9303, + "step": 5601 + }, + { + "epoch": 0.24560969971382338, + "grad_norm": 0.79296875, + "learning_rate": 2.9772749260853484e-05, + "loss": 0.7576, + "step": 5602 + }, + { + "epoch": 0.2456535429304806, + "grad_norm": 0.75390625, + "learning_rate": 2.976947189439129e-05, + "loss": 0.6453, + "step": 5603 + }, + { + "epoch": 0.24569738614713785, + "grad_norm": 0.8203125, + "learning_rate": 2.976619467677868e-05, + "loss": 0.8468, + "step": 5604 + }, + { + "epoch": 0.24574122936379506, + "grad_norm": 0.85546875, + "learning_rate": 2.976291760802261e-05, + "loss": 0.7777, + "step": 5605 + }, + { + "epoch": 0.2457850725804523, + "grad_norm": 0.828125, + "learning_rate": 2.975964068813002e-05, + "loss": 0.7387, + "step": 5606 + }, + { + "epoch": 0.24582891579710953, + "grad_norm": 0.875, + "learning_rate": 2.975636391710783e-05, + "loss": 0.8704, + "step": 5607 + }, + { + "epoch": 0.24587275901376676, + "grad_norm": 0.8203125, + "learning_rate": 2.9753087294963035e-05, + "loss": 0.8831, + "step": 5608 + }, + { + "epoch": 0.245916602230424, + "grad_norm": 0.85546875, + "learning_rate": 2.974981082170256e-05, + "loss": 0.8606, + "step": 5609 + }, + { + "epoch": 0.24596044544708123, + "grad_norm": 0.83203125, + "learning_rate": 2.9746534497333356e-05, + "loss": 0.8716, + "step": 5610 + }, + { + "epoch": 0.24600428866373847, + "grad_norm": 0.82421875, + "learning_rate": 2.9743258321862344e-05, + "loss": 0.8308, + "step": 5611 + }, + { + "epoch": 0.2460481318803957, + "grad_norm": 0.81640625, + "learning_rate": 2.9739982295296452e-05, + "loss": 0.8242, + "step": 5612 + }, + { + "epoch": 0.24609197509705294, + "grad_norm": 0.734375, + "learning_rate": 2.973670641764269e-05, + "loss": 0.7302, + "step": 5613 + }, + { + "epoch": 0.24613581831371018, + "grad_norm": 0.76953125, + "learning_rate": 2.973343068890795e-05, + "loss": 0.7054, + "step": 5614 + }, + { + "epoch": 0.2461796615303674, + "grad_norm": 0.76953125, + "learning_rate": 2.97301551090992e-05, + "loss": 0.7554, + "step": 5615 + }, + { + "epoch": 0.24622350474702465, + "grad_norm": 0.87109375, + "learning_rate": 2.9726879678223364e-05, + "loss": 0.8443, + "step": 5616 + }, + { + "epoch": 0.24626734796368188, + "grad_norm": 0.72265625, + "learning_rate": 2.972360439628735e-05, + "loss": 0.8368, + "step": 5617 + }, + { + "epoch": 0.24631119118033912, + "grad_norm": 0.89453125, + "learning_rate": 2.9720329263298186e-05, + "loss": 0.8427, + "step": 5618 + }, + { + "epoch": 0.24635503439699635, + "grad_norm": 1.0, + "learning_rate": 2.9717054279262756e-05, + "loss": 0.8071, + "step": 5619 + }, + { + "epoch": 0.2463988776136536, + "grad_norm": 0.78125, + "learning_rate": 2.9713779444188017e-05, + "loss": 0.7183, + "step": 5620 + }, + { + "epoch": 0.2464427208303108, + "grad_norm": 0.88671875, + "learning_rate": 2.9710504758080915e-05, + "loss": 0.9074, + "step": 5621 + }, + { + "epoch": 0.24648656404696803, + "grad_norm": 0.76953125, + "learning_rate": 2.9707230220948335e-05, + "loss": 0.8485, + "step": 5622 + }, + { + "epoch": 0.24653040726362527, + "grad_norm": 0.83203125, + "learning_rate": 2.9703955832797304e-05, + "loss": 0.8368, + "step": 5623 + }, + { + "epoch": 0.2465742504802825, + "grad_norm": 0.84765625, + "learning_rate": 2.9700681593634706e-05, + "loss": 1.016, + "step": 5624 + }, + { + "epoch": 0.24661809369693974, + "grad_norm": 0.92578125, + "learning_rate": 2.969740750346751e-05, + "loss": 0.9152, + "step": 5625 + }, + { + "epoch": 0.24666193691359697, + "grad_norm": 0.80859375, + "learning_rate": 2.969413356230263e-05, + "loss": 0.6988, + "step": 5626 + }, + { + "epoch": 0.2467057801302542, + "grad_norm": 0.8125, + "learning_rate": 2.9690859770146996e-05, + "loss": 0.7872, + "step": 5627 + }, + { + "epoch": 0.24674962334691145, + "grad_norm": 0.765625, + "learning_rate": 2.968758612700758e-05, + "loss": 0.861, + "step": 5628 + }, + { + "epoch": 0.24679346656356868, + "grad_norm": 0.83984375, + "learning_rate": 2.968431263289132e-05, + "loss": 0.8782, + "step": 5629 + }, + { + "epoch": 0.24683730978022592, + "grad_norm": 0.8046875, + "learning_rate": 2.9681039287805145e-05, + "loss": 0.8011, + "step": 5630 + }, + { + "epoch": 0.24688115299688315, + "grad_norm": 0.8515625, + "learning_rate": 2.9677766091755975e-05, + "loss": 0.9406, + "step": 5631 + }, + { + "epoch": 0.2469249962135404, + "grad_norm": 0.828125, + "learning_rate": 2.967449304475074e-05, + "loss": 0.8425, + "step": 5632 + }, + { + "epoch": 0.24696883943019762, + "grad_norm": 0.89453125, + "learning_rate": 2.967122014679642e-05, + "loss": 0.8661, + "step": 5633 + }, + { + "epoch": 0.24701268264685486, + "grad_norm": 0.8046875, + "learning_rate": 2.966794739789993e-05, + "loss": 0.8234, + "step": 5634 + }, + { + "epoch": 0.2470565258635121, + "grad_norm": 0.7890625, + "learning_rate": 2.9664674798068205e-05, + "loss": 0.7672, + "step": 5635 + }, + { + "epoch": 0.2471003690801693, + "grad_norm": 0.84765625, + "learning_rate": 2.9661402347308187e-05, + "loss": 0.7967, + "step": 5636 + }, + { + "epoch": 0.24714421229682654, + "grad_norm": 0.81640625, + "learning_rate": 2.9658130045626777e-05, + "loss": 0.7684, + "step": 5637 + }, + { + "epoch": 0.24718805551348377, + "grad_norm": 0.91796875, + "learning_rate": 2.965485789303096e-05, + "loss": 0.8961, + "step": 5638 + }, + { + "epoch": 0.247231898730141, + "grad_norm": 0.8984375, + "learning_rate": 2.9651585889527657e-05, + "loss": 0.8303, + "step": 5639 + }, + { + "epoch": 0.24727574194679824, + "grad_norm": 0.86328125, + "learning_rate": 2.9648314035123804e-05, + "loss": 0.7697, + "step": 5640 + }, + { + "epoch": 0.24731958516345548, + "grad_norm": 0.8125, + "learning_rate": 2.964504232982631e-05, + "loss": 0.8534, + "step": 5641 + }, + { + "epoch": 0.24736342838011272, + "grad_norm": 0.84375, + "learning_rate": 2.9641770773642143e-05, + "loss": 0.7847, + "step": 5642 + }, + { + "epoch": 0.24740727159676995, + "grad_norm": 0.88671875, + "learning_rate": 2.963849936657822e-05, + "loss": 0.7873, + "step": 5643 + }, + { + "epoch": 0.2474511148134272, + "grad_norm": 0.83984375, + "learning_rate": 2.9635228108641476e-05, + "loss": 0.6595, + "step": 5644 + }, + { + "epoch": 0.24749495803008442, + "grad_norm": 0.875, + "learning_rate": 2.963195699983884e-05, + "loss": 0.9488, + "step": 5645 + }, + { + "epoch": 0.24753880124674166, + "grad_norm": 0.80859375, + "learning_rate": 2.962868604017721e-05, + "loss": 0.7241, + "step": 5646 + }, + { + "epoch": 0.2475826444633989, + "grad_norm": 0.8359375, + "learning_rate": 2.96254152296636e-05, + "loss": 0.7475, + "step": 5647 + }, + { + "epoch": 0.24762648768005613, + "grad_norm": 0.81640625, + "learning_rate": 2.96221445683049e-05, + "loss": 0.7832, + "step": 5648 + }, + { + "epoch": 0.24767033089671336, + "grad_norm": 1.6015625, + "learning_rate": 2.961887405610805e-05, + "loss": 0.8924, + "step": 5649 + }, + { + "epoch": 0.2477141741133706, + "grad_norm": 0.88671875, + "learning_rate": 2.9615603693079952e-05, + "loss": 0.8534, + "step": 5650 + }, + { + "epoch": 0.2477580173300278, + "grad_norm": 0.84765625, + "learning_rate": 2.9612333479227573e-05, + "loss": 0.7668, + "step": 5651 + }, + { + "epoch": 0.24780186054668504, + "grad_norm": 0.890625, + "learning_rate": 2.9609063414557802e-05, + "loss": 0.7931, + "step": 5652 + }, + { + "epoch": 0.24784570376334228, + "grad_norm": 0.85546875, + "learning_rate": 2.9605793499077618e-05, + "loss": 0.7443, + "step": 5653 + }, + { + "epoch": 0.24788954697999951, + "grad_norm": 0.875, + "learning_rate": 2.9602523732793942e-05, + "loss": 0.85, + "step": 5654 + }, + { + "epoch": 0.24793339019665675, + "grad_norm": 0.78125, + "learning_rate": 2.9599254115713683e-05, + "loss": 0.8574, + "step": 5655 + }, + { + "epoch": 0.24797723341331399, + "grad_norm": 0.8515625, + "learning_rate": 2.9595984647843787e-05, + "loss": 0.7889, + "step": 5656 + }, + { + "epoch": 0.24802107662997122, + "grad_norm": 0.8828125, + "learning_rate": 2.9592715329191145e-05, + "loss": 1.0264, + "step": 5657 + }, + { + "epoch": 0.24806491984662846, + "grad_norm": 0.83203125, + "learning_rate": 2.9589446159762747e-05, + "loss": 0.8299, + "step": 5658 + }, + { + "epoch": 0.2481087630632857, + "grad_norm": 0.82421875, + "learning_rate": 2.9586177139565495e-05, + "loss": 0.7733, + "step": 5659 + }, + { + "epoch": 0.24815260627994293, + "grad_norm": 0.8671875, + "learning_rate": 2.9582908268606324e-05, + "loss": 0.7012, + "step": 5660 + }, + { + "epoch": 0.24819644949660016, + "grad_norm": 0.8828125, + "learning_rate": 2.9579639546892146e-05, + "loss": 0.7925, + "step": 5661 + }, + { + "epoch": 0.2482402927132574, + "grad_norm": 0.80859375, + "learning_rate": 2.957637097442987e-05, + "loss": 0.687, + "step": 5662 + }, + { + "epoch": 0.24828413592991463, + "grad_norm": 0.83984375, + "learning_rate": 2.9573102551226472e-05, + "loss": 0.8584, + "step": 5663 + }, + { + "epoch": 0.24832797914657187, + "grad_norm": 0.78125, + "learning_rate": 2.9569834277288865e-05, + "loss": 0.8004, + "step": 5664 + }, + { + "epoch": 0.2483718223632291, + "grad_norm": 0.859375, + "learning_rate": 2.9566566152623976e-05, + "loss": 0.9392, + "step": 5665 + }, + { + "epoch": 0.2484156655798863, + "grad_norm": 0.89453125, + "learning_rate": 2.9563298177238707e-05, + "loss": 0.9608, + "step": 5666 + }, + { + "epoch": 0.24845950879654355, + "grad_norm": 0.80859375, + "learning_rate": 2.956003035113998e-05, + "loss": 0.7779, + "step": 5667 + }, + { + "epoch": 0.24850335201320078, + "grad_norm": 0.79296875, + "learning_rate": 2.9556762674334782e-05, + "loss": 0.7789, + "step": 5668 + }, + { + "epoch": 0.24854719522985802, + "grad_norm": 0.828125, + "learning_rate": 2.955349514682999e-05, + "loss": 0.8747, + "step": 5669 + }, + { + "epoch": 0.24859103844651526, + "grad_norm": 0.89453125, + "learning_rate": 2.9550227768632534e-05, + "loss": 0.7459, + "step": 5670 + }, + { + "epoch": 0.2486348816631725, + "grad_norm": 0.8046875, + "learning_rate": 2.9546960539749347e-05, + "loss": 0.763, + "step": 5671 + }, + { + "epoch": 0.24867872487982973, + "grad_norm": 0.89453125, + "learning_rate": 2.9543693460187326e-05, + "loss": 0.8942, + "step": 5672 + }, + { + "epoch": 0.24872256809648696, + "grad_norm": 0.78125, + "learning_rate": 2.9540426529953445e-05, + "loss": 0.7771, + "step": 5673 + }, + { + "epoch": 0.2487664113131442, + "grad_norm": 0.81640625, + "learning_rate": 2.95371597490546e-05, + "loss": 0.8388, + "step": 5674 + }, + { + "epoch": 0.24881025452980143, + "grad_norm": 0.87109375, + "learning_rate": 2.9533893117497723e-05, + "loss": 0.8413, + "step": 5675 + }, + { + "epoch": 0.24885409774645867, + "grad_norm": 0.8671875, + "learning_rate": 2.9530626635289726e-05, + "loss": 0.742, + "step": 5676 + }, + { + "epoch": 0.2488979409631159, + "grad_norm": 0.8046875, + "learning_rate": 2.9527360302437513e-05, + "loss": 0.7398, + "step": 5677 + }, + { + "epoch": 0.24894178417977314, + "grad_norm": 0.76171875, + "learning_rate": 2.9524094118948055e-05, + "loss": 0.8314, + "step": 5678 + }, + { + "epoch": 0.24898562739643038, + "grad_norm": 0.93359375, + "learning_rate": 2.9520828084828256e-05, + "loss": 1.047, + "step": 5679 + }, + { + "epoch": 0.2490294706130876, + "grad_norm": 0.91796875, + "learning_rate": 2.9517562200085035e-05, + "loss": 0.9604, + "step": 5680 + }, + { + "epoch": 0.24907331382974482, + "grad_norm": 0.81640625, + "learning_rate": 2.951429646472531e-05, + "loss": 0.8529, + "step": 5681 + }, + { + "epoch": 0.24911715704640205, + "grad_norm": 0.828125, + "learning_rate": 2.9511030878756006e-05, + "loss": 0.7842, + "step": 5682 + }, + { + "epoch": 0.2491610002630593, + "grad_norm": 0.796875, + "learning_rate": 2.9507765442184053e-05, + "loss": 0.8113, + "step": 5683 + }, + { + "epoch": 0.24920484347971653, + "grad_norm": 0.890625, + "learning_rate": 2.950450015501631e-05, + "loss": 0.8552, + "step": 5684 + }, + { + "epoch": 0.24924868669637376, + "grad_norm": 0.8984375, + "learning_rate": 2.9501235017259798e-05, + "loss": 0.8863, + "step": 5685 + }, + { + "epoch": 0.249292529913031, + "grad_norm": 0.84765625, + "learning_rate": 2.9497970028921383e-05, + "loss": 0.8508, + "step": 5686 + }, + { + "epoch": 0.24933637312968823, + "grad_norm": 0.9609375, + "learning_rate": 2.9494705190007987e-05, + "loss": 0.8195, + "step": 5687 + }, + { + "epoch": 0.24938021634634547, + "grad_norm": 0.8359375, + "learning_rate": 2.9491440500526547e-05, + "loss": 0.8926, + "step": 5688 + }, + { + "epoch": 0.2494240595630027, + "grad_norm": 0.7890625, + "learning_rate": 2.9488175960483956e-05, + "loss": 0.8037, + "step": 5689 + }, + { + "epoch": 0.24946790277965994, + "grad_norm": 0.87109375, + "learning_rate": 2.948491156988715e-05, + "loss": 0.8557, + "step": 5690 + }, + { + "epoch": 0.24951174599631717, + "grad_norm": 0.8828125, + "learning_rate": 2.9481647328743046e-05, + "loss": 0.8961, + "step": 5691 + }, + { + "epoch": 0.2495555892129744, + "grad_norm": 0.80078125, + "learning_rate": 2.9478383237058548e-05, + "loss": 0.7861, + "step": 5692 + }, + { + "epoch": 0.24959943242963165, + "grad_norm": 0.86328125, + "learning_rate": 2.947511929484057e-05, + "loss": 0.8983, + "step": 5693 + }, + { + "epoch": 0.24964327564628888, + "grad_norm": 0.859375, + "learning_rate": 2.9471855502096068e-05, + "loss": 0.7514, + "step": 5694 + }, + { + "epoch": 0.24968711886294612, + "grad_norm": 0.74609375, + "learning_rate": 2.946859185883194e-05, + "loss": 0.7037, + "step": 5695 + }, + { + "epoch": 0.24973096207960332, + "grad_norm": 0.8203125, + "learning_rate": 2.946532836505509e-05, + "loss": 0.788, + "step": 5696 + }, + { + "epoch": 0.24977480529626056, + "grad_norm": 0.7578125, + "learning_rate": 2.9462065020772454e-05, + "loss": 0.7037, + "step": 5697 + }, + { + "epoch": 0.2498186485129178, + "grad_norm": 0.78515625, + "learning_rate": 2.945880182599091e-05, + "loss": 0.9418, + "step": 5698 + }, + { + "epoch": 0.24986249172957503, + "grad_norm": 0.9140625, + "learning_rate": 2.945553878071744e-05, + "loss": 0.8073, + "step": 5699 + }, + { + "epoch": 0.24990633494623227, + "grad_norm": 0.796875, + "learning_rate": 2.9452275884958913e-05, + "loss": 0.8167, + "step": 5700 + }, + { + "epoch": 0.2499501781628895, + "grad_norm": 0.765625, + "learning_rate": 2.944901313872226e-05, + "loss": 0.7353, + "step": 5701 + }, + { + "epoch": 0.24999402137954674, + "grad_norm": 0.859375, + "learning_rate": 2.9445750542014382e-05, + "loss": 0.796, + "step": 5702 + }, + { + "epoch": 0.250037864596204, + "grad_norm": 0.87109375, + "learning_rate": 2.944248809484218e-05, + "loss": 0.7881, + "step": 5703 + }, + { + "epoch": 0.2500817078128612, + "grad_norm": 0.765625, + "learning_rate": 2.943922579721262e-05, + "loss": 0.7905, + "step": 5704 + }, + { + "epoch": 0.25012555102951844, + "grad_norm": 0.80859375, + "learning_rate": 2.9435963649132592e-05, + "loss": 0.8098, + "step": 5705 + }, + { + "epoch": 0.2501693942461757, + "grad_norm": 0.82421875, + "learning_rate": 2.943270165060901e-05, + "loss": 0.8114, + "step": 5706 + }, + { + "epoch": 0.2502132374628329, + "grad_norm": 0.94921875, + "learning_rate": 2.9429439801648773e-05, + "loss": 0.7968, + "step": 5707 + }, + { + "epoch": 0.25025708067949015, + "grad_norm": 0.89453125, + "learning_rate": 2.9426178102258784e-05, + "loss": 0.8692, + "step": 5708 + }, + { + "epoch": 0.2503009238961474, + "grad_norm": 0.84375, + "learning_rate": 2.9422916552446e-05, + "loss": 0.7558, + "step": 5709 + }, + { + "epoch": 0.2503447671128046, + "grad_norm": 0.8671875, + "learning_rate": 2.941965515221731e-05, + "loss": 0.639, + "step": 5710 + }, + { + "epoch": 0.25038861032946186, + "grad_norm": 0.83984375, + "learning_rate": 2.9416393901579632e-05, + "loss": 0.8413, + "step": 5711 + }, + { + "epoch": 0.2504324535461191, + "grad_norm": 1.6328125, + "learning_rate": 2.9413132800539867e-05, + "loss": 0.7636, + "step": 5712 + }, + { + "epoch": 0.25047629676277633, + "grad_norm": 0.86328125, + "learning_rate": 2.9409871849104897e-05, + "loss": 0.8543, + "step": 5713 + }, + { + "epoch": 0.25052013997943356, + "grad_norm": 1.0859375, + "learning_rate": 2.940661104728171e-05, + "loss": 0.8435, + "step": 5714 + }, + { + "epoch": 0.2505639831960908, + "grad_norm": 0.8359375, + "learning_rate": 2.9403350395077177e-05, + "loss": 0.7469, + "step": 5715 + }, + { + "epoch": 0.25060782641274804, + "grad_norm": 0.79296875, + "learning_rate": 2.9400089892498206e-05, + "loss": 0.7677, + "step": 5716 + }, + { + "epoch": 0.25065166962940527, + "grad_norm": 0.9453125, + "learning_rate": 2.9396829539551706e-05, + "loss": 0.908, + "step": 5717 + }, + { + "epoch": 0.2506955128460625, + "grad_norm": 0.87890625, + "learning_rate": 2.9393569336244553e-05, + "loss": 0.8221, + "step": 5718 + }, + { + "epoch": 0.2507393560627197, + "grad_norm": 0.81640625, + "learning_rate": 2.939030928258374e-05, + "loss": 0.7991, + "step": 5719 + }, + { + "epoch": 0.2507831992793769, + "grad_norm": 0.75, + "learning_rate": 2.9387049378576113e-05, + "loss": 0.902, + "step": 5720 + }, + { + "epoch": 0.25082704249603416, + "grad_norm": 0.8125, + "learning_rate": 2.938378962422861e-05, + "loss": 0.7605, + "step": 5721 + }, + { + "epoch": 0.2508708857126914, + "grad_norm": 0.87890625, + "learning_rate": 2.9380530019548134e-05, + "loss": 0.7981, + "step": 5722 + }, + { + "epoch": 0.25091472892934863, + "grad_norm": 0.78515625, + "learning_rate": 2.937727056454155e-05, + "loss": 0.7815, + "step": 5723 + }, + { + "epoch": 0.25095857214600586, + "grad_norm": 0.80859375, + "learning_rate": 2.9374011259215826e-05, + "loss": 0.8355, + "step": 5724 + }, + { + "epoch": 0.2510024153626631, + "grad_norm": 0.859375, + "learning_rate": 2.937075210357787e-05, + "loss": 0.8335, + "step": 5725 + }, + { + "epoch": 0.25104625857932034, + "grad_norm": 0.88671875, + "learning_rate": 2.9367493097634547e-05, + "loss": 0.7489, + "step": 5726 + }, + { + "epoch": 0.25109010179597757, + "grad_norm": 0.88671875, + "learning_rate": 2.9364234241392795e-05, + "loss": 0.7303, + "step": 5727 + }, + { + "epoch": 0.2511339450126348, + "grad_norm": 0.7421875, + "learning_rate": 2.9360975534859514e-05, + "loss": 0.7514, + "step": 5728 + }, + { + "epoch": 0.25117778822929204, + "grad_norm": 0.83203125, + "learning_rate": 2.9357716978041604e-05, + "loss": 1.0024, + "step": 5729 + }, + { + "epoch": 0.2512216314459493, + "grad_norm": 0.7109375, + "learning_rate": 2.9354458570945977e-05, + "loss": 0.7957, + "step": 5730 + }, + { + "epoch": 0.2512654746626065, + "grad_norm": 0.76171875, + "learning_rate": 2.935120031357954e-05, + "loss": 0.7539, + "step": 5731 + }, + { + "epoch": 0.25130931787926375, + "grad_norm": 0.8125, + "learning_rate": 2.934794220594915e-05, + "loss": 0.8112, + "step": 5732 + }, + { + "epoch": 0.251353161095921, + "grad_norm": 0.796875, + "learning_rate": 2.9344684248061805e-05, + "loss": 0.708, + "step": 5733 + }, + { + "epoch": 0.2513970043125782, + "grad_norm": 0.8125, + "learning_rate": 2.9341426439924357e-05, + "loss": 0.7684, + "step": 5734 + }, + { + "epoch": 0.25144084752923546, + "grad_norm": 0.86328125, + "learning_rate": 2.9338168781543716e-05, + "loss": 0.8634, + "step": 5735 + }, + { + "epoch": 0.2514846907458927, + "grad_norm": 0.82421875, + "learning_rate": 2.933491127292679e-05, + "loss": 0.8207, + "step": 5736 + }, + { + "epoch": 0.2515285339625499, + "grad_norm": 0.8359375, + "learning_rate": 2.933165391408048e-05, + "loss": 0.9554, + "step": 5737 + }, + { + "epoch": 0.25157237717920716, + "grad_norm": 0.79296875, + "learning_rate": 2.9328396705011662e-05, + "loss": 0.8412, + "step": 5738 + }, + { + "epoch": 0.2516162203958644, + "grad_norm": 0.73828125, + "learning_rate": 2.9325139645727295e-05, + "loss": 0.741, + "step": 5739 + }, + { + "epoch": 0.25166006361252163, + "grad_norm": 0.8203125, + "learning_rate": 2.9321882736234264e-05, + "loss": 0.7019, + "step": 5740 + }, + { + "epoch": 0.25170390682917887, + "grad_norm": 0.8203125, + "learning_rate": 2.931862597653945e-05, + "loss": 0.6629, + "step": 5741 + }, + { + "epoch": 0.2517477500458361, + "grad_norm": 0.75390625, + "learning_rate": 2.931536936664977e-05, + "loss": 0.8715, + "step": 5742 + }, + { + "epoch": 0.25179159326249334, + "grad_norm": 0.78125, + "learning_rate": 2.93121129065721e-05, + "loss": 0.7485, + "step": 5743 + }, + { + "epoch": 0.2518354364791506, + "grad_norm": 0.8125, + "learning_rate": 2.9308856596313393e-05, + "loss": 0.7015, + "step": 5744 + }, + { + "epoch": 0.2518792796958078, + "grad_norm": 0.8671875, + "learning_rate": 2.930560043588052e-05, + "loss": 0.7374, + "step": 5745 + }, + { + "epoch": 0.25192312291246505, + "grad_norm": 0.79296875, + "learning_rate": 2.9302344425280392e-05, + "loss": 0.7511, + "step": 5746 + }, + { + "epoch": 0.2519669661291223, + "grad_norm": 0.8359375, + "learning_rate": 2.9299088564519906e-05, + "loss": 0.8594, + "step": 5747 + }, + { + "epoch": 0.2520108093457795, + "grad_norm": 0.81640625, + "learning_rate": 2.9295832853605932e-05, + "loss": 0.8244, + "step": 5748 + }, + { + "epoch": 0.25205465256243675, + "grad_norm": 0.81640625, + "learning_rate": 2.929257729254542e-05, + "loss": 0.7749, + "step": 5749 + }, + { + "epoch": 0.25209849577909393, + "grad_norm": 1.0703125, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7557, + "step": 5750 + }, + { + "epoch": 0.25214233899575117, + "grad_norm": 0.7421875, + "learning_rate": 2.928606662001233e-05, + "loss": 0.6797, + "step": 5751 + }, + { + "epoch": 0.2521861822124084, + "grad_norm": 0.84765625, + "learning_rate": 2.9282811508553544e-05, + "loss": 0.6959, + "step": 5752 + }, + { + "epoch": 0.25223002542906564, + "grad_norm": 0.7890625, + "learning_rate": 2.9279556546975774e-05, + "loss": 0.7785, + "step": 5753 + }, + { + "epoch": 0.2522738686457229, + "grad_norm": 0.7578125, + "learning_rate": 2.927630173528596e-05, + "loss": 0.6504, + "step": 5754 + }, + { + "epoch": 0.2523177118623801, + "grad_norm": 0.83984375, + "learning_rate": 2.927304707349099e-05, + "loss": 0.6997, + "step": 5755 + }, + { + "epoch": 0.25236155507903735, + "grad_norm": 0.8125, + "learning_rate": 2.9269792561597752e-05, + "loss": 0.7595, + "step": 5756 + }, + { + "epoch": 0.2524053982956946, + "grad_norm": 0.7890625, + "learning_rate": 2.9266538199613135e-05, + "loss": 0.7623, + "step": 5757 + }, + { + "epoch": 0.2524492415123518, + "grad_norm": 0.83203125, + "learning_rate": 2.9263283987544033e-05, + "loss": 0.87, + "step": 5758 + }, + { + "epoch": 0.25249308472900905, + "grad_norm": 0.734375, + "learning_rate": 2.926002992539738e-05, + "loss": 0.8162, + "step": 5759 + }, + { + "epoch": 0.2525369279456663, + "grad_norm": 0.7734375, + "learning_rate": 2.9256776013180055e-05, + "loss": 0.6866, + "step": 5760 + }, + { + "epoch": 0.2525807711623235, + "grad_norm": 0.85546875, + "learning_rate": 2.9253522250898945e-05, + "loss": 0.7267, + "step": 5761 + }, + { + "epoch": 0.25262461437898076, + "grad_norm": 0.8671875, + "learning_rate": 2.925026863856095e-05, + "loss": 0.8071, + "step": 5762 + }, + { + "epoch": 0.252668457595638, + "grad_norm": 0.89453125, + "learning_rate": 2.9247015176172943e-05, + "loss": 0.8981, + "step": 5763 + }, + { + "epoch": 0.25271230081229523, + "grad_norm": 0.7890625, + "learning_rate": 2.924376186374187e-05, + "loss": 0.8486, + "step": 5764 + }, + { + "epoch": 0.25275614402895247, + "grad_norm": 0.80078125, + "learning_rate": 2.9240508701274604e-05, + "loss": 0.7902, + "step": 5765 + }, + { + "epoch": 0.2527999872456097, + "grad_norm": 0.76171875, + "learning_rate": 2.923725568877803e-05, + "loss": 0.8644, + "step": 5766 + }, + { + "epoch": 0.25284383046226694, + "grad_norm": 0.79296875, + "learning_rate": 2.9234002826259045e-05, + "loss": 0.7827, + "step": 5767 + }, + { + "epoch": 0.2528876736789242, + "grad_norm": 0.79296875, + "learning_rate": 2.923075011372456e-05, + "loss": 0.7719, + "step": 5768 + }, + { + "epoch": 0.2529315168955814, + "grad_norm": 0.796875, + "learning_rate": 2.9227497551181404e-05, + "loss": 0.7634, + "step": 5769 + }, + { + "epoch": 0.25297536011223865, + "grad_norm": 0.83984375, + "learning_rate": 2.9224245138636563e-05, + "loss": 0.748, + "step": 5770 + }, + { + "epoch": 0.2530192033288959, + "grad_norm": 0.82421875, + "learning_rate": 2.92209928760969e-05, + "loss": 0.7132, + "step": 5771 + }, + { + "epoch": 0.2530630465455531, + "grad_norm": 0.7890625, + "learning_rate": 2.9217740763569278e-05, + "loss": 0.7539, + "step": 5772 + }, + { + "epoch": 0.25310688976221035, + "grad_norm": 0.8203125, + "learning_rate": 2.9214488801060624e-05, + "loss": 0.8242, + "step": 5773 + }, + { + "epoch": 0.2531507329788676, + "grad_norm": 0.9765625, + "learning_rate": 2.9211236988577805e-05, + "loss": 0.8621, + "step": 5774 + }, + { + "epoch": 0.2531945761955248, + "grad_norm": 0.96875, + "learning_rate": 2.920798532612773e-05, + "loss": 0.8471, + "step": 5775 + }, + { + "epoch": 0.25323841941218206, + "grad_norm": 0.84375, + "learning_rate": 2.9204733813717277e-05, + "loss": 0.8659, + "step": 5776 + }, + { + "epoch": 0.2532822626288393, + "grad_norm": 0.81640625, + "learning_rate": 2.9201482451353346e-05, + "loss": 0.9205, + "step": 5777 + }, + { + "epoch": 0.25332610584549653, + "grad_norm": 0.91015625, + "learning_rate": 2.9198231239042794e-05, + "loss": 0.7697, + "step": 5778 + }, + { + "epoch": 0.25336994906215377, + "grad_norm": 0.828125, + "learning_rate": 2.9194980176792584e-05, + "loss": 0.8059, + "step": 5779 + }, + { + "epoch": 0.25341379227881095, + "grad_norm": 0.76953125, + "learning_rate": 2.919172926460956e-05, + "loss": 0.7719, + "step": 5780 + }, + { + "epoch": 0.2534576354954682, + "grad_norm": 0.77734375, + "learning_rate": 2.9188478502500615e-05, + "loss": 0.7493, + "step": 5781 + }, + { + "epoch": 0.2535014787121254, + "grad_norm": 0.8203125, + "learning_rate": 2.9185227890472643e-05, + "loss": 0.807, + "step": 5782 + }, + { + "epoch": 0.25354532192878265, + "grad_norm": 0.78515625, + "learning_rate": 2.9181977428532515e-05, + "loss": 0.7482, + "step": 5783 + }, + { + "epoch": 0.2535891651454399, + "grad_norm": 0.83203125, + "learning_rate": 2.9178727116687153e-05, + "loss": 0.8369, + "step": 5784 + }, + { + "epoch": 0.2536330083620971, + "grad_norm": 0.79296875, + "learning_rate": 2.9175476954943437e-05, + "loss": 0.8282, + "step": 5785 + }, + { + "epoch": 0.25367685157875436, + "grad_norm": 0.88671875, + "learning_rate": 2.9172226943308256e-05, + "loss": 0.8321, + "step": 5786 + }, + { + "epoch": 0.2537206947954116, + "grad_norm": 0.80859375, + "learning_rate": 2.9168977081788483e-05, + "loss": 0.7948, + "step": 5787 + }, + { + "epoch": 0.25376453801206883, + "grad_norm": 0.7890625, + "learning_rate": 2.9165727370390982e-05, + "loss": 0.7816, + "step": 5788 + }, + { + "epoch": 0.25380838122872607, + "grad_norm": 0.8984375, + "learning_rate": 2.916247780912271e-05, + "loss": 0.7812, + "step": 5789 + }, + { + "epoch": 0.2538522244453833, + "grad_norm": 0.82421875, + "learning_rate": 2.9159228397990513e-05, + "loss": 0.818, + "step": 5790 + }, + { + "epoch": 0.25389606766204054, + "grad_norm": 0.859375, + "learning_rate": 2.9155979137001287e-05, + "loss": 0.8627, + "step": 5791 + }, + { + "epoch": 0.25393991087869777, + "grad_norm": 0.85546875, + "learning_rate": 2.9152730026161912e-05, + "loss": 0.7448, + "step": 5792 + }, + { + "epoch": 0.253983754095355, + "grad_norm": 0.73046875, + "learning_rate": 2.9149481065479246e-05, + "loss": 0.6628, + "step": 5793 + }, + { + "epoch": 0.25402759731201224, + "grad_norm": 0.88671875, + "learning_rate": 2.9146232254960237e-05, + "loss": 0.7416, + "step": 5794 + }, + { + "epoch": 0.2540714405286695, + "grad_norm": 0.86328125, + "learning_rate": 2.914298359461174e-05, + "loss": 0.8305, + "step": 5795 + }, + { + "epoch": 0.2541152837453267, + "grad_norm": 0.7890625, + "learning_rate": 2.9139735084440634e-05, + "loss": 0.7812, + "step": 5796 + }, + { + "epoch": 0.25415912696198395, + "grad_norm": 0.9140625, + "learning_rate": 2.9136486724453815e-05, + "loss": 0.9477, + "step": 5797 + }, + { + "epoch": 0.2542029701786412, + "grad_norm": 0.82421875, + "learning_rate": 2.9133238514658125e-05, + "loss": 0.7194, + "step": 5798 + }, + { + "epoch": 0.2542468133952984, + "grad_norm": 0.78125, + "learning_rate": 2.912999045506052e-05, + "loss": 0.6778, + "step": 5799 + }, + { + "epoch": 0.25429065661195566, + "grad_norm": 0.79296875, + "learning_rate": 2.9126742545667852e-05, + "loss": 0.804, + "step": 5800 + }, + { + "epoch": 0.2543344998286129, + "grad_norm": 0.8125, + "learning_rate": 2.9123494786486992e-05, + "loss": 0.7833, + "step": 5801 + }, + { + "epoch": 0.2543783430452701, + "grad_norm": 0.8515625, + "learning_rate": 2.912024717752484e-05, + "loss": 0.8012, + "step": 5802 + }, + { + "epoch": 0.25442218626192736, + "grad_norm": 0.94140625, + "learning_rate": 2.9116999718788252e-05, + "loss": 0.82, + "step": 5803 + }, + { + "epoch": 0.2544660294785846, + "grad_norm": 0.859375, + "learning_rate": 2.9113752410284146e-05, + "loss": 0.7823, + "step": 5804 + }, + { + "epoch": 0.25450987269524183, + "grad_norm": 0.79296875, + "learning_rate": 2.9110505252019406e-05, + "loss": 0.7312, + "step": 5805 + }, + { + "epoch": 0.25455371591189907, + "grad_norm": 0.88671875, + "learning_rate": 2.9107258244000878e-05, + "loss": 0.7692, + "step": 5806 + }, + { + "epoch": 0.2545975591285563, + "grad_norm": 0.8828125, + "learning_rate": 2.9104011386235485e-05, + "loss": 0.9561, + "step": 5807 + }, + { + "epoch": 0.25464140234521354, + "grad_norm": 0.86328125, + "learning_rate": 2.910076467873004e-05, + "loss": 0.8578, + "step": 5808 + }, + { + "epoch": 0.2546852455618708, + "grad_norm": 0.7734375, + "learning_rate": 2.9097518121491518e-05, + "loss": 0.7343, + "step": 5809 + }, + { + "epoch": 0.25472908877852796, + "grad_norm": 0.7109375, + "learning_rate": 2.9094271714526743e-05, + "loss": 0.7004, + "step": 5810 + }, + { + "epoch": 0.2547729319951852, + "grad_norm": 0.80859375, + "learning_rate": 2.9091025457842614e-05, + "loss": 0.7925, + "step": 5811 + }, + { + "epoch": 0.25481677521184243, + "grad_norm": 0.87890625, + "learning_rate": 2.9087779351446e-05, + "loss": 0.8773, + "step": 5812 + }, + { + "epoch": 0.25486061842849966, + "grad_norm": 0.8203125, + "learning_rate": 2.908453339534378e-05, + "loss": 0.7682, + "step": 5813 + }, + { + "epoch": 0.2549044616451569, + "grad_norm": 0.83203125, + "learning_rate": 2.908128758954284e-05, + "loss": 0.84, + "step": 5814 + }, + { + "epoch": 0.25494830486181413, + "grad_norm": 0.87109375, + "learning_rate": 2.907804193405006e-05, + "loss": 0.9516, + "step": 5815 + }, + { + "epoch": 0.25499214807847137, + "grad_norm": 0.84375, + "learning_rate": 2.907479642887232e-05, + "loss": 0.7328, + "step": 5816 + }, + { + "epoch": 0.2550359912951286, + "grad_norm": 0.79296875, + "learning_rate": 2.9071551074016457e-05, + "loss": 0.7175, + "step": 5817 + }, + { + "epoch": 0.25507983451178584, + "grad_norm": 0.80859375, + "learning_rate": 2.906830586948942e-05, + "loss": 0.7333, + "step": 5818 + }, + { + "epoch": 0.2551236777284431, + "grad_norm": 0.91015625, + "learning_rate": 2.906506081529805e-05, + "loss": 0.8633, + "step": 5819 + }, + { + "epoch": 0.2551675209451003, + "grad_norm": 0.8515625, + "learning_rate": 2.906181591144924e-05, + "loss": 0.716, + "step": 5820 + }, + { + "epoch": 0.25521136416175755, + "grad_norm": 0.828125, + "learning_rate": 2.9058571157949853e-05, + "loss": 0.7761, + "step": 5821 + }, + { + "epoch": 0.2552552073784148, + "grad_norm": 0.74609375, + "learning_rate": 2.905532655480676e-05, + "loss": 0.7562, + "step": 5822 + }, + { + "epoch": 0.255299050595072, + "grad_norm": 0.74609375, + "learning_rate": 2.9052082102026857e-05, + "loss": 0.7249, + "step": 5823 + }, + { + "epoch": 0.25534289381172925, + "grad_norm": 0.75390625, + "learning_rate": 2.9048837799616978e-05, + "loss": 0.6981, + "step": 5824 + }, + { + "epoch": 0.2553867370283865, + "grad_norm": 0.84765625, + "learning_rate": 2.9045593647584067e-05, + "loss": 0.8601, + "step": 5825 + }, + { + "epoch": 0.2554305802450437, + "grad_norm": 0.72265625, + "learning_rate": 2.9042349645934964e-05, + "loss": 0.6956, + "step": 5826 + }, + { + "epoch": 0.25547442346170096, + "grad_norm": 0.79296875, + "learning_rate": 2.9039105794676535e-05, + "loss": 0.788, + "step": 5827 + }, + { + "epoch": 0.2555182666783582, + "grad_norm": 0.75, + "learning_rate": 2.903586209381567e-05, + "loss": 0.8534, + "step": 5828 + }, + { + "epoch": 0.25556210989501543, + "grad_norm": 0.78125, + "learning_rate": 2.9032618543359215e-05, + "loss": 0.7408, + "step": 5829 + }, + { + "epoch": 0.25560595311167267, + "grad_norm": 0.765625, + "learning_rate": 2.9029375143314097e-05, + "loss": 0.774, + "step": 5830 + }, + { + "epoch": 0.2556497963283299, + "grad_norm": 0.71875, + "learning_rate": 2.9026131893687157e-05, + "loss": 0.6942, + "step": 5831 + }, + { + "epoch": 0.25569363954498714, + "grad_norm": 0.8359375, + "learning_rate": 2.902288879448528e-05, + "loss": 0.7614, + "step": 5832 + }, + { + "epoch": 0.2557374827616444, + "grad_norm": 0.8515625, + "learning_rate": 2.9019645845715325e-05, + "loss": 0.8245, + "step": 5833 + }, + { + "epoch": 0.2557813259783016, + "grad_norm": 0.80078125, + "learning_rate": 2.9016403047384155e-05, + "loss": 0.8031, + "step": 5834 + }, + { + "epoch": 0.25582516919495885, + "grad_norm": 0.75, + "learning_rate": 2.9013160399498684e-05, + "loss": 0.6889, + "step": 5835 + }, + { + "epoch": 0.2558690124116161, + "grad_norm": 0.8046875, + "learning_rate": 2.9009917902065763e-05, + "loss": 0.9838, + "step": 5836 + }, + { + "epoch": 0.2559128556282733, + "grad_norm": 0.91015625, + "learning_rate": 2.9006675555092254e-05, + "loss": 0.8106, + "step": 5837 + }, + { + "epoch": 0.25595669884493055, + "grad_norm": 0.8828125, + "learning_rate": 2.9003433358585053e-05, + "loss": 0.76, + "step": 5838 + }, + { + "epoch": 0.2560005420615878, + "grad_norm": 0.8203125, + "learning_rate": 2.9000191312550983e-05, + "loss": 0.8456, + "step": 5839 + }, + { + "epoch": 0.256044385278245, + "grad_norm": 0.8828125, + "learning_rate": 2.8996949416996977e-05, + "loss": 0.8799, + "step": 5840 + }, + { + "epoch": 0.2560882284949022, + "grad_norm": 0.8203125, + "learning_rate": 2.899370767192988e-05, + "loss": 0.8055, + "step": 5841 + }, + { + "epoch": 0.25613207171155944, + "grad_norm": 0.875, + "learning_rate": 2.8990466077356558e-05, + "loss": 0.9243, + "step": 5842 + }, + { + "epoch": 0.2561759149282167, + "grad_norm": 0.90234375, + "learning_rate": 2.8987224633283893e-05, + "loss": 0.8528, + "step": 5843 + }, + { + "epoch": 0.2562197581448739, + "grad_norm": 0.8359375, + "learning_rate": 2.898398333971871e-05, + "loss": 0.842, + "step": 5844 + }, + { + "epoch": 0.25626360136153115, + "grad_norm": 0.828125, + "learning_rate": 2.8980742196667942e-05, + "loss": 0.7433, + "step": 5845 + }, + { + "epoch": 0.2563074445781884, + "grad_norm": 0.9296875, + "learning_rate": 2.8977501204138435e-05, + "loss": 0.8531, + "step": 5846 + }, + { + "epoch": 0.2563512877948456, + "grad_norm": 0.81640625, + "learning_rate": 2.8974260362137063e-05, + "loss": 0.7386, + "step": 5847 + }, + { + "epoch": 0.25639513101150285, + "grad_norm": 0.8203125, + "learning_rate": 2.8971019670670675e-05, + "loss": 0.7498, + "step": 5848 + }, + { + "epoch": 0.2564389742281601, + "grad_norm": 0.80078125, + "learning_rate": 2.896777912974612e-05, + "loss": 0.8462, + "step": 5849 + }, + { + "epoch": 0.2564828174448173, + "grad_norm": 1.46875, + "learning_rate": 2.896453873937034e-05, + "loss": 0.7748, + "step": 5850 + }, + { + "epoch": 0.25652666066147456, + "grad_norm": 0.81640625, + "learning_rate": 2.896129849955015e-05, + "loss": 0.739, + "step": 5851 + }, + { + "epoch": 0.2565705038781318, + "grad_norm": 0.9453125, + "learning_rate": 2.895805841029242e-05, + "loss": 0.8178, + "step": 5852 + }, + { + "epoch": 0.25661434709478903, + "grad_norm": 0.8046875, + "learning_rate": 2.895481847160404e-05, + "loss": 0.8488, + "step": 5853 + }, + { + "epoch": 0.25665819031144627, + "grad_norm": 0.95703125, + "learning_rate": 2.8951578683491853e-05, + "loss": 0.8031, + "step": 5854 + }, + { + "epoch": 0.2567020335281035, + "grad_norm": 1.0, + "learning_rate": 2.8948339045962713e-05, + "loss": 0.9195, + "step": 5855 + }, + { + "epoch": 0.25674587674476074, + "grad_norm": 0.8046875, + "learning_rate": 2.8945099559023526e-05, + "loss": 0.8288, + "step": 5856 + }, + { + "epoch": 0.256789719961418, + "grad_norm": 0.83984375, + "learning_rate": 2.894186022268115e-05, + "loss": 0.9496, + "step": 5857 + }, + { + "epoch": 0.2568335631780752, + "grad_norm": 0.80859375, + "learning_rate": 2.8938621036942426e-05, + "loss": 0.6643, + "step": 5858 + }, + { + "epoch": 0.25687740639473244, + "grad_norm": 0.73046875, + "learning_rate": 2.8935382001814248e-05, + "loss": 0.7389, + "step": 5859 + }, + { + "epoch": 0.2569212496113897, + "grad_norm": 0.87109375, + "learning_rate": 2.8932143117303456e-05, + "loss": 0.8699, + "step": 5860 + }, + { + "epoch": 0.2569650928280469, + "grad_norm": 0.8046875, + "learning_rate": 2.8928904383416932e-05, + "loss": 0.7065, + "step": 5861 + }, + { + "epoch": 0.25700893604470415, + "grad_norm": 0.87109375, + "learning_rate": 2.8925665800161526e-05, + "loss": 0.7111, + "step": 5862 + }, + { + "epoch": 0.2570527792613614, + "grad_norm": 0.80859375, + "learning_rate": 2.8922427367544113e-05, + "loss": 0.8771, + "step": 5863 + }, + { + "epoch": 0.2570966224780186, + "grad_norm": 0.796875, + "learning_rate": 2.891918908557152e-05, + "loss": 0.757, + "step": 5864 + }, + { + "epoch": 0.25714046569467586, + "grad_norm": 0.89453125, + "learning_rate": 2.891595095425067e-05, + "loss": 0.8473, + "step": 5865 + }, + { + "epoch": 0.2571843089113331, + "grad_norm": 0.953125, + "learning_rate": 2.891271297358841e-05, + "loss": 0.8733, + "step": 5866 + }, + { + "epoch": 0.25722815212799033, + "grad_norm": 0.8203125, + "learning_rate": 2.890947514359158e-05, + "loss": 0.9077, + "step": 5867 + }, + { + "epoch": 0.25727199534464756, + "grad_norm": 0.91015625, + "learning_rate": 2.8906237464267062e-05, + "loss": 0.7424, + "step": 5868 + }, + { + "epoch": 0.2573158385613048, + "grad_norm": 0.79296875, + "learning_rate": 2.8902999935621665e-05, + "loss": 1.0095, + "step": 5869 + }, + { + "epoch": 0.25735968177796203, + "grad_norm": 0.8359375, + "learning_rate": 2.8899762557662345e-05, + "loss": 0.8016, + "step": 5870 + }, + { + "epoch": 0.2574035249946192, + "grad_norm": 0.87890625, + "learning_rate": 2.8896525330395906e-05, + "loss": 0.8081, + "step": 5871 + }, + { + "epoch": 0.25744736821127645, + "grad_norm": 0.859375, + "learning_rate": 2.8893288253829222e-05, + "loss": 0.9718, + "step": 5872 + }, + { + "epoch": 0.2574912114279337, + "grad_norm": 0.76171875, + "learning_rate": 2.8890051327969147e-05, + "loss": 0.7569, + "step": 5873 + }, + { + "epoch": 0.2575350546445909, + "grad_norm": 0.8203125, + "learning_rate": 2.8886814552822505e-05, + "loss": 0.6347, + "step": 5874 + }, + { + "epoch": 0.25757889786124816, + "grad_norm": 0.83203125, + "learning_rate": 2.888357792839623e-05, + "loss": 0.9427, + "step": 5875 + }, + { + "epoch": 0.2576227410779054, + "grad_norm": 0.82421875, + "learning_rate": 2.8880341454697156e-05, + "loss": 0.7311, + "step": 5876 + }, + { + "epoch": 0.25766658429456263, + "grad_norm": 0.82421875, + "learning_rate": 2.8877105131732128e-05, + "loss": 0.7381, + "step": 5877 + }, + { + "epoch": 0.25771042751121986, + "grad_norm": 0.71484375, + "learning_rate": 2.887386895950801e-05, + "loss": 0.7459, + "step": 5878 + }, + { + "epoch": 0.2577542707278771, + "grad_norm": 0.859375, + "learning_rate": 2.8870632938031627e-05, + "loss": 0.7639, + "step": 5879 + }, + { + "epoch": 0.25779811394453434, + "grad_norm": 0.83203125, + "learning_rate": 2.886739706730991e-05, + "loss": 0.8976, + "step": 5880 + }, + { + "epoch": 0.25784195716119157, + "grad_norm": 0.73046875, + "learning_rate": 2.8864161347349672e-05, + "loss": 0.7068, + "step": 5881 + }, + { + "epoch": 0.2578858003778488, + "grad_norm": 0.8125, + "learning_rate": 2.8860925778157776e-05, + "loss": 0.8372, + "step": 5882 + }, + { + "epoch": 0.25792964359450604, + "grad_norm": 0.796875, + "learning_rate": 2.8857690359741085e-05, + "loss": 0.7081, + "step": 5883 + }, + { + "epoch": 0.2579734868111633, + "grad_norm": 0.8671875, + "learning_rate": 2.8854455092106424e-05, + "loss": 0.8858, + "step": 5884 + }, + { + "epoch": 0.2580173300278205, + "grad_norm": 0.80859375, + "learning_rate": 2.8851219975260713e-05, + "loss": 0.7656, + "step": 5885 + }, + { + "epoch": 0.25806117324447775, + "grad_norm": 0.78515625, + "learning_rate": 2.8847985009210776e-05, + "loss": 0.8301, + "step": 5886 + }, + { + "epoch": 0.258105016461135, + "grad_norm": 0.8125, + "learning_rate": 2.8844750193963456e-05, + "loss": 0.8298, + "step": 5887 + }, + { + "epoch": 0.2581488596777922, + "grad_norm": 0.80078125, + "learning_rate": 2.884151552952563e-05, + "loss": 0.8578, + "step": 5888 + }, + { + "epoch": 0.25819270289444946, + "grad_norm": 0.8359375, + "learning_rate": 2.883828101590411e-05, + "loss": 0.7252, + "step": 5889 + }, + { + "epoch": 0.2582365461111067, + "grad_norm": 0.875, + "learning_rate": 2.8835046653105825e-05, + "loss": 0.727, + "step": 5890 + }, + { + "epoch": 0.2582803893277639, + "grad_norm": 0.7890625, + "learning_rate": 2.8831812441137585e-05, + "loss": 0.6582, + "step": 5891 + }, + { + "epoch": 0.25832423254442116, + "grad_norm": 0.78125, + "learning_rate": 2.8828578380006245e-05, + "loss": 0.8447, + "step": 5892 + }, + { + "epoch": 0.2583680757610784, + "grad_norm": 0.82421875, + "learning_rate": 2.8825344469718672e-05, + "loss": 0.8304, + "step": 5893 + }, + { + "epoch": 0.25841191897773563, + "grad_norm": 0.8984375, + "learning_rate": 2.882211071028168e-05, + "loss": 0.9402, + "step": 5894 + }, + { + "epoch": 0.25845576219439287, + "grad_norm": 0.78515625, + "learning_rate": 2.8818877101702192e-05, + "loss": 0.7853, + "step": 5895 + }, + { + "epoch": 0.2584996054110501, + "grad_norm": 0.82421875, + "learning_rate": 2.881564364398701e-05, + "loss": 0.7623, + "step": 5896 + }, + { + "epoch": 0.25854344862770734, + "grad_norm": 0.84765625, + "learning_rate": 2.8812410337143026e-05, + "loss": 0.6855, + "step": 5897 + }, + { + "epoch": 0.2585872918443646, + "grad_norm": 0.80078125, + "learning_rate": 2.880917718117705e-05, + "loss": 0.8232, + "step": 5898 + }, + { + "epoch": 0.2586311350610218, + "grad_norm": 0.80078125, + "learning_rate": 2.8805944176095966e-05, + "loss": 0.7585, + "step": 5899 + }, + { + "epoch": 0.25867497827767905, + "grad_norm": 0.9140625, + "learning_rate": 2.8802711321906606e-05, + "loss": 0.8025, + "step": 5900 + }, + { + "epoch": 0.2587188214943362, + "grad_norm": 0.84765625, + "learning_rate": 2.879947861861584e-05, + "loss": 0.8458, + "step": 5901 + }, + { + "epoch": 0.25876266471099346, + "grad_norm": 0.8203125, + "learning_rate": 2.8796246066230492e-05, + "loss": 0.8054, + "step": 5902 + }, + { + "epoch": 0.2588065079276507, + "grad_norm": 0.765625, + "learning_rate": 2.879301366475742e-05, + "loss": 0.8196, + "step": 5903 + }, + { + "epoch": 0.25885035114430793, + "grad_norm": 0.84375, + "learning_rate": 2.8789781414203497e-05, + "loss": 0.8283, + "step": 5904 + }, + { + "epoch": 0.25889419436096517, + "grad_norm": 0.87109375, + "learning_rate": 2.8786549314575572e-05, + "loss": 0.7647, + "step": 5905 + }, + { + "epoch": 0.2589380375776224, + "grad_norm": 0.859375, + "learning_rate": 2.8783317365880492e-05, + "loss": 0.8951, + "step": 5906 + }, + { + "epoch": 0.25898188079427964, + "grad_norm": 1.015625, + "learning_rate": 2.8780085568125094e-05, + "loss": 0.7214, + "step": 5907 + }, + { + "epoch": 0.2590257240109369, + "grad_norm": 0.76953125, + "learning_rate": 2.8776853921316226e-05, + "loss": 0.8117, + "step": 5908 + }, + { + "epoch": 0.2590695672275941, + "grad_norm": 0.96875, + "learning_rate": 2.877362242546072e-05, + "loss": 0.8111, + "step": 5909 + }, + { + "epoch": 0.25911341044425135, + "grad_norm": 0.81640625, + "learning_rate": 2.8770391080565485e-05, + "loss": 0.7823, + "step": 5910 + }, + { + "epoch": 0.2591572536609086, + "grad_norm": 0.83203125, + "learning_rate": 2.876715988663734e-05, + "loss": 0.814, + "step": 5911 + }, + { + "epoch": 0.2592010968775658, + "grad_norm": 0.89453125, + "learning_rate": 2.876392884368312e-05, + "loss": 0.8343, + "step": 5912 + }, + { + "epoch": 0.25924494009422305, + "grad_norm": 0.8359375, + "learning_rate": 2.876069795170968e-05, + "loss": 0.8333, + "step": 5913 + }, + { + "epoch": 0.2592887833108803, + "grad_norm": 1.015625, + "learning_rate": 2.8757467210723844e-05, + "loss": 0.8394, + "step": 5914 + }, + { + "epoch": 0.2593326265275375, + "grad_norm": 0.86328125, + "learning_rate": 2.8754236620732512e-05, + "loss": 0.8623, + "step": 5915 + }, + { + "epoch": 0.25937646974419476, + "grad_norm": 0.9140625, + "learning_rate": 2.875100618174249e-05, + "loss": 0.6979, + "step": 5916 + }, + { + "epoch": 0.259420312960852, + "grad_norm": 0.8125, + "learning_rate": 2.8747775893760652e-05, + "loss": 0.765, + "step": 5917 + }, + { + "epoch": 0.25946415617750923, + "grad_norm": 0.83203125, + "learning_rate": 2.8744545756793827e-05, + "loss": 0.8001, + "step": 5918 + }, + { + "epoch": 0.25950799939416647, + "grad_norm": 0.71875, + "learning_rate": 2.8741315770848842e-05, + "loss": 0.6916, + "step": 5919 + }, + { + "epoch": 0.2595518426108237, + "grad_norm": 0.7734375, + "learning_rate": 2.873808593593259e-05, + "loss": 0.6982, + "step": 5920 + }, + { + "epoch": 0.25959568582748094, + "grad_norm": 0.78515625, + "learning_rate": 2.873485625205189e-05, + "loss": 0.7749, + "step": 5921 + }, + { + "epoch": 0.2596395290441382, + "grad_norm": 1.140625, + "learning_rate": 2.873162671921359e-05, + "loss": 0.7453, + "step": 5922 + }, + { + "epoch": 0.2596833722607954, + "grad_norm": 0.82421875, + "learning_rate": 2.872839733742453e-05, + "loss": 0.9858, + "step": 5923 + }, + { + "epoch": 0.25972721547745264, + "grad_norm": 0.89453125, + "learning_rate": 2.8725168106691535e-05, + "loss": 0.7628, + "step": 5924 + }, + { + "epoch": 0.2597710586941099, + "grad_norm": 0.7734375, + "learning_rate": 2.8721939027021493e-05, + "loss": 0.7667, + "step": 5925 + }, + { + "epoch": 0.2598149019107671, + "grad_norm": 0.83984375, + "learning_rate": 2.871871009842124e-05, + "loss": 0.7202, + "step": 5926 + }, + { + "epoch": 0.25985874512742435, + "grad_norm": 0.9375, + "learning_rate": 2.8715481320897596e-05, + "loss": 0.8806, + "step": 5927 + }, + { + "epoch": 0.2599025883440816, + "grad_norm": 0.8046875, + "learning_rate": 2.871225269445742e-05, + "loss": 0.8037, + "step": 5928 + }, + { + "epoch": 0.2599464315607388, + "grad_norm": 0.75390625, + "learning_rate": 2.8709024219107516e-05, + "loss": 0.6704, + "step": 5929 + }, + { + "epoch": 0.25999027477739606, + "grad_norm": 0.8203125, + "learning_rate": 2.8705795894854793e-05, + "loss": 0.6841, + "step": 5930 + }, + { + "epoch": 0.2600341179940533, + "grad_norm": 0.9140625, + "learning_rate": 2.8702567721706062e-05, + "loss": 0.7652, + "step": 5931 + }, + { + "epoch": 0.2600779612107105, + "grad_norm": 0.80859375, + "learning_rate": 2.869933969966816e-05, + "loss": 0.7257, + "step": 5932 + }, + { + "epoch": 0.2601218044273677, + "grad_norm": 0.8203125, + "learning_rate": 2.869611182874794e-05, + "loss": 0.7458, + "step": 5933 + }, + { + "epoch": 0.26016564764402494, + "grad_norm": 0.8359375, + "learning_rate": 2.8692884108952202e-05, + "loss": 0.7086, + "step": 5934 + }, + { + "epoch": 0.2602094908606822, + "grad_norm": 0.83203125, + "learning_rate": 2.8689656540287847e-05, + "loss": 0.8187, + "step": 5935 + }, + { + "epoch": 0.2602533340773394, + "grad_norm": 0.81640625, + "learning_rate": 2.8686429122761694e-05, + "loss": 0.8468, + "step": 5936 + }, + { + "epoch": 0.26029717729399665, + "grad_norm": 0.80078125, + "learning_rate": 2.8683201856380572e-05, + "loss": 0.8126, + "step": 5937 + }, + { + "epoch": 0.2603410205106539, + "grad_norm": 0.765625, + "learning_rate": 2.8679974741151327e-05, + "loss": 0.6953, + "step": 5938 + }, + { + "epoch": 0.2603848637273111, + "grad_norm": 0.8671875, + "learning_rate": 2.8676747777080804e-05, + "loss": 0.9455, + "step": 5939 + }, + { + "epoch": 0.26042870694396836, + "grad_norm": 0.8359375, + "learning_rate": 2.8673520964175805e-05, + "loss": 0.7197, + "step": 5940 + }, + { + "epoch": 0.2604725501606256, + "grad_norm": 0.79296875, + "learning_rate": 2.8670294302443233e-05, + "loss": 0.8478, + "step": 5941 + }, + { + "epoch": 0.26051639337728283, + "grad_norm": 0.84375, + "learning_rate": 2.866706779188989e-05, + "loss": 0.9215, + "step": 5942 + }, + { + "epoch": 0.26056023659394006, + "grad_norm": 0.96484375, + "learning_rate": 2.866384143252263e-05, + "loss": 0.8565, + "step": 5943 + }, + { + "epoch": 0.2606040798105973, + "grad_norm": 0.80078125, + "learning_rate": 2.8660615224348276e-05, + "loss": 0.7693, + "step": 5944 + }, + { + "epoch": 0.26064792302725454, + "grad_norm": 0.796875, + "learning_rate": 2.865738916737366e-05, + "loss": 0.7696, + "step": 5945 + }, + { + "epoch": 0.26069176624391177, + "grad_norm": 0.84765625, + "learning_rate": 2.8654163261605648e-05, + "loss": 0.9534, + "step": 5946 + }, + { + "epoch": 0.260735609460569, + "grad_norm": 0.85546875, + "learning_rate": 2.8650937507051044e-05, + "loss": 0.9137, + "step": 5947 + }, + { + "epoch": 0.26077945267722624, + "grad_norm": 0.90234375, + "learning_rate": 2.864771190371671e-05, + "loss": 0.8769, + "step": 5948 + }, + { + "epoch": 0.2608232958938835, + "grad_norm": 0.78515625, + "learning_rate": 2.864448645160943e-05, + "loss": 0.7951, + "step": 5949 + }, + { + "epoch": 0.2608671391105407, + "grad_norm": 0.8125, + "learning_rate": 2.8641261150736133e-05, + "loss": 0.7609, + "step": 5950 + }, + { + "epoch": 0.26091098232719795, + "grad_norm": 0.8125, + "learning_rate": 2.8638036001103587e-05, + "loss": 0.8138, + "step": 5951 + }, + { + "epoch": 0.2609548255438552, + "grad_norm": 0.82421875, + "learning_rate": 2.8634811002718653e-05, + "loss": 0.8406, + "step": 5952 + }, + { + "epoch": 0.2609986687605124, + "grad_norm": 0.9140625, + "learning_rate": 2.8631586155588152e-05, + "loss": 0.9015, + "step": 5953 + }, + { + "epoch": 0.26104251197716966, + "grad_norm": 0.73828125, + "learning_rate": 2.86283614597189e-05, + "loss": 0.7875, + "step": 5954 + }, + { + "epoch": 0.2610863551938269, + "grad_norm": 0.796875, + "learning_rate": 2.8625136915117777e-05, + "loss": 0.7583, + "step": 5955 + }, + { + "epoch": 0.2611301984104841, + "grad_norm": 0.98046875, + "learning_rate": 2.8621912521791605e-05, + "loss": 0.8195, + "step": 5956 + }, + { + "epoch": 0.26117404162714136, + "grad_norm": 0.81640625, + "learning_rate": 2.8618688279747208e-05, + "loss": 0.8448, + "step": 5957 + }, + { + "epoch": 0.2612178848437986, + "grad_norm": 0.81640625, + "learning_rate": 2.8615464188991427e-05, + "loss": 0.954, + "step": 5958 + }, + { + "epoch": 0.26126172806045583, + "grad_norm": 0.84765625, + "learning_rate": 2.8612240249531054e-05, + "loss": 0.9611, + "step": 5959 + }, + { + "epoch": 0.26130557127711307, + "grad_norm": 0.8203125, + "learning_rate": 2.8609016461372985e-05, + "loss": 0.7541, + "step": 5960 + }, + { + "epoch": 0.2613494144937703, + "grad_norm": 0.9609375, + "learning_rate": 2.8605792824524024e-05, + "loss": 0.9011, + "step": 5961 + }, + { + "epoch": 0.2613932577104275, + "grad_norm": 0.83984375, + "learning_rate": 2.8602569338991013e-05, + "loss": 0.7255, + "step": 5962 + }, + { + "epoch": 0.2614371009270847, + "grad_norm": 0.765625, + "learning_rate": 2.859934600478077e-05, + "loss": 0.7385, + "step": 5963 + }, + { + "epoch": 0.26148094414374196, + "grad_norm": 0.828125, + "learning_rate": 2.859612282190014e-05, + "loss": 0.7749, + "step": 5964 + }, + { + "epoch": 0.2615247873603992, + "grad_norm": 0.82421875, + "learning_rate": 2.85928997903559e-05, + "loss": 0.8699, + "step": 5965 + }, + { + "epoch": 0.2615686305770564, + "grad_norm": 0.78515625, + "learning_rate": 2.8589676910154974e-05, + "loss": 0.7632, + "step": 5966 + }, + { + "epoch": 0.26161247379371366, + "grad_norm": 0.796875, + "learning_rate": 2.8586454181304135e-05, + "loss": 0.7389, + "step": 5967 + }, + { + "epoch": 0.2616563170103709, + "grad_norm": 0.796875, + "learning_rate": 2.8583231603810234e-05, + "loss": 0.7727, + "step": 5968 + }, + { + "epoch": 0.26170016022702813, + "grad_norm": 0.828125, + "learning_rate": 2.8580009177680077e-05, + "loss": 0.7743, + "step": 5969 + }, + { + "epoch": 0.26174400344368537, + "grad_norm": 0.90234375, + "learning_rate": 2.8576786902920495e-05, + "loss": 0.742, + "step": 5970 + }, + { + "epoch": 0.2617878466603426, + "grad_norm": 0.84375, + "learning_rate": 2.8573564779538352e-05, + "loss": 0.8851, + "step": 5971 + }, + { + "epoch": 0.26183168987699984, + "grad_norm": 0.734375, + "learning_rate": 2.857034280754045e-05, + "loss": 0.867, + "step": 5972 + }, + { + "epoch": 0.2618755330936571, + "grad_norm": 0.76171875, + "learning_rate": 2.8567120986933636e-05, + "loss": 0.7672, + "step": 5973 + }, + { + "epoch": 0.2619193763103143, + "grad_norm": 0.8125, + "learning_rate": 2.8563899317724718e-05, + "loss": 0.7275, + "step": 5974 + }, + { + "epoch": 0.26196321952697155, + "grad_norm": 0.9140625, + "learning_rate": 2.856067779992051e-05, + "loss": 0.8917, + "step": 5975 + }, + { + "epoch": 0.2620070627436288, + "grad_norm": 0.8359375, + "learning_rate": 2.8557456433527887e-05, + "loss": 0.8929, + "step": 5976 + }, + { + "epoch": 0.262050905960286, + "grad_norm": 0.87109375, + "learning_rate": 2.8554235218553648e-05, + "loss": 0.8301, + "step": 5977 + }, + { + "epoch": 0.26209474917694325, + "grad_norm": 0.85546875, + "learning_rate": 2.855101415500463e-05, + "loss": 0.7767, + "step": 5978 + }, + { + "epoch": 0.2621385923936005, + "grad_norm": 0.8671875, + "learning_rate": 2.8547793242887655e-05, + "loss": 0.861, + "step": 5979 + }, + { + "epoch": 0.2621824356102577, + "grad_norm": 0.91796875, + "learning_rate": 2.854457248220951e-05, + "loss": 0.8958, + "step": 5980 + }, + { + "epoch": 0.26222627882691496, + "grad_norm": 0.8203125, + "learning_rate": 2.8541351872977096e-05, + "loss": 0.7252, + "step": 5981 + }, + { + "epoch": 0.2622701220435722, + "grad_norm": 0.796875, + "learning_rate": 2.85381314151972e-05, + "loss": 0.6936, + "step": 5982 + }, + { + "epoch": 0.26231396526022943, + "grad_norm": 0.921875, + "learning_rate": 2.853491110887665e-05, + "loss": 0.8329, + "step": 5983 + }, + { + "epoch": 0.26235780847688667, + "grad_norm": 0.83203125, + "learning_rate": 2.853169095402227e-05, + "loss": 0.8193, + "step": 5984 + }, + { + "epoch": 0.2624016516935439, + "grad_norm": 0.73046875, + "learning_rate": 2.8528470950640897e-05, + "loss": 0.8511, + "step": 5985 + }, + { + "epoch": 0.26244549491020114, + "grad_norm": 0.99609375, + "learning_rate": 2.8525251098739338e-05, + "loss": 0.8772, + "step": 5986 + }, + { + "epoch": 0.2624893381268584, + "grad_norm": 0.8359375, + "learning_rate": 2.852203139832441e-05, + "loss": 0.8162, + "step": 5987 + }, + { + "epoch": 0.2625331813435156, + "grad_norm": 0.80859375, + "learning_rate": 2.8518811849402938e-05, + "loss": 0.832, + "step": 5988 + }, + { + "epoch": 0.26257702456017284, + "grad_norm": 0.73828125, + "learning_rate": 2.8515592451981775e-05, + "loss": 0.6717, + "step": 5989 + }, + { + "epoch": 0.2626208677768301, + "grad_norm": 0.81640625, + "learning_rate": 2.8512373206067745e-05, + "loss": 0.8192, + "step": 5990 + }, + { + "epoch": 0.2626647109934873, + "grad_norm": 0.71875, + "learning_rate": 2.8509154111667646e-05, + "loss": 0.7767, + "step": 5991 + }, + { + "epoch": 0.2627085542101445, + "grad_norm": 0.87890625, + "learning_rate": 2.85059351687883e-05, + "loss": 0.8481, + "step": 5992 + }, + { + "epoch": 0.26275239742680173, + "grad_norm": 0.8125, + "learning_rate": 2.8502716377436554e-05, + "loss": 0.7492, + "step": 5993 + }, + { + "epoch": 0.26279624064345897, + "grad_norm": 0.84375, + "learning_rate": 2.8499497737619207e-05, + "loss": 0.7823, + "step": 5994 + }, + { + "epoch": 0.2628400838601162, + "grad_norm": 0.7890625, + "learning_rate": 2.849627924934305e-05, + "loss": 0.7519, + "step": 5995 + }, + { + "epoch": 0.26288392707677344, + "grad_norm": 0.93359375, + "learning_rate": 2.849306091261498e-05, + "loss": 0.8409, + "step": 5996 + }, + { + "epoch": 0.2629277702934307, + "grad_norm": 0.76171875, + "learning_rate": 2.8489842727441786e-05, + "loss": 0.6906, + "step": 5997 + }, + { + "epoch": 0.2629716135100879, + "grad_norm": 0.73046875, + "learning_rate": 2.8486624693830266e-05, + "loss": 0.7361, + "step": 5998 + }, + { + "epoch": 0.26301545672674514, + "grad_norm": 0.7890625, + "learning_rate": 2.848340681178727e-05, + "loss": 0.7588, + "step": 5999 + }, + { + "epoch": 0.2630592999434024, + "grad_norm": 0.8671875, + "learning_rate": 2.8480189081319576e-05, + "loss": 0.6633, + "step": 6000 + }, + { + "epoch": 0.2630592999434024, + "eval_loss": 0.8014892339706421, + "eval_runtime": 300.1908, + "eval_samples_per_second": 33.312, + "eval_steps_per_second": 0.696, + "step": 6000 + }, + { + "epoch": 0.2631031431600596, + "grad_norm": 0.921875, + "learning_rate": 2.8476971502434058e-05, + "loss": 0.9343, + "step": 6001 + }, + { + "epoch": 0.26314698637671685, + "grad_norm": 0.8359375, + "learning_rate": 2.8473754075137517e-05, + "loss": 0.8425, + "step": 6002 + }, + { + "epoch": 0.2631908295933741, + "grad_norm": 0.87890625, + "learning_rate": 2.8470536799436753e-05, + "loss": 0.7259, + "step": 6003 + }, + { + "epoch": 0.2632346728100313, + "grad_norm": 0.76953125, + "learning_rate": 2.846731967533861e-05, + "loss": 0.8381, + "step": 6004 + }, + { + "epoch": 0.26327851602668856, + "grad_norm": 0.80078125, + "learning_rate": 2.846410270284986e-05, + "loss": 0.806, + "step": 6005 + }, + { + "epoch": 0.2633223592433458, + "grad_norm": 0.765625, + "learning_rate": 2.8460885881977396e-05, + "loss": 0.7413, + "step": 6006 + }, + { + "epoch": 0.26336620246000303, + "grad_norm": 0.8515625, + "learning_rate": 2.8457669212727977e-05, + "loss": 0.6647, + "step": 6007 + }, + { + "epoch": 0.26341004567666026, + "grad_norm": 0.80078125, + "learning_rate": 2.8454452695108446e-05, + "loss": 0.7823, + "step": 6008 + }, + { + "epoch": 0.2634538888933175, + "grad_norm": 0.89453125, + "learning_rate": 2.845123632912562e-05, + "loss": 0.9532, + "step": 6009 + }, + { + "epoch": 0.26349773210997474, + "grad_norm": 0.82421875, + "learning_rate": 2.8448020114786277e-05, + "loss": 0.7316, + "step": 6010 + }, + { + "epoch": 0.26354157532663197, + "grad_norm": 0.89453125, + "learning_rate": 2.8444804052097297e-05, + "loss": 0.8737, + "step": 6011 + }, + { + "epoch": 0.2635854185432892, + "grad_norm": 1.0234375, + "learning_rate": 2.8441588141065456e-05, + "loss": 0.7728, + "step": 6012 + }, + { + "epoch": 0.26362926175994644, + "grad_norm": 0.859375, + "learning_rate": 2.8438372381697585e-05, + "loss": 0.8321, + "step": 6013 + }, + { + "epoch": 0.2636731049766037, + "grad_norm": 0.8125, + "learning_rate": 2.8435156774000494e-05, + "loss": 0.8565, + "step": 6014 + }, + { + "epoch": 0.2637169481932609, + "grad_norm": 0.82421875, + "learning_rate": 2.843194131798096e-05, + "loss": 0.7247, + "step": 6015 + }, + { + "epoch": 0.26376079140991815, + "grad_norm": 0.8125, + "learning_rate": 2.842872601364588e-05, + "loss": 0.8806, + "step": 6016 + }, + { + "epoch": 0.2638046346265754, + "grad_norm": 0.828125, + "learning_rate": 2.8425510861002013e-05, + "loss": 0.7689, + "step": 6017 + }, + { + "epoch": 0.2638484778432326, + "grad_norm": 0.8671875, + "learning_rate": 2.842229586005619e-05, + "loss": 0.9566, + "step": 6018 + }, + { + "epoch": 0.26389232105988986, + "grad_norm": 0.8984375, + "learning_rate": 2.841908101081522e-05, + "loss": 0.9045, + "step": 6019 + }, + { + "epoch": 0.2639361642765471, + "grad_norm": 1.109375, + "learning_rate": 2.8415866313285876e-05, + "loss": 0.7538, + "step": 6020 + }, + { + "epoch": 0.2639800074932043, + "grad_norm": 0.734375, + "learning_rate": 2.841265176747505e-05, + "loss": 0.6882, + "step": 6021 + }, + { + "epoch": 0.26402385070986156, + "grad_norm": 0.765625, + "learning_rate": 2.8409437373389515e-05, + "loss": 0.6071, + "step": 6022 + }, + { + "epoch": 0.26406769392651874, + "grad_norm": 0.77734375, + "learning_rate": 2.840622313103609e-05, + "loss": 0.6827, + "step": 6023 + }, + { + "epoch": 0.264111537143176, + "grad_norm": 0.87109375, + "learning_rate": 2.840300904042158e-05, + "loss": 0.7871, + "step": 6024 + }, + { + "epoch": 0.2641553803598332, + "grad_norm": 0.83984375, + "learning_rate": 2.8399795101552774e-05, + "loss": 0.8096, + "step": 6025 + }, + { + "epoch": 0.26419922357649045, + "grad_norm": 0.80078125, + "learning_rate": 2.8396581314436533e-05, + "loss": 0.6665, + "step": 6026 + }, + { + "epoch": 0.2642430667931477, + "grad_norm": 0.7890625, + "learning_rate": 2.839336767907965e-05, + "loss": 0.8227, + "step": 6027 + }, + { + "epoch": 0.2642869100098049, + "grad_norm": 0.85546875, + "learning_rate": 2.8390154195488928e-05, + "loss": 0.8329, + "step": 6028 + }, + { + "epoch": 0.26433075322646216, + "grad_norm": 0.98046875, + "learning_rate": 2.8386940863671185e-05, + "loss": 0.9125, + "step": 6029 + }, + { + "epoch": 0.2643745964431194, + "grad_norm": 0.7890625, + "learning_rate": 2.8383727683633222e-05, + "loss": 0.7956, + "step": 6030 + }, + { + "epoch": 0.2644184396597766, + "grad_norm": 0.796875, + "learning_rate": 2.8380514655381862e-05, + "loss": 0.8287, + "step": 6031 + }, + { + "epoch": 0.26446228287643386, + "grad_norm": 0.77734375, + "learning_rate": 2.8377301778923905e-05, + "loss": 0.853, + "step": 6032 + }, + { + "epoch": 0.2645061260930911, + "grad_norm": 0.80078125, + "learning_rate": 2.8374089054266162e-05, + "loss": 0.7074, + "step": 6033 + }, + { + "epoch": 0.26454996930974833, + "grad_norm": 0.8515625, + "learning_rate": 2.8370876481415442e-05, + "loss": 0.77, + "step": 6034 + }, + { + "epoch": 0.26459381252640557, + "grad_norm": 0.76953125, + "learning_rate": 2.836766406037853e-05, + "loss": 0.739, + "step": 6035 + }, + { + "epoch": 0.2646376557430628, + "grad_norm": 0.8203125, + "learning_rate": 2.8364451791162283e-05, + "loss": 0.7702, + "step": 6036 + }, + { + "epoch": 0.26468149895972004, + "grad_norm": 0.72265625, + "learning_rate": 2.836123967377349e-05, + "loss": 0.6826, + "step": 6037 + }, + { + "epoch": 0.2647253421763773, + "grad_norm": 0.79296875, + "learning_rate": 2.835802770821896e-05, + "loss": 0.7681, + "step": 6038 + }, + { + "epoch": 0.2647691853930345, + "grad_norm": 0.7578125, + "learning_rate": 2.835481589450548e-05, + "loss": 0.6847, + "step": 6039 + }, + { + "epoch": 0.26481302860969175, + "grad_norm": 0.83984375, + "learning_rate": 2.835160423263986e-05, + "loss": 0.6665, + "step": 6040 + }, + { + "epoch": 0.264856871826349, + "grad_norm": 0.8125, + "learning_rate": 2.8348392722628936e-05, + "loss": 0.8173, + "step": 6041 + }, + { + "epoch": 0.2649007150430062, + "grad_norm": 0.70703125, + "learning_rate": 2.8345181364479513e-05, + "loss": 0.7861, + "step": 6042 + }, + { + "epoch": 0.26494455825966345, + "grad_norm": 0.79296875, + "learning_rate": 2.8341970158198374e-05, + "loss": 0.9193, + "step": 6043 + }, + { + "epoch": 0.2649884014763207, + "grad_norm": 0.75, + "learning_rate": 2.8338759103792322e-05, + "loss": 0.7925, + "step": 6044 + }, + { + "epoch": 0.2650322446929779, + "grad_norm": 0.7734375, + "learning_rate": 2.833554820126816e-05, + "loss": 0.6822, + "step": 6045 + }, + { + "epoch": 0.26507608790963516, + "grad_norm": 0.765625, + "learning_rate": 2.833233745063273e-05, + "loss": 0.7242, + "step": 6046 + }, + { + "epoch": 0.2651199311262924, + "grad_norm": 0.86328125, + "learning_rate": 2.8329126851892816e-05, + "loss": 0.8768, + "step": 6047 + }, + { + "epoch": 0.26516377434294963, + "grad_norm": 0.85546875, + "learning_rate": 2.832591640505523e-05, + "loss": 0.9136, + "step": 6048 + }, + { + "epoch": 0.26520761755960687, + "grad_norm": 0.87109375, + "learning_rate": 2.832270611012675e-05, + "loss": 0.7719, + "step": 6049 + }, + { + "epoch": 0.2652514607762641, + "grad_norm": 0.7890625, + "learning_rate": 2.8319495967114184e-05, + "loss": 0.9126, + "step": 6050 + }, + { + "epoch": 0.26529530399292134, + "grad_norm": 0.765625, + "learning_rate": 2.831628597602437e-05, + "loss": 0.6928, + "step": 6051 + }, + { + "epoch": 0.2653391472095786, + "grad_norm": 0.7890625, + "learning_rate": 2.831307613686409e-05, + "loss": 0.7586, + "step": 6052 + }, + { + "epoch": 0.26538299042623575, + "grad_norm": 0.83203125, + "learning_rate": 2.8309866449640154e-05, + "loss": 0.7271, + "step": 6053 + }, + { + "epoch": 0.265426833642893, + "grad_norm": 0.86328125, + "learning_rate": 2.8306656914359352e-05, + "loss": 0.8174, + "step": 6054 + }, + { + "epoch": 0.2654706768595502, + "grad_norm": 0.77734375, + "learning_rate": 2.830344753102847e-05, + "loss": 0.7886, + "step": 6055 + }, + { + "epoch": 0.26551452007620746, + "grad_norm": 0.77734375, + "learning_rate": 2.830023829965436e-05, + "loss": 0.6975, + "step": 6056 + }, + { + "epoch": 0.2655583632928647, + "grad_norm": 0.8046875, + "learning_rate": 2.82970292202438e-05, + "loss": 0.7905, + "step": 6057 + }, + { + "epoch": 0.26560220650952193, + "grad_norm": 0.8515625, + "learning_rate": 2.8293820292803574e-05, + "loss": 0.9146, + "step": 6058 + }, + { + "epoch": 0.26564604972617917, + "grad_norm": 0.875, + "learning_rate": 2.8290611517340514e-05, + "loss": 0.7944, + "step": 6059 + }, + { + "epoch": 0.2656898929428364, + "grad_norm": 0.89453125, + "learning_rate": 2.828740289386137e-05, + "loss": 0.8987, + "step": 6060 + }, + { + "epoch": 0.26573373615949364, + "grad_norm": 0.93359375, + "learning_rate": 2.8284194422373e-05, + "loss": 0.9232, + "step": 6061 + }, + { + "epoch": 0.2657775793761509, + "grad_norm": 0.78125, + "learning_rate": 2.828098610288219e-05, + "loss": 0.7518, + "step": 6062 + }, + { + "epoch": 0.2658214225928081, + "grad_norm": 0.87890625, + "learning_rate": 2.8277777935395722e-05, + "loss": 0.9621, + "step": 6063 + }, + { + "epoch": 0.26586526580946535, + "grad_norm": 0.8046875, + "learning_rate": 2.8274569919920414e-05, + "loss": 0.8067, + "step": 6064 + }, + { + "epoch": 0.2659091090261226, + "grad_norm": 1.0625, + "learning_rate": 2.8271362056463014e-05, + "loss": 0.8075, + "step": 6065 + }, + { + "epoch": 0.2659529522427798, + "grad_norm": 0.78515625, + "learning_rate": 2.8268154345030396e-05, + "loss": 0.7525, + "step": 6066 + }, + { + "epoch": 0.26599679545943705, + "grad_norm": 0.796875, + "learning_rate": 2.826494678562933e-05, + "loss": 0.7823, + "step": 6067 + }, + { + "epoch": 0.2660406386760943, + "grad_norm": 0.81640625, + "learning_rate": 2.8261739378266606e-05, + "loss": 0.8144, + "step": 6068 + }, + { + "epoch": 0.2660844818927515, + "grad_norm": 0.78125, + "learning_rate": 2.8258532122949022e-05, + "loss": 0.7948, + "step": 6069 + }, + { + "epoch": 0.26612832510940876, + "grad_norm": 0.8984375, + "learning_rate": 2.8255325019683388e-05, + "loss": 0.8749, + "step": 6070 + }, + { + "epoch": 0.266172168326066, + "grad_norm": 0.78125, + "learning_rate": 2.8252118068476484e-05, + "loss": 0.8311, + "step": 6071 + }, + { + "epoch": 0.26621601154272323, + "grad_norm": 0.90625, + "learning_rate": 2.8248911269335108e-05, + "loss": 0.8295, + "step": 6072 + }, + { + "epoch": 0.26625985475938047, + "grad_norm": 0.796875, + "learning_rate": 2.8245704622266077e-05, + "loss": 0.7978, + "step": 6073 + }, + { + "epoch": 0.2663036979760377, + "grad_norm": 0.7734375, + "learning_rate": 2.824249812727614e-05, + "loss": 0.7906, + "step": 6074 + }, + { + "epoch": 0.26634754119269494, + "grad_norm": 0.8203125, + "learning_rate": 2.8239291784372158e-05, + "loss": 0.7359, + "step": 6075 + }, + { + "epoch": 0.26639138440935217, + "grad_norm": 0.7578125, + "learning_rate": 2.823608559356089e-05, + "loss": 0.7341, + "step": 6076 + }, + { + "epoch": 0.2664352276260094, + "grad_norm": 0.90234375, + "learning_rate": 2.8232879554849133e-05, + "loss": 0.8539, + "step": 6077 + }, + { + "epoch": 0.26647907084266664, + "grad_norm": 0.875, + "learning_rate": 2.8229673668243694e-05, + "loss": 0.906, + "step": 6078 + }, + { + "epoch": 0.2665229140593239, + "grad_norm": 0.7890625, + "learning_rate": 2.8226467933751355e-05, + "loss": 0.7308, + "step": 6079 + }, + { + "epoch": 0.2665667572759811, + "grad_norm": 0.78515625, + "learning_rate": 2.8223262351378886e-05, + "loss": 0.8364, + "step": 6080 + }, + { + "epoch": 0.26661060049263835, + "grad_norm": 0.8125, + "learning_rate": 2.8220056921133142e-05, + "loss": 0.7731, + "step": 6081 + }, + { + "epoch": 0.2666544437092956, + "grad_norm": 1.3671875, + "learning_rate": 2.821685164302089e-05, + "loss": 0.8194, + "step": 6082 + }, + { + "epoch": 0.26669828692595277, + "grad_norm": 0.78515625, + "learning_rate": 2.8213646517048907e-05, + "loss": 0.7976, + "step": 6083 + }, + { + "epoch": 0.26674213014261, + "grad_norm": 0.78515625, + "learning_rate": 2.821044154322401e-05, + "loss": 0.8875, + "step": 6084 + }, + { + "epoch": 0.26678597335926724, + "grad_norm": 0.8125, + "learning_rate": 2.8207236721552933e-05, + "loss": 0.7945, + "step": 6085 + }, + { + "epoch": 0.26682981657592447, + "grad_norm": 0.875, + "learning_rate": 2.8204032052042562e-05, + "loss": 0.9733, + "step": 6086 + }, + { + "epoch": 0.2668736597925817, + "grad_norm": 0.77734375, + "learning_rate": 2.8200827534699624e-05, + "loss": 0.7787, + "step": 6087 + }, + { + "epoch": 0.26691750300923894, + "grad_norm": 0.93359375, + "learning_rate": 2.819762316953094e-05, + "loss": 0.8867, + "step": 6088 + }, + { + "epoch": 0.2669613462258962, + "grad_norm": 0.828125, + "learning_rate": 2.819441895654329e-05, + "loss": 0.8551, + "step": 6089 + }, + { + "epoch": 0.2670051894425534, + "grad_norm": 0.7890625, + "learning_rate": 2.8191214895743424e-05, + "loss": 0.7277, + "step": 6090 + }, + { + "epoch": 0.26704903265921065, + "grad_norm": 0.921875, + "learning_rate": 2.8188010987138213e-05, + "loss": 0.8207, + "step": 6091 + }, + { + "epoch": 0.2670928758758679, + "grad_norm": 0.8125, + "learning_rate": 2.81848072307344e-05, + "loss": 0.753, + "step": 6092 + }, + { + "epoch": 0.2671367190925251, + "grad_norm": 0.84765625, + "learning_rate": 2.8181603626538788e-05, + "loss": 0.7938, + "step": 6093 + }, + { + "epoch": 0.26718056230918236, + "grad_norm": 0.76171875, + "learning_rate": 2.817840017455816e-05, + "loss": 0.7862, + "step": 6094 + }, + { + "epoch": 0.2672244055258396, + "grad_norm": 0.79296875, + "learning_rate": 2.817519687479928e-05, + "loss": 0.7755, + "step": 6095 + }, + { + "epoch": 0.26726824874249683, + "grad_norm": 0.85546875, + "learning_rate": 2.8171993727269e-05, + "loss": 0.8857, + "step": 6096 + }, + { + "epoch": 0.26731209195915406, + "grad_norm": 0.85546875, + "learning_rate": 2.8168790731974072e-05, + "loss": 0.8521, + "step": 6097 + }, + { + "epoch": 0.2673559351758113, + "grad_norm": 0.828125, + "learning_rate": 2.8165587888921286e-05, + "loss": 0.8152, + "step": 6098 + }, + { + "epoch": 0.26739977839246853, + "grad_norm": 0.76953125, + "learning_rate": 2.8162385198117424e-05, + "loss": 0.7927, + "step": 6099 + }, + { + "epoch": 0.26744362160912577, + "grad_norm": 0.79296875, + "learning_rate": 2.815918265956925e-05, + "loss": 0.7231, + "step": 6100 + }, + { + "epoch": 0.267487464825783, + "grad_norm": 0.88671875, + "learning_rate": 2.815598027328362e-05, + "loss": 0.7076, + "step": 6101 + }, + { + "epoch": 0.26753130804244024, + "grad_norm": 0.765625, + "learning_rate": 2.815277803926728e-05, + "loss": 0.7534, + "step": 6102 + }, + { + "epoch": 0.2675751512590975, + "grad_norm": 0.8046875, + "learning_rate": 2.814957595752702e-05, + "loss": 0.7593, + "step": 6103 + }, + { + "epoch": 0.2676189944757547, + "grad_norm": 0.8125, + "learning_rate": 2.8146374028069633e-05, + "loss": 0.7228, + "step": 6104 + }, + { + "epoch": 0.26766283769241195, + "grad_norm": 0.83203125, + "learning_rate": 2.814317225090186e-05, + "loss": 0.8904, + "step": 6105 + }, + { + "epoch": 0.2677066809090692, + "grad_norm": 0.8203125, + "learning_rate": 2.8139970626030555e-05, + "loss": 0.8213, + "step": 6106 + }, + { + "epoch": 0.2677505241257264, + "grad_norm": 1.15625, + "learning_rate": 2.8136769153462485e-05, + "loss": 0.9094, + "step": 6107 + }, + { + "epoch": 0.26779436734238365, + "grad_norm": 0.77734375, + "learning_rate": 2.813356783320442e-05, + "loss": 0.6966, + "step": 6108 + }, + { + "epoch": 0.2678382105590409, + "grad_norm": 0.80078125, + "learning_rate": 2.8130366665263154e-05, + "loss": 0.8291, + "step": 6109 + }, + { + "epoch": 0.2678820537756981, + "grad_norm": 0.81640625, + "learning_rate": 2.812716564964546e-05, + "loss": 0.7709, + "step": 6110 + }, + { + "epoch": 0.26792589699235536, + "grad_norm": 0.91796875, + "learning_rate": 2.8123964786358102e-05, + "loss": 0.8575, + "step": 6111 + }, + { + "epoch": 0.2679697402090126, + "grad_norm": 0.85546875, + "learning_rate": 2.8120764075407923e-05, + "loss": 0.7494, + "step": 6112 + }, + { + "epoch": 0.26801358342566983, + "grad_norm": 0.8359375, + "learning_rate": 2.811756351680168e-05, + "loss": 0.8858, + "step": 6113 + }, + { + "epoch": 0.268057426642327, + "grad_norm": 0.9609375, + "learning_rate": 2.811436311054616e-05, + "loss": 1.0821, + "step": 6114 + }, + { + "epoch": 0.26810126985898425, + "grad_norm": 0.8515625, + "learning_rate": 2.8111162856648122e-05, + "loss": 0.8003, + "step": 6115 + }, + { + "epoch": 0.2681451130756415, + "grad_norm": 0.953125, + "learning_rate": 2.810796275511438e-05, + "loss": 0.896, + "step": 6116 + }, + { + "epoch": 0.2681889562922987, + "grad_norm": 0.86328125, + "learning_rate": 2.810476280595169e-05, + "loss": 0.9045, + "step": 6117 + }, + { + "epoch": 0.26823279950895595, + "grad_norm": 0.8984375, + "learning_rate": 2.8101563009166852e-05, + "loss": 0.8643, + "step": 6118 + }, + { + "epoch": 0.2682766427256132, + "grad_norm": 0.8515625, + "learning_rate": 2.8098363364766645e-05, + "loss": 0.8751, + "step": 6119 + }, + { + "epoch": 0.2683204859422704, + "grad_norm": 0.796875, + "learning_rate": 2.8095163872757836e-05, + "loss": 0.7861, + "step": 6120 + }, + { + "epoch": 0.26836432915892766, + "grad_norm": 0.88671875, + "learning_rate": 2.8091964533147195e-05, + "loss": 0.874, + "step": 6121 + }, + { + "epoch": 0.2684081723755849, + "grad_norm": 0.85546875, + "learning_rate": 2.808876534594156e-05, + "loss": 0.7822, + "step": 6122 + }, + { + "epoch": 0.26845201559224213, + "grad_norm": 0.8046875, + "learning_rate": 2.808556631114766e-05, + "loss": 0.7582, + "step": 6123 + }, + { + "epoch": 0.26849585880889937, + "grad_norm": 0.84765625, + "learning_rate": 2.8082367428772303e-05, + "loss": 0.8403, + "step": 6124 + }, + { + "epoch": 0.2685397020255566, + "grad_norm": 0.8203125, + "learning_rate": 2.8079168698822256e-05, + "loss": 0.8212, + "step": 6125 + }, + { + "epoch": 0.26858354524221384, + "grad_norm": 0.7734375, + "learning_rate": 2.8075970121304274e-05, + "loss": 0.8048, + "step": 6126 + }, + { + "epoch": 0.2686273884588711, + "grad_norm": 0.84375, + "learning_rate": 2.8072771696225186e-05, + "loss": 0.9509, + "step": 6127 + }, + { + "epoch": 0.2686712316755283, + "grad_norm": 0.765625, + "learning_rate": 2.8069573423591754e-05, + "loss": 0.7114, + "step": 6128 + }, + { + "epoch": 0.26871507489218555, + "grad_norm": 0.77734375, + "learning_rate": 2.8066375303410754e-05, + "loss": 0.7591, + "step": 6129 + }, + { + "epoch": 0.2687589181088428, + "grad_norm": 0.828125, + "learning_rate": 2.8063177335688952e-05, + "loss": 0.7872, + "step": 6130 + }, + { + "epoch": 0.2688027613255, + "grad_norm": 0.8359375, + "learning_rate": 2.80599795204331e-05, + "loss": 0.6391, + "step": 6131 + }, + { + "epoch": 0.26884660454215725, + "grad_norm": 0.89453125, + "learning_rate": 2.8056781857650038e-05, + "loss": 0.908, + "step": 6132 + }, + { + "epoch": 0.2688904477588145, + "grad_norm": 0.8125, + "learning_rate": 2.805358434734653e-05, + "loss": 0.7875, + "step": 6133 + }, + { + "epoch": 0.2689342909754717, + "grad_norm": 0.75390625, + "learning_rate": 2.805038698952932e-05, + "loss": 0.6903, + "step": 6134 + }, + { + "epoch": 0.26897813419212896, + "grad_norm": 0.76953125, + "learning_rate": 2.8047189784205218e-05, + "loss": 0.8198, + "step": 6135 + }, + { + "epoch": 0.2690219774087862, + "grad_norm": 0.83984375, + "learning_rate": 2.804399273138094e-05, + "loss": 0.8682, + "step": 6136 + }, + { + "epoch": 0.26906582062544343, + "grad_norm": 0.86328125, + "learning_rate": 2.8040795831063348e-05, + "loss": 0.83, + "step": 6137 + }, + { + "epoch": 0.26910966384210067, + "grad_norm": 0.83984375, + "learning_rate": 2.803759908325917e-05, + "loss": 0.7886, + "step": 6138 + }, + { + "epoch": 0.2691535070587579, + "grad_norm": 0.99609375, + "learning_rate": 2.8034402487975186e-05, + "loss": 0.8396, + "step": 6139 + }, + { + "epoch": 0.26919735027541514, + "grad_norm": 0.85546875, + "learning_rate": 2.8031206045218183e-05, + "loss": 0.7475, + "step": 6140 + }, + { + "epoch": 0.2692411934920724, + "grad_norm": 0.76171875, + "learning_rate": 2.8028009754994887e-05, + "loss": 0.7015, + "step": 6141 + }, + { + "epoch": 0.2692850367087296, + "grad_norm": 0.8203125, + "learning_rate": 2.802481361731214e-05, + "loss": 0.7811, + "step": 6142 + }, + { + "epoch": 0.26932887992538684, + "grad_norm": 0.8359375, + "learning_rate": 2.802161763217669e-05, + "loss": 0.7953, + "step": 6143 + }, + { + "epoch": 0.269372723142044, + "grad_norm": 0.91796875, + "learning_rate": 2.8018421799595308e-05, + "loss": 0.7911, + "step": 6144 + }, + { + "epoch": 0.26941656635870126, + "grad_norm": 0.8671875, + "learning_rate": 2.8015226119574767e-05, + "loss": 0.8816, + "step": 6145 + }, + { + "epoch": 0.2694604095753585, + "grad_norm": 0.87109375, + "learning_rate": 2.8012030592121808e-05, + "loss": 0.9121, + "step": 6146 + }, + { + "epoch": 0.26950425279201573, + "grad_norm": 0.88671875, + "learning_rate": 2.8008835217243257e-05, + "loss": 0.7542, + "step": 6147 + }, + { + "epoch": 0.26954809600867297, + "grad_norm": 0.77734375, + "learning_rate": 2.8005639994945877e-05, + "loss": 0.7354, + "step": 6148 + }, + { + "epoch": 0.2695919392253302, + "grad_norm": 0.87890625, + "learning_rate": 2.8002444925236425e-05, + "loss": 0.82, + "step": 6149 + }, + { + "epoch": 0.26963578244198744, + "grad_norm": 0.94921875, + "learning_rate": 2.7999250008121682e-05, + "loss": 0.6991, + "step": 6150 + }, + { + "epoch": 0.2696796256586447, + "grad_norm": 0.8515625, + "learning_rate": 2.7996055243608364e-05, + "loss": 0.7799, + "step": 6151 + }, + { + "epoch": 0.2697234688753019, + "grad_norm": 0.81640625, + "learning_rate": 2.7992860631703333e-05, + "loss": 0.8771, + "step": 6152 + }, + { + "epoch": 0.26976731209195914, + "grad_norm": 0.77734375, + "learning_rate": 2.798966617241332e-05, + "loss": 0.7719, + "step": 6153 + }, + { + "epoch": 0.2698111553086164, + "grad_norm": 0.8203125, + "learning_rate": 2.7986471865745102e-05, + "loss": 0.8285, + "step": 6154 + }, + { + "epoch": 0.2698549985252736, + "grad_norm": 0.7734375, + "learning_rate": 2.7983277711705425e-05, + "loss": 0.7061, + "step": 6155 + }, + { + "epoch": 0.26989884174193085, + "grad_norm": 0.81640625, + "learning_rate": 2.7980083710301088e-05, + "loss": 0.71, + "step": 6156 + }, + { + "epoch": 0.2699426849585881, + "grad_norm": 0.796875, + "learning_rate": 2.7976889861538837e-05, + "loss": 0.8045, + "step": 6157 + }, + { + "epoch": 0.2699865281752453, + "grad_norm": 0.8046875, + "learning_rate": 2.7973696165425455e-05, + "loss": 0.9128, + "step": 6158 + }, + { + "epoch": 0.27003037139190256, + "grad_norm": 0.83984375, + "learning_rate": 2.7970502621967675e-05, + "loss": 0.7361, + "step": 6159 + }, + { + "epoch": 0.2700742146085598, + "grad_norm": 0.84375, + "learning_rate": 2.7967309231172324e-05, + "loss": 0.8458, + "step": 6160 + }, + { + "epoch": 0.27011805782521703, + "grad_norm": 0.84375, + "learning_rate": 2.7964115993046147e-05, + "loss": 0.8349, + "step": 6161 + }, + { + "epoch": 0.27016190104187426, + "grad_norm": 0.77734375, + "learning_rate": 2.796092290759591e-05, + "loss": 0.7945, + "step": 6162 + }, + { + "epoch": 0.2702057442585315, + "grad_norm": 0.7890625, + "learning_rate": 2.7957729974828383e-05, + "loss": 0.888, + "step": 6163 + }, + { + "epoch": 0.27024958747518874, + "grad_norm": 0.75390625, + "learning_rate": 2.795453719475033e-05, + "loss": 0.7891, + "step": 6164 + }, + { + "epoch": 0.27029343069184597, + "grad_norm": 0.76953125, + "learning_rate": 2.795134456736851e-05, + "loss": 0.8161, + "step": 6165 + }, + { + "epoch": 0.2703372739085032, + "grad_norm": 0.80078125, + "learning_rate": 2.7948152092689673e-05, + "loss": 0.8432, + "step": 6166 + }, + { + "epoch": 0.27038111712516044, + "grad_norm": 0.7890625, + "learning_rate": 2.7944959770720634e-05, + "loss": 0.8265, + "step": 6167 + }, + { + "epoch": 0.2704249603418177, + "grad_norm": 0.828125, + "learning_rate": 2.794176760146814e-05, + "loss": 0.7855, + "step": 6168 + }, + { + "epoch": 0.2704688035584749, + "grad_norm": 0.84375, + "learning_rate": 2.7938575584938953e-05, + "loss": 0.7944, + "step": 6169 + }, + { + "epoch": 0.27051264677513215, + "grad_norm": 0.77734375, + "learning_rate": 2.793538372113983e-05, + "loss": 0.785, + "step": 6170 + }, + { + "epoch": 0.2705564899917894, + "grad_norm": 0.85546875, + "learning_rate": 2.793219201007752e-05, + "loss": 0.8188, + "step": 6171 + }, + { + "epoch": 0.2706003332084466, + "grad_norm": 0.859375, + "learning_rate": 2.7929000451758825e-05, + "loss": 0.9076, + "step": 6172 + }, + { + "epoch": 0.27064417642510386, + "grad_norm": 0.88671875, + "learning_rate": 2.792580904619051e-05, + "loss": 0.8597, + "step": 6173 + }, + { + "epoch": 0.27068801964176104, + "grad_norm": 0.765625, + "learning_rate": 2.7922617793379315e-05, + "loss": 0.7899, + "step": 6174 + }, + { + "epoch": 0.27073186285841827, + "grad_norm": 0.8203125, + "learning_rate": 2.7919426693332006e-05, + "loss": 0.7079, + "step": 6175 + }, + { + "epoch": 0.2707757060750755, + "grad_norm": 0.84765625, + "learning_rate": 2.7916235746055332e-05, + "loss": 0.8975, + "step": 6176 + }, + { + "epoch": 0.27081954929173274, + "grad_norm": 0.7265625, + "learning_rate": 2.791304495155611e-05, + "loss": 0.7823, + "step": 6177 + }, + { + "epoch": 0.27086339250839, + "grad_norm": 0.8984375, + "learning_rate": 2.7909854309841053e-05, + "loss": 0.824, + "step": 6178 + }, + { + "epoch": 0.2709072357250472, + "grad_norm": 0.8125, + "learning_rate": 2.7906663820916957e-05, + "loss": 0.9157, + "step": 6179 + }, + { + "epoch": 0.27095107894170445, + "grad_norm": 0.828125, + "learning_rate": 2.7903473484790553e-05, + "loss": 0.8039, + "step": 6180 + }, + { + "epoch": 0.2709949221583617, + "grad_norm": 0.8828125, + "learning_rate": 2.79002833014686e-05, + "loss": 0.8258, + "step": 6181 + }, + { + "epoch": 0.2710387653750189, + "grad_norm": 0.953125, + "learning_rate": 2.7897093270957886e-05, + "loss": 0.9104, + "step": 6182 + }, + { + "epoch": 0.27108260859167616, + "grad_norm": 0.9375, + "learning_rate": 2.7893903393265175e-05, + "loss": 0.7847, + "step": 6183 + }, + { + "epoch": 0.2711264518083334, + "grad_norm": 0.81640625, + "learning_rate": 2.7890713668397218e-05, + "loss": 0.8512, + "step": 6184 + }, + { + "epoch": 0.2711702950249906, + "grad_norm": 0.86328125, + "learning_rate": 2.788752409636076e-05, + "loss": 0.9435, + "step": 6185 + }, + { + "epoch": 0.27121413824164786, + "grad_norm": 0.9140625, + "learning_rate": 2.788433467716255e-05, + "loss": 0.8913, + "step": 6186 + }, + { + "epoch": 0.2712579814583051, + "grad_norm": 0.87109375, + "learning_rate": 2.7881145410809397e-05, + "loss": 0.8814, + "step": 6187 + }, + { + "epoch": 0.27130182467496233, + "grad_norm": 0.83984375, + "learning_rate": 2.787795629730804e-05, + "loss": 0.8218, + "step": 6188 + }, + { + "epoch": 0.27134566789161957, + "grad_norm": 0.8828125, + "learning_rate": 2.787476733666523e-05, + "loss": 0.7795, + "step": 6189 + }, + { + "epoch": 0.2713895111082768, + "grad_norm": 0.79296875, + "learning_rate": 2.787157852888772e-05, + "loss": 0.7385, + "step": 6190 + }, + { + "epoch": 0.27143335432493404, + "grad_norm": 0.73828125, + "learning_rate": 2.7868389873982258e-05, + "loss": 0.7206, + "step": 6191 + }, + { + "epoch": 0.2714771975415913, + "grad_norm": 0.79296875, + "learning_rate": 2.7865201371955652e-05, + "loss": 0.8639, + "step": 6192 + }, + { + "epoch": 0.2715210407582485, + "grad_norm": 0.87109375, + "learning_rate": 2.7862013022814616e-05, + "loss": 0.8204, + "step": 6193 + }, + { + "epoch": 0.27156488397490575, + "grad_norm": 0.89453125, + "learning_rate": 2.785882482656593e-05, + "loss": 0.8988, + "step": 6194 + }, + { + "epoch": 0.271608727191563, + "grad_norm": 0.83984375, + "learning_rate": 2.7855636783216343e-05, + "loss": 0.8226, + "step": 6195 + }, + { + "epoch": 0.2716525704082202, + "grad_norm": 0.81640625, + "learning_rate": 2.7852448892772576e-05, + "loss": 0.8182, + "step": 6196 + }, + { + "epoch": 0.27169641362487745, + "grad_norm": 0.875, + "learning_rate": 2.7849261155241445e-05, + "loss": 0.8257, + "step": 6197 + }, + { + "epoch": 0.2717402568415347, + "grad_norm": 0.765625, + "learning_rate": 2.784607357062968e-05, + "loss": 0.8001, + "step": 6198 + }, + { + "epoch": 0.2717841000581919, + "grad_norm": 0.875, + "learning_rate": 2.7842886138944048e-05, + "loss": 0.7602, + "step": 6199 + }, + { + "epoch": 0.27182794327484916, + "grad_norm": 0.8203125, + "learning_rate": 2.7839698860191297e-05, + "loss": 0.8141, + "step": 6200 + }, + { + "epoch": 0.2718717864915064, + "grad_norm": 0.83984375, + "learning_rate": 2.783651173437817e-05, + "loss": 0.8951, + "step": 6201 + }, + { + "epoch": 0.27191562970816363, + "grad_norm": 0.7890625, + "learning_rate": 2.7833324761511438e-05, + "loss": 0.7442, + "step": 6202 + }, + { + "epoch": 0.27195947292482087, + "grad_norm": 0.8125, + "learning_rate": 2.7830137941597844e-05, + "loss": 0.726, + "step": 6203 + }, + { + "epoch": 0.2720033161414781, + "grad_norm": 0.83203125, + "learning_rate": 2.7826951274644154e-05, + "loss": 0.8961, + "step": 6204 + }, + { + "epoch": 0.2720471593581353, + "grad_norm": 0.91796875, + "learning_rate": 2.7823764760657112e-05, + "loss": 0.9495, + "step": 6205 + }, + { + "epoch": 0.2720910025747925, + "grad_norm": 0.8125, + "learning_rate": 2.782057839964344e-05, + "loss": 0.8342, + "step": 6206 + }, + { + "epoch": 0.27213484579144975, + "grad_norm": 0.88671875, + "learning_rate": 2.781739219160996e-05, + "loss": 0.8258, + "step": 6207 + }, + { + "epoch": 0.272178689008107, + "grad_norm": 0.765625, + "learning_rate": 2.7814206136563394e-05, + "loss": 0.8268, + "step": 6208 + }, + { + "epoch": 0.2722225322247642, + "grad_norm": 0.85546875, + "learning_rate": 2.7811020234510487e-05, + "loss": 0.8574, + "step": 6209 + }, + { + "epoch": 0.27226637544142146, + "grad_norm": 0.796875, + "learning_rate": 2.7807834485458006e-05, + "loss": 0.7697, + "step": 6210 + }, + { + "epoch": 0.2723102186580787, + "grad_norm": 0.76171875, + "learning_rate": 2.7804648889412654e-05, + "loss": 0.7154, + "step": 6211 + }, + { + "epoch": 0.27235406187473593, + "grad_norm": 0.86328125, + "learning_rate": 2.780146344638126e-05, + "loss": 0.8875, + "step": 6212 + }, + { + "epoch": 0.27239790509139317, + "grad_norm": 0.796875, + "learning_rate": 2.7798278156370526e-05, + "loss": 0.9097, + "step": 6213 + }, + { + "epoch": 0.2724417483080504, + "grad_norm": 0.84375, + "learning_rate": 2.779509301938723e-05, + "loss": 0.7517, + "step": 6214 + }, + { + "epoch": 0.27248559152470764, + "grad_norm": 0.76953125, + "learning_rate": 2.7791908035438096e-05, + "loss": 0.7443, + "step": 6215 + }, + { + "epoch": 0.2725294347413649, + "grad_norm": 0.8203125, + "learning_rate": 2.7788723204529854e-05, + "loss": 0.7996, + "step": 6216 + }, + { + "epoch": 0.2725732779580221, + "grad_norm": 0.92578125, + "learning_rate": 2.778553852666932e-05, + "loss": 0.6826, + "step": 6217 + }, + { + "epoch": 0.27261712117467934, + "grad_norm": 0.79296875, + "learning_rate": 2.778235400186322e-05, + "loss": 0.7733, + "step": 6218 + }, + { + "epoch": 0.2726609643913366, + "grad_norm": 0.89453125, + "learning_rate": 2.7779169630118274e-05, + "loss": 0.7318, + "step": 6219 + }, + { + "epoch": 0.2727048076079938, + "grad_norm": 0.75, + "learning_rate": 2.7775985411441262e-05, + "loss": 0.7289, + "step": 6220 + }, + { + "epoch": 0.27274865082465105, + "grad_norm": 0.90625, + "learning_rate": 2.7772801345838896e-05, + "loss": 0.8921, + "step": 6221 + }, + { + "epoch": 0.2727924940413083, + "grad_norm": 0.84375, + "learning_rate": 2.7769617433317962e-05, + "loss": 0.7424, + "step": 6222 + }, + { + "epoch": 0.2728363372579655, + "grad_norm": 0.83203125, + "learning_rate": 2.7766433673885206e-05, + "loss": 0.8148, + "step": 6223 + }, + { + "epoch": 0.27288018047462276, + "grad_norm": 0.83984375, + "learning_rate": 2.7763250067547363e-05, + "loss": 0.6907, + "step": 6224 + }, + { + "epoch": 0.27292402369128, + "grad_norm": 0.8984375, + "learning_rate": 2.7760066614311186e-05, + "loss": 0.8448, + "step": 6225 + }, + { + "epoch": 0.27296786690793723, + "grad_norm": 0.87109375, + "learning_rate": 2.775688331418339e-05, + "loss": 0.7287, + "step": 6226 + }, + { + "epoch": 0.27301171012459446, + "grad_norm": 0.78125, + "learning_rate": 2.775370016717077e-05, + "loss": 0.7032, + "step": 6227 + }, + { + "epoch": 0.2730555533412517, + "grad_norm": 0.78515625, + "learning_rate": 2.7750517173280066e-05, + "loss": 0.673, + "step": 6228 + }, + { + "epoch": 0.27309939655790894, + "grad_norm": 0.828125, + "learning_rate": 2.7747334332518004e-05, + "loss": 0.7964, + "step": 6229 + }, + { + "epoch": 0.27314323977456617, + "grad_norm": 0.82421875, + "learning_rate": 2.7744151644891337e-05, + "loss": 0.7278, + "step": 6230 + }, + { + "epoch": 0.2731870829912234, + "grad_norm": 0.78125, + "learning_rate": 2.7740969110406777e-05, + "loss": 0.8114, + "step": 6231 + }, + { + "epoch": 0.27323092620788064, + "grad_norm": 0.80078125, + "learning_rate": 2.7737786729071147e-05, + "loss": 0.8183, + "step": 6232 + }, + { + "epoch": 0.2732747694245379, + "grad_norm": 0.8203125, + "learning_rate": 2.7734604500891136e-05, + "loss": 0.7772, + "step": 6233 + }, + { + "epoch": 0.2733186126411951, + "grad_norm": 0.79296875, + "learning_rate": 2.77314224258735e-05, + "loss": 0.8082, + "step": 6234 + }, + { + "epoch": 0.2733624558578523, + "grad_norm": 0.875, + "learning_rate": 2.7728240504024983e-05, + "loss": 0.7279, + "step": 6235 + }, + { + "epoch": 0.27340629907450953, + "grad_norm": 0.99609375, + "learning_rate": 2.7725058735352305e-05, + "loss": 0.9046, + "step": 6236 + }, + { + "epoch": 0.27345014229116676, + "grad_norm": 0.83984375, + "learning_rate": 2.772187711986226e-05, + "loss": 0.8393, + "step": 6237 + }, + { + "epoch": 0.273493985507824, + "grad_norm": 0.83984375, + "learning_rate": 2.7718695657561566e-05, + "loss": 0.9007, + "step": 6238 + }, + { + "epoch": 0.27353782872448124, + "grad_norm": 0.90234375, + "learning_rate": 2.7715514348456972e-05, + "loss": 0.7942, + "step": 6239 + }, + { + "epoch": 0.27358167194113847, + "grad_norm": 0.84375, + "learning_rate": 2.7712333192555206e-05, + "loss": 0.7125, + "step": 6240 + }, + { + "epoch": 0.2736255151577957, + "grad_norm": 0.84375, + "learning_rate": 2.770915218986302e-05, + "loss": 0.7227, + "step": 6241 + }, + { + "epoch": 0.27366935837445294, + "grad_norm": 0.92578125, + "learning_rate": 2.770597134038716e-05, + "loss": 1.0465, + "step": 6242 + }, + { + "epoch": 0.2737132015911102, + "grad_norm": 0.96484375, + "learning_rate": 2.7702790644134348e-05, + "loss": 0.9596, + "step": 6243 + }, + { + "epoch": 0.2737570448077674, + "grad_norm": 0.875, + "learning_rate": 2.7699610101111352e-05, + "loss": 0.8045, + "step": 6244 + }, + { + "epoch": 0.27380088802442465, + "grad_norm": 0.796875, + "learning_rate": 2.7696429711324866e-05, + "loss": 0.8372, + "step": 6245 + }, + { + "epoch": 0.2738447312410819, + "grad_norm": 0.7578125, + "learning_rate": 2.769324947478169e-05, + "loss": 0.7441, + "step": 6246 + }, + { + "epoch": 0.2738885744577391, + "grad_norm": 0.91015625, + "learning_rate": 2.7690069391488538e-05, + "loss": 0.8919, + "step": 6247 + }, + { + "epoch": 0.27393241767439636, + "grad_norm": 0.82421875, + "learning_rate": 2.768688946145216e-05, + "loss": 0.7924, + "step": 6248 + }, + { + "epoch": 0.2739762608910536, + "grad_norm": 1.1171875, + "learning_rate": 2.768370968467928e-05, + "loss": 0.851, + "step": 6249 + }, + { + "epoch": 0.2740201041077108, + "grad_norm": 0.94921875, + "learning_rate": 2.7680530061176645e-05, + "loss": 0.851, + "step": 6250 + }, + { + "epoch": 0.27406394732436806, + "grad_norm": 0.76953125, + "learning_rate": 2.767735059095099e-05, + "loss": 0.7617, + "step": 6251 + }, + { + "epoch": 0.2741077905410253, + "grad_norm": 0.8359375, + "learning_rate": 2.767417127400903e-05, + "loss": 0.7967, + "step": 6252 + }, + { + "epoch": 0.27415163375768253, + "grad_norm": 0.87890625, + "learning_rate": 2.7670992110357562e-05, + "loss": 0.8631, + "step": 6253 + }, + { + "epoch": 0.27419547697433977, + "grad_norm": 0.8046875, + "learning_rate": 2.7667813100003293e-05, + "loss": 0.8373, + "step": 6254 + }, + { + "epoch": 0.274239320190997, + "grad_norm": 0.78515625, + "learning_rate": 2.766463424295296e-05, + "loss": 0.8673, + "step": 6255 + }, + { + "epoch": 0.27428316340765424, + "grad_norm": 0.8046875, + "learning_rate": 2.7661455539213286e-05, + "loss": 0.9592, + "step": 6256 + }, + { + "epoch": 0.2743270066243115, + "grad_norm": 0.86328125, + "learning_rate": 2.765827698879101e-05, + "loss": 0.7367, + "step": 6257 + }, + { + "epoch": 0.2743708498409687, + "grad_norm": 0.8828125, + "learning_rate": 2.7655098591692907e-05, + "loss": 0.9243, + "step": 6258 + }, + { + "epoch": 0.27441469305762595, + "grad_norm": 0.8046875, + "learning_rate": 2.7651920347925675e-05, + "loss": 0.7478, + "step": 6259 + }, + { + "epoch": 0.2744585362742832, + "grad_norm": 0.9609375, + "learning_rate": 2.7648742257496075e-05, + "loss": 0.9622, + "step": 6260 + }, + { + "epoch": 0.2745023794909404, + "grad_norm": 0.84375, + "learning_rate": 2.7645564320410832e-05, + "loss": 0.749, + "step": 6261 + }, + { + "epoch": 0.27454622270759765, + "grad_norm": 0.93359375, + "learning_rate": 2.7642386536676646e-05, + "loss": 0.8581, + "step": 6262 + }, + { + "epoch": 0.2745900659242549, + "grad_norm": 0.78125, + "learning_rate": 2.7639208906300317e-05, + "loss": 0.5849, + "step": 6263 + }, + { + "epoch": 0.2746339091409121, + "grad_norm": 0.7890625, + "learning_rate": 2.7636031429288545e-05, + "loss": 0.6628, + "step": 6264 + }, + { + "epoch": 0.2746777523575693, + "grad_norm": 0.80859375, + "learning_rate": 2.7632854105648077e-05, + "loss": 0.8905, + "step": 6265 + }, + { + "epoch": 0.27472159557422654, + "grad_norm": 0.8984375, + "learning_rate": 2.7629676935385628e-05, + "loss": 0.8912, + "step": 6266 + }, + { + "epoch": 0.2747654387908838, + "grad_norm": 0.81640625, + "learning_rate": 2.762649991850791e-05, + "loss": 0.784, + "step": 6267 + }, + { + "epoch": 0.274809282007541, + "grad_norm": 0.8125, + "learning_rate": 2.7623323055021732e-05, + "loss": 0.7972, + "step": 6268 + }, + { + "epoch": 0.27485312522419825, + "grad_norm": 0.86328125, + "learning_rate": 2.7620146344933774e-05, + "loss": 0.8234, + "step": 6269 + }, + { + "epoch": 0.2748969684408555, + "grad_norm": 0.83203125, + "learning_rate": 2.761696978825078e-05, + "loss": 0.9124, + "step": 6270 + }, + { + "epoch": 0.2749408116575127, + "grad_norm": 0.8125, + "learning_rate": 2.761379338497948e-05, + "loss": 0.8544, + "step": 6271 + }, + { + "epoch": 0.27498465487416995, + "grad_norm": 0.92578125, + "learning_rate": 2.7610617135126583e-05, + "loss": 0.7767, + "step": 6272 + }, + { + "epoch": 0.2750284980908272, + "grad_norm": 0.83984375, + "learning_rate": 2.7607441038698877e-05, + "loss": 0.8072, + "step": 6273 + }, + { + "epoch": 0.2750723413074844, + "grad_norm": 0.859375, + "learning_rate": 2.760426509570305e-05, + "loss": 0.9246, + "step": 6274 + }, + { + "epoch": 0.27511618452414166, + "grad_norm": 0.73828125, + "learning_rate": 2.7601089306145865e-05, + "loss": 0.7891, + "step": 6275 + }, + { + "epoch": 0.2751600277407989, + "grad_norm": 0.81640625, + "learning_rate": 2.7597913670034013e-05, + "loss": 0.7201, + "step": 6276 + }, + { + "epoch": 0.27520387095745613, + "grad_norm": 0.91796875, + "learning_rate": 2.7594738187374224e-05, + "loss": 0.9136, + "step": 6277 + }, + { + "epoch": 0.27524771417411337, + "grad_norm": 0.79296875, + "learning_rate": 2.7591562858173282e-05, + "loss": 0.7177, + "step": 6278 + }, + { + "epoch": 0.2752915573907706, + "grad_norm": 0.73828125, + "learning_rate": 2.7588387682437877e-05, + "loss": 0.601, + "step": 6279 + }, + { + "epoch": 0.27533540060742784, + "grad_norm": 0.81640625, + "learning_rate": 2.7585212660174753e-05, + "loss": 0.8509, + "step": 6280 + }, + { + "epoch": 0.2753792438240851, + "grad_norm": 0.8046875, + "learning_rate": 2.7582037791390624e-05, + "loss": 0.7516, + "step": 6281 + }, + { + "epoch": 0.2754230870407423, + "grad_norm": 0.76953125, + "learning_rate": 2.75788630760922e-05, + "loss": 0.706, + "step": 6282 + }, + { + "epoch": 0.27546693025739954, + "grad_norm": 0.77734375, + "learning_rate": 2.7575688514286267e-05, + "loss": 0.7599, + "step": 6283 + }, + { + "epoch": 0.2755107734740568, + "grad_norm": 0.828125, + "learning_rate": 2.7572514105979518e-05, + "loss": 0.7717, + "step": 6284 + }, + { + "epoch": 0.275554616690714, + "grad_norm": 0.80078125, + "learning_rate": 2.756933985117869e-05, + "loss": 0.8099, + "step": 6285 + }, + { + "epoch": 0.27559845990737125, + "grad_norm": 0.91796875, + "learning_rate": 2.7566165749890504e-05, + "loss": 0.8353, + "step": 6286 + }, + { + "epoch": 0.2756423031240285, + "grad_norm": 0.8125, + "learning_rate": 2.7562991802121697e-05, + "loss": 0.7062, + "step": 6287 + }, + { + "epoch": 0.2756861463406857, + "grad_norm": 0.80078125, + "learning_rate": 2.7559818007878978e-05, + "loss": 0.755, + "step": 6288 + }, + { + "epoch": 0.27572998955734296, + "grad_norm": 0.8359375, + "learning_rate": 2.7556644367169094e-05, + "loss": 0.9036, + "step": 6289 + }, + { + "epoch": 0.2757738327740002, + "grad_norm": 1.3203125, + "learning_rate": 2.7553470879998756e-05, + "loss": 0.8567, + "step": 6290 + }, + { + "epoch": 0.27581767599065743, + "grad_norm": 0.828125, + "learning_rate": 2.7550297546374694e-05, + "loss": 0.8247, + "step": 6291 + }, + { + "epoch": 0.27586151920731466, + "grad_norm": 0.85546875, + "learning_rate": 2.754712436630361e-05, + "loss": 0.8461, + "step": 6292 + }, + { + "epoch": 0.2759053624239719, + "grad_norm": 0.91796875, + "learning_rate": 2.7543951339792283e-05, + "loss": 0.8173, + "step": 6293 + }, + { + "epoch": 0.27594920564062914, + "grad_norm": 1.09375, + "learning_rate": 2.7540778466847405e-05, + "loss": 0.8592, + "step": 6294 + }, + { + "epoch": 0.27599304885728637, + "grad_norm": 0.88671875, + "learning_rate": 2.7537605747475716e-05, + "loss": 0.8727, + "step": 6295 + }, + { + "epoch": 0.27603689207394355, + "grad_norm": 0.84765625, + "learning_rate": 2.7534433181683917e-05, + "loss": 0.9256, + "step": 6296 + }, + { + "epoch": 0.2760807352906008, + "grad_norm": 0.87109375, + "learning_rate": 2.7531260769478728e-05, + "loss": 0.824, + "step": 6297 + }, + { + "epoch": 0.276124578507258, + "grad_norm": 0.8828125, + "learning_rate": 2.752808851086691e-05, + "loss": 0.9059, + "step": 6298 + }, + { + "epoch": 0.27616842172391526, + "grad_norm": 0.79296875, + "learning_rate": 2.752491640585517e-05, + "loss": 0.7419, + "step": 6299 + }, + { + "epoch": 0.2762122649405725, + "grad_norm": 0.83203125, + "learning_rate": 2.7521744454450227e-05, + "loss": 0.7976, + "step": 6300 + }, + { + "epoch": 0.27625610815722973, + "grad_norm": 0.8671875, + "learning_rate": 2.7518572656658804e-05, + "loss": 0.8287, + "step": 6301 + }, + { + "epoch": 0.27629995137388697, + "grad_norm": 0.89453125, + "learning_rate": 2.7515401012487583e-05, + "loss": 0.8731, + "step": 6302 + }, + { + "epoch": 0.2763437945905442, + "grad_norm": 0.8046875, + "learning_rate": 2.7512229521943366e-05, + "loss": 0.9196, + "step": 6303 + }, + { + "epoch": 0.27638763780720144, + "grad_norm": 0.87109375, + "learning_rate": 2.7509058185032842e-05, + "loss": 0.8504, + "step": 6304 + }, + { + "epoch": 0.27643148102385867, + "grad_norm": 0.83984375, + "learning_rate": 2.750588700176272e-05, + "loss": 0.9278, + "step": 6305 + }, + { + "epoch": 0.2764753242405159, + "grad_norm": 0.8203125, + "learning_rate": 2.7502715972139725e-05, + "loss": 0.8064, + "step": 6306 + }, + { + "epoch": 0.27651916745717314, + "grad_norm": 0.82421875, + "learning_rate": 2.7499545096170555e-05, + "loss": 0.6739, + "step": 6307 + }, + { + "epoch": 0.2765630106738304, + "grad_norm": 0.90625, + "learning_rate": 2.7496374373861978e-05, + "loss": 0.8127, + "step": 6308 + }, + { + "epoch": 0.2766068538904876, + "grad_norm": 0.8515625, + "learning_rate": 2.749320380522069e-05, + "loss": 0.832, + "step": 6309 + }, + { + "epoch": 0.27665069710714485, + "grad_norm": 0.8046875, + "learning_rate": 2.749003339025342e-05, + "loss": 0.7565, + "step": 6310 + }, + { + "epoch": 0.2766945403238021, + "grad_norm": 0.8984375, + "learning_rate": 2.7486863128966877e-05, + "loss": 0.8316, + "step": 6311 + }, + { + "epoch": 0.2767383835404593, + "grad_norm": 0.80078125, + "learning_rate": 2.748369302136775e-05, + "loss": 0.746, + "step": 6312 + }, + { + "epoch": 0.27678222675711656, + "grad_norm": 0.8046875, + "learning_rate": 2.7480523067462825e-05, + "loss": 0.8023, + "step": 6313 + }, + { + "epoch": 0.2768260699737738, + "grad_norm": 0.87890625, + "learning_rate": 2.7477353267258787e-05, + "loss": 0.8772, + "step": 6314 + }, + { + "epoch": 0.276869913190431, + "grad_norm": 0.81640625, + "learning_rate": 2.747418362076235e-05, + "loss": 0.8084, + "step": 6315 + }, + { + "epoch": 0.27691375640708826, + "grad_norm": 0.9609375, + "learning_rate": 2.747101412798023e-05, + "loss": 0.8176, + "step": 6316 + }, + { + "epoch": 0.2769575996237455, + "grad_norm": 0.97265625, + "learning_rate": 2.7467844788919127e-05, + "loss": 0.8379, + "step": 6317 + }, + { + "epoch": 0.27700144284040273, + "grad_norm": 0.79296875, + "learning_rate": 2.7464675603585822e-05, + "loss": 0.9051, + "step": 6318 + }, + { + "epoch": 0.27704528605705997, + "grad_norm": 0.890625, + "learning_rate": 2.746150657198697e-05, + "loss": 0.8314, + "step": 6319 + }, + { + "epoch": 0.2770891292737172, + "grad_norm": 0.8515625, + "learning_rate": 2.745833769412932e-05, + "loss": 0.7348, + "step": 6320 + }, + { + "epoch": 0.27713297249037444, + "grad_norm": 0.82421875, + "learning_rate": 2.7455168970019583e-05, + "loss": 0.6516, + "step": 6321 + }, + { + "epoch": 0.2771768157070317, + "grad_norm": 0.84765625, + "learning_rate": 2.7452000399664433e-05, + "loss": 0.8251, + "step": 6322 + }, + { + "epoch": 0.2772206589236889, + "grad_norm": 0.875, + "learning_rate": 2.7448831983070645e-05, + "loss": 0.6572, + "step": 6323 + }, + { + "epoch": 0.27726450214034615, + "grad_norm": 0.82421875, + "learning_rate": 2.7445663720244918e-05, + "loss": 0.8988, + "step": 6324 + }, + { + "epoch": 0.2773083453570034, + "grad_norm": 0.85546875, + "learning_rate": 2.7442495611193953e-05, + "loss": 0.7935, + "step": 6325 + }, + { + "epoch": 0.27735218857366056, + "grad_norm": 0.85546875, + "learning_rate": 2.7439327655924487e-05, + "loss": 0.7998, + "step": 6326 + }, + { + "epoch": 0.2773960317903178, + "grad_norm": 0.90625, + "learning_rate": 2.74361598544432e-05, + "loss": 0.8447, + "step": 6327 + }, + { + "epoch": 0.27743987500697503, + "grad_norm": 0.88671875, + "learning_rate": 2.7432992206756837e-05, + "loss": 0.764, + "step": 6328 + }, + { + "epoch": 0.27748371822363227, + "grad_norm": 0.765625, + "learning_rate": 2.742982471287209e-05, + "loss": 0.7228, + "step": 6329 + }, + { + "epoch": 0.2775275614402895, + "grad_norm": 0.7578125, + "learning_rate": 2.742665737279566e-05, + "loss": 0.7879, + "step": 6330 + }, + { + "epoch": 0.27757140465694674, + "grad_norm": 0.82421875, + "learning_rate": 2.7423490186534306e-05, + "loss": 0.8028, + "step": 6331 + }, + { + "epoch": 0.277615247873604, + "grad_norm": 0.87109375, + "learning_rate": 2.742032315409472e-05, + "loss": 0.8677, + "step": 6332 + }, + { + "epoch": 0.2776590910902612, + "grad_norm": 0.81640625, + "learning_rate": 2.7417156275483614e-05, + "loss": 0.8117, + "step": 6333 + }, + { + "epoch": 0.27770293430691845, + "grad_norm": 0.78125, + "learning_rate": 2.7413989550707685e-05, + "loss": 0.7564, + "step": 6334 + }, + { + "epoch": 0.2777467775235757, + "grad_norm": 0.78125, + "learning_rate": 2.7410822979773664e-05, + "loss": 0.7979, + "step": 6335 + }, + { + "epoch": 0.2777906207402329, + "grad_norm": 0.921875, + "learning_rate": 2.7407656562688254e-05, + "loss": 0.8221, + "step": 6336 + }, + { + "epoch": 0.27783446395689015, + "grad_norm": 0.80859375, + "learning_rate": 2.740449029945813e-05, + "loss": 0.8483, + "step": 6337 + }, + { + "epoch": 0.2778783071735474, + "grad_norm": 0.77734375, + "learning_rate": 2.7401324190090084e-05, + "loss": 0.7868, + "step": 6338 + }, + { + "epoch": 0.2779221503902046, + "grad_norm": 0.8828125, + "learning_rate": 2.7398158234590775e-05, + "loss": 0.8197, + "step": 6339 + }, + { + "epoch": 0.27796599360686186, + "grad_norm": 0.83203125, + "learning_rate": 2.7394992432966914e-05, + "loss": 0.7805, + "step": 6340 + }, + { + "epoch": 0.2780098368235191, + "grad_norm": 0.82421875, + "learning_rate": 2.7391826785225216e-05, + "loss": 0.833, + "step": 6341 + }, + { + "epoch": 0.27805368004017633, + "grad_norm": 0.9765625, + "learning_rate": 2.7388661291372375e-05, + "loss": 0.8916, + "step": 6342 + }, + { + "epoch": 0.27809752325683357, + "grad_norm": 0.7421875, + "learning_rate": 2.738549595141513e-05, + "loss": 0.73, + "step": 6343 + }, + { + "epoch": 0.2781413664734908, + "grad_norm": 0.8359375, + "learning_rate": 2.7382330765360186e-05, + "loss": 0.867, + "step": 6344 + }, + { + "epoch": 0.27818520969014804, + "grad_norm": 0.94140625, + "learning_rate": 2.737916573321424e-05, + "loss": 0.9648, + "step": 6345 + }, + { + "epoch": 0.2782290529068053, + "grad_norm": 0.765625, + "learning_rate": 2.7376000854983995e-05, + "loss": 0.7212, + "step": 6346 + }, + { + "epoch": 0.2782728961234625, + "grad_norm": 0.91015625, + "learning_rate": 2.737283613067615e-05, + "loss": 0.805, + "step": 6347 + }, + { + "epoch": 0.27831673934011975, + "grad_norm": 0.80859375, + "learning_rate": 2.7369671560297438e-05, + "loss": 0.7463, + "step": 6348 + }, + { + "epoch": 0.278360582556777, + "grad_norm": 0.765625, + "learning_rate": 2.736650714385457e-05, + "loss": 0.8707, + "step": 6349 + }, + { + "epoch": 0.2784044257734342, + "grad_norm": 0.91015625, + "learning_rate": 2.7363342881354235e-05, + "loss": 0.7604, + "step": 6350 + }, + { + "epoch": 0.27844826899009145, + "grad_norm": 0.8046875, + "learning_rate": 2.7360178772803146e-05, + "loss": 0.7739, + "step": 6351 + }, + { + "epoch": 0.2784921122067487, + "grad_norm": 0.8125, + "learning_rate": 2.735701481820797e-05, + "loss": 0.8011, + "step": 6352 + }, + { + "epoch": 0.2785359554234059, + "grad_norm": 0.79296875, + "learning_rate": 2.735385101757548e-05, + "loss": 0.7941, + "step": 6353 + }, + { + "epoch": 0.27857979864006316, + "grad_norm": 0.8671875, + "learning_rate": 2.7350687370912353e-05, + "loss": 0.8363, + "step": 6354 + }, + { + "epoch": 0.2786236418567204, + "grad_norm": 0.90234375, + "learning_rate": 2.734752387822529e-05, + "loss": 0.8969, + "step": 6355 + }, + { + "epoch": 0.2786674850733776, + "grad_norm": 0.86328125, + "learning_rate": 2.7344360539520996e-05, + "loss": 0.874, + "step": 6356 + }, + { + "epoch": 0.2787113282900348, + "grad_norm": 0.875, + "learning_rate": 2.7341197354806147e-05, + "loss": 0.8892, + "step": 6357 + }, + { + "epoch": 0.27875517150669205, + "grad_norm": 0.84765625, + "learning_rate": 2.7338034324087512e-05, + "loss": 0.8323, + "step": 6358 + }, + { + "epoch": 0.2787990147233493, + "grad_norm": 0.81640625, + "learning_rate": 2.7334871447371756e-05, + "loss": 0.8158, + "step": 6359 + }, + { + "epoch": 0.2788428579400065, + "grad_norm": 1.0859375, + "learning_rate": 2.7331708724665594e-05, + "loss": 0.8766, + "step": 6360 + }, + { + "epoch": 0.27888670115666375, + "grad_norm": 0.87890625, + "learning_rate": 2.732854615597571e-05, + "loss": 0.8061, + "step": 6361 + }, + { + "epoch": 0.278930544373321, + "grad_norm": 0.78125, + "learning_rate": 2.7325383741308796e-05, + "loss": 0.8423, + "step": 6362 + }, + { + "epoch": 0.2789743875899782, + "grad_norm": 0.77734375, + "learning_rate": 2.7322221480671606e-05, + "loss": 0.6959, + "step": 6363 + }, + { + "epoch": 0.27901823080663546, + "grad_norm": 0.7890625, + "learning_rate": 2.731905937407081e-05, + "loss": 0.7967, + "step": 6364 + }, + { + "epoch": 0.2790620740232927, + "grad_norm": 0.765625, + "learning_rate": 2.7315897421513124e-05, + "loss": 0.7329, + "step": 6365 + }, + { + "epoch": 0.27910591723994993, + "grad_norm": 0.87890625, + "learning_rate": 2.7312735623005224e-05, + "loss": 0.795, + "step": 6366 + }, + { + "epoch": 0.27914976045660717, + "grad_norm": 0.87109375, + "learning_rate": 2.7309573978553816e-05, + "loss": 0.798, + "step": 6367 + }, + { + "epoch": 0.2791936036732644, + "grad_norm": 0.8984375, + "learning_rate": 2.7306412488165622e-05, + "loss": 0.7842, + "step": 6368 + }, + { + "epoch": 0.27923744688992164, + "grad_norm": 0.859375, + "learning_rate": 2.730325115184734e-05, + "loss": 0.8605, + "step": 6369 + }, + { + "epoch": 0.27928129010657887, + "grad_norm": 0.81640625, + "learning_rate": 2.7300089969605658e-05, + "loss": 0.7855, + "step": 6370 + }, + { + "epoch": 0.2793251333232361, + "grad_norm": 0.83203125, + "learning_rate": 2.7296928941447288e-05, + "loss": 0.6434, + "step": 6371 + }, + { + "epoch": 0.27936897653989334, + "grad_norm": 0.85546875, + "learning_rate": 2.729376806737891e-05, + "loss": 0.9781, + "step": 6372 + }, + { + "epoch": 0.2794128197565506, + "grad_norm": 0.984375, + "learning_rate": 2.7290607347407248e-05, + "loss": 0.7525, + "step": 6373 + }, + { + "epoch": 0.2794566629732078, + "grad_norm": 0.828125, + "learning_rate": 2.7287446781538982e-05, + "loss": 0.7482, + "step": 6374 + }, + { + "epoch": 0.27950050618986505, + "grad_norm": 0.98046875, + "learning_rate": 2.728428636978082e-05, + "loss": 0.8193, + "step": 6375 + }, + { + "epoch": 0.2795443494065223, + "grad_norm": 0.94921875, + "learning_rate": 2.7281126112139444e-05, + "loss": 0.8594, + "step": 6376 + }, + { + "epoch": 0.2795881926231795, + "grad_norm": 0.84765625, + "learning_rate": 2.7277966008621548e-05, + "loss": 0.7056, + "step": 6377 + }, + { + "epoch": 0.27963203583983676, + "grad_norm": 0.76171875, + "learning_rate": 2.727480605923387e-05, + "loss": 0.7712, + "step": 6378 + }, + { + "epoch": 0.279675879056494, + "grad_norm": 0.84375, + "learning_rate": 2.7271646263983087e-05, + "loss": 0.8293, + "step": 6379 + }, + { + "epoch": 0.27971972227315123, + "grad_norm": 0.87890625, + "learning_rate": 2.726848662287589e-05, + "loss": 0.8075, + "step": 6380 + }, + { + "epoch": 0.27976356548980846, + "grad_norm": 0.82421875, + "learning_rate": 2.7265327135918985e-05, + "loss": 0.8554, + "step": 6381 + }, + { + "epoch": 0.2798074087064657, + "grad_norm": 0.99609375, + "learning_rate": 2.726216780311902e-05, + "loss": 0.8948, + "step": 6382 + }, + { + "epoch": 0.27985125192312293, + "grad_norm": 0.8671875, + "learning_rate": 2.7259008624482775e-05, + "loss": 0.8387, + "step": 6383 + }, + { + "epoch": 0.27989509513978017, + "grad_norm": 0.7890625, + "learning_rate": 2.72558496000169e-05, + "loss": 0.7969, + "step": 6384 + }, + { + "epoch": 0.2799389383564374, + "grad_norm": 0.734375, + "learning_rate": 2.7252690729728093e-05, + "loss": 0.6947, + "step": 6385 + }, + { + "epoch": 0.27998278157309464, + "grad_norm": 0.8515625, + "learning_rate": 2.7249532013623046e-05, + "loss": 0.8181, + "step": 6386 + }, + { + "epoch": 0.2800266247897518, + "grad_norm": 0.82421875, + "learning_rate": 2.724637345170843e-05, + "loss": 0.7441, + "step": 6387 + }, + { + "epoch": 0.28007046800640906, + "grad_norm": 0.84375, + "learning_rate": 2.7243215043991e-05, + "loss": 0.8279, + "step": 6388 + }, + { + "epoch": 0.2801143112230663, + "grad_norm": 0.84375, + "learning_rate": 2.7240056790477418e-05, + "loss": 0.8283, + "step": 6389 + }, + { + "epoch": 0.28015815443972353, + "grad_norm": 0.84375, + "learning_rate": 2.7236898691174373e-05, + "loss": 0.8303, + "step": 6390 + }, + { + "epoch": 0.28020199765638076, + "grad_norm": 0.83203125, + "learning_rate": 2.7233740746088564e-05, + "loss": 0.8528, + "step": 6391 + }, + { + "epoch": 0.280245840873038, + "grad_norm": 0.96484375, + "learning_rate": 2.723058295522668e-05, + "loss": 0.8095, + "step": 6392 + }, + { + "epoch": 0.28028968408969523, + "grad_norm": 0.89453125, + "learning_rate": 2.722742531859539e-05, + "loss": 0.8258, + "step": 6393 + }, + { + "epoch": 0.28033352730635247, + "grad_norm": 0.8203125, + "learning_rate": 2.7224267836201435e-05, + "loss": 0.8258, + "step": 6394 + }, + { + "epoch": 0.2803773705230097, + "grad_norm": 0.80859375, + "learning_rate": 2.7221110508051493e-05, + "loss": 0.7985, + "step": 6395 + }, + { + "epoch": 0.28042121373966694, + "grad_norm": 0.83984375, + "learning_rate": 2.7217953334152245e-05, + "loss": 0.8366, + "step": 6396 + }, + { + "epoch": 0.2804650569563242, + "grad_norm": 0.84375, + "learning_rate": 2.7214796314510382e-05, + "loss": 0.9402, + "step": 6397 + }, + { + "epoch": 0.2805089001729814, + "grad_norm": 0.84375, + "learning_rate": 2.7211639449132575e-05, + "loss": 0.8643, + "step": 6398 + }, + { + "epoch": 0.28055274338963865, + "grad_norm": 0.94921875, + "learning_rate": 2.7208482738025566e-05, + "loss": 0.7722, + "step": 6399 + }, + { + "epoch": 0.2805965866062959, + "grad_norm": 0.765625, + "learning_rate": 2.720532618119601e-05, + "loss": 0.822, + "step": 6400 + }, + { + "epoch": 0.2806404298229531, + "grad_norm": 0.80859375, + "learning_rate": 2.720216977865061e-05, + "loss": 0.8214, + "step": 6401 + }, + { + "epoch": 0.28068427303961035, + "grad_norm": 0.83203125, + "learning_rate": 2.7199013530396044e-05, + "loss": 0.7804, + "step": 6402 + }, + { + "epoch": 0.2807281162562676, + "grad_norm": 0.8125, + "learning_rate": 2.719585743643899e-05, + "loss": 0.8348, + "step": 6403 + }, + { + "epoch": 0.2807719594729248, + "grad_norm": 0.859375, + "learning_rate": 2.7192701496786176e-05, + "loss": 0.9523, + "step": 6404 + }, + { + "epoch": 0.28081580268958206, + "grad_norm": 0.8359375, + "learning_rate": 2.7189545711444264e-05, + "loss": 0.8585, + "step": 6405 + }, + { + "epoch": 0.2808596459062393, + "grad_norm": 0.97265625, + "learning_rate": 2.7186390080419955e-05, + "loss": 0.9148, + "step": 6406 + }, + { + "epoch": 0.28090348912289653, + "grad_norm": 0.8359375, + "learning_rate": 2.7183234603719932e-05, + "loss": 0.819, + "step": 6407 + }, + { + "epoch": 0.28094733233955377, + "grad_norm": 0.8828125, + "learning_rate": 2.7180079281350857e-05, + "loss": 0.9135, + "step": 6408 + }, + { + "epoch": 0.280991175556211, + "grad_norm": 0.85546875, + "learning_rate": 2.717692411331946e-05, + "loss": 0.7555, + "step": 6409 + }, + { + "epoch": 0.28103501877286824, + "grad_norm": 0.85546875, + "learning_rate": 2.717376909963242e-05, + "loss": 0.7151, + "step": 6410 + }, + { + "epoch": 0.2810788619895255, + "grad_norm": 0.7421875, + "learning_rate": 2.7170614240296412e-05, + "loss": 0.7711, + "step": 6411 + }, + { + "epoch": 0.2811227052061827, + "grad_norm": 0.79296875, + "learning_rate": 2.716745953531812e-05, + "loss": 0.8098, + "step": 6412 + }, + { + "epoch": 0.28116654842283995, + "grad_norm": 0.84765625, + "learning_rate": 2.7164304984704235e-05, + "loss": 0.8448, + "step": 6413 + }, + { + "epoch": 0.2812103916394972, + "grad_norm": 0.84375, + "learning_rate": 2.7161150588461436e-05, + "loss": 0.798, + "step": 6414 + }, + { + "epoch": 0.2812542348561544, + "grad_norm": 0.95703125, + "learning_rate": 2.7157996346596425e-05, + "loss": 0.862, + "step": 6415 + }, + { + "epoch": 0.28129807807281165, + "grad_norm": 0.890625, + "learning_rate": 2.7154842259115843e-05, + "loss": 0.8734, + "step": 6416 + }, + { + "epoch": 0.28134192128946883, + "grad_norm": 0.8984375, + "learning_rate": 2.7151688326026448e-05, + "loss": 0.8917, + "step": 6417 + }, + { + "epoch": 0.28138576450612607, + "grad_norm": 0.8515625, + "learning_rate": 2.7148534547334868e-05, + "loss": 0.727, + "step": 6418 + }, + { + "epoch": 0.2814296077227833, + "grad_norm": 0.87890625, + "learning_rate": 2.7145380923047827e-05, + "loss": 0.8442, + "step": 6419 + }, + { + "epoch": 0.28147345093944054, + "grad_norm": 0.78515625, + "learning_rate": 2.7142227453171974e-05, + "loss": 0.7911, + "step": 6420 + }, + { + "epoch": 0.2815172941560978, + "grad_norm": 0.8125, + "learning_rate": 2.7139074137714006e-05, + "loss": 0.806, + "step": 6421 + }, + { + "epoch": 0.281561137372755, + "grad_norm": 0.79296875, + "learning_rate": 2.7135920976680607e-05, + "loss": 0.701, + "step": 6422 + }, + { + "epoch": 0.28160498058941225, + "grad_norm": 2.375, + "learning_rate": 2.7132767970078434e-05, + "loss": 0.9091, + "step": 6423 + }, + { + "epoch": 0.2816488238060695, + "grad_norm": 0.77734375, + "learning_rate": 2.712961511791421e-05, + "loss": 0.8358, + "step": 6424 + }, + { + "epoch": 0.2816926670227267, + "grad_norm": 0.85546875, + "learning_rate": 2.7126462420194608e-05, + "loss": 0.9272, + "step": 6425 + }, + { + "epoch": 0.28173651023938395, + "grad_norm": 0.78125, + "learning_rate": 2.712330987692632e-05, + "loss": 0.7381, + "step": 6426 + }, + { + "epoch": 0.2817803534560412, + "grad_norm": 0.85546875, + "learning_rate": 2.7120157488115994e-05, + "loss": 0.7152, + "step": 6427 + }, + { + "epoch": 0.2818241966726984, + "grad_norm": 0.7734375, + "learning_rate": 2.71170052537703e-05, + "loss": 0.8444, + "step": 6428 + }, + { + "epoch": 0.28186803988935566, + "grad_norm": 0.79296875, + "learning_rate": 2.7113853173895975e-05, + "loss": 0.8184, + "step": 6429 + }, + { + "epoch": 0.2819118831060129, + "grad_norm": 0.7890625, + "learning_rate": 2.711070124849968e-05, + "loss": 0.8805, + "step": 6430 + }, + { + "epoch": 0.28195572632267013, + "grad_norm": 0.734375, + "learning_rate": 2.710754947758809e-05, + "loss": 0.6903, + "step": 6431 + }, + { + "epoch": 0.28199956953932737, + "grad_norm": 0.91796875, + "learning_rate": 2.7104397861167873e-05, + "loss": 0.7009, + "step": 6432 + }, + { + "epoch": 0.2820434127559846, + "grad_norm": 0.8046875, + "learning_rate": 2.7101246399245693e-05, + "loss": 0.7929, + "step": 6433 + }, + { + "epoch": 0.28208725597264184, + "grad_norm": 0.8359375, + "learning_rate": 2.7098095091828292e-05, + "loss": 0.8204, + "step": 6434 + }, + { + "epoch": 0.2821310991892991, + "grad_norm": 0.76171875, + "learning_rate": 2.70949439389223e-05, + "loss": 0.8051, + "step": 6435 + }, + { + "epoch": 0.2821749424059563, + "grad_norm": 0.95703125, + "learning_rate": 2.7091792940534412e-05, + "loss": 0.844, + "step": 6436 + }, + { + "epoch": 0.28221878562261354, + "grad_norm": 0.75390625, + "learning_rate": 2.7088642096671303e-05, + "loss": 0.7582, + "step": 6437 + }, + { + "epoch": 0.2822626288392708, + "grad_norm": 0.86328125, + "learning_rate": 2.7085491407339614e-05, + "loss": 0.8209, + "step": 6438 + }, + { + "epoch": 0.282306472055928, + "grad_norm": 1.046875, + "learning_rate": 2.7082340872546087e-05, + "loss": 0.7941, + "step": 6439 + }, + { + "epoch": 0.28235031527258525, + "grad_norm": 0.8515625, + "learning_rate": 2.7079190492297378e-05, + "loss": 0.8186, + "step": 6440 + }, + { + "epoch": 0.2823941584892425, + "grad_norm": 0.8125, + "learning_rate": 2.7076040266600157e-05, + "loss": 0.8273, + "step": 6441 + }, + { + "epoch": 0.2824380017058997, + "grad_norm": 1.0859375, + "learning_rate": 2.7072890195461098e-05, + "loss": 0.7741, + "step": 6442 + }, + { + "epoch": 0.28248184492255696, + "grad_norm": 0.83203125, + "learning_rate": 2.7069740278886847e-05, + "loss": 0.73, + "step": 6443 + }, + { + "epoch": 0.2825256881392142, + "grad_norm": 0.87109375, + "learning_rate": 2.706659051688415e-05, + "loss": 0.9024, + "step": 6444 + }, + { + "epoch": 0.28256953135587143, + "grad_norm": 0.8828125, + "learning_rate": 2.7063440909459637e-05, + "loss": 0.8568, + "step": 6445 + }, + { + "epoch": 0.28261337457252866, + "grad_norm": 0.80078125, + "learning_rate": 2.7060291456620003e-05, + "loss": 0.7464, + "step": 6446 + }, + { + "epoch": 0.28265721778918584, + "grad_norm": 0.7890625, + "learning_rate": 2.7057142158371895e-05, + "loss": 0.7333, + "step": 6447 + }, + { + "epoch": 0.2827010610058431, + "grad_norm": 0.77734375, + "learning_rate": 2.705399301472199e-05, + "loss": 0.8493, + "step": 6448 + }, + { + "epoch": 0.2827449042225003, + "grad_norm": 0.765625, + "learning_rate": 2.7050844025677003e-05, + "loss": 0.7545, + "step": 6449 + }, + { + "epoch": 0.28278874743915755, + "grad_norm": 0.765625, + "learning_rate": 2.704769519124358e-05, + "loss": 0.7697, + "step": 6450 + }, + { + "epoch": 0.2828325906558148, + "grad_norm": 0.79296875, + "learning_rate": 2.7044546511428405e-05, + "loss": 0.798, + "step": 6451 + }, + { + "epoch": 0.282876433872472, + "grad_norm": 0.83203125, + "learning_rate": 2.7041397986238137e-05, + "loss": 0.8644, + "step": 6452 + }, + { + "epoch": 0.28292027708912926, + "grad_norm": 0.7421875, + "learning_rate": 2.7038249615679435e-05, + "loss": 0.7687, + "step": 6453 + }, + { + "epoch": 0.2829641203057865, + "grad_norm": 0.90234375, + "learning_rate": 2.703510139975901e-05, + "loss": 0.943, + "step": 6454 + }, + { + "epoch": 0.28300796352244373, + "grad_norm": 0.73828125, + "learning_rate": 2.7031953338483517e-05, + "loss": 0.7912, + "step": 6455 + }, + { + "epoch": 0.28305180673910096, + "grad_norm": 0.90234375, + "learning_rate": 2.7028805431859638e-05, + "loss": 0.7609, + "step": 6456 + }, + { + "epoch": 0.2830956499557582, + "grad_norm": 0.86328125, + "learning_rate": 2.7025657679894035e-05, + "loss": 0.8526, + "step": 6457 + }, + { + "epoch": 0.28313949317241544, + "grad_norm": 0.97265625, + "learning_rate": 2.7022510082593378e-05, + "loss": 0.8243, + "step": 6458 + }, + { + "epoch": 0.28318333638907267, + "grad_norm": 0.8671875, + "learning_rate": 2.701936263996433e-05, + "loss": 0.7891, + "step": 6459 + }, + { + "epoch": 0.2832271796057299, + "grad_norm": 0.8984375, + "learning_rate": 2.7016215352013586e-05, + "loss": 0.8777, + "step": 6460 + }, + { + "epoch": 0.28327102282238714, + "grad_norm": 0.83984375, + "learning_rate": 2.7013068218747783e-05, + "loss": 0.8503, + "step": 6461 + }, + { + "epoch": 0.2833148660390444, + "grad_norm": 0.70703125, + "learning_rate": 2.700992124017363e-05, + "loss": 0.7544, + "step": 6462 + }, + { + "epoch": 0.2833587092557016, + "grad_norm": 1.734375, + "learning_rate": 2.700677441629773e-05, + "loss": 0.7741, + "step": 6463 + }, + { + "epoch": 0.28340255247235885, + "grad_norm": 0.91015625, + "learning_rate": 2.700362774712685e-05, + "loss": 0.7309, + "step": 6464 + }, + { + "epoch": 0.2834463956890161, + "grad_norm": 0.69140625, + "learning_rate": 2.7000481232667596e-05, + "loss": 0.7238, + "step": 6465 + }, + { + "epoch": 0.2834902389056733, + "grad_norm": 0.8046875, + "learning_rate": 2.699733487292665e-05, + "loss": 0.7258, + "step": 6466 + }, + { + "epoch": 0.28353408212233056, + "grad_norm": 0.9140625, + "learning_rate": 2.699418866791068e-05, + "loss": 0.8522, + "step": 6467 + }, + { + "epoch": 0.2835779253389878, + "grad_norm": 0.82421875, + "learning_rate": 2.6991042617626316e-05, + "loss": 0.8215, + "step": 6468 + }, + { + "epoch": 0.283621768555645, + "grad_norm": 0.79296875, + "learning_rate": 2.698789672208031e-05, + "loss": 0.7858, + "step": 6469 + }, + { + "epoch": 0.28366561177230226, + "grad_norm": 0.80859375, + "learning_rate": 2.6984750981279273e-05, + "loss": 0.7261, + "step": 6470 + }, + { + "epoch": 0.2837094549889595, + "grad_norm": 0.91796875, + "learning_rate": 2.6981605395229893e-05, + "loss": 0.695, + "step": 6471 + }, + { + "epoch": 0.28375329820561673, + "grad_norm": 0.875, + "learning_rate": 2.697845996393882e-05, + "loss": 0.8071, + "step": 6472 + }, + { + "epoch": 0.28379714142227397, + "grad_norm": 0.8359375, + "learning_rate": 2.69753146874127e-05, + "loss": 0.7283, + "step": 6473 + }, + { + "epoch": 0.2838409846389312, + "grad_norm": 0.8125, + "learning_rate": 2.6972169565658246e-05, + "loss": 0.7864, + "step": 6474 + }, + { + "epoch": 0.28388482785558844, + "grad_norm": 0.8125, + "learning_rate": 2.6969024598682112e-05, + "loss": 0.7421, + "step": 6475 + }, + { + "epoch": 0.2839286710722457, + "grad_norm": 0.78515625, + "learning_rate": 2.6965879786490967e-05, + "loss": 0.8651, + "step": 6476 + }, + { + "epoch": 0.2839725142889029, + "grad_norm": 0.8359375, + "learning_rate": 2.6962735129091454e-05, + "loss": 0.7835, + "step": 6477 + }, + { + "epoch": 0.2840163575055601, + "grad_norm": 0.90234375, + "learning_rate": 2.6959590626490218e-05, + "loss": 0.8241, + "step": 6478 + }, + { + "epoch": 0.2840602007222173, + "grad_norm": 0.80078125, + "learning_rate": 2.6956446278693993e-05, + "loss": 0.9047, + "step": 6479 + }, + { + "epoch": 0.28410404393887456, + "grad_norm": 0.8203125, + "learning_rate": 2.6953302085709397e-05, + "loss": 0.6928, + "step": 6480 + }, + { + "epoch": 0.2841478871555318, + "grad_norm": 0.875, + "learning_rate": 2.6950158047543106e-05, + "loss": 0.8816, + "step": 6481 + }, + { + "epoch": 0.28419173037218903, + "grad_norm": 0.79296875, + "learning_rate": 2.6947014164201778e-05, + "loss": 0.789, + "step": 6482 + }, + { + "epoch": 0.28423557358884627, + "grad_norm": 0.83984375, + "learning_rate": 2.6943870435692043e-05, + "loss": 0.7743, + "step": 6483 + }, + { + "epoch": 0.2842794168055035, + "grad_norm": 0.80859375, + "learning_rate": 2.694072686202064e-05, + "loss": 0.7733, + "step": 6484 + }, + { + "epoch": 0.28432326002216074, + "grad_norm": 0.7578125, + "learning_rate": 2.6937583443194182e-05, + "loss": 0.7378, + "step": 6485 + }, + { + "epoch": 0.284367103238818, + "grad_norm": 0.828125, + "learning_rate": 2.6934440179219344e-05, + "loss": 0.8402, + "step": 6486 + }, + { + "epoch": 0.2844109464554752, + "grad_norm": 0.74609375, + "learning_rate": 2.693129707010278e-05, + "loss": 0.8179, + "step": 6487 + }, + { + "epoch": 0.28445478967213245, + "grad_norm": 0.859375, + "learning_rate": 2.6928154115851134e-05, + "loss": 0.7186, + "step": 6488 + }, + { + "epoch": 0.2844986328887897, + "grad_norm": 0.828125, + "learning_rate": 2.6925011316471117e-05, + "loss": 0.7752, + "step": 6489 + }, + { + "epoch": 0.2845424761054469, + "grad_norm": 0.73828125, + "learning_rate": 2.6921868671969353e-05, + "loss": 0.7822, + "step": 6490 + }, + { + "epoch": 0.28458631932210415, + "grad_norm": 0.796875, + "learning_rate": 2.6918726182352517e-05, + "loss": 0.755, + "step": 6491 + }, + { + "epoch": 0.2846301625387614, + "grad_norm": 0.8046875, + "learning_rate": 2.6915583847627257e-05, + "loss": 0.7766, + "step": 6492 + }, + { + "epoch": 0.2846740057554186, + "grad_norm": 0.87890625, + "learning_rate": 2.6912441667800226e-05, + "loss": 0.8829, + "step": 6493 + }, + { + "epoch": 0.28471784897207586, + "grad_norm": 0.7734375, + "learning_rate": 2.6909299642878116e-05, + "loss": 0.7517, + "step": 6494 + }, + { + "epoch": 0.2847616921887331, + "grad_norm": 0.83203125, + "learning_rate": 2.6906157772867568e-05, + "loss": 0.848, + "step": 6495 + }, + { + "epoch": 0.28480553540539033, + "grad_norm": 0.859375, + "learning_rate": 2.6903016057775243e-05, + "loss": 0.8196, + "step": 6496 + }, + { + "epoch": 0.28484937862204757, + "grad_norm": 0.828125, + "learning_rate": 2.6899874497607793e-05, + "loss": 0.9166, + "step": 6497 + }, + { + "epoch": 0.2848932218387048, + "grad_norm": 0.91796875, + "learning_rate": 2.6896733092371884e-05, + "loss": 0.6963, + "step": 6498 + }, + { + "epoch": 0.28493706505536204, + "grad_norm": 0.83984375, + "learning_rate": 2.6893591842074174e-05, + "loss": 0.8416, + "step": 6499 + }, + { + "epoch": 0.2849809082720193, + "grad_norm": 0.8359375, + "learning_rate": 2.6890450746721308e-05, + "loss": 0.8516, + "step": 6500 + }, + { + "epoch": 0.2850247514886765, + "grad_norm": 0.8203125, + "learning_rate": 2.6887309806319926e-05, + "loss": 0.8279, + "step": 6501 + }, + { + "epoch": 0.28506859470533374, + "grad_norm": 0.9921875, + "learning_rate": 2.6884169020876748e-05, + "loss": 0.713, + "step": 6502 + }, + { + "epoch": 0.285112437921991, + "grad_norm": 0.8125, + "learning_rate": 2.68810283903984e-05, + "loss": 0.843, + "step": 6503 + }, + { + "epoch": 0.2851562811386482, + "grad_norm": 0.875, + "learning_rate": 2.6877887914891518e-05, + "loss": 0.9037, + "step": 6504 + }, + { + "epoch": 0.28520012435530545, + "grad_norm": 0.83984375, + "learning_rate": 2.6874747594362783e-05, + "loss": 0.8032, + "step": 6505 + }, + { + "epoch": 0.2852439675719627, + "grad_norm": 0.84375, + "learning_rate": 2.6871607428818833e-05, + "loss": 0.8232, + "step": 6506 + }, + { + "epoch": 0.2852878107886199, + "grad_norm": 0.7890625, + "learning_rate": 2.686846741826634e-05, + "loss": 0.7786, + "step": 6507 + }, + { + "epoch": 0.2853316540052771, + "grad_norm": 0.84375, + "learning_rate": 2.6865327562711918e-05, + "loss": 0.8064, + "step": 6508 + }, + { + "epoch": 0.28537549722193434, + "grad_norm": 0.91015625, + "learning_rate": 2.686218786216228e-05, + "loss": 0.9083, + "step": 6509 + }, + { + "epoch": 0.2854193404385916, + "grad_norm": 0.83984375, + "learning_rate": 2.685904831662406e-05, + "loss": 0.8423, + "step": 6510 + }, + { + "epoch": 0.2854631836552488, + "grad_norm": 0.89453125, + "learning_rate": 2.68559089261039e-05, + "loss": 0.7698, + "step": 6511 + }, + { + "epoch": 0.28550702687190604, + "grad_norm": 0.8125, + "learning_rate": 2.685276969060846e-05, + "loss": 0.8203, + "step": 6512 + }, + { + "epoch": 0.2855508700885633, + "grad_norm": 0.8984375, + "learning_rate": 2.684963061014437e-05, + "loss": 0.745, + "step": 6513 + }, + { + "epoch": 0.2855947133052205, + "grad_norm": 0.7578125, + "learning_rate": 2.6846491684718334e-05, + "loss": 0.6721, + "step": 6514 + }, + { + "epoch": 0.28563855652187775, + "grad_norm": 0.83984375, + "learning_rate": 2.6843352914336982e-05, + "loss": 0.8912, + "step": 6515 + }, + { + "epoch": 0.285682399738535, + "grad_norm": 0.73828125, + "learning_rate": 2.684021429900696e-05, + "loss": 0.7085, + "step": 6516 + }, + { + "epoch": 0.2857262429551922, + "grad_norm": 0.77734375, + "learning_rate": 2.6837075838734914e-05, + "loss": 0.765, + "step": 6517 + }, + { + "epoch": 0.28577008617184946, + "grad_norm": 0.85546875, + "learning_rate": 2.6833937533527485e-05, + "loss": 0.9441, + "step": 6518 + }, + { + "epoch": 0.2858139293885067, + "grad_norm": 0.828125, + "learning_rate": 2.683079938339137e-05, + "loss": 0.7712, + "step": 6519 + }, + { + "epoch": 0.28585777260516393, + "grad_norm": 0.80078125, + "learning_rate": 2.6827661388333192e-05, + "loss": 0.7896, + "step": 6520 + }, + { + "epoch": 0.28590161582182116, + "grad_norm": 0.8125, + "learning_rate": 2.6824523548359603e-05, + "loss": 0.8197, + "step": 6521 + }, + { + "epoch": 0.2859454590384784, + "grad_norm": 0.75390625, + "learning_rate": 2.6821385863477254e-05, + "loss": 0.7141, + "step": 6522 + }, + { + "epoch": 0.28598930225513564, + "grad_norm": 0.77734375, + "learning_rate": 2.6818248333692765e-05, + "loss": 0.8033, + "step": 6523 + }, + { + "epoch": 0.28603314547179287, + "grad_norm": 0.7421875, + "learning_rate": 2.6815110959012845e-05, + "loss": 0.7254, + "step": 6524 + }, + { + "epoch": 0.2860769886884501, + "grad_norm": 0.84375, + "learning_rate": 2.681197373944412e-05, + "loss": 0.8706, + "step": 6525 + }, + { + "epoch": 0.28612083190510734, + "grad_norm": 0.765625, + "learning_rate": 2.6808836674993222e-05, + "loss": 0.7189, + "step": 6526 + }, + { + "epoch": 0.2861646751217646, + "grad_norm": 0.8359375, + "learning_rate": 2.6805699765666824e-05, + "loss": 0.8372, + "step": 6527 + }, + { + "epoch": 0.2862085183384218, + "grad_norm": 0.8046875, + "learning_rate": 2.6802563011471525e-05, + "loss": 0.9458, + "step": 6528 + }, + { + "epoch": 0.28625236155507905, + "grad_norm": 0.83984375, + "learning_rate": 2.679942641241404e-05, + "loss": 0.8593, + "step": 6529 + }, + { + "epoch": 0.2862962047717363, + "grad_norm": 0.75390625, + "learning_rate": 2.6796289968500988e-05, + "loss": 0.7184, + "step": 6530 + }, + { + "epoch": 0.2863400479883935, + "grad_norm": 0.80078125, + "learning_rate": 2.6793153679739015e-05, + "loss": 0.7819, + "step": 6531 + }, + { + "epoch": 0.28638389120505076, + "grad_norm": 0.82421875, + "learning_rate": 2.679001754613477e-05, + "loss": 0.846, + "step": 6532 + }, + { + "epoch": 0.286427734421708, + "grad_norm": 0.81640625, + "learning_rate": 2.678688156769489e-05, + "loss": 0.785, + "step": 6533 + }, + { + "epoch": 0.2864715776383652, + "grad_norm": 0.80859375, + "learning_rate": 2.6783745744426014e-05, + "loss": 0.8842, + "step": 6534 + }, + { + "epoch": 0.28651542085502246, + "grad_norm": 0.79296875, + "learning_rate": 2.6780610076334822e-05, + "loss": 0.7929, + "step": 6535 + }, + { + "epoch": 0.2865592640716797, + "grad_norm": 0.81640625, + "learning_rate": 2.677747456342795e-05, + "loss": 0.8078, + "step": 6536 + }, + { + "epoch": 0.28660310728833693, + "grad_norm": 0.859375, + "learning_rate": 2.677433920571203e-05, + "loss": 0.8234, + "step": 6537 + }, + { + "epoch": 0.2866469505049941, + "grad_norm": 0.7265625, + "learning_rate": 2.6771204003193717e-05, + "loss": 0.7497, + "step": 6538 + }, + { + "epoch": 0.28669079372165135, + "grad_norm": 0.8046875, + "learning_rate": 2.676806895587961e-05, + "loss": 0.7986, + "step": 6539 + }, + { + "epoch": 0.2867346369383086, + "grad_norm": 0.8984375, + "learning_rate": 2.6764934063776436e-05, + "loss": 0.7032, + "step": 6540 + }, + { + "epoch": 0.2867784801549658, + "grad_norm": 0.7890625, + "learning_rate": 2.6761799326890803e-05, + "loss": 0.7822, + "step": 6541 + }, + { + "epoch": 0.28682232337162306, + "grad_norm": 1.015625, + "learning_rate": 2.6758664745229345e-05, + "loss": 0.8287, + "step": 6542 + }, + { + "epoch": 0.2868661665882803, + "grad_norm": 0.75390625, + "learning_rate": 2.6755530318798705e-05, + "loss": 0.766, + "step": 6543 + }, + { + "epoch": 0.2869100098049375, + "grad_norm": 1.59375, + "learning_rate": 2.6752396047605543e-05, + "loss": 0.8157, + "step": 6544 + }, + { + "epoch": 0.28695385302159476, + "grad_norm": 0.73828125, + "learning_rate": 2.6749261931656477e-05, + "loss": 0.8811, + "step": 6545 + }, + { + "epoch": 0.286997696238252, + "grad_norm": 0.7265625, + "learning_rate": 2.674612797095818e-05, + "loss": 0.6925, + "step": 6546 + }, + { + "epoch": 0.28704153945490923, + "grad_norm": 0.96875, + "learning_rate": 2.6742994165517266e-05, + "loss": 0.7373, + "step": 6547 + }, + { + "epoch": 0.28708538267156647, + "grad_norm": 0.92578125, + "learning_rate": 2.6739860515340387e-05, + "loss": 0.7695, + "step": 6548 + }, + { + "epoch": 0.2871292258882237, + "grad_norm": 0.765625, + "learning_rate": 2.673672702043416e-05, + "loss": 0.7699, + "step": 6549 + }, + { + "epoch": 0.28717306910488094, + "grad_norm": 0.8828125, + "learning_rate": 2.673359368080529e-05, + "loss": 0.8613, + "step": 6550 + }, + { + "epoch": 0.2872169123215382, + "grad_norm": 0.8828125, + "learning_rate": 2.6730460496460374e-05, + "loss": 0.8569, + "step": 6551 + }, + { + "epoch": 0.2872607555381954, + "grad_norm": 0.8125, + "learning_rate": 2.672732746740606e-05, + "loss": 0.7775, + "step": 6552 + }, + { + "epoch": 0.28730459875485265, + "grad_norm": 0.828125, + "learning_rate": 2.6724194593648988e-05, + "loss": 0.947, + "step": 6553 + }, + { + "epoch": 0.2873484419715099, + "grad_norm": 0.84375, + "learning_rate": 2.6721061875195763e-05, + "loss": 0.8547, + "step": 6554 + }, + { + "epoch": 0.2873922851881671, + "grad_norm": 0.80078125, + "learning_rate": 2.671792931205309e-05, + "loss": 0.7646, + "step": 6555 + }, + { + "epoch": 0.28743612840482435, + "grad_norm": 0.79296875, + "learning_rate": 2.671479690422758e-05, + "loss": 0.7528, + "step": 6556 + }, + { + "epoch": 0.2874799716214816, + "grad_norm": 0.796875, + "learning_rate": 2.6711664651725864e-05, + "loss": 0.817, + "step": 6557 + }, + { + "epoch": 0.2875238148381388, + "grad_norm": 0.875, + "learning_rate": 2.670853255455459e-05, + "loss": 0.8647, + "step": 6558 + }, + { + "epoch": 0.28756765805479606, + "grad_norm": 0.859375, + "learning_rate": 2.6705400612720365e-05, + "loss": 0.8587, + "step": 6559 + }, + { + "epoch": 0.2876115012714533, + "grad_norm": 0.8046875, + "learning_rate": 2.670226882622988e-05, + "loss": 0.8519, + "step": 6560 + }, + { + "epoch": 0.28765534448811053, + "grad_norm": 0.79296875, + "learning_rate": 2.669913719508975e-05, + "loss": 0.7433, + "step": 6561 + }, + { + "epoch": 0.28769918770476777, + "grad_norm": 0.91796875, + "learning_rate": 2.6696005719306606e-05, + "loss": 0.8018, + "step": 6562 + }, + { + "epoch": 0.287743030921425, + "grad_norm": 0.81640625, + "learning_rate": 2.6692874398887092e-05, + "loss": 0.7653, + "step": 6563 + }, + { + "epoch": 0.28778687413808224, + "grad_norm": 0.79296875, + "learning_rate": 2.6689743233837805e-05, + "loss": 0.7439, + "step": 6564 + }, + { + "epoch": 0.2878307173547395, + "grad_norm": 0.796875, + "learning_rate": 2.6686612224165453e-05, + "loss": 0.7473, + "step": 6565 + }, + { + "epoch": 0.2878745605713967, + "grad_norm": 0.8125, + "learning_rate": 2.668348136987663e-05, + "loss": 0.7679, + "step": 6566 + }, + { + "epoch": 0.28791840378805394, + "grad_norm": 1.2109375, + "learning_rate": 2.6680350670977982e-05, + "loss": 0.7889, + "step": 6567 + }, + { + "epoch": 0.2879622470047112, + "grad_norm": 0.75390625, + "learning_rate": 2.667722012747613e-05, + "loss": 0.6323, + "step": 6568 + }, + { + "epoch": 0.28800609022136836, + "grad_norm": 0.74609375, + "learning_rate": 2.6674089739377705e-05, + "loss": 0.6906, + "step": 6569 + }, + { + "epoch": 0.2880499334380256, + "grad_norm": 0.79296875, + "learning_rate": 2.6670959506689373e-05, + "loss": 0.8647, + "step": 6570 + }, + { + "epoch": 0.28809377665468283, + "grad_norm": 0.8203125, + "learning_rate": 2.6667829429417757e-05, + "loss": 0.7678, + "step": 6571 + }, + { + "epoch": 0.28813761987134007, + "grad_norm": 0.79296875, + "learning_rate": 2.666469950756949e-05, + "loss": 0.7894, + "step": 6572 + }, + { + "epoch": 0.2881814630879973, + "grad_norm": 0.86328125, + "learning_rate": 2.6661569741151194e-05, + "loss": 0.8821, + "step": 6573 + }, + { + "epoch": 0.28822530630465454, + "grad_norm": 0.828125, + "learning_rate": 2.665844013016948e-05, + "loss": 0.8683, + "step": 6574 + }, + { + "epoch": 0.2882691495213118, + "grad_norm": 0.7734375, + "learning_rate": 2.665531067463104e-05, + "loss": 0.8035, + "step": 6575 + }, + { + "epoch": 0.288312992737969, + "grad_norm": 0.94140625, + "learning_rate": 2.6652181374542472e-05, + "loss": 0.796, + "step": 6576 + }, + { + "epoch": 0.28835683595462624, + "grad_norm": 0.8671875, + "learning_rate": 2.664905222991042e-05, + "loss": 0.865, + "step": 6577 + }, + { + "epoch": 0.2884006791712835, + "grad_norm": 0.75, + "learning_rate": 2.664592324074151e-05, + "loss": 0.7038, + "step": 6578 + }, + { + "epoch": 0.2884445223879407, + "grad_norm": 0.83203125, + "learning_rate": 2.664279440704234e-05, + "loss": 0.8369, + "step": 6579 + }, + { + "epoch": 0.28848836560459795, + "grad_norm": 0.90234375, + "learning_rate": 2.663966572881961e-05, + "loss": 0.8925, + "step": 6580 + }, + { + "epoch": 0.2885322088212552, + "grad_norm": 0.92578125, + "learning_rate": 2.6636537206079904e-05, + "loss": 0.737, + "step": 6581 + }, + { + "epoch": 0.2885760520379124, + "grad_norm": 0.83203125, + "learning_rate": 2.6633408838829877e-05, + "loss": 0.7863, + "step": 6582 + }, + { + "epoch": 0.28861989525456966, + "grad_norm": 0.8828125, + "learning_rate": 2.6630280627076133e-05, + "loss": 0.8449, + "step": 6583 + }, + { + "epoch": 0.2886637384712269, + "grad_norm": 0.8125, + "learning_rate": 2.662715257082532e-05, + "loss": 0.9408, + "step": 6584 + }, + { + "epoch": 0.28870758168788413, + "grad_norm": 0.75390625, + "learning_rate": 2.662402467008407e-05, + "loss": 0.8526, + "step": 6585 + }, + { + "epoch": 0.28875142490454137, + "grad_norm": 0.88671875, + "learning_rate": 2.6620896924858975e-05, + "loss": 0.7844, + "step": 6586 + }, + { + "epoch": 0.2887952681211986, + "grad_norm": 0.828125, + "learning_rate": 2.6617769335156717e-05, + "loss": 0.7564, + "step": 6587 + }, + { + "epoch": 0.28883911133785584, + "grad_norm": 0.7734375, + "learning_rate": 2.6614641900983904e-05, + "loss": 0.7176, + "step": 6588 + }, + { + "epoch": 0.28888295455451307, + "grad_norm": 0.79296875, + "learning_rate": 2.6611514622347167e-05, + "loss": 0.7016, + "step": 6589 + }, + { + "epoch": 0.2889267977711703, + "grad_norm": 0.86328125, + "learning_rate": 2.660838749925313e-05, + "loss": 0.8596, + "step": 6590 + }, + { + "epoch": 0.28897064098782754, + "grad_norm": 0.82421875, + "learning_rate": 2.660526053170842e-05, + "loss": 0.8421, + "step": 6591 + }, + { + "epoch": 0.2890144842044848, + "grad_norm": 0.85546875, + "learning_rate": 2.6602133719719668e-05, + "loss": 0.8038, + "step": 6592 + }, + { + "epoch": 0.289058327421142, + "grad_norm": 0.83984375, + "learning_rate": 2.6599007063293503e-05, + "loss": 0.7818, + "step": 6593 + }, + { + "epoch": 0.28910217063779925, + "grad_norm": 0.86328125, + "learning_rate": 2.6595880562436503e-05, + "loss": 0.8737, + "step": 6594 + }, + { + "epoch": 0.2891460138544565, + "grad_norm": 0.84765625, + "learning_rate": 2.659275421715538e-05, + "loss": 0.8811, + "step": 6595 + }, + { + "epoch": 0.2891898570711137, + "grad_norm": 0.81640625, + "learning_rate": 2.6589628027456716e-05, + "loss": 0.8366, + "step": 6596 + }, + { + "epoch": 0.28923370028777096, + "grad_norm": 0.9921875, + "learning_rate": 2.6586501993347135e-05, + "loss": 0.724, + "step": 6597 + }, + { + "epoch": 0.2892775435044282, + "grad_norm": 0.91015625, + "learning_rate": 2.658337611483327e-05, + "loss": 0.8297, + "step": 6598 + }, + { + "epoch": 0.28932138672108537, + "grad_norm": 0.78515625, + "learning_rate": 2.658025039192171e-05, + "loss": 0.7469, + "step": 6599 + }, + { + "epoch": 0.2893652299377426, + "grad_norm": 0.8046875, + "learning_rate": 2.657712482461915e-05, + "loss": 0.8378, + "step": 6600 + }, + { + "epoch": 0.28940907315439984, + "grad_norm": 0.79296875, + "learning_rate": 2.657399941293217e-05, + "loss": 0.6179, + "step": 6601 + }, + { + "epoch": 0.2894529163710571, + "grad_norm": 0.8046875, + "learning_rate": 2.6570874156867397e-05, + "loss": 0.8729, + "step": 6602 + }, + { + "epoch": 0.2894967595877143, + "grad_norm": 0.75, + "learning_rate": 2.6567749056431467e-05, + "loss": 0.7771, + "step": 6603 + }, + { + "epoch": 0.28954060280437155, + "grad_norm": 0.85546875, + "learning_rate": 2.6564624111630965e-05, + "loss": 0.9085, + "step": 6604 + }, + { + "epoch": 0.2895844460210288, + "grad_norm": 0.8203125, + "learning_rate": 2.6561499322472572e-05, + "loss": 0.8534, + "step": 6605 + }, + { + "epoch": 0.289628289237686, + "grad_norm": 0.7578125, + "learning_rate": 2.6558374688962896e-05, + "loss": 0.7234, + "step": 6606 + }, + { + "epoch": 0.28967213245434326, + "grad_norm": 0.78125, + "learning_rate": 2.6555250211108528e-05, + "loss": 0.7615, + "step": 6607 + }, + { + "epoch": 0.2897159756710005, + "grad_norm": 0.859375, + "learning_rate": 2.6552125888916126e-05, + "loss": 0.7531, + "step": 6608 + }, + { + "epoch": 0.2897598188876577, + "grad_norm": 0.94921875, + "learning_rate": 2.654900172239225e-05, + "loss": 0.7901, + "step": 6609 + }, + { + "epoch": 0.28980366210431496, + "grad_norm": 0.859375, + "learning_rate": 2.654587771154361e-05, + "loss": 0.7364, + "step": 6610 + }, + { + "epoch": 0.2898475053209722, + "grad_norm": 0.92578125, + "learning_rate": 2.654275385637678e-05, + "loss": 0.8142, + "step": 6611 + }, + { + "epoch": 0.28989134853762943, + "grad_norm": 0.8828125, + "learning_rate": 2.6539630156898377e-05, + "loss": 0.7549, + "step": 6612 + }, + { + "epoch": 0.28993519175428667, + "grad_norm": 0.79296875, + "learning_rate": 2.6536506613115043e-05, + "loss": 0.8227, + "step": 6613 + }, + { + "epoch": 0.2899790349709439, + "grad_norm": 0.875, + "learning_rate": 2.6533383225033336e-05, + "loss": 0.7783, + "step": 6614 + }, + { + "epoch": 0.29002287818760114, + "grad_norm": 0.83203125, + "learning_rate": 2.6530259992659968e-05, + "loss": 0.8814, + "step": 6615 + }, + { + "epoch": 0.2900667214042584, + "grad_norm": 0.90625, + "learning_rate": 2.6527136916001515e-05, + "loss": 0.8484, + "step": 6616 + }, + { + "epoch": 0.2901105646209156, + "grad_norm": 0.9296875, + "learning_rate": 2.652401399506459e-05, + "loss": 1.0586, + "step": 6617 + }, + { + "epoch": 0.29015440783757285, + "grad_norm": 0.87109375, + "learning_rate": 2.6520891229855815e-05, + "loss": 0.8205, + "step": 6618 + }, + { + "epoch": 0.2901982510542301, + "grad_norm": 0.8515625, + "learning_rate": 2.651776862038179e-05, + "loss": 0.8598, + "step": 6619 + }, + { + "epoch": 0.2902420942708873, + "grad_norm": 0.8046875, + "learning_rate": 2.6514646166649183e-05, + "loss": 0.7668, + "step": 6620 + }, + { + "epoch": 0.29028593748754455, + "grad_norm": 0.8515625, + "learning_rate": 2.6511523868664578e-05, + "loss": 0.7576, + "step": 6621 + }, + { + "epoch": 0.2903297807042018, + "grad_norm": 0.76953125, + "learning_rate": 2.6508401726434596e-05, + "loss": 0.9087, + "step": 6622 + }, + { + "epoch": 0.290373623920859, + "grad_norm": 0.75390625, + "learning_rate": 2.650527973996586e-05, + "loss": 0.8099, + "step": 6623 + }, + { + "epoch": 0.29041746713751626, + "grad_norm": 0.83984375, + "learning_rate": 2.6502157909264957e-05, + "loss": 0.9363, + "step": 6624 + }, + { + "epoch": 0.2904613103541735, + "grad_norm": 0.78125, + "learning_rate": 2.6499036234338546e-05, + "loss": 0.8264, + "step": 6625 + }, + { + "epoch": 0.29050515357083073, + "grad_norm": 0.78125, + "learning_rate": 2.6495914715193238e-05, + "loss": 0.8577, + "step": 6626 + }, + { + "epoch": 0.29054899678748797, + "grad_norm": 0.76953125, + "learning_rate": 2.6492793351835642e-05, + "loss": 0.8268, + "step": 6627 + }, + { + "epoch": 0.2905928400041452, + "grad_norm": 0.8046875, + "learning_rate": 2.648967214427236e-05, + "loss": 0.7577, + "step": 6628 + }, + { + "epoch": 0.2906366832208024, + "grad_norm": 0.890625, + "learning_rate": 2.6486551092510015e-05, + "loss": 0.8204, + "step": 6629 + }, + { + "epoch": 0.2906805264374596, + "grad_norm": 0.76171875, + "learning_rate": 2.6483430196555225e-05, + "loss": 0.7466, + "step": 6630 + }, + { + "epoch": 0.29072436965411685, + "grad_norm": 0.69140625, + "learning_rate": 2.648030945641461e-05, + "loss": 0.8232, + "step": 6631 + }, + { + "epoch": 0.2907682128707741, + "grad_norm": 0.765625, + "learning_rate": 2.647718887209477e-05, + "loss": 0.7292, + "step": 6632 + }, + { + "epoch": 0.2908120560874313, + "grad_norm": 0.79296875, + "learning_rate": 2.647406844360233e-05, + "loss": 0.7961, + "step": 6633 + }, + { + "epoch": 0.29085589930408856, + "grad_norm": 0.92578125, + "learning_rate": 2.6470948170943865e-05, + "loss": 0.8091, + "step": 6634 + }, + { + "epoch": 0.2908997425207458, + "grad_norm": 0.875, + "learning_rate": 2.6467828054126053e-05, + "loss": 0.8269, + "step": 6635 + }, + { + "epoch": 0.29094358573740303, + "grad_norm": 0.79296875, + "learning_rate": 2.6464708093155477e-05, + "loss": 0.7679, + "step": 6636 + }, + { + "epoch": 0.29098742895406027, + "grad_norm": 0.91015625, + "learning_rate": 2.6461588288038753e-05, + "loss": 0.8565, + "step": 6637 + }, + { + "epoch": 0.2910312721707175, + "grad_norm": 0.82421875, + "learning_rate": 2.6458468638782484e-05, + "loss": 0.7423, + "step": 6638 + }, + { + "epoch": 0.29107511538737474, + "grad_norm": 0.90234375, + "learning_rate": 2.645534914539326e-05, + "loss": 0.8206, + "step": 6639 + }, + { + "epoch": 0.291118958604032, + "grad_norm": 0.80078125, + "learning_rate": 2.6452229807877748e-05, + "loss": 0.8097, + "step": 6640 + }, + { + "epoch": 0.2911628018206892, + "grad_norm": 0.8984375, + "learning_rate": 2.6449110626242535e-05, + "loss": 0.8789, + "step": 6641 + }, + { + "epoch": 0.29120664503734645, + "grad_norm": 0.89453125, + "learning_rate": 2.6445991600494226e-05, + "loss": 0.8347, + "step": 6642 + }, + { + "epoch": 0.2912504882540037, + "grad_norm": 0.8828125, + "learning_rate": 2.6442872730639433e-05, + "loss": 0.8667, + "step": 6643 + }, + { + "epoch": 0.2912943314706609, + "grad_norm": 0.83203125, + "learning_rate": 2.6439754016684738e-05, + "loss": 0.6994, + "step": 6644 + }, + { + "epoch": 0.29133817468731815, + "grad_norm": 0.79296875, + "learning_rate": 2.6436635458636804e-05, + "loss": 0.7393, + "step": 6645 + }, + { + "epoch": 0.2913820179039754, + "grad_norm": 0.7734375, + "learning_rate": 2.643351705650222e-05, + "loss": 0.8275, + "step": 6646 + }, + { + "epoch": 0.2914258611206326, + "grad_norm": 0.91796875, + "learning_rate": 2.6430398810287592e-05, + "loss": 0.8334, + "step": 6647 + }, + { + "epoch": 0.29146970433728986, + "grad_norm": 0.8203125, + "learning_rate": 2.6427280719999536e-05, + "loss": 0.7047, + "step": 6648 + }, + { + "epoch": 0.2915135475539471, + "grad_norm": 0.93359375, + "learning_rate": 2.6424162785644613e-05, + "loss": 0.7656, + "step": 6649 + }, + { + "epoch": 0.29155739077060433, + "grad_norm": 0.76171875, + "learning_rate": 2.6421045007229507e-05, + "loss": 0.7438, + "step": 6650 + }, + { + "epoch": 0.29160123398726157, + "grad_norm": 0.85546875, + "learning_rate": 2.6417927384760787e-05, + "loss": 0.8763, + "step": 6651 + }, + { + "epoch": 0.2916450772039188, + "grad_norm": 0.88671875, + "learning_rate": 2.6414809918245065e-05, + "loss": 0.8359, + "step": 6652 + }, + { + "epoch": 0.29168892042057604, + "grad_norm": 0.77734375, + "learning_rate": 2.6411692607688953e-05, + "loss": 0.7525, + "step": 6653 + }, + { + "epoch": 0.29173276363723327, + "grad_norm": 0.890625, + "learning_rate": 2.6408575453099016e-05, + "loss": 0.9395, + "step": 6654 + }, + { + "epoch": 0.2917766068538905, + "grad_norm": 0.7890625, + "learning_rate": 2.6405458454481936e-05, + "loss": 0.7987, + "step": 6655 + }, + { + "epoch": 0.29182045007054774, + "grad_norm": 0.83984375, + "learning_rate": 2.6402341611844273e-05, + "loss": 0.7345, + "step": 6656 + }, + { + "epoch": 0.291864293287205, + "grad_norm": 0.7890625, + "learning_rate": 2.6399224925192633e-05, + "loss": 0.7512, + "step": 6657 + }, + { + "epoch": 0.2919081365038622, + "grad_norm": 0.875, + "learning_rate": 2.6396108394533637e-05, + "loss": 0.7561, + "step": 6658 + }, + { + "epoch": 0.29195197972051945, + "grad_norm": 0.8046875, + "learning_rate": 2.6392992019873854e-05, + "loss": 0.7944, + "step": 6659 + }, + { + "epoch": 0.29199582293717663, + "grad_norm": 0.859375, + "learning_rate": 2.638987580121994e-05, + "loss": 0.8368, + "step": 6660 + }, + { + "epoch": 0.29203966615383387, + "grad_norm": 0.828125, + "learning_rate": 2.6386759738578494e-05, + "loss": 0.8391, + "step": 6661 + }, + { + "epoch": 0.2920835093704911, + "grad_norm": 0.80859375, + "learning_rate": 2.6383643831956083e-05, + "loss": 0.7783, + "step": 6662 + }, + { + "epoch": 0.29212735258714834, + "grad_norm": 0.90234375, + "learning_rate": 2.6380528081359347e-05, + "loss": 0.8363, + "step": 6663 + }, + { + "epoch": 0.2921711958038056, + "grad_norm": 0.86328125, + "learning_rate": 2.6377412486794828e-05, + "loss": 0.7737, + "step": 6664 + }, + { + "epoch": 0.2922150390204628, + "grad_norm": 0.84765625, + "learning_rate": 2.6374297048269214e-05, + "loss": 0.7772, + "step": 6665 + }, + { + "epoch": 0.29225888223712004, + "grad_norm": 0.77734375, + "learning_rate": 2.6371181765789078e-05, + "loss": 0.7584, + "step": 6666 + }, + { + "epoch": 0.2923027254537773, + "grad_norm": 0.8984375, + "learning_rate": 2.6368066639361e-05, + "loss": 0.8299, + "step": 6667 + }, + { + "epoch": 0.2923465686704345, + "grad_norm": 0.90234375, + "learning_rate": 2.63649516689916e-05, + "loss": 0.8432, + "step": 6668 + }, + { + "epoch": 0.29239041188709175, + "grad_norm": 0.765625, + "learning_rate": 2.6361836854687483e-05, + "loss": 0.7477, + "step": 6669 + }, + { + "epoch": 0.292434255103749, + "grad_norm": 0.859375, + "learning_rate": 2.6358722196455234e-05, + "loss": 0.9207, + "step": 6670 + }, + { + "epoch": 0.2924780983204062, + "grad_norm": 0.86328125, + "learning_rate": 2.6355607694301467e-05, + "loss": 0.8351, + "step": 6671 + }, + { + "epoch": 0.29252194153706346, + "grad_norm": 0.8671875, + "learning_rate": 2.6352493348232754e-05, + "loss": 0.8506, + "step": 6672 + }, + { + "epoch": 0.2925657847537207, + "grad_norm": 0.81640625, + "learning_rate": 2.6349379158255748e-05, + "loss": 0.6997, + "step": 6673 + }, + { + "epoch": 0.29260962797037793, + "grad_norm": 0.8828125, + "learning_rate": 2.6346265124377023e-05, + "loss": 0.8599, + "step": 6674 + }, + { + "epoch": 0.29265347118703516, + "grad_norm": 0.8671875, + "learning_rate": 2.634315124660318e-05, + "loss": 0.7762, + "step": 6675 + }, + { + "epoch": 0.2926973144036924, + "grad_norm": 0.828125, + "learning_rate": 2.634003752494082e-05, + "loss": 0.7399, + "step": 6676 + }, + { + "epoch": 0.29274115762034963, + "grad_norm": 0.8359375, + "learning_rate": 2.633692395939654e-05, + "loss": 0.8673, + "step": 6677 + }, + { + "epoch": 0.29278500083700687, + "grad_norm": 0.8515625, + "learning_rate": 2.6333810549976933e-05, + "loss": 0.8437, + "step": 6678 + }, + { + "epoch": 0.2928288440536641, + "grad_norm": 0.859375, + "learning_rate": 2.633069729668861e-05, + "loss": 0.8378, + "step": 6679 + }, + { + "epoch": 0.29287268727032134, + "grad_norm": 0.828125, + "learning_rate": 2.6327584199538135e-05, + "loss": 0.884, + "step": 6680 + }, + { + "epoch": 0.2929165304869786, + "grad_norm": 0.8984375, + "learning_rate": 2.6324471258532158e-05, + "loss": 0.7634, + "step": 6681 + }, + { + "epoch": 0.2929603737036358, + "grad_norm": 0.8671875, + "learning_rate": 2.632135847367726e-05, + "loss": 0.826, + "step": 6682 + }, + { + "epoch": 0.29300421692029305, + "grad_norm": 0.83203125, + "learning_rate": 2.631824584498003e-05, + "loss": 0.7456, + "step": 6683 + }, + { + "epoch": 0.2930480601369503, + "grad_norm": 0.859375, + "learning_rate": 2.6315133372447063e-05, + "loss": 0.7628, + "step": 6684 + }, + { + "epoch": 0.2930919033536075, + "grad_norm": 0.83984375, + "learning_rate": 2.6312021056084934e-05, + "loss": 0.729, + "step": 6685 + }, + { + "epoch": 0.29313574657026475, + "grad_norm": 0.85546875, + "learning_rate": 2.6308908895900287e-05, + "loss": 0.9637, + "step": 6686 + }, + { + "epoch": 0.293179589786922, + "grad_norm": 0.8828125, + "learning_rate": 2.6305796891899704e-05, + "loss": 0.759, + "step": 6687 + }, + { + "epoch": 0.2932234330035792, + "grad_norm": 0.84375, + "learning_rate": 2.630268504408977e-05, + "loss": 0.776, + "step": 6688 + }, + { + "epoch": 0.29326727622023646, + "grad_norm": 1.34375, + "learning_rate": 2.629957335247708e-05, + "loss": 0.7343, + "step": 6689 + }, + { + "epoch": 0.29331111943689364, + "grad_norm": 0.8359375, + "learning_rate": 2.6296461817068196e-05, + "loss": 0.862, + "step": 6690 + }, + { + "epoch": 0.2933549626535509, + "grad_norm": 0.87890625, + "learning_rate": 2.629335043786979e-05, + "loss": 0.7947, + "step": 6691 + }, + { + "epoch": 0.2933988058702081, + "grad_norm": 0.79296875, + "learning_rate": 2.629023921488841e-05, + "loss": 0.775, + "step": 6692 + }, + { + "epoch": 0.29344264908686535, + "grad_norm": 0.8359375, + "learning_rate": 2.6287128148130658e-05, + "loss": 0.8543, + "step": 6693 + }, + { + "epoch": 0.2934864923035226, + "grad_norm": 0.859375, + "learning_rate": 2.628401723760312e-05, + "loss": 0.8305, + "step": 6694 + }, + { + "epoch": 0.2935303355201798, + "grad_norm": 0.81640625, + "learning_rate": 2.6280906483312362e-05, + "loss": 0.8028, + "step": 6695 + }, + { + "epoch": 0.29357417873683705, + "grad_norm": 0.83203125, + "learning_rate": 2.6277795885265044e-05, + "loss": 0.689, + "step": 6696 + }, + { + "epoch": 0.2936180219534943, + "grad_norm": 0.80078125, + "learning_rate": 2.6274685443467728e-05, + "loss": 0.7696, + "step": 6697 + }, + { + "epoch": 0.2936618651701515, + "grad_norm": 0.91015625, + "learning_rate": 2.6271575157927e-05, + "loss": 0.8488, + "step": 6698 + }, + { + "epoch": 0.29370570838680876, + "grad_norm": 0.875, + "learning_rate": 2.6268465028649448e-05, + "loss": 0.7196, + "step": 6699 + }, + { + "epoch": 0.293749551603466, + "grad_norm": 0.8046875, + "learning_rate": 2.626535505564164e-05, + "loss": 0.7696, + "step": 6700 + }, + { + "epoch": 0.29379339482012323, + "grad_norm": 0.7734375, + "learning_rate": 2.626224523891023e-05, + "loss": 0.7245, + "step": 6701 + }, + { + "epoch": 0.29383723803678047, + "grad_norm": 1.0625, + "learning_rate": 2.625913557846178e-05, + "loss": 0.7894, + "step": 6702 + }, + { + "epoch": 0.2938810812534377, + "grad_norm": 0.94921875, + "learning_rate": 2.6256026074302874e-05, + "loss": 0.7878, + "step": 6703 + }, + { + "epoch": 0.29392492447009494, + "grad_norm": 0.8125, + "learning_rate": 2.6252916726440102e-05, + "loss": 0.7406, + "step": 6704 + }, + { + "epoch": 0.2939687676867522, + "grad_norm": 0.859375, + "learning_rate": 2.6249807534880032e-05, + "loss": 0.8574, + "step": 6705 + }, + { + "epoch": 0.2940126109034094, + "grad_norm": 0.85546875, + "learning_rate": 2.6246698499629318e-05, + "loss": 0.7058, + "step": 6706 + }, + { + "epoch": 0.29405645412006665, + "grad_norm": 0.88671875, + "learning_rate": 2.62435896206945e-05, + "loss": 0.7649, + "step": 6707 + }, + { + "epoch": 0.2941002973367239, + "grad_norm": 0.875, + "learning_rate": 2.6240480898082186e-05, + "loss": 0.8462, + "step": 6708 + }, + { + "epoch": 0.2941441405533811, + "grad_norm": 0.7890625, + "learning_rate": 2.6237372331798947e-05, + "loss": 0.7166, + "step": 6709 + }, + { + "epoch": 0.29418798377003835, + "grad_norm": 0.87890625, + "learning_rate": 2.6234263921851353e-05, + "loss": 0.8621, + "step": 6710 + }, + { + "epoch": 0.2942318269866956, + "grad_norm": 0.8515625, + "learning_rate": 2.6231155668246055e-05, + "loss": 0.8229, + "step": 6711 + }, + { + "epoch": 0.2942756702033528, + "grad_norm": 0.80859375, + "learning_rate": 2.622804757098961e-05, + "loss": 0.7418, + "step": 6712 + }, + { + "epoch": 0.29431951342001006, + "grad_norm": 0.828125, + "learning_rate": 2.6224939630088596e-05, + "loss": 0.8288, + "step": 6713 + }, + { + "epoch": 0.2943633566366673, + "grad_norm": 0.859375, + "learning_rate": 2.6221831845549605e-05, + "loss": 0.7654, + "step": 6714 + }, + { + "epoch": 0.29440719985332453, + "grad_norm": 0.859375, + "learning_rate": 2.621872421737923e-05, + "loss": 0.8718, + "step": 6715 + }, + { + "epoch": 0.29445104306998177, + "grad_norm": 0.7578125, + "learning_rate": 2.621561674558405e-05, + "loss": 0.8548, + "step": 6716 + }, + { + "epoch": 0.294494886286639, + "grad_norm": 0.78515625, + "learning_rate": 2.621250943017064e-05, + "loss": 0.819, + "step": 6717 + }, + { + "epoch": 0.29453872950329624, + "grad_norm": 0.72265625, + "learning_rate": 2.620940227114561e-05, + "loss": 0.7578, + "step": 6718 + }, + { + "epoch": 0.2945825727199535, + "grad_norm": 0.82421875, + "learning_rate": 2.6206295268515536e-05, + "loss": 0.7992, + "step": 6719 + }, + { + "epoch": 0.29462641593661065, + "grad_norm": 0.82421875, + "learning_rate": 2.6203188422286972e-05, + "loss": 0.7807, + "step": 6720 + }, + { + "epoch": 0.2946702591532679, + "grad_norm": 0.875, + "learning_rate": 2.6200081732466552e-05, + "loss": 0.8776, + "step": 6721 + }, + { + "epoch": 0.2947141023699251, + "grad_norm": 0.87890625, + "learning_rate": 2.6196975199060845e-05, + "loss": 0.7641, + "step": 6722 + }, + { + "epoch": 0.29475794558658236, + "grad_norm": 0.78515625, + "learning_rate": 2.619386882207644e-05, + "loss": 0.8115, + "step": 6723 + }, + { + "epoch": 0.2948017888032396, + "grad_norm": 0.859375, + "learning_rate": 2.61907626015199e-05, + "loss": 0.8737, + "step": 6724 + }, + { + "epoch": 0.29484563201989683, + "grad_norm": 0.890625, + "learning_rate": 2.6187656537397797e-05, + "loss": 0.7945, + "step": 6725 + }, + { + "epoch": 0.29488947523655407, + "grad_norm": 0.828125, + "learning_rate": 2.6184550629716765e-05, + "loss": 0.845, + "step": 6726 + }, + { + "epoch": 0.2949333184532113, + "grad_norm": 0.80859375, + "learning_rate": 2.6181444878483362e-05, + "loss": 0.7318, + "step": 6727 + }, + { + "epoch": 0.29497716166986854, + "grad_norm": 0.69921875, + "learning_rate": 2.6178339283704168e-05, + "loss": 0.6291, + "step": 6728 + }, + { + "epoch": 0.2950210048865258, + "grad_norm": 0.85546875, + "learning_rate": 2.617523384538576e-05, + "loss": 0.786, + "step": 6729 + }, + { + "epoch": 0.295064848103183, + "grad_norm": 0.7421875, + "learning_rate": 2.6172128563534692e-05, + "loss": 0.6974, + "step": 6730 + }, + { + "epoch": 0.29510869131984024, + "grad_norm": 0.890625, + "learning_rate": 2.6169023438157603e-05, + "loss": 0.8132, + "step": 6731 + }, + { + "epoch": 0.2951525345364975, + "grad_norm": 0.9296875, + "learning_rate": 2.6165918469261063e-05, + "loss": 0.7607, + "step": 6732 + }, + { + "epoch": 0.2951963777531547, + "grad_norm": 0.78515625, + "learning_rate": 2.6162813656851626e-05, + "loss": 0.7949, + "step": 6733 + }, + { + "epoch": 0.29524022096981195, + "grad_norm": 0.83203125, + "learning_rate": 2.6159709000935894e-05, + "loss": 0.7797, + "step": 6734 + }, + { + "epoch": 0.2952840641864692, + "grad_norm": 0.7265625, + "learning_rate": 2.615660450152041e-05, + "loss": 0.7685, + "step": 6735 + }, + { + "epoch": 0.2953279074031264, + "grad_norm": 0.8203125, + "learning_rate": 2.6153500158611798e-05, + "loss": 0.787, + "step": 6736 + }, + { + "epoch": 0.29537175061978366, + "grad_norm": 0.7734375, + "learning_rate": 2.6150395972216634e-05, + "loss": 0.7313, + "step": 6737 + }, + { + "epoch": 0.2954155938364409, + "grad_norm": 0.78125, + "learning_rate": 2.614729194234148e-05, + "loss": 0.7128, + "step": 6738 + }, + { + "epoch": 0.29545943705309813, + "grad_norm": 0.73828125, + "learning_rate": 2.6144188068992924e-05, + "loss": 0.7038, + "step": 6739 + }, + { + "epoch": 0.29550328026975536, + "grad_norm": 0.8046875, + "learning_rate": 2.6141084352177505e-05, + "loss": 0.8966, + "step": 6740 + }, + { + "epoch": 0.2955471234864126, + "grad_norm": 0.90234375, + "learning_rate": 2.613798079190187e-05, + "loss": 0.7937, + "step": 6741 + }, + { + "epoch": 0.29559096670306984, + "grad_norm": 1.3671875, + "learning_rate": 2.6134877388172563e-05, + "loss": 0.7601, + "step": 6742 + }, + { + "epoch": 0.29563480991972707, + "grad_norm": 0.77734375, + "learning_rate": 2.6131774140996157e-05, + "loss": 0.7929, + "step": 6743 + }, + { + "epoch": 0.2956786531363843, + "grad_norm": 0.7578125, + "learning_rate": 2.612867105037924e-05, + "loss": 0.6931, + "step": 6744 + }, + { + "epoch": 0.29572249635304154, + "grad_norm": 0.82421875, + "learning_rate": 2.6125568116328346e-05, + "loss": 0.864, + "step": 6745 + }, + { + "epoch": 0.2957663395696988, + "grad_norm": 0.859375, + "learning_rate": 2.6122465338850123e-05, + "loss": 0.8126, + "step": 6746 + }, + { + "epoch": 0.295810182786356, + "grad_norm": 0.73828125, + "learning_rate": 2.611936271795111e-05, + "loss": 0.7247, + "step": 6747 + }, + { + "epoch": 0.29585402600301325, + "grad_norm": 0.86328125, + "learning_rate": 2.611626025363788e-05, + "loss": 0.9049, + "step": 6748 + }, + { + "epoch": 0.2958978692196705, + "grad_norm": 0.8203125, + "learning_rate": 2.6113157945917022e-05, + "loss": 0.7697, + "step": 6749 + }, + { + "epoch": 0.2959417124363277, + "grad_norm": 0.921875, + "learning_rate": 2.6110055794795063e-05, + "loss": 1.0058, + "step": 6750 + }, + { + "epoch": 0.2959855556529849, + "grad_norm": 0.80078125, + "learning_rate": 2.6106953800278654e-05, + "loss": 0.6941, + "step": 6751 + }, + { + "epoch": 0.29602939886964214, + "grad_norm": 0.828125, + "learning_rate": 2.6103851962374337e-05, + "loss": 0.7604, + "step": 6752 + }, + { + "epoch": 0.29607324208629937, + "grad_norm": 0.83203125, + "learning_rate": 2.6100750281088672e-05, + "loss": 0.757, + "step": 6753 + }, + { + "epoch": 0.2961170853029566, + "grad_norm": 0.87890625, + "learning_rate": 2.609764875642825e-05, + "loss": 0.829, + "step": 6754 + }, + { + "epoch": 0.29616092851961384, + "grad_norm": 0.7265625, + "learning_rate": 2.6094547388399637e-05, + "loss": 0.7849, + "step": 6755 + }, + { + "epoch": 0.2962047717362711, + "grad_norm": 0.78125, + "learning_rate": 2.6091446177009403e-05, + "loss": 0.7478, + "step": 6756 + }, + { + "epoch": 0.2962486149529283, + "grad_norm": 0.75390625, + "learning_rate": 2.6088345122264092e-05, + "loss": 0.6867, + "step": 6757 + }, + { + "epoch": 0.29629245816958555, + "grad_norm": 0.84375, + "learning_rate": 2.6085244224170345e-05, + "loss": 0.7576, + "step": 6758 + }, + { + "epoch": 0.2963363013862428, + "grad_norm": 0.80078125, + "learning_rate": 2.60821434827347e-05, + "loss": 0.6858, + "step": 6759 + }, + { + "epoch": 0.2963801446029, + "grad_norm": 1.3515625, + "learning_rate": 2.6079042897963724e-05, + "loss": 0.8319, + "step": 6760 + }, + { + "epoch": 0.29642398781955726, + "grad_norm": 0.78515625, + "learning_rate": 2.607594246986399e-05, + "loss": 0.7103, + "step": 6761 + }, + { + "epoch": 0.2964678310362145, + "grad_norm": 0.890625, + "learning_rate": 2.6072842198442073e-05, + "loss": 0.8768, + "step": 6762 + }, + { + "epoch": 0.2965116742528717, + "grad_norm": 0.859375, + "learning_rate": 2.606974208370454e-05, + "loss": 0.8037, + "step": 6763 + }, + { + "epoch": 0.29655551746952896, + "grad_norm": 0.8125, + "learning_rate": 2.6066642125657958e-05, + "loss": 0.7597, + "step": 6764 + }, + { + "epoch": 0.2965993606861862, + "grad_norm": 0.7890625, + "learning_rate": 2.6063542324308875e-05, + "loss": 0.7956, + "step": 6765 + }, + { + "epoch": 0.29664320390284343, + "grad_norm": 0.84375, + "learning_rate": 2.606044267966392e-05, + "loss": 0.8069, + "step": 6766 + }, + { + "epoch": 0.29668704711950067, + "grad_norm": 0.75390625, + "learning_rate": 2.6057343191729634e-05, + "loss": 0.7534, + "step": 6767 + }, + { + "epoch": 0.2967308903361579, + "grad_norm": 0.77734375, + "learning_rate": 2.6054243860512585e-05, + "loss": 0.7942, + "step": 6768 + }, + { + "epoch": 0.29677473355281514, + "grad_norm": 0.8828125, + "learning_rate": 2.6051144686019325e-05, + "loss": 0.8797, + "step": 6769 + }, + { + "epoch": 0.2968185767694724, + "grad_norm": 0.828125, + "learning_rate": 2.6048045668256417e-05, + "loss": 0.8257, + "step": 6770 + }, + { + "epoch": 0.2968624199861296, + "grad_norm": 0.859375, + "learning_rate": 2.6044946807230474e-05, + "loss": 0.763, + "step": 6771 + }, + { + "epoch": 0.29690626320278685, + "grad_norm": 0.890625, + "learning_rate": 2.6041848102948052e-05, + "loss": 0.8284, + "step": 6772 + }, + { + "epoch": 0.2969501064194441, + "grad_norm": 0.953125, + "learning_rate": 2.6038749555415698e-05, + "loss": 0.8166, + "step": 6773 + }, + { + "epoch": 0.2969939496361013, + "grad_norm": 0.796875, + "learning_rate": 2.6035651164639984e-05, + "loss": 0.7544, + "step": 6774 + }, + { + "epoch": 0.29703779285275855, + "grad_norm": 1.1015625, + "learning_rate": 2.6032552930627453e-05, + "loss": 0.7361, + "step": 6775 + }, + { + "epoch": 0.2970816360694158, + "grad_norm": 0.7265625, + "learning_rate": 2.6029454853384728e-05, + "loss": 0.7799, + "step": 6776 + }, + { + "epoch": 0.297125479286073, + "grad_norm": 0.90625, + "learning_rate": 2.602635693291834e-05, + "loss": 0.7633, + "step": 6777 + }, + { + "epoch": 0.29716932250273026, + "grad_norm": 0.875, + "learning_rate": 2.6023259169234872e-05, + "loss": 0.8807, + "step": 6778 + }, + { + "epoch": 0.2972131657193875, + "grad_norm": 0.8125, + "learning_rate": 2.6020161562340863e-05, + "loss": 0.8082, + "step": 6779 + }, + { + "epoch": 0.29725700893604473, + "grad_norm": 0.8515625, + "learning_rate": 2.6017064112242874e-05, + "loss": 0.9007, + "step": 6780 + }, + { + "epoch": 0.2973008521527019, + "grad_norm": 0.83984375, + "learning_rate": 2.601396681894751e-05, + "loss": 0.896, + "step": 6781 + }, + { + "epoch": 0.29734469536935915, + "grad_norm": 0.83984375, + "learning_rate": 2.6010869682461324e-05, + "loss": 0.7986, + "step": 6782 + }, + { + "epoch": 0.2973885385860164, + "grad_norm": 0.94921875, + "learning_rate": 2.600777270279087e-05, + "loss": 0.7746, + "step": 6783 + }, + { + "epoch": 0.2974323818026736, + "grad_norm": 0.80078125, + "learning_rate": 2.600467587994271e-05, + "loss": 0.7642, + "step": 6784 + }, + { + "epoch": 0.29747622501933085, + "grad_norm": 0.8046875, + "learning_rate": 2.6001579213923376e-05, + "loss": 0.8071, + "step": 6785 + }, + { + "epoch": 0.2975200682359881, + "grad_norm": 0.8359375, + "learning_rate": 2.5998482704739502e-05, + "loss": 0.7396, + "step": 6786 + }, + { + "epoch": 0.2975639114526453, + "grad_norm": 0.7890625, + "learning_rate": 2.599538635239761e-05, + "loss": 0.8583, + "step": 6787 + }, + { + "epoch": 0.29760775466930256, + "grad_norm": 0.890625, + "learning_rate": 2.5992290156904266e-05, + "loss": 0.8992, + "step": 6788 + }, + { + "epoch": 0.2976515978859598, + "grad_norm": 0.8046875, + "learning_rate": 2.5989194118266035e-05, + "loss": 0.8905, + "step": 6789 + }, + { + "epoch": 0.29769544110261703, + "grad_norm": 0.87109375, + "learning_rate": 2.5986098236489454e-05, + "loss": 0.7297, + "step": 6790 + }, + { + "epoch": 0.29773928431927427, + "grad_norm": 0.92578125, + "learning_rate": 2.5983002511581124e-05, + "loss": 0.7897, + "step": 6791 + }, + { + "epoch": 0.2977831275359315, + "grad_norm": 0.8828125, + "learning_rate": 2.5979906943547593e-05, + "loss": 0.8616, + "step": 6792 + }, + { + "epoch": 0.29782697075258874, + "grad_norm": 0.796875, + "learning_rate": 2.597681153239543e-05, + "loss": 0.7761, + "step": 6793 + }, + { + "epoch": 0.297870813969246, + "grad_norm": 0.7734375, + "learning_rate": 2.5973716278131165e-05, + "loss": 0.7352, + "step": 6794 + }, + { + "epoch": 0.2979146571859032, + "grad_norm": 0.79296875, + "learning_rate": 2.5970621180761367e-05, + "loss": 0.7353, + "step": 6795 + }, + { + "epoch": 0.29795850040256044, + "grad_norm": 0.79296875, + "learning_rate": 2.5967526240292628e-05, + "loss": 0.7375, + "step": 6796 + }, + { + "epoch": 0.2980023436192177, + "grad_norm": 0.7578125, + "learning_rate": 2.5964431456731475e-05, + "loss": 0.7694, + "step": 6797 + }, + { + "epoch": 0.2980461868358749, + "grad_norm": 0.765625, + "learning_rate": 2.5961336830084492e-05, + "loss": 0.7748, + "step": 6798 + }, + { + "epoch": 0.29809003005253215, + "grad_norm": 0.85546875, + "learning_rate": 2.595824236035822e-05, + "loss": 0.9404, + "step": 6799 + }, + { + "epoch": 0.2981338732691894, + "grad_norm": 0.87109375, + "learning_rate": 2.595514804755922e-05, + "loss": 0.8202, + "step": 6800 + }, + { + "epoch": 0.2981777164858466, + "grad_norm": 0.74609375, + "learning_rate": 2.595205389169405e-05, + "loss": 0.7008, + "step": 6801 + }, + { + "epoch": 0.29822155970250386, + "grad_norm": 0.82421875, + "learning_rate": 2.5948959892769263e-05, + "loss": 0.8823, + "step": 6802 + }, + { + "epoch": 0.2982654029191611, + "grad_norm": 0.8671875, + "learning_rate": 2.5945866050791435e-05, + "loss": 0.8308, + "step": 6803 + }, + { + "epoch": 0.29830924613581833, + "grad_norm": 0.8671875, + "learning_rate": 2.594277236576711e-05, + "loss": 0.9897, + "step": 6804 + }, + { + "epoch": 0.29835308935247556, + "grad_norm": 0.83984375, + "learning_rate": 2.5939678837702806e-05, + "loss": 0.8511, + "step": 6805 + }, + { + "epoch": 0.2983969325691328, + "grad_norm": 0.94140625, + "learning_rate": 2.593658546660517e-05, + "loss": 0.9863, + "step": 6806 + }, + { + "epoch": 0.29844077578579004, + "grad_norm": 0.890625, + "learning_rate": 2.5933492252480684e-05, + "loss": 0.7605, + "step": 6807 + }, + { + "epoch": 0.29848461900244727, + "grad_norm": 0.828125, + "learning_rate": 2.593039919533594e-05, + "loss": 0.7606, + "step": 6808 + }, + { + "epoch": 0.2985284622191045, + "grad_norm": 0.85546875, + "learning_rate": 2.592730629517748e-05, + "loss": 0.9828, + "step": 6809 + }, + { + "epoch": 0.29857230543576174, + "grad_norm": 0.83984375, + "learning_rate": 2.5924213552011833e-05, + "loss": 0.8695, + "step": 6810 + }, + { + "epoch": 0.2986161486524189, + "grad_norm": 0.765625, + "learning_rate": 2.5921120965845615e-05, + "loss": 0.8086, + "step": 6811 + }, + { + "epoch": 0.29865999186907616, + "grad_norm": 0.82421875, + "learning_rate": 2.5918028536685335e-05, + "loss": 0.9166, + "step": 6812 + }, + { + "epoch": 0.2987038350857334, + "grad_norm": 0.82421875, + "learning_rate": 2.5914936264537558e-05, + "loss": 0.7938, + "step": 6813 + }, + { + "epoch": 0.29874767830239063, + "grad_norm": 0.8984375, + "learning_rate": 2.5911844149408848e-05, + "loss": 0.8408, + "step": 6814 + }, + { + "epoch": 0.29879152151904786, + "grad_norm": 0.83203125, + "learning_rate": 2.590875219130572e-05, + "loss": 0.8608, + "step": 6815 + }, + { + "epoch": 0.2988353647357051, + "grad_norm": 0.8984375, + "learning_rate": 2.5905660390234786e-05, + "loss": 0.7723, + "step": 6816 + }, + { + "epoch": 0.29887920795236234, + "grad_norm": 0.82421875, + "learning_rate": 2.5902568746202573e-05, + "loss": 0.7895, + "step": 6817 + }, + { + "epoch": 0.29892305116901957, + "grad_norm": 0.92578125, + "learning_rate": 2.589947725921562e-05, + "loss": 0.766, + "step": 6818 + }, + { + "epoch": 0.2989668943856768, + "grad_norm": 0.77734375, + "learning_rate": 2.5896385929280498e-05, + "loss": 0.7213, + "step": 6819 + }, + { + "epoch": 0.29901073760233404, + "grad_norm": 0.87890625, + "learning_rate": 2.589329475640374e-05, + "loss": 0.7635, + "step": 6820 + }, + { + "epoch": 0.2990545808189913, + "grad_norm": 0.8203125, + "learning_rate": 2.589020374059189e-05, + "loss": 0.8653, + "step": 6821 + }, + { + "epoch": 0.2990984240356485, + "grad_norm": 0.9140625, + "learning_rate": 2.588711288185154e-05, + "loss": 0.8766, + "step": 6822 + }, + { + "epoch": 0.29914226725230575, + "grad_norm": 0.890625, + "learning_rate": 2.5884022180189226e-05, + "loss": 0.8084, + "step": 6823 + }, + { + "epoch": 0.299186110468963, + "grad_norm": 0.7734375, + "learning_rate": 2.5880931635611484e-05, + "loss": 0.7749, + "step": 6824 + }, + { + "epoch": 0.2992299536856202, + "grad_norm": 0.84765625, + "learning_rate": 2.5877841248124868e-05, + "loss": 0.8056, + "step": 6825 + }, + { + "epoch": 0.29927379690227746, + "grad_norm": 0.921875, + "learning_rate": 2.587475101773591e-05, + "loss": 0.8386, + "step": 6826 + }, + { + "epoch": 0.2993176401189347, + "grad_norm": 0.8125, + "learning_rate": 2.5871660944451205e-05, + "loss": 0.9133, + "step": 6827 + }, + { + "epoch": 0.2993614833355919, + "grad_norm": 0.84375, + "learning_rate": 2.586857102827728e-05, + "loss": 0.9799, + "step": 6828 + }, + { + "epoch": 0.29940532655224916, + "grad_norm": 0.796875, + "learning_rate": 2.586548126922068e-05, + "loss": 0.8414, + "step": 6829 + }, + { + "epoch": 0.2994491697689064, + "grad_norm": 0.7578125, + "learning_rate": 2.5862391667287955e-05, + "loss": 0.7964, + "step": 6830 + }, + { + "epoch": 0.29949301298556363, + "grad_norm": 0.81640625, + "learning_rate": 2.5859302222485625e-05, + "loss": 0.7433, + "step": 6831 + }, + { + "epoch": 0.29953685620222087, + "grad_norm": 0.80859375, + "learning_rate": 2.585621293482029e-05, + "loss": 0.8099, + "step": 6832 + }, + { + "epoch": 0.2995806994188781, + "grad_norm": 0.875, + "learning_rate": 2.5853123804298486e-05, + "loss": 0.7815, + "step": 6833 + }, + { + "epoch": 0.29962454263553534, + "grad_norm": 0.7890625, + "learning_rate": 2.585003483092674e-05, + "loss": 0.7712, + "step": 6834 + }, + { + "epoch": 0.2996683858521926, + "grad_norm": 0.83984375, + "learning_rate": 2.5846946014711604e-05, + "loss": 0.7945, + "step": 6835 + }, + { + "epoch": 0.2997122290688498, + "grad_norm": 0.80078125, + "learning_rate": 2.5843857355659608e-05, + "loss": 0.7773, + "step": 6836 + }, + { + "epoch": 0.29975607228550705, + "grad_norm": 0.90625, + "learning_rate": 2.584076885377733e-05, + "loss": 0.8189, + "step": 6837 + }, + { + "epoch": 0.2997999155021643, + "grad_norm": 0.86328125, + "learning_rate": 2.5837680509071315e-05, + "loss": 0.8678, + "step": 6838 + }, + { + "epoch": 0.2998437587188215, + "grad_norm": 0.828125, + "learning_rate": 2.5834592321548103e-05, + "loss": 0.7818, + "step": 6839 + }, + { + "epoch": 0.29988760193547875, + "grad_norm": 0.84375, + "learning_rate": 2.5831504291214226e-05, + "loss": 0.8462, + "step": 6840 + }, + { + "epoch": 0.299931445152136, + "grad_norm": 0.828125, + "learning_rate": 2.582841641807624e-05, + "loss": 0.6607, + "step": 6841 + }, + { + "epoch": 0.29997528836879317, + "grad_norm": 0.75390625, + "learning_rate": 2.5825328702140682e-05, + "loss": 0.857, + "step": 6842 + }, + { + "epoch": 0.3000191315854504, + "grad_norm": 0.78125, + "learning_rate": 2.5822241143414073e-05, + "loss": 0.823, + "step": 6843 + }, + { + "epoch": 0.30006297480210764, + "grad_norm": 0.91015625, + "learning_rate": 2.5819153741903012e-05, + "loss": 0.8488, + "step": 6844 + }, + { + "epoch": 0.3001068180187649, + "grad_norm": 0.8203125, + "learning_rate": 2.581606649761401e-05, + "loss": 0.8133, + "step": 6845 + }, + { + "epoch": 0.3001506612354221, + "grad_norm": 0.765625, + "learning_rate": 2.5812979410553628e-05, + "loss": 0.7987, + "step": 6846 + }, + { + "epoch": 0.30019450445207935, + "grad_norm": 0.765625, + "learning_rate": 2.5809892480728382e-05, + "loss": 0.6685, + "step": 6847 + }, + { + "epoch": 0.3002383476687366, + "grad_norm": 1.03125, + "learning_rate": 2.5806805708144842e-05, + "loss": 0.9171, + "step": 6848 + }, + { + "epoch": 0.3002821908853938, + "grad_norm": 0.77734375, + "learning_rate": 2.580371909280953e-05, + "loss": 0.7495, + "step": 6849 + }, + { + "epoch": 0.30032603410205105, + "grad_norm": 0.7890625, + "learning_rate": 2.5800632634728993e-05, + "loss": 0.7617, + "step": 6850 + }, + { + "epoch": 0.3003698773187083, + "grad_norm": 0.75, + "learning_rate": 2.579754633390974e-05, + "loss": 0.8526, + "step": 6851 + }, + { + "epoch": 0.3004137205353655, + "grad_norm": 0.80078125, + "learning_rate": 2.5794460190358382e-05, + "loss": 0.7533, + "step": 6852 + }, + { + "epoch": 0.30045756375202276, + "grad_norm": 0.80859375, + "learning_rate": 2.5791374204081432e-05, + "loss": 0.8539, + "step": 6853 + }, + { + "epoch": 0.30050140696868, + "grad_norm": 0.84765625, + "learning_rate": 2.5788288375085412e-05, + "loss": 0.7644, + "step": 6854 + }, + { + "epoch": 0.30054525018533723, + "grad_norm": 0.765625, + "learning_rate": 2.5785202703376875e-05, + "loss": 0.865, + "step": 6855 + }, + { + "epoch": 0.30058909340199447, + "grad_norm": 0.8125, + "learning_rate": 2.578211718896233e-05, + "loss": 0.8026, + "step": 6856 + }, + { + "epoch": 0.3006329366186517, + "grad_norm": 0.84375, + "learning_rate": 2.5779031831848377e-05, + "loss": 0.8891, + "step": 6857 + }, + { + "epoch": 0.30067677983530894, + "grad_norm": 0.80078125, + "learning_rate": 2.5775946632041515e-05, + "loss": 0.7698, + "step": 6858 + }, + { + "epoch": 0.3007206230519662, + "grad_norm": 0.78125, + "learning_rate": 2.5772861589548303e-05, + "loss": 0.7445, + "step": 6859 + }, + { + "epoch": 0.3007644662686234, + "grad_norm": 0.921875, + "learning_rate": 2.5769776704375258e-05, + "loss": 0.8532, + "step": 6860 + }, + { + "epoch": 0.30080830948528064, + "grad_norm": 0.89453125, + "learning_rate": 2.576669197652891e-05, + "loss": 0.7737, + "step": 6861 + }, + { + "epoch": 0.3008521527019379, + "grad_norm": 0.8671875, + "learning_rate": 2.5763607406015846e-05, + "loss": 0.7928, + "step": 6862 + }, + { + "epoch": 0.3008959959185951, + "grad_norm": 0.84765625, + "learning_rate": 2.5760522992842563e-05, + "loss": 0.7565, + "step": 6863 + }, + { + "epoch": 0.30093983913525235, + "grad_norm": 0.84765625, + "learning_rate": 2.5757438737015604e-05, + "loss": 0.8976, + "step": 6864 + }, + { + "epoch": 0.3009836823519096, + "grad_norm": 0.859375, + "learning_rate": 2.5754354638541523e-05, + "loss": 0.7611, + "step": 6865 + }, + { + "epoch": 0.3010275255685668, + "grad_norm": 0.85546875, + "learning_rate": 2.575127069742681e-05, + "loss": 0.7504, + "step": 6866 + }, + { + "epoch": 0.30107136878522406, + "grad_norm": 0.76171875, + "learning_rate": 2.5748186913678062e-05, + "loss": 0.6903, + "step": 6867 + }, + { + "epoch": 0.3011152120018813, + "grad_norm": 0.86328125, + "learning_rate": 2.5745103287301785e-05, + "loss": 0.7926, + "step": 6868 + }, + { + "epoch": 0.30115905521853853, + "grad_norm": 0.82421875, + "learning_rate": 2.574201981830453e-05, + "loss": 0.8333, + "step": 6869 + }, + { + "epoch": 0.30120289843519577, + "grad_norm": 0.96875, + "learning_rate": 2.5738936506692802e-05, + "loss": 0.76, + "step": 6870 + }, + { + "epoch": 0.301246741651853, + "grad_norm": 0.90234375, + "learning_rate": 2.5735853352473137e-05, + "loss": 0.836, + "step": 6871 + }, + { + "epoch": 0.3012905848685102, + "grad_norm": 0.828125, + "learning_rate": 2.5732770355652113e-05, + "loss": 0.7491, + "step": 6872 + }, + { + "epoch": 0.3013344280851674, + "grad_norm": 0.87109375, + "learning_rate": 2.5729687516236234e-05, + "loss": 0.7729, + "step": 6873 + }, + { + "epoch": 0.30137827130182465, + "grad_norm": 0.88671875, + "learning_rate": 2.5726604834232036e-05, + "loss": 0.889, + "step": 6874 + }, + { + "epoch": 0.3014221145184819, + "grad_norm": 0.71875, + "learning_rate": 2.5723522309646066e-05, + "loss": 0.6739, + "step": 6875 + }, + { + "epoch": 0.3014659577351391, + "grad_norm": 0.8671875, + "learning_rate": 2.5720439942484807e-05, + "loss": 0.7841, + "step": 6876 + }, + { + "epoch": 0.30150980095179636, + "grad_norm": 0.7890625, + "learning_rate": 2.5717357732754855e-05, + "loss": 0.7803, + "step": 6877 + }, + { + "epoch": 0.3015536441684536, + "grad_norm": 0.91796875, + "learning_rate": 2.571427568046272e-05, + "loss": 0.9205, + "step": 6878 + }, + { + "epoch": 0.30159748738511083, + "grad_norm": 0.83984375, + "learning_rate": 2.571119378561494e-05, + "loss": 0.7605, + "step": 6879 + }, + { + "epoch": 0.30164133060176807, + "grad_norm": 0.796875, + "learning_rate": 2.5708112048218026e-05, + "loss": 0.9313, + "step": 6880 + }, + { + "epoch": 0.3016851738184253, + "grad_norm": 0.828125, + "learning_rate": 2.5705030468278503e-05, + "loss": 0.7141, + "step": 6881 + }, + { + "epoch": 0.30172901703508254, + "grad_norm": 0.82421875, + "learning_rate": 2.5701949045802943e-05, + "loss": 0.768, + "step": 6882 + }, + { + "epoch": 0.30177286025173977, + "grad_norm": 0.828125, + "learning_rate": 2.5698867780797863e-05, + "loss": 0.7069, + "step": 6883 + }, + { + "epoch": 0.301816703468397, + "grad_norm": 0.83984375, + "learning_rate": 2.5695786673269783e-05, + "loss": 0.8913, + "step": 6884 + }, + { + "epoch": 0.30186054668505424, + "grad_norm": 0.80859375, + "learning_rate": 2.569270572322523e-05, + "loss": 0.6854, + "step": 6885 + }, + { + "epoch": 0.3019043899017115, + "grad_norm": 0.78125, + "learning_rate": 2.568962493067074e-05, + "loss": 0.8102, + "step": 6886 + }, + { + "epoch": 0.3019482331183687, + "grad_norm": 0.90625, + "learning_rate": 2.5686544295612858e-05, + "loss": 0.8147, + "step": 6887 + }, + { + "epoch": 0.30199207633502595, + "grad_norm": 0.76953125, + "learning_rate": 2.5683463818058086e-05, + "loss": 0.6997, + "step": 6888 + }, + { + "epoch": 0.3020359195516832, + "grad_norm": 0.76171875, + "learning_rate": 2.568038349801296e-05, + "loss": 0.7641, + "step": 6889 + }, + { + "epoch": 0.3020797627683404, + "grad_norm": 0.83203125, + "learning_rate": 2.5677303335484025e-05, + "loss": 0.8401, + "step": 6890 + }, + { + "epoch": 0.30212360598499766, + "grad_norm": 0.87890625, + "learning_rate": 2.5674223330477763e-05, + "loss": 0.7689, + "step": 6891 + }, + { + "epoch": 0.3021674492016549, + "grad_norm": 0.77734375, + "learning_rate": 2.5671143483000758e-05, + "loss": 0.8066, + "step": 6892 + }, + { + "epoch": 0.3022112924183121, + "grad_norm": 0.8515625, + "learning_rate": 2.5668063793059528e-05, + "loss": 0.8416, + "step": 6893 + }, + { + "epoch": 0.30225513563496936, + "grad_norm": 0.82421875, + "learning_rate": 2.5664984260660585e-05, + "loss": 0.7102, + "step": 6894 + }, + { + "epoch": 0.3022989788516266, + "grad_norm": 0.9765625, + "learning_rate": 2.566190488581045e-05, + "loss": 0.7829, + "step": 6895 + }, + { + "epoch": 0.30234282206828383, + "grad_norm": 0.77734375, + "learning_rate": 2.5658825668515642e-05, + "loss": 0.8385, + "step": 6896 + }, + { + "epoch": 0.30238666528494107, + "grad_norm": 0.81640625, + "learning_rate": 2.5655746608782727e-05, + "loss": 0.8039, + "step": 6897 + }, + { + "epoch": 0.3024305085015983, + "grad_norm": 0.82421875, + "learning_rate": 2.5652667706618204e-05, + "loss": 0.8199, + "step": 6898 + }, + { + "epoch": 0.30247435171825554, + "grad_norm": 0.80078125, + "learning_rate": 2.5649588962028613e-05, + "loss": 0.8432, + "step": 6899 + }, + { + "epoch": 0.3025181949349128, + "grad_norm": 0.83984375, + "learning_rate": 2.564651037502046e-05, + "loss": 0.7785, + "step": 6900 + }, + { + "epoch": 0.30256203815157, + "grad_norm": 0.9296875, + "learning_rate": 2.564343194560025e-05, + "loss": 0.7825, + "step": 6901 + }, + { + "epoch": 0.3026058813682272, + "grad_norm": 0.7578125, + "learning_rate": 2.5640353673774575e-05, + "loss": 0.8137, + "step": 6902 + }, + { + "epoch": 0.3026497245848844, + "grad_norm": 0.94921875, + "learning_rate": 2.5637275559549912e-05, + "loss": 0.9298, + "step": 6903 + }, + { + "epoch": 0.30269356780154166, + "grad_norm": 0.81640625, + "learning_rate": 2.5634197602932785e-05, + "loss": 0.7203, + "step": 6904 + }, + { + "epoch": 0.3027374110181989, + "grad_norm": 0.86328125, + "learning_rate": 2.563111980392974e-05, + "loss": 0.7599, + "step": 6905 + }, + { + "epoch": 0.30278125423485613, + "grad_norm": 0.77734375, + "learning_rate": 2.562804216254725e-05, + "loss": 0.8343, + "step": 6906 + }, + { + "epoch": 0.30282509745151337, + "grad_norm": 0.84765625, + "learning_rate": 2.5624964678791907e-05, + "loss": 0.8476, + "step": 6907 + }, + { + "epoch": 0.3028689406681706, + "grad_norm": 0.8046875, + "learning_rate": 2.5621887352670195e-05, + "loss": 0.8299, + "step": 6908 + }, + { + "epoch": 0.30291278388482784, + "grad_norm": 0.87109375, + "learning_rate": 2.5618810184188636e-05, + "loss": 0.7848, + "step": 6909 + }, + { + "epoch": 0.3029566271014851, + "grad_norm": 0.859375, + "learning_rate": 2.5615733173353773e-05, + "loss": 0.8449, + "step": 6910 + }, + { + "epoch": 0.3030004703181423, + "grad_norm": 0.83203125, + "learning_rate": 2.5612656320172067e-05, + "loss": 0.8064, + "step": 6911 + }, + { + "epoch": 0.30304431353479955, + "grad_norm": 0.77734375, + "learning_rate": 2.5609579624650128e-05, + "loss": 0.7484, + "step": 6912 + }, + { + "epoch": 0.3030881567514568, + "grad_norm": 0.8984375, + "learning_rate": 2.560650308679442e-05, + "loss": 0.925, + "step": 6913 + }, + { + "epoch": 0.303131999968114, + "grad_norm": 0.859375, + "learning_rate": 2.5603426706611487e-05, + "loss": 0.8614, + "step": 6914 + }, + { + "epoch": 0.30317584318477125, + "grad_norm": 0.828125, + "learning_rate": 2.5600350484107827e-05, + "loss": 0.7218, + "step": 6915 + }, + { + "epoch": 0.3032196864014285, + "grad_norm": 0.79296875, + "learning_rate": 2.559727441928995e-05, + "loss": 0.689, + "step": 6916 + }, + { + "epoch": 0.3032635296180857, + "grad_norm": 0.76171875, + "learning_rate": 2.5594198512164425e-05, + "loss": 0.8285, + "step": 6917 + }, + { + "epoch": 0.30330737283474296, + "grad_norm": 0.82421875, + "learning_rate": 2.5591122762737753e-05, + "loss": 0.7075, + "step": 6918 + }, + { + "epoch": 0.3033512160514002, + "grad_norm": 0.78125, + "learning_rate": 2.5588047171016428e-05, + "loss": 0.8572, + "step": 6919 + }, + { + "epoch": 0.30339505926805743, + "grad_norm": 0.80859375, + "learning_rate": 2.558497173700699e-05, + "loss": 0.7783, + "step": 6920 + }, + { + "epoch": 0.30343890248471467, + "grad_norm": 0.8203125, + "learning_rate": 2.5581896460715927e-05, + "loss": 0.9593, + "step": 6921 + }, + { + "epoch": 0.3034827457013719, + "grad_norm": 0.83984375, + "learning_rate": 2.5578821342149805e-05, + "loss": 0.8142, + "step": 6922 + }, + { + "epoch": 0.30352658891802914, + "grad_norm": 0.7421875, + "learning_rate": 2.5575746381315113e-05, + "loss": 0.7799, + "step": 6923 + }, + { + "epoch": 0.3035704321346864, + "grad_norm": 0.8359375, + "learning_rate": 2.557267157821839e-05, + "loss": 0.8504, + "step": 6924 + }, + { + "epoch": 0.3036142753513436, + "grad_norm": 0.85546875, + "learning_rate": 2.5569596932866112e-05, + "loss": 0.8202, + "step": 6925 + }, + { + "epoch": 0.30365811856800085, + "grad_norm": 0.8828125, + "learning_rate": 2.556652244526484e-05, + "loss": 0.7349, + "step": 6926 + }, + { + "epoch": 0.3037019617846581, + "grad_norm": 0.76953125, + "learning_rate": 2.556344811542105e-05, + "loss": 0.7173, + "step": 6927 + }, + { + "epoch": 0.3037458050013153, + "grad_norm": 0.83984375, + "learning_rate": 2.5560373943341264e-05, + "loss": 0.8923, + "step": 6928 + }, + { + "epoch": 0.30378964821797255, + "grad_norm": 0.83984375, + "learning_rate": 2.555729992903203e-05, + "loss": 0.7545, + "step": 6929 + }, + { + "epoch": 0.3038334914346298, + "grad_norm": 0.8046875, + "learning_rate": 2.5554226072499855e-05, + "loss": 0.9417, + "step": 6930 + }, + { + "epoch": 0.303877334651287, + "grad_norm": 0.78125, + "learning_rate": 2.555115237375123e-05, + "loss": 0.7768, + "step": 6931 + }, + { + "epoch": 0.30392117786794426, + "grad_norm": 0.9140625, + "learning_rate": 2.5548078832792687e-05, + "loss": 0.9241, + "step": 6932 + }, + { + "epoch": 0.30396502108460144, + "grad_norm": 0.9140625, + "learning_rate": 2.5545005449630734e-05, + "loss": 0.8389, + "step": 6933 + }, + { + "epoch": 0.3040088643012587, + "grad_norm": 0.83984375, + "learning_rate": 2.5541932224271893e-05, + "loss": 0.8389, + "step": 6934 + }, + { + "epoch": 0.3040527075179159, + "grad_norm": 0.85546875, + "learning_rate": 2.5538859156722662e-05, + "loss": 0.7781, + "step": 6935 + }, + { + "epoch": 0.30409655073457315, + "grad_norm": 0.9453125, + "learning_rate": 2.5535786246989534e-05, + "loss": 0.8672, + "step": 6936 + }, + { + "epoch": 0.3041403939512304, + "grad_norm": 0.76171875, + "learning_rate": 2.5532713495079085e-05, + "loss": 0.8117, + "step": 6937 + }, + { + "epoch": 0.3041842371678876, + "grad_norm": 0.80859375, + "learning_rate": 2.55296409009978e-05, + "loss": 0.759, + "step": 6938 + }, + { + "epoch": 0.30422808038454485, + "grad_norm": 0.85546875, + "learning_rate": 2.5526568464752165e-05, + "loss": 0.7531, + "step": 6939 + }, + { + "epoch": 0.3042719236012021, + "grad_norm": 0.75, + "learning_rate": 2.552349618634873e-05, + "loss": 0.8065, + "step": 6940 + }, + { + "epoch": 0.3043157668178593, + "grad_norm": 0.828125, + "learning_rate": 2.5520424065793948e-05, + "loss": 0.7241, + "step": 6941 + }, + { + "epoch": 0.30435961003451656, + "grad_norm": 0.78515625, + "learning_rate": 2.5517352103094405e-05, + "loss": 0.7045, + "step": 6942 + }, + { + "epoch": 0.3044034532511738, + "grad_norm": 0.7578125, + "learning_rate": 2.5514280298256577e-05, + "loss": 0.7952, + "step": 6943 + }, + { + "epoch": 0.30444729646783103, + "grad_norm": 0.75390625, + "learning_rate": 2.5511208651286976e-05, + "loss": 0.7599, + "step": 6944 + }, + { + "epoch": 0.30449113968448827, + "grad_norm": 0.76953125, + "learning_rate": 2.5508137162192103e-05, + "loss": 0.8057, + "step": 6945 + }, + { + "epoch": 0.3045349829011455, + "grad_norm": 0.82421875, + "learning_rate": 2.5505065830978446e-05, + "loss": 0.7973, + "step": 6946 + }, + { + "epoch": 0.30457882611780274, + "grad_norm": 0.83203125, + "learning_rate": 2.5501994657652583e-05, + "loss": 0.7627, + "step": 6947 + }, + { + "epoch": 0.30462266933446, + "grad_norm": 0.79296875, + "learning_rate": 2.5498923642220984e-05, + "loss": 0.7274, + "step": 6948 + }, + { + "epoch": 0.3046665125511172, + "grad_norm": 0.9375, + "learning_rate": 2.5495852784690143e-05, + "loss": 0.8576, + "step": 6949 + }, + { + "epoch": 0.30471035576777444, + "grad_norm": 0.84375, + "learning_rate": 2.5492782085066602e-05, + "loss": 0.8037, + "step": 6950 + }, + { + "epoch": 0.3047541989844317, + "grad_norm": 0.91015625, + "learning_rate": 2.548971154335681e-05, + "loss": 0.8765, + "step": 6951 + }, + { + "epoch": 0.3047980422010889, + "grad_norm": 0.75, + "learning_rate": 2.548664115956735e-05, + "loss": 0.7754, + "step": 6952 + }, + { + "epoch": 0.30484188541774615, + "grad_norm": 0.75390625, + "learning_rate": 2.548357093370469e-05, + "loss": 0.8333, + "step": 6953 + }, + { + "epoch": 0.3048857286344034, + "grad_norm": 0.828125, + "learning_rate": 2.5480500865775348e-05, + "loss": 0.8084, + "step": 6954 + }, + { + "epoch": 0.3049295718510606, + "grad_norm": 0.80078125, + "learning_rate": 2.5477430955785818e-05, + "loss": 0.7697, + "step": 6955 + }, + { + "epoch": 0.30497341506771786, + "grad_norm": 0.8828125, + "learning_rate": 2.5474361203742592e-05, + "loss": 0.8274, + "step": 6956 + }, + { + "epoch": 0.3050172582843751, + "grad_norm": 0.78515625, + "learning_rate": 2.5471291609652225e-05, + "loss": 0.7784, + "step": 6957 + }, + { + "epoch": 0.30506110150103233, + "grad_norm": 0.8203125, + "learning_rate": 2.5468222173521196e-05, + "loss": 0.9084, + "step": 6958 + }, + { + "epoch": 0.30510494471768956, + "grad_norm": 0.93359375, + "learning_rate": 2.5465152895356004e-05, + "loss": 0.9533, + "step": 6959 + }, + { + "epoch": 0.3051487879343468, + "grad_norm": 0.80078125, + "learning_rate": 2.546208377516317e-05, + "loss": 0.7094, + "step": 6960 + }, + { + "epoch": 0.30519263115100403, + "grad_norm": 0.8125, + "learning_rate": 2.5459014812949188e-05, + "loss": 0.7839, + "step": 6961 + }, + { + "epoch": 0.30523647436766127, + "grad_norm": 0.87109375, + "learning_rate": 2.545594600872052e-05, + "loss": 0.9492, + "step": 6962 + }, + { + "epoch": 0.30528031758431845, + "grad_norm": 0.76953125, + "learning_rate": 2.5452877362483764e-05, + "loss": 0.8188, + "step": 6963 + }, + { + "epoch": 0.3053241608009757, + "grad_norm": 0.87109375, + "learning_rate": 2.544980887424536e-05, + "loss": 0.7883, + "step": 6964 + }, + { + "epoch": 0.3053680040176329, + "grad_norm": 0.8046875, + "learning_rate": 2.5446740544011827e-05, + "loss": 0.8312, + "step": 6965 + }, + { + "epoch": 0.30541184723429016, + "grad_norm": 0.8046875, + "learning_rate": 2.544367237178966e-05, + "loss": 0.704, + "step": 6966 + }, + { + "epoch": 0.3054556904509474, + "grad_norm": 0.7421875, + "learning_rate": 2.5440604357585353e-05, + "loss": 0.7931, + "step": 6967 + }, + { + "epoch": 0.30549953366760463, + "grad_norm": 0.80078125, + "learning_rate": 2.5437536501405446e-05, + "loss": 0.9064, + "step": 6968 + }, + { + "epoch": 0.30554337688426186, + "grad_norm": 0.859375, + "learning_rate": 2.543446880325642e-05, + "loss": 0.8607, + "step": 6969 + }, + { + "epoch": 0.3055872201009191, + "grad_norm": 0.91015625, + "learning_rate": 2.5431401263144773e-05, + "loss": 0.8363, + "step": 6970 + }, + { + "epoch": 0.30563106331757633, + "grad_norm": 0.8203125, + "learning_rate": 2.5428333881077006e-05, + "loss": 0.7667, + "step": 6971 + }, + { + "epoch": 0.30567490653423357, + "grad_norm": 0.953125, + "learning_rate": 2.542526665705962e-05, + "loss": 0.9068, + "step": 6972 + }, + { + "epoch": 0.3057187497508908, + "grad_norm": 0.8203125, + "learning_rate": 2.5422199591099126e-05, + "loss": 1.005, + "step": 6973 + }, + { + "epoch": 0.30576259296754804, + "grad_norm": 0.85546875, + "learning_rate": 2.541913268320202e-05, + "loss": 0.8159, + "step": 6974 + }, + { + "epoch": 0.3058064361842053, + "grad_norm": 0.80859375, + "learning_rate": 2.54160659333748e-05, + "loss": 0.7495, + "step": 6975 + }, + { + "epoch": 0.3058502794008625, + "grad_norm": 0.8984375, + "learning_rate": 2.5412999341623932e-05, + "loss": 0.7728, + "step": 6976 + }, + { + "epoch": 0.30589412261751975, + "grad_norm": 0.80078125, + "learning_rate": 2.540993290795598e-05, + "loss": 0.8097, + "step": 6977 + }, + { + "epoch": 0.305937965834177, + "grad_norm": 0.8046875, + "learning_rate": 2.5406866632377403e-05, + "loss": 0.8281, + "step": 6978 + }, + { + "epoch": 0.3059818090508342, + "grad_norm": 0.8203125, + "learning_rate": 2.5403800514894716e-05, + "loss": 0.7904, + "step": 6979 + }, + { + "epoch": 0.30602565226749145, + "grad_norm": 0.7578125, + "learning_rate": 2.5400734555514417e-05, + "loss": 0.733, + "step": 6980 + }, + { + "epoch": 0.3060694954841487, + "grad_norm": 0.8515625, + "learning_rate": 2.5397668754242987e-05, + "loss": 0.7006, + "step": 6981 + }, + { + "epoch": 0.3061133387008059, + "grad_norm": 0.8203125, + "learning_rate": 2.53946031110869e-05, + "loss": 0.7401, + "step": 6982 + }, + { + "epoch": 0.30615718191746316, + "grad_norm": 0.76171875, + "learning_rate": 2.539153762605273e-05, + "loss": 0.8134, + "step": 6983 + }, + { + "epoch": 0.3062010251341204, + "grad_norm": 0.7890625, + "learning_rate": 2.5388472299146927e-05, + "loss": 0.6923, + "step": 6984 + }, + { + "epoch": 0.30624486835077763, + "grad_norm": 0.921875, + "learning_rate": 2.5385407130375994e-05, + "loss": 0.8534, + "step": 6985 + }, + { + "epoch": 0.30628871156743487, + "grad_norm": 0.859375, + "learning_rate": 2.5382342119746417e-05, + "loss": 0.8127, + "step": 6986 + }, + { + "epoch": 0.3063325547840921, + "grad_norm": 0.86328125, + "learning_rate": 2.537927726726468e-05, + "loss": 0.9515, + "step": 6987 + }, + { + "epoch": 0.30637639800074934, + "grad_norm": 0.8828125, + "learning_rate": 2.537621257293732e-05, + "loss": 0.923, + "step": 6988 + }, + { + "epoch": 0.3064202412174066, + "grad_norm": 0.80859375, + "learning_rate": 2.5373148036770822e-05, + "loss": 0.7771, + "step": 6989 + }, + { + "epoch": 0.3064640844340638, + "grad_norm": 0.80078125, + "learning_rate": 2.5370083658771672e-05, + "loss": 0.7822, + "step": 6990 + }, + { + "epoch": 0.30650792765072105, + "grad_norm": 1.1171875, + "learning_rate": 2.536701943894636e-05, + "loss": 0.9117, + "step": 6991 + }, + { + "epoch": 0.3065517708673783, + "grad_norm": 0.8203125, + "learning_rate": 2.5363955377301353e-05, + "loss": 0.818, + "step": 6992 + }, + { + "epoch": 0.30659561408403546, + "grad_norm": 0.921875, + "learning_rate": 2.5360891473843206e-05, + "loss": 0.7649, + "step": 6993 + }, + { + "epoch": 0.3066394573006927, + "grad_norm": 0.828125, + "learning_rate": 2.535782772857839e-05, + "loss": 0.8497, + "step": 6994 + }, + { + "epoch": 0.30668330051734993, + "grad_norm": 0.80078125, + "learning_rate": 2.5354764141513386e-05, + "loss": 0.8139, + "step": 6995 + }, + { + "epoch": 0.30672714373400717, + "grad_norm": 0.7890625, + "learning_rate": 2.535170071265469e-05, + "loss": 0.6865, + "step": 6996 + }, + { + "epoch": 0.3067709869506644, + "grad_norm": 0.79296875, + "learning_rate": 2.5348637442008773e-05, + "loss": 0.7759, + "step": 6997 + }, + { + "epoch": 0.30681483016732164, + "grad_norm": 0.796875, + "learning_rate": 2.534557432958218e-05, + "loss": 0.6806, + "step": 6998 + }, + { + "epoch": 0.3068586733839789, + "grad_norm": 0.890625, + "learning_rate": 2.5342511375381384e-05, + "loss": 0.9119, + "step": 6999 + }, + { + "epoch": 0.3069025166006361, + "grad_norm": 0.8828125, + "learning_rate": 2.533944857941285e-05, + "loss": 0.774, + "step": 7000 + }, + { + "epoch": 0.3069025166006361, + "eval_loss": 0.7985827326774597, + "eval_runtime": 299.9182, + "eval_samples_per_second": 33.342, + "eval_steps_per_second": 0.697, + "step": 7000 + }, + { + "epoch": 0.30694635981729335, + "grad_norm": 0.8515625, + "learning_rate": 2.5336385941683105e-05, + "loss": 0.8799, + "step": 7001 + }, + { + "epoch": 0.3069902030339506, + "grad_norm": 0.80078125, + "learning_rate": 2.5333323462198588e-05, + "loss": 0.8423, + "step": 7002 + }, + { + "epoch": 0.3070340462506078, + "grad_norm": 0.90234375, + "learning_rate": 2.533026114096585e-05, + "loss": 1.0042, + "step": 7003 + }, + { + "epoch": 0.30707788946726505, + "grad_norm": 0.81640625, + "learning_rate": 2.532719897799135e-05, + "loss": 0.8157, + "step": 7004 + }, + { + "epoch": 0.3071217326839223, + "grad_norm": 0.921875, + "learning_rate": 2.5324136973281597e-05, + "loss": 0.8182, + "step": 7005 + }, + { + "epoch": 0.3071655759005795, + "grad_norm": 0.828125, + "learning_rate": 2.5321075126843054e-05, + "loss": 0.816, + "step": 7006 + }, + { + "epoch": 0.30720941911723676, + "grad_norm": 0.8828125, + "learning_rate": 2.531801343868221e-05, + "loss": 0.7585, + "step": 7007 + }, + { + "epoch": 0.307253262333894, + "grad_norm": 0.92578125, + "learning_rate": 2.5314951908805583e-05, + "loss": 0.8691, + "step": 7008 + }, + { + "epoch": 0.30729710555055123, + "grad_norm": 0.80078125, + "learning_rate": 2.5311890537219652e-05, + "loss": 0.6583, + "step": 7009 + }, + { + "epoch": 0.30734094876720847, + "grad_norm": 0.7890625, + "learning_rate": 2.5308829323930895e-05, + "loss": 0.8544, + "step": 7010 + }, + { + "epoch": 0.3073847919838657, + "grad_norm": 0.8515625, + "learning_rate": 2.530576826894582e-05, + "loss": 0.9524, + "step": 7011 + }, + { + "epoch": 0.30742863520052294, + "grad_norm": 0.7578125, + "learning_rate": 2.5302707372270885e-05, + "loss": 0.7163, + "step": 7012 + }, + { + "epoch": 0.3074724784171802, + "grad_norm": 0.8125, + "learning_rate": 2.52996466339126e-05, + "loss": 0.778, + "step": 7013 + }, + { + "epoch": 0.3075163216338374, + "grad_norm": 0.80859375, + "learning_rate": 2.5296586053877414e-05, + "loss": 0.7913, + "step": 7014 + }, + { + "epoch": 0.30756016485049464, + "grad_norm": 0.80078125, + "learning_rate": 2.5293525632171865e-05, + "loss": 0.7635, + "step": 7015 + }, + { + "epoch": 0.3076040080671519, + "grad_norm": 0.875, + "learning_rate": 2.529046536880243e-05, + "loss": 0.8014, + "step": 7016 + }, + { + "epoch": 0.3076478512838091, + "grad_norm": 0.88671875, + "learning_rate": 2.5287405263775575e-05, + "loss": 0.8437, + "step": 7017 + }, + { + "epoch": 0.30769169450046635, + "grad_norm": 0.796875, + "learning_rate": 2.5284345317097803e-05, + "loss": 0.6795, + "step": 7018 + }, + { + "epoch": 0.3077355377171236, + "grad_norm": 0.83203125, + "learning_rate": 2.5281285528775578e-05, + "loss": 0.757, + "step": 7019 + }, + { + "epoch": 0.3077793809337808, + "grad_norm": 0.9140625, + "learning_rate": 2.5278225898815412e-05, + "loss": 0.793, + "step": 7020 + }, + { + "epoch": 0.30782322415043806, + "grad_norm": 0.79296875, + "learning_rate": 2.5275166427223763e-05, + "loss": 0.7476, + "step": 7021 + }, + { + "epoch": 0.3078670673670953, + "grad_norm": 0.7890625, + "learning_rate": 2.5272107114007105e-05, + "loss": 0.8211, + "step": 7022 + }, + { + "epoch": 0.30791091058375253, + "grad_norm": 0.76171875, + "learning_rate": 2.5269047959171978e-05, + "loss": 0.7525, + "step": 7023 + }, + { + "epoch": 0.3079547538004097, + "grad_norm": 0.8359375, + "learning_rate": 2.526598896272482e-05, + "loss": 0.8786, + "step": 7024 + }, + { + "epoch": 0.30799859701706694, + "grad_norm": 0.890625, + "learning_rate": 2.5262930124672134e-05, + "loss": 0.8333, + "step": 7025 + }, + { + "epoch": 0.3080424402337242, + "grad_norm": 0.80859375, + "learning_rate": 2.52598714450204e-05, + "loss": 0.7054, + "step": 7026 + }, + { + "epoch": 0.3080862834503814, + "grad_norm": 0.91015625, + "learning_rate": 2.525681292377606e-05, + "loss": 0.8164, + "step": 7027 + }, + { + "epoch": 0.30813012666703865, + "grad_norm": 0.80859375, + "learning_rate": 2.5253754560945665e-05, + "loss": 0.7879, + "step": 7028 + }, + { + "epoch": 0.3081739698836959, + "grad_norm": 0.734375, + "learning_rate": 2.5250696356535664e-05, + "loss": 0.8699, + "step": 7029 + }, + { + "epoch": 0.3082178131003531, + "grad_norm": 0.7421875, + "learning_rate": 2.5247638310552534e-05, + "loss": 0.8214, + "step": 7030 + }, + { + "epoch": 0.30826165631701036, + "grad_norm": 0.8359375, + "learning_rate": 2.5244580423002773e-05, + "loss": 0.7618, + "step": 7031 + }, + { + "epoch": 0.3083054995336676, + "grad_norm": 1.3203125, + "learning_rate": 2.5241522693892815e-05, + "loss": 0.8179, + "step": 7032 + }, + { + "epoch": 0.30834934275032483, + "grad_norm": 0.78125, + "learning_rate": 2.5238465123229216e-05, + "loss": 0.7796, + "step": 7033 + }, + { + "epoch": 0.30839318596698206, + "grad_norm": 0.9140625, + "learning_rate": 2.523540771101841e-05, + "loss": 0.8906, + "step": 7034 + }, + { + "epoch": 0.3084370291836393, + "grad_norm": 0.796875, + "learning_rate": 2.5232350457266873e-05, + "loss": 0.8134, + "step": 7035 + }, + { + "epoch": 0.30848087240029654, + "grad_norm": 0.82421875, + "learning_rate": 2.522929336198111e-05, + "loss": 0.7674, + "step": 7036 + }, + { + "epoch": 0.30852471561695377, + "grad_norm": 0.91015625, + "learning_rate": 2.522623642516755e-05, + "loss": 0.7012, + "step": 7037 + }, + { + "epoch": 0.308568558833611, + "grad_norm": 0.82421875, + "learning_rate": 2.522317964683274e-05, + "loss": 0.8237, + "step": 7038 + }, + { + "epoch": 0.30861240205026824, + "grad_norm": 0.859375, + "learning_rate": 2.5220123026983135e-05, + "loss": 0.729, + "step": 7039 + }, + { + "epoch": 0.3086562452669255, + "grad_norm": 0.859375, + "learning_rate": 2.521706656562519e-05, + "loss": 0.7835, + "step": 7040 + }, + { + "epoch": 0.3087000884835827, + "grad_norm": 0.859375, + "learning_rate": 2.521401026276541e-05, + "loss": 0.8372, + "step": 7041 + }, + { + "epoch": 0.30874393170023995, + "grad_norm": 0.8828125, + "learning_rate": 2.5210954118410223e-05, + "loss": 0.7103, + "step": 7042 + }, + { + "epoch": 0.3087877749168972, + "grad_norm": 0.83984375, + "learning_rate": 2.5207898132566176e-05, + "loss": 0.825, + "step": 7043 + }, + { + "epoch": 0.3088316181335544, + "grad_norm": 0.89453125, + "learning_rate": 2.5204842305239718e-05, + "loss": 0.8987, + "step": 7044 + }, + { + "epoch": 0.30887546135021166, + "grad_norm": 0.796875, + "learning_rate": 2.520178663643732e-05, + "loss": 0.8438, + "step": 7045 + }, + { + "epoch": 0.3089193045668689, + "grad_norm": 0.8203125, + "learning_rate": 2.5198731126165454e-05, + "loss": 0.7928, + "step": 7046 + }, + { + "epoch": 0.3089631477835261, + "grad_norm": 0.80078125, + "learning_rate": 2.519567577443057e-05, + "loss": 0.7594, + "step": 7047 + }, + { + "epoch": 0.30900699100018336, + "grad_norm": 0.8515625, + "learning_rate": 2.5192620581239224e-05, + "loss": 0.8219, + "step": 7048 + }, + { + "epoch": 0.3090508342168406, + "grad_norm": 0.85546875, + "learning_rate": 2.5189565546597826e-05, + "loss": 0.9174, + "step": 7049 + }, + { + "epoch": 0.30909467743349783, + "grad_norm": 0.7890625, + "learning_rate": 2.5186510670512874e-05, + "loss": 0.8207, + "step": 7050 + }, + { + "epoch": 0.30913852065015507, + "grad_norm": 0.90625, + "learning_rate": 2.518345595299083e-05, + "loss": 0.8354, + "step": 7051 + }, + { + "epoch": 0.3091823638668123, + "grad_norm": 0.76171875, + "learning_rate": 2.5180401394038155e-05, + "loss": 0.8135, + "step": 7052 + }, + { + "epoch": 0.30922620708346954, + "grad_norm": 0.79296875, + "learning_rate": 2.5177346993661367e-05, + "loss": 0.8323, + "step": 7053 + }, + { + "epoch": 0.3092700503001267, + "grad_norm": 0.88671875, + "learning_rate": 2.5174292751866914e-05, + "loss": 0.8254, + "step": 7054 + }, + { + "epoch": 0.30931389351678396, + "grad_norm": 0.96484375, + "learning_rate": 2.5171238668661267e-05, + "loss": 0.8644, + "step": 7055 + }, + { + "epoch": 0.3093577367334412, + "grad_norm": 0.84765625, + "learning_rate": 2.5168184744050916e-05, + "loss": 0.7922, + "step": 7056 + }, + { + "epoch": 0.3094015799500984, + "grad_norm": 0.9765625, + "learning_rate": 2.5165130978042306e-05, + "loss": 0.8639, + "step": 7057 + }, + { + "epoch": 0.30944542316675566, + "grad_norm": 2.078125, + "learning_rate": 2.5162077370641934e-05, + "loss": 0.9534, + "step": 7058 + }, + { + "epoch": 0.3094892663834129, + "grad_norm": 0.7734375, + "learning_rate": 2.515902392185626e-05, + "loss": 0.7295, + "step": 7059 + }, + { + "epoch": 0.30953310960007013, + "grad_norm": 1.0625, + "learning_rate": 2.515597063169175e-05, + "loss": 1.0125, + "step": 7060 + }, + { + "epoch": 0.30957695281672737, + "grad_norm": 0.85546875, + "learning_rate": 2.5152917500154894e-05, + "loss": 0.8647, + "step": 7061 + }, + { + "epoch": 0.3096207960333846, + "grad_norm": 0.76171875, + "learning_rate": 2.514986452725211e-05, + "loss": 0.8303, + "step": 7062 + }, + { + "epoch": 0.30966463925004184, + "grad_norm": 0.86328125, + "learning_rate": 2.5146811712989947e-05, + "loss": 0.9007, + "step": 7063 + }, + { + "epoch": 0.3097084824666991, + "grad_norm": 0.7890625, + "learning_rate": 2.5143759057374826e-05, + "loss": 0.7765, + "step": 7064 + }, + { + "epoch": 0.3097523256833563, + "grad_norm": 0.8046875, + "learning_rate": 2.5140706560413242e-05, + "loss": 0.7582, + "step": 7065 + }, + { + "epoch": 0.30979616890001355, + "grad_norm": 0.921875, + "learning_rate": 2.5137654222111652e-05, + "loss": 0.9708, + "step": 7066 + }, + { + "epoch": 0.3098400121166708, + "grad_norm": 0.80859375, + "learning_rate": 2.5134602042476497e-05, + "loss": 0.8756, + "step": 7067 + }, + { + "epoch": 0.309883855333328, + "grad_norm": 0.82421875, + "learning_rate": 2.5131550021514293e-05, + "loss": 0.8984, + "step": 7068 + }, + { + "epoch": 0.30992769854998525, + "grad_norm": 0.84765625, + "learning_rate": 2.5128498159231494e-05, + "loss": 1.0007, + "step": 7069 + }, + { + "epoch": 0.3099715417666425, + "grad_norm": 0.7734375, + "learning_rate": 2.512544645563456e-05, + "loss": 0.785, + "step": 7070 + }, + { + "epoch": 0.3100153849832997, + "grad_norm": 0.77734375, + "learning_rate": 2.5122394910729973e-05, + "loss": 0.7303, + "step": 7071 + }, + { + "epoch": 0.31005922819995696, + "grad_norm": 0.76171875, + "learning_rate": 2.5119343524524153e-05, + "loss": 0.7722, + "step": 7072 + }, + { + "epoch": 0.3101030714166142, + "grad_norm": 0.796875, + "learning_rate": 2.5116292297023636e-05, + "loss": 0.7085, + "step": 7073 + }, + { + "epoch": 0.31014691463327143, + "grad_norm": 0.765625, + "learning_rate": 2.511324122823485e-05, + "loss": 0.7481, + "step": 7074 + }, + { + "epoch": 0.31019075784992867, + "grad_norm": 0.8203125, + "learning_rate": 2.5110190318164282e-05, + "loss": 0.8463, + "step": 7075 + }, + { + "epoch": 0.3102346010665859, + "grad_norm": 0.84765625, + "learning_rate": 2.510713956681837e-05, + "loss": 0.7219, + "step": 7076 + }, + { + "epoch": 0.31027844428324314, + "grad_norm": 0.83203125, + "learning_rate": 2.5104088974203577e-05, + "loss": 0.9231, + "step": 7077 + }, + { + "epoch": 0.3103222874999004, + "grad_norm": 0.91015625, + "learning_rate": 2.5101038540326416e-05, + "loss": 0.9064, + "step": 7078 + }, + { + "epoch": 0.3103661307165576, + "grad_norm": 0.7890625, + "learning_rate": 2.5097988265193317e-05, + "loss": 0.7551, + "step": 7079 + }, + { + "epoch": 0.31040997393321484, + "grad_norm": 0.77734375, + "learning_rate": 2.5094938148810753e-05, + "loss": 0.7647, + "step": 7080 + }, + { + "epoch": 0.3104538171498721, + "grad_norm": 0.83984375, + "learning_rate": 2.50918881911852e-05, + "loss": 0.8724, + "step": 7081 + }, + { + "epoch": 0.3104976603665293, + "grad_norm": 0.81640625, + "learning_rate": 2.5088838392323056e-05, + "loss": 0.7501, + "step": 7082 + }, + { + "epoch": 0.31054150358318655, + "grad_norm": 0.78125, + "learning_rate": 2.5085788752230875e-05, + "loss": 0.8189, + "step": 7083 + }, + { + "epoch": 0.31058534679984373, + "grad_norm": 0.8515625, + "learning_rate": 2.508273927091509e-05, + "loss": 0.785, + "step": 7084 + }, + { + "epoch": 0.31062919001650097, + "grad_norm": 0.80078125, + "learning_rate": 2.5079689948382147e-05, + "loss": 0.7365, + "step": 7085 + }, + { + "epoch": 0.3106730332331582, + "grad_norm": 0.78515625, + "learning_rate": 2.5076640784638528e-05, + "loss": 0.7916, + "step": 7086 + }, + { + "epoch": 0.31071687644981544, + "grad_norm": 0.73828125, + "learning_rate": 2.5073591779690652e-05, + "loss": 0.7221, + "step": 7087 + }, + { + "epoch": 0.3107607196664727, + "grad_norm": 0.83203125, + "learning_rate": 2.507054293354505e-05, + "loss": 0.8427, + "step": 7088 + }, + { + "epoch": 0.3108045628831299, + "grad_norm": 0.85546875, + "learning_rate": 2.5067494246208145e-05, + "loss": 0.7471, + "step": 7089 + }, + { + "epoch": 0.31084840609978714, + "grad_norm": 0.859375, + "learning_rate": 2.50644457176864e-05, + "loss": 0.7761, + "step": 7090 + }, + { + "epoch": 0.3108922493164444, + "grad_norm": 0.84375, + "learning_rate": 2.506139734798628e-05, + "loss": 0.8023, + "step": 7091 + }, + { + "epoch": 0.3109360925331016, + "grad_norm": 0.77734375, + "learning_rate": 2.505834913711422e-05, + "loss": 0.7619, + "step": 7092 + }, + { + "epoch": 0.31097993574975885, + "grad_norm": 0.76953125, + "learning_rate": 2.505530108507673e-05, + "loss": 0.8146, + "step": 7093 + }, + { + "epoch": 0.3110237789664161, + "grad_norm": 0.875, + "learning_rate": 2.505225319188025e-05, + "loss": 0.9058, + "step": 7094 + }, + { + "epoch": 0.3110676221830733, + "grad_norm": 0.8203125, + "learning_rate": 2.5049205457531234e-05, + "loss": 0.8681, + "step": 7095 + }, + { + "epoch": 0.31111146539973056, + "grad_norm": 0.828125, + "learning_rate": 2.5046157882036147e-05, + "loss": 0.8054, + "step": 7096 + }, + { + "epoch": 0.3111553086163878, + "grad_norm": 0.84765625, + "learning_rate": 2.504311046540144e-05, + "loss": 0.9022, + "step": 7097 + }, + { + "epoch": 0.31119915183304503, + "grad_norm": 0.84765625, + "learning_rate": 2.504006320763358e-05, + "loss": 0.7406, + "step": 7098 + }, + { + "epoch": 0.31124299504970226, + "grad_norm": 0.8046875, + "learning_rate": 2.5037016108738988e-05, + "loss": 0.7397, + "step": 7099 + }, + { + "epoch": 0.3112868382663595, + "grad_norm": 0.91796875, + "learning_rate": 2.5033969168724182e-05, + "loss": 0.9083, + "step": 7100 + }, + { + "epoch": 0.31133068148301674, + "grad_norm": 0.87890625, + "learning_rate": 2.50309223875956e-05, + "loss": 0.8983, + "step": 7101 + }, + { + "epoch": 0.31137452469967397, + "grad_norm": 0.7890625, + "learning_rate": 2.502787576535969e-05, + "loss": 0.7629, + "step": 7102 + }, + { + "epoch": 0.3114183679163312, + "grad_norm": 0.81640625, + "learning_rate": 2.5024829302022913e-05, + "loss": 0.7852, + "step": 7103 + }, + { + "epoch": 0.31146221113298844, + "grad_norm": 0.80078125, + "learning_rate": 2.5021782997591724e-05, + "loss": 0.7795, + "step": 7104 + }, + { + "epoch": 0.3115060543496457, + "grad_norm": 0.828125, + "learning_rate": 2.5018736852072576e-05, + "loss": 0.8383, + "step": 7105 + }, + { + "epoch": 0.3115498975663029, + "grad_norm": 0.7890625, + "learning_rate": 2.501569086547194e-05, + "loss": 0.8244, + "step": 7106 + }, + { + "epoch": 0.31159374078296015, + "grad_norm": 0.80078125, + "learning_rate": 2.5012645037796255e-05, + "loss": 0.8626, + "step": 7107 + }, + { + "epoch": 0.3116375839996174, + "grad_norm": 0.796875, + "learning_rate": 2.500959936905195e-05, + "loss": 0.8821, + "step": 7108 + }, + { + "epoch": 0.3116814272162746, + "grad_norm": 0.81640625, + "learning_rate": 2.5006553859245553e-05, + "loss": 0.8344, + "step": 7109 + }, + { + "epoch": 0.31172527043293186, + "grad_norm": 0.7265625, + "learning_rate": 2.5003508508383467e-05, + "loss": 0.6686, + "step": 7110 + }, + { + "epoch": 0.3117691136495891, + "grad_norm": 0.84765625, + "learning_rate": 2.5000463316472168e-05, + "loss": 0.8695, + "step": 7111 + }, + { + "epoch": 0.3118129568662463, + "grad_norm": 0.82421875, + "learning_rate": 2.4997418283518094e-05, + "loss": 0.9329, + "step": 7112 + }, + { + "epoch": 0.31185680008290356, + "grad_norm": 0.78125, + "learning_rate": 2.4994373409527672e-05, + "loss": 0.7536, + "step": 7113 + }, + { + "epoch": 0.3119006432995608, + "grad_norm": 0.83203125, + "learning_rate": 2.4991328694507422e-05, + "loss": 0.8329, + "step": 7114 + }, + { + "epoch": 0.311944486516218, + "grad_norm": 0.75, + "learning_rate": 2.498828413846377e-05, + "loss": 0.7696, + "step": 7115 + }, + { + "epoch": 0.3119883297328752, + "grad_norm": 0.79296875, + "learning_rate": 2.4985239741403154e-05, + "loss": 0.8025, + "step": 7116 + }, + { + "epoch": 0.31203217294953245, + "grad_norm": 0.79296875, + "learning_rate": 2.498219550333204e-05, + "loss": 0.7422, + "step": 7117 + }, + { + "epoch": 0.3120760161661897, + "grad_norm": 0.8984375, + "learning_rate": 2.497915142425684e-05, + "loss": 0.7209, + "step": 7118 + }, + { + "epoch": 0.3121198593828469, + "grad_norm": 0.8359375, + "learning_rate": 2.4976107504184065e-05, + "loss": 0.8877, + "step": 7119 + }, + { + "epoch": 0.31216370259950416, + "grad_norm": 0.79296875, + "learning_rate": 2.497306374312015e-05, + "loss": 0.8182, + "step": 7120 + }, + { + "epoch": 0.3122075458161614, + "grad_norm": 0.79296875, + "learning_rate": 2.4970020141071536e-05, + "loss": 0.8263, + "step": 7121 + }, + { + "epoch": 0.3122513890328186, + "grad_norm": 0.890625, + "learning_rate": 2.4966976698044688e-05, + "loss": 0.857, + "step": 7122 + }, + { + "epoch": 0.31229523224947586, + "grad_norm": 0.84375, + "learning_rate": 2.4963933414046005e-05, + "loss": 0.7575, + "step": 7123 + }, + { + "epoch": 0.3123390754661331, + "grad_norm": 0.80859375, + "learning_rate": 2.4960890289081994e-05, + "loss": 0.7886, + "step": 7124 + }, + { + "epoch": 0.31238291868279033, + "grad_norm": 0.765625, + "learning_rate": 2.49578473231591e-05, + "loss": 0.7836, + "step": 7125 + }, + { + "epoch": 0.31242676189944757, + "grad_norm": 0.83203125, + "learning_rate": 2.4954804516283757e-05, + "loss": 0.7699, + "step": 7126 + }, + { + "epoch": 0.3124706051161048, + "grad_norm": 0.7578125, + "learning_rate": 2.495176186846242e-05, + "loss": 0.6931, + "step": 7127 + }, + { + "epoch": 0.31251444833276204, + "grad_norm": 0.8515625, + "learning_rate": 2.49487193797015e-05, + "loss": 0.8588, + "step": 7128 + }, + { + "epoch": 0.3125582915494193, + "grad_norm": 0.77734375, + "learning_rate": 2.494567705000751e-05, + "loss": 0.7849, + "step": 7129 + }, + { + "epoch": 0.3126021347660765, + "grad_norm": 0.97265625, + "learning_rate": 2.4942634879386872e-05, + "loss": 0.8951, + "step": 7130 + }, + { + "epoch": 0.31264597798273375, + "grad_norm": 0.796875, + "learning_rate": 2.493959286784603e-05, + "loss": 0.7344, + "step": 7131 + }, + { + "epoch": 0.312689821199391, + "grad_norm": 0.83203125, + "learning_rate": 2.493655101539142e-05, + "loss": 0.8454, + "step": 7132 + }, + { + "epoch": 0.3127336644160482, + "grad_norm": 0.82421875, + "learning_rate": 2.493350932202948e-05, + "loss": 0.7983, + "step": 7133 + }, + { + "epoch": 0.31277750763270545, + "grad_norm": 0.98046875, + "learning_rate": 2.4930467787766697e-05, + "loss": 0.9174, + "step": 7134 + }, + { + "epoch": 0.3128213508493627, + "grad_norm": 0.75, + "learning_rate": 2.49274264126095e-05, + "loss": 0.8231, + "step": 7135 + }, + { + "epoch": 0.3128651940660199, + "grad_norm": 0.828125, + "learning_rate": 2.4924385196564336e-05, + "loss": 0.8509, + "step": 7136 + }, + { + "epoch": 0.31290903728267716, + "grad_norm": 0.83203125, + "learning_rate": 2.492134413963765e-05, + "loss": 0.8354, + "step": 7137 + }, + { + "epoch": 0.3129528804993344, + "grad_norm": 0.86328125, + "learning_rate": 2.491830324183585e-05, + "loss": 0.9523, + "step": 7138 + }, + { + "epoch": 0.31299672371599163, + "grad_norm": 0.796875, + "learning_rate": 2.4915262503165436e-05, + "loss": 0.7858, + "step": 7139 + }, + { + "epoch": 0.31304056693264887, + "grad_norm": 0.9921875, + "learning_rate": 2.491222192363284e-05, + "loss": 0.8554, + "step": 7140 + }, + { + "epoch": 0.3130844101493061, + "grad_norm": 0.8203125, + "learning_rate": 2.4909181503244493e-05, + "loss": 0.9881, + "step": 7141 + }, + { + "epoch": 0.31312825336596334, + "grad_norm": 0.76171875, + "learning_rate": 2.490614124200685e-05, + "loss": 0.6588, + "step": 7142 + }, + { + "epoch": 0.3131720965826206, + "grad_norm": 0.86328125, + "learning_rate": 2.4903101139926345e-05, + "loss": 0.746, + "step": 7143 + }, + { + "epoch": 0.3132159397992778, + "grad_norm": 0.921875, + "learning_rate": 2.4900061197009427e-05, + "loss": 0.8199, + "step": 7144 + }, + { + "epoch": 0.313259783015935, + "grad_norm": 0.890625, + "learning_rate": 2.4897021413262533e-05, + "loss": 0.8981, + "step": 7145 + }, + { + "epoch": 0.3133036262325922, + "grad_norm": 0.77734375, + "learning_rate": 2.489398178869211e-05, + "loss": 0.6966, + "step": 7146 + }, + { + "epoch": 0.31334746944924946, + "grad_norm": 0.84765625, + "learning_rate": 2.489094232330458e-05, + "loss": 0.7937, + "step": 7147 + }, + { + "epoch": 0.3133913126659067, + "grad_norm": 0.8984375, + "learning_rate": 2.4887903017106417e-05, + "loss": 0.9372, + "step": 7148 + }, + { + "epoch": 0.31343515588256393, + "grad_norm": 0.8359375, + "learning_rate": 2.488486387010407e-05, + "loss": 0.8313, + "step": 7149 + }, + { + "epoch": 0.31347899909922117, + "grad_norm": 0.8671875, + "learning_rate": 2.488182488230395e-05, + "loss": 0.7756, + "step": 7150 + }, + { + "epoch": 0.3135228423158784, + "grad_norm": 0.76953125, + "learning_rate": 2.48787860537125e-05, + "loss": 0.8655, + "step": 7151 + }, + { + "epoch": 0.31356668553253564, + "grad_norm": 0.85546875, + "learning_rate": 2.487574738433618e-05, + "loss": 0.7806, + "step": 7152 + }, + { + "epoch": 0.3136105287491929, + "grad_norm": 0.87890625, + "learning_rate": 2.4872708874181395e-05, + "loss": 0.8998, + "step": 7153 + }, + { + "epoch": 0.3136543719658501, + "grad_norm": 0.87109375, + "learning_rate": 2.4869670523254628e-05, + "loss": 0.919, + "step": 7154 + }, + { + "epoch": 0.31369821518250735, + "grad_norm": 0.90234375, + "learning_rate": 2.48666323315623e-05, + "loss": 0.7591, + "step": 7155 + }, + { + "epoch": 0.3137420583991646, + "grad_norm": 0.80078125, + "learning_rate": 2.4863594299110858e-05, + "loss": 0.8578, + "step": 7156 + }, + { + "epoch": 0.3137859016158218, + "grad_norm": 0.77734375, + "learning_rate": 2.4860556425906733e-05, + "loss": 0.9096, + "step": 7157 + }, + { + "epoch": 0.31382974483247905, + "grad_norm": 0.8046875, + "learning_rate": 2.485751871195633e-05, + "loss": 0.835, + "step": 7158 + }, + { + "epoch": 0.3138735880491363, + "grad_norm": 0.77734375, + "learning_rate": 2.4854481157266153e-05, + "loss": 0.6927, + "step": 7159 + }, + { + "epoch": 0.3139174312657935, + "grad_norm": 0.78515625, + "learning_rate": 2.4851443761842607e-05, + "loss": 0.95, + "step": 7160 + }, + { + "epoch": 0.31396127448245076, + "grad_norm": 0.87890625, + "learning_rate": 2.4848406525692124e-05, + "loss": 0.8366, + "step": 7161 + }, + { + "epoch": 0.314005117699108, + "grad_norm": 0.8828125, + "learning_rate": 2.4845369448821156e-05, + "loss": 0.736, + "step": 7162 + }, + { + "epoch": 0.31404896091576523, + "grad_norm": 0.9609375, + "learning_rate": 2.48423325312361e-05, + "loss": 0.8245, + "step": 7163 + }, + { + "epoch": 0.31409280413242247, + "grad_norm": 0.7890625, + "learning_rate": 2.483929577294346e-05, + "loss": 0.7352, + "step": 7164 + }, + { + "epoch": 0.3141366473490797, + "grad_norm": 0.8046875, + "learning_rate": 2.4836259173949626e-05, + "loss": 0.7373, + "step": 7165 + }, + { + "epoch": 0.31418049056573694, + "grad_norm": 0.80078125, + "learning_rate": 2.4833222734261052e-05, + "loss": 0.7457, + "step": 7166 + }, + { + "epoch": 0.31422433378239417, + "grad_norm": 0.8125, + "learning_rate": 2.4830186453884163e-05, + "loss": 0.7458, + "step": 7167 + }, + { + "epoch": 0.3142681769990514, + "grad_norm": 1.0078125, + "learning_rate": 2.4827150332825366e-05, + "loss": 0.779, + "step": 7168 + }, + { + "epoch": 0.31431202021570864, + "grad_norm": 1.203125, + "learning_rate": 2.4824114371091155e-05, + "loss": 0.7429, + "step": 7169 + }, + { + "epoch": 0.3143558634323659, + "grad_norm": 0.70703125, + "learning_rate": 2.482107856868795e-05, + "loss": 0.7924, + "step": 7170 + }, + { + "epoch": 0.3143997066490231, + "grad_norm": 0.859375, + "learning_rate": 2.4818042925622165e-05, + "loss": 0.794, + "step": 7171 + }, + { + "epoch": 0.31444354986568035, + "grad_norm": 0.890625, + "learning_rate": 2.481500744190024e-05, + "loss": 0.8892, + "step": 7172 + }, + { + "epoch": 0.3144873930823376, + "grad_norm": 0.734375, + "learning_rate": 2.481197211752857e-05, + "loss": 0.8269, + "step": 7173 + }, + { + "epoch": 0.3145312362989948, + "grad_norm": 0.875, + "learning_rate": 2.4808936952513673e-05, + "loss": 0.814, + "step": 7174 + }, + { + "epoch": 0.314575079515652, + "grad_norm": 0.859375, + "learning_rate": 2.480590194686192e-05, + "loss": 0.7602, + "step": 7175 + }, + { + "epoch": 0.31461892273230924, + "grad_norm": 0.859375, + "learning_rate": 2.4802867100579774e-05, + "loss": 0.7573, + "step": 7176 + }, + { + "epoch": 0.31466276594896647, + "grad_norm": 0.90625, + "learning_rate": 2.4799832413673653e-05, + "loss": 0.7359, + "step": 7177 + }, + { + "epoch": 0.3147066091656237, + "grad_norm": 0.8046875, + "learning_rate": 2.4796797886149947e-05, + "loss": 0.7442, + "step": 7178 + }, + { + "epoch": 0.31475045238228094, + "grad_norm": 0.73828125, + "learning_rate": 2.4793763518015168e-05, + "loss": 0.766, + "step": 7179 + }, + { + "epoch": 0.3147942955989382, + "grad_norm": 0.78515625, + "learning_rate": 2.4790729309275706e-05, + "loss": 0.6874, + "step": 7180 + }, + { + "epoch": 0.3148381388155954, + "grad_norm": 0.828125, + "learning_rate": 2.4787695259938006e-05, + "loss": 0.8834, + "step": 7181 + }, + { + "epoch": 0.31488198203225265, + "grad_norm": 0.81640625, + "learning_rate": 2.4784661370008467e-05, + "loss": 0.7647, + "step": 7182 + }, + { + "epoch": 0.3149258252489099, + "grad_norm": 0.82421875, + "learning_rate": 2.4781627639493555e-05, + "loss": 0.8127, + "step": 7183 + }, + { + "epoch": 0.3149696684655671, + "grad_norm": 0.79296875, + "learning_rate": 2.4778594068399673e-05, + "loss": 0.6966, + "step": 7184 + }, + { + "epoch": 0.31501351168222436, + "grad_norm": 0.81640625, + "learning_rate": 2.4775560656733244e-05, + "loss": 0.7881, + "step": 7185 + }, + { + "epoch": 0.3150573548988816, + "grad_norm": 0.85546875, + "learning_rate": 2.4772527404500734e-05, + "loss": 0.7553, + "step": 7186 + }, + { + "epoch": 0.3151011981155388, + "grad_norm": 0.82421875, + "learning_rate": 2.4769494311708552e-05, + "loss": 0.8473, + "step": 7187 + }, + { + "epoch": 0.31514504133219606, + "grad_norm": 0.77734375, + "learning_rate": 2.4766461378363127e-05, + "loss": 0.7895, + "step": 7188 + }, + { + "epoch": 0.3151888845488533, + "grad_norm": 0.78125, + "learning_rate": 2.4763428604470894e-05, + "loss": 0.8831, + "step": 7189 + }, + { + "epoch": 0.31523272776551053, + "grad_norm": 0.82421875, + "learning_rate": 2.4760395990038266e-05, + "loss": 0.7632, + "step": 7190 + }, + { + "epoch": 0.31527657098216777, + "grad_norm": 0.91015625, + "learning_rate": 2.475736353507169e-05, + "loss": 0.8796, + "step": 7191 + }, + { + "epoch": 0.315320414198825, + "grad_norm": 0.78125, + "learning_rate": 2.4754331239577566e-05, + "loss": 0.7628, + "step": 7192 + }, + { + "epoch": 0.31536425741548224, + "grad_norm": 0.7890625, + "learning_rate": 2.475129910356232e-05, + "loss": 0.7663, + "step": 7193 + }, + { + "epoch": 0.3154081006321395, + "grad_norm": 0.76171875, + "learning_rate": 2.474826712703242e-05, + "loss": 0.7263, + "step": 7194 + }, + { + "epoch": 0.3154519438487967, + "grad_norm": 0.83984375, + "learning_rate": 2.4745235309994262e-05, + "loss": 0.7683, + "step": 7195 + }, + { + "epoch": 0.31549578706545395, + "grad_norm": 0.8046875, + "learning_rate": 2.4742203652454277e-05, + "loss": 0.7385, + "step": 7196 + }, + { + "epoch": 0.3155396302821112, + "grad_norm": 0.859375, + "learning_rate": 2.473917215441889e-05, + "loss": 0.7637, + "step": 7197 + }, + { + "epoch": 0.3155834734987684, + "grad_norm": 0.80078125, + "learning_rate": 2.4736140815894503e-05, + "loss": 0.773, + "step": 7198 + }, + { + "epoch": 0.31562731671542565, + "grad_norm": 0.8125, + "learning_rate": 2.4733109636887587e-05, + "loss": 0.7995, + "step": 7199 + }, + { + "epoch": 0.3156711599320829, + "grad_norm": 0.765625, + "learning_rate": 2.473007861740454e-05, + "loss": 0.7768, + "step": 7200 + }, + { + "epoch": 0.3157150031487401, + "grad_norm": 0.87109375, + "learning_rate": 2.4727047757451794e-05, + "loss": 0.7063, + "step": 7201 + }, + { + "epoch": 0.31575884636539736, + "grad_norm": 0.859375, + "learning_rate": 2.4724017057035776e-05, + "loss": 0.8984, + "step": 7202 + }, + { + "epoch": 0.3158026895820546, + "grad_norm": 0.81640625, + "learning_rate": 2.4720986516162858e-05, + "loss": 0.8342, + "step": 7203 + }, + { + "epoch": 0.31584653279871183, + "grad_norm": 0.8125, + "learning_rate": 2.4717956134839537e-05, + "loss": 0.7849, + "step": 7204 + }, + { + "epoch": 0.31589037601536907, + "grad_norm": 0.8515625, + "learning_rate": 2.4714925913072218e-05, + "loss": 0.6677, + "step": 7205 + }, + { + "epoch": 0.31593421923202625, + "grad_norm": 0.78125, + "learning_rate": 2.4711895850867295e-05, + "loss": 0.8034, + "step": 7206 + }, + { + "epoch": 0.3159780624486835, + "grad_norm": 0.84375, + "learning_rate": 2.470886594823122e-05, + "loss": 0.805, + "step": 7207 + }, + { + "epoch": 0.3160219056653407, + "grad_norm": 0.8828125, + "learning_rate": 2.4705836205170362e-05, + "loss": 0.8593, + "step": 7208 + }, + { + "epoch": 0.31606574888199795, + "grad_norm": 0.83984375, + "learning_rate": 2.470280662169121e-05, + "loss": 0.8082, + "step": 7209 + }, + { + "epoch": 0.3161095920986552, + "grad_norm": 0.78515625, + "learning_rate": 2.4699777197800154e-05, + "loss": 0.8193, + "step": 7210 + }, + { + "epoch": 0.3161534353153124, + "grad_norm": 1.0625, + "learning_rate": 2.4696747933503616e-05, + "loss": 0.9486, + "step": 7211 + }, + { + "epoch": 0.31619727853196966, + "grad_norm": 0.85546875, + "learning_rate": 2.469371882880802e-05, + "loss": 0.9684, + "step": 7212 + }, + { + "epoch": 0.3162411217486269, + "grad_norm": 0.85546875, + "learning_rate": 2.4690689883719754e-05, + "loss": 0.8819, + "step": 7213 + }, + { + "epoch": 0.31628496496528413, + "grad_norm": 0.8125, + "learning_rate": 2.4687661098245296e-05, + "loss": 0.7487, + "step": 7214 + }, + { + "epoch": 0.31632880818194137, + "grad_norm": 0.78515625, + "learning_rate": 2.4684632472391035e-05, + "loss": 0.891, + "step": 7215 + }, + { + "epoch": 0.3163726513985986, + "grad_norm": 0.82421875, + "learning_rate": 2.4681604006163383e-05, + "loss": 0.8078, + "step": 7216 + }, + { + "epoch": 0.31641649461525584, + "grad_norm": 0.8359375, + "learning_rate": 2.4678575699568773e-05, + "loss": 0.73, + "step": 7217 + }, + { + "epoch": 0.3164603378319131, + "grad_norm": 0.82421875, + "learning_rate": 2.4675547552613588e-05, + "loss": 0.8534, + "step": 7218 + }, + { + "epoch": 0.3165041810485703, + "grad_norm": 0.6953125, + "learning_rate": 2.4672519565304307e-05, + "loss": 0.6864, + "step": 7219 + }, + { + "epoch": 0.31654802426522755, + "grad_norm": 0.84375, + "learning_rate": 2.466949173764731e-05, + "loss": 0.7743, + "step": 7220 + }, + { + "epoch": 0.3165918674818848, + "grad_norm": 0.8046875, + "learning_rate": 2.466646406964902e-05, + "loss": 0.8022, + "step": 7221 + }, + { + "epoch": 0.316635710698542, + "grad_norm": 0.86328125, + "learning_rate": 2.4663436561315856e-05, + "loss": 0.7444, + "step": 7222 + }, + { + "epoch": 0.31667955391519925, + "grad_norm": 0.7734375, + "learning_rate": 2.46604092126542e-05, + "loss": 0.8502, + "step": 7223 + }, + { + "epoch": 0.3167233971318565, + "grad_norm": 0.90234375, + "learning_rate": 2.465738202367054e-05, + "loss": 0.8591, + "step": 7224 + }, + { + "epoch": 0.3167672403485137, + "grad_norm": 0.828125, + "learning_rate": 2.465435499437124e-05, + "loss": 0.9423, + "step": 7225 + }, + { + "epoch": 0.31681108356517096, + "grad_norm": 0.8515625, + "learning_rate": 2.465132812476273e-05, + "loss": 0.8541, + "step": 7226 + }, + { + "epoch": 0.3168549267818282, + "grad_norm": 0.859375, + "learning_rate": 2.4648301414851427e-05, + "loss": 0.8849, + "step": 7227 + }, + { + "epoch": 0.31689876999848543, + "grad_norm": 0.796875, + "learning_rate": 2.4645274864643743e-05, + "loss": 0.8112, + "step": 7228 + }, + { + "epoch": 0.31694261321514267, + "grad_norm": 0.828125, + "learning_rate": 2.464224847414609e-05, + "loss": 0.8499, + "step": 7229 + }, + { + "epoch": 0.3169864564317999, + "grad_norm": 0.9296875, + "learning_rate": 2.4639222243364878e-05, + "loss": 0.8622, + "step": 7230 + }, + { + "epoch": 0.31703029964845714, + "grad_norm": 0.8671875, + "learning_rate": 2.4636196172306538e-05, + "loss": 0.8309, + "step": 7231 + }, + { + "epoch": 0.3170741428651144, + "grad_norm": 0.9140625, + "learning_rate": 2.4633170260977466e-05, + "loss": 0.8919, + "step": 7232 + }, + { + "epoch": 0.3171179860817716, + "grad_norm": 1.0625, + "learning_rate": 2.4630144509384046e-05, + "loss": 0.9208, + "step": 7233 + }, + { + "epoch": 0.31716182929842884, + "grad_norm": 0.9453125, + "learning_rate": 2.4627118917532766e-05, + "loss": 0.8348, + "step": 7234 + }, + { + "epoch": 0.3172056725150861, + "grad_norm": 0.8046875, + "learning_rate": 2.462409348543e-05, + "loss": 0.8659, + "step": 7235 + }, + { + "epoch": 0.31724951573174326, + "grad_norm": 0.79296875, + "learning_rate": 2.4621068213082156e-05, + "loss": 0.7126, + "step": 7236 + }, + { + "epoch": 0.3172933589484005, + "grad_norm": 0.81640625, + "learning_rate": 2.4618043100495646e-05, + "loss": 0.8571, + "step": 7237 + }, + { + "epoch": 0.31733720216505773, + "grad_norm": 0.828125, + "learning_rate": 2.461501814767686e-05, + "loss": 0.723, + "step": 7238 + }, + { + "epoch": 0.31738104538171497, + "grad_norm": 0.8203125, + "learning_rate": 2.4611993354632266e-05, + "loss": 0.8168, + "step": 7239 + }, + { + "epoch": 0.3174248885983722, + "grad_norm": 0.8203125, + "learning_rate": 2.460896872136824e-05, + "loss": 0.7511, + "step": 7240 + }, + { + "epoch": 0.31746873181502944, + "grad_norm": 0.8515625, + "learning_rate": 2.46059442478912e-05, + "loss": 0.7778, + "step": 7241 + }, + { + "epoch": 0.3175125750316867, + "grad_norm": 0.7890625, + "learning_rate": 2.4602919934207545e-05, + "loss": 0.7048, + "step": 7242 + }, + { + "epoch": 0.3175564182483439, + "grad_norm": 0.8125, + "learning_rate": 2.45998957803237e-05, + "loss": 0.8552, + "step": 7243 + }, + { + "epoch": 0.31760026146500114, + "grad_norm": 0.8125, + "learning_rate": 2.4596871786246023e-05, + "loss": 0.8035, + "step": 7244 + }, + { + "epoch": 0.3176441046816584, + "grad_norm": 0.84375, + "learning_rate": 2.4593847951981007e-05, + "loss": 0.8574, + "step": 7245 + }, + { + "epoch": 0.3176879478983156, + "grad_norm": 0.96484375, + "learning_rate": 2.4590824277535018e-05, + "loss": 0.7646, + "step": 7246 + }, + { + "epoch": 0.31773179111497285, + "grad_norm": 0.82421875, + "learning_rate": 2.458780076291447e-05, + "loss": 0.7759, + "step": 7247 + }, + { + "epoch": 0.3177756343316301, + "grad_norm": 0.9921875, + "learning_rate": 2.458477740812576e-05, + "loss": 0.6842, + "step": 7248 + }, + { + "epoch": 0.3178194775482873, + "grad_norm": 0.79296875, + "learning_rate": 2.458175421317528e-05, + "loss": 0.7201, + "step": 7249 + }, + { + "epoch": 0.31786332076494456, + "grad_norm": 0.80859375, + "learning_rate": 2.457873117806949e-05, + "loss": 0.7459, + "step": 7250 + }, + { + "epoch": 0.3179071639816018, + "grad_norm": 0.95703125, + "learning_rate": 2.457570830281477e-05, + "loss": 0.9193, + "step": 7251 + }, + { + "epoch": 0.31795100719825903, + "grad_norm": 0.86328125, + "learning_rate": 2.457268558741752e-05, + "loss": 0.8745, + "step": 7252 + }, + { + "epoch": 0.31799485041491626, + "grad_norm": 0.8203125, + "learning_rate": 2.456966303188416e-05, + "loss": 0.7857, + "step": 7253 + }, + { + "epoch": 0.3180386936315735, + "grad_norm": 0.83984375, + "learning_rate": 2.456664063622105e-05, + "loss": 0.8611, + "step": 7254 + }, + { + "epoch": 0.31808253684823073, + "grad_norm": 0.90625, + "learning_rate": 2.4563618400434662e-05, + "loss": 0.7621, + "step": 7255 + }, + { + "epoch": 0.31812638006488797, + "grad_norm": 0.890625, + "learning_rate": 2.4560596324531382e-05, + "loss": 0.8793, + "step": 7256 + }, + { + "epoch": 0.3181702232815452, + "grad_norm": 0.83203125, + "learning_rate": 2.4557574408517603e-05, + "loss": 0.802, + "step": 7257 + }, + { + "epoch": 0.31821406649820244, + "grad_norm": 0.81640625, + "learning_rate": 2.455455265239973e-05, + "loss": 0.7052, + "step": 7258 + }, + { + "epoch": 0.3182579097148597, + "grad_norm": 0.80078125, + "learning_rate": 2.455153105618414e-05, + "loss": 0.8312, + "step": 7259 + }, + { + "epoch": 0.3183017529315169, + "grad_norm": 0.921875, + "learning_rate": 2.4548509619877303e-05, + "loss": 0.8566, + "step": 7260 + }, + { + "epoch": 0.31834559614817415, + "grad_norm": 0.875, + "learning_rate": 2.454548834348558e-05, + "loss": 0.8327, + "step": 7261 + }, + { + "epoch": 0.3183894393648314, + "grad_norm": 0.80859375, + "learning_rate": 2.4542467227015388e-05, + "loss": 0.7645, + "step": 7262 + }, + { + "epoch": 0.3184332825814886, + "grad_norm": 0.86328125, + "learning_rate": 2.453944627047312e-05, + "loss": 0.8479, + "step": 7263 + }, + { + "epoch": 0.31847712579814585, + "grad_norm": 0.80859375, + "learning_rate": 2.4536425473865155e-05, + "loss": 0.8371, + "step": 7264 + }, + { + "epoch": 0.3185209690148031, + "grad_norm": 0.8125, + "learning_rate": 2.453340483719795e-05, + "loss": 0.9142, + "step": 7265 + }, + { + "epoch": 0.31856481223146027, + "grad_norm": 0.82421875, + "learning_rate": 2.4530384360477877e-05, + "loss": 0.8382, + "step": 7266 + }, + { + "epoch": 0.3186086554481175, + "grad_norm": 0.84765625, + "learning_rate": 2.4527364043711342e-05, + "loss": 0.8249, + "step": 7267 + }, + { + "epoch": 0.31865249866477474, + "grad_norm": 0.8671875, + "learning_rate": 2.4524343886904756e-05, + "loss": 0.9487, + "step": 7268 + }, + { + "epoch": 0.318696341881432, + "grad_norm": 0.77734375, + "learning_rate": 2.4521323890064496e-05, + "loss": 0.6856, + "step": 7269 + }, + { + "epoch": 0.3187401850980892, + "grad_norm": 0.83203125, + "learning_rate": 2.451830405319695e-05, + "loss": 0.7508, + "step": 7270 + }, + { + "epoch": 0.31878402831474645, + "grad_norm": 0.87109375, + "learning_rate": 2.4515284376308578e-05, + "loss": 0.9035, + "step": 7271 + }, + { + "epoch": 0.3188278715314037, + "grad_norm": 0.7890625, + "learning_rate": 2.451226485940574e-05, + "loss": 0.7973, + "step": 7272 + }, + { + "epoch": 0.3188717147480609, + "grad_norm": 0.91796875, + "learning_rate": 2.4509245502494848e-05, + "loss": 0.8824, + "step": 7273 + }, + { + "epoch": 0.31891555796471815, + "grad_norm": 0.7890625, + "learning_rate": 2.450622630558229e-05, + "loss": 0.7389, + "step": 7274 + }, + { + "epoch": 0.3189594011813754, + "grad_norm": 0.77734375, + "learning_rate": 2.450320726867448e-05, + "loss": 0.9104, + "step": 7275 + }, + { + "epoch": 0.3190032443980326, + "grad_norm": 0.89453125, + "learning_rate": 2.45001883917778e-05, + "loss": 0.8064, + "step": 7276 + }, + { + "epoch": 0.31904708761468986, + "grad_norm": 0.85546875, + "learning_rate": 2.4497169674898656e-05, + "loss": 0.8423, + "step": 7277 + }, + { + "epoch": 0.3190909308313471, + "grad_norm": 0.875, + "learning_rate": 2.449415111804344e-05, + "loss": 0.8602, + "step": 7278 + }, + { + "epoch": 0.31913477404800433, + "grad_norm": 0.84375, + "learning_rate": 2.4491132721218533e-05, + "loss": 0.8518, + "step": 7279 + }, + { + "epoch": 0.31917861726466157, + "grad_norm": 0.80078125, + "learning_rate": 2.448811448443038e-05, + "loss": 0.7626, + "step": 7280 + }, + { + "epoch": 0.3192224604813188, + "grad_norm": 0.8125, + "learning_rate": 2.448509640768536e-05, + "loss": 0.75, + "step": 7281 + }, + { + "epoch": 0.31926630369797604, + "grad_norm": 0.7578125, + "learning_rate": 2.4482078490989858e-05, + "loss": 0.7236, + "step": 7282 + }, + { + "epoch": 0.3193101469146333, + "grad_norm": 1.3671875, + "learning_rate": 2.447906073435027e-05, + "loss": 0.842, + "step": 7283 + }, + { + "epoch": 0.3193539901312905, + "grad_norm": 0.8046875, + "learning_rate": 2.4476043137772963e-05, + "loss": 0.8378, + "step": 7284 + }, + { + "epoch": 0.31939783334794775, + "grad_norm": 0.83984375, + "learning_rate": 2.4473025701264407e-05, + "loss": 0.7891, + "step": 7285 + }, + { + "epoch": 0.319441676564605, + "grad_norm": 0.73046875, + "learning_rate": 2.4470008424830947e-05, + "loss": 0.7184, + "step": 7286 + }, + { + "epoch": 0.3194855197812622, + "grad_norm": 0.7890625, + "learning_rate": 2.446699130847899e-05, + "loss": 0.7733, + "step": 7287 + }, + { + "epoch": 0.31952936299791945, + "grad_norm": 0.89453125, + "learning_rate": 2.4463974352214924e-05, + "loss": 0.8046, + "step": 7288 + }, + { + "epoch": 0.3195732062145767, + "grad_norm": 0.83984375, + "learning_rate": 2.4460957556045126e-05, + "loss": 0.8181, + "step": 7289 + }, + { + "epoch": 0.3196170494312339, + "grad_norm": 0.88671875, + "learning_rate": 2.445794091997603e-05, + "loss": 0.9205, + "step": 7290 + }, + { + "epoch": 0.31966089264789116, + "grad_norm": 0.84765625, + "learning_rate": 2.4454924444014015e-05, + "loss": 0.7979, + "step": 7291 + }, + { + "epoch": 0.3197047358645484, + "grad_norm": 0.7734375, + "learning_rate": 2.445190812816548e-05, + "loss": 0.7519, + "step": 7292 + }, + { + "epoch": 0.31974857908120563, + "grad_norm": 0.875, + "learning_rate": 2.4448891972436793e-05, + "loss": 0.7356, + "step": 7293 + }, + { + "epoch": 0.31979242229786287, + "grad_norm": 0.8671875, + "learning_rate": 2.4445875976834344e-05, + "loss": 0.8092, + "step": 7294 + }, + { + "epoch": 0.3198362655145201, + "grad_norm": 0.87890625, + "learning_rate": 2.4442860141364554e-05, + "loss": 0.7799, + "step": 7295 + }, + { + "epoch": 0.31988010873117734, + "grad_norm": 0.77734375, + "learning_rate": 2.4439844466033824e-05, + "loss": 0.76, + "step": 7296 + }, + { + "epoch": 0.3199239519478345, + "grad_norm": 0.7734375, + "learning_rate": 2.443682895084851e-05, + "loss": 0.7571, + "step": 7297 + }, + { + "epoch": 0.31996779516449175, + "grad_norm": 0.77734375, + "learning_rate": 2.4433813595815025e-05, + "loss": 0.7195, + "step": 7298 + }, + { + "epoch": 0.320011638381149, + "grad_norm": 0.90234375, + "learning_rate": 2.4430798400939713e-05, + "loss": 0.8457, + "step": 7299 + }, + { + "epoch": 0.3200554815978062, + "grad_norm": 0.84765625, + "learning_rate": 2.442778336622905e-05, + "loss": 0.8608, + "step": 7300 + }, + { + "epoch": 0.32009932481446346, + "grad_norm": 0.875, + "learning_rate": 2.4424768491689374e-05, + "loss": 0.8251, + "step": 7301 + }, + { + "epoch": 0.3201431680311207, + "grad_norm": 0.84375, + "learning_rate": 2.442175377732707e-05, + "loss": 0.7954, + "step": 7302 + }, + { + "epoch": 0.32018701124777793, + "grad_norm": 0.87109375, + "learning_rate": 2.4418739223148557e-05, + "loss": 0.8835, + "step": 7303 + }, + { + "epoch": 0.32023085446443517, + "grad_norm": 0.74609375, + "learning_rate": 2.441572482916017e-05, + "loss": 0.7284, + "step": 7304 + }, + { + "epoch": 0.3202746976810924, + "grad_norm": 0.828125, + "learning_rate": 2.441271059536836e-05, + "loss": 0.7948, + "step": 7305 + }, + { + "epoch": 0.32031854089774964, + "grad_norm": 0.8671875, + "learning_rate": 2.4409696521779492e-05, + "loss": 0.802, + "step": 7306 + }, + { + "epoch": 0.3203623841144069, + "grad_norm": 0.83984375, + "learning_rate": 2.4406682608399956e-05, + "loss": 0.8462, + "step": 7307 + }, + { + "epoch": 0.3204062273310641, + "grad_norm": 0.8671875, + "learning_rate": 2.4403668855236127e-05, + "loss": 0.996, + "step": 7308 + }, + { + "epoch": 0.32045007054772134, + "grad_norm": 0.83203125, + "learning_rate": 2.440065526229438e-05, + "loss": 0.7528, + "step": 7309 + }, + { + "epoch": 0.3204939137643786, + "grad_norm": 0.87109375, + "learning_rate": 2.4397641829581152e-05, + "loss": 0.9802, + "step": 7310 + }, + { + "epoch": 0.3205377569810358, + "grad_norm": 0.79296875, + "learning_rate": 2.4394628557102794e-05, + "loss": 0.8374, + "step": 7311 + }, + { + "epoch": 0.32058160019769305, + "grad_norm": 0.8515625, + "learning_rate": 2.43916154448657e-05, + "loss": 0.8208, + "step": 7312 + }, + { + "epoch": 0.3206254434143503, + "grad_norm": 0.84375, + "learning_rate": 2.4388602492876255e-05, + "loss": 0.7879, + "step": 7313 + }, + { + "epoch": 0.3206692866310075, + "grad_norm": 0.78515625, + "learning_rate": 2.4385589701140855e-05, + "loss": 0.7293, + "step": 7314 + }, + { + "epoch": 0.32071312984766476, + "grad_norm": 0.83984375, + "learning_rate": 2.4382577069665868e-05, + "loss": 0.7837, + "step": 7315 + }, + { + "epoch": 0.320756973064322, + "grad_norm": 0.890625, + "learning_rate": 2.4379564598457693e-05, + "loss": 0.8469, + "step": 7316 + }, + { + "epoch": 0.32080081628097923, + "grad_norm": 0.71875, + "learning_rate": 2.4376552287522703e-05, + "loss": 0.6774, + "step": 7317 + }, + { + "epoch": 0.32084465949763646, + "grad_norm": 0.71875, + "learning_rate": 2.4373540136867255e-05, + "loss": 0.8522, + "step": 7318 + }, + { + "epoch": 0.3208885027142937, + "grad_norm": 0.84375, + "learning_rate": 2.4370528146497807e-05, + "loss": 0.8311, + "step": 7319 + }, + { + "epoch": 0.32093234593095094, + "grad_norm": 0.796875, + "learning_rate": 2.436751631642069e-05, + "loss": 0.7269, + "step": 7320 + }, + { + "epoch": 0.32097618914760817, + "grad_norm": 0.80078125, + "learning_rate": 2.4364504646642307e-05, + "loss": 0.7905, + "step": 7321 + }, + { + "epoch": 0.3210200323642654, + "grad_norm": 0.765625, + "learning_rate": 2.436149313716902e-05, + "loss": 0.8333, + "step": 7322 + }, + { + "epoch": 0.32106387558092264, + "grad_norm": 0.7734375, + "learning_rate": 2.4358481788007238e-05, + "loss": 0.6628, + "step": 7323 + }, + { + "epoch": 0.3211077187975799, + "grad_norm": 0.80859375, + "learning_rate": 2.43554705991633e-05, + "loss": 0.8203, + "step": 7324 + }, + { + "epoch": 0.3211515620142371, + "grad_norm": 0.8359375, + "learning_rate": 2.435245957064365e-05, + "loss": 0.9229, + "step": 7325 + }, + { + "epoch": 0.32119540523089435, + "grad_norm": 0.8359375, + "learning_rate": 2.4349448702454637e-05, + "loss": 0.8154, + "step": 7326 + }, + { + "epoch": 0.32123924844755153, + "grad_norm": 0.8515625, + "learning_rate": 2.4346437994602633e-05, + "loss": 0.771, + "step": 7327 + }, + { + "epoch": 0.32128309166420876, + "grad_norm": 0.80078125, + "learning_rate": 2.4343427447094037e-05, + "loss": 0.7478, + "step": 7328 + }, + { + "epoch": 0.321326934880866, + "grad_norm": 0.82421875, + "learning_rate": 2.4340417059935196e-05, + "loss": 0.7653, + "step": 7329 + }, + { + "epoch": 0.32137077809752324, + "grad_norm": 0.83984375, + "learning_rate": 2.433740683313255e-05, + "loss": 0.7715, + "step": 7330 + }, + { + "epoch": 0.32141462131418047, + "grad_norm": 0.8671875, + "learning_rate": 2.4334396766692436e-05, + "loss": 0.917, + "step": 7331 + }, + { + "epoch": 0.3214584645308377, + "grad_norm": 0.8125, + "learning_rate": 2.433138686062124e-05, + "loss": 0.7453, + "step": 7332 + }, + { + "epoch": 0.32150230774749494, + "grad_norm": 0.9140625, + "learning_rate": 2.432837711492535e-05, + "loss": 0.8891, + "step": 7333 + }, + { + "epoch": 0.3215461509641522, + "grad_norm": 0.76171875, + "learning_rate": 2.432536752961111e-05, + "loss": 0.7749, + "step": 7334 + }, + { + "epoch": 0.3215899941808094, + "grad_norm": 0.78125, + "learning_rate": 2.4322358104684962e-05, + "loss": 0.8933, + "step": 7335 + }, + { + "epoch": 0.32163383739746665, + "grad_norm": 0.78125, + "learning_rate": 2.4319348840153245e-05, + "loss": 0.7668, + "step": 7336 + }, + { + "epoch": 0.3216776806141239, + "grad_norm": 0.7734375, + "learning_rate": 2.4316339736022344e-05, + "loss": 0.8034, + "step": 7337 + }, + { + "epoch": 0.3217215238307811, + "grad_norm": 0.97265625, + "learning_rate": 2.431333079229864e-05, + "loss": 0.8647, + "step": 7338 + }, + { + "epoch": 0.32176536704743836, + "grad_norm": 0.83984375, + "learning_rate": 2.4310322008988463e-05, + "loss": 0.7829, + "step": 7339 + }, + { + "epoch": 0.3218092102640956, + "grad_norm": 0.8203125, + "learning_rate": 2.4307313386098274e-05, + "loss": 0.9955, + "step": 7340 + }, + { + "epoch": 0.3218530534807528, + "grad_norm": 0.87890625, + "learning_rate": 2.4304304923634404e-05, + "loss": 0.8053, + "step": 7341 + }, + { + "epoch": 0.32189689669741006, + "grad_norm": 0.796875, + "learning_rate": 2.4301296621603232e-05, + "loss": 0.8216, + "step": 7342 + }, + { + "epoch": 0.3219407399140673, + "grad_norm": 0.7578125, + "learning_rate": 2.4298288480011133e-05, + "loss": 0.7952, + "step": 7343 + }, + { + "epoch": 0.32198458313072453, + "grad_norm": 0.79296875, + "learning_rate": 2.429528049886446e-05, + "loss": 0.7715, + "step": 7344 + }, + { + "epoch": 0.32202842634738177, + "grad_norm": 0.78125, + "learning_rate": 2.4292272678169636e-05, + "loss": 0.8176, + "step": 7345 + }, + { + "epoch": 0.322072269564039, + "grad_norm": 0.9140625, + "learning_rate": 2.4289265017933028e-05, + "loss": 0.9039, + "step": 7346 + }, + { + "epoch": 0.32211611278069624, + "grad_norm": 0.75, + "learning_rate": 2.428625751816098e-05, + "loss": 0.7768, + "step": 7347 + }, + { + "epoch": 0.3221599559973535, + "grad_norm": 0.85546875, + "learning_rate": 2.4283250178859885e-05, + "loss": 0.8619, + "step": 7348 + }, + { + "epoch": 0.3222037992140107, + "grad_norm": 0.8125, + "learning_rate": 2.4280243000036086e-05, + "loss": 0.8088, + "step": 7349 + }, + { + "epoch": 0.32224764243066795, + "grad_norm": 1.015625, + "learning_rate": 2.427723598169601e-05, + "loss": 0.6979, + "step": 7350 + }, + { + "epoch": 0.3222914856473252, + "grad_norm": 0.765625, + "learning_rate": 2.4274229123846003e-05, + "loss": 0.737, + "step": 7351 + }, + { + "epoch": 0.3223353288639824, + "grad_norm": 0.8515625, + "learning_rate": 2.4271222426492445e-05, + "loss": 0.806, + "step": 7352 + }, + { + "epoch": 0.32237917208063965, + "grad_norm": 0.84375, + "learning_rate": 2.426821588964171e-05, + "loss": 0.9255, + "step": 7353 + }, + { + "epoch": 0.3224230152972969, + "grad_norm": 0.7890625, + "learning_rate": 2.4265209513300147e-05, + "loss": 0.7503, + "step": 7354 + }, + { + "epoch": 0.3224668585139541, + "grad_norm": 0.77734375, + "learning_rate": 2.4262203297474117e-05, + "loss": 0.7569, + "step": 7355 + }, + { + "epoch": 0.32251070173061136, + "grad_norm": 0.72265625, + "learning_rate": 2.4259197242170062e-05, + "loss": 0.7868, + "step": 7356 + }, + { + "epoch": 0.32255454494726854, + "grad_norm": 0.75, + "learning_rate": 2.4256191347394296e-05, + "loss": 0.679, + "step": 7357 + }, + { + "epoch": 0.3225983881639258, + "grad_norm": 0.78125, + "learning_rate": 2.4253185613153207e-05, + "loss": 0.8338, + "step": 7358 + }, + { + "epoch": 0.322642231380583, + "grad_norm": 0.80859375, + "learning_rate": 2.425018003945315e-05, + "loss": 0.7446, + "step": 7359 + }, + { + "epoch": 0.32268607459724025, + "grad_norm": 0.96875, + "learning_rate": 2.4247174626300528e-05, + "loss": 0.8663, + "step": 7360 + }, + { + "epoch": 0.3227299178138975, + "grad_norm": 0.8203125, + "learning_rate": 2.4244169373701676e-05, + "loss": 0.8254, + "step": 7361 + }, + { + "epoch": 0.3227737610305547, + "grad_norm": 0.859375, + "learning_rate": 2.4241164281662977e-05, + "loss": 0.8683, + "step": 7362 + }, + { + "epoch": 0.32281760424721195, + "grad_norm": 0.8125, + "learning_rate": 2.4238159350190802e-05, + "loss": 0.8388, + "step": 7363 + }, + { + "epoch": 0.3228614474638692, + "grad_norm": 0.73828125, + "learning_rate": 2.4235154579291486e-05, + "loss": 0.7089, + "step": 7364 + }, + { + "epoch": 0.3229052906805264, + "grad_norm": 0.828125, + "learning_rate": 2.4232149968971463e-05, + "loss": 0.8026, + "step": 7365 + }, + { + "epoch": 0.32294913389718366, + "grad_norm": 0.8125, + "learning_rate": 2.4229145519237073e-05, + "loss": 0.8104, + "step": 7366 + }, + { + "epoch": 0.3229929771138409, + "grad_norm": 0.8515625, + "learning_rate": 2.4226141230094668e-05, + "loss": 0.73, + "step": 7367 + }, + { + "epoch": 0.32303682033049813, + "grad_norm": 0.90234375, + "learning_rate": 2.422313710155063e-05, + "loss": 0.7633, + "step": 7368 + }, + { + "epoch": 0.32308066354715537, + "grad_norm": 0.78125, + "learning_rate": 2.4220133133611288e-05, + "loss": 0.8189, + "step": 7369 + }, + { + "epoch": 0.3231245067638126, + "grad_norm": 0.80078125, + "learning_rate": 2.4217129326283083e-05, + "loss": 0.8735, + "step": 7370 + }, + { + "epoch": 0.32316834998046984, + "grad_norm": 0.81640625, + "learning_rate": 2.421412567957233e-05, + "loss": 0.8839, + "step": 7371 + }, + { + "epoch": 0.3232121931971271, + "grad_norm": 0.83203125, + "learning_rate": 2.4211122193485415e-05, + "loss": 0.7455, + "step": 7372 + }, + { + "epoch": 0.3232560364137843, + "grad_norm": 0.76171875, + "learning_rate": 2.4208118868028684e-05, + "loss": 0.8783, + "step": 7373 + }, + { + "epoch": 0.32329987963044154, + "grad_norm": 0.82421875, + "learning_rate": 2.420511570320849e-05, + "loss": 0.8007, + "step": 7374 + }, + { + "epoch": 0.3233437228470988, + "grad_norm": 0.78125, + "learning_rate": 2.4202112699031242e-05, + "loss": 0.7117, + "step": 7375 + }, + { + "epoch": 0.323387566063756, + "grad_norm": 0.87109375, + "learning_rate": 2.41991098555033e-05, + "loss": 0.8434, + "step": 7376 + }, + { + "epoch": 0.32343140928041325, + "grad_norm": 0.78125, + "learning_rate": 2.4196107172631e-05, + "loss": 0.6701, + "step": 7377 + }, + { + "epoch": 0.3234752524970705, + "grad_norm": 0.77734375, + "learning_rate": 2.4193104650420718e-05, + "loss": 0.7875, + "step": 7378 + }, + { + "epoch": 0.3235190957137277, + "grad_norm": 0.828125, + "learning_rate": 2.419010228887879e-05, + "loss": 0.818, + "step": 7379 + }, + { + "epoch": 0.32356293893038496, + "grad_norm": 0.70703125, + "learning_rate": 2.4187100088011637e-05, + "loss": 0.7247, + "step": 7380 + }, + { + "epoch": 0.3236067821470422, + "grad_norm": 0.8359375, + "learning_rate": 2.41840980478256e-05, + "loss": 0.8536, + "step": 7381 + }, + { + "epoch": 0.32365062536369943, + "grad_norm": 0.83203125, + "learning_rate": 2.4181096168327023e-05, + "loss": 0.7874, + "step": 7382 + }, + { + "epoch": 0.32369446858035666, + "grad_norm": 0.875, + "learning_rate": 2.417809444952228e-05, + "loss": 0.7779, + "step": 7383 + }, + { + "epoch": 0.3237383117970139, + "grad_norm": 0.79296875, + "learning_rate": 2.4175092891417704e-05, + "loss": 0.682, + "step": 7384 + }, + { + "epoch": 0.32378215501367114, + "grad_norm": 0.828125, + "learning_rate": 2.4172091494019722e-05, + "loss": 0.7993, + "step": 7385 + }, + { + "epoch": 0.32382599823032837, + "grad_norm": 0.79296875, + "learning_rate": 2.416909025733465e-05, + "loss": 0.8365, + "step": 7386 + }, + { + "epoch": 0.3238698414469856, + "grad_norm": 0.90625, + "learning_rate": 2.4166089181368855e-05, + "loss": 0.9463, + "step": 7387 + }, + { + "epoch": 0.3239136846636428, + "grad_norm": 0.80078125, + "learning_rate": 2.416308826612871e-05, + "loss": 0.7754, + "step": 7388 + }, + { + "epoch": 0.3239575278803, + "grad_norm": 0.8046875, + "learning_rate": 2.4160087511620555e-05, + "loss": 0.672, + "step": 7389 + }, + { + "epoch": 0.32400137109695726, + "grad_norm": 0.7578125, + "learning_rate": 2.4157086917850734e-05, + "loss": 0.8073, + "step": 7390 + }, + { + "epoch": 0.3240452143136145, + "grad_norm": 0.84375, + "learning_rate": 2.4154086484825655e-05, + "loss": 0.8951, + "step": 7391 + }, + { + "epoch": 0.32408905753027173, + "grad_norm": 0.8984375, + "learning_rate": 2.4151086212551664e-05, + "loss": 0.8546, + "step": 7392 + }, + { + "epoch": 0.32413290074692896, + "grad_norm": 0.94921875, + "learning_rate": 2.4148086101035105e-05, + "loss": 0.7703, + "step": 7393 + }, + { + "epoch": 0.3241767439635862, + "grad_norm": 0.77734375, + "learning_rate": 2.414508615028235e-05, + "loss": 0.7224, + "step": 7394 + }, + { + "epoch": 0.32422058718024344, + "grad_norm": 0.83984375, + "learning_rate": 2.414208636029971e-05, + "loss": 0.915, + "step": 7395 + }, + { + "epoch": 0.32426443039690067, + "grad_norm": 0.83203125, + "learning_rate": 2.4139086731093618e-05, + "loss": 0.7664, + "step": 7396 + }, + { + "epoch": 0.3243082736135579, + "grad_norm": 0.8203125, + "learning_rate": 2.4136087262670394e-05, + "loss": 0.7643, + "step": 7397 + }, + { + "epoch": 0.32435211683021514, + "grad_norm": 0.96484375, + "learning_rate": 2.4133087955036393e-05, + "loss": 0.7974, + "step": 7398 + }, + { + "epoch": 0.3243959600468724, + "grad_norm": 0.8125, + "learning_rate": 2.4130088808197983e-05, + "loss": 0.6817, + "step": 7399 + }, + { + "epoch": 0.3244398032635296, + "grad_norm": 0.82421875, + "learning_rate": 2.4127089822161496e-05, + "loss": 0.7718, + "step": 7400 + }, + { + "epoch": 0.32448364648018685, + "grad_norm": 0.84765625, + "learning_rate": 2.4124090996933324e-05, + "loss": 0.8592, + "step": 7401 + }, + { + "epoch": 0.3245274896968441, + "grad_norm": 1.2578125, + "learning_rate": 2.4121092332519802e-05, + "loss": 0.8193, + "step": 7402 + }, + { + "epoch": 0.3245713329135013, + "grad_norm": 0.80078125, + "learning_rate": 2.411809382892728e-05, + "loss": 0.8142, + "step": 7403 + }, + { + "epoch": 0.32461517613015856, + "grad_norm": 0.8359375, + "learning_rate": 2.4115095486162097e-05, + "loss": 0.8383, + "step": 7404 + }, + { + "epoch": 0.3246590193468158, + "grad_norm": 0.8828125, + "learning_rate": 2.4112097304230664e-05, + "loss": 0.8922, + "step": 7405 + }, + { + "epoch": 0.324702862563473, + "grad_norm": 0.73046875, + "learning_rate": 2.4109099283139292e-05, + "loss": 0.8257, + "step": 7406 + }, + { + "epoch": 0.32474670578013026, + "grad_norm": 0.80859375, + "learning_rate": 2.4106101422894357e-05, + "loss": 0.7924, + "step": 7407 + }, + { + "epoch": 0.3247905489967875, + "grad_norm": 0.8203125, + "learning_rate": 2.4103103723502195e-05, + "loss": 0.7597, + "step": 7408 + }, + { + "epoch": 0.32483439221344473, + "grad_norm": 0.8828125, + "learning_rate": 2.4100106184969173e-05, + "loss": 0.8709, + "step": 7409 + }, + { + "epoch": 0.32487823543010197, + "grad_norm": 0.7578125, + "learning_rate": 2.4097108807301605e-05, + "loss": 0.7282, + "step": 7410 + }, + { + "epoch": 0.3249220786467592, + "grad_norm": 0.8359375, + "learning_rate": 2.409411159050591e-05, + "loss": 0.8314, + "step": 7411 + }, + { + "epoch": 0.32496592186341644, + "grad_norm": 0.84765625, + "learning_rate": 2.40911145345884e-05, + "loss": 0.8198, + "step": 7412 + }, + { + "epoch": 0.3250097650800737, + "grad_norm": 0.94140625, + "learning_rate": 2.408811763955544e-05, + "loss": 0.9485, + "step": 7413 + }, + { + "epoch": 0.3250536082967309, + "grad_norm": 0.7890625, + "learning_rate": 2.408512090541337e-05, + "loss": 0.8104, + "step": 7414 + }, + { + "epoch": 0.32509745151338815, + "grad_norm": 0.828125, + "learning_rate": 2.408212433216852e-05, + "loss": 0.7779, + "step": 7415 + }, + { + "epoch": 0.3251412947300454, + "grad_norm": 0.78515625, + "learning_rate": 2.4079127919827306e-05, + "loss": 0.9062, + "step": 7416 + }, + { + "epoch": 0.3251851379467026, + "grad_norm": 0.8125, + "learning_rate": 2.407613166839603e-05, + "loss": 0.7468, + "step": 7417 + }, + { + "epoch": 0.3252289811633598, + "grad_norm": 0.8046875, + "learning_rate": 2.4073135577881056e-05, + "loss": 0.787, + "step": 7418 + }, + { + "epoch": 0.32527282438001703, + "grad_norm": 0.7734375, + "learning_rate": 2.4070139648288726e-05, + "loss": 0.7815, + "step": 7419 + }, + { + "epoch": 0.32531666759667427, + "grad_norm": 0.8125, + "learning_rate": 2.4067143879625375e-05, + "loss": 0.7773, + "step": 7420 + }, + { + "epoch": 0.3253605108133315, + "grad_norm": 0.83984375, + "learning_rate": 2.4064148271897403e-05, + "loss": 0.8386, + "step": 7421 + }, + { + "epoch": 0.32540435402998874, + "grad_norm": 0.8125, + "learning_rate": 2.4061152825111123e-05, + "loss": 0.8606, + "step": 7422 + }, + { + "epoch": 0.325448197246646, + "grad_norm": 0.82421875, + "learning_rate": 2.4058157539272886e-05, + "loss": 0.8533, + "step": 7423 + }, + { + "epoch": 0.3254920404633032, + "grad_norm": 0.80859375, + "learning_rate": 2.4055162414389056e-05, + "loss": 0.7348, + "step": 7424 + }, + { + "epoch": 0.32553588367996045, + "grad_norm": 0.875, + "learning_rate": 2.4052167450465924e-05, + "loss": 0.8359, + "step": 7425 + }, + { + "epoch": 0.3255797268966177, + "grad_norm": 0.87109375, + "learning_rate": 2.4049172647509922e-05, + "loss": 0.7679, + "step": 7426 + }, + { + "epoch": 0.3256235701132749, + "grad_norm": 0.82421875, + "learning_rate": 2.4046178005527354e-05, + "loss": 0.8376, + "step": 7427 + }, + { + "epoch": 0.32566741332993215, + "grad_norm": 0.7578125, + "learning_rate": 2.404318352452457e-05, + "loss": 0.7005, + "step": 7428 + }, + { + "epoch": 0.3257112565465894, + "grad_norm": 0.890625, + "learning_rate": 2.4040189204507913e-05, + "loss": 0.8257, + "step": 7429 + }, + { + "epoch": 0.3257550997632466, + "grad_norm": 0.84375, + "learning_rate": 2.4037195045483708e-05, + "loss": 0.8929, + "step": 7430 + }, + { + "epoch": 0.32579894297990386, + "grad_norm": 0.83984375, + "learning_rate": 2.403420104745836e-05, + "loss": 0.74, + "step": 7431 + }, + { + "epoch": 0.3258427861965611, + "grad_norm": 0.9453125, + "learning_rate": 2.4031207210438177e-05, + "loss": 0.8999, + "step": 7432 + }, + { + "epoch": 0.32588662941321833, + "grad_norm": 1.0078125, + "learning_rate": 2.40282135344295e-05, + "loss": 0.6725, + "step": 7433 + }, + { + "epoch": 0.32593047262987557, + "grad_norm": 0.83984375, + "learning_rate": 2.4025220019438698e-05, + "loss": 0.8957, + "step": 7434 + }, + { + "epoch": 0.3259743158465328, + "grad_norm": 0.76953125, + "learning_rate": 2.4022226665472058e-05, + "loss": 0.8572, + "step": 7435 + }, + { + "epoch": 0.32601815906319004, + "grad_norm": 0.84765625, + "learning_rate": 2.4019233472535997e-05, + "loss": 0.7751, + "step": 7436 + }, + { + "epoch": 0.3260620022798473, + "grad_norm": 0.83984375, + "learning_rate": 2.401624044063684e-05, + "loss": 0.7827, + "step": 7437 + }, + { + "epoch": 0.3261058454965045, + "grad_norm": 0.77734375, + "learning_rate": 2.4013247569780904e-05, + "loss": 0.7475, + "step": 7438 + }, + { + "epoch": 0.32614968871316175, + "grad_norm": 0.83203125, + "learning_rate": 2.4010254859974547e-05, + "loss": 0.8524, + "step": 7439 + }, + { + "epoch": 0.326193531929819, + "grad_norm": 0.85546875, + "learning_rate": 2.4007262311224122e-05, + "loss": 0.906, + "step": 7440 + }, + { + "epoch": 0.3262373751464762, + "grad_norm": 0.83203125, + "learning_rate": 2.4004269923535928e-05, + "loss": 0.8033, + "step": 7441 + }, + { + "epoch": 0.32628121836313345, + "grad_norm": 0.7890625, + "learning_rate": 2.4001277696916358e-05, + "loss": 0.7476, + "step": 7442 + }, + { + "epoch": 0.3263250615797907, + "grad_norm": 0.83984375, + "learning_rate": 2.3998285631371755e-05, + "loss": 0.7811, + "step": 7443 + }, + { + "epoch": 0.3263689047964479, + "grad_norm": 0.87109375, + "learning_rate": 2.399529372690843e-05, + "loss": 0.8624, + "step": 7444 + }, + { + "epoch": 0.32641274801310516, + "grad_norm": 0.73046875, + "learning_rate": 2.3992301983532727e-05, + "loss": 0.7272, + "step": 7445 + }, + { + "epoch": 0.3264565912297624, + "grad_norm": 0.8828125, + "learning_rate": 2.3989310401251008e-05, + "loss": 0.8853, + "step": 7446 + }, + { + "epoch": 0.32650043444641963, + "grad_norm": 0.8515625, + "learning_rate": 2.39863189800696e-05, + "loss": 0.7367, + "step": 7447 + }, + { + "epoch": 0.3265442776630768, + "grad_norm": 1.03125, + "learning_rate": 2.3983327719994843e-05, + "loss": 0.7819, + "step": 7448 + }, + { + "epoch": 0.32658812087973405, + "grad_norm": 0.78125, + "learning_rate": 2.3980336621033085e-05, + "loss": 0.7413, + "step": 7449 + }, + { + "epoch": 0.3266319640963913, + "grad_norm": 0.8125, + "learning_rate": 2.397734568319062e-05, + "loss": 0.7188, + "step": 7450 + }, + { + "epoch": 0.3266758073130485, + "grad_norm": 0.83984375, + "learning_rate": 2.397435490647385e-05, + "loss": 0.8629, + "step": 7451 + }, + { + "epoch": 0.32671965052970575, + "grad_norm": 0.7578125, + "learning_rate": 2.39713642908891e-05, + "loss": 0.8721, + "step": 7452 + }, + { + "epoch": 0.326763493746363, + "grad_norm": 0.87109375, + "learning_rate": 2.3968373836442693e-05, + "loss": 0.864, + "step": 7453 + }, + { + "epoch": 0.3268073369630202, + "grad_norm": 0.859375, + "learning_rate": 2.3965383543140974e-05, + "loss": 0.8236, + "step": 7454 + }, + { + "epoch": 0.32685118017967746, + "grad_norm": 0.74609375, + "learning_rate": 2.3962393410990246e-05, + "loss": 0.7096, + "step": 7455 + }, + { + "epoch": 0.3268950233963347, + "grad_norm": 0.8828125, + "learning_rate": 2.3959403439996907e-05, + "loss": 0.8651, + "step": 7456 + }, + { + "epoch": 0.32693886661299193, + "grad_norm": 0.8984375, + "learning_rate": 2.3956413630167273e-05, + "loss": 0.8717, + "step": 7457 + }, + { + "epoch": 0.32698270982964917, + "grad_norm": 0.90625, + "learning_rate": 2.395342398150767e-05, + "loss": 0.8187, + "step": 7458 + }, + { + "epoch": 0.3270265530463064, + "grad_norm": 0.85546875, + "learning_rate": 2.395043449402444e-05, + "loss": 0.7826, + "step": 7459 + }, + { + "epoch": 0.32707039626296364, + "grad_norm": 0.80859375, + "learning_rate": 2.394744516772389e-05, + "loss": 0.6875, + "step": 7460 + }, + { + "epoch": 0.32711423947962087, + "grad_norm": 0.8125, + "learning_rate": 2.3944456002612413e-05, + "loss": 0.7848, + "step": 7461 + }, + { + "epoch": 0.3271580826962781, + "grad_norm": 0.875, + "learning_rate": 2.3941466998696305e-05, + "loss": 0.8243, + "step": 7462 + }, + { + "epoch": 0.32720192591293534, + "grad_norm": 0.7890625, + "learning_rate": 2.3938478155981915e-05, + "loss": 0.8424, + "step": 7463 + }, + { + "epoch": 0.3272457691295926, + "grad_norm": 0.84765625, + "learning_rate": 2.393548947447558e-05, + "loss": 0.9339, + "step": 7464 + }, + { + "epoch": 0.3272896123462498, + "grad_norm": 0.8203125, + "learning_rate": 2.393250095418359e-05, + "loss": 0.7472, + "step": 7465 + }, + { + "epoch": 0.32733345556290705, + "grad_norm": 0.84765625, + "learning_rate": 2.3929512595112346e-05, + "loss": 0.9067, + "step": 7466 + }, + { + "epoch": 0.3273772987795643, + "grad_norm": 0.83984375, + "learning_rate": 2.3926524397268157e-05, + "loss": 0.8277, + "step": 7467 + }, + { + "epoch": 0.3274211419962215, + "grad_norm": 0.96484375, + "learning_rate": 2.392353636065735e-05, + "loss": 0.8081, + "step": 7468 + }, + { + "epoch": 0.32746498521287876, + "grad_norm": 0.8203125, + "learning_rate": 2.3920548485286253e-05, + "loss": 0.7692, + "step": 7469 + }, + { + "epoch": 0.327508828429536, + "grad_norm": 0.8359375, + "learning_rate": 2.3917560771161184e-05, + "loss": 0.797, + "step": 7470 + }, + { + "epoch": 0.3275526716461932, + "grad_norm": 0.87890625, + "learning_rate": 2.3914573218288528e-05, + "loss": 0.8399, + "step": 7471 + }, + { + "epoch": 0.32759651486285046, + "grad_norm": 0.8046875, + "learning_rate": 2.3911585826674577e-05, + "loss": 0.7393, + "step": 7472 + }, + { + "epoch": 0.3276403580795077, + "grad_norm": 0.81640625, + "learning_rate": 2.390859859632567e-05, + "loss": 0.8312, + "step": 7473 + }, + { + "epoch": 0.32768420129616493, + "grad_norm": 0.80078125, + "learning_rate": 2.390561152724814e-05, + "loss": 0.8478, + "step": 7474 + }, + { + "epoch": 0.32772804451282217, + "grad_norm": 0.69921875, + "learning_rate": 2.390262461944829e-05, + "loss": 0.7068, + "step": 7475 + }, + { + "epoch": 0.3277718877294794, + "grad_norm": 0.765625, + "learning_rate": 2.3899637872932513e-05, + "loss": 0.6745, + "step": 7476 + }, + { + "epoch": 0.32781573094613664, + "grad_norm": 0.90625, + "learning_rate": 2.3896651287707094e-05, + "loss": 0.8616, + "step": 7477 + }, + { + "epoch": 0.3278595741627939, + "grad_norm": 0.9609375, + "learning_rate": 2.3893664863778375e-05, + "loss": 0.9236, + "step": 7478 + }, + { + "epoch": 0.32790341737945106, + "grad_norm": 0.87890625, + "learning_rate": 2.3890678601152684e-05, + "loss": 0.9862, + "step": 7479 + }, + { + "epoch": 0.3279472605961083, + "grad_norm": 0.8515625, + "learning_rate": 2.3887692499836324e-05, + "loss": 0.7552, + "step": 7480 + }, + { + "epoch": 0.3279911038127655, + "grad_norm": 0.84765625, + "learning_rate": 2.3884706559835678e-05, + "loss": 0.8637, + "step": 7481 + }, + { + "epoch": 0.32803494702942276, + "grad_norm": 0.828125, + "learning_rate": 2.3881720781157047e-05, + "loss": 0.849, + "step": 7482 + }, + { + "epoch": 0.32807879024608, + "grad_norm": 0.9140625, + "learning_rate": 2.3878735163806754e-05, + "loss": 0.7712, + "step": 7483 + }, + { + "epoch": 0.32812263346273723, + "grad_norm": 0.75390625, + "learning_rate": 2.387574970779113e-05, + "loss": 0.797, + "step": 7484 + }, + { + "epoch": 0.32816647667939447, + "grad_norm": 0.81640625, + "learning_rate": 2.38727644131165e-05, + "loss": 0.7559, + "step": 7485 + }, + { + "epoch": 0.3282103198960517, + "grad_norm": 1.40625, + "learning_rate": 2.3869779279789205e-05, + "loss": 0.6688, + "step": 7486 + }, + { + "epoch": 0.32825416311270894, + "grad_norm": 0.75390625, + "learning_rate": 2.3866794307815554e-05, + "loss": 0.811, + "step": 7487 + }, + { + "epoch": 0.3282980063293662, + "grad_norm": 0.75, + "learning_rate": 2.3863809497201884e-05, + "loss": 0.7818, + "step": 7488 + }, + { + "epoch": 0.3283418495460234, + "grad_norm": 0.8125, + "learning_rate": 2.3860824847954488e-05, + "loss": 0.7632, + "step": 7489 + }, + { + "epoch": 0.32838569276268065, + "grad_norm": 0.78125, + "learning_rate": 2.3857840360079754e-05, + "loss": 0.8107, + "step": 7490 + }, + { + "epoch": 0.3284295359793379, + "grad_norm": 0.80078125, + "learning_rate": 2.3854856033583973e-05, + "loss": 0.8805, + "step": 7491 + }, + { + "epoch": 0.3284733791959951, + "grad_norm": 0.890625, + "learning_rate": 2.3851871868473464e-05, + "loss": 0.7319, + "step": 7492 + }, + { + "epoch": 0.32851722241265235, + "grad_norm": 0.765625, + "learning_rate": 2.3848887864754566e-05, + "loss": 0.7718, + "step": 7493 + }, + { + "epoch": 0.3285610656293096, + "grad_norm": 0.82421875, + "learning_rate": 2.3845904022433606e-05, + "loss": 0.7548, + "step": 7494 + }, + { + "epoch": 0.3286049088459668, + "grad_norm": 0.8125, + "learning_rate": 2.384292034151686e-05, + "loss": 0.8977, + "step": 7495 + }, + { + "epoch": 0.32864875206262406, + "grad_norm": 0.8359375, + "learning_rate": 2.3839936822010724e-05, + "loss": 0.9754, + "step": 7496 + }, + { + "epoch": 0.3286925952792813, + "grad_norm": 0.8671875, + "learning_rate": 2.3836953463921497e-05, + "loss": 0.776, + "step": 7497 + }, + { + "epoch": 0.32873643849593853, + "grad_norm": 0.8203125, + "learning_rate": 2.383397026725548e-05, + "loss": 0.7568, + "step": 7498 + }, + { + "epoch": 0.32878028171259577, + "grad_norm": 0.87890625, + "learning_rate": 2.3830987232019018e-05, + "loss": 0.8077, + "step": 7499 + }, + { + "epoch": 0.328824124929253, + "grad_norm": 0.82421875, + "learning_rate": 2.3828004358218393e-05, + "loss": 0.6713, + "step": 7500 + }, + { + "epoch": 0.32886796814591024, + "grad_norm": 0.78515625, + "learning_rate": 2.382502164585999e-05, + "loss": 0.8736, + "step": 7501 + }, + { + "epoch": 0.3289118113625675, + "grad_norm": 0.90625, + "learning_rate": 2.38220390949501e-05, + "loss": 0.7489, + "step": 7502 + }, + { + "epoch": 0.3289556545792247, + "grad_norm": 0.76953125, + "learning_rate": 2.3819056705495036e-05, + "loss": 0.7728, + "step": 7503 + }, + { + "epoch": 0.32899949779588195, + "grad_norm": 0.82421875, + "learning_rate": 2.381607447750114e-05, + "loss": 0.8071, + "step": 7504 + }, + { + "epoch": 0.3290433410125392, + "grad_norm": 0.73046875, + "learning_rate": 2.381309241097468e-05, + "loss": 0.8169, + "step": 7505 + }, + { + "epoch": 0.3290871842291964, + "grad_norm": 0.83203125, + "learning_rate": 2.3810110505922044e-05, + "loss": 0.8081, + "step": 7506 + }, + { + "epoch": 0.32913102744585365, + "grad_norm": 0.8203125, + "learning_rate": 2.380712876234954e-05, + "loss": 0.7486, + "step": 7507 + }, + { + "epoch": 0.3291748706625109, + "grad_norm": 0.86328125, + "learning_rate": 2.3804147180263458e-05, + "loss": 0.8673, + "step": 7508 + }, + { + "epoch": 0.32921871387916807, + "grad_norm": 0.890625, + "learning_rate": 2.380116575967013e-05, + "loss": 0.7858, + "step": 7509 + }, + { + "epoch": 0.3292625570958253, + "grad_norm": 0.8671875, + "learning_rate": 2.379818450057585e-05, + "loss": 0.8852, + "step": 7510 + }, + { + "epoch": 0.32930640031248254, + "grad_norm": 0.9765625, + "learning_rate": 2.3795203402986997e-05, + "loss": 0.7882, + "step": 7511 + }, + { + "epoch": 0.3293502435291398, + "grad_norm": 0.8359375, + "learning_rate": 2.3792222466909842e-05, + "loss": 0.8257, + "step": 7512 + }, + { + "epoch": 0.329394086745797, + "grad_norm": 0.8046875, + "learning_rate": 2.378924169235073e-05, + "loss": 0.827, + "step": 7513 + }, + { + "epoch": 0.32943792996245425, + "grad_norm": 0.85546875, + "learning_rate": 2.378626107931595e-05, + "loss": 0.822, + "step": 7514 + }, + { + "epoch": 0.3294817731791115, + "grad_norm": 0.8515625, + "learning_rate": 2.378328062781182e-05, + "loss": 0.8149, + "step": 7515 + }, + { + "epoch": 0.3295256163957687, + "grad_norm": 0.7890625, + "learning_rate": 2.3780300337844685e-05, + "loss": 0.7233, + "step": 7516 + }, + { + "epoch": 0.32956945961242595, + "grad_norm": 0.84765625, + "learning_rate": 2.377732020942086e-05, + "loss": 0.6996, + "step": 7517 + }, + { + "epoch": 0.3296133028290832, + "grad_norm": 0.828125, + "learning_rate": 2.3774340242546634e-05, + "loss": 0.857, + "step": 7518 + }, + { + "epoch": 0.3296571460457404, + "grad_norm": 0.93359375, + "learning_rate": 2.3771360437228353e-05, + "loss": 0.9637, + "step": 7519 + }, + { + "epoch": 0.32970098926239766, + "grad_norm": 0.765625, + "learning_rate": 2.376838079347228e-05, + "loss": 0.7743, + "step": 7520 + }, + { + "epoch": 0.3297448324790549, + "grad_norm": 0.828125, + "learning_rate": 2.3765401311284785e-05, + "loss": 0.9021, + "step": 7521 + }, + { + "epoch": 0.32978867569571213, + "grad_norm": 0.91796875, + "learning_rate": 2.376242199067218e-05, + "loss": 0.845, + "step": 7522 + }, + { + "epoch": 0.32983251891236937, + "grad_norm": 0.89453125, + "learning_rate": 2.3759442831640756e-05, + "loss": 0.9061, + "step": 7523 + }, + { + "epoch": 0.3298763621290266, + "grad_norm": 1.0859375, + "learning_rate": 2.375646383419684e-05, + "loss": 0.781, + "step": 7524 + }, + { + "epoch": 0.32992020534568384, + "grad_norm": 0.75, + "learning_rate": 2.3753484998346732e-05, + "loss": 0.6835, + "step": 7525 + }, + { + "epoch": 0.3299640485623411, + "grad_norm": 0.91015625, + "learning_rate": 2.3750506324096732e-05, + "loss": 0.7989, + "step": 7526 + }, + { + "epoch": 0.3300078917789983, + "grad_norm": 0.8515625, + "learning_rate": 2.3747527811453206e-05, + "loss": 0.8872, + "step": 7527 + }, + { + "epoch": 0.33005173499565554, + "grad_norm": 0.7890625, + "learning_rate": 2.3744549460422438e-05, + "loss": 0.7807, + "step": 7528 + }, + { + "epoch": 0.3300955782123128, + "grad_norm": 0.82421875, + "learning_rate": 2.3741571271010733e-05, + "loss": 0.8693, + "step": 7529 + }, + { + "epoch": 0.33013942142897, + "grad_norm": 0.8203125, + "learning_rate": 2.3738593243224417e-05, + "loss": 0.8403, + "step": 7530 + }, + { + "epoch": 0.33018326464562725, + "grad_norm": 0.87890625, + "learning_rate": 2.3735615377069786e-05, + "loss": 0.8999, + "step": 7531 + }, + { + "epoch": 0.3302271078622845, + "grad_norm": 0.77734375, + "learning_rate": 2.373263767255316e-05, + "loss": 0.7147, + "step": 7532 + }, + { + "epoch": 0.3302709510789417, + "grad_norm": 0.80078125, + "learning_rate": 2.3729660129680854e-05, + "loss": 0.819, + "step": 7533 + }, + { + "epoch": 0.33031479429559896, + "grad_norm": 0.77734375, + "learning_rate": 2.372668274845917e-05, + "loss": 0.6786, + "step": 7534 + }, + { + "epoch": 0.3303586375122562, + "grad_norm": 0.81640625, + "learning_rate": 2.3723705528894425e-05, + "loss": 0.7313, + "step": 7535 + }, + { + "epoch": 0.33040248072891343, + "grad_norm": 0.9140625, + "learning_rate": 2.3720728470992892e-05, + "loss": 0.7488, + "step": 7536 + }, + { + "epoch": 0.33044632394557066, + "grad_norm": 0.77734375, + "learning_rate": 2.371775157476095e-05, + "loss": 0.6801, + "step": 7537 + }, + { + "epoch": 0.3304901671622279, + "grad_norm": 0.80859375, + "learning_rate": 2.3714774840204866e-05, + "loss": 0.813, + "step": 7538 + }, + { + "epoch": 0.3305340103788851, + "grad_norm": 1.0078125, + "learning_rate": 2.3711798267330964e-05, + "loss": 0.8426, + "step": 7539 + }, + { + "epoch": 0.3305778535955423, + "grad_norm": 0.88671875, + "learning_rate": 2.3708821856145548e-05, + "loss": 0.7354, + "step": 7540 + }, + { + "epoch": 0.33062169681219955, + "grad_norm": 0.78515625, + "learning_rate": 2.3705845606654886e-05, + "loss": 0.7797, + "step": 7541 + }, + { + "epoch": 0.3306655400288568, + "grad_norm": 0.76171875, + "learning_rate": 2.3702869518865355e-05, + "loss": 0.7454, + "step": 7542 + }, + { + "epoch": 0.330709383245514, + "grad_norm": 0.79296875, + "learning_rate": 2.369989359278324e-05, + "loss": 0.646, + "step": 7543 + }, + { + "epoch": 0.33075322646217126, + "grad_norm": 0.83203125, + "learning_rate": 2.3696917828414834e-05, + "loss": 0.8033, + "step": 7544 + }, + { + "epoch": 0.3307970696788285, + "grad_norm": 0.87109375, + "learning_rate": 2.3693942225766442e-05, + "loss": 0.72, + "step": 7545 + }, + { + "epoch": 0.33084091289548573, + "grad_norm": 0.86328125, + "learning_rate": 2.369096678484435e-05, + "loss": 0.8391, + "step": 7546 + }, + { + "epoch": 0.33088475611214296, + "grad_norm": 0.921875, + "learning_rate": 2.368799150565493e-05, + "loss": 0.7626, + "step": 7547 + }, + { + "epoch": 0.3309285993288002, + "grad_norm": 0.7734375, + "learning_rate": 2.368501638820444e-05, + "loss": 0.7306, + "step": 7548 + }, + { + "epoch": 0.33097244254545743, + "grad_norm": 0.76171875, + "learning_rate": 2.3682041432499204e-05, + "loss": 0.7045, + "step": 7549 + }, + { + "epoch": 0.33101628576211467, + "grad_norm": 0.7734375, + "learning_rate": 2.3679066638545523e-05, + "loss": 0.8005, + "step": 7550 + }, + { + "epoch": 0.3310601289787719, + "grad_norm": 0.78515625, + "learning_rate": 2.367609200634966e-05, + "loss": 0.6945, + "step": 7551 + }, + { + "epoch": 0.33110397219542914, + "grad_norm": 0.82421875, + "learning_rate": 2.3673117535917988e-05, + "loss": 0.7434, + "step": 7552 + }, + { + "epoch": 0.3311478154120864, + "grad_norm": 0.77734375, + "learning_rate": 2.367014322725678e-05, + "loss": 0.8138, + "step": 7553 + }, + { + "epoch": 0.3311916586287436, + "grad_norm": 0.83203125, + "learning_rate": 2.3667169080372342e-05, + "loss": 0.8414, + "step": 7554 + }, + { + "epoch": 0.33123550184540085, + "grad_norm": 0.85546875, + "learning_rate": 2.366419509527097e-05, + "loss": 0.8756, + "step": 7555 + }, + { + "epoch": 0.3312793450620581, + "grad_norm": 0.79296875, + "learning_rate": 2.3661221271958956e-05, + "loss": 0.8026, + "step": 7556 + }, + { + "epoch": 0.3313231882787153, + "grad_norm": 0.6953125, + "learning_rate": 2.365824761044264e-05, + "loss": 0.7485, + "step": 7557 + }, + { + "epoch": 0.33136703149537255, + "grad_norm": 0.8515625, + "learning_rate": 2.365527411072831e-05, + "loss": 0.7922, + "step": 7558 + }, + { + "epoch": 0.3314108747120298, + "grad_norm": 0.796875, + "learning_rate": 2.3652300772822267e-05, + "loss": 0.8143, + "step": 7559 + }, + { + "epoch": 0.331454717928687, + "grad_norm": 0.81640625, + "learning_rate": 2.3649327596730796e-05, + "loss": 0.8341, + "step": 7560 + }, + { + "epoch": 0.33149856114534426, + "grad_norm": 0.8046875, + "learning_rate": 2.3646354582460196e-05, + "loss": 0.7025, + "step": 7561 + }, + { + "epoch": 0.3315424043620015, + "grad_norm": 0.8046875, + "learning_rate": 2.3643381730016802e-05, + "loss": 0.8219, + "step": 7562 + }, + { + "epoch": 0.33158624757865873, + "grad_norm": 0.87109375, + "learning_rate": 2.3640409039406896e-05, + "loss": 0.7898, + "step": 7563 + }, + { + "epoch": 0.33163009079531597, + "grad_norm": 0.81640625, + "learning_rate": 2.363743651063679e-05, + "loss": 0.7744, + "step": 7564 + }, + { + "epoch": 0.3316739340119732, + "grad_norm": 0.89453125, + "learning_rate": 2.3634464143712763e-05, + "loss": 0.7954, + "step": 7565 + }, + { + "epoch": 0.33171777722863044, + "grad_norm": 0.7734375, + "learning_rate": 2.3631491938641093e-05, + "loss": 0.905, + "step": 7566 + }, + { + "epoch": 0.3317616204452877, + "grad_norm": 0.89453125, + "learning_rate": 2.362851989542815e-05, + "loss": 0.8053, + "step": 7567 + }, + { + "epoch": 0.3318054636619449, + "grad_norm": 0.8125, + "learning_rate": 2.362554801408019e-05, + "loss": 0.7985, + "step": 7568 + }, + { + "epoch": 0.33184930687860215, + "grad_norm": 0.82421875, + "learning_rate": 2.3622576294603505e-05, + "loss": 0.7641, + "step": 7569 + }, + { + "epoch": 0.3318931500952593, + "grad_norm": 0.859375, + "learning_rate": 2.361960473700442e-05, + "loss": 0.9487, + "step": 7570 + }, + { + "epoch": 0.33193699331191656, + "grad_norm": 0.78515625, + "learning_rate": 2.3616633341289206e-05, + "loss": 0.7585, + "step": 7571 + }, + { + "epoch": 0.3319808365285738, + "grad_norm": 0.890625, + "learning_rate": 2.3613662107464175e-05, + "loss": 0.8083, + "step": 7572 + }, + { + "epoch": 0.33202467974523103, + "grad_norm": 0.82421875, + "learning_rate": 2.3610691035535627e-05, + "loss": 0.913, + "step": 7573 + }, + { + "epoch": 0.33206852296188827, + "grad_norm": 0.84375, + "learning_rate": 2.3607720125509848e-05, + "loss": 0.8153, + "step": 7574 + }, + { + "epoch": 0.3321123661785455, + "grad_norm": 0.8203125, + "learning_rate": 2.360474937739311e-05, + "loss": 0.9348, + "step": 7575 + }, + { + "epoch": 0.33215620939520274, + "grad_norm": 0.83203125, + "learning_rate": 2.3601778791191763e-05, + "loss": 0.7377, + "step": 7576 + }, + { + "epoch": 0.33220005261186, + "grad_norm": 0.828125, + "learning_rate": 2.3598808366912084e-05, + "loss": 0.9011, + "step": 7577 + }, + { + "epoch": 0.3322438958285172, + "grad_norm": 0.859375, + "learning_rate": 2.3595838104560365e-05, + "loss": 0.8035, + "step": 7578 + }, + { + "epoch": 0.33228773904517445, + "grad_norm": 0.87890625, + "learning_rate": 2.3592868004142897e-05, + "loss": 0.8739, + "step": 7579 + }, + { + "epoch": 0.3323315822618317, + "grad_norm": 0.796875, + "learning_rate": 2.3589898065665982e-05, + "loss": 0.7708, + "step": 7580 + }, + { + "epoch": 0.3323754254784889, + "grad_norm": 0.8125, + "learning_rate": 2.358692828913588e-05, + "loss": 0.7785, + "step": 7581 + }, + { + "epoch": 0.33241926869514615, + "grad_norm": 0.8203125, + "learning_rate": 2.3583958674558948e-05, + "loss": 0.8144, + "step": 7582 + }, + { + "epoch": 0.3324631119118034, + "grad_norm": 1.1796875, + "learning_rate": 2.358098922194144e-05, + "loss": 0.8113, + "step": 7583 + }, + { + "epoch": 0.3325069551284606, + "grad_norm": 0.85546875, + "learning_rate": 2.3578019931289675e-05, + "loss": 0.9067, + "step": 7584 + }, + { + "epoch": 0.33255079834511786, + "grad_norm": 0.98828125, + "learning_rate": 2.3575050802609912e-05, + "loss": 0.9504, + "step": 7585 + }, + { + "epoch": 0.3325946415617751, + "grad_norm": 0.83203125, + "learning_rate": 2.3572081835908432e-05, + "loss": 0.8068, + "step": 7586 + }, + { + "epoch": 0.33263848477843233, + "grad_norm": 1.1796875, + "learning_rate": 2.3569113031191592e-05, + "loss": 0.7804, + "step": 7587 + }, + { + "epoch": 0.33268232799508957, + "grad_norm": 0.78125, + "learning_rate": 2.356614438846565e-05, + "loss": 0.8302, + "step": 7588 + }, + { + "epoch": 0.3327261712117468, + "grad_norm": 0.80078125, + "learning_rate": 2.3563175907736888e-05, + "loss": 0.8635, + "step": 7589 + }, + { + "epoch": 0.33277001442840404, + "grad_norm": 0.7421875, + "learning_rate": 2.3560207589011608e-05, + "loss": 0.7269, + "step": 7590 + }, + { + "epoch": 0.3328138576450613, + "grad_norm": 0.83203125, + "learning_rate": 2.3557239432296074e-05, + "loss": 0.8002, + "step": 7591 + }, + { + "epoch": 0.3328577008617185, + "grad_norm": 0.87109375, + "learning_rate": 2.355427143759663e-05, + "loss": 0.8723, + "step": 7592 + }, + { + "epoch": 0.33290154407837574, + "grad_norm": 0.82421875, + "learning_rate": 2.3551303604919527e-05, + "loss": 0.9179, + "step": 7593 + }, + { + "epoch": 0.332945387295033, + "grad_norm": 0.97265625, + "learning_rate": 2.354833593427108e-05, + "loss": 0.7848, + "step": 7594 + }, + { + "epoch": 0.3329892305116902, + "grad_norm": 0.78515625, + "learning_rate": 2.3545368425657565e-05, + "loss": 0.7584, + "step": 7595 + }, + { + "epoch": 0.33303307372834745, + "grad_norm": 0.78515625, + "learning_rate": 2.354240107908524e-05, + "loss": 0.7455, + "step": 7596 + }, + { + "epoch": 0.3330769169450047, + "grad_norm": 0.8671875, + "learning_rate": 2.3539433894560448e-05, + "loss": 0.7868, + "step": 7597 + }, + { + "epoch": 0.3331207601616619, + "grad_norm": 0.7890625, + "learning_rate": 2.3536466872089457e-05, + "loss": 0.8073, + "step": 7598 + }, + { + "epoch": 0.33316460337831916, + "grad_norm": 0.7578125, + "learning_rate": 2.3533500011678566e-05, + "loss": 0.6942, + "step": 7599 + }, + { + "epoch": 0.33320844659497634, + "grad_norm": 0.83203125, + "learning_rate": 2.3530533313334046e-05, + "loss": 0.8945, + "step": 7600 + }, + { + "epoch": 0.3332522898116336, + "grad_norm": 0.859375, + "learning_rate": 2.352756677706215e-05, + "loss": 0.7661, + "step": 7601 + }, + { + "epoch": 0.3332961330282908, + "grad_norm": 0.85546875, + "learning_rate": 2.3524600402869244e-05, + "loss": 0.8917, + "step": 7602 + }, + { + "epoch": 0.33333997624494804, + "grad_norm": 0.86328125, + "learning_rate": 2.3521634190761576e-05, + "loss": 0.8178, + "step": 7603 + }, + { + "epoch": 0.3333838194616053, + "grad_norm": 0.79296875, + "learning_rate": 2.3518668140745435e-05, + "loss": 0.7743, + "step": 7604 + }, + { + "epoch": 0.3334276626782625, + "grad_norm": 0.90625, + "learning_rate": 2.3515702252827098e-05, + "loss": 0.8904, + "step": 7605 + }, + { + "epoch": 0.33347150589491975, + "grad_norm": 0.8046875, + "learning_rate": 2.3512736527012835e-05, + "loss": 0.8474, + "step": 7606 + }, + { + "epoch": 0.333515349111577, + "grad_norm": 0.8671875, + "learning_rate": 2.3509770963308986e-05, + "loss": 0.8964, + "step": 7607 + }, + { + "epoch": 0.3335591923282342, + "grad_norm": 0.765625, + "learning_rate": 2.3506805561721802e-05, + "loss": 0.7245, + "step": 7608 + }, + { + "epoch": 0.33360303554489146, + "grad_norm": 1.1796875, + "learning_rate": 2.3503840322257564e-05, + "loss": 0.9403, + "step": 7609 + }, + { + "epoch": 0.3336468787615487, + "grad_norm": 0.84765625, + "learning_rate": 2.350087524492258e-05, + "loss": 0.8946, + "step": 7610 + }, + { + "epoch": 0.33369072197820593, + "grad_norm": 0.8671875, + "learning_rate": 2.3497910329723104e-05, + "loss": 0.8806, + "step": 7611 + }, + { + "epoch": 0.33373456519486316, + "grad_norm": 0.8046875, + "learning_rate": 2.349494557666542e-05, + "loss": 0.8051, + "step": 7612 + }, + { + "epoch": 0.3337784084115204, + "grad_norm": 0.90625, + "learning_rate": 2.3491980985755846e-05, + "loss": 0.7914, + "step": 7613 + }, + { + "epoch": 0.33382225162817764, + "grad_norm": 0.7890625, + "learning_rate": 2.3489016557000642e-05, + "loss": 0.8556, + "step": 7614 + }, + { + "epoch": 0.33386609484483487, + "grad_norm": 0.8984375, + "learning_rate": 2.34860522904061e-05, + "loss": 0.7825, + "step": 7615 + }, + { + "epoch": 0.3339099380614921, + "grad_norm": 0.8359375, + "learning_rate": 2.3483088185978497e-05, + "loss": 0.8723, + "step": 7616 + }, + { + "epoch": 0.33395378127814934, + "grad_norm": 0.77734375, + "learning_rate": 2.348012424372411e-05, + "loss": 0.6504, + "step": 7617 + }, + { + "epoch": 0.3339976244948066, + "grad_norm": 0.8828125, + "learning_rate": 2.3477160463649227e-05, + "loss": 0.851, + "step": 7618 + }, + { + "epoch": 0.3340414677114638, + "grad_norm": 0.765625, + "learning_rate": 2.3474196845760133e-05, + "loss": 0.7131, + "step": 7619 + }, + { + "epoch": 0.33408531092812105, + "grad_norm": 0.86328125, + "learning_rate": 2.3471233390063107e-05, + "loss": 0.9576, + "step": 7620 + }, + { + "epoch": 0.3341291541447783, + "grad_norm": 0.8125, + "learning_rate": 2.3468270096564394e-05, + "loss": 0.7938, + "step": 7621 + }, + { + "epoch": 0.3341729973614355, + "grad_norm": 0.93359375, + "learning_rate": 2.3465306965270327e-05, + "loss": 0.8317, + "step": 7622 + }, + { + "epoch": 0.33421684057809276, + "grad_norm": 0.828125, + "learning_rate": 2.3462343996187187e-05, + "loss": 0.8651, + "step": 7623 + }, + { + "epoch": 0.33426068379475, + "grad_norm": 0.87890625, + "learning_rate": 2.3459381189321217e-05, + "loss": 0.8426, + "step": 7624 + }, + { + "epoch": 0.3343045270114072, + "grad_norm": 0.734375, + "learning_rate": 2.3456418544678727e-05, + "loss": 0.6918, + "step": 7625 + }, + { + "epoch": 0.33434837022806446, + "grad_norm": 0.83984375, + "learning_rate": 2.3453456062265942e-05, + "loss": 0.8025, + "step": 7626 + }, + { + "epoch": 0.3343922134447217, + "grad_norm": 0.69140625, + "learning_rate": 2.345049374208922e-05, + "loss": 0.7436, + "step": 7627 + }, + { + "epoch": 0.33443605666137893, + "grad_norm": 0.88671875, + "learning_rate": 2.3447531584154802e-05, + "loss": 0.8483, + "step": 7628 + }, + { + "epoch": 0.33447989987803617, + "grad_norm": 0.9140625, + "learning_rate": 2.344456958846897e-05, + "loss": 0.8014, + "step": 7629 + }, + { + "epoch": 0.33452374309469335, + "grad_norm": 0.84375, + "learning_rate": 2.3441607755037985e-05, + "loss": 0.8072, + "step": 7630 + }, + { + "epoch": 0.3345675863113506, + "grad_norm": 0.8515625, + "learning_rate": 2.343864608386811e-05, + "loss": 0.8311, + "step": 7631 + }, + { + "epoch": 0.3346114295280078, + "grad_norm": 0.80859375, + "learning_rate": 2.3435684574965676e-05, + "loss": 0.783, + "step": 7632 + }, + { + "epoch": 0.33465527274466506, + "grad_norm": 1.140625, + "learning_rate": 2.3432723228336938e-05, + "loss": 0.7153, + "step": 7633 + }, + { + "epoch": 0.3346991159613223, + "grad_norm": 0.78125, + "learning_rate": 2.3429762043988156e-05, + "loss": 0.8038, + "step": 7634 + }, + { + "epoch": 0.3347429591779795, + "grad_norm": 0.8515625, + "learning_rate": 2.3426801021925628e-05, + "loss": 0.7434, + "step": 7635 + }, + { + "epoch": 0.33478680239463676, + "grad_norm": 0.79296875, + "learning_rate": 2.3423840162155585e-05, + "loss": 0.824, + "step": 7636 + }, + { + "epoch": 0.334830645611294, + "grad_norm": 0.87890625, + "learning_rate": 2.3420879464684376e-05, + "loss": 0.8593, + "step": 7637 + }, + { + "epoch": 0.33487448882795123, + "grad_norm": 0.796875, + "learning_rate": 2.3417918929518223e-05, + "loss": 0.674, + "step": 7638 + }, + { + "epoch": 0.33491833204460847, + "grad_norm": 0.9375, + "learning_rate": 2.3414958556663413e-05, + "loss": 0.8796, + "step": 7639 + }, + { + "epoch": 0.3349621752612657, + "grad_norm": 0.8671875, + "learning_rate": 2.341199834612623e-05, + "loss": 0.8671, + "step": 7640 + }, + { + "epoch": 0.33500601847792294, + "grad_norm": 0.78515625, + "learning_rate": 2.34090382979129e-05, + "loss": 0.7994, + "step": 7641 + }, + { + "epoch": 0.3350498616945802, + "grad_norm": 0.83984375, + "learning_rate": 2.3406078412029774e-05, + "loss": 0.6505, + "step": 7642 + }, + { + "epoch": 0.3350937049112374, + "grad_norm": 0.7421875, + "learning_rate": 2.3403118688483083e-05, + "loss": 0.6861, + "step": 7643 + }, + { + "epoch": 0.33513754812789465, + "grad_norm": 0.91796875, + "learning_rate": 2.34001591272791e-05, + "loss": 0.8439, + "step": 7644 + }, + { + "epoch": 0.3351813913445519, + "grad_norm": 0.8125, + "learning_rate": 2.33971997284241e-05, + "loss": 0.7797, + "step": 7645 + }, + { + "epoch": 0.3352252345612091, + "grad_norm": 0.80078125, + "learning_rate": 2.3394240491924335e-05, + "loss": 0.8292, + "step": 7646 + }, + { + "epoch": 0.33526907777786635, + "grad_norm": 0.859375, + "learning_rate": 2.3391281417786125e-05, + "loss": 0.8939, + "step": 7647 + }, + { + "epoch": 0.3353129209945236, + "grad_norm": 0.859375, + "learning_rate": 2.3388322506015715e-05, + "loss": 0.8765, + "step": 7648 + }, + { + "epoch": 0.3353567642111808, + "grad_norm": 0.84765625, + "learning_rate": 2.3385363756619384e-05, + "loss": 0.8593, + "step": 7649 + }, + { + "epoch": 0.33540060742783806, + "grad_norm": 0.8359375, + "learning_rate": 2.3382405169603384e-05, + "loss": 0.7271, + "step": 7650 + }, + { + "epoch": 0.3354444506444953, + "grad_norm": 0.94921875, + "learning_rate": 2.3379446744973976e-05, + "loss": 0.7691, + "step": 7651 + }, + { + "epoch": 0.33548829386115253, + "grad_norm": 0.8203125, + "learning_rate": 2.337648848273747e-05, + "loss": 0.8303, + "step": 7652 + }, + { + "epoch": 0.33553213707780977, + "grad_norm": 0.80859375, + "learning_rate": 2.337353038290012e-05, + "loss": 0.8127, + "step": 7653 + }, + { + "epoch": 0.335575980294467, + "grad_norm": 0.8203125, + "learning_rate": 2.3370572445468196e-05, + "loss": 0.7657, + "step": 7654 + }, + { + "epoch": 0.33561982351112424, + "grad_norm": 0.859375, + "learning_rate": 2.336761467044797e-05, + "loss": 0.7882, + "step": 7655 + }, + { + "epoch": 0.3356636667277815, + "grad_norm": 0.78515625, + "learning_rate": 2.3364657057845695e-05, + "loss": 0.8609, + "step": 7656 + }, + { + "epoch": 0.3357075099444387, + "grad_norm": 0.84375, + "learning_rate": 2.3361699607667654e-05, + "loss": 0.7936, + "step": 7657 + }, + { + "epoch": 0.33575135316109594, + "grad_norm": 0.81640625, + "learning_rate": 2.335874231992011e-05, + "loss": 0.7383, + "step": 7658 + }, + { + "epoch": 0.3357951963777532, + "grad_norm": 0.875, + "learning_rate": 2.3355785194609326e-05, + "loss": 0.8893, + "step": 7659 + }, + { + "epoch": 0.3358390395944104, + "grad_norm": 0.77734375, + "learning_rate": 2.335282823174155e-05, + "loss": 0.705, + "step": 7660 + }, + { + "epoch": 0.3358828828110676, + "grad_norm": 0.8515625, + "learning_rate": 2.33498714313231e-05, + "loss": 0.7791, + "step": 7661 + }, + { + "epoch": 0.33592672602772483, + "grad_norm": 0.80859375, + "learning_rate": 2.334691479336022e-05, + "loss": 0.726, + "step": 7662 + }, + { + "epoch": 0.33597056924438207, + "grad_norm": 0.8515625, + "learning_rate": 2.334395831785917e-05, + "loss": 0.9414, + "step": 7663 + }, + { + "epoch": 0.3360144124610393, + "grad_norm": 0.7890625, + "learning_rate": 2.334100200482623e-05, + "loss": 0.7438, + "step": 7664 + }, + { + "epoch": 0.33605825567769654, + "grad_norm": 0.8125, + "learning_rate": 2.3338045854267644e-05, + "loss": 0.7669, + "step": 7665 + }, + { + "epoch": 0.3361020988943538, + "grad_norm": 0.78125, + "learning_rate": 2.333508986618965e-05, + "loss": 0.8114, + "step": 7666 + }, + { + "epoch": 0.336145942111011, + "grad_norm": 0.828125, + "learning_rate": 2.3332134040598597e-05, + "loss": 0.9477, + "step": 7667 + }, + { + "epoch": 0.33618978532766824, + "grad_norm": 0.86328125, + "learning_rate": 2.3329178377500694e-05, + "loss": 0.8878, + "step": 7668 + }, + { + "epoch": 0.3362336285443255, + "grad_norm": 0.83984375, + "learning_rate": 2.3326222876902216e-05, + "loss": 0.9403, + "step": 7669 + }, + { + "epoch": 0.3362774717609827, + "grad_norm": 0.81640625, + "learning_rate": 2.3323267538809435e-05, + "loss": 0.7929, + "step": 7670 + }, + { + "epoch": 0.33632131497763995, + "grad_norm": 0.7578125, + "learning_rate": 2.3320312363228592e-05, + "loss": 0.8346, + "step": 7671 + }, + { + "epoch": 0.3363651581942972, + "grad_norm": 0.7890625, + "learning_rate": 2.331735735016595e-05, + "loss": 0.7416, + "step": 7672 + }, + { + "epoch": 0.3364090014109544, + "grad_norm": 0.8046875, + "learning_rate": 2.331440249962781e-05, + "loss": 0.7066, + "step": 7673 + }, + { + "epoch": 0.33645284462761166, + "grad_norm": 0.76953125, + "learning_rate": 2.33114478116204e-05, + "loss": 0.7526, + "step": 7674 + }, + { + "epoch": 0.3364966878442689, + "grad_norm": 0.73828125, + "learning_rate": 2.3308493286150003e-05, + "loss": 0.6896, + "step": 7675 + }, + { + "epoch": 0.33654053106092613, + "grad_norm": 0.8046875, + "learning_rate": 2.330553892322287e-05, + "loss": 0.8731, + "step": 7676 + }, + { + "epoch": 0.33658437427758336, + "grad_norm": 0.79296875, + "learning_rate": 2.3302584722845234e-05, + "loss": 0.7922, + "step": 7677 + }, + { + "epoch": 0.3366282174942406, + "grad_norm": 1.078125, + "learning_rate": 2.329963068502341e-05, + "loss": 0.8197, + "step": 7678 + }, + { + "epoch": 0.33667206071089784, + "grad_norm": 0.84375, + "learning_rate": 2.329667680976364e-05, + "loss": 0.8341, + "step": 7679 + }, + { + "epoch": 0.33671590392755507, + "grad_norm": 0.7890625, + "learning_rate": 2.3293723097072185e-05, + "loss": 0.8374, + "step": 7680 + }, + { + "epoch": 0.3367597471442123, + "grad_norm": 0.79296875, + "learning_rate": 2.3290769546955292e-05, + "loss": 0.8788, + "step": 7681 + }, + { + "epoch": 0.33680359036086954, + "grad_norm": 0.828125, + "learning_rate": 2.3287816159419195e-05, + "loss": 0.8254, + "step": 7682 + }, + { + "epoch": 0.3368474335775268, + "grad_norm": 0.8046875, + "learning_rate": 2.328486293447022e-05, + "loss": 0.9074, + "step": 7683 + }, + { + "epoch": 0.336891276794184, + "grad_norm": 0.87109375, + "learning_rate": 2.3281909872114592e-05, + "loss": 0.9022, + "step": 7684 + }, + { + "epoch": 0.33693512001084125, + "grad_norm": 0.7578125, + "learning_rate": 2.3278956972358578e-05, + "loss": 0.6883, + "step": 7685 + }, + { + "epoch": 0.3369789632274985, + "grad_norm": 0.828125, + "learning_rate": 2.3276004235208425e-05, + "loss": 0.7416, + "step": 7686 + }, + { + "epoch": 0.3370228064441557, + "grad_norm": 0.875, + "learning_rate": 2.3273051660670376e-05, + "loss": 1.0013, + "step": 7687 + }, + { + "epoch": 0.33706664966081296, + "grad_norm": 0.8359375, + "learning_rate": 2.327009924875072e-05, + "loss": 0.9797, + "step": 7688 + }, + { + "epoch": 0.3371104928774702, + "grad_norm": 0.83984375, + "learning_rate": 2.3267146999455715e-05, + "loss": 0.9381, + "step": 7689 + }, + { + "epoch": 0.3371543360941274, + "grad_norm": 0.8828125, + "learning_rate": 2.3264194912791605e-05, + "loss": 0.7967, + "step": 7690 + }, + { + "epoch": 0.3371981793107846, + "grad_norm": 0.75390625, + "learning_rate": 2.326124298876464e-05, + "loss": 0.7312, + "step": 7691 + }, + { + "epoch": 0.33724202252744184, + "grad_norm": 0.80859375, + "learning_rate": 2.3258291227381068e-05, + "loss": 0.6676, + "step": 7692 + }, + { + "epoch": 0.3372858657440991, + "grad_norm": 0.76171875, + "learning_rate": 2.3255339628647176e-05, + "loss": 0.7792, + "step": 7693 + }, + { + "epoch": 0.3373297089607563, + "grad_norm": 0.74609375, + "learning_rate": 2.325238819256922e-05, + "loss": 0.9347, + "step": 7694 + }, + { + "epoch": 0.33737355217741355, + "grad_norm": 0.8203125, + "learning_rate": 2.3249436919153432e-05, + "loss": 0.8215, + "step": 7695 + }, + { + "epoch": 0.3374173953940708, + "grad_norm": 0.78125, + "learning_rate": 2.324648580840607e-05, + "loss": 0.8204, + "step": 7696 + }, + { + "epoch": 0.337461238610728, + "grad_norm": 0.859375, + "learning_rate": 2.3243534860333372e-05, + "loss": 0.7852, + "step": 7697 + }, + { + "epoch": 0.33750508182738526, + "grad_norm": 0.8125, + "learning_rate": 2.3240584074941652e-05, + "loss": 0.7461, + "step": 7698 + }, + { + "epoch": 0.3375489250440425, + "grad_norm": 0.82421875, + "learning_rate": 2.3237633452237117e-05, + "loss": 0.9432, + "step": 7699 + }, + { + "epoch": 0.3375927682606997, + "grad_norm": 0.84375, + "learning_rate": 2.323468299222603e-05, + "loss": 0.7179, + "step": 7700 + }, + { + "epoch": 0.33763661147735696, + "grad_norm": 0.8515625, + "learning_rate": 2.3231732694914643e-05, + "loss": 0.86, + "step": 7701 + }, + { + "epoch": 0.3376804546940142, + "grad_norm": 0.85546875, + "learning_rate": 2.3228782560309216e-05, + "loss": 0.8053, + "step": 7702 + }, + { + "epoch": 0.33772429791067143, + "grad_norm": 0.77734375, + "learning_rate": 2.3225832588415995e-05, + "loss": 0.7142, + "step": 7703 + }, + { + "epoch": 0.33776814112732867, + "grad_norm": 0.8515625, + "learning_rate": 2.322288277924123e-05, + "loss": 0.8854, + "step": 7704 + }, + { + "epoch": 0.3378119843439859, + "grad_norm": 0.7578125, + "learning_rate": 2.3219933132791183e-05, + "loss": 0.7148, + "step": 7705 + }, + { + "epoch": 0.33785582756064314, + "grad_norm": 0.8984375, + "learning_rate": 2.321698364907209e-05, + "loss": 0.8456, + "step": 7706 + }, + { + "epoch": 0.3378996707773004, + "grad_norm": 0.80078125, + "learning_rate": 2.3214034328090194e-05, + "loss": 0.745, + "step": 7707 + }, + { + "epoch": 0.3379435139939576, + "grad_norm": 0.83203125, + "learning_rate": 2.3211085169851788e-05, + "loss": 0.7678, + "step": 7708 + }, + { + "epoch": 0.33798735721061485, + "grad_norm": 0.76171875, + "learning_rate": 2.320813617436309e-05, + "loss": 0.7436, + "step": 7709 + }, + { + "epoch": 0.3380312004272721, + "grad_norm": 0.76953125, + "learning_rate": 2.3205187341630364e-05, + "loss": 0.7329, + "step": 7710 + }, + { + "epoch": 0.3380750436439293, + "grad_norm": 0.8984375, + "learning_rate": 2.3202238671659858e-05, + "loss": 0.7402, + "step": 7711 + }, + { + "epoch": 0.33811888686058655, + "grad_norm": 0.84375, + "learning_rate": 2.319929016445779e-05, + "loss": 0.8867, + "step": 7712 + }, + { + "epoch": 0.3381627300772438, + "grad_norm": 0.7421875, + "learning_rate": 2.3196341820030454e-05, + "loss": 0.6801, + "step": 7713 + }, + { + "epoch": 0.338206573293901, + "grad_norm": 0.82421875, + "learning_rate": 2.3193393638384085e-05, + "loss": 0.8165, + "step": 7714 + }, + { + "epoch": 0.33825041651055826, + "grad_norm": 1.0078125, + "learning_rate": 2.3190445619524936e-05, + "loss": 0.8692, + "step": 7715 + }, + { + "epoch": 0.3382942597272155, + "grad_norm": 0.75390625, + "learning_rate": 2.3187497763459242e-05, + "loss": 0.7923, + "step": 7716 + }, + { + "epoch": 0.33833810294387273, + "grad_norm": 0.7109375, + "learning_rate": 2.3184550070193232e-05, + "loss": 0.6993, + "step": 7717 + }, + { + "epoch": 0.33838194616052997, + "grad_norm": 0.84765625, + "learning_rate": 2.3181602539733206e-05, + "loss": 0.7895, + "step": 7718 + }, + { + "epoch": 0.3384257893771872, + "grad_norm": 0.71484375, + "learning_rate": 2.3178655172085374e-05, + "loss": 0.7788, + "step": 7719 + }, + { + "epoch": 0.33846963259384444, + "grad_norm": 0.8515625, + "learning_rate": 2.3175707967256e-05, + "loss": 0.8469, + "step": 7720 + }, + { + "epoch": 0.3385134758105016, + "grad_norm": 0.88671875, + "learning_rate": 2.3172760925251314e-05, + "loss": 0.9785, + "step": 7721 + }, + { + "epoch": 0.33855731902715885, + "grad_norm": 0.8984375, + "learning_rate": 2.3169814046077553e-05, + "loss": 0.9433, + "step": 7722 + }, + { + "epoch": 0.3386011622438161, + "grad_norm": 0.7890625, + "learning_rate": 2.316686732974099e-05, + "loss": 0.8392, + "step": 7723 + }, + { + "epoch": 0.3386450054604733, + "grad_norm": 0.7890625, + "learning_rate": 2.3163920776247883e-05, + "loss": 0.7403, + "step": 7724 + }, + { + "epoch": 0.33868884867713056, + "grad_norm": 0.84765625, + "learning_rate": 2.316097438560444e-05, + "loss": 0.7823, + "step": 7725 + }, + { + "epoch": 0.3387326918937878, + "grad_norm": 0.859375, + "learning_rate": 2.3158028157816923e-05, + "loss": 0.6867, + "step": 7726 + }, + { + "epoch": 0.33877653511044503, + "grad_norm": 0.82421875, + "learning_rate": 2.3155082092891544e-05, + "loss": 0.8119, + "step": 7727 + }, + { + "epoch": 0.33882037832710227, + "grad_norm": 0.80859375, + "learning_rate": 2.3152136190834605e-05, + "loss": 0.738, + "step": 7728 + }, + { + "epoch": 0.3388642215437595, + "grad_norm": 0.85546875, + "learning_rate": 2.3149190451652324e-05, + "loss": 0.8305, + "step": 7729 + }, + { + "epoch": 0.33890806476041674, + "grad_norm": 0.78125, + "learning_rate": 2.314624487535093e-05, + "loss": 0.7906, + "step": 7730 + }, + { + "epoch": 0.338951907977074, + "grad_norm": 0.79296875, + "learning_rate": 2.3143299461936695e-05, + "loss": 0.7465, + "step": 7731 + }, + { + "epoch": 0.3389957511937312, + "grad_norm": 0.8671875, + "learning_rate": 2.3140354211415806e-05, + "loss": 0.891, + "step": 7732 + }, + { + "epoch": 0.33903959441038845, + "grad_norm": 0.71484375, + "learning_rate": 2.3137409123794574e-05, + "loss": 0.698, + "step": 7733 + }, + { + "epoch": 0.3390834376270457, + "grad_norm": 0.84375, + "learning_rate": 2.31344641990792e-05, + "loss": 0.9165, + "step": 7734 + }, + { + "epoch": 0.3391272808437029, + "grad_norm": 0.85546875, + "learning_rate": 2.313151943727595e-05, + "loss": 0.7968, + "step": 7735 + }, + { + "epoch": 0.33917112406036015, + "grad_norm": 0.828125, + "learning_rate": 2.3128574838391036e-05, + "loss": 0.8329, + "step": 7736 + }, + { + "epoch": 0.3392149672770174, + "grad_norm": 0.69921875, + "learning_rate": 2.31256304024307e-05, + "loss": 0.774, + "step": 7737 + }, + { + "epoch": 0.3392588104936746, + "grad_norm": 0.87109375, + "learning_rate": 2.3122686129401216e-05, + "loss": 0.9504, + "step": 7738 + }, + { + "epoch": 0.33930265371033186, + "grad_norm": 0.8828125, + "learning_rate": 2.311974201930881e-05, + "loss": 0.8221, + "step": 7739 + }, + { + "epoch": 0.3393464969269891, + "grad_norm": 0.82421875, + "learning_rate": 2.311679807215972e-05, + "loss": 0.7691, + "step": 7740 + }, + { + "epoch": 0.33939034014364633, + "grad_norm": 0.88671875, + "learning_rate": 2.3113854287960167e-05, + "loss": 0.9612, + "step": 7741 + }, + { + "epoch": 0.33943418336030357, + "grad_norm": 0.86328125, + "learning_rate": 2.311091066671641e-05, + "loss": 0.7656, + "step": 7742 + }, + { + "epoch": 0.3394780265769608, + "grad_norm": 0.8359375, + "learning_rate": 2.310796720843469e-05, + "loss": 0.7896, + "step": 7743 + }, + { + "epoch": 0.33952186979361804, + "grad_norm": 0.8984375, + "learning_rate": 2.310502391312124e-05, + "loss": 0.9784, + "step": 7744 + }, + { + "epoch": 0.33956571301027527, + "grad_norm": 0.8359375, + "learning_rate": 2.3102080780782288e-05, + "loss": 0.7689, + "step": 7745 + }, + { + "epoch": 0.3396095562269325, + "grad_norm": 0.76171875, + "learning_rate": 2.3099137811424067e-05, + "loss": 0.7041, + "step": 7746 + }, + { + "epoch": 0.33965339944358974, + "grad_norm": 0.796875, + "learning_rate": 2.3096195005052833e-05, + "loss": 0.7854, + "step": 7747 + }, + { + "epoch": 0.339697242660247, + "grad_norm": 0.8046875, + "learning_rate": 2.3093252361674832e-05, + "loss": 0.7013, + "step": 7748 + }, + { + "epoch": 0.3397410858769042, + "grad_norm": 1.015625, + "learning_rate": 2.309030988129629e-05, + "loss": 0.8201, + "step": 7749 + }, + { + "epoch": 0.33978492909356145, + "grad_norm": 0.8515625, + "learning_rate": 2.3087367563923435e-05, + "loss": 0.7767, + "step": 7750 + }, + { + "epoch": 0.3398287723102187, + "grad_norm": 0.8359375, + "learning_rate": 2.3084425409562514e-05, + "loss": 0.8563, + "step": 7751 + }, + { + "epoch": 0.33987261552687587, + "grad_norm": 0.80078125, + "learning_rate": 2.308148341821972e-05, + "loss": 0.8185, + "step": 7752 + }, + { + "epoch": 0.3399164587435331, + "grad_norm": 0.8515625, + "learning_rate": 2.307854158990136e-05, + "loss": 0.8035, + "step": 7753 + }, + { + "epoch": 0.33996030196019034, + "grad_norm": 0.91015625, + "learning_rate": 2.3075599924613634e-05, + "loss": 0.7009, + "step": 7754 + }, + { + "epoch": 0.34000414517684757, + "grad_norm": 0.8125, + "learning_rate": 2.307265842236278e-05, + "loss": 0.834, + "step": 7755 + }, + { + "epoch": 0.3400479883935048, + "grad_norm": 0.77734375, + "learning_rate": 2.3069717083155017e-05, + "loss": 0.8206, + "step": 7756 + }, + { + "epoch": 0.34009183161016204, + "grad_norm": 0.875, + "learning_rate": 2.3066775906996574e-05, + "loss": 0.7929, + "step": 7757 + }, + { + "epoch": 0.3401356748268193, + "grad_norm": 0.83203125, + "learning_rate": 2.306383489389373e-05, + "loss": 0.9064, + "step": 7758 + }, + { + "epoch": 0.3401795180434765, + "grad_norm": 0.78515625, + "learning_rate": 2.3060894043852686e-05, + "loss": 0.7443, + "step": 7759 + }, + { + "epoch": 0.34022336126013375, + "grad_norm": 0.8984375, + "learning_rate": 2.305795335687968e-05, + "loss": 0.791, + "step": 7760 + }, + { + "epoch": 0.340267204476791, + "grad_norm": 0.7265625, + "learning_rate": 2.305501283298095e-05, + "loss": 0.7454, + "step": 7761 + }, + { + "epoch": 0.3403110476934482, + "grad_norm": 0.828125, + "learning_rate": 2.305207247216269e-05, + "loss": 0.7416, + "step": 7762 + }, + { + "epoch": 0.34035489091010546, + "grad_norm": 0.7578125, + "learning_rate": 2.3049132274431194e-05, + "loss": 0.7683, + "step": 7763 + }, + { + "epoch": 0.3403987341267627, + "grad_norm": 0.7890625, + "learning_rate": 2.3046192239792664e-05, + "loss": 0.8238, + "step": 7764 + }, + { + "epoch": 0.3404425773434199, + "grad_norm": 0.8046875, + "learning_rate": 2.3043252368253332e-05, + "loss": 0.7788, + "step": 7765 + }, + { + "epoch": 0.34048642056007716, + "grad_norm": 0.77734375, + "learning_rate": 2.3040312659819418e-05, + "loss": 0.8504, + "step": 7766 + }, + { + "epoch": 0.3405302637767344, + "grad_norm": 0.80078125, + "learning_rate": 2.303737311449714e-05, + "loss": 0.7377, + "step": 7767 + }, + { + "epoch": 0.34057410699339163, + "grad_norm": 1.03125, + "learning_rate": 2.303443373229278e-05, + "loss": 0.9688, + "step": 7768 + }, + { + "epoch": 0.34061795021004887, + "grad_norm": 0.89453125, + "learning_rate": 2.3031494513212548e-05, + "loss": 0.8068, + "step": 7769 + }, + { + "epoch": 0.3406617934267061, + "grad_norm": 0.796875, + "learning_rate": 2.3028555457262648e-05, + "loss": 0.7677, + "step": 7770 + }, + { + "epoch": 0.34070563664336334, + "grad_norm": 0.78515625, + "learning_rate": 2.3025616564449338e-05, + "loss": 0.7784, + "step": 7771 + }, + { + "epoch": 0.3407494798600206, + "grad_norm": 0.79296875, + "learning_rate": 2.3022677834778796e-05, + "loss": 0.7446, + "step": 7772 + }, + { + "epoch": 0.3407933230766778, + "grad_norm": 0.7734375, + "learning_rate": 2.3019739268257322e-05, + "loss": 0.82, + "step": 7773 + }, + { + "epoch": 0.34083716629333505, + "grad_norm": 0.796875, + "learning_rate": 2.301680086489111e-05, + "loss": 0.7647, + "step": 7774 + }, + { + "epoch": 0.3408810095099923, + "grad_norm": 0.796875, + "learning_rate": 2.3013862624686388e-05, + "loss": 0.7735, + "step": 7775 + }, + { + "epoch": 0.3409248527266495, + "grad_norm": 0.796875, + "learning_rate": 2.301092454764938e-05, + "loss": 0.7724, + "step": 7776 + }, + { + "epoch": 0.34096869594330675, + "grad_norm": 0.80078125, + "learning_rate": 2.3007986633786293e-05, + "loss": 0.8598, + "step": 7777 + }, + { + "epoch": 0.341012539159964, + "grad_norm": 0.78125, + "learning_rate": 2.3005048883103407e-05, + "loss": 0.8844, + "step": 7778 + }, + { + "epoch": 0.3410563823766212, + "grad_norm": 0.828125, + "learning_rate": 2.300211129560692e-05, + "loss": 0.9246, + "step": 7779 + }, + { + "epoch": 0.34110022559327846, + "grad_norm": 0.765625, + "learning_rate": 2.2999173871303058e-05, + "loss": 0.6222, + "step": 7780 + }, + { + "epoch": 0.3411440688099357, + "grad_norm": 0.8671875, + "learning_rate": 2.299623661019804e-05, + "loss": 0.9506, + "step": 7781 + }, + { + "epoch": 0.3411879120265929, + "grad_norm": 0.890625, + "learning_rate": 2.2993299512298106e-05, + "loss": 0.8709, + "step": 7782 + }, + { + "epoch": 0.3412317552432501, + "grad_norm": 0.84375, + "learning_rate": 2.2990362577609438e-05, + "loss": 0.7561, + "step": 7783 + }, + { + "epoch": 0.34127559845990735, + "grad_norm": 0.7421875, + "learning_rate": 2.298742580613832e-05, + "loss": 0.7045, + "step": 7784 + }, + { + "epoch": 0.3413194416765646, + "grad_norm": 0.828125, + "learning_rate": 2.2984489197890955e-05, + "loss": 0.797, + "step": 7785 + }, + { + "epoch": 0.3413632848932218, + "grad_norm": 0.82421875, + "learning_rate": 2.298155275287356e-05, + "loss": 0.8002, + "step": 7786 + }, + { + "epoch": 0.34140712810987905, + "grad_norm": 0.81640625, + "learning_rate": 2.2978616471092364e-05, + "loss": 0.8774, + "step": 7787 + }, + { + "epoch": 0.3414509713265363, + "grad_norm": 0.8203125, + "learning_rate": 2.2975680352553584e-05, + "loss": 0.7289, + "step": 7788 + }, + { + "epoch": 0.3414948145431935, + "grad_norm": 0.73046875, + "learning_rate": 2.2972744397263458e-05, + "loss": 0.7123, + "step": 7789 + }, + { + "epoch": 0.34153865775985076, + "grad_norm": 0.8359375, + "learning_rate": 2.2969808605228192e-05, + "loss": 0.8493, + "step": 7790 + }, + { + "epoch": 0.341582500976508, + "grad_norm": 0.8046875, + "learning_rate": 2.2966872976454014e-05, + "loss": 0.875, + "step": 7791 + }, + { + "epoch": 0.34162634419316523, + "grad_norm": 0.7890625, + "learning_rate": 2.296393751094711e-05, + "loss": 0.7712, + "step": 7792 + }, + { + "epoch": 0.34167018740982247, + "grad_norm": 0.828125, + "learning_rate": 2.2961002208713778e-05, + "loss": 0.8229, + "step": 7793 + }, + { + "epoch": 0.3417140306264797, + "grad_norm": 0.85546875, + "learning_rate": 2.2958067069760193e-05, + "loss": 0.9326, + "step": 7794 + }, + { + "epoch": 0.34175787384313694, + "grad_norm": 0.8828125, + "learning_rate": 2.2955132094092568e-05, + "loss": 0.9123, + "step": 7795 + }, + { + "epoch": 0.3418017170597942, + "grad_norm": 0.8671875, + "learning_rate": 2.2952197281717157e-05, + "loss": 0.7858, + "step": 7796 + }, + { + "epoch": 0.3418455602764514, + "grad_norm": 0.9375, + "learning_rate": 2.2949262632640123e-05, + "loss": 0.7885, + "step": 7797 + }, + { + "epoch": 0.34188940349310865, + "grad_norm": 0.78515625, + "learning_rate": 2.2946328146867745e-05, + "loss": 0.6864, + "step": 7798 + }, + { + "epoch": 0.3419332467097659, + "grad_norm": 0.79296875, + "learning_rate": 2.2943393824406233e-05, + "loss": 0.7498, + "step": 7799 + }, + { + "epoch": 0.3419770899264231, + "grad_norm": 0.76953125, + "learning_rate": 2.2940459665261794e-05, + "loss": 0.7266, + "step": 7800 + }, + { + "epoch": 0.34202093314308035, + "grad_norm": 0.859375, + "learning_rate": 2.2937525669440642e-05, + "loss": 0.8117, + "step": 7801 + }, + { + "epoch": 0.3420647763597376, + "grad_norm": 0.8125, + "learning_rate": 2.293459183694896e-05, + "loss": 0.803, + "step": 7802 + }, + { + "epoch": 0.3421086195763948, + "grad_norm": 0.8203125, + "learning_rate": 2.2931658167793047e-05, + "loss": 0.8325, + "step": 7803 + }, + { + "epoch": 0.34215246279305206, + "grad_norm": 0.85546875, + "learning_rate": 2.2928724661979073e-05, + "loss": 0.7773, + "step": 7804 + }, + { + "epoch": 0.3421963060097093, + "grad_norm": 1.015625, + "learning_rate": 2.2925791319513268e-05, + "loss": 0.7118, + "step": 7805 + }, + { + "epoch": 0.34224014922636653, + "grad_norm": 0.83203125, + "learning_rate": 2.2922858140401836e-05, + "loss": 0.7811, + "step": 7806 + }, + { + "epoch": 0.34228399244302377, + "grad_norm": 0.88671875, + "learning_rate": 2.2919925124650977e-05, + "loss": 0.7995, + "step": 7807 + }, + { + "epoch": 0.342327835659681, + "grad_norm": 1.125, + "learning_rate": 2.2916992272266956e-05, + "loss": 0.8903, + "step": 7808 + }, + { + "epoch": 0.34237167887633824, + "grad_norm": 0.703125, + "learning_rate": 2.2914059583255977e-05, + "loss": 0.6349, + "step": 7809 + }, + { + "epoch": 0.3424155220929955, + "grad_norm": 0.7734375, + "learning_rate": 2.2911127057624226e-05, + "loss": 0.7505, + "step": 7810 + }, + { + "epoch": 0.3424593653096527, + "grad_norm": 0.76953125, + "learning_rate": 2.2908194695377938e-05, + "loss": 0.8748, + "step": 7811 + }, + { + "epoch": 0.3425032085263099, + "grad_norm": 0.80859375, + "learning_rate": 2.2905262496523326e-05, + "loss": 0.8159, + "step": 7812 + }, + { + "epoch": 0.3425470517429671, + "grad_norm": 0.765625, + "learning_rate": 2.290233046106658e-05, + "loss": 0.6974, + "step": 7813 + }, + { + "epoch": 0.34259089495962436, + "grad_norm": 0.83203125, + "learning_rate": 2.2899398589013966e-05, + "loss": 0.9132, + "step": 7814 + }, + { + "epoch": 0.3426347381762816, + "grad_norm": 0.85546875, + "learning_rate": 2.289646688037167e-05, + "loss": 0.7372, + "step": 7815 + }, + { + "epoch": 0.34267858139293883, + "grad_norm": 0.81640625, + "learning_rate": 2.2893535335145898e-05, + "loss": 0.7758, + "step": 7816 + }, + { + "epoch": 0.34272242460959607, + "grad_norm": 0.8125, + "learning_rate": 2.2890603953342882e-05, + "loss": 0.7321, + "step": 7817 + }, + { + "epoch": 0.3427662678262533, + "grad_norm": 0.8671875, + "learning_rate": 2.288767273496879e-05, + "loss": 0.9505, + "step": 7818 + }, + { + "epoch": 0.34281011104291054, + "grad_norm": 0.83203125, + "learning_rate": 2.28847416800299e-05, + "loss": 0.7653, + "step": 7819 + }, + { + "epoch": 0.3428539542595678, + "grad_norm": 0.8125, + "learning_rate": 2.2881810788532375e-05, + "loss": 0.7765, + "step": 7820 + }, + { + "epoch": 0.342897797476225, + "grad_norm": 0.79296875, + "learning_rate": 2.2878880060482466e-05, + "loss": 0.795, + "step": 7821 + }, + { + "epoch": 0.34294164069288224, + "grad_norm": 0.75, + "learning_rate": 2.287594949588635e-05, + "loss": 0.8492, + "step": 7822 + }, + { + "epoch": 0.3429854839095395, + "grad_norm": 0.90234375, + "learning_rate": 2.2873019094750225e-05, + "loss": 0.8114, + "step": 7823 + }, + { + "epoch": 0.3430293271261967, + "grad_norm": 0.77734375, + "learning_rate": 2.2870088857080364e-05, + "loss": 0.7922, + "step": 7824 + }, + { + "epoch": 0.34307317034285395, + "grad_norm": 0.875, + "learning_rate": 2.2867158782882926e-05, + "loss": 0.7855, + "step": 7825 + }, + { + "epoch": 0.3431170135595112, + "grad_norm": 0.82421875, + "learning_rate": 2.286422887216415e-05, + "loss": 0.8783, + "step": 7826 + }, + { + "epoch": 0.3431608567761684, + "grad_norm": 0.84375, + "learning_rate": 2.2861299124930226e-05, + "loss": 0.7957, + "step": 7827 + }, + { + "epoch": 0.34320469999282566, + "grad_norm": 0.76953125, + "learning_rate": 2.2858369541187364e-05, + "loss": 0.827, + "step": 7828 + }, + { + "epoch": 0.3432485432094829, + "grad_norm": 0.7734375, + "learning_rate": 2.2855440120941784e-05, + "loss": 0.971, + "step": 7829 + }, + { + "epoch": 0.34329238642614013, + "grad_norm": 0.8046875, + "learning_rate": 2.2852510864199695e-05, + "loss": 0.8556, + "step": 7830 + }, + { + "epoch": 0.34333622964279736, + "grad_norm": 0.8046875, + "learning_rate": 2.2849581770967264e-05, + "loss": 0.8483, + "step": 7831 + }, + { + "epoch": 0.3433800728594546, + "grad_norm": 0.84765625, + "learning_rate": 2.284665284125077e-05, + "loss": 0.7875, + "step": 7832 + }, + { + "epoch": 0.34342391607611183, + "grad_norm": 0.78125, + "learning_rate": 2.2843724075056384e-05, + "loss": 0.9243, + "step": 7833 + }, + { + "epoch": 0.34346775929276907, + "grad_norm": 0.86328125, + "learning_rate": 2.2840795472390308e-05, + "loss": 0.8867, + "step": 7834 + }, + { + "epoch": 0.3435116025094263, + "grad_norm": 0.8359375, + "learning_rate": 2.2837867033258752e-05, + "loss": 0.8624, + "step": 7835 + }, + { + "epoch": 0.34355544572608354, + "grad_norm": 0.78125, + "learning_rate": 2.2834938757667933e-05, + "loss": 0.7998, + "step": 7836 + }, + { + "epoch": 0.3435992889427408, + "grad_norm": 0.83984375, + "learning_rate": 2.283201064562406e-05, + "loss": 0.7378, + "step": 7837 + }, + { + "epoch": 0.343643132159398, + "grad_norm": 0.9609375, + "learning_rate": 2.2829082697133285e-05, + "loss": 0.8576, + "step": 7838 + }, + { + "epoch": 0.34368697537605525, + "grad_norm": 0.7890625, + "learning_rate": 2.2826154912201903e-05, + "loss": 0.8578, + "step": 7839 + }, + { + "epoch": 0.3437308185927125, + "grad_norm": 0.7734375, + "learning_rate": 2.282322729083607e-05, + "loss": 0.7166, + "step": 7840 + }, + { + "epoch": 0.3437746618093697, + "grad_norm": 1.40625, + "learning_rate": 2.2820299833041982e-05, + "loss": 0.7373, + "step": 7841 + }, + { + "epoch": 0.34381850502602695, + "grad_norm": 0.921875, + "learning_rate": 2.2817372538825867e-05, + "loss": 0.8895, + "step": 7842 + }, + { + "epoch": 0.34386234824268413, + "grad_norm": 0.76171875, + "learning_rate": 2.2814445408193898e-05, + "loss": 0.6509, + "step": 7843 + }, + { + "epoch": 0.34390619145934137, + "grad_norm": 0.9296875, + "learning_rate": 2.2811518441152312e-05, + "loss": 0.7745, + "step": 7844 + }, + { + "epoch": 0.3439500346759986, + "grad_norm": 0.89453125, + "learning_rate": 2.280859163770731e-05, + "loss": 0.7676, + "step": 7845 + }, + { + "epoch": 0.34399387789265584, + "grad_norm": 0.8046875, + "learning_rate": 2.2805664997865096e-05, + "loss": 0.7704, + "step": 7846 + }, + { + "epoch": 0.3440377211093131, + "grad_norm": 0.80078125, + "learning_rate": 2.280273852163185e-05, + "loss": 0.8028, + "step": 7847 + }, + { + "epoch": 0.3440815643259703, + "grad_norm": 0.7578125, + "learning_rate": 2.279981220901376e-05, + "loss": 0.7263, + "step": 7848 + }, + { + "epoch": 0.34412540754262755, + "grad_norm": 0.80078125, + "learning_rate": 2.2796886060017086e-05, + "loss": 0.6352, + "step": 7849 + }, + { + "epoch": 0.3441692507592848, + "grad_norm": 0.89453125, + "learning_rate": 2.2793960074648004e-05, + "loss": 0.8579, + "step": 7850 + }, + { + "epoch": 0.344213093975942, + "grad_norm": 0.859375, + "learning_rate": 2.2791034252912704e-05, + "loss": 0.7544, + "step": 7851 + }, + { + "epoch": 0.34425693719259925, + "grad_norm": 0.80859375, + "learning_rate": 2.2788108594817393e-05, + "loss": 0.7821, + "step": 7852 + }, + { + "epoch": 0.3443007804092565, + "grad_norm": 0.8125, + "learning_rate": 2.2785183100368244e-05, + "loss": 0.8206, + "step": 7853 + }, + { + "epoch": 0.3443446236259137, + "grad_norm": 0.8125, + "learning_rate": 2.2782257769571526e-05, + "loss": 0.8132, + "step": 7854 + }, + { + "epoch": 0.34438846684257096, + "grad_norm": 0.8203125, + "learning_rate": 2.277933260243339e-05, + "loss": 0.7797, + "step": 7855 + }, + { + "epoch": 0.3444323100592282, + "grad_norm": 2.390625, + "learning_rate": 2.2776407598960036e-05, + "loss": 0.8571, + "step": 7856 + }, + { + "epoch": 0.34447615327588543, + "grad_norm": 0.7734375, + "learning_rate": 2.2773482759157682e-05, + "loss": 0.7999, + "step": 7857 + }, + { + "epoch": 0.34451999649254267, + "grad_norm": 0.7890625, + "learning_rate": 2.277055808303249e-05, + "loss": 0.7751, + "step": 7858 + }, + { + "epoch": 0.3445638397091999, + "grad_norm": 0.8515625, + "learning_rate": 2.276763357059072e-05, + "loss": 0.8153, + "step": 7859 + }, + { + "epoch": 0.34460768292585714, + "grad_norm": 0.7890625, + "learning_rate": 2.2764709221838522e-05, + "loss": 0.7816, + "step": 7860 + }, + { + "epoch": 0.3446515261425144, + "grad_norm": 0.99609375, + "learning_rate": 2.2761785036782123e-05, + "loss": 0.7531, + "step": 7861 + }, + { + "epoch": 0.3446953693591716, + "grad_norm": 0.73046875, + "learning_rate": 2.2758861015427692e-05, + "loss": 0.7265, + "step": 7862 + }, + { + "epoch": 0.34473921257582885, + "grad_norm": 0.71875, + "learning_rate": 2.2755937157781416e-05, + "loss": 0.7215, + "step": 7863 + }, + { + "epoch": 0.3447830557924861, + "grad_norm": 0.88671875, + "learning_rate": 2.275301346384955e-05, + "loss": 0.8966, + "step": 7864 + }, + { + "epoch": 0.3448268990091433, + "grad_norm": 0.82421875, + "learning_rate": 2.2750089933638252e-05, + "loss": 0.7981, + "step": 7865 + }, + { + "epoch": 0.34487074222580055, + "grad_norm": 1.1796875, + "learning_rate": 2.274716656715372e-05, + "loss": 0.8332, + "step": 7866 + }, + { + "epoch": 0.3449145854424578, + "grad_norm": 0.75, + "learning_rate": 2.2744243364402162e-05, + "loss": 0.7709, + "step": 7867 + }, + { + "epoch": 0.344958428659115, + "grad_norm": 0.83984375, + "learning_rate": 2.2741320325389736e-05, + "loss": 0.852, + "step": 7868 + }, + { + "epoch": 0.34500227187577226, + "grad_norm": 0.8515625, + "learning_rate": 2.2738397450122684e-05, + "loss": 0.9027, + "step": 7869 + }, + { + "epoch": 0.3450461150924295, + "grad_norm": 0.82421875, + "learning_rate": 2.273547473860719e-05, + "loss": 0.7042, + "step": 7870 + }, + { + "epoch": 0.34508995830908673, + "grad_norm": 1.0859375, + "learning_rate": 2.2732552190849444e-05, + "loss": 0.8313, + "step": 7871 + }, + { + "epoch": 0.34513380152574397, + "grad_norm": 0.84375, + "learning_rate": 2.2729629806855623e-05, + "loss": 0.8799, + "step": 7872 + }, + { + "epoch": 0.34517764474240115, + "grad_norm": 0.8125, + "learning_rate": 2.2726707586631946e-05, + "loss": 0.7248, + "step": 7873 + }, + { + "epoch": 0.3452214879590584, + "grad_norm": 0.78125, + "learning_rate": 2.2723785530184593e-05, + "loss": 0.8012, + "step": 7874 + }, + { + "epoch": 0.3452653311757156, + "grad_norm": 0.80859375, + "learning_rate": 2.272086363751976e-05, + "loss": 0.797, + "step": 7875 + }, + { + "epoch": 0.34530917439237285, + "grad_norm": 0.796875, + "learning_rate": 2.2717941908643647e-05, + "loss": 0.8367, + "step": 7876 + }, + { + "epoch": 0.3453530176090301, + "grad_norm": 0.9453125, + "learning_rate": 2.271502034356243e-05, + "loss": 0.8622, + "step": 7877 + }, + { + "epoch": 0.3453968608256873, + "grad_norm": 0.85546875, + "learning_rate": 2.2712098942282277e-05, + "loss": 0.7975, + "step": 7878 + }, + { + "epoch": 0.34544070404234456, + "grad_norm": 0.8046875, + "learning_rate": 2.2709177704809447e-05, + "loss": 0.9194, + "step": 7879 + }, + { + "epoch": 0.3454845472590018, + "grad_norm": 0.8125, + "learning_rate": 2.2706256631150103e-05, + "loss": 0.8965, + "step": 7880 + }, + { + "epoch": 0.34552839047565903, + "grad_norm": 0.90234375, + "learning_rate": 2.2703335721310424e-05, + "loss": 0.8344, + "step": 7881 + }, + { + "epoch": 0.34557223369231627, + "grad_norm": 0.80859375, + "learning_rate": 2.27004149752966e-05, + "loss": 0.8174, + "step": 7882 + }, + { + "epoch": 0.3456160769089735, + "grad_norm": 1.078125, + "learning_rate": 2.2697494393114816e-05, + "loss": 0.7752, + "step": 7883 + }, + { + "epoch": 0.34565992012563074, + "grad_norm": 0.81640625, + "learning_rate": 2.2694573974771293e-05, + "loss": 0.6974, + "step": 7884 + }, + { + "epoch": 0.345703763342288, + "grad_norm": 0.84765625, + "learning_rate": 2.26916537202722e-05, + "loss": 1.0143, + "step": 7885 + }, + { + "epoch": 0.3457476065589452, + "grad_norm": 0.77734375, + "learning_rate": 2.268873362962374e-05, + "loss": 0.8971, + "step": 7886 + }, + { + "epoch": 0.34579144977560244, + "grad_norm": 1.0390625, + "learning_rate": 2.268581370283208e-05, + "loss": 0.8616, + "step": 7887 + }, + { + "epoch": 0.3458352929922597, + "grad_norm": 0.83203125, + "learning_rate": 2.2682893939903395e-05, + "loss": 0.8431, + "step": 7888 + }, + { + "epoch": 0.3458791362089169, + "grad_norm": 0.8203125, + "learning_rate": 2.2679974340843912e-05, + "loss": 0.8027, + "step": 7889 + }, + { + "epoch": 0.34592297942557415, + "grad_norm": 0.7578125, + "learning_rate": 2.2677054905659822e-05, + "loss": 0.7447, + "step": 7890 + }, + { + "epoch": 0.3459668226422314, + "grad_norm": 0.7890625, + "learning_rate": 2.2674135634357285e-05, + "loss": 0.8378, + "step": 7891 + }, + { + "epoch": 0.3460106658588886, + "grad_norm": 0.8671875, + "learning_rate": 2.2671216526942494e-05, + "loss": 0.8161, + "step": 7892 + }, + { + "epoch": 0.34605450907554586, + "grad_norm": 0.92578125, + "learning_rate": 2.2668297583421616e-05, + "loss": 0.8179, + "step": 7893 + }, + { + "epoch": 0.3460983522922031, + "grad_norm": 1.265625, + "learning_rate": 2.2665378803800895e-05, + "loss": 0.8321, + "step": 7894 + }, + { + "epoch": 0.34614219550886033, + "grad_norm": 0.8046875, + "learning_rate": 2.2662460188086477e-05, + "loss": 0.807, + "step": 7895 + }, + { + "epoch": 0.34618603872551756, + "grad_norm": 0.83203125, + "learning_rate": 2.2659541736284563e-05, + "loss": 0.8165, + "step": 7896 + }, + { + "epoch": 0.3462298819421748, + "grad_norm": 0.8359375, + "learning_rate": 2.2656623448401314e-05, + "loss": 0.7796, + "step": 7897 + }, + { + "epoch": 0.34627372515883204, + "grad_norm": 0.734375, + "learning_rate": 2.265370532444292e-05, + "loss": 0.793, + "step": 7898 + }, + { + "epoch": 0.34631756837548927, + "grad_norm": 1.34375, + "learning_rate": 2.2650787364415595e-05, + "loss": 0.715, + "step": 7899 + }, + { + "epoch": 0.3463614115921465, + "grad_norm": 0.75390625, + "learning_rate": 2.264786956832551e-05, + "loss": 0.7021, + "step": 7900 + }, + { + "epoch": 0.34640525480880374, + "grad_norm": 0.79296875, + "learning_rate": 2.264495193617884e-05, + "loss": 0.7622, + "step": 7901 + }, + { + "epoch": 0.346449098025461, + "grad_norm": 0.8984375, + "learning_rate": 2.2642034467981764e-05, + "loss": 0.8058, + "step": 7902 + }, + { + "epoch": 0.3464929412421182, + "grad_norm": 0.82421875, + "learning_rate": 2.2639117163740466e-05, + "loss": 0.8243, + "step": 7903 + }, + { + "epoch": 0.3465367844587754, + "grad_norm": 0.828125, + "learning_rate": 2.263620002346115e-05, + "loss": 0.7971, + "step": 7904 + }, + { + "epoch": 0.34658062767543263, + "grad_norm": 0.828125, + "learning_rate": 2.263328304714999e-05, + "loss": 0.8235, + "step": 7905 + }, + { + "epoch": 0.34662447089208986, + "grad_norm": 0.82421875, + "learning_rate": 2.2630366234813172e-05, + "loss": 0.8004, + "step": 7906 + }, + { + "epoch": 0.3466683141087471, + "grad_norm": 0.80078125, + "learning_rate": 2.262744958645686e-05, + "loss": 0.8312, + "step": 7907 + }, + { + "epoch": 0.34671215732540434, + "grad_norm": 0.96484375, + "learning_rate": 2.262453310208722e-05, + "loss": 0.8922, + "step": 7908 + }, + { + "epoch": 0.34675600054206157, + "grad_norm": 0.828125, + "learning_rate": 2.26216167817105e-05, + "loss": 0.8287, + "step": 7909 + }, + { + "epoch": 0.3467998437587188, + "grad_norm": 0.78515625, + "learning_rate": 2.261870062533282e-05, + "loss": 0.7224, + "step": 7910 + }, + { + "epoch": 0.34684368697537604, + "grad_norm": 0.94140625, + "learning_rate": 2.261578463296039e-05, + "loss": 0.6931, + "step": 7911 + }, + { + "epoch": 0.3468875301920333, + "grad_norm": 0.82421875, + "learning_rate": 2.2612868804599386e-05, + "loss": 0.8348, + "step": 7912 + }, + { + "epoch": 0.3469313734086905, + "grad_norm": 0.875, + "learning_rate": 2.2609953140255978e-05, + "loss": 0.8583, + "step": 7913 + }, + { + "epoch": 0.34697521662534775, + "grad_norm": 0.90234375, + "learning_rate": 2.2607037639936356e-05, + "loss": 0.9337, + "step": 7914 + }, + { + "epoch": 0.347019059842005, + "grad_norm": 0.83203125, + "learning_rate": 2.260412230364669e-05, + "loss": 0.8626, + "step": 7915 + }, + { + "epoch": 0.3470629030586622, + "grad_norm": 0.78125, + "learning_rate": 2.2601207131393142e-05, + "loss": 0.7958, + "step": 7916 + }, + { + "epoch": 0.34710674627531946, + "grad_norm": 0.8203125, + "learning_rate": 2.259829212318193e-05, + "loss": 0.7427, + "step": 7917 + }, + { + "epoch": 0.3471505894919767, + "grad_norm": 0.92578125, + "learning_rate": 2.2595377279019227e-05, + "loss": 0.7589, + "step": 7918 + }, + { + "epoch": 0.3471944327086339, + "grad_norm": 0.8359375, + "learning_rate": 2.259246259891118e-05, + "loss": 0.8647, + "step": 7919 + }, + { + "epoch": 0.34723827592529116, + "grad_norm": 0.85546875, + "learning_rate": 2.2589548082864e-05, + "loss": 0.8997, + "step": 7920 + }, + { + "epoch": 0.3472821191419484, + "grad_norm": 0.84765625, + "learning_rate": 2.2586633730883844e-05, + "loss": 0.8862, + "step": 7921 + }, + { + "epoch": 0.34732596235860563, + "grad_norm": 0.765625, + "learning_rate": 2.258371954297689e-05, + "loss": 0.8103, + "step": 7922 + }, + { + "epoch": 0.34736980557526287, + "grad_norm": 0.83203125, + "learning_rate": 2.2580805519149285e-05, + "loss": 0.9481, + "step": 7923 + }, + { + "epoch": 0.3474136487919201, + "grad_norm": 0.7265625, + "learning_rate": 2.257789165940727e-05, + "loss": 0.8027, + "step": 7924 + }, + { + "epoch": 0.34745749200857734, + "grad_norm": 1.046875, + "learning_rate": 2.2574977963756995e-05, + "loss": 0.714, + "step": 7925 + }, + { + "epoch": 0.3475013352252346, + "grad_norm": 0.82421875, + "learning_rate": 2.257206443220462e-05, + "loss": 0.8485, + "step": 7926 + }, + { + "epoch": 0.3475451784418918, + "grad_norm": 0.859375, + "learning_rate": 2.2569151064756333e-05, + "loss": 0.8331, + "step": 7927 + }, + { + "epoch": 0.34758902165854905, + "grad_norm": 0.8828125, + "learning_rate": 2.2566237861418273e-05, + "loss": 0.8054, + "step": 7928 + }, + { + "epoch": 0.3476328648752063, + "grad_norm": 1.0546875, + "learning_rate": 2.2563324822196674e-05, + "loss": 0.8809, + "step": 7929 + }, + { + "epoch": 0.3476767080918635, + "grad_norm": 0.84765625, + "learning_rate": 2.256041194709768e-05, + "loss": 0.7661, + "step": 7930 + }, + { + "epoch": 0.34772055130852075, + "grad_norm": 0.76953125, + "learning_rate": 2.2557499236127476e-05, + "loss": 0.8083, + "step": 7931 + }, + { + "epoch": 0.347764394525178, + "grad_norm": 0.765625, + "learning_rate": 2.255458668929222e-05, + "loss": 0.7897, + "step": 7932 + }, + { + "epoch": 0.3478082377418352, + "grad_norm": 0.84375, + "learning_rate": 2.2551674306598058e-05, + "loss": 0.732, + "step": 7933 + }, + { + "epoch": 0.3478520809584924, + "grad_norm": 0.7421875, + "learning_rate": 2.2548762088051232e-05, + "loss": 0.7366, + "step": 7934 + }, + { + "epoch": 0.34789592417514964, + "grad_norm": 0.7421875, + "learning_rate": 2.2545850033657878e-05, + "loss": 0.8541, + "step": 7935 + }, + { + "epoch": 0.3479397673918069, + "grad_norm": 0.76171875, + "learning_rate": 2.2542938143424163e-05, + "loss": 0.7832, + "step": 7936 + }, + { + "epoch": 0.3479836106084641, + "grad_norm": 0.86328125, + "learning_rate": 2.2540026417356263e-05, + "loss": 0.9231, + "step": 7937 + }, + { + "epoch": 0.34802745382512135, + "grad_norm": 0.87109375, + "learning_rate": 2.253711485546033e-05, + "loss": 0.8427, + "step": 7938 + }, + { + "epoch": 0.3480712970417786, + "grad_norm": 0.77734375, + "learning_rate": 2.2534203457742575e-05, + "loss": 0.7648, + "step": 7939 + }, + { + "epoch": 0.3481151402584358, + "grad_norm": 0.8359375, + "learning_rate": 2.2531292224209144e-05, + "loss": 0.9099, + "step": 7940 + }, + { + "epoch": 0.34815898347509305, + "grad_norm": 0.80078125, + "learning_rate": 2.252838115486622e-05, + "loss": 0.6751, + "step": 7941 + }, + { + "epoch": 0.3482028266917503, + "grad_norm": 0.80859375, + "learning_rate": 2.252547024971996e-05, + "loss": 0.7501, + "step": 7942 + }, + { + "epoch": 0.3482466699084075, + "grad_norm": 0.828125, + "learning_rate": 2.2522559508776508e-05, + "loss": 0.9077, + "step": 7943 + }, + { + "epoch": 0.34829051312506476, + "grad_norm": 0.87109375, + "learning_rate": 2.2519648932042094e-05, + "loss": 0.9048, + "step": 7944 + }, + { + "epoch": 0.348334356341722, + "grad_norm": 0.76953125, + "learning_rate": 2.2516738519522862e-05, + "loss": 0.7709, + "step": 7945 + }, + { + "epoch": 0.34837819955837923, + "grad_norm": 1.078125, + "learning_rate": 2.251382827122496e-05, + "loss": 0.7671, + "step": 7946 + }, + { + "epoch": 0.34842204277503647, + "grad_norm": 0.83984375, + "learning_rate": 2.2510918187154583e-05, + "loss": 0.7625, + "step": 7947 + }, + { + "epoch": 0.3484658859916937, + "grad_norm": 0.81640625, + "learning_rate": 2.250800826731785e-05, + "loss": 0.7464, + "step": 7948 + }, + { + "epoch": 0.34850972920835094, + "grad_norm": 0.765625, + "learning_rate": 2.250509851172099e-05, + "loss": 0.8221, + "step": 7949 + }, + { + "epoch": 0.3485535724250082, + "grad_norm": 0.80078125, + "learning_rate": 2.2502188920370147e-05, + "loss": 0.791, + "step": 7950 + }, + { + "epoch": 0.3485974156416654, + "grad_norm": 0.87890625, + "learning_rate": 2.2499279493271486e-05, + "loss": 0.7406, + "step": 7951 + }, + { + "epoch": 0.34864125885832264, + "grad_norm": 0.76953125, + "learning_rate": 2.2496370230431175e-05, + "loss": 0.7736, + "step": 7952 + }, + { + "epoch": 0.3486851020749799, + "grad_norm": 0.8671875, + "learning_rate": 2.2493461131855377e-05, + "loss": 0.7617, + "step": 7953 + }, + { + "epoch": 0.3487289452916371, + "grad_norm": 0.73828125, + "learning_rate": 2.249055219755023e-05, + "loss": 0.8122, + "step": 7954 + }, + { + "epoch": 0.34877278850829435, + "grad_norm": 0.9375, + "learning_rate": 2.2487643427521942e-05, + "loss": 0.8189, + "step": 7955 + }, + { + "epoch": 0.3488166317249516, + "grad_norm": 0.73828125, + "learning_rate": 2.2484734821776676e-05, + "loss": 0.7022, + "step": 7956 + }, + { + "epoch": 0.3488604749416088, + "grad_norm": 0.84375, + "learning_rate": 2.2481826380320592e-05, + "loss": 0.8411, + "step": 7957 + }, + { + "epoch": 0.34890431815826606, + "grad_norm": 0.8125, + "learning_rate": 2.247891810315983e-05, + "loss": 0.9264, + "step": 7958 + }, + { + "epoch": 0.3489481613749233, + "grad_norm": 0.79296875, + "learning_rate": 2.2476009990300585e-05, + "loss": 0.6772, + "step": 7959 + }, + { + "epoch": 0.34899200459158053, + "grad_norm": 0.82421875, + "learning_rate": 2.2473102041748996e-05, + "loss": 0.8152, + "step": 7960 + }, + { + "epoch": 0.34903584780823776, + "grad_norm": 0.8359375, + "learning_rate": 2.2470194257511234e-05, + "loss": 0.7621, + "step": 7961 + }, + { + "epoch": 0.349079691024895, + "grad_norm": 0.92578125, + "learning_rate": 2.2467286637593466e-05, + "loss": 0.8436, + "step": 7962 + }, + { + "epoch": 0.34912353424155224, + "grad_norm": 0.859375, + "learning_rate": 2.246437918200186e-05, + "loss": 0.8191, + "step": 7963 + }, + { + "epoch": 0.3491673774582094, + "grad_norm": 0.8125, + "learning_rate": 2.2461471890742537e-05, + "loss": 0.8783, + "step": 7964 + }, + { + "epoch": 0.34921122067486665, + "grad_norm": 0.76953125, + "learning_rate": 2.245856476382172e-05, + "loss": 0.7171, + "step": 7965 + }, + { + "epoch": 0.3492550638915239, + "grad_norm": 0.77734375, + "learning_rate": 2.2455657801245543e-05, + "loss": 0.8431, + "step": 7966 + }, + { + "epoch": 0.3492989071081811, + "grad_norm": 0.74609375, + "learning_rate": 2.2452751003020166e-05, + "loss": 0.6805, + "step": 7967 + }, + { + "epoch": 0.34934275032483836, + "grad_norm": 0.78515625, + "learning_rate": 2.244984436915175e-05, + "loss": 0.7754, + "step": 7968 + }, + { + "epoch": 0.3493865935414956, + "grad_norm": 0.88671875, + "learning_rate": 2.2446937899646424e-05, + "loss": 0.8699, + "step": 7969 + }, + { + "epoch": 0.34943043675815283, + "grad_norm": 0.828125, + "learning_rate": 2.244403159451042e-05, + "loss": 0.8583, + "step": 7970 + }, + { + "epoch": 0.34947427997481006, + "grad_norm": 0.84765625, + "learning_rate": 2.2441125453749857e-05, + "loss": 0.8115, + "step": 7971 + }, + { + "epoch": 0.3495181231914673, + "grad_norm": 0.8515625, + "learning_rate": 2.2438219477370893e-05, + "loss": 0.8083, + "step": 7972 + }, + { + "epoch": 0.34956196640812454, + "grad_norm": 1.3203125, + "learning_rate": 2.243531366537969e-05, + "loss": 0.7577, + "step": 7973 + }, + { + "epoch": 0.34960580962478177, + "grad_norm": 0.80859375, + "learning_rate": 2.243240801778237e-05, + "loss": 0.8491, + "step": 7974 + }, + { + "epoch": 0.349649652841439, + "grad_norm": 0.79296875, + "learning_rate": 2.2429502534585168e-05, + "loss": 0.814, + "step": 7975 + }, + { + "epoch": 0.34969349605809624, + "grad_norm": 0.69921875, + "learning_rate": 2.2426597215794188e-05, + "loss": 0.7315, + "step": 7976 + }, + { + "epoch": 0.3497373392747535, + "grad_norm": 0.7578125, + "learning_rate": 2.2423692061415613e-05, + "loss": 0.7279, + "step": 7977 + }, + { + "epoch": 0.3497811824914107, + "grad_norm": 0.75, + "learning_rate": 2.2420787071455586e-05, + "loss": 0.7479, + "step": 7978 + }, + { + "epoch": 0.34982502570806795, + "grad_norm": 0.8515625, + "learning_rate": 2.241788224592023e-05, + "loss": 0.8869, + "step": 7979 + }, + { + "epoch": 0.3498688689247252, + "grad_norm": 0.81640625, + "learning_rate": 2.2414977584815777e-05, + "loss": 0.8032, + "step": 7980 + }, + { + "epoch": 0.3499127121413824, + "grad_norm": 0.78515625, + "learning_rate": 2.2412073088148345e-05, + "loss": 0.7576, + "step": 7981 + }, + { + "epoch": 0.34995655535803966, + "grad_norm": 0.80078125, + "learning_rate": 2.2409168755924082e-05, + "loss": 0.7348, + "step": 7982 + }, + { + "epoch": 0.3500003985746969, + "grad_norm": 0.859375, + "learning_rate": 2.2406264588149152e-05, + "loss": 0.7631, + "step": 7983 + }, + { + "epoch": 0.3500442417913541, + "grad_norm": 0.8046875, + "learning_rate": 2.240336058482968e-05, + "loss": 0.8026, + "step": 7984 + }, + { + "epoch": 0.35008808500801136, + "grad_norm": 0.7578125, + "learning_rate": 2.2400456745971887e-05, + "loss": 0.8662, + "step": 7985 + }, + { + "epoch": 0.3501319282246686, + "grad_norm": 0.734375, + "learning_rate": 2.239755307158189e-05, + "loss": 0.7746, + "step": 7986 + }, + { + "epoch": 0.35017577144132583, + "grad_norm": 0.79296875, + "learning_rate": 2.2394649561665838e-05, + "loss": 0.7772, + "step": 7987 + }, + { + "epoch": 0.35021961465798307, + "grad_norm": 0.9609375, + "learning_rate": 2.2391746216229893e-05, + "loss": 0.8916, + "step": 7988 + }, + { + "epoch": 0.3502634578746403, + "grad_norm": 1.375, + "learning_rate": 2.238884303528017e-05, + "loss": 0.7301, + "step": 7989 + }, + { + "epoch": 0.35030730109129754, + "grad_norm": 0.8046875, + "learning_rate": 2.2385940018822893e-05, + "loss": 0.7338, + "step": 7990 + }, + { + "epoch": 0.3503511443079548, + "grad_norm": 0.79296875, + "learning_rate": 2.2383037166864186e-05, + "loss": 0.8447, + "step": 7991 + }, + { + "epoch": 0.350394987524612, + "grad_norm": 0.82421875, + "learning_rate": 2.2380134479410188e-05, + "loss": 0.8957, + "step": 7992 + }, + { + "epoch": 0.35043883074126925, + "grad_norm": 0.80859375, + "learning_rate": 2.2377231956467047e-05, + "loss": 0.8061, + "step": 7993 + }, + { + "epoch": 0.3504826739579265, + "grad_norm": 0.83984375, + "learning_rate": 2.237432959804092e-05, + "loss": 0.7525, + "step": 7994 + }, + { + "epoch": 0.35052651717458366, + "grad_norm": 0.7578125, + "learning_rate": 2.237142740413798e-05, + "loss": 0.7709, + "step": 7995 + }, + { + "epoch": 0.3505703603912409, + "grad_norm": 0.8359375, + "learning_rate": 2.236852537476437e-05, + "loss": 0.8788, + "step": 7996 + }, + { + "epoch": 0.35061420360789813, + "grad_norm": 0.88671875, + "learning_rate": 2.2365623509926226e-05, + "loss": 0.9288, + "step": 7997 + }, + { + "epoch": 0.35065804682455537, + "grad_norm": 0.8203125, + "learning_rate": 2.2362721809629706e-05, + "loss": 0.8058, + "step": 7998 + }, + { + "epoch": 0.3507018900412126, + "grad_norm": 1.015625, + "learning_rate": 2.235982027388096e-05, + "loss": 0.7849, + "step": 7999 + }, + { + "epoch": 0.35074573325786984, + "grad_norm": 0.80859375, + "learning_rate": 2.235691890268614e-05, + "loss": 0.755, + "step": 8000 + }, + { + "epoch": 0.35074573325786984, + "eval_loss": 0.7976738214492798, + "eval_runtime": 297.5878, + "eval_samples_per_second": 33.604, + "eval_steps_per_second": 0.702, + "step": 8000 + }, + { + "epoch": 0.3507895764745271, + "grad_norm": 0.81640625, + "learning_rate": 2.2354017696051387e-05, + "loss": 0.9594, + "step": 8001 + }, + { + "epoch": 0.3508334196911843, + "grad_norm": 0.77734375, + "learning_rate": 2.235111665398284e-05, + "loss": 0.7403, + "step": 8002 + }, + { + "epoch": 0.35087726290784155, + "grad_norm": 0.890625, + "learning_rate": 2.234821577648668e-05, + "loss": 0.7568, + "step": 8003 + }, + { + "epoch": 0.3509211061244988, + "grad_norm": 0.8671875, + "learning_rate": 2.234531506356904e-05, + "loss": 0.9454, + "step": 8004 + }, + { + "epoch": 0.350964949341156, + "grad_norm": 0.93359375, + "learning_rate": 2.2342414515236077e-05, + "loss": 0.8088, + "step": 8005 + }, + { + "epoch": 0.35100879255781325, + "grad_norm": 0.95703125, + "learning_rate": 2.233951413149391e-05, + "loss": 0.9025, + "step": 8006 + }, + { + "epoch": 0.3510526357744705, + "grad_norm": 0.8828125, + "learning_rate": 2.2336613912348713e-05, + "loss": 0.8719, + "step": 8007 + }, + { + "epoch": 0.3510964789911277, + "grad_norm": 0.80859375, + "learning_rate": 2.2333713857806625e-05, + "loss": 0.7431, + "step": 8008 + }, + { + "epoch": 0.35114032220778496, + "grad_norm": 0.87109375, + "learning_rate": 2.2330813967873766e-05, + "loss": 0.7972, + "step": 8009 + }, + { + "epoch": 0.3511841654244422, + "grad_norm": 0.78515625, + "learning_rate": 2.2327914242556326e-05, + "loss": 0.7359, + "step": 8010 + }, + { + "epoch": 0.35122800864109943, + "grad_norm": 0.91796875, + "learning_rate": 2.2325014681860434e-05, + "loss": 1.0034, + "step": 8011 + }, + { + "epoch": 0.35127185185775667, + "grad_norm": 0.84375, + "learning_rate": 2.2322115285792246e-05, + "loss": 0.815, + "step": 8012 + }, + { + "epoch": 0.3513156950744139, + "grad_norm": 0.80859375, + "learning_rate": 2.2319216054357884e-05, + "loss": 0.7132, + "step": 8013 + }, + { + "epoch": 0.35135953829107114, + "grad_norm": 0.77734375, + "learning_rate": 2.2316316987563478e-05, + "loss": 0.6676, + "step": 8014 + }, + { + "epoch": 0.3514033815077284, + "grad_norm": 0.83203125, + "learning_rate": 2.2313418085415217e-05, + "loss": 0.8928, + "step": 8015 + }, + { + "epoch": 0.3514472247243856, + "grad_norm": 0.84765625, + "learning_rate": 2.2310519347919235e-05, + "loss": 0.838, + "step": 8016 + }, + { + "epoch": 0.35149106794104285, + "grad_norm": 0.90625, + "learning_rate": 2.2307620775081674e-05, + "loss": 0.8757, + "step": 8017 + }, + { + "epoch": 0.3515349111577001, + "grad_norm": 1.2265625, + "learning_rate": 2.230472236690866e-05, + "loss": 0.84, + "step": 8018 + }, + { + "epoch": 0.3515787543743573, + "grad_norm": 0.85546875, + "learning_rate": 2.2301824123406323e-05, + "loss": 0.7598, + "step": 8019 + }, + { + "epoch": 0.35162259759101455, + "grad_norm": 0.84375, + "learning_rate": 2.2298926044580847e-05, + "loss": 0.9107, + "step": 8020 + }, + { + "epoch": 0.3516664408076718, + "grad_norm": 0.828125, + "learning_rate": 2.2296028130438362e-05, + "loss": 0.8615, + "step": 8021 + }, + { + "epoch": 0.351710284024329, + "grad_norm": 0.84375, + "learning_rate": 2.2293130380985005e-05, + "loss": 0.839, + "step": 8022 + }, + { + "epoch": 0.35175412724098626, + "grad_norm": 1.0, + "learning_rate": 2.2290232796226918e-05, + "loss": 0.8134, + "step": 8023 + }, + { + "epoch": 0.3517979704576435, + "grad_norm": 0.78125, + "learning_rate": 2.2287335376170217e-05, + "loss": 0.7047, + "step": 8024 + }, + { + "epoch": 0.3518418136743007, + "grad_norm": 0.89453125, + "learning_rate": 2.228443812082108e-05, + "loss": 0.8359, + "step": 8025 + }, + { + "epoch": 0.3518856568909579, + "grad_norm": 0.7890625, + "learning_rate": 2.2281541030185648e-05, + "loss": 0.8454, + "step": 8026 + }, + { + "epoch": 0.35192950010761515, + "grad_norm": 0.88671875, + "learning_rate": 2.227864410427004e-05, + "loss": 0.8509, + "step": 8027 + }, + { + "epoch": 0.3519733433242724, + "grad_norm": 0.8125, + "learning_rate": 2.2275747343080412e-05, + "loss": 0.7051, + "step": 8028 + }, + { + "epoch": 0.3520171865409296, + "grad_norm": 0.81640625, + "learning_rate": 2.2272850746622864e-05, + "loss": 0.8972, + "step": 8029 + }, + { + "epoch": 0.35206102975758685, + "grad_norm": 0.78515625, + "learning_rate": 2.2269954314903586e-05, + "loss": 0.6864, + "step": 8030 + }, + { + "epoch": 0.3521048729742441, + "grad_norm": 0.828125, + "learning_rate": 2.2267058047928703e-05, + "loss": 0.9034, + "step": 8031 + }, + { + "epoch": 0.3521487161909013, + "grad_norm": 0.67578125, + "learning_rate": 2.2264161945704353e-05, + "loss": 0.6749, + "step": 8032 + }, + { + "epoch": 0.35219255940755856, + "grad_norm": 0.8203125, + "learning_rate": 2.226126600823666e-05, + "loss": 0.7833, + "step": 8033 + }, + { + "epoch": 0.3522364026242158, + "grad_norm": 0.796875, + "learning_rate": 2.2258370235531746e-05, + "loss": 0.7974, + "step": 8034 + }, + { + "epoch": 0.35228024584087303, + "grad_norm": 0.7890625, + "learning_rate": 2.2255474627595797e-05, + "loss": 0.8402, + "step": 8035 + }, + { + "epoch": 0.35232408905753027, + "grad_norm": 0.80078125, + "learning_rate": 2.225257918443493e-05, + "loss": 0.8795, + "step": 8036 + }, + { + "epoch": 0.3523679322741875, + "grad_norm": 0.83203125, + "learning_rate": 2.2249683906055285e-05, + "loss": 0.7929, + "step": 8037 + }, + { + "epoch": 0.35241177549084474, + "grad_norm": 0.859375, + "learning_rate": 2.224678879246298e-05, + "loss": 0.9168, + "step": 8038 + }, + { + "epoch": 0.35245561870750197, + "grad_norm": 0.9140625, + "learning_rate": 2.224389384366413e-05, + "loss": 0.7382, + "step": 8039 + }, + { + "epoch": 0.3524994619241592, + "grad_norm": 0.8359375, + "learning_rate": 2.2240999059664947e-05, + "loss": 0.8465, + "step": 8040 + }, + { + "epoch": 0.35254330514081644, + "grad_norm": 0.8515625, + "learning_rate": 2.223810444047151e-05, + "loss": 0.7914, + "step": 8041 + }, + { + "epoch": 0.3525871483574737, + "grad_norm": 0.828125, + "learning_rate": 2.2235209986089968e-05, + "loss": 0.7786, + "step": 8042 + }, + { + "epoch": 0.3526309915741309, + "grad_norm": 0.80078125, + "learning_rate": 2.223231569652645e-05, + "loss": 0.7121, + "step": 8043 + }, + { + "epoch": 0.35267483479078815, + "grad_norm": 0.82421875, + "learning_rate": 2.2229421571787103e-05, + "loss": 0.783, + "step": 8044 + }, + { + "epoch": 0.3527186780074454, + "grad_norm": 0.87109375, + "learning_rate": 2.2226527611878046e-05, + "loss": 0.9461, + "step": 8045 + }, + { + "epoch": 0.3527625212241026, + "grad_norm": 0.83203125, + "learning_rate": 2.2223633816805413e-05, + "loss": 0.7784, + "step": 8046 + }, + { + "epoch": 0.35280636444075986, + "grad_norm": 0.90625, + "learning_rate": 2.222074018657535e-05, + "loss": 0.7675, + "step": 8047 + }, + { + "epoch": 0.3528502076574171, + "grad_norm": 0.921875, + "learning_rate": 2.221784672119398e-05, + "loss": 0.905, + "step": 8048 + }, + { + "epoch": 0.3528940508740743, + "grad_norm": 0.84375, + "learning_rate": 2.221495342066742e-05, + "loss": 0.854, + "step": 8049 + }, + { + "epoch": 0.35293789409073156, + "grad_norm": 0.8125, + "learning_rate": 2.2212060285001834e-05, + "loss": 0.7096, + "step": 8050 + }, + { + "epoch": 0.3529817373073888, + "grad_norm": 0.828125, + "learning_rate": 2.220916731420335e-05, + "loss": 0.8429, + "step": 8051 + }, + { + "epoch": 0.35302558052404603, + "grad_norm": 0.80078125, + "learning_rate": 2.220627450827808e-05, + "loss": 0.7369, + "step": 8052 + }, + { + "epoch": 0.35306942374070327, + "grad_norm": 0.90625, + "learning_rate": 2.2203381867232175e-05, + "loss": 0.8527, + "step": 8053 + }, + { + "epoch": 0.3531132669573605, + "grad_norm": 0.765625, + "learning_rate": 2.2200489391071722e-05, + "loss": 0.7293, + "step": 8054 + }, + { + "epoch": 0.3531571101740177, + "grad_norm": 0.78125, + "learning_rate": 2.2197597079802912e-05, + "loss": 0.8262, + "step": 8055 + }, + { + "epoch": 0.3532009533906749, + "grad_norm": 0.85546875, + "learning_rate": 2.2194704933431853e-05, + "loss": 0.8426, + "step": 8056 + }, + { + "epoch": 0.35324479660733216, + "grad_norm": 0.828125, + "learning_rate": 2.2191812951964662e-05, + "loss": 0.7325, + "step": 8057 + }, + { + "epoch": 0.3532886398239894, + "grad_norm": 0.86328125, + "learning_rate": 2.218892113540748e-05, + "loss": 0.8311, + "step": 8058 + }, + { + "epoch": 0.3533324830406466, + "grad_norm": 0.79296875, + "learning_rate": 2.21860294837664e-05, + "loss": 0.8786, + "step": 8059 + }, + { + "epoch": 0.35337632625730386, + "grad_norm": 0.89453125, + "learning_rate": 2.218313799704762e-05, + "loss": 0.9417, + "step": 8060 + }, + { + "epoch": 0.3534201694739611, + "grad_norm": 0.77734375, + "learning_rate": 2.218024667525722e-05, + "loss": 0.7174, + "step": 8061 + }, + { + "epoch": 0.35346401269061833, + "grad_norm": 0.83984375, + "learning_rate": 2.2177355518401354e-05, + "loss": 0.8286, + "step": 8062 + }, + { + "epoch": 0.35350785590727557, + "grad_norm": 0.83984375, + "learning_rate": 2.2174464526486126e-05, + "loss": 0.8649, + "step": 8063 + }, + { + "epoch": 0.3535516991239328, + "grad_norm": 0.75390625, + "learning_rate": 2.2171573699517646e-05, + "loss": 0.88, + "step": 8064 + }, + { + "epoch": 0.35359554234059004, + "grad_norm": 0.85546875, + "learning_rate": 2.2168683037502093e-05, + "loss": 1.024, + "step": 8065 + }, + { + "epoch": 0.3536393855572473, + "grad_norm": 0.96484375, + "learning_rate": 2.2165792540445562e-05, + "loss": 0.7305, + "step": 8066 + }, + { + "epoch": 0.3536832287739045, + "grad_norm": 0.83984375, + "learning_rate": 2.2162902208354197e-05, + "loss": 0.9557, + "step": 8067 + }, + { + "epoch": 0.35372707199056175, + "grad_norm": 0.82421875, + "learning_rate": 2.2160012041234103e-05, + "loss": 0.8295, + "step": 8068 + }, + { + "epoch": 0.353770915207219, + "grad_norm": 0.83203125, + "learning_rate": 2.2157122039091392e-05, + "loss": 0.7492, + "step": 8069 + }, + { + "epoch": 0.3538147584238762, + "grad_norm": 0.78515625, + "learning_rate": 2.2154232201932234e-05, + "loss": 0.8003, + "step": 8070 + }, + { + "epoch": 0.35385860164053345, + "grad_norm": 0.87890625, + "learning_rate": 2.215134252976274e-05, + "loss": 0.7649, + "step": 8071 + }, + { + "epoch": 0.3539024448571907, + "grad_norm": 0.83203125, + "learning_rate": 2.214845302258902e-05, + "loss": 0.7501, + "step": 8072 + }, + { + "epoch": 0.3539462880738479, + "grad_norm": 0.87109375, + "learning_rate": 2.21455636804172e-05, + "loss": 0.9377, + "step": 8073 + }, + { + "epoch": 0.35399013129050516, + "grad_norm": 0.8671875, + "learning_rate": 2.214267450325339e-05, + "loss": 0.7941, + "step": 8074 + }, + { + "epoch": 0.3540339745071624, + "grad_norm": 0.8359375, + "learning_rate": 2.213978549110375e-05, + "loss": 0.7254, + "step": 8075 + }, + { + "epoch": 0.35407781772381963, + "grad_norm": 0.8203125, + "learning_rate": 2.2136896643974393e-05, + "loss": 0.8137, + "step": 8076 + }, + { + "epoch": 0.35412166094047687, + "grad_norm": 0.85546875, + "learning_rate": 2.213400796187143e-05, + "loss": 0.811, + "step": 8077 + }, + { + "epoch": 0.3541655041571341, + "grad_norm": 0.80078125, + "learning_rate": 2.2131119444800986e-05, + "loss": 0.7082, + "step": 8078 + }, + { + "epoch": 0.35420934737379134, + "grad_norm": 0.82421875, + "learning_rate": 2.2128231092769146e-05, + "loss": 0.8363, + "step": 8079 + }, + { + "epoch": 0.3542531905904486, + "grad_norm": 0.84765625, + "learning_rate": 2.212534290578211e-05, + "loss": 0.8958, + "step": 8080 + }, + { + "epoch": 0.3542970338071058, + "grad_norm": 0.78125, + "learning_rate": 2.2122454883845956e-05, + "loss": 0.8833, + "step": 8081 + }, + { + "epoch": 0.35434087702376305, + "grad_norm": 0.79296875, + "learning_rate": 2.2119567026966802e-05, + "loss": 0.81, + "step": 8082 + }, + { + "epoch": 0.3543847202404203, + "grad_norm": 0.86328125, + "learning_rate": 2.211667933515078e-05, + "loss": 0.9369, + "step": 8083 + }, + { + "epoch": 0.3544285634570775, + "grad_norm": 0.7890625, + "learning_rate": 2.2113791808404007e-05, + "loss": 0.9641, + "step": 8084 + }, + { + "epoch": 0.35447240667373475, + "grad_norm": 0.83203125, + "learning_rate": 2.2110904446732596e-05, + "loss": 0.8046, + "step": 8085 + }, + { + "epoch": 0.35451624989039193, + "grad_norm": 0.875, + "learning_rate": 2.2108017250142665e-05, + "loss": 0.8155, + "step": 8086 + }, + { + "epoch": 0.35456009310704917, + "grad_norm": 0.78515625, + "learning_rate": 2.2105130218640313e-05, + "loss": 0.7676, + "step": 8087 + }, + { + "epoch": 0.3546039363237064, + "grad_norm": 0.8671875, + "learning_rate": 2.2102243352231723e-05, + "loss": 0.8593, + "step": 8088 + }, + { + "epoch": 0.35464777954036364, + "grad_norm": 0.8046875, + "learning_rate": 2.2099356650922977e-05, + "loss": 0.8366, + "step": 8089 + }, + { + "epoch": 0.3546916227570209, + "grad_norm": 0.81640625, + "learning_rate": 2.209647011472018e-05, + "loss": 0.6677, + "step": 8090 + }, + { + "epoch": 0.3547354659736781, + "grad_norm": 0.7421875, + "learning_rate": 2.2093583743629464e-05, + "loss": 0.671, + "step": 8091 + }, + { + "epoch": 0.35477930919033535, + "grad_norm": 0.80859375, + "learning_rate": 2.2090697537656945e-05, + "loss": 0.8038, + "step": 8092 + }, + { + "epoch": 0.3548231524069926, + "grad_norm": 0.8359375, + "learning_rate": 2.2087811496808753e-05, + "loss": 0.8671, + "step": 8093 + }, + { + "epoch": 0.3548669956236498, + "grad_norm": 0.76953125, + "learning_rate": 2.2084925621090945e-05, + "loss": 0.8603, + "step": 8094 + }, + { + "epoch": 0.35491083884030705, + "grad_norm": 0.88671875, + "learning_rate": 2.208203991050972e-05, + "loss": 0.9937, + "step": 8095 + }, + { + "epoch": 0.3549546820569643, + "grad_norm": 0.7578125, + "learning_rate": 2.2079154365071164e-05, + "loss": 0.7381, + "step": 8096 + }, + { + "epoch": 0.3549985252736215, + "grad_norm": 0.83203125, + "learning_rate": 2.207626898478138e-05, + "loss": 0.7674, + "step": 8097 + }, + { + "epoch": 0.35504236849027876, + "grad_norm": 0.8046875, + "learning_rate": 2.2073383769646484e-05, + "loss": 0.7071, + "step": 8098 + }, + { + "epoch": 0.355086211706936, + "grad_norm": 0.8359375, + "learning_rate": 2.2070498719672605e-05, + "loss": 0.8698, + "step": 8099 + }, + { + "epoch": 0.35513005492359323, + "grad_norm": 0.6875, + "learning_rate": 2.2067613834865818e-05, + "loss": 0.7706, + "step": 8100 + }, + { + "epoch": 0.35517389814025047, + "grad_norm": 0.75390625, + "learning_rate": 2.2064729115232295e-05, + "loss": 0.8054, + "step": 8101 + }, + { + "epoch": 0.3552177413569077, + "grad_norm": 0.8828125, + "learning_rate": 2.2061844560778132e-05, + "loss": 0.7896, + "step": 8102 + }, + { + "epoch": 0.35526158457356494, + "grad_norm": 0.765625, + "learning_rate": 2.2058960171509434e-05, + "loss": 0.7563, + "step": 8103 + }, + { + "epoch": 0.3553054277902222, + "grad_norm": 0.69140625, + "learning_rate": 2.205607594743231e-05, + "loss": 0.6991, + "step": 8104 + }, + { + "epoch": 0.3553492710068794, + "grad_norm": 0.78515625, + "learning_rate": 2.205319188855285e-05, + "loss": 0.8495, + "step": 8105 + }, + { + "epoch": 0.35539311422353664, + "grad_norm": 0.87890625, + "learning_rate": 2.2050307994877216e-05, + "loss": 0.7669, + "step": 8106 + }, + { + "epoch": 0.3554369574401939, + "grad_norm": 0.91015625, + "learning_rate": 2.204742426641151e-05, + "loss": 0.7354, + "step": 8107 + }, + { + "epoch": 0.3554808006568511, + "grad_norm": 0.84765625, + "learning_rate": 2.204454070316183e-05, + "loss": 0.8179, + "step": 8108 + }, + { + "epoch": 0.35552464387350835, + "grad_norm": 0.859375, + "learning_rate": 2.204165730513429e-05, + "loss": 0.7698, + "step": 8109 + }, + { + "epoch": 0.3555684870901656, + "grad_norm": 0.83984375, + "learning_rate": 2.203877407233498e-05, + "loss": 0.8526, + "step": 8110 + }, + { + "epoch": 0.3556123303068228, + "grad_norm": 0.83984375, + "learning_rate": 2.2035891004770048e-05, + "loss": 0.6953, + "step": 8111 + }, + { + "epoch": 0.35565617352348006, + "grad_norm": 0.9765625, + "learning_rate": 2.2033008102445595e-05, + "loss": 0.8496, + "step": 8112 + }, + { + "epoch": 0.3557000167401373, + "grad_norm": 0.7890625, + "learning_rate": 2.2030125365367726e-05, + "loss": 0.6929, + "step": 8113 + }, + { + "epoch": 0.35574385995679453, + "grad_norm": 0.80859375, + "learning_rate": 2.2027242793542545e-05, + "loss": 0.7986, + "step": 8114 + }, + { + "epoch": 0.35578770317345176, + "grad_norm": 1.0, + "learning_rate": 2.202436038697614e-05, + "loss": 0.8716, + "step": 8115 + }, + { + "epoch": 0.35583154639010894, + "grad_norm": 0.89453125, + "learning_rate": 2.202147814567468e-05, + "loss": 0.9401, + "step": 8116 + }, + { + "epoch": 0.3558753896067662, + "grad_norm": 0.8359375, + "learning_rate": 2.2018596069644237e-05, + "loss": 0.874, + "step": 8117 + }, + { + "epoch": 0.3559192328234234, + "grad_norm": 0.81640625, + "learning_rate": 2.201571415889092e-05, + "loss": 0.8035, + "step": 8118 + }, + { + "epoch": 0.35596307604008065, + "grad_norm": 0.8828125, + "learning_rate": 2.2012832413420835e-05, + "loss": 0.8038, + "step": 8119 + }, + { + "epoch": 0.3560069192567379, + "grad_norm": 0.71875, + "learning_rate": 2.2009950833240077e-05, + "loss": 0.7582, + "step": 8120 + }, + { + "epoch": 0.3560507624733951, + "grad_norm": 0.8046875, + "learning_rate": 2.2007069418354787e-05, + "loss": 0.7829, + "step": 8121 + }, + { + "epoch": 0.35609460569005236, + "grad_norm": 0.77734375, + "learning_rate": 2.2004188168771066e-05, + "loss": 0.7965, + "step": 8122 + }, + { + "epoch": 0.3561384489067096, + "grad_norm": 0.81640625, + "learning_rate": 2.2001307084495004e-05, + "loss": 0.7851, + "step": 8123 + }, + { + "epoch": 0.35618229212336683, + "grad_norm": 0.828125, + "learning_rate": 2.1998426165532715e-05, + "loss": 0.8289, + "step": 8124 + }, + { + "epoch": 0.35622613534002406, + "grad_norm": 0.859375, + "learning_rate": 2.199554541189027e-05, + "loss": 0.7955, + "step": 8125 + }, + { + "epoch": 0.3562699785566813, + "grad_norm": 0.921875, + "learning_rate": 2.199266482357385e-05, + "loss": 0.9781, + "step": 8126 + }, + { + "epoch": 0.35631382177333853, + "grad_norm": 0.9140625, + "learning_rate": 2.1989784400589508e-05, + "loss": 0.789, + "step": 8127 + }, + { + "epoch": 0.35635766498999577, + "grad_norm": 0.81640625, + "learning_rate": 2.1986904142943365e-05, + "loss": 0.7383, + "step": 8128 + }, + { + "epoch": 0.356401508206653, + "grad_norm": 0.96875, + "learning_rate": 2.1984024050641515e-05, + "loss": 0.7031, + "step": 8129 + }, + { + "epoch": 0.35644535142331024, + "grad_norm": 0.86328125, + "learning_rate": 2.198114412369007e-05, + "loss": 0.9882, + "step": 8130 + }, + { + "epoch": 0.3564891946399675, + "grad_norm": 0.859375, + "learning_rate": 2.1978264362095124e-05, + "loss": 0.8801, + "step": 8131 + }, + { + "epoch": 0.3565330378566247, + "grad_norm": 0.80859375, + "learning_rate": 2.1975384765862796e-05, + "loss": 0.7312, + "step": 8132 + }, + { + "epoch": 0.35657688107328195, + "grad_norm": 1.109375, + "learning_rate": 2.1972505334999173e-05, + "loss": 0.9071, + "step": 8133 + }, + { + "epoch": 0.3566207242899392, + "grad_norm": 0.75390625, + "learning_rate": 2.196962606951036e-05, + "loss": 0.7623, + "step": 8134 + }, + { + "epoch": 0.3566645675065964, + "grad_norm": 0.890625, + "learning_rate": 2.1966746969402452e-05, + "loss": 0.7726, + "step": 8135 + }, + { + "epoch": 0.35670841072325365, + "grad_norm": 0.80078125, + "learning_rate": 2.196386803468158e-05, + "loss": 0.8299, + "step": 8136 + }, + { + "epoch": 0.3567522539399109, + "grad_norm": 0.8359375, + "learning_rate": 2.196098926535384e-05, + "loss": 0.8259, + "step": 8137 + }, + { + "epoch": 0.3567960971565681, + "grad_norm": 0.890625, + "learning_rate": 2.1958110661425314e-05, + "loss": 0.87, + "step": 8138 + }, + { + "epoch": 0.35683994037322536, + "grad_norm": 0.80078125, + "learning_rate": 2.195523222290211e-05, + "loss": 0.7961, + "step": 8139 + }, + { + "epoch": 0.3568837835898826, + "grad_norm": 0.7890625, + "learning_rate": 2.1952353949790304e-05, + "loss": 0.7894, + "step": 8140 + }, + { + "epoch": 0.35692762680653983, + "grad_norm": 0.7578125, + "learning_rate": 2.1949475842096055e-05, + "loss": 0.8259, + "step": 8141 + }, + { + "epoch": 0.35697147002319707, + "grad_norm": 0.890625, + "learning_rate": 2.1946597899825427e-05, + "loss": 0.8141, + "step": 8142 + }, + { + "epoch": 0.3570153132398543, + "grad_norm": 0.7890625, + "learning_rate": 2.1943720122984525e-05, + "loss": 0.7754, + "step": 8143 + }, + { + "epoch": 0.35705915645651154, + "grad_norm": 0.8125, + "learning_rate": 2.194084251157945e-05, + "loss": 0.7117, + "step": 8144 + }, + { + "epoch": 0.3571029996731688, + "grad_norm": 0.84765625, + "learning_rate": 2.193796506561627e-05, + "loss": 0.8537, + "step": 8145 + }, + { + "epoch": 0.35714684288982596, + "grad_norm": 0.84375, + "learning_rate": 2.1935087785101138e-05, + "loss": 0.7926, + "step": 8146 + }, + { + "epoch": 0.3571906861064832, + "grad_norm": 0.78125, + "learning_rate": 2.193221067004012e-05, + "loss": 0.7788, + "step": 8147 + }, + { + "epoch": 0.3572345293231404, + "grad_norm": 0.80859375, + "learning_rate": 2.1929333720439327e-05, + "loss": 0.7817, + "step": 8148 + }, + { + "epoch": 0.35727837253979766, + "grad_norm": 0.859375, + "learning_rate": 2.192645693630484e-05, + "loss": 0.9024, + "step": 8149 + }, + { + "epoch": 0.3573222157564549, + "grad_norm": 0.7890625, + "learning_rate": 2.1923580317642746e-05, + "loss": 0.8123, + "step": 8150 + }, + { + "epoch": 0.35736605897311213, + "grad_norm": 0.82421875, + "learning_rate": 2.192070386445918e-05, + "loss": 0.7984, + "step": 8151 + }, + { + "epoch": 0.35740990218976937, + "grad_norm": 0.80859375, + "learning_rate": 2.191782757676022e-05, + "loss": 0.8435, + "step": 8152 + }, + { + "epoch": 0.3574537454064266, + "grad_norm": 0.83203125, + "learning_rate": 2.191495145455197e-05, + "loss": 0.8643, + "step": 8153 + }, + { + "epoch": 0.35749758862308384, + "grad_norm": 0.734375, + "learning_rate": 2.1912075497840513e-05, + "loss": 0.665, + "step": 8154 + }, + { + "epoch": 0.3575414318397411, + "grad_norm": 0.78515625, + "learning_rate": 2.190919970663192e-05, + "loss": 0.8308, + "step": 8155 + }, + { + "epoch": 0.3575852750563983, + "grad_norm": 0.8671875, + "learning_rate": 2.1906324080932338e-05, + "loss": 0.8147, + "step": 8156 + }, + { + "epoch": 0.35762911827305555, + "grad_norm": 0.7265625, + "learning_rate": 2.1903448620747847e-05, + "loss": 0.7106, + "step": 8157 + }, + { + "epoch": 0.3576729614897128, + "grad_norm": 0.80859375, + "learning_rate": 2.1900573326084528e-05, + "loss": 0.8891, + "step": 8158 + }, + { + "epoch": 0.35771680470637, + "grad_norm": 0.85546875, + "learning_rate": 2.189769819694848e-05, + "loss": 0.7276, + "step": 8159 + }, + { + "epoch": 0.35776064792302725, + "grad_norm": 0.8359375, + "learning_rate": 2.1894823233345775e-05, + "loss": 0.7689, + "step": 8160 + }, + { + "epoch": 0.3578044911396845, + "grad_norm": 0.72265625, + "learning_rate": 2.189194843528255e-05, + "loss": 0.8214, + "step": 8161 + }, + { + "epoch": 0.3578483343563417, + "grad_norm": 0.78125, + "learning_rate": 2.188907380276488e-05, + "loss": 0.7872, + "step": 8162 + }, + { + "epoch": 0.35789217757299896, + "grad_norm": 0.9609375, + "learning_rate": 2.188619933579885e-05, + "loss": 0.6933, + "step": 8163 + }, + { + "epoch": 0.3579360207896562, + "grad_norm": 0.796875, + "learning_rate": 2.1883325034390557e-05, + "loss": 0.7843, + "step": 8164 + }, + { + "epoch": 0.35797986400631343, + "grad_norm": 0.8046875, + "learning_rate": 2.1880450898546057e-05, + "loss": 0.8088, + "step": 8165 + }, + { + "epoch": 0.35802370722297067, + "grad_norm": 0.859375, + "learning_rate": 2.1877576928271514e-05, + "loss": 0.8233, + "step": 8166 + }, + { + "epoch": 0.3580675504396279, + "grad_norm": 0.83984375, + "learning_rate": 2.187470312357297e-05, + "loss": 0.7938, + "step": 8167 + }, + { + "epoch": 0.35811139365628514, + "grad_norm": 0.76953125, + "learning_rate": 2.1871829484456542e-05, + "loss": 0.7236, + "step": 8168 + }, + { + "epoch": 0.3581552368729424, + "grad_norm": 0.8203125, + "learning_rate": 2.18689560109283e-05, + "loss": 0.7072, + "step": 8169 + }, + { + "epoch": 0.3581990800895996, + "grad_norm": 0.8046875, + "learning_rate": 2.186608270299434e-05, + "loss": 0.8348, + "step": 8170 + }, + { + "epoch": 0.35824292330625684, + "grad_norm": 0.8359375, + "learning_rate": 2.1863209560660746e-05, + "loss": 0.8673, + "step": 8171 + }, + { + "epoch": 0.3582867665229141, + "grad_norm": 0.83984375, + "learning_rate": 2.1860336583933617e-05, + "loss": 0.8508, + "step": 8172 + }, + { + "epoch": 0.3583306097395713, + "grad_norm": 0.875, + "learning_rate": 2.1857463772819008e-05, + "loss": 0.9222, + "step": 8173 + }, + { + "epoch": 0.35837445295622855, + "grad_norm": 0.83203125, + "learning_rate": 2.1854591127323077e-05, + "loss": 0.7965, + "step": 8174 + }, + { + "epoch": 0.3584182961728858, + "grad_norm": 0.73046875, + "learning_rate": 2.1851718647451858e-05, + "loss": 0.7921, + "step": 8175 + }, + { + "epoch": 0.358462139389543, + "grad_norm": 0.8125, + "learning_rate": 2.1848846333211458e-05, + "loss": 0.8041, + "step": 8176 + }, + { + "epoch": 0.3585059826062002, + "grad_norm": 0.78515625, + "learning_rate": 2.1845974184607974e-05, + "loss": 0.8561, + "step": 8177 + }, + { + "epoch": 0.35854982582285744, + "grad_norm": 0.91796875, + "learning_rate": 2.184310220164747e-05, + "loss": 0.7982, + "step": 8178 + }, + { + "epoch": 0.3585936690395147, + "grad_norm": 0.83203125, + "learning_rate": 2.1840230384336047e-05, + "loss": 0.865, + "step": 8179 + }, + { + "epoch": 0.3586375122561719, + "grad_norm": 0.87109375, + "learning_rate": 2.183735873267976e-05, + "loss": 0.828, + "step": 8180 + }, + { + "epoch": 0.35868135547282914, + "grad_norm": 0.87890625, + "learning_rate": 2.1834487246684743e-05, + "loss": 0.7918, + "step": 8181 + }, + { + "epoch": 0.3587251986894864, + "grad_norm": 0.82421875, + "learning_rate": 2.1831615926357075e-05, + "loss": 0.8558, + "step": 8182 + }, + { + "epoch": 0.3587690419061436, + "grad_norm": 0.984375, + "learning_rate": 2.1828744771702825e-05, + "loss": 0.8069, + "step": 8183 + }, + { + "epoch": 0.35881288512280085, + "grad_norm": 0.77734375, + "learning_rate": 2.182587378272808e-05, + "loss": 0.7637, + "step": 8184 + }, + { + "epoch": 0.3588567283394581, + "grad_norm": 0.7890625, + "learning_rate": 2.1823002959438898e-05, + "loss": 0.7378, + "step": 8185 + }, + { + "epoch": 0.3589005715561153, + "grad_norm": 0.81640625, + "learning_rate": 2.1820132301841423e-05, + "loss": 0.8468, + "step": 8186 + }, + { + "epoch": 0.35894441477277256, + "grad_norm": 0.94921875, + "learning_rate": 2.1817261809941715e-05, + "loss": 0.9429, + "step": 8187 + }, + { + "epoch": 0.3589882579894298, + "grad_norm": 0.80078125, + "learning_rate": 2.1814391483745843e-05, + "loss": 0.9364, + "step": 8188 + }, + { + "epoch": 0.35903210120608703, + "grad_norm": 0.796875, + "learning_rate": 2.181152132325991e-05, + "loss": 0.8137, + "step": 8189 + }, + { + "epoch": 0.35907594442274426, + "grad_norm": 0.90625, + "learning_rate": 2.1808651328489947e-05, + "loss": 0.8942, + "step": 8190 + }, + { + "epoch": 0.3591197876394015, + "grad_norm": 0.7265625, + "learning_rate": 2.1805781499442113e-05, + "loss": 0.7277, + "step": 8191 + }, + { + "epoch": 0.35916363085605874, + "grad_norm": 0.8125, + "learning_rate": 2.1802911836122463e-05, + "loss": 0.8416, + "step": 8192 + }, + { + "epoch": 0.35920747407271597, + "grad_norm": 0.76171875, + "learning_rate": 2.1800042338537063e-05, + "loss": 0.7833, + "step": 8193 + }, + { + "epoch": 0.3592513172893732, + "grad_norm": 0.86328125, + "learning_rate": 2.1797173006692006e-05, + "loss": 0.8726, + "step": 8194 + }, + { + "epoch": 0.35929516050603044, + "grad_norm": 0.859375, + "learning_rate": 2.1794303840593343e-05, + "loss": 0.7425, + "step": 8195 + }, + { + "epoch": 0.3593390037226877, + "grad_norm": 0.78125, + "learning_rate": 2.1791434840247206e-05, + "loss": 0.6658, + "step": 8196 + }, + { + "epoch": 0.3593828469393449, + "grad_norm": 0.7734375, + "learning_rate": 2.178856600565966e-05, + "loss": 0.7964, + "step": 8197 + }, + { + "epoch": 0.35942669015600215, + "grad_norm": 0.84765625, + "learning_rate": 2.1785697336836776e-05, + "loss": 0.7641, + "step": 8198 + }, + { + "epoch": 0.3594705333726594, + "grad_norm": 0.828125, + "learning_rate": 2.178282883378463e-05, + "loss": 0.8324, + "step": 8199 + }, + { + "epoch": 0.3595143765893166, + "grad_norm": 0.94140625, + "learning_rate": 2.177996049650929e-05, + "loss": 0.8222, + "step": 8200 + }, + { + "epoch": 0.35955821980597386, + "grad_norm": 0.83984375, + "learning_rate": 2.177709232501687e-05, + "loss": 0.9309, + "step": 8201 + }, + { + "epoch": 0.3596020630226311, + "grad_norm": 0.76953125, + "learning_rate": 2.177422431931344e-05, + "loss": 0.7163, + "step": 8202 + }, + { + "epoch": 0.3596459062392883, + "grad_norm": 0.828125, + "learning_rate": 2.177135647940507e-05, + "loss": 0.767, + "step": 8203 + }, + { + "epoch": 0.35968974945594556, + "grad_norm": 0.88671875, + "learning_rate": 2.1768488805297825e-05, + "loss": 0.8725, + "step": 8204 + }, + { + "epoch": 0.3597335926726028, + "grad_norm": 0.92578125, + "learning_rate": 2.1765621296997786e-05, + "loss": 0.926, + "step": 8205 + }, + { + "epoch": 0.35977743588926003, + "grad_norm": 0.75, + "learning_rate": 2.1762753954511063e-05, + "loss": 0.7449, + "step": 8206 + }, + { + "epoch": 0.3598212791059172, + "grad_norm": 0.83203125, + "learning_rate": 2.1759886777843708e-05, + "loss": 0.8686, + "step": 8207 + }, + { + "epoch": 0.35986512232257445, + "grad_norm": 0.8046875, + "learning_rate": 2.175701976700181e-05, + "loss": 0.756, + "step": 8208 + }, + { + "epoch": 0.3599089655392317, + "grad_norm": 0.87109375, + "learning_rate": 2.175415292199142e-05, + "loss": 0.8683, + "step": 8209 + }, + { + "epoch": 0.3599528087558889, + "grad_norm": 0.79296875, + "learning_rate": 2.1751286242818614e-05, + "loss": 0.726, + "step": 8210 + }, + { + "epoch": 0.35999665197254616, + "grad_norm": 0.91015625, + "learning_rate": 2.174841972948951e-05, + "loss": 0.739, + "step": 8211 + }, + { + "epoch": 0.3600404951892034, + "grad_norm": 0.97265625, + "learning_rate": 2.1745553382010164e-05, + "loss": 0.7872, + "step": 8212 + }, + { + "epoch": 0.3600843384058606, + "grad_norm": 0.8046875, + "learning_rate": 2.1742687200386635e-05, + "loss": 0.9071, + "step": 8213 + }, + { + "epoch": 0.36012818162251786, + "grad_norm": 0.7734375, + "learning_rate": 2.173982118462502e-05, + "loss": 0.7348, + "step": 8214 + }, + { + "epoch": 0.3601720248391751, + "grad_norm": 0.83203125, + "learning_rate": 2.1736955334731367e-05, + "loss": 0.6956, + "step": 8215 + }, + { + "epoch": 0.36021586805583233, + "grad_norm": 0.81640625, + "learning_rate": 2.1734089650711777e-05, + "loss": 0.7983, + "step": 8216 + }, + { + "epoch": 0.36025971127248957, + "grad_norm": 0.8125, + "learning_rate": 2.1731224132572303e-05, + "loss": 0.72, + "step": 8217 + }, + { + "epoch": 0.3603035544891468, + "grad_norm": 0.80859375, + "learning_rate": 2.1728358780319025e-05, + "loss": 0.8449, + "step": 8218 + }, + { + "epoch": 0.36034739770580404, + "grad_norm": 0.796875, + "learning_rate": 2.1725493593958013e-05, + "loss": 0.8532, + "step": 8219 + }, + { + "epoch": 0.3603912409224613, + "grad_norm": 2.5625, + "learning_rate": 2.172262857349532e-05, + "loss": 0.7527, + "step": 8220 + }, + { + "epoch": 0.3604350841391185, + "grad_norm": 0.97265625, + "learning_rate": 2.171976371893707e-05, + "loss": 0.8515, + "step": 8221 + }, + { + "epoch": 0.36047892735577575, + "grad_norm": 0.79296875, + "learning_rate": 2.1716899030289305e-05, + "loss": 0.8589, + "step": 8222 + }, + { + "epoch": 0.360522770572433, + "grad_norm": 0.81640625, + "learning_rate": 2.1714034507558102e-05, + "loss": 0.9111, + "step": 8223 + }, + { + "epoch": 0.3605666137890902, + "grad_norm": 0.77734375, + "learning_rate": 2.1711170150749528e-05, + "loss": 0.77, + "step": 8224 + }, + { + "epoch": 0.36061045700574745, + "grad_norm": 0.76953125, + "learning_rate": 2.1708305959869625e-05, + "loss": 0.577, + "step": 8225 + }, + { + "epoch": 0.3606543002224047, + "grad_norm": 0.8828125, + "learning_rate": 2.170544193492452e-05, + "loss": 0.83, + "step": 8226 + }, + { + "epoch": 0.3606981434390619, + "grad_norm": 0.80859375, + "learning_rate": 2.1702578075920255e-05, + "loss": 0.8028, + "step": 8227 + }, + { + "epoch": 0.36074198665571916, + "grad_norm": 0.85546875, + "learning_rate": 2.16997143828629e-05, + "loss": 0.8465, + "step": 8228 + }, + { + "epoch": 0.3607858298723764, + "grad_norm": 0.77734375, + "learning_rate": 2.169685085575852e-05, + "loss": 0.7569, + "step": 8229 + }, + { + "epoch": 0.36082967308903363, + "grad_norm": 0.79296875, + "learning_rate": 2.1693987494613177e-05, + "loss": 0.728, + "step": 8230 + }, + { + "epoch": 0.36087351630569087, + "grad_norm": 1.8515625, + "learning_rate": 2.1691124299432962e-05, + "loss": 0.7514, + "step": 8231 + }, + { + "epoch": 0.3609173595223481, + "grad_norm": 0.89453125, + "learning_rate": 2.168826127022394e-05, + "loss": 0.836, + "step": 8232 + }, + { + "epoch": 0.36096120273900534, + "grad_norm": 0.76171875, + "learning_rate": 2.1685398406992174e-05, + "loss": 0.8079, + "step": 8233 + }, + { + "epoch": 0.3610050459556626, + "grad_norm": 0.859375, + "learning_rate": 2.1682535709743733e-05, + "loss": 0.8831, + "step": 8234 + }, + { + "epoch": 0.3610488891723198, + "grad_norm": 0.828125, + "learning_rate": 2.167967317848465e-05, + "loss": 0.7435, + "step": 8235 + }, + { + "epoch": 0.36109273238897704, + "grad_norm": 0.84375, + "learning_rate": 2.167681081322105e-05, + "loss": 0.9087, + "step": 8236 + }, + { + "epoch": 0.3611365756056342, + "grad_norm": 1.1015625, + "learning_rate": 2.167394861395897e-05, + "loss": 0.7711, + "step": 8237 + }, + { + "epoch": 0.36118041882229146, + "grad_norm": 0.859375, + "learning_rate": 2.167108658070448e-05, + "loss": 0.7976, + "step": 8238 + }, + { + "epoch": 0.3612242620389487, + "grad_norm": 0.875, + "learning_rate": 2.1668224713463647e-05, + "loss": 0.7853, + "step": 8239 + }, + { + "epoch": 0.36126810525560593, + "grad_norm": 0.8203125, + "learning_rate": 2.1665363012242533e-05, + "loss": 0.794, + "step": 8240 + }, + { + "epoch": 0.36131194847226317, + "grad_norm": 0.73046875, + "learning_rate": 2.166250147704718e-05, + "loss": 0.7541, + "step": 8241 + }, + { + "epoch": 0.3613557916889204, + "grad_norm": 0.765625, + "learning_rate": 2.1659640107883704e-05, + "loss": 0.9023, + "step": 8242 + }, + { + "epoch": 0.36139963490557764, + "grad_norm": 0.74609375, + "learning_rate": 2.1656778904758136e-05, + "loss": 0.7907, + "step": 8243 + }, + { + "epoch": 0.3614434781222349, + "grad_norm": 0.82421875, + "learning_rate": 2.1653917867676556e-05, + "loss": 0.7804, + "step": 8244 + }, + { + "epoch": 0.3614873213388921, + "grad_norm": 0.796875, + "learning_rate": 2.1651056996645013e-05, + "loss": 0.7523, + "step": 8245 + }, + { + "epoch": 0.36153116455554934, + "grad_norm": 0.80078125, + "learning_rate": 2.1648196291669554e-05, + "loss": 0.815, + "step": 8246 + }, + { + "epoch": 0.3615750077722066, + "grad_norm": 0.9453125, + "learning_rate": 2.164533575275629e-05, + "loss": 0.7826, + "step": 8247 + }, + { + "epoch": 0.3616188509888638, + "grad_norm": 0.80078125, + "learning_rate": 2.1642475379911265e-05, + "loss": 0.8298, + "step": 8248 + }, + { + "epoch": 0.36166269420552105, + "grad_norm": 0.99609375, + "learning_rate": 2.1639615173140527e-05, + "loss": 0.8218, + "step": 8249 + }, + { + "epoch": 0.3617065374221783, + "grad_norm": 0.8359375, + "learning_rate": 2.163675513245015e-05, + "loss": 0.7178, + "step": 8250 + }, + { + "epoch": 0.3617503806388355, + "grad_norm": 0.78515625, + "learning_rate": 2.163389525784616e-05, + "loss": 0.7861, + "step": 8251 + }, + { + "epoch": 0.36179422385549276, + "grad_norm": 0.859375, + "learning_rate": 2.1631035549334676e-05, + "loss": 0.7691, + "step": 8252 + }, + { + "epoch": 0.36183806707215, + "grad_norm": 0.83984375, + "learning_rate": 2.162817600692174e-05, + "loss": 0.6457, + "step": 8253 + }, + { + "epoch": 0.36188191028880723, + "grad_norm": 0.9375, + "learning_rate": 2.16253166306134e-05, + "loss": 0.8774, + "step": 8254 + }, + { + "epoch": 0.36192575350546446, + "grad_norm": 0.86328125, + "learning_rate": 2.162245742041572e-05, + "loss": 0.8677, + "step": 8255 + }, + { + "epoch": 0.3619695967221217, + "grad_norm": 0.82421875, + "learning_rate": 2.161959837633476e-05, + "loss": 0.7442, + "step": 8256 + }, + { + "epoch": 0.36201343993877894, + "grad_norm": 0.8671875, + "learning_rate": 2.161673949837659e-05, + "loss": 0.7836, + "step": 8257 + }, + { + "epoch": 0.36205728315543617, + "grad_norm": 0.859375, + "learning_rate": 2.1613880786547224e-05, + "loss": 0.7777, + "step": 8258 + }, + { + "epoch": 0.3621011263720934, + "grad_norm": 0.80078125, + "learning_rate": 2.1611022240852796e-05, + "loss": 0.8689, + "step": 8259 + }, + { + "epoch": 0.36214496958875064, + "grad_norm": 0.88671875, + "learning_rate": 2.1608163861299313e-05, + "loss": 0.8221, + "step": 8260 + }, + { + "epoch": 0.3621888128054079, + "grad_norm": 0.92578125, + "learning_rate": 2.160530564789286e-05, + "loss": 0.8099, + "step": 8261 + }, + { + "epoch": 0.3622326560220651, + "grad_norm": 0.84375, + "learning_rate": 2.1602447600639476e-05, + "loss": 0.8051, + "step": 8262 + }, + { + "epoch": 0.36227649923872235, + "grad_norm": 0.76953125, + "learning_rate": 2.159958971954521e-05, + "loss": 0.7296, + "step": 8263 + }, + { + "epoch": 0.3623203424553796, + "grad_norm": 0.87890625, + "learning_rate": 2.159673200461615e-05, + "loss": 0.9479, + "step": 8264 + }, + { + "epoch": 0.3623641856720368, + "grad_norm": 0.81640625, + "learning_rate": 2.1593874455858332e-05, + "loss": 0.8014, + "step": 8265 + }, + { + "epoch": 0.36240802888869406, + "grad_norm": 0.8515625, + "learning_rate": 2.1591017073277776e-05, + "loss": 0.8684, + "step": 8266 + }, + { + "epoch": 0.3624518721053513, + "grad_norm": 0.90625, + "learning_rate": 2.158815985688062e-05, + "loss": 0.9228, + "step": 8267 + }, + { + "epoch": 0.36249571532200847, + "grad_norm": 0.76171875, + "learning_rate": 2.1585302806672868e-05, + "loss": 0.7731, + "step": 8268 + }, + { + "epoch": 0.3625395585386657, + "grad_norm": 0.85546875, + "learning_rate": 2.1582445922660587e-05, + "loss": 0.7853, + "step": 8269 + }, + { + "epoch": 0.36258340175532294, + "grad_norm": 0.8203125, + "learning_rate": 2.1579589204849825e-05, + "loss": 0.7576, + "step": 8270 + }, + { + "epoch": 0.3626272449719802, + "grad_norm": 0.84375, + "learning_rate": 2.1576732653246622e-05, + "loss": 0.9182, + "step": 8271 + }, + { + "epoch": 0.3626710881886374, + "grad_norm": 0.86328125, + "learning_rate": 2.1573876267857074e-05, + "loss": 0.8272, + "step": 8272 + }, + { + "epoch": 0.36271493140529465, + "grad_norm": 0.85546875, + "learning_rate": 2.157102004868722e-05, + "loss": 0.8467, + "step": 8273 + }, + { + "epoch": 0.3627587746219519, + "grad_norm": 0.86328125, + "learning_rate": 2.1568163995743096e-05, + "loss": 0.8139, + "step": 8274 + }, + { + "epoch": 0.3628026178386091, + "grad_norm": 0.69921875, + "learning_rate": 2.156530810903078e-05, + "loss": 0.6952, + "step": 8275 + }, + { + "epoch": 0.36284646105526636, + "grad_norm": 0.80078125, + "learning_rate": 2.1562452388556264e-05, + "loss": 0.7065, + "step": 8276 + }, + { + "epoch": 0.3628903042719236, + "grad_norm": 0.80078125, + "learning_rate": 2.155959683432569e-05, + "loss": 0.8024, + "step": 8277 + }, + { + "epoch": 0.3629341474885808, + "grad_norm": 0.9140625, + "learning_rate": 2.1556741446345064e-05, + "loss": 0.8703, + "step": 8278 + }, + { + "epoch": 0.36297799070523806, + "grad_norm": 0.7890625, + "learning_rate": 2.1553886224620434e-05, + "loss": 0.8184, + "step": 8279 + }, + { + "epoch": 0.3630218339218953, + "grad_norm": 0.81640625, + "learning_rate": 2.1551031169157866e-05, + "loss": 0.7331, + "step": 8280 + }, + { + "epoch": 0.36306567713855253, + "grad_norm": 0.8828125, + "learning_rate": 2.1548176279963372e-05, + "loss": 0.77, + "step": 8281 + }, + { + "epoch": 0.36310952035520977, + "grad_norm": 0.84765625, + "learning_rate": 2.1545321557043062e-05, + "loss": 0.9374, + "step": 8282 + }, + { + "epoch": 0.363153363571867, + "grad_norm": 0.80078125, + "learning_rate": 2.1542467000402956e-05, + "loss": 0.6928, + "step": 8283 + }, + { + "epoch": 0.36319720678852424, + "grad_norm": 0.76953125, + "learning_rate": 2.153961261004911e-05, + "loss": 0.8588, + "step": 8284 + }, + { + "epoch": 0.3632410500051815, + "grad_norm": 0.81640625, + "learning_rate": 2.1536758385987566e-05, + "loss": 0.7514, + "step": 8285 + }, + { + "epoch": 0.3632848932218387, + "grad_norm": 0.71484375, + "learning_rate": 2.1533904328224354e-05, + "loss": 0.7437, + "step": 8286 + }, + { + "epoch": 0.36332873643849595, + "grad_norm": 0.8203125, + "learning_rate": 2.1531050436765578e-05, + "loss": 0.9944, + "step": 8287 + }, + { + "epoch": 0.3633725796551532, + "grad_norm": 0.953125, + "learning_rate": 2.1528196711617255e-05, + "loss": 0.9103, + "step": 8288 + }, + { + "epoch": 0.3634164228718104, + "grad_norm": 0.74609375, + "learning_rate": 2.1525343152785428e-05, + "loss": 0.7055, + "step": 8289 + }, + { + "epoch": 0.36346026608846765, + "grad_norm": 0.82421875, + "learning_rate": 2.1522489760276156e-05, + "loss": 0.818, + "step": 8290 + }, + { + "epoch": 0.3635041093051249, + "grad_norm": 0.87109375, + "learning_rate": 2.1519636534095454e-05, + "loss": 0.8023, + "step": 8291 + }, + { + "epoch": 0.3635479525217821, + "grad_norm": 0.8515625, + "learning_rate": 2.1516783474249414e-05, + "loss": 0.7626, + "step": 8292 + }, + { + "epoch": 0.36359179573843936, + "grad_norm": 0.7265625, + "learning_rate": 2.151393058074408e-05, + "loss": 0.7137, + "step": 8293 + }, + { + "epoch": 0.3636356389550966, + "grad_norm": 0.84375, + "learning_rate": 2.151107785358547e-05, + "loss": 0.707, + "step": 8294 + }, + { + "epoch": 0.36367948217175383, + "grad_norm": 0.84375, + "learning_rate": 2.150822529277966e-05, + "loss": 0.8237, + "step": 8295 + }, + { + "epoch": 0.36372332538841107, + "grad_norm": 0.83984375, + "learning_rate": 2.1505372898332643e-05, + "loss": 0.7714, + "step": 8296 + }, + { + "epoch": 0.3637671686050683, + "grad_norm": 0.875, + "learning_rate": 2.1502520670250525e-05, + "loss": 0.9127, + "step": 8297 + }, + { + "epoch": 0.3638110118217255, + "grad_norm": 0.828125, + "learning_rate": 2.1499668608539338e-05, + "loss": 0.8512, + "step": 8298 + }, + { + "epoch": 0.3638548550383827, + "grad_norm": 0.8671875, + "learning_rate": 2.1496816713205114e-05, + "loss": 0.9113, + "step": 8299 + }, + { + "epoch": 0.36389869825503995, + "grad_norm": 2.703125, + "learning_rate": 2.14939649842539e-05, + "loss": 0.9049, + "step": 8300 + }, + { + "epoch": 0.3639425414716972, + "grad_norm": 0.77734375, + "learning_rate": 2.1491113421691735e-05, + "loss": 0.7601, + "step": 8301 + }, + { + "epoch": 0.3639863846883544, + "grad_norm": 1.7265625, + "learning_rate": 2.1488262025524675e-05, + "loss": 0.8996, + "step": 8302 + }, + { + "epoch": 0.36403022790501166, + "grad_norm": 0.83203125, + "learning_rate": 2.1485410795758743e-05, + "loss": 0.8811, + "step": 8303 + }, + { + "epoch": 0.3640740711216689, + "grad_norm": 0.81640625, + "learning_rate": 2.148255973240001e-05, + "loss": 0.7555, + "step": 8304 + }, + { + "epoch": 0.36411791433832613, + "grad_norm": 0.86328125, + "learning_rate": 2.1479708835454494e-05, + "loss": 0.93, + "step": 8305 + }, + { + "epoch": 0.36416175755498337, + "grad_norm": 0.8203125, + "learning_rate": 2.147685810492822e-05, + "loss": 0.856, + "step": 8306 + }, + { + "epoch": 0.3642056007716406, + "grad_norm": 0.78125, + "learning_rate": 2.147400754082729e-05, + "loss": 0.8116, + "step": 8307 + }, + { + "epoch": 0.36424944398829784, + "grad_norm": 0.84765625, + "learning_rate": 2.1471157143157717e-05, + "loss": 0.6423, + "step": 8308 + }, + { + "epoch": 0.3642932872049551, + "grad_norm": 0.8515625, + "learning_rate": 2.146830691192553e-05, + "loss": 0.7548, + "step": 8309 + }, + { + "epoch": 0.3643371304216123, + "grad_norm": 0.98046875, + "learning_rate": 2.146545684713678e-05, + "loss": 0.797, + "step": 8310 + }, + { + "epoch": 0.36438097363826955, + "grad_norm": 0.8203125, + "learning_rate": 2.146260694879747e-05, + "loss": 0.8968, + "step": 8311 + }, + { + "epoch": 0.3644248168549268, + "grad_norm": 0.8203125, + "learning_rate": 2.1459757216913713e-05, + "loss": 0.8195, + "step": 8312 + }, + { + "epoch": 0.364468660071584, + "grad_norm": 0.87890625, + "learning_rate": 2.145690765149151e-05, + "loss": 0.8661, + "step": 8313 + }, + { + "epoch": 0.36451250328824125, + "grad_norm": 0.98828125, + "learning_rate": 2.14540582525369e-05, + "loss": 0.8498, + "step": 8314 + }, + { + "epoch": 0.3645563465048985, + "grad_norm": 0.875, + "learning_rate": 2.1451209020055928e-05, + "loss": 0.797, + "step": 8315 + }, + { + "epoch": 0.3646001897215557, + "grad_norm": 0.859375, + "learning_rate": 2.14483599540546e-05, + "loss": 0.7952, + "step": 8316 + }, + { + "epoch": 0.36464403293821296, + "grad_norm": 0.875, + "learning_rate": 2.1445511054539e-05, + "loss": 0.8695, + "step": 8317 + }, + { + "epoch": 0.3646878761548702, + "grad_norm": 0.7734375, + "learning_rate": 2.144266232151516e-05, + "loss": 0.708, + "step": 8318 + }, + { + "epoch": 0.36473171937152743, + "grad_norm": 0.859375, + "learning_rate": 2.1439813754989112e-05, + "loss": 0.8885, + "step": 8319 + }, + { + "epoch": 0.36477556258818467, + "grad_norm": 0.8125, + "learning_rate": 2.1436965354966876e-05, + "loss": 0.7711, + "step": 8320 + }, + { + "epoch": 0.3648194058048419, + "grad_norm": 0.80859375, + "learning_rate": 2.1434117121454487e-05, + "loss": 0.7714, + "step": 8321 + }, + { + "epoch": 0.36486324902149914, + "grad_norm": 1.640625, + "learning_rate": 2.1431269054458013e-05, + "loss": 0.7131, + "step": 8322 + }, + { + "epoch": 0.36490709223815637, + "grad_norm": 1.015625, + "learning_rate": 2.142842115398348e-05, + "loss": 0.7802, + "step": 8323 + }, + { + "epoch": 0.3649509354548136, + "grad_norm": 0.78125, + "learning_rate": 2.1425573420036916e-05, + "loss": 0.8488, + "step": 8324 + }, + { + "epoch": 0.36499477867147084, + "grad_norm": 1.0, + "learning_rate": 2.1422725852624346e-05, + "loss": 0.807, + "step": 8325 + }, + { + "epoch": 0.3650386218881281, + "grad_norm": 0.8203125, + "learning_rate": 2.1419878451751807e-05, + "loss": 0.8034, + "step": 8326 + }, + { + "epoch": 0.3650824651047853, + "grad_norm": 0.828125, + "learning_rate": 2.1417031217425365e-05, + "loss": 0.8638, + "step": 8327 + }, + { + "epoch": 0.3651263083214425, + "grad_norm": 0.8125, + "learning_rate": 2.141418414965103e-05, + "loss": 0.7408, + "step": 8328 + }, + { + "epoch": 0.36517015153809973, + "grad_norm": 0.84765625, + "learning_rate": 2.141133724843485e-05, + "loss": 0.817, + "step": 8329 + }, + { + "epoch": 0.36521399475475697, + "grad_norm": 0.79296875, + "learning_rate": 2.1408490513782843e-05, + "loss": 0.6953, + "step": 8330 + }, + { + "epoch": 0.3652578379714142, + "grad_norm": 0.83984375, + "learning_rate": 2.1405643945701015e-05, + "loss": 0.8861, + "step": 8331 + }, + { + "epoch": 0.36530168118807144, + "grad_norm": 0.8515625, + "learning_rate": 2.1402797544195463e-05, + "loss": 0.8261, + "step": 8332 + }, + { + "epoch": 0.36534552440472867, + "grad_norm": 0.765625, + "learning_rate": 2.139995130927219e-05, + "loss": 0.738, + "step": 8333 + }, + { + "epoch": 0.3653893676213859, + "grad_norm": 0.82421875, + "learning_rate": 2.1397105240937234e-05, + "loss": 0.8332, + "step": 8334 + }, + { + "epoch": 0.36543321083804314, + "grad_norm": 0.83984375, + "learning_rate": 2.139425933919662e-05, + "loss": 0.8165, + "step": 8335 + }, + { + "epoch": 0.3654770540547004, + "grad_norm": 0.76953125, + "learning_rate": 2.1391413604056344e-05, + "loss": 0.7841, + "step": 8336 + }, + { + "epoch": 0.3655208972713576, + "grad_norm": 0.92578125, + "learning_rate": 2.1388568035522505e-05, + "loss": 0.8241, + "step": 8337 + }, + { + "epoch": 0.36556474048801485, + "grad_norm": 0.875, + "learning_rate": 2.1385722633601103e-05, + "loss": 1.0064, + "step": 8338 + }, + { + "epoch": 0.3656085837046721, + "grad_norm": 0.90625, + "learning_rate": 2.1382877398298174e-05, + "loss": 0.8357, + "step": 8339 + }, + { + "epoch": 0.3656524269213293, + "grad_norm": 0.796875, + "learning_rate": 2.1380032329619737e-05, + "loss": 0.7534, + "step": 8340 + }, + { + "epoch": 0.36569627013798656, + "grad_norm": 0.83203125, + "learning_rate": 2.137718742757183e-05, + "loss": 0.8467, + "step": 8341 + }, + { + "epoch": 0.3657401133546438, + "grad_norm": 0.87109375, + "learning_rate": 2.1374342692160475e-05, + "loss": 0.7439, + "step": 8342 + }, + { + "epoch": 0.365783956571301, + "grad_norm": 0.828125, + "learning_rate": 2.137149812339171e-05, + "loss": 0.8068, + "step": 8343 + }, + { + "epoch": 0.36582779978795826, + "grad_norm": 0.875, + "learning_rate": 2.1368653721271535e-05, + "loss": 0.7809, + "step": 8344 + }, + { + "epoch": 0.3658716430046155, + "grad_norm": 0.78515625, + "learning_rate": 2.1365809485806032e-05, + "loss": 0.7948, + "step": 8345 + }, + { + "epoch": 0.36591548622127273, + "grad_norm": 0.8203125, + "learning_rate": 2.13629654170012e-05, + "loss": 0.6917, + "step": 8346 + }, + { + "epoch": 0.36595932943792997, + "grad_norm": 0.96875, + "learning_rate": 2.1360121514863075e-05, + "loss": 0.8114, + "step": 8347 + }, + { + "epoch": 0.3660031726545872, + "grad_norm": 0.79296875, + "learning_rate": 2.1357277779397668e-05, + "loss": 0.7876, + "step": 8348 + }, + { + "epoch": 0.36604701587124444, + "grad_norm": 0.84375, + "learning_rate": 2.135443421061102e-05, + "loss": 0.8024, + "step": 8349 + }, + { + "epoch": 0.3660908590879017, + "grad_norm": 0.80859375, + "learning_rate": 2.1351590808509147e-05, + "loss": 0.7976, + "step": 8350 + }, + { + "epoch": 0.3661347023045589, + "grad_norm": 0.96484375, + "learning_rate": 2.134874757309806e-05, + "loss": 0.7663, + "step": 8351 + }, + { + "epoch": 0.36617854552121615, + "grad_norm": 1.046875, + "learning_rate": 2.134590450438383e-05, + "loss": 0.7327, + "step": 8352 + }, + { + "epoch": 0.3662223887378734, + "grad_norm": 0.71484375, + "learning_rate": 2.134306160237247e-05, + "loss": 0.6338, + "step": 8353 + }, + { + "epoch": 0.3662662319545306, + "grad_norm": 0.8359375, + "learning_rate": 2.1340218867069985e-05, + "loss": 0.8109, + "step": 8354 + }, + { + "epoch": 0.36631007517118785, + "grad_norm": 0.8671875, + "learning_rate": 2.1337376298482414e-05, + "loss": 0.8162, + "step": 8355 + }, + { + "epoch": 0.3663539183878451, + "grad_norm": 0.82421875, + "learning_rate": 2.133453389661575e-05, + "loss": 0.7419, + "step": 8356 + }, + { + "epoch": 0.3663977616045023, + "grad_norm": 0.796875, + "learning_rate": 2.1331691661476072e-05, + "loss": 0.8156, + "step": 8357 + }, + { + "epoch": 0.36644160482115956, + "grad_norm": 0.76953125, + "learning_rate": 2.132884959306938e-05, + "loss": 0.838, + "step": 8358 + }, + { + "epoch": 0.36648544803781674, + "grad_norm": 0.85546875, + "learning_rate": 2.1326007691401693e-05, + "loss": 0.8029, + "step": 8359 + }, + { + "epoch": 0.366529291254474, + "grad_norm": 0.79296875, + "learning_rate": 2.132316595647904e-05, + "loss": 0.8142, + "step": 8360 + }, + { + "epoch": 0.3665731344711312, + "grad_norm": 0.75, + "learning_rate": 2.13203243883074e-05, + "loss": 0.8627, + "step": 8361 + }, + { + "epoch": 0.36661697768778845, + "grad_norm": 0.76171875, + "learning_rate": 2.1317482986892877e-05, + "loss": 0.7026, + "step": 8362 + }, + { + "epoch": 0.3666608209044457, + "grad_norm": 0.81640625, + "learning_rate": 2.131464175224144e-05, + "loss": 0.7777, + "step": 8363 + }, + { + "epoch": 0.3667046641211029, + "grad_norm": 0.89453125, + "learning_rate": 2.131180068435913e-05, + "loss": 1.0198, + "step": 8364 + }, + { + "epoch": 0.36674850733776015, + "grad_norm": 0.859375, + "learning_rate": 2.130895978325196e-05, + "loss": 0.9087, + "step": 8365 + }, + { + "epoch": 0.3667923505544174, + "grad_norm": 0.81640625, + "learning_rate": 2.1306119048925922e-05, + "loss": 0.8324, + "step": 8366 + }, + { + "epoch": 0.3668361937710746, + "grad_norm": 0.8515625, + "learning_rate": 2.130327848138709e-05, + "loss": 0.8766, + "step": 8367 + }, + { + "epoch": 0.36688003698773186, + "grad_norm": 0.68359375, + "learning_rate": 2.130043808064147e-05, + "loss": 0.6343, + "step": 8368 + }, + { + "epoch": 0.3669238802043891, + "grad_norm": 0.8125, + "learning_rate": 2.1297597846695072e-05, + "loss": 0.8237, + "step": 8369 + }, + { + "epoch": 0.36696772342104633, + "grad_norm": 0.83203125, + "learning_rate": 2.129475777955391e-05, + "loss": 0.8376, + "step": 8370 + }, + { + "epoch": 0.36701156663770357, + "grad_norm": 0.8515625, + "learning_rate": 2.129191787922399e-05, + "loss": 0.9168, + "step": 8371 + }, + { + "epoch": 0.3670554098543608, + "grad_norm": 0.7890625, + "learning_rate": 2.128907814571137e-05, + "loss": 0.74, + "step": 8372 + }, + { + "epoch": 0.36709925307101804, + "grad_norm": 0.84375, + "learning_rate": 2.1286238579022067e-05, + "loss": 0.794, + "step": 8373 + }, + { + "epoch": 0.3671430962876753, + "grad_norm": 0.87109375, + "learning_rate": 2.1283399179162065e-05, + "loss": 0.8235, + "step": 8374 + }, + { + "epoch": 0.3671869395043325, + "grad_norm": 0.76953125, + "learning_rate": 2.1280559946137403e-05, + "loss": 0.7612, + "step": 8375 + }, + { + "epoch": 0.36723078272098975, + "grad_norm": 0.79296875, + "learning_rate": 2.1277720879954078e-05, + "loss": 0.7391, + "step": 8376 + }, + { + "epoch": 0.367274625937647, + "grad_norm": 0.7890625, + "learning_rate": 2.1274881980618144e-05, + "loss": 0.804, + "step": 8377 + }, + { + "epoch": 0.3673184691543042, + "grad_norm": 0.84375, + "learning_rate": 2.1272043248135598e-05, + "loss": 0.7444, + "step": 8378 + }, + { + "epoch": 0.36736231237096145, + "grad_norm": 0.8515625, + "learning_rate": 2.1269204682512454e-05, + "loss": 0.8281, + "step": 8379 + }, + { + "epoch": 0.3674061555876187, + "grad_norm": 0.7578125, + "learning_rate": 2.1266366283754723e-05, + "loss": 0.7837, + "step": 8380 + }, + { + "epoch": 0.3674499988042759, + "grad_norm": 0.80859375, + "learning_rate": 2.1263528051868443e-05, + "loss": 0.875, + "step": 8381 + }, + { + "epoch": 0.36749384202093316, + "grad_norm": 0.84765625, + "learning_rate": 2.1260689986859582e-05, + "loss": 0.8277, + "step": 8382 + }, + { + "epoch": 0.3675376852375904, + "grad_norm": 0.8359375, + "learning_rate": 2.125785208873422e-05, + "loss": 0.7651, + "step": 8383 + }, + { + "epoch": 0.36758152845424763, + "grad_norm": 1.78125, + "learning_rate": 2.1255014357498337e-05, + "loss": 0.8842, + "step": 8384 + }, + { + "epoch": 0.36762537167090487, + "grad_norm": 0.765625, + "learning_rate": 2.1252176793157942e-05, + "loss": 0.7956, + "step": 8385 + }, + { + "epoch": 0.3676692148875621, + "grad_norm": 0.8671875, + "learning_rate": 2.124933939571907e-05, + "loss": 0.7262, + "step": 8386 + }, + { + "epoch": 0.36771305810421934, + "grad_norm": 0.859375, + "learning_rate": 2.124650216518771e-05, + "loss": 0.7821, + "step": 8387 + }, + { + "epoch": 0.3677569013208766, + "grad_norm": 0.84375, + "learning_rate": 2.1243665101569898e-05, + "loss": 0.9329, + "step": 8388 + }, + { + "epoch": 0.36780074453753375, + "grad_norm": 0.734375, + "learning_rate": 2.1240828204871634e-05, + "loss": 0.6737, + "step": 8389 + }, + { + "epoch": 0.367844587754191, + "grad_norm": 0.8359375, + "learning_rate": 2.1237991475098928e-05, + "loss": 0.7183, + "step": 8390 + }, + { + "epoch": 0.3678884309708482, + "grad_norm": 0.8203125, + "learning_rate": 2.1235154912257805e-05, + "loss": 0.7695, + "step": 8391 + }, + { + "epoch": 0.36793227418750546, + "grad_norm": 0.90234375, + "learning_rate": 2.123231851635423e-05, + "loss": 0.8227, + "step": 8392 + }, + { + "epoch": 0.3679761174041627, + "grad_norm": 0.80859375, + "learning_rate": 2.1229482287394286e-05, + "loss": 0.8607, + "step": 8393 + }, + { + "epoch": 0.36801996062081993, + "grad_norm": 0.83203125, + "learning_rate": 2.122664622538396e-05, + "loss": 0.7893, + "step": 8394 + }, + { + "epoch": 0.36806380383747717, + "grad_norm": 0.80078125, + "learning_rate": 2.1223810330329252e-05, + "loss": 0.8489, + "step": 8395 + }, + { + "epoch": 0.3681076470541344, + "grad_norm": 0.8125, + "learning_rate": 2.1220974602236164e-05, + "loss": 0.7406, + "step": 8396 + }, + { + "epoch": 0.36815149027079164, + "grad_norm": 0.77734375, + "learning_rate": 2.1218139041110706e-05, + "loss": 0.7933, + "step": 8397 + }, + { + "epoch": 0.3681953334874489, + "grad_norm": 0.8828125, + "learning_rate": 2.1215303646958918e-05, + "loss": 0.7671, + "step": 8398 + }, + { + "epoch": 0.3682391767041061, + "grad_norm": 0.859375, + "learning_rate": 2.121246841978679e-05, + "loss": 0.8351, + "step": 8399 + }, + { + "epoch": 0.36828301992076334, + "grad_norm": 0.78125, + "learning_rate": 2.1209633359600334e-05, + "loss": 0.7542, + "step": 8400 + }, + { + "epoch": 0.3683268631374206, + "grad_norm": 0.75, + "learning_rate": 2.1206798466405553e-05, + "loss": 0.7076, + "step": 8401 + }, + { + "epoch": 0.3683707063540778, + "grad_norm": 0.859375, + "learning_rate": 2.1203963740208433e-05, + "loss": 0.9357, + "step": 8402 + }, + { + "epoch": 0.36841454957073505, + "grad_norm": 0.875, + "learning_rate": 2.1201129181015046e-05, + "loss": 0.8229, + "step": 8403 + }, + { + "epoch": 0.3684583927873923, + "grad_norm": 1.03125, + "learning_rate": 2.1198294788831352e-05, + "loss": 0.7687, + "step": 8404 + }, + { + "epoch": 0.3685022360040495, + "grad_norm": 0.8671875, + "learning_rate": 2.119546056366337e-05, + "loss": 0.7921, + "step": 8405 + }, + { + "epoch": 0.36854607922070676, + "grad_norm": 0.92578125, + "learning_rate": 2.11926265055171e-05, + "loss": 0.814, + "step": 8406 + }, + { + "epoch": 0.368589922437364, + "grad_norm": 0.85546875, + "learning_rate": 2.118979261439853e-05, + "loss": 0.723, + "step": 8407 + }, + { + "epoch": 0.36863376565402123, + "grad_norm": 0.7890625, + "learning_rate": 2.1186958890313723e-05, + "loss": 0.7478, + "step": 8408 + }, + { + "epoch": 0.36867760887067846, + "grad_norm": 0.953125, + "learning_rate": 2.1184125333268645e-05, + "loss": 0.8419, + "step": 8409 + }, + { + "epoch": 0.3687214520873357, + "grad_norm": 0.82421875, + "learning_rate": 2.118129194326931e-05, + "loss": 0.8787, + "step": 8410 + }, + { + "epoch": 0.36876529530399293, + "grad_norm": 0.78515625, + "learning_rate": 2.1178458720321727e-05, + "loss": 0.7336, + "step": 8411 + }, + { + "epoch": 0.36880913852065017, + "grad_norm": 0.82421875, + "learning_rate": 2.1175625664431864e-05, + "loss": 0.8969, + "step": 8412 + }, + { + "epoch": 0.3688529817373074, + "grad_norm": 0.81640625, + "learning_rate": 2.1172792775605776e-05, + "loss": 0.8624, + "step": 8413 + }, + { + "epoch": 0.36889682495396464, + "grad_norm": 0.76953125, + "learning_rate": 2.116996005384946e-05, + "loss": 0.7371, + "step": 8414 + }, + { + "epoch": 0.3689406681706219, + "grad_norm": 0.84375, + "learning_rate": 2.1167127499168903e-05, + "loss": 0.8491, + "step": 8415 + }, + { + "epoch": 0.3689845113872791, + "grad_norm": 0.8359375, + "learning_rate": 2.116429511157012e-05, + "loss": 0.8622, + "step": 8416 + }, + { + "epoch": 0.36902835460393635, + "grad_norm": 0.79296875, + "learning_rate": 2.116146289105907e-05, + "loss": 0.8176, + "step": 8417 + }, + { + "epoch": 0.3690721978205936, + "grad_norm": 0.8984375, + "learning_rate": 2.1158630837641825e-05, + "loss": 0.7847, + "step": 8418 + }, + { + "epoch": 0.36911604103725076, + "grad_norm": 0.796875, + "learning_rate": 2.1155798951324356e-05, + "loss": 0.7276, + "step": 8419 + }, + { + "epoch": 0.369159884253908, + "grad_norm": 0.83203125, + "learning_rate": 2.1152967232112664e-05, + "loss": 0.8646, + "step": 8420 + }, + { + "epoch": 0.36920372747056523, + "grad_norm": 0.88671875, + "learning_rate": 2.115013568001275e-05, + "loss": 0.7717, + "step": 8421 + }, + { + "epoch": 0.36924757068722247, + "grad_norm": 0.796875, + "learning_rate": 2.114730429503059e-05, + "loss": 0.777, + "step": 8422 + }, + { + "epoch": 0.3692914139038797, + "grad_norm": 0.9140625, + "learning_rate": 2.1144473077172244e-05, + "loss": 0.9453, + "step": 8423 + }, + { + "epoch": 0.36933525712053694, + "grad_norm": 0.78515625, + "learning_rate": 2.1141642026443677e-05, + "loss": 0.7633, + "step": 8424 + }, + { + "epoch": 0.3693791003371942, + "grad_norm": 0.7734375, + "learning_rate": 2.113881114285089e-05, + "loss": 0.7839, + "step": 8425 + }, + { + "epoch": 0.3694229435538514, + "grad_norm": 0.828125, + "learning_rate": 2.1135980426399883e-05, + "loss": 0.8444, + "step": 8426 + }, + { + "epoch": 0.36946678677050865, + "grad_norm": 0.84765625, + "learning_rate": 2.113314987709667e-05, + "loss": 0.8935, + "step": 8427 + }, + { + "epoch": 0.3695106299871659, + "grad_norm": 0.87890625, + "learning_rate": 2.1130319494947225e-05, + "loss": 0.9231, + "step": 8428 + }, + { + "epoch": 0.3695544732038231, + "grad_norm": 0.80859375, + "learning_rate": 2.1127489279957536e-05, + "loss": 0.7901, + "step": 8429 + }, + { + "epoch": 0.36959831642048036, + "grad_norm": 0.84765625, + "learning_rate": 2.1124659232133647e-05, + "loss": 0.9202, + "step": 8430 + }, + { + "epoch": 0.3696421596371376, + "grad_norm": 0.81640625, + "learning_rate": 2.1121829351481547e-05, + "loss": 0.7719, + "step": 8431 + }, + { + "epoch": 0.3696860028537948, + "grad_norm": 0.8125, + "learning_rate": 2.111899963800722e-05, + "loss": 0.7873, + "step": 8432 + }, + { + "epoch": 0.36972984607045206, + "grad_norm": 0.86328125, + "learning_rate": 2.1116170091716657e-05, + "loss": 0.8399, + "step": 8433 + }, + { + "epoch": 0.3697736892871093, + "grad_norm": 0.73828125, + "learning_rate": 2.111334071261586e-05, + "loss": 0.7811, + "step": 8434 + }, + { + "epoch": 0.36981753250376653, + "grad_norm": 0.80078125, + "learning_rate": 2.111051150071084e-05, + "loss": 0.9122, + "step": 8435 + }, + { + "epoch": 0.36986137572042377, + "grad_norm": 0.79296875, + "learning_rate": 2.1107682456007572e-05, + "loss": 0.7291, + "step": 8436 + }, + { + "epoch": 0.369905218937081, + "grad_norm": 1.9765625, + "learning_rate": 2.110485357851204e-05, + "loss": 0.86, + "step": 8437 + }, + { + "epoch": 0.36994906215373824, + "grad_norm": 1.0234375, + "learning_rate": 2.1102024868230287e-05, + "loss": 0.77, + "step": 8438 + }, + { + "epoch": 0.3699929053703955, + "grad_norm": 0.8125, + "learning_rate": 2.109919632516828e-05, + "loss": 0.7559, + "step": 8439 + }, + { + "epoch": 0.3700367485870527, + "grad_norm": 0.77734375, + "learning_rate": 2.109636794933202e-05, + "loss": 0.6849, + "step": 8440 + }, + { + "epoch": 0.37008059180370995, + "grad_norm": 0.78125, + "learning_rate": 2.1093539740727497e-05, + "loss": 0.8643, + "step": 8441 + }, + { + "epoch": 0.3701244350203672, + "grad_norm": 0.84765625, + "learning_rate": 2.109071169936069e-05, + "loss": 0.8375, + "step": 8442 + }, + { + "epoch": 0.3701682782370244, + "grad_norm": 0.81640625, + "learning_rate": 2.108788382523762e-05, + "loss": 0.769, + "step": 8443 + }, + { + "epoch": 0.37021212145368165, + "grad_norm": 0.921875, + "learning_rate": 2.1085056118364278e-05, + "loss": 0.929, + "step": 8444 + }, + { + "epoch": 0.3702559646703389, + "grad_norm": 0.87890625, + "learning_rate": 2.1082228578746645e-05, + "loss": 0.7835, + "step": 8445 + }, + { + "epoch": 0.3702998078869961, + "grad_norm": 0.859375, + "learning_rate": 2.1079401206390724e-05, + "loss": 0.8499, + "step": 8446 + }, + { + "epoch": 0.37034365110365336, + "grad_norm": 0.83203125, + "learning_rate": 2.1076574001302462e-05, + "loss": 0.6838, + "step": 8447 + }, + { + "epoch": 0.3703874943203106, + "grad_norm": 0.7734375, + "learning_rate": 2.107374696348793e-05, + "loss": 0.804, + "step": 8448 + }, + { + "epoch": 0.37043133753696783, + "grad_norm": 0.73828125, + "learning_rate": 2.1070920092953074e-05, + "loss": 0.7035, + "step": 8449 + }, + { + "epoch": 0.370475180753625, + "grad_norm": 0.72265625, + "learning_rate": 2.106809338970389e-05, + "loss": 0.7703, + "step": 8450 + }, + { + "epoch": 0.37051902397028225, + "grad_norm": 0.796875, + "learning_rate": 2.1065266853746378e-05, + "loss": 0.759, + "step": 8451 + }, + { + "epoch": 0.3705628671869395, + "grad_norm": 0.765625, + "learning_rate": 2.106244048508649e-05, + "loss": 0.7533, + "step": 8452 + }, + { + "epoch": 0.3706067104035967, + "grad_norm": 0.90625, + "learning_rate": 2.1059614283730276e-05, + "loss": 0.7953, + "step": 8453 + }, + { + "epoch": 0.37065055362025395, + "grad_norm": 0.83203125, + "learning_rate": 2.105678824968369e-05, + "loss": 0.7906, + "step": 8454 + }, + { + "epoch": 0.3706943968369112, + "grad_norm": 0.9375, + "learning_rate": 2.105396238295274e-05, + "loss": 0.8451, + "step": 8455 + }, + { + "epoch": 0.3707382400535684, + "grad_norm": 0.81640625, + "learning_rate": 2.1051136683543406e-05, + "loss": 0.8663, + "step": 8456 + }, + { + "epoch": 0.37078208327022566, + "grad_norm": 0.76953125, + "learning_rate": 2.104831115146163e-05, + "loss": 0.8059, + "step": 8457 + }, + { + "epoch": 0.3708259264868829, + "grad_norm": 0.83984375, + "learning_rate": 2.104548578671349e-05, + "loss": 0.8703, + "step": 8458 + }, + { + "epoch": 0.37086976970354013, + "grad_norm": 0.83984375, + "learning_rate": 2.104266058930493e-05, + "loss": 0.6932, + "step": 8459 + }, + { + "epoch": 0.37091361292019737, + "grad_norm": 0.8671875, + "learning_rate": 2.1039835559241927e-05, + "loss": 0.8648, + "step": 8460 + }, + { + "epoch": 0.3709574561368546, + "grad_norm": 0.93359375, + "learning_rate": 2.103701069653049e-05, + "loss": 0.7422, + "step": 8461 + }, + { + "epoch": 0.37100129935351184, + "grad_norm": 0.8203125, + "learning_rate": 2.103418600117656e-05, + "loss": 0.7714, + "step": 8462 + }, + { + "epoch": 0.3710451425701691, + "grad_norm": 0.85546875, + "learning_rate": 2.1031361473186184e-05, + "loss": 0.795, + "step": 8463 + }, + { + "epoch": 0.3710889857868263, + "grad_norm": 0.84375, + "learning_rate": 2.1028537112565337e-05, + "loss": 0.8163, + "step": 8464 + }, + { + "epoch": 0.37113282900348354, + "grad_norm": 0.86328125, + "learning_rate": 2.1025712919319983e-05, + "loss": 0.6979, + "step": 8465 + }, + { + "epoch": 0.3711766722201408, + "grad_norm": 0.8203125, + "learning_rate": 2.1022888893456115e-05, + "loss": 0.7001, + "step": 8466 + }, + { + "epoch": 0.371220515436798, + "grad_norm": 0.8671875, + "learning_rate": 2.1020065034979696e-05, + "loss": 0.8384, + "step": 8467 + }, + { + "epoch": 0.37126435865345525, + "grad_norm": 0.75390625, + "learning_rate": 2.1017241343896764e-05, + "loss": 0.8787, + "step": 8468 + }, + { + "epoch": 0.3713082018701125, + "grad_norm": 0.82421875, + "learning_rate": 2.101441782021327e-05, + "loss": 0.9248, + "step": 8469 + }, + { + "epoch": 0.3713520450867697, + "grad_norm": 0.8125, + "learning_rate": 2.1011594463935203e-05, + "loss": 0.8274, + "step": 8470 + }, + { + "epoch": 0.37139588830342696, + "grad_norm": 0.765625, + "learning_rate": 2.100877127506855e-05, + "loss": 0.8452, + "step": 8471 + }, + { + "epoch": 0.3714397315200842, + "grad_norm": 0.74609375, + "learning_rate": 2.100594825361929e-05, + "loss": 0.773, + "step": 8472 + }, + { + "epoch": 0.37148357473674143, + "grad_norm": 0.9140625, + "learning_rate": 2.1003125399593414e-05, + "loss": 0.8959, + "step": 8473 + }, + { + "epoch": 0.37152741795339866, + "grad_norm": 0.84375, + "learning_rate": 2.100030271299689e-05, + "loss": 0.8048, + "step": 8474 + }, + { + "epoch": 0.3715712611700559, + "grad_norm": 0.796875, + "learning_rate": 2.0997480193835706e-05, + "loss": 0.7266, + "step": 8475 + }, + { + "epoch": 0.37161510438671314, + "grad_norm": 0.74609375, + "learning_rate": 2.099465784211586e-05, + "loss": 0.6987, + "step": 8476 + }, + { + "epoch": 0.37165894760337037, + "grad_norm": 0.83203125, + "learning_rate": 2.0991835657843284e-05, + "loss": 0.8571, + "step": 8477 + }, + { + "epoch": 0.3717027908200276, + "grad_norm": 0.7890625, + "learning_rate": 2.0989013641024025e-05, + "loss": 0.8003, + "step": 8478 + }, + { + "epoch": 0.37174663403668484, + "grad_norm": 0.83203125, + "learning_rate": 2.0986191791664034e-05, + "loss": 0.7855, + "step": 8479 + }, + { + "epoch": 0.371790477253342, + "grad_norm": 0.87890625, + "learning_rate": 2.0983370109769294e-05, + "loss": 0.8133, + "step": 8480 + }, + { + "epoch": 0.37183432046999926, + "grad_norm": 0.765625, + "learning_rate": 2.0980548595345796e-05, + "loss": 0.7003, + "step": 8481 + }, + { + "epoch": 0.3718781636866565, + "grad_norm": 0.80078125, + "learning_rate": 2.097772724839947e-05, + "loss": 0.7361, + "step": 8482 + }, + { + "epoch": 0.37192200690331373, + "grad_norm": 0.82421875, + "learning_rate": 2.097490606893636e-05, + "loss": 0.7761, + "step": 8483 + }, + { + "epoch": 0.37196585011997096, + "grad_norm": 0.83984375, + "learning_rate": 2.097208505696242e-05, + "loss": 0.8115, + "step": 8484 + }, + { + "epoch": 0.3720096933366282, + "grad_norm": 0.83984375, + "learning_rate": 2.096926421248364e-05, + "loss": 0.719, + "step": 8485 + }, + { + "epoch": 0.37205353655328544, + "grad_norm": 0.8359375, + "learning_rate": 2.0966443535505977e-05, + "loss": 0.8499, + "step": 8486 + }, + { + "epoch": 0.37209737976994267, + "grad_norm": 0.7890625, + "learning_rate": 2.0963623026035394e-05, + "loss": 0.8316, + "step": 8487 + }, + { + "epoch": 0.3721412229865999, + "grad_norm": 0.76953125, + "learning_rate": 2.096080268407792e-05, + "loss": 0.7709, + "step": 8488 + }, + { + "epoch": 0.37218506620325714, + "grad_norm": 0.828125, + "learning_rate": 2.0957982509639506e-05, + "loss": 0.7787, + "step": 8489 + }, + { + "epoch": 0.3722289094199144, + "grad_norm": 1.109375, + "learning_rate": 2.0955162502726135e-05, + "loss": 0.8446, + "step": 8490 + }, + { + "epoch": 0.3722727526365716, + "grad_norm": 1.1015625, + "learning_rate": 2.0952342663343784e-05, + "loss": 0.7645, + "step": 8491 + }, + { + "epoch": 0.37231659585322885, + "grad_norm": 0.76953125, + "learning_rate": 2.0949522991498394e-05, + "loss": 0.7713, + "step": 8492 + }, + { + "epoch": 0.3723604390698861, + "grad_norm": 0.8984375, + "learning_rate": 2.0946703487195996e-05, + "loss": 0.8086, + "step": 8493 + }, + { + "epoch": 0.3724042822865433, + "grad_norm": 0.73046875, + "learning_rate": 2.094388415044254e-05, + "loss": 0.8498, + "step": 8494 + }, + { + "epoch": 0.37244812550320056, + "grad_norm": 0.82421875, + "learning_rate": 2.0941064981243996e-05, + "loss": 0.8256, + "step": 8495 + }, + { + "epoch": 0.3724919687198578, + "grad_norm": 0.8359375, + "learning_rate": 2.0938245979606362e-05, + "loss": 0.7735, + "step": 8496 + }, + { + "epoch": 0.372535811936515, + "grad_norm": 0.859375, + "learning_rate": 2.0935427145535558e-05, + "loss": 0.7489, + "step": 8497 + }, + { + "epoch": 0.37257965515317226, + "grad_norm": 0.78515625, + "learning_rate": 2.093260847903763e-05, + "loss": 0.838, + "step": 8498 + }, + { + "epoch": 0.3726234983698295, + "grad_norm": 0.80078125, + "learning_rate": 2.092978998011852e-05, + "loss": 0.8883, + "step": 8499 + }, + { + "epoch": 0.37266734158648673, + "grad_norm": 0.859375, + "learning_rate": 2.09269716487842e-05, + "loss": 0.9351, + "step": 8500 + } + ], + "logging_steps": 1, + "max_steps": 22809, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5575446439363766e+20, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}