diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,91137 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0505262732580456, + "eval_steps": 1000, + "global_step": 13000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.080971332754196e-05, + "grad_norm": 6.690799236297607, + "learning_rate": 1.3458950201884254e-08, + "loss": 1.248, + "step": 1 + }, + { + "epoch": 0.00016161942665508393, + "grad_norm": 5.96121883392334, + "learning_rate": 2.6917900403768507e-08, + "loss": 1.1711, + "step": 2 + }, + { + "epoch": 0.00024242913998262592, + "grad_norm": 7.078202247619629, + "learning_rate": 4.037685060565276e-08, + "loss": 1.3206, + "step": 3 + }, + { + "epoch": 0.00032323885331016786, + "grad_norm": 13.463970184326172, + "learning_rate": 5.3835800807537014e-08, + "loss": 1.1678, + "step": 4 + }, + { + "epoch": 0.00040404856663770985, + "grad_norm": 8.267407417297363, + "learning_rate": 6.729475100942127e-08, + "loss": 1.2331, + "step": 5 + }, + { + "epoch": 0.00048485827996525184, + "grad_norm": 7.39365816116333, + "learning_rate": 8.075370121130552e-08, + "loss": 1.3377, + "step": 6 + }, + { + "epoch": 0.0005656679932927938, + "grad_norm": 7.1661763191223145, + "learning_rate": 9.421265141318978e-08, + "loss": 1.3193, + "step": 7 + }, + { + "epoch": 0.0006464777066203357, + "grad_norm": 15.627093315124512, + "learning_rate": 1.0767160161507403e-07, + "loss": 1.3257, + "step": 8 + }, + { + "epoch": 0.0007272874199478778, + "grad_norm": 8.536316871643066, + "learning_rate": 1.211305518169583e-07, + "loss": 1.2181, + "step": 9 + }, + { + "epoch": 0.0008080971332754197, + "grad_norm": 11.300825119018555, + "learning_rate": 1.3458950201884255e-07, + "loss": 1.3117, + "step": 10 + }, + { + "epoch": 0.0008889068466029616, + "grad_norm": 13.28189754486084, + "learning_rate": 1.480484522207268e-07, + "loss": 1.2713, + "step": 11 + }, + { + "epoch": 0.0009697165599305037, + "grad_norm": 10.544150352478027, + "learning_rate": 1.6150740242261104e-07, + "loss": 1.1934, + "step": 12 + }, + { + "epoch": 0.0010505262732580457, + "grad_norm": 16.782533645629883, + "learning_rate": 1.7496635262449528e-07, + "loss": 1.3952, + "step": 13 + }, + { + "epoch": 0.0011313359865855876, + "grad_norm": 6.385349750518799, + "learning_rate": 1.8842530282637956e-07, + "loss": 1.2814, + "step": 14 + }, + { + "epoch": 0.0012121456999131296, + "grad_norm": 12.990242958068848, + "learning_rate": 2.018842530282638e-07, + "loss": 1.3054, + "step": 15 + }, + { + "epoch": 0.0012929554132406714, + "grad_norm": 9.732259750366211, + "learning_rate": 2.1534320323014806e-07, + "loss": 1.2828, + "step": 16 + }, + { + "epoch": 0.0013737651265682135, + "grad_norm": 21.376018524169922, + "learning_rate": 2.2880215343203232e-07, + "loss": 1.2997, + "step": 17 + }, + { + "epoch": 0.0014545748398957555, + "grad_norm": 7.185873031616211, + "learning_rate": 2.422611036339166e-07, + "loss": 1.2679, + "step": 18 + }, + { + "epoch": 0.0015353845532232973, + "grad_norm": 5.8356242179870605, + "learning_rate": 2.557200538358008e-07, + "loss": 1.2401, + "step": 19 + }, + { + "epoch": 0.0016161942665508394, + "grad_norm": 6.466715335845947, + "learning_rate": 2.691790040376851e-07, + "loss": 1.1889, + "step": 20 + }, + { + "epoch": 0.0016970039798783814, + "grad_norm": 6.673277378082275, + "learning_rate": 2.8263795423956933e-07, + "loss": 1.2365, + "step": 21 + }, + { + "epoch": 0.0017778136932059233, + "grad_norm": 12.293954849243164, + "learning_rate": 2.960969044414536e-07, + "loss": 1.1535, + "step": 22 + }, + { + "epoch": 0.0018586234065334653, + "grad_norm": 6.82350492477417, + "learning_rate": 3.0955585464333785e-07, + "loss": 1.3097, + "step": 23 + }, + { + "epoch": 0.0019394331198610074, + "grad_norm": 8.443634986877441, + "learning_rate": 3.230148048452221e-07, + "loss": 1.1801, + "step": 24 + }, + { + "epoch": 0.002020242833188549, + "grad_norm": 7.782437324523926, + "learning_rate": 3.3647375504710637e-07, + "loss": 1.1593, + "step": 25 + }, + { + "epoch": 0.0021010525465160915, + "grad_norm": 11.640141487121582, + "learning_rate": 3.4993270524899055e-07, + "loss": 1.1689, + "step": 26 + }, + { + "epoch": 0.0021818622598436333, + "grad_norm": 6.59988260269165, + "learning_rate": 3.6339165545087484e-07, + "loss": 1.2397, + "step": 27 + }, + { + "epoch": 0.002262671973171175, + "grad_norm": 5.805958271026611, + "learning_rate": 3.768506056527591e-07, + "loss": 1.2492, + "step": 28 + }, + { + "epoch": 0.002343481686498717, + "grad_norm": 6.94270658493042, + "learning_rate": 3.903095558546434e-07, + "loss": 1.1362, + "step": 29 + }, + { + "epoch": 0.002424291399826259, + "grad_norm": 6.733250141143799, + "learning_rate": 4.037685060565276e-07, + "loss": 1.2791, + "step": 30 + }, + { + "epoch": 0.002505101113153801, + "grad_norm": 6.11829948425293, + "learning_rate": 4.172274562584119e-07, + "loss": 1.2586, + "step": 31 + }, + { + "epoch": 0.002585910826481343, + "grad_norm": 10.662686347961426, + "learning_rate": 4.306864064602961e-07, + "loss": 1.31, + "step": 32 + }, + { + "epoch": 0.002666720539808885, + "grad_norm": 6.052074909210205, + "learning_rate": 4.441453566621804e-07, + "loss": 1.3037, + "step": 33 + }, + { + "epoch": 0.002747530253136427, + "grad_norm": 5.352566242218018, + "learning_rate": 4.5760430686406463e-07, + "loss": 1.2243, + "step": 34 + }, + { + "epoch": 0.0028283399664639688, + "grad_norm": 5.09272575378418, + "learning_rate": 4.7106325706594887e-07, + "loss": 1.1706, + "step": 35 + }, + { + "epoch": 0.002909149679791511, + "grad_norm": 4.496048450469971, + "learning_rate": 4.845222072678332e-07, + "loss": 1.2023, + "step": 36 + }, + { + "epoch": 0.002989959393119053, + "grad_norm": 4.400938034057617, + "learning_rate": 4.979811574697174e-07, + "loss": 1.245, + "step": 37 + }, + { + "epoch": 0.0030707691064465947, + "grad_norm": 5.95123291015625, + "learning_rate": 5.114401076716016e-07, + "loss": 1.1342, + "step": 38 + }, + { + "epoch": 0.003151578819774137, + "grad_norm": 5.758495330810547, + "learning_rate": 5.248990578734859e-07, + "loss": 1.2793, + "step": 39 + }, + { + "epoch": 0.003232388533101679, + "grad_norm": 4.290956020355225, + "learning_rate": 5.383580080753702e-07, + "loss": 1.2222, + "step": 40 + }, + { + "epoch": 0.0033131982464292206, + "grad_norm": 5.054476261138916, + "learning_rate": 5.518169582772545e-07, + "loss": 1.2194, + "step": 41 + }, + { + "epoch": 0.003394007959756763, + "grad_norm": 4.21449613571167, + "learning_rate": 5.652759084791387e-07, + "loss": 1.168, + "step": 42 + }, + { + "epoch": 0.0034748176730843047, + "grad_norm": 4.471845626831055, + "learning_rate": 5.78734858681023e-07, + "loss": 1.139, + "step": 43 + }, + { + "epoch": 0.0035556273864118465, + "grad_norm": 4.471460342407227, + "learning_rate": 5.921938088829072e-07, + "loss": 1.1644, + "step": 44 + }, + { + "epoch": 0.003636437099739389, + "grad_norm": 5.968162536621094, + "learning_rate": 6.056527590847914e-07, + "loss": 1.2744, + "step": 45 + }, + { + "epoch": 0.0037172468130669306, + "grad_norm": 4.4738664627075195, + "learning_rate": 6.191117092866757e-07, + "loss": 1.2473, + "step": 46 + }, + { + "epoch": 0.0037980565263944725, + "grad_norm": 4.90024471282959, + "learning_rate": 6.3257065948856e-07, + "loss": 1.1564, + "step": 47 + }, + { + "epoch": 0.0038788662397220147, + "grad_norm": 3.56142520904541, + "learning_rate": 6.460296096904442e-07, + "loss": 1.2167, + "step": 48 + }, + { + "epoch": 0.003959675953049557, + "grad_norm": 4.147864818572998, + "learning_rate": 6.594885598923285e-07, + "loss": 1.2479, + "step": 49 + }, + { + "epoch": 0.004040485666377098, + "grad_norm": 4.415273189544678, + "learning_rate": 6.729475100942127e-07, + "loss": 1.2028, + "step": 50 + }, + { + "epoch": 0.004121295379704641, + "grad_norm": 3.895045757293701, + "learning_rate": 6.864064602960969e-07, + "loss": 1.061, + "step": 51 + }, + { + "epoch": 0.004202105093032183, + "grad_norm": 3.7078940868377686, + "learning_rate": 6.998654104979811e-07, + "loss": 1.0716, + "step": 52 + }, + { + "epoch": 0.004282914806359724, + "grad_norm": 3.556968927383423, + "learning_rate": 7.133243606998655e-07, + "loss": 1.2259, + "step": 53 + }, + { + "epoch": 0.0043637245196872666, + "grad_norm": 4.950558185577393, + "learning_rate": 7.267833109017497e-07, + "loss": 1.1629, + "step": 54 + }, + { + "epoch": 0.004444534233014808, + "grad_norm": 4.822227478027344, + "learning_rate": 7.402422611036341e-07, + "loss": 1.1893, + "step": 55 + }, + { + "epoch": 0.00452534394634235, + "grad_norm": 4.09580135345459, + "learning_rate": 7.537012113055183e-07, + "loss": 1.1445, + "step": 56 + }, + { + "epoch": 0.0046061536596698925, + "grad_norm": 3.3497204780578613, + "learning_rate": 7.671601615074024e-07, + "loss": 1.1026, + "step": 57 + }, + { + "epoch": 0.004686963372997434, + "grad_norm": 5.070769309997559, + "learning_rate": 7.806191117092868e-07, + "loss": 1.0559, + "step": 58 + }, + { + "epoch": 0.004767773086324976, + "grad_norm": 3.769427537918091, + "learning_rate": 7.94078061911171e-07, + "loss": 1.1608, + "step": 59 + }, + { + "epoch": 0.004848582799652518, + "grad_norm": 3.896399736404419, + "learning_rate": 8.075370121130552e-07, + "loss": 1.2354, + "step": 60 + }, + { + "epoch": 0.00492939251298006, + "grad_norm": 4.363215446472168, + "learning_rate": 8.209959623149396e-07, + "loss": 1.2881, + "step": 61 + }, + { + "epoch": 0.005010202226307602, + "grad_norm": 3.7588884830474854, + "learning_rate": 8.344549125168238e-07, + "loss": 1.0485, + "step": 62 + }, + { + "epoch": 0.005091011939635144, + "grad_norm": 4.2055182456970215, + "learning_rate": 8.47913862718708e-07, + "loss": 1.0611, + "step": 63 + }, + { + "epoch": 0.005171821652962686, + "grad_norm": 3.6553685665130615, + "learning_rate": 8.613728129205922e-07, + "loss": 1.0145, + "step": 64 + }, + { + "epoch": 0.005252631366290228, + "grad_norm": 4.0810699462890625, + "learning_rate": 8.748317631224765e-07, + "loss": 1.1216, + "step": 65 + }, + { + "epoch": 0.00533344107961777, + "grad_norm": 3.6002376079559326, + "learning_rate": 8.882907133243608e-07, + "loss": 1.1444, + "step": 66 + }, + { + "epoch": 0.005414250792945312, + "grad_norm": 3.773752212524414, + "learning_rate": 9.01749663526245e-07, + "loss": 1.1017, + "step": 67 + }, + { + "epoch": 0.005495060506272854, + "grad_norm": 3.276829719543457, + "learning_rate": 9.152086137281293e-07, + "loss": 1.1463, + "step": 68 + }, + { + "epoch": 0.005575870219600396, + "grad_norm": 3.6140477657318115, + "learning_rate": 9.286675639300136e-07, + "loss": 1.1648, + "step": 69 + }, + { + "epoch": 0.0056566799329279376, + "grad_norm": 4.192599296569824, + "learning_rate": 9.421265141318977e-07, + "loss": 1.1997, + "step": 70 + }, + { + "epoch": 0.00573748964625548, + "grad_norm": 3.8302218914031982, + "learning_rate": 9.55585464333782e-07, + "loss": 1.0759, + "step": 71 + }, + { + "epoch": 0.005818299359583022, + "grad_norm": 3.8942480087280273, + "learning_rate": 9.690444145356663e-07, + "loss": 1.0674, + "step": 72 + }, + { + "epoch": 0.0058991090729105635, + "grad_norm": 3.1199991703033447, + "learning_rate": 9.825033647375506e-07, + "loss": 1.1144, + "step": 73 + }, + { + "epoch": 0.005979918786238106, + "grad_norm": 3.5112509727478027, + "learning_rate": 9.959623149394349e-07, + "loss": 1.1946, + "step": 74 + }, + { + "epoch": 0.006060728499565648, + "grad_norm": 3.831355333328247, + "learning_rate": 1.009421265141319e-06, + "loss": 1.0892, + "step": 75 + }, + { + "epoch": 0.006141538212893189, + "grad_norm": 3.520054340362549, + "learning_rate": 1.0228802153432032e-06, + "loss": 1.1438, + "step": 76 + }, + { + "epoch": 0.006222347926220732, + "grad_norm": 3.0068299770355225, + "learning_rate": 1.0363391655450875e-06, + "loss": 1.0978, + "step": 77 + }, + { + "epoch": 0.006303157639548274, + "grad_norm": 3.718480110168457, + "learning_rate": 1.0497981157469718e-06, + "loss": 1.0832, + "step": 78 + }, + { + "epoch": 0.006383967352875815, + "grad_norm": 3.4276466369628906, + "learning_rate": 1.063257065948856e-06, + "loss": 1.243, + "step": 79 + }, + { + "epoch": 0.006464777066203358, + "grad_norm": 4.088370323181152, + "learning_rate": 1.0767160161507404e-06, + "loss": 1.1034, + "step": 80 + }, + { + "epoch": 0.0065455867795309, + "grad_norm": 4.904850482940674, + "learning_rate": 1.0901749663526245e-06, + "loss": 1.1799, + "step": 81 + }, + { + "epoch": 0.006626396492858441, + "grad_norm": 3.5775904655456543, + "learning_rate": 1.103633916554509e-06, + "loss": 0.9865, + "step": 82 + }, + { + "epoch": 0.0067072062061859835, + "grad_norm": 3.5812389850616455, + "learning_rate": 1.117092866756393e-06, + "loss": 1.0355, + "step": 83 + }, + { + "epoch": 0.006788015919513526, + "grad_norm": 3.087735891342163, + "learning_rate": 1.1305518169582773e-06, + "loss": 1.0554, + "step": 84 + }, + { + "epoch": 0.006868825632841067, + "grad_norm": 3.641606092453003, + "learning_rate": 1.1440107671601616e-06, + "loss": 1.199, + "step": 85 + }, + { + "epoch": 0.006949635346168609, + "grad_norm": 3.337639808654785, + "learning_rate": 1.157469717362046e-06, + "loss": 0.9961, + "step": 86 + }, + { + "epoch": 0.007030445059496152, + "grad_norm": 3.4842021465301514, + "learning_rate": 1.1709286675639302e-06, + "loss": 1.0043, + "step": 87 + }, + { + "epoch": 0.007111254772823693, + "grad_norm": 4.307300090789795, + "learning_rate": 1.1843876177658145e-06, + "loss": 1.1515, + "step": 88 + }, + { + "epoch": 0.007192064486151235, + "grad_norm": 3.455139636993408, + "learning_rate": 1.1978465679676985e-06, + "loss": 1.14, + "step": 89 + }, + { + "epoch": 0.007272874199478778, + "grad_norm": 3.4302618503570557, + "learning_rate": 1.2113055181695828e-06, + "loss": 1.0239, + "step": 90 + }, + { + "epoch": 0.007353683912806319, + "grad_norm": 4.0325188636779785, + "learning_rate": 1.2247644683714671e-06, + "loss": 1.1421, + "step": 91 + }, + { + "epoch": 0.007434493626133861, + "grad_norm": 3.5034971237182617, + "learning_rate": 1.2382234185733514e-06, + "loss": 1.036, + "step": 92 + }, + { + "epoch": 0.0075153033394614035, + "grad_norm": 3.5903069972991943, + "learning_rate": 1.2516823687752355e-06, + "loss": 0.9697, + "step": 93 + }, + { + "epoch": 0.007596113052788945, + "grad_norm": 3.2864511013031006, + "learning_rate": 1.26514131897712e-06, + "loss": 1.1045, + "step": 94 + }, + { + "epoch": 0.007676922766116487, + "grad_norm": 3.3390679359436035, + "learning_rate": 1.2786002691790043e-06, + "loss": 1.1308, + "step": 95 + }, + { + "epoch": 0.0077577324794440294, + "grad_norm": 3.5311758518218994, + "learning_rate": 1.2920592193808883e-06, + "loss": 1.1261, + "step": 96 + }, + { + "epoch": 0.007838542192771572, + "grad_norm": 3.247596025466919, + "learning_rate": 1.3055181695827726e-06, + "loss": 1.1322, + "step": 97 + }, + { + "epoch": 0.007919351906099114, + "grad_norm": 4.039790630340576, + "learning_rate": 1.318977119784657e-06, + "loss": 1.0643, + "step": 98 + }, + { + "epoch": 0.008000161619426654, + "grad_norm": 3.3263556957244873, + "learning_rate": 1.3324360699865412e-06, + "loss": 1.2159, + "step": 99 + }, + { + "epoch": 0.008080971332754197, + "grad_norm": 2.9595468044281006, + "learning_rate": 1.3458950201884255e-06, + "loss": 1.0827, + "step": 100 + }, + { + "epoch": 0.008161781046081739, + "grad_norm": 3.3838250637054443, + "learning_rate": 1.3593539703903098e-06, + "loss": 1.1956, + "step": 101 + }, + { + "epoch": 0.008242590759409281, + "grad_norm": 3.3292553424835205, + "learning_rate": 1.3728129205921938e-06, + "loss": 1.1723, + "step": 102 + }, + { + "epoch": 0.008323400472736824, + "grad_norm": 3.0163986682891846, + "learning_rate": 1.3862718707940781e-06, + "loss": 1.1573, + "step": 103 + }, + { + "epoch": 0.008404210186064366, + "grad_norm": 3.532806158065796, + "learning_rate": 1.3997308209959622e-06, + "loss": 1.1104, + "step": 104 + }, + { + "epoch": 0.008485019899391906, + "grad_norm": 3.480621337890625, + "learning_rate": 1.4131897711978467e-06, + "loss": 1.0676, + "step": 105 + }, + { + "epoch": 0.008565829612719449, + "grad_norm": 3.874357223510742, + "learning_rate": 1.426648721399731e-06, + "loss": 1.1004, + "step": 106 + }, + { + "epoch": 0.00864663932604699, + "grad_norm": 3.192427635192871, + "learning_rate": 1.4401076716016153e-06, + "loss": 1.1254, + "step": 107 + }, + { + "epoch": 0.008727449039374533, + "grad_norm": 4.162766933441162, + "learning_rate": 1.4535666218034994e-06, + "loss": 0.9671, + "step": 108 + }, + { + "epoch": 0.008808258752702075, + "grad_norm": 3.8476390838623047, + "learning_rate": 1.4670255720053836e-06, + "loss": 0.9608, + "step": 109 + }, + { + "epoch": 0.008889068466029616, + "grad_norm": 2.9822797775268555, + "learning_rate": 1.4804845222072681e-06, + "loss": 1.2083, + "step": 110 + }, + { + "epoch": 0.008969878179357158, + "grad_norm": 3.5006701946258545, + "learning_rate": 1.4939434724091522e-06, + "loss": 1.1788, + "step": 111 + }, + { + "epoch": 0.0090506878926847, + "grad_norm": 3.982487678527832, + "learning_rate": 1.5074024226110365e-06, + "loss": 1.1051, + "step": 112 + }, + { + "epoch": 0.009131497606012243, + "grad_norm": 3.3723080158233643, + "learning_rate": 1.5208613728129206e-06, + "loss": 1.0072, + "step": 113 + }, + { + "epoch": 0.009212307319339785, + "grad_norm": 4.094737529754639, + "learning_rate": 1.5343203230148049e-06, + "loss": 1.1149, + "step": 114 + }, + { + "epoch": 0.009293117032667327, + "grad_norm": 3.6751790046691895, + "learning_rate": 1.5477792732166894e-06, + "loss": 1.114, + "step": 115 + }, + { + "epoch": 0.009373926745994868, + "grad_norm": 3.659663677215576, + "learning_rate": 1.5612382234185736e-06, + "loss": 1.1847, + "step": 116 + }, + { + "epoch": 0.00945473645932241, + "grad_norm": 4.006715297698975, + "learning_rate": 1.5746971736204577e-06, + "loss": 1.2139, + "step": 117 + }, + { + "epoch": 0.009535546172649952, + "grad_norm": 4.317342281341553, + "learning_rate": 1.588156123822342e-06, + "loss": 1.1472, + "step": 118 + }, + { + "epoch": 0.009616355885977495, + "grad_norm": 3.78163480758667, + "learning_rate": 1.601615074024226e-06, + "loss": 1.1659, + "step": 119 + }, + { + "epoch": 0.009697165599305037, + "grad_norm": 3.253537178039551, + "learning_rate": 1.6150740242261104e-06, + "loss": 1.1435, + "step": 120 + }, + { + "epoch": 0.009777975312632579, + "grad_norm": 3.281273603439331, + "learning_rate": 1.6285329744279949e-06, + "loss": 1.148, + "step": 121 + }, + { + "epoch": 0.00985878502596012, + "grad_norm": 3.442288875579834, + "learning_rate": 1.6419919246298792e-06, + "loss": 1.2081, + "step": 122 + }, + { + "epoch": 0.009939594739287662, + "grad_norm": 3.544372797012329, + "learning_rate": 1.6554508748317632e-06, + "loss": 1.1631, + "step": 123 + }, + { + "epoch": 0.010020404452615204, + "grad_norm": 3.7572271823883057, + "learning_rate": 1.6689098250336475e-06, + "loss": 1.0829, + "step": 124 + }, + { + "epoch": 0.010101214165942746, + "grad_norm": 3.042733907699585, + "learning_rate": 1.6823687752355316e-06, + "loss": 1.1737, + "step": 125 + }, + { + "epoch": 0.010182023879270289, + "grad_norm": 2.8813230991363525, + "learning_rate": 1.695827725437416e-06, + "loss": 1.0451, + "step": 126 + }, + { + "epoch": 0.010262833592597831, + "grad_norm": 3.256988525390625, + "learning_rate": 1.7092866756393004e-06, + "loss": 1.1688, + "step": 127 + }, + { + "epoch": 0.010343643305925371, + "grad_norm": 3.1898910999298096, + "learning_rate": 1.7227456258411845e-06, + "loss": 1.08, + "step": 128 + }, + { + "epoch": 0.010424453019252914, + "grad_norm": 3.8104963302612305, + "learning_rate": 1.7362045760430687e-06, + "loss": 1.0908, + "step": 129 + }, + { + "epoch": 0.010505262732580456, + "grad_norm": 3.439345121383667, + "learning_rate": 1.749663526244953e-06, + "loss": 1.0263, + "step": 130 + }, + { + "epoch": 0.010586072445907998, + "grad_norm": 4.058794021606445, + "learning_rate": 1.7631224764468375e-06, + "loss": 1.0531, + "step": 131 + }, + { + "epoch": 0.01066688215923554, + "grad_norm": 2.9343535900115967, + "learning_rate": 1.7765814266487216e-06, + "loss": 1.0031, + "step": 132 + }, + { + "epoch": 0.010747691872563083, + "grad_norm": 3.176175832748413, + "learning_rate": 1.7900403768506059e-06, + "loss": 1.1362, + "step": 133 + }, + { + "epoch": 0.010828501585890623, + "grad_norm": 3.4748244285583496, + "learning_rate": 1.80349932705249e-06, + "loss": 1.0497, + "step": 134 + }, + { + "epoch": 0.010909311299218166, + "grad_norm": 3.0900204181671143, + "learning_rate": 1.8169582772543742e-06, + "loss": 1.099, + "step": 135 + }, + { + "epoch": 0.010990121012545708, + "grad_norm": 3.0723509788513184, + "learning_rate": 1.8304172274562585e-06, + "loss": 1.0654, + "step": 136 + }, + { + "epoch": 0.01107093072587325, + "grad_norm": 2.9936509132385254, + "learning_rate": 1.8438761776581428e-06, + "loss": 1.0557, + "step": 137 + }, + { + "epoch": 0.011151740439200792, + "grad_norm": 2.9151737689971924, + "learning_rate": 1.8573351278600271e-06, + "loss": 1.1121, + "step": 138 + }, + { + "epoch": 0.011232550152528335, + "grad_norm": 3.0358784198760986, + "learning_rate": 1.8707940780619114e-06, + "loss": 1.1448, + "step": 139 + }, + { + "epoch": 0.011313359865855875, + "grad_norm": 3.3080246448516846, + "learning_rate": 1.8842530282637955e-06, + "loss": 1.1314, + "step": 140 + }, + { + "epoch": 0.011394169579183417, + "grad_norm": 3.5009357929229736, + "learning_rate": 1.8977119784656798e-06, + "loss": 0.9784, + "step": 141 + }, + { + "epoch": 0.01147497929251096, + "grad_norm": 3.626203775405884, + "learning_rate": 1.911170928667564e-06, + "loss": 1.0753, + "step": 142 + }, + { + "epoch": 0.011555789005838502, + "grad_norm": 3.3503258228302, + "learning_rate": 1.9246298788694483e-06, + "loss": 1.1486, + "step": 143 + }, + { + "epoch": 0.011636598719166044, + "grad_norm": 3.2445223331451416, + "learning_rate": 1.9380888290713326e-06, + "loss": 1.1295, + "step": 144 + }, + { + "epoch": 0.011717408432493586, + "grad_norm": 3.2183837890625, + "learning_rate": 1.951547779273217e-06, + "loss": 1.0644, + "step": 145 + }, + { + "epoch": 0.011798218145821127, + "grad_norm": 3.118913173675537, + "learning_rate": 1.965006729475101e-06, + "loss": 1.2114, + "step": 146 + }, + { + "epoch": 0.01187902785914867, + "grad_norm": 3.2243826389312744, + "learning_rate": 1.9784656796769855e-06, + "loss": 1.205, + "step": 147 + }, + { + "epoch": 0.011959837572476211, + "grad_norm": 2.848964214324951, + "learning_rate": 1.9919246298788698e-06, + "loss": 0.9954, + "step": 148 + }, + { + "epoch": 0.012040647285803754, + "grad_norm": 3.7954328060150146, + "learning_rate": 2.005383580080754e-06, + "loss": 1.0534, + "step": 149 + }, + { + "epoch": 0.012121456999131296, + "grad_norm": 2.8739662170410156, + "learning_rate": 2.018842530282638e-06, + "loss": 1.2154, + "step": 150 + }, + { + "epoch": 0.012202266712458838, + "grad_norm": 3.972341299057007, + "learning_rate": 2.032301480484522e-06, + "loss": 1.1231, + "step": 151 + }, + { + "epoch": 0.012283076425786379, + "grad_norm": 3.7189602851867676, + "learning_rate": 2.0457604306864065e-06, + "loss": 1.1857, + "step": 152 + }, + { + "epoch": 0.012363886139113921, + "grad_norm": 3.0613625049591064, + "learning_rate": 2.059219380888291e-06, + "loss": 1.0634, + "step": 153 + }, + { + "epoch": 0.012444695852441463, + "grad_norm": 3.0153865814208984, + "learning_rate": 2.072678331090175e-06, + "loss": 1.0734, + "step": 154 + }, + { + "epoch": 0.012525505565769006, + "grad_norm": 3.1202192306518555, + "learning_rate": 2.0861372812920593e-06, + "loss": 1.0793, + "step": 155 + }, + { + "epoch": 0.012606315279096548, + "grad_norm": 2.8822708129882812, + "learning_rate": 2.0995962314939436e-06, + "loss": 1.035, + "step": 156 + }, + { + "epoch": 0.01268712499242409, + "grad_norm": 3.4996628761291504, + "learning_rate": 2.113055181695828e-06, + "loss": 1.1336, + "step": 157 + }, + { + "epoch": 0.01276793470575163, + "grad_norm": 3.41062331199646, + "learning_rate": 2.126514131897712e-06, + "loss": 1.03, + "step": 158 + }, + { + "epoch": 0.012848744419079173, + "grad_norm": 3.3173232078552246, + "learning_rate": 2.1399730820995965e-06, + "loss": 1.0768, + "step": 159 + }, + { + "epoch": 0.012929554132406715, + "grad_norm": 3.113736629486084, + "learning_rate": 2.1534320323014808e-06, + "loss": 1.2176, + "step": 160 + }, + { + "epoch": 0.013010363845734257, + "grad_norm": 3.7131903171539307, + "learning_rate": 2.166890982503365e-06, + "loss": 1.1292, + "step": 161 + }, + { + "epoch": 0.0130911735590618, + "grad_norm": 3.4207205772399902, + "learning_rate": 2.180349932705249e-06, + "loss": 1.0778, + "step": 162 + }, + { + "epoch": 0.013171983272389342, + "grad_norm": 3.311591625213623, + "learning_rate": 2.1938088829071332e-06, + "loss": 1.0529, + "step": 163 + }, + { + "epoch": 0.013252792985716882, + "grad_norm": 3.1533114910125732, + "learning_rate": 2.207267833109018e-06, + "loss": 1.087, + "step": 164 + }, + { + "epoch": 0.013333602699044425, + "grad_norm": 3.94301700592041, + "learning_rate": 2.2207267833109018e-06, + "loss": 1.0665, + "step": 165 + }, + { + "epoch": 0.013414412412371967, + "grad_norm": 3.766512632369995, + "learning_rate": 2.234185733512786e-06, + "loss": 1.1203, + "step": 166 + }, + { + "epoch": 0.01349522212569951, + "grad_norm": 4.3540263175964355, + "learning_rate": 2.2476446837146704e-06, + "loss": 1.1282, + "step": 167 + }, + { + "epoch": 0.013576031839027052, + "grad_norm": 3.223722457885742, + "learning_rate": 2.2611036339165546e-06, + "loss": 1.0528, + "step": 168 + }, + { + "epoch": 0.013656841552354594, + "grad_norm": 3.663490056991577, + "learning_rate": 2.274562584118439e-06, + "loss": 1.0388, + "step": 169 + }, + { + "epoch": 0.013737651265682134, + "grad_norm": 3.2799320220947266, + "learning_rate": 2.2880215343203232e-06, + "loss": 1.2821, + "step": 170 + }, + { + "epoch": 0.013818460979009677, + "grad_norm": 3.058697462081909, + "learning_rate": 2.3014804845222075e-06, + "loss": 1.0764, + "step": 171 + }, + { + "epoch": 0.013899270692337219, + "grad_norm": 3.0403807163238525, + "learning_rate": 2.314939434724092e-06, + "loss": 1.0531, + "step": 172 + }, + { + "epoch": 0.013980080405664761, + "grad_norm": 3.4602065086364746, + "learning_rate": 2.3283983849259757e-06, + "loss": 1.1011, + "step": 173 + }, + { + "epoch": 0.014060890118992303, + "grad_norm": 3.9035913944244385, + "learning_rate": 2.3418573351278604e-06, + "loss": 1.1127, + "step": 174 + }, + { + "epoch": 0.014141699832319846, + "grad_norm": 2.8776652812957764, + "learning_rate": 2.3553162853297447e-06, + "loss": 1.1733, + "step": 175 + }, + { + "epoch": 0.014222509545647386, + "grad_norm": 3.1856839656829834, + "learning_rate": 2.368775235531629e-06, + "loss": 1.0929, + "step": 176 + }, + { + "epoch": 0.014303319258974928, + "grad_norm": 3.479437828063965, + "learning_rate": 2.382234185733513e-06, + "loss": 1.1085, + "step": 177 + }, + { + "epoch": 0.01438412897230247, + "grad_norm": 3.5891594886779785, + "learning_rate": 2.395693135935397e-06, + "loss": 0.9675, + "step": 178 + }, + { + "epoch": 0.014464938685630013, + "grad_norm": 3.4782447814941406, + "learning_rate": 2.4091520861372814e-06, + "loss": 0.9895, + "step": 179 + }, + { + "epoch": 0.014545748398957555, + "grad_norm": 2.508082151412964, + "learning_rate": 2.4226110363391657e-06, + "loss": 0.9304, + "step": 180 + }, + { + "epoch": 0.014626558112285097, + "grad_norm": 3.4057435989379883, + "learning_rate": 2.43606998654105e-06, + "loss": 1.0716, + "step": 181 + }, + { + "epoch": 0.014707367825612638, + "grad_norm": 3.42026424407959, + "learning_rate": 2.4495289367429342e-06, + "loss": 1.1508, + "step": 182 + }, + { + "epoch": 0.01478817753894018, + "grad_norm": 3.884592056274414, + "learning_rate": 2.4629878869448185e-06, + "loss": 1.165, + "step": 183 + }, + { + "epoch": 0.014868987252267723, + "grad_norm": 3.5591719150543213, + "learning_rate": 2.476446837146703e-06, + "loss": 1.1261, + "step": 184 + }, + { + "epoch": 0.014949796965595265, + "grad_norm": 3.258593797683716, + "learning_rate": 2.489905787348587e-06, + "loss": 1.0031, + "step": 185 + }, + { + "epoch": 0.015030606678922807, + "grad_norm": 3.988311767578125, + "learning_rate": 2.503364737550471e-06, + "loss": 1.225, + "step": 186 + }, + { + "epoch": 0.01511141639225035, + "grad_norm": 3.750598430633545, + "learning_rate": 2.5168236877523557e-06, + "loss": 1.0899, + "step": 187 + }, + { + "epoch": 0.01519222610557789, + "grad_norm": 3.344961404800415, + "learning_rate": 2.53028263795424e-06, + "loss": 1.0497, + "step": 188 + }, + { + "epoch": 0.015273035818905432, + "grad_norm": 3.5242574214935303, + "learning_rate": 2.543741588156124e-06, + "loss": 0.983, + "step": 189 + }, + { + "epoch": 0.015353845532232974, + "grad_norm": 3.1082167625427246, + "learning_rate": 2.5572005383580085e-06, + "loss": 1.0544, + "step": 190 + }, + { + "epoch": 0.015434655245560517, + "grad_norm": 3.4438302516937256, + "learning_rate": 2.5706594885598924e-06, + "loss": 1.1623, + "step": 191 + }, + { + "epoch": 0.015515464958888059, + "grad_norm": 3.1369824409484863, + "learning_rate": 2.5841184387617767e-06, + "loss": 1.0385, + "step": 192 + }, + { + "epoch": 0.015596274672215601, + "grad_norm": 3.228114366531372, + "learning_rate": 2.5975773889636614e-06, + "loss": 0.9485, + "step": 193 + }, + { + "epoch": 0.015677084385543143, + "grad_norm": 3.369588613510132, + "learning_rate": 2.6110363391655453e-06, + "loss": 1.2509, + "step": 194 + }, + { + "epoch": 0.015757894098870684, + "grad_norm": 3.242443323135376, + "learning_rate": 2.6244952893674295e-06, + "loss": 1.079, + "step": 195 + }, + { + "epoch": 0.015838703812198228, + "grad_norm": 3.1657700538635254, + "learning_rate": 2.637954239569314e-06, + "loss": 1.1094, + "step": 196 + }, + { + "epoch": 0.01591951352552577, + "grad_norm": 3.7620697021484375, + "learning_rate": 2.651413189771198e-06, + "loss": 1.0819, + "step": 197 + }, + { + "epoch": 0.01600032323885331, + "grad_norm": 3.6370279788970947, + "learning_rate": 2.6648721399730824e-06, + "loss": 0.9747, + "step": 198 + }, + { + "epoch": 0.016081132952180853, + "grad_norm": 2.9298436641693115, + "learning_rate": 2.6783310901749667e-06, + "loss": 1.1835, + "step": 199 + }, + { + "epoch": 0.016161942665508394, + "grad_norm": 3.335447072982788, + "learning_rate": 2.691790040376851e-06, + "loss": 1.0578, + "step": 200 + }, + { + "epoch": 0.016242752378835938, + "grad_norm": 3.341702938079834, + "learning_rate": 2.705248990578735e-06, + "loss": 0.9886, + "step": 201 + }, + { + "epoch": 0.016323562092163478, + "grad_norm": 2.7136051654815674, + "learning_rate": 2.7187079407806195e-06, + "loss": 1.017, + "step": 202 + }, + { + "epoch": 0.01640437180549102, + "grad_norm": 3.599208354949951, + "learning_rate": 2.7321668909825034e-06, + "loss": 1.0553, + "step": 203 + }, + { + "epoch": 0.016485181518818563, + "grad_norm": 2.866802453994751, + "learning_rate": 2.7456258411843877e-06, + "loss": 1.1636, + "step": 204 + }, + { + "epoch": 0.016565991232146103, + "grad_norm": 3.0029284954071045, + "learning_rate": 2.7590847913862724e-06, + "loss": 1.0556, + "step": 205 + }, + { + "epoch": 0.016646800945473647, + "grad_norm": 3.1328749656677246, + "learning_rate": 2.7725437415881563e-06, + "loss": 1.0848, + "step": 206 + }, + { + "epoch": 0.016727610658801188, + "grad_norm": 3.0416531562805176, + "learning_rate": 2.7860026917900406e-06, + "loss": 1.0294, + "step": 207 + }, + { + "epoch": 0.01680842037212873, + "grad_norm": 2.908458709716797, + "learning_rate": 2.7994616419919244e-06, + "loss": 1.2528, + "step": 208 + }, + { + "epoch": 0.016889230085456272, + "grad_norm": 3.4849448204040527, + "learning_rate": 2.812920592193809e-06, + "loss": 1.1045, + "step": 209 + }, + { + "epoch": 0.016970039798783813, + "grad_norm": 3.44240140914917, + "learning_rate": 2.8263795423956934e-06, + "loss": 1.0109, + "step": 210 + }, + { + "epoch": 0.017050849512111357, + "grad_norm": 3.6446714401245117, + "learning_rate": 2.8398384925975777e-06, + "loss": 1.1597, + "step": 211 + }, + { + "epoch": 0.017131659225438897, + "grad_norm": 3.3532803058624268, + "learning_rate": 2.853297442799462e-06, + "loss": 1.012, + "step": 212 + }, + { + "epoch": 0.01721246893876644, + "grad_norm": 2.927619457244873, + "learning_rate": 2.866756393001346e-06, + "loss": 1.0018, + "step": 213 + }, + { + "epoch": 0.01729327865209398, + "grad_norm": 3.5168118476867676, + "learning_rate": 2.8802153432032306e-06, + "loss": 1.0221, + "step": 214 + }, + { + "epoch": 0.017374088365421522, + "grad_norm": 2.9070892333984375, + "learning_rate": 2.893674293405115e-06, + "loss": 1.0964, + "step": 215 + }, + { + "epoch": 0.017454898078749066, + "grad_norm": 3.6018590927124023, + "learning_rate": 2.9071332436069987e-06, + "loss": 1.1081, + "step": 216 + }, + { + "epoch": 0.017535707792076607, + "grad_norm": 4.031643390655518, + "learning_rate": 2.9205921938088834e-06, + "loss": 1.0165, + "step": 217 + }, + { + "epoch": 0.01761651750540415, + "grad_norm": 3.1344571113586426, + "learning_rate": 2.9340511440107673e-06, + "loss": 1.1071, + "step": 218 + }, + { + "epoch": 0.01769732721873169, + "grad_norm": 4.082541465759277, + "learning_rate": 2.9475100942126516e-06, + "loss": 1.1359, + "step": 219 + }, + { + "epoch": 0.017778136932059232, + "grad_norm": 3.3356659412384033, + "learning_rate": 2.9609690444145363e-06, + "loss": 1.034, + "step": 220 + }, + { + "epoch": 0.017858946645386776, + "grad_norm": 3.1675662994384766, + "learning_rate": 2.97442799461642e-06, + "loss": 1.0509, + "step": 221 + }, + { + "epoch": 0.017939756358714316, + "grad_norm": 3.5920846462249756, + "learning_rate": 2.9878869448183044e-06, + "loss": 1.0415, + "step": 222 + }, + { + "epoch": 0.01802056607204186, + "grad_norm": 3.178565740585327, + "learning_rate": 3.0013458950201883e-06, + "loss": 1.2007, + "step": 223 + }, + { + "epoch": 0.0181013757853694, + "grad_norm": 4.121781349182129, + "learning_rate": 3.014804845222073e-06, + "loss": 0.9993, + "step": 224 + }, + { + "epoch": 0.018182185498696945, + "grad_norm": 3.4476571083068848, + "learning_rate": 3.0282637954239573e-06, + "loss": 1.0524, + "step": 225 + }, + { + "epoch": 0.018262995212024485, + "grad_norm": 3.460421323776245, + "learning_rate": 3.041722745625841e-06, + "loss": 1.0942, + "step": 226 + }, + { + "epoch": 0.018343804925352026, + "grad_norm": 2.8429582118988037, + "learning_rate": 3.055181695827726e-06, + "loss": 1.0494, + "step": 227 + }, + { + "epoch": 0.01842461463867957, + "grad_norm": 2.999035120010376, + "learning_rate": 3.0686406460296097e-06, + "loss": 1.1238, + "step": 228 + }, + { + "epoch": 0.01850542435200711, + "grad_norm": 4.474518775939941, + "learning_rate": 3.0820995962314944e-06, + "loss": 1.085, + "step": 229 + }, + { + "epoch": 0.018586234065334654, + "grad_norm": 3.249032735824585, + "learning_rate": 3.0955585464333787e-06, + "loss": 1.083, + "step": 230 + }, + { + "epoch": 0.018667043778662195, + "grad_norm": 3.566030502319336, + "learning_rate": 3.1090174966352626e-06, + "loss": 1.2091, + "step": 231 + }, + { + "epoch": 0.018747853491989735, + "grad_norm": 3.5390536785125732, + "learning_rate": 3.1224764468371473e-06, + "loss": 1.0893, + "step": 232 + }, + { + "epoch": 0.01882866320531728, + "grad_norm": 3.0526182651519775, + "learning_rate": 3.135935397039031e-06, + "loss": 1.0634, + "step": 233 + }, + { + "epoch": 0.01890947291864482, + "grad_norm": 4.032508850097656, + "learning_rate": 3.1493943472409154e-06, + "loss": 1.0615, + "step": 234 + }, + { + "epoch": 0.018990282631972364, + "grad_norm": 5.153072834014893, + "learning_rate": 3.1628532974427993e-06, + "loss": 1.1304, + "step": 235 + }, + { + "epoch": 0.019071092345299905, + "grad_norm": 3.5427660942077637, + "learning_rate": 3.176312247644684e-06, + "loss": 1.1508, + "step": 236 + }, + { + "epoch": 0.01915190205862745, + "grad_norm": 3.172734260559082, + "learning_rate": 3.1897711978465683e-06, + "loss": 1.0361, + "step": 237 + }, + { + "epoch": 0.01923271177195499, + "grad_norm": 4.713091850280762, + "learning_rate": 3.203230148048452e-06, + "loss": 1.0781, + "step": 238 + }, + { + "epoch": 0.01931352148528253, + "grad_norm": 3.1458215713500977, + "learning_rate": 3.216689098250337e-06, + "loss": 1.1755, + "step": 239 + }, + { + "epoch": 0.019394331198610074, + "grad_norm": 3.0068893432617188, + "learning_rate": 3.2301480484522207e-06, + "loss": 1.1408, + "step": 240 + }, + { + "epoch": 0.019475140911937614, + "grad_norm": 2.879537343978882, + "learning_rate": 3.243606998654105e-06, + "loss": 0.9688, + "step": 241 + }, + { + "epoch": 0.019555950625265158, + "grad_norm": 3.4544901847839355, + "learning_rate": 3.2570659488559897e-06, + "loss": 0.9786, + "step": 242 + }, + { + "epoch": 0.0196367603385927, + "grad_norm": 3.8788928985595703, + "learning_rate": 3.2705248990578736e-06, + "loss": 1.0054, + "step": 243 + }, + { + "epoch": 0.01971757005192024, + "grad_norm": 3.1861045360565186, + "learning_rate": 3.2839838492597583e-06, + "loss": 1.0104, + "step": 244 + }, + { + "epoch": 0.019798379765247783, + "grad_norm": 3.352832078933716, + "learning_rate": 3.297442799461642e-06, + "loss": 1.0723, + "step": 245 + }, + { + "epoch": 0.019879189478575324, + "grad_norm": 2.9049644470214844, + "learning_rate": 3.3109017496635265e-06, + "loss": 1.108, + "step": 246 + }, + { + "epoch": 0.019959999191902868, + "grad_norm": 3.1392064094543457, + "learning_rate": 3.324360699865411e-06, + "loss": 0.9637, + "step": 247 + }, + { + "epoch": 0.020040808905230408, + "grad_norm": 2.9236528873443604, + "learning_rate": 3.337819650067295e-06, + "loss": 1.1056, + "step": 248 + }, + { + "epoch": 0.020121618618557952, + "grad_norm": 3.3994264602661133, + "learning_rate": 3.3512786002691793e-06, + "loss": 0.9997, + "step": 249 + }, + { + "epoch": 0.020202428331885493, + "grad_norm": 3.403186559677124, + "learning_rate": 3.364737550471063e-06, + "loss": 0.9786, + "step": 250 + }, + { + "epoch": 0.020283238045213033, + "grad_norm": 3.129040002822876, + "learning_rate": 3.378196500672948e-06, + "loss": 1.2622, + "step": 251 + }, + { + "epoch": 0.020364047758540577, + "grad_norm": 3.049415111541748, + "learning_rate": 3.391655450874832e-06, + "loss": 1.2219, + "step": 252 + }, + { + "epoch": 0.020444857471868118, + "grad_norm": 2.960981845855713, + "learning_rate": 3.405114401076716e-06, + "loss": 1.0447, + "step": 253 + }, + { + "epoch": 0.020525667185195662, + "grad_norm": 3.1426732540130615, + "learning_rate": 3.4185733512786008e-06, + "loss": 1.0372, + "step": 254 + }, + { + "epoch": 0.020606476898523202, + "grad_norm": 2.869678258895874, + "learning_rate": 3.4320323014804846e-06, + "loss": 0.9815, + "step": 255 + }, + { + "epoch": 0.020687286611850743, + "grad_norm": 3.1211798191070557, + "learning_rate": 3.445491251682369e-06, + "loss": 1.0019, + "step": 256 + }, + { + "epoch": 0.020768096325178287, + "grad_norm": 2.9854209423065186, + "learning_rate": 3.4589502018842536e-06, + "loss": 1.0042, + "step": 257 + }, + { + "epoch": 0.020848906038505827, + "grad_norm": 3.7202820777893066, + "learning_rate": 3.4724091520861375e-06, + "loss": 1.0727, + "step": 258 + }, + { + "epoch": 0.02092971575183337, + "grad_norm": 3.9204063415527344, + "learning_rate": 3.4858681022880218e-06, + "loss": 1.0386, + "step": 259 + }, + { + "epoch": 0.021010525465160912, + "grad_norm": 3.1514883041381836, + "learning_rate": 3.499327052489906e-06, + "loss": 1.0543, + "step": 260 + }, + { + "epoch": 0.021091335178488456, + "grad_norm": 3.1727118492126465, + "learning_rate": 3.5127860026917903e-06, + "loss": 1.0541, + "step": 261 + }, + { + "epoch": 0.021172144891815996, + "grad_norm": 3.517979860305786, + "learning_rate": 3.526244952893675e-06, + "loss": 1.0653, + "step": 262 + }, + { + "epoch": 0.021252954605143537, + "grad_norm": 3.120305061340332, + "learning_rate": 3.539703903095559e-06, + "loss": 1.0551, + "step": 263 + }, + { + "epoch": 0.02133376431847108, + "grad_norm": 3.2179126739501953, + "learning_rate": 3.553162853297443e-06, + "loss": 0.9822, + "step": 264 + }, + { + "epoch": 0.02141457403179862, + "grad_norm": 3.563760995864868, + "learning_rate": 3.566621803499327e-06, + "loss": 0.9401, + "step": 265 + }, + { + "epoch": 0.021495383745126165, + "grad_norm": 3.1966042518615723, + "learning_rate": 3.5800807537012118e-06, + "loss": 1.1475, + "step": 266 + }, + { + "epoch": 0.021576193458453706, + "grad_norm": 2.9117894172668457, + "learning_rate": 3.5935397039030956e-06, + "loss": 1.0869, + "step": 267 + }, + { + "epoch": 0.021657003171781247, + "grad_norm": 2.9326651096343994, + "learning_rate": 3.60699865410498e-06, + "loss": 1.0897, + "step": 268 + }, + { + "epoch": 0.02173781288510879, + "grad_norm": 3.3142008781433105, + "learning_rate": 3.6204576043068646e-06, + "loss": 1.1048, + "step": 269 + }, + { + "epoch": 0.02181862259843633, + "grad_norm": 3.220008611679077, + "learning_rate": 3.6339165545087485e-06, + "loss": 1.1076, + "step": 270 + }, + { + "epoch": 0.021899432311763875, + "grad_norm": 3.6159610748291016, + "learning_rate": 3.6473755047106328e-06, + "loss": 1.0769, + "step": 271 + }, + { + "epoch": 0.021980242025091416, + "grad_norm": 3.3969810009002686, + "learning_rate": 3.660834454912517e-06, + "loss": 1.105, + "step": 272 + }, + { + "epoch": 0.02206105173841896, + "grad_norm": 3.580536365509033, + "learning_rate": 3.6742934051144014e-06, + "loss": 1.0128, + "step": 273 + }, + { + "epoch": 0.0221418614517465, + "grad_norm": 3.2871317863464355, + "learning_rate": 3.6877523553162856e-06, + "loss": 0.9887, + "step": 274 + }, + { + "epoch": 0.02222267116507404, + "grad_norm": 3.7190616130828857, + "learning_rate": 3.70121130551817e-06, + "loss": 1.0394, + "step": 275 + }, + { + "epoch": 0.022303480878401585, + "grad_norm": 3.1299095153808594, + "learning_rate": 3.7146702557200542e-06, + "loss": 1.1785, + "step": 276 + }, + { + "epoch": 0.022384290591729125, + "grad_norm": 3.4073848724365234, + "learning_rate": 3.728129205921938e-06, + "loss": 1.0502, + "step": 277 + }, + { + "epoch": 0.02246510030505667, + "grad_norm": 3.4248945713043213, + "learning_rate": 3.7415881561238228e-06, + "loss": 1.0235, + "step": 278 + }, + { + "epoch": 0.02254591001838421, + "grad_norm": 3.900418758392334, + "learning_rate": 3.755047106325707e-06, + "loss": 1.0979, + "step": 279 + }, + { + "epoch": 0.02262671973171175, + "grad_norm": 3.830432415008545, + "learning_rate": 3.768506056527591e-06, + "loss": 1.1172, + "step": 280 + }, + { + "epoch": 0.022707529445039294, + "grad_norm": 3.209726333618164, + "learning_rate": 3.7819650067294756e-06, + "loss": 1.0036, + "step": 281 + }, + { + "epoch": 0.022788339158366835, + "grad_norm": 3.1260225772857666, + "learning_rate": 3.7954239569313595e-06, + "loss": 1.1003, + "step": 282 + }, + { + "epoch": 0.02286914887169438, + "grad_norm": 3.050652503967285, + "learning_rate": 3.808882907133244e-06, + "loss": 1.0699, + "step": 283 + }, + { + "epoch": 0.02294995858502192, + "grad_norm": 3.4412708282470703, + "learning_rate": 3.822341857335128e-06, + "loss": 1.0617, + "step": 284 + }, + { + "epoch": 0.023030768298349463, + "grad_norm": 3.2412991523742676, + "learning_rate": 3.835800807537012e-06, + "loss": 1.1161, + "step": 285 + }, + { + "epoch": 0.023111578011677004, + "grad_norm": 3.3610498905181885, + "learning_rate": 3.849259757738897e-06, + "loss": 1.0642, + "step": 286 + }, + { + "epoch": 0.023192387725004544, + "grad_norm": 4.764945983886719, + "learning_rate": 3.8627187079407805e-06, + "loss": 1.0355, + "step": 287 + }, + { + "epoch": 0.02327319743833209, + "grad_norm": 3.3774383068084717, + "learning_rate": 3.876177658142665e-06, + "loss": 0.9947, + "step": 288 + }, + { + "epoch": 0.02335400715165963, + "grad_norm": 3.4327151775360107, + "learning_rate": 3.88963660834455e-06, + "loss": 0.9403, + "step": 289 + }, + { + "epoch": 0.023434816864987173, + "grad_norm": 3.2053916454315186, + "learning_rate": 3.903095558546434e-06, + "loss": 0.9516, + "step": 290 + }, + { + "epoch": 0.023515626578314713, + "grad_norm": 3.281947612762451, + "learning_rate": 3.9165545087483185e-06, + "loss": 1.004, + "step": 291 + }, + { + "epoch": 0.023596436291642254, + "grad_norm": 3.3425886631011963, + "learning_rate": 3.930013458950202e-06, + "loss": 1.0692, + "step": 292 + }, + { + "epoch": 0.023677246004969798, + "grad_norm": 3.682798147201538, + "learning_rate": 3.943472409152086e-06, + "loss": 0.9881, + "step": 293 + }, + { + "epoch": 0.02375805571829734, + "grad_norm": 4.1106977462768555, + "learning_rate": 3.956931359353971e-06, + "loss": 1.029, + "step": 294 + }, + { + "epoch": 0.023838865431624882, + "grad_norm": 2.9321463108062744, + "learning_rate": 3.970390309555855e-06, + "loss": 1.0701, + "step": 295 + }, + { + "epoch": 0.023919675144952423, + "grad_norm": 3.840993881225586, + "learning_rate": 3.9838492597577395e-06, + "loss": 1.0822, + "step": 296 + }, + { + "epoch": 0.024000484858279967, + "grad_norm": 3.33097505569458, + "learning_rate": 3.997308209959623e-06, + "loss": 1.0678, + "step": 297 + }, + { + "epoch": 0.024081294571607507, + "grad_norm": 3.3836615085601807, + "learning_rate": 4.010767160161508e-06, + "loss": 0.9405, + "step": 298 + }, + { + "epoch": 0.024162104284935048, + "grad_norm": 3.6300384998321533, + "learning_rate": 4.024226110363392e-06, + "loss": 1.0488, + "step": 299 + }, + { + "epoch": 0.024242913998262592, + "grad_norm": 4.789526462554932, + "learning_rate": 4.037685060565276e-06, + "loss": 1.1742, + "step": 300 + }, + { + "epoch": 0.024323723711590133, + "grad_norm": 4.001083850860596, + "learning_rate": 4.0511440107671605e-06, + "loss": 1.0492, + "step": 301 + }, + { + "epoch": 0.024404533424917677, + "grad_norm": 3.0341265201568604, + "learning_rate": 4.064602960969044e-06, + "loss": 1.0646, + "step": 302 + }, + { + "epoch": 0.024485343138245217, + "grad_norm": 3.0529284477233887, + "learning_rate": 4.078061911170929e-06, + "loss": 1.0188, + "step": 303 + }, + { + "epoch": 0.024566152851572758, + "grad_norm": 3.5513386726379395, + "learning_rate": 4.091520861372813e-06, + "loss": 1.0259, + "step": 304 + }, + { + "epoch": 0.0246469625649003, + "grad_norm": 3.639235734939575, + "learning_rate": 4.104979811574698e-06, + "loss": 1.0545, + "step": 305 + }, + { + "epoch": 0.024727772278227842, + "grad_norm": 2.771613597869873, + "learning_rate": 4.118438761776582e-06, + "loss": 0.9668, + "step": 306 + }, + { + "epoch": 0.024808581991555386, + "grad_norm": 2.699173927307129, + "learning_rate": 4.131897711978466e-06, + "loss": 1.1614, + "step": 307 + }, + { + "epoch": 0.024889391704882927, + "grad_norm": 3.053584098815918, + "learning_rate": 4.14535666218035e-06, + "loss": 1.1264, + "step": 308 + }, + { + "epoch": 0.02497020141821047, + "grad_norm": 3.2321035861968994, + "learning_rate": 4.158815612382234e-06, + "loss": 1.0321, + "step": 309 + }, + { + "epoch": 0.02505101113153801, + "grad_norm": 3.159599542617798, + "learning_rate": 4.172274562584119e-06, + "loss": 1.0271, + "step": 310 + }, + { + "epoch": 0.02513182084486555, + "grad_norm": 3.739326238632202, + "learning_rate": 4.185733512786003e-06, + "loss": 1.0775, + "step": 311 + }, + { + "epoch": 0.025212630558193096, + "grad_norm": 2.9522006511688232, + "learning_rate": 4.199192462987887e-06, + "loss": 1.16, + "step": 312 + }, + { + "epoch": 0.025293440271520636, + "grad_norm": 4.1226301193237305, + "learning_rate": 4.212651413189772e-06, + "loss": 1.0076, + "step": 313 + }, + { + "epoch": 0.02537424998484818, + "grad_norm": 3.438518762588501, + "learning_rate": 4.226110363391656e-06, + "loss": 1.1045, + "step": 314 + }, + { + "epoch": 0.02545505969817572, + "grad_norm": 3.4645230770111084, + "learning_rate": 4.23956931359354e-06, + "loss": 1.0568, + "step": 315 + }, + { + "epoch": 0.02553586941150326, + "grad_norm": 3.5905091762542725, + "learning_rate": 4.253028263795424e-06, + "loss": 1.0134, + "step": 316 + }, + { + "epoch": 0.025616679124830805, + "grad_norm": 3.258754014968872, + "learning_rate": 4.266487213997308e-06, + "loss": 0.958, + "step": 317 + }, + { + "epoch": 0.025697488838158346, + "grad_norm": 3.016822338104248, + "learning_rate": 4.279946164199193e-06, + "loss": 1.069, + "step": 318 + }, + { + "epoch": 0.02577829855148589, + "grad_norm": 3.46635365486145, + "learning_rate": 4.293405114401077e-06, + "loss": 1.0767, + "step": 319 + }, + { + "epoch": 0.02585910826481343, + "grad_norm": 3.2740538120269775, + "learning_rate": 4.3068640646029616e-06, + "loss": 1.1643, + "step": 320 + }, + { + "epoch": 0.02593991797814097, + "grad_norm": 2.509807586669922, + "learning_rate": 4.320323014804845e-06, + "loss": 1.0419, + "step": 321 + }, + { + "epoch": 0.026020727691468515, + "grad_norm": 3.7150912284851074, + "learning_rate": 4.33378196500673e-06, + "loss": 0.9806, + "step": 322 + }, + { + "epoch": 0.026101537404796055, + "grad_norm": 3.265268087387085, + "learning_rate": 4.347240915208614e-06, + "loss": 1.0416, + "step": 323 + }, + { + "epoch": 0.0261823471181236, + "grad_norm": 3.423750162124634, + "learning_rate": 4.360699865410498e-06, + "loss": 1.0875, + "step": 324 + }, + { + "epoch": 0.02626315683145114, + "grad_norm": 3.0355896949768066, + "learning_rate": 4.3741588156123826e-06, + "loss": 1.0706, + "step": 325 + }, + { + "epoch": 0.026343966544778684, + "grad_norm": 3.399970531463623, + "learning_rate": 4.3876177658142664e-06, + "loss": 0.9565, + "step": 326 + }, + { + "epoch": 0.026424776258106224, + "grad_norm": 2.427396774291992, + "learning_rate": 4.401076716016151e-06, + "loss": 1.1196, + "step": 327 + }, + { + "epoch": 0.026505585971433765, + "grad_norm": 2.8762001991271973, + "learning_rate": 4.414535666218036e-06, + "loss": 1.0821, + "step": 328 + }, + { + "epoch": 0.02658639568476131, + "grad_norm": 2.805931329727173, + "learning_rate": 4.42799461641992e-06, + "loss": 1.085, + "step": 329 + }, + { + "epoch": 0.02666720539808885, + "grad_norm": 2.9563746452331543, + "learning_rate": 4.4414535666218036e-06, + "loss": 1.1011, + "step": 330 + }, + { + "epoch": 0.026748015111416393, + "grad_norm": 3.239896535873413, + "learning_rate": 4.454912516823688e-06, + "loss": 1.0285, + "step": 331 + }, + { + "epoch": 0.026828824824743934, + "grad_norm": 3.9557912349700928, + "learning_rate": 4.468371467025572e-06, + "loss": 1.0994, + "step": 332 + }, + { + "epoch": 0.026909634538071475, + "grad_norm": 3.006235122680664, + "learning_rate": 4.481830417227457e-06, + "loss": 1.0512, + "step": 333 + }, + { + "epoch": 0.02699044425139902, + "grad_norm": 2.7034130096435547, + "learning_rate": 4.495289367429341e-06, + "loss": 1.0497, + "step": 334 + }, + { + "epoch": 0.02707125396472656, + "grad_norm": 3.289365530014038, + "learning_rate": 4.5087483176312254e-06, + "loss": 1.0073, + "step": 335 + }, + { + "epoch": 0.027152063678054103, + "grad_norm": 2.970745801925659, + "learning_rate": 4.522207267833109e-06, + "loss": 1.0532, + "step": 336 + }, + { + "epoch": 0.027232873391381644, + "grad_norm": 3.1716456413269043, + "learning_rate": 4.535666218034994e-06, + "loss": 1.1299, + "step": 337 + }, + { + "epoch": 0.027313683104709188, + "grad_norm": 3.3215408325195312, + "learning_rate": 4.549125168236878e-06, + "loss": 0.9924, + "step": 338 + }, + { + "epoch": 0.027394492818036728, + "grad_norm": 3.2503867149353027, + "learning_rate": 4.562584118438762e-06, + "loss": 1.1627, + "step": 339 + }, + { + "epoch": 0.02747530253136427, + "grad_norm": 3.6553633213043213, + "learning_rate": 4.5760430686406464e-06, + "loss": 1.005, + "step": 340 + }, + { + "epoch": 0.027556112244691813, + "grad_norm": 3.136037588119507, + "learning_rate": 4.58950201884253e-06, + "loss": 1.0578, + "step": 341 + }, + { + "epoch": 0.027636921958019353, + "grad_norm": 3.279416561126709, + "learning_rate": 4.602960969044415e-06, + "loss": 1.1861, + "step": 342 + }, + { + "epoch": 0.027717731671346897, + "grad_norm": 3.5951592922210693, + "learning_rate": 4.6164199192463e-06, + "loss": 1.0194, + "step": 343 + }, + { + "epoch": 0.027798541384674438, + "grad_norm": 3.4381399154663086, + "learning_rate": 4.629878869448184e-06, + "loss": 1.0519, + "step": 344 + }, + { + "epoch": 0.027879351098001978, + "grad_norm": 3.5402069091796875, + "learning_rate": 4.6433378196500674e-06, + "loss": 1.0954, + "step": 345 + }, + { + "epoch": 0.027960160811329522, + "grad_norm": 2.8578295707702637, + "learning_rate": 4.656796769851951e-06, + "loss": 1.112, + "step": 346 + }, + { + "epoch": 0.028040970524657063, + "grad_norm": 2.570523977279663, + "learning_rate": 4.670255720053836e-06, + "loss": 1.1534, + "step": 347 + }, + { + "epoch": 0.028121780237984607, + "grad_norm": 3.5016822814941406, + "learning_rate": 4.683714670255721e-06, + "loss": 1.0122, + "step": 348 + }, + { + "epoch": 0.028202589951312147, + "grad_norm": 3.6116175651550293, + "learning_rate": 4.697173620457605e-06, + "loss": 0.9669, + "step": 349 + }, + { + "epoch": 0.02828339966463969, + "grad_norm": 3.697629690170288, + "learning_rate": 4.710632570659489e-06, + "loss": 0.9453, + "step": 350 + }, + { + "epoch": 0.028364209377967232, + "grad_norm": 2.860255479812622, + "learning_rate": 4.724091520861373e-06, + "loss": 1.0293, + "step": 351 + }, + { + "epoch": 0.028445019091294772, + "grad_norm": 2.7185349464416504, + "learning_rate": 4.737550471063258e-06, + "loss": 1.0949, + "step": 352 + }, + { + "epoch": 0.028525828804622316, + "grad_norm": 3.5234262943267822, + "learning_rate": 4.751009421265142e-06, + "loss": 1.0291, + "step": 353 + }, + { + "epoch": 0.028606638517949857, + "grad_norm": 3.042196273803711, + "learning_rate": 4.764468371467026e-06, + "loss": 1.0282, + "step": 354 + }, + { + "epoch": 0.0286874482312774, + "grad_norm": 3.581164836883545, + "learning_rate": 4.77792732166891e-06, + "loss": 1.1342, + "step": 355 + }, + { + "epoch": 0.02876825794460494, + "grad_norm": 4.2634477615356445, + "learning_rate": 4.791386271870794e-06, + "loss": 1.0666, + "step": 356 + }, + { + "epoch": 0.028849067657932482, + "grad_norm": 3.3745503425598145, + "learning_rate": 4.804845222072679e-06, + "loss": 0.9496, + "step": 357 + }, + { + "epoch": 0.028929877371260026, + "grad_norm": 3.2284529209136963, + "learning_rate": 4.818304172274563e-06, + "loss": 1.0532, + "step": 358 + }, + { + "epoch": 0.029010687084587566, + "grad_norm": 3.2654168605804443, + "learning_rate": 4.8317631224764475e-06, + "loss": 1.1196, + "step": 359 + }, + { + "epoch": 0.02909149679791511, + "grad_norm": 3.0453786849975586, + "learning_rate": 4.845222072678331e-06, + "loss": 0.9026, + "step": 360 + }, + { + "epoch": 0.02917230651124265, + "grad_norm": 3.1820859909057617, + "learning_rate": 4.858681022880215e-06, + "loss": 1.0151, + "step": 361 + }, + { + "epoch": 0.029253116224570195, + "grad_norm": 2.892068386077881, + "learning_rate": 4.8721399730821e-06, + "loss": 1.0974, + "step": 362 + }, + { + "epoch": 0.029333925937897735, + "grad_norm": 3.9766123294830322, + "learning_rate": 4.885598923283984e-06, + "loss": 1.0929, + "step": 363 + }, + { + "epoch": 0.029414735651225276, + "grad_norm": 3.188502550125122, + "learning_rate": 4.8990578734858685e-06, + "loss": 1.1204, + "step": 364 + }, + { + "epoch": 0.02949554536455282, + "grad_norm": 2.9032983779907227, + "learning_rate": 4.912516823687753e-06, + "loss": 1.1307, + "step": 365 + }, + { + "epoch": 0.02957635507788036, + "grad_norm": 3.1441397666931152, + "learning_rate": 4.925975773889637e-06, + "loss": 1.2278, + "step": 366 + }, + { + "epoch": 0.029657164791207904, + "grad_norm": 3.025193929672241, + "learning_rate": 4.939434724091522e-06, + "loss": 1.0689, + "step": 367 + }, + { + "epoch": 0.029737974504535445, + "grad_norm": 3.3389151096343994, + "learning_rate": 4.952893674293406e-06, + "loss": 1.074, + "step": 368 + }, + { + "epoch": 0.029818784217862986, + "grad_norm": 2.899667263031006, + "learning_rate": 4.9663526244952895e-06, + "loss": 1.0223, + "step": 369 + }, + { + "epoch": 0.02989959393119053, + "grad_norm": 3.271374464035034, + "learning_rate": 4.979811574697174e-06, + "loss": 1.01, + "step": 370 + }, + { + "epoch": 0.02998040364451807, + "grad_norm": 2.800734043121338, + "learning_rate": 4.993270524899058e-06, + "loss": 1.0948, + "step": 371 + }, + { + "epoch": 0.030061213357845614, + "grad_norm": 3.0062313079833984, + "learning_rate": 5.006729475100942e-06, + "loss": 1.059, + "step": 372 + }, + { + "epoch": 0.030142023071173155, + "grad_norm": 3.1042842864990234, + "learning_rate": 5.020188425302827e-06, + "loss": 1.0002, + "step": 373 + }, + { + "epoch": 0.0302228327845007, + "grad_norm": 3.154513120651245, + "learning_rate": 5.033647375504711e-06, + "loss": 0.9859, + "step": 374 + }, + { + "epoch": 0.03030364249782824, + "grad_norm": 2.9707469940185547, + "learning_rate": 5.047106325706595e-06, + "loss": 1.0158, + "step": 375 + }, + { + "epoch": 0.03038445221115578, + "grad_norm": 3.7019219398498535, + "learning_rate": 5.06056527590848e-06, + "loss": 0.9693, + "step": 376 + }, + { + "epoch": 0.030465261924483324, + "grad_norm": 3.5970447063446045, + "learning_rate": 5.074024226110364e-06, + "loss": 1.0817, + "step": 377 + }, + { + "epoch": 0.030546071637810864, + "grad_norm": 3.0415992736816406, + "learning_rate": 5.087483176312248e-06, + "loss": 1.0749, + "step": 378 + }, + { + "epoch": 0.030626881351138408, + "grad_norm": 3.1389529705047607, + "learning_rate": 5.100942126514132e-06, + "loss": 1.1466, + "step": 379 + }, + { + "epoch": 0.03070769106446595, + "grad_norm": 3.5312931537628174, + "learning_rate": 5.114401076716017e-06, + "loss": 1.0444, + "step": 380 + }, + { + "epoch": 0.03078850077779349, + "grad_norm": 2.7313449382781982, + "learning_rate": 5.127860026917901e-06, + "loss": 1.0735, + "step": 381 + }, + { + "epoch": 0.030869310491121033, + "grad_norm": 3.8215954303741455, + "learning_rate": 5.141318977119785e-06, + "loss": 1.0956, + "step": 382 + }, + { + "epoch": 0.030950120204448574, + "grad_norm": 3.008662223815918, + "learning_rate": 5.1547779273216695e-06, + "loss": 1.1227, + "step": 383 + }, + { + "epoch": 0.031030929917776118, + "grad_norm": 3.5137126445770264, + "learning_rate": 5.168236877523553e-06, + "loss": 1.0647, + "step": 384 + }, + { + "epoch": 0.03111173963110366, + "grad_norm": 3.1032652854919434, + "learning_rate": 5.181695827725438e-06, + "loss": 0.9709, + "step": 385 + }, + { + "epoch": 0.031192549344431202, + "grad_norm": 2.643289089202881, + "learning_rate": 5.195154777927323e-06, + "loss": 1.0143, + "step": 386 + }, + { + "epoch": 0.03127335905775874, + "grad_norm": 3.3892714977264404, + "learning_rate": 5.208613728129206e-06, + "loss": 1.0897, + "step": 387 + }, + { + "epoch": 0.03135416877108629, + "grad_norm": 3.087230920791626, + "learning_rate": 5.2220726783310905e-06, + "loss": 1.0535, + "step": 388 + }, + { + "epoch": 0.031434978484413824, + "grad_norm": 3.1502482891082764, + "learning_rate": 5.235531628532975e-06, + "loss": 0.9344, + "step": 389 + }, + { + "epoch": 0.03151578819774137, + "grad_norm": 3.1660571098327637, + "learning_rate": 5.248990578734859e-06, + "loss": 0.9898, + "step": 390 + }, + { + "epoch": 0.03159659791106891, + "grad_norm": 3.1750237941741943, + "learning_rate": 5.262449528936744e-06, + "loss": 0.9014, + "step": 391 + }, + { + "epoch": 0.031677407624396456, + "grad_norm": 3.0551154613494873, + "learning_rate": 5.275908479138628e-06, + "loss": 1.1992, + "step": 392 + }, + { + "epoch": 0.03175821733772399, + "grad_norm": 2.822049617767334, + "learning_rate": 5.2893674293405115e-06, + "loss": 1.0354, + "step": 393 + }, + { + "epoch": 0.03183902705105154, + "grad_norm": 3.1479337215423584, + "learning_rate": 5.302826379542396e-06, + "loss": 1.0756, + "step": 394 + }, + { + "epoch": 0.03191983676437908, + "grad_norm": 4.29952335357666, + "learning_rate": 5.316285329744281e-06, + "loss": 0.9937, + "step": 395 + }, + { + "epoch": 0.03200064647770662, + "grad_norm": 3.460026264190674, + "learning_rate": 5.329744279946165e-06, + "loss": 1.0865, + "step": 396 + }, + { + "epoch": 0.03208145619103416, + "grad_norm": 2.976209878921509, + "learning_rate": 5.343203230148049e-06, + "loss": 1.0514, + "step": 397 + }, + { + "epoch": 0.032162265904361706, + "grad_norm": 3.255568027496338, + "learning_rate": 5.356662180349933e-06, + "loss": 1.0109, + "step": 398 + }, + { + "epoch": 0.03224307561768924, + "grad_norm": 2.8264927864074707, + "learning_rate": 5.370121130551817e-06, + "loss": 1.0703, + "step": 399 + }, + { + "epoch": 0.03232388533101679, + "grad_norm": 3.4914638996124268, + "learning_rate": 5.383580080753702e-06, + "loss": 1.0132, + "step": 400 + }, + { + "epoch": 0.03240469504434433, + "grad_norm": 3.143792152404785, + "learning_rate": 5.397039030955587e-06, + "loss": 1.0457, + "step": 401 + }, + { + "epoch": 0.032485504757671875, + "grad_norm": 3.3566250801086426, + "learning_rate": 5.41049798115747e-06, + "loss": 1.0481, + "step": 402 + }, + { + "epoch": 0.03256631447099941, + "grad_norm": 2.8489530086517334, + "learning_rate": 5.423956931359354e-06, + "loss": 1.1821, + "step": 403 + }, + { + "epoch": 0.032647124184326956, + "grad_norm": 3.260387897491455, + "learning_rate": 5.437415881561239e-06, + "loss": 1.0802, + "step": 404 + }, + { + "epoch": 0.0327279338976545, + "grad_norm": 3.239534378051758, + "learning_rate": 5.450874831763123e-06, + "loss": 1.0194, + "step": 405 + }, + { + "epoch": 0.03280874361098204, + "grad_norm": 3.2290117740631104, + "learning_rate": 5.464333781965007e-06, + "loss": 1.014, + "step": 406 + }, + { + "epoch": 0.03288955332430958, + "grad_norm": 3.052867889404297, + "learning_rate": 5.4777927321668915e-06, + "loss": 1.147, + "step": 407 + }, + { + "epoch": 0.032970363037637125, + "grad_norm": 3.0773065090179443, + "learning_rate": 5.491251682368775e-06, + "loss": 1.1505, + "step": 408 + }, + { + "epoch": 0.03305117275096467, + "grad_norm": 4.307579040527344, + "learning_rate": 5.50471063257066e-06, + "loss": 1.0423, + "step": 409 + }, + { + "epoch": 0.033131982464292206, + "grad_norm": 2.731487274169922, + "learning_rate": 5.518169582772545e-06, + "loss": 1.0521, + "step": 410 + }, + { + "epoch": 0.03321279217761975, + "grad_norm": 2.8557963371276855, + "learning_rate": 5.531628532974428e-06, + "loss": 1.1326, + "step": 411 + }, + { + "epoch": 0.033293601890947294, + "grad_norm": 2.9911723136901855, + "learning_rate": 5.5450874831763125e-06, + "loss": 1.0449, + "step": 412 + }, + { + "epoch": 0.03337441160427483, + "grad_norm": 3.545870780944824, + "learning_rate": 5.558546433378197e-06, + "loss": 0.9643, + "step": 413 + }, + { + "epoch": 0.033455221317602375, + "grad_norm": 3.228506326675415, + "learning_rate": 5.572005383580081e-06, + "loss": 0.9888, + "step": 414 + }, + { + "epoch": 0.03353603103092992, + "grad_norm": 3.3360986709594727, + "learning_rate": 5.585464333781966e-06, + "loss": 1.1374, + "step": 415 + }, + { + "epoch": 0.03361684074425746, + "grad_norm": 3.142885208129883, + "learning_rate": 5.598923283983849e-06, + "loss": 1.0261, + "step": 416 + }, + { + "epoch": 0.033697650457585, + "grad_norm": 3.678441286087036, + "learning_rate": 5.6123822341857335e-06, + "loss": 1.0356, + "step": 417 + }, + { + "epoch": 0.033778460170912544, + "grad_norm": 3.1850669384002686, + "learning_rate": 5.625841184387618e-06, + "loss": 0.8973, + "step": 418 + }, + { + "epoch": 0.03385926988424009, + "grad_norm": 3.2738733291625977, + "learning_rate": 5.639300134589503e-06, + "loss": 1.0297, + "step": 419 + }, + { + "epoch": 0.033940079597567625, + "grad_norm": 3.262432336807251, + "learning_rate": 5.652759084791387e-06, + "loss": 1.1121, + "step": 420 + }, + { + "epoch": 0.03402088931089517, + "grad_norm": 3.300443172454834, + "learning_rate": 5.666218034993271e-06, + "loss": 1.0872, + "step": 421 + }, + { + "epoch": 0.03410169902422271, + "grad_norm": 3.37052059173584, + "learning_rate": 5.679676985195155e-06, + "loss": 0.8631, + "step": 422 + }, + { + "epoch": 0.03418250873755025, + "grad_norm": 3.0574722290039062, + "learning_rate": 5.693135935397039e-06, + "loss": 0.8917, + "step": 423 + }, + { + "epoch": 0.034263318450877794, + "grad_norm": 3.1333374977111816, + "learning_rate": 5.706594885598924e-06, + "loss": 1.1487, + "step": 424 + }, + { + "epoch": 0.03434412816420534, + "grad_norm": 3.4714949131011963, + "learning_rate": 5.720053835800809e-06, + "loss": 1.0668, + "step": 425 + }, + { + "epoch": 0.03442493787753288, + "grad_norm": 3.0593292713165283, + "learning_rate": 5.733512786002692e-06, + "loss": 1.0453, + "step": 426 + }, + { + "epoch": 0.03450574759086042, + "grad_norm": 3.006802558898926, + "learning_rate": 5.746971736204576e-06, + "loss": 1.0875, + "step": 427 + }, + { + "epoch": 0.03458655730418796, + "grad_norm": 3.3820269107818604, + "learning_rate": 5.760430686406461e-06, + "loss": 1.0163, + "step": 428 + }, + { + "epoch": 0.03466736701751551, + "grad_norm": 3.098365306854248, + "learning_rate": 5.773889636608345e-06, + "loss": 1.1997, + "step": 429 + }, + { + "epoch": 0.034748176730843044, + "grad_norm": 3.436250686645508, + "learning_rate": 5.78734858681023e-06, + "loss": 0.9806, + "step": 430 + }, + { + "epoch": 0.03482898644417059, + "grad_norm": 3.1773383617401123, + "learning_rate": 5.800807537012113e-06, + "loss": 0.9911, + "step": 431 + }, + { + "epoch": 0.03490979615749813, + "grad_norm": 3.1867425441741943, + "learning_rate": 5.814266487213997e-06, + "loss": 1.1133, + "step": 432 + }, + { + "epoch": 0.034990605870825676, + "grad_norm": 3.287316083908081, + "learning_rate": 5.827725437415882e-06, + "loss": 1.0283, + "step": 433 + }, + { + "epoch": 0.035071415584153214, + "grad_norm": 3.760188579559326, + "learning_rate": 5.841184387617767e-06, + "loss": 1.1156, + "step": 434 + }, + { + "epoch": 0.03515222529748076, + "grad_norm": 2.9129562377929688, + "learning_rate": 5.854643337819651e-06, + "loss": 1.0237, + "step": 435 + }, + { + "epoch": 0.0352330350108083, + "grad_norm": 3.5939934253692627, + "learning_rate": 5.8681022880215346e-06, + "loss": 0.9305, + "step": 436 + }, + { + "epoch": 0.03531384472413584, + "grad_norm": 3.105889081954956, + "learning_rate": 5.8815612382234184e-06, + "loss": 1.0533, + "step": 437 + }, + { + "epoch": 0.03539465443746338, + "grad_norm": 3.263901948928833, + "learning_rate": 5.895020188425303e-06, + "loss": 0.9298, + "step": 438 + }, + { + "epoch": 0.03547546415079093, + "grad_norm": 3.7842049598693848, + "learning_rate": 5.908479138627188e-06, + "loss": 1.0523, + "step": 439 + }, + { + "epoch": 0.035556273864118464, + "grad_norm": 2.6877212524414062, + "learning_rate": 5.9219380888290726e-06, + "loss": 1.0692, + "step": 440 + }, + { + "epoch": 0.03563708357744601, + "grad_norm": 3.427471399307251, + "learning_rate": 5.9353970390309556e-06, + "loss": 0.993, + "step": 441 + }, + { + "epoch": 0.03571789329077355, + "grad_norm": 3.7671279907226562, + "learning_rate": 5.94885598923284e-06, + "loss": 1.1378, + "step": 442 + }, + { + "epoch": 0.035798703004101096, + "grad_norm": 3.6590023040771484, + "learning_rate": 5.962314939434725e-06, + "loss": 1.0065, + "step": 443 + }, + { + "epoch": 0.03587951271742863, + "grad_norm": 3.3577499389648438, + "learning_rate": 5.975773889636609e-06, + "loss": 1.0334, + "step": 444 + }, + { + "epoch": 0.03596032243075618, + "grad_norm": 2.663809061050415, + "learning_rate": 5.9892328398384936e-06, + "loss": 1.135, + "step": 445 + }, + { + "epoch": 0.03604113214408372, + "grad_norm": 2.9979307651519775, + "learning_rate": 6.002691790040377e-06, + "loss": 0.9237, + "step": 446 + }, + { + "epoch": 0.03612194185741126, + "grad_norm": 3.3452415466308594, + "learning_rate": 6.016150740242261e-06, + "loss": 1.0103, + "step": 447 + }, + { + "epoch": 0.0362027515707388, + "grad_norm": 3.129573106765747, + "learning_rate": 6.029609690444146e-06, + "loss": 0.9468, + "step": 448 + }, + { + "epoch": 0.036283561284066346, + "grad_norm": 3.246513605117798, + "learning_rate": 6.043068640646031e-06, + "loss": 1.0707, + "step": 449 + }, + { + "epoch": 0.03636437099739389, + "grad_norm": 3.0842173099517822, + "learning_rate": 6.056527590847915e-06, + "loss": 1.1869, + "step": 450 + }, + { + "epoch": 0.03644518071072143, + "grad_norm": 3.1168720722198486, + "learning_rate": 6.0699865410497984e-06, + "loss": 1.1247, + "step": 451 + }, + { + "epoch": 0.03652599042404897, + "grad_norm": 3.1685454845428467, + "learning_rate": 6.083445491251682e-06, + "loss": 1.037, + "step": 452 + }, + { + "epoch": 0.036606800137376515, + "grad_norm": 3.101418972015381, + "learning_rate": 6.096904441453567e-06, + "loss": 1.0524, + "step": 453 + }, + { + "epoch": 0.03668760985070405, + "grad_norm": 3.458691358566284, + "learning_rate": 6.110363391655452e-06, + "loss": 1.0948, + "step": 454 + }, + { + "epoch": 0.036768419564031596, + "grad_norm": 3.8993406295776367, + "learning_rate": 6.1238223418573364e-06, + "loss": 1.218, + "step": 455 + }, + { + "epoch": 0.03684922927735914, + "grad_norm": 3.081502676010132, + "learning_rate": 6.1372812920592195e-06, + "loss": 1.1364, + "step": 456 + }, + { + "epoch": 0.036930038990686684, + "grad_norm": 3.4621806144714355, + "learning_rate": 6.150740242261104e-06, + "loss": 1.0946, + "step": 457 + }, + { + "epoch": 0.03701084870401422, + "grad_norm": 3.7043323516845703, + "learning_rate": 6.164199192462989e-06, + "loss": 0.964, + "step": 458 + }, + { + "epoch": 0.037091658417341765, + "grad_norm": 2.8379194736480713, + "learning_rate": 6.177658142664873e-06, + "loss": 1.0017, + "step": 459 + }, + { + "epoch": 0.03717246813066931, + "grad_norm": 3.2246439456939697, + "learning_rate": 6.1911170928667574e-06, + "loss": 1.0454, + "step": 460 + }, + { + "epoch": 0.037253277843996846, + "grad_norm": 3.018449306488037, + "learning_rate": 6.2045760430686405e-06, + "loss": 0.9643, + "step": 461 + }, + { + "epoch": 0.03733408755732439, + "grad_norm": 2.728684425354004, + "learning_rate": 6.218034993270525e-06, + "loss": 1.0723, + "step": 462 + }, + { + "epoch": 0.037414897270651934, + "grad_norm": 3.353196382522583, + "learning_rate": 6.23149394347241e-06, + "loss": 0.8839, + "step": 463 + }, + { + "epoch": 0.03749570698397947, + "grad_norm": 3.0325539112091064, + "learning_rate": 6.244952893674295e-06, + "loss": 1.0293, + "step": 464 + }, + { + "epoch": 0.037576516697307015, + "grad_norm": 3.853480815887451, + "learning_rate": 6.258411843876178e-06, + "loss": 0.9792, + "step": 465 + }, + { + "epoch": 0.03765732641063456, + "grad_norm": 3.710716724395752, + "learning_rate": 6.271870794078062e-06, + "loss": 1.0661, + "step": 466 + }, + { + "epoch": 0.0377381361239621, + "grad_norm": 3.857067108154297, + "learning_rate": 6.285329744279946e-06, + "loss": 1.0816, + "step": 467 + }, + { + "epoch": 0.03781894583728964, + "grad_norm": 3.108438730239868, + "learning_rate": 6.298788694481831e-06, + "loss": 0.9808, + "step": 468 + }, + { + "epoch": 0.037899755550617184, + "grad_norm": 2.9933106899261475, + "learning_rate": 6.312247644683716e-06, + "loss": 1.072, + "step": 469 + }, + { + "epoch": 0.03798056526394473, + "grad_norm": 2.842607021331787, + "learning_rate": 6.325706594885599e-06, + "loss": 1.072, + "step": 470 + }, + { + "epoch": 0.038061374977272265, + "grad_norm": 2.906888723373413, + "learning_rate": 6.339165545087483e-06, + "loss": 1.0432, + "step": 471 + }, + { + "epoch": 0.03814218469059981, + "grad_norm": 4.220092296600342, + "learning_rate": 6.352624495289368e-06, + "loss": 1.1025, + "step": 472 + }, + { + "epoch": 0.03822299440392735, + "grad_norm": 2.690276622772217, + "learning_rate": 6.366083445491253e-06, + "loss": 1.0178, + "step": 473 + }, + { + "epoch": 0.0383038041172549, + "grad_norm": 3.268096685409546, + "learning_rate": 6.379542395693137e-06, + "loss": 0.9672, + "step": 474 + }, + { + "epoch": 0.038384613830582434, + "grad_norm": 3.46506404876709, + "learning_rate": 6.3930013458950205e-06, + "loss": 1.0341, + "step": 475 + }, + { + "epoch": 0.03846542354390998, + "grad_norm": 2.906332492828369, + "learning_rate": 6.406460296096904e-06, + "loss": 1.0948, + "step": 476 + }, + { + "epoch": 0.03854623325723752, + "grad_norm": 3.1585898399353027, + "learning_rate": 6.419919246298789e-06, + "loss": 0.9946, + "step": 477 + }, + { + "epoch": 0.03862704297056506, + "grad_norm": 3.1073646545410156, + "learning_rate": 6.433378196500674e-06, + "loss": 1.0518, + "step": 478 + }, + { + "epoch": 0.0387078526838926, + "grad_norm": 2.927382230758667, + "learning_rate": 6.4468371467025585e-06, + "loss": 1.0208, + "step": 479 + }, + { + "epoch": 0.03878866239722015, + "grad_norm": 3.1290457248687744, + "learning_rate": 6.4602960969044415e-06, + "loss": 1.0853, + "step": 480 + }, + { + "epoch": 0.03886947211054769, + "grad_norm": 3.4820735454559326, + "learning_rate": 6.473755047106326e-06, + "loss": 1.1244, + "step": 481 + }, + { + "epoch": 0.03895028182387523, + "grad_norm": 2.8432974815368652, + "learning_rate": 6.48721399730821e-06, + "loss": 0.965, + "step": 482 + }, + { + "epoch": 0.03903109153720277, + "grad_norm": 3.6052823066711426, + "learning_rate": 6.500672947510095e-06, + "loss": 1.0264, + "step": 483 + }, + { + "epoch": 0.039111901250530316, + "grad_norm": 3.049638271331787, + "learning_rate": 6.5141318977119795e-06, + "loss": 0.9849, + "step": 484 + }, + { + "epoch": 0.03919271096385785, + "grad_norm": 3.698913812637329, + "learning_rate": 6.5275908479138625e-06, + "loss": 1.0338, + "step": 485 + }, + { + "epoch": 0.0392735206771854, + "grad_norm": 3.1109588146209717, + "learning_rate": 6.541049798115747e-06, + "loss": 1.0894, + "step": 486 + }, + { + "epoch": 0.03935433039051294, + "grad_norm": 3.6122915744781494, + "learning_rate": 6.554508748317632e-06, + "loss": 1.0949, + "step": 487 + }, + { + "epoch": 0.03943514010384048, + "grad_norm": 3.2624239921569824, + "learning_rate": 6.567967698519517e-06, + "loss": 1.069, + "step": 488 + }, + { + "epoch": 0.03951594981716802, + "grad_norm": 3.7620856761932373, + "learning_rate": 6.5814266487214005e-06, + "loss": 0.9941, + "step": 489 + }, + { + "epoch": 0.039596759530495566, + "grad_norm": 3.5220227241516113, + "learning_rate": 6.594885598923284e-06, + "loss": 1.0572, + "step": 490 + }, + { + "epoch": 0.03967756924382311, + "grad_norm": 2.813934564590454, + "learning_rate": 6.608344549125168e-06, + "loss": 1.0088, + "step": 491 + }, + { + "epoch": 0.03975837895715065, + "grad_norm": 3.2144017219543457, + "learning_rate": 6.621803499327053e-06, + "loss": 1.0113, + "step": 492 + }, + { + "epoch": 0.03983918867047819, + "grad_norm": 3.5524468421936035, + "learning_rate": 6.635262449528938e-06, + "loss": 1.0492, + "step": 493 + }, + { + "epoch": 0.039919998383805735, + "grad_norm": 3.4819772243499756, + "learning_rate": 6.648721399730822e-06, + "loss": 1.0768, + "step": 494 + }, + { + "epoch": 0.04000080809713327, + "grad_norm": 3.494050979614258, + "learning_rate": 6.662180349932705e-06, + "loss": 0.9604, + "step": 495 + }, + { + "epoch": 0.040081617810460816, + "grad_norm": 3.5531656742095947, + "learning_rate": 6.67563930013459e-06, + "loss": 1.1349, + "step": 496 + }, + { + "epoch": 0.04016242752378836, + "grad_norm": 3.5784482955932617, + "learning_rate": 6.689098250336474e-06, + "loss": 0.995, + "step": 497 + }, + { + "epoch": 0.040243237237115904, + "grad_norm": 2.6178855895996094, + "learning_rate": 6.702557200538359e-06, + "loss": 1.1689, + "step": 498 + }, + { + "epoch": 0.04032404695044344, + "grad_norm": 3.002977132797241, + "learning_rate": 6.716016150740243e-06, + "loss": 1.1346, + "step": 499 + }, + { + "epoch": 0.040404856663770986, + "grad_norm": 3.476471185684204, + "learning_rate": 6.729475100942126e-06, + "loss": 1.0845, + "step": 500 + }, + { + "epoch": 0.04048566637709853, + "grad_norm": 2.8205230236053467, + "learning_rate": 6.742934051144011e-06, + "loss": 0.9693, + "step": 501 + }, + { + "epoch": 0.04056647609042607, + "grad_norm": 3.3831284046173096, + "learning_rate": 6.756393001345896e-06, + "loss": 1.0258, + "step": 502 + }, + { + "epoch": 0.04064728580375361, + "grad_norm": 3.420717716217041, + "learning_rate": 6.7698519515477805e-06, + "loss": 1.0712, + "step": 503 + }, + { + "epoch": 0.040728095517081155, + "grad_norm": 3.0955588817596436, + "learning_rate": 6.783310901749664e-06, + "loss": 1.113, + "step": 504 + }, + { + "epoch": 0.0408089052304087, + "grad_norm": 3.0548081398010254, + "learning_rate": 6.796769851951548e-06, + "loss": 1.0572, + "step": 505 + }, + { + "epoch": 0.040889714943736236, + "grad_norm": 2.7227015495300293, + "learning_rate": 6.810228802153432e-06, + "loss": 1.0681, + "step": 506 + }, + { + "epoch": 0.04097052465706378, + "grad_norm": 3.3277525901794434, + "learning_rate": 6.823687752355317e-06, + "loss": 1.0435, + "step": 507 + }, + { + "epoch": 0.041051334370391324, + "grad_norm": 3.2822682857513428, + "learning_rate": 6.8371467025572015e-06, + "loss": 1.0552, + "step": 508 + }, + { + "epoch": 0.04113214408371886, + "grad_norm": 3.707235813140869, + "learning_rate": 6.850605652759086e-06, + "loss": 1.0892, + "step": 509 + }, + { + "epoch": 0.041212953797046405, + "grad_norm": 3.084048271179199, + "learning_rate": 6.864064602960969e-06, + "loss": 1.0666, + "step": 510 + }, + { + "epoch": 0.04129376351037395, + "grad_norm": 3.647761344909668, + "learning_rate": 6.877523553162854e-06, + "loss": 1.044, + "step": 511 + }, + { + "epoch": 0.041374573223701486, + "grad_norm": 3.338500499725342, + "learning_rate": 6.890982503364738e-06, + "loss": 1.0436, + "step": 512 + }, + { + "epoch": 0.04145538293702903, + "grad_norm": 3.313065767288208, + "learning_rate": 6.9044414535666225e-06, + "loss": 1.0777, + "step": 513 + }, + { + "epoch": 0.041536192650356574, + "grad_norm": 3.0511531829833984, + "learning_rate": 6.917900403768507e-06, + "loss": 1.0664, + "step": 514 + }, + { + "epoch": 0.04161700236368412, + "grad_norm": 3.0324454307556152, + "learning_rate": 6.93135935397039e-06, + "loss": 1.0095, + "step": 515 + }, + { + "epoch": 0.041697812077011655, + "grad_norm": 3.7721328735351562, + "learning_rate": 6.944818304172275e-06, + "loss": 1.0013, + "step": 516 + }, + { + "epoch": 0.0417786217903392, + "grad_norm": 2.761353015899658, + "learning_rate": 6.95827725437416e-06, + "loss": 1.0539, + "step": 517 + }, + { + "epoch": 0.04185943150366674, + "grad_norm": 2.751193046569824, + "learning_rate": 6.9717362045760435e-06, + "loss": 1.0355, + "step": 518 + }, + { + "epoch": 0.04194024121699428, + "grad_norm": 3.0338006019592285, + "learning_rate": 6.985195154777928e-06, + "loss": 1.1598, + "step": 519 + }, + { + "epoch": 0.042021050930321824, + "grad_norm": 3.42229962348938, + "learning_rate": 6.998654104979812e-06, + "loss": 0.9632, + "step": 520 + }, + { + "epoch": 0.04210186064364937, + "grad_norm": 2.795228958129883, + "learning_rate": 7.012113055181696e-06, + "loss": 1.1743, + "step": 521 + }, + { + "epoch": 0.04218267035697691, + "grad_norm": 3.4653124809265137, + "learning_rate": 7.025572005383581e-06, + "loss": 1.0084, + "step": 522 + }, + { + "epoch": 0.04226348007030445, + "grad_norm": 3.5253326892852783, + "learning_rate": 7.039030955585465e-06, + "loss": 1.0455, + "step": 523 + }, + { + "epoch": 0.04234428978363199, + "grad_norm": 3.0897176265716553, + "learning_rate": 7.05248990578735e-06, + "loss": 1.1321, + "step": 524 + }, + { + "epoch": 0.04242509949695954, + "grad_norm": 3.368062973022461, + "learning_rate": 7.065948855989233e-06, + "loss": 1.0558, + "step": 525 + }, + { + "epoch": 0.042505909210287074, + "grad_norm": 2.7995309829711914, + "learning_rate": 7.079407806191118e-06, + "loss": 1.2026, + "step": 526 + }, + { + "epoch": 0.04258671892361462, + "grad_norm": 2.430469274520874, + "learning_rate": 7.092866756393002e-06, + "loss": 1.0671, + "step": 527 + }, + { + "epoch": 0.04266752863694216, + "grad_norm": 3.5027177333831787, + "learning_rate": 7.106325706594886e-06, + "loss": 1.0246, + "step": 528 + }, + { + "epoch": 0.0427483383502697, + "grad_norm": 3.2911908626556396, + "learning_rate": 7.11978465679677e-06, + "loss": 1.0821, + "step": 529 + }, + { + "epoch": 0.04282914806359724, + "grad_norm": 3.2344489097595215, + "learning_rate": 7.133243606998654e-06, + "loss": 0.9794, + "step": 530 + }, + { + "epoch": 0.04290995777692479, + "grad_norm": 3.9854869842529297, + "learning_rate": 7.146702557200539e-06, + "loss": 1.0751, + "step": 531 + }, + { + "epoch": 0.04299076749025233, + "grad_norm": 3.5476224422454834, + "learning_rate": 7.1601615074024235e-06, + "loss": 1.1167, + "step": 532 + }, + { + "epoch": 0.04307157720357987, + "grad_norm": 3.635493516921997, + "learning_rate": 7.173620457604307e-06, + "loss": 1.1054, + "step": 533 + }, + { + "epoch": 0.04315238691690741, + "grad_norm": 3.257753849029541, + "learning_rate": 7.187079407806191e-06, + "loss": 1.0461, + "step": 534 + }, + { + "epoch": 0.043233196630234956, + "grad_norm": 3.016845464706421, + "learning_rate": 7.200538358008076e-06, + "loss": 1.1186, + "step": 535 + }, + { + "epoch": 0.04331400634356249, + "grad_norm": 3.6017231941223145, + "learning_rate": 7.21399730820996e-06, + "loss": 1.1136, + "step": 536 + }, + { + "epoch": 0.04339481605689004, + "grad_norm": 3.1112570762634277, + "learning_rate": 7.2274562584118446e-06, + "loss": 0.9677, + "step": 537 + }, + { + "epoch": 0.04347562577021758, + "grad_norm": 3.657864809036255, + "learning_rate": 7.240915208613729e-06, + "loss": 1.1158, + "step": 538 + }, + { + "epoch": 0.043556435483545125, + "grad_norm": 3.514461040496826, + "learning_rate": 7.254374158815612e-06, + "loss": 1.0667, + "step": 539 + }, + { + "epoch": 0.04363724519687266, + "grad_norm": 2.8252127170562744, + "learning_rate": 7.267833109017497e-06, + "loss": 1.0977, + "step": 540 + }, + { + "epoch": 0.043718054910200206, + "grad_norm": 3.032993793487549, + "learning_rate": 7.281292059219382e-06, + "loss": 1.0275, + "step": 541 + }, + { + "epoch": 0.04379886462352775, + "grad_norm": 3.2496330738067627, + "learning_rate": 7.2947510094212656e-06, + "loss": 1.0126, + "step": 542 + }, + { + "epoch": 0.04387967433685529, + "grad_norm": 4.544018745422363, + "learning_rate": 7.30820995962315e-06, + "loss": 1.0042, + "step": 543 + }, + { + "epoch": 0.04396048405018283, + "grad_norm": 2.744112014770508, + "learning_rate": 7.321668909825034e-06, + "loss": 1.1303, + "step": 544 + }, + { + "epoch": 0.044041293763510375, + "grad_norm": 3.860858201980591, + "learning_rate": 7.335127860026918e-06, + "loss": 1.1068, + "step": 545 + }, + { + "epoch": 0.04412210347683792, + "grad_norm": 3.5854122638702393, + "learning_rate": 7.348586810228803e-06, + "loss": 1.0888, + "step": 546 + }, + { + "epoch": 0.044202913190165456, + "grad_norm": 3.399890422821045, + "learning_rate": 7.362045760430687e-06, + "loss": 1.1058, + "step": 547 + }, + { + "epoch": 0.044283722903493, + "grad_norm": 3.412484884262085, + "learning_rate": 7.375504710632571e-06, + "loss": 1.0304, + "step": 548 + }, + { + "epoch": 0.044364532616820544, + "grad_norm": 3.3257498741149902, + "learning_rate": 7.388963660834455e-06, + "loss": 1.0559, + "step": 549 + }, + { + "epoch": 0.04444534233014808, + "grad_norm": 3.3390965461730957, + "learning_rate": 7.40242261103634e-06, + "loss": 1.0454, + "step": 550 + }, + { + "epoch": 0.044526152043475625, + "grad_norm": 3.017277479171753, + "learning_rate": 7.415881561238224e-06, + "loss": 1.147, + "step": 551 + }, + { + "epoch": 0.04460696175680317, + "grad_norm": 3.9158685207366943, + "learning_rate": 7.4293405114401084e-06, + "loss": 1.0479, + "step": 552 + }, + { + "epoch": 0.044687771470130706, + "grad_norm": 3.922313928604126, + "learning_rate": 7.442799461641993e-06, + "loss": 0.9658, + "step": 553 + }, + { + "epoch": 0.04476858118345825, + "grad_norm": 3.4209396839141846, + "learning_rate": 7.456258411843876e-06, + "loss": 1.0282, + "step": 554 + }, + { + "epoch": 0.044849390896785794, + "grad_norm": 2.8000848293304443, + "learning_rate": 7.469717362045761e-06, + "loss": 1.0759, + "step": 555 + }, + { + "epoch": 0.04493020061011334, + "grad_norm": 3.410449504852295, + "learning_rate": 7.4831763122476456e-06, + "loss": 1.0831, + "step": 556 + }, + { + "epoch": 0.045011010323440875, + "grad_norm": 2.632046937942505, + "learning_rate": 7.4966352624495294e-06, + "loss": 1.1639, + "step": 557 + }, + { + "epoch": 0.04509182003676842, + "grad_norm": 2.8796796798706055, + "learning_rate": 7.510094212651414e-06, + "loss": 1.1227, + "step": 558 + }, + { + "epoch": 0.04517262975009596, + "grad_norm": 2.673081636428833, + "learning_rate": 7.523553162853298e-06, + "loss": 0.9932, + "step": 559 + }, + { + "epoch": 0.0452534394634235, + "grad_norm": 3.3943419456481934, + "learning_rate": 7.537012113055182e-06, + "loss": 0.946, + "step": 560 + }, + { + "epoch": 0.045334249176751044, + "grad_norm": 2.836904764175415, + "learning_rate": 7.550471063257067e-06, + "loss": 0.9938, + "step": 561 + }, + { + "epoch": 0.04541505889007859, + "grad_norm": 3.4832441806793213, + "learning_rate": 7.563930013458951e-06, + "loss": 1.1204, + "step": 562 + }, + { + "epoch": 0.04549586860340613, + "grad_norm": 3.456580638885498, + "learning_rate": 7.577388963660835e-06, + "loss": 1.0397, + "step": 563 + }, + { + "epoch": 0.04557667831673367, + "grad_norm": 3.6279962062835693, + "learning_rate": 7.590847913862719e-06, + "loss": 0.9863, + "step": 564 + }, + { + "epoch": 0.045657488030061213, + "grad_norm": 2.8886189460754395, + "learning_rate": 7.604306864064604e-06, + "loss": 1.0374, + "step": 565 + }, + { + "epoch": 0.04573829774338876, + "grad_norm": 3.0096817016601562, + "learning_rate": 7.617765814266488e-06, + "loss": 1.1097, + "step": 566 + }, + { + "epoch": 0.045819107456716295, + "grad_norm": 3.125135898590088, + "learning_rate": 7.631224764468373e-06, + "loss": 1.0381, + "step": 567 + }, + { + "epoch": 0.04589991717004384, + "grad_norm": 3.959364652633667, + "learning_rate": 7.644683714670256e-06, + "loss": 1.0374, + "step": 568 + }, + { + "epoch": 0.04598072688337138, + "grad_norm": 3.551934242248535, + "learning_rate": 7.658142664872141e-06, + "loss": 1.1002, + "step": 569 + }, + { + "epoch": 0.046061536596698927, + "grad_norm": 4.027700901031494, + "learning_rate": 7.671601615074024e-06, + "loss": 1.0443, + "step": 570 + }, + { + "epoch": 0.046142346310026464, + "grad_norm": 3.957592248916626, + "learning_rate": 7.685060565275909e-06, + "loss": 1.1657, + "step": 571 + }, + { + "epoch": 0.04622315602335401, + "grad_norm": 3.6092169284820557, + "learning_rate": 7.698519515477793e-06, + "loss": 1.0972, + "step": 572 + }, + { + "epoch": 0.04630396573668155, + "grad_norm": 3.7678136825561523, + "learning_rate": 7.711978465679678e-06, + "loss": 0.9487, + "step": 573 + }, + { + "epoch": 0.04638477545000909, + "grad_norm": 3.7308578491210938, + "learning_rate": 7.725437415881561e-06, + "loss": 1.0842, + "step": 574 + }, + { + "epoch": 0.04646558516333663, + "grad_norm": 3.7531898021698, + "learning_rate": 7.738896366083446e-06, + "loss": 1.0672, + "step": 575 + }, + { + "epoch": 0.04654639487666418, + "grad_norm": 3.43570613861084, + "learning_rate": 7.75235531628533e-06, + "loss": 0.9758, + "step": 576 + }, + { + "epoch": 0.046627204589991714, + "grad_norm": 3.809967517852783, + "learning_rate": 7.765814266487215e-06, + "loss": 1.004, + "step": 577 + }, + { + "epoch": 0.04670801430331926, + "grad_norm": 3.0760293006896973, + "learning_rate": 7.7792732166891e-06, + "loss": 1.0434, + "step": 578 + }, + { + "epoch": 0.0467888240166468, + "grad_norm": 3.495680809020996, + "learning_rate": 7.792732166890983e-06, + "loss": 1.091, + "step": 579 + }, + { + "epoch": 0.046869633729974346, + "grad_norm": 2.8191490173339844, + "learning_rate": 7.806191117092868e-06, + "loss": 1.1569, + "step": 580 + }, + { + "epoch": 0.04695044344330188, + "grad_norm": 3.2043919563293457, + "learning_rate": 7.819650067294752e-06, + "loss": 1.0259, + "step": 581 + }, + { + "epoch": 0.04703125315662943, + "grad_norm": 3.280008554458618, + "learning_rate": 7.833109017496637e-06, + "loss": 1.1297, + "step": 582 + }, + { + "epoch": 0.04711206286995697, + "grad_norm": 3.444746494293213, + "learning_rate": 7.84656796769852e-06, + "loss": 1.0008, + "step": 583 + }, + { + "epoch": 0.04719287258328451, + "grad_norm": 3.302302598953247, + "learning_rate": 7.860026917900405e-06, + "loss": 0.9861, + "step": 584 + }, + { + "epoch": 0.04727368229661205, + "grad_norm": 3.2500033378601074, + "learning_rate": 7.873485868102288e-06, + "loss": 1.0184, + "step": 585 + }, + { + "epoch": 0.047354492009939596, + "grad_norm": 3.9170641899108887, + "learning_rate": 7.886944818304172e-06, + "loss": 1.011, + "step": 586 + }, + { + "epoch": 0.04743530172326714, + "grad_norm": 3.257427453994751, + "learning_rate": 7.900403768506057e-06, + "loss": 1.0766, + "step": 587 + }, + { + "epoch": 0.04751611143659468, + "grad_norm": 2.6629252433776855, + "learning_rate": 7.913862718707942e-06, + "loss": 1.0352, + "step": 588 + }, + { + "epoch": 0.04759692114992222, + "grad_norm": 3.212484836578369, + "learning_rate": 7.927321668909825e-06, + "loss": 0.9738, + "step": 589 + }, + { + "epoch": 0.047677730863249765, + "grad_norm": 2.638335704803467, + "learning_rate": 7.94078061911171e-06, + "loss": 0.9539, + "step": 590 + }, + { + "epoch": 0.0477585405765773, + "grad_norm": 2.8916549682617188, + "learning_rate": 7.954239569313594e-06, + "loss": 1.0734, + "step": 591 + }, + { + "epoch": 0.047839350289904846, + "grad_norm": 3.582719564437866, + "learning_rate": 7.967698519515479e-06, + "loss": 1.1697, + "step": 592 + }, + { + "epoch": 0.04792016000323239, + "grad_norm": 3.850970506668091, + "learning_rate": 7.981157469717362e-06, + "loss": 1.1088, + "step": 593 + }, + { + "epoch": 0.048000969716559934, + "grad_norm": 2.4932446479797363, + "learning_rate": 7.994616419919247e-06, + "loss": 1.1026, + "step": 594 + }, + { + "epoch": 0.04808177942988747, + "grad_norm": 3.447291374206543, + "learning_rate": 8.008075370121131e-06, + "loss": 1.0143, + "step": 595 + }, + { + "epoch": 0.048162589143215015, + "grad_norm": 2.8634376525878906, + "learning_rate": 8.021534320323016e-06, + "loss": 1.1517, + "step": 596 + }, + { + "epoch": 0.04824339885654256, + "grad_norm": 3.1959426403045654, + "learning_rate": 8.034993270524901e-06, + "loss": 1.0189, + "step": 597 + }, + { + "epoch": 0.048324208569870096, + "grad_norm": 3.149561882019043, + "learning_rate": 8.048452220726784e-06, + "loss": 1.0388, + "step": 598 + }, + { + "epoch": 0.04840501828319764, + "grad_norm": 3.0349740982055664, + "learning_rate": 8.061911170928669e-06, + "loss": 1.0017, + "step": 599 + }, + { + "epoch": 0.048485827996525184, + "grad_norm": 3.0691850185394287, + "learning_rate": 8.075370121130552e-06, + "loss": 1.0108, + "step": 600 + }, + { + "epoch": 0.04856663770985272, + "grad_norm": 3.194122791290283, + "learning_rate": 8.088829071332436e-06, + "loss": 1.0434, + "step": 601 + }, + { + "epoch": 0.048647447423180265, + "grad_norm": 2.7453277111053467, + "learning_rate": 8.102288021534321e-06, + "loss": 1.0744, + "step": 602 + }, + { + "epoch": 0.04872825713650781, + "grad_norm": 4.147895336151123, + "learning_rate": 8.115746971736204e-06, + "loss": 1.1366, + "step": 603 + }, + { + "epoch": 0.04880906684983535, + "grad_norm": 3.354184865951538, + "learning_rate": 8.129205921938089e-06, + "loss": 1.0573, + "step": 604 + }, + { + "epoch": 0.04888987656316289, + "grad_norm": 3.0165493488311768, + "learning_rate": 8.142664872139973e-06, + "loss": 1.0278, + "step": 605 + }, + { + "epoch": 0.048970686276490434, + "grad_norm": 3.1162426471710205, + "learning_rate": 8.156123822341858e-06, + "loss": 1.0117, + "step": 606 + }, + { + "epoch": 0.04905149598981798, + "grad_norm": 3.0014259815216064, + "learning_rate": 8.169582772543743e-06, + "loss": 0.9395, + "step": 607 + }, + { + "epoch": 0.049132305703145515, + "grad_norm": 2.954028367996216, + "learning_rate": 8.183041722745626e-06, + "loss": 1.0006, + "step": 608 + }, + { + "epoch": 0.04921311541647306, + "grad_norm": 4.1641764640808105, + "learning_rate": 8.19650067294751e-06, + "loss": 1.1276, + "step": 609 + }, + { + "epoch": 0.0492939251298006, + "grad_norm": 2.9266011714935303, + "learning_rate": 8.209959623149395e-06, + "loss": 1.1714, + "step": 610 + }, + { + "epoch": 0.04937473484312815, + "grad_norm": 3.3506548404693604, + "learning_rate": 8.22341857335128e-06, + "loss": 1.0914, + "step": 611 + }, + { + "epoch": 0.049455544556455684, + "grad_norm": 3.3130972385406494, + "learning_rate": 8.236877523553165e-06, + "loss": 1.176, + "step": 612 + }, + { + "epoch": 0.04953635426978323, + "grad_norm": 3.0160202980041504, + "learning_rate": 8.250336473755048e-06, + "loss": 0.9559, + "step": 613 + }, + { + "epoch": 0.04961716398311077, + "grad_norm": 3.3474361896514893, + "learning_rate": 8.263795423956933e-06, + "loss": 0.9843, + "step": 614 + }, + { + "epoch": 0.04969797369643831, + "grad_norm": 3.325251579284668, + "learning_rate": 8.277254374158816e-06, + "loss": 0.9848, + "step": 615 + }, + { + "epoch": 0.04977878340976585, + "grad_norm": 3.5325891971588135, + "learning_rate": 8.2907133243607e-06, + "loss": 1.071, + "step": 616 + }, + { + "epoch": 0.0498595931230934, + "grad_norm": 3.2238943576812744, + "learning_rate": 8.304172274562585e-06, + "loss": 1.0326, + "step": 617 + }, + { + "epoch": 0.04994040283642094, + "grad_norm": 3.123817205429077, + "learning_rate": 8.317631224764468e-06, + "loss": 1.0025, + "step": 618 + }, + { + "epoch": 0.05002121254974848, + "grad_norm": 3.456085205078125, + "learning_rate": 8.331090174966353e-06, + "loss": 1.0593, + "step": 619 + }, + { + "epoch": 0.05010202226307602, + "grad_norm": 3.784614324569702, + "learning_rate": 8.344549125168237e-06, + "loss": 1.0484, + "step": 620 + }, + { + "epoch": 0.050182831976403566, + "grad_norm": 3.1221022605895996, + "learning_rate": 8.358008075370122e-06, + "loss": 1.191, + "step": 621 + }, + { + "epoch": 0.0502636416897311, + "grad_norm": 2.8174052238464355, + "learning_rate": 8.371467025572007e-06, + "loss": 1.1322, + "step": 622 + }, + { + "epoch": 0.05034445140305865, + "grad_norm": 2.835664749145508, + "learning_rate": 8.38492597577389e-06, + "loss": 1.0163, + "step": 623 + }, + { + "epoch": 0.05042526111638619, + "grad_norm": 3.2237017154693604, + "learning_rate": 8.398384925975775e-06, + "loss": 1.0373, + "step": 624 + }, + { + "epoch": 0.05050607082971373, + "grad_norm": 3.0693700313568115, + "learning_rate": 8.41184387617766e-06, + "loss": 1.0844, + "step": 625 + }, + { + "epoch": 0.05058688054304127, + "grad_norm": 2.6899616718292236, + "learning_rate": 8.425302826379544e-06, + "loss": 1.0018, + "step": 626 + }, + { + "epoch": 0.050667690256368816, + "grad_norm": 2.584296226501465, + "learning_rate": 8.438761776581429e-06, + "loss": 1.0926, + "step": 627 + }, + { + "epoch": 0.05074849996969636, + "grad_norm": 2.579590082168579, + "learning_rate": 8.452220726783312e-06, + "loss": 0.9834, + "step": 628 + }, + { + "epoch": 0.0508293096830239, + "grad_norm": 3.64367938041687, + "learning_rate": 8.465679676985196e-06, + "loss": 1.0661, + "step": 629 + }, + { + "epoch": 0.05091011939635144, + "grad_norm": 3.95210862159729, + "learning_rate": 8.47913862718708e-06, + "loss": 1.0216, + "step": 630 + }, + { + "epoch": 0.050990929109678985, + "grad_norm": 3.1546990871429443, + "learning_rate": 8.492597577388964e-06, + "loss": 1.0854, + "step": 631 + }, + { + "epoch": 0.05107173882300652, + "grad_norm": 2.9738261699676514, + "learning_rate": 8.506056527590849e-06, + "loss": 1.0582, + "step": 632 + }, + { + "epoch": 0.051152548536334067, + "grad_norm": 3.137644052505493, + "learning_rate": 8.519515477792732e-06, + "loss": 1.2241, + "step": 633 + }, + { + "epoch": 0.05123335824966161, + "grad_norm": 3.27915096282959, + "learning_rate": 8.532974427994617e-06, + "loss": 1.0469, + "step": 634 + }, + { + "epoch": 0.051314167962989155, + "grad_norm": 3.2506301403045654, + "learning_rate": 8.546433378196501e-06, + "loss": 1.0474, + "step": 635 + }, + { + "epoch": 0.05139497767631669, + "grad_norm": 2.8712782859802246, + "learning_rate": 8.559892328398386e-06, + "loss": 1.0206, + "step": 636 + }, + { + "epoch": 0.051475787389644236, + "grad_norm": 2.924260377883911, + "learning_rate": 8.57335127860027e-06, + "loss": 1.1007, + "step": 637 + }, + { + "epoch": 0.05155659710297178, + "grad_norm": 3.638122797012329, + "learning_rate": 8.586810228802154e-06, + "loss": 1.131, + "step": 638 + }, + { + "epoch": 0.05163740681629932, + "grad_norm": 2.8430850505828857, + "learning_rate": 8.600269179004038e-06, + "loss": 1.2306, + "step": 639 + }, + { + "epoch": 0.05171821652962686, + "grad_norm": 2.9707863330841064, + "learning_rate": 8.613728129205923e-06, + "loss": 1.0065, + "step": 640 + }, + { + "epoch": 0.051799026242954405, + "grad_norm": 3.260795831680298, + "learning_rate": 8.627187079407808e-06, + "loss": 0.9224, + "step": 641 + }, + { + "epoch": 0.05187983595628194, + "grad_norm": 3.043245553970337, + "learning_rate": 8.64064602960969e-06, + "loss": 1.0305, + "step": 642 + }, + { + "epoch": 0.051960645669609486, + "grad_norm": 2.996899127960205, + "learning_rate": 8.654104979811576e-06, + "loss": 1.0677, + "step": 643 + }, + { + "epoch": 0.05204145538293703, + "grad_norm": 2.490837335586548, + "learning_rate": 8.66756393001346e-06, + "loss": 1.0811, + "step": 644 + }, + { + "epoch": 0.052122265096264574, + "grad_norm": 2.932539701461792, + "learning_rate": 8.681022880215343e-06, + "loss": 1.1236, + "step": 645 + }, + { + "epoch": 0.05220307480959211, + "grad_norm": 2.9249560832977295, + "learning_rate": 8.694481830417228e-06, + "loss": 1.1811, + "step": 646 + }, + { + "epoch": 0.052283884522919655, + "grad_norm": 3.095759153366089, + "learning_rate": 8.707940780619113e-06, + "loss": 0.989, + "step": 647 + }, + { + "epoch": 0.0523646942362472, + "grad_norm": 3.0386643409729004, + "learning_rate": 8.721399730820996e-06, + "loss": 1.041, + "step": 648 + }, + { + "epoch": 0.052445503949574736, + "grad_norm": 3.2120351791381836, + "learning_rate": 8.73485868102288e-06, + "loss": 1.0539, + "step": 649 + }, + { + "epoch": 0.05252631366290228, + "grad_norm": 2.643620491027832, + "learning_rate": 8.748317631224765e-06, + "loss": 0.9998, + "step": 650 + }, + { + "epoch": 0.052607123376229824, + "grad_norm": 3.5676777362823486, + "learning_rate": 8.76177658142665e-06, + "loss": 1.026, + "step": 651 + }, + { + "epoch": 0.05268793308955737, + "grad_norm": 3.254486322402954, + "learning_rate": 8.775235531628533e-06, + "loss": 1.1402, + "step": 652 + }, + { + "epoch": 0.052768742802884905, + "grad_norm": 3.1883718967437744, + "learning_rate": 8.788694481830418e-06, + "loss": 1.0265, + "step": 653 + }, + { + "epoch": 0.05284955251621245, + "grad_norm": 2.793278217315674, + "learning_rate": 8.802153432032302e-06, + "loss": 1.0114, + "step": 654 + }, + { + "epoch": 0.05293036222953999, + "grad_norm": 2.92417049407959, + "learning_rate": 8.815612382234187e-06, + "loss": 1.0035, + "step": 655 + }, + { + "epoch": 0.05301117194286753, + "grad_norm": 3.133819580078125, + "learning_rate": 8.829071332436072e-06, + "loss": 1.0065, + "step": 656 + }, + { + "epoch": 0.053091981656195074, + "grad_norm": 3.041851758956909, + "learning_rate": 8.842530282637955e-06, + "loss": 1.0923, + "step": 657 + }, + { + "epoch": 0.05317279136952262, + "grad_norm": 3.1926145553588867, + "learning_rate": 8.85598923283984e-06, + "loss": 1.0643, + "step": 658 + }, + { + "epoch": 0.05325360108285016, + "grad_norm": 2.8128445148468018, + "learning_rate": 8.869448183041724e-06, + "loss": 1.2255, + "step": 659 + }, + { + "epoch": 0.0533344107961777, + "grad_norm": 3.1745128631591797, + "learning_rate": 8.882907133243607e-06, + "loss": 0.9263, + "step": 660 + }, + { + "epoch": 0.05341522050950524, + "grad_norm": 2.8548998832702637, + "learning_rate": 8.896366083445492e-06, + "loss": 0.9909, + "step": 661 + }, + { + "epoch": 0.05349603022283279, + "grad_norm": 3.219050168991089, + "learning_rate": 8.909825033647377e-06, + "loss": 1.1081, + "step": 662 + }, + { + "epoch": 0.053576839936160324, + "grad_norm": 3.200005531311035, + "learning_rate": 8.92328398384926e-06, + "loss": 0.9797, + "step": 663 + }, + { + "epoch": 0.05365764964948787, + "grad_norm": 3.417999744415283, + "learning_rate": 8.936742934051144e-06, + "loss": 0.9842, + "step": 664 + }, + { + "epoch": 0.05373845936281541, + "grad_norm": 3.4707634449005127, + "learning_rate": 8.950201884253029e-06, + "loss": 1.0817, + "step": 665 + }, + { + "epoch": 0.05381926907614295, + "grad_norm": 3.108090400695801, + "learning_rate": 8.963660834454914e-06, + "loss": 1.1044, + "step": 666 + }, + { + "epoch": 0.05390007878947049, + "grad_norm": 3.1225290298461914, + "learning_rate": 8.977119784656797e-06, + "loss": 0.9562, + "step": 667 + }, + { + "epoch": 0.05398088850279804, + "grad_norm": 3.255286455154419, + "learning_rate": 8.990578734858681e-06, + "loss": 0.9713, + "step": 668 + }, + { + "epoch": 0.05406169821612558, + "grad_norm": 3.492190361022949, + "learning_rate": 9.004037685060566e-06, + "loss": 1.0841, + "step": 669 + }, + { + "epoch": 0.05414250792945312, + "grad_norm": 3.2213480472564697, + "learning_rate": 9.017496635262451e-06, + "loss": 1.0451, + "step": 670 + }, + { + "epoch": 0.05422331764278066, + "grad_norm": 2.792081594467163, + "learning_rate": 9.030955585464336e-06, + "loss": 1.005, + "step": 671 + }, + { + "epoch": 0.054304127356108206, + "grad_norm": 2.712742805480957, + "learning_rate": 9.044414535666219e-06, + "loss": 1.0522, + "step": 672 + }, + { + "epoch": 0.05438493706943574, + "grad_norm": 2.6193344593048096, + "learning_rate": 9.057873485868103e-06, + "loss": 1.0185, + "step": 673 + }, + { + "epoch": 0.05446574678276329, + "grad_norm": 3.2356667518615723, + "learning_rate": 9.071332436069988e-06, + "loss": 1.0265, + "step": 674 + }, + { + "epoch": 0.05454655649609083, + "grad_norm": 4.429942607879639, + "learning_rate": 9.084791386271871e-06, + "loss": 1.0277, + "step": 675 + }, + { + "epoch": 0.054627366209418375, + "grad_norm": 3.260923385620117, + "learning_rate": 9.098250336473756e-06, + "loss": 1.1831, + "step": 676 + }, + { + "epoch": 0.05470817592274591, + "grad_norm": 3.3523643016815186, + "learning_rate": 9.111709286675639e-06, + "loss": 1.0104, + "step": 677 + }, + { + "epoch": 0.054788985636073456, + "grad_norm": 3.8985984325408936, + "learning_rate": 9.125168236877523e-06, + "loss": 1.014, + "step": 678 + }, + { + "epoch": 0.054869795349401, + "grad_norm": 2.8820083141326904, + "learning_rate": 9.138627187079408e-06, + "loss": 1.0623, + "step": 679 + }, + { + "epoch": 0.05495060506272854, + "grad_norm": 3.6060869693756104, + "learning_rate": 9.152086137281293e-06, + "loss": 1.0811, + "step": 680 + }, + { + "epoch": 0.05503141477605608, + "grad_norm": 3.030777931213379, + "learning_rate": 9.165545087483178e-06, + "loss": 1.1389, + "step": 681 + }, + { + "epoch": 0.055112224489383625, + "grad_norm": 3.050401210784912, + "learning_rate": 9.17900403768506e-06, + "loss": 1.0367, + "step": 682 + }, + { + "epoch": 0.05519303420271117, + "grad_norm": 3.2267160415649414, + "learning_rate": 9.192462987886945e-06, + "loss": 1.0889, + "step": 683 + }, + { + "epoch": 0.055273843916038706, + "grad_norm": 3.1425304412841797, + "learning_rate": 9.20592193808883e-06, + "loss": 0.952, + "step": 684 + }, + { + "epoch": 0.05535465362936625, + "grad_norm": 3.436549425125122, + "learning_rate": 9.219380888290715e-06, + "loss": 1.1109, + "step": 685 + }, + { + "epoch": 0.055435463342693794, + "grad_norm": 3.5831665992736816, + "learning_rate": 9.2328398384926e-06, + "loss": 1.0488, + "step": 686 + }, + { + "epoch": 0.05551627305602133, + "grad_norm": 3.0877110958099365, + "learning_rate": 9.246298788694482e-06, + "loss": 1.1262, + "step": 687 + }, + { + "epoch": 0.055597082769348875, + "grad_norm": 2.9710967540740967, + "learning_rate": 9.259757738896367e-06, + "loss": 1.0023, + "step": 688 + }, + { + "epoch": 0.05567789248267642, + "grad_norm": 3.0827548503875732, + "learning_rate": 9.273216689098252e-06, + "loss": 1.0158, + "step": 689 + }, + { + "epoch": 0.055758702196003956, + "grad_norm": 3.3601818084716797, + "learning_rate": 9.286675639300135e-06, + "loss": 1.1819, + "step": 690 + }, + { + "epoch": 0.0558395119093315, + "grad_norm": 2.9309380054473877, + "learning_rate": 9.30013458950202e-06, + "loss": 1.0279, + "step": 691 + }, + { + "epoch": 0.055920321622659044, + "grad_norm": 3.019272804260254, + "learning_rate": 9.313593539703903e-06, + "loss": 1.0785, + "step": 692 + }, + { + "epoch": 0.05600113133598659, + "grad_norm": 2.882826089859009, + "learning_rate": 9.327052489905787e-06, + "loss": 0.9971, + "step": 693 + }, + { + "epoch": 0.056081941049314125, + "grad_norm": 3.164386510848999, + "learning_rate": 9.340511440107672e-06, + "loss": 0.9612, + "step": 694 + }, + { + "epoch": 0.05616275076264167, + "grad_norm": 3.490043878555298, + "learning_rate": 9.353970390309557e-06, + "loss": 1.0776, + "step": 695 + }, + { + "epoch": 0.05624356047596921, + "grad_norm": 2.96453595161438, + "learning_rate": 9.367429340511441e-06, + "loss": 0.9989, + "step": 696 + }, + { + "epoch": 0.05632437018929675, + "grad_norm": 4.331329822540283, + "learning_rate": 9.380888290713324e-06, + "loss": 1.2126, + "step": 697 + }, + { + "epoch": 0.056405179902624294, + "grad_norm": 2.8803155422210693, + "learning_rate": 9.39434724091521e-06, + "loss": 1.0638, + "step": 698 + }, + { + "epoch": 0.05648598961595184, + "grad_norm": 3.1566812992095947, + "learning_rate": 9.407806191117094e-06, + "loss": 1.0227, + "step": 699 + }, + { + "epoch": 0.05656679932927938, + "grad_norm": 3.2209270000457764, + "learning_rate": 9.421265141318979e-06, + "loss": 1.1027, + "step": 700 + }, + { + "epoch": 0.05664760904260692, + "grad_norm": 3.1682584285736084, + "learning_rate": 9.434724091520863e-06, + "loss": 1.2391, + "step": 701 + }, + { + "epoch": 0.056728418755934464, + "grad_norm": 3.444187879562378, + "learning_rate": 9.448183041722746e-06, + "loss": 1.0747, + "step": 702 + }, + { + "epoch": 0.05680922846926201, + "grad_norm": 3.329221725463867, + "learning_rate": 9.461641991924631e-06, + "loss": 1.051, + "step": 703 + }, + { + "epoch": 0.056890038182589545, + "grad_norm": 3.5517077445983887, + "learning_rate": 9.475100942126516e-06, + "loss": 1.1431, + "step": 704 + }, + { + "epoch": 0.05697084789591709, + "grad_norm": 2.881688117980957, + "learning_rate": 9.488559892328399e-06, + "loss": 1.1095, + "step": 705 + }, + { + "epoch": 0.05705165760924463, + "grad_norm": 3.5473861694335938, + "learning_rate": 9.502018842530283e-06, + "loss": 0.9955, + "step": 706 + }, + { + "epoch": 0.05713246732257218, + "grad_norm": 2.839627504348755, + "learning_rate": 9.515477792732166e-06, + "loss": 1.1872, + "step": 707 + }, + { + "epoch": 0.057213277035899714, + "grad_norm": 3.1089134216308594, + "learning_rate": 9.528936742934051e-06, + "loss": 1.0786, + "step": 708 + }, + { + "epoch": 0.05729408674922726, + "grad_norm": 2.776536703109741, + "learning_rate": 9.542395693135936e-06, + "loss": 1.056, + "step": 709 + }, + { + "epoch": 0.0573748964625548, + "grad_norm": 3.4086854457855225, + "learning_rate": 9.55585464333782e-06, + "loss": 1.0237, + "step": 710 + }, + { + "epoch": 0.05745570617588234, + "grad_norm": 3.041487455368042, + "learning_rate": 9.569313593539705e-06, + "loss": 1.0635, + "step": 711 + }, + { + "epoch": 0.05753651588920988, + "grad_norm": 3.320389986038208, + "learning_rate": 9.582772543741588e-06, + "loss": 1.0418, + "step": 712 + }, + { + "epoch": 0.05761732560253743, + "grad_norm": 3.3112857341766357, + "learning_rate": 9.596231493943473e-06, + "loss": 1.1571, + "step": 713 + }, + { + "epoch": 0.057698135315864964, + "grad_norm": 3.4754772186279297, + "learning_rate": 9.609690444145358e-06, + "loss": 1.1472, + "step": 714 + }, + { + "epoch": 0.05777894502919251, + "grad_norm": 3.2244203090667725, + "learning_rate": 9.623149394347242e-06, + "loss": 1.1125, + "step": 715 + }, + { + "epoch": 0.05785975474252005, + "grad_norm": 2.8610832691192627, + "learning_rate": 9.636608344549126e-06, + "loss": 1.1578, + "step": 716 + }, + { + "epoch": 0.057940564455847596, + "grad_norm": 3.1100966930389404, + "learning_rate": 9.65006729475101e-06, + "loss": 1.0516, + "step": 717 + }, + { + "epoch": 0.05802137416917513, + "grad_norm": 2.962012767791748, + "learning_rate": 9.663526244952895e-06, + "loss": 1.084, + "step": 718 + }, + { + "epoch": 0.05810218388250268, + "grad_norm": 3.183809757232666, + "learning_rate": 9.67698519515478e-06, + "loss": 1.0381, + "step": 719 + }, + { + "epoch": 0.05818299359583022, + "grad_norm": 3.398799419403076, + "learning_rate": 9.690444145356663e-06, + "loss": 1.1571, + "step": 720 + }, + { + "epoch": 0.05826380330915776, + "grad_norm": 3.447977066040039, + "learning_rate": 9.703903095558547e-06, + "loss": 1.0165, + "step": 721 + }, + { + "epoch": 0.0583446130224853, + "grad_norm": 3.278087615966797, + "learning_rate": 9.71736204576043e-06, + "loss": 1.027, + "step": 722 + }, + { + "epoch": 0.058425422735812846, + "grad_norm": 3.1923909187316895, + "learning_rate": 9.730820995962315e-06, + "loss": 0.9763, + "step": 723 + }, + { + "epoch": 0.05850623244914039, + "grad_norm": 3.14595890045166, + "learning_rate": 9.7442799461642e-06, + "loss": 1.0987, + "step": 724 + }, + { + "epoch": 0.05858704216246793, + "grad_norm": 2.855151653289795, + "learning_rate": 9.757738896366085e-06, + "loss": 1.096, + "step": 725 + }, + { + "epoch": 0.05866785187579547, + "grad_norm": 3.2433483600616455, + "learning_rate": 9.771197846567968e-06, + "loss": 1.0199, + "step": 726 + }, + { + "epoch": 0.058748661589123015, + "grad_norm": 3.5929629802703857, + "learning_rate": 9.784656796769852e-06, + "loss": 1.0424, + "step": 727 + }, + { + "epoch": 0.05882947130245055, + "grad_norm": 3.2948145866394043, + "learning_rate": 9.798115746971737e-06, + "loss": 0.9823, + "step": 728 + }, + { + "epoch": 0.058910281015778096, + "grad_norm": 3.665013313293457, + "learning_rate": 9.811574697173622e-06, + "loss": 1.0612, + "step": 729 + }, + { + "epoch": 0.05899109072910564, + "grad_norm": 3.3471338748931885, + "learning_rate": 9.825033647375506e-06, + "loss": 1.0525, + "step": 730 + }, + { + "epoch": 0.05907190044243318, + "grad_norm": 3.174917697906494, + "learning_rate": 9.83849259757739e-06, + "loss": 1.1074, + "step": 731 + }, + { + "epoch": 0.05915271015576072, + "grad_norm": 3.52289080619812, + "learning_rate": 9.851951547779274e-06, + "loss": 1.0424, + "step": 732 + }, + { + "epoch": 0.059233519869088265, + "grad_norm": 2.875234603881836, + "learning_rate": 9.865410497981159e-06, + "loss": 0.8862, + "step": 733 + }, + { + "epoch": 0.05931432958241581, + "grad_norm": 3.1018879413604736, + "learning_rate": 9.878869448183044e-06, + "loss": 0.9249, + "step": 734 + }, + { + "epoch": 0.059395139295743346, + "grad_norm": 2.894240617752075, + "learning_rate": 9.892328398384927e-06, + "loss": 1.0293, + "step": 735 + }, + { + "epoch": 0.05947594900907089, + "grad_norm": 2.658221960067749, + "learning_rate": 9.905787348586811e-06, + "loss": 1.024, + "step": 736 + }, + { + "epoch": 0.059556758722398434, + "grad_norm": 3.6054623126983643, + "learning_rate": 9.919246298788694e-06, + "loss": 1.0726, + "step": 737 + }, + { + "epoch": 0.05963756843572597, + "grad_norm": 3.583958148956299, + "learning_rate": 9.932705248990579e-06, + "loss": 1.0371, + "step": 738 + }, + { + "epoch": 0.059718378149053515, + "grad_norm": 3.387402296066284, + "learning_rate": 9.946164199192464e-06, + "loss": 1.0455, + "step": 739 + }, + { + "epoch": 0.05979918786238106, + "grad_norm": 3.2275664806365967, + "learning_rate": 9.959623149394348e-06, + "loss": 1.0076, + "step": 740 + }, + { + "epoch": 0.0598799975757086, + "grad_norm": 2.8620572090148926, + "learning_rate": 9.973082099596231e-06, + "loss": 0.9865, + "step": 741 + }, + { + "epoch": 0.05996080728903614, + "grad_norm": 2.740565776824951, + "learning_rate": 9.986541049798116e-06, + "loss": 0.9939, + "step": 742 + }, + { + "epoch": 0.060041617002363684, + "grad_norm": 2.7126121520996094, + "learning_rate": 1e-05, + "loss": 1.1355, + "step": 743 + }, + { + "epoch": 0.06012242671569123, + "grad_norm": 3.2582151889801025, + "learning_rate": 9.99999995718102e-06, + "loss": 1.0385, + "step": 744 + }, + { + "epoch": 0.060203236429018765, + "grad_norm": 2.884678840637207, + "learning_rate": 9.999999828724076e-06, + "loss": 0.9873, + "step": 745 + }, + { + "epoch": 0.06028404614234631, + "grad_norm": 2.7058234214782715, + "learning_rate": 9.999999614629171e-06, + "loss": 1.057, + "step": 746 + }, + { + "epoch": 0.06036485585567385, + "grad_norm": 3.1888225078582764, + "learning_rate": 9.999999314896312e-06, + "loss": 1.0373, + "step": 747 + }, + { + "epoch": 0.0604456655690014, + "grad_norm": 3.0204105377197266, + "learning_rate": 9.9999989295255e-06, + "loss": 1.0303, + "step": 748 + }, + { + "epoch": 0.060526475282328934, + "grad_norm": 2.9828920364379883, + "learning_rate": 9.999998458516744e-06, + "loss": 0.9333, + "step": 749 + }, + { + "epoch": 0.06060728499565648, + "grad_norm": 3.6406376361846924, + "learning_rate": 9.999997901870051e-06, + "loss": 1.0704, + "step": 750 + }, + { + "epoch": 0.06068809470898402, + "grad_norm": 3.295814037322998, + "learning_rate": 9.999997259585433e-06, + "loss": 1.0499, + "step": 751 + }, + { + "epoch": 0.06076890442231156, + "grad_norm": 3.1430790424346924, + "learning_rate": 9.999996531662896e-06, + "loss": 1.0707, + "step": 752 + }, + { + "epoch": 0.0608497141356391, + "grad_norm": 2.681826591491699, + "learning_rate": 9.999995718102456e-06, + "loss": 1.0735, + "step": 753 + }, + { + "epoch": 0.06093052384896665, + "grad_norm": 3.966184616088867, + "learning_rate": 9.999994818904128e-06, + "loss": 0.9832, + "step": 754 + }, + { + "epoch": 0.061011333562294184, + "grad_norm": 2.9914650917053223, + "learning_rate": 9.999993834067924e-06, + "loss": 1.0851, + "step": 755 + }, + { + "epoch": 0.06109214327562173, + "grad_norm": 3.3987998962402344, + "learning_rate": 9.999992763593863e-06, + "loss": 1.1394, + "step": 756 + }, + { + "epoch": 0.06117295298894927, + "grad_norm": 3.3105270862579346, + "learning_rate": 9.999991607481963e-06, + "loss": 1.0244, + "step": 757 + }, + { + "epoch": 0.061253762702276816, + "grad_norm": 3.0653975009918213, + "learning_rate": 9.999990365732244e-06, + "loss": 1.0862, + "step": 758 + }, + { + "epoch": 0.06133457241560435, + "grad_norm": 3.077054262161255, + "learning_rate": 9.999989038344727e-06, + "loss": 0.9286, + "step": 759 + }, + { + "epoch": 0.0614153821289319, + "grad_norm": 3.343345880508423, + "learning_rate": 9.999987625319436e-06, + "loss": 1.1721, + "step": 760 + }, + { + "epoch": 0.06149619184225944, + "grad_norm": 3.037191152572632, + "learning_rate": 9.999986126656392e-06, + "loss": 0.9661, + "step": 761 + }, + { + "epoch": 0.06157700155558698, + "grad_norm": 3.292630195617676, + "learning_rate": 9.999984542355623e-06, + "loss": 1.0473, + "step": 762 + }, + { + "epoch": 0.06165781126891452, + "grad_norm": 3.0494720935821533, + "learning_rate": 9.999982872417156e-06, + "loss": 1.1973, + "step": 763 + }, + { + "epoch": 0.061738620982242066, + "grad_norm": 2.7087042331695557, + "learning_rate": 9.99998111684102e-06, + "loss": 1.1034, + "step": 764 + }, + { + "epoch": 0.06181943069556961, + "grad_norm": 3.176063299179077, + "learning_rate": 9.999979275627243e-06, + "loss": 1.1785, + "step": 765 + }, + { + "epoch": 0.06190024040889715, + "grad_norm": 2.815727949142456, + "learning_rate": 9.99997734877586e-06, + "loss": 1.1357, + "step": 766 + }, + { + "epoch": 0.06198105012222469, + "grad_norm": 4.608881950378418, + "learning_rate": 9.9999753362869e-06, + "loss": 1.0143, + "step": 767 + }, + { + "epoch": 0.062061859835552236, + "grad_norm": 3.1536128520965576, + "learning_rate": 9.999973238160401e-06, + "loss": 1.1959, + "step": 768 + }, + { + "epoch": 0.06214266954887977, + "grad_norm": 2.6403682231903076, + "learning_rate": 9.999971054396396e-06, + "loss": 0.9694, + "step": 769 + }, + { + "epoch": 0.06222347926220732, + "grad_norm": 2.8894197940826416, + "learning_rate": 9.999968784994924e-06, + "loss": 1.0606, + "step": 770 + }, + { + "epoch": 0.06230428897553486, + "grad_norm": 3.4575185775756836, + "learning_rate": 9.999966429956026e-06, + "loss": 1.0327, + "step": 771 + }, + { + "epoch": 0.062385098688862405, + "grad_norm": 3.1994760036468506, + "learning_rate": 9.999963989279737e-06, + "loss": 0.9079, + "step": 772 + }, + { + "epoch": 0.06246590840218994, + "grad_norm": 2.893705129623413, + "learning_rate": 9.999961462966104e-06, + "loss": 1.0912, + "step": 773 + }, + { + "epoch": 0.06254671811551749, + "grad_norm": 3.508070230484009, + "learning_rate": 9.999958851015165e-06, + "loss": 1.0183, + "step": 774 + }, + { + "epoch": 0.06262752782884502, + "grad_norm": 3.0492401123046875, + "learning_rate": 9.999956153426971e-06, + "loss": 1.024, + "step": 775 + }, + { + "epoch": 0.06270833754217257, + "grad_norm": 3.115318775177002, + "learning_rate": 9.999953370201564e-06, + "loss": 1.0539, + "step": 776 + }, + { + "epoch": 0.06278914725550011, + "grad_norm": 2.9921085834503174, + "learning_rate": 9.999950501338992e-06, + "loss": 0.8383, + "step": 777 + }, + { + "epoch": 0.06286995696882765, + "grad_norm": 3.039691209793091, + "learning_rate": 9.999947546839306e-06, + "loss": 1.0069, + "step": 778 + }, + { + "epoch": 0.0629507666821552, + "grad_norm": 3.0498998165130615, + "learning_rate": 9.999944506702554e-06, + "loss": 0.9399, + "step": 779 + }, + { + "epoch": 0.06303157639548274, + "grad_norm": 2.6876518726348877, + "learning_rate": 9.99994138092879e-06, + "loss": 1.1121, + "step": 780 + }, + { + "epoch": 0.06311238610881027, + "grad_norm": 3.357180595397949, + "learning_rate": 9.999938169518067e-06, + "loss": 1.0398, + "step": 781 + }, + { + "epoch": 0.06319319582213782, + "grad_norm": 3.427842140197754, + "learning_rate": 9.99993487247044e-06, + "loss": 1.0485, + "step": 782 + }, + { + "epoch": 0.06327400553546536, + "grad_norm": 3.3528783321380615, + "learning_rate": 9.999931489785965e-06, + "loss": 0.9594, + "step": 783 + }, + { + "epoch": 0.06335481524879291, + "grad_norm": 3.1690218448638916, + "learning_rate": 9.999928021464701e-06, + "loss": 1.0566, + "step": 784 + }, + { + "epoch": 0.06343562496212045, + "grad_norm": 3.1988813877105713, + "learning_rate": 9.999924467506707e-06, + "loss": 1.086, + "step": 785 + }, + { + "epoch": 0.06351643467544799, + "grad_norm": 2.7500905990600586, + "learning_rate": 9.999920827912044e-06, + "loss": 1.0032, + "step": 786 + }, + { + "epoch": 0.06359724438877554, + "grad_norm": 3.1908798217773438, + "learning_rate": 9.999917102680773e-06, + "loss": 1.1315, + "step": 787 + }, + { + "epoch": 0.06367805410210307, + "grad_norm": 3.889151096343994, + "learning_rate": 9.99991329181296e-06, + "loss": 1.0072, + "step": 788 + }, + { + "epoch": 0.06375886381543061, + "grad_norm": 2.5109405517578125, + "learning_rate": 9.999909395308669e-06, + "loss": 1.0633, + "step": 789 + }, + { + "epoch": 0.06383967352875816, + "grad_norm": 3.0104565620422363, + "learning_rate": 9.999905413167965e-06, + "loss": 1.0833, + "step": 790 + }, + { + "epoch": 0.0639204832420857, + "grad_norm": 2.7210376262664795, + "learning_rate": 9.99990134539092e-06, + "loss": 1.0066, + "step": 791 + }, + { + "epoch": 0.06400129295541324, + "grad_norm": 3.518920421600342, + "learning_rate": 9.9998971919776e-06, + "loss": 1.0236, + "step": 792 + }, + { + "epoch": 0.06408210266874079, + "grad_norm": 2.496920108795166, + "learning_rate": 9.99989295292808e-06, + "loss": 1.133, + "step": 793 + }, + { + "epoch": 0.06416291238206832, + "grad_norm": 3.115713357925415, + "learning_rate": 9.999888628242429e-06, + "loss": 1.0996, + "step": 794 + }, + { + "epoch": 0.06424372209539586, + "grad_norm": 2.825153112411499, + "learning_rate": 9.999884217920724e-06, + "loss": 1.078, + "step": 795 + }, + { + "epoch": 0.06432453180872341, + "grad_norm": 3.8622965812683105, + "learning_rate": 9.999879721963037e-06, + "loss": 1.0097, + "step": 796 + }, + { + "epoch": 0.06440534152205095, + "grad_norm": 3.4580605030059814, + "learning_rate": 9.999875140369448e-06, + "loss": 1.0305, + "step": 797 + }, + { + "epoch": 0.06448615123537849, + "grad_norm": 2.898871421813965, + "learning_rate": 9.999870473140036e-06, + "loss": 1.0498, + "step": 798 + }, + { + "epoch": 0.06456696094870604, + "grad_norm": 3.083414077758789, + "learning_rate": 9.999865720274877e-06, + "loss": 1.1388, + "step": 799 + }, + { + "epoch": 0.06464777066203357, + "grad_norm": 2.934309720993042, + "learning_rate": 9.999860881774057e-06, + "loss": 1.0504, + "step": 800 + }, + { + "epoch": 0.06472858037536112, + "grad_norm": 2.991290807723999, + "learning_rate": 9.999855957637657e-06, + "loss": 1.1315, + "step": 801 + }, + { + "epoch": 0.06480939008868866, + "grad_norm": 3.165949583053589, + "learning_rate": 9.999850947865759e-06, + "loss": 1.0564, + "step": 802 + }, + { + "epoch": 0.0648901998020162, + "grad_norm": 2.813497304916382, + "learning_rate": 9.999845852458453e-06, + "loss": 1.1347, + "step": 803 + }, + { + "epoch": 0.06497100951534375, + "grad_norm": 2.7869584560394287, + "learning_rate": 9.99984067141582e-06, + "loss": 1.029, + "step": 804 + }, + { + "epoch": 0.06505181922867129, + "grad_norm": 3.336273670196533, + "learning_rate": 9.99983540473796e-06, + "loss": 1.0999, + "step": 805 + }, + { + "epoch": 0.06513262894199882, + "grad_norm": 2.916585683822632, + "learning_rate": 9.99983005242495e-06, + "loss": 0.9567, + "step": 806 + }, + { + "epoch": 0.06521343865532638, + "grad_norm": 3.108661413192749, + "learning_rate": 9.99982461447689e-06, + "loss": 1.1269, + "step": 807 + }, + { + "epoch": 0.06529424836865391, + "grad_norm": 3.390730857849121, + "learning_rate": 9.999819090893871e-06, + "loss": 1.0214, + "step": 808 + }, + { + "epoch": 0.06537505808198145, + "grad_norm": 3.4384777545928955, + "learning_rate": 9.999813481675988e-06, + "loss": 1.0744, + "step": 809 + }, + { + "epoch": 0.065455867795309, + "grad_norm": 2.9382858276367188, + "learning_rate": 9.999807786823336e-06, + "loss": 0.938, + "step": 810 + }, + { + "epoch": 0.06553667750863654, + "grad_norm": 2.9017443656921387, + "learning_rate": 9.999802006336012e-06, + "loss": 0.964, + "step": 811 + }, + { + "epoch": 0.06561748722196407, + "grad_norm": 3.442960500717163, + "learning_rate": 9.999796140214117e-06, + "loss": 0.9884, + "step": 812 + }, + { + "epoch": 0.06569829693529163, + "grad_norm": 3.5074703693389893, + "learning_rate": 9.999790188457753e-06, + "loss": 1.1886, + "step": 813 + }, + { + "epoch": 0.06577910664861916, + "grad_norm": 2.902418613433838, + "learning_rate": 9.999784151067017e-06, + "loss": 0.9786, + "step": 814 + }, + { + "epoch": 0.0658599163619467, + "grad_norm": 3.5503032207489014, + "learning_rate": 9.999778028042015e-06, + "loss": 1.1564, + "step": 815 + }, + { + "epoch": 0.06594072607527425, + "grad_norm": 3.613726854324341, + "learning_rate": 9.999771819382854e-06, + "loss": 0.8653, + "step": 816 + }, + { + "epoch": 0.06602153578860179, + "grad_norm": 3.3173012733459473, + "learning_rate": 9.999765525089635e-06, + "loss": 1.0776, + "step": 817 + }, + { + "epoch": 0.06610234550192934, + "grad_norm": 2.9546091556549072, + "learning_rate": 9.99975914516247e-06, + "loss": 1.106, + "step": 818 + }, + { + "epoch": 0.06618315521525688, + "grad_norm": 2.750858783721924, + "learning_rate": 9.999752679601468e-06, + "loss": 1.0307, + "step": 819 + }, + { + "epoch": 0.06626396492858441, + "grad_norm": 3.2712745666503906, + "learning_rate": 9.99974612840674e-06, + "loss": 1.1934, + "step": 820 + }, + { + "epoch": 0.06634477464191196, + "grad_norm": 3.2843518257141113, + "learning_rate": 9.999739491578395e-06, + "loss": 1.0981, + "step": 821 + }, + { + "epoch": 0.0664255843552395, + "grad_norm": 3.065136671066284, + "learning_rate": 9.99973276911655e-06, + "loss": 1.0886, + "step": 822 + }, + { + "epoch": 0.06650639406856704, + "grad_norm": 3.0093564987182617, + "learning_rate": 9.99972596102132e-06, + "loss": 0.9862, + "step": 823 + }, + { + "epoch": 0.06658720378189459, + "grad_norm": 2.9246206283569336, + "learning_rate": 9.99971906729282e-06, + "loss": 0.9095, + "step": 824 + }, + { + "epoch": 0.06666801349522213, + "grad_norm": 2.747081995010376, + "learning_rate": 9.999712087931168e-06, + "loss": 1.0432, + "step": 825 + }, + { + "epoch": 0.06674882320854966, + "grad_norm": 3.6614139080047607, + "learning_rate": 9.999705022936484e-06, + "loss": 0.9202, + "step": 826 + }, + { + "epoch": 0.06682963292187721, + "grad_norm": 2.832754611968994, + "learning_rate": 9.999697872308892e-06, + "loss": 1.1262, + "step": 827 + }, + { + "epoch": 0.06691044263520475, + "grad_norm": 3.048327684402466, + "learning_rate": 9.999690636048508e-06, + "loss": 1.0015, + "step": 828 + }, + { + "epoch": 0.06699125234853229, + "grad_norm": 2.694953680038452, + "learning_rate": 9.999683314155462e-06, + "loss": 1.0099, + "step": 829 + }, + { + "epoch": 0.06707206206185984, + "grad_norm": 3.4013941287994385, + "learning_rate": 9.999675906629876e-06, + "loss": 1.0881, + "step": 830 + }, + { + "epoch": 0.06715287177518738, + "grad_norm": 3.1940932273864746, + "learning_rate": 9.999668413471878e-06, + "loss": 1.0954, + "step": 831 + }, + { + "epoch": 0.06723368148851493, + "grad_norm": 2.710059404373169, + "learning_rate": 9.999660834681597e-06, + "loss": 1.0692, + "step": 832 + }, + { + "epoch": 0.06731449120184246, + "grad_norm": 4.080317974090576, + "learning_rate": 9.99965317025916e-06, + "loss": 1.0583, + "step": 833 + }, + { + "epoch": 0.06739530091517, + "grad_norm": 3.435955762863159, + "learning_rate": 9.999645420204703e-06, + "loss": 1.1735, + "step": 834 + }, + { + "epoch": 0.06747611062849755, + "grad_norm": 3.6010239124298096, + "learning_rate": 9.999637584518356e-06, + "loss": 1.0486, + "step": 835 + }, + { + "epoch": 0.06755692034182509, + "grad_norm": 2.755072832107544, + "learning_rate": 9.999629663200253e-06, + "loss": 1.1078, + "step": 836 + }, + { + "epoch": 0.06763773005515263, + "grad_norm": 2.9218761920928955, + "learning_rate": 9.999621656250528e-06, + "loss": 0.9966, + "step": 837 + }, + { + "epoch": 0.06771853976848018, + "grad_norm": 2.929852247238159, + "learning_rate": 9.999613563669322e-06, + "loss": 1.0675, + "step": 838 + }, + { + "epoch": 0.06779934948180771, + "grad_norm": 2.6583492755889893, + "learning_rate": 9.999605385456771e-06, + "loss": 1.0421, + "step": 839 + }, + { + "epoch": 0.06788015919513525, + "grad_norm": 3.123640537261963, + "learning_rate": 9.999597121613016e-06, + "loss": 1.0543, + "step": 840 + }, + { + "epoch": 0.0679609689084628, + "grad_norm": 3.2733051776885986, + "learning_rate": 9.9995887721382e-06, + "loss": 1.0589, + "step": 841 + }, + { + "epoch": 0.06804177862179034, + "grad_norm": 3.10587215423584, + "learning_rate": 9.999580337032462e-06, + "loss": 1.0889, + "step": 842 + }, + { + "epoch": 0.06812258833511788, + "grad_norm": 3.3167688846588135, + "learning_rate": 9.99957181629595e-06, + "loss": 1.1007, + "step": 843 + }, + { + "epoch": 0.06820339804844543, + "grad_norm": 3.340961456298828, + "learning_rate": 9.999563209928807e-06, + "loss": 1.0252, + "step": 844 + }, + { + "epoch": 0.06828420776177296, + "grad_norm": 3.1415212154388428, + "learning_rate": 9.999554517931185e-06, + "loss": 1.1617, + "step": 845 + }, + { + "epoch": 0.0683650174751005, + "grad_norm": 2.381803035736084, + "learning_rate": 9.999545740303228e-06, + "loss": 1.071, + "step": 846 + }, + { + "epoch": 0.06844582718842805, + "grad_norm": 3.0057709217071533, + "learning_rate": 9.999536877045088e-06, + "loss": 1.085, + "step": 847 + }, + { + "epoch": 0.06852663690175559, + "grad_norm": 3.0332605838775635, + "learning_rate": 9.99952792815692e-06, + "loss": 0.9406, + "step": 848 + }, + { + "epoch": 0.06860744661508314, + "grad_norm": 3.0253756046295166, + "learning_rate": 9.99951889363887e-06, + "loss": 1.0439, + "step": 849 + }, + { + "epoch": 0.06868825632841068, + "grad_norm": 2.754026174545288, + "learning_rate": 9.999509773491102e-06, + "loss": 1.1399, + "step": 850 + }, + { + "epoch": 0.06876906604173821, + "grad_norm": 3.0445125102996826, + "learning_rate": 9.999500567713765e-06, + "loss": 1.1194, + "step": 851 + }, + { + "epoch": 0.06884987575506576, + "grad_norm": 2.929490566253662, + "learning_rate": 9.999491276307018e-06, + "loss": 1.0489, + "step": 852 + }, + { + "epoch": 0.0689306854683933, + "grad_norm": 3.9951939582824707, + "learning_rate": 9.999481899271024e-06, + "loss": 1.1572, + "step": 853 + }, + { + "epoch": 0.06901149518172084, + "grad_norm": 3.125185251235962, + "learning_rate": 9.99947243660594e-06, + "loss": 1.0289, + "step": 854 + }, + { + "epoch": 0.06909230489504839, + "grad_norm": 3.0805680751800537, + "learning_rate": 9.999462888311928e-06, + "loss": 1.0773, + "step": 855 + }, + { + "epoch": 0.06917311460837593, + "grad_norm": 3.3032066822052, + "learning_rate": 9.999453254389152e-06, + "loss": 1.1828, + "step": 856 + }, + { + "epoch": 0.06925392432170346, + "grad_norm": 3.104738712310791, + "learning_rate": 9.999443534837778e-06, + "loss": 1.1429, + "step": 857 + }, + { + "epoch": 0.06933473403503101, + "grad_norm": 3.1322181224823, + "learning_rate": 9.999433729657972e-06, + "loss": 1.0625, + "step": 858 + }, + { + "epoch": 0.06941554374835855, + "grad_norm": 3.1211037635803223, + "learning_rate": 9.999423838849902e-06, + "loss": 1.1511, + "step": 859 + }, + { + "epoch": 0.06949635346168609, + "grad_norm": 2.916612148284912, + "learning_rate": 9.999413862413738e-06, + "loss": 1.0347, + "step": 860 + }, + { + "epoch": 0.06957716317501364, + "grad_norm": 2.9557623863220215, + "learning_rate": 9.999403800349649e-06, + "loss": 1.011, + "step": 861 + }, + { + "epoch": 0.06965797288834118, + "grad_norm": 3.3609611988067627, + "learning_rate": 9.999393652657809e-06, + "loss": 0.9942, + "step": 862 + }, + { + "epoch": 0.06973878260166871, + "grad_norm": 3.2939343452453613, + "learning_rate": 9.999383419338392e-06, + "loss": 0.9848, + "step": 863 + }, + { + "epoch": 0.06981959231499626, + "grad_norm": 3.0276999473571777, + "learning_rate": 9.99937310039157e-06, + "loss": 1.013, + "step": 864 + }, + { + "epoch": 0.0699004020283238, + "grad_norm": 3.5387251377105713, + "learning_rate": 9.999362695817524e-06, + "loss": 1.0187, + "step": 865 + }, + { + "epoch": 0.06998121174165135, + "grad_norm": 2.804373025894165, + "learning_rate": 9.999352205616431e-06, + "loss": 1.0709, + "step": 866 + }, + { + "epoch": 0.07006202145497889, + "grad_norm": 2.9923603534698486, + "learning_rate": 9.999341629788471e-06, + "loss": 1.1802, + "step": 867 + }, + { + "epoch": 0.07014283116830643, + "grad_norm": 3.1994659900665283, + "learning_rate": 9.999330968333823e-06, + "loss": 1.071, + "step": 868 + }, + { + "epoch": 0.07022364088163398, + "grad_norm": 2.899416923522949, + "learning_rate": 9.99932022125267e-06, + "loss": 1.2746, + "step": 869 + }, + { + "epoch": 0.07030445059496152, + "grad_norm": 3.0050110816955566, + "learning_rate": 9.999309388545198e-06, + "loss": 1.0684, + "step": 870 + }, + { + "epoch": 0.07038526030828905, + "grad_norm": 2.437838315963745, + "learning_rate": 9.999298470211591e-06, + "loss": 1.0498, + "step": 871 + }, + { + "epoch": 0.0704660700216166, + "grad_norm": 3.2453885078430176, + "learning_rate": 9.999287466252037e-06, + "loss": 0.9166, + "step": 872 + }, + { + "epoch": 0.07054687973494414, + "grad_norm": 2.980861186981201, + "learning_rate": 9.999276376666724e-06, + "loss": 1.1694, + "step": 873 + }, + { + "epoch": 0.07062768944827168, + "grad_norm": 3.631786346435547, + "learning_rate": 9.999265201455841e-06, + "loss": 1.01, + "step": 874 + }, + { + "epoch": 0.07070849916159923, + "grad_norm": 3.124498128890991, + "learning_rate": 9.999253940619582e-06, + "loss": 1.0673, + "step": 875 + }, + { + "epoch": 0.07078930887492677, + "grad_norm": 3.792041301727295, + "learning_rate": 9.999242594158136e-06, + "loss": 0.96, + "step": 876 + }, + { + "epoch": 0.0708701185882543, + "grad_norm": 3.1600944995880127, + "learning_rate": 9.999231162071701e-06, + "loss": 1.1406, + "step": 877 + }, + { + "epoch": 0.07095092830158185, + "grad_norm": 3.425360918045044, + "learning_rate": 9.999219644360471e-06, + "loss": 1.0799, + "step": 878 + }, + { + "epoch": 0.07103173801490939, + "grad_norm": 3.1234705448150635, + "learning_rate": 9.999208041024644e-06, + "loss": 1.1135, + "step": 879 + }, + { + "epoch": 0.07111254772823693, + "grad_norm": 3.099431037902832, + "learning_rate": 9.999196352064418e-06, + "loss": 0.9932, + "step": 880 + }, + { + "epoch": 0.07119335744156448, + "grad_norm": 4.025658130645752, + "learning_rate": 9.999184577479994e-06, + "loss": 0.9866, + "step": 881 + }, + { + "epoch": 0.07127416715489202, + "grad_norm": 3.110438108444214, + "learning_rate": 9.999172717271573e-06, + "loss": 0.9624, + "step": 882 + }, + { + "epoch": 0.07135497686821957, + "grad_norm": 2.9473679065704346, + "learning_rate": 9.99916077143936e-06, + "loss": 0.9861, + "step": 883 + }, + { + "epoch": 0.0714357865815471, + "grad_norm": 3.1517536640167236, + "learning_rate": 9.999148739983555e-06, + "loss": 1.068, + "step": 884 + }, + { + "epoch": 0.07151659629487464, + "grad_norm": 3.280348062515259, + "learning_rate": 9.99913662290437e-06, + "loss": 1.0738, + "step": 885 + }, + { + "epoch": 0.07159740600820219, + "grad_norm": 2.6466808319091797, + "learning_rate": 9.999124420202006e-06, + "loss": 1.0165, + "step": 886 + }, + { + "epoch": 0.07167821572152973, + "grad_norm": 3.676473617553711, + "learning_rate": 9.999112131876679e-06, + "loss": 1.0023, + "step": 887 + }, + { + "epoch": 0.07175902543485727, + "grad_norm": 3.317379951477051, + "learning_rate": 9.999099757928594e-06, + "loss": 1.0467, + "step": 888 + }, + { + "epoch": 0.07183983514818482, + "grad_norm": 3.21604061126709, + "learning_rate": 9.999087298357965e-06, + "loss": 0.9958, + "step": 889 + }, + { + "epoch": 0.07192064486151235, + "grad_norm": 3.962308168411255, + "learning_rate": 9.999074753165006e-06, + "loss": 1.0039, + "step": 890 + }, + { + "epoch": 0.07200145457483989, + "grad_norm": 3.2292165756225586, + "learning_rate": 9.999062122349931e-06, + "loss": 1.0584, + "step": 891 + }, + { + "epoch": 0.07208226428816744, + "grad_norm": 3.0409646034240723, + "learning_rate": 9.999049405912957e-06, + "loss": 1.0484, + "step": 892 + }, + { + "epoch": 0.07216307400149498, + "grad_norm": 2.9035871028900146, + "learning_rate": 9.999036603854302e-06, + "loss": 1.1937, + "step": 893 + }, + { + "epoch": 0.07224388371482252, + "grad_norm": 3.519012451171875, + "learning_rate": 9.999023716174183e-06, + "loss": 0.9877, + "step": 894 + }, + { + "epoch": 0.07232469342815007, + "grad_norm": 2.935112953186035, + "learning_rate": 9.999010742872824e-06, + "loss": 1.0983, + "step": 895 + }, + { + "epoch": 0.0724055031414776, + "grad_norm": 2.5634124279022217, + "learning_rate": 9.998997683950445e-06, + "loss": 1.0543, + "step": 896 + }, + { + "epoch": 0.07248631285480515, + "grad_norm": 3.281818389892578, + "learning_rate": 9.99898453940727e-06, + "loss": 1.0396, + "step": 897 + }, + { + "epoch": 0.07256712256813269, + "grad_norm": 3.64463472366333, + "learning_rate": 9.998971309243524e-06, + "loss": 1.1441, + "step": 898 + }, + { + "epoch": 0.07264793228146023, + "grad_norm": 2.7865548133850098, + "learning_rate": 9.998957993459436e-06, + "loss": 1.1753, + "step": 899 + }, + { + "epoch": 0.07272874199478778, + "grad_norm": 3.0096328258514404, + "learning_rate": 9.998944592055231e-06, + "loss": 0.9944, + "step": 900 + }, + { + "epoch": 0.07280955170811532, + "grad_norm": 2.683293581008911, + "learning_rate": 9.998931105031138e-06, + "loss": 1.1411, + "step": 901 + }, + { + "epoch": 0.07289036142144285, + "grad_norm": 3.6463422775268555, + "learning_rate": 9.99891753238739e-06, + "loss": 1.0012, + "step": 902 + }, + { + "epoch": 0.0729711711347704, + "grad_norm": 2.827143669128418, + "learning_rate": 9.998903874124222e-06, + "loss": 1.217, + "step": 903 + }, + { + "epoch": 0.07305198084809794, + "grad_norm": 3.3450632095336914, + "learning_rate": 9.998890130241862e-06, + "loss": 0.903, + "step": 904 + }, + { + "epoch": 0.07313279056142548, + "grad_norm": 2.957606792449951, + "learning_rate": 9.99887630074055e-06, + "loss": 1.077, + "step": 905 + }, + { + "epoch": 0.07321360027475303, + "grad_norm": 3.0255279541015625, + "learning_rate": 9.99886238562052e-06, + "loss": 0.9448, + "step": 906 + }, + { + "epoch": 0.07329440998808057, + "grad_norm": 3.0869853496551514, + "learning_rate": 9.998848384882011e-06, + "loss": 1.0614, + "step": 907 + }, + { + "epoch": 0.0733752197014081, + "grad_norm": 3.3012335300445557, + "learning_rate": 9.998834298525266e-06, + "loss": 1.0049, + "step": 908 + }, + { + "epoch": 0.07345602941473565, + "grad_norm": 3.3511605262756348, + "learning_rate": 9.998820126550522e-06, + "loss": 1.0177, + "step": 909 + }, + { + "epoch": 0.07353683912806319, + "grad_norm": 3.8056230545043945, + "learning_rate": 9.998805868958024e-06, + "loss": 1.1596, + "step": 910 + }, + { + "epoch": 0.07361764884139073, + "grad_norm": 3.6114351749420166, + "learning_rate": 9.998791525748014e-06, + "loss": 0.986, + "step": 911 + }, + { + "epoch": 0.07369845855471828, + "grad_norm": 3.1725072860717773, + "learning_rate": 9.998777096920742e-06, + "loss": 1.0334, + "step": 912 + }, + { + "epoch": 0.07377926826804582, + "grad_norm": 3.106642007827759, + "learning_rate": 9.99876258247645e-06, + "loss": 0.9502, + "step": 913 + }, + { + "epoch": 0.07386007798137337, + "grad_norm": 3.1275382041931152, + "learning_rate": 9.99874798241539e-06, + "loss": 0.9597, + "step": 914 + }, + { + "epoch": 0.0739408876947009, + "grad_norm": 3.19773006439209, + "learning_rate": 9.998733296737813e-06, + "loss": 1.1206, + "step": 915 + }, + { + "epoch": 0.07402169740802844, + "grad_norm": 3.3579866886138916, + "learning_rate": 9.998718525443965e-06, + "loss": 1.258, + "step": 916 + }, + { + "epoch": 0.07410250712135599, + "grad_norm": 3.145287275314331, + "learning_rate": 9.998703668534104e-06, + "loss": 1.1013, + "step": 917 + }, + { + "epoch": 0.07418331683468353, + "grad_norm": 2.775991678237915, + "learning_rate": 9.998688726008484e-06, + "loss": 1.1676, + "step": 918 + }, + { + "epoch": 0.07426412654801107, + "grad_norm": 2.732689619064331, + "learning_rate": 9.99867369786736e-06, + "loss": 1.0491, + "step": 919 + }, + { + "epoch": 0.07434493626133862, + "grad_norm": 3.073866605758667, + "learning_rate": 9.998658584110988e-06, + "loss": 1.1827, + "step": 920 + }, + { + "epoch": 0.07442574597466615, + "grad_norm": 2.9492454528808594, + "learning_rate": 9.998643384739628e-06, + "loss": 1.0028, + "step": 921 + }, + { + "epoch": 0.07450655568799369, + "grad_norm": 2.6749632358551025, + "learning_rate": 9.998628099753542e-06, + "loss": 0.9757, + "step": 922 + }, + { + "epoch": 0.07458736540132124, + "grad_norm": 2.9336817264556885, + "learning_rate": 9.99861272915299e-06, + "loss": 1.0409, + "step": 923 + }, + { + "epoch": 0.07466817511464878, + "grad_norm": 2.6571335792541504, + "learning_rate": 9.998597272938235e-06, + "loss": 0.9513, + "step": 924 + }, + { + "epoch": 0.07474898482797632, + "grad_norm": 2.5291669368743896, + "learning_rate": 9.998581731109542e-06, + "loss": 1.1042, + "step": 925 + }, + { + "epoch": 0.07482979454130387, + "grad_norm": 2.519695281982422, + "learning_rate": 9.998566103667178e-06, + "loss": 1.0565, + "step": 926 + }, + { + "epoch": 0.0749106042546314, + "grad_norm": 3.283141613006592, + "learning_rate": 9.998550390611411e-06, + "loss": 1.1389, + "step": 927 + }, + { + "epoch": 0.07499141396795894, + "grad_norm": 3.3262479305267334, + "learning_rate": 9.99853459194251e-06, + "loss": 0.9328, + "step": 928 + }, + { + "epoch": 0.07507222368128649, + "grad_norm": 2.841034173965454, + "learning_rate": 9.998518707660742e-06, + "loss": 1.0963, + "step": 929 + }, + { + "epoch": 0.07515303339461403, + "grad_norm": 2.4933481216430664, + "learning_rate": 9.998502737766385e-06, + "loss": 1.1025, + "step": 930 + }, + { + "epoch": 0.07523384310794158, + "grad_norm": 3.578246593475342, + "learning_rate": 9.998486682259707e-06, + "loss": 1.0138, + "step": 931 + }, + { + "epoch": 0.07531465282126912, + "grad_norm": 3.591609001159668, + "learning_rate": 9.998470541140988e-06, + "loss": 0.9914, + "step": 932 + }, + { + "epoch": 0.07539546253459666, + "grad_norm": 3.2323899269104004, + "learning_rate": 9.9984543144105e-06, + "loss": 1.1327, + "step": 933 + }, + { + "epoch": 0.0754762722479242, + "grad_norm": 3.5043742656707764, + "learning_rate": 9.998438002068524e-06, + "loss": 0.9981, + "step": 934 + }, + { + "epoch": 0.07555708196125174, + "grad_norm": 3.090705156326294, + "learning_rate": 9.998421604115338e-06, + "loss": 1.0002, + "step": 935 + }, + { + "epoch": 0.07563789167457928, + "grad_norm": 3.4069063663482666, + "learning_rate": 9.998405120551223e-06, + "loss": 1.0914, + "step": 936 + }, + { + "epoch": 0.07571870138790683, + "grad_norm": 3.153970241546631, + "learning_rate": 9.99838855137646e-06, + "loss": 1.0971, + "step": 937 + }, + { + "epoch": 0.07579951110123437, + "grad_norm": 2.9036552906036377, + "learning_rate": 9.998371896591337e-06, + "loss": 1.1212, + "step": 938 + }, + { + "epoch": 0.0758803208145619, + "grad_norm": 2.978816032409668, + "learning_rate": 9.998355156196134e-06, + "loss": 1.0859, + "step": 939 + }, + { + "epoch": 0.07596113052788946, + "grad_norm": 3.104283094406128, + "learning_rate": 9.998338330191142e-06, + "loss": 0.9989, + "step": 940 + }, + { + "epoch": 0.076041940241217, + "grad_norm": 2.811476945877075, + "learning_rate": 9.998321418576647e-06, + "loss": 1.0216, + "step": 941 + }, + { + "epoch": 0.07612274995454453, + "grad_norm": 3.6909635066986084, + "learning_rate": 9.998304421352938e-06, + "loss": 1.0462, + "step": 942 + }, + { + "epoch": 0.07620355966787208, + "grad_norm": 2.850799322128296, + "learning_rate": 9.998287338520309e-06, + "loss": 1.1047, + "step": 943 + }, + { + "epoch": 0.07628436938119962, + "grad_norm": 2.847245693206787, + "learning_rate": 9.998270170079049e-06, + "loss": 1.0456, + "step": 944 + }, + { + "epoch": 0.07636517909452716, + "grad_norm": 2.992706060409546, + "learning_rate": 9.998252916029453e-06, + "loss": 1.063, + "step": 945 + }, + { + "epoch": 0.0764459888078547, + "grad_norm": 3.3464736938476562, + "learning_rate": 9.99823557637182e-06, + "loss": 1.1306, + "step": 946 + }, + { + "epoch": 0.07652679852118224, + "grad_norm": 2.7523386478424072, + "learning_rate": 9.998218151106445e-06, + "loss": 1.0695, + "step": 947 + }, + { + "epoch": 0.0766076082345098, + "grad_norm": 3.064141035079956, + "learning_rate": 9.998200640233623e-06, + "loss": 1.0609, + "step": 948 + }, + { + "epoch": 0.07668841794783733, + "grad_norm": 3.3081955909729004, + "learning_rate": 9.998183043753657e-06, + "loss": 0.8739, + "step": 949 + }, + { + "epoch": 0.07676922766116487, + "grad_norm": 2.872645616531372, + "learning_rate": 9.998165361666849e-06, + "loss": 1.1083, + "step": 950 + }, + { + "epoch": 0.07685003737449242, + "grad_norm": 3.413339138031006, + "learning_rate": 9.998147593973501e-06, + "loss": 0.9652, + "step": 951 + }, + { + "epoch": 0.07693084708781996, + "grad_norm": 3.288818120956421, + "learning_rate": 9.998129740673918e-06, + "loss": 1.0153, + "step": 952 + }, + { + "epoch": 0.0770116568011475, + "grad_norm": 3.6216039657592773, + "learning_rate": 9.998111801768405e-06, + "loss": 0.9523, + "step": 953 + }, + { + "epoch": 0.07709246651447504, + "grad_norm": 3.0285098552703857, + "learning_rate": 9.998093777257267e-06, + "loss": 1.0642, + "step": 954 + }, + { + "epoch": 0.07717327622780258, + "grad_norm": 2.905763864517212, + "learning_rate": 9.998075667140817e-06, + "loss": 1.09, + "step": 955 + }, + { + "epoch": 0.07725408594113012, + "grad_norm": 3.0776312351226807, + "learning_rate": 9.998057471419362e-06, + "loss": 1.0764, + "step": 956 + }, + { + "epoch": 0.07733489565445767, + "grad_norm": 3.221165418624878, + "learning_rate": 9.998039190093216e-06, + "loss": 0.9886, + "step": 957 + }, + { + "epoch": 0.0774157053677852, + "grad_norm": 2.9817917346954346, + "learning_rate": 9.998020823162691e-06, + "loss": 1.1292, + "step": 958 + }, + { + "epoch": 0.07749651508111274, + "grad_norm": 3.237248182296753, + "learning_rate": 9.998002370628101e-06, + "loss": 1.0642, + "step": 959 + }, + { + "epoch": 0.0775773247944403, + "grad_norm": 2.8197250366210938, + "learning_rate": 9.99798383248976e-06, + "loss": 1.0919, + "step": 960 + }, + { + "epoch": 0.07765813450776783, + "grad_norm": 3.466444253921509, + "learning_rate": 9.997965208747993e-06, + "loss": 1.1215, + "step": 961 + }, + { + "epoch": 0.07773894422109538, + "grad_norm": 3.26401424407959, + "learning_rate": 9.997946499403111e-06, + "loss": 1.0712, + "step": 962 + }, + { + "epoch": 0.07781975393442292, + "grad_norm": 2.941288471221924, + "learning_rate": 9.997927704455439e-06, + "loss": 1.0511, + "step": 963 + }, + { + "epoch": 0.07790056364775046, + "grad_norm": 3.2247517108917236, + "learning_rate": 9.997908823905297e-06, + "loss": 1.0109, + "step": 964 + }, + { + "epoch": 0.07798137336107801, + "grad_norm": 2.8406410217285156, + "learning_rate": 9.99788985775301e-06, + "loss": 1.0049, + "step": 965 + }, + { + "epoch": 0.07806218307440554, + "grad_norm": 3.1028623580932617, + "learning_rate": 9.9978708059989e-06, + "loss": 0.9748, + "step": 966 + }, + { + "epoch": 0.07814299278773308, + "grad_norm": 3.500028371810913, + "learning_rate": 9.997851668643294e-06, + "loss": 0.9857, + "step": 967 + }, + { + "epoch": 0.07822380250106063, + "grad_norm": 3.2542593479156494, + "learning_rate": 9.997832445686521e-06, + "loss": 1.1175, + "step": 968 + }, + { + "epoch": 0.07830461221438817, + "grad_norm": 2.904175281524658, + "learning_rate": 9.997813137128912e-06, + "loss": 1.1044, + "step": 969 + }, + { + "epoch": 0.0783854219277157, + "grad_norm": 2.8977391719818115, + "learning_rate": 9.997793742970794e-06, + "loss": 1.1186, + "step": 970 + }, + { + "epoch": 0.07846623164104326, + "grad_norm": 2.478787660598755, + "learning_rate": 9.997774263212503e-06, + "loss": 1.3754, + "step": 971 + }, + { + "epoch": 0.0785470413543708, + "grad_norm": 2.9613890647888184, + "learning_rate": 9.997754697854369e-06, + "loss": 1.0197, + "step": 972 + }, + { + "epoch": 0.07862785106769833, + "grad_norm": 3.194923162460327, + "learning_rate": 9.997735046896728e-06, + "loss": 1.0814, + "step": 973 + }, + { + "epoch": 0.07870866078102588, + "grad_norm": 2.8879735469818115, + "learning_rate": 9.997715310339918e-06, + "loss": 1.0357, + "step": 974 + }, + { + "epoch": 0.07878947049435342, + "grad_norm": 2.7833099365234375, + "learning_rate": 9.997695488184275e-06, + "loss": 1.0096, + "step": 975 + }, + { + "epoch": 0.07887028020768096, + "grad_norm": 2.786844253540039, + "learning_rate": 9.997675580430141e-06, + "loss": 1.0949, + "step": 976 + }, + { + "epoch": 0.07895108992100851, + "grad_norm": 3.25620174407959, + "learning_rate": 9.997655587077858e-06, + "loss": 1.1849, + "step": 977 + }, + { + "epoch": 0.07903189963433604, + "grad_norm": 3.4656901359558105, + "learning_rate": 9.997635508127763e-06, + "loss": 0.9759, + "step": 978 + }, + { + "epoch": 0.0791127093476636, + "grad_norm": 3.0284624099731445, + "learning_rate": 9.997615343580202e-06, + "loss": 1.0203, + "step": 979 + }, + { + "epoch": 0.07919351906099113, + "grad_norm": 3.2560715675354004, + "learning_rate": 9.997595093435525e-06, + "loss": 1.058, + "step": 980 + }, + { + "epoch": 0.07927432877431867, + "grad_norm": 3.0773305892944336, + "learning_rate": 9.997574757694073e-06, + "loss": 1.1351, + "step": 981 + }, + { + "epoch": 0.07935513848764622, + "grad_norm": 3.027306318283081, + "learning_rate": 9.997554336356197e-06, + "loss": 1.0502, + "step": 982 + }, + { + "epoch": 0.07943594820097376, + "grad_norm": 3.059410810470581, + "learning_rate": 9.997533829422247e-06, + "loss": 0.9773, + "step": 983 + }, + { + "epoch": 0.0795167579143013, + "grad_norm": 3.317594289779663, + "learning_rate": 9.997513236892573e-06, + "loss": 1.0683, + "step": 984 + }, + { + "epoch": 0.07959756762762885, + "grad_norm": 2.8482987880706787, + "learning_rate": 9.997492558767527e-06, + "loss": 1.162, + "step": 985 + }, + { + "epoch": 0.07967837734095638, + "grad_norm": 3.2453463077545166, + "learning_rate": 9.997471795047467e-06, + "loss": 1.0496, + "step": 986 + }, + { + "epoch": 0.07975918705428392, + "grad_norm": 2.988056182861328, + "learning_rate": 9.997450945732745e-06, + "loss": 1.1193, + "step": 987 + }, + { + "epoch": 0.07983999676761147, + "grad_norm": 3.4100332260131836, + "learning_rate": 9.997430010823718e-06, + "loss": 1.028, + "step": 988 + }, + { + "epoch": 0.07992080648093901, + "grad_norm": 2.7486419677734375, + "learning_rate": 9.997408990320748e-06, + "loss": 1.0424, + "step": 989 + }, + { + "epoch": 0.08000161619426654, + "grad_norm": 2.617588520050049, + "learning_rate": 9.997387884224192e-06, + "loss": 1.0095, + "step": 990 + }, + { + "epoch": 0.0800824259075941, + "grad_norm": 3.6332125663757324, + "learning_rate": 9.997366692534411e-06, + "loss": 1.0239, + "step": 991 + }, + { + "epoch": 0.08016323562092163, + "grad_norm": 2.829860210418701, + "learning_rate": 9.99734541525177e-06, + "loss": 1.1689, + "step": 992 + }, + { + "epoch": 0.08024404533424917, + "grad_norm": 2.8051412105560303, + "learning_rate": 9.997324052376632e-06, + "loss": 1.1691, + "step": 993 + }, + { + "epoch": 0.08032485504757672, + "grad_norm": 3.1492583751678467, + "learning_rate": 9.997302603909364e-06, + "loss": 1.0621, + "step": 994 + }, + { + "epoch": 0.08040566476090426, + "grad_norm": 2.920750141143799, + "learning_rate": 9.997281069850333e-06, + "loss": 1.0543, + "step": 995 + }, + { + "epoch": 0.08048647447423181, + "grad_norm": 3.082822322845459, + "learning_rate": 9.997259450199908e-06, + "loss": 1.1169, + "step": 996 + }, + { + "epoch": 0.08056728418755935, + "grad_norm": 2.97986102104187, + "learning_rate": 9.99723774495846e-06, + "loss": 0.892, + "step": 997 + }, + { + "epoch": 0.08064809390088688, + "grad_norm": 3.309490919113159, + "learning_rate": 9.997215954126358e-06, + "loss": 0.9458, + "step": 998 + }, + { + "epoch": 0.08072890361421443, + "grad_norm": 3.0830867290496826, + "learning_rate": 9.997194077703979e-06, + "loss": 0.9706, + "step": 999 + }, + { + "epoch": 0.08080971332754197, + "grad_norm": 2.942925214767456, + "learning_rate": 9.997172115691693e-06, + "loss": 1.0849, + "step": 1000 + }, + { + "epoch": 0.08080971332754197, + "eval_loss": 0.8867675065994263, + "eval_runtime": 811.2983, + "eval_samples_per_second": 102.756, + "eval_steps_per_second": 12.845, + "step": 1000 + }, + { + "epoch": 0.08089052304086951, + "grad_norm": 3.0301153659820557, + "learning_rate": 9.99715006808988e-06, + "loss": 0.9936, + "step": 1001 + }, + { + "epoch": 0.08097133275419706, + "grad_norm": 3.1681180000305176, + "learning_rate": 9.997127934898917e-06, + "loss": 0.9121, + "step": 1002 + }, + { + "epoch": 0.0810521424675246, + "grad_norm": 2.993309497833252, + "learning_rate": 9.997105716119182e-06, + "loss": 1.0414, + "step": 1003 + }, + { + "epoch": 0.08113295218085213, + "grad_norm": 3.3118903636932373, + "learning_rate": 9.997083411751057e-06, + "loss": 1.0731, + "step": 1004 + }, + { + "epoch": 0.08121376189417968, + "grad_norm": 2.8904967308044434, + "learning_rate": 9.997061021794923e-06, + "loss": 0.9163, + "step": 1005 + }, + { + "epoch": 0.08129457160750722, + "grad_norm": 3.920422077178955, + "learning_rate": 9.997038546251163e-06, + "loss": 1.1656, + "step": 1006 + }, + { + "epoch": 0.08137538132083476, + "grad_norm": 3.122448444366455, + "learning_rate": 9.997015985120162e-06, + "loss": 1.0838, + "step": 1007 + }, + { + "epoch": 0.08145619103416231, + "grad_norm": 2.9611315727233887, + "learning_rate": 9.996993338402307e-06, + "loss": 1.0386, + "step": 1008 + }, + { + "epoch": 0.08153700074748985, + "grad_norm": 3.181063652038574, + "learning_rate": 9.996970606097987e-06, + "loss": 1.0529, + "step": 1009 + }, + { + "epoch": 0.0816178104608174, + "grad_norm": 2.72807240486145, + "learning_rate": 9.996947788207591e-06, + "loss": 1.0604, + "step": 1010 + }, + { + "epoch": 0.08169862017414493, + "grad_norm": 3.366835832595825, + "learning_rate": 9.996924884731507e-06, + "loss": 1.1703, + "step": 1011 + }, + { + "epoch": 0.08177942988747247, + "grad_norm": 2.6687614917755127, + "learning_rate": 9.99690189567013e-06, + "loss": 1.0441, + "step": 1012 + }, + { + "epoch": 0.08186023960080002, + "grad_norm": 2.6495840549468994, + "learning_rate": 9.996878821023854e-06, + "loss": 0.9448, + "step": 1013 + }, + { + "epoch": 0.08194104931412756, + "grad_norm": 2.3618621826171875, + "learning_rate": 9.996855660793071e-06, + "loss": 1.1143, + "step": 1014 + }, + { + "epoch": 0.0820218590274551, + "grad_norm": 3.137068510055542, + "learning_rate": 9.996832414978183e-06, + "loss": 1.1173, + "step": 1015 + }, + { + "epoch": 0.08210266874078265, + "grad_norm": 2.6272051334381104, + "learning_rate": 9.996809083579584e-06, + "loss": 0.9598, + "step": 1016 + }, + { + "epoch": 0.08218347845411018, + "grad_norm": 3.519331932067871, + "learning_rate": 9.996785666597675e-06, + "loss": 1.0637, + "step": 1017 + }, + { + "epoch": 0.08226428816743772, + "grad_norm": 3.0328478813171387, + "learning_rate": 9.996762164032857e-06, + "loss": 1.0441, + "step": 1018 + }, + { + "epoch": 0.08234509788076527, + "grad_norm": 3.2000579833984375, + "learning_rate": 9.99673857588553e-06, + "loss": 0.9696, + "step": 1019 + }, + { + "epoch": 0.08242590759409281, + "grad_norm": 2.9278225898742676, + "learning_rate": 9.996714902156104e-06, + "loss": 1.0213, + "step": 1020 + }, + { + "epoch": 0.08250671730742035, + "grad_norm": 2.7018179893493652, + "learning_rate": 9.996691142844977e-06, + "loss": 1.0357, + "step": 1021 + }, + { + "epoch": 0.0825875270207479, + "grad_norm": 3.73225736618042, + "learning_rate": 9.996667297952562e-06, + "loss": 0.9324, + "step": 1022 + }, + { + "epoch": 0.08266833673407543, + "grad_norm": 3.1943392753601074, + "learning_rate": 9.996643367479264e-06, + "loss": 0.9327, + "step": 1023 + }, + { + "epoch": 0.08274914644740297, + "grad_norm": 3.605224847793579, + "learning_rate": 9.996619351425495e-06, + "loss": 1.0015, + "step": 1024 + }, + { + "epoch": 0.08282995616073052, + "grad_norm": 2.8012125492095947, + "learning_rate": 9.996595249791666e-06, + "loss": 1.0836, + "step": 1025 + }, + { + "epoch": 0.08291076587405806, + "grad_norm": 3.3962903022766113, + "learning_rate": 9.996571062578187e-06, + "loss": 0.9289, + "step": 1026 + }, + { + "epoch": 0.08299157558738561, + "grad_norm": 4.059617519378662, + "learning_rate": 9.996546789785476e-06, + "loss": 1.0981, + "step": 1027 + }, + { + "epoch": 0.08307238530071315, + "grad_norm": 3.8433380126953125, + "learning_rate": 9.996522431413948e-06, + "loss": 1.0653, + "step": 1028 + }, + { + "epoch": 0.08315319501404068, + "grad_norm": 2.651620626449585, + "learning_rate": 9.996497987464019e-06, + "loss": 1.0522, + "step": 1029 + }, + { + "epoch": 0.08323400472736824, + "grad_norm": 3.1162960529327393, + "learning_rate": 9.996473457936107e-06, + "loss": 0.9291, + "step": 1030 + }, + { + "epoch": 0.08331481444069577, + "grad_norm": 3.1897988319396973, + "learning_rate": 9.996448842830633e-06, + "loss": 1.0302, + "step": 1031 + }, + { + "epoch": 0.08339562415402331, + "grad_norm": 2.5630762577056885, + "learning_rate": 9.99642414214802e-06, + "loss": 1.1536, + "step": 1032 + }, + { + "epoch": 0.08347643386735086, + "grad_norm": 3.242868423461914, + "learning_rate": 9.99639935588869e-06, + "loss": 1.0734, + "step": 1033 + }, + { + "epoch": 0.0835572435806784, + "grad_norm": 3.3553194999694824, + "learning_rate": 9.996374484053065e-06, + "loss": 0.969, + "step": 1034 + }, + { + "epoch": 0.08363805329400593, + "grad_norm": 3.5512146949768066, + "learning_rate": 9.996349526641575e-06, + "loss": 0.9276, + "step": 1035 + }, + { + "epoch": 0.08371886300733349, + "grad_norm": 2.9279568195343018, + "learning_rate": 9.996324483654646e-06, + "loss": 1.1008, + "step": 1036 + }, + { + "epoch": 0.08379967272066102, + "grad_norm": 3.434234380722046, + "learning_rate": 9.996299355092707e-06, + "loss": 1.1089, + "step": 1037 + }, + { + "epoch": 0.08388048243398856, + "grad_norm": 2.9546749591827393, + "learning_rate": 9.996274140956188e-06, + "loss": 1.0548, + "step": 1038 + }, + { + "epoch": 0.08396129214731611, + "grad_norm": 2.6084954738616943, + "learning_rate": 9.996248841245519e-06, + "loss": 1.0349, + "step": 1039 + }, + { + "epoch": 0.08404210186064365, + "grad_norm": 3.8449978828430176, + "learning_rate": 9.996223455961138e-06, + "loss": 1.1803, + "step": 1040 + }, + { + "epoch": 0.08412291157397118, + "grad_norm": 3.5063529014587402, + "learning_rate": 9.996197985103476e-06, + "loss": 0.9196, + "step": 1041 + }, + { + "epoch": 0.08420372128729874, + "grad_norm": 3.098813533782959, + "learning_rate": 9.99617242867297e-06, + "loss": 1.0411, + "step": 1042 + }, + { + "epoch": 0.08428453100062627, + "grad_norm": 3.460789442062378, + "learning_rate": 9.996146786670059e-06, + "loss": 1.0768, + "step": 1043 + }, + { + "epoch": 0.08436534071395382, + "grad_norm": 3.2159345149993896, + "learning_rate": 9.996121059095181e-06, + "loss": 1.0289, + "step": 1044 + }, + { + "epoch": 0.08444615042728136, + "grad_norm": 3.090740203857422, + "learning_rate": 9.996095245948776e-06, + "loss": 0.9809, + "step": 1045 + }, + { + "epoch": 0.0845269601406089, + "grad_norm": 3.0718343257904053, + "learning_rate": 9.996069347231288e-06, + "loss": 0.9484, + "step": 1046 + }, + { + "epoch": 0.08460776985393645, + "grad_norm": 2.9252374172210693, + "learning_rate": 9.99604336294316e-06, + "loss": 0.9557, + "step": 1047 + }, + { + "epoch": 0.08468857956726399, + "grad_norm": 3.0955123901367188, + "learning_rate": 9.996017293084837e-06, + "loss": 1.1824, + "step": 1048 + }, + { + "epoch": 0.08476938928059152, + "grad_norm": 2.992337942123413, + "learning_rate": 9.995991137656763e-06, + "loss": 1.0964, + "step": 1049 + }, + { + "epoch": 0.08485019899391907, + "grad_norm": 2.545712471008301, + "learning_rate": 9.99596489665939e-06, + "loss": 1.0103, + "step": 1050 + }, + { + "epoch": 0.08493100870724661, + "grad_norm": 3.002333402633667, + "learning_rate": 9.995938570093165e-06, + "loss": 1.0722, + "step": 1051 + }, + { + "epoch": 0.08501181842057415, + "grad_norm": 2.9183366298675537, + "learning_rate": 9.995912157958539e-06, + "loss": 0.9306, + "step": 1052 + }, + { + "epoch": 0.0850926281339017, + "grad_norm": 2.8887429237365723, + "learning_rate": 9.995885660255966e-06, + "loss": 1.0188, + "step": 1053 + }, + { + "epoch": 0.08517343784722924, + "grad_norm": 2.960714340209961, + "learning_rate": 9.9958590769859e-06, + "loss": 1.017, + "step": 1054 + }, + { + "epoch": 0.08525424756055677, + "grad_norm": 3.45218825340271, + "learning_rate": 9.995832408148791e-06, + "loss": 1.0186, + "step": 1055 + }, + { + "epoch": 0.08533505727388432, + "grad_norm": 3.007883071899414, + "learning_rate": 9.995805653745103e-06, + "loss": 0.9975, + "step": 1056 + }, + { + "epoch": 0.08541586698721186, + "grad_norm": 2.8929126262664795, + "learning_rate": 9.99577881377529e-06, + "loss": 0.993, + "step": 1057 + }, + { + "epoch": 0.0854966767005394, + "grad_norm": 2.9702260494232178, + "learning_rate": 9.995751888239814e-06, + "loss": 1.1256, + "step": 1058 + }, + { + "epoch": 0.08557748641386695, + "grad_norm": 2.7810113430023193, + "learning_rate": 9.995724877139133e-06, + "loss": 1.0006, + "step": 1059 + }, + { + "epoch": 0.08565829612719449, + "grad_norm": 2.9695522785186768, + "learning_rate": 9.995697780473711e-06, + "loss": 1.0215, + "step": 1060 + }, + { + "epoch": 0.08573910584052204, + "grad_norm": 2.9190425872802734, + "learning_rate": 9.995670598244017e-06, + "loss": 1.0327, + "step": 1061 + }, + { + "epoch": 0.08581991555384957, + "grad_norm": 3.2689478397369385, + "learning_rate": 9.995643330450508e-06, + "loss": 1.0618, + "step": 1062 + }, + { + "epoch": 0.08590072526717711, + "grad_norm": 3.0045993328094482, + "learning_rate": 9.995615977093656e-06, + "loss": 1.0528, + "step": 1063 + }, + { + "epoch": 0.08598153498050466, + "grad_norm": 3.6134586334228516, + "learning_rate": 9.99558853817393e-06, + "loss": 1.0186, + "step": 1064 + }, + { + "epoch": 0.0860623446938322, + "grad_norm": 2.8262598514556885, + "learning_rate": 9.9955610136918e-06, + "loss": 1.1177, + "step": 1065 + }, + { + "epoch": 0.08614315440715974, + "grad_norm": 2.9657886028289795, + "learning_rate": 9.995533403647733e-06, + "loss": 0.964, + "step": 1066 + }, + { + "epoch": 0.08622396412048729, + "grad_norm": 2.782003402709961, + "learning_rate": 9.995505708042206e-06, + "loss": 1.0412, + "step": 1067 + }, + { + "epoch": 0.08630477383381482, + "grad_norm": 3.0706048011779785, + "learning_rate": 9.995477926875692e-06, + "loss": 1.042, + "step": 1068 + }, + { + "epoch": 0.08638558354714236, + "grad_norm": 3.0264039039611816, + "learning_rate": 9.995450060148668e-06, + "loss": 1.0444, + "step": 1069 + }, + { + "epoch": 0.08646639326046991, + "grad_norm": 3.288853406906128, + "learning_rate": 9.995422107861612e-06, + "loss": 0.9645, + "step": 1070 + }, + { + "epoch": 0.08654720297379745, + "grad_norm": 2.773499011993408, + "learning_rate": 9.995394070015e-06, + "loss": 1.1742, + "step": 1071 + }, + { + "epoch": 0.08662801268712499, + "grad_norm": 3.0497007369995117, + "learning_rate": 9.995365946609312e-06, + "loss": 1.0304, + "step": 1072 + }, + { + "epoch": 0.08670882240045254, + "grad_norm": 3.416323661804199, + "learning_rate": 9.995337737645034e-06, + "loss": 1.1184, + "step": 1073 + }, + { + "epoch": 0.08678963211378007, + "grad_norm": 2.6845903396606445, + "learning_rate": 9.995309443122644e-06, + "loss": 1.0445, + "step": 1074 + }, + { + "epoch": 0.08687044182710763, + "grad_norm": 3.158496141433716, + "learning_rate": 9.99528106304263e-06, + "loss": 1.0373, + "step": 1075 + }, + { + "epoch": 0.08695125154043516, + "grad_norm": 2.750772714614868, + "learning_rate": 9.995252597405478e-06, + "loss": 1.1135, + "step": 1076 + }, + { + "epoch": 0.0870320612537627, + "grad_norm": 3.0192668437957764, + "learning_rate": 9.995224046211672e-06, + "loss": 0.9874, + "step": 1077 + }, + { + "epoch": 0.08711287096709025, + "grad_norm": 3.3132712841033936, + "learning_rate": 9.995195409461705e-06, + "loss": 0.9698, + "step": 1078 + }, + { + "epoch": 0.08719368068041779, + "grad_norm": 2.6165432929992676, + "learning_rate": 9.995166687156065e-06, + "loss": 0.9945, + "step": 1079 + }, + { + "epoch": 0.08727449039374532, + "grad_norm": 3.113999128341675, + "learning_rate": 9.995137879295246e-06, + "loss": 0.9752, + "step": 1080 + }, + { + "epoch": 0.08735530010707288, + "grad_norm": 3.393367052078247, + "learning_rate": 9.995108985879742e-06, + "loss": 1.1017, + "step": 1081 + }, + { + "epoch": 0.08743610982040041, + "grad_norm": 3.375389575958252, + "learning_rate": 9.995080006910044e-06, + "loss": 0.983, + "step": 1082 + }, + { + "epoch": 0.08751691953372795, + "grad_norm": 3.390043258666992, + "learning_rate": 9.995050942386653e-06, + "loss": 1.0365, + "step": 1083 + }, + { + "epoch": 0.0875977292470555, + "grad_norm": 3.4741687774658203, + "learning_rate": 9.995021792310063e-06, + "loss": 1.0178, + "step": 1084 + }, + { + "epoch": 0.08767853896038304, + "grad_norm": 2.5700831413269043, + "learning_rate": 9.994992556680774e-06, + "loss": 1.1193, + "step": 1085 + }, + { + "epoch": 0.08775934867371057, + "grad_norm": 3.1229987144470215, + "learning_rate": 9.994963235499288e-06, + "loss": 1.0312, + "step": 1086 + }, + { + "epoch": 0.08784015838703813, + "grad_norm": 2.904766082763672, + "learning_rate": 9.994933828766108e-06, + "loss": 0.9871, + "step": 1087 + }, + { + "epoch": 0.08792096810036566, + "grad_norm": 2.8027563095092773, + "learning_rate": 9.994904336481735e-06, + "loss": 1.0225, + "step": 1088 + }, + { + "epoch": 0.0880017778136932, + "grad_norm": 2.766493558883667, + "learning_rate": 9.994874758646676e-06, + "loss": 1.041, + "step": 1089 + }, + { + "epoch": 0.08808258752702075, + "grad_norm": 3.0756635665893555, + "learning_rate": 9.994845095261436e-06, + "loss": 1.1068, + "step": 1090 + }, + { + "epoch": 0.08816339724034829, + "grad_norm": 3.4534332752227783, + "learning_rate": 9.994815346326524e-06, + "loss": 1.0894, + "step": 1091 + }, + { + "epoch": 0.08824420695367584, + "grad_norm": 2.903165817260742, + "learning_rate": 9.99478551184245e-06, + "loss": 0.9894, + "step": 1092 + }, + { + "epoch": 0.08832501666700338, + "grad_norm": 3.0376791954040527, + "learning_rate": 9.994755591809726e-06, + "loss": 0.8919, + "step": 1093 + }, + { + "epoch": 0.08840582638033091, + "grad_norm": 3.136165142059326, + "learning_rate": 9.994725586228861e-06, + "loss": 1.065, + "step": 1094 + }, + { + "epoch": 0.08848663609365846, + "grad_norm": 3.06294322013855, + "learning_rate": 9.994695495100372e-06, + "loss": 0.993, + "step": 1095 + }, + { + "epoch": 0.088567445806986, + "grad_norm": 2.9708285331726074, + "learning_rate": 9.994665318424774e-06, + "loss": 0.9118, + "step": 1096 + }, + { + "epoch": 0.08864825552031354, + "grad_norm": 3.305607557296753, + "learning_rate": 9.994635056202584e-06, + "loss": 1.047, + "step": 1097 + }, + { + "epoch": 0.08872906523364109, + "grad_norm": 2.799258232116699, + "learning_rate": 9.994604708434318e-06, + "loss": 0.9901, + "step": 1098 + }, + { + "epoch": 0.08880987494696863, + "grad_norm": 3.0276477336883545, + "learning_rate": 9.994574275120497e-06, + "loss": 1.0149, + "step": 1099 + }, + { + "epoch": 0.08889068466029616, + "grad_norm": 2.715073823928833, + "learning_rate": 9.994543756261644e-06, + "loss": 1.0994, + "step": 1100 + }, + { + "epoch": 0.08897149437362371, + "grad_norm": 2.9331631660461426, + "learning_rate": 9.99451315185828e-06, + "loss": 1.0786, + "step": 1101 + }, + { + "epoch": 0.08905230408695125, + "grad_norm": 3.1864383220672607, + "learning_rate": 9.99448246191093e-06, + "loss": 0.9758, + "step": 1102 + }, + { + "epoch": 0.08913311380027879, + "grad_norm": 3.592210054397583, + "learning_rate": 9.994451686420117e-06, + "loss": 0.9905, + "step": 1103 + }, + { + "epoch": 0.08921392351360634, + "grad_norm": 2.7969889640808105, + "learning_rate": 9.994420825386373e-06, + "loss": 0.9889, + "step": 1104 + }, + { + "epoch": 0.08929473322693388, + "grad_norm": 3.270756721496582, + "learning_rate": 9.994389878810222e-06, + "loss": 1.0471, + "step": 1105 + }, + { + "epoch": 0.08937554294026141, + "grad_norm": 3.086674213409424, + "learning_rate": 9.994358846692197e-06, + "loss": 1.0376, + "step": 1106 + }, + { + "epoch": 0.08945635265358896, + "grad_norm": 4.83221960067749, + "learning_rate": 9.994327729032827e-06, + "loss": 1.1289, + "step": 1107 + }, + { + "epoch": 0.0895371623669165, + "grad_norm": 2.469668388366699, + "learning_rate": 9.994296525832647e-06, + "loss": 1.1292, + "step": 1108 + }, + { + "epoch": 0.08961797208024405, + "grad_norm": 2.915151357650757, + "learning_rate": 9.99426523709219e-06, + "loss": 0.9702, + "step": 1109 + }, + { + "epoch": 0.08969878179357159, + "grad_norm": 2.6539580821990967, + "learning_rate": 9.994233862811996e-06, + "loss": 1.0733, + "step": 1110 + }, + { + "epoch": 0.08977959150689913, + "grad_norm": 3.2237725257873535, + "learning_rate": 9.994202402992595e-06, + "loss": 1.0379, + "step": 1111 + }, + { + "epoch": 0.08986040122022668, + "grad_norm": 2.849095344543457, + "learning_rate": 9.994170857634531e-06, + "loss": 1.1208, + "step": 1112 + }, + { + "epoch": 0.08994121093355421, + "grad_norm": 2.9023728370666504, + "learning_rate": 9.994139226738345e-06, + "loss": 1.1777, + "step": 1113 + }, + { + "epoch": 0.09002202064688175, + "grad_norm": 3.650582790374756, + "learning_rate": 9.994107510304576e-06, + "loss": 1.0579, + "step": 1114 + }, + { + "epoch": 0.0901028303602093, + "grad_norm": 3.201936721801758, + "learning_rate": 9.994075708333767e-06, + "loss": 1.166, + "step": 1115 + }, + { + "epoch": 0.09018364007353684, + "grad_norm": 3.080941915512085, + "learning_rate": 9.994043820826465e-06, + "loss": 0.891, + "step": 1116 + }, + { + "epoch": 0.09026444978686438, + "grad_norm": 3.0385451316833496, + "learning_rate": 9.994011847783213e-06, + "loss": 1.0791, + "step": 1117 + }, + { + "epoch": 0.09034525950019193, + "grad_norm": 3.177011251449585, + "learning_rate": 9.993979789204565e-06, + "loss": 1.0181, + "step": 1118 + }, + { + "epoch": 0.09042606921351946, + "grad_norm": 3.9674606323242188, + "learning_rate": 9.993947645091063e-06, + "loss": 1.1167, + "step": 1119 + }, + { + "epoch": 0.090506878926847, + "grad_norm": 2.9916577339172363, + "learning_rate": 9.993915415443259e-06, + "loss": 0.989, + "step": 1120 + }, + { + "epoch": 0.09058768864017455, + "grad_norm": 2.9214396476745605, + "learning_rate": 9.993883100261707e-06, + "loss": 0.9741, + "step": 1121 + }, + { + "epoch": 0.09066849835350209, + "grad_norm": 3.219503164291382, + "learning_rate": 9.993850699546962e-06, + "loss": 1.0711, + "step": 1122 + }, + { + "epoch": 0.09074930806682964, + "grad_norm": 3.2714686393737793, + "learning_rate": 9.993818213299574e-06, + "loss": 0.9782, + "step": 1123 + }, + { + "epoch": 0.09083011778015718, + "grad_norm": 3.129592180252075, + "learning_rate": 9.993785641520104e-06, + "loss": 0.9803, + "step": 1124 + }, + { + "epoch": 0.09091092749348471, + "grad_norm": 3.368413209915161, + "learning_rate": 9.993752984209106e-06, + "loss": 0.9238, + "step": 1125 + }, + { + "epoch": 0.09099173720681226, + "grad_norm": 3.0969696044921875, + "learning_rate": 9.993720241367144e-06, + "loss": 1.0342, + "step": 1126 + }, + { + "epoch": 0.0910725469201398, + "grad_norm": 3.028538942337036, + "learning_rate": 9.993687412994774e-06, + "loss": 1.059, + "step": 1127 + }, + { + "epoch": 0.09115335663346734, + "grad_norm": 2.712517023086548, + "learning_rate": 9.993654499092563e-06, + "loss": 0.9761, + "step": 1128 + }, + { + "epoch": 0.09123416634679489, + "grad_norm": 3.0142993927001953, + "learning_rate": 9.993621499661069e-06, + "loss": 1.0237, + "step": 1129 + }, + { + "epoch": 0.09131497606012243, + "grad_norm": 3.3147966861724854, + "learning_rate": 9.993588414700862e-06, + "loss": 1.0907, + "step": 1130 + }, + { + "epoch": 0.09139578577344996, + "grad_norm": 3.217470407485962, + "learning_rate": 9.993555244212508e-06, + "loss": 1.0408, + "step": 1131 + }, + { + "epoch": 0.09147659548677751, + "grad_norm": 3.2648322582244873, + "learning_rate": 9.993521988196572e-06, + "loss": 1.0677, + "step": 1132 + }, + { + "epoch": 0.09155740520010505, + "grad_norm": 3.304222583770752, + "learning_rate": 9.993488646653626e-06, + "loss": 0.8679, + "step": 1133 + }, + { + "epoch": 0.09163821491343259, + "grad_norm": 2.758460521697998, + "learning_rate": 9.993455219584242e-06, + "loss": 1.0116, + "step": 1134 + }, + { + "epoch": 0.09171902462676014, + "grad_norm": 2.5346457958221436, + "learning_rate": 9.993421706988991e-06, + "loss": 1.1609, + "step": 1135 + }, + { + "epoch": 0.09179983434008768, + "grad_norm": 3.143198251724243, + "learning_rate": 9.993388108868447e-06, + "loss": 0.945, + "step": 1136 + }, + { + "epoch": 0.09188064405341521, + "grad_norm": 3.185825824737549, + "learning_rate": 9.993354425223186e-06, + "loss": 1.0233, + "step": 1137 + }, + { + "epoch": 0.09196145376674277, + "grad_norm": 2.7583227157592773, + "learning_rate": 9.993320656053785e-06, + "loss": 1.1102, + "step": 1138 + }, + { + "epoch": 0.0920422634800703, + "grad_norm": 2.8564047813415527, + "learning_rate": 9.993286801360822e-06, + "loss": 0.9633, + "step": 1139 + }, + { + "epoch": 0.09212307319339785, + "grad_norm": 3.02797269821167, + "learning_rate": 9.993252861144875e-06, + "loss": 0.9511, + "step": 1140 + }, + { + "epoch": 0.09220388290672539, + "grad_norm": 2.9829790592193604, + "learning_rate": 9.993218835406531e-06, + "loss": 0.9639, + "step": 1141 + }, + { + "epoch": 0.09228469262005293, + "grad_norm": 2.6532115936279297, + "learning_rate": 9.993184724146367e-06, + "loss": 0.988, + "step": 1142 + }, + { + "epoch": 0.09236550233338048, + "grad_norm": 3.163055896759033, + "learning_rate": 9.993150527364969e-06, + "loss": 1.0345, + "step": 1143 + }, + { + "epoch": 0.09244631204670802, + "grad_norm": 2.615417957305908, + "learning_rate": 9.993116245062923e-06, + "loss": 1.1074, + "step": 1144 + }, + { + "epoch": 0.09252712176003555, + "grad_norm": 2.909055471420288, + "learning_rate": 9.993081877240816e-06, + "loss": 0.9334, + "step": 1145 + }, + { + "epoch": 0.0926079314733631, + "grad_norm": 2.8035972118377686, + "learning_rate": 9.993047423899239e-06, + "loss": 1.0726, + "step": 1146 + }, + { + "epoch": 0.09268874118669064, + "grad_norm": 4.139641761779785, + "learning_rate": 9.993012885038777e-06, + "loss": 1.0193, + "step": 1147 + }, + { + "epoch": 0.09276955090001818, + "grad_norm": 2.9986088275909424, + "learning_rate": 9.992978260660024e-06, + "loss": 0.9637, + "step": 1148 + }, + { + "epoch": 0.09285036061334573, + "grad_norm": 2.904437303543091, + "learning_rate": 9.992943550763577e-06, + "loss": 1.1149, + "step": 1149 + }, + { + "epoch": 0.09293117032667327, + "grad_norm": 2.809866428375244, + "learning_rate": 9.992908755350024e-06, + "loss": 1.145, + "step": 1150 + }, + { + "epoch": 0.0930119800400008, + "grad_norm": 3.3679118156433105, + "learning_rate": 9.992873874419965e-06, + "loss": 1.0692, + "step": 1151 + }, + { + "epoch": 0.09309278975332835, + "grad_norm": 3.463749647140503, + "learning_rate": 9.992838907973996e-06, + "loss": 0.952, + "step": 1152 + }, + { + "epoch": 0.09317359946665589, + "grad_norm": 2.9335153102874756, + "learning_rate": 9.992803856012718e-06, + "loss": 1.0042, + "step": 1153 + }, + { + "epoch": 0.09325440917998343, + "grad_norm": 3.3813300132751465, + "learning_rate": 9.992768718536727e-06, + "loss": 0.9115, + "step": 1154 + }, + { + "epoch": 0.09333521889331098, + "grad_norm": 3.05812406539917, + "learning_rate": 9.99273349554663e-06, + "loss": 1.0089, + "step": 1155 + }, + { + "epoch": 0.09341602860663852, + "grad_norm": 2.7985036373138428, + "learning_rate": 9.992698187043026e-06, + "loss": 0.9933, + "step": 1156 + }, + { + "epoch": 0.09349683831996607, + "grad_norm": 2.7855563163757324, + "learning_rate": 9.992662793026522e-06, + "loss": 1.0122, + "step": 1157 + }, + { + "epoch": 0.0935776480332936, + "grad_norm": 2.9177639484405518, + "learning_rate": 9.992627313497724e-06, + "loss": 0.9769, + "step": 1158 + }, + { + "epoch": 0.09365845774662114, + "grad_norm": 2.8957858085632324, + "learning_rate": 9.992591748457239e-06, + "loss": 1.0274, + "step": 1159 + }, + { + "epoch": 0.09373926745994869, + "grad_norm": 2.8816301822662354, + "learning_rate": 9.992556097905677e-06, + "loss": 1.0627, + "step": 1160 + }, + { + "epoch": 0.09382007717327623, + "grad_norm": 3.1168038845062256, + "learning_rate": 9.992520361843647e-06, + "loss": 1.0046, + "step": 1161 + }, + { + "epoch": 0.09390088688660377, + "grad_norm": 2.815091609954834, + "learning_rate": 9.992484540271764e-06, + "loss": 1.0452, + "step": 1162 + }, + { + "epoch": 0.09398169659993132, + "grad_norm": 2.5361690521240234, + "learning_rate": 9.992448633190637e-06, + "loss": 1.0325, + "step": 1163 + }, + { + "epoch": 0.09406250631325885, + "grad_norm": 3.326178789138794, + "learning_rate": 9.992412640600886e-06, + "loss": 1.016, + "step": 1164 + }, + { + "epoch": 0.09414331602658639, + "grad_norm": 2.790809392929077, + "learning_rate": 9.992376562503125e-06, + "loss": 1.0654, + "step": 1165 + }, + { + "epoch": 0.09422412573991394, + "grad_norm": 2.718125104904175, + "learning_rate": 9.992340398897971e-06, + "loss": 1.0797, + "step": 1166 + }, + { + "epoch": 0.09430493545324148, + "grad_norm": 2.9559214115142822, + "learning_rate": 9.992304149786045e-06, + "loss": 1.0271, + "step": 1167 + }, + { + "epoch": 0.09438574516656902, + "grad_norm": 2.895761489868164, + "learning_rate": 9.992267815167968e-06, + "loss": 1.1303, + "step": 1168 + }, + { + "epoch": 0.09446655487989657, + "grad_norm": 2.5579259395599365, + "learning_rate": 9.992231395044363e-06, + "loss": 0.9703, + "step": 1169 + }, + { + "epoch": 0.0945473645932241, + "grad_norm": 3.430900812149048, + "learning_rate": 9.992194889415851e-06, + "loss": 1.0025, + "step": 1170 + }, + { + "epoch": 0.09462817430655164, + "grad_norm": 2.4399726390838623, + "learning_rate": 9.992158298283058e-06, + "loss": 1.0111, + "step": 1171 + }, + { + "epoch": 0.09470898401987919, + "grad_norm": 2.5215682983398438, + "learning_rate": 9.992121621646612e-06, + "loss": 1.1092, + "step": 1172 + }, + { + "epoch": 0.09478979373320673, + "grad_norm": 2.789412021636963, + "learning_rate": 9.99208485950714e-06, + "loss": 1.11, + "step": 1173 + }, + { + "epoch": 0.09487060344653428, + "grad_norm": 3.4019277095794678, + "learning_rate": 9.992048011865275e-06, + "loss": 1.1689, + "step": 1174 + }, + { + "epoch": 0.09495141315986182, + "grad_norm": 3.0252597332000732, + "learning_rate": 9.992011078721643e-06, + "loss": 1.0104, + "step": 1175 + }, + { + "epoch": 0.09503222287318935, + "grad_norm": 3.443145751953125, + "learning_rate": 9.991974060076878e-06, + "loss": 0.9915, + "step": 1176 + }, + { + "epoch": 0.0951130325865169, + "grad_norm": 3.1933462619781494, + "learning_rate": 9.991936955931618e-06, + "loss": 0.9152, + "step": 1177 + }, + { + "epoch": 0.09519384229984444, + "grad_norm": 3.098817825317383, + "learning_rate": 9.991899766286495e-06, + "loss": 1.11, + "step": 1178 + }, + { + "epoch": 0.09527465201317198, + "grad_norm": 2.6892173290252686, + "learning_rate": 9.991862491142145e-06, + "loss": 1.0359, + "step": 1179 + }, + { + "epoch": 0.09535546172649953, + "grad_norm": 2.912292003631592, + "learning_rate": 9.991825130499208e-06, + "loss": 1.0121, + "step": 1180 + }, + { + "epoch": 0.09543627143982707, + "grad_norm": 2.7129578590393066, + "learning_rate": 9.991787684358326e-06, + "loss": 0.989, + "step": 1181 + }, + { + "epoch": 0.0955170811531546, + "grad_norm": 3.12666654586792, + "learning_rate": 9.991750152720135e-06, + "loss": 1.0979, + "step": 1182 + }, + { + "epoch": 0.09559789086648215, + "grad_norm": 2.8867383003234863, + "learning_rate": 9.991712535585283e-06, + "loss": 1.1116, + "step": 1183 + }, + { + "epoch": 0.09567870057980969, + "grad_norm": 4.017873287200928, + "learning_rate": 9.991674832954413e-06, + "loss": 1.0397, + "step": 1184 + }, + { + "epoch": 0.09575951029313723, + "grad_norm": 2.715961217880249, + "learning_rate": 9.991637044828169e-06, + "loss": 1.0186, + "step": 1185 + }, + { + "epoch": 0.09584032000646478, + "grad_norm": 3.015178918838501, + "learning_rate": 9.991599171207198e-06, + "loss": 0.9089, + "step": 1186 + }, + { + "epoch": 0.09592112971979232, + "grad_norm": 3.0427072048187256, + "learning_rate": 9.991561212092152e-06, + "loss": 1.0281, + "step": 1187 + }, + { + "epoch": 0.09600193943311987, + "grad_norm": 2.909169912338257, + "learning_rate": 9.99152316748368e-06, + "loss": 0.9984, + "step": 1188 + }, + { + "epoch": 0.0960827491464474, + "grad_norm": 3.5129024982452393, + "learning_rate": 9.99148503738243e-06, + "loss": 1.0376, + "step": 1189 + }, + { + "epoch": 0.09616355885977494, + "grad_norm": 3.431941270828247, + "learning_rate": 9.99144682178906e-06, + "loss": 0.919, + "step": 1190 + }, + { + "epoch": 0.09624436857310249, + "grad_norm": 3.0185487270355225, + "learning_rate": 9.99140852070422e-06, + "loss": 0.9442, + "step": 1191 + }, + { + "epoch": 0.09632517828643003, + "grad_norm": 3.1691038608551025, + "learning_rate": 9.99137013412857e-06, + "loss": 1.0793, + "step": 1192 + }, + { + "epoch": 0.09640598799975757, + "grad_norm": 3.392519235610962, + "learning_rate": 9.991331662062766e-06, + "loss": 1.0241, + "step": 1193 + }, + { + "epoch": 0.09648679771308512, + "grad_norm": 3.272472381591797, + "learning_rate": 9.991293104507467e-06, + "loss": 1.1483, + "step": 1194 + }, + { + "epoch": 0.09656760742641265, + "grad_norm": 2.778557300567627, + "learning_rate": 9.991254461463332e-06, + "loss": 0.9363, + "step": 1195 + }, + { + "epoch": 0.09664841713974019, + "grad_norm": 3.2703022956848145, + "learning_rate": 9.991215732931024e-06, + "loss": 1.1747, + "step": 1196 + }, + { + "epoch": 0.09672922685306774, + "grad_norm": 2.765848398208618, + "learning_rate": 9.991176918911207e-06, + "loss": 1.0589, + "step": 1197 + }, + { + "epoch": 0.09681003656639528, + "grad_norm": 3.1882576942443848, + "learning_rate": 9.991138019404545e-06, + "loss": 1.0079, + "step": 1198 + }, + { + "epoch": 0.09689084627972282, + "grad_norm": 2.969113826751709, + "learning_rate": 9.991099034411705e-06, + "loss": 1.0743, + "step": 1199 + }, + { + "epoch": 0.09697165599305037, + "grad_norm": 3.393944501876831, + "learning_rate": 9.991059963933355e-06, + "loss": 1.0501, + "step": 1200 + }, + { + "epoch": 0.0970524657063779, + "grad_norm": 2.5616579055786133, + "learning_rate": 9.99102080797016e-06, + "loss": 1.1143, + "step": 1201 + }, + { + "epoch": 0.09713327541970544, + "grad_norm": 2.8260860443115234, + "learning_rate": 9.990981566522797e-06, + "loss": 1.0439, + "step": 1202 + }, + { + "epoch": 0.09721408513303299, + "grad_norm": 2.793118953704834, + "learning_rate": 9.990942239591934e-06, + "loss": 1.0708, + "step": 1203 + }, + { + "epoch": 0.09729489484636053, + "grad_norm": 2.6585116386413574, + "learning_rate": 9.990902827178246e-06, + "loss": 1.1239, + "step": 1204 + }, + { + "epoch": 0.09737570455968808, + "grad_norm": 2.647916555404663, + "learning_rate": 9.990863329282406e-06, + "loss": 1.0526, + "step": 1205 + }, + { + "epoch": 0.09745651427301562, + "grad_norm": 2.7440760135650635, + "learning_rate": 9.990823745905095e-06, + "loss": 1.0276, + "step": 1206 + }, + { + "epoch": 0.09753732398634316, + "grad_norm": 3.040421962738037, + "learning_rate": 9.990784077046985e-06, + "loss": 1.0668, + "step": 1207 + }, + { + "epoch": 0.0976181336996707, + "grad_norm": 3.0546274185180664, + "learning_rate": 9.990744322708761e-06, + "loss": 0.913, + "step": 1208 + }, + { + "epoch": 0.09769894341299824, + "grad_norm": 3.5525715351104736, + "learning_rate": 9.9907044828911e-06, + "loss": 0.9561, + "step": 1209 + }, + { + "epoch": 0.09777975312632578, + "grad_norm": 3.1630806922912598, + "learning_rate": 9.990664557594687e-06, + "loss": 1.1618, + "step": 1210 + }, + { + "epoch": 0.09786056283965333, + "grad_norm": 3.20241379737854, + "learning_rate": 9.990624546820204e-06, + "loss": 1.1931, + "step": 1211 + }, + { + "epoch": 0.09794137255298087, + "grad_norm": 2.860574722290039, + "learning_rate": 9.990584450568338e-06, + "loss": 0.958, + "step": 1212 + }, + { + "epoch": 0.0980221822663084, + "grad_norm": 3.0059783458709717, + "learning_rate": 9.990544268839773e-06, + "loss": 1.0053, + "step": 1213 + }, + { + "epoch": 0.09810299197963596, + "grad_norm": 3.187241315841675, + "learning_rate": 9.9905040016352e-06, + "loss": 0.9463, + "step": 1214 + }, + { + "epoch": 0.0981838016929635, + "grad_norm": 3.665228843688965, + "learning_rate": 9.990463648955306e-06, + "loss": 1.0178, + "step": 1215 + }, + { + "epoch": 0.09826461140629103, + "grad_norm": 2.5117361545562744, + "learning_rate": 9.990423210800786e-06, + "loss": 0.9685, + "step": 1216 + }, + { + "epoch": 0.09834542111961858, + "grad_norm": 3.453538417816162, + "learning_rate": 9.99038268717233e-06, + "loss": 1.0154, + "step": 1217 + }, + { + "epoch": 0.09842623083294612, + "grad_norm": 3.6725711822509766, + "learning_rate": 9.990342078070632e-06, + "loss": 1.0863, + "step": 1218 + }, + { + "epoch": 0.09850704054627366, + "grad_norm": 2.6737985610961914, + "learning_rate": 9.990301383496389e-06, + "loss": 1.1508, + "step": 1219 + }, + { + "epoch": 0.0985878502596012, + "grad_norm": 3.180436134338379, + "learning_rate": 9.990260603450294e-06, + "loss": 1.0313, + "step": 1220 + }, + { + "epoch": 0.09866865997292874, + "grad_norm": 3.468973159790039, + "learning_rate": 9.99021973793305e-06, + "loss": 1.0211, + "step": 1221 + }, + { + "epoch": 0.0987494696862563, + "grad_norm": 3.042269468307495, + "learning_rate": 9.990178786945356e-06, + "loss": 0.9764, + "step": 1222 + }, + { + "epoch": 0.09883027939958383, + "grad_norm": 2.8079960346221924, + "learning_rate": 9.990137750487912e-06, + "loss": 1.1182, + "step": 1223 + }, + { + "epoch": 0.09891108911291137, + "grad_norm": 2.746098279953003, + "learning_rate": 9.990096628561422e-06, + "loss": 0.958, + "step": 1224 + }, + { + "epoch": 0.09899189882623892, + "grad_norm": 2.608017683029175, + "learning_rate": 9.99005542116659e-06, + "loss": 1.0234, + "step": 1225 + }, + { + "epoch": 0.09907270853956646, + "grad_norm": 3.6061453819274902, + "learning_rate": 9.990014128304122e-06, + "loss": 1.0367, + "step": 1226 + }, + { + "epoch": 0.099153518252894, + "grad_norm": 2.522488832473755, + "learning_rate": 9.989972749974724e-06, + "loss": 1.0161, + "step": 1227 + }, + { + "epoch": 0.09923432796622154, + "grad_norm": 3.319269895553589, + "learning_rate": 9.989931286179106e-06, + "loss": 1.1268, + "step": 1228 + }, + { + "epoch": 0.09931513767954908, + "grad_norm": 2.809621572494507, + "learning_rate": 9.989889736917979e-06, + "loss": 0.9355, + "step": 1229 + }, + { + "epoch": 0.09939594739287662, + "grad_norm": 3.0901947021484375, + "learning_rate": 9.989848102192052e-06, + "loss": 1.1745, + "step": 1230 + }, + { + "epoch": 0.09947675710620417, + "grad_norm": 2.6698098182678223, + "learning_rate": 9.989806382002039e-06, + "loss": 1.0672, + "step": 1231 + }, + { + "epoch": 0.0995575668195317, + "grad_norm": 2.8612308502197266, + "learning_rate": 9.989764576348656e-06, + "loss": 0.9378, + "step": 1232 + }, + { + "epoch": 0.09963837653285924, + "grad_norm": 3.5653903484344482, + "learning_rate": 9.98972268523262e-06, + "loss": 1.0345, + "step": 1233 + }, + { + "epoch": 0.0997191862461868, + "grad_norm": 2.6366031169891357, + "learning_rate": 9.989680708654644e-06, + "loss": 1.011, + "step": 1234 + }, + { + "epoch": 0.09979999595951433, + "grad_norm": 3.223850965499878, + "learning_rate": 9.989638646615452e-06, + "loss": 0.9727, + "step": 1235 + }, + { + "epoch": 0.09988080567284188, + "grad_norm": 3.5510125160217285, + "learning_rate": 9.989596499115759e-06, + "loss": 1.0507, + "step": 1236 + }, + { + "epoch": 0.09996161538616942, + "grad_norm": 3.2256150245666504, + "learning_rate": 9.989554266156291e-06, + "loss": 0.9427, + "step": 1237 + }, + { + "epoch": 0.10004242509949696, + "grad_norm": 2.9781923294067383, + "learning_rate": 9.989511947737772e-06, + "loss": 0.9778, + "step": 1238 + }, + { + "epoch": 0.10012323481282451, + "grad_norm": 2.9848222732543945, + "learning_rate": 9.989469543860924e-06, + "loss": 1.0727, + "step": 1239 + }, + { + "epoch": 0.10020404452615204, + "grad_norm": 2.984651803970337, + "learning_rate": 9.989427054526476e-06, + "loss": 1.0833, + "step": 1240 + }, + { + "epoch": 0.10028485423947958, + "grad_norm": 2.8715219497680664, + "learning_rate": 9.98938447973515e-06, + "loss": 1.0833, + "step": 1241 + }, + { + "epoch": 0.10036566395280713, + "grad_norm": 2.879469156265259, + "learning_rate": 9.989341819487683e-06, + "loss": 1.1641, + "step": 1242 + }, + { + "epoch": 0.10044647366613467, + "grad_norm": 3.4816324710845947, + "learning_rate": 9.989299073784801e-06, + "loss": 1.0091, + "step": 1243 + }, + { + "epoch": 0.1005272833794622, + "grad_norm": 3.2763583660125732, + "learning_rate": 9.989256242627237e-06, + "loss": 1.104, + "step": 1244 + }, + { + "epoch": 0.10060809309278976, + "grad_norm": 3.425124406814575, + "learning_rate": 9.989213326015724e-06, + "loss": 1.0348, + "step": 1245 + }, + { + "epoch": 0.1006889028061173, + "grad_norm": 2.846920967102051, + "learning_rate": 9.989170323950999e-06, + "loss": 0.967, + "step": 1246 + }, + { + "epoch": 0.10076971251944483, + "grad_norm": 2.907984495162964, + "learning_rate": 9.989127236433795e-06, + "loss": 1.0871, + "step": 1247 + }, + { + "epoch": 0.10085052223277238, + "grad_norm": 2.9998886585235596, + "learning_rate": 9.989084063464855e-06, + "loss": 1.004, + "step": 1248 + }, + { + "epoch": 0.10093133194609992, + "grad_norm": 2.672520637512207, + "learning_rate": 9.989040805044914e-06, + "loss": 1.1658, + "step": 1249 + }, + { + "epoch": 0.10101214165942746, + "grad_norm": 2.842155694961548, + "learning_rate": 9.988997461174717e-06, + "loss": 1.0719, + "step": 1250 + }, + { + "epoch": 0.10109295137275501, + "grad_norm": 3.131115198135376, + "learning_rate": 9.988954031855001e-06, + "loss": 1.0328, + "step": 1251 + }, + { + "epoch": 0.10117376108608254, + "grad_norm": 3.119647264480591, + "learning_rate": 9.988910517086514e-06, + "loss": 0.9574, + "step": 1252 + }, + { + "epoch": 0.1012545707994101, + "grad_norm": 2.5935230255126953, + "learning_rate": 9.988866916870001e-06, + "loss": 1.0931, + "step": 1253 + }, + { + "epoch": 0.10133538051273763, + "grad_norm": 3.033775806427002, + "learning_rate": 9.988823231206208e-06, + "loss": 1.0586, + "step": 1254 + }, + { + "epoch": 0.10141619022606517, + "grad_norm": 2.881053924560547, + "learning_rate": 9.98877946009588e-06, + "loss": 1.1503, + "step": 1255 + }, + { + "epoch": 0.10149699993939272, + "grad_norm": 2.9397523403167725, + "learning_rate": 9.988735603539775e-06, + "loss": 0.9481, + "step": 1256 + }, + { + "epoch": 0.10157780965272026, + "grad_norm": 3.23587965965271, + "learning_rate": 9.988691661538634e-06, + "loss": 1.1529, + "step": 1257 + }, + { + "epoch": 0.1016586193660478, + "grad_norm": 3.011920690536499, + "learning_rate": 9.988647634093218e-06, + "loss": 0.9915, + "step": 1258 + }, + { + "epoch": 0.10173942907937535, + "grad_norm": 3.658949136734009, + "learning_rate": 9.988603521204276e-06, + "loss": 0.9997, + "step": 1259 + }, + { + "epoch": 0.10182023879270288, + "grad_norm": 3.2877025604248047, + "learning_rate": 9.988559322872567e-06, + "loss": 1.0461, + "step": 1260 + }, + { + "epoch": 0.10190104850603042, + "grad_norm": 3.080794334411621, + "learning_rate": 9.988515039098845e-06, + "loss": 0.9342, + "step": 1261 + }, + { + "epoch": 0.10198185821935797, + "grad_norm": 3.1314265727996826, + "learning_rate": 9.988470669883869e-06, + "loss": 1.0981, + "step": 1262 + }, + { + "epoch": 0.10206266793268551, + "grad_norm": 2.952272415161133, + "learning_rate": 9.9884262152284e-06, + "loss": 1.0115, + "step": 1263 + }, + { + "epoch": 0.10214347764601305, + "grad_norm": 3.143523693084717, + "learning_rate": 9.988381675133202e-06, + "loss": 1.0682, + "step": 1264 + }, + { + "epoch": 0.1022242873593406, + "grad_norm": 2.6488254070281982, + "learning_rate": 9.98833704959903e-06, + "loss": 1.2389, + "step": 1265 + }, + { + "epoch": 0.10230509707266813, + "grad_norm": 3.380514144897461, + "learning_rate": 9.988292338626658e-06, + "loss": 1.0109, + "step": 1266 + }, + { + "epoch": 0.10238590678599567, + "grad_norm": 2.8261666297912598, + "learning_rate": 9.988247542216844e-06, + "loss": 1.057, + "step": 1267 + }, + { + "epoch": 0.10246671649932322, + "grad_norm": 3.0670905113220215, + "learning_rate": 9.988202660370358e-06, + "loss": 1.1282, + "step": 1268 + }, + { + "epoch": 0.10254752621265076, + "grad_norm": 2.9264938831329346, + "learning_rate": 9.988157693087971e-06, + "loss": 0.971, + "step": 1269 + }, + { + "epoch": 0.10262833592597831, + "grad_norm": 2.8461177349090576, + "learning_rate": 9.98811264037045e-06, + "loss": 1.0776, + "step": 1270 + }, + { + "epoch": 0.10270914563930585, + "grad_norm": 2.623901844024658, + "learning_rate": 9.988067502218569e-06, + "loss": 1.096, + "step": 1271 + }, + { + "epoch": 0.10278995535263338, + "grad_norm": 2.9775118827819824, + "learning_rate": 9.988022278633097e-06, + "loss": 0.9979, + "step": 1272 + }, + { + "epoch": 0.10287076506596093, + "grad_norm": 3.2639927864074707, + "learning_rate": 9.987976969614816e-06, + "loss": 1.0962, + "step": 1273 + }, + { + "epoch": 0.10295157477928847, + "grad_norm": 2.864597797393799, + "learning_rate": 9.987931575164495e-06, + "loss": 0.9859, + "step": 1274 + }, + { + "epoch": 0.10303238449261601, + "grad_norm": 2.9163618087768555, + "learning_rate": 9.987886095282916e-06, + "loss": 1.0499, + "step": 1275 + }, + { + "epoch": 0.10311319420594356, + "grad_norm": 3.827516794204712, + "learning_rate": 9.987840529970853e-06, + "loss": 0.9777, + "step": 1276 + }, + { + "epoch": 0.1031940039192711, + "grad_norm": 2.858494758605957, + "learning_rate": 9.987794879229091e-06, + "loss": 0.9267, + "step": 1277 + }, + { + "epoch": 0.10327481363259863, + "grad_norm": 2.7237348556518555, + "learning_rate": 9.98774914305841e-06, + "loss": 1.0345, + "step": 1278 + }, + { + "epoch": 0.10335562334592618, + "grad_norm": 2.8835742473602295, + "learning_rate": 9.987703321459594e-06, + "loss": 1.0166, + "step": 1279 + }, + { + "epoch": 0.10343643305925372, + "grad_norm": 3.4978060722351074, + "learning_rate": 9.987657414433428e-06, + "loss": 0.992, + "step": 1280 + }, + { + "epoch": 0.10351724277258126, + "grad_norm": 2.936861753463745, + "learning_rate": 9.987611421980697e-06, + "loss": 1.01, + "step": 1281 + }, + { + "epoch": 0.10359805248590881, + "grad_norm": 3.074481725692749, + "learning_rate": 9.98756534410219e-06, + "loss": 0.9462, + "step": 1282 + }, + { + "epoch": 0.10367886219923635, + "grad_norm": 3.292534828186035, + "learning_rate": 9.987519180798696e-06, + "loss": 1.0477, + "step": 1283 + }, + { + "epoch": 0.10375967191256388, + "grad_norm": 2.9260239601135254, + "learning_rate": 9.987472932071004e-06, + "loss": 1.0362, + "step": 1284 + }, + { + "epoch": 0.10384048162589143, + "grad_norm": 2.984226942062378, + "learning_rate": 9.987426597919908e-06, + "loss": 1.0361, + "step": 1285 + }, + { + "epoch": 0.10392129133921897, + "grad_norm": 3.5397887229919434, + "learning_rate": 9.987380178346203e-06, + "loss": 1.0109, + "step": 1286 + }, + { + "epoch": 0.10400210105254652, + "grad_norm": 3.0161139965057373, + "learning_rate": 9.98733367335068e-06, + "loss": 1.0292, + "step": 1287 + }, + { + "epoch": 0.10408291076587406, + "grad_norm": 2.910106658935547, + "learning_rate": 9.987287082934139e-06, + "loss": 1.1038, + "step": 1288 + }, + { + "epoch": 0.1041637204792016, + "grad_norm": 2.628661870956421, + "learning_rate": 9.987240407097376e-06, + "loss": 0.9265, + "step": 1289 + }, + { + "epoch": 0.10424453019252915, + "grad_norm": 3.4557712078094482, + "learning_rate": 9.987193645841191e-06, + "loss": 1.0006, + "step": 1290 + }, + { + "epoch": 0.10432533990585668, + "grad_norm": 2.7255899906158447, + "learning_rate": 9.987146799166386e-06, + "loss": 0.909, + "step": 1291 + }, + { + "epoch": 0.10440614961918422, + "grad_norm": 2.7850592136383057, + "learning_rate": 9.98709986707376e-06, + "loss": 0.9859, + "step": 1292 + }, + { + "epoch": 0.10448695933251177, + "grad_norm": 2.9532461166381836, + "learning_rate": 9.98705284956412e-06, + "loss": 1.1035, + "step": 1293 + }, + { + "epoch": 0.10456776904583931, + "grad_norm": 2.9681034088134766, + "learning_rate": 9.987005746638272e-06, + "loss": 1.1082, + "step": 1294 + }, + { + "epoch": 0.10464857875916685, + "grad_norm": 2.5660200119018555, + "learning_rate": 9.986958558297021e-06, + "loss": 1.0165, + "step": 1295 + }, + { + "epoch": 0.1047293884724944, + "grad_norm": 3.059093952178955, + "learning_rate": 9.986911284541177e-06, + "loss": 0.9627, + "step": 1296 + }, + { + "epoch": 0.10481019818582193, + "grad_norm": 3.7301979064941406, + "learning_rate": 9.986863925371545e-06, + "loss": 1.1762, + "step": 1297 + }, + { + "epoch": 0.10489100789914947, + "grad_norm": 3.185795545578003, + "learning_rate": 9.986816480788941e-06, + "loss": 1.1375, + "step": 1298 + }, + { + "epoch": 0.10497181761247702, + "grad_norm": 2.9829509258270264, + "learning_rate": 9.986768950794176e-06, + "loss": 0.986, + "step": 1299 + }, + { + "epoch": 0.10505262732580456, + "grad_norm": 2.644713878631592, + "learning_rate": 9.986721335388064e-06, + "loss": 1.0705, + "step": 1300 + }, + { + "epoch": 0.10513343703913211, + "grad_norm": 2.5542094707489014, + "learning_rate": 9.98667363457142e-06, + "loss": 1.0634, + "step": 1301 + }, + { + "epoch": 0.10521424675245965, + "grad_norm": 3.101796865463257, + "learning_rate": 9.986625848345063e-06, + "loss": 1.0187, + "step": 1302 + }, + { + "epoch": 0.10529505646578718, + "grad_norm": 3.5837759971618652, + "learning_rate": 9.986577976709808e-06, + "loss": 0.8596, + "step": 1303 + }, + { + "epoch": 0.10537586617911474, + "grad_norm": 3.0948948860168457, + "learning_rate": 9.986530019666477e-06, + "loss": 1.1609, + "step": 1304 + }, + { + "epoch": 0.10545667589244227, + "grad_norm": 3.398942232131958, + "learning_rate": 9.986481977215892e-06, + "loss": 1.0362, + "step": 1305 + }, + { + "epoch": 0.10553748560576981, + "grad_norm": 2.6761116981506348, + "learning_rate": 9.986433849358876e-06, + "loss": 1.1106, + "step": 1306 + }, + { + "epoch": 0.10561829531909736, + "grad_norm": 3.135209560394287, + "learning_rate": 9.986385636096252e-06, + "loss": 1.0177, + "step": 1307 + }, + { + "epoch": 0.1056991050324249, + "grad_norm": 2.8185837268829346, + "learning_rate": 9.986337337428847e-06, + "loss": 1.0082, + "step": 1308 + }, + { + "epoch": 0.10577991474575243, + "grad_norm": 2.7826640605926514, + "learning_rate": 9.986288953357486e-06, + "loss": 0.9948, + "step": 1309 + }, + { + "epoch": 0.10586072445907999, + "grad_norm": 2.922977924346924, + "learning_rate": 9.986240483883e-06, + "loss": 1.0352, + "step": 1310 + }, + { + "epoch": 0.10594153417240752, + "grad_norm": 2.787425994873047, + "learning_rate": 9.986191929006217e-06, + "loss": 1.0207, + "step": 1311 + }, + { + "epoch": 0.10602234388573506, + "grad_norm": 3.1789329051971436, + "learning_rate": 9.986143288727972e-06, + "loss": 0.9805, + "step": 1312 + }, + { + "epoch": 0.10610315359906261, + "grad_norm": 3.885017156600952, + "learning_rate": 9.986094563049096e-06, + "loss": 0.9988, + "step": 1313 + }, + { + "epoch": 0.10618396331239015, + "grad_norm": 2.5836968421936035, + "learning_rate": 9.986045751970423e-06, + "loss": 0.9195, + "step": 1314 + }, + { + "epoch": 0.10626477302571768, + "grad_norm": 2.9889109134674072, + "learning_rate": 9.98599685549279e-06, + "loss": 0.8965, + "step": 1315 + }, + { + "epoch": 0.10634558273904524, + "grad_norm": 2.8345937728881836, + "learning_rate": 9.985947873617033e-06, + "loss": 1.0412, + "step": 1316 + }, + { + "epoch": 0.10642639245237277, + "grad_norm": 2.8975749015808105, + "learning_rate": 9.985898806343995e-06, + "loss": 0.9263, + "step": 1317 + }, + { + "epoch": 0.10650720216570032, + "grad_norm": 3.2546446323394775, + "learning_rate": 9.985849653674512e-06, + "loss": 1.1148, + "step": 1318 + }, + { + "epoch": 0.10658801187902786, + "grad_norm": 2.811129093170166, + "learning_rate": 9.985800415609426e-06, + "loss": 1.1397, + "step": 1319 + }, + { + "epoch": 0.1066688215923554, + "grad_norm": 2.553903102874756, + "learning_rate": 9.985751092149581e-06, + "loss": 0.939, + "step": 1320 + }, + { + "epoch": 0.10674963130568295, + "grad_norm": 2.546623468399048, + "learning_rate": 9.985701683295825e-06, + "loss": 1.0799, + "step": 1321 + }, + { + "epoch": 0.10683044101901049, + "grad_norm": 3.3942575454711914, + "learning_rate": 9.985652189049001e-06, + "loss": 1.028, + "step": 1322 + }, + { + "epoch": 0.10691125073233802, + "grad_norm": 2.967158079147339, + "learning_rate": 9.985602609409957e-06, + "loss": 1.1134, + "step": 1323 + }, + { + "epoch": 0.10699206044566557, + "grad_norm": 3.029263734817505, + "learning_rate": 9.985552944379544e-06, + "loss": 1.0863, + "step": 1324 + }, + { + "epoch": 0.10707287015899311, + "grad_norm": 2.7675929069519043, + "learning_rate": 9.98550319395861e-06, + "loss": 1.0656, + "step": 1325 + }, + { + "epoch": 0.10715367987232065, + "grad_norm": 2.4967398643493652, + "learning_rate": 9.985453358148008e-06, + "loss": 0.9626, + "step": 1326 + }, + { + "epoch": 0.1072344895856482, + "grad_norm": 3.1739935874938965, + "learning_rate": 9.985403436948593e-06, + "loss": 0.8729, + "step": 1327 + }, + { + "epoch": 0.10731529929897574, + "grad_norm": 2.97395658493042, + "learning_rate": 9.985353430361219e-06, + "loss": 0.9491, + "step": 1328 + }, + { + "epoch": 0.10739610901230327, + "grad_norm": 2.934847116470337, + "learning_rate": 9.985303338386743e-06, + "loss": 1.0922, + "step": 1329 + }, + { + "epoch": 0.10747691872563082, + "grad_norm": 2.6325595378875732, + "learning_rate": 9.98525316102602e-06, + "loss": 1.0173, + "step": 1330 + }, + { + "epoch": 0.10755772843895836, + "grad_norm": 2.7959065437316895, + "learning_rate": 9.985202898279914e-06, + "loss": 1.0029, + "step": 1331 + }, + { + "epoch": 0.1076385381522859, + "grad_norm": 2.5871143341064453, + "learning_rate": 9.985152550149283e-06, + "loss": 1.0173, + "step": 1332 + }, + { + "epoch": 0.10771934786561345, + "grad_norm": 3.379809617996216, + "learning_rate": 9.985102116634991e-06, + "loss": 1.0963, + "step": 1333 + }, + { + "epoch": 0.10780015757894099, + "grad_norm": 3.1764838695526123, + "learning_rate": 9.9850515977379e-06, + "loss": 1.1185, + "step": 1334 + }, + { + "epoch": 0.10788096729226854, + "grad_norm": 2.8184707164764404, + "learning_rate": 9.985000993458876e-06, + "loss": 0.8538, + "step": 1335 + }, + { + "epoch": 0.10796177700559607, + "grad_norm": 3.082862377166748, + "learning_rate": 9.984950303798787e-06, + "loss": 1.0536, + "step": 1336 + }, + { + "epoch": 0.10804258671892361, + "grad_norm": 3.090818405151367, + "learning_rate": 9.984899528758498e-06, + "loss": 1.0958, + "step": 1337 + }, + { + "epoch": 0.10812339643225116, + "grad_norm": 3.066433906555176, + "learning_rate": 9.984848668338883e-06, + "loss": 1.0357, + "step": 1338 + }, + { + "epoch": 0.1082042061455787, + "grad_norm": 2.9226813316345215, + "learning_rate": 9.984797722540808e-06, + "loss": 0.9513, + "step": 1339 + }, + { + "epoch": 0.10828501585890624, + "grad_norm": 2.9141769409179688, + "learning_rate": 9.984746691365153e-06, + "loss": 0.8497, + "step": 1340 + }, + { + "epoch": 0.10836582557223379, + "grad_norm": 2.980806589126587, + "learning_rate": 9.984695574812785e-06, + "loss": 0.9877, + "step": 1341 + }, + { + "epoch": 0.10844663528556132, + "grad_norm": 2.9618945121765137, + "learning_rate": 9.984644372884579e-06, + "loss": 1.0403, + "step": 1342 + }, + { + "epoch": 0.10852744499888886, + "grad_norm": 2.344740152359009, + "learning_rate": 9.984593085581419e-06, + "loss": 1.1414, + "step": 1343 + }, + { + "epoch": 0.10860825471221641, + "grad_norm": 3.0067477226257324, + "learning_rate": 9.984541712904178e-06, + "loss": 1.0102, + "step": 1344 + }, + { + "epoch": 0.10868906442554395, + "grad_norm": 2.7962794303894043, + "learning_rate": 9.984490254853737e-06, + "loss": 1.0444, + "step": 1345 + }, + { + "epoch": 0.10876987413887149, + "grad_norm": 2.8900249004364014, + "learning_rate": 9.984438711430978e-06, + "loss": 1.0275, + "step": 1346 + }, + { + "epoch": 0.10885068385219904, + "grad_norm": 3.3476011753082275, + "learning_rate": 9.984387082636783e-06, + "loss": 1.0067, + "step": 1347 + }, + { + "epoch": 0.10893149356552657, + "grad_norm": 2.934443473815918, + "learning_rate": 9.984335368472039e-06, + "loss": 1.0258, + "step": 1348 + }, + { + "epoch": 0.10901230327885413, + "grad_norm": 3.3023133277893066, + "learning_rate": 9.984283568937625e-06, + "loss": 0.9624, + "step": 1349 + }, + { + "epoch": 0.10909311299218166, + "grad_norm": 3.7269654273986816, + "learning_rate": 9.984231684034436e-06, + "loss": 1.0803, + "step": 1350 + }, + { + "epoch": 0.1091739227055092, + "grad_norm": 3.2603659629821777, + "learning_rate": 9.984179713763356e-06, + "loss": 1.1566, + "step": 1351 + }, + { + "epoch": 0.10925473241883675, + "grad_norm": 3.2887766361236572, + "learning_rate": 9.984127658125278e-06, + "loss": 0.994, + "step": 1352 + }, + { + "epoch": 0.10933554213216429, + "grad_norm": 3.1544361114501953, + "learning_rate": 9.98407551712109e-06, + "loss": 1.0248, + "step": 1353 + }, + { + "epoch": 0.10941635184549182, + "grad_norm": 2.8217058181762695, + "learning_rate": 9.984023290751688e-06, + "loss": 1.1021, + "step": 1354 + }, + { + "epoch": 0.10949716155881938, + "grad_norm": 3.799391269683838, + "learning_rate": 9.983970979017966e-06, + "loss": 1.0307, + "step": 1355 + }, + { + "epoch": 0.10957797127214691, + "grad_norm": 3.279231071472168, + "learning_rate": 9.983918581920817e-06, + "loss": 1.011, + "step": 1356 + }, + { + "epoch": 0.10965878098547445, + "grad_norm": 3.18418288230896, + "learning_rate": 9.983866099461144e-06, + "loss": 1.014, + "step": 1357 + }, + { + "epoch": 0.109739590698802, + "grad_norm": 2.627591371536255, + "learning_rate": 9.983813531639843e-06, + "loss": 1.1113, + "step": 1358 + }, + { + "epoch": 0.10982040041212954, + "grad_norm": 2.9131274223327637, + "learning_rate": 9.983760878457812e-06, + "loss": 1.0794, + "step": 1359 + }, + { + "epoch": 0.10990121012545707, + "grad_norm": 2.730416774749756, + "learning_rate": 9.983708139915956e-06, + "loss": 0.9852, + "step": 1360 + }, + { + "epoch": 0.10998201983878463, + "grad_norm": 2.822190761566162, + "learning_rate": 9.983655316015178e-06, + "loss": 0.9026, + "step": 1361 + }, + { + "epoch": 0.11006282955211216, + "grad_norm": 2.740520715713501, + "learning_rate": 9.983602406756381e-06, + "loss": 1.0623, + "step": 1362 + }, + { + "epoch": 0.1101436392654397, + "grad_norm": 3.0149307250976562, + "learning_rate": 9.983549412140475e-06, + "loss": 1.0159, + "step": 1363 + }, + { + "epoch": 0.11022444897876725, + "grad_norm": 2.778958797454834, + "learning_rate": 9.98349633216836e-06, + "loss": 1.0258, + "step": 1364 + }, + { + "epoch": 0.11030525869209479, + "grad_norm": 2.7317237854003906, + "learning_rate": 9.983443166840953e-06, + "loss": 1.0187, + "step": 1365 + }, + { + "epoch": 0.11038606840542234, + "grad_norm": 2.9775919914245605, + "learning_rate": 9.983389916159164e-06, + "loss": 0.9955, + "step": 1366 + }, + { + "epoch": 0.11046687811874988, + "grad_norm": 3.141310214996338, + "learning_rate": 9.983336580123899e-06, + "loss": 1.0282, + "step": 1367 + }, + { + "epoch": 0.11054768783207741, + "grad_norm": 3.1330995559692383, + "learning_rate": 9.983283158736077e-06, + "loss": 1.1138, + "step": 1368 + }, + { + "epoch": 0.11062849754540496, + "grad_norm": 2.7490227222442627, + "learning_rate": 9.98322965199661e-06, + "loss": 1.0073, + "step": 1369 + }, + { + "epoch": 0.1107093072587325, + "grad_norm": 3.0610904693603516, + "learning_rate": 9.98317605990642e-06, + "loss": 1.0441, + "step": 1370 + }, + { + "epoch": 0.11079011697206004, + "grad_norm": 2.916208505630493, + "learning_rate": 9.983122382466416e-06, + "loss": 0.974, + "step": 1371 + }, + { + "epoch": 0.11087092668538759, + "grad_norm": 3.0176069736480713, + "learning_rate": 9.983068619677522e-06, + "loss": 1.1055, + "step": 1372 + }, + { + "epoch": 0.11095173639871513, + "grad_norm": 3.2212870121002197, + "learning_rate": 9.98301477154066e-06, + "loss": 1.0239, + "step": 1373 + }, + { + "epoch": 0.11103254611204266, + "grad_norm": 2.967841386795044, + "learning_rate": 9.982960838056752e-06, + "loss": 0.976, + "step": 1374 + }, + { + "epoch": 0.11111335582537021, + "grad_norm": 3.0756235122680664, + "learning_rate": 9.98290681922672e-06, + "loss": 1.0716, + "step": 1375 + }, + { + "epoch": 0.11119416553869775, + "grad_norm": 2.6591484546661377, + "learning_rate": 9.98285271505149e-06, + "loss": 1.1386, + "step": 1376 + }, + { + "epoch": 0.11127497525202529, + "grad_norm": 2.74874210357666, + "learning_rate": 9.982798525531988e-06, + "loss": 1.0843, + "step": 1377 + }, + { + "epoch": 0.11135578496535284, + "grad_norm": 3.1984851360321045, + "learning_rate": 9.982744250669144e-06, + "loss": 1.0687, + "step": 1378 + }, + { + "epoch": 0.11143659467868038, + "grad_norm": 2.9788684844970703, + "learning_rate": 9.982689890463886e-06, + "loss": 1.1124, + "step": 1379 + }, + { + "epoch": 0.11151740439200791, + "grad_norm": 2.97041392326355, + "learning_rate": 9.982635444917146e-06, + "loss": 0.976, + "step": 1380 + }, + { + "epoch": 0.11159821410533546, + "grad_norm": 2.7469606399536133, + "learning_rate": 9.982580914029855e-06, + "loss": 1.0973, + "step": 1381 + }, + { + "epoch": 0.111679023818663, + "grad_norm": 3.545305013656616, + "learning_rate": 9.982526297802948e-06, + "loss": 1.0933, + "step": 1382 + }, + { + "epoch": 0.11175983353199055, + "grad_norm": 2.8980941772460938, + "learning_rate": 9.982471596237363e-06, + "loss": 0.9828, + "step": 1383 + }, + { + "epoch": 0.11184064324531809, + "grad_norm": 2.698004722595215, + "learning_rate": 9.982416809334031e-06, + "loss": 0.9321, + "step": 1384 + }, + { + "epoch": 0.11192145295864563, + "grad_norm": 3.128380298614502, + "learning_rate": 9.982361937093896e-06, + "loss": 1.0612, + "step": 1385 + }, + { + "epoch": 0.11200226267197318, + "grad_norm": 2.8916828632354736, + "learning_rate": 9.982306979517895e-06, + "loss": 1.0464, + "step": 1386 + }, + { + "epoch": 0.11208307238530071, + "grad_norm": 3.5671095848083496, + "learning_rate": 9.982251936606968e-06, + "loss": 1.0209, + "step": 1387 + }, + { + "epoch": 0.11216388209862825, + "grad_norm": 3.4572086334228516, + "learning_rate": 9.982196808362061e-06, + "loss": 1.0191, + "step": 1388 + }, + { + "epoch": 0.1122446918119558, + "grad_norm": 2.9889228343963623, + "learning_rate": 9.982141594784117e-06, + "loss": 1.0063, + "step": 1389 + }, + { + "epoch": 0.11232550152528334, + "grad_norm": 2.919299602508545, + "learning_rate": 9.982086295874083e-06, + "loss": 0.8683, + "step": 1390 + }, + { + "epoch": 0.11240631123861088, + "grad_norm": 2.9756603240966797, + "learning_rate": 9.982030911632903e-06, + "loss": 0.9901, + "step": 1391 + }, + { + "epoch": 0.11248712095193843, + "grad_norm": 2.9206433296203613, + "learning_rate": 9.981975442061527e-06, + "loss": 0.9961, + "step": 1392 + }, + { + "epoch": 0.11256793066526596, + "grad_norm": 2.865624189376831, + "learning_rate": 9.981919887160907e-06, + "loss": 1.0601, + "step": 1393 + }, + { + "epoch": 0.1126487403785935, + "grad_norm": 2.8166418075561523, + "learning_rate": 9.98186424693199e-06, + "loss": 0.9622, + "step": 1394 + }, + { + "epoch": 0.11272955009192105, + "grad_norm": 2.8043010234832764, + "learning_rate": 9.981808521375733e-06, + "loss": 1.0692, + "step": 1395 + }, + { + "epoch": 0.11281035980524859, + "grad_norm": 3.0252106189727783, + "learning_rate": 9.98175271049309e-06, + "loss": 1.0087, + "step": 1396 + }, + { + "epoch": 0.11289116951857613, + "grad_norm": 3.2231223583221436, + "learning_rate": 9.981696814285016e-06, + "loss": 1.0396, + "step": 1397 + }, + { + "epoch": 0.11297197923190368, + "grad_norm": 3.1894619464874268, + "learning_rate": 9.981640832752469e-06, + "loss": 1.0445, + "step": 1398 + }, + { + "epoch": 0.11305278894523121, + "grad_norm": 2.6307520866394043, + "learning_rate": 9.981584765896406e-06, + "loss": 1.0222, + "step": 1399 + }, + { + "epoch": 0.11313359865855876, + "grad_norm": 3.258815050125122, + "learning_rate": 9.981528613717789e-06, + "loss": 1.1528, + "step": 1400 + }, + { + "epoch": 0.1132144083718863, + "grad_norm": 2.789964199066162, + "learning_rate": 9.981472376217579e-06, + "loss": 1.01, + "step": 1401 + }, + { + "epoch": 0.11329521808521384, + "grad_norm": 2.6406161785125732, + "learning_rate": 9.981416053396741e-06, + "loss": 0.9244, + "step": 1402 + }, + { + "epoch": 0.11337602779854139, + "grad_norm": 2.730431318283081, + "learning_rate": 9.981359645256237e-06, + "loss": 1.0405, + "step": 1403 + }, + { + "epoch": 0.11345683751186893, + "grad_norm": 2.996708631515503, + "learning_rate": 9.981303151797036e-06, + "loss": 1.098, + "step": 1404 + }, + { + "epoch": 0.11353764722519646, + "grad_norm": 2.910529375076294, + "learning_rate": 9.981246573020102e-06, + "loss": 0.9583, + "step": 1405 + }, + { + "epoch": 0.11361845693852402, + "grad_norm": 2.94097638130188, + "learning_rate": 9.981189908926407e-06, + "loss": 1.0203, + "step": 1406 + }, + { + "epoch": 0.11369926665185155, + "grad_norm": 2.6606483459472656, + "learning_rate": 9.981133159516921e-06, + "loss": 0.9185, + "step": 1407 + }, + { + "epoch": 0.11378007636517909, + "grad_norm": 2.9483907222747803, + "learning_rate": 9.981076324792615e-06, + "loss": 1.0317, + "step": 1408 + }, + { + "epoch": 0.11386088607850664, + "grad_norm": 3.210864543914795, + "learning_rate": 9.981019404754462e-06, + "loss": 0.9826, + "step": 1409 + }, + { + "epoch": 0.11394169579183418, + "grad_norm": 2.6373202800750732, + "learning_rate": 9.98096239940344e-06, + "loss": 1.0134, + "step": 1410 + }, + { + "epoch": 0.11402250550516171, + "grad_norm": 2.9394280910491943, + "learning_rate": 9.980905308740521e-06, + "loss": 1.1652, + "step": 1411 + }, + { + "epoch": 0.11410331521848927, + "grad_norm": 3.0923264026641846, + "learning_rate": 9.980848132766688e-06, + "loss": 1.0251, + "step": 1412 + }, + { + "epoch": 0.1141841249318168, + "grad_norm": 3.204538583755493, + "learning_rate": 9.980790871482914e-06, + "loss": 1.0652, + "step": 1413 + }, + { + "epoch": 0.11426493464514435, + "grad_norm": 2.6705994606018066, + "learning_rate": 9.980733524890186e-06, + "loss": 1.1874, + "step": 1414 + }, + { + "epoch": 0.11434574435847189, + "grad_norm": 2.97686505317688, + "learning_rate": 9.980676092989481e-06, + "loss": 0.906, + "step": 1415 + }, + { + "epoch": 0.11442655407179943, + "grad_norm": 2.7322659492492676, + "learning_rate": 9.980618575781788e-06, + "loss": 1.0015, + "step": 1416 + }, + { + "epoch": 0.11450736378512698, + "grad_norm": 3.1375131607055664, + "learning_rate": 9.980560973268088e-06, + "loss": 0.9007, + "step": 1417 + }, + { + "epoch": 0.11458817349845452, + "grad_norm": 2.9137816429138184, + "learning_rate": 9.980503285449368e-06, + "loss": 1.0052, + "step": 1418 + }, + { + "epoch": 0.11466898321178205, + "grad_norm": 2.709336280822754, + "learning_rate": 9.980445512326616e-06, + "loss": 1.0539, + "step": 1419 + }, + { + "epoch": 0.1147497929251096, + "grad_norm": 3.1966333389282227, + "learning_rate": 9.980387653900822e-06, + "loss": 0.9864, + "step": 1420 + }, + { + "epoch": 0.11483060263843714, + "grad_norm": 2.9226574897766113, + "learning_rate": 9.98032971017298e-06, + "loss": 1.1161, + "step": 1421 + }, + { + "epoch": 0.11491141235176468, + "grad_norm": 3.04280161857605, + "learning_rate": 9.980271681144078e-06, + "loss": 1.0011, + "step": 1422 + }, + { + "epoch": 0.11499222206509223, + "grad_norm": 2.8626012802124023, + "learning_rate": 9.980213566815111e-06, + "loss": 1.1177, + "step": 1423 + }, + { + "epoch": 0.11507303177841977, + "grad_norm": 2.996809720993042, + "learning_rate": 9.980155367187077e-06, + "loss": 1.014, + "step": 1424 + }, + { + "epoch": 0.1151538414917473, + "grad_norm": 2.845885753631592, + "learning_rate": 9.980097082260968e-06, + "loss": 1.04, + "step": 1425 + }, + { + "epoch": 0.11523465120507485, + "grad_norm": 2.825157403945923, + "learning_rate": 9.980038712037788e-06, + "loss": 1.13, + "step": 1426 + }, + { + "epoch": 0.11531546091840239, + "grad_norm": 3.3714938163757324, + "learning_rate": 9.979980256518534e-06, + "loss": 1.0191, + "step": 1427 + }, + { + "epoch": 0.11539627063172993, + "grad_norm": 3.208986520767212, + "learning_rate": 9.979921715704204e-06, + "loss": 1.1193, + "step": 1428 + }, + { + "epoch": 0.11547708034505748, + "grad_norm": 3.0360562801361084, + "learning_rate": 9.979863089595804e-06, + "loss": 1.0445, + "step": 1429 + }, + { + "epoch": 0.11555789005838502, + "grad_norm": 2.9771480560302734, + "learning_rate": 9.979804378194339e-06, + "loss": 0.9945, + "step": 1430 + }, + { + "epoch": 0.11563869977171257, + "grad_norm": 2.920020341873169, + "learning_rate": 9.979745581500815e-06, + "loss": 1.0168, + "step": 1431 + }, + { + "epoch": 0.1157195094850401, + "grad_norm": 3.5531094074249268, + "learning_rate": 9.979686699516236e-06, + "loss": 1.1163, + "step": 1432 + }, + { + "epoch": 0.11580031919836764, + "grad_norm": 2.9046502113342285, + "learning_rate": 9.979627732241613e-06, + "loss": 0.9846, + "step": 1433 + }, + { + "epoch": 0.11588112891169519, + "grad_norm": 3.326899528503418, + "learning_rate": 9.979568679677952e-06, + "loss": 1.0666, + "step": 1434 + }, + { + "epoch": 0.11596193862502273, + "grad_norm": 2.7189176082611084, + "learning_rate": 9.97950954182627e-06, + "loss": 1.0358, + "step": 1435 + }, + { + "epoch": 0.11604274833835027, + "grad_norm": 2.8128626346588135, + "learning_rate": 9.979450318687576e-06, + "loss": 1.0221, + "step": 1436 + }, + { + "epoch": 0.11612355805167782, + "grad_norm": 2.7262070178985596, + "learning_rate": 9.979391010262885e-06, + "loss": 0.9747, + "step": 1437 + }, + { + "epoch": 0.11620436776500535, + "grad_norm": 2.7805073261260986, + "learning_rate": 9.979331616553215e-06, + "loss": 0.9004, + "step": 1438 + }, + { + "epoch": 0.11628517747833289, + "grad_norm": 2.975998878479004, + "learning_rate": 9.979272137559582e-06, + "loss": 1.0997, + "step": 1439 + }, + { + "epoch": 0.11636598719166044, + "grad_norm": 3.03562068939209, + "learning_rate": 9.979212573283002e-06, + "loss": 0.9495, + "step": 1440 + }, + { + "epoch": 0.11644679690498798, + "grad_norm": 3.038222074508667, + "learning_rate": 9.9791529237245e-06, + "loss": 1.0314, + "step": 1441 + }, + { + "epoch": 0.11652760661831552, + "grad_norm": 3.4328672885894775, + "learning_rate": 9.979093188885095e-06, + "loss": 0.9923, + "step": 1442 + }, + { + "epoch": 0.11660841633164307, + "grad_norm": 2.62213134765625, + "learning_rate": 9.979033368765806e-06, + "loss": 1.1393, + "step": 1443 + }, + { + "epoch": 0.1166892260449706, + "grad_norm": 2.790879726409912, + "learning_rate": 9.978973463367668e-06, + "loss": 1.103, + "step": 1444 + }, + { + "epoch": 0.11677003575829814, + "grad_norm": 3.236301898956299, + "learning_rate": 9.978913472691697e-06, + "loss": 0.986, + "step": 1445 + }, + { + "epoch": 0.11685084547162569, + "grad_norm": 3.7407853603363037, + "learning_rate": 9.978853396738926e-06, + "loss": 1.0055, + "step": 1446 + }, + { + "epoch": 0.11693165518495323, + "grad_norm": 2.775300979614258, + "learning_rate": 9.978793235510382e-06, + "loss": 1.0833, + "step": 1447 + }, + { + "epoch": 0.11701246489828078, + "grad_norm": 2.7946441173553467, + "learning_rate": 9.978732989007096e-06, + "loss": 1.1405, + "step": 1448 + }, + { + "epoch": 0.11709327461160832, + "grad_norm": 2.7497661113739014, + "learning_rate": 9.978672657230099e-06, + "loss": 1.0585, + "step": 1449 + }, + { + "epoch": 0.11717408432493585, + "grad_norm": 3.4076762199401855, + "learning_rate": 9.978612240180428e-06, + "loss": 1.0371, + "step": 1450 + }, + { + "epoch": 0.1172548940382634, + "grad_norm": 2.9828991889953613, + "learning_rate": 9.978551737859112e-06, + "loss": 1.0987, + "step": 1451 + }, + { + "epoch": 0.11733570375159094, + "grad_norm": 2.623058795928955, + "learning_rate": 9.97849115026719e-06, + "loss": 1.0565, + "step": 1452 + }, + { + "epoch": 0.11741651346491848, + "grad_norm": 3.308063268661499, + "learning_rate": 9.9784304774057e-06, + "loss": 0.9084, + "step": 1453 + }, + { + "epoch": 0.11749732317824603, + "grad_norm": 2.9729201793670654, + "learning_rate": 9.978369719275682e-06, + "loss": 1.1879, + "step": 1454 + }, + { + "epoch": 0.11757813289157357, + "grad_norm": 3.2391316890716553, + "learning_rate": 9.978308875878176e-06, + "loss": 1.0461, + "step": 1455 + }, + { + "epoch": 0.1176589426049011, + "grad_norm": 3.392163038253784, + "learning_rate": 9.978247947214223e-06, + "loss": 1.0646, + "step": 1456 + }, + { + "epoch": 0.11773975231822865, + "grad_norm": 2.89571213722229, + "learning_rate": 9.978186933284868e-06, + "loss": 0.9876, + "step": 1457 + }, + { + "epoch": 0.11782056203155619, + "grad_norm": 2.7080161571502686, + "learning_rate": 9.978125834091155e-06, + "loss": 1.0437, + "step": 1458 + }, + { + "epoch": 0.11790137174488373, + "grad_norm": 3.3069257736206055, + "learning_rate": 9.97806464963413e-06, + "loss": 1.0233, + "step": 1459 + }, + { + "epoch": 0.11798218145821128, + "grad_norm": 2.981694221496582, + "learning_rate": 9.978003379914843e-06, + "loss": 1.0159, + "step": 1460 + }, + { + "epoch": 0.11806299117153882, + "grad_norm": 2.967569351196289, + "learning_rate": 9.977942024934339e-06, + "loss": 0.9759, + "step": 1461 + }, + { + "epoch": 0.11814380088486635, + "grad_norm": 3.4966399669647217, + "learning_rate": 9.977880584693676e-06, + "loss": 1.0371, + "step": 1462 + }, + { + "epoch": 0.1182246105981939, + "grad_norm": 2.8589589595794678, + "learning_rate": 9.977819059193901e-06, + "loss": 1.002, + "step": 1463 + }, + { + "epoch": 0.11830542031152144, + "grad_norm": 2.8677189350128174, + "learning_rate": 9.977757448436068e-06, + "loss": 0.9869, + "step": 1464 + }, + { + "epoch": 0.11838623002484899, + "grad_norm": 3.4240634441375732, + "learning_rate": 9.977695752421235e-06, + "loss": 0.9439, + "step": 1465 + }, + { + "epoch": 0.11846703973817653, + "grad_norm": 3.0775208473205566, + "learning_rate": 9.977633971150455e-06, + "loss": 0.9236, + "step": 1466 + }, + { + "epoch": 0.11854784945150407, + "grad_norm": 2.7149441242218018, + "learning_rate": 9.97757210462479e-06, + "loss": 0.8936, + "step": 1467 + }, + { + "epoch": 0.11862865916483162, + "grad_norm": 3.633910894393921, + "learning_rate": 9.977510152845298e-06, + "loss": 1.1891, + "step": 1468 + }, + { + "epoch": 0.11870946887815916, + "grad_norm": 2.748262405395508, + "learning_rate": 9.97744811581304e-06, + "loss": 0.9211, + "step": 1469 + }, + { + "epoch": 0.11879027859148669, + "grad_norm": 2.662281036376953, + "learning_rate": 9.977385993529077e-06, + "loss": 1.0569, + "step": 1470 + }, + { + "epoch": 0.11887108830481424, + "grad_norm": 2.6894032955169678, + "learning_rate": 9.977323785994475e-06, + "loss": 1.033, + "step": 1471 + }, + { + "epoch": 0.11895189801814178, + "grad_norm": 2.809366226196289, + "learning_rate": 9.977261493210299e-06, + "loss": 1.005, + "step": 1472 + }, + { + "epoch": 0.11903270773146932, + "grad_norm": 3.1424038410186768, + "learning_rate": 9.977199115177616e-06, + "loss": 1.0223, + "step": 1473 + }, + { + "epoch": 0.11911351744479687, + "grad_norm": 2.9231789112091064, + "learning_rate": 9.977136651897495e-06, + "loss": 0.9585, + "step": 1474 + }, + { + "epoch": 0.1191943271581244, + "grad_norm": 3.0695743560791016, + "learning_rate": 9.977074103371005e-06, + "loss": 1.0148, + "step": 1475 + }, + { + "epoch": 0.11927513687145194, + "grad_norm": 3.018730640411377, + "learning_rate": 9.977011469599218e-06, + "loss": 1.01, + "step": 1476 + }, + { + "epoch": 0.1193559465847795, + "grad_norm": 2.963622570037842, + "learning_rate": 9.976948750583204e-06, + "loss": 0.9736, + "step": 1477 + }, + { + "epoch": 0.11943675629810703, + "grad_norm": 2.6269237995147705, + "learning_rate": 9.976885946324042e-06, + "loss": 0.9666, + "step": 1478 + }, + { + "epoch": 0.11951756601143458, + "grad_norm": 2.6247920989990234, + "learning_rate": 9.976823056822802e-06, + "loss": 1.0616, + "step": 1479 + }, + { + "epoch": 0.11959837572476212, + "grad_norm": 2.672311782836914, + "learning_rate": 9.976760082080567e-06, + "loss": 1.0102, + "step": 1480 + }, + { + "epoch": 0.11967918543808966, + "grad_norm": 3.7789974212646484, + "learning_rate": 9.976697022098411e-06, + "loss": 0.9777, + "step": 1481 + }, + { + "epoch": 0.1197599951514172, + "grad_norm": 2.782094955444336, + "learning_rate": 9.976633876877417e-06, + "loss": 1.1142, + "step": 1482 + }, + { + "epoch": 0.11984080486474474, + "grad_norm": 2.689985513687134, + "learning_rate": 9.976570646418665e-06, + "loss": 1.038, + "step": 1483 + }, + { + "epoch": 0.11992161457807228, + "grad_norm": 2.7190632820129395, + "learning_rate": 9.97650733072324e-06, + "loss": 0.999, + "step": 1484 + }, + { + "epoch": 0.12000242429139983, + "grad_norm": 2.327003240585327, + "learning_rate": 9.976443929792223e-06, + "loss": 1.0793, + "step": 1485 + }, + { + "epoch": 0.12008323400472737, + "grad_norm": 3.1247661113739014, + "learning_rate": 9.976380443626701e-06, + "loss": 1.0358, + "step": 1486 + }, + { + "epoch": 0.1201640437180549, + "grad_norm": 3.1908297538757324, + "learning_rate": 9.976316872227765e-06, + "loss": 1.0745, + "step": 1487 + }, + { + "epoch": 0.12024485343138246, + "grad_norm": 2.9428727626800537, + "learning_rate": 9.976253215596498e-06, + "loss": 1.0319, + "step": 1488 + }, + { + "epoch": 0.12032566314471, + "grad_norm": 3.5103609561920166, + "learning_rate": 9.976189473733995e-06, + "loss": 0.9165, + "step": 1489 + }, + { + "epoch": 0.12040647285803753, + "grad_norm": 2.866847276687622, + "learning_rate": 9.976125646641346e-06, + "loss": 0.9826, + "step": 1490 + }, + { + "epoch": 0.12048728257136508, + "grad_norm": 2.668189287185669, + "learning_rate": 9.976061734319644e-06, + "loss": 1.0166, + "step": 1491 + }, + { + "epoch": 0.12056809228469262, + "grad_norm": 2.6091597080230713, + "learning_rate": 9.975997736769984e-06, + "loss": 1.0109, + "step": 1492 + }, + { + "epoch": 0.12064890199802016, + "grad_norm": 3.3840620517730713, + "learning_rate": 9.975933653993462e-06, + "loss": 1.1303, + "step": 1493 + }, + { + "epoch": 0.1207297117113477, + "grad_norm": 2.5798120498657227, + "learning_rate": 9.975869485991175e-06, + "loss": 1.1408, + "step": 1494 + }, + { + "epoch": 0.12081052142467524, + "grad_norm": 3.8495700359344482, + "learning_rate": 9.975805232764223e-06, + "loss": 1.1363, + "step": 1495 + }, + { + "epoch": 0.1208913311380028, + "grad_norm": 2.622189998626709, + "learning_rate": 9.975740894313706e-06, + "loss": 1.0252, + "step": 1496 + }, + { + "epoch": 0.12097214085133033, + "grad_norm": 2.8880317211151123, + "learning_rate": 9.975676470640724e-06, + "loss": 0.9853, + "step": 1497 + }, + { + "epoch": 0.12105295056465787, + "grad_norm": 3.6159539222717285, + "learning_rate": 9.975611961746387e-06, + "loss": 1.0694, + "step": 1498 + }, + { + "epoch": 0.12113376027798542, + "grad_norm": 4.053859710693359, + "learning_rate": 9.975547367631793e-06, + "loss": 1.0925, + "step": 1499 + }, + { + "epoch": 0.12121456999131296, + "grad_norm": 2.7435827255249023, + "learning_rate": 9.97548268829805e-06, + "loss": 1.0758, + "step": 1500 + }, + { + "epoch": 0.1212953797046405, + "grad_norm": 2.9291670322418213, + "learning_rate": 9.975417923746268e-06, + "loss": 1.0237, + "step": 1501 + }, + { + "epoch": 0.12137618941796804, + "grad_norm": 2.6226656436920166, + "learning_rate": 9.975353073977555e-06, + "loss": 1.0267, + "step": 1502 + }, + { + "epoch": 0.12145699913129558, + "grad_norm": 2.6459484100341797, + "learning_rate": 9.97528813899302e-06, + "loss": 0.9548, + "step": 1503 + }, + { + "epoch": 0.12153780884462312, + "grad_norm": 3.0570766925811768, + "learning_rate": 9.975223118793776e-06, + "loss": 0.9643, + "step": 1504 + }, + { + "epoch": 0.12161861855795067, + "grad_norm": 3.1126012802124023, + "learning_rate": 9.97515801338094e-06, + "loss": 1.0619, + "step": 1505 + }, + { + "epoch": 0.1216994282712782, + "grad_norm": 3.191077947616577, + "learning_rate": 9.975092822755623e-06, + "loss": 0.8965, + "step": 1506 + }, + { + "epoch": 0.12178023798460574, + "grad_norm": 2.781688690185547, + "learning_rate": 9.975027546918943e-06, + "loss": 1.0474, + "step": 1507 + }, + { + "epoch": 0.1218610476979333, + "grad_norm": 2.710775375366211, + "learning_rate": 9.974962185872017e-06, + "loss": 1.0024, + "step": 1508 + }, + { + "epoch": 0.12194185741126083, + "grad_norm": 3.1338086128234863, + "learning_rate": 9.974896739615967e-06, + "loss": 1.0516, + "step": 1509 + }, + { + "epoch": 0.12202266712458837, + "grad_norm": 2.9669113159179688, + "learning_rate": 9.974831208151912e-06, + "loss": 0.9887, + "step": 1510 + }, + { + "epoch": 0.12210347683791592, + "grad_norm": 3.1898107528686523, + "learning_rate": 9.974765591480975e-06, + "loss": 1.129, + "step": 1511 + }, + { + "epoch": 0.12218428655124346, + "grad_norm": 2.8128304481506348, + "learning_rate": 9.97469988960428e-06, + "loss": 0.9935, + "step": 1512 + }, + { + "epoch": 0.12226509626457101, + "grad_norm": 2.8253588676452637, + "learning_rate": 9.974634102522951e-06, + "loss": 1.022, + "step": 1513 + }, + { + "epoch": 0.12234590597789854, + "grad_norm": 3.3674936294555664, + "learning_rate": 9.974568230238116e-06, + "loss": 1.0668, + "step": 1514 + }, + { + "epoch": 0.12242671569122608, + "grad_norm": 2.8994176387786865, + "learning_rate": 9.974502272750904e-06, + "loss": 0.952, + "step": 1515 + }, + { + "epoch": 0.12250752540455363, + "grad_norm": 3.3196377754211426, + "learning_rate": 9.974436230062443e-06, + "loss": 1.0472, + "step": 1516 + }, + { + "epoch": 0.12258833511788117, + "grad_norm": 2.662877321243286, + "learning_rate": 9.974370102173864e-06, + "loss": 1.0556, + "step": 1517 + }, + { + "epoch": 0.1226691448312087, + "grad_norm": 2.990156888961792, + "learning_rate": 9.974303889086302e-06, + "loss": 0.9752, + "step": 1518 + }, + { + "epoch": 0.12274995454453626, + "grad_norm": 3.2955334186553955, + "learning_rate": 9.974237590800888e-06, + "loss": 0.9966, + "step": 1519 + }, + { + "epoch": 0.1228307642578638, + "grad_norm": 2.9709582328796387, + "learning_rate": 9.974171207318762e-06, + "loss": 0.9343, + "step": 1520 + }, + { + "epoch": 0.12291157397119133, + "grad_norm": 3.0959041118621826, + "learning_rate": 9.974104738641056e-06, + "loss": 1.076, + "step": 1521 + }, + { + "epoch": 0.12299238368451888, + "grad_norm": 2.8134162425994873, + "learning_rate": 9.974038184768908e-06, + "loss": 0.911, + "step": 1522 + }, + { + "epoch": 0.12307319339784642, + "grad_norm": 2.486924171447754, + "learning_rate": 9.973971545703464e-06, + "loss": 0.94, + "step": 1523 + }, + { + "epoch": 0.12315400311117396, + "grad_norm": 3.090559244155884, + "learning_rate": 9.97390482144586e-06, + "loss": 0.9848, + "step": 1524 + }, + { + "epoch": 0.12323481282450151, + "grad_norm": 2.6111233234405518, + "learning_rate": 9.97383801199724e-06, + "loss": 1.077, + "step": 1525 + }, + { + "epoch": 0.12331562253782904, + "grad_norm": 2.474102020263672, + "learning_rate": 9.973771117358751e-06, + "loss": 1.0163, + "step": 1526 + }, + { + "epoch": 0.1233964322511566, + "grad_norm": 3.4042325019836426, + "learning_rate": 9.973704137531537e-06, + "loss": 1.054, + "step": 1527 + }, + { + "epoch": 0.12347724196448413, + "grad_norm": 2.7256107330322266, + "learning_rate": 9.973637072516742e-06, + "loss": 0.957, + "step": 1528 + }, + { + "epoch": 0.12355805167781167, + "grad_norm": 3.280165910720825, + "learning_rate": 9.97356992231552e-06, + "loss": 1.0552, + "step": 1529 + }, + { + "epoch": 0.12363886139113922, + "grad_norm": 2.8168840408325195, + "learning_rate": 9.973502686929018e-06, + "loss": 0.8983, + "step": 1530 + }, + { + "epoch": 0.12371967110446676, + "grad_norm": 2.5056023597717285, + "learning_rate": 9.973435366358388e-06, + "loss": 1.1126, + "step": 1531 + }, + { + "epoch": 0.1238004808177943, + "grad_norm": 3.222533702850342, + "learning_rate": 9.973367960604783e-06, + "loss": 0.9352, + "step": 1532 + }, + { + "epoch": 0.12388129053112185, + "grad_norm": 3.013413906097412, + "learning_rate": 9.973300469669357e-06, + "loss": 0.9878, + "step": 1533 + }, + { + "epoch": 0.12396210024444938, + "grad_norm": 2.886183977127075, + "learning_rate": 9.973232893553268e-06, + "loss": 1.0524, + "step": 1534 + }, + { + "epoch": 0.12404290995777692, + "grad_norm": 2.5032222270965576, + "learning_rate": 9.973165232257671e-06, + "loss": 0.9094, + "step": 1535 + }, + { + "epoch": 0.12412371967110447, + "grad_norm": 3.167069673538208, + "learning_rate": 9.973097485783727e-06, + "loss": 0.9374, + "step": 1536 + }, + { + "epoch": 0.12420452938443201, + "grad_norm": 3.163844108581543, + "learning_rate": 9.973029654132595e-06, + "loss": 0.8943, + "step": 1537 + }, + { + "epoch": 0.12428533909775955, + "grad_norm": 2.601196765899658, + "learning_rate": 9.972961737305437e-06, + "loss": 1.0731, + "step": 1538 + }, + { + "epoch": 0.1243661488110871, + "grad_norm": 3.303682565689087, + "learning_rate": 9.972893735303414e-06, + "loss": 1.0653, + "step": 1539 + }, + { + "epoch": 0.12444695852441463, + "grad_norm": 3.289884328842163, + "learning_rate": 9.972825648127697e-06, + "loss": 1.0373, + "step": 1540 + }, + { + "epoch": 0.12452776823774217, + "grad_norm": 2.7413132190704346, + "learning_rate": 9.972757475779446e-06, + "loss": 0.9449, + "step": 1541 + }, + { + "epoch": 0.12460857795106972, + "grad_norm": 2.809582233428955, + "learning_rate": 9.972689218259831e-06, + "loss": 1.0173, + "step": 1542 + }, + { + "epoch": 0.12468938766439726, + "grad_norm": 2.9432179927825928, + "learning_rate": 9.972620875570022e-06, + "loss": 1.0912, + "step": 1543 + }, + { + "epoch": 0.12477019737772481, + "grad_norm": 3.4051713943481445, + "learning_rate": 9.972552447711188e-06, + "loss": 1.1274, + "step": 1544 + }, + { + "epoch": 0.12485100709105235, + "grad_norm": 3.025392770767212, + "learning_rate": 9.972483934684503e-06, + "loss": 1.0697, + "step": 1545 + }, + { + "epoch": 0.12493181680437988, + "grad_norm": 3.9957685470581055, + "learning_rate": 9.972415336491137e-06, + "loss": 0.9908, + "step": 1546 + }, + { + "epoch": 0.12501262651770742, + "grad_norm": 2.8916802406311035, + "learning_rate": 9.972346653132266e-06, + "loss": 0.942, + "step": 1547 + }, + { + "epoch": 0.12509343623103497, + "grad_norm": 3.4179534912109375, + "learning_rate": 9.97227788460907e-06, + "loss": 1.1929, + "step": 1548 + }, + { + "epoch": 0.12517424594436252, + "grad_norm": 3.1534507274627686, + "learning_rate": 9.972209030922722e-06, + "loss": 0.9994, + "step": 1549 + }, + { + "epoch": 0.12525505565769005, + "grad_norm": 3.0198779106140137, + "learning_rate": 9.972140092074404e-06, + "loss": 1.015, + "step": 1550 + }, + { + "epoch": 0.1253358653710176, + "grad_norm": 3.012791395187378, + "learning_rate": 9.972071068065297e-06, + "loss": 1.1126, + "step": 1551 + }, + { + "epoch": 0.12541667508434515, + "grad_norm": 2.9998300075531006, + "learning_rate": 9.97200195889658e-06, + "loss": 0.9682, + "step": 1552 + }, + { + "epoch": 0.12549748479767267, + "grad_norm": 2.617751121520996, + "learning_rate": 9.97193276456944e-06, + "loss": 1.1222, + "step": 1553 + }, + { + "epoch": 0.12557829451100022, + "grad_norm": 2.919160842895508, + "learning_rate": 9.971863485085063e-06, + "loss": 1.104, + "step": 1554 + }, + { + "epoch": 0.12565910422432777, + "grad_norm": 3.4130806922912598, + "learning_rate": 9.971794120444633e-06, + "loss": 1.0856, + "step": 1555 + }, + { + "epoch": 0.1257399139376553, + "grad_norm": 2.5439846515655518, + "learning_rate": 9.97172467064934e-06, + "loss": 0.9971, + "step": 1556 + }, + { + "epoch": 0.12582072365098285, + "grad_norm": 2.987593650817871, + "learning_rate": 9.971655135700369e-06, + "loss": 0.9025, + "step": 1557 + }, + { + "epoch": 0.1259015333643104, + "grad_norm": 2.4838151931762695, + "learning_rate": 9.971585515598916e-06, + "loss": 1.0237, + "step": 1558 + }, + { + "epoch": 0.12598234307763792, + "grad_norm": 3.056072473526001, + "learning_rate": 9.971515810346172e-06, + "loss": 0.982, + "step": 1559 + }, + { + "epoch": 0.12606315279096547, + "grad_norm": 2.6471681594848633, + "learning_rate": 9.97144601994333e-06, + "loss": 1.081, + "step": 1560 + }, + { + "epoch": 0.12614396250429302, + "grad_norm": 2.814272880554199, + "learning_rate": 9.971376144391587e-06, + "loss": 1.0383, + "step": 1561 + }, + { + "epoch": 0.12622477221762055, + "grad_norm": 3.0984888076782227, + "learning_rate": 9.971306183692138e-06, + "loss": 1.0266, + "step": 1562 + }, + { + "epoch": 0.1263055819309481, + "grad_norm": 2.8970961570739746, + "learning_rate": 9.971236137846181e-06, + "loss": 1.0187, + "step": 1563 + }, + { + "epoch": 0.12638639164427565, + "grad_norm": 3.058929920196533, + "learning_rate": 9.971166006854918e-06, + "loss": 1.0151, + "step": 1564 + }, + { + "epoch": 0.12646720135760317, + "grad_norm": 3.1027820110321045, + "learning_rate": 9.971095790719549e-06, + "loss": 1.0664, + "step": 1565 + }, + { + "epoch": 0.12654801107093072, + "grad_norm": 2.7299392223358154, + "learning_rate": 9.971025489441277e-06, + "loss": 1.0323, + "step": 1566 + }, + { + "epoch": 0.12662882078425827, + "grad_norm": 2.9159555435180664, + "learning_rate": 9.970955103021304e-06, + "loss": 1.003, + "step": 1567 + }, + { + "epoch": 0.12670963049758582, + "grad_norm": 2.9966683387756348, + "learning_rate": 9.970884631460837e-06, + "loss": 1.0543, + "step": 1568 + }, + { + "epoch": 0.12679044021091335, + "grad_norm": 2.942044258117676, + "learning_rate": 9.970814074761086e-06, + "loss": 1.0306, + "step": 1569 + }, + { + "epoch": 0.1268712499242409, + "grad_norm": 3.0210185050964355, + "learning_rate": 9.970743432923254e-06, + "loss": 1.01, + "step": 1570 + }, + { + "epoch": 0.12695205963756845, + "grad_norm": 3.9895670413970947, + "learning_rate": 9.970672705948554e-06, + "loss": 0.9611, + "step": 1571 + }, + { + "epoch": 0.12703286935089597, + "grad_norm": 3.2037484645843506, + "learning_rate": 9.970601893838199e-06, + "loss": 1.0619, + "step": 1572 + }, + { + "epoch": 0.12711367906422352, + "grad_norm": 2.9033043384552, + "learning_rate": 9.970530996593396e-06, + "loss": 0.9644, + "step": 1573 + }, + { + "epoch": 0.12719448877755107, + "grad_norm": 2.760676383972168, + "learning_rate": 9.970460014215365e-06, + "loss": 1.0322, + "step": 1574 + }, + { + "epoch": 0.1272752984908786, + "grad_norm": 2.8878512382507324, + "learning_rate": 9.970388946705322e-06, + "loss": 1.1337, + "step": 1575 + }, + { + "epoch": 0.12735610820420615, + "grad_norm": 3.3586654663085938, + "learning_rate": 9.97031779406448e-06, + "loss": 1.0103, + "step": 1576 + }, + { + "epoch": 0.1274369179175337, + "grad_norm": 2.880124807357788, + "learning_rate": 9.970246556294059e-06, + "loss": 1.1382, + "step": 1577 + }, + { + "epoch": 0.12751772763086122, + "grad_norm": 2.892253875732422, + "learning_rate": 9.970175233395282e-06, + "loss": 1.0529, + "step": 1578 + }, + { + "epoch": 0.12759853734418877, + "grad_norm": 2.8811302185058594, + "learning_rate": 9.970103825369368e-06, + "loss": 1.0315, + "step": 1579 + }, + { + "epoch": 0.12767934705751632, + "grad_norm": 2.897580623626709, + "learning_rate": 9.970032332217539e-06, + "loss": 0.9286, + "step": 1580 + }, + { + "epoch": 0.12776015677084385, + "grad_norm": 3.135737895965576, + "learning_rate": 9.969960753941021e-06, + "loss": 1.0004, + "step": 1581 + }, + { + "epoch": 0.1278409664841714, + "grad_norm": 2.7766566276550293, + "learning_rate": 9.96988909054104e-06, + "loss": 0.9378, + "step": 1582 + }, + { + "epoch": 0.12792177619749895, + "grad_norm": 2.9192938804626465, + "learning_rate": 9.969817342018826e-06, + "loss": 1.0221, + "step": 1583 + }, + { + "epoch": 0.12800258591082647, + "grad_norm": 2.775981903076172, + "learning_rate": 9.969745508375604e-06, + "loss": 0.9592, + "step": 1584 + }, + { + "epoch": 0.12808339562415402, + "grad_norm": 3.3218469619750977, + "learning_rate": 9.969673589612604e-06, + "loss": 1.1219, + "step": 1585 + }, + { + "epoch": 0.12816420533748157, + "grad_norm": 2.779749631881714, + "learning_rate": 9.96960158573106e-06, + "loss": 1.0264, + "step": 1586 + }, + { + "epoch": 0.1282450150508091, + "grad_norm": 2.96001935005188, + "learning_rate": 9.969529496732205e-06, + "loss": 1.1207, + "step": 1587 + }, + { + "epoch": 0.12832582476413665, + "grad_norm": 3.388425588607788, + "learning_rate": 9.969457322617273e-06, + "loss": 0.9705, + "step": 1588 + }, + { + "epoch": 0.1284066344774642, + "grad_norm": 2.4653358459472656, + "learning_rate": 9.969385063387503e-06, + "loss": 0.9421, + "step": 1589 + }, + { + "epoch": 0.12848744419079172, + "grad_norm": 2.5082638263702393, + "learning_rate": 9.969312719044127e-06, + "loss": 1.0129, + "step": 1590 + }, + { + "epoch": 0.12856825390411927, + "grad_norm": 3.0804243087768555, + "learning_rate": 9.969240289588388e-06, + "loss": 1.0081, + "step": 1591 + }, + { + "epoch": 0.12864906361744682, + "grad_norm": 2.7551684379577637, + "learning_rate": 9.969167775021527e-06, + "loss": 0.9329, + "step": 1592 + }, + { + "epoch": 0.12872987333077435, + "grad_norm": 2.7766008377075195, + "learning_rate": 9.969095175344784e-06, + "loss": 0.9295, + "step": 1593 + }, + { + "epoch": 0.1288106830441019, + "grad_norm": 3.1291987895965576, + "learning_rate": 9.969022490559403e-06, + "loss": 1.052, + "step": 1594 + }, + { + "epoch": 0.12889149275742945, + "grad_norm": 3.7330756187438965, + "learning_rate": 9.96894972066663e-06, + "loss": 1.0948, + "step": 1595 + }, + { + "epoch": 0.12897230247075697, + "grad_norm": 2.907026529312134, + "learning_rate": 9.968876865667709e-06, + "loss": 0.9207, + "step": 1596 + }, + { + "epoch": 0.12905311218408452, + "grad_norm": 2.8110721111297607, + "learning_rate": 9.968803925563891e-06, + "loss": 0.9706, + "step": 1597 + }, + { + "epoch": 0.12913392189741207, + "grad_norm": 3.5314857959747314, + "learning_rate": 9.968730900356423e-06, + "loss": 0.9355, + "step": 1598 + }, + { + "epoch": 0.1292147316107396, + "grad_norm": 3.0395474433898926, + "learning_rate": 9.968657790046557e-06, + "loss": 1.0052, + "step": 1599 + }, + { + "epoch": 0.12929554132406715, + "grad_norm": 2.651458263397217, + "learning_rate": 9.968584594635544e-06, + "loss": 0.9559, + "step": 1600 + }, + { + "epoch": 0.1293763510373947, + "grad_norm": 2.9642333984375, + "learning_rate": 9.968511314124638e-06, + "loss": 1.007, + "step": 1601 + }, + { + "epoch": 0.12945716075072225, + "grad_norm": 2.743347644805908, + "learning_rate": 9.968437948515094e-06, + "loss": 1.0504, + "step": 1602 + }, + { + "epoch": 0.12953797046404977, + "grad_norm": 2.830263614654541, + "learning_rate": 9.96836449780817e-06, + "loss": 1.003, + "step": 1603 + }, + { + "epoch": 0.12961878017737732, + "grad_norm": 2.768022060394287, + "learning_rate": 9.968290962005122e-06, + "loss": 0.9653, + "step": 1604 + }, + { + "epoch": 0.12969958989070487, + "grad_norm": 2.78212833404541, + "learning_rate": 9.968217341107212e-06, + "loss": 1.0442, + "step": 1605 + }, + { + "epoch": 0.1297803996040324, + "grad_norm": 3.129190444946289, + "learning_rate": 9.968143635115698e-06, + "loss": 1.1285, + "step": 1606 + }, + { + "epoch": 0.12986120931735995, + "grad_norm": 3.335914134979248, + "learning_rate": 9.968069844031846e-06, + "loss": 1.1157, + "step": 1607 + }, + { + "epoch": 0.1299420190306875, + "grad_norm": 3.0782394409179688, + "learning_rate": 9.967995967856917e-06, + "loss": 1.0001, + "step": 1608 + }, + { + "epoch": 0.13002282874401502, + "grad_norm": 2.9339089393615723, + "learning_rate": 9.967922006592175e-06, + "loss": 1.0452, + "step": 1609 + }, + { + "epoch": 0.13010363845734257, + "grad_norm": 2.6225640773773193, + "learning_rate": 9.96784796023889e-06, + "loss": 1.1259, + "step": 1610 + }, + { + "epoch": 0.13018444817067013, + "grad_norm": 2.4689254760742188, + "learning_rate": 9.967773828798328e-06, + "loss": 1.0334, + "step": 1611 + }, + { + "epoch": 0.13026525788399765, + "grad_norm": 3.2011399269104004, + "learning_rate": 9.967699612271762e-06, + "loss": 1.0487, + "step": 1612 + }, + { + "epoch": 0.1303460675973252, + "grad_norm": 3.048624277114868, + "learning_rate": 9.967625310660461e-06, + "loss": 1.0496, + "step": 1613 + }, + { + "epoch": 0.13042687731065275, + "grad_norm": 2.8851726055145264, + "learning_rate": 9.967550923965695e-06, + "loss": 1.0354, + "step": 1614 + }, + { + "epoch": 0.13050768702398027, + "grad_norm": 3.3419952392578125, + "learning_rate": 9.967476452188742e-06, + "loss": 1.0365, + "step": 1615 + }, + { + "epoch": 0.13058849673730782, + "grad_norm": 2.9978623390197754, + "learning_rate": 9.967401895330874e-06, + "loss": 1.0527, + "step": 1616 + }, + { + "epoch": 0.13066930645063538, + "grad_norm": 2.8629448413848877, + "learning_rate": 9.967327253393373e-06, + "loss": 1.0362, + "step": 1617 + }, + { + "epoch": 0.1307501161639629, + "grad_norm": 2.6528220176696777, + "learning_rate": 9.967252526377513e-06, + "loss": 1.0123, + "step": 1618 + }, + { + "epoch": 0.13083092587729045, + "grad_norm": 3.220398187637329, + "learning_rate": 9.967177714284577e-06, + "loss": 1.0968, + "step": 1619 + }, + { + "epoch": 0.130911735590618, + "grad_norm": 3.31400728225708, + "learning_rate": 9.967102817115844e-06, + "loss": 1.04, + "step": 1620 + }, + { + "epoch": 0.13099254530394552, + "grad_norm": 2.9584708213806152, + "learning_rate": 9.967027834872595e-06, + "loss": 1.0777, + "step": 1621 + }, + { + "epoch": 0.13107335501727307, + "grad_norm": 2.9647164344787598, + "learning_rate": 9.96695276755612e-06, + "loss": 0.9648, + "step": 1622 + }, + { + "epoch": 0.13115416473060063, + "grad_norm": 2.6207494735717773, + "learning_rate": 9.9668776151677e-06, + "loss": 1.0751, + "step": 1623 + }, + { + "epoch": 0.13123497444392815, + "grad_norm": 3.3089005947113037, + "learning_rate": 9.966802377708625e-06, + "loss": 1.0214, + "step": 1624 + }, + { + "epoch": 0.1313157841572557, + "grad_norm": 3.3470988273620605, + "learning_rate": 9.966727055180183e-06, + "loss": 1.0695, + "step": 1625 + }, + { + "epoch": 0.13139659387058325, + "grad_norm": 2.7479801177978516, + "learning_rate": 9.966651647583661e-06, + "loss": 1.0169, + "step": 1626 + }, + { + "epoch": 0.13147740358391077, + "grad_norm": 2.6470906734466553, + "learning_rate": 9.966576154920354e-06, + "loss": 0.998, + "step": 1627 + }, + { + "epoch": 0.13155821329723832, + "grad_norm": 2.62980318069458, + "learning_rate": 9.966500577191554e-06, + "loss": 1.0047, + "step": 1628 + }, + { + "epoch": 0.13163902301056588, + "grad_norm": 3.066617488861084, + "learning_rate": 9.966424914398556e-06, + "loss": 1.0842, + "step": 1629 + }, + { + "epoch": 0.1317198327238934, + "grad_norm": 2.5637338161468506, + "learning_rate": 9.966349166542655e-06, + "loss": 1.093, + "step": 1630 + }, + { + "epoch": 0.13180064243722095, + "grad_norm": 3.0110042095184326, + "learning_rate": 9.966273333625149e-06, + "loss": 1.0627, + "step": 1631 + }, + { + "epoch": 0.1318814521505485, + "grad_norm": 2.872307777404785, + "learning_rate": 9.966197415647336e-06, + "loss": 0.9796, + "step": 1632 + }, + { + "epoch": 0.13196226186387605, + "grad_norm": 2.9434759616851807, + "learning_rate": 9.966121412610516e-06, + "loss": 0.9855, + "step": 1633 + }, + { + "epoch": 0.13204307157720357, + "grad_norm": 2.70803165435791, + "learning_rate": 9.966045324515993e-06, + "loss": 0.9988, + "step": 1634 + }, + { + "epoch": 0.13212388129053113, + "grad_norm": 2.9613752365112305, + "learning_rate": 9.96596915136507e-06, + "loss": 1.0091, + "step": 1635 + }, + { + "epoch": 0.13220469100385868, + "grad_norm": 2.6390221118927, + "learning_rate": 9.965892893159048e-06, + "loss": 1.196, + "step": 1636 + }, + { + "epoch": 0.1322855007171862, + "grad_norm": 2.928917169570923, + "learning_rate": 9.965816549899237e-06, + "loss": 0.9174, + "step": 1637 + }, + { + "epoch": 0.13236631043051375, + "grad_norm": 2.783160448074341, + "learning_rate": 9.965740121586942e-06, + "loss": 1.1806, + "step": 1638 + }, + { + "epoch": 0.1324471201438413, + "grad_norm": 2.8499670028686523, + "learning_rate": 9.965663608223476e-06, + "loss": 1.0064, + "step": 1639 + }, + { + "epoch": 0.13252792985716882, + "grad_norm": 3.1209301948547363, + "learning_rate": 9.965587009810145e-06, + "loss": 0.9955, + "step": 1640 + }, + { + "epoch": 0.13260873957049638, + "grad_norm": 3.3921396732330322, + "learning_rate": 9.965510326348263e-06, + "loss": 1.0438, + "step": 1641 + }, + { + "epoch": 0.13268954928382393, + "grad_norm": 2.538139820098877, + "learning_rate": 9.965433557839143e-06, + "loss": 1.0475, + "step": 1642 + }, + { + "epoch": 0.13277035899715145, + "grad_norm": 3.514256238937378, + "learning_rate": 9.9653567042841e-06, + "loss": 0.9934, + "step": 1643 + }, + { + "epoch": 0.132851168710479, + "grad_norm": 2.601105213165283, + "learning_rate": 9.965279765684449e-06, + "loss": 0.9303, + "step": 1644 + }, + { + "epoch": 0.13293197842380655, + "grad_norm": 3.2366483211517334, + "learning_rate": 9.96520274204151e-06, + "loss": 1.024, + "step": 1645 + }, + { + "epoch": 0.13301278813713407, + "grad_norm": 3.0515966415405273, + "learning_rate": 9.965125633356602e-06, + "loss": 1.0211, + "step": 1646 + }, + { + "epoch": 0.13309359785046163, + "grad_norm": 3.162769317626953, + "learning_rate": 9.965048439631045e-06, + "loss": 1.0201, + "step": 1647 + }, + { + "epoch": 0.13317440756378918, + "grad_norm": 2.630096197128296, + "learning_rate": 9.964971160866159e-06, + "loss": 1.1113, + "step": 1648 + }, + { + "epoch": 0.1332552172771167, + "grad_norm": 2.9393413066864014, + "learning_rate": 9.964893797063271e-06, + "loss": 1.1372, + "step": 1649 + }, + { + "epoch": 0.13333602699044425, + "grad_norm": 2.98698091506958, + "learning_rate": 9.964816348223705e-06, + "loss": 1.0106, + "step": 1650 + }, + { + "epoch": 0.1334168367037718, + "grad_norm": 2.547166585922241, + "learning_rate": 9.964738814348787e-06, + "loss": 1.0499, + "step": 1651 + }, + { + "epoch": 0.13349764641709932, + "grad_norm": 3.0125157833099365, + "learning_rate": 9.964661195439847e-06, + "loss": 1.0054, + "step": 1652 + }, + { + "epoch": 0.13357845613042688, + "grad_norm": 2.9701647758483887, + "learning_rate": 9.96458349149821e-06, + "loss": 1.0462, + "step": 1653 + }, + { + "epoch": 0.13365926584375443, + "grad_norm": 2.3569045066833496, + "learning_rate": 9.964505702525211e-06, + "loss": 1.1098, + "step": 1654 + }, + { + "epoch": 0.13374007555708195, + "grad_norm": 2.789321184158325, + "learning_rate": 9.96442782852218e-06, + "loss": 1.0091, + "step": 1655 + }, + { + "epoch": 0.1338208852704095, + "grad_norm": 2.524693727493286, + "learning_rate": 9.964349869490453e-06, + "loss": 1.0318, + "step": 1656 + }, + { + "epoch": 0.13390169498373705, + "grad_norm": 3.0347893238067627, + "learning_rate": 9.964271825431362e-06, + "loss": 1.0054, + "step": 1657 + }, + { + "epoch": 0.13398250469706457, + "grad_norm": 2.8926568031311035, + "learning_rate": 9.964193696346248e-06, + "loss": 1.0277, + "step": 1658 + }, + { + "epoch": 0.13406331441039213, + "grad_norm": 2.7216997146606445, + "learning_rate": 9.964115482236444e-06, + "loss": 1.1218, + "step": 1659 + }, + { + "epoch": 0.13414412412371968, + "grad_norm": 2.5974133014678955, + "learning_rate": 9.964037183103295e-06, + "loss": 1.0529, + "step": 1660 + }, + { + "epoch": 0.1342249338370472, + "grad_norm": 3.0064709186553955, + "learning_rate": 9.963958798948137e-06, + "loss": 1.0208, + "step": 1661 + }, + { + "epoch": 0.13430574355037475, + "grad_norm": 2.6379201412200928, + "learning_rate": 9.963880329772317e-06, + "loss": 1.0222, + "step": 1662 + }, + { + "epoch": 0.1343865532637023, + "grad_norm": 3.0081305503845215, + "learning_rate": 9.963801775577175e-06, + "loss": 1.0505, + "step": 1663 + }, + { + "epoch": 0.13446736297702985, + "grad_norm": 3.289186954498291, + "learning_rate": 9.96372313636406e-06, + "loss": 0.9699, + "step": 1664 + }, + { + "epoch": 0.13454817269035738, + "grad_norm": 3.1908047199249268, + "learning_rate": 9.963644412134319e-06, + "loss": 0.9822, + "step": 1665 + }, + { + "epoch": 0.13462898240368493, + "grad_norm": 2.66094970703125, + "learning_rate": 9.963565602889295e-06, + "loss": 1.0759, + "step": 1666 + }, + { + "epoch": 0.13470979211701248, + "grad_norm": 2.412898540496826, + "learning_rate": 9.963486708630344e-06, + "loss": 1.021, + "step": 1667 + }, + { + "epoch": 0.13479060183034, + "grad_norm": 2.9810945987701416, + "learning_rate": 9.963407729358813e-06, + "loss": 0.9571, + "step": 1668 + }, + { + "epoch": 0.13487141154366755, + "grad_norm": 2.7914586067199707, + "learning_rate": 9.96332866507606e-06, + "loss": 1.0353, + "step": 1669 + }, + { + "epoch": 0.1349522212569951, + "grad_norm": 2.8356821537017822, + "learning_rate": 9.963249515783433e-06, + "loss": 0.9546, + "step": 1670 + }, + { + "epoch": 0.13503303097032263, + "grad_norm": 2.4492247104644775, + "learning_rate": 9.96317028148229e-06, + "loss": 1.0215, + "step": 1671 + }, + { + "epoch": 0.13511384068365018, + "grad_norm": 3.152137279510498, + "learning_rate": 9.963090962173989e-06, + "loss": 0.935, + "step": 1672 + }, + { + "epoch": 0.13519465039697773, + "grad_norm": 3.0612106323242188, + "learning_rate": 9.963011557859888e-06, + "loss": 0.914, + "step": 1673 + }, + { + "epoch": 0.13527546011030525, + "grad_norm": 2.58404278755188, + "learning_rate": 9.962932068541347e-06, + "loss": 0.9286, + "step": 1674 + }, + { + "epoch": 0.1353562698236328, + "grad_norm": 2.5726115703582764, + "learning_rate": 9.962852494219728e-06, + "loss": 1.057, + "step": 1675 + }, + { + "epoch": 0.13543707953696035, + "grad_norm": 2.853208541870117, + "learning_rate": 9.962772834896392e-06, + "loss": 0.9748, + "step": 1676 + }, + { + "epoch": 0.13551788925028788, + "grad_norm": 3.302896738052368, + "learning_rate": 9.962693090572706e-06, + "loss": 1.0241, + "step": 1677 + }, + { + "epoch": 0.13559869896361543, + "grad_norm": 3.2174758911132812, + "learning_rate": 9.962613261250034e-06, + "loss": 1.0237, + "step": 1678 + }, + { + "epoch": 0.13567950867694298, + "grad_norm": 3.20485782623291, + "learning_rate": 9.962533346929744e-06, + "loss": 0.9673, + "step": 1679 + }, + { + "epoch": 0.1357603183902705, + "grad_norm": 2.799534320831299, + "learning_rate": 9.962453347613205e-06, + "loss": 0.9905, + "step": 1680 + }, + { + "epoch": 0.13584112810359805, + "grad_norm": 3.0652966499328613, + "learning_rate": 9.962373263301785e-06, + "loss": 1.0848, + "step": 1681 + }, + { + "epoch": 0.1359219378169256, + "grad_norm": 2.4975082874298096, + "learning_rate": 9.96229309399686e-06, + "loss": 1.0285, + "step": 1682 + }, + { + "epoch": 0.13600274753025313, + "grad_norm": 2.763394355773926, + "learning_rate": 9.962212839699799e-06, + "loss": 0.9335, + "step": 1683 + }, + { + "epoch": 0.13608355724358068, + "grad_norm": 2.9514853954315186, + "learning_rate": 9.962132500411978e-06, + "loss": 0.9796, + "step": 1684 + }, + { + "epoch": 0.13616436695690823, + "grad_norm": 2.668872356414795, + "learning_rate": 9.962052076134773e-06, + "loss": 1.0644, + "step": 1685 + }, + { + "epoch": 0.13624517667023575, + "grad_norm": 3.2854628562927246, + "learning_rate": 9.961971566869564e-06, + "loss": 1.1409, + "step": 1686 + }, + { + "epoch": 0.1363259863835633, + "grad_norm": 2.632746458053589, + "learning_rate": 9.961890972617724e-06, + "loss": 0.9844, + "step": 1687 + }, + { + "epoch": 0.13640679609689085, + "grad_norm": 2.752347707748413, + "learning_rate": 9.96181029338064e-06, + "loss": 1.0505, + "step": 1688 + }, + { + "epoch": 0.13648760581021838, + "grad_norm": 2.6630587577819824, + "learning_rate": 9.961729529159689e-06, + "loss": 1.0518, + "step": 1689 + }, + { + "epoch": 0.13656841552354593, + "grad_norm": 2.814940929412842, + "learning_rate": 9.961648679956257e-06, + "loss": 1.0219, + "step": 1690 + }, + { + "epoch": 0.13664922523687348, + "grad_norm": 3.119725227355957, + "learning_rate": 9.961567745771725e-06, + "loss": 0.9951, + "step": 1691 + }, + { + "epoch": 0.136730034950201, + "grad_norm": 2.685208797454834, + "learning_rate": 9.961486726607486e-06, + "loss": 0.946, + "step": 1692 + }, + { + "epoch": 0.13681084466352855, + "grad_norm": 3.2550692558288574, + "learning_rate": 9.96140562246492e-06, + "loss": 0.915, + "step": 1693 + }, + { + "epoch": 0.1368916543768561, + "grad_norm": 2.9896762371063232, + "learning_rate": 9.961324433345423e-06, + "loss": 0.9649, + "step": 1694 + }, + { + "epoch": 0.13697246409018363, + "grad_norm": 2.7193186283111572, + "learning_rate": 9.96124315925038e-06, + "loss": 1.0714, + "step": 1695 + }, + { + "epoch": 0.13705327380351118, + "grad_norm": 3.505295991897583, + "learning_rate": 9.961161800181187e-06, + "loss": 0.9097, + "step": 1696 + }, + { + "epoch": 0.13713408351683873, + "grad_norm": 2.7523858547210693, + "learning_rate": 9.961080356139235e-06, + "loss": 1.0551, + "step": 1697 + }, + { + "epoch": 0.13721489323016628, + "grad_norm": 3.00246262550354, + "learning_rate": 9.960998827125921e-06, + "loss": 1.0762, + "step": 1698 + }, + { + "epoch": 0.1372957029434938, + "grad_norm": 2.5749502182006836, + "learning_rate": 9.960917213142637e-06, + "loss": 0.925, + "step": 1699 + }, + { + "epoch": 0.13737651265682135, + "grad_norm": 3.183603525161743, + "learning_rate": 9.960835514190787e-06, + "loss": 1.0033, + "step": 1700 + }, + { + "epoch": 0.1374573223701489, + "grad_norm": 2.928327798843384, + "learning_rate": 9.960753730271766e-06, + "loss": 0.9592, + "step": 1701 + }, + { + "epoch": 0.13753813208347643, + "grad_norm": 2.5657637119293213, + "learning_rate": 9.960671861386978e-06, + "loss": 0.916, + "step": 1702 + }, + { + "epoch": 0.13761894179680398, + "grad_norm": 2.9228098392486572, + "learning_rate": 9.960589907537821e-06, + "loss": 1.1493, + "step": 1703 + }, + { + "epoch": 0.13769975151013153, + "grad_norm": 2.7308812141418457, + "learning_rate": 9.960507868725703e-06, + "loss": 1.0686, + "step": 1704 + }, + { + "epoch": 0.13778056122345905, + "grad_norm": 2.9457664489746094, + "learning_rate": 9.960425744952027e-06, + "loss": 0.9022, + "step": 1705 + }, + { + "epoch": 0.1378613709367866, + "grad_norm": 2.8565680980682373, + "learning_rate": 9.960343536218198e-06, + "loss": 0.947, + "step": 1706 + }, + { + "epoch": 0.13794218065011415, + "grad_norm": 3.288144826889038, + "learning_rate": 9.960261242525626e-06, + "loss": 0.9386, + "step": 1707 + }, + { + "epoch": 0.13802299036344168, + "grad_norm": 2.7543015480041504, + "learning_rate": 9.960178863875723e-06, + "loss": 1.0274, + "step": 1708 + }, + { + "epoch": 0.13810380007676923, + "grad_norm": 3.6321394443511963, + "learning_rate": 9.960096400269894e-06, + "loss": 1.0048, + "step": 1709 + }, + { + "epoch": 0.13818460979009678, + "grad_norm": 3.537015199661255, + "learning_rate": 9.960013851709555e-06, + "loss": 0.9272, + "step": 1710 + }, + { + "epoch": 0.1382654195034243, + "grad_norm": 2.508254289627075, + "learning_rate": 9.959931218196122e-06, + "loss": 1.0029, + "step": 1711 + }, + { + "epoch": 0.13834622921675185, + "grad_norm": 2.9683449268341064, + "learning_rate": 9.959848499731004e-06, + "loss": 1.0596, + "step": 1712 + }, + { + "epoch": 0.1384270389300794, + "grad_norm": 3.078261375427246, + "learning_rate": 9.959765696315625e-06, + "loss": 1.0411, + "step": 1713 + }, + { + "epoch": 0.13850784864340693, + "grad_norm": 2.7142679691314697, + "learning_rate": 9.959682807951399e-06, + "loss": 1.0267, + "step": 1714 + }, + { + "epoch": 0.13858865835673448, + "grad_norm": 2.784114122390747, + "learning_rate": 9.959599834639745e-06, + "loss": 1.0803, + "step": 1715 + }, + { + "epoch": 0.13866946807006203, + "grad_norm": 3.481841564178467, + "learning_rate": 9.959516776382086e-06, + "loss": 0.9428, + "step": 1716 + }, + { + "epoch": 0.13875027778338955, + "grad_norm": 3.4312241077423096, + "learning_rate": 9.959433633179844e-06, + "loss": 0.9275, + "step": 1717 + }, + { + "epoch": 0.1388310874967171, + "grad_norm": 2.9053895473480225, + "learning_rate": 9.959350405034445e-06, + "loss": 0.956, + "step": 1718 + }, + { + "epoch": 0.13891189721004465, + "grad_norm": 3.082746744155884, + "learning_rate": 9.959267091947311e-06, + "loss": 0.9316, + "step": 1719 + }, + { + "epoch": 0.13899270692337218, + "grad_norm": 2.992299795150757, + "learning_rate": 9.959183693919871e-06, + "loss": 1.0388, + "step": 1720 + }, + { + "epoch": 0.13907351663669973, + "grad_norm": 2.9082751274108887, + "learning_rate": 9.959100210953555e-06, + "loss": 0.9716, + "step": 1721 + }, + { + "epoch": 0.13915432635002728, + "grad_norm": 2.7955381870269775, + "learning_rate": 9.959016643049788e-06, + "loss": 0.9659, + "step": 1722 + }, + { + "epoch": 0.1392351360633548, + "grad_norm": 2.9297831058502197, + "learning_rate": 9.958932990210006e-06, + "loss": 1.0348, + "step": 1723 + }, + { + "epoch": 0.13931594577668235, + "grad_norm": 2.9167332649230957, + "learning_rate": 9.958849252435638e-06, + "loss": 1.0126, + "step": 1724 + }, + { + "epoch": 0.1393967554900099, + "grad_norm": 3.501805543899536, + "learning_rate": 9.958765429728121e-06, + "loss": 1.0629, + "step": 1725 + }, + { + "epoch": 0.13947756520333743, + "grad_norm": 3.1738104820251465, + "learning_rate": 9.95868152208889e-06, + "loss": 1.0639, + "step": 1726 + }, + { + "epoch": 0.13955837491666498, + "grad_norm": 3.3682796955108643, + "learning_rate": 9.958597529519384e-06, + "loss": 1.0865, + "step": 1727 + }, + { + "epoch": 0.13963918462999253, + "grad_norm": 2.4615824222564697, + "learning_rate": 9.958513452021038e-06, + "loss": 1.0323, + "step": 1728 + }, + { + "epoch": 0.13971999434332008, + "grad_norm": 2.5694899559020996, + "learning_rate": 9.958429289595295e-06, + "loss": 1.1025, + "step": 1729 + }, + { + "epoch": 0.1398008040566476, + "grad_norm": 2.9231653213500977, + "learning_rate": 9.958345042243594e-06, + "loss": 1.057, + "step": 1730 + }, + { + "epoch": 0.13988161376997515, + "grad_norm": 3.7526228427886963, + "learning_rate": 9.95826070996738e-06, + "loss": 1.1402, + "step": 1731 + }, + { + "epoch": 0.1399624234833027, + "grad_norm": 2.602592945098877, + "learning_rate": 9.958176292768095e-06, + "loss": 0.9878, + "step": 1732 + }, + { + "epoch": 0.14004323319663023, + "grad_norm": 3.1426117420196533, + "learning_rate": 9.958091790647188e-06, + "loss": 0.9382, + "step": 1733 + }, + { + "epoch": 0.14012404290995778, + "grad_norm": 3.071408271789551, + "learning_rate": 9.958007203606103e-06, + "loss": 0.9991, + "step": 1734 + }, + { + "epoch": 0.14020485262328533, + "grad_norm": 3.2682929039001465, + "learning_rate": 9.957922531646293e-06, + "loss": 1.1086, + "step": 1735 + }, + { + "epoch": 0.14028566233661285, + "grad_norm": 3.3437681198120117, + "learning_rate": 9.957837774769203e-06, + "loss": 1.0144, + "step": 1736 + }, + { + "epoch": 0.1403664720499404, + "grad_norm": 2.6054043769836426, + "learning_rate": 9.95775293297629e-06, + "loss": 1.02, + "step": 1737 + }, + { + "epoch": 0.14044728176326796, + "grad_norm": 2.789842367172241, + "learning_rate": 9.957668006269003e-06, + "loss": 0.9584, + "step": 1738 + }, + { + "epoch": 0.14052809147659548, + "grad_norm": 3.1631407737731934, + "learning_rate": 9.957582994648798e-06, + "loss": 1.0068, + "step": 1739 + }, + { + "epoch": 0.14060890118992303, + "grad_norm": 3.041994094848633, + "learning_rate": 9.957497898117133e-06, + "loss": 1.0425, + "step": 1740 + }, + { + "epoch": 0.14068971090325058, + "grad_norm": 3.3951337337493896, + "learning_rate": 9.957412716675461e-06, + "loss": 0.919, + "step": 1741 + }, + { + "epoch": 0.1407705206165781, + "grad_norm": 3.176448106765747, + "learning_rate": 9.957327450325245e-06, + "loss": 1.0793, + "step": 1742 + }, + { + "epoch": 0.14085133032990566, + "grad_norm": 3.1270318031311035, + "learning_rate": 9.957242099067945e-06, + "loss": 0.9797, + "step": 1743 + }, + { + "epoch": 0.1409321400432332, + "grad_norm": 3.135282039642334, + "learning_rate": 9.957156662905022e-06, + "loss": 0.9498, + "step": 1744 + }, + { + "epoch": 0.14101294975656073, + "grad_norm": 3.351174831390381, + "learning_rate": 9.957071141837938e-06, + "loss": 1.1038, + "step": 1745 + }, + { + "epoch": 0.14109375946988828, + "grad_norm": 3.098735809326172, + "learning_rate": 9.956985535868157e-06, + "loss": 1.0687, + "step": 1746 + }, + { + "epoch": 0.14117456918321583, + "grad_norm": 2.904134750366211, + "learning_rate": 9.956899844997151e-06, + "loss": 0.9037, + "step": 1747 + }, + { + "epoch": 0.14125537889654335, + "grad_norm": 2.7765417098999023, + "learning_rate": 9.95681406922638e-06, + "loss": 1.1331, + "step": 1748 + }, + { + "epoch": 0.1413361886098709, + "grad_norm": 3.543769121170044, + "learning_rate": 9.956728208557322e-06, + "loss": 1.0291, + "step": 1749 + }, + { + "epoch": 0.14141699832319846, + "grad_norm": 2.637230157852173, + "learning_rate": 9.95664226299144e-06, + "loss": 0.931, + "step": 1750 + }, + { + "epoch": 0.14149780803652598, + "grad_norm": 3.0778005123138428, + "learning_rate": 9.956556232530207e-06, + "loss": 0.9302, + "step": 1751 + }, + { + "epoch": 0.14157861774985353, + "grad_norm": 2.739237070083618, + "learning_rate": 9.9564701171751e-06, + "loss": 1.0227, + "step": 1752 + }, + { + "epoch": 0.14165942746318108, + "grad_norm": 2.5961148738861084, + "learning_rate": 9.956383916927594e-06, + "loss": 1.1691, + "step": 1753 + }, + { + "epoch": 0.1417402371765086, + "grad_norm": 2.5436360836029053, + "learning_rate": 9.95629763178916e-06, + "loss": 0.9643, + "step": 1754 + }, + { + "epoch": 0.14182104688983616, + "grad_norm": 2.729128122329712, + "learning_rate": 9.956211261761282e-06, + "loss": 1.0151, + "step": 1755 + }, + { + "epoch": 0.1419018566031637, + "grad_norm": 2.8905763626098633, + "learning_rate": 9.956124806845435e-06, + "loss": 1.0167, + "step": 1756 + }, + { + "epoch": 0.14198266631649123, + "grad_norm": 2.813722610473633, + "learning_rate": 9.956038267043101e-06, + "loss": 1.0791, + "step": 1757 + }, + { + "epoch": 0.14206347602981878, + "grad_norm": 3.6439390182495117, + "learning_rate": 9.955951642355765e-06, + "loss": 1.0461, + "step": 1758 + }, + { + "epoch": 0.14214428574314633, + "grad_norm": 2.759126663208008, + "learning_rate": 9.955864932784907e-06, + "loss": 0.956, + "step": 1759 + }, + { + "epoch": 0.14222509545647385, + "grad_norm": 2.8389487266540527, + "learning_rate": 9.955778138332012e-06, + "loss": 1.007, + "step": 1760 + }, + { + "epoch": 0.1423059051698014, + "grad_norm": 3.0590078830718994, + "learning_rate": 9.955691258998571e-06, + "loss": 1.0354, + "step": 1761 + }, + { + "epoch": 0.14238671488312896, + "grad_norm": 2.698124885559082, + "learning_rate": 9.955604294786067e-06, + "loss": 0.9418, + "step": 1762 + }, + { + "epoch": 0.1424675245964565, + "grad_norm": 3.084228515625, + "learning_rate": 9.955517245695992e-06, + "loss": 1.0172, + "step": 1763 + }, + { + "epoch": 0.14254833430978403, + "grad_norm": 2.997697114944458, + "learning_rate": 9.955430111729838e-06, + "loss": 0.9607, + "step": 1764 + }, + { + "epoch": 0.14262914402311158, + "grad_norm": 3.1823668479919434, + "learning_rate": 9.955342892889093e-06, + "loss": 1.0731, + "step": 1765 + }, + { + "epoch": 0.14270995373643913, + "grad_norm": 3.112485885620117, + "learning_rate": 9.955255589175255e-06, + "loss": 0.9316, + "step": 1766 + }, + { + "epoch": 0.14279076344976666, + "grad_norm": 3.300565004348755, + "learning_rate": 9.95516820058982e-06, + "loss": 0.9639, + "step": 1767 + }, + { + "epoch": 0.1428715731630942, + "grad_norm": 3.437588691711426, + "learning_rate": 9.95508072713428e-06, + "loss": 1.0657, + "step": 1768 + }, + { + "epoch": 0.14295238287642176, + "grad_norm": 2.999753713607788, + "learning_rate": 9.954993168810137e-06, + "loss": 1.0074, + "step": 1769 + }, + { + "epoch": 0.14303319258974928, + "grad_norm": 2.9702138900756836, + "learning_rate": 9.95490552561889e-06, + "loss": 1.143, + "step": 1770 + }, + { + "epoch": 0.14311400230307683, + "grad_norm": 2.9902632236480713, + "learning_rate": 9.95481779756204e-06, + "loss": 1.0102, + "step": 1771 + }, + { + "epoch": 0.14319481201640438, + "grad_norm": 2.7242865562438965, + "learning_rate": 9.954729984641089e-06, + "loss": 1.0243, + "step": 1772 + }, + { + "epoch": 0.1432756217297319, + "grad_norm": 2.6519744396209717, + "learning_rate": 9.954642086857541e-06, + "loss": 0.9326, + "step": 1773 + }, + { + "epoch": 0.14335643144305946, + "grad_norm": 2.8660805225372314, + "learning_rate": 9.954554104212902e-06, + "loss": 1.1151, + "step": 1774 + }, + { + "epoch": 0.143437241156387, + "grad_norm": 3.0777528285980225, + "learning_rate": 9.954466036708678e-06, + "loss": 1.0236, + "step": 1775 + }, + { + "epoch": 0.14351805086971453, + "grad_norm": 2.6604461669921875, + "learning_rate": 9.95437788434638e-06, + "loss": 1.2005, + "step": 1776 + }, + { + "epoch": 0.14359886058304208, + "grad_norm": 2.9123761653900146, + "learning_rate": 9.954289647127516e-06, + "loss": 1.046, + "step": 1777 + }, + { + "epoch": 0.14367967029636963, + "grad_norm": 2.609844923019409, + "learning_rate": 9.954201325053596e-06, + "loss": 1.0015, + "step": 1778 + }, + { + "epoch": 0.14376048000969716, + "grad_norm": 2.655297040939331, + "learning_rate": 9.954112918126135e-06, + "loss": 0.9992, + "step": 1779 + }, + { + "epoch": 0.1438412897230247, + "grad_norm": 2.788531541824341, + "learning_rate": 9.954024426346645e-06, + "loss": 0.9095, + "step": 1780 + }, + { + "epoch": 0.14392209943635226, + "grad_norm": 2.9971466064453125, + "learning_rate": 9.953935849716645e-06, + "loss": 1.011, + "step": 1781 + }, + { + "epoch": 0.14400290914967978, + "grad_norm": 3.2861266136169434, + "learning_rate": 9.95384718823765e-06, + "loss": 0.8697, + "step": 1782 + }, + { + "epoch": 0.14408371886300733, + "grad_norm": 2.6745035648345947, + "learning_rate": 9.953758441911176e-06, + "loss": 1.005, + "step": 1783 + }, + { + "epoch": 0.14416452857633488, + "grad_norm": 2.98370623588562, + "learning_rate": 9.953669610738747e-06, + "loss": 0.9531, + "step": 1784 + }, + { + "epoch": 0.1442453382896624, + "grad_norm": 2.832010269165039, + "learning_rate": 9.953580694721882e-06, + "loss": 0.8957, + "step": 1785 + }, + { + "epoch": 0.14432614800298996, + "grad_norm": 2.6120967864990234, + "learning_rate": 9.953491693862107e-06, + "loss": 1.1036, + "step": 1786 + }, + { + "epoch": 0.1444069577163175, + "grad_norm": 2.9595401287078857, + "learning_rate": 9.953402608160943e-06, + "loss": 1.0677, + "step": 1787 + }, + { + "epoch": 0.14448776742964503, + "grad_norm": 2.821009397506714, + "learning_rate": 9.953313437619915e-06, + "loss": 0.9509, + "step": 1788 + }, + { + "epoch": 0.14456857714297258, + "grad_norm": 2.1895339488983154, + "learning_rate": 9.953224182240557e-06, + "loss": 1.1377, + "step": 1789 + }, + { + "epoch": 0.14464938685630013, + "grad_norm": 2.7714715003967285, + "learning_rate": 9.95313484202439e-06, + "loss": 1.0252, + "step": 1790 + }, + { + "epoch": 0.14473019656962766, + "grad_norm": 2.9314582347869873, + "learning_rate": 9.953045416972948e-06, + "loss": 0.8948, + "step": 1791 + }, + { + "epoch": 0.1448110062829552, + "grad_norm": 3.025941848754883, + "learning_rate": 9.952955907087762e-06, + "loss": 0.9994, + "step": 1792 + }, + { + "epoch": 0.14489181599628276, + "grad_norm": 3.127885580062866, + "learning_rate": 9.952866312370367e-06, + "loss": 1.0634, + "step": 1793 + }, + { + "epoch": 0.1449726257096103, + "grad_norm": 2.982555866241455, + "learning_rate": 9.952776632822293e-06, + "loss": 0.9424, + "step": 1794 + }, + { + "epoch": 0.14505343542293783, + "grad_norm": 2.7474989891052246, + "learning_rate": 9.95268686844508e-06, + "loss": 0.8819, + "step": 1795 + }, + { + "epoch": 0.14513424513626538, + "grad_norm": 2.7660906314849854, + "learning_rate": 9.952597019240264e-06, + "loss": 1.095, + "step": 1796 + }, + { + "epoch": 0.14521505484959293, + "grad_norm": 2.556553602218628, + "learning_rate": 9.952507085209382e-06, + "loss": 0.958, + "step": 1797 + }, + { + "epoch": 0.14529586456292046, + "grad_norm": 3.200990676879883, + "learning_rate": 9.952417066353979e-06, + "loss": 1.182, + "step": 1798 + }, + { + "epoch": 0.145376674276248, + "grad_norm": 2.2909719944000244, + "learning_rate": 9.952326962675593e-06, + "loss": 1.0834, + "step": 1799 + }, + { + "epoch": 0.14545748398957556, + "grad_norm": 3.0186989307403564, + "learning_rate": 9.952236774175767e-06, + "loss": 1.0537, + "step": 1800 + }, + { + "epoch": 0.14553829370290308, + "grad_norm": 2.985985040664673, + "learning_rate": 9.95214650085605e-06, + "loss": 0.9396, + "step": 1801 + }, + { + "epoch": 0.14561910341623063, + "grad_norm": 3.10266375541687, + "learning_rate": 9.952056142717983e-06, + "loss": 1.0285, + "step": 1802 + }, + { + "epoch": 0.14569991312955818, + "grad_norm": 2.8633596897125244, + "learning_rate": 9.951965699763118e-06, + "loss": 1.0462, + "step": 1803 + }, + { + "epoch": 0.1457807228428857, + "grad_norm": 3.117130994796753, + "learning_rate": 9.951875171993e-06, + "loss": 0.962, + "step": 1804 + }, + { + "epoch": 0.14586153255621326, + "grad_norm": 2.4210338592529297, + "learning_rate": 9.951784559409181e-06, + "loss": 1.1702, + "step": 1805 + }, + { + "epoch": 0.1459423422695408, + "grad_norm": 3.194343090057373, + "learning_rate": 9.951693862013214e-06, + "loss": 0.9176, + "step": 1806 + }, + { + "epoch": 0.14602315198286833, + "grad_norm": 2.8605997562408447, + "learning_rate": 9.951603079806653e-06, + "loss": 0.9523, + "step": 1807 + }, + { + "epoch": 0.14610396169619588, + "grad_norm": 2.895078182220459, + "learning_rate": 9.95151221279105e-06, + "loss": 0.9703, + "step": 1808 + }, + { + "epoch": 0.14618477140952343, + "grad_norm": 3.0536563396453857, + "learning_rate": 9.951421260967964e-06, + "loss": 0.9933, + "step": 1809 + }, + { + "epoch": 0.14626558112285096, + "grad_norm": 2.785574197769165, + "learning_rate": 9.951330224338952e-06, + "loss": 1.0731, + "step": 1810 + }, + { + "epoch": 0.1463463908361785, + "grad_norm": 2.6607959270477295, + "learning_rate": 9.951239102905573e-06, + "loss": 1.0461, + "step": 1811 + }, + { + "epoch": 0.14642720054950606, + "grad_norm": 2.855102300643921, + "learning_rate": 9.951147896669389e-06, + "loss": 1.1034, + "step": 1812 + }, + { + "epoch": 0.14650801026283358, + "grad_norm": 2.478165864944458, + "learning_rate": 9.951056605631959e-06, + "loss": 0.9691, + "step": 1813 + }, + { + "epoch": 0.14658881997616113, + "grad_norm": 2.6855709552764893, + "learning_rate": 9.950965229794849e-06, + "loss": 1.0317, + "step": 1814 + }, + { + "epoch": 0.14666962968948868, + "grad_norm": 2.644298791885376, + "learning_rate": 9.950873769159624e-06, + "loss": 1.0109, + "step": 1815 + }, + { + "epoch": 0.1467504394028162, + "grad_norm": 3.3271265029907227, + "learning_rate": 9.950782223727851e-06, + "loss": 1.0195, + "step": 1816 + }, + { + "epoch": 0.14683124911614376, + "grad_norm": 2.890562057495117, + "learning_rate": 9.950690593501096e-06, + "loss": 1.1402, + "step": 1817 + }, + { + "epoch": 0.1469120588294713, + "grad_norm": 2.3465516567230225, + "learning_rate": 9.950598878480928e-06, + "loss": 1.0696, + "step": 1818 + }, + { + "epoch": 0.14699286854279883, + "grad_norm": 2.366276741027832, + "learning_rate": 9.95050707866892e-06, + "loss": 0.9195, + "step": 1819 + }, + { + "epoch": 0.14707367825612638, + "grad_norm": 2.5351967811584473, + "learning_rate": 9.950415194066646e-06, + "loss": 1.016, + "step": 1820 + }, + { + "epoch": 0.14715448796945393, + "grad_norm": 2.9258339405059814, + "learning_rate": 9.950323224675675e-06, + "loss": 0.9287, + "step": 1821 + }, + { + "epoch": 0.14723529768278146, + "grad_norm": 3.2073264122009277, + "learning_rate": 9.950231170497585e-06, + "loss": 1.0693, + "step": 1822 + }, + { + "epoch": 0.147316107396109, + "grad_norm": 3.120288133621216, + "learning_rate": 9.950139031533952e-06, + "loss": 1.0181, + "step": 1823 + }, + { + "epoch": 0.14739691710943656, + "grad_norm": 2.8186001777648926, + "learning_rate": 9.950046807786355e-06, + "loss": 0.909, + "step": 1824 + }, + { + "epoch": 0.14747772682276408, + "grad_norm": 2.494920015335083, + "learning_rate": 9.949954499256373e-06, + "loss": 1.1287, + "step": 1825 + }, + { + "epoch": 0.14755853653609163, + "grad_norm": 2.6874499320983887, + "learning_rate": 9.949862105945587e-06, + "loss": 0.9388, + "step": 1826 + }, + { + "epoch": 0.14763934624941918, + "grad_norm": 2.686732292175293, + "learning_rate": 9.949769627855579e-06, + "loss": 0.9626, + "step": 1827 + }, + { + "epoch": 0.14772015596274674, + "grad_norm": 3.2707130908966064, + "learning_rate": 9.949677064987933e-06, + "loss": 1.0166, + "step": 1828 + }, + { + "epoch": 0.14780096567607426, + "grad_norm": 2.8146145343780518, + "learning_rate": 9.949584417344236e-06, + "loss": 0.9725, + "step": 1829 + }, + { + "epoch": 0.1478817753894018, + "grad_norm": 2.8295071125030518, + "learning_rate": 9.949491684926071e-06, + "loss": 1.0094, + "step": 1830 + }, + { + "epoch": 0.14796258510272936, + "grad_norm": 3.5557024478912354, + "learning_rate": 9.949398867735032e-06, + "loss": 0.967, + "step": 1831 + }, + { + "epoch": 0.14804339481605688, + "grad_norm": 3.3532896041870117, + "learning_rate": 9.949305965772704e-06, + "loss": 1.1545, + "step": 1832 + }, + { + "epoch": 0.14812420452938443, + "grad_norm": 3.946514844894409, + "learning_rate": 9.94921297904068e-06, + "loss": 0.9121, + "step": 1833 + }, + { + "epoch": 0.14820501424271199, + "grad_norm": 2.8444509506225586, + "learning_rate": 9.949119907540552e-06, + "loss": 1.0807, + "step": 1834 + }, + { + "epoch": 0.1482858239560395, + "grad_norm": 3.132686138153076, + "learning_rate": 9.949026751273916e-06, + "loss": 1.1495, + "step": 1835 + }, + { + "epoch": 0.14836663366936706, + "grad_norm": 2.9394941329956055, + "learning_rate": 9.948933510242365e-06, + "loss": 0.9223, + "step": 1836 + }, + { + "epoch": 0.1484474433826946, + "grad_norm": 3.1135542392730713, + "learning_rate": 9.948840184447497e-06, + "loss": 1.0792, + "step": 1837 + }, + { + "epoch": 0.14852825309602213, + "grad_norm": 3.050935745239258, + "learning_rate": 9.94874677389091e-06, + "loss": 1.0063, + "step": 1838 + }, + { + "epoch": 0.14860906280934968, + "grad_norm": 2.676487684249878, + "learning_rate": 9.948653278574205e-06, + "loss": 0.9533, + "step": 1839 + }, + { + "epoch": 0.14868987252267724, + "grad_norm": 2.9276177883148193, + "learning_rate": 9.948559698498984e-06, + "loss": 0.9597, + "step": 1840 + }, + { + "epoch": 0.14877068223600476, + "grad_norm": 2.749194383621216, + "learning_rate": 9.948466033666846e-06, + "loss": 1.0334, + "step": 1841 + }, + { + "epoch": 0.1488514919493323, + "grad_norm": 2.471069812774658, + "learning_rate": 9.948372284079398e-06, + "loss": 1.0528, + "step": 1842 + }, + { + "epoch": 0.14893230166265986, + "grad_norm": 3.0131027698516846, + "learning_rate": 9.948278449738246e-06, + "loss": 0.9566, + "step": 1843 + }, + { + "epoch": 0.14901311137598738, + "grad_norm": 3.0999159812927246, + "learning_rate": 9.948184530644998e-06, + "loss": 1.0947, + "step": 1844 + }, + { + "epoch": 0.14909392108931493, + "grad_norm": 2.7991368770599365, + "learning_rate": 9.948090526801259e-06, + "loss": 0.9532, + "step": 1845 + }, + { + "epoch": 0.14917473080264249, + "grad_norm": 2.8065764904022217, + "learning_rate": 9.947996438208644e-06, + "loss": 1.0142, + "step": 1846 + }, + { + "epoch": 0.14925554051597, + "grad_norm": 2.9444782733917236, + "learning_rate": 9.94790226486876e-06, + "loss": 1.0238, + "step": 1847 + }, + { + "epoch": 0.14933635022929756, + "grad_norm": 2.598299980163574, + "learning_rate": 9.947808006783223e-06, + "loss": 1.0269, + "step": 1848 + }, + { + "epoch": 0.1494171599426251, + "grad_norm": 3.035118579864502, + "learning_rate": 9.947713663953644e-06, + "loss": 1.0333, + "step": 1849 + }, + { + "epoch": 0.14949796965595263, + "grad_norm": 3.401883840560913, + "learning_rate": 9.947619236381644e-06, + "loss": 0.9576, + "step": 1850 + }, + { + "epoch": 0.14957877936928018, + "grad_norm": 2.4581234455108643, + "learning_rate": 9.947524724068835e-06, + "loss": 1.0626, + "step": 1851 + }, + { + "epoch": 0.14965958908260774, + "grad_norm": 2.7806568145751953, + "learning_rate": 9.94743012701684e-06, + "loss": 1.0083, + "step": 1852 + }, + { + "epoch": 0.14974039879593526, + "grad_norm": 2.992265224456787, + "learning_rate": 9.947335445227276e-06, + "loss": 1.1236, + "step": 1853 + }, + { + "epoch": 0.1498212085092628, + "grad_norm": 2.996896982192993, + "learning_rate": 9.947240678701766e-06, + "loss": 0.9795, + "step": 1854 + }, + { + "epoch": 0.14990201822259036, + "grad_norm": 2.843867778778076, + "learning_rate": 9.947145827441934e-06, + "loss": 1.0279, + "step": 1855 + }, + { + "epoch": 0.14998282793591788, + "grad_norm": 3.454298734664917, + "learning_rate": 9.947050891449403e-06, + "loss": 1.1464, + "step": 1856 + }, + { + "epoch": 0.15006363764924543, + "grad_norm": 2.8720662593841553, + "learning_rate": 9.9469558707258e-06, + "loss": 0.9398, + "step": 1857 + }, + { + "epoch": 0.15014444736257299, + "grad_norm": 2.728085994720459, + "learning_rate": 9.946860765272753e-06, + "loss": 0.9881, + "step": 1858 + }, + { + "epoch": 0.15022525707590054, + "grad_norm": 2.709182024002075, + "learning_rate": 9.94676557509189e-06, + "loss": 1.1373, + "step": 1859 + }, + { + "epoch": 0.15030606678922806, + "grad_norm": 3.1405465602874756, + "learning_rate": 9.946670300184841e-06, + "loss": 1.0212, + "step": 1860 + }, + { + "epoch": 0.1503868765025556, + "grad_norm": 4.029134750366211, + "learning_rate": 9.946574940553238e-06, + "loss": 0.9502, + "step": 1861 + }, + { + "epoch": 0.15046768621588316, + "grad_norm": 3.1117727756500244, + "learning_rate": 9.946479496198715e-06, + "loss": 1.0837, + "step": 1862 + }, + { + "epoch": 0.15054849592921069, + "grad_norm": 2.859290361404419, + "learning_rate": 9.946383967122907e-06, + "loss": 1.1341, + "step": 1863 + }, + { + "epoch": 0.15062930564253824, + "grad_norm": 2.6758170127868652, + "learning_rate": 9.94628835332745e-06, + "loss": 1.1002, + "step": 1864 + }, + { + "epoch": 0.1507101153558658, + "grad_norm": 3.0312492847442627, + "learning_rate": 9.94619265481398e-06, + "loss": 0.939, + "step": 1865 + }, + { + "epoch": 0.1507909250691933, + "grad_norm": 2.5172882080078125, + "learning_rate": 9.946096871584138e-06, + "loss": 1.0346, + "step": 1866 + }, + { + "epoch": 0.15087173478252086, + "grad_norm": 3.1140804290771484, + "learning_rate": 9.946001003639562e-06, + "loss": 1.0218, + "step": 1867 + }, + { + "epoch": 0.1509525444958484, + "grad_norm": 2.838855266571045, + "learning_rate": 9.9459050509819e-06, + "loss": 0.9093, + "step": 1868 + }, + { + "epoch": 0.15103335420917594, + "grad_norm": 2.6162962913513184, + "learning_rate": 9.945809013612787e-06, + "loss": 0.9472, + "step": 1869 + }, + { + "epoch": 0.15111416392250349, + "grad_norm": 4.642756938934326, + "learning_rate": 9.945712891533874e-06, + "loss": 1.0032, + "step": 1870 + }, + { + "epoch": 0.15119497363583104, + "grad_norm": 3.521791696548462, + "learning_rate": 9.945616684746805e-06, + "loss": 1.053, + "step": 1871 + }, + { + "epoch": 0.15127578334915856, + "grad_norm": 3.1945931911468506, + "learning_rate": 9.945520393253228e-06, + "loss": 1.0093, + "step": 1872 + }, + { + "epoch": 0.1513565930624861, + "grad_norm": 3.0325980186462402, + "learning_rate": 9.945424017054794e-06, + "loss": 0.9768, + "step": 1873 + }, + { + "epoch": 0.15143740277581366, + "grad_norm": 2.7389962673187256, + "learning_rate": 9.945327556153151e-06, + "loss": 1.1918, + "step": 1874 + }, + { + "epoch": 0.15151821248914119, + "grad_norm": 2.963046073913574, + "learning_rate": 9.945231010549952e-06, + "loss": 0.9698, + "step": 1875 + }, + { + "epoch": 0.15159902220246874, + "grad_norm": 3.454117774963379, + "learning_rate": 9.945134380246853e-06, + "loss": 1.0315, + "step": 1876 + }, + { + "epoch": 0.1516798319157963, + "grad_norm": 3.3037338256835938, + "learning_rate": 9.945037665245504e-06, + "loss": 0.9624, + "step": 1877 + }, + { + "epoch": 0.1517606416291238, + "grad_norm": 2.6708765029907227, + "learning_rate": 9.944940865547566e-06, + "loss": 0.9077, + "step": 1878 + }, + { + "epoch": 0.15184145134245136, + "grad_norm": 2.526172399520874, + "learning_rate": 9.944843981154696e-06, + "loss": 1.041, + "step": 1879 + }, + { + "epoch": 0.1519222610557789, + "grad_norm": 2.736255407333374, + "learning_rate": 9.944747012068553e-06, + "loss": 1.0002, + "step": 1880 + }, + { + "epoch": 0.15200307076910644, + "grad_norm": 2.9772162437438965, + "learning_rate": 9.944649958290796e-06, + "loss": 0.9891, + "step": 1881 + }, + { + "epoch": 0.152083880482434, + "grad_norm": 2.9495186805725098, + "learning_rate": 9.94455281982309e-06, + "loss": 1.118, + "step": 1882 + }, + { + "epoch": 0.15216469019576154, + "grad_norm": 2.974454879760742, + "learning_rate": 9.944455596667097e-06, + "loss": 0.9662, + "step": 1883 + }, + { + "epoch": 0.15224549990908906, + "grad_norm": 2.9383349418640137, + "learning_rate": 9.944358288824485e-06, + "loss": 1.0998, + "step": 1884 + }, + { + "epoch": 0.1523263096224166, + "grad_norm": 2.7243707180023193, + "learning_rate": 9.944260896296917e-06, + "loss": 1.0039, + "step": 1885 + }, + { + "epoch": 0.15240711933574416, + "grad_norm": 3.1253015995025635, + "learning_rate": 9.944163419086062e-06, + "loss": 1.0838, + "step": 1886 + }, + { + "epoch": 0.15248792904907169, + "grad_norm": 3.283964157104492, + "learning_rate": 9.944065857193591e-06, + "loss": 0.9866, + "step": 1887 + }, + { + "epoch": 0.15256873876239924, + "grad_norm": 3.455512523651123, + "learning_rate": 9.943968210621174e-06, + "loss": 1.0921, + "step": 1888 + }, + { + "epoch": 0.1526495484757268, + "grad_norm": 2.915229320526123, + "learning_rate": 9.943870479370485e-06, + "loss": 1.0121, + "step": 1889 + }, + { + "epoch": 0.1527303581890543, + "grad_norm": 2.8723795413970947, + "learning_rate": 9.943772663443194e-06, + "loss": 1.0359, + "step": 1890 + }, + { + "epoch": 0.15281116790238186, + "grad_norm": 3.028423547744751, + "learning_rate": 9.94367476284098e-06, + "loss": 0.9814, + "step": 1891 + }, + { + "epoch": 0.1528919776157094, + "grad_norm": 2.655653238296509, + "learning_rate": 9.94357677756552e-06, + "loss": 1.1435, + "step": 1892 + }, + { + "epoch": 0.15297278732903696, + "grad_norm": 2.7452893257141113, + "learning_rate": 9.94347870761849e-06, + "loss": 1.0701, + "step": 1893 + }, + { + "epoch": 0.1530535970423645, + "grad_norm": 2.8322525024414062, + "learning_rate": 9.943380553001571e-06, + "loss": 0.9368, + "step": 1894 + }, + { + "epoch": 0.15313440675569204, + "grad_norm": 2.974698305130005, + "learning_rate": 9.943282313716444e-06, + "loss": 1.0267, + "step": 1895 + }, + { + "epoch": 0.1532152164690196, + "grad_norm": 3.2014925479888916, + "learning_rate": 9.94318398976479e-06, + "loss": 1.0498, + "step": 1896 + }, + { + "epoch": 0.1532960261823471, + "grad_norm": 2.808967351913452, + "learning_rate": 9.943085581148296e-06, + "loss": 0.9781, + "step": 1897 + }, + { + "epoch": 0.15337683589567466, + "grad_norm": 2.5994133949279785, + "learning_rate": 9.942987087868646e-06, + "loss": 1.033, + "step": 1898 + }, + { + "epoch": 0.1534576456090022, + "grad_norm": 2.7472386360168457, + "learning_rate": 9.942888509927525e-06, + "loss": 1.0477, + "step": 1899 + }, + { + "epoch": 0.15353845532232974, + "grad_norm": 3.5498239994049072, + "learning_rate": 9.942789847326626e-06, + "loss": 1.0185, + "step": 1900 + }, + { + "epoch": 0.1536192650356573, + "grad_norm": 2.7498979568481445, + "learning_rate": 9.942691100067635e-06, + "loss": 1.0287, + "step": 1901 + }, + { + "epoch": 0.15370007474898484, + "grad_norm": 2.7843916416168213, + "learning_rate": 9.942592268152244e-06, + "loss": 1.0318, + "step": 1902 + }, + { + "epoch": 0.15378088446231236, + "grad_norm": 2.5021860599517822, + "learning_rate": 9.942493351582147e-06, + "loss": 1.0759, + "step": 1903 + }, + { + "epoch": 0.1538616941756399, + "grad_norm": 2.946166753768921, + "learning_rate": 9.942394350359038e-06, + "loss": 0.9596, + "step": 1904 + }, + { + "epoch": 0.15394250388896746, + "grad_norm": 2.8323495388031006, + "learning_rate": 9.942295264484612e-06, + "loss": 1.1329, + "step": 1905 + }, + { + "epoch": 0.154023313602295, + "grad_norm": 2.8073818683624268, + "learning_rate": 9.942196093960564e-06, + "loss": 1.066, + "step": 1906 + }, + { + "epoch": 0.15410412331562254, + "grad_norm": 2.6901755332946777, + "learning_rate": 9.942096838788598e-06, + "loss": 1.0574, + "step": 1907 + }, + { + "epoch": 0.1541849330289501, + "grad_norm": 3.3798248767852783, + "learning_rate": 9.941997498970408e-06, + "loss": 1.1475, + "step": 1908 + }, + { + "epoch": 0.1542657427422776, + "grad_norm": 2.586766004562378, + "learning_rate": 9.941898074507698e-06, + "loss": 1.0121, + "step": 1909 + }, + { + "epoch": 0.15434655245560516, + "grad_norm": 2.8129591941833496, + "learning_rate": 9.941798565402175e-06, + "loss": 1.1243, + "step": 1910 + }, + { + "epoch": 0.1544273621689327, + "grad_norm": 2.8571395874023438, + "learning_rate": 9.941698971655536e-06, + "loss": 0.9355, + "step": 1911 + }, + { + "epoch": 0.15450817188226024, + "grad_norm": 2.6519362926483154, + "learning_rate": 9.94159929326949e-06, + "loss": 0.9789, + "step": 1912 + }, + { + "epoch": 0.1545889815955878, + "grad_norm": 2.305694818496704, + "learning_rate": 9.941499530245746e-06, + "loss": 0.8932, + "step": 1913 + }, + { + "epoch": 0.15466979130891534, + "grad_norm": 2.8709194660186768, + "learning_rate": 9.94139968258601e-06, + "loss": 1.0991, + "step": 1914 + }, + { + "epoch": 0.15475060102224286, + "grad_norm": 3.5704281330108643, + "learning_rate": 9.941299750291994e-06, + "loss": 1.0547, + "step": 1915 + }, + { + "epoch": 0.1548314107355704, + "grad_norm": 2.815941095352173, + "learning_rate": 9.94119973336541e-06, + "loss": 1.1171, + "step": 1916 + }, + { + "epoch": 0.15491222044889796, + "grad_norm": 2.738818407058716, + "learning_rate": 9.941099631807968e-06, + "loss": 1.1006, + "step": 1917 + }, + { + "epoch": 0.1549930301622255, + "grad_norm": 2.942873239517212, + "learning_rate": 9.940999445621387e-06, + "loss": 1.0785, + "step": 1918 + }, + { + "epoch": 0.15507383987555304, + "grad_norm": 2.959956407546997, + "learning_rate": 9.940899174807379e-06, + "loss": 1.051, + "step": 1919 + }, + { + "epoch": 0.1551546495888806, + "grad_norm": 3.177203893661499, + "learning_rate": 9.940798819367663e-06, + "loss": 1.0827, + "step": 1920 + }, + { + "epoch": 0.1552354593022081, + "grad_norm": 2.9764657020568848, + "learning_rate": 9.940698379303958e-06, + "loss": 0.9899, + "step": 1921 + }, + { + "epoch": 0.15531626901553566, + "grad_norm": 3.004303455352783, + "learning_rate": 9.940597854617984e-06, + "loss": 1.0142, + "step": 1922 + }, + { + "epoch": 0.1553970787288632, + "grad_norm": 2.973814010620117, + "learning_rate": 9.940497245311462e-06, + "loss": 0.9586, + "step": 1923 + }, + { + "epoch": 0.15547788844219076, + "grad_norm": 2.8607869148254395, + "learning_rate": 9.940396551386117e-06, + "loss": 0.9894, + "step": 1924 + }, + { + "epoch": 0.1555586981555183, + "grad_norm": 3.0081584453582764, + "learning_rate": 9.940295772843673e-06, + "loss": 1.0247, + "step": 1925 + }, + { + "epoch": 0.15563950786884584, + "grad_norm": 3.7671658992767334, + "learning_rate": 9.940194909685854e-06, + "loss": 1.0165, + "step": 1926 + }, + { + "epoch": 0.1557203175821734, + "grad_norm": 2.9663097858428955, + "learning_rate": 9.94009396191439e-06, + "loss": 0.9678, + "step": 1927 + }, + { + "epoch": 0.1558011272955009, + "grad_norm": 3.5792739391326904, + "learning_rate": 9.93999292953101e-06, + "loss": 0.9796, + "step": 1928 + }, + { + "epoch": 0.15588193700882846, + "grad_norm": 2.8195860385894775, + "learning_rate": 9.939891812537442e-06, + "loss": 0.961, + "step": 1929 + }, + { + "epoch": 0.15596274672215601, + "grad_norm": 2.773045539855957, + "learning_rate": 9.93979061093542e-06, + "loss": 1.0076, + "step": 1930 + }, + { + "epoch": 0.15604355643548354, + "grad_norm": 3.2170090675354004, + "learning_rate": 9.939689324726678e-06, + "loss": 1.0341, + "step": 1931 + }, + { + "epoch": 0.1561243661488111, + "grad_norm": 2.630054235458374, + "learning_rate": 9.939587953912949e-06, + "loss": 0.9949, + "step": 1932 + }, + { + "epoch": 0.15620517586213864, + "grad_norm": 2.7837579250335693, + "learning_rate": 9.93948649849597e-06, + "loss": 1.1105, + "step": 1933 + }, + { + "epoch": 0.15628598557546616, + "grad_norm": 3.19868803024292, + "learning_rate": 9.939384958477478e-06, + "loss": 1.0349, + "step": 1934 + }, + { + "epoch": 0.15636679528879371, + "grad_norm": 2.537768840789795, + "learning_rate": 9.939283333859214e-06, + "loss": 0.9077, + "step": 1935 + }, + { + "epoch": 0.15644760500212126, + "grad_norm": 3.3658368587493896, + "learning_rate": 9.939181624642917e-06, + "loss": 0.9721, + "step": 1936 + }, + { + "epoch": 0.1565284147154488, + "grad_norm": 3.2723824977874756, + "learning_rate": 9.939079830830329e-06, + "loss": 0.9994, + "step": 1937 + }, + { + "epoch": 0.15660922442877634, + "grad_norm": 3.204620838165283, + "learning_rate": 9.938977952423193e-06, + "loss": 1.0439, + "step": 1938 + }, + { + "epoch": 0.1566900341421039, + "grad_norm": 3.06638240814209, + "learning_rate": 9.938875989423255e-06, + "loss": 1.0653, + "step": 1939 + }, + { + "epoch": 0.1567708438554314, + "grad_norm": 2.6601171493530273, + "learning_rate": 9.938773941832263e-06, + "loss": 0.9215, + "step": 1940 + }, + { + "epoch": 0.15685165356875896, + "grad_norm": 3.1091103553771973, + "learning_rate": 9.938671809651961e-06, + "loss": 0.9964, + "step": 1941 + }, + { + "epoch": 0.15693246328208652, + "grad_norm": 3.090252637863159, + "learning_rate": 9.938569592884101e-06, + "loss": 1.011, + "step": 1942 + }, + { + "epoch": 0.15701327299541404, + "grad_norm": 3.389678716659546, + "learning_rate": 9.938467291530434e-06, + "loss": 1.0345, + "step": 1943 + }, + { + "epoch": 0.1570940827087416, + "grad_norm": 3.044147491455078, + "learning_rate": 9.93836490559271e-06, + "loss": 1.0974, + "step": 1944 + }, + { + "epoch": 0.15717489242206914, + "grad_norm": 2.801713228225708, + "learning_rate": 9.938262435072683e-06, + "loss": 1.0745, + "step": 1945 + }, + { + "epoch": 0.15725570213539666, + "grad_norm": 3.206634283065796, + "learning_rate": 9.93815987997211e-06, + "loss": 1.0328, + "step": 1946 + }, + { + "epoch": 0.15733651184872421, + "grad_norm": 2.9063611030578613, + "learning_rate": 9.938057240292747e-06, + "loss": 1.0866, + "step": 1947 + }, + { + "epoch": 0.15741732156205177, + "grad_norm": 2.8502037525177, + "learning_rate": 9.93795451603635e-06, + "loss": 0.9957, + "step": 1948 + }, + { + "epoch": 0.1574981312753793, + "grad_norm": 2.766782760620117, + "learning_rate": 9.937851707204682e-06, + "loss": 0.9233, + "step": 1949 + }, + { + "epoch": 0.15757894098870684, + "grad_norm": 2.7361745834350586, + "learning_rate": 9.937748813799499e-06, + "loss": 0.9254, + "step": 1950 + }, + { + "epoch": 0.1576597507020344, + "grad_norm": 2.6297945976257324, + "learning_rate": 9.937645835822567e-06, + "loss": 0.9656, + "step": 1951 + }, + { + "epoch": 0.1577405604153619, + "grad_norm": 2.5020108222961426, + "learning_rate": 9.937542773275648e-06, + "loss": 1.1277, + "step": 1952 + }, + { + "epoch": 0.15782137012868946, + "grad_norm": 2.625633955001831, + "learning_rate": 9.93743962616051e-06, + "loss": 1.0259, + "step": 1953 + }, + { + "epoch": 0.15790217984201702, + "grad_norm": 2.857710123062134, + "learning_rate": 9.937336394478916e-06, + "loss": 1.0568, + "step": 1954 + }, + { + "epoch": 0.15798298955534457, + "grad_norm": 2.824298143386841, + "learning_rate": 9.937233078232636e-06, + "loss": 0.8799, + "step": 1955 + }, + { + "epoch": 0.1580637992686721, + "grad_norm": 2.856640338897705, + "learning_rate": 9.937129677423439e-06, + "loss": 1.0913, + "step": 1956 + }, + { + "epoch": 0.15814460898199964, + "grad_norm": 3.206674098968506, + "learning_rate": 9.937026192053097e-06, + "loss": 0.9916, + "step": 1957 + }, + { + "epoch": 0.1582254186953272, + "grad_norm": 2.5003178119659424, + "learning_rate": 9.936922622123382e-06, + "loss": 1.0594, + "step": 1958 + }, + { + "epoch": 0.15830622840865471, + "grad_norm": 2.9067587852478027, + "learning_rate": 9.936818967636065e-06, + "loss": 0.9746, + "step": 1959 + }, + { + "epoch": 0.15838703812198227, + "grad_norm": 3.135910749435425, + "learning_rate": 9.936715228592927e-06, + "loss": 1.0536, + "step": 1960 + }, + { + "epoch": 0.15846784783530982, + "grad_norm": 2.9614381790161133, + "learning_rate": 9.93661140499574e-06, + "loss": 1.0545, + "step": 1961 + }, + { + "epoch": 0.15854865754863734, + "grad_norm": 2.7072744369506836, + "learning_rate": 9.936507496846286e-06, + "loss": 0.9803, + "step": 1962 + }, + { + "epoch": 0.1586294672619649, + "grad_norm": 3.2678558826446533, + "learning_rate": 9.936403504146341e-06, + "loss": 1.0455, + "step": 1963 + }, + { + "epoch": 0.15871027697529244, + "grad_norm": 2.2548952102661133, + "learning_rate": 9.93629942689769e-06, + "loss": 1.0457, + "step": 1964 + }, + { + "epoch": 0.15879108668861996, + "grad_norm": 2.572094440460205, + "learning_rate": 9.936195265102111e-06, + "loss": 1.0572, + "step": 1965 + }, + { + "epoch": 0.15887189640194752, + "grad_norm": 2.6914725303649902, + "learning_rate": 9.936091018761392e-06, + "loss": 0.9983, + "step": 1966 + }, + { + "epoch": 0.15895270611527507, + "grad_norm": 2.4944095611572266, + "learning_rate": 9.935986687877314e-06, + "loss": 1.0104, + "step": 1967 + }, + { + "epoch": 0.1590335158286026, + "grad_norm": 2.5117979049682617, + "learning_rate": 9.93588227245167e-06, + "loss": 1.1069, + "step": 1968 + }, + { + "epoch": 0.15911432554193014, + "grad_norm": 2.5497148036956787, + "learning_rate": 9.935777772486244e-06, + "loss": 1.1017, + "step": 1969 + }, + { + "epoch": 0.1591951352552577, + "grad_norm": 3.289396286010742, + "learning_rate": 9.935673187982828e-06, + "loss": 1.0039, + "step": 1970 + }, + { + "epoch": 0.15927594496858521, + "grad_norm": 2.688300609588623, + "learning_rate": 9.935568518943213e-06, + "loss": 0.8942, + "step": 1971 + }, + { + "epoch": 0.15935675468191277, + "grad_norm": 2.555457353591919, + "learning_rate": 9.935463765369192e-06, + "loss": 1.0607, + "step": 1972 + }, + { + "epoch": 0.15943756439524032, + "grad_norm": 2.710364580154419, + "learning_rate": 9.935358927262554e-06, + "loss": 1.1108, + "step": 1973 + }, + { + "epoch": 0.15951837410856784, + "grad_norm": 2.998103380203247, + "learning_rate": 9.935254004625104e-06, + "loss": 1.0177, + "step": 1974 + }, + { + "epoch": 0.1595991838218954, + "grad_norm": 2.5873477458953857, + "learning_rate": 9.93514899745863e-06, + "loss": 0.9205, + "step": 1975 + }, + { + "epoch": 0.15967999353522294, + "grad_norm": 2.698263645172119, + "learning_rate": 9.935043905764936e-06, + "loss": 0.8586, + "step": 1976 + }, + { + "epoch": 0.15976080324855046, + "grad_norm": 2.9557156562805176, + "learning_rate": 9.934938729545823e-06, + "loss": 1.0399, + "step": 1977 + }, + { + "epoch": 0.15984161296187802, + "grad_norm": 3.3453972339630127, + "learning_rate": 9.934833468803087e-06, + "loss": 0.9828, + "step": 1978 + }, + { + "epoch": 0.15992242267520557, + "grad_norm": 3.246471643447876, + "learning_rate": 9.934728123538534e-06, + "loss": 0.9561, + "step": 1979 + }, + { + "epoch": 0.1600032323885331, + "grad_norm": 2.8648548126220703, + "learning_rate": 9.934622693753968e-06, + "loss": 1.0334, + "step": 1980 + }, + { + "epoch": 0.16008404210186064, + "grad_norm": 2.9262137413024902, + "learning_rate": 9.934517179451197e-06, + "loss": 1.1668, + "step": 1981 + }, + { + "epoch": 0.1601648518151882, + "grad_norm": 2.7647500038146973, + "learning_rate": 9.934411580632025e-06, + "loss": 0.9749, + "step": 1982 + }, + { + "epoch": 0.16024566152851571, + "grad_norm": 2.793349266052246, + "learning_rate": 9.93430589729826e-06, + "loss": 1.0755, + "step": 1983 + }, + { + "epoch": 0.16032647124184327, + "grad_norm": 3.2601189613342285, + "learning_rate": 9.934200129451716e-06, + "loss": 1.0347, + "step": 1984 + }, + { + "epoch": 0.16040728095517082, + "grad_norm": 3.3131632804870605, + "learning_rate": 9.934094277094202e-06, + "loss": 1.1393, + "step": 1985 + }, + { + "epoch": 0.16048809066849834, + "grad_norm": 2.795083999633789, + "learning_rate": 9.93398834022753e-06, + "loss": 1.0861, + "step": 1986 + }, + { + "epoch": 0.1605689003818259, + "grad_norm": 3.098545551300049, + "learning_rate": 9.933882318853517e-06, + "loss": 1.0253, + "step": 1987 + }, + { + "epoch": 0.16064971009515344, + "grad_norm": 2.473449468612671, + "learning_rate": 9.933776212973977e-06, + "loss": 0.9149, + "step": 1988 + }, + { + "epoch": 0.160730519808481, + "grad_norm": 2.8285419940948486, + "learning_rate": 9.933670022590729e-06, + "loss": 0.9828, + "step": 1989 + }, + { + "epoch": 0.16081132952180852, + "grad_norm": 2.5255420207977295, + "learning_rate": 9.93356374770559e-06, + "loss": 1.0115, + "step": 1990 + }, + { + "epoch": 0.16089213923513607, + "grad_norm": 2.6797611713409424, + "learning_rate": 9.933457388320382e-06, + "loss": 0.9577, + "step": 1991 + }, + { + "epoch": 0.16097294894846362, + "grad_norm": 2.7080936431884766, + "learning_rate": 9.933350944436925e-06, + "loss": 0.9191, + "step": 1992 + }, + { + "epoch": 0.16105375866179114, + "grad_norm": 2.987670660018921, + "learning_rate": 9.933244416057044e-06, + "loss": 0.9964, + "step": 1993 + }, + { + "epoch": 0.1611345683751187, + "grad_norm": 2.8873672485351562, + "learning_rate": 9.93313780318256e-06, + "loss": 0.9755, + "step": 1994 + }, + { + "epoch": 0.16121537808844624, + "grad_norm": 2.718186855316162, + "learning_rate": 9.933031105815304e-06, + "loss": 0.909, + "step": 1995 + }, + { + "epoch": 0.16129618780177377, + "grad_norm": 2.6171813011169434, + "learning_rate": 9.9329243239571e-06, + "loss": 1.0523, + "step": 1996 + }, + { + "epoch": 0.16137699751510132, + "grad_norm": 2.519484043121338, + "learning_rate": 9.932817457609777e-06, + "loss": 1.0525, + "step": 1997 + }, + { + "epoch": 0.16145780722842887, + "grad_norm": 2.6761474609375, + "learning_rate": 9.932710506775169e-06, + "loss": 0.9614, + "step": 1998 + }, + { + "epoch": 0.1615386169417564, + "grad_norm": 3.2018771171569824, + "learning_rate": 9.932603471455101e-06, + "loss": 0.898, + "step": 1999 + }, + { + "epoch": 0.16161942665508394, + "grad_norm": 3.4662911891937256, + "learning_rate": 9.932496351651413e-06, + "loss": 0.9578, + "step": 2000 + }, + { + "epoch": 0.16161942665508394, + "eval_loss": 0.8565676212310791, + "eval_runtime": 811.8928, + "eval_samples_per_second": 102.681, + "eval_steps_per_second": 12.835, + "step": 2000 + }, + { + "epoch": 0.1617002363684115, + "grad_norm": 3.3392117023468018, + "learning_rate": 9.932389147365937e-06, + "loss": 0.9734, + "step": 2001 + }, + { + "epoch": 0.16178104608173902, + "grad_norm": 3.312257766723633, + "learning_rate": 9.932281858600508e-06, + "loss": 1.0206, + "step": 2002 + }, + { + "epoch": 0.16186185579506657, + "grad_norm": 2.7282278537750244, + "learning_rate": 9.932174485356965e-06, + "loss": 1.0603, + "step": 2003 + }, + { + "epoch": 0.16194266550839412, + "grad_norm": 2.7596275806427, + "learning_rate": 9.932067027637148e-06, + "loss": 0.9617, + "step": 2004 + }, + { + "epoch": 0.16202347522172164, + "grad_norm": 3.511976480484009, + "learning_rate": 9.931959485442895e-06, + "loss": 1.0428, + "step": 2005 + }, + { + "epoch": 0.1621042849350492, + "grad_norm": 2.7216062545776367, + "learning_rate": 9.93185185877605e-06, + "loss": 1.0315, + "step": 2006 + }, + { + "epoch": 0.16218509464837674, + "grad_norm": 2.992203712463379, + "learning_rate": 9.931744147638456e-06, + "loss": 0.9996, + "step": 2007 + }, + { + "epoch": 0.16226590436170427, + "grad_norm": 2.978989839553833, + "learning_rate": 9.931636352031957e-06, + "loss": 1.0715, + "step": 2008 + }, + { + "epoch": 0.16234671407503182, + "grad_norm": 3.601834535598755, + "learning_rate": 9.931528471958398e-06, + "loss": 0.9904, + "step": 2009 + }, + { + "epoch": 0.16242752378835937, + "grad_norm": 3.0366740226745605, + "learning_rate": 9.93142050741963e-06, + "loss": 0.9713, + "step": 2010 + }, + { + "epoch": 0.1625083335016869, + "grad_norm": 3.2024919986724854, + "learning_rate": 9.931312458417501e-06, + "loss": 1.0013, + "step": 2011 + }, + { + "epoch": 0.16258914321501444, + "grad_norm": 3.1184558868408203, + "learning_rate": 9.93120432495386e-06, + "loss": 0.9458, + "step": 2012 + }, + { + "epoch": 0.162669952928342, + "grad_norm": 3.003923177719116, + "learning_rate": 9.931096107030561e-06, + "loss": 1.0266, + "step": 2013 + }, + { + "epoch": 0.16275076264166952, + "grad_norm": 3.309068441390991, + "learning_rate": 9.930987804649456e-06, + "loss": 0.9442, + "step": 2014 + }, + { + "epoch": 0.16283157235499707, + "grad_norm": 2.885678768157959, + "learning_rate": 9.930879417812402e-06, + "loss": 1.1082, + "step": 2015 + }, + { + "epoch": 0.16291238206832462, + "grad_norm": 2.6922783851623535, + "learning_rate": 9.930770946521254e-06, + "loss": 1.027, + "step": 2016 + }, + { + "epoch": 0.16299319178165214, + "grad_norm": 3.1714210510253906, + "learning_rate": 9.93066239077787e-06, + "loss": 1.0827, + "step": 2017 + }, + { + "epoch": 0.1630740014949797, + "grad_norm": 2.744509220123291, + "learning_rate": 9.930553750584108e-06, + "loss": 0.9794, + "step": 2018 + }, + { + "epoch": 0.16315481120830724, + "grad_norm": 3.1168100833892822, + "learning_rate": 9.930445025941833e-06, + "loss": 1.064, + "step": 2019 + }, + { + "epoch": 0.1632356209216348, + "grad_norm": 2.8695626258850098, + "learning_rate": 9.930336216852902e-06, + "loss": 1.0854, + "step": 2020 + }, + { + "epoch": 0.16331643063496232, + "grad_norm": 2.774207592010498, + "learning_rate": 9.930227323319182e-06, + "loss": 0.9408, + "step": 2021 + }, + { + "epoch": 0.16339724034828987, + "grad_norm": 2.645362377166748, + "learning_rate": 9.930118345342535e-06, + "loss": 1.0234, + "step": 2022 + }, + { + "epoch": 0.16347805006161742, + "grad_norm": 2.7598698139190674, + "learning_rate": 9.930009282924831e-06, + "loss": 1.0593, + "step": 2023 + }, + { + "epoch": 0.16355885977494494, + "grad_norm": 2.9732730388641357, + "learning_rate": 9.929900136067936e-06, + "loss": 1.1, + "step": 2024 + }, + { + "epoch": 0.1636396694882725, + "grad_norm": 3.9611868858337402, + "learning_rate": 9.929790904773722e-06, + "loss": 0.9501, + "step": 2025 + }, + { + "epoch": 0.16372047920160004, + "grad_norm": 2.8488450050354004, + "learning_rate": 9.929681589044056e-06, + "loss": 0.992, + "step": 2026 + }, + { + "epoch": 0.16380128891492757, + "grad_norm": 2.877012014389038, + "learning_rate": 9.929572188880811e-06, + "loss": 1.1216, + "step": 2027 + }, + { + "epoch": 0.16388209862825512, + "grad_norm": 2.778301954269409, + "learning_rate": 9.929462704285864e-06, + "loss": 1.0419, + "step": 2028 + }, + { + "epoch": 0.16396290834158267, + "grad_norm": 3.2158327102661133, + "learning_rate": 9.929353135261085e-06, + "loss": 1.1185, + "step": 2029 + }, + { + "epoch": 0.1640437180549102, + "grad_norm": 2.8992562294006348, + "learning_rate": 9.929243481808357e-06, + "loss": 0.9015, + "step": 2030 + }, + { + "epoch": 0.16412452776823774, + "grad_norm": 3.4684958457946777, + "learning_rate": 9.929133743929554e-06, + "loss": 0.9631, + "step": 2031 + }, + { + "epoch": 0.1642053374815653, + "grad_norm": 2.9608821868896484, + "learning_rate": 9.929023921626555e-06, + "loss": 0.9267, + "step": 2032 + }, + { + "epoch": 0.16428614719489282, + "grad_norm": 3.2251927852630615, + "learning_rate": 9.928914014901245e-06, + "loss": 0.9074, + "step": 2033 + }, + { + "epoch": 0.16436695690822037, + "grad_norm": 2.771134376525879, + "learning_rate": 9.928804023755501e-06, + "loss": 0.9306, + "step": 2034 + }, + { + "epoch": 0.16444776662154792, + "grad_norm": 3.039679765701294, + "learning_rate": 9.928693948191212e-06, + "loss": 1.0715, + "step": 2035 + }, + { + "epoch": 0.16452857633487544, + "grad_norm": 2.6662399768829346, + "learning_rate": 9.92858378821026e-06, + "loss": 1.045, + "step": 2036 + }, + { + "epoch": 0.164609386048203, + "grad_norm": 2.980207681655884, + "learning_rate": 9.928473543814532e-06, + "loss": 1.0713, + "step": 2037 + }, + { + "epoch": 0.16469019576153054, + "grad_norm": 3.3169238567352295, + "learning_rate": 9.928363215005919e-06, + "loss": 0.9875, + "step": 2038 + }, + { + "epoch": 0.16477100547485807, + "grad_norm": 3.3401217460632324, + "learning_rate": 9.928252801786307e-06, + "loss": 1.0844, + "step": 2039 + }, + { + "epoch": 0.16485181518818562, + "grad_norm": 2.9196760654449463, + "learning_rate": 9.928142304157589e-06, + "loss": 0.9477, + "step": 2040 + }, + { + "epoch": 0.16493262490151317, + "grad_norm": 2.6891627311706543, + "learning_rate": 9.928031722121658e-06, + "loss": 0.9829, + "step": 2041 + }, + { + "epoch": 0.1650134346148407, + "grad_norm": 2.8249528408050537, + "learning_rate": 9.927921055680405e-06, + "loss": 0.9991, + "step": 2042 + }, + { + "epoch": 0.16509424432816824, + "grad_norm": 2.618173599243164, + "learning_rate": 9.92781030483573e-06, + "loss": 0.9323, + "step": 2043 + }, + { + "epoch": 0.1651750540414958, + "grad_norm": 3.1562328338623047, + "learning_rate": 9.927699469589528e-06, + "loss": 0.9882, + "step": 2044 + }, + { + "epoch": 0.16525586375482332, + "grad_norm": 2.5101442337036133, + "learning_rate": 9.927588549943697e-06, + "loss": 0.9527, + "step": 2045 + }, + { + "epoch": 0.16533667346815087, + "grad_norm": 2.8681554794311523, + "learning_rate": 9.927477545900136e-06, + "loss": 1.057, + "step": 2046 + }, + { + "epoch": 0.16541748318147842, + "grad_norm": 2.7133429050445557, + "learning_rate": 9.927366457460748e-06, + "loss": 0.9862, + "step": 2047 + }, + { + "epoch": 0.16549829289480594, + "grad_norm": 2.907148838043213, + "learning_rate": 9.927255284627434e-06, + "loss": 0.9719, + "step": 2048 + }, + { + "epoch": 0.1655791026081335, + "grad_norm": 3.191343307495117, + "learning_rate": 9.927144027402097e-06, + "loss": 0.8926, + "step": 2049 + }, + { + "epoch": 0.16565991232146104, + "grad_norm": 2.7720816135406494, + "learning_rate": 9.927032685786647e-06, + "loss": 1.134, + "step": 2050 + }, + { + "epoch": 0.16574072203478857, + "grad_norm": 3.098902702331543, + "learning_rate": 9.926921259782988e-06, + "loss": 0.9979, + "step": 2051 + }, + { + "epoch": 0.16582153174811612, + "grad_norm": 2.7182741165161133, + "learning_rate": 9.926809749393028e-06, + "loss": 0.9128, + "step": 2052 + }, + { + "epoch": 0.16590234146144367, + "grad_norm": 3.1051368713378906, + "learning_rate": 9.926698154618679e-06, + "loss": 1.1124, + "step": 2053 + }, + { + "epoch": 0.16598315117477122, + "grad_norm": 3.2548975944519043, + "learning_rate": 9.92658647546185e-06, + "loss": 1.0029, + "step": 2054 + }, + { + "epoch": 0.16606396088809874, + "grad_norm": 3.4755003452301025, + "learning_rate": 9.926474711924456e-06, + "loss": 1.0163, + "step": 2055 + }, + { + "epoch": 0.1661447706014263, + "grad_norm": 2.8650968074798584, + "learning_rate": 9.92636286400841e-06, + "loss": 1.0742, + "step": 2056 + }, + { + "epoch": 0.16622558031475385, + "grad_norm": 2.8151135444641113, + "learning_rate": 9.926250931715627e-06, + "loss": 0.9316, + "step": 2057 + }, + { + "epoch": 0.16630639002808137, + "grad_norm": 2.9163153171539307, + "learning_rate": 9.926138915048026e-06, + "loss": 0.9452, + "step": 2058 + }, + { + "epoch": 0.16638719974140892, + "grad_norm": 2.589385986328125, + "learning_rate": 9.926026814007525e-06, + "loss": 1.1494, + "step": 2059 + }, + { + "epoch": 0.16646800945473647, + "grad_norm": 3.025043249130249, + "learning_rate": 9.925914628596043e-06, + "loss": 0.9743, + "step": 2060 + }, + { + "epoch": 0.166548819168064, + "grad_norm": 3.18224835395813, + "learning_rate": 9.925802358815502e-06, + "loss": 1.0212, + "step": 2061 + }, + { + "epoch": 0.16662962888139154, + "grad_norm": 2.94707989692688, + "learning_rate": 9.925690004667824e-06, + "loss": 1.0495, + "step": 2062 + }, + { + "epoch": 0.1667104385947191, + "grad_norm": 3.2346737384796143, + "learning_rate": 9.925577566154935e-06, + "loss": 1.031, + "step": 2063 + }, + { + "epoch": 0.16679124830804662, + "grad_norm": 2.933335304260254, + "learning_rate": 9.92546504327876e-06, + "loss": 0.9792, + "step": 2064 + }, + { + "epoch": 0.16687205802137417, + "grad_norm": 2.6725218296051025, + "learning_rate": 9.925352436041226e-06, + "loss": 1.063, + "step": 2065 + }, + { + "epoch": 0.16695286773470172, + "grad_norm": 2.6440131664276123, + "learning_rate": 9.925239744444263e-06, + "loss": 1.0466, + "step": 2066 + }, + { + "epoch": 0.16703367744802924, + "grad_norm": 2.71238374710083, + "learning_rate": 9.925126968489802e-06, + "loss": 0.9376, + "step": 2067 + }, + { + "epoch": 0.1671144871613568, + "grad_norm": 2.830446243286133, + "learning_rate": 9.925014108179769e-06, + "loss": 0.9454, + "step": 2068 + }, + { + "epoch": 0.16719529687468435, + "grad_norm": 2.73677659034729, + "learning_rate": 9.924901163516104e-06, + "loss": 1.1244, + "step": 2069 + }, + { + "epoch": 0.16727610658801187, + "grad_norm": 3.2142064571380615, + "learning_rate": 9.924788134500735e-06, + "loss": 1.0707, + "step": 2070 + }, + { + "epoch": 0.16735691630133942, + "grad_norm": 3.0720906257629395, + "learning_rate": 9.924675021135603e-06, + "loss": 1.159, + "step": 2071 + }, + { + "epoch": 0.16743772601466697, + "grad_norm": 3.0425429344177246, + "learning_rate": 9.924561823422646e-06, + "loss": 1.0203, + "step": 2072 + }, + { + "epoch": 0.1675185357279945, + "grad_norm": 3.232916831970215, + "learning_rate": 9.924448541363797e-06, + "loss": 1.0186, + "step": 2073 + }, + { + "epoch": 0.16759934544132205, + "grad_norm": 3.0700509548187256, + "learning_rate": 9.924335174961e-06, + "loss": 0.9727, + "step": 2074 + }, + { + "epoch": 0.1676801551546496, + "grad_norm": 2.597489833831787, + "learning_rate": 9.924221724216197e-06, + "loss": 0.8967, + "step": 2075 + }, + { + "epoch": 0.16776096486797712, + "grad_norm": 2.4284236431121826, + "learning_rate": 9.924108189131331e-06, + "loss": 0.9514, + "step": 2076 + }, + { + "epoch": 0.16784177458130467, + "grad_norm": 2.5934653282165527, + "learning_rate": 9.923994569708345e-06, + "loss": 1.0835, + "step": 2077 + }, + { + "epoch": 0.16792258429463222, + "grad_norm": 2.8386030197143555, + "learning_rate": 9.923880865949187e-06, + "loss": 0.9402, + "step": 2078 + }, + { + "epoch": 0.16800339400795974, + "grad_norm": 2.592698574066162, + "learning_rate": 9.923767077855802e-06, + "loss": 0.9582, + "step": 2079 + }, + { + "epoch": 0.1680842037212873, + "grad_norm": 3.1584062576293945, + "learning_rate": 9.923653205430141e-06, + "loss": 0.9662, + "step": 2080 + }, + { + "epoch": 0.16816501343461485, + "grad_norm": 2.8627047538757324, + "learning_rate": 9.923539248674154e-06, + "loss": 1.0484, + "step": 2081 + }, + { + "epoch": 0.16824582314794237, + "grad_norm": 2.56826114654541, + "learning_rate": 9.923425207589793e-06, + "loss": 1.1137, + "step": 2082 + }, + { + "epoch": 0.16832663286126992, + "grad_norm": 2.89048433303833, + "learning_rate": 9.923311082179012e-06, + "loss": 0.9849, + "step": 2083 + }, + { + "epoch": 0.16840744257459747, + "grad_norm": 2.874666452407837, + "learning_rate": 9.923196872443764e-06, + "loss": 1.1006, + "step": 2084 + }, + { + "epoch": 0.16848825228792502, + "grad_norm": 3.502878427505493, + "learning_rate": 9.923082578386003e-06, + "loss": 1.1278, + "step": 2085 + }, + { + "epoch": 0.16856906200125255, + "grad_norm": 2.9675581455230713, + "learning_rate": 9.922968200007691e-06, + "loss": 1.0886, + "step": 2086 + }, + { + "epoch": 0.1686498717145801, + "grad_norm": 2.957794427871704, + "learning_rate": 9.922853737310787e-06, + "loss": 0.9379, + "step": 2087 + }, + { + "epoch": 0.16873068142790765, + "grad_norm": 2.7929487228393555, + "learning_rate": 9.922739190297248e-06, + "loss": 0.9304, + "step": 2088 + }, + { + "epoch": 0.16881149114123517, + "grad_norm": 3.1866209506988525, + "learning_rate": 9.922624558969037e-06, + "loss": 0.9009, + "step": 2089 + }, + { + "epoch": 0.16889230085456272, + "grad_norm": 3.1909749507904053, + "learning_rate": 9.922509843328118e-06, + "loss": 1.0454, + "step": 2090 + }, + { + "epoch": 0.16897311056789027, + "grad_norm": 2.9721570014953613, + "learning_rate": 9.922395043376459e-06, + "loss": 0.9353, + "step": 2091 + }, + { + "epoch": 0.1690539202812178, + "grad_norm": 2.662489414215088, + "learning_rate": 9.92228015911602e-06, + "loss": 0.8749, + "step": 2092 + }, + { + "epoch": 0.16913472999454535, + "grad_norm": 2.497319459915161, + "learning_rate": 9.922165190548773e-06, + "loss": 1.0896, + "step": 2093 + }, + { + "epoch": 0.1692155397078729, + "grad_norm": 3.0421512126922607, + "learning_rate": 9.922050137676685e-06, + "loss": 0.9221, + "step": 2094 + }, + { + "epoch": 0.16929634942120042, + "grad_norm": 3.1248340606689453, + "learning_rate": 9.921935000501728e-06, + "loss": 1.049, + "step": 2095 + }, + { + "epoch": 0.16937715913452797, + "grad_norm": 2.7764015197753906, + "learning_rate": 9.921819779025874e-06, + "loss": 1.0277, + "step": 2096 + }, + { + "epoch": 0.16945796884785552, + "grad_norm": 3.0548336505889893, + "learning_rate": 9.921704473251095e-06, + "loss": 1.0134, + "step": 2097 + }, + { + "epoch": 0.16953877856118305, + "grad_norm": 2.8793864250183105, + "learning_rate": 9.921589083179369e-06, + "loss": 1.0039, + "step": 2098 + }, + { + "epoch": 0.1696195882745106, + "grad_norm": 2.764636516571045, + "learning_rate": 9.921473608812669e-06, + "loss": 0.9687, + "step": 2099 + }, + { + "epoch": 0.16970039798783815, + "grad_norm": 2.684819459915161, + "learning_rate": 9.921358050152973e-06, + "loss": 1.0376, + "step": 2100 + }, + { + "epoch": 0.16978120770116567, + "grad_norm": 2.733719825744629, + "learning_rate": 9.921242407202262e-06, + "loss": 0.9721, + "step": 2101 + }, + { + "epoch": 0.16986201741449322, + "grad_norm": 2.589684009552002, + "learning_rate": 9.921126679962515e-06, + "loss": 0.9558, + "step": 2102 + }, + { + "epoch": 0.16994282712782077, + "grad_norm": 2.8476974964141846, + "learning_rate": 9.921010868435716e-06, + "loss": 1.0327, + "step": 2103 + }, + { + "epoch": 0.1700236368411483, + "grad_norm": 3.0370452404022217, + "learning_rate": 9.92089497262385e-06, + "loss": 0.9495, + "step": 2104 + }, + { + "epoch": 0.17010444655447585, + "grad_norm": 2.854360342025757, + "learning_rate": 9.920778992528896e-06, + "loss": 1.087, + "step": 2105 + }, + { + "epoch": 0.1701852562678034, + "grad_norm": 3.350029945373535, + "learning_rate": 9.920662928152846e-06, + "loss": 0.928, + "step": 2106 + }, + { + "epoch": 0.17026606598113092, + "grad_norm": 2.5941176414489746, + "learning_rate": 9.920546779497686e-06, + "loss": 1.0268, + "step": 2107 + }, + { + "epoch": 0.17034687569445847, + "grad_norm": 2.824829339981079, + "learning_rate": 9.920430546565405e-06, + "loss": 1.0249, + "step": 2108 + }, + { + "epoch": 0.17042768540778602, + "grad_norm": 3.5965116024017334, + "learning_rate": 9.920314229357995e-06, + "loss": 1.0276, + "step": 2109 + }, + { + "epoch": 0.17050849512111355, + "grad_norm": 2.5723979473114014, + "learning_rate": 9.920197827877445e-06, + "loss": 1.0343, + "step": 2110 + }, + { + "epoch": 0.1705893048344411, + "grad_norm": 3.0630664825439453, + "learning_rate": 9.920081342125753e-06, + "loss": 0.963, + "step": 2111 + }, + { + "epoch": 0.17067011454776865, + "grad_norm": 2.9649055004119873, + "learning_rate": 9.919964772104912e-06, + "loss": 0.9868, + "step": 2112 + }, + { + "epoch": 0.17075092426109617, + "grad_norm": 2.8002421855926514, + "learning_rate": 9.919848117816919e-06, + "loss": 1.0483, + "step": 2113 + }, + { + "epoch": 0.17083173397442372, + "grad_norm": 3.262807607650757, + "learning_rate": 9.919731379263772e-06, + "loss": 1.1254, + "step": 2114 + }, + { + "epoch": 0.17091254368775127, + "grad_norm": 2.4717869758605957, + "learning_rate": 9.91961455644747e-06, + "loss": 0.9715, + "step": 2115 + }, + { + "epoch": 0.1709933534010788, + "grad_norm": 2.941643476486206, + "learning_rate": 9.919497649370014e-06, + "loss": 0.9739, + "step": 2116 + }, + { + "epoch": 0.17107416311440635, + "grad_norm": 2.670825481414795, + "learning_rate": 9.919380658033405e-06, + "loss": 1.0739, + "step": 2117 + }, + { + "epoch": 0.1711549728277339, + "grad_norm": 3.208672285079956, + "learning_rate": 9.91926358243965e-06, + "loss": 1.0586, + "step": 2118 + }, + { + "epoch": 0.17123578254106145, + "grad_norm": 2.5170207023620605, + "learning_rate": 9.919146422590753e-06, + "loss": 0.9602, + "step": 2119 + }, + { + "epoch": 0.17131659225438897, + "grad_norm": 2.994489908218384, + "learning_rate": 9.91902917848872e-06, + "loss": 1.0003, + "step": 2120 + }, + { + "epoch": 0.17139740196771652, + "grad_norm": 3.4159748554229736, + "learning_rate": 9.918911850135557e-06, + "loss": 1.0409, + "step": 2121 + }, + { + "epoch": 0.17147821168104407, + "grad_norm": 2.351083993911743, + "learning_rate": 9.918794437533279e-06, + "loss": 1.0347, + "step": 2122 + }, + { + "epoch": 0.1715590213943716, + "grad_norm": 2.8775289058685303, + "learning_rate": 9.918676940683891e-06, + "loss": 1.0373, + "step": 2123 + }, + { + "epoch": 0.17163983110769915, + "grad_norm": 2.7405753135681152, + "learning_rate": 9.918559359589411e-06, + "loss": 0.9521, + "step": 2124 + }, + { + "epoch": 0.1717206408210267, + "grad_norm": 2.8221731185913086, + "learning_rate": 9.918441694251848e-06, + "loss": 1.1207, + "step": 2125 + }, + { + "epoch": 0.17180145053435422, + "grad_norm": 2.7699766159057617, + "learning_rate": 9.918323944673221e-06, + "loss": 0.8984, + "step": 2126 + }, + { + "epoch": 0.17188226024768177, + "grad_norm": 3.1191518306732178, + "learning_rate": 9.918206110855543e-06, + "loss": 1.0376, + "step": 2127 + }, + { + "epoch": 0.17196306996100932, + "grad_norm": 2.816222667694092, + "learning_rate": 9.918088192800836e-06, + "loss": 1.0366, + "step": 2128 + }, + { + "epoch": 0.17204387967433685, + "grad_norm": 2.8271803855895996, + "learning_rate": 9.917970190511117e-06, + "loss": 0.9186, + "step": 2129 + }, + { + "epoch": 0.1721246893876644, + "grad_norm": 2.88246488571167, + "learning_rate": 9.91785210398841e-06, + "loss": 0.9728, + "step": 2130 + }, + { + "epoch": 0.17220549910099195, + "grad_norm": 2.6959381103515625, + "learning_rate": 9.917733933234733e-06, + "loss": 0.9794, + "step": 2131 + }, + { + "epoch": 0.17228630881431947, + "grad_norm": 2.781506299972534, + "learning_rate": 9.917615678252114e-06, + "loss": 0.9592, + "step": 2132 + }, + { + "epoch": 0.17236711852764702, + "grad_norm": 3.295612335205078, + "learning_rate": 9.917497339042579e-06, + "loss": 1.0035, + "step": 2133 + }, + { + "epoch": 0.17244792824097457, + "grad_norm": 3.5183417797088623, + "learning_rate": 9.917378915608151e-06, + "loss": 1.1322, + "step": 2134 + }, + { + "epoch": 0.1725287379543021, + "grad_norm": 2.9559192657470703, + "learning_rate": 9.917260407950859e-06, + "loss": 1.0715, + "step": 2135 + }, + { + "epoch": 0.17260954766762965, + "grad_norm": 2.6079885959625244, + "learning_rate": 9.917141816072737e-06, + "loss": 0.9281, + "step": 2136 + }, + { + "epoch": 0.1726903573809572, + "grad_norm": 3.3428955078125, + "learning_rate": 9.917023139975813e-06, + "loss": 0.9539, + "step": 2137 + }, + { + "epoch": 0.17277116709428472, + "grad_norm": 2.4502480030059814, + "learning_rate": 9.916904379662118e-06, + "loss": 0.8888, + "step": 2138 + }, + { + "epoch": 0.17285197680761227, + "grad_norm": 2.9971001148223877, + "learning_rate": 9.916785535133688e-06, + "loss": 1.1042, + "step": 2139 + }, + { + "epoch": 0.17293278652093982, + "grad_norm": 2.7164599895477295, + "learning_rate": 9.91666660639256e-06, + "loss": 1.0137, + "step": 2140 + }, + { + "epoch": 0.17301359623426735, + "grad_norm": 2.8608083724975586, + "learning_rate": 9.916547593440769e-06, + "loss": 1.0781, + "step": 2141 + }, + { + "epoch": 0.1730944059475949, + "grad_norm": 2.838046073913574, + "learning_rate": 9.916428496280353e-06, + "loss": 1.0676, + "step": 2142 + }, + { + "epoch": 0.17317521566092245, + "grad_norm": 2.646768808364868, + "learning_rate": 9.916309314913353e-06, + "loss": 0.9184, + "step": 2143 + }, + { + "epoch": 0.17325602537424997, + "grad_norm": 3.8071377277374268, + "learning_rate": 9.91619004934181e-06, + "loss": 1.0174, + "step": 2144 + }, + { + "epoch": 0.17333683508757752, + "grad_norm": 2.8915224075317383, + "learning_rate": 9.916070699567767e-06, + "loss": 0.9417, + "step": 2145 + }, + { + "epoch": 0.17341764480090507, + "grad_norm": 2.7951903343200684, + "learning_rate": 9.915951265593266e-06, + "loss": 0.9639, + "step": 2146 + }, + { + "epoch": 0.1734984545142326, + "grad_norm": 3.3803536891937256, + "learning_rate": 9.915831747420357e-06, + "loss": 1.0123, + "step": 2147 + }, + { + "epoch": 0.17357926422756015, + "grad_norm": 3.14410662651062, + "learning_rate": 9.915712145051084e-06, + "loss": 1.0791, + "step": 2148 + }, + { + "epoch": 0.1736600739408877, + "grad_norm": 2.457718849182129, + "learning_rate": 9.915592458487495e-06, + "loss": 1.0325, + "step": 2149 + }, + { + "epoch": 0.17374088365421525, + "grad_norm": 2.9668896198272705, + "learning_rate": 9.915472687731642e-06, + "loss": 0.9524, + "step": 2150 + }, + { + "epoch": 0.17382169336754277, + "grad_norm": 2.822756290435791, + "learning_rate": 9.915352832785574e-06, + "loss": 1.0502, + "step": 2151 + }, + { + "epoch": 0.17390250308087032, + "grad_norm": 2.743389129638672, + "learning_rate": 9.915232893651347e-06, + "loss": 1.0771, + "step": 2152 + }, + { + "epoch": 0.17398331279419788, + "grad_norm": 2.876563787460327, + "learning_rate": 9.915112870331012e-06, + "loss": 0.975, + "step": 2153 + }, + { + "epoch": 0.1740641225075254, + "grad_norm": 2.6534552574157715, + "learning_rate": 9.914992762826628e-06, + "loss": 0.9492, + "step": 2154 + }, + { + "epoch": 0.17414493222085295, + "grad_norm": 3.0383739471435547, + "learning_rate": 9.914872571140248e-06, + "loss": 0.9855, + "step": 2155 + }, + { + "epoch": 0.1742257419341805, + "grad_norm": 2.998772144317627, + "learning_rate": 9.914752295273934e-06, + "loss": 0.9629, + "step": 2156 + }, + { + "epoch": 0.17430655164750802, + "grad_norm": 2.6088712215423584, + "learning_rate": 9.914631935229746e-06, + "loss": 1.0546, + "step": 2157 + }, + { + "epoch": 0.17438736136083557, + "grad_norm": 2.937267303466797, + "learning_rate": 9.914511491009744e-06, + "loss": 0.9758, + "step": 2158 + }, + { + "epoch": 0.17446817107416313, + "grad_norm": 2.9102063179016113, + "learning_rate": 9.914390962615992e-06, + "loss": 0.955, + "step": 2159 + }, + { + "epoch": 0.17454898078749065, + "grad_norm": 2.8595845699310303, + "learning_rate": 9.914270350050552e-06, + "loss": 0.9489, + "step": 2160 + }, + { + "epoch": 0.1746297905008182, + "grad_norm": 2.782841205596924, + "learning_rate": 9.914149653315492e-06, + "loss": 1.1525, + "step": 2161 + }, + { + "epoch": 0.17471060021414575, + "grad_norm": 2.7954177856445312, + "learning_rate": 9.914028872412879e-06, + "loss": 0.9337, + "step": 2162 + }, + { + "epoch": 0.17479140992747327, + "grad_norm": 2.7344326972961426, + "learning_rate": 9.91390800734478e-06, + "loss": 0.9292, + "step": 2163 + }, + { + "epoch": 0.17487221964080082, + "grad_norm": 2.9386417865753174, + "learning_rate": 9.91378705811327e-06, + "loss": 0.9851, + "step": 2164 + }, + { + "epoch": 0.17495302935412838, + "grad_norm": 2.8945112228393555, + "learning_rate": 9.913666024720414e-06, + "loss": 1.0128, + "step": 2165 + }, + { + "epoch": 0.1750338390674559, + "grad_norm": 3.150853395462036, + "learning_rate": 9.91354490716829e-06, + "loss": 1.0122, + "step": 2166 + }, + { + "epoch": 0.17511464878078345, + "grad_norm": 2.9394326210021973, + "learning_rate": 9.91342370545897e-06, + "loss": 0.9914, + "step": 2167 + }, + { + "epoch": 0.175195458494111, + "grad_norm": 2.774982213973999, + "learning_rate": 9.91330241959453e-06, + "loss": 1.0258, + "step": 2168 + }, + { + "epoch": 0.17527626820743852, + "grad_norm": 2.781067132949829, + "learning_rate": 9.913181049577048e-06, + "loss": 1.0331, + "step": 2169 + }, + { + "epoch": 0.17535707792076607, + "grad_norm": 2.9024343490600586, + "learning_rate": 9.913059595408603e-06, + "loss": 0.9953, + "step": 2170 + }, + { + "epoch": 0.17543788763409363, + "grad_norm": 3.012073516845703, + "learning_rate": 9.912938057091274e-06, + "loss": 1.0043, + "step": 2171 + }, + { + "epoch": 0.17551869734742115, + "grad_norm": 2.716157913208008, + "learning_rate": 9.912816434627147e-06, + "loss": 1.0252, + "step": 2172 + }, + { + "epoch": 0.1755995070607487, + "grad_norm": 3.0797886848449707, + "learning_rate": 9.912694728018297e-06, + "loss": 0.9114, + "step": 2173 + }, + { + "epoch": 0.17568031677407625, + "grad_norm": 2.655963182449341, + "learning_rate": 9.912572937266816e-06, + "loss": 1.1196, + "step": 2174 + }, + { + "epoch": 0.17576112648740377, + "grad_norm": 3.1143031120300293, + "learning_rate": 9.912451062374786e-06, + "loss": 1.0057, + "step": 2175 + }, + { + "epoch": 0.17584193620073132, + "grad_norm": 2.9792284965515137, + "learning_rate": 9.912329103344295e-06, + "loss": 0.9854, + "step": 2176 + }, + { + "epoch": 0.17592274591405888, + "grad_norm": 2.852971315383911, + "learning_rate": 9.912207060177433e-06, + "loss": 0.9972, + "step": 2177 + }, + { + "epoch": 0.1760035556273864, + "grad_norm": 2.5394182205200195, + "learning_rate": 9.91208493287629e-06, + "loss": 0.9824, + "step": 2178 + }, + { + "epoch": 0.17608436534071395, + "grad_norm": 2.984800338745117, + "learning_rate": 9.911962721442957e-06, + "loss": 1.1021, + "step": 2179 + }, + { + "epoch": 0.1761651750540415, + "grad_norm": 2.647099733352661, + "learning_rate": 9.91184042587953e-06, + "loss": 0.93, + "step": 2180 + }, + { + "epoch": 0.17624598476736905, + "grad_norm": 2.4311182498931885, + "learning_rate": 9.911718046188096e-06, + "loss": 1.074, + "step": 2181 + }, + { + "epoch": 0.17632679448069657, + "grad_norm": 2.8348121643066406, + "learning_rate": 9.91159558237076e-06, + "loss": 0.9774, + "step": 2182 + }, + { + "epoch": 0.17640760419402413, + "grad_norm": 2.905052423477173, + "learning_rate": 9.911473034429617e-06, + "loss": 0.9155, + "step": 2183 + }, + { + "epoch": 0.17648841390735168, + "grad_norm": 2.745105504989624, + "learning_rate": 9.911350402366764e-06, + "loss": 0.9996, + "step": 2184 + }, + { + "epoch": 0.1765692236206792, + "grad_norm": 3.183624267578125, + "learning_rate": 9.911227686184299e-06, + "loss": 0.927, + "step": 2185 + }, + { + "epoch": 0.17665003333400675, + "grad_norm": 2.996196746826172, + "learning_rate": 9.911104885884331e-06, + "loss": 0.9375, + "step": 2186 + }, + { + "epoch": 0.1767308430473343, + "grad_norm": 3.10860276222229, + "learning_rate": 9.910982001468958e-06, + "loss": 1.0341, + "step": 2187 + }, + { + "epoch": 0.17681165276066182, + "grad_norm": 3.0041086673736572, + "learning_rate": 9.910859032940286e-06, + "loss": 1.028, + "step": 2188 + }, + { + "epoch": 0.17689246247398938, + "grad_norm": 3.4713857173919678, + "learning_rate": 9.91073598030042e-06, + "loss": 1.0096, + "step": 2189 + }, + { + "epoch": 0.17697327218731693, + "grad_norm": 3.2745108604431152, + "learning_rate": 9.91061284355147e-06, + "loss": 1.0437, + "step": 2190 + }, + { + "epoch": 0.17705408190064445, + "grad_norm": 3.0695979595184326, + "learning_rate": 9.910489622695542e-06, + "loss": 0.9703, + "step": 2191 + }, + { + "epoch": 0.177134891613972, + "grad_norm": 3.1220107078552246, + "learning_rate": 9.910366317734752e-06, + "loss": 1.1641, + "step": 2192 + }, + { + "epoch": 0.17721570132729955, + "grad_norm": 2.478935956954956, + "learning_rate": 9.910242928671206e-06, + "loss": 0.9982, + "step": 2193 + }, + { + "epoch": 0.17729651104062708, + "grad_norm": 2.8023176193237305, + "learning_rate": 9.91011945550702e-06, + "loss": 1.08, + "step": 2194 + }, + { + "epoch": 0.17737732075395463, + "grad_norm": 2.962545394897461, + "learning_rate": 9.909995898244306e-06, + "loss": 1.052, + "step": 2195 + }, + { + "epoch": 0.17745813046728218, + "grad_norm": 3.106247663497925, + "learning_rate": 9.909872256885184e-06, + "loss": 0.9073, + "step": 2196 + }, + { + "epoch": 0.1775389401806097, + "grad_norm": 2.672226667404175, + "learning_rate": 9.909748531431772e-06, + "loss": 0.9634, + "step": 2197 + }, + { + "epoch": 0.17761974989393725, + "grad_norm": 2.9921627044677734, + "learning_rate": 9.909624721886186e-06, + "loss": 0.8934, + "step": 2198 + }, + { + "epoch": 0.1777005596072648, + "grad_norm": 2.4521498680114746, + "learning_rate": 9.90950082825055e-06, + "loss": 1.0277, + "step": 2199 + }, + { + "epoch": 0.17778136932059233, + "grad_norm": 2.8029232025146484, + "learning_rate": 9.90937685052698e-06, + "loss": 0.8953, + "step": 2200 + }, + { + "epoch": 0.17786217903391988, + "grad_norm": 2.7503063678741455, + "learning_rate": 9.909252788717606e-06, + "loss": 1.0325, + "step": 2201 + }, + { + "epoch": 0.17794298874724743, + "grad_norm": 3.0869476795196533, + "learning_rate": 9.90912864282455e-06, + "loss": 0.9307, + "step": 2202 + }, + { + "epoch": 0.17802379846057495, + "grad_norm": 2.852287530899048, + "learning_rate": 9.909004412849939e-06, + "loss": 0.9603, + "step": 2203 + }, + { + "epoch": 0.1781046081739025, + "grad_norm": 2.903380870819092, + "learning_rate": 9.908880098795898e-06, + "loss": 1.0392, + "step": 2204 + }, + { + "epoch": 0.17818541788723005, + "grad_norm": 2.9859073162078857, + "learning_rate": 9.90875570066456e-06, + "loss": 1.0383, + "step": 2205 + }, + { + "epoch": 0.17826622760055758, + "grad_norm": 2.5537900924682617, + "learning_rate": 9.908631218458056e-06, + "loss": 1.1067, + "step": 2206 + }, + { + "epoch": 0.17834703731388513, + "grad_norm": 2.6442184448242188, + "learning_rate": 9.908506652178513e-06, + "loss": 1.0278, + "step": 2207 + }, + { + "epoch": 0.17842784702721268, + "grad_norm": 2.952118158340454, + "learning_rate": 9.90838200182807e-06, + "loss": 0.9521, + "step": 2208 + }, + { + "epoch": 0.1785086567405402, + "grad_norm": 2.5400750637054443, + "learning_rate": 9.908257267408861e-06, + "loss": 0.9087, + "step": 2209 + }, + { + "epoch": 0.17858946645386775, + "grad_norm": 2.7366065979003906, + "learning_rate": 9.908132448923019e-06, + "loss": 0.9954, + "step": 2210 + }, + { + "epoch": 0.1786702761671953, + "grad_norm": 2.8425841331481934, + "learning_rate": 9.908007546372685e-06, + "loss": 0.9866, + "step": 2211 + }, + { + "epoch": 0.17875108588052283, + "grad_norm": 3.0371780395507812, + "learning_rate": 9.907882559759996e-06, + "loss": 0.9368, + "step": 2212 + }, + { + "epoch": 0.17883189559385038, + "grad_norm": 2.5463290214538574, + "learning_rate": 9.907757489087094e-06, + "loss": 1.0368, + "step": 2213 + }, + { + "epoch": 0.17891270530717793, + "grad_norm": 2.4215946197509766, + "learning_rate": 9.907632334356123e-06, + "loss": 1.0487, + "step": 2214 + }, + { + "epoch": 0.17899351502050548, + "grad_norm": 2.9008519649505615, + "learning_rate": 9.907507095569222e-06, + "loss": 1.0081, + "step": 2215 + }, + { + "epoch": 0.179074324733833, + "grad_norm": 2.6641945838928223, + "learning_rate": 9.907381772728541e-06, + "loss": 1.0243, + "step": 2216 + }, + { + "epoch": 0.17915513444716055, + "grad_norm": 2.735846996307373, + "learning_rate": 9.907256365836224e-06, + "loss": 0.9198, + "step": 2217 + }, + { + "epoch": 0.1792359441604881, + "grad_norm": 3.134108066558838, + "learning_rate": 9.90713087489442e-06, + "loss": 1.0603, + "step": 2218 + }, + { + "epoch": 0.17931675387381563, + "grad_norm": 2.7960338592529297, + "learning_rate": 9.907005299905275e-06, + "loss": 0.8058, + "step": 2219 + }, + { + "epoch": 0.17939756358714318, + "grad_norm": 3.0357117652893066, + "learning_rate": 9.906879640870943e-06, + "loss": 1.0705, + "step": 2220 + }, + { + "epoch": 0.17947837330047073, + "grad_norm": 2.984736442565918, + "learning_rate": 9.906753897793578e-06, + "loss": 1.009, + "step": 2221 + }, + { + "epoch": 0.17955918301379825, + "grad_norm": 2.8583996295928955, + "learning_rate": 9.906628070675329e-06, + "loss": 1.0643, + "step": 2222 + }, + { + "epoch": 0.1796399927271258, + "grad_norm": 3.1405558586120605, + "learning_rate": 9.906502159518353e-06, + "loss": 1.118, + "step": 2223 + }, + { + "epoch": 0.17972080244045335, + "grad_norm": 2.5613033771514893, + "learning_rate": 9.906376164324808e-06, + "loss": 1.0275, + "step": 2224 + }, + { + "epoch": 0.17980161215378088, + "grad_norm": 3.179513454437256, + "learning_rate": 9.906250085096852e-06, + "loss": 1.1959, + "step": 2225 + }, + { + "epoch": 0.17988242186710843, + "grad_norm": 3.293401002883911, + "learning_rate": 9.90612392183664e-06, + "loss": 1.0808, + "step": 2226 + }, + { + "epoch": 0.17996323158043598, + "grad_norm": 2.9560415744781494, + "learning_rate": 9.90599767454634e-06, + "loss": 0.9355, + "step": 2227 + }, + { + "epoch": 0.1800440412937635, + "grad_norm": 3.093355655670166, + "learning_rate": 9.905871343228108e-06, + "loss": 0.9234, + "step": 2228 + }, + { + "epoch": 0.18012485100709105, + "grad_norm": 3.24405574798584, + "learning_rate": 9.905744927884112e-06, + "loss": 0.991, + "step": 2229 + }, + { + "epoch": 0.1802056607204186, + "grad_norm": 2.381044864654541, + "learning_rate": 9.905618428516514e-06, + "loss": 1.0116, + "step": 2230 + }, + { + "epoch": 0.18028647043374613, + "grad_norm": 2.3708693981170654, + "learning_rate": 9.905491845127485e-06, + "loss": 1.156, + "step": 2231 + }, + { + "epoch": 0.18036728014707368, + "grad_norm": 2.8890292644500732, + "learning_rate": 9.905365177719187e-06, + "loss": 0.9778, + "step": 2232 + }, + { + "epoch": 0.18044808986040123, + "grad_norm": 2.637432098388672, + "learning_rate": 9.905238426293793e-06, + "loss": 0.9777, + "step": 2233 + }, + { + "epoch": 0.18052889957372875, + "grad_norm": 2.81315016746521, + "learning_rate": 9.905111590853475e-06, + "loss": 1.0677, + "step": 2234 + }, + { + "epoch": 0.1806097092870563, + "grad_norm": 2.9049456119537354, + "learning_rate": 9.904984671400403e-06, + "loss": 1.0104, + "step": 2235 + }, + { + "epoch": 0.18069051900038385, + "grad_norm": 2.824903964996338, + "learning_rate": 9.904857667936753e-06, + "loss": 1.0085, + "step": 2236 + }, + { + "epoch": 0.18077132871371138, + "grad_norm": 2.541304588317871, + "learning_rate": 9.904730580464698e-06, + "loss": 1.0775, + "step": 2237 + }, + { + "epoch": 0.18085213842703893, + "grad_norm": 3.0248048305511475, + "learning_rate": 9.904603408986416e-06, + "loss": 0.946, + "step": 2238 + }, + { + "epoch": 0.18093294814036648, + "grad_norm": 2.661759853363037, + "learning_rate": 9.904476153504085e-06, + "loss": 1.1053, + "step": 2239 + }, + { + "epoch": 0.181013757853694, + "grad_norm": 2.5051047801971436, + "learning_rate": 9.904348814019885e-06, + "loss": 0.9926, + "step": 2240 + }, + { + "epoch": 0.18109456756702155, + "grad_norm": 2.8634889125823975, + "learning_rate": 9.904221390535996e-06, + "loss": 0.9587, + "step": 2241 + }, + { + "epoch": 0.1811753772803491, + "grad_norm": 2.776017904281616, + "learning_rate": 9.904093883054602e-06, + "loss": 0.9272, + "step": 2242 + }, + { + "epoch": 0.18125618699367663, + "grad_norm": 2.577670097351074, + "learning_rate": 9.903966291577884e-06, + "loss": 0.9992, + "step": 2243 + }, + { + "epoch": 0.18133699670700418, + "grad_norm": 2.838353395462036, + "learning_rate": 9.90383861610803e-06, + "loss": 1.012, + "step": 2244 + }, + { + "epoch": 0.18141780642033173, + "grad_norm": 2.815898895263672, + "learning_rate": 9.903710856647227e-06, + "loss": 1.0239, + "step": 2245 + }, + { + "epoch": 0.18149861613365928, + "grad_norm": 2.680938243865967, + "learning_rate": 9.903583013197662e-06, + "loss": 1.0676, + "step": 2246 + }, + { + "epoch": 0.1815794258469868, + "grad_norm": 2.8625309467315674, + "learning_rate": 9.903455085761525e-06, + "loss": 1.0337, + "step": 2247 + }, + { + "epoch": 0.18166023556031435, + "grad_norm": 2.6191186904907227, + "learning_rate": 9.903327074341006e-06, + "loss": 1.0447, + "step": 2248 + }, + { + "epoch": 0.1817410452736419, + "grad_norm": 2.5081663131713867, + "learning_rate": 9.9031989789383e-06, + "loss": 1.0414, + "step": 2249 + }, + { + "epoch": 0.18182185498696943, + "grad_norm": 3.1408021450042725, + "learning_rate": 9.903070799555598e-06, + "loss": 1.0195, + "step": 2250 + }, + { + "epoch": 0.18190266470029698, + "grad_norm": 2.689897298812866, + "learning_rate": 9.902942536195098e-06, + "loss": 1.0318, + "step": 2251 + }, + { + "epoch": 0.18198347441362453, + "grad_norm": 3.066641092300415, + "learning_rate": 9.902814188858994e-06, + "loss": 1.0626, + "step": 2252 + }, + { + "epoch": 0.18206428412695205, + "grad_norm": 3.338888645172119, + "learning_rate": 9.902685757549486e-06, + "loss": 0.9944, + "step": 2253 + }, + { + "epoch": 0.1821450938402796, + "grad_norm": 2.9678893089294434, + "learning_rate": 9.902557242268775e-06, + "loss": 1.0364, + "step": 2254 + }, + { + "epoch": 0.18222590355360715, + "grad_norm": 3.2926056385040283, + "learning_rate": 9.90242864301906e-06, + "loss": 1.0177, + "step": 2255 + }, + { + "epoch": 0.18230671326693468, + "grad_norm": 2.818373680114746, + "learning_rate": 9.902299959802546e-06, + "loss": 1.2125, + "step": 2256 + }, + { + "epoch": 0.18238752298026223, + "grad_norm": 2.7557077407836914, + "learning_rate": 9.902171192621435e-06, + "loss": 0.9113, + "step": 2257 + }, + { + "epoch": 0.18246833269358978, + "grad_norm": 2.6193900108337402, + "learning_rate": 9.902042341477932e-06, + "loss": 1.1363, + "step": 2258 + }, + { + "epoch": 0.1825491424069173, + "grad_norm": 2.5538594722747803, + "learning_rate": 9.901913406374246e-06, + "loss": 1.0446, + "step": 2259 + }, + { + "epoch": 0.18262995212024485, + "grad_norm": 2.68630051612854, + "learning_rate": 9.901784387312583e-06, + "loss": 1.0794, + "step": 2260 + }, + { + "epoch": 0.1827107618335724, + "grad_norm": 2.8566384315490723, + "learning_rate": 9.901655284295156e-06, + "loss": 1.0573, + "step": 2261 + }, + { + "epoch": 0.18279157154689993, + "grad_norm": 2.828277111053467, + "learning_rate": 9.901526097324171e-06, + "loss": 0.9004, + "step": 2262 + }, + { + "epoch": 0.18287238126022748, + "grad_norm": 2.6929445266723633, + "learning_rate": 9.901396826401846e-06, + "loss": 1.0182, + "step": 2263 + }, + { + "epoch": 0.18295319097355503, + "grad_norm": 2.9283299446105957, + "learning_rate": 9.901267471530393e-06, + "loss": 0.9546, + "step": 2264 + }, + { + "epoch": 0.18303400068688255, + "grad_norm": 2.531330108642578, + "learning_rate": 9.901138032712028e-06, + "loss": 0.958, + "step": 2265 + }, + { + "epoch": 0.1831148104002101, + "grad_norm": 2.7807202339172363, + "learning_rate": 9.901008509948967e-06, + "loss": 1.0124, + "step": 2266 + }, + { + "epoch": 0.18319562011353765, + "grad_norm": 2.8674800395965576, + "learning_rate": 9.900878903243428e-06, + "loss": 1.034, + "step": 2267 + }, + { + "epoch": 0.18327642982686518, + "grad_norm": 2.8897581100463867, + "learning_rate": 9.900749212597632e-06, + "loss": 0.9783, + "step": 2268 + }, + { + "epoch": 0.18335723954019273, + "grad_norm": 2.7262604236602783, + "learning_rate": 9.900619438013802e-06, + "loss": 0.971, + "step": 2269 + }, + { + "epoch": 0.18343804925352028, + "grad_norm": 3.7016351222991943, + "learning_rate": 9.900489579494156e-06, + "loss": 0.9499, + "step": 2270 + }, + { + "epoch": 0.1835188589668478, + "grad_norm": 2.412543535232544, + "learning_rate": 9.900359637040922e-06, + "loss": 1.0185, + "step": 2271 + }, + { + "epoch": 0.18359966868017535, + "grad_norm": 2.899360179901123, + "learning_rate": 9.900229610656324e-06, + "loss": 0.9437, + "step": 2272 + }, + { + "epoch": 0.1836804783935029, + "grad_norm": 2.7710652351379395, + "learning_rate": 9.90009950034259e-06, + "loss": 0.9944, + "step": 2273 + }, + { + "epoch": 0.18376128810683043, + "grad_norm": 3.9818215370178223, + "learning_rate": 9.89996930610195e-06, + "loss": 1.0571, + "step": 2274 + }, + { + "epoch": 0.18384209782015798, + "grad_norm": 4.321252822875977, + "learning_rate": 9.89983902793663e-06, + "loss": 0.9093, + "step": 2275 + }, + { + "epoch": 0.18392290753348553, + "grad_norm": 2.7052674293518066, + "learning_rate": 9.899708665848864e-06, + "loss": 0.9636, + "step": 2276 + }, + { + "epoch": 0.18400371724681305, + "grad_norm": 2.865767240524292, + "learning_rate": 9.899578219840883e-06, + "loss": 1.0194, + "step": 2277 + }, + { + "epoch": 0.1840845269601406, + "grad_norm": 3.1455867290496826, + "learning_rate": 9.899447689914924e-06, + "loss": 1.0515, + "step": 2278 + }, + { + "epoch": 0.18416533667346816, + "grad_norm": 2.658369302749634, + "learning_rate": 9.89931707607322e-06, + "loss": 1.0933, + "step": 2279 + }, + { + "epoch": 0.1842461463867957, + "grad_norm": 3.026167392730713, + "learning_rate": 9.899186378318008e-06, + "loss": 0.9614, + "step": 2280 + }, + { + "epoch": 0.18432695610012323, + "grad_norm": 3.9894533157348633, + "learning_rate": 9.89905559665153e-06, + "loss": 0.9487, + "step": 2281 + }, + { + "epoch": 0.18440776581345078, + "grad_norm": 3.4070656299591064, + "learning_rate": 9.898924731076022e-06, + "loss": 0.9747, + "step": 2282 + }, + { + "epoch": 0.18448857552677833, + "grad_norm": 2.44861102104187, + "learning_rate": 9.89879378159373e-06, + "loss": 0.9783, + "step": 2283 + }, + { + "epoch": 0.18456938524010585, + "grad_norm": 2.5936949253082275, + "learning_rate": 9.898662748206891e-06, + "loss": 0.9835, + "step": 2284 + }, + { + "epoch": 0.1846501949534334, + "grad_norm": 2.9348199367523193, + "learning_rate": 9.89853163091775e-06, + "loss": 0.957, + "step": 2285 + }, + { + "epoch": 0.18473100466676096, + "grad_norm": 2.6662724018096924, + "learning_rate": 9.898400429728559e-06, + "loss": 0.9759, + "step": 2286 + }, + { + "epoch": 0.18481181438008848, + "grad_norm": 3.1823620796203613, + "learning_rate": 9.89826914464156e-06, + "loss": 1.082, + "step": 2287 + }, + { + "epoch": 0.18489262409341603, + "grad_norm": 2.6004109382629395, + "learning_rate": 9.898137775659002e-06, + "loss": 1.0556, + "step": 2288 + }, + { + "epoch": 0.18497343380674358, + "grad_norm": 2.8404204845428467, + "learning_rate": 9.898006322783137e-06, + "loss": 0.9183, + "step": 2289 + }, + { + "epoch": 0.1850542435200711, + "grad_norm": 2.837385892868042, + "learning_rate": 9.897874786016213e-06, + "loss": 0.9642, + "step": 2290 + }, + { + "epoch": 0.18513505323339866, + "grad_norm": 2.9858784675598145, + "learning_rate": 9.897743165360487e-06, + "loss": 1.1122, + "step": 2291 + }, + { + "epoch": 0.1852158629467262, + "grad_norm": 2.650075912475586, + "learning_rate": 9.89761146081821e-06, + "loss": 1.142, + "step": 2292 + }, + { + "epoch": 0.18529667266005373, + "grad_norm": 3.637911796569824, + "learning_rate": 9.89747967239164e-06, + "loss": 1.0368, + "step": 2293 + }, + { + "epoch": 0.18537748237338128, + "grad_norm": 2.865354537963867, + "learning_rate": 9.897347800083034e-06, + "loss": 1.0131, + "step": 2294 + }, + { + "epoch": 0.18545829208670883, + "grad_norm": 2.903250217437744, + "learning_rate": 9.89721584389465e-06, + "loss": 0.9644, + "step": 2295 + }, + { + "epoch": 0.18553910180003635, + "grad_norm": 2.552626848220825, + "learning_rate": 9.897083803828747e-06, + "loss": 1.0422, + "step": 2296 + }, + { + "epoch": 0.1856199115133639, + "grad_norm": 2.6443898677825928, + "learning_rate": 9.896951679887588e-06, + "loss": 1.0194, + "step": 2297 + }, + { + "epoch": 0.18570072122669146, + "grad_norm": 2.6982264518737793, + "learning_rate": 9.896819472073435e-06, + "loss": 0.9968, + "step": 2298 + }, + { + "epoch": 0.18578153094001898, + "grad_norm": 3.444505453109741, + "learning_rate": 9.896687180388555e-06, + "loss": 0.9242, + "step": 2299 + }, + { + "epoch": 0.18586234065334653, + "grad_norm": 2.814103364944458, + "learning_rate": 9.89655480483521e-06, + "loss": 1.0171, + "step": 2300 + }, + { + "epoch": 0.18594315036667408, + "grad_norm": 2.8541147708892822, + "learning_rate": 9.896422345415671e-06, + "loss": 1.121, + "step": 2301 + }, + { + "epoch": 0.1860239600800016, + "grad_norm": 2.724806070327759, + "learning_rate": 9.896289802132204e-06, + "loss": 1.0631, + "step": 2302 + }, + { + "epoch": 0.18610476979332916, + "grad_norm": 2.483346700668335, + "learning_rate": 9.896157174987079e-06, + "loss": 1.0567, + "step": 2303 + }, + { + "epoch": 0.1861855795066567, + "grad_norm": 2.560619592666626, + "learning_rate": 9.89602446398257e-06, + "loss": 1.0899, + "step": 2304 + }, + { + "epoch": 0.18626638921998423, + "grad_norm": 2.711629629135132, + "learning_rate": 9.895891669120948e-06, + "loss": 0.9556, + "step": 2305 + }, + { + "epoch": 0.18634719893331178, + "grad_norm": 3.1584885120391846, + "learning_rate": 9.895758790404488e-06, + "loss": 1.0342, + "step": 2306 + }, + { + "epoch": 0.18642800864663933, + "grad_norm": 2.9955644607543945, + "learning_rate": 9.895625827835466e-06, + "loss": 0.9627, + "step": 2307 + }, + { + "epoch": 0.18650881835996685, + "grad_norm": 2.802232503890991, + "learning_rate": 9.89549278141616e-06, + "loss": 1.0367, + "step": 2308 + }, + { + "epoch": 0.1865896280732944, + "grad_norm": 2.8121204376220703, + "learning_rate": 9.895359651148848e-06, + "loss": 1.0337, + "step": 2309 + }, + { + "epoch": 0.18667043778662196, + "grad_norm": 2.481471538543701, + "learning_rate": 9.895226437035808e-06, + "loss": 0.9112, + "step": 2310 + }, + { + "epoch": 0.1867512474999495, + "grad_norm": 2.9287850856781006, + "learning_rate": 9.895093139079326e-06, + "loss": 0.9825, + "step": 2311 + }, + { + "epoch": 0.18683205721327703, + "grad_norm": 2.7301418781280518, + "learning_rate": 9.894959757281684e-06, + "loss": 1.0598, + "step": 2312 + }, + { + "epoch": 0.18691286692660458, + "grad_norm": 2.687197685241699, + "learning_rate": 9.894826291645163e-06, + "loss": 0.9043, + "step": 2313 + }, + { + "epoch": 0.18699367663993213, + "grad_norm": 2.7087533473968506, + "learning_rate": 9.894692742172052e-06, + "loss": 1.0148, + "step": 2314 + }, + { + "epoch": 0.18707448635325966, + "grad_norm": 2.8972854614257812, + "learning_rate": 9.89455910886464e-06, + "loss": 1.0323, + "step": 2315 + }, + { + "epoch": 0.1871552960665872, + "grad_norm": 2.723487138748169, + "learning_rate": 9.894425391725211e-06, + "loss": 0.9708, + "step": 2316 + }, + { + "epoch": 0.18723610577991476, + "grad_norm": 2.984046220779419, + "learning_rate": 9.89429159075606e-06, + "loss": 1.0942, + "step": 2317 + }, + { + "epoch": 0.18731691549324228, + "grad_norm": 2.903895854949951, + "learning_rate": 9.894157705959474e-06, + "loss": 1.0002, + "step": 2318 + }, + { + "epoch": 0.18739772520656983, + "grad_norm": 3.094198703765869, + "learning_rate": 9.89402373733775e-06, + "loss": 0.9538, + "step": 2319 + }, + { + "epoch": 0.18747853491989738, + "grad_norm": 2.7821521759033203, + "learning_rate": 9.893889684893182e-06, + "loss": 0.97, + "step": 2320 + }, + { + "epoch": 0.1875593446332249, + "grad_norm": 3.355893135070801, + "learning_rate": 9.893755548628065e-06, + "loss": 0.9173, + "step": 2321 + }, + { + "epoch": 0.18764015434655246, + "grad_norm": 2.6395037174224854, + "learning_rate": 9.893621328544697e-06, + "loss": 0.8974, + "step": 2322 + }, + { + "epoch": 0.18772096405988, + "grad_norm": 2.7599050998687744, + "learning_rate": 9.893487024645376e-06, + "loss": 1.1345, + "step": 2323 + }, + { + "epoch": 0.18780177377320753, + "grad_norm": 3.1787357330322266, + "learning_rate": 9.893352636932403e-06, + "loss": 1.05, + "step": 2324 + }, + { + "epoch": 0.18788258348653508, + "grad_norm": 2.8817572593688965, + "learning_rate": 9.89321816540808e-06, + "loss": 0.9488, + "step": 2325 + }, + { + "epoch": 0.18796339319986263, + "grad_norm": 2.432938814163208, + "learning_rate": 9.893083610074708e-06, + "loss": 0.9587, + "step": 2326 + }, + { + "epoch": 0.18804420291319016, + "grad_norm": 2.720282793045044, + "learning_rate": 9.892948970934595e-06, + "loss": 1.1109, + "step": 2327 + }, + { + "epoch": 0.1881250126265177, + "grad_norm": 2.891789436340332, + "learning_rate": 9.892814247990045e-06, + "loss": 0.976, + "step": 2328 + }, + { + "epoch": 0.18820582233984526, + "grad_norm": 2.7646126747131348, + "learning_rate": 9.892679441243367e-06, + "loss": 1.0885, + "step": 2329 + }, + { + "epoch": 0.18828663205317278, + "grad_norm": 2.6306145191192627, + "learning_rate": 9.892544550696867e-06, + "loss": 1.1034, + "step": 2330 + }, + { + "epoch": 0.18836744176650033, + "grad_norm": 2.8713583946228027, + "learning_rate": 9.892409576352859e-06, + "loss": 1.0409, + "step": 2331 + }, + { + "epoch": 0.18844825147982788, + "grad_norm": 3.1042556762695312, + "learning_rate": 9.892274518213652e-06, + "loss": 1.0268, + "step": 2332 + }, + { + "epoch": 0.1885290611931554, + "grad_norm": 2.764892578125, + "learning_rate": 9.892139376281559e-06, + "loss": 0.9407, + "step": 2333 + }, + { + "epoch": 0.18860987090648296, + "grad_norm": 2.5440847873687744, + "learning_rate": 9.892004150558897e-06, + "loss": 1.0254, + "step": 2334 + }, + { + "epoch": 0.1886906806198105, + "grad_norm": 2.648404121398926, + "learning_rate": 9.89186884104798e-06, + "loss": 1.0697, + "step": 2335 + }, + { + "epoch": 0.18877149033313803, + "grad_norm": 2.939295530319214, + "learning_rate": 9.891733447751129e-06, + "loss": 0.9549, + "step": 2336 + }, + { + "epoch": 0.18885230004646558, + "grad_norm": 2.9315342903137207, + "learning_rate": 9.891597970670657e-06, + "loss": 0.8858, + "step": 2337 + }, + { + "epoch": 0.18893310975979313, + "grad_norm": 2.5525410175323486, + "learning_rate": 9.89146240980889e-06, + "loss": 0.9254, + "step": 2338 + }, + { + "epoch": 0.18901391947312066, + "grad_norm": 2.603762626647949, + "learning_rate": 9.891326765168147e-06, + "loss": 1.0947, + "step": 2339 + }, + { + "epoch": 0.1890947291864482, + "grad_norm": 2.6096351146698, + "learning_rate": 9.891191036750752e-06, + "loss": 0.9084, + "step": 2340 + }, + { + "epoch": 0.18917553889977576, + "grad_norm": 2.9336111545562744, + "learning_rate": 9.89105522455903e-06, + "loss": 0.9864, + "step": 2341 + }, + { + "epoch": 0.18925634861310328, + "grad_norm": 2.792168140411377, + "learning_rate": 9.890919328595306e-06, + "loss": 0.9951, + "step": 2342 + }, + { + "epoch": 0.18933715832643083, + "grad_norm": 2.763575792312622, + "learning_rate": 9.890783348861909e-06, + "loss": 1.0332, + "step": 2343 + }, + { + "epoch": 0.18941796803975838, + "grad_norm": 2.647984027862549, + "learning_rate": 9.890647285361166e-06, + "loss": 1.0565, + "step": 2344 + }, + { + "epoch": 0.18949877775308593, + "grad_norm": 2.5773236751556396, + "learning_rate": 9.890511138095411e-06, + "loss": 1.0024, + "step": 2345 + }, + { + "epoch": 0.18957958746641346, + "grad_norm": 3.2868947982788086, + "learning_rate": 9.890374907066971e-06, + "loss": 1.0944, + "step": 2346 + }, + { + "epoch": 0.189660397179741, + "grad_norm": 3.268430709838867, + "learning_rate": 9.890238592278184e-06, + "loss": 1.0724, + "step": 2347 + }, + { + "epoch": 0.18974120689306856, + "grad_norm": 3.127488136291504, + "learning_rate": 9.890102193731381e-06, + "loss": 1.0143, + "step": 2348 + }, + { + "epoch": 0.18982201660639608, + "grad_norm": 2.9193356037139893, + "learning_rate": 9.889965711428901e-06, + "loss": 0.9556, + "step": 2349 + }, + { + "epoch": 0.18990282631972363, + "grad_norm": 3.0478930473327637, + "learning_rate": 9.88982914537308e-06, + "loss": 1.0065, + "step": 2350 + }, + { + "epoch": 0.18998363603305118, + "grad_norm": 2.723362922668457, + "learning_rate": 9.88969249556626e-06, + "loss": 0.9885, + "step": 2351 + }, + { + "epoch": 0.1900644457463787, + "grad_norm": 2.417820930480957, + "learning_rate": 9.889555762010776e-06, + "loss": 0.9351, + "step": 2352 + }, + { + "epoch": 0.19014525545970626, + "grad_norm": 2.9843337535858154, + "learning_rate": 9.889418944708973e-06, + "loss": 0.999, + "step": 2353 + }, + { + "epoch": 0.1902260651730338, + "grad_norm": 3.093397855758667, + "learning_rate": 9.889282043663196e-06, + "loss": 1.0169, + "step": 2354 + }, + { + "epoch": 0.19030687488636133, + "grad_norm": 2.4472341537475586, + "learning_rate": 9.889145058875786e-06, + "loss": 0.8608, + "step": 2355 + }, + { + "epoch": 0.19038768459968888, + "grad_norm": 3.399470806121826, + "learning_rate": 9.889007990349093e-06, + "loss": 1.1322, + "step": 2356 + }, + { + "epoch": 0.19046849431301643, + "grad_norm": 2.760270833969116, + "learning_rate": 9.888870838085463e-06, + "loss": 1.0778, + "step": 2357 + }, + { + "epoch": 0.19054930402634396, + "grad_norm": 2.8980255126953125, + "learning_rate": 9.888733602087244e-06, + "loss": 1.0017, + "step": 2358 + }, + { + "epoch": 0.1906301137396715, + "grad_norm": 3.068690776824951, + "learning_rate": 9.888596282356788e-06, + "loss": 1.057, + "step": 2359 + }, + { + "epoch": 0.19071092345299906, + "grad_norm": 2.7030603885650635, + "learning_rate": 9.888458878896445e-06, + "loss": 1.0547, + "step": 2360 + }, + { + "epoch": 0.19079173316632658, + "grad_norm": 2.6108741760253906, + "learning_rate": 9.888321391708571e-06, + "loss": 0.9859, + "step": 2361 + }, + { + "epoch": 0.19087254287965413, + "grad_norm": 2.7236034870147705, + "learning_rate": 9.88818382079552e-06, + "loss": 1.0942, + "step": 2362 + }, + { + "epoch": 0.19095335259298168, + "grad_norm": 3.056166410446167, + "learning_rate": 9.888046166159648e-06, + "loss": 1.0482, + "step": 2363 + }, + { + "epoch": 0.1910341623063092, + "grad_norm": 3.031191349029541, + "learning_rate": 9.887908427803313e-06, + "loss": 1.0582, + "step": 2364 + }, + { + "epoch": 0.19111497201963676, + "grad_norm": 2.9865663051605225, + "learning_rate": 9.887770605728873e-06, + "loss": 1.1339, + "step": 2365 + }, + { + "epoch": 0.1911957817329643, + "grad_norm": 2.5968196392059326, + "learning_rate": 9.88763269993869e-06, + "loss": 1.0259, + "step": 2366 + }, + { + "epoch": 0.19127659144629183, + "grad_norm": 3.1886796951293945, + "learning_rate": 9.887494710435125e-06, + "loss": 0.9362, + "step": 2367 + }, + { + "epoch": 0.19135740115961938, + "grad_norm": 3.991478443145752, + "learning_rate": 9.887356637220543e-06, + "loss": 0.9187, + "step": 2368 + }, + { + "epoch": 0.19143821087294693, + "grad_norm": 2.774512767791748, + "learning_rate": 9.887218480297305e-06, + "loss": 0.9443, + "step": 2369 + }, + { + "epoch": 0.19151902058627446, + "grad_norm": 3.140108108520508, + "learning_rate": 9.887080239667782e-06, + "loss": 0.9853, + "step": 2370 + }, + { + "epoch": 0.191599830299602, + "grad_norm": 2.8573155403137207, + "learning_rate": 9.886941915334339e-06, + "loss": 0.9862, + "step": 2371 + }, + { + "epoch": 0.19168064001292956, + "grad_norm": 3.353161334991455, + "learning_rate": 9.886803507299347e-06, + "loss": 1.0625, + "step": 2372 + }, + { + "epoch": 0.19176144972625708, + "grad_norm": 3.1049435138702393, + "learning_rate": 9.886665015565173e-06, + "loss": 0.9729, + "step": 2373 + }, + { + "epoch": 0.19184225943958463, + "grad_norm": 2.4401090145111084, + "learning_rate": 9.886526440134195e-06, + "loss": 0.9366, + "step": 2374 + }, + { + "epoch": 0.19192306915291218, + "grad_norm": 2.4159469604492188, + "learning_rate": 9.886387781008779e-06, + "loss": 0.8985, + "step": 2375 + }, + { + "epoch": 0.19200387886623974, + "grad_norm": 2.9945261478424072, + "learning_rate": 9.886249038191305e-06, + "loss": 1.0135, + "step": 2376 + }, + { + "epoch": 0.19208468857956726, + "grad_norm": 2.855586528778076, + "learning_rate": 9.88611021168415e-06, + "loss": 1.0542, + "step": 2377 + }, + { + "epoch": 0.1921654982928948, + "grad_norm": 3.1779487133026123, + "learning_rate": 9.885971301489687e-06, + "loss": 0.8935, + "step": 2378 + }, + { + "epoch": 0.19224630800622236, + "grad_norm": 2.762692451477051, + "learning_rate": 9.8858323076103e-06, + "loss": 1.0176, + "step": 2379 + }, + { + "epoch": 0.19232711771954988, + "grad_norm": 3.1234843730926514, + "learning_rate": 9.885693230048368e-06, + "loss": 0.9998, + "step": 2380 + }, + { + "epoch": 0.19240792743287743, + "grad_norm": 3.9722297191619873, + "learning_rate": 9.885554068806272e-06, + "loss": 0.9857, + "step": 2381 + }, + { + "epoch": 0.19248873714620499, + "grad_norm": 2.6323139667510986, + "learning_rate": 9.885414823886397e-06, + "loss": 0.9864, + "step": 2382 + }, + { + "epoch": 0.1925695468595325, + "grad_norm": 3.1694223880767822, + "learning_rate": 9.885275495291127e-06, + "loss": 0.9757, + "step": 2383 + }, + { + "epoch": 0.19265035657286006, + "grad_norm": 2.5772705078125, + "learning_rate": 9.885136083022847e-06, + "loss": 1.0262, + "step": 2384 + }, + { + "epoch": 0.1927311662861876, + "grad_norm": 2.823486566543579, + "learning_rate": 9.884996587083948e-06, + "loss": 1.0096, + "step": 2385 + }, + { + "epoch": 0.19281197599951513, + "grad_norm": 2.7001473903656006, + "learning_rate": 9.884857007476817e-06, + "loss": 0.9953, + "step": 2386 + }, + { + "epoch": 0.19289278571284268, + "grad_norm": 2.8641977310180664, + "learning_rate": 9.884717344203846e-06, + "loss": 1.0912, + "step": 2387 + }, + { + "epoch": 0.19297359542617024, + "grad_norm": 3.891497850418091, + "learning_rate": 9.884577597267426e-06, + "loss": 0.9944, + "step": 2388 + }, + { + "epoch": 0.19305440513949776, + "grad_norm": 2.898452043533325, + "learning_rate": 9.88443776666995e-06, + "loss": 0.9517, + "step": 2389 + }, + { + "epoch": 0.1931352148528253, + "grad_norm": 3.1403660774230957, + "learning_rate": 9.884297852413815e-06, + "loss": 1.0849, + "step": 2390 + }, + { + "epoch": 0.19321602456615286, + "grad_norm": 2.761983871459961, + "learning_rate": 9.884157854501416e-06, + "loss": 1.0986, + "step": 2391 + }, + { + "epoch": 0.19329683427948038, + "grad_norm": 3.095810651779175, + "learning_rate": 9.884017772935151e-06, + "loss": 0.9498, + "step": 2392 + }, + { + "epoch": 0.19337764399280793, + "grad_norm": 2.5449116230010986, + "learning_rate": 9.883877607717421e-06, + "loss": 0.998, + "step": 2393 + }, + { + "epoch": 0.19345845370613549, + "grad_norm": 2.822758436203003, + "learning_rate": 9.883737358850622e-06, + "loss": 0.9593, + "step": 2394 + }, + { + "epoch": 0.193539263419463, + "grad_norm": 2.54616379737854, + "learning_rate": 9.883597026337161e-06, + "loss": 1.0445, + "step": 2395 + }, + { + "epoch": 0.19362007313279056, + "grad_norm": 2.7621335983276367, + "learning_rate": 9.883456610179437e-06, + "loss": 1.0078, + "step": 2396 + }, + { + "epoch": 0.1937008828461181, + "grad_norm": 2.8157503604888916, + "learning_rate": 9.883316110379861e-06, + "loss": 0.9763, + "step": 2397 + }, + { + "epoch": 0.19378169255944563, + "grad_norm": 3.068323850631714, + "learning_rate": 9.883175526940835e-06, + "loss": 0.9221, + "step": 2398 + }, + { + "epoch": 0.19386250227277319, + "grad_norm": 3.1782143115997314, + "learning_rate": 9.883034859864768e-06, + "loss": 0.9957, + "step": 2399 + }, + { + "epoch": 0.19394331198610074, + "grad_norm": 2.9651782512664795, + "learning_rate": 9.882894109154071e-06, + "loss": 0.9781, + "step": 2400 + }, + { + "epoch": 0.19402412169942826, + "grad_norm": 3.156034469604492, + "learning_rate": 9.88275327481115e-06, + "loss": 1.0522, + "step": 2401 + }, + { + "epoch": 0.1941049314127558, + "grad_norm": 2.7413623332977295, + "learning_rate": 9.882612356838422e-06, + "loss": 1.0363, + "step": 2402 + }, + { + "epoch": 0.19418574112608336, + "grad_norm": 2.7400834560394287, + "learning_rate": 9.8824713552383e-06, + "loss": 1.0021, + "step": 2403 + }, + { + "epoch": 0.19426655083941088, + "grad_norm": 2.8538806438446045, + "learning_rate": 9.882330270013194e-06, + "loss": 1.0255, + "step": 2404 + }, + { + "epoch": 0.19434736055273844, + "grad_norm": 3.811596393585205, + "learning_rate": 9.882189101165527e-06, + "loss": 1.0438, + "step": 2405 + }, + { + "epoch": 0.19442817026606599, + "grad_norm": 3.5310895442962646, + "learning_rate": 9.882047848697714e-06, + "loss": 1.0074, + "step": 2406 + }, + { + "epoch": 0.1945089799793935, + "grad_norm": 3.50319504737854, + "learning_rate": 9.881906512612172e-06, + "loss": 1.13, + "step": 2407 + }, + { + "epoch": 0.19458978969272106, + "grad_norm": 2.4688737392425537, + "learning_rate": 9.881765092911327e-06, + "loss": 0.9055, + "step": 2408 + }, + { + "epoch": 0.1946705994060486, + "grad_norm": 2.4967446327209473, + "learning_rate": 9.881623589597596e-06, + "loss": 0.8877, + "step": 2409 + }, + { + "epoch": 0.19475140911937616, + "grad_norm": 2.6067020893096924, + "learning_rate": 9.881482002673406e-06, + "loss": 1.0201, + "step": 2410 + }, + { + "epoch": 0.19483221883270369, + "grad_norm": 2.8096611499786377, + "learning_rate": 9.881340332141183e-06, + "loss": 1.0239, + "step": 2411 + }, + { + "epoch": 0.19491302854603124, + "grad_norm": 2.951375722885132, + "learning_rate": 9.881198578003348e-06, + "loss": 0.9808, + "step": 2412 + }, + { + "epoch": 0.1949938382593588, + "grad_norm": 3.2135069370269775, + "learning_rate": 9.881056740262334e-06, + "loss": 1.0019, + "step": 2413 + }, + { + "epoch": 0.1950746479726863, + "grad_norm": 2.837573528289795, + "learning_rate": 9.880914818920568e-06, + "loss": 0.9657, + "step": 2414 + }, + { + "epoch": 0.19515545768601386, + "grad_norm": 2.8844478130340576, + "learning_rate": 9.880772813980484e-06, + "loss": 1.0598, + "step": 2415 + }, + { + "epoch": 0.1952362673993414, + "grad_norm": 2.8207285404205322, + "learning_rate": 9.880630725444509e-06, + "loss": 0.9335, + "step": 2416 + }, + { + "epoch": 0.19531707711266894, + "grad_norm": 2.8908140659332275, + "learning_rate": 9.88048855331508e-06, + "loss": 0.9494, + "step": 2417 + }, + { + "epoch": 0.1953978868259965, + "grad_norm": 2.654698610305786, + "learning_rate": 9.880346297594631e-06, + "loss": 1.0921, + "step": 2418 + }, + { + "epoch": 0.19547869653932404, + "grad_norm": 2.965447187423706, + "learning_rate": 9.8802039582856e-06, + "loss": 0.9454, + "step": 2419 + }, + { + "epoch": 0.19555950625265156, + "grad_norm": 2.5932366847991943, + "learning_rate": 9.880061535390424e-06, + "loss": 1.0111, + "step": 2420 + }, + { + "epoch": 0.1956403159659791, + "grad_norm": 2.62595796585083, + "learning_rate": 9.87991902891154e-06, + "loss": 0.9679, + "step": 2421 + }, + { + "epoch": 0.19572112567930666, + "grad_norm": 2.9708480834960938, + "learning_rate": 9.879776438851393e-06, + "loss": 1.071, + "step": 2422 + }, + { + "epoch": 0.19580193539263419, + "grad_norm": 2.690122604370117, + "learning_rate": 9.879633765212422e-06, + "loss": 0.9928, + "step": 2423 + }, + { + "epoch": 0.19588274510596174, + "grad_norm": 3.028838872909546, + "learning_rate": 9.879491007997073e-06, + "loss": 0.9819, + "step": 2424 + }, + { + "epoch": 0.1959635548192893, + "grad_norm": 2.915705919265747, + "learning_rate": 9.87934816720779e-06, + "loss": 1.0966, + "step": 2425 + }, + { + "epoch": 0.1960443645326168, + "grad_norm": 2.9188296794891357, + "learning_rate": 9.879205242847018e-06, + "loss": 1.1146, + "step": 2426 + }, + { + "epoch": 0.19612517424594436, + "grad_norm": 2.9386608600616455, + "learning_rate": 9.879062234917208e-06, + "loss": 0.864, + "step": 2427 + }, + { + "epoch": 0.1962059839592719, + "grad_norm": 2.9816863536834717, + "learning_rate": 9.878919143420806e-06, + "loss": 0.9519, + "step": 2428 + }, + { + "epoch": 0.19628679367259944, + "grad_norm": 2.8826448917388916, + "learning_rate": 9.878775968360265e-06, + "loss": 1.0976, + "step": 2429 + }, + { + "epoch": 0.196367603385927, + "grad_norm": 3.5601329803466797, + "learning_rate": 9.878632709738036e-06, + "loss": 1.0267, + "step": 2430 + }, + { + "epoch": 0.19644841309925454, + "grad_norm": 3.157382011413574, + "learning_rate": 9.878489367556576e-06, + "loss": 0.9642, + "step": 2431 + }, + { + "epoch": 0.19652922281258206, + "grad_norm": 3.0705273151397705, + "learning_rate": 9.878345941818338e-06, + "loss": 0.9957, + "step": 2432 + }, + { + "epoch": 0.1966100325259096, + "grad_norm": 2.779900074005127, + "learning_rate": 9.878202432525774e-06, + "loss": 0.9627, + "step": 2433 + }, + { + "epoch": 0.19669084223923716, + "grad_norm": 3.0926270484924316, + "learning_rate": 9.87805883968135e-06, + "loss": 0.9306, + "step": 2434 + }, + { + "epoch": 0.19677165195256469, + "grad_norm": 2.7956552505493164, + "learning_rate": 9.877915163287519e-06, + "loss": 0.9934, + "step": 2435 + }, + { + "epoch": 0.19685246166589224, + "grad_norm": 3.262789011001587, + "learning_rate": 9.877771403346747e-06, + "loss": 0.951, + "step": 2436 + }, + { + "epoch": 0.1969332713792198, + "grad_norm": 2.491626501083374, + "learning_rate": 9.87762755986149e-06, + "loss": 1.0969, + "step": 2437 + }, + { + "epoch": 0.1970140810925473, + "grad_norm": 3.3042078018188477, + "learning_rate": 9.877483632834219e-06, + "loss": 0.9696, + "step": 2438 + }, + { + "epoch": 0.19709489080587486, + "grad_norm": 2.8220388889312744, + "learning_rate": 9.877339622267394e-06, + "loss": 0.9866, + "step": 2439 + }, + { + "epoch": 0.1971757005192024, + "grad_norm": 2.9727158546447754, + "learning_rate": 9.877195528163483e-06, + "loss": 1.0571, + "step": 2440 + }, + { + "epoch": 0.19725651023252996, + "grad_norm": 3.236030101776123, + "learning_rate": 9.877051350524953e-06, + "loss": 1.0211, + "step": 2441 + }, + { + "epoch": 0.1973373199458575, + "grad_norm": 2.409712076187134, + "learning_rate": 9.876907089354276e-06, + "loss": 1.0282, + "step": 2442 + }, + { + "epoch": 0.19741812965918504, + "grad_norm": 2.8064048290252686, + "learning_rate": 9.876762744653921e-06, + "loss": 0.9514, + "step": 2443 + }, + { + "epoch": 0.1974989393725126, + "grad_norm": 3.2507739067077637, + "learning_rate": 9.87661831642636e-06, + "loss": 1.0725, + "step": 2444 + }, + { + "epoch": 0.1975797490858401, + "grad_norm": 2.829425096511841, + "learning_rate": 9.876473804674067e-06, + "loss": 0.9631, + "step": 2445 + }, + { + "epoch": 0.19766055879916766, + "grad_norm": 2.590425491333008, + "learning_rate": 9.876329209399518e-06, + "loss": 0.9723, + "step": 2446 + }, + { + "epoch": 0.1977413685124952, + "grad_norm": 2.5433905124664307, + "learning_rate": 9.876184530605189e-06, + "loss": 1.0544, + "step": 2447 + }, + { + "epoch": 0.19782217822582274, + "grad_norm": 2.9684863090515137, + "learning_rate": 9.876039768293557e-06, + "loss": 0.9622, + "step": 2448 + }, + { + "epoch": 0.1979029879391503, + "grad_norm": 2.6978437900543213, + "learning_rate": 9.875894922467101e-06, + "loss": 0.9914, + "step": 2449 + }, + { + "epoch": 0.19798379765247784, + "grad_norm": 2.679476499557495, + "learning_rate": 9.875749993128306e-06, + "loss": 1.0121, + "step": 2450 + }, + { + "epoch": 0.19806460736580536, + "grad_norm": 2.7482845783233643, + "learning_rate": 9.875604980279651e-06, + "loss": 1.0252, + "step": 2451 + }, + { + "epoch": 0.1981454170791329, + "grad_norm": 2.545858860015869, + "learning_rate": 9.875459883923619e-06, + "loss": 1.0051, + "step": 2452 + }, + { + "epoch": 0.19822622679246046, + "grad_norm": 2.84562349319458, + "learning_rate": 9.875314704062697e-06, + "loss": 1.0222, + "step": 2453 + }, + { + "epoch": 0.198307036505788, + "grad_norm": 3.056476354598999, + "learning_rate": 9.875169440699372e-06, + "loss": 1.0069, + "step": 2454 + }, + { + "epoch": 0.19838784621911554, + "grad_norm": 3.218858242034912, + "learning_rate": 9.87502409383613e-06, + "loss": 0.9731, + "step": 2455 + }, + { + "epoch": 0.1984686559324431, + "grad_norm": 2.7342875003814697, + "learning_rate": 9.874878663475462e-06, + "loss": 1.0858, + "step": 2456 + }, + { + "epoch": 0.1985494656457706, + "grad_norm": 2.640138626098633, + "learning_rate": 9.874733149619857e-06, + "loss": 1.086, + "step": 2457 + }, + { + "epoch": 0.19863027535909816, + "grad_norm": 2.712902784347534, + "learning_rate": 9.87458755227181e-06, + "loss": 0.9231, + "step": 2458 + }, + { + "epoch": 0.1987110850724257, + "grad_norm": 2.5488665103912354, + "learning_rate": 9.874441871433814e-06, + "loss": 0.9176, + "step": 2459 + }, + { + "epoch": 0.19879189478575324, + "grad_norm": 2.6249120235443115, + "learning_rate": 9.874296107108362e-06, + "loss": 0.9626, + "step": 2460 + }, + { + "epoch": 0.1988727044990808, + "grad_norm": 2.878230333328247, + "learning_rate": 9.874150259297952e-06, + "loss": 0.8884, + "step": 2461 + }, + { + "epoch": 0.19895351421240834, + "grad_norm": 2.5039255619049072, + "learning_rate": 9.87400432800508e-06, + "loss": 1.0489, + "step": 2462 + }, + { + "epoch": 0.19903432392573586, + "grad_norm": 2.8095619678497314, + "learning_rate": 9.87385831323225e-06, + "loss": 1.084, + "step": 2463 + }, + { + "epoch": 0.1991151336390634, + "grad_norm": 2.7729170322418213, + "learning_rate": 9.87371221498196e-06, + "loss": 0.9587, + "step": 2464 + }, + { + "epoch": 0.19919594335239096, + "grad_norm": 2.798548936843872, + "learning_rate": 9.873566033256714e-06, + "loss": 1.0125, + "step": 2465 + }, + { + "epoch": 0.1992767530657185, + "grad_norm": 2.673734188079834, + "learning_rate": 9.873419768059014e-06, + "loss": 1.0097, + "step": 2466 + }, + { + "epoch": 0.19935756277904604, + "grad_norm": 2.5640718936920166, + "learning_rate": 9.873273419391364e-06, + "loss": 1.0343, + "step": 2467 + }, + { + "epoch": 0.1994383724923736, + "grad_norm": 2.601259469985962, + "learning_rate": 9.873126987256273e-06, + "loss": 0.9153, + "step": 2468 + }, + { + "epoch": 0.1995191822057011, + "grad_norm": 2.698262929916382, + "learning_rate": 9.87298047165625e-06, + "loss": 1.0388, + "step": 2469 + }, + { + "epoch": 0.19959999191902866, + "grad_norm": 2.803069591522217, + "learning_rate": 9.872833872593801e-06, + "loss": 0.9826, + "step": 2470 + }, + { + "epoch": 0.19968080163235621, + "grad_norm": 2.310145616531372, + "learning_rate": 9.87268719007144e-06, + "loss": 1.0419, + "step": 2471 + }, + { + "epoch": 0.19976161134568376, + "grad_norm": 2.7354466915130615, + "learning_rate": 9.872540424091677e-06, + "loss": 1.0474, + "step": 2472 + }, + { + "epoch": 0.1998424210590113, + "grad_norm": 2.915072202682495, + "learning_rate": 9.872393574657026e-06, + "loss": 1.0912, + "step": 2473 + }, + { + "epoch": 0.19992323077233884, + "grad_norm": 2.9000625610351562, + "learning_rate": 9.872246641770004e-06, + "loss": 1.1786, + "step": 2474 + }, + { + "epoch": 0.2000040404856664, + "grad_norm": 2.6486880779266357, + "learning_rate": 9.872099625433127e-06, + "loss": 0.9718, + "step": 2475 + }, + { + "epoch": 0.2000848501989939, + "grad_norm": 2.8417553901672363, + "learning_rate": 9.871952525648911e-06, + "loss": 1.14, + "step": 2476 + }, + { + "epoch": 0.20016565991232146, + "grad_norm": 3.3232791423797607, + "learning_rate": 9.871805342419879e-06, + "loss": 1.0333, + "step": 2477 + }, + { + "epoch": 0.20024646962564902, + "grad_norm": 2.7784183025360107, + "learning_rate": 9.871658075748546e-06, + "loss": 0.9657, + "step": 2478 + }, + { + "epoch": 0.20032727933897654, + "grad_norm": 2.562795639038086, + "learning_rate": 9.871510725637442e-06, + "loss": 1.0903, + "step": 2479 + }, + { + "epoch": 0.2004080890523041, + "grad_norm": 3.0607187747955322, + "learning_rate": 9.871363292089085e-06, + "loss": 0.9937, + "step": 2480 + }, + { + "epoch": 0.20048889876563164, + "grad_norm": 2.629852533340454, + "learning_rate": 9.871215775106003e-06, + "loss": 0.9726, + "step": 2481 + }, + { + "epoch": 0.20056970847895916, + "grad_norm": 2.361461639404297, + "learning_rate": 9.871068174690722e-06, + "loss": 1.0028, + "step": 2482 + }, + { + "epoch": 0.20065051819228671, + "grad_norm": 2.896855592727661, + "learning_rate": 9.87092049084577e-06, + "loss": 0.9409, + "step": 2483 + }, + { + "epoch": 0.20073132790561427, + "grad_norm": 2.9578440189361572, + "learning_rate": 9.870772723573674e-06, + "loss": 1.018, + "step": 2484 + }, + { + "epoch": 0.2008121376189418, + "grad_norm": 2.7431399822235107, + "learning_rate": 9.87062487287697e-06, + "loss": 0.9153, + "step": 2485 + }, + { + "epoch": 0.20089294733226934, + "grad_norm": 2.8610525131225586, + "learning_rate": 9.870476938758185e-06, + "loss": 1.0967, + "step": 2486 + }, + { + "epoch": 0.2009737570455969, + "grad_norm": 2.564570665359497, + "learning_rate": 9.870328921219856e-06, + "loss": 0.9596, + "step": 2487 + }, + { + "epoch": 0.2010545667589244, + "grad_norm": 3.0416998863220215, + "learning_rate": 9.870180820264518e-06, + "loss": 1.1215, + "step": 2488 + }, + { + "epoch": 0.20113537647225196, + "grad_norm": 2.994410276412964, + "learning_rate": 9.870032635894708e-06, + "loss": 0.952, + "step": 2489 + }, + { + "epoch": 0.20121618618557952, + "grad_norm": 3.3901889324188232, + "learning_rate": 9.869884368112961e-06, + "loss": 1.0788, + "step": 2490 + }, + { + "epoch": 0.20129699589890704, + "grad_norm": 2.755059242248535, + "learning_rate": 9.86973601692182e-06, + "loss": 0.9808, + "step": 2491 + }, + { + "epoch": 0.2013778056122346, + "grad_norm": 2.62873911857605, + "learning_rate": 9.869587582323824e-06, + "loss": 0.9143, + "step": 2492 + }, + { + "epoch": 0.20145861532556214, + "grad_norm": 2.8190078735351562, + "learning_rate": 9.869439064321516e-06, + "loss": 1.0799, + "step": 2493 + }, + { + "epoch": 0.20153942503888966, + "grad_norm": 2.5255749225616455, + "learning_rate": 9.86929046291744e-06, + "loss": 1.0696, + "step": 2494 + }, + { + "epoch": 0.20162023475221721, + "grad_norm": 2.4826602935791016, + "learning_rate": 9.86914177811414e-06, + "loss": 1.0309, + "step": 2495 + }, + { + "epoch": 0.20170104446554477, + "grad_norm": 2.7080788612365723, + "learning_rate": 9.868993009914162e-06, + "loss": 1.0622, + "step": 2496 + }, + { + "epoch": 0.2017818541788723, + "grad_norm": 2.994687557220459, + "learning_rate": 9.868844158320056e-06, + "loss": 1.0052, + "step": 2497 + }, + { + "epoch": 0.20186266389219984, + "grad_norm": 2.9315123558044434, + "learning_rate": 9.868695223334372e-06, + "loss": 0.9804, + "step": 2498 + }, + { + "epoch": 0.2019434736055274, + "grad_norm": 2.8823635578155518, + "learning_rate": 9.868546204959659e-06, + "loss": 0.9873, + "step": 2499 + }, + { + "epoch": 0.2020242833188549, + "grad_norm": 2.6396167278289795, + "learning_rate": 9.868397103198471e-06, + "loss": 0.9844, + "step": 2500 + }, + { + "epoch": 0.20210509303218246, + "grad_norm": 2.803288221359253, + "learning_rate": 9.86824791805336e-06, + "loss": 0.8829, + "step": 2501 + }, + { + "epoch": 0.20218590274551002, + "grad_norm": 2.912961483001709, + "learning_rate": 9.86809864952688e-06, + "loss": 0.9903, + "step": 2502 + }, + { + "epoch": 0.20226671245883754, + "grad_norm": 2.793362855911255, + "learning_rate": 9.867949297621592e-06, + "loss": 1.0127, + "step": 2503 + }, + { + "epoch": 0.2023475221721651, + "grad_norm": 2.580033779144287, + "learning_rate": 9.867799862340054e-06, + "loss": 0.9637, + "step": 2504 + }, + { + "epoch": 0.20242833188549264, + "grad_norm": 3.100097179412842, + "learning_rate": 9.867650343684818e-06, + "loss": 0.9721, + "step": 2505 + }, + { + "epoch": 0.2025091415988202, + "grad_norm": 2.967200756072998, + "learning_rate": 9.867500741658454e-06, + "loss": 0.9854, + "step": 2506 + }, + { + "epoch": 0.20258995131214771, + "grad_norm": 2.929659843444824, + "learning_rate": 9.867351056263517e-06, + "loss": 0.9923, + "step": 2507 + }, + { + "epoch": 0.20267076102547527, + "grad_norm": 2.7155697345733643, + "learning_rate": 9.867201287502576e-06, + "loss": 1.0235, + "step": 2508 + }, + { + "epoch": 0.20275157073880282, + "grad_norm": 3.458383321762085, + "learning_rate": 9.867051435378194e-06, + "loss": 0.9887, + "step": 2509 + }, + { + "epoch": 0.20283238045213034, + "grad_norm": 2.467725992202759, + "learning_rate": 9.866901499892938e-06, + "loss": 0.9697, + "step": 2510 + }, + { + "epoch": 0.2029131901654579, + "grad_norm": 2.7580583095550537, + "learning_rate": 9.866751481049377e-06, + "loss": 1.0378, + "step": 2511 + }, + { + "epoch": 0.20299399987878544, + "grad_norm": 3.0809414386749268, + "learning_rate": 9.866601378850077e-06, + "loss": 0.9444, + "step": 2512 + }, + { + "epoch": 0.20307480959211296, + "grad_norm": 2.972541093826294, + "learning_rate": 9.866451193297613e-06, + "loss": 0.9478, + "step": 2513 + }, + { + "epoch": 0.20315561930544052, + "grad_norm": 2.998645067214966, + "learning_rate": 9.866300924394556e-06, + "loss": 1.0539, + "step": 2514 + }, + { + "epoch": 0.20323642901876807, + "grad_norm": 3.317265033721924, + "learning_rate": 9.866150572143477e-06, + "loss": 0.9376, + "step": 2515 + }, + { + "epoch": 0.2033172387320956, + "grad_norm": 3.172424554824829, + "learning_rate": 9.866000136546954e-06, + "loss": 0.8793, + "step": 2516 + }, + { + "epoch": 0.20339804844542314, + "grad_norm": 2.846748113632202, + "learning_rate": 9.865849617607565e-06, + "loss": 1.0539, + "step": 2517 + }, + { + "epoch": 0.2034788581587507, + "grad_norm": 2.9173882007598877, + "learning_rate": 9.865699015327885e-06, + "loss": 1.0106, + "step": 2518 + }, + { + "epoch": 0.20355966787207821, + "grad_norm": 2.671884059906006, + "learning_rate": 9.865548329710496e-06, + "loss": 1.1, + "step": 2519 + }, + { + "epoch": 0.20364047758540577, + "grad_norm": 2.9467697143554688, + "learning_rate": 9.865397560757975e-06, + "loss": 1.0092, + "step": 2520 + }, + { + "epoch": 0.20372128729873332, + "grad_norm": 2.752096176147461, + "learning_rate": 9.865246708472907e-06, + "loss": 0.9468, + "step": 2521 + }, + { + "epoch": 0.20380209701206084, + "grad_norm": 3.0249135494232178, + "learning_rate": 9.865095772857875e-06, + "loss": 0.949, + "step": 2522 + }, + { + "epoch": 0.2038829067253884, + "grad_norm": 3.1122334003448486, + "learning_rate": 9.864944753915466e-06, + "loss": 0.969, + "step": 2523 + }, + { + "epoch": 0.20396371643871594, + "grad_norm": 2.7139110565185547, + "learning_rate": 9.864793651648266e-06, + "loss": 0.9581, + "step": 2524 + }, + { + "epoch": 0.20404452615204346, + "grad_norm": 2.9858901500701904, + "learning_rate": 9.864642466058861e-06, + "loss": 0.9883, + "step": 2525 + }, + { + "epoch": 0.20412533586537102, + "grad_norm": 2.9757955074310303, + "learning_rate": 9.864491197149841e-06, + "loss": 1.1164, + "step": 2526 + }, + { + "epoch": 0.20420614557869857, + "grad_norm": 3.0603973865509033, + "learning_rate": 9.864339844923801e-06, + "loss": 0.9447, + "step": 2527 + }, + { + "epoch": 0.2042869552920261, + "grad_norm": 2.7169153690338135, + "learning_rate": 9.864188409383326e-06, + "loss": 0.9277, + "step": 2528 + }, + { + "epoch": 0.20436776500535364, + "grad_norm": 3.0862722396850586, + "learning_rate": 9.864036890531014e-06, + "loss": 0.9635, + "step": 2529 + }, + { + "epoch": 0.2044485747186812, + "grad_norm": 2.611670970916748, + "learning_rate": 9.863885288369461e-06, + "loss": 0.9959, + "step": 2530 + }, + { + "epoch": 0.20452938443200872, + "grad_norm": 2.7444217205047607, + "learning_rate": 9.863733602901262e-06, + "loss": 0.9811, + "step": 2531 + }, + { + "epoch": 0.20461019414533627, + "grad_norm": 2.7754032611846924, + "learning_rate": 9.863581834129017e-06, + "loss": 0.9322, + "step": 2532 + }, + { + "epoch": 0.20469100385866382, + "grad_norm": 3.2942535877227783, + "learning_rate": 9.863429982055322e-06, + "loss": 1.0804, + "step": 2533 + }, + { + "epoch": 0.20477181357199134, + "grad_norm": 2.810506820678711, + "learning_rate": 9.86327804668278e-06, + "loss": 1.0103, + "step": 2534 + }, + { + "epoch": 0.2048526232853189, + "grad_norm": 3.491856575012207, + "learning_rate": 9.863126028013993e-06, + "loss": 0.9455, + "step": 2535 + }, + { + "epoch": 0.20493343299864644, + "grad_norm": 2.792609214782715, + "learning_rate": 9.862973926051565e-06, + "loss": 0.959, + "step": 2536 + }, + { + "epoch": 0.205014242711974, + "grad_norm": 2.7215230464935303, + "learning_rate": 9.8628217407981e-06, + "loss": 0.9344, + "step": 2537 + }, + { + "epoch": 0.20509505242530152, + "grad_norm": 2.9850406646728516, + "learning_rate": 9.862669472256206e-06, + "loss": 1.0273, + "step": 2538 + }, + { + "epoch": 0.20517586213862907, + "grad_norm": 2.7064993381500244, + "learning_rate": 9.86251712042849e-06, + "loss": 0.9229, + "step": 2539 + }, + { + "epoch": 0.20525667185195662, + "grad_norm": 2.823521375656128, + "learning_rate": 9.86236468531756e-06, + "loss": 1.105, + "step": 2540 + }, + { + "epoch": 0.20533748156528414, + "grad_norm": 3.5803730487823486, + "learning_rate": 9.862212166926031e-06, + "loss": 0.9772, + "step": 2541 + }, + { + "epoch": 0.2054182912786117, + "grad_norm": 2.4767980575561523, + "learning_rate": 9.862059565256512e-06, + "loss": 1.0041, + "step": 2542 + }, + { + "epoch": 0.20549910099193924, + "grad_norm": 3.0558948516845703, + "learning_rate": 9.861906880311617e-06, + "loss": 0.996, + "step": 2543 + }, + { + "epoch": 0.20557991070526677, + "grad_norm": 2.6973979473114014, + "learning_rate": 9.861754112093964e-06, + "loss": 1.0557, + "step": 2544 + }, + { + "epoch": 0.20566072041859432, + "grad_norm": 3.0480377674102783, + "learning_rate": 9.861601260606166e-06, + "loss": 1.0027, + "step": 2545 + }, + { + "epoch": 0.20574153013192187, + "grad_norm": 2.9968481063842773, + "learning_rate": 9.861448325850842e-06, + "loss": 1.0336, + "step": 2546 + }, + { + "epoch": 0.2058223398452494, + "grad_norm": 2.68129825592041, + "learning_rate": 9.861295307830612e-06, + "loss": 1.0041, + "step": 2547 + }, + { + "epoch": 0.20590314955857694, + "grad_norm": 4.269932270050049, + "learning_rate": 9.861142206548096e-06, + "loss": 1.1087, + "step": 2548 + }, + { + "epoch": 0.2059839592719045, + "grad_norm": 2.564828634262085, + "learning_rate": 9.860989022005915e-06, + "loss": 1.023, + "step": 2549 + }, + { + "epoch": 0.20606476898523202, + "grad_norm": 3.118055582046509, + "learning_rate": 9.860835754206698e-06, + "loss": 1.0049, + "step": 2550 + }, + { + "epoch": 0.20614557869855957, + "grad_norm": 2.6097300052642822, + "learning_rate": 9.860682403153064e-06, + "loss": 1.0637, + "step": 2551 + }, + { + "epoch": 0.20622638841188712, + "grad_norm": 2.9691739082336426, + "learning_rate": 9.860528968847642e-06, + "loss": 1.036, + "step": 2552 + }, + { + "epoch": 0.20630719812521464, + "grad_norm": 3.2187507152557373, + "learning_rate": 9.86037545129306e-06, + "loss": 1.0471, + "step": 2553 + }, + { + "epoch": 0.2063880078385422, + "grad_norm": 2.5262982845306396, + "learning_rate": 9.860221850491949e-06, + "loss": 1.1136, + "step": 2554 + }, + { + "epoch": 0.20646881755186974, + "grad_norm": 2.7065389156341553, + "learning_rate": 9.860068166446938e-06, + "loss": 0.9593, + "step": 2555 + }, + { + "epoch": 0.20654962726519727, + "grad_norm": 2.8696558475494385, + "learning_rate": 9.85991439916066e-06, + "loss": 1.0267, + "step": 2556 + }, + { + "epoch": 0.20663043697852482, + "grad_norm": 2.757690906524658, + "learning_rate": 9.859760548635746e-06, + "loss": 0.95, + "step": 2557 + }, + { + "epoch": 0.20671124669185237, + "grad_norm": 2.6395418643951416, + "learning_rate": 9.859606614874834e-06, + "loss": 0.947, + "step": 2558 + }, + { + "epoch": 0.2067920564051799, + "grad_norm": 2.605659008026123, + "learning_rate": 9.859452597880559e-06, + "loss": 0.8962, + "step": 2559 + }, + { + "epoch": 0.20687286611850744, + "grad_norm": 2.7953250408172607, + "learning_rate": 9.85929849765556e-06, + "loss": 1.0655, + "step": 2560 + }, + { + "epoch": 0.206953675831835, + "grad_norm": 2.9792885780334473, + "learning_rate": 9.859144314202478e-06, + "loss": 1.0137, + "step": 2561 + }, + { + "epoch": 0.20703448554516252, + "grad_norm": 3.3989391326904297, + "learning_rate": 9.85899004752395e-06, + "loss": 0.8957, + "step": 2562 + }, + { + "epoch": 0.20711529525849007, + "grad_norm": 2.6138968467712402, + "learning_rate": 9.858835697622619e-06, + "loss": 1.1482, + "step": 2563 + }, + { + "epoch": 0.20719610497181762, + "grad_norm": 2.423694133758545, + "learning_rate": 9.858681264501133e-06, + "loss": 1.0465, + "step": 2564 + }, + { + "epoch": 0.20727691468514514, + "grad_norm": 3.0095713138580322, + "learning_rate": 9.858526748162132e-06, + "loss": 0.9462, + "step": 2565 + }, + { + "epoch": 0.2073577243984727, + "grad_norm": 3.7489736080169678, + "learning_rate": 9.858372148608263e-06, + "loss": 1.0188, + "step": 2566 + }, + { + "epoch": 0.20743853411180024, + "grad_norm": 2.8325319290161133, + "learning_rate": 9.858217465842178e-06, + "loss": 0.9389, + "step": 2567 + }, + { + "epoch": 0.20751934382512777, + "grad_norm": 2.903128147125244, + "learning_rate": 9.85806269986652e-06, + "loss": 1.1131, + "step": 2568 + }, + { + "epoch": 0.20760015353845532, + "grad_norm": 3.4537770748138428, + "learning_rate": 9.857907850683946e-06, + "loss": 1.1703, + "step": 2569 + }, + { + "epoch": 0.20768096325178287, + "grad_norm": 3.0188677310943604, + "learning_rate": 9.857752918297103e-06, + "loss": 0.9049, + "step": 2570 + }, + { + "epoch": 0.20776177296511042, + "grad_norm": 3.0577592849731445, + "learning_rate": 9.857597902708649e-06, + "loss": 1.1445, + "step": 2571 + }, + { + "epoch": 0.20784258267843794, + "grad_norm": 2.568509340286255, + "learning_rate": 9.857442803921235e-06, + "loss": 0.9156, + "step": 2572 + }, + { + "epoch": 0.2079233923917655, + "grad_norm": 2.7146859169006348, + "learning_rate": 9.857287621937522e-06, + "loss": 1.0497, + "step": 2573 + }, + { + "epoch": 0.20800420210509304, + "grad_norm": 3.3684639930725098, + "learning_rate": 9.857132356760164e-06, + "loss": 0.9339, + "step": 2574 + }, + { + "epoch": 0.20808501181842057, + "grad_norm": 2.9014923572540283, + "learning_rate": 9.856977008391824e-06, + "loss": 1.0074, + "step": 2575 + }, + { + "epoch": 0.20816582153174812, + "grad_norm": 2.680986166000366, + "learning_rate": 9.856821576835159e-06, + "loss": 1.1141, + "step": 2576 + }, + { + "epoch": 0.20824663124507567, + "grad_norm": 2.9888999462127686, + "learning_rate": 9.856666062092833e-06, + "loss": 0.9345, + "step": 2577 + }, + { + "epoch": 0.2083274409584032, + "grad_norm": 2.7942521572113037, + "learning_rate": 9.856510464167508e-06, + "loss": 1.0688, + "step": 2578 + }, + { + "epoch": 0.20840825067173074, + "grad_norm": 2.224989414215088, + "learning_rate": 9.856354783061851e-06, + "loss": 1.0542, + "step": 2579 + }, + { + "epoch": 0.2084890603850583, + "grad_norm": 2.730905294418335, + "learning_rate": 9.856199018778527e-06, + "loss": 0.9543, + "step": 2580 + }, + { + "epoch": 0.20856987009838582, + "grad_norm": 2.5238234996795654, + "learning_rate": 9.856043171320206e-06, + "loss": 1.0149, + "step": 2581 + }, + { + "epoch": 0.20865067981171337, + "grad_norm": 2.5379068851470947, + "learning_rate": 9.855887240689556e-06, + "loss": 0.9269, + "step": 2582 + }, + { + "epoch": 0.20873148952504092, + "grad_norm": 3.115297317504883, + "learning_rate": 9.855731226889246e-06, + "loss": 0.9183, + "step": 2583 + }, + { + "epoch": 0.20881229923836844, + "grad_norm": 2.9481358528137207, + "learning_rate": 9.855575129921953e-06, + "loss": 1.0207, + "step": 2584 + }, + { + "epoch": 0.208893108951696, + "grad_norm": 2.897611141204834, + "learning_rate": 9.855418949790346e-06, + "loss": 1.1211, + "step": 2585 + }, + { + "epoch": 0.20897391866502354, + "grad_norm": 3.3586490154266357, + "learning_rate": 9.8552626864971e-06, + "loss": 0.9698, + "step": 2586 + }, + { + "epoch": 0.20905472837835107, + "grad_norm": 2.8079240322113037, + "learning_rate": 9.855106340044893e-06, + "loss": 1.0102, + "step": 2587 + }, + { + "epoch": 0.20913553809167862, + "grad_norm": 2.4567348957061768, + "learning_rate": 9.854949910436403e-06, + "loss": 0.9974, + "step": 2588 + }, + { + "epoch": 0.20921634780500617, + "grad_norm": 2.6109845638275146, + "learning_rate": 9.85479339767431e-06, + "loss": 0.977, + "step": 2589 + }, + { + "epoch": 0.2092971575183337, + "grad_norm": 3.109159231185913, + "learning_rate": 9.854636801761292e-06, + "loss": 0.9796, + "step": 2590 + }, + { + "epoch": 0.20937796723166124, + "grad_norm": 2.863799571990967, + "learning_rate": 9.854480122700031e-06, + "loss": 1.036, + "step": 2591 + }, + { + "epoch": 0.2094587769449888, + "grad_norm": 3.403641939163208, + "learning_rate": 9.854323360493215e-06, + "loss": 1.1061, + "step": 2592 + }, + { + "epoch": 0.20953958665831632, + "grad_norm": 2.7065038681030273, + "learning_rate": 9.854166515143526e-06, + "loss": 0.9701, + "step": 2593 + }, + { + "epoch": 0.20962039637164387, + "grad_norm": 2.658090829849243, + "learning_rate": 9.85400958665365e-06, + "loss": 0.9804, + "step": 2594 + }, + { + "epoch": 0.20970120608497142, + "grad_norm": 3.0017261505126953, + "learning_rate": 9.853852575026274e-06, + "loss": 1.1035, + "step": 2595 + }, + { + "epoch": 0.20978201579829894, + "grad_norm": 3.070401430130005, + "learning_rate": 9.853695480264091e-06, + "loss": 1.0101, + "step": 2596 + }, + { + "epoch": 0.2098628255116265, + "grad_norm": 2.946901559829712, + "learning_rate": 9.853538302369787e-06, + "loss": 0.932, + "step": 2597 + }, + { + "epoch": 0.20994363522495404, + "grad_norm": 2.6623542308807373, + "learning_rate": 9.853381041346058e-06, + "loss": 1.0965, + "step": 2598 + }, + { + "epoch": 0.21002444493828157, + "grad_norm": 2.513234853744507, + "learning_rate": 9.853223697195596e-06, + "loss": 0.952, + "step": 2599 + }, + { + "epoch": 0.21010525465160912, + "grad_norm": 2.526477813720703, + "learning_rate": 9.853066269921095e-06, + "loss": 1.1357, + "step": 2600 + }, + { + "epoch": 0.21018606436493667, + "grad_norm": 3.1761679649353027, + "learning_rate": 9.85290875952525e-06, + "loss": 1.0726, + "step": 2601 + }, + { + "epoch": 0.21026687407826422, + "grad_norm": 2.8965203762054443, + "learning_rate": 9.852751166010764e-06, + "loss": 1.0015, + "step": 2602 + }, + { + "epoch": 0.21034768379159174, + "grad_norm": 2.646193027496338, + "learning_rate": 9.852593489380331e-06, + "loss": 0.9826, + "step": 2603 + }, + { + "epoch": 0.2104284935049193, + "grad_norm": 2.5496766567230225, + "learning_rate": 9.852435729636656e-06, + "loss": 1.0915, + "step": 2604 + }, + { + "epoch": 0.21050930321824685, + "grad_norm": 2.8119606971740723, + "learning_rate": 9.852277886782436e-06, + "loss": 0.9898, + "step": 2605 + }, + { + "epoch": 0.21059011293157437, + "grad_norm": 2.8318567276000977, + "learning_rate": 9.852119960820379e-06, + "loss": 1.0825, + "step": 2606 + }, + { + "epoch": 0.21067092264490192, + "grad_norm": 3.3942651748657227, + "learning_rate": 9.851961951753186e-06, + "loss": 0.962, + "step": 2607 + }, + { + "epoch": 0.21075173235822947, + "grad_norm": 2.667219400405884, + "learning_rate": 9.85180385958357e-06, + "loss": 1.0618, + "step": 2608 + }, + { + "epoch": 0.210832542071557, + "grad_norm": 2.84959077835083, + "learning_rate": 9.851645684314229e-06, + "loss": 0.9492, + "step": 2609 + }, + { + "epoch": 0.21091335178488455, + "grad_norm": 2.9073519706726074, + "learning_rate": 9.851487425947878e-06, + "loss": 1.1176, + "step": 2610 + }, + { + "epoch": 0.2109941614982121, + "grad_norm": 2.5956332683563232, + "learning_rate": 9.85132908448723e-06, + "loss": 1.1036, + "step": 2611 + }, + { + "epoch": 0.21107497121153962, + "grad_norm": 2.601874589920044, + "learning_rate": 9.851170659934992e-06, + "loss": 1.0242, + "step": 2612 + }, + { + "epoch": 0.21115578092486717, + "grad_norm": 3.020754337310791, + "learning_rate": 9.851012152293878e-06, + "loss": 0.9522, + "step": 2613 + }, + { + "epoch": 0.21123659063819472, + "grad_norm": 2.9108903408050537, + "learning_rate": 9.850853561566607e-06, + "loss": 1.1153, + "step": 2614 + }, + { + "epoch": 0.21131740035152224, + "grad_norm": 3.2650146484375, + "learning_rate": 9.85069488775589e-06, + "loss": 1.0406, + "step": 2615 + }, + { + "epoch": 0.2113982100648498, + "grad_norm": 3.147453546524048, + "learning_rate": 9.850536130864447e-06, + "loss": 0.9551, + "step": 2616 + }, + { + "epoch": 0.21147901977817735, + "grad_norm": 2.9640023708343506, + "learning_rate": 9.850377290894999e-06, + "loss": 1.0614, + "step": 2617 + }, + { + "epoch": 0.21155982949150487, + "grad_norm": 2.782986879348755, + "learning_rate": 9.850218367850263e-06, + "loss": 0.9692, + "step": 2618 + }, + { + "epoch": 0.21164063920483242, + "grad_norm": 2.605253219604492, + "learning_rate": 9.850059361732966e-06, + "loss": 1.0636, + "step": 2619 + }, + { + "epoch": 0.21172144891815997, + "grad_norm": 3.21453857421875, + "learning_rate": 9.849900272545824e-06, + "loss": 1.047, + "step": 2620 + }, + { + "epoch": 0.2118022586314875, + "grad_norm": 2.6399147510528564, + "learning_rate": 9.84974110029157e-06, + "loss": 0.9074, + "step": 2621 + }, + { + "epoch": 0.21188306834481505, + "grad_norm": 3.2476913928985596, + "learning_rate": 9.849581844972924e-06, + "loss": 0.9601, + "step": 2622 + }, + { + "epoch": 0.2119638780581426, + "grad_norm": 2.878459930419922, + "learning_rate": 9.849422506592616e-06, + "loss": 1.0309, + "step": 2623 + }, + { + "epoch": 0.21204468777147012, + "grad_norm": 2.9090170860290527, + "learning_rate": 9.849263085153375e-06, + "loss": 0.9745, + "step": 2624 + }, + { + "epoch": 0.21212549748479767, + "grad_norm": 2.7304728031158447, + "learning_rate": 9.849103580657933e-06, + "loss": 1.0822, + "step": 2625 + }, + { + "epoch": 0.21220630719812522, + "grad_norm": 2.804781436920166, + "learning_rate": 9.848943993109018e-06, + "loss": 0.9415, + "step": 2626 + }, + { + "epoch": 0.21228711691145274, + "grad_norm": 2.578540563583374, + "learning_rate": 9.848784322509366e-06, + "loss": 1.0419, + "step": 2627 + }, + { + "epoch": 0.2123679266247803, + "grad_norm": 2.940869092941284, + "learning_rate": 9.848624568861713e-06, + "loss": 0.9309, + "step": 2628 + }, + { + "epoch": 0.21244873633810785, + "grad_norm": 2.9244909286499023, + "learning_rate": 9.848464732168794e-06, + "loss": 0.7676, + "step": 2629 + }, + { + "epoch": 0.21252954605143537, + "grad_norm": 2.6605122089385986, + "learning_rate": 9.848304812433345e-06, + "loss": 0.9459, + "step": 2630 + }, + { + "epoch": 0.21261035576476292, + "grad_norm": 2.712827444076538, + "learning_rate": 9.848144809658106e-06, + "loss": 0.8843, + "step": 2631 + }, + { + "epoch": 0.21269116547809047, + "grad_norm": 2.9642374515533447, + "learning_rate": 9.84798472384582e-06, + "loss": 1.0906, + "step": 2632 + }, + { + "epoch": 0.212771975191418, + "grad_norm": 2.5326364040374756, + "learning_rate": 9.847824554999224e-06, + "loss": 0.9623, + "step": 2633 + }, + { + "epoch": 0.21285278490474555, + "grad_norm": 2.921952247619629, + "learning_rate": 9.847664303121064e-06, + "loss": 0.9162, + "step": 2634 + }, + { + "epoch": 0.2129335946180731, + "grad_norm": 2.930405616760254, + "learning_rate": 9.847503968214087e-06, + "loss": 1.0664, + "step": 2635 + }, + { + "epoch": 0.21301440433140065, + "grad_norm": 2.6176633834838867, + "learning_rate": 9.847343550281037e-06, + "loss": 0.9447, + "step": 2636 + }, + { + "epoch": 0.21309521404472817, + "grad_norm": 2.7375478744506836, + "learning_rate": 9.84718304932466e-06, + "loss": 1.0987, + "step": 2637 + }, + { + "epoch": 0.21317602375805572, + "grad_norm": 2.5312304496765137, + "learning_rate": 9.847022465347708e-06, + "loss": 0.9693, + "step": 2638 + }, + { + "epoch": 0.21325683347138327, + "grad_norm": 2.684300422668457, + "learning_rate": 9.84686179835293e-06, + "loss": 0.9798, + "step": 2639 + }, + { + "epoch": 0.2133376431847108, + "grad_norm": 2.656810998916626, + "learning_rate": 9.846701048343075e-06, + "loss": 0.9271, + "step": 2640 + }, + { + "epoch": 0.21341845289803835, + "grad_norm": 3.095665693283081, + "learning_rate": 9.8465402153209e-06, + "loss": 0.9004, + "step": 2641 + }, + { + "epoch": 0.2134992626113659, + "grad_norm": 2.690553665161133, + "learning_rate": 9.84637929928916e-06, + "loss": 1.0095, + "step": 2642 + }, + { + "epoch": 0.21358007232469342, + "grad_norm": 2.515697479248047, + "learning_rate": 9.84621830025061e-06, + "loss": 0.9953, + "step": 2643 + }, + { + "epoch": 0.21366088203802097, + "grad_norm": 2.9535646438598633, + "learning_rate": 9.846057218208004e-06, + "loss": 1.021, + "step": 2644 + }, + { + "epoch": 0.21374169175134852, + "grad_norm": 3.273070812225342, + "learning_rate": 9.845896053164108e-06, + "loss": 1.047, + "step": 2645 + }, + { + "epoch": 0.21382250146467605, + "grad_norm": 2.442079544067383, + "learning_rate": 9.845734805121678e-06, + "loss": 1.0215, + "step": 2646 + }, + { + "epoch": 0.2139033111780036, + "grad_norm": 2.7628297805786133, + "learning_rate": 9.845573474083477e-06, + "loss": 1.1598, + "step": 2647 + }, + { + "epoch": 0.21398412089133115, + "grad_norm": 3.064302921295166, + "learning_rate": 9.845412060052264e-06, + "loss": 0.9836, + "step": 2648 + }, + { + "epoch": 0.21406493060465867, + "grad_norm": 2.716076135635376, + "learning_rate": 9.84525056303081e-06, + "loss": 1.1441, + "step": 2649 + }, + { + "epoch": 0.21414574031798622, + "grad_norm": 2.504394292831421, + "learning_rate": 9.845088983021878e-06, + "loss": 0.9482, + "step": 2650 + }, + { + "epoch": 0.21422655003131377, + "grad_norm": 2.709070920944214, + "learning_rate": 9.844927320028236e-06, + "loss": 1.0309, + "step": 2651 + }, + { + "epoch": 0.2143073597446413, + "grad_norm": 2.7059810161590576, + "learning_rate": 9.844765574052653e-06, + "loss": 0.9578, + "step": 2652 + }, + { + "epoch": 0.21438816945796885, + "grad_norm": 2.5697734355926514, + "learning_rate": 9.844603745097898e-06, + "loss": 0.9499, + "step": 2653 + }, + { + "epoch": 0.2144689791712964, + "grad_norm": 2.6361827850341797, + "learning_rate": 9.844441833166744e-06, + "loss": 1.1609, + "step": 2654 + }, + { + "epoch": 0.21454978888462392, + "grad_norm": 2.6841630935668945, + "learning_rate": 9.844279838261966e-06, + "loss": 0.9998, + "step": 2655 + }, + { + "epoch": 0.21463059859795147, + "grad_norm": 2.805133581161499, + "learning_rate": 9.844117760386333e-06, + "loss": 0.9384, + "step": 2656 + }, + { + "epoch": 0.21471140831127902, + "grad_norm": 2.73477840423584, + "learning_rate": 9.843955599542627e-06, + "loss": 1.1571, + "step": 2657 + }, + { + "epoch": 0.21479221802460655, + "grad_norm": 2.5829696655273438, + "learning_rate": 9.843793355733622e-06, + "loss": 0.9096, + "step": 2658 + }, + { + "epoch": 0.2148730277379341, + "grad_norm": 3.528594493865967, + "learning_rate": 9.843631028962098e-06, + "loss": 0.9122, + "step": 2659 + }, + { + "epoch": 0.21495383745126165, + "grad_norm": 2.6276795864105225, + "learning_rate": 9.843468619230833e-06, + "loss": 1.0149, + "step": 2660 + }, + { + "epoch": 0.21503464716458917, + "grad_norm": 2.650006055831909, + "learning_rate": 9.843306126542613e-06, + "loss": 0.9662, + "step": 2661 + }, + { + "epoch": 0.21511545687791672, + "grad_norm": 3.024837017059326, + "learning_rate": 9.843143550900219e-06, + "loss": 1.0055, + "step": 2662 + }, + { + "epoch": 0.21519626659124427, + "grad_norm": 2.8011395931243896, + "learning_rate": 9.842980892306436e-06, + "loss": 1.0391, + "step": 2663 + }, + { + "epoch": 0.2152770763045718, + "grad_norm": 2.5469000339508057, + "learning_rate": 9.842818150764047e-06, + "loss": 1.1026, + "step": 2664 + }, + { + "epoch": 0.21535788601789935, + "grad_norm": 2.7628118991851807, + "learning_rate": 9.842655326275843e-06, + "loss": 0.9756, + "step": 2665 + }, + { + "epoch": 0.2154386957312269, + "grad_norm": 2.6595170497894287, + "learning_rate": 9.842492418844612e-06, + "loss": 1.032, + "step": 2666 + }, + { + "epoch": 0.21551950544455445, + "grad_norm": 2.702587604522705, + "learning_rate": 9.842329428473143e-06, + "loss": 0.972, + "step": 2667 + }, + { + "epoch": 0.21560031515788197, + "grad_norm": 2.8801355361938477, + "learning_rate": 9.842166355164227e-06, + "loss": 1.0336, + "step": 2668 + }, + { + "epoch": 0.21568112487120952, + "grad_norm": 3.040523052215576, + "learning_rate": 9.84200319892066e-06, + "loss": 0.9853, + "step": 2669 + }, + { + "epoch": 0.21576193458453707, + "grad_norm": 3.4121859073638916, + "learning_rate": 9.841839959745236e-06, + "loss": 1.0504, + "step": 2670 + }, + { + "epoch": 0.2158427442978646, + "grad_norm": 2.914210796356201, + "learning_rate": 9.841676637640747e-06, + "loss": 0.9771, + "step": 2671 + }, + { + "epoch": 0.21592355401119215, + "grad_norm": 2.971970796585083, + "learning_rate": 9.841513232609994e-06, + "loss": 1.0135, + "step": 2672 + }, + { + "epoch": 0.2160043637245197, + "grad_norm": 2.7145802974700928, + "learning_rate": 9.841349744655776e-06, + "loss": 1.046, + "step": 2673 + }, + { + "epoch": 0.21608517343784722, + "grad_norm": 2.629023790359497, + "learning_rate": 9.84118617378089e-06, + "loss": 0.9498, + "step": 2674 + }, + { + "epoch": 0.21616598315117477, + "grad_norm": 2.536773920059204, + "learning_rate": 9.841022519988142e-06, + "loss": 1.0527, + "step": 2675 + }, + { + "epoch": 0.21624679286450232, + "grad_norm": 2.9043614864349365, + "learning_rate": 9.84085878328033e-06, + "loss": 0.9488, + "step": 2676 + }, + { + "epoch": 0.21632760257782985, + "grad_norm": 2.8709604740142822, + "learning_rate": 9.840694963660262e-06, + "loss": 0.9625, + "step": 2677 + }, + { + "epoch": 0.2164084122911574, + "grad_norm": 2.7993056774139404, + "learning_rate": 9.840531061130742e-06, + "loss": 1.1074, + "step": 2678 + }, + { + "epoch": 0.21648922200448495, + "grad_norm": 3.800844430923462, + "learning_rate": 9.84036707569458e-06, + "loss": 1.053, + "step": 2679 + }, + { + "epoch": 0.21657003171781247, + "grad_norm": 2.661588430404663, + "learning_rate": 9.840203007354581e-06, + "loss": 0.9514, + "step": 2680 + }, + { + "epoch": 0.21665084143114002, + "grad_norm": 2.9842541217803955, + "learning_rate": 9.840038856113558e-06, + "loss": 0.9228, + "step": 2681 + }, + { + "epoch": 0.21673165114446757, + "grad_norm": 2.817148447036743, + "learning_rate": 9.83987462197432e-06, + "loss": 0.9725, + "step": 2682 + }, + { + "epoch": 0.2168124608577951, + "grad_norm": 3.2969770431518555, + "learning_rate": 9.839710304939683e-06, + "loss": 1.0276, + "step": 2683 + }, + { + "epoch": 0.21689327057112265, + "grad_norm": 2.8415517807006836, + "learning_rate": 9.839545905012457e-06, + "loss": 1.0284, + "step": 2684 + }, + { + "epoch": 0.2169740802844502, + "grad_norm": 3.089386224746704, + "learning_rate": 9.839381422195464e-06, + "loss": 1.0254, + "step": 2685 + }, + { + "epoch": 0.21705488999777772, + "grad_norm": 3.7873218059539795, + "learning_rate": 9.839216856491514e-06, + "loss": 1.0733, + "step": 2686 + }, + { + "epoch": 0.21713569971110527, + "grad_norm": 2.827165365219116, + "learning_rate": 9.839052207903431e-06, + "loss": 0.918, + "step": 2687 + }, + { + "epoch": 0.21721650942443282, + "grad_norm": 3.022129535675049, + "learning_rate": 9.838887476434033e-06, + "loss": 0.9254, + "step": 2688 + }, + { + "epoch": 0.21729731913776035, + "grad_norm": 2.853858470916748, + "learning_rate": 9.838722662086142e-06, + "loss": 1.0868, + "step": 2689 + }, + { + "epoch": 0.2173781288510879, + "grad_norm": 2.8158044815063477, + "learning_rate": 9.83855776486258e-06, + "loss": 0.9851, + "step": 2690 + }, + { + "epoch": 0.21745893856441545, + "grad_norm": 2.678738594055176, + "learning_rate": 9.838392784766172e-06, + "loss": 1.1053, + "step": 2691 + }, + { + "epoch": 0.21753974827774297, + "grad_norm": 2.6264524459838867, + "learning_rate": 9.838227721799742e-06, + "loss": 0.978, + "step": 2692 + }, + { + "epoch": 0.21762055799107052, + "grad_norm": 2.5500595569610596, + "learning_rate": 9.83806257596612e-06, + "loss": 0.9456, + "step": 2693 + }, + { + "epoch": 0.21770136770439807, + "grad_norm": 2.5118589401245117, + "learning_rate": 9.837897347268134e-06, + "loss": 0.8679, + "step": 2694 + }, + { + "epoch": 0.2177821774177256, + "grad_norm": 2.5622048377990723, + "learning_rate": 9.837732035708613e-06, + "loss": 1.1071, + "step": 2695 + }, + { + "epoch": 0.21786298713105315, + "grad_norm": 3.4003055095672607, + "learning_rate": 9.837566641290388e-06, + "loss": 1.0472, + "step": 2696 + }, + { + "epoch": 0.2179437968443807, + "grad_norm": 2.976776599884033, + "learning_rate": 9.837401164016293e-06, + "loss": 1.0828, + "step": 2697 + }, + { + "epoch": 0.21802460655770825, + "grad_norm": 2.4960176944732666, + "learning_rate": 9.837235603889162e-06, + "loss": 0.9775, + "step": 2698 + }, + { + "epoch": 0.21810541627103577, + "grad_norm": 2.8212971687316895, + "learning_rate": 9.837069960911829e-06, + "loss": 0.9227, + "step": 2699 + }, + { + "epoch": 0.21818622598436332, + "grad_norm": 3.3433587551116943, + "learning_rate": 9.836904235087132e-06, + "loss": 0.9447, + "step": 2700 + }, + { + "epoch": 0.21826703569769088, + "grad_norm": 3.5738041400909424, + "learning_rate": 9.836738426417911e-06, + "loss": 0.9704, + "step": 2701 + }, + { + "epoch": 0.2183478454110184, + "grad_norm": 2.5765058994293213, + "learning_rate": 9.836572534907005e-06, + "loss": 1.0977, + "step": 2702 + }, + { + "epoch": 0.21842865512434595, + "grad_norm": 2.7243411540985107, + "learning_rate": 9.836406560557254e-06, + "loss": 1.0175, + "step": 2703 + }, + { + "epoch": 0.2185094648376735, + "grad_norm": 2.91159725189209, + "learning_rate": 9.836240503371503e-06, + "loss": 1.0504, + "step": 2704 + }, + { + "epoch": 0.21859027455100102, + "grad_norm": 2.647514581680298, + "learning_rate": 9.836074363352594e-06, + "loss": 1.0958, + "step": 2705 + }, + { + "epoch": 0.21867108426432857, + "grad_norm": 2.7375965118408203, + "learning_rate": 9.835908140503374e-06, + "loss": 1.0491, + "step": 2706 + }, + { + "epoch": 0.21875189397765613, + "grad_norm": 2.7068612575531006, + "learning_rate": 9.83574183482669e-06, + "loss": 1.0139, + "step": 2707 + }, + { + "epoch": 0.21883270369098365, + "grad_norm": 3.1376380920410156, + "learning_rate": 9.835575446325386e-06, + "loss": 0.9426, + "step": 2708 + }, + { + "epoch": 0.2189135134043112, + "grad_norm": 3.265387773513794, + "learning_rate": 9.83540897500232e-06, + "loss": 1.0667, + "step": 2709 + }, + { + "epoch": 0.21899432311763875, + "grad_norm": 2.5371029376983643, + "learning_rate": 9.835242420860338e-06, + "loss": 0.9974, + "step": 2710 + }, + { + "epoch": 0.21907513283096627, + "grad_norm": 2.32127046585083, + "learning_rate": 9.835075783902294e-06, + "loss": 0.9514, + "step": 2711 + }, + { + "epoch": 0.21915594254429382, + "grad_norm": 2.689162254333496, + "learning_rate": 9.834909064131042e-06, + "loss": 0.8845, + "step": 2712 + }, + { + "epoch": 0.21923675225762138, + "grad_norm": 2.634633779525757, + "learning_rate": 9.834742261549436e-06, + "loss": 1.0544, + "step": 2713 + }, + { + "epoch": 0.2193175619709489, + "grad_norm": 2.6736345291137695, + "learning_rate": 9.834575376160336e-06, + "loss": 0.9929, + "step": 2714 + }, + { + "epoch": 0.21939837168427645, + "grad_norm": 2.721517562866211, + "learning_rate": 9.834408407966597e-06, + "loss": 1.113, + "step": 2715 + }, + { + "epoch": 0.219479181397604, + "grad_norm": 2.985367774963379, + "learning_rate": 9.834241356971082e-06, + "loss": 0.9398, + "step": 2716 + }, + { + "epoch": 0.21955999111093152, + "grad_norm": 2.432053804397583, + "learning_rate": 9.834074223176648e-06, + "loss": 1.0002, + "step": 2717 + }, + { + "epoch": 0.21964080082425907, + "grad_norm": 2.7274587154388428, + "learning_rate": 9.833907006586162e-06, + "loss": 1.1054, + "step": 2718 + }, + { + "epoch": 0.21972161053758663, + "grad_norm": 2.675016403198242, + "learning_rate": 9.833739707202485e-06, + "loss": 0.973, + "step": 2719 + }, + { + "epoch": 0.21980242025091415, + "grad_norm": 2.9005885124206543, + "learning_rate": 9.833572325028485e-06, + "loss": 1.0865, + "step": 2720 + }, + { + "epoch": 0.2198832299642417, + "grad_norm": 3.1062748432159424, + "learning_rate": 9.833404860067027e-06, + "loss": 0.9241, + "step": 2721 + }, + { + "epoch": 0.21996403967756925, + "grad_norm": 2.929474353790283, + "learning_rate": 9.833237312320979e-06, + "loss": 0.9104, + "step": 2722 + }, + { + "epoch": 0.22004484939089677, + "grad_norm": 2.922116279602051, + "learning_rate": 9.833069681793212e-06, + "loss": 0.9415, + "step": 2723 + }, + { + "epoch": 0.22012565910422432, + "grad_norm": 2.6386096477508545, + "learning_rate": 9.832901968486597e-06, + "loss": 1.0628, + "step": 2724 + }, + { + "epoch": 0.22020646881755188, + "grad_norm": 2.8515048027038574, + "learning_rate": 9.832734172404003e-06, + "loss": 0.8596, + "step": 2725 + }, + { + "epoch": 0.2202872785308794, + "grad_norm": 2.7838757038116455, + "learning_rate": 9.83256629354831e-06, + "loss": 0.987, + "step": 2726 + }, + { + "epoch": 0.22036808824420695, + "grad_norm": 2.8564083576202393, + "learning_rate": 9.83239833192239e-06, + "loss": 1.0477, + "step": 2727 + }, + { + "epoch": 0.2204488979575345, + "grad_norm": 2.657640218734741, + "learning_rate": 9.83223028752912e-06, + "loss": 0.9471, + "step": 2728 + }, + { + "epoch": 0.22052970767086202, + "grad_norm": 2.1479780673980713, + "learning_rate": 9.832062160371378e-06, + "loss": 1.0779, + "step": 2729 + }, + { + "epoch": 0.22061051738418958, + "grad_norm": 2.413940906524658, + "learning_rate": 9.831893950452044e-06, + "loss": 1.0194, + "step": 2730 + }, + { + "epoch": 0.22069132709751713, + "grad_norm": 2.5537819862365723, + "learning_rate": 9.831725657773999e-06, + "loss": 1.0628, + "step": 2731 + }, + { + "epoch": 0.22077213681084468, + "grad_norm": 2.848949909210205, + "learning_rate": 9.831557282340125e-06, + "loss": 1.0231, + "step": 2732 + }, + { + "epoch": 0.2208529465241722, + "grad_norm": 3.080414295196533, + "learning_rate": 9.831388824153306e-06, + "loss": 1.0486, + "step": 2733 + }, + { + "epoch": 0.22093375623749975, + "grad_norm": 3.120030641555786, + "learning_rate": 9.831220283216428e-06, + "loss": 1.0214, + "step": 2734 + }, + { + "epoch": 0.2210145659508273, + "grad_norm": 3.1183664798736572, + "learning_rate": 9.831051659532378e-06, + "loss": 1.0401, + "step": 2735 + }, + { + "epoch": 0.22109537566415483, + "grad_norm": 3.092597246170044, + "learning_rate": 9.830882953104042e-06, + "loss": 0.9186, + "step": 2736 + }, + { + "epoch": 0.22117618537748238, + "grad_norm": 3.042254686355591, + "learning_rate": 9.830714163934312e-06, + "loss": 0.9569, + "step": 2737 + }, + { + "epoch": 0.22125699509080993, + "grad_norm": 2.5840420722961426, + "learning_rate": 9.830545292026077e-06, + "loss": 1.0692, + "step": 2738 + }, + { + "epoch": 0.22133780480413745, + "grad_norm": 3.314822196960449, + "learning_rate": 9.83037633738223e-06, + "loss": 0.9808, + "step": 2739 + }, + { + "epoch": 0.221418614517465, + "grad_norm": 2.7320046424865723, + "learning_rate": 9.830207300005665e-06, + "loss": 0.9668, + "step": 2740 + }, + { + "epoch": 0.22149942423079255, + "grad_norm": 2.76070237159729, + "learning_rate": 9.830038179899278e-06, + "loss": 0.9549, + "step": 2741 + }, + { + "epoch": 0.22158023394412008, + "grad_norm": 2.71372389793396, + "learning_rate": 9.829868977065964e-06, + "loss": 0.9483, + "step": 2742 + }, + { + "epoch": 0.22166104365744763, + "grad_norm": 3.2915704250335693, + "learning_rate": 9.829699691508624e-06, + "loss": 0.9798, + "step": 2743 + }, + { + "epoch": 0.22174185337077518, + "grad_norm": 2.8336358070373535, + "learning_rate": 9.829530323230151e-06, + "loss": 1.0364, + "step": 2744 + }, + { + "epoch": 0.2218226630841027, + "grad_norm": 3.036170721054077, + "learning_rate": 9.829360872233455e-06, + "loss": 1.034, + "step": 2745 + }, + { + "epoch": 0.22190347279743025, + "grad_norm": 2.8102879524230957, + "learning_rate": 9.829191338521431e-06, + "loss": 1.0109, + "step": 2746 + }, + { + "epoch": 0.2219842825107578, + "grad_norm": 3.2442543506622314, + "learning_rate": 9.829021722096984e-06, + "loss": 1.0299, + "step": 2747 + }, + { + "epoch": 0.22206509222408533, + "grad_norm": 2.845066547393799, + "learning_rate": 9.828852022963023e-06, + "loss": 0.9285, + "step": 2748 + }, + { + "epoch": 0.22214590193741288, + "grad_norm": 2.9424619674682617, + "learning_rate": 9.828682241122452e-06, + "loss": 1.0555, + "step": 2749 + }, + { + "epoch": 0.22222671165074043, + "grad_norm": 2.4247968196868896, + "learning_rate": 9.828512376578177e-06, + "loss": 0.9813, + "step": 2750 + }, + { + "epoch": 0.22230752136406795, + "grad_norm": 3.497758626937866, + "learning_rate": 9.828342429333108e-06, + "loss": 0.9966, + "step": 2751 + }, + { + "epoch": 0.2223883310773955, + "grad_norm": 2.899437427520752, + "learning_rate": 9.828172399390158e-06, + "loss": 1.0316, + "step": 2752 + }, + { + "epoch": 0.22246914079072305, + "grad_norm": 2.879088878631592, + "learning_rate": 9.82800228675224e-06, + "loss": 1.031, + "step": 2753 + }, + { + "epoch": 0.22254995050405058, + "grad_norm": 2.6835739612579346, + "learning_rate": 9.827832091422265e-06, + "loss": 0.9471, + "step": 2754 + }, + { + "epoch": 0.22263076021737813, + "grad_norm": 2.815185308456421, + "learning_rate": 9.827661813403148e-06, + "loss": 1.058, + "step": 2755 + }, + { + "epoch": 0.22271156993070568, + "grad_norm": 2.932528018951416, + "learning_rate": 9.827491452697806e-06, + "loss": 0.9813, + "step": 2756 + }, + { + "epoch": 0.2227923796440332, + "grad_norm": 2.6303751468658447, + "learning_rate": 9.827321009309159e-06, + "loss": 0.9364, + "step": 2757 + }, + { + "epoch": 0.22287318935736075, + "grad_norm": 2.613288640975952, + "learning_rate": 9.827150483240123e-06, + "loss": 0.9016, + "step": 2758 + }, + { + "epoch": 0.2229539990706883, + "grad_norm": 2.978224039077759, + "learning_rate": 9.826979874493618e-06, + "loss": 0.9855, + "step": 2759 + }, + { + "epoch": 0.22303480878401583, + "grad_norm": 2.6431267261505127, + "learning_rate": 9.826809183072572e-06, + "loss": 0.9776, + "step": 2760 + }, + { + "epoch": 0.22311561849734338, + "grad_norm": 2.6241977214813232, + "learning_rate": 9.826638408979903e-06, + "loss": 1.0613, + "step": 2761 + }, + { + "epoch": 0.22319642821067093, + "grad_norm": 2.6231777667999268, + "learning_rate": 9.826467552218537e-06, + "loss": 1.0571, + "step": 2762 + }, + { + "epoch": 0.22327723792399848, + "grad_norm": 3.046290397644043, + "learning_rate": 9.826296612791403e-06, + "loss": 0.9999, + "step": 2763 + }, + { + "epoch": 0.223358047637326, + "grad_norm": 3.1164638996124268, + "learning_rate": 9.826125590701425e-06, + "loss": 1.0017, + "step": 2764 + }, + { + "epoch": 0.22343885735065355, + "grad_norm": 2.63796067237854, + "learning_rate": 9.825954485951536e-06, + "loss": 1.073, + "step": 2765 + }, + { + "epoch": 0.2235196670639811, + "grad_norm": 2.663356065750122, + "learning_rate": 9.825783298544662e-06, + "loss": 0.9145, + "step": 2766 + }, + { + "epoch": 0.22360047677730863, + "grad_norm": 3.481412887573242, + "learning_rate": 9.82561202848374e-06, + "loss": 0.8822, + "step": 2767 + }, + { + "epoch": 0.22368128649063618, + "grad_norm": 2.5267741680145264, + "learning_rate": 9.8254406757717e-06, + "loss": 0.9521, + "step": 2768 + }, + { + "epoch": 0.22376209620396373, + "grad_norm": 2.999199151992798, + "learning_rate": 9.825269240411478e-06, + "loss": 0.9925, + "step": 2769 + }, + { + "epoch": 0.22384290591729125, + "grad_norm": 2.903404474258423, + "learning_rate": 9.825097722406012e-06, + "loss": 0.9252, + "step": 2770 + }, + { + "epoch": 0.2239237156306188, + "grad_norm": 2.6118807792663574, + "learning_rate": 9.824926121758236e-06, + "loss": 0.857, + "step": 2771 + }, + { + "epoch": 0.22400452534394635, + "grad_norm": 2.513718366622925, + "learning_rate": 9.824754438471091e-06, + "loss": 1.0926, + "step": 2772 + }, + { + "epoch": 0.22408533505727388, + "grad_norm": 2.6014599800109863, + "learning_rate": 9.82458267254752e-06, + "loss": 0.9563, + "step": 2773 + }, + { + "epoch": 0.22416614477060143, + "grad_norm": 2.4804561138153076, + "learning_rate": 9.82441082399046e-06, + "loss": 1.1281, + "step": 2774 + }, + { + "epoch": 0.22424695448392898, + "grad_norm": 2.6518805027008057, + "learning_rate": 9.824238892802858e-06, + "loss": 0.9119, + "step": 2775 + }, + { + "epoch": 0.2243277641972565, + "grad_norm": 2.772603750228882, + "learning_rate": 9.824066878987657e-06, + "loss": 1.0272, + "step": 2776 + }, + { + "epoch": 0.22440857391058405, + "grad_norm": 3.0260932445526123, + "learning_rate": 9.823894782547803e-06, + "loss": 1.1759, + "step": 2777 + }, + { + "epoch": 0.2244893836239116, + "grad_norm": 2.77278208732605, + "learning_rate": 9.823722603486247e-06, + "loss": 0.9999, + "step": 2778 + }, + { + "epoch": 0.22457019333723913, + "grad_norm": 2.864107370376587, + "learning_rate": 9.823550341805933e-06, + "loss": 1.0211, + "step": 2779 + }, + { + "epoch": 0.22465100305056668, + "grad_norm": 2.401287078857422, + "learning_rate": 9.823377997509816e-06, + "loss": 0.9441, + "step": 2780 + }, + { + "epoch": 0.22473181276389423, + "grad_norm": 2.9367048740386963, + "learning_rate": 9.823205570600844e-06, + "loss": 0.9651, + "step": 2781 + }, + { + "epoch": 0.22481262247722175, + "grad_norm": 2.9967193603515625, + "learning_rate": 9.823033061081973e-06, + "loss": 1.0323, + "step": 2782 + }, + { + "epoch": 0.2248934321905493, + "grad_norm": 2.6746819019317627, + "learning_rate": 9.822860468956155e-06, + "loss": 1.0986, + "step": 2783 + }, + { + "epoch": 0.22497424190387685, + "grad_norm": 2.1297669410705566, + "learning_rate": 9.822687794226348e-06, + "loss": 1.1349, + "step": 2784 + }, + { + "epoch": 0.22505505161720438, + "grad_norm": 2.794849395751953, + "learning_rate": 9.82251503689551e-06, + "loss": 0.9071, + "step": 2785 + }, + { + "epoch": 0.22513586133053193, + "grad_norm": 2.7644519805908203, + "learning_rate": 9.822342196966601e-06, + "loss": 1.0522, + "step": 2786 + }, + { + "epoch": 0.22521667104385948, + "grad_norm": 2.584132194519043, + "learning_rate": 9.822169274442577e-06, + "loss": 1.0595, + "step": 2787 + }, + { + "epoch": 0.225297480757187, + "grad_norm": 2.742427349090576, + "learning_rate": 9.821996269326403e-06, + "loss": 0.9886, + "step": 2788 + }, + { + "epoch": 0.22537829047051455, + "grad_norm": 2.970609188079834, + "learning_rate": 9.821823181621043e-06, + "loss": 0.9794, + "step": 2789 + }, + { + "epoch": 0.2254591001838421, + "grad_norm": 3.2143619060516357, + "learning_rate": 9.821650011329458e-06, + "loss": 1.0037, + "step": 2790 + }, + { + "epoch": 0.22553990989716963, + "grad_norm": 3.3506712913513184, + "learning_rate": 9.821476758454616e-06, + "loss": 1.0079, + "step": 2791 + }, + { + "epoch": 0.22562071961049718, + "grad_norm": 2.8092939853668213, + "learning_rate": 9.821303422999484e-06, + "loss": 0.9338, + "step": 2792 + }, + { + "epoch": 0.22570152932382473, + "grad_norm": 2.6990749835968018, + "learning_rate": 9.821130004967032e-06, + "loss": 1.0369, + "step": 2793 + }, + { + "epoch": 0.22578233903715225, + "grad_norm": 2.468820810317993, + "learning_rate": 9.82095650436023e-06, + "loss": 1.1118, + "step": 2794 + }, + { + "epoch": 0.2258631487504798, + "grad_norm": 2.9115161895751953, + "learning_rate": 9.820782921182049e-06, + "loss": 0.9823, + "step": 2795 + }, + { + "epoch": 0.22594395846380735, + "grad_norm": 2.591421365737915, + "learning_rate": 9.82060925543546e-06, + "loss": 1.1116, + "step": 2796 + }, + { + "epoch": 0.2260247681771349, + "grad_norm": 2.7503275871276855, + "learning_rate": 9.82043550712344e-06, + "loss": 1.0216, + "step": 2797 + }, + { + "epoch": 0.22610557789046243, + "grad_norm": 2.9692800045013428, + "learning_rate": 9.820261676248969e-06, + "loss": 0.8514, + "step": 2798 + }, + { + "epoch": 0.22618638760378998, + "grad_norm": 2.3442323207855225, + "learning_rate": 9.820087762815013e-06, + "loss": 1.0024, + "step": 2799 + }, + { + "epoch": 0.22626719731711753, + "grad_norm": 2.81729793548584, + "learning_rate": 9.819913766824563e-06, + "loss": 1.0367, + "step": 2800 + }, + { + "epoch": 0.22634800703044505, + "grad_norm": 2.569409132003784, + "learning_rate": 9.81973968828059e-06, + "loss": 1.1571, + "step": 2801 + }, + { + "epoch": 0.2264288167437726, + "grad_norm": 3.1350796222686768, + "learning_rate": 9.819565527186082e-06, + "loss": 0.9435, + "step": 2802 + }, + { + "epoch": 0.22650962645710015, + "grad_norm": 2.7625088691711426, + "learning_rate": 9.819391283544018e-06, + "loss": 0.9354, + "step": 2803 + }, + { + "epoch": 0.22659043617042768, + "grad_norm": 2.755223035812378, + "learning_rate": 9.819216957357382e-06, + "loss": 0.962, + "step": 2804 + }, + { + "epoch": 0.22667124588375523, + "grad_norm": 2.7333672046661377, + "learning_rate": 9.819042548629163e-06, + "loss": 0.9441, + "step": 2805 + }, + { + "epoch": 0.22675205559708278, + "grad_norm": 2.9572391510009766, + "learning_rate": 9.818868057362346e-06, + "loss": 1.0607, + "step": 2806 + }, + { + "epoch": 0.2268328653104103, + "grad_norm": 2.7844395637512207, + "learning_rate": 9.81869348355992e-06, + "loss": 0.9216, + "step": 2807 + }, + { + "epoch": 0.22691367502373785, + "grad_norm": 2.7145252227783203, + "learning_rate": 9.818518827224877e-06, + "loss": 1.0075, + "step": 2808 + }, + { + "epoch": 0.2269944847370654, + "grad_norm": 3.09114670753479, + "learning_rate": 9.818344088360204e-06, + "loss": 1.1068, + "step": 2809 + }, + { + "epoch": 0.22707529445039293, + "grad_norm": 2.9854092597961426, + "learning_rate": 9.818169266968899e-06, + "loss": 1.0327, + "step": 2810 + }, + { + "epoch": 0.22715610416372048, + "grad_norm": 3.1989142894744873, + "learning_rate": 9.81799436305395e-06, + "loss": 1.0527, + "step": 2811 + }, + { + "epoch": 0.22723691387704803, + "grad_norm": 3.338559150695801, + "learning_rate": 9.81781937661836e-06, + "loss": 1.0737, + "step": 2812 + }, + { + "epoch": 0.22731772359037555, + "grad_norm": 3.4588310718536377, + "learning_rate": 9.81764430766512e-06, + "loss": 0.9356, + "step": 2813 + }, + { + "epoch": 0.2273985333037031, + "grad_norm": 2.61423397064209, + "learning_rate": 9.817469156197232e-06, + "loss": 1.0299, + "step": 2814 + }, + { + "epoch": 0.22747934301703066, + "grad_norm": 3.1423604488372803, + "learning_rate": 9.817293922217697e-06, + "loss": 0.9767, + "step": 2815 + }, + { + "epoch": 0.22756015273035818, + "grad_norm": 2.9998934268951416, + "learning_rate": 9.817118605729512e-06, + "loss": 1.0646, + "step": 2816 + }, + { + "epoch": 0.22764096244368573, + "grad_norm": 2.620575428009033, + "learning_rate": 9.816943206735682e-06, + "loss": 0.9874, + "step": 2817 + }, + { + "epoch": 0.22772177215701328, + "grad_norm": 3.3366506099700928, + "learning_rate": 9.816767725239212e-06, + "loss": 1.1476, + "step": 2818 + }, + { + "epoch": 0.2278025818703408, + "grad_norm": 2.993959903717041, + "learning_rate": 9.816592161243106e-06, + "loss": 1.1764, + "step": 2819 + }, + { + "epoch": 0.22788339158366835, + "grad_norm": 2.9595839977264404, + "learning_rate": 9.816416514750372e-06, + "loss": 0.9691, + "step": 2820 + }, + { + "epoch": 0.2279642012969959, + "grad_norm": 2.701143741607666, + "learning_rate": 9.816240785764019e-06, + "loss": 1.0103, + "step": 2821 + }, + { + "epoch": 0.22804501101032343, + "grad_norm": 3.1361124515533447, + "learning_rate": 9.816064974287055e-06, + "loss": 0.9805, + "step": 2822 + }, + { + "epoch": 0.22812582072365098, + "grad_norm": 2.8855795860290527, + "learning_rate": 9.815889080322491e-06, + "loss": 1.0092, + "step": 2823 + }, + { + "epoch": 0.22820663043697853, + "grad_norm": 2.637803316116333, + "learning_rate": 9.815713103873343e-06, + "loss": 0.9422, + "step": 2824 + }, + { + "epoch": 0.22828744015030605, + "grad_norm": 2.517380475997925, + "learning_rate": 9.815537044942622e-06, + "loss": 1.0371, + "step": 2825 + }, + { + "epoch": 0.2283682498636336, + "grad_norm": 2.720059633255005, + "learning_rate": 9.815360903533345e-06, + "loss": 0.9992, + "step": 2826 + }, + { + "epoch": 0.22844905957696116, + "grad_norm": 2.635488986968994, + "learning_rate": 9.815184679648529e-06, + "loss": 1.0112, + "step": 2827 + }, + { + "epoch": 0.2285298692902887, + "grad_norm": 3.504739284515381, + "learning_rate": 9.815008373291188e-06, + "loss": 1.0923, + "step": 2828 + }, + { + "epoch": 0.22861067900361623, + "grad_norm": 3.1671524047851562, + "learning_rate": 9.814831984464347e-06, + "loss": 0.9475, + "step": 2829 + }, + { + "epoch": 0.22869148871694378, + "grad_norm": 2.455099105834961, + "learning_rate": 9.814655513171028e-06, + "loss": 1.089, + "step": 2830 + }, + { + "epoch": 0.22877229843027133, + "grad_norm": 2.755807876586914, + "learning_rate": 9.814478959414248e-06, + "loss": 0.9854, + "step": 2831 + }, + { + "epoch": 0.22885310814359885, + "grad_norm": 3.0457632541656494, + "learning_rate": 9.814302323197033e-06, + "loss": 0.982, + "step": 2832 + }, + { + "epoch": 0.2289339178569264, + "grad_norm": 2.8635318279266357, + "learning_rate": 9.814125604522412e-06, + "loss": 0.9566, + "step": 2833 + }, + { + "epoch": 0.22901472757025396, + "grad_norm": 2.9081757068634033, + "learning_rate": 9.813948803393407e-06, + "loss": 0.9664, + "step": 2834 + }, + { + "epoch": 0.22909553728358148, + "grad_norm": 2.9280664920806885, + "learning_rate": 9.813771919813049e-06, + "loss": 1.0221, + "step": 2835 + }, + { + "epoch": 0.22917634699690903, + "grad_norm": 2.643336534500122, + "learning_rate": 9.813594953784366e-06, + "loss": 0.923, + "step": 2836 + }, + { + "epoch": 0.22925715671023658, + "grad_norm": 2.497655153274536, + "learning_rate": 9.813417905310391e-06, + "loss": 0.9823, + "step": 2837 + }, + { + "epoch": 0.2293379664235641, + "grad_norm": 2.8711609840393066, + "learning_rate": 9.813240774394153e-06, + "loss": 1.0016, + "step": 2838 + }, + { + "epoch": 0.22941877613689166, + "grad_norm": 2.8473143577575684, + "learning_rate": 9.81306356103869e-06, + "loss": 1.0128, + "step": 2839 + }, + { + "epoch": 0.2294995858502192, + "grad_norm": 2.6120336055755615, + "learning_rate": 9.812886265247035e-06, + "loss": 0.9938, + "step": 2840 + }, + { + "epoch": 0.22958039556354673, + "grad_norm": 2.9384772777557373, + "learning_rate": 9.812708887022223e-06, + "loss": 0.9876, + "step": 2841 + }, + { + "epoch": 0.22966120527687428, + "grad_norm": 3.0501091480255127, + "learning_rate": 9.812531426367296e-06, + "loss": 1.0746, + "step": 2842 + }, + { + "epoch": 0.22974201499020183, + "grad_norm": 2.7544689178466797, + "learning_rate": 9.81235388328529e-06, + "loss": 1.0758, + "step": 2843 + }, + { + "epoch": 0.22982282470352935, + "grad_norm": 3.162550210952759, + "learning_rate": 9.812176257779248e-06, + "loss": 0.9563, + "step": 2844 + }, + { + "epoch": 0.2299036344168569, + "grad_norm": 2.8001787662506104, + "learning_rate": 9.81199854985221e-06, + "loss": 0.9562, + "step": 2845 + }, + { + "epoch": 0.22998444413018446, + "grad_norm": 3.1681456565856934, + "learning_rate": 9.811820759507223e-06, + "loss": 0.8944, + "step": 2846 + }, + { + "epoch": 0.23006525384351198, + "grad_norm": 2.618809461593628, + "learning_rate": 9.81164288674733e-06, + "loss": 1.0327, + "step": 2847 + }, + { + "epoch": 0.23014606355683953, + "grad_norm": 3.1844985485076904, + "learning_rate": 9.81146493157558e-06, + "loss": 1.1014, + "step": 2848 + }, + { + "epoch": 0.23022687327016708, + "grad_norm": 2.4997401237487793, + "learning_rate": 9.811286893995014e-06, + "loss": 1.0001, + "step": 2849 + }, + { + "epoch": 0.2303076829834946, + "grad_norm": 2.8787453174591064, + "learning_rate": 9.811108774008689e-06, + "loss": 0.9882, + "step": 2850 + }, + { + "epoch": 0.23038849269682216, + "grad_norm": 2.824585437774658, + "learning_rate": 9.810930571619652e-06, + "loss": 0.9349, + "step": 2851 + }, + { + "epoch": 0.2304693024101497, + "grad_norm": 3.053377151489258, + "learning_rate": 9.810752286830958e-06, + "loss": 0.8595, + "step": 2852 + }, + { + "epoch": 0.23055011212347723, + "grad_norm": 2.454207420349121, + "learning_rate": 9.810573919645658e-06, + "loss": 0.9653, + "step": 2853 + }, + { + "epoch": 0.23063092183680478, + "grad_norm": 3.3735809326171875, + "learning_rate": 9.810395470066807e-06, + "loss": 0.978, + "step": 2854 + }, + { + "epoch": 0.23071173155013233, + "grad_norm": 2.8116884231567383, + "learning_rate": 9.810216938097463e-06, + "loss": 1.046, + "step": 2855 + }, + { + "epoch": 0.23079254126345985, + "grad_norm": 3.1904733180999756, + "learning_rate": 9.810038323740683e-06, + "loss": 1.0383, + "step": 2856 + }, + { + "epoch": 0.2308733509767874, + "grad_norm": 3.200434923171997, + "learning_rate": 9.809859626999526e-06, + "loss": 0.9868, + "step": 2857 + }, + { + "epoch": 0.23095416069011496, + "grad_norm": 2.598670721054077, + "learning_rate": 9.809680847877052e-06, + "loss": 0.9273, + "step": 2858 + }, + { + "epoch": 0.23103497040344248, + "grad_norm": 3.1879770755767822, + "learning_rate": 9.809501986376324e-06, + "loss": 1.1089, + "step": 2859 + }, + { + "epoch": 0.23111578011677003, + "grad_norm": 2.736104726791382, + "learning_rate": 9.809323042500406e-06, + "loss": 1.1119, + "step": 2860 + }, + { + "epoch": 0.23119658983009758, + "grad_norm": 2.963679075241089, + "learning_rate": 9.809144016252361e-06, + "loss": 0.9761, + "step": 2861 + }, + { + "epoch": 0.23127739954342513, + "grad_norm": 2.912889242172241, + "learning_rate": 9.808964907635258e-06, + "loss": 1.0861, + "step": 2862 + }, + { + "epoch": 0.23135820925675266, + "grad_norm": 2.7533481121063232, + "learning_rate": 9.808785716652163e-06, + "loss": 1.0043, + "step": 2863 + }, + { + "epoch": 0.2314390189700802, + "grad_norm": 2.922305107116699, + "learning_rate": 9.808606443306146e-06, + "loss": 0.9635, + "step": 2864 + }, + { + "epoch": 0.23151982868340776, + "grad_norm": 3.466017961502075, + "learning_rate": 9.808427087600276e-06, + "loss": 1.0228, + "step": 2865 + }, + { + "epoch": 0.23160063839673528, + "grad_norm": 2.7406976222991943, + "learning_rate": 9.808247649537626e-06, + "loss": 1.0092, + "step": 2866 + }, + { + "epoch": 0.23168144811006283, + "grad_norm": 2.6772921085357666, + "learning_rate": 9.808068129121268e-06, + "loss": 1.0089, + "step": 2867 + }, + { + "epoch": 0.23176225782339038, + "grad_norm": 2.930373191833496, + "learning_rate": 9.80788852635428e-06, + "loss": 0.9548, + "step": 2868 + }, + { + "epoch": 0.2318430675367179, + "grad_norm": 2.7349531650543213, + "learning_rate": 9.807708841239734e-06, + "loss": 1.0062, + "step": 2869 + }, + { + "epoch": 0.23192387725004546, + "grad_norm": 2.849152088165283, + "learning_rate": 9.807529073780712e-06, + "loss": 0.9175, + "step": 2870 + }, + { + "epoch": 0.232004686963373, + "grad_norm": 2.6046605110168457, + "learning_rate": 9.80734922398029e-06, + "loss": 1.0255, + "step": 2871 + }, + { + "epoch": 0.23208549667670053, + "grad_norm": 2.920684337615967, + "learning_rate": 9.807169291841548e-06, + "loss": 0.9472, + "step": 2872 + }, + { + "epoch": 0.23216630639002808, + "grad_norm": 3.0803701877593994, + "learning_rate": 9.806989277367569e-06, + "loss": 1.1611, + "step": 2873 + }, + { + "epoch": 0.23224711610335563, + "grad_norm": 3.029608964920044, + "learning_rate": 9.806809180561436e-06, + "loss": 0.9328, + "step": 2874 + }, + { + "epoch": 0.23232792581668316, + "grad_norm": 2.641021966934204, + "learning_rate": 9.806629001426234e-06, + "loss": 0.9521, + "step": 2875 + }, + { + "epoch": 0.2324087355300107, + "grad_norm": 2.828425884246826, + "learning_rate": 9.806448739965048e-06, + "loss": 0.9299, + "step": 2876 + }, + { + "epoch": 0.23248954524333826, + "grad_norm": 2.696329355239868, + "learning_rate": 9.806268396180967e-06, + "loss": 0.9952, + "step": 2877 + }, + { + "epoch": 0.23257035495666578, + "grad_norm": 2.6079211235046387, + "learning_rate": 9.806087970077079e-06, + "loss": 0.9596, + "step": 2878 + }, + { + "epoch": 0.23265116466999333, + "grad_norm": 3.1535024642944336, + "learning_rate": 9.805907461656473e-06, + "loss": 0.8422, + "step": 2879 + }, + { + "epoch": 0.23273197438332088, + "grad_norm": 2.7458996772766113, + "learning_rate": 9.805726870922244e-06, + "loss": 1.0438, + "step": 2880 + }, + { + "epoch": 0.2328127840966484, + "grad_norm": 2.8035130500793457, + "learning_rate": 9.80554619787748e-06, + "loss": 1.1018, + "step": 2881 + }, + { + "epoch": 0.23289359380997596, + "grad_norm": 2.688476085662842, + "learning_rate": 9.80536544252528e-06, + "loss": 0.9787, + "step": 2882 + }, + { + "epoch": 0.2329744035233035, + "grad_norm": 3.001877784729004, + "learning_rate": 9.80518460486874e-06, + "loss": 1.12, + "step": 2883 + }, + { + "epoch": 0.23305521323663103, + "grad_norm": 2.7390987873077393, + "learning_rate": 9.805003684910955e-06, + "loss": 0.9534, + "step": 2884 + }, + { + "epoch": 0.23313602294995858, + "grad_norm": 3.121856451034546, + "learning_rate": 9.804822682655023e-06, + "loss": 1.0815, + "step": 2885 + }, + { + "epoch": 0.23321683266328613, + "grad_norm": 2.5415515899658203, + "learning_rate": 9.804641598104048e-06, + "loss": 1.1259, + "step": 2886 + }, + { + "epoch": 0.23329764237661366, + "grad_norm": 2.9589216709136963, + "learning_rate": 9.804460431261128e-06, + "loss": 0.9394, + "step": 2887 + }, + { + "epoch": 0.2333784520899412, + "grad_norm": 2.749288320541382, + "learning_rate": 9.804279182129366e-06, + "loss": 0.9642, + "step": 2888 + }, + { + "epoch": 0.23345926180326876, + "grad_norm": 2.368316650390625, + "learning_rate": 9.804097850711867e-06, + "loss": 1.0122, + "step": 2889 + }, + { + "epoch": 0.23354007151659628, + "grad_norm": 3.158442258834839, + "learning_rate": 9.80391643701174e-06, + "loss": 0.9345, + "step": 2890 + }, + { + "epoch": 0.23362088122992383, + "grad_norm": 2.4102766513824463, + "learning_rate": 9.803734941032087e-06, + "loss": 1.0051, + "step": 2891 + }, + { + "epoch": 0.23370169094325138, + "grad_norm": 2.6964380741119385, + "learning_rate": 9.803553362776019e-06, + "loss": 1.1063, + "step": 2892 + }, + { + "epoch": 0.23378250065657893, + "grad_norm": 3.018733024597168, + "learning_rate": 9.803371702246647e-06, + "loss": 0.9383, + "step": 2893 + }, + { + "epoch": 0.23386331036990646, + "grad_norm": 2.8816728591918945, + "learning_rate": 9.803189959447082e-06, + "loss": 0.9539, + "step": 2894 + }, + { + "epoch": 0.233944120083234, + "grad_norm": 2.8543145656585693, + "learning_rate": 9.803008134380435e-06, + "loss": 0.9333, + "step": 2895 + }, + { + "epoch": 0.23402492979656156, + "grad_norm": 2.6977081298828125, + "learning_rate": 9.802826227049822e-06, + "loss": 1.0031, + "step": 2896 + }, + { + "epoch": 0.23410573950988908, + "grad_norm": 2.8548684120178223, + "learning_rate": 9.802644237458357e-06, + "loss": 1.045, + "step": 2897 + }, + { + "epoch": 0.23418654922321663, + "grad_norm": 3.118274688720703, + "learning_rate": 9.802462165609159e-06, + "loss": 1.0571, + "step": 2898 + }, + { + "epoch": 0.23426735893654418, + "grad_norm": 2.4857306480407715, + "learning_rate": 9.802280011505345e-06, + "loss": 1.016, + "step": 2899 + }, + { + "epoch": 0.2343481686498717, + "grad_norm": 3.076098680496216, + "learning_rate": 9.802097775150037e-06, + "loss": 1.011, + "step": 2900 + }, + { + "epoch": 0.23442897836319926, + "grad_norm": 2.630444288253784, + "learning_rate": 9.801915456546353e-06, + "loss": 0.976, + "step": 2901 + }, + { + "epoch": 0.2345097880765268, + "grad_norm": 2.6252129077911377, + "learning_rate": 9.801733055697417e-06, + "loss": 0.99, + "step": 2902 + }, + { + "epoch": 0.23459059778985433, + "grad_norm": 2.6402323246002197, + "learning_rate": 9.801550572606355e-06, + "loss": 0.9066, + "step": 2903 + }, + { + "epoch": 0.23467140750318188, + "grad_norm": 2.880779981613159, + "learning_rate": 9.80136800727629e-06, + "loss": 0.9364, + "step": 2904 + }, + { + "epoch": 0.23475221721650943, + "grad_norm": 2.7739346027374268, + "learning_rate": 9.801185359710352e-06, + "loss": 1.0044, + "step": 2905 + }, + { + "epoch": 0.23483302692983696, + "grad_norm": 2.8313372135162354, + "learning_rate": 9.801002629911664e-06, + "loss": 0.9938, + "step": 2906 + }, + { + "epoch": 0.2349138366431645, + "grad_norm": 3.302762508392334, + "learning_rate": 9.800819817883362e-06, + "loss": 1.0321, + "step": 2907 + }, + { + "epoch": 0.23499464635649206, + "grad_norm": 2.7084765434265137, + "learning_rate": 9.800636923628572e-06, + "loss": 1.0704, + "step": 2908 + }, + { + "epoch": 0.23507545606981958, + "grad_norm": 2.904644727706909, + "learning_rate": 9.800453947150427e-06, + "loss": 0.9729, + "step": 2909 + }, + { + "epoch": 0.23515626578314713, + "grad_norm": 2.632418394088745, + "learning_rate": 9.800270888452065e-06, + "loss": 1.0213, + "step": 2910 + }, + { + "epoch": 0.23523707549647468, + "grad_norm": 3.0656416416168213, + "learning_rate": 9.80008774753662e-06, + "loss": 0.9299, + "step": 2911 + }, + { + "epoch": 0.2353178852098022, + "grad_norm": 2.863532781600952, + "learning_rate": 9.799904524407224e-06, + "loss": 0.9577, + "step": 2912 + }, + { + "epoch": 0.23539869492312976, + "grad_norm": 2.7166807651519775, + "learning_rate": 9.799721219067023e-06, + "loss": 0.9382, + "step": 2913 + }, + { + "epoch": 0.2354795046364573, + "grad_norm": 2.9613704681396484, + "learning_rate": 9.799537831519149e-06, + "loss": 1.0444, + "step": 2914 + }, + { + "epoch": 0.23556031434978483, + "grad_norm": 3.0312464237213135, + "learning_rate": 9.799354361766746e-06, + "loss": 1.0073, + "step": 2915 + }, + { + "epoch": 0.23564112406311238, + "grad_norm": 2.47131609916687, + "learning_rate": 9.79917080981296e-06, + "loss": 0.9809, + "step": 2916 + }, + { + "epoch": 0.23572193377643993, + "grad_norm": 2.8032500743865967, + "learning_rate": 9.798987175660928e-06, + "loss": 1.1325, + "step": 2917 + }, + { + "epoch": 0.23580274348976746, + "grad_norm": 2.947420358657837, + "learning_rate": 9.798803459313802e-06, + "loss": 1.1373, + "step": 2918 + }, + { + "epoch": 0.235883553203095, + "grad_norm": 3.7523951530456543, + "learning_rate": 9.798619660774724e-06, + "loss": 1.089, + "step": 2919 + }, + { + "epoch": 0.23596436291642256, + "grad_norm": 2.727415084838867, + "learning_rate": 9.798435780046842e-06, + "loss": 1.1135, + "step": 2920 + }, + { + "epoch": 0.23604517262975008, + "grad_norm": 2.7972569465637207, + "learning_rate": 9.79825181713331e-06, + "loss": 0.9596, + "step": 2921 + }, + { + "epoch": 0.23612598234307763, + "grad_norm": 2.521152973175049, + "learning_rate": 9.798067772037272e-06, + "loss": 0.985, + "step": 2922 + }, + { + "epoch": 0.23620679205640518, + "grad_norm": 2.7290236949920654, + "learning_rate": 9.797883644761886e-06, + "loss": 1.0603, + "step": 2923 + }, + { + "epoch": 0.2362876017697327, + "grad_norm": 2.937466621398926, + "learning_rate": 9.797699435310305e-06, + "loss": 0.9835, + "step": 2924 + }, + { + "epoch": 0.23636841148306026, + "grad_norm": 2.8509199619293213, + "learning_rate": 9.79751514368568e-06, + "loss": 0.9966, + "step": 2925 + }, + { + "epoch": 0.2364492211963878, + "grad_norm": 2.9798872470855713, + "learning_rate": 9.79733076989117e-06, + "loss": 0.9335, + "step": 2926 + }, + { + "epoch": 0.23653003090971536, + "grad_norm": 2.9888782501220703, + "learning_rate": 9.797146313929935e-06, + "loss": 1.0645, + "step": 2927 + }, + { + "epoch": 0.23661084062304288, + "grad_norm": 2.9092607498168945, + "learning_rate": 9.796961775805131e-06, + "loss": 0.9685, + "step": 2928 + }, + { + "epoch": 0.23669165033637043, + "grad_norm": 2.904297113418579, + "learning_rate": 9.796777155519921e-06, + "loss": 1.1148, + "step": 2929 + }, + { + "epoch": 0.23677246004969799, + "grad_norm": 2.948443651199341, + "learning_rate": 9.796592453077466e-06, + "loss": 1.0204, + "step": 2930 + }, + { + "epoch": 0.2368532697630255, + "grad_norm": 2.9255568981170654, + "learning_rate": 9.79640766848093e-06, + "loss": 0.9652, + "step": 2931 + }, + { + "epoch": 0.23693407947635306, + "grad_norm": 2.946317195892334, + "learning_rate": 9.796222801733476e-06, + "loss": 0.9634, + "step": 2932 + }, + { + "epoch": 0.2370148891896806, + "grad_norm": 2.8127524852752686, + "learning_rate": 9.79603785283827e-06, + "loss": 0.9098, + "step": 2933 + }, + { + "epoch": 0.23709569890300813, + "grad_norm": 2.9615817070007324, + "learning_rate": 9.795852821798486e-06, + "loss": 0.9135, + "step": 2934 + }, + { + "epoch": 0.23717650861633569, + "grad_norm": 2.7446794509887695, + "learning_rate": 9.795667708617287e-06, + "loss": 1.1187, + "step": 2935 + }, + { + "epoch": 0.23725731832966324, + "grad_norm": 2.8557610511779785, + "learning_rate": 9.795482513297845e-06, + "loss": 1.1194, + "step": 2936 + }, + { + "epoch": 0.23733812804299076, + "grad_norm": 2.5263028144836426, + "learning_rate": 9.795297235843333e-06, + "loss": 0.9522, + "step": 2937 + }, + { + "epoch": 0.2374189377563183, + "grad_norm": 2.5753190517425537, + "learning_rate": 9.795111876256921e-06, + "loss": 1.0271, + "step": 2938 + }, + { + "epoch": 0.23749974746964586, + "grad_norm": 2.882373809814453, + "learning_rate": 9.79492643454179e-06, + "loss": 0.9694, + "step": 2939 + }, + { + "epoch": 0.23758055718297338, + "grad_norm": 2.4024150371551514, + "learning_rate": 9.794740910701111e-06, + "loss": 1.0339, + "step": 2940 + }, + { + "epoch": 0.23766136689630094, + "grad_norm": 3.4500327110290527, + "learning_rate": 9.794555304738063e-06, + "loss": 0.9981, + "step": 2941 + }, + { + "epoch": 0.23774217660962849, + "grad_norm": 2.7431914806365967, + "learning_rate": 9.794369616655823e-06, + "loss": 1.0637, + "step": 2942 + }, + { + "epoch": 0.237822986322956, + "grad_norm": 2.808762550354004, + "learning_rate": 9.794183846457577e-06, + "loss": 0.9553, + "step": 2943 + }, + { + "epoch": 0.23790379603628356, + "grad_norm": 2.5411794185638428, + "learning_rate": 9.7939979941465e-06, + "loss": 0.9299, + "step": 2944 + }, + { + "epoch": 0.2379846057496111, + "grad_norm": 2.707524299621582, + "learning_rate": 9.793812059725781e-06, + "loss": 1.028, + "step": 2945 + }, + { + "epoch": 0.23806541546293863, + "grad_norm": 2.967235565185547, + "learning_rate": 9.7936260431986e-06, + "loss": 1.0642, + "step": 2946 + }, + { + "epoch": 0.23814622517626619, + "grad_norm": 2.7679848670959473, + "learning_rate": 9.793439944568146e-06, + "loss": 0.9546, + "step": 2947 + }, + { + "epoch": 0.23822703488959374, + "grad_norm": 2.782672166824341, + "learning_rate": 9.793253763837606e-06, + "loss": 1.0416, + "step": 2948 + }, + { + "epoch": 0.23830784460292126, + "grad_norm": 3.0185306072235107, + "learning_rate": 9.793067501010167e-06, + "loss": 1.0117, + "step": 2949 + }, + { + "epoch": 0.2383886543162488, + "grad_norm": 2.802870035171509, + "learning_rate": 9.792881156089023e-06, + "loss": 1.0281, + "step": 2950 + }, + { + "epoch": 0.23846946402957636, + "grad_norm": 2.8822507858276367, + "learning_rate": 9.79269472907736e-06, + "loss": 1.0046, + "step": 2951 + }, + { + "epoch": 0.23855027374290388, + "grad_norm": 3.2563014030456543, + "learning_rate": 9.792508219978377e-06, + "loss": 1.0214, + "step": 2952 + }, + { + "epoch": 0.23863108345623144, + "grad_norm": 2.619318962097168, + "learning_rate": 9.792321628795264e-06, + "loss": 1.0024, + "step": 2953 + }, + { + "epoch": 0.238711893169559, + "grad_norm": 2.830854654312134, + "learning_rate": 9.792134955531219e-06, + "loss": 0.944, + "step": 2954 + }, + { + "epoch": 0.2387927028828865, + "grad_norm": 3.4874989986419678, + "learning_rate": 9.791948200189439e-06, + "loss": 1.0637, + "step": 2955 + }, + { + "epoch": 0.23887351259621406, + "grad_norm": 2.825983762741089, + "learning_rate": 9.791761362773122e-06, + "loss": 0.8994, + "step": 2956 + }, + { + "epoch": 0.2389543223095416, + "grad_norm": 2.640371799468994, + "learning_rate": 9.791574443285469e-06, + "loss": 0.9939, + "step": 2957 + }, + { + "epoch": 0.23903513202286916, + "grad_norm": 2.5753226280212402, + "learning_rate": 9.791387441729681e-06, + "loss": 1.192, + "step": 2958 + }, + { + "epoch": 0.23911594173619669, + "grad_norm": 3.058668375015259, + "learning_rate": 9.79120035810896e-06, + "loss": 0.9996, + "step": 2959 + }, + { + "epoch": 0.23919675144952424, + "grad_norm": 2.856213331222534, + "learning_rate": 9.791013192426513e-06, + "loss": 1.0295, + "step": 2960 + }, + { + "epoch": 0.2392775611628518, + "grad_norm": 2.7181801795959473, + "learning_rate": 9.790825944685542e-06, + "loss": 0.9916, + "step": 2961 + }, + { + "epoch": 0.2393583708761793, + "grad_norm": 2.5401663780212402, + "learning_rate": 9.790638614889256e-06, + "loss": 0.9128, + "step": 2962 + }, + { + "epoch": 0.23943918058950686, + "grad_norm": 2.5197436809539795, + "learning_rate": 9.790451203040865e-06, + "loss": 0.9002, + "step": 2963 + }, + { + "epoch": 0.2395199903028344, + "grad_norm": 2.8694779872894287, + "learning_rate": 9.790263709143577e-06, + "loss": 0.8323, + "step": 2964 + }, + { + "epoch": 0.23960080001616194, + "grad_norm": 2.646378755569458, + "learning_rate": 9.790076133200604e-06, + "loss": 1.0607, + "step": 2965 + }, + { + "epoch": 0.2396816097294895, + "grad_norm": 2.950759172439575, + "learning_rate": 9.789888475215158e-06, + "loss": 0.8371, + "step": 2966 + }, + { + "epoch": 0.23976241944281704, + "grad_norm": 2.921673059463501, + "learning_rate": 9.789700735190453e-06, + "loss": 0.9896, + "step": 2967 + }, + { + "epoch": 0.23984322915614456, + "grad_norm": 3.2484123706817627, + "learning_rate": 9.789512913129706e-06, + "loss": 0.8615, + "step": 2968 + }, + { + "epoch": 0.2399240388694721, + "grad_norm": 2.5600643157958984, + "learning_rate": 9.789325009036134e-06, + "loss": 0.9869, + "step": 2969 + }, + { + "epoch": 0.24000484858279966, + "grad_norm": 2.513324499130249, + "learning_rate": 9.789137022912953e-06, + "loss": 1.014, + "step": 2970 + }, + { + "epoch": 0.24008565829612719, + "grad_norm": 3.612118721008301, + "learning_rate": 9.788948954763385e-06, + "loss": 0.9303, + "step": 2971 + }, + { + "epoch": 0.24016646800945474, + "grad_norm": 2.5807790756225586, + "learning_rate": 9.78876080459065e-06, + "loss": 1.0551, + "step": 2972 + }, + { + "epoch": 0.2402472777227823, + "grad_norm": 2.8657569885253906, + "learning_rate": 9.788572572397969e-06, + "loss": 0.8713, + "step": 2973 + }, + { + "epoch": 0.2403280874361098, + "grad_norm": 3.181068181991577, + "learning_rate": 9.78838425818857e-06, + "loss": 0.917, + "step": 2974 + }, + { + "epoch": 0.24040889714943736, + "grad_norm": 3.0996499061584473, + "learning_rate": 9.788195861965678e-06, + "loss": 0.9985, + "step": 2975 + }, + { + "epoch": 0.2404897068627649, + "grad_norm": 2.7109224796295166, + "learning_rate": 9.788007383732514e-06, + "loss": 0.995, + "step": 2976 + }, + { + "epoch": 0.24057051657609244, + "grad_norm": 2.801605463027954, + "learning_rate": 9.787818823492312e-06, + "loss": 0.961, + "step": 2977 + }, + { + "epoch": 0.24065132628942, + "grad_norm": 2.9663140773773193, + "learning_rate": 9.7876301812483e-06, + "loss": 0.9846, + "step": 2978 + }, + { + "epoch": 0.24073213600274754, + "grad_norm": 2.6364505290985107, + "learning_rate": 9.787441457003709e-06, + "loss": 1.1276, + "step": 2979 + }, + { + "epoch": 0.24081294571607506, + "grad_norm": 2.5775394439697266, + "learning_rate": 9.78725265076177e-06, + "loss": 0.9062, + "step": 2980 + }, + { + "epoch": 0.2408937554294026, + "grad_norm": 3.2745320796966553, + "learning_rate": 9.787063762525717e-06, + "loss": 0.9921, + "step": 2981 + }, + { + "epoch": 0.24097456514273016, + "grad_norm": 2.4596545696258545, + "learning_rate": 9.786874792298788e-06, + "loss": 0.9798, + "step": 2982 + }, + { + "epoch": 0.24105537485605769, + "grad_norm": 2.857215166091919, + "learning_rate": 9.786685740084219e-06, + "loss": 0.9625, + "step": 2983 + }, + { + "epoch": 0.24113618456938524, + "grad_norm": 2.645327568054199, + "learning_rate": 9.786496605885245e-06, + "loss": 0.9486, + "step": 2984 + }, + { + "epoch": 0.2412169942827128, + "grad_norm": 2.879042148590088, + "learning_rate": 9.786307389705108e-06, + "loss": 0.9191, + "step": 2985 + }, + { + "epoch": 0.2412978039960403, + "grad_norm": 3.1000540256500244, + "learning_rate": 9.786118091547045e-06, + "loss": 1.0089, + "step": 2986 + }, + { + "epoch": 0.24137861370936786, + "grad_norm": 3.00303316116333, + "learning_rate": 9.785928711414306e-06, + "loss": 0.9263, + "step": 2987 + }, + { + "epoch": 0.2414594234226954, + "grad_norm": 2.8019394874572754, + "learning_rate": 9.785739249310126e-06, + "loss": 1.0141, + "step": 2988 + }, + { + "epoch": 0.24154023313602296, + "grad_norm": 2.7068896293640137, + "learning_rate": 9.785549705237755e-06, + "loss": 0.9381, + "step": 2989 + }, + { + "epoch": 0.2416210428493505, + "grad_norm": 2.8227250576019287, + "learning_rate": 9.785360079200439e-06, + "loss": 0.9671, + "step": 2990 + }, + { + "epoch": 0.24170185256267804, + "grad_norm": 3.5661299228668213, + "learning_rate": 9.785170371201424e-06, + "loss": 0.9556, + "step": 2991 + }, + { + "epoch": 0.2417826622760056, + "grad_norm": 2.823042392730713, + "learning_rate": 9.784980581243962e-06, + "loss": 1.0224, + "step": 2992 + }, + { + "epoch": 0.2418634719893331, + "grad_norm": 2.84657621383667, + "learning_rate": 9.7847907093313e-06, + "loss": 0.8715, + "step": 2993 + }, + { + "epoch": 0.24194428170266066, + "grad_norm": 3.0351145267486572, + "learning_rate": 9.784600755466693e-06, + "loss": 0.9789, + "step": 2994 + }, + { + "epoch": 0.2420250914159882, + "grad_norm": 2.847142219543457, + "learning_rate": 9.784410719653395e-06, + "loss": 1.0276, + "step": 2995 + }, + { + "epoch": 0.24210590112931574, + "grad_norm": 3.054438352584839, + "learning_rate": 9.784220601894656e-06, + "loss": 0.9257, + "step": 2996 + }, + { + "epoch": 0.2421867108426433, + "grad_norm": 2.8259174823760986, + "learning_rate": 9.784030402193737e-06, + "loss": 0.9961, + "step": 2997 + }, + { + "epoch": 0.24226752055597084, + "grad_norm": 2.6278364658355713, + "learning_rate": 9.783840120553895e-06, + "loss": 0.9693, + "step": 2998 + }, + { + "epoch": 0.24234833026929836, + "grad_norm": 2.6503517627716064, + "learning_rate": 9.78364975697839e-06, + "loss": 1.0962, + "step": 2999 + }, + { + "epoch": 0.2424291399826259, + "grad_norm": 2.714247941970825, + "learning_rate": 9.783459311470478e-06, + "loss": 0.9403, + "step": 3000 + }, + { + "epoch": 0.2424291399826259, + "eval_loss": 0.8388580083847046, + "eval_runtime": 814.4038, + "eval_samples_per_second": 102.364, + "eval_steps_per_second": 12.796, + "step": 3000 + }, + { + "epoch": 0.24250994969595346, + "grad_norm": 2.9535956382751465, + "learning_rate": 9.783268784033426e-06, + "loss": 0.9662, + "step": 3001 + }, + { + "epoch": 0.242590759409281, + "grad_norm": 2.770203113555908, + "learning_rate": 9.783078174670492e-06, + "loss": 0.9747, + "step": 3002 + }, + { + "epoch": 0.24267156912260854, + "grad_norm": 2.815793514251709, + "learning_rate": 9.782887483384946e-06, + "loss": 1.0265, + "step": 3003 + }, + { + "epoch": 0.2427523788359361, + "grad_norm": 2.5195326805114746, + "learning_rate": 9.782696710180051e-06, + "loss": 1.0203, + "step": 3004 + }, + { + "epoch": 0.2428331885492636, + "grad_norm": 3.260784387588501, + "learning_rate": 9.782505855059076e-06, + "loss": 0.9573, + "step": 3005 + }, + { + "epoch": 0.24291399826259116, + "grad_norm": 2.7619948387145996, + "learning_rate": 9.782314918025289e-06, + "loss": 1.0384, + "step": 3006 + }, + { + "epoch": 0.24299480797591871, + "grad_norm": 2.631403684616089, + "learning_rate": 9.782123899081958e-06, + "loss": 0.984, + "step": 3007 + }, + { + "epoch": 0.24307561768924624, + "grad_norm": 3.0094573497772217, + "learning_rate": 9.781932798232362e-06, + "loss": 1.1815, + "step": 3008 + }, + { + "epoch": 0.2431564274025738, + "grad_norm": 2.775972366333008, + "learning_rate": 9.781741615479764e-06, + "loss": 0.9415, + "step": 3009 + }, + { + "epoch": 0.24323723711590134, + "grad_norm": 2.6309051513671875, + "learning_rate": 9.781550350827446e-06, + "loss": 0.9621, + "step": 3010 + }, + { + "epoch": 0.24331804682922886, + "grad_norm": 2.8019237518310547, + "learning_rate": 9.78135900427868e-06, + "loss": 0.9563, + "step": 3011 + }, + { + "epoch": 0.2433988565425564, + "grad_norm": 3.1115059852600098, + "learning_rate": 9.781167575836747e-06, + "loss": 0.9576, + "step": 3012 + }, + { + "epoch": 0.24347966625588396, + "grad_norm": 2.668731212615967, + "learning_rate": 9.780976065504923e-06, + "loss": 0.9748, + "step": 3013 + }, + { + "epoch": 0.2435604759692115, + "grad_norm": 3.3178939819335938, + "learning_rate": 9.78078447328649e-06, + "loss": 0.9354, + "step": 3014 + }, + { + "epoch": 0.24364128568253904, + "grad_norm": 3.220385789871216, + "learning_rate": 9.780592799184728e-06, + "loss": 0.9475, + "step": 3015 + }, + { + "epoch": 0.2437220953958666, + "grad_norm": 2.9032740592956543, + "learning_rate": 9.780401043202919e-06, + "loss": 0.949, + "step": 3016 + }, + { + "epoch": 0.2438029051091941, + "grad_norm": 3.000532627105713, + "learning_rate": 9.780209205344347e-06, + "loss": 1.0839, + "step": 3017 + }, + { + "epoch": 0.24388371482252166, + "grad_norm": 2.539182424545288, + "learning_rate": 9.780017285612303e-06, + "loss": 0.9495, + "step": 3018 + }, + { + "epoch": 0.24396452453584921, + "grad_norm": 2.2839114665985107, + "learning_rate": 9.779825284010067e-06, + "loss": 0.974, + "step": 3019 + }, + { + "epoch": 0.24404533424917674, + "grad_norm": 2.8400840759277344, + "learning_rate": 9.779633200540933e-06, + "loss": 0.9371, + "step": 3020 + }, + { + "epoch": 0.2441261439625043, + "grad_norm": 2.7170376777648926, + "learning_rate": 9.779441035208185e-06, + "loss": 1.0756, + "step": 3021 + }, + { + "epoch": 0.24420695367583184, + "grad_norm": 2.4907898902893066, + "learning_rate": 9.779248788015123e-06, + "loss": 1.0196, + "step": 3022 + }, + { + "epoch": 0.2442877633891594, + "grad_norm": 3.175201654434204, + "learning_rate": 9.779056458965032e-06, + "loss": 1.061, + "step": 3023 + }, + { + "epoch": 0.2443685731024869, + "grad_norm": 2.583282709121704, + "learning_rate": 9.778864048061209e-06, + "loss": 0.9611, + "step": 3024 + }, + { + "epoch": 0.24444938281581446, + "grad_norm": 2.6285994052886963, + "learning_rate": 9.77867155530695e-06, + "loss": 1.0399, + "step": 3025 + }, + { + "epoch": 0.24453019252914202, + "grad_norm": 3.0164005756378174, + "learning_rate": 9.778478980705552e-06, + "loss": 0.9734, + "step": 3026 + }, + { + "epoch": 0.24461100224246954, + "grad_norm": 2.634148359298706, + "learning_rate": 9.778286324260314e-06, + "loss": 1.073, + "step": 3027 + }, + { + "epoch": 0.2446918119557971, + "grad_norm": 2.6194915771484375, + "learning_rate": 9.778093585974531e-06, + "loss": 0.9263, + "step": 3028 + }, + { + "epoch": 0.24477262166912464, + "grad_norm": 2.734647512435913, + "learning_rate": 9.77790076585151e-06, + "loss": 1.0295, + "step": 3029 + }, + { + "epoch": 0.24485343138245216, + "grad_norm": 3.3778486251831055, + "learning_rate": 9.777707863894551e-06, + "loss": 1.0255, + "step": 3030 + }, + { + "epoch": 0.24493424109577971, + "grad_norm": 2.509401798248291, + "learning_rate": 9.777514880106957e-06, + "loss": 0.9847, + "step": 3031 + }, + { + "epoch": 0.24501505080910727, + "grad_norm": 3.0848164558410645, + "learning_rate": 9.777321814492036e-06, + "loss": 0.9488, + "step": 3032 + }, + { + "epoch": 0.2450958605224348, + "grad_norm": 2.637641191482544, + "learning_rate": 9.777128667053093e-06, + "loss": 0.8775, + "step": 3033 + }, + { + "epoch": 0.24517667023576234, + "grad_norm": 2.3294782638549805, + "learning_rate": 9.776935437793436e-06, + "loss": 1.1534, + "step": 3034 + }, + { + "epoch": 0.2452574799490899, + "grad_norm": 2.6865875720977783, + "learning_rate": 9.776742126716374e-06, + "loss": 0.846, + "step": 3035 + }, + { + "epoch": 0.2453382896624174, + "grad_norm": 2.6133646965026855, + "learning_rate": 9.77654873382522e-06, + "loss": 1.0109, + "step": 3036 + }, + { + "epoch": 0.24541909937574496, + "grad_norm": 2.9119057655334473, + "learning_rate": 9.776355259123286e-06, + "loss": 1.0162, + "step": 3037 + }, + { + "epoch": 0.24549990908907252, + "grad_norm": 3.213521718978882, + "learning_rate": 9.776161702613884e-06, + "loss": 0.8763, + "step": 3038 + }, + { + "epoch": 0.24558071880240004, + "grad_norm": 2.7100210189819336, + "learning_rate": 9.775968064300331e-06, + "loss": 1.001, + "step": 3039 + }, + { + "epoch": 0.2456615285157276, + "grad_norm": 2.9769105911254883, + "learning_rate": 9.775774344185942e-06, + "loss": 1.0154, + "step": 3040 + }, + { + "epoch": 0.24574233822905514, + "grad_norm": 3.063699960708618, + "learning_rate": 9.775580542274035e-06, + "loss": 0.9613, + "step": 3041 + }, + { + "epoch": 0.24582314794238266, + "grad_norm": 2.744769334793091, + "learning_rate": 9.775386658567931e-06, + "loss": 1.007, + "step": 3042 + }, + { + "epoch": 0.24590395765571021, + "grad_norm": 2.8555009365081787, + "learning_rate": 9.775192693070949e-06, + "loss": 0.9691, + "step": 3043 + }, + { + "epoch": 0.24598476736903777, + "grad_norm": 2.6923105716705322, + "learning_rate": 9.774998645786413e-06, + "loss": 1.0184, + "step": 3044 + }, + { + "epoch": 0.2460655770823653, + "grad_norm": 2.7845826148986816, + "learning_rate": 9.774804516717646e-06, + "loss": 0.8735, + "step": 3045 + }, + { + "epoch": 0.24614638679569284, + "grad_norm": 2.641279935836792, + "learning_rate": 9.774610305867972e-06, + "loss": 1.0847, + "step": 3046 + }, + { + "epoch": 0.2462271965090204, + "grad_norm": 2.825593948364258, + "learning_rate": 9.774416013240717e-06, + "loss": 1.0203, + "step": 3047 + }, + { + "epoch": 0.2463080062223479, + "grad_norm": 2.7793221473693848, + "learning_rate": 9.77422163883921e-06, + "loss": 0.9459, + "step": 3048 + }, + { + "epoch": 0.24638881593567546, + "grad_norm": 2.340348482131958, + "learning_rate": 9.77402718266678e-06, + "loss": 0.9582, + "step": 3049 + }, + { + "epoch": 0.24646962564900302, + "grad_norm": 2.321822166442871, + "learning_rate": 9.773832644726757e-06, + "loss": 1.0461, + "step": 3050 + }, + { + "epoch": 0.24655043536233054, + "grad_norm": 2.6112189292907715, + "learning_rate": 9.773638025022474e-06, + "loss": 1.0322, + "step": 3051 + }, + { + "epoch": 0.2466312450756581, + "grad_norm": 2.735830545425415, + "learning_rate": 9.773443323557263e-06, + "loss": 0.972, + "step": 3052 + }, + { + "epoch": 0.24671205478898564, + "grad_norm": 2.960653781890869, + "learning_rate": 9.773248540334461e-06, + "loss": 1.0007, + "step": 3053 + }, + { + "epoch": 0.2467928645023132, + "grad_norm": 2.744239330291748, + "learning_rate": 9.7730536753574e-06, + "loss": 1.0094, + "step": 3054 + }, + { + "epoch": 0.24687367421564071, + "grad_norm": 3.0212156772613525, + "learning_rate": 9.772858728629421e-06, + "loss": 0.9539, + "step": 3055 + }, + { + "epoch": 0.24695448392896827, + "grad_norm": 2.6391782760620117, + "learning_rate": 9.772663700153864e-06, + "loss": 1.0186, + "step": 3056 + }, + { + "epoch": 0.24703529364229582, + "grad_norm": 2.6837496757507324, + "learning_rate": 9.772468589934066e-06, + "loss": 1.0038, + "step": 3057 + }, + { + "epoch": 0.24711610335562334, + "grad_norm": 2.7959649562835693, + "learning_rate": 9.77227339797337e-06, + "loss": 1.0742, + "step": 3058 + }, + { + "epoch": 0.2471969130689509, + "grad_norm": 2.8792660236358643, + "learning_rate": 9.772078124275121e-06, + "loss": 0.9766, + "step": 3059 + }, + { + "epoch": 0.24727772278227844, + "grad_norm": 2.823575496673584, + "learning_rate": 9.771882768842663e-06, + "loss": 0.9364, + "step": 3060 + }, + { + "epoch": 0.24735853249560597, + "grad_norm": 2.5848450660705566, + "learning_rate": 9.771687331679338e-06, + "loss": 0.9675, + "step": 3061 + }, + { + "epoch": 0.24743934220893352, + "grad_norm": 2.9905905723571777, + "learning_rate": 9.771491812788498e-06, + "loss": 1.0847, + "step": 3062 + }, + { + "epoch": 0.24752015192226107, + "grad_norm": 3.0971758365631104, + "learning_rate": 9.77129621217349e-06, + "loss": 0.9738, + "step": 3063 + }, + { + "epoch": 0.2476009616355886, + "grad_norm": 2.6731889247894287, + "learning_rate": 9.771100529837662e-06, + "loss": 0.9633, + "step": 3064 + }, + { + "epoch": 0.24768177134891614, + "grad_norm": 2.5933165550231934, + "learning_rate": 9.770904765784372e-06, + "loss": 1.0145, + "step": 3065 + }, + { + "epoch": 0.2477625810622437, + "grad_norm": 2.8988571166992188, + "learning_rate": 9.770708920016967e-06, + "loss": 1.002, + "step": 3066 + }, + { + "epoch": 0.24784339077557122, + "grad_norm": 2.518685817718506, + "learning_rate": 9.770512992538801e-06, + "loss": 0.9592, + "step": 3067 + }, + { + "epoch": 0.24792420048889877, + "grad_norm": 2.4646313190460205, + "learning_rate": 9.770316983353235e-06, + "loss": 0.9018, + "step": 3068 + }, + { + "epoch": 0.24800501020222632, + "grad_norm": 2.8748724460601807, + "learning_rate": 9.770120892463622e-06, + "loss": 0.9996, + "step": 3069 + }, + { + "epoch": 0.24808581991555384, + "grad_norm": 2.6587514877319336, + "learning_rate": 9.769924719873322e-06, + "loss": 0.9461, + "step": 3070 + }, + { + "epoch": 0.2481666296288814, + "grad_norm": 2.981292247772217, + "learning_rate": 9.769728465585694e-06, + "loss": 0.9517, + "step": 3071 + }, + { + "epoch": 0.24824743934220894, + "grad_norm": 2.7922825813293457, + "learning_rate": 9.7695321296041e-06, + "loss": 1.0152, + "step": 3072 + }, + { + "epoch": 0.24832824905553647, + "grad_norm": 2.7855329513549805, + "learning_rate": 9.769335711931904e-06, + "loss": 0.9409, + "step": 3073 + }, + { + "epoch": 0.24840905876886402, + "grad_norm": 2.9992854595184326, + "learning_rate": 9.769139212572469e-06, + "loss": 1.0594, + "step": 3074 + }, + { + "epoch": 0.24848986848219157, + "grad_norm": 2.942552089691162, + "learning_rate": 9.768942631529158e-06, + "loss": 0.9937, + "step": 3075 + }, + { + "epoch": 0.2485706781955191, + "grad_norm": 2.8532662391662598, + "learning_rate": 9.768745968805343e-06, + "loss": 0.9324, + "step": 3076 + }, + { + "epoch": 0.24865148790884664, + "grad_norm": 3.196516752243042, + "learning_rate": 9.768549224404388e-06, + "loss": 0.9785, + "step": 3077 + }, + { + "epoch": 0.2487322976221742, + "grad_norm": 2.676330089569092, + "learning_rate": 9.768352398329668e-06, + "loss": 1.0158, + "step": 3078 + }, + { + "epoch": 0.24881310733550172, + "grad_norm": 2.9280941486358643, + "learning_rate": 9.768155490584548e-06, + "loss": 1.0447, + "step": 3079 + }, + { + "epoch": 0.24889391704882927, + "grad_norm": 3.0791561603546143, + "learning_rate": 9.767958501172404e-06, + "loss": 1.0058, + "step": 3080 + }, + { + "epoch": 0.24897472676215682, + "grad_norm": 2.6139445304870605, + "learning_rate": 9.767761430096608e-06, + "loss": 0.9489, + "step": 3081 + }, + { + "epoch": 0.24905553647548434, + "grad_norm": 3.2888574600219727, + "learning_rate": 9.767564277360538e-06, + "loss": 1.0186, + "step": 3082 + }, + { + "epoch": 0.2491363461888119, + "grad_norm": 2.849696636199951, + "learning_rate": 9.767367042967568e-06, + "loss": 1.0644, + "step": 3083 + }, + { + "epoch": 0.24921715590213944, + "grad_norm": 2.7495691776275635, + "learning_rate": 9.76716972692108e-06, + "loss": 1.0258, + "step": 3084 + }, + { + "epoch": 0.24929796561546697, + "grad_norm": 2.598660945892334, + "learning_rate": 9.766972329224449e-06, + "loss": 0.9113, + "step": 3085 + }, + { + "epoch": 0.24937877532879452, + "grad_norm": 2.9330737590789795, + "learning_rate": 9.766774849881058e-06, + "loss": 0.9168, + "step": 3086 + }, + { + "epoch": 0.24945958504212207, + "grad_norm": 3.1084072589874268, + "learning_rate": 9.766577288894291e-06, + "loss": 1.0031, + "step": 3087 + }, + { + "epoch": 0.24954039475544962, + "grad_norm": 2.482382297515869, + "learning_rate": 9.76637964626753e-06, + "loss": 0.9676, + "step": 3088 + }, + { + "epoch": 0.24962120446877714, + "grad_norm": 3.176751136779785, + "learning_rate": 9.766181922004158e-06, + "loss": 0.9304, + "step": 3089 + }, + { + "epoch": 0.2497020141821047, + "grad_norm": 2.7606074810028076, + "learning_rate": 9.765984116107565e-06, + "loss": 1.0468, + "step": 3090 + }, + { + "epoch": 0.24978282389543224, + "grad_norm": 2.808696985244751, + "learning_rate": 9.765786228581138e-06, + "loss": 0.8697, + "step": 3091 + }, + { + "epoch": 0.24986363360875977, + "grad_norm": 2.8495774269104004, + "learning_rate": 9.765588259428267e-06, + "loss": 1.0248, + "step": 3092 + }, + { + "epoch": 0.24994444332208732, + "grad_norm": 2.910490036010742, + "learning_rate": 9.76539020865234e-06, + "loss": 1.0361, + "step": 3093 + }, + { + "epoch": 0.25002525303541484, + "grad_norm": 2.546539306640625, + "learning_rate": 9.765192076256752e-06, + "loss": 0.9902, + "step": 3094 + }, + { + "epoch": 0.2501060627487424, + "grad_norm": 2.936716079711914, + "learning_rate": 9.764993862244895e-06, + "loss": 1.1006, + "step": 3095 + }, + { + "epoch": 0.25018687246206994, + "grad_norm": 2.944744110107422, + "learning_rate": 9.764795566620164e-06, + "loss": 0.9621, + "step": 3096 + }, + { + "epoch": 0.25026768217539747, + "grad_norm": 2.513930559158325, + "learning_rate": 9.764597189385957e-06, + "loss": 0.9737, + "step": 3097 + }, + { + "epoch": 0.25034849188872504, + "grad_norm": 2.5775349140167236, + "learning_rate": 9.76439873054567e-06, + "loss": 0.9102, + "step": 3098 + }, + { + "epoch": 0.25042930160205257, + "grad_norm": 2.507645845413208, + "learning_rate": 9.764200190102702e-06, + "loss": 0.9838, + "step": 3099 + }, + { + "epoch": 0.2505101113153801, + "grad_norm": 2.95524263381958, + "learning_rate": 9.764001568060455e-06, + "loss": 0.9593, + "step": 3100 + }, + { + "epoch": 0.25059092102870767, + "grad_norm": 2.742894172668457, + "learning_rate": 9.763802864422329e-06, + "loss": 0.9644, + "step": 3101 + }, + { + "epoch": 0.2506717307420352, + "grad_norm": 2.268688201904297, + "learning_rate": 9.76360407919173e-06, + "loss": 1.0229, + "step": 3102 + }, + { + "epoch": 0.2507525404553627, + "grad_norm": 2.7283692359924316, + "learning_rate": 9.763405212372059e-06, + "loss": 1.0102, + "step": 3103 + }, + { + "epoch": 0.2508333501686903, + "grad_norm": 2.77974534034729, + "learning_rate": 9.763206263966725e-06, + "loss": 1.0688, + "step": 3104 + }, + { + "epoch": 0.2509141598820178, + "grad_norm": 2.8854501247406006, + "learning_rate": 9.763007233979133e-06, + "loss": 1.1202, + "step": 3105 + }, + { + "epoch": 0.25099496959534534, + "grad_norm": 2.7231879234313965, + "learning_rate": 9.762808122412694e-06, + "loss": 0.9909, + "step": 3106 + }, + { + "epoch": 0.2510757793086729, + "grad_norm": 2.7498559951782227, + "learning_rate": 9.762608929270821e-06, + "loss": 0.9976, + "step": 3107 + }, + { + "epoch": 0.25115658902200044, + "grad_norm": 2.8472721576690674, + "learning_rate": 9.762409654556919e-06, + "loss": 0.9627, + "step": 3108 + }, + { + "epoch": 0.25123739873532797, + "grad_norm": 2.599015474319458, + "learning_rate": 9.762210298274408e-06, + "loss": 0.9065, + "step": 3109 + }, + { + "epoch": 0.25131820844865554, + "grad_norm": 2.8924973011016846, + "learning_rate": 9.762010860426696e-06, + "loss": 1.0123, + "step": 3110 + }, + { + "epoch": 0.25139901816198307, + "grad_norm": 2.4286463260650635, + "learning_rate": 9.761811341017205e-06, + "loss": 1.1075, + "step": 3111 + }, + { + "epoch": 0.2514798278753106, + "grad_norm": 2.6996352672576904, + "learning_rate": 9.761611740049345e-06, + "loss": 1.0744, + "step": 3112 + }, + { + "epoch": 0.25156063758863817, + "grad_norm": 3.5628347396850586, + "learning_rate": 9.761412057526543e-06, + "loss": 0.8973, + "step": 3113 + }, + { + "epoch": 0.2516414473019657, + "grad_norm": 2.604649305343628, + "learning_rate": 9.761212293452213e-06, + "loss": 0.9428, + "step": 3114 + }, + { + "epoch": 0.2517222570152932, + "grad_norm": 2.881993293762207, + "learning_rate": 9.76101244782978e-06, + "loss": 0.8902, + "step": 3115 + }, + { + "epoch": 0.2518030667286208, + "grad_norm": 2.4835922718048096, + "learning_rate": 9.760812520662665e-06, + "loss": 0.9009, + "step": 3116 + }, + { + "epoch": 0.2518838764419483, + "grad_norm": 2.4867897033691406, + "learning_rate": 9.760612511954293e-06, + "loss": 1.0544, + "step": 3117 + }, + { + "epoch": 0.25196468615527584, + "grad_norm": 2.5339856147766113, + "learning_rate": 9.760412421708087e-06, + "loss": 0.851, + "step": 3118 + }, + { + "epoch": 0.2520454958686034, + "grad_norm": 2.5214011669158936, + "learning_rate": 9.760212249927479e-06, + "loss": 1.0445, + "step": 3119 + }, + { + "epoch": 0.25212630558193094, + "grad_norm": 2.8076467514038086, + "learning_rate": 9.760011996615894e-06, + "loss": 1.1723, + "step": 3120 + }, + { + "epoch": 0.25220711529525847, + "grad_norm": 2.591118812561035, + "learning_rate": 9.759811661776763e-06, + "loss": 0.9127, + "step": 3121 + }, + { + "epoch": 0.25228792500858604, + "grad_norm": 2.901501417160034, + "learning_rate": 9.759611245413518e-06, + "loss": 0.8978, + "step": 3122 + }, + { + "epoch": 0.25236873472191357, + "grad_norm": 3.079751968383789, + "learning_rate": 9.759410747529589e-06, + "loss": 1.037, + "step": 3123 + }, + { + "epoch": 0.2524495444352411, + "grad_norm": 2.789341926574707, + "learning_rate": 9.759210168128412e-06, + "loss": 0.9348, + "step": 3124 + }, + { + "epoch": 0.25253035414856867, + "grad_norm": 2.325867176055908, + "learning_rate": 9.759009507213423e-06, + "loss": 0.8928, + "step": 3125 + }, + { + "epoch": 0.2526111638618962, + "grad_norm": 3.3124213218688965, + "learning_rate": 9.758808764788056e-06, + "loss": 1.0175, + "step": 3126 + }, + { + "epoch": 0.2526919735752237, + "grad_norm": 2.7578797340393066, + "learning_rate": 9.758607940855754e-06, + "loss": 0.9931, + "step": 3127 + }, + { + "epoch": 0.2527727832885513, + "grad_norm": 2.998295307159424, + "learning_rate": 9.758407035419952e-06, + "loss": 0.9904, + "step": 3128 + }, + { + "epoch": 0.2528535930018788, + "grad_norm": 2.877265453338623, + "learning_rate": 9.758206048484094e-06, + "loss": 1.0652, + "step": 3129 + }, + { + "epoch": 0.25293440271520634, + "grad_norm": 2.6761255264282227, + "learning_rate": 9.758004980051621e-06, + "loss": 1.0342, + "step": 3130 + }, + { + "epoch": 0.2530152124285339, + "grad_norm": 2.4134268760681152, + "learning_rate": 9.757803830125976e-06, + "loss": 1.0312, + "step": 3131 + }, + { + "epoch": 0.25309602214186144, + "grad_norm": 2.7035300731658936, + "learning_rate": 9.757602598710607e-06, + "loss": 0.9359, + "step": 3132 + }, + { + "epoch": 0.25317683185518897, + "grad_norm": 2.915403366088867, + "learning_rate": 9.757401285808957e-06, + "loss": 1.0197, + "step": 3133 + }, + { + "epoch": 0.25325764156851654, + "grad_norm": 2.634140968322754, + "learning_rate": 9.757199891424478e-06, + "loss": 0.9837, + "step": 3134 + }, + { + "epoch": 0.25333845128184407, + "grad_norm": 2.884725570678711, + "learning_rate": 9.756998415560616e-06, + "loss": 1.068, + "step": 3135 + }, + { + "epoch": 0.25341926099517165, + "grad_norm": 2.804107666015625, + "learning_rate": 9.756796858220823e-06, + "loss": 0.9709, + "step": 3136 + }, + { + "epoch": 0.25350007070849917, + "grad_norm": 2.7716612815856934, + "learning_rate": 9.756595219408552e-06, + "loss": 0.9399, + "step": 3137 + }, + { + "epoch": 0.2535808804218267, + "grad_norm": 2.914701461791992, + "learning_rate": 9.756393499127257e-06, + "loss": 1.0254, + "step": 3138 + }, + { + "epoch": 0.25366169013515427, + "grad_norm": 2.6848130226135254, + "learning_rate": 9.756191697380391e-06, + "loss": 1.0057, + "step": 3139 + }, + { + "epoch": 0.2537424998484818, + "grad_norm": 2.1119136810302734, + "learning_rate": 9.755989814171409e-06, + "loss": 0.9878, + "step": 3140 + }, + { + "epoch": 0.2538233095618093, + "grad_norm": 2.6999025344848633, + "learning_rate": 9.755787849503775e-06, + "loss": 0.9924, + "step": 3141 + }, + { + "epoch": 0.2539041192751369, + "grad_norm": 2.9252264499664307, + "learning_rate": 9.755585803380941e-06, + "loss": 0.9457, + "step": 3142 + }, + { + "epoch": 0.2539849289884644, + "grad_norm": 2.814427137374878, + "learning_rate": 9.755383675806372e-06, + "loss": 0.9491, + "step": 3143 + }, + { + "epoch": 0.25406573870179194, + "grad_norm": 2.658046007156372, + "learning_rate": 9.75518146678353e-06, + "loss": 1.0038, + "step": 3144 + }, + { + "epoch": 0.2541465484151195, + "grad_norm": 2.743593454360962, + "learning_rate": 9.754979176315876e-06, + "loss": 1.1438, + "step": 3145 + }, + { + "epoch": 0.25422735812844705, + "grad_norm": 3.0818870067596436, + "learning_rate": 9.754776804406876e-06, + "loss": 1.0256, + "step": 3146 + }, + { + "epoch": 0.25430816784177457, + "grad_norm": 2.808840036392212, + "learning_rate": 9.754574351059995e-06, + "loss": 0.9556, + "step": 3147 + }, + { + "epoch": 0.25438897755510215, + "grad_norm": 2.8346235752105713, + "learning_rate": 9.754371816278702e-06, + "loss": 0.9963, + "step": 3148 + }, + { + "epoch": 0.25446978726842967, + "grad_norm": 2.8395915031433105, + "learning_rate": 9.754169200066466e-06, + "loss": 0.9539, + "step": 3149 + }, + { + "epoch": 0.2545505969817572, + "grad_norm": 3.3882341384887695, + "learning_rate": 9.753966502426756e-06, + "loss": 1.0115, + "step": 3150 + }, + { + "epoch": 0.25463140669508477, + "grad_norm": 3.1990177631378174, + "learning_rate": 9.753763723363045e-06, + "loss": 0.989, + "step": 3151 + }, + { + "epoch": 0.2547122164084123, + "grad_norm": 2.8756611347198486, + "learning_rate": 9.753560862878806e-06, + "loss": 0.9778, + "step": 3152 + }, + { + "epoch": 0.2547930261217398, + "grad_norm": 2.808302879333496, + "learning_rate": 9.753357920977514e-06, + "loss": 1.048, + "step": 3153 + }, + { + "epoch": 0.2548738358350674, + "grad_norm": 2.6227824687957764, + "learning_rate": 9.753154897662642e-06, + "loss": 0.9811, + "step": 3154 + }, + { + "epoch": 0.2549546455483949, + "grad_norm": 2.6170687675476074, + "learning_rate": 9.75295179293767e-06, + "loss": 0.9553, + "step": 3155 + }, + { + "epoch": 0.25503545526172244, + "grad_norm": 2.8382341861724854, + "learning_rate": 9.752748606806077e-06, + "loss": 0.9198, + "step": 3156 + }, + { + "epoch": 0.25511626497505, + "grad_norm": 2.7465264797210693, + "learning_rate": 9.752545339271342e-06, + "loss": 0.9956, + "step": 3157 + }, + { + "epoch": 0.25519707468837755, + "grad_norm": 2.976637125015259, + "learning_rate": 9.752341990336946e-06, + "loss": 1.0486, + "step": 3158 + }, + { + "epoch": 0.25527788440170507, + "grad_norm": 2.98663592338562, + "learning_rate": 9.752138560006372e-06, + "loss": 1.0955, + "step": 3159 + }, + { + "epoch": 0.25535869411503265, + "grad_norm": 2.647614002227783, + "learning_rate": 9.751935048283105e-06, + "loss": 1.0881, + "step": 3160 + }, + { + "epoch": 0.25543950382836017, + "grad_norm": 2.572448968887329, + "learning_rate": 9.751731455170632e-06, + "loss": 0.9703, + "step": 3161 + }, + { + "epoch": 0.2555203135416877, + "grad_norm": 2.944451332092285, + "learning_rate": 9.751527780672438e-06, + "loss": 0.811, + "step": 3162 + }, + { + "epoch": 0.2556011232550153, + "grad_norm": 2.7639005184173584, + "learning_rate": 9.75132402479201e-06, + "loss": 1.0385, + "step": 3163 + }, + { + "epoch": 0.2556819329683428, + "grad_norm": 2.619258165359497, + "learning_rate": 9.75112018753284e-06, + "loss": 0.9642, + "step": 3164 + }, + { + "epoch": 0.2557627426816703, + "grad_norm": 3.1263070106506348, + "learning_rate": 9.750916268898423e-06, + "loss": 0.9945, + "step": 3165 + }, + { + "epoch": 0.2558435523949979, + "grad_norm": 3.069749116897583, + "learning_rate": 9.750712268892245e-06, + "loss": 0.965, + "step": 3166 + }, + { + "epoch": 0.2559243621083254, + "grad_norm": 2.712395429611206, + "learning_rate": 9.750508187517802e-06, + "loss": 1.0585, + "step": 3167 + }, + { + "epoch": 0.25600517182165294, + "grad_norm": 2.7137744426727295, + "learning_rate": 9.75030402477859e-06, + "loss": 1.1198, + "step": 3168 + }, + { + "epoch": 0.2560859815349805, + "grad_norm": 3.896778106689453, + "learning_rate": 9.750099780678106e-06, + "loss": 1.0689, + "step": 3169 + }, + { + "epoch": 0.25616679124830805, + "grad_norm": 2.7930946350097656, + "learning_rate": 9.749895455219849e-06, + "loss": 0.9992, + "step": 3170 + }, + { + "epoch": 0.25624760096163557, + "grad_norm": 2.665447473526001, + "learning_rate": 9.749691048407318e-06, + "loss": 0.9512, + "step": 3171 + }, + { + "epoch": 0.25632841067496315, + "grad_norm": 2.651581287384033, + "learning_rate": 9.749486560244014e-06, + "loss": 0.9462, + "step": 3172 + }, + { + "epoch": 0.25640922038829067, + "grad_norm": 2.931328535079956, + "learning_rate": 9.749281990733438e-06, + "loss": 0.9202, + "step": 3173 + }, + { + "epoch": 0.2564900301016182, + "grad_norm": 2.4715116024017334, + "learning_rate": 9.749077339879095e-06, + "loss": 1.0637, + "step": 3174 + }, + { + "epoch": 0.2565708398149458, + "grad_norm": 2.9267590045928955, + "learning_rate": 9.74887260768449e-06, + "loss": 1.1934, + "step": 3175 + }, + { + "epoch": 0.2566516495282733, + "grad_norm": 2.7444472312927246, + "learning_rate": 9.748667794153131e-06, + "loss": 0.8217, + "step": 3176 + }, + { + "epoch": 0.2567324592416008, + "grad_norm": 2.5331554412841797, + "learning_rate": 9.748462899288523e-06, + "loss": 1.0896, + "step": 3177 + }, + { + "epoch": 0.2568132689549284, + "grad_norm": 2.6985771656036377, + "learning_rate": 9.748257923094177e-06, + "loss": 0.9791, + "step": 3178 + }, + { + "epoch": 0.2568940786682559, + "grad_norm": 2.4230868816375732, + "learning_rate": 9.748052865573605e-06, + "loss": 1.0133, + "step": 3179 + }, + { + "epoch": 0.25697488838158344, + "grad_norm": 2.8683249950408936, + "learning_rate": 9.747847726730318e-06, + "loss": 0.9224, + "step": 3180 + }, + { + "epoch": 0.257055698094911, + "grad_norm": 3.0334200859069824, + "learning_rate": 9.74764250656783e-06, + "loss": 0.8719, + "step": 3181 + }, + { + "epoch": 0.25713650780823855, + "grad_norm": 3.434133768081665, + "learning_rate": 9.747437205089654e-06, + "loss": 0.9158, + "step": 3182 + }, + { + "epoch": 0.25721731752156607, + "grad_norm": 2.634556770324707, + "learning_rate": 9.747231822299308e-06, + "loss": 1.0091, + "step": 3183 + }, + { + "epoch": 0.25729812723489365, + "grad_norm": 2.6447670459747314, + "learning_rate": 9.747026358200309e-06, + "loss": 0.9963, + "step": 3184 + }, + { + "epoch": 0.25737893694822117, + "grad_norm": 2.687317132949829, + "learning_rate": 9.746820812796176e-06, + "loss": 1.1168, + "step": 3185 + }, + { + "epoch": 0.2574597466615487, + "grad_norm": 2.8211278915405273, + "learning_rate": 9.746615186090432e-06, + "loss": 1.0444, + "step": 3186 + }, + { + "epoch": 0.2575405563748763, + "grad_norm": 2.3882555961608887, + "learning_rate": 9.746409478086593e-06, + "loss": 1.0406, + "step": 3187 + }, + { + "epoch": 0.2576213660882038, + "grad_norm": 2.952209711074829, + "learning_rate": 9.74620368878819e-06, + "loss": 0.9646, + "step": 3188 + }, + { + "epoch": 0.2577021758015313, + "grad_norm": 2.738934278488159, + "learning_rate": 9.745997818198743e-06, + "loss": 0.9903, + "step": 3189 + }, + { + "epoch": 0.2577829855148589, + "grad_norm": 3.0450305938720703, + "learning_rate": 9.74579186632178e-06, + "loss": 1.0351, + "step": 3190 + }, + { + "epoch": 0.2578637952281864, + "grad_norm": 2.916048049926758, + "learning_rate": 9.745585833160824e-06, + "loss": 1.0677, + "step": 3191 + }, + { + "epoch": 0.25794460494151394, + "grad_norm": 2.883650541305542, + "learning_rate": 9.745379718719408e-06, + "loss": 0.8588, + "step": 3192 + }, + { + "epoch": 0.2580254146548415, + "grad_norm": 2.514897584915161, + "learning_rate": 9.745173523001063e-06, + "loss": 1.0435, + "step": 3193 + }, + { + "epoch": 0.25810622436816905, + "grad_norm": 3.411410331726074, + "learning_rate": 9.744967246009319e-06, + "loss": 1.0101, + "step": 3194 + }, + { + "epoch": 0.25818703408149657, + "grad_norm": 2.801797389984131, + "learning_rate": 9.744760887747708e-06, + "loss": 0.9591, + "step": 3195 + }, + { + "epoch": 0.25826784379482415, + "grad_norm": 2.578493118286133, + "learning_rate": 9.744554448219767e-06, + "loss": 1.1203, + "step": 3196 + }, + { + "epoch": 0.25834865350815167, + "grad_norm": 2.7883083820343018, + "learning_rate": 9.74434792742903e-06, + "loss": 1.0737, + "step": 3197 + }, + { + "epoch": 0.2584294632214792, + "grad_norm": 3.0916290283203125, + "learning_rate": 9.744141325379032e-06, + "loss": 0.8681, + "step": 3198 + }, + { + "epoch": 0.2585102729348068, + "grad_norm": 2.6086947917938232, + "learning_rate": 9.743934642073318e-06, + "loss": 0.9693, + "step": 3199 + }, + { + "epoch": 0.2585910826481343, + "grad_norm": 3.25252628326416, + "learning_rate": 9.743727877515422e-06, + "loss": 1.0934, + "step": 3200 + }, + { + "epoch": 0.2586718923614619, + "grad_norm": 2.8991661071777344, + "learning_rate": 9.743521031708888e-06, + "loss": 1.045, + "step": 3201 + }, + { + "epoch": 0.2587527020747894, + "grad_norm": 2.863893985748291, + "learning_rate": 9.743314104657258e-06, + "loss": 1.0193, + "step": 3202 + }, + { + "epoch": 0.2588335117881169, + "grad_norm": 2.706991672515869, + "learning_rate": 9.743107096364078e-06, + "loss": 1.0053, + "step": 3203 + }, + { + "epoch": 0.2589143215014445, + "grad_norm": 3.0327980518341064, + "learning_rate": 9.742900006832889e-06, + "loss": 0.9308, + "step": 3204 + }, + { + "epoch": 0.258995131214772, + "grad_norm": 2.8023879528045654, + "learning_rate": 9.742692836067242e-06, + "loss": 1.0627, + "step": 3205 + }, + { + "epoch": 0.25907594092809955, + "grad_norm": 2.7891502380371094, + "learning_rate": 9.742485584070687e-06, + "loss": 1.0111, + "step": 3206 + }, + { + "epoch": 0.2591567506414271, + "grad_norm": 2.743758201599121, + "learning_rate": 9.742278250846769e-06, + "loss": 1.001, + "step": 3207 + }, + { + "epoch": 0.25923756035475465, + "grad_norm": 3.3825297355651855, + "learning_rate": 9.74207083639904e-06, + "loss": 1.0003, + "step": 3208 + }, + { + "epoch": 0.25931837006808217, + "grad_norm": 2.479440927505493, + "learning_rate": 9.741863340731054e-06, + "loss": 1.0394, + "step": 3209 + }, + { + "epoch": 0.25939917978140975, + "grad_norm": 3.305624485015869, + "learning_rate": 9.741655763846365e-06, + "loss": 0.9289, + "step": 3210 + }, + { + "epoch": 0.2594799894947373, + "grad_norm": 3.0319902896881104, + "learning_rate": 9.741448105748529e-06, + "loss": 0.9316, + "step": 3211 + }, + { + "epoch": 0.2595607992080648, + "grad_norm": 3.279977560043335, + "learning_rate": 9.7412403664411e-06, + "loss": 1.0275, + "step": 3212 + }, + { + "epoch": 0.2596416089213924, + "grad_norm": 3.2768304347991943, + "learning_rate": 9.741032545927639e-06, + "loss": 0.9945, + "step": 3213 + }, + { + "epoch": 0.2597224186347199, + "grad_norm": 3.07078218460083, + "learning_rate": 9.740824644211703e-06, + "loss": 0.9448, + "step": 3214 + }, + { + "epoch": 0.2598032283480474, + "grad_norm": 2.890254259109497, + "learning_rate": 9.740616661296853e-06, + "loss": 1.0775, + "step": 3215 + }, + { + "epoch": 0.259884038061375, + "grad_norm": 3.059772491455078, + "learning_rate": 9.740408597186655e-06, + "loss": 0.9619, + "step": 3216 + }, + { + "epoch": 0.2599648477747025, + "grad_norm": 2.7392866611480713, + "learning_rate": 9.740200451884668e-06, + "loss": 0.8956, + "step": 3217 + }, + { + "epoch": 0.26004565748803005, + "grad_norm": 3.0844335556030273, + "learning_rate": 9.739992225394459e-06, + "loss": 0.9921, + "step": 3218 + }, + { + "epoch": 0.2601264672013576, + "grad_norm": 2.98893404006958, + "learning_rate": 9.739783917719595e-06, + "loss": 1.0277, + "step": 3219 + }, + { + "epoch": 0.26020727691468515, + "grad_norm": 2.8196518421173096, + "learning_rate": 9.73957552886364e-06, + "loss": 1.0273, + "step": 3220 + }, + { + "epoch": 0.26028808662801267, + "grad_norm": 2.75636887550354, + "learning_rate": 9.739367058830169e-06, + "loss": 1.0258, + "step": 3221 + }, + { + "epoch": 0.26036889634134025, + "grad_norm": 2.8138818740844727, + "learning_rate": 9.73915850762275e-06, + "loss": 1.0349, + "step": 3222 + }, + { + "epoch": 0.2604497060546678, + "grad_norm": 3.6661946773529053, + "learning_rate": 9.738949875244953e-06, + "loss": 0.9743, + "step": 3223 + }, + { + "epoch": 0.2605305157679953, + "grad_norm": 2.9072020053863525, + "learning_rate": 9.738741161700356e-06, + "loss": 0.9443, + "step": 3224 + }, + { + "epoch": 0.2606113254813229, + "grad_norm": 3.0198700428009033, + "learning_rate": 9.738532366992528e-06, + "loss": 1.0106, + "step": 3225 + }, + { + "epoch": 0.2606921351946504, + "grad_norm": 2.8815252780914307, + "learning_rate": 9.73832349112505e-06, + "loss": 1.0824, + "step": 3226 + }, + { + "epoch": 0.2607729449079779, + "grad_norm": 2.8785507678985596, + "learning_rate": 9.738114534101498e-06, + "loss": 1.004, + "step": 3227 + }, + { + "epoch": 0.2608537546213055, + "grad_norm": 3.012033224105835, + "learning_rate": 9.737905495925448e-06, + "loss": 0.912, + "step": 3228 + }, + { + "epoch": 0.260934564334633, + "grad_norm": 3.8269202709198, + "learning_rate": 9.737696376600485e-06, + "loss": 0.9791, + "step": 3229 + }, + { + "epoch": 0.26101537404796055, + "grad_norm": 2.977001190185547, + "learning_rate": 9.737487176130189e-06, + "loss": 1.0622, + "step": 3230 + }, + { + "epoch": 0.2610961837612881, + "grad_norm": 2.966339349746704, + "learning_rate": 9.73727789451814e-06, + "loss": 1.0245, + "step": 3231 + }, + { + "epoch": 0.26117699347461565, + "grad_norm": 2.9824018478393555, + "learning_rate": 9.737068531767927e-06, + "loss": 0.9502, + "step": 3232 + }, + { + "epoch": 0.26125780318794317, + "grad_norm": 3.0262484550476074, + "learning_rate": 9.736859087883135e-06, + "loss": 1.1588, + "step": 3233 + }, + { + "epoch": 0.26133861290127075, + "grad_norm": 2.919766664505005, + "learning_rate": 9.736649562867349e-06, + "loss": 0.9691, + "step": 3234 + }, + { + "epoch": 0.2614194226145983, + "grad_norm": 2.6849515438079834, + "learning_rate": 9.73643995672416e-06, + "loss": 0.9459, + "step": 3235 + }, + { + "epoch": 0.2615002323279258, + "grad_norm": 2.936185121536255, + "learning_rate": 9.736230269457156e-06, + "loss": 0.9981, + "step": 3236 + }, + { + "epoch": 0.2615810420412534, + "grad_norm": 2.6898229122161865, + "learning_rate": 9.73602050106993e-06, + "loss": 1.0971, + "step": 3237 + }, + { + "epoch": 0.2616618517545809, + "grad_norm": 2.7416326999664307, + "learning_rate": 9.735810651566076e-06, + "loss": 1.0667, + "step": 3238 + }, + { + "epoch": 0.2617426614679084, + "grad_norm": 2.762446880340576, + "learning_rate": 9.735600720949183e-06, + "loss": 0.8635, + "step": 3239 + }, + { + "epoch": 0.261823471181236, + "grad_norm": 2.7108333110809326, + "learning_rate": 9.735390709222853e-06, + "loss": 0.9456, + "step": 3240 + }, + { + "epoch": 0.2619042808945635, + "grad_norm": 2.640698194503784, + "learning_rate": 9.735180616390678e-06, + "loss": 0.9913, + "step": 3241 + }, + { + "epoch": 0.26198509060789105, + "grad_norm": 2.8711159229278564, + "learning_rate": 9.734970442456261e-06, + "loss": 0.9699, + "step": 3242 + }, + { + "epoch": 0.2620659003212186, + "grad_norm": 2.8066492080688477, + "learning_rate": 9.734760187423198e-06, + "loss": 1.0387, + "step": 3243 + }, + { + "epoch": 0.26214671003454615, + "grad_norm": 3.017329692840576, + "learning_rate": 9.73454985129509e-06, + "loss": 0.8811, + "step": 3244 + }, + { + "epoch": 0.26222751974787367, + "grad_norm": 2.606670379638672, + "learning_rate": 9.734339434075543e-06, + "loss": 0.9169, + "step": 3245 + }, + { + "epoch": 0.26230832946120125, + "grad_norm": 2.583763360977173, + "learning_rate": 9.734128935768158e-06, + "loss": 1.0191, + "step": 3246 + }, + { + "epoch": 0.2623891391745288, + "grad_norm": 3.545943260192871, + "learning_rate": 9.733918356376542e-06, + "loss": 1.1157, + "step": 3247 + }, + { + "epoch": 0.2624699488878563, + "grad_norm": 2.810558319091797, + "learning_rate": 9.733707695904301e-06, + "loss": 0.9794, + "step": 3248 + }, + { + "epoch": 0.2625507586011839, + "grad_norm": 2.942075252532959, + "learning_rate": 9.733496954355042e-06, + "loss": 0.9981, + "step": 3249 + }, + { + "epoch": 0.2626315683145114, + "grad_norm": 2.93169903755188, + "learning_rate": 9.733286131732377e-06, + "loss": 0.9009, + "step": 3250 + }, + { + "epoch": 0.2627123780278389, + "grad_norm": 2.745704174041748, + "learning_rate": 9.733075228039914e-06, + "loss": 0.8353, + "step": 3251 + }, + { + "epoch": 0.2627931877411665, + "grad_norm": 2.6289989948272705, + "learning_rate": 9.732864243281269e-06, + "loss": 1.0759, + "step": 3252 + }, + { + "epoch": 0.262873997454494, + "grad_norm": 2.8611576557159424, + "learning_rate": 9.732653177460052e-06, + "loss": 1.0328, + "step": 3253 + }, + { + "epoch": 0.26295480716782155, + "grad_norm": 2.9202113151550293, + "learning_rate": 9.73244203057988e-06, + "loss": 1.0254, + "step": 3254 + }, + { + "epoch": 0.2630356168811491, + "grad_norm": 2.776247501373291, + "learning_rate": 9.732230802644367e-06, + "loss": 0.9879, + "step": 3255 + }, + { + "epoch": 0.26311642659447665, + "grad_norm": 3.0972414016723633, + "learning_rate": 9.732019493657134e-06, + "loss": 0.9659, + "step": 3256 + }, + { + "epoch": 0.26319723630780417, + "grad_norm": 2.735015392303467, + "learning_rate": 9.7318081036218e-06, + "loss": 0.9677, + "step": 3257 + }, + { + "epoch": 0.26327804602113175, + "grad_norm": 2.8691771030426025, + "learning_rate": 9.731596632541985e-06, + "loss": 1.1455, + "step": 3258 + }, + { + "epoch": 0.2633588557344593, + "grad_norm": 2.6793711185455322, + "learning_rate": 9.731385080421308e-06, + "loss": 0.9582, + "step": 3259 + }, + { + "epoch": 0.2634396654477868, + "grad_norm": 2.3230631351470947, + "learning_rate": 9.731173447263395e-06, + "loss": 0.9922, + "step": 3260 + }, + { + "epoch": 0.2635204751611144, + "grad_norm": 3.0409085750579834, + "learning_rate": 9.730961733071873e-06, + "loss": 0.9773, + "step": 3261 + }, + { + "epoch": 0.2636012848744419, + "grad_norm": 2.7706222534179688, + "learning_rate": 9.730749937850365e-06, + "loss": 1.031, + "step": 3262 + }, + { + "epoch": 0.2636820945877694, + "grad_norm": 3.1750383377075195, + "learning_rate": 9.730538061602497e-06, + "loss": 1.0103, + "step": 3263 + }, + { + "epoch": 0.263762904301097, + "grad_norm": 2.8183891773223877, + "learning_rate": 9.730326104331904e-06, + "loss": 1.0243, + "step": 3264 + }, + { + "epoch": 0.2638437140144245, + "grad_norm": 3.4168620109558105, + "learning_rate": 9.73011406604221e-06, + "loss": 0.9172, + "step": 3265 + }, + { + "epoch": 0.2639245237277521, + "grad_norm": 2.2862253189086914, + "learning_rate": 9.72990194673705e-06, + "loss": 1.0757, + "step": 3266 + }, + { + "epoch": 0.2640053334410796, + "grad_norm": 2.386305570602417, + "learning_rate": 9.729689746420057e-06, + "loss": 0.9463, + "step": 3267 + }, + { + "epoch": 0.26408614315440715, + "grad_norm": 2.6689112186431885, + "learning_rate": 9.729477465094866e-06, + "loss": 0.9309, + "step": 3268 + }, + { + "epoch": 0.26416695286773473, + "grad_norm": 2.803179979324341, + "learning_rate": 9.729265102765108e-06, + "loss": 0.9962, + "step": 3269 + }, + { + "epoch": 0.26424776258106225, + "grad_norm": 2.9772040843963623, + "learning_rate": 9.729052659434428e-06, + "loss": 1.0425, + "step": 3270 + }, + { + "epoch": 0.2643285722943898, + "grad_norm": 3.799191951751709, + "learning_rate": 9.728840135106458e-06, + "loss": 0.9622, + "step": 3271 + }, + { + "epoch": 0.26440938200771735, + "grad_norm": 3.263028383255005, + "learning_rate": 9.728627529784842e-06, + "loss": 0.98, + "step": 3272 + }, + { + "epoch": 0.2644901917210449, + "grad_norm": 3.127373695373535, + "learning_rate": 9.72841484347322e-06, + "loss": 1.0495, + "step": 3273 + }, + { + "epoch": 0.2645710014343724, + "grad_norm": 2.6552889347076416, + "learning_rate": 9.728202076175235e-06, + "loss": 0.993, + "step": 3274 + }, + { + "epoch": 0.2646518111477, + "grad_norm": 2.61242938041687, + "learning_rate": 9.727989227894532e-06, + "loss": 0.914, + "step": 3275 + }, + { + "epoch": 0.2647326208610275, + "grad_norm": 2.606858253479004, + "learning_rate": 9.727776298634755e-06, + "loss": 0.9245, + "step": 3276 + }, + { + "epoch": 0.264813430574355, + "grad_norm": 2.996605634689331, + "learning_rate": 9.72756328839955e-06, + "loss": 1.0037, + "step": 3277 + }, + { + "epoch": 0.2648942402876826, + "grad_norm": 2.7778725624084473, + "learning_rate": 9.72735019719257e-06, + "loss": 0.9636, + "step": 3278 + }, + { + "epoch": 0.2649750500010101, + "grad_norm": 3.145533323287964, + "learning_rate": 9.727137025017459e-06, + "loss": 0.8808, + "step": 3279 + }, + { + "epoch": 0.26505585971433765, + "grad_norm": 3.605461835861206, + "learning_rate": 9.726923771877872e-06, + "loss": 1.1387, + "step": 3280 + }, + { + "epoch": 0.26513666942766523, + "grad_norm": 3.2222542762756348, + "learning_rate": 9.72671043777746e-06, + "loss": 0.986, + "step": 3281 + }, + { + "epoch": 0.26521747914099275, + "grad_norm": 2.7679476737976074, + "learning_rate": 9.726497022719878e-06, + "loss": 1.0436, + "step": 3282 + }, + { + "epoch": 0.2652982888543203, + "grad_norm": 2.496774196624756, + "learning_rate": 9.72628352670878e-06, + "loss": 0.9885, + "step": 3283 + }, + { + "epoch": 0.26537909856764785, + "grad_norm": 2.3951852321624756, + "learning_rate": 9.726069949747823e-06, + "loss": 1.0944, + "step": 3284 + }, + { + "epoch": 0.2654599082809754, + "grad_norm": 2.877875566482544, + "learning_rate": 9.725856291840666e-06, + "loss": 1.0931, + "step": 3285 + }, + { + "epoch": 0.2655407179943029, + "grad_norm": 2.9747419357299805, + "learning_rate": 9.725642552990967e-06, + "loss": 1.045, + "step": 3286 + }, + { + "epoch": 0.2656215277076305, + "grad_norm": 3.114614963531494, + "learning_rate": 9.725428733202388e-06, + "loss": 1.0449, + "step": 3287 + }, + { + "epoch": 0.265702337420958, + "grad_norm": 2.7330870628356934, + "learning_rate": 9.725214832478591e-06, + "loss": 0.9713, + "step": 3288 + }, + { + "epoch": 0.2657831471342855, + "grad_norm": 2.685214042663574, + "learning_rate": 9.725000850823241e-06, + "loss": 0.9988, + "step": 3289 + }, + { + "epoch": 0.2658639568476131, + "grad_norm": 2.7034835815429688, + "learning_rate": 9.72478678824e-06, + "loss": 1.1093, + "step": 3290 + }, + { + "epoch": 0.2659447665609406, + "grad_norm": 2.813911199569702, + "learning_rate": 9.724572644732535e-06, + "loss": 0.9647, + "step": 3291 + }, + { + "epoch": 0.26602557627426815, + "grad_norm": 2.4258036613464355, + "learning_rate": 9.724358420304514e-06, + "loss": 0.949, + "step": 3292 + }, + { + "epoch": 0.26610638598759573, + "grad_norm": 2.711528778076172, + "learning_rate": 9.724144114959609e-06, + "loss": 0.9055, + "step": 3293 + }, + { + "epoch": 0.26618719570092325, + "grad_norm": 3.055825710296631, + "learning_rate": 9.723929728701487e-06, + "loss": 0.9706, + "step": 3294 + }, + { + "epoch": 0.2662680054142508, + "grad_norm": 2.645800828933716, + "learning_rate": 9.723715261533819e-06, + "loss": 1.0238, + "step": 3295 + }, + { + "epoch": 0.26634881512757835, + "grad_norm": 2.7595863342285156, + "learning_rate": 9.723500713460282e-06, + "loss": 1.0574, + "step": 3296 + }, + { + "epoch": 0.2664296248409059, + "grad_norm": 2.8845608234405518, + "learning_rate": 9.72328608448455e-06, + "loss": 1.0352, + "step": 3297 + }, + { + "epoch": 0.2665104345542334, + "grad_norm": 2.635479211807251, + "learning_rate": 9.7230713746103e-06, + "loss": 1.0123, + "step": 3298 + }, + { + "epoch": 0.266591244267561, + "grad_norm": 2.7326478958129883, + "learning_rate": 9.722856583841204e-06, + "loss": 1.0065, + "step": 3299 + }, + { + "epoch": 0.2666720539808885, + "grad_norm": 2.9225542545318604, + "learning_rate": 9.722641712180946e-06, + "loss": 0.9945, + "step": 3300 + }, + { + "epoch": 0.266752863694216, + "grad_norm": 3.000427007675171, + "learning_rate": 9.722426759633206e-06, + "loss": 0.9869, + "step": 3301 + }, + { + "epoch": 0.2668336734075436, + "grad_norm": 2.937549352645874, + "learning_rate": 9.722211726201663e-06, + "loss": 1.0905, + "step": 3302 + }, + { + "epoch": 0.2669144831208711, + "grad_norm": 3.3800082206726074, + "learning_rate": 9.721996611890001e-06, + "loss": 0.9916, + "step": 3303 + }, + { + "epoch": 0.26699529283419865, + "grad_norm": 2.5424742698669434, + "learning_rate": 9.721781416701906e-06, + "loss": 0.8998, + "step": 3304 + }, + { + "epoch": 0.26707610254752623, + "grad_norm": 2.817131280899048, + "learning_rate": 9.721566140641061e-06, + "loss": 1.0004, + "step": 3305 + }, + { + "epoch": 0.26715691226085375, + "grad_norm": 2.5327131748199463, + "learning_rate": 9.721350783711156e-06, + "loss": 0.9162, + "step": 3306 + }, + { + "epoch": 0.2672377219741813, + "grad_norm": 2.8354663848876953, + "learning_rate": 9.72113534591588e-06, + "loss": 0.9002, + "step": 3307 + }, + { + "epoch": 0.26731853168750885, + "grad_norm": 2.798231363296509, + "learning_rate": 9.72091982725892e-06, + "loss": 1.0898, + "step": 3308 + }, + { + "epoch": 0.2673993414008364, + "grad_norm": 2.914630174636841, + "learning_rate": 9.720704227743967e-06, + "loss": 0.9134, + "step": 3309 + }, + { + "epoch": 0.2674801511141639, + "grad_norm": 2.7227044105529785, + "learning_rate": 9.720488547374715e-06, + "loss": 0.8023, + "step": 3310 + }, + { + "epoch": 0.2675609608274915, + "grad_norm": 2.8826520442962646, + "learning_rate": 9.720272786154859e-06, + "loss": 1.0412, + "step": 3311 + }, + { + "epoch": 0.267641770540819, + "grad_norm": 3.4115779399871826, + "learning_rate": 9.720056944088095e-06, + "loss": 0.9443, + "step": 3312 + }, + { + "epoch": 0.2677225802541465, + "grad_norm": 2.8294053077697754, + "learning_rate": 9.719841021178118e-06, + "loss": 0.961, + "step": 3313 + }, + { + "epoch": 0.2678033899674741, + "grad_norm": 2.9594686031341553, + "learning_rate": 9.719625017428624e-06, + "loss": 0.9638, + "step": 3314 + }, + { + "epoch": 0.2678841996808016, + "grad_norm": 2.814594030380249, + "learning_rate": 9.71940893284332e-06, + "loss": 0.9381, + "step": 3315 + }, + { + "epoch": 0.26796500939412915, + "grad_norm": 2.804126262664795, + "learning_rate": 9.7191927674259e-06, + "loss": 0.9781, + "step": 3316 + }, + { + "epoch": 0.26804581910745673, + "grad_norm": 2.8507261276245117, + "learning_rate": 9.718976521180068e-06, + "loss": 1.0205, + "step": 3317 + }, + { + "epoch": 0.26812662882078425, + "grad_norm": 3.2076685428619385, + "learning_rate": 9.718760194109531e-06, + "loss": 1.0617, + "step": 3318 + }, + { + "epoch": 0.2682074385341118, + "grad_norm": 3.3740994930267334, + "learning_rate": 9.71854378621799e-06, + "loss": 0.9181, + "step": 3319 + }, + { + "epoch": 0.26828824824743935, + "grad_norm": 3.2359611988067627, + "learning_rate": 9.718327297509154e-06, + "loss": 1.0289, + "step": 3320 + }, + { + "epoch": 0.2683690579607669, + "grad_norm": 2.7341487407684326, + "learning_rate": 9.718110727986732e-06, + "loss": 1.0355, + "step": 3321 + }, + { + "epoch": 0.2684498676740944, + "grad_norm": 2.5096347332000732, + "learning_rate": 9.71789407765443e-06, + "loss": 1.0184, + "step": 3322 + }, + { + "epoch": 0.268530677387422, + "grad_norm": 2.4363651275634766, + "learning_rate": 9.71767734651596e-06, + "loss": 1.0611, + "step": 3323 + }, + { + "epoch": 0.2686114871007495, + "grad_norm": 2.4556515216827393, + "learning_rate": 9.717460534575034e-06, + "loss": 0.997, + "step": 3324 + }, + { + "epoch": 0.268692296814077, + "grad_norm": 2.579828977584839, + "learning_rate": 9.717243641835367e-06, + "loss": 1.0561, + "step": 3325 + }, + { + "epoch": 0.2687731065274046, + "grad_norm": 2.8750548362731934, + "learning_rate": 9.717026668300674e-06, + "loss": 1.0476, + "step": 3326 + }, + { + "epoch": 0.2688539162407321, + "grad_norm": 2.656571865081787, + "learning_rate": 9.716809613974667e-06, + "loss": 0.9635, + "step": 3327 + }, + { + "epoch": 0.2689347259540597, + "grad_norm": 2.723982572555542, + "learning_rate": 9.716592478861067e-06, + "loss": 0.9942, + "step": 3328 + }, + { + "epoch": 0.26901553566738723, + "grad_norm": 2.554208755493164, + "learning_rate": 9.716375262963595e-06, + "loss": 0.9859, + "step": 3329 + }, + { + "epoch": 0.26909634538071475, + "grad_norm": 2.954664707183838, + "learning_rate": 9.716157966285966e-06, + "loss": 0.8633, + "step": 3330 + }, + { + "epoch": 0.26917715509404233, + "grad_norm": 2.7281293869018555, + "learning_rate": 9.715940588831906e-06, + "loss": 0.9219, + "step": 3331 + }, + { + "epoch": 0.26925796480736985, + "grad_norm": 2.283665180206299, + "learning_rate": 9.715723130605139e-06, + "loss": 0.9913, + "step": 3332 + }, + { + "epoch": 0.2693387745206974, + "grad_norm": 2.8884527683258057, + "learning_rate": 9.715505591609383e-06, + "loss": 0.9713, + "step": 3333 + }, + { + "epoch": 0.26941958423402496, + "grad_norm": 3.3616936206817627, + "learning_rate": 9.715287971848373e-06, + "loss": 0.9262, + "step": 3334 + }, + { + "epoch": 0.2695003939473525, + "grad_norm": 2.526184558868408, + "learning_rate": 9.715070271325828e-06, + "loss": 0.9996, + "step": 3335 + }, + { + "epoch": 0.26958120366068, + "grad_norm": 3.252199411392212, + "learning_rate": 9.714852490045483e-06, + "loss": 0.9224, + "step": 3336 + }, + { + "epoch": 0.2696620133740076, + "grad_norm": 2.5886433124542236, + "learning_rate": 9.714634628011064e-06, + "loss": 1.0188, + "step": 3337 + }, + { + "epoch": 0.2697428230873351, + "grad_norm": 2.8822431564331055, + "learning_rate": 9.714416685226305e-06, + "loss": 1.0581, + "step": 3338 + }, + { + "epoch": 0.2698236328006626, + "grad_norm": 2.8415160179138184, + "learning_rate": 9.714198661694936e-06, + "loss": 1.0685, + "step": 3339 + }, + { + "epoch": 0.2699044425139902, + "grad_norm": 3.07682728767395, + "learning_rate": 9.713980557420693e-06, + "loss": 1.0587, + "step": 3340 + }, + { + "epoch": 0.26998525222731773, + "grad_norm": 2.992180824279785, + "learning_rate": 9.713762372407311e-06, + "loss": 1.1087, + "step": 3341 + }, + { + "epoch": 0.27006606194064525, + "grad_norm": 2.7979042530059814, + "learning_rate": 9.71354410665853e-06, + "loss": 0.979, + "step": 3342 + }, + { + "epoch": 0.27014687165397283, + "grad_norm": 2.861024856567383, + "learning_rate": 9.713325760178085e-06, + "loss": 1.041, + "step": 3343 + }, + { + "epoch": 0.27022768136730035, + "grad_norm": 2.712601900100708, + "learning_rate": 9.713107332969715e-06, + "loss": 0.957, + "step": 3344 + }, + { + "epoch": 0.2703084910806279, + "grad_norm": 2.7607052326202393, + "learning_rate": 9.712888825037164e-06, + "loss": 1.0124, + "step": 3345 + }, + { + "epoch": 0.27038930079395546, + "grad_norm": 2.5137217044830322, + "learning_rate": 9.712670236384172e-06, + "loss": 0.9161, + "step": 3346 + }, + { + "epoch": 0.270470110507283, + "grad_norm": 3.13071870803833, + "learning_rate": 9.712451567014485e-06, + "loss": 1.0568, + "step": 3347 + }, + { + "epoch": 0.2705509202206105, + "grad_norm": 2.989260673522949, + "learning_rate": 9.712232816931848e-06, + "loss": 0.9237, + "step": 3348 + }, + { + "epoch": 0.2706317299339381, + "grad_norm": 2.614856004714966, + "learning_rate": 9.712013986140006e-06, + "loss": 1.0629, + "step": 3349 + }, + { + "epoch": 0.2707125396472656, + "grad_norm": 2.7321884632110596, + "learning_rate": 9.711795074642709e-06, + "loss": 0.9444, + "step": 3350 + }, + { + "epoch": 0.2707933493605931, + "grad_norm": 2.5214805603027344, + "learning_rate": 9.711576082443705e-06, + "loss": 0.9694, + "step": 3351 + }, + { + "epoch": 0.2708741590739207, + "grad_norm": 2.974591016769409, + "learning_rate": 9.711357009546746e-06, + "loss": 1.0546, + "step": 3352 + }, + { + "epoch": 0.27095496878724823, + "grad_norm": 2.881301164627075, + "learning_rate": 9.711137855955584e-06, + "loss": 0.9214, + "step": 3353 + }, + { + "epoch": 0.27103577850057575, + "grad_norm": 3.0097289085388184, + "learning_rate": 9.71091862167397e-06, + "loss": 1.0195, + "step": 3354 + }, + { + "epoch": 0.27111658821390333, + "grad_norm": 2.7289974689483643, + "learning_rate": 9.710699306705664e-06, + "loss": 0.9618, + "step": 3355 + }, + { + "epoch": 0.27119739792723085, + "grad_norm": 2.989780902862549, + "learning_rate": 9.710479911054417e-06, + "loss": 1.045, + "step": 3356 + }, + { + "epoch": 0.2712782076405584, + "grad_norm": 2.8268089294433594, + "learning_rate": 9.71026043472399e-06, + "loss": 1.0552, + "step": 3357 + }, + { + "epoch": 0.27135901735388596, + "grad_norm": 2.8823611736297607, + "learning_rate": 9.710040877718142e-06, + "loss": 0.9612, + "step": 3358 + }, + { + "epoch": 0.2714398270672135, + "grad_norm": 2.8212215900421143, + "learning_rate": 9.709821240040632e-06, + "loss": 0.9574, + "step": 3359 + }, + { + "epoch": 0.271520636780541, + "grad_norm": 2.992547035217285, + "learning_rate": 9.709601521695223e-06, + "loss": 0.9661, + "step": 3360 + }, + { + "epoch": 0.2716014464938686, + "grad_norm": 2.427157402038574, + "learning_rate": 9.709381722685675e-06, + "loss": 0.9108, + "step": 3361 + }, + { + "epoch": 0.2716822562071961, + "grad_norm": 3.3962388038635254, + "learning_rate": 9.70916184301576e-06, + "loss": 0.9932, + "step": 3362 + }, + { + "epoch": 0.2717630659205236, + "grad_norm": 2.3203012943267822, + "learning_rate": 9.708941882689236e-06, + "loss": 1.1635, + "step": 3363 + }, + { + "epoch": 0.2718438756338512, + "grad_norm": 2.8734757900238037, + "learning_rate": 9.708721841709875e-06, + "loss": 0.9125, + "step": 3364 + }, + { + "epoch": 0.27192468534717873, + "grad_norm": 3.199871778488159, + "learning_rate": 9.708501720081445e-06, + "loss": 1.0013, + "step": 3365 + }, + { + "epoch": 0.27200549506050625, + "grad_norm": 2.4899682998657227, + "learning_rate": 9.708281517807717e-06, + "loss": 0.9772, + "step": 3366 + }, + { + "epoch": 0.27208630477383383, + "grad_norm": 2.588430643081665, + "learning_rate": 9.70806123489246e-06, + "loss": 1.0641, + "step": 3367 + }, + { + "epoch": 0.27216711448716135, + "grad_norm": 2.7496261596679688, + "learning_rate": 9.707840871339447e-06, + "loss": 1.0542, + "step": 3368 + }, + { + "epoch": 0.2722479242004889, + "grad_norm": 2.8912103176116943, + "learning_rate": 9.707620427152455e-06, + "loss": 1.1012, + "step": 3369 + }, + { + "epoch": 0.27232873391381646, + "grad_norm": 2.703918695449829, + "learning_rate": 9.707399902335258e-06, + "loss": 0.997, + "step": 3370 + }, + { + "epoch": 0.272409543627144, + "grad_norm": 2.8338420391082764, + "learning_rate": 9.707179296891633e-06, + "loss": 1.0285, + "step": 3371 + }, + { + "epoch": 0.2724903533404715, + "grad_norm": 2.996202230453491, + "learning_rate": 9.706958610825359e-06, + "loss": 0.8792, + "step": 3372 + }, + { + "epoch": 0.2725711630537991, + "grad_norm": 3.0629310607910156, + "learning_rate": 9.706737844140216e-06, + "loss": 0.9494, + "step": 3373 + }, + { + "epoch": 0.2726519727671266, + "grad_norm": 2.534231424331665, + "learning_rate": 9.706516996839983e-06, + "loss": 0.9461, + "step": 3374 + }, + { + "epoch": 0.27273278248045413, + "grad_norm": 2.7926554679870605, + "learning_rate": 9.706296068928446e-06, + "loss": 0.9521, + "step": 3375 + }, + { + "epoch": 0.2728135921937817, + "grad_norm": 2.740150213241577, + "learning_rate": 9.706075060409387e-06, + "loss": 1.0774, + "step": 3376 + }, + { + "epoch": 0.27289440190710923, + "grad_norm": 2.7504961490631104, + "learning_rate": 9.705853971286592e-06, + "loss": 1.0194, + "step": 3377 + }, + { + "epoch": 0.27297521162043675, + "grad_norm": 2.932079315185547, + "learning_rate": 9.705632801563846e-06, + "loss": 0.9249, + "step": 3378 + }, + { + "epoch": 0.27305602133376433, + "grad_norm": 3.284442901611328, + "learning_rate": 9.70541155124494e-06, + "loss": 0.9577, + "step": 3379 + }, + { + "epoch": 0.27313683104709185, + "grad_norm": 2.3821094036102295, + "learning_rate": 9.70519022033366e-06, + "loss": 1.0973, + "step": 3380 + }, + { + "epoch": 0.2732176407604194, + "grad_norm": 2.890260934829712, + "learning_rate": 9.7049688088338e-06, + "loss": 0.96, + "step": 3381 + }, + { + "epoch": 0.27329845047374696, + "grad_norm": 2.68121337890625, + "learning_rate": 9.704747316749152e-06, + "loss": 1.0355, + "step": 3382 + }, + { + "epoch": 0.2733792601870745, + "grad_norm": 2.871030330657959, + "learning_rate": 9.704525744083506e-06, + "loss": 1.0175, + "step": 3383 + }, + { + "epoch": 0.273460069900402, + "grad_norm": 2.8889451026916504, + "learning_rate": 9.704304090840662e-06, + "loss": 1.0256, + "step": 3384 + }, + { + "epoch": 0.2735408796137296, + "grad_norm": 2.9536449909210205, + "learning_rate": 9.704082357024414e-06, + "loss": 1.0409, + "step": 3385 + }, + { + "epoch": 0.2736216893270571, + "grad_norm": 3.0326268672943115, + "learning_rate": 9.703860542638558e-06, + "loss": 0.8903, + "step": 3386 + }, + { + "epoch": 0.27370249904038463, + "grad_norm": 2.743579149246216, + "learning_rate": 9.703638647686898e-06, + "loss": 1.0098, + "step": 3387 + }, + { + "epoch": 0.2737833087537122, + "grad_norm": 2.934258460998535, + "learning_rate": 9.703416672173229e-06, + "loss": 1.0523, + "step": 3388 + }, + { + "epoch": 0.27386411846703973, + "grad_norm": 2.703131675720215, + "learning_rate": 9.703194616101356e-06, + "loss": 0.9418, + "step": 3389 + }, + { + "epoch": 0.27394492818036725, + "grad_norm": 2.9281959533691406, + "learning_rate": 9.702972479475082e-06, + "loss": 0.9658, + "step": 3390 + }, + { + "epoch": 0.27402573789369483, + "grad_norm": 3.2979321479797363, + "learning_rate": 9.702750262298212e-06, + "loss": 0.9057, + "step": 3391 + }, + { + "epoch": 0.27410654760702235, + "grad_norm": 3.229180335998535, + "learning_rate": 9.70252796457455e-06, + "loss": 0.8603, + "step": 3392 + }, + { + "epoch": 0.27418735732034993, + "grad_norm": 2.976534843444824, + "learning_rate": 9.702305586307906e-06, + "loss": 0.9375, + "step": 3393 + }, + { + "epoch": 0.27426816703367746, + "grad_norm": 2.718949317932129, + "learning_rate": 9.702083127502087e-06, + "loss": 1.0614, + "step": 3394 + }, + { + "epoch": 0.274348976747005, + "grad_norm": 2.5181586742401123, + "learning_rate": 9.701860588160903e-06, + "loss": 0.9732, + "step": 3395 + }, + { + "epoch": 0.27442978646033256, + "grad_norm": 2.5775434970855713, + "learning_rate": 9.701637968288168e-06, + "loss": 1.0289, + "step": 3396 + }, + { + "epoch": 0.2745105961736601, + "grad_norm": 2.569443464279175, + "learning_rate": 9.701415267887693e-06, + "loss": 1.0845, + "step": 3397 + }, + { + "epoch": 0.2745914058869876, + "grad_norm": 2.777613639831543, + "learning_rate": 9.701192486963293e-06, + "loss": 0.9458, + "step": 3398 + }, + { + "epoch": 0.2746722156003152, + "grad_norm": 2.5597851276397705, + "learning_rate": 9.700969625518784e-06, + "loss": 1.0283, + "step": 3399 + }, + { + "epoch": 0.2747530253136427, + "grad_norm": 2.84451961517334, + "learning_rate": 9.70074668355798e-06, + "loss": 1.0286, + "step": 3400 + }, + { + "epoch": 0.27483383502697023, + "grad_norm": 3.0637969970703125, + "learning_rate": 9.700523661084703e-06, + "loss": 1.0789, + "step": 3401 + }, + { + "epoch": 0.2749146447402978, + "grad_norm": 2.773308753967285, + "learning_rate": 9.700300558102773e-06, + "loss": 0.9973, + "step": 3402 + }, + { + "epoch": 0.27499545445362533, + "grad_norm": 2.556149482727051, + "learning_rate": 9.700077374616009e-06, + "loss": 0.8536, + "step": 3403 + }, + { + "epoch": 0.27507626416695286, + "grad_norm": 3.3209409713745117, + "learning_rate": 9.699854110628233e-06, + "loss": 0.8942, + "step": 3404 + }, + { + "epoch": 0.27515707388028043, + "grad_norm": 2.7973313331604004, + "learning_rate": 9.699630766143273e-06, + "loss": 1.044, + "step": 3405 + }, + { + "epoch": 0.27523788359360796, + "grad_norm": 2.9914135932922363, + "learning_rate": 9.69940734116495e-06, + "loss": 1.0301, + "step": 3406 + }, + { + "epoch": 0.2753186933069355, + "grad_norm": 2.7159347534179688, + "learning_rate": 9.699183835697092e-06, + "loss": 0.9963, + "step": 3407 + }, + { + "epoch": 0.27539950302026306, + "grad_norm": 2.775402069091797, + "learning_rate": 9.69896024974353e-06, + "loss": 1.0171, + "step": 3408 + }, + { + "epoch": 0.2754803127335906, + "grad_norm": 3.0971827507019043, + "learning_rate": 9.69873658330809e-06, + "loss": 1.025, + "step": 3409 + }, + { + "epoch": 0.2755611224469181, + "grad_norm": 2.6890649795532227, + "learning_rate": 9.698512836394605e-06, + "loss": 0.9742, + "step": 3410 + }, + { + "epoch": 0.2756419321602457, + "grad_norm": 2.641848564147949, + "learning_rate": 9.698289009006904e-06, + "loss": 0.9539, + "step": 3411 + }, + { + "epoch": 0.2757227418735732, + "grad_norm": 2.791677713394165, + "learning_rate": 9.698065101148826e-06, + "loss": 0.9554, + "step": 3412 + }, + { + "epoch": 0.27580355158690073, + "grad_norm": 2.483377695083618, + "learning_rate": 9.697841112824202e-06, + "loss": 1.0689, + "step": 3413 + }, + { + "epoch": 0.2758843613002283, + "grad_norm": 3.3745176792144775, + "learning_rate": 9.697617044036868e-06, + "loss": 1.0128, + "step": 3414 + }, + { + "epoch": 0.27596517101355583, + "grad_norm": 2.8752527236938477, + "learning_rate": 9.697392894790665e-06, + "loss": 1.1637, + "step": 3415 + }, + { + "epoch": 0.27604598072688336, + "grad_norm": 2.905606985092163, + "learning_rate": 9.69716866508943e-06, + "loss": 0.9819, + "step": 3416 + }, + { + "epoch": 0.27612679044021093, + "grad_norm": 3.0775539875030518, + "learning_rate": 9.696944354937e-06, + "loss": 0.9637, + "step": 3417 + }, + { + "epoch": 0.27620760015353846, + "grad_norm": 3.1826045513153076, + "learning_rate": 9.696719964337224e-06, + "loss": 0.9855, + "step": 3418 + }, + { + "epoch": 0.276288409866866, + "grad_norm": 2.7648861408233643, + "learning_rate": 9.696495493293942e-06, + "loss": 0.9567, + "step": 3419 + }, + { + "epoch": 0.27636921958019356, + "grad_norm": 2.7583377361297607, + "learning_rate": 9.696270941811e-06, + "loss": 1.007, + "step": 3420 + }, + { + "epoch": 0.2764500292935211, + "grad_norm": 2.6436402797698975, + "learning_rate": 9.69604630989224e-06, + "loss": 1.01, + "step": 3421 + }, + { + "epoch": 0.2765308390068486, + "grad_norm": 3.6561508178710938, + "learning_rate": 9.695821597541512e-06, + "loss": 0.9828, + "step": 3422 + }, + { + "epoch": 0.2766116487201762, + "grad_norm": 2.9194488525390625, + "learning_rate": 9.695596804762666e-06, + "loss": 0.8796, + "step": 3423 + }, + { + "epoch": 0.2766924584335037, + "grad_norm": 2.4754891395568848, + "learning_rate": 9.69537193155955e-06, + "loss": 1.0016, + "step": 3424 + }, + { + "epoch": 0.27677326814683123, + "grad_norm": 2.4715240001678467, + "learning_rate": 9.695146977936016e-06, + "loss": 0.8923, + "step": 3425 + }, + { + "epoch": 0.2768540778601588, + "grad_norm": 2.36635422706604, + "learning_rate": 9.694921943895918e-06, + "loss": 1.0358, + "step": 3426 + }, + { + "epoch": 0.27693488757348633, + "grad_norm": 2.7679364681243896, + "learning_rate": 9.694696829443112e-06, + "loss": 1.0525, + "step": 3427 + }, + { + "epoch": 0.27701569728681386, + "grad_norm": 2.478515625, + "learning_rate": 9.694471634581447e-06, + "loss": 0.9719, + "step": 3428 + }, + { + "epoch": 0.27709650700014143, + "grad_norm": 2.82259202003479, + "learning_rate": 9.694246359314787e-06, + "loss": 1.0163, + "step": 3429 + }, + { + "epoch": 0.27717731671346896, + "grad_norm": 2.9287900924682617, + "learning_rate": 9.694021003646987e-06, + "loss": 1.013, + "step": 3430 + }, + { + "epoch": 0.2772581264267965, + "grad_norm": 2.914734363555908, + "learning_rate": 9.693795567581907e-06, + "loss": 0.9487, + "step": 3431 + }, + { + "epoch": 0.27733893614012406, + "grad_norm": 2.637640953063965, + "learning_rate": 9.693570051123412e-06, + "loss": 1.0951, + "step": 3432 + }, + { + "epoch": 0.2774197458534516, + "grad_norm": 2.958834171295166, + "learning_rate": 9.693344454275358e-06, + "loss": 0.9708, + "step": 3433 + }, + { + "epoch": 0.2775005555667791, + "grad_norm": 2.638725757598877, + "learning_rate": 9.693118777041612e-06, + "loss": 1.0823, + "step": 3434 + }, + { + "epoch": 0.2775813652801067, + "grad_norm": 2.6947219371795654, + "learning_rate": 9.692893019426042e-06, + "loss": 1.0269, + "step": 3435 + }, + { + "epoch": 0.2776621749934342, + "grad_norm": 2.7380142211914062, + "learning_rate": 9.692667181432512e-06, + "loss": 0.9439, + "step": 3436 + }, + { + "epoch": 0.27774298470676173, + "grad_norm": 2.9021363258361816, + "learning_rate": 9.692441263064889e-06, + "loss": 0.9586, + "step": 3437 + }, + { + "epoch": 0.2778237944200893, + "grad_norm": 3.3157687187194824, + "learning_rate": 9.692215264327042e-06, + "loss": 1.0625, + "step": 3438 + }, + { + "epoch": 0.27790460413341683, + "grad_norm": 2.8448753356933594, + "learning_rate": 9.691989185222847e-06, + "loss": 1.0707, + "step": 3439 + }, + { + "epoch": 0.27798541384674436, + "grad_norm": 3.3725030422210693, + "learning_rate": 9.691763025756171e-06, + "loss": 0.9813, + "step": 3440 + }, + { + "epoch": 0.27806622356007193, + "grad_norm": 2.9721672534942627, + "learning_rate": 9.691536785930891e-06, + "loss": 0.9135, + "step": 3441 + }, + { + "epoch": 0.27814703327339946, + "grad_norm": 2.909263849258423, + "learning_rate": 9.691310465750879e-06, + "loss": 1.0183, + "step": 3442 + }, + { + "epoch": 0.278227842986727, + "grad_norm": 2.8733043670654297, + "learning_rate": 9.691084065220013e-06, + "loss": 1.2139, + "step": 3443 + }, + { + "epoch": 0.27830865270005456, + "grad_norm": 2.779677152633667, + "learning_rate": 9.69085758434217e-06, + "loss": 1.0551, + "step": 3444 + }, + { + "epoch": 0.2783894624133821, + "grad_norm": 2.7430167198181152, + "learning_rate": 9.690631023121228e-06, + "loss": 0.9888, + "step": 3445 + }, + { + "epoch": 0.2784702721267096, + "grad_norm": 3.428687810897827, + "learning_rate": 9.690404381561072e-06, + "loss": 0.979, + "step": 3446 + }, + { + "epoch": 0.2785510818400372, + "grad_norm": 2.6249406337738037, + "learning_rate": 9.690177659665578e-06, + "loss": 1.055, + "step": 3447 + }, + { + "epoch": 0.2786318915533647, + "grad_norm": 2.9783623218536377, + "learning_rate": 9.689950857438632e-06, + "loss": 0.9662, + "step": 3448 + }, + { + "epoch": 0.27871270126669223, + "grad_norm": 2.5768818855285645, + "learning_rate": 9.68972397488412e-06, + "loss": 0.9955, + "step": 3449 + }, + { + "epoch": 0.2787935109800198, + "grad_norm": 2.944002389907837, + "learning_rate": 9.689497012005924e-06, + "loss": 1.1916, + "step": 3450 + }, + { + "epoch": 0.27887432069334733, + "grad_norm": 3.232891321182251, + "learning_rate": 9.689269968807936e-06, + "loss": 1.0031, + "step": 3451 + }, + { + "epoch": 0.27895513040667486, + "grad_norm": 2.5034186840057373, + "learning_rate": 9.689042845294041e-06, + "loss": 1.0152, + "step": 3452 + }, + { + "epoch": 0.27903594012000243, + "grad_norm": 3.2794289588928223, + "learning_rate": 9.688815641468131e-06, + "loss": 1.1153, + "step": 3453 + }, + { + "epoch": 0.27911674983332996, + "grad_norm": 2.6757853031158447, + "learning_rate": 9.688588357334096e-06, + "loss": 0.921, + "step": 3454 + }, + { + "epoch": 0.2791975595466575, + "grad_norm": 3.212266445159912, + "learning_rate": 9.688360992895832e-06, + "loss": 0.9788, + "step": 3455 + }, + { + "epoch": 0.27927836925998506, + "grad_norm": 2.9902377128601074, + "learning_rate": 9.68813354815723e-06, + "loss": 1.1899, + "step": 3456 + }, + { + "epoch": 0.2793591789733126, + "grad_norm": 2.3348207473754883, + "learning_rate": 9.687906023122184e-06, + "loss": 0.9246, + "step": 3457 + }, + { + "epoch": 0.27943998868664016, + "grad_norm": 3.160766363143921, + "learning_rate": 9.687678417794597e-06, + "loss": 1.1292, + "step": 3458 + }, + { + "epoch": 0.2795207983999677, + "grad_norm": 2.848845958709717, + "learning_rate": 9.687450732178363e-06, + "loss": 0.8698, + "step": 3459 + }, + { + "epoch": 0.2796016081132952, + "grad_norm": 3.4590060710906982, + "learning_rate": 9.687222966277381e-06, + "loss": 1.089, + "step": 3460 + }, + { + "epoch": 0.2796824178266228, + "grad_norm": 2.869417667388916, + "learning_rate": 9.686995120095555e-06, + "loss": 0.915, + "step": 3461 + }, + { + "epoch": 0.2797632275399503, + "grad_norm": 2.422532081604004, + "learning_rate": 9.686767193636785e-06, + "loss": 0.9609, + "step": 3462 + }, + { + "epoch": 0.27984403725327783, + "grad_norm": 3.6336019039154053, + "learning_rate": 9.686539186904977e-06, + "loss": 0.9827, + "step": 3463 + }, + { + "epoch": 0.2799248469666054, + "grad_norm": 3.015155076980591, + "learning_rate": 9.686311099904034e-06, + "loss": 1.1245, + "step": 3464 + }, + { + "epoch": 0.28000565667993293, + "grad_norm": 2.3626017570495605, + "learning_rate": 9.686082932637864e-06, + "loss": 0.9372, + "step": 3465 + }, + { + "epoch": 0.28008646639326046, + "grad_norm": 2.927370309829712, + "learning_rate": 9.685854685110376e-06, + "loss": 1.0378, + "step": 3466 + }, + { + "epoch": 0.28016727610658804, + "grad_norm": 2.8526339530944824, + "learning_rate": 9.685626357325477e-06, + "loss": 0.9278, + "step": 3467 + }, + { + "epoch": 0.28024808581991556, + "grad_norm": 2.72507643699646, + "learning_rate": 9.685397949287079e-06, + "loss": 1.044, + "step": 3468 + }, + { + "epoch": 0.2803288955332431, + "grad_norm": 2.673955202102661, + "learning_rate": 9.685169460999093e-06, + "loss": 1.0791, + "step": 3469 + }, + { + "epoch": 0.28040970524657066, + "grad_norm": 2.734114646911621, + "learning_rate": 9.684940892465434e-06, + "loss": 0.9674, + "step": 3470 + }, + { + "epoch": 0.2804905149598982, + "grad_norm": 2.4035141468048096, + "learning_rate": 9.684712243690015e-06, + "loss": 0.941, + "step": 3471 + }, + { + "epoch": 0.2805713246732257, + "grad_norm": 3.046450138092041, + "learning_rate": 9.684483514676755e-06, + "loss": 1.0485, + "step": 3472 + }, + { + "epoch": 0.2806521343865533, + "grad_norm": 2.5257482528686523, + "learning_rate": 9.684254705429568e-06, + "loss": 0.881, + "step": 3473 + }, + { + "epoch": 0.2807329440998808, + "grad_norm": 3.0747008323669434, + "learning_rate": 9.684025815952375e-06, + "loss": 1.0863, + "step": 3474 + }, + { + "epoch": 0.28081375381320833, + "grad_norm": 2.855036497116089, + "learning_rate": 9.683796846249097e-06, + "loss": 1.0607, + "step": 3475 + }, + { + "epoch": 0.2808945635265359, + "grad_norm": 2.654651403427124, + "learning_rate": 9.683567796323654e-06, + "loss": 0.9433, + "step": 3476 + }, + { + "epoch": 0.28097537323986344, + "grad_norm": 2.6321558952331543, + "learning_rate": 9.683338666179971e-06, + "loss": 0.9739, + "step": 3477 + }, + { + "epoch": 0.28105618295319096, + "grad_norm": 2.620388984680176, + "learning_rate": 9.683109455821972e-06, + "loss": 1.0461, + "step": 3478 + }, + { + "epoch": 0.28113699266651854, + "grad_norm": 2.774127244949341, + "learning_rate": 9.68288016525358e-06, + "loss": 1.0158, + "step": 3479 + }, + { + "epoch": 0.28121780237984606, + "grad_norm": 3.1548571586608887, + "learning_rate": 9.682650794478725e-06, + "loss": 0.8939, + "step": 3480 + }, + { + "epoch": 0.2812986120931736, + "grad_norm": 2.7580490112304688, + "learning_rate": 9.682421343501335e-06, + "loss": 1.0513, + "step": 3481 + }, + { + "epoch": 0.28137942180650116, + "grad_norm": 2.82216215133667, + "learning_rate": 9.68219181232534e-06, + "loss": 0.9678, + "step": 3482 + }, + { + "epoch": 0.2814602315198287, + "grad_norm": 2.7549870014190674, + "learning_rate": 9.681962200954671e-06, + "loss": 0.9741, + "step": 3483 + }, + { + "epoch": 0.2815410412331562, + "grad_norm": 2.6894516944885254, + "learning_rate": 9.68173250939326e-06, + "loss": 0.9422, + "step": 3484 + }, + { + "epoch": 0.2816218509464838, + "grad_norm": 2.5008151531219482, + "learning_rate": 9.681502737645043e-06, + "loss": 0.9906, + "step": 3485 + }, + { + "epoch": 0.2817026606598113, + "grad_norm": 2.5604753494262695, + "learning_rate": 9.681272885713955e-06, + "loss": 0.9815, + "step": 3486 + }, + { + "epoch": 0.28178347037313883, + "grad_norm": 3.2388126850128174, + "learning_rate": 9.681042953603932e-06, + "loss": 0.9839, + "step": 3487 + }, + { + "epoch": 0.2818642800864664, + "grad_norm": 2.8115487098693848, + "learning_rate": 9.68081294131891e-06, + "loss": 0.9874, + "step": 3488 + }, + { + "epoch": 0.28194508979979394, + "grad_norm": 2.4315402507781982, + "learning_rate": 9.680582848862834e-06, + "loss": 0.8547, + "step": 3489 + }, + { + "epoch": 0.28202589951312146, + "grad_norm": 2.8511130809783936, + "learning_rate": 9.680352676239641e-06, + "loss": 1.012, + "step": 3490 + }, + { + "epoch": 0.28210670922644904, + "grad_norm": 3.0524845123291016, + "learning_rate": 9.680122423453272e-06, + "loss": 0.9357, + "step": 3491 + }, + { + "epoch": 0.28218751893977656, + "grad_norm": 3.1882684230804443, + "learning_rate": 9.679892090507676e-06, + "loss": 1.0719, + "step": 3492 + }, + { + "epoch": 0.2822683286531041, + "grad_norm": 2.989455461502075, + "learning_rate": 9.679661677406793e-06, + "loss": 1.0933, + "step": 3493 + }, + { + "epoch": 0.28234913836643166, + "grad_norm": 2.903139114379883, + "learning_rate": 9.679431184154572e-06, + "loss": 0.8652, + "step": 3494 + }, + { + "epoch": 0.2824299480797592, + "grad_norm": 2.8893330097198486, + "learning_rate": 9.67920061075496e-06, + "loss": 0.9554, + "step": 3495 + }, + { + "epoch": 0.2825107577930867, + "grad_norm": 2.6712095737457275, + "learning_rate": 9.678969957211905e-06, + "loss": 0.9762, + "step": 3496 + }, + { + "epoch": 0.2825915675064143, + "grad_norm": 2.954951763153076, + "learning_rate": 9.67873922352936e-06, + "loss": 0.964, + "step": 3497 + }, + { + "epoch": 0.2826723772197418, + "grad_norm": 2.700484037399292, + "learning_rate": 9.678508409711276e-06, + "loss": 1.0128, + "step": 3498 + }, + { + "epoch": 0.28275318693306933, + "grad_norm": 2.642106533050537, + "learning_rate": 9.678277515761605e-06, + "loss": 0.9558, + "step": 3499 + }, + { + "epoch": 0.2828339966463969, + "grad_norm": 2.848428964614868, + "learning_rate": 9.678046541684302e-06, + "loss": 1.0457, + "step": 3500 + }, + { + "epoch": 0.28291480635972444, + "grad_norm": 2.7813591957092285, + "learning_rate": 9.677815487483326e-06, + "loss": 0.9187, + "step": 3501 + }, + { + "epoch": 0.28299561607305196, + "grad_norm": 2.9099740982055664, + "learning_rate": 9.67758435316263e-06, + "loss": 1.1141, + "step": 3502 + }, + { + "epoch": 0.28307642578637954, + "grad_norm": 2.7438671588897705, + "learning_rate": 9.677353138726177e-06, + "loss": 0.9629, + "step": 3503 + }, + { + "epoch": 0.28315723549970706, + "grad_norm": 2.780836820602417, + "learning_rate": 9.677121844177923e-06, + "loss": 0.9671, + "step": 3504 + }, + { + "epoch": 0.2832380452130346, + "grad_norm": 2.482347011566162, + "learning_rate": 9.676890469521833e-06, + "loss": 1.0432, + "step": 3505 + }, + { + "epoch": 0.28331885492636216, + "grad_norm": 2.5355114936828613, + "learning_rate": 9.676659014761868e-06, + "loss": 1.0369, + "step": 3506 + }, + { + "epoch": 0.2833996646396897, + "grad_norm": 2.961168050765991, + "learning_rate": 9.676427479901991e-06, + "loss": 1.0697, + "step": 3507 + }, + { + "epoch": 0.2834804743530172, + "grad_norm": 2.63830828666687, + "learning_rate": 9.676195864946171e-06, + "loss": 1.0377, + "step": 3508 + }, + { + "epoch": 0.2835612840663448, + "grad_norm": 3.5168895721435547, + "learning_rate": 9.675964169898373e-06, + "loss": 0.9061, + "step": 3509 + }, + { + "epoch": 0.2836420937796723, + "grad_norm": 2.7696313858032227, + "learning_rate": 9.675732394762567e-06, + "loss": 0.9532, + "step": 3510 + }, + { + "epoch": 0.28372290349299983, + "grad_norm": 2.9334309101104736, + "learning_rate": 9.675500539542719e-06, + "loss": 1.0098, + "step": 3511 + }, + { + "epoch": 0.2838037132063274, + "grad_norm": 2.6671390533447266, + "learning_rate": 9.675268604242804e-06, + "loss": 1.0216, + "step": 3512 + }, + { + "epoch": 0.28388452291965494, + "grad_norm": 2.7944910526275635, + "learning_rate": 9.675036588866793e-06, + "loss": 0.9303, + "step": 3513 + }, + { + "epoch": 0.28396533263298246, + "grad_norm": 2.7365944385528564, + "learning_rate": 9.674804493418659e-06, + "loss": 0.9002, + "step": 3514 + }, + { + "epoch": 0.28404614234631004, + "grad_norm": 3.015563726425171, + "learning_rate": 9.67457231790238e-06, + "loss": 0.9412, + "step": 3515 + }, + { + "epoch": 0.28412695205963756, + "grad_norm": 2.741112470626831, + "learning_rate": 9.674340062321929e-06, + "loss": 0.9404, + "step": 3516 + }, + { + "epoch": 0.2842077617729651, + "grad_norm": 2.7046263217926025, + "learning_rate": 9.674107726681285e-06, + "loss": 0.9861, + "step": 3517 + }, + { + "epoch": 0.28428857148629266, + "grad_norm": 2.813877582550049, + "learning_rate": 9.67387531098443e-06, + "loss": 1.0162, + "step": 3518 + }, + { + "epoch": 0.2843693811996202, + "grad_norm": 2.9360432624816895, + "learning_rate": 9.673642815235342e-06, + "loss": 1.0211, + "step": 3519 + }, + { + "epoch": 0.2844501909129477, + "grad_norm": 2.8569142818450928, + "learning_rate": 9.673410239438007e-06, + "loss": 1.0572, + "step": 3520 + }, + { + "epoch": 0.2845310006262753, + "grad_norm": 2.4324591159820557, + "learning_rate": 9.6731775835964e-06, + "loss": 0.9912, + "step": 3521 + }, + { + "epoch": 0.2846118103396028, + "grad_norm": 2.9998257160186768, + "learning_rate": 9.672944847714515e-06, + "loss": 0.9571, + "step": 3522 + }, + { + "epoch": 0.2846926200529304, + "grad_norm": 2.362875461578369, + "learning_rate": 9.672712031796332e-06, + "loss": 0.9492, + "step": 3523 + }, + { + "epoch": 0.2847734297662579, + "grad_norm": 2.6290156841278076, + "learning_rate": 9.672479135845843e-06, + "loss": 0.9707, + "step": 3524 + }, + { + "epoch": 0.28485423947958544, + "grad_norm": 2.2613868713378906, + "learning_rate": 9.672246159867033e-06, + "loss": 0.9548, + "step": 3525 + }, + { + "epoch": 0.284935049192913, + "grad_norm": 2.750572443008423, + "learning_rate": 9.672013103863895e-06, + "loss": 1.036, + "step": 3526 + }, + { + "epoch": 0.28501585890624054, + "grad_norm": 2.6883738040924072, + "learning_rate": 9.671779967840422e-06, + "loss": 0.9813, + "step": 3527 + }, + { + "epoch": 0.28509666861956806, + "grad_norm": 3.0548415184020996, + "learning_rate": 9.671546751800602e-06, + "loss": 1.0667, + "step": 3528 + }, + { + "epoch": 0.28517747833289564, + "grad_norm": 2.81595516204834, + "learning_rate": 9.671313455748434e-06, + "loss": 0.8933, + "step": 3529 + }, + { + "epoch": 0.28525828804622316, + "grad_norm": 2.566643238067627, + "learning_rate": 9.671080079687913e-06, + "loss": 1.1025, + "step": 3530 + }, + { + "epoch": 0.2853390977595507, + "grad_norm": 3.4577713012695312, + "learning_rate": 9.670846623623033e-06, + "loss": 0.96, + "step": 3531 + }, + { + "epoch": 0.28541990747287826, + "grad_norm": 3.4869630336761475, + "learning_rate": 9.670613087557797e-06, + "loss": 1.1067, + "step": 3532 + }, + { + "epoch": 0.2855007171862058, + "grad_norm": 2.777127981185913, + "learning_rate": 9.670379471496203e-06, + "loss": 0.9792, + "step": 3533 + }, + { + "epoch": 0.2855815268995333, + "grad_norm": 2.815192699432373, + "learning_rate": 9.67014577544225e-06, + "loss": 1.097, + "step": 3534 + }, + { + "epoch": 0.2856623366128609, + "grad_norm": 3.0062084197998047, + "learning_rate": 9.669911999399945e-06, + "loss": 1.0119, + "step": 3535 + }, + { + "epoch": 0.2857431463261884, + "grad_norm": 3.3104159832000732, + "learning_rate": 9.669678143373289e-06, + "loss": 0.8744, + "step": 3536 + }, + { + "epoch": 0.28582395603951594, + "grad_norm": 2.381932497024536, + "learning_rate": 9.669444207366288e-06, + "loss": 0.996, + "step": 3537 + }, + { + "epoch": 0.2859047657528435, + "grad_norm": 2.455183506011963, + "learning_rate": 9.669210191382949e-06, + "loss": 1.0022, + "step": 3538 + }, + { + "epoch": 0.28598557546617104, + "grad_norm": 2.8171143531799316, + "learning_rate": 9.66897609542728e-06, + "loss": 0.918, + "step": 3539 + }, + { + "epoch": 0.28606638517949856, + "grad_norm": 2.676234006881714, + "learning_rate": 9.66874191950329e-06, + "loss": 0.9389, + "step": 3540 + }, + { + "epoch": 0.28614719489282614, + "grad_norm": 2.7240962982177734, + "learning_rate": 9.668507663614993e-06, + "loss": 0.9976, + "step": 3541 + }, + { + "epoch": 0.28622800460615366, + "grad_norm": 3.261559009552002, + "learning_rate": 9.668273327766395e-06, + "loss": 0.9515, + "step": 3542 + }, + { + "epoch": 0.2863088143194812, + "grad_norm": 3.1276698112487793, + "learning_rate": 9.668038911961516e-06, + "loss": 1.0021, + "step": 3543 + }, + { + "epoch": 0.28638962403280877, + "grad_norm": 2.76226544380188, + "learning_rate": 9.667804416204367e-06, + "loss": 0.949, + "step": 3544 + }, + { + "epoch": 0.2864704337461363, + "grad_norm": 2.9888455867767334, + "learning_rate": 9.667569840498966e-06, + "loss": 0.9896, + "step": 3545 + }, + { + "epoch": 0.2865512434594638, + "grad_norm": 2.889864206314087, + "learning_rate": 9.667335184849332e-06, + "loss": 1.0067, + "step": 3546 + }, + { + "epoch": 0.2866320531727914, + "grad_norm": 2.665231466293335, + "learning_rate": 9.66710044925948e-06, + "loss": 0.9624, + "step": 3547 + }, + { + "epoch": 0.2867128628861189, + "grad_norm": 2.893362045288086, + "learning_rate": 9.666865633733434e-06, + "loss": 0.8477, + "step": 3548 + }, + { + "epoch": 0.28679367259944644, + "grad_norm": 2.84060001373291, + "learning_rate": 9.666630738275213e-06, + "loss": 1.0926, + "step": 3549 + }, + { + "epoch": 0.286874482312774, + "grad_norm": 2.3701794147491455, + "learning_rate": 9.666395762888844e-06, + "loss": 0.9602, + "step": 3550 + }, + { + "epoch": 0.28695529202610154, + "grad_norm": 2.820270299911499, + "learning_rate": 9.666160707578349e-06, + "loss": 1.0241, + "step": 3551 + }, + { + "epoch": 0.28703610173942906, + "grad_norm": 2.936659336090088, + "learning_rate": 9.665925572347754e-06, + "loss": 0.9598, + "step": 3552 + }, + { + "epoch": 0.28711691145275664, + "grad_norm": 2.8876888751983643, + "learning_rate": 9.665690357201087e-06, + "loss": 0.9806, + "step": 3553 + }, + { + "epoch": 0.28719772116608416, + "grad_norm": 2.5708611011505127, + "learning_rate": 9.665455062142377e-06, + "loss": 1.006, + "step": 3554 + }, + { + "epoch": 0.2872785308794117, + "grad_norm": 2.5681517124176025, + "learning_rate": 9.665219687175652e-06, + "loss": 1.0493, + "step": 3555 + }, + { + "epoch": 0.28735934059273927, + "grad_norm": 2.6039462089538574, + "learning_rate": 9.664984232304946e-06, + "loss": 0.9338, + "step": 3556 + }, + { + "epoch": 0.2874401503060668, + "grad_norm": 2.749727964401245, + "learning_rate": 9.66474869753429e-06, + "loss": 0.9797, + "step": 3557 + }, + { + "epoch": 0.2875209600193943, + "grad_norm": 2.9040639400482178, + "learning_rate": 9.66451308286772e-06, + "loss": 1.0086, + "step": 3558 + }, + { + "epoch": 0.2876017697327219, + "grad_norm": 2.893544912338257, + "learning_rate": 9.664277388309268e-06, + "loss": 0.9645, + "step": 3559 + }, + { + "epoch": 0.2876825794460494, + "grad_norm": 2.9192464351654053, + "learning_rate": 9.664041613862973e-06, + "loss": 0.9184, + "step": 3560 + }, + { + "epoch": 0.28776338915937694, + "grad_norm": 2.6505277156829834, + "learning_rate": 9.663805759532876e-06, + "loss": 0.9097, + "step": 3561 + }, + { + "epoch": 0.2878441988727045, + "grad_norm": 2.486187219619751, + "learning_rate": 9.663569825323012e-06, + "loss": 1.0441, + "step": 3562 + }, + { + "epoch": 0.28792500858603204, + "grad_norm": 2.4875733852386475, + "learning_rate": 9.663333811237426e-06, + "loss": 0.9103, + "step": 3563 + }, + { + "epoch": 0.28800581829935956, + "grad_norm": 2.6382365226745605, + "learning_rate": 9.663097717280157e-06, + "loss": 0.8666, + "step": 3564 + }, + { + "epoch": 0.28808662801268714, + "grad_norm": 3.0614850521087646, + "learning_rate": 9.662861543455248e-06, + "loss": 1.0001, + "step": 3565 + }, + { + "epoch": 0.28816743772601466, + "grad_norm": 2.5728049278259277, + "learning_rate": 9.662625289766749e-06, + "loss": 0.9767, + "step": 3566 + }, + { + "epoch": 0.2882482474393422, + "grad_norm": 2.496704339981079, + "learning_rate": 9.662388956218702e-06, + "loss": 1.0522, + "step": 3567 + }, + { + "epoch": 0.28832905715266977, + "grad_norm": 2.6987080574035645, + "learning_rate": 9.662152542815158e-06, + "loss": 0.8845, + "step": 3568 + }, + { + "epoch": 0.2884098668659973, + "grad_norm": 2.947056770324707, + "learning_rate": 9.661916049560162e-06, + "loss": 1.0148, + "step": 3569 + }, + { + "epoch": 0.2884906765793248, + "grad_norm": 3.203179359436035, + "learning_rate": 9.661679476457771e-06, + "loss": 0.9221, + "step": 3570 + }, + { + "epoch": 0.2885714862926524, + "grad_norm": 3.0656864643096924, + "learning_rate": 9.66144282351203e-06, + "loss": 1.0925, + "step": 3571 + }, + { + "epoch": 0.2886522960059799, + "grad_norm": 3.0500292778015137, + "learning_rate": 9.661206090726996e-06, + "loss": 0.9946, + "step": 3572 + }, + { + "epoch": 0.28873310571930744, + "grad_norm": 2.9077112674713135, + "learning_rate": 9.660969278106724e-06, + "loss": 0.8432, + "step": 3573 + }, + { + "epoch": 0.288813915432635, + "grad_norm": 3.2144157886505127, + "learning_rate": 9.66073238565527e-06, + "loss": 1.082, + "step": 3574 + }, + { + "epoch": 0.28889472514596254, + "grad_norm": 2.6139988899230957, + "learning_rate": 9.660495413376688e-06, + "loss": 0.9484, + "step": 3575 + }, + { + "epoch": 0.28897553485929006, + "grad_norm": 3.0894522666931152, + "learning_rate": 9.66025836127504e-06, + "loss": 0.8584, + "step": 3576 + }, + { + "epoch": 0.28905634457261764, + "grad_norm": 2.5344035625457764, + "learning_rate": 9.660021229354386e-06, + "loss": 0.9804, + "step": 3577 + }, + { + "epoch": 0.28913715428594516, + "grad_norm": 2.615623712539673, + "learning_rate": 9.659784017618787e-06, + "loss": 0.9411, + "step": 3578 + }, + { + "epoch": 0.2892179639992727, + "grad_norm": 2.9990971088409424, + "learning_rate": 9.659546726072306e-06, + "loss": 1.0582, + "step": 3579 + }, + { + "epoch": 0.28929877371260027, + "grad_norm": 2.8509674072265625, + "learning_rate": 9.659309354719005e-06, + "loss": 0.9297, + "step": 3580 + }, + { + "epoch": 0.2893795834259278, + "grad_norm": 2.2842657566070557, + "learning_rate": 9.659071903562953e-06, + "loss": 1.0806, + "step": 3581 + }, + { + "epoch": 0.2894603931392553, + "grad_norm": 2.9615931510925293, + "learning_rate": 9.658834372608216e-06, + "loss": 0.9645, + "step": 3582 + }, + { + "epoch": 0.2895412028525829, + "grad_norm": 2.4293406009674072, + "learning_rate": 9.65859676185886e-06, + "loss": 1.0036, + "step": 3583 + }, + { + "epoch": 0.2896220125659104, + "grad_norm": 3.4946937561035156, + "learning_rate": 9.65835907131896e-06, + "loss": 0.844, + "step": 3584 + }, + { + "epoch": 0.28970282227923794, + "grad_norm": 2.9931979179382324, + "learning_rate": 9.65812130099258e-06, + "loss": 0.8941, + "step": 3585 + }, + { + "epoch": 0.2897836319925655, + "grad_norm": 3.216002941131592, + "learning_rate": 9.657883450883798e-06, + "loss": 1.0303, + "step": 3586 + }, + { + "epoch": 0.28986444170589304, + "grad_norm": 2.9423599243164062, + "learning_rate": 9.657645520996686e-06, + "loss": 0.9116, + "step": 3587 + }, + { + "epoch": 0.2899452514192206, + "grad_norm": 2.7516708374023438, + "learning_rate": 9.657407511335319e-06, + "loss": 0.9291, + "step": 3588 + }, + { + "epoch": 0.29002606113254814, + "grad_norm": 3.6603939533233643, + "learning_rate": 9.657169421903772e-06, + "loss": 1.0703, + "step": 3589 + }, + { + "epoch": 0.29010687084587566, + "grad_norm": 2.783923625946045, + "learning_rate": 9.656931252706126e-06, + "loss": 1.0308, + "step": 3590 + }, + { + "epoch": 0.29018768055920324, + "grad_norm": 2.6937367916107178, + "learning_rate": 9.656693003746458e-06, + "loss": 0.8576, + "step": 3591 + }, + { + "epoch": 0.29026849027253077, + "grad_norm": 2.9233529567718506, + "learning_rate": 9.65645467502885e-06, + "loss": 1.0536, + "step": 3592 + }, + { + "epoch": 0.2903492999858583, + "grad_norm": 2.8620266914367676, + "learning_rate": 9.656216266557384e-06, + "loss": 0.9495, + "step": 3593 + }, + { + "epoch": 0.29043010969918587, + "grad_norm": 2.6702728271484375, + "learning_rate": 9.655977778336142e-06, + "loss": 1.0955, + "step": 3594 + }, + { + "epoch": 0.2905109194125134, + "grad_norm": 2.7964835166931152, + "learning_rate": 9.655739210369208e-06, + "loss": 0.961, + "step": 3595 + }, + { + "epoch": 0.2905917291258409, + "grad_norm": 3.0626680850982666, + "learning_rate": 9.65550056266067e-06, + "loss": 0.8856, + "step": 3596 + }, + { + "epoch": 0.2906725388391685, + "grad_norm": 3.4128012657165527, + "learning_rate": 9.655261835214617e-06, + "loss": 0.9221, + "step": 3597 + }, + { + "epoch": 0.290753348552496, + "grad_norm": 2.408466339111328, + "learning_rate": 9.655023028035135e-06, + "loss": 1.0366, + "step": 3598 + }, + { + "epoch": 0.29083415826582354, + "grad_norm": 2.675818920135498, + "learning_rate": 9.654784141126315e-06, + "loss": 0.9772, + "step": 3599 + }, + { + "epoch": 0.2909149679791511, + "grad_norm": 2.5891449451446533, + "learning_rate": 9.654545174492248e-06, + "loss": 1.0234, + "step": 3600 + }, + { + "epoch": 0.29099577769247864, + "grad_norm": 2.5514161586761475, + "learning_rate": 9.654306128137028e-06, + "loss": 1.1006, + "step": 3601 + }, + { + "epoch": 0.29107658740580616, + "grad_norm": 2.697810173034668, + "learning_rate": 9.65406700206475e-06, + "loss": 1.0898, + "step": 3602 + }, + { + "epoch": 0.29115739711913374, + "grad_norm": 2.2984206676483154, + "learning_rate": 9.653827796279507e-06, + "loss": 1.0286, + "step": 3603 + }, + { + "epoch": 0.29123820683246127, + "grad_norm": 2.5531578063964844, + "learning_rate": 9.653588510785398e-06, + "loss": 1.0689, + "step": 3604 + }, + { + "epoch": 0.2913190165457888, + "grad_norm": 2.9822959899902344, + "learning_rate": 9.65334914558652e-06, + "loss": 0.9697, + "step": 3605 + }, + { + "epoch": 0.29139982625911637, + "grad_norm": 2.4546971321105957, + "learning_rate": 9.653109700686974e-06, + "loss": 1.0312, + "step": 3606 + }, + { + "epoch": 0.2914806359724439, + "grad_norm": 2.8869683742523193, + "learning_rate": 9.652870176090862e-06, + "loss": 1.0018, + "step": 3607 + }, + { + "epoch": 0.2915614456857714, + "grad_norm": 2.554905891418457, + "learning_rate": 9.652630571802283e-06, + "loss": 0.8601, + "step": 3608 + }, + { + "epoch": 0.291642255399099, + "grad_norm": 2.709362030029297, + "learning_rate": 9.652390887825344e-06, + "loss": 0.961, + "step": 3609 + }, + { + "epoch": 0.2917230651124265, + "grad_norm": 3.170376777648926, + "learning_rate": 9.65215112416415e-06, + "loss": 1.0732, + "step": 3610 + }, + { + "epoch": 0.29180387482575404, + "grad_norm": 2.6856372356414795, + "learning_rate": 9.651911280822806e-06, + "loss": 0.9184, + "step": 3611 + }, + { + "epoch": 0.2918846845390816, + "grad_norm": 2.742732048034668, + "learning_rate": 9.651671357805421e-06, + "loss": 0.8924, + "step": 3612 + }, + { + "epoch": 0.29196549425240914, + "grad_norm": 3.003530740737915, + "learning_rate": 9.651431355116105e-06, + "loss": 1.0086, + "step": 3613 + }, + { + "epoch": 0.29204630396573666, + "grad_norm": 2.8207788467407227, + "learning_rate": 9.651191272758967e-06, + "loss": 1.0997, + "step": 3614 + }, + { + "epoch": 0.29212711367906424, + "grad_norm": 3.3507986068725586, + "learning_rate": 9.65095111073812e-06, + "loss": 0.8922, + "step": 3615 + }, + { + "epoch": 0.29220792339239177, + "grad_norm": 2.6607301235198975, + "learning_rate": 9.650710869057675e-06, + "loss": 1.0341, + "step": 3616 + }, + { + "epoch": 0.2922887331057193, + "grad_norm": 2.9044384956359863, + "learning_rate": 9.650470547721753e-06, + "loss": 0.9735, + "step": 3617 + }, + { + "epoch": 0.29236954281904687, + "grad_norm": 2.7206690311431885, + "learning_rate": 9.650230146734463e-06, + "loss": 0.9322, + "step": 3618 + }, + { + "epoch": 0.2924503525323744, + "grad_norm": 2.93376088142395, + "learning_rate": 9.649989666099926e-06, + "loss": 0.9781, + "step": 3619 + }, + { + "epoch": 0.2925311622457019, + "grad_norm": 2.40251088142395, + "learning_rate": 9.649749105822261e-06, + "loss": 1.1007, + "step": 3620 + }, + { + "epoch": 0.2926119719590295, + "grad_norm": 2.493821859359741, + "learning_rate": 9.649508465905589e-06, + "loss": 1.0024, + "step": 3621 + }, + { + "epoch": 0.292692781672357, + "grad_norm": 2.7107715606689453, + "learning_rate": 9.649267746354027e-06, + "loss": 1.0928, + "step": 3622 + }, + { + "epoch": 0.29277359138568454, + "grad_norm": 3.0153157711029053, + "learning_rate": 9.649026947171703e-06, + "loss": 1.0024, + "step": 3623 + }, + { + "epoch": 0.2928544010990121, + "grad_norm": 2.727877378463745, + "learning_rate": 9.64878606836274e-06, + "loss": 1.0563, + "step": 3624 + }, + { + "epoch": 0.29293521081233964, + "grad_norm": 2.567570686340332, + "learning_rate": 9.648545109931262e-06, + "loss": 0.9565, + "step": 3625 + }, + { + "epoch": 0.29301602052566716, + "grad_norm": 3.3060126304626465, + "learning_rate": 9.648304071881398e-06, + "loss": 1.1092, + "step": 3626 + }, + { + "epoch": 0.29309683023899474, + "grad_norm": 2.8973357677459717, + "learning_rate": 9.648062954217275e-06, + "loss": 0.9373, + "step": 3627 + }, + { + "epoch": 0.29317763995232227, + "grad_norm": 2.6033413410186768, + "learning_rate": 9.647821756943023e-06, + "loss": 1.1346, + "step": 3628 + }, + { + "epoch": 0.2932584496656498, + "grad_norm": 2.7342493534088135, + "learning_rate": 9.647580480062775e-06, + "loss": 0.9699, + "step": 3629 + }, + { + "epoch": 0.29333925937897737, + "grad_norm": 2.461733818054199, + "learning_rate": 9.647339123580662e-06, + "loss": 1.0773, + "step": 3630 + }, + { + "epoch": 0.2934200690923049, + "grad_norm": 3.093608856201172, + "learning_rate": 9.647097687500815e-06, + "loss": 1.1015, + "step": 3631 + }, + { + "epoch": 0.2935008788056324, + "grad_norm": 2.7451560497283936, + "learning_rate": 9.646856171827374e-06, + "loss": 0.9776, + "step": 3632 + }, + { + "epoch": 0.29358168851896, + "grad_norm": 3.1740598678588867, + "learning_rate": 9.646614576564475e-06, + "loss": 0.8478, + "step": 3633 + }, + { + "epoch": 0.2936624982322875, + "grad_norm": 2.531581401824951, + "learning_rate": 9.646372901716252e-06, + "loss": 1.043, + "step": 3634 + }, + { + "epoch": 0.29374330794561504, + "grad_norm": 2.956531047821045, + "learning_rate": 9.646131147286848e-06, + "loss": 0.9862, + "step": 3635 + }, + { + "epoch": 0.2938241176589426, + "grad_norm": 2.895075798034668, + "learning_rate": 9.645889313280403e-06, + "loss": 0.892, + "step": 3636 + }, + { + "epoch": 0.29390492737227014, + "grad_norm": 3.3034794330596924, + "learning_rate": 9.645647399701058e-06, + "loss": 0.8616, + "step": 3637 + }, + { + "epoch": 0.29398573708559766, + "grad_norm": 2.8100292682647705, + "learning_rate": 9.645405406552956e-06, + "loss": 0.9875, + "step": 3638 + }, + { + "epoch": 0.29406654679892524, + "grad_norm": 2.9189043045043945, + "learning_rate": 9.645163333840244e-06, + "loss": 0.9983, + "step": 3639 + }, + { + "epoch": 0.29414735651225277, + "grad_norm": 2.8886024951934814, + "learning_rate": 9.644921181567068e-06, + "loss": 0.9779, + "step": 3640 + }, + { + "epoch": 0.2942281662255803, + "grad_norm": 3.025773048400879, + "learning_rate": 9.644678949737573e-06, + "loss": 1.0725, + "step": 3641 + }, + { + "epoch": 0.29430897593890787, + "grad_norm": 2.791072130203247, + "learning_rate": 9.64443663835591e-06, + "loss": 1.0175, + "step": 3642 + }, + { + "epoch": 0.2943897856522354, + "grad_norm": 2.844024896621704, + "learning_rate": 9.644194247426227e-06, + "loss": 0.9882, + "step": 3643 + }, + { + "epoch": 0.2944705953655629, + "grad_norm": 2.7305147647857666, + "learning_rate": 9.643951776952677e-06, + "loss": 0.943, + "step": 3644 + }, + { + "epoch": 0.2945514050788905, + "grad_norm": 2.6960883140563965, + "learning_rate": 9.643709226939414e-06, + "loss": 0.9377, + "step": 3645 + }, + { + "epoch": 0.294632214792218, + "grad_norm": 2.5656726360321045, + "learning_rate": 9.643466597390591e-06, + "loss": 0.9167, + "step": 3646 + }, + { + "epoch": 0.29471302450554554, + "grad_norm": 2.8714895248413086, + "learning_rate": 9.643223888310363e-06, + "loss": 0.9791, + "step": 3647 + }, + { + "epoch": 0.2947938342188731, + "grad_norm": 3.0841047763824463, + "learning_rate": 9.642981099702888e-06, + "loss": 1.0111, + "step": 3648 + }, + { + "epoch": 0.29487464393220064, + "grad_norm": 3.318530321121216, + "learning_rate": 9.642738231572327e-06, + "loss": 0.9167, + "step": 3649 + }, + { + "epoch": 0.29495545364552817, + "grad_norm": 3.0374815464019775, + "learning_rate": 9.642495283922834e-06, + "loss": 0.9001, + "step": 3650 + }, + { + "epoch": 0.29503626335885574, + "grad_norm": 2.714785575866699, + "learning_rate": 9.642252256758573e-06, + "loss": 0.9983, + "step": 3651 + }, + { + "epoch": 0.29511707307218327, + "grad_norm": 2.6203105449676514, + "learning_rate": 9.64200915008371e-06, + "loss": 1.1392, + "step": 3652 + }, + { + "epoch": 0.29519788278551085, + "grad_norm": 3.2234745025634766, + "learning_rate": 9.6417659639024e-06, + "loss": 1.1179, + "step": 3653 + }, + { + "epoch": 0.29527869249883837, + "grad_norm": 3.3086729049682617, + "learning_rate": 9.641522698218817e-06, + "loss": 0.9776, + "step": 3654 + }, + { + "epoch": 0.2953595022121659, + "grad_norm": 2.2480361461639404, + "learning_rate": 9.641279353037125e-06, + "loss": 0.9454, + "step": 3655 + }, + { + "epoch": 0.29544031192549347, + "grad_norm": 2.897298812866211, + "learning_rate": 9.64103592836149e-06, + "loss": 0.9071, + "step": 3656 + }, + { + "epoch": 0.295521121638821, + "grad_norm": 3.1581902503967285, + "learning_rate": 9.640792424196081e-06, + "loss": 0.9598, + "step": 3657 + }, + { + "epoch": 0.2956019313521485, + "grad_norm": 3.033461332321167, + "learning_rate": 9.640548840545071e-06, + "loss": 0.9439, + "step": 3658 + }, + { + "epoch": 0.2956827410654761, + "grad_norm": 2.8116235733032227, + "learning_rate": 9.640305177412633e-06, + "loss": 0.9067, + "step": 3659 + }, + { + "epoch": 0.2957635507788036, + "grad_norm": 2.388702869415283, + "learning_rate": 9.640061434802936e-06, + "loss": 1.0109, + "step": 3660 + }, + { + "epoch": 0.29584436049213114, + "grad_norm": 2.5147347450256348, + "learning_rate": 9.63981761272016e-06, + "loss": 1.0931, + "step": 3661 + }, + { + "epoch": 0.2959251702054587, + "grad_norm": 2.6969945430755615, + "learning_rate": 9.639573711168476e-06, + "loss": 1.0101, + "step": 3662 + }, + { + "epoch": 0.29600597991878624, + "grad_norm": 2.7561285495758057, + "learning_rate": 9.639329730152062e-06, + "loss": 0.9888, + "step": 3663 + }, + { + "epoch": 0.29608678963211377, + "grad_norm": 2.8578975200653076, + "learning_rate": 9.639085669675102e-06, + "loss": 0.9464, + "step": 3664 + }, + { + "epoch": 0.29616759934544135, + "grad_norm": 2.514591932296753, + "learning_rate": 9.63884152974177e-06, + "loss": 0.8707, + "step": 3665 + }, + { + "epoch": 0.29624840905876887, + "grad_norm": 2.9842793941497803, + "learning_rate": 9.638597310356251e-06, + "loss": 1.0779, + "step": 3666 + }, + { + "epoch": 0.2963292187720964, + "grad_norm": 2.5405080318450928, + "learning_rate": 9.638353011522727e-06, + "loss": 1.0123, + "step": 3667 + }, + { + "epoch": 0.29641002848542397, + "grad_norm": 3.033236265182495, + "learning_rate": 9.638108633245382e-06, + "loss": 0.9378, + "step": 3668 + }, + { + "epoch": 0.2964908381987515, + "grad_norm": 2.6469950675964355, + "learning_rate": 9.637864175528403e-06, + "loss": 1.0793, + "step": 3669 + }, + { + "epoch": 0.296571647912079, + "grad_norm": 3.0484907627105713, + "learning_rate": 9.637619638375975e-06, + "loss": 1.0042, + "step": 3670 + }, + { + "epoch": 0.2966524576254066, + "grad_norm": 2.8458609580993652, + "learning_rate": 9.637375021792288e-06, + "loss": 0.985, + "step": 3671 + }, + { + "epoch": 0.2967332673387341, + "grad_norm": 2.721964120864868, + "learning_rate": 9.63713032578153e-06, + "loss": 1.0156, + "step": 3672 + }, + { + "epoch": 0.29681407705206164, + "grad_norm": 2.86923885345459, + "learning_rate": 9.636885550347892e-06, + "loss": 0.931, + "step": 3673 + }, + { + "epoch": 0.2968948867653892, + "grad_norm": 2.955361843109131, + "learning_rate": 9.63664069549557e-06, + "loss": 1.0084, + "step": 3674 + }, + { + "epoch": 0.29697569647871674, + "grad_norm": 2.169773817062378, + "learning_rate": 9.636395761228753e-06, + "loss": 0.8846, + "step": 3675 + }, + { + "epoch": 0.29705650619204427, + "grad_norm": 3.340684652328491, + "learning_rate": 9.636150747551637e-06, + "loss": 0.8681, + "step": 3676 + }, + { + "epoch": 0.29713731590537185, + "grad_norm": 2.6499557495117188, + "learning_rate": 9.635905654468424e-06, + "loss": 0.9807, + "step": 3677 + }, + { + "epoch": 0.29721812561869937, + "grad_norm": 2.8025360107421875, + "learning_rate": 9.635660481983304e-06, + "loss": 1.0047, + "step": 3678 + }, + { + "epoch": 0.2972989353320269, + "grad_norm": 2.656595468521118, + "learning_rate": 9.635415230100481e-06, + "loss": 0.9095, + "step": 3679 + }, + { + "epoch": 0.29737974504535447, + "grad_norm": 3.083083391189575, + "learning_rate": 9.635169898824156e-06, + "loss": 1.1252, + "step": 3680 + }, + { + "epoch": 0.297460554758682, + "grad_norm": 2.9544124603271484, + "learning_rate": 9.634924488158529e-06, + "loss": 0.949, + "step": 3681 + }, + { + "epoch": 0.2975413644720095, + "grad_norm": 2.7282896041870117, + "learning_rate": 9.634678998107802e-06, + "loss": 0.9059, + "step": 3682 + }, + { + "epoch": 0.2976221741853371, + "grad_norm": 2.920116424560547, + "learning_rate": 9.634433428676182e-06, + "loss": 0.9557, + "step": 3683 + }, + { + "epoch": 0.2977029838986646, + "grad_norm": 2.758039712905884, + "learning_rate": 9.634187779867874e-06, + "loss": 0.9961, + "step": 3684 + }, + { + "epoch": 0.29778379361199214, + "grad_norm": 2.424414873123169, + "learning_rate": 9.633942051687086e-06, + "loss": 1.0828, + "step": 3685 + }, + { + "epoch": 0.2978646033253197, + "grad_norm": 2.638777017593384, + "learning_rate": 9.633696244138026e-06, + "loss": 0.9923, + "step": 3686 + }, + { + "epoch": 0.29794541303864724, + "grad_norm": 2.3497443199157715, + "learning_rate": 9.633450357224905e-06, + "loss": 0.9937, + "step": 3687 + }, + { + "epoch": 0.29802622275197477, + "grad_norm": 2.947047472000122, + "learning_rate": 9.633204390951933e-06, + "loss": 1.0171, + "step": 3688 + }, + { + "epoch": 0.29810703246530235, + "grad_norm": 2.8731634616851807, + "learning_rate": 9.632958345323324e-06, + "loss": 0.9742, + "step": 3689 + }, + { + "epoch": 0.29818784217862987, + "grad_norm": 2.6260106563568115, + "learning_rate": 9.632712220343293e-06, + "loss": 1.0433, + "step": 3690 + }, + { + "epoch": 0.2982686518919574, + "grad_norm": 2.646087169647217, + "learning_rate": 9.632466016016055e-06, + "loss": 1.0002, + "step": 3691 + }, + { + "epoch": 0.29834946160528497, + "grad_norm": 2.8434994220733643, + "learning_rate": 9.632219732345824e-06, + "loss": 1.0329, + "step": 3692 + }, + { + "epoch": 0.2984302713186125, + "grad_norm": 2.7054171562194824, + "learning_rate": 9.631973369336822e-06, + "loss": 0.985, + "step": 3693 + }, + { + "epoch": 0.29851108103194, + "grad_norm": 2.8496406078338623, + "learning_rate": 9.631726926993268e-06, + "loss": 0.9409, + "step": 3694 + }, + { + "epoch": 0.2985918907452676, + "grad_norm": 2.6724507808685303, + "learning_rate": 9.631480405319381e-06, + "loss": 1.023, + "step": 3695 + }, + { + "epoch": 0.2986727004585951, + "grad_norm": 2.8490123748779297, + "learning_rate": 9.631233804319384e-06, + "loss": 1.1671, + "step": 3696 + }, + { + "epoch": 0.29875351017192264, + "grad_norm": 2.7283239364624023, + "learning_rate": 9.630987123997503e-06, + "loss": 1.0061, + "step": 3697 + }, + { + "epoch": 0.2988343198852502, + "grad_norm": 2.5398035049438477, + "learning_rate": 9.63074036435796e-06, + "loss": 0.9462, + "step": 3698 + }, + { + "epoch": 0.29891512959857774, + "grad_norm": 2.722632884979248, + "learning_rate": 9.630493525404982e-06, + "loss": 1.0019, + "step": 3699 + }, + { + "epoch": 0.29899593931190527, + "grad_norm": 2.939347743988037, + "learning_rate": 9.630246607142799e-06, + "loss": 1.0087, + "step": 3700 + }, + { + "epoch": 0.29907674902523285, + "grad_norm": 2.5201287269592285, + "learning_rate": 9.629999609575638e-06, + "loss": 1.0367, + "step": 3701 + }, + { + "epoch": 0.29915755873856037, + "grad_norm": 2.288954019546509, + "learning_rate": 9.629752532707729e-06, + "loss": 0.9458, + "step": 3702 + }, + { + "epoch": 0.2992383684518879, + "grad_norm": 2.856987476348877, + "learning_rate": 9.629505376543306e-06, + "loss": 0.9537, + "step": 3703 + }, + { + "epoch": 0.29931917816521547, + "grad_norm": 3.162282705307007, + "learning_rate": 9.6292581410866e-06, + "loss": 0.9306, + "step": 3704 + }, + { + "epoch": 0.299399987878543, + "grad_norm": 2.8928518295288086, + "learning_rate": 9.629010826341846e-06, + "loss": 1.0406, + "step": 3705 + }, + { + "epoch": 0.2994807975918705, + "grad_norm": 2.788278818130493, + "learning_rate": 9.628763432313282e-06, + "loss": 0.9235, + "step": 3706 + }, + { + "epoch": 0.2995616073051981, + "grad_norm": 2.2690505981445312, + "learning_rate": 9.628515959005142e-06, + "loss": 0.9959, + "step": 3707 + }, + { + "epoch": 0.2996424170185256, + "grad_norm": 2.923549175262451, + "learning_rate": 9.628268406421668e-06, + "loss": 0.9595, + "step": 3708 + }, + { + "epoch": 0.29972322673185314, + "grad_norm": 2.489567279815674, + "learning_rate": 9.628020774567098e-06, + "loss": 1.0296, + "step": 3709 + }, + { + "epoch": 0.2998040364451807, + "grad_norm": 3.2813069820404053, + "learning_rate": 9.627773063445674e-06, + "loss": 0.9688, + "step": 3710 + }, + { + "epoch": 0.29988484615850824, + "grad_norm": 2.8532261848449707, + "learning_rate": 9.627525273061637e-06, + "loss": 1.0838, + "step": 3711 + }, + { + "epoch": 0.29996565587183577, + "grad_norm": 2.5741493701934814, + "learning_rate": 9.627277403419233e-06, + "loss": 1.0229, + "step": 3712 + }, + { + "epoch": 0.30004646558516335, + "grad_norm": 3.150580406188965, + "learning_rate": 9.627029454522706e-06, + "loss": 1.0542, + "step": 3713 + }, + { + "epoch": 0.30012727529849087, + "grad_norm": 2.7101001739501953, + "learning_rate": 9.626781426376305e-06, + "loss": 0.8599, + "step": 3714 + }, + { + "epoch": 0.3002080850118184, + "grad_norm": 3.4060416221618652, + "learning_rate": 9.626533318984275e-06, + "loss": 0.8764, + "step": 3715 + }, + { + "epoch": 0.30028889472514597, + "grad_norm": 3.1740713119506836, + "learning_rate": 9.62628513235087e-06, + "loss": 1.0542, + "step": 3716 + }, + { + "epoch": 0.3003697044384735, + "grad_norm": 2.790522336959839, + "learning_rate": 9.626036866480335e-06, + "loss": 1.0059, + "step": 3717 + }, + { + "epoch": 0.3004505141518011, + "grad_norm": 2.853146553039551, + "learning_rate": 9.625788521376927e-06, + "loss": 0.9635, + "step": 3718 + }, + { + "epoch": 0.3005313238651286, + "grad_norm": 2.8684794902801514, + "learning_rate": 9.625540097044896e-06, + "loss": 1.0199, + "step": 3719 + }, + { + "epoch": 0.3006121335784561, + "grad_norm": 2.624752998352051, + "learning_rate": 9.625291593488501e-06, + "loss": 0.9774, + "step": 3720 + }, + { + "epoch": 0.3006929432917837, + "grad_norm": 2.7984793186187744, + "learning_rate": 9.625043010711995e-06, + "loss": 0.9907, + "step": 3721 + }, + { + "epoch": 0.3007737530051112, + "grad_norm": 2.6331899166107178, + "learning_rate": 9.624794348719636e-06, + "loss": 1.0337, + "step": 3722 + }, + { + "epoch": 0.30085456271843874, + "grad_norm": 2.8247597217559814, + "learning_rate": 9.624545607515685e-06, + "loss": 0.8952, + "step": 3723 + }, + { + "epoch": 0.3009353724317663, + "grad_norm": 2.4679906368255615, + "learning_rate": 9.624296787104398e-06, + "loss": 1.1373, + "step": 3724 + }, + { + "epoch": 0.30101618214509385, + "grad_norm": 2.802913188934326, + "learning_rate": 9.624047887490043e-06, + "loss": 1.0895, + "step": 3725 + }, + { + "epoch": 0.30109699185842137, + "grad_norm": 3.0468852519989014, + "learning_rate": 9.623798908676877e-06, + "loss": 1.0068, + "step": 3726 + }, + { + "epoch": 0.30117780157174895, + "grad_norm": 2.6949493885040283, + "learning_rate": 9.623549850669168e-06, + "loss": 0.9805, + "step": 3727 + }, + { + "epoch": 0.30125861128507647, + "grad_norm": 2.809337615966797, + "learning_rate": 9.623300713471181e-06, + "loss": 1.0349, + "step": 3728 + }, + { + "epoch": 0.301339420998404, + "grad_norm": 2.325697898864746, + "learning_rate": 9.623051497087183e-06, + "loss": 1.058, + "step": 3729 + }, + { + "epoch": 0.3014202307117316, + "grad_norm": 2.797555685043335, + "learning_rate": 9.622802201521441e-06, + "loss": 0.8398, + "step": 3730 + }, + { + "epoch": 0.3015010404250591, + "grad_norm": 3.0890145301818848, + "learning_rate": 9.622552826778228e-06, + "loss": 1.0047, + "step": 3731 + }, + { + "epoch": 0.3015818501383866, + "grad_norm": 2.638610363006592, + "learning_rate": 9.622303372861812e-06, + "loss": 0.9964, + "step": 3732 + }, + { + "epoch": 0.3016626598517142, + "grad_norm": 2.8396639823913574, + "learning_rate": 9.622053839776469e-06, + "loss": 0.972, + "step": 3733 + }, + { + "epoch": 0.3017434695650417, + "grad_norm": 3.0043957233428955, + "learning_rate": 9.62180422752647e-06, + "loss": 0.9412, + "step": 3734 + }, + { + "epoch": 0.30182427927836925, + "grad_norm": 3.0119330883026123, + "learning_rate": 9.62155453611609e-06, + "loss": 0.9555, + "step": 3735 + }, + { + "epoch": 0.3019050889916968, + "grad_norm": 2.6366915702819824, + "learning_rate": 9.621304765549607e-06, + "loss": 0.9421, + "step": 3736 + }, + { + "epoch": 0.30198589870502435, + "grad_norm": 2.7750697135925293, + "learning_rate": 9.621054915831299e-06, + "loss": 0.8742, + "step": 3737 + }, + { + "epoch": 0.30206670841835187, + "grad_norm": 2.836611032485962, + "learning_rate": 9.620804986965447e-06, + "loss": 0.9479, + "step": 3738 + }, + { + "epoch": 0.30214751813167945, + "grad_norm": 2.514456033706665, + "learning_rate": 9.620554978956326e-06, + "loss": 0.9826, + "step": 3739 + }, + { + "epoch": 0.30222832784500697, + "grad_norm": 2.808971643447876, + "learning_rate": 9.620304891808225e-06, + "loss": 0.9684, + "step": 3740 + }, + { + "epoch": 0.3023091375583345, + "grad_norm": 2.664354085922241, + "learning_rate": 9.620054725525423e-06, + "loss": 0.9621, + "step": 3741 + }, + { + "epoch": 0.3023899472716621, + "grad_norm": 3.2560720443725586, + "learning_rate": 9.619804480112205e-06, + "loss": 0.9162, + "step": 3742 + }, + { + "epoch": 0.3024707569849896, + "grad_norm": 2.7616066932678223, + "learning_rate": 9.619554155572859e-06, + "loss": 1.0174, + "step": 3743 + }, + { + "epoch": 0.3025515666983171, + "grad_norm": 2.6008918285369873, + "learning_rate": 9.61930375191167e-06, + "loss": 0.9481, + "step": 3744 + }, + { + "epoch": 0.3026323764116447, + "grad_norm": 2.4243476390838623, + "learning_rate": 9.61905326913293e-06, + "loss": 1.0277, + "step": 3745 + }, + { + "epoch": 0.3027131861249722, + "grad_norm": 3.0201876163482666, + "learning_rate": 9.618802707240926e-06, + "loss": 1.0801, + "step": 3746 + }, + { + "epoch": 0.30279399583829975, + "grad_norm": 2.391294240951538, + "learning_rate": 9.618552066239952e-06, + "loss": 0.9288, + "step": 3747 + }, + { + "epoch": 0.3028748055516273, + "grad_norm": 2.787870407104492, + "learning_rate": 9.6183013461343e-06, + "loss": 1.0832, + "step": 3748 + }, + { + "epoch": 0.30295561526495485, + "grad_norm": 2.917569637298584, + "learning_rate": 9.618050546928265e-06, + "loss": 1.1066, + "step": 3749 + }, + { + "epoch": 0.30303642497828237, + "grad_norm": 2.6563382148742676, + "learning_rate": 9.617799668626138e-06, + "loss": 1.0291, + "step": 3750 + }, + { + "epoch": 0.30311723469160995, + "grad_norm": 2.652367115020752, + "learning_rate": 9.617548711232223e-06, + "loss": 1.1087, + "step": 3751 + }, + { + "epoch": 0.3031980444049375, + "grad_norm": 2.404257297515869, + "learning_rate": 9.617297674750813e-06, + "loss": 1.0194, + "step": 3752 + }, + { + "epoch": 0.303278854118265, + "grad_norm": 2.656034469604492, + "learning_rate": 9.617046559186209e-06, + "loss": 1.1243, + "step": 3753 + }, + { + "epoch": 0.3033596638315926, + "grad_norm": 2.990955114364624, + "learning_rate": 9.616795364542715e-06, + "loss": 0.9551, + "step": 3754 + }, + { + "epoch": 0.3034404735449201, + "grad_norm": 2.93247127532959, + "learning_rate": 9.61654409082463e-06, + "loss": 1.0054, + "step": 3755 + }, + { + "epoch": 0.3035212832582476, + "grad_norm": 2.9435315132141113, + "learning_rate": 9.616292738036258e-06, + "loss": 0.9341, + "step": 3756 + }, + { + "epoch": 0.3036020929715752, + "grad_norm": 3.014838695526123, + "learning_rate": 9.616041306181905e-06, + "loss": 0.8704, + "step": 3757 + }, + { + "epoch": 0.3036829026849027, + "grad_norm": 2.5602517127990723, + "learning_rate": 9.615789795265877e-06, + "loss": 0.981, + "step": 3758 + }, + { + "epoch": 0.30376371239823025, + "grad_norm": 3.1778597831726074, + "learning_rate": 9.61553820529248e-06, + "loss": 1.0666, + "step": 3759 + }, + { + "epoch": 0.3038445221115578, + "grad_norm": 2.844055414199829, + "learning_rate": 9.615286536266028e-06, + "loss": 1.0498, + "step": 3760 + }, + { + "epoch": 0.30392533182488535, + "grad_norm": 2.639004707336426, + "learning_rate": 9.615034788190827e-06, + "loss": 1.0543, + "step": 3761 + }, + { + "epoch": 0.30400614153821287, + "grad_norm": 2.979391098022461, + "learning_rate": 9.61478296107119e-06, + "loss": 1.0619, + "step": 3762 + }, + { + "epoch": 0.30408695125154045, + "grad_norm": 2.64461088180542, + "learning_rate": 9.61453105491143e-06, + "loss": 0.889, + "step": 3763 + }, + { + "epoch": 0.304167760964868, + "grad_norm": 2.8508715629577637, + "learning_rate": 9.614279069715865e-06, + "loss": 0.9243, + "step": 3764 + }, + { + "epoch": 0.3042485706781955, + "grad_norm": 2.7899696826934814, + "learning_rate": 9.614027005488806e-06, + "loss": 1.0311, + "step": 3765 + }, + { + "epoch": 0.3043293803915231, + "grad_norm": 2.9851186275482178, + "learning_rate": 9.613774862234573e-06, + "loss": 1.039, + "step": 3766 + }, + { + "epoch": 0.3044101901048506, + "grad_norm": 2.874389171600342, + "learning_rate": 9.613522639957482e-06, + "loss": 1.0424, + "step": 3767 + }, + { + "epoch": 0.3044909998181781, + "grad_norm": 2.706479072570801, + "learning_rate": 9.613270338661856e-06, + "loss": 1.0294, + "step": 3768 + }, + { + "epoch": 0.3045718095315057, + "grad_norm": 2.365277051925659, + "learning_rate": 9.613017958352015e-06, + "loss": 0.9595, + "step": 3769 + }, + { + "epoch": 0.3046526192448332, + "grad_norm": 3.055814266204834, + "learning_rate": 9.612765499032281e-06, + "loss": 1.0066, + "step": 3770 + }, + { + "epoch": 0.30473342895816075, + "grad_norm": 2.9857189655303955, + "learning_rate": 9.61251296070698e-06, + "loss": 0.9575, + "step": 3771 + }, + { + "epoch": 0.3048142386714883, + "grad_norm": 2.737423896789551, + "learning_rate": 9.612260343380438e-06, + "loss": 0.9298, + "step": 3772 + }, + { + "epoch": 0.30489504838481585, + "grad_norm": 2.642305850982666, + "learning_rate": 9.612007647056976e-06, + "loss": 0.9823, + "step": 3773 + }, + { + "epoch": 0.30497585809814337, + "grad_norm": 2.365912914276123, + "learning_rate": 9.611754871740928e-06, + "loss": 1.0307, + "step": 3774 + }, + { + "epoch": 0.30505666781147095, + "grad_norm": 2.7018494606018066, + "learning_rate": 9.61150201743662e-06, + "loss": 0.901, + "step": 3775 + }, + { + "epoch": 0.3051374775247985, + "grad_norm": 2.776804208755493, + "learning_rate": 9.611249084148386e-06, + "loss": 1.0134, + "step": 3776 + }, + { + "epoch": 0.305218287238126, + "grad_norm": 2.5004022121429443, + "learning_rate": 9.610996071880557e-06, + "loss": 0.9594, + "step": 3777 + }, + { + "epoch": 0.3052990969514536, + "grad_norm": 3.125434637069702, + "learning_rate": 9.610742980637462e-06, + "loss": 1.0563, + "step": 3778 + }, + { + "epoch": 0.3053799066647811, + "grad_norm": 2.4795141220092773, + "learning_rate": 9.610489810423442e-06, + "loss": 0.984, + "step": 3779 + }, + { + "epoch": 0.3054607163781086, + "grad_norm": 3.0191361904144287, + "learning_rate": 9.610236561242832e-06, + "loss": 0.9131, + "step": 3780 + }, + { + "epoch": 0.3055415260914362, + "grad_norm": 2.7647979259490967, + "learning_rate": 9.609983233099967e-06, + "loss": 0.8676, + "step": 3781 + }, + { + "epoch": 0.3056223358047637, + "grad_norm": 2.794858455657959, + "learning_rate": 9.609729825999188e-06, + "loss": 1.0217, + "step": 3782 + }, + { + "epoch": 0.3057031455180913, + "grad_norm": 2.7970950603485107, + "learning_rate": 9.609476339944833e-06, + "loss": 1.1258, + "step": 3783 + }, + { + "epoch": 0.3057839552314188, + "grad_norm": 3.3518712520599365, + "learning_rate": 9.609222774941248e-06, + "loss": 0.9881, + "step": 3784 + }, + { + "epoch": 0.30586476494474635, + "grad_norm": 2.4890379905700684, + "learning_rate": 9.608969130992769e-06, + "loss": 1.1094, + "step": 3785 + }, + { + "epoch": 0.3059455746580739, + "grad_norm": 3.038357973098755, + "learning_rate": 9.608715408103748e-06, + "loss": 1.0204, + "step": 3786 + }, + { + "epoch": 0.30602638437140145, + "grad_norm": 3.140078067779541, + "learning_rate": 9.608461606278526e-06, + "loss": 0.8341, + "step": 3787 + }, + { + "epoch": 0.306107194084729, + "grad_norm": 2.7476046085357666, + "learning_rate": 9.60820772552145e-06, + "loss": 0.9423, + "step": 3788 + }, + { + "epoch": 0.30618800379805655, + "grad_norm": 2.845513105392456, + "learning_rate": 9.60795376583687e-06, + "loss": 1.1315, + "step": 3789 + }, + { + "epoch": 0.3062688135113841, + "grad_norm": 2.850813388824463, + "learning_rate": 9.607699727229136e-06, + "loss": 1.0409, + "step": 3790 + }, + { + "epoch": 0.3063496232247116, + "grad_norm": 2.919067144393921, + "learning_rate": 9.607445609702598e-06, + "loss": 0.881, + "step": 3791 + }, + { + "epoch": 0.3064304329380392, + "grad_norm": 2.876399517059326, + "learning_rate": 9.607191413261609e-06, + "loss": 0.9883, + "step": 3792 + }, + { + "epoch": 0.3065112426513667, + "grad_norm": 2.606030225753784, + "learning_rate": 9.606937137910522e-06, + "loss": 0.9093, + "step": 3793 + }, + { + "epoch": 0.3065920523646942, + "grad_norm": 2.95466947555542, + "learning_rate": 9.606682783653692e-06, + "loss": 1.09, + "step": 3794 + }, + { + "epoch": 0.3066728620780218, + "grad_norm": 2.9630496501922607, + "learning_rate": 9.606428350495476e-06, + "loss": 1.0455, + "step": 3795 + }, + { + "epoch": 0.3067536717913493, + "grad_norm": 2.888209104537964, + "learning_rate": 9.606173838440234e-06, + "loss": 0.9933, + "step": 3796 + }, + { + "epoch": 0.30683448150467685, + "grad_norm": 2.905669927597046, + "learning_rate": 9.605919247492322e-06, + "loss": 1.0449, + "step": 3797 + }, + { + "epoch": 0.3069152912180044, + "grad_norm": 2.7727410793304443, + "learning_rate": 9.605664577656099e-06, + "loss": 0.9082, + "step": 3798 + }, + { + "epoch": 0.30699610093133195, + "grad_norm": 2.158062696456909, + "learning_rate": 9.605409828935932e-06, + "loss": 1.0205, + "step": 3799 + }, + { + "epoch": 0.3070769106446595, + "grad_norm": 2.29494047164917, + "learning_rate": 9.605155001336182e-06, + "loss": 1.0388, + "step": 3800 + }, + { + "epoch": 0.30715772035798705, + "grad_norm": 2.8616902828216553, + "learning_rate": 9.604900094861212e-06, + "loss": 0.9782, + "step": 3801 + }, + { + "epoch": 0.3072385300713146, + "grad_norm": 2.527134895324707, + "learning_rate": 9.60464510951539e-06, + "loss": 1.0172, + "step": 3802 + }, + { + "epoch": 0.3073193397846421, + "grad_norm": 2.363241195678711, + "learning_rate": 9.604390045303083e-06, + "loss": 1.1337, + "step": 3803 + }, + { + "epoch": 0.3074001494979697, + "grad_norm": 2.7523727416992188, + "learning_rate": 9.604134902228658e-06, + "loss": 0.8518, + "step": 3804 + }, + { + "epoch": 0.3074809592112972, + "grad_norm": 2.749645709991455, + "learning_rate": 9.603879680296486e-06, + "loss": 1.0404, + "step": 3805 + }, + { + "epoch": 0.3075617689246247, + "grad_norm": 3.2840802669525146, + "learning_rate": 9.603624379510938e-06, + "loss": 1.1761, + "step": 3806 + }, + { + "epoch": 0.3076425786379523, + "grad_norm": 2.6806998252868652, + "learning_rate": 9.60336899987639e-06, + "loss": 1.004, + "step": 3807 + }, + { + "epoch": 0.3077233883512798, + "grad_norm": 2.5842466354370117, + "learning_rate": 9.60311354139721e-06, + "loss": 0.8935, + "step": 3808 + }, + { + "epoch": 0.30780419806460735, + "grad_norm": 2.7936301231384277, + "learning_rate": 9.602858004077778e-06, + "loss": 0.9205, + "step": 3809 + }, + { + "epoch": 0.3078850077779349, + "grad_norm": 2.390162944793701, + "learning_rate": 9.602602387922471e-06, + "loss": 1.1122, + "step": 3810 + }, + { + "epoch": 0.30796581749126245, + "grad_norm": 2.725221633911133, + "learning_rate": 9.602346692935662e-06, + "loss": 1.0698, + "step": 3811 + }, + { + "epoch": 0.30804662720459, + "grad_norm": 3.072129964828491, + "learning_rate": 9.602090919121736e-06, + "loss": 0.8672, + "step": 3812 + }, + { + "epoch": 0.30812743691791755, + "grad_norm": 3.481904983520508, + "learning_rate": 9.60183506648507e-06, + "loss": 0.9182, + "step": 3813 + }, + { + "epoch": 0.3082082466312451, + "grad_norm": 2.9256131649017334, + "learning_rate": 9.601579135030051e-06, + "loss": 0.9397, + "step": 3814 + }, + { + "epoch": 0.3082890563445726, + "grad_norm": 2.864955425262451, + "learning_rate": 9.601323124761057e-06, + "loss": 0.9037, + "step": 3815 + }, + { + "epoch": 0.3083698660579002, + "grad_norm": 2.587109088897705, + "learning_rate": 9.601067035682474e-06, + "loss": 0.9215, + "step": 3816 + }, + { + "epoch": 0.3084506757712277, + "grad_norm": 3.277244806289673, + "learning_rate": 9.60081086779869e-06, + "loss": 0.9555, + "step": 3817 + }, + { + "epoch": 0.3085314854845552, + "grad_norm": 2.660778760910034, + "learning_rate": 9.600554621114093e-06, + "loss": 0.9269, + "step": 3818 + }, + { + "epoch": 0.3086122951978828, + "grad_norm": 2.7236886024475098, + "learning_rate": 9.60029829563307e-06, + "loss": 0.9754, + "step": 3819 + }, + { + "epoch": 0.3086931049112103, + "grad_norm": 2.7859270572662354, + "learning_rate": 9.600041891360013e-06, + "loss": 0.9567, + "step": 3820 + }, + { + "epoch": 0.30877391462453785, + "grad_norm": 2.2497758865356445, + "learning_rate": 9.599785408299311e-06, + "loss": 0.8751, + "step": 3821 + }, + { + "epoch": 0.3088547243378654, + "grad_norm": 2.236384630203247, + "learning_rate": 9.599528846455359e-06, + "loss": 1.0906, + "step": 3822 + }, + { + "epoch": 0.30893553405119295, + "grad_norm": 2.7493271827697754, + "learning_rate": 9.599272205832553e-06, + "loss": 1.0709, + "step": 3823 + }, + { + "epoch": 0.3090163437645205, + "grad_norm": 2.8007590770721436, + "learning_rate": 9.599015486435284e-06, + "loss": 1.0544, + "step": 3824 + }, + { + "epoch": 0.30909715347784805, + "grad_norm": 3.1281704902648926, + "learning_rate": 9.59875868826795e-06, + "loss": 0.9405, + "step": 3825 + }, + { + "epoch": 0.3091779631911756, + "grad_norm": 2.5548365116119385, + "learning_rate": 9.598501811334955e-06, + "loss": 0.8974, + "step": 3826 + }, + { + "epoch": 0.3092587729045031, + "grad_norm": 2.692622661590576, + "learning_rate": 9.59824485564069e-06, + "loss": 1.0373, + "step": 3827 + }, + { + "epoch": 0.3093395826178307, + "grad_norm": 3.0443758964538574, + "learning_rate": 9.597987821189563e-06, + "loss": 0.9423, + "step": 3828 + }, + { + "epoch": 0.3094203923311582, + "grad_norm": 2.748542070388794, + "learning_rate": 9.597730707985972e-06, + "loss": 0.9463, + "step": 3829 + }, + { + "epoch": 0.3095012020444857, + "grad_norm": 2.822598695755005, + "learning_rate": 9.597473516034325e-06, + "loss": 1.0552, + "step": 3830 + }, + { + "epoch": 0.3095820117578133, + "grad_norm": 2.4813973903656006, + "learning_rate": 9.597216245339023e-06, + "loss": 1.0568, + "step": 3831 + }, + { + "epoch": 0.3096628214711408, + "grad_norm": 2.863154172897339, + "learning_rate": 9.596958895904475e-06, + "loss": 1.1406, + "step": 3832 + }, + { + "epoch": 0.30974363118446835, + "grad_norm": 3.0755743980407715, + "learning_rate": 9.596701467735087e-06, + "loss": 1.0006, + "step": 3833 + }, + { + "epoch": 0.3098244408977959, + "grad_norm": 2.7633132934570312, + "learning_rate": 9.596443960835269e-06, + "loss": 0.9465, + "step": 3834 + }, + { + "epoch": 0.30990525061112345, + "grad_norm": 2.625476598739624, + "learning_rate": 9.59618637520943e-06, + "loss": 0.8874, + "step": 3835 + }, + { + "epoch": 0.309986060324451, + "grad_norm": 2.920628309249878, + "learning_rate": 9.595928710861987e-06, + "loss": 0.9775, + "step": 3836 + }, + { + "epoch": 0.31006687003777855, + "grad_norm": 2.6560709476470947, + "learning_rate": 9.595670967797347e-06, + "loss": 0.9767, + "step": 3837 + }, + { + "epoch": 0.3101476797511061, + "grad_norm": 2.8147332668304443, + "learning_rate": 9.595413146019927e-06, + "loss": 0.9863, + "step": 3838 + }, + { + "epoch": 0.3102284894644336, + "grad_norm": 2.8549447059631348, + "learning_rate": 9.595155245534143e-06, + "loss": 0.8773, + "step": 3839 + }, + { + "epoch": 0.3103092991777612, + "grad_norm": 3.3137104511260986, + "learning_rate": 9.594897266344411e-06, + "loss": 1.023, + "step": 3840 + }, + { + "epoch": 0.3103901088910887, + "grad_norm": 2.49528431892395, + "learning_rate": 9.594639208455154e-06, + "loss": 0.9012, + "step": 3841 + }, + { + "epoch": 0.3104709186044162, + "grad_norm": 3.0016326904296875, + "learning_rate": 9.594381071870785e-06, + "loss": 1.0405, + "step": 3842 + }, + { + "epoch": 0.3105517283177438, + "grad_norm": 2.6793394088745117, + "learning_rate": 9.594122856595731e-06, + "loss": 0.9916, + "step": 3843 + }, + { + "epoch": 0.3106325380310713, + "grad_norm": 2.6459403038024902, + "learning_rate": 9.593864562634411e-06, + "loss": 0.993, + "step": 3844 + }, + { + "epoch": 0.3107133477443989, + "grad_norm": 3.2174904346466064, + "learning_rate": 9.59360618999125e-06, + "loss": 0.9395, + "step": 3845 + }, + { + "epoch": 0.3107941574577264, + "grad_norm": 2.563730001449585, + "learning_rate": 9.593347738670676e-06, + "loss": 0.9565, + "step": 3846 + }, + { + "epoch": 0.31087496717105395, + "grad_norm": 2.956404209136963, + "learning_rate": 9.593089208677112e-06, + "loss": 0.9685, + "step": 3847 + }, + { + "epoch": 0.31095577688438153, + "grad_norm": 2.6325273513793945, + "learning_rate": 9.592830600014985e-06, + "loss": 0.8934, + "step": 3848 + }, + { + "epoch": 0.31103658659770905, + "grad_norm": 2.8246214389801025, + "learning_rate": 9.592571912688728e-06, + "loss": 0.9795, + "step": 3849 + }, + { + "epoch": 0.3111173963110366, + "grad_norm": 2.808764934539795, + "learning_rate": 9.592313146702773e-06, + "loss": 1.0248, + "step": 3850 + }, + { + "epoch": 0.31119820602436415, + "grad_norm": 2.5618157386779785, + "learning_rate": 9.592054302061546e-06, + "loss": 1.0312, + "step": 3851 + }, + { + "epoch": 0.3112790157376917, + "grad_norm": 2.673449993133545, + "learning_rate": 9.591795378769485e-06, + "loss": 0.8822, + "step": 3852 + }, + { + "epoch": 0.3113598254510192, + "grad_norm": 2.4913876056671143, + "learning_rate": 9.591536376831023e-06, + "loss": 0.9974, + "step": 3853 + }, + { + "epoch": 0.3114406351643468, + "grad_norm": 2.7856106758117676, + "learning_rate": 9.591277296250596e-06, + "loss": 0.9446, + "step": 3854 + }, + { + "epoch": 0.3115214448776743, + "grad_norm": 3.2890031337738037, + "learning_rate": 9.591018137032642e-06, + "loss": 0.9611, + "step": 3855 + }, + { + "epoch": 0.3116022545910018, + "grad_norm": 2.4133715629577637, + "learning_rate": 9.590758899181601e-06, + "loss": 0.9354, + "step": 3856 + }, + { + "epoch": 0.3116830643043294, + "grad_norm": 3.1188809871673584, + "learning_rate": 9.590499582701913e-06, + "loss": 1.1055, + "step": 3857 + }, + { + "epoch": 0.31176387401765693, + "grad_norm": 2.61584210395813, + "learning_rate": 9.590240187598016e-06, + "loss": 0.9736, + "step": 3858 + }, + { + "epoch": 0.31184468373098445, + "grad_norm": 2.536604166030884, + "learning_rate": 9.589980713874357e-06, + "loss": 1.044, + "step": 3859 + }, + { + "epoch": 0.31192549344431203, + "grad_norm": 2.9626662731170654, + "learning_rate": 9.589721161535375e-06, + "loss": 0.8921, + "step": 3860 + }, + { + "epoch": 0.31200630315763955, + "grad_norm": 3.1650912761688232, + "learning_rate": 9.589461530585523e-06, + "loss": 1.1349, + "step": 3861 + }, + { + "epoch": 0.3120871128709671, + "grad_norm": 2.8416025638580322, + "learning_rate": 9.58920182102924e-06, + "loss": 0.8835, + "step": 3862 + }, + { + "epoch": 0.31216792258429465, + "grad_norm": 3.3654446601867676, + "learning_rate": 9.58894203287098e-06, + "loss": 0.9788, + "step": 3863 + }, + { + "epoch": 0.3122487322976222, + "grad_norm": 2.6304450035095215, + "learning_rate": 9.58868216611519e-06, + "loss": 0.974, + "step": 3864 + }, + { + "epoch": 0.3123295420109497, + "grad_norm": 2.8896827697753906, + "learning_rate": 9.58842222076632e-06, + "loss": 0.9549, + "step": 3865 + }, + { + "epoch": 0.3124103517242773, + "grad_norm": 2.5072031021118164, + "learning_rate": 9.588162196828826e-06, + "loss": 0.9045, + "step": 3866 + }, + { + "epoch": 0.3124911614376048, + "grad_norm": 2.752204418182373, + "learning_rate": 9.587902094307158e-06, + "loss": 1.0282, + "step": 3867 + }, + { + "epoch": 0.3125719711509323, + "grad_norm": 2.5259454250335693, + "learning_rate": 9.58764191320577e-06, + "loss": 1.0006, + "step": 3868 + }, + { + "epoch": 0.3126527808642599, + "grad_norm": 2.7212178707122803, + "learning_rate": 9.587381653529123e-06, + "loss": 1.0307, + "step": 3869 + }, + { + "epoch": 0.31273359057758743, + "grad_norm": 3.102100133895874, + "learning_rate": 9.587121315281671e-06, + "loss": 0.9911, + "step": 3870 + }, + { + "epoch": 0.31281440029091495, + "grad_norm": 3.042722463607788, + "learning_rate": 9.586860898467875e-06, + "loss": 1.0282, + "step": 3871 + }, + { + "epoch": 0.31289521000424253, + "grad_norm": 3.0969510078430176, + "learning_rate": 9.586600403092192e-06, + "loss": 1.0182, + "step": 3872 + }, + { + "epoch": 0.31297601971757005, + "grad_norm": 2.870457172393799, + "learning_rate": 9.586339829159088e-06, + "loss": 0.9135, + "step": 3873 + }, + { + "epoch": 0.3130568294308976, + "grad_norm": 2.727113962173462, + "learning_rate": 9.586079176673021e-06, + "loss": 1.1281, + "step": 3874 + }, + { + "epoch": 0.31313763914422515, + "grad_norm": 2.603419065475464, + "learning_rate": 9.585818445638462e-06, + "loss": 0.822, + "step": 3875 + }, + { + "epoch": 0.3132184488575527, + "grad_norm": 2.8258495330810547, + "learning_rate": 9.585557636059872e-06, + "loss": 0.9585, + "step": 3876 + }, + { + "epoch": 0.3132992585708802, + "grad_norm": 2.852052688598633, + "learning_rate": 9.585296747941717e-06, + "loss": 1.1353, + "step": 3877 + }, + { + "epoch": 0.3133800682842078, + "grad_norm": 2.827730178833008, + "learning_rate": 9.58503578128847e-06, + "loss": 1.0242, + "step": 3878 + }, + { + "epoch": 0.3134608779975353, + "grad_norm": 3.0042948722839355, + "learning_rate": 9.584774736104597e-06, + "loss": 1.1395, + "step": 3879 + }, + { + "epoch": 0.3135416877108628, + "grad_norm": 2.697857618331909, + "learning_rate": 9.584513612394568e-06, + "loss": 1.045, + "step": 3880 + }, + { + "epoch": 0.3136224974241904, + "grad_norm": 2.605607748031616, + "learning_rate": 9.58425241016286e-06, + "loss": 1.0711, + "step": 3881 + }, + { + "epoch": 0.31370330713751793, + "grad_norm": 2.865797281265259, + "learning_rate": 9.583991129413943e-06, + "loss": 1.1232, + "step": 3882 + }, + { + "epoch": 0.31378411685084545, + "grad_norm": 2.542365550994873, + "learning_rate": 9.583729770152295e-06, + "loss": 0.9739, + "step": 3883 + }, + { + "epoch": 0.31386492656417303, + "grad_norm": 2.798555850982666, + "learning_rate": 9.58346833238239e-06, + "loss": 0.9233, + "step": 3884 + }, + { + "epoch": 0.31394573627750055, + "grad_norm": 2.568464756011963, + "learning_rate": 9.583206816108706e-06, + "loss": 1.0478, + "step": 3885 + }, + { + "epoch": 0.3140265459908281, + "grad_norm": 2.5329174995422363, + "learning_rate": 9.582945221335724e-06, + "loss": 1.0034, + "step": 3886 + }, + { + "epoch": 0.31410735570415566, + "grad_norm": 3.2005858421325684, + "learning_rate": 9.582683548067924e-06, + "loss": 0.9865, + "step": 3887 + }, + { + "epoch": 0.3141881654174832, + "grad_norm": 2.885324001312256, + "learning_rate": 9.582421796309786e-06, + "loss": 1.0066, + "step": 3888 + }, + { + "epoch": 0.3142689751308107, + "grad_norm": 3.1991405487060547, + "learning_rate": 9.582159966065793e-06, + "loss": 1.0843, + "step": 3889 + }, + { + "epoch": 0.3143497848441383, + "grad_norm": 2.9450576305389404, + "learning_rate": 9.581898057340432e-06, + "loss": 1.0509, + "step": 3890 + }, + { + "epoch": 0.3144305945574658, + "grad_norm": 2.807943105697632, + "learning_rate": 9.581636070138189e-06, + "loss": 0.8537, + "step": 3891 + }, + { + "epoch": 0.3145114042707933, + "grad_norm": 2.774735450744629, + "learning_rate": 9.58137400446355e-06, + "loss": 1.0605, + "step": 3892 + }, + { + "epoch": 0.3145922139841209, + "grad_norm": 2.735790491104126, + "learning_rate": 9.581111860321002e-06, + "loss": 1.0951, + "step": 3893 + }, + { + "epoch": 0.31467302369744843, + "grad_norm": 2.945537567138672, + "learning_rate": 9.580849637715037e-06, + "loss": 0.9333, + "step": 3894 + }, + { + "epoch": 0.31475383341077595, + "grad_norm": 2.72914457321167, + "learning_rate": 9.580587336650147e-06, + "loss": 1.0557, + "step": 3895 + }, + { + "epoch": 0.31483464312410353, + "grad_norm": 3.2410850524902344, + "learning_rate": 9.580324957130823e-06, + "loss": 1.0085, + "step": 3896 + }, + { + "epoch": 0.31491545283743105, + "grad_norm": 3.114778757095337, + "learning_rate": 9.580062499161557e-06, + "loss": 1.0151, + "step": 3897 + }, + { + "epoch": 0.3149962625507586, + "grad_norm": 2.4615917205810547, + "learning_rate": 9.579799962746848e-06, + "loss": 0.974, + "step": 3898 + }, + { + "epoch": 0.31507707226408616, + "grad_norm": 2.4671497344970703, + "learning_rate": 9.57953734789119e-06, + "loss": 0.9907, + "step": 3899 + }, + { + "epoch": 0.3151578819774137, + "grad_norm": 2.4573681354522705, + "learning_rate": 9.579274654599082e-06, + "loss": 0.9598, + "step": 3900 + }, + { + "epoch": 0.3152386916907412, + "grad_norm": 2.579907178878784, + "learning_rate": 9.579011882875024e-06, + "loss": 0.9427, + "step": 3901 + }, + { + "epoch": 0.3153195014040688, + "grad_norm": 3.1489217281341553, + "learning_rate": 9.578749032723516e-06, + "loss": 0.9931, + "step": 3902 + }, + { + "epoch": 0.3154003111173963, + "grad_norm": 3.128610849380493, + "learning_rate": 9.57848610414906e-06, + "loss": 0.9354, + "step": 3903 + }, + { + "epoch": 0.3154811208307238, + "grad_norm": 2.8404691219329834, + "learning_rate": 9.57822309715616e-06, + "loss": 0.908, + "step": 3904 + }, + { + "epoch": 0.3155619305440514, + "grad_norm": 2.4756977558135986, + "learning_rate": 9.577960011749319e-06, + "loss": 1.0025, + "step": 3905 + }, + { + "epoch": 0.31564274025737893, + "grad_norm": 2.6812450885772705, + "learning_rate": 9.577696847933044e-06, + "loss": 0.9053, + "step": 3906 + }, + { + "epoch": 0.31572354997070645, + "grad_norm": 2.4922304153442383, + "learning_rate": 9.577433605711842e-06, + "loss": 0.999, + "step": 3907 + }, + { + "epoch": 0.31580435968403403, + "grad_norm": 2.8330764770507812, + "learning_rate": 9.577170285090223e-06, + "loss": 0.9833, + "step": 3908 + }, + { + "epoch": 0.31588516939736155, + "grad_norm": 2.674743890762329, + "learning_rate": 9.576906886072695e-06, + "loss": 0.9638, + "step": 3909 + }, + { + "epoch": 0.31596597911068913, + "grad_norm": 2.507943630218506, + "learning_rate": 9.576643408663771e-06, + "loss": 0.9634, + "step": 3910 + }, + { + "epoch": 0.31604678882401666, + "grad_norm": 2.446462869644165, + "learning_rate": 9.576379852867964e-06, + "loss": 1.045, + "step": 3911 + }, + { + "epoch": 0.3161275985373442, + "grad_norm": 2.408454179763794, + "learning_rate": 9.576116218689785e-06, + "loss": 1.0675, + "step": 3912 + }, + { + "epoch": 0.31620840825067176, + "grad_norm": 2.7893428802490234, + "learning_rate": 9.575852506133756e-06, + "loss": 0.9539, + "step": 3913 + }, + { + "epoch": 0.3162892179639993, + "grad_norm": 2.7513561248779297, + "learning_rate": 9.575588715204387e-06, + "loss": 0.9783, + "step": 3914 + }, + { + "epoch": 0.3163700276773268, + "grad_norm": 3.0103142261505127, + "learning_rate": 9.575324845906197e-06, + "loss": 0.9148, + "step": 3915 + }, + { + "epoch": 0.3164508373906544, + "grad_norm": 2.799314260482788, + "learning_rate": 9.575060898243709e-06, + "loss": 1.0869, + "step": 3916 + }, + { + "epoch": 0.3165316471039819, + "grad_norm": 2.6418752670288086, + "learning_rate": 9.574796872221441e-06, + "loss": 0.9344, + "step": 3917 + }, + { + "epoch": 0.31661245681730943, + "grad_norm": 2.895109176635742, + "learning_rate": 9.574532767843917e-06, + "loss": 0.9376, + "step": 3918 + }, + { + "epoch": 0.316693266530637, + "grad_norm": 2.6166465282440186, + "learning_rate": 9.574268585115659e-06, + "loss": 0.9595, + "step": 3919 + }, + { + "epoch": 0.31677407624396453, + "grad_norm": 2.743903875350952, + "learning_rate": 9.574004324041191e-06, + "loss": 1.0445, + "step": 3920 + }, + { + "epoch": 0.31685488595729205, + "grad_norm": 2.5423619747161865, + "learning_rate": 9.573739984625041e-06, + "loss": 1.0441, + "step": 3921 + }, + { + "epoch": 0.31693569567061963, + "grad_norm": 2.3177995681762695, + "learning_rate": 9.573475566871737e-06, + "loss": 1.0372, + "step": 3922 + }, + { + "epoch": 0.31701650538394716, + "grad_norm": 2.443727731704712, + "learning_rate": 9.573211070785807e-06, + "loss": 0.9468, + "step": 3923 + }, + { + "epoch": 0.3170973150972747, + "grad_norm": 2.727766990661621, + "learning_rate": 9.572946496371778e-06, + "loss": 0.9571, + "step": 3924 + }, + { + "epoch": 0.31717812481060226, + "grad_norm": 2.215959310531616, + "learning_rate": 9.572681843634187e-06, + "loss": 0.9412, + "step": 3925 + }, + { + "epoch": 0.3172589345239298, + "grad_norm": 2.926909923553467, + "learning_rate": 9.572417112577563e-06, + "loss": 1.1068, + "step": 3926 + }, + { + "epoch": 0.3173397442372573, + "grad_norm": 2.5570342540740967, + "learning_rate": 9.572152303206443e-06, + "loss": 0.9862, + "step": 3927 + }, + { + "epoch": 0.3174205539505849, + "grad_norm": 2.497966766357422, + "learning_rate": 9.57188741552536e-06, + "loss": 0.9851, + "step": 3928 + }, + { + "epoch": 0.3175013636639124, + "grad_norm": 3.5437493324279785, + "learning_rate": 9.571622449538852e-06, + "loss": 1.023, + "step": 3929 + }, + { + "epoch": 0.31758217337723993, + "grad_norm": 3.100724697113037, + "learning_rate": 9.571357405251459e-06, + "loss": 1.1043, + "step": 3930 + }, + { + "epoch": 0.3176629830905675, + "grad_norm": 2.3053576946258545, + "learning_rate": 9.571092282667716e-06, + "loss": 1.0503, + "step": 3931 + }, + { + "epoch": 0.31774379280389503, + "grad_norm": 2.488769054412842, + "learning_rate": 9.570827081792167e-06, + "loss": 1.0722, + "step": 3932 + }, + { + "epoch": 0.31782460251722255, + "grad_norm": 2.8296937942504883, + "learning_rate": 9.570561802629355e-06, + "loss": 1.0153, + "step": 3933 + }, + { + "epoch": 0.31790541223055013, + "grad_norm": 2.601266384124756, + "learning_rate": 9.570296445183822e-06, + "loss": 1.0516, + "step": 3934 + }, + { + "epoch": 0.31798622194387766, + "grad_norm": 2.7215843200683594, + "learning_rate": 9.570031009460114e-06, + "loss": 0.8979, + "step": 3935 + }, + { + "epoch": 0.3180670316572052, + "grad_norm": 2.969320297241211, + "learning_rate": 9.569765495462777e-06, + "loss": 0.993, + "step": 3936 + }, + { + "epoch": 0.31814784137053276, + "grad_norm": 2.622695207595825, + "learning_rate": 9.569499903196357e-06, + "loss": 0.9333, + "step": 3937 + }, + { + "epoch": 0.3182286510838603, + "grad_norm": 3.369231700897217, + "learning_rate": 9.569234232665405e-06, + "loss": 0.9316, + "step": 3938 + }, + { + "epoch": 0.3183094607971878, + "grad_norm": 2.9470860958099365, + "learning_rate": 9.56896848387447e-06, + "loss": 1.0316, + "step": 3939 + }, + { + "epoch": 0.3183902705105154, + "grad_norm": 3.0356106758117676, + "learning_rate": 9.568702656828103e-06, + "loss": 0.8713, + "step": 3940 + }, + { + "epoch": 0.3184710802238429, + "grad_norm": 2.354508399963379, + "learning_rate": 9.568436751530862e-06, + "loss": 1.0344, + "step": 3941 + }, + { + "epoch": 0.31855188993717043, + "grad_norm": 2.9620916843414307, + "learning_rate": 9.568170767987294e-06, + "loss": 0.8858, + "step": 3942 + }, + { + "epoch": 0.318632699650498, + "grad_norm": 3.0026702880859375, + "learning_rate": 9.567904706201961e-06, + "loss": 0.9729, + "step": 3943 + }, + { + "epoch": 0.31871350936382553, + "grad_norm": 2.5581581592559814, + "learning_rate": 9.567638566179414e-06, + "loss": 0.9812, + "step": 3944 + }, + { + "epoch": 0.31879431907715305, + "grad_norm": 2.487456798553467, + "learning_rate": 9.567372347924217e-06, + "loss": 0.9962, + "step": 3945 + }, + { + "epoch": 0.31887512879048063, + "grad_norm": 2.9502642154693604, + "learning_rate": 9.567106051440926e-06, + "loss": 0.9709, + "step": 3946 + }, + { + "epoch": 0.31895593850380816, + "grad_norm": 3.1015729904174805, + "learning_rate": 9.566839676734103e-06, + "loss": 0.9733, + "step": 3947 + }, + { + "epoch": 0.3190367482171357, + "grad_norm": 2.408635377883911, + "learning_rate": 9.566573223808313e-06, + "loss": 1.0635, + "step": 3948 + }, + { + "epoch": 0.31911755793046326, + "grad_norm": 3.367497444152832, + "learning_rate": 9.566306692668115e-06, + "loss": 0.9283, + "step": 3949 + }, + { + "epoch": 0.3191983676437908, + "grad_norm": 3.2404932975769043, + "learning_rate": 9.566040083318076e-06, + "loss": 1.0313, + "step": 3950 + }, + { + "epoch": 0.3192791773571183, + "grad_norm": 2.889040946960449, + "learning_rate": 9.565773395762763e-06, + "loss": 0.9841, + "step": 3951 + }, + { + "epoch": 0.3193599870704459, + "grad_norm": 2.901196241378784, + "learning_rate": 9.565506630006745e-06, + "loss": 1.0316, + "step": 3952 + }, + { + "epoch": 0.3194407967837734, + "grad_norm": 2.811598062515259, + "learning_rate": 9.565239786054587e-06, + "loss": 1.0213, + "step": 3953 + }, + { + "epoch": 0.31952160649710093, + "grad_norm": 3.011934757232666, + "learning_rate": 9.564972863910862e-06, + "loss": 1.0583, + "step": 3954 + }, + { + "epoch": 0.3196024162104285, + "grad_norm": 3.0957868099212646, + "learning_rate": 9.564705863580145e-06, + "loss": 0.9961, + "step": 3955 + }, + { + "epoch": 0.31968322592375603, + "grad_norm": 2.9000260829925537, + "learning_rate": 9.564438785067002e-06, + "loss": 1.0179, + "step": 3956 + }, + { + "epoch": 0.31976403563708355, + "grad_norm": 2.7005488872528076, + "learning_rate": 9.564171628376013e-06, + "loss": 0.9293, + "step": 3957 + }, + { + "epoch": 0.31984484535041113, + "grad_norm": 2.8162786960601807, + "learning_rate": 9.563904393511749e-06, + "loss": 0.9181, + "step": 3958 + }, + { + "epoch": 0.31992565506373866, + "grad_norm": 2.960033416748047, + "learning_rate": 9.563637080478793e-06, + "loss": 0.9736, + "step": 3959 + }, + { + "epoch": 0.3200064647770662, + "grad_norm": 3.206284523010254, + "learning_rate": 9.563369689281718e-06, + "loss": 0.8939, + "step": 3960 + }, + { + "epoch": 0.32008727449039376, + "grad_norm": 2.616919755935669, + "learning_rate": 9.563102219925109e-06, + "loss": 0.971, + "step": 3961 + }, + { + "epoch": 0.3201680842037213, + "grad_norm": 3.0238308906555176, + "learning_rate": 9.562834672413542e-06, + "loss": 0.9845, + "step": 3962 + }, + { + "epoch": 0.3202488939170488, + "grad_norm": 2.5969364643096924, + "learning_rate": 9.562567046751603e-06, + "loss": 1.0305, + "step": 3963 + }, + { + "epoch": 0.3203297036303764, + "grad_norm": 2.2757885456085205, + "learning_rate": 9.562299342943873e-06, + "loss": 0.999, + "step": 3964 + }, + { + "epoch": 0.3204105133437039, + "grad_norm": 2.4738149642944336, + "learning_rate": 9.56203156099494e-06, + "loss": 1.0553, + "step": 3965 + }, + { + "epoch": 0.32049132305703143, + "grad_norm": 2.243032217025757, + "learning_rate": 9.561763700909387e-06, + "loss": 1.1218, + "step": 3966 + }, + { + "epoch": 0.320572132770359, + "grad_norm": 2.5168676376342773, + "learning_rate": 9.561495762691804e-06, + "loss": 1.1254, + "step": 3967 + }, + { + "epoch": 0.32065294248368653, + "grad_norm": 2.716259717941284, + "learning_rate": 9.561227746346783e-06, + "loss": 0.8839, + "step": 3968 + }, + { + "epoch": 0.32073375219701405, + "grad_norm": 2.474407434463501, + "learning_rate": 9.560959651878908e-06, + "loss": 1.0295, + "step": 3969 + }, + { + "epoch": 0.32081456191034163, + "grad_norm": 2.8599421977996826, + "learning_rate": 9.560691479292777e-06, + "loss": 0.9559, + "step": 3970 + }, + { + "epoch": 0.32089537162366916, + "grad_norm": 2.726964235305786, + "learning_rate": 9.560423228592978e-06, + "loss": 0.8889, + "step": 3971 + }, + { + "epoch": 0.3209761813369967, + "grad_norm": 3.0214600563049316, + "learning_rate": 9.560154899784109e-06, + "loss": 1.0466, + "step": 3972 + }, + { + "epoch": 0.32105699105032426, + "grad_norm": 3.2783267498016357, + "learning_rate": 9.559886492870764e-06, + "loss": 1.0275, + "step": 3973 + }, + { + "epoch": 0.3211378007636518, + "grad_norm": 2.6123223304748535, + "learning_rate": 9.55961800785754e-06, + "loss": 0.9725, + "step": 3974 + }, + { + "epoch": 0.32121861047697936, + "grad_norm": 2.3589444160461426, + "learning_rate": 9.55934944474904e-06, + "loss": 0.9172, + "step": 3975 + }, + { + "epoch": 0.3212994201903069, + "grad_norm": 2.607569456100464, + "learning_rate": 9.559080803549857e-06, + "loss": 0.9393, + "step": 3976 + }, + { + "epoch": 0.3213802299036344, + "grad_norm": 2.7016165256500244, + "learning_rate": 9.558812084264595e-06, + "loss": 1.0372, + "step": 3977 + }, + { + "epoch": 0.321461039616962, + "grad_norm": 2.495267629623413, + "learning_rate": 9.55854328689786e-06, + "loss": 1.0041, + "step": 3978 + }, + { + "epoch": 0.3215418493302895, + "grad_norm": 2.5262086391448975, + "learning_rate": 9.558274411454249e-06, + "loss": 1.0064, + "step": 3979 + }, + { + "epoch": 0.32162265904361703, + "grad_norm": 2.77862286567688, + "learning_rate": 9.558005457938372e-06, + "loss": 1.0388, + "step": 3980 + }, + { + "epoch": 0.3217034687569446, + "grad_norm": 2.4823708534240723, + "learning_rate": 9.557736426354837e-06, + "loss": 0.9786, + "step": 3981 + }, + { + "epoch": 0.32178427847027213, + "grad_norm": 2.639963388442993, + "learning_rate": 9.557467316708246e-06, + "loss": 0.9069, + "step": 3982 + }, + { + "epoch": 0.32186508818359966, + "grad_norm": 3.4461371898651123, + "learning_rate": 9.557198129003213e-06, + "loss": 1.0156, + "step": 3983 + }, + { + "epoch": 0.32194589789692724, + "grad_norm": 2.6586742401123047, + "learning_rate": 9.556928863244348e-06, + "loss": 1.0775, + "step": 3984 + }, + { + "epoch": 0.32202670761025476, + "grad_norm": 2.1365299224853516, + "learning_rate": 9.556659519436262e-06, + "loss": 0.8579, + "step": 3985 + }, + { + "epoch": 0.3221075173235823, + "grad_norm": 2.56663179397583, + "learning_rate": 9.556390097583567e-06, + "loss": 1.0838, + "step": 3986 + }, + { + "epoch": 0.32218832703690986, + "grad_norm": 2.783458948135376, + "learning_rate": 9.556120597690879e-06, + "loss": 0.9394, + "step": 3987 + }, + { + "epoch": 0.3222691367502374, + "grad_norm": 2.74609112739563, + "learning_rate": 9.555851019762812e-06, + "loss": 1.0268, + "step": 3988 + }, + { + "epoch": 0.3223499464635649, + "grad_norm": 2.665937662124634, + "learning_rate": 9.555581363803987e-06, + "loss": 1.0699, + "step": 3989 + }, + { + "epoch": 0.3224307561768925, + "grad_norm": 2.3670506477355957, + "learning_rate": 9.55531162981902e-06, + "loss": 1.014, + "step": 3990 + }, + { + "epoch": 0.32251156589022, + "grad_norm": 2.883831739425659, + "learning_rate": 9.555041817812531e-06, + "loss": 0.9548, + "step": 3991 + }, + { + "epoch": 0.32259237560354753, + "grad_norm": 2.978527307510376, + "learning_rate": 9.554771927789142e-06, + "loss": 0.984, + "step": 3992 + }, + { + "epoch": 0.3226731853168751, + "grad_norm": 2.601973295211792, + "learning_rate": 9.554501959753472e-06, + "loss": 1.0509, + "step": 3993 + }, + { + "epoch": 0.32275399503020263, + "grad_norm": 3.092226266860962, + "learning_rate": 9.554231913710153e-06, + "loss": 0.995, + "step": 3994 + }, + { + "epoch": 0.32283480474353016, + "grad_norm": 2.800023078918457, + "learning_rate": 9.5539617896638e-06, + "loss": 1.015, + "step": 3995 + }, + { + "epoch": 0.32291561445685774, + "grad_norm": 2.28165340423584, + "learning_rate": 9.553691587619048e-06, + "loss": 0.9769, + "step": 3996 + }, + { + "epoch": 0.32299642417018526, + "grad_norm": 2.8008766174316406, + "learning_rate": 9.553421307580521e-06, + "loss": 1.0006, + "step": 3997 + }, + { + "epoch": 0.3230772338835128, + "grad_norm": 3.051344394683838, + "learning_rate": 9.55315094955285e-06, + "loss": 0.9896, + "step": 3998 + }, + { + "epoch": 0.32315804359684036, + "grad_norm": 2.8846046924591064, + "learning_rate": 9.552880513540663e-06, + "loss": 1.0331, + "step": 3999 + }, + { + "epoch": 0.3232388533101679, + "grad_norm": 2.6979334354400635, + "learning_rate": 9.552609999548594e-06, + "loss": 0.8969, + "step": 4000 + }, + { + "epoch": 0.3232388533101679, + "eval_loss": 0.8279703855514526, + "eval_runtime": 814.558, + "eval_samples_per_second": 102.345, + "eval_steps_per_second": 12.793, + "step": 4000 + }, + { + "epoch": 0.3233196630234954, + "grad_norm": 2.881255865097046, + "learning_rate": 9.552339407581275e-06, + "loss": 0.9799, + "step": 4001 + }, + { + "epoch": 0.323400472736823, + "grad_norm": 2.7741382122039795, + "learning_rate": 9.55206873764334e-06, + "loss": 0.9929, + "step": 4002 + }, + { + "epoch": 0.3234812824501505, + "grad_norm": 2.676832914352417, + "learning_rate": 9.55179798973943e-06, + "loss": 0.9728, + "step": 4003 + }, + { + "epoch": 0.32356209216347803, + "grad_norm": 2.584409475326538, + "learning_rate": 9.551527163874174e-06, + "loss": 0.9822, + "step": 4004 + }, + { + "epoch": 0.3236429018768056, + "grad_norm": 2.607215166091919, + "learning_rate": 9.55125626005222e-06, + "loss": 0.9944, + "step": 4005 + }, + { + "epoch": 0.32372371159013313, + "grad_norm": 2.788327217102051, + "learning_rate": 9.5509852782782e-06, + "loss": 0.9474, + "step": 4006 + }, + { + "epoch": 0.32380452130346066, + "grad_norm": 3.0161447525024414, + "learning_rate": 9.550714218556759e-06, + "loss": 0.9448, + "step": 4007 + }, + { + "epoch": 0.32388533101678824, + "grad_norm": 2.7205970287323, + "learning_rate": 9.550443080892538e-06, + "loss": 1.0268, + "step": 4008 + }, + { + "epoch": 0.32396614073011576, + "grad_norm": 2.9318370819091797, + "learning_rate": 9.550171865290181e-06, + "loss": 0.9557, + "step": 4009 + }, + { + "epoch": 0.3240469504434433, + "grad_norm": 2.945824384689331, + "learning_rate": 9.549900571754336e-06, + "loss": 0.927, + "step": 4010 + }, + { + "epoch": 0.32412776015677086, + "grad_norm": 2.685911178588867, + "learning_rate": 9.549629200289646e-06, + "loss": 0.9796, + "step": 4011 + }, + { + "epoch": 0.3242085698700984, + "grad_norm": 2.4019417762756348, + "learning_rate": 9.549357750900762e-06, + "loss": 0.9273, + "step": 4012 + }, + { + "epoch": 0.3242893795834259, + "grad_norm": 3.1122894287109375, + "learning_rate": 9.54908622359233e-06, + "loss": 0.9409, + "step": 4013 + }, + { + "epoch": 0.3243701892967535, + "grad_norm": 2.6726021766662598, + "learning_rate": 9.548814618369004e-06, + "loss": 1.0427, + "step": 4014 + }, + { + "epoch": 0.324450999010081, + "grad_norm": 2.7439935207366943, + "learning_rate": 9.548542935235433e-06, + "loss": 0.8824, + "step": 4015 + }, + { + "epoch": 0.32453180872340853, + "grad_norm": 2.644510269165039, + "learning_rate": 9.548271174196275e-06, + "loss": 0.9333, + "step": 4016 + }, + { + "epoch": 0.3246126184367361, + "grad_norm": 3.181877613067627, + "learning_rate": 9.547999335256179e-06, + "loss": 0.9233, + "step": 4017 + }, + { + "epoch": 0.32469342815006363, + "grad_norm": 2.7948482036590576, + "learning_rate": 9.547727418419802e-06, + "loss": 1.0905, + "step": 4018 + }, + { + "epoch": 0.32477423786339116, + "grad_norm": 2.7870097160339355, + "learning_rate": 9.547455423691804e-06, + "loss": 0.9834, + "step": 4019 + }, + { + "epoch": 0.32485504757671874, + "grad_norm": 2.678520679473877, + "learning_rate": 9.547183351076843e-06, + "loss": 0.919, + "step": 4020 + }, + { + "epoch": 0.32493585729004626, + "grad_norm": 2.41336727142334, + "learning_rate": 9.546911200579579e-06, + "loss": 1.0095, + "step": 4021 + }, + { + "epoch": 0.3250166670033738, + "grad_norm": 2.888129949569702, + "learning_rate": 9.546638972204669e-06, + "loss": 0.9891, + "step": 4022 + }, + { + "epoch": 0.32509747671670136, + "grad_norm": 2.523606538772583, + "learning_rate": 9.54636666595678e-06, + "loss": 1.0194, + "step": 4023 + }, + { + "epoch": 0.3251782864300289, + "grad_norm": 2.9586949348449707, + "learning_rate": 9.546094281840576e-06, + "loss": 1.0903, + "step": 4024 + }, + { + "epoch": 0.3252590961433564, + "grad_norm": 3.0866212844848633, + "learning_rate": 9.545821819860722e-06, + "loss": 0.9632, + "step": 4025 + }, + { + "epoch": 0.325339905856684, + "grad_norm": 2.503249168395996, + "learning_rate": 9.545549280021882e-06, + "loss": 0.9814, + "step": 4026 + }, + { + "epoch": 0.3254207155700115, + "grad_norm": 3.371840000152588, + "learning_rate": 9.545276662328727e-06, + "loss": 0.9856, + "step": 4027 + }, + { + "epoch": 0.32550152528333903, + "grad_norm": 2.9147255420684814, + "learning_rate": 9.545003966785922e-06, + "loss": 0.9717, + "step": 4028 + }, + { + "epoch": 0.3255823349966666, + "grad_norm": 2.737259864807129, + "learning_rate": 9.544731193398144e-06, + "loss": 1.047, + "step": 4029 + }, + { + "epoch": 0.32566314470999413, + "grad_norm": 2.514043092727661, + "learning_rate": 9.54445834217006e-06, + "loss": 0.979, + "step": 4030 + }, + { + "epoch": 0.32574395442332166, + "grad_norm": 2.676771402359009, + "learning_rate": 9.544185413106345e-06, + "loss": 0.9492, + "step": 4031 + }, + { + "epoch": 0.32582476413664924, + "grad_norm": 2.657778024673462, + "learning_rate": 9.543912406211677e-06, + "loss": 1.0499, + "step": 4032 + }, + { + "epoch": 0.32590557384997676, + "grad_norm": 2.5434489250183105, + "learning_rate": 9.543639321490725e-06, + "loss": 0.9831, + "step": 4033 + }, + { + "epoch": 0.3259863835633043, + "grad_norm": 2.455796003341675, + "learning_rate": 9.543366158948171e-06, + "loss": 1.0414, + "step": 4034 + }, + { + "epoch": 0.32606719327663186, + "grad_norm": 2.7317655086517334, + "learning_rate": 9.543092918588691e-06, + "loss": 0.9999, + "step": 4035 + }, + { + "epoch": 0.3261480029899594, + "grad_norm": 2.511050224304199, + "learning_rate": 9.542819600416968e-06, + "loss": 0.9754, + "step": 4036 + }, + { + "epoch": 0.3262288127032869, + "grad_norm": 2.6154658794403076, + "learning_rate": 9.54254620443768e-06, + "loss": 0.912, + "step": 4037 + }, + { + "epoch": 0.3263096224166145, + "grad_norm": 2.6074345111846924, + "learning_rate": 9.542272730655513e-06, + "loss": 1.0228, + "step": 4038 + }, + { + "epoch": 0.326390432129942, + "grad_norm": 2.535208225250244, + "learning_rate": 9.541999179075147e-06, + "loss": 0.9436, + "step": 4039 + }, + { + "epoch": 0.3264712418432696, + "grad_norm": 2.6358659267425537, + "learning_rate": 9.541725549701273e-06, + "loss": 0.8877, + "step": 4040 + }, + { + "epoch": 0.3265520515565971, + "grad_norm": 3.4987316131591797, + "learning_rate": 9.541451842538571e-06, + "loss": 0.9417, + "step": 4041 + }, + { + "epoch": 0.32663286126992463, + "grad_norm": 2.594028949737549, + "learning_rate": 9.541178057591733e-06, + "loss": 1.0146, + "step": 4042 + }, + { + "epoch": 0.3267136709832522, + "grad_norm": 2.839911460876465, + "learning_rate": 9.540904194865448e-06, + "loss": 0.9229, + "step": 4043 + }, + { + "epoch": 0.32679448069657974, + "grad_norm": 2.946542263031006, + "learning_rate": 9.540630254364404e-06, + "loss": 1.0548, + "step": 4044 + }, + { + "epoch": 0.32687529040990726, + "grad_norm": 2.7420732975006104, + "learning_rate": 9.540356236093296e-06, + "loss": 0.9801, + "step": 4045 + }, + { + "epoch": 0.32695610012323484, + "grad_norm": 2.915590286254883, + "learning_rate": 9.540082140056816e-06, + "loss": 0.9744, + "step": 4046 + }, + { + "epoch": 0.32703690983656236, + "grad_norm": 2.9443070888519287, + "learning_rate": 9.53980796625966e-06, + "loss": 0.9807, + "step": 4047 + }, + { + "epoch": 0.3271177195498899, + "grad_norm": 3.117426872253418, + "learning_rate": 9.539533714706522e-06, + "loss": 1.0068, + "step": 4048 + }, + { + "epoch": 0.32719852926321746, + "grad_norm": 3.088529348373413, + "learning_rate": 9.5392593854021e-06, + "loss": 1.0154, + "step": 4049 + }, + { + "epoch": 0.327279338976545, + "grad_norm": 2.6547067165374756, + "learning_rate": 9.538984978351092e-06, + "loss": 1.0103, + "step": 4050 + }, + { + "epoch": 0.3273601486898725, + "grad_norm": 3.094447135925293, + "learning_rate": 9.538710493558199e-06, + "loss": 0.975, + "step": 4051 + }, + { + "epoch": 0.3274409584032001, + "grad_norm": 2.459456443786621, + "learning_rate": 9.53843593102812e-06, + "loss": 1.0546, + "step": 4052 + }, + { + "epoch": 0.3275217681165276, + "grad_norm": 2.74337100982666, + "learning_rate": 9.538161290765561e-06, + "loss": 0.9193, + "step": 4053 + }, + { + "epoch": 0.32760257782985513, + "grad_norm": 2.9267196655273438, + "learning_rate": 9.537886572775225e-06, + "loss": 0.9215, + "step": 4054 + }, + { + "epoch": 0.3276833875431827, + "grad_norm": 3.0507187843322754, + "learning_rate": 9.537611777061815e-06, + "loss": 0.9681, + "step": 4055 + }, + { + "epoch": 0.32776419725651024, + "grad_norm": 3.12231707572937, + "learning_rate": 9.537336903630038e-06, + "loss": 0.9909, + "step": 4056 + }, + { + "epoch": 0.32784500696983776, + "grad_norm": 2.637436866760254, + "learning_rate": 9.537061952484606e-06, + "loss": 0.8506, + "step": 4057 + }, + { + "epoch": 0.32792581668316534, + "grad_norm": 2.2959401607513428, + "learning_rate": 9.536786923630223e-06, + "loss": 0.9894, + "step": 4058 + }, + { + "epoch": 0.32800662639649286, + "grad_norm": 3.118029832839966, + "learning_rate": 9.536511817071602e-06, + "loss": 1.0285, + "step": 4059 + }, + { + "epoch": 0.3280874361098204, + "grad_norm": 2.8827004432678223, + "learning_rate": 9.536236632813458e-06, + "loss": 1.0687, + "step": 4060 + }, + { + "epoch": 0.32816824582314796, + "grad_norm": 3.069139242172241, + "learning_rate": 9.535961370860497e-06, + "loss": 1.1305, + "step": 4061 + }, + { + "epoch": 0.3282490555364755, + "grad_norm": 2.61059832572937, + "learning_rate": 9.53568603121744e-06, + "loss": 0.9611, + "step": 4062 + }, + { + "epoch": 0.328329865249803, + "grad_norm": 2.6590816974639893, + "learning_rate": 9.535410613889e-06, + "loss": 0.9605, + "step": 4063 + }, + { + "epoch": 0.3284106749631306, + "grad_norm": 2.606452465057373, + "learning_rate": 9.535135118879895e-06, + "loss": 0.9853, + "step": 4064 + }, + { + "epoch": 0.3284914846764581, + "grad_norm": 2.8682236671447754, + "learning_rate": 9.534859546194843e-06, + "loss": 1.0149, + "step": 4065 + }, + { + "epoch": 0.32857229438978564, + "grad_norm": 3.171236515045166, + "learning_rate": 9.534583895838562e-06, + "loss": 0.9628, + "step": 4066 + }, + { + "epoch": 0.3286531041031132, + "grad_norm": 2.8859357833862305, + "learning_rate": 9.534308167815778e-06, + "loss": 1.0213, + "step": 4067 + }, + { + "epoch": 0.32873391381644074, + "grad_norm": 2.733534097671509, + "learning_rate": 9.534032362131211e-06, + "loss": 0.9466, + "step": 4068 + }, + { + "epoch": 0.32881472352976826, + "grad_norm": 2.582289218902588, + "learning_rate": 9.533756478789585e-06, + "loss": 0.9773, + "step": 4069 + }, + { + "epoch": 0.32889553324309584, + "grad_norm": 3.1076979637145996, + "learning_rate": 9.533480517795623e-06, + "loss": 0.9909, + "step": 4070 + }, + { + "epoch": 0.32897634295642336, + "grad_norm": 3.08135724067688, + "learning_rate": 9.533204479154056e-06, + "loss": 0.9729, + "step": 4071 + }, + { + "epoch": 0.3290571526697509, + "grad_norm": 2.463721513748169, + "learning_rate": 9.532928362869609e-06, + "loss": 1.0492, + "step": 4072 + }, + { + "epoch": 0.32913796238307846, + "grad_norm": 2.6856791973114014, + "learning_rate": 9.532652168947011e-06, + "loss": 0.9851, + "step": 4073 + }, + { + "epoch": 0.329218772096406, + "grad_norm": 3.083483934402466, + "learning_rate": 9.532375897390993e-06, + "loss": 0.9419, + "step": 4074 + }, + { + "epoch": 0.3292995818097335, + "grad_norm": 2.5787622928619385, + "learning_rate": 9.532099548206288e-06, + "loss": 0.9431, + "step": 4075 + }, + { + "epoch": 0.3293803915230611, + "grad_norm": 2.474858283996582, + "learning_rate": 9.531823121397628e-06, + "loss": 1.0828, + "step": 4076 + }, + { + "epoch": 0.3294612012363886, + "grad_norm": 2.7180659770965576, + "learning_rate": 9.531546616969747e-06, + "loss": 0.8964, + "step": 4077 + }, + { + "epoch": 0.32954201094971614, + "grad_norm": 2.3666176795959473, + "learning_rate": 9.531270034927383e-06, + "loss": 0.9429, + "step": 4078 + }, + { + "epoch": 0.3296228206630437, + "grad_norm": 2.6842448711395264, + "learning_rate": 9.530993375275272e-06, + "loss": 0.9373, + "step": 4079 + }, + { + "epoch": 0.32970363037637124, + "grad_norm": 2.9368889331817627, + "learning_rate": 9.53071663801815e-06, + "loss": 0.9983, + "step": 4080 + }, + { + "epoch": 0.32978444008969876, + "grad_norm": 2.569736957550049, + "learning_rate": 9.530439823160761e-06, + "loss": 0.9224, + "step": 4081 + }, + { + "epoch": 0.32986524980302634, + "grad_norm": 2.9519383907318115, + "learning_rate": 9.530162930707846e-06, + "loss": 0.9483, + "step": 4082 + }, + { + "epoch": 0.32994605951635386, + "grad_norm": 2.475437641143799, + "learning_rate": 9.529885960664144e-06, + "loss": 1.0032, + "step": 4083 + }, + { + "epoch": 0.3300268692296814, + "grad_norm": 2.5936172008514404, + "learning_rate": 9.529608913034401e-06, + "loss": 1.1032, + "step": 4084 + }, + { + "epoch": 0.33010767894300896, + "grad_norm": 2.4394097328186035, + "learning_rate": 9.529331787823362e-06, + "loss": 0.9851, + "step": 4085 + }, + { + "epoch": 0.3301884886563365, + "grad_norm": 2.895510673522949, + "learning_rate": 9.529054585035774e-06, + "loss": 0.9855, + "step": 4086 + }, + { + "epoch": 0.330269298369664, + "grad_norm": 3.4800808429718018, + "learning_rate": 9.528777304676384e-06, + "loss": 0.9552, + "step": 4087 + }, + { + "epoch": 0.3303501080829916, + "grad_norm": 2.9893293380737305, + "learning_rate": 9.528499946749941e-06, + "loss": 1.0569, + "step": 4088 + }, + { + "epoch": 0.3304309177963191, + "grad_norm": 2.711333990097046, + "learning_rate": 9.528222511261196e-06, + "loss": 0.9181, + "step": 4089 + }, + { + "epoch": 0.33051172750964664, + "grad_norm": 3.0602447986602783, + "learning_rate": 9.527944998214899e-06, + "loss": 0.9545, + "step": 4090 + }, + { + "epoch": 0.3305925372229742, + "grad_norm": 2.645359516143799, + "learning_rate": 9.527667407615807e-06, + "loss": 1.0609, + "step": 4091 + }, + { + "epoch": 0.33067334693630174, + "grad_norm": 2.887587785720825, + "learning_rate": 9.527389739468672e-06, + "loss": 1.0834, + "step": 4092 + }, + { + "epoch": 0.33075415664962926, + "grad_norm": 2.4695920944213867, + "learning_rate": 9.527111993778248e-06, + "loss": 0.8339, + "step": 4093 + }, + { + "epoch": 0.33083496636295684, + "grad_norm": 2.296949863433838, + "learning_rate": 9.526834170549298e-06, + "loss": 1.0393, + "step": 4094 + }, + { + "epoch": 0.33091577607628436, + "grad_norm": 3.023770332336426, + "learning_rate": 9.526556269786573e-06, + "loss": 0.9751, + "step": 4095 + }, + { + "epoch": 0.3309965857896119, + "grad_norm": 2.6461973190307617, + "learning_rate": 9.526278291494837e-06, + "loss": 1.1395, + "step": 4096 + }, + { + "epoch": 0.33107739550293946, + "grad_norm": 2.568086624145508, + "learning_rate": 9.526000235678852e-06, + "loss": 0.9888, + "step": 4097 + }, + { + "epoch": 0.331158205216267, + "grad_norm": 3.141347885131836, + "learning_rate": 9.525722102343377e-06, + "loss": 0.9531, + "step": 4098 + }, + { + "epoch": 0.3312390149295945, + "grad_norm": 2.9465346336364746, + "learning_rate": 9.525443891493178e-06, + "loss": 0.979, + "step": 4099 + }, + { + "epoch": 0.3313198246429221, + "grad_norm": 3.0027921199798584, + "learning_rate": 9.52516560313302e-06, + "loss": 0.9878, + "step": 4100 + }, + { + "epoch": 0.3314006343562496, + "grad_norm": 2.8586180210113525, + "learning_rate": 9.524887237267671e-06, + "loss": 0.9823, + "step": 4101 + }, + { + "epoch": 0.33148144406957714, + "grad_norm": 3.216400384902954, + "learning_rate": 9.524608793901893e-06, + "loss": 1.0316, + "step": 4102 + }, + { + "epoch": 0.3315622537829047, + "grad_norm": 2.8101775646209717, + "learning_rate": 9.524330273040462e-06, + "loss": 0.9666, + "step": 4103 + }, + { + "epoch": 0.33164306349623224, + "grad_norm": 3.338576078414917, + "learning_rate": 9.524051674688145e-06, + "loss": 1.0002, + "step": 4104 + }, + { + "epoch": 0.3317238732095598, + "grad_norm": 2.9538190364837646, + "learning_rate": 9.523772998849712e-06, + "loss": 0.9646, + "step": 4105 + }, + { + "epoch": 0.33180468292288734, + "grad_norm": 2.605074167251587, + "learning_rate": 9.52349424552994e-06, + "loss": 0.9202, + "step": 4106 + }, + { + "epoch": 0.33188549263621486, + "grad_norm": 2.7699732780456543, + "learning_rate": 9.5232154147336e-06, + "loss": 0.9016, + "step": 4107 + }, + { + "epoch": 0.33196630234954244, + "grad_norm": 2.916870355606079, + "learning_rate": 9.52293650646547e-06, + "loss": 0.8376, + "step": 4108 + }, + { + "epoch": 0.33204711206286996, + "grad_norm": 2.7381067276000977, + "learning_rate": 9.522657520730327e-06, + "loss": 0.9252, + "step": 4109 + }, + { + "epoch": 0.3321279217761975, + "grad_norm": 2.4672820568084717, + "learning_rate": 9.522378457532948e-06, + "loss": 1.0367, + "step": 4110 + }, + { + "epoch": 0.33220873148952507, + "grad_norm": 2.7581498622894287, + "learning_rate": 9.522099316878113e-06, + "loss": 1.0315, + "step": 4111 + }, + { + "epoch": 0.3322895412028526, + "grad_norm": 2.723599433898926, + "learning_rate": 9.521820098770602e-06, + "loss": 0.9118, + "step": 4112 + }, + { + "epoch": 0.3323703509161801, + "grad_norm": 2.8597822189331055, + "learning_rate": 9.521540803215199e-06, + "loss": 1.0321, + "step": 4113 + }, + { + "epoch": 0.3324511606295077, + "grad_norm": 2.4662044048309326, + "learning_rate": 9.521261430216689e-06, + "loss": 0.9933, + "step": 4114 + }, + { + "epoch": 0.3325319703428352, + "grad_norm": 2.8004307746887207, + "learning_rate": 9.520981979779853e-06, + "loss": 1.0135, + "step": 4115 + }, + { + "epoch": 0.33261278005616274, + "grad_norm": 2.5971570014953613, + "learning_rate": 9.520702451909481e-06, + "loss": 0.92, + "step": 4116 + }, + { + "epoch": 0.3326935897694903, + "grad_norm": 2.7834208011627197, + "learning_rate": 9.520422846610359e-06, + "loss": 1.0115, + "step": 4117 + }, + { + "epoch": 0.33277439948281784, + "grad_norm": 2.5413529872894287, + "learning_rate": 9.520143163887277e-06, + "loss": 0.9328, + "step": 4118 + }, + { + "epoch": 0.33285520919614536, + "grad_norm": 2.7738726139068604, + "learning_rate": 9.519863403745023e-06, + "loss": 1.0739, + "step": 4119 + }, + { + "epoch": 0.33293601890947294, + "grad_norm": 2.608670949935913, + "learning_rate": 9.519583566188389e-06, + "loss": 1.0701, + "step": 4120 + }, + { + "epoch": 0.33301682862280046, + "grad_norm": 2.788914442062378, + "learning_rate": 9.519303651222171e-06, + "loss": 1.0519, + "step": 4121 + }, + { + "epoch": 0.333097638336128, + "grad_norm": 3.1283442974090576, + "learning_rate": 9.51902365885116e-06, + "loss": 1.0496, + "step": 4122 + }, + { + "epoch": 0.33317844804945557, + "grad_norm": 2.777540922164917, + "learning_rate": 9.518743589080153e-06, + "loss": 0.9842, + "step": 4123 + }, + { + "epoch": 0.3332592577627831, + "grad_norm": 2.6435718536376953, + "learning_rate": 9.518463441913947e-06, + "loss": 0.948, + "step": 4124 + }, + { + "epoch": 0.3333400674761106, + "grad_norm": 2.7920753955841064, + "learning_rate": 9.51818321735734e-06, + "loss": 1.032, + "step": 4125 + }, + { + "epoch": 0.3334208771894382, + "grad_norm": 2.519684076309204, + "learning_rate": 9.517902915415131e-06, + "loss": 0.9893, + "step": 4126 + }, + { + "epoch": 0.3335016869027657, + "grad_norm": 2.717278480529785, + "learning_rate": 9.517622536092123e-06, + "loss": 1.0427, + "step": 4127 + }, + { + "epoch": 0.33358249661609324, + "grad_norm": 2.4553167819976807, + "learning_rate": 9.517342079393114e-06, + "loss": 0.992, + "step": 4128 + }, + { + "epoch": 0.3336633063294208, + "grad_norm": 2.500657320022583, + "learning_rate": 9.517061545322912e-06, + "loss": 1.003, + "step": 4129 + }, + { + "epoch": 0.33374411604274834, + "grad_norm": 2.820107936859131, + "learning_rate": 9.51678093388632e-06, + "loss": 1.0727, + "step": 4130 + }, + { + "epoch": 0.33382492575607586, + "grad_norm": 2.645263910293579, + "learning_rate": 9.516500245088144e-06, + "loss": 0.9722, + "step": 4131 + }, + { + "epoch": 0.33390573546940344, + "grad_norm": 2.426103353500366, + "learning_rate": 9.516219478933193e-06, + "loss": 1.1225, + "step": 4132 + }, + { + "epoch": 0.33398654518273097, + "grad_norm": 2.993973970413208, + "learning_rate": 9.515938635426274e-06, + "loss": 0.9664, + "step": 4133 + }, + { + "epoch": 0.3340673548960585, + "grad_norm": 2.6964805126190186, + "learning_rate": 9.515657714572197e-06, + "loss": 0.9502, + "step": 4134 + }, + { + "epoch": 0.33414816460938607, + "grad_norm": 2.846302032470703, + "learning_rate": 9.515376716375777e-06, + "loss": 1.0977, + "step": 4135 + }, + { + "epoch": 0.3342289743227136, + "grad_norm": 2.6280717849731445, + "learning_rate": 9.515095640841822e-06, + "loss": 0.925, + "step": 4136 + }, + { + "epoch": 0.3343097840360411, + "grad_norm": 3.2251338958740234, + "learning_rate": 9.514814487975148e-06, + "loss": 1.063, + "step": 4137 + }, + { + "epoch": 0.3343905937493687, + "grad_norm": 2.701317310333252, + "learning_rate": 9.514533257780571e-06, + "loss": 0.967, + "step": 4138 + }, + { + "epoch": 0.3344714034626962, + "grad_norm": 2.729400396347046, + "learning_rate": 9.514251950262908e-06, + "loss": 1.0842, + "step": 4139 + }, + { + "epoch": 0.33455221317602374, + "grad_norm": 2.852540969848633, + "learning_rate": 9.513970565426978e-06, + "loss": 0.9324, + "step": 4140 + }, + { + "epoch": 0.3346330228893513, + "grad_norm": 2.7535130977630615, + "learning_rate": 9.513689103277597e-06, + "loss": 0.9181, + "step": 4141 + }, + { + "epoch": 0.33471383260267884, + "grad_norm": 2.5309736728668213, + "learning_rate": 9.513407563819589e-06, + "loss": 0.9963, + "step": 4142 + }, + { + "epoch": 0.33479464231600636, + "grad_norm": 2.8194198608398438, + "learning_rate": 9.513125947057776e-06, + "loss": 1.0348, + "step": 4143 + }, + { + "epoch": 0.33487545202933394, + "grad_norm": 2.8604578971862793, + "learning_rate": 9.512844252996978e-06, + "loss": 1.1125, + "step": 4144 + }, + { + "epoch": 0.33495626174266147, + "grad_norm": 2.880497932434082, + "learning_rate": 9.512562481642023e-06, + "loss": 0.9588, + "step": 4145 + }, + { + "epoch": 0.335037071455989, + "grad_norm": 2.5192902088165283, + "learning_rate": 9.512280632997737e-06, + "loss": 1.0429, + "step": 4146 + }, + { + "epoch": 0.33511788116931657, + "grad_norm": 2.6330995559692383, + "learning_rate": 9.511998707068946e-06, + "loss": 0.9643, + "step": 4147 + }, + { + "epoch": 0.3351986908826441, + "grad_norm": 2.7292113304138184, + "learning_rate": 9.511716703860479e-06, + "loss": 0.9461, + "step": 4148 + }, + { + "epoch": 0.3352795005959716, + "grad_norm": 2.478864908218384, + "learning_rate": 9.511434623377167e-06, + "loss": 0.9124, + "step": 4149 + }, + { + "epoch": 0.3353603103092992, + "grad_norm": 2.4361627101898193, + "learning_rate": 9.51115246562384e-06, + "loss": 0.9846, + "step": 4150 + }, + { + "epoch": 0.3354411200226267, + "grad_norm": 2.6726222038269043, + "learning_rate": 9.510870230605333e-06, + "loss": 1.0666, + "step": 4151 + }, + { + "epoch": 0.33552192973595424, + "grad_norm": 3.023792266845703, + "learning_rate": 9.510587918326477e-06, + "loss": 1.0163, + "step": 4152 + }, + { + "epoch": 0.3356027394492818, + "grad_norm": 3.5809295177459717, + "learning_rate": 9.51030552879211e-06, + "loss": 0.9217, + "step": 4153 + }, + { + "epoch": 0.33568354916260934, + "grad_norm": 2.224635601043701, + "learning_rate": 9.510023062007067e-06, + "loss": 0.9508, + "step": 4154 + }, + { + "epoch": 0.33576435887593686, + "grad_norm": 2.5898799896240234, + "learning_rate": 9.509740517976186e-06, + "loss": 0.9548, + "step": 4155 + }, + { + "epoch": 0.33584516858926444, + "grad_norm": 2.8230583667755127, + "learning_rate": 9.509457896704306e-06, + "loss": 0.9562, + "step": 4156 + }, + { + "epoch": 0.33592597830259197, + "grad_norm": 2.452807664871216, + "learning_rate": 9.50917519819627e-06, + "loss": 1.0768, + "step": 4157 + }, + { + "epoch": 0.3360067880159195, + "grad_norm": 2.7506778240203857, + "learning_rate": 9.508892422456916e-06, + "loss": 1.0678, + "step": 4158 + }, + { + "epoch": 0.33608759772924707, + "grad_norm": 2.8394293785095215, + "learning_rate": 9.508609569491091e-06, + "loss": 0.9876, + "step": 4159 + }, + { + "epoch": 0.3361684074425746, + "grad_norm": 3.0543792247772217, + "learning_rate": 9.508326639303639e-06, + "loss": 0.9489, + "step": 4160 + }, + { + "epoch": 0.3362492171559021, + "grad_norm": 2.382162570953369, + "learning_rate": 9.508043631899405e-06, + "loss": 1.0068, + "step": 4161 + }, + { + "epoch": 0.3363300268692297, + "grad_norm": 2.6702747344970703, + "learning_rate": 9.507760547283233e-06, + "loss": 0.94, + "step": 4162 + }, + { + "epoch": 0.3364108365825572, + "grad_norm": 2.6312103271484375, + "learning_rate": 9.507477385459978e-06, + "loss": 1.0088, + "step": 4163 + }, + { + "epoch": 0.33649164629588474, + "grad_norm": 3.097381353378296, + "learning_rate": 9.507194146434486e-06, + "loss": 0.8685, + "step": 4164 + }, + { + "epoch": 0.3365724560092123, + "grad_norm": 2.7595736980438232, + "learning_rate": 9.50691083021161e-06, + "loss": 1.0228, + "step": 4165 + }, + { + "epoch": 0.33665326572253984, + "grad_norm": 2.5723166465759277, + "learning_rate": 9.506627436796199e-06, + "loss": 0.8889, + "step": 4166 + }, + { + "epoch": 0.33673407543586736, + "grad_norm": 3.3848299980163574, + "learning_rate": 9.50634396619311e-06, + "loss": 1.0653, + "step": 4167 + }, + { + "epoch": 0.33681488514919494, + "grad_norm": 2.8883447647094727, + "learning_rate": 9.506060418407197e-06, + "loss": 0.8401, + "step": 4168 + }, + { + "epoch": 0.33689569486252247, + "grad_norm": 2.591768980026245, + "learning_rate": 9.505776793443318e-06, + "loss": 1.0239, + "step": 4169 + }, + { + "epoch": 0.33697650457585004, + "grad_norm": 2.822874069213867, + "learning_rate": 9.50549309130633e-06, + "loss": 1.0274, + "step": 4170 + }, + { + "epoch": 0.33705731428917757, + "grad_norm": 2.902327537536621, + "learning_rate": 9.505209312001091e-06, + "loss": 1.1014, + "step": 4171 + }, + { + "epoch": 0.3371381240025051, + "grad_norm": 2.242377996444702, + "learning_rate": 9.504925455532463e-06, + "loss": 0.9613, + "step": 4172 + }, + { + "epoch": 0.33721893371583267, + "grad_norm": 2.6212575435638428, + "learning_rate": 9.504641521905306e-06, + "loss": 0.8804, + "step": 4173 + }, + { + "epoch": 0.3372997434291602, + "grad_norm": 2.40804123878479, + "learning_rate": 9.504357511124487e-06, + "loss": 0.9385, + "step": 4174 + }, + { + "epoch": 0.3373805531424877, + "grad_norm": 2.574183940887451, + "learning_rate": 9.504073423194864e-06, + "loss": 0.9373, + "step": 4175 + }, + { + "epoch": 0.3374613628558153, + "grad_norm": 2.52545428276062, + "learning_rate": 9.503789258121309e-06, + "loss": 1.1001, + "step": 4176 + }, + { + "epoch": 0.3375421725691428, + "grad_norm": 2.7722158432006836, + "learning_rate": 9.503505015908685e-06, + "loss": 0.9674, + "step": 4177 + }, + { + "epoch": 0.33762298228247034, + "grad_norm": 2.7113494873046875, + "learning_rate": 9.503220696561863e-06, + "loss": 1.0573, + "step": 4178 + }, + { + "epoch": 0.3377037919957979, + "grad_norm": 2.638622999191284, + "learning_rate": 9.50293630008571e-06, + "loss": 0.8603, + "step": 4179 + }, + { + "epoch": 0.33778460170912544, + "grad_norm": 2.8496179580688477, + "learning_rate": 9.5026518264851e-06, + "loss": 0.9971, + "step": 4180 + }, + { + "epoch": 0.33786541142245297, + "grad_norm": 3.526613473892212, + "learning_rate": 9.502367275764904e-06, + "loss": 0.8981, + "step": 4181 + }, + { + "epoch": 0.33794622113578054, + "grad_norm": 2.460406541824341, + "learning_rate": 9.502082647929993e-06, + "loss": 0.9297, + "step": 4182 + }, + { + "epoch": 0.33802703084910807, + "grad_norm": 2.441196918487549, + "learning_rate": 9.501797942985247e-06, + "loss": 1.0186, + "step": 4183 + }, + { + "epoch": 0.3381078405624356, + "grad_norm": 2.971407651901245, + "learning_rate": 9.50151316093554e-06, + "loss": 1.1108, + "step": 4184 + }, + { + "epoch": 0.33818865027576317, + "grad_norm": 2.512982130050659, + "learning_rate": 9.501228301785748e-06, + "loss": 0.9974, + "step": 4185 + }, + { + "epoch": 0.3382694599890907, + "grad_norm": 2.587165117263794, + "learning_rate": 9.500943365540753e-06, + "loss": 1.0078, + "step": 4186 + }, + { + "epoch": 0.3383502697024182, + "grad_norm": 2.699758529663086, + "learning_rate": 9.500658352205433e-06, + "loss": 1.0141, + "step": 4187 + }, + { + "epoch": 0.3384310794157458, + "grad_norm": 2.9663798809051514, + "learning_rate": 9.50037326178467e-06, + "loss": 0.8985, + "step": 4188 + }, + { + "epoch": 0.3385118891290733, + "grad_norm": 2.920869827270508, + "learning_rate": 9.500088094283347e-06, + "loss": 0.9699, + "step": 4189 + }, + { + "epoch": 0.33859269884240084, + "grad_norm": 2.5547428131103516, + "learning_rate": 9.499802849706348e-06, + "loss": 0.8421, + "step": 4190 + }, + { + "epoch": 0.3386735085557284, + "grad_norm": 2.5543925762176514, + "learning_rate": 9.499517528058562e-06, + "loss": 0.966, + "step": 4191 + }, + { + "epoch": 0.33875431826905594, + "grad_norm": 2.707092046737671, + "learning_rate": 9.49923212934487e-06, + "loss": 0.9476, + "step": 4192 + }, + { + "epoch": 0.33883512798238347, + "grad_norm": 2.891502618789673, + "learning_rate": 9.498946653570164e-06, + "loss": 0.9965, + "step": 4193 + }, + { + "epoch": 0.33891593769571104, + "grad_norm": 2.6104674339294434, + "learning_rate": 9.498661100739332e-06, + "loss": 1.0823, + "step": 4194 + }, + { + "epoch": 0.33899674740903857, + "grad_norm": 2.9352173805236816, + "learning_rate": 9.498375470857266e-06, + "loss": 1.0509, + "step": 4195 + }, + { + "epoch": 0.3390775571223661, + "grad_norm": 2.6980602741241455, + "learning_rate": 9.498089763928857e-06, + "loss": 0.9992, + "step": 4196 + }, + { + "epoch": 0.33915836683569367, + "grad_norm": 2.7117533683776855, + "learning_rate": 9.497803979959e-06, + "loss": 1.0155, + "step": 4197 + }, + { + "epoch": 0.3392391765490212, + "grad_norm": 2.6956875324249268, + "learning_rate": 9.497518118952588e-06, + "loss": 0.9609, + "step": 4198 + }, + { + "epoch": 0.3393199862623487, + "grad_norm": 2.6220474243164062, + "learning_rate": 9.49723218091452e-06, + "loss": 0.9975, + "step": 4199 + }, + { + "epoch": 0.3394007959756763, + "grad_norm": 2.730402708053589, + "learning_rate": 9.49694616584969e-06, + "loss": 1.1123, + "step": 4200 + }, + { + "epoch": 0.3394816056890038, + "grad_norm": 2.820833206176758, + "learning_rate": 9.496660073762998e-06, + "loss": 1.023, + "step": 4201 + }, + { + "epoch": 0.33956241540233134, + "grad_norm": 2.9102706909179688, + "learning_rate": 9.496373904659344e-06, + "loss": 0.903, + "step": 4202 + }, + { + "epoch": 0.3396432251156589, + "grad_norm": 3.058046340942383, + "learning_rate": 9.496087658543629e-06, + "loss": 0.9479, + "step": 4203 + }, + { + "epoch": 0.33972403482898644, + "grad_norm": 2.2796804904937744, + "learning_rate": 9.495801335420757e-06, + "loss": 0.9136, + "step": 4204 + }, + { + "epoch": 0.33980484454231397, + "grad_norm": 2.5576324462890625, + "learning_rate": 9.495514935295631e-06, + "loss": 1.024, + "step": 4205 + }, + { + "epoch": 0.33988565425564154, + "grad_norm": 2.435377597808838, + "learning_rate": 9.495228458173159e-06, + "loss": 0.8844, + "step": 4206 + }, + { + "epoch": 0.33996646396896907, + "grad_norm": 2.8297946453094482, + "learning_rate": 9.494941904058241e-06, + "loss": 0.9269, + "step": 4207 + }, + { + "epoch": 0.3400472736822966, + "grad_norm": 2.8731467723846436, + "learning_rate": 9.494655272955792e-06, + "loss": 1.0457, + "step": 4208 + }, + { + "epoch": 0.34012808339562417, + "grad_norm": 2.9491989612579346, + "learning_rate": 9.49436856487072e-06, + "loss": 0.8883, + "step": 4209 + }, + { + "epoch": 0.3402088931089517, + "grad_norm": 2.664522647857666, + "learning_rate": 9.494081779807931e-06, + "loss": 0.9569, + "step": 4210 + }, + { + "epoch": 0.3402897028222792, + "grad_norm": 2.518174886703491, + "learning_rate": 9.493794917772342e-06, + "loss": 1.0473, + "step": 4211 + }, + { + "epoch": 0.3403705125356068, + "grad_norm": 2.750535249710083, + "learning_rate": 9.493507978768865e-06, + "loss": 0.9221, + "step": 4212 + }, + { + "epoch": 0.3404513222489343, + "grad_norm": 2.493407964706421, + "learning_rate": 9.493220962802413e-06, + "loss": 1.0743, + "step": 4213 + }, + { + "epoch": 0.34053213196226184, + "grad_norm": 2.778715133666992, + "learning_rate": 9.492933869877902e-06, + "loss": 0.9011, + "step": 4214 + }, + { + "epoch": 0.3406129416755894, + "grad_norm": 3.035158634185791, + "learning_rate": 9.492646700000252e-06, + "loss": 0.9334, + "step": 4215 + }, + { + "epoch": 0.34069375138891694, + "grad_norm": 2.4467737674713135, + "learning_rate": 9.492359453174377e-06, + "loss": 0.8573, + "step": 4216 + }, + { + "epoch": 0.34077456110224447, + "grad_norm": 2.7152509689331055, + "learning_rate": 9.492072129405203e-06, + "loss": 0.9816, + "step": 4217 + }, + { + "epoch": 0.34085537081557205, + "grad_norm": 3.1276209354400635, + "learning_rate": 9.491784728697646e-06, + "loss": 0.9982, + "step": 4218 + }, + { + "epoch": 0.34093618052889957, + "grad_norm": 2.524350166320801, + "learning_rate": 9.49149725105663e-06, + "loss": 0.8906, + "step": 4219 + }, + { + "epoch": 0.3410169902422271, + "grad_norm": 2.934344530105591, + "learning_rate": 9.491209696487077e-06, + "loss": 0.9201, + "step": 4220 + }, + { + "epoch": 0.34109779995555467, + "grad_norm": 2.8160815238952637, + "learning_rate": 9.490922064993917e-06, + "loss": 0.8818, + "step": 4221 + }, + { + "epoch": 0.3411786096688822, + "grad_norm": 2.7600460052490234, + "learning_rate": 9.490634356582072e-06, + "loss": 1.055, + "step": 4222 + }, + { + "epoch": 0.3412594193822097, + "grad_norm": 3.295536518096924, + "learning_rate": 9.490346571256472e-06, + "loss": 0.9552, + "step": 4223 + }, + { + "epoch": 0.3413402290955373, + "grad_norm": 2.6677610874176025, + "learning_rate": 9.490058709022045e-06, + "loss": 0.9235, + "step": 4224 + }, + { + "epoch": 0.3414210388088648, + "grad_norm": 3.1460516452789307, + "learning_rate": 9.48977076988372e-06, + "loss": 0.9745, + "step": 4225 + }, + { + "epoch": 0.34150184852219234, + "grad_norm": 2.595522165298462, + "learning_rate": 9.489482753846435e-06, + "loss": 1.0154, + "step": 4226 + }, + { + "epoch": 0.3415826582355199, + "grad_norm": 2.8672518730163574, + "learning_rate": 9.489194660915115e-06, + "loss": 0.9013, + "step": 4227 + }, + { + "epoch": 0.34166346794884744, + "grad_norm": 3.376835584640503, + "learning_rate": 9.488906491094698e-06, + "loss": 1.0286, + "step": 4228 + }, + { + "epoch": 0.34174427766217497, + "grad_norm": 2.7237389087677, + "learning_rate": 9.48861824439012e-06, + "loss": 1.034, + "step": 4229 + }, + { + "epoch": 0.34182508737550255, + "grad_norm": 3.1918201446533203, + "learning_rate": 9.488329920806316e-06, + "loss": 0.8988, + "step": 4230 + }, + { + "epoch": 0.34190589708883007, + "grad_norm": 3.385815382003784, + "learning_rate": 9.488041520348228e-06, + "loss": 0.924, + "step": 4231 + }, + { + "epoch": 0.3419867068021576, + "grad_norm": 2.591743230819702, + "learning_rate": 9.48775304302079e-06, + "loss": 0.9091, + "step": 4232 + }, + { + "epoch": 0.34206751651548517, + "grad_norm": 2.834713935852051, + "learning_rate": 9.48746448882895e-06, + "loss": 0.8816, + "step": 4233 + }, + { + "epoch": 0.3421483262288127, + "grad_norm": 2.700610637664795, + "learning_rate": 9.487175857777644e-06, + "loss": 1.0109, + "step": 4234 + }, + { + "epoch": 0.3422291359421403, + "grad_norm": 2.9804446697235107, + "learning_rate": 9.48688714987182e-06, + "loss": 0.971, + "step": 4235 + }, + { + "epoch": 0.3423099456554678, + "grad_norm": 2.7324395179748535, + "learning_rate": 9.486598365116418e-06, + "loss": 1.0875, + "step": 4236 + }, + { + "epoch": 0.3423907553687953, + "grad_norm": 2.8630380630493164, + "learning_rate": 9.486309503516388e-06, + "loss": 0.8134, + "step": 4237 + }, + { + "epoch": 0.3424715650821229, + "grad_norm": 2.923279047012329, + "learning_rate": 9.486020565076677e-06, + "loss": 0.8796, + "step": 4238 + }, + { + "epoch": 0.3425523747954504, + "grad_norm": 3.012637138366699, + "learning_rate": 9.485731549802235e-06, + "loss": 1.0446, + "step": 4239 + }, + { + "epoch": 0.34263318450877794, + "grad_norm": 2.2806928157806396, + "learning_rate": 9.48544245769801e-06, + "loss": 1.0663, + "step": 4240 + }, + { + "epoch": 0.3427139942221055, + "grad_norm": 2.2977986335754395, + "learning_rate": 9.485153288768951e-06, + "loss": 0.9665, + "step": 4241 + }, + { + "epoch": 0.34279480393543305, + "grad_norm": 3.0221078395843506, + "learning_rate": 9.484864043020017e-06, + "loss": 0.8723, + "step": 4242 + }, + { + "epoch": 0.34287561364876057, + "grad_norm": 2.7185659408569336, + "learning_rate": 9.484574720456156e-06, + "loss": 0.9093, + "step": 4243 + }, + { + "epoch": 0.34295642336208815, + "grad_norm": 2.2894680500030518, + "learning_rate": 9.484285321082329e-06, + "loss": 0.9966, + "step": 4244 + }, + { + "epoch": 0.34303723307541567, + "grad_norm": 2.9871702194213867, + "learning_rate": 9.483995844903488e-06, + "loss": 0.9714, + "step": 4245 + }, + { + "epoch": 0.3431180427887432, + "grad_norm": 2.6651320457458496, + "learning_rate": 9.483706291924593e-06, + "loss": 1.0343, + "step": 4246 + }, + { + "epoch": 0.3431988525020708, + "grad_norm": 2.907731294631958, + "learning_rate": 9.483416662150604e-06, + "loss": 0.9251, + "step": 4247 + }, + { + "epoch": 0.3432796622153983, + "grad_norm": 2.44343638420105, + "learning_rate": 9.483126955586481e-06, + "loss": 1.0846, + "step": 4248 + }, + { + "epoch": 0.3433604719287258, + "grad_norm": 2.701967477798462, + "learning_rate": 9.482837172237185e-06, + "loss": 1.0523, + "step": 4249 + }, + { + "epoch": 0.3434412816420534, + "grad_norm": 3.1003775596618652, + "learning_rate": 9.482547312107682e-06, + "loss": 0.932, + "step": 4250 + }, + { + "epoch": 0.3435220913553809, + "grad_norm": 2.4994635581970215, + "learning_rate": 9.482257375202934e-06, + "loss": 0.944, + "step": 4251 + }, + { + "epoch": 0.34360290106870844, + "grad_norm": 2.710099697113037, + "learning_rate": 9.481967361527907e-06, + "loss": 1.012, + "step": 4252 + }, + { + "epoch": 0.343683710782036, + "grad_norm": 3.0543906688690186, + "learning_rate": 9.48167727108757e-06, + "loss": 0.9846, + "step": 4253 + }, + { + "epoch": 0.34376452049536355, + "grad_norm": 2.9311554431915283, + "learning_rate": 9.48138710388689e-06, + "loss": 0.8771, + "step": 4254 + }, + { + "epoch": 0.34384533020869107, + "grad_norm": 2.8884940147399902, + "learning_rate": 9.481096859930839e-06, + "loss": 0.8946, + "step": 4255 + }, + { + "epoch": 0.34392613992201865, + "grad_norm": 2.545696258544922, + "learning_rate": 9.480806539224383e-06, + "loss": 1.0564, + "step": 4256 + }, + { + "epoch": 0.34400694963534617, + "grad_norm": 3.0662972927093506, + "learning_rate": 9.480516141772501e-06, + "loss": 1.0557, + "step": 4257 + }, + { + "epoch": 0.3440877593486737, + "grad_norm": 2.9301185607910156, + "learning_rate": 9.480225667580164e-06, + "loss": 0.9476, + "step": 4258 + }, + { + "epoch": 0.3441685690620013, + "grad_norm": 2.5623271465301514, + "learning_rate": 9.479935116652345e-06, + "loss": 1.0103, + "step": 4259 + }, + { + "epoch": 0.3442493787753288, + "grad_norm": 2.6191351413726807, + "learning_rate": 9.479644488994025e-06, + "loss": 0.8592, + "step": 4260 + }, + { + "epoch": 0.3443301884886563, + "grad_norm": 3.001798629760742, + "learning_rate": 9.479353784610177e-06, + "loss": 1.0849, + "step": 4261 + }, + { + "epoch": 0.3444109982019839, + "grad_norm": 2.927638530731201, + "learning_rate": 9.479063003505782e-06, + "loss": 1.06, + "step": 4262 + }, + { + "epoch": 0.3444918079153114, + "grad_norm": 2.7032415866851807, + "learning_rate": 9.478772145685821e-06, + "loss": 1.0905, + "step": 4263 + }, + { + "epoch": 0.34457261762863894, + "grad_norm": 3.0599684715270996, + "learning_rate": 9.478481211155277e-06, + "loss": 0.9663, + "step": 4264 + }, + { + "epoch": 0.3446534273419665, + "grad_norm": 2.296513319015503, + "learning_rate": 9.478190199919131e-06, + "loss": 0.9691, + "step": 4265 + }, + { + "epoch": 0.34473423705529405, + "grad_norm": 2.9048807621002197, + "learning_rate": 9.477899111982367e-06, + "loss": 1.1204, + "step": 4266 + }, + { + "epoch": 0.34481504676862157, + "grad_norm": 2.9777579307556152, + "learning_rate": 9.477607947349971e-06, + "loss": 0.9684, + "step": 4267 + }, + { + "epoch": 0.34489585648194915, + "grad_norm": 3.0115413665771484, + "learning_rate": 9.47731670602693e-06, + "loss": 0.8818, + "step": 4268 + }, + { + "epoch": 0.34497666619527667, + "grad_norm": 2.541673183441162, + "learning_rate": 9.477025388018235e-06, + "loss": 0.987, + "step": 4269 + }, + { + "epoch": 0.3450574759086042, + "grad_norm": 2.505735397338867, + "learning_rate": 9.47673399332887e-06, + "loss": 0.9997, + "step": 4270 + }, + { + "epoch": 0.3451382856219318, + "grad_norm": 2.5219671726226807, + "learning_rate": 9.476442521963831e-06, + "loss": 1.0012, + "step": 4271 + }, + { + "epoch": 0.3452190953352593, + "grad_norm": 2.983621597290039, + "learning_rate": 9.476150973928107e-06, + "loss": 0.8977, + "step": 4272 + }, + { + "epoch": 0.3452999050485868, + "grad_norm": 3.414088726043701, + "learning_rate": 9.475859349226693e-06, + "loss": 0.8983, + "step": 4273 + }, + { + "epoch": 0.3453807147619144, + "grad_norm": 2.7984800338745117, + "learning_rate": 9.475567647864584e-06, + "loss": 0.9156, + "step": 4274 + }, + { + "epoch": 0.3454615244752419, + "grad_norm": 2.5534207820892334, + "learning_rate": 9.475275869846776e-06, + "loss": 1.0151, + "step": 4275 + }, + { + "epoch": 0.34554233418856944, + "grad_norm": 2.717841625213623, + "learning_rate": 9.474984015178266e-06, + "loss": 0.935, + "step": 4276 + }, + { + "epoch": 0.345623143901897, + "grad_norm": 2.970533847808838, + "learning_rate": 9.474692083864052e-06, + "loss": 0.9799, + "step": 4277 + }, + { + "epoch": 0.34570395361522455, + "grad_norm": 2.94915771484375, + "learning_rate": 9.474400075909136e-06, + "loss": 0.8858, + "step": 4278 + }, + { + "epoch": 0.34578476332855207, + "grad_norm": 2.6709272861480713, + "learning_rate": 9.474107991318517e-06, + "loss": 1.0326, + "step": 4279 + }, + { + "epoch": 0.34586557304187965, + "grad_norm": 2.8930013179779053, + "learning_rate": 9.4738158300972e-06, + "loss": 0.9573, + "step": 4280 + }, + { + "epoch": 0.34594638275520717, + "grad_norm": 2.6665401458740234, + "learning_rate": 9.473523592250188e-06, + "loss": 1.0777, + "step": 4281 + }, + { + "epoch": 0.3460271924685347, + "grad_norm": 2.5457189083099365, + "learning_rate": 9.473231277782486e-06, + "loss": 1.1171, + "step": 4282 + }, + { + "epoch": 0.3461080021818623, + "grad_norm": 3.2128570079803467, + "learning_rate": 9.472938886699103e-06, + "loss": 1.0744, + "step": 4283 + }, + { + "epoch": 0.3461888118951898, + "grad_norm": 2.9156112670898438, + "learning_rate": 9.472646419005043e-06, + "loss": 0.9827, + "step": 4284 + }, + { + "epoch": 0.3462696216085173, + "grad_norm": 2.5341570377349854, + "learning_rate": 9.472353874705318e-06, + "loss": 1.0247, + "step": 4285 + }, + { + "epoch": 0.3463504313218449, + "grad_norm": 2.9804251194000244, + "learning_rate": 9.472061253804937e-06, + "loss": 0.9844, + "step": 4286 + }, + { + "epoch": 0.3464312410351724, + "grad_norm": 2.4154458045959473, + "learning_rate": 9.471768556308914e-06, + "loss": 1.0119, + "step": 4287 + }, + { + "epoch": 0.34651205074849994, + "grad_norm": 2.570214033126831, + "learning_rate": 9.471475782222261e-06, + "loss": 0.9532, + "step": 4288 + }, + { + "epoch": 0.3465928604618275, + "grad_norm": 2.7691421508789062, + "learning_rate": 9.471182931549992e-06, + "loss": 1.0611, + "step": 4289 + }, + { + "epoch": 0.34667367017515505, + "grad_norm": 2.6109275817871094, + "learning_rate": 9.470890004297122e-06, + "loss": 1.0118, + "step": 4290 + }, + { + "epoch": 0.34675447988848257, + "grad_norm": 2.7187349796295166, + "learning_rate": 9.47059700046867e-06, + "loss": 0.9936, + "step": 4291 + }, + { + "epoch": 0.34683528960181015, + "grad_norm": 3.0581231117248535, + "learning_rate": 9.470303920069655e-06, + "loss": 1.0772, + "step": 4292 + }, + { + "epoch": 0.34691609931513767, + "grad_norm": 2.838987350463867, + "learning_rate": 9.470010763105096e-06, + "loss": 1.0029, + "step": 4293 + }, + { + "epoch": 0.3469969090284652, + "grad_norm": 2.7598118782043457, + "learning_rate": 9.469717529580013e-06, + "loss": 0.8415, + "step": 4294 + }, + { + "epoch": 0.3470777187417928, + "grad_norm": 2.6195731163024902, + "learning_rate": 9.469424219499429e-06, + "loss": 0.9175, + "step": 4295 + }, + { + "epoch": 0.3471585284551203, + "grad_norm": 3.4818670749664307, + "learning_rate": 9.469130832868369e-06, + "loss": 0.9711, + "step": 4296 + }, + { + "epoch": 0.3472393381684478, + "grad_norm": 2.504044771194458, + "learning_rate": 9.468837369691854e-06, + "loss": 1.0015, + "step": 4297 + }, + { + "epoch": 0.3473201478817754, + "grad_norm": 3.460052251815796, + "learning_rate": 9.468543829974917e-06, + "loss": 0.9326, + "step": 4298 + }, + { + "epoch": 0.3474009575951029, + "grad_norm": 2.8199450969696045, + "learning_rate": 9.468250213722578e-06, + "loss": 0.9231, + "step": 4299 + }, + { + "epoch": 0.3474817673084305, + "grad_norm": 2.3318564891815186, + "learning_rate": 9.467956520939872e-06, + "loss": 0.9933, + "step": 4300 + }, + { + "epoch": 0.347562577021758, + "grad_norm": 2.505892276763916, + "learning_rate": 9.467662751631827e-06, + "loss": 0.9749, + "step": 4301 + }, + { + "epoch": 0.34764338673508555, + "grad_norm": 2.5387418270111084, + "learning_rate": 9.467368905803474e-06, + "loss": 1.0967, + "step": 4302 + }, + { + "epoch": 0.3477241964484131, + "grad_norm": 3.123910665512085, + "learning_rate": 9.467074983459845e-06, + "loss": 0.9769, + "step": 4303 + }, + { + "epoch": 0.34780500616174065, + "grad_norm": 2.621479034423828, + "learning_rate": 9.466780984605978e-06, + "loss": 0.9603, + "step": 4304 + }, + { + "epoch": 0.34788581587506817, + "grad_norm": 2.7805023193359375, + "learning_rate": 9.466486909246904e-06, + "loss": 0.9352, + "step": 4305 + }, + { + "epoch": 0.34796662558839575, + "grad_norm": 2.935696601867676, + "learning_rate": 9.466192757387665e-06, + "loss": 0.9463, + "step": 4306 + }, + { + "epoch": 0.3480474353017233, + "grad_norm": 3.6778948307037354, + "learning_rate": 9.465898529033292e-06, + "loss": 0.9605, + "step": 4307 + }, + { + "epoch": 0.3481282450150508, + "grad_norm": 2.4616496562957764, + "learning_rate": 9.46560422418883e-06, + "loss": 1.0901, + "step": 4308 + }, + { + "epoch": 0.3482090547283784, + "grad_norm": 3.0213122367858887, + "learning_rate": 9.46530984285932e-06, + "loss": 1.0312, + "step": 4309 + }, + { + "epoch": 0.3482898644417059, + "grad_norm": 2.616098165512085, + "learning_rate": 9.4650153850498e-06, + "loss": 1.048, + "step": 4310 + }, + { + "epoch": 0.3483706741550334, + "grad_norm": 2.6346166133880615, + "learning_rate": 9.464720850765317e-06, + "loss": 1.0417, + "step": 4311 + }, + { + "epoch": 0.348451483868361, + "grad_norm": 3.287877321243286, + "learning_rate": 9.464426240010912e-06, + "loss": 0.9386, + "step": 4312 + }, + { + "epoch": 0.3485322935816885, + "grad_norm": 2.364901304244995, + "learning_rate": 9.464131552791634e-06, + "loss": 0.9209, + "step": 4313 + }, + { + "epoch": 0.34861310329501605, + "grad_norm": 2.60255765914917, + "learning_rate": 9.46383678911253e-06, + "loss": 0.9576, + "step": 4314 + }, + { + "epoch": 0.3486939130083436, + "grad_norm": 2.7048897743225098, + "learning_rate": 9.46354194897865e-06, + "loss": 0.9808, + "step": 4315 + }, + { + "epoch": 0.34877472272167115, + "grad_norm": 2.939545154571533, + "learning_rate": 9.463247032395039e-06, + "loss": 0.9164, + "step": 4316 + }, + { + "epoch": 0.34885553243499867, + "grad_norm": 2.8341784477233887, + "learning_rate": 9.462952039366752e-06, + "loss": 0.9076, + "step": 4317 + }, + { + "epoch": 0.34893634214832625, + "grad_norm": 2.6567914485931396, + "learning_rate": 9.46265696989884e-06, + "loss": 0.9411, + "step": 4318 + }, + { + "epoch": 0.3490171518616538, + "grad_norm": 2.7168097496032715, + "learning_rate": 9.46236182399636e-06, + "loss": 0.9037, + "step": 4319 + }, + { + "epoch": 0.3490979615749813, + "grad_norm": 3.1529085636138916, + "learning_rate": 9.46206660166436e-06, + "loss": 0.9303, + "step": 4320 + }, + { + "epoch": 0.3491787712883089, + "grad_norm": 3.0032570362091064, + "learning_rate": 9.461771302907907e-06, + "loss": 1.1699, + "step": 4321 + }, + { + "epoch": 0.3492595810016364, + "grad_norm": 3.1368443965911865, + "learning_rate": 9.461475927732049e-06, + "loss": 0.9446, + "step": 4322 + }, + { + "epoch": 0.3493403907149639, + "grad_norm": 2.594129800796509, + "learning_rate": 9.461180476141848e-06, + "loss": 1.0802, + "step": 4323 + }, + { + "epoch": 0.3494212004282915, + "grad_norm": 2.818218946456909, + "learning_rate": 9.460884948142368e-06, + "loss": 1.0315, + "step": 4324 + }, + { + "epoch": 0.349502010141619, + "grad_norm": 2.2989888191223145, + "learning_rate": 9.460589343738669e-06, + "loss": 1.0713, + "step": 4325 + }, + { + "epoch": 0.34958281985494655, + "grad_norm": 2.951605796813965, + "learning_rate": 9.46029366293581e-06, + "loss": 1.0521, + "step": 4326 + }, + { + "epoch": 0.3496636295682741, + "grad_norm": 2.722334146499634, + "learning_rate": 9.45999790573886e-06, + "loss": 1.039, + "step": 4327 + }, + { + "epoch": 0.34974443928160165, + "grad_norm": 2.9131970405578613, + "learning_rate": 9.459702072152883e-06, + "loss": 1.0311, + "step": 4328 + }, + { + "epoch": 0.34982524899492917, + "grad_norm": 2.7194559574127197, + "learning_rate": 9.459406162182944e-06, + "loss": 0.9912, + "step": 4329 + }, + { + "epoch": 0.34990605870825675, + "grad_norm": 2.382632255554199, + "learning_rate": 9.459110175834114e-06, + "loss": 0.8718, + "step": 4330 + }, + { + "epoch": 0.3499868684215843, + "grad_norm": 2.467538356781006, + "learning_rate": 9.45881411311146e-06, + "loss": 1.0031, + "step": 4331 + }, + { + "epoch": 0.3500676781349118, + "grad_norm": 2.790538787841797, + "learning_rate": 9.458517974020058e-06, + "loss": 0.9404, + "step": 4332 + }, + { + "epoch": 0.3501484878482394, + "grad_norm": 3.0771515369415283, + "learning_rate": 9.458221758564973e-06, + "loss": 1.0456, + "step": 4333 + }, + { + "epoch": 0.3502292975615669, + "grad_norm": 2.63175630569458, + "learning_rate": 9.457925466751285e-06, + "loss": 0.9279, + "step": 4334 + }, + { + "epoch": 0.3503101072748944, + "grad_norm": 2.870114803314209, + "learning_rate": 9.457629098584064e-06, + "loss": 0.9564, + "step": 4335 + }, + { + "epoch": 0.350390916988222, + "grad_norm": 2.5363028049468994, + "learning_rate": 9.457332654068389e-06, + "loss": 1.0778, + "step": 4336 + }, + { + "epoch": 0.3504717267015495, + "grad_norm": 2.3083391189575195, + "learning_rate": 9.457036133209334e-06, + "loss": 0.9804, + "step": 4337 + }, + { + "epoch": 0.35055253641487705, + "grad_norm": 2.6185781955718994, + "learning_rate": 9.456739536011982e-06, + "loss": 0.9889, + "step": 4338 + }, + { + "epoch": 0.3506333461282046, + "grad_norm": 3.330862045288086, + "learning_rate": 9.45644286248141e-06, + "loss": 0.9267, + "step": 4339 + }, + { + "epoch": 0.35071415584153215, + "grad_norm": 2.8113012313842773, + "learning_rate": 9.456146112622702e-06, + "loss": 1.0139, + "step": 4340 + }, + { + "epoch": 0.3507949655548597, + "grad_norm": 2.7219343185424805, + "learning_rate": 9.455849286440936e-06, + "loss": 0.8668, + "step": 4341 + }, + { + "epoch": 0.35087577526818725, + "grad_norm": 3.1652705669403076, + "learning_rate": 9.4555523839412e-06, + "loss": 0.9704, + "step": 4342 + }, + { + "epoch": 0.3509565849815148, + "grad_norm": 2.5455594062805176, + "learning_rate": 9.455255405128579e-06, + "loss": 0.9233, + "step": 4343 + }, + { + "epoch": 0.3510373946948423, + "grad_norm": 2.779808282852173, + "learning_rate": 9.454958350008156e-06, + "loss": 1.086, + "step": 4344 + }, + { + "epoch": 0.3511182044081699, + "grad_norm": 2.620594024658203, + "learning_rate": 9.454661218585024e-06, + "loss": 0.9271, + "step": 4345 + }, + { + "epoch": 0.3511990141214974, + "grad_norm": 2.7019081115722656, + "learning_rate": 9.454364010864267e-06, + "loss": 0.9576, + "step": 4346 + }, + { + "epoch": 0.3512798238348249, + "grad_norm": 3.306880474090576, + "learning_rate": 9.45406672685098e-06, + "loss": 0.9455, + "step": 4347 + }, + { + "epoch": 0.3513606335481525, + "grad_norm": 2.6243791580200195, + "learning_rate": 9.453769366550251e-06, + "loss": 0.9925, + "step": 4348 + }, + { + "epoch": 0.35144144326148, + "grad_norm": 3.0641255378723145, + "learning_rate": 9.453471929967177e-06, + "loss": 1.0466, + "step": 4349 + }, + { + "epoch": 0.35152225297480755, + "grad_norm": 3.116964817047119, + "learning_rate": 9.45317441710685e-06, + "loss": 0.9735, + "step": 4350 + }, + { + "epoch": 0.3516030626881351, + "grad_norm": 2.6487467288970947, + "learning_rate": 9.452876827974364e-06, + "loss": 0.9188, + "step": 4351 + }, + { + "epoch": 0.35168387240146265, + "grad_norm": 3.2928214073181152, + "learning_rate": 9.452579162574817e-06, + "loss": 0.9692, + "step": 4352 + }, + { + "epoch": 0.3517646821147902, + "grad_norm": 2.889279842376709, + "learning_rate": 9.45228142091331e-06, + "loss": 1.0466, + "step": 4353 + }, + { + "epoch": 0.35184549182811775, + "grad_norm": 2.446531057357788, + "learning_rate": 9.451983602994941e-06, + "loss": 1.088, + "step": 4354 + }, + { + "epoch": 0.3519263015414453, + "grad_norm": 2.785648822784424, + "learning_rate": 9.45168570882481e-06, + "loss": 0.8018, + "step": 4355 + }, + { + "epoch": 0.3520071112547728, + "grad_norm": 2.1402995586395264, + "learning_rate": 9.45138773840802e-06, + "loss": 1.0743, + "step": 4356 + }, + { + "epoch": 0.3520879209681004, + "grad_norm": 2.4489831924438477, + "learning_rate": 9.451089691749673e-06, + "loss": 0.9196, + "step": 4357 + }, + { + "epoch": 0.3521687306814279, + "grad_norm": 3.200695514678955, + "learning_rate": 9.450791568854876e-06, + "loss": 1.0073, + "step": 4358 + }, + { + "epoch": 0.3522495403947554, + "grad_norm": 3.0508291721343994, + "learning_rate": 9.450493369728734e-06, + "loss": 0.9789, + "step": 4359 + }, + { + "epoch": 0.352330350108083, + "grad_norm": 2.3306634426116943, + "learning_rate": 9.450195094376356e-06, + "loss": 0.9437, + "step": 4360 + }, + { + "epoch": 0.3524111598214105, + "grad_norm": 2.770146369934082, + "learning_rate": 9.44989674280285e-06, + "loss": 0.9436, + "step": 4361 + }, + { + "epoch": 0.3524919695347381, + "grad_norm": 2.736323595046997, + "learning_rate": 9.449598315013321e-06, + "loss": 1.0182, + "step": 4362 + }, + { + "epoch": 0.3525727792480656, + "grad_norm": 2.631716012954712, + "learning_rate": 9.44929981101289e-06, + "loss": 0.9786, + "step": 4363 + }, + { + "epoch": 0.35265358896139315, + "grad_norm": 3.045142412185669, + "learning_rate": 9.449001230806663e-06, + "loss": 1.0003, + "step": 4364 + }, + { + "epoch": 0.35273439867472073, + "grad_norm": 2.553821325302124, + "learning_rate": 9.448702574399752e-06, + "loss": 0.9904, + "step": 4365 + }, + { + "epoch": 0.35281520838804825, + "grad_norm": 2.679896831512451, + "learning_rate": 9.44840384179728e-06, + "loss": 1.0476, + "step": 4366 + }, + { + "epoch": 0.3528960181013758, + "grad_norm": 3.0261433124542236, + "learning_rate": 9.448105033004358e-06, + "loss": 0.9751, + "step": 4367 + }, + { + "epoch": 0.35297682781470335, + "grad_norm": 2.6874983310699463, + "learning_rate": 9.447806148026103e-06, + "loss": 0.9185, + "step": 4368 + }, + { + "epoch": 0.3530576375280309, + "grad_norm": 2.4449284076690674, + "learning_rate": 9.44750718686764e-06, + "loss": 1.0648, + "step": 4369 + }, + { + "epoch": 0.3531384472413584, + "grad_norm": 2.977109670639038, + "learning_rate": 9.447208149534084e-06, + "loss": 1.0043, + "step": 4370 + }, + { + "epoch": 0.353219256954686, + "grad_norm": 2.5412518978118896, + "learning_rate": 9.446909036030558e-06, + "loss": 0.9824, + "step": 4371 + }, + { + "epoch": 0.3533000666680135, + "grad_norm": 2.513836145401001, + "learning_rate": 9.446609846362187e-06, + "loss": 1.1681, + "step": 4372 + }, + { + "epoch": 0.353380876381341, + "grad_norm": 3.205918550491333, + "learning_rate": 9.446310580534094e-06, + "loss": 0.935, + "step": 4373 + }, + { + "epoch": 0.3534616860946686, + "grad_norm": 2.459001302719116, + "learning_rate": 9.446011238551404e-06, + "loss": 1.0462, + "step": 4374 + }, + { + "epoch": 0.3535424958079961, + "grad_norm": 2.2630016803741455, + "learning_rate": 9.445711820419245e-06, + "loss": 0.9954, + "step": 4375 + }, + { + "epoch": 0.35362330552132365, + "grad_norm": 2.9816484451293945, + "learning_rate": 9.445412326142747e-06, + "loss": 1.0124, + "step": 4376 + }, + { + "epoch": 0.35370411523465123, + "grad_norm": 2.439490556716919, + "learning_rate": 9.445112755727036e-06, + "loss": 1.0331, + "step": 4377 + }, + { + "epoch": 0.35378492494797875, + "grad_norm": 2.5915751457214355, + "learning_rate": 9.444813109177245e-06, + "loss": 0.973, + "step": 4378 + }, + { + "epoch": 0.3538657346613063, + "grad_norm": 2.94569730758667, + "learning_rate": 9.444513386498504e-06, + "loss": 0.9762, + "step": 4379 + }, + { + "epoch": 0.35394654437463385, + "grad_norm": 2.8117334842681885, + "learning_rate": 9.444213587695949e-06, + "loss": 0.9397, + "step": 4380 + }, + { + "epoch": 0.3540273540879614, + "grad_norm": 2.5008766651153564, + "learning_rate": 9.443913712774717e-06, + "loss": 1.052, + "step": 4381 + }, + { + "epoch": 0.3541081638012889, + "grad_norm": 2.7154204845428467, + "learning_rate": 9.443613761739939e-06, + "loss": 0.9143, + "step": 4382 + }, + { + "epoch": 0.3541889735146165, + "grad_norm": 3.495941162109375, + "learning_rate": 9.443313734596756e-06, + "loss": 1.0291, + "step": 4383 + }, + { + "epoch": 0.354269783227944, + "grad_norm": 2.525376796722412, + "learning_rate": 9.443013631350303e-06, + "loss": 1.0136, + "step": 4384 + }, + { + "epoch": 0.3543505929412715, + "grad_norm": 3.2318382263183594, + "learning_rate": 9.442713452005728e-06, + "loss": 0.9272, + "step": 4385 + }, + { + "epoch": 0.3544314026545991, + "grad_norm": 2.618738889694214, + "learning_rate": 9.442413196568161e-06, + "loss": 0.9367, + "step": 4386 + }, + { + "epoch": 0.3545122123679266, + "grad_norm": 3.1385159492492676, + "learning_rate": 9.442112865042753e-06, + "loss": 1.0642, + "step": 4387 + }, + { + "epoch": 0.35459302208125415, + "grad_norm": 2.933108329772949, + "learning_rate": 9.441812457434647e-06, + "loss": 0.9823, + "step": 4388 + }, + { + "epoch": 0.35467383179458173, + "grad_norm": 2.355842351913452, + "learning_rate": 9.441511973748987e-06, + "loss": 0.9344, + "step": 4389 + }, + { + "epoch": 0.35475464150790925, + "grad_norm": 2.866903305053711, + "learning_rate": 9.441211413990918e-06, + "loss": 0.9452, + "step": 4390 + }, + { + "epoch": 0.3548354512212368, + "grad_norm": 2.942039728164673, + "learning_rate": 9.440910778165588e-06, + "loss": 1.0805, + "step": 4391 + }, + { + "epoch": 0.35491626093456435, + "grad_norm": 3.402123212814331, + "learning_rate": 9.44061006627815e-06, + "loss": 1.1258, + "step": 4392 + }, + { + "epoch": 0.3549970706478919, + "grad_norm": 2.882495641708374, + "learning_rate": 9.44030927833375e-06, + "loss": 0.9454, + "step": 4393 + }, + { + "epoch": 0.3550778803612194, + "grad_norm": 2.3867785930633545, + "learning_rate": 9.440008414337543e-06, + "loss": 0.9132, + "step": 4394 + }, + { + "epoch": 0.355158690074547, + "grad_norm": 2.6597611904144287, + "learning_rate": 9.439707474294679e-06, + "loss": 0.8589, + "step": 4395 + }, + { + "epoch": 0.3552394997878745, + "grad_norm": 3.0939173698425293, + "learning_rate": 9.439406458210316e-06, + "loss": 0.9932, + "step": 4396 + }, + { + "epoch": 0.355320309501202, + "grad_norm": 2.8719091415405273, + "learning_rate": 9.439105366089606e-06, + "loss": 0.9933, + "step": 4397 + }, + { + "epoch": 0.3554011192145296, + "grad_norm": 2.700834035873413, + "learning_rate": 9.43880419793771e-06, + "loss": 1.1072, + "step": 4398 + }, + { + "epoch": 0.3554819289278571, + "grad_norm": 3.0498509407043457, + "learning_rate": 9.438502953759783e-06, + "loss": 0.9062, + "step": 4399 + }, + { + "epoch": 0.35556273864118465, + "grad_norm": 2.5735740661621094, + "learning_rate": 9.438201633560983e-06, + "loss": 0.999, + "step": 4400 + }, + { + "epoch": 0.35564354835451223, + "grad_norm": 2.5071537494659424, + "learning_rate": 9.437900237346479e-06, + "loss": 0.9051, + "step": 4401 + }, + { + "epoch": 0.35572435806783975, + "grad_norm": 2.5384161472320557, + "learning_rate": 9.437598765121423e-06, + "loss": 1.1434, + "step": 4402 + }, + { + "epoch": 0.3558051677811673, + "grad_norm": 2.3582170009613037, + "learning_rate": 9.437297216890986e-06, + "loss": 0.9889, + "step": 4403 + }, + { + "epoch": 0.35588597749449485, + "grad_norm": 2.8007912635803223, + "learning_rate": 9.436995592660328e-06, + "loss": 0.8858, + "step": 4404 + }, + { + "epoch": 0.3559667872078224, + "grad_norm": 2.6563291549682617, + "learning_rate": 9.436693892434618e-06, + "loss": 0.9193, + "step": 4405 + }, + { + "epoch": 0.3560475969211499, + "grad_norm": 2.6463418006896973, + "learning_rate": 9.436392116219024e-06, + "loss": 1.0564, + "step": 4406 + }, + { + "epoch": 0.3561284066344775, + "grad_norm": 3.1819608211517334, + "learning_rate": 9.436090264018711e-06, + "loss": 0.9396, + "step": 4407 + }, + { + "epoch": 0.356209216347805, + "grad_norm": 2.84258770942688, + "learning_rate": 9.435788335838852e-06, + "loss": 1.0478, + "step": 4408 + }, + { + "epoch": 0.3562900260611325, + "grad_norm": 3.5766983032226562, + "learning_rate": 9.435486331684617e-06, + "loss": 1.224, + "step": 4409 + }, + { + "epoch": 0.3563708357744601, + "grad_norm": 2.5731217861175537, + "learning_rate": 9.435184251561181e-06, + "loss": 0.9477, + "step": 4410 + }, + { + "epoch": 0.3564516454877876, + "grad_norm": 2.6633787155151367, + "learning_rate": 9.434882095473714e-06, + "loss": 0.9986, + "step": 4411 + }, + { + "epoch": 0.35653245520111515, + "grad_norm": 2.7791190147399902, + "learning_rate": 9.434579863427396e-06, + "loss": 0.9405, + "step": 4412 + }, + { + "epoch": 0.35661326491444273, + "grad_norm": 2.5482404232025146, + "learning_rate": 9.434277555427397e-06, + "loss": 0.896, + "step": 4413 + }, + { + "epoch": 0.35669407462777025, + "grad_norm": 2.590073585510254, + "learning_rate": 9.433975171478901e-06, + "loss": 0.7863, + "step": 4414 + }, + { + "epoch": 0.3567748843410978, + "grad_norm": 2.9536354541778564, + "learning_rate": 9.433672711587086e-06, + "loss": 1.1034, + "step": 4415 + }, + { + "epoch": 0.35685569405442535, + "grad_norm": 2.2706964015960693, + "learning_rate": 9.433370175757129e-06, + "loss": 1.0198, + "step": 4416 + }, + { + "epoch": 0.3569365037677529, + "grad_norm": 2.293123245239258, + "learning_rate": 9.433067563994214e-06, + "loss": 0.9699, + "step": 4417 + }, + { + "epoch": 0.3570173134810804, + "grad_norm": 2.8644261360168457, + "learning_rate": 9.432764876303523e-06, + "loss": 0.9393, + "step": 4418 + }, + { + "epoch": 0.357098123194408, + "grad_norm": 2.9875433444976807, + "learning_rate": 9.432462112690242e-06, + "loss": 0.9763, + "step": 4419 + }, + { + "epoch": 0.3571789329077355, + "grad_norm": 2.342803478240967, + "learning_rate": 9.432159273159556e-06, + "loss": 0.9794, + "step": 4420 + }, + { + "epoch": 0.357259742621063, + "grad_norm": 2.90524959564209, + "learning_rate": 9.43185635771665e-06, + "loss": 0.8631, + "step": 4421 + }, + { + "epoch": 0.3573405523343906, + "grad_norm": 2.5036685466766357, + "learning_rate": 9.431553366366716e-06, + "loss": 0.9362, + "step": 4422 + }, + { + "epoch": 0.3574213620477181, + "grad_norm": 3.7688751220703125, + "learning_rate": 9.43125029911494e-06, + "loss": 1.0821, + "step": 4423 + }, + { + "epoch": 0.35750217176104565, + "grad_norm": 2.795793056488037, + "learning_rate": 9.430947155966514e-06, + "loss": 1.0631, + "step": 4424 + }, + { + "epoch": 0.35758298147437323, + "grad_norm": 2.783707857131958, + "learning_rate": 9.430643936926631e-06, + "loss": 0.9612, + "step": 4425 + }, + { + "epoch": 0.35766379118770075, + "grad_norm": 3.1343765258789062, + "learning_rate": 9.430340642000484e-06, + "loss": 0.8432, + "step": 4426 + }, + { + "epoch": 0.35774460090102833, + "grad_norm": 2.3770313262939453, + "learning_rate": 9.430037271193267e-06, + "loss": 1.0099, + "step": 4427 + }, + { + "epoch": 0.35782541061435585, + "grad_norm": 2.72056245803833, + "learning_rate": 9.429733824510176e-06, + "loss": 0.9592, + "step": 4428 + }, + { + "epoch": 0.3579062203276834, + "grad_norm": 2.875227451324463, + "learning_rate": 9.42943030195641e-06, + "loss": 0.969, + "step": 4429 + }, + { + "epoch": 0.35798703004101096, + "grad_norm": 2.701589584350586, + "learning_rate": 9.429126703537165e-06, + "loss": 0.9963, + "step": 4430 + }, + { + "epoch": 0.3580678397543385, + "grad_norm": 2.879420518875122, + "learning_rate": 9.428823029257643e-06, + "loss": 1.0575, + "step": 4431 + }, + { + "epoch": 0.358148649467666, + "grad_norm": 2.6203296184539795, + "learning_rate": 9.428519279123045e-06, + "loss": 1.008, + "step": 4432 + }, + { + "epoch": 0.3582294591809936, + "grad_norm": 2.648378849029541, + "learning_rate": 9.428215453138571e-06, + "loss": 1.0841, + "step": 4433 + }, + { + "epoch": 0.3583102688943211, + "grad_norm": 2.7141456604003906, + "learning_rate": 9.42791155130943e-06, + "loss": 1.0489, + "step": 4434 + }, + { + "epoch": 0.3583910786076486, + "grad_norm": 2.451988935470581, + "learning_rate": 9.42760757364082e-06, + "loss": 0.978, + "step": 4435 + }, + { + "epoch": 0.3584718883209762, + "grad_norm": 3.4383907318115234, + "learning_rate": 9.427303520137954e-06, + "loss": 0.9989, + "step": 4436 + }, + { + "epoch": 0.35855269803430373, + "grad_norm": 3.1784305572509766, + "learning_rate": 9.426999390806037e-06, + "loss": 1.0339, + "step": 4437 + }, + { + "epoch": 0.35863350774763125, + "grad_norm": 2.6991403102874756, + "learning_rate": 9.426695185650276e-06, + "loss": 0.9995, + "step": 4438 + }, + { + "epoch": 0.35871431746095883, + "grad_norm": 2.82651424407959, + "learning_rate": 9.426390904675887e-06, + "loss": 0.9791, + "step": 4439 + }, + { + "epoch": 0.35879512717428635, + "grad_norm": 2.4426968097686768, + "learning_rate": 9.426086547888074e-06, + "loss": 0.9886, + "step": 4440 + }, + { + "epoch": 0.3588759368876139, + "grad_norm": 2.8267159461975098, + "learning_rate": 9.425782115292054e-06, + "loss": 1.0125, + "step": 4441 + }, + { + "epoch": 0.35895674660094146, + "grad_norm": 2.4940249919891357, + "learning_rate": 9.425477606893044e-06, + "loss": 1.0049, + "step": 4442 + }, + { + "epoch": 0.359037556314269, + "grad_norm": 3.048516035079956, + "learning_rate": 9.425173022696255e-06, + "loss": 0.8767, + "step": 4443 + }, + { + "epoch": 0.3591183660275965, + "grad_norm": 2.5132670402526855, + "learning_rate": 9.424868362706905e-06, + "loss": 0.8913, + "step": 4444 + }, + { + "epoch": 0.3591991757409241, + "grad_norm": 2.2747271060943604, + "learning_rate": 9.424563626930213e-06, + "loss": 1.0056, + "step": 4445 + }, + { + "epoch": 0.3592799854542516, + "grad_norm": 2.6004068851470947, + "learning_rate": 9.424258815371397e-06, + "loss": 0.9968, + "step": 4446 + }, + { + "epoch": 0.35936079516757913, + "grad_norm": 2.3039145469665527, + "learning_rate": 9.423953928035678e-06, + "loss": 0.8961, + "step": 4447 + }, + { + "epoch": 0.3594416048809067, + "grad_norm": 2.539527654647827, + "learning_rate": 9.423648964928279e-06, + "loss": 0.9564, + "step": 4448 + }, + { + "epoch": 0.35952241459423423, + "grad_norm": 2.3412609100341797, + "learning_rate": 9.423343926054422e-06, + "loss": 1.0276, + "step": 4449 + }, + { + "epoch": 0.35960322430756175, + "grad_norm": 2.665891170501709, + "learning_rate": 9.423038811419334e-06, + "loss": 0.9202, + "step": 4450 + }, + { + "epoch": 0.35968403402088933, + "grad_norm": 2.923896312713623, + "learning_rate": 9.422733621028239e-06, + "loss": 0.9785, + "step": 4451 + }, + { + "epoch": 0.35976484373421685, + "grad_norm": 2.6648497581481934, + "learning_rate": 9.422428354886364e-06, + "loss": 1.0934, + "step": 4452 + }, + { + "epoch": 0.3598456534475444, + "grad_norm": 2.780925989151001, + "learning_rate": 9.422123012998936e-06, + "loss": 0.8999, + "step": 4453 + }, + { + "epoch": 0.35992646316087196, + "grad_norm": 2.895123243331909, + "learning_rate": 9.421817595371188e-06, + "loss": 0.9942, + "step": 4454 + }, + { + "epoch": 0.3600072728741995, + "grad_norm": 2.727386951446533, + "learning_rate": 9.42151210200835e-06, + "loss": 1.0444, + "step": 4455 + }, + { + "epoch": 0.360088082587527, + "grad_norm": 2.6367311477661133, + "learning_rate": 9.421206532915655e-06, + "loss": 0.9106, + "step": 4456 + }, + { + "epoch": 0.3601688923008546, + "grad_norm": 2.836529016494751, + "learning_rate": 9.420900888098334e-06, + "loss": 0.9969, + "step": 4457 + }, + { + "epoch": 0.3602497020141821, + "grad_norm": 2.6362826824188232, + "learning_rate": 9.420595167561624e-06, + "loss": 0.9138, + "step": 4458 + }, + { + "epoch": 0.36033051172750963, + "grad_norm": 2.9698569774627686, + "learning_rate": 9.420289371310762e-06, + "loss": 1.0344, + "step": 4459 + }, + { + "epoch": 0.3604113214408372, + "grad_norm": 2.665956735610962, + "learning_rate": 9.419983499350982e-06, + "loss": 0.9462, + "step": 4460 + }, + { + "epoch": 0.36049213115416473, + "grad_norm": 2.300938129425049, + "learning_rate": 9.419677551687528e-06, + "loss": 0.9502, + "step": 4461 + }, + { + "epoch": 0.36057294086749225, + "grad_norm": 2.4960052967071533, + "learning_rate": 9.419371528325638e-06, + "loss": 0.9395, + "step": 4462 + }, + { + "epoch": 0.36065375058081983, + "grad_norm": 2.689157485961914, + "learning_rate": 9.41906542927055e-06, + "loss": 0.9432, + "step": 4463 + }, + { + "epoch": 0.36073456029414736, + "grad_norm": 2.563105821609497, + "learning_rate": 9.418759254527512e-06, + "loss": 1.0336, + "step": 4464 + }, + { + "epoch": 0.3608153700074749, + "grad_norm": 2.5278499126434326, + "learning_rate": 9.418453004101763e-06, + "loss": 1.1189, + "step": 4465 + }, + { + "epoch": 0.36089617972080246, + "grad_norm": 2.824989080429077, + "learning_rate": 9.418146677998554e-06, + "loss": 1.0469, + "step": 4466 + }, + { + "epoch": 0.36097698943413, + "grad_norm": 2.2645199298858643, + "learning_rate": 9.417840276223127e-06, + "loss": 1.0926, + "step": 4467 + }, + { + "epoch": 0.3610577991474575, + "grad_norm": 3.170844554901123, + "learning_rate": 9.417533798780732e-06, + "loss": 0.956, + "step": 4468 + }, + { + "epoch": 0.3611386088607851, + "grad_norm": 2.6823577880859375, + "learning_rate": 9.417227245676618e-06, + "loss": 1.0137, + "step": 4469 + }, + { + "epoch": 0.3612194185741126, + "grad_norm": 2.3987109661102295, + "learning_rate": 9.416920616916035e-06, + "loss": 0.9045, + "step": 4470 + }, + { + "epoch": 0.36130022828744013, + "grad_norm": 2.764575719833374, + "learning_rate": 9.416613912504235e-06, + "loss": 1.0281, + "step": 4471 + }, + { + "epoch": 0.3613810380007677, + "grad_norm": 2.5729925632476807, + "learning_rate": 9.416307132446474e-06, + "loss": 0.9318, + "step": 4472 + }, + { + "epoch": 0.36146184771409523, + "grad_norm": 2.812591075897217, + "learning_rate": 9.416000276748e-06, + "loss": 1.0329, + "step": 4473 + }, + { + "epoch": 0.36154265742742275, + "grad_norm": 2.8140928745269775, + "learning_rate": 9.415693345414072e-06, + "loss": 1.0318, + "step": 4474 + }, + { + "epoch": 0.36162346714075033, + "grad_norm": 2.4964237213134766, + "learning_rate": 9.41538633844995e-06, + "loss": 0.9337, + "step": 4475 + }, + { + "epoch": 0.36170427685407786, + "grad_norm": 2.8262624740600586, + "learning_rate": 9.415079255860888e-06, + "loss": 0.9315, + "step": 4476 + }, + { + "epoch": 0.3617850865674054, + "grad_norm": 2.7971997261047363, + "learning_rate": 9.414772097652148e-06, + "loss": 0.9649, + "step": 4477 + }, + { + "epoch": 0.36186589628073296, + "grad_norm": 2.835785388946533, + "learning_rate": 9.41446486382899e-06, + "loss": 1.0176, + "step": 4478 + }, + { + "epoch": 0.3619467059940605, + "grad_norm": 2.54367995262146, + "learning_rate": 9.414157554396677e-06, + "loss": 0.9148, + "step": 4479 + }, + { + "epoch": 0.362027515707388, + "grad_norm": 3.091097116470337, + "learning_rate": 9.41385016936047e-06, + "loss": 0.9647, + "step": 4480 + }, + { + "epoch": 0.3621083254207156, + "grad_norm": 2.828221321105957, + "learning_rate": 9.413542708725635e-06, + "loss": 0.9779, + "step": 4481 + }, + { + "epoch": 0.3621891351340431, + "grad_norm": 2.577510118484497, + "learning_rate": 9.413235172497442e-06, + "loss": 0.9412, + "step": 4482 + }, + { + "epoch": 0.36226994484737063, + "grad_norm": 2.7678253650665283, + "learning_rate": 9.412927560681154e-06, + "loss": 1.0555, + "step": 4483 + }, + { + "epoch": 0.3623507545606982, + "grad_norm": 2.514561176300049, + "learning_rate": 9.412619873282038e-06, + "loss": 1.0715, + "step": 4484 + }, + { + "epoch": 0.36243156427402573, + "grad_norm": 2.812615394592285, + "learning_rate": 9.412312110305368e-06, + "loss": 0.8406, + "step": 4485 + }, + { + "epoch": 0.36251237398735325, + "grad_norm": 2.455737590789795, + "learning_rate": 9.412004271756415e-06, + "loss": 0.8643, + "step": 4486 + }, + { + "epoch": 0.36259318370068083, + "grad_norm": 2.6049814224243164, + "learning_rate": 9.411696357640447e-06, + "loss": 0.914, + "step": 4487 + }, + { + "epoch": 0.36267399341400836, + "grad_norm": 3.2906692028045654, + "learning_rate": 9.411388367962744e-06, + "loss": 1.0165, + "step": 4488 + }, + { + "epoch": 0.3627548031273359, + "grad_norm": 2.6364407539367676, + "learning_rate": 9.411080302728577e-06, + "loss": 0.9579, + "step": 4489 + }, + { + "epoch": 0.36283561284066346, + "grad_norm": 2.5739123821258545, + "learning_rate": 9.410772161943224e-06, + "loss": 0.9622, + "step": 4490 + }, + { + "epoch": 0.362916422553991, + "grad_norm": 2.5797173976898193, + "learning_rate": 9.410463945611963e-06, + "loss": 0.8997, + "step": 4491 + }, + { + "epoch": 0.36299723226731856, + "grad_norm": 2.528426170349121, + "learning_rate": 9.410155653740071e-06, + "loss": 0.9644, + "step": 4492 + }, + { + "epoch": 0.3630780419806461, + "grad_norm": 2.6368141174316406, + "learning_rate": 9.409847286332831e-06, + "loss": 1.1497, + "step": 4493 + }, + { + "epoch": 0.3631588516939736, + "grad_norm": 2.983921766281128, + "learning_rate": 9.409538843395523e-06, + "loss": 0.9232, + "step": 4494 + }, + { + "epoch": 0.3632396614073012, + "grad_norm": 2.966362476348877, + "learning_rate": 9.40923032493343e-06, + "loss": 0.7913, + "step": 4495 + }, + { + "epoch": 0.3633204711206287, + "grad_norm": 2.6679775714874268, + "learning_rate": 9.408921730951835e-06, + "loss": 0.8781, + "step": 4496 + }, + { + "epoch": 0.36340128083395623, + "grad_norm": 2.9152774810791016, + "learning_rate": 9.408613061456027e-06, + "loss": 1.0007, + "step": 4497 + }, + { + "epoch": 0.3634820905472838, + "grad_norm": 3.973459243774414, + "learning_rate": 9.40830431645129e-06, + "loss": 0.9284, + "step": 4498 + }, + { + "epoch": 0.36356290026061133, + "grad_norm": 2.953460931777954, + "learning_rate": 9.40799549594291e-06, + "loss": 1.046, + "step": 4499 + }, + { + "epoch": 0.36364370997393886, + "grad_norm": 2.8357183933258057, + "learning_rate": 9.407686599936182e-06, + "loss": 0.9262, + "step": 4500 + }, + { + "epoch": 0.36372451968726643, + "grad_norm": 2.6355485916137695, + "learning_rate": 9.407377628436394e-06, + "loss": 0.9544, + "step": 4501 + }, + { + "epoch": 0.36380532940059396, + "grad_norm": 2.738980770111084, + "learning_rate": 9.407068581448836e-06, + "loss": 1.0368, + "step": 4502 + }, + { + "epoch": 0.3638861391139215, + "grad_norm": 2.5949649810791016, + "learning_rate": 9.406759458978803e-06, + "loss": 0.9707, + "step": 4503 + }, + { + "epoch": 0.36396694882724906, + "grad_norm": 2.6837685108184814, + "learning_rate": 9.406450261031589e-06, + "loss": 1.0089, + "step": 4504 + }, + { + "epoch": 0.3640477585405766, + "grad_norm": 2.6699249744415283, + "learning_rate": 9.40614098761249e-06, + "loss": 1.0019, + "step": 4505 + }, + { + "epoch": 0.3641285682539041, + "grad_norm": 3.1839182376861572, + "learning_rate": 9.405831638726804e-06, + "loss": 0.9515, + "step": 4506 + }, + { + "epoch": 0.3642093779672317, + "grad_norm": 2.8554553985595703, + "learning_rate": 9.405522214379828e-06, + "loss": 0.9493, + "step": 4507 + }, + { + "epoch": 0.3642901876805592, + "grad_norm": 2.585563898086548, + "learning_rate": 9.405212714576863e-06, + "loss": 1.0017, + "step": 4508 + }, + { + "epoch": 0.36437099739388673, + "grad_norm": 2.9683499336242676, + "learning_rate": 9.40490313932321e-06, + "loss": 1.1998, + "step": 4509 + }, + { + "epoch": 0.3644518071072143, + "grad_norm": 3.171738624572754, + "learning_rate": 9.404593488624168e-06, + "loss": 1.0149, + "step": 4510 + }, + { + "epoch": 0.36453261682054183, + "grad_norm": 2.4399123191833496, + "learning_rate": 9.404283762485045e-06, + "loss": 1.0968, + "step": 4511 + }, + { + "epoch": 0.36461342653386936, + "grad_norm": 2.8575239181518555, + "learning_rate": 9.403973960911143e-06, + "loss": 1.0486, + "step": 4512 + }, + { + "epoch": 0.36469423624719693, + "grad_norm": 3.0597331523895264, + "learning_rate": 9.40366408390777e-06, + "loss": 0.9539, + "step": 4513 + }, + { + "epoch": 0.36477504596052446, + "grad_norm": 2.4745028018951416, + "learning_rate": 9.403354131480233e-06, + "loss": 0.8326, + "step": 4514 + }, + { + "epoch": 0.364855855673852, + "grad_norm": 2.9149515628814697, + "learning_rate": 9.40304410363384e-06, + "loss": 0.983, + "step": 4515 + }, + { + "epoch": 0.36493666538717956, + "grad_norm": 2.8048317432403564, + "learning_rate": 9.402734000373903e-06, + "loss": 0.9749, + "step": 4516 + }, + { + "epoch": 0.3650174751005071, + "grad_norm": 2.515558958053589, + "learning_rate": 9.402423821705728e-06, + "loss": 0.9942, + "step": 4517 + }, + { + "epoch": 0.3650982848138346, + "grad_norm": 2.7846484184265137, + "learning_rate": 9.402113567634633e-06, + "loss": 0.9612, + "step": 4518 + }, + { + "epoch": 0.3651790945271622, + "grad_norm": 2.86734676361084, + "learning_rate": 9.401803238165933e-06, + "loss": 1.0884, + "step": 4519 + }, + { + "epoch": 0.3652599042404897, + "grad_norm": 2.6100146770477295, + "learning_rate": 9.401492833304936e-06, + "loss": 0.8563, + "step": 4520 + }, + { + "epoch": 0.36534071395381723, + "grad_norm": 2.7019171714782715, + "learning_rate": 9.401182353056966e-06, + "loss": 0.9184, + "step": 4521 + }, + { + "epoch": 0.3654215236671448, + "grad_norm": 3.166868209838867, + "learning_rate": 9.400871797427338e-06, + "loss": 0.992, + "step": 4522 + }, + { + "epoch": 0.36550233338047233, + "grad_norm": 2.637887477874756, + "learning_rate": 9.400561166421369e-06, + "loss": 0.984, + "step": 4523 + }, + { + "epoch": 0.36558314309379986, + "grad_norm": 2.8214378356933594, + "learning_rate": 9.400250460044382e-06, + "loss": 1.0293, + "step": 4524 + }, + { + "epoch": 0.36566395280712743, + "grad_norm": 3.0553970336914062, + "learning_rate": 9.399939678301697e-06, + "loss": 1.0487, + "step": 4525 + }, + { + "epoch": 0.36574476252045496, + "grad_norm": 2.739971160888672, + "learning_rate": 9.39962882119864e-06, + "loss": 1.1134, + "step": 4526 + }, + { + "epoch": 0.3658255722337825, + "grad_norm": 2.5887255668640137, + "learning_rate": 9.39931788874053e-06, + "loss": 0.9873, + "step": 4527 + }, + { + "epoch": 0.36590638194711006, + "grad_norm": 2.8833765983581543, + "learning_rate": 9.399006880932696e-06, + "loss": 0.9344, + "step": 4528 + }, + { + "epoch": 0.3659871916604376, + "grad_norm": 2.879179000854492, + "learning_rate": 9.398695797780465e-06, + "loss": 0.9708, + "step": 4529 + }, + { + "epoch": 0.3660680013737651, + "grad_norm": 3.0080201625823975, + "learning_rate": 9.398384639289165e-06, + "loss": 0.9233, + "step": 4530 + }, + { + "epoch": 0.3661488110870927, + "grad_norm": 2.843886137008667, + "learning_rate": 9.398073405464123e-06, + "loss": 1.0434, + "step": 4531 + }, + { + "epoch": 0.3662296208004202, + "grad_norm": 2.586495876312256, + "learning_rate": 9.397762096310673e-06, + "loss": 1.1622, + "step": 4532 + }, + { + "epoch": 0.36631043051374773, + "grad_norm": 2.517352819442749, + "learning_rate": 9.397450711834145e-06, + "loss": 1.0387, + "step": 4533 + }, + { + "epoch": 0.3663912402270753, + "grad_norm": 2.4357497692108154, + "learning_rate": 9.397139252039873e-06, + "loss": 1.0196, + "step": 4534 + }, + { + "epoch": 0.36647204994040283, + "grad_norm": 2.8926069736480713, + "learning_rate": 9.396827716933191e-06, + "loss": 1.017, + "step": 4535 + }, + { + "epoch": 0.36655285965373036, + "grad_norm": 2.7496743202209473, + "learning_rate": 9.396516106519436e-06, + "loss": 1.0057, + "step": 4536 + }, + { + "epoch": 0.36663366936705793, + "grad_norm": 2.624067783355713, + "learning_rate": 9.396204420803943e-06, + "loss": 1.075, + "step": 4537 + }, + { + "epoch": 0.36671447908038546, + "grad_norm": 2.9119105339050293, + "learning_rate": 9.395892659792053e-06, + "loss": 1.0004, + "step": 4538 + }, + { + "epoch": 0.366795288793713, + "grad_norm": 2.5376782417297363, + "learning_rate": 9.395580823489103e-06, + "loss": 1.0453, + "step": 4539 + }, + { + "epoch": 0.36687609850704056, + "grad_norm": 2.7258594036102295, + "learning_rate": 9.395268911900437e-06, + "loss": 1.0709, + "step": 4540 + }, + { + "epoch": 0.3669569082203681, + "grad_norm": 2.718148946762085, + "learning_rate": 9.394956925031394e-06, + "loss": 0.943, + "step": 4541 + }, + { + "epoch": 0.3670377179336956, + "grad_norm": 2.7300665378570557, + "learning_rate": 9.39464486288732e-06, + "loss": 0.9242, + "step": 4542 + }, + { + "epoch": 0.3671185276470232, + "grad_norm": 2.771728277206421, + "learning_rate": 9.39433272547356e-06, + "loss": 0.998, + "step": 4543 + }, + { + "epoch": 0.3671993373603507, + "grad_norm": 2.4261975288391113, + "learning_rate": 9.394020512795459e-06, + "loss": 0.8755, + "step": 4544 + }, + { + "epoch": 0.36728014707367823, + "grad_norm": 2.893710136413574, + "learning_rate": 9.393708224858365e-06, + "loss": 0.9709, + "step": 4545 + }, + { + "epoch": 0.3673609567870058, + "grad_norm": 2.779921293258667, + "learning_rate": 9.393395861667625e-06, + "loss": 0.9192, + "step": 4546 + }, + { + "epoch": 0.36744176650033333, + "grad_norm": 2.4235339164733887, + "learning_rate": 9.393083423228591e-06, + "loss": 1.103, + "step": 4547 + }, + { + "epoch": 0.36752257621366086, + "grad_norm": 3.1893932819366455, + "learning_rate": 9.392770909546615e-06, + "loss": 1.0178, + "step": 4548 + }, + { + "epoch": 0.36760338592698844, + "grad_norm": 2.8173398971557617, + "learning_rate": 9.392458320627046e-06, + "loss": 0.9715, + "step": 4549 + }, + { + "epoch": 0.36768419564031596, + "grad_norm": 2.9895713329315186, + "learning_rate": 9.392145656475245e-06, + "loss": 1.1446, + "step": 4550 + }, + { + "epoch": 0.3677650053536435, + "grad_norm": 2.971907138824463, + "learning_rate": 9.39183291709656e-06, + "loss": 0.9728, + "step": 4551 + }, + { + "epoch": 0.36784581506697106, + "grad_norm": 2.853642225265503, + "learning_rate": 9.39152010249635e-06, + "loss": 1.0509, + "step": 4552 + }, + { + "epoch": 0.3679266247802986, + "grad_norm": 3.2043793201446533, + "learning_rate": 9.391207212679971e-06, + "loss": 0.9442, + "step": 4553 + }, + { + "epoch": 0.3680074344936261, + "grad_norm": 2.5248141288757324, + "learning_rate": 9.390894247652786e-06, + "loss": 1.0295, + "step": 4554 + }, + { + "epoch": 0.3680882442069537, + "grad_norm": 2.902416229248047, + "learning_rate": 9.390581207420153e-06, + "loss": 0.9146, + "step": 4555 + }, + { + "epoch": 0.3681690539202812, + "grad_norm": 2.6413331031799316, + "learning_rate": 9.390268091987434e-06, + "loss": 0.9069, + "step": 4556 + }, + { + "epoch": 0.3682498636336088, + "grad_norm": 2.557710886001587, + "learning_rate": 9.38995490135999e-06, + "loss": 0.9081, + "step": 4557 + }, + { + "epoch": 0.3683306733469363, + "grad_norm": 2.6280970573425293, + "learning_rate": 9.389641635543189e-06, + "loss": 0.9767, + "step": 4558 + }, + { + "epoch": 0.36841148306026383, + "grad_norm": 2.41762113571167, + "learning_rate": 9.389328294542392e-06, + "loss": 1.0447, + "step": 4559 + }, + { + "epoch": 0.3684922927735914, + "grad_norm": 2.6446521282196045, + "learning_rate": 9.38901487836297e-06, + "loss": 1.044, + "step": 4560 + }, + { + "epoch": 0.36857310248691894, + "grad_norm": 3.1756978034973145, + "learning_rate": 9.38870138701029e-06, + "loss": 0.8964, + "step": 4561 + }, + { + "epoch": 0.36865391220024646, + "grad_norm": 2.9383111000061035, + "learning_rate": 9.388387820489719e-06, + "loss": 0.9009, + "step": 4562 + }, + { + "epoch": 0.36873472191357404, + "grad_norm": 2.624556064605713, + "learning_rate": 9.38807417880663e-06, + "loss": 0.9675, + "step": 4563 + }, + { + "epoch": 0.36881553162690156, + "grad_norm": 2.8243956565856934, + "learning_rate": 9.387760461966395e-06, + "loss": 0.9539, + "step": 4564 + }, + { + "epoch": 0.3688963413402291, + "grad_norm": 2.3740501403808594, + "learning_rate": 9.387446669974384e-06, + "loss": 0.9306, + "step": 4565 + }, + { + "epoch": 0.36897715105355666, + "grad_norm": 2.642275094985962, + "learning_rate": 9.387132802835977e-06, + "loss": 0.9699, + "step": 4566 + }, + { + "epoch": 0.3690579607668842, + "grad_norm": 3.44392728805542, + "learning_rate": 9.386818860556545e-06, + "loss": 0.9687, + "step": 4567 + }, + { + "epoch": 0.3691387704802117, + "grad_norm": 2.5872433185577393, + "learning_rate": 9.386504843141466e-06, + "loss": 0.8814, + "step": 4568 + }, + { + "epoch": 0.3692195801935393, + "grad_norm": 2.684056520462036, + "learning_rate": 9.38619075059612e-06, + "loss": 0.9708, + "step": 4569 + }, + { + "epoch": 0.3693003899068668, + "grad_norm": 2.7787370681762695, + "learning_rate": 9.385876582925886e-06, + "loss": 0.8949, + "step": 4570 + }, + { + "epoch": 0.36938119962019433, + "grad_norm": 2.595571756362915, + "learning_rate": 9.385562340136144e-06, + "loss": 1.0831, + "step": 4571 + }, + { + "epoch": 0.3694620093335219, + "grad_norm": 2.7705674171447754, + "learning_rate": 9.385248022232278e-06, + "loss": 1.0678, + "step": 4572 + }, + { + "epoch": 0.36954281904684944, + "grad_norm": 2.8611176013946533, + "learning_rate": 9.384933629219669e-06, + "loss": 0.9156, + "step": 4573 + }, + { + "epoch": 0.36962362876017696, + "grad_norm": 2.8933324813842773, + "learning_rate": 9.384619161103703e-06, + "loss": 1.2177, + "step": 4574 + }, + { + "epoch": 0.36970443847350454, + "grad_norm": 2.9562392234802246, + "learning_rate": 9.384304617889768e-06, + "loss": 0.9406, + "step": 4575 + }, + { + "epoch": 0.36978524818683206, + "grad_norm": 2.4091672897338867, + "learning_rate": 9.38398999958325e-06, + "loss": 1.052, + "step": 4576 + }, + { + "epoch": 0.3698660579001596, + "grad_norm": 2.5455174446105957, + "learning_rate": 9.383675306189535e-06, + "loss": 1.003, + "step": 4577 + }, + { + "epoch": 0.36994686761348716, + "grad_norm": 2.325300931930542, + "learning_rate": 9.383360537714018e-06, + "loss": 1.0479, + "step": 4578 + }, + { + "epoch": 0.3700276773268147, + "grad_norm": 2.7085652351379395, + "learning_rate": 9.383045694162085e-06, + "loss": 0.9682, + "step": 4579 + }, + { + "epoch": 0.3701084870401422, + "grad_norm": 3.875587224960327, + "learning_rate": 9.382730775539133e-06, + "loss": 0.9901, + "step": 4580 + }, + { + "epoch": 0.3701892967534698, + "grad_norm": 2.634938955307007, + "learning_rate": 9.382415781850553e-06, + "loss": 0.9529, + "step": 4581 + }, + { + "epoch": 0.3702701064667973, + "grad_norm": 2.4124062061309814, + "learning_rate": 9.38210071310174e-06, + "loss": 0.9837, + "step": 4582 + }, + { + "epoch": 0.37035091618012483, + "grad_norm": 2.9303414821624756, + "learning_rate": 9.381785569298092e-06, + "loss": 0.9598, + "step": 4583 + }, + { + "epoch": 0.3704317258934524, + "grad_norm": 3.144341468811035, + "learning_rate": 9.381470350445006e-06, + "loss": 1.0144, + "step": 4584 + }, + { + "epoch": 0.37051253560677994, + "grad_norm": 2.8793258666992188, + "learning_rate": 9.38115505654788e-06, + "loss": 0.908, + "step": 4585 + }, + { + "epoch": 0.37059334532010746, + "grad_norm": 2.8264477252960205, + "learning_rate": 9.380839687612116e-06, + "loss": 0.9622, + "step": 4586 + }, + { + "epoch": 0.37067415503343504, + "grad_norm": 2.8271288871765137, + "learning_rate": 9.380524243643115e-06, + "loss": 1.0603, + "step": 4587 + }, + { + "epoch": 0.37075496474676256, + "grad_norm": 2.6553587913513184, + "learning_rate": 9.380208724646279e-06, + "loss": 0.9144, + "step": 4588 + }, + { + "epoch": 0.3708357744600901, + "grad_norm": 2.6913859844207764, + "learning_rate": 9.379893130627011e-06, + "loss": 1.0163, + "step": 4589 + }, + { + "epoch": 0.37091658417341766, + "grad_norm": 3.0093512535095215, + "learning_rate": 9.37957746159072e-06, + "loss": 0.9999, + "step": 4590 + }, + { + "epoch": 0.3709973938867452, + "grad_norm": 3.0305333137512207, + "learning_rate": 9.37926171754281e-06, + "loss": 0.9112, + "step": 4591 + }, + { + "epoch": 0.3710782036000727, + "grad_norm": 2.818683624267578, + "learning_rate": 9.37894589848869e-06, + "loss": 0.9946, + "step": 4592 + }, + { + "epoch": 0.3711590133134003, + "grad_norm": 2.397392749786377, + "learning_rate": 9.378630004433766e-06, + "loss": 1.0275, + "step": 4593 + }, + { + "epoch": 0.3712398230267278, + "grad_norm": 2.944654703140259, + "learning_rate": 9.378314035383454e-06, + "loss": 0.973, + "step": 4594 + }, + { + "epoch": 0.37132063274005533, + "grad_norm": 2.380833148956299, + "learning_rate": 9.377997991343163e-06, + "loss": 0.9979, + "step": 4595 + }, + { + "epoch": 0.3714014424533829, + "grad_norm": 2.939668655395508, + "learning_rate": 9.377681872318303e-06, + "loss": 0.9139, + "step": 4596 + }, + { + "epoch": 0.37148225216671044, + "grad_norm": 3.133352279663086, + "learning_rate": 9.377365678314293e-06, + "loss": 0.9466, + "step": 4597 + }, + { + "epoch": 0.37156306188003796, + "grad_norm": 2.810354471206665, + "learning_rate": 9.377049409336547e-06, + "loss": 0.9573, + "step": 4598 + }, + { + "epoch": 0.37164387159336554, + "grad_norm": 2.642275810241699, + "learning_rate": 9.376733065390483e-06, + "loss": 1.0413, + "step": 4599 + }, + { + "epoch": 0.37172468130669306, + "grad_norm": 3.47493314743042, + "learning_rate": 9.376416646481516e-06, + "loss": 0.8108, + "step": 4600 + }, + { + "epoch": 0.3718054910200206, + "grad_norm": 2.652791976928711, + "learning_rate": 9.37610015261507e-06, + "loss": 0.9259, + "step": 4601 + }, + { + "epoch": 0.37188630073334816, + "grad_norm": 3.052654981613159, + "learning_rate": 9.375783583796562e-06, + "loss": 0.9279, + "step": 4602 + }, + { + "epoch": 0.3719671104466757, + "grad_norm": 2.550945997238159, + "learning_rate": 9.375466940031416e-06, + "loss": 0.9951, + "step": 4603 + }, + { + "epoch": 0.3720479201600032, + "grad_norm": 2.9834840297698975, + "learning_rate": 9.375150221325053e-06, + "loss": 0.9975, + "step": 4604 + }, + { + "epoch": 0.3721287298733308, + "grad_norm": 2.732625961303711, + "learning_rate": 9.3748334276829e-06, + "loss": 1.0384, + "step": 4605 + }, + { + "epoch": 0.3722095395866583, + "grad_norm": 2.907871961593628, + "learning_rate": 9.374516559110386e-06, + "loss": 1.1045, + "step": 4606 + }, + { + "epoch": 0.37229034929998583, + "grad_norm": 2.7010860443115234, + "learning_rate": 9.37419961561293e-06, + "loss": 0.9371, + "step": 4607 + }, + { + "epoch": 0.3723711590133134, + "grad_norm": 2.9858970642089844, + "learning_rate": 9.373882597195969e-06, + "loss": 0.9703, + "step": 4608 + }, + { + "epoch": 0.37245196872664094, + "grad_norm": 2.719649076461792, + "learning_rate": 9.373565503864925e-06, + "loss": 0.9681, + "step": 4609 + }, + { + "epoch": 0.37253277843996846, + "grad_norm": 2.775792121887207, + "learning_rate": 9.373248335625237e-06, + "loss": 1.1137, + "step": 4610 + }, + { + "epoch": 0.37261358815329604, + "grad_norm": 3.075058698654175, + "learning_rate": 9.37293109248233e-06, + "loss": 1.0127, + "step": 4611 + }, + { + "epoch": 0.37269439786662356, + "grad_norm": 3.0514657497406006, + "learning_rate": 9.37261377444164e-06, + "loss": 1.0136, + "step": 4612 + }, + { + "epoch": 0.3727752075799511, + "grad_norm": 2.7548375129699707, + "learning_rate": 9.372296381508606e-06, + "loss": 0.9435, + "step": 4613 + }, + { + "epoch": 0.37285601729327866, + "grad_norm": 2.669076919555664, + "learning_rate": 9.37197891368866e-06, + "loss": 0.7948, + "step": 4614 + }, + { + "epoch": 0.3729368270066062, + "grad_norm": 3.102717876434326, + "learning_rate": 9.371661370987238e-06, + "loss": 0.9228, + "step": 4615 + }, + { + "epoch": 0.3730176367199337, + "grad_norm": 2.722560167312622, + "learning_rate": 9.371343753409783e-06, + "loss": 0.9516, + "step": 4616 + }, + { + "epoch": 0.3730984464332613, + "grad_norm": 2.6848738193511963, + "learning_rate": 9.371026060961732e-06, + "loss": 1.0751, + "step": 4617 + }, + { + "epoch": 0.3731792561465888, + "grad_norm": 2.5699706077575684, + "learning_rate": 9.370708293648528e-06, + "loss": 1.1555, + "step": 4618 + }, + { + "epoch": 0.37326006585991633, + "grad_norm": 2.504427671432495, + "learning_rate": 9.370390451475614e-06, + "loss": 0.8811, + "step": 4619 + }, + { + "epoch": 0.3733408755732439, + "grad_norm": 2.705014228820801, + "learning_rate": 9.370072534448432e-06, + "loss": 1.0209, + "step": 4620 + }, + { + "epoch": 0.37342168528657144, + "grad_norm": 2.680250644683838, + "learning_rate": 9.369754542572427e-06, + "loss": 0.9363, + "step": 4621 + }, + { + "epoch": 0.373502494999899, + "grad_norm": 2.696732759475708, + "learning_rate": 9.369436475853048e-06, + "loss": 0.9178, + "step": 4622 + }, + { + "epoch": 0.37358330471322654, + "grad_norm": 2.756539821624756, + "learning_rate": 9.36911833429574e-06, + "loss": 0.9903, + "step": 4623 + }, + { + "epoch": 0.37366411442655406, + "grad_norm": 2.860239267349243, + "learning_rate": 9.368800117905954e-06, + "loss": 1.0033, + "step": 4624 + }, + { + "epoch": 0.37374492413988164, + "grad_norm": 2.2197537422180176, + "learning_rate": 9.368481826689138e-06, + "loss": 0.8965, + "step": 4625 + }, + { + "epoch": 0.37382573385320916, + "grad_norm": 2.4059908390045166, + "learning_rate": 9.368163460650747e-06, + "loss": 0.9164, + "step": 4626 + }, + { + "epoch": 0.3739065435665367, + "grad_norm": 2.822862386703491, + "learning_rate": 9.36784501979623e-06, + "loss": 0.9616, + "step": 4627 + }, + { + "epoch": 0.37398735327986427, + "grad_norm": 2.7833025455474854, + "learning_rate": 9.367526504131043e-06, + "loss": 1.0955, + "step": 4628 + }, + { + "epoch": 0.3740681629931918, + "grad_norm": 2.7099392414093018, + "learning_rate": 9.367207913660643e-06, + "loss": 0.9954, + "step": 4629 + }, + { + "epoch": 0.3741489727065193, + "grad_norm": 2.4273574352264404, + "learning_rate": 9.366889248390486e-06, + "loss": 0.9942, + "step": 4630 + }, + { + "epoch": 0.3742297824198469, + "grad_norm": 2.7751688957214355, + "learning_rate": 9.366570508326026e-06, + "loss": 1.0044, + "step": 4631 + }, + { + "epoch": 0.3743105921331744, + "grad_norm": 2.7348666191101074, + "learning_rate": 9.366251693472728e-06, + "loss": 0.9199, + "step": 4632 + }, + { + "epoch": 0.37439140184650194, + "grad_norm": 2.558270215988159, + "learning_rate": 9.365932803836046e-06, + "loss": 1.117, + "step": 4633 + }, + { + "epoch": 0.3744722115598295, + "grad_norm": 2.744283676147461, + "learning_rate": 9.36561383942145e-06, + "loss": 0.9424, + "step": 4634 + }, + { + "epoch": 0.37455302127315704, + "grad_norm": 2.5696918964385986, + "learning_rate": 9.365294800234397e-06, + "loss": 1.0267, + "step": 4635 + }, + { + "epoch": 0.37463383098648456, + "grad_norm": 2.5888736248016357, + "learning_rate": 9.364975686280352e-06, + "loss": 1.038, + "step": 4636 + }, + { + "epoch": 0.37471464069981214, + "grad_norm": 2.646268129348755, + "learning_rate": 9.364656497564782e-06, + "loss": 1.0989, + "step": 4637 + }, + { + "epoch": 0.37479545041313966, + "grad_norm": 2.6140968799591064, + "learning_rate": 9.364337234093155e-06, + "loss": 0.9057, + "step": 4638 + }, + { + "epoch": 0.3748762601264672, + "grad_norm": 2.7762510776519775, + "learning_rate": 9.364017895870938e-06, + "loss": 0.9697, + "step": 4639 + }, + { + "epoch": 0.37495706983979477, + "grad_norm": 2.621048927307129, + "learning_rate": 9.363698482903598e-06, + "loss": 0.8835, + "step": 4640 + }, + { + "epoch": 0.3750378795531223, + "grad_norm": 2.8719687461853027, + "learning_rate": 9.36337899519661e-06, + "loss": 0.9315, + "step": 4641 + }, + { + "epoch": 0.3751186892664498, + "grad_norm": 2.5091278553009033, + "learning_rate": 9.363059432755443e-06, + "loss": 0.988, + "step": 4642 + }, + { + "epoch": 0.3751994989797774, + "grad_norm": 2.474213123321533, + "learning_rate": 9.362739795585573e-06, + "loss": 0.9952, + "step": 4643 + }, + { + "epoch": 0.3752803086931049, + "grad_norm": 3.510223865509033, + "learning_rate": 9.362420083692474e-06, + "loss": 0.9259, + "step": 4644 + }, + { + "epoch": 0.37536111840643244, + "grad_norm": 2.7654919624328613, + "learning_rate": 9.36210029708162e-06, + "loss": 1.0706, + "step": 4645 + }, + { + "epoch": 0.37544192811976, + "grad_norm": 2.7717068195343018, + "learning_rate": 9.361780435758488e-06, + "loss": 0.8909, + "step": 4646 + }, + { + "epoch": 0.37552273783308754, + "grad_norm": 2.8184854984283447, + "learning_rate": 9.361460499728558e-06, + "loss": 0.9658, + "step": 4647 + }, + { + "epoch": 0.37560354754641506, + "grad_norm": 2.530444860458374, + "learning_rate": 9.361140488997311e-06, + "loss": 0.7889, + "step": 4648 + }, + { + "epoch": 0.37568435725974264, + "grad_norm": 2.4376254081726074, + "learning_rate": 9.360820403570225e-06, + "loss": 1.007, + "step": 4649 + }, + { + "epoch": 0.37576516697307016, + "grad_norm": 2.770446538925171, + "learning_rate": 9.360500243452785e-06, + "loss": 1.0088, + "step": 4650 + }, + { + "epoch": 0.3758459766863977, + "grad_norm": 2.4954068660736084, + "learning_rate": 9.360180008650472e-06, + "loss": 1.1051, + "step": 4651 + }, + { + "epoch": 0.37592678639972527, + "grad_norm": 2.4725492000579834, + "learning_rate": 9.359859699168773e-06, + "loss": 1.0519, + "step": 4652 + }, + { + "epoch": 0.3760075961130528, + "grad_norm": 2.686007499694824, + "learning_rate": 9.359539315013173e-06, + "loss": 1.0196, + "step": 4653 + }, + { + "epoch": 0.3760884058263803, + "grad_norm": 2.589627504348755, + "learning_rate": 9.359218856189161e-06, + "loss": 0.945, + "step": 4654 + }, + { + "epoch": 0.3761692155397079, + "grad_norm": 3.184671401977539, + "learning_rate": 9.358898322702222e-06, + "loss": 0.9778, + "step": 4655 + }, + { + "epoch": 0.3762500252530354, + "grad_norm": 2.420375108718872, + "learning_rate": 9.358577714557849e-06, + "loss": 0.9061, + "step": 4656 + }, + { + "epoch": 0.37633083496636294, + "grad_norm": 2.8433828353881836, + "learning_rate": 9.358257031761532e-06, + "loss": 1.1095, + "step": 4657 + }, + { + "epoch": 0.3764116446796905, + "grad_norm": 3.1400113105773926, + "learning_rate": 9.357936274318766e-06, + "loss": 1.0581, + "step": 4658 + }, + { + "epoch": 0.37649245439301804, + "grad_norm": 2.6416494846343994, + "learning_rate": 9.357615442235042e-06, + "loss": 0.984, + "step": 4659 + }, + { + "epoch": 0.37657326410634556, + "grad_norm": 2.5612497329711914, + "learning_rate": 9.357294535515857e-06, + "loss": 0.9338, + "step": 4660 + }, + { + "epoch": 0.37665407381967314, + "grad_norm": 3.2765259742736816, + "learning_rate": 9.356973554166704e-06, + "loss": 1.1653, + "step": 4661 + }, + { + "epoch": 0.37673488353300066, + "grad_norm": 2.5079116821289062, + "learning_rate": 9.356652498193085e-06, + "loss": 1.0834, + "step": 4662 + }, + { + "epoch": 0.3768156932463282, + "grad_norm": 2.9052252769470215, + "learning_rate": 9.356331367600497e-06, + "loss": 0.8942, + "step": 4663 + }, + { + "epoch": 0.37689650295965577, + "grad_norm": 2.5440428256988525, + "learning_rate": 9.35601016239444e-06, + "loss": 0.9333, + "step": 4664 + }, + { + "epoch": 0.3769773126729833, + "grad_norm": 2.6236190795898438, + "learning_rate": 9.355688882580414e-06, + "loss": 0.9764, + "step": 4665 + }, + { + "epoch": 0.3770581223863108, + "grad_norm": 2.6265389919281006, + "learning_rate": 9.355367528163925e-06, + "loss": 0.8844, + "step": 4666 + }, + { + "epoch": 0.3771389320996384, + "grad_norm": 2.5625059604644775, + "learning_rate": 9.355046099150475e-06, + "loss": 0.9659, + "step": 4667 + }, + { + "epoch": 0.3772197418129659, + "grad_norm": 2.58971905708313, + "learning_rate": 9.35472459554557e-06, + "loss": 0.9519, + "step": 4668 + }, + { + "epoch": 0.37730055152629344, + "grad_norm": 2.975165843963623, + "learning_rate": 9.354403017354715e-06, + "loss": 0.9435, + "step": 4669 + }, + { + "epoch": 0.377381361239621, + "grad_norm": 2.81882905960083, + "learning_rate": 9.35408136458342e-06, + "loss": 0.9213, + "step": 4670 + }, + { + "epoch": 0.37746217095294854, + "grad_norm": 2.43086314201355, + "learning_rate": 9.353759637237192e-06, + "loss": 1.0836, + "step": 4671 + }, + { + "epoch": 0.37754298066627606, + "grad_norm": 2.7550718784332275, + "learning_rate": 9.353437835321543e-06, + "loss": 0.9825, + "step": 4672 + }, + { + "epoch": 0.37762379037960364, + "grad_norm": 2.6080548763275146, + "learning_rate": 9.353115958841987e-06, + "loss": 0.9459, + "step": 4673 + }, + { + "epoch": 0.37770460009293116, + "grad_norm": 2.7612552642822266, + "learning_rate": 9.352794007804033e-06, + "loss": 0.9331, + "step": 4674 + }, + { + "epoch": 0.3777854098062587, + "grad_norm": 2.4627575874328613, + "learning_rate": 9.352471982213195e-06, + "loss": 0.8379, + "step": 4675 + }, + { + "epoch": 0.37786621951958627, + "grad_norm": 2.686471700668335, + "learning_rate": 9.35214988207499e-06, + "loss": 1.0379, + "step": 4676 + }, + { + "epoch": 0.3779470292329138, + "grad_norm": 2.488630771636963, + "learning_rate": 9.351827707394937e-06, + "loss": 1.0181, + "step": 4677 + }, + { + "epoch": 0.3780278389462413, + "grad_norm": 2.6898245811462402, + "learning_rate": 9.351505458178551e-06, + "loss": 1.0643, + "step": 4678 + }, + { + "epoch": 0.3781086486595689, + "grad_norm": 2.7466509342193604, + "learning_rate": 9.351183134431352e-06, + "loss": 0.9447, + "step": 4679 + }, + { + "epoch": 0.3781894583728964, + "grad_norm": 2.5569167137145996, + "learning_rate": 9.350860736158861e-06, + "loss": 1.016, + "step": 4680 + }, + { + "epoch": 0.37827026808622394, + "grad_norm": 2.2112040519714355, + "learning_rate": 9.3505382633666e-06, + "loss": 0.8968, + "step": 4681 + }, + { + "epoch": 0.3783510777995515, + "grad_norm": 3.138573169708252, + "learning_rate": 9.350215716060093e-06, + "loss": 1.0001, + "step": 4682 + }, + { + "epoch": 0.37843188751287904, + "grad_norm": 2.741068124771118, + "learning_rate": 9.349893094244863e-06, + "loss": 1.0164, + "step": 4683 + }, + { + "epoch": 0.37851269722620656, + "grad_norm": 2.4641666412353516, + "learning_rate": 9.349570397926435e-06, + "loss": 1.0531, + "step": 4684 + }, + { + "epoch": 0.37859350693953414, + "grad_norm": 2.799994468688965, + "learning_rate": 9.349247627110338e-06, + "loss": 1.0897, + "step": 4685 + }, + { + "epoch": 0.37867431665286166, + "grad_norm": 2.553637742996216, + "learning_rate": 9.3489247818021e-06, + "loss": 0.9916, + "step": 4686 + }, + { + "epoch": 0.37875512636618924, + "grad_norm": 2.3900206089019775, + "learning_rate": 9.34860186200725e-06, + "loss": 0.9037, + "step": 4687 + }, + { + "epoch": 0.37883593607951677, + "grad_norm": 2.382781505584717, + "learning_rate": 9.348278867731317e-06, + "loss": 0.9374, + "step": 4688 + }, + { + "epoch": 0.3789167457928443, + "grad_norm": 2.3783068656921387, + "learning_rate": 9.347955798979838e-06, + "loss": 1.0351, + "step": 4689 + }, + { + "epoch": 0.37899755550617187, + "grad_norm": 2.8407225608825684, + "learning_rate": 9.347632655758341e-06, + "loss": 1.0577, + "step": 4690 + }, + { + "epoch": 0.3790783652194994, + "grad_norm": 3.121370553970337, + "learning_rate": 9.347309438072365e-06, + "loss": 0.9437, + "step": 4691 + }, + { + "epoch": 0.3791591749328269, + "grad_norm": 2.5054781436920166, + "learning_rate": 9.346986145927443e-06, + "loss": 0.9386, + "step": 4692 + }, + { + "epoch": 0.3792399846461545, + "grad_norm": 2.5887506008148193, + "learning_rate": 9.346662779329115e-06, + "loss": 0.9107, + "step": 4693 + }, + { + "epoch": 0.379320794359482, + "grad_norm": 2.8974032402038574, + "learning_rate": 9.346339338282915e-06, + "loss": 0.923, + "step": 4694 + }, + { + "epoch": 0.37940160407280954, + "grad_norm": 2.7863271236419678, + "learning_rate": 9.346015822794387e-06, + "loss": 0.9532, + "step": 4695 + }, + { + "epoch": 0.3794824137861371, + "grad_norm": 2.574018716812134, + "learning_rate": 9.34569223286907e-06, + "loss": 0.8615, + "step": 4696 + }, + { + "epoch": 0.37956322349946464, + "grad_norm": 2.5277822017669678, + "learning_rate": 9.345368568512508e-06, + "loss": 1.028, + "step": 4697 + }, + { + "epoch": 0.37964403321279216, + "grad_norm": 3.112009048461914, + "learning_rate": 9.345044829730243e-06, + "loss": 0.929, + "step": 4698 + }, + { + "epoch": 0.37972484292611974, + "grad_norm": 2.4879302978515625, + "learning_rate": 9.34472101652782e-06, + "loss": 1.0871, + "step": 4699 + }, + { + "epoch": 0.37980565263944727, + "grad_norm": 2.7445104122161865, + "learning_rate": 9.344397128910784e-06, + "loss": 1.0077, + "step": 4700 + }, + { + "epoch": 0.3798864623527748, + "grad_norm": 2.4609758853912354, + "learning_rate": 9.344073166884686e-06, + "loss": 1.0116, + "step": 4701 + }, + { + "epoch": 0.37996727206610237, + "grad_norm": 3.099045753479004, + "learning_rate": 9.343749130455074e-06, + "loss": 0.9865, + "step": 4702 + }, + { + "epoch": 0.3800480817794299, + "grad_norm": 2.8231124877929688, + "learning_rate": 9.343425019627493e-06, + "loss": 1.1225, + "step": 4703 + }, + { + "epoch": 0.3801288914927574, + "grad_norm": 2.8193726539611816, + "learning_rate": 9.3431008344075e-06, + "loss": 0.9882, + "step": 4704 + }, + { + "epoch": 0.380209701206085, + "grad_norm": 2.245363473892212, + "learning_rate": 9.342776574800645e-06, + "loss": 0.9592, + "step": 4705 + }, + { + "epoch": 0.3802905109194125, + "grad_norm": 2.7241601943969727, + "learning_rate": 9.342452240812481e-06, + "loss": 0.831, + "step": 4706 + }, + { + "epoch": 0.38037132063274004, + "grad_norm": 2.3399782180786133, + "learning_rate": 9.342127832448565e-06, + "loss": 1.0248, + "step": 4707 + }, + { + "epoch": 0.3804521303460676, + "grad_norm": 2.310903310775757, + "learning_rate": 9.341803349714453e-06, + "loss": 1.0077, + "step": 4708 + }, + { + "epoch": 0.38053294005939514, + "grad_norm": 2.90739369392395, + "learning_rate": 9.341478792615702e-06, + "loss": 1.059, + "step": 4709 + }, + { + "epoch": 0.38061374977272266, + "grad_norm": 2.5221786499023438, + "learning_rate": 9.341154161157868e-06, + "loss": 0.9381, + "step": 4710 + }, + { + "epoch": 0.38069455948605024, + "grad_norm": 2.8505871295928955, + "learning_rate": 9.340829455346518e-06, + "loss": 1.0079, + "step": 4711 + }, + { + "epoch": 0.38077536919937777, + "grad_norm": 2.158698558807373, + "learning_rate": 9.340504675187207e-06, + "loss": 1.0045, + "step": 4712 + }, + { + "epoch": 0.3808561789127053, + "grad_norm": 3.18208909034729, + "learning_rate": 9.340179820685503e-06, + "loss": 1.1025, + "step": 4713 + }, + { + "epoch": 0.38093698862603287, + "grad_norm": 2.932091474533081, + "learning_rate": 9.339854891846964e-06, + "loss": 1.005, + "step": 4714 + }, + { + "epoch": 0.3810177983393604, + "grad_norm": 2.5078186988830566, + "learning_rate": 9.339529888677161e-06, + "loss": 1.0195, + "step": 4715 + }, + { + "epoch": 0.3810986080526879, + "grad_norm": 3.0139050483703613, + "learning_rate": 9.339204811181657e-06, + "loss": 0.8981, + "step": 4716 + }, + { + "epoch": 0.3811794177660155, + "grad_norm": 2.5075485706329346, + "learning_rate": 9.33887965936602e-06, + "loss": 0.9846, + "step": 4717 + }, + { + "epoch": 0.381260227479343, + "grad_norm": 2.7496562004089355, + "learning_rate": 9.33855443323582e-06, + "loss": 0.8334, + "step": 4718 + }, + { + "epoch": 0.38134103719267054, + "grad_norm": 2.6395156383514404, + "learning_rate": 9.338229132796629e-06, + "loss": 1.0034, + "step": 4719 + }, + { + "epoch": 0.3814218469059981, + "grad_norm": 2.8209757804870605, + "learning_rate": 9.337903758054016e-06, + "loss": 1.0862, + "step": 4720 + }, + { + "epoch": 0.38150265661932564, + "grad_norm": 2.652390718460083, + "learning_rate": 9.337578309013554e-06, + "loss": 0.8791, + "step": 4721 + }, + { + "epoch": 0.38158346633265317, + "grad_norm": 3.160367012023926, + "learning_rate": 9.337252785680818e-06, + "loss": 1.1129, + "step": 4722 + }, + { + "epoch": 0.38166427604598074, + "grad_norm": 2.3652641773223877, + "learning_rate": 9.336927188061385e-06, + "loss": 1.0013, + "step": 4723 + }, + { + "epoch": 0.38174508575930827, + "grad_norm": 2.477011203765869, + "learning_rate": 9.336601516160828e-06, + "loss": 1.0117, + "step": 4724 + }, + { + "epoch": 0.3818258954726358, + "grad_norm": 3.6358301639556885, + "learning_rate": 9.336275769984727e-06, + "loss": 1.0446, + "step": 4725 + }, + { + "epoch": 0.38190670518596337, + "grad_norm": 3.075953483581543, + "learning_rate": 9.335949949538663e-06, + "loss": 0.9108, + "step": 4726 + }, + { + "epoch": 0.3819875148992909, + "grad_norm": 2.7538113594055176, + "learning_rate": 9.335624054828212e-06, + "loss": 0.9995, + "step": 4727 + }, + { + "epoch": 0.3820683246126184, + "grad_norm": 2.8855652809143066, + "learning_rate": 9.335298085858959e-06, + "loss": 0.9995, + "step": 4728 + }, + { + "epoch": 0.382149134325946, + "grad_norm": 3.38128399848938, + "learning_rate": 9.334972042636489e-06, + "loss": 1.0016, + "step": 4729 + }, + { + "epoch": 0.3822299440392735, + "grad_norm": 2.4725615978240967, + "learning_rate": 9.334645925166382e-06, + "loss": 0.9524, + "step": 4730 + }, + { + "epoch": 0.38231075375260104, + "grad_norm": 2.950460433959961, + "learning_rate": 9.334319733454227e-06, + "loss": 0.8953, + "step": 4731 + }, + { + "epoch": 0.3823915634659286, + "grad_norm": 3.0027737617492676, + "learning_rate": 9.333993467505608e-06, + "loss": 0.9027, + "step": 4732 + }, + { + "epoch": 0.38247237317925614, + "grad_norm": 2.9997451305389404, + "learning_rate": 9.333667127326114e-06, + "loss": 1.1046, + "step": 4733 + }, + { + "epoch": 0.38255318289258367, + "grad_norm": 2.69627046585083, + "learning_rate": 9.333340712921337e-06, + "loss": 0.9887, + "step": 4734 + }, + { + "epoch": 0.38263399260591124, + "grad_norm": 2.633528232574463, + "learning_rate": 9.333014224296864e-06, + "loss": 0.9717, + "step": 4735 + }, + { + "epoch": 0.38271480231923877, + "grad_norm": 3.3414008617401123, + "learning_rate": 9.33268766145829e-06, + "loss": 0.9609, + "step": 4736 + }, + { + "epoch": 0.3827956120325663, + "grad_norm": 2.7071304321289062, + "learning_rate": 9.332361024411206e-06, + "loss": 0.9961, + "step": 4737 + }, + { + "epoch": 0.38287642174589387, + "grad_norm": 2.9471144676208496, + "learning_rate": 9.332034313161207e-06, + "loss": 0.9456, + "step": 4738 + }, + { + "epoch": 0.3829572314592214, + "grad_norm": 2.5740861892700195, + "learning_rate": 9.331707527713891e-06, + "loss": 0.9751, + "step": 4739 + }, + { + "epoch": 0.3830380411725489, + "grad_norm": 2.7647855281829834, + "learning_rate": 9.331380668074852e-06, + "loss": 0.9046, + "step": 4740 + }, + { + "epoch": 0.3831188508858765, + "grad_norm": 3.026322364807129, + "learning_rate": 9.331053734249688e-06, + "loss": 0.904, + "step": 4741 + }, + { + "epoch": 0.383199660599204, + "grad_norm": 3.0095396041870117, + "learning_rate": 9.330726726244002e-06, + "loss": 0.9377, + "step": 4742 + }, + { + "epoch": 0.38328047031253154, + "grad_norm": 2.852733850479126, + "learning_rate": 9.330399644063392e-06, + "loss": 0.9513, + "step": 4743 + }, + { + "epoch": 0.3833612800258591, + "grad_norm": 3.252817153930664, + "learning_rate": 9.330072487713462e-06, + "loss": 0.9426, + "step": 4744 + }, + { + "epoch": 0.38344208973918664, + "grad_norm": 2.7273945808410645, + "learning_rate": 9.329745257199816e-06, + "loss": 1.0306, + "step": 4745 + }, + { + "epoch": 0.38352289945251417, + "grad_norm": 2.3329997062683105, + "learning_rate": 9.329417952528055e-06, + "loss": 1.0341, + "step": 4746 + }, + { + "epoch": 0.38360370916584174, + "grad_norm": 2.8117733001708984, + "learning_rate": 9.329090573703787e-06, + "loss": 0.8849, + "step": 4747 + }, + { + "epoch": 0.38368451887916927, + "grad_norm": 2.821118116378784, + "learning_rate": 9.32876312073262e-06, + "loss": 0.9057, + "step": 4748 + }, + { + "epoch": 0.3837653285924968, + "grad_norm": 2.883186101913452, + "learning_rate": 9.328435593620162e-06, + "loss": 1.0717, + "step": 4749 + }, + { + "epoch": 0.38384613830582437, + "grad_norm": 2.41925048828125, + "learning_rate": 9.328107992372023e-06, + "loss": 1.0482, + "step": 4750 + }, + { + "epoch": 0.3839269480191519, + "grad_norm": 2.7774710655212402, + "learning_rate": 9.327780316993811e-06, + "loss": 0.9736, + "step": 4751 + }, + { + "epoch": 0.38400775773247947, + "grad_norm": 3.132113218307495, + "learning_rate": 9.327452567491143e-06, + "loss": 0.9372, + "step": 4752 + }, + { + "epoch": 0.384088567445807, + "grad_norm": 2.61859130859375, + "learning_rate": 9.327124743869631e-06, + "loss": 1.05, + "step": 4753 + }, + { + "epoch": 0.3841693771591345, + "grad_norm": 3.047009229660034, + "learning_rate": 9.326796846134888e-06, + "loss": 0.9907, + "step": 4754 + }, + { + "epoch": 0.3842501868724621, + "grad_norm": 2.372396945953369, + "learning_rate": 9.326468874292531e-06, + "loss": 1.0379, + "step": 4755 + }, + { + "epoch": 0.3843309965857896, + "grad_norm": 2.8072192668914795, + "learning_rate": 9.32614082834818e-06, + "loss": 0.9341, + "step": 4756 + }, + { + "epoch": 0.38441180629911714, + "grad_norm": 2.7737817764282227, + "learning_rate": 9.325812708307449e-06, + "loss": 0.9666, + "step": 4757 + }, + { + "epoch": 0.3844926160124447, + "grad_norm": 2.4324991703033447, + "learning_rate": 9.32548451417596e-06, + "loss": 0.9549, + "step": 4758 + }, + { + "epoch": 0.38457342572577224, + "grad_norm": 2.6352956295013428, + "learning_rate": 9.325156245959336e-06, + "loss": 1.0462, + "step": 4759 + }, + { + "epoch": 0.38465423543909977, + "grad_norm": 2.7103171348571777, + "learning_rate": 9.324827903663198e-06, + "loss": 1.0216, + "step": 4760 + }, + { + "epoch": 0.38473504515242735, + "grad_norm": 2.5112156867980957, + "learning_rate": 9.32449948729317e-06, + "loss": 1.0214, + "step": 4761 + }, + { + "epoch": 0.38481585486575487, + "grad_norm": 2.653942823410034, + "learning_rate": 9.324170996854875e-06, + "loss": 0.9748, + "step": 4762 + }, + { + "epoch": 0.3848966645790824, + "grad_norm": 2.8191370964050293, + "learning_rate": 9.323842432353943e-06, + "loss": 0.9905, + "step": 4763 + }, + { + "epoch": 0.38497747429240997, + "grad_norm": 2.8964755535125732, + "learning_rate": 9.323513793795997e-06, + "loss": 1.0662, + "step": 4764 + }, + { + "epoch": 0.3850582840057375, + "grad_norm": 2.9598093032836914, + "learning_rate": 9.32318508118667e-06, + "loss": 1.0305, + "step": 4765 + }, + { + "epoch": 0.385139093719065, + "grad_norm": 2.8310484886169434, + "learning_rate": 9.322856294531589e-06, + "loss": 0.9283, + "step": 4766 + }, + { + "epoch": 0.3852199034323926, + "grad_norm": 3.1651182174682617, + "learning_rate": 9.322527433836386e-06, + "loss": 0.975, + "step": 4767 + }, + { + "epoch": 0.3853007131457201, + "grad_norm": 2.526073932647705, + "learning_rate": 9.322198499106693e-06, + "loss": 0.8425, + "step": 4768 + }, + { + "epoch": 0.38538152285904764, + "grad_norm": 3.729663848876953, + "learning_rate": 9.321869490348147e-06, + "loss": 0.8551, + "step": 4769 + }, + { + "epoch": 0.3854623325723752, + "grad_norm": 2.6058268547058105, + "learning_rate": 9.321540407566382e-06, + "loss": 1.0275, + "step": 4770 + }, + { + "epoch": 0.38554314228570274, + "grad_norm": 2.9478402137756348, + "learning_rate": 9.321211250767033e-06, + "loss": 1.0433, + "step": 4771 + }, + { + "epoch": 0.38562395199903027, + "grad_norm": 2.6278555393218994, + "learning_rate": 9.320882019955737e-06, + "loss": 1.0271, + "step": 4772 + }, + { + "epoch": 0.38570476171235785, + "grad_norm": 2.607504367828369, + "learning_rate": 9.320552715138136e-06, + "loss": 1.0778, + "step": 4773 + }, + { + "epoch": 0.38578557142568537, + "grad_norm": 2.555558919906616, + "learning_rate": 9.320223336319865e-06, + "loss": 0.9599, + "step": 4774 + }, + { + "epoch": 0.3858663811390129, + "grad_norm": 3.114999294281006, + "learning_rate": 9.319893883506572e-06, + "loss": 0.8927, + "step": 4775 + }, + { + "epoch": 0.38594719085234047, + "grad_norm": 2.8525390625, + "learning_rate": 9.319564356703895e-06, + "loss": 1.0061, + "step": 4776 + }, + { + "epoch": 0.386028000565668, + "grad_norm": 2.945138454437256, + "learning_rate": 9.31923475591748e-06, + "loss": 1.0231, + "step": 4777 + }, + { + "epoch": 0.3861088102789955, + "grad_norm": 2.8288087844848633, + "learning_rate": 9.318905081152972e-06, + "loss": 1.0277, + "step": 4778 + }, + { + "epoch": 0.3861896199923231, + "grad_norm": 3.0563604831695557, + "learning_rate": 9.318575332416016e-06, + "loss": 0.921, + "step": 4779 + }, + { + "epoch": 0.3862704297056506, + "grad_norm": 2.299703598022461, + "learning_rate": 9.318245509712262e-06, + "loss": 1.0233, + "step": 4780 + }, + { + "epoch": 0.38635123941897814, + "grad_norm": 2.6213436126708984, + "learning_rate": 9.317915613047358e-06, + "loss": 1.0653, + "step": 4781 + }, + { + "epoch": 0.3864320491323057, + "grad_norm": 2.7528960704803467, + "learning_rate": 9.317585642426954e-06, + "loss": 0.961, + "step": 4782 + }, + { + "epoch": 0.38651285884563324, + "grad_norm": 2.405949354171753, + "learning_rate": 9.317255597856703e-06, + "loss": 1.0507, + "step": 4783 + }, + { + "epoch": 0.38659366855896077, + "grad_norm": 2.5849359035491943, + "learning_rate": 9.316925479342258e-06, + "loss": 0.9822, + "step": 4784 + }, + { + "epoch": 0.38667447827228835, + "grad_norm": 3.079529285430908, + "learning_rate": 9.316595286889271e-06, + "loss": 0.8935, + "step": 4785 + }, + { + "epoch": 0.38675528798561587, + "grad_norm": 3.638134717941284, + "learning_rate": 9.316265020503398e-06, + "loss": 0.9915, + "step": 4786 + }, + { + "epoch": 0.3868360976989434, + "grad_norm": 2.217405080795288, + "learning_rate": 9.315934680190296e-06, + "loss": 0.9402, + "step": 4787 + }, + { + "epoch": 0.38691690741227097, + "grad_norm": 2.814835548400879, + "learning_rate": 9.315604265955625e-06, + "loss": 0.9646, + "step": 4788 + }, + { + "epoch": 0.3869977171255985, + "grad_norm": 2.7332446575164795, + "learning_rate": 9.315273777805041e-06, + "loss": 1.0287, + "step": 4789 + }, + { + "epoch": 0.387078526838926, + "grad_norm": 2.6318838596343994, + "learning_rate": 9.314943215744205e-06, + "loss": 0.9873, + "step": 4790 + }, + { + "epoch": 0.3871593365522536, + "grad_norm": 2.76735782623291, + "learning_rate": 9.31461257977878e-06, + "loss": 0.8939, + "step": 4791 + }, + { + "epoch": 0.3872401462655811, + "grad_norm": 2.2776925563812256, + "learning_rate": 9.314281869914429e-06, + "loss": 1.1241, + "step": 4792 + }, + { + "epoch": 0.38732095597890864, + "grad_norm": 2.7892775535583496, + "learning_rate": 9.313951086156815e-06, + "loss": 1.0662, + "step": 4793 + }, + { + "epoch": 0.3874017656922362, + "grad_norm": 3.429621696472168, + "learning_rate": 9.313620228511605e-06, + "loss": 0.9103, + "step": 4794 + }, + { + "epoch": 0.38748257540556375, + "grad_norm": 2.6924426555633545, + "learning_rate": 9.313289296984465e-06, + "loss": 1.0738, + "step": 4795 + }, + { + "epoch": 0.38756338511889127, + "grad_norm": 2.7512428760528564, + "learning_rate": 9.312958291581064e-06, + "loss": 1.023, + "step": 4796 + }, + { + "epoch": 0.38764419483221885, + "grad_norm": 2.5741891860961914, + "learning_rate": 9.312627212307069e-06, + "loss": 0.8525, + "step": 4797 + }, + { + "epoch": 0.38772500454554637, + "grad_norm": 2.840163469314575, + "learning_rate": 9.312296059168153e-06, + "loss": 1.0139, + "step": 4798 + }, + { + "epoch": 0.3878058142588739, + "grad_norm": 2.5960631370544434, + "learning_rate": 9.311964832169987e-06, + "loss": 1.0073, + "step": 4799 + }, + { + "epoch": 0.38788662397220147, + "grad_norm": 2.8192176818847656, + "learning_rate": 9.311633531318243e-06, + "loss": 1.0632, + "step": 4800 + }, + { + "epoch": 0.387967433685529, + "grad_norm": 3.081470251083374, + "learning_rate": 9.311302156618597e-06, + "loss": 0.8643, + "step": 4801 + }, + { + "epoch": 0.3880482433988565, + "grad_norm": 2.318772077560425, + "learning_rate": 9.310970708076724e-06, + "loss": 0.9243, + "step": 4802 + }, + { + "epoch": 0.3881290531121841, + "grad_norm": 2.716639757156372, + "learning_rate": 9.310639185698301e-06, + "loss": 1.0154, + "step": 4803 + }, + { + "epoch": 0.3882098628255116, + "grad_norm": 2.643319606781006, + "learning_rate": 9.310307589489007e-06, + "loss": 1.0741, + "step": 4804 + }, + { + "epoch": 0.38829067253883914, + "grad_norm": 2.5733048915863037, + "learning_rate": 9.309975919454519e-06, + "loss": 1.0569, + "step": 4805 + }, + { + "epoch": 0.3883714822521667, + "grad_norm": 2.561190128326416, + "learning_rate": 9.309644175600521e-06, + "loss": 1.0505, + "step": 4806 + }, + { + "epoch": 0.38845229196549425, + "grad_norm": 3.104463577270508, + "learning_rate": 9.309312357932693e-06, + "loss": 0.9595, + "step": 4807 + }, + { + "epoch": 0.38853310167882177, + "grad_norm": 2.5482091903686523, + "learning_rate": 9.308980466456718e-06, + "loss": 0.9709, + "step": 4808 + }, + { + "epoch": 0.38861391139214935, + "grad_norm": 2.562335252761841, + "learning_rate": 9.30864850117828e-06, + "loss": 0.91, + "step": 4809 + }, + { + "epoch": 0.38869472110547687, + "grad_norm": 2.939732074737549, + "learning_rate": 9.308316462103069e-06, + "loss": 0.9, + "step": 4810 + }, + { + "epoch": 0.3887755308188044, + "grad_norm": 2.47993803024292, + "learning_rate": 9.307984349236767e-06, + "loss": 1.0863, + "step": 4811 + }, + { + "epoch": 0.38885634053213197, + "grad_norm": 2.6861395835876465, + "learning_rate": 9.307652162585063e-06, + "loss": 0.9657, + "step": 4812 + }, + { + "epoch": 0.3889371502454595, + "grad_norm": 2.1660261154174805, + "learning_rate": 9.30731990215365e-06, + "loss": 0.9754, + "step": 4813 + }, + { + "epoch": 0.389017959958787, + "grad_norm": 2.4602410793304443, + "learning_rate": 9.306987567948216e-06, + "loss": 0.9003, + "step": 4814 + }, + { + "epoch": 0.3890987696721146, + "grad_norm": 2.805319309234619, + "learning_rate": 9.306655159974451e-06, + "loss": 0.9549, + "step": 4815 + }, + { + "epoch": 0.3891795793854421, + "grad_norm": 3.027040719985962, + "learning_rate": 9.306322678238054e-06, + "loss": 0.9587, + "step": 4816 + }, + { + "epoch": 0.3892603890987697, + "grad_norm": 2.4003875255584717, + "learning_rate": 9.305990122744716e-06, + "loss": 0.8755, + "step": 4817 + }, + { + "epoch": 0.3893411988120972, + "grad_norm": 2.4520020484924316, + "learning_rate": 9.305657493500134e-06, + "loss": 0.9611, + "step": 4818 + }, + { + "epoch": 0.38942200852542475, + "grad_norm": 2.8413140773773193, + "learning_rate": 9.305324790510001e-06, + "loss": 0.9743, + "step": 4819 + }, + { + "epoch": 0.3895028182387523, + "grad_norm": 2.7833945751190186, + "learning_rate": 9.304992013780023e-06, + "loss": 1.025, + "step": 4820 + }, + { + "epoch": 0.38958362795207985, + "grad_norm": 2.8623552322387695, + "learning_rate": 9.304659163315894e-06, + "loss": 1.0315, + "step": 4821 + }, + { + "epoch": 0.38966443766540737, + "grad_norm": 3.116002321243286, + "learning_rate": 9.304326239123316e-06, + "loss": 0.9782, + "step": 4822 + }, + { + "epoch": 0.38974524737873495, + "grad_norm": 2.8525900840759277, + "learning_rate": 9.303993241207994e-06, + "loss": 0.8915, + "step": 4823 + }, + { + "epoch": 0.3898260570920625, + "grad_norm": 2.838837146759033, + "learning_rate": 9.303660169575626e-06, + "loss": 1.0472, + "step": 4824 + }, + { + "epoch": 0.38990686680539, + "grad_norm": 3.3665411472320557, + "learning_rate": 9.303327024231924e-06, + "loss": 1.0027, + "step": 4825 + }, + { + "epoch": 0.3899876765187176, + "grad_norm": 2.8950564861297607, + "learning_rate": 9.302993805182586e-06, + "loss": 0.8665, + "step": 4826 + }, + { + "epoch": 0.3900684862320451, + "grad_norm": 2.328845500946045, + "learning_rate": 9.302660512433324e-06, + "loss": 1.0294, + "step": 4827 + }, + { + "epoch": 0.3901492959453726, + "grad_norm": 2.6346685886383057, + "learning_rate": 9.302327145989846e-06, + "loss": 0.9807, + "step": 4828 + }, + { + "epoch": 0.3902301056587002, + "grad_norm": 2.6841721534729004, + "learning_rate": 9.301993705857864e-06, + "loss": 0.994, + "step": 4829 + }, + { + "epoch": 0.3903109153720277, + "grad_norm": 2.8069612979888916, + "learning_rate": 9.301660192043082e-06, + "loss": 0.9295, + "step": 4830 + }, + { + "epoch": 0.39039172508535525, + "grad_norm": 3.122180223464966, + "learning_rate": 9.301326604551219e-06, + "loss": 0.994, + "step": 4831 + }, + { + "epoch": 0.3904725347986828, + "grad_norm": 2.763744354248047, + "learning_rate": 9.300992943387988e-06, + "loss": 0.8689, + "step": 4832 + }, + { + "epoch": 0.39055334451201035, + "grad_norm": 2.752152442932129, + "learning_rate": 9.3006592085591e-06, + "loss": 1.0883, + "step": 4833 + }, + { + "epoch": 0.39063415422533787, + "grad_norm": 3.1523523330688477, + "learning_rate": 9.300325400070274e-06, + "loss": 1.0242, + "step": 4834 + }, + { + "epoch": 0.39071496393866545, + "grad_norm": 2.9127399921417236, + "learning_rate": 9.299991517927224e-06, + "loss": 1.2013, + "step": 4835 + }, + { + "epoch": 0.390795773651993, + "grad_norm": 2.768834352493286, + "learning_rate": 9.299657562135676e-06, + "loss": 1.1064, + "step": 4836 + }, + { + "epoch": 0.3908765833653205, + "grad_norm": 2.5925710201263428, + "learning_rate": 9.29932353270134e-06, + "loss": 1.0339, + "step": 4837 + }, + { + "epoch": 0.3909573930786481, + "grad_norm": 3.155369281768799, + "learning_rate": 9.298989429629946e-06, + "loss": 0.9888, + "step": 4838 + }, + { + "epoch": 0.3910382027919756, + "grad_norm": 2.962634801864624, + "learning_rate": 9.298655252927211e-06, + "loss": 1.1091, + "step": 4839 + }, + { + "epoch": 0.3911190125053031, + "grad_norm": 2.7181291580200195, + "learning_rate": 9.298321002598858e-06, + "loss": 0.9446, + "step": 4840 + }, + { + "epoch": 0.3911998222186307, + "grad_norm": 2.699556350708008, + "learning_rate": 9.297986678650617e-06, + "loss": 1.0881, + "step": 4841 + }, + { + "epoch": 0.3912806319319582, + "grad_norm": 2.432003974914551, + "learning_rate": 9.29765228108821e-06, + "loss": 0.9296, + "step": 4842 + }, + { + "epoch": 0.39136144164528575, + "grad_norm": 2.3125052452087402, + "learning_rate": 9.297317809917366e-06, + "loss": 0.9134, + "step": 4843 + }, + { + "epoch": 0.3914422513586133, + "grad_norm": 2.6786537170410156, + "learning_rate": 9.296983265143812e-06, + "loss": 0.905, + "step": 4844 + }, + { + "epoch": 0.39152306107194085, + "grad_norm": 2.5250349044799805, + "learning_rate": 9.296648646773279e-06, + "loss": 0.9933, + "step": 4845 + }, + { + "epoch": 0.39160387078526837, + "grad_norm": 3.2397947311401367, + "learning_rate": 9.2963139548115e-06, + "loss": 0.9712, + "step": 4846 + }, + { + "epoch": 0.39168468049859595, + "grad_norm": 2.5650792121887207, + "learning_rate": 9.295979189264206e-06, + "loss": 1.1292, + "step": 4847 + }, + { + "epoch": 0.3917654902119235, + "grad_norm": 3.4991955757141113, + "learning_rate": 9.29564435013713e-06, + "loss": 0.9921, + "step": 4848 + }, + { + "epoch": 0.391846299925251, + "grad_norm": 2.7665810585021973, + "learning_rate": 9.295309437436007e-06, + "loss": 1.0006, + "step": 4849 + }, + { + "epoch": 0.3919271096385786, + "grad_norm": 3.329380512237549, + "learning_rate": 9.294974451166576e-06, + "loss": 0.973, + "step": 4850 + }, + { + "epoch": 0.3920079193519061, + "grad_norm": 2.8061275482177734, + "learning_rate": 9.29463939133457e-06, + "loss": 0.9337, + "step": 4851 + }, + { + "epoch": 0.3920887290652336, + "grad_norm": 2.5380403995513916, + "learning_rate": 9.294304257945732e-06, + "loss": 0.8911, + "step": 4852 + }, + { + "epoch": 0.3921695387785612, + "grad_norm": 2.8728508949279785, + "learning_rate": 9.293969051005798e-06, + "loss": 0.9115, + "step": 4853 + }, + { + "epoch": 0.3922503484918887, + "grad_norm": 3.0287833213806152, + "learning_rate": 9.293633770520514e-06, + "loss": 0.8794, + "step": 4854 + }, + { + "epoch": 0.39233115820521625, + "grad_norm": 3.8823435306549072, + "learning_rate": 9.293298416495618e-06, + "loss": 0.9241, + "step": 4855 + }, + { + "epoch": 0.3924119679185438, + "grad_norm": 2.7386648654937744, + "learning_rate": 9.292962988936856e-06, + "loss": 0.9388, + "step": 4856 + }, + { + "epoch": 0.39249277763187135, + "grad_norm": 2.5009844303131104, + "learning_rate": 9.292627487849975e-06, + "loss": 0.994, + "step": 4857 + }, + { + "epoch": 0.39257358734519887, + "grad_norm": 2.9496538639068604, + "learning_rate": 9.292291913240716e-06, + "loss": 0.9525, + "step": 4858 + }, + { + "epoch": 0.39265439705852645, + "grad_norm": 2.7968921661376953, + "learning_rate": 9.291956265114832e-06, + "loss": 0.918, + "step": 4859 + }, + { + "epoch": 0.392735206771854, + "grad_norm": 3.234957695007324, + "learning_rate": 9.29162054347807e-06, + "loss": 0.8922, + "step": 4860 + }, + { + "epoch": 0.3928160164851815, + "grad_norm": 2.7150919437408447, + "learning_rate": 9.291284748336179e-06, + "loss": 0.8799, + "step": 4861 + }, + { + "epoch": 0.3928968261985091, + "grad_norm": 2.8099822998046875, + "learning_rate": 9.29094887969491e-06, + "loss": 0.9143, + "step": 4862 + }, + { + "epoch": 0.3929776359118366, + "grad_norm": 2.824078321456909, + "learning_rate": 9.290612937560017e-06, + "loss": 0.9023, + "step": 4863 + }, + { + "epoch": 0.3930584456251641, + "grad_norm": 2.730456590652466, + "learning_rate": 9.290276921937256e-06, + "loss": 1.0412, + "step": 4864 + }, + { + "epoch": 0.3931392553384917, + "grad_norm": 2.659019947052002, + "learning_rate": 9.289940832832377e-06, + "loss": 1.0026, + "step": 4865 + }, + { + "epoch": 0.3932200650518192, + "grad_norm": 3.3061633110046387, + "learning_rate": 9.28960467025114e-06, + "loss": 0.9082, + "step": 4866 + }, + { + "epoch": 0.39330087476514675, + "grad_norm": 3.18700909614563, + "learning_rate": 9.289268434199302e-06, + "loss": 0.9735, + "step": 4867 + }, + { + "epoch": 0.3933816844784743, + "grad_norm": 2.674553632736206, + "learning_rate": 9.28893212468262e-06, + "loss": 1.0538, + "step": 4868 + }, + { + "epoch": 0.39346249419180185, + "grad_norm": 2.7704389095306396, + "learning_rate": 9.28859574170686e-06, + "loss": 0.9991, + "step": 4869 + }, + { + "epoch": 0.39354330390512937, + "grad_norm": 2.8787927627563477, + "learning_rate": 9.288259285277776e-06, + "loss": 0.9363, + "step": 4870 + }, + { + "epoch": 0.39362411361845695, + "grad_norm": 2.7347707748413086, + "learning_rate": 9.287922755401135e-06, + "loss": 0.8876, + "step": 4871 + }, + { + "epoch": 0.3937049233317845, + "grad_norm": 2.5701942443847656, + "learning_rate": 9.2875861520827e-06, + "loss": 0.9612, + "step": 4872 + }, + { + "epoch": 0.393785733045112, + "grad_norm": 2.8623557090759277, + "learning_rate": 9.287249475328236e-06, + "loss": 1.0507, + "step": 4873 + }, + { + "epoch": 0.3938665427584396, + "grad_norm": 2.523864269256592, + "learning_rate": 9.28691272514351e-06, + "loss": 1.0098, + "step": 4874 + }, + { + "epoch": 0.3939473524717671, + "grad_norm": 2.9024195671081543, + "learning_rate": 9.28657590153429e-06, + "loss": 1.0518, + "step": 4875 + }, + { + "epoch": 0.3940281621850946, + "grad_norm": 2.553307056427002, + "learning_rate": 9.286239004506342e-06, + "loss": 0.8807, + "step": 4876 + }, + { + "epoch": 0.3941089718984222, + "grad_norm": 2.453803777694702, + "learning_rate": 9.28590203406544e-06, + "loss": 0.9068, + "step": 4877 + }, + { + "epoch": 0.3941897816117497, + "grad_norm": 2.7090606689453125, + "learning_rate": 9.285564990217355e-06, + "loss": 0.9219, + "step": 4878 + }, + { + "epoch": 0.3942705913250773, + "grad_norm": 2.4958701133728027, + "learning_rate": 9.285227872967857e-06, + "loss": 1.0385, + "step": 4879 + }, + { + "epoch": 0.3943514010384048, + "grad_norm": 2.777512550354004, + "learning_rate": 9.284890682322723e-06, + "loss": 0.9917, + "step": 4880 + }, + { + "epoch": 0.39443221075173235, + "grad_norm": 2.6221389770507812, + "learning_rate": 9.284553418287725e-06, + "loss": 1.0376, + "step": 4881 + }, + { + "epoch": 0.3945130204650599, + "grad_norm": 2.436344861984253, + "learning_rate": 9.284216080868645e-06, + "loss": 1.0367, + "step": 4882 + }, + { + "epoch": 0.39459383017838745, + "grad_norm": 2.7965519428253174, + "learning_rate": 9.283878670071255e-06, + "loss": 0.8879, + "step": 4883 + }, + { + "epoch": 0.394674639891715, + "grad_norm": 2.6184844970703125, + "learning_rate": 9.283541185901337e-06, + "loss": 0.9681, + "step": 4884 + }, + { + "epoch": 0.39475544960504255, + "grad_norm": 2.4829251766204834, + "learning_rate": 9.28320362836467e-06, + "loss": 0.9471, + "step": 4885 + }, + { + "epoch": 0.3948362593183701, + "grad_norm": 2.3925609588623047, + "learning_rate": 9.282865997467037e-06, + "loss": 0.9715, + "step": 4886 + }, + { + "epoch": 0.3949170690316976, + "grad_norm": 2.9331045150756836, + "learning_rate": 9.282528293214219e-06, + "loss": 1.0717, + "step": 4887 + }, + { + "epoch": 0.3949978787450252, + "grad_norm": 2.435237169265747, + "learning_rate": 9.282190515612003e-06, + "loss": 0.9683, + "step": 4888 + }, + { + "epoch": 0.3950786884583527, + "grad_norm": 2.5075645446777344, + "learning_rate": 9.281852664666171e-06, + "loss": 0.9484, + "step": 4889 + }, + { + "epoch": 0.3951594981716802, + "grad_norm": 2.5758745670318604, + "learning_rate": 9.281514740382511e-06, + "loss": 0.9597, + "step": 4890 + }, + { + "epoch": 0.3952403078850078, + "grad_norm": 2.3525283336639404, + "learning_rate": 9.281176742766811e-06, + "loss": 1.0151, + "step": 4891 + }, + { + "epoch": 0.3953211175983353, + "grad_norm": 2.4200117588043213, + "learning_rate": 9.280838671824861e-06, + "loss": 0.9881, + "step": 4892 + }, + { + "epoch": 0.39540192731166285, + "grad_norm": 2.7584762573242188, + "learning_rate": 9.280500527562449e-06, + "loss": 0.8927, + "step": 4893 + }, + { + "epoch": 0.3954827370249904, + "grad_norm": 2.8647139072418213, + "learning_rate": 9.280162309985369e-06, + "loss": 1.0694, + "step": 4894 + }, + { + "epoch": 0.39556354673831795, + "grad_norm": 3.1318676471710205, + "learning_rate": 9.279824019099412e-06, + "loss": 1.0155, + "step": 4895 + }, + { + "epoch": 0.3956443564516455, + "grad_norm": 2.8281431198120117, + "learning_rate": 9.279485654910371e-06, + "loss": 0.9599, + "step": 4896 + }, + { + "epoch": 0.39572516616497305, + "grad_norm": 2.5201003551483154, + "learning_rate": 9.279147217424046e-06, + "loss": 1.0393, + "step": 4897 + }, + { + "epoch": 0.3958059758783006, + "grad_norm": 2.6277709007263184, + "learning_rate": 9.27880870664623e-06, + "loss": 1.1643, + "step": 4898 + }, + { + "epoch": 0.3958867855916281, + "grad_norm": 2.8465261459350586, + "learning_rate": 9.27847012258272e-06, + "loss": 0.9927, + "step": 4899 + }, + { + "epoch": 0.3959675953049557, + "grad_norm": 2.7331273555755615, + "learning_rate": 9.27813146523932e-06, + "loss": 0.9616, + "step": 4900 + }, + { + "epoch": 0.3960484050182832, + "grad_norm": 3.133472442626953, + "learning_rate": 9.277792734621825e-06, + "loss": 1.0276, + "step": 4901 + }, + { + "epoch": 0.3961292147316107, + "grad_norm": 2.812513828277588, + "learning_rate": 9.277453930736039e-06, + "loss": 0.923, + "step": 4902 + }, + { + "epoch": 0.3962100244449383, + "grad_norm": 3.0874874591827393, + "learning_rate": 9.277115053587764e-06, + "loss": 0.9529, + "step": 4903 + }, + { + "epoch": 0.3962908341582658, + "grad_norm": 2.806426525115967, + "learning_rate": 9.276776103182806e-06, + "loss": 0.9119, + "step": 4904 + }, + { + "epoch": 0.39637164387159335, + "grad_norm": 2.516444206237793, + "learning_rate": 9.276437079526969e-06, + "loss": 0.8762, + "step": 4905 + }, + { + "epoch": 0.3964524535849209, + "grad_norm": 3.095404863357544, + "learning_rate": 9.27609798262606e-06, + "loss": 0.9768, + "step": 4906 + }, + { + "epoch": 0.39653326329824845, + "grad_norm": 2.437582015991211, + "learning_rate": 9.275758812485887e-06, + "loss": 0.9226, + "step": 4907 + }, + { + "epoch": 0.396614073011576, + "grad_norm": 2.405050754547119, + "learning_rate": 9.275419569112258e-06, + "loss": 1.0769, + "step": 4908 + }, + { + "epoch": 0.39669488272490355, + "grad_norm": 2.7021193504333496, + "learning_rate": 9.275080252510986e-06, + "loss": 0.9472, + "step": 4909 + }, + { + "epoch": 0.3967756924382311, + "grad_norm": 3.1435816287994385, + "learning_rate": 9.27474086268788e-06, + "loss": 0.9132, + "step": 4910 + }, + { + "epoch": 0.3968565021515586, + "grad_norm": 3.182762384414673, + "learning_rate": 9.274401399648755e-06, + "loss": 0.902, + "step": 4911 + }, + { + "epoch": 0.3969373118648862, + "grad_norm": 2.7405760288238525, + "learning_rate": 9.274061863399424e-06, + "loss": 0.924, + "step": 4912 + }, + { + "epoch": 0.3970181215782137, + "grad_norm": 2.6312766075134277, + "learning_rate": 9.273722253945701e-06, + "loss": 1.1052, + "step": 4913 + }, + { + "epoch": 0.3970989312915412, + "grad_norm": 2.851205825805664, + "learning_rate": 9.273382571293407e-06, + "loss": 0.9171, + "step": 4914 + }, + { + "epoch": 0.3971797410048688, + "grad_norm": 2.6783201694488525, + "learning_rate": 9.273042815448357e-06, + "loss": 1.0252, + "step": 4915 + }, + { + "epoch": 0.3972605507181963, + "grad_norm": 2.60844087600708, + "learning_rate": 9.272702986416368e-06, + "loss": 0.9343, + "step": 4916 + }, + { + "epoch": 0.39734136043152385, + "grad_norm": 2.7676470279693604, + "learning_rate": 9.272363084203264e-06, + "loss": 0.9558, + "step": 4917 + }, + { + "epoch": 0.3974221701448514, + "grad_norm": 3.101247549057007, + "learning_rate": 9.272023108814867e-06, + "loss": 0.8752, + "step": 4918 + }, + { + "epoch": 0.39750297985817895, + "grad_norm": 3.0404317378997803, + "learning_rate": 9.271683060256997e-06, + "loss": 1.0014, + "step": 4919 + }, + { + "epoch": 0.3975837895715065, + "grad_norm": 2.6339497566223145, + "learning_rate": 9.271342938535481e-06, + "loss": 0.983, + "step": 4920 + }, + { + "epoch": 0.39766459928483405, + "grad_norm": 2.664785623550415, + "learning_rate": 9.271002743656142e-06, + "loss": 0.993, + "step": 4921 + }, + { + "epoch": 0.3977454089981616, + "grad_norm": 3.0166149139404297, + "learning_rate": 9.270662475624809e-06, + "loss": 0.874, + "step": 4922 + }, + { + "epoch": 0.3978262187114891, + "grad_norm": 2.7920243740081787, + "learning_rate": 9.270322134447309e-06, + "loss": 1.0232, + "step": 4923 + }, + { + "epoch": 0.3979070284248167, + "grad_norm": 2.591841220855713, + "learning_rate": 9.26998172012947e-06, + "loss": 0.9601, + "step": 4924 + }, + { + "epoch": 0.3979878381381442, + "grad_norm": 3.2867536544799805, + "learning_rate": 9.269641232677126e-06, + "loss": 1.0158, + "step": 4925 + }, + { + "epoch": 0.3980686478514717, + "grad_norm": 2.7288825511932373, + "learning_rate": 9.269300672096105e-06, + "loss": 0.9982, + "step": 4926 + }, + { + "epoch": 0.3981494575647993, + "grad_norm": 2.49446702003479, + "learning_rate": 9.268960038392242e-06, + "loss": 0.9894, + "step": 4927 + }, + { + "epoch": 0.3982302672781268, + "grad_norm": 2.6400210857391357, + "learning_rate": 9.268619331571369e-06, + "loss": 0.9767, + "step": 4928 + }, + { + "epoch": 0.39831107699145435, + "grad_norm": 2.6287052631378174, + "learning_rate": 9.268278551639325e-06, + "loss": 0.9216, + "step": 4929 + }, + { + "epoch": 0.39839188670478193, + "grad_norm": 2.7632782459259033, + "learning_rate": 9.267937698601946e-06, + "loss": 0.9815, + "step": 4930 + }, + { + "epoch": 0.39847269641810945, + "grad_norm": 2.5979580879211426, + "learning_rate": 9.267596772465066e-06, + "loss": 0.9465, + "step": 4931 + }, + { + "epoch": 0.398553506131437, + "grad_norm": 2.6885600090026855, + "learning_rate": 9.267255773234526e-06, + "loss": 0.9149, + "step": 4932 + }, + { + "epoch": 0.39863431584476455, + "grad_norm": 2.767235517501831, + "learning_rate": 9.266914700916172e-06, + "loss": 0.9113, + "step": 4933 + }, + { + "epoch": 0.3987151255580921, + "grad_norm": 2.7402396202087402, + "learning_rate": 9.266573555515838e-06, + "loss": 0.9815, + "step": 4934 + }, + { + "epoch": 0.3987959352714196, + "grad_norm": 2.782092332839966, + "learning_rate": 9.266232337039372e-06, + "loss": 0.8879, + "step": 4935 + }, + { + "epoch": 0.3988767449847472, + "grad_norm": 2.624999523162842, + "learning_rate": 9.265891045492616e-06, + "loss": 0.9257, + "step": 4936 + }, + { + "epoch": 0.3989575546980747, + "grad_norm": 2.7730746269226074, + "learning_rate": 9.265549680881416e-06, + "loss": 1.0308, + "step": 4937 + }, + { + "epoch": 0.3990383644114022, + "grad_norm": 2.442979335784912, + "learning_rate": 9.26520824321162e-06, + "loss": 0.9566, + "step": 4938 + }, + { + "epoch": 0.3991191741247298, + "grad_norm": 3.1745896339416504, + "learning_rate": 9.264866732489073e-06, + "loss": 1.0078, + "step": 4939 + }, + { + "epoch": 0.3991999838380573, + "grad_norm": 2.7653796672821045, + "learning_rate": 9.264525148719628e-06, + "loss": 0.8728, + "step": 4940 + }, + { + "epoch": 0.39928079355138485, + "grad_norm": 2.759915828704834, + "learning_rate": 9.264183491909133e-06, + "loss": 1.0393, + "step": 4941 + }, + { + "epoch": 0.39936160326471243, + "grad_norm": 2.7928683757781982, + "learning_rate": 9.263841762063438e-06, + "loss": 0.9586, + "step": 4942 + }, + { + "epoch": 0.39944241297803995, + "grad_norm": 2.7925634384155273, + "learning_rate": 9.263499959188403e-06, + "loss": 1.0601, + "step": 4943 + }, + { + "epoch": 0.39952322269136753, + "grad_norm": 2.689061164855957, + "learning_rate": 9.263158083289874e-06, + "loss": 1.0542, + "step": 4944 + }, + { + "epoch": 0.39960403240469505, + "grad_norm": 2.5203075408935547, + "learning_rate": 9.262816134373711e-06, + "loss": 0.8948, + "step": 4945 + }, + { + "epoch": 0.3996848421180226, + "grad_norm": 3.390443801879883, + "learning_rate": 9.26247411244577e-06, + "loss": 0.9546, + "step": 4946 + }, + { + "epoch": 0.39976565183135016, + "grad_norm": 2.86378812789917, + "learning_rate": 9.26213201751191e-06, + "loss": 1.1114, + "step": 4947 + }, + { + "epoch": 0.3998464615446777, + "grad_norm": 3.285104274749756, + "learning_rate": 9.261789849577988e-06, + "loss": 0.968, + "step": 4948 + }, + { + "epoch": 0.3999272712580052, + "grad_norm": 2.772576332092285, + "learning_rate": 9.261447608649866e-06, + "loss": 1.0612, + "step": 4949 + }, + { + "epoch": 0.4000080809713328, + "grad_norm": 2.4840214252471924, + "learning_rate": 9.261105294733405e-06, + "loss": 0.9996, + "step": 4950 + }, + { + "epoch": 0.4000888906846603, + "grad_norm": 2.6143908500671387, + "learning_rate": 9.26076290783447e-06, + "loss": 0.8929, + "step": 4951 + }, + { + "epoch": 0.4001697003979878, + "grad_norm": 3.0794944763183594, + "learning_rate": 9.260420447958922e-06, + "loss": 0.9006, + "step": 4952 + }, + { + "epoch": 0.4002505101113154, + "grad_norm": 2.430558204650879, + "learning_rate": 9.26007791511263e-06, + "loss": 1.0531, + "step": 4953 + }, + { + "epoch": 0.40033131982464293, + "grad_norm": 2.5490753650665283, + "learning_rate": 9.259735309301458e-06, + "loss": 0.9219, + "step": 4954 + }, + { + "epoch": 0.40041212953797045, + "grad_norm": 2.9990432262420654, + "learning_rate": 9.259392630531275e-06, + "loss": 1.0364, + "step": 4955 + }, + { + "epoch": 0.40049293925129803, + "grad_norm": 2.576552629470825, + "learning_rate": 9.259049878807951e-06, + "loss": 0.9433, + "step": 4956 + }, + { + "epoch": 0.40057374896462555, + "grad_norm": 3.066596508026123, + "learning_rate": 9.258707054137354e-06, + "loss": 0.9373, + "step": 4957 + }, + { + "epoch": 0.4006545586779531, + "grad_norm": 2.7827351093292236, + "learning_rate": 9.258364156525359e-06, + "loss": 0.9872, + "step": 4958 + }, + { + "epoch": 0.40073536839128066, + "grad_norm": 2.649155378341675, + "learning_rate": 9.258021185977838e-06, + "loss": 0.9655, + "step": 4959 + }, + { + "epoch": 0.4008161781046082, + "grad_norm": 2.7922468185424805, + "learning_rate": 9.257678142500663e-06, + "loss": 0.9581, + "step": 4960 + }, + { + "epoch": 0.4008969878179357, + "grad_norm": 3.1652913093566895, + "learning_rate": 9.257335026099714e-06, + "loss": 1.0054, + "step": 4961 + }, + { + "epoch": 0.4009777975312633, + "grad_norm": 2.7333192825317383, + "learning_rate": 9.256991836780864e-06, + "loss": 1.0715, + "step": 4962 + }, + { + "epoch": 0.4010586072445908, + "grad_norm": 2.778630018234253, + "learning_rate": 9.256648574549992e-06, + "loss": 0.9657, + "step": 4963 + }, + { + "epoch": 0.4011394169579183, + "grad_norm": 2.9172372817993164, + "learning_rate": 9.256305239412977e-06, + "loss": 0.9402, + "step": 4964 + }, + { + "epoch": 0.4012202266712459, + "grad_norm": 2.408510446548462, + "learning_rate": 9.2559618313757e-06, + "loss": 1.058, + "step": 4965 + }, + { + "epoch": 0.40130103638457343, + "grad_norm": 2.9090781211853027, + "learning_rate": 9.255618350444042e-06, + "loss": 0.9233, + "step": 4966 + }, + { + "epoch": 0.40138184609790095, + "grad_norm": 2.568174123764038, + "learning_rate": 9.255274796623887e-06, + "loss": 0.9663, + "step": 4967 + }, + { + "epoch": 0.40146265581122853, + "grad_norm": 3.020400285720825, + "learning_rate": 9.254931169921121e-06, + "loss": 0.9937, + "step": 4968 + }, + { + "epoch": 0.40154346552455605, + "grad_norm": 2.8069992065429688, + "learning_rate": 9.254587470341624e-06, + "loss": 0.9923, + "step": 4969 + }, + { + "epoch": 0.4016242752378836, + "grad_norm": 2.922452211380005, + "learning_rate": 9.25424369789129e-06, + "loss": 1.0148, + "step": 4970 + }, + { + "epoch": 0.40170508495121116, + "grad_norm": 2.6371235847473145, + "learning_rate": 9.253899852576e-06, + "loss": 1.0003, + "step": 4971 + }, + { + "epoch": 0.4017858946645387, + "grad_norm": 2.6325368881225586, + "learning_rate": 9.253555934401647e-06, + "loss": 1.0359, + "step": 4972 + }, + { + "epoch": 0.4018667043778662, + "grad_norm": 2.802062511444092, + "learning_rate": 9.253211943374122e-06, + "loss": 0.877, + "step": 4973 + }, + { + "epoch": 0.4019475140911938, + "grad_norm": 2.4746196269989014, + "learning_rate": 9.252867879499314e-06, + "loss": 0.9746, + "step": 4974 + }, + { + "epoch": 0.4020283238045213, + "grad_norm": 2.960315227508545, + "learning_rate": 9.25252374278312e-06, + "loss": 0.9999, + "step": 4975 + }, + { + "epoch": 0.4021091335178488, + "grad_norm": 3.069545269012451, + "learning_rate": 9.252179533231428e-06, + "loss": 1.0284, + "step": 4976 + }, + { + "epoch": 0.4021899432311764, + "grad_norm": 2.5236165523529053, + "learning_rate": 9.251835250850141e-06, + "loss": 0.88, + "step": 4977 + }, + { + "epoch": 0.40227075294450393, + "grad_norm": 2.598947525024414, + "learning_rate": 9.25149089564515e-06, + "loss": 1.0208, + "step": 4978 + }, + { + "epoch": 0.40235156265783145, + "grad_norm": 2.8756184577941895, + "learning_rate": 9.251146467622356e-06, + "loss": 0.9926, + "step": 4979 + }, + { + "epoch": 0.40243237237115903, + "grad_norm": 2.3564445972442627, + "learning_rate": 9.250801966787657e-06, + "loss": 0.9608, + "step": 4980 + }, + { + "epoch": 0.40251318208448655, + "grad_norm": 2.262138605117798, + "learning_rate": 9.250457393146954e-06, + "loss": 1.012, + "step": 4981 + }, + { + "epoch": 0.4025939917978141, + "grad_norm": 2.7162976264953613, + "learning_rate": 9.250112746706148e-06, + "loss": 0.9554, + "step": 4982 + }, + { + "epoch": 0.40267480151114166, + "grad_norm": 2.8713443279266357, + "learning_rate": 9.249768027471142e-06, + "loss": 0.9066, + "step": 4983 + }, + { + "epoch": 0.4027556112244692, + "grad_norm": 2.5405688285827637, + "learning_rate": 9.24942323544784e-06, + "loss": 1.043, + "step": 4984 + }, + { + "epoch": 0.4028364209377967, + "grad_norm": 2.867854595184326, + "learning_rate": 9.249078370642149e-06, + "loss": 0.9656, + "step": 4985 + }, + { + "epoch": 0.4029172306511243, + "grad_norm": 2.8119189739227295, + "learning_rate": 9.248733433059976e-06, + "loss": 1.0146, + "step": 4986 + }, + { + "epoch": 0.4029980403644518, + "grad_norm": 2.9293620586395264, + "learning_rate": 9.248388422707227e-06, + "loss": 1.0749, + "step": 4987 + }, + { + "epoch": 0.4030788500777793, + "grad_norm": 2.5957796573638916, + "learning_rate": 9.24804333958981e-06, + "loss": 1.0848, + "step": 4988 + }, + { + "epoch": 0.4031596597911069, + "grad_norm": 2.9666616916656494, + "learning_rate": 9.247698183713637e-06, + "loss": 0.9397, + "step": 4989 + }, + { + "epoch": 0.40324046950443443, + "grad_norm": 2.509471893310547, + "learning_rate": 9.247352955084623e-06, + "loss": 1.0488, + "step": 4990 + }, + { + "epoch": 0.40332127921776195, + "grad_norm": 2.3868610858917236, + "learning_rate": 9.247007653708677e-06, + "loss": 1.0358, + "step": 4991 + }, + { + "epoch": 0.40340208893108953, + "grad_norm": 2.387303352355957, + "learning_rate": 9.246662279591713e-06, + "loss": 0.9579, + "step": 4992 + }, + { + "epoch": 0.40348289864441705, + "grad_norm": 2.869910717010498, + "learning_rate": 9.24631683273965e-06, + "loss": 1.0516, + "step": 4993 + }, + { + "epoch": 0.4035637083577446, + "grad_norm": 2.5866353511810303, + "learning_rate": 9.245971313158399e-06, + "loss": 0.8982, + "step": 4994 + }, + { + "epoch": 0.40364451807107216, + "grad_norm": 2.5346262454986572, + "learning_rate": 9.245625720853883e-06, + "loss": 1.0128, + "step": 4995 + }, + { + "epoch": 0.4037253277843997, + "grad_norm": 2.6702122688293457, + "learning_rate": 9.24528005583202e-06, + "loss": 0.857, + "step": 4996 + }, + { + "epoch": 0.4038061374977272, + "grad_norm": 3.1742382049560547, + "learning_rate": 9.244934318098729e-06, + "loss": 0.9896, + "step": 4997 + }, + { + "epoch": 0.4038869472110548, + "grad_norm": 3.3246395587921143, + "learning_rate": 9.24458850765993e-06, + "loss": 1.0591, + "step": 4998 + }, + { + "epoch": 0.4039677569243823, + "grad_norm": 2.796469211578369, + "learning_rate": 9.244242624521551e-06, + "loss": 0.8671, + "step": 4999 + }, + { + "epoch": 0.4040485666377098, + "grad_norm": 2.476184606552124, + "learning_rate": 9.243896668689514e-06, + "loss": 1.1087, + "step": 5000 + }, + { + "epoch": 0.4040485666377098, + "eval_loss": 0.8171849846839905, + "eval_runtime": 813.6456, + "eval_samples_per_second": 102.46, + "eval_steps_per_second": 12.808, + "step": 5000 + }, + { + "epoch": 0.4041293763510374, + "grad_norm": 2.734010934829712, + "learning_rate": 9.243550640169743e-06, + "loss": 0.9336, + "step": 5001 + }, + { + "epoch": 0.40421018606436493, + "grad_norm": 2.4164507389068604, + "learning_rate": 9.243204538968165e-06, + "loss": 0.9413, + "step": 5002 + }, + { + "epoch": 0.40429099577769245, + "grad_norm": 2.576552152633667, + "learning_rate": 9.242858365090708e-06, + "loss": 0.9995, + "step": 5003 + }, + { + "epoch": 0.40437180549102003, + "grad_norm": 2.5718369483947754, + "learning_rate": 9.242512118543302e-06, + "loss": 0.8484, + "step": 5004 + }, + { + "epoch": 0.40445261520434755, + "grad_norm": 2.749690532684326, + "learning_rate": 9.242165799331877e-06, + "loss": 0.9282, + "step": 5005 + }, + { + "epoch": 0.4045334249176751, + "grad_norm": 2.779601812362671, + "learning_rate": 9.241819407462364e-06, + "loss": 0.9441, + "step": 5006 + }, + { + "epoch": 0.40461423463100266, + "grad_norm": 2.642305374145508, + "learning_rate": 9.241472942940697e-06, + "loss": 1.0112, + "step": 5007 + }, + { + "epoch": 0.4046950443443302, + "grad_norm": 2.582791805267334, + "learning_rate": 9.241126405772809e-06, + "loss": 0.9899, + "step": 5008 + }, + { + "epoch": 0.40477585405765776, + "grad_norm": 2.3769354820251465, + "learning_rate": 9.240779795964637e-06, + "loss": 0.9363, + "step": 5009 + }, + { + "epoch": 0.4048566637709853, + "grad_norm": 3.000255823135376, + "learning_rate": 9.240433113522114e-06, + "loss": 0.899, + "step": 5010 + }, + { + "epoch": 0.4049374734843128, + "grad_norm": 2.493863821029663, + "learning_rate": 9.240086358451182e-06, + "loss": 1.0402, + "step": 5011 + }, + { + "epoch": 0.4050182831976404, + "grad_norm": 2.9411659240722656, + "learning_rate": 9.239739530757776e-06, + "loss": 1.0769, + "step": 5012 + }, + { + "epoch": 0.4050990929109679, + "grad_norm": 3.193103790283203, + "learning_rate": 9.23939263044784e-06, + "loss": 1.161, + "step": 5013 + }, + { + "epoch": 0.40517990262429543, + "grad_norm": 2.7817883491516113, + "learning_rate": 9.239045657527315e-06, + "loss": 1.0976, + "step": 5014 + }, + { + "epoch": 0.405260712337623, + "grad_norm": 2.5293238162994385, + "learning_rate": 9.238698612002143e-06, + "loss": 1.0325, + "step": 5015 + }, + { + "epoch": 0.40534152205095053, + "grad_norm": 2.5327422618865967, + "learning_rate": 9.238351493878268e-06, + "loss": 0.885, + "step": 5016 + }, + { + "epoch": 0.40542233176427805, + "grad_norm": 2.972574472427368, + "learning_rate": 9.238004303161635e-06, + "loss": 0.9468, + "step": 5017 + }, + { + "epoch": 0.40550314147760563, + "grad_norm": 2.9105494022369385, + "learning_rate": 9.23765703985819e-06, + "loss": 0.9391, + "step": 5018 + }, + { + "epoch": 0.40558395119093316, + "grad_norm": 2.900505542755127, + "learning_rate": 9.237309703973882e-06, + "loss": 1.1088, + "step": 5019 + }, + { + "epoch": 0.4056647609042607, + "grad_norm": 2.674318790435791, + "learning_rate": 9.23696229551466e-06, + "loss": 0.8911, + "step": 5020 + }, + { + "epoch": 0.40574557061758826, + "grad_norm": 3.3266994953155518, + "learning_rate": 9.236614814486473e-06, + "loss": 1.0359, + "step": 5021 + }, + { + "epoch": 0.4058263803309158, + "grad_norm": 3.1881871223449707, + "learning_rate": 9.236267260895275e-06, + "loss": 0.9984, + "step": 5022 + }, + { + "epoch": 0.4059071900442433, + "grad_norm": 3.207288980484009, + "learning_rate": 9.235919634747017e-06, + "loss": 0.9254, + "step": 5023 + }, + { + "epoch": 0.4059879997575709, + "grad_norm": 2.8666768074035645, + "learning_rate": 9.235571936047652e-06, + "loss": 0.9434, + "step": 5024 + }, + { + "epoch": 0.4060688094708984, + "grad_norm": 2.9539434909820557, + "learning_rate": 9.235224164803138e-06, + "loss": 1.001, + "step": 5025 + }, + { + "epoch": 0.40614961918422593, + "grad_norm": 2.580145835876465, + "learning_rate": 9.234876321019429e-06, + "loss": 1.102, + "step": 5026 + }, + { + "epoch": 0.4062304288975535, + "grad_norm": 2.855926036834717, + "learning_rate": 9.234528404702484e-06, + "loss": 0.9665, + "step": 5027 + }, + { + "epoch": 0.40631123861088103, + "grad_norm": 2.862095594406128, + "learning_rate": 9.23418041585826e-06, + "loss": 0.9499, + "step": 5028 + }, + { + "epoch": 0.40639204832420855, + "grad_norm": 2.802712917327881, + "learning_rate": 9.233832354492721e-06, + "loss": 0.9587, + "step": 5029 + }, + { + "epoch": 0.40647285803753613, + "grad_norm": 2.902677297592163, + "learning_rate": 9.233484220611825e-06, + "loss": 1.161, + "step": 5030 + }, + { + "epoch": 0.40655366775086366, + "grad_norm": 2.7959911823272705, + "learning_rate": 9.233136014221537e-06, + "loss": 0.9348, + "step": 5031 + }, + { + "epoch": 0.4066344774641912, + "grad_norm": 2.9856088161468506, + "learning_rate": 9.232787735327821e-06, + "loss": 0.9717, + "step": 5032 + }, + { + "epoch": 0.40671528717751876, + "grad_norm": 3.0933585166931152, + "learning_rate": 9.232439383936638e-06, + "loss": 0.8442, + "step": 5033 + }, + { + "epoch": 0.4067960968908463, + "grad_norm": 2.7330124378204346, + "learning_rate": 9.23209096005396e-06, + "loss": 0.8693, + "step": 5034 + }, + { + "epoch": 0.4068769066041738, + "grad_norm": 2.3398380279541016, + "learning_rate": 9.23174246368575e-06, + "loss": 1.0293, + "step": 5035 + }, + { + "epoch": 0.4069577163175014, + "grad_norm": 2.698263168334961, + "learning_rate": 9.231393894837983e-06, + "loss": 0.8785, + "step": 5036 + }, + { + "epoch": 0.4070385260308289, + "grad_norm": 2.6033833026885986, + "learning_rate": 9.231045253516622e-06, + "loss": 1.0154, + "step": 5037 + }, + { + "epoch": 0.40711933574415643, + "grad_norm": 2.1953933238983154, + "learning_rate": 9.230696539727641e-06, + "loss": 1.0248, + "step": 5038 + }, + { + "epoch": 0.407200145457484, + "grad_norm": 2.8867642879486084, + "learning_rate": 9.230347753477015e-06, + "loss": 0.9788, + "step": 5039 + }, + { + "epoch": 0.40728095517081153, + "grad_norm": 3.269437551498413, + "learning_rate": 9.229998894770717e-06, + "loss": 0.953, + "step": 5040 + }, + { + "epoch": 0.40736176488413905, + "grad_norm": 2.780695676803589, + "learning_rate": 9.22964996361472e-06, + "loss": 0.96, + "step": 5041 + }, + { + "epoch": 0.40744257459746663, + "grad_norm": 2.9803736209869385, + "learning_rate": 9.229300960015003e-06, + "loss": 1.0653, + "step": 5042 + }, + { + "epoch": 0.40752338431079416, + "grad_norm": 2.8825063705444336, + "learning_rate": 9.22895188397754e-06, + "loss": 1.0618, + "step": 5043 + }, + { + "epoch": 0.4076041940241217, + "grad_norm": 2.861132860183716, + "learning_rate": 9.228602735508312e-06, + "loss": 1.0021, + "step": 5044 + }, + { + "epoch": 0.40768500373744926, + "grad_norm": 2.679664373397827, + "learning_rate": 9.2282535146133e-06, + "loss": 0.9712, + "step": 5045 + }, + { + "epoch": 0.4077658134507768, + "grad_norm": 2.448105812072754, + "learning_rate": 9.227904221298485e-06, + "loss": 0.8787, + "step": 5046 + }, + { + "epoch": 0.4078466231641043, + "grad_norm": 3.302600145339966, + "learning_rate": 9.227554855569847e-06, + "loss": 0.9055, + "step": 5047 + }, + { + "epoch": 0.4079274328774319, + "grad_norm": 2.812666654586792, + "learning_rate": 9.227205417433373e-06, + "loss": 0.8513, + "step": 5048 + }, + { + "epoch": 0.4080082425907594, + "grad_norm": 2.6948466300964355, + "learning_rate": 9.226855906895047e-06, + "loss": 0.9003, + "step": 5049 + }, + { + "epoch": 0.40808905230408693, + "grad_norm": 2.5601093769073486, + "learning_rate": 9.226506323960856e-06, + "loss": 0.9442, + "step": 5050 + }, + { + "epoch": 0.4081698620174145, + "grad_norm": 2.8308019638061523, + "learning_rate": 9.226156668636785e-06, + "loss": 0.8969, + "step": 5051 + }, + { + "epoch": 0.40825067173074203, + "grad_norm": 3.942269802093506, + "learning_rate": 9.225806940928825e-06, + "loss": 1.0799, + "step": 5052 + }, + { + "epoch": 0.40833148144406956, + "grad_norm": 2.7755072116851807, + "learning_rate": 9.225457140842964e-06, + "loss": 1.0485, + "step": 5053 + }, + { + "epoch": 0.40841229115739713, + "grad_norm": 2.5427701473236084, + "learning_rate": 9.225107268385196e-06, + "loss": 1.0699, + "step": 5054 + }, + { + "epoch": 0.40849310087072466, + "grad_norm": 2.619122266769409, + "learning_rate": 9.224757323561511e-06, + "loss": 0.9209, + "step": 5055 + }, + { + "epoch": 0.4085739105840522, + "grad_norm": 2.6481258869171143, + "learning_rate": 9.224407306377906e-06, + "loss": 0.8791, + "step": 5056 + }, + { + "epoch": 0.40865472029737976, + "grad_norm": 2.825996160507202, + "learning_rate": 9.224057216840371e-06, + "loss": 0.9755, + "step": 5057 + }, + { + "epoch": 0.4087355300107073, + "grad_norm": 2.170696496963501, + "learning_rate": 9.223707054954905e-06, + "loss": 1.1268, + "step": 5058 + }, + { + "epoch": 0.4088163397240348, + "grad_norm": 2.748819589614868, + "learning_rate": 9.223356820727507e-06, + "loss": 0.958, + "step": 5059 + }, + { + "epoch": 0.4088971494373624, + "grad_norm": 2.8398070335388184, + "learning_rate": 9.223006514164174e-06, + "loss": 0.9628, + "step": 5060 + }, + { + "epoch": 0.4089779591506899, + "grad_norm": 2.6341593265533447, + "learning_rate": 9.222656135270904e-06, + "loss": 0.9338, + "step": 5061 + }, + { + "epoch": 0.40905876886401743, + "grad_norm": 2.7226674556732178, + "learning_rate": 9.2223056840537e-06, + "loss": 0.9315, + "step": 5062 + }, + { + "epoch": 0.409139578577345, + "grad_norm": 2.8942036628723145, + "learning_rate": 9.221955160518567e-06, + "loss": 0.8611, + "step": 5063 + }, + { + "epoch": 0.40922038829067253, + "grad_norm": 2.848767042160034, + "learning_rate": 9.221604564671505e-06, + "loss": 1.0153, + "step": 5064 + }, + { + "epoch": 0.40930119800400006, + "grad_norm": 2.7654354572296143, + "learning_rate": 9.221253896518519e-06, + "loss": 0.9566, + "step": 5065 + }, + { + "epoch": 0.40938200771732763, + "grad_norm": 3.298430919647217, + "learning_rate": 9.220903156065617e-06, + "loss": 0.9473, + "step": 5066 + }, + { + "epoch": 0.40946281743065516, + "grad_norm": 2.4986090660095215, + "learning_rate": 9.220552343318804e-06, + "loss": 0.9936, + "step": 5067 + }, + { + "epoch": 0.4095436271439827, + "grad_norm": 2.3295962810516357, + "learning_rate": 9.220201458284091e-06, + "loss": 1.0303, + "step": 5068 + }, + { + "epoch": 0.40962443685731026, + "grad_norm": 2.584465980529785, + "learning_rate": 9.219850500967487e-06, + "loss": 0.8988, + "step": 5069 + }, + { + "epoch": 0.4097052465706378, + "grad_norm": 2.6033895015716553, + "learning_rate": 9.219499471375002e-06, + "loss": 1.0548, + "step": 5070 + }, + { + "epoch": 0.4097860562839653, + "grad_norm": 2.4835457801818848, + "learning_rate": 9.219148369512649e-06, + "loss": 0.9786, + "step": 5071 + }, + { + "epoch": 0.4098668659972929, + "grad_norm": 2.6273036003112793, + "learning_rate": 9.218797195386443e-06, + "loss": 0.9294, + "step": 5072 + }, + { + "epoch": 0.4099476757106204, + "grad_norm": 3.011119842529297, + "learning_rate": 9.218445949002395e-06, + "loss": 0.9466, + "step": 5073 + }, + { + "epoch": 0.410028485423948, + "grad_norm": 2.3667774200439453, + "learning_rate": 9.218094630366525e-06, + "loss": 0.9471, + "step": 5074 + }, + { + "epoch": 0.4101092951372755, + "grad_norm": 2.8165531158447266, + "learning_rate": 9.217743239484848e-06, + "loss": 0.9382, + "step": 5075 + }, + { + "epoch": 0.41019010485060303, + "grad_norm": 2.526662588119507, + "learning_rate": 9.217391776363385e-06, + "loss": 0.9283, + "step": 5076 + }, + { + "epoch": 0.4102709145639306, + "grad_norm": 3.033874750137329, + "learning_rate": 9.217040241008152e-06, + "loss": 0.9426, + "step": 5077 + }, + { + "epoch": 0.41035172427725813, + "grad_norm": 2.5313334465026855, + "learning_rate": 9.216688633425172e-06, + "loss": 1.0562, + "step": 5078 + }, + { + "epoch": 0.41043253399058566, + "grad_norm": 3.0072054862976074, + "learning_rate": 9.216336953620467e-06, + "loss": 0.8155, + "step": 5079 + }, + { + "epoch": 0.41051334370391324, + "grad_norm": 2.9152071475982666, + "learning_rate": 9.215985201600059e-06, + "loss": 0.8833, + "step": 5080 + }, + { + "epoch": 0.41059415341724076, + "grad_norm": 2.9390978813171387, + "learning_rate": 9.215633377369977e-06, + "loss": 0.8395, + "step": 5081 + }, + { + "epoch": 0.4106749631305683, + "grad_norm": 2.6304209232330322, + "learning_rate": 9.215281480936242e-06, + "loss": 0.9726, + "step": 5082 + }, + { + "epoch": 0.41075577284389586, + "grad_norm": 2.772843599319458, + "learning_rate": 9.214929512304884e-06, + "loss": 0.8998, + "step": 5083 + }, + { + "epoch": 0.4108365825572234, + "grad_norm": 2.7768871784210205, + "learning_rate": 9.214577471481929e-06, + "loss": 1.0487, + "step": 5084 + }, + { + "epoch": 0.4109173922705509, + "grad_norm": 3.6414031982421875, + "learning_rate": 9.21422535847341e-06, + "loss": 1.0651, + "step": 5085 + }, + { + "epoch": 0.4109982019838785, + "grad_norm": 3.065429449081421, + "learning_rate": 9.213873173285354e-06, + "loss": 1.0332, + "step": 5086 + }, + { + "epoch": 0.411079011697206, + "grad_norm": 2.226344585418701, + "learning_rate": 9.213520915923798e-06, + "loss": 1.1063, + "step": 5087 + }, + { + "epoch": 0.41115982141053353, + "grad_norm": 2.8055105209350586, + "learning_rate": 9.21316858639477e-06, + "loss": 0.9426, + "step": 5088 + }, + { + "epoch": 0.4112406311238611, + "grad_norm": 3.0142786502838135, + "learning_rate": 9.212816184704307e-06, + "loss": 1.1111, + "step": 5089 + }, + { + "epoch": 0.41132144083718863, + "grad_norm": 3.043494701385498, + "learning_rate": 9.212463710858446e-06, + "loss": 0.9475, + "step": 5090 + }, + { + "epoch": 0.41140225055051616, + "grad_norm": 2.598656415939331, + "learning_rate": 9.212111164863223e-06, + "loss": 0.9964, + "step": 5091 + }, + { + "epoch": 0.41148306026384374, + "grad_norm": 2.5568392276763916, + "learning_rate": 9.211758546724674e-06, + "loss": 0.9552, + "step": 5092 + }, + { + "epoch": 0.41156386997717126, + "grad_norm": 2.725928544998169, + "learning_rate": 9.21140585644884e-06, + "loss": 0.8849, + "step": 5093 + }, + { + "epoch": 0.4116446796904988, + "grad_norm": 2.8262064456939697, + "learning_rate": 9.211053094041764e-06, + "loss": 1.1436, + "step": 5094 + }, + { + "epoch": 0.41172548940382636, + "grad_norm": 2.905388832092285, + "learning_rate": 9.210700259509487e-06, + "loss": 0.9966, + "step": 5095 + }, + { + "epoch": 0.4118062991171539, + "grad_norm": 3.468975782394409, + "learning_rate": 9.210347352858048e-06, + "loss": 1.01, + "step": 5096 + }, + { + "epoch": 0.4118871088304814, + "grad_norm": 2.6812238693237305, + "learning_rate": 9.209994374093499e-06, + "loss": 0.9068, + "step": 5097 + }, + { + "epoch": 0.411967918543809, + "grad_norm": 2.638923406600952, + "learning_rate": 9.209641323221879e-06, + "loss": 1.0169, + "step": 5098 + }, + { + "epoch": 0.4120487282571365, + "grad_norm": 2.6545894145965576, + "learning_rate": 9.209288200249238e-06, + "loss": 0.961, + "step": 5099 + }, + { + "epoch": 0.41212953797046403, + "grad_norm": 2.69412899017334, + "learning_rate": 9.208935005181622e-06, + "loss": 0.9719, + "step": 5100 + }, + { + "epoch": 0.4122103476837916, + "grad_norm": 2.4850449562072754, + "learning_rate": 9.208581738025084e-06, + "loss": 0.934, + "step": 5101 + }, + { + "epoch": 0.41229115739711913, + "grad_norm": 2.5823187828063965, + "learning_rate": 9.208228398785672e-06, + "loss": 0.9177, + "step": 5102 + }, + { + "epoch": 0.41237196711044666, + "grad_norm": 3.0760159492492676, + "learning_rate": 9.207874987469439e-06, + "loss": 0.8992, + "step": 5103 + }, + { + "epoch": 0.41245277682377424, + "grad_norm": 2.7605202198028564, + "learning_rate": 9.207521504082438e-06, + "loss": 0.8759, + "step": 5104 + }, + { + "epoch": 0.41253358653710176, + "grad_norm": 2.959604501724243, + "learning_rate": 9.207167948630721e-06, + "loss": 0.8524, + "step": 5105 + }, + { + "epoch": 0.4126143962504293, + "grad_norm": 3.298809289932251, + "learning_rate": 9.206814321120346e-06, + "loss": 0.986, + "step": 5106 + }, + { + "epoch": 0.41269520596375686, + "grad_norm": 2.623382329940796, + "learning_rate": 9.206460621557369e-06, + "loss": 0.9225, + "step": 5107 + }, + { + "epoch": 0.4127760156770844, + "grad_norm": 2.648101806640625, + "learning_rate": 9.20610684994785e-06, + "loss": 0.9735, + "step": 5108 + }, + { + "epoch": 0.4128568253904119, + "grad_norm": 2.6998050212860107, + "learning_rate": 9.205753006297845e-06, + "loss": 0.8762, + "step": 5109 + }, + { + "epoch": 0.4129376351037395, + "grad_norm": 3.2479190826416016, + "learning_rate": 9.205399090613415e-06, + "loss": 0.8798, + "step": 5110 + }, + { + "epoch": 0.413018444817067, + "grad_norm": 2.274042844772339, + "learning_rate": 9.205045102900624e-06, + "loss": 0.9613, + "step": 5111 + }, + { + "epoch": 0.41309925453039453, + "grad_norm": 2.6246562004089355, + "learning_rate": 9.204691043165533e-06, + "loss": 1.0374, + "step": 5112 + }, + { + "epoch": 0.4131800642437221, + "grad_norm": 2.179845094680786, + "learning_rate": 9.204336911414207e-06, + "loss": 1.0956, + "step": 5113 + }, + { + "epoch": 0.41326087395704963, + "grad_norm": 2.4248814582824707, + "learning_rate": 9.203982707652711e-06, + "loss": 0.8883, + "step": 5114 + }, + { + "epoch": 0.41334168367037716, + "grad_norm": 2.8744332790374756, + "learning_rate": 9.203628431887113e-06, + "loss": 1.0765, + "step": 5115 + }, + { + "epoch": 0.41342249338370474, + "grad_norm": 2.878135919570923, + "learning_rate": 9.20327408412348e-06, + "loss": 0.9488, + "step": 5116 + }, + { + "epoch": 0.41350330309703226, + "grad_norm": 2.7689199447631836, + "learning_rate": 9.202919664367878e-06, + "loss": 0.9663, + "step": 5117 + }, + { + "epoch": 0.4135841128103598, + "grad_norm": 3.0066959857940674, + "learning_rate": 9.202565172626383e-06, + "loss": 1.0099, + "step": 5118 + }, + { + "epoch": 0.41366492252368736, + "grad_norm": 2.8451433181762695, + "learning_rate": 9.202210608905062e-06, + "loss": 0.9581, + "step": 5119 + }, + { + "epoch": 0.4137457322370149, + "grad_norm": 2.792628288269043, + "learning_rate": 9.201855973209992e-06, + "loss": 1.0241, + "step": 5120 + }, + { + "epoch": 0.4138265419503424, + "grad_norm": 3.0069565773010254, + "learning_rate": 9.201501265547242e-06, + "loss": 0.938, + "step": 5121 + }, + { + "epoch": 0.41390735166367, + "grad_norm": 2.465031862258911, + "learning_rate": 9.201146485922891e-06, + "loss": 1.0827, + "step": 5122 + }, + { + "epoch": 0.4139881613769975, + "grad_norm": 2.852816343307495, + "learning_rate": 9.200791634343015e-06, + "loss": 0.9214, + "step": 5123 + }, + { + "epoch": 0.41406897109032503, + "grad_norm": 2.762470006942749, + "learning_rate": 9.20043671081369e-06, + "loss": 1.1156, + "step": 5124 + }, + { + "epoch": 0.4141497808036526, + "grad_norm": 2.724828004837036, + "learning_rate": 9.200081715341001e-06, + "loss": 0.9464, + "step": 5125 + }, + { + "epoch": 0.41423059051698013, + "grad_norm": 2.659640073776245, + "learning_rate": 9.19972664793102e-06, + "loss": 0.9812, + "step": 5126 + }, + { + "epoch": 0.41431140023030766, + "grad_norm": 2.647005081176758, + "learning_rate": 9.199371508589831e-06, + "loss": 1.0383, + "step": 5127 + }, + { + "epoch": 0.41439220994363524, + "grad_norm": 3.1099772453308105, + "learning_rate": 9.199016297323518e-06, + "loss": 1.0125, + "step": 5128 + }, + { + "epoch": 0.41447301965696276, + "grad_norm": 2.5530290603637695, + "learning_rate": 9.198661014138166e-06, + "loss": 1.0799, + "step": 5129 + }, + { + "epoch": 0.4145538293702903, + "grad_norm": 2.73724102973938, + "learning_rate": 9.198305659039858e-06, + "loss": 0.9411, + "step": 5130 + }, + { + "epoch": 0.41463463908361786, + "grad_norm": 2.540130615234375, + "learning_rate": 9.19795023203468e-06, + "loss": 0.8835, + "step": 5131 + }, + { + "epoch": 0.4147154487969454, + "grad_norm": 2.7805604934692383, + "learning_rate": 9.197594733128724e-06, + "loss": 0.8499, + "step": 5132 + }, + { + "epoch": 0.4147962585102729, + "grad_norm": 3.1604864597320557, + "learning_rate": 9.197239162328071e-06, + "loss": 0.9373, + "step": 5133 + }, + { + "epoch": 0.4148770682236005, + "grad_norm": 2.4651713371276855, + "learning_rate": 9.196883519638818e-06, + "loss": 0.9582, + "step": 5134 + }, + { + "epoch": 0.414957877936928, + "grad_norm": 2.41593074798584, + "learning_rate": 9.196527805067054e-06, + "loss": 0.9802, + "step": 5135 + }, + { + "epoch": 0.41503868765025553, + "grad_norm": 2.9213290214538574, + "learning_rate": 9.19617201861887e-06, + "loss": 0.8652, + "step": 5136 + }, + { + "epoch": 0.4151194973635831, + "grad_norm": 2.4444069862365723, + "learning_rate": 9.195816160300363e-06, + "loss": 1.009, + "step": 5137 + }, + { + "epoch": 0.41520030707691064, + "grad_norm": 2.550560235977173, + "learning_rate": 9.195460230117626e-06, + "loss": 1.154, + "step": 5138 + }, + { + "epoch": 0.4152811167902382, + "grad_norm": 2.699474334716797, + "learning_rate": 9.195104228076754e-06, + "loss": 0.9999, + "step": 5139 + }, + { + "epoch": 0.41536192650356574, + "grad_norm": 2.7403440475463867, + "learning_rate": 9.194748154183849e-06, + "loss": 1.0574, + "step": 5140 + }, + { + "epoch": 0.41544273621689326, + "grad_norm": 2.607996702194214, + "learning_rate": 9.194392008445003e-06, + "loss": 0.9134, + "step": 5141 + }, + { + "epoch": 0.41552354593022084, + "grad_norm": 2.9673011302948, + "learning_rate": 9.19403579086632e-06, + "loss": 1.0222, + "step": 5142 + }, + { + "epoch": 0.41560435564354836, + "grad_norm": 2.496568202972412, + "learning_rate": 9.193679501453902e-06, + "loss": 0.9699, + "step": 5143 + }, + { + "epoch": 0.4156851653568759, + "grad_norm": 2.9500949382781982, + "learning_rate": 9.19332314021385e-06, + "loss": 1.034, + "step": 5144 + }, + { + "epoch": 0.41576597507020346, + "grad_norm": 2.675473690032959, + "learning_rate": 9.192966707152266e-06, + "loss": 1.0035, + "step": 5145 + }, + { + "epoch": 0.415846784783531, + "grad_norm": 2.7029106616973877, + "learning_rate": 9.192610202275259e-06, + "loss": 0.9405, + "step": 5146 + }, + { + "epoch": 0.4159275944968585, + "grad_norm": 2.850595235824585, + "learning_rate": 9.19225362558893e-06, + "loss": 0.8701, + "step": 5147 + }, + { + "epoch": 0.4160084042101861, + "grad_norm": 3.1219558715820312, + "learning_rate": 9.19189697709939e-06, + "loss": 1.0485, + "step": 5148 + }, + { + "epoch": 0.4160892139235136, + "grad_norm": 2.5429701805114746, + "learning_rate": 9.191540256812745e-06, + "loss": 0.9133, + "step": 5149 + }, + { + "epoch": 0.41617002363684114, + "grad_norm": 2.516554832458496, + "learning_rate": 9.191183464735107e-06, + "loss": 1.0231, + "step": 5150 + }, + { + "epoch": 0.4162508333501687, + "grad_norm": 2.6769444942474365, + "learning_rate": 9.190826600872587e-06, + "loss": 1.0348, + "step": 5151 + }, + { + "epoch": 0.41633164306349624, + "grad_norm": 2.447317123413086, + "learning_rate": 9.190469665231296e-06, + "loss": 0.9809, + "step": 5152 + }, + { + "epoch": 0.41641245277682376, + "grad_norm": 2.7418107986450195, + "learning_rate": 9.190112657817347e-06, + "loss": 1.014, + "step": 5153 + }, + { + "epoch": 0.41649326249015134, + "grad_norm": 2.7991063594818115, + "learning_rate": 9.189755578636856e-06, + "loss": 0.9121, + "step": 5154 + }, + { + "epoch": 0.41657407220347886, + "grad_norm": 2.72627592086792, + "learning_rate": 9.18939842769594e-06, + "loss": 0.9702, + "step": 5155 + }, + { + "epoch": 0.4166548819168064, + "grad_norm": 2.372669219970703, + "learning_rate": 9.189041205000713e-06, + "loss": 1.1127, + "step": 5156 + }, + { + "epoch": 0.41673569163013396, + "grad_norm": 3.2202353477478027, + "learning_rate": 9.188683910557294e-06, + "loss": 1.0525, + "step": 5157 + }, + { + "epoch": 0.4168165013434615, + "grad_norm": 2.886657238006592, + "learning_rate": 9.188326544371805e-06, + "loss": 0.9962, + "step": 5158 + }, + { + "epoch": 0.416897311056789, + "grad_norm": 2.703826904296875, + "learning_rate": 9.187969106450364e-06, + "loss": 0.9504, + "step": 5159 + }, + { + "epoch": 0.4169781207701166, + "grad_norm": 2.624884843826294, + "learning_rate": 9.187611596799094e-06, + "loss": 1.0139, + "step": 5160 + }, + { + "epoch": 0.4170589304834441, + "grad_norm": 2.1545822620391846, + "learning_rate": 9.18725401542412e-06, + "loss": 1.0799, + "step": 5161 + }, + { + "epoch": 0.41713974019677164, + "grad_norm": 2.6193768978118896, + "learning_rate": 9.186896362331564e-06, + "loss": 0.8731, + "step": 5162 + }, + { + "epoch": 0.4172205499100992, + "grad_norm": 2.7223222255706787, + "learning_rate": 9.186538637527554e-06, + "loss": 1.049, + "step": 5163 + }, + { + "epoch": 0.41730135962342674, + "grad_norm": 2.860353708267212, + "learning_rate": 9.186180841018216e-06, + "loss": 0.8993, + "step": 5164 + }, + { + "epoch": 0.41738216933675426, + "grad_norm": 3.40059757232666, + "learning_rate": 9.185822972809677e-06, + "loss": 0.9874, + "step": 5165 + }, + { + "epoch": 0.41746297905008184, + "grad_norm": 3.3224143981933594, + "learning_rate": 9.185465032908068e-06, + "loss": 1.0188, + "step": 5166 + }, + { + "epoch": 0.41754378876340936, + "grad_norm": 2.659998893737793, + "learning_rate": 9.185107021319516e-06, + "loss": 1.0582, + "step": 5167 + }, + { + "epoch": 0.4176245984767369, + "grad_norm": 2.666018486022949, + "learning_rate": 9.184748938050161e-06, + "loss": 0.8759, + "step": 5168 + }, + { + "epoch": 0.41770540819006446, + "grad_norm": 2.5733697414398193, + "learning_rate": 9.184390783106128e-06, + "loss": 1.017, + "step": 5169 + }, + { + "epoch": 0.417786217903392, + "grad_norm": 2.5420942306518555, + "learning_rate": 9.184032556493555e-06, + "loss": 0.9655, + "step": 5170 + }, + { + "epoch": 0.4178670276167195, + "grad_norm": 2.6693360805511475, + "learning_rate": 9.183674258218577e-06, + "loss": 0.9056, + "step": 5171 + }, + { + "epoch": 0.4179478373300471, + "grad_norm": 2.8547558784484863, + "learning_rate": 9.183315888287331e-06, + "loss": 0.9444, + "step": 5172 + }, + { + "epoch": 0.4180286470433746, + "grad_norm": 2.659933090209961, + "learning_rate": 9.182957446705956e-06, + "loss": 0.8854, + "step": 5173 + }, + { + "epoch": 0.41810945675670214, + "grad_norm": 2.836608409881592, + "learning_rate": 9.182598933480588e-06, + "loss": 1.0334, + "step": 5174 + }, + { + "epoch": 0.4181902664700297, + "grad_norm": 2.6722848415374756, + "learning_rate": 9.18224034861737e-06, + "loss": 0.9776, + "step": 5175 + }, + { + "epoch": 0.41827107618335724, + "grad_norm": 2.8575034141540527, + "learning_rate": 9.181881692122443e-06, + "loss": 0.9902, + "step": 5176 + }, + { + "epoch": 0.41835188589668476, + "grad_norm": 2.6821653842926025, + "learning_rate": 9.18152296400195e-06, + "loss": 0.955, + "step": 5177 + }, + { + "epoch": 0.41843269561001234, + "grad_norm": 2.884547710418701, + "learning_rate": 9.181164164262036e-06, + "loss": 1.0227, + "step": 5178 + }, + { + "epoch": 0.41851350532333986, + "grad_norm": 2.6616051197052, + "learning_rate": 9.180805292908846e-06, + "loss": 1.0137, + "step": 5179 + }, + { + "epoch": 0.4185943150366674, + "grad_norm": 2.635457754135132, + "learning_rate": 9.180446349948523e-06, + "loss": 1.0195, + "step": 5180 + }, + { + "epoch": 0.41867512474999496, + "grad_norm": 2.7012946605682373, + "learning_rate": 9.180087335387222e-06, + "loss": 1.0158, + "step": 5181 + }, + { + "epoch": 0.4187559344633225, + "grad_norm": 2.358668327331543, + "learning_rate": 9.179728249231086e-06, + "loss": 1.0879, + "step": 5182 + }, + { + "epoch": 0.41883674417665, + "grad_norm": 2.769690752029419, + "learning_rate": 9.179369091486268e-06, + "loss": 0.9497, + "step": 5183 + }, + { + "epoch": 0.4189175538899776, + "grad_norm": 2.7684693336486816, + "learning_rate": 9.179009862158919e-06, + "loss": 0.9457, + "step": 5184 + }, + { + "epoch": 0.4189983636033051, + "grad_norm": 2.4689581394195557, + "learning_rate": 9.178650561255192e-06, + "loss": 1.1107, + "step": 5185 + }, + { + "epoch": 0.41907917331663264, + "grad_norm": 2.504530668258667, + "learning_rate": 9.178291188781238e-06, + "loss": 1.1336, + "step": 5186 + }, + { + "epoch": 0.4191599830299602, + "grad_norm": 2.4523117542266846, + "learning_rate": 9.177931744743218e-06, + "loss": 0.944, + "step": 5187 + }, + { + "epoch": 0.41924079274328774, + "grad_norm": 3.249720335006714, + "learning_rate": 9.177572229147283e-06, + "loss": 0.9814, + "step": 5188 + }, + { + "epoch": 0.41932160245661526, + "grad_norm": 2.5293381214141846, + "learning_rate": 9.177212641999595e-06, + "loss": 0.9223, + "step": 5189 + }, + { + "epoch": 0.41940241216994284, + "grad_norm": 2.4914333820343018, + "learning_rate": 9.176852983306309e-06, + "loss": 0.8903, + "step": 5190 + }, + { + "epoch": 0.41948322188327036, + "grad_norm": 3.0650694370269775, + "learning_rate": 9.176493253073587e-06, + "loss": 0.9382, + "step": 5191 + }, + { + "epoch": 0.4195640315965979, + "grad_norm": 2.7988922595977783, + "learning_rate": 9.17613345130759e-06, + "loss": 0.9149, + "step": 5192 + }, + { + "epoch": 0.41964484130992546, + "grad_norm": 2.7266433238983154, + "learning_rate": 9.175773578014483e-06, + "loss": 1.1189, + "step": 5193 + }, + { + "epoch": 0.419725651023253, + "grad_norm": 3.0123531818389893, + "learning_rate": 9.175413633200422e-06, + "loss": 1.0823, + "step": 5194 + }, + { + "epoch": 0.4198064607365805, + "grad_norm": 2.556122303009033, + "learning_rate": 9.175053616871582e-06, + "loss": 0.9105, + "step": 5195 + }, + { + "epoch": 0.4198872704499081, + "grad_norm": 2.5047812461853027, + "learning_rate": 9.174693529034122e-06, + "loss": 1.0527, + "step": 5196 + }, + { + "epoch": 0.4199680801632356, + "grad_norm": 2.6202611923217773, + "learning_rate": 9.174333369694214e-06, + "loss": 0.9677, + "step": 5197 + }, + { + "epoch": 0.42004888987656314, + "grad_norm": 2.7027087211608887, + "learning_rate": 9.173973138858023e-06, + "loss": 1.0207, + "step": 5198 + }, + { + "epoch": 0.4201296995898907, + "grad_norm": 2.796079397201538, + "learning_rate": 9.173612836531722e-06, + "loss": 0.9337, + "step": 5199 + }, + { + "epoch": 0.42021050930321824, + "grad_norm": 2.7054691314697266, + "learning_rate": 9.173252462721481e-06, + "loss": 0.9401, + "step": 5200 + }, + { + "epoch": 0.42029131901654576, + "grad_norm": 2.5890510082244873, + "learning_rate": 9.17289201743347e-06, + "loss": 0.9646, + "step": 5201 + }, + { + "epoch": 0.42037212872987334, + "grad_norm": 2.548583507537842, + "learning_rate": 9.172531500673866e-06, + "loss": 0.9433, + "step": 5202 + }, + { + "epoch": 0.42045293844320086, + "grad_norm": 2.728844165802002, + "learning_rate": 9.17217091244884e-06, + "loss": 0.8917, + "step": 5203 + }, + { + "epoch": 0.42053374815652844, + "grad_norm": 3.138631582260132, + "learning_rate": 9.171810252764575e-06, + "loss": 1.0581, + "step": 5204 + }, + { + "epoch": 0.42061455786985597, + "grad_norm": 2.84704327583313, + "learning_rate": 9.17144952162724e-06, + "loss": 0.9265, + "step": 5205 + }, + { + "epoch": 0.4206953675831835, + "grad_norm": 2.7900187969207764, + "learning_rate": 9.171088719043018e-06, + "loss": 1.0159, + "step": 5206 + }, + { + "epoch": 0.42077617729651107, + "grad_norm": 2.5694427490234375, + "learning_rate": 9.170727845018089e-06, + "loss": 0.9761, + "step": 5207 + }, + { + "epoch": 0.4208569870098386, + "grad_norm": 2.68499493598938, + "learning_rate": 9.17036689955863e-06, + "loss": 0.8771, + "step": 5208 + }, + { + "epoch": 0.4209377967231661, + "grad_norm": 2.537735939025879, + "learning_rate": 9.170005882670827e-06, + "loss": 0.8706, + "step": 5209 + }, + { + "epoch": 0.4210186064364937, + "grad_norm": 2.7610819339752197, + "learning_rate": 9.169644794360862e-06, + "loss": 1.0456, + "step": 5210 + }, + { + "epoch": 0.4210994161498212, + "grad_norm": 2.8448081016540527, + "learning_rate": 9.16928363463492e-06, + "loss": 0.926, + "step": 5211 + }, + { + "epoch": 0.42118022586314874, + "grad_norm": 2.9562947750091553, + "learning_rate": 9.168922403499187e-06, + "loss": 0.9373, + "step": 5212 + }, + { + "epoch": 0.4212610355764763, + "grad_norm": 2.7735419273376465, + "learning_rate": 9.16856110095985e-06, + "loss": 1.0635, + "step": 5213 + }, + { + "epoch": 0.42134184528980384, + "grad_norm": 2.9197466373443604, + "learning_rate": 9.168199727023095e-06, + "loss": 0.9387, + "step": 5214 + }, + { + "epoch": 0.42142265500313136, + "grad_norm": 2.7584502696990967, + "learning_rate": 9.167838281695114e-06, + "loss": 0.9189, + "step": 5215 + }, + { + "epoch": 0.42150346471645894, + "grad_norm": 2.570056676864624, + "learning_rate": 9.167476764982096e-06, + "loss": 0.973, + "step": 5216 + }, + { + "epoch": 0.42158427442978647, + "grad_norm": 2.8928418159484863, + "learning_rate": 9.167115176890234e-06, + "loss": 0.8473, + "step": 5217 + }, + { + "epoch": 0.421665084143114, + "grad_norm": 2.551988124847412, + "learning_rate": 9.166753517425722e-06, + "loss": 0.9271, + "step": 5218 + }, + { + "epoch": 0.42174589385644157, + "grad_norm": 2.578200578689575, + "learning_rate": 9.166391786594752e-06, + "loss": 0.9274, + "step": 5219 + }, + { + "epoch": 0.4218267035697691, + "grad_norm": 2.5528528690338135, + "learning_rate": 9.166029984403522e-06, + "loss": 1.0344, + "step": 5220 + }, + { + "epoch": 0.4219075132830966, + "grad_norm": 2.59887433052063, + "learning_rate": 9.165668110858227e-06, + "loss": 1.023, + "step": 5221 + }, + { + "epoch": 0.4219883229964242, + "grad_norm": 2.60282564163208, + "learning_rate": 9.165306165965067e-06, + "loss": 0.9438, + "step": 5222 + }, + { + "epoch": 0.4220691327097517, + "grad_norm": 2.800067901611328, + "learning_rate": 9.164944149730239e-06, + "loss": 0.9983, + "step": 5223 + }, + { + "epoch": 0.42214994242307924, + "grad_norm": 3.0861284732818604, + "learning_rate": 9.164582062159944e-06, + "loss": 0.9616, + "step": 5224 + }, + { + "epoch": 0.4222307521364068, + "grad_norm": 2.7225136756896973, + "learning_rate": 9.164219903260385e-06, + "loss": 1.0283, + "step": 5225 + }, + { + "epoch": 0.42231156184973434, + "grad_norm": 2.9631736278533936, + "learning_rate": 9.163857673037763e-06, + "loss": 0.9806, + "step": 5226 + }, + { + "epoch": 0.42239237156306186, + "grad_norm": 2.812352180480957, + "learning_rate": 9.163495371498284e-06, + "loss": 0.9563, + "step": 5227 + }, + { + "epoch": 0.42247318127638944, + "grad_norm": 2.5974884033203125, + "learning_rate": 9.163132998648151e-06, + "loss": 1.0102, + "step": 5228 + }, + { + "epoch": 0.42255399098971697, + "grad_norm": 2.6368088722229004, + "learning_rate": 9.162770554493574e-06, + "loss": 1.0636, + "step": 5229 + }, + { + "epoch": 0.4226348007030445, + "grad_norm": 2.7173140048980713, + "learning_rate": 9.162408039040757e-06, + "loss": 0.9587, + "step": 5230 + }, + { + "epoch": 0.42271561041637207, + "grad_norm": 2.772019386291504, + "learning_rate": 9.162045452295912e-06, + "loss": 0.9674, + "step": 5231 + }, + { + "epoch": 0.4227964201296996, + "grad_norm": 2.743579626083374, + "learning_rate": 9.161682794265249e-06, + "loss": 0.8895, + "step": 5232 + }, + { + "epoch": 0.4228772298430271, + "grad_norm": 2.71034836769104, + "learning_rate": 9.161320064954977e-06, + "loss": 0.9423, + "step": 5233 + }, + { + "epoch": 0.4229580395563547, + "grad_norm": 2.274339437484741, + "learning_rate": 9.16095726437131e-06, + "loss": 0.952, + "step": 5234 + }, + { + "epoch": 0.4230388492696822, + "grad_norm": 2.940167188644409, + "learning_rate": 9.160594392520464e-06, + "loss": 1.0118, + "step": 5235 + }, + { + "epoch": 0.42311965898300974, + "grad_norm": 2.619039297103882, + "learning_rate": 9.160231449408652e-06, + "loss": 1.0813, + "step": 5236 + }, + { + "epoch": 0.4232004686963373, + "grad_norm": 2.540062427520752, + "learning_rate": 9.15986843504209e-06, + "loss": 0.8947, + "step": 5237 + }, + { + "epoch": 0.42328127840966484, + "grad_norm": 2.7829084396362305, + "learning_rate": 9.159505349426996e-06, + "loss": 0.9161, + "step": 5238 + }, + { + "epoch": 0.42336208812299236, + "grad_norm": 3.0055229663848877, + "learning_rate": 9.15914219256959e-06, + "loss": 0.9497, + "step": 5239 + }, + { + "epoch": 0.42344289783631994, + "grad_norm": 2.353222608566284, + "learning_rate": 9.158778964476089e-06, + "loss": 1.1137, + "step": 5240 + }, + { + "epoch": 0.42352370754964747, + "grad_norm": 2.474656820297241, + "learning_rate": 9.158415665152716e-06, + "loss": 0.9872, + "step": 5241 + }, + { + "epoch": 0.423604517262975, + "grad_norm": 2.186896324157715, + "learning_rate": 9.158052294605696e-06, + "loss": 0.8645, + "step": 5242 + }, + { + "epoch": 0.42368532697630257, + "grad_norm": 2.6836929321289062, + "learning_rate": 9.15768885284125e-06, + "loss": 0.9984, + "step": 5243 + }, + { + "epoch": 0.4237661366896301, + "grad_norm": 2.5624165534973145, + "learning_rate": 9.157325339865602e-06, + "loss": 0.9355, + "step": 5244 + }, + { + "epoch": 0.4238469464029576, + "grad_norm": 2.7690236568450928, + "learning_rate": 9.15696175568498e-06, + "loss": 0.9173, + "step": 5245 + }, + { + "epoch": 0.4239277561162852, + "grad_norm": 3.2101879119873047, + "learning_rate": 9.156598100305609e-06, + "loss": 1.0797, + "step": 5246 + }, + { + "epoch": 0.4240085658296127, + "grad_norm": 2.504683494567871, + "learning_rate": 9.156234373733722e-06, + "loss": 1.0471, + "step": 5247 + }, + { + "epoch": 0.42408937554294024, + "grad_norm": 2.4415435791015625, + "learning_rate": 9.155870575975543e-06, + "loss": 1.0734, + "step": 5248 + }, + { + "epoch": 0.4241701852562678, + "grad_norm": 2.7042734622955322, + "learning_rate": 9.155506707037307e-06, + "loss": 0.9438, + "step": 5249 + }, + { + "epoch": 0.42425099496959534, + "grad_norm": 2.7948741912841797, + "learning_rate": 9.155142766925245e-06, + "loss": 0.9362, + "step": 5250 + }, + { + "epoch": 0.42433180468292286, + "grad_norm": 2.6793696880340576, + "learning_rate": 9.15477875564559e-06, + "loss": 0.9816, + "step": 5251 + }, + { + "epoch": 0.42441261439625044, + "grad_norm": 3.6548120975494385, + "learning_rate": 9.15441467320458e-06, + "loss": 0.9709, + "step": 5252 + }, + { + "epoch": 0.42449342410957797, + "grad_norm": 2.507627487182617, + "learning_rate": 9.154050519608444e-06, + "loss": 0.959, + "step": 5253 + }, + { + "epoch": 0.4245742338229055, + "grad_norm": 2.5162367820739746, + "learning_rate": 9.153686294863424e-06, + "loss": 1.1674, + "step": 5254 + }, + { + "epoch": 0.42465504353623307, + "grad_norm": 2.562340497970581, + "learning_rate": 9.153321998975759e-06, + "loss": 1.0999, + "step": 5255 + }, + { + "epoch": 0.4247358532495606, + "grad_norm": 2.9593281745910645, + "learning_rate": 9.152957631951686e-06, + "loss": 0.9948, + "step": 5256 + }, + { + "epoch": 0.4248166629628881, + "grad_norm": 2.4677393436431885, + "learning_rate": 9.152593193797447e-06, + "loss": 1.0488, + "step": 5257 + }, + { + "epoch": 0.4248974726762157, + "grad_norm": 2.6863720417022705, + "learning_rate": 9.152228684519285e-06, + "loss": 1.0273, + "step": 5258 + }, + { + "epoch": 0.4249782823895432, + "grad_norm": 2.892573118209839, + "learning_rate": 9.151864104123439e-06, + "loss": 0.9474, + "step": 5259 + }, + { + "epoch": 0.42505909210287074, + "grad_norm": 2.885190010070801, + "learning_rate": 9.151499452616158e-06, + "loss": 1.0603, + "step": 5260 + }, + { + "epoch": 0.4251399018161983, + "grad_norm": 2.7721941471099854, + "learning_rate": 9.151134730003683e-06, + "loss": 1.0027, + "step": 5261 + }, + { + "epoch": 0.42522071152952584, + "grad_norm": 2.463444709777832, + "learning_rate": 9.150769936292267e-06, + "loss": 0.8949, + "step": 5262 + }, + { + "epoch": 0.42530152124285336, + "grad_norm": 2.463534355163574, + "learning_rate": 9.150405071488153e-06, + "loss": 1.0008, + "step": 5263 + }, + { + "epoch": 0.42538233095618094, + "grad_norm": 2.7191054821014404, + "learning_rate": 9.150040135597591e-06, + "loss": 1.0674, + "step": 5264 + }, + { + "epoch": 0.42546314066950847, + "grad_norm": 2.578448534011841, + "learning_rate": 9.149675128626833e-06, + "loss": 1.0489, + "step": 5265 + }, + { + "epoch": 0.425543950382836, + "grad_norm": 2.3779213428497314, + "learning_rate": 9.149310050582129e-06, + "loss": 0.8996, + "step": 5266 + }, + { + "epoch": 0.42562476009616357, + "grad_norm": 3.0681824684143066, + "learning_rate": 9.148944901469736e-06, + "loss": 0.8802, + "step": 5267 + }, + { + "epoch": 0.4257055698094911, + "grad_norm": 3.037339210510254, + "learning_rate": 9.148579681295901e-06, + "loss": 0.9154, + "step": 5268 + }, + { + "epoch": 0.42578637952281867, + "grad_norm": 3.0940604209899902, + "learning_rate": 9.148214390066885e-06, + "loss": 0.8757, + "step": 5269 + }, + { + "epoch": 0.4258671892361462, + "grad_norm": 2.309497833251953, + "learning_rate": 9.147849027788943e-06, + "loss": 0.843, + "step": 5270 + }, + { + "epoch": 0.4259479989494737, + "grad_norm": 2.5686209201812744, + "learning_rate": 9.147483594468334e-06, + "loss": 0.9062, + "step": 5271 + }, + { + "epoch": 0.4260288086628013, + "grad_norm": 3.318406820297241, + "learning_rate": 9.147118090111316e-06, + "loss": 0.9611, + "step": 5272 + }, + { + "epoch": 0.4261096183761288, + "grad_norm": 2.794062852859497, + "learning_rate": 9.146752514724147e-06, + "loss": 1.0105, + "step": 5273 + }, + { + "epoch": 0.42619042808945634, + "grad_norm": 2.572922468185425, + "learning_rate": 9.146386868313091e-06, + "loss": 0.9437, + "step": 5274 + }, + { + "epoch": 0.4262712378027839, + "grad_norm": 2.977999210357666, + "learning_rate": 9.14602115088441e-06, + "loss": 0.9243, + "step": 5275 + }, + { + "epoch": 0.42635204751611144, + "grad_norm": 2.479283094406128, + "learning_rate": 9.145655362444366e-06, + "loss": 0.8979, + "step": 5276 + }, + { + "epoch": 0.42643285722943897, + "grad_norm": 3.1657676696777344, + "learning_rate": 9.145289502999228e-06, + "loss": 0.9842, + "step": 5277 + }, + { + "epoch": 0.42651366694276655, + "grad_norm": 3.2508034706115723, + "learning_rate": 9.14492357255526e-06, + "loss": 1.0106, + "step": 5278 + }, + { + "epoch": 0.42659447665609407, + "grad_norm": 2.5886282920837402, + "learning_rate": 9.144557571118729e-06, + "loss": 1.1737, + "step": 5279 + }, + { + "epoch": 0.4266752863694216, + "grad_norm": 3.0532402992248535, + "learning_rate": 9.144191498695904e-06, + "loss": 0.9356, + "step": 5280 + }, + { + "epoch": 0.42675609608274917, + "grad_norm": 3.0943808555603027, + "learning_rate": 9.143825355293058e-06, + "loss": 0.954, + "step": 5281 + }, + { + "epoch": 0.4268369057960767, + "grad_norm": 2.315962314605713, + "learning_rate": 9.143459140916456e-06, + "loss": 0.9484, + "step": 5282 + }, + { + "epoch": 0.4269177155094042, + "grad_norm": 2.507899522781372, + "learning_rate": 9.143092855572375e-06, + "loss": 0.9995, + "step": 5283 + }, + { + "epoch": 0.4269985252227318, + "grad_norm": 2.505897045135498, + "learning_rate": 9.14272649926709e-06, + "loss": 0.917, + "step": 5284 + }, + { + "epoch": 0.4270793349360593, + "grad_norm": 2.5773026943206787, + "learning_rate": 9.14236007200687e-06, + "loss": 0.8165, + "step": 5285 + }, + { + "epoch": 0.42716014464938684, + "grad_norm": 2.3543965816497803, + "learning_rate": 9.141993573797997e-06, + "loss": 1.1064, + "step": 5286 + }, + { + "epoch": 0.4272409543627144, + "grad_norm": 2.9803483486175537, + "learning_rate": 9.141627004646743e-06, + "loss": 1.0708, + "step": 5287 + }, + { + "epoch": 0.42732176407604194, + "grad_norm": 2.6834347248077393, + "learning_rate": 9.14126036455939e-06, + "loss": 1.0106, + "step": 5288 + }, + { + "epoch": 0.42740257378936947, + "grad_norm": 2.5566248893737793, + "learning_rate": 9.140893653542216e-06, + "loss": 1.0089, + "step": 5289 + }, + { + "epoch": 0.42748338350269705, + "grad_norm": 3.1436350345611572, + "learning_rate": 9.140526871601503e-06, + "loss": 0.9301, + "step": 5290 + }, + { + "epoch": 0.42756419321602457, + "grad_norm": 2.8564767837524414, + "learning_rate": 9.140160018743533e-06, + "loss": 1.0127, + "step": 5291 + }, + { + "epoch": 0.4276450029293521, + "grad_norm": 2.2093665599823, + "learning_rate": 9.13979309497459e-06, + "loss": 1.0052, + "step": 5292 + }, + { + "epoch": 0.42772581264267967, + "grad_norm": 2.702512264251709, + "learning_rate": 9.139426100300956e-06, + "loss": 0.9751, + "step": 5293 + }, + { + "epoch": 0.4278066223560072, + "grad_norm": 2.3560664653778076, + "learning_rate": 9.139059034728918e-06, + "loss": 0.9461, + "step": 5294 + }, + { + "epoch": 0.4278874320693347, + "grad_norm": 2.5384583473205566, + "learning_rate": 9.138691898264762e-06, + "loss": 0.9219, + "step": 5295 + }, + { + "epoch": 0.4279682417826623, + "grad_norm": 2.374020576477051, + "learning_rate": 9.13832469091478e-06, + "loss": 1.0367, + "step": 5296 + }, + { + "epoch": 0.4280490514959898, + "grad_norm": 2.316286563873291, + "learning_rate": 9.137957412685257e-06, + "loss": 0.9588, + "step": 5297 + }, + { + "epoch": 0.42812986120931734, + "grad_norm": 3.2077367305755615, + "learning_rate": 9.137590063582486e-06, + "loss": 1.0781, + "step": 5298 + }, + { + "epoch": 0.4282106709226449, + "grad_norm": 2.6625146865844727, + "learning_rate": 9.137222643612757e-06, + "loss": 0.9954, + "step": 5299 + }, + { + "epoch": 0.42829148063597244, + "grad_norm": 2.5936596393585205, + "learning_rate": 9.136855152782364e-06, + "loss": 0.8905, + "step": 5300 + }, + { + "epoch": 0.42837229034929997, + "grad_norm": 2.6800479888916016, + "learning_rate": 9.136487591097603e-06, + "loss": 0.9877, + "step": 5301 + }, + { + "epoch": 0.42845310006262755, + "grad_norm": 2.628873348236084, + "learning_rate": 9.136119958564766e-06, + "loss": 0.984, + "step": 5302 + }, + { + "epoch": 0.42853390977595507, + "grad_norm": 2.3615784645080566, + "learning_rate": 9.135752255190153e-06, + "loss": 1.0313, + "step": 5303 + }, + { + "epoch": 0.4286147194892826, + "grad_norm": 3.0959222316741943, + "learning_rate": 9.13538448098006e-06, + "loss": 0.9252, + "step": 5304 + }, + { + "epoch": 0.42869552920261017, + "grad_norm": 2.471327304840088, + "learning_rate": 9.135016635940785e-06, + "loss": 1.0587, + "step": 5305 + }, + { + "epoch": 0.4287763389159377, + "grad_norm": 3.0087757110595703, + "learning_rate": 9.134648720078631e-06, + "loss": 0.8549, + "step": 5306 + }, + { + "epoch": 0.4288571486292652, + "grad_norm": 3.0245444774627686, + "learning_rate": 9.134280733399898e-06, + "loss": 0.9498, + "step": 5307 + }, + { + "epoch": 0.4289379583425928, + "grad_norm": 2.6729087829589844, + "learning_rate": 9.13391267591089e-06, + "loss": 0.9431, + "step": 5308 + }, + { + "epoch": 0.4290187680559203, + "grad_norm": 2.839160919189453, + "learning_rate": 9.133544547617907e-06, + "loss": 0.8525, + "step": 5309 + }, + { + "epoch": 0.42909957776924784, + "grad_norm": 2.6436760425567627, + "learning_rate": 9.133176348527258e-06, + "loss": 0.9851, + "step": 5310 + }, + { + "epoch": 0.4291803874825754, + "grad_norm": 2.847926378250122, + "learning_rate": 9.13280807864525e-06, + "loss": 0.9391, + "step": 5311 + }, + { + "epoch": 0.42926119719590294, + "grad_norm": 2.718937397003174, + "learning_rate": 9.132439737978186e-06, + "loss": 0.902, + "step": 5312 + }, + { + "epoch": 0.42934200690923047, + "grad_norm": 2.855121612548828, + "learning_rate": 9.132071326532381e-06, + "loss": 1.0452, + "step": 5313 + }, + { + "epoch": 0.42942281662255805, + "grad_norm": 2.9336607456207275, + "learning_rate": 9.131702844314139e-06, + "loss": 0.8561, + "step": 5314 + }, + { + "epoch": 0.42950362633588557, + "grad_norm": 2.779228448867798, + "learning_rate": 9.131334291329777e-06, + "loss": 1.0408, + "step": 5315 + }, + { + "epoch": 0.4295844360492131, + "grad_norm": 2.922060251235962, + "learning_rate": 9.130965667585603e-06, + "loss": 0.935, + "step": 5316 + }, + { + "epoch": 0.42966524576254067, + "grad_norm": 2.9307093620300293, + "learning_rate": 9.13059697308793e-06, + "loss": 1.0235, + "step": 5317 + }, + { + "epoch": 0.4297460554758682, + "grad_norm": 2.2900094985961914, + "learning_rate": 9.130228207843077e-06, + "loss": 0.9313, + "step": 5318 + }, + { + "epoch": 0.4298268651891957, + "grad_norm": 2.5423219203948975, + "learning_rate": 9.129859371857357e-06, + "loss": 0.9774, + "step": 5319 + }, + { + "epoch": 0.4299076749025233, + "grad_norm": 3.076057195663452, + "learning_rate": 9.129490465137088e-06, + "loss": 0.9863, + "step": 5320 + }, + { + "epoch": 0.4299884846158508, + "grad_norm": 2.477181911468506, + "learning_rate": 9.12912148768859e-06, + "loss": 0.8986, + "step": 5321 + }, + { + "epoch": 0.43006929432917834, + "grad_norm": 2.6529858112335205, + "learning_rate": 9.12875243951818e-06, + "loss": 0.9301, + "step": 5322 + }, + { + "epoch": 0.4301501040425059, + "grad_norm": 2.9500670433044434, + "learning_rate": 9.128383320632182e-06, + "loss": 1.0576, + "step": 5323 + }, + { + "epoch": 0.43023091375583344, + "grad_norm": 3.4155526161193848, + "learning_rate": 9.128014131036915e-06, + "loss": 1.0253, + "step": 5324 + }, + { + "epoch": 0.43031172346916097, + "grad_norm": 2.9432568550109863, + "learning_rate": 9.127644870738703e-06, + "loss": 0.9619, + "step": 5325 + }, + { + "epoch": 0.43039253318248855, + "grad_norm": 2.53903865814209, + "learning_rate": 9.127275539743873e-06, + "loss": 1.0469, + "step": 5326 + }, + { + "epoch": 0.43047334289581607, + "grad_norm": 2.6842873096466064, + "learning_rate": 9.12690613805875e-06, + "loss": 0.97, + "step": 5327 + }, + { + "epoch": 0.4305541526091436, + "grad_norm": 2.5515427589416504, + "learning_rate": 9.126536665689656e-06, + "loss": 0.96, + "step": 5328 + }, + { + "epoch": 0.43063496232247117, + "grad_norm": 2.4751298427581787, + "learning_rate": 9.126167122642926e-06, + "loss": 0.976, + "step": 5329 + }, + { + "epoch": 0.4307157720357987, + "grad_norm": 2.612642765045166, + "learning_rate": 9.125797508924886e-06, + "loss": 0.9744, + "step": 5330 + }, + { + "epoch": 0.4307965817491262, + "grad_norm": 3.090175151824951, + "learning_rate": 9.125427824541867e-06, + "loss": 0.9953, + "step": 5331 + }, + { + "epoch": 0.4308773914624538, + "grad_norm": 3.0229883193969727, + "learning_rate": 9.1250580695002e-06, + "loss": 1.028, + "step": 5332 + }, + { + "epoch": 0.4309582011757813, + "grad_norm": 2.398843765258789, + "learning_rate": 9.124688243806221e-06, + "loss": 1.0018, + "step": 5333 + }, + { + "epoch": 0.4310390108891089, + "grad_norm": 2.757558584213257, + "learning_rate": 9.124318347466262e-06, + "loss": 0.8998, + "step": 5334 + }, + { + "epoch": 0.4311198206024364, + "grad_norm": 3.0316805839538574, + "learning_rate": 9.123948380486657e-06, + "loss": 1.109, + "step": 5335 + }, + { + "epoch": 0.43120063031576394, + "grad_norm": 3.376316547393799, + "learning_rate": 9.123578342873745e-06, + "loss": 1.1187, + "step": 5336 + }, + { + "epoch": 0.4312814400290915, + "grad_norm": 2.987802505493164, + "learning_rate": 9.123208234633862e-06, + "loss": 1.0732, + "step": 5337 + }, + { + "epoch": 0.43136224974241905, + "grad_norm": 3.3925554752349854, + "learning_rate": 9.12283805577335e-06, + "loss": 0.9825, + "step": 5338 + }, + { + "epoch": 0.43144305945574657, + "grad_norm": 2.5645298957824707, + "learning_rate": 9.122467806298546e-06, + "loss": 1.018, + "step": 5339 + }, + { + "epoch": 0.43152386916907415, + "grad_norm": 2.698145866394043, + "learning_rate": 9.122097486215793e-06, + "loss": 0.9105, + "step": 5340 + }, + { + "epoch": 0.43160467888240167, + "grad_norm": 2.5226173400878906, + "learning_rate": 9.121727095531435e-06, + "loss": 0.9614, + "step": 5341 + }, + { + "epoch": 0.4316854885957292, + "grad_norm": 3.4001269340515137, + "learning_rate": 9.121356634251813e-06, + "loss": 0.9106, + "step": 5342 + }, + { + "epoch": 0.4317662983090568, + "grad_norm": 2.5968616008758545, + "learning_rate": 9.120986102383274e-06, + "loss": 0.976, + "step": 5343 + }, + { + "epoch": 0.4318471080223843, + "grad_norm": 2.5773744583129883, + "learning_rate": 9.120615499932166e-06, + "loss": 0.9412, + "step": 5344 + }, + { + "epoch": 0.4319279177357118, + "grad_norm": 2.390458583831787, + "learning_rate": 9.120244826904832e-06, + "loss": 1.0266, + "step": 5345 + }, + { + "epoch": 0.4320087274490394, + "grad_norm": 2.7882652282714844, + "learning_rate": 9.119874083307624e-06, + "loss": 1.0087, + "step": 5346 + }, + { + "epoch": 0.4320895371623669, + "grad_norm": 2.5810043811798096, + "learning_rate": 9.11950326914689e-06, + "loss": 0.9264, + "step": 5347 + }, + { + "epoch": 0.43217034687569444, + "grad_norm": 2.6967110633850098, + "learning_rate": 9.119132384428984e-06, + "loss": 0.9351, + "step": 5348 + }, + { + "epoch": 0.432251156589022, + "grad_norm": 2.826792001724243, + "learning_rate": 9.118761429160256e-06, + "loss": 1.0107, + "step": 5349 + }, + { + "epoch": 0.43233196630234955, + "grad_norm": 2.733144760131836, + "learning_rate": 9.118390403347059e-06, + "loss": 1.1316, + "step": 5350 + }, + { + "epoch": 0.43241277601567707, + "grad_norm": 2.64494252204895, + "learning_rate": 9.118019306995752e-06, + "loss": 0.9722, + "step": 5351 + }, + { + "epoch": 0.43249358572900465, + "grad_norm": 2.6184942722320557, + "learning_rate": 9.117648140112685e-06, + "loss": 0.9555, + "step": 5352 + }, + { + "epoch": 0.43257439544233217, + "grad_norm": 3.931938409805298, + "learning_rate": 9.11727690270422e-06, + "loss": 0.9374, + "step": 5353 + }, + { + "epoch": 0.4326552051556597, + "grad_norm": 2.304018497467041, + "learning_rate": 9.116905594776713e-06, + "loss": 0.9416, + "step": 5354 + }, + { + "epoch": 0.4327360148689873, + "grad_norm": 2.753777265548706, + "learning_rate": 9.116534216336524e-06, + "loss": 1.0664, + "step": 5355 + }, + { + "epoch": 0.4328168245823148, + "grad_norm": 2.5695817470550537, + "learning_rate": 9.116162767390014e-06, + "loss": 0.9207, + "step": 5356 + }, + { + "epoch": 0.4328976342956423, + "grad_norm": 2.5012948513031006, + "learning_rate": 9.115791247943546e-06, + "loss": 0.9799, + "step": 5357 + }, + { + "epoch": 0.4329784440089699, + "grad_norm": 3.0362002849578857, + "learning_rate": 9.115419658003482e-06, + "loss": 1.0229, + "step": 5358 + }, + { + "epoch": 0.4330592537222974, + "grad_norm": 3.1641323566436768, + "learning_rate": 9.115047997576186e-06, + "loss": 0.8914, + "step": 5359 + }, + { + "epoch": 0.43314006343562494, + "grad_norm": 2.7601094245910645, + "learning_rate": 9.114676266668024e-06, + "loss": 0.8356, + "step": 5360 + }, + { + "epoch": 0.4332208731489525, + "grad_norm": 2.494013786315918, + "learning_rate": 9.114304465285363e-06, + "loss": 0.9652, + "step": 5361 + }, + { + "epoch": 0.43330168286228005, + "grad_norm": 2.597374677658081, + "learning_rate": 9.113932593434573e-06, + "loss": 0.9342, + "step": 5362 + }, + { + "epoch": 0.43338249257560757, + "grad_norm": 2.848135232925415, + "learning_rate": 9.11356065112202e-06, + "loss": 0.9799, + "step": 5363 + }, + { + "epoch": 0.43346330228893515, + "grad_norm": 2.5346763134002686, + "learning_rate": 9.113188638354078e-06, + "loss": 1.0089, + "step": 5364 + }, + { + "epoch": 0.43354411200226267, + "grad_norm": 2.888031244277954, + "learning_rate": 9.112816555137115e-06, + "loss": 0.8894, + "step": 5365 + }, + { + "epoch": 0.4336249217155902, + "grad_norm": 2.9543514251708984, + "learning_rate": 9.112444401477506e-06, + "loss": 0.8563, + "step": 5366 + }, + { + "epoch": 0.4337057314289178, + "grad_norm": 2.573028326034546, + "learning_rate": 9.112072177381625e-06, + "loss": 1.01, + "step": 5367 + }, + { + "epoch": 0.4337865411422453, + "grad_norm": 2.6583313941955566, + "learning_rate": 9.111699882855846e-06, + "loss": 0.9755, + "step": 5368 + }, + { + "epoch": 0.4338673508555728, + "grad_norm": 2.7696800231933594, + "learning_rate": 9.111327517906548e-06, + "loss": 0.9147, + "step": 5369 + }, + { + "epoch": 0.4339481605689004, + "grad_norm": 2.383000373840332, + "learning_rate": 9.110955082540108e-06, + "loss": 0.9502, + "step": 5370 + }, + { + "epoch": 0.4340289702822279, + "grad_norm": 2.257112979888916, + "learning_rate": 9.1105825767629e-06, + "loss": 0.9889, + "step": 5371 + }, + { + "epoch": 0.43410977999555544, + "grad_norm": 2.764824628829956, + "learning_rate": 9.110210000581312e-06, + "loss": 1.05, + "step": 5372 + }, + { + "epoch": 0.434190589708883, + "grad_norm": 2.6023004055023193, + "learning_rate": 9.109837354001721e-06, + "loss": 1.1121, + "step": 5373 + }, + { + "epoch": 0.43427139942221055, + "grad_norm": 2.582038640975952, + "learning_rate": 9.10946463703051e-06, + "loss": 0.948, + "step": 5374 + }, + { + "epoch": 0.43435220913553807, + "grad_norm": 3.0894858837127686, + "learning_rate": 9.109091849674063e-06, + "loss": 0.9489, + "step": 5375 + }, + { + "epoch": 0.43443301884886565, + "grad_norm": 2.3062944412231445, + "learning_rate": 9.108718991938764e-06, + "loss": 1.1621, + "step": 5376 + }, + { + "epoch": 0.43451382856219317, + "grad_norm": 2.9417057037353516, + "learning_rate": 9.108346063831002e-06, + "loss": 0.7992, + "step": 5377 + }, + { + "epoch": 0.4345946382755207, + "grad_norm": 2.533569097518921, + "learning_rate": 9.10797306535716e-06, + "loss": 0.9554, + "step": 5378 + }, + { + "epoch": 0.4346754479888483, + "grad_norm": 2.499107599258423, + "learning_rate": 9.10759999652363e-06, + "loss": 1.0426, + "step": 5379 + }, + { + "epoch": 0.4347562577021758, + "grad_norm": 2.9666755199432373, + "learning_rate": 9.1072268573368e-06, + "loss": 0.9048, + "step": 5380 + }, + { + "epoch": 0.4348370674155033, + "grad_norm": 2.65811824798584, + "learning_rate": 9.106853647803062e-06, + "loss": 0.9305, + "step": 5381 + }, + { + "epoch": 0.4349178771288309, + "grad_norm": 3.227161169052124, + "learning_rate": 9.106480367928808e-06, + "loss": 1.0654, + "step": 5382 + }, + { + "epoch": 0.4349986868421584, + "grad_norm": 2.56229567527771, + "learning_rate": 9.10610701772043e-06, + "loss": 1.1111, + "step": 5383 + }, + { + "epoch": 0.43507949655548595, + "grad_norm": 2.882650136947632, + "learning_rate": 9.105733597184327e-06, + "loss": 0.9176, + "step": 5384 + }, + { + "epoch": 0.4351603062688135, + "grad_norm": 2.5286736488342285, + "learning_rate": 9.10536010632689e-06, + "loss": 0.9715, + "step": 5385 + }, + { + "epoch": 0.43524111598214105, + "grad_norm": 2.5907533168792725, + "learning_rate": 9.104986545154516e-06, + "loss": 1.0406, + "step": 5386 + }, + { + "epoch": 0.43532192569546857, + "grad_norm": 2.4601783752441406, + "learning_rate": 9.104612913673607e-06, + "loss": 0.9509, + "step": 5387 + }, + { + "epoch": 0.43540273540879615, + "grad_norm": 2.891664981842041, + "learning_rate": 9.10423921189056e-06, + "loss": 1.0152, + "step": 5388 + }, + { + "epoch": 0.43548354512212367, + "grad_norm": 2.612304925918579, + "learning_rate": 9.103865439811775e-06, + "loss": 0.9979, + "step": 5389 + }, + { + "epoch": 0.4355643548354512, + "grad_norm": 2.445415735244751, + "learning_rate": 9.103491597443656e-06, + "loss": 0.9951, + "step": 5390 + }, + { + "epoch": 0.4356451645487788, + "grad_norm": 2.978505849838257, + "learning_rate": 9.103117684792605e-06, + "loss": 1.0978, + "step": 5391 + }, + { + "epoch": 0.4357259742621063, + "grad_norm": 2.5591397285461426, + "learning_rate": 9.102743701865023e-06, + "loss": 0.9738, + "step": 5392 + }, + { + "epoch": 0.4358067839754338, + "grad_norm": 2.7000041007995605, + "learning_rate": 9.102369648667319e-06, + "loss": 0.9148, + "step": 5393 + }, + { + "epoch": 0.4358875936887614, + "grad_norm": 2.5955159664154053, + "learning_rate": 9.101995525205901e-06, + "loss": 0.9332, + "step": 5394 + }, + { + "epoch": 0.4359684034020889, + "grad_norm": 2.9734086990356445, + "learning_rate": 9.101621331487174e-06, + "loss": 1.0102, + "step": 5395 + }, + { + "epoch": 0.4360492131154165, + "grad_norm": 2.4886577129364014, + "learning_rate": 9.101247067517547e-06, + "loss": 0.9779, + "step": 5396 + }, + { + "epoch": 0.436130022828744, + "grad_norm": 2.8928043842315674, + "learning_rate": 9.100872733303432e-06, + "loss": 0.909, + "step": 5397 + }, + { + "epoch": 0.43621083254207155, + "grad_norm": 2.512948751449585, + "learning_rate": 9.10049832885124e-06, + "loss": 0.9903, + "step": 5398 + }, + { + "epoch": 0.4362916422553991, + "grad_norm": 2.312678575515747, + "learning_rate": 9.100123854167381e-06, + "loss": 1.0323, + "step": 5399 + }, + { + "epoch": 0.43637245196872665, + "grad_norm": 2.5982494354248047, + "learning_rate": 9.099749309258273e-06, + "loss": 1.0086, + "step": 5400 + }, + { + "epoch": 0.43645326168205417, + "grad_norm": 2.693448781967163, + "learning_rate": 9.099374694130329e-06, + "loss": 0.9778, + "step": 5401 + }, + { + "epoch": 0.43653407139538175, + "grad_norm": 2.573061227798462, + "learning_rate": 9.099000008789965e-06, + "loss": 1.0132, + "step": 5402 + }, + { + "epoch": 0.4366148811087093, + "grad_norm": 3.0626840591430664, + "learning_rate": 9.098625253243598e-06, + "loss": 0.9552, + "step": 5403 + }, + { + "epoch": 0.4366956908220368, + "grad_norm": 2.446549654006958, + "learning_rate": 9.098250427497648e-06, + "loss": 0.8148, + "step": 5404 + }, + { + "epoch": 0.4367765005353644, + "grad_norm": 2.5462405681610107, + "learning_rate": 9.097875531558534e-06, + "loss": 0.9274, + "step": 5405 + }, + { + "epoch": 0.4368573102486919, + "grad_norm": 2.6182477474212646, + "learning_rate": 9.097500565432677e-06, + "loss": 0.8131, + "step": 5406 + }, + { + "epoch": 0.4369381199620194, + "grad_norm": 2.399460792541504, + "learning_rate": 9.097125529126501e-06, + "loss": 0.994, + "step": 5407 + }, + { + "epoch": 0.437018929675347, + "grad_norm": 2.7606916427612305, + "learning_rate": 9.096750422646427e-06, + "loss": 1.0381, + "step": 5408 + }, + { + "epoch": 0.4370997393886745, + "grad_norm": 2.4011104106903076, + "learning_rate": 9.096375245998883e-06, + "loss": 0.9902, + "step": 5409 + }, + { + "epoch": 0.43718054910200205, + "grad_norm": 2.7909364700317383, + "learning_rate": 9.09599999919029e-06, + "loss": 0.8746, + "step": 5410 + }, + { + "epoch": 0.4372613588153296, + "grad_norm": 2.861340045928955, + "learning_rate": 9.095624682227079e-06, + "loss": 1.0243, + "step": 5411 + }, + { + "epoch": 0.43734216852865715, + "grad_norm": 2.7483179569244385, + "learning_rate": 9.095249295115677e-06, + "loss": 0.843, + "step": 5412 + }, + { + "epoch": 0.4374229782419847, + "grad_norm": 2.6070587635040283, + "learning_rate": 9.094873837862512e-06, + "loss": 0.8971, + "step": 5413 + }, + { + "epoch": 0.43750378795531225, + "grad_norm": 2.604743719100952, + "learning_rate": 9.094498310474018e-06, + "loss": 1.0212, + "step": 5414 + }, + { + "epoch": 0.4375845976686398, + "grad_norm": 2.5673041343688965, + "learning_rate": 9.094122712956624e-06, + "loss": 0.8909, + "step": 5415 + }, + { + "epoch": 0.4376654073819673, + "grad_norm": 2.5342977046966553, + "learning_rate": 9.093747045316765e-06, + "loss": 0.8853, + "step": 5416 + }, + { + "epoch": 0.4377462170952949, + "grad_norm": 2.943723440170288, + "learning_rate": 9.093371307560874e-06, + "loss": 0.9714, + "step": 5417 + }, + { + "epoch": 0.4378270268086224, + "grad_norm": 2.836357355117798, + "learning_rate": 9.092995499695387e-06, + "loss": 1.0187, + "step": 5418 + }, + { + "epoch": 0.4379078365219499, + "grad_norm": 2.8180837631225586, + "learning_rate": 9.092619621726739e-06, + "loss": 0.954, + "step": 5419 + }, + { + "epoch": 0.4379886462352775, + "grad_norm": 2.809494972229004, + "learning_rate": 9.092243673661371e-06, + "loss": 1.0528, + "step": 5420 + }, + { + "epoch": 0.438069455948605, + "grad_norm": 2.688429117202759, + "learning_rate": 9.091867655505721e-06, + "loss": 0.9568, + "step": 5421 + }, + { + "epoch": 0.43815026566193255, + "grad_norm": 2.4685428142547607, + "learning_rate": 9.091491567266228e-06, + "loss": 0.9759, + "step": 5422 + }, + { + "epoch": 0.4382310753752601, + "grad_norm": 2.688058614730835, + "learning_rate": 9.091115408949334e-06, + "loss": 1.0722, + "step": 5423 + }, + { + "epoch": 0.43831188508858765, + "grad_norm": 2.458096742630005, + "learning_rate": 9.090739180561482e-06, + "loss": 0.8386, + "step": 5424 + }, + { + "epoch": 0.4383926948019152, + "grad_norm": 3.4665586948394775, + "learning_rate": 9.090362882109118e-06, + "loss": 0.8698, + "step": 5425 + }, + { + "epoch": 0.43847350451524275, + "grad_norm": 2.9267451763153076, + "learning_rate": 9.089986513598683e-06, + "loss": 0.8727, + "step": 5426 + }, + { + "epoch": 0.4385543142285703, + "grad_norm": 2.9240384101867676, + "learning_rate": 9.089610075036625e-06, + "loss": 1.1244, + "step": 5427 + }, + { + "epoch": 0.4386351239418978, + "grad_norm": 2.644961357116699, + "learning_rate": 9.089233566429393e-06, + "loss": 0.9061, + "step": 5428 + }, + { + "epoch": 0.4387159336552254, + "grad_norm": 3.047579526901245, + "learning_rate": 9.088856987783435e-06, + "loss": 0.9648, + "step": 5429 + }, + { + "epoch": 0.4387967433685529, + "grad_norm": 2.378953695297241, + "learning_rate": 9.088480339105198e-06, + "loss": 0.915, + "step": 5430 + }, + { + "epoch": 0.4388775530818804, + "grad_norm": 2.9312777519226074, + "learning_rate": 9.088103620401136e-06, + "loss": 0.9505, + "step": 5431 + }, + { + "epoch": 0.438958362795208, + "grad_norm": 3.608285665512085, + "learning_rate": 9.087726831677702e-06, + "loss": 0.9127, + "step": 5432 + }, + { + "epoch": 0.4390391725085355, + "grad_norm": 2.4746925830841064, + "learning_rate": 9.087349972941348e-06, + "loss": 0.882, + "step": 5433 + }, + { + "epoch": 0.43911998222186305, + "grad_norm": 2.8620457649230957, + "learning_rate": 9.086973044198529e-06, + "loss": 0.944, + "step": 5434 + }, + { + "epoch": 0.4392007919351906, + "grad_norm": 3.0637123584747314, + "learning_rate": 9.086596045455699e-06, + "loss": 1.0807, + "step": 5435 + }, + { + "epoch": 0.43928160164851815, + "grad_norm": 2.63630747795105, + "learning_rate": 9.086218976719318e-06, + "loss": 0.9779, + "step": 5436 + }, + { + "epoch": 0.4393624113618457, + "grad_norm": 2.4325077533721924, + "learning_rate": 9.085841837995843e-06, + "loss": 1.0537, + "step": 5437 + }, + { + "epoch": 0.43944322107517325, + "grad_norm": 3.2006518840789795, + "learning_rate": 9.085464629291733e-06, + "loss": 0.9899, + "step": 5438 + }, + { + "epoch": 0.4395240307885008, + "grad_norm": 2.8442983627319336, + "learning_rate": 9.08508735061345e-06, + "loss": 0.9736, + "step": 5439 + }, + { + "epoch": 0.4396048405018283, + "grad_norm": 2.949164867401123, + "learning_rate": 9.084710001967455e-06, + "loss": 0.9632, + "step": 5440 + }, + { + "epoch": 0.4396856502151559, + "grad_norm": 2.717405080795288, + "learning_rate": 9.084332583360211e-06, + "loss": 0.9216, + "step": 5441 + }, + { + "epoch": 0.4397664599284834, + "grad_norm": 2.8537914752960205, + "learning_rate": 9.083955094798183e-06, + "loss": 0.9337, + "step": 5442 + }, + { + "epoch": 0.4398472696418109, + "grad_norm": 2.6488921642303467, + "learning_rate": 9.083577536287836e-06, + "loss": 0.9685, + "step": 5443 + }, + { + "epoch": 0.4399280793551385, + "grad_norm": 2.626530408859253, + "learning_rate": 9.083199907835636e-06, + "loss": 1.0865, + "step": 5444 + }, + { + "epoch": 0.440008889068466, + "grad_norm": 2.767813205718994, + "learning_rate": 9.082822209448052e-06, + "loss": 0.9972, + "step": 5445 + }, + { + "epoch": 0.44008969878179355, + "grad_norm": 2.9953784942626953, + "learning_rate": 9.082444441131552e-06, + "loss": 0.961, + "step": 5446 + }, + { + "epoch": 0.4401705084951211, + "grad_norm": 2.9615743160247803, + "learning_rate": 9.082066602892606e-06, + "loss": 1.048, + "step": 5447 + }, + { + "epoch": 0.44025131820844865, + "grad_norm": 2.9631004333496094, + "learning_rate": 9.081688694737687e-06, + "loss": 0.9233, + "step": 5448 + }, + { + "epoch": 0.4403321279217762, + "grad_norm": 3.297285795211792, + "learning_rate": 9.081310716673268e-06, + "loss": 0.8017, + "step": 5449 + }, + { + "epoch": 0.44041293763510375, + "grad_norm": 2.679313898086548, + "learning_rate": 9.08093266870582e-06, + "loss": 0.9945, + "step": 5450 + }, + { + "epoch": 0.4404937473484313, + "grad_norm": 2.6820168495178223, + "learning_rate": 9.08055455084182e-06, + "loss": 0.9967, + "step": 5451 + }, + { + "epoch": 0.4405745570617588, + "grad_norm": 2.633715867996216, + "learning_rate": 9.080176363087746e-06, + "loss": 1.0125, + "step": 5452 + }, + { + "epoch": 0.4406553667750864, + "grad_norm": 2.7603092193603516, + "learning_rate": 9.079798105450073e-06, + "loss": 0.9295, + "step": 5453 + }, + { + "epoch": 0.4407361764884139, + "grad_norm": 2.804338216781616, + "learning_rate": 9.07941977793528e-06, + "loss": 0.9907, + "step": 5454 + }, + { + "epoch": 0.4408169862017414, + "grad_norm": 2.5634734630584717, + "learning_rate": 9.079041380549846e-06, + "loss": 0.8885, + "step": 5455 + }, + { + "epoch": 0.440897795915069, + "grad_norm": 2.8201401233673096, + "learning_rate": 9.078662913300254e-06, + "loss": 0.8524, + "step": 5456 + }, + { + "epoch": 0.4409786056283965, + "grad_norm": 2.7683045864105225, + "learning_rate": 9.078284376192985e-06, + "loss": 0.9688, + "step": 5457 + }, + { + "epoch": 0.44105941534172405, + "grad_norm": 2.4153308868408203, + "learning_rate": 9.077905769234521e-06, + "loss": 0.9944, + "step": 5458 + }, + { + "epoch": 0.4411402250550516, + "grad_norm": 2.8334877490997314, + "learning_rate": 9.07752709243135e-06, + "loss": 1.0495, + "step": 5459 + }, + { + "epoch": 0.44122103476837915, + "grad_norm": 2.699657678604126, + "learning_rate": 9.077148345789957e-06, + "loss": 0.9845, + "step": 5460 + }, + { + "epoch": 0.44130184448170673, + "grad_norm": 3.2312214374542236, + "learning_rate": 9.076769529316828e-06, + "loss": 0.9549, + "step": 5461 + }, + { + "epoch": 0.44138265419503425, + "grad_norm": 2.6487627029418945, + "learning_rate": 9.07639064301845e-06, + "loss": 1.0104, + "step": 5462 + }, + { + "epoch": 0.4414634639083618, + "grad_norm": 2.9800760746002197, + "learning_rate": 9.076011686901314e-06, + "loss": 1.0576, + "step": 5463 + }, + { + "epoch": 0.44154427362168935, + "grad_norm": 2.6835033893585205, + "learning_rate": 9.075632660971912e-06, + "loss": 0.9986, + "step": 5464 + }, + { + "epoch": 0.4416250833350169, + "grad_norm": 3.2633471488952637, + "learning_rate": 9.075253565236733e-06, + "loss": 1.005, + "step": 5465 + }, + { + "epoch": 0.4417058930483444, + "grad_norm": 2.7454686164855957, + "learning_rate": 9.07487439970227e-06, + "loss": 1.0678, + "step": 5466 + }, + { + "epoch": 0.441786702761672, + "grad_norm": 3.266718864440918, + "learning_rate": 9.07449516437502e-06, + "loss": 0.9595, + "step": 5467 + }, + { + "epoch": 0.4418675124749995, + "grad_norm": 2.8997883796691895, + "learning_rate": 9.074115859261477e-06, + "loss": 0.8997, + "step": 5468 + }, + { + "epoch": 0.441948322188327, + "grad_norm": 2.9768147468566895, + "learning_rate": 9.073736484368136e-06, + "loss": 1.0163, + "step": 5469 + }, + { + "epoch": 0.4420291319016546, + "grad_norm": 2.6398308277130127, + "learning_rate": 9.073357039701497e-06, + "loss": 0.9092, + "step": 5470 + }, + { + "epoch": 0.4421099416149821, + "grad_norm": 2.6662533283233643, + "learning_rate": 9.072977525268058e-06, + "loss": 0.9313, + "step": 5471 + }, + { + "epoch": 0.44219075132830965, + "grad_norm": 2.9424004554748535, + "learning_rate": 9.07259794107432e-06, + "loss": 1.0139, + "step": 5472 + }, + { + "epoch": 0.44227156104163723, + "grad_norm": 2.9376235008239746, + "learning_rate": 9.072218287126781e-06, + "loss": 0.8727, + "step": 5473 + }, + { + "epoch": 0.44235237075496475, + "grad_norm": 2.708012342453003, + "learning_rate": 9.07183856343195e-06, + "loss": 1.0118, + "step": 5474 + }, + { + "epoch": 0.4424331804682923, + "grad_norm": 2.6264543533325195, + "learning_rate": 9.071458769996323e-06, + "loss": 1.0235, + "step": 5475 + }, + { + "epoch": 0.44251399018161985, + "grad_norm": 2.2916948795318604, + "learning_rate": 9.071078906826413e-06, + "loss": 0.9985, + "step": 5476 + }, + { + "epoch": 0.4425947998949474, + "grad_norm": 2.640254259109497, + "learning_rate": 9.07069897392872e-06, + "loss": 0.9172, + "step": 5477 + }, + { + "epoch": 0.4426756096082749, + "grad_norm": 2.9489004611968994, + "learning_rate": 9.070318971309753e-06, + "loss": 1.0118, + "step": 5478 + }, + { + "epoch": 0.4427564193216025, + "grad_norm": 2.5072970390319824, + "learning_rate": 9.069938898976021e-06, + "loss": 0.9398, + "step": 5479 + }, + { + "epoch": 0.44283722903493, + "grad_norm": 2.9356276988983154, + "learning_rate": 9.069558756934035e-06, + "loss": 0.9477, + "step": 5480 + }, + { + "epoch": 0.4429180387482575, + "grad_norm": 2.9397525787353516, + "learning_rate": 9.069178545190303e-06, + "loss": 0.863, + "step": 5481 + }, + { + "epoch": 0.4429988484615851, + "grad_norm": 2.6035776138305664, + "learning_rate": 9.06879826375134e-06, + "loss": 0.9092, + "step": 5482 + }, + { + "epoch": 0.4430796581749126, + "grad_norm": 2.726120948791504, + "learning_rate": 9.068417912623658e-06, + "loss": 0.9508, + "step": 5483 + }, + { + "epoch": 0.44316046788824015, + "grad_norm": 2.9055328369140625, + "learning_rate": 9.06803749181377e-06, + "loss": 0.9201, + "step": 5484 + }, + { + "epoch": 0.44324127760156773, + "grad_norm": 2.7960684299468994, + "learning_rate": 9.067657001328192e-06, + "loss": 1.0273, + "step": 5485 + }, + { + "epoch": 0.44332208731489525, + "grad_norm": 2.568148374557495, + "learning_rate": 9.067276441173444e-06, + "loss": 0.9686, + "step": 5486 + }, + { + "epoch": 0.4434028970282228, + "grad_norm": 2.7288520336151123, + "learning_rate": 9.066895811356042e-06, + "loss": 1.0194, + "step": 5487 + }, + { + "epoch": 0.44348370674155035, + "grad_norm": 2.4051589965820312, + "learning_rate": 9.066515111882506e-06, + "loss": 1.0438, + "step": 5488 + }, + { + "epoch": 0.4435645164548779, + "grad_norm": 2.540262222290039, + "learning_rate": 9.066134342759355e-06, + "loss": 0.9808, + "step": 5489 + }, + { + "epoch": 0.4436453261682054, + "grad_norm": 2.9549736976623535, + "learning_rate": 9.065753503993111e-06, + "loss": 0.9709, + "step": 5490 + }, + { + "epoch": 0.443726135881533, + "grad_norm": 2.5096869468688965, + "learning_rate": 9.0653725955903e-06, + "loss": 0.9466, + "step": 5491 + }, + { + "epoch": 0.4438069455948605, + "grad_norm": 3.1507253646850586, + "learning_rate": 9.064991617557442e-06, + "loss": 1.0093, + "step": 5492 + }, + { + "epoch": 0.443887755308188, + "grad_norm": 2.5430915355682373, + "learning_rate": 9.064610569901062e-06, + "loss": 1.017, + "step": 5493 + }, + { + "epoch": 0.4439685650215156, + "grad_norm": 2.9738543033599854, + "learning_rate": 9.06422945262769e-06, + "loss": 0.9594, + "step": 5494 + }, + { + "epoch": 0.4440493747348431, + "grad_norm": 3.2727372646331787, + "learning_rate": 9.06384826574385e-06, + "loss": 0.9934, + "step": 5495 + }, + { + "epoch": 0.44413018444817065, + "grad_norm": 2.7948172092437744, + "learning_rate": 9.063467009256075e-06, + "loss": 1.0145, + "step": 5496 + }, + { + "epoch": 0.44421099416149823, + "grad_norm": 2.4964559078216553, + "learning_rate": 9.063085683170892e-06, + "loss": 1.1203, + "step": 5497 + }, + { + "epoch": 0.44429180387482575, + "grad_norm": 3.0438883304595947, + "learning_rate": 9.06270428749483e-06, + "loss": 1.0103, + "step": 5498 + }, + { + "epoch": 0.4443726135881533, + "grad_norm": 2.4090576171875, + "learning_rate": 9.062322822234426e-06, + "loss": 0.9486, + "step": 5499 + }, + { + "epoch": 0.44445342330148085, + "grad_norm": 2.6331546306610107, + "learning_rate": 9.061941287396211e-06, + "loss": 0.8768, + "step": 5500 + }, + { + "epoch": 0.4445342330148084, + "grad_norm": 2.906010627746582, + "learning_rate": 9.061559682986722e-06, + "loss": 0.9696, + "step": 5501 + }, + { + "epoch": 0.4446150427281359, + "grad_norm": 2.735092878341675, + "learning_rate": 9.061178009012492e-06, + "loss": 0.9804, + "step": 5502 + }, + { + "epoch": 0.4446958524414635, + "grad_norm": 2.8308145999908447, + "learning_rate": 9.06079626548006e-06, + "loss": 1.0371, + "step": 5503 + }, + { + "epoch": 0.444776662154791, + "grad_norm": 2.8933494091033936, + "learning_rate": 9.060414452395964e-06, + "loss": 0.9639, + "step": 5504 + }, + { + "epoch": 0.4448574718681185, + "grad_norm": 2.6455814838409424, + "learning_rate": 9.060032569766746e-06, + "loss": 0.9895, + "step": 5505 + }, + { + "epoch": 0.4449382815814461, + "grad_norm": 2.737823963165283, + "learning_rate": 9.059650617598941e-06, + "loss": 0.9732, + "step": 5506 + }, + { + "epoch": 0.4450190912947736, + "grad_norm": 2.6841917037963867, + "learning_rate": 9.059268595899095e-06, + "loss": 0.9628, + "step": 5507 + }, + { + "epoch": 0.44509990100810115, + "grad_norm": 2.488337516784668, + "learning_rate": 9.05888650467375e-06, + "loss": 1.0677, + "step": 5508 + }, + { + "epoch": 0.44518071072142873, + "grad_norm": 3.0691475868225098, + "learning_rate": 9.05850434392945e-06, + "loss": 0.994, + "step": 5509 + }, + { + "epoch": 0.44526152043475625, + "grad_norm": 2.6210825443267822, + "learning_rate": 9.058122113672742e-06, + "loss": 0.9076, + "step": 5510 + }, + { + "epoch": 0.4453423301480838, + "grad_norm": 3.26194429397583, + "learning_rate": 9.05773981391017e-06, + "loss": 0.9735, + "step": 5511 + }, + { + "epoch": 0.44542313986141135, + "grad_norm": 3.310988664627075, + "learning_rate": 9.057357444648287e-06, + "loss": 1.0247, + "step": 5512 + }, + { + "epoch": 0.4455039495747389, + "grad_norm": 3.0622739791870117, + "learning_rate": 9.056975005893638e-06, + "loss": 1.01, + "step": 5513 + }, + { + "epoch": 0.4455847592880664, + "grad_norm": 3.1104676723480225, + "learning_rate": 9.056592497652772e-06, + "loss": 0.8911, + "step": 5514 + }, + { + "epoch": 0.445665569001394, + "grad_norm": 2.6094138622283936, + "learning_rate": 9.056209919932243e-06, + "loss": 0.889, + "step": 5515 + }, + { + "epoch": 0.4457463787147215, + "grad_norm": 2.6728122234344482, + "learning_rate": 9.055827272738601e-06, + "loss": 1.0073, + "step": 5516 + }, + { + "epoch": 0.445827188428049, + "grad_norm": 2.468600034713745, + "learning_rate": 9.055444556078406e-06, + "loss": 1.021, + "step": 5517 + }, + { + "epoch": 0.4459079981413766, + "grad_norm": 2.840611457824707, + "learning_rate": 9.055061769958206e-06, + "loss": 0.9477, + "step": 5518 + }, + { + "epoch": 0.44598880785470413, + "grad_norm": 3.3472161293029785, + "learning_rate": 9.05467891438456e-06, + "loss": 0.9948, + "step": 5519 + }, + { + "epoch": 0.44606961756803165, + "grad_norm": 3.0067176818847656, + "learning_rate": 9.054295989364027e-06, + "loss": 0.9886, + "step": 5520 + }, + { + "epoch": 0.44615042728135923, + "grad_norm": 3.0566020011901855, + "learning_rate": 9.053912994903163e-06, + "loss": 0.9347, + "step": 5521 + }, + { + "epoch": 0.44623123699468675, + "grad_norm": 2.7558786869049072, + "learning_rate": 9.053529931008529e-06, + "loss": 0.9629, + "step": 5522 + }, + { + "epoch": 0.4463120467080143, + "grad_norm": 2.394063711166382, + "learning_rate": 9.053146797686685e-06, + "loss": 0.9981, + "step": 5523 + }, + { + "epoch": 0.44639285642134185, + "grad_norm": 2.907478094100952, + "learning_rate": 9.052763594944197e-06, + "loss": 0.8697, + "step": 5524 + }, + { + "epoch": 0.4464736661346694, + "grad_norm": 3.145693778991699, + "learning_rate": 9.052380322787622e-06, + "loss": 0.9103, + "step": 5525 + }, + { + "epoch": 0.44655447584799696, + "grad_norm": 2.6979012489318848, + "learning_rate": 9.051996981223527e-06, + "loss": 1.0811, + "step": 5526 + }, + { + "epoch": 0.4466352855613245, + "grad_norm": 2.6951351165771484, + "learning_rate": 9.051613570258481e-06, + "loss": 1.0288, + "step": 5527 + }, + { + "epoch": 0.446716095274652, + "grad_norm": 2.644151210784912, + "learning_rate": 9.051230089899048e-06, + "loss": 0.9223, + "step": 5528 + }, + { + "epoch": 0.4467969049879796, + "grad_norm": 2.6776485443115234, + "learning_rate": 9.050846540151796e-06, + "loss": 1.0171, + "step": 5529 + }, + { + "epoch": 0.4468777147013071, + "grad_norm": 3.01741361618042, + "learning_rate": 9.050462921023295e-06, + "loss": 0.9938, + "step": 5530 + }, + { + "epoch": 0.44695852441463463, + "grad_norm": 3.307741403579712, + "learning_rate": 9.050079232520115e-06, + "loss": 0.9213, + "step": 5531 + }, + { + "epoch": 0.4470393341279622, + "grad_norm": 2.58425235748291, + "learning_rate": 9.04969547464883e-06, + "loss": 1.0016, + "step": 5532 + }, + { + "epoch": 0.44712014384128973, + "grad_norm": 3.055528402328491, + "learning_rate": 9.049311647416006e-06, + "loss": 0.9972, + "step": 5533 + }, + { + "epoch": 0.44720095355461725, + "grad_norm": 2.4644417762756348, + "learning_rate": 9.048927750828225e-06, + "loss": 1.0032, + "step": 5534 + }, + { + "epoch": 0.44728176326794483, + "grad_norm": 2.6862640380859375, + "learning_rate": 9.048543784892058e-06, + "loss": 0.9424, + "step": 5535 + }, + { + "epoch": 0.44736257298127236, + "grad_norm": 2.4994943141937256, + "learning_rate": 9.048159749614084e-06, + "loss": 1.0303, + "step": 5536 + }, + { + "epoch": 0.4474433826945999, + "grad_norm": 2.360243797302246, + "learning_rate": 9.047775645000878e-06, + "loss": 1.034, + "step": 5537 + }, + { + "epoch": 0.44752419240792746, + "grad_norm": 2.4282078742980957, + "learning_rate": 9.047391471059021e-06, + "loss": 1.1387, + "step": 5538 + }, + { + "epoch": 0.447605002121255, + "grad_norm": 2.479712724685669, + "learning_rate": 9.04700722779509e-06, + "loss": 0.9426, + "step": 5539 + }, + { + "epoch": 0.4476858118345825, + "grad_norm": 2.675340414047241, + "learning_rate": 9.046622915215668e-06, + "loss": 1.0598, + "step": 5540 + }, + { + "epoch": 0.4477666215479101, + "grad_norm": 2.6983983516693115, + "learning_rate": 9.046238533327338e-06, + "loss": 1.0777, + "step": 5541 + }, + { + "epoch": 0.4478474312612376, + "grad_norm": 2.520012855529785, + "learning_rate": 9.045854082136683e-06, + "loss": 1.0144, + "step": 5542 + }, + { + "epoch": 0.44792824097456513, + "grad_norm": 2.781796455383301, + "learning_rate": 9.045469561650288e-06, + "loss": 0.9603, + "step": 5543 + }, + { + "epoch": 0.4480090506878927, + "grad_norm": 2.2349886894226074, + "learning_rate": 9.045084971874738e-06, + "loss": 1.0555, + "step": 5544 + }, + { + "epoch": 0.44808986040122023, + "grad_norm": 2.447571039199829, + "learning_rate": 9.044700312816621e-06, + "loss": 0.9906, + "step": 5545 + }, + { + "epoch": 0.44817067011454775, + "grad_norm": 2.8241076469421387, + "learning_rate": 9.044315584482524e-06, + "loss": 0.8948, + "step": 5546 + }, + { + "epoch": 0.44825147982787533, + "grad_norm": 2.5972695350646973, + "learning_rate": 9.043930786879038e-06, + "loss": 0.9589, + "step": 5547 + }, + { + "epoch": 0.44833228954120286, + "grad_norm": 2.873607873916626, + "learning_rate": 9.043545920012753e-06, + "loss": 0.8906, + "step": 5548 + }, + { + "epoch": 0.4484130992545304, + "grad_norm": 3.0557618141174316, + "learning_rate": 9.04316098389026e-06, + "loss": 0.9086, + "step": 5549 + }, + { + "epoch": 0.44849390896785796, + "grad_norm": 2.5151772499084473, + "learning_rate": 9.042775978518152e-06, + "loss": 0.8784, + "step": 5550 + }, + { + "epoch": 0.4485747186811855, + "grad_norm": 2.53061580657959, + "learning_rate": 9.042390903903027e-06, + "loss": 0.9832, + "step": 5551 + }, + { + "epoch": 0.448655528394513, + "grad_norm": 3.1286022663116455, + "learning_rate": 9.042005760051476e-06, + "loss": 0.9332, + "step": 5552 + }, + { + "epoch": 0.4487363381078406, + "grad_norm": 2.4694337844848633, + "learning_rate": 9.041620546970096e-06, + "loss": 0.9306, + "step": 5553 + }, + { + "epoch": 0.4488171478211681, + "grad_norm": 2.8130273818969727, + "learning_rate": 9.041235264665487e-06, + "loss": 0.9732, + "step": 5554 + }, + { + "epoch": 0.44889795753449563, + "grad_norm": 3.2897021770477295, + "learning_rate": 9.040849913144245e-06, + "loss": 0.9042, + "step": 5555 + }, + { + "epoch": 0.4489787672478232, + "grad_norm": 3.095824718475342, + "learning_rate": 9.040464492412974e-06, + "loss": 0.9953, + "step": 5556 + }, + { + "epoch": 0.44905957696115073, + "grad_norm": 2.5027883052825928, + "learning_rate": 9.040079002478274e-06, + "loss": 0.9512, + "step": 5557 + }, + { + "epoch": 0.44914038667447825, + "grad_norm": 2.522536039352417, + "learning_rate": 9.039693443346745e-06, + "loss": 0.9921, + "step": 5558 + }, + { + "epoch": 0.44922119638780583, + "grad_norm": 2.464492082595825, + "learning_rate": 9.039307815024994e-06, + "loss": 1.0918, + "step": 5559 + }, + { + "epoch": 0.44930200610113336, + "grad_norm": 2.9177801609039307, + "learning_rate": 9.038922117519622e-06, + "loss": 0.7919, + "step": 5560 + }, + { + "epoch": 0.4493828158144609, + "grad_norm": 2.7841975688934326, + "learning_rate": 9.038536350837239e-06, + "loss": 1.029, + "step": 5561 + }, + { + "epoch": 0.44946362552778846, + "grad_norm": 2.675217628479004, + "learning_rate": 9.038150514984452e-06, + "loss": 0.8656, + "step": 5562 + }, + { + "epoch": 0.449544435241116, + "grad_norm": 2.4436745643615723, + "learning_rate": 9.037764609967865e-06, + "loss": 0.9462, + "step": 5563 + }, + { + "epoch": 0.4496252449544435, + "grad_norm": 2.2391719818115234, + "learning_rate": 9.037378635794093e-06, + "loss": 0.9355, + "step": 5564 + }, + { + "epoch": 0.4497060546677711, + "grad_norm": 3.132204294204712, + "learning_rate": 9.036992592469744e-06, + "loss": 0.9168, + "step": 5565 + }, + { + "epoch": 0.4497868643810986, + "grad_norm": 2.589966297149658, + "learning_rate": 9.03660648000143e-06, + "loss": 1.015, + "step": 5566 + }, + { + "epoch": 0.44986767409442613, + "grad_norm": 2.939164876937866, + "learning_rate": 9.036220298395767e-06, + "loss": 0.9294, + "step": 5567 + }, + { + "epoch": 0.4499484838077537, + "grad_norm": 2.3374087810516357, + "learning_rate": 9.035834047659365e-06, + "loss": 0.9356, + "step": 5568 + }, + { + "epoch": 0.45002929352108123, + "grad_norm": 3.0965662002563477, + "learning_rate": 9.03544772779884e-06, + "loss": 0.94, + "step": 5569 + }, + { + "epoch": 0.45011010323440875, + "grad_norm": 2.566420793533325, + "learning_rate": 9.035061338820815e-06, + "loss": 1.0781, + "step": 5570 + }, + { + "epoch": 0.45019091294773633, + "grad_norm": 2.7284440994262695, + "learning_rate": 9.0346748807319e-06, + "loss": 0.9242, + "step": 5571 + }, + { + "epoch": 0.45027172266106386, + "grad_norm": 2.9208481311798096, + "learning_rate": 9.034288353538716e-06, + "loss": 0.9111, + "step": 5572 + }, + { + "epoch": 0.4503525323743914, + "grad_norm": 2.588937282562256, + "learning_rate": 9.033901757247888e-06, + "loss": 1.0383, + "step": 5573 + }, + { + "epoch": 0.45043334208771896, + "grad_norm": 2.595132350921631, + "learning_rate": 9.033515091866033e-06, + "loss": 1.0119, + "step": 5574 + }, + { + "epoch": 0.4505141518010465, + "grad_norm": 2.634265899658203, + "learning_rate": 9.033128357399774e-06, + "loss": 0.9153, + "step": 5575 + }, + { + "epoch": 0.450594961514374, + "grad_norm": 2.527857542037964, + "learning_rate": 9.032741553855736e-06, + "loss": 1.0447, + "step": 5576 + }, + { + "epoch": 0.4506757712277016, + "grad_norm": 2.6825711727142334, + "learning_rate": 9.032354681240543e-06, + "loss": 0.9012, + "step": 5577 + }, + { + "epoch": 0.4507565809410291, + "grad_norm": 3.1356825828552246, + "learning_rate": 9.031967739560823e-06, + "loss": 0.9397, + "step": 5578 + }, + { + "epoch": 0.45083739065435663, + "grad_norm": 3.129138946533203, + "learning_rate": 9.031580728823201e-06, + "loss": 0.9757, + "step": 5579 + }, + { + "epoch": 0.4509182003676842, + "grad_norm": 3.3961873054504395, + "learning_rate": 9.031193649034308e-06, + "loss": 0.9768, + "step": 5580 + }, + { + "epoch": 0.45099901008101173, + "grad_norm": 2.80542254447937, + "learning_rate": 9.030806500200773e-06, + "loss": 0.9601, + "step": 5581 + }, + { + "epoch": 0.45107981979433925, + "grad_norm": 2.480135679244995, + "learning_rate": 9.030419282329225e-06, + "loss": 1.1052, + "step": 5582 + }, + { + "epoch": 0.45116062950766683, + "grad_norm": 2.8546664714813232, + "learning_rate": 9.030031995426299e-06, + "loss": 0.9511, + "step": 5583 + }, + { + "epoch": 0.45124143922099436, + "grad_norm": 2.225099563598633, + "learning_rate": 9.029644639498624e-06, + "loss": 0.9371, + "step": 5584 + }, + { + "epoch": 0.4513222489343219, + "grad_norm": 2.7203240394592285, + "learning_rate": 9.02925721455284e-06, + "loss": 1.0717, + "step": 5585 + }, + { + "epoch": 0.45140305864764946, + "grad_norm": 2.8199462890625, + "learning_rate": 9.02886972059558e-06, + "loss": 1.0173, + "step": 5586 + }, + { + "epoch": 0.451483868360977, + "grad_norm": 2.914057970046997, + "learning_rate": 9.02848215763348e-06, + "loss": 0.8989, + "step": 5587 + }, + { + "epoch": 0.4515646780743045, + "grad_norm": 2.5518972873687744, + "learning_rate": 9.028094525673176e-06, + "loss": 1.0232, + "step": 5588 + }, + { + "epoch": 0.4516454877876321, + "grad_norm": 2.5966525077819824, + "learning_rate": 9.027706824721315e-06, + "loss": 0.9939, + "step": 5589 + }, + { + "epoch": 0.4517262975009596, + "grad_norm": 2.928921699523926, + "learning_rate": 9.027319054784529e-06, + "loss": 1.003, + "step": 5590 + }, + { + "epoch": 0.4518071072142872, + "grad_norm": 2.5647921562194824, + "learning_rate": 9.026931215869465e-06, + "loss": 0.9058, + "step": 5591 + }, + { + "epoch": 0.4518879169276147, + "grad_norm": 3.163996696472168, + "learning_rate": 9.026543307982762e-06, + "loss": 0.8761, + "step": 5592 + }, + { + "epoch": 0.45196872664094223, + "grad_norm": 2.940082550048828, + "learning_rate": 9.026155331131066e-06, + "loss": 1.0036, + "step": 5593 + }, + { + "epoch": 0.4520495363542698, + "grad_norm": 2.8263580799102783, + "learning_rate": 9.025767285321023e-06, + "loss": 0.8245, + "step": 5594 + }, + { + "epoch": 0.45213034606759733, + "grad_norm": 2.4488768577575684, + "learning_rate": 9.025379170559277e-06, + "loss": 0.9657, + "step": 5595 + }, + { + "epoch": 0.45221115578092486, + "grad_norm": 2.892960548400879, + "learning_rate": 9.024990986852476e-06, + "loss": 1.0057, + "step": 5596 + }, + { + "epoch": 0.45229196549425243, + "grad_norm": 4.108729362487793, + "learning_rate": 9.024602734207271e-06, + "loss": 0.9929, + "step": 5597 + }, + { + "epoch": 0.45237277520757996, + "grad_norm": 2.586036205291748, + "learning_rate": 9.024214412630307e-06, + "loss": 0.9764, + "step": 5598 + }, + { + "epoch": 0.4524535849209075, + "grad_norm": 2.725705862045288, + "learning_rate": 9.023826022128242e-06, + "loss": 1.0102, + "step": 5599 + }, + { + "epoch": 0.45253439463423506, + "grad_norm": 3.0589475631713867, + "learning_rate": 9.023437562707721e-06, + "loss": 0.9423, + "step": 5600 + }, + { + "epoch": 0.4526152043475626, + "grad_norm": 2.645115852355957, + "learning_rate": 9.023049034375401e-06, + "loss": 0.8723, + "step": 5601 + }, + { + "epoch": 0.4526960140608901, + "grad_norm": 2.4714303016662598, + "learning_rate": 9.022660437137937e-06, + "loss": 1.0745, + "step": 5602 + }, + { + "epoch": 0.4527768237742177, + "grad_norm": 2.32243275642395, + "learning_rate": 9.022271771001985e-06, + "loss": 1.0224, + "step": 5603 + }, + { + "epoch": 0.4528576334875452, + "grad_norm": 2.6025161743164062, + "learning_rate": 9.021883035974198e-06, + "loss": 1.0014, + "step": 5604 + }, + { + "epoch": 0.45293844320087273, + "grad_norm": 2.8429667949676514, + "learning_rate": 9.021494232061239e-06, + "loss": 0.8874, + "step": 5605 + }, + { + "epoch": 0.4530192529142003, + "grad_norm": 2.6361501216888428, + "learning_rate": 9.021105359269764e-06, + "loss": 1.0371, + "step": 5606 + }, + { + "epoch": 0.45310006262752783, + "grad_norm": 2.868471384048462, + "learning_rate": 9.020716417606435e-06, + "loss": 1.0487, + "step": 5607 + }, + { + "epoch": 0.45318087234085536, + "grad_norm": 2.3944990634918213, + "learning_rate": 9.020327407077913e-06, + "loss": 1.0229, + "step": 5608 + }, + { + "epoch": 0.45326168205418293, + "grad_norm": 2.7799999713897705, + "learning_rate": 9.019938327690863e-06, + "loss": 1.0133, + "step": 5609 + }, + { + "epoch": 0.45334249176751046, + "grad_norm": 3.00563907623291, + "learning_rate": 9.019549179451946e-06, + "loss": 0.9378, + "step": 5610 + }, + { + "epoch": 0.453423301480838, + "grad_norm": 2.709092378616333, + "learning_rate": 9.019159962367826e-06, + "loss": 0.9811, + "step": 5611 + }, + { + "epoch": 0.45350411119416556, + "grad_norm": 2.7960126399993896, + "learning_rate": 9.018770676445174e-06, + "loss": 0.9504, + "step": 5612 + }, + { + "epoch": 0.4535849209074931, + "grad_norm": 2.468398094177246, + "learning_rate": 9.018381321690655e-06, + "loss": 0.9909, + "step": 5613 + }, + { + "epoch": 0.4536657306208206, + "grad_norm": 2.8399434089660645, + "learning_rate": 9.017991898110936e-06, + "loss": 1.0006, + "step": 5614 + }, + { + "epoch": 0.4537465403341482, + "grad_norm": 2.856843948364258, + "learning_rate": 9.01760240571269e-06, + "loss": 0.9189, + "step": 5615 + }, + { + "epoch": 0.4538273500474757, + "grad_norm": 3.444997549057007, + "learning_rate": 9.017212844502587e-06, + "loss": 1.087, + "step": 5616 + }, + { + "epoch": 0.45390815976080323, + "grad_norm": 2.8802638053894043, + "learning_rate": 9.016823214487298e-06, + "loss": 0.9089, + "step": 5617 + }, + { + "epoch": 0.4539889694741308, + "grad_norm": 2.468212604522705, + "learning_rate": 9.016433515673498e-06, + "loss": 1.0928, + "step": 5618 + }, + { + "epoch": 0.45406977918745833, + "grad_norm": 2.814220905303955, + "learning_rate": 9.016043748067861e-06, + "loss": 0.9596, + "step": 5619 + }, + { + "epoch": 0.45415058890078586, + "grad_norm": 2.472062110900879, + "learning_rate": 9.015653911677064e-06, + "loss": 1.0257, + "step": 5620 + }, + { + "epoch": 0.45423139861411344, + "grad_norm": 2.6337456703186035, + "learning_rate": 9.015264006507781e-06, + "loss": 0.9351, + "step": 5621 + }, + { + "epoch": 0.45431220832744096, + "grad_norm": 2.545419931411743, + "learning_rate": 9.014874032566694e-06, + "loss": 1.0553, + "step": 5622 + }, + { + "epoch": 0.4543930180407685, + "grad_norm": 2.675389528274536, + "learning_rate": 9.014483989860478e-06, + "loss": 0.9357, + "step": 5623 + }, + { + "epoch": 0.45447382775409606, + "grad_norm": 3.3827435970306396, + "learning_rate": 9.014093878395816e-06, + "loss": 0.9865, + "step": 5624 + }, + { + "epoch": 0.4545546374674236, + "grad_norm": 2.7035064697265625, + "learning_rate": 9.01370369817939e-06, + "loss": 0.9868, + "step": 5625 + }, + { + "epoch": 0.4546354471807511, + "grad_norm": 2.7121737003326416, + "learning_rate": 9.013313449217884e-06, + "loss": 0.9845, + "step": 5626 + }, + { + "epoch": 0.4547162568940787, + "grad_norm": 2.6461739540100098, + "learning_rate": 9.012923131517978e-06, + "loss": 0.908, + "step": 5627 + }, + { + "epoch": 0.4547970666074062, + "grad_norm": 2.8254449367523193, + "learning_rate": 9.01253274508636e-06, + "loss": 0.7409, + "step": 5628 + }, + { + "epoch": 0.45487787632073373, + "grad_norm": 2.7166452407836914, + "learning_rate": 9.012142289929714e-06, + "loss": 0.9015, + "step": 5629 + }, + { + "epoch": 0.4549586860340613, + "grad_norm": 2.6406447887420654, + "learning_rate": 9.011751766054732e-06, + "loss": 0.931, + "step": 5630 + }, + { + "epoch": 0.45503949574738883, + "grad_norm": 2.7557570934295654, + "learning_rate": 9.0113611734681e-06, + "loss": 0.7792, + "step": 5631 + }, + { + "epoch": 0.45512030546071636, + "grad_norm": 2.6907272338867188, + "learning_rate": 9.010970512176509e-06, + "loss": 1.0254, + "step": 5632 + }, + { + "epoch": 0.45520111517404394, + "grad_norm": 2.2805144786834717, + "learning_rate": 9.010579782186647e-06, + "loss": 1.0791, + "step": 5633 + }, + { + "epoch": 0.45528192488737146, + "grad_norm": 3.235880136489868, + "learning_rate": 9.010188983505208e-06, + "loss": 0.9581, + "step": 5634 + }, + { + "epoch": 0.455362734600699, + "grad_norm": 3.049842119216919, + "learning_rate": 9.009798116138889e-06, + "loss": 0.9207, + "step": 5635 + }, + { + "epoch": 0.45544354431402656, + "grad_norm": 2.42728590965271, + "learning_rate": 9.009407180094378e-06, + "loss": 0.8828, + "step": 5636 + }, + { + "epoch": 0.4555243540273541, + "grad_norm": 2.411924362182617, + "learning_rate": 9.009016175378375e-06, + "loss": 0.9356, + "step": 5637 + }, + { + "epoch": 0.4556051637406816, + "grad_norm": 2.7910783290863037, + "learning_rate": 9.008625101997577e-06, + "loss": 0.9046, + "step": 5638 + }, + { + "epoch": 0.4556859734540092, + "grad_norm": 2.5463552474975586, + "learning_rate": 9.008233959958682e-06, + "loss": 0.834, + "step": 5639 + }, + { + "epoch": 0.4557667831673367, + "grad_norm": 2.5790765285491943, + "learning_rate": 9.007842749268388e-06, + "loss": 0.9421, + "step": 5640 + }, + { + "epoch": 0.45584759288066423, + "grad_norm": 2.989309787750244, + "learning_rate": 9.007451469933395e-06, + "loss": 0.9489, + "step": 5641 + }, + { + "epoch": 0.4559284025939918, + "grad_norm": 2.7662150859832764, + "learning_rate": 9.007060121960408e-06, + "loss": 1.1282, + "step": 5642 + }, + { + "epoch": 0.45600921230731933, + "grad_norm": 2.5617337226867676, + "learning_rate": 9.006668705356128e-06, + "loss": 1.0608, + "step": 5643 + }, + { + "epoch": 0.45609002202064686, + "grad_norm": 3.037203550338745, + "learning_rate": 9.006277220127257e-06, + "loss": 0.9584, + "step": 5644 + }, + { + "epoch": 0.45617083173397444, + "grad_norm": 2.7384161949157715, + "learning_rate": 9.0058856662805e-06, + "loss": 1.0149, + "step": 5645 + }, + { + "epoch": 0.45625164144730196, + "grad_norm": 2.435323476791382, + "learning_rate": 9.00549404382257e-06, + "loss": 0.9534, + "step": 5646 + }, + { + "epoch": 0.4563324511606295, + "grad_norm": 2.4581611156463623, + "learning_rate": 9.005102352760166e-06, + "loss": 0.866, + "step": 5647 + }, + { + "epoch": 0.45641326087395706, + "grad_norm": 2.6652679443359375, + "learning_rate": 9.004710593100003e-06, + "loss": 0.9682, + "step": 5648 + }, + { + "epoch": 0.4564940705872846, + "grad_norm": 2.8415911197662354, + "learning_rate": 9.004318764848787e-06, + "loss": 0.996, + "step": 5649 + }, + { + "epoch": 0.4565748803006121, + "grad_norm": 2.6814072132110596, + "learning_rate": 9.003926868013231e-06, + "loss": 0.8604, + "step": 5650 + }, + { + "epoch": 0.4566556900139397, + "grad_norm": 3.0876829624176025, + "learning_rate": 9.00353490260005e-06, + "loss": 1.0219, + "step": 5651 + }, + { + "epoch": 0.4567364997272672, + "grad_norm": 2.7799458503723145, + "learning_rate": 9.003142868615948e-06, + "loss": 0.9927, + "step": 5652 + }, + { + "epoch": 0.45681730944059473, + "grad_norm": 9.848539352416992, + "learning_rate": 9.002750766067649e-06, + "loss": 0.9253, + "step": 5653 + }, + { + "epoch": 0.4568981191539223, + "grad_norm": 3.060499668121338, + "learning_rate": 9.002358594961867e-06, + "loss": 0.8231, + "step": 5654 + }, + { + "epoch": 0.45697892886724983, + "grad_norm": 2.563527822494507, + "learning_rate": 9.001966355305317e-06, + "loss": 1.005, + "step": 5655 + }, + { + "epoch": 0.4570597385805774, + "grad_norm": 2.9491777420043945, + "learning_rate": 9.001574047104716e-06, + "loss": 1.0011, + "step": 5656 + }, + { + "epoch": 0.45714054829390494, + "grad_norm": 3.1052157878875732, + "learning_rate": 9.001181670366787e-06, + "loss": 0.893, + "step": 5657 + }, + { + "epoch": 0.45722135800723246, + "grad_norm": 2.521304130554199, + "learning_rate": 9.000789225098247e-06, + "loss": 0.9368, + "step": 5658 + }, + { + "epoch": 0.45730216772056004, + "grad_norm": 2.575448989868164, + "learning_rate": 9.00039671130582e-06, + "loss": 0.9664, + "step": 5659 + }, + { + "epoch": 0.45738297743388756, + "grad_norm": 2.4491307735443115, + "learning_rate": 9.000004128996226e-06, + "loss": 1.0269, + "step": 5660 + }, + { + "epoch": 0.4574637871472151, + "grad_norm": 2.6399776935577393, + "learning_rate": 8.999611478176192e-06, + "loss": 1.001, + "step": 5661 + }, + { + "epoch": 0.45754459686054266, + "grad_norm": 2.724689245223999, + "learning_rate": 8.999218758852443e-06, + "loss": 0.9712, + "step": 5662 + }, + { + "epoch": 0.4576254065738702, + "grad_norm": 2.584888219833374, + "learning_rate": 8.998825971031704e-06, + "loss": 0.9636, + "step": 5663 + }, + { + "epoch": 0.4577062162871977, + "grad_norm": 3.007192373275757, + "learning_rate": 8.998433114720701e-06, + "loss": 0.9979, + "step": 5664 + }, + { + "epoch": 0.4577870260005253, + "grad_norm": 2.7164647579193115, + "learning_rate": 8.998040189926168e-06, + "loss": 0.8697, + "step": 5665 + }, + { + "epoch": 0.4578678357138528, + "grad_norm": 2.5364456176757812, + "learning_rate": 8.997647196654828e-06, + "loss": 0.9978, + "step": 5666 + }, + { + "epoch": 0.45794864542718033, + "grad_norm": 2.646486282348633, + "learning_rate": 8.997254134913418e-06, + "loss": 0.9778, + "step": 5667 + }, + { + "epoch": 0.4580294551405079, + "grad_norm": 2.6803393363952637, + "learning_rate": 8.996861004708667e-06, + "loss": 1.1445, + "step": 5668 + }, + { + "epoch": 0.45811026485383544, + "grad_norm": 2.4635229110717773, + "learning_rate": 8.996467806047309e-06, + "loss": 1.0915, + "step": 5669 + }, + { + "epoch": 0.45819107456716296, + "grad_norm": 2.5180959701538086, + "learning_rate": 8.996074538936077e-06, + "loss": 0.9426, + "step": 5670 + }, + { + "epoch": 0.45827188428049054, + "grad_norm": 2.726747989654541, + "learning_rate": 8.99568120338171e-06, + "loss": 1.0266, + "step": 5671 + }, + { + "epoch": 0.45835269399381806, + "grad_norm": 3.1313908100128174, + "learning_rate": 8.995287799390943e-06, + "loss": 0.915, + "step": 5672 + }, + { + "epoch": 0.4584335037071456, + "grad_norm": 3.0397677421569824, + "learning_rate": 8.994894326970514e-06, + "loss": 0.9487, + "step": 5673 + }, + { + "epoch": 0.45851431342047316, + "grad_norm": 2.721773147583008, + "learning_rate": 8.994500786127163e-06, + "loss": 0.9255, + "step": 5674 + }, + { + "epoch": 0.4585951231338007, + "grad_norm": 2.7440669536590576, + "learning_rate": 8.994107176867628e-06, + "loss": 1.0435, + "step": 5675 + }, + { + "epoch": 0.4586759328471282, + "grad_norm": 2.8031744956970215, + "learning_rate": 8.993713499198655e-06, + "loss": 0.9442, + "step": 5676 + }, + { + "epoch": 0.4587567425604558, + "grad_norm": 2.5146827697753906, + "learning_rate": 8.993319753126983e-06, + "loss": 0.9099, + "step": 5677 + }, + { + "epoch": 0.4588375522737833, + "grad_norm": 2.8717031478881836, + "learning_rate": 8.992925938659357e-06, + "loss": 1.0509, + "step": 5678 + }, + { + "epoch": 0.45891836198711083, + "grad_norm": 2.7194910049438477, + "learning_rate": 8.992532055802524e-06, + "loss": 0.9336, + "step": 5679 + }, + { + "epoch": 0.4589991717004384, + "grad_norm": 2.8881373405456543, + "learning_rate": 8.992138104563226e-06, + "loss": 0.9019, + "step": 5680 + }, + { + "epoch": 0.45907998141376594, + "grad_norm": 2.431675910949707, + "learning_rate": 8.991744084948214e-06, + "loss": 1.0839, + "step": 5681 + }, + { + "epoch": 0.45916079112709346, + "grad_norm": 2.25947642326355, + "learning_rate": 8.991349996964236e-06, + "loss": 0.9717, + "step": 5682 + }, + { + "epoch": 0.45924160084042104, + "grad_norm": 2.704808473587036, + "learning_rate": 8.990955840618041e-06, + "loss": 0.948, + "step": 5683 + }, + { + "epoch": 0.45932241055374856, + "grad_norm": 2.8153810501098633, + "learning_rate": 8.99056161591638e-06, + "loss": 0.895, + "step": 5684 + }, + { + "epoch": 0.4594032202670761, + "grad_norm": 2.8578639030456543, + "learning_rate": 8.990167322866005e-06, + "loss": 1.1398, + "step": 5685 + }, + { + "epoch": 0.45948402998040366, + "grad_norm": 2.7915093898773193, + "learning_rate": 8.989772961473671e-06, + "loss": 0.894, + "step": 5686 + }, + { + "epoch": 0.4595648396937312, + "grad_norm": 2.6754133701324463, + "learning_rate": 8.989378531746131e-06, + "loss": 1.0066, + "step": 5687 + }, + { + "epoch": 0.4596456494070587, + "grad_norm": 2.9381282329559326, + "learning_rate": 8.98898403369014e-06, + "loss": 0.9623, + "step": 5688 + }, + { + "epoch": 0.4597264591203863, + "grad_norm": 3.558082342147827, + "learning_rate": 8.988589467312455e-06, + "loss": 0.9892, + "step": 5689 + }, + { + "epoch": 0.4598072688337138, + "grad_norm": 2.568845748901367, + "learning_rate": 8.988194832619835e-06, + "loss": 0.9607, + "step": 5690 + }, + { + "epoch": 0.45988807854704133, + "grad_norm": 2.9882869720458984, + "learning_rate": 8.98780012961904e-06, + "loss": 0.9085, + "step": 5691 + }, + { + "epoch": 0.4599688882603689, + "grad_norm": 2.629019260406494, + "learning_rate": 8.987405358316827e-06, + "loss": 1.0403, + "step": 5692 + }, + { + "epoch": 0.46004969797369644, + "grad_norm": 2.489562511444092, + "learning_rate": 8.987010518719961e-06, + "loss": 0.8797, + "step": 5693 + }, + { + "epoch": 0.46013050768702396, + "grad_norm": 2.858153820037842, + "learning_rate": 8.986615610835203e-06, + "loss": 1.046, + "step": 5694 + }, + { + "epoch": 0.46021131740035154, + "grad_norm": 2.669794797897339, + "learning_rate": 8.986220634669318e-06, + "loss": 0.9201, + "step": 5695 + }, + { + "epoch": 0.46029212711367906, + "grad_norm": 2.7267401218414307, + "learning_rate": 8.985825590229068e-06, + "loss": 0.9804, + "step": 5696 + }, + { + "epoch": 0.4603729368270066, + "grad_norm": 2.7089977264404297, + "learning_rate": 8.985430477521222e-06, + "loss": 0.9896, + "step": 5697 + }, + { + "epoch": 0.46045374654033416, + "grad_norm": 2.598404884338379, + "learning_rate": 8.985035296552546e-06, + "loss": 0.989, + "step": 5698 + }, + { + "epoch": 0.4605345562536617, + "grad_norm": 3.1748383045196533, + "learning_rate": 8.984640047329809e-06, + "loss": 0.9024, + "step": 5699 + }, + { + "epoch": 0.4606153659669892, + "grad_norm": 2.6499054431915283, + "learning_rate": 8.984244729859781e-06, + "loss": 1.0047, + "step": 5700 + }, + { + "epoch": 0.4606961756803168, + "grad_norm": 3.3875343799591064, + "learning_rate": 8.983849344149232e-06, + "loss": 0.9978, + "step": 5701 + }, + { + "epoch": 0.4607769853936443, + "grad_norm": 2.834674835205078, + "learning_rate": 8.983453890204935e-06, + "loss": 0.9273, + "step": 5702 + }, + { + "epoch": 0.46085779510697183, + "grad_norm": 2.6363673210144043, + "learning_rate": 8.983058368033663e-06, + "loss": 0.9971, + "step": 5703 + }, + { + "epoch": 0.4609386048202994, + "grad_norm": 2.9036641120910645, + "learning_rate": 8.982662777642188e-06, + "loss": 0.9269, + "step": 5704 + }, + { + "epoch": 0.46101941453362694, + "grad_norm": 2.6789112091064453, + "learning_rate": 8.982267119037289e-06, + "loss": 0.9112, + "step": 5705 + }, + { + "epoch": 0.46110022424695446, + "grad_norm": 2.800849437713623, + "learning_rate": 8.981871392225742e-06, + "loss": 0.9262, + "step": 5706 + }, + { + "epoch": 0.46118103396028204, + "grad_norm": 2.4979593753814697, + "learning_rate": 8.981475597214324e-06, + "loss": 0.9378, + "step": 5707 + }, + { + "epoch": 0.46126184367360956, + "grad_norm": 2.5978217124938965, + "learning_rate": 8.981079734009813e-06, + "loss": 1.0223, + "step": 5708 + }, + { + "epoch": 0.4613426533869371, + "grad_norm": 2.449002981185913, + "learning_rate": 8.980683802618989e-06, + "loss": 0.9522, + "step": 5709 + }, + { + "epoch": 0.46142346310026466, + "grad_norm": 2.9029011726379395, + "learning_rate": 8.980287803048636e-06, + "loss": 0.9524, + "step": 5710 + }, + { + "epoch": 0.4615042728135922, + "grad_norm": 2.6404201984405518, + "learning_rate": 8.979891735305534e-06, + "loss": 0.9854, + "step": 5711 + }, + { + "epoch": 0.4615850825269197, + "grad_norm": 3.219568967819214, + "learning_rate": 8.97949559939647e-06, + "loss": 0.8591, + "step": 5712 + }, + { + "epoch": 0.4616658922402473, + "grad_norm": 3.0073201656341553, + "learning_rate": 8.979099395328226e-06, + "loss": 1.0269, + "step": 5713 + }, + { + "epoch": 0.4617467019535748, + "grad_norm": 2.707247734069824, + "learning_rate": 8.978703123107588e-06, + "loss": 1.0136, + "step": 5714 + }, + { + "epoch": 0.46182751166690234, + "grad_norm": 2.714320659637451, + "learning_rate": 8.978306782741344e-06, + "loss": 0.9909, + "step": 5715 + }, + { + "epoch": 0.4619083213802299, + "grad_norm": 3.021829605102539, + "learning_rate": 8.977910374236281e-06, + "loss": 0.9573, + "step": 5716 + }, + { + "epoch": 0.46198913109355744, + "grad_norm": 2.5574440956115723, + "learning_rate": 8.97751389759919e-06, + "loss": 0.8751, + "step": 5717 + }, + { + "epoch": 0.46206994080688496, + "grad_norm": 2.4902138710021973, + "learning_rate": 8.977117352836864e-06, + "loss": 0.8953, + "step": 5718 + }, + { + "epoch": 0.46215075052021254, + "grad_norm": 2.8296992778778076, + "learning_rate": 8.97672073995609e-06, + "loss": 1.0601, + "step": 5719 + }, + { + "epoch": 0.46223156023354006, + "grad_norm": 2.5583040714263916, + "learning_rate": 8.976324058963664e-06, + "loss": 0.9975, + "step": 5720 + }, + { + "epoch": 0.46231236994686764, + "grad_norm": 2.419672966003418, + "learning_rate": 8.975927309866379e-06, + "loss": 1.051, + "step": 5721 + }, + { + "epoch": 0.46239317966019516, + "grad_norm": 2.8298377990722656, + "learning_rate": 8.975530492671031e-06, + "loss": 1.0306, + "step": 5722 + }, + { + "epoch": 0.4624739893735227, + "grad_norm": 2.5772652626037598, + "learning_rate": 8.975133607384416e-06, + "loss": 0.8895, + "step": 5723 + }, + { + "epoch": 0.46255479908685027, + "grad_norm": 2.7519946098327637, + "learning_rate": 8.974736654013333e-06, + "loss": 0.9955, + "step": 5724 + }, + { + "epoch": 0.4626356088001778, + "grad_norm": 3.088197708129883, + "learning_rate": 8.97433963256458e-06, + "loss": 0.9018, + "step": 5725 + }, + { + "epoch": 0.4627164185135053, + "grad_norm": 2.506464719772339, + "learning_rate": 8.973942543044956e-06, + "loss": 0.9529, + "step": 5726 + }, + { + "epoch": 0.4627972282268329, + "grad_norm": 3.6072235107421875, + "learning_rate": 8.973545385461261e-06, + "loss": 0.9706, + "step": 5727 + }, + { + "epoch": 0.4628780379401604, + "grad_norm": 3.1062591075897217, + "learning_rate": 8.973148159820304e-06, + "loss": 0.9377, + "step": 5728 + }, + { + "epoch": 0.46295884765348794, + "grad_norm": 2.4257314205169678, + "learning_rate": 8.972750866128881e-06, + "loss": 1.0533, + "step": 5729 + }, + { + "epoch": 0.4630396573668155, + "grad_norm": 2.609692335128784, + "learning_rate": 8.9723535043938e-06, + "loss": 0.9097, + "step": 5730 + }, + { + "epoch": 0.46312046708014304, + "grad_norm": 3.091174602508545, + "learning_rate": 8.971956074621867e-06, + "loss": 0.8879, + "step": 5731 + }, + { + "epoch": 0.46320127679347056, + "grad_norm": 2.596283435821533, + "learning_rate": 8.97155857681989e-06, + "loss": 1.0411, + "step": 5732 + }, + { + "epoch": 0.46328208650679814, + "grad_norm": 2.7031702995300293, + "learning_rate": 8.971161010994674e-06, + "loss": 0.9502, + "step": 5733 + }, + { + "epoch": 0.46336289622012566, + "grad_norm": 3.0437660217285156, + "learning_rate": 8.97076337715303e-06, + "loss": 0.9302, + "step": 5734 + }, + { + "epoch": 0.4634437059334532, + "grad_norm": 2.7243452072143555, + "learning_rate": 8.970365675301768e-06, + "loss": 0.9279, + "step": 5735 + }, + { + "epoch": 0.46352451564678077, + "grad_norm": 2.4927990436553955, + "learning_rate": 8.969967905447702e-06, + "loss": 1.0101, + "step": 5736 + }, + { + "epoch": 0.4636053253601083, + "grad_norm": 2.4401707649230957, + "learning_rate": 8.969570067597641e-06, + "loss": 0.8932, + "step": 5737 + }, + { + "epoch": 0.4636861350734358, + "grad_norm": 2.596168279647827, + "learning_rate": 8.969172161758404e-06, + "loss": 0.8357, + "step": 5738 + }, + { + "epoch": 0.4637669447867634, + "grad_norm": 2.832498550415039, + "learning_rate": 8.968774187936802e-06, + "loss": 0.9875, + "step": 5739 + }, + { + "epoch": 0.4638477545000909, + "grad_norm": 2.958401918411255, + "learning_rate": 8.968376146139653e-06, + "loss": 0.9529, + "step": 5740 + }, + { + "epoch": 0.46392856421341844, + "grad_norm": 2.814657211303711, + "learning_rate": 8.967978036373773e-06, + "loss": 1.0036, + "step": 5741 + }, + { + "epoch": 0.464009373926746, + "grad_norm": 2.820213794708252, + "learning_rate": 8.96757985864598e-06, + "loss": 0.9722, + "step": 5742 + }, + { + "epoch": 0.46409018364007354, + "grad_norm": 2.4376022815704346, + "learning_rate": 8.967181612963098e-06, + "loss": 1.0275, + "step": 5743 + }, + { + "epoch": 0.46417099335340106, + "grad_norm": 2.5551934242248535, + "learning_rate": 8.966783299331945e-06, + "loss": 0.9647, + "step": 5744 + }, + { + "epoch": 0.46425180306672864, + "grad_norm": 2.318618059158325, + "learning_rate": 8.966384917759345e-06, + "loss": 1.0881, + "step": 5745 + }, + { + "epoch": 0.46433261278005616, + "grad_norm": 2.677311420440674, + "learning_rate": 8.96598646825212e-06, + "loss": 1.0155, + "step": 5746 + }, + { + "epoch": 0.4644134224933837, + "grad_norm": 2.4367010593414307, + "learning_rate": 8.965587950817091e-06, + "loss": 0.9527, + "step": 5747 + }, + { + "epoch": 0.46449423220671127, + "grad_norm": 2.6899735927581787, + "learning_rate": 8.965189365461091e-06, + "loss": 0.8498, + "step": 5748 + }, + { + "epoch": 0.4645750419200388, + "grad_norm": 2.9822683334350586, + "learning_rate": 8.96479071219094e-06, + "loss": 0.9261, + "step": 5749 + }, + { + "epoch": 0.4646558516333663, + "grad_norm": 2.724426746368408, + "learning_rate": 8.964391991013473e-06, + "loss": 1.0494, + "step": 5750 + }, + { + "epoch": 0.4647366613466939, + "grad_norm": 3.3723952770233154, + "learning_rate": 8.963993201935513e-06, + "loss": 1.054, + "step": 5751 + }, + { + "epoch": 0.4648174710600214, + "grad_norm": 2.668436050415039, + "learning_rate": 8.96359434496389e-06, + "loss": 0.8919, + "step": 5752 + }, + { + "epoch": 0.46489828077334894, + "grad_norm": 2.999967098236084, + "learning_rate": 8.96319542010544e-06, + "loss": 0.8965, + "step": 5753 + }, + { + "epoch": 0.4649790904866765, + "grad_norm": 2.8097987174987793, + "learning_rate": 8.962796427366993e-06, + "loss": 0.9619, + "step": 5754 + }, + { + "epoch": 0.46505990020000404, + "grad_norm": 2.3243496417999268, + "learning_rate": 8.962397366755384e-06, + "loss": 0.971, + "step": 5755 + }, + { + "epoch": 0.46514070991333156, + "grad_norm": 2.707468032836914, + "learning_rate": 8.961998238277447e-06, + "loss": 0.9702, + "step": 5756 + }, + { + "epoch": 0.46522151962665914, + "grad_norm": 2.9703903198242188, + "learning_rate": 8.961599041940018e-06, + "loss": 1.0181, + "step": 5757 + }, + { + "epoch": 0.46530232933998666, + "grad_norm": 2.3622820377349854, + "learning_rate": 8.961199777749935e-06, + "loss": 1.0661, + "step": 5758 + }, + { + "epoch": 0.4653831390533142, + "grad_norm": 2.6456966400146484, + "learning_rate": 8.960800445714035e-06, + "loss": 1.0032, + "step": 5759 + }, + { + "epoch": 0.46546394876664177, + "grad_norm": 3.1365106105804443, + "learning_rate": 8.96040104583916e-06, + "loss": 0.9849, + "step": 5760 + }, + { + "epoch": 0.4655447584799693, + "grad_norm": 2.591796636581421, + "learning_rate": 8.960001578132148e-06, + "loss": 0.9843, + "step": 5761 + }, + { + "epoch": 0.4656255681932968, + "grad_norm": 2.7226526737213135, + "learning_rate": 8.959602042599843e-06, + "loss": 0.9802, + "step": 5762 + }, + { + "epoch": 0.4657063779066244, + "grad_norm": 2.3909027576446533, + "learning_rate": 8.959202439249087e-06, + "loss": 0.9329, + "step": 5763 + }, + { + "epoch": 0.4657871876199519, + "grad_norm": 2.6812145709991455, + "learning_rate": 8.958802768086726e-06, + "loss": 1.0069, + "step": 5764 + }, + { + "epoch": 0.46586799733327944, + "grad_norm": 3.149233818054199, + "learning_rate": 8.958403029119602e-06, + "loss": 0.9595, + "step": 5765 + }, + { + "epoch": 0.465948807046607, + "grad_norm": 2.8651227951049805, + "learning_rate": 8.958003222354566e-06, + "loss": 0.9259, + "step": 5766 + }, + { + "epoch": 0.46602961675993454, + "grad_norm": 2.6721975803375244, + "learning_rate": 8.95760334779846e-06, + "loss": 0.9779, + "step": 5767 + }, + { + "epoch": 0.46611042647326206, + "grad_norm": 2.89190936088562, + "learning_rate": 8.957203405458139e-06, + "loss": 0.868, + "step": 5768 + }, + { + "epoch": 0.46619123618658964, + "grad_norm": 2.8196425437927246, + "learning_rate": 8.956803395340448e-06, + "loss": 1.0964, + "step": 5769 + }, + { + "epoch": 0.46627204589991716, + "grad_norm": 2.3658134937286377, + "learning_rate": 8.956403317452242e-06, + "loss": 0.828, + "step": 5770 + }, + { + "epoch": 0.4663528556132447, + "grad_norm": 2.655977725982666, + "learning_rate": 8.95600317180037e-06, + "loss": 0.9586, + "step": 5771 + }, + { + "epoch": 0.46643366532657227, + "grad_norm": 3.4540302753448486, + "learning_rate": 8.955602958391691e-06, + "loss": 1.0272, + "step": 5772 + }, + { + "epoch": 0.4665144750398998, + "grad_norm": 2.621208906173706, + "learning_rate": 8.955202677233052e-06, + "loss": 0.9203, + "step": 5773 + }, + { + "epoch": 0.4665952847532273, + "grad_norm": 2.926393747329712, + "learning_rate": 8.954802328331315e-06, + "loss": 1.0637, + "step": 5774 + }, + { + "epoch": 0.4666760944665549, + "grad_norm": 2.921645402908325, + "learning_rate": 8.954401911693336e-06, + "loss": 1.1009, + "step": 5775 + }, + { + "epoch": 0.4667569041798824, + "grad_norm": 2.4261834621429443, + "learning_rate": 8.95400142732597e-06, + "loss": 1.0117, + "step": 5776 + }, + { + "epoch": 0.46683771389320994, + "grad_norm": 3.024146556854248, + "learning_rate": 8.95360087523608e-06, + "loss": 0.9313, + "step": 5777 + }, + { + "epoch": 0.4669185236065375, + "grad_norm": 2.688607931137085, + "learning_rate": 8.953200255430523e-06, + "loss": 0.9076, + "step": 5778 + }, + { + "epoch": 0.46699933331986504, + "grad_norm": 2.5541460514068604, + "learning_rate": 8.952799567916164e-06, + "loss": 0.9786, + "step": 5779 + }, + { + "epoch": 0.46708014303319256, + "grad_norm": 2.9037370681762695, + "learning_rate": 8.952398812699865e-06, + "loss": 0.8789, + "step": 5780 + }, + { + "epoch": 0.46716095274652014, + "grad_norm": 3.2507524490356445, + "learning_rate": 8.951997989788487e-06, + "loss": 0.9646, + "step": 5781 + }, + { + "epoch": 0.46724176245984766, + "grad_norm": 2.625180959701538, + "learning_rate": 8.9515970991889e-06, + "loss": 0.8997, + "step": 5782 + }, + { + "epoch": 0.4673225721731752, + "grad_norm": 2.617286443710327, + "learning_rate": 8.951196140907967e-06, + "loss": 0.91, + "step": 5783 + }, + { + "epoch": 0.46740338188650277, + "grad_norm": 2.7201478481292725, + "learning_rate": 8.950795114952554e-06, + "loss": 0.9564, + "step": 5784 + }, + { + "epoch": 0.4674841915998303, + "grad_norm": 3.4661874771118164, + "learning_rate": 8.950394021329535e-06, + "loss": 0.9577, + "step": 5785 + }, + { + "epoch": 0.46756500131315787, + "grad_norm": 2.390563726425171, + "learning_rate": 8.949992860045776e-06, + "loss": 0.958, + "step": 5786 + }, + { + "epoch": 0.4676458110264854, + "grad_norm": 2.7613518238067627, + "learning_rate": 8.949591631108147e-06, + "loss": 0.9444, + "step": 5787 + }, + { + "epoch": 0.4677266207398129, + "grad_norm": 3.0035452842712402, + "learning_rate": 8.949190334523523e-06, + "loss": 1.0011, + "step": 5788 + }, + { + "epoch": 0.4678074304531405, + "grad_norm": 2.6570937633514404, + "learning_rate": 8.948788970298774e-06, + "loss": 0.8661, + "step": 5789 + }, + { + "epoch": 0.467888240166468, + "grad_norm": 2.3081705570220947, + "learning_rate": 8.948387538440777e-06, + "loss": 0.9587, + "step": 5790 + }, + { + "epoch": 0.46796904987979554, + "grad_norm": 2.375067710876465, + "learning_rate": 8.947986038956409e-06, + "loss": 1.1291, + "step": 5791 + }, + { + "epoch": 0.4680498595931231, + "grad_norm": 2.3014063835144043, + "learning_rate": 8.947584471852541e-06, + "loss": 0.9375, + "step": 5792 + }, + { + "epoch": 0.46813066930645064, + "grad_norm": 2.9732635021209717, + "learning_rate": 8.947182837136057e-06, + "loss": 0.8834, + "step": 5793 + }, + { + "epoch": 0.46821147901977817, + "grad_norm": 2.5396475791931152, + "learning_rate": 8.946781134813833e-06, + "loss": 0.9785, + "step": 5794 + }, + { + "epoch": 0.46829228873310574, + "grad_norm": 2.2900161743164062, + "learning_rate": 8.94637936489275e-06, + "loss": 0.8655, + "step": 5795 + }, + { + "epoch": 0.46837309844643327, + "grad_norm": 2.8865981101989746, + "learning_rate": 8.945977527379688e-06, + "loss": 0.9418, + "step": 5796 + }, + { + "epoch": 0.4684539081597608, + "grad_norm": 2.7252612113952637, + "learning_rate": 8.945575622281531e-06, + "loss": 0.9861, + "step": 5797 + }, + { + "epoch": 0.46853471787308837, + "grad_norm": 2.5759341716766357, + "learning_rate": 8.945173649605163e-06, + "loss": 1.0446, + "step": 5798 + }, + { + "epoch": 0.4686155275864159, + "grad_norm": 2.8114752769470215, + "learning_rate": 8.944771609357466e-06, + "loss": 0.9281, + "step": 5799 + }, + { + "epoch": 0.4686963372997434, + "grad_norm": 2.4970602989196777, + "learning_rate": 8.944369501545329e-06, + "loss": 0.8837, + "step": 5800 + }, + { + "epoch": 0.468777147013071, + "grad_norm": 4.049400329589844, + "learning_rate": 8.94396732617564e-06, + "loss": 0.8989, + "step": 5801 + }, + { + "epoch": 0.4688579567263985, + "grad_norm": 2.4196088314056396, + "learning_rate": 8.943565083255283e-06, + "loss": 0.9404, + "step": 5802 + }, + { + "epoch": 0.46893876643972604, + "grad_norm": 2.362985610961914, + "learning_rate": 8.94316277279115e-06, + "loss": 0.8586, + "step": 5803 + }, + { + "epoch": 0.4690195761530536, + "grad_norm": 2.5610573291778564, + "learning_rate": 8.942760394790131e-06, + "loss": 0.8932, + "step": 5804 + }, + { + "epoch": 0.46910038586638114, + "grad_norm": 2.748135805130005, + "learning_rate": 8.942357949259121e-06, + "loss": 0.9569, + "step": 5805 + }, + { + "epoch": 0.46918119557970867, + "grad_norm": 2.7132761478424072, + "learning_rate": 8.941955436205007e-06, + "loss": 0.9156, + "step": 5806 + }, + { + "epoch": 0.46926200529303624, + "grad_norm": 2.4545814990997314, + "learning_rate": 8.941552855634688e-06, + "loss": 0.9154, + "step": 5807 + }, + { + "epoch": 0.46934281500636377, + "grad_norm": 2.44968318939209, + "learning_rate": 8.941150207555058e-06, + "loss": 0.9353, + "step": 5808 + }, + { + "epoch": 0.4694236247196913, + "grad_norm": 2.758939266204834, + "learning_rate": 8.940747491973012e-06, + "loss": 1.0059, + "step": 5809 + }, + { + "epoch": 0.46950443443301887, + "grad_norm": 2.6677908897399902, + "learning_rate": 8.940344708895448e-06, + "loss": 0.9244, + "step": 5810 + }, + { + "epoch": 0.4695852441463464, + "grad_norm": 2.568324089050293, + "learning_rate": 8.939941858329266e-06, + "loss": 1.1052, + "step": 5811 + }, + { + "epoch": 0.4696660538596739, + "grad_norm": 2.742100477218628, + "learning_rate": 8.939538940281365e-06, + "loss": 0.9692, + "step": 5812 + }, + { + "epoch": 0.4697468635730015, + "grad_norm": 2.638664960861206, + "learning_rate": 8.939135954758645e-06, + "loss": 0.9043, + "step": 5813 + }, + { + "epoch": 0.469827673286329, + "grad_norm": 2.781904935836792, + "learning_rate": 8.93873290176801e-06, + "loss": 0.9713, + "step": 5814 + }, + { + "epoch": 0.46990848299965654, + "grad_norm": 2.575031280517578, + "learning_rate": 8.938329781316362e-06, + "loss": 0.9987, + "step": 5815 + }, + { + "epoch": 0.4699892927129841, + "grad_norm": 2.5054314136505127, + "learning_rate": 8.937926593410606e-06, + "loss": 0.9191, + "step": 5816 + }, + { + "epoch": 0.47007010242631164, + "grad_norm": 2.7720255851745605, + "learning_rate": 8.937523338057648e-06, + "loss": 0.9486, + "step": 5817 + }, + { + "epoch": 0.47015091213963917, + "grad_norm": 2.47377872467041, + "learning_rate": 8.937120015264394e-06, + "loss": 1.0157, + "step": 5818 + }, + { + "epoch": 0.47023172185296674, + "grad_norm": 2.3373284339904785, + "learning_rate": 8.936716625037752e-06, + "loss": 1.0121, + "step": 5819 + }, + { + "epoch": 0.47031253156629427, + "grad_norm": 2.8756234645843506, + "learning_rate": 8.936313167384632e-06, + "loss": 1.0173, + "step": 5820 + }, + { + "epoch": 0.4703933412796218, + "grad_norm": 3.0548393726348877, + "learning_rate": 8.935909642311945e-06, + "loss": 0.8297, + "step": 5821 + }, + { + "epoch": 0.47047415099294937, + "grad_norm": 2.876857042312622, + "learning_rate": 8.935506049826599e-06, + "loss": 1.0103, + "step": 5822 + }, + { + "epoch": 0.4705549607062769, + "grad_norm": 2.4455296993255615, + "learning_rate": 8.935102389935511e-06, + "loss": 1.02, + "step": 5823 + }, + { + "epoch": 0.4706357704196044, + "grad_norm": 2.428161144256592, + "learning_rate": 8.93469866264559e-06, + "loss": 1.0236, + "step": 5824 + }, + { + "epoch": 0.470716580132932, + "grad_norm": 2.6065218448638916, + "learning_rate": 8.934294867963755e-06, + "loss": 0.9891, + "step": 5825 + }, + { + "epoch": 0.4707973898462595, + "grad_norm": 2.5996830463409424, + "learning_rate": 8.93389100589692e-06, + "loss": 1.0679, + "step": 5826 + }, + { + "epoch": 0.47087819955958704, + "grad_norm": 3.0469918251037598, + "learning_rate": 8.933487076452002e-06, + "loss": 0.9552, + "step": 5827 + }, + { + "epoch": 0.4709590092729146, + "grad_norm": 2.888056993484497, + "learning_rate": 8.93308307963592e-06, + "loss": 0.9379, + "step": 5828 + }, + { + "epoch": 0.47103981898624214, + "grad_norm": 2.7134406566619873, + "learning_rate": 8.932679015455594e-06, + "loss": 0.8655, + "step": 5829 + }, + { + "epoch": 0.47112062869956967, + "grad_norm": 2.543226718902588, + "learning_rate": 8.932274883917944e-06, + "loss": 1.1304, + "step": 5830 + }, + { + "epoch": 0.47120143841289724, + "grad_norm": 2.520210027694702, + "learning_rate": 8.931870685029891e-06, + "loss": 0.9515, + "step": 5831 + }, + { + "epoch": 0.47128224812622477, + "grad_norm": 2.9487199783325195, + "learning_rate": 8.931466418798357e-06, + "loss": 1.0249, + "step": 5832 + }, + { + "epoch": 0.4713630578395523, + "grad_norm": 2.8173553943634033, + "learning_rate": 8.93106208523027e-06, + "loss": 0.9164, + "step": 5833 + }, + { + "epoch": 0.47144386755287987, + "grad_norm": 2.3859004974365234, + "learning_rate": 8.930657684332555e-06, + "loss": 0.8828, + "step": 5834 + }, + { + "epoch": 0.4715246772662074, + "grad_norm": 2.8427581787109375, + "learning_rate": 8.930253216112135e-06, + "loss": 0.8944, + "step": 5835 + }, + { + "epoch": 0.4716054869795349, + "grad_norm": 2.501901865005493, + "learning_rate": 8.929848680575938e-06, + "loss": 1.0132, + "step": 5836 + }, + { + "epoch": 0.4716862966928625, + "grad_norm": 2.772751808166504, + "learning_rate": 8.929444077730894e-06, + "loss": 1.0731, + "step": 5837 + }, + { + "epoch": 0.47176710640619, + "grad_norm": 2.6158859729766846, + "learning_rate": 8.929039407583933e-06, + "loss": 0.998, + "step": 5838 + }, + { + "epoch": 0.47184791611951754, + "grad_norm": 2.7519757747650146, + "learning_rate": 8.928634670141987e-06, + "loss": 0.8945, + "step": 5839 + }, + { + "epoch": 0.4719287258328451, + "grad_norm": 2.8849856853485107, + "learning_rate": 8.928229865411986e-06, + "loss": 0.9832, + "step": 5840 + }, + { + "epoch": 0.47200953554617264, + "grad_norm": 2.746818780899048, + "learning_rate": 8.927824993400864e-06, + "loss": 1.0956, + "step": 5841 + }, + { + "epoch": 0.47209034525950017, + "grad_norm": 2.6034724712371826, + "learning_rate": 8.927420054115556e-06, + "loss": 0.9588, + "step": 5842 + }, + { + "epoch": 0.47217115497282774, + "grad_norm": 2.913607597351074, + "learning_rate": 8.927015047562998e-06, + "loss": 1.0609, + "step": 5843 + }, + { + "epoch": 0.47225196468615527, + "grad_norm": 2.7796878814697266, + "learning_rate": 8.926609973750125e-06, + "loss": 0.9498, + "step": 5844 + }, + { + "epoch": 0.4723327743994828, + "grad_norm": 2.9315896034240723, + "learning_rate": 8.926204832683876e-06, + "loss": 1.0, + "step": 5845 + }, + { + "epoch": 0.47241358411281037, + "grad_norm": 3.195502996444702, + "learning_rate": 8.92579962437119e-06, + "loss": 1.0401, + "step": 5846 + }, + { + "epoch": 0.4724943938261379, + "grad_norm": 3.0949106216430664, + "learning_rate": 8.925394348819008e-06, + "loss": 0.9566, + "step": 5847 + }, + { + "epoch": 0.4725752035394654, + "grad_norm": 2.629986047744751, + "learning_rate": 8.92498900603427e-06, + "loss": 1.0411, + "step": 5848 + }, + { + "epoch": 0.472656013252793, + "grad_norm": 2.9056739807128906, + "learning_rate": 8.924583596023921e-06, + "loss": 0.9203, + "step": 5849 + }, + { + "epoch": 0.4727368229661205, + "grad_norm": 3.0195114612579346, + "learning_rate": 8.924178118794902e-06, + "loss": 1.005, + "step": 5850 + }, + { + "epoch": 0.4728176326794481, + "grad_norm": 2.76839542388916, + "learning_rate": 8.923772574354159e-06, + "loss": 0.8678, + "step": 5851 + }, + { + "epoch": 0.4728984423927756, + "grad_norm": 2.5567920207977295, + "learning_rate": 8.923366962708639e-06, + "loss": 0.964, + "step": 5852 + }, + { + "epoch": 0.47297925210610314, + "grad_norm": 2.298448085784912, + "learning_rate": 8.922961283865285e-06, + "loss": 0.9843, + "step": 5853 + }, + { + "epoch": 0.4730600618194307, + "grad_norm": 2.7807490825653076, + "learning_rate": 8.922555537831053e-06, + "loss": 0.8826, + "step": 5854 + }, + { + "epoch": 0.47314087153275824, + "grad_norm": 2.535998582839966, + "learning_rate": 8.922149724612884e-06, + "loss": 1.0366, + "step": 5855 + }, + { + "epoch": 0.47322168124608577, + "grad_norm": 3.046980857849121, + "learning_rate": 8.921743844217734e-06, + "loss": 0.9763, + "step": 5856 + }, + { + "epoch": 0.47330249095941335, + "grad_norm": 2.562838077545166, + "learning_rate": 8.921337896652552e-06, + "loss": 0.9475, + "step": 5857 + }, + { + "epoch": 0.47338330067274087, + "grad_norm": 3.1323447227478027, + "learning_rate": 8.920931881924294e-06, + "loss": 1.0673, + "step": 5858 + }, + { + "epoch": 0.4734641103860684, + "grad_norm": 2.4061942100524902, + "learning_rate": 8.92052580003991e-06, + "loss": 0.9865, + "step": 5859 + }, + { + "epoch": 0.47354492009939597, + "grad_norm": 2.6887903213500977, + "learning_rate": 8.920119651006358e-06, + "loss": 1.0016, + "step": 5860 + }, + { + "epoch": 0.4736257298127235, + "grad_norm": 2.7243714332580566, + "learning_rate": 8.919713434830595e-06, + "loss": 1.0094, + "step": 5861 + }, + { + "epoch": 0.473706539526051, + "grad_norm": 2.7244873046875, + "learning_rate": 8.919307151519576e-06, + "loss": 0.8577, + "step": 5862 + }, + { + "epoch": 0.4737873492393786, + "grad_norm": 2.487475633621216, + "learning_rate": 8.91890080108026e-06, + "loss": 1.0437, + "step": 5863 + }, + { + "epoch": 0.4738681589527061, + "grad_norm": 2.6981966495513916, + "learning_rate": 8.91849438351961e-06, + "loss": 0.9076, + "step": 5864 + }, + { + "epoch": 0.47394896866603364, + "grad_norm": 2.518085479736328, + "learning_rate": 8.918087898844583e-06, + "loss": 0.8898, + "step": 5865 + }, + { + "epoch": 0.4740297783793612, + "grad_norm": 2.9908382892608643, + "learning_rate": 8.917681347062142e-06, + "loss": 0.9356, + "step": 5866 + }, + { + "epoch": 0.47411058809268875, + "grad_norm": 2.8750083446502686, + "learning_rate": 8.917274728179253e-06, + "loss": 0.8916, + "step": 5867 + }, + { + "epoch": 0.47419139780601627, + "grad_norm": 2.3316619396209717, + "learning_rate": 8.916868042202876e-06, + "loss": 0.9936, + "step": 5868 + }, + { + "epoch": 0.47427220751934385, + "grad_norm": 2.837986707687378, + "learning_rate": 8.91646128913998e-06, + "loss": 0.9301, + "step": 5869 + }, + { + "epoch": 0.47435301723267137, + "grad_norm": 2.7421464920043945, + "learning_rate": 8.916054468997532e-06, + "loss": 0.8151, + "step": 5870 + }, + { + "epoch": 0.4744338269459989, + "grad_norm": 2.631556272506714, + "learning_rate": 8.915647581782496e-06, + "loss": 0.9708, + "step": 5871 + }, + { + "epoch": 0.47451463665932647, + "grad_norm": 2.9722564220428467, + "learning_rate": 8.915240627501845e-06, + "loss": 0.9472, + "step": 5872 + }, + { + "epoch": 0.474595446372654, + "grad_norm": 2.663175106048584, + "learning_rate": 8.914833606162547e-06, + "loss": 1.0013, + "step": 5873 + }, + { + "epoch": 0.4746762560859815, + "grad_norm": 2.532076120376587, + "learning_rate": 8.914426517771574e-06, + "loss": 0.9357, + "step": 5874 + }, + { + "epoch": 0.4747570657993091, + "grad_norm": 2.539815902709961, + "learning_rate": 8.914019362335899e-06, + "loss": 1.0391, + "step": 5875 + }, + { + "epoch": 0.4748378755126366, + "grad_norm": 2.487412452697754, + "learning_rate": 8.913612139862495e-06, + "loss": 1.1073, + "step": 5876 + }, + { + "epoch": 0.47491868522596414, + "grad_norm": 2.6713690757751465, + "learning_rate": 8.913204850358337e-06, + "loss": 0.9434, + "step": 5877 + }, + { + "epoch": 0.4749994949392917, + "grad_norm": 2.9342000484466553, + "learning_rate": 8.912797493830399e-06, + "loss": 1.0425, + "step": 5878 + }, + { + "epoch": 0.47508030465261925, + "grad_norm": 2.4762911796569824, + "learning_rate": 8.91239007028566e-06, + "loss": 0.8584, + "step": 5879 + }, + { + "epoch": 0.47516111436594677, + "grad_norm": 2.757404327392578, + "learning_rate": 8.911982579731097e-06, + "loss": 0.8604, + "step": 5880 + }, + { + "epoch": 0.47524192407927435, + "grad_norm": 2.7208292484283447, + "learning_rate": 8.911575022173692e-06, + "loss": 0.9445, + "step": 5881 + }, + { + "epoch": 0.47532273379260187, + "grad_norm": 2.792088031768799, + "learning_rate": 8.911167397620423e-06, + "loss": 1.0644, + "step": 5882 + }, + { + "epoch": 0.4754035435059294, + "grad_norm": 2.9895739555358887, + "learning_rate": 8.910759706078273e-06, + "loss": 1.0276, + "step": 5883 + }, + { + "epoch": 0.47548435321925697, + "grad_norm": 2.301095962524414, + "learning_rate": 8.910351947554223e-06, + "loss": 1.0377, + "step": 5884 + }, + { + "epoch": 0.4755651629325845, + "grad_norm": 2.9060397148132324, + "learning_rate": 8.909944122055259e-06, + "loss": 0.9816, + "step": 5885 + }, + { + "epoch": 0.475645972645912, + "grad_norm": 2.6341824531555176, + "learning_rate": 8.909536229588362e-06, + "loss": 0.9556, + "step": 5886 + }, + { + "epoch": 0.4757267823592396, + "grad_norm": 2.6049411296844482, + "learning_rate": 8.909128270160522e-06, + "loss": 1.0321, + "step": 5887 + }, + { + "epoch": 0.4758075920725671, + "grad_norm": 2.5819296836853027, + "learning_rate": 8.90872024377873e-06, + "loss": 0.9837, + "step": 5888 + }, + { + "epoch": 0.47588840178589464, + "grad_norm": 2.6825339794158936, + "learning_rate": 8.908312150449965e-06, + "loss": 0.9934, + "step": 5889 + }, + { + "epoch": 0.4759692114992222, + "grad_norm": 2.4855387210845947, + "learning_rate": 8.907903990181224e-06, + "loss": 0.9458, + "step": 5890 + }, + { + "epoch": 0.47605002121254975, + "grad_norm": 2.2605373859405518, + "learning_rate": 8.907495762979495e-06, + "loss": 0.9588, + "step": 5891 + }, + { + "epoch": 0.47613083092587727, + "grad_norm": 3.199838161468506, + "learning_rate": 8.907087468851772e-06, + "loss": 0.8523, + "step": 5892 + }, + { + "epoch": 0.47621164063920485, + "grad_norm": 2.5428595542907715, + "learning_rate": 8.906679107805046e-06, + "loss": 0.929, + "step": 5893 + }, + { + "epoch": 0.47629245035253237, + "grad_norm": 2.752619743347168, + "learning_rate": 8.90627067984631e-06, + "loss": 0.9855, + "step": 5894 + }, + { + "epoch": 0.4763732600658599, + "grad_norm": 2.589021921157837, + "learning_rate": 8.905862184982561e-06, + "loss": 0.9226, + "step": 5895 + }, + { + "epoch": 0.4764540697791875, + "grad_norm": 2.7154488563537598, + "learning_rate": 8.905453623220797e-06, + "loss": 0.9373, + "step": 5896 + }, + { + "epoch": 0.476534879492515, + "grad_norm": 2.9029784202575684, + "learning_rate": 8.905044994568015e-06, + "loss": 1.1735, + "step": 5897 + }, + { + "epoch": 0.4766156892058425, + "grad_norm": 2.720885992050171, + "learning_rate": 8.904636299031212e-06, + "loss": 1.0186, + "step": 5898 + }, + { + "epoch": 0.4766964989191701, + "grad_norm": 2.810302257537842, + "learning_rate": 8.90422753661739e-06, + "loss": 0.8561, + "step": 5899 + }, + { + "epoch": 0.4767773086324976, + "grad_norm": 2.8350160121917725, + "learning_rate": 8.90381870733355e-06, + "loss": 0.9832, + "step": 5900 + }, + { + "epoch": 0.47685811834582514, + "grad_norm": 3.249141216278076, + "learning_rate": 8.903409811186694e-06, + "loss": 0.97, + "step": 5901 + }, + { + "epoch": 0.4769389280591527, + "grad_norm": 2.7863097190856934, + "learning_rate": 8.903000848183822e-06, + "loss": 0.9245, + "step": 5902 + }, + { + "epoch": 0.47701973777248025, + "grad_norm": 2.48960542678833, + "learning_rate": 8.902591818331944e-06, + "loss": 0.916, + "step": 5903 + }, + { + "epoch": 0.47710054748580777, + "grad_norm": 2.8880462646484375, + "learning_rate": 8.902182721638064e-06, + "loss": 0.9901, + "step": 5904 + }, + { + "epoch": 0.47718135719913535, + "grad_norm": 2.901137351989746, + "learning_rate": 8.901773558109185e-06, + "loss": 1.0175, + "step": 5905 + }, + { + "epoch": 0.47726216691246287, + "grad_norm": 2.259371280670166, + "learning_rate": 8.90136432775232e-06, + "loss": 1.0321, + "step": 5906 + }, + { + "epoch": 0.4773429766257904, + "grad_norm": 2.7619972229003906, + "learning_rate": 8.900955030574478e-06, + "loss": 0.9332, + "step": 5907 + }, + { + "epoch": 0.477423786339118, + "grad_norm": 2.9904873371124268, + "learning_rate": 8.900545666582665e-06, + "loss": 0.9263, + "step": 5908 + }, + { + "epoch": 0.4775045960524455, + "grad_norm": 2.4642465114593506, + "learning_rate": 8.900136235783896e-06, + "loss": 0.9581, + "step": 5909 + }, + { + "epoch": 0.477585405765773, + "grad_norm": 3.090346336364746, + "learning_rate": 8.899726738185182e-06, + "loss": 0.9824, + "step": 5910 + }, + { + "epoch": 0.4776662154791006, + "grad_norm": 2.9905292987823486, + "learning_rate": 8.899317173793537e-06, + "loss": 0.981, + "step": 5911 + }, + { + "epoch": 0.4777470251924281, + "grad_norm": 2.654024600982666, + "learning_rate": 8.898907542615975e-06, + "loss": 0.9052, + "step": 5912 + }, + { + "epoch": 0.4778278349057557, + "grad_norm": 2.6835696697235107, + "learning_rate": 8.898497844659515e-06, + "loss": 1.0652, + "step": 5913 + }, + { + "epoch": 0.4779086446190832, + "grad_norm": 2.551361560821533, + "learning_rate": 8.898088079931171e-06, + "loss": 0.9675, + "step": 5914 + }, + { + "epoch": 0.47798945433241075, + "grad_norm": 2.444711685180664, + "learning_rate": 8.897678248437965e-06, + "loss": 0.9254, + "step": 5915 + }, + { + "epoch": 0.4780702640457383, + "grad_norm": 2.4407832622528076, + "learning_rate": 8.89726835018691e-06, + "loss": 0.967, + "step": 5916 + }, + { + "epoch": 0.47815107375906585, + "grad_norm": 2.499941110610962, + "learning_rate": 8.896858385185032e-06, + "loss": 0.9801, + "step": 5917 + }, + { + "epoch": 0.47823188347239337, + "grad_norm": 2.9067742824554443, + "learning_rate": 8.896448353439352e-06, + "loss": 0.9183, + "step": 5918 + }, + { + "epoch": 0.47831269318572095, + "grad_norm": 2.7358031272888184, + "learning_rate": 8.896038254956892e-06, + "loss": 0.9574, + "step": 5919 + }, + { + "epoch": 0.4783935028990485, + "grad_norm": 2.9418129920959473, + "learning_rate": 8.895628089744674e-06, + "loss": 1.0692, + "step": 5920 + }, + { + "epoch": 0.478474312612376, + "grad_norm": 2.660916328430176, + "learning_rate": 8.895217857809728e-06, + "loss": 1.1609, + "step": 5921 + }, + { + "epoch": 0.4785551223257036, + "grad_norm": 2.3367667198181152, + "learning_rate": 8.894807559159075e-06, + "loss": 1.0068, + "step": 5922 + }, + { + "epoch": 0.4786359320390311, + "grad_norm": 2.8970179557800293, + "learning_rate": 8.894397193799747e-06, + "loss": 1.054, + "step": 5923 + }, + { + "epoch": 0.4787167417523586, + "grad_norm": 2.2323038578033447, + "learning_rate": 8.893986761738769e-06, + "loss": 1.0777, + "step": 5924 + }, + { + "epoch": 0.4787975514656862, + "grad_norm": 2.432058572769165, + "learning_rate": 8.893576262983173e-06, + "loss": 1.0511, + "step": 5925 + }, + { + "epoch": 0.4788783611790137, + "grad_norm": 2.663792848587036, + "learning_rate": 8.893165697539988e-06, + "loss": 1.0488, + "step": 5926 + }, + { + "epoch": 0.47895917089234125, + "grad_norm": 2.9636640548706055, + "learning_rate": 8.892755065416247e-06, + "loss": 1.048, + "step": 5927 + }, + { + "epoch": 0.4790399806056688, + "grad_norm": 2.5020740032196045, + "learning_rate": 8.892344366618985e-06, + "loss": 1.122, + "step": 5928 + }, + { + "epoch": 0.47912079031899635, + "grad_norm": 2.7484488487243652, + "learning_rate": 8.891933601155233e-06, + "loss": 0.9464, + "step": 5929 + }, + { + "epoch": 0.47920160003232387, + "grad_norm": 2.557628631591797, + "learning_rate": 8.891522769032029e-06, + "loss": 0.9269, + "step": 5930 + }, + { + "epoch": 0.47928240974565145, + "grad_norm": 2.434396266937256, + "learning_rate": 8.891111870256406e-06, + "loss": 0.9905, + "step": 5931 + }, + { + "epoch": 0.479363219458979, + "grad_norm": 2.8306679725646973, + "learning_rate": 8.890700904835405e-06, + "loss": 0.959, + "step": 5932 + }, + { + "epoch": 0.4794440291723065, + "grad_norm": 3.045585870742798, + "learning_rate": 8.890289872776066e-06, + "loss": 1.0857, + "step": 5933 + }, + { + "epoch": 0.4795248388856341, + "grad_norm": 2.8493340015411377, + "learning_rate": 8.889878774085425e-06, + "loss": 0.996, + "step": 5934 + }, + { + "epoch": 0.4796056485989616, + "grad_norm": 2.993567705154419, + "learning_rate": 8.889467608770526e-06, + "loss": 1.0702, + "step": 5935 + }, + { + "epoch": 0.4796864583122891, + "grad_norm": 2.2887275218963623, + "learning_rate": 8.88905637683841e-06, + "loss": 0.9916, + "step": 5936 + }, + { + "epoch": 0.4797672680256167, + "grad_norm": 2.474585771560669, + "learning_rate": 8.88864507829612e-06, + "loss": 1.0589, + "step": 5937 + }, + { + "epoch": 0.4798480777389442, + "grad_norm": 2.6525509357452393, + "learning_rate": 8.888233713150702e-06, + "loss": 1.0499, + "step": 5938 + }, + { + "epoch": 0.47992888745227175, + "grad_norm": 2.752514600753784, + "learning_rate": 8.887822281409202e-06, + "loss": 1.0005, + "step": 5939 + }, + { + "epoch": 0.4800096971655993, + "grad_norm": 2.515058755874634, + "learning_rate": 8.887410783078664e-06, + "loss": 0.9086, + "step": 5940 + }, + { + "epoch": 0.48009050687892685, + "grad_norm": 2.3969075679779053, + "learning_rate": 8.88699921816614e-06, + "loss": 0.9103, + "step": 5941 + }, + { + "epoch": 0.48017131659225437, + "grad_norm": 2.6233177185058594, + "learning_rate": 8.886587586678675e-06, + "loss": 1.0183, + "step": 5942 + }, + { + "epoch": 0.48025212630558195, + "grad_norm": 2.835303783416748, + "learning_rate": 8.886175888623323e-06, + "loss": 0.9628, + "step": 5943 + }, + { + "epoch": 0.4803329360189095, + "grad_norm": 3.0450258255004883, + "learning_rate": 8.885764124007132e-06, + "loss": 0.8766, + "step": 5944 + }, + { + "epoch": 0.480413745732237, + "grad_norm": 2.686596155166626, + "learning_rate": 8.885352292837157e-06, + "loss": 0.9347, + "step": 5945 + }, + { + "epoch": 0.4804945554455646, + "grad_norm": 2.518345355987549, + "learning_rate": 8.884940395120451e-06, + "loss": 1.0938, + "step": 5946 + }, + { + "epoch": 0.4805753651588921, + "grad_norm": 2.5679078102111816, + "learning_rate": 8.884528430864067e-06, + "loss": 1.114, + "step": 5947 + }, + { + "epoch": 0.4806561748722196, + "grad_norm": 2.5896337032318115, + "learning_rate": 8.884116400075064e-06, + "loss": 0.9511, + "step": 5948 + }, + { + "epoch": 0.4807369845855472, + "grad_norm": 2.400423288345337, + "learning_rate": 8.883704302760499e-06, + "loss": 0.9513, + "step": 5949 + }, + { + "epoch": 0.4808177942988747, + "grad_norm": 3.165663242340088, + "learning_rate": 8.883292138927427e-06, + "loss": 0.9374, + "step": 5950 + }, + { + "epoch": 0.48089860401220225, + "grad_norm": 3.0608668327331543, + "learning_rate": 8.88287990858291e-06, + "loss": 0.9592, + "step": 5951 + }, + { + "epoch": 0.4809794137255298, + "grad_norm": 2.5450081825256348, + "learning_rate": 8.882467611734006e-06, + "loss": 0.9344, + "step": 5952 + }, + { + "epoch": 0.48106022343885735, + "grad_norm": 2.836158037185669, + "learning_rate": 8.882055248387781e-06, + "loss": 0.8362, + "step": 5953 + }, + { + "epoch": 0.48114103315218487, + "grad_norm": 2.946566104888916, + "learning_rate": 8.881642818551295e-06, + "loss": 0.8943, + "step": 5954 + }, + { + "epoch": 0.48122184286551245, + "grad_norm": 2.8138973712921143, + "learning_rate": 8.881230322231612e-06, + "loss": 1.0198, + "step": 5955 + }, + { + "epoch": 0.48130265257884, + "grad_norm": 2.5750601291656494, + "learning_rate": 8.880817759435796e-06, + "loss": 1.0705, + "step": 5956 + }, + { + "epoch": 0.4813834622921675, + "grad_norm": 2.8209667205810547, + "learning_rate": 8.880405130170916e-06, + "loss": 1.1097, + "step": 5957 + }, + { + "epoch": 0.4814642720054951, + "grad_norm": 3.044922113418579, + "learning_rate": 8.879992434444037e-06, + "loss": 0.9697, + "step": 5958 + }, + { + "epoch": 0.4815450817188226, + "grad_norm": 2.602264642715454, + "learning_rate": 8.879579672262228e-06, + "loss": 1.042, + "step": 5959 + }, + { + "epoch": 0.4816258914321501, + "grad_norm": 2.904402017593384, + "learning_rate": 8.879166843632559e-06, + "loss": 0.9123, + "step": 5960 + }, + { + "epoch": 0.4817067011454777, + "grad_norm": 2.8899734020233154, + "learning_rate": 8.878753948562103e-06, + "loss": 0.9966, + "step": 5961 + }, + { + "epoch": 0.4817875108588052, + "grad_norm": 3.0337719917297363, + "learning_rate": 8.878340987057926e-06, + "loss": 0.9003, + "step": 5962 + }, + { + "epoch": 0.48186832057213275, + "grad_norm": 2.6741790771484375, + "learning_rate": 8.877927959127106e-06, + "loss": 1.1136, + "step": 5963 + }, + { + "epoch": 0.4819491302854603, + "grad_norm": 2.7310738563537598, + "learning_rate": 8.877514864776718e-06, + "loss": 0.9247, + "step": 5964 + }, + { + "epoch": 0.48202993999878785, + "grad_norm": 2.645674467086792, + "learning_rate": 8.877101704013832e-06, + "loss": 0.9974, + "step": 5965 + }, + { + "epoch": 0.48211074971211537, + "grad_norm": 2.878068208694458, + "learning_rate": 8.876688476845527e-06, + "loss": 1.1114, + "step": 5966 + }, + { + "epoch": 0.48219155942544295, + "grad_norm": 2.9921646118164062, + "learning_rate": 8.876275183278883e-06, + "loss": 0.9797, + "step": 5967 + }, + { + "epoch": 0.4822723691387705, + "grad_norm": 2.4352478981018066, + "learning_rate": 8.875861823320977e-06, + "loss": 0.8394, + "step": 5968 + }, + { + "epoch": 0.482353178852098, + "grad_norm": 2.6279549598693848, + "learning_rate": 8.87544839697889e-06, + "loss": 1.0402, + "step": 5969 + }, + { + "epoch": 0.4824339885654256, + "grad_norm": 2.7221219539642334, + "learning_rate": 8.8750349042597e-06, + "loss": 0.8392, + "step": 5970 + }, + { + "epoch": 0.4825147982787531, + "grad_norm": 2.767946481704712, + "learning_rate": 8.87462134517049e-06, + "loss": 0.9744, + "step": 5971 + }, + { + "epoch": 0.4825956079920806, + "grad_norm": 2.7020928859710693, + "learning_rate": 8.874207719718345e-06, + "loss": 0.8544, + "step": 5972 + }, + { + "epoch": 0.4826764177054082, + "grad_norm": 3.2032032012939453, + "learning_rate": 8.873794027910349e-06, + "loss": 0.8749, + "step": 5973 + }, + { + "epoch": 0.4827572274187357, + "grad_norm": 2.381051778793335, + "learning_rate": 8.873380269753586e-06, + "loss": 0.9447, + "step": 5974 + }, + { + "epoch": 0.48283803713206325, + "grad_norm": 2.5544629096984863, + "learning_rate": 8.872966445255144e-06, + "loss": 0.9809, + "step": 5975 + }, + { + "epoch": 0.4829188468453908, + "grad_norm": 2.5966579914093018, + "learning_rate": 8.872552554422111e-06, + "loss": 1.0221, + "step": 5976 + }, + { + "epoch": 0.48299965655871835, + "grad_norm": 2.7803614139556885, + "learning_rate": 8.872138597261578e-06, + "loss": 0.8829, + "step": 5977 + }, + { + "epoch": 0.4830804662720459, + "grad_norm": 2.755143880844116, + "learning_rate": 8.87172457378063e-06, + "loss": 0.9079, + "step": 5978 + }, + { + "epoch": 0.48316127598537345, + "grad_norm": 2.575796604156494, + "learning_rate": 8.871310483986359e-06, + "loss": 0.9363, + "step": 5979 + }, + { + "epoch": 0.483242085698701, + "grad_norm": 2.5812630653381348, + "learning_rate": 8.870896327885863e-06, + "loss": 0.8756, + "step": 5980 + }, + { + "epoch": 0.48332289541202855, + "grad_norm": 2.4501044750213623, + "learning_rate": 8.870482105486229e-06, + "loss": 1.0085, + "step": 5981 + }, + { + "epoch": 0.4834037051253561, + "grad_norm": 2.852572202682495, + "learning_rate": 8.870067816794557e-06, + "loss": 0.9326, + "step": 5982 + }, + { + "epoch": 0.4834845148386836, + "grad_norm": 2.9296329021453857, + "learning_rate": 8.869653461817937e-06, + "loss": 0.8954, + "step": 5983 + }, + { + "epoch": 0.4835653245520112, + "grad_norm": 3.000253915786743, + "learning_rate": 8.86923904056347e-06, + "loss": 1.0676, + "step": 5984 + }, + { + "epoch": 0.4836461342653387, + "grad_norm": 2.5141992568969727, + "learning_rate": 8.868824553038255e-06, + "loss": 0.8814, + "step": 5985 + }, + { + "epoch": 0.4837269439786662, + "grad_norm": 2.6345553398132324, + "learning_rate": 8.868409999249387e-06, + "loss": 0.8525, + "step": 5986 + }, + { + "epoch": 0.4838077536919938, + "grad_norm": 2.6637556552886963, + "learning_rate": 8.867995379203969e-06, + "loss": 1.0274, + "step": 5987 + }, + { + "epoch": 0.4838885634053213, + "grad_norm": 2.3642663955688477, + "learning_rate": 8.867580692909102e-06, + "loss": 0.938, + "step": 5988 + }, + { + "epoch": 0.48396937311864885, + "grad_norm": 2.8194892406463623, + "learning_rate": 8.867165940371888e-06, + "loss": 0.9672, + "step": 5989 + }, + { + "epoch": 0.4840501828319764, + "grad_norm": 3.4492805004119873, + "learning_rate": 8.866751121599432e-06, + "loss": 0.8927, + "step": 5990 + }, + { + "epoch": 0.48413099254530395, + "grad_norm": 2.597614049911499, + "learning_rate": 8.866336236598839e-06, + "loss": 0.8299, + "step": 5991 + }, + { + "epoch": 0.4842118022586315, + "grad_norm": 2.568424701690674, + "learning_rate": 8.865921285377214e-06, + "loss": 1.0524, + "step": 5992 + }, + { + "epoch": 0.48429261197195905, + "grad_norm": 3.424942970275879, + "learning_rate": 8.865506267941663e-06, + "loss": 0.9346, + "step": 5993 + }, + { + "epoch": 0.4843734216852866, + "grad_norm": 2.7126035690307617, + "learning_rate": 8.865091184299295e-06, + "loss": 0.9374, + "step": 5994 + }, + { + "epoch": 0.4844542313986141, + "grad_norm": 2.644007682800293, + "learning_rate": 8.864676034457222e-06, + "loss": 1.0067, + "step": 5995 + }, + { + "epoch": 0.4845350411119417, + "grad_norm": 2.539374828338623, + "learning_rate": 8.864260818422549e-06, + "loss": 0.9168, + "step": 5996 + }, + { + "epoch": 0.4846158508252692, + "grad_norm": 2.574723243713379, + "learning_rate": 8.863845536202394e-06, + "loss": 0.9339, + "step": 5997 + }, + { + "epoch": 0.4846966605385967, + "grad_norm": 2.717312812805176, + "learning_rate": 8.863430187803867e-06, + "loss": 0.9301, + "step": 5998 + }, + { + "epoch": 0.4847774702519243, + "grad_norm": 2.5925450325012207, + "learning_rate": 8.86301477323408e-06, + "loss": 0.9089, + "step": 5999 + }, + { + "epoch": 0.4848582799652518, + "grad_norm": 2.3955633640289307, + "learning_rate": 8.862599292500151e-06, + "loss": 0.9023, + "step": 6000 + }, + { + "epoch": 0.4848582799652518, + "eval_loss": 0.8077141046524048, + "eval_runtime": 812.2973, + "eval_samples_per_second": 102.63, + "eval_steps_per_second": 12.829, + "step": 6000 + }, + { + "epoch": 0.48493908967857935, + "grad_norm": 2.3458774089813232, + "learning_rate": 8.862183745609195e-06, + "loss": 0.9074, + "step": 6001 + }, + { + "epoch": 0.48501989939190693, + "grad_norm": 2.666863441467285, + "learning_rate": 8.861768132568327e-06, + "loss": 1.0235, + "step": 6002 + }, + { + "epoch": 0.48510070910523445, + "grad_norm": 2.8406612873077393, + "learning_rate": 8.86135245338467e-06, + "loss": 1.088, + "step": 6003 + }, + { + "epoch": 0.485181518818562, + "grad_norm": 2.791804075241089, + "learning_rate": 8.86093670806534e-06, + "loss": 0.9759, + "step": 6004 + }, + { + "epoch": 0.48526232853188955, + "grad_norm": 3.0780794620513916, + "learning_rate": 8.860520896617459e-06, + "loss": 1.0224, + "step": 6005 + }, + { + "epoch": 0.4853431382452171, + "grad_norm": 3.0164146423339844, + "learning_rate": 8.86010501904815e-06, + "loss": 0.8652, + "step": 6006 + }, + { + "epoch": 0.4854239479585446, + "grad_norm": 3.068770170211792, + "learning_rate": 8.859689075364535e-06, + "loss": 1.0249, + "step": 6007 + }, + { + "epoch": 0.4855047576718722, + "grad_norm": 2.6718451976776123, + "learning_rate": 8.859273065573736e-06, + "loss": 0.9633, + "step": 6008 + }, + { + "epoch": 0.4855855673851997, + "grad_norm": 2.410693645477295, + "learning_rate": 8.858856989682883e-06, + "loss": 0.967, + "step": 6009 + }, + { + "epoch": 0.4856663770985272, + "grad_norm": 2.8103816509246826, + "learning_rate": 8.858440847699097e-06, + "loss": 0.8983, + "step": 6010 + }, + { + "epoch": 0.4857471868118548, + "grad_norm": 2.8238930702209473, + "learning_rate": 8.85802463962951e-06, + "loss": 1.0157, + "step": 6011 + }, + { + "epoch": 0.4858279965251823, + "grad_norm": 2.3492908477783203, + "learning_rate": 8.857608365481247e-06, + "loss": 1.0542, + "step": 6012 + }, + { + "epoch": 0.48590880623850985, + "grad_norm": 2.9035751819610596, + "learning_rate": 8.85719202526144e-06, + "loss": 0.9303, + "step": 6013 + }, + { + "epoch": 0.48598961595183743, + "grad_norm": 2.6763007640838623, + "learning_rate": 8.85677561897722e-06, + "loss": 0.8904, + "step": 6014 + }, + { + "epoch": 0.48607042566516495, + "grad_norm": 2.706169605255127, + "learning_rate": 8.85635914663572e-06, + "loss": 1.0327, + "step": 6015 + }, + { + "epoch": 0.4861512353784925, + "grad_norm": 2.4542858600616455, + "learning_rate": 8.855942608244069e-06, + "loss": 0.9041, + "step": 6016 + }, + { + "epoch": 0.48623204509182005, + "grad_norm": 2.3222744464874268, + "learning_rate": 8.855526003809405e-06, + "loss": 1.0501, + "step": 6017 + }, + { + "epoch": 0.4863128548051476, + "grad_norm": 2.7569198608398438, + "learning_rate": 8.855109333338863e-06, + "loss": 0.9584, + "step": 6018 + }, + { + "epoch": 0.4863936645184751, + "grad_norm": 2.7721915245056152, + "learning_rate": 8.854692596839577e-06, + "loss": 1.0452, + "step": 6019 + }, + { + "epoch": 0.4864744742318027, + "grad_norm": 2.8172824382781982, + "learning_rate": 8.854275794318688e-06, + "loss": 1.1104, + "step": 6020 + }, + { + "epoch": 0.4865552839451302, + "grad_norm": 2.705747365951538, + "learning_rate": 8.853858925783334e-06, + "loss": 0.9695, + "step": 6021 + }, + { + "epoch": 0.4866360936584577, + "grad_norm": 2.884138822555542, + "learning_rate": 8.853441991240652e-06, + "loss": 0.8577, + "step": 6022 + }, + { + "epoch": 0.4867169033717853, + "grad_norm": 2.4505674839019775, + "learning_rate": 8.853024990697787e-06, + "loss": 0.9479, + "step": 6023 + }, + { + "epoch": 0.4867977130851128, + "grad_norm": 2.5882649421691895, + "learning_rate": 8.85260792416188e-06, + "loss": 1.0449, + "step": 6024 + }, + { + "epoch": 0.48687852279844035, + "grad_norm": 2.8596298694610596, + "learning_rate": 8.852190791640075e-06, + "loss": 0.9191, + "step": 6025 + }, + { + "epoch": 0.48695933251176793, + "grad_norm": 2.4536993503570557, + "learning_rate": 8.851773593139514e-06, + "loss": 1.0858, + "step": 6026 + }, + { + "epoch": 0.48704014222509545, + "grad_norm": 3.342395305633545, + "learning_rate": 8.851356328667343e-06, + "loss": 0.995, + "step": 6027 + }, + { + "epoch": 0.487120951938423, + "grad_norm": 2.5988099575042725, + "learning_rate": 8.850938998230711e-06, + "loss": 1.0465, + "step": 6028 + }, + { + "epoch": 0.48720176165175055, + "grad_norm": 2.6186883449554443, + "learning_rate": 8.850521601836765e-06, + "loss": 0.9064, + "step": 6029 + }, + { + "epoch": 0.4872825713650781, + "grad_norm": 3.1623666286468506, + "learning_rate": 8.850104139492655e-06, + "loss": 0.9117, + "step": 6030 + }, + { + "epoch": 0.4873633810784056, + "grad_norm": 2.673964500427246, + "learning_rate": 8.849686611205528e-06, + "loss": 1.0292, + "step": 6031 + }, + { + "epoch": 0.4874441907917332, + "grad_norm": 2.733144521713257, + "learning_rate": 8.849269016982537e-06, + "loss": 0.9927, + "step": 6032 + }, + { + "epoch": 0.4875250005050607, + "grad_norm": 2.493893623352051, + "learning_rate": 8.848851356830834e-06, + "loss": 0.9182, + "step": 6033 + }, + { + "epoch": 0.4876058102183882, + "grad_norm": 2.8255133628845215, + "learning_rate": 8.848433630757575e-06, + "loss": 1.0022, + "step": 6034 + }, + { + "epoch": 0.4876866199317158, + "grad_norm": 3.0941851139068604, + "learning_rate": 8.848015838769912e-06, + "loss": 0.9175, + "step": 6035 + }, + { + "epoch": 0.4877674296450433, + "grad_norm": 2.5062406063079834, + "learning_rate": 8.847597980875e-06, + "loss": 0.8803, + "step": 6036 + }, + { + "epoch": 0.48784823935837085, + "grad_norm": 2.7179102897644043, + "learning_rate": 8.84718005708e-06, + "loss": 0.901, + "step": 6037 + }, + { + "epoch": 0.48792904907169843, + "grad_norm": 2.4092929363250732, + "learning_rate": 8.846762067392065e-06, + "loss": 1.114, + "step": 6038 + }, + { + "epoch": 0.48800985878502595, + "grad_norm": 2.510439395904541, + "learning_rate": 8.846344011818357e-06, + "loss": 1.1026, + "step": 6039 + }, + { + "epoch": 0.4880906684983535, + "grad_norm": 2.695366621017456, + "learning_rate": 8.845925890366036e-06, + "loss": 1.1017, + "step": 6040 + }, + { + "epoch": 0.48817147821168105, + "grad_norm": 2.9203357696533203, + "learning_rate": 8.845507703042263e-06, + "loss": 0.9382, + "step": 6041 + }, + { + "epoch": 0.4882522879250086, + "grad_norm": 2.4055418968200684, + "learning_rate": 8.8450894498542e-06, + "loss": 1.0106, + "step": 6042 + }, + { + "epoch": 0.48833309763833616, + "grad_norm": 2.5554018020629883, + "learning_rate": 8.844671130809013e-06, + "loss": 1.0267, + "step": 6043 + }, + { + "epoch": 0.4884139073516637, + "grad_norm": 3.0125513076782227, + "learning_rate": 8.844252745913866e-06, + "loss": 0.9283, + "step": 6044 + }, + { + "epoch": 0.4884947170649912, + "grad_norm": 2.96811842918396, + "learning_rate": 8.843834295175921e-06, + "loss": 1.0757, + "step": 6045 + }, + { + "epoch": 0.4885755267783188, + "grad_norm": 2.77514910697937, + "learning_rate": 8.843415778602352e-06, + "loss": 0.8695, + "step": 6046 + }, + { + "epoch": 0.4886563364916463, + "grad_norm": 2.5683491230010986, + "learning_rate": 8.842997196200318e-06, + "loss": 0.9588, + "step": 6047 + }, + { + "epoch": 0.4887371462049738, + "grad_norm": 2.290745973587036, + "learning_rate": 8.842578547976998e-06, + "loss": 0.9693, + "step": 6048 + }, + { + "epoch": 0.4888179559183014, + "grad_norm": 2.7583298683166504, + "learning_rate": 8.842159833939557e-06, + "loss": 0.8398, + "step": 6049 + }, + { + "epoch": 0.48889876563162893, + "grad_norm": 2.3874542713165283, + "learning_rate": 8.841741054095167e-06, + "loss": 0.9163, + "step": 6050 + }, + { + "epoch": 0.48897957534495645, + "grad_norm": 2.66827392578125, + "learning_rate": 8.841322208451003e-06, + "loss": 0.9906, + "step": 6051 + }, + { + "epoch": 0.48906038505828403, + "grad_norm": 2.6189911365509033, + "learning_rate": 8.840903297014236e-06, + "loss": 0.9406, + "step": 6052 + }, + { + "epoch": 0.48914119477161155, + "grad_norm": 2.418142080307007, + "learning_rate": 8.840484319792042e-06, + "loss": 0.9687, + "step": 6053 + }, + { + "epoch": 0.4892220044849391, + "grad_norm": 2.5195775032043457, + "learning_rate": 8.840065276791598e-06, + "loss": 1.0489, + "step": 6054 + }, + { + "epoch": 0.48930281419826666, + "grad_norm": 2.4295156002044678, + "learning_rate": 8.83964616802008e-06, + "loss": 1.0801, + "step": 6055 + }, + { + "epoch": 0.4893836239115942, + "grad_norm": 3.047612428665161, + "learning_rate": 8.839226993484667e-06, + "loss": 0.87, + "step": 6056 + }, + { + "epoch": 0.4894644336249217, + "grad_norm": 2.496460437774658, + "learning_rate": 8.838807753192537e-06, + "loss": 1.0319, + "step": 6057 + }, + { + "epoch": 0.4895452433382493, + "grad_norm": 2.630291700363159, + "learning_rate": 8.838388447150872e-06, + "loss": 0.9838, + "step": 6058 + }, + { + "epoch": 0.4896260530515768, + "grad_norm": 2.7976157665252686, + "learning_rate": 8.837969075366855e-06, + "loss": 0.9455, + "step": 6059 + }, + { + "epoch": 0.4897068627649043, + "grad_norm": 2.8019280433654785, + "learning_rate": 8.837549637847665e-06, + "loss": 0.9119, + "step": 6060 + }, + { + "epoch": 0.4897876724782319, + "grad_norm": 2.3282132148742676, + "learning_rate": 8.837130134600489e-06, + "loss": 0.9218, + "step": 6061 + }, + { + "epoch": 0.48986848219155943, + "grad_norm": 2.7614595890045166, + "learning_rate": 8.83671056563251e-06, + "loss": 1.082, + "step": 6062 + }, + { + "epoch": 0.48994929190488695, + "grad_norm": 3.011939525604248, + "learning_rate": 8.836290930950918e-06, + "loss": 0.9681, + "step": 6063 + }, + { + "epoch": 0.49003010161821453, + "grad_norm": 3.2438557147979736, + "learning_rate": 8.835871230562899e-06, + "loss": 0.8931, + "step": 6064 + }, + { + "epoch": 0.49011091133154205, + "grad_norm": 2.61104679107666, + "learning_rate": 8.835451464475637e-06, + "loss": 0.9651, + "step": 6065 + }, + { + "epoch": 0.4901917210448696, + "grad_norm": 2.612506628036499, + "learning_rate": 8.835031632696328e-06, + "loss": 0.9069, + "step": 6066 + }, + { + "epoch": 0.49027253075819716, + "grad_norm": 2.769911050796509, + "learning_rate": 8.834611735232157e-06, + "loss": 0.9735, + "step": 6067 + }, + { + "epoch": 0.4903533404715247, + "grad_norm": 2.936298131942749, + "learning_rate": 8.83419177209032e-06, + "loss": 0.9326, + "step": 6068 + }, + { + "epoch": 0.4904341501848522, + "grad_norm": 2.64540696144104, + "learning_rate": 8.833771743278007e-06, + "loss": 0.9809, + "step": 6069 + }, + { + "epoch": 0.4905149598981798, + "grad_norm": 3.032196283340454, + "learning_rate": 8.833351648802413e-06, + "loss": 0.9453, + "step": 6070 + }, + { + "epoch": 0.4905957696115073, + "grad_norm": 2.9307849407196045, + "learning_rate": 8.832931488670735e-06, + "loss": 1.0201, + "step": 6071 + }, + { + "epoch": 0.4906765793248348, + "grad_norm": 2.5332212448120117, + "learning_rate": 8.832511262890169e-06, + "loss": 0.8267, + "step": 6072 + }, + { + "epoch": 0.4907573890381624, + "grad_norm": 2.430079460144043, + "learning_rate": 8.832090971467909e-06, + "loss": 0.956, + "step": 6073 + }, + { + "epoch": 0.49083819875148993, + "grad_norm": 2.6602840423583984, + "learning_rate": 8.831670614411157e-06, + "loss": 0.9903, + "step": 6074 + }, + { + "epoch": 0.49091900846481745, + "grad_norm": 3.0974788665771484, + "learning_rate": 8.831250191727112e-06, + "loss": 1.0623, + "step": 6075 + }, + { + "epoch": 0.49099981817814503, + "grad_norm": 2.710204601287842, + "learning_rate": 8.830829703422976e-06, + "loss": 0.9669, + "step": 6076 + }, + { + "epoch": 0.49108062789147255, + "grad_norm": 3.272197961807251, + "learning_rate": 8.830409149505947e-06, + "loss": 0.8662, + "step": 6077 + }, + { + "epoch": 0.4911614376048001, + "grad_norm": 2.55169415473938, + "learning_rate": 8.829988529983232e-06, + "loss": 0.96, + "step": 6078 + }, + { + "epoch": 0.49124224731812766, + "grad_norm": 2.7996485233306885, + "learning_rate": 8.829567844862033e-06, + "loss": 0.9724, + "step": 6079 + }, + { + "epoch": 0.4913230570314552, + "grad_norm": 2.742427110671997, + "learning_rate": 8.829147094149557e-06, + "loss": 1.0475, + "step": 6080 + }, + { + "epoch": 0.4914038667447827, + "grad_norm": 2.507977247238159, + "learning_rate": 8.82872627785301e-06, + "loss": 1.0049, + "step": 6081 + }, + { + "epoch": 0.4914846764581103, + "grad_norm": 2.621269702911377, + "learning_rate": 8.828305395979597e-06, + "loss": 0.8754, + "step": 6082 + }, + { + "epoch": 0.4915654861714378, + "grad_norm": 2.5403740406036377, + "learning_rate": 8.827884448536531e-06, + "loss": 0.9419, + "step": 6083 + }, + { + "epoch": 0.4916462958847653, + "grad_norm": 2.73575496673584, + "learning_rate": 8.827463435531018e-06, + "loss": 0.8716, + "step": 6084 + }, + { + "epoch": 0.4917271055980929, + "grad_norm": 2.153745412826538, + "learning_rate": 8.827042356970272e-06, + "loss": 0.9779, + "step": 6085 + }, + { + "epoch": 0.49180791531142043, + "grad_norm": 2.304722785949707, + "learning_rate": 8.826621212861504e-06, + "loss": 1.0142, + "step": 6086 + }, + { + "epoch": 0.49188872502474795, + "grad_norm": 2.536217212677002, + "learning_rate": 8.826200003211924e-06, + "loss": 0.8366, + "step": 6087 + }, + { + "epoch": 0.49196953473807553, + "grad_norm": 2.6360926628112793, + "learning_rate": 8.825778728028753e-06, + "loss": 0.8163, + "step": 6088 + }, + { + "epoch": 0.49205034445140305, + "grad_norm": 2.456071376800537, + "learning_rate": 8.8253573873192e-06, + "loss": 0.8741, + "step": 6089 + }, + { + "epoch": 0.4921311541647306, + "grad_norm": 2.8125364780426025, + "learning_rate": 8.824935981090485e-06, + "loss": 0.9358, + "step": 6090 + }, + { + "epoch": 0.49221196387805816, + "grad_norm": 2.7256171703338623, + "learning_rate": 8.824514509349824e-06, + "loss": 0.9223, + "step": 6091 + }, + { + "epoch": 0.4922927735913857, + "grad_norm": 2.9064652919769287, + "learning_rate": 8.824092972104437e-06, + "loss": 1.1134, + "step": 6092 + }, + { + "epoch": 0.4923735833047132, + "grad_norm": 2.519120454788208, + "learning_rate": 8.823671369361545e-06, + "loss": 1.0679, + "step": 6093 + }, + { + "epoch": 0.4924543930180408, + "grad_norm": 2.463228940963745, + "learning_rate": 8.823249701128366e-06, + "loss": 0.9919, + "step": 6094 + }, + { + "epoch": 0.4925352027313683, + "grad_norm": 3.5522258281707764, + "learning_rate": 8.822827967412123e-06, + "loss": 0.8532, + "step": 6095 + }, + { + "epoch": 0.4926160124446958, + "grad_norm": 2.7985095977783203, + "learning_rate": 8.82240616822004e-06, + "loss": 0.9936, + "step": 6096 + }, + { + "epoch": 0.4926968221580234, + "grad_norm": 3.1773319244384766, + "learning_rate": 8.821984303559343e-06, + "loss": 1.0097, + "step": 6097 + }, + { + "epoch": 0.49277763187135093, + "grad_norm": 3.1122496128082275, + "learning_rate": 8.821562373437256e-06, + "loss": 0.9816, + "step": 6098 + }, + { + "epoch": 0.49285844158467845, + "grad_norm": 3.0248422622680664, + "learning_rate": 8.821140377861005e-06, + "loss": 0.9274, + "step": 6099 + }, + { + "epoch": 0.49293925129800603, + "grad_norm": 2.9675469398498535, + "learning_rate": 8.820718316837818e-06, + "loss": 0.9304, + "step": 6100 + }, + { + "epoch": 0.49302006101133355, + "grad_norm": 2.454986572265625, + "learning_rate": 8.820296190374924e-06, + "loss": 1.0008, + "step": 6101 + }, + { + "epoch": 0.4931008707246611, + "grad_norm": 2.251755714416504, + "learning_rate": 8.819873998479554e-06, + "loss": 1.1383, + "step": 6102 + }, + { + "epoch": 0.49318168043798866, + "grad_norm": 3.004356622695923, + "learning_rate": 8.819451741158938e-06, + "loss": 1.0893, + "step": 6103 + }, + { + "epoch": 0.4932624901513162, + "grad_norm": 2.6525914669036865, + "learning_rate": 8.819029418420309e-06, + "loss": 0.9572, + "step": 6104 + }, + { + "epoch": 0.4933432998646437, + "grad_norm": 2.451263189315796, + "learning_rate": 8.8186070302709e-06, + "loss": 0.9217, + "step": 6105 + }, + { + "epoch": 0.4934241095779713, + "grad_norm": 3.0996081829071045, + "learning_rate": 8.818184576717945e-06, + "loss": 1.0521, + "step": 6106 + }, + { + "epoch": 0.4935049192912988, + "grad_norm": 2.725372314453125, + "learning_rate": 8.81776205776868e-06, + "loss": 0.9312, + "step": 6107 + }, + { + "epoch": 0.4935857290046264, + "grad_norm": 2.304767370223999, + "learning_rate": 8.817339473430342e-06, + "loss": 1.0237, + "step": 6108 + }, + { + "epoch": 0.4936665387179539, + "grad_norm": 2.740532636642456, + "learning_rate": 8.816916823710168e-06, + "loss": 0.9926, + "step": 6109 + }, + { + "epoch": 0.49374734843128143, + "grad_norm": 2.321873426437378, + "learning_rate": 8.8164941086154e-06, + "loss": 0.9443, + "step": 6110 + }, + { + "epoch": 0.493828158144609, + "grad_norm": 2.3607263565063477, + "learning_rate": 8.816071328153275e-06, + "loss": 1.0647, + "step": 6111 + }, + { + "epoch": 0.49390896785793653, + "grad_norm": 2.1246426105499268, + "learning_rate": 8.815648482331033e-06, + "loss": 1.006, + "step": 6112 + }, + { + "epoch": 0.49398977757126405, + "grad_norm": 2.4910242557525635, + "learning_rate": 8.81522557115592e-06, + "loss": 1.0039, + "step": 6113 + }, + { + "epoch": 0.49407058728459163, + "grad_norm": 3.2152459621429443, + "learning_rate": 8.814802594635177e-06, + "loss": 0.9698, + "step": 6114 + }, + { + "epoch": 0.49415139699791916, + "grad_norm": 2.749821186065674, + "learning_rate": 8.81437955277605e-06, + "loss": 1.0242, + "step": 6115 + }, + { + "epoch": 0.4942322067112467, + "grad_norm": 2.736800193786621, + "learning_rate": 8.813956445585784e-06, + "loss": 0.9153, + "step": 6116 + }, + { + "epoch": 0.49431301642457426, + "grad_norm": 3.20119309425354, + "learning_rate": 8.813533273071625e-06, + "loss": 0.9608, + "step": 6117 + }, + { + "epoch": 0.4943938261379018, + "grad_norm": 2.3296122550964355, + "learning_rate": 8.813110035240822e-06, + "loss": 0.975, + "step": 6118 + }, + { + "epoch": 0.4944746358512293, + "grad_norm": 2.2238399982452393, + "learning_rate": 8.812686732100623e-06, + "loss": 0.925, + "step": 6119 + }, + { + "epoch": 0.4945554455645569, + "grad_norm": 3.024690866470337, + "learning_rate": 8.81226336365828e-06, + "loss": 0.9206, + "step": 6120 + }, + { + "epoch": 0.4946362552778844, + "grad_norm": 2.7818827629089355, + "learning_rate": 8.811839929921045e-06, + "loss": 0.8784, + "step": 6121 + }, + { + "epoch": 0.49471706499121193, + "grad_norm": 2.881441354751587, + "learning_rate": 8.811416430896166e-06, + "loss": 0.9981, + "step": 6122 + }, + { + "epoch": 0.4947978747045395, + "grad_norm": 2.6555793285369873, + "learning_rate": 8.8109928665909e-06, + "loss": 0.9033, + "step": 6123 + }, + { + "epoch": 0.49487868441786703, + "grad_norm": 2.7477972507476807, + "learning_rate": 8.8105692370125e-06, + "loss": 0.8881, + "step": 6124 + }, + { + "epoch": 0.49495949413119456, + "grad_norm": 2.9489376544952393, + "learning_rate": 8.810145542168224e-06, + "loss": 0.9328, + "step": 6125 + }, + { + "epoch": 0.49504030384452213, + "grad_norm": 2.777998685836792, + "learning_rate": 8.809721782065326e-06, + "loss": 0.946, + "step": 6126 + }, + { + "epoch": 0.49512111355784966, + "grad_norm": 3.1331918239593506, + "learning_rate": 8.809297956711067e-06, + "loss": 0.9587, + "step": 6127 + }, + { + "epoch": 0.4952019232711772, + "grad_norm": 3.153442621231079, + "learning_rate": 8.808874066112702e-06, + "loss": 1.0215, + "step": 6128 + }, + { + "epoch": 0.49528273298450476, + "grad_norm": 2.8020200729370117, + "learning_rate": 8.808450110277497e-06, + "loss": 1.0223, + "step": 6129 + }, + { + "epoch": 0.4953635426978323, + "grad_norm": 2.3491687774658203, + "learning_rate": 8.808026089212707e-06, + "loss": 0.8781, + "step": 6130 + }, + { + "epoch": 0.4954443524111598, + "grad_norm": 2.952775478363037, + "learning_rate": 8.8076020029256e-06, + "loss": 0.8275, + "step": 6131 + }, + { + "epoch": 0.4955251621244874, + "grad_norm": 2.6388676166534424, + "learning_rate": 8.807177851423436e-06, + "loss": 1.0038, + "step": 6132 + }, + { + "epoch": 0.4956059718378149, + "grad_norm": 2.2663233280181885, + "learning_rate": 8.806753634713482e-06, + "loss": 1.0661, + "step": 6133 + }, + { + "epoch": 0.49568678155114243, + "grad_norm": 2.8605127334594727, + "learning_rate": 8.806329352803e-06, + "loss": 0.8744, + "step": 6134 + }, + { + "epoch": 0.49576759126447, + "grad_norm": 2.6653573513031006, + "learning_rate": 8.80590500569926e-06, + "loss": 0.987, + "step": 6135 + }, + { + "epoch": 0.49584840097779753, + "grad_norm": 2.4701249599456787, + "learning_rate": 8.805480593409532e-06, + "loss": 0.95, + "step": 6136 + }, + { + "epoch": 0.49592921069112506, + "grad_norm": 3.3145911693573, + "learning_rate": 8.805056115941081e-06, + "loss": 0.8803, + "step": 6137 + }, + { + "epoch": 0.49601002040445263, + "grad_norm": 3.1727418899536133, + "learning_rate": 8.804631573301179e-06, + "loss": 1.1275, + "step": 6138 + }, + { + "epoch": 0.49609083011778016, + "grad_norm": 2.9942028522491455, + "learning_rate": 8.8042069654971e-06, + "loss": 0.8496, + "step": 6139 + }, + { + "epoch": 0.4961716398311077, + "grad_norm": 2.2804789543151855, + "learning_rate": 8.80378229253611e-06, + "loss": 1.0268, + "step": 6140 + }, + { + "epoch": 0.49625244954443526, + "grad_norm": 2.989638328552246, + "learning_rate": 8.803357554425489e-06, + "loss": 0.9654, + "step": 6141 + }, + { + "epoch": 0.4963332592577628, + "grad_norm": 2.3394556045532227, + "learning_rate": 8.802932751172508e-06, + "loss": 0.9999, + "step": 6142 + }, + { + "epoch": 0.4964140689710903, + "grad_norm": 2.64165997505188, + "learning_rate": 8.802507882784444e-06, + "loss": 0.9509, + "step": 6143 + }, + { + "epoch": 0.4964948786844179, + "grad_norm": 2.64670729637146, + "learning_rate": 8.802082949268576e-06, + "loss": 1.0073, + "step": 6144 + }, + { + "epoch": 0.4965756883977454, + "grad_norm": 2.7997217178344727, + "learning_rate": 8.801657950632178e-06, + "loss": 0.8719, + "step": 6145 + }, + { + "epoch": 0.49665649811107293, + "grad_norm": 2.3776872158050537, + "learning_rate": 8.801232886882534e-06, + "loss": 0.9945, + "step": 6146 + }, + { + "epoch": 0.4967373078244005, + "grad_norm": 2.469510793685913, + "learning_rate": 8.80080775802692e-06, + "loss": 0.9354, + "step": 6147 + }, + { + "epoch": 0.49681811753772803, + "grad_norm": 3.029223918914795, + "learning_rate": 8.80038256407262e-06, + "loss": 0.9409, + "step": 6148 + }, + { + "epoch": 0.49689892725105556, + "grad_norm": 2.9083776473999023, + "learning_rate": 8.799957305026915e-06, + "loss": 0.8882, + "step": 6149 + }, + { + "epoch": 0.49697973696438313, + "grad_norm": 2.4205853939056396, + "learning_rate": 8.79953198089709e-06, + "loss": 0.9442, + "step": 6150 + }, + { + "epoch": 0.49706054667771066, + "grad_norm": 2.592292308807373, + "learning_rate": 8.799106591690427e-06, + "loss": 0.9678, + "step": 6151 + }, + { + "epoch": 0.4971413563910382, + "grad_norm": 2.641314744949341, + "learning_rate": 8.798681137414215e-06, + "loss": 0.8714, + "step": 6152 + }, + { + "epoch": 0.49722216610436576, + "grad_norm": 3.0298564434051514, + "learning_rate": 8.798255618075742e-06, + "loss": 1.0515, + "step": 6153 + }, + { + "epoch": 0.4973029758176933, + "grad_norm": 2.943699598312378, + "learning_rate": 8.797830033682293e-06, + "loss": 1.1118, + "step": 6154 + }, + { + "epoch": 0.4973837855310208, + "grad_norm": 2.3978610038757324, + "learning_rate": 8.79740438424116e-06, + "loss": 0.9146, + "step": 6155 + }, + { + "epoch": 0.4974645952443484, + "grad_norm": 3.341064214706421, + "learning_rate": 8.79697866975963e-06, + "loss": 0.9141, + "step": 6156 + }, + { + "epoch": 0.4975454049576759, + "grad_norm": 2.4609732627868652, + "learning_rate": 8.796552890244996e-06, + "loss": 1.1024, + "step": 6157 + }, + { + "epoch": 0.49762621467100343, + "grad_norm": 2.2896673679351807, + "learning_rate": 8.79612704570455e-06, + "loss": 0.893, + "step": 6158 + }, + { + "epoch": 0.497707024384331, + "grad_norm": 2.6929233074188232, + "learning_rate": 8.795701136145588e-06, + "loss": 0.9694, + "step": 6159 + }, + { + "epoch": 0.49778783409765853, + "grad_norm": 2.648365020751953, + "learning_rate": 8.795275161575404e-06, + "loss": 1.099, + "step": 6160 + }, + { + "epoch": 0.49786864381098606, + "grad_norm": 2.5364372730255127, + "learning_rate": 8.794849122001293e-06, + "loss": 0.8936, + "step": 6161 + }, + { + "epoch": 0.49794945352431363, + "grad_norm": 3.0253398418426514, + "learning_rate": 8.794423017430552e-06, + "loss": 1.0144, + "step": 6162 + }, + { + "epoch": 0.49803026323764116, + "grad_norm": 2.4398863315582275, + "learning_rate": 8.793996847870478e-06, + "loss": 0.9247, + "step": 6163 + }, + { + "epoch": 0.4981110729509687, + "grad_norm": 2.418994426727295, + "learning_rate": 8.793570613328373e-06, + "loss": 1.0, + "step": 6164 + }, + { + "epoch": 0.49819188266429626, + "grad_norm": 2.9876675605773926, + "learning_rate": 8.793144313811535e-06, + "loss": 1.0659, + "step": 6165 + }, + { + "epoch": 0.4982726923776238, + "grad_norm": 2.9962754249572754, + "learning_rate": 8.792717949327268e-06, + "loss": 0.8499, + "step": 6166 + }, + { + "epoch": 0.4983535020909513, + "grad_norm": 2.7783010005950928, + "learning_rate": 8.792291519882873e-06, + "loss": 0.9165, + "step": 6167 + }, + { + "epoch": 0.4984343118042789, + "grad_norm": 2.6788246631622314, + "learning_rate": 8.791865025485653e-06, + "loss": 0.899, + "step": 6168 + }, + { + "epoch": 0.4985151215176064, + "grad_norm": 3.0774552822113037, + "learning_rate": 8.791438466142915e-06, + "loss": 1.0353, + "step": 6169 + }, + { + "epoch": 0.49859593123093393, + "grad_norm": 2.8979735374450684, + "learning_rate": 8.791011841861961e-06, + "loss": 1.0816, + "step": 6170 + }, + { + "epoch": 0.4986767409442615, + "grad_norm": 2.9480981826782227, + "learning_rate": 8.790585152650102e-06, + "loss": 0.9721, + "step": 6171 + }, + { + "epoch": 0.49875755065758903, + "grad_norm": 2.411952257156372, + "learning_rate": 8.790158398514646e-06, + "loss": 0.9462, + "step": 6172 + }, + { + "epoch": 0.4988383603709166, + "grad_norm": 2.7789766788482666, + "learning_rate": 8.7897315794629e-06, + "loss": 0.9406, + "step": 6173 + }, + { + "epoch": 0.49891917008424413, + "grad_norm": 2.9420881271362305, + "learning_rate": 8.789304695502175e-06, + "loss": 0.9678, + "step": 6174 + }, + { + "epoch": 0.49899997979757166, + "grad_norm": 2.5352845191955566, + "learning_rate": 8.788877746639784e-06, + "loss": 0.8404, + "step": 6175 + }, + { + "epoch": 0.49908078951089924, + "grad_norm": 2.512543201446533, + "learning_rate": 8.788450732883037e-06, + "loss": 1.0372, + "step": 6176 + }, + { + "epoch": 0.49916159922422676, + "grad_norm": 2.5715396404266357, + "learning_rate": 8.78802365423925e-06, + "loss": 0.9153, + "step": 6177 + }, + { + "epoch": 0.4992424089375543, + "grad_norm": 2.588712692260742, + "learning_rate": 8.787596510715737e-06, + "loss": 0.9617, + "step": 6178 + }, + { + "epoch": 0.49932321865088186, + "grad_norm": 2.6835100650787354, + "learning_rate": 8.787169302319816e-06, + "loss": 0.9415, + "step": 6179 + }, + { + "epoch": 0.4994040283642094, + "grad_norm": 2.8224241733551025, + "learning_rate": 8.786742029058798e-06, + "loss": 0.9496, + "step": 6180 + }, + { + "epoch": 0.4994848380775369, + "grad_norm": 3.0012919902801514, + "learning_rate": 8.78631469094001e-06, + "loss": 0.9521, + "step": 6181 + }, + { + "epoch": 0.4995656477908645, + "grad_norm": 2.628826856613159, + "learning_rate": 8.785887287970764e-06, + "loss": 1.0576, + "step": 6182 + }, + { + "epoch": 0.499646457504192, + "grad_norm": 2.7362473011016846, + "learning_rate": 8.785459820158381e-06, + "loss": 0.9859, + "step": 6183 + }, + { + "epoch": 0.49972726721751953, + "grad_norm": 3.128894567489624, + "learning_rate": 8.785032287510188e-06, + "loss": 1.0043, + "step": 6184 + }, + { + "epoch": 0.4998080769308471, + "grad_norm": 2.8722219467163086, + "learning_rate": 8.784604690033503e-06, + "loss": 0.9722, + "step": 6185 + }, + { + "epoch": 0.49988888664417463, + "grad_norm": 2.324587821960449, + "learning_rate": 8.78417702773565e-06, + "loss": 0.9709, + "step": 6186 + }, + { + "epoch": 0.49996969635750216, + "grad_norm": 2.820197820663452, + "learning_rate": 8.783749300623954e-06, + "loss": 0.9434, + "step": 6187 + }, + { + "epoch": 0.5000505060708297, + "grad_norm": 2.7109031677246094, + "learning_rate": 8.783321508705744e-06, + "loss": 1.0395, + "step": 6188 + }, + { + "epoch": 0.5001313157841573, + "grad_norm": 2.534446954727173, + "learning_rate": 8.782893651988342e-06, + "loss": 0.8489, + "step": 6189 + }, + { + "epoch": 0.5002121254974848, + "grad_norm": 3.0825867652893066, + "learning_rate": 8.78246573047908e-06, + "loss": 1.0358, + "step": 6190 + }, + { + "epoch": 0.5002929352108123, + "grad_norm": 2.6846885681152344, + "learning_rate": 8.782037744185285e-06, + "loss": 0.961, + "step": 6191 + }, + { + "epoch": 0.5003737449241399, + "grad_norm": 2.543034076690674, + "learning_rate": 8.781609693114288e-06, + "loss": 0.9305, + "step": 6192 + }, + { + "epoch": 0.5004545546374675, + "grad_norm": 2.4893667697906494, + "learning_rate": 8.781181577273423e-06, + "loss": 0.8925, + "step": 6193 + }, + { + "epoch": 0.5005353643507949, + "grad_norm": 2.456136465072632, + "learning_rate": 8.780753396670019e-06, + "loss": 0.8987, + "step": 6194 + }, + { + "epoch": 0.5006161740641225, + "grad_norm": 2.920161247253418, + "learning_rate": 8.78032515131141e-06, + "loss": 0.9138, + "step": 6195 + }, + { + "epoch": 0.5006969837774501, + "grad_norm": 2.6189510822296143, + "learning_rate": 8.779896841204933e-06, + "loss": 0.9793, + "step": 6196 + }, + { + "epoch": 0.5007777934907776, + "grad_norm": 2.9476518630981445, + "learning_rate": 8.779468466357923e-06, + "loss": 0.9536, + "step": 6197 + }, + { + "epoch": 0.5008586032041051, + "grad_norm": 2.7596771717071533, + "learning_rate": 8.779040026777716e-06, + "loss": 1.036, + "step": 6198 + }, + { + "epoch": 0.5009394129174327, + "grad_norm": 2.8270766735076904, + "learning_rate": 8.778611522471653e-06, + "loss": 0.954, + "step": 6199 + }, + { + "epoch": 0.5010202226307602, + "grad_norm": 2.723810911178589, + "learning_rate": 8.77818295344707e-06, + "loss": 0.9046, + "step": 6200 + }, + { + "epoch": 0.5011010323440878, + "grad_norm": 2.7161903381347656, + "learning_rate": 8.777754319711309e-06, + "loss": 0.979, + "step": 6201 + }, + { + "epoch": 0.5011818420574153, + "grad_norm": 2.6303982734680176, + "learning_rate": 8.77732562127171e-06, + "loss": 1.0112, + "step": 6202 + }, + { + "epoch": 0.5012626517707428, + "grad_norm": 2.57110595703125, + "learning_rate": 8.776896858135618e-06, + "loss": 1.1584, + "step": 6203 + }, + { + "epoch": 0.5013434614840704, + "grad_norm": 2.7270331382751465, + "learning_rate": 8.776468030310375e-06, + "loss": 0.8973, + "step": 6204 + }, + { + "epoch": 0.501424271197398, + "grad_norm": 2.3623132705688477, + "learning_rate": 8.776039137803325e-06, + "loss": 0.9521, + "step": 6205 + }, + { + "epoch": 0.5015050809107254, + "grad_norm": 2.399603843688965, + "learning_rate": 8.775610180621816e-06, + "loss": 0.9881, + "step": 6206 + }, + { + "epoch": 0.501585890624053, + "grad_norm": 2.3874268531799316, + "learning_rate": 8.775181158773194e-06, + "loss": 1.0545, + "step": 6207 + }, + { + "epoch": 0.5016667003373806, + "grad_norm": 2.8882362842559814, + "learning_rate": 8.774752072264807e-06, + "loss": 1.0007, + "step": 6208 + }, + { + "epoch": 0.5017475100507081, + "grad_norm": 2.8591010570526123, + "learning_rate": 8.774322921104003e-06, + "loss": 0.919, + "step": 6209 + }, + { + "epoch": 0.5018283197640356, + "grad_norm": 2.402244806289673, + "learning_rate": 8.773893705298135e-06, + "loss": 0.8777, + "step": 6210 + }, + { + "epoch": 0.5019091294773632, + "grad_norm": 2.6203484535217285, + "learning_rate": 8.773464424854553e-06, + "loss": 0.9995, + "step": 6211 + }, + { + "epoch": 0.5019899391906907, + "grad_norm": 2.413853883743286, + "learning_rate": 8.773035079780612e-06, + "loss": 0.8767, + "step": 6212 + }, + { + "epoch": 0.5020707489040183, + "grad_norm": 2.554950714111328, + "learning_rate": 8.77260567008366e-06, + "loss": 0.9528, + "step": 6213 + }, + { + "epoch": 0.5021515586173458, + "grad_norm": 2.642920970916748, + "learning_rate": 8.772176195771056e-06, + "loss": 1.0379, + "step": 6214 + }, + { + "epoch": 0.5022323683306733, + "grad_norm": 2.720839023590088, + "learning_rate": 8.771746656850156e-06, + "loss": 1.0416, + "step": 6215 + }, + { + "epoch": 0.5023131780440009, + "grad_norm": 2.906888008117676, + "learning_rate": 8.771317053328313e-06, + "loss": 0.9075, + "step": 6216 + }, + { + "epoch": 0.5023939877573285, + "grad_norm": 2.529552698135376, + "learning_rate": 8.77088738521289e-06, + "loss": 0.9825, + "step": 6217 + }, + { + "epoch": 0.5024747974706559, + "grad_norm": 3.1397318840026855, + "learning_rate": 8.770457652511244e-06, + "loss": 0.9345, + "step": 6218 + }, + { + "epoch": 0.5025556071839835, + "grad_norm": 3.220461130142212, + "learning_rate": 8.770027855230737e-06, + "loss": 0.8796, + "step": 6219 + }, + { + "epoch": 0.5026364168973111, + "grad_norm": 2.9072651863098145, + "learning_rate": 8.769597993378728e-06, + "loss": 0.9466, + "step": 6220 + }, + { + "epoch": 0.5027172266106386, + "grad_norm": 2.3401410579681396, + "learning_rate": 8.769168066962577e-06, + "loss": 0.8891, + "step": 6221 + }, + { + "epoch": 0.5027980363239661, + "grad_norm": 3.1132638454437256, + "learning_rate": 8.768738075989654e-06, + "loss": 1.0079, + "step": 6222 + }, + { + "epoch": 0.5028788460372937, + "grad_norm": 2.5749056339263916, + "learning_rate": 8.76830802046732e-06, + "loss": 0.9403, + "step": 6223 + }, + { + "epoch": 0.5029596557506212, + "grad_norm": 2.546679735183716, + "learning_rate": 8.767877900402941e-06, + "loss": 0.8923, + "step": 6224 + }, + { + "epoch": 0.5030404654639488, + "grad_norm": 2.497368812561035, + "learning_rate": 8.767447715803885e-06, + "loss": 0.9375, + "step": 6225 + }, + { + "epoch": 0.5031212751772763, + "grad_norm": 2.727546453475952, + "learning_rate": 8.76701746667752e-06, + "loss": 1.0558, + "step": 6226 + }, + { + "epoch": 0.5032020848906038, + "grad_norm": 2.8279621601104736, + "learning_rate": 8.766587153031214e-06, + "loss": 0.8809, + "step": 6227 + }, + { + "epoch": 0.5032828946039314, + "grad_norm": 2.2303953170776367, + "learning_rate": 8.766156774872336e-06, + "loss": 0.8804, + "step": 6228 + }, + { + "epoch": 0.503363704317259, + "grad_norm": 3.113374948501587, + "learning_rate": 8.765726332208263e-06, + "loss": 0.9811, + "step": 6229 + }, + { + "epoch": 0.5034445140305864, + "grad_norm": 3.204686164855957, + "learning_rate": 8.765295825046359e-06, + "loss": 0.8846, + "step": 6230 + }, + { + "epoch": 0.503525323743914, + "grad_norm": 2.4541518688201904, + "learning_rate": 8.764865253394005e-06, + "loss": 1.006, + "step": 6231 + }, + { + "epoch": 0.5036061334572416, + "grad_norm": 2.583739995956421, + "learning_rate": 8.764434617258572e-06, + "loss": 1.0605, + "step": 6232 + }, + { + "epoch": 0.5036869431705691, + "grad_norm": 2.5617687702178955, + "learning_rate": 8.764003916647437e-06, + "loss": 0.9262, + "step": 6233 + }, + { + "epoch": 0.5037677528838966, + "grad_norm": 2.6554017066955566, + "learning_rate": 8.763573151567974e-06, + "loss": 0.911, + "step": 6234 + }, + { + "epoch": 0.5038485625972242, + "grad_norm": 2.6688547134399414, + "learning_rate": 8.763142322027567e-06, + "loss": 1.0141, + "step": 6235 + }, + { + "epoch": 0.5039293723105517, + "grad_norm": 2.570295810699463, + "learning_rate": 8.762711428033589e-06, + "loss": 0.9379, + "step": 6236 + }, + { + "epoch": 0.5040101820238793, + "grad_norm": 2.693105697631836, + "learning_rate": 8.762280469593422e-06, + "loss": 0.9292, + "step": 6237 + }, + { + "epoch": 0.5040909917372068, + "grad_norm": 2.7524890899658203, + "learning_rate": 8.76184944671445e-06, + "loss": 0.9364, + "step": 6238 + }, + { + "epoch": 0.5041718014505343, + "grad_norm": 2.74165940284729, + "learning_rate": 8.761418359404053e-06, + "loss": 0.9026, + "step": 6239 + }, + { + "epoch": 0.5042526111638619, + "grad_norm": 2.6603362560272217, + "learning_rate": 8.760987207669613e-06, + "loss": 0.996, + "step": 6240 + }, + { + "epoch": 0.5043334208771895, + "grad_norm": 2.772826671600342, + "learning_rate": 8.760555991518519e-06, + "loss": 1.0076, + "step": 6241 + }, + { + "epoch": 0.5044142305905169, + "grad_norm": 2.823110580444336, + "learning_rate": 8.760124710958151e-06, + "loss": 0.9509, + "step": 6242 + }, + { + "epoch": 0.5044950403038445, + "grad_norm": 2.750408172607422, + "learning_rate": 8.7596933659959e-06, + "loss": 0.9475, + "step": 6243 + }, + { + "epoch": 0.5045758500171721, + "grad_norm": 2.5096611976623535, + "learning_rate": 8.759261956639154e-06, + "loss": 1.0581, + "step": 6244 + }, + { + "epoch": 0.5046566597304996, + "grad_norm": 2.499408006668091, + "learning_rate": 8.7588304828953e-06, + "loss": 1.0803, + "step": 6245 + }, + { + "epoch": 0.5047374694438271, + "grad_norm": 2.963024139404297, + "learning_rate": 8.758398944771729e-06, + "loss": 0.9131, + "step": 6246 + }, + { + "epoch": 0.5048182791571547, + "grad_norm": 2.4039483070373535, + "learning_rate": 8.757967342275832e-06, + "loss": 0.9971, + "step": 6247 + }, + { + "epoch": 0.5048990888704822, + "grad_norm": 2.4837539196014404, + "learning_rate": 8.757535675415002e-06, + "loss": 0.9081, + "step": 6248 + }, + { + "epoch": 0.5049798985838098, + "grad_norm": 2.674001455307007, + "learning_rate": 8.75710394419663e-06, + "loss": 1.0918, + "step": 6249 + }, + { + "epoch": 0.5050607082971373, + "grad_norm": 2.8628764152526855, + "learning_rate": 8.756672148628113e-06, + "loss": 0.9556, + "step": 6250 + }, + { + "epoch": 0.5051415180104648, + "grad_norm": 3.219918727874756, + "learning_rate": 8.756240288716845e-06, + "loss": 0.9388, + "step": 6251 + }, + { + "epoch": 0.5052223277237924, + "grad_norm": 2.65590238571167, + "learning_rate": 8.755808364470226e-06, + "loss": 0.977, + "step": 6252 + }, + { + "epoch": 0.50530313743712, + "grad_norm": 2.408907651901245, + "learning_rate": 8.75537637589565e-06, + "loss": 1.1253, + "step": 6253 + }, + { + "epoch": 0.5053839471504474, + "grad_norm": 2.954531192779541, + "learning_rate": 8.754944323000516e-06, + "loss": 0.9744, + "step": 6254 + }, + { + "epoch": 0.505464756863775, + "grad_norm": 3.1942427158355713, + "learning_rate": 8.754512205792228e-06, + "loss": 0.8807, + "step": 6255 + }, + { + "epoch": 0.5055455665771026, + "grad_norm": 2.611370801925659, + "learning_rate": 8.754080024278184e-06, + "loss": 0.9943, + "step": 6256 + }, + { + "epoch": 0.5056263762904301, + "grad_norm": 2.339695692062378, + "learning_rate": 8.753647778465787e-06, + "loss": 0.8966, + "step": 6257 + }, + { + "epoch": 0.5057071860037576, + "grad_norm": 2.845473051071167, + "learning_rate": 8.753215468362437e-06, + "loss": 0.947, + "step": 6258 + }, + { + "epoch": 0.5057879957170852, + "grad_norm": 3.000074625015259, + "learning_rate": 8.752783093975545e-06, + "loss": 0.9843, + "step": 6259 + }, + { + "epoch": 0.5058688054304127, + "grad_norm": 2.9566633701324463, + "learning_rate": 8.75235065531251e-06, + "loss": 0.9323, + "step": 6260 + }, + { + "epoch": 0.5059496151437403, + "grad_norm": 2.8121120929718018, + "learning_rate": 8.751918152380745e-06, + "loss": 1.0533, + "step": 6261 + }, + { + "epoch": 0.5060304248570678, + "grad_norm": 2.6486778259277344, + "learning_rate": 8.751485585187653e-06, + "loss": 0.9038, + "step": 6262 + }, + { + "epoch": 0.5061112345703953, + "grad_norm": 2.8989672660827637, + "learning_rate": 8.751052953740644e-06, + "loss": 0.9592, + "step": 6263 + }, + { + "epoch": 0.5061920442837229, + "grad_norm": 2.752859592437744, + "learning_rate": 8.750620258047129e-06, + "loss": 0.8946, + "step": 6264 + }, + { + "epoch": 0.5062728539970505, + "grad_norm": 2.588942289352417, + "learning_rate": 8.750187498114517e-06, + "loss": 0.9424, + "step": 6265 + }, + { + "epoch": 0.5063536637103779, + "grad_norm": 2.766521453857422, + "learning_rate": 8.749754673950224e-06, + "loss": 1.0519, + "step": 6266 + }, + { + "epoch": 0.5064344734237055, + "grad_norm": 2.765406847000122, + "learning_rate": 8.749321785561657e-06, + "loss": 0.9219, + "step": 6267 + }, + { + "epoch": 0.5065152831370331, + "grad_norm": 2.811837911605835, + "learning_rate": 8.748888832956236e-06, + "loss": 1.0876, + "step": 6268 + }, + { + "epoch": 0.5065960928503607, + "grad_norm": 3.11313796043396, + "learning_rate": 8.748455816141374e-06, + "loss": 0.9263, + "step": 6269 + }, + { + "epoch": 0.5066769025636881, + "grad_norm": 2.6542294025421143, + "learning_rate": 8.74802273512449e-06, + "loss": 0.7961, + "step": 6270 + }, + { + "epoch": 0.5067577122770157, + "grad_norm": 2.3677380084991455, + "learning_rate": 8.747589589912995e-06, + "loss": 1.0259, + "step": 6271 + }, + { + "epoch": 0.5068385219903433, + "grad_norm": 2.921304225921631, + "learning_rate": 8.747156380514315e-06, + "loss": 1.0164, + "step": 6272 + }, + { + "epoch": 0.5069193317036708, + "grad_norm": 2.4303314685821533, + "learning_rate": 8.746723106935867e-06, + "loss": 0.9315, + "step": 6273 + }, + { + "epoch": 0.5070001414169983, + "grad_norm": 2.752507448196411, + "learning_rate": 8.746289769185073e-06, + "loss": 0.9311, + "step": 6274 + }, + { + "epoch": 0.5070809511303259, + "grad_norm": 3.3525657653808594, + "learning_rate": 8.745856367269352e-06, + "loss": 1.0205, + "step": 6275 + }, + { + "epoch": 0.5071617608436534, + "grad_norm": 2.606055974960327, + "learning_rate": 8.74542290119613e-06, + "loss": 0.941, + "step": 6276 + }, + { + "epoch": 0.507242570556981, + "grad_norm": 2.3623929023742676, + "learning_rate": 8.744989370972831e-06, + "loss": 0.991, + "step": 6277 + }, + { + "epoch": 0.5073233802703085, + "grad_norm": 3.1393396854400635, + "learning_rate": 8.744555776606879e-06, + "loss": 0.9919, + "step": 6278 + }, + { + "epoch": 0.507404189983636, + "grad_norm": 2.143432855606079, + "learning_rate": 8.744122118105702e-06, + "loss": 1.0498, + "step": 6279 + }, + { + "epoch": 0.5074849996969636, + "grad_norm": 2.5479798316955566, + "learning_rate": 8.743688395476726e-06, + "loss": 0.8121, + "step": 6280 + }, + { + "epoch": 0.5075658094102912, + "grad_norm": 2.4630141258239746, + "learning_rate": 8.74325460872738e-06, + "loss": 1.031, + "step": 6281 + }, + { + "epoch": 0.5076466191236186, + "grad_norm": 2.621180772781372, + "learning_rate": 8.742820757865094e-06, + "loss": 0.892, + "step": 6282 + }, + { + "epoch": 0.5077274288369462, + "grad_norm": 2.5219764709472656, + "learning_rate": 8.742386842897302e-06, + "loss": 1.064, + "step": 6283 + }, + { + "epoch": 0.5078082385502738, + "grad_norm": 2.7024383544921875, + "learning_rate": 8.741952863831429e-06, + "loss": 0.9999, + "step": 6284 + }, + { + "epoch": 0.5078890482636013, + "grad_norm": 3.3070693016052246, + "learning_rate": 8.741518820674912e-06, + "loss": 0.9797, + "step": 6285 + }, + { + "epoch": 0.5079698579769288, + "grad_norm": 2.8247246742248535, + "learning_rate": 8.741084713435187e-06, + "loss": 0.9545, + "step": 6286 + }, + { + "epoch": 0.5080506676902564, + "grad_norm": 3.0412349700927734, + "learning_rate": 8.740650542119686e-06, + "loss": 0.9883, + "step": 6287 + }, + { + "epoch": 0.5081314774035839, + "grad_norm": 2.3945281505584717, + "learning_rate": 8.740216306735847e-06, + "loss": 0.9806, + "step": 6288 + }, + { + "epoch": 0.5082122871169115, + "grad_norm": 2.5237603187561035, + "learning_rate": 8.739782007291107e-06, + "loss": 0.908, + "step": 6289 + }, + { + "epoch": 0.508293096830239, + "grad_norm": 2.7784175872802734, + "learning_rate": 8.739347643792904e-06, + "loss": 0.9167, + "step": 6290 + }, + { + "epoch": 0.5083739065435665, + "grad_norm": 2.8435075283050537, + "learning_rate": 8.738913216248678e-06, + "loss": 1.0108, + "step": 6291 + }, + { + "epoch": 0.5084547162568941, + "grad_norm": 2.382512092590332, + "learning_rate": 8.73847872466587e-06, + "loss": 1.0241, + "step": 6292 + }, + { + "epoch": 0.5085355259702217, + "grad_norm": 2.925574541091919, + "learning_rate": 8.73804416905192e-06, + "loss": 0.9335, + "step": 6293 + }, + { + "epoch": 0.5086163356835491, + "grad_norm": 2.9449074268341064, + "learning_rate": 8.737609549414274e-06, + "loss": 0.955, + "step": 6294 + }, + { + "epoch": 0.5086971453968767, + "grad_norm": 2.4350380897521973, + "learning_rate": 8.737174865760374e-06, + "loss": 0.9562, + "step": 6295 + }, + { + "epoch": 0.5087779551102043, + "grad_norm": 2.7059905529022217, + "learning_rate": 8.736740118097665e-06, + "loss": 1.0349, + "step": 6296 + }, + { + "epoch": 0.5088587648235318, + "grad_norm": 3.114983558654785, + "learning_rate": 8.736305306433595e-06, + "loss": 1.0846, + "step": 6297 + }, + { + "epoch": 0.5089395745368593, + "grad_norm": 3.2726974487304688, + "learning_rate": 8.735870430775609e-06, + "loss": 0.9581, + "step": 6298 + }, + { + "epoch": 0.5090203842501869, + "grad_norm": 2.6871676445007324, + "learning_rate": 8.735435491131155e-06, + "loss": 0.8718, + "step": 6299 + }, + { + "epoch": 0.5091011939635144, + "grad_norm": 2.542513608932495, + "learning_rate": 8.735000487507684e-06, + "loss": 0.929, + "step": 6300 + }, + { + "epoch": 0.509182003676842, + "grad_norm": 2.7970104217529297, + "learning_rate": 8.734565419912649e-06, + "loss": 0.9334, + "step": 6301 + }, + { + "epoch": 0.5092628133901695, + "grad_norm": 2.6967902183532715, + "learning_rate": 8.734130288353495e-06, + "loss": 1.0328, + "step": 6302 + }, + { + "epoch": 0.509343623103497, + "grad_norm": 2.9185562133789062, + "learning_rate": 8.733695092837681e-06, + "loss": 0.9334, + "step": 6303 + }, + { + "epoch": 0.5094244328168246, + "grad_norm": 2.716060161590576, + "learning_rate": 8.73325983337266e-06, + "loss": 1.0498, + "step": 6304 + }, + { + "epoch": 0.5095052425301522, + "grad_norm": 2.8402271270751953, + "learning_rate": 8.732824509965882e-06, + "loss": 1.0054, + "step": 6305 + }, + { + "epoch": 0.5095860522434796, + "grad_norm": 2.672893762588501, + "learning_rate": 8.732389122624809e-06, + "loss": 1.0344, + "step": 6306 + }, + { + "epoch": 0.5096668619568072, + "grad_norm": 2.799960136413574, + "learning_rate": 8.731953671356895e-06, + "loss": 0.9002, + "step": 6307 + }, + { + "epoch": 0.5097476716701348, + "grad_norm": 2.4315521717071533, + "learning_rate": 8.7315181561696e-06, + "loss": 0.9639, + "step": 6308 + }, + { + "epoch": 0.5098284813834623, + "grad_norm": 2.9446537494659424, + "learning_rate": 8.73108257707038e-06, + "loss": 0.9541, + "step": 6309 + }, + { + "epoch": 0.5099092910967898, + "grad_norm": 2.6569461822509766, + "learning_rate": 8.730646934066699e-06, + "loss": 0.9573, + "step": 6310 + }, + { + "epoch": 0.5099901008101174, + "grad_norm": 3.246030807495117, + "learning_rate": 8.730211227166017e-06, + "loss": 1.0095, + "step": 6311 + }, + { + "epoch": 0.5100709105234449, + "grad_norm": 2.408226490020752, + "learning_rate": 8.729775456375798e-06, + "loss": 0.9738, + "step": 6312 + }, + { + "epoch": 0.5101517202367725, + "grad_norm": 3.040785789489746, + "learning_rate": 8.729339621703502e-06, + "loss": 0.9502, + "step": 6313 + }, + { + "epoch": 0.5102325299501, + "grad_norm": 2.7272884845733643, + "learning_rate": 8.728903723156598e-06, + "loss": 0.9586, + "step": 6314 + }, + { + "epoch": 0.5103133396634275, + "grad_norm": 2.6391963958740234, + "learning_rate": 8.72846776074255e-06, + "loss": 0.9756, + "step": 6315 + }, + { + "epoch": 0.5103941493767551, + "grad_norm": 2.7884223461151123, + "learning_rate": 8.728031734468825e-06, + "loss": 0.9851, + "step": 6316 + }, + { + "epoch": 0.5104749590900827, + "grad_norm": 2.6884872913360596, + "learning_rate": 8.727595644342892e-06, + "loss": 0.8943, + "step": 6317 + }, + { + "epoch": 0.5105557688034101, + "grad_norm": 2.61759614944458, + "learning_rate": 8.72715949037222e-06, + "loss": 1.0132, + "step": 6318 + }, + { + "epoch": 0.5106365785167377, + "grad_norm": 2.98608660697937, + "learning_rate": 8.726723272564274e-06, + "loss": 0.9991, + "step": 6319 + }, + { + "epoch": 0.5107173882300653, + "grad_norm": 2.5214684009552, + "learning_rate": 8.726286990926537e-06, + "loss": 0.979, + "step": 6320 + }, + { + "epoch": 0.5107981979433928, + "grad_norm": 3.056746482849121, + "learning_rate": 8.725850645466469e-06, + "loss": 0.9909, + "step": 6321 + }, + { + "epoch": 0.5108790076567203, + "grad_norm": 2.549048662185669, + "learning_rate": 8.725414236191552e-06, + "loss": 0.9043, + "step": 6322 + }, + { + "epoch": 0.5109598173700479, + "grad_norm": 3.2244904041290283, + "learning_rate": 8.724977763109256e-06, + "loss": 0.8554, + "step": 6323 + }, + { + "epoch": 0.5110406270833754, + "grad_norm": 2.591453790664673, + "learning_rate": 8.724541226227059e-06, + "loss": 0.9035, + "step": 6324 + }, + { + "epoch": 0.511121436796703, + "grad_norm": 2.7734375, + "learning_rate": 8.724104625552437e-06, + "loss": 0.9175, + "step": 6325 + }, + { + "epoch": 0.5112022465100305, + "grad_norm": 2.62668776512146, + "learning_rate": 8.72366796109287e-06, + "loss": 0.9148, + "step": 6326 + }, + { + "epoch": 0.511283056223358, + "grad_norm": 2.4754507541656494, + "learning_rate": 8.723231232855833e-06, + "loss": 1.0398, + "step": 6327 + }, + { + "epoch": 0.5113638659366856, + "grad_norm": 2.5738584995269775, + "learning_rate": 8.72279444084881e-06, + "loss": 0.9858, + "step": 6328 + }, + { + "epoch": 0.5114446756500132, + "grad_norm": 2.735138416290283, + "learning_rate": 8.72235758507928e-06, + "loss": 0.9875, + "step": 6329 + }, + { + "epoch": 0.5115254853633406, + "grad_norm": 2.754800796508789, + "learning_rate": 8.721920665554724e-06, + "loss": 0.8941, + "step": 6330 + }, + { + "epoch": 0.5116062950766682, + "grad_norm": 2.9557249546051025, + "learning_rate": 8.721483682282628e-06, + "loss": 0.8747, + "step": 6331 + }, + { + "epoch": 0.5116871047899958, + "grad_norm": 2.5468697547912598, + "learning_rate": 8.721046635270478e-06, + "loss": 0.8815, + "step": 6332 + }, + { + "epoch": 0.5117679145033233, + "grad_norm": 2.746835947036743, + "learning_rate": 8.720609524525754e-06, + "loss": 1.001, + "step": 6333 + }, + { + "epoch": 0.5118487242166508, + "grad_norm": 3.068387985229492, + "learning_rate": 8.720172350055947e-06, + "loss": 0.9632, + "step": 6334 + }, + { + "epoch": 0.5119295339299784, + "grad_norm": 3.0410642623901367, + "learning_rate": 8.719735111868544e-06, + "loss": 0.9902, + "step": 6335 + }, + { + "epoch": 0.5120103436433059, + "grad_norm": 3.010000467300415, + "learning_rate": 8.719297809971034e-06, + "loss": 1.0023, + "step": 6336 + }, + { + "epoch": 0.5120911533566335, + "grad_norm": 2.6686947345733643, + "learning_rate": 8.718860444370905e-06, + "loss": 0.896, + "step": 6337 + }, + { + "epoch": 0.512171963069961, + "grad_norm": 2.974932909011841, + "learning_rate": 8.71842301507565e-06, + "loss": 0.8491, + "step": 6338 + }, + { + "epoch": 0.5122527727832885, + "grad_norm": 3.2390475273132324, + "learning_rate": 8.71798552209276e-06, + "loss": 0.9092, + "step": 6339 + }, + { + "epoch": 0.5123335824966161, + "grad_norm": 2.7157886028289795, + "learning_rate": 8.71754796542973e-06, + "loss": 0.9994, + "step": 6340 + }, + { + "epoch": 0.5124143922099437, + "grad_norm": 3.2451345920562744, + "learning_rate": 8.717110345094053e-06, + "loss": 0.9487, + "step": 6341 + }, + { + "epoch": 0.5124952019232711, + "grad_norm": 2.884882926940918, + "learning_rate": 8.716672661093222e-06, + "loss": 1.073, + "step": 6342 + }, + { + "epoch": 0.5125760116365987, + "grad_norm": 2.51513409614563, + "learning_rate": 8.716234913434738e-06, + "loss": 1.0276, + "step": 6343 + }, + { + "epoch": 0.5126568213499263, + "grad_norm": 2.5609960556030273, + "learning_rate": 8.715797102126096e-06, + "loss": 0.9558, + "step": 6344 + }, + { + "epoch": 0.5127376310632538, + "grad_norm": 2.093696355819702, + "learning_rate": 8.715359227174795e-06, + "loss": 1.0238, + "step": 6345 + }, + { + "epoch": 0.5128184407765813, + "grad_norm": 2.7014880180358887, + "learning_rate": 8.714921288588334e-06, + "loss": 0.8161, + "step": 6346 + }, + { + "epoch": 0.5128992504899089, + "grad_norm": 2.665351390838623, + "learning_rate": 8.714483286374216e-06, + "loss": 1.0244, + "step": 6347 + }, + { + "epoch": 0.5129800602032364, + "grad_norm": 2.641115665435791, + "learning_rate": 8.714045220539939e-06, + "loss": 0.9913, + "step": 6348 + }, + { + "epoch": 0.513060869916564, + "grad_norm": 2.4953019618988037, + "learning_rate": 8.713607091093011e-06, + "loss": 0.9176, + "step": 6349 + }, + { + "epoch": 0.5131416796298915, + "grad_norm": 3.101696014404297, + "learning_rate": 8.713168898040933e-06, + "loss": 0.8918, + "step": 6350 + }, + { + "epoch": 0.513222489343219, + "grad_norm": 3.100341320037842, + "learning_rate": 8.712730641391212e-06, + "loss": 0.8482, + "step": 6351 + }, + { + "epoch": 0.5133032990565466, + "grad_norm": 2.8223681449890137, + "learning_rate": 8.712292321151352e-06, + "loss": 0.9577, + "step": 6352 + }, + { + "epoch": 0.5133841087698742, + "grad_norm": 2.632502794265747, + "learning_rate": 8.711853937328862e-06, + "loss": 0.9707, + "step": 6353 + }, + { + "epoch": 0.5134649184832016, + "grad_norm": 2.548308849334717, + "learning_rate": 8.71141548993125e-06, + "loss": 0.9325, + "step": 6354 + }, + { + "epoch": 0.5135457281965292, + "grad_norm": 2.7708146572113037, + "learning_rate": 8.710976978966024e-06, + "loss": 0.9812, + "step": 6355 + }, + { + "epoch": 0.5136265379098568, + "grad_norm": 2.337926149368286, + "learning_rate": 8.710538404440697e-06, + "loss": 0.9608, + "step": 6356 + }, + { + "epoch": 0.5137073476231843, + "grad_norm": 2.679847240447998, + "learning_rate": 8.71009976636278e-06, + "loss": 0.9442, + "step": 6357 + }, + { + "epoch": 0.5137881573365118, + "grad_norm": 2.399090528488159, + "learning_rate": 8.709661064739786e-06, + "loss": 0.9894, + "step": 6358 + }, + { + "epoch": 0.5138689670498394, + "grad_norm": 2.6882615089416504, + "learning_rate": 8.70922229957923e-06, + "loss": 0.9342, + "step": 6359 + }, + { + "epoch": 0.5139497767631669, + "grad_norm": 3.1053476333618164, + "learning_rate": 8.708783470888621e-06, + "loss": 0.9774, + "step": 6360 + }, + { + "epoch": 0.5140305864764945, + "grad_norm": 2.697941780090332, + "learning_rate": 8.708344578675486e-06, + "loss": 0.9726, + "step": 6361 + }, + { + "epoch": 0.514111396189822, + "grad_norm": 3.2440953254699707, + "learning_rate": 8.70790562294733e-06, + "loss": 1.0524, + "step": 6362 + }, + { + "epoch": 0.5141922059031495, + "grad_norm": 2.412670135498047, + "learning_rate": 8.70746660371168e-06, + "loss": 0.9276, + "step": 6363 + }, + { + "epoch": 0.5142730156164771, + "grad_norm": 2.728304624557495, + "learning_rate": 8.707027520976053e-06, + "loss": 0.9257, + "step": 6364 + }, + { + "epoch": 0.5143538253298047, + "grad_norm": 2.4933555126190186, + "learning_rate": 8.706588374747967e-06, + "loss": 0.8722, + "step": 6365 + }, + { + "epoch": 0.5144346350431321, + "grad_norm": 2.3907010555267334, + "learning_rate": 8.706149165034948e-06, + "loss": 0.9703, + "step": 6366 + }, + { + "epoch": 0.5145154447564597, + "grad_norm": 2.875530958175659, + "learning_rate": 8.705709891844514e-06, + "loss": 0.9495, + "step": 6367 + }, + { + "epoch": 0.5145962544697873, + "grad_norm": 2.625635862350464, + "learning_rate": 8.70527055518419e-06, + "loss": 0.8828, + "step": 6368 + }, + { + "epoch": 0.5146770641831148, + "grad_norm": 2.691448211669922, + "learning_rate": 8.704831155061504e-06, + "loss": 0.9231, + "step": 6369 + }, + { + "epoch": 0.5147578738964423, + "grad_norm": 2.423642873764038, + "learning_rate": 8.704391691483977e-06, + "loss": 0.9891, + "step": 6370 + }, + { + "epoch": 0.5148386836097699, + "grad_norm": 2.7718937397003174, + "learning_rate": 8.70395216445914e-06, + "loss": 1.0474, + "step": 6371 + }, + { + "epoch": 0.5149194933230974, + "grad_norm": 2.699124574661255, + "learning_rate": 8.703512573994516e-06, + "loss": 0.9036, + "step": 6372 + }, + { + "epoch": 0.515000303036425, + "grad_norm": 2.8082897663116455, + "learning_rate": 8.703072920097641e-06, + "loss": 0.9693, + "step": 6373 + }, + { + "epoch": 0.5150811127497525, + "grad_norm": 2.2477915287017822, + "learning_rate": 8.702633202776041e-06, + "loss": 0.9756, + "step": 6374 + }, + { + "epoch": 0.51516192246308, + "grad_norm": 2.8851675987243652, + "learning_rate": 8.702193422037248e-06, + "loss": 0.926, + "step": 6375 + }, + { + "epoch": 0.5152427321764076, + "grad_norm": 2.42008376121521, + "learning_rate": 8.701753577888792e-06, + "loss": 0.9037, + "step": 6376 + }, + { + "epoch": 0.5153235418897352, + "grad_norm": 2.554056167602539, + "learning_rate": 8.701313670338212e-06, + "loss": 0.9475, + "step": 6377 + }, + { + "epoch": 0.5154043516030626, + "grad_norm": 3.2706005573272705, + "learning_rate": 8.700873699393037e-06, + "loss": 1.0666, + "step": 6378 + }, + { + "epoch": 0.5154851613163902, + "grad_norm": 2.3901360034942627, + "learning_rate": 8.700433665060806e-06, + "loss": 0.9394, + "step": 6379 + }, + { + "epoch": 0.5155659710297178, + "grad_norm": 2.8380510807037354, + "learning_rate": 8.699993567349055e-06, + "loss": 0.9402, + "step": 6380 + }, + { + "epoch": 0.5156467807430453, + "grad_norm": 2.676969528198242, + "learning_rate": 8.699553406265321e-06, + "loss": 0.9905, + "step": 6381 + }, + { + "epoch": 0.5157275904563728, + "grad_norm": 2.6981847286224365, + "learning_rate": 8.699113181817145e-06, + "loss": 0.8519, + "step": 6382 + }, + { + "epoch": 0.5158084001697004, + "grad_norm": 2.890380382537842, + "learning_rate": 8.698672894012063e-06, + "loss": 0.9418, + "step": 6383 + }, + { + "epoch": 0.5158892098830279, + "grad_norm": 2.671825408935547, + "learning_rate": 8.69823254285762e-06, + "loss": 0.9275, + "step": 6384 + }, + { + "epoch": 0.5159700195963555, + "grad_norm": 2.573758602142334, + "learning_rate": 8.697792128361358e-06, + "loss": 1.0287, + "step": 6385 + }, + { + "epoch": 0.516050829309683, + "grad_norm": 2.424605131149292, + "learning_rate": 8.697351650530816e-06, + "loss": 0.9835, + "step": 6386 + }, + { + "epoch": 0.5161316390230105, + "grad_norm": 2.4024574756622314, + "learning_rate": 8.696911109373544e-06, + "loss": 0.9429, + "step": 6387 + }, + { + "epoch": 0.5162124487363381, + "grad_norm": 2.7372288703918457, + "learning_rate": 8.696470504897084e-06, + "loss": 0.986, + "step": 6388 + }, + { + "epoch": 0.5162932584496657, + "grad_norm": 2.214829683303833, + "learning_rate": 8.696029837108981e-06, + "loss": 0.9655, + "step": 6389 + }, + { + "epoch": 0.5163740681629931, + "grad_norm": 2.0110013484954834, + "learning_rate": 8.695589106016787e-06, + "loss": 0.9664, + "step": 6390 + }, + { + "epoch": 0.5164548778763207, + "grad_norm": 2.6689956188201904, + "learning_rate": 8.695148311628047e-06, + "loss": 0.9042, + "step": 6391 + }, + { + "epoch": 0.5165356875896483, + "grad_norm": 3.2499120235443115, + "learning_rate": 8.694707453950312e-06, + "loss": 1.0567, + "step": 6392 + }, + { + "epoch": 0.5166164973029758, + "grad_norm": 2.2970924377441406, + "learning_rate": 8.694266532991133e-06, + "loss": 0.9115, + "step": 6393 + }, + { + "epoch": 0.5166973070163033, + "grad_norm": 2.5825273990631104, + "learning_rate": 8.693825548758064e-06, + "loss": 0.9039, + "step": 6394 + }, + { + "epoch": 0.5167781167296309, + "grad_norm": 2.4262425899505615, + "learning_rate": 8.693384501258653e-06, + "loss": 1.1179, + "step": 6395 + }, + { + "epoch": 0.5168589264429584, + "grad_norm": 2.5739622116088867, + "learning_rate": 8.69294339050046e-06, + "loss": 0.9481, + "step": 6396 + }, + { + "epoch": 0.516939736156286, + "grad_norm": 2.987832546234131, + "learning_rate": 8.692502216491034e-06, + "loss": 0.968, + "step": 6397 + }, + { + "epoch": 0.5170205458696135, + "grad_norm": 2.867089033126831, + "learning_rate": 8.692060979237936e-06, + "loss": 1.1234, + "step": 6398 + }, + { + "epoch": 0.5171013555829411, + "grad_norm": 2.766071319580078, + "learning_rate": 8.691619678748722e-06, + "loss": 0.9993, + "step": 6399 + }, + { + "epoch": 0.5171821652962686, + "grad_norm": 2.8523569107055664, + "learning_rate": 8.69117831503095e-06, + "loss": 0.9752, + "step": 6400 + }, + { + "epoch": 0.5172629750095962, + "grad_norm": 2.809799909591675, + "learning_rate": 8.69073688809218e-06, + "loss": 0.9286, + "step": 6401 + }, + { + "epoch": 0.5173437847229237, + "grad_norm": 2.233978748321533, + "learning_rate": 8.69029539793997e-06, + "loss": 0.9414, + "step": 6402 + }, + { + "epoch": 0.5174245944362512, + "grad_norm": 2.607534646987915, + "learning_rate": 8.689853844581886e-06, + "loss": 1.0514, + "step": 6403 + }, + { + "epoch": 0.5175054041495788, + "grad_norm": 2.491774559020996, + "learning_rate": 8.689412228025487e-06, + "loss": 1.0021, + "step": 6404 + }, + { + "epoch": 0.5175862138629064, + "grad_norm": 2.586745262145996, + "learning_rate": 8.688970548278339e-06, + "loss": 1.0436, + "step": 6405 + }, + { + "epoch": 0.5176670235762338, + "grad_norm": 2.635716676712036, + "learning_rate": 8.688528805348008e-06, + "loss": 0.9233, + "step": 6406 + }, + { + "epoch": 0.5177478332895614, + "grad_norm": 2.7674312591552734, + "learning_rate": 8.688086999242056e-06, + "loss": 0.8556, + "step": 6407 + }, + { + "epoch": 0.517828643002889, + "grad_norm": 2.457615375518799, + "learning_rate": 8.687645129968054e-06, + "loss": 0.9286, + "step": 6408 + }, + { + "epoch": 0.5179094527162165, + "grad_norm": 3.614577054977417, + "learning_rate": 8.687203197533567e-06, + "loss": 0.9653, + "step": 6409 + }, + { + "epoch": 0.517990262429544, + "grad_norm": 2.7701103687286377, + "learning_rate": 8.686761201946168e-06, + "loss": 1.0364, + "step": 6410 + }, + { + "epoch": 0.5180710721428716, + "grad_norm": 2.433927297592163, + "learning_rate": 8.686319143213424e-06, + "loss": 0.9752, + "step": 6411 + }, + { + "epoch": 0.5181518818561991, + "grad_norm": 2.5998244285583496, + "learning_rate": 8.685877021342907e-06, + "loss": 0.7826, + "step": 6412 + }, + { + "epoch": 0.5182326915695267, + "grad_norm": 2.511528253555298, + "learning_rate": 8.68543483634219e-06, + "loss": 0.9163, + "step": 6413 + }, + { + "epoch": 0.5183135012828542, + "grad_norm": 2.436471462249756, + "learning_rate": 8.684992588218848e-06, + "loss": 1.1039, + "step": 6414 + }, + { + "epoch": 0.5183943109961817, + "grad_norm": 2.6319479942321777, + "learning_rate": 8.684550276980453e-06, + "loss": 1.0089, + "step": 6415 + }, + { + "epoch": 0.5184751207095093, + "grad_norm": 2.991459369659424, + "learning_rate": 8.684107902634581e-06, + "loss": 0.939, + "step": 6416 + }, + { + "epoch": 0.5185559304228369, + "grad_norm": 2.6421146392822266, + "learning_rate": 8.683665465188811e-06, + "loss": 0.9869, + "step": 6417 + }, + { + "epoch": 0.5186367401361643, + "grad_norm": 2.276886224746704, + "learning_rate": 8.683222964650721e-06, + "loss": 0.8912, + "step": 6418 + }, + { + "epoch": 0.5187175498494919, + "grad_norm": 2.6830523014068604, + "learning_rate": 8.682780401027886e-06, + "loss": 0.8998, + "step": 6419 + }, + { + "epoch": 0.5187983595628195, + "grad_norm": 3.1952884197235107, + "learning_rate": 8.68233777432789e-06, + "loss": 1.0521, + "step": 6420 + }, + { + "epoch": 0.518879169276147, + "grad_norm": 2.9305951595306396, + "learning_rate": 8.681895084558314e-06, + "loss": 0.9575, + "step": 6421 + }, + { + "epoch": 0.5189599789894745, + "grad_norm": 2.691216230392456, + "learning_rate": 8.681452331726737e-06, + "loss": 0.8598, + "step": 6422 + }, + { + "epoch": 0.5190407887028021, + "grad_norm": 2.364846706390381, + "learning_rate": 8.681009515840744e-06, + "loss": 1.0365, + "step": 6423 + }, + { + "epoch": 0.5191215984161296, + "grad_norm": 2.915999174118042, + "learning_rate": 8.680566636907922e-06, + "loss": 0.8954, + "step": 6424 + }, + { + "epoch": 0.5192024081294572, + "grad_norm": 2.4738638401031494, + "learning_rate": 8.680123694935852e-06, + "loss": 0.8674, + "step": 6425 + }, + { + "epoch": 0.5192832178427848, + "grad_norm": 2.6491429805755615, + "learning_rate": 8.679680689932123e-06, + "loss": 0.9489, + "step": 6426 + }, + { + "epoch": 0.5193640275561122, + "grad_norm": 2.5052988529205322, + "learning_rate": 8.679237621904324e-06, + "loss": 0.9889, + "step": 6427 + }, + { + "epoch": 0.5194448372694398, + "grad_norm": 2.9090888500213623, + "learning_rate": 8.678794490860039e-06, + "loss": 1.0662, + "step": 6428 + }, + { + "epoch": 0.5195256469827674, + "grad_norm": 2.6907501220703125, + "learning_rate": 8.678351296806863e-06, + "loss": 1.0126, + "step": 6429 + }, + { + "epoch": 0.5196064566960948, + "grad_norm": 3.101341962814331, + "learning_rate": 8.677908039752383e-06, + "loss": 0.9364, + "step": 6430 + }, + { + "epoch": 0.5196872664094224, + "grad_norm": 2.7768609523773193, + "learning_rate": 8.677464719704194e-06, + "loss": 0.8984, + "step": 6431 + }, + { + "epoch": 0.51976807612275, + "grad_norm": 2.6042120456695557, + "learning_rate": 8.677021336669887e-06, + "loss": 0.9307, + "step": 6432 + }, + { + "epoch": 0.5198488858360775, + "grad_norm": 3.216299295425415, + "learning_rate": 8.676577890657056e-06, + "loss": 1.0496, + "step": 6433 + }, + { + "epoch": 0.519929695549405, + "grad_norm": 2.657881736755371, + "learning_rate": 8.676134381673296e-06, + "loss": 0.9144, + "step": 6434 + }, + { + "epoch": 0.5200105052627326, + "grad_norm": 2.426283836364746, + "learning_rate": 8.675690809726206e-06, + "loss": 0.9773, + "step": 6435 + }, + { + "epoch": 0.5200913149760601, + "grad_norm": 3.0907323360443115, + "learning_rate": 8.67524717482338e-06, + "loss": 0.9152, + "step": 6436 + }, + { + "epoch": 0.5201721246893877, + "grad_norm": 2.756096839904785, + "learning_rate": 8.674803476972418e-06, + "loss": 1.0208, + "step": 6437 + }, + { + "epoch": 0.5202529344027153, + "grad_norm": 2.7864134311676025, + "learning_rate": 8.67435971618092e-06, + "loss": 1.1348, + "step": 6438 + }, + { + "epoch": 0.5203337441160427, + "grad_norm": 2.7034497261047363, + "learning_rate": 8.673915892456484e-06, + "loss": 0.9545, + "step": 6439 + }, + { + "epoch": 0.5204145538293703, + "grad_norm": 2.788236618041992, + "learning_rate": 8.673472005806715e-06, + "loss": 0.9214, + "step": 6440 + }, + { + "epoch": 0.5204953635426979, + "grad_norm": 2.6394190788269043, + "learning_rate": 8.673028056239213e-06, + "loss": 0.8938, + "step": 6441 + }, + { + "epoch": 0.5205761732560253, + "grad_norm": 2.9849536418914795, + "learning_rate": 8.672584043761583e-06, + "loss": 1.0054, + "step": 6442 + }, + { + "epoch": 0.5206569829693529, + "grad_norm": 2.5224430561065674, + "learning_rate": 8.67213996838143e-06, + "loss": 0.8606, + "step": 6443 + }, + { + "epoch": 0.5207377926826805, + "grad_norm": 2.4942100048065186, + "learning_rate": 8.67169583010636e-06, + "loss": 0.9383, + "step": 6444 + }, + { + "epoch": 0.520818602396008, + "grad_norm": 2.6450839042663574, + "learning_rate": 8.67125162894398e-06, + "loss": 1.0274, + "step": 6445 + }, + { + "epoch": 0.5208994121093355, + "grad_norm": 2.9041450023651123, + "learning_rate": 8.670807364901896e-06, + "loss": 1.1383, + "step": 6446 + }, + { + "epoch": 0.5209802218226631, + "grad_norm": 2.6695570945739746, + "learning_rate": 8.67036303798772e-06, + "loss": 0.9661, + "step": 6447 + }, + { + "epoch": 0.5210610315359906, + "grad_norm": 2.626668691635132, + "learning_rate": 8.669918648209062e-06, + "loss": 0.9429, + "step": 6448 + }, + { + "epoch": 0.5211418412493182, + "grad_norm": 3.0560965538024902, + "learning_rate": 8.66947419557353e-06, + "loss": 0.9078, + "step": 6449 + }, + { + "epoch": 0.5212226509626458, + "grad_norm": 2.9653520584106445, + "learning_rate": 8.66902968008874e-06, + "loss": 1.0656, + "step": 6450 + }, + { + "epoch": 0.5213034606759732, + "grad_norm": 2.8498342037200928, + "learning_rate": 8.668585101762305e-06, + "loss": 0.9314, + "step": 6451 + }, + { + "epoch": 0.5213842703893008, + "grad_norm": 3.286355972290039, + "learning_rate": 8.668140460601841e-06, + "loss": 1.057, + "step": 6452 + }, + { + "epoch": 0.5214650801026284, + "grad_norm": 2.399308204650879, + "learning_rate": 8.66769575661496e-06, + "loss": 0.9351, + "step": 6453 + }, + { + "epoch": 0.5215458898159558, + "grad_norm": 2.535555362701416, + "learning_rate": 8.667250989809279e-06, + "loss": 0.9142, + "step": 6454 + }, + { + "epoch": 0.5216266995292834, + "grad_norm": 2.7442376613616943, + "learning_rate": 8.666806160192419e-06, + "loss": 1.0837, + "step": 6455 + }, + { + "epoch": 0.521707509242611, + "grad_norm": 2.702324151992798, + "learning_rate": 8.666361267771994e-06, + "loss": 1.1003, + "step": 6456 + }, + { + "epoch": 0.5217883189559385, + "grad_norm": 2.9018096923828125, + "learning_rate": 8.66591631255563e-06, + "loss": 0.9353, + "step": 6457 + }, + { + "epoch": 0.521869128669266, + "grad_norm": 2.298851728439331, + "learning_rate": 8.665471294550943e-06, + "loss": 1.1122, + "step": 6458 + }, + { + "epoch": 0.5219499383825936, + "grad_norm": 2.289459705352783, + "learning_rate": 8.665026213765558e-06, + "loss": 0.929, + "step": 6459 + }, + { + "epoch": 0.5220307480959211, + "grad_norm": 3.1866111755371094, + "learning_rate": 8.664581070207098e-06, + "loss": 0.9406, + "step": 6460 + }, + { + "epoch": 0.5221115578092487, + "grad_norm": 2.3597025871276855, + "learning_rate": 8.664135863883185e-06, + "loss": 0.9293, + "step": 6461 + }, + { + "epoch": 0.5221923675225763, + "grad_norm": 2.3700859546661377, + "learning_rate": 8.663690594801446e-06, + "loss": 0.8038, + "step": 6462 + }, + { + "epoch": 0.5222731772359037, + "grad_norm": 2.5032904148101807, + "learning_rate": 8.663245262969507e-06, + "loss": 0.8848, + "step": 6463 + }, + { + "epoch": 0.5223539869492313, + "grad_norm": 2.6346776485443115, + "learning_rate": 8.662799868394995e-06, + "loss": 1.0412, + "step": 6464 + }, + { + "epoch": 0.5224347966625589, + "grad_norm": 2.7388508319854736, + "learning_rate": 8.66235441108554e-06, + "loss": 1.0035, + "step": 6465 + }, + { + "epoch": 0.5225156063758863, + "grad_norm": 3.1407368183135986, + "learning_rate": 8.66190889104877e-06, + "loss": 1.0042, + "step": 6466 + }, + { + "epoch": 0.5225964160892139, + "grad_norm": 2.942356824874878, + "learning_rate": 8.661463308292317e-06, + "loss": 0.9301, + "step": 6467 + }, + { + "epoch": 0.5226772258025415, + "grad_norm": 2.982595443725586, + "learning_rate": 8.661017662823812e-06, + "loss": 0.9718, + "step": 6468 + }, + { + "epoch": 0.522758035515869, + "grad_norm": 2.887547731399536, + "learning_rate": 8.660571954650887e-06, + "loss": 1.0011, + "step": 6469 + }, + { + "epoch": 0.5228388452291965, + "grad_norm": 2.8909153938293457, + "learning_rate": 8.660126183781179e-06, + "loss": 0.9297, + "step": 6470 + }, + { + "epoch": 0.5229196549425241, + "grad_norm": 2.794318914413452, + "learning_rate": 8.65968035022232e-06, + "loss": 1.0122, + "step": 6471 + }, + { + "epoch": 0.5230004646558516, + "grad_norm": 2.5340795516967773, + "learning_rate": 8.659234453981946e-06, + "loss": 0.9236, + "step": 6472 + }, + { + "epoch": 0.5230812743691792, + "grad_norm": 2.4756805896759033, + "learning_rate": 8.658788495067696e-06, + "loss": 0.9408, + "step": 6473 + }, + { + "epoch": 0.5231620840825068, + "grad_norm": 2.262382984161377, + "learning_rate": 8.658342473487207e-06, + "loss": 1.0647, + "step": 6474 + }, + { + "epoch": 0.5232428937958342, + "grad_norm": 2.6728365421295166, + "learning_rate": 8.657896389248117e-06, + "loss": 1.0638, + "step": 6475 + }, + { + "epoch": 0.5233237035091618, + "grad_norm": 2.5527806282043457, + "learning_rate": 8.657450242358069e-06, + "loss": 1.0011, + "step": 6476 + }, + { + "epoch": 0.5234045132224894, + "grad_norm": 2.3403241634368896, + "learning_rate": 8.657004032824705e-06, + "loss": 0.9129, + "step": 6477 + }, + { + "epoch": 0.5234853229358168, + "grad_norm": 2.7129452228546143, + "learning_rate": 8.656557760655663e-06, + "loss": 0.976, + "step": 6478 + }, + { + "epoch": 0.5235661326491444, + "grad_norm": 2.948230028152466, + "learning_rate": 8.656111425858591e-06, + "loss": 0.9973, + "step": 6479 + }, + { + "epoch": 0.523646942362472, + "grad_norm": 2.6018083095550537, + "learning_rate": 8.655665028441132e-06, + "loss": 0.8752, + "step": 6480 + }, + { + "epoch": 0.5237277520757995, + "grad_norm": 3.023979425430298, + "learning_rate": 8.655218568410931e-06, + "loss": 0.9006, + "step": 6481 + }, + { + "epoch": 0.523808561789127, + "grad_norm": 2.693397045135498, + "learning_rate": 8.654772045775636e-06, + "loss": 0.9366, + "step": 6482 + }, + { + "epoch": 0.5238893715024546, + "grad_norm": 2.627021074295044, + "learning_rate": 8.654325460542894e-06, + "loss": 1.012, + "step": 6483 + }, + { + "epoch": 0.5239701812157821, + "grad_norm": 2.4792442321777344, + "learning_rate": 8.653878812720356e-06, + "loss": 0.8224, + "step": 6484 + }, + { + "epoch": 0.5240509909291097, + "grad_norm": 2.6263723373413086, + "learning_rate": 8.65343210231567e-06, + "loss": 1.0071, + "step": 6485 + }, + { + "epoch": 0.5241318006424373, + "grad_norm": 2.6514103412628174, + "learning_rate": 8.652985329336485e-06, + "loss": 0.8563, + "step": 6486 + }, + { + "epoch": 0.5242126103557647, + "grad_norm": 2.460580348968506, + "learning_rate": 8.652538493790457e-06, + "loss": 0.9829, + "step": 6487 + }, + { + "epoch": 0.5242934200690923, + "grad_norm": 2.533325433731079, + "learning_rate": 8.652091595685238e-06, + "loss": 0.9769, + "step": 6488 + }, + { + "epoch": 0.5243742297824199, + "grad_norm": 2.8130195140838623, + "learning_rate": 8.65164463502848e-06, + "loss": 0.9436, + "step": 6489 + }, + { + "epoch": 0.5244550394957473, + "grad_norm": 2.5389840602874756, + "learning_rate": 8.651197611827842e-06, + "loss": 0.9137, + "step": 6490 + }, + { + "epoch": 0.5245358492090749, + "grad_norm": 2.9623239040374756, + "learning_rate": 8.650750526090978e-06, + "loss": 0.9418, + "step": 6491 + }, + { + "epoch": 0.5246166589224025, + "grad_norm": 2.6795060634613037, + "learning_rate": 8.650303377825549e-06, + "loss": 1.053, + "step": 6492 + }, + { + "epoch": 0.52469746863573, + "grad_norm": 2.6709554195404053, + "learning_rate": 8.649856167039208e-06, + "loss": 1.0574, + "step": 6493 + }, + { + "epoch": 0.5247782783490575, + "grad_norm": 2.550464391708374, + "learning_rate": 8.649408893739619e-06, + "loss": 1.0703, + "step": 6494 + }, + { + "epoch": 0.5248590880623851, + "grad_norm": 2.6652603149414062, + "learning_rate": 8.648961557934439e-06, + "loss": 0.8755, + "step": 6495 + }, + { + "epoch": 0.5249398977757126, + "grad_norm": 3.513270378112793, + "learning_rate": 8.648514159631333e-06, + "loss": 0.9393, + "step": 6496 + }, + { + "epoch": 0.5250207074890402, + "grad_norm": 3.112147808074951, + "learning_rate": 8.648066698837965e-06, + "loss": 1.0004, + "step": 6497 + }, + { + "epoch": 0.5251015172023678, + "grad_norm": 2.3255598545074463, + "learning_rate": 8.647619175561995e-06, + "loss": 0.9419, + "step": 6498 + }, + { + "epoch": 0.5251823269156952, + "grad_norm": 2.878896474838257, + "learning_rate": 8.64717158981109e-06, + "loss": 0.8765, + "step": 6499 + }, + { + "epoch": 0.5252631366290228, + "grad_norm": 3.013002872467041, + "learning_rate": 8.646723941592916e-06, + "loss": 1.0088, + "step": 6500 + }, + { + "epoch": 0.5253439463423504, + "grad_norm": 2.42220139503479, + "learning_rate": 8.64627623091514e-06, + "loss": 0.9786, + "step": 6501 + }, + { + "epoch": 0.5254247560556778, + "grad_norm": 2.6195785999298096, + "learning_rate": 8.64582845778543e-06, + "loss": 0.8912, + "step": 6502 + }, + { + "epoch": 0.5255055657690054, + "grad_norm": 2.3431684970855713, + "learning_rate": 8.645380622211457e-06, + "loss": 0.9507, + "step": 6503 + }, + { + "epoch": 0.525586375482333, + "grad_norm": 2.744147777557373, + "learning_rate": 8.644932724200888e-06, + "loss": 0.93, + "step": 6504 + }, + { + "epoch": 0.5256671851956605, + "grad_norm": 2.6927852630615234, + "learning_rate": 8.644484763761397e-06, + "loss": 0.9626, + "step": 6505 + }, + { + "epoch": 0.525747994908988, + "grad_norm": 3.2699437141418457, + "learning_rate": 8.644036740900657e-06, + "loss": 1.0035, + "step": 6506 + }, + { + "epoch": 0.5258288046223156, + "grad_norm": 3.2002522945404053, + "learning_rate": 8.643588655626337e-06, + "loss": 0.9647, + "step": 6507 + }, + { + "epoch": 0.5259096143356431, + "grad_norm": 3.257383108139038, + "learning_rate": 8.643140507946117e-06, + "loss": 0.9159, + "step": 6508 + }, + { + "epoch": 0.5259904240489707, + "grad_norm": 2.667975664138794, + "learning_rate": 8.642692297867672e-06, + "loss": 0.886, + "step": 6509 + }, + { + "epoch": 0.5260712337622983, + "grad_norm": 2.6128931045532227, + "learning_rate": 8.642244025398675e-06, + "loss": 0.906, + "step": 6510 + }, + { + "epoch": 0.5261520434756257, + "grad_norm": 3.0815443992614746, + "learning_rate": 8.641795690546806e-06, + "loss": 0.8116, + "step": 6511 + }, + { + "epoch": 0.5262328531889533, + "grad_norm": 2.6134774684906006, + "learning_rate": 8.641347293319746e-06, + "loss": 0.9856, + "step": 6512 + }, + { + "epoch": 0.5263136629022809, + "grad_norm": 2.595999240875244, + "learning_rate": 8.640898833725172e-06, + "loss": 0.9108, + "step": 6513 + }, + { + "epoch": 0.5263944726156083, + "grad_norm": 2.8844268321990967, + "learning_rate": 8.640450311770766e-06, + "loss": 0.9831, + "step": 6514 + }, + { + "epoch": 0.5264752823289359, + "grad_norm": 2.3618969917297363, + "learning_rate": 8.64000172746421e-06, + "loss": 0.9003, + "step": 6515 + }, + { + "epoch": 0.5265560920422635, + "grad_norm": 2.6652464866638184, + "learning_rate": 8.639553080813188e-06, + "loss": 0.8747, + "step": 6516 + }, + { + "epoch": 0.526636901755591, + "grad_norm": 2.7724783420562744, + "learning_rate": 8.639104371825383e-06, + "loss": 0.9955, + "step": 6517 + }, + { + "epoch": 0.5267177114689185, + "grad_norm": 2.8445286750793457, + "learning_rate": 8.638655600508481e-06, + "loss": 1.0135, + "step": 6518 + }, + { + "epoch": 0.5267985211822461, + "grad_norm": 2.5326647758483887, + "learning_rate": 8.63820676687017e-06, + "loss": 0.8841, + "step": 6519 + }, + { + "epoch": 0.5268793308955736, + "grad_norm": 2.761439085006714, + "learning_rate": 8.637757870918132e-06, + "loss": 0.8969, + "step": 6520 + }, + { + "epoch": 0.5269601406089012, + "grad_norm": 2.558090925216675, + "learning_rate": 8.637308912660064e-06, + "loss": 0.9825, + "step": 6521 + }, + { + "epoch": 0.5270409503222288, + "grad_norm": 2.850323438644409, + "learning_rate": 8.636859892103648e-06, + "loss": 1.1012, + "step": 6522 + }, + { + "epoch": 0.5271217600355562, + "grad_norm": 2.6106185913085938, + "learning_rate": 8.636410809256577e-06, + "loss": 0.8804, + "step": 6523 + }, + { + "epoch": 0.5272025697488838, + "grad_norm": 2.455256938934326, + "learning_rate": 8.635961664126543e-06, + "loss": 1.0573, + "step": 6524 + }, + { + "epoch": 0.5272833794622114, + "grad_norm": 2.6972479820251465, + "learning_rate": 8.63551245672124e-06, + "loss": 0.981, + "step": 6525 + }, + { + "epoch": 0.5273641891755388, + "grad_norm": 2.7601194381713867, + "learning_rate": 8.63506318704836e-06, + "loss": 0.974, + "step": 6526 + }, + { + "epoch": 0.5274449988888664, + "grad_norm": 2.5542311668395996, + "learning_rate": 8.634613855115599e-06, + "loss": 0.8569, + "step": 6527 + }, + { + "epoch": 0.527525808602194, + "grad_norm": 2.483264446258545, + "learning_rate": 8.634164460930653e-06, + "loss": 0.8624, + "step": 6528 + }, + { + "epoch": 0.5276066183155216, + "grad_norm": 2.668414831161499, + "learning_rate": 8.633715004501219e-06, + "loss": 0.9694, + "step": 6529 + }, + { + "epoch": 0.527687428028849, + "grad_norm": 2.5893137454986572, + "learning_rate": 8.633265485834993e-06, + "loss": 0.9313, + "step": 6530 + }, + { + "epoch": 0.5277682377421766, + "grad_norm": 3.162181854248047, + "learning_rate": 8.632815904939678e-06, + "loss": 0.9288, + "step": 6531 + }, + { + "epoch": 0.5278490474555042, + "grad_norm": 2.2989354133605957, + "learning_rate": 8.63236626182297e-06, + "loss": 0.8628, + "step": 6532 + }, + { + "epoch": 0.5279298571688317, + "grad_norm": 2.7202649116516113, + "learning_rate": 8.631916556492576e-06, + "loss": 1.0436, + "step": 6533 + }, + { + "epoch": 0.5280106668821593, + "grad_norm": 2.672980785369873, + "learning_rate": 8.631466788956191e-06, + "loss": 0.9771, + "step": 6534 + }, + { + "epoch": 0.5280914765954868, + "grad_norm": 2.866274833679199, + "learning_rate": 8.631016959221526e-06, + "loss": 1.0554, + "step": 6535 + }, + { + "epoch": 0.5281722863088143, + "grad_norm": 2.7136855125427246, + "learning_rate": 8.630567067296278e-06, + "loss": 0.82, + "step": 6536 + }, + { + "epoch": 0.5282530960221419, + "grad_norm": 3.0631158351898193, + "learning_rate": 8.63011711318816e-06, + "loss": 0.9494, + "step": 6537 + }, + { + "epoch": 0.5283339057354695, + "grad_norm": 2.781109094619751, + "learning_rate": 8.629667096904872e-06, + "loss": 0.9363, + "step": 6538 + }, + { + "epoch": 0.5284147154487969, + "grad_norm": 2.4523978233337402, + "learning_rate": 8.629217018454126e-06, + "loss": 0.9444, + "step": 6539 + }, + { + "epoch": 0.5284955251621245, + "grad_norm": 2.6394524574279785, + "learning_rate": 8.62876687784363e-06, + "loss": 0.9864, + "step": 6540 + }, + { + "epoch": 0.5285763348754521, + "grad_norm": 2.700120449066162, + "learning_rate": 8.628316675081092e-06, + "loss": 1.0109, + "step": 6541 + }, + { + "epoch": 0.5286571445887795, + "grad_norm": 2.723179578781128, + "learning_rate": 8.627866410174225e-06, + "loss": 0.9629, + "step": 6542 + }, + { + "epoch": 0.5287379543021071, + "grad_norm": 2.4862112998962402, + "learning_rate": 8.62741608313074e-06, + "loss": 0.9166, + "step": 6543 + }, + { + "epoch": 0.5288187640154347, + "grad_norm": 2.704634666442871, + "learning_rate": 8.62696569395835e-06, + "loss": 0.8236, + "step": 6544 + }, + { + "epoch": 0.5288995737287622, + "grad_norm": 2.9109954833984375, + "learning_rate": 8.626515242664769e-06, + "loss": 0.8608, + "step": 6545 + }, + { + "epoch": 0.5289803834420898, + "grad_norm": 2.65484356880188, + "learning_rate": 8.626064729257712e-06, + "loss": 0.9483, + "step": 6546 + }, + { + "epoch": 0.5290611931554173, + "grad_norm": 2.6971874237060547, + "learning_rate": 8.625614153744897e-06, + "loss": 0.9079, + "step": 6547 + }, + { + "epoch": 0.5291420028687448, + "grad_norm": 2.643887758255005, + "learning_rate": 8.62516351613404e-06, + "loss": 1.1361, + "step": 6548 + }, + { + "epoch": 0.5292228125820724, + "grad_norm": 2.857724905014038, + "learning_rate": 8.624712816432857e-06, + "loss": 1.0655, + "step": 6549 + }, + { + "epoch": 0.5293036222954, + "grad_norm": 2.50374436378479, + "learning_rate": 8.624262054649072e-06, + "loss": 1.0183, + "step": 6550 + }, + { + "epoch": 0.5293844320087274, + "grad_norm": 2.981233596801758, + "learning_rate": 8.623811230790402e-06, + "loss": 0.9639, + "step": 6551 + }, + { + "epoch": 0.529465241722055, + "grad_norm": 2.4913814067840576, + "learning_rate": 8.623360344864569e-06, + "loss": 1.0296, + "step": 6552 + }, + { + "epoch": 0.5295460514353826, + "grad_norm": 2.6574978828430176, + "learning_rate": 8.622909396879298e-06, + "loss": 0.9511, + "step": 6553 + }, + { + "epoch": 0.52962686114871, + "grad_norm": 2.43039608001709, + "learning_rate": 8.622458386842308e-06, + "loss": 0.9564, + "step": 6554 + }, + { + "epoch": 0.5297076708620376, + "grad_norm": 2.9565107822418213, + "learning_rate": 8.62200731476133e-06, + "loss": 0.978, + "step": 6555 + }, + { + "epoch": 0.5297884805753652, + "grad_norm": 2.835583209991455, + "learning_rate": 8.621556180644083e-06, + "loss": 0.8722, + "step": 6556 + }, + { + "epoch": 0.5298692902886927, + "grad_norm": 2.6030168533325195, + "learning_rate": 8.621104984498299e-06, + "loss": 0.9509, + "step": 6557 + }, + { + "epoch": 0.5299501000020203, + "grad_norm": 2.6257119178771973, + "learning_rate": 8.620653726331703e-06, + "loss": 0.9394, + "step": 6558 + }, + { + "epoch": 0.5300309097153478, + "grad_norm": 2.4450864791870117, + "learning_rate": 8.620202406152027e-06, + "loss": 0.9802, + "step": 6559 + }, + { + "epoch": 0.5301117194286753, + "grad_norm": 2.6981029510498047, + "learning_rate": 8.619751023966996e-06, + "loss": 0.9408, + "step": 6560 + }, + { + "epoch": 0.5301925291420029, + "grad_norm": 2.702737331390381, + "learning_rate": 8.619299579784347e-06, + "loss": 0.8579, + "step": 6561 + }, + { + "epoch": 0.5302733388553305, + "grad_norm": 2.8359806537628174, + "learning_rate": 8.618848073611807e-06, + "loss": 1.0747, + "step": 6562 + }, + { + "epoch": 0.5303541485686579, + "grad_norm": 2.9311721324920654, + "learning_rate": 8.618396505457114e-06, + "loss": 0.9953, + "step": 6563 + }, + { + "epoch": 0.5304349582819855, + "grad_norm": 2.3743112087249756, + "learning_rate": 8.617944875327998e-06, + "loss": 0.9683, + "step": 6564 + }, + { + "epoch": 0.5305157679953131, + "grad_norm": 2.845492124557495, + "learning_rate": 8.617493183232198e-06, + "loss": 0.9726, + "step": 6565 + }, + { + "epoch": 0.5305965777086405, + "grad_norm": 2.371586799621582, + "learning_rate": 8.617041429177447e-06, + "loss": 0.9463, + "step": 6566 + }, + { + "epoch": 0.5306773874219681, + "grad_norm": 2.7971203327178955, + "learning_rate": 8.616589613171482e-06, + "loss": 0.9046, + "step": 6567 + }, + { + "epoch": 0.5307581971352957, + "grad_norm": 2.6262612342834473, + "learning_rate": 8.616137735222047e-06, + "loss": 1.0292, + "step": 6568 + }, + { + "epoch": 0.5308390068486232, + "grad_norm": 2.373046875, + "learning_rate": 8.615685795336877e-06, + "loss": 0.9418, + "step": 6569 + }, + { + "epoch": 0.5309198165619508, + "grad_norm": 2.4569997787475586, + "learning_rate": 8.615233793523713e-06, + "loss": 0.9875, + "step": 6570 + }, + { + "epoch": 0.5310006262752783, + "grad_norm": 2.4921178817749023, + "learning_rate": 8.614781729790298e-06, + "loss": 0.8933, + "step": 6571 + }, + { + "epoch": 0.5310814359886058, + "grad_norm": 2.863889694213867, + "learning_rate": 8.614329604144373e-06, + "loss": 0.9791, + "step": 6572 + }, + { + "epoch": 0.5311622457019334, + "grad_norm": 2.38996958732605, + "learning_rate": 8.613877416593686e-06, + "loss": 0.9147, + "step": 6573 + }, + { + "epoch": 0.531243055415261, + "grad_norm": 2.6225709915161133, + "learning_rate": 8.613425167145977e-06, + "loss": 0.9874, + "step": 6574 + }, + { + "epoch": 0.5313238651285884, + "grad_norm": 2.7070398330688477, + "learning_rate": 8.612972855808993e-06, + "loss": 1.0344, + "step": 6575 + }, + { + "epoch": 0.531404674841916, + "grad_norm": 2.5063893795013428, + "learning_rate": 8.612520482590483e-06, + "loss": 1.0081, + "step": 6576 + }, + { + "epoch": 0.5314854845552436, + "grad_norm": 2.8511345386505127, + "learning_rate": 8.612068047498195e-06, + "loss": 0.9409, + "step": 6577 + }, + { + "epoch": 0.531566294268571, + "grad_norm": 2.7853493690490723, + "learning_rate": 8.611615550539874e-06, + "loss": 0.8633, + "step": 6578 + }, + { + "epoch": 0.5316471039818986, + "grad_norm": 2.6691391468048096, + "learning_rate": 8.611162991723277e-06, + "loss": 0.9804, + "step": 6579 + }, + { + "epoch": 0.5317279136952262, + "grad_norm": 2.67984676361084, + "learning_rate": 8.610710371056148e-06, + "loss": 0.8895, + "step": 6580 + }, + { + "epoch": 0.5318087234085537, + "grad_norm": 3.4826438426971436, + "learning_rate": 8.610257688546244e-06, + "loss": 1.0228, + "step": 6581 + }, + { + "epoch": 0.5318895331218813, + "grad_norm": 2.6742348670959473, + "learning_rate": 8.609804944201319e-06, + "loss": 0.951, + "step": 6582 + }, + { + "epoch": 0.5319703428352088, + "grad_norm": 2.6604034900665283, + "learning_rate": 8.609352138029123e-06, + "loss": 0.9672, + "step": 6583 + }, + { + "epoch": 0.5320511525485363, + "grad_norm": 3.0618724822998047, + "learning_rate": 8.608899270037414e-06, + "loss": 0.9743, + "step": 6584 + }, + { + "epoch": 0.5321319622618639, + "grad_norm": 2.5103518962860107, + "learning_rate": 8.608446340233951e-06, + "loss": 0.9295, + "step": 6585 + }, + { + "epoch": 0.5322127719751915, + "grad_norm": 2.4761383533477783, + "learning_rate": 8.607993348626486e-06, + "loss": 1.021, + "step": 6586 + }, + { + "epoch": 0.5322935816885189, + "grad_norm": 2.9981186389923096, + "learning_rate": 8.607540295222784e-06, + "loss": 0.9241, + "step": 6587 + }, + { + "epoch": 0.5323743914018465, + "grad_norm": 2.822262763977051, + "learning_rate": 8.6070871800306e-06, + "loss": 0.865, + "step": 6588 + }, + { + "epoch": 0.5324552011151741, + "grad_norm": 2.8145699501037598, + "learning_rate": 8.606634003057697e-06, + "loss": 0.9334, + "step": 6589 + }, + { + "epoch": 0.5325360108285015, + "grad_norm": 2.835066318511963, + "learning_rate": 8.606180764311836e-06, + "loss": 0.9147, + "step": 6590 + }, + { + "epoch": 0.5326168205418291, + "grad_norm": 2.5754809379577637, + "learning_rate": 8.60572746380078e-06, + "loss": 0.9673, + "step": 6591 + }, + { + "epoch": 0.5326976302551567, + "grad_norm": 2.8904290199279785, + "learning_rate": 8.60527410153229e-06, + "loss": 0.842, + "step": 6592 + }, + { + "epoch": 0.5327784399684842, + "grad_norm": 2.563721179962158, + "learning_rate": 8.604820677514139e-06, + "loss": 0.7603, + "step": 6593 + }, + { + "epoch": 0.5328592496818118, + "grad_norm": 2.49072003364563, + "learning_rate": 8.604367191754083e-06, + "loss": 0.9352, + "step": 6594 + }, + { + "epoch": 0.5329400593951393, + "grad_norm": 2.5579614639282227, + "learning_rate": 8.603913644259898e-06, + "loss": 1.1044, + "step": 6595 + }, + { + "epoch": 0.5330208691084668, + "grad_norm": 2.7474677562713623, + "learning_rate": 8.603460035039348e-06, + "loss": 1.0202, + "step": 6596 + }, + { + "epoch": 0.5331016788217944, + "grad_norm": 2.919316291809082, + "learning_rate": 8.603006364100201e-06, + "loss": 0.9191, + "step": 6597 + }, + { + "epoch": 0.533182488535122, + "grad_norm": 2.361276865005493, + "learning_rate": 8.60255263145023e-06, + "loss": 1.0616, + "step": 6598 + }, + { + "epoch": 0.5332632982484494, + "grad_norm": 2.8892202377319336, + "learning_rate": 8.602098837097203e-06, + "loss": 0.9934, + "step": 6599 + }, + { + "epoch": 0.533344107961777, + "grad_norm": 2.831618309020996, + "learning_rate": 8.601644981048897e-06, + "loss": 0.968, + "step": 6600 + }, + { + "epoch": 0.5334249176751046, + "grad_norm": 2.7197771072387695, + "learning_rate": 8.601191063313084e-06, + "loss": 1.0363, + "step": 6601 + }, + { + "epoch": 0.533505727388432, + "grad_norm": 2.8328351974487305, + "learning_rate": 8.600737083897534e-06, + "loss": 0.9337, + "step": 6602 + }, + { + "epoch": 0.5335865371017596, + "grad_norm": 2.942678928375244, + "learning_rate": 8.60028304281003e-06, + "loss": 0.9096, + "step": 6603 + }, + { + "epoch": 0.5336673468150872, + "grad_norm": 2.599435806274414, + "learning_rate": 8.599828940058343e-06, + "loss": 0.9754, + "step": 6604 + }, + { + "epoch": 0.5337481565284147, + "grad_norm": 2.5882532596588135, + "learning_rate": 8.599374775650252e-06, + "loss": 1.0987, + "step": 6605 + }, + { + "epoch": 0.5338289662417423, + "grad_norm": 3.2290847301483154, + "learning_rate": 8.598920549593536e-06, + "loss": 0.8644, + "step": 6606 + }, + { + "epoch": 0.5339097759550698, + "grad_norm": 2.351295232772827, + "learning_rate": 8.598466261895976e-06, + "loss": 0.9477, + "step": 6607 + }, + { + "epoch": 0.5339905856683973, + "grad_norm": 2.7535810470581055, + "learning_rate": 8.598011912565352e-06, + "loss": 1.0735, + "step": 6608 + }, + { + "epoch": 0.5340713953817249, + "grad_norm": 2.615550994873047, + "learning_rate": 8.597557501609447e-06, + "loss": 0.993, + "step": 6609 + }, + { + "epoch": 0.5341522050950525, + "grad_norm": 2.746276617050171, + "learning_rate": 8.59710302903604e-06, + "loss": 0.919, + "step": 6610 + }, + { + "epoch": 0.5342330148083799, + "grad_norm": 2.752744674682617, + "learning_rate": 8.596648494852919e-06, + "loss": 0.9607, + "step": 6611 + }, + { + "epoch": 0.5343138245217075, + "grad_norm": 2.741203546524048, + "learning_rate": 8.596193899067868e-06, + "loss": 0.926, + "step": 6612 + }, + { + "epoch": 0.5343946342350351, + "grad_norm": 2.859609365463257, + "learning_rate": 8.595739241688673e-06, + "loss": 0.887, + "step": 6613 + }, + { + "epoch": 0.5344754439483625, + "grad_norm": 2.8038549423217773, + "learning_rate": 8.59528452272312e-06, + "loss": 0.8971, + "step": 6614 + }, + { + "epoch": 0.5345562536616901, + "grad_norm": 3.0580995082855225, + "learning_rate": 8.594829742179e-06, + "loss": 1.0158, + "step": 6615 + }, + { + "epoch": 0.5346370633750177, + "grad_norm": 2.649477243423462, + "learning_rate": 8.5943749000641e-06, + "loss": 1.0207, + "step": 6616 + }, + { + "epoch": 0.5347178730883452, + "grad_norm": 3.593618869781494, + "learning_rate": 8.593919996386212e-06, + "loss": 0.9654, + "step": 6617 + }, + { + "epoch": 0.5347986828016728, + "grad_norm": 2.7303850650787354, + "learning_rate": 8.593465031153126e-06, + "loss": 0.8854, + "step": 6618 + }, + { + "epoch": 0.5348794925150003, + "grad_norm": 2.9342663288116455, + "learning_rate": 8.593010004372634e-06, + "loss": 0.8591, + "step": 6619 + }, + { + "epoch": 0.5349603022283278, + "grad_norm": 3.488084554672241, + "learning_rate": 8.592554916052531e-06, + "loss": 0.9059, + "step": 6620 + }, + { + "epoch": 0.5350411119416554, + "grad_norm": 2.514772653579712, + "learning_rate": 8.592099766200613e-06, + "loss": 0.9205, + "step": 6621 + }, + { + "epoch": 0.535121921654983, + "grad_norm": 3.1306204795837402, + "learning_rate": 8.59164455482467e-06, + "loss": 0.9401, + "step": 6622 + }, + { + "epoch": 0.5352027313683104, + "grad_norm": 2.5622668266296387, + "learning_rate": 8.591189281932504e-06, + "loss": 0.8657, + "step": 6623 + }, + { + "epoch": 0.535283541081638, + "grad_norm": 2.785163402557373, + "learning_rate": 8.590733947531911e-06, + "loss": 0.9722, + "step": 6624 + }, + { + "epoch": 0.5353643507949656, + "grad_norm": 2.698099374771118, + "learning_rate": 8.590278551630691e-06, + "loss": 1.0265, + "step": 6625 + }, + { + "epoch": 0.535445160508293, + "grad_norm": 2.6029014587402344, + "learning_rate": 8.589823094236642e-06, + "loss": 0.9085, + "step": 6626 + }, + { + "epoch": 0.5355259702216206, + "grad_norm": 3.1489837169647217, + "learning_rate": 8.589367575357564e-06, + "loss": 0.9826, + "step": 6627 + }, + { + "epoch": 0.5356067799349482, + "grad_norm": 3.264782190322876, + "learning_rate": 8.588911995001262e-06, + "loss": 0.9107, + "step": 6628 + }, + { + "epoch": 0.5356875896482757, + "grad_norm": 2.4482884407043457, + "learning_rate": 8.588456353175536e-06, + "loss": 1.0412, + "step": 6629 + }, + { + "epoch": 0.5357683993616033, + "grad_norm": 2.366326093673706, + "learning_rate": 8.588000649888194e-06, + "loss": 0.926, + "step": 6630 + }, + { + "epoch": 0.5358492090749308, + "grad_norm": 3.108952760696411, + "learning_rate": 8.587544885147037e-06, + "loss": 0.9872, + "step": 6631 + }, + { + "epoch": 0.5359300187882583, + "grad_norm": 2.9690911769866943, + "learning_rate": 8.587089058959872e-06, + "loss": 0.9808, + "step": 6632 + }, + { + "epoch": 0.5360108285015859, + "grad_norm": 2.4989426136016846, + "learning_rate": 8.586633171334508e-06, + "loss": 0.9387, + "step": 6633 + }, + { + "epoch": 0.5360916382149135, + "grad_norm": 2.7757833003997803, + "learning_rate": 8.586177222278753e-06, + "loss": 0.9552, + "step": 6634 + }, + { + "epoch": 0.5361724479282409, + "grad_norm": 2.7151308059692383, + "learning_rate": 8.585721211800415e-06, + "loss": 0.8695, + "step": 6635 + }, + { + "epoch": 0.5362532576415685, + "grad_norm": 2.7193984985351562, + "learning_rate": 8.585265139907303e-06, + "loss": 1.0022, + "step": 6636 + }, + { + "epoch": 0.5363340673548961, + "grad_norm": 2.4473953247070312, + "learning_rate": 8.584809006607234e-06, + "loss": 0.9212, + "step": 6637 + }, + { + "epoch": 0.5364148770682236, + "grad_norm": 2.8431286811828613, + "learning_rate": 8.584352811908015e-06, + "loss": 0.9203, + "step": 6638 + }, + { + "epoch": 0.5364956867815511, + "grad_norm": 3.207179546356201, + "learning_rate": 8.58389655581746e-06, + "loss": 0.9965, + "step": 6639 + }, + { + "epoch": 0.5365764964948787, + "grad_norm": 2.723459005355835, + "learning_rate": 8.583440238343385e-06, + "loss": 0.9286, + "step": 6640 + }, + { + "epoch": 0.5366573062082062, + "grad_norm": 2.376145601272583, + "learning_rate": 8.582983859493607e-06, + "loss": 0.9126, + "step": 6641 + }, + { + "epoch": 0.5367381159215338, + "grad_norm": 2.7797765731811523, + "learning_rate": 8.58252741927594e-06, + "loss": 0.9176, + "step": 6642 + }, + { + "epoch": 0.5368189256348613, + "grad_norm": 2.5153796672821045, + "learning_rate": 8.582070917698204e-06, + "loss": 0.9259, + "step": 6643 + }, + { + "epoch": 0.5368997353481888, + "grad_norm": 2.734215021133423, + "learning_rate": 8.581614354768218e-06, + "loss": 0.9787, + "step": 6644 + }, + { + "epoch": 0.5369805450615164, + "grad_norm": 2.379599094390869, + "learning_rate": 8.581157730493798e-06, + "loss": 0.983, + "step": 6645 + }, + { + "epoch": 0.537061354774844, + "grad_norm": 2.7251689434051514, + "learning_rate": 8.580701044882768e-06, + "loss": 0.8952, + "step": 6646 + }, + { + "epoch": 0.5371421644881714, + "grad_norm": 2.262371063232422, + "learning_rate": 8.58024429794295e-06, + "loss": 1.1348, + "step": 6647 + }, + { + "epoch": 0.537222974201499, + "grad_norm": 2.9627561569213867, + "learning_rate": 8.579787489682166e-06, + "loss": 0.9859, + "step": 6648 + }, + { + "epoch": 0.5373037839148266, + "grad_norm": 2.3098561763763428, + "learning_rate": 8.57933062010824e-06, + "loss": 1.0214, + "step": 6649 + }, + { + "epoch": 0.537384593628154, + "grad_norm": 2.6136844158172607, + "learning_rate": 8.578873689228996e-06, + "loss": 1.0146, + "step": 6650 + }, + { + "epoch": 0.5374654033414816, + "grad_norm": 2.760162353515625, + "learning_rate": 8.578416697052263e-06, + "loss": 0.9337, + "step": 6651 + }, + { + "epoch": 0.5375462130548092, + "grad_norm": 2.3018577098846436, + "learning_rate": 8.577959643585867e-06, + "loss": 1.0456, + "step": 6652 + }, + { + "epoch": 0.5376270227681367, + "grad_norm": 3.419112205505371, + "learning_rate": 8.577502528837634e-06, + "loss": 1.0884, + "step": 6653 + }, + { + "epoch": 0.5377078324814643, + "grad_norm": 2.4860055446624756, + "learning_rate": 8.577045352815397e-06, + "loss": 1.0344, + "step": 6654 + }, + { + "epoch": 0.5377886421947918, + "grad_norm": 2.4065463542938232, + "learning_rate": 8.576588115526985e-06, + "loss": 1.0467, + "step": 6655 + }, + { + "epoch": 0.5378694519081194, + "grad_norm": 2.653153419494629, + "learning_rate": 8.576130816980226e-06, + "loss": 0.991, + "step": 6656 + }, + { + "epoch": 0.5379502616214469, + "grad_norm": 2.1766774654388428, + "learning_rate": 8.575673457182958e-06, + "loss": 0.9505, + "step": 6657 + }, + { + "epoch": 0.5380310713347745, + "grad_norm": 2.454543113708496, + "learning_rate": 8.57521603614301e-06, + "loss": 0.9752, + "step": 6658 + }, + { + "epoch": 0.538111881048102, + "grad_norm": 3.144435405731201, + "learning_rate": 8.57475855386822e-06, + "loss": 0.926, + "step": 6659 + }, + { + "epoch": 0.5381926907614295, + "grad_norm": 2.16461181640625, + "learning_rate": 8.57430101036642e-06, + "loss": 1.0754, + "step": 6660 + }, + { + "epoch": 0.5382735004747571, + "grad_norm": 3.0113322734832764, + "learning_rate": 8.57384340564545e-06, + "loss": 1.0134, + "step": 6661 + }, + { + "epoch": 0.5383543101880847, + "grad_norm": 2.7249655723571777, + "learning_rate": 8.573385739713147e-06, + "loss": 0.9509, + "step": 6662 + }, + { + "epoch": 0.5384351199014121, + "grad_norm": 2.586225986480713, + "learning_rate": 8.572928012577347e-06, + "loss": 1.0151, + "step": 6663 + }, + { + "epoch": 0.5385159296147397, + "grad_norm": 2.8383288383483887, + "learning_rate": 8.572470224245892e-06, + "loss": 0.9979, + "step": 6664 + }, + { + "epoch": 0.5385967393280673, + "grad_norm": 2.82600998878479, + "learning_rate": 8.572012374726623e-06, + "loss": 0.9417, + "step": 6665 + }, + { + "epoch": 0.5386775490413948, + "grad_norm": 2.528249740600586, + "learning_rate": 8.571554464027381e-06, + "loss": 0.9845, + "step": 6666 + }, + { + "epoch": 0.5387583587547223, + "grad_norm": 3.2860031127929688, + "learning_rate": 8.571096492156008e-06, + "loss": 1.0213, + "step": 6667 + }, + { + "epoch": 0.5388391684680499, + "grad_norm": 2.217886447906494, + "learning_rate": 8.570638459120351e-06, + "loss": 1.0046, + "step": 6668 + }, + { + "epoch": 0.5389199781813774, + "grad_norm": 3.0639541149139404, + "learning_rate": 8.570180364928252e-06, + "loss": 0.9779, + "step": 6669 + }, + { + "epoch": 0.539000787894705, + "grad_norm": 2.952514171600342, + "learning_rate": 8.56972220958756e-06, + "loss": 1.0254, + "step": 6670 + }, + { + "epoch": 0.5390815976080325, + "grad_norm": 2.5589892864227295, + "learning_rate": 8.569263993106118e-06, + "loss": 1.0359, + "step": 6671 + }, + { + "epoch": 0.53916240732136, + "grad_norm": 2.618180751800537, + "learning_rate": 8.568805715491777e-06, + "loss": 0.8866, + "step": 6672 + }, + { + "epoch": 0.5392432170346876, + "grad_norm": 2.5594685077667236, + "learning_rate": 8.568347376752387e-06, + "loss": 0.9899, + "step": 6673 + }, + { + "epoch": 0.5393240267480152, + "grad_norm": 2.4848663806915283, + "learning_rate": 8.567888976895795e-06, + "loss": 0.9149, + "step": 6674 + }, + { + "epoch": 0.5394048364613426, + "grad_norm": 2.714613199234009, + "learning_rate": 8.567430515929856e-06, + "loss": 0.9139, + "step": 6675 + }, + { + "epoch": 0.5394856461746702, + "grad_norm": 2.9260153770446777, + "learning_rate": 8.566971993862419e-06, + "loss": 0.9209, + "step": 6676 + }, + { + "epoch": 0.5395664558879978, + "grad_norm": 2.8303847312927246, + "learning_rate": 8.566513410701338e-06, + "loss": 0.8885, + "step": 6677 + }, + { + "epoch": 0.5396472656013253, + "grad_norm": 3.4838356971740723, + "learning_rate": 8.566054766454471e-06, + "loss": 0.9771, + "step": 6678 + }, + { + "epoch": 0.5397280753146528, + "grad_norm": 2.4133317470550537, + "learning_rate": 8.565596061129669e-06, + "loss": 0.8819, + "step": 6679 + }, + { + "epoch": 0.5398088850279804, + "grad_norm": 2.5783698558807373, + "learning_rate": 8.56513729473479e-06, + "loss": 0.9464, + "step": 6680 + }, + { + "epoch": 0.5398896947413079, + "grad_norm": 2.557776689529419, + "learning_rate": 8.564678467277693e-06, + "loss": 0.9017, + "step": 6681 + }, + { + "epoch": 0.5399705044546355, + "grad_norm": 2.467772960662842, + "learning_rate": 8.564219578766236e-06, + "loss": 1.0832, + "step": 6682 + }, + { + "epoch": 0.540051314167963, + "grad_norm": 3.1684677600860596, + "learning_rate": 8.563760629208277e-06, + "loss": 0.9311, + "step": 6683 + }, + { + "epoch": 0.5401321238812905, + "grad_norm": 3.209038019180298, + "learning_rate": 8.563301618611678e-06, + "loss": 0.8623, + "step": 6684 + }, + { + "epoch": 0.5402129335946181, + "grad_norm": 2.0523617267608643, + "learning_rate": 8.562842546984301e-06, + "loss": 1.0347, + "step": 6685 + }, + { + "epoch": 0.5402937433079457, + "grad_norm": 2.662071943283081, + "learning_rate": 8.562383414334006e-06, + "loss": 0.955, + "step": 6686 + }, + { + "epoch": 0.5403745530212731, + "grad_norm": 2.708681106567383, + "learning_rate": 8.561924220668663e-06, + "loss": 0.968, + "step": 6687 + }, + { + "epoch": 0.5404553627346007, + "grad_norm": 2.951434850692749, + "learning_rate": 8.561464965996132e-06, + "loss": 1.0313, + "step": 6688 + }, + { + "epoch": 0.5405361724479283, + "grad_norm": 2.426435708999634, + "learning_rate": 8.561005650324277e-06, + "loss": 0.971, + "step": 6689 + }, + { + "epoch": 0.5406169821612558, + "grad_norm": 2.5620830059051514, + "learning_rate": 8.56054627366097e-06, + "loss": 0.9397, + "step": 6690 + }, + { + "epoch": 0.5406977918745833, + "grad_norm": 3.1869468688964844, + "learning_rate": 8.560086836014078e-06, + "loss": 0.9638, + "step": 6691 + }, + { + "epoch": 0.5407786015879109, + "grad_norm": 2.8679094314575195, + "learning_rate": 8.559627337391469e-06, + "loss": 0.7878, + "step": 6692 + }, + { + "epoch": 0.5408594113012384, + "grad_norm": 2.715805768966675, + "learning_rate": 8.559167777801012e-06, + "loss": 0.8415, + "step": 6693 + }, + { + "epoch": 0.540940221014566, + "grad_norm": 2.559495687484741, + "learning_rate": 8.55870815725058e-06, + "loss": 0.8802, + "step": 6694 + }, + { + "epoch": 0.5410210307278935, + "grad_norm": 2.8612630367279053, + "learning_rate": 8.558248475748044e-06, + "loss": 0.8723, + "step": 6695 + }, + { + "epoch": 0.541101840441221, + "grad_norm": 2.882262945175171, + "learning_rate": 8.557788733301278e-06, + "loss": 0.9767, + "step": 6696 + }, + { + "epoch": 0.5411826501545486, + "grad_norm": 2.5127177238464355, + "learning_rate": 8.557328929918155e-06, + "loss": 1.0426, + "step": 6697 + }, + { + "epoch": 0.5412634598678762, + "grad_norm": 2.405726432800293, + "learning_rate": 8.556869065606554e-06, + "loss": 0.8544, + "step": 6698 + }, + { + "epoch": 0.5413442695812036, + "grad_norm": 2.8429133892059326, + "learning_rate": 8.556409140374347e-06, + "loss": 1.0006, + "step": 6699 + }, + { + "epoch": 0.5414250792945312, + "grad_norm": 3.339569091796875, + "learning_rate": 8.555949154229412e-06, + "loss": 0.9891, + "step": 6700 + }, + { + "epoch": 0.5415058890078588, + "grad_norm": 2.438951015472412, + "learning_rate": 8.55548910717963e-06, + "loss": 0.9815, + "step": 6701 + }, + { + "epoch": 0.5415866987211863, + "grad_norm": 2.578977108001709, + "learning_rate": 8.555028999232879e-06, + "loss": 1.0372, + "step": 6702 + }, + { + "epoch": 0.5416675084345138, + "grad_norm": 2.533268451690674, + "learning_rate": 8.554568830397039e-06, + "loss": 0.8836, + "step": 6703 + }, + { + "epoch": 0.5417483181478414, + "grad_norm": 2.6665029525756836, + "learning_rate": 8.554108600679993e-06, + "loss": 1.1245, + "step": 6704 + }, + { + "epoch": 0.5418291278611689, + "grad_norm": 2.560865879058838, + "learning_rate": 8.553648310089624e-06, + "loss": 0.9025, + "step": 6705 + }, + { + "epoch": 0.5419099375744965, + "grad_norm": 2.8905866146087646, + "learning_rate": 8.553187958633813e-06, + "loss": 0.9169, + "step": 6706 + }, + { + "epoch": 0.541990747287824, + "grad_norm": 2.840790271759033, + "learning_rate": 8.552727546320444e-06, + "loss": 0.9517, + "step": 6707 + }, + { + "epoch": 0.5420715570011515, + "grad_norm": 2.5757713317871094, + "learning_rate": 8.552267073157408e-06, + "loss": 1.091, + "step": 6708 + }, + { + "epoch": 0.5421523667144791, + "grad_norm": 2.7778074741363525, + "learning_rate": 8.55180653915259e-06, + "loss": 0.7855, + "step": 6709 + }, + { + "epoch": 0.5422331764278067, + "grad_norm": 2.9378228187561035, + "learning_rate": 8.551345944313874e-06, + "loss": 0.9325, + "step": 6710 + }, + { + "epoch": 0.5423139861411341, + "grad_norm": 2.6456212997436523, + "learning_rate": 8.55088528864915e-06, + "loss": 1.0049, + "step": 6711 + }, + { + "epoch": 0.5423947958544617, + "grad_norm": 2.3709723949432373, + "learning_rate": 8.550424572166313e-06, + "loss": 0.8832, + "step": 6712 + }, + { + "epoch": 0.5424756055677893, + "grad_norm": 2.709120750427246, + "learning_rate": 8.54996379487325e-06, + "loss": 0.9989, + "step": 6713 + }, + { + "epoch": 0.5425564152811168, + "grad_norm": 2.762544870376587, + "learning_rate": 8.549502956777852e-06, + "loss": 0.8828, + "step": 6714 + }, + { + "epoch": 0.5426372249944443, + "grad_norm": 2.7496256828308105, + "learning_rate": 8.549042057888013e-06, + "loss": 1.0145, + "step": 6715 + }, + { + "epoch": 0.5427180347077719, + "grad_norm": 2.6640915870666504, + "learning_rate": 8.548581098211628e-06, + "loss": 0.8594, + "step": 6716 + }, + { + "epoch": 0.5427988444210994, + "grad_norm": 2.965928554534912, + "learning_rate": 8.548120077756593e-06, + "loss": 0.9964, + "step": 6717 + }, + { + "epoch": 0.542879654134427, + "grad_norm": 2.6074132919311523, + "learning_rate": 8.547658996530801e-06, + "loss": 0.9174, + "step": 6718 + }, + { + "epoch": 0.5429604638477545, + "grad_norm": 2.2748515605926514, + "learning_rate": 8.547197854542154e-06, + "loss": 1.0433, + "step": 6719 + }, + { + "epoch": 0.543041273561082, + "grad_norm": 3.151670455932617, + "learning_rate": 8.546736651798544e-06, + "loss": 0.9248, + "step": 6720 + }, + { + "epoch": 0.5431220832744096, + "grad_norm": 2.9950156211853027, + "learning_rate": 8.546275388307874e-06, + "loss": 0.8626, + "step": 6721 + }, + { + "epoch": 0.5432028929877372, + "grad_norm": 2.853771686553955, + "learning_rate": 8.545814064078045e-06, + "loss": 0.9385, + "step": 6722 + }, + { + "epoch": 0.5432837027010646, + "grad_norm": 2.752655506134033, + "learning_rate": 8.545352679116956e-06, + "loss": 0.9427, + "step": 6723 + }, + { + "epoch": 0.5433645124143922, + "grad_norm": 2.7993664741516113, + "learning_rate": 8.544891233432513e-06, + "loss": 1.0047, + "step": 6724 + }, + { + "epoch": 0.5434453221277198, + "grad_norm": 2.392876386642456, + "learning_rate": 8.544429727032616e-06, + "loss": 0.8839, + "step": 6725 + }, + { + "epoch": 0.5435261318410473, + "grad_norm": 2.535710334777832, + "learning_rate": 8.54396815992517e-06, + "loss": 0.9808, + "step": 6726 + }, + { + "epoch": 0.5436069415543748, + "grad_norm": 2.690338611602783, + "learning_rate": 8.543506532118081e-06, + "loss": 0.9425, + "step": 6727 + }, + { + "epoch": 0.5436877512677024, + "grad_norm": 2.519347667694092, + "learning_rate": 8.543044843619255e-06, + "loss": 1.0035, + "step": 6728 + }, + { + "epoch": 0.5437685609810299, + "grad_norm": 3.0266170501708984, + "learning_rate": 8.542583094436603e-06, + "loss": 1.0533, + "step": 6729 + }, + { + "epoch": 0.5438493706943575, + "grad_norm": 2.4525606632232666, + "learning_rate": 8.542121284578028e-06, + "loss": 0.8922, + "step": 6730 + }, + { + "epoch": 0.543930180407685, + "grad_norm": 2.5058019161224365, + "learning_rate": 8.541659414051446e-06, + "loss": 0.9306, + "step": 6731 + }, + { + "epoch": 0.5440109901210125, + "grad_norm": 2.8877944946289062, + "learning_rate": 8.541197482864763e-06, + "loss": 0.8953, + "step": 6732 + }, + { + "epoch": 0.5440917998343401, + "grad_norm": 3.1442575454711914, + "learning_rate": 8.54073549102589e-06, + "loss": 0.9758, + "step": 6733 + }, + { + "epoch": 0.5441726095476677, + "grad_norm": 2.724261522293091, + "learning_rate": 8.540273438542746e-06, + "loss": 1.1241, + "step": 6734 + }, + { + "epoch": 0.5442534192609951, + "grad_norm": 2.991394519805908, + "learning_rate": 8.539811325423237e-06, + "loss": 1.0696, + "step": 6735 + }, + { + "epoch": 0.5443342289743227, + "grad_norm": 2.69022274017334, + "learning_rate": 8.539349151675285e-06, + "loss": 1.0104, + "step": 6736 + }, + { + "epoch": 0.5444150386876503, + "grad_norm": 2.8307008743286133, + "learning_rate": 8.5388869173068e-06, + "loss": 1.0641, + "step": 6737 + }, + { + "epoch": 0.5444958484009778, + "grad_norm": 2.664032459259033, + "learning_rate": 8.538424622325705e-06, + "loss": 0.9752, + "step": 6738 + }, + { + "epoch": 0.5445766581143053, + "grad_norm": 2.362687110900879, + "learning_rate": 8.537962266739912e-06, + "loss": 0.9436, + "step": 6739 + }, + { + "epoch": 0.5446574678276329, + "grad_norm": 2.7971317768096924, + "learning_rate": 8.537499850557343e-06, + "loss": 0.9524, + "step": 6740 + }, + { + "epoch": 0.5447382775409604, + "grad_norm": 2.574246644973755, + "learning_rate": 8.537037373785917e-06, + "loss": 0.9919, + "step": 6741 + }, + { + "epoch": 0.544819087254288, + "grad_norm": 2.4597177505493164, + "learning_rate": 8.536574836433558e-06, + "loss": 1.008, + "step": 6742 + }, + { + "epoch": 0.5448998969676155, + "grad_norm": 2.4348719120025635, + "learning_rate": 8.536112238508183e-06, + "loss": 1.0923, + "step": 6743 + }, + { + "epoch": 0.544980706680943, + "grad_norm": 2.532266855239868, + "learning_rate": 8.53564958001772e-06, + "loss": 0.9468, + "step": 6744 + }, + { + "epoch": 0.5450615163942706, + "grad_norm": 2.626479387283325, + "learning_rate": 8.535186860970093e-06, + "loss": 0.9075, + "step": 6745 + }, + { + "epoch": 0.5451423261075982, + "grad_norm": 2.474057674407959, + "learning_rate": 8.534724081373224e-06, + "loss": 0.9538, + "step": 6746 + }, + { + "epoch": 0.5452231358209256, + "grad_norm": 2.630878210067749, + "learning_rate": 8.53426124123504e-06, + "loss": 0.8873, + "step": 6747 + }, + { + "epoch": 0.5453039455342532, + "grad_norm": 2.385979413986206, + "learning_rate": 8.53379834056347e-06, + "loss": 0.9396, + "step": 6748 + }, + { + "epoch": 0.5453847552475808, + "grad_norm": 2.3376524448394775, + "learning_rate": 8.533335379366442e-06, + "loss": 0.9887, + "step": 6749 + }, + { + "epoch": 0.5454655649609083, + "grad_norm": 2.728322744369507, + "learning_rate": 8.532872357651887e-06, + "loss": 1.0434, + "step": 6750 + }, + { + "epoch": 0.5455463746742358, + "grad_norm": 2.6892123222351074, + "learning_rate": 8.532409275427731e-06, + "loss": 0.8898, + "step": 6751 + }, + { + "epoch": 0.5456271843875634, + "grad_norm": 2.754441022872925, + "learning_rate": 8.531946132701909e-06, + "loss": 0.886, + "step": 6752 + }, + { + "epoch": 0.5457079941008909, + "grad_norm": 2.2685694694519043, + "learning_rate": 8.531482929482352e-06, + "loss": 1.0556, + "step": 6753 + }, + { + "epoch": 0.5457888038142185, + "grad_norm": 2.628941059112549, + "learning_rate": 8.531019665776994e-06, + "loss": 0.9329, + "step": 6754 + }, + { + "epoch": 0.545869613527546, + "grad_norm": 2.7105507850646973, + "learning_rate": 8.53055634159377e-06, + "loss": 0.9958, + "step": 6755 + }, + { + "epoch": 0.5459504232408735, + "grad_norm": 3.347438097000122, + "learning_rate": 8.530092956940617e-06, + "loss": 0.9348, + "step": 6756 + }, + { + "epoch": 0.5460312329542011, + "grad_norm": 2.9810469150543213, + "learning_rate": 8.529629511825467e-06, + "loss": 0.9408, + "step": 6757 + }, + { + "epoch": 0.5461120426675287, + "grad_norm": 2.905885696411133, + "learning_rate": 8.529166006256263e-06, + "loss": 0.9919, + "step": 6758 + }, + { + "epoch": 0.5461928523808561, + "grad_norm": 3.1253116130828857, + "learning_rate": 8.52870244024094e-06, + "loss": 0.9033, + "step": 6759 + }, + { + "epoch": 0.5462736620941837, + "grad_norm": 2.615163803100586, + "learning_rate": 8.52823881378744e-06, + "loss": 0.8916, + "step": 6760 + }, + { + "epoch": 0.5463544718075113, + "grad_norm": 2.942991018295288, + "learning_rate": 8.527775126903704e-06, + "loss": 0.949, + "step": 6761 + }, + { + "epoch": 0.5464352815208388, + "grad_norm": 2.9247636795043945, + "learning_rate": 8.527311379597672e-06, + "loss": 0.9719, + "step": 6762 + }, + { + "epoch": 0.5465160912341663, + "grad_norm": 2.6420843601226807, + "learning_rate": 8.526847571877286e-06, + "loss": 0.9688, + "step": 6763 + }, + { + "epoch": 0.5465969009474939, + "grad_norm": 2.7086527347564697, + "learning_rate": 8.526383703750494e-06, + "loss": 0.998, + "step": 6764 + }, + { + "epoch": 0.5466777106608214, + "grad_norm": 3.503931760787964, + "learning_rate": 8.525919775225237e-06, + "loss": 1.0477, + "step": 6765 + }, + { + "epoch": 0.546758520374149, + "grad_norm": 2.868701457977295, + "learning_rate": 8.525455786309464e-06, + "loss": 0.9422, + "step": 6766 + }, + { + "epoch": 0.5468393300874765, + "grad_norm": 2.823075532913208, + "learning_rate": 8.524991737011119e-06, + "loss": 0.8767, + "step": 6767 + }, + { + "epoch": 0.546920139800804, + "grad_norm": 2.9327478408813477, + "learning_rate": 8.524527627338152e-06, + "loss": 0.9128, + "step": 6768 + }, + { + "epoch": 0.5470009495141316, + "grad_norm": 2.5013482570648193, + "learning_rate": 8.524063457298513e-06, + "loss": 0.9008, + "step": 6769 + }, + { + "epoch": 0.5470817592274592, + "grad_norm": 2.986111879348755, + "learning_rate": 8.52359922690015e-06, + "loss": 1.0234, + "step": 6770 + }, + { + "epoch": 0.5471625689407866, + "grad_norm": 2.450460195541382, + "learning_rate": 8.523134936151014e-06, + "loss": 0.9352, + "step": 6771 + }, + { + "epoch": 0.5472433786541142, + "grad_norm": 2.786874294281006, + "learning_rate": 8.52267058505906e-06, + "loss": 0.96, + "step": 6772 + }, + { + "epoch": 0.5473241883674418, + "grad_norm": 2.892984390258789, + "learning_rate": 8.522206173632238e-06, + "loss": 0.9867, + "step": 6773 + }, + { + "epoch": 0.5474049980807693, + "grad_norm": 2.5300848484039307, + "learning_rate": 8.521741701878502e-06, + "loss": 0.9617, + "step": 6774 + }, + { + "epoch": 0.5474858077940968, + "grad_norm": 2.673757791519165, + "learning_rate": 8.521277169805813e-06, + "loss": 0.9111, + "step": 6775 + }, + { + "epoch": 0.5475666175074244, + "grad_norm": 2.650216817855835, + "learning_rate": 8.520812577422119e-06, + "loss": 0.9323, + "step": 6776 + }, + { + "epoch": 0.5476474272207519, + "grad_norm": 2.9507553577423096, + "learning_rate": 8.520347924735387e-06, + "loss": 0.891, + "step": 6777 + }, + { + "epoch": 0.5477282369340795, + "grad_norm": 2.8094611167907715, + "learning_rate": 8.519883211753566e-06, + "loss": 1.0348, + "step": 6778 + }, + { + "epoch": 0.547809046647407, + "grad_norm": 2.6251227855682373, + "learning_rate": 8.51941843848462e-06, + "loss": 0.9825, + "step": 6779 + }, + { + "epoch": 0.5478898563607345, + "grad_norm": 2.7280211448669434, + "learning_rate": 8.518953604936507e-06, + "loss": 0.9915, + "step": 6780 + }, + { + "epoch": 0.5479706660740621, + "grad_norm": 3.11441969871521, + "learning_rate": 8.518488711117192e-06, + "loss": 0.8633, + "step": 6781 + }, + { + "epoch": 0.5480514757873897, + "grad_norm": 2.5203189849853516, + "learning_rate": 8.518023757034636e-06, + "loss": 1.0, + "step": 6782 + }, + { + "epoch": 0.5481322855007171, + "grad_norm": 2.710209608078003, + "learning_rate": 8.517558742696802e-06, + "loss": 0.8671, + "step": 6783 + }, + { + "epoch": 0.5482130952140447, + "grad_norm": 2.584124803543091, + "learning_rate": 8.517093668111656e-06, + "loss": 1.02, + "step": 6784 + }, + { + "epoch": 0.5482939049273723, + "grad_norm": 2.77771258354187, + "learning_rate": 8.516628533287162e-06, + "loss": 1.0516, + "step": 6785 + }, + { + "epoch": 0.5483747146406999, + "grad_norm": 2.7872426509857178, + "learning_rate": 8.516163338231286e-06, + "loss": 0.8756, + "step": 6786 + }, + { + "epoch": 0.5484555243540273, + "grad_norm": 2.467712163925171, + "learning_rate": 8.515698082951998e-06, + "loss": 0.9397, + "step": 6787 + }, + { + "epoch": 0.5485363340673549, + "grad_norm": 2.6244587898254395, + "learning_rate": 8.515232767457265e-06, + "loss": 1.0316, + "step": 6788 + }, + { + "epoch": 0.5486171437806825, + "grad_norm": 2.871271848678589, + "learning_rate": 8.514767391755057e-06, + "loss": 0.994, + "step": 6789 + }, + { + "epoch": 0.54869795349401, + "grad_norm": 2.5260796546936035, + "learning_rate": 8.514301955853345e-06, + "loss": 0.9915, + "step": 6790 + }, + { + "epoch": 0.5487787632073375, + "grad_norm": 3.337914228439331, + "learning_rate": 8.513836459760102e-06, + "loss": 0.9211, + "step": 6791 + }, + { + "epoch": 0.5488595729206651, + "grad_norm": 2.516460657119751, + "learning_rate": 8.513370903483299e-06, + "loss": 0.9487, + "step": 6792 + }, + { + "epoch": 0.5489403826339926, + "grad_norm": 3.115722417831421, + "learning_rate": 8.51290528703091e-06, + "loss": 0.9131, + "step": 6793 + }, + { + "epoch": 0.5490211923473202, + "grad_norm": 2.6946163177490234, + "learning_rate": 8.51243961041091e-06, + "loss": 0.9662, + "step": 6794 + }, + { + "epoch": 0.5491020020606477, + "grad_norm": 2.703784704208374, + "learning_rate": 8.511973873631278e-06, + "loss": 0.9843, + "step": 6795 + }, + { + "epoch": 0.5491828117739752, + "grad_norm": 2.922241449356079, + "learning_rate": 8.511508076699985e-06, + "loss": 0.9989, + "step": 6796 + }, + { + "epoch": 0.5492636214873028, + "grad_norm": 3.141031265258789, + "learning_rate": 8.511042219625015e-06, + "loss": 1.0081, + "step": 6797 + }, + { + "epoch": 0.5493444312006304, + "grad_norm": 2.9548773765563965, + "learning_rate": 8.510576302414342e-06, + "loss": 0.9744, + "step": 6798 + }, + { + "epoch": 0.5494252409139578, + "grad_norm": 2.4163296222686768, + "learning_rate": 8.510110325075948e-06, + "loss": 1.0437, + "step": 6799 + }, + { + "epoch": 0.5495060506272854, + "grad_norm": 2.843646764755249, + "learning_rate": 8.509644287617817e-06, + "loss": 1.1203, + "step": 6800 + }, + { + "epoch": 0.549586860340613, + "grad_norm": 2.435694456100464, + "learning_rate": 8.509178190047926e-06, + "loss": 1.0005, + "step": 6801 + }, + { + "epoch": 0.5496676700539405, + "grad_norm": 2.7450308799743652, + "learning_rate": 8.508712032374263e-06, + "loss": 0.9601, + "step": 6802 + }, + { + "epoch": 0.549748479767268, + "grad_norm": 2.4280636310577393, + "learning_rate": 8.50824581460481e-06, + "loss": 0.8992, + "step": 6803 + }, + { + "epoch": 0.5498292894805956, + "grad_norm": 2.5397446155548096, + "learning_rate": 8.50777953674755e-06, + "loss": 0.9561, + "step": 6804 + }, + { + "epoch": 0.5499100991939231, + "grad_norm": 3.3508310317993164, + "learning_rate": 8.50731319881047e-06, + "loss": 1.0794, + "step": 6805 + }, + { + "epoch": 0.5499909089072507, + "grad_norm": 2.627250909805298, + "learning_rate": 8.50684680080156e-06, + "loss": 0.977, + "step": 6806 + }, + { + "epoch": 0.5500717186205782, + "grad_norm": 2.7339882850646973, + "learning_rate": 8.506380342728808e-06, + "loss": 0.8917, + "step": 6807 + }, + { + "epoch": 0.5501525283339057, + "grad_norm": 2.551746368408203, + "learning_rate": 8.505913824600201e-06, + "loss": 1.0379, + "step": 6808 + }, + { + "epoch": 0.5502333380472333, + "grad_norm": 2.5389673709869385, + "learning_rate": 8.50544724642373e-06, + "loss": 0.8873, + "step": 6809 + }, + { + "epoch": 0.5503141477605609, + "grad_norm": 2.4108564853668213, + "learning_rate": 8.504980608207386e-06, + "loss": 0.9618, + "step": 6810 + }, + { + "epoch": 0.5503949574738883, + "grad_norm": 2.981092691421509, + "learning_rate": 8.504513909959164e-06, + "loss": 0.8691, + "step": 6811 + }, + { + "epoch": 0.5504757671872159, + "grad_norm": 2.546948194503784, + "learning_rate": 8.504047151687054e-06, + "loss": 0.9692, + "step": 6812 + }, + { + "epoch": 0.5505565769005435, + "grad_norm": 2.4042325019836426, + "learning_rate": 8.50358033339905e-06, + "loss": 0.9395, + "step": 6813 + }, + { + "epoch": 0.550637386613871, + "grad_norm": 2.7482199668884277, + "learning_rate": 8.503113455103154e-06, + "loss": 0.8634, + "step": 6814 + }, + { + "epoch": 0.5507181963271985, + "grad_norm": 2.4952049255371094, + "learning_rate": 8.502646516807354e-06, + "loss": 0.9774, + "step": 6815 + }, + { + "epoch": 0.5507990060405261, + "grad_norm": 2.6934478282928467, + "learning_rate": 8.502179518519654e-06, + "loss": 0.939, + "step": 6816 + }, + { + "epoch": 0.5508798157538536, + "grad_norm": 3.1893129348754883, + "learning_rate": 8.501712460248049e-06, + "loss": 0.9484, + "step": 6817 + }, + { + "epoch": 0.5509606254671812, + "grad_norm": 2.5171241760253906, + "learning_rate": 8.501245342000539e-06, + "loss": 0.9987, + "step": 6818 + }, + { + "epoch": 0.5510414351805087, + "grad_norm": 2.6813125610351562, + "learning_rate": 8.500778163785126e-06, + "loss": 0.9635, + "step": 6819 + }, + { + "epoch": 0.5511222448938362, + "grad_norm": 3.047844886779785, + "learning_rate": 8.500310925609812e-06, + "loss": 0.9055, + "step": 6820 + }, + { + "epoch": 0.5512030546071638, + "grad_norm": 2.725375175476074, + "learning_rate": 8.499843627482596e-06, + "loss": 0.991, + "step": 6821 + }, + { + "epoch": 0.5512838643204914, + "grad_norm": 2.3166511058807373, + "learning_rate": 8.499376269411485e-06, + "loss": 0.8638, + "step": 6822 + }, + { + "epoch": 0.5513646740338188, + "grad_norm": 2.7540152072906494, + "learning_rate": 8.498908851404484e-06, + "loss": 1.0671, + "step": 6823 + }, + { + "epoch": 0.5514454837471464, + "grad_norm": 2.56469464302063, + "learning_rate": 8.498441373469597e-06, + "loss": 0.9647, + "step": 6824 + }, + { + "epoch": 0.551526293460474, + "grad_norm": 2.7377378940582275, + "learning_rate": 8.497973835614831e-06, + "loss": 1.0408, + "step": 6825 + }, + { + "epoch": 0.5516071031738015, + "grad_norm": 2.9289684295654297, + "learning_rate": 8.497506237848194e-06, + "loss": 0.9382, + "step": 6826 + }, + { + "epoch": 0.551687912887129, + "grad_norm": 2.7107300758361816, + "learning_rate": 8.497038580177696e-06, + "loss": 0.9402, + "step": 6827 + }, + { + "epoch": 0.5517687226004566, + "grad_norm": 3.1685383319854736, + "learning_rate": 8.496570862611346e-06, + "loss": 1.0167, + "step": 6828 + }, + { + "epoch": 0.5518495323137841, + "grad_norm": 2.8260293006896973, + "learning_rate": 8.496103085157155e-06, + "loss": 0.8844, + "step": 6829 + }, + { + "epoch": 0.5519303420271117, + "grad_norm": 2.6349635124206543, + "learning_rate": 8.495635247823132e-06, + "loss": 1.0421, + "step": 6830 + }, + { + "epoch": 0.5520111517404392, + "grad_norm": 2.400712728500366, + "learning_rate": 8.495167350617296e-06, + "loss": 1.0026, + "step": 6831 + }, + { + "epoch": 0.5520919614537667, + "grad_norm": 2.3358864784240723, + "learning_rate": 8.494699393547656e-06, + "loss": 0.9533, + "step": 6832 + }, + { + "epoch": 0.5521727711670943, + "grad_norm": 2.700319528579712, + "learning_rate": 8.494231376622228e-06, + "loss": 0.9356, + "step": 6833 + }, + { + "epoch": 0.5522535808804219, + "grad_norm": 2.5020880699157715, + "learning_rate": 8.49376329984903e-06, + "loss": 0.8913, + "step": 6834 + }, + { + "epoch": 0.5523343905937493, + "grad_norm": 2.6957719326019287, + "learning_rate": 8.493295163236077e-06, + "loss": 0.9682, + "step": 6835 + }, + { + "epoch": 0.5524152003070769, + "grad_norm": 2.592831611633301, + "learning_rate": 8.492826966791387e-06, + "loss": 0.9135, + "step": 6836 + }, + { + "epoch": 0.5524960100204045, + "grad_norm": 2.630263566970825, + "learning_rate": 8.49235871052298e-06, + "loss": 0.946, + "step": 6837 + }, + { + "epoch": 0.552576819733732, + "grad_norm": 2.397728204727173, + "learning_rate": 8.491890394438875e-06, + "loss": 0.9447, + "step": 6838 + }, + { + "epoch": 0.5526576294470595, + "grad_norm": 2.4945151805877686, + "learning_rate": 8.491422018547096e-06, + "loss": 1.0325, + "step": 6839 + }, + { + "epoch": 0.5527384391603871, + "grad_norm": 2.8656532764434814, + "learning_rate": 8.490953582855662e-06, + "loss": 1.0034, + "step": 6840 + }, + { + "epoch": 0.5528192488737146, + "grad_norm": 2.750896453857422, + "learning_rate": 8.490485087372597e-06, + "loss": 1.0123, + "step": 6841 + }, + { + "epoch": 0.5529000585870422, + "grad_norm": 2.744513750076294, + "learning_rate": 8.490016532105925e-06, + "loss": 0.9346, + "step": 6842 + }, + { + "epoch": 0.5529808683003697, + "grad_norm": 2.2509818077087402, + "learning_rate": 8.489547917063672e-06, + "loss": 0.9299, + "step": 6843 + }, + { + "epoch": 0.5530616780136972, + "grad_norm": 3.089818239212036, + "learning_rate": 8.489079242253863e-06, + "loss": 1.0968, + "step": 6844 + }, + { + "epoch": 0.5531424877270248, + "grad_norm": 3.1147658824920654, + "learning_rate": 8.48861050768453e-06, + "loss": 0.9114, + "step": 6845 + }, + { + "epoch": 0.5532232974403524, + "grad_norm": 2.765091896057129, + "learning_rate": 8.488141713363694e-06, + "loss": 0.8719, + "step": 6846 + }, + { + "epoch": 0.5533041071536798, + "grad_norm": 2.625201940536499, + "learning_rate": 8.487672859299389e-06, + "loss": 0.8287, + "step": 6847 + }, + { + "epoch": 0.5533849168670074, + "grad_norm": 2.8303000926971436, + "learning_rate": 8.487203945499644e-06, + "loss": 0.9859, + "step": 6848 + }, + { + "epoch": 0.553465726580335, + "grad_norm": 3.1322367191314697, + "learning_rate": 8.486734971972488e-06, + "loss": 0.9282, + "step": 6849 + }, + { + "epoch": 0.5535465362936625, + "grad_norm": 2.78836989402771, + "learning_rate": 8.48626593872596e-06, + "loss": 0.8623, + "step": 6850 + }, + { + "epoch": 0.55362734600699, + "grad_norm": 2.747438669204712, + "learning_rate": 8.485796845768088e-06, + "loss": 0.9469, + "step": 6851 + }, + { + "epoch": 0.5537081557203176, + "grad_norm": 2.545588493347168, + "learning_rate": 8.485327693106907e-06, + "loss": 0.9366, + "step": 6852 + }, + { + "epoch": 0.5537889654336451, + "grad_norm": 3.3030734062194824, + "learning_rate": 8.484858480750453e-06, + "loss": 0.9091, + "step": 6853 + }, + { + "epoch": 0.5538697751469727, + "grad_norm": 2.8627467155456543, + "learning_rate": 8.484389208706763e-06, + "loss": 1.0392, + "step": 6854 + }, + { + "epoch": 0.5539505848603002, + "grad_norm": 2.5307259559631348, + "learning_rate": 8.483919876983875e-06, + "loss": 1.0267, + "step": 6855 + }, + { + "epoch": 0.5540313945736277, + "grad_norm": 2.709975242614746, + "learning_rate": 8.483450485589825e-06, + "loss": 1.0282, + "step": 6856 + }, + { + "epoch": 0.5541122042869553, + "grad_norm": 3.306159019470215, + "learning_rate": 8.482981034532655e-06, + "loss": 0.9008, + "step": 6857 + }, + { + "epoch": 0.5541930140002829, + "grad_norm": 2.525493621826172, + "learning_rate": 8.482511523820407e-06, + "loss": 1.053, + "step": 6858 + }, + { + "epoch": 0.5542738237136103, + "grad_norm": 2.9141557216644287, + "learning_rate": 8.482041953461117e-06, + "loss": 0.9085, + "step": 6859 + }, + { + "epoch": 0.5543546334269379, + "grad_norm": 2.58060359954834, + "learning_rate": 8.481572323462831e-06, + "loss": 0.9574, + "step": 6860 + }, + { + "epoch": 0.5544354431402655, + "grad_norm": 2.615966558456421, + "learning_rate": 8.481102633833595e-06, + "loss": 0.8299, + "step": 6861 + }, + { + "epoch": 0.554516252853593, + "grad_norm": 2.61551833152771, + "learning_rate": 8.480632884581453e-06, + "loss": 1.0211, + "step": 6862 + }, + { + "epoch": 0.5545970625669205, + "grad_norm": 2.8956634998321533, + "learning_rate": 8.480163075714445e-06, + "loss": 0.8885, + "step": 6863 + }, + { + "epoch": 0.5546778722802481, + "grad_norm": 2.880720615386963, + "learning_rate": 8.479693207240624e-06, + "loss": 1.1534, + "step": 6864 + }, + { + "epoch": 0.5547586819935756, + "grad_norm": 2.5820982456207275, + "learning_rate": 8.479223279168034e-06, + "loss": 0.8149, + "step": 6865 + }, + { + "epoch": 0.5548394917069032, + "grad_norm": 2.46889591217041, + "learning_rate": 8.478753291504726e-06, + "loss": 0.8981, + "step": 6866 + }, + { + "epoch": 0.5549203014202307, + "grad_norm": 2.5251431465148926, + "learning_rate": 8.478283244258749e-06, + "loss": 1.0616, + "step": 6867 + }, + { + "epoch": 0.5550011111335582, + "grad_norm": 2.763038396835327, + "learning_rate": 8.47781313743815e-06, + "loss": 0.9674, + "step": 6868 + }, + { + "epoch": 0.5550819208468858, + "grad_norm": 2.5594913959503174, + "learning_rate": 8.47734297105099e-06, + "loss": 0.9419, + "step": 6869 + }, + { + "epoch": 0.5551627305602134, + "grad_norm": 2.556610584259033, + "learning_rate": 8.476872745105314e-06, + "loss": 0.973, + "step": 6870 + }, + { + "epoch": 0.5552435402735408, + "grad_norm": 3.166529417037964, + "learning_rate": 8.476402459609177e-06, + "loss": 0.9445, + "step": 6871 + }, + { + "epoch": 0.5553243499868684, + "grad_norm": 2.978961229324341, + "learning_rate": 8.475932114570636e-06, + "loss": 0.9309, + "step": 6872 + }, + { + "epoch": 0.555405159700196, + "grad_norm": 2.9318814277648926, + "learning_rate": 8.475461709997747e-06, + "loss": 0.8804, + "step": 6873 + }, + { + "epoch": 0.5554859694135235, + "grad_norm": 2.61983323097229, + "learning_rate": 8.474991245898564e-06, + "loss": 1.0351, + "step": 6874 + }, + { + "epoch": 0.555566779126851, + "grad_norm": 2.6989834308624268, + "learning_rate": 8.474520722281148e-06, + "loss": 0.9426, + "step": 6875 + }, + { + "epoch": 0.5556475888401786, + "grad_norm": 2.703728675842285, + "learning_rate": 8.474050139153556e-06, + "loss": 0.9859, + "step": 6876 + }, + { + "epoch": 0.5557283985535061, + "grad_norm": 2.4480912685394287, + "learning_rate": 8.473579496523846e-06, + "loss": 1.0633, + "step": 6877 + }, + { + "epoch": 0.5558092082668337, + "grad_norm": 2.636629819869995, + "learning_rate": 8.473108794400084e-06, + "loss": 0.9045, + "step": 6878 + }, + { + "epoch": 0.5558900179801612, + "grad_norm": 2.699205160140991, + "learning_rate": 8.47263803279033e-06, + "loss": 0.9455, + "step": 6879 + }, + { + "epoch": 0.5559708276934887, + "grad_norm": 2.7170090675354004, + "learning_rate": 8.472167211702646e-06, + "loss": 1.059, + "step": 6880 + }, + { + "epoch": 0.5560516374068163, + "grad_norm": 2.557217597961426, + "learning_rate": 8.471696331145096e-06, + "loss": 0.9927, + "step": 6881 + }, + { + "epoch": 0.5561324471201439, + "grad_norm": 2.4627785682678223, + "learning_rate": 8.471225391125745e-06, + "loss": 0.9437, + "step": 6882 + }, + { + "epoch": 0.5562132568334713, + "grad_norm": 2.7011351585388184, + "learning_rate": 8.470754391652662e-06, + "loss": 0.8459, + "step": 6883 + }, + { + "epoch": 0.5562940665467989, + "grad_norm": 2.37245774269104, + "learning_rate": 8.470283332733911e-06, + "loss": 0.9837, + "step": 6884 + }, + { + "epoch": 0.5563748762601265, + "grad_norm": 2.810795783996582, + "learning_rate": 8.46981221437756e-06, + "loss": 1.0007, + "step": 6885 + }, + { + "epoch": 0.556455685973454, + "grad_norm": 2.83638596534729, + "learning_rate": 8.46934103659168e-06, + "loss": 1.0018, + "step": 6886 + }, + { + "epoch": 0.5565364956867815, + "grad_norm": 2.2954795360565186, + "learning_rate": 8.468869799384338e-06, + "loss": 0.9581, + "step": 6887 + }, + { + "epoch": 0.5566173054001091, + "grad_norm": 2.6287801265716553, + "learning_rate": 8.46839850276361e-06, + "loss": 0.9544, + "step": 6888 + }, + { + "epoch": 0.5566981151134366, + "grad_norm": 2.4404377937316895, + "learning_rate": 8.467927146737565e-06, + "loss": 1.169, + "step": 6889 + }, + { + "epoch": 0.5567789248267642, + "grad_norm": 2.6569740772247314, + "learning_rate": 8.467455731314276e-06, + "loss": 0.853, + "step": 6890 + }, + { + "epoch": 0.5568597345400917, + "grad_norm": 2.9116122722625732, + "learning_rate": 8.466984256501817e-06, + "loss": 1.0032, + "step": 6891 + }, + { + "epoch": 0.5569405442534192, + "grad_norm": 2.972496271133423, + "learning_rate": 8.466512722308264e-06, + "loss": 0.9627, + "step": 6892 + }, + { + "epoch": 0.5570213539667468, + "grad_norm": 2.762817859649658, + "learning_rate": 8.466041128741695e-06, + "loss": 1.1623, + "step": 6893 + }, + { + "epoch": 0.5571021636800744, + "grad_norm": 2.7529847621917725, + "learning_rate": 8.465569475810185e-06, + "loss": 0.9453, + "step": 6894 + }, + { + "epoch": 0.5571829733934018, + "grad_norm": 2.5715386867523193, + "learning_rate": 8.465097763521812e-06, + "loss": 1.0209, + "step": 6895 + }, + { + "epoch": 0.5572637831067294, + "grad_norm": 2.6773593425750732, + "learning_rate": 8.464625991884658e-06, + "loss": 0.8398, + "step": 6896 + }, + { + "epoch": 0.557344592820057, + "grad_norm": 3.032032012939453, + "learning_rate": 8.4641541609068e-06, + "loss": 1.0124, + "step": 6897 + }, + { + "epoch": 0.5574254025333845, + "grad_norm": 3.1945559978485107, + "learning_rate": 8.463682270596322e-06, + "loss": 0.958, + "step": 6898 + }, + { + "epoch": 0.557506212246712, + "grad_norm": 2.645900011062622, + "learning_rate": 8.463210320961303e-06, + "loss": 0.8841, + "step": 6899 + }, + { + "epoch": 0.5575870219600396, + "grad_norm": 2.7547786235809326, + "learning_rate": 8.462738312009831e-06, + "loss": 1.0065, + "step": 6900 + }, + { + "epoch": 0.5576678316733671, + "grad_norm": 2.526700019836426, + "learning_rate": 8.462266243749987e-06, + "loss": 0.9441, + "step": 6901 + }, + { + "epoch": 0.5577486413866947, + "grad_norm": 2.5337414741516113, + "learning_rate": 8.461794116189857e-06, + "loss": 1.057, + "step": 6902 + }, + { + "epoch": 0.5578294511000222, + "grad_norm": 2.90421199798584, + "learning_rate": 8.46132192933753e-06, + "loss": 1.1456, + "step": 6903 + }, + { + "epoch": 0.5579102608133497, + "grad_norm": 2.5645174980163574, + "learning_rate": 8.46084968320109e-06, + "loss": 1.0617, + "step": 6904 + }, + { + "epoch": 0.5579910705266773, + "grad_norm": 2.4103856086730957, + "learning_rate": 8.460377377788624e-06, + "loss": 0.9526, + "step": 6905 + }, + { + "epoch": 0.5580718802400049, + "grad_norm": 2.3386809825897217, + "learning_rate": 8.459905013108225e-06, + "loss": 0.9411, + "step": 6906 + }, + { + "epoch": 0.5581526899533323, + "grad_norm": 2.7163736820220947, + "learning_rate": 8.459432589167985e-06, + "loss": 0.9477, + "step": 6907 + }, + { + "epoch": 0.5582334996666599, + "grad_norm": 2.474764347076416, + "learning_rate": 8.45896010597599e-06, + "loss": 0.8838, + "step": 6908 + }, + { + "epoch": 0.5583143093799875, + "grad_norm": 2.712538003921509, + "learning_rate": 8.458487563540337e-06, + "loss": 1.0101, + "step": 6909 + }, + { + "epoch": 0.558395119093315, + "grad_norm": 2.8555169105529785, + "learning_rate": 8.458014961869118e-06, + "loss": 1.005, + "step": 6910 + }, + { + "epoch": 0.5584759288066425, + "grad_norm": 2.7186081409454346, + "learning_rate": 8.457542300970427e-06, + "loss": 0.9449, + "step": 6911 + }, + { + "epoch": 0.5585567385199701, + "grad_norm": 2.64795184135437, + "learning_rate": 8.45706958085236e-06, + "loss": 0.9931, + "step": 6912 + }, + { + "epoch": 0.5586375482332976, + "grad_norm": 2.4156928062438965, + "learning_rate": 8.456596801523014e-06, + "loss": 1.0063, + "step": 6913 + }, + { + "epoch": 0.5587183579466252, + "grad_norm": 2.5548274517059326, + "learning_rate": 8.456123962990487e-06, + "loss": 0.9713, + "step": 6914 + }, + { + "epoch": 0.5587991676599527, + "grad_norm": 2.6821956634521484, + "learning_rate": 8.455651065262876e-06, + "loss": 0.8685, + "step": 6915 + }, + { + "epoch": 0.5588799773732803, + "grad_norm": 2.311872959136963, + "learning_rate": 8.45517810834828e-06, + "loss": 1.0587, + "step": 6916 + }, + { + "epoch": 0.5589607870866078, + "grad_norm": 3.150648355484009, + "learning_rate": 8.454705092254803e-06, + "loss": 0.9739, + "step": 6917 + }, + { + "epoch": 0.5590415967999354, + "grad_norm": 2.1819918155670166, + "learning_rate": 8.454232016990544e-06, + "loss": 0.9802, + "step": 6918 + }, + { + "epoch": 0.559122406513263, + "grad_norm": 2.6903281211853027, + "learning_rate": 8.453758882563608e-06, + "loss": 1.0517, + "step": 6919 + }, + { + "epoch": 0.5592032162265904, + "grad_norm": 2.49411678314209, + "learning_rate": 8.453285688982095e-06, + "loss": 0.964, + "step": 6920 + }, + { + "epoch": 0.559284025939918, + "grad_norm": 2.580589532852173, + "learning_rate": 8.452812436254112e-06, + "loss": 0.9212, + "step": 6921 + }, + { + "epoch": 0.5593648356532456, + "grad_norm": 2.5534653663635254, + "learning_rate": 8.452339124387763e-06, + "loss": 1.0879, + "step": 6922 + }, + { + "epoch": 0.559445645366573, + "grad_norm": 2.9323341846466064, + "learning_rate": 8.451865753391158e-06, + "loss": 1.0121, + "step": 6923 + }, + { + "epoch": 0.5595264550799006, + "grad_norm": 2.8727517127990723, + "learning_rate": 8.451392323272401e-06, + "loss": 0.9481, + "step": 6924 + }, + { + "epoch": 0.5596072647932282, + "grad_norm": 2.88751220703125, + "learning_rate": 8.450918834039602e-06, + "loss": 0.8822, + "step": 6925 + }, + { + "epoch": 0.5596880745065557, + "grad_norm": 2.4941325187683105, + "learning_rate": 8.450445285700875e-06, + "loss": 0.9589, + "step": 6926 + }, + { + "epoch": 0.5597688842198832, + "grad_norm": 2.9503421783447266, + "learning_rate": 8.449971678264322e-06, + "loss": 0.9996, + "step": 6927 + }, + { + "epoch": 0.5598496939332108, + "grad_norm": 2.571575403213501, + "learning_rate": 8.449498011738063e-06, + "loss": 1.0112, + "step": 6928 + }, + { + "epoch": 0.5599305036465383, + "grad_norm": 2.6412312984466553, + "learning_rate": 8.449024286130207e-06, + "loss": 0.8747, + "step": 6929 + }, + { + "epoch": 0.5600113133598659, + "grad_norm": 2.6379234790802, + "learning_rate": 8.448550501448867e-06, + "loss": 0.9118, + "step": 6930 + }, + { + "epoch": 0.5600921230731934, + "grad_norm": 2.6920595169067383, + "learning_rate": 8.448076657702158e-06, + "loss": 0.89, + "step": 6931 + }, + { + "epoch": 0.5601729327865209, + "grad_norm": 2.924105644226074, + "learning_rate": 8.4476027548982e-06, + "loss": 0.9643, + "step": 6932 + }, + { + "epoch": 0.5602537424998485, + "grad_norm": 2.5920870304107666, + "learning_rate": 8.447128793045103e-06, + "loss": 0.9073, + "step": 6933 + }, + { + "epoch": 0.5603345522131761, + "grad_norm": 2.683720827102661, + "learning_rate": 8.44665477215099e-06, + "loss": 0.9764, + "step": 6934 + }, + { + "epoch": 0.5604153619265035, + "grad_norm": 2.871969223022461, + "learning_rate": 8.446180692223977e-06, + "loss": 0.8684, + "step": 6935 + }, + { + "epoch": 0.5604961716398311, + "grad_norm": 2.516979217529297, + "learning_rate": 8.445706553272188e-06, + "loss": 1.0522, + "step": 6936 + }, + { + "epoch": 0.5605769813531587, + "grad_norm": 2.7543396949768066, + "learning_rate": 8.445232355303739e-06, + "loss": 0.976, + "step": 6937 + }, + { + "epoch": 0.5606577910664862, + "grad_norm": 2.5632481575012207, + "learning_rate": 8.444758098326753e-06, + "loss": 0.9587, + "step": 6938 + }, + { + "epoch": 0.5607386007798137, + "grad_norm": 2.6871962547302246, + "learning_rate": 8.444283782349356e-06, + "loss": 1.001, + "step": 6939 + }, + { + "epoch": 0.5608194104931413, + "grad_norm": 2.6770501136779785, + "learning_rate": 8.443809407379668e-06, + "loss": 1.0675, + "step": 6940 + }, + { + "epoch": 0.5609002202064688, + "grad_norm": 2.706904649734497, + "learning_rate": 8.443334973425817e-06, + "loss": 0.9417, + "step": 6941 + }, + { + "epoch": 0.5609810299197964, + "grad_norm": 2.690647840499878, + "learning_rate": 8.442860480495925e-06, + "loss": 0.9333, + "step": 6942 + }, + { + "epoch": 0.561061839633124, + "grad_norm": 3.1479380130767822, + "learning_rate": 8.442385928598123e-06, + "loss": 1.0568, + "step": 6943 + }, + { + "epoch": 0.5611426493464514, + "grad_norm": 2.623107671737671, + "learning_rate": 8.441911317740537e-06, + "loss": 0.9015, + "step": 6944 + }, + { + "epoch": 0.561223459059779, + "grad_norm": 2.5914926528930664, + "learning_rate": 8.441436647931296e-06, + "loss": 0.9363, + "step": 6945 + }, + { + "epoch": 0.5613042687731066, + "grad_norm": 2.887040376663208, + "learning_rate": 8.440961919178529e-06, + "loss": 1.0202, + "step": 6946 + }, + { + "epoch": 0.561385078486434, + "grad_norm": 2.8716883659362793, + "learning_rate": 8.44048713149037e-06, + "loss": 0.9894, + "step": 6947 + }, + { + "epoch": 0.5614658881997616, + "grad_norm": 2.8763785362243652, + "learning_rate": 8.44001228487495e-06, + "loss": 1.0792, + "step": 6948 + }, + { + "epoch": 0.5615466979130892, + "grad_norm": 2.7583351135253906, + "learning_rate": 8.439537379340398e-06, + "loss": 1.043, + "step": 6949 + }, + { + "epoch": 0.5616275076264167, + "grad_norm": 2.1673460006713867, + "learning_rate": 8.439062414894852e-06, + "loss": 1.0191, + "step": 6950 + }, + { + "epoch": 0.5617083173397442, + "grad_norm": 2.888838529586792, + "learning_rate": 8.438587391546447e-06, + "loss": 1.0027, + "step": 6951 + }, + { + "epoch": 0.5617891270530718, + "grad_norm": 2.6036126613616943, + "learning_rate": 8.438112309303318e-06, + "loss": 0.9132, + "step": 6952 + }, + { + "epoch": 0.5618699367663993, + "grad_norm": 2.374340295791626, + "learning_rate": 8.4376371681736e-06, + "loss": 0.9941, + "step": 6953 + }, + { + "epoch": 0.5619507464797269, + "grad_norm": 2.4654552936553955, + "learning_rate": 8.437161968165436e-06, + "loss": 0.9955, + "step": 6954 + }, + { + "epoch": 0.5620315561930544, + "grad_norm": 2.4640560150146484, + "learning_rate": 8.43668670928696e-06, + "loss": 0.9802, + "step": 6955 + }, + { + "epoch": 0.5621123659063819, + "grad_norm": 2.866013526916504, + "learning_rate": 8.436211391546315e-06, + "loss": 0.8866, + "step": 6956 + }, + { + "epoch": 0.5621931756197095, + "grad_norm": 3.00115704536438, + "learning_rate": 8.43573601495164e-06, + "loss": 0.8571, + "step": 6957 + }, + { + "epoch": 0.5622739853330371, + "grad_norm": 2.5477354526519775, + "learning_rate": 8.43526057951108e-06, + "loss": 0.9537, + "step": 6958 + }, + { + "epoch": 0.5623547950463645, + "grad_norm": 2.5641021728515625, + "learning_rate": 8.434785085232777e-06, + "loss": 0.9218, + "step": 6959 + }, + { + "epoch": 0.5624356047596921, + "grad_norm": 2.412095069885254, + "learning_rate": 8.434309532124872e-06, + "loss": 1.0215, + "step": 6960 + }, + { + "epoch": 0.5625164144730197, + "grad_norm": 2.4804725646972656, + "learning_rate": 8.433833920195514e-06, + "loss": 0.925, + "step": 6961 + }, + { + "epoch": 0.5625972241863472, + "grad_norm": 2.8121204376220703, + "learning_rate": 8.433358249452848e-06, + "loss": 0.8633, + "step": 6962 + }, + { + "epoch": 0.5626780338996747, + "grad_norm": 2.7459716796875, + "learning_rate": 8.432882519905019e-06, + "loss": 0.9291, + "step": 6963 + }, + { + "epoch": 0.5627588436130023, + "grad_norm": 2.6513407230377197, + "learning_rate": 8.432406731560178e-06, + "loss": 0.9366, + "step": 6964 + }, + { + "epoch": 0.5628396533263298, + "grad_norm": 2.5133769512176514, + "learning_rate": 8.431930884426472e-06, + "loss": 0.9824, + "step": 6965 + }, + { + "epoch": 0.5629204630396574, + "grad_norm": 2.3399899005889893, + "learning_rate": 8.431454978512052e-06, + "loss": 1.0431, + "step": 6966 + }, + { + "epoch": 0.563001272752985, + "grad_norm": 2.996201515197754, + "learning_rate": 8.430979013825069e-06, + "loss": 0.986, + "step": 6967 + }, + { + "epoch": 0.5630820824663124, + "grad_norm": 2.232870578765869, + "learning_rate": 8.430502990373677e-06, + "loss": 1.0036, + "step": 6968 + }, + { + "epoch": 0.56316289217964, + "grad_norm": 3.242262840270996, + "learning_rate": 8.430026908166026e-06, + "loss": 0.9543, + "step": 6969 + }, + { + "epoch": 0.5632437018929676, + "grad_norm": 3.069676399230957, + "learning_rate": 8.429550767210271e-06, + "loss": 0.8638, + "step": 6970 + }, + { + "epoch": 0.563324511606295, + "grad_norm": 2.892313003540039, + "learning_rate": 8.42907456751457e-06, + "loss": 0.9887, + "step": 6971 + }, + { + "epoch": 0.5634053213196226, + "grad_norm": 2.5909368991851807, + "learning_rate": 8.428598309087073e-06, + "loss": 0.9523, + "step": 6972 + }, + { + "epoch": 0.5634861310329502, + "grad_norm": 2.4261319637298584, + "learning_rate": 8.428121991935945e-06, + "loss": 0.9799, + "step": 6973 + }, + { + "epoch": 0.5635669407462777, + "grad_norm": 2.2519493103027344, + "learning_rate": 8.427645616069338e-06, + "loss": 0.9108, + "step": 6974 + }, + { + "epoch": 0.5636477504596052, + "grad_norm": 2.5064189434051514, + "learning_rate": 8.427169181495413e-06, + "loss": 0.9722, + "step": 6975 + }, + { + "epoch": 0.5637285601729328, + "grad_norm": 2.6627674102783203, + "learning_rate": 8.426692688222332e-06, + "loss": 0.9325, + "step": 6976 + }, + { + "epoch": 0.5638093698862603, + "grad_norm": 2.35725474357605, + "learning_rate": 8.426216136258251e-06, + "loss": 1.0678, + "step": 6977 + }, + { + "epoch": 0.5638901795995879, + "grad_norm": 2.640176773071289, + "learning_rate": 8.42573952561134e-06, + "loss": 0.921, + "step": 6978 + }, + { + "epoch": 0.5639709893129154, + "grad_norm": 2.5032410621643066, + "learning_rate": 8.425262856289757e-06, + "loss": 0.8822, + "step": 6979 + }, + { + "epoch": 0.5640517990262429, + "grad_norm": 2.42099928855896, + "learning_rate": 8.424786128301666e-06, + "loss": 0.8589, + "step": 6980 + }, + { + "epoch": 0.5641326087395705, + "grad_norm": 2.7244465351104736, + "learning_rate": 8.424309341655235e-06, + "loss": 0.9118, + "step": 6981 + }, + { + "epoch": 0.5642134184528981, + "grad_norm": 2.6891651153564453, + "learning_rate": 8.423832496358629e-06, + "loss": 0.9213, + "step": 6982 + }, + { + "epoch": 0.5642942281662255, + "grad_norm": 2.518742322921753, + "learning_rate": 8.423355592420014e-06, + "loss": 0.9701, + "step": 6983 + }, + { + "epoch": 0.5643750378795531, + "grad_norm": 2.695110321044922, + "learning_rate": 8.422878629847557e-06, + "loss": 0.9443, + "step": 6984 + }, + { + "epoch": 0.5644558475928807, + "grad_norm": 2.894137144088745, + "learning_rate": 8.422401608649433e-06, + "loss": 0.9273, + "step": 6985 + }, + { + "epoch": 0.5645366573062082, + "grad_norm": 2.9529757499694824, + "learning_rate": 8.421924528833806e-06, + "loss": 0.9318, + "step": 6986 + }, + { + "epoch": 0.5646174670195357, + "grad_norm": 2.6024718284606934, + "learning_rate": 8.421447390408851e-06, + "loss": 0.953, + "step": 6987 + }, + { + "epoch": 0.5646982767328633, + "grad_norm": 2.8305442333221436, + "learning_rate": 8.420970193382739e-06, + "loss": 1.1961, + "step": 6988 + }, + { + "epoch": 0.5647790864461908, + "grad_norm": 2.8597564697265625, + "learning_rate": 8.420492937763642e-06, + "loss": 0.9659, + "step": 6989 + }, + { + "epoch": 0.5648598961595184, + "grad_norm": 2.536705732345581, + "learning_rate": 8.420015623559737e-06, + "loss": 0.9049, + "step": 6990 + }, + { + "epoch": 0.564940705872846, + "grad_norm": 3.3660593032836914, + "learning_rate": 8.419538250779197e-06, + "loss": 0.9746, + "step": 6991 + }, + { + "epoch": 0.5650215155861734, + "grad_norm": 3.0411932468414307, + "learning_rate": 8.419060819430198e-06, + "loss": 0.9653, + "step": 6992 + }, + { + "epoch": 0.565102325299501, + "grad_norm": 2.857156276702881, + "learning_rate": 8.41858332952092e-06, + "loss": 0.9238, + "step": 6993 + }, + { + "epoch": 0.5651831350128286, + "grad_norm": 2.5991122722625732, + "learning_rate": 8.418105781059538e-06, + "loss": 0.9415, + "step": 6994 + }, + { + "epoch": 0.565263944726156, + "grad_norm": 2.8292133808135986, + "learning_rate": 8.417628174054234e-06, + "loss": 0.9808, + "step": 6995 + }, + { + "epoch": 0.5653447544394836, + "grad_norm": 2.5704286098480225, + "learning_rate": 8.417150508513187e-06, + "loss": 0.9807, + "step": 6996 + }, + { + "epoch": 0.5654255641528112, + "grad_norm": 3.0911080837249756, + "learning_rate": 8.416672784444577e-06, + "loss": 1.0539, + "step": 6997 + }, + { + "epoch": 0.5655063738661387, + "grad_norm": 2.783277750015259, + "learning_rate": 8.416195001856588e-06, + "loss": 0.866, + "step": 6998 + }, + { + "epoch": 0.5655871835794662, + "grad_norm": 2.933220624923706, + "learning_rate": 8.415717160757403e-06, + "loss": 0.8734, + "step": 6999 + }, + { + "epoch": 0.5656679932927938, + "grad_norm": 2.308877468109131, + "learning_rate": 8.415239261155206e-06, + "loss": 0.9979, + "step": 7000 + }, + { + "epoch": 0.5656679932927938, + "eval_loss": 0.8000102639198303, + "eval_runtime": 816.7886, + "eval_samples_per_second": 102.066, + "eval_steps_per_second": 12.759, + "step": 7000 + }, + { + "epoch": 0.5657488030061213, + "grad_norm": 2.592982769012451, + "learning_rate": 8.414761303058183e-06, + "loss": 1.0604, + "step": 7001 + }, + { + "epoch": 0.5658296127194489, + "grad_norm": 2.4939510822296143, + "learning_rate": 8.414283286474519e-06, + "loss": 0.9632, + "step": 7002 + }, + { + "epoch": 0.5659104224327764, + "grad_norm": 2.696521043777466, + "learning_rate": 8.413805211412401e-06, + "loss": 1.0048, + "step": 7003 + }, + { + "epoch": 0.5659912321461039, + "grad_norm": 2.826878547668457, + "learning_rate": 8.413327077880021e-06, + "loss": 0.9808, + "step": 7004 + }, + { + "epoch": 0.5660720418594315, + "grad_norm": 2.856379270553589, + "learning_rate": 8.412848885885562e-06, + "loss": 0.9076, + "step": 7005 + }, + { + "epoch": 0.5661528515727591, + "grad_norm": 2.592928647994995, + "learning_rate": 8.41237063543722e-06, + "loss": 0.9142, + "step": 7006 + }, + { + "epoch": 0.5662336612860865, + "grad_norm": 2.515559434890747, + "learning_rate": 8.411892326543181e-06, + "loss": 1.0286, + "step": 7007 + }, + { + "epoch": 0.5663144709994141, + "grad_norm": 2.604215145111084, + "learning_rate": 8.411413959211642e-06, + "loss": 0.9673, + "step": 7008 + }, + { + "epoch": 0.5663952807127417, + "grad_norm": 2.451754093170166, + "learning_rate": 8.410935533450796e-06, + "loss": 1.0003, + "step": 7009 + }, + { + "epoch": 0.5664760904260692, + "grad_norm": 3.207737684249878, + "learning_rate": 8.410457049268834e-06, + "loss": 0.8736, + "step": 7010 + }, + { + "epoch": 0.5665569001393967, + "grad_norm": 2.349968194961548, + "learning_rate": 8.409978506673954e-06, + "loss": 0.8771, + "step": 7011 + }, + { + "epoch": 0.5666377098527243, + "grad_norm": 2.256417751312256, + "learning_rate": 8.409499905674351e-06, + "loss": 1.0773, + "step": 7012 + }, + { + "epoch": 0.5667185195660518, + "grad_norm": 2.558096408843994, + "learning_rate": 8.409021246278222e-06, + "loss": 0.9073, + "step": 7013 + }, + { + "epoch": 0.5667993292793794, + "grad_norm": 2.764721155166626, + "learning_rate": 8.408542528493765e-06, + "loss": 0.9293, + "step": 7014 + }, + { + "epoch": 0.566880138992707, + "grad_norm": 2.7050933837890625, + "learning_rate": 8.408063752329182e-06, + "loss": 0.8146, + "step": 7015 + }, + { + "epoch": 0.5669609487060344, + "grad_norm": 2.8810532093048096, + "learning_rate": 8.407584917792672e-06, + "loss": 0.9299, + "step": 7016 + }, + { + "epoch": 0.567041758419362, + "grad_norm": 2.6436405181884766, + "learning_rate": 8.407106024892436e-06, + "loss": 0.9474, + "step": 7017 + }, + { + "epoch": 0.5671225681326896, + "grad_norm": 2.8703296184539795, + "learning_rate": 8.406627073636674e-06, + "loss": 0.8982, + "step": 7018 + }, + { + "epoch": 0.567203377846017, + "grad_norm": 2.899139642715454, + "learning_rate": 8.406148064033592e-06, + "loss": 1.0491, + "step": 7019 + }, + { + "epoch": 0.5672841875593446, + "grad_norm": 2.6143975257873535, + "learning_rate": 8.405668996091397e-06, + "loss": 0.9591, + "step": 7020 + }, + { + "epoch": 0.5673649972726722, + "grad_norm": 2.3370583057403564, + "learning_rate": 8.405189869818286e-06, + "loss": 1.0002, + "step": 7021 + }, + { + "epoch": 0.5674458069859997, + "grad_norm": 2.314232349395752, + "learning_rate": 8.404710685222473e-06, + "loss": 0.9367, + "step": 7022 + }, + { + "epoch": 0.5675266166993272, + "grad_norm": 3.1822986602783203, + "learning_rate": 8.40423144231216e-06, + "loss": 0.9731, + "step": 7023 + }, + { + "epoch": 0.5676074264126548, + "grad_norm": 2.6218934059143066, + "learning_rate": 8.40375214109556e-06, + "loss": 0.9003, + "step": 7024 + }, + { + "epoch": 0.5676882361259823, + "grad_norm": 2.5922951698303223, + "learning_rate": 8.40327278158088e-06, + "loss": 0.941, + "step": 7025 + }, + { + "epoch": 0.5677690458393099, + "grad_norm": 2.763942241668701, + "learning_rate": 8.402793363776329e-06, + "loss": 1.0936, + "step": 7026 + }, + { + "epoch": 0.5678498555526375, + "grad_norm": 2.2726926803588867, + "learning_rate": 8.402313887690122e-06, + "loss": 1.0425, + "step": 7027 + }, + { + "epoch": 0.5679306652659649, + "grad_norm": 2.314948081970215, + "learning_rate": 8.401834353330466e-06, + "loss": 0.938, + "step": 7028 + }, + { + "epoch": 0.5680114749792925, + "grad_norm": 2.565999984741211, + "learning_rate": 8.401354760705578e-06, + "loss": 1.0071, + "step": 7029 + }, + { + "epoch": 0.5680922846926201, + "grad_norm": 2.7477312088012695, + "learning_rate": 8.400875109823672e-06, + "loss": 0.9649, + "step": 7030 + }, + { + "epoch": 0.5681730944059475, + "grad_norm": 2.6854000091552734, + "learning_rate": 8.400395400692962e-06, + "loss": 0.9304, + "step": 7031 + }, + { + "epoch": 0.5682539041192751, + "grad_norm": 2.1080284118652344, + "learning_rate": 8.399915633321664e-06, + "loss": 0.9809, + "step": 7032 + }, + { + "epoch": 0.5683347138326027, + "grad_norm": 2.6586344242095947, + "learning_rate": 8.399435807717998e-06, + "loss": 1.17, + "step": 7033 + }, + { + "epoch": 0.5684155235459302, + "grad_norm": 2.4868946075439453, + "learning_rate": 8.39895592389018e-06, + "loss": 0.8837, + "step": 7034 + }, + { + "epoch": 0.5684963332592577, + "grad_norm": 2.6929616928100586, + "learning_rate": 8.398475981846431e-06, + "loss": 0.9189, + "step": 7035 + }, + { + "epoch": 0.5685771429725853, + "grad_norm": 2.7626705169677734, + "learning_rate": 8.397995981594966e-06, + "loss": 0.9134, + "step": 7036 + }, + { + "epoch": 0.5686579526859128, + "grad_norm": 2.5226573944091797, + "learning_rate": 8.397515923144012e-06, + "loss": 1.019, + "step": 7037 + }, + { + "epoch": 0.5687387623992404, + "grad_norm": 3.266282558441162, + "learning_rate": 8.397035806501792e-06, + "loss": 0.9652, + "step": 7038 + }, + { + "epoch": 0.568819572112568, + "grad_norm": 2.445091724395752, + "learning_rate": 8.396555631676523e-06, + "loss": 0.9725, + "step": 7039 + }, + { + "epoch": 0.5689003818258954, + "grad_norm": 2.4900972843170166, + "learning_rate": 8.396075398676435e-06, + "loss": 0.9285, + "step": 7040 + }, + { + "epoch": 0.568981191539223, + "grad_norm": 2.3779239654541016, + "learning_rate": 8.395595107509751e-06, + "loss": 0.9441, + "step": 7041 + }, + { + "epoch": 0.5690620012525506, + "grad_norm": 2.8242993354797363, + "learning_rate": 8.395114758184697e-06, + "loss": 1.0378, + "step": 7042 + }, + { + "epoch": 0.569142810965878, + "grad_norm": 2.4411308765411377, + "learning_rate": 8.394634350709501e-06, + "loss": 0.8461, + "step": 7043 + }, + { + "epoch": 0.5692236206792056, + "grad_norm": 2.3372957706451416, + "learning_rate": 8.39415388509239e-06, + "loss": 0.9613, + "step": 7044 + }, + { + "epoch": 0.5693044303925332, + "grad_norm": 2.3165688514709473, + "learning_rate": 8.393673361341594e-06, + "loss": 0.9957, + "step": 7045 + }, + { + "epoch": 0.5693852401058608, + "grad_norm": 2.674180030822754, + "learning_rate": 8.393192779465345e-06, + "loss": 0.9776, + "step": 7046 + }, + { + "epoch": 0.5694660498191882, + "grad_norm": 3.1991283893585205, + "learning_rate": 8.39271213947187e-06, + "loss": 0.8736, + "step": 7047 + }, + { + "epoch": 0.5695468595325158, + "grad_norm": 2.841813802719116, + "learning_rate": 8.392231441369405e-06, + "loss": 0.9953, + "step": 7048 + }, + { + "epoch": 0.5696276692458434, + "grad_norm": 3.1734557151794434, + "learning_rate": 8.391750685166182e-06, + "loss": 0.9715, + "step": 7049 + }, + { + "epoch": 0.5697084789591709, + "grad_norm": 2.545691967010498, + "learning_rate": 8.391269870870435e-06, + "loss": 1.0736, + "step": 7050 + }, + { + "epoch": 0.5697892886724985, + "grad_norm": 2.592190742492676, + "learning_rate": 8.3907889984904e-06, + "loss": 0.9419, + "step": 7051 + }, + { + "epoch": 0.569870098385826, + "grad_norm": 2.8696320056915283, + "learning_rate": 8.390308068034312e-06, + "loss": 0.9299, + "step": 7052 + }, + { + "epoch": 0.5699509080991535, + "grad_norm": 2.8169021606445312, + "learning_rate": 8.389827079510406e-06, + "loss": 1.1305, + "step": 7053 + }, + { + "epoch": 0.5700317178124811, + "grad_norm": 2.3149781227111816, + "learning_rate": 8.389346032926926e-06, + "loss": 0.9445, + "step": 7054 + }, + { + "epoch": 0.5701125275258087, + "grad_norm": 2.4093823432922363, + "learning_rate": 8.388864928292106e-06, + "loss": 1.1357, + "step": 7055 + }, + { + "epoch": 0.5701933372391361, + "grad_norm": 2.9673547744750977, + "learning_rate": 8.38838376561419e-06, + "loss": 0.8642, + "step": 7056 + }, + { + "epoch": 0.5702741469524637, + "grad_norm": 2.409287452697754, + "learning_rate": 8.387902544901416e-06, + "loss": 0.9395, + "step": 7057 + }, + { + "epoch": 0.5703549566657913, + "grad_norm": 2.6744260787963867, + "learning_rate": 8.387421266162027e-06, + "loss": 1.0365, + "step": 7058 + }, + { + "epoch": 0.5704357663791187, + "grad_norm": 2.8118560314178467, + "learning_rate": 8.386939929404268e-06, + "loss": 0.8687, + "step": 7059 + }, + { + "epoch": 0.5705165760924463, + "grad_norm": 2.3100764751434326, + "learning_rate": 8.386458534636382e-06, + "loss": 1.105, + "step": 7060 + }, + { + "epoch": 0.5705973858057739, + "grad_norm": 2.456096887588501, + "learning_rate": 8.385977081866611e-06, + "loss": 0.9708, + "step": 7061 + }, + { + "epoch": 0.5706781955191014, + "grad_norm": 2.628122091293335, + "learning_rate": 8.385495571103209e-06, + "loss": 1.0057, + "step": 7062 + }, + { + "epoch": 0.570759005232429, + "grad_norm": 3.010484457015991, + "learning_rate": 8.385014002354415e-06, + "loss": 0.9209, + "step": 7063 + }, + { + "epoch": 0.5708398149457565, + "grad_norm": 3.124950647354126, + "learning_rate": 8.384532375628478e-06, + "loss": 0.9172, + "step": 7064 + }, + { + "epoch": 0.570920624659084, + "grad_norm": 4.180485248565674, + "learning_rate": 8.384050690933653e-06, + "loss": 0.9709, + "step": 7065 + }, + { + "epoch": 0.5710014343724116, + "grad_norm": 2.5992653369903564, + "learning_rate": 8.383568948278185e-06, + "loss": 1.1398, + "step": 7066 + }, + { + "epoch": 0.5710822440857392, + "grad_norm": 2.78249454498291, + "learning_rate": 8.383087147670325e-06, + "loss": 0.9614, + "step": 7067 + }, + { + "epoch": 0.5711630537990666, + "grad_norm": 2.8518197536468506, + "learning_rate": 8.38260528911833e-06, + "loss": 0.9481, + "step": 7068 + }, + { + "epoch": 0.5712438635123942, + "grad_norm": 2.5273258686065674, + "learning_rate": 8.382123372630448e-06, + "loss": 0.8787, + "step": 7069 + }, + { + "epoch": 0.5713246732257218, + "grad_norm": 2.849349021911621, + "learning_rate": 8.381641398214935e-06, + "loss": 0.9052, + "step": 7070 + }, + { + "epoch": 0.5714054829390492, + "grad_norm": 2.7267680168151855, + "learning_rate": 8.381159365880045e-06, + "loss": 1.0531, + "step": 7071 + }, + { + "epoch": 0.5714862926523768, + "grad_norm": 3.0048511028289795, + "learning_rate": 8.380677275634035e-06, + "loss": 1.0393, + "step": 7072 + }, + { + "epoch": 0.5715671023657044, + "grad_norm": 2.633543014526367, + "learning_rate": 8.380195127485161e-06, + "loss": 1.0095, + "step": 7073 + }, + { + "epoch": 0.5716479120790319, + "grad_norm": 2.8810977935791016, + "learning_rate": 8.379712921441685e-06, + "loss": 0.781, + "step": 7074 + }, + { + "epoch": 0.5717287217923595, + "grad_norm": 2.575315475463867, + "learning_rate": 8.379230657511861e-06, + "loss": 1.0401, + "step": 7075 + }, + { + "epoch": 0.571809531505687, + "grad_norm": 2.9347527027130127, + "learning_rate": 8.378748335703953e-06, + "loss": 0.9515, + "step": 7076 + }, + { + "epoch": 0.5718903412190145, + "grad_norm": 2.5889923572540283, + "learning_rate": 8.378265956026216e-06, + "loss": 0.8905, + "step": 7077 + }, + { + "epoch": 0.5719711509323421, + "grad_norm": 2.9868171215057373, + "learning_rate": 8.377783518486919e-06, + "loss": 0.9832, + "step": 7078 + }, + { + "epoch": 0.5720519606456697, + "grad_norm": 2.6267364025115967, + "learning_rate": 8.377301023094322e-06, + "loss": 0.9181, + "step": 7079 + }, + { + "epoch": 0.5721327703589971, + "grad_norm": 2.6298704147338867, + "learning_rate": 8.376818469856687e-06, + "loss": 0.9886, + "step": 7080 + }, + { + "epoch": 0.5722135800723247, + "grad_norm": 2.3796985149383545, + "learning_rate": 8.376335858782282e-06, + "loss": 0.9637, + "step": 7081 + }, + { + "epoch": 0.5722943897856523, + "grad_norm": 2.550248861312866, + "learning_rate": 8.375853189879373e-06, + "loss": 0.9606, + "step": 7082 + }, + { + "epoch": 0.5723751994989797, + "grad_norm": 3.1329081058502197, + "learning_rate": 8.375370463156225e-06, + "loss": 1.0353, + "step": 7083 + }, + { + "epoch": 0.5724560092123073, + "grad_norm": 2.2887074947357178, + "learning_rate": 8.374887678621106e-06, + "loss": 0.964, + "step": 7084 + }, + { + "epoch": 0.5725368189256349, + "grad_norm": 2.3649778366088867, + "learning_rate": 8.374404836282288e-06, + "loss": 0.9174, + "step": 7085 + }, + { + "epoch": 0.5726176286389624, + "grad_norm": 2.7177512645721436, + "learning_rate": 8.373921936148037e-06, + "loss": 0.9458, + "step": 7086 + }, + { + "epoch": 0.57269843835229, + "grad_norm": 2.5574512481689453, + "learning_rate": 8.373438978226627e-06, + "loss": 1.0039, + "step": 7087 + }, + { + "epoch": 0.5727792480656175, + "grad_norm": 2.699385166168213, + "learning_rate": 8.372955962526326e-06, + "loss": 0.9806, + "step": 7088 + }, + { + "epoch": 0.572860057778945, + "grad_norm": 2.452263355255127, + "learning_rate": 8.372472889055412e-06, + "loss": 0.9901, + "step": 7089 + }, + { + "epoch": 0.5729408674922726, + "grad_norm": 2.6215109825134277, + "learning_rate": 8.371989757822154e-06, + "loss": 0.934, + "step": 7090 + }, + { + "epoch": 0.5730216772056002, + "grad_norm": 2.643709182739258, + "learning_rate": 8.371506568834831e-06, + "loss": 0.933, + "step": 7091 + }, + { + "epoch": 0.5731024869189276, + "grad_norm": 3.367661952972412, + "learning_rate": 8.371023322101716e-06, + "loss": 1.0407, + "step": 7092 + }, + { + "epoch": 0.5731832966322552, + "grad_norm": 2.4877586364746094, + "learning_rate": 8.370540017631087e-06, + "loss": 1.1873, + "step": 7093 + }, + { + "epoch": 0.5732641063455828, + "grad_norm": 2.6113100051879883, + "learning_rate": 8.370056655431224e-06, + "loss": 0.9028, + "step": 7094 + }, + { + "epoch": 0.5733449160589102, + "grad_norm": 2.850865364074707, + "learning_rate": 8.3695732355104e-06, + "loss": 0.9281, + "step": 7095 + }, + { + "epoch": 0.5734257257722378, + "grad_norm": 2.618288993835449, + "learning_rate": 8.369089757876901e-06, + "loss": 0.846, + "step": 7096 + }, + { + "epoch": 0.5735065354855654, + "grad_norm": 2.4725635051727295, + "learning_rate": 8.368606222539004e-06, + "loss": 1.0211, + "step": 7097 + }, + { + "epoch": 0.5735873451988929, + "grad_norm": 2.657615900039673, + "learning_rate": 8.368122629504994e-06, + "loss": 0.9606, + "step": 7098 + }, + { + "epoch": 0.5736681549122205, + "grad_norm": 2.788099527359009, + "learning_rate": 8.367638978783149e-06, + "loss": 1.0066, + "step": 7099 + }, + { + "epoch": 0.573748964625548, + "grad_norm": 2.808821201324463, + "learning_rate": 8.367155270381757e-06, + "loss": 0.962, + "step": 7100 + }, + { + "epoch": 0.5738297743388755, + "grad_norm": 2.753215789794922, + "learning_rate": 8.3666715043091e-06, + "loss": 0.8577, + "step": 7101 + }, + { + "epoch": 0.5739105840522031, + "grad_norm": 2.542543649673462, + "learning_rate": 8.366187680573466e-06, + "loss": 1.0162, + "step": 7102 + }, + { + "epoch": 0.5739913937655307, + "grad_norm": 2.808396816253662, + "learning_rate": 8.365703799183141e-06, + "loss": 1.0071, + "step": 7103 + }, + { + "epoch": 0.5740722034788581, + "grad_norm": 2.6289422512054443, + "learning_rate": 8.365219860146413e-06, + "loss": 0.9115, + "step": 7104 + }, + { + "epoch": 0.5741530131921857, + "grad_norm": 2.6151201725006104, + "learning_rate": 8.364735863471569e-06, + "loss": 0.9664, + "step": 7105 + }, + { + "epoch": 0.5742338229055133, + "grad_norm": 2.8494882583618164, + "learning_rate": 8.364251809166902e-06, + "loss": 0.9145, + "step": 7106 + }, + { + "epoch": 0.5743146326188407, + "grad_norm": 2.6571810245513916, + "learning_rate": 8.363767697240697e-06, + "loss": 0.8728, + "step": 7107 + }, + { + "epoch": 0.5743954423321683, + "grad_norm": 2.619377851486206, + "learning_rate": 8.363283527701252e-06, + "loss": 0.8994, + "step": 7108 + }, + { + "epoch": 0.5744762520454959, + "grad_norm": 2.767608642578125, + "learning_rate": 8.362799300556856e-06, + "loss": 0.8547, + "step": 7109 + }, + { + "epoch": 0.5745570617588234, + "grad_norm": 2.7516744136810303, + "learning_rate": 8.362315015815805e-06, + "loss": 0.9478, + "step": 7110 + }, + { + "epoch": 0.574637871472151, + "grad_norm": 2.5909171104431152, + "learning_rate": 8.36183067348639e-06, + "loss": 0.912, + "step": 7111 + }, + { + "epoch": 0.5747186811854785, + "grad_norm": 2.6638970375061035, + "learning_rate": 8.36134627357691e-06, + "loss": 0.9576, + "step": 7112 + }, + { + "epoch": 0.574799490898806, + "grad_norm": 2.5319557189941406, + "learning_rate": 8.360861816095662e-06, + "loss": 0.9856, + "step": 7113 + }, + { + "epoch": 0.5748803006121336, + "grad_norm": 2.6411960124969482, + "learning_rate": 8.36037730105094e-06, + "loss": 0.8588, + "step": 7114 + }, + { + "epoch": 0.5749611103254612, + "grad_norm": 2.6225197315216064, + "learning_rate": 8.359892728451044e-06, + "loss": 1.0094, + "step": 7115 + }, + { + "epoch": 0.5750419200387886, + "grad_norm": 2.844329357147217, + "learning_rate": 8.359408098304276e-06, + "loss": 0.8998, + "step": 7116 + }, + { + "epoch": 0.5751227297521162, + "grad_norm": 2.257103204727173, + "learning_rate": 8.358923410618933e-06, + "loss": 1.0439, + "step": 7117 + }, + { + "epoch": 0.5752035394654438, + "grad_norm": 2.5816025733947754, + "learning_rate": 8.358438665403318e-06, + "loss": 0.9783, + "step": 7118 + }, + { + "epoch": 0.5752843491787712, + "grad_norm": 2.5600202083587646, + "learning_rate": 8.357953862665738e-06, + "loss": 0.8909, + "step": 7119 + }, + { + "epoch": 0.5753651588920988, + "grad_norm": 2.647162675857544, + "learning_rate": 8.357469002414487e-06, + "loss": 0.8291, + "step": 7120 + }, + { + "epoch": 0.5754459686054264, + "grad_norm": 2.674480676651001, + "learning_rate": 8.356984084657878e-06, + "loss": 1.0018, + "step": 7121 + }, + { + "epoch": 0.5755267783187539, + "grad_norm": 2.764030694961548, + "learning_rate": 8.356499109404213e-06, + "loss": 0.9144, + "step": 7122 + }, + { + "epoch": 0.5756075880320815, + "grad_norm": 2.7310590744018555, + "learning_rate": 8.356014076661797e-06, + "loss": 0.9947, + "step": 7123 + }, + { + "epoch": 0.575688397745409, + "grad_norm": 2.3526554107666016, + "learning_rate": 8.35552898643894e-06, + "loss": 0.9841, + "step": 7124 + }, + { + "epoch": 0.5757692074587365, + "grad_norm": 2.3746774196624756, + "learning_rate": 8.35504383874395e-06, + "loss": 0.9476, + "step": 7125 + }, + { + "epoch": 0.5758500171720641, + "grad_norm": 2.7105541229248047, + "learning_rate": 8.354558633585135e-06, + "loss": 0.9622, + "step": 7126 + }, + { + "epoch": 0.5759308268853917, + "grad_norm": 2.2798879146575928, + "learning_rate": 8.354073370970808e-06, + "loss": 0.9252, + "step": 7127 + }, + { + "epoch": 0.5760116365987191, + "grad_norm": 2.9079911708831787, + "learning_rate": 8.353588050909278e-06, + "loss": 0.9173, + "step": 7128 + }, + { + "epoch": 0.5760924463120467, + "grad_norm": 2.9166505336761475, + "learning_rate": 8.353102673408857e-06, + "loss": 0.9159, + "step": 7129 + }, + { + "epoch": 0.5761732560253743, + "grad_norm": 2.9057743549346924, + "learning_rate": 8.35261723847786e-06, + "loss": 0.9977, + "step": 7130 + }, + { + "epoch": 0.5762540657387017, + "grad_norm": 2.689920425415039, + "learning_rate": 8.352131746124602e-06, + "loss": 0.9428, + "step": 7131 + }, + { + "epoch": 0.5763348754520293, + "grad_norm": 2.1676008701324463, + "learning_rate": 8.351646196357396e-06, + "loss": 0.9955, + "step": 7132 + }, + { + "epoch": 0.5764156851653569, + "grad_norm": 2.628889322280884, + "learning_rate": 8.35116058918456e-06, + "loss": 0.9724, + "step": 7133 + }, + { + "epoch": 0.5764964948786844, + "grad_norm": 2.7892470359802246, + "learning_rate": 8.350674924614411e-06, + "loss": 0.8356, + "step": 7134 + }, + { + "epoch": 0.576577304592012, + "grad_norm": 2.6595675945281982, + "learning_rate": 8.350189202655265e-06, + "loss": 0.8782, + "step": 7135 + }, + { + "epoch": 0.5766581143053395, + "grad_norm": 3.222132682800293, + "learning_rate": 8.349703423315446e-06, + "loss": 1.0538, + "step": 7136 + }, + { + "epoch": 0.576738924018667, + "grad_norm": 2.904111623764038, + "learning_rate": 8.34921758660327e-06, + "loss": 1.0349, + "step": 7137 + }, + { + "epoch": 0.5768197337319946, + "grad_norm": 2.999847173690796, + "learning_rate": 8.348731692527058e-06, + "loss": 0.9379, + "step": 7138 + }, + { + "epoch": 0.5769005434453222, + "grad_norm": 3.2096683979034424, + "learning_rate": 8.348245741095139e-06, + "loss": 0.9267, + "step": 7139 + }, + { + "epoch": 0.5769813531586496, + "grad_norm": 2.9243977069854736, + "learning_rate": 8.347759732315826e-06, + "loss": 0.9898, + "step": 7140 + }, + { + "epoch": 0.5770621628719772, + "grad_norm": 2.47684907913208, + "learning_rate": 8.347273666197449e-06, + "loss": 1.0357, + "step": 7141 + }, + { + "epoch": 0.5771429725853048, + "grad_norm": 2.582578659057617, + "learning_rate": 8.346787542748333e-06, + "loss": 0.8993, + "step": 7142 + }, + { + "epoch": 0.5772237822986322, + "grad_norm": 3.000941753387451, + "learning_rate": 8.346301361976804e-06, + "loss": 0.8562, + "step": 7143 + }, + { + "epoch": 0.5773045920119598, + "grad_norm": 2.593186616897583, + "learning_rate": 8.345815123891188e-06, + "loss": 1.0636, + "step": 7144 + }, + { + "epoch": 0.5773854017252874, + "grad_norm": 2.888679027557373, + "learning_rate": 8.345328828499813e-06, + "loss": 0.9278, + "step": 7145 + }, + { + "epoch": 0.5774662114386149, + "grad_norm": 2.4658422470092773, + "learning_rate": 8.34484247581101e-06, + "loss": 0.9563, + "step": 7146 + }, + { + "epoch": 0.5775470211519425, + "grad_norm": 3.0275471210479736, + "learning_rate": 8.344356065833107e-06, + "loss": 0.9294, + "step": 7147 + }, + { + "epoch": 0.57762783086527, + "grad_norm": 2.5276520252227783, + "learning_rate": 8.343869598574436e-06, + "loss": 0.9542, + "step": 7148 + }, + { + "epoch": 0.5777086405785975, + "grad_norm": 2.551970958709717, + "learning_rate": 8.34338307404333e-06, + "loss": 1.0405, + "step": 7149 + }, + { + "epoch": 0.5777894502919251, + "grad_norm": 3.1526784896850586, + "learning_rate": 8.34289649224812e-06, + "loss": 0.9271, + "step": 7150 + }, + { + "epoch": 0.5778702600052527, + "grad_norm": 2.6485044956207275, + "learning_rate": 8.34240985319714e-06, + "loss": 0.9898, + "step": 7151 + }, + { + "epoch": 0.5779510697185801, + "grad_norm": 2.561781167984009, + "learning_rate": 8.341923156898725e-06, + "loss": 0.9021, + "step": 7152 + }, + { + "epoch": 0.5780318794319077, + "grad_norm": 2.259307861328125, + "learning_rate": 8.341436403361214e-06, + "loss": 0.951, + "step": 7153 + }, + { + "epoch": 0.5781126891452353, + "grad_norm": 2.5447819232940674, + "learning_rate": 8.34094959259294e-06, + "loss": 0.9541, + "step": 7154 + }, + { + "epoch": 0.5781934988585627, + "grad_norm": 2.4141488075256348, + "learning_rate": 8.340462724602243e-06, + "loss": 0.9795, + "step": 7155 + }, + { + "epoch": 0.5782743085718903, + "grad_norm": 3.415891647338867, + "learning_rate": 8.339975799397462e-06, + "loss": 1.0438, + "step": 7156 + }, + { + "epoch": 0.5783551182852179, + "grad_norm": 2.540407657623291, + "learning_rate": 8.339488816986934e-06, + "loss": 1.0681, + "step": 7157 + }, + { + "epoch": 0.5784359279985454, + "grad_norm": 2.658569812774658, + "learning_rate": 8.339001777379004e-06, + "loss": 0.9345, + "step": 7158 + }, + { + "epoch": 0.578516737711873, + "grad_norm": 2.5978288650512695, + "learning_rate": 8.338514680582011e-06, + "loss": 0.9114, + "step": 7159 + }, + { + "epoch": 0.5785975474252005, + "grad_norm": 2.3763232231140137, + "learning_rate": 8.3380275266043e-06, + "loss": 1.045, + "step": 7160 + }, + { + "epoch": 0.578678357138528, + "grad_norm": 2.8415863513946533, + "learning_rate": 8.337540315454213e-06, + "loss": 0.9316, + "step": 7161 + }, + { + "epoch": 0.5787591668518556, + "grad_norm": 2.7142231464385986, + "learning_rate": 8.337053047140094e-06, + "loss": 0.9099, + "step": 7162 + }, + { + "epoch": 0.5788399765651832, + "grad_norm": 2.7461740970611572, + "learning_rate": 8.33656572167029e-06, + "loss": 1.0128, + "step": 7163 + }, + { + "epoch": 0.5789207862785106, + "grad_norm": 3.922994375228882, + "learning_rate": 8.33607833905315e-06, + "loss": 1.022, + "step": 7164 + }, + { + "epoch": 0.5790015959918382, + "grad_norm": 2.5308823585510254, + "learning_rate": 8.335590899297018e-06, + "loss": 0.9988, + "step": 7165 + }, + { + "epoch": 0.5790824057051658, + "grad_norm": 2.4008872509002686, + "learning_rate": 8.335103402410243e-06, + "loss": 1.058, + "step": 7166 + }, + { + "epoch": 0.5791632154184932, + "grad_norm": 2.4188337326049805, + "learning_rate": 8.334615848401176e-06, + "loss": 0.9581, + "step": 7167 + }, + { + "epoch": 0.5792440251318208, + "grad_norm": 2.4827866554260254, + "learning_rate": 8.334128237278168e-06, + "loss": 0.924, + "step": 7168 + }, + { + "epoch": 0.5793248348451484, + "grad_norm": 2.613064765930176, + "learning_rate": 8.333640569049569e-06, + "loss": 0.9715, + "step": 7169 + }, + { + "epoch": 0.5794056445584759, + "grad_norm": 2.6098952293395996, + "learning_rate": 8.333152843723732e-06, + "loss": 0.9353, + "step": 7170 + }, + { + "epoch": 0.5794864542718035, + "grad_norm": 3.0971457958221436, + "learning_rate": 8.332665061309014e-06, + "loss": 0.9005, + "step": 7171 + }, + { + "epoch": 0.579567263985131, + "grad_norm": 2.992938756942749, + "learning_rate": 8.332177221813765e-06, + "loss": 1.087, + "step": 7172 + }, + { + "epoch": 0.5796480736984586, + "grad_norm": 2.5264017581939697, + "learning_rate": 8.331689325246339e-06, + "loss": 0.9867, + "step": 7173 + }, + { + "epoch": 0.5797288834117861, + "grad_norm": 2.314453363418579, + "learning_rate": 8.3312013716151e-06, + "loss": 0.9367, + "step": 7174 + }, + { + "epoch": 0.5798096931251137, + "grad_norm": 2.806817054748535, + "learning_rate": 8.330713360928398e-06, + "loss": 0.9998, + "step": 7175 + }, + { + "epoch": 0.5798905028384412, + "grad_norm": 2.609281063079834, + "learning_rate": 8.330225293194595e-06, + "loss": 1.1114, + "step": 7176 + }, + { + "epoch": 0.5799713125517687, + "grad_norm": 2.8653433322906494, + "learning_rate": 8.329737168422051e-06, + "loss": 0.9069, + "step": 7177 + }, + { + "epoch": 0.5800521222650963, + "grad_norm": 2.562836170196533, + "learning_rate": 8.329248986619126e-06, + "loss": 0.9759, + "step": 7178 + }, + { + "epoch": 0.5801329319784239, + "grad_norm": 2.6038596630096436, + "learning_rate": 8.328760747794179e-06, + "loss": 0.842, + "step": 7179 + }, + { + "epoch": 0.5802137416917513, + "grad_norm": 2.6815073490142822, + "learning_rate": 8.328272451955574e-06, + "loss": 0.9192, + "step": 7180 + }, + { + "epoch": 0.5802945514050789, + "grad_norm": 2.736222267150879, + "learning_rate": 8.327784099111676e-06, + "loss": 1.022, + "step": 7181 + }, + { + "epoch": 0.5803753611184065, + "grad_norm": 2.6508312225341797, + "learning_rate": 8.327295689270847e-06, + "loss": 1.0588, + "step": 7182 + }, + { + "epoch": 0.580456170831734, + "grad_norm": 2.9490833282470703, + "learning_rate": 8.326807222441454e-06, + "loss": 0.8545, + "step": 7183 + }, + { + "epoch": 0.5805369805450615, + "grad_norm": 2.8956761360168457, + "learning_rate": 8.32631869863186e-06, + "loss": 0.9728, + "step": 7184 + }, + { + "epoch": 0.5806177902583891, + "grad_norm": 2.576817512512207, + "learning_rate": 8.325830117850434e-06, + "loss": 0.9309, + "step": 7185 + }, + { + "epoch": 0.5806985999717166, + "grad_norm": 2.527831554412842, + "learning_rate": 8.325341480105547e-06, + "loss": 0.9836, + "step": 7186 + }, + { + "epoch": 0.5807794096850442, + "grad_norm": 2.846911668777466, + "learning_rate": 8.324852785405565e-06, + "loss": 0.9205, + "step": 7187 + }, + { + "epoch": 0.5808602193983717, + "grad_norm": 2.2964630126953125, + "learning_rate": 8.32436403375886e-06, + "loss": 1.0159, + "step": 7188 + }, + { + "epoch": 0.5809410291116992, + "grad_norm": 2.5868728160858154, + "learning_rate": 8.3238752251738e-06, + "loss": 0.9503, + "step": 7189 + }, + { + "epoch": 0.5810218388250268, + "grad_norm": 2.3481106758117676, + "learning_rate": 8.32338635965876e-06, + "loss": 0.8758, + "step": 7190 + }, + { + "epoch": 0.5811026485383544, + "grad_norm": 3.0495853424072266, + "learning_rate": 8.322897437222115e-06, + "loss": 0.9098, + "step": 7191 + }, + { + "epoch": 0.5811834582516818, + "grad_norm": 2.799420118331909, + "learning_rate": 8.322408457872234e-06, + "loss": 0.9523, + "step": 7192 + }, + { + "epoch": 0.5812642679650094, + "grad_norm": 2.3985579013824463, + "learning_rate": 8.321919421617495e-06, + "loss": 0.9581, + "step": 7193 + }, + { + "epoch": 0.581345077678337, + "grad_norm": 3.2880301475524902, + "learning_rate": 8.321430328466273e-06, + "loss": 0.9008, + "step": 7194 + }, + { + "epoch": 0.5814258873916645, + "grad_norm": 2.5351295471191406, + "learning_rate": 8.320941178426946e-06, + "loss": 0.921, + "step": 7195 + }, + { + "epoch": 0.581506697104992, + "grad_norm": 2.904498815536499, + "learning_rate": 8.320451971507892e-06, + "loss": 0.9153, + "step": 7196 + }, + { + "epoch": 0.5815875068183196, + "grad_norm": 3.0620815753936768, + "learning_rate": 8.319962707717489e-06, + "loss": 1.0211, + "step": 7197 + }, + { + "epoch": 0.5816683165316471, + "grad_norm": 2.6478400230407715, + "learning_rate": 8.319473387064116e-06, + "loss": 0.9977, + "step": 7198 + }, + { + "epoch": 0.5817491262449747, + "grad_norm": 2.799684524536133, + "learning_rate": 8.318984009556157e-06, + "loss": 0.8999, + "step": 7199 + }, + { + "epoch": 0.5818299359583022, + "grad_norm": 2.7371294498443604, + "learning_rate": 8.31849457520199e-06, + "loss": 0.9103, + "step": 7200 + }, + { + "epoch": 0.5819107456716297, + "grad_norm": 2.4936740398406982, + "learning_rate": 8.318005084010001e-06, + "loss": 0.9903, + "step": 7201 + }, + { + "epoch": 0.5819915553849573, + "grad_norm": 2.1372649669647217, + "learning_rate": 8.317515535988574e-06, + "loss": 0.9715, + "step": 7202 + }, + { + "epoch": 0.5820723650982849, + "grad_norm": 2.432664155960083, + "learning_rate": 8.31702593114609e-06, + "loss": 1.106, + "step": 7203 + }, + { + "epoch": 0.5821531748116123, + "grad_norm": 2.3609468936920166, + "learning_rate": 8.31653626949094e-06, + "loss": 1.1036, + "step": 7204 + }, + { + "epoch": 0.5822339845249399, + "grad_norm": 2.9737699031829834, + "learning_rate": 8.316046551031506e-06, + "loss": 0.8853, + "step": 7205 + }, + { + "epoch": 0.5823147942382675, + "grad_norm": 2.6651222705841064, + "learning_rate": 8.315556775776179e-06, + "loss": 0.9186, + "step": 7206 + }, + { + "epoch": 0.582395603951595, + "grad_norm": 2.665828227996826, + "learning_rate": 8.315066943733344e-06, + "loss": 0.8584, + "step": 7207 + }, + { + "epoch": 0.5824764136649225, + "grad_norm": 2.7233054637908936, + "learning_rate": 8.314577054911395e-06, + "loss": 0.987, + "step": 7208 + }, + { + "epoch": 0.5825572233782501, + "grad_norm": 2.7811362743377686, + "learning_rate": 8.31408710931872e-06, + "loss": 1.026, + "step": 7209 + }, + { + "epoch": 0.5826380330915776, + "grad_norm": 3.068962812423706, + "learning_rate": 8.313597106963712e-06, + "loss": 0.9771, + "step": 7210 + }, + { + "epoch": 0.5827188428049052, + "grad_norm": 2.772899866104126, + "learning_rate": 8.31310704785476e-06, + "loss": 1.0116, + "step": 7211 + }, + { + "epoch": 0.5827996525182327, + "grad_norm": 3.039334297180176, + "learning_rate": 8.312616932000262e-06, + "loss": 0.9946, + "step": 7212 + }, + { + "epoch": 0.5828804622315602, + "grad_norm": 2.5728089809417725, + "learning_rate": 8.312126759408613e-06, + "loss": 0.9242, + "step": 7213 + }, + { + "epoch": 0.5829612719448878, + "grad_norm": 2.7099645137786865, + "learning_rate": 8.311636530088203e-06, + "loss": 0.9262, + "step": 7214 + }, + { + "epoch": 0.5830420816582154, + "grad_norm": 2.4517130851745605, + "learning_rate": 8.311146244047433e-06, + "loss": 0.9646, + "step": 7215 + }, + { + "epoch": 0.5831228913715428, + "grad_norm": 2.8632915019989014, + "learning_rate": 8.310655901294698e-06, + "loss": 0.9943, + "step": 7216 + }, + { + "epoch": 0.5832037010848704, + "grad_norm": 2.633495569229126, + "learning_rate": 8.310165501838398e-06, + "loss": 1.0139, + "step": 7217 + }, + { + "epoch": 0.583284510798198, + "grad_norm": 2.336775302886963, + "learning_rate": 8.309675045686932e-06, + "loss": 0.937, + "step": 7218 + }, + { + "epoch": 0.5833653205115255, + "grad_norm": 2.640777826309204, + "learning_rate": 8.3091845328487e-06, + "loss": 0.9503, + "step": 7219 + }, + { + "epoch": 0.583446130224853, + "grad_norm": 2.6599156856536865, + "learning_rate": 8.308693963332104e-06, + "loss": 1.0876, + "step": 7220 + }, + { + "epoch": 0.5835269399381806, + "grad_norm": 2.600266933441162, + "learning_rate": 8.308203337145547e-06, + "loss": 0.897, + "step": 7221 + }, + { + "epoch": 0.5836077496515081, + "grad_norm": 2.3513786792755127, + "learning_rate": 8.307712654297428e-06, + "loss": 0.934, + "step": 7222 + }, + { + "epoch": 0.5836885593648357, + "grad_norm": 2.633467674255371, + "learning_rate": 8.307221914796155e-06, + "loss": 0.9819, + "step": 7223 + }, + { + "epoch": 0.5837693690781632, + "grad_norm": 2.2107534408569336, + "learning_rate": 8.306731118650135e-06, + "loss": 1.0194, + "step": 7224 + }, + { + "epoch": 0.5838501787914907, + "grad_norm": 2.882359504699707, + "learning_rate": 8.306240265867768e-06, + "loss": 0.9784, + "step": 7225 + }, + { + "epoch": 0.5839309885048183, + "grad_norm": 2.542127847671509, + "learning_rate": 8.305749356457468e-06, + "loss": 0.8831, + "step": 7226 + }, + { + "epoch": 0.5840117982181459, + "grad_norm": 2.408803939819336, + "learning_rate": 8.305258390427638e-06, + "loss": 0.9947, + "step": 7227 + }, + { + "epoch": 0.5840926079314733, + "grad_norm": 2.930621385574341, + "learning_rate": 8.30476736778669e-06, + "loss": 0.9591, + "step": 7228 + }, + { + "epoch": 0.5841734176448009, + "grad_norm": 3.039104461669922, + "learning_rate": 8.304276288543031e-06, + "loss": 0.9962, + "step": 7229 + }, + { + "epoch": 0.5842542273581285, + "grad_norm": 2.72379469871521, + "learning_rate": 8.303785152705076e-06, + "loss": 0.8664, + "step": 7230 + }, + { + "epoch": 0.584335037071456, + "grad_norm": 2.2838335037231445, + "learning_rate": 8.303293960281233e-06, + "loss": 1.021, + "step": 7231 + }, + { + "epoch": 0.5844158467847835, + "grad_norm": 2.7396748065948486, + "learning_rate": 8.302802711279917e-06, + "loss": 0.9151, + "step": 7232 + }, + { + "epoch": 0.5844966564981111, + "grad_norm": 2.399461269378662, + "learning_rate": 8.302311405709542e-06, + "loss": 0.9, + "step": 7233 + }, + { + "epoch": 0.5845774662114386, + "grad_norm": 2.778099775314331, + "learning_rate": 8.301820043578524e-06, + "loss": 1.0041, + "step": 7234 + }, + { + "epoch": 0.5846582759247662, + "grad_norm": 2.4411673545837402, + "learning_rate": 8.301328624895277e-06, + "loss": 1.0502, + "step": 7235 + }, + { + "epoch": 0.5847390856380937, + "grad_norm": 3.5182604789733887, + "learning_rate": 8.300837149668218e-06, + "loss": 0.9082, + "step": 7236 + }, + { + "epoch": 0.5848198953514212, + "grad_norm": 2.8782472610473633, + "learning_rate": 8.300345617905763e-06, + "loss": 0.9671, + "step": 7237 + }, + { + "epoch": 0.5849007050647488, + "grad_norm": 2.3218936920166016, + "learning_rate": 8.299854029616335e-06, + "loss": 0.8876, + "step": 7238 + }, + { + "epoch": 0.5849815147780764, + "grad_norm": 3.071110963821411, + "learning_rate": 8.299362384808352e-06, + "loss": 0.8901, + "step": 7239 + }, + { + "epoch": 0.5850623244914038, + "grad_norm": 3.2636032104492188, + "learning_rate": 8.298870683490232e-06, + "loss": 1.0622, + "step": 7240 + }, + { + "epoch": 0.5851431342047314, + "grad_norm": 2.4568161964416504, + "learning_rate": 8.2983789256704e-06, + "loss": 0.9507, + "step": 7241 + }, + { + "epoch": 0.585223943918059, + "grad_norm": 2.903306722640991, + "learning_rate": 8.297887111357279e-06, + "loss": 0.9607, + "step": 7242 + }, + { + "epoch": 0.5853047536313865, + "grad_norm": 2.5162227153778076, + "learning_rate": 8.297395240559289e-06, + "loss": 0.9627, + "step": 7243 + }, + { + "epoch": 0.585385563344714, + "grad_norm": 2.6980717182159424, + "learning_rate": 8.29690331328486e-06, + "loss": 1.0173, + "step": 7244 + }, + { + "epoch": 0.5854663730580416, + "grad_norm": 2.7115542888641357, + "learning_rate": 8.29641132954241e-06, + "loss": 0.9689, + "step": 7245 + }, + { + "epoch": 0.5855471827713691, + "grad_norm": 2.9262375831604004, + "learning_rate": 8.295919289340371e-06, + "loss": 0.9344, + "step": 7246 + }, + { + "epoch": 0.5856279924846967, + "grad_norm": 2.454115152359009, + "learning_rate": 8.29542719268717e-06, + "loss": 1.0366, + "step": 7247 + }, + { + "epoch": 0.5857088021980242, + "grad_norm": 2.3883273601531982, + "learning_rate": 8.294935039591235e-06, + "loss": 0.9147, + "step": 7248 + }, + { + "epoch": 0.5857896119113517, + "grad_norm": 2.728160858154297, + "learning_rate": 8.294442830060993e-06, + "loss": 0.9353, + "step": 7249 + }, + { + "epoch": 0.5858704216246793, + "grad_norm": 2.707059621810913, + "learning_rate": 8.293950564104878e-06, + "loss": 0.9389, + "step": 7250 + }, + { + "epoch": 0.5859512313380069, + "grad_norm": 2.561579942703247, + "learning_rate": 8.293458241731319e-06, + "loss": 0.9523, + "step": 7251 + }, + { + "epoch": 0.5860320410513343, + "grad_norm": 2.8697996139526367, + "learning_rate": 8.29296586294875e-06, + "loss": 0.8341, + "step": 7252 + }, + { + "epoch": 0.5861128507646619, + "grad_norm": 3.0197203159332275, + "learning_rate": 8.292473427765603e-06, + "loss": 0.9398, + "step": 7253 + }, + { + "epoch": 0.5861936604779895, + "grad_norm": 2.7292654514312744, + "learning_rate": 8.291980936190312e-06, + "loss": 0.8719, + "step": 7254 + }, + { + "epoch": 0.586274470191317, + "grad_norm": 3.2213222980499268, + "learning_rate": 8.291488388231313e-06, + "loss": 0.9678, + "step": 7255 + }, + { + "epoch": 0.5863552799046445, + "grad_norm": 2.80500864982605, + "learning_rate": 8.290995783897041e-06, + "loss": 0.8987, + "step": 7256 + }, + { + "epoch": 0.5864360896179721, + "grad_norm": 2.932123899459839, + "learning_rate": 8.290503123195934e-06, + "loss": 0.9552, + "step": 7257 + }, + { + "epoch": 0.5865168993312996, + "grad_norm": 2.6169843673706055, + "learning_rate": 8.290010406136433e-06, + "loss": 0.9654, + "step": 7258 + }, + { + "epoch": 0.5865977090446272, + "grad_norm": 2.683082103729248, + "learning_rate": 8.289517632726972e-06, + "loss": 0.942, + "step": 7259 + }, + { + "epoch": 0.5866785187579547, + "grad_norm": 2.525327444076538, + "learning_rate": 8.289024802975991e-06, + "loss": 0.8518, + "step": 7260 + }, + { + "epoch": 0.5867593284712822, + "grad_norm": 2.8130624294281006, + "learning_rate": 8.288531916891936e-06, + "loss": 0.9454, + "step": 7261 + }, + { + "epoch": 0.5868401381846098, + "grad_norm": 2.7846462726593018, + "learning_rate": 8.288038974483244e-06, + "loss": 0.87, + "step": 7262 + }, + { + "epoch": 0.5869209478979374, + "grad_norm": 2.4028542041778564, + "learning_rate": 8.287545975758362e-06, + "loss": 0.8436, + "step": 7263 + }, + { + "epoch": 0.5870017576112648, + "grad_norm": 2.866311550140381, + "learning_rate": 8.287052920725731e-06, + "loss": 1.0084, + "step": 7264 + }, + { + "epoch": 0.5870825673245924, + "grad_norm": 2.5690367221832275, + "learning_rate": 8.286559809393796e-06, + "loss": 0.9965, + "step": 7265 + }, + { + "epoch": 0.58716337703792, + "grad_norm": 2.5451879501342773, + "learning_rate": 8.286066641771005e-06, + "loss": 0.9242, + "step": 7266 + }, + { + "epoch": 0.5872441867512475, + "grad_norm": 2.705350399017334, + "learning_rate": 8.285573417865802e-06, + "loss": 0.8542, + "step": 7267 + }, + { + "epoch": 0.587324996464575, + "grad_norm": 2.1438097953796387, + "learning_rate": 8.285080137686637e-06, + "loss": 1.1101, + "step": 7268 + }, + { + "epoch": 0.5874058061779026, + "grad_norm": 2.670281410217285, + "learning_rate": 8.284586801241957e-06, + "loss": 0.8841, + "step": 7269 + }, + { + "epoch": 0.5874866158912301, + "grad_norm": 2.326840877532959, + "learning_rate": 8.284093408540213e-06, + "loss": 0.9651, + "step": 7270 + }, + { + "epoch": 0.5875674256045577, + "grad_norm": 2.614589214324951, + "learning_rate": 8.283599959589854e-06, + "loss": 0.8955, + "step": 7271 + }, + { + "epoch": 0.5876482353178852, + "grad_norm": 2.9460232257843018, + "learning_rate": 8.283106454399334e-06, + "loss": 1.1043, + "step": 7272 + }, + { + "epoch": 0.5877290450312127, + "grad_norm": 2.689347982406616, + "learning_rate": 8.282612892977104e-06, + "loss": 0.9888, + "step": 7273 + }, + { + "epoch": 0.5878098547445403, + "grad_norm": 2.7261712551116943, + "learning_rate": 8.282119275331617e-06, + "loss": 1.0793, + "step": 7274 + }, + { + "epoch": 0.5878906644578679, + "grad_norm": 2.2428059577941895, + "learning_rate": 8.281625601471329e-06, + "loss": 0.8454, + "step": 7275 + }, + { + "epoch": 0.5879714741711953, + "grad_norm": 2.658480644226074, + "learning_rate": 8.281131871404693e-06, + "loss": 0.9489, + "step": 7276 + }, + { + "epoch": 0.5880522838845229, + "grad_norm": 2.676419496536255, + "learning_rate": 8.28063808514017e-06, + "loss": 0.9293, + "step": 7277 + }, + { + "epoch": 0.5881330935978505, + "grad_norm": 2.534071683883667, + "learning_rate": 8.280144242686213e-06, + "loss": 0.9243, + "step": 7278 + }, + { + "epoch": 0.588213903311178, + "grad_norm": 2.4483423233032227, + "learning_rate": 8.27965034405128e-06, + "loss": 0.9228, + "step": 7279 + }, + { + "epoch": 0.5882947130245055, + "grad_norm": 2.6890275478363037, + "learning_rate": 8.279156389243835e-06, + "loss": 0.9201, + "step": 7280 + }, + { + "epoch": 0.5883755227378331, + "grad_norm": 2.8360040187835693, + "learning_rate": 8.278662378272333e-06, + "loss": 0.9064, + "step": 7281 + }, + { + "epoch": 0.5884563324511606, + "grad_norm": 3.0665738582611084, + "learning_rate": 8.278168311145238e-06, + "loss": 0.8436, + "step": 7282 + }, + { + "epoch": 0.5885371421644882, + "grad_norm": 2.62737774848938, + "learning_rate": 8.277674187871012e-06, + "loss": 0.8682, + "step": 7283 + }, + { + "epoch": 0.5886179518778157, + "grad_norm": 3.6039865016937256, + "learning_rate": 8.277180008458118e-06, + "loss": 0.9289, + "step": 7284 + }, + { + "epoch": 0.5886987615911432, + "grad_norm": 3.059206962585449, + "learning_rate": 8.27668577291502e-06, + "loss": 0.9672, + "step": 7285 + }, + { + "epoch": 0.5887795713044708, + "grad_norm": 2.4514665603637695, + "learning_rate": 8.276191481250183e-06, + "loss": 0.9134, + "step": 7286 + }, + { + "epoch": 0.5888603810177984, + "grad_norm": 2.8695521354675293, + "learning_rate": 8.275697133472073e-06, + "loss": 0.914, + "step": 7287 + }, + { + "epoch": 0.5889411907311258, + "grad_norm": 2.4840245246887207, + "learning_rate": 8.275202729589156e-06, + "loss": 0.9021, + "step": 7288 + }, + { + "epoch": 0.5890220004444534, + "grad_norm": 2.4578912258148193, + "learning_rate": 8.274708269609902e-06, + "loss": 0.972, + "step": 7289 + }, + { + "epoch": 0.589102810157781, + "grad_norm": 2.822356939315796, + "learning_rate": 8.274213753542778e-06, + "loss": 0.9244, + "step": 7290 + }, + { + "epoch": 0.5891836198711085, + "grad_norm": 2.9471700191497803, + "learning_rate": 8.273719181396257e-06, + "loss": 0.9063, + "step": 7291 + }, + { + "epoch": 0.589264429584436, + "grad_norm": 2.3739912509918213, + "learning_rate": 8.273224553178806e-06, + "loss": 0.8829, + "step": 7292 + }, + { + "epoch": 0.5893452392977636, + "grad_norm": 2.6234970092773438, + "learning_rate": 8.272729868898897e-06, + "loss": 1.0808, + "step": 7293 + }, + { + "epoch": 0.5894260490110911, + "grad_norm": 3.0504112243652344, + "learning_rate": 8.272235128565006e-06, + "loss": 1.044, + "step": 7294 + }, + { + "epoch": 0.5895068587244187, + "grad_norm": 3.024522542953491, + "learning_rate": 8.271740332185605e-06, + "loss": 0.9667, + "step": 7295 + }, + { + "epoch": 0.5895876684377462, + "grad_norm": 3.0582995414733887, + "learning_rate": 8.271245479769168e-06, + "loss": 0.9668, + "step": 7296 + }, + { + "epoch": 0.5896684781510737, + "grad_norm": 2.4406895637512207, + "learning_rate": 8.27075057132417e-06, + "loss": 0.8866, + "step": 7297 + }, + { + "epoch": 0.5897492878644013, + "grad_norm": 2.813310384750366, + "learning_rate": 8.27025560685909e-06, + "loss": 0.8483, + "step": 7298 + }, + { + "epoch": 0.5898300975777289, + "grad_norm": 2.3181216716766357, + "learning_rate": 8.269760586382404e-06, + "loss": 0.9512, + "step": 7299 + }, + { + "epoch": 0.5899109072910563, + "grad_norm": 2.4975626468658447, + "learning_rate": 8.26926550990259e-06, + "loss": 1.0061, + "step": 7300 + }, + { + "epoch": 0.5899917170043839, + "grad_norm": 2.848518133163452, + "learning_rate": 8.268770377428131e-06, + "loss": 1.0314, + "step": 7301 + }, + { + "epoch": 0.5900725267177115, + "grad_norm": 2.6034278869628906, + "learning_rate": 8.268275188967503e-06, + "loss": 0.8208, + "step": 7302 + }, + { + "epoch": 0.5901533364310391, + "grad_norm": 2.475684642791748, + "learning_rate": 8.267779944529187e-06, + "loss": 1.0302, + "step": 7303 + }, + { + "epoch": 0.5902341461443665, + "grad_norm": 2.6110689640045166, + "learning_rate": 8.267284644121669e-06, + "loss": 0.915, + "step": 7304 + }, + { + "epoch": 0.5903149558576941, + "grad_norm": 2.9698400497436523, + "learning_rate": 8.266789287753432e-06, + "loss": 0.8153, + "step": 7305 + }, + { + "epoch": 0.5903957655710217, + "grad_norm": 2.4260551929473877, + "learning_rate": 8.266293875432957e-06, + "loss": 0.974, + "step": 7306 + }, + { + "epoch": 0.5904765752843492, + "grad_norm": 2.4733686447143555, + "learning_rate": 8.265798407168732e-06, + "loss": 0.9055, + "step": 7307 + }, + { + "epoch": 0.5905573849976767, + "grad_norm": 2.69268536567688, + "learning_rate": 8.265302882969242e-06, + "loss": 0.8344, + "step": 7308 + }, + { + "epoch": 0.5906381947110043, + "grad_norm": 3.423067092895508, + "learning_rate": 8.264807302842976e-06, + "loss": 0.9971, + "step": 7309 + }, + { + "epoch": 0.5907190044243318, + "grad_norm": 3.1059982776641846, + "learning_rate": 8.264311666798419e-06, + "loss": 0.9854, + "step": 7310 + }, + { + "epoch": 0.5907998141376594, + "grad_norm": 2.9936695098876953, + "learning_rate": 8.263815974844063e-06, + "loss": 0.9392, + "step": 7311 + }, + { + "epoch": 0.5908806238509869, + "grad_norm": 2.4910175800323486, + "learning_rate": 8.263320226988395e-06, + "loss": 0.9391, + "step": 7312 + }, + { + "epoch": 0.5909614335643144, + "grad_norm": 2.7224578857421875, + "learning_rate": 8.262824423239908e-06, + "loss": 0.9075, + "step": 7313 + }, + { + "epoch": 0.591042243277642, + "grad_norm": 2.775599718093872, + "learning_rate": 8.262328563607094e-06, + "loss": 1.0178, + "step": 7314 + }, + { + "epoch": 0.5911230529909696, + "grad_norm": 2.9733424186706543, + "learning_rate": 8.261832648098447e-06, + "loss": 1.1278, + "step": 7315 + }, + { + "epoch": 0.591203862704297, + "grad_norm": 2.845141649246216, + "learning_rate": 8.261336676722457e-06, + "loss": 0.9019, + "step": 7316 + }, + { + "epoch": 0.5912846724176246, + "grad_norm": 2.9141879081726074, + "learning_rate": 8.260840649487622e-06, + "loss": 0.9286, + "step": 7317 + }, + { + "epoch": 0.5913654821309522, + "grad_norm": 2.4800522327423096, + "learning_rate": 8.260344566402436e-06, + "loss": 0.9283, + "step": 7318 + }, + { + "epoch": 0.5914462918442797, + "grad_norm": 3.3712503910064697, + "learning_rate": 8.259848427475397e-06, + "loss": 0.9504, + "step": 7319 + }, + { + "epoch": 0.5915271015576072, + "grad_norm": 2.650286912918091, + "learning_rate": 8.259352232715004e-06, + "loss": 1.0402, + "step": 7320 + }, + { + "epoch": 0.5916079112709348, + "grad_norm": 2.863093852996826, + "learning_rate": 8.25885598212975e-06, + "loss": 0.8921, + "step": 7321 + }, + { + "epoch": 0.5916887209842623, + "grad_norm": 2.793614149093628, + "learning_rate": 8.258359675728143e-06, + "loss": 0.8471, + "step": 7322 + }, + { + "epoch": 0.5917695306975899, + "grad_norm": 2.6360037326812744, + "learning_rate": 8.257863313518676e-06, + "loss": 0.9054, + "step": 7323 + }, + { + "epoch": 0.5918503404109174, + "grad_norm": 2.5683095455169678, + "learning_rate": 8.257366895509853e-06, + "loss": 1.0138, + "step": 7324 + }, + { + "epoch": 0.5919311501242449, + "grad_norm": 2.5132172107696533, + "learning_rate": 8.25687042171018e-06, + "loss": 0.9629, + "step": 7325 + }, + { + "epoch": 0.5920119598375725, + "grad_norm": 3.5530898571014404, + "learning_rate": 8.256373892128154e-06, + "loss": 0.9259, + "step": 7326 + }, + { + "epoch": 0.5920927695509001, + "grad_norm": 2.6273193359375, + "learning_rate": 8.255877306772283e-06, + "loss": 0.867, + "step": 7327 + }, + { + "epoch": 0.5921735792642275, + "grad_norm": 3.0033140182495117, + "learning_rate": 8.255380665651073e-06, + "loss": 0.9118, + "step": 7328 + }, + { + "epoch": 0.5922543889775551, + "grad_norm": 2.8479557037353516, + "learning_rate": 8.254883968773028e-06, + "loss": 0.9061, + "step": 7329 + }, + { + "epoch": 0.5923351986908827, + "grad_norm": 2.35652756690979, + "learning_rate": 8.254387216146658e-06, + "loss": 1.0341, + "step": 7330 + }, + { + "epoch": 0.5924160084042102, + "grad_norm": 2.7019267082214355, + "learning_rate": 8.25389040778047e-06, + "loss": 1.0876, + "step": 7331 + }, + { + "epoch": 0.5924968181175377, + "grad_norm": 2.4694161415100098, + "learning_rate": 8.25339354368297e-06, + "loss": 1.1297, + "step": 7332 + }, + { + "epoch": 0.5925776278308653, + "grad_norm": 2.877805471420288, + "learning_rate": 8.252896623862674e-06, + "loss": 0.9301, + "step": 7333 + }, + { + "epoch": 0.5926584375441928, + "grad_norm": 2.8389344215393066, + "learning_rate": 8.25239964832809e-06, + "loss": 0.8912, + "step": 7334 + }, + { + "epoch": 0.5927392472575204, + "grad_norm": 2.62739896774292, + "learning_rate": 8.251902617087726e-06, + "loss": 0.9648, + "step": 7335 + }, + { + "epoch": 0.5928200569708479, + "grad_norm": 2.79373836517334, + "learning_rate": 8.251405530150101e-06, + "loss": 0.8026, + "step": 7336 + }, + { + "epoch": 0.5929008666841754, + "grad_norm": 2.92045259475708, + "learning_rate": 8.250908387523727e-06, + "loss": 0.9487, + "step": 7337 + }, + { + "epoch": 0.592981676397503, + "grad_norm": 2.3661844730377197, + "learning_rate": 8.250411189217118e-06, + "loss": 0.9737, + "step": 7338 + }, + { + "epoch": 0.5930624861108306, + "grad_norm": 2.865668773651123, + "learning_rate": 8.249913935238792e-06, + "loss": 1.0048, + "step": 7339 + }, + { + "epoch": 0.593143295824158, + "grad_norm": 2.6446855068206787, + "learning_rate": 8.249416625597262e-06, + "loss": 1.0937, + "step": 7340 + }, + { + "epoch": 0.5932241055374856, + "grad_norm": 2.608536720275879, + "learning_rate": 8.248919260301048e-06, + "loss": 0.9387, + "step": 7341 + }, + { + "epoch": 0.5933049152508132, + "grad_norm": 2.9983580112457275, + "learning_rate": 8.248421839358669e-06, + "loss": 0.8629, + "step": 7342 + }, + { + "epoch": 0.5933857249641407, + "grad_norm": 2.7614946365356445, + "learning_rate": 8.247924362778645e-06, + "loss": 1.0247, + "step": 7343 + }, + { + "epoch": 0.5934665346774682, + "grad_norm": 2.579474449157715, + "learning_rate": 8.247426830569494e-06, + "loss": 0.9084, + "step": 7344 + }, + { + "epoch": 0.5935473443907958, + "grad_norm": 2.6016366481781006, + "learning_rate": 8.24692924273974e-06, + "loss": 1.019, + "step": 7345 + }, + { + "epoch": 0.5936281541041233, + "grad_norm": 2.929500102996826, + "learning_rate": 8.246431599297905e-06, + "loss": 1.1036, + "step": 7346 + }, + { + "epoch": 0.5937089638174509, + "grad_norm": 2.6554834842681885, + "learning_rate": 8.245933900252514e-06, + "loss": 0.9863, + "step": 7347 + }, + { + "epoch": 0.5937897735307784, + "grad_norm": 3.1263253688812256, + "learning_rate": 8.245436145612088e-06, + "loss": 0.927, + "step": 7348 + }, + { + "epoch": 0.5938705832441059, + "grad_norm": 2.7033302783966064, + "learning_rate": 8.244938335385154e-06, + "loss": 1.0174, + "step": 7349 + }, + { + "epoch": 0.5939513929574335, + "grad_norm": 3.122936964035034, + "learning_rate": 8.244440469580237e-06, + "loss": 0.9281, + "step": 7350 + }, + { + "epoch": 0.5940322026707611, + "grad_norm": 2.663856029510498, + "learning_rate": 8.243942548205867e-06, + "loss": 1.0489, + "step": 7351 + }, + { + "epoch": 0.5941130123840885, + "grad_norm": 2.829989194869995, + "learning_rate": 8.243444571270568e-06, + "loss": 0.9527, + "step": 7352 + }, + { + "epoch": 0.5941938220974161, + "grad_norm": 2.4378714561462402, + "learning_rate": 8.242946538782875e-06, + "loss": 0.8978, + "step": 7353 + }, + { + "epoch": 0.5942746318107437, + "grad_norm": 2.8279662132263184, + "learning_rate": 8.242448450751314e-06, + "loss": 1.0525, + "step": 7354 + }, + { + "epoch": 0.5943554415240712, + "grad_norm": 3.1409215927124023, + "learning_rate": 8.241950307184416e-06, + "loss": 0.982, + "step": 7355 + }, + { + "epoch": 0.5944362512373987, + "grad_norm": 2.909543991088867, + "learning_rate": 8.241452108090716e-06, + "loss": 0.9226, + "step": 7356 + }, + { + "epoch": 0.5945170609507263, + "grad_norm": 2.2005367279052734, + "learning_rate": 8.240953853478742e-06, + "loss": 0.9917, + "step": 7357 + }, + { + "epoch": 0.5945978706640538, + "grad_norm": 2.7676758766174316, + "learning_rate": 8.240455543357031e-06, + "loss": 1.03, + "step": 7358 + }, + { + "epoch": 0.5946786803773814, + "grad_norm": 2.4235360622406006, + "learning_rate": 8.239957177734119e-06, + "loss": 1.0517, + "step": 7359 + }, + { + "epoch": 0.5947594900907089, + "grad_norm": 2.5609865188598633, + "learning_rate": 8.23945875661854e-06, + "loss": 0.9424, + "step": 7360 + }, + { + "epoch": 0.5948402998040364, + "grad_norm": 2.680320978164673, + "learning_rate": 8.238960280018832e-06, + "loss": 0.9987, + "step": 7361 + }, + { + "epoch": 0.594921109517364, + "grad_norm": 2.6540329456329346, + "learning_rate": 8.23846174794353e-06, + "loss": 0.959, + "step": 7362 + }, + { + "epoch": 0.5950019192306916, + "grad_norm": 2.4815590381622314, + "learning_rate": 8.237963160401176e-06, + "loss": 0.9409, + "step": 7363 + }, + { + "epoch": 0.595082728944019, + "grad_norm": 2.2560675144195557, + "learning_rate": 8.237464517400308e-06, + "loss": 0.9085, + "step": 7364 + }, + { + "epoch": 0.5951635386573466, + "grad_norm": 2.394665479660034, + "learning_rate": 8.236965818949467e-06, + "loss": 0.9635, + "step": 7365 + }, + { + "epoch": 0.5952443483706742, + "grad_norm": 2.595749855041504, + "learning_rate": 8.236467065057193e-06, + "loss": 0.9486, + "step": 7366 + }, + { + "epoch": 0.5953251580840017, + "grad_norm": 2.5327694416046143, + "learning_rate": 8.23596825573203e-06, + "loss": 0.8764, + "step": 7367 + }, + { + "epoch": 0.5954059677973292, + "grad_norm": 3.028306722640991, + "learning_rate": 8.235469390982522e-06, + "loss": 1.0335, + "step": 7368 + }, + { + "epoch": 0.5954867775106568, + "grad_norm": 2.623086452484131, + "learning_rate": 8.234970470817212e-06, + "loss": 1.0315, + "step": 7369 + }, + { + "epoch": 0.5955675872239843, + "grad_norm": 2.815917491912842, + "learning_rate": 8.234471495244644e-06, + "loss": 0.9318, + "step": 7370 + }, + { + "epoch": 0.5956483969373119, + "grad_norm": 2.6206581592559814, + "learning_rate": 8.23397246427337e-06, + "loss": 0.9279, + "step": 7371 + }, + { + "epoch": 0.5957292066506394, + "grad_norm": 2.9340102672576904, + "learning_rate": 8.23347337791193e-06, + "loss": 1.0592, + "step": 7372 + }, + { + "epoch": 0.5958100163639669, + "grad_norm": 2.8901724815368652, + "learning_rate": 8.232974236168875e-06, + "loss": 1.0235, + "step": 7373 + }, + { + "epoch": 0.5958908260772945, + "grad_norm": 2.838430166244507, + "learning_rate": 8.232475039052755e-06, + "loss": 1.0582, + "step": 7374 + }, + { + "epoch": 0.5959716357906221, + "grad_norm": 2.7173655033111572, + "learning_rate": 8.23197578657212e-06, + "loss": 0.8986, + "step": 7375 + }, + { + "epoch": 0.5960524455039495, + "grad_norm": 2.1216917037963867, + "learning_rate": 8.23147647873552e-06, + "loss": 1.0306, + "step": 7376 + }, + { + "epoch": 0.5961332552172771, + "grad_norm": 3.172276735305786, + "learning_rate": 8.230977115551508e-06, + "loss": 0.9584, + "step": 7377 + }, + { + "epoch": 0.5962140649306047, + "grad_norm": 2.7974860668182373, + "learning_rate": 8.230477697028636e-06, + "loss": 0.9677, + "step": 7378 + }, + { + "epoch": 0.5962948746439322, + "grad_norm": 2.6010758876800537, + "learning_rate": 8.229978223175459e-06, + "loss": 1.0288, + "step": 7379 + }, + { + "epoch": 0.5963756843572597, + "grad_norm": 3.3252294063568115, + "learning_rate": 8.229478694000527e-06, + "loss": 0.9599, + "step": 7380 + }, + { + "epoch": 0.5964564940705873, + "grad_norm": 3.0079755783081055, + "learning_rate": 8.228979109512405e-06, + "loss": 0.8889, + "step": 7381 + }, + { + "epoch": 0.5965373037839148, + "grad_norm": 3.624636650085449, + "learning_rate": 8.228479469719641e-06, + "loss": 1.0804, + "step": 7382 + }, + { + "epoch": 0.5966181134972424, + "grad_norm": 2.2371068000793457, + "learning_rate": 8.227979774630796e-06, + "loss": 0.9555, + "step": 7383 + }, + { + "epoch": 0.5966989232105699, + "grad_norm": 2.666361093521118, + "learning_rate": 8.22748002425443e-06, + "loss": 0.9783, + "step": 7384 + }, + { + "epoch": 0.5967797329238974, + "grad_norm": 2.5665626525878906, + "learning_rate": 8.2269802185991e-06, + "loss": 0.9119, + "step": 7385 + }, + { + "epoch": 0.596860542637225, + "grad_norm": 2.503783702850342, + "learning_rate": 8.226480357673367e-06, + "loss": 0.8624, + "step": 7386 + }, + { + "epoch": 0.5969413523505526, + "grad_norm": 2.597759962081909, + "learning_rate": 8.225980441485794e-06, + "loss": 1.1148, + "step": 7387 + }, + { + "epoch": 0.59702216206388, + "grad_norm": 2.4626972675323486, + "learning_rate": 8.225480470044942e-06, + "loss": 0.9465, + "step": 7388 + }, + { + "epoch": 0.5971029717772076, + "grad_norm": 2.6293704509735107, + "learning_rate": 8.224980443359374e-06, + "loss": 0.9974, + "step": 7389 + }, + { + "epoch": 0.5971837814905352, + "grad_norm": 2.6583855152130127, + "learning_rate": 8.224480361437657e-06, + "loss": 0.9461, + "step": 7390 + }, + { + "epoch": 0.5972645912038627, + "grad_norm": 2.6637701988220215, + "learning_rate": 8.223980224288351e-06, + "loss": 0.9341, + "step": 7391 + }, + { + "epoch": 0.5973454009171902, + "grad_norm": 2.804215431213379, + "learning_rate": 8.223480031920029e-06, + "loss": 1.0286, + "step": 7392 + }, + { + "epoch": 0.5974262106305178, + "grad_norm": 2.570969581604004, + "learning_rate": 8.22297978434125e-06, + "loss": 0.9489, + "step": 7393 + }, + { + "epoch": 0.5975070203438453, + "grad_norm": 2.333669900894165, + "learning_rate": 8.222479481560588e-06, + "loss": 0.9311, + "step": 7394 + }, + { + "epoch": 0.5975878300571729, + "grad_norm": 2.5228443145751953, + "learning_rate": 8.221979123586611e-06, + "loss": 0.9284, + "step": 7395 + }, + { + "epoch": 0.5976686397705004, + "grad_norm": 2.806471586227417, + "learning_rate": 8.221478710427889e-06, + "loss": 0.8775, + "step": 7396 + }, + { + "epoch": 0.5977494494838279, + "grad_norm": 2.5036611557006836, + "learning_rate": 8.22097824209299e-06, + "loss": 0.9738, + "step": 7397 + }, + { + "epoch": 0.5978302591971555, + "grad_norm": 2.396198034286499, + "learning_rate": 8.220477718590486e-06, + "loss": 0.9063, + "step": 7398 + }, + { + "epoch": 0.5979110689104831, + "grad_norm": 3.1461191177368164, + "learning_rate": 8.219977139928957e-06, + "loss": 1.0042, + "step": 7399 + }, + { + "epoch": 0.5979918786238105, + "grad_norm": 2.9181549549102783, + "learning_rate": 8.219476506116968e-06, + "loss": 0.9777, + "step": 7400 + }, + { + "epoch": 0.5980726883371381, + "grad_norm": 3.023691415786743, + "learning_rate": 8.218975817163098e-06, + "loss": 0.9521, + "step": 7401 + }, + { + "epoch": 0.5981534980504657, + "grad_norm": 2.5120723247528076, + "learning_rate": 8.21847507307592e-06, + "loss": 1.0271, + "step": 7402 + }, + { + "epoch": 0.5982343077637932, + "grad_norm": 2.790689706802368, + "learning_rate": 8.217974273864013e-06, + "loss": 0.9678, + "step": 7403 + }, + { + "epoch": 0.5983151174771207, + "grad_norm": 2.6812565326690674, + "learning_rate": 8.217473419535956e-06, + "loss": 1.0998, + "step": 7404 + }, + { + "epoch": 0.5983959271904483, + "grad_norm": 2.502741813659668, + "learning_rate": 8.216972510100322e-06, + "loss": 0.9043, + "step": 7405 + }, + { + "epoch": 0.5984767369037758, + "grad_norm": 2.2891488075256348, + "learning_rate": 8.216471545565694e-06, + "loss": 0.9692, + "step": 7406 + }, + { + "epoch": 0.5985575466171034, + "grad_norm": 2.8040974140167236, + "learning_rate": 8.215970525940653e-06, + "loss": 0.9022, + "step": 7407 + }, + { + "epoch": 0.5986383563304309, + "grad_norm": 2.492466688156128, + "learning_rate": 8.215469451233778e-06, + "loss": 0.8387, + "step": 7408 + }, + { + "epoch": 0.5987191660437584, + "grad_norm": 2.9831202030181885, + "learning_rate": 8.214968321453653e-06, + "loss": 1.0022, + "step": 7409 + }, + { + "epoch": 0.598799975757086, + "grad_norm": 2.6915781497955322, + "learning_rate": 8.214467136608861e-06, + "loss": 1.091, + "step": 7410 + }, + { + "epoch": 0.5988807854704136, + "grad_norm": 2.6560022830963135, + "learning_rate": 8.213965896707983e-06, + "loss": 1.0015, + "step": 7411 + }, + { + "epoch": 0.598961595183741, + "grad_norm": 2.7713708877563477, + "learning_rate": 8.213464601759609e-06, + "loss": 0.8648, + "step": 7412 + }, + { + "epoch": 0.5990424048970686, + "grad_norm": 2.421834945678711, + "learning_rate": 8.212963251772322e-06, + "loss": 0.9333, + "step": 7413 + }, + { + "epoch": 0.5991232146103962, + "grad_norm": 2.515977382659912, + "learning_rate": 8.212461846754708e-06, + "loss": 0.9433, + "step": 7414 + }, + { + "epoch": 0.5992040243237237, + "grad_norm": 2.403157949447632, + "learning_rate": 8.211960386715356e-06, + "loss": 1.0099, + "step": 7415 + }, + { + "epoch": 0.5992848340370512, + "grad_norm": 3.128310441970825, + "learning_rate": 8.211458871662855e-06, + "loss": 0.8759, + "step": 7416 + }, + { + "epoch": 0.5993656437503788, + "grad_norm": 2.983551025390625, + "learning_rate": 8.210957301605797e-06, + "loss": 0.8309, + "step": 7417 + }, + { + "epoch": 0.5994464534637063, + "grad_norm": 2.517723798751831, + "learning_rate": 8.210455676552771e-06, + "loss": 1.0831, + "step": 7418 + }, + { + "epoch": 0.5995272631770339, + "grad_norm": 2.5016441345214844, + "learning_rate": 8.209953996512366e-06, + "loss": 0.9106, + "step": 7419 + }, + { + "epoch": 0.5996080728903614, + "grad_norm": 2.536555290222168, + "learning_rate": 8.209452261493178e-06, + "loss": 0.9192, + "step": 7420 + }, + { + "epoch": 0.5996888826036889, + "grad_norm": 2.7434983253479004, + "learning_rate": 8.208950471503798e-06, + "loss": 0.9539, + "step": 7421 + }, + { + "epoch": 0.5997696923170165, + "grad_norm": 2.569958448410034, + "learning_rate": 8.208448626552821e-06, + "loss": 0.9704, + "step": 7422 + }, + { + "epoch": 0.5998505020303441, + "grad_norm": 2.4909441471099854, + "learning_rate": 8.207946726648846e-06, + "loss": 0.9827, + "step": 7423 + }, + { + "epoch": 0.5999313117436715, + "grad_norm": 3.1090803146362305, + "learning_rate": 8.207444771800464e-06, + "loss": 1.0315, + "step": 7424 + }, + { + "epoch": 0.6000121214569991, + "grad_norm": 2.9979429244995117, + "learning_rate": 8.206942762016275e-06, + "loss": 0.8577, + "step": 7425 + }, + { + "epoch": 0.6000929311703267, + "grad_norm": 2.3236005306243896, + "learning_rate": 8.206440697304876e-06, + "loss": 0.9198, + "step": 7426 + }, + { + "epoch": 0.6001737408836542, + "grad_norm": 3.4106571674346924, + "learning_rate": 8.205938577674869e-06, + "loss": 0.8742, + "step": 7427 + }, + { + "epoch": 0.6002545505969817, + "grad_norm": 2.469843864440918, + "learning_rate": 8.20543640313485e-06, + "loss": 1.1598, + "step": 7428 + }, + { + "epoch": 0.6003353603103093, + "grad_norm": 2.6596758365631104, + "learning_rate": 8.204934173693425e-06, + "loss": 0.8582, + "step": 7429 + }, + { + "epoch": 0.6004161700236368, + "grad_norm": 2.2257728576660156, + "learning_rate": 8.20443188935919e-06, + "loss": 0.9651, + "step": 7430 + }, + { + "epoch": 0.6004969797369644, + "grad_norm": 2.8060038089752197, + "learning_rate": 8.203929550140754e-06, + "loss": 0.9679, + "step": 7431 + }, + { + "epoch": 0.6005777894502919, + "grad_norm": 2.5629186630249023, + "learning_rate": 8.203427156046715e-06, + "loss": 0.9635, + "step": 7432 + }, + { + "epoch": 0.6006585991636195, + "grad_norm": 2.5945253372192383, + "learning_rate": 8.202924707085684e-06, + "loss": 0.9805, + "step": 7433 + }, + { + "epoch": 0.600739408876947, + "grad_norm": 2.6697072982788086, + "learning_rate": 8.20242220326626e-06, + "loss": 0.9685, + "step": 7434 + }, + { + "epoch": 0.6008202185902746, + "grad_norm": 2.6331472396850586, + "learning_rate": 8.201919644597056e-06, + "loss": 0.9987, + "step": 7435 + }, + { + "epoch": 0.6009010283036021, + "grad_norm": 2.4247093200683594, + "learning_rate": 8.201417031086676e-06, + "loss": 0.8182, + "step": 7436 + }, + { + "epoch": 0.6009818380169296, + "grad_norm": 2.3713717460632324, + "learning_rate": 8.20091436274373e-06, + "loss": 1.0406, + "step": 7437 + }, + { + "epoch": 0.6010626477302572, + "grad_norm": 2.7394397258758545, + "learning_rate": 8.200411639576827e-06, + "loss": 0.8715, + "step": 7438 + }, + { + "epoch": 0.6011434574435848, + "grad_norm": 2.602278709411621, + "learning_rate": 8.199908861594575e-06, + "loss": 1.0104, + "step": 7439 + }, + { + "epoch": 0.6012242671569122, + "grad_norm": 2.413912057876587, + "learning_rate": 8.19940602880559e-06, + "loss": 1.0092, + "step": 7440 + }, + { + "epoch": 0.6013050768702398, + "grad_norm": 2.4183106422424316, + "learning_rate": 8.19890314121848e-06, + "loss": 0.867, + "step": 7441 + }, + { + "epoch": 0.6013858865835674, + "grad_norm": 2.622666358947754, + "learning_rate": 8.198400198841861e-06, + "loss": 1.0506, + "step": 7442 + }, + { + "epoch": 0.6014666962968949, + "grad_norm": 2.7496120929718018, + "learning_rate": 8.197897201684347e-06, + "loss": 0.9275, + "step": 7443 + }, + { + "epoch": 0.6015475060102224, + "grad_norm": 2.7471072673797607, + "learning_rate": 8.197394149754552e-06, + "loss": 0.9175, + "step": 7444 + }, + { + "epoch": 0.60162831572355, + "grad_norm": 2.3735315799713135, + "learning_rate": 8.196891043061093e-06, + "loss": 0.9111, + "step": 7445 + }, + { + "epoch": 0.6017091254368775, + "grad_norm": 2.809560537338257, + "learning_rate": 8.196387881612586e-06, + "loss": 0.9707, + "step": 7446 + }, + { + "epoch": 0.6017899351502051, + "grad_norm": 2.4325170516967773, + "learning_rate": 8.19588466541765e-06, + "loss": 0.9298, + "step": 7447 + }, + { + "epoch": 0.6018707448635326, + "grad_norm": 2.7904233932495117, + "learning_rate": 8.195381394484903e-06, + "loss": 0.9868, + "step": 7448 + }, + { + "epoch": 0.6019515545768601, + "grad_norm": 2.5866594314575195, + "learning_rate": 8.194878068822967e-06, + "loss": 0.9125, + "step": 7449 + }, + { + "epoch": 0.6020323642901877, + "grad_norm": 2.3913865089416504, + "learning_rate": 8.19437468844046e-06, + "loss": 0.9177, + "step": 7450 + }, + { + "epoch": 0.6021131740035153, + "grad_norm": 2.8396122455596924, + "learning_rate": 8.193871253346005e-06, + "loss": 1.0068, + "step": 7451 + }, + { + "epoch": 0.6021939837168427, + "grad_norm": 2.1447858810424805, + "learning_rate": 8.193367763548223e-06, + "loss": 0.9674, + "step": 7452 + }, + { + "epoch": 0.6022747934301703, + "grad_norm": 2.292912721633911, + "learning_rate": 8.192864219055741e-06, + "loss": 0.9786, + "step": 7453 + }, + { + "epoch": 0.6023556031434979, + "grad_norm": 2.575404167175293, + "learning_rate": 8.19236061987718e-06, + "loss": 1.0059, + "step": 7454 + }, + { + "epoch": 0.6024364128568254, + "grad_norm": 2.3621768951416016, + "learning_rate": 8.191856966021166e-06, + "loss": 0.8969, + "step": 7455 + }, + { + "epoch": 0.6025172225701529, + "grad_norm": 2.828322410583496, + "learning_rate": 8.191353257496328e-06, + "loss": 0.9147, + "step": 7456 + }, + { + "epoch": 0.6025980322834805, + "grad_norm": 2.3915247917175293, + "learning_rate": 8.190849494311291e-06, + "loss": 0.8263, + "step": 7457 + }, + { + "epoch": 0.602678841996808, + "grad_norm": 2.792440891265869, + "learning_rate": 8.190345676474684e-06, + "loss": 0.9835, + "step": 7458 + }, + { + "epoch": 0.6027596517101356, + "grad_norm": 2.724057912826538, + "learning_rate": 8.189841803995135e-06, + "loss": 0.9776, + "step": 7459 + }, + { + "epoch": 0.6028404614234631, + "grad_norm": 2.665722608566284, + "learning_rate": 8.189337876881276e-06, + "loss": 0.9499, + "step": 7460 + }, + { + "epoch": 0.6029212711367906, + "grad_norm": 2.611553192138672, + "learning_rate": 8.188833895141737e-06, + "loss": 0.9243, + "step": 7461 + }, + { + "epoch": 0.6030020808501182, + "grad_norm": 2.957235813140869, + "learning_rate": 8.188329858785152e-06, + "loss": 1.12, + "step": 7462 + }, + { + "epoch": 0.6030828905634458, + "grad_norm": 2.905423641204834, + "learning_rate": 8.18782576782015e-06, + "loss": 0.9059, + "step": 7463 + }, + { + "epoch": 0.6031637002767732, + "grad_norm": 2.612804412841797, + "learning_rate": 8.187321622255366e-06, + "loss": 0.9177, + "step": 7464 + }, + { + "epoch": 0.6032445099901008, + "grad_norm": 2.303976058959961, + "learning_rate": 8.186817422099437e-06, + "loss": 0.9688, + "step": 7465 + }, + { + "epoch": 0.6033253197034284, + "grad_norm": 2.7772412300109863, + "learning_rate": 8.186313167361e-06, + "loss": 0.8792, + "step": 7466 + }, + { + "epoch": 0.6034061294167559, + "grad_norm": 2.651742935180664, + "learning_rate": 8.185808858048684e-06, + "loss": 0.992, + "step": 7467 + }, + { + "epoch": 0.6034869391300834, + "grad_norm": 2.477022886276245, + "learning_rate": 8.185304494171136e-06, + "loss": 1.0104, + "step": 7468 + }, + { + "epoch": 0.603567748843411, + "grad_norm": 2.6424434185028076, + "learning_rate": 8.18480007573699e-06, + "loss": 1.0653, + "step": 7469 + }, + { + "epoch": 0.6036485585567385, + "grad_norm": 2.523757219314575, + "learning_rate": 8.184295602754883e-06, + "loss": 1.0043, + "step": 7470 + }, + { + "epoch": 0.6037293682700661, + "grad_norm": 2.7214486598968506, + "learning_rate": 8.183791075233461e-06, + "loss": 0.9176, + "step": 7471 + }, + { + "epoch": 0.6038101779833936, + "grad_norm": 3.0870211124420166, + "learning_rate": 8.183286493181361e-06, + "loss": 0.8808, + "step": 7472 + }, + { + "epoch": 0.6038909876967211, + "grad_norm": 2.231369972229004, + "learning_rate": 8.182781856607229e-06, + "loss": 0.9157, + "step": 7473 + }, + { + "epoch": 0.6039717974100487, + "grad_norm": 2.3250484466552734, + "learning_rate": 8.182277165519703e-06, + "loss": 0.9548, + "step": 7474 + }, + { + "epoch": 0.6040526071233763, + "grad_norm": 2.418064594268799, + "learning_rate": 8.181772419927431e-06, + "loss": 0.9099, + "step": 7475 + }, + { + "epoch": 0.6041334168367037, + "grad_norm": 2.4413342475891113, + "learning_rate": 8.18126761983906e-06, + "loss": 1.0852, + "step": 7476 + }, + { + "epoch": 0.6042142265500313, + "grad_norm": 2.696765422821045, + "learning_rate": 8.18076276526323e-06, + "loss": 0.9989, + "step": 7477 + }, + { + "epoch": 0.6042950362633589, + "grad_norm": 2.770642042160034, + "learning_rate": 8.18025785620859e-06, + "loss": 1.0051, + "step": 7478 + }, + { + "epoch": 0.6043758459766864, + "grad_norm": 3.536003589630127, + "learning_rate": 8.179752892683793e-06, + "loss": 0.9775, + "step": 7479 + }, + { + "epoch": 0.6044566556900139, + "grad_norm": 2.349836587905884, + "learning_rate": 8.179247874697482e-06, + "loss": 0.9597, + "step": 7480 + }, + { + "epoch": 0.6045374654033415, + "grad_norm": 2.5413570404052734, + "learning_rate": 8.17874280225831e-06, + "loss": 0.9222, + "step": 7481 + }, + { + "epoch": 0.604618275116669, + "grad_norm": 2.447821855545044, + "learning_rate": 8.178237675374924e-06, + "loss": 0.9519, + "step": 7482 + }, + { + "epoch": 0.6046990848299966, + "grad_norm": 2.8698275089263916, + "learning_rate": 8.177732494055979e-06, + "loss": 0.9258, + "step": 7483 + }, + { + "epoch": 0.6047798945433241, + "grad_norm": 2.419726848602295, + "learning_rate": 8.177227258310128e-06, + "loss": 0.9944, + "step": 7484 + }, + { + "epoch": 0.6048607042566516, + "grad_norm": 2.484703779220581, + "learning_rate": 8.176721968146021e-06, + "loss": 1.0323, + "step": 7485 + }, + { + "epoch": 0.6049415139699792, + "grad_norm": 2.353822708129883, + "learning_rate": 8.176216623572315e-06, + "loss": 0.8977, + "step": 7486 + }, + { + "epoch": 0.6050223236833068, + "grad_norm": 2.9588623046875, + "learning_rate": 8.175711224597664e-06, + "loss": 1.0361, + "step": 7487 + }, + { + "epoch": 0.6051031333966342, + "grad_norm": 2.3673336505889893, + "learning_rate": 8.175205771230725e-06, + "loss": 0.9604, + "step": 7488 + }, + { + "epoch": 0.6051839431099618, + "grad_norm": 2.559023380279541, + "learning_rate": 8.174700263480156e-06, + "loss": 1.001, + "step": 7489 + }, + { + "epoch": 0.6052647528232894, + "grad_norm": 2.4415910243988037, + "learning_rate": 8.174194701354614e-06, + "loss": 0.9434, + "step": 7490 + }, + { + "epoch": 0.6053455625366169, + "grad_norm": 3.00506329536438, + "learning_rate": 8.173689084862758e-06, + "loss": 1.0497, + "step": 7491 + }, + { + "epoch": 0.6054263722499444, + "grad_norm": 2.8126955032348633, + "learning_rate": 8.173183414013249e-06, + "loss": 0.9418, + "step": 7492 + }, + { + "epoch": 0.605507181963272, + "grad_norm": 2.812666416168213, + "learning_rate": 8.172677688814746e-06, + "loss": 0.983, + "step": 7493 + }, + { + "epoch": 0.6055879916765995, + "grad_norm": 2.6819217205047607, + "learning_rate": 8.172171909275912e-06, + "loss": 0.9592, + "step": 7494 + }, + { + "epoch": 0.6056688013899271, + "grad_norm": 3.085527181625366, + "learning_rate": 8.171666075405411e-06, + "loss": 0.9987, + "step": 7495 + }, + { + "epoch": 0.6057496111032546, + "grad_norm": 2.5476555824279785, + "learning_rate": 8.171160187211906e-06, + "loss": 0.875, + "step": 7496 + }, + { + "epoch": 0.6058304208165821, + "grad_norm": 2.542846441268921, + "learning_rate": 8.17065424470406e-06, + "loss": 0.9667, + "step": 7497 + }, + { + "epoch": 0.6059112305299097, + "grad_norm": 2.782371759414673, + "learning_rate": 8.170148247890541e-06, + "loss": 0.9271, + "step": 7498 + }, + { + "epoch": 0.6059920402432373, + "grad_norm": 2.778146743774414, + "learning_rate": 8.169642196780015e-06, + "loss": 0.9166, + "step": 7499 + }, + { + "epoch": 0.6060728499565647, + "grad_norm": 2.539367914199829, + "learning_rate": 8.169136091381146e-06, + "loss": 0.891, + "step": 7500 + }, + { + "epoch": 0.6061536596698923, + "grad_norm": 2.3302626609802246, + "learning_rate": 8.168629931702606e-06, + "loss": 0.9383, + "step": 7501 + }, + { + "epoch": 0.6062344693832199, + "grad_norm": 2.7369325160980225, + "learning_rate": 8.168123717753065e-06, + "loss": 0.9104, + "step": 7502 + }, + { + "epoch": 0.6063152790965474, + "grad_norm": 2.4281458854675293, + "learning_rate": 8.16761744954119e-06, + "loss": 0.8976, + "step": 7503 + }, + { + "epoch": 0.606396088809875, + "grad_norm": 2.6938211917877197, + "learning_rate": 8.167111127075656e-06, + "loss": 1.1036, + "step": 7504 + }, + { + "epoch": 0.6064768985232025, + "grad_norm": 3.0842552185058594, + "learning_rate": 8.166604750365131e-06, + "loss": 0.9602, + "step": 7505 + }, + { + "epoch": 0.60655770823653, + "grad_norm": 2.772709608078003, + "learning_rate": 8.16609831941829e-06, + "loss": 1.1227, + "step": 7506 + }, + { + "epoch": 0.6066385179498576, + "grad_norm": 3.11780047416687, + "learning_rate": 8.165591834243807e-06, + "loss": 0.9699, + "step": 7507 + }, + { + "epoch": 0.6067193276631851, + "grad_norm": 2.5700080394744873, + "learning_rate": 8.165085294850356e-06, + "loss": 1.0149, + "step": 7508 + }, + { + "epoch": 0.6068001373765126, + "grad_norm": 2.578562021255493, + "learning_rate": 8.164578701246615e-06, + "loss": 0.852, + "step": 7509 + }, + { + "epoch": 0.6068809470898402, + "grad_norm": 2.594264507293701, + "learning_rate": 8.164072053441259e-06, + "loss": 0.9298, + "step": 7510 + }, + { + "epoch": 0.6069617568031678, + "grad_norm": 2.7076611518859863, + "learning_rate": 8.163565351442965e-06, + "loss": 1.0175, + "step": 7511 + }, + { + "epoch": 0.6070425665164952, + "grad_norm": 2.7060465812683105, + "learning_rate": 8.163058595260413e-06, + "loss": 0.8987, + "step": 7512 + }, + { + "epoch": 0.6071233762298228, + "grad_norm": 2.303374767303467, + "learning_rate": 8.162551784902284e-06, + "loss": 0.9175, + "step": 7513 + }, + { + "epoch": 0.6072041859431504, + "grad_norm": 2.9991860389709473, + "learning_rate": 8.162044920377253e-06, + "loss": 1.051, + "step": 7514 + }, + { + "epoch": 0.6072849956564779, + "grad_norm": 2.8217544555664062, + "learning_rate": 8.161538001694007e-06, + "loss": 0.9991, + "step": 7515 + }, + { + "epoch": 0.6073658053698054, + "grad_norm": 2.489773750305176, + "learning_rate": 8.161031028861226e-06, + "loss": 0.9147, + "step": 7516 + }, + { + "epoch": 0.607446615083133, + "grad_norm": 3.427377462387085, + "learning_rate": 8.160524001887592e-06, + "loss": 0.9328, + "step": 7517 + }, + { + "epoch": 0.6075274247964605, + "grad_norm": 2.4631266593933105, + "learning_rate": 8.160016920781792e-06, + "loss": 0.89, + "step": 7518 + }, + { + "epoch": 0.6076082345097881, + "grad_norm": 2.5758299827575684, + "learning_rate": 8.15950978555251e-06, + "loss": 0.9989, + "step": 7519 + }, + { + "epoch": 0.6076890442231156, + "grad_norm": 2.9490103721618652, + "learning_rate": 8.15900259620843e-06, + "loss": 0.9204, + "step": 7520 + }, + { + "epoch": 0.6077698539364431, + "grad_norm": 2.7213451862335205, + "learning_rate": 8.15849535275824e-06, + "loss": 0.9274, + "step": 7521 + }, + { + "epoch": 0.6078506636497707, + "grad_norm": 2.4654622077941895, + "learning_rate": 8.15798805521063e-06, + "loss": 0.9773, + "step": 7522 + }, + { + "epoch": 0.6079314733630983, + "grad_norm": 2.4854447841644287, + "learning_rate": 8.157480703574287e-06, + "loss": 1.1008, + "step": 7523 + }, + { + "epoch": 0.6080122830764257, + "grad_norm": 2.836308240890503, + "learning_rate": 8.1569732978579e-06, + "loss": 0.8587, + "step": 7524 + }, + { + "epoch": 0.6080930927897533, + "grad_norm": 2.7810237407684326, + "learning_rate": 8.156465838070161e-06, + "loss": 0.9275, + "step": 7525 + }, + { + "epoch": 0.6081739025030809, + "grad_norm": 2.991436004638672, + "learning_rate": 8.155958324219761e-06, + "loss": 1.0333, + "step": 7526 + }, + { + "epoch": 0.6082547122164084, + "grad_norm": 2.6031649112701416, + "learning_rate": 8.155450756315393e-06, + "loss": 0.9937, + "step": 7527 + }, + { + "epoch": 0.608335521929736, + "grad_norm": 2.783151865005493, + "learning_rate": 8.15494313436575e-06, + "loss": 0.9891, + "step": 7528 + }, + { + "epoch": 0.6084163316430635, + "grad_norm": 2.942488193511963, + "learning_rate": 8.154435458379527e-06, + "loss": 1.0682, + "step": 7529 + }, + { + "epoch": 0.608497141356391, + "grad_norm": 2.9991884231567383, + "learning_rate": 8.153927728365416e-06, + "loss": 0.8114, + "step": 7530 + }, + { + "epoch": 0.6085779510697186, + "grad_norm": 2.4739584922790527, + "learning_rate": 8.153419944332117e-06, + "loss": 0.9758, + "step": 7531 + }, + { + "epoch": 0.6086587607830461, + "grad_norm": 2.494847297668457, + "learning_rate": 8.152912106288326e-06, + "loss": 1.0421, + "step": 7532 + }, + { + "epoch": 0.6087395704963736, + "grad_norm": 2.6952362060546875, + "learning_rate": 8.152404214242741e-06, + "loss": 0.9681, + "step": 7533 + }, + { + "epoch": 0.6088203802097012, + "grad_norm": 2.7268807888031006, + "learning_rate": 8.151896268204063e-06, + "loss": 0.9916, + "step": 7534 + }, + { + "epoch": 0.6089011899230288, + "grad_norm": 2.6562206745147705, + "learning_rate": 8.151388268180987e-06, + "loss": 0.8829, + "step": 7535 + }, + { + "epoch": 0.6089819996363562, + "grad_norm": 2.884728193283081, + "learning_rate": 8.150880214182217e-06, + "loss": 0.8888, + "step": 7536 + }, + { + "epoch": 0.6090628093496838, + "grad_norm": 3.1932179927825928, + "learning_rate": 8.150372106216455e-06, + "loss": 0.9161, + "step": 7537 + }, + { + "epoch": 0.6091436190630114, + "grad_norm": 2.834439277648926, + "learning_rate": 8.149863944292404e-06, + "loss": 0.9637, + "step": 7538 + }, + { + "epoch": 0.6092244287763389, + "grad_norm": 2.834709644317627, + "learning_rate": 8.149355728418764e-06, + "loss": 0.9427, + "step": 7539 + }, + { + "epoch": 0.6093052384896664, + "grad_norm": 2.3518810272216797, + "learning_rate": 8.148847458604245e-06, + "loss": 1.0289, + "step": 7540 + }, + { + "epoch": 0.609386048202994, + "grad_norm": 2.834364175796509, + "learning_rate": 8.148339134857548e-06, + "loss": 0.9526, + "step": 7541 + }, + { + "epoch": 0.6094668579163215, + "grad_norm": 2.6153712272644043, + "learning_rate": 8.14783075718738e-06, + "loss": 0.9339, + "step": 7542 + }, + { + "epoch": 0.6095476676296491, + "grad_norm": 2.8085949420928955, + "learning_rate": 8.14732232560245e-06, + "loss": 0.9856, + "step": 7543 + }, + { + "epoch": 0.6096284773429766, + "grad_norm": 2.8711469173431396, + "learning_rate": 8.146813840111467e-06, + "loss": 0.9479, + "step": 7544 + }, + { + "epoch": 0.6097092870563041, + "grad_norm": 2.610267400741577, + "learning_rate": 8.146305300723138e-06, + "loss": 0.982, + "step": 7545 + }, + { + "epoch": 0.6097900967696317, + "grad_norm": 2.5013654232025146, + "learning_rate": 8.145796707446173e-06, + "loss": 1.023, + "step": 7546 + }, + { + "epoch": 0.6098709064829593, + "grad_norm": 2.7156944274902344, + "learning_rate": 8.145288060289283e-06, + "loss": 0.9269, + "step": 7547 + }, + { + "epoch": 0.6099517161962867, + "grad_norm": 2.9470441341400146, + "learning_rate": 8.14477935926118e-06, + "loss": 0.997, + "step": 7548 + }, + { + "epoch": 0.6100325259096143, + "grad_norm": 2.524672031402588, + "learning_rate": 8.14427060437058e-06, + "loss": 0.9207, + "step": 7549 + }, + { + "epoch": 0.6101133356229419, + "grad_norm": 2.64919114112854, + "learning_rate": 8.143761795626192e-06, + "loss": 0.8817, + "step": 7550 + }, + { + "epoch": 0.6101941453362694, + "grad_norm": 2.4781274795532227, + "learning_rate": 8.143252933036733e-06, + "loss": 0.9171, + "step": 7551 + }, + { + "epoch": 0.610274955049597, + "grad_norm": 2.727407932281494, + "learning_rate": 8.142744016610919e-06, + "loss": 0.82, + "step": 7552 + }, + { + "epoch": 0.6103557647629245, + "grad_norm": 2.7073307037353516, + "learning_rate": 8.142235046357465e-06, + "loss": 0.8762, + "step": 7553 + }, + { + "epoch": 0.610436574476252, + "grad_norm": 2.8427350521087646, + "learning_rate": 8.141726022285091e-06, + "loss": 1.0253, + "step": 7554 + }, + { + "epoch": 0.6105173841895796, + "grad_norm": 2.542018413543701, + "learning_rate": 8.141216944402513e-06, + "loss": 1.0002, + "step": 7555 + }, + { + "epoch": 0.6105981939029071, + "grad_norm": 2.543044090270996, + "learning_rate": 8.140707812718449e-06, + "loss": 0.8985, + "step": 7556 + }, + { + "epoch": 0.6106790036162346, + "grad_norm": 2.4094595909118652, + "learning_rate": 8.140198627241623e-06, + "loss": 1.0356, + "step": 7557 + }, + { + "epoch": 0.6107598133295622, + "grad_norm": 2.7881250381469727, + "learning_rate": 8.139689387980754e-06, + "loss": 0.9284, + "step": 7558 + }, + { + "epoch": 0.6108406230428898, + "grad_norm": 2.783515691757202, + "learning_rate": 8.139180094944564e-06, + "loss": 0.9208, + "step": 7559 + }, + { + "epoch": 0.6109214327562172, + "grad_norm": 2.75824236869812, + "learning_rate": 8.138670748141775e-06, + "loss": 0.9096, + "step": 7560 + }, + { + "epoch": 0.6110022424695448, + "grad_norm": 2.610955238342285, + "learning_rate": 8.138161347581113e-06, + "loss": 0.8984, + "step": 7561 + }, + { + "epoch": 0.6110830521828724, + "grad_norm": 2.691279172897339, + "learning_rate": 8.137651893271303e-06, + "loss": 0.974, + "step": 7562 + }, + { + "epoch": 0.6111638618962, + "grad_norm": 2.675173759460449, + "learning_rate": 8.137142385221069e-06, + "loss": 0.9107, + "step": 7563 + }, + { + "epoch": 0.6112446716095274, + "grad_norm": 2.454118013381958, + "learning_rate": 8.136632823439136e-06, + "loss": 0.9351, + "step": 7564 + }, + { + "epoch": 0.611325481322855, + "grad_norm": 3.163738965988159, + "learning_rate": 8.136123207934238e-06, + "loss": 0.9331, + "step": 7565 + }, + { + "epoch": 0.6114062910361826, + "grad_norm": 2.5796146392822266, + "learning_rate": 8.135613538715097e-06, + "loss": 0.8914, + "step": 7566 + }, + { + "epoch": 0.6114871007495101, + "grad_norm": 2.610637903213501, + "learning_rate": 8.135103815790445e-06, + "loss": 0.9718, + "step": 7567 + }, + { + "epoch": 0.6115679104628376, + "grad_norm": 2.7604098320007324, + "learning_rate": 8.134594039169013e-06, + "loss": 0.913, + "step": 7568 + }, + { + "epoch": 0.6116487201761652, + "grad_norm": 2.9775872230529785, + "learning_rate": 8.13408420885953e-06, + "loss": 0.9978, + "step": 7569 + }, + { + "epoch": 0.6117295298894927, + "grad_norm": 2.9809317588806152, + "learning_rate": 8.13357432487073e-06, + "loss": 0.9162, + "step": 7570 + }, + { + "epoch": 0.6118103396028203, + "grad_norm": 2.4901397228240967, + "learning_rate": 8.133064387211344e-06, + "loss": 0.9872, + "step": 7571 + }, + { + "epoch": 0.6118911493161479, + "grad_norm": 3.06878399848938, + "learning_rate": 8.132554395890111e-06, + "loss": 0.8949, + "step": 7572 + }, + { + "epoch": 0.6119719590294753, + "grad_norm": 2.597285747528076, + "learning_rate": 8.13204435091576e-06, + "loss": 0.8572, + "step": 7573 + }, + { + "epoch": 0.6120527687428029, + "grad_norm": 2.7203357219696045, + "learning_rate": 8.13153425229703e-06, + "loss": 0.8784, + "step": 7574 + }, + { + "epoch": 0.6121335784561305, + "grad_norm": 2.711458206176758, + "learning_rate": 8.131024100042658e-06, + "loss": 0.9551, + "step": 7575 + }, + { + "epoch": 0.612214388169458, + "grad_norm": 2.75632381439209, + "learning_rate": 8.130513894161381e-06, + "loss": 0.9443, + "step": 7576 + }, + { + "epoch": 0.6122951978827855, + "grad_norm": 2.6390979290008545, + "learning_rate": 8.130003634661936e-06, + "loss": 0.9309, + "step": 7577 + }, + { + "epoch": 0.6123760075961131, + "grad_norm": 2.926806926727295, + "learning_rate": 8.129493321553063e-06, + "loss": 0.9948, + "step": 7578 + }, + { + "epoch": 0.6124568173094406, + "grad_norm": 2.803065061569214, + "learning_rate": 8.128982954843504e-06, + "loss": 0.9275, + "step": 7579 + }, + { + "epoch": 0.6125376270227681, + "grad_norm": 2.6212358474731445, + "learning_rate": 8.128472534542002e-06, + "loss": 0.972, + "step": 7580 + }, + { + "epoch": 0.6126184367360957, + "grad_norm": 2.381321668624878, + "learning_rate": 8.127962060657295e-06, + "loss": 0.8629, + "step": 7581 + }, + { + "epoch": 0.6126992464494232, + "grad_norm": 2.4566802978515625, + "learning_rate": 8.127451533198129e-06, + "loss": 1.0621, + "step": 7582 + }, + { + "epoch": 0.6127800561627508, + "grad_norm": 2.5291404724121094, + "learning_rate": 8.126940952173247e-06, + "loss": 1.0163, + "step": 7583 + }, + { + "epoch": 0.6128608658760784, + "grad_norm": 2.574113368988037, + "learning_rate": 8.126430317591395e-06, + "loss": 0.9515, + "step": 7584 + }, + { + "epoch": 0.6129416755894058, + "grad_norm": 2.8008768558502197, + "learning_rate": 8.125919629461317e-06, + "loss": 0.8503, + "step": 7585 + }, + { + "epoch": 0.6130224853027334, + "grad_norm": 2.6720664501190186, + "learning_rate": 8.125408887791761e-06, + "loss": 1.0535, + "step": 7586 + }, + { + "epoch": 0.613103295016061, + "grad_norm": 2.9697463512420654, + "learning_rate": 8.124898092591474e-06, + "loss": 1.0389, + "step": 7587 + }, + { + "epoch": 0.6131841047293884, + "grad_norm": 2.585587978363037, + "learning_rate": 8.124387243869208e-06, + "loss": 0.8858, + "step": 7588 + }, + { + "epoch": 0.613264914442716, + "grad_norm": 3.2030749320983887, + "learning_rate": 8.123876341633707e-06, + "loss": 0.9673, + "step": 7589 + }, + { + "epoch": 0.6133457241560436, + "grad_norm": 2.4657204151153564, + "learning_rate": 8.123365385893728e-06, + "loss": 1.0361, + "step": 7590 + }, + { + "epoch": 0.6134265338693711, + "grad_norm": 2.3138375282287598, + "learning_rate": 8.122854376658019e-06, + "loss": 1.0051, + "step": 7591 + }, + { + "epoch": 0.6135073435826986, + "grad_norm": 2.5959856510162354, + "learning_rate": 8.122343313935331e-06, + "loss": 0.9098, + "step": 7592 + }, + { + "epoch": 0.6135881532960262, + "grad_norm": 2.63698673248291, + "learning_rate": 8.121832197734419e-06, + "loss": 1.0095, + "step": 7593 + }, + { + "epoch": 0.6136689630093537, + "grad_norm": 2.67195987701416, + "learning_rate": 8.121321028064038e-06, + "loss": 0.9975, + "step": 7594 + }, + { + "epoch": 0.6137497727226813, + "grad_norm": 2.6742136478424072, + "learning_rate": 8.120809804932938e-06, + "loss": 0.9893, + "step": 7595 + }, + { + "epoch": 0.6138305824360089, + "grad_norm": 2.8699092864990234, + "learning_rate": 8.120298528349883e-06, + "loss": 0.9978, + "step": 7596 + }, + { + "epoch": 0.6139113921493363, + "grad_norm": 2.471646308898926, + "learning_rate": 8.119787198323626e-06, + "loss": 0.9892, + "step": 7597 + }, + { + "epoch": 0.6139922018626639, + "grad_norm": 2.8393266201019287, + "learning_rate": 8.119275814862924e-06, + "loss": 1.0142, + "step": 7598 + }, + { + "epoch": 0.6140730115759915, + "grad_norm": 2.6085870265960693, + "learning_rate": 8.118764377976537e-06, + "loss": 1.04, + "step": 7599 + }, + { + "epoch": 0.614153821289319, + "grad_norm": 2.986100673675537, + "learning_rate": 8.118252887673224e-06, + "loss": 0.9623, + "step": 7600 + }, + { + "epoch": 0.6142346310026465, + "grad_norm": 3.16513991355896, + "learning_rate": 8.117741343961746e-06, + "loss": 0.9224, + "step": 7601 + }, + { + "epoch": 0.6143154407159741, + "grad_norm": 2.460918426513672, + "learning_rate": 8.117229746850866e-06, + "loss": 0.9181, + "step": 7602 + }, + { + "epoch": 0.6143962504293016, + "grad_norm": 2.756314516067505, + "learning_rate": 8.116718096349341e-06, + "loss": 0.9514, + "step": 7603 + }, + { + "epoch": 0.6144770601426292, + "grad_norm": 2.6732490062713623, + "learning_rate": 8.116206392465942e-06, + "loss": 1.0893, + "step": 7604 + }, + { + "epoch": 0.6145578698559567, + "grad_norm": 2.8199009895324707, + "learning_rate": 8.115694635209427e-06, + "loss": 0.9249, + "step": 7605 + }, + { + "epoch": 0.6146386795692842, + "grad_norm": 2.402858257293701, + "learning_rate": 8.115182824588565e-06, + "loss": 0.9422, + "step": 7606 + }, + { + "epoch": 0.6147194892826118, + "grad_norm": 3.0272507667541504, + "learning_rate": 8.11467096061212e-06, + "loss": 0.9662, + "step": 7607 + }, + { + "epoch": 0.6148002989959394, + "grad_norm": 3.0503029823303223, + "learning_rate": 8.114159043288861e-06, + "loss": 0.839, + "step": 7608 + }, + { + "epoch": 0.6148811087092668, + "grad_norm": 2.922384023666382, + "learning_rate": 8.113647072627553e-06, + "loss": 0.8798, + "step": 7609 + }, + { + "epoch": 0.6149619184225944, + "grad_norm": 2.7376458644866943, + "learning_rate": 8.113135048636967e-06, + "loss": 0.9187, + "step": 7610 + }, + { + "epoch": 0.615042728135922, + "grad_norm": 3.5451629161834717, + "learning_rate": 8.112622971325872e-06, + "loss": 0.9327, + "step": 7611 + }, + { + "epoch": 0.6151235378492494, + "grad_norm": 2.669274091720581, + "learning_rate": 8.112110840703038e-06, + "loss": 1.004, + "step": 7612 + }, + { + "epoch": 0.615204347562577, + "grad_norm": 2.3365893363952637, + "learning_rate": 8.11159865677724e-06, + "loss": 1.1307, + "step": 7613 + }, + { + "epoch": 0.6152851572759046, + "grad_norm": 2.4509685039520264, + "learning_rate": 8.111086419557246e-06, + "loss": 1.0756, + "step": 7614 + }, + { + "epoch": 0.6153659669892321, + "grad_norm": 2.6364622116088867, + "learning_rate": 8.110574129051831e-06, + "loss": 1.0557, + "step": 7615 + }, + { + "epoch": 0.6154467767025597, + "grad_norm": 2.7215840816497803, + "learning_rate": 8.110061785269772e-06, + "loss": 0.881, + "step": 7616 + }, + { + "epoch": 0.6155275864158872, + "grad_norm": 2.5293588638305664, + "learning_rate": 8.109549388219838e-06, + "loss": 0.9913, + "step": 7617 + }, + { + "epoch": 0.6156083961292147, + "grad_norm": 2.808417320251465, + "learning_rate": 8.10903693791081e-06, + "loss": 0.9183, + "step": 7618 + }, + { + "epoch": 0.6156892058425423, + "grad_norm": 2.7782840728759766, + "learning_rate": 8.108524434351466e-06, + "loss": 0.9487, + "step": 7619 + }, + { + "epoch": 0.6157700155558699, + "grad_norm": 2.7505362033843994, + "learning_rate": 8.108011877550581e-06, + "loss": 0.8463, + "step": 7620 + }, + { + "epoch": 0.6158508252691973, + "grad_norm": 2.883913040161133, + "learning_rate": 8.107499267516934e-06, + "loss": 0.8409, + "step": 7621 + }, + { + "epoch": 0.6159316349825249, + "grad_norm": 2.6276159286499023, + "learning_rate": 8.106986604259307e-06, + "loss": 0.947, + "step": 7622 + }, + { + "epoch": 0.6160124446958525, + "grad_norm": 2.501807928085327, + "learning_rate": 8.106473887786478e-06, + "loss": 0.7892, + "step": 7623 + }, + { + "epoch": 0.61609325440918, + "grad_norm": 2.743285655975342, + "learning_rate": 8.10596111810723e-06, + "loss": 1.0016, + "step": 7624 + }, + { + "epoch": 0.6161740641225075, + "grad_norm": 3.7059848308563232, + "learning_rate": 8.105448295230345e-06, + "loss": 0.8897, + "step": 7625 + }, + { + "epoch": 0.6162548738358351, + "grad_norm": 2.7780449390411377, + "learning_rate": 8.104935419164608e-06, + "loss": 0.9984, + "step": 7626 + }, + { + "epoch": 0.6163356835491626, + "grad_norm": 2.756298542022705, + "learning_rate": 8.104422489918802e-06, + "loss": 1.0321, + "step": 7627 + }, + { + "epoch": 0.6164164932624902, + "grad_norm": 2.452901601791382, + "learning_rate": 8.10390950750171e-06, + "loss": 0.9096, + "step": 7628 + }, + { + "epoch": 0.6164973029758177, + "grad_norm": 2.7609567642211914, + "learning_rate": 8.103396471922123e-06, + "loss": 0.975, + "step": 7629 + }, + { + "epoch": 0.6165781126891452, + "grad_norm": 2.735701560974121, + "learning_rate": 8.102883383188825e-06, + "loss": 0.9084, + "step": 7630 + }, + { + "epoch": 0.6166589224024728, + "grad_norm": 2.292210817337036, + "learning_rate": 8.102370241310605e-06, + "loss": 0.9648, + "step": 7631 + }, + { + "epoch": 0.6167397321158004, + "grad_norm": 3.1585936546325684, + "learning_rate": 8.10185704629625e-06, + "loss": 1.0051, + "step": 7632 + }, + { + "epoch": 0.6168205418291278, + "grad_norm": 2.4181199073791504, + "learning_rate": 8.101343798154551e-06, + "loss": 1.0571, + "step": 7633 + }, + { + "epoch": 0.6169013515424554, + "grad_norm": 2.7890429496765137, + "learning_rate": 8.1008304968943e-06, + "loss": 0.949, + "step": 7634 + }, + { + "epoch": 0.616982161255783, + "grad_norm": 2.5355355739593506, + "learning_rate": 8.100317142524287e-06, + "loss": 0.9061, + "step": 7635 + }, + { + "epoch": 0.6170629709691104, + "grad_norm": 2.5393078327178955, + "learning_rate": 8.099803735053306e-06, + "loss": 1.008, + "step": 7636 + }, + { + "epoch": 0.617143780682438, + "grad_norm": 2.9275808334350586, + "learning_rate": 8.099290274490149e-06, + "loss": 0.9149, + "step": 7637 + }, + { + "epoch": 0.6172245903957656, + "grad_norm": 2.668325901031494, + "learning_rate": 8.09877676084361e-06, + "loss": 0.9632, + "step": 7638 + }, + { + "epoch": 0.6173054001090931, + "grad_norm": 2.934856414794922, + "learning_rate": 8.098263194122486e-06, + "loss": 0.9808, + "step": 7639 + }, + { + "epoch": 0.6173862098224207, + "grad_norm": 3.2335379123687744, + "learning_rate": 8.097749574335573e-06, + "loss": 0.8622, + "step": 7640 + }, + { + "epoch": 0.6174670195357482, + "grad_norm": 2.827376127243042, + "learning_rate": 8.097235901491667e-06, + "loss": 0.9455, + "step": 7641 + }, + { + "epoch": 0.6175478292490757, + "grad_norm": 3.290057420730591, + "learning_rate": 8.096722175599566e-06, + "loss": 0.866, + "step": 7642 + }, + { + "epoch": 0.6176286389624033, + "grad_norm": 2.7397851943969727, + "learning_rate": 8.096208396668068e-06, + "loss": 1.0122, + "step": 7643 + }, + { + "epoch": 0.6177094486757309, + "grad_norm": 2.768148183822632, + "learning_rate": 8.095694564705974e-06, + "loss": 1.0163, + "step": 7644 + }, + { + "epoch": 0.6177902583890583, + "grad_norm": 2.8673741817474365, + "learning_rate": 8.095180679722085e-06, + "loss": 0.9631, + "step": 7645 + }, + { + "epoch": 0.6178710681023859, + "grad_norm": 2.8062868118286133, + "learning_rate": 8.094666741725203e-06, + "loss": 1.0469, + "step": 7646 + }, + { + "epoch": 0.6179518778157135, + "grad_norm": 2.763274669647217, + "learning_rate": 8.094152750724132e-06, + "loss": 1.031, + "step": 7647 + }, + { + "epoch": 0.618032687529041, + "grad_norm": 2.3404388427734375, + "learning_rate": 8.09363870672767e-06, + "loss": 0.9555, + "step": 7648 + }, + { + "epoch": 0.6181134972423685, + "grad_norm": 2.8336215019226074, + "learning_rate": 8.093124609744625e-06, + "loss": 1.0455, + "step": 7649 + }, + { + "epoch": 0.6181943069556961, + "grad_norm": 2.526571035385132, + "learning_rate": 8.092610459783802e-06, + "loss": 0.8752, + "step": 7650 + }, + { + "epoch": 0.6182751166690236, + "grad_norm": 2.445449113845825, + "learning_rate": 8.092096256854007e-06, + "loss": 0.9826, + "step": 7651 + }, + { + "epoch": 0.6183559263823512, + "grad_norm": 2.870954990386963, + "learning_rate": 8.091582000964049e-06, + "loss": 0.9998, + "step": 7652 + }, + { + "epoch": 0.6184367360956787, + "grad_norm": 2.7198636531829834, + "learning_rate": 8.091067692122731e-06, + "loss": 0.9769, + "step": 7653 + }, + { + "epoch": 0.6185175458090062, + "grad_norm": 2.578165292739868, + "learning_rate": 8.090553330338867e-06, + "loss": 0.8554, + "step": 7654 + }, + { + "epoch": 0.6185983555223338, + "grad_norm": 2.680886745452881, + "learning_rate": 8.090038915621263e-06, + "loss": 0.9601, + "step": 7655 + }, + { + "epoch": 0.6186791652356614, + "grad_norm": 2.909229278564453, + "learning_rate": 8.089524447978734e-06, + "loss": 0.8381, + "step": 7656 + }, + { + "epoch": 0.6187599749489888, + "grad_norm": 2.6055145263671875, + "learning_rate": 8.089009927420087e-06, + "loss": 0.9023, + "step": 7657 + }, + { + "epoch": 0.6188407846623164, + "grad_norm": 3.098064422607422, + "learning_rate": 8.088495353954135e-06, + "loss": 1.0305, + "step": 7658 + }, + { + "epoch": 0.618921594375644, + "grad_norm": 2.6698648929595947, + "learning_rate": 8.087980727589695e-06, + "loss": 0.8798, + "step": 7659 + }, + { + "epoch": 0.6190024040889714, + "grad_norm": 2.487757444381714, + "learning_rate": 8.087466048335578e-06, + "loss": 0.9364, + "step": 7660 + }, + { + "epoch": 0.619083213802299, + "grad_norm": 2.4260802268981934, + "learning_rate": 8.086951316200603e-06, + "loss": 0.8854, + "step": 7661 + }, + { + "epoch": 0.6191640235156266, + "grad_norm": 2.9222092628479004, + "learning_rate": 8.08643653119358e-06, + "loss": 1.0188, + "step": 7662 + }, + { + "epoch": 0.6192448332289541, + "grad_norm": 2.7569892406463623, + "learning_rate": 8.085921693323331e-06, + "loss": 0.9232, + "step": 7663 + }, + { + "epoch": 0.6193256429422817, + "grad_norm": 3.115506410598755, + "learning_rate": 8.085406802598671e-06, + "loss": 0.9975, + "step": 7664 + }, + { + "epoch": 0.6194064526556092, + "grad_norm": 2.9450008869171143, + "learning_rate": 8.084891859028423e-06, + "loss": 0.9828, + "step": 7665 + }, + { + "epoch": 0.6194872623689367, + "grad_norm": 2.8621959686279297, + "learning_rate": 8.084376862621402e-06, + "loss": 0.9365, + "step": 7666 + }, + { + "epoch": 0.6195680720822643, + "grad_norm": 2.434705972671509, + "learning_rate": 8.083861813386433e-06, + "loss": 1.0411, + "step": 7667 + }, + { + "epoch": 0.6196488817955919, + "grad_norm": 2.969108819961548, + "learning_rate": 8.083346711332332e-06, + "loss": 0.9242, + "step": 7668 + }, + { + "epoch": 0.6197296915089193, + "grad_norm": 2.6051666736602783, + "learning_rate": 8.082831556467927e-06, + "loss": 0.9473, + "step": 7669 + }, + { + "epoch": 0.6198105012222469, + "grad_norm": 3.0353684425354004, + "learning_rate": 8.082316348802038e-06, + "loss": 1.0033, + "step": 7670 + }, + { + "epoch": 0.6198913109355745, + "grad_norm": 2.894141912460327, + "learning_rate": 8.08180108834349e-06, + "loss": 1.0795, + "step": 7671 + }, + { + "epoch": 0.619972120648902, + "grad_norm": 2.572617769241333, + "learning_rate": 8.081285775101111e-06, + "loss": 0.9461, + "step": 7672 + }, + { + "epoch": 0.6200529303622295, + "grad_norm": 3.017763376235962, + "learning_rate": 8.080770409083722e-06, + "loss": 1.0067, + "step": 7673 + }, + { + "epoch": 0.6201337400755571, + "grad_norm": 2.480952262878418, + "learning_rate": 8.080254990300153e-06, + "loss": 0.9987, + "step": 7674 + }, + { + "epoch": 0.6202145497888846, + "grad_norm": 2.371596574783325, + "learning_rate": 8.079739518759232e-06, + "loss": 0.9978, + "step": 7675 + }, + { + "epoch": 0.6202953595022122, + "grad_norm": 2.7950267791748047, + "learning_rate": 8.079223994469786e-06, + "loss": 0.9195, + "step": 7676 + }, + { + "epoch": 0.6203761692155397, + "grad_norm": 3.1806201934814453, + "learning_rate": 8.078708417440647e-06, + "loss": 0.9923, + "step": 7677 + }, + { + "epoch": 0.6204569789288672, + "grad_norm": 3.063190460205078, + "learning_rate": 8.078192787680645e-06, + "loss": 0.9292, + "step": 7678 + }, + { + "epoch": 0.6205377886421948, + "grad_norm": 2.8096413612365723, + "learning_rate": 8.07767710519861e-06, + "loss": 0.9318, + "step": 7679 + }, + { + "epoch": 0.6206185983555224, + "grad_norm": 2.7971744537353516, + "learning_rate": 8.077161370003376e-06, + "loss": 0.8552, + "step": 7680 + }, + { + "epoch": 0.6206994080688498, + "grad_norm": 2.738651752471924, + "learning_rate": 8.076645582103775e-06, + "loss": 0.8102, + "step": 7681 + }, + { + "epoch": 0.6207802177821774, + "grad_norm": 2.5714449882507324, + "learning_rate": 8.076129741508642e-06, + "loss": 0.9529, + "step": 7682 + }, + { + "epoch": 0.620861027495505, + "grad_norm": 2.2240352630615234, + "learning_rate": 8.075613848226813e-06, + "loss": 0.9133, + "step": 7683 + }, + { + "epoch": 0.6209418372088324, + "grad_norm": 2.441371202468872, + "learning_rate": 8.075097902267122e-06, + "loss": 0.8994, + "step": 7684 + }, + { + "epoch": 0.62102264692216, + "grad_norm": 2.7136309146881104, + "learning_rate": 8.074581903638408e-06, + "loss": 1.0391, + "step": 7685 + }, + { + "epoch": 0.6211034566354876, + "grad_norm": 2.485348701477051, + "learning_rate": 8.074065852349506e-06, + "loss": 1.1127, + "step": 7686 + }, + { + "epoch": 0.6211842663488151, + "grad_norm": 2.329624891281128, + "learning_rate": 8.073549748409258e-06, + "loss": 1.017, + "step": 7687 + }, + { + "epoch": 0.6212650760621427, + "grad_norm": 3.4000937938690186, + "learning_rate": 8.073033591826502e-06, + "loss": 0.9897, + "step": 7688 + }, + { + "epoch": 0.6213458857754702, + "grad_norm": 2.233670234680176, + "learning_rate": 8.072517382610077e-06, + "loss": 1.015, + "step": 7689 + }, + { + "epoch": 0.6214266954887978, + "grad_norm": 2.7281813621520996, + "learning_rate": 8.072001120768827e-06, + "loss": 1.0299, + "step": 7690 + }, + { + "epoch": 0.6215075052021253, + "grad_norm": 2.977562189102173, + "learning_rate": 8.071484806311593e-06, + "loss": 1.133, + "step": 7691 + }, + { + "epoch": 0.6215883149154529, + "grad_norm": 3.0641415119171143, + "learning_rate": 8.070968439247219e-06, + "loss": 0.9024, + "step": 7692 + }, + { + "epoch": 0.6216691246287804, + "grad_norm": 2.2583773136138916, + "learning_rate": 8.070452019584549e-06, + "loss": 0.9157, + "step": 7693 + }, + { + "epoch": 0.6217499343421079, + "grad_norm": 2.4764106273651123, + "learning_rate": 8.069935547332427e-06, + "loss": 0.9392, + "step": 7694 + }, + { + "epoch": 0.6218307440554355, + "grad_norm": 2.8662688732147217, + "learning_rate": 8.0694190224997e-06, + "loss": 1.0317, + "step": 7695 + }, + { + "epoch": 0.6219115537687631, + "grad_norm": 3.5188772678375244, + "learning_rate": 8.068902445095216e-06, + "loss": 0.89, + "step": 7696 + }, + { + "epoch": 0.6219923634820905, + "grad_norm": 2.7252142429351807, + "learning_rate": 8.06838581512782e-06, + "loss": 0.9942, + "step": 7697 + }, + { + "epoch": 0.6220731731954181, + "grad_norm": 2.730548143386841, + "learning_rate": 8.06786913260636e-06, + "loss": 0.9666, + "step": 7698 + }, + { + "epoch": 0.6221539829087457, + "grad_norm": 2.656900644302368, + "learning_rate": 8.06735239753969e-06, + "loss": 0.8996, + "step": 7699 + }, + { + "epoch": 0.6222347926220732, + "grad_norm": 3.110222339630127, + "learning_rate": 8.066835609936656e-06, + "loss": 0.8834, + "step": 7700 + }, + { + "epoch": 0.6223156023354007, + "grad_norm": 2.502372980117798, + "learning_rate": 8.066318769806111e-06, + "loss": 0.8727, + "step": 7701 + }, + { + "epoch": 0.6223964120487283, + "grad_norm": 3.104184627532959, + "learning_rate": 8.06580187715691e-06, + "loss": 0.883, + "step": 7702 + }, + { + "epoch": 0.6224772217620558, + "grad_norm": 2.5845611095428467, + "learning_rate": 8.0652849319979e-06, + "loss": 0.9605, + "step": 7703 + }, + { + "epoch": 0.6225580314753834, + "grad_norm": 2.477426767349243, + "learning_rate": 8.064767934337942e-06, + "loss": 1.0792, + "step": 7704 + }, + { + "epoch": 0.6226388411887109, + "grad_norm": 2.931091547012329, + "learning_rate": 8.064250884185884e-06, + "loss": 0.9479, + "step": 7705 + }, + { + "epoch": 0.6227196509020384, + "grad_norm": 2.541217803955078, + "learning_rate": 8.063733781550588e-06, + "loss": 1.0394, + "step": 7706 + }, + { + "epoch": 0.622800460615366, + "grad_norm": 3.0264840126037598, + "learning_rate": 8.063216626440907e-06, + "loss": 0.9937, + "step": 7707 + }, + { + "epoch": 0.6228812703286936, + "grad_norm": 2.858987808227539, + "learning_rate": 8.062699418865697e-06, + "loss": 0.9599, + "step": 7708 + }, + { + "epoch": 0.622962080042021, + "grad_norm": 2.7067854404449463, + "learning_rate": 8.062182158833824e-06, + "loss": 0.9244, + "step": 7709 + }, + { + "epoch": 0.6230428897553486, + "grad_norm": 2.5970213413238525, + "learning_rate": 8.061664846354138e-06, + "loss": 0.895, + "step": 7710 + }, + { + "epoch": 0.6231236994686762, + "grad_norm": 2.6465702056884766, + "learning_rate": 8.061147481435507e-06, + "loss": 0.9913, + "step": 7711 + }, + { + "epoch": 0.6232045091820037, + "grad_norm": 2.559605598449707, + "learning_rate": 8.060630064086788e-06, + "loss": 1.0017, + "step": 7712 + }, + { + "epoch": 0.6232853188953312, + "grad_norm": 2.321462869644165, + "learning_rate": 8.060112594316843e-06, + "loss": 0.9695, + "step": 7713 + }, + { + "epoch": 0.6233661286086588, + "grad_norm": 2.7076878547668457, + "learning_rate": 8.059595072134538e-06, + "loss": 0.8661, + "step": 7714 + }, + { + "epoch": 0.6234469383219863, + "grad_norm": 2.558384656906128, + "learning_rate": 8.059077497548733e-06, + "loss": 0.8615, + "step": 7715 + }, + { + "epoch": 0.6235277480353139, + "grad_norm": 2.7877197265625, + "learning_rate": 8.058559870568297e-06, + "loss": 0.9967, + "step": 7716 + }, + { + "epoch": 0.6236085577486414, + "grad_norm": 2.6813652515411377, + "learning_rate": 8.058042191202094e-06, + "loss": 1.0862, + "step": 7717 + }, + { + "epoch": 0.6236893674619689, + "grad_norm": 2.652118444442749, + "learning_rate": 8.057524459458988e-06, + "loss": 0.9581, + "step": 7718 + }, + { + "epoch": 0.6237701771752965, + "grad_norm": 3.2014918327331543, + "learning_rate": 8.05700667534785e-06, + "loss": 0.9084, + "step": 7719 + }, + { + "epoch": 0.6238509868886241, + "grad_norm": 2.563494920730591, + "learning_rate": 8.056488838877547e-06, + "loss": 0.8831, + "step": 7720 + }, + { + "epoch": 0.6239317966019515, + "grad_norm": 3.018420457839966, + "learning_rate": 8.055970950056946e-06, + "loss": 0.8915, + "step": 7721 + }, + { + "epoch": 0.6240126063152791, + "grad_norm": 2.609955310821533, + "learning_rate": 8.055453008894922e-06, + "loss": 1.0439, + "step": 7722 + }, + { + "epoch": 0.6240934160286067, + "grad_norm": 2.7135331630706787, + "learning_rate": 8.054935015400345e-06, + "loss": 0.809, + "step": 7723 + }, + { + "epoch": 0.6241742257419342, + "grad_norm": 2.5502469539642334, + "learning_rate": 8.054416969582085e-06, + "loss": 1.0508, + "step": 7724 + }, + { + "epoch": 0.6242550354552617, + "grad_norm": 2.7606074810028076, + "learning_rate": 8.053898871449013e-06, + "loss": 1.0064, + "step": 7725 + }, + { + "epoch": 0.6243358451685893, + "grad_norm": 2.7798190116882324, + "learning_rate": 8.053380721010007e-06, + "loss": 0.9649, + "step": 7726 + }, + { + "epoch": 0.6244166548819168, + "grad_norm": 2.289156436920166, + "learning_rate": 8.052862518273939e-06, + "loss": 0.9963, + "step": 7727 + }, + { + "epoch": 0.6244974645952444, + "grad_norm": 2.4941487312316895, + "learning_rate": 8.052344263249688e-06, + "loss": 0.9801, + "step": 7728 + }, + { + "epoch": 0.6245782743085719, + "grad_norm": 2.8087286949157715, + "learning_rate": 8.051825955946124e-06, + "loss": 1.0877, + "step": 7729 + }, + { + "epoch": 0.6246590840218994, + "grad_norm": 2.871695041656494, + "learning_rate": 8.051307596372132e-06, + "loss": 1.0272, + "step": 7730 + }, + { + "epoch": 0.624739893735227, + "grad_norm": 1.9633084535598755, + "learning_rate": 8.050789184536584e-06, + "loss": 1.0399, + "step": 7731 + }, + { + "epoch": 0.6248207034485546, + "grad_norm": 2.7255969047546387, + "learning_rate": 8.050270720448364e-06, + "loss": 0.8947, + "step": 7732 + }, + { + "epoch": 0.624901513161882, + "grad_norm": 2.166358232498169, + "learning_rate": 8.049752204116349e-06, + "loss": 0.9626, + "step": 7733 + }, + { + "epoch": 0.6249823228752096, + "grad_norm": 2.5491621494293213, + "learning_rate": 8.049233635549421e-06, + "loss": 0.943, + "step": 7734 + }, + { + "epoch": 0.6250631325885372, + "grad_norm": 2.7059831619262695, + "learning_rate": 8.048715014756462e-06, + "loss": 1.0776, + "step": 7735 + }, + { + "epoch": 0.6251439423018647, + "grad_norm": 2.5231704711914062, + "learning_rate": 8.048196341746353e-06, + "loss": 0.9624, + "step": 7736 + }, + { + "epoch": 0.6252247520151922, + "grad_norm": 2.5308783054351807, + "learning_rate": 8.047677616527979e-06, + "loss": 0.8851, + "step": 7737 + }, + { + "epoch": 0.6253055617285198, + "grad_norm": 3.063410520553589, + "learning_rate": 8.047158839110223e-06, + "loss": 0.8693, + "step": 7738 + }, + { + "epoch": 0.6253863714418473, + "grad_norm": 2.350961208343506, + "learning_rate": 8.046640009501973e-06, + "loss": 0.9015, + "step": 7739 + }, + { + "epoch": 0.6254671811551749, + "grad_norm": 2.388249635696411, + "learning_rate": 8.046121127712116e-06, + "loss": 0.8678, + "step": 7740 + }, + { + "epoch": 0.6255479908685024, + "grad_norm": 2.5745139122009277, + "learning_rate": 8.045602193749536e-06, + "loss": 0.9142, + "step": 7741 + }, + { + "epoch": 0.6256288005818299, + "grad_norm": 2.595492362976074, + "learning_rate": 8.045083207623122e-06, + "loss": 0.9591, + "step": 7742 + }, + { + "epoch": 0.6257096102951575, + "grad_norm": 2.4084506034851074, + "learning_rate": 8.044564169341765e-06, + "loss": 0.9244, + "step": 7743 + }, + { + "epoch": 0.6257904200084851, + "grad_norm": 3.108823537826538, + "learning_rate": 8.04404507891435e-06, + "loss": 1.0732, + "step": 7744 + }, + { + "epoch": 0.6258712297218125, + "grad_norm": 2.862159013748169, + "learning_rate": 8.043525936349775e-06, + "loss": 1.0014, + "step": 7745 + }, + { + "epoch": 0.6259520394351401, + "grad_norm": 2.857966423034668, + "learning_rate": 8.043006741656925e-06, + "loss": 0.8613, + "step": 7746 + }, + { + "epoch": 0.6260328491484677, + "grad_norm": 2.724604368209839, + "learning_rate": 8.042487494844695e-06, + "loss": 0.8828, + "step": 7747 + }, + { + "epoch": 0.6261136588617952, + "grad_norm": 2.9525856971740723, + "learning_rate": 8.041968195921981e-06, + "loss": 0.9661, + "step": 7748 + }, + { + "epoch": 0.6261944685751227, + "grad_norm": 2.5938563346862793, + "learning_rate": 8.041448844897672e-06, + "loss": 1.0016, + "step": 7749 + }, + { + "epoch": 0.6262752782884503, + "grad_norm": 2.6016035079956055, + "learning_rate": 8.04092944178067e-06, + "loss": 0.8378, + "step": 7750 + }, + { + "epoch": 0.6263560880017778, + "grad_norm": 2.6379079818725586, + "learning_rate": 8.040409986579865e-06, + "loss": 1.1437, + "step": 7751 + }, + { + "epoch": 0.6264368977151054, + "grad_norm": 2.7775843143463135, + "learning_rate": 8.039890479304156e-06, + "loss": 0.9355, + "step": 7752 + }, + { + "epoch": 0.6265177074284329, + "grad_norm": 2.626626968383789, + "learning_rate": 8.039370919962443e-06, + "loss": 0.9042, + "step": 7753 + }, + { + "epoch": 0.6265985171417604, + "grad_norm": 2.5845067501068115, + "learning_rate": 8.038851308563621e-06, + "loss": 0.9599, + "step": 7754 + }, + { + "epoch": 0.626679326855088, + "grad_norm": 2.277064085006714, + "learning_rate": 8.038331645116593e-06, + "loss": 0.9462, + "step": 7755 + }, + { + "epoch": 0.6267601365684156, + "grad_norm": 2.5511014461517334, + "learning_rate": 8.037811929630258e-06, + "loss": 0.989, + "step": 7756 + }, + { + "epoch": 0.626840946281743, + "grad_norm": 2.4857261180877686, + "learning_rate": 8.037292162113519e-06, + "loss": 0.989, + "step": 7757 + }, + { + "epoch": 0.6269217559950706, + "grad_norm": 2.708094835281372, + "learning_rate": 8.036772342575277e-06, + "loss": 0.9264, + "step": 7758 + }, + { + "epoch": 0.6270025657083982, + "grad_norm": 3.272844076156616, + "learning_rate": 8.036252471024436e-06, + "loss": 0.9398, + "step": 7759 + }, + { + "epoch": 0.6270833754217257, + "grad_norm": 2.653989553451538, + "learning_rate": 8.035732547469897e-06, + "loss": 1.1018, + "step": 7760 + }, + { + "epoch": 0.6271641851350532, + "grad_norm": 2.796238899230957, + "learning_rate": 8.035212571920571e-06, + "loss": 1.0137, + "step": 7761 + }, + { + "epoch": 0.6272449948483808, + "grad_norm": 2.274282217025757, + "learning_rate": 8.034692544385359e-06, + "loss": 1.0252, + "step": 7762 + }, + { + "epoch": 0.6273258045617083, + "grad_norm": 2.3375332355499268, + "learning_rate": 8.034172464873169e-06, + "loss": 1.0097, + "step": 7763 + }, + { + "epoch": 0.6274066142750359, + "grad_norm": 3.058797597885132, + "learning_rate": 8.03365233339291e-06, + "loss": 0.9241, + "step": 7764 + }, + { + "epoch": 0.6274874239883634, + "grad_norm": 2.5773017406463623, + "learning_rate": 8.033132149953489e-06, + "loss": 0.8924, + "step": 7765 + }, + { + "epoch": 0.6275682337016909, + "grad_norm": 2.597038507461548, + "learning_rate": 8.032611914563816e-06, + "loss": 1.0032, + "step": 7766 + }, + { + "epoch": 0.6276490434150185, + "grad_norm": 2.7467923164367676, + "learning_rate": 8.032091627232803e-06, + "loss": 1.0524, + "step": 7767 + }, + { + "epoch": 0.6277298531283461, + "grad_norm": 2.6006813049316406, + "learning_rate": 8.031571287969359e-06, + "loss": 1.0723, + "step": 7768 + }, + { + "epoch": 0.6278106628416735, + "grad_norm": 2.623356580734253, + "learning_rate": 8.031050896782397e-06, + "loss": 0.8168, + "step": 7769 + }, + { + "epoch": 0.6278914725550011, + "grad_norm": 2.6809449195861816, + "learning_rate": 8.030530453680832e-06, + "loss": 0.9118, + "step": 7770 + }, + { + "epoch": 0.6279722822683287, + "grad_norm": 3.0026824474334717, + "learning_rate": 8.030009958673573e-06, + "loss": 0.8335, + "step": 7771 + }, + { + "epoch": 0.6280530919816562, + "grad_norm": 2.902270555496216, + "learning_rate": 8.02948941176954e-06, + "loss": 0.9533, + "step": 7772 + }, + { + "epoch": 0.6281339016949837, + "grad_norm": 2.7863898277282715, + "learning_rate": 8.028968812977645e-06, + "loss": 0.9708, + "step": 7773 + }, + { + "epoch": 0.6282147114083113, + "grad_norm": 3.016594171524048, + "learning_rate": 8.028448162306807e-06, + "loss": 1.0456, + "step": 7774 + }, + { + "epoch": 0.6282955211216388, + "grad_norm": 2.954946994781494, + "learning_rate": 8.027927459765944e-06, + "loss": 1.0253, + "step": 7775 + }, + { + "epoch": 0.6283763308349664, + "grad_norm": 2.2077760696411133, + "learning_rate": 8.02740670536397e-06, + "loss": 0.945, + "step": 7776 + }, + { + "epoch": 0.6284571405482939, + "grad_norm": 2.8146488666534424, + "learning_rate": 8.026885899109808e-06, + "loss": 0.7922, + "step": 7777 + }, + { + "epoch": 0.6285379502616214, + "grad_norm": 2.6830267906188965, + "learning_rate": 8.02636504101238e-06, + "loss": 1.0701, + "step": 7778 + }, + { + "epoch": 0.628618759974949, + "grad_norm": 2.534834384918213, + "learning_rate": 8.025844131080602e-06, + "loss": 0.892, + "step": 7779 + }, + { + "epoch": 0.6286995696882766, + "grad_norm": 3.6110360622406006, + "learning_rate": 8.0253231693234e-06, + "loss": 0.8395, + "step": 7780 + }, + { + "epoch": 0.628780379401604, + "grad_norm": 2.882394790649414, + "learning_rate": 8.024802155749696e-06, + "loss": 0.9216, + "step": 7781 + }, + { + "epoch": 0.6288611891149316, + "grad_norm": 2.889479398727417, + "learning_rate": 8.02428109036841e-06, + "loss": 0.9265, + "step": 7782 + }, + { + "epoch": 0.6289419988282592, + "grad_norm": 2.5216856002807617, + "learning_rate": 8.023759973188471e-06, + "loss": 0.876, + "step": 7783 + }, + { + "epoch": 0.6290228085415867, + "grad_norm": 2.7473416328430176, + "learning_rate": 8.023238804218805e-06, + "loss": 1.0997, + "step": 7784 + }, + { + "epoch": 0.6291036182549142, + "grad_norm": 3.13901948928833, + "learning_rate": 8.022717583468334e-06, + "loss": 1.0614, + "step": 7785 + }, + { + "epoch": 0.6291844279682418, + "grad_norm": 2.6193127632141113, + "learning_rate": 8.022196310945988e-06, + "loss": 1.0336, + "step": 7786 + }, + { + "epoch": 0.6292652376815693, + "grad_norm": 3.199700355529785, + "learning_rate": 8.021674986660696e-06, + "loss": 0.9012, + "step": 7787 + }, + { + "epoch": 0.6293460473948969, + "grad_norm": 2.636735200881958, + "learning_rate": 8.021153610621385e-06, + "loss": 0.939, + "step": 7788 + }, + { + "epoch": 0.6294268571082244, + "grad_norm": 2.5263350009918213, + "learning_rate": 8.020632182836986e-06, + "loss": 0.845, + "step": 7789 + }, + { + "epoch": 0.6295076668215519, + "grad_norm": 2.9626433849334717, + "learning_rate": 8.02011070331643e-06, + "loss": 0.899, + "step": 7790 + }, + { + "epoch": 0.6295884765348795, + "grad_norm": 2.3330037593841553, + "learning_rate": 8.019589172068646e-06, + "loss": 0.9297, + "step": 7791 + }, + { + "epoch": 0.6296692862482071, + "grad_norm": 2.7311487197875977, + "learning_rate": 8.019067589102572e-06, + "loss": 0.7996, + "step": 7792 + }, + { + "epoch": 0.6297500959615345, + "grad_norm": 2.5752053260803223, + "learning_rate": 8.018545954427138e-06, + "loss": 0.8863, + "step": 7793 + }, + { + "epoch": 0.6298309056748621, + "grad_norm": 2.6735050678253174, + "learning_rate": 8.018024268051276e-06, + "loss": 0.9915, + "step": 7794 + }, + { + "epoch": 0.6299117153881897, + "grad_norm": 2.407761573791504, + "learning_rate": 8.017502529983927e-06, + "loss": 1.0075, + "step": 7795 + }, + { + "epoch": 0.6299925251015172, + "grad_norm": 2.5616161823272705, + "learning_rate": 8.016980740234022e-06, + "loss": 0.8683, + "step": 7796 + }, + { + "epoch": 0.6300733348148447, + "grad_norm": 2.460862874984741, + "learning_rate": 8.0164588988105e-06, + "loss": 0.9291, + "step": 7797 + }, + { + "epoch": 0.6301541445281723, + "grad_norm": 2.6509780883789062, + "learning_rate": 8.0159370057223e-06, + "loss": 0.9686, + "step": 7798 + }, + { + "epoch": 0.6302349542414998, + "grad_norm": 2.631164073944092, + "learning_rate": 8.015415060978358e-06, + "loss": 0.9321, + "step": 7799 + }, + { + "epoch": 0.6303157639548274, + "grad_norm": 2.489335298538208, + "learning_rate": 8.014893064587617e-06, + "loss": 0.8782, + "step": 7800 + }, + { + "epoch": 0.6303965736681549, + "grad_norm": 2.837106466293335, + "learning_rate": 8.014371016559016e-06, + "loss": 1.0091, + "step": 7801 + }, + { + "epoch": 0.6304773833814824, + "grad_norm": 2.7319021224975586, + "learning_rate": 8.013848916901494e-06, + "loss": 0.9499, + "step": 7802 + }, + { + "epoch": 0.63055819309481, + "grad_norm": 2.3950157165527344, + "learning_rate": 8.013326765623999e-06, + "loss": 0.9136, + "step": 7803 + }, + { + "epoch": 0.6306390028081376, + "grad_norm": 2.824277400970459, + "learning_rate": 8.01280456273547e-06, + "loss": 0.8592, + "step": 7804 + }, + { + "epoch": 0.630719812521465, + "grad_norm": 2.6503448486328125, + "learning_rate": 8.01228230824485e-06, + "loss": 0.9947, + "step": 7805 + }, + { + "epoch": 0.6308006222347926, + "grad_norm": 2.688629388809204, + "learning_rate": 8.011760002161087e-06, + "loss": 0.9991, + "step": 7806 + }, + { + "epoch": 0.6308814319481202, + "grad_norm": 2.8561294078826904, + "learning_rate": 8.011237644493124e-06, + "loss": 0.9442, + "step": 7807 + }, + { + "epoch": 0.6309622416614477, + "grad_norm": 2.801499128341675, + "learning_rate": 8.01071523524991e-06, + "loss": 0.8982, + "step": 7808 + }, + { + "epoch": 0.6310430513747752, + "grad_norm": 2.8403820991516113, + "learning_rate": 8.010192774440394e-06, + "loss": 0.9184, + "step": 7809 + }, + { + "epoch": 0.6311238610881028, + "grad_norm": 2.9464499950408936, + "learning_rate": 8.00967026207352e-06, + "loss": 1.0028, + "step": 7810 + }, + { + "epoch": 0.6312046708014303, + "grad_norm": 2.475886106491089, + "learning_rate": 8.009147698158241e-06, + "loss": 0.9276, + "step": 7811 + }, + { + "epoch": 0.6312854805147579, + "grad_norm": 2.6976876258850098, + "learning_rate": 8.008625082703507e-06, + "loss": 0.8602, + "step": 7812 + }, + { + "epoch": 0.6313662902280854, + "grad_norm": 2.429450511932373, + "learning_rate": 8.008102415718269e-06, + "loss": 0.9231, + "step": 7813 + }, + { + "epoch": 0.6314470999414129, + "grad_norm": 2.8100454807281494, + "learning_rate": 8.007579697211476e-06, + "loss": 0.9139, + "step": 7814 + }, + { + "epoch": 0.6315279096547405, + "grad_norm": 2.411275863647461, + "learning_rate": 8.007056927192084e-06, + "loss": 1.0832, + "step": 7815 + }, + { + "epoch": 0.6316087193680681, + "grad_norm": 2.3729898929595947, + "learning_rate": 8.006534105669046e-06, + "loss": 0.9018, + "step": 7816 + }, + { + "epoch": 0.6316895290813955, + "grad_norm": 3.1594552993774414, + "learning_rate": 8.006011232651317e-06, + "loss": 0.9886, + "step": 7817 + }, + { + "epoch": 0.6317703387947231, + "grad_norm": 2.5499870777130127, + "learning_rate": 8.005488308147852e-06, + "loss": 0.937, + "step": 7818 + }, + { + "epoch": 0.6318511485080507, + "grad_norm": 2.6816816329956055, + "learning_rate": 8.004965332167609e-06, + "loss": 1.0091, + "step": 7819 + }, + { + "epoch": 0.6319319582213783, + "grad_norm": 2.468831777572632, + "learning_rate": 8.004442304719541e-06, + "loss": 1.0155, + "step": 7820 + }, + { + "epoch": 0.6320127679347057, + "grad_norm": 2.800617218017578, + "learning_rate": 8.003919225812612e-06, + "loss": 0.9332, + "step": 7821 + }, + { + "epoch": 0.6320935776480333, + "grad_norm": 2.770237684249878, + "learning_rate": 8.003396095455778e-06, + "loss": 0.9352, + "step": 7822 + }, + { + "epoch": 0.6321743873613609, + "grad_norm": 2.297513723373413, + "learning_rate": 8.002872913658e-06, + "loss": 1.0963, + "step": 7823 + }, + { + "epoch": 0.6322551970746884, + "grad_norm": 2.436514139175415, + "learning_rate": 8.002349680428235e-06, + "loss": 0.8675, + "step": 7824 + }, + { + "epoch": 0.6323360067880159, + "grad_norm": 3.1877448558807373, + "learning_rate": 8.00182639577545e-06, + "loss": 0.9469, + "step": 7825 + }, + { + "epoch": 0.6324168165013435, + "grad_norm": 2.697570323944092, + "learning_rate": 8.001303059708605e-06, + "loss": 0.8729, + "step": 7826 + }, + { + "epoch": 0.632497626214671, + "grad_norm": 2.694343090057373, + "learning_rate": 8.000779672236664e-06, + "loss": 0.982, + "step": 7827 + }, + { + "epoch": 0.6325784359279986, + "grad_norm": 2.614118814468384, + "learning_rate": 8.000256233368592e-06, + "loss": 0.8815, + "step": 7828 + }, + { + "epoch": 0.6326592456413261, + "grad_norm": 2.5081512928009033, + "learning_rate": 7.999732743113353e-06, + "loss": 0.9012, + "step": 7829 + }, + { + "epoch": 0.6327400553546536, + "grad_norm": 2.6738438606262207, + "learning_rate": 7.999209201479913e-06, + "loss": 0.9866, + "step": 7830 + }, + { + "epoch": 0.6328208650679812, + "grad_norm": 2.5792617797851562, + "learning_rate": 7.99868560847724e-06, + "loss": 0.9247, + "step": 7831 + }, + { + "epoch": 0.6329016747813088, + "grad_norm": 2.5665578842163086, + "learning_rate": 7.9981619641143e-06, + "loss": 0.904, + "step": 7832 + }, + { + "epoch": 0.6329824844946362, + "grad_norm": 2.240414619445801, + "learning_rate": 7.997638268400067e-06, + "loss": 0.8802, + "step": 7833 + }, + { + "epoch": 0.6330632942079638, + "grad_norm": 2.4527547359466553, + "learning_rate": 7.997114521343505e-06, + "loss": 1.0719, + "step": 7834 + }, + { + "epoch": 0.6331441039212914, + "grad_norm": 2.39039945602417, + "learning_rate": 7.996590722953586e-06, + "loss": 1.0883, + "step": 7835 + }, + { + "epoch": 0.6332249136346189, + "grad_norm": 2.4689130783081055, + "learning_rate": 7.996066873239283e-06, + "loss": 0.8954, + "step": 7836 + }, + { + "epoch": 0.6333057233479464, + "grad_norm": 2.6633870601654053, + "learning_rate": 7.995542972209567e-06, + "loss": 0.9858, + "step": 7837 + }, + { + "epoch": 0.633386533061274, + "grad_norm": 2.6453187465667725, + "learning_rate": 7.995019019873411e-06, + "loss": 1.068, + "step": 7838 + }, + { + "epoch": 0.6334673427746015, + "grad_norm": 2.5491058826446533, + "learning_rate": 7.994495016239789e-06, + "loss": 1.013, + "step": 7839 + }, + { + "epoch": 0.6335481524879291, + "grad_norm": 2.9839184284210205, + "learning_rate": 7.993970961317678e-06, + "loss": 1.1085, + "step": 7840 + }, + { + "epoch": 0.6336289622012566, + "grad_norm": 2.376394510269165, + "learning_rate": 7.99344685511605e-06, + "loss": 0.9267, + "step": 7841 + }, + { + "epoch": 0.6337097719145841, + "grad_norm": 2.6307690143585205, + "learning_rate": 7.992922697643885e-06, + "loss": 0.9711, + "step": 7842 + }, + { + "epoch": 0.6337905816279117, + "grad_norm": 2.667869806289673, + "learning_rate": 7.99239848891016e-06, + "loss": 0.9759, + "step": 7843 + }, + { + "epoch": 0.6338713913412393, + "grad_norm": 3.161916494369507, + "learning_rate": 7.99187422892385e-06, + "loss": 0.8621, + "step": 7844 + }, + { + "epoch": 0.6339522010545667, + "grad_norm": 2.229846477508545, + "learning_rate": 7.99134991769394e-06, + "loss": 0.8275, + "step": 7845 + }, + { + "epoch": 0.6340330107678943, + "grad_norm": 2.615090847015381, + "learning_rate": 7.990825555229407e-06, + "loss": 0.7687, + "step": 7846 + }, + { + "epoch": 0.6341138204812219, + "grad_norm": 2.7549939155578613, + "learning_rate": 7.99030114153923e-06, + "loss": 0.9814, + "step": 7847 + }, + { + "epoch": 0.6341946301945494, + "grad_norm": 2.917659282684326, + "learning_rate": 7.989776676632395e-06, + "loss": 0.8248, + "step": 7848 + }, + { + "epoch": 0.6342754399078769, + "grad_norm": 2.4968743324279785, + "learning_rate": 7.989252160517884e-06, + "loss": 0.9364, + "step": 7849 + }, + { + "epoch": 0.6343562496212045, + "grad_norm": 2.3302972316741943, + "learning_rate": 7.988727593204679e-06, + "loss": 0.9482, + "step": 7850 + }, + { + "epoch": 0.634437059334532, + "grad_norm": 3.111441135406494, + "learning_rate": 7.988202974701766e-06, + "loss": 0.9921, + "step": 7851 + }, + { + "epoch": 0.6345178690478596, + "grad_norm": 2.7978410720825195, + "learning_rate": 7.987678305018128e-06, + "loss": 0.9492, + "step": 7852 + }, + { + "epoch": 0.6345986787611871, + "grad_norm": 2.696904420852661, + "learning_rate": 7.987153584162754e-06, + "loss": 1.0293, + "step": 7853 + }, + { + "epoch": 0.6346794884745146, + "grad_norm": 2.556978225708008, + "learning_rate": 7.986628812144632e-06, + "loss": 1.0262, + "step": 7854 + }, + { + "epoch": 0.6347602981878422, + "grad_norm": 2.380255699157715, + "learning_rate": 7.986103988972746e-06, + "loss": 0.8333, + "step": 7855 + }, + { + "epoch": 0.6348411079011698, + "grad_norm": 2.783190965652466, + "learning_rate": 7.985579114656089e-06, + "loss": 0.9723, + "step": 7856 + }, + { + "epoch": 0.6349219176144972, + "grad_norm": 2.297658920288086, + "learning_rate": 7.985054189203648e-06, + "loss": 0.9584, + "step": 7857 + }, + { + "epoch": 0.6350027273278248, + "grad_norm": 2.73989200592041, + "learning_rate": 7.984529212624417e-06, + "loss": 1.0242, + "step": 7858 + }, + { + "epoch": 0.6350835370411524, + "grad_norm": 2.856266498565674, + "learning_rate": 7.984004184927383e-06, + "loss": 0.9769, + "step": 7859 + }, + { + "epoch": 0.6351643467544799, + "grad_norm": 2.6034038066864014, + "learning_rate": 7.983479106121543e-06, + "loss": 0.9549, + "step": 7860 + }, + { + "epoch": 0.6352451564678074, + "grad_norm": 2.601545572280884, + "learning_rate": 7.982953976215888e-06, + "loss": 0.9885, + "step": 7861 + }, + { + "epoch": 0.635325966181135, + "grad_norm": 2.3755569458007812, + "learning_rate": 7.982428795219412e-06, + "loss": 0.896, + "step": 7862 + }, + { + "epoch": 0.6354067758944625, + "grad_norm": 2.7541861534118652, + "learning_rate": 7.98190356314111e-06, + "loss": 0.9988, + "step": 7863 + }, + { + "epoch": 0.6354875856077901, + "grad_norm": 2.01902437210083, + "learning_rate": 7.981378279989979e-06, + "loss": 1.1493, + "step": 7864 + }, + { + "epoch": 0.6355683953211176, + "grad_norm": 2.4613897800445557, + "learning_rate": 7.980852945775017e-06, + "loss": 0.9756, + "step": 7865 + }, + { + "epoch": 0.6356492050344451, + "grad_norm": 2.7889516353607178, + "learning_rate": 7.98032756050522e-06, + "loss": 1.08, + "step": 7866 + }, + { + "epoch": 0.6357300147477727, + "grad_norm": 2.5871918201446533, + "learning_rate": 7.979802124189585e-06, + "loss": 0.9198, + "step": 7867 + }, + { + "epoch": 0.6358108244611003, + "grad_norm": 2.2707509994506836, + "learning_rate": 7.979276636837115e-06, + "loss": 0.9897, + "step": 7868 + }, + { + "epoch": 0.6358916341744277, + "grad_norm": 2.6156702041625977, + "learning_rate": 7.978751098456807e-06, + "loss": 0.9799, + "step": 7869 + }, + { + "epoch": 0.6359724438877553, + "grad_norm": 2.71252703666687, + "learning_rate": 7.978225509057665e-06, + "loss": 0.9228, + "step": 7870 + }, + { + "epoch": 0.6360532536010829, + "grad_norm": 2.387237071990967, + "learning_rate": 7.977699868648688e-06, + "loss": 0.8877, + "step": 7871 + }, + { + "epoch": 0.6361340633144104, + "grad_norm": 2.5913264751434326, + "learning_rate": 7.977174177238882e-06, + "loss": 0.8578, + "step": 7872 + }, + { + "epoch": 0.6362148730277379, + "grad_norm": 3.011514902114868, + "learning_rate": 7.976648434837249e-06, + "loss": 0.8832, + "step": 7873 + }, + { + "epoch": 0.6362956827410655, + "grad_norm": 2.521620750427246, + "learning_rate": 7.976122641452796e-06, + "loss": 0.9534, + "step": 7874 + }, + { + "epoch": 0.636376492454393, + "grad_norm": 2.4802489280700684, + "learning_rate": 7.975596797094525e-06, + "loss": 1.0569, + "step": 7875 + }, + { + "epoch": 0.6364573021677206, + "grad_norm": 2.9051873683929443, + "learning_rate": 7.975070901771444e-06, + "loss": 0.8106, + "step": 7876 + }, + { + "epoch": 0.6365381118810481, + "grad_norm": 2.508762836456299, + "learning_rate": 7.974544955492562e-06, + "loss": 0.9217, + "step": 7877 + }, + { + "epoch": 0.6366189215943756, + "grad_norm": 2.3968288898468018, + "learning_rate": 7.974018958266885e-06, + "loss": 0.9095, + "step": 7878 + }, + { + "epoch": 0.6366997313077032, + "grad_norm": 2.5430245399475098, + "learning_rate": 7.973492910103424e-06, + "loss": 0.9602, + "step": 7879 + }, + { + "epoch": 0.6367805410210308, + "grad_norm": 2.813250780105591, + "learning_rate": 7.972966811011187e-06, + "loss": 0.9238, + "step": 7880 + }, + { + "epoch": 0.6368613507343582, + "grad_norm": 2.7060039043426514, + "learning_rate": 7.972440660999185e-06, + "loss": 0.8969, + "step": 7881 + }, + { + "epoch": 0.6369421604476858, + "grad_norm": 2.887779474258423, + "learning_rate": 7.971914460076434e-06, + "loss": 0.8344, + "step": 7882 + }, + { + "epoch": 0.6370229701610134, + "grad_norm": 2.6328446865081787, + "learning_rate": 7.971388208251937e-06, + "loss": 0.9091, + "step": 7883 + }, + { + "epoch": 0.6371037798743409, + "grad_norm": 2.2077245712280273, + "learning_rate": 7.970861905534718e-06, + "loss": 0.9226, + "step": 7884 + }, + { + "epoch": 0.6371845895876684, + "grad_norm": 2.7489259243011475, + "learning_rate": 7.970335551933785e-06, + "loss": 0.9441, + "step": 7885 + }, + { + "epoch": 0.637265399300996, + "grad_norm": 2.5437917709350586, + "learning_rate": 7.969809147458154e-06, + "loss": 0.94, + "step": 7886 + }, + { + "epoch": 0.6373462090143235, + "grad_norm": 2.82645320892334, + "learning_rate": 7.969282692116844e-06, + "loss": 1.0309, + "step": 7887 + }, + { + "epoch": 0.6374270187276511, + "grad_norm": 2.450364589691162, + "learning_rate": 7.968756185918869e-06, + "loss": 0.9699, + "step": 7888 + }, + { + "epoch": 0.6375078284409786, + "grad_norm": 2.5669620037078857, + "learning_rate": 7.968229628873246e-06, + "loss": 0.9245, + "step": 7889 + }, + { + "epoch": 0.6375886381543061, + "grad_norm": 2.772475242614746, + "learning_rate": 7.967703020988997e-06, + "loss": 0.7552, + "step": 7890 + }, + { + "epoch": 0.6376694478676337, + "grad_norm": 2.900303840637207, + "learning_rate": 7.967176362275138e-06, + "loss": 0.9327, + "step": 7891 + }, + { + "epoch": 0.6377502575809613, + "grad_norm": 4.1801910400390625, + "learning_rate": 7.96664965274069e-06, + "loss": 1.0217, + "step": 7892 + }, + { + "epoch": 0.6378310672942887, + "grad_norm": 2.6613759994506836, + "learning_rate": 7.966122892394679e-06, + "loss": 0.8932, + "step": 7893 + }, + { + "epoch": 0.6379118770076163, + "grad_norm": 2.336002826690674, + "learning_rate": 7.96559608124612e-06, + "loss": 1.0525, + "step": 7894 + }, + { + "epoch": 0.6379926867209439, + "grad_norm": 2.6573903560638428, + "learning_rate": 7.965069219304043e-06, + "loss": 0.9063, + "step": 7895 + }, + { + "epoch": 0.6380734964342714, + "grad_norm": 2.5049591064453125, + "learning_rate": 7.964542306577464e-06, + "loss": 0.8362, + "step": 7896 + }, + { + "epoch": 0.6381543061475989, + "grad_norm": 2.3977386951446533, + "learning_rate": 7.964015343075416e-06, + "loss": 1.0461, + "step": 7897 + }, + { + "epoch": 0.6382351158609265, + "grad_norm": 3.4002206325531006, + "learning_rate": 7.963488328806921e-06, + "loss": 0.912, + "step": 7898 + }, + { + "epoch": 0.638315925574254, + "grad_norm": 2.4830734729766846, + "learning_rate": 7.962961263781003e-06, + "loss": 1.0555, + "step": 7899 + }, + { + "epoch": 0.6383967352875816, + "grad_norm": 2.5380730628967285, + "learning_rate": 7.962434148006693e-06, + "loss": 0.8836, + "step": 7900 + }, + { + "epoch": 0.6384775450009091, + "grad_norm": 2.824876546859741, + "learning_rate": 7.961906981493016e-06, + "loss": 0.989, + "step": 7901 + }, + { + "epoch": 0.6385583547142366, + "grad_norm": 2.6988911628723145, + "learning_rate": 7.961379764249004e-06, + "loss": 0.8882, + "step": 7902 + }, + { + "epoch": 0.6386391644275642, + "grad_norm": 2.415241241455078, + "learning_rate": 7.960852496283686e-06, + "loss": 0.9554, + "step": 7903 + }, + { + "epoch": 0.6387199741408918, + "grad_norm": 2.6575350761413574, + "learning_rate": 7.960325177606093e-06, + "loss": 0.9876, + "step": 7904 + }, + { + "epoch": 0.6388007838542192, + "grad_norm": 2.8557636737823486, + "learning_rate": 7.959797808225257e-06, + "loss": 0.9652, + "step": 7905 + }, + { + "epoch": 0.6388815935675468, + "grad_norm": 2.9141557216644287, + "learning_rate": 7.959270388150209e-06, + "loss": 1.0365, + "step": 7906 + }, + { + "epoch": 0.6389624032808744, + "grad_norm": 2.2590649127960205, + "learning_rate": 7.958742917389983e-06, + "loss": 0.8244, + "step": 7907 + }, + { + "epoch": 0.6390432129942019, + "grad_norm": 2.4103503227233887, + "learning_rate": 7.958215395953614e-06, + "loss": 0.9339, + "step": 7908 + }, + { + "epoch": 0.6391240227075294, + "grad_norm": 2.4939024448394775, + "learning_rate": 7.957687823850136e-06, + "loss": 1.0206, + "step": 7909 + }, + { + "epoch": 0.639204832420857, + "grad_norm": 2.592484474182129, + "learning_rate": 7.957160201088588e-06, + "loss": 1.1189, + "step": 7910 + }, + { + "epoch": 0.6392856421341845, + "grad_norm": 2.637906789779663, + "learning_rate": 7.956632527678003e-06, + "loss": 1.0982, + "step": 7911 + }, + { + "epoch": 0.6393664518475121, + "grad_norm": 2.261767864227295, + "learning_rate": 7.95610480362742e-06, + "loss": 0.9419, + "step": 7912 + }, + { + "epoch": 0.6394472615608396, + "grad_norm": 2.8713479042053223, + "learning_rate": 7.955577028945881e-06, + "loss": 1.0715, + "step": 7913 + }, + { + "epoch": 0.6395280712741671, + "grad_norm": 2.9200046062469482, + "learning_rate": 7.955049203642421e-06, + "loss": 0.9441, + "step": 7914 + }, + { + "epoch": 0.6396088809874947, + "grad_norm": 2.5749030113220215, + "learning_rate": 7.954521327726082e-06, + "loss": 0.9433, + "step": 7915 + }, + { + "epoch": 0.6396896907008223, + "grad_norm": 2.8914525508880615, + "learning_rate": 7.953993401205907e-06, + "loss": 0.9765, + "step": 7916 + }, + { + "epoch": 0.6397705004141497, + "grad_norm": 2.4222776889801025, + "learning_rate": 7.953465424090933e-06, + "loss": 1.0381, + "step": 7917 + }, + { + "epoch": 0.6398513101274773, + "grad_norm": 2.5520670413970947, + "learning_rate": 7.95293739639021e-06, + "loss": 1.0009, + "step": 7918 + }, + { + "epoch": 0.6399321198408049, + "grad_norm": 3.170346736907959, + "learning_rate": 7.952409318112778e-06, + "loss": 0.9548, + "step": 7919 + }, + { + "epoch": 0.6400129295541324, + "grad_norm": 2.340275526046753, + "learning_rate": 7.951881189267681e-06, + "loss": 0.9463, + "step": 7920 + }, + { + "epoch": 0.6400937392674599, + "grad_norm": 2.8253910541534424, + "learning_rate": 7.951353009863966e-06, + "loss": 0.9926, + "step": 7921 + }, + { + "epoch": 0.6401745489807875, + "grad_norm": 2.312244176864624, + "learning_rate": 7.950824779910678e-06, + "loss": 1.0231, + "step": 7922 + }, + { + "epoch": 0.640255358694115, + "grad_norm": 2.3926475048065186, + "learning_rate": 7.950296499416866e-06, + "loss": 1.0799, + "step": 7923 + }, + { + "epoch": 0.6403361684074426, + "grad_norm": 2.747842311859131, + "learning_rate": 7.949768168391579e-06, + "loss": 0.8584, + "step": 7924 + }, + { + "epoch": 0.6404169781207701, + "grad_norm": 3.1573312282562256, + "learning_rate": 7.94923978684386e-06, + "loss": 0.8375, + "step": 7925 + }, + { + "epoch": 0.6404977878340976, + "grad_norm": 2.4790127277374268, + "learning_rate": 7.94871135478277e-06, + "loss": 0.9673, + "step": 7926 + }, + { + "epoch": 0.6405785975474252, + "grad_norm": 2.638072967529297, + "learning_rate": 7.94818287221735e-06, + "loss": 0.8478, + "step": 7927 + }, + { + "epoch": 0.6406594072607528, + "grad_norm": 2.696831703186035, + "learning_rate": 7.947654339156653e-06, + "loss": 0.947, + "step": 7928 + }, + { + "epoch": 0.6407402169740802, + "grad_norm": 3.0501718521118164, + "learning_rate": 7.947125755609734e-06, + "loss": 0.897, + "step": 7929 + }, + { + "epoch": 0.6408210266874078, + "grad_norm": 2.7243447303771973, + "learning_rate": 7.946597121585648e-06, + "loss": 0.9673, + "step": 7930 + }, + { + "epoch": 0.6409018364007354, + "grad_norm": 2.5512492656707764, + "learning_rate": 7.946068437093445e-06, + "loss": 0.8887, + "step": 7931 + }, + { + "epoch": 0.6409826461140629, + "grad_norm": 2.7076799869537354, + "learning_rate": 7.945539702142184e-06, + "loss": 0.9501, + "step": 7932 + }, + { + "epoch": 0.6410634558273904, + "grad_norm": 2.636019468307495, + "learning_rate": 7.945010916740916e-06, + "loss": 0.913, + "step": 7933 + }, + { + "epoch": 0.641144265540718, + "grad_norm": 2.915334701538086, + "learning_rate": 7.944482080898703e-06, + "loss": 0.9734, + "step": 7934 + }, + { + "epoch": 0.6412250752540455, + "grad_norm": 2.865138292312622, + "learning_rate": 7.9439531946246e-06, + "loss": 0.9657, + "step": 7935 + }, + { + "epoch": 0.6413058849673731, + "grad_norm": 2.6116302013397217, + "learning_rate": 7.943424257927667e-06, + "loss": 0.8759, + "step": 7936 + }, + { + "epoch": 0.6413866946807006, + "grad_norm": 2.7858128547668457, + "learning_rate": 7.942895270816961e-06, + "loss": 0.9374, + "step": 7937 + }, + { + "epoch": 0.6414675043940281, + "grad_norm": 2.4870376586914062, + "learning_rate": 7.942366233301545e-06, + "loss": 0.9349, + "step": 7938 + }, + { + "epoch": 0.6415483141073557, + "grad_norm": 2.4941608905792236, + "learning_rate": 7.941837145390478e-06, + "loss": 0.7869, + "step": 7939 + }, + { + "epoch": 0.6416291238206833, + "grad_norm": 2.52121901512146, + "learning_rate": 7.941308007092823e-06, + "loss": 1.0242, + "step": 7940 + }, + { + "epoch": 0.6417099335340107, + "grad_norm": 2.7866406440734863, + "learning_rate": 7.940778818417643e-06, + "loss": 0.9009, + "step": 7941 + }, + { + "epoch": 0.6417907432473383, + "grad_norm": 2.4861245155334473, + "learning_rate": 7.940249579374002e-06, + "loss": 0.9445, + "step": 7942 + }, + { + "epoch": 0.6418715529606659, + "grad_norm": 3.105573892593384, + "learning_rate": 7.939720289970963e-06, + "loss": 1.0505, + "step": 7943 + }, + { + "epoch": 0.6419523626739934, + "grad_norm": 2.5582029819488525, + "learning_rate": 7.939190950217592e-06, + "loss": 0.9101, + "step": 7944 + }, + { + "epoch": 0.6420331723873209, + "grad_norm": 3.139284610748291, + "learning_rate": 7.938661560122958e-06, + "loss": 1.0687, + "step": 7945 + }, + { + "epoch": 0.6421139821006485, + "grad_norm": 2.891692638397217, + "learning_rate": 7.938132119696125e-06, + "loss": 0.8713, + "step": 7946 + }, + { + "epoch": 0.642194791813976, + "grad_norm": 2.503300428390503, + "learning_rate": 7.93760262894616e-06, + "loss": 0.9078, + "step": 7947 + }, + { + "epoch": 0.6422756015273036, + "grad_norm": 2.2113330364227295, + "learning_rate": 7.937073087882137e-06, + "loss": 0.9952, + "step": 7948 + }, + { + "epoch": 0.6423564112406311, + "grad_norm": 2.411799907684326, + "learning_rate": 7.936543496513121e-06, + "loss": 0.9453, + "step": 7949 + }, + { + "epoch": 0.6424372209539587, + "grad_norm": 2.4153037071228027, + "learning_rate": 7.936013854848185e-06, + "loss": 0.9972, + "step": 7950 + }, + { + "epoch": 0.6425180306672862, + "grad_norm": 2.5781900882720947, + "learning_rate": 7.9354841628964e-06, + "loss": 0.9114, + "step": 7951 + }, + { + "epoch": 0.6425988403806138, + "grad_norm": 2.9225873947143555, + "learning_rate": 7.934954420666838e-06, + "loss": 0.9453, + "step": 7952 + }, + { + "epoch": 0.6426796500939413, + "grad_norm": 2.8506546020507812, + "learning_rate": 7.93442462816857e-06, + "loss": 0.9384, + "step": 7953 + }, + { + "epoch": 0.6427604598072688, + "grad_norm": 2.8344874382019043, + "learning_rate": 7.933894785410676e-06, + "loss": 0.9396, + "step": 7954 + }, + { + "epoch": 0.6428412695205964, + "grad_norm": 2.7729973793029785, + "learning_rate": 7.933364892402227e-06, + "loss": 1.0725, + "step": 7955 + }, + { + "epoch": 0.642922079233924, + "grad_norm": 2.305830478668213, + "learning_rate": 7.932834949152298e-06, + "loss": 0.9691, + "step": 7956 + }, + { + "epoch": 0.6430028889472514, + "grad_norm": 2.670337438583374, + "learning_rate": 7.932304955669967e-06, + "loss": 0.9467, + "step": 7957 + }, + { + "epoch": 0.643083698660579, + "grad_norm": 3.4643208980560303, + "learning_rate": 7.931774911964312e-06, + "loss": 0.8375, + "step": 7958 + }, + { + "epoch": 0.6431645083739066, + "grad_norm": 2.6386911869049072, + "learning_rate": 7.93124481804441e-06, + "loss": 0.9037, + "step": 7959 + }, + { + "epoch": 0.6432453180872341, + "grad_norm": 2.4012837409973145, + "learning_rate": 7.930714673919342e-06, + "loss": 1.0759, + "step": 7960 + }, + { + "epoch": 0.6433261278005616, + "grad_norm": 2.9881134033203125, + "learning_rate": 7.930184479598185e-06, + "loss": 0.9248, + "step": 7961 + }, + { + "epoch": 0.6434069375138892, + "grad_norm": 2.3555521965026855, + "learning_rate": 7.929654235090022e-06, + "loss": 1.013, + "step": 7962 + }, + { + "epoch": 0.6434877472272167, + "grad_norm": 2.7471446990966797, + "learning_rate": 7.929123940403938e-06, + "loss": 1.0243, + "step": 7963 + }, + { + "epoch": 0.6435685569405443, + "grad_norm": 2.596599817276001, + "learning_rate": 7.92859359554901e-06, + "loss": 0.9914, + "step": 7964 + }, + { + "epoch": 0.6436493666538718, + "grad_norm": 2.3659653663635254, + "learning_rate": 7.928063200534323e-06, + "loss": 0.9262, + "step": 7965 + }, + { + "epoch": 0.6437301763671993, + "grad_norm": 2.971914052963257, + "learning_rate": 7.927532755368965e-06, + "loss": 1.1657, + "step": 7966 + }, + { + "epoch": 0.6438109860805269, + "grad_norm": 2.639777421951294, + "learning_rate": 7.927002260062018e-06, + "loss": 0.8875, + "step": 7967 + }, + { + "epoch": 0.6438917957938545, + "grad_norm": 2.5370595455169678, + "learning_rate": 7.926471714622568e-06, + "loss": 0.9521, + "step": 7968 + }, + { + "epoch": 0.6439726055071819, + "grad_norm": 2.853273630142212, + "learning_rate": 7.925941119059702e-06, + "loss": 0.9387, + "step": 7969 + }, + { + "epoch": 0.6440534152205095, + "grad_norm": 2.5535457134246826, + "learning_rate": 7.92541047338251e-06, + "loss": 0.9954, + "step": 7970 + }, + { + "epoch": 0.6441342249338371, + "grad_norm": 3.0810725688934326, + "learning_rate": 7.924879777600078e-06, + "loss": 0.9183, + "step": 7971 + }, + { + "epoch": 0.6442150346471646, + "grad_norm": 2.6284542083740234, + "learning_rate": 7.9243490317215e-06, + "loss": 0.9929, + "step": 7972 + }, + { + "epoch": 0.6442958443604921, + "grad_norm": 2.6094870567321777, + "learning_rate": 7.923818235755859e-06, + "loss": 1.0417, + "step": 7973 + }, + { + "epoch": 0.6443766540738197, + "grad_norm": 2.7049832344055176, + "learning_rate": 7.923287389712251e-06, + "loss": 0.9304, + "step": 7974 + }, + { + "epoch": 0.6444574637871472, + "grad_norm": 2.2217698097229004, + "learning_rate": 7.92275649359977e-06, + "loss": 1.0143, + "step": 7975 + }, + { + "epoch": 0.6445382735004748, + "grad_norm": 2.392277956008911, + "learning_rate": 7.922225547427504e-06, + "loss": 0.9486, + "step": 7976 + }, + { + "epoch": 0.6446190832138023, + "grad_norm": 2.4179892539978027, + "learning_rate": 7.921694551204552e-06, + "loss": 1.0242, + "step": 7977 + }, + { + "epoch": 0.6446998929271298, + "grad_norm": 2.6017119884490967, + "learning_rate": 7.921163504940004e-06, + "loss": 0.9248, + "step": 7978 + }, + { + "epoch": 0.6447807026404574, + "grad_norm": 2.2930874824523926, + "learning_rate": 7.920632408642959e-06, + "loss": 0.9522, + "step": 7979 + }, + { + "epoch": 0.644861512353785, + "grad_norm": 2.3802316188812256, + "learning_rate": 7.92010126232251e-06, + "loss": 1.0128, + "step": 7980 + }, + { + "epoch": 0.6449423220671124, + "grad_norm": 2.891817331314087, + "learning_rate": 7.919570065987757e-06, + "loss": 1.0646, + "step": 7981 + }, + { + "epoch": 0.64502313178044, + "grad_norm": 2.985229015350342, + "learning_rate": 7.919038819647797e-06, + "loss": 0.8542, + "step": 7982 + }, + { + "epoch": 0.6451039414937676, + "grad_norm": 3.0287842750549316, + "learning_rate": 7.918507523311732e-06, + "loss": 0.9686, + "step": 7983 + }, + { + "epoch": 0.6451847512070951, + "grad_norm": 2.559133529663086, + "learning_rate": 7.917976176988656e-06, + "loss": 0.9464, + "step": 7984 + }, + { + "epoch": 0.6452655609204226, + "grad_norm": 3.291764736175537, + "learning_rate": 7.917444780687674e-06, + "loss": 1.015, + "step": 7985 + }, + { + "epoch": 0.6453463706337502, + "grad_norm": 2.4056661128997803, + "learning_rate": 7.916913334417887e-06, + "loss": 1.0248, + "step": 7986 + }, + { + "epoch": 0.6454271803470777, + "grad_norm": 2.574171543121338, + "learning_rate": 7.916381838188396e-06, + "loss": 0.9222, + "step": 7987 + }, + { + "epoch": 0.6455079900604053, + "grad_norm": 2.3434150218963623, + "learning_rate": 7.915850292008305e-06, + "loss": 0.9589, + "step": 7988 + }, + { + "epoch": 0.6455887997737328, + "grad_norm": 2.8135275840759277, + "learning_rate": 7.915318695886717e-06, + "loss": 0.8705, + "step": 7989 + }, + { + "epoch": 0.6456696094870603, + "grad_norm": 2.5289838314056396, + "learning_rate": 7.914787049832741e-06, + "loss": 0.9562, + "step": 7990 + }, + { + "epoch": 0.6457504192003879, + "grad_norm": 3.281461238861084, + "learning_rate": 7.914255353855478e-06, + "loss": 0.9741, + "step": 7991 + }, + { + "epoch": 0.6458312289137155, + "grad_norm": 2.5670602321624756, + "learning_rate": 7.913723607964037e-06, + "loss": 0.8829, + "step": 7992 + }, + { + "epoch": 0.6459120386270429, + "grad_norm": 3.479647159576416, + "learning_rate": 7.913191812167524e-06, + "loss": 0.8005, + "step": 7993 + }, + { + "epoch": 0.6459928483403705, + "grad_norm": 2.5427286624908447, + "learning_rate": 7.912659966475051e-06, + "loss": 0.9571, + "step": 7994 + }, + { + "epoch": 0.6460736580536981, + "grad_norm": 2.7731635570526123, + "learning_rate": 7.91212807089572e-06, + "loss": 1.0192, + "step": 7995 + }, + { + "epoch": 0.6461544677670256, + "grad_norm": 2.5769543647766113, + "learning_rate": 7.91159612543865e-06, + "loss": 0.9193, + "step": 7996 + }, + { + "epoch": 0.6462352774803531, + "grad_norm": 2.678894281387329, + "learning_rate": 7.911064130112947e-06, + "loss": 0.8745, + "step": 7997 + }, + { + "epoch": 0.6463160871936807, + "grad_norm": 3.0910682678222656, + "learning_rate": 7.910532084927724e-06, + "loss": 0.8793, + "step": 7998 + }, + { + "epoch": 0.6463968969070082, + "grad_norm": 2.6806740760803223, + "learning_rate": 7.909999989892092e-06, + "loss": 1.0389, + "step": 7999 + }, + { + "epoch": 0.6464777066203358, + "grad_norm": 2.4299354553222656, + "learning_rate": 7.909467845015167e-06, + "loss": 1.0521, + "step": 8000 + }, + { + "epoch": 0.6464777066203358, + "eval_loss": 0.7883428931236267, + "eval_runtime": 813.9119, + "eval_samples_per_second": 102.426, + "eval_steps_per_second": 12.804, + "step": 8000 + }, + { + "epoch": 0.6465585163336633, + "grad_norm": 2.5144739151000977, + "learning_rate": 7.90893565030606e-06, + "loss": 0.8928, + "step": 8001 + }, + { + "epoch": 0.6466393260469908, + "grad_norm": 2.7249040603637695, + "learning_rate": 7.90840340577389e-06, + "loss": 0.9346, + "step": 8002 + }, + { + "epoch": 0.6467201357603184, + "grad_norm": 2.625905990600586, + "learning_rate": 7.90787111142777e-06, + "loss": 0.8915, + "step": 8003 + }, + { + "epoch": 0.646800945473646, + "grad_norm": 2.4242939949035645, + "learning_rate": 7.90733876727682e-06, + "loss": 1.0098, + "step": 8004 + }, + { + "epoch": 0.6468817551869734, + "grad_norm": 2.6896331310272217, + "learning_rate": 7.906806373330156e-06, + "loss": 0.8493, + "step": 8005 + }, + { + "epoch": 0.646962564900301, + "grad_norm": 2.746581792831421, + "learning_rate": 7.906273929596895e-06, + "loss": 0.8971, + "step": 8006 + }, + { + "epoch": 0.6470433746136286, + "grad_norm": 2.4979236125946045, + "learning_rate": 7.905741436086158e-06, + "loss": 0.9366, + "step": 8007 + }, + { + "epoch": 0.6471241843269561, + "grad_norm": 2.9039738178253174, + "learning_rate": 7.905208892807069e-06, + "loss": 1.057, + "step": 8008 + }, + { + "epoch": 0.6472049940402836, + "grad_norm": 2.85227108001709, + "learning_rate": 7.904676299768741e-06, + "loss": 0.8735, + "step": 8009 + }, + { + "epoch": 0.6472858037536112, + "grad_norm": 2.5249216556549072, + "learning_rate": 7.904143656980303e-06, + "loss": 0.9225, + "step": 8010 + }, + { + "epoch": 0.6473666134669387, + "grad_norm": 3.0269346237182617, + "learning_rate": 7.903610964450876e-06, + "loss": 0.9081, + "step": 8011 + }, + { + "epoch": 0.6474474231802663, + "grad_norm": 2.7412753105163574, + "learning_rate": 7.903078222189582e-06, + "loss": 1.0081, + "step": 8012 + }, + { + "epoch": 0.6475282328935938, + "grad_norm": 3.5761208534240723, + "learning_rate": 7.902545430205548e-06, + "loss": 0.9045, + "step": 8013 + }, + { + "epoch": 0.6476090426069213, + "grad_norm": 2.872438669204712, + "learning_rate": 7.902012588507898e-06, + "loss": 0.952, + "step": 8014 + }, + { + "epoch": 0.6476898523202489, + "grad_norm": 3.0681557655334473, + "learning_rate": 7.901479697105759e-06, + "loss": 0.9553, + "step": 8015 + }, + { + "epoch": 0.6477706620335765, + "grad_norm": 2.399759531021118, + "learning_rate": 7.90094675600826e-06, + "loss": 0.9147, + "step": 8016 + }, + { + "epoch": 0.6478514717469039, + "grad_norm": 2.7116787433624268, + "learning_rate": 7.900413765224522e-06, + "loss": 0.9374, + "step": 8017 + }, + { + "epoch": 0.6479322814602315, + "grad_norm": 2.808452844619751, + "learning_rate": 7.899880724763681e-06, + "loss": 0.9594, + "step": 8018 + }, + { + "epoch": 0.6480130911735591, + "grad_norm": 2.55055570602417, + "learning_rate": 7.899347634634864e-06, + "loss": 0.906, + "step": 8019 + }, + { + "epoch": 0.6480939008868866, + "grad_norm": 2.735206365585327, + "learning_rate": 7.898814494847203e-06, + "loss": 0.8791, + "step": 8020 + }, + { + "epoch": 0.6481747106002141, + "grad_norm": 2.5543875694274902, + "learning_rate": 7.898281305409828e-06, + "loss": 0.8443, + "step": 8021 + }, + { + "epoch": 0.6482555203135417, + "grad_norm": 2.7260234355926514, + "learning_rate": 7.897748066331872e-06, + "loss": 0.8795, + "step": 8022 + }, + { + "epoch": 0.6483363300268692, + "grad_norm": 2.9717183113098145, + "learning_rate": 7.897214777622466e-06, + "loss": 0.9062, + "step": 8023 + }, + { + "epoch": 0.6484171397401968, + "grad_norm": 2.705498456954956, + "learning_rate": 7.896681439290746e-06, + "loss": 0.9699, + "step": 8024 + }, + { + "epoch": 0.6484979494535243, + "grad_norm": 2.6744043827056885, + "learning_rate": 7.896148051345847e-06, + "loss": 0.8502, + "step": 8025 + }, + { + "epoch": 0.6485787591668518, + "grad_norm": 2.67856502532959, + "learning_rate": 7.895614613796905e-06, + "loss": 0.9153, + "step": 8026 + }, + { + "epoch": 0.6486595688801794, + "grad_norm": 2.8819680213928223, + "learning_rate": 7.895081126653055e-06, + "loss": 0.8608, + "step": 8027 + }, + { + "epoch": 0.648740378593507, + "grad_norm": 2.570169687271118, + "learning_rate": 7.894547589923434e-06, + "loss": 0.9676, + "step": 8028 + }, + { + "epoch": 0.6488211883068344, + "grad_norm": 2.6497445106506348, + "learning_rate": 7.89401400361718e-06, + "loss": 0.9532, + "step": 8029 + }, + { + "epoch": 0.648901998020162, + "grad_norm": 2.6665828227996826, + "learning_rate": 7.893480367743435e-06, + "loss": 0.8863, + "step": 8030 + }, + { + "epoch": 0.6489828077334896, + "grad_norm": 2.9539785385131836, + "learning_rate": 7.892946682311337e-06, + "loss": 0.9181, + "step": 8031 + }, + { + "epoch": 0.6490636174468171, + "grad_norm": 2.391578197479248, + "learning_rate": 7.892412947330027e-06, + "loss": 0.8551, + "step": 8032 + }, + { + "epoch": 0.6491444271601446, + "grad_norm": 3.2689549922943115, + "learning_rate": 7.891879162808647e-06, + "loss": 0.8375, + "step": 8033 + }, + { + "epoch": 0.6492252368734722, + "grad_norm": 2.7801010608673096, + "learning_rate": 7.891345328756336e-06, + "loss": 1.0153, + "step": 8034 + }, + { + "epoch": 0.6493060465867997, + "grad_norm": 3.0100972652435303, + "learning_rate": 7.890811445182242e-06, + "loss": 0.9091, + "step": 8035 + }, + { + "epoch": 0.6493868563001273, + "grad_norm": 2.7918031215667725, + "learning_rate": 7.890277512095508e-06, + "loss": 0.8684, + "step": 8036 + }, + { + "epoch": 0.6494676660134548, + "grad_norm": 2.637303113937378, + "learning_rate": 7.889743529505279e-06, + "loss": 1.0033, + "step": 8037 + }, + { + "epoch": 0.6495484757267823, + "grad_norm": 2.579993963241577, + "learning_rate": 7.889209497420698e-06, + "loss": 0.9412, + "step": 8038 + }, + { + "epoch": 0.6496292854401099, + "grad_norm": 2.736687421798706, + "learning_rate": 7.888675415850915e-06, + "loss": 0.9336, + "step": 8039 + }, + { + "epoch": 0.6497100951534375, + "grad_norm": 2.3833227157592773, + "learning_rate": 7.888141284805076e-06, + "loss": 0.9242, + "step": 8040 + }, + { + "epoch": 0.6497909048667649, + "grad_norm": 3.161198616027832, + "learning_rate": 7.887607104292329e-06, + "loss": 0.9606, + "step": 8041 + }, + { + "epoch": 0.6498717145800925, + "grad_norm": 2.431769847869873, + "learning_rate": 7.887072874321824e-06, + "loss": 0.959, + "step": 8042 + }, + { + "epoch": 0.6499525242934201, + "grad_norm": 2.372891902923584, + "learning_rate": 7.886538594902712e-06, + "loss": 1.0232, + "step": 8043 + }, + { + "epoch": 0.6500333340067476, + "grad_norm": 2.8621509075164795, + "learning_rate": 7.886004266044143e-06, + "loss": 0.8678, + "step": 8044 + }, + { + "epoch": 0.6501141437200751, + "grad_norm": 2.5629258155822754, + "learning_rate": 7.885469887755269e-06, + "loss": 0.9922, + "step": 8045 + }, + { + "epoch": 0.6501949534334027, + "grad_norm": 2.5273284912109375, + "learning_rate": 7.88493546004524e-06, + "loss": 0.8521, + "step": 8046 + }, + { + "epoch": 0.6502757631467302, + "grad_norm": 2.6331684589385986, + "learning_rate": 7.884400982923214e-06, + "loss": 0.902, + "step": 8047 + }, + { + "epoch": 0.6503565728600578, + "grad_norm": 2.39434552192688, + "learning_rate": 7.883866456398341e-06, + "loss": 1.0303, + "step": 8048 + }, + { + "epoch": 0.6504373825733853, + "grad_norm": 2.408538818359375, + "learning_rate": 7.88333188047978e-06, + "loss": 0.9691, + "step": 8049 + }, + { + "epoch": 0.6505181922867128, + "grad_norm": 2.6992383003234863, + "learning_rate": 7.882797255176685e-06, + "loss": 0.8808, + "step": 8050 + }, + { + "epoch": 0.6505990020000404, + "grad_norm": 2.766754150390625, + "learning_rate": 7.882262580498213e-06, + "loss": 0.9663, + "step": 8051 + }, + { + "epoch": 0.650679811713368, + "grad_norm": 2.825565814971924, + "learning_rate": 7.881727856453522e-06, + "loss": 0.911, + "step": 8052 + }, + { + "epoch": 0.6507606214266954, + "grad_norm": 2.7521347999572754, + "learning_rate": 7.881193083051768e-06, + "loss": 0.9249, + "step": 8053 + }, + { + "epoch": 0.650841431140023, + "grad_norm": 2.808980703353882, + "learning_rate": 7.880658260302116e-06, + "loss": 0.9069, + "step": 8054 + }, + { + "epoch": 0.6509222408533506, + "grad_norm": 2.2507877349853516, + "learning_rate": 7.880123388213722e-06, + "loss": 0.895, + "step": 8055 + }, + { + "epoch": 0.6510030505666781, + "grad_norm": 2.9410057067871094, + "learning_rate": 7.879588466795746e-06, + "loss": 0.969, + "step": 8056 + }, + { + "epoch": 0.6510838602800056, + "grad_norm": 3.09829044342041, + "learning_rate": 7.879053496057355e-06, + "loss": 0.9526, + "step": 8057 + }, + { + "epoch": 0.6511646699933332, + "grad_norm": 2.6134846210479736, + "learning_rate": 7.878518476007707e-06, + "loss": 1.0055, + "step": 8058 + }, + { + "epoch": 0.6512454797066607, + "grad_norm": 2.373108386993408, + "learning_rate": 7.877983406655968e-06, + "loss": 0.8778, + "step": 8059 + }, + { + "epoch": 0.6513262894199883, + "grad_norm": 2.6206023693084717, + "learning_rate": 7.877448288011299e-06, + "loss": 1.0567, + "step": 8060 + }, + { + "epoch": 0.6514070991333158, + "grad_norm": 2.493279218673706, + "learning_rate": 7.876913120082871e-06, + "loss": 1.0763, + "step": 8061 + }, + { + "epoch": 0.6514879088466433, + "grad_norm": 2.254305839538574, + "learning_rate": 7.876377902879845e-06, + "loss": 1.0401, + "step": 8062 + }, + { + "epoch": 0.6515687185599709, + "grad_norm": 2.1235649585723877, + "learning_rate": 7.875842636411391e-06, + "loss": 1.0286, + "step": 8063 + }, + { + "epoch": 0.6516495282732985, + "grad_norm": 3.1026573181152344, + "learning_rate": 7.875307320686677e-06, + "loss": 0.9726, + "step": 8064 + }, + { + "epoch": 0.6517303379866259, + "grad_norm": 2.582230567932129, + "learning_rate": 7.874771955714869e-06, + "loss": 0.979, + "step": 8065 + }, + { + "epoch": 0.6518111476999535, + "grad_norm": 2.8377366065979004, + "learning_rate": 7.874236541505141e-06, + "loss": 0.9474, + "step": 8066 + }, + { + "epoch": 0.6518919574132811, + "grad_norm": 2.4523541927337646, + "learning_rate": 7.873701078066656e-06, + "loss": 0.9977, + "step": 8067 + }, + { + "epoch": 0.6519727671266086, + "grad_norm": 2.3577163219451904, + "learning_rate": 7.873165565408592e-06, + "loss": 0.9095, + "step": 8068 + }, + { + "epoch": 0.6520535768399361, + "grad_norm": 2.790398597717285, + "learning_rate": 7.87263000354012e-06, + "loss": 0.8775, + "step": 8069 + }, + { + "epoch": 0.6521343865532637, + "grad_norm": 2.748030185699463, + "learning_rate": 7.87209439247041e-06, + "loss": 1.1089, + "step": 8070 + }, + { + "epoch": 0.6522151962665912, + "grad_norm": 2.7407009601593018, + "learning_rate": 7.871558732208637e-06, + "loss": 0.8933, + "step": 8071 + }, + { + "epoch": 0.6522960059799188, + "grad_norm": 2.4599592685699463, + "learning_rate": 7.871023022763978e-06, + "loss": 0.8096, + "step": 8072 + }, + { + "epoch": 0.6523768156932463, + "grad_norm": 2.2446229457855225, + "learning_rate": 7.870487264145605e-06, + "loss": 0.9436, + "step": 8073 + }, + { + "epoch": 0.6524576254065738, + "grad_norm": 2.4470279216766357, + "learning_rate": 7.869951456362694e-06, + "loss": 1.0174, + "step": 8074 + }, + { + "epoch": 0.6525384351199014, + "grad_norm": 2.4033892154693604, + "learning_rate": 7.869415599424428e-06, + "loss": 1.0154, + "step": 8075 + }, + { + "epoch": 0.652619244833229, + "grad_norm": 2.594316005706787, + "learning_rate": 7.868879693339975e-06, + "loss": 0.8648, + "step": 8076 + }, + { + "epoch": 0.6527000545465564, + "grad_norm": 2.740948438644409, + "learning_rate": 7.868343738118523e-06, + "loss": 1.2255, + "step": 8077 + }, + { + "epoch": 0.652780864259884, + "grad_norm": 2.3592514991760254, + "learning_rate": 7.867807733769249e-06, + "loss": 0.901, + "step": 8078 + }, + { + "epoch": 0.6528616739732116, + "grad_norm": 2.428778886795044, + "learning_rate": 7.86727168030133e-06, + "loss": 0.9368, + "step": 8079 + }, + { + "epoch": 0.6529424836865392, + "grad_norm": 2.8880114555358887, + "learning_rate": 7.86673557772395e-06, + "loss": 1.0381, + "step": 8080 + }, + { + "epoch": 0.6530232933998666, + "grad_norm": 2.7028584480285645, + "learning_rate": 7.866199426046292e-06, + "loss": 0.942, + "step": 8081 + }, + { + "epoch": 0.6531041031131942, + "grad_norm": 2.4617974758148193, + "learning_rate": 7.865663225277537e-06, + "loss": 0.9983, + "step": 8082 + }, + { + "epoch": 0.6531849128265218, + "grad_norm": 2.67744517326355, + "learning_rate": 7.86512697542687e-06, + "loss": 0.9058, + "step": 8083 + }, + { + "epoch": 0.6532657225398493, + "grad_norm": 2.6994595527648926, + "learning_rate": 7.864590676503477e-06, + "loss": 0.8427, + "step": 8084 + }, + { + "epoch": 0.6533465322531768, + "grad_norm": 2.9935851097106934, + "learning_rate": 7.864054328516539e-06, + "loss": 1.068, + "step": 8085 + }, + { + "epoch": 0.6534273419665044, + "grad_norm": 2.4924216270446777, + "learning_rate": 7.863517931475247e-06, + "loss": 0.8998, + "step": 8086 + }, + { + "epoch": 0.6535081516798319, + "grad_norm": 2.3296256065368652, + "learning_rate": 7.862981485388787e-06, + "loss": 0.9844, + "step": 8087 + }, + { + "epoch": 0.6535889613931595, + "grad_norm": 2.7485809326171875, + "learning_rate": 7.862444990266346e-06, + "loss": 0.899, + "step": 8088 + }, + { + "epoch": 0.653669771106487, + "grad_norm": 3.0662386417388916, + "learning_rate": 7.861908446117112e-06, + "loss": 0.9192, + "step": 8089 + }, + { + "epoch": 0.6537505808198145, + "grad_norm": 2.37210750579834, + "learning_rate": 7.861371852950277e-06, + "loss": 0.8889, + "step": 8090 + }, + { + "epoch": 0.6538313905331421, + "grad_norm": 2.0970723628997803, + "learning_rate": 7.860835210775032e-06, + "loss": 1.0256, + "step": 8091 + }, + { + "epoch": 0.6539122002464697, + "grad_norm": 2.4767050743103027, + "learning_rate": 7.860298519600567e-06, + "loss": 0.9691, + "step": 8092 + }, + { + "epoch": 0.6539930099597971, + "grad_norm": 2.7629003524780273, + "learning_rate": 7.859761779436073e-06, + "loss": 0.9086, + "step": 8093 + }, + { + "epoch": 0.6540738196731247, + "grad_norm": 2.7053186893463135, + "learning_rate": 7.859224990290744e-06, + "loss": 1.0455, + "step": 8094 + }, + { + "epoch": 0.6541546293864523, + "grad_norm": 2.660536050796509, + "learning_rate": 7.858688152173774e-06, + "loss": 0.955, + "step": 8095 + }, + { + "epoch": 0.6542354390997798, + "grad_norm": 2.680800437927246, + "learning_rate": 7.858151265094358e-06, + "loss": 0.888, + "step": 8096 + }, + { + "epoch": 0.6543162488131073, + "grad_norm": 2.6603574752807617, + "learning_rate": 7.857614329061694e-06, + "loss": 1.0141, + "step": 8097 + }, + { + "epoch": 0.6543970585264349, + "grad_norm": 2.6089746952056885, + "learning_rate": 7.857077344084973e-06, + "loss": 0.8461, + "step": 8098 + }, + { + "epoch": 0.6544778682397624, + "grad_norm": 2.3982930183410645, + "learning_rate": 7.856540310173397e-06, + "loss": 0.9372, + "step": 8099 + }, + { + "epoch": 0.65455867795309, + "grad_norm": 2.7135848999023438, + "learning_rate": 7.856003227336163e-06, + "loss": 0.9828, + "step": 8100 + }, + { + "epoch": 0.6546394876664176, + "grad_norm": 2.6838266849517822, + "learning_rate": 7.855466095582466e-06, + "loss": 1.0033, + "step": 8101 + }, + { + "epoch": 0.654720297379745, + "grad_norm": 3.121654510498047, + "learning_rate": 7.854928914921511e-06, + "loss": 0.8824, + "step": 8102 + }, + { + "epoch": 0.6548011070930726, + "grad_norm": 2.817322254180908, + "learning_rate": 7.854391685362497e-06, + "loss": 1.0008, + "step": 8103 + }, + { + "epoch": 0.6548819168064002, + "grad_norm": 2.29815673828125, + "learning_rate": 7.853854406914625e-06, + "loss": 0.842, + "step": 8104 + }, + { + "epoch": 0.6549627265197276, + "grad_norm": 2.7330162525177, + "learning_rate": 7.853317079587097e-06, + "loss": 0.9539, + "step": 8105 + }, + { + "epoch": 0.6550435362330552, + "grad_norm": 2.7535059452056885, + "learning_rate": 7.852779703389117e-06, + "loss": 0.7586, + "step": 8106 + }, + { + "epoch": 0.6551243459463828, + "grad_norm": 2.806523084640503, + "learning_rate": 7.852242278329887e-06, + "loss": 1.0183, + "step": 8107 + }, + { + "epoch": 0.6552051556597103, + "grad_norm": 3.1190836429595947, + "learning_rate": 7.851704804418615e-06, + "loss": 0.844, + "step": 8108 + }, + { + "epoch": 0.6552859653730378, + "grad_norm": 2.6873362064361572, + "learning_rate": 7.851167281664505e-06, + "loss": 0.885, + "step": 8109 + }, + { + "epoch": 0.6553667750863654, + "grad_norm": 2.3540689945220947, + "learning_rate": 7.850629710076761e-06, + "loss": 1.0014, + "step": 8110 + }, + { + "epoch": 0.6554475847996929, + "grad_norm": 2.5897302627563477, + "learning_rate": 7.850092089664596e-06, + "loss": 1.1105, + "step": 8111 + }, + { + "epoch": 0.6555283945130205, + "grad_norm": 2.765284299850464, + "learning_rate": 7.849554420437212e-06, + "loss": 0.9382, + "step": 8112 + }, + { + "epoch": 0.655609204226348, + "grad_norm": 2.3324174880981445, + "learning_rate": 7.849016702403822e-06, + "loss": 0.9636, + "step": 8113 + }, + { + "epoch": 0.6556900139396755, + "grad_norm": 2.6708390712738037, + "learning_rate": 7.848478935573636e-06, + "loss": 0.979, + "step": 8114 + }, + { + "epoch": 0.6557708236530031, + "grad_norm": 2.6447081565856934, + "learning_rate": 7.84794111995586e-06, + "loss": 1.0572, + "step": 8115 + }, + { + "epoch": 0.6558516333663307, + "grad_norm": 2.8259623050689697, + "learning_rate": 7.847403255559712e-06, + "loss": 0.9194, + "step": 8116 + }, + { + "epoch": 0.6559324430796581, + "grad_norm": 2.4877662658691406, + "learning_rate": 7.846865342394399e-06, + "loss": 1.0061, + "step": 8117 + }, + { + "epoch": 0.6560132527929857, + "grad_norm": 2.4083447456359863, + "learning_rate": 7.846327380469136e-06, + "loss": 1.0343, + "step": 8118 + }, + { + "epoch": 0.6560940625063133, + "grad_norm": 2.6998558044433594, + "learning_rate": 7.84578936979314e-06, + "loss": 1.0667, + "step": 8119 + }, + { + "epoch": 0.6561748722196408, + "grad_norm": 3.0838003158569336, + "learning_rate": 7.845251310375622e-06, + "loss": 0.9375, + "step": 8120 + }, + { + "epoch": 0.6562556819329683, + "grad_norm": 2.4537813663482666, + "learning_rate": 7.844713202225796e-06, + "loss": 0.8858, + "step": 8121 + }, + { + "epoch": 0.6563364916462959, + "grad_norm": 2.4450929164886475, + "learning_rate": 7.844175045352883e-06, + "loss": 1.0421, + "step": 8122 + }, + { + "epoch": 0.6564173013596234, + "grad_norm": 3.038662910461426, + "learning_rate": 7.843636839766098e-06, + "loss": 0.9504, + "step": 8123 + }, + { + "epoch": 0.656498111072951, + "grad_norm": 2.5889804363250732, + "learning_rate": 7.843098585474661e-06, + "loss": 0.9088, + "step": 8124 + }, + { + "epoch": 0.6565789207862786, + "grad_norm": 3.5918288230895996, + "learning_rate": 7.84256028248779e-06, + "loss": 0.9749, + "step": 8125 + }, + { + "epoch": 0.656659730499606, + "grad_norm": 2.735668659210205, + "learning_rate": 7.842021930814704e-06, + "loss": 1.0207, + "step": 8126 + }, + { + "epoch": 0.6567405402129336, + "grad_norm": 2.8185863494873047, + "learning_rate": 7.841483530464622e-06, + "loss": 0.9252, + "step": 8127 + }, + { + "epoch": 0.6568213499262612, + "grad_norm": 2.4732277393341064, + "learning_rate": 7.840945081446771e-06, + "loss": 0.8831, + "step": 8128 + }, + { + "epoch": 0.6569021596395886, + "grad_norm": 2.7103240489959717, + "learning_rate": 7.840406583770367e-06, + "loss": 0.9958, + "step": 8129 + }, + { + "epoch": 0.6569829693529162, + "grad_norm": 2.8222081661224365, + "learning_rate": 7.839868037444638e-06, + "loss": 0.897, + "step": 8130 + }, + { + "epoch": 0.6570637790662438, + "grad_norm": 2.4207465648651123, + "learning_rate": 7.839329442478808e-06, + "loss": 1.0815, + "step": 8131 + }, + { + "epoch": 0.6571445887795713, + "grad_norm": 2.7852301597595215, + "learning_rate": 7.838790798882097e-06, + "loss": 0.929, + "step": 8132 + }, + { + "epoch": 0.6572253984928988, + "grad_norm": 3.249326705932617, + "learning_rate": 7.838252106663735e-06, + "loss": 0.9907, + "step": 8133 + }, + { + "epoch": 0.6573062082062264, + "grad_norm": 3.042463779449463, + "learning_rate": 7.837713365832946e-06, + "loss": 0.8892, + "step": 8134 + }, + { + "epoch": 0.6573870179195539, + "grad_norm": 2.8553247451782227, + "learning_rate": 7.83717457639896e-06, + "loss": 0.9131, + "step": 8135 + }, + { + "epoch": 0.6574678276328815, + "grad_norm": 2.646902084350586, + "learning_rate": 7.836635738371003e-06, + "loss": 0.8496, + "step": 8136 + }, + { + "epoch": 0.657548637346209, + "grad_norm": 2.9965171813964844, + "learning_rate": 7.836096851758305e-06, + "loss": 0.859, + "step": 8137 + }, + { + "epoch": 0.6576294470595365, + "grad_norm": 2.6346452236175537, + "learning_rate": 7.835557916570096e-06, + "loss": 1.0676, + "step": 8138 + }, + { + "epoch": 0.6577102567728641, + "grad_norm": 2.815836191177368, + "learning_rate": 7.835018932815607e-06, + "loss": 0.9813, + "step": 8139 + }, + { + "epoch": 0.6577910664861917, + "grad_norm": 2.4747209548950195, + "learning_rate": 7.834479900504066e-06, + "loss": 0.9141, + "step": 8140 + }, + { + "epoch": 0.6578718761995191, + "grad_norm": 3.1171391010284424, + "learning_rate": 7.83394081964471e-06, + "loss": 0.9211, + "step": 8141 + }, + { + "epoch": 0.6579526859128467, + "grad_norm": 2.8758184909820557, + "learning_rate": 7.83340169024677e-06, + "loss": 1.0595, + "step": 8142 + }, + { + "epoch": 0.6580334956261743, + "grad_norm": 2.6710498332977295, + "learning_rate": 7.832862512319481e-06, + "loss": 0.9622, + "step": 8143 + }, + { + "epoch": 0.6581143053395018, + "grad_norm": 2.159796953201294, + "learning_rate": 7.832323285872074e-06, + "loss": 1.0378, + "step": 8144 + }, + { + "epoch": 0.6581951150528293, + "grad_norm": 2.8067142963409424, + "learning_rate": 7.83178401091379e-06, + "loss": 0.9905, + "step": 8145 + }, + { + "epoch": 0.6582759247661569, + "grad_norm": 2.759702205657959, + "learning_rate": 7.831244687453864e-06, + "loss": 0.9221, + "step": 8146 + }, + { + "epoch": 0.6583567344794844, + "grad_norm": 3.2061405181884766, + "learning_rate": 7.83070531550153e-06, + "loss": 0.9053, + "step": 8147 + }, + { + "epoch": 0.658437544192812, + "grad_norm": 2.638150215148926, + "learning_rate": 7.83016589506603e-06, + "loss": 0.8964, + "step": 8148 + }, + { + "epoch": 0.6585183539061396, + "grad_norm": 3.284224510192871, + "learning_rate": 7.829626426156602e-06, + "loss": 1.04, + "step": 8149 + }, + { + "epoch": 0.658599163619467, + "grad_norm": 2.38773512840271, + "learning_rate": 7.829086908782485e-06, + "loss": 0.9748, + "step": 8150 + }, + { + "epoch": 0.6586799733327946, + "grad_norm": 2.14300274848938, + "learning_rate": 7.828547342952919e-06, + "loss": 0.9204, + "step": 8151 + }, + { + "epoch": 0.6587607830461222, + "grad_norm": 2.5345678329467773, + "learning_rate": 7.828007728677146e-06, + "loss": 0.9704, + "step": 8152 + }, + { + "epoch": 0.6588415927594496, + "grad_norm": 2.6654064655303955, + "learning_rate": 7.827468065964412e-06, + "loss": 0.9997, + "step": 8153 + }, + { + "epoch": 0.6589224024727772, + "grad_norm": 2.908874273300171, + "learning_rate": 7.826928354823954e-06, + "loss": 0.9124, + "step": 8154 + }, + { + "epoch": 0.6590032121861048, + "grad_norm": 3.229367733001709, + "learning_rate": 7.82638859526502e-06, + "loss": 0.9907, + "step": 8155 + }, + { + "epoch": 0.6590840218994323, + "grad_norm": 2.381117820739746, + "learning_rate": 7.825848787296853e-06, + "loss": 0.9387, + "step": 8156 + }, + { + "epoch": 0.6591648316127598, + "grad_norm": 2.5243749618530273, + "learning_rate": 7.825308930928699e-06, + "loss": 1.1287, + "step": 8157 + }, + { + "epoch": 0.6592456413260874, + "grad_norm": 2.5855326652526855, + "learning_rate": 7.824769026169807e-06, + "loss": 1.01, + "step": 8158 + }, + { + "epoch": 0.6593264510394149, + "grad_norm": 2.3746542930603027, + "learning_rate": 7.824229073029419e-06, + "loss": 1.0427, + "step": 8159 + }, + { + "epoch": 0.6594072607527425, + "grad_norm": 2.572866439819336, + "learning_rate": 7.823689071516787e-06, + "loss": 1.0039, + "step": 8160 + }, + { + "epoch": 0.65948807046607, + "grad_norm": 2.453367233276367, + "learning_rate": 7.823149021641159e-06, + "loss": 1.0211, + "step": 8161 + }, + { + "epoch": 0.6595688801793975, + "grad_norm": 2.7661163806915283, + "learning_rate": 7.822608923411786e-06, + "loss": 1.0553, + "step": 8162 + }, + { + "epoch": 0.6596496898927251, + "grad_norm": 2.6568291187286377, + "learning_rate": 7.822068776837914e-06, + "loss": 1.011, + "step": 8163 + }, + { + "epoch": 0.6597304996060527, + "grad_norm": 2.8543918132781982, + "learning_rate": 7.821528581928802e-06, + "loss": 0.9701, + "step": 8164 + }, + { + "epoch": 0.6598113093193801, + "grad_norm": 2.549863576889038, + "learning_rate": 7.820988338693694e-06, + "loss": 0.9714, + "step": 8165 + }, + { + "epoch": 0.6598921190327077, + "grad_norm": 2.6213786602020264, + "learning_rate": 7.82044804714185e-06, + "loss": 0.9224, + "step": 8166 + }, + { + "epoch": 0.6599729287460353, + "grad_norm": 2.7831413745880127, + "learning_rate": 7.81990770728252e-06, + "loss": 0.8969, + "step": 8167 + }, + { + "epoch": 0.6600537384593628, + "grad_norm": 2.559598922729492, + "learning_rate": 7.819367319124958e-06, + "loss": 1.0287, + "step": 8168 + }, + { + "epoch": 0.6601345481726903, + "grad_norm": 2.988743543624878, + "learning_rate": 7.818826882678423e-06, + "loss": 0.9466, + "step": 8169 + }, + { + "epoch": 0.6602153578860179, + "grad_norm": 2.464355945587158, + "learning_rate": 7.818286397952168e-06, + "loss": 1.0238, + "step": 8170 + }, + { + "epoch": 0.6602961675993454, + "grad_norm": 2.460078001022339, + "learning_rate": 7.817745864955452e-06, + "loss": 1.0904, + "step": 8171 + }, + { + "epoch": 0.660376977312673, + "grad_norm": 2.3599300384521484, + "learning_rate": 7.817205283697535e-06, + "loss": 1.0396, + "step": 8172 + }, + { + "epoch": 0.6604577870260006, + "grad_norm": 2.6572680473327637, + "learning_rate": 7.816664654187673e-06, + "loss": 0.9153, + "step": 8173 + }, + { + "epoch": 0.660538596739328, + "grad_norm": 2.1598360538482666, + "learning_rate": 7.816123976435125e-06, + "loss": 1.0339, + "step": 8174 + }, + { + "epoch": 0.6606194064526556, + "grad_norm": 3.005772590637207, + "learning_rate": 7.815583250449152e-06, + "loss": 0.9471, + "step": 8175 + }, + { + "epoch": 0.6607002161659832, + "grad_norm": 2.7757091522216797, + "learning_rate": 7.815042476239018e-06, + "loss": 0.898, + "step": 8176 + }, + { + "epoch": 0.6607810258793106, + "grad_norm": 2.637239456176758, + "learning_rate": 7.814501653813984e-06, + "loss": 0.9423, + "step": 8177 + }, + { + "epoch": 0.6608618355926382, + "grad_norm": 2.377521514892578, + "learning_rate": 7.81396078318331e-06, + "loss": 0.9358, + "step": 8178 + }, + { + "epoch": 0.6609426453059658, + "grad_norm": 2.9557089805603027, + "learning_rate": 7.813419864356264e-06, + "loss": 0.9624, + "step": 8179 + }, + { + "epoch": 0.6610234550192933, + "grad_norm": 2.4435203075408936, + "learning_rate": 7.812878897342107e-06, + "loss": 0.8897, + "step": 8180 + }, + { + "epoch": 0.6611042647326208, + "grad_norm": 2.9663431644439697, + "learning_rate": 7.812337882150108e-06, + "loss": 0.8366, + "step": 8181 + }, + { + "epoch": 0.6611850744459484, + "grad_norm": 3.0248186588287354, + "learning_rate": 7.81179681878953e-06, + "loss": 0.9345, + "step": 8182 + }, + { + "epoch": 0.6612658841592759, + "grad_norm": 3.1231157779693604, + "learning_rate": 7.811255707269642e-06, + "loss": 1.0285, + "step": 8183 + }, + { + "epoch": 0.6613466938726035, + "grad_norm": 2.6495866775512695, + "learning_rate": 7.810714547599714e-06, + "loss": 0.8142, + "step": 8184 + }, + { + "epoch": 0.661427503585931, + "grad_norm": 2.3191978931427, + "learning_rate": 7.81017333978901e-06, + "loss": 0.9625, + "step": 8185 + }, + { + "epoch": 0.6615083132992585, + "grad_norm": 2.4325387477874756, + "learning_rate": 7.8096320838468e-06, + "loss": 0.8434, + "step": 8186 + }, + { + "epoch": 0.6615891230125861, + "grad_norm": 2.579108238220215, + "learning_rate": 7.80909077978236e-06, + "loss": 0.9275, + "step": 8187 + }, + { + "epoch": 0.6616699327259137, + "grad_norm": 2.8837814331054688, + "learning_rate": 7.808549427604955e-06, + "loss": 1.0955, + "step": 8188 + }, + { + "epoch": 0.6617507424392411, + "grad_norm": 2.698129415512085, + "learning_rate": 7.80800802732386e-06, + "loss": 0.9433, + "step": 8189 + }, + { + "epoch": 0.6618315521525687, + "grad_norm": 2.9319772720336914, + "learning_rate": 7.807466578948349e-06, + "loss": 1.0015, + "step": 8190 + }, + { + "epoch": 0.6619123618658963, + "grad_norm": 2.947528123855591, + "learning_rate": 7.806925082487694e-06, + "loss": 1.0496, + "step": 8191 + }, + { + "epoch": 0.6619931715792238, + "grad_norm": 2.6251354217529297, + "learning_rate": 7.806383537951169e-06, + "loss": 0.9827, + "step": 8192 + }, + { + "epoch": 0.6620739812925514, + "grad_norm": 2.5413177013397217, + "learning_rate": 7.805841945348049e-06, + "loss": 0.9414, + "step": 8193 + }, + { + "epoch": 0.6621547910058789, + "grad_norm": 2.597057580947876, + "learning_rate": 7.805300304687614e-06, + "loss": 0.8435, + "step": 8194 + }, + { + "epoch": 0.6622356007192064, + "grad_norm": 2.618741989135742, + "learning_rate": 7.804758615979136e-06, + "loss": 0.9952, + "step": 8195 + }, + { + "epoch": 0.662316410432534, + "grad_norm": 2.9278066158294678, + "learning_rate": 7.804216879231894e-06, + "loss": 0.8114, + "step": 8196 + }, + { + "epoch": 0.6623972201458616, + "grad_norm": 2.789865255355835, + "learning_rate": 7.803675094455171e-06, + "loss": 0.9201, + "step": 8197 + }, + { + "epoch": 0.662478029859189, + "grad_norm": 3.073566436767578, + "learning_rate": 7.803133261658242e-06, + "loss": 0.9657, + "step": 8198 + }, + { + "epoch": 0.6625588395725166, + "grad_norm": 2.890922784805298, + "learning_rate": 7.802591380850386e-06, + "loss": 1.0548, + "step": 8199 + }, + { + "epoch": 0.6626396492858442, + "grad_norm": 2.835726737976074, + "learning_rate": 7.80204945204089e-06, + "loss": 0.8772, + "step": 8200 + }, + { + "epoch": 0.6627204589991716, + "grad_norm": 2.594926357269287, + "learning_rate": 7.801507475239032e-06, + "loss": 0.9912, + "step": 8201 + }, + { + "epoch": 0.6628012687124992, + "grad_norm": 2.3858537673950195, + "learning_rate": 7.800965450454095e-06, + "loss": 0.9948, + "step": 8202 + }, + { + "epoch": 0.6628820784258268, + "grad_norm": 2.5020763874053955, + "learning_rate": 7.800423377695363e-06, + "loss": 0.9253, + "step": 8203 + }, + { + "epoch": 0.6629628881391543, + "grad_norm": 2.653742790222168, + "learning_rate": 7.799881256972118e-06, + "loss": 0.8953, + "step": 8204 + }, + { + "epoch": 0.6630436978524819, + "grad_norm": 2.5069143772125244, + "learning_rate": 7.799339088293649e-06, + "loss": 0.8956, + "step": 8205 + }, + { + "epoch": 0.6631245075658094, + "grad_norm": 3.1007113456726074, + "learning_rate": 7.798796871669242e-06, + "loss": 0.9522, + "step": 8206 + }, + { + "epoch": 0.663205317279137, + "grad_norm": 2.8476850986480713, + "learning_rate": 7.79825460710818e-06, + "loss": 0.9397, + "step": 8207 + }, + { + "epoch": 0.6632861269924645, + "grad_norm": 2.517516851425171, + "learning_rate": 7.797712294619754e-06, + "loss": 1.0802, + "step": 8208 + }, + { + "epoch": 0.663366936705792, + "grad_norm": 2.7888779640197754, + "learning_rate": 7.797169934213253e-06, + "loss": 0.9823, + "step": 8209 + }, + { + "epoch": 0.6634477464191196, + "grad_norm": 2.2836861610412598, + "learning_rate": 7.796627525897964e-06, + "loss": 0.9072, + "step": 8210 + }, + { + "epoch": 0.6635285561324471, + "grad_norm": 2.598681688308716, + "learning_rate": 7.796085069683178e-06, + "loss": 1.1079, + "step": 8211 + }, + { + "epoch": 0.6636093658457747, + "grad_norm": 2.5902888774871826, + "learning_rate": 7.795542565578187e-06, + "loss": 0.8208, + "step": 8212 + }, + { + "epoch": 0.6636901755591023, + "grad_norm": 3.1470422744750977, + "learning_rate": 7.79500001359228e-06, + "loss": 0.9458, + "step": 8213 + }, + { + "epoch": 0.6637709852724297, + "grad_norm": 2.673978805541992, + "learning_rate": 7.794457413734753e-06, + "loss": 0.9343, + "step": 8214 + }, + { + "epoch": 0.6638517949857573, + "grad_norm": 2.3911330699920654, + "learning_rate": 7.793914766014898e-06, + "loss": 0.8567, + "step": 8215 + }, + { + "epoch": 0.6639326046990849, + "grad_norm": 2.661830186843872, + "learning_rate": 7.793372070442007e-06, + "loss": 1.0665, + "step": 8216 + }, + { + "epoch": 0.6640134144124124, + "grad_norm": 2.4099466800689697, + "learning_rate": 7.792829327025379e-06, + "loss": 0.9569, + "step": 8217 + }, + { + "epoch": 0.6640942241257399, + "grad_norm": 2.650916337966919, + "learning_rate": 7.792286535774307e-06, + "loss": 0.9633, + "step": 8218 + }, + { + "epoch": 0.6641750338390675, + "grad_norm": 2.642181873321533, + "learning_rate": 7.79174369669809e-06, + "loss": 0.9002, + "step": 8219 + }, + { + "epoch": 0.664255843552395, + "grad_norm": 2.3400585651397705, + "learning_rate": 7.791200809806025e-06, + "loss": 0.8768, + "step": 8220 + }, + { + "epoch": 0.6643366532657226, + "grad_norm": 2.772515296936035, + "learning_rate": 7.790657875107408e-06, + "loss": 1.0094, + "step": 8221 + }, + { + "epoch": 0.6644174629790501, + "grad_norm": 2.719703435897827, + "learning_rate": 7.79011489261154e-06, + "loss": 1.098, + "step": 8222 + }, + { + "epoch": 0.6644982726923776, + "grad_norm": 2.950763463973999, + "learning_rate": 7.789571862327721e-06, + "loss": 1.0463, + "step": 8223 + }, + { + "epoch": 0.6645790824057052, + "grad_norm": 3.0480339527130127, + "learning_rate": 7.78902878426525e-06, + "loss": 0.9207, + "step": 8224 + }, + { + "epoch": 0.6646598921190328, + "grad_norm": 2.881671190261841, + "learning_rate": 7.788485658433434e-06, + "loss": 0.955, + "step": 8225 + }, + { + "epoch": 0.6647407018323602, + "grad_norm": 2.436127185821533, + "learning_rate": 7.78794248484157e-06, + "loss": 0.8773, + "step": 8226 + }, + { + "epoch": 0.6648215115456878, + "grad_norm": 2.9438748359680176, + "learning_rate": 7.787399263498961e-06, + "loss": 0.8433, + "step": 8227 + }, + { + "epoch": 0.6649023212590154, + "grad_norm": 2.633422374725342, + "learning_rate": 7.786855994414915e-06, + "loss": 1.0823, + "step": 8228 + }, + { + "epoch": 0.6649831309723429, + "grad_norm": 2.4525909423828125, + "learning_rate": 7.786312677598736e-06, + "loss": 0.9231, + "step": 8229 + }, + { + "epoch": 0.6650639406856704, + "grad_norm": 2.606020212173462, + "learning_rate": 7.785769313059726e-06, + "loss": 0.8937, + "step": 8230 + }, + { + "epoch": 0.665144750398998, + "grad_norm": 2.752110242843628, + "learning_rate": 7.785225900807194e-06, + "loss": 0.9587, + "step": 8231 + }, + { + "epoch": 0.6652255601123255, + "grad_norm": 2.667178153991699, + "learning_rate": 7.78468244085045e-06, + "loss": 0.9951, + "step": 8232 + }, + { + "epoch": 0.665306369825653, + "grad_norm": 2.447206497192383, + "learning_rate": 7.784138933198798e-06, + "loss": 1.066, + "step": 8233 + }, + { + "epoch": 0.6653871795389806, + "grad_norm": 2.6468465328216553, + "learning_rate": 7.78359537786155e-06, + "loss": 0.8543, + "step": 8234 + }, + { + "epoch": 0.6654679892523081, + "grad_norm": 2.8490002155303955, + "learning_rate": 7.783051774848011e-06, + "loss": 1.0329, + "step": 8235 + }, + { + "epoch": 0.6655487989656357, + "grad_norm": 3.0898759365081787, + "learning_rate": 7.782508124167499e-06, + "loss": 0.8809, + "step": 8236 + }, + { + "epoch": 0.6656296086789633, + "grad_norm": 2.563471794128418, + "learning_rate": 7.78196442582932e-06, + "loss": 0.9191, + "step": 8237 + }, + { + "epoch": 0.6657104183922907, + "grad_norm": 2.8497154712677, + "learning_rate": 7.781420679842787e-06, + "loss": 1.063, + "step": 8238 + }, + { + "epoch": 0.6657912281056183, + "grad_norm": 2.5441057682037354, + "learning_rate": 7.780876886217215e-06, + "loss": 0.806, + "step": 8239 + }, + { + "epoch": 0.6658720378189459, + "grad_norm": 2.5067288875579834, + "learning_rate": 7.780333044961916e-06, + "loss": 0.8253, + "step": 8240 + }, + { + "epoch": 0.6659528475322734, + "grad_norm": 3.736954927444458, + "learning_rate": 7.779789156086203e-06, + "loss": 0.9403, + "step": 8241 + }, + { + "epoch": 0.6660336572456009, + "grad_norm": 3.1867945194244385, + "learning_rate": 7.779245219599397e-06, + "loss": 0.9003, + "step": 8242 + }, + { + "epoch": 0.6661144669589285, + "grad_norm": 2.6131770610809326, + "learning_rate": 7.778701235510811e-06, + "loss": 0.9347, + "step": 8243 + }, + { + "epoch": 0.666195276672256, + "grad_norm": 2.5567126274108887, + "learning_rate": 7.778157203829761e-06, + "loss": 0.814, + "step": 8244 + }, + { + "epoch": 0.6662760863855836, + "grad_norm": 2.6412107944488525, + "learning_rate": 7.777613124565567e-06, + "loss": 0.9403, + "step": 8245 + }, + { + "epoch": 0.6663568960989111, + "grad_norm": 2.68034291267395, + "learning_rate": 7.777068997727547e-06, + "loss": 0.9767, + "step": 8246 + }, + { + "epoch": 0.6664377058122386, + "grad_norm": 2.3616299629211426, + "learning_rate": 7.77652482332502e-06, + "loss": 1.0433, + "step": 8247 + }, + { + "epoch": 0.6665185155255662, + "grad_norm": 2.2110915184020996, + "learning_rate": 7.775980601367307e-06, + "loss": 1.0506, + "step": 8248 + }, + { + "epoch": 0.6665993252388938, + "grad_norm": 2.728952407836914, + "learning_rate": 7.775436331863731e-06, + "loss": 0.9137, + "step": 8249 + }, + { + "epoch": 0.6666801349522212, + "grad_norm": 2.613842487335205, + "learning_rate": 7.774892014823609e-06, + "loss": 1.0901, + "step": 8250 + }, + { + "epoch": 0.6667609446655488, + "grad_norm": 2.415443181991577, + "learning_rate": 7.774347650256268e-06, + "loss": 0.9084, + "step": 8251 + }, + { + "epoch": 0.6668417543788764, + "grad_norm": 2.3626067638397217, + "learning_rate": 7.773803238171031e-06, + "loss": 0.9239, + "step": 8252 + }, + { + "epoch": 0.6669225640922039, + "grad_norm": 2.2385334968566895, + "learning_rate": 7.773258778577224e-06, + "loss": 1.0494, + "step": 8253 + }, + { + "epoch": 0.6670033738055314, + "grad_norm": 2.796081304550171, + "learning_rate": 7.772714271484169e-06, + "loss": 0.9133, + "step": 8254 + }, + { + "epoch": 0.667084183518859, + "grad_norm": 2.7769646644592285, + "learning_rate": 7.772169716901194e-06, + "loss": 0.9149, + "step": 8255 + }, + { + "epoch": 0.6671649932321865, + "grad_norm": 2.5033700466156006, + "learning_rate": 7.771625114837625e-06, + "loss": 1.0176, + "step": 8256 + }, + { + "epoch": 0.667245802945514, + "grad_norm": 2.467130422592163, + "learning_rate": 7.77108046530279e-06, + "loss": 1.0605, + "step": 8257 + }, + { + "epoch": 0.6673266126588416, + "grad_norm": 2.6402151584625244, + "learning_rate": 7.770535768306019e-06, + "loss": 0.9663, + "step": 8258 + }, + { + "epoch": 0.6674074223721691, + "grad_norm": 2.4942843914031982, + "learning_rate": 7.76999102385664e-06, + "loss": 1.0421, + "step": 8259 + }, + { + "epoch": 0.6674882320854967, + "grad_norm": 2.8600966930389404, + "learning_rate": 7.769446231963982e-06, + "loss": 0.9089, + "step": 8260 + }, + { + "epoch": 0.6675690417988243, + "grad_norm": 2.88496470451355, + "learning_rate": 7.768901392637378e-06, + "loss": 0.9567, + "step": 8261 + }, + { + "epoch": 0.6676498515121517, + "grad_norm": 2.750972032546997, + "learning_rate": 7.768356505886158e-06, + "loss": 0.9366, + "step": 8262 + }, + { + "epoch": 0.6677306612254793, + "grad_norm": 3.0355007648468018, + "learning_rate": 7.767811571719657e-06, + "loss": 0.9799, + "step": 8263 + }, + { + "epoch": 0.6678114709388069, + "grad_norm": 2.573878765106201, + "learning_rate": 7.767266590147205e-06, + "loss": 0.9333, + "step": 8264 + }, + { + "epoch": 0.6678922806521344, + "grad_norm": 2.9451887607574463, + "learning_rate": 7.76672156117814e-06, + "loss": 0.948, + "step": 8265 + }, + { + "epoch": 0.6679730903654619, + "grad_norm": 2.7100412845611572, + "learning_rate": 7.766176484821794e-06, + "loss": 0.9268, + "step": 8266 + }, + { + "epoch": 0.6680539000787895, + "grad_norm": 2.3908088207244873, + "learning_rate": 7.765631361087507e-06, + "loss": 0.8855, + "step": 8267 + }, + { + "epoch": 0.668134709792117, + "grad_norm": 2.676058053970337, + "learning_rate": 7.765086189984609e-06, + "loss": 0.9573, + "step": 8268 + }, + { + "epoch": 0.6682155195054446, + "grad_norm": 2.635659694671631, + "learning_rate": 7.764540971522443e-06, + "loss": 1.0797, + "step": 8269 + }, + { + "epoch": 0.6682963292187721, + "grad_norm": 2.4136404991149902, + "learning_rate": 7.763995705710345e-06, + "loss": 0.9517, + "step": 8270 + }, + { + "epoch": 0.6683771389320996, + "grad_norm": 2.9774701595306396, + "learning_rate": 7.763450392557656e-06, + "loss": 0.9541, + "step": 8271 + }, + { + "epoch": 0.6684579486454272, + "grad_norm": 2.8616385459899902, + "learning_rate": 7.762905032073712e-06, + "loss": 1.0143, + "step": 8272 + }, + { + "epoch": 0.6685387583587548, + "grad_norm": 3.378654956817627, + "learning_rate": 7.76235962426786e-06, + "loss": 0.9294, + "step": 8273 + }, + { + "epoch": 0.6686195680720822, + "grad_norm": 2.612964630126953, + "learning_rate": 7.761814169149436e-06, + "loss": 0.9196, + "step": 8274 + }, + { + "epoch": 0.6687003777854098, + "grad_norm": 2.588641405105591, + "learning_rate": 7.761268666727782e-06, + "loss": 0.9922, + "step": 8275 + }, + { + "epoch": 0.6687811874987374, + "grad_norm": 2.8573877811431885, + "learning_rate": 7.760723117012245e-06, + "loss": 0.8514, + "step": 8276 + }, + { + "epoch": 0.6688619972120649, + "grad_norm": 2.6293280124664307, + "learning_rate": 7.760177520012167e-06, + "loss": 0.8541, + "step": 8277 + }, + { + "epoch": 0.6689428069253924, + "grad_norm": 2.5145294666290283, + "learning_rate": 7.759631875736892e-06, + "loss": 0.854, + "step": 8278 + }, + { + "epoch": 0.66902361663872, + "grad_norm": 2.9591727256774902, + "learning_rate": 7.75908618419577e-06, + "loss": 0.9361, + "step": 8279 + }, + { + "epoch": 0.6691044263520475, + "grad_norm": 2.414062023162842, + "learning_rate": 7.75854044539814e-06, + "loss": 0.8483, + "step": 8280 + }, + { + "epoch": 0.669185236065375, + "grad_norm": 2.8749969005584717, + "learning_rate": 7.757994659353354e-06, + "loss": 0.9572, + "step": 8281 + }, + { + "epoch": 0.6692660457787026, + "grad_norm": 2.8766791820526123, + "learning_rate": 7.757448826070761e-06, + "loss": 1.0735, + "step": 8282 + }, + { + "epoch": 0.6693468554920301, + "grad_norm": 2.4536221027374268, + "learning_rate": 7.756902945559705e-06, + "loss": 0.9159, + "step": 8283 + }, + { + "epoch": 0.6694276652053577, + "grad_norm": 3.069279193878174, + "learning_rate": 7.75635701782954e-06, + "loss": 0.9608, + "step": 8284 + }, + { + "epoch": 0.6695084749186853, + "grad_norm": 2.756054639816284, + "learning_rate": 7.755811042889615e-06, + "loss": 1.0801, + "step": 8285 + }, + { + "epoch": 0.6695892846320127, + "grad_norm": 2.6374106407165527, + "learning_rate": 7.755265020749281e-06, + "loss": 1.0975, + "step": 8286 + }, + { + "epoch": 0.6696700943453403, + "grad_norm": 2.7474169731140137, + "learning_rate": 7.75471895141789e-06, + "loss": 0.8723, + "step": 8287 + }, + { + "epoch": 0.6697509040586679, + "grad_norm": 2.4243593215942383, + "learning_rate": 7.754172834904797e-06, + "loss": 0.9728, + "step": 8288 + }, + { + "epoch": 0.6698317137719954, + "grad_norm": 2.5193612575531006, + "learning_rate": 7.753626671219352e-06, + "loss": 1.0481, + "step": 8289 + }, + { + "epoch": 0.6699125234853229, + "grad_norm": 2.3451879024505615, + "learning_rate": 7.753080460370912e-06, + "loss": 1.0229, + "step": 8290 + }, + { + "epoch": 0.6699933331986505, + "grad_norm": 4.024284362792969, + "learning_rate": 7.75253420236883e-06, + "loss": 0.7598, + "step": 8291 + }, + { + "epoch": 0.670074142911978, + "grad_norm": 2.246476888656616, + "learning_rate": 7.751987897222464e-06, + "loss": 1.0429, + "step": 8292 + }, + { + "epoch": 0.6701549526253056, + "grad_norm": 2.261608600616455, + "learning_rate": 7.751441544941171e-06, + "loss": 0.9731, + "step": 8293 + }, + { + "epoch": 0.6702357623386331, + "grad_norm": 3.0265629291534424, + "learning_rate": 7.750895145534308e-06, + "loss": 1.0262, + "step": 8294 + }, + { + "epoch": 0.6703165720519606, + "grad_norm": 2.9487316608428955, + "learning_rate": 7.750348699011233e-06, + "loss": 0.8359, + "step": 8295 + }, + { + "epoch": 0.6703973817652882, + "grad_norm": 2.4979705810546875, + "learning_rate": 7.749802205381307e-06, + "loss": 0.9042, + "step": 8296 + }, + { + "epoch": 0.6704781914786158, + "grad_norm": 2.4784200191497803, + "learning_rate": 7.749255664653888e-06, + "loss": 0.9499, + "step": 8297 + }, + { + "epoch": 0.6705590011919432, + "grad_norm": 2.55683970451355, + "learning_rate": 7.748709076838338e-06, + "loss": 0.9525, + "step": 8298 + }, + { + "epoch": 0.6706398109052708, + "grad_norm": 2.722597122192383, + "learning_rate": 7.74816244194402e-06, + "loss": 0.9261, + "step": 8299 + }, + { + "epoch": 0.6707206206185984, + "grad_norm": 2.730271816253662, + "learning_rate": 7.747615759980296e-06, + "loss": 1.0247, + "step": 8300 + }, + { + "epoch": 0.6708014303319259, + "grad_norm": 2.806570291519165, + "learning_rate": 7.747069030956526e-06, + "loss": 0.9996, + "step": 8301 + }, + { + "epoch": 0.6708822400452534, + "grad_norm": 2.540257215499878, + "learning_rate": 7.746522254882078e-06, + "loss": 0.9455, + "step": 8302 + }, + { + "epoch": 0.670963049758581, + "grad_norm": 2.505619525909424, + "learning_rate": 7.745975431766317e-06, + "loss": 0.9975, + "step": 8303 + }, + { + "epoch": 0.6710438594719085, + "grad_norm": 3.088472366333008, + "learning_rate": 7.745428561618606e-06, + "loss": 0.9773, + "step": 8304 + }, + { + "epoch": 0.6711246691852361, + "grad_norm": 2.639523506164551, + "learning_rate": 7.744881644448315e-06, + "loss": 0.9636, + "step": 8305 + }, + { + "epoch": 0.6712054788985636, + "grad_norm": 3.1845033168792725, + "learning_rate": 7.744334680264807e-06, + "loss": 0.9452, + "step": 8306 + }, + { + "epoch": 0.6712862886118911, + "grad_norm": 2.8969013690948486, + "learning_rate": 7.743787669077454e-06, + "loss": 1.0315, + "step": 8307 + }, + { + "epoch": 0.6713670983252187, + "grad_norm": 2.420459270477295, + "learning_rate": 7.743240610895623e-06, + "loss": 1.0473, + "step": 8308 + }, + { + "epoch": 0.6714479080385463, + "grad_norm": 2.644023895263672, + "learning_rate": 7.742693505728684e-06, + "loss": 0.795, + "step": 8309 + }, + { + "epoch": 0.6715287177518737, + "grad_norm": 2.8094165325164795, + "learning_rate": 7.74214635358601e-06, + "loss": 1.0484, + "step": 8310 + }, + { + "epoch": 0.6716095274652013, + "grad_norm": 2.2670786380767822, + "learning_rate": 7.741599154476969e-06, + "loss": 1.0099, + "step": 8311 + }, + { + "epoch": 0.6716903371785289, + "grad_norm": 2.4911980628967285, + "learning_rate": 7.741051908410935e-06, + "loss": 0.905, + "step": 8312 + }, + { + "epoch": 0.6717711468918564, + "grad_norm": 2.782815933227539, + "learning_rate": 7.74050461539728e-06, + "loss": 0.8601, + "step": 8313 + }, + { + "epoch": 0.6718519566051839, + "grad_norm": 2.432807207107544, + "learning_rate": 7.73995727544538e-06, + "loss": 0.8356, + "step": 8314 + }, + { + "epoch": 0.6719327663185115, + "grad_norm": 2.682298183441162, + "learning_rate": 7.739409888564606e-06, + "loss": 1.0302, + "step": 8315 + }, + { + "epoch": 0.672013576031839, + "grad_norm": 2.6344592571258545, + "learning_rate": 7.738862454764336e-06, + "loss": 0.9202, + "step": 8316 + }, + { + "epoch": 0.6720943857451666, + "grad_norm": 2.039740800857544, + "learning_rate": 7.738314974053947e-06, + "loss": 1.1516, + "step": 8317 + }, + { + "epoch": 0.6721751954584941, + "grad_norm": 2.588088274002075, + "learning_rate": 7.737767446442815e-06, + "loss": 0.9694, + "step": 8318 + }, + { + "epoch": 0.6722560051718216, + "grad_norm": 2.552767515182495, + "learning_rate": 7.737219871940315e-06, + "loss": 0.9992, + "step": 8319 + }, + { + "epoch": 0.6723368148851492, + "grad_norm": 2.126105785369873, + "learning_rate": 7.73667225055583e-06, + "loss": 0.8482, + "step": 8320 + }, + { + "epoch": 0.6724176245984768, + "grad_norm": 2.6231529712677, + "learning_rate": 7.736124582298737e-06, + "loss": 0.9491, + "step": 8321 + }, + { + "epoch": 0.6724984343118042, + "grad_norm": 2.7928731441497803, + "learning_rate": 7.735576867178417e-06, + "loss": 0.9421, + "step": 8322 + }, + { + "epoch": 0.6725792440251318, + "grad_norm": 3.0163440704345703, + "learning_rate": 7.73502910520425e-06, + "loss": 0.8872, + "step": 8323 + }, + { + "epoch": 0.6726600537384594, + "grad_norm": 3.2160558700561523, + "learning_rate": 7.73448129638562e-06, + "loss": 0.998, + "step": 8324 + }, + { + "epoch": 0.6727408634517869, + "grad_norm": 2.5027525424957275, + "learning_rate": 7.73393344073191e-06, + "loss": 0.8043, + "step": 8325 + }, + { + "epoch": 0.6728216731651144, + "grad_norm": 2.5036063194274902, + "learning_rate": 7.733385538252497e-06, + "loss": 0.8813, + "step": 8326 + }, + { + "epoch": 0.672902482878442, + "grad_norm": 3.024444818496704, + "learning_rate": 7.732837588956775e-06, + "loss": 0.8521, + "step": 8327 + }, + { + "epoch": 0.6729832925917695, + "grad_norm": 2.3691868782043457, + "learning_rate": 7.73228959285412e-06, + "loss": 0.9211, + "step": 8328 + }, + { + "epoch": 0.6730641023050971, + "grad_norm": 2.5722193717956543, + "learning_rate": 7.731741549953927e-06, + "loss": 0.9543, + "step": 8329 + }, + { + "epoch": 0.6731449120184246, + "grad_norm": 3.1957387924194336, + "learning_rate": 7.731193460265573e-06, + "loss": 0.9296, + "step": 8330 + }, + { + "epoch": 0.6732257217317521, + "grad_norm": 2.7510924339294434, + "learning_rate": 7.730645323798451e-06, + "loss": 0.9251, + "step": 8331 + }, + { + "epoch": 0.6733065314450797, + "grad_norm": 2.670241594314575, + "learning_rate": 7.730097140561949e-06, + "loss": 0.8923, + "step": 8332 + }, + { + "epoch": 0.6733873411584073, + "grad_norm": 2.8305563926696777, + "learning_rate": 7.729548910565457e-06, + "loss": 0.9963, + "step": 8333 + }, + { + "epoch": 0.6734681508717347, + "grad_norm": 2.385990619659424, + "learning_rate": 7.729000633818363e-06, + "loss": 0.9195, + "step": 8334 + }, + { + "epoch": 0.6735489605850623, + "grad_norm": 3.2661023139953613, + "learning_rate": 7.728452310330055e-06, + "loss": 0.9617, + "step": 8335 + }, + { + "epoch": 0.6736297702983899, + "grad_norm": 2.773858070373535, + "learning_rate": 7.72790394010993e-06, + "loss": 0.8973, + "step": 8336 + }, + { + "epoch": 0.6737105800117175, + "grad_norm": 2.914680242538452, + "learning_rate": 7.727355523167378e-06, + "loss": 0.9032, + "step": 8337 + }, + { + "epoch": 0.6737913897250449, + "grad_norm": 2.5987532138824463, + "learning_rate": 7.726807059511789e-06, + "loss": 0.9761, + "step": 8338 + }, + { + "epoch": 0.6738721994383725, + "grad_norm": 2.3974978923797607, + "learning_rate": 7.72625854915256e-06, + "loss": 0.9097, + "step": 8339 + }, + { + "epoch": 0.6739530091517001, + "grad_norm": 2.736830472946167, + "learning_rate": 7.725709992099089e-06, + "loss": 1.0532, + "step": 8340 + }, + { + "epoch": 0.6740338188650276, + "grad_norm": 2.79888653755188, + "learning_rate": 7.725161388360764e-06, + "loss": 1.0349, + "step": 8341 + }, + { + "epoch": 0.6741146285783551, + "grad_norm": 2.7195255756378174, + "learning_rate": 7.724612737946986e-06, + "loss": 0.9932, + "step": 8342 + }, + { + "epoch": 0.6741954382916827, + "grad_norm": 2.5283217430114746, + "learning_rate": 7.724064040867152e-06, + "loss": 0.9653, + "step": 8343 + }, + { + "epoch": 0.6742762480050102, + "grad_norm": 2.5808815956115723, + "learning_rate": 7.723515297130656e-06, + "loss": 0.9035, + "step": 8344 + }, + { + "epoch": 0.6743570577183378, + "grad_norm": 2.767984390258789, + "learning_rate": 7.722966506746902e-06, + "loss": 0.9965, + "step": 8345 + }, + { + "epoch": 0.6744378674316653, + "grad_norm": 2.6212093830108643, + "learning_rate": 7.722417669725288e-06, + "loss": 0.9327, + "step": 8346 + }, + { + "epoch": 0.6745186771449928, + "grad_norm": 2.387342929840088, + "learning_rate": 7.721868786075212e-06, + "loss": 0.9695, + "step": 8347 + }, + { + "epoch": 0.6745994868583204, + "grad_norm": 2.599292516708374, + "learning_rate": 7.721319855806076e-06, + "loss": 1.0379, + "step": 8348 + }, + { + "epoch": 0.674680296571648, + "grad_norm": 2.787990093231201, + "learning_rate": 7.720770878927284e-06, + "loss": 0.9756, + "step": 8349 + }, + { + "epoch": 0.6747611062849754, + "grad_norm": 2.2481515407562256, + "learning_rate": 7.720221855448233e-06, + "loss": 0.9603, + "step": 8350 + }, + { + "epoch": 0.674841915998303, + "grad_norm": 2.836638927459717, + "learning_rate": 7.719672785378334e-06, + "loss": 0.8768, + "step": 8351 + }, + { + "epoch": 0.6749227257116306, + "grad_norm": 2.7812743186950684, + "learning_rate": 7.719123668726986e-06, + "loss": 1.0855, + "step": 8352 + }, + { + "epoch": 0.6750035354249581, + "grad_norm": 3.177708387374878, + "learning_rate": 7.718574505503596e-06, + "loss": 0.9568, + "step": 8353 + }, + { + "epoch": 0.6750843451382856, + "grad_norm": 2.8781800270080566, + "learning_rate": 7.718025295717569e-06, + "loss": 0.9938, + "step": 8354 + }, + { + "epoch": 0.6751651548516132, + "grad_norm": 2.4317760467529297, + "learning_rate": 7.71747603937831e-06, + "loss": 0.9285, + "step": 8355 + }, + { + "epoch": 0.6752459645649407, + "grad_norm": 2.6491196155548096, + "learning_rate": 7.716926736495232e-06, + "loss": 0.9991, + "step": 8356 + }, + { + "epoch": 0.6753267742782683, + "grad_norm": 2.961341619491577, + "learning_rate": 7.71637738707774e-06, + "loss": 1.0286, + "step": 8357 + }, + { + "epoch": 0.6754075839915958, + "grad_norm": 2.510984182357788, + "learning_rate": 7.715827991135241e-06, + "loss": 0.8888, + "step": 8358 + }, + { + "epoch": 0.6754883937049233, + "grad_norm": 2.8395159244537354, + "learning_rate": 7.715278548677145e-06, + "loss": 0.9294, + "step": 8359 + }, + { + "epoch": 0.6755692034182509, + "grad_norm": 2.6775362491607666, + "learning_rate": 7.714729059712869e-06, + "loss": 1.0098, + "step": 8360 + }, + { + "epoch": 0.6756500131315785, + "grad_norm": 2.8229544162750244, + "learning_rate": 7.714179524251814e-06, + "loss": 0.8834, + "step": 8361 + }, + { + "epoch": 0.6757308228449059, + "grad_norm": 2.85675048828125, + "learning_rate": 7.713629942303403e-06, + "loss": 0.7929, + "step": 8362 + }, + { + "epoch": 0.6758116325582335, + "grad_norm": 2.634467124938965, + "learning_rate": 7.71308031387704e-06, + "loss": 0.9123, + "step": 8363 + }, + { + "epoch": 0.6758924422715611, + "grad_norm": 2.678504467010498, + "learning_rate": 7.712530638982143e-06, + "loss": 0.907, + "step": 8364 + }, + { + "epoch": 0.6759732519848886, + "grad_norm": 2.7769691944122314, + "learning_rate": 7.711980917628128e-06, + "loss": 0.8632, + "step": 8365 + }, + { + "epoch": 0.6760540616982161, + "grad_norm": 2.5603690147399902, + "learning_rate": 7.71143114982441e-06, + "loss": 0.9156, + "step": 8366 + }, + { + "epoch": 0.6761348714115437, + "grad_norm": 2.760061025619507, + "learning_rate": 7.7108813355804e-06, + "loss": 0.8569, + "step": 8367 + }, + { + "epoch": 0.6762156811248712, + "grad_norm": 2.8086774349212646, + "learning_rate": 7.710331474905522e-06, + "loss": 0.9045, + "step": 8368 + }, + { + "epoch": 0.6762964908381988, + "grad_norm": 2.4690489768981934, + "learning_rate": 7.709781567809188e-06, + "loss": 0.9131, + "step": 8369 + }, + { + "epoch": 0.6763773005515263, + "grad_norm": 2.483729600906372, + "learning_rate": 7.709231614300823e-06, + "loss": 0.9583, + "step": 8370 + }, + { + "epoch": 0.6764581102648538, + "grad_norm": 2.705441951751709, + "learning_rate": 7.708681614389838e-06, + "loss": 1.0647, + "step": 8371 + }, + { + "epoch": 0.6765389199781814, + "grad_norm": 2.5196518898010254, + "learning_rate": 7.708131568085661e-06, + "loss": 0.8732, + "step": 8372 + }, + { + "epoch": 0.676619729691509, + "grad_norm": 2.6411266326904297, + "learning_rate": 7.70758147539771e-06, + "loss": 0.8754, + "step": 8373 + }, + { + "epoch": 0.6767005394048364, + "grad_norm": 2.3072762489318848, + "learning_rate": 7.707031336335407e-06, + "loss": 0.9353, + "step": 8374 + }, + { + "epoch": 0.676781349118164, + "grad_norm": 2.587378978729248, + "learning_rate": 7.706481150908172e-06, + "loss": 0.8209, + "step": 8375 + }, + { + "epoch": 0.6768621588314916, + "grad_norm": 2.600463628768921, + "learning_rate": 7.70593091912543e-06, + "loss": 0.9594, + "step": 8376 + }, + { + "epoch": 0.6769429685448191, + "grad_norm": 2.5489156246185303, + "learning_rate": 7.705380640996609e-06, + "loss": 1.0448, + "step": 8377 + }, + { + "epoch": 0.6770237782581466, + "grad_norm": 2.501147747039795, + "learning_rate": 7.704830316531128e-06, + "loss": 1.0695, + "step": 8378 + }, + { + "epoch": 0.6771045879714742, + "grad_norm": 3.2265758514404297, + "learning_rate": 7.704279945738416e-06, + "loss": 0.9487, + "step": 8379 + }, + { + "epoch": 0.6771853976848017, + "grad_norm": 2.878840684890747, + "learning_rate": 7.703729528627899e-06, + "loss": 0.9795, + "step": 8380 + }, + { + "epoch": 0.6772662073981293, + "grad_norm": 2.609241485595703, + "learning_rate": 7.703179065209003e-06, + "loss": 1.0045, + "step": 8381 + }, + { + "epoch": 0.6773470171114568, + "grad_norm": 2.75795841217041, + "learning_rate": 7.702628555491159e-06, + "loss": 0.9914, + "step": 8382 + }, + { + "epoch": 0.6774278268247843, + "grad_norm": 2.7188820838928223, + "learning_rate": 7.702077999483793e-06, + "loss": 0.7822, + "step": 8383 + }, + { + "epoch": 0.6775086365381119, + "grad_norm": 2.3923799991607666, + "learning_rate": 7.701527397196336e-06, + "loss": 0.8113, + "step": 8384 + }, + { + "epoch": 0.6775894462514395, + "grad_norm": 2.942815065383911, + "learning_rate": 7.700976748638218e-06, + "loss": 1.0804, + "step": 8385 + }, + { + "epoch": 0.6776702559647669, + "grad_norm": 3.0890114307403564, + "learning_rate": 7.70042605381887e-06, + "loss": 0.9667, + "step": 8386 + }, + { + "epoch": 0.6777510656780945, + "grad_norm": 2.379671335220337, + "learning_rate": 7.699875312747726e-06, + "loss": 1.0517, + "step": 8387 + }, + { + "epoch": 0.6778318753914221, + "grad_norm": 2.3722407817840576, + "learning_rate": 7.699324525434217e-06, + "loss": 0.9749, + "step": 8388 + }, + { + "epoch": 0.6779126851047496, + "grad_norm": 2.7221665382385254, + "learning_rate": 7.698773691887778e-06, + "loss": 0.9211, + "step": 8389 + }, + { + "epoch": 0.6779934948180771, + "grad_norm": 2.7167699337005615, + "learning_rate": 7.698222812117843e-06, + "loss": 0.9072, + "step": 8390 + }, + { + "epoch": 0.6780743045314047, + "grad_norm": 2.2620158195495605, + "learning_rate": 7.697671886133846e-06, + "loss": 0.9907, + "step": 8391 + }, + { + "epoch": 0.6781551142447322, + "grad_norm": 2.5893588066101074, + "learning_rate": 7.697120913945224e-06, + "loss": 0.9744, + "step": 8392 + }, + { + "epoch": 0.6782359239580598, + "grad_norm": 2.9459714889526367, + "learning_rate": 7.696569895561415e-06, + "loss": 1.009, + "step": 8393 + }, + { + "epoch": 0.6783167336713873, + "grad_norm": 3.053457260131836, + "learning_rate": 7.696018830991852e-06, + "loss": 0.8888, + "step": 8394 + }, + { + "epoch": 0.6783975433847148, + "grad_norm": 2.751218557357788, + "learning_rate": 7.69546772024598e-06, + "loss": 1.0176, + "step": 8395 + }, + { + "epoch": 0.6784783530980424, + "grad_norm": 2.663726329803467, + "learning_rate": 7.694916563333234e-06, + "loss": 0.968, + "step": 8396 + }, + { + "epoch": 0.67855916281137, + "grad_norm": 2.3502562046051025, + "learning_rate": 7.694365360263055e-06, + "loss": 0.8048, + "step": 8397 + }, + { + "epoch": 0.6786399725246974, + "grad_norm": 2.6676876544952393, + "learning_rate": 7.693814111044885e-06, + "loss": 0.8791, + "step": 8398 + }, + { + "epoch": 0.678720782238025, + "grad_norm": 2.502436876296997, + "learning_rate": 7.693262815688163e-06, + "loss": 0.9708, + "step": 8399 + }, + { + "epoch": 0.6788015919513526, + "grad_norm": 2.1671106815338135, + "learning_rate": 7.692711474202334e-06, + "loss": 0.8879, + "step": 8400 + }, + { + "epoch": 0.6788824016646801, + "grad_norm": 3.063915491104126, + "learning_rate": 7.692160086596838e-06, + "loss": 1.0006, + "step": 8401 + }, + { + "epoch": 0.6789632113780076, + "grad_norm": 2.5574464797973633, + "learning_rate": 7.691608652881122e-06, + "loss": 0.9383, + "step": 8402 + }, + { + "epoch": 0.6790440210913352, + "grad_norm": 2.441051721572876, + "learning_rate": 7.691057173064629e-06, + "loss": 0.9807, + "step": 8403 + }, + { + "epoch": 0.6791248308046627, + "grad_norm": 3.3872270584106445, + "learning_rate": 7.690505647156806e-06, + "loss": 0.9922, + "step": 8404 + }, + { + "epoch": 0.6792056405179903, + "grad_norm": 2.5608327388763428, + "learning_rate": 7.689954075167098e-06, + "loss": 0.9392, + "step": 8405 + }, + { + "epoch": 0.6792864502313178, + "grad_norm": 2.5460565090179443, + "learning_rate": 7.689402457104954e-06, + "loss": 1.1762, + "step": 8406 + }, + { + "epoch": 0.6793672599446453, + "grad_norm": 2.524811267852783, + "learning_rate": 7.688850792979816e-06, + "loss": 0.8042, + "step": 8407 + }, + { + "epoch": 0.6794480696579729, + "grad_norm": 2.9841954708099365, + "learning_rate": 7.688299082801141e-06, + "loss": 0.9428, + "step": 8408 + }, + { + "epoch": 0.6795288793713005, + "grad_norm": 3.026340961456299, + "learning_rate": 7.687747326578374e-06, + "loss": 1.009, + "step": 8409 + }, + { + "epoch": 0.6796096890846279, + "grad_norm": 2.856476068496704, + "learning_rate": 7.687195524320965e-06, + "loss": 0.9688, + "step": 8410 + }, + { + "epoch": 0.6796904987979555, + "grad_norm": 2.6018548011779785, + "learning_rate": 7.686643676038364e-06, + "loss": 0.8525, + "step": 8411 + }, + { + "epoch": 0.6797713085112831, + "grad_norm": 2.567927598953247, + "learning_rate": 7.686091781740027e-06, + "loss": 0.9956, + "step": 8412 + }, + { + "epoch": 0.6798521182246106, + "grad_norm": 2.6431033611297607, + "learning_rate": 7.685539841435406e-06, + "loss": 0.9945, + "step": 8413 + }, + { + "epoch": 0.6799329279379381, + "grad_norm": 2.5504331588745117, + "learning_rate": 7.68498785513395e-06, + "loss": 1.0153, + "step": 8414 + }, + { + "epoch": 0.6800137376512657, + "grad_norm": 2.590113639831543, + "learning_rate": 7.684435822845115e-06, + "loss": 0.9778, + "step": 8415 + }, + { + "epoch": 0.6800945473645932, + "grad_norm": 2.3256642818450928, + "learning_rate": 7.683883744578359e-06, + "loss": 1.0757, + "step": 8416 + }, + { + "epoch": 0.6801753570779208, + "grad_norm": 3.3201260566711426, + "learning_rate": 7.683331620343135e-06, + "loss": 1.1739, + "step": 8417 + }, + { + "epoch": 0.6802561667912483, + "grad_norm": 2.8348820209503174, + "learning_rate": 7.6827794501489e-06, + "loss": 0.9597, + "step": 8418 + }, + { + "epoch": 0.6803369765045758, + "grad_norm": 2.7646586894989014, + "learning_rate": 7.682227234005113e-06, + "loss": 1.0271, + "step": 8419 + }, + { + "epoch": 0.6804177862179034, + "grad_norm": 2.6002559661865234, + "learning_rate": 7.681674971921227e-06, + "loss": 0.9492, + "step": 8420 + }, + { + "epoch": 0.680498595931231, + "grad_norm": 2.860055923461914, + "learning_rate": 7.681122663906708e-06, + "loss": 0.9503, + "step": 8421 + }, + { + "epoch": 0.6805794056445584, + "grad_norm": 2.656940221786499, + "learning_rate": 7.680570309971011e-06, + "loss": 0.9084, + "step": 8422 + }, + { + "epoch": 0.680660215357886, + "grad_norm": 2.8171634674072266, + "learning_rate": 7.680017910123597e-06, + "loss": 0.9681, + "step": 8423 + }, + { + "epoch": 0.6807410250712136, + "grad_norm": 2.595752239227295, + "learning_rate": 7.67946546437393e-06, + "loss": 0.8406, + "step": 8424 + }, + { + "epoch": 0.6808218347845411, + "grad_norm": 3.4196860790252686, + "learning_rate": 7.67891297273147e-06, + "loss": 0.9399, + "step": 8425 + }, + { + "epoch": 0.6809026444978686, + "grad_norm": 2.6884870529174805, + "learning_rate": 7.678360435205679e-06, + "loss": 1.0277, + "step": 8426 + }, + { + "epoch": 0.6809834542111962, + "grad_norm": 2.9558327198028564, + "learning_rate": 7.677807851806022e-06, + "loss": 0.9687, + "step": 8427 + }, + { + "epoch": 0.6810642639245237, + "grad_norm": 2.5531744956970215, + "learning_rate": 7.677255222541963e-06, + "loss": 0.9916, + "step": 8428 + }, + { + "epoch": 0.6811450736378513, + "grad_norm": 2.709683895111084, + "learning_rate": 7.676702547422966e-06, + "loss": 1.0118, + "step": 8429 + }, + { + "epoch": 0.6812258833511788, + "grad_norm": 2.541829824447632, + "learning_rate": 7.676149826458502e-06, + "loss": 0.8386, + "step": 8430 + }, + { + "epoch": 0.6813066930645063, + "grad_norm": 2.8499135971069336, + "learning_rate": 7.675597059658031e-06, + "loss": 0.979, + "step": 8431 + }, + { + "epoch": 0.6813875027778339, + "grad_norm": 2.470608949661255, + "learning_rate": 7.675044247031024e-06, + "loss": 1.0212, + "step": 8432 + }, + { + "epoch": 0.6814683124911615, + "grad_norm": 2.7002062797546387, + "learning_rate": 7.67449138858695e-06, + "loss": 1.0245, + "step": 8433 + }, + { + "epoch": 0.6815491222044889, + "grad_norm": 2.56925892829895, + "learning_rate": 7.673938484335276e-06, + "loss": 1.0153, + "step": 8434 + }, + { + "epoch": 0.6816299319178165, + "grad_norm": 2.4124858379364014, + "learning_rate": 7.673385534285473e-06, + "loss": 0.9645, + "step": 8435 + }, + { + "epoch": 0.6817107416311441, + "grad_norm": 2.7462291717529297, + "learning_rate": 7.672832538447011e-06, + "loss": 0.8917, + "step": 8436 + }, + { + "epoch": 0.6817915513444716, + "grad_norm": 2.918308973312378, + "learning_rate": 7.672279496829364e-06, + "loss": 1.0055, + "step": 8437 + }, + { + "epoch": 0.6818723610577991, + "grad_norm": 2.783287763595581, + "learning_rate": 7.671726409442002e-06, + "loss": 0.9613, + "step": 8438 + }, + { + "epoch": 0.6819531707711267, + "grad_norm": 2.619626522064209, + "learning_rate": 7.671173276294397e-06, + "loss": 1.051, + "step": 8439 + }, + { + "epoch": 0.6820339804844542, + "grad_norm": 2.851304054260254, + "learning_rate": 7.670620097396026e-06, + "loss": 1.0064, + "step": 8440 + }, + { + "epoch": 0.6821147901977818, + "grad_norm": 2.8876547813415527, + "learning_rate": 7.670066872756362e-06, + "loss": 0.953, + "step": 8441 + }, + { + "epoch": 0.6821955999111093, + "grad_norm": 2.6084249019622803, + "learning_rate": 7.669513602384879e-06, + "loss": 0.9463, + "step": 8442 + }, + { + "epoch": 0.6822764096244368, + "grad_norm": 2.5381696224212646, + "learning_rate": 7.668960286291056e-06, + "loss": 1.0155, + "step": 8443 + }, + { + "epoch": 0.6823572193377644, + "grad_norm": 2.913601875305176, + "learning_rate": 7.668406924484368e-06, + "loss": 0.8306, + "step": 8444 + }, + { + "epoch": 0.682438029051092, + "grad_norm": 2.6268184185028076, + "learning_rate": 7.667853516974292e-06, + "loss": 1.0605, + "step": 8445 + }, + { + "epoch": 0.6825188387644194, + "grad_norm": 2.4881415367126465, + "learning_rate": 7.66730006377031e-06, + "loss": 1.0588, + "step": 8446 + }, + { + "epoch": 0.682599648477747, + "grad_norm": 2.5792691707611084, + "learning_rate": 7.6667465648819e-06, + "loss": 0.9516, + "step": 8447 + }, + { + "epoch": 0.6826804581910746, + "grad_norm": 2.578933000564575, + "learning_rate": 7.666193020318537e-06, + "loss": 0.9021, + "step": 8448 + }, + { + "epoch": 0.6827612679044021, + "grad_norm": 2.4988861083984375, + "learning_rate": 7.66563943008971e-06, + "loss": 1.0085, + "step": 8449 + }, + { + "epoch": 0.6828420776177296, + "grad_norm": 2.6755433082580566, + "learning_rate": 7.665085794204896e-06, + "loss": 1.0179, + "step": 8450 + }, + { + "epoch": 0.6829228873310572, + "grad_norm": 2.3153629302978516, + "learning_rate": 7.664532112673578e-06, + "loss": 0.9813, + "step": 8451 + }, + { + "epoch": 0.6830036970443847, + "grad_norm": 2.309171438217163, + "learning_rate": 7.66397838550524e-06, + "loss": 0.8567, + "step": 8452 + }, + { + "epoch": 0.6830845067577123, + "grad_norm": 2.3666679859161377, + "learning_rate": 7.663424612709364e-06, + "loss": 0.9389, + "step": 8453 + }, + { + "epoch": 0.6831653164710398, + "grad_norm": 2.8175573348999023, + "learning_rate": 7.662870794295438e-06, + "loss": 0.9105, + "step": 8454 + }, + { + "epoch": 0.6832461261843673, + "grad_norm": 2.2377684116363525, + "learning_rate": 7.662316930272945e-06, + "loss": 0.9315, + "step": 8455 + }, + { + "epoch": 0.6833269358976949, + "grad_norm": 2.5128984451293945, + "learning_rate": 7.661763020651372e-06, + "loss": 0.9953, + "step": 8456 + }, + { + "epoch": 0.6834077456110225, + "grad_norm": 2.887550115585327, + "learning_rate": 7.661209065440207e-06, + "loss": 0.8619, + "step": 8457 + }, + { + "epoch": 0.6834885553243499, + "grad_norm": 2.778005599975586, + "learning_rate": 7.660655064648937e-06, + "loss": 0.8838, + "step": 8458 + }, + { + "epoch": 0.6835693650376775, + "grad_norm": 2.6972951889038086, + "learning_rate": 7.660101018287053e-06, + "loss": 0.9645, + "step": 8459 + }, + { + "epoch": 0.6836501747510051, + "grad_norm": 2.66129207611084, + "learning_rate": 7.659546926364038e-06, + "loss": 0.8559, + "step": 8460 + }, + { + "epoch": 0.6837309844643326, + "grad_norm": 2.4100239276885986, + "learning_rate": 7.65899278888939e-06, + "loss": 0.7838, + "step": 8461 + }, + { + "epoch": 0.6838117941776601, + "grad_norm": 2.6647441387176514, + "learning_rate": 7.658438605872596e-06, + "loss": 0.9808, + "step": 8462 + }, + { + "epoch": 0.6838926038909877, + "grad_norm": 2.9784247875213623, + "learning_rate": 7.657884377323149e-06, + "loss": 0.9254, + "step": 8463 + }, + { + "epoch": 0.6839734136043152, + "grad_norm": 2.894587278366089, + "learning_rate": 7.65733010325054e-06, + "loss": 0.9022, + "step": 8464 + }, + { + "epoch": 0.6840542233176428, + "grad_norm": 2.923002243041992, + "learning_rate": 7.656775783664265e-06, + "loss": 0.9115, + "step": 8465 + }, + { + "epoch": 0.6841350330309703, + "grad_norm": 2.273637056350708, + "learning_rate": 7.656221418573817e-06, + "loss": 0.8948, + "step": 8466 + }, + { + "epoch": 0.6842158427442979, + "grad_norm": 2.5338826179504395, + "learning_rate": 7.65566700798869e-06, + "loss": 1.0228, + "step": 8467 + }, + { + "epoch": 0.6842966524576254, + "grad_norm": 2.8406641483306885, + "learning_rate": 7.65511255191838e-06, + "loss": 0.9498, + "step": 8468 + }, + { + "epoch": 0.684377462170953, + "grad_norm": 2.497377872467041, + "learning_rate": 7.654558050372385e-06, + "loss": 0.9637, + "step": 8469 + }, + { + "epoch": 0.6844582718842805, + "grad_norm": 2.7127885818481445, + "learning_rate": 7.6540035033602e-06, + "loss": 1.0735, + "step": 8470 + }, + { + "epoch": 0.684539081597608, + "grad_norm": 2.4919934272766113, + "learning_rate": 7.653448910891325e-06, + "loss": 1.0278, + "step": 8471 + }, + { + "epoch": 0.6846198913109356, + "grad_norm": 2.5153603553771973, + "learning_rate": 7.652894272975257e-06, + "loss": 0.9611, + "step": 8472 + }, + { + "epoch": 0.6847007010242632, + "grad_norm": 2.4566097259521484, + "learning_rate": 7.652339589621498e-06, + "loss": 0.8935, + "step": 8473 + }, + { + "epoch": 0.6847815107375906, + "grad_norm": 2.7795534133911133, + "learning_rate": 7.651784860839547e-06, + "loss": 0.9947, + "step": 8474 + }, + { + "epoch": 0.6848623204509182, + "grad_norm": 3.1100776195526123, + "learning_rate": 7.651230086638905e-06, + "loss": 1.0381, + "step": 8475 + }, + { + "epoch": 0.6849431301642458, + "grad_norm": 2.716601848602295, + "learning_rate": 7.650675267029072e-06, + "loss": 0.9526, + "step": 8476 + }, + { + "epoch": 0.6850239398775733, + "grad_norm": 2.3293495178222656, + "learning_rate": 7.650120402019556e-06, + "loss": 0.9579, + "step": 8477 + }, + { + "epoch": 0.6851047495909008, + "grad_norm": 2.3756096363067627, + "learning_rate": 7.649565491619855e-06, + "loss": 0.8161, + "step": 8478 + }, + { + "epoch": 0.6851855593042284, + "grad_norm": 2.896602153778076, + "learning_rate": 7.649010535839478e-06, + "loss": 0.9511, + "step": 8479 + }, + { + "epoch": 0.6852663690175559, + "grad_norm": 2.6385321617126465, + "learning_rate": 7.648455534687927e-06, + "loss": 0.9406, + "step": 8480 + }, + { + "epoch": 0.6853471787308835, + "grad_norm": 2.6948325634002686, + "learning_rate": 7.647900488174708e-06, + "loss": 0.9515, + "step": 8481 + }, + { + "epoch": 0.685427988444211, + "grad_norm": 2.516601800918579, + "learning_rate": 7.647345396309328e-06, + "loss": 0.8487, + "step": 8482 + }, + { + "epoch": 0.6855087981575385, + "grad_norm": 2.4833476543426514, + "learning_rate": 7.646790259101297e-06, + "loss": 1.0024, + "step": 8483 + }, + { + "epoch": 0.6855896078708661, + "grad_norm": 2.570033550262451, + "learning_rate": 7.646235076560119e-06, + "loss": 0.8984, + "step": 8484 + }, + { + "epoch": 0.6856704175841937, + "grad_norm": 2.308443069458008, + "learning_rate": 7.645679848695305e-06, + "loss": 0.9218, + "step": 8485 + }, + { + "epoch": 0.6857512272975211, + "grad_norm": 2.457383632659912, + "learning_rate": 7.645124575516363e-06, + "loss": 0.9703, + "step": 8486 + }, + { + "epoch": 0.6858320370108487, + "grad_norm": 2.499898910522461, + "learning_rate": 7.644569257032805e-06, + "loss": 1.0285, + "step": 8487 + }, + { + "epoch": 0.6859128467241763, + "grad_norm": 2.739859104156494, + "learning_rate": 7.644013893254145e-06, + "loss": 0.9798, + "step": 8488 + }, + { + "epoch": 0.6859936564375038, + "grad_norm": 2.992964506149292, + "learning_rate": 7.64345848418989e-06, + "loss": 0.9043, + "step": 8489 + }, + { + "epoch": 0.6860744661508313, + "grad_norm": 2.946100950241089, + "learning_rate": 7.642903029849554e-06, + "loss": 0.8497, + "step": 8490 + }, + { + "epoch": 0.6861552758641589, + "grad_norm": 2.515207052230835, + "learning_rate": 7.642347530242654e-06, + "loss": 0.9575, + "step": 8491 + }, + { + "epoch": 0.6862360855774864, + "grad_norm": 3.00736927986145, + "learning_rate": 7.6417919853787e-06, + "loss": 0.9283, + "step": 8492 + }, + { + "epoch": 0.686316895290814, + "grad_norm": 2.454270601272583, + "learning_rate": 7.64123639526721e-06, + "loss": 1.0599, + "step": 8493 + }, + { + "epoch": 0.6863977050041415, + "grad_norm": 2.7114779949188232, + "learning_rate": 7.6406807599177e-06, + "loss": 1.05, + "step": 8494 + }, + { + "epoch": 0.686478514717469, + "grad_norm": 2.576533555984497, + "learning_rate": 7.640125079339684e-06, + "loss": 0.9726, + "step": 8495 + }, + { + "epoch": 0.6865593244307966, + "grad_norm": 2.3898873329162598, + "learning_rate": 7.639569353542683e-06, + "loss": 0.8742, + "step": 8496 + }, + { + "epoch": 0.6866401341441242, + "grad_norm": 2.7289464473724365, + "learning_rate": 7.639013582536213e-06, + "loss": 0.9219, + "step": 8497 + }, + { + "epoch": 0.6867209438574516, + "grad_norm": 2.785449743270874, + "learning_rate": 7.638457766329792e-06, + "loss": 0.963, + "step": 8498 + }, + { + "epoch": 0.6868017535707792, + "grad_norm": 2.609910249710083, + "learning_rate": 7.637901904932943e-06, + "loss": 0.8897, + "step": 8499 + }, + { + "epoch": 0.6868825632841068, + "grad_norm": 2.7766265869140625, + "learning_rate": 7.637345998355185e-06, + "loss": 0.9813, + "step": 8500 + }, + { + "epoch": 0.6869633729974343, + "grad_norm": 2.8442697525024414, + "learning_rate": 7.636790046606037e-06, + "loss": 1.0714, + "step": 8501 + }, + { + "epoch": 0.6870441827107618, + "grad_norm": 2.5213828086853027, + "learning_rate": 7.636234049695026e-06, + "loss": 0.9575, + "step": 8502 + }, + { + "epoch": 0.6871249924240894, + "grad_norm": 3.0026817321777344, + "learning_rate": 7.63567800763167e-06, + "loss": 0.8413, + "step": 8503 + }, + { + "epoch": 0.6872058021374169, + "grad_norm": 2.7835092544555664, + "learning_rate": 7.635121920425498e-06, + "loss": 0.9992, + "step": 8504 + }, + { + "epoch": 0.6872866118507445, + "grad_norm": 2.566882848739624, + "learning_rate": 7.634565788086028e-06, + "loss": 0.8768, + "step": 8505 + }, + { + "epoch": 0.687367421564072, + "grad_norm": 2.6903584003448486, + "learning_rate": 7.634009610622789e-06, + "loss": 0.9405, + "step": 8506 + }, + { + "epoch": 0.6874482312773995, + "grad_norm": 2.805124521255493, + "learning_rate": 7.633453388045306e-06, + "loss": 0.8344, + "step": 8507 + }, + { + "epoch": 0.6875290409907271, + "grad_norm": 2.63955020904541, + "learning_rate": 7.63289712036311e-06, + "loss": 0.9395, + "step": 8508 + }, + { + "epoch": 0.6876098507040547, + "grad_norm": 2.5578644275665283, + "learning_rate": 7.63234080758572e-06, + "loss": 0.8353, + "step": 8509 + }, + { + "epoch": 0.6876906604173821, + "grad_norm": 2.5840113162994385, + "learning_rate": 7.631784449722672e-06, + "loss": 0.8967, + "step": 8510 + }, + { + "epoch": 0.6877714701307097, + "grad_norm": 2.510427713394165, + "learning_rate": 7.631228046783492e-06, + "loss": 0.9796, + "step": 8511 + }, + { + "epoch": 0.6878522798440373, + "grad_norm": 2.9449968338012695, + "learning_rate": 7.63067159877771e-06, + "loss": 0.8771, + "step": 8512 + }, + { + "epoch": 0.6879330895573648, + "grad_norm": 2.2487404346466064, + "learning_rate": 7.630115105714854e-06, + "loss": 0.8831, + "step": 8513 + }, + { + "epoch": 0.6880138992706923, + "grad_norm": 2.572007179260254, + "learning_rate": 7.629558567604461e-06, + "loss": 0.9316, + "step": 8514 + }, + { + "epoch": 0.6880947089840199, + "grad_norm": 2.8729922771453857, + "learning_rate": 7.629001984456059e-06, + "loss": 0.8668, + "step": 8515 + }, + { + "epoch": 0.6881755186973474, + "grad_norm": 2.8302195072174072, + "learning_rate": 7.628445356279182e-06, + "loss": 1.0108, + "step": 8516 + }, + { + "epoch": 0.688256328410675, + "grad_norm": 2.6309752464294434, + "learning_rate": 7.627888683083363e-06, + "loss": 0.9727, + "step": 8517 + }, + { + "epoch": 0.6883371381240025, + "grad_norm": 2.6336255073547363, + "learning_rate": 7.6273319648781395e-06, + "loss": 0.9099, + "step": 8518 + }, + { + "epoch": 0.68841794783733, + "grad_norm": 2.4486746788024902, + "learning_rate": 7.626775201673042e-06, + "loss": 0.8963, + "step": 8519 + }, + { + "epoch": 0.6884987575506576, + "grad_norm": 2.5037882328033447, + "learning_rate": 7.6262183934776114e-06, + "loss": 1.0884, + "step": 8520 + }, + { + "epoch": 0.6885795672639852, + "grad_norm": 2.4955711364746094, + "learning_rate": 7.62566154030138e-06, + "loss": 0.8892, + "step": 8521 + }, + { + "epoch": 0.6886603769773126, + "grad_norm": 2.296449661254883, + "learning_rate": 7.625104642153889e-06, + "loss": 0.9347, + "step": 8522 + }, + { + "epoch": 0.6887411866906402, + "grad_norm": 2.760537624359131, + "learning_rate": 7.624547699044673e-06, + "loss": 1.0912, + "step": 8523 + }, + { + "epoch": 0.6888219964039678, + "grad_norm": 2.649718761444092, + "learning_rate": 7.623990710983275e-06, + "loss": 0.8756, + "step": 8524 + }, + { + "epoch": 0.6889028061172953, + "grad_norm": 2.7818546295166016, + "learning_rate": 7.623433677979234e-06, + "loss": 1.0735, + "step": 8525 + }, + { + "epoch": 0.6889836158306228, + "grad_norm": 2.3486881256103516, + "learning_rate": 7.622876600042088e-06, + "loss": 0.9275, + "step": 8526 + }, + { + "epoch": 0.6890644255439504, + "grad_norm": 2.675062656402588, + "learning_rate": 7.622319477181381e-06, + "loss": 0.9279, + "step": 8527 + }, + { + "epoch": 0.6891452352572779, + "grad_norm": 2.6401591300964355, + "learning_rate": 7.6217623094066554e-06, + "loss": 1.0097, + "step": 8528 + }, + { + "epoch": 0.6892260449706055, + "grad_norm": 2.5405540466308594, + "learning_rate": 7.6212050967274495e-06, + "loss": 0.7642, + "step": 8529 + }, + { + "epoch": 0.689306854683933, + "grad_norm": 2.401169776916504, + "learning_rate": 7.620647839153315e-06, + "loss": 1.0061, + "step": 8530 + }, + { + "epoch": 0.6893876643972605, + "grad_norm": 2.991809368133545, + "learning_rate": 7.620090536693787e-06, + "loss": 0.9315, + "step": 8531 + }, + { + "epoch": 0.6894684741105881, + "grad_norm": 2.5979745388031006, + "learning_rate": 7.61953318935842e-06, + "loss": 0.9467, + "step": 8532 + }, + { + "epoch": 0.6895492838239157, + "grad_norm": 2.3411121368408203, + "learning_rate": 7.618975797156753e-06, + "loss": 0.8983, + "step": 8533 + }, + { + "epoch": 0.6896300935372431, + "grad_norm": 2.627819538116455, + "learning_rate": 7.618418360098338e-06, + "loss": 1.0355, + "step": 8534 + }, + { + "epoch": 0.6897109032505707, + "grad_norm": 2.9491703510284424, + "learning_rate": 7.617860878192718e-06, + "loss": 0.9876, + "step": 8535 + }, + { + "epoch": 0.6897917129638983, + "grad_norm": 2.737373113632202, + "learning_rate": 7.617303351449444e-06, + "loss": 0.942, + "step": 8536 + }, + { + "epoch": 0.6898725226772258, + "grad_norm": 2.894930362701416, + "learning_rate": 7.616745779878065e-06, + "loss": 0.9467, + "step": 8537 + }, + { + "epoch": 0.6899533323905533, + "grad_norm": 2.7718076705932617, + "learning_rate": 7.616188163488132e-06, + "loss": 0.8853, + "step": 8538 + }, + { + "epoch": 0.6900341421038809, + "grad_norm": 2.4677348136901855, + "learning_rate": 7.615630502289191e-06, + "loss": 1.0362, + "step": 8539 + }, + { + "epoch": 0.6901149518172084, + "grad_norm": 2.5967016220092773, + "learning_rate": 7.615072796290797e-06, + "loss": 1.0943, + "step": 8540 + }, + { + "epoch": 0.690195761530536, + "grad_norm": 2.6121981143951416, + "learning_rate": 7.614515045502502e-06, + "loss": 0.8968, + "step": 8541 + }, + { + "epoch": 0.6902765712438635, + "grad_norm": 2.5807816982269287, + "learning_rate": 7.613957249933859e-06, + "loss": 0.8829, + "step": 8542 + }, + { + "epoch": 0.690357380957191, + "grad_norm": 2.292766809463501, + "learning_rate": 7.6133994095944195e-06, + "loss": 1.0097, + "step": 8543 + }, + { + "epoch": 0.6904381906705186, + "grad_norm": 2.4342575073242188, + "learning_rate": 7.612841524493741e-06, + "loss": 0.9578, + "step": 8544 + }, + { + "epoch": 0.6905190003838462, + "grad_norm": 2.439470052719116, + "learning_rate": 7.612283594641376e-06, + "loss": 0.951, + "step": 8545 + }, + { + "epoch": 0.6905998100971736, + "grad_norm": 2.6422617435455322, + "learning_rate": 7.611725620046884e-06, + "loss": 0.9223, + "step": 8546 + }, + { + "epoch": 0.6906806198105012, + "grad_norm": 2.6705915927886963, + "learning_rate": 7.611167600719819e-06, + "loss": 0.9802, + "step": 8547 + }, + { + "epoch": 0.6907614295238288, + "grad_norm": 2.584794044494629, + "learning_rate": 7.610609536669737e-06, + "loss": 0.8957, + "step": 8548 + }, + { + "epoch": 0.6908422392371563, + "grad_norm": 2.595289945602417, + "learning_rate": 7.610051427906201e-06, + "loss": 0.924, + "step": 8549 + }, + { + "epoch": 0.6909230489504838, + "grad_norm": 2.8844175338745117, + "learning_rate": 7.609493274438766e-06, + "loss": 0.9296, + "step": 8550 + }, + { + "epoch": 0.6910038586638114, + "grad_norm": 3.100165367126465, + "learning_rate": 7.608935076276994e-06, + "loss": 1.0573, + "step": 8551 + }, + { + "epoch": 0.6910846683771389, + "grad_norm": 2.577888250350952, + "learning_rate": 7.608376833430444e-06, + "loss": 0.8948, + "step": 8552 + }, + { + "epoch": 0.6911654780904665, + "grad_norm": 3.04179310798645, + "learning_rate": 7.607818545908681e-06, + "loss": 0.9858, + "step": 8553 + }, + { + "epoch": 0.691246287803794, + "grad_norm": 2.8973894119262695, + "learning_rate": 7.607260213721262e-06, + "loss": 0.854, + "step": 8554 + }, + { + "epoch": 0.6913270975171215, + "grad_norm": 3.1727302074432373, + "learning_rate": 7.606701836877752e-06, + "loss": 1.0172, + "step": 8555 + }, + { + "epoch": 0.6914079072304491, + "grad_norm": 2.350161552429199, + "learning_rate": 7.606143415387715e-06, + "loss": 0.8157, + "step": 8556 + }, + { + "epoch": 0.6914887169437767, + "grad_norm": 2.398963689804077, + "learning_rate": 7.605584949260716e-06, + "loss": 0.8972, + "step": 8557 + }, + { + "epoch": 0.6915695266571041, + "grad_norm": 2.073056697845459, + "learning_rate": 7.60502643850632e-06, + "loss": 1.0482, + "step": 8558 + }, + { + "epoch": 0.6916503363704317, + "grad_norm": 2.7007317543029785, + "learning_rate": 7.6044678831340915e-06, + "loss": 0.9308, + "step": 8559 + }, + { + "epoch": 0.6917311460837593, + "grad_norm": 2.6965243816375732, + "learning_rate": 7.603909283153598e-06, + "loss": 1.0158, + "step": 8560 + }, + { + "epoch": 0.6918119557970868, + "grad_norm": 2.4421262741088867, + "learning_rate": 7.603350638574408e-06, + "loss": 0.9155, + "step": 8561 + }, + { + "epoch": 0.6918927655104143, + "grad_norm": 2.9302661418914795, + "learning_rate": 7.602791949406088e-06, + "loss": 0.8886, + "step": 8562 + }, + { + "epoch": 0.6919735752237419, + "grad_norm": 2.614354372024536, + "learning_rate": 7.602233215658209e-06, + "loss": 1.0003, + "step": 8563 + }, + { + "epoch": 0.6920543849370694, + "grad_norm": 2.6408298015594482, + "learning_rate": 7.601674437340339e-06, + "loss": 1.0349, + "step": 8564 + }, + { + "epoch": 0.692135194650397, + "grad_norm": 2.468775987625122, + "learning_rate": 7.601115614462049e-06, + "loss": 1.0312, + "step": 8565 + }, + { + "epoch": 0.6922160043637245, + "grad_norm": 2.7402100563049316, + "learning_rate": 7.60055674703291e-06, + "loss": 0.8751, + "step": 8566 + }, + { + "epoch": 0.692296814077052, + "grad_norm": 2.747537136077881, + "learning_rate": 7.599997835062496e-06, + "loss": 0.9242, + "step": 8567 + }, + { + "epoch": 0.6923776237903796, + "grad_norm": 2.6012680530548096, + "learning_rate": 7.599438878560377e-06, + "loss": 0.9521, + "step": 8568 + }, + { + "epoch": 0.6924584335037072, + "grad_norm": 2.8128600120544434, + "learning_rate": 7.598879877536129e-06, + "loss": 1.0664, + "step": 8569 + }, + { + "epoch": 0.6925392432170346, + "grad_norm": 2.7352778911590576, + "learning_rate": 7.598320831999323e-06, + "loss": 0.8944, + "step": 8570 + }, + { + "epoch": 0.6926200529303622, + "grad_norm": 2.58370041847229, + "learning_rate": 7.59776174195954e-06, + "loss": 0.9682, + "step": 8571 + }, + { + "epoch": 0.6927008626436898, + "grad_norm": 2.5282909870147705, + "learning_rate": 7.597202607426349e-06, + "loss": 0.8859, + "step": 8572 + }, + { + "epoch": 0.6927816723570173, + "grad_norm": 3.129368543624878, + "learning_rate": 7.59664342840933e-06, + "loss": 1.035, + "step": 8573 + }, + { + "epoch": 0.6928624820703448, + "grad_norm": 2.583153486251831, + "learning_rate": 7.596084204918062e-06, + "loss": 0.926, + "step": 8574 + }, + { + "epoch": 0.6929432917836724, + "grad_norm": 2.4431116580963135, + "learning_rate": 7.595524936962122e-06, + "loss": 0.793, + "step": 8575 + }, + { + "epoch": 0.6930241014969999, + "grad_norm": 2.709498643875122, + "learning_rate": 7.594965624551085e-06, + "loss": 0.9978, + "step": 8576 + }, + { + "epoch": 0.6931049112103275, + "grad_norm": 2.3784282207489014, + "learning_rate": 7.594406267694536e-06, + "loss": 0.8461, + "step": 8577 + }, + { + "epoch": 0.693185720923655, + "grad_norm": 2.5849270820617676, + "learning_rate": 7.593846866402054e-06, + "loss": 0.9594, + "step": 8578 + }, + { + "epoch": 0.6932665306369825, + "grad_norm": 2.7068569660186768, + "learning_rate": 7.593287420683219e-06, + "loss": 0.8639, + "step": 8579 + }, + { + "epoch": 0.6933473403503101, + "grad_norm": 2.623075246810913, + "learning_rate": 7.592727930547613e-06, + "loss": 1.1281, + "step": 8580 + }, + { + "epoch": 0.6934281500636377, + "grad_norm": 2.4773852825164795, + "learning_rate": 7.59216839600482e-06, + "loss": 0.934, + "step": 8581 + }, + { + "epoch": 0.6935089597769651, + "grad_norm": 2.8432528972625732, + "learning_rate": 7.591608817064422e-06, + "loss": 0.772, + "step": 8582 + }, + { + "epoch": 0.6935897694902927, + "grad_norm": 2.484088659286499, + "learning_rate": 7.5910491937360054e-06, + "loss": 0.9708, + "step": 8583 + }, + { + "epoch": 0.6936705792036203, + "grad_norm": 3.355008602142334, + "learning_rate": 7.590489526029152e-06, + "loss": 0.9861, + "step": 8584 + }, + { + "epoch": 0.6937513889169478, + "grad_norm": 2.7875428199768066, + "learning_rate": 7.589929813953452e-06, + "loss": 0.8744, + "step": 8585 + }, + { + "epoch": 0.6938321986302753, + "grad_norm": 2.3667221069335938, + "learning_rate": 7.589370057518486e-06, + "loss": 0.9079, + "step": 8586 + }, + { + "epoch": 0.6939130083436029, + "grad_norm": 2.7356786727905273, + "learning_rate": 7.588810256733847e-06, + "loss": 0.8746, + "step": 8587 + }, + { + "epoch": 0.6939938180569304, + "grad_norm": 2.7664761543273926, + "learning_rate": 7.58825041160912e-06, + "loss": 0.9542, + "step": 8588 + }, + { + "epoch": 0.694074627770258, + "grad_norm": 2.645258665084839, + "learning_rate": 7.587690522153894e-06, + "loss": 0.9539, + "step": 8589 + }, + { + "epoch": 0.6941554374835855, + "grad_norm": 2.487731695175171, + "learning_rate": 7.587130588377758e-06, + "loss": 0.9562, + "step": 8590 + }, + { + "epoch": 0.694236247196913, + "grad_norm": 2.425704002380371, + "learning_rate": 7.586570610290305e-06, + "loss": 1.0059, + "step": 8591 + }, + { + "epoch": 0.6943170569102406, + "grad_norm": 2.481106996536255, + "learning_rate": 7.586010587901125e-06, + "loss": 0.8474, + "step": 8592 + }, + { + "epoch": 0.6943978666235682, + "grad_norm": 2.572812557220459, + "learning_rate": 7.585450521219807e-06, + "loss": 0.9305, + "step": 8593 + }, + { + "epoch": 0.6944786763368956, + "grad_norm": 2.375962257385254, + "learning_rate": 7.584890410255948e-06, + "loss": 0.9888, + "step": 8594 + }, + { + "epoch": 0.6945594860502232, + "grad_norm": 2.728224515914917, + "learning_rate": 7.584330255019137e-06, + "loss": 0.8738, + "step": 8595 + }, + { + "epoch": 0.6946402957635508, + "grad_norm": 3.3338637351989746, + "learning_rate": 7.583770055518971e-06, + "loss": 0.9901, + "step": 8596 + }, + { + "epoch": 0.6947211054768784, + "grad_norm": 2.8553481101989746, + "learning_rate": 7.583209811765044e-06, + "loss": 0.9182, + "step": 8597 + }, + { + "epoch": 0.6948019151902058, + "grad_norm": 2.6169826984405518, + "learning_rate": 7.582649523766952e-06, + "loss": 0.9113, + "step": 8598 + }, + { + "epoch": 0.6948827249035334, + "grad_norm": 3.1289563179016113, + "learning_rate": 7.582089191534292e-06, + "loss": 0.9261, + "step": 8599 + }, + { + "epoch": 0.694963534616861, + "grad_norm": 2.7174198627471924, + "learning_rate": 7.58152881507666e-06, + "loss": 0.8609, + "step": 8600 + }, + { + "epoch": 0.6950443443301885, + "grad_norm": 2.9810330867767334, + "learning_rate": 7.580968394403653e-06, + "loss": 0.948, + "step": 8601 + }, + { + "epoch": 0.695125154043516, + "grad_norm": 2.527822256088257, + "learning_rate": 7.580407929524871e-06, + "loss": 0.9894, + "step": 8602 + }, + { + "epoch": 0.6952059637568436, + "grad_norm": 3.117835521697998, + "learning_rate": 7.579847420449913e-06, + "loss": 1.0269, + "step": 8603 + }, + { + "epoch": 0.6952867734701711, + "grad_norm": 2.935013771057129, + "learning_rate": 7.5792868671883805e-06, + "loss": 0.9498, + "step": 8604 + }, + { + "epoch": 0.6953675831834987, + "grad_norm": 2.737457513809204, + "learning_rate": 7.578726269749874e-06, + "loss": 0.8966, + "step": 8605 + }, + { + "epoch": 0.6954483928968263, + "grad_norm": 2.5293126106262207, + "learning_rate": 7.578165628143993e-06, + "loss": 0.9781, + "step": 8606 + }, + { + "epoch": 0.6955292026101537, + "grad_norm": 2.5696821212768555, + "learning_rate": 7.577604942380342e-06, + "loss": 0.9699, + "step": 8607 + }, + { + "epoch": 0.6956100123234813, + "grad_norm": 2.6878457069396973, + "learning_rate": 7.577044212468523e-06, + "loss": 1.0177, + "step": 8608 + }, + { + "epoch": 0.6956908220368089, + "grad_norm": 2.777543306350708, + "learning_rate": 7.576483438418142e-06, + "loss": 0.9182, + "step": 8609 + }, + { + "epoch": 0.6957716317501363, + "grad_norm": 2.3973729610443115, + "learning_rate": 7.575922620238801e-06, + "loss": 0.8788, + "step": 8610 + }, + { + "epoch": 0.6958524414634639, + "grad_norm": 2.2486517429351807, + "learning_rate": 7.575361757940107e-06, + "loss": 1.0163, + "step": 8611 + }, + { + "epoch": 0.6959332511767915, + "grad_norm": 2.6407740116119385, + "learning_rate": 7.574800851531667e-06, + "loss": 1.0108, + "step": 8612 + }, + { + "epoch": 0.696014060890119, + "grad_norm": 2.4537734985351562, + "learning_rate": 7.574239901023086e-06, + "loss": 0.8857, + "step": 8613 + }, + { + "epoch": 0.6960948706034465, + "grad_norm": 2.5676252841949463, + "learning_rate": 7.573678906423973e-06, + "loss": 0.9512, + "step": 8614 + }, + { + "epoch": 0.6961756803167741, + "grad_norm": 2.4398884773254395, + "learning_rate": 7.573117867743937e-06, + "loss": 0.8377, + "step": 8615 + }, + { + "epoch": 0.6962564900301016, + "grad_norm": 2.8415637016296387, + "learning_rate": 7.572556784992586e-06, + "loss": 0.9853, + "step": 8616 + }, + { + "epoch": 0.6963372997434292, + "grad_norm": 2.5694844722747803, + "learning_rate": 7.571995658179529e-06, + "loss": 0.887, + "step": 8617 + }, + { + "epoch": 0.6964181094567568, + "grad_norm": 2.49828839302063, + "learning_rate": 7.57143448731438e-06, + "loss": 1.0492, + "step": 8618 + }, + { + "epoch": 0.6964989191700842, + "grad_norm": 3.0085999965667725, + "learning_rate": 7.570873272406748e-06, + "loss": 0.9792, + "step": 8619 + }, + { + "epoch": 0.6965797288834118, + "grad_norm": 2.5902698040008545, + "learning_rate": 7.570312013466248e-06, + "loss": 0.9358, + "step": 8620 + }, + { + "epoch": 0.6966605385967394, + "grad_norm": 2.436767578125, + "learning_rate": 7.569750710502487e-06, + "loss": 1.0047, + "step": 8621 + }, + { + "epoch": 0.6967413483100668, + "grad_norm": 2.3210556507110596, + "learning_rate": 7.569189363525086e-06, + "loss": 1.0272, + "step": 8622 + }, + { + "epoch": 0.6968221580233944, + "grad_norm": 2.89433217048645, + "learning_rate": 7.568627972543654e-06, + "loss": 0.8926, + "step": 8623 + }, + { + "epoch": 0.696902967736722, + "grad_norm": 2.4591269493103027, + "learning_rate": 7.568066537567811e-06, + "loss": 0.9729, + "step": 8624 + }, + { + "epoch": 0.6969837774500495, + "grad_norm": 2.656367778778076, + "learning_rate": 7.567505058607169e-06, + "loss": 0.9672, + "step": 8625 + }, + { + "epoch": 0.697064587163377, + "grad_norm": 2.5143606662750244, + "learning_rate": 7.566943535671346e-06, + "loss": 1.0344, + "step": 8626 + }, + { + "epoch": 0.6971453968767046, + "grad_norm": 2.8403632640838623, + "learning_rate": 7.5663819687699605e-06, + "loss": 0.9725, + "step": 8627 + }, + { + "epoch": 0.6972262065900321, + "grad_norm": 3.015887975692749, + "learning_rate": 7.565820357912631e-06, + "loss": 1.0496, + "step": 8628 + }, + { + "epoch": 0.6973070163033597, + "grad_norm": 2.7617697715759277, + "learning_rate": 7.565258703108973e-06, + "loss": 0.9673, + "step": 8629 + }, + { + "epoch": 0.6973878260166873, + "grad_norm": 2.7930734157562256, + "learning_rate": 7.564697004368613e-06, + "loss": 1.0001, + "step": 8630 + }, + { + "epoch": 0.6974686357300147, + "grad_norm": 2.5580267906188965, + "learning_rate": 7.564135261701165e-06, + "loss": 0.8846, + "step": 8631 + }, + { + "epoch": 0.6975494454433423, + "grad_norm": 2.546013355255127, + "learning_rate": 7.563573475116252e-06, + "loss": 0.9289, + "step": 8632 + }, + { + "epoch": 0.6976302551566699, + "grad_norm": 2.1583731174468994, + "learning_rate": 7.563011644623499e-06, + "loss": 0.9521, + "step": 8633 + }, + { + "epoch": 0.6977110648699973, + "grad_norm": 2.7555742263793945, + "learning_rate": 7.562449770232527e-06, + "loss": 0.9649, + "step": 8634 + }, + { + "epoch": 0.6977918745833249, + "grad_norm": 2.919497489929199, + "learning_rate": 7.561887851952958e-06, + "loss": 0.8866, + "step": 8635 + }, + { + "epoch": 0.6978726842966525, + "grad_norm": 2.2876970767974854, + "learning_rate": 7.561325889794417e-06, + "loss": 0.9934, + "step": 8636 + }, + { + "epoch": 0.69795349400998, + "grad_norm": 2.8355917930603027, + "learning_rate": 7.560763883766531e-06, + "loss": 0.8978, + "step": 8637 + }, + { + "epoch": 0.6980343037233075, + "grad_norm": 3.190540075302124, + "learning_rate": 7.560201833878924e-06, + "loss": 0.8722, + "step": 8638 + }, + { + "epoch": 0.6981151134366351, + "grad_norm": 3.1715128421783447, + "learning_rate": 7.5596397401412234e-06, + "loss": 0.873, + "step": 8639 + }, + { + "epoch": 0.6981959231499626, + "grad_norm": 3.1202750205993652, + "learning_rate": 7.559077602563057e-06, + "loss": 0.9352, + "step": 8640 + }, + { + "epoch": 0.6982767328632902, + "grad_norm": 2.6783177852630615, + "learning_rate": 7.558515421154049e-06, + "loss": 0.9136, + "step": 8641 + }, + { + "epoch": 0.6983575425766178, + "grad_norm": 2.462207317352295, + "learning_rate": 7.557953195923834e-06, + "loss": 0.9122, + "step": 8642 + }, + { + "epoch": 0.6984383522899452, + "grad_norm": 2.8287558555603027, + "learning_rate": 7.557390926882037e-06, + "loss": 0.898, + "step": 8643 + }, + { + "epoch": 0.6985191620032728, + "grad_norm": 2.6783833503723145, + "learning_rate": 7.556828614038292e-06, + "loss": 0.9573, + "step": 8644 + }, + { + "epoch": 0.6985999717166004, + "grad_norm": 2.578094720840454, + "learning_rate": 7.556266257402226e-06, + "loss": 0.9204, + "step": 8645 + }, + { + "epoch": 0.6986807814299278, + "grad_norm": 2.8346030712127686, + "learning_rate": 7.555703856983474e-06, + "loss": 1.0098, + "step": 8646 + }, + { + "epoch": 0.6987615911432554, + "grad_norm": 2.802305221557617, + "learning_rate": 7.555141412791666e-06, + "loss": 1.001, + "step": 8647 + }, + { + "epoch": 0.698842400856583, + "grad_norm": 2.5147013664245605, + "learning_rate": 7.554578924836441e-06, + "loss": 0.8849, + "step": 8648 + }, + { + "epoch": 0.6989232105699105, + "grad_norm": 2.50382137298584, + "learning_rate": 7.554016393127425e-06, + "loss": 1.0378, + "step": 8649 + }, + { + "epoch": 0.699004020283238, + "grad_norm": 2.706132173538208, + "learning_rate": 7.553453817674259e-06, + "loss": 0.8929, + "step": 8650 + }, + { + "epoch": 0.6990848299965656, + "grad_norm": 2.8096556663513184, + "learning_rate": 7.552891198486575e-06, + "loss": 0.8318, + "step": 8651 + }, + { + "epoch": 0.6991656397098931, + "grad_norm": 2.7654807567596436, + "learning_rate": 7.552328535574011e-06, + "loss": 0.8296, + "step": 8652 + }, + { + "epoch": 0.6992464494232207, + "grad_norm": 2.4102702140808105, + "learning_rate": 7.551765828946202e-06, + "loss": 0.9913, + "step": 8653 + }, + { + "epoch": 0.6993272591365483, + "grad_norm": 2.799931049346924, + "learning_rate": 7.5512030786127895e-06, + "loss": 0.9169, + "step": 8654 + }, + { + "epoch": 0.6994080688498757, + "grad_norm": 2.5439860820770264, + "learning_rate": 7.55064028458341e-06, + "loss": 0.9709, + "step": 8655 + }, + { + "epoch": 0.6994888785632033, + "grad_norm": 2.6339478492736816, + "learning_rate": 7.550077446867703e-06, + "loss": 0.9449, + "step": 8656 + }, + { + "epoch": 0.6995696882765309, + "grad_norm": 2.402557611465454, + "learning_rate": 7.549514565475306e-06, + "loss": 1.0367, + "step": 8657 + }, + { + "epoch": 0.6996504979898583, + "grad_norm": 3.135340929031372, + "learning_rate": 7.548951640415866e-06, + "loss": 0.8973, + "step": 8658 + }, + { + "epoch": 0.6997313077031859, + "grad_norm": 3.2426278591156006, + "learning_rate": 7.548388671699019e-06, + "loss": 0.9245, + "step": 8659 + }, + { + "epoch": 0.6998121174165135, + "grad_norm": 2.369006872177124, + "learning_rate": 7.547825659334408e-06, + "loss": 0.931, + "step": 8660 + }, + { + "epoch": 0.699892927129841, + "grad_norm": 2.7105281352996826, + "learning_rate": 7.5472626033316775e-06, + "loss": 0.9759, + "step": 8661 + }, + { + "epoch": 0.6999737368431685, + "grad_norm": 2.8235816955566406, + "learning_rate": 7.546699503700472e-06, + "loss": 0.8962, + "step": 8662 + }, + { + "epoch": 0.7000545465564961, + "grad_norm": 2.7356045246124268, + "learning_rate": 7.546136360450434e-06, + "loss": 0.974, + "step": 8663 + }, + { + "epoch": 0.7001353562698236, + "grad_norm": 2.7572247982025146, + "learning_rate": 7.54557317359121e-06, + "loss": 0.8473, + "step": 8664 + }, + { + "epoch": 0.7002161659831512, + "grad_norm": 2.7012922763824463, + "learning_rate": 7.545009943132446e-06, + "loss": 0.9708, + "step": 8665 + }, + { + "epoch": 0.7002969756964788, + "grad_norm": 2.7031867504119873, + "learning_rate": 7.544446669083788e-06, + "loss": 0.962, + "step": 8666 + }, + { + "epoch": 0.7003777854098062, + "grad_norm": 2.5701441764831543, + "learning_rate": 7.543883351454884e-06, + "loss": 1.0285, + "step": 8667 + }, + { + "epoch": 0.7004585951231338, + "grad_norm": 3.391747236251831, + "learning_rate": 7.543319990255382e-06, + "loss": 0.9503, + "step": 8668 + }, + { + "epoch": 0.7005394048364614, + "grad_norm": 2.4658329486846924, + "learning_rate": 7.542756585494933e-06, + "loss": 0.9535, + "step": 8669 + }, + { + "epoch": 0.7006202145497888, + "grad_norm": 2.760211229324341, + "learning_rate": 7.542193137183184e-06, + "loss": 0.9103, + "step": 8670 + }, + { + "epoch": 0.7007010242631164, + "grad_norm": 2.506631374359131, + "learning_rate": 7.541629645329787e-06, + "loss": 1.0329, + "step": 8671 + }, + { + "epoch": 0.700781833976444, + "grad_norm": 2.7162492275238037, + "learning_rate": 7.541066109944393e-06, + "loss": 0.9238, + "step": 8672 + }, + { + "epoch": 0.7008626436897715, + "grad_norm": 2.9053521156311035, + "learning_rate": 7.540502531036653e-06, + "loss": 1.0092, + "step": 8673 + }, + { + "epoch": 0.700943453403099, + "grad_norm": 2.562232494354248, + "learning_rate": 7.539938908616221e-06, + "loss": 1.0301, + "step": 8674 + }, + { + "epoch": 0.7010242631164266, + "grad_norm": 2.5382494926452637, + "learning_rate": 7.53937524269275e-06, + "loss": 0.907, + "step": 8675 + }, + { + "epoch": 0.7011050728297541, + "grad_norm": 2.983947515487671, + "learning_rate": 7.538811533275896e-06, + "loss": 0.9538, + "step": 8676 + }, + { + "epoch": 0.7011858825430817, + "grad_norm": 2.8068487644195557, + "learning_rate": 7.5382477803753095e-06, + "loss": 1.0288, + "step": 8677 + }, + { + "epoch": 0.7012666922564093, + "grad_norm": 2.4507806301116943, + "learning_rate": 7.537683984000651e-06, + "loss": 0.9991, + "step": 8678 + }, + { + "epoch": 0.7013475019697367, + "grad_norm": 2.6927502155303955, + "learning_rate": 7.5371201441615745e-06, + "loss": 0.9675, + "step": 8679 + }, + { + "epoch": 0.7014283116830643, + "grad_norm": 2.58587384223938, + "learning_rate": 7.53655626086774e-06, + "loss": 0.9279, + "step": 8680 + }, + { + "epoch": 0.7015091213963919, + "grad_norm": 2.3846824169158936, + "learning_rate": 7.535992334128801e-06, + "loss": 0.9453, + "step": 8681 + }, + { + "epoch": 0.7015899311097193, + "grad_norm": 2.7196900844573975, + "learning_rate": 7.535428363954418e-06, + "loss": 0.8563, + "step": 8682 + }, + { + "epoch": 0.7016707408230469, + "grad_norm": 2.4219930171966553, + "learning_rate": 7.534864350354252e-06, + "loss": 1.0132, + "step": 8683 + }, + { + "epoch": 0.7017515505363745, + "grad_norm": 3.8839070796966553, + "learning_rate": 7.534300293337962e-06, + "loss": 0.8899, + "step": 8684 + }, + { + "epoch": 0.701832360249702, + "grad_norm": 2.76360821723938, + "learning_rate": 7.5337361929152085e-06, + "loss": 1.0045, + "step": 8685 + }, + { + "epoch": 0.7019131699630295, + "grad_norm": 2.3911209106445312, + "learning_rate": 7.533172049095654e-06, + "loss": 0.9208, + "step": 8686 + }, + { + "epoch": 0.7019939796763571, + "grad_norm": 2.581125020980835, + "learning_rate": 7.532607861888962e-06, + "loss": 1.0219, + "step": 8687 + }, + { + "epoch": 0.7020747893896846, + "grad_norm": 2.369431734085083, + "learning_rate": 7.532043631304792e-06, + "loss": 0.9991, + "step": 8688 + }, + { + "epoch": 0.7021555991030122, + "grad_norm": 2.5366764068603516, + "learning_rate": 7.531479357352812e-06, + "loss": 0.8967, + "step": 8689 + }, + { + "epoch": 0.7022364088163398, + "grad_norm": 2.5214669704437256, + "learning_rate": 7.530915040042684e-06, + "loss": 0.9072, + "step": 8690 + }, + { + "epoch": 0.7023172185296672, + "grad_norm": 2.962538242340088, + "learning_rate": 7.5303506793840755e-06, + "loss": 0.9389, + "step": 8691 + }, + { + "epoch": 0.7023980282429948, + "grad_norm": 2.8953888416290283, + "learning_rate": 7.52978627538665e-06, + "loss": 1.0155, + "step": 8692 + }, + { + "epoch": 0.7024788379563224, + "grad_norm": 2.7045507431030273, + "learning_rate": 7.529221828060076e-06, + "loss": 1.0143, + "step": 8693 + }, + { + "epoch": 0.7025596476696498, + "grad_norm": 2.3151791095733643, + "learning_rate": 7.5286573374140205e-06, + "loss": 0.977, + "step": 8694 + }, + { + "epoch": 0.7026404573829774, + "grad_norm": 2.6774165630340576, + "learning_rate": 7.528092803458154e-06, + "loss": 0.8586, + "step": 8695 + }, + { + "epoch": 0.702721267096305, + "grad_norm": 2.803908109664917, + "learning_rate": 7.527528226202142e-06, + "loss": 0.8626, + "step": 8696 + }, + { + "epoch": 0.7028020768096325, + "grad_norm": 2.39029860496521, + "learning_rate": 7.526963605655659e-06, + "loss": 0.9367, + "step": 8697 + }, + { + "epoch": 0.70288288652296, + "grad_norm": 2.5351829528808594, + "learning_rate": 7.52639894182837e-06, + "loss": 0.9001, + "step": 8698 + }, + { + "epoch": 0.7029636962362876, + "grad_norm": 2.88838791847229, + "learning_rate": 7.5258342347299504e-06, + "loss": 0.9646, + "step": 8699 + }, + { + "epoch": 0.7030445059496151, + "grad_norm": 2.9428274631500244, + "learning_rate": 7.52526948437007e-06, + "loss": 0.979, + "step": 8700 + }, + { + "epoch": 0.7031253156629427, + "grad_norm": 2.27873158454895, + "learning_rate": 7.524704690758405e-06, + "loss": 0.8633, + "step": 8701 + }, + { + "epoch": 0.7032061253762703, + "grad_norm": 2.4483258724212646, + "learning_rate": 7.524139853904624e-06, + "loss": 0.8733, + "step": 8702 + }, + { + "epoch": 0.7032869350895977, + "grad_norm": 3.072887659072876, + "learning_rate": 7.523574973818406e-06, + "loss": 0.947, + "step": 8703 + }, + { + "epoch": 0.7033677448029253, + "grad_norm": 2.6498184204101562, + "learning_rate": 7.523010050509423e-06, + "loss": 0.9445, + "step": 8704 + }, + { + "epoch": 0.7034485545162529, + "grad_norm": 2.5125577449798584, + "learning_rate": 7.522445083987353e-06, + "loss": 0.8173, + "step": 8705 + }, + { + "epoch": 0.7035293642295803, + "grad_norm": 2.689518690109253, + "learning_rate": 7.521880074261869e-06, + "loss": 0.9604, + "step": 8706 + }, + { + "epoch": 0.7036101739429079, + "grad_norm": 3.4577524662017822, + "learning_rate": 7.521315021342652e-06, + "loss": 0.9367, + "step": 8707 + }, + { + "epoch": 0.7036909836562355, + "grad_norm": 2.4654698371887207, + "learning_rate": 7.520749925239378e-06, + "loss": 0.9222, + "step": 8708 + }, + { + "epoch": 0.703771793369563, + "grad_norm": 2.5475175380706787, + "learning_rate": 7.520184785961727e-06, + "loss": 0.9775, + "step": 8709 + }, + { + "epoch": 0.7038526030828905, + "grad_norm": 2.6552414894104004, + "learning_rate": 7.519619603519376e-06, + "loss": 0.8956, + "step": 8710 + }, + { + "epoch": 0.7039334127962181, + "grad_norm": 2.3695228099823, + "learning_rate": 7.519054377922009e-06, + "loss": 1.0462, + "step": 8711 + }, + { + "epoch": 0.7040142225095456, + "grad_norm": 2.7073700428009033, + "learning_rate": 7.518489109179304e-06, + "loss": 0.912, + "step": 8712 + }, + { + "epoch": 0.7040950322228732, + "grad_norm": 3.083273410797119, + "learning_rate": 7.5179237973009435e-06, + "loss": 0.991, + "step": 8713 + }, + { + "epoch": 0.7041758419362008, + "grad_norm": 2.614108085632324, + "learning_rate": 7.51735844229661e-06, + "loss": 1.073, + "step": 8714 + }, + { + "epoch": 0.7042566516495282, + "grad_norm": 2.8452632427215576, + "learning_rate": 7.5167930441759875e-06, + "loss": 0.9139, + "step": 8715 + }, + { + "epoch": 0.7043374613628558, + "grad_norm": 2.1154541969299316, + "learning_rate": 7.516227602948756e-06, + "loss": 1.0226, + "step": 8716 + }, + { + "epoch": 0.7044182710761834, + "grad_norm": 2.4039242267608643, + "learning_rate": 7.515662118624607e-06, + "loss": 0.9091, + "step": 8717 + }, + { + "epoch": 0.7044990807895108, + "grad_norm": 2.7088048458099365, + "learning_rate": 7.5150965912132205e-06, + "loss": 0.9404, + "step": 8718 + }, + { + "epoch": 0.7045798905028384, + "grad_norm": 2.4967024326324463, + "learning_rate": 7.5145310207242836e-06, + "loss": 0.9141, + "step": 8719 + }, + { + "epoch": 0.704660700216166, + "grad_norm": 2.6987764835357666, + "learning_rate": 7.513965407167485e-06, + "loss": 0.9831, + "step": 8720 + }, + { + "epoch": 0.7047415099294935, + "grad_norm": 2.955282688140869, + "learning_rate": 7.51339975055251e-06, + "loss": 1.0303, + "step": 8721 + }, + { + "epoch": 0.704822319642821, + "grad_norm": 2.0802903175354004, + "learning_rate": 7.512834050889048e-06, + "loss": 0.8419, + "step": 8722 + }, + { + "epoch": 0.7049031293561486, + "grad_norm": 2.7226569652557373, + "learning_rate": 7.51226830818679e-06, + "loss": 0.9608, + "step": 8723 + }, + { + "epoch": 0.7049839390694762, + "grad_norm": 2.963134527206421, + "learning_rate": 7.511702522455422e-06, + "loss": 0.9277, + "step": 8724 + }, + { + "epoch": 0.7050647487828037, + "grad_norm": 2.859212636947632, + "learning_rate": 7.511136693704637e-06, + "loss": 0.9165, + "step": 8725 + }, + { + "epoch": 0.7051455584961313, + "grad_norm": 2.5749497413635254, + "learning_rate": 7.510570821944126e-06, + "loss": 0.947, + "step": 8726 + }, + { + "epoch": 0.7052263682094588, + "grad_norm": 2.3279025554656982, + "learning_rate": 7.510004907183581e-06, + "loss": 0.919, + "step": 8727 + }, + { + "epoch": 0.7053071779227863, + "grad_norm": 2.355889320373535, + "learning_rate": 7.509438949432694e-06, + "loss": 0.8945, + "step": 8728 + }, + { + "epoch": 0.7053879876361139, + "grad_norm": 2.6222805976867676, + "learning_rate": 7.50887294870116e-06, + "loss": 0.9518, + "step": 8729 + }, + { + "epoch": 0.7054687973494415, + "grad_norm": 3.2635293006896973, + "learning_rate": 7.50830690499867e-06, + "loss": 0.9052, + "step": 8730 + }, + { + "epoch": 0.7055496070627689, + "grad_norm": 2.763084650039673, + "learning_rate": 7.507740818334924e-06, + "loss": 0.8967, + "step": 8731 + }, + { + "epoch": 0.7056304167760965, + "grad_norm": 2.514324426651001, + "learning_rate": 7.507174688719614e-06, + "loss": 0.9678, + "step": 8732 + }, + { + "epoch": 0.7057112264894241, + "grad_norm": 2.5376338958740234, + "learning_rate": 7.506608516162437e-06, + "loss": 0.9405, + "step": 8733 + }, + { + "epoch": 0.7057920362027515, + "grad_norm": 2.724379301071167, + "learning_rate": 7.50604230067309e-06, + "loss": 0.8867, + "step": 8734 + }, + { + "epoch": 0.7058728459160791, + "grad_norm": 2.402949094772339, + "learning_rate": 7.505476042261271e-06, + "loss": 0.844, + "step": 8735 + }, + { + "epoch": 0.7059536556294067, + "grad_norm": 2.7474207878112793, + "learning_rate": 7.504909740936681e-06, + "loss": 1.0281, + "step": 8736 + }, + { + "epoch": 0.7060344653427342, + "grad_norm": 2.48898983001709, + "learning_rate": 7.504343396709017e-06, + "loss": 0.9633, + "step": 8737 + }, + { + "epoch": 0.7061152750560618, + "grad_norm": 2.1229522228240967, + "learning_rate": 7.503777009587978e-06, + "loss": 1.0316, + "step": 8738 + }, + { + "epoch": 0.7061960847693893, + "grad_norm": 2.361743688583374, + "learning_rate": 7.5032105795832685e-06, + "loss": 0.9821, + "step": 8739 + }, + { + "epoch": 0.7062768944827168, + "grad_norm": 2.911328077316284, + "learning_rate": 7.502644106704586e-06, + "loss": 1.1078, + "step": 8740 + }, + { + "epoch": 0.7063577041960444, + "grad_norm": 2.2683932781219482, + "learning_rate": 7.5020775909616365e-06, + "loss": 0.9699, + "step": 8741 + }, + { + "epoch": 0.706438513909372, + "grad_norm": 2.544445753097534, + "learning_rate": 7.50151103236412e-06, + "loss": 1.0466, + "step": 8742 + }, + { + "epoch": 0.7065193236226994, + "grad_norm": 2.5798516273498535, + "learning_rate": 7.500944430921743e-06, + "loss": 0.9785, + "step": 8743 + }, + { + "epoch": 0.706600133336027, + "grad_norm": 2.4686522483825684, + "learning_rate": 7.500377786644207e-06, + "loss": 0.9202, + "step": 8744 + }, + { + "epoch": 0.7066809430493546, + "grad_norm": 2.7797043323516846, + "learning_rate": 7.499811099541221e-06, + "loss": 0.9207, + "step": 8745 + }, + { + "epoch": 0.706761752762682, + "grad_norm": 2.6669442653656006, + "learning_rate": 7.499244369622488e-06, + "loss": 0.9433, + "step": 8746 + }, + { + "epoch": 0.7068425624760096, + "grad_norm": 2.5092170238494873, + "learning_rate": 7.4986775968977155e-06, + "loss": 0.9155, + "step": 8747 + }, + { + "epoch": 0.7069233721893372, + "grad_norm": 2.559154510498047, + "learning_rate": 7.498110781376611e-06, + "loss": 0.9418, + "step": 8748 + }, + { + "epoch": 0.7070041819026647, + "grad_norm": 2.280831813812256, + "learning_rate": 7.497543923068883e-06, + "loss": 1.038, + "step": 8749 + }, + { + "epoch": 0.7070849916159923, + "grad_norm": 2.3083012104034424, + "learning_rate": 7.4969770219842395e-06, + "loss": 1.0707, + "step": 8750 + }, + { + "epoch": 0.7071658013293198, + "grad_norm": 3.4840195178985596, + "learning_rate": 7.4964100781323915e-06, + "loss": 1.0052, + "step": 8751 + }, + { + "epoch": 0.7072466110426473, + "grad_norm": 3.099773406982422, + "learning_rate": 7.495843091523049e-06, + "loss": 1.0539, + "step": 8752 + }, + { + "epoch": 0.7073274207559749, + "grad_norm": 2.438974142074585, + "learning_rate": 7.495276062165922e-06, + "loss": 1.1024, + "step": 8753 + }, + { + "epoch": 0.7074082304693025, + "grad_norm": 2.3954732418060303, + "learning_rate": 7.494708990070724e-06, + "loss": 0.9245, + "step": 8754 + }, + { + "epoch": 0.7074890401826299, + "grad_norm": 2.5976598262786865, + "learning_rate": 7.494141875247165e-06, + "loss": 0.9778, + "step": 8755 + }, + { + "epoch": 0.7075698498959575, + "grad_norm": 2.151345729827881, + "learning_rate": 7.493574717704964e-06, + "loss": 0.9815, + "step": 8756 + }, + { + "epoch": 0.7076506596092851, + "grad_norm": 2.6544687747955322, + "learning_rate": 7.493007517453828e-06, + "loss": 0.9092, + "step": 8757 + }, + { + "epoch": 0.7077314693226125, + "grad_norm": 2.319226026535034, + "learning_rate": 7.4924402745034745e-06, + "loss": 0.9584, + "step": 8758 + }, + { + "epoch": 0.7078122790359401, + "grad_norm": 2.8776092529296875, + "learning_rate": 7.49187298886362e-06, + "loss": 1.087, + "step": 8759 + }, + { + "epoch": 0.7078930887492677, + "grad_norm": 2.3584213256835938, + "learning_rate": 7.491305660543982e-06, + "loss": 0.7846, + "step": 8760 + }, + { + "epoch": 0.7079738984625952, + "grad_norm": 2.5362932682037354, + "learning_rate": 7.490738289554273e-06, + "loss": 0.8692, + "step": 8761 + }, + { + "epoch": 0.7080547081759228, + "grad_norm": 2.2135183811187744, + "learning_rate": 7.490170875904215e-06, + "loss": 1.0255, + "step": 8762 + }, + { + "epoch": 0.7081355178892503, + "grad_norm": 2.3536763191223145, + "learning_rate": 7.489603419603524e-06, + "loss": 0.8183, + "step": 8763 + }, + { + "epoch": 0.7082163276025778, + "grad_norm": 2.256617784500122, + "learning_rate": 7.48903592066192e-06, + "loss": 0.9819, + "step": 8764 + }, + { + "epoch": 0.7082971373159054, + "grad_norm": 2.7875657081604004, + "learning_rate": 7.488468379089123e-06, + "loss": 0.8544, + "step": 8765 + }, + { + "epoch": 0.708377947029233, + "grad_norm": 3.078984498977661, + "learning_rate": 7.487900794894853e-06, + "loss": 0.9126, + "step": 8766 + }, + { + "epoch": 0.7084587567425604, + "grad_norm": 2.362226963043213, + "learning_rate": 7.487333168088832e-06, + "loss": 0.8633, + "step": 8767 + }, + { + "epoch": 0.708539566455888, + "grad_norm": 2.774829626083374, + "learning_rate": 7.4867654986807824e-06, + "loss": 0.9722, + "step": 8768 + }, + { + "epoch": 0.7086203761692156, + "grad_norm": 2.991361379623413, + "learning_rate": 7.486197786680425e-06, + "loss": 0.9546, + "step": 8769 + }, + { + "epoch": 0.708701185882543, + "grad_norm": 2.9520673751831055, + "learning_rate": 7.485630032097486e-06, + "loss": 0.9553, + "step": 8770 + }, + { + "epoch": 0.7087819955958706, + "grad_norm": 3.1070826053619385, + "learning_rate": 7.4850622349416894e-06, + "loss": 1.0138, + "step": 8771 + }, + { + "epoch": 0.7088628053091982, + "grad_norm": 2.724799633026123, + "learning_rate": 7.484494395222758e-06, + "loss": 0.947, + "step": 8772 + }, + { + "epoch": 0.7089436150225257, + "grad_norm": 2.702982187271118, + "learning_rate": 7.483926512950418e-06, + "loss": 0.9498, + "step": 8773 + }, + { + "epoch": 0.7090244247358533, + "grad_norm": 2.4756410121917725, + "learning_rate": 7.483358588134398e-06, + "loss": 0.9672, + "step": 8774 + }, + { + "epoch": 0.7091052344491808, + "grad_norm": 2.4144980907440186, + "learning_rate": 7.482790620784423e-06, + "loss": 1.049, + "step": 8775 + }, + { + "epoch": 0.7091860441625083, + "grad_norm": 2.344552516937256, + "learning_rate": 7.482222610910222e-06, + "loss": 0.8233, + "step": 8776 + }, + { + "epoch": 0.7092668538758359, + "grad_norm": 2.753931760787964, + "learning_rate": 7.481654558521523e-06, + "loss": 0.9061, + "step": 8777 + }, + { + "epoch": 0.7093476635891635, + "grad_norm": 2.382394552230835, + "learning_rate": 7.481086463628057e-06, + "loss": 0.9233, + "step": 8778 + }, + { + "epoch": 0.7094284733024909, + "grad_norm": 2.821577787399292, + "learning_rate": 7.480518326239552e-06, + "loss": 1.034, + "step": 8779 + }, + { + "epoch": 0.7095092830158185, + "grad_norm": 2.4641852378845215, + "learning_rate": 7.47995014636574e-06, + "loss": 0.9662, + "step": 8780 + }, + { + "epoch": 0.7095900927291461, + "grad_norm": 6.780241966247559, + "learning_rate": 7.479381924016351e-06, + "loss": 0.9078, + "step": 8781 + }, + { + "epoch": 0.7096709024424736, + "grad_norm": 2.5848770141601562, + "learning_rate": 7.47881365920112e-06, + "loss": 0.9667, + "step": 8782 + }, + { + "epoch": 0.7097517121558011, + "grad_norm": 2.367079496383667, + "learning_rate": 7.478245351929777e-06, + "loss": 0.8423, + "step": 8783 + }, + { + "epoch": 0.7098325218691287, + "grad_norm": 2.7461133003234863, + "learning_rate": 7.4776770022120596e-06, + "loss": 1.039, + "step": 8784 + }, + { + "epoch": 0.7099133315824562, + "grad_norm": 2.403761148452759, + "learning_rate": 7.477108610057699e-06, + "loss": 0.8635, + "step": 8785 + }, + { + "epoch": 0.7099941412957838, + "grad_norm": 2.949636936187744, + "learning_rate": 7.47654017547643e-06, + "loss": 0.9767, + "step": 8786 + }, + { + "epoch": 0.7100749510091113, + "grad_norm": 2.945683479309082, + "learning_rate": 7.4759716984779906e-06, + "loss": 0.9774, + "step": 8787 + }, + { + "epoch": 0.7101557607224388, + "grad_norm": 2.611204147338867, + "learning_rate": 7.475403179072116e-06, + "loss": 1.0105, + "step": 8788 + }, + { + "epoch": 0.7102365704357664, + "grad_norm": 2.9422717094421387, + "learning_rate": 7.474834617268545e-06, + "loss": 0.9791, + "step": 8789 + }, + { + "epoch": 0.710317380149094, + "grad_norm": 2.365835666656494, + "learning_rate": 7.4742660130770165e-06, + "loss": 0.9045, + "step": 8790 + }, + { + "epoch": 0.7103981898624214, + "grad_norm": 2.431830644607544, + "learning_rate": 7.473697366507264e-06, + "loss": 0.971, + "step": 8791 + }, + { + "epoch": 0.710478999575749, + "grad_norm": 2.978790521621704, + "learning_rate": 7.4731286775690344e-06, + "loss": 0.9703, + "step": 8792 + }, + { + "epoch": 0.7105598092890766, + "grad_norm": 2.4299097061157227, + "learning_rate": 7.472559946272063e-06, + "loss": 1.082, + "step": 8793 + }, + { + "epoch": 0.710640619002404, + "grad_norm": 2.591629981994629, + "learning_rate": 7.4719911726260915e-06, + "loss": 0.8756, + "step": 8794 + }, + { + "epoch": 0.7107214287157316, + "grad_norm": 2.925849676132202, + "learning_rate": 7.471422356640863e-06, + "loss": 0.8851, + "step": 8795 + }, + { + "epoch": 0.7108022384290592, + "grad_norm": 2.668818473815918, + "learning_rate": 7.470853498326121e-06, + "loss": 0.9577, + "step": 8796 + }, + { + "epoch": 0.7108830481423867, + "grad_norm": 2.38317608833313, + "learning_rate": 7.470284597691603e-06, + "loss": 0.8331, + "step": 8797 + }, + { + "epoch": 0.7109638578557143, + "grad_norm": 2.542328119277954, + "learning_rate": 7.469715654747059e-06, + "loss": 1.0084, + "step": 8798 + }, + { + "epoch": 0.7110446675690418, + "grad_norm": 3.080044984817505, + "learning_rate": 7.469146669502232e-06, + "loss": 0.88, + "step": 8799 + }, + { + "epoch": 0.7111254772823693, + "grad_norm": 2.602565050125122, + "learning_rate": 7.468577641966866e-06, + "loss": 1.0348, + "step": 8800 + }, + { + "epoch": 0.7112062869956969, + "grad_norm": 2.9165420532226562, + "learning_rate": 7.468008572150707e-06, + "loss": 0.9912, + "step": 8801 + }, + { + "epoch": 0.7112870967090245, + "grad_norm": 2.833052158355713, + "learning_rate": 7.467439460063504e-06, + "loss": 0.9313, + "step": 8802 + }, + { + "epoch": 0.7113679064223519, + "grad_norm": 2.9368364810943604, + "learning_rate": 7.466870305715002e-06, + "loss": 0.9332, + "step": 8803 + }, + { + "epoch": 0.7114487161356795, + "grad_norm": 3.3586065769195557, + "learning_rate": 7.466301109114953e-06, + "loss": 0.9256, + "step": 8804 + }, + { + "epoch": 0.7115295258490071, + "grad_norm": 2.572722911834717, + "learning_rate": 7.4657318702731e-06, + "loss": 0.9497, + "step": 8805 + }, + { + "epoch": 0.7116103355623346, + "grad_norm": 2.5348410606384277, + "learning_rate": 7.465162589199197e-06, + "loss": 0.9544, + "step": 8806 + }, + { + "epoch": 0.7116911452756621, + "grad_norm": 3.143693208694458, + "learning_rate": 7.464593265902995e-06, + "loss": 0.9532, + "step": 8807 + }, + { + "epoch": 0.7117719549889897, + "grad_norm": 2.5578513145446777, + "learning_rate": 7.464023900394243e-06, + "loss": 0.8862, + "step": 8808 + }, + { + "epoch": 0.7118527647023172, + "grad_norm": 2.3635027408599854, + "learning_rate": 7.463454492682693e-06, + "loss": 0.9082, + "step": 8809 + }, + { + "epoch": 0.7119335744156448, + "grad_norm": 2.573094606399536, + "learning_rate": 7.462885042778097e-06, + "loss": 0.9715, + "step": 8810 + }, + { + "epoch": 0.7120143841289723, + "grad_norm": 2.5036468505859375, + "learning_rate": 7.462315550690211e-06, + "loss": 0.873, + "step": 8811 + }, + { + "epoch": 0.7120951938422998, + "grad_norm": 2.525418281555176, + "learning_rate": 7.461746016428787e-06, + "loss": 0.9455, + "step": 8812 + }, + { + "epoch": 0.7121760035556274, + "grad_norm": 2.4634768962860107, + "learning_rate": 7.461176440003581e-06, + "loss": 0.9364, + "step": 8813 + }, + { + "epoch": 0.712256813268955, + "grad_norm": 2.6082589626312256, + "learning_rate": 7.460606821424347e-06, + "loss": 0.9037, + "step": 8814 + }, + { + "epoch": 0.7123376229822824, + "grad_norm": 2.4430713653564453, + "learning_rate": 7.460037160700842e-06, + "loss": 0.8612, + "step": 8815 + }, + { + "epoch": 0.71241843269561, + "grad_norm": 2.94598126411438, + "learning_rate": 7.459467457842822e-06, + "loss": 0.9126, + "step": 8816 + }, + { + "epoch": 0.7124992424089376, + "grad_norm": 2.605842351913452, + "learning_rate": 7.458897712860045e-06, + "loss": 0.9074, + "step": 8817 + }, + { + "epoch": 0.712580052122265, + "grad_norm": 2.8378100395202637, + "learning_rate": 7.45832792576227e-06, + "loss": 1.006, + "step": 8818 + }, + { + "epoch": 0.7126608618355926, + "grad_norm": 2.8556532859802246, + "learning_rate": 7.457758096559256e-06, + "loss": 0.9265, + "step": 8819 + }, + { + "epoch": 0.7127416715489202, + "grad_norm": 2.8925414085388184, + "learning_rate": 7.457188225260763e-06, + "loss": 0.9719, + "step": 8820 + }, + { + "epoch": 0.7128224812622477, + "grad_norm": 3.1155261993408203, + "learning_rate": 7.456618311876551e-06, + "loss": 0.93, + "step": 8821 + }, + { + "epoch": 0.7129032909755753, + "grad_norm": 2.681637763977051, + "learning_rate": 7.456048356416381e-06, + "loss": 0.8978, + "step": 8822 + }, + { + "epoch": 0.7129841006889028, + "grad_norm": 2.8793821334838867, + "learning_rate": 7.455478358890016e-06, + "loss": 0.9453, + "step": 8823 + }, + { + "epoch": 0.7130649104022303, + "grad_norm": 2.7457947731018066, + "learning_rate": 7.454908319307218e-06, + "loss": 0.8971, + "step": 8824 + }, + { + "epoch": 0.7131457201155579, + "grad_norm": 2.4251136779785156, + "learning_rate": 7.45433823767775e-06, + "loss": 0.9628, + "step": 8825 + }, + { + "epoch": 0.7132265298288855, + "grad_norm": 2.7069544792175293, + "learning_rate": 7.453768114011377e-06, + "loss": 0.9469, + "step": 8826 + }, + { + "epoch": 0.7133073395422129, + "grad_norm": 3.0360372066497803, + "learning_rate": 7.453197948317864e-06, + "loss": 0.9199, + "step": 8827 + }, + { + "epoch": 0.7133881492555405, + "grad_norm": 2.604696273803711, + "learning_rate": 7.4526277406069735e-06, + "loss": 0.8767, + "step": 8828 + }, + { + "epoch": 0.7134689589688681, + "grad_norm": 2.3818135261535645, + "learning_rate": 7.4520574908884765e-06, + "loss": 0.9631, + "step": 8829 + }, + { + "epoch": 0.7135497686821956, + "grad_norm": 2.156619071960449, + "learning_rate": 7.451487199172136e-06, + "loss": 1.0937, + "step": 8830 + }, + { + "epoch": 0.7136305783955231, + "grad_norm": 2.309812307357788, + "learning_rate": 7.450916865467725e-06, + "loss": 1.0414, + "step": 8831 + }, + { + "epoch": 0.7137113881088507, + "grad_norm": 3.0624377727508545, + "learning_rate": 7.450346489785006e-06, + "loss": 1.0111, + "step": 8832 + }, + { + "epoch": 0.7137921978221782, + "grad_norm": 2.6573123931884766, + "learning_rate": 7.44977607213375e-06, + "loss": 1.0546, + "step": 8833 + }, + { + "epoch": 0.7138730075355058, + "grad_norm": 2.9727978706359863, + "learning_rate": 7.4492056125237275e-06, + "loss": 0.9685, + "step": 8834 + }, + { + "epoch": 0.7139538172488333, + "grad_norm": 2.5815351009368896, + "learning_rate": 7.44863511096471e-06, + "loss": 0.859, + "step": 8835 + }, + { + "epoch": 0.7140346269621608, + "grad_norm": 2.6571624279022217, + "learning_rate": 7.448064567466468e-06, + "loss": 0.9839, + "step": 8836 + }, + { + "epoch": 0.7141154366754884, + "grad_norm": 2.9274699687957764, + "learning_rate": 7.447493982038774e-06, + "loss": 0.9024, + "step": 8837 + }, + { + "epoch": 0.714196246388816, + "grad_norm": 2.851759433746338, + "learning_rate": 7.446923354691399e-06, + "loss": 0.9148, + "step": 8838 + }, + { + "epoch": 0.7142770561021434, + "grad_norm": 2.4229416847229004, + "learning_rate": 7.446352685434117e-06, + "loss": 0.8639, + "step": 8839 + }, + { + "epoch": 0.714357865815471, + "grad_norm": 3.232449769973755, + "learning_rate": 7.4457819742767045e-06, + "loss": 0.9335, + "step": 8840 + }, + { + "epoch": 0.7144386755287986, + "grad_norm": 2.5353729724884033, + "learning_rate": 7.445211221228934e-06, + "loss": 0.9065, + "step": 8841 + }, + { + "epoch": 0.714519485242126, + "grad_norm": 2.435084581375122, + "learning_rate": 7.444640426300581e-06, + "loss": 0.9874, + "step": 8842 + }, + { + "epoch": 0.7146002949554536, + "grad_norm": 2.8686530590057373, + "learning_rate": 7.444069589501425e-06, + "loss": 0.9891, + "step": 8843 + }, + { + "epoch": 0.7146811046687812, + "grad_norm": 2.7953402996063232, + "learning_rate": 7.443498710841238e-06, + "loss": 1.0319, + "step": 8844 + }, + { + "epoch": 0.7147619143821087, + "grad_norm": 3.346709728240967, + "learning_rate": 7.442927790329804e-06, + "loss": 0.829, + "step": 8845 + }, + { + "epoch": 0.7148427240954363, + "grad_norm": 2.531511068344116, + "learning_rate": 7.442356827976895e-06, + "loss": 1.0826, + "step": 8846 + }, + { + "epoch": 0.7149235338087638, + "grad_norm": 2.749894618988037, + "learning_rate": 7.441785823792294e-06, + "loss": 0.8799, + "step": 8847 + }, + { + "epoch": 0.7150043435220913, + "grad_norm": 2.744140625, + "learning_rate": 7.441214777785781e-06, + "loss": 1.0979, + "step": 8848 + }, + { + "epoch": 0.7150851532354189, + "grad_norm": 2.4637608528137207, + "learning_rate": 7.440643689967135e-06, + "loss": 0.904, + "step": 8849 + }, + { + "epoch": 0.7151659629487465, + "grad_norm": 2.5459442138671875, + "learning_rate": 7.440072560346139e-06, + "loss": 0.8667, + "step": 8850 + }, + { + "epoch": 0.7152467726620739, + "grad_norm": 2.253364086151123, + "learning_rate": 7.439501388932574e-06, + "loss": 0.8629, + "step": 8851 + }, + { + "epoch": 0.7153275823754015, + "grad_norm": 2.4464023113250732, + "learning_rate": 7.438930175736223e-06, + "loss": 0.9178, + "step": 8852 + }, + { + "epoch": 0.7154083920887291, + "grad_norm": 2.881850004196167, + "learning_rate": 7.43835892076687e-06, + "loss": 1.0262, + "step": 8853 + }, + { + "epoch": 0.7154892018020567, + "grad_norm": 2.4822871685028076, + "learning_rate": 7.437787624034297e-06, + "loss": 0.9632, + "step": 8854 + }, + { + "epoch": 0.7155700115153841, + "grad_norm": 2.4680581092834473, + "learning_rate": 7.437216285548293e-06, + "loss": 1.0259, + "step": 8855 + }, + { + "epoch": 0.7156508212287117, + "grad_norm": 2.3201491832733154, + "learning_rate": 7.436644905318639e-06, + "loss": 0.9346, + "step": 8856 + }, + { + "epoch": 0.7157316309420393, + "grad_norm": 2.338230848312378, + "learning_rate": 7.4360734833551265e-06, + "loss": 1.0319, + "step": 8857 + }, + { + "epoch": 0.7158124406553668, + "grad_norm": 2.7920634746551514, + "learning_rate": 7.435502019667537e-06, + "loss": 0.9249, + "step": 8858 + }, + { + "epoch": 0.7158932503686943, + "grad_norm": 2.5076868534088135, + "learning_rate": 7.4349305142656635e-06, + "loss": 0.9409, + "step": 8859 + }, + { + "epoch": 0.7159740600820219, + "grad_norm": 2.9880881309509277, + "learning_rate": 7.434358967159292e-06, + "loss": 0.9233, + "step": 8860 + }, + { + "epoch": 0.7160548697953494, + "grad_norm": 2.8383290767669678, + "learning_rate": 7.433787378358211e-06, + "loss": 0.9234, + "step": 8861 + }, + { + "epoch": 0.716135679508677, + "grad_norm": 2.662801504135132, + "learning_rate": 7.433215747872211e-06, + "loss": 0.9522, + "step": 8862 + }, + { + "epoch": 0.7162164892220045, + "grad_norm": 2.7637696266174316, + "learning_rate": 7.432644075711084e-06, + "loss": 1.0106, + "step": 8863 + }, + { + "epoch": 0.716297298935332, + "grad_norm": 2.501694917678833, + "learning_rate": 7.432072361884619e-06, + "loss": 0.8882, + "step": 8864 + }, + { + "epoch": 0.7163781086486596, + "grad_norm": 3.0861642360687256, + "learning_rate": 7.4315006064026115e-06, + "loss": 0.9129, + "step": 8865 + }, + { + "epoch": 0.7164589183619872, + "grad_norm": 2.6039481163024902, + "learning_rate": 7.430928809274851e-06, + "loss": 0.8577, + "step": 8866 + }, + { + "epoch": 0.7165397280753146, + "grad_norm": 2.7840166091918945, + "learning_rate": 7.430356970511132e-06, + "loss": 0.9922, + "step": 8867 + }, + { + "epoch": 0.7166205377886422, + "grad_norm": 2.8131988048553467, + "learning_rate": 7.429785090121249e-06, + "loss": 0.8784, + "step": 8868 + }, + { + "epoch": 0.7167013475019698, + "grad_norm": 2.765836000442505, + "learning_rate": 7.429213168114997e-06, + "loss": 0.8664, + "step": 8869 + }, + { + "epoch": 0.7167821572152973, + "grad_norm": 3.1490299701690674, + "learning_rate": 7.4286412045021706e-06, + "loss": 0.9114, + "step": 8870 + }, + { + "epoch": 0.7168629669286248, + "grad_norm": 2.8300869464874268, + "learning_rate": 7.428069199292569e-06, + "loss": 1.0112, + "step": 8871 + }, + { + "epoch": 0.7169437766419524, + "grad_norm": 2.7464821338653564, + "learning_rate": 7.427497152495986e-06, + "loss": 1.0271, + "step": 8872 + }, + { + "epoch": 0.7170245863552799, + "grad_norm": 2.8566503524780273, + "learning_rate": 7.426925064122221e-06, + "loss": 0.9684, + "step": 8873 + }, + { + "epoch": 0.7171053960686075, + "grad_norm": 2.7995786666870117, + "learning_rate": 7.426352934181072e-06, + "loss": 0.9177, + "step": 8874 + }, + { + "epoch": 0.717186205781935, + "grad_norm": 2.7830474376678467, + "learning_rate": 7.42578076268234e-06, + "loss": 0.9576, + "step": 8875 + }, + { + "epoch": 0.7172670154952625, + "grad_norm": 2.440385103225708, + "learning_rate": 7.42520854963582e-06, + "loss": 0.9043, + "step": 8876 + }, + { + "epoch": 0.7173478252085901, + "grad_norm": 2.4629225730895996, + "learning_rate": 7.424636295051319e-06, + "loss": 0.9895, + "step": 8877 + }, + { + "epoch": 0.7174286349219177, + "grad_norm": 2.867295026779175, + "learning_rate": 7.424063998938634e-06, + "loss": 0.8997, + "step": 8878 + }, + { + "epoch": 0.7175094446352451, + "grad_norm": 2.884681224822998, + "learning_rate": 7.423491661307569e-06, + "loss": 0.9008, + "step": 8879 + }, + { + "epoch": 0.7175902543485727, + "grad_norm": 2.4002974033355713, + "learning_rate": 7.422919282167926e-06, + "loss": 1.0467, + "step": 8880 + }, + { + "epoch": 0.7176710640619003, + "grad_norm": 2.7994320392608643, + "learning_rate": 7.422346861529509e-06, + "loss": 0.8691, + "step": 8881 + }, + { + "epoch": 0.7177518737752278, + "grad_norm": 2.3295881748199463, + "learning_rate": 7.421774399402122e-06, + "loss": 1.0631, + "step": 8882 + }, + { + "epoch": 0.7178326834885553, + "grad_norm": 2.559081554412842, + "learning_rate": 7.421201895795569e-06, + "loss": 0.9642, + "step": 8883 + }, + { + "epoch": 0.7179134932018829, + "grad_norm": 2.916477680206299, + "learning_rate": 7.420629350719656e-06, + "loss": 0.9586, + "step": 8884 + }, + { + "epoch": 0.7179943029152104, + "grad_norm": 2.9504342079162598, + "learning_rate": 7.4200567641841905e-06, + "loss": 0.9844, + "step": 8885 + }, + { + "epoch": 0.718075112628538, + "grad_norm": 2.548313856124878, + "learning_rate": 7.419484136198978e-06, + "loss": 0.9824, + "step": 8886 + }, + { + "epoch": 0.7181559223418655, + "grad_norm": 2.7055087089538574, + "learning_rate": 7.418911466773827e-06, + "loss": 0.8615, + "step": 8887 + }, + { + "epoch": 0.718236732055193, + "grad_norm": 2.6341552734375, + "learning_rate": 7.418338755918547e-06, + "loss": 1.0053, + "step": 8888 + }, + { + "epoch": 0.7183175417685206, + "grad_norm": 2.4791226387023926, + "learning_rate": 7.417766003642945e-06, + "loss": 1.2013, + "step": 8889 + }, + { + "epoch": 0.7183983514818482, + "grad_norm": 2.622504711151123, + "learning_rate": 7.417193209956832e-06, + "loss": 0.928, + "step": 8890 + }, + { + "epoch": 0.7184791611951756, + "grad_norm": 2.598707914352417, + "learning_rate": 7.416620374870018e-06, + "loss": 1.0093, + "step": 8891 + }, + { + "epoch": 0.7185599709085032, + "grad_norm": 3.0351829528808594, + "learning_rate": 7.416047498392316e-06, + "loss": 0.8535, + "step": 8892 + }, + { + "epoch": 0.7186407806218308, + "grad_norm": 2.9704062938690186, + "learning_rate": 7.415474580533535e-06, + "loss": 0.9562, + "step": 8893 + }, + { + "epoch": 0.7187215903351583, + "grad_norm": 2.4888765811920166, + "learning_rate": 7.414901621303492e-06, + "loss": 1.039, + "step": 8894 + }, + { + "epoch": 0.7188024000484858, + "grad_norm": 2.8852639198303223, + "learning_rate": 7.414328620711994e-06, + "loss": 0.8544, + "step": 8895 + }, + { + "epoch": 0.7188832097618134, + "grad_norm": 2.756049871444702, + "learning_rate": 7.413755578768863e-06, + "loss": 0.9316, + "step": 8896 + }, + { + "epoch": 0.7189640194751409, + "grad_norm": 2.4644150733947754, + "learning_rate": 7.4131824954839075e-06, + "loss": 1.041, + "step": 8897 + }, + { + "epoch": 0.7190448291884685, + "grad_norm": 2.5486245155334473, + "learning_rate": 7.4126093708669466e-06, + "loss": 0.9131, + "step": 8898 + }, + { + "epoch": 0.719125638901796, + "grad_norm": 2.486377716064453, + "learning_rate": 7.412036204927794e-06, + "loss": 0.9583, + "step": 8899 + }, + { + "epoch": 0.7192064486151235, + "grad_norm": 2.431185245513916, + "learning_rate": 7.411462997676269e-06, + "loss": 1.0265, + "step": 8900 + }, + { + "epoch": 0.7192872583284511, + "grad_norm": 2.976555109024048, + "learning_rate": 7.4108897491221875e-06, + "loss": 0.9008, + "step": 8901 + }, + { + "epoch": 0.7193680680417787, + "grad_norm": 2.3858213424682617, + "learning_rate": 7.410316459275369e-06, + "loss": 0.8608, + "step": 8902 + }, + { + "epoch": 0.7194488777551061, + "grad_norm": 2.9575653076171875, + "learning_rate": 7.409743128145632e-06, + "loss": 0.8934, + "step": 8903 + }, + { + "epoch": 0.7195296874684337, + "grad_norm": 2.8733882904052734, + "learning_rate": 7.409169755742797e-06, + "loss": 0.9504, + "step": 8904 + }, + { + "epoch": 0.7196104971817613, + "grad_norm": 3.163473129272461, + "learning_rate": 7.408596342076684e-06, + "loss": 0.95, + "step": 8905 + }, + { + "epoch": 0.7196913068950888, + "grad_norm": 2.6298673152923584, + "learning_rate": 7.4080228871571125e-06, + "loss": 0.8962, + "step": 8906 + }, + { + "epoch": 0.7197721166084163, + "grad_norm": 2.399029016494751, + "learning_rate": 7.407449390993907e-06, + "loss": 0.8262, + "step": 8907 + }, + { + "epoch": 0.7198529263217439, + "grad_norm": 2.5172319412231445, + "learning_rate": 7.40687585359689e-06, + "loss": 1.0425, + "step": 8908 + }, + { + "epoch": 0.7199337360350714, + "grad_norm": 2.794080972671509, + "learning_rate": 7.406302274975883e-06, + "loss": 0.9308, + "step": 8909 + }, + { + "epoch": 0.720014545748399, + "grad_norm": 2.1614184379577637, + "learning_rate": 7.405728655140711e-06, + "loss": 0.9056, + "step": 8910 + }, + { + "epoch": 0.7200953554617265, + "grad_norm": 2.6804192066192627, + "learning_rate": 7.405154994101198e-06, + "loss": 0.8006, + "step": 8911 + }, + { + "epoch": 0.720176165175054, + "grad_norm": 2.838224411010742, + "learning_rate": 7.404581291867172e-06, + "loss": 0.9918, + "step": 8912 + }, + { + "epoch": 0.7202569748883816, + "grad_norm": 2.758174419403076, + "learning_rate": 7.404007548448455e-06, + "loss": 0.9579, + "step": 8913 + }, + { + "epoch": 0.7203377846017092, + "grad_norm": 2.366940975189209, + "learning_rate": 7.403433763854878e-06, + "loss": 1.0412, + "step": 8914 + }, + { + "epoch": 0.7204185943150366, + "grad_norm": 2.428494930267334, + "learning_rate": 7.402859938096265e-06, + "loss": 1.0465, + "step": 8915 + }, + { + "epoch": 0.7204994040283642, + "grad_norm": 2.558030843734741, + "learning_rate": 7.402286071182449e-06, + "loss": 0.9653, + "step": 8916 + }, + { + "epoch": 0.7205802137416918, + "grad_norm": 2.6570165157318115, + "learning_rate": 7.401712163123252e-06, + "loss": 0.9815, + "step": 8917 + }, + { + "epoch": 0.7206610234550193, + "grad_norm": 2.3573553562164307, + "learning_rate": 7.4011382139285105e-06, + "loss": 0.9072, + "step": 8918 + }, + { + "epoch": 0.7207418331683468, + "grad_norm": 2.363840103149414, + "learning_rate": 7.40056422360805e-06, + "loss": 0.9098, + "step": 8919 + }, + { + "epoch": 0.7208226428816744, + "grad_norm": 2.896048069000244, + "learning_rate": 7.399990192171704e-06, + "loss": 1.0659, + "step": 8920 + }, + { + "epoch": 0.7209034525950019, + "grad_norm": 2.909825563430786, + "learning_rate": 7.3994161196293035e-06, + "loss": 0.9126, + "step": 8921 + }, + { + "epoch": 0.7209842623083295, + "grad_norm": 2.5531294345855713, + "learning_rate": 7.398842005990683e-06, + "loss": 0.9924, + "step": 8922 + }, + { + "epoch": 0.721065072021657, + "grad_norm": 2.868154764175415, + "learning_rate": 7.398267851265671e-06, + "loss": 0.8268, + "step": 8923 + }, + { + "epoch": 0.7211458817349845, + "grad_norm": 3.251495838165283, + "learning_rate": 7.397693655464106e-06, + "loss": 1.0013, + "step": 8924 + }, + { + "epoch": 0.7212266914483121, + "grad_norm": 2.6639227867126465, + "learning_rate": 7.3971194185958206e-06, + "loss": 1.063, + "step": 8925 + }, + { + "epoch": 0.7213075011616397, + "grad_norm": 2.635226011276245, + "learning_rate": 7.396545140670651e-06, + "loss": 0.8988, + "step": 8926 + }, + { + "epoch": 0.7213883108749671, + "grad_norm": 2.7212259769439697, + "learning_rate": 7.395970821698433e-06, + "loss": 0.8856, + "step": 8927 + }, + { + "epoch": 0.7214691205882947, + "grad_norm": 2.4809279441833496, + "learning_rate": 7.395396461689001e-06, + "loss": 1.1287, + "step": 8928 + }, + { + "epoch": 0.7215499303016223, + "grad_norm": 2.903891086578369, + "learning_rate": 7.394822060652196e-06, + "loss": 0.9249, + "step": 8929 + }, + { + "epoch": 0.7216307400149498, + "grad_norm": 2.535151243209839, + "learning_rate": 7.394247618597854e-06, + "loss": 0.9742, + "step": 8930 + }, + { + "epoch": 0.7217115497282773, + "grad_norm": 2.8602733612060547, + "learning_rate": 7.393673135535812e-06, + "loss": 1.0196, + "step": 8931 + }, + { + "epoch": 0.7217923594416049, + "grad_norm": 2.4082558155059814, + "learning_rate": 7.393098611475915e-06, + "loss": 1.0145, + "step": 8932 + }, + { + "epoch": 0.7218731691549324, + "grad_norm": 2.844003438949585, + "learning_rate": 7.392524046427998e-06, + "loss": 0.9257, + "step": 8933 + }, + { + "epoch": 0.72195397886826, + "grad_norm": 2.696645736694336, + "learning_rate": 7.3919494404019045e-06, + "loss": 0.9545, + "step": 8934 + }, + { + "epoch": 0.7220347885815875, + "grad_norm": 3.0856778621673584, + "learning_rate": 7.391374793407475e-06, + "loss": 1.0015, + "step": 8935 + }, + { + "epoch": 0.722115598294915, + "grad_norm": 2.524808168411255, + "learning_rate": 7.390800105454553e-06, + "loss": 0.9554, + "step": 8936 + }, + { + "epoch": 0.7221964080082426, + "grad_norm": 2.7753970623016357, + "learning_rate": 7.390225376552981e-06, + "loss": 0.9749, + "step": 8937 + }, + { + "epoch": 0.7222772177215702, + "grad_norm": 2.7223031520843506, + "learning_rate": 7.3896506067126015e-06, + "loss": 1.0576, + "step": 8938 + }, + { + "epoch": 0.7223580274348976, + "grad_norm": 2.9543721675872803, + "learning_rate": 7.38907579594326e-06, + "loss": 0.8495, + "step": 8939 + }, + { + "epoch": 0.7224388371482252, + "grad_norm": 2.7078185081481934, + "learning_rate": 7.3885009442548024e-06, + "loss": 1.0531, + "step": 8940 + }, + { + "epoch": 0.7225196468615528, + "grad_norm": 2.3410189151763916, + "learning_rate": 7.387926051657074e-06, + "loss": 0.9603, + "step": 8941 + }, + { + "epoch": 0.7226004565748803, + "grad_norm": 2.4575016498565674, + "learning_rate": 7.38735111815992e-06, + "loss": 0.8717, + "step": 8942 + }, + { + "epoch": 0.7226812662882078, + "grad_norm": 2.453535318374634, + "learning_rate": 7.386776143773189e-06, + "loss": 0.7974, + "step": 8943 + }, + { + "epoch": 0.7227620760015354, + "grad_norm": 2.7958028316497803, + "learning_rate": 7.386201128506728e-06, + "loss": 0.8795, + "step": 8944 + }, + { + "epoch": 0.7228428857148629, + "grad_norm": 3.515575647354126, + "learning_rate": 7.385626072370387e-06, + "loss": 0.9137, + "step": 8945 + }, + { + "epoch": 0.7229236954281905, + "grad_norm": 2.450437307357788, + "learning_rate": 7.385050975374014e-06, + "loss": 0.9303, + "step": 8946 + }, + { + "epoch": 0.723004505141518, + "grad_norm": 2.5931622982025146, + "learning_rate": 7.384475837527461e-06, + "loss": 0.8807, + "step": 8947 + }, + { + "epoch": 0.7230853148548455, + "grad_norm": 2.7522566318511963, + "learning_rate": 7.383900658840576e-06, + "loss": 0.8882, + "step": 8948 + }, + { + "epoch": 0.7231661245681731, + "grad_norm": 2.3719565868377686, + "learning_rate": 7.383325439323212e-06, + "loss": 0.874, + "step": 8949 + }, + { + "epoch": 0.7232469342815007, + "grad_norm": 3.017683744430542, + "learning_rate": 7.382750178985221e-06, + "loss": 0.7991, + "step": 8950 + }, + { + "epoch": 0.7233277439948281, + "grad_norm": 2.237924575805664, + "learning_rate": 7.382174877836456e-06, + "loss": 0.9674, + "step": 8951 + }, + { + "epoch": 0.7234085537081557, + "grad_norm": 2.619729518890381, + "learning_rate": 7.381599535886768e-06, + "loss": 0.8787, + "step": 8952 + }, + { + "epoch": 0.7234893634214833, + "grad_norm": 2.3338027000427246, + "learning_rate": 7.381024153146016e-06, + "loss": 0.8498, + "step": 8953 + }, + { + "epoch": 0.7235701731348108, + "grad_norm": 2.7453088760375977, + "learning_rate": 7.380448729624051e-06, + "loss": 1.0517, + "step": 8954 + }, + { + "epoch": 0.7236509828481383, + "grad_norm": 2.635988473892212, + "learning_rate": 7.379873265330732e-06, + "loss": 0.9731, + "step": 8955 + }, + { + "epoch": 0.7237317925614659, + "grad_norm": 3.369051933288574, + "learning_rate": 7.379297760275911e-06, + "loss": 0.9114, + "step": 8956 + }, + { + "epoch": 0.7238126022747934, + "grad_norm": 2.868903636932373, + "learning_rate": 7.378722214469447e-06, + "loss": 0.9999, + "step": 8957 + }, + { + "epoch": 0.723893411988121, + "grad_norm": 2.838514804840088, + "learning_rate": 7.378146627921199e-06, + "loss": 0.9413, + "step": 8958 + }, + { + "epoch": 0.7239742217014485, + "grad_norm": 3.0332632064819336, + "learning_rate": 7.377571000641024e-06, + "loss": 0.9165, + "step": 8959 + }, + { + "epoch": 0.724055031414776, + "grad_norm": 2.7605502605438232, + "learning_rate": 7.3769953326387825e-06, + "loss": 0.9708, + "step": 8960 + }, + { + "epoch": 0.7241358411281036, + "grad_norm": 2.6083874702453613, + "learning_rate": 7.376419623924333e-06, + "loss": 0.9836, + "step": 8961 + }, + { + "epoch": 0.7242166508414312, + "grad_norm": 2.502711057662964, + "learning_rate": 7.375843874507536e-06, + "loss": 0.8841, + "step": 8962 + }, + { + "epoch": 0.7242974605547586, + "grad_norm": 3.4860928058624268, + "learning_rate": 7.375268084398253e-06, + "loss": 0.9586, + "step": 8963 + }, + { + "epoch": 0.7243782702680862, + "grad_norm": 2.41768217086792, + "learning_rate": 7.374692253606346e-06, + "loss": 0.9388, + "step": 8964 + }, + { + "epoch": 0.7244590799814138, + "grad_norm": 2.621381998062134, + "learning_rate": 7.374116382141679e-06, + "loss": 0.9396, + "step": 8965 + }, + { + "epoch": 0.7245398896947413, + "grad_norm": 1.9862269163131714, + "learning_rate": 7.373540470014111e-06, + "loss": 0.9621, + "step": 8966 + }, + { + "epoch": 0.7246206994080688, + "grad_norm": 2.74418044090271, + "learning_rate": 7.3729645172335095e-06, + "loss": 0.8411, + "step": 8967 + }, + { + "epoch": 0.7247015091213964, + "grad_norm": 2.856605291366577, + "learning_rate": 7.372388523809739e-06, + "loss": 0.804, + "step": 8968 + }, + { + "epoch": 0.7247823188347239, + "grad_norm": 2.4016685485839844, + "learning_rate": 7.371812489752665e-06, + "loss": 0.8569, + "step": 8969 + }, + { + "epoch": 0.7248631285480515, + "grad_norm": 2.506629228591919, + "learning_rate": 7.371236415072153e-06, + "loss": 1.0598, + "step": 8970 + }, + { + "epoch": 0.724943938261379, + "grad_norm": 2.507174491882324, + "learning_rate": 7.37066029977807e-06, + "loss": 1.0069, + "step": 8971 + }, + { + "epoch": 0.7250247479747065, + "grad_norm": 2.9702212810516357, + "learning_rate": 7.370084143880282e-06, + "loss": 0.9635, + "step": 8972 + }, + { + "epoch": 0.7251055576880341, + "grad_norm": 3.0821893215179443, + "learning_rate": 7.369507947388659e-06, + "loss": 0.8754, + "step": 8973 + }, + { + "epoch": 0.7251863674013617, + "grad_norm": 2.659006118774414, + "learning_rate": 7.368931710313068e-06, + "loss": 0.8889, + "step": 8974 + }, + { + "epoch": 0.7252671771146891, + "grad_norm": 2.7876930236816406, + "learning_rate": 7.368355432663382e-06, + "loss": 0.936, + "step": 8975 + }, + { + "epoch": 0.7253479868280167, + "grad_norm": 2.774790048599243, + "learning_rate": 7.367779114449467e-06, + "loss": 0.9198, + "step": 8976 + }, + { + "epoch": 0.7254287965413443, + "grad_norm": 2.7417778968811035, + "learning_rate": 7.367202755681198e-06, + "loss": 0.7967, + "step": 8977 + }, + { + "epoch": 0.7255096062546718, + "grad_norm": 2.626347303390503, + "learning_rate": 7.366626356368443e-06, + "loss": 0.9192, + "step": 8978 + }, + { + "epoch": 0.7255904159679993, + "grad_norm": 2.4893431663513184, + "learning_rate": 7.3660499165210765e-06, + "loss": 0.9079, + "step": 8979 + }, + { + "epoch": 0.7256712256813269, + "grad_norm": 2.3547041416168213, + "learning_rate": 7.365473436148971e-06, + "loss": 0.844, + "step": 8980 + }, + { + "epoch": 0.7257520353946544, + "grad_norm": 2.3203346729278564, + "learning_rate": 7.3648969152619995e-06, + "loss": 0.9682, + "step": 8981 + }, + { + "epoch": 0.725832845107982, + "grad_norm": 2.567094564437866, + "learning_rate": 7.364320353870038e-06, + "loss": 0.9329, + "step": 8982 + }, + { + "epoch": 0.7259136548213095, + "grad_norm": 2.4649698734283447, + "learning_rate": 7.36374375198296e-06, + "loss": 0.9271, + "step": 8983 + }, + { + "epoch": 0.7259944645346371, + "grad_norm": 2.674482583999634, + "learning_rate": 7.363167109610641e-06, + "loss": 0.9464, + "step": 8984 + }, + { + "epoch": 0.7260752742479646, + "grad_norm": 2.594151020050049, + "learning_rate": 7.362590426762961e-06, + "loss": 0.8646, + "step": 8985 + }, + { + "epoch": 0.7261560839612922, + "grad_norm": 2.744310140609741, + "learning_rate": 7.362013703449794e-06, + "loss": 0.9258, + "step": 8986 + }, + { + "epoch": 0.7262368936746197, + "grad_norm": 2.228095293045044, + "learning_rate": 7.3614369396810185e-06, + "loss": 0.8247, + "step": 8987 + }, + { + "epoch": 0.7263177033879472, + "grad_norm": 3.085430383682251, + "learning_rate": 7.360860135466512e-06, + "loss": 0.8957, + "step": 8988 + }, + { + "epoch": 0.7263985131012748, + "grad_norm": 2.681002140045166, + "learning_rate": 7.360283290816157e-06, + "loss": 0.8328, + "step": 8989 + }, + { + "epoch": 0.7264793228146024, + "grad_norm": 2.853421449661255, + "learning_rate": 7.3597064057398285e-06, + "loss": 0.9714, + "step": 8990 + }, + { + "epoch": 0.7265601325279298, + "grad_norm": 2.6571035385131836, + "learning_rate": 7.359129480247412e-06, + "loss": 0.9846, + "step": 8991 + }, + { + "epoch": 0.7266409422412574, + "grad_norm": 2.6871957778930664, + "learning_rate": 7.358552514348787e-06, + "loss": 0.966, + "step": 8992 + }, + { + "epoch": 0.726721751954585, + "grad_norm": 2.384361982345581, + "learning_rate": 7.357975508053834e-06, + "loss": 0.9148, + "step": 8993 + }, + { + "epoch": 0.7268025616679125, + "grad_norm": 2.662299633026123, + "learning_rate": 7.357398461372438e-06, + "loss": 1.0036, + "step": 8994 + }, + { + "epoch": 0.72688337138124, + "grad_norm": 2.670008897781372, + "learning_rate": 7.356821374314482e-06, + "loss": 0.9544, + "step": 8995 + }, + { + "epoch": 0.7269641810945676, + "grad_norm": 2.415121555328369, + "learning_rate": 7.3562442468898485e-06, + "loss": 0.9899, + "step": 8996 + }, + { + "epoch": 0.7270449908078951, + "grad_norm": 2.4513497352600098, + "learning_rate": 7.355667079108425e-06, + "loss": 0.9313, + "step": 8997 + }, + { + "epoch": 0.7271258005212227, + "grad_norm": 2.724459648132324, + "learning_rate": 7.355089870980094e-06, + "loss": 1.0078, + "step": 8998 + }, + { + "epoch": 0.7272066102345502, + "grad_norm": 2.392666816711426, + "learning_rate": 7.354512622514744e-06, + "loss": 0.9195, + "step": 8999 + }, + { + "epoch": 0.7272874199478777, + "grad_norm": 2.799349308013916, + "learning_rate": 7.353935333722262e-06, + "loss": 0.8417, + "step": 9000 + }, + { + "epoch": 0.7272874199478777, + "eval_loss": 0.7830251455307007, + "eval_runtime": 815.0654, + "eval_samples_per_second": 102.281, + "eval_steps_per_second": 12.785, + "step": 9000 + }, + { + "epoch": 0.7273682296612053, + "grad_norm": 2.228584051132202, + "learning_rate": 7.353358004612533e-06, + "loss": 0.951, + "step": 9001 + }, + { + "epoch": 0.7274490393745329, + "grad_norm": 2.6647536754608154, + "learning_rate": 7.352780635195446e-06, + "loss": 0.8601, + "step": 9002 + }, + { + "epoch": 0.7275298490878603, + "grad_norm": 2.371415138244629, + "learning_rate": 7.352203225480893e-06, + "loss": 0.916, + "step": 9003 + }, + { + "epoch": 0.7276106588011879, + "grad_norm": 2.6645593643188477, + "learning_rate": 7.351625775478761e-06, + "loss": 0.9452, + "step": 9004 + }, + { + "epoch": 0.7276914685145155, + "grad_norm": 2.236506462097168, + "learning_rate": 7.35104828519894e-06, + "loss": 1.0534, + "step": 9005 + }, + { + "epoch": 0.727772278227843, + "grad_norm": 2.4360313415527344, + "learning_rate": 7.350470754651322e-06, + "loss": 0.9478, + "step": 9006 + }, + { + "epoch": 0.7278530879411705, + "grad_norm": 2.6017050743103027, + "learning_rate": 7.3498931838458e-06, + "loss": 0.8245, + "step": 9007 + }, + { + "epoch": 0.7279338976544981, + "grad_norm": 2.8063132762908936, + "learning_rate": 7.349315572792262e-06, + "loss": 1.0436, + "step": 9008 + }, + { + "epoch": 0.7280147073678256, + "grad_norm": 2.9765968322753906, + "learning_rate": 7.348737921500606e-06, + "loss": 0.9433, + "step": 9009 + }, + { + "epoch": 0.7280955170811532, + "grad_norm": 2.7164366245269775, + "learning_rate": 7.348160229980723e-06, + "loss": 0.8589, + "step": 9010 + }, + { + "epoch": 0.7281763267944807, + "grad_norm": 2.619481325149536, + "learning_rate": 7.347582498242509e-06, + "loss": 0.9097, + "step": 9011 + }, + { + "epoch": 0.7282571365078082, + "grad_norm": 2.7392752170562744, + "learning_rate": 7.347004726295857e-06, + "loss": 0.9423, + "step": 9012 + }, + { + "epoch": 0.7283379462211358, + "grad_norm": 2.6279306411743164, + "learning_rate": 7.3464269141506665e-06, + "loss": 0.9999, + "step": 9013 + }, + { + "epoch": 0.7284187559344634, + "grad_norm": 2.804171085357666, + "learning_rate": 7.3458490618168295e-06, + "loss": 1.1039, + "step": 9014 + }, + { + "epoch": 0.7284995656477908, + "grad_norm": 3.0502655506134033, + "learning_rate": 7.345271169304246e-06, + "loss": 0.8591, + "step": 9015 + }, + { + "epoch": 0.7285803753611184, + "grad_norm": 2.96559739112854, + "learning_rate": 7.3446932366228155e-06, + "loss": 0.9103, + "step": 9016 + }, + { + "epoch": 0.728661185074446, + "grad_norm": 2.4157369136810303, + "learning_rate": 7.344115263782432e-06, + "loss": 1.0023, + "step": 9017 + }, + { + "epoch": 0.7287419947877735, + "grad_norm": 2.7634778022766113, + "learning_rate": 7.343537250792998e-06, + "loss": 0.9369, + "step": 9018 + }, + { + "epoch": 0.728822804501101, + "grad_norm": 2.900740385055542, + "learning_rate": 7.342959197664412e-06, + "loss": 1.1396, + "step": 9019 + }, + { + "epoch": 0.7289036142144286, + "grad_norm": 2.649463653564453, + "learning_rate": 7.342381104406576e-06, + "loss": 0.9305, + "step": 9020 + }, + { + "epoch": 0.7289844239277561, + "grad_norm": 3.1890182495117188, + "learning_rate": 7.34180297102939e-06, + "loss": 0.9291, + "step": 9021 + }, + { + "epoch": 0.7290652336410837, + "grad_norm": 2.8196585178375244, + "learning_rate": 7.3412247975427586e-06, + "loss": 0.8877, + "step": 9022 + }, + { + "epoch": 0.7291460433544112, + "grad_norm": 2.358859062194824, + "learning_rate": 7.340646583956582e-06, + "loss": 0.9803, + "step": 9023 + }, + { + "epoch": 0.7292268530677387, + "grad_norm": 2.893080472946167, + "learning_rate": 7.340068330280764e-06, + "loss": 0.9453, + "step": 9024 + }, + { + "epoch": 0.7293076627810663, + "grad_norm": 2.7734615802764893, + "learning_rate": 7.339490036525208e-06, + "loss": 0.9046, + "step": 9025 + }, + { + "epoch": 0.7293884724943939, + "grad_norm": 2.8442749977111816, + "learning_rate": 7.338911702699822e-06, + "loss": 1.0598, + "step": 9026 + }, + { + "epoch": 0.7294692822077213, + "grad_norm": 2.563047170639038, + "learning_rate": 7.338333328814507e-06, + "loss": 0.9657, + "step": 9027 + }, + { + "epoch": 0.7295500919210489, + "grad_norm": 2.7017898559570312, + "learning_rate": 7.337754914879174e-06, + "loss": 1.0422, + "step": 9028 + }, + { + "epoch": 0.7296309016343765, + "grad_norm": 2.9704360961914062, + "learning_rate": 7.3371764609037236e-06, + "loss": 0.784, + "step": 9029 + }, + { + "epoch": 0.729711711347704, + "grad_norm": 2.824162006378174, + "learning_rate": 7.33659796689807e-06, + "loss": 0.9321, + "step": 9030 + }, + { + "epoch": 0.7297925210610315, + "grad_norm": 2.1505091190338135, + "learning_rate": 7.336019432872117e-06, + "loss": 0.9972, + "step": 9031 + }, + { + "epoch": 0.7298733307743591, + "grad_norm": 2.6777167320251465, + "learning_rate": 7.335440858835775e-06, + "loss": 0.9419, + "step": 9032 + }, + { + "epoch": 0.7299541404876866, + "grad_norm": 2.3954265117645264, + "learning_rate": 7.334862244798953e-06, + "loss": 1.0433, + "step": 9033 + }, + { + "epoch": 0.7300349502010142, + "grad_norm": 2.8312015533447266, + "learning_rate": 7.3342835907715625e-06, + "loss": 0.9176, + "step": 9034 + }, + { + "epoch": 0.7301157599143417, + "grad_norm": 3.6947519779205322, + "learning_rate": 7.3337048967635135e-06, + "loss": 0.8934, + "step": 9035 + }, + { + "epoch": 0.7301965696276692, + "grad_norm": 2.8133158683776855, + "learning_rate": 7.333126162784718e-06, + "loss": 0.9151, + "step": 9036 + }, + { + "epoch": 0.7302773793409968, + "grad_norm": 2.875257730484009, + "learning_rate": 7.332547388845087e-06, + "loss": 0.8651, + "step": 9037 + }, + { + "epoch": 0.7303581890543244, + "grad_norm": 2.589542865753174, + "learning_rate": 7.331968574954537e-06, + "loss": 0.9141, + "step": 9038 + }, + { + "epoch": 0.7304389987676518, + "grad_norm": 2.429004192352295, + "learning_rate": 7.331389721122977e-06, + "loss": 0.9255, + "step": 9039 + }, + { + "epoch": 0.7305198084809794, + "grad_norm": 2.7485947608947754, + "learning_rate": 7.330810827360324e-06, + "loss": 0.9342, + "step": 9040 + }, + { + "epoch": 0.730600618194307, + "grad_norm": 2.973147392272949, + "learning_rate": 7.330231893676494e-06, + "loss": 0.9602, + "step": 9041 + }, + { + "epoch": 0.7306814279076345, + "grad_norm": 3.019298553466797, + "learning_rate": 7.3296529200814005e-06, + "loss": 1.0176, + "step": 9042 + }, + { + "epoch": 0.730762237620962, + "grad_norm": 2.4859468936920166, + "learning_rate": 7.32907390658496e-06, + "loss": 0.9742, + "step": 9043 + }, + { + "epoch": 0.7308430473342896, + "grad_norm": 2.8655483722686768, + "learning_rate": 7.328494853197092e-06, + "loss": 0.8597, + "step": 9044 + }, + { + "epoch": 0.7309238570476171, + "grad_norm": 2.3558311462402344, + "learning_rate": 7.327915759927713e-06, + "loss": 0.9458, + "step": 9045 + }, + { + "epoch": 0.7310046667609447, + "grad_norm": 2.892517328262329, + "learning_rate": 7.327336626786739e-06, + "loss": 0.8994, + "step": 9046 + }, + { + "epoch": 0.7310854764742722, + "grad_norm": 2.5985705852508545, + "learning_rate": 7.326757453784094e-06, + "loss": 1.0047, + "step": 9047 + }, + { + "epoch": 0.7311662861875997, + "grad_norm": 2.7899577617645264, + "learning_rate": 7.326178240929693e-06, + "loss": 0.9206, + "step": 9048 + }, + { + "epoch": 0.7312470959009273, + "grad_norm": 2.733816146850586, + "learning_rate": 7.32559898823346e-06, + "loss": 0.8699, + "step": 9049 + }, + { + "epoch": 0.7313279056142549, + "grad_norm": 2.8018996715545654, + "learning_rate": 7.325019695705317e-06, + "loss": 0.9548, + "step": 9050 + }, + { + "epoch": 0.7314087153275823, + "grad_norm": 2.394652843475342, + "learning_rate": 7.32444036335518e-06, + "loss": 0.9311, + "step": 9051 + }, + { + "epoch": 0.7314895250409099, + "grad_norm": 2.5981502532958984, + "learning_rate": 7.323860991192978e-06, + "loss": 0.8349, + "step": 9052 + }, + { + "epoch": 0.7315703347542375, + "grad_norm": 2.5200812816619873, + "learning_rate": 7.32328157922863e-06, + "loss": 0.9657, + "step": 9053 + }, + { + "epoch": 0.731651144467565, + "grad_norm": 2.7489216327667236, + "learning_rate": 7.322702127472063e-06, + "loss": 0.953, + "step": 9054 + }, + { + "epoch": 0.7317319541808925, + "grad_norm": 2.780418872833252, + "learning_rate": 7.3221226359332e-06, + "loss": 0.8601, + "step": 9055 + }, + { + "epoch": 0.7318127638942201, + "grad_norm": 3.3629696369171143, + "learning_rate": 7.321543104621967e-06, + "loss": 0.9465, + "step": 9056 + }, + { + "epoch": 0.7318935736075476, + "grad_norm": 2.8562209606170654, + "learning_rate": 7.3209635335482874e-06, + "loss": 0.9981, + "step": 9057 + }, + { + "epoch": 0.7319743833208752, + "grad_norm": 2.8071959018707275, + "learning_rate": 7.3203839227220915e-06, + "loss": 0.8345, + "step": 9058 + }, + { + "epoch": 0.7320551930342027, + "grad_norm": 2.947610855102539, + "learning_rate": 7.319804272153306e-06, + "loss": 0.8278, + "step": 9059 + }, + { + "epoch": 0.7321360027475302, + "grad_norm": 2.4837849140167236, + "learning_rate": 7.319224581851857e-06, + "loss": 0.9373, + "step": 9060 + }, + { + "epoch": 0.7322168124608578, + "grad_norm": 2.406834602355957, + "learning_rate": 7.318644851827674e-06, + "loss": 0.8845, + "step": 9061 + }, + { + "epoch": 0.7322976221741854, + "grad_norm": 2.5397162437438965, + "learning_rate": 7.318065082090686e-06, + "loss": 0.9404, + "step": 9062 + }, + { + "epoch": 0.7323784318875128, + "grad_norm": 2.8372175693511963, + "learning_rate": 7.317485272650825e-06, + "loss": 0.9295, + "step": 9063 + }, + { + "epoch": 0.7324592416008404, + "grad_norm": 2.714838981628418, + "learning_rate": 7.31690542351802e-06, + "loss": 0.9338, + "step": 9064 + }, + { + "epoch": 0.732540051314168, + "grad_norm": 2.313499927520752, + "learning_rate": 7.316325534702202e-06, + "loss": 1.0098, + "step": 9065 + }, + { + "epoch": 0.7326208610274955, + "grad_norm": 2.831343173980713, + "learning_rate": 7.315745606213305e-06, + "loss": 0.9202, + "step": 9066 + }, + { + "epoch": 0.732701670740823, + "grad_norm": 2.8902063369750977, + "learning_rate": 7.315165638061262e-06, + "loss": 0.9094, + "step": 9067 + }, + { + "epoch": 0.7327824804541506, + "grad_norm": 2.2195374965667725, + "learning_rate": 7.314585630256001e-06, + "loss": 0.9287, + "step": 9068 + }, + { + "epoch": 0.7328632901674781, + "grad_norm": 2.9071133136749268, + "learning_rate": 7.314005582807464e-06, + "loss": 0.9714, + "step": 9069 + }, + { + "epoch": 0.7329440998808057, + "grad_norm": 2.5509865283966064, + "learning_rate": 7.31342549572558e-06, + "loss": 0.9228, + "step": 9070 + }, + { + "epoch": 0.7330249095941332, + "grad_norm": 2.6809580326080322, + "learning_rate": 7.3128453690202875e-06, + "loss": 1.0213, + "step": 9071 + }, + { + "epoch": 0.7331057193074607, + "grad_norm": 2.3250210285186768, + "learning_rate": 7.312265202701523e-06, + "loss": 1.0098, + "step": 9072 + }, + { + "epoch": 0.7331865290207883, + "grad_norm": 2.799264669418335, + "learning_rate": 7.31168499677922e-06, + "loss": 1.0227, + "step": 9073 + }, + { + "epoch": 0.7332673387341159, + "grad_norm": 2.917973756790161, + "learning_rate": 7.311104751263319e-06, + "loss": 0.9799, + "step": 9074 + }, + { + "epoch": 0.7333481484474433, + "grad_norm": 2.7183284759521484, + "learning_rate": 7.310524466163758e-06, + "loss": 0.97, + "step": 9075 + }, + { + "epoch": 0.7334289581607709, + "grad_norm": 2.3030471801757812, + "learning_rate": 7.309944141490474e-06, + "loss": 0.9627, + "step": 9076 + }, + { + "epoch": 0.7335097678740985, + "grad_norm": 2.5966598987579346, + "learning_rate": 7.309363777253409e-06, + "loss": 1.0361, + "step": 9077 + }, + { + "epoch": 0.733590577587426, + "grad_norm": 2.7329165935516357, + "learning_rate": 7.3087833734625e-06, + "loss": 0.8653, + "step": 9078 + }, + { + "epoch": 0.7336713873007535, + "grad_norm": 2.536996364593506, + "learning_rate": 7.308202930127693e-06, + "loss": 0.9458, + "step": 9079 + }, + { + "epoch": 0.7337521970140811, + "grad_norm": 2.4987056255340576, + "learning_rate": 7.307622447258925e-06, + "loss": 1.0911, + "step": 9080 + }, + { + "epoch": 0.7338330067274086, + "grad_norm": 2.4029359817504883, + "learning_rate": 7.307041924866139e-06, + "loss": 1.1116, + "step": 9081 + }, + { + "epoch": 0.7339138164407362, + "grad_norm": 2.360539197921753, + "learning_rate": 7.3064613629592806e-06, + "loss": 0.9258, + "step": 9082 + }, + { + "epoch": 0.7339946261540637, + "grad_norm": 2.5612308979034424, + "learning_rate": 7.305880761548291e-06, + "loss": 0.9422, + "step": 9083 + }, + { + "epoch": 0.7340754358673912, + "grad_norm": 3.0456793308258057, + "learning_rate": 7.305300120643114e-06, + "loss": 0.8824, + "step": 9084 + }, + { + "epoch": 0.7341562455807188, + "grad_norm": 2.5850014686584473, + "learning_rate": 7.304719440253697e-06, + "loss": 0.9751, + "step": 9085 + }, + { + "epoch": 0.7342370552940464, + "grad_norm": 2.634831428527832, + "learning_rate": 7.304138720389984e-06, + "loss": 0.9048, + "step": 9086 + }, + { + "epoch": 0.7343178650073738, + "grad_norm": 2.5613815784454346, + "learning_rate": 7.3035579610619225e-06, + "loss": 0.83, + "step": 9087 + }, + { + "epoch": 0.7343986747207014, + "grad_norm": 3.141700506210327, + "learning_rate": 7.302977162279457e-06, + "loss": 0.9762, + "step": 9088 + }, + { + "epoch": 0.734479484434029, + "grad_norm": 2.335719585418701, + "learning_rate": 7.3023963240525385e-06, + "loss": 0.9933, + "step": 9089 + }, + { + "epoch": 0.7345602941473565, + "grad_norm": 2.5486817359924316, + "learning_rate": 7.301815446391113e-06, + "loss": 0.9592, + "step": 9090 + }, + { + "epoch": 0.734641103860684, + "grad_norm": 2.6970295906066895, + "learning_rate": 7.30123452930513e-06, + "loss": 0.8509, + "step": 9091 + }, + { + "epoch": 0.7347219135740116, + "grad_norm": 2.5547571182250977, + "learning_rate": 7.300653572804539e-06, + "loss": 0.8352, + "step": 9092 + }, + { + "epoch": 0.7348027232873391, + "grad_norm": 2.8848555088043213, + "learning_rate": 7.300072576899292e-06, + "loss": 0.9291, + "step": 9093 + }, + { + "epoch": 0.7348835330006667, + "grad_norm": 2.2670273780822754, + "learning_rate": 7.299491541599338e-06, + "loss": 1.0142, + "step": 9094 + }, + { + "epoch": 0.7349643427139942, + "grad_norm": 2.5862274169921875, + "learning_rate": 7.298910466914632e-06, + "loss": 1.1024, + "step": 9095 + }, + { + "epoch": 0.7350451524273217, + "grad_norm": 2.634575605392456, + "learning_rate": 7.298329352855121e-06, + "loss": 0.9297, + "step": 9096 + }, + { + "epoch": 0.7351259621406493, + "grad_norm": 2.481804847717285, + "learning_rate": 7.297748199430764e-06, + "loss": 0.9552, + "step": 9097 + }, + { + "epoch": 0.7352067718539769, + "grad_norm": 2.592578411102295, + "learning_rate": 7.297167006651511e-06, + "loss": 0.9029, + "step": 9098 + }, + { + "epoch": 0.7352875815673043, + "grad_norm": 2.6386446952819824, + "learning_rate": 7.296585774527316e-06, + "loss": 1.0682, + "step": 9099 + }, + { + "epoch": 0.7353683912806319, + "grad_norm": 2.4857497215270996, + "learning_rate": 7.296004503068137e-06, + "loss": 0.9442, + "step": 9100 + }, + { + "epoch": 0.7354492009939595, + "grad_norm": 3.2034192085266113, + "learning_rate": 7.295423192283928e-06, + "loss": 0.9089, + "step": 9101 + }, + { + "epoch": 0.735530010707287, + "grad_norm": 2.7047171592712402, + "learning_rate": 7.294841842184645e-06, + "loss": 1.0233, + "step": 9102 + }, + { + "epoch": 0.7356108204206145, + "grad_norm": 2.326129913330078, + "learning_rate": 7.294260452780248e-06, + "loss": 1.0153, + "step": 9103 + }, + { + "epoch": 0.7356916301339421, + "grad_norm": 2.405121326446533, + "learning_rate": 7.293679024080689e-06, + "loss": 1.045, + "step": 9104 + }, + { + "epoch": 0.7357724398472696, + "grad_norm": 2.981825828552246, + "learning_rate": 7.293097556095933e-06, + "loss": 0.9242, + "step": 9105 + }, + { + "epoch": 0.7358532495605972, + "grad_norm": 2.567793846130371, + "learning_rate": 7.292516048835936e-06, + "loss": 1.0347, + "step": 9106 + }, + { + "epoch": 0.7359340592739247, + "grad_norm": 2.643747091293335, + "learning_rate": 7.2919345023106566e-06, + "loss": 1.0373, + "step": 9107 + }, + { + "epoch": 0.7360148689872522, + "grad_norm": 2.5018110275268555, + "learning_rate": 7.291352916530058e-06, + "loss": 1.0742, + "step": 9108 + }, + { + "epoch": 0.7360956787005798, + "grad_norm": 2.6694023609161377, + "learning_rate": 7.2907712915041005e-06, + "loss": 1.0641, + "step": 9109 + }, + { + "epoch": 0.7361764884139074, + "grad_norm": 2.499962329864502, + "learning_rate": 7.290189627242743e-06, + "loss": 1.0189, + "step": 9110 + }, + { + "epoch": 0.7362572981272348, + "grad_norm": 2.484152317047119, + "learning_rate": 7.2896079237559546e-06, + "loss": 0.9979, + "step": 9111 + }, + { + "epoch": 0.7363381078405624, + "grad_norm": 2.7532966136932373, + "learning_rate": 7.289026181053691e-06, + "loss": 0.8251, + "step": 9112 + }, + { + "epoch": 0.73641891755389, + "grad_norm": 2.6327338218688965, + "learning_rate": 7.288444399145922e-06, + "loss": 0.9772, + "step": 9113 + }, + { + "epoch": 0.7364997272672176, + "grad_norm": 2.598501443862915, + "learning_rate": 7.287862578042608e-06, + "loss": 1.0511, + "step": 9114 + }, + { + "epoch": 0.736580536980545, + "grad_norm": 2.483323335647583, + "learning_rate": 7.287280717753716e-06, + "loss": 0.8037, + "step": 9115 + }, + { + "epoch": 0.7366613466938726, + "grad_norm": 2.7200582027435303, + "learning_rate": 7.2866988182892116e-06, + "loss": 1.0953, + "step": 9116 + }, + { + "epoch": 0.7367421564072002, + "grad_norm": 2.8144047260284424, + "learning_rate": 7.286116879659063e-06, + "loss": 0.9657, + "step": 9117 + }, + { + "epoch": 0.7368229661205277, + "grad_norm": 2.6776721477508545, + "learning_rate": 7.2855349018732345e-06, + "loss": 0.9313, + "step": 9118 + }, + { + "epoch": 0.7369037758338552, + "grad_norm": 2.267498016357422, + "learning_rate": 7.284952884941696e-06, + "loss": 0.9559, + "step": 9119 + }, + { + "epoch": 0.7369845855471828, + "grad_norm": 2.507622718811035, + "learning_rate": 7.2843708288744155e-06, + "loss": 0.9789, + "step": 9120 + }, + { + "epoch": 0.7370653952605103, + "grad_norm": 3.1745150089263916, + "learning_rate": 7.28378873368136e-06, + "loss": 0.8794, + "step": 9121 + }, + { + "epoch": 0.7371462049738379, + "grad_norm": 2.592698335647583, + "learning_rate": 7.283206599372505e-06, + "loss": 0.9868, + "step": 9122 + }, + { + "epoch": 0.7372270146871654, + "grad_norm": 2.4423863887786865, + "learning_rate": 7.282624425957816e-06, + "loss": 0.9078, + "step": 9123 + }, + { + "epoch": 0.7373078244004929, + "grad_norm": 2.5273303985595703, + "learning_rate": 7.2820422134472635e-06, + "loss": 0.9141, + "step": 9124 + }, + { + "epoch": 0.7373886341138205, + "grad_norm": 2.8483357429504395, + "learning_rate": 7.2814599618508255e-06, + "loss": 0.8186, + "step": 9125 + }, + { + "epoch": 0.7374694438271481, + "grad_norm": 2.963029384613037, + "learning_rate": 7.280877671178468e-06, + "loss": 0.8411, + "step": 9126 + }, + { + "epoch": 0.7375502535404755, + "grad_norm": 2.508075714111328, + "learning_rate": 7.280295341440168e-06, + "loss": 0.7681, + "step": 9127 + }, + { + "epoch": 0.7376310632538031, + "grad_norm": 2.446751832962036, + "learning_rate": 7.279712972645898e-06, + "loss": 0.9448, + "step": 9128 + }, + { + "epoch": 0.7377118729671307, + "grad_norm": 2.707951545715332, + "learning_rate": 7.279130564805633e-06, + "loss": 0.905, + "step": 9129 + }, + { + "epoch": 0.7377926826804582, + "grad_norm": 2.327008008956909, + "learning_rate": 7.278548117929348e-06, + "loss": 0.9702, + "step": 9130 + }, + { + "epoch": 0.7378734923937857, + "grad_norm": 2.9833052158355713, + "learning_rate": 7.27796563202702e-06, + "loss": 1.0264, + "step": 9131 + }, + { + "epoch": 0.7379543021071133, + "grad_norm": 2.0838663578033447, + "learning_rate": 7.277383107108623e-06, + "loss": 1.0132, + "step": 9132 + }, + { + "epoch": 0.7380351118204408, + "grad_norm": 2.692185640335083, + "learning_rate": 7.2768005431841385e-06, + "loss": 0.9251, + "step": 9133 + }, + { + "epoch": 0.7381159215337684, + "grad_norm": 2.429028272628784, + "learning_rate": 7.27621794026354e-06, + "loss": 0.9071, + "step": 9134 + }, + { + "epoch": 0.738196731247096, + "grad_norm": 2.4352681636810303, + "learning_rate": 7.2756352983568094e-06, + "loss": 0.9305, + "step": 9135 + }, + { + "epoch": 0.7382775409604234, + "grad_norm": 2.6816604137420654, + "learning_rate": 7.275052617473923e-06, + "loss": 0.9283, + "step": 9136 + }, + { + "epoch": 0.738358350673751, + "grad_norm": 2.5244176387786865, + "learning_rate": 7.274469897624863e-06, + "loss": 0.9673, + "step": 9137 + }, + { + "epoch": 0.7384391603870786, + "grad_norm": 2.489497423171997, + "learning_rate": 7.273887138819608e-06, + "loss": 1.0181, + "step": 9138 + }, + { + "epoch": 0.738519970100406, + "grad_norm": 2.1473710536956787, + "learning_rate": 7.273304341068143e-06, + "loss": 1.0654, + "step": 9139 + }, + { + "epoch": 0.7386007798137336, + "grad_norm": 3.047027349472046, + "learning_rate": 7.272721504380446e-06, + "loss": 0.9208, + "step": 9140 + }, + { + "epoch": 0.7386815895270612, + "grad_norm": 2.316822052001953, + "learning_rate": 7.272138628766501e-06, + "loss": 0.9827, + "step": 9141 + }, + { + "epoch": 0.7387623992403887, + "grad_norm": 3.2937958240509033, + "learning_rate": 7.27155571423629e-06, + "loss": 0.8491, + "step": 9142 + }, + { + "epoch": 0.7388432089537162, + "grad_norm": 2.6288256645202637, + "learning_rate": 7.2709727607998e-06, + "loss": 0.9432, + "step": 9143 + }, + { + "epoch": 0.7389240186670438, + "grad_norm": 2.5328307151794434, + "learning_rate": 7.2703897684670125e-06, + "loss": 0.9151, + "step": 9144 + }, + { + "epoch": 0.7390048283803713, + "grad_norm": 2.8022024631500244, + "learning_rate": 7.269806737247914e-06, + "loss": 0.936, + "step": 9145 + }, + { + "epoch": 0.7390856380936989, + "grad_norm": 2.4462738037109375, + "learning_rate": 7.2692236671524915e-06, + "loss": 1.0055, + "step": 9146 + }, + { + "epoch": 0.7391664478070264, + "grad_norm": 2.5309243202209473, + "learning_rate": 7.268640558190731e-06, + "loss": 0.9735, + "step": 9147 + }, + { + "epoch": 0.7392472575203539, + "grad_norm": 2.905632734298706, + "learning_rate": 7.268057410372618e-06, + "loss": 1.0658, + "step": 9148 + }, + { + "epoch": 0.7393280672336815, + "grad_norm": 4.138669013977051, + "learning_rate": 7.267474223708142e-06, + "loss": 1.0168, + "step": 9149 + }, + { + "epoch": 0.7394088769470091, + "grad_norm": 2.6521735191345215, + "learning_rate": 7.266890998207291e-06, + "loss": 0.9827, + "step": 9150 + }, + { + "epoch": 0.7394896866603365, + "grad_norm": 2.6250736713409424, + "learning_rate": 7.266307733880054e-06, + "loss": 0.9865, + "step": 9151 + }, + { + "epoch": 0.7395704963736641, + "grad_norm": 2.8561973571777344, + "learning_rate": 7.265724430736423e-06, + "loss": 1.0175, + "step": 9152 + }, + { + "epoch": 0.7396513060869917, + "grad_norm": 2.865936040878296, + "learning_rate": 7.265141088786385e-06, + "loss": 0.9221, + "step": 9153 + }, + { + "epoch": 0.7397321158003192, + "grad_norm": 2.615757703781128, + "learning_rate": 7.264557708039935e-06, + "loss": 0.8526, + "step": 9154 + }, + { + "epoch": 0.7398129255136467, + "grad_norm": 2.5320639610290527, + "learning_rate": 7.263974288507062e-06, + "loss": 1.0548, + "step": 9155 + }, + { + "epoch": 0.7398937352269743, + "grad_norm": 2.7111244201660156, + "learning_rate": 7.263390830197761e-06, + "loss": 0.8035, + "step": 9156 + }, + { + "epoch": 0.7399745449403018, + "grad_norm": 2.625068426132202, + "learning_rate": 7.262807333122024e-06, + "loss": 0.8893, + "step": 9157 + }, + { + "epoch": 0.7400553546536294, + "grad_norm": 3.314542531967163, + "learning_rate": 7.262223797289843e-06, + "loss": 0.8737, + "step": 9158 + }, + { + "epoch": 0.740136164366957, + "grad_norm": 2.487816572189331, + "learning_rate": 7.261640222711216e-06, + "loss": 0.9601, + "step": 9159 + }, + { + "epoch": 0.7402169740802844, + "grad_norm": 2.62138032913208, + "learning_rate": 7.2610566093961356e-06, + "loss": 1.0944, + "step": 9160 + }, + { + "epoch": 0.740297783793612, + "grad_norm": 2.684164047241211, + "learning_rate": 7.2604729573546e-06, + "loss": 0.9524, + "step": 9161 + }, + { + "epoch": 0.7403785935069396, + "grad_norm": 2.382800579071045, + "learning_rate": 7.259889266596605e-06, + "loss": 1.1115, + "step": 9162 + }, + { + "epoch": 0.740459403220267, + "grad_norm": 3.1251096725463867, + "learning_rate": 7.259305537132144e-06, + "loss": 0.809, + "step": 9163 + }, + { + "epoch": 0.7405402129335946, + "grad_norm": 2.339355707168579, + "learning_rate": 7.258721768971222e-06, + "loss": 0.9239, + "step": 9164 + }, + { + "epoch": 0.7406210226469222, + "grad_norm": 2.5114083290100098, + "learning_rate": 7.258137962123832e-06, + "loss": 0.9411, + "step": 9165 + }, + { + "epoch": 0.7407018323602497, + "grad_norm": 2.3658273220062256, + "learning_rate": 7.257554116599975e-06, + "loss": 1.0043, + "step": 9166 + }, + { + "epoch": 0.7407826420735772, + "grad_norm": 2.6916441917419434, + "learning_rate": 7.256970232409651e-06, + "loss": 1.0074, + "step": 9167 + }, + { + "epoch": 0.7408634517869048, + "grad_norm": 3.2465579509735107, + "learning_rate": 7.256386309562862e-06, + "loss": 0.9164, + "step": 9168 + }, + { + "epoch": 0.7409442615002323, + "grad_norm": 2.397860288619995, + "learning_rate": 7.255802348069604e-06, + "loss": 0.9917, + "step": 9169 + }, + { + "epoch": 0.7410250712135599, + "grad_norm": 2.5243847370147705, + "learning_rate": 7.255218347939885e-06, + "loss": 0.9158, + "step": 9170 + }, + { + "epoch": 0.7411058809268875, + "grad_norm": 2.4917051792144775, + "learning_rate": 7.2546343091837035e-06, + "loss": 0.9196, + "step": 9171 + }, + { + "epoch": 0.7411866906402149, + "grad_norm": 3.039865493774414, + "learning_rate": 7.254050231811065e-06, + "loss": 0.8602, + "step": 9172 + }, + { + "epoch": 0.7412675003535425, + "grad_norm": 3.154069423675537, + "learning_rate": 7.253466115831973e-06, + "loss": 1.0926, + "step": 9173 + }, + { + "epoch": 0.7413483100668701, + "grad_norm": 2.7796924114227295, + "learning_rate": 7.2528819612564305e-06, + "loss": 0.9239, + "step": 9174 + }, + { + "epoch": 0.7414291197801975, + "grad_norm": 2.7398829460144043, + "learning_rate": 7.252297768094443e-06, + "loss": 1.0191, + "step": 9175 + }, + { + "epoch": 0.7415099294935251, + "grad_norm": 2.4738523960113525, + "learning_rate": 7.2517135363560185e-06, + "loss": 0.8959, + "step": 9176 + }, + { + "epoch": 0.7415907392068527, + "grad_norm": 2.7567601203918457, + "learning_rate": 7.25112926605116e-06, + "loss": 0.8983, + "step": 9177 + }, + { + "epoch": 0.7416715489201802, + "grad_norm": 2.247624158859253, + "learning_rate": 7.2505449571898775e-06, + "loss": 0.824, + "step": 9178 + }, + { + "epoch": 0.7417523586335077, + "grad_norm": 2.2220160961151123, + "learning_rate": 7.249960609782179e-06, + "loss": 1.0418, + "step": 9179 + }, + { + "epoch": 0.7418331683468353, + "grad_norm": 2.7129971981048584, + "learning_rate": 7.249376223838071e-06, + "loss": 0.9933, + "step": 9180 + }, + { + "epoch": 0.7419139780601628, + "grad_norm": 2.716585874557495, + "learning_rate": 7.248791799367563e-06, + "loss": 0.827, + "step": 9181 + }, + { + "epoch": 0.7419947877734904, + "grad_norm": 2.348145008087158, + "learning_rate": 7.248207336380666e-06, + "loss": 0.9534, + "step": 9182 + }, + { + "epoch": 0.742075597486818, + "grad_norm": 2.6000277996063232, + "learning_rate": 7.247622834887388e-06, + "loss": 0.8306, + "step": 9183 + }, + { + "epoch": 0.7421564072001454, + "grad_norm": 2.988957166671753, + "learning_rate": 7.2470382948977436e-06, + "loss": 0.9401, + "step": 9184 + }, + { + "epoch": 0.742237216913473, + "grad_norm": 2.2751994132995605, + "learning_rate": 7.2464537164217405e-06, + "loss": 0.8464, + "step": 9185 + }, + { + "epoch": 0.7423180266268006, + "grad_norm": 2.605870246887207, + "learning_rate": 7.245869099469396e-06, + "loss": 0.9212, + "step": 9186 + }, + { + "epoch": 0.742398836340128, + "grad_norm": 2.75929594039917, + "learning_rate": 7.24528444405072e-06, + "loss": 0.967, + "step": 9187 + }, + { + "epoch": 0.7424796460534556, + "grad_norm": 2.563917875289917, + "learning_rate": 7.244699750175726e-06, + "loss": 0.7994, + "step": 9188 + }, + { + "epoch": 0.7425604557667832, + "grad_norm": 2.3208367824554443, + "learning_rate": 7.244115017854429e-06, + "loss": 0.9049, + "step": 9189 + }, + { + "epoch": 0.7426412654801107, + "grad_norm": 2.7053263187408447, + "learning_rate": 7.243530247096845e-06, + "loss": 0.9216, + "step": 9190 + }, + { + "epoch": 0.7427220751934382, + "grad_norm": 2.4055285453796387, + "learning_rate": 7.242945437912987e-06, + "loss": 0.9529, + "step": 9191 + }, + { + "epoch": 0.7428028849067658, + "grad_norm": 2.134317398071289, + "learning_rate": 7.242360590312876e-06, + "loss": 0.9731, + "step": 9192 + }, + { + "epoch": 0.7428836946200933, + "grad_norm": 2.8379886150360107, + "learning_rate": 7.241775704306525e-06, + "loss": 0.9023, + "step": 9193 + }, + { + "epoch": 0.7429645043334209, + "grad_norm": 2.562758684158325, + "learning_rate": 7.241190779903953e-06, + "loss": 0.9453, + "step": 9194 + }, + { + "epoch": 0.7430453140467485, + "grad_norm": 2.912896156311035, + "learning_rate": 7.240605817115179e-06, + "loss": 0.9362, + "step": 9195 + }, + { + "epoch": 0.7431261237600759, + "grad_norm": 3.105250358581543, + "learning_rate": 7.240020815950222e-06, + "loss": 0.8728, + "step": 9196 + }, + { + "epoch": 0.7432069334734035, + "grad_norm": 2.6104366779327393, + "learning_rate": 7.239435776419098e-06, + "loss": 0.9926, + "step": 9197 + }, + { + "epoch": 0.7432877431867311, + "grad_norm": 2.365262269973755, + "learning_rate": 7.238850698531834e-06, + "loss": 1.0324, + "step": 9198 + }, + { + "epoch": 0.7433685529000585, + "grad_norm": 2.8031811714172363, + "learning_rate": 7.238265582298445e-06, + "loss": 0.8747, + "step": 9199 + }, + { + "epoch": 0.7434493626133861, + "grad_norm": 2.52441668510437, + "learning_rate": 7.237680427728956e-06, + "loss": 0.9131, + "step": 9200 + }, + { + "epoch": 0.7435301723267137, + "grad_norm": 3.2640960216522217, + "learning_rate": 7.237095234833388e-06, + "loss": 0.9751, + "step": 9201 + }, + { + "epoch": 0.7436109820400412, + "grad_norm": 2.737494945526123, + "learning_rate": 7.236510003621764e-06, + "loss": 1.0418, + "step": 9202 + }, + { + "epoch": 0.7436917917533687, + "grad_norm": 2.468668222427368, + "learning_rate": 7.235924734104109e-06, + "loss": 0.8864, + "step": 9203 + }, + { + "epoch": 0.7437726014666963, + "grad_norm": 2.862825870513916, + "learning_rate": 7.2353394262904456e-06, + "loss": 0.9239, + "step": 9204 + }, + { + "epoch": 0.7438534111800238, + "grad_norm": 2.7571980953216553, + "learning_rate": 7.234754080190797e-06, + "loss": 0.9473, + "step": 9205 + }, + { + "epoch": 0.7439342208933514, + "grad_norm": 3.0559935569763184, + "learning_rate": 7.234168695815194e-06, + "loss": 0.9531, + "step": 9206 + }, + { + "epoch": 0.744015030606679, + "grad_norm": 2.5840489864349365, + "learning_rate": 7.233583273173658e-06, + "loss": 0.9189, + "step": 9207 + }, + { + "epoch": 0.7440958403200064, + "grad_norm": 3.0948474407196045, + "learning_rate": 7.232997812276218e-06, + "loss": 1.0294, + "step": 9208 + }, + { + "epoch": 0.744176650033334, + "grad_norm": 2.4203271865844727, + "learning_rate": 7.232412313132902e-06, + "loss": 0.9397, + "step": 9209 + }, + { + "epoch": 0.7442574597466616, + "grad_norm": 3.032336950302124, + "learning_rate": 7.231826775753735e-06, + "loss": 0.9868, + "step": 9210 + }, + { + "epoch": 0.744338269459989, + "grad_norm": 3.3545117378234863, + "learning_rate": 7.231241200148751e-06, + "loss": 0.87, + "step": 9211 + }, + { + "epoch": 0.7444190791733166, + "grad_norm": 2.4008750915527344, + "learning_rate": 7.230655586327975e-06, + "loss": 0.8474, + "step": 9212 + }, + { + "epoch": 0.7444998888866442, + "grad_norm": 3.301022529602051, + "learning_rate": 7.230069934301439e-06, + "loss": 0.858, + "step": 9213 + }, + { + "epoch": 0.7445806985999717, + "grad_norm": 2.817265033721924, + "learning_rate": 7.2294842440791756e-06, + "loss": 0.9113, + "step": 9214 + }, + { + "epoch": 0.7446615083132992, + "grad_norm": 2.6447441577911377, + "learning_rate": 7.228898515671214e-06, + "loss": 1.0156, + "step": 9215 + }, + { + "epoch": 0.7447423180266268, + "grad_norm": 2.373446226119995, + "learning_rate": 7.228312749087585e-06, + "loss": 0.8783, + "step": 9216 + }, + { + "epoch": 0.7448231277399543, + "grad_norm": 2.9240570068359375, + "learning_rate": 7.2277269443383225e-06, + "loss": 1.0205, + "step": 9217 + }, + { + "epoch": 0.7449039374532819, + "grad_norm": 2.993472099304199, + "learning_rate": 7.227141101433463e-06, + "loss": 0.8996, + "step": 9218 + }, + { + "epoch": 0.7449847471666095, + "grad_norm": 2.8471803665161133, + "learning_rate": 7.226555220383036e-06, + "loss": 0.9852, + "step": 9219 + }, + { + "epoch": 0.7450655568799369, + "grad_norm": 2.5014472007751465, + "learning_rate": 7.225969301197079e-06, + "loss": 1.0542, + "step": 9220 + }, + { + "epoch": 0.7451463665932645, + "grad_norm": 2.8656868934631348, + "learning_rate": 7.225383343885628e-06, + "loss": 0.9652, + "step": 9221 + }, + { + "epoch": 0.7452271763065921, + "grad_norm": 3.0316591262817383, + "learning_rate": 7.224797348458714e-06, + "loss": 0.924, + "step": 9222 + }, + { + "epoch": 0.7453079860199195, + "grad_norm": 2.8609044551849365, + "learning_rate": 7.224211314926382e-06, + "loss": 0.9121, + "step": 9223 + }, + { + "epoch": 0.7453887957332471, + "grad_norm": 2.761373519897461, + "learning_rate": 7.223625243298662e-06, + "loss": 0.933, + "step": 9224 + }, + { + "epoch": 0.7454696054465747, + "grad_norm": 2.629418134689331, + "learning_rate": 7.223039133585595e-06, + "loss": 0.9997, + "step": 9225 + }, + { + "epoch": 0.7455504151599022, + "grad_norm": 2.426096200942993, + "learning_rate": 7.2224529857972205e-06, + "loss": 0.8457, + "step": 9226 + }, + { + "epoch": 0.7456312248732297, + "grad_norm": 3.3540234565734863, + "learning_rate": 7.221866799943575e-06, + "loss": 0.797, + "step": 9227 + }, + { + "epoch": 0.7457120345865573, + "grad_norm": 2.6288182735443115, + "learning_rate": 7.221280576034702e-06, + "loss": 0.9058, + "step": 9228 + }, + { + "epoch": 0.7457928442998848, + "grad_norm": 2.7626688480377197, + "learning_rate": 7.22069431408064e-06, + "loss": 0.9798, + "step": 9229 + }, + { + "epoch": 0.7458736540132124, + "grad_norm": 3.0615322589874268, + "learning_rate": 7.220108014091428e-06, + "loss": 0.9302, + "step": 9230 + }, + { + "epoch": 0.74595446372654, + "grad_norm": 2.7715580463409424, + "learning_rate": 7.2195216760771125e-06, + "loss": 1.0387, + "step": 9231 + }, + { + "epoch": 0.7460352734398674, + "grad_norm": 2.9438533782958984, + "learning_rate": 7.218935300047734e-06, + "loss": 0.9236, + "step": 9232 + }, + { + "epoch": 0.746116083153195, + "grad_norm": 2.639690637588501, + "learning_rate": 7.218348886013335e-06, + "loss": 1.0522, + "step": 9233 + }, + { + "epoch": 0.7461968928665226, + "grad_norm": 2.5193910598754883, + "learning_rate": 7.217762433983961e-06, + "loss": 0.8731, + "step": 9234 + }, + { + "epoch": 0.74627770257985, + "grad_norm": 3.033635139465332, + "learning_rate": 7.217175943969655e-06, + "loss": 0.931, + "step": 9235 + }, + { + "epoch": 0.7463585122931776, + "grad_norm": 2.6699891090393066, + "learning_rate": 7.216589415980462e-06, + "loss": 0.9871, + "step": 9236 + }, + { + "epoch": 0.7464393220065052, + "grad_norm": 2.3057162761688232, + "learning_rate": 7.21600285002643e-06, + "loss": 1.0662, + "step": 9237 + }, + { + "epoch": 0.7465201317198327, + "grad_norm": 2.666623115539551, + "learning_rate": 7.215416246117602e-06, + "loss": 0.9298, + "step": 9238 + }, + { + "epoch": 0.7466009414331602, + "grad_norm": 2.4717490673065186, + "learning_rate": 7.21482960426403e-06, + "loss": 0.8523, + "step": 9239 + }, + { + "epoch": 0.7466817511464878, + "grad_norm": 3.2087392807006836, + "learning_rate": 7.214242924475756e-06, + "loss": 0.9559, + "step": 9240 + }, + { + "epoch": 0.7467625608598154, + "grad_norm": 2.8614275455474854, + "learning_rate": 7.2136562067628334e-06, + "loss": 0.9902, + "step": 9241 + }, + { + "epoch": 0.7468433705731429, + "grad_norm": 2.664296865463257, + "learning_rate": 7.2130694511353074e-06, + "loss": 0.8702, + "step": 9242 + }, + { + "epoch": 0.7469241802864705, + "grad_norm": 2.6501624584198, + "learning_rate": 7.2124826576032315e-06, + "loss": 0.9428, + "step": 9243 + }, + { + "epoch": 0.747004989999798, + "grad_norm": 2.4006431102752686, + "learning_rate": 7.2118958261766515e-06, + "loss": 0.8907, + "step": 9244 + }, + { + "epoch": 0.7470857997131255, + "grad_norm": 2.529768705368042, + "learning_rate": 7.211308956865623e-06, + "loss": 1.0185, + "step": 9245 + }, + { + "epoch": 0.7471666094264531, + "grad_norm": 2.6251213550567627, + "learning_rate": 7.210722049680195e-06, + "loss": 0.8266, + "step": 9246 + }, + { + "epoch": 0.7472474191397807, + "grad_norm": 2.5997235774993896, + "learning_rate": 7.2101351046304204e-06, + "loss": 0.984, + "step": 9247 + }, + { + "epoch": 0.7473282288531081, + "grad_norm": 2.433603525161743, + "learning_rate": 7.209548121726351e-06, + "loss": 0.8437, + "step": 9248 + }, + { + "epoch": 0.7474090385664357, + "grad_norm": 2.428435802459717, + "learning_rate": 7.208961100978043e-06, + "loss": 0.8103, + "step": 9249 + }, + { + "epoch": 0.7474898482797633, + "grad_norm": 2.557340383529663, + "learning_rate": 7.208374042395547e-06, + "loss": 0.9709, + "step": 9250 + }, + { + "epoch": 0.7475706579930907, + "grad_norm": 2.5587244033813477, + "learning_rate": 7.207786945988924e-06, + "loss": 0.9366, + "step": 9251 + }, + { + "epoch": 0.7476514677064183, + "grad_norm": 2.7646195888519287, + "learning_rate": 7.207199811768222e-06, + "loss": 1.086, + "step": 9252 + }, + { + "epoch": 0.7477322774197459, + "grad_norm": 2.679720640182495, + "learning_rate": 7.206612639743502e-06, + "loss": 1.0583, + "step": 9253 + }, + { + "epoch": 0.7478130871330734, + "grad_norm": 3.1740527153015137, + "learning_rate": 7.20602542992482e-06, + "loss": 1.0459, + "step": 9254 + }, + { + "epoch": 0.747893896846401, + "grad_norm": 2.3224925994873047, + "learning_rate": 7.205438182322233e-06, + "loss": 0.9661, + "step": 9255 + }, + { + "epoch": 0.7479747065597285, + "grad_norm": 2.633460760116577, + "learning_rate": 7.2048508969457995e-06, + "loss": 0.902, + "step": 9256 + }, + { + "epoch": 0.748055516273056, + "grad_norm": 2.6579766273498535, + "learning_rate": 7.204263573805579e-06, + "loss": 0.8872, + "step": 9257 + }, + { + "epoch": 0.7481363259863836, + "grad_norm": 2.569265604019165, + "learning_rate": 7.2036762129116275e-06, + "loss": 0.8392, + "step": 9258 + }, + { + "epoch": 0.7482171356997112, + "grad_norm": 2.77009916305542, + "learning_rate": 7.203088814274011e-06, + "loss": 0.9, + "step": 9259 + }, + { + "epoch": 0.7482979454130386, + "grad_norm": 2.763956069946289, + "learning_rate": 7.202501377902784e-06, + "loss": 0.8667, + "step": 9260 + }, + { + "epoch": 0.7483787551263662, + "grad_norm": 2.542914390563965, + "learning_rate": 7.201913903808011e-06, + "loss": 0.9948, + "step": 9261 + }, + { + "epoch": 0.7484595648396938, + "grad_norm": 2.641510248184204, + "learning_rate": 7.201326391999754e-06, + "loss": 0.9146, + "step": 9262 + }, + { + "epoch": 0.7485403745530212, + "grad_norm": 2.4405808448791504, + "learning_rate": 7.200738842488078e-06, + "loss": 0.9546, + "step": 9263 + }, + { + "epoch": 0.7486211842663488, + "grad_norm": 2.7633845806121826, + "learning_rate": 7.20015125528304e-06, + "loss": 0.8837, + "step": 9264 + }, + { + "epoch": 0.7487019939796764, + "grad_norm": 2.6070621013641357, + "learning_rate": 7.199563630394709e-06, + "loss": 0.9535, + "step": 9265 + }, + { + "epoch": 0.7487828036930039, + "grad_norm": 3.175579786300659, + "learning_rate": 7.198975967833148e-06, + "loss": 0.9229, + "step": 9266 + }, + { + "epoch": 0.7488636134063315, + "grad_norm": 2.457697868347168, + "learning_rate": 7.198388267608424e-06, + "loss": 0.9232, + "step": 9267 + }, + { + "epoch": 0.748944423119659, + "grad_norm": 3.052302360534668, + "learning_rate": 7.1978005297305994e-06, + "loss": 0.8864, + "step": 9268 + }, + { + "epoch": 0.7490252328329865, + "grad_norm": 3.0551042556762695, + "learning_rate": 7.197212754209744e-06, + "loss": 0.8824, + "step": 9269 + }, + { + "epoch": 0.7491060425463141, + "grad_norm": 3.2402610778808594, + "learning_rate": 7.196624941055923e-06, + "loss": 1.0016, + "step": 9270 + }, + { + "epoch": 0.7491868522596417, + "grad_norm": 2.7520618438720703, + "learning_rate": 7.196037090279206e-06, + "loss": 0.8577, + "step": 9271 + }, + { + "epoch": 0.7492676619729691, + "grad_norm": 2.7571287155151367, + "learning_rate": 7.195449201889658e-06, + "loss": 0.966, + "step": 9272 + }, + { + "epoch": 0.7493484716862967, + "grad_norm": 2.440335273742676, + "learning_rate": 7.194861275897352e-06, + "loss": 0.9635, + "step": 9273 + }, + { + "epoch": 0.7494292813996243, + "grad_norm": 2.439288854598999, + "learning_rate": 7.194273312312357e-06, + "loss": 0.9975, + "step": 9274 + }, + { + "epoch": 0.7495100911129517, + "grad_norm": 3.4287447929382324, + "learning_rate": 7.193685311144741e-06, + "loss": 0.9167, + "step": 9275 + }, + { + "epoch": 0.7495909008262793, + "grad_norm": 2.7884204387664795, + "learning_rate": 7.193097272404578e-06, + "loss": 0.9657, + "step": 9276 + }, + { + "epoch": 0.7496717105396069, + "grad_norm": 2.3156442642211914, + "learning_rate": 7.192509196101938e-06, + "loss": 0.9158, + "step": 9277 + }, + { + "epoch": 0.7497525202529344, + "grad_norm": 2.6749985218048096, + "learning_rate": 7.191921082246893e-06, + "loss": 0.9151, + "step": 9278 + }, + { + "epoch": 0.749833329966262, + "grad_norm": 2.1590397357940674, + "learning_rate": 7.191332930849517e-06, + "loss": 1.0993, + "step": 9279 + }, + { + "epoch": 0.7499141396795895, + "grad_norm": 2.533832550048828, + "learning_rate": 7.190744741919884e-06, + "loss": 0.8439, + "step": 9280 + }, + { + "epoch": 0.749994949392917, + "grad_norm": 2.5818288326263428, + "learning_rate": 7.190156515468069e-06, + "loss": 1.0234, + "step": 9281 + }, + { + "epoch": 0.7500757591062446, + "grad_norm": 2.3331825733184814, + "learning_rate": 7.189568251504143e-06, + "loss": 0.889, + "step": 9282 + }, + { + "epoch": 0.7501565688195722, + "grad_norm": 2.4949986934661865, + "learning_rate": 7.1889799500381855e-06, + "loss": 0.9003, + "step": 9283 + }, + { + "epoch": 0.7502373785328996, + "grad_norm": 2.7428500652313232, + "learning_rate": 7.18839161108027e-06, + "loss": 0.8927, + "step": 9284 + }, + { + "epoch": 0.7503181882462272, + "grad_norm": 2.768780469894409, + "learning_rate": 7.187803234640474e-06, + "loss": 0.95, + "step": 9285 + }, + { + "epoch": 0.7503989979595548, + "grad_norm": 2.724456787109375, + "learning_rate": 7.187214820728877e-06, + "loss": 0.9073, + "step": 9286 + }, + { + "epoch": 0.7504798076728822, + "grad_norm": 2.688736915588379, + "learning_rate": 7.186626369355555e-06, + "loss": 0.8648, + "step": 9287 + }, + { + "epoch": 0.7505606173862098, + "grad_norm": 2.1352972984313965, + "learning_rate": 7.186037880530589e-06, + "loss": 0.9212, + "step": 9288 + }, + { + "epoch": 0.7506414270995374, + "grad_norm": 3.0642130374908447, + "learning_rate": 7.185449354264055e-06, + "loss": 0.914, + "step": 9289 + }, + { + "epoch": 0.7507222368128649, + "grad_norm": 2.4261667728424072, + "learning_rate": 7.184860790566035e-06, + "loss": 1.0105, + "step": 9290 + }, + { + "epoch": 0.7508030465261925, + "grad_norm": 2.8731417655944824, + "learning_rate": 7.18427218944661e-06, + "loss": 1.0317, + "step": 9291 + }, + { + "epoch": 0.75088385623952, + "grad_norm": 2.8006937503814697, + "learning_rate": 7.18368355091586e-06, + "loss": 0.8769, + "step": 9292 + }, + { + "epoch": 0.7509646659528475, + "grad_norm": 2.8680408000946045, + "learning_rate": 7.183094874983868e-06, + "loss": 0.9244, + "step": 9293 + }, + { + "epoch": 0.7510454756661751, + "grad_norm": 2.4608442783355713, + "learning_rate": 7.182506161660716e-06, + "loss": 0.9152, + "step": 9294 + }, + { + "epoch": 0.7511262853795027, + "grad_norm": 2.6141417026519775, + "learning_rate": 7.181917410956489e-06, + "loss": 0.9852, + "step": 9295 + }, + { + "epoch": 0.7512070950928301, + "grad_norm": 2.662980556488037, + "learning_rate": 7.181328622881269e-06, + "loss": 0.9862, + "step": 9296 + }, + { + "epoch": 0.7512879048061577, + "grad_norm": 3.090834379196167, + "learning_rate": 7.1807397974451395e-06, + "loss": 0.9018, + "step": 9297 + }, + { + "epoch": 0.7513687145194853, + "grad_norm": 2.5306742191314697, + "learning_rate": 7.18015093465819e-06, + "loss": 0.9616, + "step": 9298 + }, + { + "epoch": 0.7514495242328127, + "grad_norm": 3.2240328788757324, + "learning_rate": 7.179562034530502e-06, + "loss": 0.849, + "step": 9299 + }, + { + "epoch": 0.7515303339461403, + "grad_norm": 3.143828868865967, + "learning_rate": 7.1789730970721625e-06, + "loss": 0.955, + "step": 9300 + }, + { + "epoch": 0.7516111436594679, + "grad_norm": 2.6308634281158447, + "learning_rate": 7.17838412229326e-06, + "loss": 0.9277, + "step": 9301 + }, + { + "epoch": 0.7516919533727954, + "grad_norm": 2.7284719944000244, + "learning_rate": 7.177795110203884e-06, + "loss": 0.9378, + "step": 9302 + }, + { + "epoch": 0.751772763086123, + "grad_norm": 2.4026944637298584, + "learning_rate": 7.177206060814117e-06, + "loss": 0.952, + "step": 9303 + }, + { + "epoch": 0.7518535727994505, + "grad_norm": 2.9648163318634033, + "learning_rate": 7.176616974134054e-06, + "loss": 0.8554, + "step": 9304 + }, + { + "epoch": 0.751934382512778, + "grad_norm": 2.4647109508514404, + "learning_rate": 7.176027850173781e-06, + "loss": 0.817, + "step": 9305 + }, + { + "epoch": 0.7520151922261056, + "grad_norm": 2.2996625900268555, + "learning_rate": 7.17543868894339e-06, + "loss": 0.8776, + "step": 9306 + }, + { + "epoch": 0.7520960019394332, + "grad_norm": 2.7225048542022705, + "learning_rate": 7.174849490452972e-06, + "loss": 0.9062, + "step": 9307 + }, + { + "epoch": 0.7521768116527606, + "grad_norm": 3.1516716480255127, + "learning_rate": 7.174260254712617e-06, + "loss": 0.9935, + "step": 9308 + }, + { + "epoch": 0.7522576213660882, + "grad_norm": 2.550668478012085, + "learning_rate": 7.173670981732419e-06, + "loss": 1.1502, + "step": 9309 + }, + { + "epoch": 0.7523384310794158, + "grad_norm": 2.4681434631347656, + "learning_rate": 7.17308167152247e-06, + "loss": 1.0161, + "step": 9310 + }, + { + "epoch": 0.7524192407927432, + "grad_norm": 2.583268642425537, + "learning_rate": 7.172492324092862e-06, + "loss": 0.9408, + "step": 9311 + }, + { + "epoch": 0.7525000505060708, + "grad_norm": 3.556342363357544, + "learning_rate": 7.171902939453692e-06, + "loss": 0.9428, + "step": 9312 + }, + { + "epoch": 0.7525808602193984, + "grad_norm": 2.631986379623413, + "learning_rate": 7.171313517615053e-06, + "loss": 1.0738, + "step": 9313 + }, + { + "epoch": 0.7526616699327259, + "grad_norm": 2.767961263656616, + "learning_rate": 7.170724058587041e-06, + "loss": 0.8905, + "step": 9314 + }, + { + "epoch": 0.7527424796460535, + "grad_norm": 2.1314356327056885, + "learning_rate": 7.17013456237975e-06, + "loss": 0.9848, + "step": 9315 + }, + { + "epoch": 0.752823289359381, + "grad_norm": 2.775268793106079, + "learning_rate": 7.169545029003281e-06, + "loss": 1.0272, + "step": 9316 + }, + { + "epoch": 0.7529040990727085, + "grad_norm": 2.6422245502471924, + "learning_rate": 7.168955458467726e-06, + "loss": 0.8876, + "step": 9317 + }, + { + "epoch": 0.7529849087860361, + "grad_norm": 2.1179001331329346, + "learning_rate": 7.168365850783188e-06, + "loss": 0.935, + "step": 9318 + }, + { + "epoch": 0.7530657184993637, + "grad_norm": 2.697267532348633, + "learning_rate": 7.167776205959761e-06, + "loss": 0.8455, + "step": 9319 + }, + { + "epoch": 0.7531465282126911, + "grad_norm": 2.6576733589172363, + "learning_rate": 7.1671865240075475e-06, + "loss": 0.9284, + "step": 9320 + }, + { + "epoch": 0.7532273379260187, + "grad_norm": 3.2691657543182373, + "learning_rate": 7.166596804936646e-06, + "loss": 0.9172, + "step": 9321 + }, + { + "epoch": 0.7533081476393463, + "grad_norm": 2.57712984085083, + "learning_rate": 7.166007048757155e-06, + "loss": 0.9596, + "step": 9322 + }, + { + "epoch": 0.7533889573526737, + "grad_norm": 2.877537727355957, + "learning_rate": 7.16541725547918e-06, + "loss": 0.9322, + "step": 9323 + }, + { + "epoch": 0.7534697670660013, + "grad_norm": 2.29923939704895, + "learning_rate": 7.164827425112822e-06, + "loss": 0.9925, + "step": 9324 + }, + { + "epoch": 0.7535505767793289, + "grad_norm": 2.2775449752807617, + "learning_rate": 7.164237557668177e-06, + "loss": 0.9225, + "step": 9325 + }, + { + "epoch": 0.7536313864926564, + "grad_norm": 2.4381635189056396, + "learning_rate": 7.163647653155356e-06, + "loss": 0.9624, + "step": 9326 + }, + { + "epoch": 0.753712196205984, + "grad_norm": 3.1883251667022705, + "learning_rate": 7.16305771158446e-06, + "loss": 0.85, + "step": 9327 + }, + { + "epoch": 0.7537930059193115, + "grad_norm": 2.524061918258667, + "learning_rate": 7.162467732965592e-06, + "loss": 0.955, + "step": 9328 + }, + { + "epoch": 0.753873815632639, + "grad_norm": 2.958904981613159, + "learning_rate": 7.161877717308857e-06, + "loss": 1.0463, + "step": 9329 + }, + { + "epoch": 0.7539546253459666, + "grad_norm": 3.006033420562744, + "learning_rate": 7.161287664624364e-06, + "loss": 1.0036, + "step": 9330 + }, + { + "epoch": 0.7540354350592942, + "grad_norm": 2.6345412731170654, + "learning_rate": 7.160697574922212e-06, + "loss": 0.9288, + "step": 9331 + }, + { + "epoch": 0.7541162447726216, + "grad_norm": 2.555610179901123, + "learning_rate": 7.160107448212514e-06, + "loss": 0.9351, + "step": 9332 + }, + { + "epoch": 0.7541970544859492, + "grad_norm": 2.354729413986206, + "learning_rate": 7.159517284505375e-06, + "loss": 0.9694, + "step": 9333 + }, + { + "epoch": 0.7542778641992768, + "grad_norm": 2.697465658187866, + "learning_rate": 7.158927083810906e-06, + "loss": 0.9136, + "step": 9334 + }, + { + "epoch": 0.7543586739126042, + "grad_norm": 2.4910783767700195, + "learning_rate": 7.158336846139212e-06, + "loss": 0.9436, + "step": 9335 + }, + { + "epoch": 0.7544394836259318, + "grad_norm": 2.5268726348876953, + "learning_rate": 7.157746571500404e-06, + "loss": 0.9455, + "step": 9336 + }, + { + "epoch": 0.7545202933392594, + "grad_norm": 2.3225934505462646, + "learning_rate": 7.157156259904592e-06, + "loss": 0.9213, + "step": 9337 + }, + { + "epoch": 0.7546011030525869, + "grad_norm": 2.3548941612243652, + "learning_rate": 7.156565911361887e-06, + "loss": 0.8115, + "step": 9338 + }, + { + "epoch": 0.7546819127659145, + "grad_norm": 2.987687587738037, + "learning_rate": 7.155975525882397e-06, + "loss": 0.8953, + "step": 9339 + }, + { + "epoch": 0.754762722479242, + "grad_norm": 2.4488444328308105, + "learning_rate": 7.15538510347624e-06, + "loss": 0.8765, + "step": 9340 + }, + { + "epoch": 0.7548435321925695, + "grad_norm": 3.088615655899048, + "learning_rate": 7.154794644153523e-06, + "loss": 0.9218, + "step": 9341 + }, + { + "epoch": 0.7549243419058971, + "grad_norm": 2.824294090270996, + "learning_rate": 7.154204147924362e-06, + "loss": 0.9115, + "step": 9342 + }, + { + "epoch": 0.7550051516192247, + "grad_norm": 2.563656806945801, + "learning_rate": 7.153613614798869e-06, + "loss": 0.9283, + "step": 9343 + }, + { + "epoch": 0.7550859613325521, + "grad_norm": 2.5003597736358643, + "learning_rate": 7.15302304478716e-06, + "loss": 0.9461, + "step": 9344 + }, + { + "epoch": 0.7551667710458797, + "grad_norm": 4.22475004196167, + "learning_rate": 7.152432437899349e-06, + "loss": 0.9191, + "step": 9345 + }, + { + "epoch": 0.7552475807592073, + "grad_norm": 2.362718105316162, + "learning_rate": 7.151841794145553e-06, + "loss": 0.983, + "step": 9346 + }, + { + "epoch": 0.7553283904725347, + "grad_norm": 2.62459135055542, + "learning_rate": 7.151251113535886e-06, + "loss": 0.9091, + "step": 9347 + }, + { + "epoch": 0.7554092001858623, + "grad_norm": 3.6033778190612793, + "learning_rate": 7.150660396080469e-06, + "loss": 0.9059, + "step": 9348 + }, + { + "epoch": 0.7554900098991899, + "grad_norm": 2.774806261062622, + "learning_rate": 7.150069641789414e-06, + "loss": 0.8764, + "step": 9349 + }, + { + "epoch": 0.7555708196125174, + "grad_norm": 2.9223997592926025, + "learning_rate": 7.149478850672844e-06, + "loss": 0.8836, + "step": 9350 + }, + { + "epoch": 0.755651629325845, + "grad_norm": 2.658027410507202, + "learning_rate": 7.148888022740875e-06, + "loss": 0.9149, + "step": 9351 + }, + { + "epoch": 0.7557324390391725, + "grad_norm": 2.5316708087921143, + "learning_rate": 7.148297158003628e-06, + "loss": 0.9406, + "step": 9352 + }, + { + "epoch": 0.7558132487525, + "grad_norm": 2.776956558227539, + "learning_rate": 7.147706256471222e-06, + "loss": 1.0484, + "step": 9353 + }, + { + "epoch": 0.7558940584658276, + "grad_norm": 2.474447250366211, + "learning_rate": 7.147115318153778e-06, + "loss": 0.8387, + "step": 9354 + }, + { + "epoch": 0.7559748681791552, + "grad_norm": 2.3409523963928223, + "learning_rate": 7.146524343061418e-06, + "loss": 0.9392, + "step": 9355 + }, + { + "epoch": 0.7560556778924826, + "grad_norm": 2.7103476524353027, + "learning_rate": 7.145933331204264e-06, + "loss": 0.9128, + "step": 9356 + }, + { + "epoch": 0.7561364876058102, + "grad_norm": 3.336158037185669, + "learning_rate": 7.145342282592438e-06, + "loss": 0.921, + "step": 9357 + }, + { + "epoch": 0.7562172973191378, + "grad_norm": 2.7014338970184326, + "learning_rate": 7.144751197236063e-06, + "loss": 1.0213, + "step": 9358 + }, + { + "epoch": 0.7562981070324652, + "grad_norm": 2.6197993755340576, + "learning_rate": 7.144160075145263e-06, + "loss": 0.9492, + "step": 9359 + }, + { + "epoch": 0.7563789167457928, + "grad_norm": 2.4035115242004395, + "learning_rate": 7.143568916330163e-06, + "loss": 0.9155, + "step": 9360 + }, + { + "epoch": 0.7564597264591204, + "grad_norm": 2.1481218338012695, + "learning_rate": 7.142977720800888e-06, + "loss": 0.9322, + "step": 9361 + }, + { + "epoch": 0.7565405361724479, + "grad_norm": 2.654609441757202, + "learning_rate": 7.1423864885675634e-06, + "loss": 1.0117, + "step": 9362 + }, + { + "epoch": 0.7566213458857755, + "grad_norm": 2.4736640453338623, + "learning_rate": 7.141795219640318e-06, + "loss": 0.9867, + "step": 9363 + }, + { + "epoch": 0.756702155599103, + "grad_norm": 2.554844379425049, + "learning_rate": 7.141203914029273e-06, + "loss": 0.759, + "step": 9364 + }, + { + "epoch": 0.7567829653124305, + "grad_norm": 2.6223647594451904, + "learning_rate": 7.140612571744562e-06, + "loss": 0.8359, + "step": 9365 + }, + { + "epoch": 0.7568637750257581, + "grad_norm": 2.4716081619262695, + "learning_rate": 7.14002119279631e-06, + "loss": 0.9236, + "step": 9366 + }, + { + "epoch": 0.7569445847390857, + "grad_norm": 3.238743543624878, + "learning_rate": 7.139429777194648e-06, + "loss": 1.0816, + "step": 9367 + }, + { + "epoch": 0.7570253944524131, + "grad_norm": 3.119372606277466, + "learning_rate": 7.1388383249497025e-06, + "loss": 0.9217, + "step": 9368 + }, + { + "epoch": 0.7571062041657407, + "grad_norm": 2.5762522220611572, + "learning_rate": 7.138246836071609e-06, + "loss": 0.9669, + "step": 9369 + }, + { + "epoch": 0.7571870138790683, + "grad_norm": 2.8306868076324463, + "learning_rate": 7.13765531057049e-06, + "loss": 0.8696, + "step": 9370 + }, + { + "epoch": 0.7572678235923959, + "grad_norm": 2.553178310394287, + "learning_rate": 7.1370637484564856e-06, + "loss": 0.9538, + "step": 9371 + }, + { + "epoch": 0.7573486333057233, + "grad_norm": 2.827378749847412, + "learning_rate": 7.136472149739723e-06, + "loss": 0.9093, + "step": 9372 + }, + { + "epoch": 0.7574294430190509, + "grad_norm": 2.2313454151153564, + "learning_rate": 7.135880514430334e-06, + "loss": 1.0702, + "step": 9373 + }, + { + "epoch": 0.7575102527323785, + "grad_norm": 2.437563419342041, + "learning_rate": 7.1352888425384555e-06, + "loss": 0.8496, + "step": 9374 + }, + { + "epoch": 0.757591062445706, + "grad_norm": 2.5437726974487305, + "learning_rate": 7.13469713407422e-06, + "loss": 0.9566, + "step": 9375 + }, + { + "epoch": 0.7576718721590335, + "grad_norm": 2.799818277359009, + "learning_rate": 7.134105389047761e-06, + "loss": 0.9318, + "step": 9376 + }, + { + "epoch": 0.7577526818723611, + "grad_norm": 2.6814332008361816, + "learning_rate": 7.133513607469214e-06, + "loss": 0.8791, + "step": 9377 + }, + { + "epoch": 0.7578334915856886, + "grad_norm": 2.529116153717041, + "learning_rate": 7.132921789348714e-06, + "loss": 1.0308, + "step": 9378 + }, + { + "epoch": 0.7579143012990162, + "grad_norm": 2.8629038333892822, + "learning_rate": 7.1323299346964015e-06, + "loss": 0.9784, + "step": 9379 + }, + { + "epoch": 0.7579951110123437, + "grad_norm": 2.7337749004364014, + "learning_rate": 7.131738043522409e-06, + "loss": 0.9443, + "step": 9380 + }, + { + "epoch": 0.7580759207256712, + "grad_norm": 2.824218511581421, + "learning_rate": 7.131146115836875e-06, + "loss": 0.8851, + "step": 9381 + }, + { + "epoch": 0.7581567304389988, + "grad_norm": 2.648939371109009, + "learning_rate": 7.13055415164994e-06, + "loss": 0.9399, + "step": 9382 + }, + { + "epoch": 0.7582375401523264, + "grad_norm": 2.5566866397857666, + "learning_rate": 7.129962150971741e-06, + "loss": 0.94, + "step": 9383 + }, + { + "epoch": 0.7583183498656538, + "grad_norm": 2.4530868530273438, + "learning_rate": 7.1293701138124175e-06, + "loss": 0.8466, + "step": 9384 + }, + { + "epoch": 0.7583991595789814, + "grad_norm": 2.5966145992279053, + "learning_rate": 7.1287780401821115e-06, + "loss": 0.945, + "step": 9385 + }, + { + "epoch": 0.758479969292309, + "grad_norm": 2.6472713947296143, + "learning_rate": 7.1281859300909605e-06, + "loss": 0.8775, + "step": 9386 + }, + { + "epoch": 0.7585607790056365, + "grad_norm": 2.3527584075927734, + "learning_rate": 7.12759378354911e-06, + "loss": 1.0495, + "step": 9387 + }, + { + "epoch": 0.758641588718964, + "grad_norm": 2.7990071773529053, + "learning_rate": 7.1270016005666985e-06, + "loss": 0.8712, + "step": 9388 + }, + { + "epoch": 0.7587223984322916, + "grad_norm": 2.9060542583465576, + "learning_rate": 7.1264093811538704e-06, + "loss": 1.0565, + "step": 9389 + }, + { + "epoch": 0.7588032081456191, + "grad_norm": 3.042029619216919, + "learning_rate": 7.125817125320769e-06, + "loss": 0.9588, + "step": 9390 + }, + { + "epoch": 0.7588840178589467, + "grad_norm": 2.4240853786468506, + "learning_rate": 7.125224833077537e-06, + "loss": 1.0243, + "step": 9391 + }, + { + "epoch": 0.7589648275722742, + "grad_norm": 2.659764528274536, + "learning_rate": 7.124632504434321e-06, + "loss": 0.8337, + "step": 9392 + }, + { + "epoch": 0.7590456372856017, + "grad_norm": 2.4955406188964844, + "learning_rate": 7.124040139401265e-06, + "loss": 0.9165, + "step": 9393 + }, + { + "epoch": 0.7591264469989293, + "grad_norm": 2.823073625564575, + "learning_rate": 7.123447737988515e-06, + "loss": 0.9566, + "step": 9394 + }, + { + "epoch": 0.7592072567122569, + "grad_norm": 2.467170476913452, + "learning_rate": 7.122855300206216e-06, + "loss": 0.9647, + "step": 9395 + }, + { + "epoch": 0.7592880664255843, + "grad_norm": 2.9635438919067383, + "learning_rate": 7.122262826064518e-06, + "loss": 0.9491, + "step": 9396 + }, + { + "epoch": 0.7593688761389119, + "grad_norm": 2.6342523097991943, + "learning_rate": 7.121670315573567e-06, + "loss": 0.8406, + "step": 9397 + }, + { + "epoch": 0.7594496858522395, + "grad_norm": 2.398297071456909, + "learning_rate": 7.121077768743509e-06, + "loss": 0.9417, + "step": 9398 + }, + { + "epoch": 0.759530495565567, + "grad_norm": 2.964754581451416, + "learning_rate": 7.1204851855844966e-06, + "loss": 0.9138, + "step": 9399 + }, + { + "epoch": 0.7596113052788945, + "grad_norm": 2.776533603668213, + "learning_rate": 7.119892566106678e-06, + "loss": 0.9521, + "step": 9400 + }, + { + "epoch": 0.7596921149922221, + "grad_norm": 2.866835117340088, + "learning_rate": 7.119299910320202e-06, + "loss": 1.0627, + "step": 9401 + }, + { + "epoch": 0.7597729247055496, + "grad_norm": 2.481116771697998, + "learning_rate": 7.118707218235221e-06, + "loss": 1.0436, + "step": 9402 + }, + { + "epoch": 0.7598537344188772, + "grad_norm": 2.7597126960754395, + "learning_rate": 7.118114489861886e-06, + "loss": 1.0089, + "step": 9403 + }, + { + "epoch": 0.7599345441322047, + "grad_norm": 2.7167396545410156, + "learning_rate": 7.117521725210349e-06, + "loss": 0.9041, + "step": 9404 + }, + { + "epoch": 0.7600153538455322, + "grad_norm": 2.408841848373413, + "learning_rate": 7.1169289242907634e-06, + "loss": 0.9589, + "step": 9405 + }, + { + "epoch": 0.7600961635588598, + "grad_norm": 2.7458863258361816, + "learning_rate": 7.116336087113281e-06, + "loss": 0.9494, + "step": 9406 + }, + { + "epoch": 0.7601769732721874, + "grad_norm": 2.74415922164917, + "learning_rate": 7.115743213688057e-06, + "loss": 1.0392, + "step": 9407 + }, + { + "epoch": 0.7602577829855148, + "grad_norm": 2.944753646850586, + "learning_rate": 7.1151503040252435e-06, + "loss": 0.8144, + "step": 9408 + }, + { + "epoch": 0.7603385926988424, + "grad_norm": 2.4746129512786865, + "learning_rate": 7.114557358134998e-06, + "loss": 0.9022, + "step": 9409 + }, + { + "epoch": 0.76041940241217, + "grad_norm": 2.4922099113464355, + "learning_rate": 7.1139643760274756e-06, + "loss": 0.9453, + "step": 9410 + }, + { + "epoch": 0.7605002121254975, + "grad_norm": 2.807968854904175, + "learning_rate": 7.113371357712833e-06, + "loss": 0.9031, + "step": 9411 + }, + { + "epoch": 0.760581021838825, + "grad_norm": 2.376249074935913, + "learning_rate": 7.112778303201227e-06, + "loss": 0.7973, + "step": 9412 + }, + { + "epoch": 0.7606618315521526, + "grad_norm": 2.693059206008911, + "learning_rate": 7.1121852125028144e-06, + "loss": 1.0284, + "step": 9413 + }, + { + "epoch": 0.7607426412654801, + "grad_norm": 2.5827879905700684, + "learning_rate": 7.1115920856277545e-06, + "loss": 0.8489, + "step": 9414 + }, + { + "epoch": 0.7608234509788077, + "grad_norm": 2.589111089706421, + "learning_rate": 7.1109989225862055e-06, + "loss": 0.9226, + "step": 9415 + }, + { + "epoch": 0.7609042606921352, + "grad_norm": 3.392749786376953, + "learning_rate": 7.110405723388326e-06, + "loss": 0.8925, + "step": 9416 + }, + { + "epoch": 0.7609850704054627, + "grad_norm": 2.8906846046447754, + "learning_rate": 7.1098124880442775e-06, + "loss": 0.9414, + "step": 9417 + }, + { + "epoch": 0.7610658801187903, + "grad_norm": 2.530301332473755, + "learning_rate": 7.10921921656422e-06, + "loss": 0.9049, + "step": 9418 + }, + { + "epoch": 0.7611466898321179, + "grad_norm": 2.5978944301605225, + "learning_rate": 7.1086259089583165e-06, + "loss": 1.0372, + "step": 9419 + }, + { + "epoch": 0.7612274995454453, + "grad_norm": 2.696030616760254, + "learning_rate": 7.108032565236727e-06, + "loss": 0.9258, + "step": 9420 + }, + { + "epoch": 0.7613083092587729, + "grad_norm": 2.5111844539642334, + "learning_rate": 7.107439185409613e-06, + "loss": 0.9666, + "step": 9421 + }, + { + "epoch": 0.7613891189721005, + "grad_norm": 2.845205783843994, + "learning_rate": 7.106845769487142e-06, + "loss": 0.8302, + "step": 9422 + }, + { + "epoch": 0.761469928685428, + "grad_norm": 2.8490588665008545, + "learning_rate": 7.106252317479473e-06, + "loss": 0.8415, + "step": 9423 + }, + { + "epoch": 0.7615507383987555, + "grad_norm": 2.5833804607391357, + "learning_rate": 7.105658829396772e-06, + "loss": 0.8523, + "step": 9424 + }, + { + "epoch": 0.7616315481120831, + "grad_norm": 2.668302297592163, + "learning_rate": 7.105065305249206e-06, + "loss": 0.9104, + "step": 9425 + }, + { + "epoch": 0.7617123578254106, + "grad_norm": 3.0312631130218506, + "learning_rate": 7.104471745046937e-06, + "loss": 0.9433, + "step": 9426 + }, + { + "epoch": 0.7617931675387382, + "grad_norm": 2.984961986541748, + "learning_rate": 7.103878148800134e-06, + "loss": 1.0107, + "step": 9427 + }, + { + "epoch": 0.7618739772520657, + "grad_norm": 2.511423349380493, + "learning_rate": 7.103284516518966e-06, + "loss": 0.8763, + "step": 9428 + }, + { + "epoch": 0.7619547869653932, + "grad_norm": 2.8681230545043945, + "learning_rate": 7.102690848213593e-06, + "loss": 0.9905, + "step": 9429 + }, + { + "epoch": 0.7620355966787208, + "grad_norm": 2.4794304370880127, + "learning_rate": 7.102097143894191e-06, + "loss": 0.973, + "step": 9430 + }, + { + "epoch": 0.7621164063920484, + "grad_norm": 2.545872449874878, + "learning_rate": 7.101503403570924e-06, + "loss": 1.0162, + "step": 9431 + }, + { + "epoch": 0.7621972161053758, + "grad_norm": 2.8261799812316895, + "learning_rate": 7.1009096272539646e-06, + "loss": 0.8995, + "step": 9432 + }, + { + "epoch": 0.7622780258187034, + "grad_norm": 2.9498088359832764, + "learning_rate": 7.10031581495348e-06, + "loss": 1.0137, + "step": 9433 + }, + { + "epoch": 0.762358835532031, + "grad_norm": 3.1932687759399414, + "learning_rate": 7.099721966679642e-06, + "loss": 0.863, + "step": 9434 + }, + { + "epoch": 0.7624396452453585, + "grad_norm": 3.1584761142730713, + "learning_rate": 7.099128082442621e-06, + "loss": 0.8687, + "step": 9435 + }, + { + "epoch": 0.762520454958686, + "grad_norm": 2.386131525039673, + "learning_rate": 7.09853416225259e-06, + "loss": 0.9076, + "step": 9436 + }, + { + "epoch": 0.7626012646720136, + "grad_norm": 3.2778615951538086, + "learning_rate": 7.09794020611972e-06, + "loss": 0.9383, + "step": 9437 + }, + { + "epoch": 0.7626820743853411, + "grad_norm": 2.6084465980529785, + "learning_rate": 7.097346214054186e-06, + "loss": 0.9834, + "step": 9438 + }, + { + "epoch": 0.7627628840986687, + "grad_norm": 2.735183000564575, + "learning_rate": 7.0967521860661604e-06, + "loss": 0.8664, + "step": 9439 + }, + { + "epoch": 0.7628436938119962, + "grad_norm": 2.414707660675049, + "learning_rate": 7.096158122165816e-06, + "loss": 1.1097, + "step": 9440 + }, + { + "epoch": 0.7629245035253237, + "grad_norm": 3.164177417755127, + "learning_rate": 7.09556402236333e-06, + "loss": 1.0333, + "step": 9441 + }, + { + "epoch": 0.7630053132386513, + "grad_norm": 2.6094696521759033, + "learning_rate": 7.0949698866688774e-06, + "loss": 0.9298, + "step": 9442 + }, + { + "epoch": 0.7630861229519789, + "grad_norm": 2.682769536972046, + "learning_rate": 7.094375715092635e-06, + "loss": 0.9226, + "step": 9443 + }, + { + "epoch": 0.7631669326653063, + "grad_norm": 2.2476346492767334, + "learning_rate": 7.093781507644778e-06, + "loss": 1.0783, + "step": 9444 + }, + { + "epoch": 0.7632477423786339, + "grad_norm": 2.8686139583587646, + "learning_rate": 7.093187264335484e-06, + "loss": 0.9593, + "step": 9445 + }, + { + "epoch": 0.7633285520919615, + "grad_norm": 2.404218912124634, + "learning_rate": 7.092592985174932e-06, + "loss": 0.9245, + "step": 9446 + }, + { + "epoch": 0.763409361805289, + "grad_norm": 3.1972155570983887, + "learning_rate": 7.091998670173299e-06, + "loss": 0.9019, + "step": 9447 + }, + { + "epoch": 0.7634901715186165, + "grad_norm": 2.677638053894043, + "learning_rate": 7.091404319340765e-06, + "loss": 0.9107, + "step": 9448 + }, + { + "epoch": 0.7635709812319441, + "grad_norm": 2.568795919418335, + "learning_rate": 7.09080993268751e-06, + "loss": 1.0429, + "step": 9449 + }, + { + "epoch": 0.7636517909452716, + "grad_norm": 2.8815431594848633, + "learning_rate": 7.090215510223716e-06, + "loss": 1.0192, + "step": 9450 + }, + { + "epoch": 0.7637326006585992, + "grad_norm": 2.5321731567382812, + "learning_rate": 7.089621051959559e-06, + "loss": 0.977, + "step": 9451 + }, + { + "epoch": 0.7638134103719267, + "grad_norm": 2.7652008533477783, + "learning_rate": 7.089026557905227e-06, + "loss": 0.9829, + "step": 9452 + }, + { + "epoch": 0.7638942200852542, + "grad_norm": 2.2721855640411377, + "learning_rate": 7.088432028070897e-06, + "loss": 0.9826, + "step": 9453 + }, + { + "epoch": 0.7639750297985818, + "grad_norm": 2.68190860748291, + "learning_rate": 7.087837462466756e-06, + "loss": 0.9249, + "step": 9454 + }, + { + "epoch": 0.7640558395119094, + "grad_norm": 2.512723207473755, + "learning_rate": 7.087242861102984e-06, + "loss": 0.9233, + "step": 9455 + }, + { + "epoch": 0.7641366492252368, + "grad_norm": 2.821420431137085, + "learning_rate": 7.0866482239897675e-06, + "loss": 0.8105, + "step": 9456 + }, + { + "epoch": 0.7642174589385644, + "grad_norm": 2.5856151580810547, + "learning_rate": 7.08605355113729e-06, + "loss": 1.0468, + "step": 9457 + }, + { + "epoch": 0.764298268651892, + "grad_norm": 2.2887649536132812, + "learning_rate": 7.085458842555737e-06, + "loss": 0.8821, + "step": 9458 + }, + { + "epoch": 0.7643790783652195, + "grad_norm": 3.04512882232666, + "learning_rate": 7.084864098255294e-06, + "loss": 1.0228, + "step": 9459 + }, + { + "epoch": 0.764459888078547, + "grad_norm": 3.017383337020874, + "learning_rate": 7.0842693182461494e-06, + "loss": 0.9401, + "step": 9460 + }, + { + "epoch": 0.7645406977918746, + "grad_norm": 2.2113864421844482, + "learning_rate": 7.083674502538489e-06, + "loss": 1.0046, + "step": 9461 + }, + { + "epoch": 0.7646215075052021, + "grad_norm": 2.4700751304626465, + "learning_rate": 7.083079651142499e-06, + "loss": 0.956, + "step": 9462 + }, + { + "epoch": 0.7647023172185297, + "grad_norm": 3.114546537399292, + "learning_rate": 7.082484764068371e-06, + "loss": 1.096, + "step": 9463 + }, + { + "epoch": 0.7647831269318572, + "grad_norm": 2.289763927459717, + "learning_rate": 7.081889841326293e-06, + "loss": 0.9989, + "step": 9464 + }, + { + "epoch": 0.7648639366451847, + "grad_norm": 2.766525983810425, + "learning_rate": 7.081294882926452e-06, + "loss": 0.924, + "step": 9465 + }, + { + "epoch": 0.7649447463585123, + "grad_norm": 2.4099643230438232, + "learning_rate": 7.080699888879041e-06, + "loss": 0.9167, + "step": 9466 + }, + { + "epoch": 0.7650255560718399, + "grad_norm": 2.4988508224487305, + "learning_rate": 7.08010485919425e-06, + "loss": 0.9012, + "step": 9467 + }, + { + "epoch": 0.7651063657851673, + "grad_norm": 2.955271005630493, + "learning_rate": 7.0795097938822695e-06, + "loss": 0.9786, + "step": 9468 + }, + { + "epoch": 0.7651871754984949, + "grad_norm": 2.9606423377990723, + "learning_rate": 7.078914692953294e-06, + "loss": 0.9125, + "step": 9469 + }, + { + "epoch": 0.7652679852118225, + "grad_norm": 2.853749990463257, + "learning_rate": 7.078319556417513e-06, + "loss": 0.9281, + "step": 9470 + }, + { + "epoch": 0.76534879492515, + "grad_norm": 2.4236481189727783, + "learning_rate": 7.077724384285123e-06, + "loss": 1.0123, + "step": 9471 + }, + { + "epoch": 0.7654296046384775, + "grad_norm": 2.6906888484954834, + "learning_rate": 7.0771291765663156e-06, + "loss": 0.9696, + "step": 9472 + }, + { + "epoch": 0.7655104143518051, + "grad_norm": 2.526541233062744, + "learning_rate": 7.076533933271284e-06, + "loss": 0.9469, + "step": 9473 + }, + { + "epoch": 0.7655912240651326, + "grad_norm": 2.3343493938446045, + "learning_rate": 7.075938654410228e-06, + "loss": 0.8985, + "step": 9474 + }, + { + "epoch": 0.7656720337784602, + "grad_norm": 2.31614089012146, + "learning_rate": 7.0753433399933406e-06, + "loss": 0.8177, + "step": 9475 + }, + { + "epoch": 0.7657528434917877, + "grad_norm": 3.08620285987854, + "learning_rate": 7.074747990030816e-06, + "loss": 0.8226, + "step": 9476 + }, + { + "epoch": 0.7658336532051152, + "grad_norm": 2.4425439834594727, + "learning_rate": 7.074152604532854e-06, + "loss": 0.9616, + "step": 9477 + }, + { + "epoch": 0.7659144629184428, + "grad_norm": 2.5771522521972656, + "learning_rate": 7.073557183509651e-06, + "loss": 1.143, + "step": 9478 + }, + { + "epoch": 0.7659952726317704, + "grad_norm": 2.8968923091888428, + "learning_rate": 7.072961726971405e-06, + "loss": 0.8749, + "step": 9479 + }, + { + "epoch": 0.7660760823450978, + "grad_norm": 2.923482894897461, + "learning_rate": 7.072366234928316e-06, + "loss": 0.9574, + "step": 9480 + }, + { + "epoch": 0.7661568920584254, + "grad_norm": 2.593574285507202, + "learning_rate": 7.071770707390582e-06, + "loss": 0.8833, + "step": 9481 + }, + { + "epoch": 0.766237701771753, + "grad_norm": 2.2582805156707764, + "learning_rate": 7.071175144368403e-06, + "loss": 1.1031, + "step": 9482 + }, + { + "epoch": 0.7663185114850805, + "grad_norm": 3.0276057720184326, + "learning_rate": 7.070579545871979e-06, + "loss": 0.9027, + "step": 9483 + }, + { + "epoch": 0.766399321198408, + "grad_norm": 2.48002290725708, + "learning_rate": 7.069983911911513e-06, + "loss": 1.0016, + "step": 9484 + }, + { + "epoch": 0.7664801309117356, + "grad_norm": 2.7922701835632324, + "learning_rate": 7.0693882424972074e-06, + "loss": 1.0384, + "step": 9485 + }, + { + "epoch": 0.7665609406250631, + "grad_norm": 2.288861036300659, + "learning_rate": 7.068792537639261e-06, + "loss": 0.9336, + "step": 9486 + }, + { + "epoch": 0.7666417503383907, + "grad_norm": 2.3470566272735596, + "learning_rate": 7.0681967973478795e-06, + "loss": 0.9434, + "step": 9487 + }, + { + "epoch": 0.7667225600517182, + "grad_norm": 2.6336758136749268, + "learning_rate": 7.067601021633266e-06, + "loss": 0.9795, + "step": 9488 + }, + { + "epoch": 0.7668033697650457, + "grad_norm": 2.35019588470459, + "learning_rate": 7.067005210505626e-06, + "loss": 1.0797, + "step": 9489 + }, + { + "epoch": 0.7668841794783733, + "grad_norm": 2.5723166465759277, + "learning_rate": 7.066409363975161e-06, + "loss": 1.0282, + "step": 9490 + }, + { + "epoch": 0.7669649891917009, + "grad_norm": 2.7887842655181885, + "learning_rate": 7.065813482052077e-06, + "loss": 0.9215, + "step": 9491 + }, + { + "epoch": 0.7670457989050283, + "grad_norm": 2.442120313644409, + "learning_rate": 7.065217564746584e-06, + "loss": 0.9704, + "step": 9492 + }, + { + "epoch": 0.7671266086183559, + "grad_norm": 3.224236011505127, + "learning_rate": 7.064621612068885e-06, + "loss": 0.8509, + "step": 9493 + }, + { + "epoch": 0.7672074183316835, + "grad_norm": 2.459911823272705, + "learning_rate": 7.064025624029187e-06, + "loss": 0.8022, + "step": 9494 + }, + { + "epoch": 0.767288228045011, + "grad_norm": 2.5275039672851562, + "learning_rate": 7.063429600637701e-06, + "loss": 0.9088, + "step": 9495 + }, + { + "epoch": 0.7673690377583385, + "grad_norm": 2.8273355960845947, + "learning_rate": 7.062833541904631e-06, + "loss": 0.9208, + "step": 9496 + }, + { + "epoch": 0.7674498474716661, + "grad_norm": 2.6317386627197266, + "learning_rate": 7.062237447840191e-06, + "loss": 0.9965, + "step": 9497 + }, + { + "epoch": 0.7675306571849936, + "grad_norm": 2.4512696266174316, + "learning_rate": 7.061641318454586e-06, + "loss": 1.0097, + "step": 9498 + }, + { + "epoch": 0.7676114668983212, + "grad_norm": 4.055238723754883, + "learning_rate": 7.0610451537580306e-06, + "loss": 0.9901, + "step": 9499 + }, + { + "epoch": 0.7676922766116487, + "grad_norm": 2.9475176334381104, + "learning_rate": 7.060448953760732e-06, + "loss": 1.043, + "step": 9500 + }, + { + "epoch": 0.7677730863249763, + "grad_norm": 2.9297895431518555, + "learning_rate": 7.059852718472904e-06, + "loss": 0.9215, + "step": 9501 + }, + { + "epoch": 0.7678538960383038, + "grad_norm": 2.8318982124328613, + "learning_rate": 7.059256447904756e-06, + "loss": 0.9256, + "step": 9502 + }, + { + "epoch": 0.7679347057516314, + "grad_norm": 2.6794416904449463, + "learning_rate": 7.058660142066506e-06, + "loss": 0.8328, + "step": 9503 + }, + { + "epoch": 0.7680155154649589, + "grad_norm": 3.0643105506896973, + "learning_rate": 7.05806380096836e-06, + "loss": 1.0295, + "step": 9504 + }, + { + "epoch": 0.7680963251782864, + "grad_norm": 2.5476534366607666, + "learning_rate": 7.057467424620539e-06, + "loss": 0.9921, + "step": 9505 + }, + { + "epoch": 0.768177134891614, + "grad_norm": 2.6490678787231445, + "learning_rate": 7.056871013033252e-06, + "loss": 0.941, + "step": 9506 + }, + { + "epoch": 0.7682579446049416, + "grad_norm": 2.326160192489624, + "learning_rate": 7.056274566216717e-06, + "loss": 0.8238, + "step": 9507 + }, + { + "epoch": 0.768338754318269, + "grad_norm": 2.27193546295166, + "learning_rate": 7.055678084181148e-06, + "loss": 0.9076, + "step": 9508 + }, + { + "epoch": 0.7684195640315966, + "grad_norm": 3.777320623397827, + "learning_rate": 7.055081566936763e-06, + "loss": 0.9422, + "step": 9509 + }, + { + "epoch": 0.7685003737449242, + "grad_norm": 2.8519272804260254, + "learning_rate": 7.054485014493777e-06, + "loss": 0.9746, + "step": 9510 + }, + { + "epoch": 0.7685811834582517, + "grad_norm": 2.2314727306365967, + "learning_rate": 7.053888426862412e-06, + "loss": 0.9078, + "step": 9511 + }, + { + "epoch": 0.7686619931715792, + "grad_norm": 2.3984363079071045, + "learning_rate": 7.053291804052879e-06, + "loss": 1.0029, + "step": 9512 + }, + { + "epoch": 0.7687428028849068, + "grad_norm": 2.545302629470825, + "learning_rate": 7.052695146075403e-06, + "loss": 0.9006, + "step": 9513 + }, + { + "epoch": 0.7688236125982343, + "grad_norm": 2.481466770172119, + "learning_rate": 7.0520984529401995e-06, + "loss": 0.907, + "step": 9514 + }, + { + "epoch": 0.7689044223115619, + "grad_norm": 2.5026607513427734, + "learning_rate": 7.05150172465749e-06, + "loss": 0.9001, + "step": 9515 + }, + { + "epoch": 0.7689852320248894, + "grad_norm": 2.6458675861358643, + "learning_rate": 7.050904961237495e-06, + "loss": 0.9075, + "step": 9516 + }, + { + "epoch": 0.7690660417382169, + "grad_norm": 2.786869764328003, + "learning_rate": 7.050308162690436e-06, + "loss": 0.9283, + "step": 9517 + }, + { + "epoch": 0.7691468514515445, + "grad_norm": 2.780898094177246, + "learning_rate": 7.049711329026532e-06, + "loss": 0.9822, + "step": 9518 + }, + { + "epoch": 0.7692276611648721, + "grad_norm": 2.678616523742676, + "learning_rate": 7.04911446025601e-06, + "loss": 1.0265, + "step": 9519 + }, + { + "epoch": 0.7693084708781995, + "grad_norm": 2.473202705383301, + "learning_rate": 7.048517556389088e-06, + "loss": 0.915, + "step": 9520 + }, + { + "epoch": 0.7693892805915271, + "grad_norm": 2.3862879276275635, + "learning_rate": 7.047920617435994e-06, + "loss": 0.9869, + "step": 9521 + }, + { + "epoch": 0.7694700903048547, + "grad_norm": 2.347320795059204, + "learning_rate": 7.047323643406948e-06, + "loss": 0.9873, + "step": 9522 + }, + { + "epoch": 0.7695509000181822, + "grad_norm": 2.733719825744629, + "learning_rate": 7.046726634312179e-06, + "loss": 0.8149, + "step": 9523 + }, + { + "epoch": 0.7696317097315097, + "grad_norm": 2.5755741596221924, + "learning_rate": 7.046129590161908e-06, + "loss": 1.0158, + "step": 9524 + }, + { + "epoch": 0.7697125194448373, + "grad_norm": 2.6959922313690186, + "learning_rate": 7.045532510966364e-06, + "loss": 0.9994, + "step": 9525 + }, + { + "epoch": 0.7697933291581648, + "grad_norm": 2.443272829055786, + "learning_rate": 7.044935396735771e-06, + "loss": 1.0158, + "step": 9526 + }, + { + "epoch": 0.7698741388714924, + "grad_norm": 3.298548460006714, + "learning_rate": 7.04433824748036e-06, + "loss": 0.9775, + "step": 9527 + }, + { + "epoch": 0.7699549485848199, + "grad_norm": 2.616126537322998, + "learning_rate": 7.043741063210354e-06, + "loss": 0.9429, + "step": 9528 + }, + { + "epoch": 0.7700357582981474, + "grad_norm": 2.4621856212615967, + "learning_rate": 7.043143843935985e-06, + "loss": 1.0791, + "step": 9529 + }, + { + "epoch": 0.770116568011475, + "grad_norm": 2.9813919067382812, + "learning_rate": 7.042546589667481e-06, + "loss": 1.0554, + "step": 9530 + }, + { + "epoch": 0.7701973777248026, + "grad_norm": 2.506427764892578, + "learning_rate": 7.0419493004150715e-06, + "loss": 0.9289, + "step": 9531 + }, + { + "epoch": 0.77027818743813, + "grad_norm": 2.6569604873657227, + "learning_rate": 7.0413519761889835e-06, + "loss": 0.8694, + "step": 9532 + }, + { + "epoch": 0.7703589971514576, + "grad_norm": 2.4761314392089844, + "learning_rate": 7.040754616999454e-06, + "loss": 1.0014, + "step": 9533 + }, + { + "epoch": 0.7704398068647852, + "grad_norm": 2.295379638671875, + "learning_rate": 7.0401572228567094e-06, + "loss": 0.8459, + "step": 9534 + }, + { + "epoch": 0.7705206165781127, + "grad_norm": 2.436434030532837, + "learning_rate": 7.039559793770983e-06, + "loss": 0.9284, + "step": 9535 + }, + { + "epoch": 0.7706014262914402, + "grad_norm": 2.2874650955200195, + "learning_rate": 7.0389623297525065e-06, + "loss": 0.8921, + "step": 9536 + }, + { + "epoch": 0.7706822360047678, + "grad_norm": 2.787672519683838, + "learning_rate": 7.038364830811516e-06, + "loss": 0.8205, + "step": 9537 + }, + { + "epoch": 0.7707630457180953, + "grad_norm": 2.6347997188568115, + "learning_rate": 7.03776729695824e-06, + "loss": 0.8769, + "step": 9538 + }, + { + "epoch": 0.7708438554314229, + "grad_norm": 3.424485921859741, + "learning_rate": 7.037169728202919e-06, + "loss": 0.81, + "step": 9539 + }, + { + "epoch": 0.7709246651447504, + "grad_norm": 2.503952741622925, + "learning_rate": 7.036572124555783e-06, + "loss": 0.7479, + "step": 9540 + }, + { + "epoch": 0.7710054748580779, + "grad_norm": 2.5592153072357178, + "learning_rate": 7.03597448602707e-06, + "loss": 0.9672, + "step": 9541 + }, + { + "epoch": 0.7710862845714055, + "grad_norm": 2.636793375015259, + "learning_rate": 7.035376812627015e-06, + "loss": 0.8467, + "step": 9542 + }, + { + "epoch": 0.7711670942847331, + "grad_norm": 3.253643274307251, + "learning_rate": 7.034779104365855e-06, + "loss": 0.8573, + "step": 9543 + }, + { + "epoch": 0.7712479039980605, + "grad_norm": 2.7215523719787598, + "learning_rate": 7.034181361253829e-06, + "loss": 0.9402, + "step": 9544 + }, + { + "epoch": 0.7713287137113881, + "grad_norm": 2.5872609615325928, + "learning_rate": 7.033583583301171e-06, + "loss": 0.8611, + "step": 9545 + }, + { + "epoch": 0.7714095234247157, + "grad_norm": 3.0744309425354004, + "learning_rate": 7.032985770518123e-06, + "loss": 0.9617, + "step": 9546 + }, + { + "epoch": 0.7714903331380432, + "grad_norm": 2.5720481872558594, + "learning_rate": 7.032387922914925e-06, + "loss": 0.8442, + "step": 9547 + }, + { + "epoch": 0.7715711428513707, + "grad_norm": 2.4857845306396484, + "learning_rate": 7.031790040501812e-06, + "loss": 0.9309, + "step": 9548 + }, + { + "epoch": 0.7716519525646983, + "grad_norm": 2.5544092655181885, + "learning_rate": 7.031192123289028e-06, + "loss": 0.9437, + "step": 9549 + }, + { + "epoch": 0.7717327622780258, + "grad_norm": 2.7516963481903076, + "learning_rate": 7.030594171286813e-06, + "loss": 0.8815, + "step": 9550 + }, + { + "epoch": 0.7718135719913534, + "grad_norm": 2.5536606311798096, + "learning_rate": 7.029996184505408e-06, + "loss": 0.988, + "step": 9551 + }, + { + "epoch": 0.7718943817046809, + "grad_norm": 2.81453800201416, + "learning_rate": 7.029398162955054e-06, + "loss": 0.9714, + "step": 9552 + }, + { + "epoch": 0.7719751914180084, + "grad_norm": 2.854370355606079, + "learning_rate": 7.028800106645996e-06, + "loss": 1.0865, + "step": 9553 + }, + { + "epoch": 0.772056001131336, + "grad_norm": 2.843309164047241, + "learning_rate": 7.028202015588478e-06, + "loss": 0.861, + "step": 9554 + }, + { + "epoch": 0.7721368108446636, + "grad_norm": 2.750856876373291, + "learning_rate": 7.02760388979274e-06, + "loss": 0.9065, + "step": 9555 + }, + { + "epoch": 0.772217620557991, + "grad_norm": 2.711421251296997, + "learning_rate": 7.027005729269031e-06, + "loss": 1.0317, + "step": 9556 + }, + { + "epoch": 0.7722984302713186, + "grad_norm": 2.3563313484191895, + "learning_rate": 7.026407534027592e-06, + "loss": 0.9685, + "step": 9557 + }, + { + "epoch": 0.7723792399846462, + "grad_norm": 2.8109371662139893, + "learning_rate": 7.02580930407867e-06, + "loss": 0.7691, + "step": 9558 + }, + { + "epoch": 0.7724600496979737, + "grad_norm": 2.5159077644348145, + "learning_rate": 7.025211039432512e-06, + "loss": 0.9224, + "step": 9559 + }, + { + "epoch": 0.7725408594113012, + "grad_norm": 2.9255714416503906, + "learning_rate": 7.024612740099364e-06, + "loss": 0.9872, + "step": 9560 + }, + { + "epoch": 0.7726216691246288, + "grad_norm": 2.7818901538848877, + "learning_rate": 7.024014406089475e-06, + "loss": 0.9206, + "step": 9561 + }, + { + "epoch": 0.7727024788379563, + "grad_norm": 2.6201705932617188, + "learning_rate": 7.023416037413091e-06, + "loss": 0.8499, + "step": 9562 + }, + { + "epoch": 0.7727832885512839, + "grad_norm": 2.3721187114715576, + "learning_rate": 7.022817634080461e-06, + "loss": 1.0093, + "step": 9563 + }, + { + "epoch": 0.7728640982646114, + "grad_norm": 2.4636049270629883, + "learning_rate": 7.022219196101836e-06, + "loss": 0.9829, + "step": 9564 + }, + { + "epoch": 0.7729449079779389, + "grad_norm": 2.4145498275756836, + "learning_rate": 7.021620723487464e-06, + "loss": 0.8964, + "step": 9565 + }, + { + "epoch": 0.7730257176912665, + "grad_norm": 2.788201332092285, + "learning_rate": 7.021022216247595e-06, + "loss": 0.8574, + "step": 9566 + }, + { + "epoch": 0.7731065274045941, + "grad_norm": 2.318821430206299, + "learning_rate": 7.02042367439248e-06, + "loss": 0.8232, + "step": 9567 + }, + { + "epoch": 0.7731873371179215, + "grad_norm": 2.2990260124206543, + "learning_rate": 7.019825097932373e-06, + "loss": 0.9651, + "step": 9568 + }, + { + "epoch": 0.7732681468312491, + "grad_norm": 2.5955026149749756, + "learning_rate": 7.019226486877525e-06, + "loss": 0.904, + "step": 9569 + }, + { + "epoch": 0.7733489565445767, + "grad_norm": 2.8680477142333984, + "learning_rate": 7.018627841238188e-06, + "loss": 0.9284, + "step": 9570 + }, + { + "epoch": 0.7734297662579042, + "grad_norm": 2.5354461669921875, + "learning_rate": 7.018029161024615e-06, + "loss": 1.0128, + "step": 9571 + }, + { + "epoch": 0.7735105759712317, + "grad_norm": 2.5947415828704834, + "learning_rate": 7.017430446247062e-06, + "loss": 0.8596, + "step": 9572 + }, + { + "epoch": 0.7735913856845593, + "grad_norm": 2.6628518104553223, + "learning_rate": 7.016831696915782e-06, + "loss": 0.8106, + "step": 9573 + }, + { + "epoch": 0.7736721953978868, + "grad_norm": 2.7208001613616943, + "learning_rate": 7.016232913041029e-06, + "loss": 0.9419, + "step": 9574 + }, + { + "epoch": 0.7737530051112144, + "grad_norm": 2.77506947517395, + "learning_rate": 7.01563409463306e-06, + "loss": 0.9868, + "step": 9575 + }, + { + "epoch": 0.7738338148245419, + "grad_norm": 2.494380235671997, + "learning_rate": 7.015035241702133e-06, + "loss": 0.852, + "step": 9576 + }, + { + "epoch": 0.7739146245378694, + "grad_norm": 2.387253522872925, + "learning_rate": 7.014436354258501e-06, + "loss": 0.9764, + "step": 9577 + }, + { + "epoch": 0.773995434251197, + "grad_norm": 2.615269899368286, + "learning_rate": 7.013837432312427e-06, + "loss": 0.8534, + "step": 9578 + }, + { + "epoch": 0.7740762439645246, + "grad_norm": 3.229938268661499, + "learning_rate": 7.013238475874163e-06, + "loss": 0.8501, + "step": 9579 + }, + { + "epoch": 0.774157053677852, + "grad_norm": 2.679093360900879, + "learning_rate": 7.012639484953973e-06, + "loss": 0.8724, + "step": 9580 + }, + { + "epoch": 0.7742378633911796, + "grad_norm": 2.995333433151245, + "learning_rate": 7.0120404595621125e-06, + "loss": 0.9358, + "step": 9581 + }, + { + "epoch": 0.7743186731045072, + "grad_norm": 3.551964521408081, + "learning_rate": 7.011441399708842e-06, + "loss": 0.9257, + "step": 9582 + }, + { + "epoch": 0.7743994828178347, + "grad_norm": 2.7420544624328613, + "learning_rate": 7.010842305404424e-06, + "loss": 0.9684, + "step": 9583 + }, + { + "epoch": 0.7744802925311622, + "grad_norm": 3.192138195037842, + "learning_rate": 7.010243176659118e-06, + "loss": 0.8429, + "step": 9584 + }, + { + "epoch": 0.7745611022444898, + "grad_norm": 2.406846046447754, + "learning_rate": 7.009644013483186e-06, + "loss": 0.9449, + "step": 9585 + }, + { + "epoch": 0.7746419119578173, + "grad_norm": 2.807209014892578, + "learning_rate": 7.00904481588689e-06, + "loss": 1.1005, + "step": 9586 + }, + { + "epoch": 0.7747227216711449, + "grad_norm": 2.7669317722320557, + "learning_rate": 7.008445583880492e-06, + "loss": 1.0634, + "step": 9587 + }, + { + "epoch": 0.7748035313844724, + "grad_norm": 2.5482051372528076, + "learning_rate": 7.007846317474257e-06, + "loss": 0.9123, + "step": 9588 + }, + { + "epoch": 0.7748843410977999, + "grad_norm": 2.2731552124023438, + "learning_rate": 7.007247016678448e-06, + "loss": 0.9038, + "step": 9589 + }, + { + "epoch": 0.7749651508111275, + "grad_norm": 2.675006628036499, + "learning_rate": 7.006647681503331e-06, + "loss": 0.9246, + "step": 9590 + }, + { + "epoch": 0.7750459605244551, + "grad_norm": 2.4828829765319824, + "learning_rate": 7.006048311959168e-06, + "loss": 0.9194, + "step": 9591 + }, + { + "epoch": 0.7751267702377825, + "grad_norm": 3.1538140773773193, + "learning_rate": 7.0054489080562284e-06, + "loss": 0.8672, + "step": 9592 + }, + { + "epoch": 0.7752075799511101, + "grad_norm": 2.542527198791504, + "learning_rate": 7.004849469804775e-06, + "loss": 0.9722, + "step": 9593 + }, + { + "epoch": 0.7752883896644377, + "grad_norm": 2.6315410137176514, + "learning_rate": 7.004249997215079e-06, + "loss": 0.9036, + "step": 9594 + }, + { + "epoch": 0.7753691993777652, + "grad_norm": 2.555668830871582, + "learning_rate": 7.0036504902974044e-06, + "loss": 0.8578, + "step": 9595 + }, + { + "epoch": 0.7754500090910927, + "grad_norm": 2.8489654064178467, + "learning_rate": 7.00305094906202e-06, + "loss": 1.0087, + "step": 9596 + }, + { + "epoch": 0.7755308188044203, + "grad_norm": 2.558403968811035, + "learning_rate": 7.0024513735191964e-06, + "loss": 0.9447, + "step": 9597 + }, + { + "epoch": 0.7756116285177478, + "grad_norm": 3.083716630935669, + "learning_rate": 7.001851763679201e-06, + "loss": 0.8327, + "step": 9598 + }, + { + "epoch": 0.7756924382310754, + "grad_norm": 2.7847039699554443, + "learning_rate": 7.0012521195523034e-06, + "loss": 0.9479, + "step": 9599 + }, + { + "epoch": 0.7757732479444029, + "grad_norm": 2.4878573417663574, + "learning_rate": 7.000652441148777e-06, + "loss": 0.8972, + "step": 9600 + }, + { + "epoch": 0.7758540576577304, + "grad_norm": 2.6768605709075928, + "learning_rate": 7.000052728478888e-06, + "loss": 0.931, + "step": 9601 + }, + { + "epoch": 0.775934867371058, + "grad_norm": 3.023346424102783, + "learning_rate": 6.999452981552911e-06, + "loss": 0.8828, + "step": 9602 + }, + { + "epoch": 0.7760156770843856, + "grad_norm": 2.6194918155670166, + "learning_rate": 6.998853200381118e-06, + "loss": 1.0002, + "step": 9603 + }, + { + "epoch": 0.776096486797713, + "grad_norm": 2.7090258598327637, + "learning_rate": 6.998253384973784e-06, + "loss": 0.9234, + "step": 9604 + }, + { + "epoch": 0.7761772965110406, + "grad_norm": 2.317145824432373, + "learning_rate": 6.997653535341177e-06, + "loss": 0.8695, + "step": 9605 + }, + { + "epoch": 0.7762581062243682, + "grad_norm": 2.125000238418579, + "learning_rate": 6.997053651493576e-06, + "loss": 0.9545, + "step": 9606 + }, + { + "epoch": 0.7763389159376957, + "grad_norm": 2.7297613620758057, + "learning_rate": 6.996453733441252e-06, + "loss": 0.8297, + "step": 9607 + }, + { + "epoch": 0.7764197256510232, + "grad_norm": 2.6772239208221436, + "learning_rate": 6.995853781194484e-06, + "loss": 1.0247, + "step": 9608 + }, + { + "epoch": 0.7765005353643508, + "grad_norm": 2.860786199569702, + "learning_rate": 6.995253794763545e-06, + "loss": 0.997, + "step": 9609 + }, + { + "epoch": 0.7765813450776783, + "grad_norm": 2.5108728408813477, + "learning_rate": 6.994653774158711e-06, + "loss": 0.981, + "step": 9610 + }, + { + "epoch": 0.7766621547910059, + "grad_norm": 2.5882833003997803, + "learning_rate": 6.9940537193902594e-06, + "loss": 0.9718, + "step": 9611 + }, + { + "epoch": 0.7767429645043334, + "grad_norm": 3.0769033432006836, + "learning_rate": 6.993453630468468e-06, + "loss": 0.9234, + "step": 9612 + }, + { + "epoch": 0.7768237742176609, + "grad_norm": 2.6407887935638428, + "learning_rate": 6.992853507403617e-06, + "loss": 0.9675, + "step": 9613 + }, + { + "epoch": 0.7769045839309885, + "grad_norm": 2.8648247718811035, + "learning_rate": 6.992253350205982e-06, + "loss": 0.9322, + "step": 9614 + }, + { + "epoch": 0.7769853936443161, + "grad_norm": 2.362705707550049, + "learning_rate": 6.991653158885842e-06, + "loss": 0.9228, + "step": 9615 + }, + { + "epoch": 0.7770662033576435, + "grad_norm": 2.714418411254883, + "learning_rate": 6.99105293345348e-06, + "loss": 0.9775, + "step": 9616 + }, + { + "epoch": 0.7771470130709711, + "grad_norm": 2.5210421085357666, + "learning_rate": 6.990452673919174e-06, + "loss": 0.9148, + "step": 9617 + }, + { + "epoch": 0.7772278227842987, + "grad_norm": 2.44423770904541, + "learning_rate": 6.989852380293205e-06, + "loss": 0.8282, + "step": 9618 + }, + { + "epoch": 0.7773086324976262, + "grad_norm": 3.33247447013855, + "learning_rate": 6.989252052585856e-06, + "loss": 1.0303, + "step": 9619 + }, + { + "epoch": 0.7773894422109537, + "grad_norm": 2.6625845432281494, + "learning_rate": 6.988651690807407e-06, + "loss": 0.9808, + "step": 9620 + }, + { + "epoch": 0.7774702519242813, + "grad_norm": 2.823176145553589, + "learning_rate": 6.988051294968142e-06, + "loss": 1.0867, + "step": 9621 + }, + { + "epoch": 0.7775510616376088, + "grad_norm": 2.1813924312591553, + "learning_rate": 6.987450865078344e-06, + "loss": 0.8795, + "step": 9622 + }, + { + "epoch": 0.7776318713509364, + "grad_norm": 3.0216259956359863, + "learning_rate": 6.986850401148299e-06, + "loss": 0.9338, + "step": 9623 + }, + { + "epoch": 0.7777126810642639, + "grad_norm": 2.6659300327301025, + "learning_rate": 6.986249903188289e-06, + "loss": 0.8721, + "step": 9624 + }, + { + "epoch": 0.7777934907775914, + "grad_norm": 2.786710739135742, + "learning_rate": 6.985649371208601e-06, + "loss": 1.0178, + "step": 9625 + }, + { + "epoch": 0.777874300490919, + "grad_norm": 2.4938833713531494, + "learning_rate": 6.985048805219518e-06, + "loss": 0.9823, + "step": 9626 + }, + { + "epoch": 0.7779551102042466, + "grad_norm": 2.678549289703369, + "learning_rate": 6.984448205231328e-06, + "loss": 1.0505, + "step": 9627 + }, + { + "epoch": 0.778035919917574, + "grad_norm": 2.584136724472046, + "learning_rate": 6.983847571254317e-06, + "loss": 0.9083, + "step": 9628 + }, + { + "epoch": 0.7781167296309016, + "grad_norm": 2.886458158493042, + "learning_rate": 6.983246903298775e-06, + "loss": 0.8482, + "step": 9629 + }, + { + "epoch": 0.7781975393442292, + "grad_norm": 2.893474578857422, + "learning_rate": 6.982646201374985e-06, + "loss": 0.8756, + "step": 9630 + }, + { + "epoch": 0.7782783490575568, + "grad_norm": 2.426586389541626, + "learning_rate": 6.982045465493241e-06, + "loss": 0.9781, + "step": 9631 + }, + { + "epoch": 0.7783591587708842, + "grad_norm": 2.4432895183563232, + "learning_rate": 6.9814446956638305e-06, + "loss": 0.8297, + "step": 9632 + }, + { + "epoch": 0.7784399684842118, + "grad_norm": 2.637373924255371, + "learning_rate": 6.98084389189704e-06, + "loss": 0.9191, + "step": 9633 + }, + { + "epoch": 0.7785207781975394, + "grad_norm": 2.6416330337524414, + "learning_rate": 6.9802430542031645e-06, + "loss": 1.075, + "step": 9634 + }, + { + "epoch": 0.7786015879108669, + "grad_norm": 2.9617810249328613, + "learning_rate": 6.979642182592491e-06, + "loss": 0.8736, + "step": 9635 + }, + { + "epoch": 0.7786823976241944, + "grad_norm": 2.736532688140869, + "learning_rate": 6.979041277075313e-06, + "loss": 0.9148, + "step": 9636 + }, + { + "epoch": 0.778763207337522, + "grad_norm": 3.1691181659698486, + "learning_rate": 6.978440337661923e-06, + "loss": 1.0436, + "step": 9637 + }, + { + "epoch": 0.7788440170508495, + "grad_norm": 2.742832660675049, + "learning_rate": 6.977839364362612e-06, + "loss": 0.9721, + "step": 9638 + }, + { + "epoch": 0.7789248267641771, + "grad_norm": 2.6143288612365723, + "learning_rate": 6.977238357187675e-06, + "loss": 0.958, + "step": 9639 + }, + { + "epoch": 0.7790056364775046, + "grad_norm": 3.0187857151031494, + "learning_rate": 6.9766373161474054e-06, + "loss": 0.8361, + "step": 9640 + }, + { + "epoch": 0.7790864461908321, + "grad_norm": 2.5095255374908447, + "learning_rate": 6.976036241252095e-06, + "loss": 0.9714, + "step": 9641 + }, + { + "epoch": 0.7791672559041597, + "grad_norm": 2.383462905883789, + "learning_rate": 6.9754351325120426e-06, + "loss": 0.9255, + "step": 9642 + }, + { + "epoch": 0.7792480656174873, + "grad_norm": 3.18103289604187, + "learning_rate": 6.974833989937543e-06, + "loss": 0.7869, + "step": 9643 + }, + { + "epoch": 0.7793288753308147, + "grad_norm": 2.3761940002441406, + "learning_rate": 6.9742328135388896e-06, + "loss": 0.9537, + "step": 9644 + }, + { + "epoch": 0.7794096850441423, + "grad_norm": 2.1563880443573, + "learning_rate": 6.973631603326382e-06, + "loss": 1.1183, + "step": 9645 + }, + { + "epoch": 0.7794904947574699, + "grad_norm": 2.8754329681396484, + "learning_rate": 6.973030359310315e-06, + "loss": 0.8996, + "step": 9646 + }, + { + "epoch": 0.7795713044707974, + "grad_norm": 3.062407970428467, + "learning_rate": 6.972429081500989e-06, + "loss": 0.9424, + "step": 9647 + }, + { + "epoch": 0.779652114184125, + "grad_norm": 3.4805405139923096, + "learning_rate": 6.971827769908701e-06, + "loss": 0.9288, + "step": 9648 + }, + { + "epoch": 0.7797329238974525, + "grad_norm": 2.5562899112701416, + "learning_rate": 6.971226424543749e-06, + "loss": 0.882, + "step": 9649 + }, + { + "epoch": 0.77981373361078, + "grad_norm": 2.508535146713257, + "learning_rate": 6.970625045416435e-06, + "loss": 0.8526, + "step": 9650 + }, + { + "epoch": 0.7798945433241076, + "grad_norm": 2.536494016647339, + "learning_rate": 6.97002363253706e-06, + "loss": 0.9487, + "step": 9651 + }, + { + "epoch": 0.7799753530374351, + "grad_norm": 2.586965322494507, + "learning_rate": 6.96942218591592e-06, + "loss": 0.9016, + "step": 9652 + }, + { + "epoch": 0.7800561627507626, + "grad_norm": 2.5410730838775635, + "learning_rate": 6.968820705563319e-06, + "loss": 0.9202, + "step": 9653 + }, + { + "epoch": 0.7801369724640902, + "grad_norm": 2.654521942138672, + "learning_rate": 6.96821919148956e-06, + "loss": 0.912, + "step": 9654 + }, + { + "epoch": 0.7802177821774178, + "grad_norm": 2.298339366912842, + "learning_rate": 6.967617643704945e-06, + "loss": 0.9096, + "step": 9655 + }, + { + "epoch": 0.7802985918907452, + "grad_norm": 2.3644843101501465, + "learning_rate": 6.967016062219777e-06, + "loss": 1.0322, + "step": 9656 + }, + { + "epoch": 0.7803794016040728, + "grad_norm": 2.7085025310516357, + "learning_rate": 6.966414447044359e-06, + "loss": 0.9766, + "step": 9657 + }, + { + "epoch": 0.7804602113174004, + "grad_norm": 2.3096635341644287, + "learning_rate": 6.965812798188994e-06, + "loss": 0.9163, + "step": 9658 + }, + { + "epoch": 0.7805410210307279, + "grad_norm": 2.3417112827301025, + "learning_rate": 6.96521111566399e-06, + "loss": 0.9755, + "step": 9659 + }, + { + "epoch": 0.7806218307440554, + "grad_norm": 2.7602450847625732, + "learning_rate": 6.964609399479649e-06, + "loss": 1.0441, + "step": 9660 + }, + { + "epoch": 0.780702640457383, + "grad_norm": 2.400860548019409, + "learning_rate": 6.964007649646281e-06, + "loss": 0.8635, + "step": 9661 + }, + { + "epoch": 0.7807834501707105, + "grad_norm": 2.5426175594329834, + "learning_rate": 6.963405866174188e-06, + "loss": 0.8881, + "step": 9662 + }, + { + "epoch": 0.7808642598840381, + "grad_norm": 2.614553213119507, + "learning_rate": 6.962804049073679e-06, + "loss": 0.8525, + "step": 9663 + }, + { + "epoch": 0.7809450695973656, + "grad_norm": 2.7390501499176025, + "learning_rate": 6.962202198355062e-06, + "loss": 1.0068, + "step": 9664 + }, + { + "epoch": 0.7810258793106931, + "grad_norm": 2.543248176574707, + "learning_rate": 6.961600314028647e-06, + "loss": 0.9534, + "step": 9665 + }, + { + "epoch": 0.7811066890240207, + "grad_norm": 3.6785783767700195, + "learning_rate": 6.960998396104739e-06, + "loss": 0.9088, + "step": 9666 + }, + { + "epoch": 0.7811874987373483, + "grad_norm": 2.6666314601898193, + "learning_rate": 6.960396444593651e-06, + "loss": 0.977, + "step": 9667 + }, + { + "epoch": 0.7812683084506757, + "grad_norm": 2.5132291316986084, + "learning_rate": 6.959794459505691e-06, + "loss": 0.8372, + "step": 9668 + }, + { + "epoch": 0.7813491181640033, + "grad_norm": 2.5730397701263428, + "learning_rate": 6.959192440851169e-06, + "loss": 1.0269, + "step": 9669 + }, + { + "epoch": 0.7814299278773309, + "grad_norm": 3.234506607055664, + "learning_rate": 6.958590388640397e-06, + "loss": 1.0491, + "step": 9670 + }, + { + "epoch": 0.7815107375906584, + "grad_norm": 2.641125440597534, + "learning_rate": 6.957988302883688e-06, + "loss": 0.8949, + "step": 9671 + }, + { + "epoch": 0.781591547303986, + "grad_norm": 2.720885753631592, + "learning_rate": 6.957386183591351e-06, + "loss": 1.0241, + "step": 9672 + }, + { + "epoch": 0.7816723570173135, + "grad_norm": 2.790813446044922, + "learning_rate": 6.9567840307737035e-06, + "loss": 0.9351, + "step": 9673 + }, + { + "epoch": 0.781753166730641, + "grad_norm": 2.3688058853149414, + "learning_rate": 6.9561818444410545e-06, + "loss": 0.868, + "step": 9674 + }, + { + "epoch": 0.7818339764439686, + "grad_norm": 2.60577392578125, + "learning_rate": 6.955579624603721e-06, + "loss": 0.8864, + "step": 9675 + }, + { + "epoch": 0.7819147861572961, + "grad_norm": 2.5541322231292725, + "learning_rate": 6.954977371272016e-06, + "loss": 0.9899, + "step": 9676 + }, + { + "epoch": 0.7819955958706236, + "grad_norm": 2.799955129623413, + "learning_rate": 6.954375084456254e-06, + "loss": 0.9234, + "step": 9677 + }, + { + "epoch": 0.7820764055839512, + "grad_norm": 2.387197971343994, + "learning_rate": 6.953772764166753e-06, + "loss": 1.006, + "step": 9678 + }, + { + "epoch": 0.7821572152972788, + "grad_norm": 2.6300976276397705, + "learning_rate": 6.953170410413828e-06, + "loss": 0.8719, + "step": 9679 + }, + { + "epoch": 0.7822380250106062, + "grad_norm": 2.34307861328125, + "learning_rate": 6.952568023207795e-06, + "loss": 0.9994, + "step": 9680 + }, + { + "epoch": 0.7823188347239338, + "grad_norm": 2.5072860717773438, + "learning_rate": 6.951965602558973e-06, + "loss": 0.9497, + "step": 9681 + }, + { + "epoch": 0.7823996444372614, + "grad_norm": 2.631420850753784, + "learning_rate": 6.95136314847768e-06, + "loss": 0.9123, + "step": 9682 + }, + { + "epoch": 0.7824804541505889, + "grad_norm": 2.6064393520355225, + "learning_rate": 6.950760660974233e-06, + "loss": 1.0639, + "step": 9683 + }, + { + "epoch": 0.7825612638639164, + "grad_norm": 2.7990331649780273, + "learning_rate": 6.950158140058953e-06, + "loss": 0.7694, + "step": 9684 + }, + { + "epoch": 0.782642073577244, + "grad_norm": 2.9253673553466797, + "learning_rate": 6.949555585742157e-06, + "loss": 1.1346, + "step": 9685 + }, + { + "epoch": 0.7827228832905715, + "grad_norm": 2.707361936569214, + "learning_rate": 6.948952998034168e-06, + "loss": 0.9165, + "step": 9686 + }, + { + "epoch": 0.7828036930038991, + "grad_norm": 2.8470633029937744, + "learning_rate": 6.948350376945307e-06, + "loss": 0.941, + "step": 9687 + }, + { + "epoch": 0.7828845027172266, + "grad_norm": 2.722989320755005, + "learning_rate": 6.947747722485893e-06, + "loss": 0.9912, + "step": 9688 + }, + { + "epoch": 0.7829653124305541, + "grad_norm": 3.1429643630981445, + "learning_rate": 6.947145034666253e-06, + "loss": 0.9211, + "step": 9689 + }, + { + "epoch": 0.7830461221438817, + "grad_norm": 2.6342954635620117, + "learning_rate": 6.9465423134967035e-06, + "loss": 0.9303, + "step": 9690 + }, + { + "epoch": 0.7831269318572093, + "grad_norm": 2.8902268409729004, + "learning_rate": 6.94593955898757e-06, + "loss": 0.9141, + "step": 9691 + }, + { + "epoch": 0.7832077415705367, + "grad_norm": 2.7754554748535156, + "learning_rate": 6.945336771149177e-06, + "loss": 0.8759, + "step": 9692 + }, + { + "epoch": 0.7832885512838643, + "grad_norm": 2.7848026752471924, + "learning_rate": 6.9447339499918485e-06, + "loss": 0.9073, + "step": 9693 + }, + { + "epoch": 0.7833693609971919, + "grad_norm": 2.5997676849365234, + "learning_rate": 6.944131095525909e-06, + "loss": 0.8533, + "step": 9694 + }, + { + "epoch": 0.7834501707105194, + "grad_norm": 2.757568359375, + "learning_rate": 6.943528207761684e-06, + "loss": 0.9624, + "step": 9695 + }, + { + "epoch": 0.783530980423847, + "grad_norm": 2.446953535079956, + "learning_rate": 6.942925286709501e-06, + "loss": 0.9607, + "step": 9696 + }, + { + "epoch": 0.7836117901371745, + "grad_norm": 2.747488260269165, + "learning_rate": 6.942322332379683e-06, + "loss": 0.9538, + "step": 9697 + }, + { + "epoch": 0.783692599850502, + "grad_norm": 3.0803427696228027, + "learning_rate": 6.94171934478256e-06, + "loss": 0.972, + "step": 9698 + }, + { + "epoch": 0.7837734095638296, + "grad_norm": 2.4530720710754395, + "learning_rate": 6.94111632392846e-06, + "loss": 1.0084, + "step": 9699 + }, + { + "epoch": 0.7838542192771571, + "grad_norm": 2.7214879989624023, + "learning_rate": 6.94051326982771e-06, + "loss": 1.0035, + "step": 9700 + }, + { + "epoch": 0.7839350289904846, + "grad_norm": 2.7606163024902344, + "learning_rate": 6.939910182490639e-06, + "loss": 0.9441, + "step": 9701 + }, + { + "epoch": 0.7840158387038122, + "grad_norm": 2.752997875213623, + "learning_rate": 6.939307061927577e-06, + "loss": 0.9443, + "step": 9702 + }, + { + "epoch": 0.7840966484171398, + "grad_norm": 2.579437732696533, + "learning_rate": 6.938703908148854e-06, + "loss": 1.0311, + "step": 9703 + }, + { + "epoch": 0.7841774581304672, + "grad_norm": 2.837007761001587, + "learning_rate": 6.9381007211648e-06, + "loss": 0.8975, + "step": 9704 + }, + { + "epoch": 0.7842582678437948, + "grad_norm": 2.9079701900482178, + "learning_rate": 6.937497500985746e-06, + "loss": 0.9381, + "step": 9705 + }, + { + "epoch": 0.7843390775571224, + "grad_norm": 2.8375697135925293, + "learning_rate": 6.936894247622026e-06, + "loss": 1.0136, + "step": 9706 + }, + { + "epoch": 0.7844198872704499, + "grad_norm": 2.6654367446899414, + "learning_rate": 6.936290961083968e-06, + "loss": 0.9365, + "step": 9707 + }, + { + "epoch": 0.7845006969837774, + "grad_norm": 3.0772650241851807, + "learning_rate": 6.935687641381908e-06, + "loss": 1.1799, + "step": 9708 + }, + { + "epoch": 0.784581506697105, + "grad_norm": 2.334848165512085, + "learning_rate": 6.935084288526179e-06, + "loss": 0.9563, + "step": 9709 + }, + { + "epoch": 0.7846623164104325, + "grad_norm": 2.4315085411071777, + "learning_rate": 6.934480902527115e-06, + "loss": 0.8677, + "step": 9710 + }, + { + "epoch": 0.7847431261237601, + "grad_norm": 2.638908624649048, + "learning_rate": 6.93387748339505e-06, + "loss": 0.8608, + "step": 9711 + }, + { + "epoch": 0.7848239358370876, + "grad_norm": 2.3801815509796143, + "learning_rate": 6.933274031140319e-06, + "loss": 0.9107, + "step": 9712 + }, + { + "epoch": 0.7849047455504151, + "grad_norm": 2.940345525741577, + "learning_rate": 6.932670545773259e-06, + "loss": 0.8846, + "step": 9713 + }, + { + "epoch": 0.7849855552637427, + "grad_norm": 3.1622703075408936, + "learning_rate": 6.9320670273042034e-06, + "loss": 0.8589, + "step": 9714 + }, + { + "epoch": 0.7850663649770703, + "grad_norm": 2.639561891555786, + "learning_rate": 6.931463475743492e-06, + "loss": 0.9901, + "step": 9715 + }, + { + "epoch": 0.7851471746903977, + "grad_norm": 2.6701161861419678, + "learning_rate": 6.930859891101461e-06, + "loss": 0.9628, + "step": 9716 + }, + { + "epoch": 0.7852279844037253, + "grad_norm": 2.7424087524414062, + "learning_rate": 6.930256273388448e-06, + "loss": 0.9713, + "step": 9717 + }, + { + "epoch": 0.7853087941170529, + "grad_norm": 2.560697555541992, + "learning_rate": 6.929652622614793e-06, + "loss": 0.857, + "step": 9718 + }, + { + "epoch": 0.7853896038303804, + "grad_norm": 2.141119956970215, + "learning_rate": 6.929048938790832e-06, + "loss": 1.0336, + "step": 9719 + }, + { + "epoch": 0.785470413543708, + "grad_norm": 2.396378517150879, + "learning_rate": 6.928445221926909e-06, + "loss": 0.9565, + "step": 9720 + }, + { + "epoch": 0.7855512232570355, + "grad_norm": 2.7309021949768066, + "learning_rate": 6.927841472033362e-06, + "loss": 0.8041, + "step": 9721 + }, + { + "epoch": 0.785632032970363, + "grad_norm": 3.081422805786133, + "learning_rate": 6.9272376891205296e-06, + "loss": 1.0131, + "step": 9722 + }, + { + "epoch": 0.7857128426836906, + "grad_norm": 2.583186388015747, + "learning_rate": 6.926633873198757e-06, + "loss": 0.8351, + "step": 9723 + }, + { + "epoch": 0.7857936523970181, + "grad_norm": 2.7975621223449707, + "learning_rate": 6.926030024278384e-06, + "loss": 0.8724, + "step": 9724 + }, + { + "epoch": 0.7858744621103456, + "grad_norm": 2.965134620666504, + "learning_rate": 6.925426142369752e-06, + "loss": 0.877, + "step": 9725 + }, + { + "epoch": 0.7859552718236732, + "grad_norm": 2.5071730613708496, + "learning_rate": 6.924822227483208e-06, + "loss": 0.9737, + "step": 9726 + }, + { + "epoch": 0.7860360815370008, + "grad_norm": 2.840658664703369, + "learning_rate": 6.92421827962909e-06, + "loss": 0.9779, + "step": 9727 + }, + { + "epoch": 0.7861168912503282, + "grad_norm": 3.170917272567749, + "learning_rate": 6.923614298817747e-06, + "loss": 0.9217, + "step": 9728 + }, + { + "epoch": 0.7861977009636558, + "grad_norm": 2.484788179397583, + "learning_rate": 6.923010285059521e-06, + "loss": 1.0302, + "step": 9729 + }, + { + "epoch": 0.7862785106769834, + "grad_norm": 2.6187918186187744, + "learning_rate": 6.9224062383647595e-06, + "loss": 1.0012, + "step": 9730 + }, + { + "epoch": 0.7863593203903109, + "grad_norm": 3.17228627204895, + "learning_rate": 6.921802158743807e-06, + "loss": 0.9174, + "step": 9731 + }, + { + "epoch": 0.7864401301036384, + "grad_norm": 2.9996063709259033, + "learning_rate": 6.92119804620701e-06, + "loss": 0.9287, + "step": 9732 + }, + { + "epoch": 0.786520939816966, + "grad_norm": 3.090010166168213, + "learning_rate": 6.920593900764714e-06, + "loss": 0.9699, + "step": 9733 + }, + { + "epoch": 0.7866017495302935, + "grad_norm": 2.606588125228882, + "learning_rate": 6.91998972242727e-06, + "loss": 0.9704, + "step": 9734 + }, + { + "epoch": 0.7866825592436211, + "grad_norm": 2.7972769737243652, + "learning_rate": 6.919385511205024e-06, + "loss": 0.8863, + "step": 9735 + }, + { + "epoch": 0.7867633689569486, + "grad_norm": 2.594951868057251, + "learning_rate": 6.918781267108324e-06, + "loss": 0.9085, + "step": 9736 + }, + { + "epoch": 0.7868441786702761, + "grad_norm": 3.1590778827667236, + "learning_rate": 6.918176990147522e-06, + "loss": 0.8542, + "step": 9737 + }, + { + "epoch": 0.7869249883836037, + "grad_norm": 2.806760311126709, + "learning_rate": 6.917572680332965e-06, + "loss": 0.911, + "step": 9738 + }, + { + "epoch": 0.7870057980969313, + "grad_norm": 2.36431622505188, + "learning_rate": 6.916968337675003e-06, + "loss": 0.902, + "step": 9739 + }, + { + "epoch": 0.7870866078102587, + "grad_norm": 2.8250107765197754, + "learning_rate": 6.91636396218399e-06, + "loss": 0.8955, + "step": 9740 + }, + { + "epoch": 0.7871674175235863, + "grad_norm": 2.675884962081909, + "learning_rate": 6.915759553870275e-06, + "loss": 0.8415, + "step": 9741 + }, + { + "epoch": 0.7872482272369139, + "grad_norm": 2.768265962600708, + "learning_rate": 6.915155112744211e-06, + "loss": 0.8819, + "step": 9742 + }, + { + "epoch": 0.7873290369502414, + "grad_norm": 2.699328899383545, + "learning_rate": 6.91455063881615e-06, + "loss": 0.8715, + "step": 9743 + }, + { + "epoch": 0.787409846663569, + "grad_norm": 2.888277292251587, + "learning_rate": 6.913946132096447e-06, + "loss": 0.9695, + "step": 9744 + }, + { + "epoch": 0.7874906563768965, + "grad_norm": 2.8600375652313232, + "learning_rate": 6.913341592595453e-06, + "loss": 0.915, + "step": 9745 + }, + { + "epoch": 0.787571466090224, + "grad_norm": 2.665877342224121, + "learning_rate": 6.912737020323523e-06, + "loss": 1.0342, + "step": 9746 + }, + { + "epoch": 0.7876522758035516, + "grad_norm": 2.548896074295044, + "learning_rate": 6.912132415291014e-06, + "loss": 0.9424, + "step": 9747 + }, + { + "epoch": 0.7877330855168792, + "grad_norm": 2.6893019676208496, + "learning_rate": 6.91152777750828e-06, + "loss": 0.7621, + "step": 9748 + }, + { + "epoch": 0.7878138952302066, + "grad_norm": 2.669121026992798, + "learning_rate": 6.910923106985678e-06, + "loss": 0.8083, + "step": 9749 + }, + { + "epoch": 0.7878947049435342, + "grad_norm": 2.5475196838378906, + "learning_rate": 6.9103184037335615e-06, + "loss": 0.8622, + "step": 9750 + }, + { + "epoch": 0.7879755146568618, + "grad_norm": 2.509610652923584, + "learning_rate": 6.90971366776229e-06, + "loss": 0.8882, + "step": 9751 + }, + { + "epoch": 0.7880563243701892, + "grad_norm": 3.3432157039642334, + "learning_rate": 6.909108899082222e-06, + "loss": 0.9586, + "step": 9752 + }, + { + "epoch": 0.7881371340835168, + "grad_norm": 2.9079229831695557, + "learning_rate": 6.908504097703713e-06, + "loss": 0.8835, + "step": 9753 + }, + { + "epoch": 0.7882179437968444, + "grad_norm": 2.9204115867614746, + "learning_rate": 6.9078992636371246e-06, + "loss": 1.0229, + "step": 9754 + }, + { + "epoch": 0.7882987535101719, + "grad_norm": 2.441419839859009, + "learning_rate": 6.907294396892815e-06, + "loss": 0.8887, + "step": 9755 + }, + { + "epoch": 0.7883795632234994, + "grad_norm": 3.024362087249756, + "learning_rate": 6.9066894974811425e-06, + "loss": 0.9457, + "step": 9756 + }, + { + "epoch": 0.788460372936827, + "grad_norm": 2.8817200660705566, + "learning_rate": 6.906084565412471e-06, + "loss": 0.8425, + "step": 9757 + }, + { + "epoch": 0.7885411826501546, + "grad_norm": 2.3224284648895264, + "learning_rate": 6.905479600697158e-06, + "loss": 0.9562, + "step": 9758 + }, + { + "epoch": 0.7886219923634821, + "grad_norm": 2.697815179824829, + "learning_rate": 6.9048746033455675e-06, + "loss": 0.9303, + "step": 9759 + }, + { + "epoch": 0.7887028020768097, + "grad_norm": 2.7465643882751465, + "learning_rate": 6.904269573368061e-06, + "loss": 0.9734, + "step": 9760 + }, + { + "epoch": 0.7887836117901372, + "grad_norm": 3.1014504432678223, + "learning_rate": 6.903664510775e-06, + "loss": 0.9998, + "step": 9761 + }, + { + "epoch": 0.7888644215034647, + "grad_norm": 2.543888807296753, + "learning_rate": 6.90305941557675e-06, + "loss": 0.8978, + "step": 9762 + }, + { + "epoch": 0.7889452312167923, + "grad_norm": 3.0319859981536865, + "learning_rate": 6.9024542877836735e-06, + "loss": 0.9864, + "step": 9763 + }, + { + "epoch": 0.7890260409301199, + "grad_norm": 3.4651505947113037, + "learning_rate": 6.9018491274061325e-06, + "loss": 1.0205, + "step": 9764 + }, + { + "epoch": 0.7891068506434473, + "grad_norm": 2.6538500785827637, + "learning_rate": 6.901243934454498e-06, + "loss": 0.9339, + "step": 9765 + }, + { + "epoch": 0.7891876603567749, + "grad_norm": 2.7711353302001953, + "learning_rate": 6.9006387089391315e-06, + "loss": 0.9507, + "step": 9766 + }, + { + "epoch": 0.7892684700701025, + "grad_norm": 2.5237932205200195, + "learning_rate": 6.900033450870398e-06, + "loss": 0.9447, + "step": 9767 + }, + { + "epoch": 0.78934927978343, + "grad_norm": 2.416130304336548, + "learning_rate": 6.899428160258665e-06, + "loss": 0.8435, + "step": 9768 + }, + { + "epoch": 0.7894300894967575, + "grad_norm": 2.563319444656372, + "learning_rate": 6.8988228371143025e-06, + "loss": 0.8626, + "step": 9769 + }, + { + "epoch": 0.7895108992100851, + "grad_norm": 2.9576029777526855, + "learning_rate": 6.898217481447675e-06, + "loss": 0.8971, + "step": 9770 + }, + { + "epoch": 0.7895917089234126, + "grad_norm": 2.78456974029541, + "learning_rate": 6.897612093269153e-06, + "loss": 0.9585, + "step": 9771 + }, + { + "epoch": 0.7896725186367402, + "grad_norm": 2.344691038131714, + "learning_rate": 6.897006672589102e-06, + "loss": 1.0789, + "step": 9772 + }, + { + "epoch": 0.7897533283500677, + "grad_norm": 2.1248655319213867, + "learning_rate": 6.896401219417896e-06, + "loss": 0.9265, + "step": 9773 + }, + { + "epoch": 0.7898341380633952, + "grad_norm": 2.875714063644409, + "learning_rate": 6.8957957337659e-06, + "loss": 0.9438, + "step": 9774 + }, + { + "epoch": 0.7899149477767228, + "grad_norm": 3.0175061225891113, + "learning_rate": 6.895190215643488e-06, + "loss": 0.9157, + "step": 9775 + }, + { + "epoch": 0.7899957574900504, + "grad_norm": 2.4655911922454834, + "learning_rate": 6.89458466506103e-06, + "loss": 1.0023, + "step": 9776 + }, + { + "epoch": 0.7900765672033778, + "grad_norm": 2.8080813884735107, + "learning_rate": 6.893979082028899e-06, + "loss": 0.8888, + "step": 9777 + }, + { + "epoch": 0.7901573769167054, + "grad_norm": 2.922720432281494, + "learning_rate": 6.893373466557464e-06, + "loss": 0.8943, + "step": 9778 + }, + { + "epoch": 0.790238186630033, + "grad_norm": 2.578737258911133, + "learning_rate": 6.892767818657101e-06, + "loss": 0.9522, + "step": 9779 + }, + { + "epoch": 0.7903189963433604, + "grad_norm": 2.356459379196167, + "learning_rate": 6.892162138338181e-06, + "loss": 0.8621, + "step": 9780 + }, + { + "epoch": 0.790399806056688, + "grad_norm": 2.190770149230957, + "learning_rate": 6.891556425611079e-06, + "loss": 1.0324, + "step": 9781 + }, + { + "epoch": 0.7904806157700156, + "grad_norm": 3.4735827445983887, + "learning_rate": 6.890950680486169e-06, + "loss": 0.8301, + "step": 9782 + }, + { + "epoch": 0.7905614254833431, + "grad_norm": 3.018254041671753, + "learning_rate": 6.8903449029738265e-06, + "loss": 0.8631, + "step": 9783 + }, + { + "epoch": 0.7906422351966707, + "grad_norm": 3.063438892364502, + "learning_rate": 6.889739093084427e-06, + "loss": 0.889, + "step": 9784 + }, + { + "epoch": 0.7907230449099982, + "grad_norm": 3.142925262451172, + "learning_rate": 6.889133250828346e-06, + "loss": 0.7975, + "step": 9785 + }, + { + "epoch": 0.7908038546233257, + "grad_norm": 2.408189058303833, + "learning_rate": 6.888527376215959e-06, + "loss": 0.8847, + "step": 9786 + }, + { + "epoch": 0.7908846643366533, + "grad_norm": 2.9360907077789307, + "learning_rate": 6.887921469257647e-06, + "loss": 0.8699, + "step": 9787 + }, + { + "epoch": 0.7909654740499809, + "grad_norm": 2.627872943878174, + "learning_rate": 6.887315529963784e-06, + "loss": 0.9903, + "step": 9788 + }, + { + "epoch": 0.7910462837633083, + "grad_norm": 2.3716421127319336, + "learning_rate": 6.886709558344748e-06, + "loss": 0.8687, + "step": 9789 + }, + { + "epoch": 0.7911270934766359, + "grad_norm": 2.794773578643799, + "learning_rate": 6.886103554410921e-06, + "loss": 1.019, + "step": 9790 + }, + { + "epoch": 0.7912079031899635, + "grad_norm": 2.796485662460327, + "learning_rate": 6.885497518172681e-06, + "loss": 1.0813, + "step": 9791 + }, + { + "epoch": 0.791288712903291, + "grad_norm": 2.507392168045044, + "learning_rate": 6.884891449640407e-06, + "loss": 0.9772, + "step": 9792 + }, + { + "epoch": 0.7913695226166185, + "grad_norm": 2.811211109161377, + "learning_rate": 6.88428534882448e-06, + "loss": 0.898, + "step": 9793 + }, + { + "epoch": 0.7914503323299461, + "grad_norm": 3.025217294692993, + "learning_rate": 6.883679215735282e-06, + "loss": 0.9232, + "step": 9794 + }, + { + "epoch": 0.7915311420432736, + "grad_norm": 2.4453773498535156, + "learning_rate": 6.883073050383193e-06, + "loss": 0.9294, + "step": 9795 + }, + { + "epoch": 0.7916119517566012, + "grad_norm": 2.9261538982391357, + "learning_rate": 6.8824668527785954e-06, + "loss": 0.9779, + "step": 9796 + }, + { + "epoch": 0.7916927614699287, + "grad_norm": 2.2668650150299072, + "learning_rate": 6.881860622931873e-06, + "loss": 1.049, + "step": 9797 + }, + { + "epoch": 0.7917735711832562, + "grad_norm": 2.7034614086151123, + "learning_rate": 6.881254360853409e-06, + "loss": 0.9288, + "step": 9798 + }, + { + "epoch": 0.7918543808965838, + "grad_norm": 2.669090986251831, + "learning_rate": 6.880648066553588e-06, + "loss": 0.8742, + "step": 9799 + }, + { + "epoch": 0.7919351906099114, + "grad_norm": 3.4627444744110107, + "learning_rate": 6.88004174004279e-06, + "loss": 0.8534, + "step": 9800 + }, + { + "epoch": 0.7920160003232388, + "grad_norm": 2.5556793212890625, + "learning_rate": 6.879435381331405e-06, + "loss": 0.8797, + "step": 9801 + }, + { + "epoch": 0.7920968100365664, + "grad_norm": 2.601912498474121, + "learning_rate": 6.8788289904298155e-06, + "loss": 0.8918, + "step": 9802 + }, + { + "epoch": 0.792177619749894, + "grad_norm": 2.544543981552124, + "learning_rate": 6.878222567348409e-06, + "loss": 0.8968, + "step": 9803 + }, + { + "epoch": 0.7922584294632214, + "grad_norm": 3.0917084217071533, + "learning_rate": 6.8776161120975714e-06, + "loss": 0.9293, + "step": 9804 + }, + { + "epoch": 0.792339239176549, + "grad_norm": 2.291499376296997, + "learning_rate": 6.877009624687691e-06, + "loss": 0.8988, + "step": 9805 + }, + { + "epoch": 0.7924200488898766, + "grad_norm": 2.528583526611328, + "learning_rate": 6.8764031051291535e-06, + "loss": 0.95, + "step": 9806 + }, + { + "epoch": 0.7925008586032041, + "grad_norm": 2.412081241607666, + "learning_rate": 6.875796553432349e-06, + "loss": 0.9062, + "step": 9807 + }, + { + "epoch": 0.7925816683165317, + "grad_norm": 2.9253103733062744, + "learning_rate": 6.875189969607664e-06, + "loss": 0.9501, + "step": 9808 + }, + { + "epoch": 0.7926624780298592, + "grad_norm": 2.699721574783325, + "learning_rate": 6.8745833536654895e-06, + "loss": 1.0665, + "step": 9809 + }, + { + "epoch": 0.7927432877431867, + "grad_norm": 2.3653342723846436, + "learning_rate": 6.873976705616215e-06, + "loss": 0.9201, + "step": 9810 + }, + { + "epoch": 0.7928240974565143, + "grad_norm": 2.5907254219055176, + "learning_rate": 6.873370025470232e-06, + "loss": 0.955, + "step": 9811 + }, + { + "epoch": 0.7929049071698419, + "grad_norm": 2.5645127296447754, + "learning_rate": 6.872763313237929e-06, + "loss": 0.8921, + "step": 9812 + }, + { + "epoch": 0.7929857168831693, + "grad_norm": 2.475836992263794, + "learning_rate": 6.8721565689297e-06, + "loss": 0.9178, + "step": 9813 + }, + { + "epoch": 0.7930665265964969, + "grad_norm": 2.7445790767669678, + "learning_rate": 6.871549792555935e-06, + "loss": 0.9757, + "step": 9814 + }, + { + "epoch": 0.7931473363098245, + "grad_norm": 2.585686445236206, + "learning_rate": 6.870942984127029e-06, + "loss": 0.8806, + "step": 9815 + }, + { + "epoch": 0.793228146023152, + "grad_norm": 2.713639974594116, + "learning_rate": 6.870336143653372e-06, + "loss": 0.9453, + "step": 9816 + }, + { + "epoch": 0.7933089557364795, + "grad_norm": 2.8230977058410645, + "learning_rate": 6.86972927114536e-06, + "loss": 0.9789, + "step": 9817 + }, + { + "epoch": 0.7933897654498071, + "grad_norm": 3.0862629413604736, + "learning_rate": 6.869122366613387e-06, + "loss": 0.9532, + "step": 9818 + }, + { + "epoch": 0.7934705751631346, + "grad_norm": 2.866936445236206, + "learning_rate": 6.868515430067848e-06, + "loss": 0.9657, + "step": 9819 + }, + { + "epoch": 0.7935513848764622, + "grad_norm": 2.8744053840637207, + "learning_rate": 6.867908461519138e-06, + "loss": 0.9024, + "step": 9820 + }, + { + "epoch": 0.7936321945897897, + "grad_norm": 2.412216901779175, + "learning_rate": 6.867301460977652e-06, + "loss": 0.9955, + "step": 9821 + }, + { + "epoch": 0.7937130043031172, + "grad_norm": 2.5999553203582764, + "learning_rate": 6.86669442845379e-06, + "loss": 1.0087, + "step": 9822 + }, + { + "epoch": 0.7937938140164448, + "grad_norm": 2.983273506164551, + "learning_rate": 6.866087363957943e-06, + "loss": 0.9415, + "step": 9823 + }, + { + "epoch": 0.7938746237297724, + "grad_norm": 2.738760232925415, + "learning_rate": 6.865480267500514e-06, + "loss": 0.9421, + "step": 9824 + }, + { + "epoch": 0.7939554334430998, + "grad_norm": 2.6678097248077393, + "learning_rate": 6.864873139091897e-06, + "loss": 0.9528, + "step": 9825 + }, + { + "epoch": 0.7940362431564274, + "grad_norm": 2.8385188579559326, + "learning_rate": 6.864265978742494e-06, + "loss": 0.9275, + "step": 9826 + }, + { + "epoch": 0.794117052869755, + "grad_norm": 2.6046605110168457, + "learning_rate": 6.863658786462702e-06, + "loss": 0.9594, + "step": 9827 + }, + { + "epoch": 0.7941978625830824, + "grad_norm": 2.5831477642059326, + "learning_rate": 6.863051562262922e-06, + "loss": 0.9989, + "step": 9828 + }, + { + "epoch": 0.79427867229641, + "grad_norm": 2.5561978816986084, + "learning_rate": 6.862444306153555e-06, + "loss": 0.8679, + "step": 9829 + }, + { + "epoch": 0.7943594820097376, + "grad_norm": 3.213993549346924, + "learning_rate": 6.861837018145e-06, + "loss": 0.9804, + "step": 9830 + }, + { + "epoch": 0.7944402917230651, + "grad_norm": 2.7921385765075684, + "learning_rate": 6.861229698247658e-06, + "loss": 0.8718, + "step": 9831 + }, + { + "epoch": 0.7945211014363927, + "grad_norm": 2.426623582839966, + "learning_rate": 6.860622346471933e-06, + "loss": 0.873, + "step": 9832 + }, + { + "epoch": 0.7946019111497202, + "grad_norm": 2.5576171875, + "learning_rate": 6.8600149628282265e-06, + "loss": 0.837, + "step": 9833 + }, + { + "epoch": 0.7946827208630477, + "grad_norm": 2.269954204559326, + "learning_rate": 6.859407547326941e-06, + "loss": 0.96, + "step": 9834 + }, + { + "epoch": 0.7947635305763753, + "grad_norm": 2.6326749324798584, + "learning_rate": 6.85880009997848e-06, + "loss": 0.9024, + "step": 9835 + }, + { + "epoch": 0.7948443402897029, + "grad_norm": 2.807593822479248, + "learning_rate": 6.85819262079325e-06, + "loss": 0.9605, + "step": 9836 + }, + { + "epoch": 0.7949251500030303, + "grad_norm": 2.447377920150757, + "learning_rate": 6.857585109781652e-06, + "loss": 0.935, + "step": 9837 + }, + { + "epoch": 0.7950059597163579, + "grad_norm": 2.6915740966796875, + "learning_rate": 6.856977566954095e-06, + "loss": 0.886, + "step": 9838 + }, + { + "epoch": 0.7950867694296855, + "grad_norm": 2.9169421195983887, + "learning_rate": 6.8563699923209794e-06, + "loss": 0.941, + "step": 9839 + }, + { + "epoch": 0.795167579143013, + "grad_norm": 2.1841347217559814, + "learning_rate": 6.855762385892718e-06, + "loss": 0.9259, + "step": 9840 + }, + { + "epoch": 0.7952483888563405, + "grad_norm": 2.310147762298584, + "learning_rate": 6.855154747679713e-06, + "loss": 0.8719, + "step": 9841 + }, + { + "epoch": 0.7953291985696681, + "grad_norm": 2.3946635723114014, + "learning_rate": 6.854547077692374e-06, + "loss": 0.9772, + "step": 9842 + }, + { + "epoch": 0.7954100082829956, + "grad_norm": 2.599512815475464, + "learning_rate": 6.853939375941108e-06, + "loss": 0.9794, + "step": 9843 + }, + { + "epoch": 0.7954908179963232, + "grad_norm": 2.9427289962768555, + "learning_rate": 6.853331642436325e-06, + "loss": 0.8381, + "step": 9844 + }, + { + "epoch": 0.7955716277096507, + "grad_norm": 2.641378164291382, + "learning_rate": 6.85272387718843e-06, + "loss": 0.9253, + "step": 9845 + }, + { + "epoch": 0.7956524374229782, + "grad_norm": 2.788924217224121, + "learning_rate": 6.852116080207837e-06, + "loss": 1.0463, + "step": 9846 + }, + { + "epoch": 0.7957332471363058, + "grad_norm": 3.3630847930908203, + "learning_rate": 6.8515082515049535e-06, + "loss": 0.9495, + "step": 9847 + }, + { + "epoch": 0.7958140568496334, + "grad_norm": 2.465693712234497, + "learning_rate": 6.850900391090191e-06, + "loss": 0.8036, + "step": 9848 + }, + { + "epoch": 0.7958948665629608, + "grad_norm": 2.459573745727539, + "learning_rate": 6.850292498973962e-06, + "loss": 0.956, + "step": 9849 + }, + { + "epoch": 0.7959756762762884, + "grad_norm": 2.578242778778076, + "learning_rate": 6.849684575166676e-06, + "loss": 0.966, + "step": 9850 + }, + { + "epoch": 0.796056485989616, + "grad_norm": 2.8725247383117676, + "learning_rate": 6.849076619678745e-06, + "loss": 0.9189, + "step": 9851 + }, + { + "epoch": 0.7961372957029434, + "grad_norm": 2.562953472137451, + "learning_rate": 6.848468632520585e-06, + "loss": 0.9005, + "step": 9852 + }, + { + "epoch": 0.796218105416271, + "grad_norm": 2.785662889480591, + "learning_rate": 6.847860613702605e-06, + "loss": 1.0629, + "step": 9853 + }, + { + "epoch": 0.7962989151295986, + "grad_norm": 2.6662254333496094, + "learning_rate": 6.847252563235224e-06, + "loss": 0.9215, + "step": 9854 + }, + { + "epoch": 0.7963797248429261, + "grad_norm": 2.75342059135437, + "learning_rate": 6.846644481128852e-06, + "loss": 1.0127, + "step": 9855 + }, + { + "epoch": 0.7964605345562537, + "grad_norm": 2.9609148502349854, + "learning_rate": 6.8460363673939055e-06, + "loss": 0.9866, + "step": 9856 + }, + { + "epoch": 0.7965413442695812, + "grad_norm": 3.680875539779663, + "learning_rate": 6.8454282220408005e-06, + "loss": 0.9998, + "step": 9857 + }, + { + "epoch": 0.7966221539829087, + "grad_norm": 2.6819679737091064, + "learning_rate": 6.844820045079954e-06, + "loss": 1.005, + "step": 9858 + }, + { + "epoch": 0.7967029636962363, + "grad_norm": 2.4925944805145264, + "learning_rate": 6.844211836521779e-06, + "loss": 0.9933, + "step": 9859 + }, + { + "epoch": 0.7967837734095639, + "grad_norm": 2.279554843902588, + "learning_rate": 6.843603596376697e-06, + "loss": 0.8336, + "step": 9860 + }, + { + "epoch": 0.7968645831228913, + "grad_norm": 2.840665817260742, + "learning_rate": 6.842995324655123e-06, + "loss": 0.8977, + "step": 9861 + }, + { + "epoch": 0.7969453928362189, + "grad_norm": 2.678210496902466, + "learning_rate": 6.842387021367476e-06, + "loss": 0.8747, + "step": 9862 + }, + { + "epoch": 0.7970262025495465, + "grad_norm": 2.3635852336883545, + "learning_rate": 6.841778686524174e-06, + "loss": 1.137, + "step": 9863 + }, + { + "epoch": 0.797107012262874, + "grad_norm": 2.741825819015503, + "learning_rate": 6.8411703201356385e-06, + "loss": 1.0271, + "step": 9864 + }, + { + "epoch": 0.7971878219762015, + "grad_norm": 2.799586772918701, + "learning_rate": 6.840561922212285e-06, + "loss": 1.0293, + "step": 9865 + }, + { + "epoch": 0.7972686316895291, + "grad_norm": 2.8495535850524902, + "learning_rate": 6.8399534927645396e-06, + "loss": 0.9574, + "step": 9866 + }, + { + "epoch": 0.7973494414028566, + "grad_norm": 2.7940566539764404, + "learning_rate": 6.839345031802819e-06, + "loss": 0.9242, + "step": 9867 + }, + { + "epoch": 0.7974302511161842, + "grad_norm": 2.7938547134399414, + "learning_rate": 6.838736539337547e-06, + "loss": 0.9976, + "step": 9868 + }, + { + "epoch": 0.7975110608295117, + "grad_norm": 2.463744640350342, + "learning_rate": 6.838128015379144e-06, + "loss": 0.999, + "step": 9869 + }, + { + "epoch": 0.7975918705428392, + "grad_norm": 2.3955326080322266, + "learning_rate": 6.837519459938034e-06, + "loss": 1.0032, + "step": 9870 + }, + { + "epoch": 0.7976726802561668, + "grad_norm": 2.0644900798797607, + "learning_rate": 6.836910873024637e-06, + "loss": 0.9661, + "step": 9871 + }, + { + "epoch": 0.7977534899694944, + "grad_norm": 3.0256295204162598, + "learning_rate": 6.836302254649382e-06, + "loss": 0.9018, + "step": 9872 + }, + { + "epoch": 0.7978342996828218, + "grad_norm": 2.729816198348999, + "learning_rate": 6.835693604822687e-06, + "loss": 0.8376, + "step": 9873 + }, + { + "epoch": 0.7979151093961494, + "grad_norm": 2.375227451324463, + "learning_rate": 6.835084923554982e-06, + "loss": 1.035, + "step": 9874 + }, + { + "epoch": 0.797995919109477, + "grad_norm": 2.579192876815796, + "learning_rate": 6.834476210856689e-06, + "loss": 1.0178, + "step": 9875 + }, + { + "epoch": 0.7980767288228044, + "grad_norm": 2.5040221214294434, + "learning_rate": 6.833867466738235e-06, + "loss": 1.0459, + "step": 9876 + }, + { + "epoch": 0.798157538536132, + "grad_norm": 2.7196125984191895, + "learning_rate": 6.833258691210048e-06, + "loss": 1.0056, + "step": 9877 + }, + { + "epoch": 0.7982383482494596, + "grad_norm": 2.4624030590057373, + "learning_rate": 6.83264988428255e-06, + "loss": 0.9431, + "step": 9878 + }, + { + "epoch": 0.7983191579627871, + "grad_norm": 2.6063098907470703, + "learning_rate": 6.832041045966174e-06, + "loss": 0.8946, + "step": 9879 + }, + { + "epoch": 0.7983999676761147, + "grad_norm": 2.7179946899414062, + "learning_rate": 6.831432176271345e-06, + "loss": 0.9484, + "step": 9880 + }, + { + "epoch": 0.7984807773894422, + "grad_norm": 2.3139700889587402, + "learning_rate": 6.830823275208489e-06, + "loss": 0.856, + "step": 9881 + }, + { + "epoch": 0.7985615871027697, + "grad_norm": 2.4322259426116943, + "learning_rate": 6.8302143427880405e-06, + "loss": 0.9994, + "step": 9882 + }, + { + "epoch": 0.7986423968160973, + "grad_norm": 2.5613677501678467, + "learning_rate": 6.829605379020425e-06, + "loss": 0.8508, + "step": 9883 + }, + { + "epoch": 0.7987232065294249, + "grad_norm": 2.6813414096832275, + "learning_rate": 6.828996383916076e-06, + "loss": 0.904, + "step": 9884 + }, + { + "epoch": 0.7988040162427523, + "grad_norm": 2.4728004932403564, + "learning_rate": 6.82838735748542e-06, + "loss": 0.9978, + "step": 9885 + }, + { + "epoch": 0.7988848259560799, + "grad_norm": 2.4219539165496826, + "learning_rate": 6.827778299738891e-06, + "loss": 0.9248, + "step": 9886 + }, + { + "epoch": 0.7989656356694075, + "grad_norm": 2.4130876064300537, + "learning_rate": 6.8271692106869195e-06, + "loss": 0.8378, + "step": 9887 + }, + { + "epoch": 0.7990464453827351, + "grad_norm": 2.6504530906677246, + "learning_rate": 6.826560090339939e-06, + "loss": 0.8864, + "step": 9888 + }, + { + "epoch": 0.7991272550960625, + "grad_norm": 2.934321403503418, + "learning_rate": 6.825950938708381e-06, + "loss": 1.0809, + "step": 9889 + }, + { + "epoch": 0.7992080648093901, + "grad_norm": 2.4860029220581055, + "learning_rate": 6.825341755802679e-06, + "loss": 0.94, + "step": 9890 + }, + { + "epoch": 0.7992888745227177, + "grad_norm": 2.6035618782043457, + "learning_rate": 6.8247325416332675e-06, + "loss": 0.9673, + "step": 9891 + }, + { + "epoch": 0.7993696842360452, + "grad_norm": 2.690329074859619, + "learning_rate": 6.824123296210579e-06, + "loss": 0.9245, + "step": 9892 + }, + { + "epoch": 0.7994504939493727, + "grad_norm": 2.507843255996704, + "learning_rate": 6.823514019545052e-06, + "loss": 0.9987, + "step": 9893 + }, + { + "epoch": 0.7995313036627003, + "grad_norm": 2.463536024093628, + "learning_rate": 6.822904711647118e-06, + "loss": 1.1466, + "step": 9894 + }, + { + "epoch": 0.7996121133760278, + "grad_norm": 2.800693988800049, + "learning_rate": 6.822295372527216e-06, + "loss": 0.8854, + "step": 9895 + }, + { + "epoch": 0.7996929230893554, + "grad_norm": 2.625532865524292, + "learning_rate": 6.82168600219578e-06, + "loss": 0.8947, + "step": 9896 + }, + { + "epoch": 0.7997737328026829, + "grad_norm": 2.919964075088501, + "learning_rate": 6.82107660066325e-06, + "loss": 0.9991, + "step": 9897 + }, + { + "epoch": 0.7998545425160104, + "grad_norm": 2.7522058486938477, + "learning_rate": 6.82046716794006e-06, + "loss": 0.9677, + "step": 9898 + }, + { + "epoch": 0.799935352229338, + "grad_norm": 2.916342258453369, + "learning_rate": 6.819857704036652e-06, + "loss": 0.9301, + "step": 9899 + }, + { + "epoch": 0.8000161619426656, + "grad_norm": 2.8729188442230225, + "learning_rate": 6.819248208963461e-06, + "loss": 0.9065, + "step": 9900 + }, + { + "epoch": 0.800096971655993, + "grad_norm": 2.786853551864624, + "learning_rate": 6.818638682730929e-06, + "loss": 0.9544, + "step": 9901 + }, + { + "epoch": 0.8001777813693206, + "grad_norm": 2.7986268997192383, + "learning_rate": 6.818029125349494e-06, + "loss": 0.9148, + "step": 9902 + }, + { + "epoch": 0.8002585910826482, + "grad_norm": 2.4643256664276123, + "learning_rate": 6.817419536829597e-06, + "loss": 1.0353, + "step": 9903 + }, + { + "epoch": 0.8003394007959757, + "grad_norm": 2.592578411102295, + "learning_rate": 6.816809917181677e-06, + "loss": 0.9834, + "step": 9904 + }, + { + "epoch": 0.8004202105093032, + "grad_norm": 2.952615261077881, + "learning_rate": 6.8162002664161794e-06, + "loss": 0.9166, + "step": 9905 + }, + { + "epoch": 0.8005010202226308, + "grad_norm": 3.0346031188964844, + "learning_rate": 6.815590584543542e-06, + "loss": 0.9002, + "step": 9906 + }, + { + "epoch": 0.8005818299359583, + "grad_norm": 2.2955641746520996, + "learning_rate": 6.81498087157421e-06, + "loss": 0.8635, + "step": 9907 + }, + { + "epoch": 0.8006626396492859, + "grad_norm": 2.3769054412841797, + "learning_rate": 6.814371127518624e-06, + "loss": 0.9364, + "step": 9908 + }, + { + "epoch": 0.8007434493626134, + "grad_norm": 2.8720016479492188, + "learning_rate": 6.813761352387229e-06, + "loss": 0.9804, + "step": 9909 + }, + { + "epoch": 0.8008242590759409, + "grad_norm": 2.956225872039795, + "learning_rate": 6.8131515461904685e-06, + "loss": 0.9734, + "step": 9910 + }, + { + "epoch": 0.8009050687892685, + "grad_norm": 2.920043468475342, + "learning_rate": 6.812541708938787e-06, + "loss": 1.0127, + "step": 9911 + }, + { + "epoch": 0.8009858785025961, + "grad_norm": 2.2908806800842285, + "learning_rate": 6.811931840642628e-06, + "loss": 1.006, + "step": 9912 + }, + { + "epoch": 0.8010666882159235, + "grad_norm": 2.7652716636657715, + "learning_rate": 6.811321941312441e-06, + "loss": 0.9693, + "step": 9913 + }, + { + "epoch": 0.8011474979292511, + "grad_norm": 2.4583494663238525, + "learning_rate": 6.8107120109586685e-06, + "loss": 0.911, + "step": 9914 + }, + { + "epoch": 0.8012283076425787, + "grad_norm": 2.4979212284088135, + "learning_rate": 6.810102049591759e-06, + "loss": 1.0718, + "step": 9915 + }, + { + "epoch": 0.8013091173559062, + "grad_norm": 2.608715295791626, + "learning_rate": 6.809492057222158e-06, + "loss": 0.9271, + "step": 9916 + }, + { + "epoch": 0.8013899270692337, + "grad_norm": 2.429481267929077, + "learning_rate": 6.808882033860316e-06, + "loss": 0.9787, + "step": 9917 + }, + { + "epoch": 0.8014707367825613, + "grad_norm": 2.6573383808135986, + "learning_rate": 6.808271979516677e-06, + "loss": 1.0015, + "step": 9918 + }, + { + "epoch": 0.8015515464958888, + "grad_norm": 2.55149245262146, + "learning_rate": 6.807661894201695e-06, + "loss": 0.9501, + "step": 9919 + }, + { + "epoch": 0.8016323562092164, + "grad_norm": 2.7874369621276855, + "learning_rate": 6.807051777925812e-06, + "loss": 0.8918, + "step": 9920 + }, + { + "epoch": 0.8017131659225439, + "grad_norm": 3.394378662109375, + "learning_rate": 6.806441630699488e-06, + "loss": 0.9506, + "step": 9921 + }, + { + "epoch": 0.8017939756358714, + "grad_norm": 2.581477642059326, + "learning_rate": 6.805831452533165e-06, + "loss": 0.9971, + "step": 9922 + }, + { + "epoch": 0.801874785349199, + "grad_norm": 2.5167365074157715, + "learning_rate": 6.805221243437297e-06, + "loss": 0.8361, + "step": 9923 + }, + { + "epoch": 0.8019555950625266, + "grad_norm": 3.6850173473358154, + "learning_rate": 6.804611003422333e-06, + "loss": 1.0683, + "step": 9924 + }, + { + "epoch": 0.802036404775854, + "grad_norm": 2.8270466327667236, + "learning_rate": 6.80400073249873e-06, + "loss": 0.9289, + "step": 9925 + }, + { + "epoch": 0.8021172144891816, + "grad_norm": 2.3623886108398438, + "learning_rate": 6.803390430676935e-06, + "loss": 0.9429, + "step": 9926 + }, + { + "epoch": 0.8021980242025092, + "grad_norm": 2.547598361968994, + "learning_rate": 6.802780097967405e-06, + "loss": 0.9129, + "step": 9927 + }, + { + "epoch": 0.8022788339158367, + "grad_norm": 2.7918617725372314, + "learning_rate": 6.802169734380592e-06, + "loss": 0.9014, + "step": 9928 + }, + { + "epoch": 0.8023596436291642, + "grad_norm": 2.6616594791412354, + "learning_rate": 6.801559339926948e-06, + "loss": 0.9323, + "step": 9929 + }, + { + "epoch": 0.8024404533424918, + "grad_norm": 2.622589111328125, + "learning_rate": 6.800948914616932e-06, + "loss": 0.942, + "step": 9930 + }, + { + "epoch": 0.8025212630558193, + "grad_norm": 2.566311836242676, + "learning_rate": 6.8003384584609954e-06, + "loss": 0.9182, + "step": 9931 + }, + { + "epoch": 0.8026020727691469, + "grad_norm": 3.059619188308716, + "learning_rate": 6.7997279714695945e-06, + "loss": 0.9075, + "step": 9932 + }, + { + "epoch": 0.8026828824824744, + "grad_norm": 2.622683048248291, + "learning_rate": 6.799117453653188e-06, + "loss": 0.9596, + "step": 9933 + }, + { + "epoch": 0.8027636921958019, + "grad_norm": 2.312102794647217, + "learning_rate": 6.798506905022229e-06, + "loss": 0.9342, + "step": 9934 + }, + { + "epoch": 0.8028445019091295, + "grad_norm": 2.541771411895752, + "learning_rate": 6.7978963255871775e-06, + "loss": 1.0073, + "step": 9935 + }, + { + "epoch": 0.8029253116224571, + "grad_norm": 2.8717281818389893, + "learning_rate": 6.797285715358491e-06, + "loss": 0.9759, + "step": 9936 + }, + { + "epoch": 0.8030061213357845, + "grad_norm": 2.7421064376831055, + "learning_rate": 6.796675074346625e-06, + "loss": 0.8878, + "step": 9937 + }, + { + "epoch": 0.8030869310491121, + "grad_norm": 2.3162598609924316, + "learning_rate": 6.7960644025620405e-06, + "loss": 1.0536, + "step": 9938 + }, + { + "epoch": 0.8031677407624397, + "grad_norm": 2.7054810523986816, + "learning_rate": 6.795453700015198e-06, + "loss": 1.0363, + "step": 9939 + }, + { + "epoch": 0.8032485504757672, + "grad_norm": 2.5268211364746094, + "learning_rate": 6.794842966716554e-06, + "loss": 0.8561, + "step": 9940 + }, + { + "epoch": 0.8033293601890947, + "grad_norm": 2.5131895542144775, + "learning_rate": 6.7942322026765725e-06, + "loss": 0.8584, + "step": 9941 + }, + { + "epoch": 0.8034101699024223, + "grad_norm": 2.794346332550049, + "learning_rate": 6.793621407905713e-06, + "loss": 0.9088, + "step": 9942 + }, + { + "epoch": 0.8034909796157498, + "grad_norm": 2.537999153137207, + "learning_rate": 6.793010582414437e-06, + "loss": 1.0139, + "step": 9943 + }, + { + "epoch": 0.8035717893290774, + "grad_norm": 2.5075199604034424, + "learning_rate": 6.792399726213205e-06, + "loss": 0.9189, + "step": 9944 + }, + { + "epoch": 0.8036525990424049, + "grad_norm": 2.7178080081939697, + "learning_rate": 6.791788839312481e-06, + "loss": 0.8522, + "step": 9945 + }, + { + "epoch": 0.8037334087557324, + "grad_norm": 2.673673391342163, + "learning_rate": 6.791177921722727e-06, + "loss": 0.9387, + "step": 9946 + }, + { + "epoch": 0.80381421846906, + "grad_norm": 2.8458967208862305, + "learning_rate": 6.790566973454409e-06, + "loss": 0.89, + "step": 9947 + }, + { + "epoch": 0.8038950281823876, + "grad_norm": 2.802276611328125, + "learning_rate": 6.789955994517987e-06, + "loss": 1.0193, + "step": 9948 + }, + { + "epoch": 0.803975837895715, + "grad_norm": 2.669790029525757, + "learning_rate": 6.789344984923931e-06, + "loss": 0.9449, + "step": 9949 + }, + { + "epoch": 0.8040566476090426, + "grad_norm": 2.5872020721435547, + "learning_rate": 6.788733944682702e-06, + "loss": 0.956, + "step": 9950 + }, + { + "epoch": 0.8041374573223702, + "grad_norm": 2.780992269515991, + "learning_rate": 6.788122873804766e-06, + "loss": 0.948, + "step": 9951 + }, + { + "epoch": 0.8042182670356977, + "grad_norm": 2.6253044605255127, + "learning_rate": 6.787511772300589e-06, + "loss": 0.9695, + "step": 9952 + }, + { + "epoch": 0.8042990767490252, + "grad_norm": 3.0298964977264404, + "learning_rate": 6.78690064018064e-06, + "loss": 0.8866, + "step": 9953 + }, + { + "epoch": 0.8043798864623528, + "grad_norm": 2.735344409942627, + "learning_rate": 6.786289477455385e-06, + "loss": 0.8452, + "step": 9954 + }, + { + "epoch": 0.8044606961756803, + "grad_norm": 2.6249313354492188, + "learning_rate": 6.785678284135291e-06, + "loss": 0.8771, + "step": 9955 + }, + { + "epoch": 0.8045415058890079, + "grad_norm": 2.6650261878967285, + "learning_rate": 6.7850670602308275e-06, + "loss": 1.0703, + "step": 9956 + }, + { + "epoch": 0.8046223156023354, + "grad_norm": 2.552743434906006, + "learning_rate": 6.784455805752462e-06, + "loss": 0.9084, + "step": 9957 + }, + { + "epoch": 0.8047031253156629, + "grad_norm": 2.681318521499634, + "learning_rate": 6.783844520710664e-06, + "loss": 1.043, + "step": 9958 + }, + { + "epoch": 0.8047839350289905, + "grad_norm": 2.7370269298553467, + "learning_rate": 6.783233205115904e-06, + "loss": 0.9245, + "step": 9959 + }, + { + "epoch": 0.8048647447423181, + "grad_norm": 2.591618299484253, + "learning_rate": 6.782621858978653e-06, + "loss": 0.7968, + "step": 9960 + }, + { + "epoch": 0.8049455544556455, + "grad_norm": 2.88142728805542, + "learning_rate": 6.78201048230938e-06, + "loss": 0.9588, + "step": 9961 + }, + { + "epoch": 0.8050263641689731, + "grad_norm": 2.8691976070404053, + "learning_rate": 6.7813990751185585e-06, + "loss": 0.9014, + "step": 9962 + }, + { + "epoch": 0.8051071738823007, + "grad_norm": 2.421128988265991, + "learning_rate": 6.780787637416659e-06, + "loss": 0.9896, + "step": 9963 + }, + { + "epoch": 0.8051879835956282, + "grad_norm": 2.5189719200134277, + "learning_rate": 6.780176169214155e-06, + "loss": 0.9405, + "step": 9964 + }, + { + "epoch": 0.8052687933089557, + "grad_norm": 2.573498249053955, + "learning_rate": 6.7795646705215176e-06, + "loss": 0.8544, + "step": 9965 + }, + { + "epoch": 0.8053496030222833, + "grad_norm": 2.145843267440796, + "learning_rate": 6.778953141349222e-06, + "loss": 0.9849, + "step": 9966 + }, + { + "epoch": 0.8054304127356108, + "grad_norm": 2.535832166671753, + "learning_rate": 6.778341581707742e-06, + "loss": 0.9573, + "step": 9967 + }, + { + "epoch": 0.8055112224489384, + "grad_norm": 3.017761468887329, + "learning_rate": 6.777729991607551e-06, + "loss": 0.9537, + "step": 9968 + }, + { + "epoch": 0.8055920321622659, + "grad_norm": 2.8418562412261963, + "learning_rate": 6.777118371059126e-06, + "loss": 0.9567, + "step": 9969 + }, + { + "epoch": 0.8056728418755934, + "grad_norm": 2.78224778175354, + "learning_rate": 6.7765067200729415e-06, + "loss": 0.9891, + "step": 9970 + }, + { + "epoch": 0.805753651588921, + "grad_norm": 2.618445634841919, + "learning_rate": 6.7758950386594725e-06, + "loss": 0.8203, + "step": 9971 + }, + { + "epoch": 0.8058344613022486, + "grad_norm": 2.5080177783966064, + "learning_rate": 6.775283326829199e-06, + "loss": 0.7917, + "step": 9972 + }, + { + "epoch": 0.805915271015576, + "grad_norm": 2.6570470333099365, + "learning_rate": 6.7746715845925935e-06, + "loss": 0.9759, + "step": 9973 + }, + { + "epoch": 0.8059960807289036, + "grad_norm": 2.826026201248169, + "learning_rate": 6.7740598119601365e-06, + "loss": 0.8451, + "step": 9974 + }, + { + "epoch": 0.8060768904422312, + "grad_norm": 2.806075096130371, + "learning_rate": 6.773448008942307e-06, + "loss": 0.9144, + "step": 9975 + }, + { + "epoch": 0.8061577001555587, + "grad_norm": 2.6879494190216064, + "learning_rate": 6.772836175549582e-06, + "loss": 0.8722, + "step": 9976 + }, + { + "epoch": 0.8062385098688862, + "grad_norm": 2.495375633239746, + "learning_rate": 6.77222431179244e-06, + "loss": 0.9778, + "step": 9977 + }, + { + "epoch": 0.8063193195822138, + "grad_norm": 2.7937846183776855, + "learning_rate": 6.7716124176813645e-06, + "loss": 0.8248, + "step": 9978 + }, + { + "epoch": 0.8064001292955413, + "grad_norm": 2.7677974700927734, + "learning_rate": 6.771000493226831e-06, + "loss": 0.9429, + "step": 9979 + }, + { + "epoch": 0.8064809390088689, + "grad_norm": 2.934624195098877, + "learning_rate": 6.770388538439324e-06, + "loss": 0.8823, + "step": 9980 + }, + { + "epoch": 0.8065617487221964, + "grad_norm": 3.1143798828125, + "learning_rate": 6.769776553329322e-06, + "loss": 0.9946, + "step": 9981 + }, + { + "epoch": 0.8066425584355239, + "grad_norm": 2.7064270973205566, + "learning_rate": 6.76916453790731e-06, + "loss": 0.9639, + "step": 9982 + }, + { + "epoch": 0.8067233681488515, + "grad_norm": 2.8486764430999756, + "learning_rate": 6.768552492183768e-06, + "loss": 0.9309, + "step": 9983 + }, + { + "epoch": 0.8068041778621791, + "grad_norm": 2.335663080215454, + "learning_rate": 6.767940416169179e-06, + "loss": 0.9013, + "step": 9984 + }, + { + "epoch": 0.8068849875755065, + "grad_norm": 2.4963512420654297, + "learning_rate": 6.767328309874026e-06, + "loss": 1.0221, + "step": 9985 + }, + { + "epoch": 0.8069657972888341, + "grad_norm": 2.660006523132324, + "learning_rate": 6.766716173308795e-06, + "loss": 0.9881, + "step": 9986 + }, + { + "epoch": 0.8070466070021617, + "grad_norm": 2.5680716037750244, + "learning_rate": 6.766104006483968e-06, + "loss": 0.9788, + "step": 9987 + }, + { + "epoch": 0.8071274167154892, + "grad_norm": 3.0067737102508545, + "learning_rate": 6.765491809410032e-06, + "loss": 1.0068, + "step": 9988 + }, + { + "epoch": 0.8072082264288167, + "grad_norm": 2.735895872116089, + "learning_rate": 6.764879582097472e-06, + "loss": 1.0031, + "step": 9989 + }, + { + "epoch": 0.8072890361421443, + "grad_norm": 2.764328718185425, + "learning_rate": 6.764267324556773e-06, + "loss": 0.9728, + "step": 9990 + }, + { + "epoch": 0.8073698458554718, + "grad_norm": 3.4726850986480713, + "learning_rate": 6.763655036798421e-06, + "loss": 1.14, + "step": 9991 + }, + { + "epoch": 0.8074506555687994, + "grad_norm": 3.1520864963531494, + "learning_rate": 6.763042718832907e-06, + "loss": 0.8831, + "step": 9992 + }, + { + "epoch": 0.8075314652821269, + "grad_norm": 2.4253652095794678, + "learning_rate": 6.762430370670712e-06, + "loss": 0.9169, + "step": 9993 + }, + { + "epoch": 0.8076122749954544, + "grad_norm": 2.8476674556732178, + "learning_rate": 6.761817992322329e-06, + "loss": 0.9232, + "step": 9994 + }, + { + "epoch": 0.807693084708782, + "grad_norm": 2.554034471511841, + "learning_rate": 6.761205583798246e-06, + "loss": 0.8709, + "step": 9995 + }, + { + "epoch": 0.8077738944221096, + "grad_norm": 2.48921537399292, + "learning_rate": 6.76059314510895e-06, + "loss": 0.9303, + "step": 9996 + }, + { + "epoch": 0.807854704135437, + "grad_norm": 2.599687099456787, + "learning_rate": 6.759980676264932e-06, + "loss": 0.9226, + "step": 9997 + }, + { + "epoch": 0.8079355138487646, + "grad_norm": 2.7335262298583984, + "learning_rate": 6.759368177276684e-06, + "loss": 0.9733, + "step": 9998 + }, + { + "epoch": 0.8080163235620922, + "grad_norm": 2.438054323196411, + "learning_rate": 6.758755648154692e-06, + "loss": 0.7867, + "step": 9999 + }, + { + "epoch": 0.8080971332754197, + "grad_norm": 2.3297009468078613, + "learning_rate": 6.758143088909453e-06, + "loss": 0.9875, + "step": 10000 + }, + { + "epoch": 0.8080971332754197, + "eval_loss": 0.7729527354240417, + "eval_runtime": 815.2614, + "eval_samples_per_second": 102.257, + "eval_steps_per_second": 12.782, + "step": 10000 + }, + { + "epoch": 0.8081779429887472, + "grad_norm": 2.6086413860321045, + "learning_rate": 6.757530499551451e-06, + "loss": 1.0694, + "step": 10001 + }, + { + "epoch": 0.8082587527020748, + "grad_norm": 2.852970600128174, + "learning_rate": 6.756917880091186e-06, + "loss": 0.909, + "step": 10002 + }, + { + "epoch": 0.8083395624154023, + "grad_norm": 2.8168246746063232, + "learning_rate": 6.756305230539146e-06, + "loss": 1.0034, + "step": 10003 + }, + { + "epoch": 0.8084203721287299, + "grad_norm": 2.691760778427124, + "learning_rate": 6.755692550905826e-06, + "loss": 0.847, + "step": 10004 + }, + { + "epoch": 0.8085011818420574, + "grad_norm": 2.7063395977020264, + "learning_rate": 6.755079841201719e-06, + "loss": 1.0821, + "step": 10005 + }, + { + "epoch": 0.8085819915553849, + "grad_norm": 2.810978412628174, + "learning_rate": 6.754467101437321e-06, + "loss": 0.9827, + "step": 10006 + }, + { + "epoch": 0.8086628012687125, + "grad_norm": 2.860426664352417, + "learning_rate": 6.753854331623122e-06, + "loss": 0.9415, + "step": 10007 + }, + { + "epoch": 0.8087436109820401, + "grad_norm": 2.674544334411621, + "learning_rate": 6.7532415317696234e-06, + "loss": 1.0137, + "step": 10008 + }, + { + "epoch": 0.8088244206953675, + "grad_norm": 2.3481979370117188, + "learning_rate": 6.752628701887317e-06, + "loss": 0.9346, + "step": 10009 + }, + { + "epoch": 0.8089052304086951, + "grad_norm": 3.004422903060913, + "learning_rate": 6.7520158419867e-06, + "loss": 0.8694, + "step": 10010 + }, + { + "epoch": 0.8089860401220227, + "grad_norm": 3.0733017921447754, + "learning_rate": 6.75140295207827e-06, + "loss": 0.9039, + "step": 10011 + }, + { + "epoch": 0.8090668498353502, + "grad_norm": 2.641711473464966, + "learning_rate": 6.750790032172523e-06, + "loss": 1.105, + "step": 10012 + }, + { + "epoch": 0.8091476595486777, + "grad_norm": 3.112874984741211, + "learning_rate": 6.750177082279959e-06, + "loss": 0.863, + "step": 10013 + }, + { + "epoch": 0.8092284692620053, + "grad_norm": 2.5871925354003906, + "learning_rate": 6.749564102411074e-06, + "loss": 1.0219, + "step": 10014 + }, + { + "epoch": 0.8093092789753328, + "grad_norm": 2.4820566177368164, + "learning_rate": 6.748951092576367e-06, + "loss": 0.9423, + "step": 10015 + }, + { + "epoch": 0.8093900886886604, + "grad_norm": 2.7655489444732666, + "learning_rate": 6.7483380527863394e-06, + "loss": 1.0334, + "step": 10016 + }, + { + "epoch": 0.8094708984019879, + "grad_norm": 2.9179372787475586, + "learning_rate": 6.74772498305149e-06, + "loss": 0.8532, + "step": 10017 + }, + { + "epoch": 0.8095517081153155, + "grad_norm": 2.9236321449279785, + "learning_rate": 6.747111883382318e-06, + "loss": 0.8832, + "step": 10018 + }, + { + "epoch": 0.809632517828643, + "grad_norm": 2.989229202270508, + "learning_rate": 6.746498753789327e-06, + "loss": 0.9418, + "step": 10019 + }, + { + "epoch": 0.8097133275419706, + "grad_norm": 2.2503018379211426, + "learning_rate": 6.745885594283016e-06, + "loss": 1.0469, + "step": 10020 + }, + { + "epoch": 0.8097941372552981, + "grad_norm": 2.6143321990966797, + "learning_rate": 6.745272404873887e-06, + "loss": 0.9602, + "step": 10021 + }, + { + "epoch": 0.8098749469686256, + "grad_norm": 2.563796043395996, + "learning_rate": 6.744659185572444e-06, + "loss": 0.9779, + "step": 10022 + }, + { + "epoch": 0.8099557566819532, + "grad_norm": 2.7808890342712402, + "learning_rate": 6.744045936389191e-06, + "loss": 0.9327, + "step": 10023 + }, + { + "epoch": 0.8100365663952808, + "grad_norm": 3.1725735664367676, + "learning_rate": 6.743432657334628e-06, + "loss": 1.0153, + "step": 10024 + }, + { + "epoch": 0.8101173761086082, + "grad_norm": 2.564241409301758, + "learning_rate": 6.7428193484192605e-06, + "loss": 0.8836, + "step": 10025 + }, + { + "epoch": 0.8101981858219358, + "grad_norm": 2.9634478092193604, + "learning_rate": 6.742206009653593e-06, + "loss": 0.9728, + "step": 10026 + }, + { + "epoch": 0.8102789955352634, + "grad_norm": 2.711183547973633, + "learning_rate": 6.741592641048132e-06, + "loss": 1.021, + "step": 10027 + }, + { + "epoch": 0.8103598052485909, + "grad_norm": 2.8670310974121094, + "learning_rate": 6.74097924261338e-06, + "loss": 0.9941, + "step": 10028 + }, + { + "epoch": 0.8104406149619184, + "grad_norm": 2.6566109657287598, + "learning_rate": 6.7403658143598464e-06, + "loss": 0.9375, + "step": 10029 + }, + { + "epoch": 0.810521424675246, + "grad_norm": 3.2555360794067383, + "learning_rate": 6.739752356298035e-06, + "loss": 0.9621, + "step": 10030 + }, + { + "epoch": 0.8106022343885735, + "grad_norm": 2.604140281677246, + "learning_rate": 6.739138868438456e-06, + "loss": 0.9054, + "step": 10031 + }, + { + "epoch": 0.8106830441019011, + "grad_norm": 2.6726272106170654, + "learning_rate": 6.738525350791614e-06, + "loss": 0.9611, + "step": 10032 + }, + { + "epoch": 0.8107638538152286, + "grad_norm": 3.002016544342041, + "learning_rate": 6.737911803368017e-06, + "loss": 0.9564, + "step": 10033 + }, + { + "epoch": 0.8108446635285561, + "grad_norm": 2.8780055046081543, + "learning_rate": 6.737298226178175e-06, + "loss": 0.989, + "step": 10034 + }, + { + "epoch": 0.8109254732418837, + "grad_norm": 2.6711130142211914, + "learning_rate": 6.736684619232597e-06, + "loss": 0.9191, + "step": 10035 + }, + { + "epoch": 0.8110062829552113, + "grad_norm": 2.675698757171631, + "learning_rate": 6.7360709825417925e-06, + "loss": 0.9342, + "step": 10036 + }, + { + "epoch": 0.8110870926685387, + "grad_norm": 2.771097421646118, + "learning_rate": 6.735457316116273e-06, + "loss": 0.9834, + "step": 10037 + }, + { + "epoch": 0.8111679023818663, + "grad_norm": 2.6776483058929443, + "learning_rate": 6.7348436199665445e-06, + "loss": 0.9516, + "step": 10038 + }, + { + "epoch": 0.8112487120951939, + "grad_norm": 2.557114601135254, + "learning_rate": 6.734229894103124e-06, + "loss": 0.77, + "step": 10039 + }, + { + "epoch": 0.8113295218085214, + "grad_norm": 2.6638333797454834, + "learning_rate": 6.733616138536519e-06, + "loss": 0.8616, + "step": 10040 + }, + { + "epoch": 0.8114103315218489, + "grad_norm": 2.5837926864624023, + "learning_rate": 6.733002353277243e-06, + "loss": 0.9252, + "step": 10041 + }, + { + "epoch": 0.8114911412351765, + "grad_norm": 2.385305643081665, + "learning_rate": 6.73238853833581e-06, + "loss": 0.8884, + "step": 10042 + }, + { + "epoch": 0.811571950948504, + "grad_norm": 2.212360382080078, + "learning_rate": 6.73177469372273e-06, + "loss": 0.8848, + "step": 10043 + }, + { + "epoch": 0.8116527606618316, + "grad_norm": 2.545518398284912, + "learning_rate": 6.73116081944852e-06, + "loss": 0.8787, + "step": 10044 + }, + { + "epoch": 0.8117335703751591, + "grad_norm": 2.412827730178833, + "learning_rate": 6.730546915523693e-06, + "loss": 0.9444, + "step": 10045 + }, + { + "epoch": 0.8118143800884866, + "grad_norm": 3.18371844291687, + "learning_rate": 6.7299329819587615e-06, + "loss": 0.9335, + "step": 10046 + }, + { + "epoch": 0.8118951898018142, + "grad_norm": 2.795923948287964, + "learning_rate": 6.729319018764244e-06, + "loss": 1.0072, + "step": 10047 + }, + { + "epoch": 0.8119759995151418, + "grad_norm": 2.6364145278930664, + "learning_rate": 6.728705025950656e-06, + "loss": 0.9055, + "step": 10048 + }, + { + "epoch": 0.8120568092284692, + "grad_norm": 3.0788910388946533, + "learning_rate": 6.728091003528511e-06, + "loss": 0.9018, + "step": 10049 + }, + { + "epoch": 0.8121376189417968, + "grad_norm": 2.7288432121276855, + "learning_rate": 6.727476951508327e-06, + "loss": 1.0376, + "step": 10050 + }, + { + "epoch": 0.8122184286551244, + "grad_norm": 2.7107584476470947, + "learning_rate": 6.726862869900624e-06, + "loss": 0.9126, + "step": 10051 + }, + { + "epoch": 0.8122992383684519, + "grad_norm": 2.771477222442627, + "learning_rate": 6.726248758715914e-06, + "loss": 1.0442, + "step": 10052 + }, + { + "epoch": 0.8123800480817794, + "grad_norm": 2.47438383102417, + "learning_rate": 6.725634617964721e-06, + "loss": 0.7962, + "step": 10053 + }, + { + "epoch": 0.812460857795107, + "grad_norm": 2.296757698059082, + "learning_rate": 6.7250204476575594e-06, + "loss": 0.8564, + "step": 10054 + }, + { + "epoch": 0.8125416675084345, + "grad_norm": 2.618964433670044, + "learning_rate": 6.724406247804952e-06, + "loss": 0.9034, + "step": 10055 + }, + { + "epoch": 0.8126224772217621, + "grad_norm": 2.6588761806488037, + "learning_rate": 6.7237920184174165e-06, + "loss": 0.8751, + "step": 10056 + }, + { + "epoch": 0.8127032869350896, + "grad_norm": 2.5719103813171387, + "learning_rate": 6.723177759505473e-06, + "loss": 1.0358, + "step": 10057 + }, + { + "epoch": 0.8127840966484171, + "grad_norm": 2.8363654613494873, + "learning_rate": 6.722563471079643e-06, + "loss": 1.1362, + "step": 10058 + }, + { + "epoch": 0.8128649063617447, + "grad_norm": 2.587702989578247, + "learning_rate": 6.721949153150449e-06, + "loss": 0.997, + "step": 10059 + }, + { + "epoch": 0.8129457160750723, + "grad_norm": 2.430743932723999, + "learning_rate": 6.721334805728409e-06, + "loss": 1.0193, + "step": 10060 + }, + { + "epoch": 0.8130265257883997, + "grad_norm": 2.8744118213653564, + "learning_rate": 6.72072042882405e-06, + "loss": 0.969, + "step": 10061 + }, + { + "epoch": 0.8131073355017273, + "grad_norm": 2.4647622108459473, + "learning_rate": 6.720106022447891e-06, + "loss": 0.8981, + "step": 10062 + }, + { + "epoch": 0.8131881452150549, + "grad_norm": 2.7981672286987305, + "learning_rate": 6.719491586610457e-06, + "loss": 0.9021, + "step": 10063 + }, + { + "epoch": 0.8132689549283824, + "grad_norm": 2.9667656421661377, + "learning_rate": 6.718877121322271e-06, + "loss": 0.9479, + "step": 10064 + }, + { + "epoch": 0.8133497646417099, + "grad_norm": 2.5634799003601074, + "learning_rate": 6.718262626593861e-06, + "loss": 1.0027, + "step": 10065 + }, + { + "epoch": 0.8134305743550375, + "grad_norm": 3.1184725761413574, + "learning_rate": 6.717648102435745e-06, + "loss": 0.8169, + "step": 10066 + }, + { + "epoch": 0.813511384068365, + "grad_norm": 2.696802854537964, + "learning_rate": 6.717033548858455e-06, + "loss": 0.9675, + "step": 10067 + }, + { + "epoch": 0.8135921937816926, + "grad_norm": 2.2710602283477783, + "learning_rate": 6.716418965872513e-06, + "loss": 1.0478, + "step": 10068 + }, + { + "epoch": 0.8136730034950201, + "grad_norm": 3.26822829246521, + "learning_rate": 6.715804353488445e-06, + "loss": 0.878, + "step": 10069 + }, + { + "epoch": 0.8137538132083476, + "grad_norm": 2.9366466999053955, + "learning_rate": 6.71518971171678e-06, + "loss": 0.865, + "step": 10070 + }, + { + "epoch": 0.8138346229216752, + "grad_norm": 2.584158420562744, + "learning_rate": 6.714575040568044e-06, + "loss": 0.9311, + "step": 10071 + }, + { + "epoch": 0.8139154326350028, + "grad_norm": 2.484003782272339, + "learning_rate": 6.713960340052765e-06, + "loss": 0.9868, + "step": 10072 + }, + { + "epoch": 0.8139962423483302, + "grad_norm": 2.606750249862671, + "learning_rate": 6.713345610181474e-06, + "loss": 0.8955, + "step": 10073 + }, + { + "epoch": 0.8140770520616578, + "grad_norm": 2.787959575653076, + "learning_rate": 6.7127308509646935e-06, + "loss": 0.9094, + "step": 10074 + }, + { + "epoch": 0.8141578617749854, + "grad_norm": 2.514554023742676, + "learning_rate": 6.71211606241296e-06, + "loss": 0.9302, + "step": 10075 + }, + { + "epoch": 0.8142386714883129, + "grad_norm": 2.673313617706299, + "learning_rate": 6.7115012445367985e-06, + "loss": 1.0401, + "step": 10076 + }, + { + "epoch": 0.8143194812016404, + "grad_norm": 2.5632903575897217, + "learning_rate": 6.710886397346741e-06, + "loss": 0.9086, + "step": 10077 + }, + { + "epoch": 0.814400290914968, + "grad_norm": 2.2498035430908203, + "learning_rate": 6.710271520853319e-06, + "loss": 0.9825, + "step": 10078 + }, + { + "epoch": 0.8144811006282955, + "grad_norm": 2.883054733276367, + "learning_rate": 6.709656615067063e-06, + "loss": 0.8546, + "step": 10079 + }, + { + "epoch": 0.8145619103416231, + "grad_norm": 2.5208911895751953, + "learning_rate": 6.709041679998505e-06, + "loss": 0.8928, + "step": 10080 + }, + { + "epoch": 0.8146427200549506, + "grad_norm": 2.398552417755127, + "learning_rate": 6.708426715658177e-06, + "loss": 0.8561, + "step": 10081 + }, + { + "epoch": 0.8147235297682781, + "grad_norm": 2.866697072982788, + "learning_rate": 6.707811722056612e-06, + "loss": 0.9007, + "step": 10082 + }, + { + "epoch": 0.8148043394816057, + "grad_norm": 2.902155876159668, + "learning_rate": 6.707196699204345e-06, + "loss": 0.9017, + "step": 10083 + }, + { + "epoch": 0.8148851491949333, + "grad_norm": 2.83586049079895, + "learning_rate": 6.706581647111907e-06, + "loss": 1.0539, + "step": 10084 + }, + { + "epoch": 0.8149659589082607, + "grad_norm": 2.565206527709961, + "learning_rate": 6.705966565789834e-06, + "loss": 0.7901, + "step": 10085 + }, + { + "epoch": 0.8150467686215883, + "grad_norm": 2.6432831287384033, + "learning_rate": 6.705351455248661e-06, + "loss": 0.9973, + "step": 10086 + }, + { + "epoch": 0.8151275783349159, + "grad_norm": 2.851337194442749, + "learning_rate": 6.704736315498922e-06, + "loss": 0.9275, + "step": 10087 + }, + { + "epoch": 0.8152083880482434, + "grad_norm": 2.192448139190674, + "learning_rate": 6.704121146551154e-06, + "loss": 0.8284, + "step": 10088 + }, + { + "epoch": 0.8152891977615709, + "grad_norm": 2.867723226547241, + "learning_rate": 6.703505948415896e-06, + "loss": 0.9557, + "step": 10089 + }, + { + "epoch": 0.8153700074748985, + "grad_norm": 2.345224142074585, + "learning_rate": 6.7028907211036806e-06, + "loss": 0.9346, + "step": 10090 + }, + { + "epoch": 0.815450817188226, + "grad_norm": 2.7858355045318604, + "learning_rate": 6.702275464625045e-06, + "loss": 0.969, + "step": 10091 + }, + { + "epoch": 0.8155316269015536, + "grad_norm": 3.0980935096740723, + "learning_rate": 6.701660178990531e-06, + "loss": 0.878, + "step": 10092 + }, + { + "epoch": 0.8156124366148811, + "grad_norm": 3.0792758464813232, + "learning_rate": 6.701044864210673e-06, + "loss": 0.8974, + "step": 10093 + }, + { + "epoch": 0.8156932463282086, + "grad_norm": 3.3279123306274414, + "learning_rate": 6.700429520296012e-06, + "loss": 0.9739, + "step": 10094 + }, + { + "epoch": 0.8157740560415362, + "grad_norm": 2.926795721054077, + "learning_rate": 6.699814147257088e-06, + "loss": 0.9833, + "step": 10095 + }, + { + "epoch": 0.8158548657548638, + "grad_norm": 2.509044647216797, + "learning_rate": 6.6991987451044385e-06, + "loss": 1.0915, + "step": 10096 + }, + { + "epoch": 0.8159356754681912, + "grad_norm": 2.4170308113098145, + "learning_rate": 6.6985833138486055e-06, + "loss": 0.9636, + "step": 10097 + }, + { + "epoch": 0.8160164851815188, + "grad_norm": 3.246962308883667, + "learning_rate": 6.697967853500132e-06, + "loss": 0.8535, + "step": 10098 + }, + { + "epoch": 0.8160972948948464, + "grad_norm": 2.6362879276275635, + "learning_rate": 6.697352364069553e-06, + "loss": 0.9736, + "step": 10099 + }, + { + "epoch": 0.8161781046081739, + "grad_norm": 2.961191415786743, + "learning_rate": 6.696736845567417e-06, + "loss": 0.8553, + "step": 10100 + }, + { + "epoch": 0.8162589143215014, + "grad_norm": 2.4501521587371826, + "learning_rate": 6.6961212980042615e-06, + "loss": 0.978, + "step": 10101 + }, + { + "epoch": 0.816339724034829, + "grad_norm": 2.997964859008789, + "learning_rate": 6.695505721390632e-06, + "loss": 0.8973, + "step": 10102 + }, + { + "epoch": 0.8164205337481565, + "grad_norm": 3.016570806503296, + "learning_rate": 6.694890115737072e-06, + "loss": 0.975, + "step": 10103 + }, + { + "epoch": 0.8165013434614841, + "grad_norm": 2.6183083057403564, + "learning_rate": 6.694274481054125e-06, + "loss": 0.948, + "step": 10104 + }, + { + "epoch": 0.8165821531748116, + "grad_norm": 2.6524343490600586, + "learning_rate": 6.6936588173523335e-06, + "loss": 1.0718, + "step": 10105 + }, + { + "epoch": 0.8166629628881391, + "grad_norm": 2.4056267738342285, + "learning_rate": 6.693043124642244e-06, + "loss": 0.9673, + "step": 10106 + }, + { + "epoch": 0.8167437726014667, + "grad_norm": 3.0743188858032227, + "learning_rate": 6.6924274029344024e-06, + "loss": 0.8724, + "step": 10107 + }, + { + "epoch": 0.8168245823147943, + "grad_norm": 2.5183846950531006, + "learning_rate": 6.691811652239352e-06, + "loss": 1.002, + "step": 10108 + }, + { + "epoch": 0.8169053920281217, + "grad_norm": 2.6143288612365723, + "learning_rate": 6.691195872567643e-06, + "loss": 0.9921, + "step": 10109 + }, + { + "epoch": 0.8169862017414493, + "grad_norm": 2.5690085887908936, + "learning_rate": 6.690580063929819e-06, + "loss": 0.8967, + "step": 10110 + }, + { + "epoch": 0.8170670114547769, + "grad_norm": 3.043970823287964, + "learning_rate": 6.6899642263364296e-06, + "loss": 0.9835, + "step": 10111 + }, + { + "epoch": 0.8171478211681044, + "grad_norm": 2.7340104579925537, + "learning_rate": 6.6893483597980205e-06, + "loss": 0.9379, + "step": 10112 + }, + { + "epoch": 0.8172286308814319, + "grad_norm": 2.3324716091156006, + "learning_rate": 6.68873246432514e-06, + "loss": 0.9084, + "step": 10113 + }, + { + "epoch": 0.8173094405947595, + "grad_norm": 2.45967173576355, + "learning_rate": 6.68811653992834e-06, + "loss": 0.9476, + "step": 10114 + }, + { + "epoch": 0.817390250308087, + "grad_norm": 3.017951011657715, + "learning_rate": 6.6875005866181665e-06, + "loss": 0.8792, + "step": 10115 + }, + { + "epoch": 0.8174710600214146, + "grad_norm": 2.4142134189605713, + "learning_rate": 6.68688460440517e-06, + "loss": 1.0397, + "step": 10116 + }, + { + "epoch": 0.8175518697347421, + "grad_norm": 2.2612359523773193, + "learning_rate": 6.686268593299902e-06, + "loss": 0.916, + "step": 10117 + }, + { + "epoch": 0.8176326794480696, + "grad_norm": 2.9187474250793457, + "learning_rate": 6.685652553312912e-06, + "loss": 0.9164, + "step": 10118 + }, + { + "epoch": 0.8177134891613972, + "grad_norm": 2.5997235774993896, + "learning_rate": 6.685036484454751e-06, + "loss": 0.8463, + "step": 10119 + }, + { + "epoch": 0.8177942988747248, + "grad_norm": 2.774573802947998, + "learning_rate": 6.684420386735973e-06, + "loss": 1.1914, + "step": 10120 + }, + { + "epoch": 0.8178751085880522, + "grad_norm": 2.281858444213867, + "learning_rate": 6.683804260167128e-06, + "loss": 1.1155, + "step": 10121 + }, + { + "epoch": 0.8179559183013798, + "grad_norm": 2.6453025341033936, + "learning_rate": 6.683188104758771e-06, + "loss": 0.8863, + "step": 10122 + }, + { + "epoch": 0.8180367280147074, + "grad_norm": 2.6469886302948, + "learning_rate": 6.682571920521452e-06, + "loss": 1.0179, + "step": 10123 + }, + { + "epoch": 0.8181175377280349, + "grad_norm": 2.680061101913452, + "learning_rate": 6.6819557074657285e-06, + "loss": 0.8888, + "step": 10124 + }, + { + "epoch": 0.8181983474413624, + "grad_norm": 2.871833324432373, + "learning_rate": 6.681339465602152e-06, + "loss": 0.854, + "step": 10125 + }, + { + "epoch": 0.81827915715469, + "grad_norm": 2.630047082901001, + "learning_rate": 6.6807231949412775e-06, + "loss": 0.898, + "step": 10126 + }, + { + "epoch": 0.8183599668680175, + "grad_norm": 2.623267412185669, + "learning_rate": 6.680106895493661e-06, + "loss": 0.7925, + "step": 10127 + }, + { + "epoch": 0.8184407765813451, + "grad_norm": 2.749006509780884, + "learning_rate": 6.67949056726986e-06, + "loss": 1.0612, + "step": 10128 + }, + { + "epoch": 0.8185215862946726, + "grad_norm": 2.7983627319335938, + "learning_rate": 6.678874210280426e-06, + "loss": 1.1179, + "step": 10129 + }, + { + "epoch": 0.8186023960080001, + "grad_norm": 2.661078929901123, + "learning_rate": 6.67825782453592e-06, + "loss": 0.8077, + "step": 10130 + }, + { + "epoch": 0.8186832057213277, + "grad_norm": 2.6801116466522217, + "learning_rate": 6.677641410046896e-06, + "loss": 0.8544, + "step": 10131 + }, + { + "epoch": 0.8187640154346553, + "grad_norm": 2.988719940185547, + "learning_rate": 6.6770249668239165e-06, + "loss": 0.9211, + "step": 10132 + }, + { + "epoch": 0.8188448251479827, + "grad_norm": 2.723395824432373, + "learning_rate": 6.676408494877534e-06, + "loss": 0.8983, + "step": 10133 + }, + { + "epoch": 0.8189256348613103, + "grad_norm": 2.9994168281555176, + "learning_rate": 6.675791994218311e-06, + "loss": 0.9042, + "step": 10134 + }, + { + "epoch": 0.8190064445746379, + "grad_norm": 2.401463031768799, + "learning_rate": 6.675175464856806e-06, + "loss": 0.9688, + "step": 10135 + }, + { + "epoch": 0.8190872542879654, + "grad_norm": 2.3743197917938232, + "learning_rate": 6.674558906803576e-06, + "loss": 0.8271, + "step": 10136 + }, + { + "epoch": 0.8191680640012929, + "grad_norm": 2.615125894546509, + "learning_rate": 6.673942320069185e-06, + "loss": 0.9315, + "step": 10137 + }, + { + "epoch": 0.8192488737146205, + "grad_norm": 2.834791660308838, + "learning_rate": 6.673325704664191e-06, + "loss": 1.0527, + "step": 10138 + }, + { + "epoch": 0.819329683427948, + "grad_norm": 2.7235167026519775, + "learning_rate": 6.672709060599156e-06, + "loss": 0.8809, + "step": 10139 + }, + { + "epoch": 0.8194104931412756, + "grad_norm": 2.63031005859375, + "learning_rate": 6.672092387884643e-06, + "loss": 0.9727, + "step": 10140 + }, + { + "epoch": 0.8194913028546031, + "grad_norm": 2.686319589614868, + "learning_rate": 6.671475686531211e-06, + "loss": 0.9677, + "step": 10141 + }, + { + "epoch": 0.8195721125679306, + "grad_norm": 3.041576623916626, + "learning_rate": 6.670858956549427e-06, + "loss": 0.9037, + "step": 10142 + }, + { + "epoch": 0.8196529222812582, + "grad_norm": 3.731450319290161, + "learning_rate": 6.670242197949849e-06, + "loss": 1.0614, + "step": 10143 + }, + { + "epoch": 0.8197337319945858, + "grad_norm": 2.628826856613159, + "learning_rate": 6.669625410743044e-06, + "loss": 0.9932, + "step": 10144 + }, + { + "epoch": 0.8198145417079132, + "grad_norm": 2.3889551162719727, + "learning_rate": 6.669008594939574e-06, + "loss": 0.9255, + "step": 10145 + }, + { + "epoch": 0.8198953514212408, + "grad_norm": 2.936560869216919, + "learning_rate": 6.668391750550006e-06, + "loss": 0.8694, + "step": 10146 + }, + { + "epoch": 0.8199761611345684, + "grad_norm": 2.8249928951263428, + "learning_rate": 6.6677748775849035e-06, + "loss": 0.9415, + "step": 10147 + }, + { + "epoch": 0.820056970847896, + "grad_norm": 2.964205265045166, + "learning_rate": 6.667157976054835e-06, + "loss": 0.9028, + "step": 10148 + }, + { + "epoch": 0.8201377805612234, + "grad_norm": 2.5520401000976562, + "learning_rate": 6.66654104597036e-06, + "loss": 0.8618, + "step": 10149 + }, + { + "epoch": 0.820218590274551, + "grad_norm": 3.233186721801758, + "learning_rate": 6.66592408734205e-06, + "loss": 0.8467, + "step": 10150 + }, + { + "epoch": 0.8202993999878786, + "grad_norm": 2.4776675701141357, + "learning_rate": 6.665307100180472e-06, + "loss": 0.9657, + "step": 10151 + }, + { + "epoch": 0.8203802097012061, + "grad_norm": 2.51839280128479, + "learning_rate": 6.66469008449619e-06, + "loss": 0.9224, + "step": 10152 + }, + { + "epoch": 0.8204610194145336, + "grad_norm": 2.646523952484131, + "learning_rate": 6.664073040299777e-06, + "loss": 0.9803, + "step": 10153 + }, + { + "epoch": 0.8205418291278612, + "grad_norm": 2.538114547729492, + "learning_rate": 6.663455967601797e-06, + "loss": 0.9243, + "step": 10154 + }, + { + "epoch": 0.8206226388411887, + "grad_norm": 2.5881614685058594, + "learning_rate": 6.662838866412822e-06, + "loss": 1.0662, + "step": 10155 + }, + { + "epoch": 0.8207034485545163, + "grad_norm": 2.826141595840454, + "learning_rate": 6.662221736743422e-06, + "loss": 0.925, + "step": 10156 + }, + { + "epoch": 0.8207842582678438, + "grad_norm": 2.7673892974853516, + "learning_rate": 6.6616045786041625e-06, + "loss": 0.9167, + "step": 10157 + }, + { + "epoch": 0.8208650679811713, + "grad_norm": 2.562490701675415, + "learning_rate": 6.660987392005618e-06, + "loss": 0.8849, + "step": 10158 + }, + { + "epoch": 0.8209458776944989, + "grad_norm": 3.130794048309326, + "learning_rate": 6.660370176958358e-06, + "loss": 0.9633, + "step": 10159 + }, + { + "epoch": 0.8210266874078265, + "grad_norm": 2.4005680084228516, + "learning_rate": 6.659752933472954e-06, + "loss": 0.9505, + "step": 10160 + }, + { + "epoch": 0.8211074971211539, + "grad_norm": 2.835517644882202, + "learning_rate": 6.659135661559977e-06, + "loss": 0.9847, + "step": 10161 + }, + { + "epoch": 0.8211883068344815, + "grad_norm": 2.4224259853363037, + "learning_rate": 6.658518361230002e-06, + "loss": 0.8529, + "step": 10162 + }, + { + "epoch": 0.8212691165478091, + "grad_norm": 3.1083216667175293, + "learning_rate": 6.6579010324936e-06, + "loss": 0.8878, + "step": 10163 + }, + { + "epoch": 0.8213499262611366, + "grad_norm": 3.0418636798858643, + "learning_rate": 6.6572836753613425e-06, + "loss": 0.9649, + "step": 10164 + }, + { + "epoch": 0.8214307359744641, + "grad_norm": 2.8070740699768066, + "learning_rate": 6.656666289843808e-06, + "loss": 0.9534, + "step": 10165 + }, + { + "epoch": 0.8215115456877917, + "grad_norm": 2.583364248275757, + "learning_rate": 6.656048875951566e-06, + "loss": 0.7991, + "step": 10166 + }, + { + "epoch": 0.8215923554011192, + "grad_norm": 2.580359697341919, + "learning_rate": 6.655431433695195e-06, + "loss": 0.8787, + "step": 10167 + }, + { + "epoch": 0.8216731651144468, + "grad_norm": 2.6281025409698486, + "learning_rate": 6.654813963085268e-06, + "loss": 0.9272, + "step": 10168 + }, + { + "epoch": 0.8217539748277743, + "grad_norm": 2.742807149887085, + "learning_rate": 6.654196464132362e-06, + "loss": 0.9912, + "step": 10169 + }, + { + "epoch": 0.8218347845411018, + "grad_norm": 2.8565542697906494, + "learning_rate": 6.653578936847052e-06, + "loss": 0.8122, + "step": 10170 + }, + { + "epoch": 0.8219155942544294, + "grad_norm": 2.4775171279907227, + "learning_rate": 6.652961381239919e-06, + "loss": 0.9722, + "step": 10171 + }, + { + "epoch": 0.821996403967757, + "grad_norm": 2.6974034309387207, + "learning_rate": 6.652343797321532e-06, + "loss": 0.7939, + "step": 10172 + }, + { + "epoch": 0.8220772136810844, + "grad_norm": 2.4402425289154053, + "learning_rate": 6.651726185102477e-06, + "loss": 1.0766, + "step": 10173 + }, + { + "epoch": 0.822158023394412, + "grad_norm": 3.0757877826690674, + "learning_rate": 6.651108544593327e-06, + "loss": 0.9869, + "step": 10174 + }, + { + "epoch": 0.8222388331077396, + "grad_norm": 2.5132808685302734, + "learning_rate": 6.6504908758046645e-06, + "loss": 1.0028, + "step": 10175 + }, + { + "epoch": 0.8223196428210671, + "grad_norm": 2.898428201675415, + "learning_rate": 6.649873178747065e-06, + "loss": 0.881, + "step": 10176 + }, + { + "epoch": 0.8224004525343946, + "grad_norm": 2.6547205448150635, + "learning_rate": 6.649255453431112e-06, + "loss": 0.8828, + "step": 10177 + }, + { + "epoch": 0.8224812622477222, + "grad_norm": 2.9147605895996094, + "learning_rate": 6.648637699867379e-06, + "loss": 0.9364, + "step": 10178 + }, + { + "epoch": 0.8225620719610497, + "grad_norm": 2.318533182144165, + "learning_rate": 6.648019918066456e-06, + "loss": 0.9524, + "step": 10179 + }, + { + "epoch": 0.8226428816743773, + "grad_norm": 2.617670774459839, + "learning_rate": 6.647402108038916e-06, + "loss": 0.8285, + "step": 10180 + }, + { + "epoch": 0.8227236913877048, + "grad_norm": 2.6547958850860596, + "learning_rate": 6.646784269795347e-06, + "loss": 0.9326, + "step": 10181 + }, + { + "epoch": 0.8228045011010323, + "grad_norm": 2.443333148956299, + "learning_rate": 6.646166403346326e-06, + "loss": 0.9704, + "step": 10182 + }, + { + "epoch": 0.8228853108143599, + "grad_norm": 2.5647428035736084, + "learning_rate": 6.645548508702436e-06, + "loss": 0.8155, + "step": 10183 + }, + { + "epoch": 0.8229661205276875, + "grad_norm": 3.127439498901367, + "learning_rate": 6.644930585874263e-06, + "loss": 0.9464, + "step": 10184 + }, + { + "epoch": 0.8230469302410149, + "grad_norm": 3.0389275550842285, + "learning_rate": 6.6443126348723905e-06, + "loss": 0.8466, + "step": 10185 + }, + { + "epoch": 0.8231277399543425, + "grad_norm": 2.54085636138916, + "learning_rate": 6.643694655707399e-06, + "loss": 0.8709, + "step": 10186 + }, + { + "epoch": 0.8232085496676701, + "grad_norm": 2.587632656097412, + "learning_rate": 6.6430766483898765e-06, + "loss": 0.9208, + "step": 10187 + }, + { + "epoch": 0.8232893593809976, + "grad_norm": 2.5194945335388184, + "learning_rate": 6.642458612930406e-06, + "loss": 0.9098, + "step": 10188 + }, + { + "epoch": 0.8233701690943251, + "grad_norm": 2.77231764793396, + "learning_rate": 6.641840549339573e-06, + "loss": 0.9631, + "step": 10189 + }, + { + "epoch": 0.8234509788076527, + "grad_norm": 2.6095011234283447, + "learning_rate": 6.641222457627964e-06, + "loss": 0.964, + "step": 10190 + }, + { + "epoch": 0.8235317885209802, + "grad_norm": 2.629227638244629, + "learning_rate": 6.6406043378061665e-06, + "loss": 1.0529, + "step": 10191 + }, + { + "epoch": 0.8236125982343078, + "grad_norm": 2.665407180786133, + "learning_rate": 6.6399861898847654e-06, + "loss": 0.891, + "step": 10192 + }, + { + "epoch": 0.8236934079476353, + "grad_norm": 2.96156644821167, + "learning_rate": 6.63936801387435e-06, + "loss": 0.9784, + "step": 10193 + }, + { + "epoch": 0.8237742176609628, + "grad_norm": 2.704313278198242, + "learning_rate": 6.638749809785504e-06, + "loss": 0.9028, + "step": 10194 + }, + { + "epoch": 0.8238550273742904, + "grad_norm": 2.567556858062744, + "learning_rate": 6.6381315776288225e-06, + "loss": 0.8058, + "step": 10195 + }, + { + "epoch": 0.823935837087618, + "grad_norm": 2.753659248352051, + "learning_rate": 6.63751331741489e-06, + "loss": 0.8175, + "step": 10196 + }, + { + "epoch": 0.8240166468009454, + "grad_norm": 2.827409505844116, + "learning_rate": 6.636895029154295e-06, + "loss": 0.994, + "step": 10197 + }, + { + "epoch": 0.824097456514273, + "grad_norm": 2.9777119159698486, + "learning_rate": 6.63627671285763e-06, + "loss": 0.8782, + "step": 10198 + }, + { + "epoch": 0.8241782662276006, + "grad_norm": 2.711076259613037, + "learning_rate": 6.6356583685354845e-06, + "loss": 0.9108, + "step": 10199 + }, + { + "epoch": 0.8242590759409281, + "grad_norm": 2.158092975616455, + "learning_rate": 6.635039996198447e-06, + "loss": 0.9218, + "step": 10200 + }, + { + "epoch": 0.8243398856542556, + "grad_norm": 2.5535473823547363, + "learning_rate": 6.634421595857113e-06, + "loss": 0.9166, + "step": 10201 + }, + { + "epoch": 0.8244206953675832, + "grad_norm": 2.590787887573242, + "learning_rate": 6.63380316752207e-06, + "loss": 0.8717, + "step": 10202 + }, + { + "epoch": 0.8245015050809107, + "grad_norm": 3.331803798675537, + "learning_rate": 6.633184711203912e-06, + "loss": 0.9595, + "step": 10203 + }, + { + "epoch": 0.8245823147942383, + "grad_norm": 2.619899272918701, + "learning_rate": 6.632566226913232e-06, + "loss": 0.9667, + "step": 10204 + }, + { + "epoch": 0.8246631245075658, + "grad_norm": 2.8658032417297363, + "learning_rate": 6.631947714660622e-06, + "loss": 0.8818, + "step": 10205 + }, + { + "epoch": 0.8247439342208933, + "grad_norm": 2.6257688999176025, + "learning_rate": 6.6313291744566775e-06, + "loss": 0.9268, + "step": 10206 + }, + { + "epoch": 0.8248247439342209, + "grad_norm": 2.7508773803710938, + "learning_rate": 6.630710606311992e-06, + "loss": 0.9832, + "step": 10207 + }, + { + "epoch": 0.8249055536475485, + "grad_norm": 2.3672330379486084, + "learning_rate": 6.630092010237158e-06, + "loss": 0.8869, + "step": 10208 + }, + { + "epoch": 0.8249863633608759, + "grad_norm": 2.609768867492676, + "learning_rate": 6.629473386242773e-06, + "loss": 0.9656, + "step": 10209 + }, + { + "epoch": 0.8250671730742035, + "grad_norm": 2.485883951187134, + "learning_rate": 6.628854734339432e-06, + "loss": 1.0547, + "step": 10210 + }, + { + "epoch": 0.8251479827875311, + "grad_norm": 2.6045002937316895, + "learning_rate": 6.62823605453773e-06, + "loss": 0.8622, + "step": 10211 + }, + { + "epoch": 0.8252287925008586, + "grad_norm": 2.529799699783325, + "learning_rate": 6.627617346848265e-06, + "loss": 0.8746, + "step": 10212 + }, + { + "epoch": 0.8253096022141861, + "grad_norm": 2.90537166595459, + "learning_rate": 6.626998611281633e-06, + "loss": 0.9279, + "step": 10213 + }, + { + "epoch": 0.8253904119275137, + "grad_norm": 2.5929040908813477, + "learning_rate": 6.626379847848431e-06, + "loss": 0.8579, + "step": 10214 + }, + { + "epoch": 0.8254712216408412, + "grad_norm": 2.4857747554779053, + "learning_rate": 6.625761056559259e-06, + "loss": 1.0123, + "step": 10215 + }, + { + "epoch": 0.8255520313541688, + "grad_norm": 3.461653470993042, + "learning_rate": 6.625142237424712e-06, + "loss": 0.9556, + "step": 10216 + }, + { + "epoch": 0.8256328410674963, + "grad_norm": 2.7623372077941895, + "learning_rate": 6.624523390455392e-06, + "loss": 0.8098, + "step": 10217 + }, + { + "epoch": 0.8257136507808238, + "grad_norm": 2.3995823860168457, + "learning_rate": 6.623904515661897e-06, + "loss": 0.8664, + "step": 10218 + }, + { + "epoch": 0.8257944604941514, + "grad_norm": 2.5108940601348877, + "learning_rate": 6.623285613054826e-06, + "loss": 0.9601, + "step": 10219 + }, + { + "epoch": 0.825875270207479, + "grad_norm": 2.5409669876098633, + "learning_rate": 6.622666682644782e-06, + "loss": 0.9741, + "step": 10220 + }, + { + "epoch": 0.8259560799208064, + "grad_norm": 3.002204418182373, + "learning_rate": 6.622047724442363e-06, + "loss": 0.9147, + "step": 10221 + }, + { + "epoch": 0.826036889634134, + "grad_norm": 2.9399683475494385, + "learning_rate": 6.621428738458171e-06, + "loss": 0.8211, + "step": 10222 + }, + { + "epoch": 0.8261176993474616, + "grad_norm": 2.6760101318359375, + "learning_rate": 6.620809724702811e-06, + "loss": 0.9768, + "step": 10223 + }, + { + "epoch": 0.8261985090607891, + "grad_norm": 2.6501832008361816, + "learning_rate": 6.62019068318688e-06, + "loss": 0.8796, + "step": 10224 + }, + { + "epoch": 0.8262793187741166, + "grad_norm": 2.7179949283599854, + "learning_rate": 6.6195716139209835e-06, + "loss": 0.9877, + "step": 10225 + }, + { + "epoch": 0.8263601284874442, + "grad_norm": 2.8635172843933105, + "learning_rate": 6.618952516915723e-06, + "loss": 0.8617, + "step": 10226 + }, + { + "epoch": 0.8264409382007717, + "grad_norm": 2.3890533447265625, + "learning_rate": 6.618333392181705e-06, + "loss": 0.8804, + "step": 10227 + }, + { + "epoch": 0.8265217479140993, + "grad_norm": 2.7411887645721436, + "learning_rate": 6.6177142397295315e-06, + "loss": 1.0229, + "step": 10228 + }, + { + "epoch": 0.8266025576274268, + "grad_norm": 3.016167163848877, + "learning_rate": 6.617095059569807e-06, + "loss": 1.0175, + "step": 10229 + }, + { + "epoch": 0.8266833673407543, + "grad_norm": 3.2278783321380615, + "learning_rate": 6.616475851713139e-06, + "loss": 0.8996, + "step": 10230 + }, + { + "epoch": 0.8267641770540819, + "grad_norm": 2.789527177810669, + "learning_rate": 6.615856616170129e-06, + "loss": 0.9442, + "step": 10231 + }, + { + "epoch": 0.8268449867674095, + "grad_norm": 2.6740267276763916, + "learning_rate": 6.6152373529513855e-06, + "loss": 0.9406, + "step": 10232 + }, + { + "epoch": 0.8269257964807369, + "grad_norm": 2.271397590637207, + "learning_rate": 6.614618062067515e-06, + "loss": 0.9842, + "step": 10233 + }, + { + "epoch": 0.8270066061940645, + "grad_norm": 2.798659563064575, + "learning_rate": 6.6139987435291244e-06, + "loss": 0.9679, + "step": 10234 + }, + { + "epoch": 0.8270874159073921, + "grad_norm": 2.850368022918701, + "learning_rate": 6.613379397346821e-06, + "loss": 1.0395, + "step": 10235 + }, + { + "epoch": 0.8271682256207196, + "grad_norm": 2.6009531021118164, + "learning_rate": 6.612760023531212e-06, + "loss": 0.8984, + "step": 10236 + }, + { + "epoch": 0.8272490353340471, + "grad_norm": 3.158233165740967, + "learning_rate": 6.612140622092906e-06, + "loss": 0.8778, + "step": 10237 + }, + { + "epoch": 0.8273298450473747, + "grad_norm": 2.669201374053955, + "learning_rate": 6.611521193042514e-06, + "loss": 0.9173, + "step": 10238 + }, + { + "epoch": 0.8274106547607022, + "grad_norm": 2.4236388206481934, + "learning_rate": 6.6109017363906415e-06, + "loss": 1.0802, + "step": 10239 + }, + { + "epoch": 0.8274914644740298, + "grad_norm": 2.54530668258667, + "learning_rate": 6.610282252147903e-06, + "loss": 0.9442, + "step": 10240 + }, + { + "epoch": 0.8275722741873573, + "grad_norm": 2.8342511653900146, + "learning_rate": 6.6096627403249036e-06, + "loss": 0.9262, + "step": 10241 + }, + { + "epoch": 0.8276530839006848, + "grad_norm": 2.8013253211975098, + "learning_rate": 6.609043200932257e-06, + "loss": 0.8559, + "step": 10242 + }, + { + "epoch": 0.8277338936140124, + "grad_norm": 3.2630040645599365, + "learning_rate": 6.608423633980574e-06, + "loss": 0.8682, + "step": 10243 + }, + { + "epoch": 0.82781470332734, + "grad_norm": 2.5720272064208984, + "learning_rate": 6.607804039480468e-06, + "loss": 0.7945, + "step": 10244 + }, + { + "epoch": 0.8278955130406674, + "grad_norm": 2.8171470165252686, + "learning_rate": 6.607184417442547e-06, + "loss": 0.9817, + "step": 10245 + }, + { + "epoch": 0.827976322753995, + "grad_norm": 2.729215621948242, + "learning_rate": 6.606564767877428e-06, + "loss": 0.9457, + "step": 10246 + }, + { + "epoch": 0.8280571324673226, + "grad_norm": 2.5352158546447754, + "learning_rate": 6.60594509079572e-06, + "loss": 1.0682, + "step": 10247 + }, + { + "epoch": 0.8281379421806501, + "grad_norm": 2.6266767978668213, + "learning_rate": 6.605325386208041e-06, + "loss": 0.9063, + "step": 10248 + }, + { + "epoch": 0.8282187518939776, + "grad_norm": 3.0243353843688965, + "learning_rate": 6.604705654125001e-06, + "loss": 0.9711, + "step": 10249 + }, + { + "epoch": 0.8282995616073052, + "grad_norm": 2.0301334857940674, + "learning_rate": 6.604085894557217e-06, + "loss": 0.9394, + "step": 10250 + }, + { + "epoch": 0.8283803713206327, + "grad_norm": 2.6448678970336914, + "learning_rate": 6.603466107515304e-06, + "loss": 1.0479, + "step": 10251 + }, + { + "epoch": 0.8284611810339603, + "grad_norm": 2.66023325920105, + "learning_rate": 6.602846293009877e-06, + "loss": 0.9664, + "step": 10252 + }, + { + "epoch": 0.8285419907472878, + "grad_norm": 2.9944941997528076, + "learning_rate": 6.60222645105155e-06, + "loss": 0.9087, + "step": 10253 + }, + { + "epoch": 0.8286228004606153, + "grad_norm": 2.3767995834350586, + "learning_rate": 6.601606581650942e-06, + "loss": 0.8815, + "step": 10254 + }, + { + "epoch": 0.8287036101739429, + "grad_norm": 2.890199899673462, + "learning_rate": 6.600986684818669e-06, + "loss": 0.9807, + "step": 10255 + }, + { + "epoch": 0.8287844198872705, + "grad_norm": 2.7976033687591553, + "learning_rate": 6.600366760565349e-06, + "loss": 0.9007, + "step": 10256 + }, + { + "epoch": 0.8288652296005979, + "grad_norm": 2.4705512523651123, + "learning_rate": 6.599746808901598e-06, + "loss": 1.0106, + "step": 10257 + }, + { + "epoch": 0.8289460393139255, + "grad_norm": 2.668663263320923, + "learning_rate": 6.5991268298380365e-06, + "loss": 0.9168, + "step": 10258 + }, + { + "epoch": 0.8290268490272531, + "grad_norm": 3.10408616065979, + "learning_rate": 6.59850682338528e-06, + "loss": 0.8787, + "step": 10259 + }, + { + "epoch": 0.8291076587405806, + "grad_norm": 3.365107536315918, + "learning_rate": 6.597886789553952e-06, + "loss": 1.0074, + "step": 10260 + }, + { + "epoch": 0.8291884684539081, + "grad_norm": 2.9234774112701416, + "learning_rate": 6.597266728354669e-06, + "loss": 1.01, + "step": 10261 + }, + { + "epoch": 0.8292692781672357, + "grad_norm": 2.556942939758301, + "learning_rate": 6.596646639798053e-06, + "loss": 0.8714, + "step": 10262 + }, + { + "epoch": 0.8293500878805632, + "grad_norm": 2.7647294998168945, + "learning_rate": 6.596026523894723e-06, + "loss": 1.0166, + "step": 10263 + }, + { + "epoch": 0.8294308975938908, + "grad_norm": 3.5150368213653564, + "learning_rate": 6.595406380655301e-06, + "loss": 0.9164, + "step": 10264 + }, + { + "epoch": 0.8295117073072183, + "grad_norm": 3.1027796268463135, + "learning_rate": 6.5947862100904094e-06, + "loss": 0.9209, + "step": 10265 + }, + { + "epoch": 0.8295925170205458, + "grad_norm": 3.0092806816101074, + "learning_rate": 6.59416601221067e-06, + "loss": 0.9362, + "step": 10266 + }, + { + "epoch": 0.8296733267338734, + "grad_norm": 2.6388099193573, + "learning_rate": 6.593545787026702e-06, + "loss": 1.0916, + "step": 10267 + }, + { + "epoch": 0.829754136447201, + "grad_norm": 2.347038984298706, + "learning_rate": 6.592925534549133e-06, + "loss": 1.0426, + "step": 10268 + }, + { + "epoch": 0.8298349461605284, + "grad_norm": 2.541348457336426, + "learning_rate": 6.592305254788584e-06, + "loss": 0.9192, + "step": 10269 + }, + { + "epoch": 0.829915755873856, + "grad_norm": 2.2369649410247803, + "learning_rate": 6.591684947755678e-06, + "loss": 0.8332, + "step": 10270 + }, + { + "epoch": 0.8299965655871836, + "grad_norm": 2.574091911315918, + "learning_rate": 6.591064613461042e-06, + "loss": 0.9477, + "step": 10271 + }, + { + "epoch": 0.8300773753005111, + "grad_norm": 2.4311954975128174, + "learning_rate": 6.5904442519153e-06, + "loss": 0.9134, + "step": 10272 + }, + { + "epoch": 0.8301581850138386, + "grad_norm": 2.4001121520996094, + "learning_rate": 6.589823863129074e-06, + "loss": 0.9378, + "step": 10273 + }, + { + "epoch": 0.8302389947271662, + "grad_norm": 2.173936128616333, + "learning_rate": 6.589203447112997e-06, + "loss": 1.0063, + "step": 10274 + }, + { + "epoch": 0.8303198044404938, + "grad_norm": 2.3609704971313477, + "learning_rate": 6.588583003877686e-06, + "loss": 0.9422, + "step": 10275 + }, + { + "epoch": 0.8304006141538213, + "grad_norm": 2.4362683296203613, + "learning_rate": 6.587962533433776e-06, + "loss": 0.9251, + "step": 10276 + }, + { + "epoch": 0.8304814238671488, + "grad_norm": 2.4698076248168945, + "learning_rate": 6.587342035791889e-06, + "loss": 1.0722, + "step": 10277 + }, + { + "epoch": 0.8305622335804764, + "grad_norm": 3.0186030864715576, + "learning_rate": 6.586721510962655e-06, + "loss": 1.0061, + "step": 10278 + }, + { + "epoch": 0.8306430432938039, + "grad_norm": 3.0380935668945312, + "learning_rate": 6.5861009589567015e-06, + "loss": 0.9737, + "step": 10279 + }, + { + "epoch": 0.8307238530071315, + "grad_norm": 2.6261894702911377, + "learning_rate": 6.5854803797846566e-06, + "loss": 0.9573, + "step": 10280 + }, + { + "epoch": 0.830804662720459, + "grad_norm": 2.7082245349884033, + "learning_rate": 6.5848597734571495e-06, + "loss": 0.9382, + "step": 10281 + }, + { + "epoch": 0.8308854724337865, + "grad_norm": 2.721151113510132, + "learning_rate": 6.584239139984811e-06, + "loss": 0.88, + "step": 10282 + }, + { + "epoch": 0.8309662821471141, + "grad_norm": 2.8823740482330322, + "learning_rate": 6.5836184793782686e-06, + "loss": 1.0344, + "step": 10283 + }, + { + "epoch": 0.8310470918604417, + "grad_norm": 2.827423095703125, + "learning_rate": 6.582997791648154e-06, + "loss": 0.9251, + "step": 10284 + }, + { + "epoch": 0.8311279015737691, + "grad_norm": 2.6694536209106445, + "learning_rate": 6.582377076805099e-06, + "loss": 0.9052, + "step": 10285 + }, + { + "epoch": 0.8312087112870967, + "grad_norm": 2.617753267288208, + "learning_rate": 6.581756334859734e-06, + "loss": 0.8906, + "step": 10286 + }, + { + "epoch": 0.8312895210004243, + "grad_norm": 2.254328489303589, + "learning_rate": 6.58113556582269e-06, + "loss": 0.9846, + "step": 10287 + }, + { + "epoch": 0.8313703307137518, + "grad_norm": 2.3716840744018555, + "learning_rate": 6.5805147697046e-06, + "loss": 0.9119, + "step": 10288 + }, + { + "epoch": 0.8314511404270793, + "grad_norm": 3.1163132190704346, + "learning_rate": 6.579893946516098e-06, + "loss": 0.8303, + "step": 10289 + }, + { + "epoch": 0.8315319501404069, + "grad_norm": 2.55526065826416, + "learning_rate": 6.579273096267818e-06, + "loss": 0.8802, + "step": 10290 + }, + { + "epoch": 0.8316127598537344, + "grad_norm": 2.5996673107147217, + "learning_rate": 6.578652218970389e-06, + "loss": 1.0192, + "step": 10291 + }, + { + "epoch": 0.831693569567062, + "grad_norm": 2.965198040008545, + "learning_rate": 6.578031314634447e-06, + "loss": 0.9942, + "step": 10292 + }, + { + "epoch": 0.8317743792803896, + "grad_norm": 2.2330403327941895, + "learning_rate": 6.57741038327063e-06, + "loss": 0.8312, + "step": 10293 + }, + { + "epoch": 0.831855188993717, + "grad_norm": 2.1520514488220215, + "learning_rate": 6.57678942488957e-06, + "loss": 1.1504, + "step": 10294 + }, + { + "epoch": 0.8319359987070446, + "grad_norm": 2.4626641273498535, + "learning_rate": 6.576168439501902e-06, + "loss": 0.9926, + "step": 10295 + }, + { + "epoch": 0.8320168084203722, + "grad_norm": 2.5862386226654053, + "learning_rate": 6.5755474271182655e-06, + "loss": 0.8666, + "step": 10296 + }, + { + "epoch": 0.8320976181336996, + "grad_norm": 2.8516321182250977, + "learning_rate": 6.5749263877492934e-06, + "loss": 0.8282, + "step": 10297 + }, + { + "epoch": 0.8321784278470272, + "grad_norm": 2.7109827995300293, + "learning_rate": 6.574305321405622e-06, + "loss": 1.0934, + "step": 10298 + }, + { + "epoch": 0.8322592375603548, + "grad_norm": 2.948587656021118, + "learning_rate": 6.573684228097893e-06, + "loss": 0.9018, + "step": 10299 + }, + { + "epoch": 0.8323400472736823, + "grad_norm": 2.943542003631592, + "learning_rate": 6.5730631078367406e-06, + "loss": 0.9913, + "step": 10300 + }, + { + "epoch": 0.8324208569870098, + "grad_norm": 2.990828514099121, + "learning_rate": 6.572441960632803e-06, + "loss": 0.8745, + "step": 10301 + }, + { + "epoch": 0.8325016667003374, + "grad_norm": 2.6374523639678955, + "learning_rate": 6.571820786496721e-06, + "loss": 0.9704, + "step": 10302 + }, + { + "epoch": 0.8325824764136649, + "grad_norm": 2.488494396209717, + "learning_rate": 6.571199585439133e-06, + "loss": 0.9956, + "step": 10303 + }, + { + "epoch": 0.8326632861269925, + "grad_norm": 2.843135118484497, + "learning_rate": 6.570578357470678e-06, + "loss": 0.8777, + "step": 10304 + }, + { + "epoch": 0.83274409584032, + "grad_norm": 2.8891029357910156, + "learning_rate": 6.569957102601999e-06, + "loss": 0.9228, + "step": 10305 + }, + { + "epoch": 0.8328249055536475, + "grad_norm": 2.464301347732544, + "learning_rate": 6.569335820843732e-06, + "loss": 0.97, + "step": 10306 + }, + { + "epoch": 0.8329057152669751, + "grad_norm": 2.515451431274414, + "learning_rate": 6.568714512206522e-06, + "loss": 0.9194, + "step": 10307 + }, + { + "epoch": 0.8329865249803027, + "grad_norm": 2.717715263366699, + "learning_rate": 6.568093176701008e-06, + "loss": 0.9511, + "step": 10308 + }, + { + "epoch": 0.8330673346936301, + "grad_norm": 3.003147840499878, + "learning_rate": 6.567471814337834e-06, + "loss": 0.898, + "step": 10309 + }, + { + "epoch": 0.8331481444069577, + "grad_norm": 3.3430235385894775, + "learning_rate": 6.566850425127639e-06, + "loss": 0.9178, + "step": 10310 + }, + { + "epoch": 0.8332289541202853, + "grad_norm": 2.6437325477600098, + "learning_rate": 6.566229009081071e-06, + "loss": 0.9032, + "step": 10311 + }, + { + "epoch": 0.8333097638336128, + "grad_norm": 2.8529152870178223, + "learning_rate": 6.565607566208768e-06, + "loss": 0.8734, + "step": 10312 + }, + { + "epoch": 0.8333905735469403, + "grad_norm": 2.3621718883514404, + "learning_rate": 6.564986096521379e-06, + "loss": 1.0013, + "step": 10313 + }, + { + "epoch": 0.8334713832602679, + "grad_norm": 2.4563350677490234, + "learning_rate": 6.5643646000295425e-06, + "loss": 0.9843, + "step": 10314 + }, + { + "epoch": 0.8335521929735954, + "grad_norm": 2.558286428451538, + "learning_rate": 6.5637430767439096e-06, + "loss": 1.0255, + "step": 10315 + }, + { + "epoch": 0.833633002686923, + "grad_norm": 2.2724075317382812, + "learning_rate": 6.563121526675121e-06, + "loss": 0.9457, + "step": 10316 + }, + { + "epoch": 0.8337138124002506, + "grad_norm": 2.591700315475464, + "learning_rate": 6.5624999498338234e-06, + "loss": 1.0062, + "step": 10317 + }, + { + "epoch": 0.833794622113578, + "grad_norm": 2.2724504470825195, + "learning_rate": 6.561878346230664e-06, + "loss": 0.8854, + "step": 10318 + }, + { + "epoch": 0.8338754318269056, + "grad_norm": 2.7128658294677734, + "learning_rate": 6.5612567158762894e-06, + "loss": 0.9872, + "step": 10319 + }, + { + "epoch": 0.8339562415402332, + "grad_norm": 2.780306816101074, + "learning_rate": 6.560635058781342e-06, + "loss": 0.8845, + "step": 10320 + }, + { + "epoch": 0.8340370512535606, + "grad_norm": 2.6698758602142334, + "learning_rate": 6.5600133749564775e-06, + "loss": 1.0092, + "step": 10321 + }, + { + "epoch": 0.8341178609668882, + "grad_norm": 2.7282967567443848, + "learning_rate": 6.559391664412338e-06, + "loss": 0.9888, + "step": 10322 + }, + { + "epoch": 0.8341986706802158, + "grad_norm": 3.131373167037964, + "learning_rate": 6.558769927159573e-06, + "loss": 1.0423, + "step": 10323 + }, + { + "epoch": 0.8342794803935433, + "grad_norm": 2.3342626094818115, + "learning_rate": 6.558148163208832e-06, + "loss": 0.9927, + "step": 10324 + }, + { + "epoch": 0.8343602901068708, + "grad_norm": 2.7721657752990723, + "learning_rate": 6.557526372570765e-06, + "loss": 0.9143, + "step": 10325 + }, + { + "epoch": 0.8344410998201984, + "grad_norm": 2.354970693588257, + "learning_rate": 6.556904555256019e-06, + "loss": 1.0307, + "step": 10326 + }, + { + "epoch": 0.8345219095335259, + "grad_norm": 2.6982693672180176, + "learning_rate": 6.556282711275247e-06, + "loss": 0.8372, + "step": 10327 + }, + { + "epoch": 0.8346027192468535, + "grad_norm": 2.881446123123169, + "learning_rate": 6.555660840639097e-06, + "loss": 0.9558, + "step": 10328 + }, + { + "epoch": 0.834683528960181, + "grad_norm": 2.7588305473327637, + "learning_rate": 6.555038943358225e-06, + "loss": 0.88, + "step": 10329 + }, + { + "epoch": 0.8347643386735085, + "grad_norm": 2.5668423175811768, + "learning_rate": 6.554417019443278e-06, + "loss": 0.9132, + "step": 10330 + }, + { + "epoch": 0.8348451483868361, + "grad_norm": 2.724865198135376, + "learning_rate": 6.553795068904909e-06, + "loss": 0.9082, + "step": 10331 + }, + { + "epoch": 0.8349259581001637, + "grad_norm": 2.9270431995391846, + "learning_rate": 6.553173091753771e-06, + "loss": 0.9653, + "step": 10332 + }, + { + "epoch": 0.8350067678134911, + "grad_norm": 2.477363109588623, + "learning_rate": 6.552551088000519e-06, + "loss": 0.9978, + "step": 10333 + }, + { + "epoch": 0.8350875775268187, + "grad_norm": 2.592115879058838, + "learning_rate": 6.551929057655802e-06, + "loss": 1.0549, + "step": 10334 + }, + { + "epoch": 0.8351683872401463, + "grad_norm": 3.223541498184204, + "learning_rate": 6.551307000730278e-06, + "loss": 0.9469, + "step": 10335 + }, + { + "epoch": 0.8352491969534738, + "grad_norm": 2.4590396881103516, + "learning_rate": 6.5506849172346e-06, + "loss": 0.9423, + "step": 10336 + }, + { + "epoch": 0.8353300066668014, + "grad_norm": 2.496253252029419, + "learning_rate": 6.5500628071794215e-06, + "loss": 0.8054, + "step": 10337 + }, + { + "epoch": 0.8354108163801289, + "grad_norm": 2.501291513442993, + "learning_rate": 6.549440670575399e-06, + "loss": 0.9644, + "step": 10338 + }, + { + "epoch": 0.8354916260934564, + "grad_norm": 2.6153807640075684, + "learning_rate": 6.548818507433189e-06, + "loss": 0.8983, + "step": 10339 + }, + { + "epoch": 0.835572435806784, + "grad_norm": 2.555190086364746, + "learning_rate": 6.548196317763445e-06, + "loss": 0.8627, + "step": 10340 + }, + { + "epoch": 0.8356532455201116, + "grad_norm": 2.7666139602661133, + "learning_rate": 6.547574101576826e-06, + "loss": 0.9579, + "step": 10341 + }, + { + "epoch": 0.835734055233439, + "grad_norm": 2.5289762020111084, + "learning_rate": 6.546951858883986e-06, + "loss": 0.9392, + "step": 10342 + }, + { + "epoch": 0.8358148649467666, + "grad_norm": 2.3865489959716797, + "learning_rate": 6.546329589695588e-06, + "loss": 0.933, + "step": 10343 + }, + { + "epoch": 0.8358956746600942, + "grad_norm": 2.9827826023101807, + "learning_rate": 6.545707294022286e-06, + "loss": 0.8892, + "step": 10344 + }, + { + "epoch": 0.8359764843734216, + "grad_norm": 2.7289986610412598, + "learning_rate": 6.545084971874738e-06, + "loss": 0.9109, + "step": 10345 + }, + { + "epoch": 0.8360572940867492, + "grad_norm": 2.163463592529297, + "learning_rate": 6.5444626232636045e-06, + "loss": 0.9232, + "step": 10346 + }, + { + "epoch": 0.8361381038000768, + "grad_norm": 2.7812235355377197, + "learning_rate": 6.543840248199546e-06, + "loss": 0.8579, + "step": 10347 + }, + { + "epoch": 0.8362189135134043, + "grad_norm": 2.5764145851135254, + "learning_rate": 6.543217846693217e-06, + "loss": 0.9469, + "step": 10348 + }, + { + "epoch": 0.8362997232267319, + "grad_norm": 2.4816665649414062, + "learning_rate": 6.542595418755286e-06, + "loss": 1.0085, + "step": 10349 + }, + { + "epoch": 0.8363805329400594, + "grad_norm": 2.3393003940582275, + "learning_rate": 6.5419729643964055e-06, + "loss": 0.954, + "step": 10350 + }, + { + "epoch": 0.8364613426533869, + "grad_norm": 2.690126895904541, + "learning_rate": 6.541350483627242e-06, + "loss": 0.9522, + "step": 10351 + }, + { + "epoch": 0.8365421523667145, + "grad_norm": 2.5432019233703613, + "learning_rate": 6.5407279764584555e-06, + "loss": 0.9274, + "step": 10352 + }, + { + "epoch": 0.836622962080042, + "grad_norm": 2.5258004665374756, + "learning_rate": 6.540105442900707e-06, + "loss": 0.9676, + "step": 10353 + }, + { + "epoch": 0.8367037717933695, + "grad_norm": 2.782893180847168, + "learning_rate": 6.539482882964661e-06, + "loss": 0.8797, + "step": 10354 + }, + { + "epoch": 0.8367845815066971, + "grad_norm": 2.819946765899658, + "learning_rate": 6.538860296660978e-06, + "loss": 0.8726, + "step": 10355 + }, + { + "epoch": 0.8368653912200247, + "grad_norm": 2.5403528213500977, + "learning_rate": 6.538237684000324e-06, + "loss": 0.8837, + "step": 10356 + }, + { + "epoch": 0.8369462009333521, + "grad_norm": 2.641281843185425, + "learning_rate": 6.537615044993362e-06, + "loss": 0.8482, + "step": 10357 + }, + { + "epoch": 0.8370270106466797, + "grad_norm": 2.8591344356536865, + "learning_rate": 6.536992379650755e-06, + "loss": 0.8728, + "step": 10358 + }, + { + "epoch": 0.8371078203600073, + "grad_norm": 2.840751886367798, + "learning_rate": 6.5363696879831686e-06, + "loss": 0.9887, + "step": 10359 + }, + { + "epoch": 0.8371886300733348, + "grad_norm": 2.555953025817871, + "learning_rate": 6.535746970001268e-06, + "loss": 0.8691, + "step": 10360 + }, + { + "epoch": 0.8372694397866624, + "grad_norm": 2.655471086502075, + "learning_rate": 6.535124225715719e-06, + "loss": 0.9269, + "step": 10361 + }, + { + "epoch": 0.8373502494999899, + "grad_norm": 2.682887077331543, + "learning_rate": 6.534501455137188e-06, + "loss": 0.852, + "step": 10362 + }, + { + "epoch": 0.8374310592133174, + "grad_norm": 2.834235668182373, + "learning_rate": 6.53387865827634e-06, + "loss": 0.8463, + "step": 10363 + }, + { + "epoch": 0.837511868926645, + "grad_norm": 2.5176448822021484, + "learning_rate": 6.5332558351438454e-06, + "loss": 0.9175, + "step": 10364 + }, + { + "epoch": 0.8375926786399726, + "grad_norm": 2.8520047664642334, + "learning_rate": 6.532632985750369e-06, + "loss": 0.9253, + "step": 10365 + }, + { + "epoch": 0.8376734883533, + "grad_norm": 2.656947135925293, + "learning_rate": 6.5320101101065795e-06, + "loss": 0.9316, + "step": 10366 + }, + { + "epoch": 0.8377542980666276, + "grad_norm": 2.4157190322875977, + "learning_rate": 6.531387208223143e-06, + "loss": 0.9568, + "step": 10367 + }, + { + "epoch": 0.8378351077799552, + "grad_norm": 2.4857492446899414, + "learning_rate": 6.530764280110732e-06, + "loss": 1.0323, + "step": 10368 + }, + { + "epoch": 0.8379159174932826, + "grad_norm": 2.6493284702301025, + "learning_rate": 6.5301413257800126e-06, + "loss": 0.9611, + "step": 10369 + }, + { + "epoch": 0.8379967272066102, + "grad_norm": 2.4020230770111084, + "learning_rate": 6.5295183452416575e-06, + "loss": 1.0994, + "step": 10370 + }, + { + "epoch": 0.8380775369199378, + "grad_norm": 2.5846850872039795, + "learning_rate": 6.528895338506334e-06, + "loss": 0.8125, + "step": 10371 + }, + { + "epoch": 0.8381583466332653, + "grad_norm": 2.467902421951294, + "learning_rate": 6.528272305584717e-06, + "loss": 0.9869, + "step": 10372 + }, + { + "epoch": 0.8382391563465929, + "grad_norm": 2.4679744243621826, + "learning_rate": 6.527649246487471e-06, + "loss": 0.8842, + "step": 10373 + }, + { + "epoch": 0.8383199660599204, + "grad_norm": 2.61643648147583, + "learning_rate": 6.5270261612252725e-06, + "loss": 1.0172, + "step": 10374 + }, + { + "epoch": 0.8384007757732479, + "grad_norm": 2.2779440879821777, + "learning_rate": 6.526403049808791e-06, + "loss": 0.9067, + "step": 10375 + }, + { + "epoch": 0.8384815854865755, + "grad_norm": 2.7108347415924072, + "learning_rate": 6.525779912248702e-06, + "loss": 0.8728, + "step": 10376 + }, + { + "epoch": 0.838562395199903, + "grad_norm": 2.429582118988037, + "learning_rate": 6.525156748555674e-06, + "loss": 0.9884, + "step": 10377 + }, + { + "epoch": 0.8386432049132305, + "grad_norm": 3.2600700855255127, + "learning_rate": 6.524533558740385e-06, + "loss": 0.9382, + "step": 10378 + }, + { + "epoch": 0.8387240146265581, + "grad_norm": 3.188776731491089, + "learning_rate": 6.523910342813504e-06, + "loss": 1.016, + "step": 10379 + }, + { + "epoch": 0.8388048243398857, + "grad_norm": 3.175647258758545, + "learning_rate": 6.523287100785709e-06, + "loss": 0.9298, + "step": 10380 + }, + { + "epoch": 0.8388856340532131, + "grad_norm": 2.506345510482788, + "learning_rate": 6.522663832667672e-06, + "loss": 0.9197, + "step": 10381 + }, + { + "epoch": 0.8389664437665407, + "grad_norm": 2.5089378356933594, + "learning_rate": 6.52204053847007e-06, + "loss": 0.865, + "step": 10382 + }, + { + "epoch": 0.8390472534798683, + "grad_norm": 2.816622495651245, + "learning_rate": 6.521417218203579e-06, + "loss": 0.9613, + "step": 10383 + }, + { + "epoch": 0.8391280631931958, + "grad_norm": 2.5885701179504395, + "learning_rate": 6.520793871878871e-06, + "loss": 0.9186, + "step": 10384 + }, + { + "epoch": 0.8392088729065234, + "grad_norm": 2.666088581085205, + "learning_rate": 6.520170499506626e-06, + "loss": 0.937, + "step": 10385 + }, + { + "epoch": 0.8392896826198509, + "grad_norm": 2.1622211933135986, + "learning_rate": 6.519547101097522e-06, + "loss": 0.9675, + "step": 10386 + }, + { + "epoch": 0.8393704923331784, + "grad_norm": 2.9082515239715576, + "learning_rate": 6.518923676662231e-06, + "loss": 0.9938, + "step": 10387 + }, + { + "epoch": 0.839451302046506, + "grad_norm": 2.3681159019470215, + "learning_rate": 6.518300226211437e-06, + "loss": 1.0046, + "step": 10388 + }, + { + "epoch": 0.8395321117598336, + "grad_norm": 2.5531108379364014, + "learning_rate": 6.517676749755813e-06, + "loss": 1.1372, + "step": 10389 + }, + { + "epoch": 0.839612921473161, + "grad_norm": 3.0269222259521484, + "learning_rate": 6.5170532473060425e-06, + "loss": 0.9001, + "step": 10390 + }, + { + "epoch": 0.8396937311864886, + "grad_norm": 2.360135555267334, + "learning_rate": 6.5164297188728e-06, + "loss": 0.9343, + "step": 10391 + }, + { + "epoch": 0.8397745408998162, + "grad_norm": 2.1920666694641113, + "learning_rate": 6.515806164466768e-06, + "loss": 0.9894, + "step": 10392 + }, + { + "epoch": 0.8398553506131436, + "grad_norm": 2.4082422256469727, + "learning_rate": 6.515182584098624e-06, + "loss": 0.8902, + "step": 10393 + }, + { + "epoch": 0.8399361603264712, + "grad_norm": 2.6963584423065186, + "learning_rate": 6.514558977779052e-06, + "loss": 0.9596, + "step": 10394 + }, + { + "epoch": 0.8400169700397988, + "grad_norm": 3.0328590869903564, + "learning_rate": 6.513935345518731e-06, + "loss": 1.0362, + "step": 10395 + }, + { + "epoch": 0.8400977797531263, + "grad_norm": 2.9155263900756836, + "learning_rate": 6.51331168732834e-06, + "loss": 0.9551, + "step": 10396 + }, + { + "epoch": 0.8401785894664539, + "grad_norm": 2.918823719024658, + "learning_rate": 6.5126880032185634e-06, + "loss": 0.973, + "step": 10397 + }, + { + "epoch": 0.8402593991797814, + "grad_norm": 2.338228940963745, + "learning_rate": 6.512064293200084e-06, + "loss": 0.8797, + "step": 10398 + }, + { + "epoch": 0.8403402088931089, + "grad_norm": 2.837373971939087, + "learning_rate": 6.511440557283584e-06, + "loss": 0.86, + "step": 10399 + }, + { + "epoch": 0.8404210186064365, + "grad_norm": 2.5574188232421875, + "learning_rate": 6.5108167954797455e-06, + "loss": 0.8753, + "step": 10400 + }, + { + "epoch": 0.840501828319764, + "grad_norm": 2.618107795715332, + "learning_rate": 6.510193007799251e-06, + "loss": 0.8964, + "step": 10401 + }, + { + "epoch": 0.8405826380330915, + "grad_norm": 2.5248045921325684, + "learning_rate": 6.509569194252787e-06, + "loss": 0.9029, + "step": 10402 + }, + { + "epoch": 0.8406634477464191, + "grad_norm": 2.905775547027588, + "learning_rate": 6.508945354851037e-06, + "loss": 0.9027, + "step": 10403 + }, + { + "epoch": 0.8407442574597467, + "grad_norm": 2.5878195762634277, + "learning_rate": 6.508321489604685e-06, + "loss": 1.0004, + "step": 10404 + }, + { + "epoch": 0.8408250671730743, + "grad_norm": 3.027595043182373, + "learning_rate": 6.507697598524417e-06, + "loss": 0.9041, + "step": 10405 + }, + { + "epoch": 0.8409058768864017, + "grad_norm": 2.4799745082855225, + "learning_rate": 6.5070736816209205e-06, + "loss": 0.9806, + "step": 10406 + }, + { + "epoch": 0.8409866865997293, + "grad_norm": 2.5228214263916016, + "learning_rate": 6.5064497389048775e-06, + "loss": 0.9409, + "step": 10407 + }, + { + "epoch": 0.8410674963130569, + "grad_norm": 2.364022731781006, + "learning_rate": 6.5058257703869786e-06, + "loss": 0.9317, + "step": 10408 + }, + { + "epoch": 0.8411483060263844, + "grad_norm": 3.2667176723480225, + "learning_rate": 6.50520177607791e-06, + "loss": 0.8653, + "step": 10409 + }, + { + "epoch": 0.8412291157397119, + "grad_norm": 2.6843714714050293, + "learning_rate": 6.504577755988357e-06, + "loss": 0.9667, + "step": 10410 + }, + { + "epoch": 0.8413099254530395, + "grad_norm": 2.670832395553589, + "learning_rate": 6.50395371012901e-06, + "loss": 0.9697, + "step": 10411 + }, + { + "epoch": 0.841390735166367, + "grad_norm": 2.891305923461914, + "learning_rate": 6.503329638510556e-06, + "loss": 0.9298, + "step": 10412 + }, + { + "epoch": 0.8414715448796946, + "grad_norm": 2.956672191619873, + "learning_rate": 6.502705541143685e-06, + "loss": 1.0445, + "step": 10413 + }, + { + "epoch": 0.8415523545930221, + "grad_norm": 2.436795234680176, + "learning_rate": 6.502081418039086e-06, + "loss": 0.9082, + "step": 10414 + }, + { + "epoch": 0.8416331643063496, + "grad_norm": 3.1604461669921875, + "learning_rate": 6.501457269207446e-06, + "loss": 0.8802, + "step": 10415 + }, + { + "epoch": 0.8417139740196772, + "grad_norm": 2.384186267852783, + "learning_rate": 6.500833094659461e-06, + "loss": 0.9525, + "step": 10416 + }, + { + "epoch": 0.8417947837330048, + "grad_norm": 2.745936155319214, + "learning_rate": 6.500208894405817e-06, + "loss": 0.8545, + "step": 10417 + }, + { + "epoch": 0.8418755934463322, + "grad_norm": 2.5840983390808105, + "learning_rate": 6.499584668457205e-06, + "loss": 0.9579, + "step": 10418 + }, + { + "epoch": 0.8419564031596598, + "grad_norm": 2.8174636363983154, + "learning_rate": 6.498960416824319e-06, + "loss": 0.9557, + "step": 10419 + }, + { + "epoch": 0.8420372128729874, + "grad_norm": 2.354670763015747, + "learning_rate": 6.498336139517849e-06, + "loss": 0.8537, + "step": 10420 + }, + { + "epoch": 0.8421180225863149, + "grad_norm": 2.612330675125122, + "learning_rate": 6.497711836548488e-06, + "loss": 0.9151, + "step": 10421 + }, + { + "epoch": 0.8421988322996424, + "grad_norm": 2.763094663619995, + "learning_rate": 6.497087507926929e-06, + "loss": 0.9803, + "step": 10422 + }, + { + "epoch": 0.84227964201297, + "grad_norm": 2.8730318546295166, + "learning_rate": 6.4964631536638655e-06, + "loss": 1.0824, + "step": 10423 + }, + { + "epoch": 0.8423604517262975, + "grad_norm": 2.7079296112060547, + "learning_rate": 6.4958387737699916e-06, + "loss": 1.0005, + "step": 10424 + }, + { + "epoch": 0.842441261439625, + "grad_norm": 3.3459935188293457, + "learning_rate": 6.495214368256e-06, + "loss": 0.8695, + "step": 10425 + }, + { + "epoch": 0.8425220711529526, + "grad_norm": 2.3343212604522705, + "learning_rate": 6.494589937132585e-06, + "loss": 0.9076, + "step": 10426 + }, + { + "epoch": 0.8426028808662801, + "grad_norm": 2.8797125816345215, + "learning_rate": 6.493965480410443e-06, + "loss": 0.8911, + "step": 10427 + }, + { + "epoch": 0.8426836905796077, + "grad_norm": 2.4214296340942383, + "learning_rate": 6.493340998100268e-06, + "loss": 0.9846, + "step": 10428 + }, + { + "epoch": 0.8427645002929353, + "grad_norm": 2.4357798099517822, + "learning_rate": 6.492716490212757e-06, + "loss": 0.8807, + "step": 10429 + }, + { + "epoch": 0.8428453100062627, + "grad_norm": 2.704138994216919, + "learning_rate": 6.492091956758606e-06, + "loss": 0.9051, + "step": 10430 + }, + { + "epoch": 0.8429261197195903, + "grad_norm": 2.5518760681152344, + "learning_rate": 6.491467397748514e-06, + "loss": 1.0004, + "step": 10431 + }, + { + "epoch": 0.8430069294329179, + "grad_norm": 2.386793613433838, + "learning_rate": 6.490842813193174e-06, + "loss": 0.9185, + "step": 10432 + }, + { + "epoch": 0.8430877391462454, + "grad_norm": 2.8249313831329346, + "learning_rate": 6.4902182031032866e-06, + "loss": 1.0037, + "step": 10433 + }, + { + "epoch": 0.8431685488595729, + "grad_norm": 2.5971086025238037, + "learning_rate": 6.489593567489548e-06, + "loss": 0.8821, + "step": 10434 + }, + { + "epoch": 0.8432493585729005, + "grad_norm": 2.56463360786438, + "learning_rate": 6.4889689063626585e-06, + "loss": 0.9605, + "step": 10435 + }, + { + "epoch": 0.843330168286228, + "grad_norm": 2.573432207107544, + "learning_rate": 6.488344219733316e-06, + "loss": 0.903, + "step": 10436 + }, + { + "epoch": 0.8434109779995556, + "grad_norm": 2.7142174243927, + "learning_rate": 6.487719507612219e-06, + "loss": 1.094, + "step": 10437 + }, + { + "epoch": 0.8434917877128831, + "grad_norm": 2.6429712772369385, + "learning_rate": 6.487094770010069e-06, + "loss": 0.9953, + "step": 10438 + }, + { + "epoch": 0.8435725974262106, + "grad_norm": 2.763674020767212, + "learning_rate": 6.486470006937567e-06, + "loss": 0.8782, + "step": 10439 + }, + { + "epoch": 0.8436534071395382, + "grad_norm": 2.7453370094299316, + "learning_rate": 6.48584521840541e-06, + "loss": 0.9211, + "step": 10440 + }, + { + "epoch": 0.8437342168528658, + "grad_norm": 2.8645360469818115, + "learning_rate": 6.485220404424304e-06, + "loss": 0.882, + "step": 10441 + }, + { + "epoch": 0.8438150265661932, + "grad_norm": 2.6044774055480957, + "learning_rate": 6.4845955650049454e-06, + "loss": 0.8915, + "step": 10442 + }, + { + "epoch": 0.8438958362795208, + "grad_norm": 2.4029619693756104, + "learning_rate": 6.4839707001580395e-06, + "loss": 0.9417, + "step": 10443 + }, + { + "epoch": 0.8439766459928484, + "grad_norm": 2.8236749172210693, + "learning_rate": 6.483345809894289e-06, + "loss": 0.9543, + "step": 10444 + }, + { + "epoch": 0.8440574557061759, + "grad_norm": 2.9325554370880127, + "learning_rate": 6.482720894224397e-06, + "loss": 0.9072, + "step": 10445 + }, + { + "epoch": 0.8441382654195034, + "grad_norm": 2.2445032596588135, + "learning_rate": 6.482095953159062e-06, + "loss": 0.8364, + "step": 10446 + }, + { + "epoch": 0.844219075132831, + "grad_norm": 2.74062442779541, + "learning_rate": 6.481470986708994e-06, + "loss": 0.9539, + "step": 10447 + }, + { + "epoch": 0.8442998848461585, + "grad_norm": 2.362377405166626, + "learning_rate": 6.480845994884893e-06, + "loss": 0.9643, + "step": 10448 + }, + { + "epoch": 0.8443806945594861, + "grad_norm": 2.9559617042541504, + "learning_rate": 6.480220977697467e-06, + "loss": 0.8588, + "step": 10449 + }, + { + "epoch": 0.8444615042728136, + "grad_norm": 2.6655261516571045, + "learning_rate": 6.479595935157417e-06, + "loss": 0.9298, + "step": 10450 + }, + { + "epoch": 0.8445423139861411, + "grad_norm": 2.5437235832214355, + "learning_rate": 6.478970867275451e-06, + "loss": 0.9388, + "step": 10451 + }, + { + "epoch": 0.8446231236994687, + "grad_norm": 2.9839928150177, + "learning_rate": 6.478345774062276e-06, + "loss": 0.9739, + "step": 10452 + }, + { + "epoch": 0.8447039334127963, + "grad_norm": 2.721081495285034, + "learning_rate": 6.477720655528597e-06, + "loss": 0.9002, + "step": 10453 + }, + { + "epoch": 0.8447847431261237, + "grad_norm": 2.4781289100646973, + "learning_rate": 6.477095511685117e-06, + "loss": 1.0952, + "step": 10454 + }, + { + "epoch": 0.8448655528394513, + "grad_norm": 2.4446706771850586, + "learning_rate": 6.476470342542552e-06, + "loss": 1.0701, + "step": 10455 + }, + { + "epoch": 0.8449463625527789, + "grad_norm": 2.812384605407715, + "learning_rate": 6.4758451481116014e-06, + "loss": 1.0214, + "step": 10456 + }, + { + "epoch": 0.8450271722661064, + "grad_norm": 3.1847965717315674, + "learning_rate": 6.475219928402976e-06, + "loss": 0.9729, + "step": 10457 + }, + { + "epoch": 0.8451079819794339, + "grad_norm": 2.3405656814575195, + "learning_rate": 6.474594683427385e-06, + "loss": 1.065, + "step": 10458 + }, + { + "epoch": 0.8451887916927615, + "grad_norm": 2.5131845474243164, + "learning_rate": 6.4739694131955385e-06, + "loss": 0.9468, + "step": 10459 + }, + { + "epoch": 0.845269601406089, + "grad_norm": 2.65091609954834, + "learning_rate": 6.4733441177181435e-06, + "loss": 0.9437, + "step": 10460 + }, + { + "epoch": 0.8453504111194166, + "grad_norm": 2.3615477085113525, + "learning_rate": 6.47271879700591e-06, + "loss": 0.8991, + "step": 10461 + }, + { + "epoch": 0.8454312208327441, + "grad_norm": 2.9846079349517822, + "learning_rate": 6.47209345106955e-06, + "loss": 0.9782, + "step": 10462 + }, + { + "epoch": 0.8455120305460716, + "grad_norm": 2.397481918334961, + "learning_rate": 6.4714680799197725e-06, + "loss": 0.9648, + "step": 10463 + }, + { + "epoch": 0.8455928402593992, + "grad_norm": 3.003866672515869, + "learning_rate": 6.47084268356729e-06, + "loss": 0.961, + "step": 10464 + }, + { + "epoch": 0.8456736499727268, + "grad_norm": 2.726534843444824, + "learning_rate": 6.470217262022812e-06, + "loss": 0.9583, + "step": 10465 + }, + { + "epoch": 0.8457544596860542, + "grad_norm": 2.547591209411621, + "learning_rate": 6.469591815297051e-06, + "loss": 0.9565, + "step": 10466 + }, + { + "epoch": 0.8458352693993818, + "grad_norm": 2.7158493995666504, + "learning_rate": 6.4689663434007235e-06, + "loss": 0.8202, + "step": 10467 + }, + { + "epoch": 0.8459160791127094, + "grad_norm": 2.6833715438842773, + "learning_rate": 6.468340846344536e-06, + "loss": 0.8625, + "step": 10468 + }, + { + "epoch": 0.8459968888260369, + "grad_norm": 2.635164499282837, + "learning_rate": 6.4677153241392065e-06, + "loss": 1.0071, + "step": 10469 + }, + { + "epoch": 0.8460776985393644, + "grad_norm": 2.7561097145080566, + "learning_rate": 6.467089776795446e-06, + "loss": 0.8652, + "step": 10470 + }, + { + "epoch": 0.846158508252692, + "grad_norm": 2.521637201309204, + "learning_rate": 6.466464204323969e-06, + "loss": 1.0617, + "step": 10471 + }, + { + "epoch": 0.8462393179660195, + "grad_norm": 3.266026020050049, + "learning_rate": 6.46583860673549e-06, + "loss": 0.9337, + "step": 10472 + }, + { + "epoch": 0.8463201276793471, + "grad_norm": 2.7632336616516113, + "learning_rate": 6.465212984040727e-06, + "loss": 0.9889, + "step": 10473 + }, + { + "epoch": 0.8464009373926746, + "grad_norm": 2.851417064666748, + "learning_rate": 6.464587336250389e-06, + "loss": 1.0364, + "step": 10474 + }, + { + "epoch": 0.8464817471060021, + "grad_norm": 2.3812038898468018, + "learning_rate": 6.4639616633752e-06, + "loss": 0.9583, + "step": 10475 + }, + { + "epoch": 0.8465625568193297, + "grad_norm": 2.4048590660095215, + "learning_rate": 6.463335965425871e-06, + "loss": 1.0242, + "step": 10476 + }, + { + "epoch": 0.8466433665326573, + "grad_norm": 2.4567837715148926, + "learning_rate": 6.462710242413118e-06, + "loss": 0.8045, + "step": 10477 + }, + { + "epoch": 0.8467241762459847, + "grad_norm": 2.8058969974517822, + "learning_rate": 6.4620844943476615e-06, + "loss": 1.0603, + "step": 10478 + }, + { + "epoch": 0.8468049859593123, + "grad_norm": 2.433835983276367, + "learning_rate": 6.461458721240217e-06, + "loss": 0.8385, + "step": 10479 + }, + { + "epoch": 0.8468857956726399, + "grad_norm": 2.4225337505340576, + "learning_rate": 6.460832923101502e-06, + "loss": 0.9962, + "step": 10480 + }, + { + "epoch": 0.8469666053859674, + "grad_norm": 3.16461181640625, + "learning_rate": 6.460207099942237e-06, + "loss": 0.8905, + "step": 10481 + }, + { + "epoch": 0.8470474150992949, + "grad_norm": 3.0563158988952637, + "learning_rate": 6.459581251773139e-06, + "loss": 0.8266, + "step": 10482 + }, + { + "epoch": 0.8471282248126225, + "grad_norm": 2.891674757003784, + "learning_rate": 6.458955378604929e-06, + "loss": 0.9402, + "step": 10483 + }, + { + "epoch": 0.84720903452595, + "grad_norm": 2.4817230701446533, + "learning_rate": 6.458329480448324e-06, + "loss": 0.9309, + "step": 10484 + }, + { + "epoch": 0.8472898442392776, + "grad_norm": 2.5383822917938232, + "learning_rate": 6.457703557314048e-06, + "loss": 0.937, + "step": 10485 + }, + { + "epoch": 0.8473706539526051, + "grad_norm": 2.653319835662842, + "learning_rate": 6.457077609212817e-06, + "loss": 0.9991, + "step": 10486 + }, + { + "epoch": 0.8474514636659326, + "grad_norm": 2.8143310546875, + "learning_rate": 6.456451636155355e-06, + "loss": 0.878, + "step": 10487 + }, + { + "epoch": 0.8475322733792602, + "grad_norm": 2.7485921382904053, + "learning_rate": 6.455825638152383e-06, + "loss": 1.0788, + "step": 10488 + }, + { + "epoch": 0.8476130830925878, + "grad_norm": 2.448368787765503, + "learning_rate": 6.455199615214623e-06, + "loss": 0.9665, + "step": 10489 + }, + { + "epoch": 0.8476938928059152, + "grad_norm": 2.6947338581085205, + "learning_rate": 6.454573567352797e-06, + "loss": 0.8635, + "step": 10490 + }, + { + "epoch": 0.8477747025192428, + "grad_norm": 2.86772084236145, + "learning_rate": 6.453947494577627e-06, + "loss": 0.8238, + "step": 10491 + }, + { + "epoch": 0.8478555122325704, + "grad_norm": 3.0618245601654053, + "learning_rate": 6.453321396899837e-06, + "loss": 0.9415, + "step": 10492 + }, + { + "epoch": 0.8479363219458979, + "grad_norm": 2.782890558242798, + "learning_rate": 6.452695274330149e-06, + "loss": 0.9395, + "step": 10493 + }, + { + "epoch": 0.8480171316592254, + "grad_norm": 2.7398335933685303, + "learning_rate": 6.452069126879289e-06, + "loss": 0.8563, + "step": 10494 + }, + { + "epoch": 0.848097941372553, + "grad_norm": 2.7994000911712646, + "learning_rate": 6.451442954557981e-06, + "loss": 1.0517, + "step": 10495 + }, + { + "epoch": 0.8481787510858805, + "grad_norm": 2.945470094680786, + "learning_rate": 6.450816757376949e-06, + "loss": 0.8899, + "step": 10496 + }, + { + "epoch": 0.8482595607992081, + "grad_norm": 2.6887476444244385, + "learning_rate": 6.450190535346918e-06, + "loss": 0.9508, + "step": 10497 + }, + { + "epoch": 0.8483403705125356, + "grad_norm": 2.4166359901428223, + "learning_rate": 6.449564288478616e-06, + "loss": 0.9646, + "step": 10498 + }, + { + "epoch": 0.8484211802258631, + "grad_norm": 2.211223602294922, + "learning_rate": 6.448938016782766e-06, + "loss": 0.9367, + "step": 10499 + }, + { + "epoch": 0.8485019899391907, + "grad_norm": 2.7613167762756348, + "learning_rate": 6.448311720270096e-06, + "loss": 0.8813, + "step": 10500 + }, + { + "epoch": 0.8485827996525183, + "grad_norm": 2.7050299644470215, + "learning_rate": 6.447685398951333e-06, + "loss": 0.936, + "step": 10501 + }, + { + "epoch": 0.8486636093658457, + "grad_norm": 2.3954224586486816, + "learning_rate": 6.4470590528372054e-06, + "loss": 0.7174, + "step": 10502 + }, + { + "epoch": 0.8487444190791733, + "grad_norm": 2.9367053508758545, + "learning_rate": 6.446432681938439e-06, + "loss": 0.8673, + "step": 10503 + }, + { + "epoch": 0.8488252287925009, + "grad_norm": 2.6032369136810303, + "learning_rate": 6.445806286265764e-06, + "loss": 0.9011, + "step": 10504 + }, + { + "epoch": 0.8489060385058284, + "grad_norm": 2.7029058933258057, + "learning_rate": 6.445179865829905e-06, + "loss": 0.8865, + "step": 10505 + }, + { + "epoch": 0.8489868482191559, + "grad_norm": 2.5612471103668213, + "learning_rate": 6.444553420641597e-06, + "loss": 0.9992, + "step": 10506 + }, + { + "epoch": 0.8490676579324835, + "grad_norm": 2.1173813343048096, + "learning_rate": 6.443926950711564e-06, + "loss": 0.9415, + "step": 10507 + }, + { + "epoch": 0.849148467645811, + "grad_norm": 2.9662208557128906, + "learning_rate": 6.4433004560505405e-06, + "loss": 0.9405, + "step": 10508 + }, + { + "epoch": 0.8492292773591386, + "grad_norm": 2.5690298080444336, + "learning_rate": 6.442673936669255e-06, + "loss": 0.886, + "step": 10509 + }, + { + "epoch": 0.8493100870724661, + "grad_norm": 2.6806228160858154, + "learning_rate": 6.4420473925784365e-06, + "loss": 0.8541, + "step": 10510 + }, + { + "epoch": 0.8493908967857936, + "grad_norm": 2.445883274078369, + "learning_rate": 6.441420823788819e-06, + "loss": 0.8864, + "step": 10511 + }, + { + "epoch": 0.8494717064991212, + "grad_norm": 2.7138330936431885, + "learning_rate": 6.440794230311133e-06, + "loss": 0.9217, + "step": 10512 + }, + { + "epoch": 0.8495525162124488, + "grad_norm": 2.6860594749450684, + "learning_rate": 6.440167612156109e-06, + "loss": 0.9295, + "step": 10513 + }, + { + "epoch": 0.8496333259257762, + "grad_norm": 2.641309976577759, + "learning_rate": 6.439540969334481e-06, + "loss": 1.03, + "step": 10514 + }, + { + "epoch": 0.8497141356391038, + "grad_norm": 2.674391031265259, + "learning_rate": 6.4389143018569834e-06, + "loss": 0.9904, + "step": 10515 + }, + { + "epoch": 0.8497949453524314, + "grad_norm": 2.4982192516326904, + "learning_rate": 6.438287609734346e-06, + "loss": 1.0411, + "step": 10516 + }, + { + "epoch": 0.8498757550657589, + "grad_norm": 2.8474903106689453, + "learning_rate": 6.437660892977305e-06, + "loss": 0.7843, + "step": 10517 + }, + { + "epoch": 0.8499565647790864, + "grad_norm": 2.4621262550354004, + "learning_rate": 6.437034151596595e-06, + "loss": 0.8982, + "step": 10518 + }, + { + "epoch": 0.850037374492414, + "grad_norm": 2.777421236038208, + "learning_rate": 6.436407385602948e-06, + "loss": 0.9847, + "step": 10519 + }, + { + "epoch": 0.8501181842057415, + "grad_norm": 2.378206729888916, + "learning_rate": 6.435780595007102e-06, + "loss": 0.9996, + "step": 10520 + }, + { + "epoch": 0.8501989939190691, + "grad_norm": 2.539928436279297, + "learning_rate": 6.435153779819788e-06, + "loss": 0.978, + "step": 10521 + }, + { + "epoch": 0.8502798036323966, + "grad_norm": 2.253937244415283, + "learning_rate": 6.4345269400517485e-06, + "loss": 0.9378, + "step": 10522 + }, + { + "epoch": 0.8503606133457241, + "grad_norm": 2.5191359519958496, + "learning_rate": 6.433900075713714e-06, + "loss": 0.8788, + "step": 10523 + }, + { + "epoch": 0.8504414230590517, + "grad_norm": 2.7018587589263916, + "learning_rate": 6.4332731868164235e-06, + "loss": 0.8876, + "step": 10524 + }, + { + "epoch": 0.8505222327723793, + "grad_norm": 2.474862813949585, + "learning_rate": 6.432646273370613e-06, + "loss": 0.939, + "step": 10525 + }, + { + "epoch": 0.8506030424857067, + "grad_norm": 2.328767776489258, + "learning_rate": 6.432019335387023e-06, + "loss": 0.9886, + "step": 10526 + }, + { + "epoch": 0.8506838521990343, + "grad_norm": 2.6595728397369385, + "learning_rate": 6.431392372876386e-06, + "loss": 0.9237, + "step": 10527 + }, + { + "epoch": 0.8507646619123619, + "grad_norm": 2.5876286029815674, + "learning_rate": 6.430765385849447e-06, + "loss": 1.087, + "step": 10528 + }, + { + "epoch": 0.8508454716256894, + "grad_norm": 2.5601816177368164, + "learning_rate": 6.430138374316939e-06, + "loss": 1.0598, + "step": 10529 + }, + { + "epoch": 0.8509262813390169, + "grad_norm": 2.8048312664031982, + "learning_rate": 6.429511338289604e-06, + "loss": 0.9395, + "step": 10530 + }, + { + "epoch": 0.8510070910523445, + "grad_norm": 2.326653480529785, + "learning_rate": 6.428884277778183e-06, + "loss": 0.9201, + "step": 10531 + }, + { + "epoch": 0.851087900765672, + "grad_norm": 2.5945656299591064, + "learning_rate": 6.428257192793411e-06, + "loss": 0.8322, + "step": 10532 + }, + { + "epoch": 0.8511687104789996, + "grad_norm": 2.785053253173828, + "learning_rate": 6.427630083346033e-06, + "loss": 0.9954, + "step": 10533 + }, + { + "epoch": 0.8512495201923271, + "grad_norm": 2.6776323318481445, + "learning_rate": 6.4270029494467904e-06, + "loss": 0.8333, + "step": 10534 + }, + { + "epoch": 0.8513303299056547, + "grad_norm": 2.6399784088134766, + "learning_rate": 6.4263757911064195e-06, + "loss": 0.9619, + "step": 10535 + }, + { + "epoch": 0.8514111396189822, + "grad_norm": 2.997673511505127, + "learning_rate": 6.425748608335668e-06, + "loss": 0.984, + "step": 10536 + }, + { + "epoch": 0.8514919493323098, + "grad_norm": 2.540785551071167, + "learning_rate": 6.4251214011452735e-06, + "loss": 0.8854, + "step": 10537 + }, + { + "epoch": 0.8515727590456373, + "grad_norm": 2.7109200954437256, + "learning_rate": 6.424494169545981e-06, + "loss": 0.9228, + "step": 10538 + }, + { + "epoch": 0.8516535687589648, + "grad_norm": 2.867309093475342, + "learning_rate": 6.423866913548532e-06, + "loss": 0.9037, + "step": 10539 + }, + { + "epoch": 0.8517343784722924, + "grad_norm": 2.5105175971984863, + "learning_rate": 6.423239633163673e-06, + "loss": 0.8214, + "step": 10540 + }, + { + "epoch": 0.85181518818562, + "grad_norm": 2.6527299880981445, + "learning_rate": 6.4226123284021416e-06, + "loss": 0.9401, + "step": 10541 + }, + { + "epoch": 0.8518959978989474, + "grad_norm": 2.8117775917053223, + "learning_rate": 6.4219849992746885e-06, + "loss": 0.9948, + "step": 10542 + }, + { + "epoch": 0.851976807612275, + "grad_norm": 2.792485475540161, + "learning_rate": 6.421357645792054e-06, + "loss": 0.981, + "step": 10543 + }, + { + "epoch": 0.8520576173256026, + "grad_norm": 2.7991905212402344, + "learning_rate": 6.4207302679649865e-06, + "loss": 1.0453, + "step": 10544 + }, + { + "epoch": 0.8521384270389301, + "grad_norm": 3.1634392738342285, + "learning_rate": 6.420102865804228e-06, + "loss": 0.9755, + "step": 10545 + }, + { + "epoch": 0.8522192367522576, + "grad_norm": 2.360008716583252, + "learning_rate": 6.419475439320527e-06, + "loss": 0.9726, + "step": 10546 + }, + { + "epoch": 0.8523000464655852, + "grad_norm": 2.892042875289917, + "learning_rate": 6.418847988524629e-06, + "loss": 0.9303, + "step": 10547 + }, + { + "epoch": 0.8523808561789127, + "grad_norm": 2.1792070865631104, + "learning_rate": 6.418220513427282e-06, + "loss": 0.893, + "step": 10548 + }, + { + "epoch": 0.8524616658922403, + "grad_norm": 2.467843770980835, + "learning_rate": 6.417593014039229e-06, + "loss": 0.9939, + "step": 10549 + }, + { + "epoch": 0.8525424756055678, + "grad_norm": 2.352982997894287, + "learning_rate": 6.416965490371223e-06, + "loss": 0.8349, + "step": 10550 + }, + { + "epoch": 0.8526232853188953, + "grad_norm": 2.815352439880371, + "learning_rate": 6.4163379424340075e-06, + "loss": 0.9593, + "step": 10551 + }, + { + "epoch": 0.8527040950322229, + "grad_norm": 2.481914758682251, + "learning_rate": 6.415710370238334e-06, + "loss": 0.9265, + "step": 10552 + }, + { + "epoch": 0.8527849047455505, + "grad_norm": 2.6599695682525635, + "learning_rate": 6.41508277379495e-06, + "loss": 1.0223, + "step": 10553 + }, + { + "epoch": 0.8528657144588779, + "grad_norm": 2.587871789932251, + "learning_rate": 6.414455153114604e-06, + "loss": 1.0665, + "step": 10554 + }, + { + "epoch": 0.8529465241722055, + "grad_norm": 2.924708604812622, + "learning_rate": 6.413827508208046e-06, + "loss": 0.8857, + "step": 10555 + }, + { + "epoch": 0.8530273338855331, + "grad_norm": 2.524275064468384, + "learning_rate": 6.413199839086029e-06, + "loss": 0.9162, + "step": 10556 + }, + { + "epoch": 0.8531081435988606, + "grad_norm": 3.098365306854248, + "learning_rate": 6.4125721457592984e-06, + "loss": 0.8304, + "step": 10557 + }, + { + "epoch": 0.8531889533121881, + "grad_norm": 2.604714870452881, + "learning_rate": 6.411944428238608e-06, + "loss": 0.9379, + "step": 10558 + }, + { + "epoch": 0.8532697630255157, + "grad_norm": 2.5910441875457764, + "learning_rate": 6.411316686534709e-06, + "loss": 1.085, + "step": 10559 + }, + { + "epoch": 0.8533505727388432, + "grad_norm": 2.3200278282165527, + "learning_rate": 6.410688920658352e-06, + "loss": 0.9693, + "step": 10560 + }, + { + "epoch": 0.8534313824521708, + "grad_norm": 2.490990161895752, + "learning_rate": 6.410061130620291e-06, + "loss": 0.9117, + "step": 10561 + }, + { + "epoch": 0.8535121921654983, + "grad_norm": 2.962435245513916, + "learning_rate": 6.409433316431276e-06, + "loss": 0.8826, + "step": 10562 + }, + { + "epoch": 0.8535930018788258, + "grad_norm": 2.776380777359009, + "learning_rate": 6.4088054781020625e-06, + "loss": 0.8119, + "step": 10563 + }, + { + "epoch": 0.8536738115921534, + "grad_norm": 2.5014638900756836, + "learning_rate": 6.4081776156434025e-06, + "loss": 0.969, + "step": 10564 + }, + { + "epoch": 0.853754621305481, + "grad_norm": 2.2608299255371094, + "learning_rate": 6.40754972906605e-06, + "loss": 0.9605, + "step": 10565 + }, + { + "epoch": 0.8538354310188084, + "grad_norm": 2.999452590942383, + "learning_rate": 6.4069218183807605e-06, + "loss": 0.9088, + "step": 10566 + }, + { + "epoch": 0.853916240732136, + "grad_norm": 2.4732658863067627, + "learning_rate": 6.406293883598285e-06, + "loss": 0.8978, + "step": 10567 + }, + { + "epoch": 0.8539970504454636, + "grad_norm": 2.768444776535034, + "learning_rate": 6.405665924729382e-06, + "loss": 1.0506, + "step": 10568 + }, + { + "epoch": 0.8540778601587911, + "grad_norm": 2.745413064956665, + "learning_rate": 6.405037941784805e-06, + "loss": 0.8907, + "step": 10569 + }, + { + "epoch": 0.8541586698721186, + "grad_norm": 2.9249978065490723, + "learning_rate": 6.404409934775311e-06, + "loss": 0.9404, + "step": 10570 + }, + { + "epoch": 0.8542394795854462, + "grad_norm": 3.0861124992370605, + "learning_rate": 6.4037819037116564e-06, + "loss": 1.026, + "step": 10571 + }, + { + "epoch": 0.8543202892987737, + "grad_norm": 2.9700753688812256, + "learning_rate": 6.4031538486045954e-06, + "loss": 0.9146, + "step": 10572 + }, + { + "epoch": 0.8544010990121013, + "grad_norm": 2.7021214962005615, + "learning_rate": 6.402525769464889e-06, + "loss": 0.8618, + "step": 10573 + }, + { + "epoch": 0.8544819087254288, + "grad_norm": 2.956753730773926, + "learning_rate": 6.401897666303291e-06, + "loss": 1.0302, + "step": 10574 + }, + { + "epoch": 0.8545627184387563, + "grad_norm": 3.2103488445281982, + "learning_rate": 6.401269539130562e-06, + "loss": 0.929, + "step": 10575 + }, + { + "epoch": 0.8546435281520839, + "grad_norm": 2.6895854473114014, + "learning_rate": 6.4006413879574594e-06, + "loss": 0.9415, + "step": 10576 + }, + { + "epoch": 0.8547243378654115, + "grad_norm": 2.826874256134033, + "learning_rate": 6.400013212794741e-06, + "loss": 0.9348, + "step": 10577 + }, + { + "epoch": 0.8548051475787389, + "grad_norm": 2.940903902053833, + "learning_rate": 6.399385013653166e-06, + "loss": 0.8599, + "step": 10578 + }, + { + "epoch": 0.8548859572920665, + "grad_norm": 2.674924373626709, + "learning_rate": 6.398756790543498e-06, + "loss": 0.9335, + "step": 10579 + }, + { + "epoch": 0.8549667670053941, + "grad_norm": 2.5626094341278076, + "learning_rate": 6.39812854347649e-06, + "loss": 0.8634, + "step": 10580 + }, + { + "epoch": 0.8550475767187216, + "grad_norm": 2.4545671939849854, + "learning_rate": 6.397500272462906e-06, + "loss": 0.8704, + "step": 10581 + }, + { + "epoch": 0.8551283864320491, + "grad_norm": 3.2266173362731934, + "learning_rate": 6.396871977513508e-06, + "loss": 0.9394, + "step": 10582 + }, + { + "epoch": 0.8552091961453767, + "grad_norm": 2.3609201908111572, + "learning_rate": 6.396243658639056e-06, + "loss": 0.92, + "step": 10583 + }, + { + "epoch": 0.8552900058587042, + "grad_norm": 2.5651919841766357, + "learning_rate": 6.395615315850311e-06, + "loss": 1.0117, + "step": 10584 + }, + { + "epoch": 0.8553708155720318, + "grad_norm": 2.9391074180603027, + "learning_rate": 6.394986949158037e-06, + "loss": 0.9646, + "step": 10585 + }, + { + "epoch": 0.8554516252853593, + "grad_norm": 2.545461654663086, + "learning_rate": 6.394358558572991e-06, + "loss": 0.9538, + "step": 10586 + }, + { + "epoch": 0.8555324349986868, + "grad_norm": 2.345226287841797, + "learning_rate": 6.393730144105943e-06, + "loss": 0.9813, + "step": 10587 + }, + { + "epoch": 0.8556132447120144, + "grad_norm": 2.9139716625213623, + "learning_rate": 6.39310170576765e-06, + "loss": 0.7992, + "step": 10588 + }, + { + "epoch": 0.855694054425342, + "grad_norm": 2.6127424240112305, + "learning_rate": 6.3924732435688815e-06, + "loss": 0.9688, + "step": 10589 + }, + { + "epoch": 0.8557748641386694, + "grad_norm": 2.747901201248169, + "learning_rate": 6.3918447575203975e-06, + "loss": 0.925, + "step": 10590 + }, + { + "epoch": 0.855855673851997, + "grad_norm": 2.4753477573394775, + "learning_rate": 6.391216247632963e-06, + "loss": 0.8328, + "step": 10591 + }, + { + "epoch": 0.8559364835653246, + "grad_norm": 2.527433156967163, + "learning_rate": 6.390587713917344e-06, + "loss": 0.9202, + "step": 10592 + }, + { + "epoch": 0.8560172932786521, + "grad_norm": 3.302743673324585, + "learning_rate": 6.389959156384307e-06, + "loss": 0.8891, + "step": 10593 + }, + { + "epoch": 0.8560981029919796, + "grad_norm": 2.222712993621826, + "learning_rate": 6.389330575044612e-06, + "loss": 0.9005, + "step": 10594 + }, + { + "epoch": 0.8561789127053072, + "grad_norm": 2.5155506134033203, + "learning_rate": 6.388701969909032e-06, + "loss": 1.0071, + "step": 10595 + }, + { + "epoch": 0.8562597224186347, + "grad_norm": 2.667337417602539, + "learning_rate": 6.388073340988329e-06, + "loss": 0.9755, + "step": 10596 + }, + { + "epoch": 0.8563405321319623, + "grad_norm": 2.794900894165039, + "learning_rate": 6.387444688293272e-06, + "loss": 0.9474, + "step": 10597 + }, + { + "epoch": 0.8564213418452898, + "grad_norm": 2.404810905456543, + "learning_rate": 6.386816011834627e-06, + "loss": 0.9183, + "step": 10598 + }, + { + "epoch": 0.8565021515586173, + "grad_norm": 2.6825006008148193, + "learning_rate": 6.386187311623162e-06, + "loss": 0.9499, + "step": 10599 + }, + { + "epoch": 0.8565829612719449, + "grad_norm": 2.584521532058716, + "learning_rate": 6.385558587669646e-06, + "loss": 0.9438, + "step": 10600 + }, + { + "epoch": 0.8566637709852725, + "grad_norm": 2.64414644241333, + "learning_rate": 6.384929839984847e-06, + "loss": 0.9696, + "step": 10601 + }, + { + "epoch": 0.8567445806985999, + "grad_norm": 2.6944527626037598, + "learning_rate": 6.384301068579532e-06, + "loss": 0.8774, + "step": 10602 + }, + { + "epoch": 0.8568253904119275, + "grad_norm": 2.4189651012420654, + "learning_rate": 6.3836722734644765e-06, + "loss": 1.0631, + "step": 10603 + }, + { + "epoch": 0.8569062001252551, + "grad_norm": 2.6345582008361816, + "learning_rate": 6.3830434546504425e-06, + "loss": 0.9469, + "step": 10604 + }, + { + "epoch": 0.8569870098385826, + "grad_norm": 2.7669363021850586, + "learning_rate": 6.3824146121482055e-06, + "loss": 0.9544, + "step": 10605 + }, + { + "epoch": 0.8570678195519101, + "grad_norm": 2.6450963020324707, + "learning_rate": 6.381785745968533e-06, + "loss": 1.1208, + "step": 10606 + }, + { + "epoch": 0.8571486292652377, + "grad_norm": 2.425590991973877, + "learning_rate": 6.381156856122199e-06, + "loss": 0.9461, + "step": 10607 + }, + { + "epoch": 0.8572294389785652, + "grad_norm": 2.8030965328216553, + "learning_rate": 6.380527942619971e-06, + "loss": 0.9005, + "step": 10608 + }, + { + "epoch": 0.8573102486918928, + "grad_norm": 2.6762855052948, + "learning_rate": 6.379899005472624e-06, + "loss": 0.978, + "step": 10609 + }, + { + "epoch": 0.8573910584052203, + "grad_norm": 2.991990566253662, + "learning_rate": 6.379270044690928e-06, + "loss": 0.9275, + "step": 10610 + }, + { + "epoch": 0.8574718681185478, + "grad_norm": 2.6101531982421875, + "learning_rate": 6.378641060285657e-06, + "loss": 0.9789, + "step": 10611 + }, + { + "epoch": 0.8575526778318754, + "grad_norm": 2.340989828109741, + "learning_rate": 6.3780120522675835e-06, + "loss": 0.8883, + "step": 10612 + }, + { + "epoch": 0.857633487545203, + "grad_norm": 3.0196053981781006, + "learning_rate": 6.377383020647483e-06, + "loss": 0.8825, + "step": 10613 + }, + { + "epoch": 0.8577142972585304, + "grad_norm": 2.589050054550171, + "learning_rate": 6.376753965436124e-06, + "loss": 0.9256, + "step": 10614 + }, + { + "epoch": 0.857795106971858, + "grad_norm": 2.5810017585754395, + "learning_rate": 6.376124886644286e-06, + "loss": 0.9272, + "step": 10615 + }, + { + "epoch": 0.8578759166851856, + "grad_norm": 2.5934975147247314, + "learning_rate": 6.375495784282741e-06, + "loss": 1.0079, + "step": 10616 + }, + { + "epoch": 0.8579567263985131, + "grad_norm": 2.4420084953308105, + "learning_rate": 6.3748666583622655e-06, + "loss": 0.9891, + "step": 10617 + }, + { + "epoch": 0.8580375361118406, + "grad_norm": 2.4460995197296143, + "learning_rate": 6.3742375088936325e-06, + "loss": 0.9663, + "step": 10618 + }, + { + "epoch": 0.8581183458251682, + "grad_norm": 2.510251760482788, + "learning_rate": 6.37360833588762e-06, + "loss": 0.903, + "step": 10619 + }, + { + "epoch": 0.8581991555384957, + "grad_norm": 2.613154888153076, + "learning_rate": 6.372979139355003e-06, + "loss": 0.8368, + "step": 10620 + }, + { + "epoch": 0.8582799652518233, + "grad_norm": 2.815469980239868, + "learning_rate": 6.372349919306559e-06, + "loss": 0.9071, + "step": 10621 + }, + { + "epoch": 0.8583607749651508, + "grad_norm": 3.0432651042938232, + "learning_rate": 6.371720675753065e-06, + "loss": 0.909, + "step": 10622 + }, + { + "epoch": 0.8584415846784783, + "grad_norm": 3.003157138824463, + "learning_rate": 6.371091408705299e-06, + "loss": 0.9256, + "step": 10623 + }, + { + "epoch": 0.8585223943918059, + "grad_norm": 2.535301923751831, + "learning_rate": 6.370462118174037e-06, + "loss": 1.048, + "step": 10624 + }, + { + "epoch": 0.8586032041051335, + "grad_norm": 2.4820950031280518, + "learning_rate": 6.3698328041700575e-06, + "loss": 1.0474, + "step": 10625 + }, + { + "epoch": 0.8586840138184609, + "grad_norm": 3.022475242614746, + "learning_rate": 6.369203466704141e-06, + "loss": 0.943, + "step": 10626 + }, + { + "epoch": 0.8587648235317885, + "grad_norm": 2.51517915725708, + "learning_rate": 6.368574105787065e-06, + "loss": 0.868, + "step": 10627 + }, + { + "epoch": 0.8588456332451161, + "grad_norm": 2.576625347137451, + "learning_rate": 6.367944721429608e-06, + "loss": 0.9348, + "step": 10628 + }, + { + "epoch": 0.8589264429584436, + "grad_norm": 2.760350227355957, + "learning_rate": 6.3673153136425515e-06, + "loss": 0.9838, + "step": 10629 + }, + { + "epoch": 0.8590072526717711, + "grad_norm": 2.6769909858703613, + "learning_rate": 6.366685882436676e-06, + "loss": 0.8802, + "step": 10630 + }, + { + "epoch": 0.8590880623850987, + "grad_norm": 2.7748525142669678, + "learning_rate": 6.366056427822761e-06, + "loss": 0.832, + "step": 10631 + }, + { + "epoch": 0.8591688720984262, + "grad_norm": 3.260263681411743, + "learning_rate": 6.365426949811589e-06, + "loss": 1.0036, + "step": 10632 + }, + { + "epoch": 0.8592496818117538, + "grad_norm": 2.397263526916504, + "learning_rate": 6.3647974484139404e-06, + "loss": 0.8667, + "step": 10633 + }, + { + "epoch": 0.8593304915250813, + "grad_norm": 2.635941743850708, + "learning_rate": 6.364167923640596e-06, + "loss": 0.9143, + "step": 10634 + }, + { + "epoch": 0.8594113012384088, + "grad_norm": 2.3416852951049805, + "learning_rate": 6.363538375502341e-06, + "loss": 0.9969, + "step": 10635 + }, + { + "epoch": 0.8594921109517364, + "grad_norm": 2.5194973945617676, + "learning_rate": 6.362908804009954e-06, + "loss": 0.7951, + "step": 10636 + }, + { + "epoch": 0.859572920665064, + "grad_norm": 2.572483777999878, + "learning_rate": 6.3622792091742215e-06, + "loss": 0.937, + "step": 10637 + }, + { + "epoch": 0.8596537303783914, + "grad_norm": 2.6123387813568115, + "learning_rate": 6.361649591005926e-06, + "loss": 1.0149, + "step": 10638 + }, + { + "epoch": 0.859734540091719, + "grad_norm": 2.845313787460327, + "learning_rate": 6.36101994951585e-06, + "loss": 0.8273, + "step": 10639 + }, + { + "epoch": 0.8598153498050466, + "grad_norm": 2.2015128135681152, + "learning_rate": 6.360390284714781e-06, + "loss": 0.9506, + "step": 10640 + }, + { + "epoch": 0.8598961595183741, + "grad_norm": 2.50612473487854, + "learning_rate": 6.359760596613499e-06, + "loss": 0.8745, + "step": 10641 + }, + { + "epoch": 0.8599769692317016, + "grad_norm": 2.620586395263672, + "learning_rate": 6.359130885222792e-06, + "loss": 0.8983, + "step": 10642 + }, + { + "epoch": 0.8600577789450292, + "grad_norm": 2.8248183727264404, + "learning_rate": 6.358501150553444e-06, + "loss": 0.9756, + "step": 10643 + }, + { + "epoch": 0.8601385886583567, + "grad_norm": 2.6578304767608643, + "learning_rate": 6.357871392616244e-06, + "loss": 0.9275, + "step": 10644 + }, + { + "epoch": 0.8602193983716843, + "grad_norm": 2.0792160034179688, + "learning_rate": 6.357241611421975e-06, + "loss": 0.9592, + "step": 10645 + }, + { + "epoch": 0.8603002080850118, + "grad_norm": 2.506110429763794, + "learning_rate": 6.356611806981425e-06, + "loss": 0.7845, + "step": 10646 + }, + { + "epoch": 0.8603810177983393, + "grad_norm": 2.1438324451446533, + "learning_rate": 6.355981979305379e-06, + "loss": 1.0082, + "step": 10647 + }, + { + "epoch": 0.8604618275116669, + "grad_norm": 2.546485185623169, + "learning_rate": 6.355352128404629e-06, + "loss": 1.0375, + "step": 10648 + }, + { + "epoch": 0.8605426372249945, + "grad_norm": 2.4790470600128174, + "learning_rate": 6.354722254289958e-06, + "loss": 0.8797, + "step": 10649 + }, + { + "epoch": 0.8606234469383219, + "grad_norm": 2.379716157913208, + "learning_rate": 6.3540923569721555e-06, + "loss": 0.9556, + "step": 10650 + }, + { + "epoch": 0.8607042566516495, + "grad_norm": 2.492772340774536, + "learning_rate": 6.3534624364620115e-06, + "loss": 1.019, + "step": 10651 + }, + { + "epoch": 0.8607850663649771, + "grad_norm": 2.5661613941192627, + "learning_rate": 6.352832492770316e-06, + "loss": 0.9823, + "step": 10652 + }, + { + "epoch": 0.8608658760783046, + "grad_norm": 2.65501070022583, + "learning_rate": 6.352202525907854e-06, + "loss": 0.8606, + "step": 10653 + }, + { + "epoch": 0.8609466857916321, + "grad_norm": 2.75138783454895, + "learning_rate": 6.35157253588542e-06, + "loss": 0.8956, + "step": 10654 + }, + { + "epoch": 0.8610274955049597, + "grad_norm": 2.963960886001587, + "learning_rate": 6.350942522713799e-06, + "loss": 1.0333, + "step": 10655 + }, + { + "epoch": 0.8611083052182872, + "grad_norm": 2.7790400981903076, + "learning_rate": 6.350312486403789e-06, + "loss": 0.9164, + "step": 10656 + }, + { + "epoch": 0.8611891149316148, + "grad_norm": 2.602954626083374, + "learning_rate": 6.349682426966175e-06, + "loss": 0.7521, + "step": 10657 + }, + { + "epoch": 0.8612699246449423, + "grad_norm": 2.724419116973877, + "learning_rate": 6.349052344411749e-06, + "loss": 1.0504, + "step": 10658 + }, + { + "epoch": 0.8613507343582698, + "grad_norm": 2.1897552013397217, + "learning_rate": 6.348422238751306e-06, + "loss": 1.0841, + "step": 10659 + }, + { + "epoch": 0.8614315440715974, + "grad_norm": 2.358163595199585, + "learning_rate": 6.347792109995636e-06, + "loss": 1.062, + "step": 10660 + }, + { + "epoch": 0.861512353784925, + "grad_norm": 2.888578176498413, + "learning_rate": 6.34716195815553e-06, + "loss": 0.9526, + "step": 10661 + }, + { + "epoch": 0.8615931634982524, + "grad_norm": 2.2651138305664062, + "learning_rate": 6.346531783241786e-06, + "loss": 0.8792, + "step": 10662 + }, + { + "epoch": 0.86167397321158, + "grad_norm": 2.759732246398926, + "learning_rate": 6.3459015852651914e-06, + "loss": 0.7242, + "step": 10663 + }, + { + "epoch": 0.8617547829249076, + "grad_norm": 2.562382221221924, + "learning_rate": 6.345271364236545e-06, + "loss": 0.9803, + "step": 10664 + }, + { + "epoch": 0.8618355926382352, + "grad_norm": 2.7765278816223145, + "learning_rate": 6.344641120166637e-06, + "loss": 0.9276, + "step": 10665 + }, + { + "epoch": 0.8619164023515626, + "grad_norm": 2.386147975921631, + "learning_rate": 6.344010853066265e-06, + "loss": 0.9007, + "step": 10666 + }, + { + "epoch": 0.8619972120648902, + "grad_norm": 2.420410394668579, + "learning_rate": 6.343380562946221e-06, + "loss": 0.8955, + "step": 10667 + }, + { + "epoch": 0.8620780217782178, + "grad_norm": 2.726506471633911, + "learning_rate": 6.342750249817303e-06, + "loss": 0.8754, + "step": 10668 + }, + { + "epoch": 0.8621588314915453, + "grad_norm": 2.5447473526000977, + "learning_rate": 6.342119913690306e-06, + "loss": 0.8546, + "step": 10669 + }, + { + "epoch": 0.8622396412048728, + "grad_norm": 2.5017807483673096, + "learning_rate": 6.341489554576026e-06, + "loss": 0.8573, + "step": 10670 + }, + { + "epoch": 0.8623204509182004, + "grad_norm": 2.4028189182281494, + "learning_rate": 6.340859172485259e-06, + "loss": 0.8024, + "step": 10671 + }, + { + "epoch": 0.8624012606315279, + "grad_norm": 2.5131757259368896, + "learning_rate": 6.3402287674288025e-06, + "loss": 0.9333, + "step": 10672 + }, + { + "epoch": 0.8624820703448555, + "grad_norm": 2.429248809814453, + "learning_rate": 6.339598339417452e-06, + "loss": 0.9257, + "step": 10673 + }, + { + "epoch": 0.862562880058183, + "grad_norm": 2.963897228240967, + "learning_rate": 6.33896788846201e-06, + "loss": 0.8952, + "step": 10674 + }, + { + "epoch": 0.8626436897715105, + "grad_norm": 2.5240468978881836, + "learning_rate": 6.338337414573269e-06, + "loss": 0.8979, + "step": 10675 + }, + { + "epoch": 0.8627244994848381, + "grad_norm": 2.6063954830169678, + "learning_rate": 6.337706917762032e-06, + "loss": 0.9515, + "step": 10676 + }, + { + "epoch": 0.8628053091981657, + "grad_norm": 2.316326379776001, + "learning_rate": 6.337076398039095e-06, + "loss": 0.9365, + "step": 10677 + }, + { + "epoch": 0.8628861189114931, + "grad_norm": 2.583449125289917, + "learning_rate": 6.336445855415257e-06, + "loss": 0.7777, + "step": 10678 + }, + { + "epoch": 0.8629669286248207, + "grad_norm": 2.994060516357422, + "learning_rate": 6.33581528990132e-06, + "loss": 0.9925, + "step": 10679 + }, + { + "epoch": 0.8630477383381483, + "grad_norm": 2.4650230407714844, + "learning_rate": 6.335184701508084e-06, + "loss": 0.959, + "step": 10680 + }, + { + "epoch": 0.8631285480514758, + "grad_norm": 2.5867812633514404, + "learning_rate": 6.3345540902463456e-06, + "loss": 0.8716, + "step": 10681 + }, + { + "epoch": 0.8632093577648033, + "grad_norm": 2.8306491374969482, + "learning_rate": 6.333923456126912e-06, + "loss": 0.9326, + "step": 10682 + }, + { + "epoch": 0.8632901674781309, + "grad_norm": 2.8601202964782715, + "learning_rate": 6.333292799160578e-06, + "loss": 0.9888, + "step": 10683 + }, + { + "epoch": 0.8633709771914584, + "grad_norm": 2.715182065963745, + "learning_rate": 6.332662119358149e-06, + "loss": 0.8933, + "step": 10684 + }, + { + "epoch": 0.863451786904786, + "grad_norm": 2.2428197860717773, + "learning_rate": 6.332031416730426e-06, + "loss": 0.9219, + "step": 10685 + }, + { + "epoch": 0.8635325966181135, + "grad_norm": 2.675428628921509, + "learning_rate": 6.331400691288212e-06, + "loss": 0.9913, + "step": 10686 + }, + { + "epoch": 0.863613406331441, + "grad_norm": 2.7280433177948, + "learning_rate": 6.33076994304231e-06, + "loss": 1.0188, + "step": 10687 + }, + { + "epoch": 0.8636942160447686, + "grad_norm": 2.759230136871338, + "learning_rate": 6.330139172003521e-06, + "loss": 0.8432, + "step": 10688 + }, + { + "epoch": 0.8637750257580962, + "grad_norm": 2.314729928970337, + "learning_rate": 6.329508378182651e-06, + "loss": 0.9584, + "step": 10689 + }, + { + "epoch": 0.8638558354714236, + "grad_norm": 2.8474714756011963, + "learning_rate": 6.328877561590503e-06, + "loss": 0.9268, + "step": 10690 + }, + { + "epoch": 0.8639366451847512, + "grad_norm": 2.595289707183838, + "learning_rate": 6.328246722237882e-06, + "loss": 0.9734, + "step": 10691 + }, + { + "epoch": 0.8640174548980788, + "grad_norm": 2.816744089126587, + "learning_rate": 6.327615860135592e-06, + "loss": 1.0148, + "step": 10692 + }, + { + "epoch": 0.8640982646114063, + "grad_norm": 3.141812801361084, + "learning_rate": 6.326984975294438e-06, + "loss": 1.0503, + "step": 10693 + }, + { + "epoch": 0.8641790743247338, + "grad_norm": 2.314528226852417, + "learning_rate": 6.326354067725226e-06, + "loss": 0.9588, + "step": 10694 + }, + { + "epoch": 0.8642598840380614, + "grad_norm": 2.6816062927246094, + "learning_rate": 6.325723137438762e-06, + "loss": 0.8807, + "step": 10695 + }, + { + "epoch": 0.8643406937513889, + "grad_norm": 2.7583794593811035, + "learning_rate": 6.325092184445852e-06, + "loss": 0.9194, + "step": 10696 + }, + { + "epoch": 0.8644215034647165, + "grad_norm": 2.718410015106201, + "learning_rate": 6.3244612087573034e-06, + "loss": 0.9402, + "step": 10697 + }, + { + "epoch": 0.864502313178044, + "grad_norm": 2.365786552429199, + "learning_rate": 6.323830210383922e-06, + "loss": 0.8833, + "step": 10698 + }, + { + "epoch": 0.8645831228913715, + "grad_norm": 2.589773416519165, + "learning_rate": 6.323199189336517e-06, + "loss": 0.9362, + "step": 10699 + }, + { + "epoch": 0.8646639326046991, + "grad_norm": 2.79266619682312, + "learning_rate": 6.322568145625896e-06, + "loss": 0.953, + "step": 10700 + }, + { + "epoch": 0.8647447423180267, + "grad_norm": 2.7299811840057373, + "learning_rate": 6.321937079262866e-06, + "loss": 0.8029, + "step": 10701 + }, + { + "epoch": 0.8648255520313541, + "grad_norm": 2.4152779579162598, + "learning_rate": 6.321305990258235e-06, + "loss": 1.0027, + "step": 10702 + }, + { + "epoch": 0.8649063617446817, + "grad_norm": 3.3846495151519775, + "learning_rate": 6.320674878622815e-06, + "loss": 0.8574, + "step": 10703 + }, + { + "epoch": 0.8649871714580093, + "grad_norm": 2.5887768268585205, + "learning_rate": 6.320043744367412e-06, + "loss": 0.9724, + "step": 10704 + }, + { + "epoch": 0.8650679811713368, + "grad_norm": 2.4225897789001465, + "learning_rate": 6.31941258750284e-06, + "loss": 0.8693, + "step": 10705 + }, + { + "epoch": 0.8651487908846643, + "grad_norm": 2.640592575073242, + "learning_rate": 6.318781408039904e-06, + "loss": 1.0469, + "step": 10706 + }, + { + "epoch": 0.8652296005979919, + "grad_norm": 2.973048210144043, + "learning_rate": 6.31815020598942e-06, + "loss": 1.0066, + "step": 10707 + }, + { + "epoch": 0.8653104103113194, + "grad_norm": 2.6845743656158447, + "learning_rate": 6.317518981362194e-06, + "loss": 0.9655, + "step": 10708 + }, + { + "epoch": 0.865391220024647, + "grad_norm": 2.4086756706237793, + "learning_rate": 6.31688773416904e-06, + "loss": 0.9585, + "step": 10709 + }, + { + "epoch": 0.8654720297379745, + "grad_norm": 2.196768283843994, + "learning_rate": 6.31625646442077e-06, + "loss": 0.9601, + "step": 10710 + }, + { + "epoch": 0.865552839451302, + "grad_norm": 3.2099034786224365, + "learning_rate": 6.315625172128195e-06, + "loss": 0.9698, + "step": 10711 + }, + { + "epoch": 0.8656336491646296, + "grad_norm": 2.5552661418914795, + "learning_rate": 6.314993857302129e-06, + "loss": 1.1014, + "step": 10712 + }, + { + "epoch": 0.8657144588779572, + "grad_norm": 3.1415045261383057, + "learning_rate": 6.314362519953384e-06, + "loss": 0.9447, + "step": 10713 + }, + { + "epoch": 0.8657952685912846, + "grad_norm": 2.824753522872925, + "learning_rate": 6.313731160092771e-06, + "loss": 0.9919, + "step": 10714 + }, + { + "epoch": 0.8658760783046122, + "grad_norm": 2.9019217491149902, + "learning_rate": 6.313099777731109e-06, + "loss": 0.9108, + "step": 10715 + }, + { + "epoch": 0.8659568880179398, + "grad_norm": 2.5668866634368896, + "learning_rate": 6.312468372879207e-06, + "loss": 0.9482, + "step": 10716 + }, + { + "epoch": 0.8660376977312673, + "grad_norm": 2.8544135093688965, + "learning_rate": 6.311836945547882e-06, + "loss": 1.0704, + "step": 10717 + }, + { + "epoch": 0.8661185074445948, + "grad_norm": 2.715951442718506, + "learning_rate": 6.311205495747947e-06, + "loss": 0.9377, + "step": 10718 + }, + { + "epoch": 0.8661993171579224, + "grad_norm": 2.9735267162323, + "learning_rate": 6.310574023490222e-06, + "loss": 0.8002, + "step": 10719 + }, + { + "epoch": 0.8662801268712499, + "grad_norm": 2.4859063625335693, + "learning_rate": 6.309942528785515e-06, + "loss": 0.9082, + "step": 10720 + }, + { + "epoch": 0.8663609365845775, + "grad_norm": 2.8067026138305664, + "learning_rate": 6.309311011644649e-06, + "loss": 0.9845, + "step": 10721 + }, + { + "epoch": 0.866441746297905, + "grad_norm": 2.8802614212036133, + "learning_rate": 6.3086794720784364e-06, + "loss": 1.081, + "step": 10722 + }, + { + "epoch": 0.8665225560112325, + "grad_norm": 2.5529253482818604, + "learning_rate": 6.308047910097694e-06, + "loss": 0.8574, + "step": 10723 + }, + { + "epoch": 0.8666033657245601, + "grad_norm": 2.744807481765747, + "learning_rate": 6.3074163257132405e-06, + "loss": 0.8429, + "step": 10724 + }, + { + "epoch": 0.8666841754378877, + "grad_norm": 2.576052188873291, + "learning_rate": 6.306784718935892e-06, + "loss": 0.9237, + "step": 10725 + }, + { + "epoch": 0.8667649851512151, + "grad_norm": 2.367292881011963, + "learning_rate": 6.306153089776468e-06, + "loss": 0.9179, + "step": 10726 + }, + { + "epoch": 0.8668457948645427, + "grad_norm": 3.018024444580078, + "learning_rate": 6.305521438245788e-06, + "loss": 0.9679, + "step": 10727 + }, + { + "epoch": 0.8669266045778703, + "grad_norm": 2.719452142715454, + "learning_rate": 6.304889764354665e-06, + "loss": 0.782, + "step": 10728 + }, + { + "epoch": 0.8670074142911978, + "grad_norm": 2.57442045211792, + "learning_rate": 6.304258068113924e-06, + "loss": 0.9226, + "step": 10729 + }, + { + "epoch": 0.8670882240045253, + "grad_norm": 2.4634058475494385, + "learning_rate": 6.303626349534382e-06, + "loss": 0.86, + "step": 10730 + }, + { + "epoch": 0.8671690337178529, + "grad_norm": 2.393235921859741, + "learning_rate": 6.302994608626859e-06, + "loss": 0.9199, + "step": 10731 + }, + { + "epoch": 0.8672498434311804, + "grad_norm": 2.4220023155212402, + "learning_rate": 6.3023628454021744e-06, + "loss": 1.0465, + "step": 10732 + }, + { + "epoch": 0.867330653144508, + "grad_norm": 3.016916036605835, + "learning_rate": 6.301731059871151e-06, + "loss": 0.8538, + "step": 10733 + }, + { + "epoch": 0.8674114628578355, + "grad_norm": 2.7756950855255127, + "learning_rate": 6.3010992520446055e-06, + "loss": 0.8465, + "step": 10734 + }, + { + "epoch": 0.867492272571163, + "grad_norm": 2.587876319885254, + "learning_rate": 6.300467421933365e-06, + "loss": 0.8998, + "step": 10735 + }, + { + "epoch": 0.8675730822844906, + "grad_norm": 2.5709636211395264, + "learning_rate": 6.299835569548247e-06, + "loss": 0.8722, + "step": 10736 + }, + { + "epoch": 0.8676538919978182, + "grad_norm": 3.024399757385254, + "learning_rate": 6.299203694900076e-06, + "loss": 0.9851, + "step": 10737 + }, + { + "epoch": 0.8677347017111456, + "grad_norm": 2.966047763824463, + "learning_rate": 6.298571797999672e-06, + "loss": 0.8631, + "step": 10738 + }, + { + "epoch": 0.8678155114244732, + "grad_norm": 2.9161622524261475, + "learning_rate": 6.297939878857859e-06, + "loss": 0.846, + "step": 10739 + }, + { + "epoch": 0.8678963211378008, + "grad_norm": 2.444450616836548, + "learning_rate": 6.297307937485462e-06, + "loss": 0.8635, + "step": 10740 + }, + { + "epoch": 0.8679771308511283, + "grad_norm": 2.7127749919891357, + "learning_rate": 6.296675973893304e-06, + "loss": 0.8632, + "step": 10741 + }, + { + "epoch": 0.8680579405644558, + "grad_norm": 2.566418409347534, + "learning_rate": 6.296043988092205e-06, + "loss": 0.8537, + "step": 10742 + }, + { + "epoch": 0.8681387502777834, + "grad_norm": 2.485527753829956, + "learning_rate": 6.2954119800929955e-06, + "loss": 0.9469, + "step": 10743 + }, + { + "epoch": 0.8682195599911109, + "grad_norm": 2.256040573120117, + "learning_rate": 6.294779949906496e-06, + "loss": 0.969, + "step": 10744 + }, + { + "epoch": 0.8683003697044385, + "grad_norm": 2.8519866466522217, + "learning_rate": 6.2941478975435334e-06, + "loss": 1.0519, + "step": 10745 + }, + { + "epoch": 0.868381179417766, + "grad_norm": 3.0991079807281494, + "learning_rate": 6.293515823014933e-06, + "loss": 1.1046, + "step": 10746 + }, + { + "epoch": 0.8684619891310935, + "grad_norm": 2.535867691040039, + "learning_rate": 6.29288372633152e-06, + "loss": 0.8546, + "step": 10747 + }, + { + "epoch": 0.8685427988444211, + "grad_norm": 2.7065234184265137, + "learning_rate": 6.29225160750412e-06, + "loss": 0.8375, + "step": 10748 + }, + { + "epoch": 0.8686236085577487, + "grad_norm": 2.5868163108825684, + "learning_rate": 6.291619466543564e-06, + "loss": 0.8276, + "step": 10749 + }, + { + "epoch": 0.8687044182710761, + "grad_norm": 2.6736459732055664, + "learning_rate": 6.290987303460674e-06, + "loss": 0.9697, + "step": 10750 + }, + { + "epoch": 0.8687852279844037, + "grad_norm": 2.3866159915924072, + "learning_rate": 6.29035511826628e-06, + "loss": 0.9777, + "step": 10751 + }, + { + "epoch": 0.8688660376977313, + "grad_norm": 2.549373149871826, + "learning_rate": 6.289722910971208e-06, + "loss": 0.7991, + "step": 10752 + }, + { + "epoch": 0.8689468474110588, + "grad_norm": 2.6806037425994873, + "learning_rate": 6.289090681586289e-06, + "loss": 0.9878, + "step": 10753 + }, + { + "epoch": 0.8690276571243863, + "grad_norm": 3.136216640472412, + "learning_rate": 6.28845843012235e-06, + "loss": 0.8673, + "step": 10754 + }, + { + "epoch": 0.8691084668377139, + "grad_norm": 2.7405874729156494, + "learning_rate": 6.287826156590219e-06, + "loss": 0.9072, + "step": 10755 + }, + { + "epoch": 0.8691892765510414, + "grad_norm": 3.0692496299743652, + "learning_rate": 6.287193861000727e-06, + "loss": 0.8264, + "step": 10756 + }, + { + "epoch": 0.869270086264369, + "grad_norm": 2.371486186981201, + "learning_rate": 6.286561543364703e-06, + "loss": 0.9797, + "step": 10757 + }, + { + "epoch": 0.8693508959776965, + "grad_norm": 2.7062017917633057, + "learning_rate": 6.285929203692977e-06, + "loss": 0.9089, + "step": 10758 + }, + { + "epoch": 0.869431705691024, + "grad_norm": 2.3986001014709473, + "learning_rate": 6.285296841996378e-06, + "loss": 0.9209, + "step": 10759 + }, + { + "epoch": 0.8695125154043516, + "grad_norm": 2.299267530441284, + "learning_rate": 6.2846644582857396e-06, + "loss": 0.9217, + "step": 10760 + }, + { + "epoch": 0.8695933251176792, + "grad_norm": 2.544961929321289, + "learning_rate": 6.284032052571891e-06, + "loss": 1.0939, + "step": 10761 + }, + { + "epoch": 0.8696741348310066, + "grad_norm": 2.4752156734466553, + "learning_rate": 6.283399624865666e-06, + "loss": 0.9844, + "step": 10762 + }, + { + "epoch": 0.8697549445443342, + "grad_norm": 2.646388530731201, + "learning_rate": 6.282767175177893e-06, + "loss": 0.8684, + "step": 10763 + }, + { + "epoch": 0.8698357542576618, + "grad_norm": 3.183969259262085, + "learning_rate": 6.282134703519408e-06, + "loss": 0.8831, + "step": 10764 + }, + { + "epoch": 0.8699165639709893, + "grad_norm": 2.540184497833252, + "learning_rate": 6.281502209901043e-06, + "loss": 0.8108, + "step": 10765 + }, + { + "epoch": 0.8699973736843168, + "grad_norm": 2.5926268100738525, + "learning_rate": 6.280869694333629e-06, + "loss": 0.9253, + "step": 10766 + }, + { + "epoch": 0.8700781833976444, + "grad_norm": 2.694242000579834, + "learning_rate": 6.280237156827999e-06, + "loss": 0.8878, + "step": 10767 + }, + { + "epoch": 0.8701589931109719, + "grad_norm": 2.509291410446167, + "learning_rate": 6.279604597394991e-06, + "loss": 0.8167, + "step": 10768 + }, + { + "epoch": 0.8702398028242995, + "grad_norm": 2.694880962371826, + "learning_rate": 6.278972016045436e-06, + "loss": 0.9564, + "step": 10769 + }, + { + "epoch": 0.870320612537627, + "grad_norm": 2.6930549144744873, + "learning_rate": 6.278339412790169e-06, + "loss": 1.0387, + "step": 10770 + }, + { + "epoch": 0.8704014222509545, + "grad_norm": 2.729775905609131, + "learning_rate": 6.277706787640025e-06, + "loss": 0.7416, + "step": 10771 + }, + { + "epoch": 0.8704822319642821, + "grad_norm": 2.8160128593444824, + "learning_rate": 6.277074140605842e-06, + "loss": 1.0056, + "step": 10772 + }, + { + "epoch": 0.8705630416776097, + "grad_norm": 2.5572054386138916, + "learning_rate": 6.2764414716984514e-06, + "loss": 0.9884, + "step": 10773 + }, + { + "epoch": 0.8706438513909371, + "grad_norm": 2.755920886993408, + "learning_rate": 6.275808780928691e-06, + "loss": 1.0284, + "step": 10774 + }, + { + "epoch": 0.8707246611042647, + "grad_norm": 2.579279661178589, + "learning_rate": 6.275176068307399e-06, + "loss": 0.8889, + "step": 10775 + }, + { + "epoch": 0.8708054708175923, + "grad_norm": 2.5251824855804443, + "learning_rate": 6.27454333384541e-06, + "loss": 0.9342, + "step": 10776 + }, + { + "epoch": 0.8708862805309198, + "grad_norm": 2.084838628768921, + "learning_rate": 6.273910577553561e-06, + "loss": 0.9613, + "step": 10777 + }, + { + "epoch": 0.8709670902442473, + "grad_norm": 2.7771565914154053, + "learning_rate": 6.273277799442692e-06, + "loss": 1.0266, + "step": 10778 + }, + { + "epoch": 0.8710478999575749, + "grad_norm": 2.3363239765167236, + "learning_rate": 6.272644999523639e-06, + "loss": 0.8887, + "step": 10779 + }, + { + "epoch": 0.8711287096709024, + "grad_norm": 2.9889473915100098, + "learning_rate": 6.272012177807243e-06, + "loss": 1.011, + "step": 10780 + }, + { + "epoch": 0.87120951938423, + "grad_norm": 2.5180137157440186, + "learning_rate": 6.271379334304337e-06, + "loss": 1.037, + "step": 10781 + }, + { + "epoch": 0.8712903290975575, + "grad_norm": 2.5055792331695557, + "learning_rate": 6.270746469025767e-06, + "loss": 0.8885, + "step": 10782 + }, + { + "epoch": 0.871371138810885, + "grad_norm": 2.5165748596191406, + "learning_rate": 6.270113581982368e-06, + "loss": 0.9617, + "step": 10783 + }, + { + "epoch": 0.8714519485242126, + "grad_norm": 2.2549095153808594, + "learning_rate": 6.26948067318498e-06, + "loss": 0.9547, + "step": 10784 + }, + { + "epoch": 0.8715327582375402, + "grad_norm": 2.5069286823272705, + "learning_rate": 6.268847742644445e-06, + "loss": 0.8673, + "step": 10785 + }, + { + "epoch": 0.8716135679508676, + "grad_norm": 2.8266592025756836, + "learning_rate": 6.2682147903716036e-06, + "loss": 0.8074, + "step": 10786 + }, + { + "epoch": 0.8716943776641952, + "grad_norm": 2.5912058353424072, + "learning_rate": 6.267581816377294e-06, + "loss": 0.9483, + "step": 10787 + }, + { + "epoch": 0.8717751873775228, + "grad_norm": 2.6459462642669678, + "learning_rate": 6.266948820672362e-06, + "loss": 1.0598, + "step": 10788 + }, + { + "epoch": 0.8718559970908503, + "grad_norm": 2.4300882816314697, + "learning_rate": 6.266315803267645e-06, + "loss": 0.9263, + "step": 10789 + }, + { + "epoch": 0.8719368068041778, + "grad_norm": 2.8088088035583496, + "learning_rate": 6.265682764173987e-06, + "loss": 0.8896, + "step": 10790 + }, + { + "epoch": 0.8720176165175054, + "grad_norm": 3.032182455062866, + "learning_rate": 6.26504970340223e-06, + "loss": 0.9081, + "step": 10791 + }, + { + "epoch": 0.872098426230833, + "grad_norm": 2.6282832622528076, + "learning_rate": 6.264416620963217e-06, + "loss": 0.9339, + "step": 10792 + }, + { + "epoch": 0.8721792359441605, + "grad_norm": 2.961758613586426, + "learning_rate": 6.263783516867792e-06, + "loss": 0.9128, + "step": 10793 + }, + { + "epoch": 0.872260045657488, + "grad_norm": 2.9102795124053955, + "learning_rate": 6.263150391126799e-06, + "loss": 1.1049, + "step": 10794 + }, + { + "epoch": 0.8723408553708156, + "grad_norm": 2.5261833667755127, + "learning_rate": 6.262517243751078e-06, + "loss": 0.913, + "step": 10795 + }, + { + "epoch": 0.8724216650841431, + "grad_norm": 2.563030958175659, + "learning_rate": 6.261884074751479e-06, + "loss": 0.9188, + "step": 10796 + }, + { + "epoch": 0.8725024747974707, + "grad_norm": 2.705185651779175, + "learning_rate": 6.261250884138841e-06, + "loss": 0.9966, + "step": 10797 + }, + { + "epoch": 0.8725832845107983, + "grad_norm": 2.666682481765747, + "learning_rate": 6.260617671924012e-06, + "loss": 0.7857, + "step": 10798 + }, + { + "epoch": 0.8726640942241257, + "grad_norm": 2.5954337120056152, + "learning_rate": 6.259984438117837e-06, + "loss": 0.8968, + "step": 10799 + }, + { + "epoch": 0.8727449039374533, + "grad_norm": 2.9078593254089355, + "learning_rate": 6.259351182731164e-06, + "loss": 0.9886, + "step": 10800 + }, + { + "epoch": 0.8728257136507809, + "grad_norm": 2.364166498184204, + "learning_rate": 6.258717905774835e-06, + "loss": 0.8949, + "step": 10801 + }, + { + "epoch": 0.8729065233641083, + "grad_norm": 2.4891860485076904, + "learning_rate": 6.2580846072597e-06, + "loss": 0.9635, + "step": 10802 + }, + { + "epoch": 0.8729873330774359, + "grad_norm": 2.585794687271118, + "learning_rate": 6.257451287196603e-06, + "loss": 0.993, + "step": 10803 + }, + { + "epoch": 0.8730681427907635, + "grad_norm": 2.3488502502441406, + "learning_rate": 6.256817945596393e-06, + "loss": 0.8275, + "step": 10804 + }, + { + "epoch": 0.873148952504091, + "grad_norm": 2.637300491333008, + "learning_rate": 6.2561845824699175e-06, + "loss": 0.943, + "step": 10805 + }, + { + "epoch": 0.8732297622174185, + "grad_norm": 2.2985503673553467, + "learning_rate": 6.255551197828024e-06, + "loss": 0.8112, + "step": 10806 + }, + { + "epoch": 0.8733105719307461, + "grad_norm": 3.118016481399536, + "learning_rate": 6.254917791681561e-06, + "loss": 0.8169, + "step": 10807 + }, + { + "epoch": 0.8733913816440736, + "grad_norm": 2.5043869018554688, + "learning_rate": 6.2542843640413784e-06, + "loss": 0.9848, + "step": 10808 + }, + { + "epoch": 0.8734721913574012, + "grad_norm": 2.850107192993164, + "learning_rate": 6.253650914918322e-06, + "loss": 0.911, + "step": 10809 + }, + { + "epoch": 0.8735530010707288, + "grad_norm": 2.5064802169799805, + "learning_rate": 6.253017444323246e-06, + "loss": 0.9335, + "step": 10810 + }, + { + "epoch": 0.8736338107840562, + "grad_norm": 3.565614700317383, + "learning_rate": 6.252383952266996e-06, + "loss": 1.0437, + "step": 10811 + }, + { + "epoch": 0.8737146204973838, + "grad_norm": 2.5139176845550537, + "learning_rate": 6.251750438760425e-06, + "loss": 0.8993, + "step": 10812 + }, + { + "epoch": 0.8737954302107114, + "grad_norm": 2.882888078689575, + "learning_rate": 6.25111690381438e-06, + "loss": 0.9241, + "step": 10813 + }, + { + "epoch": 0.8738762399240388, + "grad_norm": 2.9219565391540527, + "learning_rate": 6.250483347439717e-06, + "loss": 0.9589, + "step": 10814 + }, + { + "epoch": 0.8739570496373664, + "grad_norm": 2.3526394367218018, + "learning_rate": 6.249849769647283e-06, + "loss": 0.8644, + "step": 10815 + }, + { + "epoch": 0.874037859350694, + "grad_norm": 2.953166961669922, + "learning_rate": 6.249216170447934e-06, + "loss": 0.9873, + "step": 10816 + }, + { + "epoch": 0.8741186690640215, + "grad_norm": 2.7625043392181396, + "learning_rate": 6.248582549852517e-06, + "loss": 0.8488, + "step": 10817 + }, + { + "epoch": 0.874199478777349, + "grad_norm": 2.5680489540100098, + "learning_rate": 6.247948907871888e-06, + "loss": 0.8417, + "step": 10818 + }, + { + "epoch": 0.8742802884906766, + "grad_norm": 2.6536850929260254, + "learning_rate": 6.247315244516897e-06, + "loss": 0.8816, + "step": 10819 + }, + { + "epoch": 0.8743610982040041, + "grad_norm": 3.0687880516052246, + "learning_rate": 6.2466815597984e-06, + "loss": 0.9783, + "step": 10820 + }, + { + "epoch": 0.8744419079173317, + "grad_norm": 2.711142063140869, + "learning_rate": 6.246047853727249e-06, + "loss": 0.8074, + "step": 10821 + }, + { + "epoch": 0.8745227176306593, + "grad_norm": 2.937854766845703, + "learning_rate": 6.245414126314297e-06, + "loss": 0.8972, + "step": 10822 + }, + { + "epoch": 0.8746035273439867, + "grad_norm": 2.8597121238708496, + "learning_rate": 6.2447803775704e-06, + "loss": 0.9454, + "step": 10823 + }, + { + "epoch": 0.8746843370573143, + "grad_norm": 2.685940742492676, + "learning_rate": 6.244146607506412e-06, + "loss": 0.9485, + "step": 10824 + }, + { + "epoch": 0.8747651467706419, + "grad_norm": 2.585789680480957, + "learning_rate": 6.243512816133188e-06, + "loss": 0.9593, + "step": 10825 + }, + { + "epoch": 0.8748459564839693, + "grad_norm": 2.7482876777648926, + "learning_rate": 6.2428790034615814e-06, + "loss": 0.9468, + "step": 10826 + }, + { + "epoch": 0.8749267661972969, + "grad_norm": 2.5974881649017334, + "learning_rate": 6.2422451695024515e-06, + "loss": 0.8735, + "step": 10827 + }, + { + "epoch": 0.8750075759106245, + "grad_norm": 2.5537526607513428, + "learning_rate": 6.241611314266651e-06, + "loss": 0.9705, + "step": 10828 + }, + { + "epoch": 0.875088385623952, + "grad_norm": 2.82592511177063, + "learning_rate": 6.240977437765039e-06, + "loss": 1.0177, + "step": 10829 + }, + { + "epoch": 0.8751691953372795, + "grad_norm": 2.7693116664886475, + "learning_rate": 6.24034354000847e-06, + "loss": 0.8391, + "step": 10830 + }, + { + "epoch": 0.8752500050506071, + "grad_norm": 2.3902838230133057, + "learning_rate": 6.2397096210078035e-06, + "loss": 0.8922, + "step": 10831 + }, + { + "epoch": 0.8753308147639346, + "grad_norm": 2.4729909896850586, + "learning_rate": 6.239075680773895e-06, + "loss": 1.0435, + "step": 10832 + }, + { + "epoch": 0.8754116244772622, + "grad_norm": 2.984990358352661, + "learning_rate": 6.238441719317603e-06, + "loss": 0.9861, + "step": 10833 + }, + { + "epoch": 0.8754924341905898, + "grad_norm": 3.044724225997925, + "learning_rate": 6.237807736649784e-06, + "loss": 0.8173, + "step": 10834 + }, + { + "epoch": 0.8755732439039172, + "grad_norm": 2.6723055839538574, + "learning_rate": 6.237173732781301e-06, + "loss": 1.0277, + "step": 10835 + }, + { + "epoch": 0.8756540536172448, + "grad_norm": 2.329822063446045, + "learning_rate": 6.236539707723008e-06, + "loss": 0.942, + "step": 10836 + }, + { + "epoch": 0.8757348633305724, + "grad_norm": 2.3134264945983887, + "learning_rate": 6.235905661485768e-06, + "loss": 0.9642, + "step": 10837 + }, + { + "epoch": 0.8758156730438998, + "grad_norm": 2.7943315505981445, + "learning_rate": 6.23527159408044e-06, + "loss": 0.9402, + "step": 10838 + }, + { + "epoch": 0.8758964827572274, + "grad_norm": 2.4493045806884766, + "learning_rate": 6.234637505517883e-06, + "loss": 1.0155, + "step": 10839 + }, + { + "epoch": 0.875977292470555, + "grad_norm": 2.632335901260376, + "learning_rate": 6.234003395808956e-06, + "loss": 0.9214, + "step": 10840 + }, + { + "epoch": 0.8760581021838825, + "grad_norm": 2.8725292682647705, + "learning_rate": 6.233369264964525e-06, + "loss": 0.8162, + "step": 10841 + }, + { + "epoch": 0.87613891189721, + "grad_norm": 2.914325714111328, + "learning_rate": 6.232735112995445e-06, + "loss": 0.9773, + "step": 10842 + }, + { + "epoch": 0.8762197216105376, + "grad_norm": 2.4684174060821533, + "learning_rate": 6.232100939912581e-06, + "loss": 0.8336, + "step": 10843 + }, + { + "epoch": 0.8763005313238651, + "grad_norm": 2.9864718914031982, + "learning_rate": 6.2314667457267944e-06, + "loss": 0.8743, + "step": 10844 + }, + { + "epoch": 0.8763813410371927, + "grad_norm": 2.4806735515594482, + "learning_rate": 6.230832530448947e-06, + "loss": 0.9626, + "step": 10845 + }, + { + "epoch": 0.8764621507505203, + "grad_norm": 3.071540355682373, + "learning_rate": 6.230198294089901e-06, + "loss": 0.8701, + "step": 10846 + }, + { + "epoch": 0.8765429604638477, + "grad_norm": 2.8915750980377197, + "learning_rate": 6.2295640366605205e-06, + "loss": 0.8878, + "step": 10847 + }, + { + "epoch": 0.8766237701771753, + "grad_norm": 2.8225057125091553, + "learning_rate": 6.228929758171667e-06, + "loss": 0.8599, + "step": 10848 + }, + { + "epoch": 0.8767045798905029, + "grad_norm": 3.210167646408081, + "learning_rate": 6.228295458634206e-06, + "loss": 0.8908, + "step": 10849 + }, + { + "epoch": 0.8767853896038303, + "grad_norm": 2.648663282394409, + "learning_rate": 6.227661138059e-06, + "loss": 0.8926, + "step": 10850 + }, + { + "epoch": 0.8768661993171579, + "grad_norm": 2.393606185913086, + "learning_rate": 6.227026796456915e-06, + "loss": 0.9686, + "step": 10851 + }, + { + "epoch": 0.8769470090304855, + "grad_norm": 2.7983009815216064, + "learning_rate": 6.226392433838815e-06, + "loss": 0.9171, + "step": 10852 + }, + { + "epoch": 0.877027818743813, + "grad_norm": 2.3201189041137695, + "learning_rate": 6.2257580502155655e-06, + "loss": 0.9764, + "step": 10853 + }, + { + "epoch": 0.8771086284571405, + "grad_norm": 2.386796712875366, + "learning_rate": 6.22512364559803e-06, + "loss": 0.8711, + "step": 10854 + }, + { + "epoch": 0.8771894381704681, + "grad_norm": 2.350531816482544, + "learning_rate": 6.2244892199970775e-06, + "loss": 0.909, + "step": 10855 + }, + { + "epoch": 0.8772702478837956, + "grad_norm": 2.6143767833709717, + "learning_rate": 6.223854773423571e-06, + "loss": 0.9039, + "step": 10856 + }, + { + "epoch": 0.8773510575971232, + "grad_norm": 2.9482271671295166, + "learning_rate": 6.223220305888379e-06, + "loss": 0.9026, + "step": 10857 + }, + { + "epoch": 0.8774318673104508, + "grad_norm": 2.7969493865966797, + "learning_rate": 6.222585817402368e-06, + "loss": 0.8364, + "step": 10858 + }, + { + "epoch": 0.8775126770237782, + "grad_norm": 2.87231707572937, + "learning_rate": 6.221951307976405e-06, + "loss": 0.8836, + "step": 10859 + }, + { + "epoch": 0.8775934867371058, + "grad_norm": 2.7605488300323486, + "learning_rate": 6.221316777621358e-06, + "loss": 0.894, + "step": 10860 + }, + { + "epoch": 0.8776742964504334, + "grad_norm": 2.188354015350342, + "learning_rate": 6.220682226348096e-06, + "loss": 0.8875, + "step": 10861 + }, + { + "epoch": 0.8777551061637608, + "grad_norm": 2.5554821491241455, + "learning_rate": 6.220047654167484e-06, + "loss": 0.7976, + "step": 10862 + }, + { + "epoch": 0.8778359158770884, + "grad_norm": 2.6469528675079346, + "learning_rate": 6.2194130610903945e-06, + "loss": 0.8315, + "step": 10863 + }, + { + "epoch": 0.877916725590416, + "grad_norm": 2.3564810752868652, + "learning_rate": 6.218778447127694e-06, + "loss": 0.7794, + "step": 10864 + }, + { + "epoch": 0.8779975353037435, + "grad_norm": 2.6810898780822754, + "learning_rate": 6.2181438122902535e-06, + "loss": 0.9194, + "step": 10865 + }, + { + "epoch": 0.878078345017071, + "grad_norm": 3.0478122234344482, + "learning_rate": 6.217509156588941e-06, + "loss": 0.937, + "step": 10866 + }, + { + "epoch": 0.8781591547303986, + "grad_norm": 2.703763008117676, + "learning_rate": 6.2168744800346296e-06, + "loss": 0.913, + "step": 10867 + }, + { + "epoch": 0.8782399644437261, + "grad_norm": 2.9125471115112305, + "learning_rate": 6.216239782638185e-06, + "loss": 0.9359, + "step": 10868 + }, + { + "epoch": 0.8783207741570537, + "grad_norm": 2.6811916828155518, + "learning_rate": 6.215605064410484e-06, + "loss": 1.0597, + "step": 10869 + }, + { + "epoch": 0.8784015838703813, + "grad_norm": 3.099846363067627, + "learning_rate": 6.2149703253623925e-06, + "loss": 0.9312, + "step": 10870 + }, + { + "epoch": 0.8784823935837087, + "grad_norm": 2.476093053817749, + "learning_rate": 6.214335565504785e-06, + "loss": 0.9466, + "step": 10871 + }, + { + "epoch": 0.8785632032970363, + "grad_norm": 2.362313985824585, + "learning_rate": 6.213700784848532e-06, + "loss": 0.8724, + "step": 10872 + }, + { + "epoch": 0.8786440130103639, + "grad_norm": 2.5924103260040283, + "learning_rate": 6.213065983404507e-06, + "loss": 0.8915, + "step": 10873 + }, + { + "epoch": 0.8787248227236913, + "grad_norm": 2.9155962467193604, + "learning_rate": 6.2124311611835816e-06, + "loss": 0.927, + "step": 10874 + }, + { + "epoch": 0.8788056324370189, + "grad_norm": 2.7962844371795654, + "learning_rate": 6.211796318196631e-06, + "loss": 0.9504, + "step": 10875 + }, + { + "epoch": 0.8788864421503465, + "grad_norm": 2.7248804569244385, + "learning_rate": 6.211161454454524e-06, + "loss": 0.9766, + "step": 10876 + }, + { + "epoch": 0.878967251863674, + "grad_norm": 2.827620267868042, + "learning_rate": 6.210526569968139e-06, + "loss": 0.9193, + "step": 10877 + }, + { + "epoch": 0.8790480615770015, + "grad_norm": 2.3883299827575684, + "learning_rate": 6.2098916647483476e-06, + "loss": 0.7755, + "step": 10878 + }, + { + "epoch": 0.8791288712903291, + "grad_norm": 2.251025915145874, + "learning_rate": 6.209256738806024e-06, + "loss": 0.9077, + "step": 10879 + }, + { + "epoch": 0.8792096810036566, + "grad_norm": 2.837599992752075, + "learning_rate": 6.208621792152045e-06, + "loss": 0.8625, + "step": 10880 + }, + { + "epoch": 0.8792904907169842, + "grad_norm": 2.9547653198242188, + "learning_rate": 6.207986824797284e-06, + "loss": 0.9137, + "step": 10881 + }, + { + "epoch": 0.8793713004303118, + "grad_norm": 2.667105197906494, + "learning_rate": 6.207351836752615e-06, + "loss": 0.9531, + "step": 10882 + }, + { + "epoch": 0.8794521101436392, + "grad_norm": 2.4493191242218018, + "learning_rate": 6.206716828028918e-06, + "loss": 1.0634, + "step": 10883 + }, + { + "epoch": 0.8795329198569668, + "grad_norm": 2.635343313217163, + "learning_rate": 6.206081798637066e-06, + "loss": 0.9985, + "step": 10884 + }, + { + "epoch": 0.8796137295702944, + "grad_norm": 2.2418689727783203, + "learning_rate": 6.205446748587935e-06, + "loss": 0.98, + "step": 10885 + }, + { + "epoch": 0.8796945392836218, + "grad_norm": 2.3565685749053955, + "learning_rate": 6.204811677892405e-06, + "loss": 0.98, + "step": 10886 + }, + { + "epoch": 0.8797753489969494, + "grad_norm": 2.800968885421753, + "learning_rate": 6.20417658656135e-06, + "loss": 0.9563, + "step": 10887 + }, + { + "epoch": 0.879856158710277, + "grad_norm": 2.515580177307129, + "learning_rate": 6.20354147460565e-06, + "loss": 0.9428, + "step": 10888 + }, + { + "epoch": 0.8799369684236045, + "grad_norm": 2.726541042327881, + "learning_rate": 6.2029063420361826e-06, + "loss": 0.7699, + "step": 10889 + }, + { + "epoch": 0.880017778136932, + "grad_norm": 2.676410436630249, + "learning_rate": 6.202271188863823e-06, + "loss": 0.9635, + "step": 10890 + }, + { + "epoch": 0.8800985878502596, + "grad_norm": 2.9437105655670166, + "learning_rate": 6.201636015099455e-06, + "loss": 1.0041, + "step": 10891 + }, + { + "epoch": 0.8801793975635871, + "grad_norm": 2.6276378631591797, + "learning_rate": 6.201000820753953e-06, + "loss": 0.9229, + "step": 10892 + }, + { + "epoch": 0.8802602072769147, + "grad_norm": 2.8323397636413574, + "learning_rate": 6.200365605838199e-06, + "loss": 0.9107, + "step": 10893 + }, + { + "epoch": 0.8803410169902423, + "grad_norm": 2.2161734104156494, + "learning_rate": 6.199730370363072e-06, + "loss": 0.9561, + "step": 10894 + }, + { + "epoch": 0.8804218267035697, + "grad_norm": 2.293673515319824, + "learning_rate": 6.199095114339452e-06, + "loss": 0.8399, + "step": 10895 + }, + { + "epoch": 0.8805026364168973, + "grad_norm": 2.8733720779418945, + "learning_rate": 6.198459837778219e-06, + "loss": 0.9495, + "step": 10896 + }, + { + "epoch": 0.8805834461302249, + "grad_norm": 2.476972818374634, + "learning_rate": 6.197824540690254e-06, + "loss": 0.9946, + "step": 10897 + }, + { + "epoch": 0.8806642558435523, + "grad_norm": 2.2801151275634766, + "learning_rate": 6.19718922308644e-06, + "loss": 0.7916, + "step": 10898 + }, + { + "epoch": 0.8807450655568799, + "grad_norm": 2.419496774673462, + "learning_rate": 6.196553884977654e-06, + "loss": 0.8658, + "step": 10899 + }, + { + "epoch": 0.8808258752702075, + "grad_norm": 2.6121389865875244, + "learning_rate": 6.195918526374782e-06, + "loss": 0.9492, + "step": 10900 + }, + { + "epoch": 0.880906684983535, + "grad_norm": 2.437551498413086, + "learning_rate": 6.195283147288704e-06, + "loss": 0.9233, + "step": 10901 + }, + { + "epoch": 0.8809874946968625, + "grad_norm": 2.4147794246673584, + "learning_rate": 6.194647747730305e-06, + "loss": 1.1507, + "step": 10902 + }, + { + "epoch": 0.8810683044101901, + "grad_norm": 2.5784285068511963, + "learning_rate": 6.194012327710464e-06, + "loss": 0.8709, + "step": 10903 + }, + { + "epoch": 0.8811491141235176, + "grad_norm": 2.5563831329345703, + "learning_rate": 6.1933768872400665e-06, + "loss": 1.0097, + "step": 10904 + }, + { + "epoch": 0.8812299238368452, + "grad_norm": 2.711738348007202, + "learning_rate": 6.1927414263299966e-06, + "loss": 0.8831, + "step": 10905 + }, + { + "epoch": 0.8813107335501728, + "grad_norm": 2.670205593109131, + "learning_rate": 6.192105944991138e-06, + "loss": 0.913, + "step": 10906 + }, + { + "epoch": 0.8813915432635002, + "grad_norm": 2.5287325382232666, + "learning_rate": 6.191470443234373e-06, + "loss": 0.9675, + "step": 10907 + }, + { + "epoch": 0.8814723529768278, + "grad_norm": 2.4057791233062744, + "learning_rate": 6.19083492107059e-06, + "loss": 0.9638, + "step": 10908 + }, + { + "epoch": 0.8815531626901554, + "grad_norm": 2.631802797317505, + "learning_rate": 6.19019937851067e-06, + "loss": 0.9026, + "step": 10909 + }, + { + "epoch": 0.8816339724034828, + "grad_norm": 2.7692408561706543, + "learning_rate": 6.189563815565499e-06, + "loss": 0.8508, + "step": 10910 + }, + { + "epoch": 0.8817147821168104, + "grad_norm": 2.1514453887939453, + "learning_rate": 6.188928232245966e-06, + "loss": 0.8769, + "step": 10911 + }, + { + "epoch": 0.881795591830138, + "grad_norm": 2.7200186252593994, + "learning_rate": 6.188292628562953e-06, + "loss": 0.7712, + "step": 10912 + }, + { + "epoch": 0.8818764015434655, + "grad_norm": 3.0090413093566895, + "learning_rate": 6.187657004527348e-06, + "loss": 1.01, + "step": 10913 + }, + { + "epoch": 0.881957211256793, + "grad_norm": 2.3555073738098145, + "learning_rate": 6.187021360150038e-06, + "loss": 0.9096, + "step": 10914 + }, + { + "epoch": 0.8820380209701206, + "grad_norm": 2.7792162895202637, + "learning_rate": 6.186385695441909e-06, + "loss": 0.999, + "step": 10915 + }, + { + "epoch": 0.8821188306834481, + "grad_norm": 2.5428154468536377, + "learning_rate": 6.18575001041385e-06, + "loss": 1.0028, + "step": 10916 + }, + { + "epoch": 0.8821996403967757, + "grad_norm": 2.696187973022461, + "learning_rate": 6.185114305076748e-06, + "loss": 1.0562, + "step": 10917 + }, + { + "epoch": 0.8822804501101033, + "grad_norm": 2.7940385341644287, + "learning_rate": 6.184478579441491e-06, + "loss": 0.861, + "step": 10918 + }, + { + "epoch": 0.8823612598234307, + "grad_norm": 2.8222694396972656, + "learning_rate": 6.183842833518966e-06, + "loss": 0.9965, + "step": 10919 + }, + { + "epoch": 0.8824420695367583, + "grad_norm": 2.582070827484131, + "learning_rate": 6.183207067320065e-06, + "loss": 0.8817, + "step": 10920 + }, + { + "epoch": 0.8825228792500859, + "grad_norm": 2.641030788421631, + "learning_rate": 6.1825712808556734e-06, + "loss": 0.9073, + "step": 10921 + }, + { + "epoch": 0.8826036889634135, + "grad_norm": 2.370635747909546, + "learning_rate": 6.1819354741366855e-06, + "loss": 1.0039, + "step": 10922 + }, + { + "epoch": 0.8826844986767409, + "grad_norm": 2.588547945022583, + "learning_rate": 6.181299647173987e-06, + "loss": 0.967, + "step": 10923 + }, + { + "epoch": 0.8827653083900685, + "grad_norm": 2.910203456878662, + "learning_rate": 6.180663799978469e-06, + "loss": 0.9606, + "step": 10924 + }, + { + "epoch": 0.8828461181033961, + "grad_norm": 2.415900945663452, + "learning_rate": 6.180027932561022e-06, + "loss": 0.9355, + "step": 10925 + }, + { + "epoch": 0.8829269278167236, + "grad_norm": 2.5070905685424805, + "learning_rate": 6.179392044932539e-06, + "loss": 0.9732, + "step": 10926 + }, + { + "epoch": 0.8830077375300511, + "grad_norm": 3.0649960041046143, + "learning_rate": 6.178756137103908e-06, + "loss": 0.9669, + "step": 10927 + }, + { + "epoch": 0.8830885472433787, + "grad_norm": 2.8635520935058594, + "learning_rate": 6.178120209086024e-06, + "loss": 0.9476, + "step": 10928 + }, + { + "epoch": 0.8831693569567062, + "grad_norm": 2.382916212081909, + "learning_rate": 6.177484260889775e-06, + "loss": 0.9309, + "step": 10929 + }, + { + "epoch": 0.8832501666700338, + "grad_norm": 2.9815895557403564, + "learning_rate": 6.176848292526057e-06, + "loss": 0.9731, + "step": 10930 + }, + { + "epoch": 0.8833309763833613, + "grad_norm": 3.019881248474121, + "learning_rate": 6.176212304005759e-06, + "loss": 1.0128, + "step": 10931 + }, + { + "epoch": 0.8834117860966888, + "grad_norm": 2.5681111812591553, + "learning_rate": 6.175576295339776e-06, + "loss": 0.8593, + "step": 10932 + }, + { + "epoch": 0.8834925958100164, + "grad_norm": 2.969653606414795, + "learning_rate": 6.174940266539003e-06, + "loss": 0.905, + "step": 10933 + }, + { + "epoch": 0.883573405523344, + "grad_norm": 2.4689438343048096, + "learning_rate": 6.174304217614331e-06, + "loss": 0.8833, + "step": 10934 + }, + { + "epoch": 0.8836542152366714, + "grad_norm": 3.0760090351104736, + "learning_rate": 6.173668148576652e-06, + "loss": 0.945, + "step": 10935 + }, + { + "epoch": 0.883735024949999, + "grad_norm": 2.8236286640167236, + "learning_rate": 6.173032059436868e-06, + "loss": 0.9239, + "step": 10936 + }, + { + "epoch": 0.8838158346633266, + "grad_norm": 2.3985254764556885, + "learning_rate": 6.172395950205865e-06, + "loss": 0.9703, + "step": 10937 + }, + { + "epoch": 0.883896644376654, + "grad_norm": 2.001253128051758, + "learning_rate": 6.171759820894544e-06, + "loss": 0.9547, + "step": 10938 + }, + { + "epoch": 0.8839774540899816, + "grad_norm": 2.5760116577148438, + "learning_rate": 6.171123671513797e-06, + "loss": 0.8207, + "step": 10939 + }, + { + "epoch": 0.8840582638033092, + "grad_norm": 2.603823184967041, + "learning_rate": 6.170487502074521e-06, + "loss": 1.0051, + "step": 10940 + }, + { + "epoch": 0.8841390735166367, + "grad_norm": 2.527780771255493, + "learning_rate": 6.169851312587612e-06, + "loss": 0.9405, + "step": 10941 + }, + { + "epoch": 0.8842198832299643, + "grad_norm": 2.5697104930877686, + "learning_rate": 6.169215103063967e-06, + "loss": 0.862, + "step": 10942 + }, + { + "epoch": 0.8843006929432918, + "grad_norm": 2.6602511405944824, + "learning_rate": 6.168578873514481e-06, + "loss": 0.9605, + "step": 10943 + }, + { + "epoch": 0.8843815026566193, + "grad_norm": 2.4392359256744385, + "learning_rate": 6.1679426239500525e-06, + "loss": 0.8319, + "step": 10944 + }, + { + "epoch": 0.8844623123699469, + "grad_norm": 2.3442440032958984, + "learning_rate": 6.167306354381579e-06, + "loss": 0.9502, + "step": 10945 + }, + { + "epoch": 0.8845431220832745, + "grad_norm": 2.7411835193634033, + "learning_rate": 6.166670064819957e-06, + "loss": 0.9613, + "step": 10946 + }, + { + "epoch": 0.8846239317966019, + "grad_norm": 2.5848867893218994, + "learning_rate": 6.166033755276087e-06, + "loss": 0.9501, + "step": 10947 + }, + { + "epoch": 0.8847047415099295, + "grad_norm": 2.587322235107422, + "learning_rate": 6.165397425760865e-06, + "loss": 0.9307, + "step": 10948 + }, + { + "epoch": 0.8847855512232571, + "grad_norm": 2.7896323204040527, + "learning_rate": 6.16476107628519e-06, + "loss": 0.8878, + "step": 10949 + }, + { + "epoch": 0.8848663609365846, + "grad_norm": 2.699521541595459, + "learning_rate": 6.164124706859962e-06, + "loss": 0.8462, + "step": 10950 + }, + { + "epoch": 0.8849471706499121, + "grad_norm": 2.4634876251220703, + "learning_rate": 6.16348831749608e-06, + "loss": 0.81, + "step": 10951 + }, + { + "epoch": 0.8850279803632397, + "grad_norm": 2.4452576637268066, + "learning_rate": 6.162851908204446e-06, + "loss": 0.9694, + "step": 10952 + }, + { + "epoch": 0.8851087900765672, + "grad_norm": 2.78956937789917, + "learning_rate": 6.162215478995956e-06, + "loss": 0.8283, + "step": 10953 + }, + { + "epoch": 0.8851895997898948, + "grad_norm": 2.6653823852539062, + "learning_rate": 6.161579029881514e-06, + "loss": 1.0106, + "step": 10954 + }, + { + "epoch": 0.8852704095032223, + "grad_norm": 3.050967216491699, + "learning_rate": 6.160942560872019e-06, + "loss": 0.984, + "step": 10955 + }, + { + "epoch": 0.8853512192165498, + "grad_norm": 2.719074249267578, + "learning_rate": 6.160306071978374e-06, + "loss": 1.0023, + "step": 10956 + }, + { + "epoch": 0.8854320289298774, + "grad_norm": 2.5805485248565674, + "learning_rate": 6.1596695632114765e-06, + "loss": 0.9016, + "step": 10957 + }, + { + "epoch": 0.885512838643205, + "grad_norm": 2.601243734359741, + "learning_rate": 6.159033034582234e-06, + "loss": 0.9219, + "step": 10958 + }, + { + "epoch": 0.8855936483565324, + "grad_norm": 2.6648449897766113, + "learning_rate": 6.158396486101545e-06, + "loss": 0.9181, + "step": 10959 + }, + { + "epoch": 0.88567445806986, + "grad_norm": 2.6205074787139893, + "learning_rate": 6.157759917780313e-06, + "loss": 0.912, + "step": 10960 + }, + { + "epoch": 0.8857552677831876, + "grad_norm": 2.8496949672698975, + "learning_rate": 6.1571233296294405e-06, + "loss": 0.8803, + "step": 10961 + }, + { + "epoch": 0.885836077496515, + "grad_norm": 3.2378621101379395, + "learning_rate": 6.156486721659831e-06, + "loss": 0.9554, + "step": 10962 + }, + { + "epoch": 0.8859168872098426, + "grad_norm": 2.833557367324829, + "learning_rate": 6.155850093882388e-06, + "loss": 0.8796, + "step": 10963 + }, + { + "epoch": 0.8859976969231702, + "grad_norm": 2.7678937911987305, + "learning_rate": 6.155213446308016e-06, + "loss": 0.9024, + "step": 10964 + }, + { + "epoch": 0.8860785066364977, + "grad_norm": 2.7520134449005127, + "learning_rate": 6.1545767789476195e-06, + "loss": 0.8869, + "step": 10965 + }, + { + "epoch": 0.8861593163498253, + "grad_norm": 2.827279567718506, + "learning_rate": 6.153940091812101e-06, + "loss": 0.9128, + "step": 10966 + }, + { + "epoch": 0.8862401260631528, + "grad_norm": 2.4025306701660156, + "learning_rate": 6.153303384912367e-06, + "loss": 0.911, + "step": 10967 + }, + { + "epoch": 0.8863209357764803, + "grad_norm": 2.7287189960479736, + "learning_rate": 6.152666658259323e-06, + "loss": 0.9942, + "step": 10968 + }, + { + "epoch": 0.8864017454898079, + "grad_norm": 2.780747413635254, + "learning_rate": 6.152029911863872e-06, + "loss": 1.056, + "step": 10969 + }, + { + "epoch": 0.8864825552031355, + "grad_norm": 3.0128583908081055, + "learning_rate": 6.151393145736925e-06, + "loss": 1.0137, + "step": 10970 + }, + { + "epoch": 0.8865633649164629, + "grad_norm": 2.7319211959838867, + "learning_rate": 6.150756359889382e-06, + "loss": 0.8488, + "step": 10971 + }, + { + "epoch": 0.8866441746297905, + "grad_norm": 2.742210865020752, + "learning_rate": 6.150119554332156e-06, + "loss": 0.8695, + "step": 10972 + }, + { + "epoch": 0.8867249843431181, + "grad_norm": 2.666001319885254, + "learning_rate": 6.1494827290761505e-06, + "loss": 0.8589, + "step": 10973 + }, + { + "epoch": 0.8868057940564456, + "grad_norm": 2.4835453033447266, + "learning_rate": 6.1488458841322704e-06, + "loss": 0.8999, + "step": 10974 + }, + { + "epoch": 0.8868866037697731, + "grad_norm": 2.4682981967926025, + "learning_rate": 6.14820901951143e-06, + "loss": 0.8755, + "step": 10975 + }, + { + "epoch": 0.8869674134831007, + "grad_norm": 2.6600914001464844, + "learning_rate": 6.14757213522453e-06, + "loss": 0.9856, + "step": 10976 + }, + { + "epoch": 0.8870482231964282, + "grad_norm": 2.5532233715057373, + "learning_rate": 6.146935231282484e-06, + "loss": 1.0455, + "step": 10977 + }, + { + "epoch": 0.8871290329097558, + "grad_norm": 2.6507294178009033, + "learning_rate": 6.1462983076961965e-06, + "loss": 0.9244, + "step": 10978 + }, + { + "epoch": 0.8872098426230833, + "grad_norm": 2.4792628288269043, + "learning_rate": 6.145661364476582e-06, + "loss": 0.7741, + "step": 10979 + }, + { + "epoch": 0.8872906523364108, + "grad_norm": 2.6015565395355225, + "learning_rate": 6.145024401634543e-06, + "loss": 0.9, + "step": 10980 + }, + { + "epoch": 0.8873714620497384, + "grad_norm": 2.4832382202148438, + "learning_rate": 6.144387419180994e-06, + "loss": 0.9151, + "step": 10981 + }, + { + "epoch": 0.887452271763066, + "grad_norm": 2.9805192947387695, + "learning_rate": 6.143750417126843e-06, + "loss": 1.0015, + "step": 10982 + }, + { + "epoch": 0.8875330814763934, + "grad_norm": 2.3034491539001465, + "learning_rate": 6.143113395483001e-06, + "loss": 0.8822, + "step": 10983 + }, + { + "epoch": 0.887613891189721, + "grad_norm": 2.5549511909484863, + "learning_rate": 6.142476354260378e-06, + "loss": 1.0219, + "step": 10984 + }, + { + "epoch": 0.8876947009030486, + "grad_norm": 2.888711929321289, + "learning_rate": 6.141839293469887e-06, + "loss": 0.8712, + "step": 10985 + }, + { + "epoch": 0.887775510616376, + "grad_norm": 2.5501506328582764, + "learning_rate": 6.141202213122436e-06, + "loss": 1.0281, + "step": 10986 + }, + { + "epoch": 0.8878563203297036, + "grad_norm": 2.496525764465332, + "learning_rate": 6.14056511322894e-06, + "loss": 0.8497, + "step": 10987 + }, + { + "epoch": 0.8879371300430312, + "grad_norm": 2.4370250701904297, + "learning_rate": 6.139927993800308e-06, + "loss": 0.8917, + "step": 10988 + }, + { + "epoch": 0.8880179397563587, + "grad_norm": 2.651669502258301, + "learning_rate": 6.139290854847455e-06, + "loss": 0.9555, + "step": 10989 + }, + { + "epoch": 0.8880987494696863, + "grad_norm": 2.6617887020111084, + "learning_rate": 6.138653696381292e-06, + "loss": 0.9536, + "step": 10990 + }, + { + "epoch": 0.8881795591830138, + "grad_norm": 2.939648151397705, + "learning_rate": 6.1380165184127315e-06, + "loss": 0.9768, + "step": 10991 + }, + { + "epoch": 0.8882603688963413, + "grad_norm": 2.7883365154266357, + "learning_rate": 6.137379320952688e-06, + "loss": 0.9624, + "step": 10992 + }, + { + "epoch": 0.8883411786096689, + "grad_norm": 2.86812686920166, + "learning_rate": 6.1367421040120765e-06, + "loss": 0.9721, + "step": 10993 + }, + { + "epoch": 0.8884219883229965, + "grad_norm": 2.7602572441101074, + "learning_rate": 6.136104867601806e-06, + "loss": 0.8592, + "step": 10994 + }, + { + "epoch": 0.8885027980363239, + "grad_norm": 3.019890785217285, + "learning_rate": 6.135467611732798e-06, + "loss": 0.9147, + "step": 10995 + }, + { + "epoch": 0.8885836077496515, + "grad_norm": 2.5440359115600586, + "learning_rate": 6.13483033641596e-06, + "loss": 0.8378, + "step": 10996 + }, + { + "epoch": 0.8886644174629791, + "grad_norm": 2.7020857334136963, + "learning_rate": 6.134193041662213e-06, + "loss": 1.0581, + "step": 10997 + }, + { + "epoch": 0.8887452271763066, + "grad_norm": 2.3741939067840576, + "learning_rate": 6.133555727482468e-06, + "loss": 0.9238, + "step": 10998 + }, + { + "epoch": 0.8888260368896341, + "grad_norm": 3.2570548057556152, + "learning_rate": 6.132918393887643e-06, + "loss": 0.9342, + "step": 10999 + }, + { + "epoch": 0.8889068466029617, + "grad_norm": 2.0186386108398438, + "learning_rate": 6.132281040888653e-06, + "loss": 1.014, + "step": 11000 + }, + { + "epoch": 0.8889068466029617, + "eval_loss": 0.7640535831451416, + "eval_runtime": 815.0076, + "eval_samples_per_second": 102.289, + "eval_steps_per_second": 12.786, + "step": 11000 + }, + { + "epoch": 0.8889876563162892, + "grad_norm": 2.4831643104553223, + "learning_rate": 6.131643668496417e-06, + "loss": 0.9275, + "step": 11001 + }, + { + "epoch": 0.8890684660296168, + "grad_norm": 2.905310869216919, + "learning_rate": 6.131006276721845e-06, + "loss": 0.8717, + "step": 11002 + }, + { + "epoch": 0.8891492757429443, + "grad_norm": 2.3836982250213623, + "learning_rate": 6.130368865575861e-06, + "loss": 0.9481, + "step": 11003 + }, + { + "epoch": 0.8892300854562718, + "grad_norm": 2.923142910003662, + "learning_rate": 6.129731435069379e-06, + "loss": 1.0165, + "step": 11004 + }, + { + "epoch": 0.8893108951695994, + "grad_norm": 2.755363941192627, + "learning_rate": 6.129093985213317e-06, + "loss": 1.0159, + "step": 11005 + }, + { + "epoch": 0.889391704882927, + "grad_norm": 2.3933568000793457, + "learning_rate": 6.1284565160185925e-06, + "loss": 0.868, + "step": 11006 + }, + { + "epoch": 0.8894725145962544, + "grad_norm": 2.53499698638916, + "learning_rate": 6.127819027496127e-06, + "loss": 0.89, + "step": 11007 + }, + { + "epoch": 0.889553324309582, + "grad_norm": 2.4543845653533936, + "learning_rate": 6.127181519656834e-06, + "loss": 1.0205, + "step": 11008 + }, + { + "epoch": 0.8896341340229096, + "grad_norm": 2.6794471740722656, + "learning_rate": 6.126543992511638e-06, + "loss": 0.8723, + "step": 11009 + }, + { + "epoch": 0.889714943736237, + "grad_norm": 2.9267797470092773, + "learning_rate": 6.1259064460714514e-06, + "loss": 0.938, + "step": 11010 + }, + { + "epoch": 0.8897957534495646, + "grad_norm": 2.8477814197540283, + "learning_rate": 6.1252688803472016e-06, + "loss": 0.8794, + "step": 11011 + }, + { + "epoch": 0.8898765631628922, + "grad_norm": 2.8415892124176025, + "learning_rate": 6.124631295349803e-06, + "loss": 0.9823, + "step": 11012 + }, + { + "epoch": 0.8899573728762197, + "grad_norm": 2.753502607345581, + "learning_rate": 6.123993691090178e-06, + "loss": 1.0443, + "step": 11013 + }, + { + "epoch": 0.8900381825895473, + "grad_norm": 2.7093663215637207, + "learning_rate": 6.1233560675792465e-06, + "loss": 0.881, + "step": 11014 + }, + { + "epoch": 0.8901189923028748, + "grad_norm": 2.673405170440674, + "learning_rate": 6.122718424827931e-06, + "loss": 0.9464, + "step": 11015 + }, + { + "epoch": 0.8901998020162023, + "grad_norm": 2.669354200363159, + "learning_rate": 6.122080762847151e-06, + "loss": 0.9228, + "step": 11016 + }, + { + "epoch": 0.8902806117295299, + "grad_norm": 2.303164005279541, + "learning_rate": 6.12144308164783e-06, + "loss": 0.938, + "step": 11017 + }, + { + "epoch": 0.8903614214428575, + "grad_norm": 2.6203126907348633, + "learning_rate": 6.120805381240888e-06, + "loss": 0.7984, + "step": 11018 + }, + { + "epoch": 0.8904422311561849, + "grad_norm": 2.3148365020751953, + "learning_rate": 6.120167661637247e-06, + "loss": 0.8208, + "step": 11019 + }, + { + "epoch": 0.8905230408695125, + "grad_norm": 2.8378329277038574, + "learning_rate": 6.119529922847832e-06, + "loss": 0.9023, + "step": 11020 + }, + { + "epoch": 0.8906038505828401, + "grad_norm": 2.271371841430664, + "learning_rate": 6.1188921648835646e-06, + "loss": 0.8225, + "step": 11021 + }, + { + "epoch": 0.8906846602961676, + "grad_norm": 2.5016963481903076, + "learning_rate": 6.118254387755367e-06, + "loss": 0.8901, + "step": 11022 + }, + { + "epoch": 0.8907654700094951, + "grad_norm": 2.6315226554870605, + "learning_rate": 6.117616591474166e-06, + "loss": 0.988, + "step": 11023 + }, + { + "epoch": 0.8908462797228227, + "grad_norm": 3.0709304809570312, + "learning_rate": 6.1169787760508806e-06, + "loss": 1.0308, + "step": 11024 + }, + { + "epoch": 0.8909270894361502, + "grad_norm": 2.647787094116211, + "learning_rate": 6.11634094149644e-06, + "loss": 0.9285, + "step": 11025 + }, + { + "epoch": 0.8910078991494778, + "grad_norm": 2.7675271034240723, + "learning_rate": 6.115703087821765e-06, + "loss": 0.9306, + "step": 11026 + }, + { + "epoch": 0.8910887088628053, + "grad_norm": 2.588698148727417, + "learning_rate": 6.115065215037782e-06, + "loss": 0.8785, + "step": 11027 + }, + { + "epoch": 0.8911695185761328, + "grad_norm": 2.85332989692688, + "learning_rate": 6.1144273231554165e-06, + "loss": 0.8812, + "step": 11028 + }, + { + "epoch": 0.8912503282894604, + "grad_norm": 2.28715181350708, + "learning_rate": 6.113789412185594e-06, + "loss": 1.0643, + "step": 11029 + }, + { + "epoch": 0.891331138002788, + "grad_norm": 2.4027044773101807, + "learning_rate": 6.113151482139241e-06, + "loss": 0.9467, + "step": 11030 + }, + { + "epoch": 0.8914119477161154, + "grad_norm": 3.2616589069366455, + "learning_rate": 6.112513533027282e-06, + "loss": 0.9655, + "step": 11031 + }, + { + "epoch": 0.891492757429443, + "grad_norm": 2.323657512664795, + "learning_rate": 6.111875564860645e-06, + "loss": 0.9994, + "step": 11032 + }, + { + "epoch": 0.8915735671427706, + "grad_norm": 2.5008647441864014, + "learning_rate": 6.1112375776502565e-06, + "loss": 0.911, + "step": 11033 + }, + { + "epoch": 0.891654376856098, + "grad_norm": 2.446787118911743, + "learning_rate": 6.110599571407043e-06, + "loss": 0.9701, + "step": 11034 + }, + { + "epoch": 0.8917351865694256, + "grad_norm": 2.3228187561035156, + "learning_rate": 6.109961546141932e-06, + "loss": 0.9283, + "step": 11035 + }, + { + "epoch": 0.8918159962827532, + "grad_norm": 2.728739023208618, + "learning_rate": 6.109323501865853e-06, + "loss": 0.8999, + "step": 11036 + }, + { + "epoch": 0.8918968059960807, + "grad_norm": 2.7874202728271484, + "learning_rate": 6.108685438589732e-06, + "loss": 0.8816, + "step": 11037 + }, + { + "epoch": 0.8919776157094083, + "grad_norm": 2.433786392211914, + "learning_rate": 6.108047356324498e-06, + "loss": 0.9881, + "step": 11038 + }, + { + "epoch": 0.8920584254227358, + "grad_norm": 2.3185479640960693, + "learning_rate": 6.107409255081082e-06, + "loss": 0.8745, + "step": 11039 + }, + { + "epoch": 0.8921392351360633, + "grad_norm": 2.803328514099121, + "learning_rate": 6.1067711348704104e-06, + "loss": 0.9704, + "step": 11040 + }, + { + "epoch": 0.8922200448493909, + "grad_norm": 2.8452160358428955, + "learning_rate": 6.1061329957034145e-06, + "loss": 0.893, + "step": 11041 + }, + { + "epoch": 0.8923008545627185, + "grad_norm": 2.792412042617798, + "learning_rate": 6.105494837591023e-06, + "loss": 0.9528, + "step": 11042 + }, + { + "epoch": 0.8923816642760459, + "grad_norm": 2.8253371715545654, + "learning_rate": 6.104856660544165e-06, + "loss": 0.8227, + "step": 11043 + }, + { + "epoch": 0.8924624739893735, + "grad_norm": 3.3019015789031982, + "learning_rate": 6.104218464573773e-06, + "loss": 0.9624, + "step": 11044 + }, + { + "epoch": 0.8925432837027011, + "grad_norm": 2.3970048427581787, + "learning_rate": 6.103580249690777e-06, + "loss": 0.9066, + "step": 11045 + }, + { + "epoch": 0.8926240934160286, + "grad_norm": 2.600074529647827, + "learning_rate": 6.102942015906109e-06, + "loss": 1.0343, + "step": 11046 + }, + { + "epoch": 0.8927049031293561, + "grad_norm": 2.8846118450164795, + "learning_rate": 6.102303763230698e-06, + "loss": 0.8015, + "step": 11047 + }, + { + "epoch": 0.8927857128426837, + "grad_norm": 3.0641016960144043, + "learning_rate": 6.101665491675479e-06, + "loss": 0.8428, + "step": 11048 + }, + { + "epoch": 0.8928665225560112, + "grad_norm": 2.2472636699676514, + "learning_rate": 6.101027201251381e-06, + "loss": 1.0, + "step": 11049 + }, + { + "epoch": 0.8929473322693388, + "grad_norm": 2.562082052230835, + "learning_rate": 6.100388891969337e-06, + "loss": 1.0599, + "step": 11050 + }, + { + "epoch": 0.8930281419826663, + "grad_norm": 2.611246109008789, + "learning_rate": 6.099750563840282e-06, + "loss": 0.9197, + "step": 11051 + }, + { + "epoch": 0.8931089516959939, + "grad_norm": 2.5023462772369385, + "learning_rate": 6.099112216875147e-06, + "loss": 0.8714, + "step": 11052 + }, + { + "epoch": 0.8931897614093214, + "grad_norm": 2.7679619789123535, + "learning_rate": 6.098473851084865e-06, + "loss": 0.8747, + "step": 11053 + }, + { + "epoch": 0.893270571122649, + "grad_norm": 2.6410112380981445, + "learning_rate": 6.097835466480372e-06, + "loss": 0.9315, + "step": 11054 + }, + { + "epoch": 0.8933513808359765, + "grad_norm": 2.6186962127685547, + "learning_rate": 6.097197063072598e-06, + "loss": 0.9532, + "step": 11055 + }, + { + "epoch": 0.893432190549304, + "grad_norm": 2.960371494293213, + "learning_rate": 6.096558640872482e-06, + "loss": 0.945, + "step": 11056 + }, + { + "epoch": 0.8935130002626316, + "grad_norm": 2.620387077331543, + "learning_rate": 6.0959201998909555e-06, + "loss": 1.0442, + "step": 11057 + }, + { + "epoch": 0.8935938099759592, + "grad_norm": 2.879689931869507, + "learning_rate": 6.095281740138954e-06, + "loss": 1.0425, + "step": 11058 + }, + { + "epoch": 0.8936746196892866, + "grad_norm": 2.5625710487365723, + "learning_rate": 6.094643261627413e-06, + "loss": 0.8893, + "step": 11059 + }, + { + "epoch": 0.8937554294026142, + "grad_norm": 2.65523099899292, + "learning_rate": 6.09400476436727e-06, + "loss": 0.901, + "step": 11060 + }, + { + "epoch": 0.8938362391159418, + "grad_norm": 2.5786983966827393, + "learning_rate": 6.093366248369456e-06, + "loss": 0.9458, + "step": 11061 + }, + { + "epoch": 0.8939170488292693, + "grad_norm": 3.3469996452331543, + "learning_rate": 6.092727713644912e-06, + "loss": 0.9274, + "step": 11062 + }, + { + "epoch": 0.8939978585425968, + "grad_norm": 2.6339290142059326, + "learning_rate": 6.0920891602045705e-06, + "loss": 0.9025, + "step": 11063 + }, + { + "epoch": 0.8940786682559244, + "grad_norm": 2.3962159156799316, + "learning_rate": 6.091450588059374e-06, + "loss": 0.9841, + "step": 11064 + }, + { + "epoch": 0.8941594779692519, + "grad_norm": 2.6802737712860107, + "learning_rate": 6.090811997220256e-06, + "loss": 0.9009, + "step": 11065 + }, + { + "epoch": 0.8942402876825795, + "grad_norm": 2.751603126525879, + "learning_rate": 6.090173387698154e-06, + "loss": 0.9618, + "step": 11066 + }, + { + "epoch": 0.894321097395907, + "grad_norm": 2.7731077671051025, + "learning_rate": 6.089534759504005e-06, + "loss": 0.7689, + "step": 11067 + }, + { + "epoch": 0.8944019071092345, + "grad_norm": 2.53085994720459, + "learning_rate": 6.088896112648751e-06, + "loss": 0.9838, + "step": 11068 + }, + { + "epoch": 0.8944827168225621, + "grad_norm": 2.406844139099121, + "learning_rate": 6.0882574471433266e-06, + "loss": 0.9046, + "step": 11069 + }, + { + "epoch": 0.8945635265358897, + "grad_norm": 2.21785831451416, + "learning_rate": 6.087618762998673e-06, + "loss": 0.6916, + "step": 11070 + }, + { + "epoch": 0.8946443362492171, + "grad_norm": 2.4833412170410156, + "learning_rate": 6.086980060225727e-06, + "loss": 0.9391, + "step": 11071 + }, + { + "epoch": 0.8947251459625447, + "grad_norm": 2.9493179321289062, + "learning_rate": 6.08634133883543e-06, + "loss": 1.0017, + "step": 11072 + }, + { + "epoch": 0.8948059556758723, + "grad_norm": 2.618835687637329, + "learning_rate": 6.0857025988387205e-06, + "loss": 0.9473, + "step": 11073 + }, + { + "epoch": 0.8948867653891998, + "grad_norm": 2.991755485534668, + "learning_rate": 6.085063840246541e-06, + "loss": 1.0111, + "step": 11074 + }, + { + "epoch": 0.8949675751025273, + "grad_norm": 2.582691192626953, + "learning_rate": 6.084425063069827e-06, + "loss": 0.8572, + "step": 11075 + }, + { + "epoch": 0.8950483848158549, + "grad_norm": 2.7862629890441895, + "learning_rate": 6.083786267319526e-06, + "loss": 0.9168, + "step": 11076 + }, + { + "epoch": 0.8951291945291824, + "grad_norm": 3.048041343688965, + "learning_rate": 6.083147453006572e-06, + "loss": 0.8811, + "step": 11077 + }, + { + "epoch": 0.89521000424251, + "grad_norm": 2.6280767917633057, + "learning_rate": 6.082508620141911e-06, + "loss": 0.7797, + "step": 11078 + }, + { + "epoch": 0.8952908139558375, + "grad_norm": 2.5700151920318604, + "learning_rate": 6.081869768736485e-06, + "loss": 0.9484, + "step": 11079 + }, + { + "epoch": 0.895371623669165, + "grad_norm": 2.745976686477661, + "learning_rate": 6.0812308988012326e-06, + "loss": 0.9675, + "step": 11080 + }, + { + "epoch": 0.8954524333824926, + "grad_norm": 2.3650968074798584, + "learning_rate": 6.0805920103470975e-06, + "loss": 0.8606, + "step": 11081 + }, + { + "epoch": 0.8955332430958202, + "grad_norm": 2.681852340698242, + "learning_rate": 6.079953103385025e-06, + "loss": 0.8773, + "step": 11082 + }, + { + "epoch": 0.8956140528091476, + "grad_norm": 2.5203349590301514, + "learning_rate": 6.079314177925952e-06, + "loss": 0.9918, + "step": 11083 + }, + { + "epoch": 0.8956948625224752, + "grad_norm": 2.5127477645874023, + "learning_rate": 6.078675233980829e-06, + "loss": 1.0656, + "step": 11084 + }, + { + "epoch": 0.8957756722358028, + "grad_norm": 2.8647875785827637, + "learning_rate": 6.078036271560594e-06, + "loss": 0.9295, + "step": 11085 + }, + { + "epoch": 0.8958564819491303, + "grad_norm": 2.4162609577178955, + "learning_rate": 6.077397290676194e-06, + "loss": 0.9008, + "step": 11086 + }, + { + "epoch": 0.8959372916624578, + "grad_norm": 2.674287796020508, + "learning_rate": 6.076758291338571e-06, + "loss": 0.9015, + "step": 11087 + }, + { + "epoch": 0.8960181013757854, + "grad_norm": 3.113100290298462, + "learning_rate": 6.076119273558672e-06, + "loss": 0.9221, + "step": 11088 + }, + { + "epoch": 0.8960989110891129, + "grad_norm": 2.8635690212249756, + "learning_rate": 6.07548023734744e-06, + "loss": 1.0285, + "step": 11089 + }, + { + "epoch": 0.8961797208024405, + "grad_norm": 2.6006035804748535, + "learning_rate": 6.07484118271582e-06, + "loss": 0.9718, + "step": 11090 + }, + { + "epoch": 0.896260530515768, + "grad_norm": 3.4972732067108154, + "learning_rate": 6.0742021096747584e-06, + "loss": 0.8588, + "step": 11091 + }, + { + "epoch": 0.8963413402290955, + "grad_norm": 2.4558448791503906, + "learning_rate": 6.073563018235202e-06, + "loss": 0.8262, + "step": 11092 + }, + { + "epoch": 0.8964221499424231, + "grad_norm": 2.393813371658325, + "learning_rate": 6.0729239084080935e-06, + "loss": 1.0198, + "step": 11093 + }, + { + "epoch": 0.8965029596557507, + "grad_norm": 2.940751314163208, + "learning_rate": 6.072284780204383e-06, + "loss": 0.8167, + "step": 11094 + }, + { + "epoch": 0.8965837693690781, + "grad_norm": 2.8081979751586914, + "learning_rate": 6.071645633635015e-06, + "loss": 0.9558, + "step": 11095 + }, + { + "epoch": 0.8966645790824057, + "grad_norm": 3.488110303878784, + "learning_rate": 6.071006468710936e-06, + "loss": 0.9361, + "step": 11096 + }, + { + "epoch": 0.8967453887957333, + "grad_norm": 2.4365170001983643, + "learning_rate": 6.070367285443096e-06, + "loss": 0.8705, + "step": 11097 + }, + { + "epoch": 0.8968261985090608, + "grad_norm": 2.6625611782073975, + "learning_rate": 6.069728083842441e-06, + "loss": 1.0411, + "step": 11098 + }, + { + "epoch": 0.8969070082223883, + "grad_norm": 2.440619707107544, + "learning_rate": 6.069088863919919e-06, + "loss": 0.9265, + "step": 11099 + }, + { + "epoch": 0.8969878179357159, + "grad_norm": 2.6538734436035156, + "learning_rate": 6.06844962568648e-06, + "loss": 1.0621, + "step": 11100 + }, + { + "epoch": 0.8970686276490434, + "grad_norm": 2.297936201095581, + "learning_rate": 6.0678103691530686e-06, + "loss": 0.8572, + "step": 11101 + }, + { + "epoch": 0.897149437362371, + "grad_norm": 2.6770808696746826, + "learning_rate": 6.067171094330637e-06, + "loss": 0.9088, + "step": 11102 + }, + { + "epoch": 0.8972302470756985, + "grad_norm": 2.618973970413208, + "learning_rate": 6.0665318012301345e-06, + "loss": 0.9641, + "step": 11103 + }, + { + "epoch": 0.897311056789026, + "grad_norm": 2.47927188873291, + "learning_rate": 6.065892489862509e-06, + "loss": 0.9679, + "step": 11104 + }, + { + "epoch": 0.8973918665023536, + "grad_norm": 2.793241500854492, + "learning_rate": 6.065253160238712e-06, + "loss": 0.9245, + "step": 11105 + }, + { + "epoch": 0.8974726762156812, + "grad_norm": 2.7728309631347656, + "learning_rate": 6.064613812369692e-06, + "loss": 0.9167, + "step": 11106 + }, + { + "epoch": 0.8975534859290086, + "grad_norm": 2.337017774581909, + "learning_rate": 6.063974446266402e-06, + "loss": 0.9126, + "step": 11107 + }, + { + "epoch": 0.8976342956423362, + "grad_norm": 2.579535722732544, + "learning_rate": 6.063335061939789e-06, + "loss": 0.9078, + "step": 11108 + }, + { + "epoch": 0.8977151053556638, + "grad_norm": 2.4813663959503174, + "learning_rate": 6.062695659400807e-06, + "loss": 0.9675, + "step": 11109 + }, + { + "epoch": 0.8977959150689913, + "grad_norm": 2.7406249046325684, + "learning_rate": 6.062056238660408e-06, + "loss": 0.9199, + "step": 11110 + }, + { + "epoch": 0.8978767247823188, + "grad_norm": 2.772345542907715, + "learning_rate": 6.061416799729541e-06, + "loss": 0.9108, + "step": 11111 + }, + { + "epoch": 0.8979575344956464, + "grad_norm": 2.8032472133636475, + "learning_rate": 6.060777342619159e-06, + "loss": 0.9564, + "step": 11112 + }, + { + "epoch": 0.8980383442089739, + "grad_norm": 2.816871404647827, + "learning_rate": 6.060137867340217e-06, + "loss": 0.818, + "step": 11113 + }, + { + "epoch": 0.8981191539223015, + "grad_norm": 2.663496494293213, + "learning_rate": 6.0594983739036614e-06, + "loss": 0.9222, + "step": 11114 + }, + { + "epoch": 0.898199963635629, + "grad_norm": 2.336209774017334, + "learning_rate": 6.0588588623204535e-06, + "loss": 0.8684, + "step": 11115 + }, + { + "epoch": 0.8982807733489565, + "grad_norm": 2.6434648036956787, + "learning_rate": 6.05821933260154e-06, + "loss": 0.9634, + "step": 11116 + }, + { + "epoch": 0.8983615830622841, + "grad_norm": 2.485868215560913, + "learning_rate": 6.057579784757879e-06, + "loss": 0.9125, + "step": 11117 + }, + { + "epoch": 0.8984423927756117, + "grad_norm": 2.5955581665039062, + "learning_rate": 6.05694021880042e-06, + "loss": 0.9674, + "step": 11118 + }, + { + "epoch": 0.8985232024889391, + "grad_norm": 3.049374580383301, + "learning_rate": 6.056300634740121e-06, + "loss": 0.8956, + "step": 11119 + }, + { + "epoch": 0.8986040122022667, + "grad_norm": 2.6581337451934814, + "learning_rate": 6.055661032587934e-06, + "loss": 0.8908, + "step": 11120 + }, + { + "epoch": 0.8986848219155943, + "grad_norm": 2.7783305644989014, + "learning_rate": 6.055021412354817e-06, + "loss": 0.9802, + "step": 11121 + }, + { + "epoch": 0.8987656316289218, + "grad_norm": 2.3142786026000977, + "learning_rate": 6.054381774051721e-06, + "loss": 1.0919, + "step": 11122 + }, + { + "epoch": 0.8988464413422493, + "grad_norm": 2.6475307941436768, + "learning_rate": 6.053742117689604e-06, + "loss": 1.0031, + "step": 11123 + }, + { + "epoch": 0.8989272510555769, + "grad_norm": 3.1276907920837402, + "learning_rate": 6.053102443279422e-06, + "loss": 1.0043, + "step": 11124 + }, + { + "epoch": 0.8990080607689044, + "grad_norm": 2.736304759979248, + "learning_rate": 6.052462750832128e-06, + "loss": 0.9356, + "step": 11125 + }, + { + "epoch": 0.899088870482232, + "grad_norm": 2.513861894607544, + "learning_rate": 6.0518230403586845e-06, + "loss": 0.8544, + "step": 11126 + }, + { + "epoch": 0.8991696801955595, + "grad_norm": 2.8509857654571533, + "learning_rate": 6.0511833118700425e-06, + "loss": 1.0235, + "step": 11127 + }, + { + "epoch": 0.899250489908887, + "grad_norm": 2.5635993480682373, + "learning_rate": 6.05054356537716e-06, + "loss": 0.916, + "step": 11128 + }, + { + "epoch": 0.8993312996222146, + "grad_norm": 2.838994264602661, + "learning_rate": 6.049903800890997e-06, + "loss": 0.8859, + "step": 11129 + }, + { + "epoch": 0.8994121093355422, + "grad_norm": 2.704026699066162, + "learning_rate": 6.04926401842251e-06, + "loss": 0.9902, + "step": 11130 + }, + { + "epoch": 0.8994929190488696, + "grad_norm": 2.9368770122528076, + "learning_rate": 6.048624217982655e-06, + "loss": 0.9803, + "step": 11131 + }, + { + "epoch": 0.8995737287621972, + "grad_norm": 2.286797046661377, + "learning_rate": 6.04798439958239e-06, + "loss": 0.8546, + "step": 11132 + }, + { + "epoch": 0.8996545384755248, + "grad_norm": 2.619764804840088, + "learning_rate": 6.047344563232678e-06, + "loss": 1.0141, + "step": 11133 + }, + { + "epoch": 0.8997353481888523, + "grad_norm": 2.3920695781707764, + "learning_rate": 6.046704708944473e-06, + "loss": 0.8981, + "step": 11134 + }, + { + "epoch": 0.8998161579021798, + "grad_norm": 2.8092894554138184, + "learning_rate": 6.046064836728739e-06, + "loss": 0.9764, + "step": 11135 + }, + { + "epoch": 0.8998969676155074, + "grad_norm": 2.5988481044769287, + "learning_rate": 6.04542494659643e-06, + "loss": 0.9469, + "step": 11136 + }, + { + "epoch": 0.8999777773288349, + "grad_norm": 2.972332000732422, + "learning_rate": 6.044785038558509e-06, + "loss": 0.8987, + "step": 11137 + }, + { + "epoch": 0.9000585870421625, + "grad_norm": 2.5728893280029297, + "learning_rate": 6.044145112625937e-06, + "loss": 1.062, + "step": 11138 + }, + { + "epoch": 0.90013939675549, + "grad_norm": 2.64799165725708, + "learning_rate": 6.04350516880967e-06, + "loss": 0.8855, + "step": 11139 + }, + { + "epoch": 0.9002202064688175, + "grad_norm": 2.9226863384246826, + "learning_rate": 6.042865207120674e-06, + "loss": 0.8502, + "step": 11140 + }, + { + "epoch": 0.9003010161821451, + "grad_norm": 2.4827048778533936, + "learning_rate": 6.042225227569908e-06, + "loss": 0.9605, + "step": 11141 + }, + { + "epoch": 0.9003818258954727, + "grad_norm": 2.5582778453826904, + "learning_rate": 6.04158523016833e-06, + "loss": 0.947, + "step": 11142 + }, + { + "epoch": 0.9004626356088001, + "grad_norm": 2.6642837524414062, + "learning_rate": 6.040945214926906e-06, + "loss": 0.8842, + "step": 11143 + }, + { + "epoch": 0.9005434453221277, + "grad_norm": 2.6373517513275146, + "learning_rate": 6.040305181856597e-06, + "loss": 0.9237, + "step": 11144 + }, + { + "epoch": 0.9006242550354553, + "grad_norm": 2.9530553817749023, + "learning_rate": 6.039665130968365e-06, + "loss": 0.93, + "step": 11145 + }, + { + "epoch": 0.9007050647487828, + "grad_norm": 2.6635923385620117, + "learning_rate": 6.039025062273171e-06, + "loss": 0.843, + "step": 11146 + }, + { + "epoch": 0.9007858744621103, + "grad_norm": 2.7025904655456543, + "learning_rate": 6.038384975781979e-06, + "loss": 0.8554, + "step": 11147 + }, + { + "epoch": 0.9008666841754379, + "grad_norm": 3.593562126159668, + "learning_rate": 6.037744871505753e-06, + "loss": 0.9221, + "step": 11148 + }, + { + "epoch": 0.9009474938887654, + "grad_norm": 2.8435449600219727, + "learning_rate": 6.037104749455457e-06, + "loss": 0.8964, + "step": 11149 + }, + { + "epoch": 0.901028303602093, + "grad_norm": 2.2871532440185547, + "learning_rate": 6.03646460964205e-06, + "loss": 1.0125, + "step": 11150 + }, + { + "epoch": 0.9011091133154205, + "grad_norm": 3.5378856658935547, + "learning_rate": 6.0358244520765015e-06, + "loss": 0.8169, + "step": 11151 + }, + { + "epoch": 0.901189923028748, + "grad_norm": 3.1749320030212402, + "learning_rate": 6.035184276769775e-06, + "loss": 0.9427, + "step": 11152 + }, + { + "epoch": 0.9012707327420756, + "grad_norm": 2.827220916748047, + "learning_rate": 6.034544083732832e-06, + "loss": 0.8491, + "step": 11153 + }, + { + "epoch": 0.9013515424554032, + "grad_norm": 3.2153000831604004, + "learning_rate": 6.033903872976639e-06, + "loss": 0.8981, + "step": 11154 + }, + { + "epoch": 0.9014323521687306, + "grad_norm": 2.887916088104248, + "learning_rate": 6.033263644512162e-06, + "loss": 1.0583, + "step": 11155 + }, + { + "epoch": 0.9015131618820582, + "grad_norm": 2.70062255859375, + "learning_rate": 6.032623398350367e-06, + "loss": 0.9759, + "step": 11156 + }, + { + "epoch": 0.9015939715953858, + "grad_norm": 2.5867743492126465, + "learning_rate": 6.031983134502219e-06, + "loss": 0.8996, + "step": 11157 + }, + { + "epoch": 0.9016747813087133, + "grad_norm": 2.6162171363830566, + "learning_rate": 6.031342852978684e-06, + "loss": 0.8708, + "step": 11158 + }, + { + "epoch": 0.9017555910220408, + "grad_norm": 2.669729709625244, + "learning_rate": 6.030702553790729e-06, + "loss": 0.7751, + "step": 11159 + }, + { + "epoch": 0.9018364007353684, + "grad_norm": 2.3106298446655273, + "learning_rate": 6.030062236949321e-06, + "loss": 0.8672, + "step": 11160 + }, + { + "epoch": 0.9019172104486959, + "grad_norm": 2.2521443367004395, + "learning_rate": 6.029421902465425e-06, + "loss": 0.8974, + "step": 11161 + }, + { + "epoch": 0.9019980201620235, + "grad_norm": 2.7022483348846436, + "learning_rate": 6.028781550350011e-06, + "loss": 0.9219, + "step": 11162 + }, + { + "epoch": 0.902078829875351, + "grad_norm": 3.1653780937194824, + "learning_rate": 6.028141180614047e-06, + "loss": 0.9184, + "step": 11163 + }, + { + "epoch": 0.9021596395886785, + "grad_norm": 2.475628614425659, + "learning_rate": 6.0275007932684986e-06, + "loss": 0.9396, + "step": 11164 + }, + { + "epoch": 0.9022404493020061, + "grad_norm": 3.138230562210083, + "learning_rate": 6.026860388324336e-06, + "loss": 1.0098, + "step": 11165 + }, + { + "epoch": 0.9023212590153337, + "grad_norm": 2.9436910152435303, + "learning_rate": 6.026219965792526e-06, + "loss": 1.0456, + "step": 11166 + }, + { + "epoch": 0.9024020687286611, + "grad_norm": 2.330030918121338, + "learning_rate": 6.025579525684039e-06, + "loss": 1.0196, + "step": 11167 + }, + { + "epoch": 0.9024828784419887, + "grad_norm": 2.46738600730896, + "learning_rate": 6.024939068009843e-06, + "loss": 0.9496, + "step": 11168 + }, + { + "epoch": 0.9025636881553163, + "grad_norm": 2.793869733810425, + "learning_rate": 6.0242985927809085e-06, + "loss": 0.8757, + "step": 11169 + }, + { + "epoch": 0.9026444978686438, + "grad_norm": 3.0422780513763428, + "learning_rate": 6.023658100008205e-06, + "loss": 1.0815, + "step": 11170 + }, + { + "epoch": 0.9027253075819713, + "grad_norm": 2.429168701171875, + "learning_rate": 6.0230175897027035e-06, + "loss": 0.9945, + "step": 11171 + }, + { + "epoch": 0.9028061172952989, + "grad_norm": 2.726804256439209, + "learning_rate": 6.022377061875372e-06, + "loss": 0.9619, + "step": 11172 + }, + { + "epoch": 0.9028869270086264, + "grad_norm": 2.9235785007476807, + "learning_rate": 6.021736516537183e-06, + "loss": 1.0218, + "step": 11173 + }, + { + "epoch": 0.902967736721954, + "grad_norm": 2.6464831829071045, + "learning_rate": 6.0210959536991095e-06, + "loss": 0.9229, + "step": 11174 + }, + { + "epoch": 0.9030485464352815, + "grad_norm": 2.5794496536254883, + "learning_rate": 6.0204553733721185e-06, + "loss": 0.8573, + "step": 11175 + }, + { + "epoch": 0.903129356148609, + "grad_norm": 2.67161226272583, + "learning_rate": 6.019814775567183e-06, + "loss": 0.95, + "step": 11176 + }, + { + "epoch": 0.9032101658619366, + "grad_norm": 2.5570461750030518, + "learning_rate": 6.019174160295277e-06, + "loss": 0.8372, + "step": 11177 + }, + { + "epoch": 0.9032909755752642, + "grad_norm": 2.561553716659546, + "learning_rate": 6.018533527567369e-06, + "loss": 0.9202, + "step": 11178 + }, + { + "epoch": 0.9033717852885916, + "grad_norm": 2.3138413429260254, + "learning_rate": 6.017892877394435e-06, + "loss": 0.8676, + "step": 11179 + }, + { + "epoch": 0.9034525950019192, + "grad_norm": 2.2205593585968018, + "learning_rate": 6.017252209787449e-06, + "loss": 0.8491, + "step": 11180 + }, + { + "epoch": 0.9035334047152468, + "grad_norm": 2.1689064502716064, + "learning_rate": 6.016611524757378e-06, + "loss": 1.0517, + "step": 11181 + }, + { + "epoch": 0.9036142144285744, + "grad_norm": 2.705734968185425, + "learning_rate": 6.015970822315201e-06, + "loss": 0.9101, + "step": 11182 + }, + { + "epoch": 0.9036950241419018, + "grad_norm": 3.0479984283447266, + "learning_rate": 6.015330102471889e-06, + "loss": 0.993, + "step": 11183 + }, + { + "epoch": 0.9037758338552294, + "grad_norm": 3.1683247089385986, + "learning_rate": 6.0146893652384165e-06, + "loss": 0.8702, + "step": 11184 + }, + { + "epoch": 0.903856643568557, + "grad_norm": 2.922452688217163, + "learning_rate": 6.014048610625758e-06, + "loss": 0.8839, + "step": 11185 + }, + { + "epoch": 0.9039374532818845, + "grad_norm": 2.803436040878296, + "learning_rate": 6.013407838644888e-06, + "loss": 0.9096, + "step": 11186 + }, + { + "epoch": 0.904018262995212, + "grad_norm": 2.449906587600708, + "learning_rate": 6.012767049306781e-06, + "loss": 0.9873, + "step": 11187 + }, + { + "epoch": 0.9040990727085396, + "grad_norm": 2.6783127784729004, + "learning_rate": 6.012126242622414e-06, + "loss": 0.9777, + "step": 11188 + }, + { + "epoch": 0.9041798824218671, + "grad_norm": 2.1947245597839355, + "learning_rate": 6.011485418602758e-06, + "loss": 0.992, + "step": 11189 + }, + { + "epoch": 0.9042606921351947, + "grad_norm": 2.6547415256500244, + "learning_rate": 6.010844577258795e-06, + "loss": 0.9082, + "step": 11190 + }, + { + "epoch": 0.9043415018485222, + "grad_norm": 3.241971969604492, + "learning_rate": 6.010203718601497e-06, + "loss": 0.9937, + "step": 11191 + }, + { + "epoch": 0.9044223115618497, + "grad_norm": 2.841972827911377, + "learning_rate": 6.009562842641841e-06, + "loss": 0.95, + "step": 11192 + }, + { + "epoch": 0.9045031212751773, + "grad_norm": 2.440422534942627, + "learning_rate": 6.008921949390805e-06, + "loss": 1.0262, + "step": 11193 + }, + { + "epoch": 0.9045839309885049, + "grad_norm": 2.7049107551574707, + "learning_rate": 6.008281038859364e-06, + "loss": 0.9716, + "step": 11194 + }, + { + "epoch": 0.9046647407018323, + "grad_norm": 2.8333685398101807, + "learning_rate": 6.007640111058495e-06, + "loss": 0.9343, + "step": 11195 + }, + { + "epoch": 0.9047455504151599, + "grad_norm": 3.6632275581359863, + "learning_rate": 6.006999165999179e-06, + "loss": 0.9483, + "step": 11196 + }, + { + "epoch": 0.9048263601284875, + "grad_norm": 3.449188709259033, + "learning_rate": 6.006358203692389e-06, + "loss": 0.8426, + "step": 11197 + }, + { + "epoch": 0.904907169841815, + "grad_norm": 2.5142784118652344, + "learning_rate": 6.005717224149108e-06, + "loss": 0.894, + "step": 11198 + }, + { + "epoch": 0.9049879795551425, + "grad_norm": 2.63183331489563, + "learning_rate": 6.005076227380311e-06, + "loss": 0.8921, + "step": 11199 + }, + { + "epoch": 0.9050687892684701, + "grad_norm": 2.4130797386169434, + "learning_rate": 6.0044352133969774e-06, + "loss": 0.9531, + "step": 11200 + }, + { + "epoch": 0.9051495989817976, + "grad_norm": 2.420818567276001, + "learning_rate": 6.0037941822100865e-06, + "loss": 0.8832, + "step": 11201 + }, + { + "epoch": 0.9052304086951252, + "grad_norm": 2.771743059158325, + "learning_rate": 6.00315313383062e-06, + "loss": 0.8849, + "step": 11202 + }, + { + "epoch": 0.9053112184084527, + "grad_norm": 2.915541172027588, + "learning_rate": 6.002512068269553e-06, + "loss": 0.8743, + "step": 11203 + }, + { + "epoch": 0.9053920281217802, + "grad_norm": 2.7193355560302734, + "learning_rate": 6.001870985537869e-06, + "loss": 1.0206, + "step": 11204 + }, + { + "epoch": 0.9054728378351078, + "grad_norm": 2.4656622409820557, + "learning_rate": 6.0012298856465455e-06, + "loss": 0.8858, + "step": 11205 + }, + { + "epoch": 0.9055536475484354, + "grad_norm": 2.5466904640197754, + "learning_rate": 6.000588768606566e-06, + "loss": 0.9497, + "step": 11206 + }, + { + "epoch": 0.9056344572617628, + "grad_norm": 2.4730398654937744, + "learning_rate": 5.999947634428908e-06, + "loss": 1.0108, + "step": 11207 + }, + { + "epoch": 0.9057152669750904, + "grad_norm": 2.6922435760498047, + "learning_rate": 5.999306483124557e-06, + "loss": 0.9906, + "step": 11208 + }, + { + "epoch": 0.905796076688418, + "grad_norm": 2.407792091369629, + "learning_rate": 5.998665314704488e-06, + "loss": 0.9333, + "step": 11209 + }, + { + "epoch": 0.9058768864017455, + "grad_norm": 2.6304855346679688, + "learning_rate": 5.99802412917969e-06, + "loss": 0.9595, + "step": 11210 + }, + { + "epoch": 0.905957696115073, + "grad_norm": 2.8021180629730225, + "learning_rate": 5.9973829265611395e-06, + "loss": 0.9729, + "step": 11211 + }, + { + "epoch": 0.9060385058284006, + "grad_norm": 2.408714532852173, + "learning_rate": 5.9967417068598214e-06, + "loss": 0.8815, + "step": 11212 + }, + { + "epoch": 0.9061193155417281, + "grad_norm": 2.826852798461914, + "learning_rate": 5.996100470086716e-06, + "loss": 0.968, + "step": 11213 + }, + { + "epoch": 0.9062001252550557, + "grad_norm": 2.453939199447632, + "learning_rate": 5.995459216252808e-06, + "loss": 0.8878, + "step": 11214 + }, + { + "epoch": 0.9062809349683832, + "grad_norm": 2.2264769077301025, + "learning_rate": 5.99481794536908e-06, + "loss": 1.1432, + "step": 11215 + }, + { + "epoch": 0.9063617446817107, + "grad_norm": 2.6396334171295166, + "learning_rate": 5.994176657446517e-06, + "loss": 0.8677, + "step": 11216 + }, + { + "epoch": 0.9064425543950383, + "grad_norm": 2.892902135848999, + "learning_rate": 5.993535352496099e-06, + "loss": 0.8049, + "step": 11217 + }, + { + "epoch": 0.9065233641083659, + "grad_norm": 2.7703239917755127, + "learning_rate": 5.992894030528815e-06, + "loss": 0.9139, + "step": 11218 + }, + { + "epoch": 0.9066041738216933, + "grad_norm": 2.794983148574829, + "learning_rate": 5.992252691555644e-06, + "loss": 0.9116, + "step": 11219 + }, + { + "epoch": 0.9066849835350209, + "grad_norm": 3.235292911529541, + "learning_rate": 5.9916113355875746e-06, + "loss": 0.9596, + "step": 11220 + }, + { + "epoch": 0.9067657932483485, + "grad_norm": 2.62172532081604, + "learning_rate": 5.99096996263559e-06, + "loss": 0.9795, + "step": 11221 + }, + { + "epoch": 0.906846602961676, + "grad_norm": 2.896568775177002, + "learning_rate": 5.990328572710675e-06, + "loss": 0.9722, + "step": 11222 + }, + { + "epoch": 0.9069274126750035, + "grad_norm": 2.6957719326019287, + "learning_rate": 5.989687165823816e-06, + "loss": 0.8538, + "step": 11223 + }, + { + "epoch": 0.9070082223883311, + "grad_norm": 2.6205925941467285, + "learning_rate": 5.989045741985999e-06, + "loss": 0.9752, + "step": 11224 + }, + { + "epoch": 0.9070890321016586, + "grad_norm": 2.3968451023101807, + "learning_rate": 5.9884043012082095e-06, + "loss": 1.0031, + "step": 11225 + }, + { + "epoch": 0.9071698418149862, + "grad_norm": 2.6111690998077393, + "learning_rate": 5.987762843501432e-06, + "loss": 0.9345, + "step": 11226 + }, + { + "epoch": 0.9072506515283137, + "grad_norm": 2.851182222366333, + "learning_rate": 5.987121368876657e-06, + "loss": 0.9331, + "step": 11227 + }, + { + "epoch": 0.9073314612416412, + "grad_norm": 2.383997678756714, + "learning_rate": 5.9864798773448686e-06, + "loss": 0.9757, + "step": 11228 + }, + { + "epoch": 0.9074122709549688, + "grad_norm": 2.7834765911102295, + "learning_rate": 5.985838368917054e-06, + "loss": 1.1084, + "step": 11229 + }, + { + "epoch": 0.9074930806682964, + "grad_norm": 2.503272533416748, + "learning_rate": 5.985196843604202e-06, + "loss": 0.8915, + "step": 11230 + }, + { + "epoch": 0.9075738903816238, + "grad_norm": 2.509212017059326, + "learning_rate": 5.984555301417301e-06, + "loss": 1.0119, + "step": 11231 + }, + { + "epoch": 0.9076547000949514, + "grad_norm": 2.4349122047424316, + "learning_rate": 5.983913742367337e-06, + "loss": 0.8884, + "step": 11232 + }, + { + "epoch": 0.907735509808279, + "grad_norm": 2.9278783798217773, + "learning_rate": 5.983272166465299e-06, + "loss": 0.9587, + "step": 11233 + }, + { + "epoch": 0.9078163195216065, + "grad_norm": 2.4104089736938477, + "learning_rate": 5.982630573722176e-06, + "loss": 0.9787, + "step": 11234 + }, + { + "epoch": 0.907897129234934, + "grad_norm": 2.575124502182007, + "learning_rate": 5.981988964148957e-06, + "loss": 0.9259, + "step": 11235 + }, + { + "epoch": 0.9079779389482616, + "grad_norm": 2.7539727687835693, + "learning_rate": 5.981347337756631e-06, + "loss": 0.9839, + "step": 11236 + }, + { + "epoch": 0.9080587486615891, + "grad_norm": 2.7652857303619385, + "learning_rate": 5.980705694556187e-06, + "loss": 1.0116, + "step": 11237 + }, + { + "epoch": 0.9081395583749167, + "grad_norm": 3.1519527435302734, + "learning_rate": 5.980064034558616e-06, + "loss": 0.8814, + "step": 11238 + }, + { + "epoch": 0.9082203680882442, + "grad_norm": 2.9400460720062256, + "learning_rate": 5.9794223577749085e-06, + "loss": 0.8787, + "step": 11239 + }, + { + "epoch": 0.9083011778015717, + "grad_norm": 2.1733686923980713, + "learning_rate": 5.978780664216051e-06, + "loss": 1.0534, + "step": 11240 + }, + { + "epoch": 0.9083819875148993, + "grad_norm": 2.6477341651916504, + "learning_rate": 5.97813895389304e-06, + "loss": 0.8846, + "step": 11241 + }, + { + "epoch": 0.9084627972282269, + "grad_norm": 2.5101115703582764, + "learning_rate": 5.977497226816863e-06, + "loss": 0.9324, + "step": 11242 + }, + { + "epoch": 0.9085436069415543, + "grad_norm": 2.8830316066741943, + "learning_rate": 5.976855482998512e-06, + "loss": 0.9997, + "step": 11243 + }, + { + "epoch": 0.9086244166548819, + "grad_norm": 2.2825920581817627, + "learning_rate": 5.976213722448976e-06, + "loss": 0.8071, + "step": 11244 + }, + { + "epoch": 0.9087052263682095, + "grad_norm": 2.728428840637207, + "learning_rate": 5.97557194517925e-06, + "loss": 0.9208, + "step": 11245 + }, + { + "epoch": 0.908786036081537, + "grad_norm": 2.7809622287750244, + "learning_rate": 5.974930151200327e-06, + "loss": 0.9794, + "step": 11246 + }, + { + "epoch": 0.9088668457948645, + "grad_norm": 2.585909605026245, + "learning_rate": 5.974288340523196e-06, + "loss": 0.9535, + "step": 11247 + }, + { + "epoch": 0.9089476555081921, + "grad_norm": 2.725247859954834, + "learning_rate": 5.97364651315885e-06, + "loss": 0.9258, + "step": 11248 + }, + { + "epoch": 0.9090284652215196, + "grad_norm": 2.8630080223083496, + "learning_rate": 5.973004669118287e-06, + "loss": 0.8904, + "step": 11249 + }, + { + "epoch": 0.9091092749348472, + "grad_norm": 2.7674100399017334, + "learning_rate": 5.972362808412493e-06, + "loss": 0.8673, + "step": 11250 + }, + { + "epoch": 0.9091900846481747, + "grad_norm": 2.413344383239746, + "learning_rate": 5.971720931052466e-06, + "loss": 0.8533, + "step": 11251 + }, + { + "epoch": 0.9092708943615022, + "grad_norm": 2.512573480606079, + "learning_rate": 5.971079037049199e-06, + "loss": 0.9182, + "step": 11252 + }, + { + "epoch": 0.9093517040748298, + "grad_norm": 2.508984327316284, + "learning_rate": 5.9704371264136865e-06, + "loss": 0.9513, + "step": 11253 + }, + { + "epoch": 0.9094325137881574, + "grad_norm": 2.908663749694824, + "learning_rate": 5.9697951991569205e-06, + "loss": 0.9486, + "step": 11254 + }, + { + "epoch": 0.9095133235014848, + "grad_norm": 3.1995677947998047, + "learning_rate": 5.969153255289899e-06, + "loss": 0.924, + "step": 11255 + }, + { + "epoch": 0.9095941332148124, + "grad_norm": 2.529698133468628, + "learning_rate": 5.9685112948236135e-06, + "loss": 0.9569, + "step": 11256 + }, + { + "epoch": 0.90967494292814, + "grad_norm": 2.919224739074707, + "learning_rate": 5.967869317769062e-06, + "loss": 0.9777, + "step": 11257 + }, + { + "epoch": 0.9097557526414675, + "grad_norm": 2.4557557106018066, + "learning_rate": 5.96722732413724e-06, + "loss": 0.9035, + "step": 11258 + }, + { + "epoch": 0.909836562354795, + "grad_norm": 2.8355343341827393, + "learning_rate": 5.966585313939142e-06, + "loss": 1.1082, + "step": 11259 + }, + { + "epoch": 0.9099173720681226, + "grad_norm": 2.2926454544067383, + "learning_rate": 5.965943287185764e-06, + "loss": 0.8098, + "step": 11260 + }, + { + "epoch": 0.9099981817814501, + "grad_norm": 2.4217917919158936, + "learning_rate": 5.965301243888105e-06, + "loss": 0.985, + "step": 11261 + }, + { + "epoch": 0.9100789914947777, + "grad_norm": 2.621795654296875, + "learning_rate": 5.964659184057157e-06, + "loss": 1.0147, + "step": 11262 + }, + { + "epoch": 0.9101598012081052, + "grad_norm": 2.48290753364563, + "learning_rate": 5.964017107703921e-06, + "loss": 1.0239, + "step": 11263 + }, + { + "epoch": 0.9102406109214327, + "grad_norm": 2.5411014556884766, + "learning_rate": 5.9633750148393924e-06, + "loss": 0.8103, + "step": 11264 + }, + { + "epoch": 0.9103214206347603, + "grad_norm": 2.88592267036438, + "learning_rate": 5.962732905474569e-06, + "loss": 0.966, + "step": 11265 + }, + { + "epoch": 0.9104022303480879, + "grad_norm": 3.0505177974700928, + "learning_rate": 5.962090779620447e-06, + "loss": 0.8293, + "step": 11266 + }, + { + "epoch": 0.9104830400614153, + "grad_norm": 2.4384031295776367, + "learning_rate": 5.961448637288029e-06, + "loss": 0.9133, + "step": 11267 + }, + { + "epoch": 0.9105638497747429, + "grad_norm": 2.8146374225616455, + "learning_rate": 5.960806478488308e-06, + "loss": 0.8945, + "step": 11268 + }, + { + "epoch": 0.9106446594880705, + "grad_norm": 2.360950231552124, + "learning_rate": 5.960164303232287e-06, + "loss": 0.9087, + "step": 11269 + }, + { + "epoch": 0.910725469201398, + "grad_norm": 2.6275992393493652, + "learning_rate": 5.95952211153096e-06, + "loss": 0.9402, + "step": 11270 + }, + { + "epoch": 0.9108062789147255, + "grad_norm": 2.512693166732788, + "learning_rate": 5.958879903395333e-06, + "loss": 0.9689, + "step": 11271 + }, + { + "epoch": 0.9108870886280531, + "grad_norm": 2.736257314682007, + "learning_rate": 5.958237678836399e-06, + "loss": 0.8769, + "step": 11272 + }, + { + "epoch": 0.9109678983413806, + "grad_norm": 2.865622043609619, + "learning_rate": 5.957595437865161e-06, + "loss": 0.8717, + "step": 11273 + }, + { + "epoch": 0.9110487080547082, + "grad_norm": 2.982100486755371, + "learning_rate": 5.956953180492618e-06, + "loss": 0.8607, + "step": 11274 + }, + { + "epoch": 0.9111295177680357, + "grad_norm": 2.5475263595581055, + "learning_rate": 5.956310906729773e-06, + "loss": 0.8857, + "step": 11275 + }, + { + "epoch": 0.9112103274813632, + "grad_norm": 2.3581409454345703, + "learning_rate": 5.955668616587622e-06, + "loss": 1.0149, + "step": 11276 + }, + { + "epoch": 0.9112911371946908, + "grad_norm": 2.7486884593963623, + "learning_rate": 5.95502631007717e-06, + "loss": 0.9528, + "step": 11277 + }, + { + "epoch": 0.9113719469080184, + "grad_norm": 2.519463062286377, + "learning_rate": 5.954383987209416e-06, + "loss": 1.1178, + "step": 11278 + }, + { + "epoch": 0.9114527566213458, + "grad_norm": 2.4370343685150146, + "learning_rate": 5.953741647995361e-06, + "loss": 0.8749, + "step": 11279 + }, + { + "epoch": 0.9115335663346734, + "grad_norm": 2.405346155166626, + "learning_rate": 5.953099292446007e-06, + "loss": 0.8948, + "step": 11280 + }, + { + "epoch": 0.911614376048001, + "grad_norm": 2.662438154220581, + "learning_rate": 5.952456920572358e-06, + "loss": 0.8326, + "step": 11281 + }, + { + "epoch": 0.9116951857613285, + "grad_norm": 3.1241025924682617, + "learning_rate": 5.9518145323854145e-06, + "loss": 0.8969, + "step": 11282 + }, + { + "epoch": 0.911775995474656, + "grad_norm": 3.142529249191284, + "learning_rate": 5.951172127896181e-06, + "loss": 0.9985, + "step": 11283 + }, + { + "epoch": 0.9118568051879836, + "grad_norm": 2.650031566619873, + "learning_rate": 5.950529707115655e-06, + "loss": 0.8894, + "step": 11284 + }, + { + "epoch": 0.9119376149013111, + "grad_norm": 3.0189099311828613, + "learning_rate": 5.949887270054848e-06, + "loss": 1.0169, + "step": 11285 + }, + { + "epoch": 0.9120184246146387, + "grad_norm": 2.3782217502593994, + "learning_rate": 5.949244816724757e-06, + "loss": 0.8812, + "step": 11286 + }, + { + "epoch": 0.9120992343279662, + "grad_norm": 2.6692121028900146, + "learning_rate": 5.948602347136386e-06, + "loss": 0.874, + "step": 11287 + }, + { + "epoch": 0.9121800440412937, + "grad_norm": 2.5997726917266846, + "learning_rate": 5.947959861300742e-06, + "loss": 0.8085, + "step": 11288 + }, + { + "epoch": 0.9122608537546213, + "grad_norm": 2.5190415382385254, + "learning_rate": 5.947317359228828e-06, + "loss": 0.9172, + "step": 11289 + }, + { + "epoch": 0.9123416634679489, + "grad_norm": 3.0348873138427734, + "learning_rate": 5.946674840931647e-06, + "loss": 0.9727, + "step": 11290 + }, + { + "epoch": 0.9124224731812763, + "grad_norm": 2.6810731887817383, + "learning_rate": 5.946032306420205e-06, + "loss": 0.8782, + "step": 11291 + }, + { + "epoch": 0.9125032828946039, + "grad_norm": 2.9404735565185547, + "learning_rate": 5.945389755705508e-06, + "loss": 0.9064, + "step": 11292 + }, + { + "epoch": 0.9125840926079315, + "grad_norm": 2.676630735397339, + "learning_rate": 5.9447471887985595e-06, + "loss": 1.0119, + "step": 11293 + }, + { + "epoch": 0.912664902321259, + "grad_norm": 2.9339253902435303, + "learning_rate": 5.944104605710367e-06, + "loss": 0.8553, + "step": 11294 + }, + { + "epoch": 0.9127457120345865, + "grad_norm": 3.127005100250244, + "learning_rate": 5.943462006451934e-06, + "loss": 0.8845, + "step": 11295 + }, + { + "epoch": 0.9128265217479141, + "grad_norm": 2.6678290367126465, + "learning_rate": 5.942819391034269e-06, + "loss": 0.9765, + "step": 11296 + }, + { + "epoch": 0.9129073314612416, + "grad_norm": 2.41629695892334, + "learning_rate": 5.942176759468378e-06, + "loss": 0.9757, + "step": 11297 + }, + { + "epoch": 0.9129881411745692, + "grad_norm": 2.2940430641174316, + "learning_rate": 5.941534111765267e-06, + "loss": 1.0025, + "step": 11298 + }, + { + "epoch": 0.9130689508878967, + "grad_norm": 2.623856782913208, + "learning_rate": 5.940891447935944e-06, + "loss": 0.9508, + "step": 11299 + }, + { + "epoch": 0.9131497606012242, + "grad_norm": 2.4364163875579834, + "learning_rate": 5.940248767991415e-06, + "loss": 1.0775, + "step": 11300 + }, + { + "epoch": 0.9132305703145518, + "grad_norm": 2.95477032661438, + "learning_rate": 5.939606071942687e-06, + "loss": 0.8752, + "step": 11301 + }, + { + "epoch": 0.9133113800278794, + "grad_norm": 2.120548963546753, + "learning_rate": 5.938963359800771e-06, + "loss": 0.852, + "step": 11302 + }, + { + "epoch": 0.9133921897412068, + "grad_norm": 2.6929080486297607, + "learning_rate": 5.938320631576671e-06, + "loss": 0.9214, + "step": 11303 + }, + { + "epoch": 0.9134729994545344, + "grad_norm": 2.3618075847625732, + "learning_rate": 5.937677887281399e-06, + "loss": 0.9602, + "step": 11304 + }, + { + "epoch": 0.913553809167862, + "grad_norm": 2.6843745708465576, + "learning_rate": 5.937035126925962e-06, + "loss": 0.9806, + "step": 11305 + }, + { + "epoch": 0.9136346188811895, + "grad_norm": 2.4290525913238525, + "learning_rate": 5.936392350521369e-06, + "loss": 0.8757, + "step": 11306 + }, + { + "epoch": 0.913715428594517, + "grad_norm": 2.4888224601745605, + "learning_rate": 5.9357495580786286e-06, + "loss": 0.9865, + "step": 11307 + }, + { + "epoch": 0.9137962383078446, + "grad_norm": 2.39178729057312, + "learning_rate": 5.935106749608752e-06, + "loss": 0.9695, + "step": 11308 + }, + { + "epoch": 0.9138770480211722, + "grad_norm": 2.755352258682251, + "learning_rate": 5.934463925122749e-06, + "loss": 0.9641, + "step": 11309 + }, + { + "epoch": 0.9139578577344997, + "grad_norm": 2.814725875854492, + "learning_rate": 5.933821084631625e-06, + "loss": 0.9578, + "step": 11310 + }, + { + "epoch": 0.9140386674478272, + "grad_norm": 2.35859751701355, + "learning_rate": 5.9331782281463965e-06, + "loss": 0.8058, + "step": 11311 + }, + { + "epoch": 0.9141194771611548, + "grad_norm": 2.6462783813476562, + "learning_rate": 5.932535355678072e-06, + "loss": 0.8071, + "step": 11312 + }, + { + "epoch": 0.9142002868744823, + "grad_norm": 2.5416271686553955, + "learning_rate": 5.93189246723766e-06, + "loss": 1.0333, + "step": 11313 + }, + { + "epoch": 0.9142810965878099, + "grad_norm": 2.3862597942352295, + "learning_rate": 5.931249562836176e-06, + "loss": 0.9642, + "step": 11314 + }, + { + "epoch": 0.9143619063011375, + "grad_norm": 2.9840433597564697, + "learning_rate": 5.9306066424846254e-06, + "loss": 0.8035, + "step": 11315 + }, + { + "epoch": 0.9144427160144649, + "grad_norm": 2.784557342529297, + "learning_rate": 5.929963706194027e-06, + "loss": 0.8911, + "step": 11316 + }, + { + "epoch": 0.9145235257277925, + "grad_norm": 2.5682101249694824, + "learning_rate": 5.929320753975388e-06, + "loss": 0.9524, + "step": 11317 + }, + { + "epoch": 0.9146043354411201, + "grad_norm": 2.7034237384796143, + "learning_rate": 5.9286777858397195e-06, + "loss": 0.9586, + "step": 11318 + }, + { + "epoch": 0.9146851451544475, + "grad_norm": 2.3170664310455322, + "learning_rate": 5.928034801798037e-06, + "loss": 0.8701, + "step": 11319 + }, + { + "epoch": 0.9147659548677751, + "grad_norm": 2.2595629692077637, + "learning_rate": 5.927391801861355e-06, + "loss": 0.9048, + "step": 11320 + }, + { + "epoch": 0.9148467645811027, + "grad_norm": 2.599877119064331, + "learning_rate": 5.926748786040681e-06, + "loss": 0.9804, + "step": 11321 + }, + { + "epoch": 0.9149275742944302, + "grad_norm": 2.9440958499908447, + "learning_rate": 5.926105754347034e-06, + "loss": 0.8105, + "step": 11322 + }, + { + "epoch": 0.9150083840077577, + "grad_norm": 2.644850730895996, + "learning_rate": 5.925462706791421e-06, + "loss": 1.0119, + "step": 11323 + }, + { + "epoch": 0.9150891937210853, + "grad_norm": 3.3473072052001953, + "learning_rate": 5.9248196433848645e-06, + "loss": 0.9653, + "step": 11324 + }, + { + "epoch": 0.9151700034344128, + "grad_norm": 2.77569842338562, + "learning_rate": 5.924176564138372e-06, + "loss": 0.8518, + "step": 11325 + }, + { + "epoch": 0.9152508131477404, + "grad_norm": 2.5223371982574463, + "learning_rate": 5.9235334690629586e-06, + "loss": 0.9585, + "step": 11326 + }, + { + "epoch": 0.915331622861068, + "grad_norm": 2.3707287311553955, + "learning_rate": 5.922890358169642e-06, + "loss": 0.8922, + "step": 11327 + }, + { + "epoch": 0.9154124325743954, + "grad_norm": 3.036858320236206, + "learning_rate": 5.922247231469435e-06, + "loss": 0.945, + "step": 11328 + }, + { + "epoch": 0.915493242287723, + "grad_norm": 2.6230826377868652, + "learning_rate": 5.921604088973352e-06, + "loss": 0.8826, + "step": 11329 + }, + { + "epoch": 0.9155740520010506, + "grad_norm": 2.5088369846343994, + "learning_rate": 5.9209609306924106e-06, + "loss": 0.9406, + "step": 11330 + }, + { + "epoch": 0.915654861714378, + "grad_norm": 2.4708962440490723, + "learning_rate": 5.9203177566376256e-06, + "loss": 0.8927, + "step": 11331 + }, + { + "epoch": 0.9157356714277056, + "grad_norm": 2.6827518939971924, + "learning_rate": 5.919674566820013e-06, + "loss": 1.0397, + "step": 11332 + }, + { + "epoch": 0.9158164811410332, + "grad_norm": 2.4582741260528564, + "learning_rate": 5.919031361250588e-06, + "loss": 0.8563, + "step": 11333 + }, + { + "epoch": 0.9158972908543607, + "grad_norm": 2.6187996864318848, + "learning_rate": 5.9183881399403694e-06, + "loss": 0.8182, + "step": 11334 + }, + { + "epoch": 0.9159781005676882, + "grad_norm": 2.827922821044922, + "learning_rate": 5.917744902900372e-06, + "loss": 0.9741, + "step": 11335 + }, + { + "epoch": 0.9160589102810158, + "grad_norm": 3.082225799560547, + "learning_rate": 5.917101650141616e-06, + "loss": 1.0021, + "step": 11336 + }, + { + "epoch": 0.9161397199943433, + "grad_norm": 2.4827330112457275, + "learning_rate": 5.916458381675113e-06, + "loss": 0.9795, + "step": 11337 + }, + { + "epoch": 0.9162205297076709, + "grad_norm": 3.0935449600219727, + "learning_rate": 5.915815097511887e-06, + "loss": 0.8948, + "step": 11338 + }, + { + "epoch": 0.9163013394209985, + "grad_norm": 2.7609193325042725, + "learning_rate": 5.915171797662952e-06, + "loss": 1.0349, + "step": 11339 + }, + { + "epoch": 0.9163821491343259, + "grad_norm": 2.6350550651550293, + "learning_rate": 5.914528482139328e-06, + "loss": 0.973, + "step": 11340 + }, + { + "epoch": 0.9164629588476535, + "grad_norm": 2.7748446464538574, + "learning_rate": 5.913885150952032e-06, + "loss": 0.879, + "step": 11341 + }, + { + "epoch": 0.9165437685609811, + "grad_norm": 2.544610023498535, + "learning_rate": 5.9132418041120845e-06, + "loss": 0.8538, + "step": 11342 + }, + { + "epoch": 0.9166245782743085, + "grad_norm": 2.6684980392456055, + "learning_rate": 5.912598441630501e-06, + "loss": 0.893, + "step": 11343 + }, + { + "epoch": 0.9167053879876361, + "grad_norm": 2.9629220962524414, + "learning_rate": 5.911955063518307e-06, + "loss": 0.8841, + "step": 11344 + }, + { + "epoch": 0.9167861977009637, + "grad_norm": 2.4688479900360107, + "learning_rate": 5.9113116697865145e-06, + "loss": 0.9434, + "step": 11345 + }, + { + "epoch": 0.9168670074142912, + "grad_norm": 2.7888643741607666, + "learning_rate": 5.910668260446149e-06, + "loss": 0.8742, + "step": 11346 + }, + { + "epoch": 0.9169478171276187, + "grad_norm": 2.280099630355835, + "learning_rate": 5.910024835508228e-06, + "loss": 0.8853, + "step": 11347 + }, + { + "epoch": 0.9170286268409463, + "grad_norm": 2.539198160171509, + "learning_rate": 5.9093813949837735e-06, + "loss": 1.0329, + "step": 11348 + }, + { + "epoch": 0.9171094365542738, + "grad_norm": 2.62003493309021, + "learning_rate": 5.908737938883803e-06, + "loss": 0.9036, + "step": 11349 + }, + { + "epoch": 0.9171902462676014, + "grad_norm": 2.6804158687591553, + "learning_rate": 5.908094467219341e-06, + "loss": 0.9506, + "step": 11350 + }, + { + "epoch": 0.917271055980929, + "grad_norm": 2.666212558746338, + "learning_rate": 5.907450980001405e-06, + "loss": 0.9333, + "step": 11351 + }, + { + "epoch": 0.9173518656942564, + "grad_norm": 3.5210061073303223, + "learning_rate": 5.90680747724102e-06, + "loss": 0.8667, + "step": 11352 + }, + { + "epoch": 0.917432675407584, + "grad_norm": 2.6402230262756348, + "learning_rate": 5.906163958949205e-06, + "loss": 0.8378, + "step": 11353 + }, + { + "epoch": 0.9175134851209116, + "grad_norm": 3.05657696723938, + "learning_rate": 5.905520425136983e-06, + "loss": 0.9953, + "step": 11354 + }, + { + "epoch": 0.917594294834239, + "grad_norm": 2.8205697536468506, + "learning_rate": 5.904876875815376e-06, + "loss": 0.9638, + "step": 11355 + }, + { + "epoch": 0.9176751045475666, + "grad_norm": 2.873640775680542, + "learning_rate": 5.904233310995409e-06, + "loss": 0.8999, + "step": 11356 + }, + { + "epoch": 0.9177559142608942, + "grad_norm": 2.755690813064575, + "learning_rate": 5.9035897306880986e-06, + "loss": 0.8995, + "step": 11357 + }, + { + "epoch": 0.9178367239742217, + "grad_norm": 2.4933114051818848, + "learning_rate": 5.902946134904473e-06, + "loss": 0.846, + "step": 11358 + }, + { + "epoch": 0.9179175336875492, + "grad_norm": 2.5199472904205322, + "learning_rate": 5.9023025236555545e-06, + "loss": 0.7741, + "step": 11359 + }, + { + "epoch": 0.9179983434008768, + "grad_norm": 2.5322792530059814, + "learning_rate": 5.901658896952365e-06, + "loss": 0.787, + "step": 11360 + }, + { + "epoch": 0.9180791531142043, + "grad_norm": 2.953186273574829, + "learning_rate": 5.901015254805929e-06, + "loss": 0.9253, + "step": 11361 + }, + { + "epoch": 0.9181599628275319, + "grad_norm": 2.3440465927124023, + "learning_rate": 5.900371597227271e-06, + "loss": 0.9632, + "step": 11362 + }, + { + "epoch": 0.9182407725408595, + "grad_norm": 2.270451307296753, + "learning_rate": 5.899727924227416e-06, + "loss": 1.0529, + "step": 11363 + }, + { + "epoch": 0.9183215822541869, + "grad_norm": 2.456655502319336, + "learning_rate": 5.899084235817387e-06, + "loss": 0.9195, + "step": 11364 + }, + { + "epoch": 0.9184023919675145, + "grad_norm": 2.6576099395751953, + "learning_rate": 5.89844053200821e-06, + "loss": 0.9569, + "step": 11365 + }, + { + "epoch": 0.9184832016808421, + "grad_norm": 3.1898820400238037, + "learning_rate": 5.89779681281091e-06, + "loss": 0.8924, + "step": 11366 + }, + { + "epoch": 0.9185640113941695, + "grad_norm": 2.303389310836792, + "learning_rate": 5.8971530782365105e-06, + "loss": 0.9054, + "step": 11367 + }, + { + "epoch": 0.9186448211074971, + "grad_norm": 2.348397731781006, + "learning_rate": 5.896509328296038e-06, + "loss": 0.8843, + "step": 11368 + }, + { + "epoch": 0.9187256308208247, + "grad_norm": 2.2865302562713623, + "learning_rate": 5.895865563000521e-06, + "loss": 0.8138, + "step": 11369 + }, + { + "epoch": 0.9188064405341522, + "grad_norm": 2.529402256011963, + "learning_rate": 5.895221782360983e-06, + "loss": 0.8935, + "step": 11370 + }, + { + "epoch": 0.9188872502474797, + "grad_norm": 2.8028602600097656, + "learning_rate": 5.89457798638845e-06, + "loss": 1.0034, + "step": 11371 + }, + { + "epoch": 0.9189680599608073, + "grad_norm": 2.6138877868652344, + "learning_rate": 5.893934175093951e-06, + "loss": 0.944, + "step": 11372 + }, + { + "epoch": 0.9190488696741348, + "grad_norm": 2.643620729446411, + "learning_rate": 5.893290348488512e-06, + "loss": 0.7615, + "step": 11373 + }, + { + "epoch": 0.9191296793874624, + "grad_norm": 2.6800639629364014, + "learning_rate": 5.892646506583158e-06, + "loss": 0.9227, + "step": 11374 + }, + { + "epoch": 0.91921048910079, + "grad_norm": 2.541599988937378, + "learning_rate": 5.89200264938892e-06, + "loss": 0.9378, + "step": 11375 + }, + { + "epoch": 0.9192912988141174, + "grad_norm": 2.6753969192504883, + "learning_rate": 5.891358776916822e-06, + "loss": 0.9243, + "step": 11376 + }, + { + "epoch": 0.919372108527445, + "grad_norm": 2.499925374984741, + "learning_rate": 5.890714889177895e-06, + "loss": 0.9163, + "step": 11377 + }, + { + "epoch": 0.9194529182407726, + "grad_norm": 2.6245625019073486, + "learning_rate": 5.890070986183168e-06, + "loss": 1.0037, + "step": 11378 + }, + { + "epoch": 0.9195337279541, + "grad_norm": 2.6513335704803467, + "learning_rate": 5.889427067943665e-06, + "loss": 0.8895, + "step": 11379 + }, + { + "epoch": 0.9196145376674276, + "grad_norm": 2.487545967102051, + "learning_rate": 5.8887831344704195e-06, + "loss": 0.8848, + "step": 11380 + }, + { + "epoch": 0.9196953473807552, + "grad_norm": 2.648772954940796, + "learning_rate": 5.888139185774459e-06, + "loss": 0.7962, + "step": 11381 + }, + { + "epoch": 0.9197761570940827, + "grad_norm": 2.3208978176116943, + "learning_rate": 5.887495221866811e-06, + "loss": 1.0391, + "step": 11382 + }, + { + "epoch": 0.9198569668074102, + "grad_norm": 2.3278167247772217, + "learning_rate": 5.8868512427585064e-06, + "loss": 0.8511, + "step": 11383 + }, + { + "epoch": 0.9199377765207378, + "grad_norm": 2.843883514404297, + "learning_rate": 5.886207248460575e-06, + "loss": 0.9639, + "step": 11384 + }, + { + "epoch": 0.9200185862340653, + "grad_norm": 2.954684257507324, + "learning_rate": 5.885563238984046e-06, + "loss": 0.8697, + "step": 11385 + }, + { + "epoch": 0.9200993959473929, + "grad_norm": 3.011845827102661, + "learning_rate": 5.884919214339952e-06, + "loss": 0.9803, + "step": 11386 + }, + { + "epoch": 0.9201802056607205, + "grad_norm": 2.6969895362854004, + "learning_rate": 5.884275174539324e-06, + "loss": 0.934, + "step": 11387 + }, + { + "epoch": 0.9202610153740479, + "grad_norm": 2.4887447357177734, + "learning_rate": 5.883631119593187e-06, + "loss": 0.9138, + "step": 11388 + }, + { + "epoch": 0.9203418250873755, + "grad_norm": 2.7332000732421875, + "learning_rate": 5.88298704951258e-06, + "loss": 0.8979, + "step": 11389 + }, + { + "epoch": 0.9204226348007031, + "grad_norm": 2.726731061935425, + "learning_rate": 5.8823429643085275e-06, + "loss": 0.8644, + "step": 11390 + }, + { + "epoch": 0.9205034445140305, + "grad_norm": 2.412320852279663, + "learning_rate": 5.881698863992067e-06, + "loss": 0.9154, + "step": 11391 + }, + { + "epoch": 0.9205842542273581, + "grad_norm": 2.885359048843384, + "learning_rate": 5.881054748574226e-06, + "loss": 1.0249, + "step": 11392 + }, + { + "epoch": 0.9206650639406857, + "grad_norm": 2.8027565479278564, + "learning_rate": 5.880410618066038e-06, + "loss": 0.8415, + "step": 11393 + }, + { + "epoch": 0.9207458736540132, + "grad_norm": 3.0334484577178955, + "learning_rate": 5.879766472478535e-06, + "loss": 0.919, + "step": 11394 + }, + { + "epoch": 0.9208266833673407, + "grad_norm": 2.489262342453003, + "learning_rate": 5.879122311822752e-06, + "loss": 0.9561, + "step": 11395 + }, + { + "epoch": 0.9209074930806683, + "grad_norm": 2.3767006397247314, + "learning_rate": 5.878478136109719e-06, + "loss": 1.0179, + "step": 11396 + }, + { + "epoch": 0.9209883027939958, + "grad_norm": 2.608154058456421, + "learning_rate": 5.877833945350471e-06, + "loss": 0.9476, + "step": 11397 + }, + { + "epoch": 0.9210691125073234, + "grad_norm": 2.684385061264038, + "learning_rate": 5.87718973955604e-06, + "loss": 0.9262, + "step": 11398 + }, + { + "epoch": 0.921149922220651, + "grad_norm": 2.503622055053711, + "learning_rate": 5.876545518737462e-06, + "loss": 0.9795, + "step": 11399 + }, + { + "epoch": 0.9212307319339784, + "grad_norm": 2.7895336151123047, + "learning_rate": 5.875901282905768e-06, + "loss": 0.8742, + "step": 11400 + }, + { + "epoch": 0.921311541647306, + "grad_norm": 2.5915136337280273, + "learning_rate": 5.875257032071995e-06, + "loss": 0.8651, + "step": 11401 + }, + { + "epoch": 0.9213923513606336, + "grad_norm": 2.8383326530456543, + "learning_rate": 5.874612766247174e-06, + "loss": 0.9806, + "step": 11402 + }, + { + "epoch": 0.921473161073961, + "grad_norm": 2.4563441276550293, + "learning_rate": 5.873968485442345e-06, + "loss": 0.7725, + "step": 11403 + }, + { + "epoch": 0.9215539707872886, + "grad_norm": 2.784400463104248, + "learning_rate": 5.8733241896685366e-06, + "loss": 1.0246, + "step": 11404 + }, + { + "epoch": 0.9216347805006162, + "grad_norm": 2.898120880126953, + "learning_rate": 5.8726798789367886e-06, + "loss": 0.9011, + "step": 11405 + }, + { + "epoch": 0.9217155902139437, + "grad_norm": 3.285613775253296, + "learning_rate": 5.872035553258136e-06, + "loss": 0.9468, + "step": 11406 + }, + { + "epoch": 0.9217963999272712, + "grad_norm": 2.813210964202881, + "learning_rate": 5.871391212643614e-06, + "loss": 0.8887, + "step": 11407 + }, + { + "epoch": 0.9218772096405988, + "grad_norm": 2.130511999130249, + "learning_rate": 5.870746857104256e-06, + "loss": 0.954, + "step": 11408 + }, + { + "epoch": 0.9219580193539263, + "grad_norm": 2.8262295722961426, + "learning_rate": 5.870102486651102e-06, + "loss": 0.8483, + "step": 11409 + }, + { + "epoch": 0.9220388290672539, + "grad_norm": 2.5086472034454346, + "learning_rate": 5.8694581012951866e-06, + "loss": 0.9501, + "step": 11410 + }, + { + "epoch": 0.9221196387805815, + "grad_norm": 3.1171772480010986, + "learning_rate": 5.868813701047549e-06, + "loss": 0.8756, + "step": 11411 + }, + { + "epoch": 0.9222004484939089, + "grad_norm": 2.7673299312591553, + "learning_rate": 5.868169285919222e-06, + "loss": 1.1057, + "step": 11412 + }, + { + "epoch": 0.9222812582072365, + "grad_norm": 2.42545747756958, + "learning_rate": 5.867524855921246e-06, + "loss": 0.9058, + "step": 11413 + }, + { + "epoch": 0.9223620679205641, + "grad_norm": 2.694911479949951, + "learning_rate": 5.866880411064657e-06, + "loss": 0.9866, + "step": 11414 + }, + { + "epoch": 0.9224428776338915, + "grad_norm": 2.726897716522217, + "learning_rate": 5.866235951360495e-06, + "loss": 0.9312, + "step": 11415 + }, + { + "epoch": 0.9225236873472191, + "grad_norm": 2.3903236389160156, + "learning_rate": 5.865591476819794e-06, + "loss": 1.0821, + "step": 11416 + }, + { + "epoch": 0.9226044970605467, + "grad_norm": 2.8196592330932617, + "learning_rate": 5.864946987453598e-06, + "loss": 0.8985, + "step": 11417 + }, + { + "epoch": 0.9226853067738742, + "grad_norm": 2.540738344192505, + "learning_rate": 5.864302483272939e-06, + "loss": 0.8386, + "step": 11418 + }, + { + "epoch": 0.9227661164872017, + "grad_norm": 2.7647039890289307, + "learning_rate": 5.863657964288863e-06, + "loss": 1.019, + "step": 11419 + }, + { + "epoch": 0.9228469262005293, + "grad_norm": 2.655989170074463, + "learning_rate": 5.8630134305124035e-06, + "loss": 0.8882, + "step": 11420 + }, + { + "epoch": 0.9229277359138568, + "grad_norm": 2.9405517578125, + "learning_rate": 5.8623688819546e-06, + "loss": 1.0154, + "step": 11421 + }, + { + "epoch": 0.9230085456271844, + "grad_norm": 3.0355875492095947, + "learning_rate": 5.861724318626495e-06, + "loss": 0.997, + "step": 11422 + }, + { + "epoch": 0.923089355340512, + "grad_norm": 2.91152024269104, + "learning_rate": 5.861079740539128e-06, + "loss": 0.9131, + "step": 11423 + }, + { + "epoch": 0.9231701650538394, + "grad_norm": 2.527146339416504, + "learning_rate": 5.860435147703536e-06, + "loss": 1.0204, + "step": 11424 + }, + { + "epoch": 0.923250974767167, + "grad_norm": 3.000993251800537, + "learning_rate": 5.8597905401307634e-06, + "loss": 0.9303, + "step": 11425 + }, + { + "epoch": 0.9233317844804946, + "grad_norm": 2.335820198059082, + "learning_rate": 5.8591459178318465e-06, + "loss": 0.8627, + "step": 11426 + }, + { + "epoch": 0.923412594193822, + "grad_norm": 2.655054807662964, + "learning_rate": 5.85850128081783e-06, + "loss": 0.8509, + "step": 11427 + }, + { + "epoch": 0.9234934039071496, + "grad_norm": 2.9699034690856934, + "learning_rate": 5.857856629099752e-06, + "loss": 0.9142, + "step": 11428 + }, + { + "epoch": 0.9235742136204772, + "grad_norm": 2.5918145179748535, + "learning_rate": 5.857211962688656e-06, + "loss": 1.0522, + "step": 11429 + }, + { + "epoch": 0.9236550233338047, + "grad_norm": 2.4602110385894775, + "learning_rate": 5.856567281595582e-06, + "loss": 0.8648, + "step": 11430 + }, + { + "epoch": 0.9237358330471322, + "grad_norm": 2.568098783493042, + "learning_rate": 5.855922585831573e-06, + "loss": 0.8071, + "step": 11431 + }, + { + "epoch": 0.9238166427604598, + "grad_norm": 2.309749126434326, + "learning_rate": 5.85527787540767e-06, + "loss": 0.9676, + "step": 11432 + }, + { + "epoch": 0.9238974524737873, + "grad_norm": 2.800462245941162, + "learning_rate": 5.8546331503349185e-06, + "loss": 0.8561, + "step": 11433 + }, + { + "epoch": 0.9239782621871149, + "grad_norm": 2.488215446472168, + "learning_rate": 5.853988410624356e-06, + "loss": 0.8526, + "step": 11434 + }, + { + "epoch": 0.9240590719004425, + "grad_norm": 3.203184127807617, + "learning_rate": 5.853343656287029e-06, + "loss": 0.892, + "step": 11435 + }, + { + "epoch": 0.9241398816137699, + "grad_norm": 2.5497629642486572, + "learning_rate": 5.852698887333979e-06, + "loss": 0.942, + "step": 11436 + }, + { + "epoch": 0.9242206913270975, + "grad_norm": 2.2810022830963135, + "learning_rate": 5.85205410377625e-06, + "loss": 0.9922, + "step": 11437 + }, + { + "epoch": 0.9243015010404251, + "grad_norm": 3.3598952293395996, + "learning_rate": 5.851409305624886e-06, + "loss": 0.9115, + "step": 11438 + }, + { + "epoch": 0.9243823107537527, + "grad_norm": 2.994471788406372, + "learning_rate": 5.850764492890929e-06, + "loss": 0.9589, + "step": 11439 + }, + { + "epoch": 0.9244631204670801, + "grad_norm": 2.88105845451355, + "learning_rate": 5.850119665585427e-06, + "loss": 0.9863, + "step": 11440 + }, + { + "epoch": 0.9245439301804077, + "grad_norm": 2.943375825881958, + "learning_rate": 5.8494748237194184e-06, + "loss": 0.8862, + "step": 11441 + }, + { + "epoch": 0.9246247398937353, + "grad_norm": 2.7711851596832275, + "learning_rate": 5.8488299673039525e-06, + "loss": 0.8328, + "step": 11442 + }, + { + "epoch": 0.9247055496070627, + "grad_norm": 2.6126210689544678, + "learning_rate": 5.848185096350073e-06, + "loss": 1.066, + "step": 11443 + }, + { + "epoch": 0.9247863593203903, + "grad_norm": 2.4791460037231445, + "learning_rate": 5.847540210868825e-06, + "loss": 1.0345, + "step": 11444 + }, + { + "epoch": 0.9248671690337179, + "grad_norm": 2.5218231678009033, + "learning_rate": 5.846895310871252e-06, + "loss": 0.9926, + "step": 11445 + }, + { + "epoch": 0.9249479787470454, + "grad_norm": 2.2506418228149414, + "learning_rate": 5.846250396368403e-06, + "loss": 0.9138, + "step": 11446 + }, + { + "epoch": 0.925028788460373, + "grad_norm": 2.818552017211914, + "learning_rate": 5.8456054673713215e-06, + "loss": 0.9325, + "step": 11447 + }, + { + "epoch": 0.9251095981737005, + "grad_norm": 2.802095413208008, + "learning_rate": 5.844960523891054e-06, + "loss": 0.8851, + "step": 11448 + }, + { + "epoch": 0.925190407887028, + "grad_norm": 2.4871411323547363, + "learning_rate": 5.844315565938645e-06, + "loss": 0.8751, + "step": 11449 + }, + { + "epoch": 0.9252712176003556, + "grad_norm": 2.527529001235962, + "learning_rate": 5.843670593525146e-06, + "loss": 0.9122, + "step": 11450 + }, + { + "epoch": 0.9253520273136832, + "grad_norm": 2.765439987182617, + "learning_rate": 5.8430256066616e-06, + "loss": 0.9113, + "step": 11451 + }, + { + "epoch": 0.9254328370270106, + "grad_norm": 2.457642078399658, + "learning_rate": 5.842380605359054e-06, + "loss": 0.9469, + "step": 11452 + }, + { + "epoch": 0.9255136467403382, + "grad_norm": 2.3127601146698, + "learning_rate": 5.841735589628556e-06, + "loss": 0.8463, + "step": 11453 + }, + { + "epoch": 0.9255944564536658, + "grad_norm": 2.3116958141326904, + "learning_rate": 5.841090559481155e-06, + "loss": 1.0376, + "step": 11454 + }, + { + "epoch": 0.9256752661669932, + "grad_norm": 2.455389976501465, + "learning_rate": 5.840445514927896e-06, + "loss": 1.0043, + "step": 11455 + }, + { + "epoch": 0.9257560758803208, + "grad_norm": 2.7778866291046143, + "learning_rate": 5.839800455979829e-06, + "loss": 1.0258, + "step": 11456 + }, + { + "epoch": 0.9258368855936484, + "grad_norm": 2.987760305404663, + "learning_rate": 5.839155382648003e-06, + "loss": 0.9582, + "step": 11457 + }, + { + "epoch": 0.9259176953069759, + "grad_norm": 2.9499425888061523, + "learning_rate": 5.838510294943465e-06, + "loss": 0.858, + "step": 11458 + }, + { + "epoch": 0.9259985050203035, + "grad_norm": 2.903751850128174, + "learning_rate": 5.837865192877263e-06, + "loss": 0.815, + "step": 11459 + }, + { + "epoch": 0.926079314733631, + "grad_norm": 2.858501434326172, + "learning_rate": 5.837220076460449e-06, + "loss": 0.831, + "step": 11460 + }, + { + "epoch": 0.9261601244469585, + "grad_norm": 2.4036664962768555, + "learning_rate": 5.83657494570407e-06, + "loss": 0.8755, + "step": 11461 + }, + { + "epoch": 0.9262409341602861, + "grad_norm": 2.3154263496398926, + "learning_rate": 5.835929800619177e-06, + "loss": 0.9799, + "step": 11462 + }, + { + "epoch": 0.9263217438736137, + "grad_norm": 2.4877054691314697, + "learning_rate": 5.835284641216816e-06, + "loss": 0.8289, + "step": 11463 + }, + { + "epoch": 0.9264025535869411, + "grad_norm": 2.2633888721466064, + "learning_rate": 5.8346394675080446e-06, + "loss": 1.1154, + "step": 11464 + }, + { + "epoch": 0.9264833633002687, + "grad_norm": 2.742274045944214, + "learning_rate": 5.8339942795039055e-06, + "loss": 0.9021, + "step": 11465 + }, + { + "epoch": 0.9265641730135963, + "grad_norm": 2.681795120239258, + "learning_rate": 5.833349077215452e-06, + "loss": 0.9081, + "step": 11466 + }, + { + "epoch": 0.9266449827269237, + "grad_norm": 2.5320637226104736, + "learning_rate": 5.832703860653736e-06, + "loss": 0.9118, + "step": 11467 + }, + { + "epoch": 0.9267257924402513, + "grad_norm": 2.741290330886841, + "learning_rate": 5.832058629829808e-06, + "loss": 0.9423, + "step": 11468 + }, + { + "epoch": 0.9268066021535789, + "grad_norm": 2.9602246284484863, + "learning_rate": 5.831413384754716e-06, + "loss": 0.903, + "step": 11469 + }, + { + "epoch": 0.9268874118669064, + "grad_norm": 2.332145929336548, + "learning_rate": 5.8307681254395165e-06, + "loss": 0.9625, + "step": 11470 + }, + { + "epoch": 0.926968221580234, + "grad_norm": 2.47112774848938, + "learning_rate": 5.830122851895259e-06, + "loss": 0.9085, + "step": 11471 + }, + { + "epoch": 0.9270490312935615, + "grad_norm": 2.4056050777435303, + "learning_rate": 5.8294775641329945e-06, + "loss": 0.9285, + "step": 11472 + }, + { + "epoch": 0.927129841006889, + "grad_norm": 2.653259754180908, + "learning_rate": 5.8288322621637776e-06, + "loss": 0.9261, + "step": 11473 + }, + { + "epoch": 0.9272106507202166, + "grad_norm": 2.3907012939453125, + "learning_rate": 5.8281869459986585e-06, + "loss": 0.9271, + "step": 11474 + }, + { + "epoch": 0.9272914604335442, + "grad_norm": 3.4400665760040283, + "learning_rate": 5.82754161564869e-06, + "loss": 0.964, + "step": 11475 + }, + { + "epoch": 0.9273722701468716, + "grad_norm": 3.1091995239257812, + "learning_rate": 5.826896271124928e-06, + "loss": 0.8983, + "step": 11476 + }, + { + "epoch": 0.9274530798601992, + "grad_norm": 2.6850452423095703, + "learning_rate": 5.826250912438421e-06, + "loss": 0.9606, + "step": 11477 + }, + { + "epoch": 0.9275338895735268, + "grad_norm": 2.686000108718872, + "learning_rate": 5.8256055396002275e-06, + "loss": 0.9391, + "step": 11478 + }, + { + "epoch": 0.9276146992868542, + "grad_norm": 2.9149484634399414, + "learning_rate": 5.824960152621397e-06, + "loss": 0.9082, + "step": 11479 + }, + { + "epoch": 0.9276955090001818, + "grad_norm": 2.213421583175659, + "learning_rate": 5.824314751512985e-06, + "loss": 0.9635, + "step": 11480 + }, + { + "epoch": 0.9277763187135094, + "grad_norm": 2.699484348297119, + "learning_rate": 5.823669336286046e-06, + "loss": 0.9009, + "step": 11481 + }, + { + "epoch": 0.9278571284268369, + "grad_norm": 2.49141526222229, + "learning_rate": 5.823023906951636e-06, + "loss": 0.9613, + "step": 11482 + }, + { + "epoch": 0.9279379381401645, + "grad_norm": 2.307377815246582, + "learning_rate": 5.822378463520805e-06, + "loss": 0.8796, + "step": 11483 + }, + { + "epoch": 0.928018747853492, + "grad_norm": 2.557276487350464, + "learning_rate": 5.821733006004613e-06, + "loss": 0.9419, + "step": 11484 + }, + { + "epoch": 0.9280995575668195, + "grad_norm": 2.382101058959961, + "learning_rate": 5.821087534414112e-06, + "loss": 0.9555, + "step": 11485 + }, + { + "epoch": 0.9281803672801471, + "grad_norm": 2.5004258155822754, + "learning_rate": 5.820442048760357e-06, + "loss": 0.9311, + "step": 11486 + }, + { + "epoch": 0.9282611769934747, + "grad_norm": 3.6078615188598633, + "learning_rate": 5.8197965490544064e-06, + "loss": 0.9319, + "step": 11487 + }, + { + "epoch": 0.9283419867068021, + "grad_norm": 2.3218090534210205, + "learning_rate": 5.819151035307314e-06, + "loss": 0.9341, + "step": 11488 + }, + { + "epoch": 0.9284227964201297, + "grad_norm": 2.539309501647949, + "learning_rate": 5.818505507530137e-06, + "loss": 0.933, + "step": 11489 + }, + { + "epoch": 0.9285036061334573, + "grad_norm": 2.954179048538208, + "learning_rate": 5.8178599657339305e-06, + "loss": 0.9342, + "step": 11490 + }, + { + "epoch": 0.9285844158467847, + "grad_norm": 2.8786184787750244, + "learning_rate": 5.817214409929751e-06, + "loss": 0.8229, + "step": 11491 + }, + { + "epoch": 0.9286652255601123, + "grad_norm": 2.192934036254883, + "learning_rate": 5.816568840128658e-06, + "loss": 1.0281, + "step": 11492 + }, + { + "epoch": 0.9287460352734399, + "grad_norm": 2.5164315700531006, + "learning_rate": 5.815923256341704e-06, + "loss": 0.9558, + "step": 11493 + }, + { + "epoch": 0.9288268449867674, + "grad_norm": 2.596343755722046, + "learning_rate": 5.815277658579951e-06, + "loss": 0.9164, + "step": 11494 + }, + { + "epoch": 0.928907654700095, + "grad_norm": 3.173933982849121, + "learning_rate": 5.8146320468544536e-06, + "loss": 1.0063, + "step": 11495 + }, + { + "epoch": 0.9289884644134225, + "grad_norm": 2.6160390377044678, + "learning_rate": 5.81398642117627e-06, + "loss": 0.8698, + "step": 11496 + }, + { + "epoch": 0.92906927412675, + "grad_norm": 2.4642016887664795, + "learning_rate": 5.8133407815564595e-06, + "loss": 0.8666, + "step": 11497 + }, + { + "epoch": 0.9291500838400776, + "grad_norm": 2.741631507873535, + "learning_rate": 5.812695128006079e-06, + "loss": 0.9123, + "step": 11498 + }, + { + "epoch": 0.9292308935534052, + "grad_norm": 2.4259939193725586, + "learning_rate": 5.812049460536187e-06, + "loss": 1.0552, + "step": 11499 + }, + { + "epoch": 0.9293117032667326, + "grad_norm": 2.5590362548828125, + "learning_rate": 5.811403779157844e-06, + "loss": 0.973, + "step": 11500 + }, + { + "epoch": 0.9293925129800602, + "grad_norm": 2.7256057262420654, + "learning_rate": 5.810758083882107e-06, + "loss": 1.0253, + "step": 11501 + }, + { + "epoch": 0.9294733226933878, + "grad_norm": 2.9486336708068848, + "learning_rate": 5.810112374720034e-06, + "loss": 0.9112, + "step": 11502 + }, + { + "epoch": 0.9295541324067152, + "grad_norm": 2.5152158737182617, + "learning_rate": 5.809466651682688e-06, + "loss": 0.9632, + "step": 11503 + }, + { + "epoch": 0.9296349421200428, + "grad_norm": 2.4745943546295166, + "learning_rate": 5.808820914781127e-06, + "loss": 0.8528, + "step": 11504 + }, + { + "epoch": 0.9297157518333704, + "grad_norm": 2.594898223876953, + "learning_rate": 5.80817516402641e-06, + "loss": 0.9564, + "step": 11505 + }, + { + "epoch": 0.9297965615466979, + "grad_norm": 2.7474186420440674, + "learning_rate": 5.807529399429599e-06, + "loss": 0.9599, + "step": 11506 + }, + { + "epoch": 0.9298773712600255, + "grad_norm": 2.5169644355773926, + "learning_rate": 5.806883621001754e-06, + "loss": 0.9271, + "step": 11507 + }, + { + "epoch": 0.929958180973353, + "grad_norm": 2.7150564193725586, + "learning_rate": 5.806237828753935e-06, + "loss": 0.8374, + "step": 11508 + }, + { + "epoch": 0.9300389906866805, + "grad_norm": 2.2712574005126953, + "learning_rate": 5.8055920226972005e-06, + "loss": 0.9027, + "step": 11509 + }, + { + "epoch": 0.9301198004000081, + "grad_norm": 2.483797550201416, + "learning_rate": 5.804946202842616e-06, + "loss": 0.8502, + "step": 11510 + }, + { + "epoch": 0.9302006101133357, + "grad_norm": 2.591059446334839, + "learning_rate": 5.80430036920124e-06, + "loss": 0.9023, + "step": 11511 + }, + { + "epoch": 0.9302814198266631, + "grad_norm": 2.3627212047576904, + "learning_rate": 5.803654521784135e-06, + "loss": 0.9751, + "step": 11512 + }, + { + "epoch": 0.9303622295399907, + "grad_norm": 2.7235360145568848, + "learning_rate": 5.803008660602364e-06, + "loss": 0.9483, + "step": 11513 + }, + { + "epoch": 0.9304430392533183, + "grad_norm": 2.6170332431793213, + "learning_rate": 5.802362785666987e-06, + "loss": 0.9834, + "step": 11514 + }, + { + "epoch": 0.9305238489666458, + "grad_norm": 2.68587327003479, + "learning_rate": 5.801716896989068e-06, + "loss": 0.9299, + "step": 11515 + }, + { + "epoch": 0.9306046586799733, + "grad_norm": 2.193279266357422, + "learning_rate": 5.801070994579668e-06, + "loss": 0.8976, + "step": 11516 + }, + { + "epoch": 0.9306854683933009, + "grad_norm": 2.9214203357696533, + "learning_rate": 5.800425078449849e-06, + "loss": 0.8915, + "step": 11517 + }, + { + "epoch": 0.9307662781066284, + "grad_norm": 2.671398639678955, + "learning_rate": 5.799779148610677e-06, + "loss": 0.8175, + "step": 11518 + }, + { + "epoch": 0.930847087819956, + "grad_norm": 2.8005127906799316, + "learning_rate": 5.799133205073213e-06, + "loss": 0.8693, + "step": 11519 + }, + { + "epoch": 0.9309278975332835, + "grad_norm": 3.3705873489379883, + "learning_rate": 5.798487247848521e-06, + "loss": 0.8651, + "step": 11520 + }, + { + "epoch": 0.931008707246611, + "grad_norm": 2.396902322769165, + "learning_rate": 5.7978412769476656e-06, + "loss": 0.8154, + "step": 11521 + }, + { + "epoch": 0.9310895169599386, + "grad_norm": 2.690016984939575, + "learning_rate": 5.797195292381707e-06, + "loss": 0.9181, + "step": 11522 + }, + { + "epoch": 0.9311703266732662, + "grad_norm": 2.621349573135376, + "learning_rate": 5.796549294161716e-06, + "loss": 1.0243, + "step": 11523 + }, + { + "epoch": 0.9312511363865936, + "grad_norm": 2.9115445613861084, + "learning_rate": 5.795903282298752e-06, + "loss": 1.0949, + "step": 11524 + }, + { + "epoch": 0.9313319460999212, + "grad_norm": 2.867117404937744, + "learning_rate": 5.79525725680388e-06, + "loss": 0.915, + "step": 11525 + }, + { + "epoch": 0.9314127558132488, + "grad_norm": 2.8154852390289307, + "learning_rate": 5.794611217688167e-06, + "loss": 0.9369, + "step": 11526 + }, + { + "epoch": 0.9314935655265763, + "grad_norm": 2.3597569465637207, + "learning_rate": 5.793965164962675e-06, + "loss": 1.0562, + "step": 11527 + }, + { + "epoch": 0.9315743752399038, + "grad_norm": 3.2262609004974365, + "learning_rate": 5.793319098638471e-06, + "loss": 0.8527, + "step": 11528 + }, + { + "epoch": 0.9316551849532314, + "grad_norm": 2.5392205715179443, + "learning_rate": 5.792673018726624e-06, + "loss": 0.8943, + "step": 11529 + }, + { + "epoch": 0.9317359946665589, + "grad_norm": 2.468940496444702, + "learning_rate": 5.792026925238192e-06, + "loss": 0.899, + "step": 11530 + }, + { + "epoch": 0.9318168043798865, + "grad_norm": 2.5027191638946533, + "learning_rate": 5.791380818184248e-06, + "loss": 1.014, + "step": 11531 + }, + { + "epoch": 0.931897614093214, + "grad_norm": 2.5542562007904053, + "learning_rate": 5.790734697575855e-06, + "loss": 0.8202, + "step": 11532 + }, + { + "epoch": 0.9319784238065415, + "grad_norm": 2.9731252193450928, + "learning_rate": 5.790088563424081e-06, + "loss": 1.0461, + "step": 11533 + }, + { + "epoch": 0.9320592335198691, + "grad_norm": 2.508049726486206, + "learning_rate": 5.789442415739991e-06, + "loss": 0.9014, + "step": 11534 + }, + { + "epoch": 0.9321400432331967, + "grad_norm": 2.503194570541382, + "learning_rate": 5.7887962545346545e-06, + "loss": 0.8512, + "step": 11535 + }, + { + "epoch": 0.9322208529465241, + "grad_norm": 2.6812469959259033, + "learning_rate": 5.788150079819135e-06, + "loss": 0.894, + "step": 11536 + }, + { + "epoch": 0.9323016626598517, + "grad_norm": 3.24379301071167, + "learning_rate": 5.7875038916045044e-06, + "loss": 0.9462, + "step": 11537 + }, + { + "epoch": 0.9323824723731793, + "grad_norm": 2.8694512844085693, + "learning_rate": 5.7868576899018256e-06, + "loss": 0.8687, + "step": 11538 + }, + { + "epoch": 0.9324632820865068, + "grad_norm": 2.6820590496063232, + "learning_rate": 5.786211474722171e-06, + "loss": 0.8236, + "step": 11539 + }, + { + "epoch": 0.9325440917998343, + "grad_norm": 2.6474108695983887, + "learning_rate": 5.785565246076605e-06, + "loss": 0.992, + "step": 11540 + }, + { + "epoch": 0.9326249015131619, + "grad_norm": 2.641235589981079, + "learning_rate": 5.7849190039761986e-06, + "loss": 0.9407, + "step": 11541 + }, + { + "epoch": 0.9327057112264894, + "grad_norm": 2.4772539138793945, + "learning_rate": 5.784272748432019e-06, + "loss": 0.9484, + "step": 11542 + }, + { + "epoch": 0.932786520939817, + "grad_norm": 2.6564748287200928, + "learning_rate": 5.7836264794551345e-06, + "loss": 0.8055, + "step": 11543 + }, + { + "epoch": 0.9328673306531445, + "grad_norm": 2.9654014110565186, + "learning_rate": 5.782980197056614e-06, + "loss": 0.8799, + "step": 11544 + }, + { + "epoch": 0.932948140366472, + "grad_norm": 2.4450013637542725, + "learning_rate": 5.78233390124753e-06, + "loss": 1.0247, + "step": 11545 + }, + { + "epoch": 0.9330289500797996, + "grad_norm": 2.6594128608703613, + "learning_rate": 5.781687592038949e-06, + "loss": 0.9457, + "step": 11546 + }, + { + "epoch": 0.9331097597931272, + "grad_norm": 2.8365838527679443, + "learning_rate": 5.78104126944194e-06, + "loss": 0.9219, + "step": 11547 + }, + { + "epoch": 0.9331905695064546, + "grad_norm": 3.188791036605835, + "learning_rate": 5.780394933467576e-06, + "loss": 0.9533, + "step": 11548 + }, + { + "epoch": 0.9332713792197822, + "grad_norm": 2.2971277236938477, + "learning_rate": 5.779748584126926e-06, + "loss": 0.9989, + "step": 11549 + }, + { + "epoch": 0.9333521889331098, + "grad_norm": 3.2613959312438965, + "learning_rate": 5.779102221431057e-06, + "loss": 0.9592, + "step": 11550 + }, + { + "epoch": 0.9334329986464373, + "grad_norm": 2.406313180923462, + "learning_rate": 5.778455845391047e-06, + "loss": 0.9709, + "step": 11551 + }, + { + "epoch": 0.9335138083597648, + "grad_norm": 2.714195966720581, + "learning_rate": 5.777809456017958e-06, + "loss": 0.945, + "step": 11552 + }, + { + "epoch": 0.9335946180730924, + "grad_norm": 2.3232004642486572, + "learning_rate": 5.777163053322869e-06, + "loss": 0.884, + "step": 11553 + }, + { + "epoch": 0.9336754277864199, + "grad_norm": 2.48897123336792, + "learning_rate": 5.776516637316844e-06, + "loss": 0.9891, + "step": 11554 + }, + { + "epoch": 0.9337562374997475, + "grad_norm": 2.561631202697754, + "learning_rate": 5.775870208010962e-06, + "loss": 1.0345, + "step": 11555 + }, + { + "epoch": 0.933837047213075, + "grad_norm": 3.3465235233306885, + "learning_rate": 5.775223765416289e-06, + "loss": 0.9146, + "step": 11556 + }, + { + "epoch": 0.9339178569264025, + "grad_norm": 2.860692024230957, + "learning_rate": 5.7745773095439005e-06, + "loss": 0.9705, + "step": 11557 + }, + { + "epoch": 0.9339986666397301, + "grad_norm": 2.452435255050659, + "learning_rate": 5.7739308404048665e-06, + "loss": 0.9616, + "step": 11558 + }, + { + "epoch": 0.9340794763530577, + "grad_norm": 2.4608728885650635, + "learning_rate": 5.7732843580102615e-06, + "loss": 1.0741, + "step": 11559 + }, + { + "epoch": 0.9341602860663851, + "grad_norm": 2.8395230770111084, + "learning_rate": 5.772637862371156e-06, + "loss": 0.9014, + "step": 11560 + }, + { + "epoch": 0.9342410957797127, + "grad_norm": 2.319927930831909, + "learning_rate": 5.771991353498624e-06, + "loss": 0.9134, + "step": 11561 + }, + { + "epoch": 0.9343219054930403, + "grad_norm": 2.8315300941467285, + "learning_rate": 5.771344831403739e-06, + "loss": 0.8373, + "step": 11562 + }, + { + "epoch": 0.9344027152063678, + "grad_norm": 2.3799524307250977, + "learning_rate": 5.770698296097573e-06, + "loss": 0.9199, + "step": 11563 + }, + { + "epoch": 0.9344835249196953, + "grad_norm": 2.1802234649658203, + "learning_rate": 5.770051747591202e-06, + "loss": 0.9585, + "step": 11564 + }, + { + "epoch": 0.9345643346330229, + "grad_norm": 2.759897232055664, + "learning_rate": 5.769405185895699e-06, + "loss": 0.9132, + "step": 11565 + }, + { + "epoch": 0.9346451443463504, + "grad_norm": 2.5097436904907227, + "learning_rate": 5.768758611022136e-06, + "loss": 0.9136, + "step": 11566 + }, + { + "epoch": 0.934725954059678, + "grad_norm": 2.9719035625457764, + "learning_rate": 5.768112022981589e-06, + "loss": 0.9139, + "step": 11567 + }, + { + "epoch": 0.9348067637730055, + "grad_norm": 2.778306245803833, + "learning_rate": 5.767465421785131e-06, + "loss": 1.0107, + "step": 11568 + }, + { + "epoch": 0.9348875734863331, + "grad_norm": 2.8643646240234375, + "learning_rate": 5.766818807443839e-06, + "loss": 1.0238, + "step": 11569 + }, + { + "epoch": 0.9349683831996606, + "grad_norm": 2.584343671798706, + "learning_rate": 5.766172179968788e-06, + "loss": 1.028, + "step": 11570 + }, + { + "epoch": 0.9350491929129882, + "grad_norm": 2.721311092376709, + "learning_rate": 5.765525539371049e-06, + "loss": 0.8965, + "step": 11571 + }, + { + "epoch": 0.9351300026263157, + "grad_norm": 2.5223119258880615, + "learning_rate": 5.764878885661703e-06, + "loss": 1.0345, + "step": 11572 + }, + { + "epoch": 0.9352108123396432, + "grad_norm": 2.4973721504211426, + "learning_rate": 5.764232218851822e-06, + "loss": 0.8728, + "step": 11573 + }, + { + "epoch": 0.9352916220529708, + "grad_norm": 2.6296730041503906, + "learning_rate": 5.763585538952485e-06, + "loss": 0.8364, + "step": 11574 + }, + { + "epoch": 0.9353724317662984, + "grad_norm": 2.2104923725128174, + "learning_rate": 5.7629388459747635e-06, + "loss": 1.0195, + "step": 11575 + }, + { + "epoch": 0.9354532414796258, + "grad_norm": 2.313533306121826, + "learning_rate": 5.7622921399297375e-06, + "loss": 0.9308, + "step": 11576 + }, + { + "epoch": 0.9355340511929534, + "grad_norm": 2.5132343769073486, + "learning_rate": 5.761645420828481e-06, + "loss": 0.8205, + "step": 11577 + }, + { + "epoch": 0.935614860906281, + "grad_norm": 2.088826894760132, + "learning_rate": 5.760998688682073e-06, + "loss": 1.0048, + "step": 11578 + }, + { + "epoch": 0.9356956706196085, + "grad_norm": 2.9883954524993896, + "learning_rate": 5.76035194350159e-06, + "loss": 0.9718, + "step": 11579 + }, + { + "epoch": 0.935776480332936, + "grad_norm": 3.0133285522460938, + "learning_rate": 5.759705185298109e-06, + "loss": 0.8568, + "step": 11580 + }, + { + "epoch": 0.9358572900462636, + "grad_norm": 2.6837682723999023, + "learning_rate": 5.759058414082704e-06, + "loss": 1.0335, + "step": 11581 + }, + { + "epoch": 0.9359380997595911, + "grad_norm": 2.656585693359375, + "learning_rate": 5.758411629866459e-06, + "loss": 0.862, + "step": 11582 + }, + { + "epoch": 0.9360189094729187, + "grad_norm": 2.485264778137207, + "learning_rate": 5.7577648326604465e-06, + "loss": 0.8882, + "step": 11583 + }, + { + "epoch": 0.9360997191862462, + "grad_norm": 2.6140716075897217, + "learning_rate": 5.757118022475749e-06, + "loss": 0.9033, + "step": 11584 + }, + { + "epoch": 0.9361805288995737, + "grad_norm": 2.6041059494018555, + "learning_rate": 5.756471199323441e-06, + "loss": 0.8124, + "step": 11585 + }, + { + "epoch": 0.9362613386129013, + "grad_norm": 2.636671304702759, + "learning_rate": 5.755824363214603e-06, + "loss": 0.9513, + "step": 11586 + }, + { + "epoch": 0.9363421483262289, + "grad_norm": 2.6526453495025635, + "learning_rate": 5.755177514160312e-06, + "loss": 0.9581, + "step": 11587 + }, + { + "epoch": 0.9364229580395563, + "grad_norm": 3.1483845710754395, + "learning_rate": 5.754530652171651e-06, + "loss": 0.8236, + "step": 11588 + }, + { + "epoch": 0.9365037677528839, + "grad_norm": 2.5279927253723145, + "learning_rate": 5.753883777259693e-06, + "loss": 0.9465, + "step": 11589 + }, + { + "epoch": 0.9365845774662115, + "grad_norm": 2.7167091369628906, + "learning_rate": 5.753236889435523e-06, + "loss": 1.0296, + "step": 11590 + }, + { + "epoch": 0.936665387179539, + "grad_norm": 3.3632664680480957, + "learning_rate": 5.752589988710216e-06, + "loss": 0.8465, + "step": 11591 + }, + { + "epoch": 0.9367461968928665, + "grad_norm": 2.8106067180633545, + "learning_rate": 5.751943075094857e-06, + "loss": 1.0021, + "step": 11592 + }, + { + "epoch": 0.9368270066061941, + "grad_norm": 2.6745994091033936, + "learning_rate": 5.751296148600521e-06, + "loss": 0.9987, + "step": 11593 + }, + { + "epoch": 0.9369078163195216, + "grad_norm": 2.5426883697509766, + "learning_rate": 5.750649209238294e-06, + "loss": 1.0077, + "step": 11594 + }, + { + "epoch": 0.9369886260328492, + "grad_norm": 2.624891996383667, + "learning_rate": 5.75000225701925e-06, + "loss": 1.0869, + "step": 11595 + }, + { + "epoch": 0.9370694357461767, + "grad_norm": 2.8439857959747314, + "learning_rate": 5.749355291954475e-06, + "loss": 0.958, + "step": 11596 + }, + { + "epoch": 0.9371502454595042, + "grad_norm": 2.8124709129333496, + "learning_rate": 5.7487083140550444e-06, + "loss": 0.8651, + "step": 11597 + }, + { + "epoch": 0.9372310551728318, + "grad_norm": 2.540600538253784, + "learning_rate": 5.748061323332047e-06, + "loss": 0.8403, + "step": 11598 + }, + { + "epoch": 0.9373118648861594, + "grad_norm": 2.6409599781036377, + "learning_rate": 5.7474143197965584e-06, + "loss": 0.9303, + "step": 11599 + }, + { + "epoch": 0.9373926745994868, + "grad_norm": 2.486414909362793, + "learning_rate": 5.7467673034596605e-06, + "loss": 0.8395, + "step": 11600 + }, + { + "epoch": 0.9374734843128144, + "grad_norm": 2.6569576263427734, + "learning_rate": 5.746120274332439e-06, + "loss": 0.9399, + "step": 11601 + }, + { + "epoch": 0.937554294026142, + "grad_norm": 2.4254419803619385, + "learning_rate": 5.745473232425972e-06, + "loss": 0.8859, + "step": 11602 + }, + { + "epoch": 0.9376351037394695, + "grad_norm": 2.6574559211730957, + "learning_rate": 5.744826177751341e-06, + "loss": 0.9295, + "step": 11603 + }, + { + "epoch": 0.937715913452797, + "grad_norm": 2.792635679244995, + "learning_rate": 5.7441791103196345e-06, + "loss": 0.8113, + "step": 11604 + }, + { + "epoch": 0.9377967231661246, + "grad_norm": 2.413358449935913, + "learning_rate": 5.743532030141929e-06, + "loss": 0.7997, + "step": 11605 + }, + { + "epoch": 0.9378775328794521, + "grad_norm": 2.2785565853118896, + "learning_rate": 5.742884937229312e-06, + "loss": 1.0717, + "step": 11606 + }, + { + "epoch": 0.9379583425927797, + "grad_norm": 2.803117275238037, + "learning_rate": 5.742237831592862e-06, + "loss": 0.8737, + "step": 11607 + }, + { + "epoch": 0.9380391523061072, + "grad_norm": 2.532715082168579, + "learning_rate": 5.741590713243666e-06, + "loss": 0.9361, + "step": 11608 + }, + { + "epoch": 0.9381199620194347, + "grad_norm": 2.657824754714966, + "learning_rate": 5.740943582192806e-06, + "loss": 0.9157, + "step": 11609 + }, + { + "epoch": 0.9382007717327623, + "grad_norm": 2.352769613265991, + "learning_rate": 5.740296438451367e-06, + "loss": 0.9595, + "step": 11610 + }, + { + "epoch": 0.9382815814460899, + "grad_norm": 2.897944688796997, + "learning_rate": 5.73964928203043e-06, + "loss": 0.9243, + "step": 11611 + }, + { + "epoch": 0.9383623911594173, + "grad_norm": 2.463521957397461, + "learning_rate": 5.739002112941085e-06, + "loss": 0.8229, + "step": 11612 + }, + { + "epoch": 0.9384432008727449, + "grad_norm": 2.732213020324707, + "learning_rate": 5.738354931194411e-06, + "loss": 0.8576, + "step": 11613 + }, + { + "epoch": 0.9385240105860725, + "grad_norm": 2.7223618030548096, + "learning_rate": 5.737707736801494e-06, + "loss": 0.8618, + "step": 11614 + }, + { + "epoch": 0.9386048202994, + "grad_norm": 2.5126633644104004, + "learning_rate": 5.73706052977342e-06, + "loss": 1.0201, + "step": 11615 + }, + { + "epoch": 0.9386856300127275, + "grad_norm": 2.850412607192993, + "learning_rate": 5.736413310121274e-06, + "loss": 0.9298, + "step": 11616 + }, + { + "epoch": 0.9387664397260551, + "grad_norm": 2.631685495376587, + "learning_rate": 5.73576607785614e-06, + "loss": 0.963, + "step": 11617 + }, + { + "epoch": 0.9388472494393826, + "grad_norm": 2.328568458557129, + "learning_rate": 5.735118832989105e-06, + "loss": 0.9466, + "step": 11618 + }, + { + "epoch": 0.9389280591527102, + "grad_norm": 2.7688541412353516, + "learning_rate": 5.734471575531253e-06, + "loss": 0.8964, + "step": 11619 + }, + { + "epoch": 0.9390088688660377, + "grad_norm": 2.0908613204956055, + "learning_rate": 5.733824305493672e-06, + "loss": 0.962, + "step": 11620 + }, + { + "epoch": 0.9390896785793652, + "grad_norm": 2.739166259765625, + "learning_rate": 5.733177022887447e-06, + "loss": 0.9175, + "step": 11621 + }, + { + "epoch": 0.9391704882926928, + "grad_norm": 2.4673783779144287, + "learning_rate": 5.732529727723665e-06, + "loss": 0.9727, + "step": 11622 + }, + { + "epoch": 0.9392512980060204, + "grad_norm": 2.764119863510132, + "learning_rate": 5.731882420013411e-06, + "loss": 0.8901, + "step": 11623 + }, + { + "epoch": 0.9393321077193478, + "grad_norm": 2.6008598804473877, + "learning_rate": 5.731235099767776e-06, + "loss": 0.8897, + "step": 11624 + }, + { + "epoch": 0.9394129174326754, + "grad_norm": 2.988536834716797, + "learning_rate": 5.73058776699784e-06, + "loss": 0.7958, + "step": 11625 + }, + { + "epoch": 0.939493727146003, + "grad_norm": 2.476825714111328, + "learning_rate": 5.729940421714698e-06, + "loss": 0.9187, + "step": 11626 + }, + { + "epoch": 0.9395745368593305, + "grad_norm": 3.226663112640381, + "learning_rate": 5.729293063929432e-06, + "loss": 1.0076, + "step": 11627 + }, + { + "epoch": 0.939655346572658, + "grad_norm": 2.6984446048736572, + "learning_rate": 5.728645693653132e-06, + "loss": 0.9631, + "step": 11628 + }, + { + "epoch": 0.9397361562859856, + "grad_norm": 2.6082351207733154, + "learning_rate": 5.727998310896885e-06, + "loss": 0.9045, + "step": 11629 + }, + { + "epoch": 0.9398169659993131, + "grad_norm": 2.7023630142211914, + "learning_rate": 5.72735091567178e-06, + "loss": 0.8335, + "step": 11630 + }, + { + "epoch": 0.9398977757126407, + "grad_norm": 2.7454357147216797, + "learning_rate": 5.726703507988904e-06, + "loss": 0.8524, + "step": 11631 + }, + { + "epoch": 0.9399785854259682, + "grad_norm": 2.3165290355682373, + "learning_rate": 5.7260560878593486e-06, + "loss": 1.0407, + "step": 11632 + }, + { + "epoch": 0.9400593951392957, + "grad_norm": 3.1288325786590576, + "learning_rate": 5.725408655294199e-06, + "loss": 0.8654, + "step": 11633 + }, + { + "epoch": 0.9401402048526233, + "grad_norm": 2.6203765869140625, + "learning_rate": 5.724761210304544e-06, + "loss": 0.9561, + "step": 11634 + }, + { + "epoch": 0.9402210145659509, + "grad_norm": 2.4022812843322754, + "learning_rate": 5.724113752901476e-06, + "loss": 0.8731, + "step": 11635 + }, + { + "epoch": 0.9403018242792783, + "grad_norm": 2.7591936588287354, + "learning_rate": 5.723466283096082e-06, + "loss": 0.9516, + "step": 11636 + }, + { + "epoch": 0.9403826339926059, + "grad_norm": 2.518235445022583, + "learning_rate": 5.7228188008994525e-06, + "loss": 0.9368, + "step": 11637 + }, + { + "epoch": 0.9404634437059335, + "grad_norm": 2.76204252243042, + "learning_rate": 5.722171306322677e-06, + "loss": 1.0127, + "step": 11638 + }, + { + "epoch": 0.940544253419261, + "grad_norm": 2.6521084308624268, + "learning_rate": 5.721523799376845e-06, + "loss": 0.8873, + "step": 11639 + }, + { + "epoch": 0.9406250631325885, + "grad_norm": 2.2734222412109375, + "learning_rate": 5.720876280073047e-06, + "loss": 1.0376, + "step": 11640 + }, + { + "epoch": 0.9407058728459161, + "grad_norm": 3.156776189804077, + "learning_rate": 5.720228748422376e-06, + "loss": 0.912, + "step": 11641 + }, + { + "epoch": 0.9407866825592436, + "grad_norm": 2.487765312194824, + "learning_rate": 5.719581204435919e-06, + "loss": 0.9032, + "step": 11642 + }, + { + "epoch": 0.9408674922725712, + "grad_norm": 2.592525005340576, + "learning_rate": 5.7189336481247685e-06, + "loss": 0.9763, + "step": 11643 + }, + { + "epoch": 0.9409483019858987, + "grad_norm": 2.4446444511413574, + "learning_rate": 5.718286079500015e-06, + "loss": 1.0842, + "step": 11644 + }, + { + "epoch": 0.9410291116992262, + "grad_norm": 2.3044979572296143, + "learning_rate": 5.717638498572751e-06, + "loss": 0.9632, + "step": 11645 + }, + { + "epoch": 0.9411099214125538, + "grad_norm": 2.7817041873931885, + "learning_rate": 5.7169909053540666e-06, + "loss": 0.867, + "step": 11646 + }, + { + "epoch": 0.9411907311258814, + "grad_norm": 2.5680007934570312, + "learning_rate": 5.716343299855054e-06, + "loss": 0.8192, + "step": 11647 + }, + { + "epoch": 0.9412715408392088, + "grad_norm": 2.3593533039093018, + "learning_rate": 5.7156956820868035e-06, + "loss": 1.0092, + "step": 11648 + }, + { + "epoch": 0.9413523505525364, + "grad_norm": 2.8043808937072754, + "learning_rate": 5.715048052060413e-06, + "loss": 0.9201, + "step": 11649 + }, + { + "epoch": 0.941433160265864, + "grad_norm": 2.6572163105010986, + "learning_rate": 5.7144004097869664e-06, + "loss": 0.8527, + "step": 11650 + }, + { + "epoch": 0.9415139699791915, + "grad_norm": 2.7471683025360107, + "learning_rate": 5.713752755277564e-06, + "loss": 0.9413, + "step": 11651 + }, + { + "epoch": 0.941594779692519, + "grad_norm": 2.9516263008117676, + "learning_rate": 5.713105088543294e-06, + "loss": 0.9228, + "step": 11652 + }, + { + "epoch": 0.9416755894058466, + "grad_norm": 2.697279214859009, + "learning_rate": 5.712457409595249e-06, + "loss": 0.9134, + "step": 11653 + }, + { + "epoch": 0.9417563991191741, + "grad_norm": 2.719963788986206, + "learning_rate": 5.711809718444525e-06, + "loss": 0.9863, + "step": 11654 + }, + { + "epoch": 0.9418372088325017, + "grad_norm": 2.6032958030700684, + "learning_rate": 5.711162015102216e-06, + "loss": 0.8067, + "step": 11655 + }, + { + "epoch": 0.9419180185458292, + "grad_norm": 2.5723934173583984, + "learning_rate": 5.7105142995794104e-06, + "loss": 0.9703, + "step": 11656 + }, + { + "epoch": 0.9419988282591567, + "grad_norm": 2.735833168029785, + "learning_rate": 5.709866571887208e-06, + "loss": 0.9125, + "step": 11657 + }, + { + "epoch": 0.9420796379724843, + "grad_norm": 2.497631549835205, + "learning_rate": 5.7092188320366994e-06, + "loss": 0.9172, + "step": 11658 + }, + { + "epoch": 0.9421604476858119, + "grad_norm": 2.514263153076172, + "learning_rate": 5.70857108003898e-06, + "loss": 1.0135, + "step": 11659 + }, + { + "epoch": 0.9422412573991393, + "grad_norm": 2.6322433948516846, + "learning_rate": 5.707923315905142e-06, + "loss": 0.8427, + "step": 11660 + }, + { + "epoch": 0.9423220671124669, + "grad_norm": 3.2455856800079346, + "learning_rate": 5.707275539646284e-06, + "loss": 0.8514, + "step": 11661 + }, + { + "epoch": 0.9424028768257945, + "grad_norm": 2.391359806060791, + "learning_rate": 5.706627751273496e-06, + "loss": 0.973, + "step": 11662 + }, + { + "epoch": 0.942483686539122, + "grad_norm": 2.312131643295288, + "learning_rate": 5.705979950797878e-06, + "loss": 0.8773, + "step": 11663 + }, + { + "epoch": 0.9425644962524495, + "grad_norm": 2.034846782684326, + "learning_rate": 5.7053321382305214e-06, + "loss": 0.942, + "step": 11664 + }, + { + "epoch": 0.9426453059657771, + "grad_norm": 2.5560832023620605, + "learning_rate": 5.704684313582526e-06, + "loss": 0.927, + "step": 11665 + }, + { + "epoch": 0.9427261156791046, + "grad_norm": 2.4851796627044678, + "learning_rate": 5.704036476864982e-06, + "loss": 1.0638, + "step": 11666 + }, + { + "epoch": 0.9428069253924322, + "grad_norm": 3.1105778217315674, + "learning_rate": 5.7033886280889894e-06, + "loss": 0.8441, + "step": 11667 + }, + { + "epoch": 0.9428877351057597, + "grad_norm": 2.5052027702331543, + "learning_rate": 5.702740767265643e-06, + "loss": 0.9546, + "step": 11668 + }, + { + "epoch": 0.9429685448190872, + "grad_norm": 2.8185160160064697, + "learning_rate": 5.7020928944060395e-06, + "loss": 1.0313, + "step": 11669 + }, + { + "epoch": 0.9430493545324148, + "grad_norm": 2.9099888801574707, + "learning_rate": 5.701445009521273e-06, + "loss": 1.024, + "step": 11670 + }, + { + "epoch": 0.9431301642457424, + "grad_norm": 2.643374443054199, + "learning_rate": 5.700797112622445e-06, + "loss": 1.0331, + "step": 11671 + }, + { + "epoch": 0.9432109739590698, + "grad_norm": 2.4487547874450684, + "learning_rate": 5.700149203720648e-06, + "loss": 0.9712, + "step": 11672 + }, + { + "epoch": 0.9432917836723974, + "grad_norm": 2.403853416442871, + "learning_rate": 5.699501282826979e-06, + "loss": 1.0223, + "step": 11673 + }, + { + "epoch": 0.943372593385725, + "grad_norm": 2.1139395236968994, + "learning_rate": 5.698853349952539e-06, + "loss": 1.0247, + "step": 11674 + }, + { + "epoch": 0.9434534030990525, + "grad_norm": 3.068643808364868, + "learning_rate": 5.6982054051084235e-06, + "loss": 0.8979, + "step": 11675 + }, + { + "epoch": 0.94353421281238, + "grad_norm": 2.852083206176758, + "learning_rate": 5.697557448305729e-06, + "loss": 0.9458, + "step": 11676 + }, + { + "epoch": 0.9436150225257076, + "grad_norm": 2.364699125289917, + "learning_rate": 5.696909479555557e-06, + "loss": 0.8893, + "step": 11677 + }, + { + "epoch": 0.9436958322390351, + "grad_norm": 2.350175142288208, + "learning_rate": 5.696261498869e-06, + "loss": 1.0053, + "step": 11678 + }, + { + "epoch": 0.9437766419523627, + "grad_norm": 2.7102997303009033, + "learning_rate": 5.695613506257162e-06, + "loss": 0.8596, + "step": 11679 + }, + { + "epoch": 0.9438574516656902, + "grad_norm": 2.601069927215576, + "learning_rate": 5.69496550173114e-06, + "loss": 0.8036, + "step": 11680 + }, + { + "epoch": 0.9439382613790177, + "grad_norm": 2.622908115386963, + "learning_rate": 5.69431748530203e-06, + "loss": 0.9233, + "step": 11681 + }, + { + "epoch": 0.9440190710923453, + "grad_norm": 2.724900960922241, + "learning_rate": 5.693669456980935e-06, + "loss": 1.043, + "step": 11682 + }, + { + "epoch": 0.9440998808056729, + "grad_norm": 2.661947727203369, + "learning_rate": 5.693021416778951e-06, + "loss": 1.0079, + "step": 11683 + }, + { + "epoch": 0.9441806905190003, + "grad_norm": 2.5352721214294434, + "learning_rate": 5.692373364707178e-06, + "loss": 0.8936, + "step": 11684 + }, + { + "epoch": 0.9442615002323279, + "grad_norm": 2.724100351333618, + "learning_rate": 5.691725300776717e-06, + "loss": 0.9612, + "step": 11685 + }, + { + "epoch": 0.9443423099456555, + "grad_norm": 3.0862293243408203, + "learning_rate": 5.691077224998667e-06, + "loss": 0.9231, + "step": 11686 + }, + { + "epoch": 0.944423119658983, + "grad_norm": 3.1719329357147217, + "learning_rate": 5.690429137384127e-06, + "loss": 0.964, + "step": 11687 + }, + { + "epoch": 0.9445039293723105, + "grad_norm": 2.7404732704162598, + "learning_rate": 5.689781037944198e-06, + "loss": 0.9465, + "step": 11688 + }, + { + "epoch": 0.9445847390856381, + "grad_norm": 3.070930004119873, + "learning_rate": 5.689132926689982e-06, + "loss": 0.8778, + "step": 11689 + }, + { + "epoch": 0.9446655487989656, + "grad_norm": 2.8368983268737793, + "learning_rate": 5.688484803632574e-06, + "loss": 1.0098, + "step": 11690 + }, + { + "epoch": 0.9447463585122932, + "grad_norm": 2.6157639026641846, + "learning_rate": 5.687836668783083e-06, + "loss": 0.9648, + "step": 11691 + }, + { + "epoch": 0.9448271682256207, + "grad_norm": 2.602119207382202, + "learning_rate": 5.687188522152603e-06, + "loss": 0.8824, + "step": 11692 + }, + { + "epoch": 0.9449079779389482, + "grad_norm": 2.6795992851257324, + "learning_rate": 5.686540363752241e-06, + "loss": 0.9899, + "step": 11693 + }, + { + "epoch": 0.9449887876522758, + "grad_norm": 2.348895788192749, + "learning_rate": 5.685892193593093e-06, + "loss": 0.8316, + "step": 11694 + }, + { + "epoch": 0.9450695973656034, + "grad_norm": 3.1809682846069336, + "learning_rate": 5.685244011686264e-06, + "loss": 0.9253, + "step": 11695 + }, + { + "epoch": 0.9451504070789308, + "grad_norm": 2.47281813621521, + "learning_rate": 5.684595818042854e-06, + "loss": 1.0117, + "step": 11696 + }, + { + "epoch": 0.9452312167922584, + "grad_norm": 2.3931689262390137, + "learning_rate": 5.683947612673966e-06, + "loss": 1.0186, + "step": 11697 + }, + { + "epoch": 0.945312026505586, + "grad_norm": 2.7105016708374023, + "learning_rate": 5.683299395590701e-06, + "loss": 0.8382, + "step": 11698 + }, + { + "epoch": 0.9453928362189136, + "grad_norm": 3.033129930496216, + "learning_rate": 5.682651166804165e-06, + "loss": 0.8572, + "step": 11699 + }, + { + "epoch": 0.945473645932241, + "grad_norm": 3.0416135787963867, + "learning_rate": 5.682002926325456e-06, + "loss": 0.8278, + "step": 11700 + }, + { + "epoch": 0.9455544556455686, + "grad_norm": 2.209350109100342, + "learning_rate": 5.681354674165678e-06, + "loss": 0.862, + "step": 11701 + }, + { + "epoch": 0.9456352653588962, + "grad_norm": 2.855954885482788, + "learning_rate": 5.680706410335936e-06, + "loss": 0.9371, + "step": 11702 + }, + { + "epoch": 0.9457160750722237, + "grad_norm": 2.812636375427246, + "learning_rate": 5.680058134847332e-06, + "loss": 0.8857, + "step": 11703 + }, + { + "epoch": 0.9457968847855512, + "grad_norm": 3.201815128326416, + "learning_rate": 5.679409847710968e-06, + "loss": 0.935, + "step": 11704 + }, + { + "epoch": 0.9458776944988788, + "grad_norm": 2.952446699142456, + "learning_rate": 5.67876154893795e-06, + "loss": 0.9729, + "step": 11705 + }, + { + "epoch": 0.9459585042122063, + "grad_norm": 2.510310173034668, + "learning_rate": 5.67811323853938e-06, + "loss": 0.8319, + "step": 11706 + }, + { + "epoch": 0.9460393139255339, + "grad_norm": 2.7408740520477295, + "learning_rate": 5.677464916526363e-06, + "loss": 0.9462, + "step": 11707 + }, + { + "epoch": 0.9461201236388614, + "grad_norm": 3.0367515087127686, + "learning_rate": 5.676816582910004e-06, + "loss": 0.9367, + "step": 11708 + }, + { + "epoch": 0.9462009333521889, + "grad_norm": 2.750922918319702, + "learning_rate": 5.676168237701405e-06, + "loss": 0.9806, + "step": 11709 + }, + { + "epoch": 0.9462817430655165, + "grad_norm": 2.858391284942627, + "learning_rate": 5.675519880911673e-06, + "loss": 0.8769, + "step": 11710 + }, + { + "epoch": 0.9463625527788441, + "grad_norm": 2.731144666671753, + "learning_rate": 5.67487151255191e-06, + "loss": 0.9623, + "step": 11711 + }, + { + "epoch": 0.9464433624921715, + "grad_norm": 2.8071606159210205, + "learning_rate": 5.674223132633224e-06, + "loss": 0.9204, + "step": 11712 + }, + { + "epoch": 0.9465241722054991, + "grad_norm": 2.459951400756836, + "learning_rate": 5.673574741166719e-06, + "loss": 0.9877, + "step": 11713 + }, + { + "epoch": 0.9466049819188267, + "grad_norm": 2.5169243812561035, + "learning_rate": 5.6729263381635e-06, + "loss": 0.9025, + "step": 11714 + }, + { + "epoch": 0.9466857916321542, + "grad_norm": 2.9062554836273193, + "learning_rate": 5.672277923634671e-06, + "loss": 0.8722, + "step": 11715 + }, + { + "epoch": 0.9467666013454817, + "grad_norm": 3.1349399089813232, + "learning_rate": 5.671629497591343e-06, + "loss": 0.9472, + "step": 11716 + }, + { + "epoch": 0.9468474110588093, + "grad_norm": 2.732954740524292, + "learning_rate": 5.6709810600446165e-06, + "loss": 0.889, + "step": 11717 + }, + { + "epoch": 0.9469282207721368, + "grad_norm": 2.821624279022217, + "learning_rate": 5.6703326110056e-06, + "loss": 0.8655, + "step": 11718 + }, + { + "epoch": 0.9470090304854644, + "grad_norm": 2.6683051586151123, + "learning_rate": 5.6696841504853994e-06, + "loss": 0.97, + "step": 11719 + }, + { + "epoch": 0.9470898401987919, + "grad_norm": 2.8609542846679688, + "learning_rate": 5.6690356784951216e-06, + "loss": 0.8594, + "step": 11720 + }, + { + "epoch": 0.9471706499121194, + "grad_norm": 2.2158589363098145, + "learning_rate": 5.668387195045874e-06, + "loss": 0.9758, + "step": 11721 + }, + { + "epoch": 0.947251459625447, + "grad_norm": 2.7605020999908447, + "learning_rate": 5.667738700148763e-06, + "loss": 1.0052, + "step": 11722 + }, + { + "epoch": 0.9473322693387746, + "grad_norm": 2.9931535720825195, + "learning_rate": 5.667090193814894e-06, + "loss": 0.9068, + "step": 11723 + }, + { + "epoch": 0.947413079052102, + "grad_norm": 2.7512810230255127, + "learning_rate": 5.666441676055378e-06, + "loss": 0.9482, + "step": 11724 + }, + { + "epoch": 0.9474938887654296, + "grad_norm": 2.234410047531128, + "learning_rate": 5.665793146881319e-06, + "loss": 1.0454, + "step": 11725 + }, + { + "epoch": 0.9475746984787572, + "grad_norm": 2.208270788192749, + "learning_rate": 5.665144606303826e-06, + "loss": 0.7288, + "step": 11726 + }, + { + "epoch": 0.9476555081920847, + "grad_norm": 2.417513370513916, + "learning_rate": 5.664496054334008e-06, + "loss": 0.9563, + "step": 11727 + }, + { + "epoch": 0.9477363179054122, + "grad_norm": 2.571308135986328, + "learning_rate": 5.663847490982973e-06, + "loss": 0.9953, + "step": 11728 + }, + { + "epoch": 0.9478171276187398, + "grad_norm": 2.855367660522461, + "learning_rate": 5.6631989162618265e-06, + "loss": 0.9072, + "step": 11729 + }, + { + "epoch": 0.9478979373320673, + "grad_norm": 2.699425458908081, + "learning_rate": 5.662550330181681e-06, + "loss": 0.9318, + "step": 11730 + }, + { + "epoch": 0.9479787470453949, + "grad_norm": 2.2659034729003906, + "learning_rate": 5.6619017327536415e-06, + "loss": 0.9078, + "step": 11731 + }, + { + "epoch": 0.9480595567587224, + "grad_norm": 2.4166064262390137, + "learning_rate": 5.661253123988821e-06, + "loss": 0.8629, + "step": 11732 + }, + { + "epoch": 0.9481403664720499, + "grad_norm": 2.674800157546997, + "learning_rate": 5.660604503898325e-06, + "loss": 0.9452, + "step": 11733 + }, + { + "epoch": 0.9482211761853775, + "grad_norm": 2.9452109336853027, + "learning_rate": 5.659955872493265e-06, + "loss": 0.8996, + "step": 11734 + }, + { + "epoch": 0.9483019858987051, + "grad_norm": 2.279548406600952, + "learning_rate": 5.659307229784748e-06, + "loss": 0.8853, + "step": 11735 + }, + { + "epoch": 0.9483827956120325, + "grad_norm": 2.522388219833374, + "learning_rate": 5.658658575783888e-06, + "loss": 0.9592, + "step": 11736 + }, + { + "epoch": 0.9484636053253601, + "grad_norm": 2.332998514175415, + "learning_rate": 5.6580099105017895e-06, + "loss": 0.9054, + "step": 11737 + }, + { + "epoch": 0.9485444150386877, + "grad_norm": 2.6834816932678223, + "learning_rate": 5.657361233949568e-06, + "loss": 0.9384, + "step": 11738 + }, + { + "epoch": 0.9486252247520152, + "grad_norm": 2.5026962757110596, + "learning_rate": 5.65671254613833e-06, + "loss": 1.0743, + "step": 11739 + }, + { + "epoch": 0.9487060344653427, + "grad_norm": 2.5357184410095215, + "learning_rate": 5.656063847079186e-06, + "loss": 0.9328, + "step": 11740 + }, + { + "epoch": 0.9487868441786703, + "grad_norm": 2.4060933589935303, + "learning_rate": 5.655415136783249e-06, + "loss": 0.9872, + "step": 11741 + }, + { + "epoch": 0.9488676538919978, + "grad_norm": 2.616105794906616, + "learning_rate": 5.6547664152616284e-06, + "loss": 1.0001, + "step": 11742 + }, + { + "epoch": 0.9489484636053254, + "grad_norm": 2.723905563354492, + "learning_rate": 5.654117682525434e-06, + "loss": 0.8787, + "step": 11743 + }, + { + "epoch": 0.9490292733186529, + "grad_norm": 3.021120548248291, + "learning_rate": 5.65346893858578e-06, + "loss": 0.9593, + "step": 11744 + }, + { + "epoch": 0.9491100830319804, + "grad_norm": 2.3811423778533936, + "learning_rate": 5.6528201834537746e-06, + "loss": 0.9215, + "step": 11745 + }, + { + "epoch": 0.949190892745308, + "grad_norm": 2.196046829223633, + "learning_rate": 5.652171417140533e-06, + "loss": 0.888, + "step": 11746 + }, + { + "epoch": 0.9492717024586356, + "grad_norm": 2.4733779430389404, + "learning_rate": 5.651522639657164e-06, + "loss": 0.8295, + "step": 11747 + }, + { + "epoch": 0.949352512171963, + "grad_norm": 2.6456706523895264, + "learning_rate": 5.650873851014781e-06, + "loss": 0.9543, + "step": 11748 + }, + { + "epoch": 0.9494333218852906, + "grad_norm": 2.7628049850463867, + "learning_rate": 5.650225051224496e-06, + "loss": 0.8961, + "step": 11749 + }, + { + "epoch": 0.9495141315986182, + "grad_norm": 2.874279260635376, + "learning_rate": 5.6495762402974215e-06, + "loss": 0.9824, + "step": 11750 + }, + { + "epoch": 0.9495949413119457, + "grad_norm": 2.812365770339966, + "learning_rate": 5.648927418244668e-06, + "loss": 0.9168, + "step": 11751 + }, + { + "epoch": 0.9496757510252732, + "grad_norm": 2.616420030593872, + "learning_rate": 5.648278585077352e-06, + "loss": 0.9293, + "step": 11752 + }, + { + "epoch": 0.9497565607386008, + "grad_norm": 3.3047990798950195, + "learning_rate": 5.6476297408065836e-06, + "loss": 0.8761, + "step": 11753 + }, + { + "epoch": 0.9498373704519283, + "grad_norm": 2.7111291885375977, + "learning_rate": 5.646980885443478e-06, + "loss": 0.8583, + "step": 11754 + }, + { + "epoch": 0.9499181801652559, + "grad_norm": 2.859605312347412, + "learning_rate": 5.646332018999145e-06, + "loss": 0.8666, + "step": 11755 + }, + { + "epoch": 0.9499989898785834, + "grad_norm": 2.3949639797210693, + "learning_rate": 5.645683141484703e-06, + "loss": 0.844, + "step": 11756 + }, + { + "epoch": 0.9500797995919109, + "grad_norm": 2.8391880989074707, + "learning_rate": 5.645034252911262e-06, + "loss": 0.9814, + "step": 11757 + }, + { + "epoch": 0.9501606093052385, + "grad_norm": 2.696854829788208, + "learning_rate": 5.644385353289939e-06, + "loss": 1.0628, + "step": 11758 + }, + { + "epoch": 0.9502414190185661, + "grad_norm": 2.8728668689727783, + "learning_rate": 5.643736442631842e-06, + "loss": 0.8773, + "step": 11759 + }, + { + "epoch": 0.9503222287318935, + "grad_norm": 2.1819210052490234, + "learning_rate": 5.643087520948093e-06, + "loss": 0.941, + "step": 11760 + }, + { + "epoch": 0.9504030384452211, + "grad_norm": 3.0423099994659424, + "learning_rate": 5.642438588249802e-06, + "loss": 0.9751, + "step": 11761 + }, + { + "epoch": 0.9504838481585487, + "grad_norm": 2.6552722454071045, + "learning_rate": 5.6417896445480846e-06, + "loss": 0.8125, + "step": 11762 + }, + { + "epoch": 0.9505646578718762, + "grad_norm": 2.639998435974121, + "learning_rate": 5.6411406898540555e-06, + "loss": 0.8741, + "step": 11763 + }, + { + "epoch": 0.9506454675852037, + "grad_norm": 2.564969062805176, + "learning_rate": 5.6404917241788295e-06, + "loss": 0.9727, + "step": 11764 + }, + { + "epoch": 0.9507262772985313, + "grad_norm": 2.4525957107543945, + "learning_rate": 5.6398427475335214e-06, + "loss": 0.9602, + "step": 11765 + }, + { + "epoch": 0.9508070870118588, + "grad_norm": 2.4959049224853516, + "learning_rate": 5.63919375992925e-06, + "loss": 0.8862, + "step": 11766 + }, + { + "epoch": 0.9508878967251864, + "grad_norm": 2.6759307384490967, + "learning_rate": 5.638544761377127e-06, + "loss": 0.9905, + "step": 11767 + }, + { + "epoch": 0.9509687064385139, + "grad_norm": 2.340460777282715, + "learning_rate": 5.637895751888269e-06, + "loss": 0.9592, + "step": 11768 + }, + { + "epoch": 0.9510495161518414, + "grad_norm": 2.380915641784668, + "learning_rate": 5.637246731473792e-06, + "loss": 0.9191, + "step": 11769 + }, + { + "epoch": 0.951130325865169, + "grad_norm": 2.7404839992523193, + "learning_rate": 5.636597700144814e-06, + "loss": 0.9549, + "step": 11770 + }, + { + "epoch": 0.9512111355784966, + "grad_norm": 2.454563856124878, + "learning_rate": 5.635948657912449e-06, + "loss": 0.8845, + "step": 11771 + }, + { + "epoch": 0.951291945291824, + "grad_norm": 2.698922634124756, + "learning_rate": 5.635299604787815e-06, + "loss": 0.9333, + "step": 11772 + }, + { + "epoch": 0.9513727550051516, + "grad_norm": 2.2386131286621094, + "learning_rate": 5.634650540782028e-06, + "loss": 0.8995, + "step": 11773 + }, + { + "epoch": 0.9514535647184792, + "grad_norm": 2.602815628051758, + "learning_rate": 5.6340014659062044e-06, + "loss": 0.9921, + "step": 11774 + }, + { + "epoch": 0.9515343744318067, + "grad_norm": 2.311607837677002, + "learning_rate": 5.633352380171464e-06, + "loss": 1.0103, + "step": 11775 + }, + { + "epoch": 0.9516151841451342, + "grad_norm": 2.5385825634002686, + "learning_rate": 5.6327032835889204e-06, + "loss": 0.8515, + "step": 11776 + }, + { + "epoch": 0.9516959938584618, + "grad_norm": 3.2339069843292236, + "learning_rate": 5.6320541761696925e-06, + "loss": 0.9709, + "step": 11777 + }, + { + "epoch": 0.9517768035717893, + "grad_norm": 2.827669143676758, + "learning_rate": 5.631405057924899e-06, + "loss": 0.8341, + "step": 11778 + }, + { + "epoch": 0.9518576132851169, + "grad_norm": 2.6024420261383057, + "learning_rate": 5.630755928865656e-06, + "loss": 0.9085, + "step": 11779 + }, + { + "epoch": 0.9519384229984444, + "grad_norm": 2.5606136322021484, + "learning_rate": 5.630106789003083e-06, + "loss": 0.8564, + "step": 11780 + }, + { + "epoch": 0.9520192327117719, + "grad_norm": 2.8526065349578857, + "learning_rate": 5.629457638348299e-06, + "loss": 0.9309, + "step": 11781 + }, + { + "epoch": 0.9521000424250995, + "grad_norm": 2.884697675704956, + "learning_rate": 5.628808476912417e-06, + "loss": 0.9203, + "step": 11782 + }, + { + "epoch": 0.9521808521384271, + "grad_norm": 2.9439496994018555, + "learning_rate": 5.628159304706564e-06, + "loss": 0.9687, + "step": 11783 + }, + { + "epoch": 0.9522616618517545, + "grad_norm": 2.4576220512390137, + "learning_rate": 5.627510121741852e-06, + "loss": 0.9139, + "step": 11784 + }, + { + "epoch": 0.9523424715650821, + "grad_norm": 3.0397865772247314, + "learning_rate": 5.626860928029403e-06, + "loss": 0.8678, + "step": 11785 + }, + { + "epoch": 0.9524232812784097, + "grad_norm": 2.5104920864105225, + "learning_rate": 5.626211723580335e-06, + "loss": 0.8331, + "step": 11786 + }, + { + "epoch": 0.9525040909917372, + "grad_norm": 2.589230537414551, + "learning_rate": 5.6255625084057685e-06, + "loss": 0.8887, + "step": 11787 + }, + { + "epoch": 0.9525849007050647, + "grad_norm": 2.8371191024780273, + "learning_rate": 5.624913282516822e-06, + "loss": 1.0051, + "step": 11788 + }, + { + "epoch": 0.9526657104183923, + "grad_norm": 3.225444793701172, + "learning_rate": 5.624264045924616e-06, + "loss": 0.9531, + "step": 11789 + }, + { + "epoch": 0.9527465201317198, + "grad_norm": 2.4302711486816406, + "learning_rate": 5.623614798640267e-06, + "loss": 0.9652, + "step": 11790 + }, + { + "epoch": 0.9528273298450474, + "grad_norm": 2.6602742671966553, + "learning_rate": 5.622965540674901e-06, + "loss": 0.9664, + "step": 11791 + }, + { + "epoch": 0.952908139558375, + "grad_norm": 2.643425941467285, + "learning_rate": 5.622316272039633e-06, + "loss": 0.9502, + "step": 11792 + }, + { + "epoch": 0.9529889492717024, + "grad_norm": 2.4053399562835693, + "learning_rate": 5.621666992745586e-06, + "loss": 0.9174, + "step": 11793 + }, + { + "epoch": 0.95306975898503, + "grad_norm": 2.7074623107910156, + "learning_rate": 5.621017702803879e-06, + "loss": 0.8457, + "step": 11794 + }, + { + "epoch": 0.9531505686983576, + "grad_norm": 2.365626573562622, + "learning_rate": 5.620368402225637e-06, + "loss": 0.9687, + "step": 11795 + }, + { + "epoch": 0.953231378411685, + "grad_norm": 2.4549739360809326, + "learning_rate": 5.619719091021973e-06, + "loss": 0.9489, + "step": 11796 + }, + { + "epoch": 0.9533121881250126, + "grad_norm": 3.2549610137939453, + "learning_rate": 5.619069769204017e-06, + "loss": 1.0024, + "step": 11797 + }, + { + "epoch": 0.9533929978383402, + "grad_norm": 2.302922248840332, + "learning_rate": 5.618420436782886e-06, + "loss": 0.9862, + "step": 11798 + }, + { + "epoch": 0.9534738075516677, + "grad_norm": 2.9284064769744873, + "learning_rate": 5.6177710937696996e-06, + "loss": 0.987, + "step": 11799 + }, + { + "epoch": 0.9535546172649952, + "grad_norm": 2.7569353580474854, + "learning_rate": 5.617121740175582e-06, + "loss": 0.9758, + "step": 11800 + }, + { + "epoch": 0.9536354269783228, + "grad_norm": 2.929086446762085, + "learning_rate": 5.616472376011654e-06, + "loss": 0.9325, + "step": 11801 + }, + { + "epoch": 0.9537162366916503, + "grad_norm": 2.611757278442383, + "learning_rate": 5.61582300128904e-06, + "loss": 0.8772, + "step": 11802 + }, + { + "epoch": 0.9537970464049779, + "grad_norm": 2.6273276805877686, + "learning_rate": 5.615173616018861e-06, + "loss": 1.0233, + "step": 11803 + }, + { + "epoch": 0.9538778561183054, + "grad_norm": 2.3987762928009033, + "learning_rate": 5.614524220212236e-06, + "loss": 0.829, + "step": 11804 + }, + { + "epoch": 0.9539586658316329, + "grad_norm": 2.251645088195801, + "learning_rate": 5.613874813880293e-06, + "loss": 0.9795, + "step": 11805 + }, + { + "epoch": 0.9540394755449605, + "grad_norm": 2.7683653831481934, + "learning_rate": 5.613225397034152e-06, + "loss": 0.8576, + "step": 11806 + }, + { + "epoch": 0.9541202852582881, + "grad_norm": 2.711724281311035, + "learning_rate": 5.612575969684936e-06, + "loss": 0.9559, + "step": 11807 + }, + { + "epoch": 0.9542010949716155, + "grad_norm": 2.6666228771209717, + "learning_rate": 5.611926531843768e-06, + "loss": 0.8438, + "step": 11808 + }, + { + "epoch": 0.9542819046849431, + "grad_norm": 3.5984926223754883, + "learning_rate": 5.611277083521772e-06, + "loss": 1.0368, + "step": 11809 + }, + { + "epoch": 0.9543627143982707, + "grad_norm": 2.522277593612671, + "learning_rate": 5.610627624730071e-06, + "loss": 1.0333, + "step": 11810 + }, + { + "epoch": 0.9544435241115982, + "grad_norm": 2.439326763153076, + "learning_rate": 5.609978155479789e-06, + "loss": 0.9332, + "step": 11811 + }, + { + "epoch": 0.9545243338249257, + "grad_norm": 2.7858619689941406, + "learning_rate": 5.60932867578205e-06, + "loss": 0.9524, + "step": 11812 + }, + { + "epoch": 0.9546051435382533, + "grad_norm": 2.5886895656585693, + "learning_rate": 5.608679185647976e-06, + "loss": 0.9871, + "step": 11813 + }, + { + "epoch": 0.9546859532515808, + "grad_norm": 2.249063014984131, + "learning_rate": 5.608029685088694e-06, + "loss": 0.9825, + "step": 11814 + }, + { + "epoch": 0.9547667629649084, + "grad_norm": 2.3512027263641357, + "learning_rate": 5.607380174115328e-06, + "loss": 0.8383, + "step": 11815 + }, + { + "epoch": 0.954847572678236, + "grad_norm": 2.7193336486816406, + "learning_rate": 5.6067306527390005e-06, + "loss": 1.0348, + "step": 11816 + }, + { + "epoch": 0.9549283823915634, + "grad_norm": 2.9844541549682617, + "learning_rate": 5.606081120970838e-06, + "loss": 0.8671, + "step": 11817 + }, + { + "epoch": 0.955009192104891, + "grad_norm": 2.8504726886749268, + "learning_rate": 5.605431578821965e-06, + "loss": 0.9323, + "step": 11818 + }, + { + "epoch": 0.9550900018182186, + "grad_norm": 2.616696834564209, + "learning_rate": 5.604782026303508e-06, + "loss": 0.9002, + "step": 11819 + }, + { + "epoch": 0.955170811531546, + "grad_norm": 3.192596197128296, + "learning_rate": 5.6041324634265895e-06, + "loss": 0.8543, + "step": 11820 + }, + { + "epoch": 0.9552516212448736, + "grad_norm": 2.398898124694824, + "learning_rate": 5.603482890202335e-06, + "loss": 1.0712, + "step": 11821 + }, + { + "epoch": 0.9553324309582012, + "grad_norm": 2.7207298278808594, + "learning_rate": 5.6028333066418725e-06, + "loss": 0.9207, + "step": 11822 + }, + { + "epoch": 0.9554132406715287, + "grad_norm": 2.7692246437072754, + "learning_rate": 5.602183712756328e-06, + "loss": 0.9189, + "step": 11823 + }, + { + "epoch": 0.9554940503848562, + "grad_norm": 2.6773533821105957, + "learning_rate": 5.601534108556824e-06, + "loss": 1.0012, + "step": 11824 + }, + { + "epoch": 0.9555748600981838, + "grad_norm": 2.6970131397247314, + "learning_rate": 5.600884494054491e-06, + "loss": 0.9982, + "step": 11825 + }, + { + "epoch": 0.9556556698115114, + "grad_norm": 2.2516486644744873, + "learning_rate": 5.600234869260451e-06, + "loss": 0.9003, + "step": 11826 + }, + { + "epoch": 0.9557364795248389, + "grad_norm": 2.576528310775757, + "learning_rate": 5.599585234185836e-06, + "loss": 0.9896, + "step": 11827 + }, + { + "epoch": 0.9558172892381664, + "grad_norm": 2.8807623386383057, + "learning_rate": 5.598935588841768e-06, + "loss": 0.885, + "step": 11828 + }, + { + "epoch": 0.955898098951494, + "grad_norm": 2.4815165996551514, + "learning_rate": 5.598285933239373e-06, + "loss": 0.8403, + "step": 11829 + }, + { + "epoch": 0.9559789086648215, + "grad_norm": 2.2415409088134766, + "learning_rate": 5.5976362673897825e-06, + "loss": 0.863, + "step": 11830 + }, + { + "epoch": 0.9560597183781491, + "grad_norm": 2.512897491455078, + "learning_rate": 5.59698659130412e-06, + "loss": 0.9813, + "step": 11831 + }, + { + "epoch": 0.9561405280914766, + "grad_norm": 2.87442684173584, + "learning_rate": 5.596336904993516e-06, + "loss": 0.9838, + "step": 11832 + }, + { + "epoch": 0.9562213378048041, + "grad_norm": 2.697709321975708, + "learning_rate": 5.595687208469096e-06, + "loss": 0.9258, + "step": 11833 + }, + { + "epoch": 0.9563021475181317, + "grad_norm": 2.5676422119140625, + "learning_rate": 5.5950375017419875e-06, + "loss": 0.9948, + "step": 11834 + }, + { + "epoch": 0.9563829572314593, + "grad_norm": 2.5715384483337402, + "learning_rate": 5.5943877848233185e-06, + "loss": 0.911, + "step": 11835 + }, + { + "epoch": 0.9564637669447867, + "grad_norm": 2.583868980407715, + "learning_rate": 5.59373805772422e-06, + "loss": 0.9617, + "step": 11836 + }, + { + "epoch": 0.9565445766581143, + "grad_norm": 2.7021234035491943, + "learning_rate": 5.593088320455815e-06, + "loss": 0.9524, + "step": 11837 + }, + { + "epoch": 0.9566253863714419, + "grad_norm": 2.6744143962860107, + "learning_rate": 5.592438573029236e-06, + "loss": 0.8343, + "step": 11838 + }, + { + "epoch": 0.9567061960847694, + "grad_norm": 2.4851043224334717, + "learning_rate": 5.591788815455611e-06, + "loss": 0.9552, + "step": 11839 + }, + { + "epoch": 0.956787005798097, + "grad_norm": 2.3759748935699463, + "learning_rate": 5.591139047746068e-06, + "loss": 0.9101, + "step": 11840 + }, + { + "epoch": 0.9568678155114245, + "grad_norm": 2.605360746383667, + "learning_rate": 5.590489269911738e-06, + "loss": 0.8584, + "step": 11841 + }, + { + "epoch": 0.956948625224752, + "grad_norm": 2.6257309913635254, + "learning_rate": 5.589839481963745e-06, + "loss": 0.923, + "step": 11842 + }, + { + "epoch": 0.9570294349380796, + "grad_norm": 2.8049874305725098, + "learning_rate": 5.589189683913224e-06, + "loss": 0.9867, + "step": 11843 + }, + { + "epoch": 0.9571102446514071, + "grad_norm": 2.438661575317383, + "learning_rate": 5.588539875771301e-06, + "loss": 1.0174, + "step": 11844 + }, + { + "epoch": 0.9571910543647346, + "grad_norm": 2.449471950531006, + "learning_rate": 5.587890057549108e-06, + "loss": 0.9168, + "step": 11845 + }, + { + "epoch": 0.9572718640780622, + "grad_norm": 2.5123777389526367, + "learning_rate": 5.587240229257773e-06, + "loss": 0.99, + "step": 11846 + }, + { + "epoch": 0.9573526737913898, + "grad_norm": 2.5206658840179443, + "learning_rate": 5.586590390908426e-06, + "loss": 0.8916, + "step": 11847 + }, + { + "epoch": 0.9574334835047172, + "grad_norm": 2.9969146251678467, + "learning_rate": 5.585940542512199e-06, + "loss": 0.9834, + "step": 11848 + }, + { + "epoch": 0.9575142932180448, + "grad_norm": 2.512291193008423, + "learning_rate": 5.585290684080219e-06, + "loss": 0.7877, + "step": 11849 + }, + { + "epoch": 0.9575951029313724, + "grad_norm": 2.7943925857543945, + "learning_rate": 5.584640815623621e-06, + "loss": 0.968, + "step": 11850 + }, + { + "epoch": 0.9576759126446999, + "grad_norm": 2.564974308013916, + "learning_rate": 5.583990937153533e-06, + "loss": 0.8463, + "step": 11851 + }, + { + "epoch": 0.9577567223580274, + "grad_norm": 2.8840105533599854, + "learning_rate": 5.583341048681085e-06, + "loss": 0.908, + "step": 11852 + }, + { + "epoch": 0.957837532071355, + "grad_norm": 2.7245535850524902, + "learning_rate": 5.582691150217408e-06, + "loss": 0.9197, + "step": 11853 + }, + { + "epoch": 0.9579183417846825, + "grad_norm": 2.4554336071014404, + "learning_rate": 5.582041241773637e-06, + "loss": 0.9541, + "step": 11854 + }, + { + "epoch": 0.9579991514980101, + "grad_norm": 2.4145753383636475, + "learning_rate": 5.5813913233609e-06, + "loss": 1.1204, + "step": 11855 + }, + { + "epoch": 0.9580799612113376, + "grad_norm": 2.252882719039917, + "learning_rate": 5.580741394990329e-06, + "loss": 0.9932, + "step": 11856 + }, + { + "epoch": 0.9581607709246651, + "grad_norm": 2.604893445968628, + "learning_rate": 5.580091456673055e-06, + "loss": 0.9917, + "step": 11857 + }, + { + "epoch": 0.9582415806379927, + "grad_norm": 3.008949041366577, + "learning_rate": 5.579441508420213e-06, + "loss": 0.9565, + "step": 11858 + }, + { + "epoch": 0.9583223903513203, + "grad_norm": 2.3516547679901123, + "learning_rate": 5.5787915502429315e-06, + "loss": 0.899, + "step": 11859 + }, + { + "epoch": 0.9584032000646477, + "grad_norm": 2.8255181312561035, + "learning_rate": 5.578141582152344e-06, + "loss": 0.8812, + "step": 11860 + }, + { + "epoch": 0.9584840097779753, + "grad_norm": 2.4160497188568115, + "learning_rate": 5.577491604159583e-06, + "loss": 0.9587, + "step": 11861 + }, + { + "epoch": 0.9585648194913029, + "grad_norm": 2.656268358230591, + "learning_rate": 5.576841616275782e-06, + "loss": 0.8639, + "step": 11862 + }, + { + "epoch": 0.9586456292046304, + "grad_norm": 2.7525899410247803, + "learning_rate": 5.576191618512071e-06, + "loss": 1.0354, + "step": 11863 + }, + { + "epoch": 0.958726438917958, + "grad_norm": 2.9054365158081055, + "learning_rate": 5.575541610879587e-06, + "loss": 0.9397, + "step": 11864 + }, + { + "epoch": 0.9588072486312855, + "grad_norm": 2.5695607662200928, + "learning_rate": 5.57489159338946e-06, + "loss": 0.8876, + "step": 11865 + }, + { + "epoch": 0.958888058344613, + "grad_norm": 2.642263174057007, + "learning_rate": 5.574241566052824e-06, + "loss": 0.9237, + "step": 11866 + }, + { + "epoch": 0.9589688680579406, + "grad_norm": 2.4560844898223877, + "learning_rate": 5.573591528880812e-06, + "loss": 0.8213, + "step": 11867 + }, + { + "epoch": 0.9590496777712681, + "grad_norm": 2.4272077083587646, + "learning_rate": 5.572941481884557e-06, + "loss": 0.9842, + "step": 11868 + }, + { + "epoch": 0.9591304874845956, + "grad_norm": 2.425368547439575, + "learning_rate": 5.572291425075195e-06, + "loss": 0.7658, + "step": 11869 + }, + { + "epoch": 0.9592112971979232, + "grad_norm": 2.9005725383758545, + "learning_rate": 5.5716413584638594e-06, + "loss": 0.888, + "step": 11870 + }, + { + "epoch": 0.9592921069112508, + "grad_norm": 2.5874617099761963, + "learning_rate": 5.570991282061681e-06, + "loss": 0.9456, + "step": 11871 + }, + { + "epoch": 0.9593729166245782, + "grad_norm": 2.476445198059082, + "learning_rate": 5.570341195879799e-06, + "loss": 0.9491, + "step": 11872 + }, + { + "epoch": 0.9594537263379058, + "grad_norm": 2.938046455383301, + "learning_rate": 5.5696910999293444e-06, + "loss": 1.0074, + "step": 11873 + }, + { + "epoch": 0.9595345360512334, + "grad_norm": 2.6625962257385254, + "learning_rate": 5.569040994221453e-06, + "loss": 1.0006, + "step": 11874 + }, + { + "epoch": 0.9596153457645609, + "grad_norm": 2.6659927368164062, + "learning_rate": 5.568390878767258e-06, + "loss": 0.9031, + "step": 11875 + }, + { + "epoch": 0.9596961554778884, + "grad_norm": 2.371853828430176, + "learning_rate": 5.567740753577898e-06, + "loss": 0.9263, + "step": 11876 + }, + { + "epoch": 0.959776965191216, + "grad_norm": 2.314671516418457, + "learning_rate": 5.567090618664503e-06, + "loss": 0.9129, + "step": 11877 + }, + { + "epoch": 0.9598577749045435, + "grad_norm": 2.497958183288574, + "learning_rate": 5.566440474038213e-06, + "loss": 0.8058, + "step": 11878 + }, + { + "epoch": 0.9599385846178711, + "grad_norm": 2.8782222270965576, + "learning_rate": 5.56579031971016e-06, + "loss": 0.9175, + "step": 11879 + }, + { + "epoch": 0.9600193943311986, + "grad_norm": 3.011770009994507, + "learning_rate": 5.565140155691482e-06, + "loss": 0.9486, + "step": 11880 + }, + { + "epoch": 0.9601002040445261, + "grad_norm": 2.551955223083496, + "learning_rate": 5.564489981993313e-06, + "loss": 0.8927, + "step": 11881 + }, + { + "epoch": 0.9601810137578537, + "grad_norm": 2.612999200820923, + "learning_rate": 5.563839798626789e-06, + "loss": 0.9371, + "step": 11882 + }, + { + "epoch": 0.9602618234711813, + "grad_norm": 3.1354992389678955, + "learning_rate": 5.5631896056030475e-06, + "loss": 0.976, + "step": 11883 + }, + { + "epoch": 0.9603426331845087, + "grad_norm": 2.590576171875, + "learning_rate": 5.562539402933225e-06, + "loss": 0.8114, + "step": 11884 + }, + { + "epoch": 0.9604234428978363, + "grad_norm": 2.5892324447631836, + "learning_rate": 5.561889190628454e-06, + "loss": 0.9457, + "step": 11885 + }, + { + "epoch": 0.9605042526111639, + "grad_norm": 2.3162331581115723, + "learning_rate": 5.5612389686998755e-06, + "loss": 0.9203, + "step": 11886 + }, + { + "epoch": 0.9605850623244914, + "grad_norm": 2.729736566543579, + "learning_rate": 5.5605887371586254e-06, + "loss": 0.8252, + "step": 11887 + }, + { + "epoch": 0.960665872037819, + "grad_norm": 2.5039331912994385, + "learning_rate": 5.559938496015838e-06, + "loss": 0.8686, + "step": 11888 + }, + { + "epoch": 0.9607466817511465, + "grad_norm": 2.8261559009552, + "learning_rate": 5.559288245282652e-06, + "loss": 0.828, + "step": 11889 + }, + { + "epoch": 0.960827491464474, + "grad_norm": 2.9055511951446533, + "learning_rate": 5.558637984970208e-06, + "loss": 0.9344, + "step": 11890 + }, + { + "epoch": 0.9609083011778016, + "grad_norm": 2.3357317447662354, + "learning_rate": 5.557987715089637e-06, + "loss": 1.0102, + "step": 11891 + }, + { + "epoch": 0.9609891108911292, + "grad_norm": 2.7615742683410645, + "learning_rate": 5.55733743565208e-06, + "loss": 0.9038, + "step": 11892 + }, + { + "epoch": 0.9610699206044566, + "grad_norm": 2.6449334621429443, + "learning_rate": 5.556687146668675e-06, + "loss": 0.9393, + "step": 11893 + }, + { + "epoch": 0.9611507303177842, + "grad_norm": 2.7535130977630615, + "learning_rate": 5.55603684815056e-06, + "loss": 1.0233, + "step": 11894 + }, + { + "epoch": 0.9612315400311118, + "grad_norm": 2.990295886993408, + "learning_rate": 5.555386540108872e-06, + "loss": 1.0242, + "step": 11895 + }, + { + "epoch": 0.9613123497444392, + "grad_norm": 2.797285318374634, + "learning_rate": 5.55473622255475e-06, + "loss": 0.9128, + "step": 11896 + }, + { + "epoch": 0.9613931594577668, + "grad_norm": 2.8205981254577637, + "learning_rate": 5.55408589549933e-06, + "loss": 0.8859, + "step": 11897 + }, + { + "epoch": 0.9614739691710944, + "grad_norm": 2.5844244956970215, + "learning_rate": 5.5534355589537545e-06, + "loss": 0.8984, + "step": 11898 + }, + { + "epoch": 0.9615547788844219, + "grad_norm": 2.822474241256714, + "learning_rate": 5.552785212929159e-06, + "loss": 0.9482, + "step": 11899 + }, + { + "epoch": 0.9616355885977494, + "grad_norm": 2.5194263458251953, + "learning_rate": 5.5521348574366864e-06, + "loss": 0.912, + "step": 11900 + }, + { + "epoch": 0.961716398311077, + "grad_norm": 3.069188117980957, + "learning_rate": 5.55148449248747e-06, + "loss": 0.96, + "step": 11901 + }, + { + "epoch": 0.9617972080244045, + "grad_norm": 2.8065030574798584, + "learning_rate": 5.5508341180926524e-06, + "loss": 1.038, + "step": 11902 + }, + { + "epoch": 0.9618780177377321, + "grad_norm": 2.3951709270477295, + "learning_rate": 5.5501837342633725e-06, + "loss": 0.9176, + "step": 11903 + }, + { + "epoch": 0.9619588274510597, + "grad_norm": 3.390561103820801, + "learning_rate": 5.54953334101077e-06, + "loss": 0.8808, + "step": 11904 + }, + { + "epoch": 0.9620396371643871, + "grad_norm": 2.726771593093872, + "learning_rate": 5.548882938345984e-06, + "loss": 0.9092, + "step": 11905 + }, + { + "epoch": 0.9621204468777147, + "grad_norm": 2.804781436920166, + "learning_rate": 5.5482325262801554e-06, + "loss": 0.8905, + "step": 11906 + }, + { + "epoch": 0.9622012565910423, + "grad_norm": 2.394566535949707, + "learning_rate": 5.547582104824423e-06, + "loss": 0.8691, + "step": 11907 + }, + { + "epoch": 0.9622820663043697, + "grad_norm": 2.655407428741455, + "learning_rate": 5.546931673989927e-06, + "loss": 0.9035, + "step": 11908 + }, + { + "epoch": 0.9623628760176973, + "grad_norm": 2.811229944229126, + "learning_rate": 5.546281233787809e-06, + "loss": 0.9811, + "step": 11909 + }, + { + "epoch": 0.9624436857310249, + "grad_norm": 2.610511064529419, + "learning_rate": 5.545630784229208e-06, + "loss": 0.9878, + "step": 11910 + }, + { + "epoch": 0.9625244954443524, + "grad_norm": 2.0195400714874268, + "learning_rate": 5.544980325325264e-06, + "loss": 0.9336, + "step": 11911 + }, + { + "epoch": 0.96260530515768, + "grad_norm": 3.0447850227355957, + "learning_rate": 5.5443298570871205e-06, + "loss": 0.952, + "step": 11912 + }, + { + "epoch": 0.9626861148710075, + "grad_norm": 2.9219303131103516, + "learning_rate": 5.543679379525917e-06, + "loss": 0.9372, + "step": 11913 + }, + { + "epoch": 0.962766924584335, + "grad_norm": 2.240722894668579, + "learning_rate": 5.543028892652794e-06, + "loss": 0.8187, + "step": 11914 + }, + { + "epoch": 0.9628477342976626, + "grad_norm": 2.5248091220855713, + "learning_rate": 5.542378396478894e-06, + "loss": 0.8664, + "step": 11915 + }, + { + "epoch": 0.9629285440109902, + "grad_norm": 2.4241583347320557, + "learning_rate": 5.541727891015357e-06, + "loss": 1.0349, + "step": 11916 + }, + { + "epoch": 0.9630093537243176, + "grad_norm": 3.2690250873565674, + "learning_rate": 5.541077376273327e-06, + "loss": 0.8942, + "step": 11917 + }, + { + "epoch": 0.9630901634376452, + "grad_norm": 2.689641237258911, + "learning_rate": 5.540426852263943e-06, + "loss": 1.0096, + "step": 11918 + }, + { + "epoch": 0.9631709731509728, + "grad_norm": 2.767505168914795, + "learning_rate": 5.5397763189983475e-06, + "loss": 0.8507, + "step": 11919 + }, + { + "epoch": 0.9632517828643002, + "grad_norm": 3.236069440841675, + "learning_rate": 5.539125776487684e-06, + "loss": 1.0064, + "step": 11920 + }, + { + "epoch": 0.9633325925776278, + "grad_norm": 2.6238200664520264, + "learning_rate": 5.538475224743094e-06, + "loss": 1.0667, + "step": 11921 + }, + { + "epoch": 0.9634134022909554, + "grad_norm": 3.1788158416748047, + "learning_rate": 5.537824663775719e-06, + "loss": 0.9107, + "step": 11922 + }, + { + "epoch": 0.9634942120042829, + "grad_norm": 2.5138230323791504, + "learning_rate": 5.5371740935967026e-06, + "loss": 0.9272, + "step": 11923 + }, + { + "epoch": 0.9635750217176104, + "grad_norm": 3.8556137084960938, + "learning_rate": 5.536523514217186e-06, + "loss": 0.8909, + "step": 11924 + }, + { + "epoch": 0.963655831430938, + "grad_norm": 2.61544132232666, + "learning_rate": 5.5358729256483145e-06, + "loss": 0.935, + "step": 11925 + }, + { + "epoch": 0.9637366411442655, + "grad_norm": 2.8928191661834717, + "learning_rate": 5.53522232790123e-06, + "loss": 0.9328, + "step": 11926 + }, + { + "epoch": 0.9638174508575931, + "grad_norm": 2.9073996543884277, + "learning_rate": 5.534571720987076e-06, + "loss": 0.8416, + "step": 11927 + }, + { + "epoch": 0.9638982605709207, + "grad_norm": 3.052318572998047, + "learning_rate": 5.533921104916993e-06, + "loss": 0.8459, + "step": 11928 + }, + { + "epoch": 0.9639790702842481, + "grad_norm": 2.600609302520752, + "learning_rate": 5.5332704797021295e-06, + "loss": 0.8499, + "step": 11929 + }, + { + "epoch": 0.9640598799975757, + "grad_norm": 2.682948589324951, + "learning_rate": 5.532619845353624e-06, + "loss": 0.8424, + "step": 11930 + }, + { + "epoch": 0.9641406897109033, + "grad_norm": 2.9137091636657715, + "learning_rate": 5.531969201882625e-06, + "loss": 0.9018, + "step": 11931 + }, + { + "epoch": 0.9642214994242307, + "grad_norm": 2.423874616622925, + "learning_rate": 5.531318549300273e-06, + "loss": 1.011, + "step": 11932 + }, + { + "epoch": 0.9643023091375583, + "grad_norm": 2.705209255218506, + "learning_rate": 5.5306678876177135e-06, + "loss": 0.941, + "step": 11933 + }, + { + "epoch": 0.9643831188508859, + "grad_norm": 2.503817319869995, + "learning_rate": 5.530017216846091e-06, + "loss": 0.9454, + "step": 11934 + }, + { + "epoch": 0.9644639285642134, + "grad_norm": 2.6077113151550293, + "learning_rate": 5.529366536996549e-06, + "loss": 0.9686, + "step": 11935 + }, + { + "epoch": 0.964544738277541, + "grad_norm": 2.6398589611053467, + "learning_rate": 5.528715848080233e-06, + "loss": 0.9402, + "step": 11936 + }, + { + "epoch": 0.9646255479908685, + "grad_norm": 2.7943220138549805, + "learning_rate": 5.5280651501082886e-06, + "loss": 0.8676, + "step": 11937 + }, + { + "epoch": 0.964706357704196, + "grad_norm": 3.4839236736297607, + "learning_rate": 5.527414443091856e-06, + "loss": 0.9526, + "step": 11938 + }, + { + "epoch": 0.9647871674175236, + "grad_norm": 2.680088996887207, + "learning_rate": 5.526763727042087e-06, + "loss": 0.8524, + "step": 11939 + }, + { + "epoch": 0.9648679771308512, + "grad_norm": 2.4965760707855225, + "learning_rate": 5.5261130019701225e-06, + "loss": 0.9418, + "step": 11940 + }, + { + "epoch": 0.9649487868441786, + "grad_norm": 2.5393991470336914, + "learning_rate": 5.525462267887108e-06, + "loss": 0.8356, + "step": 11941 + }, + { + "epoch": 0.9650295965575062, + "grad_norm": 2.610919952392578, + "learning_rate": 5.524811524804191e-06, + "loss": 0.9368, + "step": 11942 + }, + { + "epoch": 0.9651104062708338, + "grad_norm": 2.57777738571167, + "learning_rate": 5.524160772732517e-06, + "loss": 0.9092, + "step": 11943 + }, + { + "epoch": 0.9651912159841612, + "grad_norm": 2.3386118412017822, + "learning_rate": 5.5235100116832275e-06, + "loss": 0.8654, + "step": 11944 + }, + { + "epoch": 0.9652720256974888, + "grad_norm": 2.5760498046875, + "learning_rate": 5.522859241667475e-06, + "loss": 0.916, + "step": 11945 + }, + { + "epoch": 0.9653528354108164, + "grad_norm": 2.5684726238250732, + "learning_rate": 5.5222084626964e-06, + "loss": 0.8797, + "step": 11946 + }, + { + "epoch": 0.9654336451241439, + "grad_norm": 2.869614601135254, + "learning_rate": 5.521557674781153e-06, + "loss": 0.967, + "step": 11947 + }, + { + "epoch": 0.9655144548374714, + "grad_norm": 2.1509933471679688, + "learning_rate": 5.520906877932877e-06, + "loss": 0.9737, + "step": 11948 + }, + { + "epoch": 0.965595264550799, + "grad_norm": 2.528146982192993, + "learning_rate": 5.520256072162722e-06, + "loss": 0.8562, + "step": 11949 + }, + { + "epoch": 0.9656760742641265, + "grad_norm": 2.6308462619781494, + "learning_rate": 5.519605257481832e-06, + "loss": 0.994, + "step": 11950 + }, + { + "epoch": 0.9657568839774541, + "grad_norm": 3.2644970417022705, + "learning_rate": 5.518954433901356e-06, + "loss": 1.0149, + "step": 11951 + }, + { + "epoch": 0.9658376936907817, + "grad_norm": 3.344829559326172, + "learning_rate": 5.518303601432437e-06, + "loss": 1.0481, + "step": 11952 + }, + { + "epoch": 0.9659185034041091, + "grad_norm": 2.8689024448394775, + "learning_rate": 5.517652760086227e-06, + "loss": 1.0843, + "step": 11953 + }, + { + "epoch": 0.9659993131174367, + "grad_norm": 2.743041753768921, + "learning_rate": 5.517001909873871e-06, + "loss": 0.8881, + "step": 11954 + }, + { + "epoch": 0.9660801228307643, + "grad_norm": 2.755493640899658, + "learning_rate": 5.516351050806518e-06, + "loss": 0.9139, + "step": 11955 + }, + { + "epoch": 0.9661609325440919, + "grad_norm": 2.8872082233428955, + "learning_rate": 5.515700182895314e-06, + "loss": 0.848, + "step": 11956 + }, + { + "epoch": 0.9662417422574193, + "grad_norm": 2.6378605365753174, + "learning_rate": 5.515049306151408e-06, + "loss": 0.8225, + "step": 11957 + }, + { + "epoch": 0.9663225519707469, + "grad_norm": 2.765263557434082, + "learning_rate": 5.514398420585945e-06, + "loss": 0.8984, + "step": 11958 + }, + { + "epoch": 0.9664033616840745, + "grad_norm": 2.951899290084839, + "learning_rate": 5.513747526210077e-06, + "loss": 0.942, + "step": 11959 + }, + { + "epoch": 0.966484171397402, + "grad_norm": 2.599083423614502, + "learning_rate": 5.51309662303495e-06, + "loss": 0.8274, + "step": 11960 + }, + { + "epoch": 0.9665649811107295, + "grad_norm": 2.4001996517181396, + "learning_rate": 5.512445711071714e-06, + "loss": 0.7794, + "step": 11961 + }, + { + "epoch": 0.9666457908240571, + "grad_norm": 2.3266730308532715, + "learning_rate": 5.511794790331516e-06, + "loss": 1.0175, + "step": 11962 + }, + { + "epoch": 0.9667266005373846, + "grad_norm": 3.0140881538391113, + "learning_rate": 5.511143860825506e-06, + "loss": 1.0362, + "step": 11963 + }, + { + "epoch": 0.9668074102507122, + "grad_norm": 2.9782848358154297, + "learning_rate": 5.510492922564832e-06, + "loss": 0.9554, + "step": 11964 + }, + { + "epoch": 0.9668882199640397, + "grad_norm": 3.11130690574646, + "learning_rate": 5.509841975560644e-06, + "loss": 0.8305, + "step": 11965 + }, + { + "epoch": 0.9669690296773672, + "grad_norm": 2.929426670074463, + "learning_rate": 5.509191019824087e-06, + "loss": 0.8546, + "step": 11966 + }, + { + "epoch": 0.9670498393906948, + "grad_norm": 2.5395426750183105, + "learning_rate": 5.508540055366317e-06, + "loss": 0.8987, + "step": 11967 + }, + { + "epoch": 0.9671306491040224, + "grad_norm": 3.2608258724212646, + "learning_rate": 5.5078890821984795e-06, + "loss": 0.9975, + "step": 11968 + }, + { + "epoch": 0.9672114588173498, + "grad_norm": 2.535095691680908, + "learning_rate": 5.5072381003317245e-06, + "loss": 0.8209, + "step": 11969 + }, + { + "epoch": 0.9672922685306774, + "grad_norm": 2.618473768234253, + "learning_rate": 5.5065871097772015e-06, + "loss": 0.9664, + "step": 11970 + }, + { + "epoch": 0.967373078244005, + "grad_norm": 2.96295428276062, + "learning_rate": 5.505936110546061e-06, + "loss": 0.9803, + "step": 11971 + }, + { + "epoch": 0.9674538879573324, + "grad_norm": 2.8674564361572266, + "learning_rate": 5.505285102649452e-06, + "loss": 0.8752, + "step": 11972 + }, + { + "epoch": 0.96753469767066, + "grad_norm": 3.0477328300476074, + "learning_rate": 5.504634086098527e-06, + "loss": 0.8243, + "step": 11973 + }, + { + "epoch": 0.9676155073839876, + "grad_norm": 3.2491588592529297, + "learning_rate": 5.503983060904436e-06, + "loss": 0.9981, + "step": 11974 + }, + { + "epoch": 0.9676963170973151, + "grad_norm": 2.4708495140075684, + "learning_rate": 5.5033320270783265e-06, + "loss": 0.8138, + "step": 11975 + }, + { + "epoch": 0.9677771268106427, + "grad_norm": 2.916604518890381, + "learning_rate": 5.502680984631351e-06, + "loss": 0.8819, + "step": 11976 + }, + { + "epoch": 0.9678579365239702, + "grad_norm": 2.530425786972046, + "learning_rate": 5.502029933574662e-06, + "loss": 0.8206, + "step": 11977 + }, + { + "epoch": 0.9679387462372977, + "grad_norm": 2.459057331085205, + "learning_rate": 5.501378873919407e-06, + "loss": 0.8387, + "step": 11978 + }, + { + "epoch": 0.9680195559506253, + "grad_norm": 2.8737850189208984, + "learning_rate": 5.50072780567674e-06, + "loss": 0.8975, + "step": 11979 + }, + { + "epoch": 0.9681003656639529, + "grad_norm": 2.7501003742218018, + "learning_rate": 5.50007672885781e-06, + "loss": 0.951, + "step": 11980 + }, + { + "epoch": 0.9681811753772803, + "grad_norm": 2.6252670288085938, + "learning_rate": 5.499425643473771e-06, + "loss": 0.9224, + "step": 11981 + }, + { + "epoch": 0.9682619850906079, + "grad_norm": 2.9512546062469482, + "learning_rate": 5.498774549535773e-06, + "loss": 0.8445, + "step": 11982 + }, + { + "epoch": 0.9683427948039355, + "grad_norm": 2.7344777584075928, + "learning_rate": 5.498123447054966e-06, + "loss": 0.7579, + "step": 11983 + }, + { + "epoch": 0.968423604517263, + "grad_norm": 2.848266124725342, + "learning_rate": 5.4974723360425066e-06, + "loss": 0.8675, + "step": 11984 + }, + { + "epoch": 0.9685044142305905, + "grad_norm": 2.897792100906372, + "learning_rate": 5.4968212165095415e-06, + "loss": 0.9332, + "step": 11985 + }, + { + "epoch": 0.9685852239439181, + "grad_norm": 2.544478416442871, + "learning_rate": 5.496170088467225e-06, + "loss": 0.9673, + "step": 11986 + }, + { + "epoch": 0.9686660336572456, + "grad_norm": 2.631855010986328, + "learning_rate": 5.495518951926709e-06, + "loss": 0.87, + "step": 11987 + }, + { + "epoch": 0.9687468433705732, + "grad_norm": 2.7205095291137695, + "learning_rate": 5.494867806899149e-06, + "loss": 0.9345, + "step": 11988 + }, + { + "epoch": 0.9688276530839007, + "grad_norm": 2.4885149002075195, + "learning_rate": 5.494216653395691e-06, + "loss": 0.8209, + "step": 11989 + }, + { + "epoch": 0.9689084627972282, + "grad_norm": 2.4030749797821045, + "learning_rate": 5.493565491427495e-06, + "loss": 1.019, + "step": 11990 + }, + { + "epoch": 0.9689892725105558, + "grad_norm": 3.011697292327881, + "learning_rate": 5.492914321005707e-06, + "loss": 0.8068, + "step": 11991 + }, + { + "epoch": 0.9690700822238834, + "grad_norm": 2.883098602294922, + "learning_rate": 5.492263142141486e-06, + "loss": 0.9623, + "step": 11992 + }, + { + "epoch": 0.9691508919372108, + "grad_norm": 2.875462293624878, + "learning_rate": 5.491611954845981e-06, + "loss": 0.9659, + "step": 11993 + }, + { + "epoch": 0.9692317016505384, + "grad_norm": 2.7993433475494385, + "learning_rate": 5.4909607591303474e-06, + "loss": 0.9546, + "step": 11994 + }, + { + "epoch": 0.969312511363866, + "grad_norm": 2.930129289627075, + "learning_rate": 5.490309555005738e-06, + "loss": 0.9378, + "step": 11995 + }, + { + "epoch": 0.9693933210771934, + "grad_norm": 2.551013231277466, + "learning_rate": 5.489658342483306e-06, + "loss": 0.9169, + "step": 11996 + }, + { + "epoch": 0.969474130790521, + "grad_norm": 2.400601863861084, + "learning_rate": 5.489007121574205e-06, + "loss": 0.9753, + "step": 11997 + }, + { + "epoch": 0.9695549405038486, + "grad_norm": 2.214895248413086, + "learning_rate": 5.488355892289591e-06, + "loss": 1.0205, + "step": 11998 + }, + { + "epoch": 0.9696357502171761, + "grad_norm": 2.7257273197174072, + "learning_rate": 5.4877046546406146e-06, + "loss": 0.8608, + "step": 11999 + }, + { + "epoch": 0.9697165599305037, + "grad_norm": 2.630215883255005, + "learning_rate": 5.487053408638431e-06, + "loss": 0.9609, + "step": 12000 + }, + { + "epoch": 0.9697165599305037, + "eval_loss": 0.7585760354995728, + "eval_runtime": 815.1901, + "eval_samples_per_second": 102.266, + "eval_steps_per_second": 12.784, + "step": 12000 + }, + { + "epoch": 0.9697973696438312, + "grad_norm": 3.021000385284424, + "learning_rate": 5.486402154294196e-06, + "loss": 0.8275, + "step": 12001 + }, + { + "epoch": 0.9698781793571587, + "grad_norm": 2.782231092453003, + "learning_rate": 5.485750891619064e-06, + "loss": 0.8195, + "step": 12002 + }, + { + "epoch": 0.9699589890704863, + "grad_norm": 2.2484138011932373, + "learning_rate": 5.4850996206241855e-06, + "loss": 0.9467, + "step": 12003 + }, + { + "epoch": 0.9700397987838139, + "grad_norm": 2.9472787380218506, + "learning_rate": 5.484448341320722e-06, + "loss": 1.0583, + "step": 12004 + }, + { + "epoch": 0.9701206084971413, + "grad_norm": 3.0565943717956543, + "learning_rate": 5.4837970537198214e-06, + "loss": 0.8226, + "step": 12005 + }, + { + "epoch": 0.9702014182104689, + "grad_norm": 2.46749210357666, + "learning_rate": 5.483145757832645e-06, + "loss": 0.8567, + "step": 12006 + }, + { + "epoch": 0.9702822279237965, + "grad_norm": 2.705247402191162, + "learning_rate": 5.4824944536703424e-06, + "loss": 0.8806, + "step": 12007 + }, + { + "epoch": 0.970363037637124, + "grad_norm": 2.4201948642730713, + "learning_rate": 5.481843141244073e-06, + "loss": 0.8836, + "step": 12008 + }, + { + "epoch": 0.9704438473504515, + "grad_norm": 2.4932138919830322, + "learning_rate": 5.48119182056499e-06, + "loss": 0.9435, + "step": 12009 + }, + { + "epoch": 0.9705246570637791, + "grad_norm": 2.6153910160064697, + "learning_rate": 5.480540491644251e-06, + "loss": 0.7817, + "step": 12010 + }, + { + "epoch": 0.9706054667771066, + "grad_norm": 2.635456085205078, + "learning_rate": 5.479889154493008e-06, + "loss": 0.9235, + "step": 12011 + }, + { + "epoch": 0.9706862764904342, + "grad_norm": 2.6182751655578613, + "learning_rate": 5.479237809122421e-06, + "loss": 0.906, + "step": 12012 + }, + { + "epoch": 0.9707670862037617, + "grad_norm": 2.5122549533843994, + "learning_rate": 5.478586455543642e-06, + "loss": 0.9398, + "step": 12013 + }, + { + "epoch": 0.9708478959170892, + "grad_norm": 2.6415774822235107, + "learning_rate": 5.47793509376783e-06, + "loss": 0.9097, + "step": 12014 + }, + { + "epoch": 0.9709287056304168, + "grad_norm": 2.48339581489563, + "learning_rate": 5.47728372380614e-06, + "loss": 0.9476, + "step": 12015 + }, + { + "epoch": 0.9710095153437444, + "grad_norm": 2.688344955444336, + "learning_rate": 5.476632345669731e-06, + "loss": 0.9421, + "step": 12016 + }, + { + "epoch": 0.9710903250570718, + "grad_norm": 2.872114896774292, + "learning_rate": 5.475980959369754e-06, + "loss": 1.0038, + "step": 12017 + }, + { + "epoch": 0.9711711347703994, + "grad_norm": 2.2378416061401367, + "learning_rate": 5.475329564917372e-06, + "loss": 0.8921, + "step": 12018 + }, + { + "epoch": 0.971251944483727, + "grad_norm": 2.508561849594116, + "learning_rate": 5.4746781623237365e-06, + "loss": 0.9155, + "step": 12019 + }, + { + "epoch": 0.9713327541970544, + "grad_norm": 2.6900382041931152, + "learning_rate": 5.474026751600009e-06, + "loss": 0.9107, + "step": 12020 + }, + { + "epoch": 0.971413563910382, + "grad_norm": 2.8969435691833496, + "learning_rate": 5.473375332757344e-06, + "loss": 0.7556, + "step": 12021 + }, + { + "epoch": 0.9714943736237096, + "grad_norm": 2.584876775741577, + "learning_rate": 5.472723905806898e-06, + "loss": 0.8506, + "step": 12022 + }, + { + "epoch": 0.9715751833370371, + "grad_norm": 2.85139799118042, + "learning_rate": 5.472072470759829e-06, + "loss": 0.8943, + "step": 12023 + }, + { + "epoch": 0.9716559930503647, + "grad_norm": 2.6479482650756836, + "learning_rate": 5.471421027627297e-06, + "loss": 0.9549, + "step": 12024 + }, + { + "epoch": 0.9717368027636922, + "grad_norm": 2.532615900039673, + "learning_rate": 5.470769576420456e-06, + "loss": 0.879, + "step": 12025 + }, + { + "epoch": 0.9718176124770197, + "grad_norm": 3.0018045902252197, + "learning_rate": 5.470118117150467e-06, + "loss": 0.8712, + "step": 12026 + }, + { + "epoch": 0.9718984221903473, + "grad_norm": 2.863424777984619, + "learning_rate": 5.469466649828487e-06, + "loss": 0.8958, + "step": 12027 + }, + { + "epoch": 0.9719792319036749, + "grad_norm": 2.624499797821045, + "learning_rate": 5.468815174465672e-06, + "loss": 0.9215, + "step": 12028 + }, + { + "epoch": 0.9720600416170023, + "grad_norm": 2.3520593643188477, + "learning_rate": 5.46816369107318e-06, + "loss": 0.7808, + "step": 12029 + }, + { + "epoch": 0.9721408513303299, + "grad_norm": 3.0101473331451416, + "learning_rate": 5.4675121996621736e-06, + "loss": 0.8992, + "step": 12030 + }, + { + "epoch": 0.9722216610436575, + "grad_norm": 2.4136831760406494, + "learning_rate": 5.466860700243808e-06, + "loss": 0.9802, + "step": 12031 + }, + { + "epoch": 0.972302470756985, + "grad_norm": 2.4630825519561768, + "learning_rate": 5.466209192829244e-06, + "loss": 0.9886, + "step": 12032 + }, + { + "epoch": 0.9723832804703125, + "grad_norm": 2.6207082271575928, + "learning_rate": 5.465557677429636e-06, + "loss": 0.9481, + "step": 12033 + }, + { + "epoch": 0.9724640901836401, + "grad_norm": 2.466914415359497, + "learning_rate": 5.464906154056147e-06, + "loss": 0.9045, + "step": 12034 + }, + { + "epoch": 0.9725448998969676, + "grad_norm": 2.6892282962799072, + "learning_rate": 5.464254622719936e-06, + "loss": 0.9514, + "step": 12035 + }, + { + "epoch": 0.9726257096102952, + "grad_norm": 2.581954002380371, + "learning_rate": 5.4636030834321595e-06, + "loss": 0.8804, + "step": 12036 + }, + { + "epoch": 0.9727065193236227, + "grad_norm": 2.4629170894622803, + "learning_rate": 5.462951536203979e-06, + "loss": 0.9844, + "step": 12037 + }, + { + "epoch": 0.9727873290369502, + "grad_norm": 2.5060341358184814, + "learning_rate": 5.462299981046553e-06, + "loss": 0.9414, + "step": 12038 + }, + { + "epoch": 0.9728681387502778, + "grad_norm": 2.9171016216278076, + "learning_rate": 5.46164841797104e-06, + "loss": 0.9069, + "step": 12039 + }, + { + "epoch": 0.9729489484636054, + "grad_norm": 2.539320468902588, + "learning_rate": 5.460996846988602e-06, + "loss": 0.8887, + "step": 12040 + }, + { + "epoch": 0.9730297581769328, + "grad_norm": 2.4009783267974854, + "learning_rate": 5.460345268110399e-06, + "loss": 0.8742, + "step": 12041 + }, + { + "epoch": 0.9731105678902604, + "grad_norm": 2.4767725467681885, + "learning_rate": 5.459693681347588e-06, + "loss": 0.9974, + "step": 12042 + }, + { + "epoch": 0.973191377603588, + "grad_norm": 3.1554806232452393, + "learning_rate": 5.4590420867113325e-06, + "loss": 0.9001, + "step": 12043 + }, + { + "epoch": 0.9732721873169154, + "grad_norm": 2.9938976764678955, + "learning_rate": 5.45839048421279e-06, + "loss": 0.9279, + "step": 12044 + }, + { + "epoch": 0.973352997030243, + "grad_norm": 2.3939712047576904, + "learning_rate": 5.457738873863122e-06, + "loss": 0.9318, + "step": 12045 + }, + { + "epoch": 0.9734338067435706, + "grad_norm": 2.897372007369995, + "learning_rate": 5.45708725567349e-06, + "loss": 0.9627, + "step": 12046 + }, + { + "epoch": 0.9735146164568981, + "grad_norm": 2.5381247997283936, + "learning_rate": 5.4564356296550524e-06, + "loss": 0.8693, + "step": 12047 + }, + { + "epoch": 0.9735954261702257, + "grad_norm": 2.783339262008667, + "learning_rate": 5.455783995818972e-06, + "loss": 0.963, + "step": 12048 + }, + { + "epoch": 0.9736762358835532, + "grad_norm": 2.798495054244995, + "learning_rate": 5.45513235417641e-06, + "loss": 0.9404, + "step": 12049 + }, + { + "epoch": 0.9737570455968807, + "grad_norm": 2.6823842525482178, + "learning_rate": 5.454480704738524e-06, + "loss": 0.9302, + "step": 12050 + }, + { + "epoch": 0.9738378553102083, + "grad_norm": 2.952755928039551, + "learning_rate": 5.45382904751648e-06, + "loss": 0.9424, + "step": 12051 + }, + { + "epoch": 0.9739186650235359, + "grad_norm": 2.4954073429107666, + "learning_rate": 5.453177382521436e-06, + "loss": 1.122, + "step": 12052 + }, + { + "epoch": 0.9739994747368633, + "grad_norm": 2.240203857421875, + "learning_rate": 5.452525709764555e-06, + "loss": 1.0148, + "step": 12053 + }, + { + "epoch": 0.9740802844501909, + "grad_norm": 2.3791136741638184, + "learning_rate": 5.451874029256997e-06, + "loss": 0.9732, + "step": 12054 + }, + { + "epoch": 0.9741610941635185, + "grad_norm": 2.5452420711517334, + "learning_rate": 5.4512223410099264e-06, + "loss": 0.8804, + "step": 12055 + }, + { + "epoch": 0.974241903876846, + "grad_norm": 2.996633291244507, + "learning_rate": 5.450570645034501e-06, + "loss": 0.9005, + "step": 12056 + }, + { + "epoch": 0.9743227135901735, + "grad_norm": 2.428724527359009, + "learning_rate": 5.449918941341887e-06, + "loss": 0.9323, + "step": 12057 + }, + { + "epoch": 0.9744035233035011, + "grad_norm": 2.5964088439941406, + "learning_rate": 5.4492672299432424e-06, + "loss": 0.974, + "step": 12058 + }, + { + "epoch": 0.9744843330168286, + "grad_norm": 3.083604097366333, + "learning_rate": 5.448615510849735e-06, + "loss": 0.9759, + "step": 12059 + }, + { + "epoch": 0.9745651427301562, + "grad_norm": 2.5858798027038574, + "learning_rate": 5.447963784072522e-06, + "loss": 0.9801, + "step": 12060 + }, + { + "epoch": 0.9746459524434837, + "grad_norm": 2.552480697631836, + "learning_rate": 5.447312049622767e-06, + "loss": 0.8827, + "step": 12061 + }, + { + "epoch": 0.9747267621568112, + "grad_norm": 2.3750808238983154, + "learning_rate": 5.446660307511635e-06, + "loss": 0.8882, + "step": 12062 + }, + { + "epoch": 0.9748075718701388, + "grad_norm": 2.459052085876465, + "learning_rate": 5.446008557750288e-06, + "loss": 0.9051, + "step": 12063 + }, + { + "epoch": 0.9748883815834664, + "grad_norm": 2.8203341960906982, + "learning_rate": 5.445356800349886e-06, + "loss": 0.8869, + "step": 12064 + }, + { + "epoch": 0.9749691912967938, + "grad_norm": 2.5629642009735107, + "learning_rate": 5.444705035321596e-06, + "loss": 0.929, + "step": 12065 + }, + { + "epoch": 0.9750500010101214, + "grad_norm": 2.6861367225646973, + "learning_rate": 5.444053262676577e-06, + "loss": 0.8866, + "step": 12066 + }, + { + "epoch": 0.975130810723449, + "grad_norm": 2.8241360187530518, + "learning_rate": 5.443401482425996e-06, + "loss": 0.9322, + "step": 12067 + }, + { + "epoch": 0.9752116204367764, + "grad_norm": 2.5909547805786133, + "learning_rate": 5.442749694581015e-06, + "loss": 0.8902, + "step": 12068 + }, + { + "epoch": 0.975292430150104, + "grad_norm": 2.752270221710205, + "learning_rate": 5.442097899152798e-06, + "loss": 0.8437, + "step": 12069 + }, + { + "epoch": 0.9753732398634316, + "grad_norm": 2.348259687423706, + "learning_rate": 5.441446096152507e-06, + "loss": 0.9196, + "step": 12070 + }, + { + "epoch": 0.9754540495767591, + "grad_norm": 2.414008855819702, + "learning_rate": 5.440794285591309e-06, + "loss": 1.0726, + "step": 12071 + }, + { + "epoch": 0.9755348592900867, + "grad_norm": 3.0321905612945557, + "learning_rate": 5.440142467480364e-06, + "loss": 0.9362, + "step": 12072 + }, + { + "epoch": 0.9756156690034142, + "grad_norm": 2.7788331508636475, + "learning_rate": 5.43949064183084e-06, + "loss": 0.9472, + "step": 12073 + }, + { + "epoch": 0.9756964787167417, + "grad_norm": 2.6454648971557617, + "learning_rate": 5.438838808653899e-06, + "loss": 0.8909, + "step": 12074 + }, + { + "epoch": 0.9757772884300693, + "grad_norm": 2.9046497344970703, + "learning_rate": 5.4381869679607045e-06, + "loss": 0.8763, + "step": 12075 + }, + { + "epoch": 0.9758580981433969, + "grad_norm": 2.7738723754882812, + "learning_rate": 5.437535119762423e-06, + "loss": 0.9295, + "step": 12076 + }, + { + "epoch": 0.9759389078567243, + "grad_norm": 2.4629805088043213, + "learning_rate": 5.436883264070219e-06, + "loss": 0.9496, + "step": 12077 + }, + { + "epoch": 0.9760197175700519, + "grad_norm": 2.4428911209106445, + "learning_rate": 5.436231400895255e-06, + "loss": 0.9442, + "step": 12078 + }, + { + "epoch": 0.9761005272833795, + "grad_norm": 2.0524792671203613, + "learning_rate": 5.435579530248698e-06, + "loss": 1.0176, + "step": 12079 + }, + { + "epoch": 0.976181336996707, + "grad_norm": 2.963853597640991, + "learning_rate": 5.434927652141711e-06, + "loss": 0.9538, + "step": 12080 + }, + { + "epoch": 0.9762621467100345, + "grad_norm": 2.7061660289764404, + "learning_rate": 5.434275766585462e-06, + "loss": 0.9298, + "step": 12081 + }, + { + "epoch": 0.9763429564233621, + "grad_norm": 2.973245620727539, + "learning_rate": 5.433623873591114e-06, + "loss": 0.9745, + "step": 12082 + }, + { + "epoch": 0.9764237661366896, + "grad_norm": 2.5026097297668457, + "learning_rate": 5.432971973169834e-06, + "loss": 0.8883, + "step": 12083 + }, + { + "epoch": 0.9765045758500172, + "grad_norm": 2.575617790222168, + "learning_rate": 5.432320065332785e-06, + "loss": 0.85, + "step": 12084 + }, + { + "epoch": 0.9765853855633447, + "grad_norm": 3.1300740242004395, + "learning_rate": 5.431668150091135e-06, + "loss": 0.9331, + "step": 12085 + }, + { + "epoch": 0.9766661952766723, + "grad_norm": 2.5608370304107666, + "learning_rate": 5.431016227456047e-06, + "loss": 0.865, + "step": 12086 + }, + { + "epoch": 0.9767470049899998, + "grad_norm": 2.486452102661133, + "learning_rate": 5.43036429743869e-06, + "loss": 1.0273, + "step": 12087 + }, + { + "epoch": 0.9768278147033274, + "grad_norm": 2.638866662979126, + "learning_rate": 5.429712360050229e-06, + "loss": 0.9897, + "step": 12088 + }, + { + "epoch": 0.9769086244166549, + "grad_norm": 2.524085760116577, + "learning_rate": 5.429060415301829e-06, + "loss": 0.9344, + "step": 12089 + }, + { + "epoch": 0.9769894341299824, + "grad_norm": 2.0070433616638184, + "learning_rate": 5.428408463204657e-06, + "loss": 1.0262, + "step": 12090 + }, + { + "epoch": 0.97707024384331, + "grad_norm": 2.591197967529297, + "learning_rate": 5.427756503769881e-06, + "loss": 0.8865, + "step": 12091 + }, + { + "epoch": 0.9771510535566376, + "grad_norm": 2.7331416606903076, + "learning_rate": 5.4271045370086625e-06, + "loss": 0.9492, + "step": 12092 + }, + { + "epoch": 0.977231863269965, + "grad_norm": 2.887516498565674, + "learning_rate": 5.426452562932175e-06, + "loss": 0.9706, + "step": 12093 + }, + { + "epoch": 0.9773126729832926, + "grad_norm": 2.3011393547058105, + "learning_rate": 5.42580058155158e-06, + "loss": 0.8829, + "step": 12094 + }, + { + "epoch": 0.9773934826966202, + "grad_norm": 2.5624382495880127, + "learning_rate": 5.425148592878047e-06, + "loss": 0.9892, + "step": 12095 + }, + { + "epoch": 0.9774742924099477, + "grad_norm": 2.3409554958343506, + "learning_rate": 5.424496596922742e-06, + "loss": 0.9166, + "step": 12096 + }, + { + "epoch": 0.9775551021232752, + "grad_norm": 2.6583638191223145, + "learning_rate": 5.423844593696831e-06, + "loss": 0.8867, + "step": 12097 + }, + { + "epoch": 0.9776359118366028, + "grad_norm": 2.8415184020996094, + "learning_rate": 5.423192583211483e-06, + "loss": 0.9739, + "step": 12098 + }, + { + "epoch": 0.9777167215499303, + "grad_norm": 2.552694320678711, + "learning_rate": 5.422540565477865e-06, + "loss": 0.9481, + "step": 12099 + }, + { + "epoch": 0.9777975312632579, + "grad_norm": 3.0070302486419678, + "learning_rate": 5.421888540507144e-06, + "loss": 0.9894, + "step": 12100 + }, + { + "epoch": 0.9778783409765854, + "grad_norm": 2.724087953567505, + "learning_rate": 5.421236508310489e-06, + "loss": 0.8188, + "step": 12101 + }, + { + "epoch": 0.9779591506899129, + "grad_norm": 2.6836183071136475, + "learning_rate": 5.420584468899066e-06, + "loss": 0.8178, + "step": 12102 + }, + { + "epoch": 0.9780399604032405, + "grad_norm": 2.4756271839141846, + "learning_rate": 5.419932422284044e-06, + "loss": 0.9692, + "step": 12103 + }, + { + "epoch": 0.9781207701165681, + "grad_norm": 2.5955512523651123, + "learning_rate": 5.41928036847659e-06, + "loss": 0.8955, + "step": 12104 + }, + { + "epoch": 0.9782015798298955, + "grad_norm": 2.2906126976013184, + "learning_rate": 5.418628307487872e-06, + "loss": 0.9172, + "step": 12105 + }, + { + "epoch": 0.9782823895432231, + "grad_norm": 2.5296623706817627, + "learning_rate": 5.41797623932906e-06, + "loss": 0.9272, + "step": 12106 + }, + { + "epoch": 0.9783631992565507, + "grad_norm": 2.603579044342041, + "learning_rate": 5.41732416401132e-06, + "loss": 0.9143, + "step": 12107 + }, + { + "epoch": 0.9784440089698782, + "grad_norm": 2.800060272216797, + "learning_rate": 5.416672081545824e-06, + "loss": 0.9438, + "step": 12108 + }, + { + "epoch": 0.9785248186832057, + "grad_norm": 2.8698110580444336, + "learning_rate": 5.4160199919437375e-06, + "loss": 0.8999, + "step": 12109 + }, + { + "epoch": 0.9786056283965333, + "grad_norm": 2.505112409591675, + "learning_rate": 5.415367895216229e-06, + "loss": 0.94, + "step": 12110 + }, + { + "epoch": 0.9786864381098608, + "grad_norm": 3.3058252334594727, + "learning_rate": 5.414715791374469e-06, + "loss": 0.936, + "step": 12111 + }, + { + "epoch": 0.9787672478231884, + "grad_norm": 2.4477574825286865, + "learning_rate": 5.414063680429625e-06, + "loss": 0.9525, + "step": 12112 + }, + { + "epoch": 0.9788480575365159, + "grad_norm": 2.613408327102661, + "learning_rate": 5.413411562392868e-06, + "loss": 0.9188, + "step": 12113 + }, + { + "epoch": 0.9789288672498434, + "grad_norm": 2.8560428619384766, + "learning_rate": 5.412759437275366e-06, + "loss": 0.9135, + "step": 12114 + }, + { + "epoch": 0.979009676963171, + "grad_norm": 2.201775312423706, + "learning_rate": 5.412107305088289e-06, + "loss": 0.9843, + "step": 12115 + }, + { + "epoch": 0.9790904866764986, + "grad_norm": 2.7511961460113525, + "learning_rate": 5.411455165842806e-06, + "loss": 0.9251, + "step": 12116 + }, + { + "epoch": 0.979171296389826, + "grad_norm": 2.503464698791504, + "learning_rate": 5.410803019550086e-06, + "loss": 0.9065, + "step": 12117 + }, + { + "epoch": 0.9792521061031536, + "grad_norm": 2.3945083618164062, + "learning_rate": 5.4101508662213e-06, + "loss": 0.9408, + "step": 12118 + }, + { + "epoch": 0.9793329158164812, + "grad_norm": 2.640806198120117, + "learning_rate": 5.409498705867616e-06, + "loss": 0.8195, + "step": 12119 + }, + { + "epoch": 0.9794137255298087, + "grad_norm": 2.9981961250305176, + "learning_rate": 5.408846538500205e-06, + "loss": 0.9757, + "step": 12120 + }, + { + "epoch": 0.9794945352431362, + "grad_norm": 2.7424895763397217, + "learning_rate": 5.408194364130238e-06, + "loss": 0.8409, + "step": 12121 + }, + { + "epoch": 0.9795753449564638, + "grad_norm": 2.4994630813598633, + "learning_rate": 5.407542182768884e-06, + "loss": 0.9793, + "step": 12122 + }, + { + "epoch": 0.9796561546697913, + "grad_norm": 2.414841890335083, + "learning_rate": 5.406889994427313e-06, + "loss": 0.8574, + "step": 12123 + }, + { + "epoch": 0.9797369643831189, + "grad_norm": 2.671175479888916, + "learning_rate": 5.406237799116696e-06, + "loss": 0.8671, + "step": 12124 + }, + { + "epoch": 0.9798177740964464, + "grad_norm": 2.3028671741485596, + "learning_rate": 5.405585596848204e-06, + "loss": 0.8778, + "step": 12125 + }, + { + "epoch": 0.9798985838097739, + "grad_norm": 2.698812961578369, + "learning_rate": 5.404933387633007e-06, + "loss": 0.8468, + "step": 12126 + }, + { + "epoch": 0.9799793935231015, + "grad_norm": 2.5293102264404297, + "learning_rate": 5.404281171482275e-06, + "loss": 0.8957, + "step": 12127 + }, + { + "epoch": 0.9800602032364291, + "grad_norm": 2.2571027278900146, + "learning_rate": 5.40362894840718e-06, + "loss": 0.8962, + "step": 12128 + }, + { + "epoch": 0.9801410129497565, + "grad_norm": 2.41410493850708, + "learning_rate": 5.402976718418893e-06, + "loss": 1.0511, + "step": 12129 + }, + { + "epoch": 0.9802218226630841, + "grad_norm": 2.474343776702881, + "learning_rate": 5.402324481528587e-06, + "loss": 0.9717, + "step": 12130 + }, + { + "epoch": 0.9803026323764117, + "grad_norm": 2.4942386150360107, + "learning_rate": 5.401672237747428e-06, + "loss": 0.9959, + "step": 12131 + }, + { + "epoch": 0.9803834420897392, + "grad_norm": 2.3961400985717773, + "learning_rate": 5.401019987086593e-06, + "loss": 0.9601, + "step": 12132 + }, + { + "epoch": 0.9804642518030667, + "grad_norm": 2.85259747505188, + "learning_rate": 5.4003677295572496e-06, + "loss": 0.871, + "step": 12133 + }, + { + "epoch": 0.9805450615163943, + "grad_norm": 2.4561750888824463, + "learning_rate": 5.399715465170571e-06, + "loss": 0.9385, + "step": 12134 + }, + { + "epoch": 0.9806258712297218, + "grad_norm": 2.253120183944702, + "learning_rate": 5.399063193937729e-06, + "loss": 0.9056, + "step": 12135 + }, + { + "epoch": 0.9807066809430494, + "grad_norm": 2.6282289028167725, + "learning_rate": 5.398410915869896e-06, + "loss": 0.9145, + "step": 12136 + }, + { + "epoch": 0.9807874906563769, + "grad_norm": 2.1766889095306396, + "learning_rate": 5.397758630978241e-06, + "loss": 0.9398, + "step": 12137 + }, + { + "epoch": 0.9808683003697044, + "grad_norm": 2.755300760269165, + "learning_rate": 5.397106339273941e-06, + "loss": 0.9192, + "step": 12138 + }, + { + "epoch": 0.980949110083032, + "grad_norm": 2.4856979846954346, + "learning_rate": 5.396454040768164e-06, + "loss": 0.8801, + "step": 12139 + }, + { + "epoch": 0.9810299197963596, + "grad_norm": 2.632225513458252, + "learning_rate": 5.395801735472084e-06, + "loss": 0.9607, + "step": 12140 + }, + { + "epoch": 0.981110729509687, + "grad_norm": 3.21994686126709, + "learning_rate": 5.395149423396872e-06, + "loss": 0.9186, + "step": 12141 + }, + { + "epoch": 0.9811915392230146, + "grad_norm": 2.641205072402954, + "learning_rate": 5.394497104553702e-06, + "loss": 0.8499, + "step": 12142 + }, + { + "epoch": 0.9812723489363422, + "grad_norm": 2.4946446418762207, + "learning_rate": 5.393844778953748e-06, + "loss": 1.0907, + "step": 12143 + }, + { + "epoch": 0.9813531586496697, + "grad_norm": 2.7536115646362305, + "learning_rate": 5.39319244660818e-06, + "loss": 0.9752, + "step": 12144 + }, + { + "epoch": 0.9814339683629972, + "grad_norm": 2.757713794708252, + "learning_rate": 5.39254010752817e-06, + "loss": 0.8335, + "step": 12145 + }, + { + "epoch": 0.9815147780763248, + "grad_norm": 2.7080740928649902, + "learning_rate": 5.391887761724897e-06, + "loss": 0.9056, + "step": 12146 + }, + { + "epoch": 0.9815955877896523, + "grad_norm": 2.723893165588379, + "learning_rate": 5.3912354092095265e-06, + "loss": 0.9127, + "step": 12147 + }, + { + "epoch": 0.9816763975029799, + "grad_norm": 2.4144506454467773, + "learning_rate": 5.390583049993236e-06, + "loss": 0.9482, + "step": 12148 + }, + { + "epoch": 0.9817572072163074, + "grad_norm": 2.5960702896118164, + "learning_rate": 5.389930684087199e-06, + "loss": 0.8589, + "step": 12149 + }, + { + "epoch": 0.9818380169296349, + "grad_norm": 2.481485605239868, + "learning_rate": 5.3892783115025895e-06, + "loss": 0.8986, + "step": 12150 + }, + { + "epoch": 0.9819188266429625, + "grad_norm": 2.679650068283081, + "learning_rate": 5.388625932250576e-06, + "loss": 0.8275, + "step": 12151 + }, + { + "epoch": 0.9819996363562901, + "grad_norm": 2.44803786277771, + "learning_rate": 5.38797354634234e-06, + "loss": 0.9115, + "step": 12152 + }, + { + "epoch": 0.9820804460696175, + "grad_norm": 2.4601192474365234, + "learning_rate": 5.387321153789047e-06, + "loss": 0.8299, + "step": 12153 + }, + { + "epoch": 0.9821612557829451, + "grad_norm": 2.5130109786987305, + "learning_rate": 5.386668754601878e-06, + "loss": 0.9465, + "step": 12154 + }, + { + "epoch": 0.9822420654962727, + "grad_norm": 2.3025498390197754, + "learning_rate": 5.386016348792004e-06, + "loss": 0.9691, + "step": 12155 + }, + { + "epoch": 0.9823228752096002, + "grad_norm": 2.6452362537384033, + "learning_rate": 5.385363936370598e-06, + "loss": 0.9182, + "step": 12156 + }, + { + "epoch": 0.9824036849229277, + "grad_norm": 3.0965425968170166, + "learning_rate": 5.384711517348837e-06, + "loss": 0.996, + "step": 12157 + }, + { + "epoch": 0.9824844946362553, + "grad_norm": 2.5991649627685547, + "learning_rate": 5.384059091737892e-06, + "loss": 0.8321, + "step": 12158 + }, + { + "epoch": 0.9825653043495828, + "grad_norm": 2.765760898590088, + "learning_rate": 5.38340665954894e-06, + "loss": 0.9406, + "step": 12159 + }, + { + "epoch": 0.9826461140629104, + "grad_norm": 2.2090837955474854, + "learning_rate": 5.382754220793156e-06, + "loss": 0.9335, + "step": 12160 + }, + { + "epoch": 0.9827269237762379, + "grad_norm": 2.5422046184539795, + "learning_rate": 5.382101775481712e-06, + "loss": 1.0029, + "step": 12161 + }, + { + "epoch": 0.9828077334895654, + "grad_norm": 2.9563913345336914, + "learning_rate": 5.3814493236257855e-06, + "loss": 0.9966, + "step": 12162 + }, + { + "epoch": 0.982888543202893, + "grad_norm": 2.5533783435821533, + "learning_rate": 5.380796865236549e-06, + "loss": 0.9375, + "step": 12163 + }, + { + "epoch": 0.9829693529162206, + "grad_norm": 2.479666233062744, + "learning_rate": 5.3801444003251815e-06, + "loss": 1.0035, + "step": 12164 + }, + { + "epoch": 0.983050162629548, + "grad_norm": 2.4869494438171387, + "learning_rate": 5.3794919289028535e-06, + "loss": 0.9296, + "step": 12165 + }, + { + "epoch": 0.9831309723428756, + "grad_norm": 2.7353363037109375, + "learning_rate": 5.378839450980744e-06, + "loss": 0.9772, + "step": 12166 + }, + { + "epoch": 0.9832117820562032, + "grad_norm": 2.5732433795928955, + "learning_rate": 5.3781869665700235e-06, + "loss": 0.9728, + "step": 12167 + }, + { + "epoch": 0.9832925917695307, + "grad_norm": 2.4385268688201904, + "learning_rate": 5.377534475681875e-06, + "loss": 0.7798, + "step": 12168 + }, + { + "epoch": 0.9833734014828582, + "grad_norm": 2.479956865310669, + "learning_rate": 5.376881978327467e-06, + "loss": 0.9799, + "step": 12169 + }, + { + "epoch": 0.9834542111961858, + "grad_norm": 2.8344802856445312, + "learning_rate": 5.376229474517979e-06, + "loss": 0.9356, + "step": 12170 + }, + { + "epoch": 0.9835350209095133, + "grad_norm": 2.779484987258911, + "learning_rate": 5.375576964264585e-06, + "loss": 0.9546, + "step": 12171 + }, + { + "epoch": 0.9836158306228409, + "grad_norm": 2.6164891719818115, + "learning_rate": 5.374924447578462e-06, + "loss": 0.8925, + "step": 12172 + }, + { + "epoch": 0.9836966403361684, + "grad_norm": 2.648625135421753, + "learning_rate": 5.374271924470787e-06, + "loss": 0.9378, + "step": 12173 + }, + { + "epoch": 0.9837774500494959, + "grad_norm": 2.5899453163146973, + "learning_rate": 5.373619394952734e-06, + "loss": 0.9926, + "step": 12174 + }, + { + "epoch": 0.9838582597628235, + "grad_norm": 2.894723653793335, + "learning_rate": 5.372966859035481e-06, + "loss": 0.9491, + "step": 12175 + }, + { + "epoch": 0.9839390694761511, + "grad_norm": 2.368407726287842, + "learning_rate": 5.372314316730203e-06, + "loss": 0.8911, + "step": 12176 + }, + { + "epoch": 0.9840198791894785, + "grad_norm": 2.3118772506713867, + "learning_rate": 5.371661768048077e-06, + "loss": 0.7905, + "step": 12177 + }, + { + "epoch": 0.9841006889028061, + "grad_norm": 2.4478707313537598, + "learning_rate": 5.371009213000279e-06, + "loss": 0.9669, + "step": 12178 + }, + { + "epoch": 0.9841814986161337, + "grad_norm": 2.403913736343384, + "learning_rate": 5.3703566515979865e-06, + "loss": 0.9257, + "step": 12179 + }, + { + "epoch": 0.9842623083294612, + "grad_norm": 2.834071159362793, + "learning_rate": 5.369704083852376e-06, + "loss": 0.9239, + "step": 12180 + }, + { + "epoch": 0.9843431180427887, + "grad_norm": 2.5608348846435547, + "learning_rate": 5.369051509774625e-06, + "loss": 0.8549, + "step": 12181 + }, + { + "epoch": 0.9844239277561163, + "grad_norm": 2.570713996887207, + "learning_rate": 5.368398929375911e-06, + "loss": 0.9413, + "step": 12182 + }, + { + "epoch": 0.9845047374694438, + "grad_norm": 2.7243850231170654, + "learning_rate": 5.36774634266741e-06, + "loss": 1.0564, + "step": 12183 + }, + { + "epoch": 0.9845855471827714, + "grad_norm": 2.613048553466797, + "learning_rate": 5.367093749660299e-06, + "loss": 0.9697, + "step": 12184 + }, + { + "epoch": 0.9846663568960989, + "grad_norm": 2.798172950744629, + "learning_rate": 5.366441150365755e-06, + "loss": 0.8594, + "step": 12185 + }, + { + "epoch": 0.9847471666094264, + "grad_norm": 2.6238584518432617, + "learning_rate": 5.365788544794958e-06, + "loss": 0.9041, + "step": 12186 + }, + { + "epoch": 0.984827976322754, + "grad_norm": 2.3103373050689697, + "learning_rate": 5.365135932959083e-06, + "loss": 0.9151, + "step": 12187 + }, + { + "epoch": 0.9849087860360816, + "grad_norm": 2.1761138439178467, + "learning_rate": 5.364483314869308e-06, + "loss": 0.9919, + "step": 12188 + }, + { + "epoch": 0.984989595749409, + "grad_norm": 2.738905906677246, + "learning_rate": 5.363830690536812e-06, + "loss": 0.9131, + "step": 12189 + }, + { + "epoch": 0.9850704054627366, + "grad_norm": 2.7202906608581543, + "learning_rate": 5.3631780599727715e-06, + "loss": 0.8734, + "step": 12190 + }, + { + "epoch": 0.9851512151760642, + "grad_norm": 2.608964204788208, + "learning_rate": 5.362525423188366e-06, + "loss": 1.0752, + "step": 12191 + }, + { + "epoch": 0.9852320248893917, + "grad_norm": 2.9981770515441895, + "learning_rate": 5.361872780194772e-06, + "loss": 0.7842, + "step": 12192 + }, + { + "epoch": 0.9853128346027192, + "grad_norm": 2.777306318283081, + "learning_rate": 5.361220131003169e-06, + "loss": 0.8846, + "step": 12193 + }, + { + "epoch": 0.9853936443160468, + "grad_norm": 3.3271985054016113, + "learning_rate": 5.360567475624734e-06, + "loss": 0.7995, + "step": 12194 + }, + { + "epoch": 0.9854744540293743, + "grad_norm": 2.412200450897217, + "learning_rate": 5.359914814070646e-06, + "loss": 0.821, + "step": 12195 + }, + { + "epoch": 0.9855552637427019, + "grad_norm": 2.6865506172180176, + "learning_rate": 5.359262146352085e-06, + "loss": 1.0232, + "step": 12196 + }, + { + "epoch": 0.9856360734560294, + "grad_norm": 2.4174892902374268, + "learning_rate": 5.358609472480227e-06, + "loss": 0.8605, + "step": 12197 + }, + { + "epoch": 0.9857168831693569, + "grad_norm": 2.217020034790039, + "learning_rate": 5.357956792466252e-06, + "loss": 0.9156, + "step": 12198 + }, + { + "epoch": 0.9857976928826845, + "grad_norm": 2.7196741104125977, + "learning_rate": 5.35730410632134e-06, + "loss": 0.762, + "step": 12199 + }, + { + "epoch": 0.9858785025960121, + "grad_norm": 2.831913471221924, + "learning_rate": 5.356651414056669e-06, + "loss": 0.9218, + "step": 12200 + }, + { + "epoch": 0.9859593123093395, + "grad_norm": 2.774486780166626, + "learning_rate": 5.355998715683417e-06, + "loss": 1.0029, + "step": 12201 + }, + { + "epoch": 0.9860401220226671, + "grad_norm": 2.3571219444274902, + "learning_rate": 5.355346011212764e-06, + "loss": 1.0688, + "step": 12202 + }, + { + "epoch": 0.9861209317359947, + "grad_norm": 2.823669910430908, + "learning_rate": 5.354693300655891e-06, + "loss": 0.9271, + "step": 12203 + }, + { + "epoch": 0.9862017414493222, + "grad_norm": 2.444176197052002, + "learning_rate": 5.3540405840239725e-06, + "loss": 0.9965, + "step": 12204 + }, + { + "epoch": 0.9862825511626497, + "grad_norm": 2.6155660152435303, + "learning_rate": 5.353387861328194e-06, + "loss": 0.9131, + "step": 12205 + }, + { + "epoch": 0.9863633608759773, + "grad_norm": 2.318523645401001, + "learning_rate": 5.352735132579732e-06, + "loss": 1.0011, + "step": 12206 + }, + { + "epoch": 0.9864441705893048, + "grad_norm": 2.41597843170166, + "learning_rate": 5.352082397789764e-06, + "loss": 0.934, + "step": 12207 + }, + { + "epoch": 0.9865249803026324, + "grad_norm": 2.3817427158355713, + "learning_rate": 5.351429656969473e-06, + "loss": 0.9334, + "step": 12208 + }, + { + "epoch": 0.9866057900159599, + "grad_norm": 3.1035239696502686, + "learning_rate": 5.350776910130039e-06, + "loss": 0.9043, + "step": 12209 + }, + { + "epoch": 0.9866865997292874, + "grad_norm": 2.416182518005371, + "learning_rate": 5.35012415728264e-06, + "loss": 0.8499, + "step": 12210 + }, + { + "epoch": 0.986767409442615, + "grad_norm": 2.56245493888855, + "learning_rate": 5.34947139843846e-06, + "loss": 0.8969, + "step": 12211 + }, + { + "epoch": 0.9868482191559426, + "grad_norm": 2.5546376705169678, + "learning_rate": 5.348818633608671e-06, + "loss": 0.9497, + "step": 12212 + }, + { + "epoch": 0.98692902886927, + "grad_norm": 2.3924479484558105, + "learning_rate": 5.348165862804463e-06, + "loss": 0.9101, + "step": 12213 + }, + { + "epoch": 0.9870098385825976, + "grad_norm": 2.4456634521484375, + "learning_rate": 5.3475130860370106e-06, + "loss": 0.8853, + "step": 12214 + }, + { + "epoch": 0.9870906482959252, + "grad_norm": 2.8276612758636475, + "learning_rate": 5.346860303317495e-06, + "loss": 0.8544, + "step": 12215 + }, + { + "epoch": 0.9871714580092528, + "grad_norm": 2.8824312686920166, + "learning_rate": 5.346207514657098e-06, + "loss": 0.8829, + "step": 12216 + }, + { + "epoch": 0.9872522677225802, + "grad_norm": 2.500629425048828, + "learning_rate": 5.345554720067e-06, + "loss": 1.0615, + "step": 12217 + }, + { + "epoch": 0.9873330774359078, + "grad_norm": 2.476496696472168, + "learning_rate": 5.3449019195583795e-06, + "loss": 0.8171, + "step": 12218 + }, + { + "epoch": 0.9874138871492354, + "grad_norm": 2.551996946334839, + "learning_rate": 5.344249113142422e-06, + "loss": 0.848, + "step": 12219 + }, + { + "epoch": 0.9874946968625629, + "grad_norm": 2.895494222640991, + "learning_rate": 5.3435963008303046e-06, + "loss": 0.9491, + "step": 12220 + }, + { + "epoch": 0.9875755065758904, + "grad_norm": 2.8388452529907227, + "learning_rate": 5.342943482633211e-06, + "loss": 0.8374, + "step": 12221 + }, + { + "epoch": 0.987656316289218, + "grad_norm": 2.273324966430664, + "learning_rate": 5.34229065856232e-06, + "loss": 0.9174, + "step": 12222 + }, + { + "epoch": 0.9877371260025455, + "grad_norm": 2.447150945663452, + "learning_rate": 5.341637828628814e-06, + "loss": 0.9445, + "step": 12223 + }, + { + "epoch": 0.9878179357158731, + "grad_norm": 2.7661643028259277, + "learning_rate": 5.340984992843874e-06, + "loss": 0.9384, + "step": 12224 + }, + { + "epoch": 0.9878987454292006, + "grad_norm": 2.543273448944092, + "learning_rate": 5.340332151218684e-06, + "loss": 0.837, + "step": 12225 + }, + { + "epoch": 0.9879795551425281, + "grad_norm": 2.4272313117980957, + "learning_rate": 5.339679303764421e-06, + "loss": 0.9039, + "step": 12226 + }, + { + "epoch": 0.9880603648558557, + "grad_norm": 2.3481764793395996, + "learning_rate": 5.339026450492272e-06, + "loss": 0.9525, + "step": 12227 + }, + { + "epoch": 0.9881411745691833, + "grad_norm": 2.6108486652374268, + "learning_rate": 5.338373591413414e-06, + "loss": 0.9075, + "step": 12228 + }, + { + "epoch": 0.9882219842825107, + "grad_norm": 3.0879838466644287, + "learning_rate": 5.337720726539032e-06, + "loss": 0.8332, + "step": 12229 + }, + { + "epoch": 0.9883027939958383, + "grad_norm": 2.506908416748047, + "learning_rate": 5.337067855880305e-06, + "loss": 0.965, + "step": 12230 + }, + { + "epoch": 0.9883836037091659, + "grad_norm": 2.6116840839385986, + "learning_rate": 5.33641497944842e-06, + "loss": 0.885, + "step": 12231 + }, + { + "epoch": 0.9884644134224934, + "grad_norm": 2.8873798847198486, + "learning_rate": 5.335762097254554e-06, + "loss": 0.853, + "step": 12232 + }, + { + "epoch": 0.9885452231358209, + "grad_norm": 2.668135166168213, + "learning_rate": 5.3351092093098944e-06, + "loss": 0.8436, + "step": 12233 + }, + { + "epoch": 0.9886260328491485, + "grad_norm": 3.144533634185791, + "learning_rate": 5.334456315625618e-06, + "loss": 0.9255, + "step": 12234 + }, + { + "epoch": 0.988706842562476, + "grad_norm": 2.8562543392181396, + "learning_rate": 5.333803416212911e-06, + "loss": 0.985, + "step": 12235 + }, + { + "epoch": 0.9887876522758036, + "grad_norm": 2.622060537338257, + "learning_rate": 5.333150511082955e-06, + "loss": 0.9229, + "step": 12236 + }, + { + "epoch": 0.9888684619891311, + "grad_norm": 2.6421568393707275, + "learning_rate": 5.332497600246933e-06, + "loss": 0.7765, + "step": 12237 + }, + { + "epoch": 0.9889492717024586, + "grad_norm": 2.7658445835113525, + "learning_rate": 5.331844683716027e-06, + "loss": 0.8947, + "step": 12238 + }, + { + "epoch": 0.9890300814157862, + "grad_norm": 2.8542468547821045, + "learning_rate": 5.331191761501421e-06, + "loss": 0.8648, + "step": 12239 + }, + { + "epoch": 0.9891108911291138, + "grad_norm": 2.5092406272888184, + "learning_rate": 5.330538833614297e-06, + "loss": 1.0049, + "step": 12240 + }, + { + "epoch": 0.9891917008424412, + "grad_norm": 2.5922608375549316, + "learning_rate": 5.3298859000658395e-06, + "loss": 1.0946, + "step": 12241 + }, + { + "epoch": 0.9892725105557688, + "grad_norm": 2.412569046020508, + "learning_rate": 5.329232960867231e-06, + "loss": 1.0575, + "step": 12242 + }, + { + "epoch": 0.9893533202690964, + "grad_norm": 2.937439203262329, + "learning_rate": 5.328580016029653e-06, + "loss": 0.9753, + "step": 12243 + }, + { + "epoch": 0.9894341299824239, + "grad_norm": 2.7120580673217773, + "learning_rate": 5.327927065564291e-06, + "loss": 0.9585, + "step": 12244 + }, + { + "epoch": 0.9895149396957514, + "grad_norm": 2.4741392135620117, + "learning_rate": 5.3272741094823275e-06, + "loss": 0.9583, + "step": 12245 + }, + { + "epoch": 0.989595749409079, + "grad_norm": 2.476374387741089, + "learning_rate": 5.326621147794946e-06, + "loss": 0.9511, + "step": 12246 + }, + { + "epoch": 0.9896765591224065, + "grad_norm": 2.838733196258545, + "learning_rate": 5.325968180513331e-06, + "loss": 0.8448, + "step": 12247 + }, + { + "epoch": 0.9897573688357341, + "grad_norm": 2.329197883605957, + "learning_rate": 5.325315207648667e-06, + "loss": 0.9221, + "step": 12248 + }, + { + "epoch": 0.9898381785490616, + "grad_norm": 2.675673246383667, + "learning_rate": 5.3246622292121344e-06, + "loss": 0.8106, + "step": 12249 + }, + { + "epoch": 0.9899189882623891, + "grad_norm": 2.3492445945739746, + "learning_rate": 5.324009245214922e-06, + "loss": 0.9264, + "step": 12250 + }, + { + "epoch": 0.9899997979757167, + "grad_norm": 2.4474477767944336, + "learning_rate": 5.32335625566821e-06, + "loss": 0.9301, + "step": 12251 + }, + { + "epoch": 0.9900806076890443, + "grad_norm": 2.737623691558838, + "learning_rate": 5.322703260583183e-06, + "loss": 0.8766, + "step": 12252 + }, + { + "epoch": 0.9901614174023717, + "grad_norm": 2.7616465091705322, + "learning_rate": 5.322050259971027e-06, + "loss": 0.9792, + "step": 12253 + }, + { + "epoch": 0.9902422271156993, + "grad_norm": 3.4910755157470703, + "learning_rate": 5.321397253842924e-06, + "loss": 0.9654, + "step": 12254 + }, + { + "epoch": 0.9903230368290269, + "grad_norm": 2.699347734451294, + "learning_rate": 5.320744242210061e-06, + "loss": 0.9645, + "step": 12255 + }, + { + "epoch": 0.9904038465423544, + "grad_norm": 2.415865182876587, + "learning_rate": 5.320091225083622e-06, + "loss": 0.9715, + "step": 12256 + }, + { + "epoch": 0.9904846562556819, + "grad_norm": 2.8747668266296387, + "learning_rate": 5.319438202474788e-06, + "loss": 0.9351, + "step": 12257 + }, + { + "epoch": 0.9905654659690095, + "grad_norm": 2.717130184173584, + "learning_rate": 5.318785174394751e-06, + "loss": 0.8969, + "step": 12258 + }, + { + "epoch": 0.990646275682337, + "grad_norm": 3.111077308654785, + "learning_rate": 5.3181321408546885e-06, + "loss": 0.8883, + "step": 12259 + }, + { + "epoch": 0.9907270853956646, + "grad_norm": 3.141089677810669, + "learning_rate": 5.317479101865788e-06, + "loss": 1.0838, + "step": 12260 + }, + { + "epoch": 0.9908078951089921, + "grad_norm": 2.755082607269287, + "learning_rate": 5.316826057439236e-06, + "loss": 0.9619, + "step": 12261 + }, + { + "epoch": 0.9908887048223196, + "grad_norm": 2.669637441635132, + "learning_rate": 5.316173007586215e-06, + "loss": 1.0097, + "step": 12262 + }, + { + "epoch": 0.9909695145356472, + "grad_norm": 2.529109477996826, + "learning_rate": 5.315519952317912e-06, + "loss": 0.9657, + "step": 12263 + }, + { + "epoch": 0.9910503242489748, + "grad_norm": 2.3322739601135254, + "learning_rate": 5.314866891645514e-06, + "loss": 0.885, + "step": 12264 + }, + { + "epoch": 0.9911311339623022, + "grad_norm": 2.8430070877075195, + "learning_rate": 5.314213825580201e-06, + "loss": 0.892, + "step": 12265 + }, + { + "epoch": 0.9912119436756298, + "grad_norm": 2.8258538246154785, + "learning_rate": 5.3135607541331646e-06, + "loss": 0.9957, + "step": 12266 + }, + { + "epoch": 0.9912927533889574, + "grad_norm": 2.687260627746582, + "learning_rate": 5.312907677315585e-06, + "loss": 0.9395, + "step": 12267 + }, + { + "epoch": 0.9913735631022849, + "grad_norm": 2.6090567111968994, + "learning_rate": 5.3122545951386505e-06, + "loss": 0.9182, + "step": 12268 + }, + { + "epoch": 0.9914543728156124, + "grad_norm": 2.947476387023926, + "learning_rate": 5.311601507613547e-06, + "loss": 1.029, + "step": 12269 + }, + { + "epoch": 0.99153518252894, + "grad_norm": 2.5828793048858643, + "learning_rate": 5.310948414751461e-06, + "loss": 0.959, + "step": 12270 + }, + { + "epoch": 0.9916159922422675, + "grad_norm": 2.347775459289551, + "learning_rate": 5.310295316563575e-06, + "loss": 0.9352, + "step": 12271 + }, + { + "epoch": 0.9916968019555951, + "grad_norm": 2.6231443881988525, + "learning_rate": 5.309642213061079e-06, + "loss": 0.8537, + "step": 12272 + }, + { + "epoch": 0.9917776116689226, + "grad_norm": 2.8878562450408936, + "learning_rate": 5.308989104255157e-06, + "loss": 0.9144, + "step": 12273 + }, + { + "epoch": 0.9918584213822501, + "grad_norm": 2.5320656299591064, + "learning_rate": 5.308335990156994e-06, + "loss": 1.0086, + "step": 12274 + }, + { + "epoch": 0.9919392310955777, + "grad_norm": 2.5271451473236084, + "learning_rate": 5.3076828707777795e-06, + "loss": 0.8926, + "step": 12275 + }, + { + "epoch": 0.9920200408089053, + "grad_norm": 2.6236629486083984, + "learning_rate": 5.307029746128697e-06, + "loss": 0.8536, + "step": 12276 + }, + { + "epoch": 0.9921008505222327, + "grad_norm": 2.6657965183258057, + "learning_rate": 5.3063766162209354e-06, + "loss": 0.8521, + "step": 12277 + }, + { + "epoch": 0.9921816602355603, + "grad_norm": 2.7572550773620605, + "learning_rate": 5.305723481065679e-06, + "loss": 0.725, + "step": 12278 + }, + { + "epoch": 0.9922624699488879, + "grad_norm": 2.477506160736084, + "learning_rate": 5.305070340674114e-06, + "loss": 0.9167, + "step": 12279 + }, + { + "epoch": 0.9923432796622154, + "grad_norm": 2.4824118614196777, + "learning_rate": 5.304417195057432e-06, + "loss": 1.0134, + "step": 12280 + }, + { + "epoch": 0.9924240893755429, + "grad_norm": 2.8695285320281982, + "learning_rate": 5.303764044226814e-06, + "loss": 0.9027, + "step": 12281 + }, + { + "epoch": 0.9925048990888705, + "grad_norm": 2.9349610805511475, + "learning_rate": 5.303110888193449e-06, + "loss": 0.9602, + "step": 12282 + }, + { + "epoch": 0.992585708802198, + "grad_norm": 3.3354389667510986, + "learning_rate": 5.302457726968525e-06, + "loss": 0.9154, + "step": 12283 + }, + { + "epoch": 0.9926665185155256, + "grad_norm": 2.677093982696533, + "learning_rate": 5.301804560563229e-06, + "loss": 0.8953, + "step": 12284 + }, + { + "epoch": 0.9927473282288531, + "grad_norm": 2.967055082321167, + "learning_rate": 5.3011513889887445e-06, + "loss": 0.9596, + "step": 12285 + }, + { + "epoch": 0.9928281379421806, + "grad_norm": 2.8245794773101807, + "learning_rate": 5.300498212256266e-06, + "loss": 0.9689, + "step": 12286 + }, + { + "epoch": 0.9929089476555082, + "grad_norm": 3.164083480834961, + "learning_rate": 5.2998450303769734e-06, + "loss": 0.9935, + "step": 12287 + }, + { + "epoch": 0.9929897573688358, + "grad_norm": 2.2264647483825684, + "learning_rate": 5.299191843362057e-06, + "loss": 1.109, + "step": 12288 + }, + { + "epoch": 0.9930705670821632, + "grad_norm": 2.7262375354766846, + "learning_rate": 5.298538651222705e-06, + "loss": 0.9294, + "step": 12289 + }, + { + "epoch": 0.9931513767954908, + "grad_norm": 2.357172966003418, + "learning_rate": 5.297885453970106e-06, + "loss": 0.9459, + "step": 12290 + }, + { + "epoch": 0.9932321865088184, + "grad_norm": 2.671567916870117, + "learning_rate": 5.297232251615445e-06, + "loss": 0.8271, + "step": 12291 + }, + { + "epoch": 0.9933129962221459, + "grad_norm": 3.148672580718994, + "learning_rate": 5.296579044169913e-06, + "loss": 0.8556, + "step": 12292 + }, + { + "epoch": 0.9933938059354734, + "grad_norm": 3.001763105392456, + "learning_rate": 5.2959258316446935e-06, + "loss": 0.9584, + "step": 12293 + }, + { + "epoch": 0.993474615648801, + "grad_norm": 2.171727418899536, + "learning_rate": 5.2952726140509794e-06, + "loss": 0.8663, + "step": 12294 + }, + { + "epoch": 0.9935554253621285, + "grad_norm": 2.8575220108032227, + "learning_rate": 5.294619391399954e-06, + "loss": 0.979, + "step": 12295 + }, + { + "epoch": 0.9936362350754561, + "grad_norm": 2.449154853820801, + "learning_rate": 5.2939661637028085e-06, + "loss": 0.9035, + "step": 12296 + }, + { + "epoch": 0.9937170447887836, + "grad_norm": 3.02980899810791, + "learning_rate": 5.29331293097073e-06, + "loss": 0.7961, + "step": 12297 + }, + { + "epoch": 0.9937978545021111, + "grad_norm": 3.2935004234313965, + "learning_rate": 5.292659693214908e-06, + "loss": 0.9608, + "step": 12298 + }, + { + "epoch": 0.9938786642154387, + "grad_norm": 2.5653650760650635, + "learning_rate": 5.292006450446529e-06, + "loss": 0.9354, + "step": 12299 + }, + { + "epoch": 0.9939594739287663, + "grad_norm": 3.002340316772461, + "learning_rate": 5.2913532026767845e-06, + "loss": 0.9007, + "step": 12300 + }, + { + "epoch": 0.9940402836420937, + "grad_norm": 2.487698793411255, + "learning_rate": 5.290699949916859e-06, + "loss": 0.943, + "step": 12301 + }, + { + "epoch": 0.9941210933554213, + "grad_norm": 2.4758803844451904, + "learning_rate": 5.2900466921779436e-06, + "loss": 0.9927, + "step": 12302 + }, + { + "epoch": 0.9942019030687489, + "grad_norm": 2.691206455230713, + "learning_rate": 5.289393429471227e-06, + "loss": 0.9391, + "step": 12303 + }, + { + "epoch": 0.9942827127820764, + "grad_norm": 2.8046562671661377, + "learning_rate": 5.288740161807897e-06, + "loss": 0.9743, + "step": 12304 + }, + { + "epoch": 0.9943635224954039, + "grad_norm": 2.924703598022461, + "learning_rate": 5.288086889199143e-06, + "loss": 0.9712, + "step": 12305 + }, + { + "epoch": 0.9944443322087315, + "grad_norm": 2.697329521179199, + "learning_rate": 5.2874336116561545e-06, + "loss": 0.8344, + "step": 12306 + }, + { + "epoch": 0.994525141922059, + "grad_norm": 2.530019760131836, + "learning_rate": 5.28678032919012e-06, + "loss": 0.8061, + "step": 12307 + }, + { + "epoch": 0.9946059516353866, + "grad_norm": 2.3434691429138184, + "learning_rate": 5.28612704181223e-06, + "loss": 0.9062, + "step": 12308 + }, + { + "epoch": 0.9946867613487141, + "grad_norm": 2.569121837615967, + "learning_rate": 5.285473749533671e-06, + "loss": 0.8273, + "step": 12309 + }, + { + "epoch": 0.9947675710620416, + "grad_norm": 2.8427610397338867, + "learning_rate": 5.284820452365635e-06, + "loss": 0.834, + "step": 12310 + }, + { + "epoch": 0.9948483807753692, + "grad_norm": 2.7131824493408203, + "learning_rate": 5.28416715031931e-06, + "loss": 0.7951, + "step": 12311 + }, + { + "epoch": 0.9949291904886968, + "grad_norm": 2.529956817626953, + "learning_rate": 5.283513843405886e-06, + "loss": 0.8515, + "step": 12312 + }, + { + "epoch": 0.9950100002020242, + "grad_norm": 2.633648157119751, + "learning_rate": 5.282860531636552e-06, + "loss": 1.0068, + "step": 12313 + }, + { + "epoch": 0.9950908099153518, + "grad_norm": 2.5329723358154297, + "learning_rate": 5.282207215022499e-06, + "loss": 0.9443, + "step": 12314 + }, + { + "epoch": 0.9951716196286794, + "grad_norm": 2.7503702640533447, + "learning_rate": 5.281553893574916e-06, + "loss": 0.9101, + "step": 12315 + }, + { + "epoch": 0.9952524293420069, + "grad_norm": 2.7381479740142822, + "learning_rate": 5.280900567304989e-06, + "loss": 0.8936, + "step": 12316 + }, + { + "epoch": 0.9953332390553344, + "grad_norm": 2.5200910568237305, + "learning_rate": 5.280247236223916e-06, + "loss": 0.9186, + "step": 12317 + }, + { + "epoch": 0.995414048768662, + "grad_norm": 2.981550931930542, + "learning_rate": 5.279593900342881e-06, + "loss": 0.8855, + "step": 12318 + }, + { + "epoch": 0.9954948584819895, + "grad_norm": 2.4293437004089355, + "learning_rate": 5.278940559673075e-06, + "loss": 0.964, + "step": 12319 + }, + { + "epoch": 0.9955756681953171, + "grad_norm": 2.6228461265563965, + "learning_rate": 5.278287214225689e-06, + "loss": 0.8185, + "step": 12320 + }, + { + "epoch": 0.9956564779086446, + "grad_norm": 2.576329469680786, + "learning_rate": 5.277633864011913e-06, + "loss": 0.8389, + "step": 12321 + }, + { + "epoch": 0.9957372876219721, + "grad_norm": 2.8689534664154053, + "learning_rate": 5.276980509042937e-06, + "loss": 0.9261, + "step": 12322 + }, + { + "epoch": 0.9958180973352997, + "grad_norm": 2.744128942489624, + "learning_rate": 5.276327149329953e-06, + "loss": 0.8265, + "step": 12323 + }, + { + "epoch": 0.9958989070486273, + "grad_norm": 2.3744585514068604, + "learning_rate": 5.275673784884147e-06, + "loss": 0.8113, + "step": 12324 + }, + { + "epoch": 0.9959797167619547, + "grad_norm": 2.8023927211761475, + "learning_rate": 5.275020415716717e-06, + "loss": 0.8206, + "step": 12325 + }, + { + "epoch": 0.9960605264752823, + "grad_norm": 2.3721160888671875, + "learning_rate": 5.274367041838847e-06, + "loss": 0.8316, + "step": 12326 + }, + { + "epoch": 0.9961413361886099, + "grad_norm": 2.561278820037842, + "learning_rate": 5.27371366326173e-06, + "loss": 1.0216, + "step": 12327 + }, + { + "epoch": 0.9962221459019374, + "grad_norm": 2.513955593109131, + "learning_rate": 5.273060279996557e-06, + "loss": 0.8201, + "step": 12328 + }, + { + "epoch": 0.9963029556152649, + "grad_norm": 2.638197898864746, + "learning_rate": 5.27240689205452e-06, + "loss": 0.9466, + "step": 12329 + }, + { + "epoch": 0.9963837653285925, + "grad_norm": 2.419740676879883, + "learning_rate": 5.2717534994468066e-06, + "loss": 0.945, + "step": 12330 + }, + { + "epoch": 0.99646457504192, + "grad_norm": 2.927049160003662, + "learning_rate": 5.271100102184612e-06, + "loss": 0.9586, + "step": 12331 + }, + { + "epoch": 0.9965453847552476, + "grad_norm": 2.7897491455078125, + "learning_rate": 5.270446700279124e-06, + "loss": 1.0664, + "step": 12332 + }, + { + "epoch": 0.9966261944685751, + "grad_norm": 3.367159366607666, + "learning_rate": 5.269793293741535e-06, + "loss": 1.0409, + "step": 12333 + }, + { + "epoch": 0.9967070041819026, + "grad_norm": 2.744915008544922, + "learning_rate": 5.269139882583038e-06, + "loss": 0.874, + "step": 12334 + }, + { + "epoch": 0.9967878138952302, + "grad_norm": 2.3484771251678467, + "learning_rate": 5.2684864668148206e-06, + "loss": 0.8897, + "step": 12335 + }, + { + "epoch": 0.9968686236085578, + "grad_norm": 2.1473448276519775, + "learning_rate": 5.2678330464480775e-06, + "loss": 1.0396, + "step": 12336 + }, + { + "epoch": 0.9969494333218852, + "grad_norm": 3.1765217781066895, + "learning_rate": 5.267179621494e-06, + "loss": 0.8927, + "step": 12337 + }, + { + "epoch": 0.9970302430352128, + "grad_norm": 2.411862850189209, + "learning_rate": 5.266526191963777e-06, + "loss": 0.8184, + "step": 12338 + }, + { + "epoch": 0.9971110527485404, + "grad_norm": 2.7513530254364014, + "learning_rate": 5.265872757868603e-06, + "loss": 0.926, + "step": 12339 + }, + { + "epoch": 0.9971918624618679, + "grad_norm": 3.0000483989715576, + "learning_rate": 5.265219319219669e-06, + "loss": 0.8717, + "step": 12340 + }, + { + "epoch": 0.9972726721751954, + "grad_norm": 2.771310329437256, + "learning_rate": 5.264565876028166e-06, + "loss": 0.9364, + "step": 12341 + }, + { + "epoch": 0.997353481888523, + "grad_norm": 2.723525047302246, + "learning_rate": 5.263912428305285e-06, + "loss": 0.9053, + "step": 12342 + }, + { + "epoch": 0.9974342916018506, + "grad_norm": 2.8177742958068848, + "learning_rate": 5.263258976062223e-06, + "loss": 0.9837, + "step": 12343 + }, + { + "epoch": 0.9975151013151781, + "grad_norm": 2.651573657989502, + "learning_rate": 5.2626055193101644e-06, + "loss": 1.0256, + "step": 12344 + }, + { + "epoch": 0.9975959110285056, + "grad_norm": 2.6109039783477783, + "learning_rate": 5.261952058060309e-06, + "loss": 0.8424, + "step": 12345 + }, + { + "epoch": 0.9976767207418332, + "grad_norm": 2.802781105041504, + "learning_rate": 5.261298592323843e-06, + "loss": 0.794, + "step": 12346 + }, + { + "epoch": 0.9977575304551607, + "grad_norm": 3.1382737159729004, + "learning_rate": 5.260645122111963e-06, + "loss": 0.8761, + "step": 12347 + }, + { + "epoch": 0.9978383401684883, + "grad_norm": 2.3803482055664062, + "learning_rate": 5.259991647435858e-06, + "loss": 0.9459, + "step": 12348 + }, + { + "epoch": 0.9979191498818158, + "grad_norm": 2.5168089866638184, + "learning_rate": 5.259338168306723e-06, + "loss": 0.8965, + "step": 12349 + }, + { + "epoch": 0.9979999595951433, + "grad_norm": 2.7386581897735596, + "learning_rate": 5.258684684735749e-06, + "loss": 0.8913, + "step": 12350 + }, + { + "epoch": 0.9980807693084709, + "grad_norm": 2.4279236793518066, + "learning_rate": 5.258031196734131e-06, + "loss": 0.9048, + "step": 12351 + }, + { + "epoch": 0.9981615790217985, + "grad_norm": 2.523301362991333, + "learning_rate": 5.257377704313056e-06, + "loss": 0.8991, + "step": 12352 + }, + { + "epoch": 0.9982423887351259, + "grad_norm": 2.689197063446045, + "learning_rate": 5.256724207483723e-06, + "loss": 0.8349, + "step": 12353 + }, + { + "epoch": 0.9983231984484535, + "grad_norm": 2.5127463340759277, + "learning_rate": 5.2560707062573225e-06, + "loss": 0.8636, + "step": 12354 + }, + { + "epoch": 0.9984040081617811, + "grad_norm": 2.3001163005828857, + "learning_rate": 5.255417200645046e-06, + "loss": 0.9016, + "step": 12355 + }, + { + "epoch": 0.9984848178751086, + "grad_norm": 3.1291275024414062, + "learning_rate": 5.254763690658089e-06, + "loss": 0.9736, + "step": 12356 + }, + { + "epoch": 0.9985656275884361, + "grad_norm": 3.4742558002471924, + "learning_rate": 5.254110176307643e-06, + "loss": 0.9567, + "step": 12357 + }, + { + "epoch": 0.9986464373017637, + "grad_norm": 2.3015217781066895, + "learning_rate": 5.2534566576049005e-06, + "loss": 0.9369, + "step": 12358 + }, + { + "epoch": 0.9987272470150912, + "grad_norm": 2.615185499191284, + "learning_rate": 5.252803134561057e-06, + "loss": 0.8202, + "step": 12359 + }, + { + "epoch": 0.9988080567284188, + "grad_norm": 3.7705647945404053, + "learning_rate": 5.252149607187302e-06, + "loss": 0.9719, + "step": 12360 + }, + { + "epoch": 0.9988888664417463, + "grad_norm": 2.679399013519287, + "learning_rate": 5.251496075494834e-06, + "loss": 0.8473, + "step": 12361 + }, + { + "epoch": 0.9989696761550738, + "grad_norm": 2.5937654972076416, + "learning_rate": 5.250842539494843e-06, + "loss": 0.9509, + "step": 12362 + }, + { + "epoch": 0.9990504858684014, + "grad_norm": 2.3290038108825684, + "learning_rate": 5.250188999198522e-06, + "loss": 0.901, + "step": 12363 + }, + { + "epoch": 0.999131295581729, + "grad_norm": 2.8928582668304443, + "learning_rate": 5.249535454617067e-06, + "loss": 0.8926, + "step": 12364 + }, + { + "epoch": 0.9992121052950564, + "grad_norm": 2.4332470893859863, + "learning_rate": 5.248881905761671e-06, + "loss": 0.8268, + "step": 12365 + }, + { + "epoch": 0.999292915008384, + "grad_norm": 2.907656669616699, + "learning_rate": 5.248228352643525e-06, + "loss": 0.9369, + "step": 12366 + }, + { + "epoch": 0.9993737247217116, + "grad_norm": 2.733362913131714, + "learning_rate": 5.247574795273827e-06, + "loss": 0.8138, + "step": 12367 + }, + { + "epoch": 0.9994545344350391, + "grad_norm": 2.6702473163604736, + "learning_rate": 5.246921233663768e-06, + "loss": 0.9376, + "step": 12368 + }, + { + "epoch": 0.9995353441483666, + "grad_norm": 2.54905366897583, + "learning_rate": 5.2462676678245415e-06, + "loss": 0.9463, + "step": 12369 + }, + { + "epoch": 0.9996161538616942, + "grad_norm": 2.7659668922424316, + "learning_rate": 5.245614097767343e-06, + "loss": 0.9509, + "step": 12370 + }, + { + "epoch": 0.9996969635750217, + "grad_norm": 2.5379488468170166, + "learning_rate": 5.244960523503368e-06, + "loss": 0.903, + "step": 12371 + }, + { + "epoch": 0.9997777732883493, + "grad_norm": 2.2945473194122314, + "learning_rate": 5.244306945043807e-06, + "loss": 1.0499, + "step": 12372 + }, + { + "epoch": 0.9998585830016768, + "grad_norm": 3.2162699699401855, + "learning_rate": 5.2436533623998575e-06, + "loss": 0.9818, + "step": 12373 + }, + { + "epoch": 0.9999393927150043, + "grad_norm": 3.6801867485046387, + "learning_rate": 5.242999775582711e-06, + "loss": 0.9017, + "step": 12374 + }, + { + "epoch": 1.0000202024283318, + "grad_norm": 2.544635772705078, + "learning_rate": 5.2423461846035665e-06, + "loss": 0.9105, + "step": 12375 + }, + { + "epoch": 1.0001010121416594, + "grad_norm": 2.5052084922790527, + "learning_rate": 5.241692589473613e-06, + "loss": 0.8389, + "step": 12376 + }, + { + "epoch": 1.000181821854987, + "grad_norm": 2.650294542312622, + "learning_rate": 5.241038990204047e-06, + "loss": 0.8178, + "step": 12377 + }, + { + "epoch": 1.0002626315683145, + "grad_norm": 2.218303680419922, + "learning_rate": 5.240385386806064e-06, + "loss": 0.9017, + "step": 12378 + }, + { + "epoch": 1.000343441281642, + "grad_norm": 2.2635653018951416, + "learning_rate": 5.239731779290858e-06, + "loss": 0.8809, + "step": 12379 + }, + { + "epoch": 1.0004242509949697, + "grad_norm": 2.671069622039795, + "learning_rate": 5.239078167669622e-06, + "loss": 0.8148, + "step": 12380 + }, + { + "epoch": 1.000505060708297, + "grad_norm": 2.1969265937805176, + "learning_rate": 5.2384245519535545e-06, + "loss": 0.7317, + "step": 12381 + }, + { + "epoch": 1.0005858704216246, + "grad_norm": 2.548107385635376, + "learning_rate": 5.237770932153849e-06, + "loss": 0.8617, + "step": 12382 + }, + { + "epoch": 1.0006666801349522, + "grad_norm": 2.4351718425750732, + "learning_rate": 5.2371173082816985e-06, + "loss": 0.7876, + "step": 12383 + }, + { + "epoch": 1.0007474898482798, + "grad_norm": 2.3911964893341064, + "learning_rate": 5.2364636803483e-06, + "loss": 0.7921, + "step": 12384 + }, + { + "epoch": 1.0008282995616073, + "grad_norm": 2.0781474113464355, + "learning_rate": 5.2358100483648475e-06, + "loss": 0.8085, + "step": 12385 + }, + { + "epoch": 1.000909109274935, + "grad_norm": 2.5982773303985596, + "learning_rate": 5.235156412342537e-06, + "loss": 0.7222, + "step": 12386 + }, + { + "epoch": 1.0009899189882623, + "grad_norm": 2.546323776245117, + "learning_rate": 5.234502772292563e-06, + "loss": 0.7774, + "step": 12387 + }, + { + "epoch": 1.0010707287015899, + "grad_norm": 2.678215503692627, + "learning_rate": 5.233849128226121e-06, + "loss": 0.7791, + "step": 12388 + }, + { + "epoch": 1.0011515384149174, + "grad_norm": 2.8298981189727783, + "learning_rate": 5.233195480154406e-06, + "loss": 0.7541, + "step": 12389 + }, + { + "epoch": 1.001232348128245, + "grad_norm": 2.4765350818634033, + "learning_rate": 5.232541828088616e-06, + "loss": 0.7497, + "step": 12390 + }, + { + "epoch": 1.0013131578415726, + "grad_norm": 2.929046869277954, + "learning_rate": 5.231888172039941e-06, + "loss": 0.862, + "step": 12391 + }, + { + "epoch": 1.0013939675549002, + "grad_norm": 2.8621582984924316, + "learning_rate": 5.231234512019583e-06, + "loss": 0.9023, + "step": 12392 + }, + { + "epoch": 1.0014747772682275, + "grad_norm": 2.355471611022949, + "learning_rate": 5.230580848038732e-06, + "loss": 0.9258, + "step": 12393 + }, + { + "epoch": 1.0015555869815551, + "grad_norm": 2.8851842880249023, + "learning_rate": 5.2299271801085875e-06, + "loss": 0.8527, + "step": 12394 + }, + { + "epoch": 1.0016363966948827, + "grad_norm": 2.6690969467163086, + "learning_rate": 5.229273508240343e-06, + "loss": 0.8402, + "step": 12395 + }, + { + "epoch": 1.0017172064082103, + "grad_norm": 2.405996322631836, + "learning_rate": 5.2286198324451964e-06, + "loss": 0.7549, + "step": 12396 + }, + { + "epoch": 1.0017980161215378, + "grad_norm": 2.793443441390991, + "learning_rate": 5.227966152734341e-06, + "loss": 0.7942, + "step": 12397 + }, + { + "epoch": 1.0018788258348654, + "grad_norm": 2.437676429748535, + "learning_rate": 5.227312469118976e-06, + "loss": 0.7605, + "step": 12398 + }, + { + "epoch": 1.0019596355481928, + "grad_norm": 3.5769991874694824, + "learning_rate": 5.226658781610293e-06, + "loss": 0.872, + "step": 12399 + }, + { + "epoch": 1.0020404452615204, + "grad_norm": 2.2302238941192627, + "learning_rate": 5.226005090219493e-06, + "loss": 0.9423, + "step": 12400 + }, + { + "epoch": 1.002121254974848, + "grad_norm": 2.1968255043029785, + "learning_rate": 5.22535139495777e-06, + "loss": 0.8641, + "step": 12401 + }, + { + "epoch": 1.0022020646881755, + "grad_norm": 2.4747560024261475, + "learning_rate": 5.22469769583632e-06, + "loss": 0.8181, + "step": 12402 + }, + { + "epoch": 1.002282874401503, + "grad_norm": 3.3412742614746094, + "learning_rate": 5.2240439928663375e-06, + "loss": 0.7322, + "step": 12403 + }, + { + "epoch": 1.0023636841148307, + "grad_norm": 2.458338737487793, + "learning_rate": 5.223390286059023e-06, + "loss": 0.8743, + "step": 12404 + }, + { + "epoch": 1.002444493828158, + "grad_norm": 2.582960844039917, + "learning_rate": 5.22273657542557e-06, + "loss": 0.823, + "step": 12405 + }, + { + "epoch": 1.0025253035414856, + "grad_norm": 2.654578685760498, + "learning_rate": 5.222082860977176e-06, + "loss": 0.8594, + "step": 12406 + }, + { + "epoch": 1.0026061132548132, + "grad_norm": 2.927464246749878, + "learning_rate": 5.221429142725036e-06, + "loss": 0.8869, + "step": 12407 + }, + { + "epoch": 1.0026869229681408, + "grad_norm": 2.0888020992279053, + "learning_rate": 5.220775420680348e-06, + "loss": 0.7466, + "step": 12408 + }, + { + "epoch": 1.0027677326814683, + "grad_norm": 2.579902172088623, + "learning_rate": 5.22012169485431e-06, + "loss": 0.8496, + "step": 12409 + }, + { + "epoch": 1.002848542394796, + "grad_norm": 3.1141583919525146, + "learning_rate": 5.219467965258117e-06, + "loss": 0.8589, + "step": 12410 + }, + { + "epoch": 1.0029293521081235, + "grad_norm": 2.3609068393707275, + "learning_rate": 5.218814231902965e-06, + "loss": 0.8282, + "step": 12411 + }, + { + "epoch": 1.0030101618214509, + "grad_norm": 3.273946762084961, + "learning_rate": 5.2181604948000534e-06, + "loss": 0.8372, + "step": 12412 + }, + { + "epoch": 1.0030909715347784, + "grad_norm": 2.5162534713745117, + "learning_rate": 5.217506753960575e-06, + "loss": 0.67, + "step": 12413 + }, + { + "epoch": 1.003171781248106, + "grad_norm": 2.6449105739593506, + "learning_rate": 5.216853009395732e-06, + "loss": 0.8688, + "step": 12414 + }, + { + "epoch": 1.0032525909614336, + "grad_norm": 2.5569684505462646, + "learning_rate": 5.216199261116719e-06, + "loss": 0.7325, + "step": 12415 + }, + { + "epoch": 1.0033334006747612, + "grad_norm": 2.832686424255371, + "learning_rate": 5.215545509134732e-06, + "loss": 0.9348, + "step": 12416 + }, + { + "epoch": 1.0034142103880888, + "grad_norm": 2.564300775527954, + "learning_rate": 5.21489175346097e-06, + "loss": 0.8142, + "step": 12417 + }, + { + "epoch": 1.0034950201014161, + "grad_norm": 2.928838014602661, + "learning_rate": 5.21423799410663e-06, + "loss": 0.6983, + "step": 12418 + }, + { + "epoch": 1.0035758298147437, + "grad_norm": 2.172785758972168, + "learning_rate": 5.213584231082908e-06, + "loss": 0.8403, + "step": 12419 + }, + { + "epoch": 1.0036566395280713, + "grad_norm": 2.230769395828247, + "learning_rate": 5.212930464401002e-06, + "loss": 0.7288, + "step": 12420 + }, + { + "epoch": 1.0037374492413988, + "grad_norm": 2.494647264480591, + "learning_rate": 5.212276694072112e-06, + "loss": 0.9566, + "step": 12421 + }, + { + "epoch": 1.0038182589547264, + "grad_norm": 2.367222785949707, + "learning_rate": 5.211622920107431e-06, + "loss": 0.944, + "step": 12422 + }, + { + "epoch": 1.003899068668054, + "grad_norm": 2.6778035163879395, + "learning_rate": 5.210969142518159e-06, + "loss": 0.7903, + "step": 12423 + }, + { + "epoch": 1.0039798783813814, + "grad_norm": 2.5383713245391846, + "learning_rate": 5.210315361315494e-06, + "loss": 0.7225, + "step": 12424 + }, + { + "epoch": 1.004060688094709, + "grad_norm": 2.898833751678467, + "learning_rate": 5.209661576510633e-06, + "loss": 0.791, + "step": 12425 + }, + { + "epoch": 1.0041414978080365, + "grad_norm": 3.2661795616149902, + "learning_rate": 5.209007788114775e-06, + "loss": 0.8662, + "step": 12426 + }, + { + "epoch": 1.004222307521364, + "grad_norm": 3.6123569011688232, + "learning_rate": 5.208353996139115e-06, + "loss": 0.814, + "step": 12427 + }, + { + "epoch": 1.0043031172346917, + "grad_norm": 2.734471082687378, + "learning_rate": 5.207700200594854e-06, + "loss": 0.8, + "step": 12428 + }, + { + "epoch": 1.0043839269480193, + "grad_norm": 2.50447416305542, + "learning_rate": 5.207046401493188e-06, + "loss": 0.6917, + "step": 12429 + }, + { + "epoch": 1.0044647366613466, + "grad_norm": 2.889979362487793, + "learning_rate": 5.2063925988453155e-06, + "loss": 0.7898, + "step": 12430 + }, + { + "epoch": 1.0045455463746742, + "grad_norm": 2.468834638595581, + "learning_rate": 5.205738792662435e-06, + "loss": 0.8287, + "step": 12431 + }, + { + "epoch": 1.0046263560880018, + "grad_norm": 2.7482471466064453, + "learning_rate": 5.205084982955745e-06, + "loss": 0.8683, + "step": 12432 + }, + { + "epoch": 1.0047071658013293, + "grad_norm": 2.8472559452056885, + "learning_rate": 5.2044311697364405e-06, + "loss": 0.7325, + "step": 12433 + }, + { + "epoch": 1.004787975514657, + "grad_norm": 2.7432994842529297, + "learning_rate": 5.203777353015725e-06, + "loss": 0.7982, + "step": 12434 + }, + { + "epoch": 1.0048687852279845, + "grad_norm": 2.612081527709961, + "learning_rate": 5.203123532804793e-06, + "loss": 0.7473, + "step": 12435 + }, + { + "epoch": 1.0049495949413119, + "grad_norm": 2.707618236541748, + "learning_rate": 5.202469709114842e-06, + "loss": 0.7284, + "step": 12436 + }, + { + "epoch": 1.0050304046546394, + "grad_norm": 2.7868945598602295, + "learning_rate": 5.201815881957074e-06, + "loss": 0.8548, + "step": 12437 + }, + { + "epoch": 1.005111214367967, + "grad_norm": 2.8199827671051025, + "learning_rate": 5.201162051342687e-06, + "loss": 0.8966, + "step": 12438 + }, + { + "epoch": 1.0051920240812946, + "grad_norm": 3.1978096961975098, + "learning_rate": 5.200508217282876e-06, + "loss": 0.7633, + "step": 12439 + }, + { + "epoch": 1.0052728337946222, + "grad_norm": 2.757664680480957, + "learning_rate": 5.199854379788843e-06, + "loss": 0.837, + "step": 12440 + }, + { + "epoch": 1.0053536435079498, + "grad_norm": 3.1558547019958496, + "learning_rate": 5.199200538871786e-06, + "loss": 0.7882, + "step": 12441 + }, + { + "epoch": 1.0054344532212771, + "grad_norm": 2.5289220809936523, + "learning_rate": 5.198546694542903e-06, + "loss": 0.919, + "step": 12442 + }, + { + "epoch": 1.0055152629346047, + "grad_norm": 2.881176233291626, + "learning_rate": 5.197892846813393e-06, + "loss": 0.8632, + "step": 12443 + }, + { + "epoch": 1.0055960726479323, + "grad_norm": 2.6454885005950928, + "learning_rate": 5.1972389956944544e-06, + "loss": 0.9492, + "step": 12444 + }, + { + "epoch": 1.0056768823612598, + "grad_norm": 2.912611484527588, + "learning_rate": 5.196585141197288e-06, + "loss": 0.9273, + "step": 12445 + }, + { + "epoch": 1.0057576920745874, + "grad_norm": 2.206540584564209, + "learning_rate": 5.1959312833330895e-06, + "loss": 0.8335, + "step": 12446 + }, + { + "epoch": 1.005838501787915, + "grad_norm": 2.196087121963501, + "learning_rate": 5.195277422113062e-06, + "loss": 0.7331, + "step": 12447 + }, + { + "epoch": 1.0059193115012424, + "grad_norm": 3.0699610710144043, + "learning_rate": 5.1946235575484005e-06, + "loss": 0.8245, + "step": 12448 + }, + { + "epoch": 1.00600012121457, + "grad_norm": 2.666667938232422, + "learning_rate": 5.193969689650308e-06, + "loss": 0.7672, + "step": 12449 + }, + { + "epoch": 1.0060809309278975, + "grad_norm": 2.46816086769104, + "learning_rate": 5.19331581842998e-06, + "loss": 0.7748, + "step": 12450 + }, + { + "epoch": 1.006161740641225, + "grad_norm": 2.4281744956970215, + "learning_rate": 5.192661943898618e-06, + "loss": 0.8805, + "step": 12451 + }, + { + "epoch": 1.0062425503545527, + "grad_norm": 2.9114654064178467, + "learning_rate": 5.192008066067421e-06, + "loss": 0.8672, + "step": 12452 + }, + { + "epoch": 1.0063233600678803, + "grad_norm": 2.379807233810425, + "learning_rate": 5.1913541849475866e-06, + "loss": 0.8261, + "step": 12453 + }, + { + "epoch": 1.0064041697812076, + "grad_norm": 2.6848325729370117, + "learning_rate": 5.190700300550317e-06, + "loss": 0.9017, + "step": 12454 + }, + { + "epoch": 1.0064849794945352, + "grad_norm": 2.5436956882476807, + "learning_rate": 5.19004641288681e-06, + "loss": 0.7208, + "step": 12455 + }, + { + "epoch": 1.0065657892078628, + "grad_norm": 2.371771812438965, + "learning_rate": 5.189392521968266e-06, + "loss": 0.8036, + "step": 12456 + }, + { + "epoch": 1.0066465989211903, + "grad_norm": 2.7062172889709473, + "learning_rate": 5.188738627805884e-06, + "loss": 0.9021, + "step": 12457 + }, + { + "epoch": 1.006727408634518, + "grad_norm": 2.474684476852417, + "learning_rate": 5.1880847304108625e-06, + "loss": 0.8564, + "step": 12458 + }, + { + "epoch": 1.0068082183478455, + "grad_norm": 3.052793502807617, + "learning_rate": 5.187430829794405e-06, + "loss": 0.7722, + "step": 12459 + }, + { + "epoch": 1.0068890280611729, + "grad_norm": 2.415121078491211, + "learning_rate": 5.186776925967706e-06, + "loss": 0.8317, + "step": 12460 + }, + { + "epoch": 1.0069698377745004, + "grad_norm": 2.5590007305145264, + "learning_rate": 5.186123018941967e-06, + "loss": 0.7778, + "step": 12461 + }, + { + "epoch": 1.007050647487828, + "grad_norm": 2.7981984615325928, + "learning_rate": 5.1854691087283915e-06, + "loss": 0.78, + "step": 12462 + }, + { + "epoch": 1.0071314572011556, + "grad_norm": 2.3507823944091797, + "learning_rate": 5.184815195338176e-06, + "loss": 0.9592, + "step": 12463 + }, + { + "epoch": 1.0072122669144832, + "grad_norm": 2.577120065689087, + "learning_rate": 5.18416127878252e-06, + "loss": 0.9041, + "step": 12464 + }, + { + "epoch": 1.0072930766278108, + "grad_norm": 2.737476110458374, + "learning_rate": 5.183507359072626e-06, + "loss": 0.9182, + "step": 12465 + }, + { + "epoch": 1.0073738863411381, + "grad_norm": 2.4286694526672363, + "learning_rate": 5.1828534362196924e-06, + "loss": 0.8065, + "step": 12466 + }, + { + "epoch": 1.0074546960544657, + "grad_norm": 2.646874189376831, + "learning_rate": 5.182199510234919e-06, + "loss": 0.9631, + "step": 12467 + }, + { + "epoch": 1.0075355057677933, + "grad_norm": 2.3519954681396484, + "learning_rate": 5.181545581129507e-06, + "loss": 0.7719, + "step": 12468 + }, + { + "epoch": 1.0076163154811208, + "grad_norm": 2.4610695838928223, + "learning_rate": 5.180891648914656e-06, + "loss": 0.8303, + "step": 12469 + }, + { + "epoch": 1.0076971251944484, + "grad_norm": 3.065570116043091, + "learning_rate": 5.180237713601566e-06, + "loss": 0.7641, + "step": 12470 + }, + { + "epoch": 1.007777934907776, + "grad_norm": 2.477292537689209, + "learning_rate": 5.17958377520144e-06, + "loss": 0.9094, + "step": 12471 + }, + { + "epoch": 1.0078587446211034, + "grad_norm": 2.3511126041412354, + "learning_rate": 5.178929833725473e-06, + "loss": 0.8413, + "step": 12472 + }, + { + "epoch": 1.007939554334431, + "grad_norm": 2.6266164779663086, + "learning_rate": 5.178275889184872e-06, + "loss": 0.8049, + "step": 12473 + }, + { + "epoch": 1.0080203640477585, + "grad_norm": 2.6902618408203125, + "learning_rate": 5.177621941590833e-06, + "loss": 0.8439, + "step": 12474 + }, + { + "epoch": 1.008101173761086, + "grad_norm": 2.522594690322876, + "learning_rate": 5.176967990954557e-06, + "loss": 0.8213, + "step": 12475 + }, + { + "epoch": 1.0081819834744137, + "grad_norm": 3.043595790863037, + "learning_rate": 5.176314037287246e-06, + "loss": 0.7734, + "step": 12476 + }, + { + "epoch": 1.0082627931877413, + "grad_norm": 2.538238286972046, + "learning_rate": 5.1756600806001e-06, + "loss": 0.8054, + "step": 12477 + }, + { + "epoch": 1.0083436029010686, + "grad_norm": 2.71919846534729, + "learning_rate": 5.175006120904319e-06, + "loss": 0.895, + "step": 12478 + }, + { + "epoch": 1.0084244126143962, + "grad_norm": 2.378711700439453, + "learning_rate": 5.1743521582111054e-06, + "loss": 0.8879, + "step": 12479 + }, + { + "epoch": 1.0085052223277238, + "grad_norm": 2.6883904933929443, + "learning_rate": 5.1736981925316575e-06, + "loss": 0.8416, + "step": 12480 + }, + { + "epoch": 1.0085860320410514, + "grad_norm": 2.489894390106201, + "learning_rate": 5.173044223877181e-06, + "loss": 0.8498, + "step": 12481 + }, + { + "epoch": 1.008666841754379, + "grad_norm": 2.707709312438965, + "learning_rate": 5.172390252258871e-06, + "loss": 0.7953, + "step": 12482 + }, + { + "epoch": 1.0087476514677065, + "grad_norm": 2.8330583572387695, + "learning_rate": 5.171736277687931e-06, + "loss": 0.7782, + "step": 12483 + }, + { + "epoch": 1.0088284611810339, + "grad_norm": 2.5243594646453857, + "learning_rate": 5.171082300175562e-06, + "loss": 0.8177, + "step": 12484 + }, + { + "epoch": 1.0089092708943614, + "grad_norm": 2.657561779022217, + "learning_rate": 5.170428319732966e-06, + "loss": 0.8734, + "step": 12485 + }, + { + "epoch": 1.008990080607689, + "grad_norm": 2.4913158416748047, + "learning_rate": 5.169774336371342e-06, + "loss": 0.9185, + "step": 12486 + }, + { + "epoch": 1.0090708903210166, + "grad_norm": 2.6465888023376465, + "learning_rate": 5.1691203501018935e-06, + "loss": 0.8314, + "step": 12487 + }, + { + "epoch": 1.0091517000343442, + "grad_norm": 2.3169476985931396, + "learning_rate": 5.1684663609358195e-06, + "loss": 0.9192, + "step": 12488 + }, + { + "epoch": 1.0092325097476718, + "grad_norm": 2.53898024559021, + "learning_rate": 5.167812368884323e-06, + "loss": 0.8389, + "step": 12489 + }, + { + "epoch": 1.0093133194609991, + "grad_norm": 2.3562941551208496, + "learning_rate": 5.167158373958605e-06, + "loss": 0.7824, + "step": 12490 + }, + { + "epoch": 1.0093941291743267, + "grad_norm": 3.0374176502227783, + "learning_rate": 5.166504376169867e-06, + "loss": 0.6628, + "step": 12491 + }, + { + "epoch": 1.0094749388876543, + "grad_norm": 2.5810141563415527, + "learning_rate": 5.1658503755293075e-06, + "loss": 0.7645, + "step": 12492 + }, + { + "epoch": 1.0095557486009819, + "grad_norm": 2.4141604900360107, + "learning_rate": 5.165196372048133e-06, + "loss": 0.7173, + "step": 12493 + }, + { + "epoch": 1.0096365583143094, + "grad_norm": 2.61149001121521, + "learning_rate": 5.164542365737539e-06, + "loss": 0.7786, + "step": 12494 + }, + { + "epoch": 1.009717368027637, + "grad_norm": 2.6056108474731445, + "learning_rate": 5.1638883566087324e-06, + "loss": 0.883, + "step": 12495 + }, + { + "epoch": 1.0097981777409644, + "grad_norm": 3.0438790321350098, + "learning_rate": 5.1632343446729135e-06, + "loss": 0.6845, + "step": 12496 + }, + { + "epoch": 1.009878987454292, + "grad_norm": 3.2974979877471924, + "learning_rate": 5.162580329941283e-06, + "loss": 0.9291, + "step": 12497 + }, + { + "epoch": 1.0099597971676195, + "grad_norm": 2.6835813522338867, + "learning_rate": 5.161926312425042e-06, + "loss": 0.7465, + "step": 12498 + }, + { + "epoch": 1.010040606880947, + "grad_norm": 2.5462892055511475, + "learning_rate": 5.161272292135394e-06, + "loss": 0.7749, + "step": 12499 + }, + { + "epoch": 1.0101214165942747, + "grad_norm": 2.438927412033081, + "learning_rate": 5.160618269083538e-06, + "loss": 0.867, + "step": 12500 + }, + { + "epoch": 1.0102022263076023, + "grad_norm": 2.727975368499756, + "learning_rate": 5.159964243280681e-06, + "loss": 0.7904, + "step": 12501 + }, + { + "epoch": 1.0102830360209296, + "grad_norm": 2.4510722160339355, + "learning_rate": 5.159310214738019e-06, + "loss": 0.8352, + "step": 12502 + }, + { + "epoch": 1.0103638457342572, + "grad_norm": 2.4921584129333496, + "learning_rate": 5.158656183466757e-06, + "loss": 0.8254, + "step": 12503 + }, + { + "epoch": 1.0104446554475848, + "grad_norm": 2.5787413120269775, + "learning_rate": 5.158002149478096e-06, + "loss": 0.6811, + "step": 12504 + }, + { + "epoch": 1.0105254651609124, + "grad_norm": 2.7388923168182373, + "learning_rate": 5.157348112783239e-06, + "loss": 0.7902, + "step": 12505 + }, + { + "epoch": 1.01060627487424, + "grad_norm": 2.5851552486419678, + "learning_rate": 5.156694073393388e-06, + "loss": 0.9329, + "step": 12506 + }, + { + "epoch": 1.0106870845875675, + "grad_norm": 2.8699493408203125, + "learning_rate": 5.156040031319744e-06, + "loss": 0.8148, + "step": 12507 + }, + { + "epoch": 1.0107678943008949, + "grad_norm": 3.3402721881866455, + "learning_rate": 5.155385986573511e-06, + "loss": 0.7249, + "step": 12508 + }, + { + "epoch": 1.0108487040142224, + "grad_norm": 2.719470977783203, + "learning_rate": 5.154731939165891e-06, + "loss": 0.8114, + "step": 12509 + }, + { + "epoch": 1.01092951372755, + "grad_norm": 2.856743574142456, + "learning_rate": 5.154077889108085e-06, + "loss": 0.7428, + "step": 12510 + }, + { + "epoch": 1.0110103234408776, + "grad_norm": 2.2968685626983643, + "learning_rate": 5.153423836411293e-06, + "loss": 0.8198, + "step": 12511 + }, + { + "epoch": 1.0110911331542052, + "grad_norm": 3.1482110023498535, + "learning_rate": 5.152769781086723e-06, + "loss": 0.8385, + "step": 12512 + }, + { + "epoch": 1.0111719428675328, + "grad_norm": 2.8562870025634766, + "learning_rate": 5.152115723145572e-06, + "loss": 0.8137, + "step": 12513 + }, + { + "epoch": 1.0112527525808601, + "grad_norm": 2.7162911891937256, + "learning_rate": 5.151461662599047e-06, + "loss": 0.7455, + "step": 12514 + }, + { + "epoch": 1.0113335622941877, + "grad_norm": 2.4655370712280273, + "learning_rate": 5.1508075994583465e-06, + "loss": 0.7117, + "step": 12515 + }, + { + "epoch": 1.0114143720075153, + "grad_norm": 2.6548609733581543, + "learning_rate": 5.150153533734677e-06, + "loss": 0.8141, + "step": 12516 + }, + { + "epoch": 1.0114951817208429, + "grad_norm": 2.7891759872436523, + "learning_rate": 5.149499465439237e-06, + "loss": 0.8306, + "step": 12517 + }, + { + "epoch": 1.0115759914341704, + "grad_norm": 3.0253827571868896, + "learning_rate": 5.148845394583233e-06, + "loss": 0.7892, + "step": 12518 + }, + { + "epoch": 1.011656801147498, + "grad_norm": 2.6911189556121826, + "learning_rate": 5.148191321177864e-06, + "loss": 0.8156, + "step": 12519 + }, + { + "epoch": 1.0117376108608254, + "grad_norm": 2.3544700145721436, + "learning_rate": 5.147537245234334e-06, + "loss": 0.9618, + "step": 12520 + }, + { + "epoch": 1.011818420574153, + "grad_norm": 2.5659430027008057, + "learning_rate": 5.1468831667638475e-06, + "loss": 0.7331, + "step": 12521 + }, + { + "epoch": 1.0118992302874805, + "grad_norm": 2.667371988296509, + "learning_rate": 5.146229085777605e-06, + "loss": 0.789, + "step": 12522 + }, + { + "epoch": 1.011980040000808, + "grad_norm": 2.415018320083618, + "learning_rate": 5.145575002286811e-06, + "loss": 0.8749, + "step": 12523 + }, + { + "epoch": 1.0120608497141357, + "grad_norm": 2.4200429916381836, + "learning_rate": 5.144920916302669e-06, + "loss": 0.7911, + "step": 12524 + }, + { + "epoch": 1.0121416594274633, + "grad_norm": 2.622950553894043, + "learning_rate": 5.144266827836378e-06, + "loss": 0.8284, + "step": 12525 + }, + { + "epoch": 1.0122224691407906, + "grad_norm": 2.663954019546509, + "learning_rate": 5.143612736899145e-06, + "loss": 0.9321, + "step": 12526 + }, + { + "epoch": 1.0123032788541182, + "grad_norm": 2.183441638946533, + "learning_rate": 5.142958643502172e-06, + "loss": 0.7527, + "step": 12527 + }, + { + "epoch": 1.0123840885674458, + "grad_norm": 2.886779546737671, + "learning_rate": 5.14230454765666e-06, + "loss": 1.0143, + "step": 12528 + }, + { + "epoch": 1.0124648982807734, + "grad_norm": 2.8194220066070557, + "learning_rate": 5.141650449373815e-06, + "loss": 0.8163, + "step": 12529 + }, + { + "epoch": 1.012545707994101, + "grad_norm": 2.681802988052368, + "learning_rate": 5.14099634866484e-06, + "loss": 0.8449, + "step": 12530 + }, + { + "epoch": 1.0126265177074285, + "grad_norm": 2.327749013900757, + "learning_rate": 5.1403422455409334e-06, + "loss": 0.8555, + "step": 12531 + }, + { + "epoch": 1.0127073274207559, + "grad_norm": 2.747070074081421, + "learning_rate": 5.139688140013305e-06, + "loss": 0.8523, + "step": 12532 + }, + { + "epoch": 1.0127881371340834, + "grad_norm": 2.4575886726379395, + "learning_rate": 5.139034032093153e-06, + "loss": 0.7795, + "step": 12533 + }, + { + "epoch": 1.012868946847411, + "grad_norm": 2.783709764480591, + "learning_rate": 5.138379921791684e-06, + "loss": 0.8766, + "step": 12534 + }, + { + "epoch": 1.0129497565607386, + "grad_norm": 2.537069320678711, + "learning_rate": 5.1377258091201e-06, + "loss": 0.7238, + "step": 12535 + }, + { + "epoch": 1.0130305662740662, + "grad_norm": 2.3739278316497803, + "learning_rate": 5.137071694089604e-06, + "loss": 0.9084, + "step": 12536 + }, + { + "epoch": 1.0131113759873938, + "grad_norm": 3.3975095748901367, + "learning_rate": 5.1364175767114e-06, + "loss": 0.8118, + "step": 12537 + }, + { + "epoch": 1.0131921857007211, + "grad_norm": 2.5251986980438232, + "learning_rate": 5.135763456996692e-06, + "loss": 0.8556, + "step": 12538 + }, + { + "epoch": 1.0132729954140487, + "grad_norm": 2.407038927078247, + "learning_rate": 5.135109334956682e-06, + "loss": 0.8468, + "step": 12539 + }, + { + "epoch": 1.0133538051273763, + "grad_norm": 2.478182315826416, + "learning_rate": 5.134455210602575e-06, + "loss": 0.7428, + "step": 12540 + }, + { + "epoch": 1.0134346148407039, + "grad_norm": 2.464784860610962, + "learning_rate": 5.133801083945573e-06, + "loss": 0.6878, + "step": 12541 + }, + { + "epoch": 1.0135154245540314, + "grad_norm": 2.6182079315185547, + "learning_rate": 5.1331469549968814e-06, + "loss": 0.8197, + "step": 12542 + }, + { + "epoch": 1.013596234267359, + "grad_norm": 2.762474298477173, + "learning_rate": 5.132492823767702e-06, + "loss": 0.7491, + "step": 12543 + }, + { + "epoch": 1.0136770439806866, + "grad_norm": 2.3340630531311035, + "learning_rate": 5.131838690269242e-06, + "loss": 0.8649, + "step": 12544 + }, + { + "epoch": 1.013757853694014, + "grad_norm": 2.8064169883728027, + "learning_rate": 5.1311845545126995e-06, + "loss": 0.9643, + "step": 12545 + }, + { + "epoch": 1.0138386634073415, + "grad_norm": 2.4351413249969482, + "learning_rate": 5.130530416509283e-06, + "loss": 0.8348, + "step": 12546 + }, + { + "epoch": 1.013919473120669, + "grad_norm": 2.338689088821411, + "learning_rate": 5.129876276270195e-06, + "loss": 0.8594, + "step": 12547 + }, + { + "epoch": 1.0140002828339967, + "grad_norm": 2.895043134689331, + "learning_rate": 5.129222133806638e-06, + "loss": 0.7953, + "step": 12548 + }, + { + "epoch": 1.0140810925473243, + "grad_norm": 2.9791154861450195, + "learning_rate": 5.128567989129816e-06, + "loss": 0.8836, + "step": 12549 + }, + { + "epoch": 1.0141619022606518, + "grad_norm": 2.254404306411743, + "learning_rate": 5.127913842250936e-06, + "loss": 0.804, + "step": 12550 + }, + { + "epoch": 1.0142427119739792, + "grad_norm": 2.5991082191467285, + "learning_rate": 5.127259693181199e-06, + "loss": 0.8925, + "step": 12551 + }, + { + "epoch": 1.0143235216873068, + "grad_norm": 2.7920894622802734, + "learning_rate": 5.126605541931811e-06, + "loss": 0.9347, + "step": 12552 + }, + { + "epoch": 1.0144043314006344, + "grad_norm": 2.6438891887664795, + "learning_rate": 5.125951388513972e-06, + "loss": 0.6857, + "step": 12553 + }, + { + "epoch": 1.014485141113962, + "grad_norm": 2.456252336502075, + "learning_rate": 5.125297232938892e-06, + "loss": 0.8561, + "step": 12554 + }, + { + "epoch": 1.0145659508272895, + "grad_norm": 2.6659305095672607, + "learning_rate": 5.124643075217771e-06, + "loss": 0.8925, + "step": 12555 + }, + { + "epoch": 1.014646760540617, + "grad_norm": 2.682535409927368, + "learning_rate": 5.123988915361814e-06, + "loss": 0.7246, + "step": 12556 + }, + { + "epoch": 1.0147275702539444, + "grad_norm": 2.886742353439331, + "learning_rate": 5.123334753382224e-06, + "loss": 0.6959, + "step": 12557 + }, + { + "epoch": 1.014808379967272, + "grad_norm": 2.5031943321228027, + "learning_rate": 5.1226805892902095e-06, + "loss": 0.7845, + "step": 12558 + }, + { + "epoch": 1.0148891896805996, + "grad_norm": 2.4899203777313232, + "learning_rate": 5.122026423096968e-06, + "loss": 0.7403, + "step": 12559 + }, + { + "epoch": 1.0149699993939272, + "grad_norm": 2.523195505142212, + "learning_rate": 5.121372254813712e-06, + "loss": 0.803, + "step": 12560 + }, + { + "epoch": 1.0150508091072548, + "grad_norm": 3.197504758834839, + "learning_rate": 5.120718084451639e-06, + "loss": 0.8498, + "step": 12561 + }, + { + "epoch": 1.0151316188205823, + "grad_norm": 2.8150250911712646, + "learning_rate": 5.120063912021957e-06, + "loss": 0.8401, + "step": 12562 + }, + { + "epoch": 1.0152124285339097, + "grad_norm": 2.3248839378356934, + "learning_rate": 5.1194097375358675e-06, + "loss": 0.8333, + "step": 12563 + }, + { + "epoch": 1.0152932382472373, + "grad_norm": 2.4993953704833984, + "learning_rate": 5.118755561004577e-06, + "loss": 0.8824, + "step": 12564 + }, + { + "epoch": 1.0153740479605649, + "grad_norm": 2.494349718093872, + "learning_rate": 5.11810138243929e-06, + "loss": 0.794, + "step": 12565 + }, + { + "epoch": 1.0154548576738924, + "grad_norm": 2.4131264686584473, + "learning_rate": 5.117447201851212e-06, + "loss": 0.8869, + "step": 12566 + }, + { + "epoch": 1.01553566738722, + "grad_norm": 2.6513562202453613, + "learning_rate": 5.1167930192515434e-06, + "loss": 0.8108, + "step": 12567 + }, + { + "epoch": 1.0156164771005476, + "grad_norm": 2.5764501094818115, + "learning_rate": 5.116138834651494e-06, + "loss": 0.8413, + "step": 12568 + }, + { + "epoch": 1.015697286813875, + "grad_norm": 2.739586114883423, + "learning_rate": 5.115484648062265e-06, + "loss": 0.9202, + "step": 12569 + }, + { + "epoch": 1.0157780965272025, + "grad_norm": 2.8270676136016846, + "learning_rate": 5.114830459495063e-06, + "loss": 0.8101, + "step": 12570 + }, + { + "epoch": 1.01585890624053, + "grad_norm": 2.359265089035034, + "learning_rate": 5.114176268961089e-06, + "loss": 0.8041, + "step": 12571 + }, + { + "epoch": 1.0159397159538577, + "grad_norm": 2.790734052658081, + "learning_rate": 5.113522076471553e-06, + "loss": 0.8016, + "step": 12572 + }, + { + "epoch": 1.0160205256671853, + "grad_norm": 2.2723278999328613, + "learning_rate": 5.1128678820376565e-06, + "loss": 0.9098, + "step": 12573 + }, + { + "epoch": 1.0161013353805128, + "grad_norm": 2.446974515914917, + "learning_rate": 5.112213685670604e-06, + "loss": 0.7743, + "step": 12574 + }, + { + "epoch": 1.0161821450938402, + "grad_norm": 3.0320420265197754, + "learning_rate": 5.111559487381603e-06, + "loss": 0.8606, + "step": 12575 + }, + { + "epoch": 1.0162629548071678, + "grad_norm": 2.7096023559570312, + "learning_rate": 5.110905287181855e-06, + "loss": 0.8023, + "step": 12576 + }, + { + "epoch": 1.0163437645204954, + "grad_norm": 2.3316524028778076, + "learning_rate": 5.110251085082567e-06, + "loss": 0.7733, + "step": 12577 + }, + { + "epoch": 1.016424574233823, + "grad_norm": 2.485217332839966, + "learning_rate": 5.109596881094942e-06, + "loss": 0.8495, + "step": 12578 + }, + { + "epoch": 1.0165053839471505, + "grad_norm": 2.262326955795288, + "learning_rate": 5.108942675230188e-06, + "loss": 0.8006, + "step": 12579 + }, + { + "epoch": 1.016586193660478, + "grad_norm": 2.484311580657959, + "learning_rate": 5.1082884674995085e-06, + "loss": 0.8535, + "step": 12580 + }, + { + "epoch": 1.0166670033738054, + "grad_norm": 2.561978578567505, + "learning_rate": 5.107634257914107e-06, + "loss": 0.8623, + "step": 12581 + }, + { + "epoch": 1.016747813087133, + "grad_norm": 2.7206268310546875, + "learning_rate": 5.106980046485189e-06, + "loss": 0.7643, + "step": 12582 + }, + { + "epoch": 1.0168286228004606, + "grad_norm": 2.300199031829834, + "learning_rate": 5.106325833223963e-06, + "loss": 0.8361, + "step": 12583 + }, + { + "epoch": 1.0169094325137882, + "grad_norm": 2.913027763366699, + "learning_rate": 5.10567161814163e-06, + "loss": 0.9844, + "step": 12584 + }, + { + "epoch": 1.0169902422271158, + "grad_norm": 2.632624387741089, + "learning_rate": 5.105017401249397e-06, + "loss": 0.8527, + "step": 12585 + }, + { + "epoch": 1.0170710519404433, + "grad_norm": 2.1495048999786377, + "learning_rate": 5.104363182558467e-06, + "loss": 0.7796, + "step": 12586 + }, + { + "epoch": 1.0171518616537707, + "grad_norm": 2.6786608695983887, + "learning_rate": 5.103708962080048e-06, + "loss": 0.7279, + "step": 12587 + }, + { + "epoch": 1.0172326713670983, + "grad_norm": 2.464973211288452, + "learning_rate": 5.103054739825345e-06, + "loss": 0.8749, + "step": 12588 + }, + { + "epoch": 1.0173134810804259, + "grad_norm": 2.6223528385162354, + "learning_rate": 5.102400515805561e-06, + "loss": 0.8116, + "step": 12589 + }, + { + "epoch": 1.0173942907937534, + "grad_norm": 2.671349048614502, + "learning_rate": 5.101746290031903e-06, + "loss": 0.8056, + "step": 12590 + }, + { + "epoch": 1.017475100507081, + "grad_norm": 2.7348575592041016, + "learning_rate": 5.101092062515578e-06, + "loss": 0.8513, + "step": 12591 + }, + { + "epoch": 1.0175559102204086, + "grad_norm": 3.5210304260253906, + "learning_rate": 5.100437833267788e-06, + "loss": 0.8594, + "step": 12592 + }, + { + "epoch": 1.017636719933736, + "grad_norm": 2.3753433227539062, + "learning_rate": 5.099783602299739e-06, + "loss": 0.8299, + "step": 12593 + }, + { + "epoch": 1.0177175296470635, + "grad_norm": 2.5451722145080566, + "learning_rate": 5.099129369622639e-06, + "loss": 0.844, + "step": 12594 + }, + { + "epoch": 1.017798339360391, + "grad_norm": 3.6367604732513428, + "learning_rate": 5.098475135247689e-06, + "loss": 0.8029, + "step": 12595 + }, + { + "epoch": 1.0178791490737187, + "grad_norm": 2.472900867462158, + "learning_rate": 5.097820899186098e-06, + "loss": 0.8026, + "step": 12596 + }, + { + "epoch": 1.0179599587870463, + "grad_norm": 3.27156138420105, + "learning_rate": 5.0971666614490725e-06, + "loss": 0.9626, + "step": 12597 + }, + { + "epoch": 1.0180407685003738, + "grad_norm": 2.752840518951416, + "learning_rate": 5.096512422047812e-06, + "loss": 0.8839, + "step": 12598 + }, + { + "epoch": 1.0181215782137012, + "grad_norm": 2.6791982650756836, + "learning_rate": 5.095858180993529e-06, + "loss": 0.7915, + "step": 12599 + }, + { + "epoch": 1.0182023879270288, + "grad_norm": 2.264230489730835, + "learning_rate": 5.095203938297426e-06, + "loss": 0.8246, + "step": 12600 + }, + { + "epoch": 1.0182831976403564, + "grad_norm": 2.516456127166748, + "learning_rate": 5.094549693970707e-06, + "loss": 0.727, + "step": 12601 + }, + { + "epoch": 1.018364007353684, + "grad_norm": 2.3891682624816895, + "learning_rate": 5.093895448024581e-06, + "loss": 0.8913, + "step": 12602 + }, + { + "epoch": 1.0184448170670115, + "grad_norm": 2.3722238540649414, + "learning_rate": 5.093241200470252e-06, + "loss": 0.766, + "step": 12603 + }, + { + "epoch": 1.018525626780339, + "grad_norm": 2.553745746612549, + "learning_rate": 5.092586951318924e-06, + "loss": 0.7955, + "step": 12604 + }, + { + "epoch": 1.0186064364936664, + "grad_norm": 2.5744190216064453, + "learning_rate": 5.0919327005818065e-06, + "loss": 0.7883, + "step": 12605 + }, + { + "epoch": 1.018687246206994, + "grad_norm": 3.2034945487976074, + "learning_rate": 5.0912784482701015e-06, + "loss": 0.8694, + "step": 12606 + }, + { + "epoch": 1.0187680559203216, + "grad_norm": 2.4338269233703613, + "learning_rate": 5.090624194395018e-06, + "loss": 0.7803, + "step": 12607 + }, + { + "epoch": 1.0188488656336492, + "grad_norm": 2.874695062637329, + "learning_rate": 5.089969938967759e-06, + "loss": 0.8685, + "step": 12608 + }, + { + "epoch": 1.0189296753469768, + "grad_norm": 2.651587963104248, + "learning_rate": 5.089315681999531e-06, + "loss": 0.8423, + "step": 12609 + }, + { + "epoch": 1.0190104850603043, + "grad_norm": 2.773283004760742, + "learning_rate": 5.088661423501542e-06, + "loss": 0.858, + "step": 12610 + }, + { + "epoch": 1.0190912947736317, + "grad_norm": 2.34829044342041, + "learning_rate": 5.088007163484997e-06, + "loss": 0.8556, + "step": 12611 + }, + { + "epoch": 1.0191721044869593, + "grad_norm": 3.038344383239746, + "learning_rate": 5.087352901961098e-06, + "loss": 0.8405, + "step": 12612 + }, + { + "epoch": 1.0192529142002869, + "grad_norm": 2.6653900146484375, + "learning_rate": 5.0866986389410564e-06, + "loss": 0.8067, + "step": 12613 + }, + { + "epoch": 1.0193337239136144, + "grad_norm": 2.984839677810669, + "learning_rate": 5.086044374436076e-06, + "loss": 0.8352, + "step": 12614 + }, + { + "epoch": 1.019414533626942, + "grad_norm": 2.7549664974212646, + "learning_rate": 5.085390108457362e-06, + "loss": 0.7722, + "step": 12615 + }, + { + "epoch": 1.0194953433402696, + "grad_norm": 2.6267752647399902, + "learning_rate": 5.08473584101612e-06, + "loss": 0.8012, + "step": 12616 + }, + { + "epoch": 1.019576153053597, + "grad_norm": 2.9457740783691406, + "learning_rate": 5.084081572123558e-06, + "loss": 0.8584, + "step": 12617 + }, + { + "epoch": 1.0196569627669245, + "grad_norm": 2.533068895339966, + "learning_rate": 5.083427301790881e-06, + "loss": 0.8131, + "step": 12618 + }, + { + "epoch": 1.019737772480252, + "grad_norm": 2.8118538856506348, + "learning_rate": 5.082773030029297e-06, + "loss": 0.7693, + "step": 12619 + }, + { + "epoch": 1.0198185821935797, + "grad_norm": 2.4821224212646484, + "learning_rate": 5.082118756850007e-06, + "loss": 0.9049, + "step": 12620 + }, + { + "epoch": 1.0198993919069073, + "grad_norm": 2.4987287521362305, + "learning_rate": 5.081464482264223e-06, + "loss": 0.8304, + "step": 12621 + }, + { + "epoch": 1.0199802016202348, + "grad_norm": 2.7845113277435303, + "learning_rate": 5.080810206283147e-06, + "loss": 0.791, + "step": 12622 + }, + { + "epoch": 1.0200610113335622, + "grad_norm": 2.3762574195861816, + "learning_rate": 5.080155928917986e-06, + "loss": 0.8095, + "step": 12623 + }, + { + "epoch": 1.0201418210468898, + "grad_norm": 2.5248916149139404, + "learning_rate": 5.079501650179948e-06, + "loss": 0.7802, + "step": 12624 + }, + { + "epoch": 1.0202226307602174, + "grad_norm": 2.6407883167266846, + "learning_rate": 5.078847370080239e-06, + "loss": 0.7451, + "step": 12625 + }, + { + "epoch": 1.020303440473545, + "grad_norm": 2.595813512802124, + "learning_rate": 5.078193088630062e-06, + "loss": 0.8572, + "step": 12626 + }, + { + "epoch": 1.0203842501868725, + "grad_norm": 2.273280382156372, + "learning_rate": 5.077538805840629e-06, + "loss": 0.8425, + "step": 12627 + }, + { + "epoch": 1.0204650599002, + "grad_norm": 2.8976593017578125, + "learning_rate": 5.07688452172314e-06, + "loss": 0.824, + "step": 12628 + }, + { + "epoch": 1.0205458696135274, + "grad_norm": 2.5143322944641113, + "learning_rate": 5.076230236288805e-06, + "loss": 0.8648, + "step": 12629 + }, + { + "epoch": 1.020626679326855, + "grad_norm": 2.59073805809021, + "learning_rate": 5.07557594954883e-06, + "loss": 0.779, + "step": 12630 + }, + { + "epoch": 1.0207074890401826, + "grad_norm": 2.656137704849243, + "learning_rate": 5.07492166151442e-06, + "loss": 0.8114, + "step": 12631 + }, + { + "epoch": 1.0207882987535102, + "grad_norm": 2.878208875656128, + "learning_rate": 5.074267372196784e-06, + "loss": 0.8133, + "step": 12632 + }, + { + "epoch": 1.0208691084668378, + "grad_norm": 2.3239173889160156, + "learning_rate": 5.0736130816071265e-06, + "loss": 0.8897, + "step": 12633 + }, + { + "epoch": 1.0209499181801653, + "grad_norm": 2.847738027572632, + "learning_rate": 5.072958789756653e-06, + "loss": 0.8624, + "step": 12634 + }, + { + "epoch": 1.0210307278934927, + "grad_norm": 2.5505475997924805, + "learning_rate": 5.0723044966565716e-06, + "loss": 0.8422, + "step": 12635 + }, + { + "epoch": 1.0211115376068203, + "grad_norm": 2.650987148284912, + "learning_rate": 5.071650202318088e-06, + "loss": 0.791, + "step": 12636 + }, + { + "epoch": 1.0211923473201479, + "grad_norm": 3.117178440093994, + "learning_rate": 5.070995906752409e-06, + "loss": 0.8437, + "step": 12637 + }, + { + "epoch": 1.0212731570334754, + "grad_norm": 2.452789068222046, + "learning_rate": 5.0703416099707404e-06, + "loss": 0.8355, + "step": 12638 + }, + { + "epoch": 1.021353966746803, + "grad_norm": 2.923870801925659, + "learning_rate": 5.0696873119842906e-06, + "loss": 0.833, + "step": 12639 + }, + { + "epoch": 1.0214347764601306, + "grad_norm": 2.6646676063537598, + "learning_rate": 5.0690330128042645e-06, + "loss": 0.7995, + "step": 12640 + }, + { + "epoch": 1.021515586173458, + "grad_norm": 2.2564642429351807, + "learning_rate": 5.0683787124418695e-06, + "loss": 0.775, + "step": 12641 + }, + { + "epoch": 1.0215963958867855, + "grad_norm": 2.532127618789673, + "learning_rate": 5.06772441090831e-06, + "loss": 0.7502, + "step": 12642 + }, + { + "epoch": 1.021677205600113, + "grad_norm": 2.6183149814605713, + "learning_rate": 5.067070108214795e-06, + "loss": 0.7897, + "step": 12643 + }, + { + "epoch": 1.0217580153134407, + "grad_norm": 2.713432550430298, + "learning_rate": 5.066415804372532e-06, + "loss": 0.8426, + "step": 12644 + }, + { + "epoch": 1.0218388250267683, + "grad_norm": 2.9958345890045166, + "learning_rate": 5.065761499392725e-06, + "loss": 0.7845, + "step": 12645 + }, + { + "epoch": 1.0219196347400958, + "grad_norm": 3.305631637573242, + "learning_rate": 5.065107193286581e-06, + "loss": 0.8995, + "step": 12646 + }, + { + "epoch": 1.0220004444534232, + "grad_norm": 2.428516149520874, + "learning_rate": 5.0644528860653096e-06, + "loss": 0.8722, + "step": 12647 + }, + { + "epoch": 1.0220812541667508, + "grad_norm": 2.7107222080230713, + "learning_rate": 5.0637985777401145e-06, + "loss": 0.8177, + "step": 12648 + }, + { + "epoch": 1.0221620638800784, + "grad_norm": 2.5419907569885254, + "learning_rate": 5.063144268322203e-06, + "loss": 0.7565, + "step": 12649 + }, + { + "epoch": 1.022242873593406, + "grad_norm": 2.2726142406463623, + "learning_rate": 5.0624899578227825e-06, + "loss": 0.7221, + "step": 12650 + }, + { + "epoch": 1.0223236833067335, + "grad_norm": 2.45625901222229, + "learning_rate": 5.061835646253059e-06, + "loss": 0.7756, + "step": 12651 + }, + { + "epoch": 1.022404493020061, + "grad_norm": 3.0126771926879883, + "learning_rate": 5.061181333624241e-06, + "loss": 0.8195, + "step": 12652 + }, + { + "epoch": 1.0224853027333884, + "grad_norm": 2.5299274921417236, + "learning_rate": 5.060527019947533e-06, + "loss": 0.8244, + "step": 12653 + }, + { + "epoch": 1.022566112446716, + "grad_norm": 2.408522367477417, + "learning_rate": 5.059872705234144e-06, + "loss": 0.8256, + "step": 12654 + }, + { + "epoch": 1.0226469221600436, + "grad_norm": 2.281392812728882, + "learning_rate": 5.059218389495277e-06, + "loss": 0.8795, + "step": 12655 + }, + { + "epoch": 1.0227277318733712, + "grad_norm": 3.077888011932373, + "learning_rate": 5.058564072742145e-06, + "loss": 0.773, + "step": 12656 + }, + { + "epoch": 1.0228085415866988, + "grad_norm": 3.169032096862793, + "learning_rate": 5.057909754985948e-06, + "loss": 0.7304, + "step": 12657 + }, + { + "epoch": 1.0228893513000263, + "grad_norm": 2.776294469833374, + "learning_rate": 5.057255436237899e-06, + "loss": 0.8589, + "step": 12658 + }, + { + "epoch": 1.0229701610133537, + "grad_norm": 2.7166430950164795, + "learning_rate": 5.0566011165092e-06, + "loss": 0.7925, + "step": 12659 + }, + { + "epoch": 1.0230509707266813, + "grad_norm": 3.373649835586548, + "learning_rate": 5.0559467958110635e-06, + "loss": 0.7587, + "step": 12660 + }, + { + "epoch": 1.0231317804400089, + "grad_norm": 2.545762538909912, + "learning_rate": 5.05529247415469e-06, + "loss": 0.799, + "step": 12661 + }, + { + "epoch": 1.0232125901533364, + "grad_norm": 2.73091721534729, + "learning_rate": 5.0546381515512896e-06, + "loss": 0.8279, + "step": 12662 + }, + { + "epoch": 1.023293399866664, + "grad_norm": 2.65697979927063, + "learning_rate": 5.0539838280120715e-06, + "loss": 0.8238, + "step": 12663 + }, + { + "epoch": 1.0233742095799916, + "grad_norm": 2.65800142288208, + "learning_rate": 5.053329503548239e-06, + "loss": 0.785, + "step": 12664 + }, + { + "epoch": 1.0234550192933192, + "grad_norm": 3.661139965057373, + "learning_rate": 5.052675178170999e-06, + "loss": 0.8133, + "step": 12665 + }, + { + "epoch": 1.0235358290066465, + "grad_norm": 2.6694295406341553, + "learning_rate": 5.052020851891563e-06, + "loss": 0.8975, + "step": 12666 + }, + { + "epoch": 1.023616638719974, + "grad_norm": 2.9427649974823, + "learning_rate": 5.051366524721133e-06, + "loss": 0.8424, + "step": 12667 + }, + { + "epoch": 1.0236974484333017, + "grad_norm": 2.6894493103027344, + "learning_rate": 5.050712196670918e-06, + "loss": 0.8914, + "step": 12668 + }, + { + "epoch": 1.0237782581466293, + "grad_norm": 2.7558834552764893, + "learning_rate": 5.050057867752126e-06, + "loss": 0.8858, + "step": 12669 + }, + { + "epoch": 1.0238590678599568, + "grad_norm": 3.1157639026641846, + "learning_rate": 5.049403537975964e-06, + "loss": 0.817, + "step": 12670 + }, + { + "epoch": 1.0239398775732842, + "grad_norm": 2.6721572875976562, + "learning_rate": 5.048749207353636e-06, + "loss": 0.8365, + "step": 12671 + }, + { + "epoch": 1.0240206872866118, + "grad_norm": 2.8743832111358643, + "learning_rate": 5.048094875896354e-06, + "loss": 0.8592, + "step": 12672 + }, + { + "epoch": 1.0241014969999394, + "grad_norm": 2.6651179790496826, + "learning_rate": 5.047440543615321e-06, + "loss": 0.702, + "step": 12673 + }, + { + "epoch": 1.024182306713267, + "grad_norm": 2.6311092376708984, + "learning_rate": 5.0467862105217455e-06, + "loss": 0.7006, + "step": 12674 + }, + { + "epoch": 1.0242631164265945, + "grad_norm": 2.699047803878784, + "learning_rate": 5.0461318766268364e-06, + "loss": 0.8941, + "step": 12675 + }, + { + "epoch": 1.024343926139922, + "grad_norm": 2.647075891494751, + "learning_rate": 5.045477541941798e-06, + "loss": 0.726, + "step": 12676 + }, + { + "epoch": 1.0244247358532497, + "grad_norm": 2.4319651126861572, + "learning_rate": 5.044823206477839e-06, + "loss": 0.9222, + "step": 12677 + }, + { + "epoch": 1.024505545566577, + "grad_norm": 2.899913787841797, + "learning_rate": 5.044168870246166e-06, + "loss": 0.9301, + "step": 12678 + }, + { + "epoch": 1.0245863552799046, + "grad_norm": 2.800304412841797, + "learning_rate": 5.043514533257987e-06, + "loss": 0.7484, + "step": 12679 + }, + { + "epoch": 1.0246671649932322, + "grad_norm": 2.4168550968170166, + "learning_rate": 5.042860195524509e-06, + "loss": 0.8403, + "step": 12680 + }, + { + "epoch": 1.0247479747065598, + "grad_norm": 2.4322919845581055, + "learning_rate": 5.04220585705694e-06, + "loss": 0.8399, + "step": 12681 + }, + { + "epoch": 1.0248287844198873, + "grad_norm": 2.780128002166748, + "learning_rate": 5.0415515178664846e-06, + "loss": 0.7998, + "step": 12682 + }, + { + "epoch": 1.024909594133215, + "grad_norm": 2.586167573928833, + "learning_rate": 5.040897177964353e-06, + "loss": 0.7803, + "step": 12683 + }, + { + "epoch": 1.0249904038465423, + "grad_norm": 3.1910691261291504, + "learning_rate": 5.040242837361751e-06, + "loss": 0.7963, + "step": 12684 + }, + { + "epoch": 1.0250712135598699, + "grad_norm": 2.624464511871338, + "learning_rate": 5.039588496069883e-06, + "loss": 0.8317, + "step": 12685 + }, + { + "epoch": 1.0251520232731974, + "grad_norm": 2.69215989112854, + "learning_rate": 5.038934154099964e-06, + "loss": 0.7799, + "step": 12686 + }, + { + "epoch": 1.025232832986525, + "grad_norm": 2.8588006496429443, + "learning_rate": 5.038279811463193e-06, + "loss": 0.9083, + "step": 12687 + }, + { + "epoch": 1.0253136426998526, + "grad_norm": 2.506113290786743, + "learning_rate": 5.037625468170783e-06, + "loss": 0.8346, + "step": 12688 + }, + { + "epoch": 1.0253944524131802, + "grad_norm": 2.533895492553711, + "learning_rate": 5.0369711242339396e-06, + "loss": 0.8112, + "step": 12689 + }, + { + "epoch": 1.0254752621265075, + "grad_norm": 2.662245512008667, + "learning_rate": 5.036316779663869e-06, + "loss": 0.8136, + "step": 12690 + }, + { + "epoch": 1.025556071839835, + "grad_norm": 2.8152735233306885, + "learning_rate": 5.0356624344717785e-06, + "loss": 0.9144, + "step": 12691 + }, + { + "epoch": 1.0256368815531627, + "grad_norm": 2.935899496078491, + "learning_rate": 5.035008088668879e-06, + "loss": 0.8417, + "step": 12692 + }, + { + "epoch": 1.0257176912664903, + "grad_norm": 2.5224225521087646, + "learning_rate": 5.034353742266372e-06, + "loss": 0.8094, + "step": 12693 + }, + { + "epoch": 1.0257985009798178, + "grad_norm": 2.627077341079712, + "learning_rate": 5.033699395275471e-06, + "loss": 0.8952, + "step": 12694 + }, + { + "epoch": 1.0258793106931454, + "grad_norm": 2.5839152336120605, + "learning_rate": 5.033045047707379e-06, + "loss": 0.9106, + "step": 12695 + }, + { + "epoch": 1.0259601204064728, + "grad_norm": 2.659386157989502, + "learning_rate": 5.0323906995733055e-06, + "loss": 0.8824, + "step": 12696 + }, + { + "epoch": 1.0260409301198004, + "grad_norm": 2.8633780479431152, + "learning_rate": 5.031736350884456e-06, + "loss": 0.9488, + "step": 12697 + }, + { + "epoch": 1.026121739833128, + "grad_norm": 2.4078078269958496, + "learning_rate": 5.03108200165204e-06, + "loss": 0.8426, + "step": 12698 + }, + { + "epoch": 1.0262025495464555, + "grad_norm": 2.6873888969421387, + "learning_rate": 5.030427651887264e-06, + "loss": 0.7887, + "step": 12699 + }, + { + "epoch": 1.026283359259783, + "grad_norm": 3.359510898590088, + "learning_rate": 5.029773301601338e-06, + "loss": 0.7892, + "step": 12700 + }, + { + "epoch": 1.0263641689731107, + "grad_norm": 2.856140375137329, + "learning_rate": 5.0291189508054624e-06, + "loss": 0.8906, + "step": 12701 + }, + { + "epoch": 1.026444978686438, + "grad_norm": 2.6707277297973633, + "learning_rate": 5.028464599510853e-06, + "loss": 0.8341, + "step": 12702 + }, + { + "epoch": 1.0265257883997656, + "grad_norm": 2.730442523956299, + "learning_rate": 5.027810247728712e-06, + "loss": 0.7867, + "step": 12703 + }, + { + "epoch": 1.0266065981130932, + "grad_norm": 2.8404700756073, + "learning_rate": 5.027155895470248e-06, + "loss": 0.8352, + "step": 12704 + }, + { + "epoch": 1.0266874078264208, + "grad_norm": 2.6206679344177246, + "learning_rate": 5.0265015427466705e-06, + "loss": 0.8447, + "step": 12705 + }, + { + "epoch": 1.0267682175397483, + "grad_norm": 2.6233479976654053, + "learning_rate": 5.025847189569183e-06, + "loss": 0.8776, + "step": 12706 + }, + { + "epoch": 1.026849027253076, + "grad_norm": 2.4091241359710693, + "learning_rate": 5.025192835948996e-06, + "loss": 0.8689, + "step": 12707 + }, + { + "epoch": 1.0269298369664033, + "grad_norm": 2.6468141078948975, + "learning_rate": 5.024538481897319e-06, + "loss": 0.8165, + "step": 12708 + }, + { + "epoch": 1.0270106466797309, + "grad_norm": 2.6091692447662354, + "learning_rate": 5.0238841274253545e-06, + "loss": 0.7286, + "step": 12709 + }, + { + "epoch": 1.0270914563930584, + "grad_norm": 2.8415677547454834, + "learning_rate": 5.023229772544313e-06, + "loss": 0.9361, + "step": 12710 + }, + { + "epoch": 1.027172266106386, + "grad_norm": 2.6208531856536865, + "learning_rate": 5.022575417265402e-06, + "loss": 0.7295, + "step": 12711 + }, + { + "epoch": 1.0272530758197136, + "grad_norm": 2.9239227771759033, + "learning_rate": 5.0219210615998274e-06, + "loss": 0.8331, + "step": 12712 + }, + { + "epoch": 1.0273338855330412, + "grad_norm": 2.4374606609344482, + "learning_rate": 5.021266705558797e-06, + "loss": 0.7484, + "step": 12713 + }, + { + "epoch": 1.0274146952463685, + "grad_norm": 2.185107946395874, + "learning_rate": 5.020612349153521e-06, + "loss": 0.8039, + "step": 12714 + }, + { + "epoch": 1.027495504959696, + "grad_norm": 2.581763744354248, + "learning_rate": 5.019957992395204e-06, + "loss": 0.7838, + "step": 12715 + }, + { + "epoch": 1.0275763146730237, + "grad_norm": 2.4704651832580566, + "learning_rate": 5.0193036352950556e-06, + "loss": 0.8881, + "step": 12716 + }, + { + "epoch": 1.0276571243863513, + "grad_norm": 2.8464388847351074, + "learning_rate": 5.0186492778642815e-06, + "loss": 0.807, + "step": 12717 + }, + { + "epoch": 1.0277379340996788, + "grad_norm": 2.9068238735198975, + "learning_rate": 5.0179949201140905e-06, + "loss": 0.8626, + "step": 12718 + }, + { + "epoch": 1.0278187438130064, + "grad_norm": 2.8587801456451416, + "learning_rate": 5.0173405620556905e-06, + "loss": 0.892, + "step": 12719 + }, + { + "epoch": 1.0278995535263338, + "grad_norm": 2.780148983001709, + "learning_rate": 5.016686203700288e-06, + "loss": 0.851, + "step": 12720 + }, + { + "epoch": 1.0279803632396614, + "grad_norm": 2.440889596939087, + "learning_rate": 5.016031845059089e-06, + "loss": 0.8264, + "step": 12721 + }, + { + "epoch": 1.028061172952989, + "grad_norm": 2.604332447052002, + "learning_rate": 5.015377486143305e-06, + "loss": 0.8601, + "step": 12722 + }, + { + "epoch": 1.0281419826663165, + "grad_norm": 2.7778687477111816, + "learning_rate": 5.014723126964143e-06, + "loss": 0.7743, + "step": 12723 + }, + { + "epoch": 1.028222792379644, + "grad_norm": 2.172650098800659, + "learning_rate": 5.014068767532806e-06, + "loss": 0.8466, + "step": 12724 + }, + { + "epoch": 1.0283036020929717, + "grad_norm": 2.3984363079071045, + "learning_rate": 5.013414407860507e-06, + "loss": 0.8429, + "step": 12725 + }, + { + "epoch": 1.028384411806299, + "grad_norm": 2.6857895851135254, + "learning_rate": 5.01276004795845e-06, + "loss": 0.8363, + "step": 12726 + }, + { + "epoch": 1.0284652215196266, + "grad_norm": 2.552767276763916, + "learning_rate": 5.0121056878378475e-06, + "loss": 0.7717, + "step": 12727 + }, + { + "epoch": 1.0285460312329542, + "grad_norm": 2.4570114612579346, + "learning_rate": 5.011451327509901e-06, + "loss": 0.9366, + "step": 12728 + }, + { + "epoch": 1.0286268409462818, + "grad_norm": 2.271482467651367, + "learning_rate": 5.010796966985822e-06, + "loss": 0.8003, + "step": 12729 + }, + { + "epoch": 1.0287076506596093, + "grad_norm": 2.92912220954895, + "learning_rate": 5.010142606276816e-06, + "loss": 0.864, + "step": 12730 + }, + { + "epoch": 1.028788460372937, + "grad_norm": 2.7757370471954346, + "learning_rate": 5.009488245394092e-06, + "loss": 0.7798, + "step": 12731 + }, + { + "epoch": 1.0288692700862643, + "grad_norm": 2.60307240486145, + "learning_rate": 5.008833884348856e-06, + "loss": 0.7594, + "step": 12732 + }, + { + "epoch": 1.0289500797995919, + "grad_norm": 2.7053911685943604, + "learning_rate": 5.008179523152319e-06, + "loss": 0.8843, + "step": 12733 + }, + { + "epoch": 1.0290308895129194, + "grad_norm": 2.5982367992401123, + "learning_rate": 5.007525161815685e-06, + "loss": 0.8155, + "step": 12734 + }, + { + "epoch": 1.029111699226247, + "grad_norm": 2.5147147178649902, + "learning_rate": 5.006870800350163e-06, + "loss": 0.8066, + "step": 12735 + }, + { + "epoch": 1.0291925089395746, + "grad_norm": 2.2974839210510254, + "learning_rate": 5.0062164387669605e-06, + "loss": 0.8844, + "step": 12736 + }, + { + "epoch": 1.0292733186529022, + "grad_norm": 2.545964479446411, + "learning_rate": 5.005562077077287e-06, + "loss": 0.8175, + "step": 12737 + }, + { + "epoch": 1.0293541283662295, + "grad_norm": 2.3717331886291504, + "learning_rate": 5.004907715292346e-06, + "loss": 0.7335, + "step": 12738 + }, + { + "epoch": 1.029434938079557, + "grad_norm": 2.3097434043884277, + "learning_rate": 5.004253353423351e-06, + "loss": 0.8945, + "step": 12739 + }, + { + "epoch": 1.0295157477928847, + "grad_norm": 2.6539955139160156, + "learning_rate": 5.003598991481503e-06, + "loss": 0.8038, + "step": 12740 + }, + { + "epoch": 1.0295965575062123, + "grad_norm": 2.718130350112915, + "learning_rate": 5.0029446294780146e-06, + "loss": 0.813, + "step": 12741 + }, + { + "epoch": 1.0296773672195398, + "grad_norm": 2.3481106758117676, + "learning_rate": 5.0022902674240915e-06, + "loss": 0.7984, + "step": 12742 + }, + { + "epoch": 1.0297581769328674, + "grad_norm": 2.7545816898345947, + "learning_rate": 5.0016359053309415e-06, + "loss": 0.8325, + "step": 12743 + }, + { + "epoch": 1.0298389866461948, + "grad_norm": 2.639677047729492, + "learning_rate": 5.000981543209773e-06, + "loss": 0.914, + "step": 12744 + }, + { + "epoch": 1.0299197963595224, + "grad_norm": 2.8748130798339844, + "learning_rate": 5.000327181071793e-06, + "loss": 0.7105, + "step": 12745 + }, + { + "epoch": 1.03000060607285, + "grad_norm": 2.184514284133911, + "learning_rate": 4.9996728189282075e-06, + "loss": 0.8635, + "step": 12746 + }, + { + "epoch": 1.0300814157861775, + "grad_norm": 3.0235283374786377, + "learning_rate": 4.9990184567902275e-06, + "loss": 0.8444, + "step": 12747 + }, + { + "epoch": 1.030162225499505, + "grad_norm": 2.3344459533691406, + "learning_rate": 4.998364094669059e-06, + "loss": 0.8783, + "step": 12748 + }, + { + "epoch": 1.0302430352128327, + "grad_norm": 2.846672296524048, + "learning_rate": 4.997709732575909e-06, + "loss": 0.8181, + "step": 12749 + }, + { + "epoch": 1.03032384492616, + "grad_norm": 2.2562077045440674, + "learning_rate": 4.997055370521985e-06, + "loss": 0.8856, + "step": 12750 + }, + { + "epoch": 1.0304046546394876, + "grad_norm": 2.255314826965332, + "learning_rate": 4.996401008518499e-06, + "loss": 0.6853, + "step": 12751 + }, + { + "epoch": 1.0304854643528152, + "grad_norm": 3.0052249431610107, + "learning_rate": 4.995746646576651e-06, + "loss": 0.6795, + "step": 12752 + }, + { + "epoch": 1.0305662740661428, + "grad_norm": 3.18621826171875, + "learning_rate": 4.995092284707654e-06, + "loss": 0.7444, + "step": 12753 + }, + { + "epoch": 1.0306470837794703, + "grad_norm": 3.149110794067383, + "learning_rate": 4.9944379229227155e-06, + "loss": 0.7594, + "step": 12754 + }, + { + "epoch": 1.030727893492798, + "grad_norm": 2.6996846199035645, + "learning_rate": 4.993783561233041e-06, + "loss": 0.7711, + "step": 12755 + }, + { + "epoch": 1.0308087032061253, + "grad_norm": 2.3134026527404785, + "learning_rate": 4.993129199649838e-06, + "loss": 0.8269, + "step": 12756 + }, + { + "epoch": 1.0308895129194529, + "grad_norm": 2.3625435829162598, + "learning_rate": 4.992474838184318e-06, + "loss": 0.8723, + "step": 12757 + }, + { + "epoch": 1.0309703226327804, + "grad_norm": 2.7127509117126465, + "learning_rate": 4.991820476847683e-06, + "loss": 0.8032, + "step": 12758 + }, + { + "epoch": 1.031051132346108, + "grad_norm": 3.0470070838928223, + "learning_rate": 4.9911661156511445e-06, + "loss": 0.887, + "step": 12759 + }, + { + "epoch": 1.0311319420594356, + "grad_norm": 2.3634185791015625, + "learning_rate": 4.990511754605911e-06, + "loss": 0.7546, + "step": 12760 + }, + { + "epoch": 1.0312127517727632, + "grad_norm": 2.869631767272949, + "learning_rate": 4.989857393723187e-06, + "loss": 0.8403, + "step": 12761 + }, + { + "epoch": 1.0312935614860905, + "grad_norm": 2.5108728408813477, + "learning_rate": 4.989203033014179e-06, + "loss": 0.7941, + "step": 12762 + }, + { + "epoch": 1.031374371199418, + "grad_norm": 2.6491148471832275, + "learning_rate": 4.9885486724901e-06, + "loss": 0.811, + "step": 12763 + }, + { + "epoch": 1.0314551809127457, + "grad_norm": 2.439948797225952, + "learning_rate": 4.987894312162154e-06, + "loss": 0.8451, + "step": 12764 + }, + { + "epoch": 1.0315359906260733, + "grad_norm": 2.49680495262146, + "learning_rate": 4.987239952041549e-06, + "loss": 0.8132, + "step": 12765 + }, + { + "epoch": 1.0316168003394008, + "grad_norm": 2.9315829277038574, + "learning_rate": 4.9865855921394945e-06, + "loss": 0.7809, + "step": 12766 + }, + { + "epoch": 1.0316976100527284, + "grad_norm": 2.4637374877929688, + "learning_rate": 4.985931232467195e-06, + "loss": 0.8845, + "step": 12767 + }, + { + "epoch": 1.0317784197660558, + "grad_norm": 2.5532619953155518, + "learning_rate": 4.985276873035859e-06, + "loss": 0.8286, + "step": 12768 + }, + { + "epoch": 1.0318592294793834, + "grad_norm": 2.7578811645507812, + "learning_rate": 4.984622513856696e-06, + "loss": 0.8114, + "step": 12769 + }, + { + "epoch": 1.031940039192711, + "grad_norm": 2.925610065460205, + "learning_rate": 4.983968154940912e-06, + "loss": 0.7349, + "step": 12770 + }, + { + "epoch": 1.0320208489060385, + "grad_norm": 2.2802493572235107, + "learning_rate": 4.983313796299714e-06, + "loss": 0.8446, + "step": 12771 + }, + { + "epoch": 1.032101658619366, + "grad_norm": 2.626570224761963, + "learning_rate": 4.982659437944313e-06, + "loss": 0.8845, + "step": 12772 + }, + { + "epoch": 1.0321824683326937, + "grad_norm": 2.2684454917907715, + "learning_rate": 4.98200507988591e-06, + "loss": 0.8328, + "step": 12773 + }, + { + "epoch": 1.032263278046021, + "grad_norm": 2.4540622234344482, + "learning_rate": 4.981350722135719e-06, + "loss": 0.815, + "step": 12774 + }, + { + "epoch": 1.0323440877593486, + "grad_norm": 2.575643301010132, + "learning_rate": 4.980696364704945e-06, + "loss": 0.8865, + "step": 12775 + }, + { + "epoch": 1.0324248974726762, + "grad_norm": 2.8141043186187744, + "learning_rate": 4.980042007604797e-06, + "loss": 0.8462, + "step": 12776 + }, + { + "epoch": 1.0325057071860038, + "grad_norm": 2.6783711910247803, + "learning_rate": 4.979387650846481e-06, + "loss": 0.7909, + "step": 12777 + }, + { + "epoch": 1.0325865168993313, + "grad_norm": 2.9930763244628906, + "learning_rate": 4.978733294441203e-06, + "loss": 0.8268, + "step": 12778 + }, + { + "epoch": 1.032667326612659, + "grad_norm": 2.710965633392334, + "learning_rate": 4.978078938400174e-06, + "loss": 0.7757, + "step": 12779 + }, + { + "epoch": 1.0327481363259863, + "grad_norm": 3.494272232055664, + "learning_rate": 4.9774245827346e-06, + "loss": 0.8253, + "step": 12780 + }, + { + "epoch": 1.0328289460393139, + "grad_norm": 2.375830888748169, + "learning_rate": 4.9767702274556885e-06, + "loss": 0.899, + "step": 12781 + }, + { + "epoch": 1.0329097557526414, + "grad_norm": 2.467590093612671, + "learning_rate": 4.976115872574648e-06, + "loss": 0.8084, + "step": 12782 + }, + { + "epoch": 1.032990565465969, + "grad_norm": 2.582261800765991, + "learning_rate": 4.975461518102682e-06, + "loss": 0.8396, + "step": 12783 + }, + { + "epoch": 1.0330713751792966, + "grad_norm": 2.7683613300323486, + "learning_rate": 4.974807164051003e-06, + "loss": 0.9251, + "step": 12784 + }, + { + "epoch": 1.0331521848926242, + "grad_norm": 3.1756012439727783, + "learning_rate": 4.974152810430818e-06, + "loss": 0.8297, + "step": 12785 + }, + { + "epoch": 1.0332329946059517, + "grad_norm": 2.457958459854126, + "learning_rate": 4.973498457253332e-06, + "loss": 0.9918, + "step": 12786 + }, + { + "epoch": 1.033313804319279, + "grad_norm": 2.488844871520996, + "learning_rate": 4.972844104529753e-06, + "loss": 0.7827, + "step": 12787 + }, + { + "epoch": 1.0333946140326067, + "grad_norm": 2.1935784816741943, + "learning_rate": 4.972189752271291e-06, + "loss": 0.7634, + "step": 12788 + }, + { + "epoch": 1.0334754237459343, + "grad_norm": 2.20938777923584, + "learning_rate": 4.971535400489148e-06, + "loss": 0.8776, + "step": 12789 + }, + { + "epoch": 1.0335562334592618, + "grad_norm": 2.3523964881896973, + "learning_rate": 4.970881049194538e-06, + "loss": 0.7805, + "step": 12790 + }, + { + "epoch": 1.0336370431725894, + "grad_norm": 2.588494300842285, + "learning_rate": 4.970226698398666e-06, + "loss": 0.8473, + "step": 12791 + }, + { + "epoch": 1.0337178528859168, + "grad_norm": 2.520622730255127, + "learning_rate": 4.9695723481127374e-06, + "loss": 0.7837, + "step": 12792 + }, + { + "epoch": 1.0337986625992444, + "grad_norm": 2.6883108615875244, + "learning_rate": 4.968917998347961e-06, + "loss": 0.8244, + "step": 12793 + }, + { + "epoch": 1.033879472312572, + "grad_norm": 2.437628746032715, + "learning_rate": 4.968263649115546e-06, + "loss": 0.7811, + "step": 12794 + }, + { + "epoch": 1.0339602820258995, + "grad_norm": 2.5353872776031494, + "learning_rate": 4.967609300426697e-06, + "loss": 0.8087, + "step": 12795 + }, + { + "epoch": 1.034041091739227, + "grad_norm": 2.777150869369507, + "learning_rate": 4.966954952292623e-06, + "loss": 0.7576, + "step": 12796 + }, + { + "epoch": 1.0341219014525547, + "grad_norm": 3.088149309158325, + "learning_rate": 4.966300604724532e-06, + "loss": 0.7606, + "step": 12797 + }, + { + "epoch": 1.0342027111658822, + "grad_norm": 2.8645825386047363, + "learning_rate": 4.965646257733629e-06, + "loss": 0.8096, + "step": 12798 + }, + { + "epoch": 1.0342835208792096, + "grad_norm": 2.754312038421631, + "learning_rate": 4.964991911331122e-06, + "loss": 0.7841, + "step": 12799 + }, + { + "epoch": 1.0343643305925372, + "grad_norm": 2.337752342224121, + "learning_rate": 4.9643375655282214e-06, + "loss": 0.8053, + "step": 12800 + }, + { + "epoch": 1.0344451403058648, + "grad_norm": 2.6112146377563477, + "learning_rate": 4.963683220336133e-06, + "loss": 0.83, + "step": 12801 + }, + { + "epoch": 1.0345259500191923, + "grad_norm": 2.616032600402832, + "learning_rate": 4.963028875766062e-06, + "loss": 0.7731, + "step": 12802 + }, + { + "epoch": 1.03460675973252, + "grad_norm": 2.3240854740142822, + "learning_rate": 4.9623745318292175e-06, + "loss": 0.9083, + "step": 12803 + }, + { + "epoch": 1.0346875694458473, + "grad_norm": 2.695591926574707, + "learning_rate": 4.961720188536808e-06, + "loss": 0.8537, + "step": 12804 + }, + { + "epoch": 1.0347683791591749, + "grad_norm": 2.967604398727417, + "learning_rate": 4.961065845900038e-06, + "loss": 0.8413, + "step": 12805 + }, + { + "epoch": 1.0348491888725024, + "grad_norm": 2.7116377353668213, + "learning_rate": 4.960411503930117e-06, + "loss": 0.7494, + "step": 12806 + }, + { + "epoch": 1.03492999858583, + "grad_norm": 3.1584997177124023, + "learning_rate": 4.959757162638253e-06, + "loss": 0.7371, + "step": 12807 + }, + { + "epoch": 1.0350108082991576, + "grad_norm": 2.4165217876434326, + "learning_rate": 4.95910282203565e-06, + "loss": 0.7482, + "step": 12808 + }, + { + "epoch": 1.0350916180124852, + "grad_norm": 2.883049488067627, + "learning_rate": 4.958448482133516e-06, + "loss": 0.8778, + "step": 12809 + }, + { + "epoch": 1.0351724277258127, + "grad_norm": 2.670912981033325, + "learning_rate": 4.957794142943063e-06, + "loss": 0.8581, + "step": 12810 + }, + { + "epoch": 1.03525323743914, + "grad_norm": 2.74064302444458, + "learning_rate": 4.957139804475492e-06, + "loss": 0.843, + "step": 12811 + }, + { + "epoch": 1.0353340471524677, + "grad_norm": 2.5970287322998047, + "learning_rate": 4.956485466742014e-06, + "loss": 0.7506, + "step": 12812 + }, + { + "epoch": 1.0354148568657953, + "grad_norm": 2.4259068965911865, + "learning_rate": 4.955831129753835e-06, + "loss": 0.9008, + "step": 12813 + }, + { + "epoch": 1.0354956665791228, + "grad_norm": 2.599001407623291, + "learning_rate": 4.955176793522163e-06, + "loss": 0.7608, + "step": 12814 + }, + { + "epoch": 1.0355764762924504, + "grad_norm": 2.6436216831207275, + "learning_rate": 4.954522458058203e-06, + "loss": 0.8819, + "step": 12815 + }, + { + "epoch": 1.035657286005778, + "grad_norm": 2.8188560009002686, + "learning_rate": 4.953868123373167e-06, + "loss": 0.6734, + "step": 12816 + }, + { + "epoch": 1.0357380957191054, + "grad_norm": 3.691556692123413, + "learning_rate": 4.953213789478255e-06, + "loss": 0.7709, + "step": 12817 + }, + { + "epoch": 1.035818905432433, + "grad_norm": 2.5950491428375244, + "learning_rate": 4.95255945638468e-06, + "loss": 0.8796, + "step": 12818 + }, + { + "epoch": 1.0358997151457605, + "grad_norm": 2.524399995803833, + "learning_rate": 4.951905124103648e-06, + "loss": 0.8444, + "step": 12819 + }, + { + "epoch": 1.035980524859088, + "grad_norm": 2.501587390899658, + "learning_rate": 4.951250792646365e-06, + "loss": 0.947, + "step": 12820 + }, + { + "epoch": 1.0360613345724157, + "grad_norm": 2.551919937133789, + "learning_rate": 4.950596462024037e-06, + "loss": 0.8316, + "step": 12821 + }, + { + "epoch": 1.0361421442857432, + "grad_norm": 2.523668050765991, + "learning_rate": 4.9499421322478755e-06, + "loss": 0.7983, + "step": 12822 + }, + { + "epoch": 1.0362229539990706, + "grad_norm": 3.2756011486053467, + "learning_rate": 4.9492878033290826e-06, + "loss": 0.8154, + "step": 12823 + }, + { + "epoch": 1.0363037637123982, + "grad_norm": 2.44270396232605, + "learning_rate": 4.948633475278869e-06, + "loss": 0.8198, + "step": 12824 + }, + { + "epoch": 1.0363845734257258, + "grad_norm": 2.660534381866455, + "learning_rate": 4.94797914810844e-06, + "loss": 0.7953, + "step": 12825 + }, + { + "epoch": 1.0364653831390533, + "grad_norm": 2.6788415908813477, + "learning_rate": 4.947324821829002e-06, + "loss": 0.6867, + "step": 12826 + }, + { + "epoch": 1.036546192852381, + "grad_norm": 2.509154796600342, + "learning_rate": 4.946670496451762e-06, + "loss": 0.7959, + "step": 12827 + }, + { + "epoch": 1.0366270025657085, + "grad_norm": 2.484999895095825, + "learning_rate": 4.946016171987929e-06, + "loss": 0.837, + "step": 12828 + }, + { + "epoch": 1.0367078122790359, + "grad_norm": 2.3741610050201416, + "learning_rate": 4.945361848448711e-06, + "loss": 0.8531, + "step": 12829 + }, + { + "epoch": 1.0367886219923634, + "grad_norm": 2.699448585510254, + "learning_rate": 4.944707525845311e-06, + "loss": 0.9241, + "step": 12830 + }, + { + "epoch": 1.036869431705691, + "grad_norm": 2.320544958114624, + "learning_rate": 4.944053204188937e-06, + "loss": 0.868, + "step": 12831 + }, + { + "epoch": 1.0369502414190186, + "grad_norm": 2.888483762741089, + "learning_rate": 4.943398883490801e-06, + "loss": 0.8137, + "step": 12832 + }, + { + "epoch": 1.0370310511323462, + "grad_norm": 2.6746714115142822, + "learning_rate": 4.942744563762102e-06, + "loss": 0.8417, + "step": 12833 + }, + { + "epoch": 1.0371118608456737, + "grad_norm": 2.575489044189453, + "learning_rate": 4.942090245014052e-06, + "loss": 0.8192, + "step": 12834 + }, + { + "epoch": 1.037192670559001, + "grad_norm": 2.713181972503662, + "learning_rate": 4.941435927257857e-06, + "loss": 0.8827, + "step": 12835 + }, + { + "epoch": 1.0372734802723287, + "grad_norm": 2.8507280349731445, + "learning_rate": 4.940781610504724e-06, + "loss": 0.9301, + "step": 12836 + }, + { + "epoch": 1.0373542899856563, + "grad_norm": 2.50119686126709, + "learning_rate": 4.940127294765857e-06, + "loss": 0.9033, + "step": 12837 + }, + { + "epoch": 1.0374350996989838, + "grad_norm": 2.282935857772827, + "learning_rate": 4.939472980052468e-06, + "loss": 0.8336, + "step": 12838 + }, + { + "epoch": 1.0375159094123114, + "grad_norm": 2.280317544937134, + "learning_rate": 4.9388186663757605e-06, + "loss": 0.8113, + "step": 12839 + }, + { + "epoch": 1.037596719125639, + "grad_norm": 2.6849544048309326, + "learning_rate": 4.938164353746942e-06, + "loss": 0.8208, + "step": 12840 + }, + { + "epoch": 1.0376775288389664, + "grad_norm": 2.924696683883667, + "learning_rate": 4.937510042177219e-06, + "loss": 0.6863, + "step": 12841 + }, + { + "epoch": 1.037758338552294, + "grad_norm": 2.3179073333740234, + "learning_rate": 4.936855731677799e-06, + "loss": 0.7647, + "step": 12842 + }, + { + "epoch": 1.0378391482656215, + "grad_norm": 2.255683422088623, + "learning_rate": 4.936201422259886e-06, + "loss": 0.8274, + "step": 12843 + }, + { + "epoch": 1.037919957978949, + "grad_norm": 2.973609209060669, + "learning_rate": 4.935547113934692e-06, + "loss": 0.8438, + "step": 12844 + }, + { + "epoch": 1.0380007676922767, + "grad_norm": 2.8407390117645264, + "learning_rate": 4.93489280671342e-06, + "loss": 0.8347, + "step": 12845 + }, + { + "epoch": 1.0380815774056042, + "grad_norm": 2.5327024459838867, + "learning_rate": 4.9342385006072765e-06, + "loss": 0.7966, + "step": 12846 + }, + { + "epoch": 1.0381623871189316, + "grad_norm": 3.217491626739502, + "learning_rate": 4.93358419562747e-06, + "loss": 0.8251, + "step": 12847 + }, + { + "epoch": 1.0382431968322592, + "grad_norm": 2.3137214183807373, + "learning_rate": 4.932929891785205e-06, + "loss": 0.848, + "step": 12848 + }, + { + "epoch": 1.0383240065455868, + "grad_norm": 2.7785751819610596, + "learning_rate": 4.932275589091691e-06, + "loss": 0.7204, + "step": 12849 + }, + { + "epoch": 1.0384048162589143, + "grad_norm": 2.8221638202667236, + "learning_rate": 4.931621287558134e-06, + "loss": 0.7979, + "step": 12850 + }, + { + "epoch": 1.038485625972242, + "grad_norm": 2.9343366622924805, + "learning_rate": 4.930966987195738e-06, + "loss": 0.7742, + "step": 12851 + }, + { + "epoch": 1.0385664356855695, + "grad_norm": 2.5015809535980225, + "learning_rate": 4.930312688015711e-06, + "loss": 0.8307, + "step": 12852 + }, + { + "epoch": 1.0386472453988969, + "grad_norm": 3.0302183628082275, + "learning_rate": 4.929658390029262e-06, + "loss": 0.8124, + "step": 12853 + }, + { + "epoch": 1.0387280551122244, + "grad_norm": 3.35300874710083, + "learning_rate": 4.929004093247593e-06, + "loss": 0.7601, + "step": 12854 + }, + { + "epoch": 1.038808864825552, + "grad_norm": 3.1182520389556885, + "learning_rate": 4.928349797681914e-06, + "loss": 0.7175, + "step": 12855 + }, + { + "epoch": 1.0388896745388796, + "grad_norm": 2.6093690395355225, + "learning_rate": 4.92769550334343e-06, + "loss": 0.8113, + "step": 12856 + }, + { + "epoch": 1.0389704842522072, + "grad_norm": 2.6275503635406494, + "learning_rate": 4.92704121024335e-06, + "loss": 0.8617, + "step": 12857 + }, + { + "epoch": 1.0390512939655348, + "grad_norm": 2.651615619659424, + "learning_rate": 4.926386918392875e-06, + "loss": 0.7714, + "step": 12858 + }, + { + "epoch": 1.039132103678862, + "grad_norm": 2.784255027770996, + "learning_rate": 4.925732627803217e-06, + "loss": 0.7314, + "step": 12859 + }, + { + "epoch": 1.0392129133921897, + "grad_norm": 2.509617805480957, + "learning_rate": 4.925078338485581e-06, + "loss": 0.8124, + "step": 12860 + }, + { + "epoch": 1.0392937231055173, + "grad_norm": 3.1349759101867676, + "learning_rate": 4.924424050451172e-06, + "loss": 0.773, + "step": 12861 + }, + { + "epoch": 1.0393745328188448, + "grad_norm": 2.5523223876953125, + "learning_rate": 4.923769763711196e-06, + "loss": 0.7478, + "step": 12862 + }, + { + "epoch": 1.0394553425321724, + "grad_norm": 2.086899995803833, + "learning_rate": 4.923115478276863e-06, + "loss": 0.8137, + "step": 12863 + }, + { + "epoch": 1.0395361522455, + "grad_norm": 2.5579464435577393, + "learning_rate": 4.922461194159373e-06, + "loss": 0.7144, + "step": 12864 + }, + { + "epoch": 1.0396169619588274, + "grad_norm": 2.9754111766815186, + "learning_rate": 4.921806911369938e-06, + "loss": 0.8312, + "step": 12865 + }, + { + "epoch": 1.039697771672155, + "grad_norm": 2.61491060256958, + "learning_rate": 4.921152629919763e-06, + "loss": 0.8016, + "step": 12866 + }, + { + "epoch": 1.0397785813854825, + "grad_norm": 2.2346484661102295, + "learning_rate": 4.920498349820053e-06, + "loss": 0.8497, + "step": 12867 + }, + { + "epoch": 1.03985939109881, + "grad_norm": 2.6187667846679688, + "learning_rate": 4.919844071082014e-06, + "loss": 0.841, + "step": 12868 + }, + { + "epoch": 1.0399402008121377, + "grad_norm": 2.577465534210205, + "learning_rate": 4.919189793716857e-06, + "loss": 0.874, + "step": 12869 + }, + { + "epoch": 1.0400210105254653, + "grad_norm": 2.6797478199005127, + "learning_rate": 4.91853551773578e-06, + "loss": 0.8406, + "step": 12870 + }, + { + "epoch": 1.0401018202387926, + "grad_norm": 2.720130681991577, + "learning_rate": 4.917881243149994e-06, + "loss": 0.7676, + "step": 12871 + }, + { + "epoch": 1.0401826299521202, + "grad_norm": 2.9457404613494873, + "learning_rate": 4.9172269699707065e-06, + "loss": 0.8268, + "step": 12872 + }, + { + "epoch": 1.0402634396654478, + "grad_norm": 3.3807499408721924, + "learning_rate": 4.916572698209121e-06, + "loss": 0.8266, + "step": 12873 + }, + { + "epoch": 1.0403442493787753, + "grad_norm": 2.8594462871551514, + "learning_rate": 4.915918427876442e-06, + "loss": 0.8403, + "step": 12874 + }, + { + "epoch": 1.040425059092103, + "grad_norm": 2.592507839202881, + "learning_rate": 4.915264158983882e-06, + "loss": 0.8523, + "step": 12875 + }, + { + "epoch": 1.0405058688054305, + "grad_norm": 2.3449959754943848, + "learning_rate": 4.91460989154264e-06, + "loss": 0.8336, + "step": 12876 + }, + { + "epoch": 1.0405866785187579, + "grad_norm": 2.2409980297088623, + "learning_rate": 4.913955625563925e-06, + "loss": 0.8671, + "step": 12877 + }, + { + "epoch": 1.0406674882320854, + "grad_norm": 2.4352972507476807, + "learning_rate": 4.913301361058945e-06, + "loss": 0.8769, + "step": 12878 + }, + { + "epoch": 1.040748297945413, + "grad_norm": 2.753619909286499, + "learning_rate": 4.912647098038903e-06, + "loss": 0.7833, + "step": 12879 + }, + { + "epoch": 1.0408291076587406, + "grad_norm": 2.6027324199676514, + "learning_rate": 4.911992836515004e-06, + "loss": 0.8694, + "step": 12880 + }, + { + "epoch": 1.0409099173720682, + "grad_norm": 2.600783586502075, + "learning_rate": 4.91133857649846e-06, + "loss": 0.7395, + "step": 12881 + }, + { + "epoch": 1.0409907270853958, + "grad_norm": 2.7033865451812744, + "learning_rate": 4.91068431800047e-06, + "loss": 0.7953, + "step": 12882 + }, + { + "epoch": 1.041071536798723, + "grad_norm": 2.8208181858062744, + "learning_rate": 4.910030061032242e-06, + "loss": 0.8189, + "step": 12883 + }, + { + "epoch": 1.0411523465120507, + "grad_norm": 2.617600917816162, + "learning_rate": 4.9093758056049824e-06, + "loss": 0.844, + "step": 12884 + }, + { + "epoch": 1.0412331562253783, + "grad_norm": 2.503328800201416, + "learning_rate": 4.9087215517299e-06, + "loss": 0.8647, + "step": 12885 + }, + { + "epoch": 1.0413139659387058, + "grad_norm": 2.663407325744629, + "learning_rate": 4.908067299418194e-06, + "loss": 0.7519, + "step": 12886 + }, + { + "epoch": 1.0413947756520334, + "grad_norm": 2.375281572341919, + "learning_rate": 4.907413048681076e-06, + "loss": 0.8333, + "step": 12887 + }, + { + "epoch": 1.041475585365361, + "grad_norm": 2.865656614303589, + "learning_rate": 4.90675879952975e-06, + "loss": 0.8715, + "step": 12888 + }, + { + "epoch": 1.0415563950786884, + "grad_norm": 2.389338970184326, + "learning_rate": 4.906104551975421e-06, + "loss": 0.8152, + "step": 12889 + }, + { + "epoch": 1.041637204792016, + "grad_norm": 2.327669620513916, + "learning_rate": 4.905450306029293e-06, + "loss": 0.8006, + "step": 12890 + }, + { + "epoch": 1.0417180145053435, + "grad_norm": 2.764125347137451, + "learning_rate": 4.904796061702577e-06, + "loss": 0.8706, + "step": 12891 + }, + { + "epoch": 1.041798824218671, + "grad_norm": 2.936788320541382, + "learning_rate": 4.904141819006472e-06, + "loss": 0.8794, + "step": 12892 + }, + { + "epoch": 1.0418796339319987, + "grad_norm": 2.675888776779175, + "learning_rate": 4.903487577952188e-06, + "loss": 0.8843, + "step": 12893 + }, + { + "epoch": 1.0419604436453263, + "grad_norm": 2.5857391357421875, + "learning_rate": 4.902833338550931e-06, + "loss": 0.8065, + "step": 12894 + }, + { + "epoch": 1.0420412533586536, + "grad_norm": 2.7993967533111572, + "learning_rate": 4.902179100813903e-06, + "loss": 0.8547, + "step": 12895 + }, + { + "epoch": 1.0421220630719812, + "grad_norm": 2.5567092895507812, + "learning_rate": 4.901524864752311e-06, + "loss": 0.7886, + "step": 12896 + }, + { + "epoch": 1.0422028727853088, + "grad_norm": 2.6120681762695312, + "learning_rate": 4.900870630377364e-06, + "loss": 0.8189, + "step": 12897 + }, + { + "epoch": 1.0422836824986363, + "grad_norm": 2.7176361083984375, + "learning_rate": 4.900216397700262e-06, + "loss": 0.8435, + "step": 12898 + }, + { + "epoch": 1.042364492211964, + "grad_norm": 2.7765636444091797, + "learning_rate": 4.899562166732213e-06, + "loss": 0.8413, + "step": 12899 + }, + { + "epoch": 1.0424453019252915, + "grad_norm": 2.5095574855804443, + "learning_rate": 4.898907937484424e-06, + "loss": 0.7782, + "step": 12900 + }, + { + "epoch": 1.0425261116386189, + "grad_norm": 2.6688530445098877, + "learning_rate": 4.8982537099680975e-06, + "loss": 0.8314, + "step": 12901 + }, + { + "epoch": 1.0426069213519464, + "grad_norm": 2.7962560653686523, + "learning_rate": 4.897599484194439e-06, + "loss": 0.8385, + "step": 12902 + }, + { + "epoch": 1.042687731065274, + "grad_norm": 2.8190219402313232, + "learning_rate": 4.8969452601746564e-06, + "loss": 0.8413, + "step": 12903 + }, + { + "epoch": 1.0427685407786016, + "grad_norm": 3.0162580013275146, + "learning_rate": 4.896291037919953e-06, + "loss": 0.7438, + "step": 12904 + }, + { + "epoch": 1.0428493504919292, + "grad_norm": 2.2201485633850098, + "learning_rate": 4.895636817441534e-06, + "loss": 0.8379, + "step": 12905 + }, + { + "epoch": 1.0429301602052568, + "grad_norm": 2.630707025527954, + "learning_rate": 4.894982598750606e-06, + "loss": 0.7655, + "step": 12906 + }, + { + "epoch": 1.043010969918584, + "grad_norm": 2.468411922454834, + "learning_rate": 4.8943283818583716e-06, + "loss": 0.8834, + "step": 12907 + }, + { + "epoch": 1.0430917796319117, + "grad_norm": 2.922224521636963, + "learning_rate": 4.893674166776039e-06, + "loss": 0.7395, + "step": 12908 + }, + { + "epoch": 1.0431725893452393, + "grad_norm": 2.400918960571289, + "learning_rate": 4.893019953514811e-06, + "loss": 0.82, + "step": 12909 + }, + { + "epoch": 1.0432533990585668, + "grad_norm": 2.39370059967041, + "learning_rate": 4.892365742085895e-06, + "loss": 0.7025, + "step": 12910 + }, + { + "epoch": 1.0433342087718944, + "grad_norm": 2.4781503677368164, + "learning_rate": 4.891711532500494e-06, + "loss": 0.8592, + "step": 12911 + }, + { + "epoch": 1.043415018485222, + "grad_norm": 2.745285749435425, + "learning_rate": 4.891057324769813e-06, + "loss": 0.88, + "step": 12912 + }, + { + "epoch": 1.0434958281985494, + "grad_norm": 2.4261934757232666, + "learning_rate": 4.890403118905058e-06, + "loss": 0.7642, + "step": 12913 + }, + { + "epoch": 1.043576637911877, + "grad_norm": 2.5502376556396484, + "learning_rate": 4.889748914917435e-06, + "loss": 0.7788, + "step": 12914 + }, + { + "epoch": 1.0436574476252045, + "grad_norm": 3.543602705001831, + "learning_rate": 4.889094712818146e-06, + "loss": 0.841, + "step": 12915 + }, + { + "epoch": 1.043738257338532, + "grad_norm": 2.63369083404541, + "learning_rate": 4.8884405126184e-06, + "loss": 0.8548, + "step": 12916 + }, + { + "epoch": 1.0438190670518597, + "grad_norm": 2.512787342071533, + "learning_rate": 4.887786314329398e-06, + "loss": 0.8282, + "step": 12917 + }, + { + "epoch": 1.0438998767651873, + "grad_norm": 2.8143913745880127, + "learning_rate": 4.887132117962344e-06, + "loss": 0.8347, + "step": 12918 + }, + { + "epoch": 1.0439806864785148, + "grad_norm": 2.6553726196289062, + "learning_rate": 4.886477923528449e-06, + "loss": 0.7903, + "step": 12919 + }, + { + "epoch": 1.0440614961918422, + "grad_norm": 2.450526714324951, + "learning_rate": 4.8858237310389115e-06, + "loss": 0.9034, + "step": 12920 + }, + { + "epoch": 1.0441423059051698, + "grad_norm": 2.8087244033813477, + "learning_rate": 4.8851695405049395e-06, + "loss": 0.8621, + "step": 12921 + }, + { + "epoch": 1.0442231156184973, + "grad_norm": 2.3381245136260986, + "learning_rate": 4.884515351937737e-06, + "loss": 0.8188, + "step": 12922 + }, + { + "epoch": 1.044303925331825, + "grad_norm": 3.1490652561187744, + "learning_rate": 4.883861165348507e-06, + "loss": 0.7268, + "step": 12923 + }, + { + "epoch": 1.0443847350451525, + "grad_norm": 2.547015428543091, + "learning_rate": 4.8832069807484565e-06, + "loss": 0.7608, + "step": 12924 + }, + { + "epoch": 1.0444655447584799, + "grad_norm": 3.16630482673645, + "learning_rate": 4.88255279814879e-06, + "loss": 0.9203, + "step": 12925 + }, + { + "epoch": 1.0445463544718074, + "grad_norm": 2.810124397277832, + "learning_rate": 4.8818986175607114e-06, + "loss": 0.7702, + "step": 12926 + }, + { + "epoch": 1.044627164185135, + "grad_norm": 2.728691339492798, + "learning_rate": 4.881244438995424e-06, + "loss": 0.9028, + "step": 12927 + }, + { + "epoch": 1.0447079738984626, + "grad_norm": 2.4419636726379395, + "learning_rate": 4.880590262464136e-06, + "loss": 0.7741, + "step": 12928 + }, + { + "epoch": 1.0447887836117902, + "grad_norm": 2.686403274536133, + "learning_rate": 4.879936087978046e-06, + "loss": 0.786, + "step": 12929 + }, + { + "epoch": 1.0448695933251178, + "grad_norm": 2.2678635120391846, + "learning_rate": 4.879281915548363e-06, + "loss": 0.8956, + "step": 12930 + }, + { + "epoch": 1.0449504030384453, + "grad_norm": 2.7791199684143066, + "learning_rate": 4.878627745186291e-06, + "loss": 0.8339, + "step": 12931 + }, + { + "epoch": 1.0450312127517727, + "grad_norm": 2.401492118835449, + "learning_rate": 4.877973576903032e-06, + "loss": 0.9785, + "step": 12932 + }, + { + "epoch": 1.0451120224651003, + "grad_norm": 2.7468011379241943, + "learning_rate": 4.877319410709792e-06, + "loss": 0.8165, + "step": 12933 + }, + { + "epoch": 1.0451928321784278, + "grad_norm": 2.9621524810791016, + "learning_rate": 4.876665246617778e-06, + "loss": 0.7445, + "step": 12934 + }, + { + "epoch": 1.0452736418917554, + "grad_norm": 2.692807912826538, + "learning_rate": 4.876011084638188e-06, + "loss": 0.83, + "step": 12935 + }, + { + "epoch": 1.045354451605083, + "grad_norm": 2.5371296405792236, + "learning_rate": 4.875356924782231e-06, + "loss": 0.8248, + "step": 12936 + }, + { + "epoch": 1.0454352613184106, + "grad_norm": 2.3327078819274902, + "learning_rate": 4.874702767061109e-06, + "loss": 0.8322, + "step": 12937 + }, + { + "epoch": 1.045516071031738, + "grad_norm": 2.231863260269165, + "learning_rate": 4.874048611486029e-06, + "loss": 0.75, + "step": 12938 + }, + { + "epoch": 1.0455968807450655, + "grad_norm": 2.6811535358428955, + "learning_rate": 4.873394458068191e-06, + "loss": 0.8114, + "step": 12939 + }, + { + "epoch": 1.045677690458393, + "grad_norm": 2.4019546508789062, + "learning_rate": 4.872740306818801e-06, + "loss": 0.8005, + "step": 12940 + }, + { + "epoch": 1.0457585001717207, + "grad_norm": 2.4113852977752686, + "learning_rate": 4.872086157749065e-06, + "loss": 0.9376, + "step": 12941 + }, + { + "epoch": 1.0458393098850483, + "grad_norm": 2.4300756454467773, + "learning_rate": 4.871432010870184e-06, + "loss": 0.7849, + "step": 12942 + }, + { + "epoch": 1.0459201195983758, + "grad_norm": 2.8537158966064453, + "learning_rate": 4.870777866193362e-06, + "loss": 0.8639, + "step": 12943 + }, + { + "epoch": 1.0460009293117032, + "grad_norm": 2.6500296592712402, + "learning_rate": 4.870123723729809e-06, + "loss": 0.8609, + "step": 12944 + }, + { + "epoch": 1.0460817390250308, + "grad_norm": 2.716092824935913, + "learning_rate": 4.869469583490718e-06, + "loss": 0.7456, + "step": 12945 + }, + { + "epoch": 1.0461625487383583, + "grad_norm": 2.320584774017334, + "learning_rate": 4.868815445487301e-06, + "loss": 0.9307, + "step": 12946 + }, + { + "epoch": 1.046243358451686, + "grad_norm": 2.245326042175293, + "learning_rate": 4.868161309730761e-06, + "loss": 0.8419, + "step": 12947 + }, + { + "epoch": 1.0463241681650135, + "grad_norm": 2.4302146434783936, + "learning_rate": 4.867507176232299e-06, + "loss": 0.8272, + "step": 12948 + }, + { + "epoch": 1.046404977878341, + "grad_norm": 2.7676596641540527, + "learning_rate": 4.866853045003119e-06, + "loss": 0.8039, + "step": 12949 + }, + { + "epoch": 1.0464857875916684, + "grad_norm": 2.416679859161377, + "learning_rate": 4.8661989160544295e-06, + "loss": 0.7932, + "step": 12950 + }, + { + "epoch": 1.046566597304996, + "grad_norm": 2.6678290367126465, + "learning_rate": 4.865544789397427e-06, + "loss": 0.8741, + "step": 12951 + }, + { + "epoch": 1.0466474070183236, + "grad_norm": 2.4413371086120605, + "learning_rate": 4.864890665043319e-06, + "loss": 0.8358, + "step": 12952 + }, + { + "epoch": 1.0467282167316512, + "grad_norm": 2.3341283798217773, + "learning_rate": 4.86423654300331e-06, + "loss": 0.7614, + "step": 12953 + }, + { + "epoch": 1.0468090264449788, + "grad_norm": 3.1425116062164307, + "learning_rate": 4.863582423288602e-06, + "loss": 0.8203, + "step": 12954 + }, + { + "epoch": 1.0468898361583063, + "grad_norm": 2.804582118988037, + "learning_rate": 4.862928305910396e-06, + "loss": 0.8434, + "step": 12955 + }, + { + "epoch": 1.0469706458716337, + "grad_norm": 2.468287467956543, + "learning_rate": 4.862274190879903e-06, + "loss": 0.8474, + "step": 12956 + }, + { + "epoch": 1.0470514555849613, + "grad_norm": 2.8406238555908203, + "learning_rate": 4.861620078208317e-06, + "loss": 0.7911, + "step": 12957 + }, + { + "epoch": 1.0471322652982888, + "grad_norm": 2.4930367469787598, + "learning_rate": 4.8609659679068475e-06, + "loss": 0.9259, + "step": 12958 + }, + { + "epoch": 1.0472130750116164, + "grad_norm": 2.488539218902588, + "learning_rate": 4.8603118599866975e-06, + "loss": 0.8723, + "step": 12959 + }, + { + "epoch": 1.047293884724944, + "grad_norm": 2.456427574157715, + "learning_rate": 4.859657754459068e-06, + "loss": 0.8189, + "step": 12960 + }, + { + "epoch": 1.0473746944382716, + "grad_norm": 2.627305269241333, + "learning_rate": 4.859003651335163e-06, + "loss": 0.7841, + "step": 12961 + }, + { + "epoch": 1.047455504151599, + "grad_norm": 2.6415815353393555, + "learning_rate": 4.858349550626187e-06, + "loss": 0.7202, + "step": 12962 + }, + { + "epoch": 1.0475363138649265, + "grad_norm": 2.7195494174957275, + "learning_rate": 4.8576954523433415e-06, + "loss": 0.7869, + "step": 12963 + }, + { + "epoch": 1.047617123578254, + "grad_norm": 2.7807328701019287, + "learning_rate": 4.85704135649783e-06, + "loss": 0.7887, + "step": 12964 + }, + { + "epoch": 1.0476979332915817, + "grad_norm": 2.895347833633423, + "learning_rate": 4.856387263100854e-06, + "loss": 0.822, + "step": 12965 + }, + { + "epoch": 1.0477787430049093, + "grad_norm": 2.4659616947174072, + "learning_rate": 4.8557331721636235e-06, + "loss": 0.9038, + "step": 12966 + }, + { + "epoch": 1.0478595527182368, + "grad_norm": 2.6420671939849854, + "learning_rate": 4.8550790836973325e-06, + "loss": 0.8497, + "step": 12967 + }, + { + "epoch": 1.0479403624315642, + "grad_norm": 2.4824156761169434, + "learning_rate": 4.8544249977131895e-06, + "loss": 0.9595, + "step": 12968 + }, + { + "epoch": 1.0480211721448918, + "grad_norm": 2.7139363288879395, + "learning_rate": 4.853770914222396e-06, + "loss": 0.912, + "step": 12969 + }, + { + "epoch": 1.0481019818582193, + "grad_norm": 2.9319398403167725, + "learning_rate": 4.853116833236154e-06, + "loss": 0.8654, + "step": 12970 + }, + { + "epoch": 1.048182791571547, + "grad_norm": 2.4399850368499756, + "learning_rate": 4.852462754765665e-06, + "loss": 0.8604, + "step": 12971 + }, + { + "epoch": 1.0482636012848745, + "grad_norm": 2.647242307662964, + "learning_rate": 4.851808678822137e-06, + "loss": 0.716, + "step": 12972 + }, + { + "epoch": 1.048344410998202, + "grad_norm": 2.568131923675537, + "learning_rate": 4.851154605416769e-06, + "loss": 0.7549, + "step": 12973 + }, + { + "epoch": 1.0484252207115294, + "grad_norm": 2.3960366249084473, + "learning_rate": 4.850500534560764e-06, + "loss": 0.8559, + "step": 12974 + }, + { + "epoch": 1.048506030424857, + "grad_norm": 2.702261447906494, + "learning_rate": 4.849846466265326e-06, + "loss": 0.9374, + "step": 12975 + }, + { + "epoch": 1.0485868401381846, + "grad_norm": 2.4157943725585938, + "learning_rate": 4.849192400541654e-06, + "loss": 0.8004, + "step": 12976 + }, + { + "epoch": 1.0486676498515122, + "grad_norm": 2.5824832916259766, + "learning_rate": 4.848538337400954e-06, + "loss": 0.9134, + "step": 12977 + }, + { + "epoch": 1.0487484595648398, + "grad_norm": 2.6210951805114746, + "learning_rate": 4.847884276854429e-06, + "loss": 0.787, + "step": 12978 + }, + { + "epoch": 1.0488292692781673, + "grad_norm": 2.176785945892334, + "learning_rate": 4.8472302189132795e-06, + "loss": 0.943, + "step": 12979 + }, + { + "epoch": 1.0489100789914947, + "grad_norm": 3.3187334537506104, + "learning_rate": 4.846576163588708e-06, + "loss": 0.8931, + "step": 12980 + }, + { + "epoch": 1.0489908887048223, + "grad_norm": 2.90089750289917, + "learning_rate": 4.8459221108919185e-06, + "loss": 0.7279, + "step": 12981 + }, + { + "epoch": 1.0490716984181498, + "grad_norm": 2.589691162109375, + "learning_rate": 4.84526806083411e-06, + "loss": 0.8896, + "step": 12982 + }, + { + "epoch": 1.0491525081314774, + "grad_norm": 2.450986385345459, + "learning_rate": 4.844614013426489e-06, + "loss": 0.8449, + "step": 12983 + }, + { + "epoch": 1.049233317844805, + "grad_norm": 2.4798226356506348, + "learning_rate": 4.8439599686802566e-06, + "loss": 0.9056, + "step": 12984 + }, + { + "epoch": 1.0493141275581326, + "grad_norm": 2.879004716873169, + "learning_rate": 4.843305926606613e-06, + "loss": 0.8751, + "step": 12985 + }, + { + "epoch": 1.04939493727146, + "grad_norm": 2.4587759971618652, + "learning_rate": 4.8426518872167615e-06, + "loss": 0.9262, + "step": 12986 + }, + { + "epoch": 1.0494757469847875, + "grad_norm": 2.6293201446533203, + "learning_rate": 4.841997850521906e-06, + "loss": 0.8178, + "step": 12987 + }, + { + "epoch": 1.049556556698115, + "grad_norm": 2.8607261180877686, + "learning_rate": 4.841343816533245e-06, + "loss": 0.7812, + "step": 12988 + }, + { + "epoch": 1.0496373664114427, + "grad_norm": 2.734184741973877, + "learning_rate": 4.840689785261983e-06, + "loss": 0.8692, + "step": 12989 + }, + { + "epoch": 1.0497181761247703, + "grad_norm": 2.922410011291504, + "learning_rate": 4.840035756719322e-06, + "loss": 0.7434, + "step": 12990 + }, + { + "epoch": 1.0497989858380978, + "grad_norm": 2.6176400184631348, + "learning_rate": 4.839381730916463e-06, + "loss": 0.8148, + "step": 12991 + }, + { + "epoch": 1.0498797955514252, + "grad_norm": 2.8842031955718994, + "learning_rate": 4.838727707864607e-06, + "loss": 0.7583, + "step": 12992 + }, + { + "epoch": 1.0499606052647528, + "grad_norm": 2.6824910640716553, + "learning_rate": 4.838073687574959e-06, + "loss": 0.8783, + "step": 12993 + }, + { + "epoch": 1.0500414149780803, + "grad_norm": 2.2635958194732666, + "learning_rate": 4.837419670058719e-06, + "loss": 0.8861, + "step": 12994 + }, + { + "epoch": 1.050122224691408, + "grad_norm": 2.542126417160034, + "learning_rate": 4.836765655327088e-06, + "loss": 0.8662, + "step": 12995 + }, + { + "epoch": 1.0502030344047355, + "grad_norm": 2.507847309112549, + "learning_rate": 4.836111643391268e-06, + "loss": 0.8216, + "step": 12996 + }, + { + "epoch": 1.050283844118063, + "grad_norm": 2.5112709999084473, + "learning_rate": 4.8354576342624624e-06, + "loss": 0.8836, + "step": 12997 + }, + { + "epoch": 1.0503646538313904, + "grad_norm": 2.362645149230957, + "learning_rate": 4.834803627951869e-06, + "loss": 0.8943, + "step": 12998 + }, + { + "epoch": 1.050445463544718, + "grad_norm": 2.359710931777954, + "learning_rate": 4.834149624470693e-06, + "loss": 0.8865, + "step": 12999 + }, + { + "epoch": 1.0505262732580456, + "grad_norm": 4.162075042724609, + "learning_rate": 4.833495623830136e-06, + "loss": 0.7883, + "step": 13000 + }, + { + "epoch": 1.0505262732580456, + "eval_loss": 0.754727840423584, + "eval_runtime": 818.5065, + "eval_samples_per_second": 101.851, + "eval_steps_per_second": 12.732, + "step": 13000 + } + ], + "logging_steps": 1.0, + "max_steps": 24748, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1025855761417765e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}