{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9998204829009962, "eval_steps": 500, "global_step": 5570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003590341980073602, "grad_norm": 6.1201090812683105, "learning_rate": 1.7953321364452425e-06, "loss": 2.9926, "step": 10 }, { "epoch": 0.007180683960147204, "grad_norm": 4.0763983726501465, "learning_rate": 3.590664272890485e-06, "loss": 2.7866, "step": 20 }, { "epoch": 0.010771025940220806, "grad_norm": 3.385845184326172, "learning_rate": 5.385996409335727e-06, "loss": 2.0378, "step": 30 }, { "epoch": 0.014361367920294408, "grad_norm": 5.195909023284912, "learning_rate": 7.18132854578097e-06, "loss": 1.2251, "step": 40 }, { "epoch": 0.01795170990036801, "grad_norm": 2.219606637954712, "learning_rate": 8.976660682226211e-06, "loss": 0.6834, "step": 50 }, { "epoch": 0.02154205188044161, "grad_norm": 16.839906692504883, "learning_rate": 1.0771992818671454e-05, "loss": 0.4754, "step": 60 }, { "epoch": 0.025132393860515214, "grad_norm": 25.55668067932129, "learning_rate": 1.2567324955116697e-05, "loss": 0.3818, "step": 70 }, { "epoch": 0.028722735840588817, "grad_norm": 1.359479546546936, "learning_rate": 1.436265709156194e-05, "loss": 0.3797, "step": 80 }, { "epoch": 0.032313077820662416, "grad_norm": 1.2728756666183472, "learning_rate": 1.615798922800718e-05, "loss": 0.3712, "step": 90 }, { "epoch": 0.03590341980073602, "grad_norm": 1.9393813610076904, "learning_rate": 1.7953321364452423e-05, "loss": 0.3564, "step": 100 }, { "epoch": 0.03949376178080962, "grad_norm": 1.4643720388412476, "learning_rate": 1.9748653500897668e-05, "loss": 0.3438, "step": 110 }, { "epoch": 0.04308410376088322, "grad_norm": 1.4880571365356445, "learning_rate": 2.154398563734291e-05, "loss": 0.321, "step": 120 }, { "epoch": 0.046674445740956826, "grad_norm": 1.239957571029663, "learning_rate": 2.3339317773788153e-05, "loss": 0.319, "step": 130 }, { "epoch": 0.05026478772103043, "grad_norm": 1.2627112865447998, "learning_rate": 2.5134649910233395e-05, "loss": 0.3128, "step": 140 }, { "epoch": 0.05385512970110403, "grad_norm": 1.1520243883132935, "learning_rate": 2.6929982046678636e-05, "loss": 0.31, "step": 150 }, { "epoch": 0.05744547168117763, "grad_norm": 1.8554497957229614, "learning_rate": 2.872531418312388e-05, "loss": 0.3167, "step": 160 }, { "epoch": 0.061035813661251236, "grad_norm": 1.8501205444335938, "learning_rate": 3.0520646319569125e-05, "loss": 0.3177, "step": 170 }, { "epoch": 0.06462615564132483, "grad_norm": 1.249617099761963, "learning_rate": 3.231597845601436e-05, "loss": 0.3081, "step": 180 }, { "epoch": 0.06821649762139843, "grad_norm": 1.1702481508255005, "learning_rate": 3.411131059245961e-05, "loss": 0.3122, "step": 190 }, { "epoch": 0.07180683960147204, "grad_norm": 1.0217711925506592, "learning_rate": 3.5906642728904846e-05, "loss": 0.3047, "step": 200 }, { "epoch": 0.07539718158154564, "grad_norm": 0.8885968923568726, "learning_rate": 3.770197486535009e-05, "loss": 0.3042, "step": 210 }, { "epoch": 0.07898752356161924, "grad_norm": 1.0739161968231201, "learning_rate": 3.9497307001795335e-05, "loss": 0.2957, "step": 220 }, { "epoch": 0.08257786554169284, "grad_norm": 1.963072419166565, "learning_rate": 4.129263913824057e-05, "loss": 0.2967, "step": 230 }, { "epoch": 0.08616820752176645, "grad_norm": 0.9546407461166382, "learning_rate": 4.308797127468582e-05, "loss": 0.2834, "step": 240 }, { "epoch": 0.08975854950184005, "grad_norm": 1.82941734790802, "learning_rate": 4.488330341113106e-05, "loss": 0.2864, "step": 250 }, { "epoch": 0.09334889148191365, "grad_norm": 1.4494279623031616, "learning_rate": 4.667863554757631e-05, "loss": 0.2891, "step": 260 }, { "epoch": 0.09693923346198725, "grad_norm": 1.195784330368042, "learning_rate": 4.847396768402155e-05, "loss": 0.2904, "step": 270 }, { "epoch": 0.10052957544206086, "grad_norm": 1.0053528547286987, "learning_rate": 5.026929982046679e-05, "loss": 0.2804, "step": 280 }, { "epoch": 0.10411991742213446, "grad_norm": 4.148128986358643, "learning_rate": 5.2064631956912034e-05, "loss": 0.3165, "step": 290 }, { "epoch": 0.10771025940220806, "grad_norm": 1.4303346872329712, "learning_rate": 5.385996409335727e-05, "loss": 0.2747, "step": 300 }, { "epoch": 0.11130060138228166, "grad_norm": 1.128341794013977, "learning_rate": 5.565529622980251e-05, "loss": 0.297, "step": 310 }, { "epoch": 0.11489094336235527, "grad_norm": 1.2280890941619873, "learning_rate": 5.745062836624776e-05, "loss": 0.2821, "step": 320 }, { "epoch": 0.11848128534242887, "grad_norm": 1.4685401916503906, "learning_rate": 5.9245960502693e-05, "loss": 0.2815, "step": 330 }, { "epoch": 0.12207162732250247, "grad_norm": 2.4324777126312256, "learning_rate": 6.104129263913825e-05, "loss": 0.291, "step": 340 }, { "epoch": 0.12566196930257606, "grad_norm": 1.2875359058380127, "learning_rate": 6.283662477558349e-05, "loss": 0.2852, "step": 350 }, { "epoch": 0.12925231128264966, "grad_norm": 2.257322072982788, "learning_rate": 6.463195691202873e-05, "loss": 0.2804, "step": 360 }, { "epoch": 0.13284265326272326, "grad_norm": 1.3770567178726196, "learning_rate": 6.642728904847398e-05, "loss": 0.2873, "step": 370 }, { "epoch": 0.13643299524279687, "grad_norm": 1.6921864748001099, "learning_rate": 6.822262118491922e-05, "loss": 0.2974, "step": 380 }, { "epoch": 0.14002333722287047, "grad_norm": 0.9520618915557861, "learning_rate": 7.001795332136445e-05, "loss": 0.2939, "step": 390 }, { "epoch": 0.14361367920294407, "grad_norm": 0.812728762626648, "learning_rate": 7.181328545780969e-05, "loss": 0.2702, "step": 400 }, { "epoch": 0.14720402118301767, "grad_norm": 1.7924541234970093, "learning_rate": 7.360861759425493e-05, "loss": 0.2969, "step": 410 }, { "epoch": 0.15079436316309128, "grad_norm": 2.439558982849121, "learning_rate": 7.540394973070018e-05, "loss": 0.2893, "step": 420 }, { "epoch": 0.15438470514316488, "grad_norm": 0.8057828545570374, "learning_rate": 7.719928186714542e-05, "loss": 0.2808, "step": 430 }, { "epoch": 0.15797504712323848, "grad_norm": 1.2622177600860596, "learning_rate": 7.899461400359067e-05, "loss": 0.282, "step": 440 }, { "epoch": 0.16156538910331208, "grad_norm": 1.1095036268234253, "learning_rate": 8.078994614003591e-05, "loss": 0.2691, "step": 450 }, { "epoch": 0.1651557310833857, "grad_norm": 0.7493880987167358, "learning_rate": 8.258527827648115e-05, "loss": 0.2748, "step": 460 }, { "epoch": 0.1687460730634593, "grad_norm": 0.7199195623397827, "learning_rate": 8.43806104129264e-05, "loss": 0.2876, "step": 470 }, { "epoch": 0.1723364150435329, "grad_norm": 0.9257749915122986, "learning_rate": 8.617594254937164e-05, "loss": 0.2801, "step": 480 }, { "epoch": 0.1759267570236065, "grad_norm": 0.9219655394554138, "learning_rate": 8.797127468581689e-05, "loss": 0.2717, "step": 490 }, { "epoch": 0.1795170990036801, "grad_norm": 1.5916101932525635, "learning_rate": 8.976660682226212e-05, "loss": 0.275, "step": 500 }, { "epoch": 0.1831074409837537, "grad_norm": 1.1832544803619385, "learning_rate": 9.156193895870736e-05, "loss": 0.2774, "step": 510 }, { "epoch": 0.1866977829638273, "grad_norm": 0.8959478735923767, "learning_rate": 9.335727109515261e-05, "loss": 0.3003, "step": 520 }, { "epoch": 0.1902881249439009, "grad_norm": 0.7720569968223572, "learning_rate": 9.515260323159785e-05, "loss": 0.2771, "step": 530 }, { "epoch": 0.1938784669239745, "grad_norm": 1.263458013534546, "learning_rate": 9.69479353680431e-05, "loss": 0.2737, "step": 540 }, { "epoch": 0.1974688089040481, "grad_norm": 1.6316909790039062, "learning_rate": 9.874326750448834e-05, "loss": 0.2899, "step": 550 }, { "epoch": 0.2010591508841217, "grad_norm": 0.8948745131492615, "learning_rate": 9.999991163368873e-05, "loss": 0.2703, "step": 560 }, { "epoch": 0.20464949286419531, "grad_norm": 1.680094599723816, "learning_rate": 9.999834068573299e-05, "loss": 0.2828, "step": 570 }, { "epoch": 0.20823983484426892, "grad_norm": 1.1262023448944092, "learning_rate": 9.999480611298721e-05, "loss": 0.2651, "step": 580 }, { "epoch": 0.21183017682434252, "grad_norm": 1.2514327764511108, "learning_rate": 9.998930805426751e-05, "loss": 0.2828, "step": 590 }, { "epoch": 0.21542051880441612, "grad_norm": 0.8650713562965393, "learning_rate": 9.998184672550354e-05, "loss": 0.2641, "step": 600 }, { "epoch": 0.21901086078448972, "grad_norm": 1.3188605308532715, "learning_rate": 9.997242241973004e-05, "loss": 0.2791, "step": 610 }, { "epoch": 0.22260120276456333, "grad_norm": 1.677878737449646, "learning_rate": 9.996103550707527e-05, "loss": 0.2803, "step": 620 }, { "epoch": 0.22619154474463693, "grad_norm": 1.9317690134048462, "learning_rate": 9.994768643474658e-05, "loss": 0.263, "step": 630 }, { "epoch": 0.22978188672471053, "grad_norm": 0.8656140565872192, "learning_rate": 9.993237572701274e-05, "loss": 0.2723, "step": 640 }, { "epoch": 0.23337222870478413, "grad_norm": 0.7631008625030518, "learning_rate": 9.991510398518341e-05, "loss": 0.2958, "step": 650 }, { "epoch": 0.23696257068485774, "grad_norm": 0.6852580308914185, "learning_rate": 9.989587188758552e-05, "loss": 0.2612, "step": 660 }, { "epoch": 0.24055291266493134, "grad_norm": 0.6097802519798279, "learning_rate": 9.987468018953661e-05, "loss": 0.2607, "step": 670 }, { "epoch": 0.24414325464500494, "grad_norm": 1.254186987876892, "learning_rate": 9.985152972331516e-05, "loss": 0.2662, "step": 680 }, { "epoch": 0.24773359662507854, "grad_norm": 0.8868479132652283, "learning_rate": 9.982642139812793e-05, "loss": 0.2705, "step": 690 }, { "epoch": 0.2513239386051521, "grad_norm": 1.5867512226104736, "learning_rate": 9.979935620007424e-05, "loss": 0.2735, "step": 700 }, { "epoch": 0.25491428058522575, "grad_norm": 0.7384280562400818, "learning_rate": 9.977033519210725e-05, "loss": 0.2676, "step": 710 }, { "epoch": 0.2585046225652993, "grad_norm": 0.7617084383964539, "learning_rate": 9.97393595139922e-05, "loss": 0.2655, "step": 720 }, { "epoch": 0.26209496454537295, "grad_norm": 0.6475211381912231, "learning_rate": 9.970643038226166e-05, "loss": 0.2629, "step": 730 }, { "epoch": 0.26568530652544653, "grad_norm": 1.3059916496276855, "learning_rate": 9.967154909016772e-05, "loss": 0.2548, "step": 740 }, { "epoch": 0.26927564850552016, "grad_norm": 1.1138116121292114, "learning_rate": 9.963471700763123e-05, "loss": 0.2525, "step": 750 }, { "epoch": 0.27286599048559373, "grad_norm": 1.0550082921981812, "learning_rate": 9.959593558118803e-05, "loss": 0.2622, "step": 760 }, { "epoch": 0.27645633246566736, "grad_norm": 0.8017902374267578, "learning_rate": 9.955520633393205e-05, "loss": 0.2649, "step": 770 }, { "epoch": 0.28004667444574094, "grad_norm": 1.235143780708313, "learning_rate": 9.951253086545558e-05, "loss": 0.2747, "step": 780 }, { "epoch": 0.28363701642581457, "grad_norm": 0.7427018284797668, "learning_rate": 9.946791085178639e-05, "loss": 0.242, "step": 790 }, { "epoch": 0.28722735840588814, "grad_norm": 0.6972371935844421, "learning_rate": 9.942134804532193e-05, "loss": 0.2423, "step": 800 }, { "epoch": 0.2908177003859618, "grad_norm": 0.9071277976036072, "learning_rate": 9.937284427476052e-05, "loss": 0.2425, "step": 810 }, { "epoch": 0.29440804236603535, "grad_norm": 0.8345310688018799, "learning_rate": 9.932240144502952e-05, "loss": 0.2864, "step": 820 }, { "epoch": 0.297998384346109, "grad_norm": 1.1392581462860107, "learning_rate": 9.927002153721044e-05, "loss": 0.2366, "step": 830 }, { "epoch": 0.30158872632618255, "grad_norm": 0.9356684684753418, "learning_rate": 9.921570660846131e-05, "loss": 0.2464, "step": 840 }, { "epoch": 0.3051790683062562, "grad_norm": 1.5248229503631592, "learning_rate": 9.915945879193571e-05, "loss": 0.2809, "step": 850 }, { "epoch": 0.30876941028632976, "grad_norm": 1.0663933753967285, "learning_rate": 9.91012802966991e-05, "loss": 0.2779, "step": 860 }, { "epoch": 0.3123597522664034, "grad_norm": 0.9292562007904053, "learning_rate": 9.904117340764201e-05, "loss": 0.2465, "step": 870 }, { "epoch": 0.31595009424647696, "grad_norm": 0.7365911602973938, "learning_rate": 9.897914048539032e-05, "loss": 0.2688, "step": 880 }, { "epoch": 0.3195404362265506, "grad_norm": 1.0190156698226929, "learning_rate": 9.891518396621258e-05, "loss": 0.2471, "step": 890 }, { "epoch": 0.32313077820662417, "grad_norm": 1.167611837387085, "learning_rate": 9.884930636192426e-05, "loss": 0.2468, "step": 900 }, { "epoch": 0.3267211201866978, "grad_norm": 1.1509454250335693, "learning_rate": 9.878151025978918e-05, "loss": 0.2528, "step": 910 }, { "epoch": 0.3303114621667714, "grad_norm": 1.0654162168502808, "learning_rate": 9.871179832241781e-05, "loss": 0.2669, "step": 920 }, { "epoch": 0.333901804146845, "grad_norm": 0.9040902853012085, "learning_rate": 9.86401732876628e-05, "loss": 0.2513, "step": 930 }, { "epoch": 0.3374921461269186, "grad_norm": 2.8603482246398926, "learning_rate": 9.856663796851137e-05, "loss": 0.2526, "step": 940 }, { "epoch": 0.3410824881069922, "grad_norm": 0.7283102869987488, "learning_rate": 9.849119525297488e-05, "loss": 0.2565, "step": 950 }, { "epoch": 0.3446728300870658, "grad_norm": 1.1231544017791748, "learning_rate": 9.841384810397538e-05, "loss": 0.2591, "step": 960 }, { "epoch": 0.3482631720671394, "grad_norm": 1.3341351747512817, "learning_rate": 9.833459955922926e-05, "loss": 0.2426, "step": 970 }, { "epoch": 0.351853514047213, "grad_norm": 0.7382979393005371, "learning_rate": 9.825345273112796e-05, "loss": 0.2404, "step": 980 }, { "epoch": 0.3554438560272866, "grad_norm": 0.9196600914001465, "learning_rate": 9.817041080661571e-05, "loss": 0.269, "step": 990 }, { "epoch": 0.3590341980073602, "grad_norm": 4.254228115081787, "learning_rate": 9.808547704706437e-05, "loss": 0.2498, "step": 1000 }, { "epoch": 0.3626245399874338, "grad_norm": 0.6999326348304749, "learning_rate": 9.799865478814535e-05, "loss": 0.242, "step": 1010 }, { "epoch": 0.3662148819675074, "grad_norm": 1.5552287101745605, "learning_rate": 9.790994743969864e-05, "loss": 0.2663, "step": 1020 }, { "epoch": 0.36980522394758103, "grad_norm": 0.6971444487571716, "learning_rate": 9.781935848559878e-05, "loss": 0.2549, "step": 1030 }, { "epoch": 0.3733955659276546, "grad_norm": 1.180908441543579, "learning_rate": 9.772689148361817e-05, "loss": 0.2313, "step": 1040 }, { "epoch": 0.37698590790772823, "grad_norm": 0.633343517780304, "learning_rate": 9.763255006528731e-05, "loss": 0.2395, "step": 1050 }, { "epoch": 0.3805762498878018, "grad_norm": 0.9181081056594849, "learning_rate": 9.753633793575206e-05, "loss": 0.2512, "step": 1060 }, { "epoch": 0.38416659186787544, "grad_norm": 1.1254559755325317, "learning_rate": 9.743825887362832e-05, "loss": 0.2467, "step": 1070 }, { "epoch": 0.387756933847949, "grad_norm": 0.8145197629928589, "learning_rate": 9.733831673085344e-05, "loss": 0.2421, "step": 1080 }, { "epoch": 0.39134727582802264, "grad_norm": 0.5483050346374512, "learning_rate": 9.723651543253509e-05, "loss": 0.2578, "step": 1090 }, { "epoch": 0.3949376178080962, "grad_norm": 0.7891978621482849, "learning_rate": 9.713285897679699e-05, "loss": 0.2339, "step": 1100 }, { "epoch": 0.39852795978816985, "grad_norm": 0.6310613751411438, "learning_rate": 9.702735143462198e-05, "loss": 0.2379, "step": 1110 }, { "epoch": 0.4021183017682434, "grad_norm": 0.8631925582885742, "learning_rate": 9.691999694969208e-05, "loss": 0.2413, "step": 1120 }, { "epoch": 0.40570864374831705, "grad_norm": 0.7224175930023193, "learning_rate": 9.681079973822576e-05, "loss": 0.2343, "step": 1130 }, { "epoch": 0.40929898572839063, "grad_norm": 0.8189213871955872, "learning_rate": 9.669976408881238e-05, "loss": 0.2513, "step": 1140 }, { "epoch": 0.4128893277084642, "grad_norm": 0.8129417300224304, "learning_rate": 9.658689436224373e-05, "loss": 0.2547, "step": 1150 }, { "epoch": 0.41647966968853783, "grad_norm": 1.1440197229385376, "learning_rate": 9.647219499134277e-05, "loss": 0.2427, "step": 1160 }, { "epoch": 0.4200700116686114, "grad_norm": 0.9682267308235168, "learning_rate": 9.635567048078958e-05, "loss": 0.2411, "step": 1170 }, { "epoch": 0.42366035364868504, "grad_norm": 0.7513495683670044, "learning_rate": 9.623732540694437e-05, "loss": 0.252, "step": 1180 }, { "epoch": 0.4272506956287586, "grad_norm": 3.1498029232025146, "learning_rate": 9.61171644176678e-05, "loss": 0.2486, "step": 1190 }, { "epoch": 0.43084103760883224, "grad_norm": 0.6250784397125244, "learning_rate": 9.599519223213842e-05, "loss": 0.2459, "step": 1200 }, { "epoch": 0.4344313795889058, "grad_norm": 0.548052966594696, "learning_rate": 9.587141364066736e-05, "loss": 0.2334, "step": 1210 }, { "epoch": 0.43802172156897945, "grad_norm": 0.6549167037010193, "learning_rate": 9.574583350451016e-05, "loss": 0.2399, "step": 1220 }, { "epoch": 0.441612063549053, "grad_norm": 0.7177796363830566, "learning_rate": 9.561845675567586e-05, "loss": 0.2574, "step": 1230 }, { "epoch": 0.44520240552912665, "grad_norm": 1.0265281200408936, "learning_rate": 9.548928839673334e-05, "loss": 0.2285, "step": 1240 }, { "epoch": 0.44879274750920023, "grad_norm": 1.3221251964569092, "learning_rate": 9.535833350061473e-05, "loss": 0.2293, "step": 1250 }, { "epoch": 0.45238308948927386, "grad_norm": 0.9542430639266968, "learning_rate": 9.522559721041636e-05, "loss": 0.2367, "step": 1260 }, { "epoch": 0.45597343146934743, "grad_norm": 2.0089797973632812, "learning_rate": 9.509108473919662e-05, "loss": 0.2166, "step": 1270 }, { "epoch": 0.45956377344942106, "grad_norm": 1.2323672771453857, "learning_rate": 9.495480136977127e-05, "loss": 0.2253, "step": 1280 }, { "epoch": 0.46315411542949464, "grad_norm": 1.155745506286621, "learning_rate": 9.4816752454506e-05, "loss": 0.2236, "step": 1290 }, { "epoch": 0.46674445740956827, "grad_norm": 0.5866098403930664, "learning_rate": 9.46769434151062e-05, "loss": 0.2346, "step": 1300 }, { "epoch": 0.47033479938964184, "grad_norm": 0.8677975535392761, "learning_rate": 9.4535379742404e-05, "loss": 0.2229, "step": 1310 }, { "epoch": 0.4739251413697155, "grad_norm": 0.8805405497550964, "learning_rate": 9.439206699614263e-05, "loss": 0.2279, "step": 1320 }, { "epoch": 0.47751548334978905, "grad_norm": 0.5903385877609253, "learning_rate": 9.424701080475811e-05, "loss": 0.2454, "step": 1330 }, { "epoch": 0.4811058253298627, "grad_norm": 0.9364457726478577, "learning_rate": 9.410021686515815e-05, "loss": 0.2454, "step": 1340 }, { "epoch": 0.48469616730993625, "grad_norm": 1.4409586191177368, "learning_rate": 9.39516909424985e-05, "loss": 0.2417, "step": 1350 }, { "epoch": 0.4882865092900099, "grad_norm": 0.705747663974762, "learning_rate": 9.380143886995636e-05, "loss": 0.2253, "step": 1360 }, { "epoch": 0.49187685127008346, "grad_norm": 1.2557168006896973, "learning_rate": 9.364946654850148e-05, "loss": 0.2332, "step": 1370 }, { "epoch": 0.4954671932501571, "grad_norm": 1.4732472896575928, "learning_rate": 9.349577994666427e-05, "loss": 0.2202, "step": 1380 }, { "epoch": 0.49905753523023066, "grad_norm": 1.1212490797042847, "learning_rate": 9.33403851003015e-05, "loss": 0.2064, "step": 1390 }, { "epoch": 0.5026478772103042, "grad_norm": 0.825175404548645, "learning_rate": 9.31832881123591e-05, "loss": 0.2148, "step": 1400 }, { "epoch": 0.5062382191903779, "grad_norm": 0.8229523301124573, "learning_rate": 9.302449515263268e-05, "loss": 0.2307, "step": 1410 }, { "epoch": 0.5098285611704515, "grad_norm": 0.8145741820335388, "learning_rate": 9.286401245752501e-05, "loss": 0.2405, "step": 1420 }, { "epoch": 0.5134189031505251, "grad_norm": 0.7511823177337646, "learning_rate": 9.270184632980121e-05, "loss": 0.2311, "step": 1430 }, { "epoch": 0.5170092451305986, "grad_norm": 0.7575204968452454, "learning_rate": 9.253800313834127e-05, "loss": 0.2068, "step": 1440 }, { "epoch": 0.5205995871106723, "grad_norm": 0.6711773872375488, "learning_rate": 9.237248931788972e-05, "loss": 0.2336, "step": 1450 }, { "epoch": 0.5241899290907459, "grad_norm": 0.7057952880859375, "learning_rate": 9.220531136880314e-05, "loss": 0.2332, "step": 1460 }, { "epoch": 0.5277802710708195, "grad_norm": 0.7404478788375854, "learning_rate": 9.203647585679471e-05, "loss": 0.2204, "step": 1470 }, { "epoch": 0.5313706130508931, "grad_norm": 0.6271808743476868, "learning_rate": 9.186598941267642e-05, "loss": 0.207, "step": 1480 }, { "epoch": 0.5349609550309667, "grad_norm": 0.7089178562164307, "learning_rate": 9.169385873209863e-05, "loss": 0.2259, "step": 1490 }, { "epoch": 0.5385512970110403, "grad_norm": 0.949642539024353, "learning_rate": 9.152009057528714e-05, "loss": 0.229, "step": 1500 }, { "epoch": 0.5421416389911139, "grad_norm": 0.7554659247398376, "learning_rate": 9.134469176677762e-05, "loss": 0.2208, "step": 1510 }, { "epoch": 0.5457319809711875, "grad_norm": 0.713874340057373, "learning_rate": 9.116766919514765e-05, "loss": 0.2177, "step": 1520 }, { "epoch": 0.5493223229512612, "grad_norm": 0.6753556728363037, "learning_rate": 9.098902981274615e-05, "loss": 0.2202, "step": 1530 }, { "epoch": 0.5529126649313347, "grad_norm": 1.2491189241409302, "learning_rate": 9.080878063542035e-05, "loss": 0.2118, "step": 1540 }, { "epoch": 0.5565030069114083, "grad_norm": 0.6264563798904419, "learning_rate": 9.062692874224024e-05, "loss": 0.2211, "step": 1550 }, { "epoch": 0.5600933488914819, "grad_norm": 0.4661034941673279, "learning_rate": 9.044348127522054e-05, "loss": 0.2168, "step": 1560 }, { "epoch": 0.5636836908715556, "grad_norm": 0.6062325835227966, "learning_rate": 9.025844543904022e-05, "loss": 0.214, "step": 1570 }, { "epoch": 0.5672740328516291, "grad_norm": 0.6374778747558594, "learning_rate": 9.007182850075956e-05, "loss": 0.2083, "step": 1580 }, { "epoch": 0.5708643748317027, "grad_norm": 1.131443738937378, "learning_rate": 8.98836377895347e-05, "loss": 0.2005, "step": 1590 }, { "epoch": 0.5744547168117763, "grad_norm": 0.6167281866073608, "learning_rate": 8.969388069632987e-05, "loss": 0.2122, "step": 1600 }, { "epoch": 0.57804505879185, "grad_norm": 0.9362030625343323, "learning_rate": 8.950256467362699e-05, "loss": 0.2275, "step": 1610 }, { "epoch": 0.5816354007719235, "grad_norm": 0.9304684996604919, "learning_rate": 8.930969723513312e-05, "loss": 0.2027, "step": 1620 }, { "epoch": 0.5852257427519971, "grad_norm": 0.62895268201828, "learning_rate": 8.911528595548533e-05, "loss": 0.2266, "step": 1630 }, { "epoch": 0.5888160847320707, "grad_norm": 1.480999231338501, "learning_rate": 8.891933846995312e-05, "loss": 0.2052, "step": 1640 }, { "epoch": 0.5924064267121444, "grad_norm": 1.3081512451171875, "learning_rate": 8.872186247413874e-05, "loss": 0.212, "step": 1650 }, { "epoch": 0.595996768692218, "grad_norm": 2.765312671661377, "learning_rate": 8.852286572367476e-05, "loss": 0.2233, "step": 1660 }, { "epoch": 0.5995871106722915, "grad_norm": 1.2033319473266602, "learning_rate": 8.832235603391958e-05, "loss": 0.2199, "step": 1670 }, { "epoch": 0.6031774526523651, "grad_norm": 1.092360496520996, "learning_rate": 8.812034127965048e-05, "loss": 0.1994, "step": 1680 }, { "epoch": 0.6067677946324388, "grad_norm": 1.0622711181640625, "learning_rate": 8.791682939475438e-05, "loss": 0.2117, "step": 1690 }, { "epoch": 0.6103581366125124, "grad_norm": 0.722064733505249, "learning_rate": 8.771182837191613e-05, "loss": 0.2219, "step": 1700 }, { "epoch": 0.6139484785925859, "grad_norm": 0.602187991142273, "learning_rate": 8.750534626230475e-05, "loss": 0.2159, "step": 1710 }, { "epoch": 0.6175388205726595, "grad_norm": 0.7628340721130371, "learning_rate": 8.729739117525715e-05, "loss": 0.2088, "step": 1720 }, { "epoch": 0.6211291625527331, "grad_norm": 0.5262313485145569, "learning_rate": 8.708797127795963e-05, "loss": 0.2285, "step": 1730 }, { "epoch": 0.6247195045328068, "grad_norm": 0.6427643299102783, "learning_rate": 8.68770947951272e-05, "loss": 0.2094, "step": 1740 }, { "epoch": 0.6283098465128804, "grad_norm": 0.5874310731887817, "learning_rate": 8.666477000868046e-05, "loss": 0.2263, "step": 1750 }, { "epoch": 0.6319001884929539, "grad_norm": 0.561213493347168, "learning_rate": 8.645100525742042e-05, "loss": 0.2025, "step": 1760 }, { "epoch": 0.6354905304730275, "grad_norm": 0.7805958390235901, "learning_rate": 8.623580893670105e-05, "loss": 0.2171, "step": 1770 }, { "epoch": 0.6390808724531012, "grad_norm": 0.5806890726089478, "learning_rate": 8.601918949809937e-05, "loss": 0.2103, "step": 1780 }, { "epoch": 0.6426712144331748, "grad_norm": 0.581363320350647, "learning_rate": 8.580115544908374e-05, "loss": 0.2129, "step": 1790 }, { "epoch": 0.6462615564132483, "grad_norm": 0.4736599326133728, "learning_rate": 8.558171535267958e-05, "loss": 0.1993, "step": 1800 }, { "epoch": 0.6498518983933219, "grad_norm": 0.6482508778572083, "learning_rate": 8.536087782713318e-05, "loss": 0.193, "step": 1810 }, { "epoch": 0.6534422403733956, "grad_norm": 0.7920377850532532, "learning_rate": 8.513865154557315e-05, "loss": 0.1989, "step": 1820 }, { "epoch": 0.6570325823534692, "grad_norm": 0.7527133226394653, "learning_rate": 8.491504523566985e-05, "loss": 0.215, "step": 1830 }, { "epoch": 0.6606229243335427, "grad_norm": 0.8890761733055115, "learning_rate": 8.46900676792926e-05, "loss": 0.1972, "step": 1840 }, { "epoch": 0.6642132663136163, "grad_norm": 1.100785732269287, "learning_rate": 8.44637277121647e-05, "loss": 0.1958, "step": 1850 }, { "epoch": 0.66780360829369, "grad_norm": 0.6120195388793945, "learning_rate": 8.423603422351665e-05, "loss": 0.21, "step": 1860 }, { "epoch": 0.6713939502737636, "grad_norm": 0.9138973951339722, "learning_rate": 8.400699615573671e-05, "loss": 0.2144, "step": 1870 }, { "epoch": 0.6749842922538372, "grad_norm": 0.6855999827384949, "learning_rate": 8.377662250402e-05, "loss": 0.1949, "step": 1880 }, { "epoch": 0.6785746342339107, "grad_norm": 0.8468754291534424, "learning_rate": 8.354492231601505e-05, "loss": 0.207, "step": 1890 }, { "epoch": 0.6821649762139844, "grad_norm": 0.650043249130249, "learning_rate": 8.331190469146848e-05, "loss": 0.2029, "step": 1900 }, { "epoch": 0.685755318194058, "grad_norm": 0.7149790525436401, "learning_rate": 8.307757878186767e-05, "loss": 0.1891, "step": 1910 }, { "epoch": 0.6893456601741316, "grad_norm": 0.5650553703308105, "learning_rate": 8.284195379008137e-05, "loss": 0.2034, "step": 1920 }, { "epoch": 0.6929360021542051, "grad_norm": 0.8220282793045044, "learning_rate": 8.260503896999814e-05, "loss": 0.2004, "step": 1930 }, { "epoch": 0.6965263441342788, "grad_norm": 0.9552260041236877, "learning_rate": 8.236684362616307e-05, "loss": 0.2052, "step": 1940 }, { "epoch": 0.7001166861143524, "grad_norm": 0.643084704875946, "learning_rate": 8.212737711341223e-05, "loss": 0.2072, "step": 1950 }, { "epoch": 0.703707028094426, "grad_norm": 0.6681669354438782, "learning_rate": 8.188664883650537e-05, "loss": 0.1969, "step": 1960 }, { "epoch": 0.7072973700744996, "grad_norm": 1.1286799907684326, "learning_rate": 8.164466824975647e-05, "loss": 0.1964, "step": 1970 }, { "epoch": 0.7108877120545732, "grad_norm": 0.7001319527626038, "learning_rate": 8.14014448566625e-05, "loss": 0.1728, "step": 1980 }, { "epoch": 0.7144780540346468, "grad_norm": 0.8087079524993896, "learning_rate": 8.115698820953012e-05, "loss": 0.1879, "step": 1990 }, { "epoch": 0.7180683960147204, "grad_norm": 0.5888068079948425, "learning_rate": 8.091130790910065e-05, "loss": 0.2017, "step": 2000 }, { "epoch": 0.721658737994794, "grad_norm": 0.868241012096405, "learning_rate": 8.066441360417283e-05, "loss": 0.2002, "step": 2010 }, { "epoch": 0.7252490799748676, "grad_norm": 0.9173946976661682, "learning_rate": 8.041631499122399e-05, "loss": 0.1822, "step": 2020 }, { "epoch": 0.7288394219549412, "grad_norm": 0.7348050475120544, "learning_rate": 8.016702181402925e-05, "loss": 0.1822, "step": 2030 }, { "epoch": 0.7324297639350148, "grad_norm": 0.5974103808403015, "learning_rate": 7.991654386327877e-05, "loss": 0.1894, "step": 2040 }, { "epoch": 0.7360201059150884, "grad_norm": 1.2631843090057373, "learning_rate": 7.966489097619327e-05, "loss": 0.2005, "step": 2050 }, { "epoch": 0.7396104478951621, "grad_norm": 0.9306305050849915, "learning_rate": 7.941207303613773e-05, "loss": 0.2077, "step": 2060 }, { "epoch": 0.7432007898752356, "grad_norm": 0.6469571590423584, "learning_rate": 7.915809997223312e-05, "loss": 0.1893, "step": 2070 }, { "epoch": 0.7467911318553092, "grad_norm": 0.6804335713386536, "learning_rate": 7.89029817589665e-05, "loss": 0.1985, "step": 2080 }, { "epoch": 0.7503814738353828, "grad_norm": 0.6059459447860718, "learning_rate": 7.864672841579944e-05, "loss": 0.1856, "step": 2090 }, { "epoch": 0.7539718158154565, "grad_norm": 0.6755326390266418, "learning_rate": 7.838935000677419e-05, "loss": 0.1816, "step": 2100 }, { "epoch": 0.75756215779553, "grad_norm": 0.5813919901847839, "learning_rate": 7.813085664011873e-05, "loss": 0.1796, "step": 2110 }, { "epoch": 0.7611524997756036, "grad_norm": 0.9791029691696167, "learning_rate": 7.78712584678496e-05, "loss": 0.204, "step": 2120 }, { "epoch": 0.7647428417556772, "grad_norm": 0.6557776927947998, "learning_rate": 7.76105656853733e-05, "loss": 0.1897, "step": 2130 }, { "epoch": 0.7683331837357509, "grad_norm": 0.5696374177932739, "learning_rate": 7.73487885310858e-05, "loss": 0.1882, "step": 2140 }, { "epoch": 0.7719235257158245, "grad_norm": 0.704799473285675, "learning_rate": 7.708593728597046e-05, "loss": 0.186, "step": 2150 }, { "epoch": 0.775513867695898, "grad_norm": 0.9005138874053955, "learning_rate": 7.682202227319433e-05, "loss": 0.1938, "step": 2160 }, { "epoch": 0.7791042096759716, "grad_norm": 0.7679111957550049, "learning_rate": 7.655705385770258e-05, "loss": 0.182, "step": 2170 }, { "epoch": 0.7826945516560453, "grad_norm": 0.7027627229690552, "learning_rate": 7.629104244581156e-05, "loss": 0.1859, "step": 2180 }, { "epoch": 0.7862848936361189, "grad_norm": 0.8638216853141785, "learning_rate": 7.602399848480002e-05, "loss": 0.1945, "step": 2190 }, { "epoch": 0.7898752356161924, "grad_norm": 0.6846340894699097, "learning_rate": 7.575593246249885e-05, "loss": 0.1899, "step": 2200 }, { "epoch": 0.793465577596266, "grad_norm": 0.7671458721160889, "learning_rate": 7.548685490687919e-05, "loss": 0.1835, "step": 2210 }, { "epoch": 0.7970559195763397, "grad_norm": 1.7174897193908691, "learning_rate": 7.521677638563889e-05, "loss": 0.1742, "step": 2220 }, { "epoch": 0.8006462615564133, "grad_norm": 1.024430751800537, "learning_rate": 7.494570750578757e-05, "loss": 0.1827, "step": 2230 }, { "epoch": 0.8042366035364868, "grad_norm": 0.8393763303756714, "learning_rate": 7.467365891322995e-05, "loss": 0.1726, "step": 2240 }, { "epoch": 0.8078269455165604, "grad_norm": 3.184171438217163, "learning_rate": 7.440064129234783e-05, "loss": 0.1855, "step": 2250 }, { "epoch": 0.8114172874966341, "grad_norm": 0.7078256011009216, "learning_rate": 7.412666536558041e-05, "loss": 0.1783, "step": 2260 }, { "epoch": 0.8150076294767077, "grad_norm": 0.7265491485595703, "learning_rate": 7.385174189300323e-05, "loss": 0.19, "step": 2270 }, { "epoch": 0.8185979714567813, "grad_norm": 0.8136366605758667, "learning_rate": 7.35758816719055e-05, "loss": 0.1685, "step": 2280 }, { "epoch": 0.8221883134368548, "grad_norm": 1.0148855447769165, "learning_rate": 7.329909553636618e-05, "loss": 0.1781, "step": 2290 }, { "epoch": 0.8257786554169284, "grad_norm": 0.9568372964859009, "learning_rate": 7.302139435682831e-05, "loss": 0.1702, "step": 2300 }, { "epoch": 0.8293689973970021, "grad_norm": 1.8222324848175049, "learning_rate": 7.274278903967229e-05, "loss": 0.1823, "step": 2310 }, { "epoch": 0.8329593393770757, "grad_norm": 0.6024855375289917, "learning_rate": 7.246329052678736e-05, "loss": 0.1741, "step": 2320 }, { "epoch": 0.8365496813571492, "grad_norm": 0.9722542762756348, "learning_rate": 7.218290979514202e-05, "loss": 0.1757, "step": 2330 }, { "epoch": 0.8401400233372228, "grad_norm": 2.1216533184051514, "learning_rate": 7.190165785635273e-05, "loss": 0.1748, "step": 2340 }, { "epoch": 0.8437303653172965, "grad_norm": 0.6482483148574829, "learning_rate": 7.161954575625172e-05, "loss": 0.1799, "step": 2350 }, { "epoch": 0.8473207072973701, "grad_norm": 2.2838494777679443, "learning_rate": 7.133658457445291e-05, "loss": 0.1616, "step": 2360 }, { "epoch": 0.8509110492774437, "grad_norm": 0.6801573634147644, "learning_rate": 7.105278542391695e-05, "loss": 0.1806, "step": 2370 }, { "epoch": 0.8545013912575172, "grad_norm": 0.8442283272743225, "learning_rate": 7.076815945051465e-05, "loss": 0.1821, "step": 2380 }, { "epoch": 0.8580917332375909, "grad_norm": 1.1653680801391602, "learning_rate": 7.048271783258936e-05, "loss": 0.1773, "step": 2390 }, { "epoch": 0.8616820752176645, "grad_norm": 0.6987717151641846, "learning_rate": 7.019647178051779e-05, "loss": 0.1693, "step": 2400 }, { "epoch": 0.8652724171977381, "grad_norm": 0.6374627351760864, "learning_rate": 6.990943253626994e-05, "loss": 0.194, "step": 2410 }, { "epoch": 0.8688627591778116, "grad_norm": 0.6507960557937622, "learning_rate": 6.962161137296743e-05, "loss": 0.1568, "step": 2420 }, { "epoch": 0.8724531011578853, "grad_norm": 0.6699422597885132, "learning_rate": 6.933301959444082e-05, "loss": 0.1759, "step": 2430 }, { "epoch": 0.8760434431379589, "grad_norm": 0.48265889286994934, "learning_rate": 6.904366853478567e-05, "loss": 0.1735, "step": 2440 }, { "epoch": 0.8796337851180325, "grad_norm": 0.8710943460464478, "learning_rate": 6.875356955791735e-05, "loss": 0.1807, "step": 2450 }, { "epoch": 0.883224127098106, "grad_norm": 0.7356705069541931, "learning_rate": 6.846273405712483e-05, "loss": 0.1751, "step": 2460 }, { "epoch": 0.8868144690781797, "grad_norm": 0.6466989517211914, "learning_rate": 6.817117345462316e-05, "loss": 0.1599, "step": 2470 }, { "epoch": 0.8904048110582533, "grad_norm": 0.5134007334709167, "learning_rate": 6.787889920110488e-05, "loss": 0.1666, "step": 2480 }, { "epoch": 0.8939951530383269, "grad_norm": 0.471064954996109, "learning_rate": 6.75859227752903e-05, "loss": 0.1624, "step": 2490 }, { "epoch": 0.8975854950184005, "grad_norm": 0.606399655342102, "learning_rate": 6.729225568347677e-05, "loss": 0.1696, "step": 2500 }, { "epoch": 0.9011758369984741, "grad_norm": 0.6752104759216309, "learning_rate": 6.699790945908662e-05, "loss": 0.1607, "step": 2510 }, { "epoch": 0.9047661789785477, "grad_norm": 0.8237718939781189, "learning_rate": 6.670289566221437e-05, "loss": 0.1601, "step": 2520 }, { "epoch": 0.9083565209586213, "grad_norm": 0.7542670965194702, "learning_rate": 6.640722587917263e-05, "loss": 0.1608, "step": 2530 }, { "epoch": 0.9119468629386949, "grad_norm": 0.609646737575531, "learning_rate": 6.611091172203708e-05, "loss": 0.1586, "step": 2540 }, { "epoch": 0.9155372049187686, "grad_norm": 0.7793768644332886, "learning_rate": 6.581396482819038e-05, "loss": 0.1601, "step": 2550 }, { "epoch": 0.9191275468988421, "grad_norm": 0.9071997404098511, "learning_rate": 6.551639685986524e-05, "loss": 0.166, "step": 2560 }, { "epoch": 0.9227178888789157, "grad_norm": 1.0000146627426147, "learning_rate": 6.521821950368625e-05, "loss": 0.1702, "step": 2570 }, { "epoch": 0.9263082308589893, "grad_norm": 0.8889328241348267, "learning_rate": 6.491944447021102e-05, "loss": 0.1669, "step": 2580 }, { "epoch": 0.929898572839063, "grad_norm": 0.6329061985015869, "learning_rate": 6.462008349347022e-05, "loss": 0.1641, "step": 2590 }, { "epoch": 0.9334889148191365, "grad_norm": 0.7821244597434998, "learning_rate": 6.43201483305067e-05, "loss": 0.1643, "step": 2600 }, { "epoch": 0.9370792567992101, "grad_norm": 1.3463133573532104, "learning_rate": 6.401965076091382e-05, "loss": 0.1603, "step": 2610 }, { "epoch": 0.9406695987792837, "grad_norm": 2.534256935119629, "learning_rate": 6.371860258637278e-05, "loss": 0.1577, "step": 2620 }, { "epoch": 0.9442599407593574, "grad_norm": 0.9502484202384949, "learning_rate": 6.341701563018913e-05, "loss": 0.1529, "step": 2630 }, { "epoch": 0.947850282739431, "grad_norm": 0.5928242206573486, "learning_rate": 6.311490173682839e-05, "loss": 0.1633, "step": 2640 }, { "epoch": 0.9514406247195045, "grad_norm": 1.3390663862228394, "learning_rate": 6.281227277145093e-05, "loss": 0.1609, "step": 2650 }, { "epoch": 0.9550309666995781, "grad_norm": 0.8307391405105591, "learning_rate": 6.250914061944597e-05, "loss": 0.1654, "step": 2660 }, { "epoch": 0.9586213086796518, "grad_norm": 0.6453768610954285, "learning_rate": 6.220551718596477e-05, "loss": 0.1504, "step": 2670 }, { "epoch": 0.9622116506597254, "grad_norm": 0.9472678899765015, "learning_rate": 6.190141439545304e-05, "loss": 0.1441, "step": 2680 }, { "epoch": 0.9658019926397989, "grad_norm": 1.077405571937561, "learning_rate": 6.159684419118274e-05, "loss": 0.1574, "step": 2690 }, { "epoch": 0.9693923346198725, "grad_norm": 1.373565673828125, "learning_rate": 6.129181853478285e-05, "loss": 0.1557, "step": 2700 }, { "epoch": 0.9729826765999462, "grad_norm": 0.7159507274627686, "learning_rate": 6.0986349405769795e-05, "loss": 0.148, "step": 2710 }, { "epoch": 0.9765730185800198, "grad_norm": 0.7065421342849731, "learning_rate": 6.068044880107675e-05, "loss": 0.1481, "step": 2720 }, { "epoch": 0.9801633605600933, "grad_norm": 1.0575318336486816, "learning_rate": 6.0374128734582634e-05, "loss": 0.1546, "step": 2730 }, { "epoch": 0.9837537025401669, "grad_norm": 1.3331146240234375, "learning_rate": 6.006740123664022e-05, "loss": 0.1685, "step": 2740 }, { "epoch": 0.9873440445202406, "grad_norm": 0.712989091873169, "learning_rate": 5.976027835360366e-05, "loss": 0.1443, "step": 2750 }, { "epoch": 0.9909343865003142, "grad_norm": 0.9985840320587158, "learning_rate": 5.945277214735537e-05, "loss": 0.1381, "step": 2760 }, { "epoch": 0.9945247284803878, "grad_norm": 0.6109340786933899, "learning_rate": 5.914489469483234e-05, "loss": 0.1506, "step": 2770 }, { "epoch": 0.9981150704604613, "grad_norm": 0.5232493281364441, "learning_rate": 5.883665808755179e-05, "loss": 0.1527, "step": 2780 }, { "epoch": 1.001705412440535, "grad_norm": 1.120089054107666, "learning_rate": 5.852807443113635e-05, "loss": 0.1397, "step": 2790 }, { "epoch": 1.0052957544206085, "grad_norm": 0.9276136755943298, "learning_rate": 5.821915584483853e-05, "loss": 0.1155, "step": 2800 }, { "epoch": 1.008886096400682, "grad_norm": 0.6816973686218262, "learning_rate": 5.790991446106487e-05, "loss": 0.1111, "step": 2810 }, { "epoch": 1.0124764383807558, "grad_norm": 0.8138614296913147, "learning_rate": 5.7600362424899354e-05, "loss": 0.1107, "step": 2820 }, { "epoch": 1.0160667803608294, "grad_norm": 0.5443429350852966, "learning_rate": 5.729051189362649e-05, "loss": 0.1122, "step": 2830 }, { "epoch": 1.019657122340903, "grad_norm": 0.6204805970191956, "learning_rate": 5.698037503625379e-05, "loss": 0.1147, "step": 2840 }, { "epoch": 1.0232474643209766, "grad_norm": 0.5502025485038757, "learning_rate": 5.6669964033033905e-05, "loss": 0.1135, "step": 2850 }, { "epoch": 1.0268378063010501, "grad_norm": 0.6541283130645752, "learning_rate": 5.6359291074986244e-05, "loss": 0.1225, "step": 2860 }, { "epoch": 1.0304281482811237, "grad_norm": 0.6311090588569641, "learning_rate": 5.604836836341816e-05, "loss": 0.1063, "step": 2870 }, { "epoch": 1.0340184902611973, "grad_norm": 0.9657145738601685, "learning_rate": 5.573720810944575e-05, "loss": 0.1171, "step": 2880 }, { "epoch": 1.037608832241271, "grad_norm": 0.53743577003479, "learning_rate": 5.542582253351438e-05, "loss": 0.1128, "step": 2890 }, { "epoch": 1.0411991742213447, "grad_norm": 0.7501124739646912, "learning_rate": 5.511422386491858e-05, "loss": 0.1117, "step": 2900 }, { "epoch": 1.0447895162014182, "grad_norm": 0.7120064496994019, "learning_rate": 5.480242434132191e-05, "loss": 0.1049, "step": 2910 }, { "epoch": 1.0483798581814918, "grad_norm": 0.5755088329315186, "learning_rate": 5.4490436208276194e-05, "loss": 0.1047, "step": 2920 }, { "epoch": 1.0519702001615654, "grad_norm": 0.8773960471153259, "learning_rate": 5.4178271718740744e-05, "loss": 0.1119, "step": 2930 }, { "epoch": 1.055560542141639, "grad_norm": 0.5922686457633972, "learning_rate": 5.3865943132601e-05, "loss": 0.1092, "step": 2940 }, { "epoch": 1.0591508841217125, "grad_norm": 0.7486307621002197, "learning_rate": 5.355346271618715e-05, "loss": 0.1068, "step": 2950 }, { "epoch": 1.0627412261017861, "grad_norm": 0.8534032702445984, "learning_rate": 5.324084274179228e-05, "loss": 0.1072, "step": 2960 }, { "epoch": 1.0663315680818597, "grad_norm": 0.7270232439041138, "learning_rate": 5.292809548719049e-05, "loss": 0.1101, "step": 2970 }, { "epoch": 1.0699219100619335, "grad_norm": 0.5195777416229248, "learning_rate": 5.2615233235154616e-05, "loss": 0.1084, "step": 2980 }, { "epoch": 1.073512252042007, "grad_norm": 0.5684207081794739, "learning_rate": 5.230226827297395e-05, "loss": 0.1026, "step": 2990 }, { "epoch": 1.0771025940220806, "grad_norm": 1.3543568849563599, "learning_rate": 5.198921289197153e-05, "loss": 0.1026, "step": 3000 }, { "epoch": 1.0806929360021542, "grad_norm": 0.7514908313751221, "learning_rate": 5.167607938702154e-05, "loss": 0.1085, "step": 3010 }, { "epoch": 1.0842832779822278, "grad_norm": 0.6683730483055115, "learning_rate": 5.136288005606631e-05, "loss": 0.1012, "step": 3020 }, { "epoch": 1.0878736199623014, "grad_norm": 0.5652278065681458, "learning_rate": 5.1049627199633496e-05, "loss": 0.119, "step": 3030 }, { "epoch": 1.091463961942375, "grad_norm": 0.7017742395401001, "learning_rate": 5.073633312035287e-05, "loss": 0.1057, "step": 3040 }, { "epoch": 1.0950543039224485, "grad_norm": 0.5066478848457336, "learning_rate": 5.042301012247317e-05, "loss": 0.1127, "step": 3050 }, { "epoch": 1.0986446459025223, "grad_norm": 0.535321056842804, "learning_rate": 5.010967051137887e-05, "loss": 0.1102, "step": 3060 }, { "epoch": 1.1022349878825959, "grad_norm": 0.6270662546157837, "learning_rate": 4.979632659310695e-05, "loss": 0.1008, "step": 3070 }, { "epoch": 1.1058253298626695, "grad_norm": 0.748859703540802, "learning_rate": 4.9482990673863485e-05, "loss": 0.0995, "step": 3080 }, { "epoch": 1.109415671842743, "grad_norm": 0.500746488571167, "learning_rate": 4.916967505954046e-05, "loss": 0.1056, "step": 3090 }, { "epoch": 1.1130060138228166, "grad_norm": 0.5748748183250427, "learning_rate": 4.885639205523239e-05, "loss": 0.106, "step": 3100 }, { "epoch": 1.1165963558028902, "grad_norm": 0.593147337436676, "learning_rate": 4.854315396475304e-05, "loss": 0.1086, "step": 3110 }, { "epoch": 1.1201866977829638, "grad_norm": 0.6119722127914429, "learning_rate": 4.822997309015226e-05, "loss": 0.1035, "step": 3120 }, { "epoch": 1.1237770397630373, "grad_norm": 0.5296047925949097, "learning_rate": 4.7916861731232846e-05, "loss": 0.1083, "step": 3130 }, { "epoch": 1.127367381743111, "grad_norm": 0.7060047388076782, "learning_rate": 4.7603832185067416e-05, "loss": 0.1, "step": 3140 }, { "epoch": 1.1309577237231847, "grad_norm": 0.4993881583213806, "learning_rate": 4.729089674551547e-05, "loss": 0.1057, "step": 3150 }, { "epoch": 1.1345480657032583, "grad_norm": 0.7866911888122559, "learning_rate": 4.697806770274062e-05, "loss": 0.0997, "step": 3160 }, { "epoch": 1.1381384076833319, "grad_norm": 0.642524242401123, "learning_rate": 4.6665357342727865e-05, "loss": 0.1051, "step": 3170 }, { "epoch": 1.1417287496634054, "grad_norm": 0.5228136777877808, "learning_rate": 4.6352777946801094e-05, "loss": 0.1002, "step": 3180 }, { "epoch": 1.145319091643479, "grad_norm": 0.9493293762207031, "learning_rate": 4.604034179114067e-05, "loss": 0.1019, "step": 3190 }, { "epoch": 1.1489094336235526, "grad_norm": 0.5647363662719727, "learning_rate": 4.5728061146301476e-05, "loss": 0.0915, "step": 3200 }, { "epoch": 1.1524997756036262, "grad_norm": 0.6017284989356995, "learning_rate": 4.5415948276730805e-05, "loss": 0.1098, "step": 3210 }, { "epoch": 1.1560901175837, "grad_norm": 0.46670928597450256, "learning_rate": 4.5104015440286826e-05, "loss": 0.1056, "step": 3220 }, { "epoch": 1.1596804595637735, "grad_norm": 0.6661453247070312, "learning_rate": 4.479227488775707e-05, "loss": 0.0964, "step": 3230 }, { "epoch": 1.163270801543847, "grad_norm": 0.642352819442749, "learning_rate": 4.4480738862377444e-05, "loss": 0.0907, "step": 3240 }, { "epoch": 1.1668611435239207, "grad_norm": 2.4927215576171875, "learning_rate": 4.4169419599351186e-05, "loss": 0.0969, "step": 3250 }, { "epoch": 1.1704514855039942, "grad_norm": 0.5965277552604675, "learning_rate": 4.3858329325368536e-05, "loss": 0.0921, "step": 3260 }, { "epoch": 1.1740418274840678, "grad_norm": 0.503105103969574, "learning_rate": 4.354748025812639e-05, "loss": 0.0918, "step": 3270 }, { "epoch": 1.1776321694641414, "grad_norm": 2.0070412158966064, "learning_rate": 4.323688460584864e-05, "loss": 0.1008, "step": 3280 }, { "epoch": 1.181222511444215, "grad_norm": 0.5921032428741455, "learning_rate": 4.292655456680651e-05, "loss": 0.0992, "step": 3290 }, { "epoch": 1.1848128534242885, "grad_norm": 0.7106916308403015, "learning_rate": 4.261650232883965e-05, "loss": 0.0998, "step": 3300 }, { "epoch": 1.1884031954043623, "grad_norm": 0.7483718395233154, "learning_rate": 4.230674006887734e-05, "loss": 0.1007, "step": 3310 }, { "epoch": 1.191993537384436, "grad_norm": 0.5854814648628235, "learning_rate": 4.199727995246041e-05, "loss": 0.1001, "step": 3320 }, { "epoch": 1.1955838793645095, "grad_norm": 1.022163987159729, "learning_rate": 4.1688134133263285e-05, "loss": 0.0989, "step": 3330 }, { "epoch": 1.199174221344583, "grad_norm": 0.6698512434959412, "learning_rate": 4.1379314752616784e-05, "loss": 0.0929, "step": 3340 }, { "epoch": 1.2027645633246566, "grad_norm": 0.8445412516593933, "learning_rate": 4.107083393903126e-05, "loss": 0.0865, "step": 3350 }, { "epoch": 1.2063549053047302, "grad_norm": 0.9410879611968994, "learning_rate": 4.076270380772021e-05, "loss": 0.0942, "step": 3360 }, { "epoch": 1.2099452472848038, "grad_norm": 0.4104284346103668, "learning_rate": 4.04549364601245e-05, "loss": 0.0957, "step": 3370 }, { "epoch": 1.2135355892648776, "grad_norm": 0.8418083786964417, "learning_rate": 4.014754398343716e-05, "loss": 0.0925, "step": 3380 }, { "epoch": 1.2171259312449512, "grad_norm": 0.5773093700408936, "learning_rate": 3.984053845012858e-05, "loss": 0.0921, "step": 3390 }, { "epoch": 1.2207162732250247, "grad_norm": 1.2288339138031006, "learning_rate": 3.953393191747239e-05, "loss": 0.089, "step": 3400 }, { "epoch": 1.2243066152050983, "grad_norm": 0.5901492238044739, "learning_rate": 3.9227736427071995e-05, "loss": 0.0903, "step": 3410 }, { "epoch": 1.2278969571851719, "grad_norm": 0.6220996379852295, "learning_rate": 3.892196400438755e-05, "loss": 0.0958, "step": 3420 }, { "epoch": 1.2314872991652455, "grad_norm": 0.6737645864486694, "learning_rate": 3.8616626658263825e-05, "loss": 0.0892, "step": 3430 }, { "epoch": 1.235077641145319, "grad_norm": 0.5661391019821167, "learning_rate": 3.831173638045839e-05, "loss": 0.0888, "step": 3440 }, { "epoch": 1.2386679831253926, "grad_norm": 0.7712500095367432, "learning_rate": 3.800730514517077e-05, "loss": 0.0859, "step": 3450 }, { "epoch": 1.2422583251054662, "grad_norm": 0.7590687274932861, "learning_rate": 3.770334490857217e-05, "loss": 0.0868, "step": 3460 }, { "epoch": 1.24584866708554, "grad_norm": 0.5650063753128052, "learning_rate": 3.7399867608335895e-05, "loss": 0.0974, "step": 3470 }, { "epoch": 1.2494390090656136, "grad_norm": 0.8975266218185425, "learning_rate": 3.709688516316844e-05, "loss": 0.095, "step": 3480 }, { "epoch": 1.2530293510456871, "grad_norm": 0.5311192274093628, "learning_rate": 3.679440947234152e-05, "loss": 0.0925, "step": 3490 }, { "epoch": 1.2566196930257607, "grad_norm": 1.0144147872924805, "learning_rate": 3.649245241522468e-05, "loss": 0.0903, "step": 3500 }, { "epoch": 1.2602100350058343, "grad_norm": 0.6833083629608154, "learning_rate": 3.619102585081872e-05, "loss": 0.0929, "step": 3510 }, { "epoch": 1.2638003769859079, "grad_norm": 0.6380596160888672, "learning_rate": 3.589014161728999e-05, "loss": 0.0787, "step": 3520 }, { "epoch": 1.2673907189659814, "grad_norm": 0.7181170582771301, "learning_rate": 3.558981153150542e-05, "loss": 0.0859, "step": 3530 }, { "epoch": 1.2709810609460552, "grad_norm": 0.6842727661132812, "learning_rate": 3.529004738856853e-05, "loss": 0.0823, "step": 3540 }, { "epoch": 1.2745714029261288, "grad_norm": 1.5806798934936523, "learning_rate": 3.4990860961356044e-05, "loss": 0.085, "step": 3550 }, { "epoch": 1.2781617449062024, "grad_norm": 0.6149685978889465, "learning_rate": 3.4692264000055594e-05, "loss": 0.0818, "step": 3560 }, { "epoch": 1.281752086886276, "grad_norm": 0.797741174697876, "learning_rate": 3.4394268231704266e-05, "loss": 0.0787, "step": 3570 }, { "epoch": 1.2853424288663495, "grad_norm": 0.5583544373512268, "learning_rate": 3.4096885359728036e-05, "loss": 0.0879, "step": 3580 }, { "epoch": 1.288932770846423, "grad_norm": 1.2549068927764893, "learning_rate": 3.380012706348209e-05, "loss": 0.085, "step": 3590 }, { "epoch": 1.2925231128264967, "grad_norm": 0.56533282995224, "learning_rate": 3.350400499779214e-05, "loss": 0.0932, "step": 3600 }, { "epoch": 1.2961134548065703, "grad_norm": 0.9718196392059326, "learning_rate": 3.32085307924967e-05, "loss": 0.0901, "step": 3610 }, { "epoch": 1.2997037967866438, "grad_norm": 0.6769024133682251, "learning_rate": 3.2913716051990394e-05, "loss": 0.0845, "step": 3620 }, { "epoch": 1.3032941387667174, "grad_norm": 1.1620076894760132, "learning_rate": 3.261957235476813e-05, "loss": 0.0831, "step": 3630 }, { "epoch": 1.3068844807467912, "grad_norm": 0.5092564225196838, "learning_rate": 3.232611125297035e-05, "loss": 0.0804, "step": 3640 }, { "epoch": 1.3104748227268648, "grad_norm": 0.42432501912117004, "learning_rate": 3.2033344271929476e-05, "loss": 0.0866, "step": 3650 }, { "epoch": 1.3140651647069383, "grad_norm": 0.5998629331588745, "learning_rate": 3.17412829097171e-05, "loss": 0.0865, "step": 3660 }, { "epoch": 1.317655506687012, "grad_norm": 0.5421279072761536, "learning_rate": 3.144993863669251e-05, "loss": 0.0849, "step": 3670 }, { "epoch": 1.3212458486670855, "grad_norm": 0.6406755447387695, "learning_rate": 3.115932289505213e-05, "loss": 0.0814, "step": 3680 }, { "epoch": 1.324836190647159, "grad_norm": 0.9076423048973083, "learning_rate": 3.086944709838028e-05, "loss": 0.0898, "step": 3690 }, { "epoch": 1.3284265326272329, "grad_norm": 0.7807140350341797, "learning_rate": 3.0580322631200756e-05, "loss": 0.0828, "step": 3700 }, { "epoch": 1.3320168746073064, "grad_norm": 0.6127801537513733, "learning_rate": 3.029196084852981e-05, "loss": 0.08, "step": 3710 }, { "epoch": 1.33560721658738, "grad_norm": 0.6226149797439575, "learning_rate": 3.000437307543017e-05, "loss": 0.0774, "step": 3720 }, { "epoch": 1.3391975585674536, "grad_norm": 0.4141993820667267, "learning_rate": 2.9717570606566287e-05, "loss": 0.0817, "step": 3730 }, { "epoch": 1.3427879005475272, "grad_norm": 0.6416285634040833, "learning_rate": 2.943156470576073e-05, "loss": 0.0792, "step": 3740 }, { "epoch": 1.3463782425276007, "grad_norm": 0.6912229657173157, "learning_rate": 2.914636660555178e-05, "loss": 0.0743, "step": 3750 }, { "epoch": 1.3499685845076743, "grad_norm": 0.8113506436347961, "learning_rate": 2.886198750675233e-05, "loss": 0.0843, "step": 3760 }, { "epoch": 1.353558926487748, "grad_norm": 0.6693570613861084, "learning_rate": 2.8578438578010053e-05, "loss": 0.0718, "step": 3770 }, { "epoch": 1.3571492684678215, "grad_norm": 0.6286030411720276, "learning_rate": 2.8295730955368573e-05, "loss": 0.0821, "step": 3780 }, { "epoch": 1.360739610447895, "grad_norm": 0.5432600975036621, "learning_rate": 2.8013875741830264e-05, "loss": 0.0779, "step": 3790 }, { "epoch": 1.3643299524279688, "grad_norm": 0.5628815293312073, "learning_rate": 2.7732884006920225e-05, "loss": 0.076, "step": 3800 }, { "epoch": 1.3679202944080424, "grad_norm": 0.761500895023346, "learning_rate": 2.745276678625141e-05, "loss": 0.0869, "step": 3810 }, { "epoch": 1.371510636388116, "grad_norm": 0.5888515710830688, "learning_rate": 2.717353508109125e-05, "loss": 0.0812, "step": 3820 }, { "epoch": 1.3751009783681896, "grad_norm": 0.5477086305618286, "learning_rate": 2.6895199857929643e-05, "loss": 0.0772, "step": 3830 }, { "epoch": 1.3786913203482631, "grad_norm": 0.5078212022781372, "learning_rate": 2.6617772048048284e-05, "loss": 0.0707, "step": 3840 }, { "epoch": 1.3822816623283367, "grad_norm": 0.5893701910972595, "learning_rate": 2.634126254709125e-05, "loss": 0.081, "step": 3850 }, { "epoch": 1.3858720043084105, "grad_norm": 0.9726279973983765, "learning_rate": 2.6065682214637123e-05, "loss": 0.0868, "step": 3860 }, { "epoch": 1.389462346288484, "grad_norm": 0.5375906229019165, "learning_rate": 2.5791041873772513e-05, "loss": 0.0754, "step": 3870 }, { "epoch": 1.3930526882685577, "grad_norm": 0.5937024354934692, "learning_rate": 2.5517352310667053e-05, "loss": 0.07, "step": 3880 }, { "epoch": 1.3966430302486312, "grad_norm": 0.5695418119430542, "learning_rate": 2.524462427414967e-05, "loss": 0.0712, "step": 3890 }, { "epoch": 1.4002333722287048, "grad_norm": 0.6219804883003235, "learning_rate": 2.497286847528646e-05, "loss": 0.0771, "step": 3900 }, { "epoch": 1.4038237142087784, "grad_norm": 0.7533654570579529, "learning_rate": 2.4702095586960085e-05, "loss": 0.073, "step": 3910 }, { "epoch": 1.407414056188852, "grad_norm": 0.5750814080238342, "learning_rate": 2.443231624345061e-05, "loss": 0.0753, "step": 3920 }, { "epoch": 1.4110043981689255, "grad_norm": 0.5853593349456787, "learning_rate": 2.416354104001779e-05, "loss": 0.0754, "step": 3930 }, { "epoch": 1.414594740148999, "grad_norm": 0.4552966356277466, "learning_rate": 2.389578053248493e-05, "loss": 0.0753, "step": 3940 }, { "epoch": 1.4181850821290727, "grad_norm": 0.718437671661377, "learning_rate": 2.362904523682447e-05, "loss": 0.0758, "step": 3950 }, { "epoch": 1.4217754241091463, "grad_norm": 0.7326009273529053, "learning_rate": 2.3363345628744832e-05, "loss": 0.0756, "step": 3960 }, { "epoch": 1.42536576608922, "grad_norm": 0.9607858657836914, "learning_rate": 2.3098692143279066e-05, "loss": 0.0719, "step": 3970 }, { "epoch": 1.4289561080692936, "grad_norm": 0.7754957675933838, "learning_rate": 2.283509517437496e-05, "loss": 0.0717, "step": 3980 }, { "epoch": 1.4325464500493672, "grad_norm": 0.8900684714317322, "learning_rate": 2.2572565074486972e-05, "loss": 0.0757, "step": 3990 }, { "epoch": 1.4361367920294408, "grad_norm": 0.6538607478141785, "learning_rate": 2.2311112154169507e-05, "loss": 0.0709, "step": 4000 }, { "epoch": 1.4397271340095144, "grad_norm": 0.6442373991012573, "learning_rate": 2.2050746681672056e-05, "loss": 0.0736, "step": 4010 }, { "epoch": 1.443317475989588, "grad_norm": 0.9824745655059814, "learning_rate": 2.179147888253584e-05, "loss": 0.0741, "step": 4020 }, { "epoch": 1.4469078179696617, "grad_norm": 0.6084447503089905, "learning_rate": 2.1533318939192394e-05, "loss": 0.0675, "step": 4030 }, { "epoch": 1.4504981599497353, "grad_norm": 0.6071482300758362, "learning_rate": 2.127627699056345e-05, "loss": 0.0721, "step": 4040 }, { "epoch": 1.4540885019298089, "grad_norm": 0.5101909637451172, "learning_rate": 2.102036313166289e-05, "loss": 0.0691, "step": 4050 }, { "epoch": 1.4576788439098824, "grad_norm": 0.5907676815986633, "learning_rate": 2.076558741320016e-05, "loss": 0.0624, "step": 4060 }, { "epoch": 1.461269185889956, "grad_norm": 0.7201829552650452, "learning_rate": 2.0511959841185713e-05, "loss": 0.0749, "step": 4070 }, { "epoch": 1.4648595278700296, "grad_norm": 0.5254886150360107, "learning_rate": 2.0259490376537865e-05, "loss": 0.078, "step": 4080 }, { "epoch": 1.4684498698501032, "grad_norm": 0.4855566620826721, "learning_rate": 2.0008188934691614e-05, "loss": 0.0727, "step": 4090 }, { "epoch": 1.4720402118301767, "grad_norm": 0.68084716796875, "learning_rate": 1.975806538520937e-05, "loss": 0.0679, "step": 4100 }, { "epoch": 1.4756305538102503, "grad_norm": 0.5893229842185974, "learning_rate": 1.9509129551393145e-05, "loss": 0.0709, "step": 4110 }, { "epoch": 1.479220895790324, "grad_norm": 0.5513525605201721, "learning_rate": 1.9261391209898912e-05, "loss": 0.0664, "step": 4120 }, { "epoch": 1.4828112377703977, "grad_norm": 0.45056793093681335, "learning_rate": 1.9014860090352476e-05, "loss": 0.0635, "step": 4130 }, { "epoch": 1.4864015797504713, "grad_norm": 0.6190094947814941, "learning_rate": 1.8769545874967566e-05, "loss": 0.0693, "step": 4140 }, { "epoch": 1.4899919217305448, "grad_norm": 0.6586858034133911, "learning_rate": 1.852545819816539e-05, "loss": 0.0652, "step": 4150 }, { "epoch": 1.4935822637106184, "grad_norm": 0.9752713441848755, "learning_rate": 1.8282606646196353e-05, "loss": 0.0744, "step": 4160 }, { "epoch": 1.497172605690692, "grad_norm": 0.6681696176528931, "learning_rate": 1.8041000756763493e-05, "loss": 0.0671, "step": 4170 }, { "epoch": 1.5007629476707658, "grad_norm": 0.5906854867935181, "learning_rate": 1.7800650018648024e-05, "loss": 0.0736, "step": 4180 }, { "epoch": 1.5043532896508394, "grad_norm": 0.6534956097602844, "learning_rate": 1.7561563871336545e-05, "loss": 0.0674, "step": 4190 }, { "epoch": 1.507943631630913, "grad_norm": 0.5932891964912415, "learning_rate": 1.732375170465041e-05, "loss": 0.0672, "step": 4200 }, { "epoch": 1.5115339736109865, "grad_norm": 0.504921019077301, "learning_rate": 1.7087222858376834e-05, "loss": 0.07, "step": 4210 }, { "epoch": 1.51512431559106, "grad_norm": 0.6252205967903137, "learning_rate": 1.6851986621902265e-05, "loss": 0.0637, "step": 4220 }, { "epoch": 1.5187146575711337, "grad_norm": 0.47223180532455444, "learning_rate": 1.6618052233847404e-05, "loss": 0.0697, "step": 4230 }, { "epoch": 1.5223049995512072, "grad_norm": 0.4429969787597656, "learning_rate": 1.6385428881704405e-05, "loss": 0.0664, "step": 4240 }, { "epoch": 1.5258953415312808, "grad_norm": 0.44724294543266296, "learning_rate": 1.6154125701476092e-05, "loss": 0.0642, "step": 4250 }, { "epoch": 1.5294856835113544, "grad_norm": 0.49648982286453247, "learning_rate": 1.59241517773171e-05, "loss": 0.0616, "step": 4260 }, { "epoch": 1.533076025491428, "grad_norm": 0.3683583736419678, "learning_rate": 1.5695516141177142e-05, "loss": 0.0631, "step": 4270 }, { "epoch": 1.5366663674715015, "grad_norm": 0.7180688977241516, "learning_rate": 1.546822777244627e-05, "loss": 0.0658, "step": 4280 }, { "epoch": 1.5402567094515751, "grad_norm": 0.6510112881660461, "learning_rate": 1.5242295597602225e-05, "loss": 0.0624, "step": 4290 }, { "epoch": 1.543847051431649, "grad_norm": 0.6626403331756592, "learning_rate": 1.5017728489859862e-05, "loss": 0.0596, "step": 4300 }, { "epoch": 1.5474373934117225, "grad_norm": 0.7510163187980652, "learning_rate": 1.4794535268822673e-05, "loss": 0.0666, "step": 4310 }, { "epoch": 1.551027735391796, "grad_norm": 0.48777294158935547, "learning_rate": 1.4572724700136386e-05, "loss": 0.0623, "step": 4320 }, { "epoch": 1.5546180773718696, "grad_norm": 0.6740663647651672, "learning_rate": 1.4352305495144736e-05, "loss": 0.0699, "step": 4330 }, { "epoch": 1.5582084193519434, "grad_norm": 0.513523519039154, "learning_rate": 1.4133286310547294e-05, "loss": 0.0686, "step": 4340 }, { "epoch": 1.561798761332017, "grad_norm": 0.689508318901062, "learning_rate": 1.3915675748059537e-05, "loss": 0.0643, "step": 4350 }, { "epoch": 1.5653891033120906, "grad_norm": 0.7558987736701965, "learning_rate": 1.3699482354074989e-05, "loss": 0.0638, "step": 4360 }, { "epoch": 1.5689794452921642, "grad_norm": 1.4819414615631104, "learning_rate": 1.3484714619329574e-05, "loss": 0.0579, "step": 4370 }, { "epoch": 1.5725697872722377, "grad_norm": 0.45672255754470825, "learning_rate": 1.3271380978568187e-05, "loss": 0.0597, "step": 4380 }, { "epoch": 1.5761601292523113, "grad_norm": 0.7070518136024475, "learning_rate": 1.3059489810213371e-05, "loss": 0.0653, "step": 4390 }, { "epoch": 1.5797504712323849, "grad_norm": 0.4744075536727905, "learning_rate": 1.2849049436036326e-05, "loss": 0.0609, "step": 4400 }, { "epoch": 1.5833408132124585, "grad_norm": 0.5028963088989258, "learning_rate": 1.2640068120830035e-05, "loss": 0.0614, "step": 4410 }, { "epoch": 1.586931155192532, "grad_norm": 1.222612977027893, "learning_rate": 1.24325540720847e-05, "loss": 0.058, "step": 4420 }, { "epoch": 1.5905214971726056, "grad_norm": 0.4024209976196289, "learning_rate": 1.2226515439665392e-05, "loss": 0.0599, "step": 4430 }, { "epoch": 1.5941118391526792, "grad_norm": 0.5114520788192749, "learning_rate": 1.2021960315491975e-05, "loss": 0.0525, "step": 4440 }, { "epoch": 1.5977021811327528, "grad_norm": 0.6782193779945374, "learning_rate": 1.1818896733221318e-05, "loss": 0.0605, "step": 4450 }, { "epoch": 1.6012925231128265, "grad_norm": 0.4370103180408478, "learning_rate": 1.1617332667931763e-05, "loss": 0.0569, "step": 4460 }, { "epoch": 1.6048828650929001, "grad_norm": 0.5159808993339539, "learning_rate": 1.1417276035809926e-05, "loss": 0.0583, "step": 4470 }, { "epoch": 1.6084732070729737, "grad_norm": 0.45791277289390564, "learning_rate": 1.1218734693839794e-05, "loss": 0.0639, "step": 4480 }, { "epoch": 1.6120635490530473, "grad_norm": 0.6834966540336609, "learning_rate": 1.1021716439494156e-05, "loss": 0.0626, "step": 4490 }, { "epoch": 1.615653891033121, "grad_norm": 0.4611278176307678, "learning_rate": 1.0826229010428369e-05, "loss": 0.056, "step": 4500 }, { "epoch": 1.6192442330131946, "grad_norm": 0.6188788414001465, "learning_rate": 1.0632280084176444e-05, "loss": 0.0578, "step": 4510 }, { "epoch": 1.6228345749932682, "grad_norm": 0.5647935271263123, "learning_rate": 1.0439877277849575e-05, "loss": 0.0586, "step": 4520 }, { "epoch": 1.6264249169733418, "grad_norm": 0.6752751469612122, "learning_rate": 1.024902814783692e-05, "loss": 0.0555, "step": 4530 }, { "epoch": 1.6300152589534154, "grad_norm": 0.49796855449676514, "learning_rate": 1.0059740189508881e-05, "loss": 0.0556, "step": 4540 }, { "epoch": 1.633605600933489, "grad_norm": 0.6069309115409851, "learning_rate": 9.872020836922724e-06, "loss": 0.0564, "step": 4550 }, { "epoch": 1.6371959429135625, "grad_norm": 0.6443465948104858, "learning_rate": 9.68587746253059e-06, "loss": 0.0559, "step": 4560 }, { "epoch": 1.640786284893636, "grad_norm": 0.48786768317222595, "learning_rate": 9.501317376889985e-06, "loss": 0.0551, "step": 4570 }, { "epoch": 1.6443766268737097, "grad_norm": 0.6036781072616577, "learning_rate": 9.318347828376639e-06, "loss": 0.06, "step": 4580 }, { "epoch": 1.6479669688537832, "grad_norm": 0.7226144075393677, "learning_rate": 9.136976002899855e-06, "loss": 0.0616, "step": 4590 }, { "epoch": 1.6515573108338568, "grad_norm": 0.4328902065753937, "learning_rate": 8.957209023620277e-06, "loss": 0.0504, "step": 4600 }, { "epoch": 1.6551476528139304, "grad_norm": 0.506410539150238, "learning_rate": 8.779053950670146e-06, "loss": 0.059, "step": 4610 }, { "epoch": 1.658737994794004, "grad_norm": 0.6660659909248352, "learning_rate": 8.602517780876007e-06, "loss": 0.0528, "step": 4620 }, { "epoch": 1.6623283367740778, "grad_norm": 0.5838719606399536, "learning_rate": 8.427607447483943e-06, "loss": 0.0561, "step": 4630 }, { "epoch": 1.6659186787541513, "grad_norm": 0.7501543760299683, "learning_rate": 8.254329819887252e-06, "loss": 0.0527, "step": 4640 }, { "epoch": 1.669509020734225, "grad_norm": 0.4832637906074524, "learning_rate": 8.082691703356688e-06, "loss": 0.0512, "step": 4650 }, { "epoch": 1.6730993627142985, "grad_norm": 0.5931252241134644, "learning_rate": 7.912699838773151e-06, "loss": 0.0513, "step": 4660 }, { "epoch": 1.6766897046943723, "grad_norm": 0.5244051218032837, "learning_rate": 7.744360902363002e-06, "loss": 0.0544, "step": 4670 }, { "epoch": 1.6802800466744459, "grad_norm": 0.6513102054595947, "learning_rate": 7.577681505435813e-06, "loss": 0.054, "step": 4680 }, { "epoch": 1.6838703886545194, "grad_norm": 0.8317810297012329, "learning_rate": 7.412668194124728e-06, "loss": 0.0507, "step": 4690 }, { "epoch": 1.687460730634593, "grad_norm": 0.4875124394893646, "learning_rate": 7.2493274491294285e-06, "loss": 0.0488, "step": 4700 }, { "epoch": 1.6910510726146666, "grad_norm": 0.4913179576396942, "learning_rate": 7.087665685461497e-06, "loss": 0.0551, "step": 4710 }, { "epoch": 1.6946414145947402, "grad_norm": 0.47164708375930786, "learning_rate": 6.9276892521925816e-06, "loss": 0.0548, "step": 4720 }, { "epoch": 1.6982317565748137, "grad_norm": 0.39257460832595825, "learning_rate": 6.769404432204973e-06, "loss": 0.0532, "step": 4730 }, { "epoch": 1.7018220985548873, "grad_norm": 0.548692524433136, "learning_rate": 6.61281744194494e-06, "loss": 0.0503, "step": 4740 }, { "epoch": 1.7054124405349609, "grad_norm": 0.476531445980072, "learning_rate": 6.4579344311784475e-06, "loss": 0.0514, "step": 4750 }, { "epoch": 1.7090027825150345, "grad_norm": 0.47037366032600403, "learning_rate": 6.304761482749777e-06, "loss": 0.0497, "step": 4760 }, { "epoch": 1.712593124495108, "grad_norm": 0.7144917845726013, "learning_rate": 6.153304612342514e-06, "loss": 0.0529, "step": 4770 }, { "epoch": 1.7161834664751816, "grad_norm": 0.7041458487510681, "learning_rate": 6.003569768243411e-06, "loss": 0.0493, "step": 4780 }, { "epoch": 1.7197738084552554, "grad_norm": 0.5702252984046936, "learning_rate": 5.855562831108624e-06, "loss": 0.0491, "step": 4790 }, { "epoch": 1.723364150435329, "grad_norm": 0.697307288646698, "learning_rate": 5.709289613732888e-06, "loss": 0.0533, "step": 4800 }, { "epoch": 1.7269544924154026, "grad_norm": 0.6015498638153076, "learning_rate": 5.564755860821147e-06, "loss": 0.0521, "step": 4810 }, { "epoch": 1.7305448343954761, "grad_norm": 0.6062167882919312, "learning_rate": 5.421967248763021e-06, "loss": 0.0547, "step": 4820 }, { "epoch": 1.73413517637555, "grad_norm": 0.45276394486427307, "learning_rate": 5.2809293854097495e-06, "loss": 0.0553, "step": 4830 }, { "epoch": 1.7377255183556235, "grad_norm": 0.4024350047111511, "learning_rate": 5.14164780985405e-06, "loss": 0.0512, "step": 4840 }, { "epoch": 1.741315860335697, "grad_norm": 0.6370827555656433, "learning_rate": 5.0041279922125705e-06, "loss": 0.0562, "step": 4850 }, { "epoch": 1.7449062023157706, "grad_norm": 0.5606709122657776, "learning_rate": 4.868375333411002e-06, "loss": 0.0556, "step": 4860 }, { "epoch": 1.7484965442958442, "grad_norm": 0.8585699796676636, "learning_rate": 4.734395164971978e-06, "loss": 0.0459, "step": 4870 }, { "epoch": 1.7520868862759178, "grad_norm": 0.4308234453201294, "learning_rate": 4.6021927488057334e-06, "loss": 0.0471, "step": 4880 }, { "epoch": 1.7556772282559914, "grad_norm": 0.4660848081111908, "learning_rate": 4.471773277003427e-06, "loss": 0.0524, "step": 4890 }, { "epoch": 1.759267570236065, "grad_norm": 0.6825345158576965, "learning_rate": 4.343141871633188e-06, "loss": 0.0521, "step": 4900 }, { "epoch": 1.7628579122161385, "grad_norm": 0.6137758493423462, "learning_rate": 4.216303584538988e-06, "loss": 0.0539, "step": 4910 }, { "epoch": 1.766448254196212, "grad_norm": 0.7231915593147278, "learning_rate": 4.0912633971422425e-06, "loss": 0.0466, "step": 4920 }, { "epoch": 1.7700385961762857, "grad_norm": 0.6705979108810425, "learning_rate": 3.968026220246174e-06, "loss": 0.047, "step": 4930 }, { "epoch": 1.7736289381563592, "grad_norm": 0.5974612832069397, "learning_rate": 3.846596893842891e-06, "loss": 0.0499, "step": 4940 }, { "epoch": 1.777219280136433, "grad_norm": 0.6848942637443542, "learning_rate": 3.7269801869233845e-06, "loss": 0.0545, "step": 4950 }, { "epoch": 1.7808096221165066, "grad_norm": 0.6268109083175659, "learning_rate": 3.6091807972901624e-06, "loss": 0.0519, "step": 4960 }, { "epoch": 1.7843999640965802, "grad_norm": 0.8246615529060364, "learning_rate": 3.49320335137282e-06, "loss": 0.0495, "step": 4970 }, { "epoch": 1.7879903060766538, "grad_norm": 0.7163103222846985, "learning_rate": 3.3790524040462566e-06, "loss": 0.0465, "step": 4980 }, { "epoch": 1.7915806480567276, "grad_norm": 0.5779036283493042, "learning_rate": 3.266732438451842e-06, "loss": 0.0493, "step": 4990 }, { "epoch": 1.7951709900368011, "grad_norm": 0.5178433060646057, "learning_rate": 3.1562478658213656e-06, "loss": 0.0499, "step": 5000 }, { "epoch": 1.7987613320168747, "grad_norm": 0.7967355847358704, "learning_rate": 3.0476030253037415e-06, "loss": 0.0502, "step": 5010 }, { "epoch": 1.8023516739969483, "grad_norm": 0.8158264756202698, "learning_rate": 2.9408021837945942e-06, "loss": 0.0481, "step": 5020 }, { "epoch": 1.8059420159770219, "grad_norm": 0.43987634778022766, "learning_rate": 2.8358495357687364e-06, "loss": 0.0456, "step": 5030 }, { "epoch": 1.8095323579570954, "grad_norm": 0.45231232047080994, "learning_rate": 2.7327492031153866e-06, "loss": 0.0474, "step": 5040 }, { "epoch": 1.813122699937169, "grad_norm": 0.799350917339325, "learning_rate": 2.631505234976311e-06, "loss": 0.0489, "step": 5050 }, { "epoch": 1.8167130419172426, "grad_norm": 0.5466026663780212, "learning_rate": 2.5321216075867626e-06, "loss": 0.0474, "step": 5060 }, { "epoch": 1.8203033838973162, "grad_norm": 0.7424982190132141, "learning_rate": 2.4346022241193643e-06, "loss": 0.0452, "step": 5070 }, { "epoch": 1.8238937258773897, "grad_norm": 0.7979154586791992, "learning_rate": 2.3389509145308076e-06, "loss": 0.05, "step": 5080 }, { "epoch": 1.8274840678574633, "grad_norm": 0.6414862275123596, "learning_rate": 2.245171435411414e-06, "loss": 0.0487, "step": 5090 }, { "epoch": 1.8310744098375369, "grad_norm": 0.5069670081138611, "learning_rate": 2.1532674698376e-06, "loss": 0.0464, "step": 5100 }, { "epoch": 1.8346647518176105, "grad_norm": 0.4745350480079651, "learning_rate": 2.0632426272272464e-06, "loss": 0.0467, "step": 5110 }, { "epoch": 1.8382550937976843, "grad_norm": 0.5952518582344055, "learning_rate": 1.975100443197958e-06, "loss": 0.0508, "step": 5120 }, { "epoch": 1.8418454357777578, "grad_norm": 0.5413398146629333, "learning_rate": 1.8888443794281618e-06, "loss": 0.0426, "step": 5130 }, { "epoch": 1.8454357777578314, "grad_norm": 0.6297146677970886, "learning_rate": 1.8044778235211723e-06, "loss": 0.0523, "step": 5140 }, { "epoch": 1.849026119737905, "grad_norm": 0.458870530128479, "learning_rate": 1.72200408887217e-06, "loss": 0.0462, "step": 5150 }, { "epoch": 1.8526164617179788, "grad_norm": 0.6490904688835144, "learning_rate": 1.6414264145380442e-06, "loss": 0.0484, "step": 5160 }, { "epoch": 1.8562068036980524, "grad_norm": 0.7383233904838562, "learning_rate": 1.562747965110195e-06, "loss": 0.0484, "step": 5170 }, { "epoch": 1.859797145678126, "grad_norm": 2.4921016693115234, "learning_rate": 1.4859718305902326e-06, "loss": 0.046, "step": 5180 }, { "epoch": 1.8633874876581995, "grad_norm": 1.6146339178085327, "learning_rate": 1.411101026268652e-06, "loss": 0.043, "step": 5190 }, { "epoch": 1.866977829638273, "grad_norm": 0.47561097145080566, "learning_rate": 1.3381384926063833e-06, "loss": 0.0467, "step": 5200 }, { "epoch": 1.8705681716183467, "grad_norm": 0.5113374590873718, "learning_rate": 1.2670870951193292e-06, "loss": 0.0475, "step": 5210 }, { "epoch": 1.8741585135984202, "grad_norm": 0.5401134490966797, "learning_rate": 1.197949624265776e-06, "loss": 0.0482, "step": 5220 }, { "epoch": 1.8777488555784938, "grad_norm": 0.4193181097507477, "learning_rate": 1.1307287953368995e-06, "loss": 0.0472, "step": 5230 }, { "epoch": 1.8813391975585674, "grad_norm": 0.45812806487083435, "learning_rate": 1.065427248350015e-06, "loss": 0.0477, "step": 5240 }, { "epoch": 1.884929539538641, "grad_norm": 0.8749078512191772, "learning_rate": 1.0020475479449731e-06, "loss": 0.0507, "step": 5250 }, { "epoch": 1.8885198815187145, "grad_norm": 0.48960697650909424, "learning_rate": 9.405921832833841e-07, "loss": 0.046, "step": 5260 }, { "epoch": 1.892110223498788, "grad_norm": 0.7578288316726685, "learning_rate": 8.810635679509071e-07, "loss": 0.0471, "step": 5270 }, { "epoch": 1.895700565478862, "grad_norm": 0.6842608451843262, "learning_rate": 8.23464039862426e-07, "loss": 0.0445, "step": 5280 }, { "epoch": 1.8992909074589355, "grad_norm": 0.5089036226272583, "learning_rate": 7.67795861170234e-07, "loss": 0.0457, "step": 5290 }, { "epoch": 1.902881249439009, "grad_norm": 0.5393949151039124, "learning_rate": 7.140612181752048e-07, "loss": 0.0456, "step": 5300 }, { "epoch": 1.9064715914190826, "grad_norm": 0.9976809024810791, "learning_rate": 6.622622212409058e-07, "loss": 0.047, "step": 5310 }, { "epoch": 1.9100619333991564, "grad_norm": 0.5556519031524658, "learning_rate": 6.124009047107471e-07, "loss": 0.0517, "step": 5320 }, { "epoch": 1.91365227537923, "grad_norm": 0.534712553024292, "learning_rate": 5.644792268280574e-07, "loss": 0.0427, "step": 5330 }, { "epoch": 1.9172426173593036, "grad_norm": 0.7053726315498352, "learning_rate": 5.18499069659184e-07, "loss": 0.0455, "step": 5340 }, { "epoch": 1.9208329593393771, "grad_norm": 0.5793641209602356, "learning_rate": 4.744622390195963e-07, "loss": 0.0513, "step": 5350 }, { "epoch": 1.9244233013194507, "grad_norm": 0.4043155908584595, "learning_rate": 4.323704644029203e-07, "loss": 0.0501, "step": 5360 }, { "epoch": 1.9280136432995243, "grad_norm": 0.4776788353919983, "learning_rate": 3.9222539891307086e-07, "loss": 0.0415, "step": 5370 }, { "epoch": 1.9316039852795979, "grad_norm": 0.6649408340454102, "learning_rate": 3.5402861919928697e-07, "loss": 0.0451, "step": 5380 }, { "epoch": 1.9351943272596714, "grad_norm": 3.3624627590179443, "learning_rate": 3.1778162539421453e-07, "loss": 0.0472, "step": 5390 }, { "epoch": 1.938784669239745, "grad_norm": 0.5529268980026245, "learning_rate": 2.8348584105501453e-07, "loss": 0.045, "step": 5400 }, { "epoch": 1.9423750112198186, "grad_norm": 0.6905925273895264, "learning_rate": 2.511426131074246e-07, "loss": 0.0452, "step": 5410 }, { "epoch": 1.9459653531998922, "grad_norm": 0.6144551038742065, "learning_rate": 2.2075321179289565e-07, "loss": 0.0422, "step": 5420 }, { "epoch": 1.9495556951799657, "grad_norm": 1.2887723445892334, "learning_rate": 1.9231883061866517e-07, "loss": 0.0441, "step": 5430 }, { "epoch": 1.9531460371600395, "grad_norm": 0.7968602776527405, "learning_rate": 1.6584058631090582e-07, "loss": 0.0455, "step": 5440 }, { "epoch": 1.9567363791401131, "grad_norm": 0.7239225506782532, "learning_rate": 1.4131951877087158e-07, "loss": 0.0461, "step": 5450 }, { "epoch": 1.9603267211201867, "grad_norm": 0.6258605718612671, "learning_rate": 1.1875659103404157e-07, "loss": 0.0449, "step": 5460 }, { "epoch": 1.9639170631002603, "grad_norm": 0.7048450708389282, "learning_rate": 9.815268923230592e-08, "loss": 0.0469, "step": 5470 }, { "epoch": 1.967507405080334, "grad_norm": 0.6698242425918579, "learning_rate": 7.95086225591657e-08, "loss": 0.0469, "step": 5480 }, { "epoch": 1.9710977470604076, "grad_norm": 0.612483561038971, "learning_rate": 6.282512323795287e-08, "loss": 0.0432, "step": 5490 }, { "epoch": 1.9746880890404812, "grad_norm": 1.0906122922897339, "learning_rate": 4.81028464930755e-08, "loss": 0.0439, "step": 5500 }, { "epoch": 1.9782784310205548, "grad_norm": 0.5854030847549438, "learning_rate": 3.534237052426059e-08, "loss": 0.0461, "step": 5510 }, { "epoch": 1.9818687730006284, "grad_norm": 0.5965482592582703, "learning_rate": 2.4544196483888837e-08, "loss": 0.0449, "step": 5520 }, { "epoch": 1.985459114980702, "grad_norm": 1.0227429866790771, "learning_rate": 1.5708748457271548e-08, "loss": 0.0476, "step": 5530 }, { "epoch": 1.9890494569607755, "grad_norm": 0.506277859210968, "learning_rate": 8.836373446019507e-09, "loss": 0.0477, "step": 5540 }, { "epoch": 1.992639798940849, "grad_norm": 0.4811525344848633, "learning_rate": 3.927341354420522e-09, "loss": 0.0468, "step": 5550 }, { "epoch": 1.9962301409209227, "grad_norm": 0.4584663212299347, "learning_rate": 9.818449787979412e-10, "loss": 0.0424, "step": 5560 }, { "epoch": 1.9998204829009962, "grad_norm": 0.6924448609352112, "learning_rate": 0.0, "loss": 0.0443, "step": 5570 } ], "logging_steps": 10, "max_steps": 5570, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.008182835124896e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }