{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.98971898560658,
  "eval_steps": 500,
  "global_step": 910,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0054832076764907475,
      "grad_norm": 3.770785411476767,
      "learning_rate": 8.791208791208792e-07,
      "loss": 1.4398,
      "step": 1
    },
    {
      "epoch": 0.010966415352981495,
      "grad_norm": 3.735862689362563,
      "learning_rate": 1.7582417582417585e-06,
      "loss": 1.4312,
      "step": 2
    },
    {
      "epoch": 0.01644962302947224,
      "grad_norm": 3.6716515552918993,
      "learning_rate": 2.6373626373626375e-06,
      "loss": 1.4282,
      "step": 3
    },
    {
      "epoch": 0.02193283070596299,
      "grad_norm": 2.9996067371134263,
      "learning_rate": 3.516483516483517e-06,
      "loss": 1.4253,
      "step": 4
    },
    {
      "epoch": 0.027416038382453736,
      "grad_norm": 2.010909979843364,
      "learning_rate": 4.395604395604396e-06,
      "loss": 1.409,
      "step": 5
    },
    {
      "epoch": 0.03289924605894448,
      "grad_norm": 4.214479330209327,
      "learning_rate": 5.274725274725275e-06,
      "loss": 1.3919,
      "step": 6
    },
    {
      "epoch": 0.03838245373543523,
      "grad_norm": 3.813560464004789,
      "learning_rate": 6.153846153846155e-06,
      "loss": 1.3746,
      "step": 7
    },
    {
      "epoch": 0.04386566141192598,
      "grad_norm": 4.133090320147191,
      "learning_rate": 7.032967032967034e-06,
      "loss": 1.369,
      "step": 8
    },
    {
      "epoch": 0.04934886908841672,
      "grad_norm": 2.9535800476287335,
      "learning_rate": 7.912087912087913e-06,
      "loss": 1.3454,
      "step": 9
    },
    {
      "epoch": 0.05483207676490747,
      "grad_norm": 1.8811217973016594,
      "learning_rate": 8.791208791208792e-06,
      "loss": 1.313,
      "step": 10
    },
    {
      "epoch": 0.06031528444139822,
      "grad_norm": 2.125700583604983,
      "learning_rate": 9.670329670329671e-06,
      "loss": 1.2887,
      "step": 11
    },
    {
      "epoch": 0.06579849211788896,
      "grad_norm": 2.0105026799783703,
      "learning_rate": 1.054945054945055e-05,
      "loss": 1.3023,
      "step": 12
    },
    {
      "epoch": 0.07128169979437972,
      "grad_norm": 1.5919133601753956,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 1.2574,
      "step": 13
    },
    {
      "epoch": 0.07676490747087046,
      "grad_norm": 1.6993983763168556,
      "learning_rate": 1.230769230769231e-05,
      "loss": 1.2502,
      "step": 14
    },
    {
      "epoch": 0.0822481151473612,
      "grad_norm": 1.6394584467433646,
      "learning_rate": 1.3186813186813187e-05,
      "loss": 1.2388,
      "step": 15
    },
    {
      "epoch": 0.08773132282385196,
      "grad_norm": 1.0501223018977153,
      "learning_rate": 1.4065934065934068e-05,
      "loss": 1.2362,
      "step": 16
    },
    {
      "epoch": 0.0932145305003427,
      "grad_norm": 1.5548564221506895,
      "learning_rate": 1.4945054945054947e-05,
      "loss": 1.2132,
      "step": 17
    },
    {
      "epoch": 0.09869773817683344,
      "grad_norm": 1.2477698225638492,
      "learning_rate": 1.5824175824175826e-05,
      "loss": 1.2134,
      "step": 18
    },
    {
      "epoch": 0.1041809458533242,
      "grad_norm": 1.0400324580403362,
      "learning_rate": 1.6703296703296707e-05,
      "loss": 1.1997,
      "step": 19
    },
    {
      "epoch": 0.10966415352981494,
      "grad_norm": 1.3692603670745445,
      "learning_rate": 1.7582417582417584e-05,
      "loss": 1.2207,
      "step": 20
    },
    {
      "epoch": 0.11514736120630568,
      "grad_norm": 1.3232718240744366,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 1.1828,
      "step": 21
    },
    {
      "epoch": 0.12063056888279644,
      "grad_norm": 1.1051790307453309,
      "learning_rate": 1.9340659340659342e-05,
      "loss": 1.162,
      "step": 22
    },
    {
      "epoch": 0.12611377655928718,
      "grad_norm": 1.1420336023724322,
      "learning_rate": 2.021978021978022e-05,
      "loss": 1.1591,
      "step": 23
    },
    {
      "epoch": 0.13159698423577793,
      "grad_norm": 1.4925091531133705,
      "learning_rate": 2.10989010989011e-05,
      "loss": 1.1781,
      "step": 24
    },
    {
      "epoch": 0.13708019191226867,
      "grad_norm": 1.3162293353241243,
      "learning_rate": 2.197802197802198e-05,
      "loss": 1.1565,
      "step": 25
    },
    {
      "epoch": 0.14256339958875944,
      "grad_norm": 1.6926059339584283,
      "learning_rate": 2.2857142857142858e-05,
      "loss": 1.1458,
      "step": 26
    },
    {
      "epoch": 0.14804660726525018,
      "grad_norm": 1.6921867935133699,
      "learning_rate": 2.373626373626374e-05,
      "loss": 1.1631,
      "step": 27
    },
    {
      "epoch": 0.15352981494174092,
      "grad_norm": 1.2313296487532925,
      "learning_rate": 2.461538461538462e-05,
      "loss": 1.1549,
      "step": 28
    },
    {
      "epoch": 0.15901302261823166,
      "grad_norm": 2.046038737361219,
      "learning_rate": 2.5494505494505493e-05,
      "loss": 1.1562,
      "step": 29
    },
    {
      "epoch": 0.1644962302947224,
      "grad_norm": 1.1115829456958797,
      "learning_rate": 2.6373626373626374e-05,
      "loss": 1.1518,
      "step": 30
    },
    {
      "epoch": 0.16997943797121315,
      "grad_norm": 1.8782544663966916,
      "learning_rate": 2.7252747252747255e-05,
      "loss": 1.1518,
      "step": 31
    },
    {
      "epoch": 0.17546264564770392,
      "grad_norm": 1.1415170717900838,
      "learning_rate": 2.8131868131868136e-05,
      "loss": 1.1231,
      "step": 32
    },
    {
      "epoch": 0.18094585332419466,
      "grad_norm": 2.1125876630935028,
      "learning_rate": 2.9010989010989013e-05,
      "loss": 1.1841,
      "step": 33
    },
    {
      "epoch": 0.1864290610006854,
      "grad_norm": 1.5831068068806378,
      "learning_rate": 2.9890109890109894e-05,
      "loss": 1.1393,
      "step": 34
    },
    {
      "epoch": 0.19191226867717615,
      "grad_norm": 1.3946597181538087,
      "learning_rate": 3.0769230769230774e-05,
      "loss": 1.1305,
      "step": 35
    },
    {
      "epoch": 0.1973954763536669,
      "grad_norm": 1.4952578896409994,
      "learning_rate": 3.164835164835165e-05,
      "loss": 1.125,
      "step": 36
    },
    {
      "epoch": 0.20287868403015763,
      "grad_norm": 1.7335130084562178,
      "learning_rate": 3.252747252747253e-05,
      "loss": 1.1105,
      "step": 37
    },
    {
      "epoch": 0.2083618917066484,
      "grad_norm": 1.6561068450562475,
      "learning_rate": 3.340659340659341e-05,
      "loss": 1.1266,
      "step": 38
    },
    {
      "epoch": 0.21384509938313914,
      "grad_norm": 1.3219106884217757,
      "learning_rate": 3.4285714285714284e-05,
      "loss": 1.1182,
      "step": 39
    },
    {
      "epoch": 0.21932830705962988,
      "grad_norm": 1.6718121802988681,
      "learning_rate": 3.516483516483517e-05,
      "loss": 1.1016,
      "step": 40
    },
    {
      "epoch": 0.22481151473612063,
      "grad_norm": 1.5826523659282272,
      "learning_rate": 3.6043956043956045e-05,
      "loss": 1.1114,
      "step": 41
    },
    {
      "epoch": 0.23029472241261137,
      "grad_norm": 1.2747750565061255,
      "learning_rate": 3.692307692307693e-05,
      "loss": 1.1154,
      "step": 42
    },
    {
      "epoch": 0.2357779300891021,
      "grad_norm": 1.6996352048635637,
      "learning_rate": 3.7802197802197807e-05,
      "loss": 1.1026,
      "step": 43
    },
    {
      "epoch": 0.24126113776559288,
      "grad_norm": 1.530343865914298,
      "learning_rate": 3.8681318681318684e-05,
      "loss": 1.1105,
      "step": 44
    },
    {
      "epoch": 0.24674434544208362,
      "grad_norm": 1.4665064928890283,
      "learning_rate": 3.956043956043957e-05,
      "loss": 1.1143,
      "step": 45
    },
    {
      "epoch": 0.25222755311857437,
      "grad_norm": 1.6561256024539444,
      "learning_rate": 4.043956043956044e-05,
      "loss": 1.0925,
      "step": 46
    },
    {
      "epoch": 0.2577107607950651,
      "grad_norm": 1.6444247903984497,
      "learning_rate": 4.131868131868133e-05,
      "loss": 1.0985,
      "step": 47
    },
    {
      "epoch": 0.26319396847155585,
      "grad_norm": 1.4674837139863959,
      "learning_rate": 4.21978021978022e-05,
      "loss": 1.0918,
      "step": 48
    },
    {
      "epoch": 0.2686771761480466,
      "grad_norm": 1.8220362504472873,
      "learning_rate": 4.307692307692308e-05,
      "loss": 1.11,
      "step": 49
    },
    {
      "epoch": 0.27416038382453733,
      "grad_norm": 1.174555410925102,
      "learning_rate": 4.395604395604396e-05,
      "loss": 1.0623,
      "step": 50
    },
    {
      "epoch": 0.2796435915010281,
      "grad_norm": 2.189193064881643,
      "learning_rate": 4.483516483516484e-05,
      "loss": 1.1014,
      "step": 51
    },
    {
      "epoch": 0.2851267991775189,
      "grad_norm": 1.6263384928431146,
      "learning_rate": 4.5714285714285716e-05,
      "loss": 1.0747,
      "step": 52
    },
    {
      "epoch": 0.2906100068540096,
      "grad_norm": 1.6900424233628644,
      "learning_rate": 4.65934065934066e-05,
      "loss": 1.1034,
      "step": 53
    },
    {
      "epoch": 0.29609321453050036,
      "grad_norm": 1.3519597321545578,
      "learning_rate": 4.747252747252748e-05,
      "loss": 1.0879,
      "step": 54
    },
    {
      "epoch": 0.3015764222069911,
      "grad_norm": 2.0759258793682207,
      "learning_rate": 4.8351648351648355e-05,
      "loss": 1.1132,
      "step": 55
    },
    {
      "epoch": 0.30705962988348184,
      "grad_norm": 1.478260710492767,
      "learning_rate": 4.923076923076924e-05,
      "loss": 1.0577,
      "step": 56
    },
    {
      "epoch": 0.3125428375599726,
      "grad_norm": 1.7076569151167882,
      "learning_rate": 5.0109890109890116e-05,
      "loss": 1.0804,
      "step": 57
    },
    {
      "epoch": 0.31802604523646333,
      "grad_norm": 1.452859747995007,
      "learning_rate": 5.098901098901099e-05,
      "loss": 1.0929,
      "step": 58
    },
    {
      "epoch": 0.32350925291295407,
      "grad_norm": 1.6662600890644894,
      "learning_rate": 5.186813186813188e-05,
      "loss": 1.0862,
      "step": 59
    },
    {
      "epoch": 0.3289924605894448,
      "grad_norm": 1.7184733724733494,
      "learning_rate": 5.274725274725275e-05,
      "loss": 1.0912,
      "step": 60
    },
    {
      "epoch": 0.33447566826593556,
      "grad_norm": 1.509168313113862,
      "learning_rate": 5.3626373626373626e-05,
      "loss": 1.0603,
      "step": 61
    },
    {
      "epoch": 0.3399588759424263,
      "grad_norm": 1.8018031414395794,
      "learning_rate": 5.450549450549451e-05,
      "loss": 1.0659,
      "step": 62
    },
    {
      "epoch": 0.34544208361891704,
      "grad_norm": 1.4688399614420558,
      "learning_rate": 5.538461538461539e-05,
      "loss": 1.0705,
      "step": 63
    },
    {
      "epoch": 0.35092529129540784,
      "grad_norm": 1.7891549976809995,
      "learning_rate": 5.626373626373627e-05,
      "loss": 1.0667,
      "step": 64
    },
    {
      "epoch": 0.3564084989718986,
      "grad_norm": 1.6402319088531265,
      "learning_rate": 5.714285714285715e-05,
      "loss": 1.0655,
      "step": 65
    },
    {
      "epoch": 0.3618917066483893,
      "grad_norm": 1.4346232433315216,
      "learning_rate": 5.8021978021978026e-05,
      "loss": 1.0708,
      "step": 66
    },
    {
      "epoch": 0.36737491432488006,
      "grad_norm": 1.85622206538933,
      "learning_rate": 5.890109890109891e-05,
      "loss": 1.09,
      "step": 67
    },
    {
      "epoch": 0.3728581220013708,
      "grad_norm": 1.4954802686876467,
      "learning_rate": 5.978021978021979e-05,
      "loss": 1.0838,
      "step": 68
    },
    {
      "epoch": 0.37834132967786155,
      "grad_norm": 2.4295449046847457,
      "learning_rate": 6.0659340659340665e-05,
      "loss": 1.1125,
      "step": 69
    },
    {
      "epoch": 0.3838245373543523,
      "grad_norm": 1.1324818025195698,
      "learning_rate": 6.153846153846155e-05,
      "loss": 1.0794,
      "step": 70
    },
    {
      "epoch": 0.38930774503084303,
      "grad_norm": 2.352945287513035,
      "learning_rate": 6.241758241758242e-05,
      "loss": 1.0933,
      "step": 71
    },
    {
      "epoch": 0.3947909527073338,
      "grad_norm": 1.572624911613927,
      "learning_rate": 6.32967032967033e-05,
      "loss": 1.0702,
      "step": 72
    },
    {
      "epoch": 0.4002741603838245,
      "grad_norm": 1.9834325943744104,
      "learning_rate": 6.417582417582419e-05,
      "loss": 1.0934,
      "step": 73
    },
    {
      "epoch": 0.40575736806031526,
      "grad_norm": 1.4472718261690063,
      "learning_rate": 6.505494505494506e-05,
      "loss": 1.1076,
      "step": 74
    },
    {
      "epoch": 0.41124057573680606,
      "grad_norm": 2.340084320788359,
      "learning_rate": 6.593406593406594e-05,
      "loss": 1.0812,
      "step": 75
    },
    {
      "epoch": 0.4167237834132968,
      "grad_norm": 1.5327454596647803,
      "learning_rate": 6.681318681318683e-05,
      "loss": 1.0976,
      "step": 76
    },
    {
      "epoch": 0.42220699108978754,
      "grad_norm": 1.3716878704229134,
      "learning_rate": 6.76923076923077e-05,
      "loss": 1.0985,
      "step": 77
    },
    {
      "epoch": 0.4276901987662783,
      "grad_norm": 2.1717200063816264,
      "learning_rate": 6.857142857142857e-05,
      "loss": 1.0772,
      "step": 78
    },
    {
      "epoch": 0.433173406442769,
      "grad_norm": 1.680504341194773,
      "learning_rate": 6.945054945054945e-05,
      "loss": 1.0643,
      "step": 79
    },
    {
      "epoch": 0.43865661411925977,
      "grad_norm": 1.812493407043134,
      "learning_rate": 7.032967032967034e-05,
      "loss": 1.0811,
      "step": 80
    },
    {
      "epoch": 0.4441398217957505,
      "grad_norm": 1.5610946891286874,
      "learning_rate": 7.12087912087912e-05,
      "loss": 1.0605,
      "step": 81
    },
    {
      "epoch": 0.44962302947224125,
      "grad_norm": 2.0021707982693577,
      "learning_rate": 7.208791208791209e-05,
      "loss": 1.0631,
      "step": 82
    },
    {
      "epoch": 0.455106237148732,
      "grad_norm": 1.6001450391406942,
      "learning_rate": 7.296703296703297e-05,
      "loss": 1.0591,
      "step": 83
    },
    {
      "epoch": 0.46058944482522274,
      "grad_norm": 1.6957731179236344,
      "learning_rate": 7.384615384615386e-05,
      "loss": 1.0758,
      "step": 84
    },
    {
      "epoch": 0.4660726525017135,
      "grad_norm": 2.072499331723963,
      "learning_rate": 7.472527472527473e-05,
      "loss": 1.0555,
      "step": 85
    },
    {
      "epoch": 0.4715558601782042,
      "grad_norm": 1.4742479526369552,
      "learning_rate": 7.560439560439561e-05,
      "loss": 1.0775,
      "step": 86
    },
    {
      "epoch": 0.477039067854695,
      "grad_norm": 2.1697322374940877,
      "learning_rate": 7.64835164835165e-05,
      "loss": 1.0969,
      "step": 87
    },
    {
      "epoch": 0.48252227553118576,
      "grad_norm": 1.5007024763453864,
      "learning_rate": 7.736263736263737e-05,
      "loss": 1.0679,
      "step": 88
    },
    {
      "epoch": 0.4880054832076765,
      "grad_norm": 2.3539849914930713,
      "learning_rate": 7.824175824175825e-05,
      "loss": 1.0784,
      "step": 89
    },
    {
      "epoch": 0.49348869088416725,
      "grad_norm": 1.2113825492573165,
      "learning_rate": 7.912087912087914e-05,
      "loss": 1.0826,
      "step": 90
    },
    {
      "epoch": 0.498971898560658,
      "grad_norm": 2.048722031904498,
      "learning_rate": 8e-05,
      "loss": 1.0955,
      "step": 91
    },
    {
      "epoch": 0.5044551062371487,
      "grad_norm": 1.492829681414569,
      "learning_rate": 7.999970571955439e-05,
      "loss": 1.0652,
      "step": 92
    },
    {
      "epoch": 0.5099383139136395,
      "grad_norm": 2.130280283155257,
      "learning_rate": 7.999882288254757e-05,
      "loss": 1.0768,
      "step": 93
    },
    {
      "epoch": 0.5154215215901302,
      "grad_norm": 1.4526728278544645,
      "learning_rate": 7.999735150196965e-05,
      "loss": 1.0701,
      "step": 94
    },
    {
      "epoch": 0.520904729266621,
      "grad_norm": 1.4027863954186088,
      "learning_rate": 7.999529159947053e-05,
      "loss": 1.0795,
      "step": 95
    },
    {
      "epoch": 0.5263879369431117,
      "grad_norm": 2.0703960841299187,
      "learning_rate": 7.999264320535968e-05,
      "loss": 1.0873,
      "step": 96
    },
    {
      "epoch": 0.5318711446196025,
      "grad_norm": 1.5670304643178508,
      "learning_rate": 7.998940635860564e-05,
      "loss": 1.0731,
      "step": 97
    },
    {
      "epoch": 0.5373543522960932,
      "grad_norm": 1.6758059923230086,
      "learning_rate": 7.998558110683542e-05,
      "loss": 1.0788,
      "step": 98
    },
    {
      "epoch": 0.542837559972584,
      "grad_norm": 1.3777914107786882,
      "learning_rate": 7.998116750633388e-05,
      "loss": 1.0675,
      "step": 99
    },
    {
      "epoch": 0.5483207676490747,
      "grad_norm": 1.7742978814606243,
      "learning_rate": 7.997616562204282e-05,
      "loss": 1.0619,
      "step": 100
    },
    {
      "epoch": 0.5538039753255655,
      "grad_norm": 1.910695077575461,
      "learning_rate": 7.99705755275601e-05,
      "loss": 1.0949,
      "step": 101
    },
    {
      "epoch": 0.5592871830020562,
      "grad_norm": 1.6648059638461317,
      "learning_rate": 7.996439730513846e-05,
      "loss": 1.0712,
      "step": 102
    },
    {
      "epoch": 0.564770390678547,
      "grad_norm": 1.8677352575926855,
      "learning_rate": 7.995763104568444e-05,
      "loss": 1.0624,
      "step": 103
    },
    {
      "epoch": 0.5702535983550377,
      "grad_norm": 1.081092588192711,
      "learning_rate": 7.99502768487569e-05,
      "loss": 1.0395,
      "step": 104
    },
    {
      "epoch": 0.5757368060315284,
      "grad_norm": 2.059128407576435,
      "learning_rate": 7.994233482256567e-05,
      "loss": 1.0772,
      "step": 105
    },
    {
      "epoch": 0.5812200137080192,
      "grad_norm": 1.542971505873209,
      "learning_rate": 7.993380508396992e-05,
      "loss": 1.0803,
      "step": 106
    },
    {
      "epoch": 0.5867032213845099,
      "grad_norm": 1.55076714242406,
      "learning_rate": 7.992468775847638e-05,
      "loss": 1.0743,
      "step": 107
    },
    {
      "epoch": 0.5921864290610007,
      "grad_norm": 1.6504825115582353,
      "learning_rate": 7.99149829802376e-05,
      "loss": 1.0582,
      "step": 108
    },
    {
      "epoch": 0.5976696367374914,
      "grad_norm": 1.5595357168386863,
      "learning_rate": 7.990469089204992e-05,
      "loss": 1.0586,
      "step": 109
    },
    {
      "epoch": 0.6031528444139822,
      "grad_norm": 1.5048675592822867,
      "learning_rate": 7.989381164535131e-05,
      "loss": 1.0706,
      "step": 110
    },
    {
      "epoch": 0.6086360520904729,
      "grad_norm": 1.281006191844885,
      "learning_rate": 7.988234540021928e-05,
      "loss": 1.0479,
      "step": 111
    },
    {
      "epoch": 0.6141192597669637,
      "grad_norm": 1.3554471142105933,
      "learning_rate": 7.987029232536841e-05,
      "loss": 1.0567,
      "step": 112
    },
    {
      "epoch": 0.6196024674434544,
      "grad_norm": 1.4305679963313591,
      "learning_rate": 7.98576525981479e-05,
      "loss": 1.071,
      "step": 113
    },
    {
      "epoch": 0.6250856751199452,
      "grad_norm": 1.734508907845877,
      "learning_rate": 7.9844426404539e-05,
      "loss": 1.0516,
      "step": 114
    },
    {
      "epoch": 0.630568882796436,
      "grad_norm": 1.379558477119407,
      "learning_rate": 7.983061393915222e-05,
      "loss": 1.0469,
      "step": 115
    },
    {
      "epoch": 0.6360520904729267,
      "grad_norm": 1.5909347084720777,
      "learning_rate": 7.981621540522444e-05,
      "loss": 1.0371,
      "step": 116
    },
    {
      "epoch": 0.6415352981494175,
      "grad_norm": 1.3125481276081445,
      "learning_rate": 7.980123101461606e-05,
      "loss": 1.0385,
      "step": 117
    },
    {
      "epoch": 0.6470185058259081,
      "grad_norm": 1.6052402185743397,
      "learning_rate": 7.978566098780771e-05,
      "loss": 1.0574,
      "step": 118
    },
    {
      "epoch": 0.6525017135023989,
      "grad_norm": 1.3834678907216356,
      "learning_rate": 7.976950555389713e-05,
      "loss": 1.0465,
      "step": 119
    },
    {
      "epoch": 0.6579849211788896,
      "grad_norm": 1.6370067172637257,
      "learning_rate": 7.97527649505957e-05,
      "loss": 1.0486,
      "step": 120
    },
    {
      "epoch": 0.6634681288553804,
      "grad_norm": 1.073534432484641,
      "learning_rate": 7.973543942422506e-05,
      "loss": 1.0434,
      "step": 121
    },
    {
      "epoch": 0.6689513365318711,
      "grad_norm": 1.7459082997043855,
      "learning_rate": 7.97175292297134e-05,
      "loss": 1.067,
      "step": 122
    },
    {
      "epoch": 0.6744345442083619,
      "grad_norm": 1.1822890462560018,
      "learning_rate": 7.969903463059169e-05,
      "loss": 1.0461,
      "step": 123
    },
    {
      "epoch": 0.6799177518848526,
      "grad_norm": 1.371152665092677,
      "learning_rate": 7.96799558989899e-05,
      "loss": 1.059,
      "step": 124
    },
    {
      "epoch": 0.6854009595613434,
      "grad_norm": 1.3953798179234536,
      "learning_rate": 7.966029331563287e-05,
      "loss": 1.0625,
      "step": 125
    },
    {
      "epoch": 0.6908841672378341,
      "grad_norm": 1.1325159476008027,
      "learning_rate": 7.964004716983635e-05,
      "loss": 1.046,
      "step": 126
    },
    {
      "epoch": 0.6963673749143249,
      "grad_norm": 1.638079564242603,
      "learning_rate": 7.961921775950254e-05,
      "loss": 1.0623,
      "step": 127
    },
    {
      "epoch": 0.7018505825908157,
      "grad_norm": 1.4117107286842978,
      "learning_rate": 7.959780539111585e-05,
      "loss": 1.0511,
      "step": 128
    },
    {
      "epoch": 0.7073337902673064,
      "grad_norm": 1.3569548365723587,
      "learning_rate": 7.957581037973835e-05,
      "loss": 1.0326,
      "step": 129
    },
    {
      "epoch": 0.7128169979437972,
      "grad_norm": 1.091128253611987,
      "learning_rate": 7.955323304900514e-05,
      "loss": 1.0605,
      "step": 130
    },
    {
      "epoch": 0.7183002056202878,
      "grad_norm": 1.3381775735837071,
      "learning_rate": 7.953007373111956e-05,
      "loss": 1.0493,
      "step": 131
    },
    {
      "epoch": 0.7237834132967786,
      "grad_norm": 1.6753953721889496,
      "learning_rate": 7.950633276684833e-05,
      "loss": 1.0489,
      "step": 132
    },
    {
      "epoch": 0.7292666209732693,
      "grad_norm": 1.1295015092010343,
      "learning_rate": 7.948201050551651e-05,
      "loss": 1.031,
      "step": 133
    },
    {
      "epoch": 0.7347498286497601,
      "grad_norm": 1.7688605218710511,
      "learning_rate": 7.945710730500243e-05,
      "loss": 1.0469,
      "step": 134
    },
    {
      "epoch": 0.7402330363262508,
      "grad_norm": 0.9321401117859591,
      "learning_rate": 7.943162353173232e-05,
      "loss": 1.0431,
      "step": 135
    },
    {
      "epoch": 0.7457162440027416,
      "grad_norm": 1.3514206349673292,
      "learning_rate": 7.940555956067495e-05,
      "loss": 1.044,
      "step": 136
    },
    {
      "epoch": 0.7511994516792323,
      "grad_norm": 1.4851863360773496,
      "learning_rate": 7.937891577533624e-05,
      "loss": 1.0515,
      "step": 137
    },
    {
      "epoch": 0.7566826593557231,
      "grad_norm": 1.440870365731717,
      "learning_rate": 7.93516925677534e-05,
      "loss": 1.0679,
      "step": 138
    },
    {
      "epoch": 0.7621658670322139,
      "grad_norm": 1.2090291573025855,
      "learning_rate": 7.932389033848931e-05,
      "loss": 1.042,
      "step": 139
    },
    {
      "epoch": 0.7676490747087046,
      "grad_norm": 1.3907578329999266,
      "learning_rate": 7.929550949662659e-05,
      "loss": 1.0521,
      "step": 140
    },
    {
      "epoch": 0.7731322823851954,
      "grad_norm": 1.200182965091153,
      "learning_rate": 7.92665504597616e-05,
      "loss": 1.0241,
      "step": 141
    },
    {
      "epoch": 0.7786154900616861,
      "grad_norm": 1.2632928685848217,
      "learning_rate": 7.923701365399826e-05,
      "loss": 1.0522,
      "step": 142
    },
    {
      "epoch": 0.7840986977381769,
      "grad_norm": 1.4291827088345994,
      "learning_rate": 7.920689951394175e-05,
      "loss": 1.061,
      "step": 143
    },
    {
      "epoch": 0.7895819054146676,
      "grad_norm": 1.3309391045635073,
      "learning_rate": 7.917620848269224e-05,
      "loss": 1.0403,
      "step": 144
    },
    {
      "epoch": 0.7950651130911583,
      "grad_norm": 1.2122573572305384,
      "learning_rate": 7.914494101183822e-05,
      "loss": 1.0462,
      "step": 145
    },
    {
      "epoch": 0.800548320767649,
      "grad_norm": 1.251660077081641,
      "learning_rate": 7.911309756144995e-05,
      "loss": 1.0414,
      "step": 146
    },
    {
      "epoch": 0.8060315284441398,
      "grad_norm": 1.345922867573313,
      "learning_rate": 7.908067860007268e-05,
      "loss": 1.0313,
      "step": 147
    },
    {
      "epoch": 0.8115147361206305,
      "grad_norm": 0.9287950705039595,
      "learning_rate": 7.904768460471975e-05,
      "loss": 1.038,
      "step": 148
    },
    {
      "epoch": 0.8169979437971213,
      "grad_norm": 1.0777391735849702,
      "learning_rate": 7.90141160608655e-05,
      "loss": 1.0167,
      "step": 149
    },
    {
      "epoch": 0.8224811514736121,
      "grad_norm": 1.2120154182775404,
      "learning_rate": 7.897997346243825e-05,
      "loss": 1.0343,
      "step": 150
    },
    {
      "epoch": 0.8279643591501028,
      "grad_norm": 1.4336567463521839,
      "learning_rate": 7.894525731181297e-05,
      "loss": 1.0059,
      "step": 151
    },
    {
      "epoch": 0.8334475668265936,
      "grad_norm": 1.2113263001766579,
      "learning_rate": 7.890996811980386e-05,
      "loss": 1.0449,
      "step": 152
    },
    {
      "epoch": 0.8389307745030843,
      "grad_norm": 1.3174687681794697,
      "learning_rate": 7.887410640565689e-05,
      "loss": 1.0285,
      "step": 153
    },
    {
      "epoch": 0.8444139821795751,
      "grad_norm": 1.225603290157931,
      "learning_rate": 7.883767269704209e-05,
      "loss": 1.003,
      "step": 154
    },
    {
      "epoch": 0.8498971898560658,
      "grad_norm": 1.4144760379069086,
      "learning_rate": 7.880066753004588e-05,
      "loss": 1.0351,
      "step": 155
    },
    {
      "epoch": 0.8553803975325566,
      "grad_norm": 0.9308858125598364,
      "learning_rate": 7.876309144916312e-05,
      "loss": 1.0349,
      "step": 156
    },
    {
      "epoch": 0.8608636052090473,
      "grad_norm": 1.289178086096443,
      "learning_rate": 7.87249450072891e-05,
      "loss": 1.0562,
      "step": 157
    },
    {
      "epoch": 0.866346812885538,
      "grad_norm": 1.2077852579104889,
      "learning_rate": 7.86862287657114e-05,
      "loss": 1.0401,
      "step": 158
    },
    {
      "epoch": 0.8718300205620287,
      "grad_norm": 1.08522876589818,
      "learning_rate": 7.864694329410168e-05,
      "loss": 1.0452,
      "step": 159
    },
    {
      "epoch": 0.8773132282385195,
      "grad_norm": 1.1720826545668266,
      "learning_rate": 7.860708917050722e-05,
      "loss": 1.0356,
      "step": 160
    },
    {
      "epoch": 0.8827964359150103,
      "grad_norm": 1.4739281342592865,
      "learning_rate": 7.85666669813425e-05,
      "loss": 1.0144,
      "step": 161
    },
    {
      "epoch": 0.888279643591501,
      "grad_norm": 1.0224677436038465,
      "learning_rate": 7.852567732138051e-05,
      "loss": 1.0246,
      "step": 162
    },
    {
      "epoch": 0.8937628512679918,
      "grad_norm": 1.3144849480629979,
      "learning_rate": 7.848412079374403e-05,
      "loss": 1.0312,
      "step": 163
    },
    {
      "epoch": 0.8992460589444825,
      "grad_norm": 0.9259999717965711,
      "learning_rate": 7.844199800989672e-05,
      "loss": 1.0587,
      "step": 164
    },
    {
      "epoch": 0.9047292666209733,
      "grad_norm": 1.2803275856430567,
      "learning_rate": 7.839930958963415e-05,
      "loss": 1.0197,
      "step": 165
    },
    {
      "epoch": 0.910212474297464,
      "grad_norm": 1.3241153517256665,
      "learning_rate": 7.835605616107471e-05,
      "loss": 1.0451,
      "step": 166
    },
    {
      "epoch": 0.9156956819739548,
      "grad_norm": 0.8194270610855893,
      "learning_rate": 7.83122383606503e-05,
      "loss": 1.0159,
      "step": 167
    },
    {
      "epoch": 0.9211788896504455,
      "grad_norm": 0.8812702028458952,
      "learning_rate": 7.826785683309702e-05,
      "loss": 1.0021,
      "step": 168
    },
    {
      "epoch": 0.9266620973269363,
      "grad_norm": 1.2943748599092924,
      "learning_rate": 7.822291223144564e-05,
      "loss": 1.0332,
      "step": 169
    },
    {
      "epoch": 0.932145305003427,
      "grad_norm": 1.1900876172194863,
      "learning_rate": 7.817740521701204e-05,
      "loss": 1.0375,
      "step": 170
    },
    {
      "epoch": 0.9376285126799178,
      "grad_norm": 1.1226402889101894,
      "learning_rate": 7.813133645938744e-05,
      "loss": 1.0207,
      "step": 171
    },
    {
      "epoch": 0.9431117203564084,
      "grad_norm": 1.211346655033823,
      "learning_rate": 7.808470663642856e-05,
      "loss": 1.0076,
      "step": 172
    },
    {
      "epoch": 0.9485949280328992,
      "grad_norm": 1.0624211029554345,
      "learning_rate": 7.803751643424769e-05,
      "loss": 1.0283,
      "step": 173
    },
    {
      "epoch": 0.95407813570939,
      "grad_norm": 1.1373049276416176,
      "learning_rate": 7.798976654720248e-05,
      "loss": 1.0282,
      "step": 174
    },
    {
      "epoch": 0.9595613433858807,
      "grad_norm": 1.0179746542873978,
      "learning_rate": 7.794145767788582e-05,
      "loss": 1.0331,
      "step": 175
    },
    {
      "epoch": 0.9650445510623715,
      "grad_norm": 1.0947604624344023,
      "learning_rate": 7.789259053711554e-05,
      "loss": 1.0358,
      "step": 176
    },
    {
      "epoch": 0.9705277587388622,
      "grad_norm": 1.0275838636266867,
      "learning_rate": 7.784316584392379e-05,
      "loss": 1.0225,
      "step": 177
    },
    {
      "epoch": 0.976010966415353,
      "grad_norm": 1.0737698018245303,
      "learning_rate": 7.779318432554663e-05,
      "loss": 1.0251,
      "step": 178
    },
    {
      "epoch": 0.9814941740918437,
      "grad_norm": 1.3298644494187344,
      "learning_rate": 7.774264671741324e-05,
      "loss": 1.0141,
      "step": 179
    },
    {
      "epoch": 0.9869773817683345,
      "grad_norm": 1.1088674490608672,
      "learning_rate": 7.769155376313509e-05,
      "loss": 1.0118,
      "step": 180
    },
    {
      "epoch": 0.9924605894448252,
      "grad_norm": 1.3595288197056004,
      "learning_rate": 7.763990621449507e-05,
      "loss": 1.031,
      "step": 181
    },
    {
      "epoch": 0.997943797121316,
      "grad_norm": 0.9941472634431752,
      "learning_rate": 7.758770483143634e-05,
      "loss": 1.0233,
      "step": 182
    },
    {
      "epoch": 1.0034270047978067,
      "grad_norm": 1.8804626787303236,
      "learning_rate": 7.753495038205123e-05,
      "loss": 1.6808,
      "step": 183
    },
    {
      "epoch": 1.0089102124742975,
      "grad_norm": 1.0024508697092467,
      "learning_rate": 7.748164364256989e-05,
      "loss": 1.0053,
      "step": 184
    },
    {
      "epoch": 1.0143934201507883,
      "grad_norm": 1.093213840060089,
      "learning_rate": 7.742778539734884e-05,
      "loss": 0.9916,
      "step": 185
    },
    {
      "epoch": 1.019876627827279,
      "grad_norm": 1.325077555571521,
      "learning_rate": 7.737337643885956e-05,
      "loss": 1.009,
      "step": 186
    },
    {
      "epoch": 1.0253598355037696,
      "grad_norm": 0.7331108340009108,
      "learning_rate": 7.73184175676766e-05,
      "loss": 0.9944,
      "step": 187
    },
    {
      "epoch": 1.0308430431802604,
      "grad_norm": 1.1666619855541702,
      "learning_rate": 7.726290959246606e-05,
      "loss": 0.9985,
      "step": 188
    },
    {
      "epoch": 1.0363262508567512,
      "grad_norm": 0.6970782613004463,
      "learning_rate": 7.720685332997354e-05,
      "loss": 1.0002,
      "step": 189
    },
    {
      "epoch": 1.041809458533242,
      "grad_norm": 0.9459726357845105,
      "learning_rate": 7.715024960501209e-05,
      "loss": 0.9972,
      "step": 190
    },
    {
      "epoch": 1.0472926662097326,
      "grad_norm": 1.186575255220664,
      "learning_rate": 7.709309925045023e-05,
      "loss": 0.9978,
      "step": 191
    },
    {
      "epoch": 1.0527758738862234,
      "grad_norm": 1.1776591664787488,
      "learning_rate": 7.70354031071995e-05,
      "loss": 1.0161,
      "step": 192
    },
    {
      "epoch": 1.0582590815627142,
      "grad_norm": 1.1987709517142946,
      "learning_rate": 7.697716202420227e-05,
      "loss": 1.0126,
      "step": 193
    },
    {
      "epoch": 1.063742289239205,
      "grad_norm": 1.3719598675561742,
      "learning_rate": 7.691837685841913e-05,
      "loss": 1.003,
      "step": 194
    },
    {
      "epoch": 1.0692254969156956,
      "grad_norm": 0.7654739031004221,
      "learning_rate": 7.685904847481631e-05,
      "loss": 1.0008,
      "step": 195
    },
    {
      "epoch": 1.0747087045921864,
      "grad_norm": 0.8192349385868019,
      "learning_rate": 7.679917774635298e-05,
      "loss": 0.9927,
      "step": 196
    },
    {
      "epoch": 1.0801919122686772,
      "grad_norm": 0.9790278984629636,
      "learning_rate": 7.673876555396835e-05,
      "loss": 1.0003,
      "step": 197
    },
    {
      "epoch": 1.085675119945168,
      "grad_norm": 1.4459053549970198,
      "learning_rate": 7.667781278656879e-05,
      "loss": 0.9974,
      "step": 198
    },
    {
      "epoch": 1.0911583276216588,
      "grad_norm": 0.9511561601754845,
      "learning_rate": 7.661632034101466e-05,
      "loss": 0.9944,
      "step": 199
    },
    {
      "epoch": 1.0966415352981493,
      "grad_norm": 1.1385829528759341,
      "learning_rate": 7.655428912210718e-05,
      "loss": 1.0033,
      "step": 200
    },
    {
      "epoch": 1.1021247429746401,
      "grad_norm": 0.8947911908116508,
      "learning_rate": 7.64917200425751e-05,
      "loss": 1.0027,
      "step": 201
    },
    {
      "epoch": 1.107607950651131,
      "grad_norm": 1.0835036556236155,
      "learning_rate": 7.642861402306123e-05,
      "loss": 0.997,
      "step": 202
    },
    {
      "epoch": 1.1130911583276217,
      "grad_norm": 1.278714863843557,
      "learning_rate": 7.636497199210895e-05,
      "loss": 1.0021,
      "step": 203
    },
    {
      "epoch": 1.1185743660041123,
      "grad_norm": 1.0325169221569026,
      "learning_rate": 7.630079488614853e-05,
      "loss": 1.0109,
      "step": 204
    },
    {
      "epoch": 1.124057573680603,
      "grad_norm": 1.4333181628002185,
      "learning_rate": 7.623608364948334e-05,
      "loss": 1.0075,
      "step": 205
    },
    {
      "epoch": 1.129540781357094,
      "grad_norm": 0.757692830351415,
      "learning_rate": 7.617083923427596e-05,
      "loss": 0.9997,
      "step": 206
    },
    {
      "epoch": 1.1350239890335847,
      "grad_norm": 0.9052852702132793,
      "learning_rate": 7.610506260053415e-05,
      "loss": 0.9926,
      "step": 207
    },
    {
      "epoch": 1.1405071967100753,
      "grad_norm": 0.9541645139282031,
      "learning_rate": 7.603875471609677e-05,
      "loss": 1.0192,
      "step": 208
    },
    {
      "epoch": 1.145990404386566,
      "grad_norm": 1.0002368977221223,
      "learning_rate": 7.597191655661952e-05,
      "loss": 1.009,
      "step": 209
    },
    {
      "epoch": 1.1514736120630569,
      "grad_norm": 1.1460245071149242,
      "learning_rate": 7.590454910556058e-05,
      "loss": 1.0141,
      "step": 210
    },
    {
      "epoch": 1.1569568197395477,
      "grad_norm": 0.9540929426388082,
      "learning_rate": 7.583665335416608e-05,
      "loss": 0.9819,
      "step": 211
    },
    {
      "epoch": 1.1624400274160385,
      "grad_norm": 1.0926693153246279,
      "learning_rate": 7.576823030145566e-05,
      "loss": 0.9951,
      "step": 212
    },
    {
      "epoch": 1.167923235092529,
      "grad_norm": 1.1999013577213102,
      "learning_rate": 7.569928095420762e-05,
      "loss": 1.0109,
      "step": 213
    },
    {
      "epoch": 1.1734064427690198,
      "grad_norm": 1.233840712656987,
      "learning_rate": 7.562980632694421e-05,
      "loss": 0.9775,
      "step": 214
    },
    {
      "epoch": 1.1788896504455106,
      "grad_norm": 0.9301498473135285,
      "learning_rate": 7.555980744191666e-05,
      "loss": 0.9868,
      "step": 215
    },
    {
      "epoch": 1.1843728581220014,
      "grad_norm": 0.8903869662611502,
      "learning_rate": 7.548928532909006e-05,
      "loss": 0.9961,
      "step": 216
    },
    {
      "epoch": 1.1898560657984922,
      "grad_norm": 0.9584208142183749,
      "learning_rate": 7.541824102612839e-05,
      "loss": 0.9926,
      "step": 217
    },
    {
      "epoch": 1.1953392734749828,
      "grad_norm": 1.0412909874462872,
      "learning_rate": 7.534667557837912e-05,
      "loss": 1.0052,
      "step": 218
    },
    {
      "epoch": 1.2008224811514736,
      "grad_norm": 1.129567257526931,
      "learning_rate": 7.527459003885783e-05,
      "loss": 0.9869,
      "step": 219
    },
    {
      "epoch": 1.2063056888279644,
      "grad_norm": 1.0018364104469153,
      "learning_rate": 7.520198546823275e-05,
      "loss": 0.9994,
      "step": 220
    },
    {
      "epoch": 1.211788896504455,
      "grad_norm": 1.0854193900597513,
      "learning_rate": 7.512886293480914e-05,
      "loss": 0.991,
      "step": 221
    },
    {
      "epoch": 1.2172721041809458,
      "grad_norm": 0.8989902450272835,
      "learning_rate": 7.505522351451363e-05,
      "loss": 1.0011,
      "step": 222
    },
    {
      "epoch": 1.2227553118574366,
      "grad_norm": 0.8836210417059804,
      "learning_rate": 7.498106829087822e-05,
      "loss": 0.9869,
      "step": 223
    },
    {
      "epoch": 1.2282385195339274,
      "grad_norm": 0.973153600252484,
      "learning_rate": 7.490639835502458e-05,
      "loss": 0.9854,
      "step": 224
    },
    {
      "epoch": 1.2337217272104182,
      "grad_norm": 1.1549659666106655,
      "learning_rate": 7.483121480564779e-05,
      "loss": 0.9844,
      "step": 225
    },
    {
      "epoch": 1.2392049348869087,
      "grad_norm": 0.8467324247044801,
      "learning_rate": 7.475551874900027e-05,
      "loss": 0.9887,
      "step": 226
    },
    {
      "epoch": 1.2446881425633995,
      "grad_norm": 0.7527167647459392,
      "learning_rate": 7.467931129887548e-05,
      "loss": 0.9989,
      "step": 227
    },
    {
      "epoch": 1.2501713502398903,
      "grad_norm": 0.6353004936231207,
      "learning_rate": 7.460259357659155e-05,
      "loss": 0.9947,
      "step": 228
    },
    {
      "epoch": 1.2556545579163811,
      "grad_norm": 0.5519629422547577,
      "learning_rate": 7.452536671097476e-05,
      "loss": 0.9822,
      "step": 229
    },
    {
      "epoch": 1.261137765592872,
      "grad_norm": 0.5477181245472614,
      "learning_rate": 7.444763183834292e-05,
      "loss": 0.9686,
      "step": 230
    },
    {
      "epoch": 1.2666209732693625,
      "grad_norm": 0.587911630400038,
      "learning_rate": 7.436939010248868e-05,
      "loss": 0.9918,
      "step": 231
    },
    {
      "epoch": 1.2721041809458533,
      "grad_norm": 0.7128522867737196,
      "learning_rate": 7.429064265466269e-05,
      "loss": 0.9702,
      "step": 232
    },
    {
      "epoch": 1.2775873886223441,
      "grad_norm": 0.8502225365935777,
      "learning_rate": 7.421139065355663e-05,
      "loss": 0.9798,
      "step": 233
    },
    {
      "epoch": 1.2830705962988347,
      "grad_norm": 1.0187687136282666,
      "learning_rate": 7.413163526528623e-05,
      "loss": 0.9861,
      "step": 234
    },
    {
      "epoch": 1.2885538039753255,
      "grad_norm": 1.317692639157433,
      "learning_rate": 7.405137766337406e-05,
      "loss": 1.021,
      "step": 235
    },
    {
      "epoch": 1.2940370116518163,
      "grad_norm": 0.6501556938062086,
      "learning_rate": 7.397061902873223e-05,
      "loss": 0.9888,
      "step": 236
    },
    {
      "epoch": 1.299520219328307,
      "grad_norm": 0.673350620858405,
      "learning_rate": 7.388936054964512e-05,
      "loss": 0.9902,
      "step": 237
    },
    {
      "epoch": 1.3050034270047979,
      "grad_norm": 0.8504309551590528,
      "learning_rate": 7.38076034217518e-05,
      "loss": 0.9898,
      "step": 238
    },
    {
      "epoch": 1.3104866346812885,
      "grad_norm": 0.8625754977360135,
      "learning_rate": 7.372534884802844e-05,
      "loss": 1.0042,
      "step": 239
    },
    {
      "epoch": 1.3159698423577793,
      "grad_norm": 0.827745673637963,
      "learning_rate": 7.364259803877072e-05,
      "loss": 0.9943,
      "step": 240
    },
    {
      "epoch": 1.32145305003427,
      "grad_norm": 0.9138267229471313,
      "learning_rate": 7.355935221157584e-05,
      "loss": 0.9652,
      "step": 241
    },
    {
      "epoch": 1.3269362577107608,
      "grad_norm": 1.1403654208865603,
      "learning_rate": 7.347561259132479e-05,
      "loss": 1.003,
      "step": 242
    },
    {
      "epoch": 1.3324194653872516,
      "grad_norm": 0.9647824594303817,
      "learning_rate": 7.33913804101642e-05,
      "loss": 0.9986,
      "step": 243
    },
    {
      "epoch": 1.3379026730637422,
      "grad_norm": 0.9071068484077749,
      "learning_rate": 7.330665690748825e-05,
      "loss": 0.9755,
      "step": 244
    },
    {
      "epoch": 1.343385880740233,
      "grad_norm": 0.8147691285830508,
      "learning_rate": 7.322144332992047e-05,
      "loss": 0.9797,
      "step": 245
    },
    {
      "epoch": 1.3488690884167238,
      "grad_norm": 0.7044192353299353,
      "learning_rate": 7.313574093129532e-05,
      "loss": 0.9608,
      "step": 246
    },
    {
      "epoch": 1.3543522960932146,
      "grad_norm": 0.6346912465954524,
      "learning_rate": 7.30495509726398e-05,
      "loss": 1.0022,
      "step": 247
    },
    {
      "epoch": 1.3598355037697054,
      "grad_norm": 0.5573384012759859,
      "learning_rate": 7.29628747221549e-05,
      "loss": 1.0092,
      "step": 248
    },
    {
      "epoch": 1.365318711446196,
      "grad_norm": 0.43997105844804046,
      "learning_rate": 7.287571345519688e-05,
      "loss": 0.9738,
      "step": 249
    },
    {
      "epoch": 1.3708019191226868,
      "grad_norm": 0.4226306918439261,
      "learning_rate": 7.278806845425856e-05,
      "loss": 1.0058,
      "step": 250
    },
    {
      "epoch": 1.3762851267991776,
      "grad_norm": 0.42590762216429656,
      "learning_rate": 7.269994100895047e-05,
      "loss": 0.9818,
      "step": 251
    },
    {
      "epoch": 1.3817683344756682,
      "grad_norm": 0.4372777591442961,
      "learning_rate": 7.261133241598177e-05,
      "loss": 0.9751,
      "step": 252
    },
    {
      "epoch": 1.387251542152159,
      "grad_norm": 0.48142270762634554,
      "learning_rate": 7.25222439791413e-05,
      "loss": 0.9721,
      "step": 253
    },
    {
      "epoch": 1.3927347498286498,
      "grad_norm": 0.5632419671737571,
      "learning_rate": 7.24326770092783e-05,
      "loss": 0.9776,
      "step": 254
    },
    {
      "epoch": 1.3982179575051406,
      "grad_norm": 0.6684531492862478,
      "learning_rate": 7.234263282428312e-05,
      "loss": 0.9767,
      "step": 255
    },
    {
      "epoch": 1.4037011651816313,
      "grad_norm": 0.7980978087834616,
      "learning_rate": 7.225211274906795e-05,
      "loss": 0.9879,
      "step": 256
    },
    {
      "epoch": 1.409184372858122,
      "grad_norm": 0.7499439022421684,
      "learning_rate": 7.216111811554718e-05,
      "loss": 0.978,
      "step": 257
    },
    {
      "epoch": 1.4146675805346127,
      "grad_norm": 0.6727866867552594,
      "learning_rate": 7.206965026261787e-05,
      "loss": 0.9742,
      "step": 258
    },
    {
      "epoch": 1.4201507882111035,
      "grad_norm": 0.7741815435063508,
      "learning_rate": 7.197771053614006e-05,
      "loss": 0.9845,
      "step": 259
    },
    {
      "epoch": 1.4256339958875943,
      "grad_norm": 1.0192639583122494,
      "learning_rate": 7.188530028891691e-05,
      "loss": 0.971,
      "step": 260
    },
    {
      "epoch": 1.4311172035640851,
      "grad_norm": 1.2968135078305645,
      "learning_rate": 7.179242088067487e-05,
      "loss": 0.957,
      "step": 261
    },
    {
      "epoch": 1.4366004112405757,
      "grad_norm": 0.8088269542396387,
      "learning_rate": 7.169907367804363e-05,
      "loss": 0.9681,
      "step": 262
    },
    {
      "epoch": 1.4420836189170665,
      "grad_norm": 0.7292259902983808,
      "learning_rate": 7.160526005453599e-05,
      "loss": 0.9612,
      "step": 263
    },
    {
      "epoch": 1.4475668265935573,
      "grad_norm": 0.6525436860415045,
      "learning_rate": 7.151098139052772e-05,
      "loss": 0.9796,
      "step": 264
    },
    {
      "epoch": 1.4530500342700479,
      "grad_norm": 0.6556849626938652,
      "learning_rate": 7.141623907323717e-05,
      "loss": 0.9892,
      "step": 265
    },
    {
      "epoch": 1.4585332419465387,
      "grad_norm": 0.7140364123549611,
      "learning_rate": 7.13210344967049e-05,
      "loss": 0.9969,
      "step": 266
    },
    {
      "epoch": 1.4640164496230295,
      "grad_norm": 0.7556002996520803,
      "learning_rate": 7.122536906177318e-05,
      "loss": 0.9888,
      "step": 267
    },
    {
      "epoch": 1.4694996572995203,
      "grad_norm": 0.9242516425211802,
      "learning_rate": 7.112924417606536e-05,
      "loss": 0.9802,
      "step": 268
    },
    {
      "epoch": 1.474982864976011,
      "grad_norm": 0.882359584114108,
      "learning_rate": 7.103266125396512e-05,
      "loss": 0.9922,
      "step": 269
    },
    {
      "epoch": 1.4804660726525016,
      "grad_norm": 0.8835626803646567,
      "learning_rate": 7.093562171659577e-05,
      "loss": 0.9879,
      "step": 270
    },
    {
      "epoch": 1.4859492803289924,
      "grad_norm": 0.7764848657115064,
      "learning_rate": 7.083812699179919e-05,
      "loss": 0.9624,
      "step": 271
    },
    {
      "epoch": 1.4914324880054832,
      "grad_norm": 0.79015582202493,
      "learning_rate": 7.074017851411495e-05,
      "loss": 0.98,
      "step": 272
    },
    {
      "epoch": 1.496915695681974,
      "grad_norm": 0.9280976445345387,
      "learning_rate": 7.064177772475912e-05,
      "loss": 0.9873,
      "step": 273
    },
    {
      "epoch": 1.5023989033584648,
      "grad_norm": 1.0785228418807578,
      "learning_rate": 7.054292607160313e-05,
      "loss": 0.9869,
      "step": 274
    },
    {
      "epoch": 1.5078821110349554,
      "grad_norm": 0.9804408020889978,
      "learning_rate": 7.044362500915239e-05,
      "loss": 0.9587,
      "step": 275
    },
    {
      "epoch": 1.5133653187114462,
      "grad_norm": 0.9589974029042062,
      "learning_rate": 7.034387599852494e-05,
      "loss": 0.9668,
      "step": 276
    },
    {
      "epoch": 1.518848526387937,
      "grad_norm": 0.9828431201468845,
      "learning_rate": 7.024368050742996e-05,
      "loss": 0.9546,
      "step": 277
    },
    {
      "epoch": 1.5243317340644276,
      "grad_norm": 0.9871575188624485,
      "learning_rate": 7.014304001014614e-05,
      "loss": 0.9664,
      "step": 278
    },
    {
      "epoch": 1.5298149417409186,
      "grad_norm": 0.8979909747965638,
      "learning_rate": 7.004195598749997e-05,
      "loss": 0.9695,
      "step": 279
    },
    {
      "epoch": 1.5352981494174092,
      "grad_norm": 0.7418334820509743,
      "learning_rate": 6.994042992684406e-05,
      "loss": 0.977,
      "step": 280
    },
    {
      "epoch": 1.5407813570939,
      "grad_norm": 0.646604829727326,
      "learning_rate": 6.983846332203508e-05,
      "loss": 0.9641,
      "step": 281
    },
    {
      "epoch": 1.5462645647703908,
      "grad_norm": 0.6492561204321957,
      "learning_rate": 6.973605767341194e-05,
      "loss": 0.976,
      "step": 282
    },
    {
      "epoch": 1.5517477724468813,
      "grad_norm": 0.6153278256285984,
      "learning_rate": 6.963321448777367e-05,
      "loss": 0.9865,
      "step": 283
    },
    {
      "epoch": 1.5572309801233721,
      "grad_norm": 0.5549712659756844,
      "learning_rate": 6.952993527835714e-05,
      "loss": 0.9869,
      "step": 284
    },
    {
      "epoch": 1.562714187799863,
      "grad_norm": 0.5609075426544077,
      "learning_rate": 6.942622156481498e-05,
      "loss": 1.0023,
      "step": 285
    },
    {
      "epoch": 1.5681973954763535,
      "grad_norm": 0.4582297245144445,
      "learning_rate": 6.932207487319305e-05,
      "loss": 0.9726,
      "step": 286
    },
    {
      "epoch": 1.5736806031528445,
      "grad_norm": 0.41570799728735885,
      "learning_rate": 6.921749673590813e-05,
      "loss": 0.9836,
      "step": 287
    },
    {
      "epoch": 1.579163810829335,
      "grad_norm": 0.4801333642192195,
      "learning_rate": 6.911248869172523e-05,
      "loss": 0.9771,
      "step": 288
    },
    {
      "epoch": 1.584647018505826,
      "grad_norm": 0.44443180188094267,
      "learning_rate": 6.900705228573507e-05,
      "loss": 0.9624,
      "step": 289
    },
    {
      "epoch": 1.5901302261823167,
      "grad_norm": 0.38420949805748655,
      "learning_rate": 6.890118906933126e-05,
      "loss": 0.9659,
      "step": 290
    },
    {
      "epoch": 1.5956134338588073,
      "grad_norm": 0.40704833537993784,
      "learning_rate": 6.879490060018754e-05,
      "loss": 0.9718,
      "step": 291
    },
    {
      "epoch": 1.6010966415352983,
      "grad_norm": 0.4595754745391335,
      "learning_rate": 6.86881884422348e-05,
      "loss": 0.9511,
      "step": 292
    },
    {
      "epoch": 1.6065798492117889,
      "grad_norm": 0.4915604118827005,
      "learning_rate": 6.858105416563812e-05,
      "loss": 0.9443,
      "step": 293
    },
    {
      "epoch": 1.6120630568882797,
      "grad_norm": 0.550923844804336,
      "learning_rate": 6.847349934677363e-05,
      "loss": 0.9767,
      "step": 294
    },
    {
      "epoch": 1.6175462645647705,
      "grad_norm": 0.7101863814679902,
      "learning_rate": 6.836552556820533e-05,
      "loss": 0.978,
      "step": 295
    },
    {
      "epoch": 1.623029472241261,
      "grad_norm": 1.027183744139685,
      "learning_rate": 6.82571344186618e-05,
      "loss": 0.9752,
      "step": 296
    },
    {
      "epoch": 1.6285126799177518,
      "grad_norm": 1.2211181587902824,
      "learning_rate": 6.814832749301285e-05,
      "loss": 0.965,
      "step": 297
    },
    {
      "epoch": 1.6339958875942426,
      "grad_norm": 0.6874293489066229,
      "learning_rate": 6.803910639224598e-05,
      "loss": 0.9889,
      "step": 298
    },
    {
      "epoch": 1.6394790952707334,
      "grad_norm": 0.5130636301237326,
      "learning_rate": 6.792947272344292e-05,
      "loss": 0.9804,
      "step": 299
    },
    {
      "epoch": 1.6449623029472242,
      "grad_norm": 0.5698007874640287,
      "learning_rate": 6.78194280997559e-05,
      "loss": 1.0037,
      "step": 300
    },
    {
      "epoch": 1.6504455106237148,
      "grad_norm": 0.525737677225322,
      "learning_rate": 6.770897414038398e-05,
      "loss": 0.965,
      "step": 301
    },
    {
      "epoch": 1.6559287183002056,
      "grad_norm": 0.42725487684573776,
      "learning_rate": 6.759811247054918e-05,
      "loss": 0.984,
      "step": 302
    },
    {
      "epoch": 1.6614119259766964,
      "grad_norm": 0.44161725440676974,
      "learning_rate": 6.748684472147255e-05,
      "loss": 0.9561,
      "step": 303
    },
    {
      "epoch": 1.666895133653187,
      "grad_norm": 0.475656163202237,
      "learning_rate": 6.737517253035027e-05,
      "loss": 0.979,
      "step": 304
    },
    {
      "epoch": 1.672378341329678,
      "grad_norm": 0.4354144881971005,
      "learning_rate": 6.726309754032942e-05,
      "loss": 0.9762,
      "step": 305
    },
    {
      "epoch": 1.6778615490061686,
      "grad_norm": 0.45567387511294216,
      "learning_rate": 6.715062140048392e-05,
      "loss": 0.9885,
      "step": 306
    },
    {
      "epoch": 1.6833447566826594,
      "grad_norm": 0.4702154465668684,
      "learning_rate": 6.703774576579018e-05,
      "loss": 0.9524,
      "step": 307
    },
    {
      "epoch": 1.6888279643591502,
      "grad_norm": 0.5144836425148174,
      "learning_rate": 6.69244722971028e-05,
      "loss": 0.9542,
      "step": 308
    },
    {
      "epoch": 1.6943111720356407,
      "grad_norm": 0.6222104516357243,
      "learning_rate": 6.681080266113017e-05,
      "loss": 0.9716,
      "step": 309
    },
    {
      "epoch": 1.6997943797121315,
      "grad_norm": 0.8012284848489711,
      "learning_rate": 6.669673853040979e-05,
      "loss": 0.9732,
      "step": 310
    },
    {
      "epoch": 1.7052775873886223,
      "grad_norm": 0.9289507556170572,
      "learning_rate": 6.658228158328384e-05,
      "loss": 0.9827,
      "step": 311
    },
    {
      "epoch": 1.7107607950651131,
      "grad_norm": 0.8593463233073516,
      "learning_rate": 6.646743350387438e-05,
      "loss": 1.0011,
      "step": 312
    },
    {
      "epoch": 1.716244002741604,
      "grad_norm": 0.7853699004801804,
      "learning_rate": 6.635219598205863e-05,
      "loss": 0.9923,
      "step": 313
    },
    {
      "epoch": 1.7217272104180945,
      "grad_norm": 0.8155772053440721,
      "learning_rate": 6.623657071344407e-05,
      "loss": 0.9474,
      "step": 314
    },
    {
      "epoch": 1.7272104180945853,
      "grad_norm": 0.8677794295717961,
      "learning_rate": 6.61205593993434e-05,
      "loss": 0.9988,
      "step": 315
    },
    {
      "epoch": 1.732693625771076,
      "grad_norm": 0.9646151142036712,
      "learning_rate": 6.600416374674978e-05,
      "loss": 0.9723,
      "step": 316
    },
    {
      "epoch": 1.7381768334475667,
      "grad_norm": 1.054808954053776,
      "learning_rate": 6.588738546831136e-05,
      "loss": 0.9919,
      "step": 317
    },
    {
      "epoch": 1.7436600411240577,
      "grad_norm": 0.9159951934744626,
      "learning_rate": 6.577022628230638e-05,
      "loss": 0.9793,
      "step": 318
    },
    {
      "epoch": 1.7491432488005483,
      "grad_norm": 0.9253971678191606,
      "learning_rate": 6.565268791261769e-05,
      "loss": 0.9786,
      "step": 319
    },
    {
      "epoch": 1.754626456477039,
      "grad_norm": 0.8302909529818902,
      "learning_rate": 6.553477208870748e-05,
      "loss": 0.9892,
      "step": 320
    },
    {
      "epoch": 1.7601096641535299,
      "grad_norm": 0.6921147371125702,
      "learning_rate": 6.541648054559182e-05,
      "loss": 0.9947,
      "step": 321
    },
    {
      "epoch": 1.7655928718300204,
      "grad_norm": 0.5132394155691594,
      "learning_rate": 6.529781502381509e-05,
      "loss": 0.9824,
      "step": 322
    },
    {
      "epoch": 1.7710760795065115,
      "grad_norm": 0.39683272073533354,
      "learning_rate": 6.517877726942445e-05,
      "loss": 0.9627,
      "step": 323
    },
    {
      "epoch": 1.776559287183002,
      "grad_norm": 0.3842764915321119,
      "learning_rate": 6.505936903394406e-05,
      "loss": 0.967,
      "step": 324
    },
    {
      "epoch": 1.7820424948594928,
      "grad_norm": 0.5276096962930575,
      "learning_rate": 6.493959207434934e-05,
      "loss": 0.9949,
      "step": 325
    },
    {
      "epoch": 1.7875257025359836,
      "grad_norm": 1.6686008806421238,
      "learning_rate": 6.481944815304117e-05,
      "loss": 0.9627,
      "step": 326
    },
    {
      "epoch": 1.7930089102124742,
      "grad_norm": 0.5800400386093595,
      "learning_rate": 6.469893903781987e-05,
      "loss": 0.9762,
      "step": 327
    },
    {
      "epoch": 1.798492117888965,
      "grad_norm": 0.7762268681851582,
      "learning_rate": 6.457806650185925e-05,
      "loss": 0.9889,
      "step": 328
    },
    {
      "epoch": 1.8039753255654558,
      "grad_norm": 1.0162096674295886,
      "learning_rate": 6.44568323236805e-05,
      "loss": 0.9551,
      "step": 329
    },
    {
      "epoch": 1.8094585332419464,
      "grad_norm": 1.106898918634063,
      "learning_rate": 6.433523828712599e-05,
      "loss": 0.9845,
      "step": 330
    },
    {
      "epoch": 1.8149417409184374,
      "grad_norm": 0.6996155645379781,
      "learning_rate": 6.421328618133312e-05,
      "loss": 0.9507,
      "step": 331
    },
    {
      "epoch": 1.820424948594928,
      "grad_norm": 0.5559169037599929,
      "learning_rate": 6.409097780070789e-05,
      "loss": 0.983,
      "step": 332
    },
    {
      "epoch": 1.8259081562714188,
      "grad_norm": 0.576769546596351,
      "learning_rate": 6.396831494489852e-05,
      "loss": 0.9705,
      "step": 333
    },
    {
      "epoch": 1.8313913639479096,
      "grad_norm": 0.6267959641520932,
      "learning_rate": 6.384529941876902e-05,
      "loss": 0.9735,
      "step": 334
    },
    {
      "epoch": 1.8368745716244002,
      "grad_norm": 0.7904474034759853,
      "learning_rate": 6.372193303237258e-05,
      "loss": 0.9864,
      "step": 335
    },
    {
      "epoch": 1.8423577793008912,
      "grad_norm": 0.7718780940719864,
      "learning_rate": 6.359821760092493e-05,
      "loss": 0.9853,
      "step": 336
    },
    {
      "epoch": 1.8478409869773817,
      "grad_norm": 0.8233407480440916,
      "learning_rate": 6.347415494477771e-05,
      "loss": 0.9626,
      "step": 337
    },
    {
      "epoch": 1.8533241946538725,
      "grad_norm": 0.5327118987154937,
      "learning_rate": 6.334974688939161e-05,
      "loss": 0.9651,
      "step": 338
    },
    {
      "epoch": 1.8588074023303633,
      "grad_norm": 1.1227813676374583,
      "learning_rate": 6.322499526530951e-05,
      "loss": 0.9606,
      "step": 339
    },
    {
      "epoch": 1.864290610006854,
      "grad_norm": 0.5044513236476138,
      "learning_rate": 6.30999019081296e-05,
      "loss": 0.9706,
      "step": 340
    },
    {
      "epoch": 1.8697738176833447,
      "grad_norm": 0.5297934306596983,
      "learning_rate": 6.297446865847833e-05,
      "loss": 0.9598,
      "step": 341
    },
    {
      "epoch": 1.8752570253598355,
      "grad_norm": 0.7214246533398037,
      "learning_rate": 6.284869736198332e-05,
      "loss": 0.9842,
      "step": 342
    },
    {
      "epoch": 1.880740233036326,
      "grad_norm": 11.761916238581446,
      "learning_rate": 6.272258986924624e-05,
      "loss": 0.9694,
      "step": 343
    },
    {
      "epoch": 1.8862234407128171,
      "grad_norm": 0.9513953280967964,
      "learning_rate": 6.259614803581553e-05,
      "loss": 0.9701,
      "step": 344
    },
    {
      "epoch": 1.8917066483893077,
      "grad_norm": 1.4907222296637521,
      "learning_rate": 6.246937372215916e-05,
      "loss": 1.0022,
      "step": 345
    },
    {
      "epoch": 1.8971898560657985,
      "grad_norm": 0.7017698496988469,
      "learning_rate": 6.23422687936372e-05,
      "loss": 0.9865,
      "step": 346
    },
    {
      "epoch": 1.9026730637422893,
      "grad_norm": 1.1225404349021637,
      "learning_rate": 6.22148351204744e-05,
      "loss": 0.9972,
      "step": 347
    },
    {
      "epoch": 1.9081562714187799,
      "grad_norm": 0.903080574939645,
      "learning_rate": 6.208707457773267e-05,
      "loss": 0.9712,
      "step": 348
    },
    {
      "epoch": 1.9136394790952709,
      "grad_norm": 0.8868757615913091,
      "learning_rate": 6.195898904528346e-05,
      "loss": 0.9749,
      "step": 349
    },
    {
      "epoch": 1.9191226867717615,
      "grad_norm": 0.8643175740633899,
      "learning_rate": 6.183058040778018e-05,
      "loss": 0.977,
      "step": 350
    },
    {
      "epoch": 1.9246058944482523,
      "grad_norm": 0.8365519935712682,
      "learning_rate": 6.170185055463039e-05,
      "loss": 0.9715,
      "step": 351
    },
    {
      "epoch": 1.930089102124743,
      "grad_norm": 0.7005503452418033,
      "learning_rate": 6.157280137996797e-05,
      "loss": 0.9615,
      "step": 352
    },
    {
      "epoch": 1.9355723098012336,
      "grad_norm": 0.6181080954802065,
      "learning_rate": 6.14434347826254e-05,
      "loss": 0.9713,
      "step": 353
    },
    {
      "epoch": 1.9410555174777244,
      "grad_norm": 0.550645640988408,
      "learning_rate": 6.131375266610564e-05,
      "loss": 0.9918,
      "step": 354
    },
    {
      "epoch": 1.9465387251542152,
      "grad_norm": 0.41477856958058457,
      "learning_rate": 6.118375693855426e-05,
      "loss": 0.9754,
      "step": 355
    },
    {
      "epoch": 1.9520219328307058,
      "grad_norm": 0.5259660715406183,
      "learning_rate": 6.10534495127313e-05,
      "loss": 0.964,
      "step": 356
    },
    {
      "epoch": 1.9575051405071968,
      "grad_norm": 0.51325398934719,
      "learning_rate": 6.092283230598311e-05,
      "loss": 0.9776,
      "step": 357
    },
    {
      "epoch": 1.9629883481836874,
      "grad_norm": 0.45189992011139346,
      "learning_rate": 6.079190724021418e-05,
      "loss": 0.9789,
      "step": 358
    },
    {
      "epoch": 1.9684715558601782,
      "grad_norm": 0.36793149019331095,
      "learning_rate": 6.066067624185886e-05,
      "loss": 0.9618,
      "step": 359
    },
    {
      "epoch": 1.973954763536669,
      "grad_norm": 0.31587485174242214,
      "learning_rate": 6.0529141241852974e-05,
      "loss": 0.9722,
      "step": 360
    },
    {
      "epoch": 1.9794379712131596,
      "grad_norm": 0.33408379397531063,
      "learning_rate": 6.0397304175605444e-05,
      "loss": 0.957,
      "step": 361
    },
    {
      "epoch": 1.9849211788896506,
      "grad_norm": 0.32925216719938283,
      "learning_rate": 6.026516698296979e-05,
      "loss": 0.9866,
      "step": 362
    },
    {
      "epoch": 1.9904043865661412,
      "grad_norm": 0.34707795848400175,
      "learning_rate": 6.0132731608215626e-05,
      "loss": 0.9689,
      "step": 363
    },
    {
      "epoch": 1.995887594242632,
      "grad_norm": 0.33705532934899174,
      "learning_rate": 6.000000000000001e-05,
      "loss": 0.9903,
      "step": 364
    },
    {
      "epoch": 2.0013708019191228,
      "grad_norm": 0.5509370965784027,
      "learning_rate": 5.9866974111338764e-05,
      "loss": 1.5285,
      "step": 365
    },
    {
      "epoch": 2.0068540095956133,
      "grad_norm": 0.8014891517218886,
      "learning_rate": 5.973365589957777e-05,
      "loss": 0.9398,
      "step": 366
    },
    {
      "epoch": 2.0123372172721044,
      "grad_norm": 0.923712638409977,
      "learning_rate": 5.96000473263642e-05,
      "loss": 0.9426,
      "step": 367
    },
    {
      "epoch": 2.017820424948595,
      "grad_norm": 1.1253551776796877,
      "learning_rate": 5.946615035761756e-05,
      "loss": 0.9344,
      "step": 368
    },
    {
      "epoch": 2.0233036326250855,
      "grad_norm": 0.8495067076997084,
      "learning_rate": 5.9331966963500825e-05,
      "loss": 0.9225,
      "step": 369
    },
    {
      "epoch": 2.0287868403015765,
      "grad_norm": 0.6769916969341175,
      "learning_rate": 5.919749911839146e-05,
      "loss": 0.9648,
      "step": 370
    },
    {
      "epoch": 2.034270047978067,
      "grad_norm": 0.6140664116420409,
      "learning_rate": 5.9062748800852315e-05,
|
"loss": 0.9294, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.039753255654558, |
|
"grad_norm": 0.5658727809484153, |
|
"learning_rate": 5.892771799360258e-05, |
|
"loss": 0.9605, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.0452364633310487, |
|
"grad_norm": 0.5196234739220065, |
|
"learning_rate": 5.879240868348857e-05, |
|
"loss": 0.9203, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.0507196710075393, |
|
"grad_norm": 0.5322458101673834, |
|
"learning_rate": 5.865682286145446e-05, |
|
"loss": 0.9314, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.0562028786840303, |
|
"grad_norm": 0.4644574297552796, |
|
"learning_rate": 5.852096252251308e-05, |
|
"loss": 0.9348, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.061686086360521, |
|
"grad_norm": 0.45336140839584105, |
|
"learning_rate": 5.8384829665716475e-05, |
|
"loss": 0.9012, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.0671692940370114, |
|
"grad_norm": 0.49383619643504, |
|
"learning_rate": 5.824842629412653e-05, |
|
"loss": 0.9474, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.0726525017135025, |
|
"grad_norm": 0.4292299087159375, |
|
"learning_rate": 5.8111754414785504e-05, |
|
"loss": 0.8999, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.078135709389993, |
|
"grad_norm": 0.45513934186994226, |
|
"learning_rate": 5.797481603868646e-05, |
|
"loss": 0.93, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.083618917066484, |
|
"grad_norm": 0.4172253800200807, |
|
"learning_rate": 5.783761318074373e-05, |
|
"loss": 0.9297, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.0891021247429746, |
|
"grad_norm": 0.38216018087870474, |
|
"learning_rate": 5.770014785976322e-05, |
|
"loss": 0.9301, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.094585332419465, |
|
"grad_norm": 0.41108325844695326, |
|
"learning_rate": 5.756242209841272e-05, |
|
"loss": 0.9411, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.1000685400959562, |
|
"grad_norm": 0.374955676487953, |
|
"learning_rate": 5.742443792319216e-05, |
|
"loss": 0.9456, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.105551747772447, |
|
"grad_norm": 0.3696956544453686, |
|
"learning_rate": 5.728619736440375e-05, |
|
"loss": 0.9311, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.111034955448938, |
|
"grad_norm": 0.3087882400959682, |
|
"learning_rate": 5.714770245612217e-05, |
|
"loss": 0.945, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.1165181631254284, |
|
"grad_norm": 0.30607453448133787, |
|
"learning_rate": 5.700895523616459e-05, |
|
"loss": 0.9091, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.122001370801919, |
|
"grad_norm": 0.2923100493330831, |
|
"learning_rate": 5.6869957746060675e-05, |
|
"loss": 0.9305, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.12748457847841, |
|
"grad_norm": 0.32231037787471134, |
|
"learning_rate": 5.673071203102261e-05, |
|
"loss": 0.9413, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.1329677861549006, |
|
"grad_norm": 0.2847369225478741, |
|
"learning_rate": 5.6591220139914945e-05, |
|
"loss": 0.9348, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.138450993831391, |
|
"grad_norm": 0.26712566361223783, |
|
"learning_rate": 5.645148412522447e-05, |
|
"loss": 0.9304, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.143934201507882, |
|
"grad_norm": 0.2826845326442997, |
|
"learning_rate": 5.6311506043030006e-05, |
|
"loss": 0.9246, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.1494174091843727, |
|
"grad_norm": 0.2608630386687756, |
|
"learning_rate": 5.61712879529722e-05, |
|
"loss": 0.9207, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.1549006168608638, |
|
"grad_norm": 0.24817298190213744, |
|
"learning_rate": 5.6030831918223136e-05, |
|
"loss": 0.9354, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.1603838245373543, |
|
"grad_norm": 0.24967764451032934, |
|
"learning_rate": 5.5890140005456056e-05, |
|
"loss": 0.9084, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.165867032213845, |
|
"grad_norm": 0.2913323358304108, |
|
"learning_rate": 5.574921428481487e-05, |
|
"loss": 0.9198, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.171350239890336, |
|
"grad_norm": 0.28197622116779403, |
|
"learning_rate": 5.5608056829883796e-05, |
|
"loss": 0.9132, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.1768334475668265, |
|
"grad_norm": 0.7044465587321251, |
|
"learning_rate": 5.546666971765675e-05, |
|
"loss": 0.9402, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.1823166552433175, |
|
"grad_norm": 0.3486456432299187, |
|
"learning_rate": 5.532505502850688e-05, |
|
"loss": 0.9283, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.187799862919808, |
|
"grad_norm": 0.31689657334324073, |
|
"learning_rate": 5.5183214846155864e-05, |
|
"loss": 0.9453, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.1932830705962987, |
|
"grad_norm": 0.3211368338675237, |
|
"learning_rate": 5.504115125764329e-05, |
|
"loss": 0.9341, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.1987662782727897, |
|
"grad_norm": 0.3301868483523455, |
|
"learning_rate": 5.489886635329598e-05, |
|
"loss": 0.9159, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.2042494859492803, |
|
"grad_norm": 0.3873879810717108, |
|
"learning_rate": 5.4756362226697193e-05, |
|
"loss": 0.919, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.209732693625771, |
|
"grad_norm": 0.536935741788054, |
|
"learning_rate": 5.461364097465581e-05, |
|
"loss": 0.9501, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.215215901302262, |
|
"grad_norm": 0.7438640921551698, |
|
"learning_rate": 5.447070469717552e-05, |
|
"loss": 0.9265, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.2206991089787524, |
|
"grad_norm": 0.96824001629804, |
|
"learning_rate": 5.4327555497423874e-05, |
|
"loss": 0.9518, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.2261823166552435, |
|
"grad_norm": 1.1220442657897374, |
|
"learning_rate": 5.4184195481701425e-05, |
|
"loss": 0.9497, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.231665524331734, |
|
"grad_norm": 0.623316413195338, |
|
"learning_rate": 5.4040626759410625e-05, |
|
"loss": 0.9321, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.2371487320082246, |
|
"grad_norm": 0.3490982695093315, |
|
"learning_rate": 5.3896851443024837e-05, |
|
"loss": 0.9418, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.2426319396847156, |
|
"grad_norm": 0.6301408394566936, |
|
"learning_rate": 5.375287164805727e-05, |
|
"loss": 0.9554, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.248115147361206, |
|
"grad_norm": 0.7454695087115517, |
|
"learning_rate": 5.360868949302986e-05, |
|
"loss": 0.9132, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.2535983550376972, |
|
"grad_norm": 0.6120004710459457, |
|
"learning_rate": 5.3464307099442035e-05, |
|
"loss": 0.9261, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.259081562714188, |
|
"grad_norm": 0.44905767110087014, |
|
"learning_rate": 5.3319726591739536e-05, |
|
"loss": 0.9315, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.2645647703906784, |
|
"grad_norm": 3.4631254224930954, |
|
"learning_rate": 5.317495009728319e-05, |
|
"loss": 0.9336, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.2700479780671694, |
|
"grad_norm": 0.7088303944578741, |
|
"learning_rate": 5.302997974631757e-05, |
|
"loss": 0.9363, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.27553118574366, |
|
"grad_norm": 1.0280802158648574, |
|
"learning_rate": 5.288481767193963e-05, |
|
"loss": 0.9459, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.2810143934201506, |
|
"grad_norm": 0.758076517723585, |
|
"learning_rate": 5.2739466010067385e-05, |
|
"loss": 0.92, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.2864976010966416, |
|
"grad_norm": 0.6992773960171627, |
|
"learning_rate": 5.259392689940841e-05, |
|
"loss": 0.9155, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.291980808773132, |
|
"grad_norm": 0.5843686158243258, |
|
"learning_rate": 5.244820248142844e-05, |
|
"loss": 0.9209, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.297464016449623, |
|
"grad_norm": 0.5442980791956826, |
|
"learning_rate": 5.2302294900319796e-05, |
|
"loss": 0.945, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.3029472241261137, |
|
"grad_norm": 0.6550866194211868, |
|
"learning_rate": 5.215620630296988e-05, |
|
"loss": 0.9338, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.3084304318026048, |
|
"grad_norm": 0.5000014417009809, |
|
"learning_rate": 5.200993883892956e-05, |
|
"loss": 0.9347, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.3139136394790953, |
|
"grad_norm": 0.39164110972089616, |
|
"learning_rate": 5.1863494660381586e-05, |
|
"loss": 0.9389, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.319396847155586, |
|
"grad_norm": 0.43768247813945993, |
|
"learning_rate": 5.1716875922108836e-05, |
|
"loss": 0.9268, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.324880054832077, |
|
"grad_norm": 0.42576227430770175, |
|
"learning_rate": 5.1570084781462716e-05, |
|
"loss": 0.9556, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.3303632625085675, |
|
"grad_norm": 0.6202378063686813, |
|
"learning_rate": 5.142312339833131e-05, |
|
"loss": 0.9387, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.335846470185058, |
|
"grad_norm": 0.8376703889718673, |
|
"learning_rate": 5.1275993935107714e-05, |
|
"loss": 0.9229, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.341329677861549, |
|
"grad_norm": 0.547147928254462, |
|
"learning_rate": 5.112869855665811e-05, |
|
"loss": 0.9435, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.3468128855380397, |
|
"grad_norm": 0.8489497679798008, |
|
"learning_rate": 5.098123943028999e-05, |
|
"loss": 0.9356, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.3522960932145303, |
|
"grad_norm": 1.0146858088048836, |
|
"learning_rate": 5.0833618725720214e-05, |
|
"loss": 0.9448, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.3577793008910213, |
|
"grad_norm": 0.9115491187292197, |
|
"learning_rate": 5.0685838615043124e-05, |
|
"loss": 0.9244, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.363262508567512, |
|
"grad_norm": 0.6958313840237238, |
|
"learning_rate": 5.053790127269855e-05, |
|
"loss": 0.9129, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.368745716244003, |
|
"grad_norm": 0.4096200646347484, |
|
"learning_rate": 5.038980887543987e-05, |
|
"loss": 0.941, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.3742289239204935, |
|
"grad_norm": 0.412293973916545, |
|
"learning_rate": 5.024156360230189e-05, |
|
"loss": 0.9507, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.3797121315969845, |
|
"grad_norm": 0.6391777676946643, |
|
"learning_rate": 5.0093167634568874e-05, |
|
"loss": 0.9406, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.385195339273475, |
|
"grad_norm": 0.5793400631623677, |
|
"learning_rate": 4.9944623155742395e-05, |
|
"loss": 0.9144, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.3906785469499656, |
|
"grad_norm": 0.4118952919161777, |
|
"learning_rate": 4.979593235150924e-05, |
|
"loss": 0.9268, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.3961617546264566, |
|
"grad_norm": 0.32507791500349614, |
|
"learning_rate": 4.9647097409709186e-05, |
|
"loss": 0.9236, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.401644962302947, |
|
"grad_norm": 0.42100621017995543, |
|
"learning_rate": 4.94981205203029e-05, |
|
"loss": 0.9331, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.407128169979438, |
|
"grad_norm": 0.4843025816546595, |
|
"learning_rate": 4.934900387533965e-05, |
|
"loss": 0.9262, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.412611377655929, |
|
"grad_norm": 0.43099193801496954, |
|
"learning_rate": 4.9199749668925076e-05, |
|
"loss": 0.9391, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.4180945853324194, |
|
"grad_norm": 0.35246739703930285, |
|
"learning_rate": 4.9050360097188904e-05, |
|
"loss": 0.9349, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.42357779300891, |
|
"grad_norm": 0.2960781579764771, |
|
"learning_rate": 4.890083735825258e-05, |
|
"loss": 0.924, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.429061000685401, |
|
"grad_norm": 0.3742112117445952, |
|
"learning_rate": 4.875118365219706e-05, |
|
"loss": 0.9579, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.4345442083618916, |
|
"grad_norm": 0.3181326873253854, |
|
"learning_rate": 4.86014011810303e-05, |
|
"loss": 0.9205, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.4400274160383826, |
|
"grad_norm": 0.24053315100044398, |
|
"learning_rate": 4.845149214865491e-05, |
|
"loss": 0.9249, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.445510623714873, |
|
"grad_norm": 0.26885994692676984, |
|
"learning_rate": 4.830145876083575e-05, |
|
"loss": 0.9531, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.450993831391364, |
|
"grad_norm": 0.29839596806827645, |
|
"learning_rate": 4.81513032251674e-05, |
|
"loss": 0.9174, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.4564770390678548, |
|
"grad_norm": 0.2762827113377106, |
|
"learning_rate": 4.8001027751041784e-05, |
|
"loss": 0.9343, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.4619602467443453, |
|
"grad_norm": 0.24027837810919236, |
|
"learning_rate": 4.785063454961557e-05, |
|
"loss": 0.9542, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.4674434544208363, |
|
"grad_norm": 0.2538850685924787, |
|
"learning_rate": 4.7700125833777664e-05, |
|
"loss": 0.9244, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.472926662097327, |
|
"grad_norm": 0.24512740278452222, |
|
"learning_rate": 4.754950381811667e-05, |
|
"loss": 0.9371, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.4784098697738175, |
|
"grad_norm": 0.239290729128185, |
|
"learning_rate": 4.7398770718888296e-05, |
|
"loss": 0.9375, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.4838930774503085, |
|
"grad_norm": 0.23389680643967006, |
|
"learning_rate": 4.724792875398271e-05, |
|
"loss": 0.9079, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.489376285126799, |
|
"grad_norm": 0.24948455327274627, |
|
"learning_rate": 4.7096980142891936e-05, |
|
"loss": 0.9156, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.49485949280329, |
|
"grad_norm": 0.25385928315154366, |
|
"learning_rate": 4.694592710667723e-05, |
|
"loss": 0.9229, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.5003427004797807, |
|
"grad_norm": 0.22058160152481016, |
|
"learning_rate": 4.6794771867936286e-05, |
|
"loss": 0.9232, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.5058259081562713, |
|
"grad_norm": 0.1895088707996232, |
|
"learning_rate": 4.66435166507707e-05, |
|
"loss": 0.9419, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.5113091158327623, |
|
"grad_norm": 0.21080916833828472, |
|
"learning_rate": 4.6492163680753096e-05, |
|
"loss": 0.9211, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.516792323509253, |
|
"grad_norm": 0.24082839990262048, |
|
"learning_rate": 4.634071518489443e-05, |
|
"loss": 0.9168, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.522275531185744, |
|
"grad_norm": 0.27953124242683386, |
|
"learning_rate": 4.618917339161125e-05, |
|
"loss": 0.9323, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.5277587388622345, |
|
"grad_norm": 0.23606576570483712, |
|
"learning_rate": 4.6037540530692905e-05, |
|
"loss": 0.9396, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.533241946538725, |
|
"grad_norm": 0.23931407918785375, |
|
"learning_rate": 4.588581883326865e-05, |
|
"loss": 0.9265, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.538725154215216, |
|
"grad_norm": 0.2610767702353431, |
|
"learning_rate": 4.573401053177494e-05, |
|
"loss": 0.9294, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.5442083618917066, |
|
"grad_norm": 0.2501239011806691, |
|
"learning_rate": 4.558211785992251e-05, |
|
"loss": 0.9364, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.549691569568197, |
|
"grad_norm": 0.23499679030752327, |
|
"learning_rate": 4.543014305266352e-05, |
|
"loss": 0.9615, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.5551747772446882, |
|
"grad_norm": 0.2475408432558344, |
|
"learning_rate": 4.5278088346158665e-05, |
|
"loss": 0.9268, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.560657984921179, |
|
"grad_norm": 0.28708807149382903, |
|
"learning_rate": 4.512595597774427e-05, |
|
"loss": 0.9273, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.5661411925976694, |
|
"grad_norm": 0.2718981937349268, |
|
"learning_rate": 4.4973748185899416e-05, |
|
"loss": 0.9263, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.5716244002741604, |
|
"grad_norm": 0.25236164049340837, |
|
"learning_rate": 4.4821467210212924e-05, |
|
"loss": 0.9113, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 2.577107607950651, |
|
"grad_norm": 0.241733955145805, |
|
"learning_rate": 4.4669115291350484e-05, |
|
"loss": 0.9274, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.582590815627142, |
|
"grad_norm": 0.21189524141055552, |
|
"learning_rate": 4.451669467102162e-05, |
|
"loss": 0.9296, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 2.5880740233036326, |
|
"grad_norm": 0.20015531433684258, |
|
"learning_rate": 4.436420759194671e-05, |
|
"loss": 0.9193, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.5935572309801236, |
|
"grad_norm": 0.2058965605293262, |
|
"learning_rate": 4.4211656297824064e-05, |
|
"loss": 0.9429, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 2.599040438656614, |
|
"grad_norm": 0.2046878832240182, |
|
"learning_rate": 4.4059043033296815e-05, |
|
"loss": 0.9302, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.6045236463331047, |
|
"grad_norm": 0.22747989172056135, |
|
"learning_rate": 4.390637004391993e-05, |
|
"loss": 0.9137, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.6100068540095958, |
|
"grad_norm": 0.22860972503743232, |
|
"learning_rate": 4.375363957612717e-05, |
|
"loss": 0.9355, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.6154900616860863, |
|
"grad_norm": 0.21406527812867993, |
|
"learning_rate": 4.360085387719806e-05, |
|
"loss": 0.9326, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 2.620973269362577, |
|
"grad_norm": 0.2347455538571624, |
|
"learning_rate": 4.344801519522478e-05, |
|
"loss": 0.914, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.626456477039068, |
|
"grad_norm": 0.2800568223394633, |
|
"learning_rate": 4.32951257790791e-05, |
|
"loss": 0.9196, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 2.6319396847155585, |
|
"grad_norm": 0.29960031824686445, |
|
"learning_rate": 4.314218787837925e-05, |
|
"loss": 0.9199, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.637422892392049, |
|
"grad_norm": 0.24681941047649059, |
|
"learning_rate": 4.298920374345698e-05, |
|
"loss": 0.9258, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 2.64290610006854, |
|
"grad_norm": 0.2499779075588816, |
|
"learning_rate": 4.283617562532421e-05, |
|
"loss": 0.9349, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 2.648389307745031, |
|
"grad_norm": 0.21716808638868426, |
|
"learning_rate": 4.2683105775640096e-05, |
|
"loss": 0.9358, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 2.6538725154215217, |
|
"grad_norm": 0.26615335127773493, |
|
"learning_rate": 4.2529996446677814e-05, |
|
"loss": 0.9121, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 2.6593557230980123, |
|
"grad_norm": 0.2802919843954469, |
|
"learning_rate": 4.237684989129146e-05, |
|
"loss": 0.9175, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.6648389307745033, |
|
"grad_norm": 0.2692242115240618, |
|
"learning_rate": 4.2223668362882846e-05, |
|
"loss": 0.93, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 2.670322138450994, |
|
"grad_norm": 0.23379292559528167, |
|
"learning_rate": 4.2070454115368385e-05, |
|
"loss": 0.9214, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 2.6758053461274844, |
|
"grad_norm": 0.23215974230748349, |
|
"learning_rate": 4.191720940314593e-05, |
|
"loss": 0.9111, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.6812885538039755, |
|
"grad_norm": 0.2778040970662754, |
|
"learning_rate": 4.176393648106161e-05, |
|
"loss": 0.93, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 2.686771761480466, |
|
"grad_norm": 0.4565086038454969, |
|
"learning_rate": 4.1610637604376614e-05, |
|
"loss": 0.932, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.6922549691569566, |
|
"grad_norm": 0.22822596507213475, |
|
"learning_rate": 4.1457315028734015e-05, |
|
"loss": 0.9421, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 2.6977381768334476, |
|
"grad_norm": 0.2651182088836784, |
|
"learning_rate": 4.13039710101256e-05, |
|
"loss": 0.9375, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 2.703221384509938, |
|
"grad_norm": 2.256775615568554, |
|
"learning_rate": 4.11506078048587e-05, |
|
"loss": 0.9462, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 2.7087045921864292, |
|
"grad_norm": 0.47775298396500987, |
|
"learning_rate": 4.0997227669522924e-05, |
|
"loss": 0.9361, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 2.71418779986292, |
|
"grad_norm": 0.6415272036085159, |
|
"learning_rate": 4.0843832860956994e-05, |
|
"loss": 0.9105, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.719671007539411, |
|
"grad_norm": 0.47761645334870917, |
|
"learning_rate": 4.069042563621555e-05, |
|
"loss": 0.9288, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 2.7251542152159014, |
|
"grad_norm": 0.39340081097377483, |
|
"learning_rate": 4.0537008252535904e-05, |
|
"loss": 0.928, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 2.730637422892392, |
|
"grad_norm": 0.4032347665307978, |
|
"learning_rate": 4.0383582967304865e-05, |
|
"loss": 0.929, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.736120630568883, |
|
"grad_norm": 0.4465276677716616, |
|
"learning_rate": 4.023015203802551e-05, |
|
"loss": 0.9379, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 2.7416038382453736, |
|
"grad_norm": 0.3252831838439879, |
|
"learning_rate": 4.0076717722283936e-05, |
|
"loss": 0.9052, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.747087045921864, |
|
"grad_norm": 0.28556269271468004, |
|
"learning_rate": 3.992328227771608e-05, |
|
"loss": 0.9348, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.752570253598355, |
|
"grad_norm": 0.3329145096926203, |
|
"learning_rate": 3.976984796197451e-05, |
|
"loss": 0.9145, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 2.7580534612748457, |
|
"grad_norm": 0.27606016850672477, |
|
"learning_rate": 3.961641703269514e-05, |
|
"loss": 0.9351, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 2.7635366689513363, |
|
"grad_norm": 0.23531249021916406, |
|
"learning_rate": 3.946299174746411e-05, |
|
"loss": 0.9287, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.7690198766278273, |
|
"grad_norm": 0.3000618218962918, |
|
"learning_rate": 3.9309574363784465e-05, |
|
"loss": 0.9168, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.774503084304318, |
|
"grad_norm": 0.2725085905536336, |
|
"learning_rate": 3.915616713904302e-05, |
|
"loss": 0.9324, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.779986291980809, |
|
"grad_norm": 0.22554706135714297, |
|
"learning_rate": 3.9002772330477096e-05, |
|
"loss": 0.9301, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 2.7854694996572995, |
|
"grad_norm": 0.23876396577797063, |
|
"learning_rate": 3.884939219514132e-05, |
|
"loss": 0.919, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 2.7909527073337905, |
|
"grad_norm": 0.24095516006958517, |
|
"learning_rate": 3.869602898987441e-05, |
|
"loss": 0.9266, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 2.796435915010281, |
|
"grad_norm": 0.23743486283870385, |
|
"learning_rate": 3.854268497126601e-05, |
|
"loss": 0.936, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.8019191226867717, |
|
"grad_norm": 0.2113645336059951, |
|
"learning_rate": 3.8389362395623406e-05, |
|
"loss": 0.9275, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 2.8074023303632627, |
|
"grad_norm": 0.21077979039140243, |
|
"learning_rate": 3.8236063518938405e-05, |
|
"loss": 0.9205, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 2.8128855380397533, |
|
"grad_norm": 0.22799101511377703, |
|
"learning_rate": 3.8082790596854075e-05, |
|
"loss": 0.9121, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.818368745716244, |
|
"grad_norm": 0.23881115566274175, |
|
"learning_rate": 3.792954588463162e-05, |
|
"loss": 0.9441, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 2.823851953392735, |
|
"grad_norm": 0.2468480802505501, |
|
"learning_rate": 3.777633163711716e-05, |
|
"loss": 0.9315, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.8293351610692254, |
|
"grad_norm": 0.24127317607239004, |
|
"learning_rate": 3.7623150108708546e-05, |
|
"loss": 0.9255, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 2.834818368745716, |
|
"grad_norm": 0.2173302142599412, |
|
"learning_rate": 3.7470003553322186e-05, |
|
"loss": 0.9122, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 2.840301576422207, |
|
"grad_norm": 0.20993550761604957, |
|
"learning_rate": 3.7316894224359904e-05, |
|
"loss": 0.9136, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 2.8457847840986976, |
|
"grad_norm": 0.21489486047803172, |
|
"learning_rate": 3.71638243746758e-05, |
|
"loss": 0.9141, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 2.8512679917751886, |
|
"grad_norm": 0.2110516232439187, |
|
"learning_rate": 3.7010796256543034e-05, |
|
"loss": 0.9334, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.856751199451679, |
|
"grad_norm": 0.19507294240279735, |
|
"learning_rate": 3.6857812121620756e-05, |
|
"loss": 0.9194, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 2.8622344071281702, |
|
"grad_norm": 0.22638088296230285, |
|
"learning_rate": 3.670487422092092e-05, |
|
"loss": 0.9235, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 2.867717614804661, |
|
"grad_norm": 0.21271548062730075, |
|
"learning_rate": 3.655198480477523e-05, |
|
"loss": 0.9337, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 2.8732008224811514, |
|
"grad_norm": 0.1983909226603924, |
|
"learning_rate": 3.639914612280194e-05, |
|
"loss": 0.9261, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 2.8786840301576424, |
|
"grad_norm": 0.4448067239480651, |
|
"learning_rate": 3.6246360423872834e-05, |
|
"loss": 0.9323, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.884167237834133, |
|
"grad_norm": 0.24906699830014614, |
|
"learning_rate": 3.609362995608008e-05, |
|
"loss": 0.9392, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 2.8896504455106236, |
|
"grad_norm": 0.22348511782791708, |
|
"learning_rate": 3.59409569667032e-05, |
|
"loss": 0.9307, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 2.8951336531871146, |
|
"grad_norm": 0.2002054958378514, |
|
"learning_rate": 3.578834370217595e-05, |
|
"loss": 0.9179, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 2.900616860863605, |
|
"grad_norm": 0.20914969224319543, |
|
"learning_rate": 3.5635792408053304e-05, |
|
"loss": 0.9337, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 2.9061000685400957, |
|
"grad_norm": 0.2050856599518944, |
|
"learning_rate": 3.54833053289784e-05, |
|
"loss": 0.9268, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.9115832762165867, |
|
"grad_norm": 0.22357079010231426, |
|
"learning_rate": 3.533088470864953e-05, |
|
"loss": 0.922, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 2.9170664838930773, |
|
"grad_norm": 0.21756555260256386, |
|
"learning_rate": 3.517853278978708e-05, |
|
"loss": 0.9366, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 2.9225496915695683, |
|
"grad_norm": 0.2145272326220211, |
|
"learning_rate": 3.5026251814100604e-05, |
|
"loss": 0.9273, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 2.928032899246059, |
|
"grad_norm": 0.19855333975886716, |
|
"learning_rate": 3.487404402225574e-05, |
|
"loss": 0.9092, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 2.93351610692255, |
|
"grad_norm": 0.20077803143466286, |
|
"learning_rate": 3.4721911653841355e-05, |
|
"loss": 0.9338, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.9389993145990405, |
|
"grad_norm": 0.18066471168701084, |
|
"learning_rate": 3.45698569473365e-05, |
|
"loss": 0.9495, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 2.944482522275531, |
|
"grad_norm": 0.18734179693544623, |
|
"learning_rate": 3.44178821400775e-05, |
|
"loss": 0.9302, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 2.949965729952022, |
|
"grad_norm": 1.2819514876928657, |
|
"learning_rate": 3.426598946822507e-05, |
|
"loss": 0.9342, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 2.9554489376285127, |
|
"grad_norm": 0.21998918418380456, |
|
"learning_rate": 3.4114181166731355e-05, |
|
"loss": 0.9059, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 2.9609321453050033, |
|
"grad_norm": 0.23346119417414438, |
|
"learning_rate": 3.39624594693071e-05, |
|
"loss": 0.931, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.9664153529814943, |
|
"grad_norm": 0.23862986187508176, |
|
"learning_rate": 3.381082660838875e-05, |
|
"loss": 0.9386, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 2.971898560657985, |
|
"grad_norm": 0.23759718468050375, |
|
"learning_rate": 3.365928481510558e-05, |
|
"loss": 0.9367, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 2.9773817683344754, |
|
"grad_norm": 0.20198344717275046, |
|
"learning_rate": 3.3507836319246924e-05, |
|
"loss": 0.9164, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 2.9828649760109665, |
|
"grad_norm": 0.2104694393222235, |
|
"learning_rate": 3.33564833492293e-05, |
|
"loss": 0.9446, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 2.988348183687457, |
|
"grad_norm": 0.23875808400417559, |
|
"learning_rate": 3.3205228132063714e-05, |
|
"loss": 0.9319, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.993831391363948, |
|
"grad_norm": 0.2166410600290744, |
|
"learning_rate": 3.305407289332279e-05, |
|
"loss": 0.9372, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 2.9993145990404386, |
|
"grad_norm": 0.27336935598972234, |
|
"learning_rate": 3.290301985710807e-05, |
|
"loss": 1.3224, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.004797806716929, |
|
"grad_norm": 0.41849660540034167, |
|
"learning_rate": 3.27520712460173e-05, |
|
"loss": 1.0854, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.01028101439342, |
|
"grad_norm": 0.4165234810622607, |
|
"learning_rate": 3.260122928111172e-05, |
|
"loss": 0.8849, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.015764222069911, |
|
"grad_norm": 0.400638505892588, |
|
"learning_rate": 3.245049618188334e-05, |
|
"loss": 0.8893, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.021247429746402, |
|
"grad_norm": 0.3103499686134037, |
|
"learning_rate": 3.229987416622235e-05, |
|
"loss": 0.892, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.0267306374228924, |
|
"grad_norm": 0.34026890054529635, |
|
"learning_rate": 3.2149365450384445e-05, |
|
"loss": 0.8862, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.032213845099383, |
|
"grad_norm": 0.39382728987067206, |
|
"learning_rate": 3.199897224895823e-05, |
|
"loss": 0.89, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.037697052775874, |
|
"grad_norm": 0.3244232562451543, |
|
"learning_rate": 3.184869677483261e-05, |
|
"loss": 0.8992, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.0431802604523646, |
|
"grad_norm": 0.2732886799146708, |
|
"learning_rate": 3.169854123916426e-05, |
|
"loss": 0.8924, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.0486634681288556, |
|
"grad_norm": 0.3214178160165342, |
|
"learning_rate": 3.1548507851345094e-05, |
|
"loss": 0.8956, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.054146675805346, |
|
"grad_norm": 0.3379221592257927, |
|
"learning_rate": 3.139859881896971e-05, |
|
"loss": 0.8966, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.0596298834818367, |
|
"grad_norm": 0.23967633453436038, |
|
"learning_rate": 3.124881634780295e-05, |
|
"loss": 0.8704, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.0651130911583278, |
|
"grad_norm": 0.28791413087919254, |
|
"learning_rate": 3.109916264174743e-05, |
|
"loss": 0.898, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 3.0705962988348183, |
|
"grad_norm": 0.30027666282547666, |
|
"learning_rate": 3.094963990281112e-05, |
|
"loss": 0.8861, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.076079506511309, |
|
"grad_norm": 0.24118118182735915, |
|
"learning_rate": 3.080025033107494e-05, |
|
"loss": 0.8841, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 3.0815627141878, |
|
"grad_norm": 0.2264764026329518, |
|
"learning_rate": 3.065099612466037e-05, |
|
"loss": 0.8755, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.0870459218642905, |
|
"grad_norm": 0.25542576257876937, |
|
"learning_rate": 3.0501879479697112e-05, |
|
"loss": 0.9198, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 3.0925291295407815, |
|
"grad_norm": 0.2213496627104434, |
|
"learning_rate": 3.035290259029083e-05, |
|
"loss": 0.8998, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.098012337217272, |
|
"grad_norm": 0.21785326614913836, |
|
"learning_rate": 3.0204067648490766e-05, |
|
"loss": 0.9022, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.1034955448937627, |
|
"grad_norm": 0.2173897794863775, |
|
"learning_rate": 3.00553768442576e-05, |
|
"loss": 0.8784, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.1089787525702537, |
|
"grad_norm": 0.21438745068830176, |
|
"learning_rate": 2.9906832365431132e-05, |
|
"loss": 0.9082, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 3.1144619602467443, |
|
"grad_norm": 0.20526186072669544, |
|
"learning_rate": 2.9758436397698118e-05, |
|
"loss": 0.9004, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.1199451679232353, |
|
"grad_norm": 0.2006700821146824, |
|
"learning_rate": 2.961019112456014e-05, |
|
"loss": 0.87, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 3.125428375599726, |
|
"grad_norm": 0.21050122962004725, |
|
"learning_rate": 2.946209872730145e-05, |
|
"loss": 0.8958, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.1309115832762164, |
|
"grad_norm": 0.22224679543752676, |
|
"learning_rate": 2.931416138495689e-05, |
|
"loss": 0.8809, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 3.1363947909527075, |
|
"grad_norm": 0.19884200519314554, |
|
"learning_rate": 2.9166381274279803e-05, |
|
"loss": 0.8927, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 3.141877998629198, |
|
"grad_norm": 0.1955971229953881, |
|
"learning_rate": 2.901876056971002e-05, |
|
"loss": 0.9099, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 3.147361206305689, |
|
"grad_norm": 0.19513575382099455, |
|
"learning_rate": 2.88713014433419e-05, |
|
"loss": 0.8914, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 3.1528444139821796, |
|
"grad_norm": 0.18076444658460938, |
|
"learning_rate": 2.8724006064892296e-05, |
|
"loss": 0.8639, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.15832762165867, |
|
"grad_norm": 0.18498287851542897, |
|
"learning_rate": 2.85768766016687e-05, |
|
"loss": 0.8854, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 3.1638108293351612, |
|
"grad_norm": 0.1878415517014375, |
|
"learning_rate": 2.8429915218537297e-05, |
|
"loss": 0.8908, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 3.169294037011652, |
|
"grad_norm": 0.20058195415565724, |
|
"learning_rate": 2.8283124077891167e-05, |
|
"loss": 0.8822, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 3.1747772446881424, |
|
"grad_norm": 0.17471461344473282, |
|
"learning_rate": 2.813650533961843e-05, |
|
"loss": 0.8876, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 3.1802604523646334, |
|
"grad_norm": 0.1895547901022112, |
|
"learning_rate": 2.7990061161070445e-05, |
|
"loss": 0.8909, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.185743660041124, |
|
"grad_norm": 0.5540908088082868, |
|
"learning_rate": 2.7843793697030128e-05, |
|
"loss": 0.9036, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 3.191226867717615, |
|
"grad_norm": 0.21985667247185917, |
|
"learning_rate": 2.7697705099680217e-05, |
|
"loss": 0.8954, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 3.1967100753941056, |
|
"grad_norm": 0.18264756439459562, |
|
"learning_rate": 2.7551797518571573e-05, |
|
"loss": 0.875, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 3.202193283070596, |
|
"grad_norm": 0.1926800867763351, |
|
"learning_rate": 2.7406073100591605e-05, |
|
"loss": 0.89, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 3.207676490747087, |
|
"grad_norm": 0.1965971427549428, |
|
"learning_rate": 2.7260533989932628e-05, |
|
"loss": 0.8703, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.2131596984235777, |
|
"grad_norm": 0.19236103601320417, |
|
"learning_rate": 2.7115182328060385e-05, |
|
"loss": 0.888, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.2186429061000688, |
|
"grad_norm": 0.19426356221149135, |
|
"learning_rate": 2.697002025368245e-05, |
|
"loss": 0.8894, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 3.2241261137765593, |
|
"grad_norm": 0.1854247695317327, |
|
"learning_rate": 2.682504990271682e-05, |
|
"loss": 0.8853, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.22960932145305, |
|
"grad_norm": 0.21932696649265448, |
|
"learning_rate": 2.668027340826048e-05, |
|
"loss": 0.8804, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 3.235092529129541, |
|
"grad_norm": 0.30547051106449336, |
|
"learning_rate": 2.653569290055799e-05, |
|
"loss": 0.8855, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.2405757368060315, |
|
"grad_norm": 0.21730216080395384, |
|
"learning_rate": 2.6391310506970147e-05, |
|
"loss": 0.8738, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 3.246058944482522, |
|
"grad_norm": 0.7533763005766357, |
|
"learning_rate": 2.6247128351942726e-05, |
|
"loss": 0.8884, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 3.251542152159013, |
|
"grad_norm": 0.23270304391394997, |
|
"learning_rate": 2.6103148556975173e-05, |
|
"loss": 0.8822, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 3.2570253598355037, |
|
"grad_norm": 0.22462622263731738, |
|
"learning_rate": 2.5959373240589382e-05, |
|
"loss": 0.8977, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 3.2625085675119947, |
|
"grad_norm": 0.23970492867753068, |
|
"learning_rate": 2.5815804518298575e-05, |
|
"loss": 0.8953, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.2679917751884853, |
|
"grad_norm": 0.21761101160953683, |
|
"learning_rate": 2.5672444502576122e-05, |
|
"loss": 0.868, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 3.273474982864976, |
|
"grad_norm": 0.19755043958920854, |
|
"learning_rate": 2.55292953028245e-05, |
|
"loss": 0.8945, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 3.278958190541467, |
|
"grad_norm": 0.2294614254283116, |
|
"learning_rate": 2.53863590253442e-05, |
|
"loss": 0.9014, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 3.2844413982179574, |
|
"grad_norm": 0.2052910460949792, |
|
"learning_rate": 2.5243637773302817e-05, |
|
"loss": 0.8954, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 3.2899246058944485, |
|
"grad_norm": 0.1780748380826373, |
|
"learning_rate": 2.510113364670403e-05, |
|
"loss": 0.8888, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.295407813570939, |
|
"grad_norm": 0.21161561087783076, |
|
"learning_rate": 2.4958848742356724e-05, |
|
"loss": 0.9, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 3.3008910212474296, |
|
"grad_norm": 0.18385266024932176, |
|
"learning_rate": 2.481678515384415e-05, |
|
"loss": 0.8927, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 3.3063742289239206, |
|
"grad_norm": 0.180215579673697, |
|
"learning_rate": 2.4674944971493123e-05, |
|
"loss": 0.8922, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 3.311857436600411, |
|
"grad_norm": 0.20898208271490995, |
|
"learning_rate": 2.453333028234325e-05, |
|
"loss": 0.9042, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 3.317340644276902, |
|
"grad_norm": 0.2071805283405558, |
|
"learning_rate": 2.439194317011622e-05, |
|
"loss": 0.9012, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.322823851953393, |
|
"grad_norm": 0.16593932757416616, |
|
"learning_rate": 2.4250785715185138e-05, |
|
"loss": 0.882, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 3.3283070596298834, |
|
"grad_norm": 0.20712308956269607, |
|
"learning_rate": 2.410985999454396e-05, |
|
"loss": 0.8838, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 3.3337902673063744, |
|
"grad_norm": 0.17158735927539354, |
|
"learning_rate": 2.3969168081776867e-05, |
|
"loss": 0.8732, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 3.339273474982865, |
|
"grad_norm": 0.17694650881798035, |
|
"learning_rate": 2.382871204702781e-05, |
|
"loss": 0.8813, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.3447566826593556, |
|
"grad_norm": 0.19027363110894546, |
|
"learning_rate": 2.3688493956969997e-05, |
|
"loss": 0.9169, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.3502398903358466, |
|
"grad_norm": 0.1883450263699715, |
|
"learning_rate": 2.3548515874775547e-05, |
|
"loss": 0.8897, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 3.355723098012337, |
|
"grad_norm": 0.1676538782912043, |
|
"learning_rate": 2.340877986008507e-05, |
|
"loss": 0.8952, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 3.361206305688828, |
|
"grad_norm": 0.18929068926749942, |
|
"learning_rate": 2.3269287968977406e-05, |
|
"loss": 0.8884, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 3.3666895133653187, |
|
"grad_norm": 0.18748691973034498, |
|
"learning_rate": 2.3130042253939334e-05, |
|
"loss": 0.8964, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 3.3721727210418093, |
|
"grad_norm": 0.17421274944979948, |
|
"learning_rate": 2.2991044763835438e-05, |
|
"loss": 0.8715, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.3776559287183003, |
|
"grad_norm": 0.19236661368470973, |
|
"learning_rate": 2.285229754387783e-05, |
|
"loss": 0.8952, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 3.383139136394791, |
|
"grad_norm": 0.16600949256330813, |
|
"learning_rate": 2.2713802635596246e-05, |
|
"loss": 0.8924, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 3.3886223440712815, |
|
"grad_norm": 0.173933455195405, |
|
"learning_rate": 2.2575562076807857e-05, |
|
"loss": 0.9089, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 3.3941055517477725, |
|
"grad_norm": 0.1748738219825236, |
|
"learning_rate": 2.2437577901587284e-05, |
|
"loss": 0.905, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 3.399588759424263, |
|
"grad_norm": 0.16396278789777263, |
|
"learning_rate": 2.22998521402368e-05, |
|
"loss": 0.8831, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.405071967100754, |
|
"grad_norm": 0.159322352278441, |
|
"learning_rate": 2.216238681925628e-05, |
|
"loss": 0.8961, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 3.4105551747772447, |
|
"grad_norm": 0.1603751487606407, |
|
"learning_rate": 2.2025183961313542e-05, |
|
"loss": 0.8964, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 3.4160383824537353, |
|
"grad_norm": 0.1569001053409418, |
|
"learning_rate": 2.188824558521452e-05, |
|
"loss": 0.8987, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 3.4215215901302263, |
|
"grad_norm": 0.15634022646391366, |
|
"learning_rate": 2.175157370587348e-05, |
|
"loss": 0.8948, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 3.427004797806717, |
|
"grad_norm": 0.15419126088093488, |
|
"learning_rate": 2.1615170334283535e-05, |
|
"loss": 0.8896, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.432488005483208, |
|
"grad_norm": 0.1531889274422035, |
|
"learning_rate": 2.1479037477486936e-05, |
|
"loss": 0.8777, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 3.4379712131596984, |
|
"grad_norm": 0.16629317911613162, |
|
"learning_rate": 2.1343177138545547e-05, |
|
"loss": 0.8866, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 3.443454420836189, |
|
"grad_norm": 0.16323430207832845, |
|
"learning_rate": 2.1207591316511454e-05, |
|
"loss": 0.8863, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 3.44893762851268, |
|
"grad_norm": 0.1640881635654575, |
|
"learning_rate": 2.1072282006397425e-05, |
|
"loss": 0.8877, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 3.4544208361891706, |
|
"grad_norm": 0.15865517532778428, |
|
"learning_rate": 2.0937251199147684e-05, |
|
"loss": 0.8947, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.459904043865661, |
|
"grad_norm": 0.16320323651840082, |
|
"learning_rate": 2.0802500881608557e-05, |
|
"loss": 0.8976, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 3.465387251542152, |
|
"grad_norm": 0.14798931026156856, |
|
"learning_rate": 2.066803303649918e-05, |
|
"loss": 0.8647, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 3.470870459218643, |
|
"grad_norm": 0.1500679377824983, |
|
"learning_rate": 2.0533849642382446e-05, |
|
"loss": 0.8882, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 3.476353666895134, |
|
"grad_norm": 0.14953490103977057, |
|
"learning_rate": 2.039995267363581e-05, |
|
"loss": 0.8924, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 3.4818368745716244, |
|
"grad_norm": 0.1476923393733522, |
|
"learning_rate": 2.026634410042223e-05, |
|
"loss": 0.8858, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 3.487320082248115, |
|
"grad_norm": 0.15846688284973373, |
|
"learning_rate": 2.0133025888661263e-05, |
|
"loss": 0.8959, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 3.492803289924606, |
|
"grad_norm": 0.1441442793994033, |
|
"learning_rate": 2.0000000000000012e-05, |
|
"loss": 0.8976, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 3.4982864976010966, |
|
"grad_norm": 0.1662233677406197, |
|
"learning_rate": 1.986726839178438e-05, |
|
"loss": 0.8932, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 3.5037697052775876, |
|
"grad_norm": 0.1537273526944888, |
|
"learning_rate": 1.9734833017030227e-05, |
|
"loss": 0.873, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 3.509252912954078, |
|
"grad_norm": 0.1545372220602571, |
|
"learning_rate": 1.9602695824394576e-05, |
|
"loss": 0.8923, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.5147361206305687, |
|
"grad_norm": 0.15941175464229151, |
|
"learning_rate": 1.9470858758147036e-05, |
|
"loss": 0.8667, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 3.5202193283070597, |
|
"grad_norm": 0.1521791083684057, |
|
"learning_rate": 1.933932375814114e-05, |
|
"loss": 0.8947, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 3.5257025359835503, |
|
"grad_norm": 0.15904811631259153, |
|
"learning_rate": 1.9208092759785818e-05, |
|
"loss": 0.8954, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 3.531185743660041, |
|
"grad_norm": 0.15772373221240296, |
|
"learning_rate": 1.9077167694016903e-05, |
|
"loss": 0.891, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 3.536668951336532, |
|
"grad_norm": 0.16415057297908003, |
|
"learning_rate": 1.8946550487268706e-05, |
|
"loss": 0.8981, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 3.5421521590130225, |
|
"grad_norm": 0.16547585342894655, |
|
"learning_rate": 1.8816243061445734e-05, |
|
"loss": 0.8874, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 3.5476353666895135, |
|
"grad_norm": 0.15247215748823928, |
|
"learning_rate": 1.8686247333894366e-05, |
|
"loss": 0.888, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 3.553118574366004, |
|
"grad_norm": 0.16414994639468578, |
|
"learning_rate": 1.8556565217374606e-05, |
|
"loss": 0.8873, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 3.558601782042495, |
|
"grad_norm": 0.14847317925012327, |
|
"learning_rate": 1.8427198620032037e-05, |
|
"loss": 0.897, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 3.5640849897189857, |
|
"grad_norm": 0.16740639256848824, |
|
"learning_rate": 1.829814944536963e-05, |
|
"loss": 0.9012, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.5695681973954763, |
|
"grad_norm": 0.15906964929741546, |
|
"learning_rate": 1.8169419592219813e-05, |
|
"loss": 0.8899, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 3.5750514050719673, |
|
"grad_norm": 0.16808346242832645, |
|
"learning_rate": 1.8041010954716544e-05, |
|
"loss": 0.9008, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 3.580534612748458, |
|
"grad_norm": 0.15270335904858484, |
|
"learning_rate": 1.7912925422267345e-05, |
|
"loss": 0.8914, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 3.5860178204249484, |
|
"grad_norm": 4.558740159946488, |
|
"learning_rate": 1.7785164879525604e-05, |
|
"loss": 0.9097, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 3.5915010281014395, |
|
"grad_norm": 0.1998012183663189, |
|
"learning_rate": 1.7657731206362813e-05, |
|
"loss": 0.9044, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 3.59698423577793, |
|
"grad_norm": 0.18459743704746928, |
|
"learning_rate": 1.7530626277840846e-05, |
|
"loss": 0.8851, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 3.6024674434544206, |
|
"grad_norm": 0.194980974498013, |
|
"learning_rate": 1.7403851964184486e-05, |
|
"loss": 0.8997, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 3.6079506511309116, |
|
"grad_norm": 0.1918011345226112, |
|
"learning_rate": 1.7277410130753775e-05, |
|
"loss": 0.8876, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 3.613433858807402, |
|
"grad_norm": 0.17265185007479633, |
|
"learning_rate": 1.7151302638016683e-05, |
|
"loss": 0.874, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 3.618917066483893, |
|
"grad_norm": 0.1887609533456165, |
|
"learning_rate": 1.7025531341521685e-05, |
|
"loss": 0.8862, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.624400274160384, |
|
"grad_norm": 0.1894424801547077, |
|
"learning_rate": 1.690009809187041e-05, |
|
"loss": 0.8879, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 3.629883481836875, |
|
"grad_norm": 0.1787819433366742, |
|
"learning_rate": 1.6775004734690495e-05, |
|
"loss": 0.8774, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 3.6353666895133654, |
|
"grad_norm": 0.16300387548549075, |
|
"learning_rate": 1.6650253110608415e-05, |
|
"loss": 0.8925, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 3.640849897189856, |
|
"grad_norm": 0.16567955112172006, |
|
"learning_rate": 1.6525845055222306e-05, |
|
"loss": 0.9071, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 3.646333104866347, |
|
"grad_norm": 0.1643390268319312, |
|
"learning_rate": 1.6401782399075098e-05, |
|
"loss": 0.9064, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 3.6518163125428376, |
|
"grad_norm": 0.15797262211648114, |
|
"learning_rate": 1.627806696762745e-05, |
|
"loss": 0.8973, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 3.657299520219328, |
|
"grad_norm": 0.18157359982964277, |
|
"learning_rate": 1.615470058123099e-05, |
|
"loss": 0.871, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 3.662782727895819, |
|
"grad_norm": 0.15349809796270575, |
|
"learning_rate": 1.603168505510148e-05, |
|
"loss": 0.8957, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 3.6682659355723097, |
|
"grad_norm": 0.17402332671043225, |
|
"learning_rate": 1.5909022199292104e-05, |
|
"loss": 0.8888, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 3.6737491432488003, |
|
"grad_norm": 0.1554125530060754, |
|
"learning_rate": 1.5786713818666876e-05, |
|
"loss": 0.9054, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.6792323509252913, |
|
"grad_norm": 0.17178008005509693, |
|
"learning_rate": 1.566476171287401e-05, |
|
"loss": 0.8807, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 3.684715558601782, |
|
"grad_norm": 0.14883313948054624, |
|
"learning_rate": 1.554316767631951e-05, |
|
"loss": 0.8964, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 3.690198766278273, |
|
"grad_norm": 0.15943665222803124, |
|
"learning_rate": 1.5421933498140763e-05, |
|
"loss": 0.8864, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 3.6956819739547635, |
|
"grad_norm": 0.15165303732153182, |
|
"learning_rate": 1.5301060962180133e-05, |
|
"loss": 0.875, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 3.7011651816312545, |
|
"grad_norm": 0.15460350862452302, |
|
"learning_rate": 1.518055184695884e-05, |
|
"loss": 0.872, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.706648389307745, |
|
"grad_norm": 0.14512839459329285, |
|
"learning_rate": 1.5060407925650662e-05, |
|
"loss": 0.8792, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 3.7121315969842357, |
|
"grad_norm": 0.14629130374799093, |
|
"learning_rate": 1.494063096605595e-05, |
|
"loss": 0.885, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 3.7176148046607267, |
|
"grad_norm": 0.14425261349469004, |
|
"learning_rate": 1.4821222730575561e-05, |
|
"loss": 0.8879, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 3.7230980123372173, |
|
"grad_norm": 0.6459967446285357, |
|
"learning_rate": 1.4702184976184915e-05, |
|
"loss": 0.8969, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 3.728581220013708, |
|
"grad_norm": 0.1609987387109209, |
|
"learning_rate": 1.4583519454408191e-05, |
|
"loss": 0.8946, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.734064427690199, |
|
"grad_norm": 0.16619513073824907, |
|
"learning_rate": 1.4465227911292537e-05, |
|
"loss": 0.8802, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 3.7395476353666894, |
|
"grad_norm": 0.16622076062575558, |
|
"learning_rate": 1.434731208738232e-05, |
|
"loss": 0.9043, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 3.74503084304318, |
|
"grad_norm": 0.1511604554708221, |
|
"learning_rate": 1.4229773717693625e-05, |
|
"loss": 0.8865, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 3.750514050719671, |
|
"grad_norm": 0.15180628177086505, |
|
"learning_rate": 1.4112614531688645e-05, |
|
"loss": 0.8955, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 3.7559972583961616, |
|
"grad_norm": 0.14375773516761062, |
|
"learning_rate": 1.3995836253250233e-05, |
|
"loss": 0.8895, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.7614804660726526, |
|
"grad_norm": 0.15055378885989032, |
|
"learning_rate": 1.3879440600656607e-05, |
|
"loss": 0.8981, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 3.766963673749143, |
|
"grad_norm": 0.3396232761564488, |
|
"learning_rate": 1.3763429286555963e-05, |
|
"loss": 0.9004, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 3.7724468814256342, |
|
"grad_norm": 0.15268024778005962, |
|
"learning_rate": 1.3647804017941373e-05, |
|
"loss": 0.8845, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 3.777930089102125, |
|
"grad_norm": 0.15130129282023444, |
|
"learning_rate": 1.3532566496125634e-05, |
|
"loss": 0.8924, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 3.7834132967786154, |
|
"grad_norm": 0.1509237452137822, |
|
"learning_rate": 1.3417718416716183e-05, |
|
"loss": 0.8827, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.7888965044551064, |
|
"grad_norm": 0.1480697589327336, |
|
"learning_rate": 1.3303261469590228e-05, |
|
"loss": 0.8853, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 3.794379712131597, |
|
"grad_norm": 0.14512012199686875, |
|
"learning_rate": 1.3189197338869853e-05, |
|
"loss": 0.8736, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 3.7998629198080875, |
|
"grad_norm": 0.14156815438083506, |
|
"learning_rate": 1.3075527702897185e-05, |
|
"loss": 0.9144, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 3.8053461274845786, |
|
"grad_norm": 0.1471173694562544, |
|
"learning_rate": 1.2962254234209826e-05, |
|
"loss": 0.8808, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 3.810829335161069, |
|
"grad_norm": 0.13831572963208116, |
|
"learning_rate": 1.2849378599516085e-05, |
|
"loss": 0.8911, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.8163125428375597, |
|
"grad_norm": 0.13950206066753018, |
|
"learning_rate": 1.273690245967059e-05, |
|
"loss": 0.8955, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 3.8217957505140507, |
|
"grad_norm": 0.13768309928836486, |
|
"learning_rate": 1.2624827469649739e-05, |
|
"loss": 0.8861, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 3.8272789581905413, |
|
"grad_norm": 0.14367639560833734, |
|
"learning_rate": 1.2513155278527446e-05, |
|
"loss": 0.8851, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 3.8327621658670323, |
|
"grad_norm": 0.1386554623619909, |
|
"learning_rate": 1.240188752945084e-05, |
|
"loss": 0.9056, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 3.838245373543523, |
|
"grad_norm": 1.2865591556617946, |
|
"learning_rate": 1.2291025859616026e-05, |
|
"loss": 0.8962, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.843728581220014, |
|
"grad_norm": 0.19688018625704712, |
|
"learning_rate": 1.21805719002441e-05, |
|
"loss": 0.887, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 3.8492117888965045, |
|
"grad_norm": 0.2136145987737998, |
|
"learning_rate": 1.2070527276557092e-05, |
|
"loss": 0.8827, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 3.854694996572995, |
|
"grad_norm": 0.1856019277091022, |
|
"learning_rate": 1.1960893607754022e-05, |
|
"loss": 0.8789, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 3.860178204249486, |
|
"grad_norm": 0.16097377644982738, |
|
"learning_rate": 1.1851672506987165e-05, |
|
"loss": 0.8943, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 3.8656614119259767, |
|
"grad_norm": 0.17914103409237267, |
|
"learning_rate": 1.17428655813382e-05, |
|
"loss": 0.9002, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.8711446196024673, |
|
"grad_norm": 0.36179192792208653, |
|
"learning_rate": 1.1634474431794676e-05, |
|
"loss": 0.8922, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 3.8766278272789583, |
|
"grad_norm": 0.1707279122825088, |
|
"learning_rate": 1.1526500653226385e-05, |
|
"loss": 0.9079, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 3.882111034955449, |
|
"grad_norm": 0.1383198415613227, |
|
"learning_rate": 1.141894583436189e-05, |
|
"loss": 0.883, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 3.8875942426319394, |
|
"grad_norm": 0.15641274018303886, |
|
"learning_rate": 1.1311811557765208e-05, |
|
"loss": 0.892, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 3.8930774503084304, |
|
"grad_norm": 0.16278158477869095, |
|
"learning_rate": 1.1205099399812478e-05, |
|
"loss": 0.8857, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.898560657984921, |
|
"grad_norm": 0.15283986156792953, |
|
"learning_rate": 1.1098810930668754e-05, |
|
"loss": 0.8952, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 3.904043865661412, |
|
"grad_norm": 0.13614001952838004, |
|
"learning_rate": 1.0992947714264952e-05, |
|
"loss": 0.8838, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 3.9095270733379026, |
|
"grad_norm": 0.13744553818872712, |
|
"learning_rate": 1.088751130827478e-05, |
|
"loss": 0.8795, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 3.9150102810143936, |
|
"grad_norm": 0.15577116979501407, |
|
"learning_rate": 1.078250326409188e-05, |
|
"loss": 0.8924, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 3.920493488690884, |
|
"grad_norm": 0.13997947528267946, |
|
"learning_rate": 1.0677925126806956e-05, |
|
"loss": 0.8921, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.925976696367375, |
|
"grad_norm": 0.13838248446997362, |
|
"learning_rate": 1.0573778435185039e-05, |
|
"loss": 0.8941, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 3.931459904043866, |
|
"grad_norm": 0.1379231424304434, |
|
"learning_rate": 1.047006472164287e-05, |
|
"loss": 0.883, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 3.9369431117203564, |
|
"grad_norm": 0.13708859240144025, |
|
"learning_rate": 1.0366785512226359e-05, |
|
"loss": 0.8949, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 3.942426319396847, |
|
"grad_norm": 0.19426864143148992, |
|
"learning_rate": 1.0263942326588054e-05, |
|
"loss": 0.9119, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 3.947909527073338, |
|
"grad_norm": 0.13769915462697363, |
|
"learning_rate": 1.0161536677964933e-05, |
|
"loss": 0.8801, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.9533927347498286, |
|
"grad_norm": 0.13384470214559976, |
|
"learning_rate": 1.0059570073155953e-05, |
|
"loss": 0.8897, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 3.958875942426319, |
|
"grad_norm": 0.1312278166422601, |
|
"learning_rate": 9.958044012500023e-06, |
|
"loss": 0.8752, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 3.96435915010281, |
|
"grad_norm": 0.13365247691260085, |
|
"learning_rate": 9.856959989853876e-06, |
|
"loss": 0.8865, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 3.969842357779301, |
|
"grad_norm": 0.13321801521283821, |
|
"learning_rate": 9.75631949257004e-06, |
|
"loss": 0.8791, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 3.9753255654557917, |
|
"grad_norm": 0.12373583495929194, |
|
"learning_rate": 9.656124001475068e-06, |
|
"loss": 0.874, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.9808087731322823, |
|
"grad_norm": 0.131884923352051, |
|
"learning_rate": 9.556374990847618e-06, |
|
"loss": 0.8998, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 3.9862919808087733, |
|
"grad_norm": 0.12823642844323124, |
|
"learning_rate": 9.457073928396871e-06, |
|
"loss": 0.8832, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 3.991775188485264, |
|
"grad_norm": 0.1267184801459605, |
|
"learning_rate": 9.358222275240884e-06, |
|
"loss": 0.8827, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 3.9972583961617545, |
|
"grad_norm": 0.13081739200725578, |
|
"learning_rate": 9.25982148588506e-06, |
|
"loss": 0.8894, |
|
"step": 729 |
|
}, |
|
{
"epoch": 4.0027416038382455,
"grad_norm": 0.3022610503112669,
"learning_rate": 9.161873008200816e-06,
"loss": 1.4531,
"step": 730
},
{
"epoch": 4.0082248115147365,
"grad_norm": 0.21415829635582506,
"learning_rate": 9.064378283404247e-06,
"loss": 0.8679,
"step": 731
},
{
"epoch": 4.013708019191227,
"grad_norm": 0.268037890967699,
"learning_rate": 8.967338746034882e-06,
"loss": 0.8705,
"step": 732
},
{
"epoch": 4.019191226867718,
"grad_norm": 0.17880419959895738,
"learning_rate": 8.870755823934662e-06,
"loss": 0.8584,
"step": 733
},
{
"epoch": 4.024674434544209,
"grad_norm": 0.22461674613474053,
"learning_rate": 8.774630938226831e-06,
"loss": 0.8811,
"step": 734
},
{
"epoch": 4.030157642220699,
"grad_norm": 0.2291332957928435,
"learning_rate": 8.678965503295114e-06,
"loss": 0.864,
"step": 735
},
{
"epoch": 4.03564084989719,
"grad_norm": 0.1835724453240001,
"learning_rate": 8.583760926762852e-06,
"loss": 0.865,
"step": 736
},
{
"epoch": 4.041124057573681,
"grad_norm": 0.2259271547888892,
"learning_rate": 8.489018609472297e-06,
"loss": 0.8542,
"step": 737
},
{
"epoch": 4.046607265250171,
"grad_norm": 0.1542427846466267,
"learning_rate": 8.394739945464016e-06,
"loss": 0.8762,
"step": 738
},
{
"epoch": 4.052090472926662,
"grad_norm": 0.19890352681544088,
"learning_rate": 8.300926321956391e-06,
"loss": 0.8605,
"step": 739
},
{
"epoch": 4.057573680603153,
"grad_norm": 0.17690612060645955,
"learning_rate": 8.207579119325145e-06,
"loss": 0.8531,
"step": 740
},
{
"epoch": 4.063056888279643,
"grad_norm": 0.15865147301476273,
"learning_rate": 8.114699711083113e-06,
"loss": 0.8522,
"step": 741
},
{
"epoch": 4.068540095956134,
"grad_norm": 0.1841897414662908,
"learning_rate": 8.022289463859963e-06,
"loss": 0.8631,
"step": 742
},
{
"epoch": 4.074023303632625,
"grad_norm": 0.14694677847916018,
"learning_rate": 7.930349737382137e-06,
"loss": 0.8563,
"step": 743
},
{
"epoch": 4.079506511309116,
"grad_norm": 0.16502565330919783,
"learning_rate": 7.838881884452827e-06,
"loss": 0.8724,
"step": 744
},
{
"epoch": 4.084989718985606,
"grad_norm": 0.23554464139707726,
"learning_rate": 7.747887250932047e-06,
"loss": 0.8557,
"step": 745
},
{
"epoch": 4.090472926662097,
"grad_norm": 0.1398510478275166,
"learning_rate": 7.657367175716884e-06,
"loss": 0.8516,
"step": 746
},
{
"epoch": 4.095956134338588,
"grad_norm": 0.16702844875733538,
"learning_rate": 7.5673229907217146e-06,
"loss": 0.8466,
"step": 747
},
{
"epoch": 4.1014393420150785,
"grad_norm": 0.1437634994264969,
"learning_rate": 7.477756020858695e-06,
"loss": 0.859,
"step": 748
},
{
"epoch": 4.10692254969157,
"grad_norm": 0.14595567734084924,
"learning_rate": 7.38866758401823e-06,
"loss": 0.8673,
"step": 749
},
{
"epoch": 4.112405757368061,
"grad_norm": 0.1481440539336634,
"learning_rate": 7.300058991049534e-06,
"loss": 0.8783,
"step": 750
},
{
"epoch": 4.117888965044551,
"grad_norm": 0.13602115614168497,
"learning_rate": 7.211931545741433e-06,
"loss": 0.8461,
"step": 751
},
{
"epoch": 4.123372172721042,
"grad_norm": 0.137497126546577,
"learning_rate": 7.124286544803136e-06,
"loss": 0.8623,
"step": 752
},
{
"epoch": 4.128855380397533,
"grad_norm": 0.13976966524048884,
"learning_rate": 7.037125277845112e-06,
"loss": 0.8757,
"step": 753
},
{
"epoch": 4.134338588074023,
"grad_norm": 0.13481097331708417,
"learning_rate": 6.950449027360213e-06,
"loss": 0.8681,
"step": 754
},
{
"epoch": 4.139821795750514,
"grad_norm": 0.13710599614237085,
"learning_rate": 6.864259068704688e-06,
"loss": 0.856,
"step": 755
},
{
"epoch": 4.145305003427005,
"grad_norm": 0.1332943117904311,
"learning_rate": 6.778556670079535e-06,
"loss": 0.8757,
"step": 756
},
{
"epoch": 4.150788211103496,
"grad_norm": 0.12811258664609723,
"learning_rate": 6.69334309251175e-06,
"loss": 0.8419,
"step": 757
},
{
"epoch": 4.156271418779986,
"grad_norm": 0.13194497255856788,
"learning_rate": 6.608619589835803e-06,
"loss": 0.8633,
"step": 758
},
{
"epoch": 4.161754626456477,
"grad_norm": 0.1361841789829523,
"learning_rate": 6.524387408675208e-06,
"loss": 0.8595,
"step": 759
},
{
"epoch": 4.167237834132968,
"grad_norm": 0.13100778084703357,
"learning_rate": 6.440647788424166e-06,
"loss": 0.8677,
"step": 760
},
{
"epoch": 4.172721041809458,
"grad_norm": 0.13496809903286835,
"learning_rate": 6.357401961229293e-06,
"loss": 0.8678,
"step": 761
},
{
"epoch": 4.178204249485949,
"grad_norm": 0.13401563339726721,
"learning_rate": 6.274651151971567e-06,
"loss": 0.8566,
"step": 762
},
{
"epoch": 4.18368745716244,
"grad_norm": 0.1394912023076164,
"learning_rate": 6.1923965782482165e-06,
"loss": 0.8815,
"step": 763
},
{
"epoch": 4.18917066483893,
"grad_norm": 0.13961247228771675,
"learning_rate": 6.110639450354882e-06,
"loss": 0.8674,
"step": 764
},
{
"epoch": 4.194653872515421,
"grad_norm": 0.1295423598196668,
"learning_rate": 6.0293809712677775e-06,
"loss": 0.874,
"step": 765
},
{
"epoch": 4.2001370801919125,
"grad_norm": 0.1283265409719677,
"learning_rate": 5.9486223366259555e-06,
"loss": 0.8597,
"step": 766
},
{
"epoch": 4.205620287868403,
"grad_norm": 0.700241302104076,
"learning_rate": 5.868364734713776e-06,
"loss": 0.857,
"step": 767
},
{
"epoch": 4.211103495544894,
"grad_norm": 0.134674672253731,
"learning_rate": 5.788609346443386e-06,
"loss": 0.8758,
"step": 768
},
{
"epoch": 4.216586703221385,
"grad_norm": 0.13348462597198057,
"learning_rate": 5.70935734533733e-06,
"loss": 0.8478,
"step": 769
},
{
"epoch": 4.222069910897876,
"grad_norm": 0.1433886053498496,
"learning_rate": 5.630609897511328e-06,
"loss": 0.8572,
"step": 770
},
{
"epoch": 4.227553118574366,
"grad_norm": 0.1344503715204191,
"learning_rate": 5.552368161657082e-06,
"loss": 0.8506,
"step": 771
},
{
"epoch": 4.233036326250857,
"grad_norm": 0.13228427889549804,
"learning_rate": 5.474633289025244e-06,
"loss": 0.838,
"step": 772
},
{
"epoch": 4.238519533927348,
"grad_norm": 0.13194232735031328,
"learning_rate": 5.397406423408446e-06,
"loss": 0.8709,
"step": 773
},
{
"epoch": 4.244002741603838,
"grad_norm": 0.1324994400696061,
"learning_rate": 5.3206887011245165e-06,
"loss": 0.8828,
"step": 774
},
{
"epoch": 4.249485949280329,
"grad_norm": 0.12868063064967905,
"learning_rate": 5.24448125099974e-06,
"loss": 0.8632,
"step": 775
},
{
"epoch": 4.25496915695682,
"grad_norm": 0.13131426367531818,
"learning_rate": 5.1687851943522215e-06,
"loss": 0.8695,
"step": 776
},
{
"epoch": 4.26045236463331,
"grad_norm": 0.1321275007098247,
"learning_rate": 5.093601644975428e-06,
"loss": 0.865,
"step": 777
},
{
"epoch": 4.265935572309801,
"grad_norm": 0.12860869696588378,
"learning_rate": 5.018931709121791e-06,
"loss": 0.8593,
"step": 778
},
{
"epoch": 4.271418779986292,
"grad_norm": 0.12768866221727887,
"learning_rate": 4.9447764854863915e-06,
"loss": 0.8556,
"step": 779
},
{
"epoch": 4.276901987662782,
"grad_norm": 0.12577824035047255,
"learning_rate": 4.871137065190854e-06,
"loss": 0.8696,
"step": 780
},
{
"epoch": 4.282385195339273,
"grad_norm": 0.1271952896466875,
"learning_rate": 4.798014531767261e-06,
"loss": 0.8625,
"step": 781
},
{
"epoch": 4.287868403015764,
"grad_norm": 0.12770579994200573,
"learning_rate": 4.725409961142173e-06,
"loss": 0.8703,
"step": 782
},
{
"epoch": 4.293351610692255,
"grad_norm": 0.1268234374359348,
"learning_rate": 4.653324421620884e-06,
"loss": 0.8577,
"step": 783
},
{
"epoch": 4.2988348183687455,
"grad_norm": 0.12603343458576113,
"learning_rate": 4.581758973871609e-06,
"loss": 0.8784,
"step": 784
},
{
"epoch": 4.3043180260452365,
"grad_norm": 0.12309331153032724,
"learning_rate": 4.510714670909946e-06,
"loss": 0.8507,
"step": 785
},
{
"epoch": 4.3098012337217275,
"grad_norm": 0.12632372215343676,
"learning_rate": 4.440192558083367e-06,
"loss": 0.8465,
"step": 786
},
{
"epoch": 4.315284441398218,
"grad_norm": 0.12233510742702297,
"learning_rate": 4.370193673055787e-06,
"loss": 0.8602,
"step": 787
},
{
"epoch": 4.320767649074709,
"grad_norm": 0.12377153787189331,
"learning_rate": 4.300719045792376e-06,
"loss": 0.861,
"step": 788
},
{
"epoch": 4.3262508567512,
"grad_norm": 0.12530735125691925,
"learning_rate": 4.231769698544352e-06,
"loss": 0.8879,
"step": 789
},
{
"epoch": 4.33173406442769,
"grad_norm": 0.12203824629809716,
"learning_rate": 4.163346645833928e-06,
"loss": 0.857,
"step": 790
},
{
"epoch": 4.337217272104181,
"grad_norm": 0.12040289652729098,
"learning_rate": 4.0954508944394474e-06,
"loss": 0.8653,
"step": 791
},
{
"epoch": 4.342700479780672,
"grad_norm": 0.2719330766641234,
"learning_rate": 4.028083443380486e-06,
"loss": 0.8623,
"step": 792
},
{
"epoch": 4.348183687457162,
"grad_norm": 0.12896172770081263,
"learning_rate": 3.961245283903239e-06,
"loss": 0.8703,
"step": 793
},
{
"epoch": 4.353666895133653,
"grad_norm": 0.256360715096439,
"learning_rate": 3.89493739946587e-06,
"loss": 0.8787,
"step": 794
},
{
"epoch": 4.359150102810144,
"grad_norm": 0.13114958683022193,
"learning_rate": 3.829160765724052e-06,
"loss": 0.8473,
"step": 795
},
{
"epoch": 4.364633310486635,
"grad_norm": 0.1251457780084145,
"learning_rate": 3.7639163505166633e-06,
"loss": 0.8637,
"step": 796
},
{
"epoch": 4.370116518163125,
"grad_norm": 0.12360176632912118,
"learning_rate": 3.6992051138514717e-06,
"loss": 0.867,
"step": 797
},
{
"epoch": 4.375599725839616,
"grad_norm": 0.1231975561547446,
"learning_rate": 3.635028007891048e-06,
"loss": 0.8639,
"step": 798
},
{
"epoch": 4.381082933516107,
"grad_norm": 0.12345807208116279,
"learning_rate": 3.5713859769387795e-06,
"loss": 0.8791,
"step": 799
},
{
"epoch": 4.386566141192597,
"grad_norm": 0.12478453410625485,
"learning_rate": 3.5082799574249094e-06,
"loss": 0.8591,
"step": 800
},
{
"epoch": 4.392049348869088,
"grad_norm": 0.12556171142015438,
"learning_rate": 3.4457108778928272e-06,
"loss": 0.8583,
"step": 801
},
{
"epoch": 4.397532556545579,
"grad_norm": 0.12267726816784849,
"learning_rate": 3.3836796589853484e-06,
"loss": 0.8625,
"step": 802
},
{
"epoch": 4.4030157642220695,
"grad_norm": 0.12026407977680072,
"learning_rate": 3.3221872134312184e-06,
"loss": 0.8488,
"step": 803
},
{
"epoch": 4.4084989718985605,
"grad_norm": 0.12019444401552158,
"learning_rate": 3.261234446031658e-06,
"loss": 0.8643,
"step": 804
},
{
"epoch": 4.413982179575052,
"grad_norm": 0.12210699680124913,
"learning_rate": 3.200822253647031e-06,
"loss": 0.8859,
"step": 805
},
{
"epoch": 4.419465387251542,
"grad_norm": 0.12255986263926599,
"learning_rate": 3.140951525183691e-06,
"loss": 0.8573,
"step": 806
},
{
"epoch": 4.424948594928033,
"grad_norm": 0.12389954653460372,
"learning_rate": 3.0816231415808785e-06,
"loss": 0.8512,
"step": 807
},
{
"epoch": 4.430431802604524,
"grad_norm": 0.12010839425684415,
"learning_rate": 3.02283797579773e-06,
"loss": 0.8671,
"step": 808
},
{
"epoch": 4.435915010281015,
"grad_norm": 0.11853788189152183,
"learning_rate": 2.9645968928005085e-06,
"loss": 0.8586,
"step": 809
},
{
"epoch": 4.441398217957505,
"grad_norm": 0.1329864288994537,
"learning_rate": 2.906900749549784e-06,
"loss": 0.8949,
"step": 810
},
{
"epoch": 4.446881425633996,
"grad_norm": 0.11809074531118806,
"learning_rate": 2.849750394987907e-06,
"loss": 0.8595,
"step": 811
},
{
"epoch": 4.452364633310487,
"grad_norm": 0.11945246424611973,
"learning_rate": 2.793146670026472e-06,
"loss": 0.8738,
"step": 812
},
{
"epoch": 4.457847840986977,
"grad_norm": 0.11667366701013149,
"learning_rate": 2.737090407533938e-06,
"loss": 0.8524,
"step": 813
},
{
"epoch": 4.463331048663468,
"grad_norm": 0.12055981293160757,
"learning_rate": 2.681582432323406e-06,
"loss": 0.8638,
"step": 814
},
{
"epoch": 4.468814256339959,
"grad_norm": 0.12272299690977,
"learning_rate": 2.6266235611404645e-06,
"loss": 0.8698,
"step": 815
},
{
"epoch": 4.474297464016449,
"grad_norm": 0.11634473455813253,
"learning_rate": 2.5722146026511574e-06,
"loss": 0.8528,
"step": 816
},
{
"epoch": 4.47978067169294,
"grad_norm": 0.11757436852730455,
"learning_rate": 2.5183563574301185e-06,
"loss": 0.8644,
"step": 817
},
{
"epoch": 4.485263879369431,
"grad_norm": 0.11644201446659469,
"learning_rate": 2.465049617948778e-06,
"loss": 0.8723,
"step": 818
},
{
"epoch": 4.490747087045921,
"grad_norm": 0.11889813055087124,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.8742,
"step": 819
},
{
"epoch": 4.496230294722412,
"grad_norm": 0.11712594164609637,
"learning_rate": 2.3600937855049467e-06,
"loss": 0.8685,
"step": 820
},
{
"epoch": 4.501713502398903,
"grad_norm": 0.11398617749702712,
"learning_rate": 2.308446236864916e-06,
"loss": 0.8542,
"step": 821
},
{
"epoch": 4.5071967100753945,
"grad_norm": 0.11515630261113403,
"learning_rate": 2.257353282586774e-06,
"loss": 0.8355,
"step": 822
},
{
"epoch": 4.512679917751885,
"grad_norm": 0.12096238347096532,
"learning_rate": 2.206815674453373e-06,
"loss": 0.8717,
"step": 823
},
{
"epoch": 4.518163125428376,
"grad_norm": 0.11634803136785422,
"learning_rate": 2.1568341560762152e-06,
"loss": 0.8727,
"step": 824
},
{
"epoch": 4.523646333104867,
"grad_norm": 0.116283926332055,
"learning_rate": 2.1074094628844754e-06,
"loss": 0.8739,
"step": 825
},
{
"epoch": 4.529129540781357,
"grad_norm": 0.11823178558985727,
"learning_rate": 2.0585423221141807e-06,
"loss": 0.8685,
"step": 826
},
{
"epoch": 4.534612748457848,
"grad_norm": 0.11346261070128663,
"learning_rate": 2.010233452797534e-06,
"loss": 0.8618,
"step": 827
},
{
"epoch": 4.540095956134339,
"grad_norm": 0.11804999640238469,
"learning_rate": 1.9624835657523222e-06,
"loss": 0.8629,
"step": 828
},
{
"epoch": 4.54557916381083,
"grad_norm": 0.11621156619735486,
"learning_rate": 1.9152933635714354e-06,
"loss": 0.8626,
"step": 829
},
{
"epoch": 4.55106237148732,
"grad_norm": 0.1172026563649043,
"learning_rate": 1.8686635406125697e-06,
"loss": 0.8853,
"step": 830
},
{
"epoch": 4.556545579163811,
"grad_norm": 0.11454854230440348,
"learning_rate": 1.822594782987972e-06,
"loss": 0.837,
"step": 831
},
{
"epoch": 4.562028786840301,
"grad_norm": 0.11498604500319722,
"learning_rate": 1.7770877685543687e-06,
"loss": 0.8643,
"step": 832
},
{
"epoch": 4.567511994516792,
"grad_norm": 0.11659330833301497,
"learning_rate": 1.7321431669029953e-06,
"loss": 0.8669,
"step": 833
},
{
"epoch": 4.572995202193283,
"grad_norm": 0.1201780309737024,
"learning_rate": 1.6877616393497075e-06,
"loss": 0.872,
"step": 834
},
{
"epoch": 4.578478409869774,
"grad_norm": 0.11625946929875491,
"learning_rate": 1.6439438389252948e-06,
"loss": 0.8768,
"step": 835
},
{
"epoch": 4.583961617546264,
"grad_norm": 0.12022099106358185,
"learning_rate": 1.6006904103658572e-06,
"loss": 0.8668,
"step": 836
},
{
"epoch": 4.589444825222755,
"grad_norm": 0.11381027002549285,
"learning_rate": 1.5580019901032929e-06,
"loss": 0.855,
"step": 837
},
{
"epoch": 4.594928032899246,
"grad_norm": 0.11375387705527844,
"learning_rate": 1.5158792062559813e-06,
"loss": 0.8508,
"step": 838
},
{
"epoch": 4.6004112405757365,
"grad_norm": 0.11391108835892053,
"learning_rate": 1.4743226786194931e-06,
"loss": 0.8588,
"step": 839
},
{
"epoch": 4.6058944482522275,
"grad_norm": 0.11619063491074161,
"learning_rate": 1.4333330186575079e-06,
"loss": 0.8616,
"step": 840
},
{
"epoch": 4.6113776559287185,
"grad_norm": 0.11823159250643142,
"learning_rate": 1.3929108294927951e-06,
"loss": 0.8623,
"step": 841
},
{
"epoch": 4.6168608636052095,
"grad_norm": 0.1207957331708006,
"learning_rate": 1.3530567058983369e-06,
"loss": 0.8683,
"step": 842
},
{
"epoch": 4.6223440712817,
"grad_norm": 0.11402512226435695,
"learning_rate": 1.31377123428861e-06,
"loss": 0.8676,
"step": 843
},
{
"epoch": 4.627827278958191,
"grad_norm": 0.11562919886891948,
"learning_rate": 1.2750549927109136e-06,
"loss": 0.8521,
"step": 844
},
{
"epoch": 4.633310486634681,
"grad_norm": 0.11330894329763878,
"learning_rate": 1.2369085508368862e-06,
"loss": 0.8561,
"step": 845
},
{
"epoch": 4.638793694311172,
"grad_norm": 0.11498550405040812,
"learning_rate": 1.1993324699541265e-06,
"loss": 0.8481,
"step": 846
},
{
"epoch": 4.644276901987663,
"grad_norm": 0.1146270013894775,
"learning_rate": 1.1623273029579195e-06,
"loss": 0.8643,
"step": 847
},
{
"epoch": 4.649760109664154,
"grad_norm": 0.11325320105973269,
"learning_rate": 1.1258935943431237e-06,
"loss": 0.8633,
"step": 848
},
{
"epoch": 4.655243317340644,
"grad_norm": 0.11590461747206296,
"learning_rate": 1.090031880196145e-06,
"loss": 0.8622,
"step": 849
},
{
"epoch": 4.660726525017135,
"grad_norm": 0.1162327678391948,
"learning_rate": 1.0547426881870292e-06,
"loss": 0.8665,
"step": 850
},
{
"epoch": 4.666209732693626,
"grad_norm": 0.11300324812335996,
"learning_rate": 1.0200265375617514e-06,
"loss": 0.8698,
"step": 851
},
{
"epoch": 4.671692940370116,
"grad_norm": 0.1131953415650337,
"learning_rate": 9.858839391345065e-07,
"loss": 0.8815,
"step": 852
},
{
"epoch": 4.677176148046607,
"grad_norm": 0.11401963137384856,
"learning_rate": 9.523153952802633e-07,
"loss": 0.8532,
"step": 853
},
{
"epoch": 4.682659355723098,
"grad_norm": 0.11364587304383673,
"learning_rate": 9.193213999273199e-07,
"loss": 0.8693,
"step": 854
},
{
"epoch": 4.688142563399589,
"grad_norm": 0.11116715835095088,
"learning_rate": 8.869024385500524e-07,
"loss": 0.8651,
"step": 855
},
{
"epoch": 4.693625771076079,
"grad_norm": 0.11416749947476174,
"learning_rate": 8.550589881617877e-07,
"loss": 0.8618,
"step": 856
},
{
"epoch": 4.69910897875257,
"grad_norm": 0.11399280182713926,
"learning_rate": 8.237915173077681e-07,
"loss": 0.8459,
"step": 857
},
{
"epoch": 4.7045921864290605,
"grad_norm": 0.11248241190243433,
"learning_rate": 7.93100486058247e-07,
"loss": 0.8773,
"step": 858
},
{
"epoch": 4.7100753941055515,
"grad_norm": 0.11531822777678997,
"learning_rate": 7.629863460017506e-07,
"loss": 0.8644,
"step": 859
},
{
"epoch": 4.715558601782043,
"grad_norm": 0.1129479488663952,
"learning_rate": 7.334495402383957e-07,
"loss": 0.8629,
"step": 860
},
{
"epoch": 4.721041809458534,
"grad_norm": 0.11049875529447613,
"learning_rate": 7.044905033734096e-07,
"loss": 0.8738,
"step": 861
},
{
"epoch": 4.726525017135024,
"grad_norm": 0.11231491747052094,
"learning_rate": 6.761096615107043e-07,
"loss": 0.8747,
"step": 862
},
{
"epoch": 4.732008224811515,
"grad_norm": 0.1124055648842235,
"learning_rate": 6.483074322466154e-07,
"loss": 0.8762,
"step": 863
},
{
"epoch": 4.737491432488006,
"grad_norm": 0.1121079623755541,
"learning_rate": 6.210842246637683e-07,
"loss": 0.8682,
"step": 864
},
{
"epoch": 4.742974640164496,
"grad_norm": 0.11247133427567768,
"learning_rate": 5.944404393250481e-07,
"loss": 0.8403,
"step": 865
},
{
"epoch": 4.748457847840987,
"grad_norm": 0.11305732081461126,
"learning_rate": 5.683764682677018e-07,
"loss": 0.8785,
"step": 866
},
{
"epoch": 4.753941055517478,
"grad_norm": 0.11201120062752862,
"learning_rate": 5.428926949975788e-07,
"loss": 0.8675,
"step": 867
},
{
"epoch": 4.759424263193969,
"grad_norm": 0.1123495642176629,
"learning_rate": 5.179894944834863e-07,
"loss": 0.8769,
"step": 868
},
{
"epoch": 4.764907470870459,
"grad_norm": 0.11085620651600227,
"learning_rate": 4.936672331516778e-07,
"loss": 0.8614,
"step": 869
},
{
"epoch": 4.77039067854695,
"grad_norm": 0.11119796432260033,
"learning_rate": 4.699262688804451e-07,
"loss": 0.8428,
"step": 870
},
{
"epoch": 4.77587388622344,
"grad_norm": 0.11376038581076066,
"learning_rate": 4.467669509948591e-07,
"loss": 0.8603,
"step": 871
},
{
"epoch": 4.781357093899931,
"grad_norm": 0.11101436506944447,
"learning_rate": 4.241896202616502e-07,
"loss": 0.8615,
"step": 872
},
{
"epoch": 4.786840301576422,
"grad_norm": 0.1128704959623368,
"learning_rate": 4.0219460888415884e-07,
"loss": 0.865,
"step": 873
},
{
"epoch": 4.792323509252913,
"grad_norm": 0.10989876187245438,
"learning_rate": 3.807822404974726e-07,
"loss": 0.8571,
"step": 874
},
{
"epoch": 4.797806716929403,
"grad_norm": 0.1123757741386389,
"learning_rate": 3.599528301636612e-07,
"loss": 0.8736,
"step": 875
},
{
"epoch": 4.803289924605894,
"grad_norm": 0.10927129326687264,
"learning_rate": 3.397066843671315e-07,
"loss": 0.8566,
"step": 876
},
{
"epoch": 4.8087731322823855,
"grad_norm": 0.11068124311061739,
"learning_rate": 3.200441010101196e-07,
"loss": 0.8471,
"step": 877
},
{
"epoch": 4.814256339958876,
"grad_norm": 0.11040941653073615,
"learning_rate": 3.0096536940832145e-07,
"loss": 0.8809,
"step": 878
},
{
"epoch": 4.819739547635367,
"grad_norm": 0.11114204680301265,
"learning_rate": 2.824707702866114e-07,
"loss": 0.8524,
"step": 879
},
{
"epoch": 4.825222755311858,
"grad_norm": 0.11077647976079402,
"learning_rate": 2.645605757749392e-07,
"loss": 0.8574,
"step": 880
},
{
"epoch": 4.830705962988349,
"grad_norm": 0.11134140881315951,
"learning_rate": 2.4723504940430187e-07,
"loss": 0.8769,
"step": 881
},
{
"epoch": 4.836189170664839,
"grad_norm": 0.11048829969761541,
"learning_rate": 2.3049444610288462e-07,
"loss": 0.8689,
"step": 882
},
{
"epoch": 4.84167237834133,
"grad_norm": 0.11350618431987161,
"learning_rate": 2.1433901219229502e-07,
"loss": 0.8572,
"step": 883
},
{
"epoch": 4.84715558601782,
"grad_norm": 0.11054458698004693,
"learning_rate": 1.9876898538394362e-07,
"loss": 0.871,
"step": 884
},
{
"epoch": 4.852638793694311,
"grad_norm": 0.11089183807775897,
"learning_rate": 1.8378459477555788e-07,
"loss": 0.8712,
"step": 885
},
{
"epoch": 4.858122001370802,
"grad_norm": 0.1128090861209175,
"learning_rate": 1.6938606084779375e-07,
"loss": 0.8567,
"step": 886
},
{
"epoch": 4.863605209047293,
"grad_norm": 0.10957303514497624,
"learning_rate": 1.555735954610027e-07,
"loss": 0.846,
"step": 887
},
{
"epoch": 4.869088416723783,
"grad_norm": 0.11132388807214479,
"learning_rate": 1.4234740185210095e-07,
"loss": 0.8554,
"step": 888
},
{
"epoch": 4.874571624400274,
"grad_norm": 0.11105266439584284,
"learning_rate": 1.2970767463160284e-07,
"loss": 0.8557,
"step": 889
},
{
"epoch": 4.880054832076765,
"grad_norm": 0.11146453078388475,
"learning_rate": 1.176545997807299e-07,
"loss": 0.8712,
"step": 890
},
{
"epoch": 4.885538039753255,
"grad_norm": 0.11103753860761491,
"learning_rate": 1.0618835464870191e-07,
"loss": 0.8482,
"step": 891
},
{
"epoch": 4.891021247429746,
"grad_norm": 0.11060895361594707,
"learning_rate": 9.530910795009895e-08,
"loss": 0.8557,
"step": 892
},
{
"epoch": 4.896504455106237,
"grad_norm": 0.10942092985340518,
"learning_rate": 8.501701976239673e-08,
"loss": 0.854,
"step": 893
},
{
"epoch": 4.901987662782728,
"grad_norm": 0.11347350745319919,
"learning_rate": 7.531224152362183e-08,
"loss": 0.8613,
"step": 894
},
{
"epoch": 4.9074708704592185,
"grad_norm": 0.11094282402979687,
"learning_rate": 6.619491603008676e-08,
"loss": 0.8681,
"step": 895
},
{
"epoch": 4.9129540781357095,
"grad_norm": 0.1119562120229049,
"learning_rate": 5.766517743432953e-08,
"loss": 0.8596,
"step": 896
},
{
"epoch": 4.9184372858122005,
"grad_norm": 0.10924377003832167,
"learning_rate": 4.9723151243106225e-08,
"loss": 0.8487,
"step": 897
},
{
"epoch": 4.923920493488691,
"grad_norm": 0.10896221834453067,
"learning_rate": 4.236895431557031e-08,
"loss": 0.854,
"step": 898
},
{
"epoch": 4.929403701165182,
"grad_norm": 0.10999943471560589,
"learning_rate": 3.560269486154066e-08,
"loss": 0.8354,
"step": 899
},
{
"epoch": 4.934886908841673,
"grad_norm": 0.10985144652973201,
"learning_rate": 2.9424472439911757e-08,
"loss": 0.8707,
"step": 900
},
{
"epoch": 4.940370116518163,
"grad_norm": 0.10833684874636428,
"learning_rate": 2.3834377957183684e-08,
"loss": 0.8452,
"step": 901
},
{
"epoch": 4.945853324194654,
"grad_norm": 0.11116722675224537,
"learning_rate": 1.8832493666125494e-08,
"loss": 0.8673,
"step": 902
},
{
"epoch": 4.951336531871145,
"grad_norm": 0.7668025845471647,
"learning_rate": 1.4418893164585002e-08,
"loss": 0.8699,
"step": 903
},
{
"epoch": 4.956819739547635,
"grad_norm": 0.10932050335970388,
"learning_rate": 1.0593641394369691e-08,
"loss": 0.8614,
"step": 904
},
{
"epoch": 4.962302947224126,
"grad_norm": 0.10980806887744368,
"learning_rate": 7.356794640318576e-09,
"loss": 0.8517,
"step": 905
},
{
"epoch": 4.967786154900617,
"grad_norm": 0.11202862978010701,
"learning_rate": 4.708400529476187e-09,
"loss": 0.8464,
"step": 906
},
{
"epoch": 4.973269362577108,
"grad_norm": 0.11233852938525259,
"learning_rate": 2.648498030364266e-09,
"loss": 0.8675,
"step": 907
},
{
"epoch": 4.978752570253598,
"grad_norm": 0.11155180103993602,
"learning_rate": 1.1771174524355388e-09,
"loss": 0.8806,
"step": 908
},
{
"epoch": 4.984235777930089,
"grad_norm": 0.11032700245059573,
"learning_rate": 2.9428044562074265e-10,
"loss": 0.8577,
"step": 909
},
{
"epoch": 4.98971898560658,
"grad_norm": 0.10997852624200846,
"learning_rate": 0.0,
"loss": 0.8576,
"step": 910
},
{
"epoch": 4.98971898560658,
"step": 910,
"total_flos": 2.1065883137322516e+19,
"train_loss": 0.9554639947938395,
"train_runtime": 109130.9683,
"train_samples_per_second": 4.277,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1.0,
"max_steps": 910,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1065883137322516e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}