diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9997006884166417, + "epoch": 1.9994013768332835, "eval_steps": 500, - "global_step": 835, + "global_step": 1670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -5860,6 +5860,5851 @@ "eval_samples_per_second": 212.547, "eval_steps_per_second": 53.175, "step": 835 + }, + { + "epoch": 1.0008979347500748, + "grad_norm": 0.5552700757980347, + "learning_rate": 5.037850152386574e-05, + "loss": 18.1151, + "step": 836 + }, + { + "epoch": 1.002095181083508, + "grad_norm": 0.18704776465892792, + "learning_rate": 5.0283877329108965e-05, + "loss": 10.3046, + "step": 837 + }, + { + "epoch": 1.003292427416941, + "grad_norm": 0.1981101632118225, + "learning_rate": 5.018925211760227e-05, + "loss": 10.295, + "step": 838 + }, + { + "epoch": 1.0044896737503741, + "grad_norm": 0.2190777063369751, + "learning_rate": 5.0094626228260266e-05, + "loss": 10.3173, + "step": 839 + }, + { + "epoch": 1.0056869200838072, + "grad_norm": 0.22749939560890198, + "learning_rate": 5e-05, + "loss": 10.3322, + "step": 840 + }, + { + "epoch": 1.0068841664172403, + "grad_norm": 0.2654731869697571, + "learning_rate": 4.9905373771739746e-05, + "loss": 10.3284, + "step": 841 + }, + { + "epoch": 1.0080814127506734, + "grad_norm": 0.24290290474891663, + "learning_rate": 4.981074788239773e-05, + "loss": 10.3497, + "step": 842 + }, + { + "epoch": 1.0092786590841065, + "grad_norm": 0.1886884868144989, + "learning_rate": 4.971612267089105e-05, + "loss": 10.3227, + "step": 843 + }, + { + "epoch": 1.0104759054175396, + "grad_norm": 0.17675116658210754, + "learning_rate": 4.962149847613428e-05, + "loss": 10.335, + "step": 844 + }, + { + "epoch": 1.0116731517509727, + "grad_norm": 0.15297679603099823, + "learning_rate": 4.952687563703841e-05, + "loss": 10.3558, + "step": 845 + }, + { + "epoch": 1.0128703980844058, + "grad_norm": 0.0992865264415741, + "learning_rate": 4.943225449250958e-05, + "loss": 10.313, + "step": 846 + }, + { + "epoch": 1.014067644417839, + "grad_norm": 0.072992242872715, + "learning_rate": 4.933763538144784e-05, + "loss": 10.3396, + "step": 847 + }, + { + "epoch": 1.015264890751272, + "grad_norm": 0.08298959583044052, + "learning_rate": 4.9243018642745926e-05, + "loss": 10.3257, + "step": 848 + }, + { + "epoch": 1.0164621370847051, + "grad_norm": 0.08252955228090286, + "learning_rate": 4.9148404615288144e-05, + "loss": 10.3434, + "step": 849 + }, + { + "epoch": 1.0176593834181382, + "grad_norm": 0.1154746264219284, + "learning_rate": 4.9053793637949067e-05, + "loss": 10.3239, + "step": 850 + }, + { + "epoch": 1.0188566297515713, + "grad_norm": 0.1080356240272522, + "learning_rate": 4.8959186049592274e-05, + "loss": 10.3274, + "step": 851 + }, + { + "epoch": 1.0200538760850044, + "grad_norm": 0.16470828652381897, + "learning_rate": 4.886458218906934e-05, + "loss": 10.3396, + "step": 852 + }, + { + "epoch": 1.0212511224184375, + "grad_norm": 0.1778523176908493, + "learning_rate": 4.8769982395218356e-05, + "loss": 10.3252, + "step": 853 + }, + { + "epoch": 1.0224483687518706, + "grad_norm": 0.19245301187038422, + "learning_rate": 4.8675387006862914e-05, + "loss": 10.3071, + "step": 854 + }, + { + "epoch": 1.0236456150853037, + "grad_norm": 0.06960920989513397, + "learning_rate": 4.858079636281085e-05, + "loss": 10.3333, + "step": 855 + }, + { + "epoch": 1.0248428614187368, + "grad_norm": 0.10019072890281677, + "learning_rate": 4.8486210801852946e-05, + "loss": 10.3138, + "step": 856 + }, + { + "epoch": 1.02604010775217, + "grad_norm": 0.0687229335308075, + "learning_rate": 4.83916306627618e-05, + "loss": 10.3137, + "step": 857 + }, + { + "epoch": 1.027237354085603, + "grad_norm": 0.06510143727064133, + "learning_rate": 4.829705628429061e-05, + "loss": 10.3197, + "step": 858 + }, + { + "epoch": 1.0284346004190361, + "grad_norm": 0.0829288512468338, + "learning_rate": 4.820248800517196e-05, + "loss": 10.3142, + "step": 859 + }, + { + "epoch": 1.0296318467524692, + "grad_norm": 0.07292906939983368, + "learning_rate": 4.810792616411649e-05, + "loss": 10.3329, + "step": 860 + }, + { + "epoch": 1.0308290930859023, + "grad_norm": 0.08417162299156189, + "learning_rate": 4.80133710998119e-05, + "loss": 10.3365, + "step": 861 + }, + { + "epoch": 1.0320263394193354, + "grad_norm": 0.10935333371162415, + "learning_rate": 4.7918823150921555e-05, + "loss": 10.3334, + "step": 862 + }, + { + "epoch": 1.0332235857527687, + "grad_norm": 0.12038741260766983, + "learning_rate": 4.782428265608333e-05, + "loss": 10.331, + "step": 863 + }, + { + "epoch": 1.0344208320862018, + "grad_norm": 0.07741657644510269, + "learning_rate": 4.772974995390845e-05, + "loss": 10.343, + "step": 864 + }, + { + "epoch": 1.035618078419635, + "grad_norm": 0.07101555168628693, + "learning_rate": 4.7635225382980176e-05, + "loss": 10.3022, + "step": 865 + }, + { + "epoch": 1.036815324753068, + "grad_norm": 0.076541967689991, + "learning_rate": 4.754070928185266e-05, + "loss": 10.3126, + "step": 866 + }, + { + "epoch": 1.0380125710865011, + "grad_norm": 0.0978110209107399, + "learning_rate": 4.7446201989049746e-05, + "loss": 10.3231, + "step": 867 + }, + { + "epoch": 1.0392098174199342, + "grad_norm": 0.07762591540813446, + "learning_rate": 4.735170384306371e-05, + "loss": 10.3138, + "step": 868 + }, + { + "epoch": 1.0404070637533673, + "grad_norm": 0.0648229569196701, + "learning_rate": 4.725721518235404e-05, + "loss": 10.3356, + "step": 869 + }, + { + "epoch": 1.0416043100868004, + "grad_norm": 0.0723283588886261, + "learning_rate": 4.7162736345346303e-05, + "loss": 10.2951, + "step": 870 + }, + { + "epoch": 1.0428015564202335, + "grad_norm": 0.09112104028463364, + "learning_rate": 4.7068267670430855e-05, + "loss": 10.3183, + "step": 871 + }, + { + "epoch": 1.0439988027536666, + "grad_norm": 0.0751243457198143, + "learning_rate": 4.6973809495961635e-05, + "loss": 10.3341, + "step": 872 + }, + { + "epoch": 1.0451960490870997, + "grad_norm": 0.16759902238845825, + "learning_rate": 4.687936216025503e-05, + "loss": 10.3033, + "step": 873 + }, + { + "epoch": 1.0463932954205328, + "grad_norm": 0.07561887055635452, + "learning_rate": 4.6784926001588544e-05, + "loss": 10.3225, + "step": 874 + }, + { + "epoch": 1.047590541753966, + "grad_norm": 0.08443213999271393, + "learning_rate": 4.669050135819966e-05, + "loss": 10.3039, + "step": 875 + }, + { + "epoch": 1.048787788087399, + "grad_norm": 0.11630913615226746, + "learning_rate": 4.659608856828467e-05, + "loss": 10.3, + "step": 876 + }, + { + "epoch": 1.0499850344208321, + "grad_norm": 0.10222963988780975, + "learning_rate": 4.650168796999736e-05, + "loss": 10.3155, + "step": 877 + }, + { + "epoch": 1.0511822807542652, + "grad_norm": 0.15242649614810944, + "learning_rate": 4.640729990144784e-05, + "loss": 10.2679, + "step": 878 + }, + { + "epoch": 1.0523795270876983, + "grad_norm": 0.10927727818489075, + "learning_rate": 4.6312924700701386e-05, + "loss": 10.3644, + "step": 879 + }, + { + "epoch": 1.0535767734211314, + "grad_norm": 0.1414031982421875, + "learning_rate": 4.621856270577718e-05, + "loss": 10.2978, + "step": 880 + }, + { + "epoch": 1.0547740197545645, + "grad_norm": 0.11384107172489166, + "learning_rate": 4.612421425464704e-05, + "loss": 10.3181, + "step": 881 + }, + { + "epoch": 1.0559712660879976, + "grad_norm": 0.13419802486896515, + "learning_rate": 4.6029879685234395e-05, + "loss": 10.3359, + "step": 882 + }, + { + "epoch": 1.0571685124214307, + "grad_norm": 0.1391170173883438, + "learning_rate": 4.593555933541284e-05, + "loss": 10.2817, + "step": 883 + }, + { + "epoch": 1.0583657587548638, + "grad_norm": 0.2582155466079712, + "learning_rate": 4.584125354300508e-05, + "loss": 10.4546, + "step": 884 + }, + { + "epoch": 1.059563005088297, + "grad_norm": 0.3637485206127167, + "learning_rate": 4.5746962645781724e-05, + "loss": 9.982, + "step": 885 + }, + { + "epoch": 1.06076025142173, + "grad_norm": 0.1871345043182373, + "learning_rate": 4.565268698145997e-05, + "loss": 10.2642, + "step": 886 + }, + { + "epoch": 1.0619574977551631, + "grad_norm": 0.22236977517604828, + "learning_rate": 4.555842688770247e-05, + "loss": 10.2965, + "step": 887 + }, + { + "epoch": 1.0631547440885962, + "grad_norm": 0.19666147232055664, + "learning_rate": 4.5464182702116135e-05, + "loss": 10.3299, + "step": 888 + }, + { + "epoch": 1.0643519904220293, + "grad_norm": 0.22874382138252258, + "learning_rate": 4.5369954762250894e-05, + "loss": 10.2982, + "step": 889 + }, + { + "epoch": 1.0655492367554624, + "grad_norm": 0.22550679743289948, + "learning_rate": 4.527574340559844e-05, + "loss": 10.3285, + "step": 890 + }, + { + "epoch": 1.0667464830888955, + "grad_norm": 0.22559818625450134, + "learning_rate": 4.518154896959114e-05, + "loss": 10.3414, + "step": 891 + }, + { + "epoch": 1.0679437294223286, + "grad_norm": 0.2170734852552414, + "learning_rate": 4.508737179160072e-05, + "loss": 10.3207, + "step": 892 + }, + { + "epoch": 1.0691409757557617, + "grad_norm": 0.17715151607990265, + "learning_rate": 4.499321220893708e-05, + "loss": 10.3573, + "step": 893 + }, + { + "epoch": 1.0703382220891948, + "grad_norm": 0.14928358793258667, + "learning_rate": 4.4899070558847154e-05, + "loss": 10.3452, + "step": 894 + }, + { + "epoch": 1.071535468422628, + "grad_norm": 0.08827892690896988, + "learning_rate": 4.480494717851359e-05, + "loss": 10.3243, + "step": 895 + }, + { + "epoch": 1.072732714756061, + "grad_norm": 0.09724269807338715, + "learning_rate": 4.47108424050536e-05, + "loss": 10.3302, + "step": 896 + }, + { + "epoch": 1.0739299610894941, + "grad_norm": 0.14585882425308228, + "learning_rate": 4.461675657551781e-05, + "loss": 10.3222, + "step": 897 + }, + { + "epoch": 1.0751272074229272, + "grad_norm": 0.08589126169681549, + "learning_rate": 4.452269002688897e-05, + "loss": 10.3053, + "step": 898 + }, + { + "epoch": 1.0763244537563603, + "grad_norm": 0.08568647503852844, + "learning_rate": 4.442864309608072e-05, + "loss": 10.3344, + "step": 899 + }, + { + "epoch": 1.0775217000897934, + "grad_norm": 0.14963027834892273, + "learning_rate": 4.433461611993651e-05, + "loss": 10.3189, + "step": 900 + }, + { + "epoch": 1.0787189464232265, + "grad_norm": 0.17478854954242706, + "learning_rate": 4.424060943522829e-05, + "loss": 10.3119, + "step": 901 + }, + { + "epoch": 1.0799161927566596, + "grad_norm": 0.13979867100715637, + "learning_rate": 4.4146623378655296e-05, + "loss": 10.3085, + "step": 902 + }, + { + "epoch": 1.0811134390900927, + "grad_norm": 0.14272142946720123, + "learning_rate": 4.405265828684297e-05, + "loss": 10.3178, + "step": 903 + }, + { + "epoch": 1.0823106854235258, + "grad_norm": 0.21423350274562836, + "learning_rate": 4.3958714496341576e-05, + "loss": 10.3164, + "step": 904 + }, + { + "epoch": 1.083507931756959, + "grad_norm": 0.1124742329120636, + "learning_rate": 4.386479234362512e-05, + "loss": 10.323, + "step": 905 + }, + { + "epoch": 1.084705178090392, + "grad_norm": 0.07562056183815002, + "learning_rate": 4.3770892165090126e-05, + "loss": 10.3052, + "step": 906 + }, + { + "epoch": 1.0859024244238251, + "grad_norm": 0.08453397452831268, + "learning_rate": 4.3677014297054394e-05, + "loss": 10.304, + "step": 907 + }, + { + "epoch": 1.0870996707572582, + "grad_norm": 0.06292343884706497, + "learning_rate": 4.358315907575579e-05, + "loss": 10.3313, + "step": 908 + }, + { + "epoch": 1.0882969170906913, + "grad_norm": 0.06008546054363251, + "learning_rate": 4.348932683735114e-05, + "loss": 10.3267, + "step": 909 + }, + { + "epoch": 1.0894941634241244, + "grad_norm": 0.06118779256939888, + "learning_rate": 4.3395517917914895e-05, + "loss": 10.3, + "step": 910 + }, + { + "epoch": 1.0906914097575575, + "grad_norm": 0.11741728335618973, + "learning_rate": 4.330173265343798e-05, + "loss": 10.3305, + "step": 911 + }, + { + "epoch": 1.0918886560909908, + "grad_norm": 0.08900406211614609, + "learning_rate": 4.3207971379826634e-05, + "loss": 10.31, + "step": 912 + }, + { + "epoch": 1.093085902424424, + "grad_norm": 0.1068909764289856, + "learning_rate": 4.3114234432901146e-05, + "loss": 10.3066, + "step": 913 + }, + { + "epoch": 1.094283148757857, + "grad_norm": 0.09474755823612213, + "learning_rate": 4.3020522148394676e-05, + "loss": 10.3074, + "step": 914 + }, + { + "epoch": 1.0954803950912901, + "grad_norm": 0.09461431950330734, + "learning_rate": 4.292683486195208e-05, + "loss": 10.3061, + "step": 915 + }, + { + "epoch": 1.0966776414247232, + "grad_norm": 0.09426665306091309, + "learning_rate": 4.283317290912863e-05, + "loss": 10.3163, + "step": 916 + }, + { + "epoch": 1.0978748877581563, + "grad_norm": 0.0746891051530838, + "learning_rate": 4.273953662538888e-05, + "loss": 10.3058, + "step": 917 + }, + { + "epoch": 1.0990721340915894, + "grad_norm": 0.0804484486579895, + "learning_rate": 4.2645926346105484e-05, + "loss": 10.3197, + "step": 918 + }, + { + "epoch": 1.1002693804250225, + "grad_norm": 0.07933864742517471, + "learning_rate": 4.255234240655793e-05, + "loss": 10.3099, + "step": 919 + }, + { + "epoch": 1.1014666267584556, + "grad_norm": 0.0669594556093216, + "learning_rate": 4.2458785141931314e-05, + "loss": 10.3039, + "step": 920 + }, + { + "epoch": 1.1026638730918887, + "grad_norm": 0.07099183648824692, + "learning_rate": 4.236525488731528e-05, + "loss": 10.3096, + "step": 921 + }, + { + "epoch": 1.1038611194253218, + "grad_norm": 0.08637561649084091, + "learning_rate": 4.22717519777027e-05, + "loss": 10.3265, + "step": 922 + }, + { + "epoch": 1.105058365758755, + "grad_norm": 0.10287933796644211, + "learning_rate": 4.2178276747988446e-05, + "loss": 10.3007, + "step": 923 + }, + { + "epoch": 1.106255612092188, + "grad_norm": 0.07196861505508423, + "learning_rate": 4.208482953296838e-05, + "loss": 10.3053, + "step": 924 + }, + { + "epoch": 1.1074528584256211, + "grad_norm": 0.08277542144060135, + "learning_rate": 4.1991410667337896e-05, + "loss": 10.316, + "step": 925 + }, + { + "epoch": 1.1086501047590542, + "grad_norm": 0.10728137940168381, + "learning_rate": 4.189802048569089e-05, + "loss": 10.293, + "step": 926 + }, + { + "epoch": 1.1098473510924873, + "grad_norm": 0.0857047513127327, + "learning_rate": 4.180465932251859e-05, + "loss": 10.305, + "step": 927 + }, + { + "epoch": 1.1110445974259204, + "grad_norm": 0.1254672110080719, + "learning_rate": 4.171132751220818e-05, + "loss": 10.287, + "step": 928 + }, + { + "epoch": 1.1122418437593535, + "grad_norm": 0.12211690843105316, + "learning_rate": 4.161802538904177e-05, + "loss": 10.3934, + "step": 929 + }, + { + "epoch": 1.1134390900927866, + "grad_norm": 0.1617048978805542, + "learning_rate": 4.1524753287195165e-05, + "loss": 10.2896, + "step": 930 + }, + { + "epoch": 1.1146363364262197, + "grad_norm": 0.12853345274925232, + "learning_rate": 4.1431511540736626e-05, + "loss": 10.2947, + "step": 931 + }, + { + "epoch": 1.1158335827596528, + "grad_norm": 0.144204780459404, + "learning_rate": 4.1338300483625615e-05, + "loss": 10.3067, + "step": 932 + }, + { + "epoch": 1.117030829093086, + "grad_norm": 0.1469617486000061, + "learning_rate": 4.124512044971182e-05, + "loss": 10.2439, + "step": 933 + }, + { + "epoch": 1.118228075426519, + "grad_norm": 0.22333326935768127, + "learning_rate": 4.1151971772733686e-05, + "loss": 10.2991, + "step": 934 + }, + { + "epoch": 1.1194253217599521, + "grad_norm": 0.3190890848636627, + "learning_rate": 4.105885478631741e-05, + "loss": 10.5589, + "step": 935 + }, + { + "epoch": 1.1206225680933852, + "grad_norm": 0.1713470071554184, + "learning_rate": 4.0965769823975696e-05, + "loss": 9.9278, + "step": 936 + }, + { + "epoch": 1.1218198144268183, + "grad_norm": 0.19788843393325806, + "learning_rate": 4.08727172191065e-05, + "loss": 10.3014, + "step": 937 + }, + { + "epoch": 1.1230170607602514, + "grad_norm": 0.17718441784381866, + "learning_rate": 4.07796973049919e-05, + "loss": 10.2948, + "step": 938 + }, + { + "epoch": 1.1242143070936845, + "grad_norm": 0.22185468673706055, + "learning_rate": 4.068671041479694e-05, + "loss": 10.3078, + "step": 939 + }, + { + "epoch": 1.1254115534271176, + "grad_norm": 0.18736934661865234, + "learning_rate": 4.059375688156832e-05, + "loss": 10.3438, + "step": 940 + }, + { + "epoch": 1.1266087997605507, + "grad_norm": 0.24128462374210358, + "learning_rate": 4.0500837038233276e-05, + "loss": 10.3447, + "step": 941 + }, + { + "epoch": 1.1278060460939838, + "grad_norm": 0.20960354804992676, + "learning_rate": 4.04079512175984e-05, + "loss": 10.3567, + "step": 942 + }, + { + "epoch": 1.129003292427417, + "grad_norm": 0.17799775302410126, + "learning_rate": 4.031509975234844e-05, + "loss": 10.2962, + "step": 943 + }, + { + "epoch": 1.13020053876085, + "grad_norm": 0.13640834391117096, + "learning_rate": 4.022228297504503e-05, + "loss": 10.3326, + "step": 944 + }, + { + "epoch": 1.1313977850942831, + "grad_norm": 0.11730225384235382, + "learning_rate": 4.012950121812565e-05, + "loss": 10.33, + "step": 945 + }, + { + "epoch": 1.1325950314277162, + "grad_norm": 0.0996352881193161, + "learning_rate": 4.003675481390228e-05, + "loss": 10.3294, + "step": 946 + }, + { + "epoch": 1.1337922777611493, + "grad_norm": 0.12392750382423401, + "learning_rate": 3.994404409456031e-05, + "loss": 10.3289, + "step": 947 + }, + { + "epoch": 1.1349895240945824, + "grad_norm": 0.09263347834348679, + "learning_rate": 3.985136939215731e-05, + "loss": 10.3326, + "step": 948 + }, + { + "epoch": 1.1361867704280155, + "grad_norm": 0.07573807239532471, + "learning_rate": 3.975873103862188e-05, + "loss": 10.3181, + "step": 949 + }, + { + "epoch": 1.1373840167614486, + "grad_norm": 0.08924584090709686, + "learning_rate": 3.966612936575235e-05, + "loss": 10.3264, + "step": 950 + }, + { + "epoch": 1.1385812630948817, + "grad_norm": 0.10707958787679672, + "learning_rate": 3.957356470521578e-05, + "loss": 10.316, + "step": 951 + }, + { + "epoch": 1.1397785094283148, + "grad_norm": 0.18007458746433258, + "learning_rate": 3.94810373885466e-05, + "loss": 10.3188, + "step": 952 + }, + { + "epoch": 1.140975755761748, + "grad_norm": 0.18197913467884064, + "learning_rate": 3.938854774714546e-05, + "loss": 10.3237, + "step": 953 + }, + { + "epoch": 1.142173002095181, + "grad_norm": 0.1612328290939331, + "learning_rate": 3.929609611227817e-05, + "loss": 10.3031, + "step": 954 + }, + { + "epoch": 1.1433702484286141, + "grad_norm": 0.112253338098526, + "learning_rate": 3.9203682815074316e-05, + "loss": 10.3147, + "step": 955 + }, + { + "epoch": 1.1445674947620472, + "grad_norm": 0.10741689801216125, + "learning_rate": 3.911130818652621e-05, + "loss": 10.3083, + "step": 956 + }, + { + "epoch": 1.1457647410954803, + "grad_norm": 0.1522885113954544, + "learning_rate": 3.9018972557487694e-05, + "loss": 10.3016, + "step": 957 + }, + { + "epoch": 1.1469619874289134, + "grad_norm": 0.09613947570323944, + "learning_rate": 3.8926676258672866e-05, + "loss": 10.3117, + "step": 958 + }, + { + "epoch": 1.1481592337623465, + "grad_norm": 0.0654083788394928, + "learning_rate": 3.883441962065499e-05, + "loss": 10.3079, + "step": 959 + }, + { + "epoch": 1.1493564800957796, + "grad_norm": 0.06491865962743759, + "learning_rate": 3.87422029738653e-05, + "loss": 10.3167, + "step": 960 + }, + { + "epoch": 1.1505537264292127, + "grad_norm": 0.08449612557888031, + "learning_rate": 3.865002664859178e-05, + "loss": 10.3138, + "step": 961 + }, + { + "epoch": 1.1517509727626458, + "grad_norm": 0.07651477307081223, + "learning_rate": 3.855789097497794e-05, + "loss": 10.3146, + "step": 962 + }, + { + "epoch": 1.152948219096079, + "grad_norm": 0.09476931393146515, + "learning_rate": 3.846579628302179e-05, + "loss": 10.3032, + "step": 963 + }, + { + "epoch": 1.154145465429512, + "grad_norm": 0.08630012720823288, + "learning_rate": 3.837374290257449e-05, + "loss": 10.3064, + "step": 964 + }, + { + "epoch": 1.1553427117629451, + "grad_norm": 0.1085483506321907, + "learning_rate": 3.828173116333925e-05, + "loss": 10.305, + "step": 965 + }, + { + "epoch": 1.1565399580963782, + "grad_norm": 0.10933104157447815, + "learning_rate": 3.818976139487017e-05, + "loss": 10.2915, + "step": 966 + }, + { + "epoch": 1.1577372044298113, + "grad_norm": 0.07977797836065292, + "learning_rate": 3.809783392657096e-05, + "loss": 10.3031, + "step": 967 + }, + { + "epoch": 1.1589344507632444, + "grad_norm": 0.09669488668441772, + "learning_rate": 3.800594908769385e-05, + "loss": 10.3051, + "step": 968 + }, + { + "epoch": 1.1601316970966775, + "grad_norm": 0.12084520608186722, + "learning_rate": 3.791410720733844e-05, + "loss": 10.2954, + "step": 969 + }, + { + "epoch": 1.1613289434301108, + "grad_norm": 0.06845562160015106, + "learning_rate": 3.7822308614450406e-05, + "loss": 10.3, + "step": 970 + }, + { + "epoch": 1.162526189763544, + "grad_norm": 0.0728553757071495, + "learning_rate": 3.773055363782039e-05, + "loss": 10.323, + "step": 971 + }, + { + "epoch": 1.163723436096977, + "grad_norm": 0.1105380430817604, + "learning_rate": 3.763884260608284e-05, + "loss": 10.3143, + "step": 972 + }, + { + "epoch": 1.1649206824304101, + "grad_norm": 0.07627365738153458, + "learning_rate": 3.7547175847714806e-05, + "loss": 10.2791, + "step": 973 + }, + { + "epoch": 1.1661179287638432, + "grad_norm": 0.07864202558994293, + "learning_rate": 3.745555369103471e-05, + "loss": 10.3202, + "step": 974 + }, + { + "epoch": 1.1673151750972763, + "grad_norm": 0.09418646991252899, + "learning_rate": 3.736397646420135e-05, + "loss": 10.2617, + "step": 975 + }, + { + "epoch": 1.1685124214307094, + "grad_norm": 0.10002101212739944, + "learning_rate": 3.727244449521245e-05, + "loss": 10.2926, + "step": 976 + }, + { + "epoch": 1.1697096677641425, + "grad_norm": 0.11879831552505493, + "learning_rate": 3.718095811190374e-05, + "loss": 10.2801, + "step": 977 + }, + { + "epoch": 1.1709069140975756, + "grad_norm": 0.18288545310497284, + "learning_rate": 3.708951764194767e-05, + "loss": 10.2928, + "step": 978 + }, + { + "epoch": 1.1721041604310087, + "grad_norm": 0.13710980117321014, + "learning_rate": 3.699812341285219e-05, + "loss": 10.3638, + "step": 979 + }, + { + "epoch": 1.1733014067644418, + "grad_norm": 0.15309835970401764, + "learning_rate": 3.690677575195967e-05, + "loss": 10.2474, + "step": 980 + }, + { + "epoch": 1.174498653097875, + "grad_norm": 0.13174058496952057, + "learning_rate": 3.681547498644568e-05, + "loss": 10.2873, + "step": 981 + }, + { + "epoch": 1.175695899431308, + "grad_norm": 0.15391582250595093, + "learning_rate": 3.6724221443317855e-05, + "loss": 10.351, + "step": 982 + }, + { + "epoch": 1.1768931457647411, + "grad_norm": 0.13625121116638184, + "learning_rate": 3.663301544941463e-05, + "loss": 10.306, + "step": 983 + }, + { + "epoch": 1.1780903920981742, + "grad_norm": 0.24699892103672028, + "learning_rate": 3.65418573314042e-05, + "loss": 10.3619, + "step": 984 + }, + { + "epoch": 1.1792876384316073, + "grad_norm": 0.37313058972358704, + "learning_rate": 3.645074741578326e-05, + "loss": 10.0332, + "step": 985 + }, + { + "epoch": 1.1804848847650404, + "grad_norm": 0.19191929697990417, + "learning_rate": 3.635968602887585e-05, + "loss": 10.2804, + "step": 986 + }, + { + "epoch": 1.1816821310984735, + "grad_norm": 0.2202102541923523, + "learning_rate": 3.626867349683223e-05, + "loss": 10.3138, + "step": 987 + }, + { + "epoch": 1.1828793774319066, + "grad_norm": 0.1774865984916687, + "learning_rate": 3.6177710145627635e-05, + "loss": 10.2925, + "step": 988 + }, + { + "epoch": 1.1840766237653397, + "grad_norm": 0.20344260334968567, + "learning_rate": 3.608679630106117e-05, + "loss": 10.3248, + "step": 989 + }, + { + "epoch": 1.1852738700987728, + "grad_norm": 0.18478690087795258, + "learning_rate": 3.599593228875465e-05, + "loss": 10.3038, + "step": 990 + }, + { + "epoch": 1.186471116432206, + "grad_norm": 0.19371066987514496, + "learning_rate": 3.590511843415139e-05, + "loss": 10.3182, + "step": 991 + }, + { + "epoch": 1.187668362765639, + "grad_norm": 0.21246269345283508, + "learning_rate": 3.581435506251501e-05, + "loss": 10.3276, + "step": 992 + }, + { + "epoch": 1.1888656090990721, + "grad_norm": 0.1605980545282364, + "learning_rate": 3.572364249892841e-05, + "loss": 10.3401, + "step": 993 + }, + { + "epoch": 1.1900628554325052, + "grad_norm": 0.1444818526506424, + "learning_rate": 3.563298106829244e-05, + "loss": 10.3297, + "step": 994 + }, + { + "epoch": 1.1912601017659383, + "grad_norm": 0.13539260625839233, + "learning_rate": 3.554237109532483e-05, + "loss": 10.3078, + "step": 995 + }, + { + "epoch": 1.1924573480993714, + "grad_norm": 0.14990341663360596, + "learning_rate": 3.545181290455904e-05, + "loss": 10.336, + "step": 996 + }, + { + "epoch": 1.1936545944328045, + "grad_norm": 0.10911143571138382, + "learning_rate": 3.5361306820343e-05, + "loss": 10.2955, + "step": 997 + }, + { + "epoch": 1.1948518407662376, + "grad_norm": 0.12982481718063354, + "learning_rate": 3.527085316683805e-05, + "loss": 10.3127, + "step": 998 + }, + { + "epoch": 1.1960490870996707, + "grad_norm": 0.08096631616353989, + "learning_rate": 3.518045226801777e-05, + "loss": 10.3098, + "step": 999 + }, + { + "epoch": 1.1972463334331038, + "grad_norm": 0.08309336006641388, + "learning_rate": 3.509010444766674e-05, + "loss": 10.3167, + "step": 1000 + }, + { + "epoch": 1.198443579766537, + "grad_norm": 0.06321199238300323, + "learning_rate": 3.499981002937943e-05, + "loss": 10.3193, + "step": 1001 + }, + { + "epoch": 1.19964082609997, + "grad_norm": 0.08466428518295288, + "learning_rate": 3.490956933655909e-05, + "loss": 10.3141, + "step": 1002 + }, + { + "epoch": 1.2008380724334031, + "grad_norm": 0.1582259088754654, + "learning_rate": 3.481938269241653e-05, + "loss": 10.306, + "step": 1003 + }, + { + "epoch": 1.2020353187668362, + "grad_norm": 0.1640256941318512, + "learning_rate": 3.4729250419968906e-05, + "loss": 10.2939, + "step": 1004 + }, + { + "epoch": 1.2032325651002693, + "grad_norm": 0.14718499779701233, + "learning_rate": 3.463917284203876e-05, + "loss": 10.3063, + "step": 1005 + }, + { + "epoch": 1.2044298114337024, + "grad_norm": 0.1664147824048996, + "learning_rate": 3.4549150281252636e-05, + "loss": 10.3039, + "step": 1006 + }, + { + "epoch": 1.2056270577671355, + "grad_norm": 0.18028020858764648, + "learning_rate": 3.4459183060040044e-05, + "loss": 10.3039, + "step": 1007 + }, + { + "epoch": 1.2068243041005686, + "grad_norm": 0.11442149430513382, + "learning_rate": 3.4369271500632336e-05, + "loss": 10.302, + "step": 1008 + }, + { + "epoch": 1.2080215504340017, + "grad_norm": 0.08839763700962067, + "learning_rate": 3.427941592506144e-05, + "loss": 10.3002, + "step": 1009 + }, + { + "epoch": 1.2092187967674348, + "grad_norm": 0.0720866397023201, + "learning_rate": 3.41896166551588e-05, + "loss": 10.303, + "step": 1010 + }, + { + "epoch": 1.210416043100868, + "grad_norm": 0.07061614096164703, + "learning_rate": 3.4099874012554205e-05, + "loss": 10.3241, + "step": 1011 + }, + { + "epoch": 1.211613289434301, + "grad_norm": 0.06903503090143204, + "learning_rate": 3.4010188318674614e-05, + "loss": 10.3067, + "step": 1012 + }, + { + "epoch": 1.2128105357677341, + "grad_norm": 0.07008113712072372, + "learning_rate": 3.3920559894742975e-05, + "loss": 10.3188, + "step": 1013 + }, + { + "epoch": 1.2140077821011672, + "grad_norm": 0.09250228106975555, + "learning_rate": 3.383098906177719e-05, + "loss": 10.3029, + "step": 1014 + }, + { + "epoch": 1.2152050284346005, + "grad_norm": 0.06623218953609467, + "learning_rate": 3.3741476140588824e-05, + "loss": 10.2997, + "step": 1015 + }, + { + "epoch": 1.2164022747680336, + "grad_norm": 0.07896113395690918, + "learning_rate": 3.365202145178205e-05, + "loss": 10.3002, + "step": 1016 + }, + { + "epoch": 1.2175995211014667, + "grad_norm": 0.08111994713544846, + "learning_rate": 3.356262531575251e-05, + "loss": 10.3021, + "step": 1017 + }, + { + "epoch": 1.2187967674348998, + "grad_norm": 0.1251569241285324, + "learning_rate": 3.347328805268605e-05, + "loss": 10.2967, + "step": 1018 + }, + { + "epoch": 1.219994013768333, + "grad_norm": 0.08974624425172806, + "learning_rate": 3.3384009982557706e-05, + "loss": 10.3191, + "step": 1019 + }, + { + "epoch": 1.221191260101766, + "grad_norm": 0.08964145928621292, + "learning_rate": 3.329479142513051e-05, + "loss": 10.3148, + "step": 1020 + }, + { + "epoch": 1.2223885064351991, + "grad_norm": 0.08103568851947784, + "learning_rate": 3.320563269995432e-05, + "loss": 10.3118, + "step": 1021 + }, + { + "epoch": 1.2235857527686322, + "grad_norm": 0.15111342072486877, + "learning_rate": 3.3116534126364685e-05, + "loss": 10.3137, + "step": 1022 + }, + { + "epoch": 1.2247829991020653, + "grad_norm": 0.08325458317995071, + "learning_rate": 3.302749602348175e-05, + "loss": 10.2816, + "step": 1023 + }, + { + "epoch": 1.2259802454354984, + "grad_norm": 0.08655396103858948, + "learning_rate": 3.293851871020905e-05, + "loss": 10.2973, + "step": 1024 + }, + { + "epoch": 1.2271774917689315, + "grad_norm": 0.08695276081562042, + "learning_rate": 3.284960250523237e-05, + "loss": 10.2798, + "step": 1025 + }, + { + "epoch": 1.2283747381023646, + "grad_norm": 0.1016772910952568, + "learning_rate": 3.2760747727018694e-05, + "loss": 10.3121, + "step": 1026 + }, + { + "epoch": 1.2295719844357977, + "grad_norm": 0.09758095443248749, + "learning_rate": 3.267195469381492e-05, + "loss": 10.2777, + "step": 1027 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.0810599997639656, + "learning_rate": 3.2583223723646836e-05, + "loss": 10.3078, + "step": 1028 + }, + { + "epoch": 1.231966477102664, + "grad_norm": 0.2638009786605835, + "learning_rate": 3.249455513431797e-05, + "loss": 10.2972, + "step": 1029 + }, + { + "epoch": 1.233163723436097, + "grad_norm": 0.16153264045715332, + "learning_rate": 3.240594924340835e-05, + "loss": 10.4261, + "step": 1030 + }, + { + "epoch": 1.2343609697695301, + "grad_norm": 0.13728973269462585, + "learning_rate": 3.231740636827349e-05, + "loss": 10.1713, + "step": 1031 + }, + { + "epoch": 1.2355582161029632, + "grad_norm": 0.18863603472709656, + "learning_rate": 3.2228926826043224e-05, + "loss": 10.3618, + "step": 1032 + }, + { + "epoch": 1.2367554624363963, + "grad_norm": 0.15606525540351868, + "learning_rate": 3.2140510933620506e-05, + "loss": 10.2316, + "step": 1033 + }, + { + "epoch": 1.2379527087698294, + "grad_norm": 0.2645421624183655, + "learning_rate": 3.205215900768029e-05, + "loss": 10.5211, + "step": 1034 + }, + { + "epoch": 1.2391499551032625, + "grad_norm": 0.35228899121284485, + "learning_rate": 3.196387136466853e-05, + "loss": 9.9339, + "step": 1035 + }, + { + "epoch": 1.2403472014366956, + "grad_norm": 0.1876143217086792, + "learning_rate": 3.187564832080084e-05, + "loss": 10.2978, + "step": 1036 + }, + { + "epoch": 1.2415444477701287, + "grad_norm": 0.17238491773605347, + "learning_rate": 3.178749019206151e-05, + "loss": 10.2753, + "step": 1037 + }, + { + "epoch": 1.2427416941035618, + "grad_norm": 0.16385385394096375, + "learning_rate": 3.169939729420233e-05, + "loss": 10.2724, + "step": 1038 + }, + { + "epoch": 1.243938940436995, + "grad_norm": 0.17765510082244873, + "learning_rate": 3.161136994274141e-05, + "loss": 10.3178, + "step": 1039 + }, + { + "epoch": 1.245136186770428, + "grad_norm": 0.18540142476558685, + "learning_rate": 3.152340845296216e-05, + "loss": 10.3372, + "step": 1040 + }, + { + "epoch": 1.2463334331038611, + "grad_norm": 0.16380912065505981, + "learning_rate": 3.1435513139912045e-05, + "loss": 10.3301, + "step": 1041 + }, + { + "epoch": 1.2475306794372942, + "grad_norm": 0.15362080931663513, + "learning_rate": 3.1347684318401536e-05, + "loss": 10.2918, + "step": 1042 + }, + { + "epoch": 1.2487279257707273, + "grad_norm": 0.1350010186433792, + "learning_rate": 3.125992230300294e-05, + "loss": 10.3383, + "step": 1043 + }, + { + "epoch": 1.2499251721041604, + "grad_norm": 0.14435046911239624, + "learning_rate": 3.11722274080493e-05, + "loss": 10.3343, + "step": 1044 + }, + { + "epoch": 1.2511224184375935, + "grad_norm": 0.136815145611763, + "learning_rate": 3.1084599947633256e-05, + "loss": 10.3045, + "step": 1045 + }, + { + "epoch": 1.2523196647710266, + "grad_norm": 0.09616651386022568, + "learning_rate": 3.099704023560587e-05, + "loss": 10.3091, + "step": 1046 + }, + { + "epoch": 1.2535169111044597, + "grad_norm": 0.12489359825849533, + "learning_rate": 3.090954858557567e-05, + "loss": 10.3232, + "step": 1047 + }, + { + "epoch": 1.2547141574378928, + "grad_norm": 0.12322378903627396, + "learning_rate": 3.08221253109073e-05, + "loss": 10.3133, + "step": 1048 + }, + { + "epoch": 1.255911403771326, + "grad_norm": 0.10134118795394897, + "learning_rate": 3.073477072472054e-05, + "loss": 10.3029, + "step": 1049 + }, + { + "epoch": 1.257108650104759, + "grad_norm": 0.12778601050376892, + "learning_rate": 3.0647485139889145e-05, + "loss": 10.313, + "step": 1050 + }, + { + "epoch": 1.2583058964381921, + "grad_norm": 0.06709828227758408, + "learning_rate": 3.056026886903978e-05, + "loss": 10.3195, + "step": 1051 + }, + { + "epoch": 1.2595031427716252, + "grad_norm": 0.06393318623304367, + "learning_rate": 3.0473122224550787e-05, + "loss": 10.2833, + "step": 1052 + }, + { + "epoch": 1.2607003891050583, + "grad_norm": 0.06393256783485413, + "learning_rate": 3.0386045518551158e-05, + "loss": 10.3031, + "step": 1053 + }, + { + "epoch": 1.2618976354384914, + "grad_norm": 0.07667026668787003, + "learning_rate": 3.0299039062919416e-05, + "loss": 10.307, + "step": 1054 + }, + { + "epoch": 1.2630948817719245, + "grad_norm": 0.11269262433052063, + "learning_rate": 3.0212103169282414e-05, + "loss": 10.2947, + "step": 1055 + }, + { + "epoch": 1.2642921281053576, + "grad_norm": 0.11728843301534653, + "learning_rate": 3.01252381490143e-05, + "loss": 10.293, + "step": 1056 + }, + { + "epoch": 1.2654893744387907, + "grad_norm": 0.13670283555984497, + "learning_rate": 3.0038444313235424e-05, + "loss": 10.2914, + "step": 1057 + }, + { + "epoch": 1.2666866207722238, + "grad_norm": 0.13579019904136658, + "learning_rate": 2.995172197281113e-05, + "loss": 10.3127, + "step": 1058 + }, + { + "epoch": 1.267883867105657, + "grad_norm": 0.07041673362255096, + "learning_rate": 2.986507143835066e-05, + "loss": 10.3204, + "step": 1059 + }, + { + "epoch": 1.26908111343909, + "grad_norm": 0.11383993923664093, + "learning_rate": 2.9778493020206154e-05, + "loss": 10.2798, + "step": 1060 + }, + { + "epoch": 1.2702783597725231, + "grad_norm": 0.10600785911083221, + "learning_rate": 2.9691987028471407e-05, + "loss": 10.3009, + "step": 1061 + }, + { + "epoch": 1.2714756061059562, + "grad_norm": 0.06610407680273056, + "learning_rate": 2.96055537729808e-05, + "loss": 10.322, + "step": 1062 + }, + { + "epoch": 1.2726728524393893, + "grad_norm": 0.09653027355670929, + "learning_rate": 2.9519193563308234e-05, + "loss": 10.2953, + "step": 1063 + }, + { + "epoch": 1.2738700987728224, + "grad_norm": 0.09015870839357376, + "learning_rate": 2.943290670876595e-05, + "loss": 10.3223, + "step": 1064 + }, + { + "epoch": 1.2750673451062555, + "grad_norm": 0.09204290807247162, + "learning_rate": 2.9346693518403458e-05, + "loss": 10.2921, + "step": 1065 + }, + { + "epoch": 1.2762645914396886, + "grad_norm": 0.06748061627149582, + "learning_rate": 2.926055430100647e-05, + "loss": 10.3171, + "step": 1066 + }, + { + "epoch": 1.2774618377731217, + "grad_norm": 0.07431725412607193, + "learning_rate": 2.9174489365095713e-05, + "loss": 10.294, + "step": 1067 + }, + { + "epoch": 1.2786590841065548, + "grad_norm": 0.08900409191846848, + "learning_rate": 2.908849901892587e-05, + "loss": 10.2999, + "step": 1068 + }, + { + "epoch": 1.279856330439988, + "grad_norm": 0.08584905415773392, + "learning_rate": 2.9002583570484475e-05, + "loss": 10.3086, + "step": 1069 + }, + { + "epoch": 1.281053576773421, + "grad_norm": 0.10978789627552032, + "learning_rate": 2.8916743327490803e-05, + "loss": 10.3005, + "step": 1070 + }, + { + "epoch": 1.2822508231068541, + "grad_norm": 0.10748513042926788, + "learning_rate": 2.8830978597394776e-05, + "loss": 10.2998, + "step": 1071 + }, + { + "epoch": 1.2834480694402872, + "grad_norm": 0.11631134152412415, + "learning_rate": 2.8745289687375843e-05, + "loss": 10.3153, + "step": 1072 + }, + { + "epoch": 1.2846453157737203, + "grad_norm": 0.0836760401725769, + "learning_rate": 2.8659676904341903e-05, + "loss": 10.2951, + "step": 1073 + }, + { + "epoch": 1.2858425621071534, + "grad_norm": 0.12832395732402802, + "learning_rate": 2.8574140554928175e-05, + "loss": 10.3112, + "step": 1074 + }, + { + "epoch": 1.2870398084405865, + "grad_norm": 0.12957684695720673, + "learning_rate": 2.848868094549615e-05, + "loss": 10.2542, + "step": 1075 + }, + { + "epoch": 1.2882370547740196, + "grad_norm": 0.11982838809490204, + "learning_rate": 2.8403298382132437e-05, + "loss": 10.2846, + "step": 1076 + }, + { + "epoch": 1.289434301107453, + "grad_norm": 0.0989912822842598, + "learning_rate": 2.8317993170647682e-05, + "loss": 10.2962, + "step": 1077 + }, + { + "epoch": 1.290631547440886, + "grad_norm": 0.19544099271297455, + "learning_rate": 2.8232765616575563e-05, + "loss": 10.2926, + "step": 1078 + }, + { + "epoch": 1.2918287937743191, + "grad_norm": 0.08971533924341202, + "learning_rate": 2.8147616025171504e-05, + "loss": 10.3442, + "step": 1079 + }, + { + "epoch": 1.2930260401077522, + "grad_norm": 0.13074661791324615, + "learning_rate": 2.8062544701411742e-05, + "loss": 10.3157, + "step": 1080 + }, + { + "epoch": 1.2942232864411853, + "grad_norm": 0.12712165713310242, + "learning_rate": 2.7977551949992225e-05, + "loss": 10.1971, + "step": 1081 + }, + { + "epoch": 1.2954205327746184, + "grad_norm": 0.13777250051498413, + "learning_rate": 2.789263807532746e-05, + "loss": 10.2694, + "step": 1082 + }, + { + "epoch": 1.2966177791080515, + "grad_norm": 0.1710941195487976, + "learning_rate": 2.780780338154937e-05, + "loss": 10.3681, + "step": 1083 + }, + { + "epoch": 1.2978150254414846, + "grad_norm": 0.19491247832775116, + "learning_rate": 2.7723048172506395e-05, + "loss": 10.3156, + "step": 1084 + }, + { + "epoch": 1.2990122717749177, + "grad_norm": 0.36479124426841736, + "learning_rate": 2.763837275176224e-05, + "loss": 10.1861, + "step": 1085 + }, + { + "epoch": 1.3002095181083508, + "grad_norm": 0.2100205272436142, + "learning_rate": 2.7553777422594774e-05, + "loss": 10.1825, + "step": 1086 + }, + { + "epoch": 1.301406764441784, + "grad_norm": 0.174909770488739, + "learning_rate": 2.7469262487995123e-05, + "loss": 10.3011, + "step": 1087 + }, + { + "epoch": 1.302604010775217, + "grad_norm": 0.15478569269180298, + "learning_rate": 2.7384828250666396e-05, + "loss": 10.2704, + "step": 1088 + }, + { + "epoch": 1.3038012571086501, + "grad_norm": 0.12674710154533386, + "learning_rate": 2.7300475013022663e-05, + "loss": 10.3054, + "step": 1089 + }, + { + "epoch": 1.3049985034420832, + "grad_norm": 0.12266760319471359, + "learning_rate": 2.721620307718793e-05, + "loss": 10.346, + "step": 1090 + }, + { + "epoch": 1.3061957497755163, + "grad_norm": 0.1265881061553955, + "learning_rate": 2.713201274499496e-05, + "loss": 10.3101, + "step": 1091 + }, + { + "epoch": 1.3073929961089494, + "grad_norm": 0.1043478325009346, + "learning_rate": 2.7047904317984273e-05, + "loss": 10.3042, + "step": 1092 + }, + { + "epoch": 1.3085902424423825, + "grad_norm": 0.14881400763988495, + "learning_rate": 2.6963878097403027e-05, + "loss": 10.3102, + "step": 1093 + }, + { + "epoch": 1.3097874887758156, + "grad_norm": 0.1407453864812851, + "learning_rate": 2.687993438420392e-05, + "loss": 10.3146, + "step": 1094 + }, + { + "epoch": 1.3109847351092487, + "grad_norm": 0.14604611694812775, + "learning_rate": 2.6796073479044174e-05, + "loss": 10.3393, + "step": 1095 + }, + { + "epoch": 1.3121819814426818, + "grad_norm": 0.15485920011997223, + "learning_rate": 2.6712295682284403e-05, + "loss": 10.3077, + "step": 1096 + }, + { + "epoch": 1.313379227776115, + "grad_norm": 0.1861550211906433, + "learning_rate": 2.6628601293987542e-05, + "loss": 10.3179, + "step": 1097 + }, + { + "epoch": 1.314576474109548, + "grad_norm": 0.10435593873262405, + "learning_rate": 2.65449906139178e-05, + "loss": 10.3072, + "step": 1098 + }, + { + "epoch": 1.3157737204429811, + "grad_norm": 0.13269411027431488, + "learning_rate": 2.6461463941539628e-05, + "loss": 10.3093, + "step": 1099 + }, + { + "epoch": 1.3169709667764142, + "grad_norm": 0.08386247605085373, + "learning_rate": 2.6378021576016466e-05, + "loss": 10.3089, + "step": 1100 + }, + { + "epoch": 1.3181682131098473, + "grad_norm": 0.09445594251155853, + "learning_rate": 2.6294663816209875e-05, + "loss": 10.3036, + "step": 1101 + }, + { + "epoch": 1.3193654594432804, + "grad_norm": 0.06842301785945892, + "learning_rate": 2.6211390960678413e-05, + "loss": 10.3015, + "step": 1102 + }, + { + "epoch": 1.3205627057767135, + "grad_norm": 0.06476344168186188, + "learning_rate": 2.612820330767651e-05, + "loss": 10.2952, + "step": 1103 + }, + { + "epoch": 1.3217599521101466, + "grad_norm": 0.10115151852369308, + "learning_rate": 2.604510115515336e-05, + "loss": 10.2927, + "step": 1104 + }, + { + "epoch": 1.3229571984435797, + "grad_norm": 0.06991171836853027, + "learning_rate": 2.5962084800752063e-05, + "loss": 10.3076, + "step": 1105 + }, + { + "epoch": 1.3241544447770128, + "grad_norm": 0.07870890200138092, + "learning_rate": 2.5879154541808337e-05, + "loss": 10.2997, + "step": 1106 + }, + { + "epoch": 1.325351691110446, + "grad_norm": 0.07721219211816788, + "learning_rate": 2.579631067534949e-05, + "loss": 10.2955, + "step": 1107 + }, + { + "epoch": 1.326548937443879, + "grad_norm": 0.09499223530292511, + "learning_rate": 2.5713553498093505e-05, + "loss": 10.2991, + "step": 1108 + }, + { + "epoch": 1.3277461837773121, + "grad_norm": 0.07830832898616791, + "learning_rate": 2.563088330644783e-05, + "loss": 10.3065, + "step": 1109 + }, + { + "epoch": 1.3289434301107452, + "grad_norm": 0.06924288719892502, + "learning_rate": 2.554830039650834e-05, + "loss": 10.3087, + "step": 1110 + }, + { + "epoch": 1.3301406764441783, + "grad_norm": 0.10514383763074875, + "learning_rate": 2.546580506405833e-05, + "loss": 10.3167, + "step": 1111 + }, + { + "epoch": 1.3313379227776114, + "grad_norm": 0.08361051976680756, + "learning_rate": 2.5383397604567394e-05, + "loss": 10.2955, + "step": 1112 + }, + { + "epoch": 1.3325351691110445, + "grad_norm": 0.11651310324668884, + "learning_rate": 2.530107831319042e-05, + "loss": 10.2938, + "step": 1113 + }, + { + "epoch": 1.3337324154444776, + "grad_norm": 0.08024927228689194, + "learning_rate": 2.5218847484766495e-05, + "loss": 10.3093, + "step": 1114 + }, + { + "epoch": 1.3349296617779107, + "grad_norm": 0.07790761440992355, + "learning_rate": 2.5136705413817875e-05, + "loss": 10.2938, + "step": 1115 + }, + { + "epoch": 1.336126908111344, + "grad_norm": 0.07910799980163574, + "learning_rate": 2.5054652394548893e-05, + "loss": 10.2544, + "step": 1116 + }, + { + "epoch": 1.3373241544447771, + "grad_norm": 0.07129036635160446, + "learning_rate": 2.497268872084495e-05, + "loss": 10.317, + "step": 1117 + }, + { + "epoch": 1.3385214007782102, + "grad_norm": 0.07619348913431168, + "learning_rate": 2.4890814686271448e-05, + "loss": 10.2929, + "step": 1118 + }, + { + "epoch": 1.3397186471116433, + "grad_norm": 0.06521634012460709, + "learning_rate": 2.480903058407269e-05, + "loss": 10.2896, + "step": 1119 + }, + { + "epoch": 1.3409158934450764, + "grad_norm": 0.11176613718271255, + "learning_rate": 2.4727336707170973e-05, + "loss": 10.294, + "step": 1120 + }, + { + "epoch": 1.3421131397785095, + "grad_norm": 0.08429154753684998, + "learning_rate": 2.4645733348165307e-05, + "loss": 10.3, + "step": 1121 + }, + { + "epoch": 1.3433103861119426, + "grad_norm": 0.12477404624223709, + "learning_rate": 2.456422079933056e-05, + "loss": 10.3017, + "step": 1122 + }, + { + "epoch": 1.3445076324453757, + "grad_norm": 0.0852007120847702, + "learning_rate": 2.4482799352616397e-05, + "loss": 10.2909, + "step": 1123 + }, + { + "epoch": 1.3457048787788088, + "grad_norm": 0.08019814640283585, + "learning_rate": 2.4401469299646133e-05, + "loss": 10.2955, + "step": 1124 + }, + { + "epoch": 1.346902125112242, + "grad_norm": 0.10162685066461563, + "learning_rate": 2.43202309317157e-05, + "loss": 10.2978, + "step": 1125 + }, + { + "epoch": 1.348099371445675, + "grad_norm": 0.10296429693698883, + "learning_rate": 2.4239084539792745e-05, + "loss": 10.2967, + "step": 1126 + }, + { + "epoch": 1.3492966177791081, + "grad_norm": 0.08111916482448578, + "learning_rate": 2.4158030414515448e-05, + "loss": 10.2745, + "step": 1127 + }, + { + "epoch": 1.3504938641125412, + "grad_norm": 0.11755450814962387, + "learning_rate": 2.4077068846191453e-05, + "loss": 10.2789, + "step": 1128 + }, + { + "epoch": 1.3516911104459743, + "grad_norm": 0.0978989228606224, + "learning_rate": 2.399620012479702e-05, + "loss": 10.3238, + "step": 1129 + }, + { + "epoch": 1.3528883567794074, + "grad_norm": 0.13354922831058502, + "learning_rate": 2.391542453997578e-05, + "loss": 10.2723, + "step": 1130 + }, + { + "epoch": 1.3540856031128405, + "grad_norm": 0.10712044686079025, + "learning_rate": 2.38347423810378e-05, + "loss": 10.2478, + "step": 1131 + }, + { + "epoch": 1.3552828494462736, + "grad_norm": 0.11602184921503067, + "learning_rate": 2.375415393695854e-05, + "loss": 10.2646, + "step": 1132 + }, + { + "epoch": 1.3564800957797067, + "grad_norm": 0.14657771587371826, + "learning_rate": 2.3673659496377788e-05, + "loss": 10.3714, + "step": 1133 + }, + { + "epoch": 1.3576773421131398, + "grad_norm": 0.23851728439331055, + "learning_rate": 2.3593259347598657e-05, + "loss": 10.6428, + "step": 1134 + }, + { + "epoch": 1.358874588446573, + "grad_norm": 0.4067697823047638, + "learning_rate": 2.3512953778586537e-05, + "loss": 9.7241, + "step": 1135 + }, + { + "epoch": 1.360071834780006, + "grad_norm": 0.21311528980731964, + "learning_rate": 2.3432743076968066e-05, + "loss": 10.3115, + "step": 1136 + }, + { + "epoch": 1.3612690811134391, + "grad_norm": 0.14670036733150482, + "learning_rate": 2.3352627530030075e-05, + "loss": 10.2592, + "step": 1137 + }, + { + "epoch": 1.3624663274468722, + "grad_norm": 0.14863643050193787, + "learning_rate": 2.3272607424718675e-05, + "loss": 10.2586, + "step": 1138 + }, + { + "epoch": 1.3636635737803053, + "grad_norm": 0.12885111570358276, + "learning_rate": 2.3192683047637996e-05, + "loss": 10.3163, + "step": 1139 + }, + { + "epoch": 1.3648608201137384, + "grad_norm": 0.09790987521409988, + "learning_rate": 2.3112854685049397e-05, + "loss": 10.3097, + "step": 1140 + }, + { + "epoch": 1.3660580664471715, + "grad_norm": 0.09870362281799316, + "learning_rate": 2.303312262287037e-05, + "loss": 10.308, + "step": 1141 + }, + { + "epoch": 1.3672553127806046, + "grad_norm": 0.09395050257444382, + "learning_rate": 2.29534871466734e-05, + "loss": 10.3211, + "step": 1142 + }, + { + "epoch": 1.3684525591140377, + "grad_norm": 0.15811267495155334, + "learning_rate": 2.2873948541685087e-05, + "loss": 10.3323, + "step": 1143 + }, + { + "epoch": 1.3696498054474708, + "grad_norm": 0.14397980272769928, + "learning_rate": 2.2794507092785106e-05, + "loss": 10.3343, + "step": 1144 + }, + { + "epoch": 1.370847051780904, + "grad_norm": 0.0969587042927742, + "learning_rate": 2.271516308450511e-05, + "loss": 10.2892, + "step": 1145 + }, + { + "epoch": 1.372044298114337, + "grad_norm": 0.09590260684490204, + "learning_rate": 2.2635916801027706e-05, + "loss": 10.3108, + "step": 1146 + }, + { + "epoch": 1.3732415444477701, + "grad_norm": 0.13256710767745972, + "learning_rate": 2.2556768526185594e-05, + "loss": 10.2958, + "step": 1147 + }, + { + "epoch": 1.3744387907812032, + "grad_norm": 0.14084471762180328, + "learning_rate": 2.2477718543460373e-05, + "loss": 10.3039, + "step": 1148 + }, + { + "epoch": 1.3756360371146363, + "grad_norm": 0.12222900241613388, + "learning_rate": 2.23987671359816e-05, + "loss": 10.3224, + "step": 1149 + }, + { + "epoch": 1.3768332834480694, + "grad_norm": 0.11203251779079437, + "learning_rate": 2.2319914586525777e-05, + "loss": 10.3071, + "step": 1150 + }, + { + "epoch": 1.3780305297815025, + "grad_norm": 0.0766541063785553, + "learning_rate": 2.224116117751533e-05, + "loss": 10.2893, + "step": 1151 + }, + { + "epoch": 1.3792277761149356, + "grad_norm": 0.058683354407548904, + "learning_rate": 2.21625071910176e-05, + "loss": 10.3032, + "step": 1152 + }, + { + "epoch": 1.3804250224483687, + "grad_norm": 0.05576328933238983, + "learning_rate": 2.2083952908743834e-05, + "loss": 10.2849, + "step": 1153 + }, + { + "epoch": 1.3816222687818018, + "grad_norm": 0.06732963025569916, + "learning_rate": 2.2005498612048155e-05, + "loss": 10.2899, + "step": 1154 + }, + { + "epoch": 1.382819515115235, + "grad_norm": 0.09074336290359497, + "learning_rate": 2.1927144581926596e-05, + "loss": 10.2893, + "step": 1155 + }, + { + "epoch": 1.384016761448668, + "grad_norm": 0.06785799562931061, + "learning_rate": 2.1848891099016057e-05, + "loss": 10.3005, + "step": 1156 + }, + { + "epoch": 1.3852140077821011, + "grad_norm": 0.056347865611314774, + "learning_rate": 2.1770738443593315e-05, + "loss": 10.3056, + "step": 1157 + }, + { + "epoch": 1.3864112541155342, + "grad_norm": 0.0817791149020195, + "learning_rate": 2.1692686895574005e-05, + "loss": 10.3019, + "step": 1158 + }, + { + "epoch": 1.3876085004489673, + "grad_norm": 0.06720899790525436, + "learning_rate": 2.161473673451169e-05, + "loss": 10.2965, + "step": 1159 + }, + { + "epoch": 1.3888057467824004, + "grad_norm": 0.06846219301223755, + "learning_rate": 2.1536888239596714e-05, + "loss": 10.3004, + "step": 1160 + }, + { + "epoch": 1.3900029931158335, + "grad_norm": 0.07374999672174454, + "learning_rate": 2.1459141689655315e-05, + "loss": 10.3111, + "step": 1161 + }, + { + "epoch": 1.3912002394492666, + "grad_norm": 0.09747304767370224, + "learning_rate": 2.1381497363148673e-05, + "loss": 10.2801, + "step": 1162 + }, + { + "epoch": 1.3923974857826997, + "grad_norm": 0.09986566752195358, + "learning_rate": 2.1303955538171726e-05, + "loss": 10.287, + "step": 1163 + }, + { + "epoch": 1.3935947321161328, + "grad_norm": 0.08550732582807541, + "learning_rate": 2.1226516492452336e-05, + "loss": 10.2798, + "step": 1164 + }, + { + "epoch": 1.394791978449566, + "grad_norm": 0.07137460261583328, + "learning_rate": 2.114918050335029e-05, + "loss": 10.3052, + "step": 1165 + }, + { + "epoch": 1.395989224782999, + "grad_norm": 0.07181624323129654, + "learning_rate": 2.1071947847856222e-05, + "loss": 10.2942, + "step": 1166 + }, + { + "epoch": 1.3971864711164321, + "grad_norm": 0.07651017606258392, + "learning_rate": 2.0994818802590606e-05, + "loss": 10.2856, + "step": 1167 + }, + { + "epoch": 1.3983837174498652, + "grad_norm": 0.07668477296829224, + "learning_rate": 2.091779364380293e-05, + "loss": 10.3089, + "step": 1168 + }, + { + "epoch": 1.3995809637832983, + "grad_norm": 0.08376431465148926, + "learning_rate": 2.084087264737052e-05, + "loss": 10.2829, + "step": 1169 + }, + { + "epoch": 1.4007782101167314, + "grad_norm": 0.06551386415958405, + "learning_rate": 2.0764056088797645e-05, + "loss": 10.2781, + "step": 1170 + }, + { + "epoch": 1.4019754564501645, + "grad_norm": 0.0657748430967331, + "learning_rate": 2.0687344243214535e-05, + "loss": 10.3368, + "step": 1171 + }, + { + "epoch": 1.4031727027835976, + "grad_norm": 0.08696157485246658, + "learning_rate": 2.061073738537635e-05, + "loss": 10.2924, + "step": 1172 + }, + { + "epoch": 1.4043699491170307, + "grad_norm": 0.10079991072416306, + "learning_rate": 2.0534235789662226e-05, + "loss": 10.2701, + "step": 1173 + }, + { + "epoch": 1.4055671954504638, + "grad_norm": 0.08060432970523834, + "learning_rate": 2.045783973007429e-05, + "loss": 10.3152, + "step": 1174 + }, + { + "epoch": 1.406764441783897, + "grad_norm": 0.09744574874639511, + "learning_rate": 2.0381549480236685e-05, + "loss": 10.2672, + "step": 1175 + }, + { + "epoch": 1.40796168811733, + "grad_norm": 0.13688093423843384, + "learning_rate": 2.030536531339456e-05, + "loss": 10.2788, + "step": 1176 + }, + { + "epoch": 1.4091589344507631, + "grad_norm": 0.09297854453325272, + "learning_rate": 2.0229287502413124e-05, + "loss": 10.2712, + "step": 1177 + }, + { + "epoch": 1.4103561807841962, + "grad_norm": 0.11464229226112366, + "learning_rate": 2.0153316319776662e-05, + "loss": 10.3233, + "step": 1178 + }, + { + "epoch": 1.4115534271176293, + "grad_norm": 0.13123725354671478, + "learning_rate": 2.007745203758751e-05, + "loss": 10.2931, + "step": 1179 + }, + { + "epoch": 1.4127506734510624, + "grad_norm": 0.12522581219673157, + "learning_rate": 2.000169492756523e-05, + "loss": 10.2026, + "step": 1180 + }, + { + "epoch": 1.4139479197844955, + "grad_norm": 0.11910898983478546, + "learning_rate": 1.9926045261045402e-05, + "loss": 10.2829, + "step": 1181 + }, + { + "epoch": 1.4151451661179288, + "grad_norm": 0.13346028327941895, + "learning_rate": 1.985050330897883e-05, + "loss": 10.4279, + "step": 1182 + }, + { + "epoch": 1.416342412451362, + "grad_norm": 0.14857794344425201, + "learning_rate": 1.977506934193059e-05, + "loss": 10.193, + "step": 1183 + }, + { + "epoch": 1.417539658784795, + "grad_norm": 0.2539500892162323, + "learning_rate": 1.9699743630078883e-05, + "loss": 10.4486, + "step": 1184 + }, + { + "epoch": 1.4187369051182281, + "grad_norm": 0.3851333260536194, + "learning_rate": 1.9624526443214224e-05, + "loss": 9.9465, + "step": 1185 + }, + { + "epoch": 1.4199341514516612, + "grad_norm": 0.191562220454216, + "learning_rate": 1.954941805073848e-05, + "loss": 10.2629, + "step": 1186 + }, + { + "epoch": 1.4211313977850943, + "grad_norm": 0.17502020299434662, + "learning_rate": 1.947441872166379e-05, + "loss": 10.3208, + "step": 1187 + }, + { + "epoch": 1.4223286441185274, + "grad_norm": 0.15126878023147583, + "learning_rate": 1.9399528724611644e-05, + "loss": 10.2725, + "step": 1188 + }, + { + "epoch": 1.4235258904519605, + "grad_norm": 0.139674574136734, + "learning_rate": 1.932474832781203e-05, + "loss": 10.2853, + "step": 1189 + }, + { + "epoch": 1.4247231367853936, + "grad_norm": 0.1290808469057083, + "learning_rate": 1.9250077799102322e-05, + "loss": 10.2973, + "step": 1190 + }, + { + "epoch": 1.4259203831188267, + "grad_norm": 0.1300542801618576, + "learning_rate": 1.91755174059264e-05, + "loss": 10.352, + "step": 1191 + }, + { + "epoch": 1.4271176294522598, + "grad_norm": 0.1189158633351326, + "learning_rate": 1.9101067415333684e-05, + "loss": 10.2859, + "step": 1192 + }, + { + "epoch": 1.428314875785693, + "grad_norm": 0.11389727145433426, + "learning_rate": 1.902672809397816e-05, + "loss": 10.294, + "step": 1193 + }, + { + "epoch": 1.429512122119126, + "grad_norm": 0.09121085703372955, + "learning_rate": 1.8952499708117432e-05, + "loss": 10.3209, + "step": 1194 + }, + { + "epoch": 1.4307093684525591, + "grad_norm": 0.1272386759519577, + "learning_rate": 1.8878382523611786e-05, + "loss": 10.3164, + "step": 1195 + }, + { + "epoch": 1.4319066147859922, + "grad_norm": 0.11616390198469162, + "learning_rate": 1.8804376805923223e-05, + "loss": 10.3176, + "step": 1196 + }, + { + "epoch": 1.4331038611194253, + "grad_norm": 0.15376238524913788, + "learning_rate": 1.873048282011449e-05, + "loss": 10.2901, + "step": 1197 + }, + { + "epoch": 1.4343011074528584, + "grad_norm": 0.13659100234508514, + "learning_rate": 1.8656700830848174e-05, + "loss": 10.2956, + "step": 1198 + }, + { + "epoch": 1.4354983537862915, + "grad_norm": 0.1524030566215515, + "learning_rate": 1.8583031102385707e-05, + "loss": 10.3008, + "step": 1199 + }, + { + "epoch": 1.4366956001197246, + "grad_norm": 0.11037743836641312, + "learning_rate": 1.850947389858643e-05, + "loss": 10.306, + "step": 1200 + }, + { + "epoch": 1.4378928464531577, + "grad_norm": 0.12072794884443283, + "learning_rate": 1.8436029482906748e-05, + "loss": 10.3131, + "step": 1201 + }, + { + "epoch": 1.4390900927865908, + "grad_norm": 0.06150706484913826, + "learning_rate": 1.8362698118398967e-05, + "loss": 10.3137, + "step": 1202 + }, + { + "epoch": 1.440287339120024, + "grad_norm": 0.06778173893690109, + "learning_rate": 1.8289480067710557e-05, + "loss": 10.2823, + "step": 1203 + }, + { + "epoch": 1.441484585453457, + "grad_norm": 0.05693823844194412, + "learning_rate": 1.821637559308315e-05, + "loss": 10.2915, + "step": 1204 + }, + { + "epoch": 1.4426818317868901, + "grad_norm": 0.06075819954276085, + "learning_rate": 1.8143384956351578e-05, + "loss": 10.2963, + "step": 1205 + }, + { + "epoch": 1.4438790781203232, + "grad_norm": 0.06289898604154587, + "learning_rate": 1.8070508418942876e-05, + "loss": 10.2835, + "step": 1206 + }, + { + "epoch": 1.4450763244537563, + "grad_norm": 0.05466059595346451, + "learning_rate": 1.7997746241875525e-05, + "loss": 10.3026, + "step": 1207 + }, + { + "epoch": 1.4462735707871894, + "grad_norm": 0.0636703372001648, + "learning_rate": 1.7925098685758345e-05, + "loss": 10.3066, + "step": 1208 + }, + { + "epoch": 1.4474708171206225, + "grad_norm": 0.08119691163301468, + "learning_rate": 1.7852566010789597e-05, + "loss": 10.2917, + "step": 1209 + }, + { + "epoch": 1.4486680634540556, + "grad_norm": 0.07576896250247955, + "learning_rate": 1.7780148476756147e-05, + "loss": 10.3074, + "step": 1210 + }, + { + "epoch": 1.4498653097874887, + "grad_norm": 0.08203244209289551, + "learning_rate": 1.770784634303243e-05, + "loss": 10.3091, + "step": 1211 + }, + { + "epoch": 1.4510625561209218, + "grad_norm": 0.09676560759544373, + "learning_rate": 1.763565986857955e-05, + "loss": 10.3187, + "step": 1212 + }, + { + "epoch": 1.452259802454355, + "grad_norm": 0.12036450952291489, + "learning_rate": 1.7563589311944378e-05, + "loss": 10.2861, + "step": 1213 + }, + { + "epoch": 1.453457048787788, + "grad_norm": 0.11823254078626633, + "learning_rate": 1.7491634931258587e-05, + "loss": 10.2736, + "step": 1214 + }, + { + "epoch": 1.4546542951212211, + "grad_norm": 0.11799007654190063, + "learning_rate": 1.7419796984237768e-05, + "loss": 10.2914, + "step": 1215 + }, + { + "epoch": 1.4558515414546542, + "grad_norm": 0.11495964974164963, + "learning_rate": 1.7348075728180478e-05, + "loss": 10.2788, + "step": 1216 + }, + { + "epoch": 1.4570487877880873, + "grad_norm": 0.08928609639406204, + "learning_rate": 1.7276471419967328e-05, + "loss": 10.2868, + "step": 1217 + }, + { + "epoch": 1.4582460341215204, + "grad_norm": 0.08012302219867706, + "learning_rate": 1.7204984316060063e-05, + "loss": 10.2869, + "step": 1218 + }, + { + "epoch": 1.4594432804549535, + "grad_norm": 0.06295078992843628, + "learning_rate": 1.7133614672500643e-05, + "loss": 10.2961, + "step": 1219 + }, + { + "epoch": 1.4606405267883866, + "grad_norm": 0.07612026482820511, + "learning_rate": 1.7062362744910322e-05, + "loss": 10.2862, + "step": 1220 + }, + { + "epoch": 1.46183777312182, + "grad_norm": 0.11788436025381088, + "learning_rate": 1.6991228788488728e-05, + "loss": 10.3229, + "step": 1221 + }, + { + "epoch": 1.463035019455253, + "grad_norm": 0.12167295068502426, + "learning_rate": 1.6920213058013022e-05, + "loss": 10.3075, + "step": 1222 + }, + { + "epoch": 1.4642322657886861, + "grad_norm": 0.08868716657161713, + "learning_rate": 1.6849315807836812e-05, + "loss": 10.275, + "step": 1223 + }, + { + "epoch": 1.4654295121221192, + "grad_norm": 0.08265525847673416, + "learning_rate": 1.6778537291889407e-05, + "loss": 10.2977, + "step": 1224 + }, + { + "epoch": 1.4666267584555523, + "grad_norm": 0.1327967792749405, + "learning_rate": 1.670787776367489e-05, + "loss": 10.2972, + "step": 1225 + }, + { + "epoch": 1.4678240047889854, + "grad_norm": 0.09218508005142212, + "learning_rate": 1.6637337476271124e-05, + "loss": 10.2651, + "step": 1226 + }, + { + "epoch": 1.4690212511224185, + "grad_norm": 0.08492898941040039, + "learning_rate": 1.6566916682328864e-05, + "loss": 10.2842, + "step": 1227 + }, + { + "epoch": 1.4702184974558516, + "grad_norm": 0.15046337246894836, + "learning_rate": 1.6496615634070956e-05, + "loss": 10.3161, + "step": 1228 + }, + { + "epoch": 1.4714157437892847, + "grad_norm": 0.11659656465053558, + "learning_rate": 1.6426434583291328e-05, + "loss": 10.3165, + "step": 1229 + }, + { + "epoch": 1.4726129901227178, + "grad_norm": 0.10272715240716934, + "learning_rate": 1.6356373781354058e-05, + "loss": 10.2317, + "step": 1230 + }, + { + "epoch": 1.473810236456151, + "grad_norm": 0.102105051279068, + "learning_rate": 1.6286433479192638e-05, + "loss": 10.3294, + "step": 1231 + }, + { + "epoch": 1.475007482789584, + "grad_norm": 0.12132613360881805, + "learning_rate": 1.6216613927308905e-05, + "loss": 10.398, + "step": 1232 + }, + { + "epoch": 1.4762047291230171, + "grad_norm": 0.14180143177509308, + "learning_rate": 1.6146915375772225e-05, + "loss": 10.2467, + "step": 1233 + }, + { + "epoch": 1.4774019754564502, + "grad_norm": 0.2615784704685211, + "learning_rate": 1.6077338074218596e-05, + "loss": 10.3414, + "step": 1234 + }, + { + "epoch": 1.4785992217898833, + "grad_norm": 0.3889216184616089, + "learning_rate": 1.6007882271849716e-05, + "loss": 10.0321, + "step": 1235 + }, + { + "epoch": 1.4797964681233164, + "grad_norm": 0.2168707400560379, + "learning_rate": 1.5938548217432136e-05, + "loss": 10.1586, + "step": 1236 + }, + { + "epoch": 1.4809937144567495, + "grad_norm": 0.16029125452041626, + "learning_rate": 1.586933615929634e-05, + "loss": 10.364, + "step": 1237 + }, + { + "epoch": 1.4821909607901826, + "grad_norm": 0.1461440920829773, + "learning_rate": 1.580024634533587e-05, + "loss": 10.2575, + "step": 1238 + }, + { + "epoch": 1.4833882071236157, + "grad_norm": 0.1233038678765297, + "learning_rate": 1.5731279023006413e-05, + "loss": 10.2951, + "step": 1239 + }, + { + "epoch": 1.4845854534570488, + "grad_norm": 0.12957803905010223, + "learning_rate": 1.566243443932496e-05, + "loss": 10.2987, + "step": 1240 + }, + { + "epoch": 1.485782699790482, + "grad_norm": 0.1535806804895401, + "learning_rate": 1.5593712840868867e-05, + "loss": 10.3343, + "step": 1241 + }, + { + "epoch": 1.486979946123915, + "grad_norm": 0.10079331696033478, + "learning_rate": 1.5525114473775014e-05, + "loss": 10.3278, + "step": 1242 + }, + { + "epoch": 1.4881771924573481, + "grad_norm": 0.11287102103233337, + "learning_rate": 1.5456639583738956e-05, + "loss": 10.3021, + "step": 1243 + }, + { + "epoch": 1.4893744387907812, + "grad_norm": 0.11160896718502045, + "learning_rate": 1.5388288416013896e-05, + "loss": 10.3003, + "step": 1244 + }, + { + "epoch": 1.4905716851242143, + "grad_norm": 0.08691979944705963, + "learning_rate": 1.5320061215409958e-05, + "loss": 10.3126, + "step": 1245 + }, + { + "epoch": 1.4917689314576474, + "grad_norm": 0.10075131058692932, + "learning_rate": 1.5251958226293306e-05, + "loss": 10.2872, + "step": 1246 + }, + { + "epoch": 1.4929661777910805, + "grad_norm": 0.13246287405490875, + "learning_rate": 1.518397969258516e-05, + "loss": 10.292, + "step": 1247 + }, + { + "epoch": 1.4941634241245136, + "grad_norm": 0.11041560769081116, + "learning_rate": 1.5116125857760966e-05, + "loss": 10.2948, + "step": 1248 + }, + { + "epoch": 1.4953606704579467, + "grad_norm": 0.11873572319746017, + "learning_rate": 1.504839696484962e-05, + "loss": 10.2937, + "step": 1249 + }, + { + "epoch": 1.4965579167913798, + "grad_norm": 0.1355929970741272, + "learning_rate": 1.4980793256432474e-05, + "loss": 10.3054, + "step": 1250 + }, + { + "epoch": 1.497755163124813, + "grad_norm": 0.08938276022672653, + "learning_rate": 1.4913314974642472e-05, + "loss": 10.3224, + "step": 1251 + }, + { + "epoch": 1.498952409458246, + "grad_norm": 0.05973433703184128, + "learning_rate": 1.4845962361163413e-05, + "loss": 10.2878, + "step": 1252 + }, + { + "epoch": 1.5001496557916791, + "grad_norm": 0.061717137694358826, + "learning_rate": 1.477873565722893e-05, + "loss": 10.2928, + "step": 1253 + }, + { + "epoch": 1.5013469021251122, + "grad_norm": 0.07511643320322037, + "learning_rate": 1.4711635103621719e-05, + "loss": 10.2964, + "step": 1254 + }, + { + "epoch": 1.5025441484585453, + "grad_norm": 0.06679512560367584, + "learning_rate": 1.4644660940672627e-05, + "loss": 10.2953, + "step": 1255 + }, + { + "epoch": 1.5037413947919784, + "grad_norm": 0.05847722291946411, + "learning_rate": 1.4577813408259838e-05, + "loss": 10.2842, + "step": 1256 + }, + { + "epoch": 1.5049386411254115, + "grad_norm": 0.054464198648929596, + "learning_rate": 1.451109274580798e-05, + "loss": 10.2877, + "step": 1257 + }, + { + "epoch": 1.5061358874588446, + "grad_norm": 0.06136462092399597, + "learning_rate": 1.4444499192287275e-05, + "loss": 10.2876, + "step": 1258 + }, + { + "epoch": 1.5073331337922777, + "grad_norm": 0.055170897394418716, + "learning_rate": 1.4378032986212686e-05, + "loss": 10.3017, + "step": 1259 + }, + { + "epoch": 1.5085303801257108, + "grad_norm": 0.07313201576471329, + "learning_rate": 1.4311694365643047e-05, + "loss": 10.286, + "step": 1260 + }, + { + "epoch": 1.509727626459144, + "grad_norm": 0.06189440190792084, + "learning_rate": 1.4245483568180285e-05, + "loss": 10.3202, + "step": 1261 + }, + { + "epoch": 1.510924872792577, + "grad_norm": 0.08800532668828964, + "learning_rate": 1.4179400830968415e-05, + "loss": 10.2998, + "step": 1262 + }, + { + "epoch": 1.5121221191260101, + "grad_norm": 0.13978801667690277, + "learning_rate": 1.4113446390692836e-05, + "loss": 10.2787, + "step": 1263 + }, + { + "epoch": 1.5133193654594432, + "grad_norm": 0.06997866183519363, + "learning_rate": 1.4047620483579477e-05, + "loss": 10.286, + "step": 1264 + }, + { + "epoch": 1.5145166117928763, + "grad_norm": 0.10451135784387589, + "learning_rate": 1.3981923345393815e-05, + "loss": 10.2967, + "step": 1265 + }, + { + "epoch": 1.5157138581263094, + "grad_norm": 0.0889141634106636, + "learning_rate": 1.3916355211440164e-05, + "loss": 10.296, + "step": 1266 + }, + { + "epoch": 1.5169111044597425, + "grad_norm": 0.0839218720793724, + "learning_rate": 1.3850916316560814e-05, + "loss": 10.3024, + "step": 1267 + }, + { + "epoch": 1.5181083507931756, + "grad_norm": 0.07090212404727936, + "learning_rate": 1.378560689513515e-05, + "loss": 10.285, + "step": 1268 + }, + { + "epoch": 1.5193055971266087, + "grad_norm": 0.06662372499704361, + "learning_rate": 1.3720427181078777e-05, + "loss": 10.3065, + "step": 1269 + }, + { + "epoch": 1.5205028434600418, + "grad_norm": 0.06592795997858047, + "learning_rate": 1.3655377407842812e-05, + "loss": 10.2783, + "step": 1270 + }, + { + "epoch": 1.521700089793475, + "grad_norm": 0.07569335401058197, + "learning_rate": 1.3590457808412931e-05, + "loss": 10.3054, + "step": 1271 + }, + { + "epoch": 1.522897336126908, + "grad_norm": 0.08120357245206833, + "learning_rate": 1.3525668615308562e-05, + "loss": 10.2715, + "step": 1272 + }, + { + "epoch": 1.5240945824603411, + "grad_norm": 0.09264546632766724, + "learning_rate": 1.3461010060582091e-05, + "loss": 10.3184, + "step": 1273 + }, + { + "epoch": 1.5252918287937742, + "grad_norm": 0.10651028901338577, + "learning_rate": 1.3396482375817975e-05, + "loss": 10.271, + "step": 1274 + }, + { + "epoch": 1.5264890751272073, + "grad_norm": 0.08496925234794617, + "learning_rate": 1.3332085792131966e-05, + "loss": 10.2848, + "step": 1275 + }, + { + "epoch": 1.5276863214606404, + "grad_norm": 0.10523577779531479, + "learning_rate": 1.3267820540170229e-05, + "loss": 10.269, + "step": 1276 + }, + { + "epoch": 1.5288835677940735, + "grad_norm": 0.10637875646352768, + "learning_rate": 1.3203686850108577e-05, + "loss": 10.2737, + "step": 1277 + }, + { + "epoch": 1.5300808141275066, + "grad_norm": 0.11013197153806686, + "learning_rate": 1.3139684951651588e-05, + "loss": 10.2786, + "step": 1278 + }, + { + "epoch": 1.5312780604609397, + "grad_norm": 0.1337360441684723, + "learning_rate": 1.3075815074031817e-05, + "loss": 10.3262, + "step": 1279 + }, + { + "epoch": 1.5324753067943728, + "grad_norm": 0.10860981792211533, + "learning_rate": 1.3012077446008968e-05, + "loss": 10.3178, + "step": 1280 + }, + { + "epoch": 1.533672553127806, + "grad_norm": 0.13930366933345795, + "learning_rate": 1.2948472295869057e-05, + "loss": 10.2123, + "step": 1281 + }, + { + "epoch": 1.534869799461239, + "grad_norm": 0.1220797449350357, + "learning_rate": 1.2884999851423673e-05, + "loss": 10.2672, + "step": 1282 + }, + { + "epoch": 1.5360670457946721, + "grad_norm": 0.13567830622196198, + "learning_rate": 1.2821660340009007e-05, + "loss": 10.3578, + "step": 1283 + }, + { + "epoch": 1.5372642921281052, + "grad_norm": 0.14362607896327972, + "learning_rate": 1.2758453988485164e-05, + "loss": 10.2299, + "step": 1284 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.33708757162094116, + "learning_rate": 1.2695381023235386e-05, + "loss": 10.565, + "step": 1285 + }, + { + "epoch": 1.5396587847949714, + "grad_norm": 0.178616002202034, + "learning_rate": 1.2632441670165056e-05, + "loss": 9.8553, + "step": 1286 + }, + { + "epoch": 1.5408560311284045, + "grad_norm": 0.1613122522830963, + "learning_rate": 1.2569636154701075e-05, + "loss": 10.3007, + "step": 1287 + }, + { + "epoch": 1.5420532774618376, + "grad_norm": 0.15765303373336792, + "learning_rate": 1.2506964701790985e-05, + "loss": 10.2848, + "step": 1288 + }, + { + "epoch": 1.5432505237952707, + "grad_norm": 0.11258494853973389, + "learning_rate": 1.2444427535902153e-05, + "loss": 10.2966, + "step": 1289 + }, + { + "epoch": 1.5444477701287038, + "grad_norm": 0.12303269654512405, + "learning_rate": 1.2382024881020937e-05, + "loss": 10.3166, + "step": 1290 + }, + { + "epoch": 1.545645016462137, + "grad_norm": 0.09735307842493057, + "learning_rate": 1.231975696065199e-05, + "loss": 10.3185, + "step": 1291 + }, + { + "epoch": 1.54684226279557, + "grad_norm": 0.12478283047676086, + "learning_rate": 1.2257623997817347e-05, + "loss": 10.2951, + "step": 1292 + }, + { + "epoch": 1.5480395091290033, + "grad_norm": 0.1140608936548233, + "learning_rate": 1.2195626215055694e-05, + "loss": 10.3203, + "step": 1293 + }, + { + "epoch": 1.5492367554624364, + "grad_norm": 0.08125541359186172, + "learning_rate": 1.213376383442153e-05, + "loss": 10.2953, + "step": 1294 + }, + { + "epoch": 1.5504340017958695, + "grad_norm": 0.09563332796096802, + "learning_rate": 1.2072037077484416e-05, + "loss": 10.3093, + "step": 1295 + }, + { + "epoch": 1.5516312481293026, + "grad_norm": 0.1059359610080719, + "learning_rate": 1.2010446165328126e-05, + "loss": 10.2925, + "step": 1296 + }, + { + "epoch": 1.5528284944627357, + "grad_norm": 0.16233988106250763, + "learning_rate": 1.1948991318549907e-05, + "loss": 10.295, + "step": 1297 + }, + { + "epoch": 1.5540257407961688, + "grad_norm": 0.1409648060798645, + "learning_rate": 1.188767275725966e-05, + "loss": 10.3016, + "step": 1298 + }, + { + "epoch": 1.555222987129602, + "grad_norm": 0.15842890739440918, + "learning_rate": 1.1826490701079162e-05, + "loss": 10.3058, + "step": 1299 + }, + { + "epoch": 1.556420233463035, + "grad_norm": 0.11585404723882675, + "learning_rate": 1.1765445369141276e-05, + "loss": 10.3004, + "step": 1300 + }, + { + "epoch": 1.5576174797964681, + "grad_norm": 0.11004593223333359, + "learning_rate": 1.1704536980089153e-05, + "loss": 10.2978, + "step": 1301 + }, + { + "epoch": 1.5588147261299012, + "grad_norm": 0.056950900703668594, + "learning_rate": 1.164376575207547e-05, + "loss": 10.281, + "step": 1302 + }, + { + "epoch": 1.5600119724633343, + "grad_norm": 0.08803658187389374, + "learning_rate": 1.1583131902761685e-05, + "loss": 10.3173, + "step": 1303 + }, + { + "epoch": 1.5612092187967674, + "grad_norm": 0.05935777723789215, + "learning_rate": 1.152263564931712e-05, + "loss": 10.2776, + "step": 1304 + }, + { + "epoch": 1.5624064651302005, + "grad_norm": 0.0633310079574585, + "learning_rate": 1.1462277208418338e-05, + "loss": 10.3058, + "step": 1305 + }, + { + "epoch": 1.5636037114636336, + "grad_norm": 0.05919630080461502, + "learning_rate": 1.140205679624834e-05, + "loss": 10.2723, + "step": 1306 + }, + { + "epoch": 1.5648009577970667, + "grad_norm": 0.05984526500105858, + "learning_rate": 1.1341974628495662e-05, + "loss": 10.2935, + "step": 1307 + }, + { + "epoch": 1.5659982041304998, + "grad_norm": 0.05397504195570946, + "learning_rate": 1.1282030920353747e-05, + "loss": 10.2869, + "step": 1308 + }, + { + "epoch": 1.567195450463933, + "grad_norm": 0.07828553766012192, + "learning_rate": 1.122222588652016e-05, + "loss": 10.2945, + "step": 1309 + }, + { + "epoch": 1.568392696797366, + "grad_norm": 0.07010767608880997, + "learning_rate": 1.1162559741195733e-05, + "loss": 10.2956, + "step": 1310 + }, + { + "epoch": 1.5695899431307991, + "grad_norm": 0.06288562715053558, + "learning_rate": 1.1103032698083831e-05, + "loss": 10.2941, + "step": 1311 + }, + { + "epoch": 1.5707871894642322, + "grad_norm": 0.05736493691802025, + "learning_rate": 1.1043644970389671e-05, + "loss": 10.2983, + "step": 1312 + }, + { + "epoch": 1.5719844357976653, + "grad_norm": 0.07730087637901306, + "learning_rate": 1.0984396770819438e-05, + "loss": 10.289, + "step": 1313 + }, + { + "epoch": 1.5731816821310984, + "grad_norm": 0.08970685303211212, + "learning_rate": 1.092528831157959e-05, + "loss": 10.2942, + "step": 1314 + }, + { + "epoch": 1.5743789284645315, + "grad_norm": 0.09654261916875839, + "learning_rate": 1.0866319804376085e-05, + "loss": 10.2939, + "step": 1315 + }, + { + "epoch": 1.5755761747979646, + "grad_norm": 0.10928742587566376, + "learning_rate": 1.0807491460413622e-05, + "loss": 10.2996, + "step": 1316 + }, + { + "epoch": 1.5767734211313977, + "grad_norm": 0.08177176862955093, + "learning_rate": 1.0748803490394876e-05, + "loss": 10.268, + "step": 1317 + }, + { + "epoch": 1.5779706674648308, + "grad_norm": 0.06169341132044792, + "learning_rate": 1.0690256104519764e-05, + "loss": 10.2901, + "step": 1318 + }, + { + "epoch": 1.5791679137982642, + "grad_norm": 0.06915286183357239, + "learning_rate": 1.063184951248467e-05, + "loss": 10.2866, + "step": 1319 + }, + { + "epoch": 1.5803651601316973, + "grad_norm": 0.08796220272779465, + "learning_rate": 1.0573583923481711e-05, + "loss": 10.2739, + "step": 1320 + }, + { + "epoch": 1.5815624064651304, + "grad_norm": 0.082234226167202, + "learning_rate": 1.0515459546197975e-05, + "loss": 10.3168, + "step": 1321 + }, + { + "epoch": 1.5827596527985635, + "grad_norm": 0.08065493404865265, + "learning_rate": 1.0457476588814774e-05, + "loss": 10.3038, + "step": 1322 + }, + { + "epoch": 1.5839568991319966, + "grad_norm": 0.10621247440576553, + "learning_rate": 1.0399635259006902e-05, + "loss": 10.2608, + "step": 1323 + }, + { + "epoch": 1.5851541454654297, + "grad_norm": 0.09793389588594437, + "learning_rate": 1.0341935763941935e-05, + "loss": 10.2916, + "step": 1324 + }, + { + "epoch": 1.5863513917988628, + "grad_norm": 0.09392872452735901, + "learning_rate": 1.0284378310279369e-05, + "loss": 10.286, + "step": 1325 + }, + { + "epoch": 1.5875486381322959, + "grad_norm": 0.1270560473203659, + "learning_rate": 1.0226963104170002e-05, + "loss": 10.2757, + "step": 1326 + }, + { + "epoch": 1.588745884465729, + "grad_norm": 0.08710789680480957, + "learning_rate": 1.0169690351255173e-05, + "loss": 10.2484, + "step": 1327 + }, + { + "epoch": 1.589943130799162, + "grad_norm": 0.10709777474403381, + "learning_rate": 1.011256025666597e-05, + "loss": 10.3321, + "step": 1328 + }, + { + "epoch": 1.5911403771325952, + "grad_norm": 0.12350647151470184, + "learning_rate": 1.0055573025022507e-05, + "loss": 10.3245, + "step": 1329 + }, + { + "epoch": 1.5923376234660283, + "grad_norm": 0.11619213968515396, + "learning_rate": 9.998728860433276e-06, + "loss": 10.2505, + "step": 1330 + }, + { + "epoch": 1.5935348697994614, + "grad_norm": 0.11807774752378464, + "learning_rate": 9.942027966494316e-06, + "loss": 10.2509, + "step": 1331 + }, + { + "epoch": 1.5947321161328945, + "grad_norm": 0.16891448199748993, + "learning_rate": 9.885470546288478e-06, + "loss": 10.3548, + "step": 1332 + }, + { + "epoch": 1.5959293624663276, + "grad_norm": 0.14741595089435577, + "learning_rate": 9.829056802384834e-06, + "loss": 10.1591, + "step": 1333 + }, + { + "epoch": 1.5971266087997606, + "grad_norm": 0.20285120606422424, + "learning_rate": 9.772786936837785e-06, + "loss": 10.3216, + "step": 1334 + }, + { + "epoch": 1.5983238551331937, + "grad_norm": 0.3578339219093323, + "learning_rate": 9.71666115118644e-06, + "loss": 10.3635, + "step": 1335 + }, + { + "epoch": 1.5995211014666268, + "grad_norm": 0.18940424919128418, + "learning_rate": 9.660679646453851e-06, + "loss": 10.1212, + "step": 1336 + }, + { + "epoch": 1.60071834780006, + "grad_norm": 0.13846224546432495, + "learning_rate": 9.60484262314631e-06, + "loss": 10.2402, + "step": 1337 + }, + { + "epoch": 1.601915594133493, + "grad_norm": 0.1558510810136795, + "learning_rate": 9.549150281252633e-06, + "loss": 10.2789, + "step": 1338 + }, + { + "epoch": 1.6031128404669261, + "grad_norm": 0.14840716123580933, + "learning_rate": 9.493602820243425e-06, + "loss": 10.2823, + "step": 1339 + }, + { + "epoch": 1.6043100868003592, + "grad_norm": 0.10457353293895721, + "learning_rate": 9.438200439070388e-06, + "loss": 10.3147, + "step": 1340 + }, + { + "epoch": 1.6055073331337923, + "grad_norm": 0.08213123679161072, + "learning_rate": 9.3829433361656e-06, + "loss": 10.3242, + "step": 1341 + }, + { + "epoch": 1.6067045794672254, + "grad_norm": 0.08259988576173782, + "learning_rate": 9.327831709440792e-06, + "loss": 10.3055, + "step": 1342 + }, + { + "epoch": 1.6079018258006585, + "grad_norm": 0.07817088812589645, + "learning_rate": 9.272865756286658e-06, + "loss": 10.2816, + "step": 1343 + }, + { + "epoch": 1.6090990721340916, + "grad_norm": 0.09419888257980347, + "learning_rate": 9.218045673572123e-06, + "loss": 10.3191, + "step": 1344 + }, + { + "epoch": 1.6102963184675247, + "grad_norm": 0.09685257822275162, + "learning_rate": 9.163371657643716e-06, + "loss": 10.3037, + "step": 1345 + }, + { + "epoch": 1.6114935648009578, + "grad_norm": 0.07926305383443832, + "learning_rate": 9.108843904324715e-06, + "loss": 10.2915, + "step": 1346 + }, + { + "epoch": 1.612690811134391, + "grad_norm": 0.11186622828245163, + "learning_rate": 9.054462608914576e-06, + "loss": 10.3031, + "step": 1347 + }, + { + "epoch": 1.613888057467824, + "grad_norm": 0.13665804266929626, + "learning_rate": 9.000227966188234e-06, + "loss": 10.3026, + "step": 1348 + }, + { + "epoch": 1.6150853038012571, + "grad_norm": 0.15781697630882263, + "learning_rate": 8.946140170395328e-06, + "loss": 10.3006, + "step": 1349 + }, + { + "epoch": 1.6162825501346902, + "grad_norm": 0.14723873138427734, + "learning_rate": 8.8921994152595e-06, + "loss": 10.3071, + "step": 1350 + }, + { + "epoch": 1.6174797964681233, + "grad_norm": 0.07926400005817413, + "learning_rate": 8.838405893977824e-06, + "loss": 10.2985, + "step": 1351 + }, + { + "epoch": 1.6186770428015564, + "grad_norm": 0.07052849978208542, + "learning_rate": 8.78475979922e-06, + "loss": 10.3054, + "step": 1352 + }, + { + "epoch": 1.6198742891349895, + "grad_norm": 0.06871381402015686, + "learning_rate": 8.731261323127659e-06, + "loss": 10.2906, + "step": 1353 + }, + { + "epoch": 1.6210715354684226, + "grad_norm": 0.05118335783481598, + "learning_rate": 8.677910657313782e-06, + "loss": 10.2728, + "step": 1354 + }, + { + "epoch": 1.6222687818018557, + "grad_norm": 0.05774825066328049, + "learning_rate": 8.624707992861897e-06, + "loss": 10.2851, + "step": 1355 + }, + { + "epoch": 1.6234660281352888, + "grad_norm": 0.052445411682128906, + "learning_rate": 8.571653520325463e-06, + "loss": 10.2888, + "step": 1356 + }, + { + "epoch": 1.624663274468722, + "grad_norm": 0.05949968844652176, + "learning_rate": 8.518747429727159e-06, + "loss": 10.2913, + "step": 1357 + }, + { + "epoch": 1.625860520802155, + "grad_norm": 0.11198560148477554, + "learning_rate": 8.465989910558209e-06, + "loss": 10.3134, + "step": 1358 + }, + { + "epoch": 1.6270577671355881, + "grad_norm": 0.06212862208485603, + "learning_rate": 8.41338115177771e-06, + "loss": 10.292, + "step": 1359 + }, + { + "epoch": 1.6282550134690212, + "grad_norm": 0.06781909614801407, + "learning_rate": 8.360921341811956e-06, + "loss": 10.2914, + "step": 1360 + }, + { + "epoch": 1.6294522598024543, + "grad_norm": 0.05178993567824364, + "learning_rate": 8.308610668553752e-06, + "loss": 10.3196, + "step": 1361 + }, + { + "epoch": 1.6306495061358874, + "grad_norm": 0.06877569854259491, + "learning_rate": 8.256449319361748e-06, + "loss": 10.2768, + "step": 1362 + }, + { + "epoch": 1.6318467524693205, + "grad_norm": 0.08168987929821014, + "learning_rate": 8.204437481059762e-06, + "loss": 10.2797, + "step": 1363 + }, + { + "epoch": 1.6330439988027536, + "grad_norm": 0.06182248890399933, + "learning_rate": 8.15257533993613e-06, + "loss": 10.2877, + "step": 1364 + }, + { + "epoch": 1.6342412451361867, + "grad_norm": 0.07554659992456436, + "learning_rate": 8.100863081742999e-06, + "loss": 10.2918, + "step": 1365 + }, + { + "epoch": 1.6354384914696198, + "grad_norm": 0.07100442051887512, + "learning_rate": 8.049300891695744e-06, + "loss": 10.3015, + "step": 1366 + }, + { + "epoch": 1.636635737803053, + "grad_norm": 0.07931441813707352, + "learning_rate": 7.99788895447217e-06, + "loss": 10.2622, + "step": 1367 + }, + { + "epoch": 1.637832984136486, + "grad_norm": 0.08694098144769669, + "learning_rate": 7.946627454211968e-06, + "loss": 10.2927, + "step": 1368 + }, + { + "epoch": 1.6390302304699191, + "grad_norm": 0.07427579909563065, + "learning_rate": 7.895516574516037e-06, + "loss": 10.2932, + "step": 1369 + }, + { + "epoch": 1.6402274768033522, + "grad_norm": 0.08578281849622726, + "learning_rate": 7.844556498445788e-06, + "loss": 10.2881, + "step": 1370 + }, + { + "epoch": 1.6414247231367853, + "grad_norm": 0.07499956339597702, + "learning_rate": 7.793747408522462e-06, + "loss": 10.3078, + "step": 1371 + }, + { + "epoch": 1.6426219694702184, + "grad_norm": 0.09381866455078125, + "learning_rate": 7.7430894867266e-06, + "loss": 10.3284, + "step": 1372 + }, + { + "epoch": 1.6438192158036515, + "grad_norm": 0.08043655008077621, + "learning_rate": 7.692582914497265e-06, + "loss": 10.2722, + "step": 1373 + }, + { + "epoch": 1.6450164621370846, + "grad_norm": 0.09763289242982864, + "learning_rate": 7.642227872731417e-06, + "loss": 10.2946, + "step": 1374 + }, + { + "epoch": 1.6462137084705177, + "grad_norm": 0.0772552415728569, + "learning_rate": 7.592024541783343e-06, + "loss": 10.2922, + "step": 1375 + }, + { + "epoch": 1.6474109548039508, + "grad_norm": 0.09323560446500778, + "learning_rate": 7.541973101463912e-06, + "loss": 10.2566, + "step": 1376 + }, + { + "epoch": 1.648608201137384, + "grad_norm": 0.0961635559797287, + "learning_rate": 7.492073731039995e-06, + "loss": 10.278, + "step": 1377 + }, + { + "epoch": 1.649805447470817, + "grad_norm": 0.11339109390974045, + "learning_rate": 7.4423266092337855e-06, + "loss": 10.253, + "step": 1378 + }, + { + "epoch": 1.6510026938042501, + "grad_norm": 0.15443108975887299, + "learning_rate": 7.392731914222189e-06, + "loss": 10.3142, + "step": 1379 + }, + { + "epoch": 1.6521999401376832, + "grad_norm": 0.11788905411958694, + "learning_rate": 7.343289823636168e-06, + "loss": 10.3119, + "step": 1380 + }, + { + "epoch": 1.6533971864711163, + "grad_norm": 0.11224735528230667, + "learning_rate": 7.294000514560101e-06, + "loss": 10.2716, + "step": 1381 + }, + { + "epoch": 1.6545944328045494, + "grad_norm": 0.11593727767467499, + "learning_rate": 7.244864163531162e-06, + "loss": 10.2952, + "step": 1382 + }, + { + "epoch": 1.6557916791379825, + "grad_norm": 0.13816185295581818, + "learning_rate": 7.195880946538674e-06, + "loss": 10.3389, + "step": 1383 + }, + { + "epoch": 1.6569889254714156, + "grad_norm": 0.16699369251728058, + "learning_rate": 7.147051039023528e-06, + "loss": 10.4825, + "step": 1384 + }, + { + "epoch": 1.6581861718048487, + "grad_norm": 0.34381812810897827, + "learning_rate": 7.098374615877451e-06, + "loss": 9.907, + "step": 1385 + }, + { + "epoch": 1.6593834181382818, + "grad_norm": 0.22115465998649597, + "learning_rate": 7.049851851442468e-06, + "loss": 10.2125, + "step": 1386 + }, + { + "epoch": 1.660580664471715, + "grad_norm": 0.1872490793466568, + "learning_rate": 7.001482919510288e-06, + "loss": 10.2982, + "step": 1387 + }, + { + "epoch": 1.661777910805148, + "grad_norm": 0.12597239017486572, + "learning_rate": 6.953267993321588e-06, + "loss": 10.2661, + "step": 1388 + }, + { + "epoch": 1.6629751571385811, + "grad_norm": 0.12170591205358505, + "learning_rate": 6.90520724556547e-06, + "loss": 10.297, + "step": 1389 + }, + { + "epoch": 1.6641724034720142, + "grad_norm": 0.12667423486709595, + "learning_rate": 6.857300848378856e-06, + "loss": 10.308, + "step": 1390 + }, + { + "epoch": 1.6653696498054473, + "grad_norm": 0.07838978618383408, + "learning_rate": 6.809548973345803e-06, + "loss": 10.3162, + "step": 1391 + }, + { + "epoch": 1.6665668961388804, + "grad_norm": 0.0677676647901535, + "learning_rate": 6.761951791496901e-06, + "loss": 10.3164, + "step": 1392 + }, + { + "epoch": 1.6677641424723135, + "grad_norm": 0.0773274376988411, + "learning_rate": 6.71450947330875e-06, + "loss": 10.3079, + "step": 1393 + }, + { + "epoch": 1.6689613888057466, + "grad_norm": 0.07863089442253113, + "learning_rate": 6.667222188703226e-06, + "loss": 10.3055, + "step": 1394 + }, + { + "epoch": 1.6701586351391797, + "grad_norm": 0.07816001772880554, + "learning_rate": 6.62009010704695e-06, + "loss": 10.3029, + "step": 1395 + }, + { + "epoch": 1.6713558814726128, + "grad_norm": 0.07088542729616165, + "learning_rate": 6.573113397150654e-06, + "loss": 10.2998, + "step": 1396 + }, + { + "epoch": 1.6725531278060461, + "grad_norm": 0.12833940982818604, + "learning_rate": 6.526292227268594e-06, + "loss": 10.3027, + "step": 1397 + }, + { + "epoch": 1.6737503741394792, + "grad_norm": 0.09928267449140549, + "learning_rate": 6.479626765097918e-06, + "loss": 10.3055, + "step": 1398 + }, + { + "epoch": 1.6749476204729123, + "grad_norm": 0.10712918639183044, + "learning_rate": 6.433117177778103e-06, + "loss": 10.3031, + "step": 1399 + }, + { + "epoch": 1.6761448668063454, + "grad_norm": 0.16568809747695923, + "learning_rate": 6.386763631890313e-06, + "loss": 10.3069, + "step": 1400 + }, + { + "epoch": 1.6773421131397785, + "grad_norm": 0.11915326863527298, + "learning_rate": 6.340566293456846e-06, + "loss": 10.3066, + "step": 1401 + }, + { + "epoch": 1.6785393594732116, + "grad_norm": 0.10023251920938492, + "learning_rate": 6.294525327940515e-06, + "loss": 10.274, + "step": 1402 + }, + { + "epoch": 1.6797366058066447, + "grad_norm": 0.05053332820534706, + "learning_rate": 6.248640900244046e-06, + "loss": 10.2918, + "step": 1403 + }, + { + "epoch": 1.6809338521400778, + "grad_norm": 0.05739394947886467, + "learning_rate": 6.202913174709507e-06, + "loss": 10.2784, + "step": 1404 + }, + { + "epoch": 1.682131098473511, + "grad_norm": 0.04986783117055893, + "learning_rate": 6.1573423151177534e-06, + "loss": 10.2765, + "step": 1405 + }, + { + "epoch": 1.683328344806944, + "grad_norm": 0.07087478786706924, + "learning_rate": 6.111928484687723e-06, + "loss": 10.2753, + "step": 1406 + }, + { + "epoch": 1.6845255911403771, + "grad_norm": 0.09234434366226196, + "learning_rate": 6.066671846075983e-06, + "loss": 10.3143, + "step": 1407 + }, + { + "epoch": 1.6857228374738102, + "grad_norm": 0.0674377977848053, + "learning_rate": 6.02157256137611e-06, + "loss": 10.2978, + "step": 1408 + }, + { + "epoch": 1.6869200838072433, + "grad_norm": 0.07978789508342743, + "learning_rate": 5.976630792118032e-06, + "loss": 10.2894, + "step": 1409 + }, + { + "epoch": 1.6881173301406764, + "grad_norm": 0.06527896970510483, + "learning_rate": 5.931846699267557e-06, + "loss": 10.2932, + "step": 1410 + }, + { + "epoch": 1.6893145764741095, + "grad_norm": 0.07311516255140305, + "learning_rate": 5.887220443225749e-06, + "loss": 10.2963, + "step": 1411 + }, + { + "epoch": 1.6905118228075426, + "grad_norm": 0.05921307951211929, + "learning_rate": 5.842752183828354e-06, + "loss": 10.299, + "step": 1412 + }, + { + "epoch": 1.6917090691409757, + "grad_norm": 0.06728245317935944, + "learning_rate": 5.79844208034519e-06, + "loss": 10.2729, + "step": 1413 + }, + { + "epoch": 1.6929063154744088, + "grad_norm": 0.05796895921230316, + "learning_rate": 5.7542902914796745e-06, + "loss": 10.2796, + "step": 1414 + }, + { + "epoch": 1.694103561807842, + "grad_norm": 0.06685442477464676, + "learning_rate": 5.710296975368162e-06, + "loss": 10.2686, + "step": 1415 + }, + { + "epoch": 1.695300808141275, + "grad_norm": 0.07838046550750732, + "learning_rate": 5.666462289579422e-06, + "loss": 10.2972, + "step": 1416 + }, + { + "epoch": 1.6964980544747081, + "grad_norm": 0.06768795102834702, + "learning_rate": 5.62278639111406e-06, + "loss": 10.2694, + "step": 1417 + }, + { + "epoch": 1.6976953008081412, + "grad_norm": 0.07134141027927399, + "learning_rate": 5.579269436403967e-06, + "loss": 10.3039, + "step": 1418 + }, + { + "epoch": 1.6988925471415743, + "grad_norm": 0.07692733407020569, + "learning_rate": 5.535911581311748e-06, + "loss": 10.2657, + "step": 1419 + }, + { + "epoch": 1.7000897934750074, + "grad_norm": 0.06634758412837982, + "learning_rate": 5.4927129811301715e-06, + "loss": 10.305, + "step": 1420 + }, + { + "epoch": 1.7012870398084405, + "grad_norm": 0.10109997540712357, + "learning_rate": 5.449673790581611e-06, + "loss": 10.2891, + "step": 1421 + }, + { + "epoch": 1.7024842861418736, + "grad_norm": 0.08702096343040466, + "learning_rate": 5.4067941638174806e-06, + "loss": 10.3005, + "step": 1422 + }, + { + "epoch": 1.703681532475307, + "grad_norm": 0.08837579935789108, + "learning_rate": 5.364074254417712e-06, + "loss": 10.2829, + "step": 1423 + }, + { + "epoch": 1.70487877880874, + "grad_norm": 0.07838171720504761, + "learning_rate": 5.3215142153901605e-06, + "loss": 10.2805, + "step": 1424 + }, + { + "epoch": 1.7060760251421732, + "grad_norm": 0.07888559252023697, + "learning_rate": 5.279114199170093e-06, + "loss": 10.2965, + "step": 1425 + }, + { + "epoch": 1.7072732714756063, + "grad_norm": 0.11617400497198105, + "learning_rate": 5.2368743576196536e-06, + "loss": 10.2558, + "step": 1426 + }, + { + "epoch": 1.7084705178090394, + "grad_norm": 0.10236542671918869, + "learning_rate": 5.194794842027251e-06, + "loss": 10.2726, + "step": 1427 + }, + { + "epoch": 1.7096677641424725, + "grad_norm": 0.1651146560907364, + "learning_rate": 5.152875803107083e-06, + "loss": 10.2571, + "step": 1428 + }, + { + "epoch": 1.7108650104759056, + "grad_norm": 0.16483420133590698, + "learning_rate": 5.111117390998599e-06, + "loss": 10.3314, + "step": 1429 + }, + { + "epoch": 1.7120622568093387, + "grad_norm": 0.13295166194438934, + "learning_rate": 5.0695197552659e-06, + "loss": 10.2998, + "step": 1430 + }, + { + "epoch": 1.7132595031427718, + "grad_norm": 0.12947431206703186, + "learning_rate": 5.028083044897247e-06, + "loss": 10.2338, + "step": 1431 + }, + { + "epoch": 1.7144567494762049, + "grad_norm": 0.1184263601899147, + "learning_rate": 4.986807408304567e-06, + "loss": 10.2819, + "step": 1432 + }, + { + "epoch": 1.715653995809638, + "grad_norm": 0.15510588884353638, + "learning_rate": 4.945692993322837e-06, + "loss": 10.3753, + "step": 1433 + }, + { + "epoch": 1.716851242143071, + "grad_norm": 0.16619780659675598, + "learning_rate": 4.9047399472095746e-06, + "loss": 10.2698, + "step": 1434 + }, + { + "epoch": 1.7180484884765042, + "grad_norm": 0.393914133310318, + "learning_rate": 4.863948416644382e-06, + "loss": 10.1124, + "step": 1435 + }, + { + "epoch": 1.7192457348099373, + "grad_norm": 0.22546739876270294, + "learning_rate": 4.82331854772834e-06, + "loss": 10.2136, + "step": 1436 + }, + { + "epoch": 1.7204429811433704, + "grad_norm": 0.15371963381767273, + "learning_rate": 4.782850485983515e-06, + "loss": 10.3011, + "step": 1437 + }, + { + "epoch": 1.7216402274768035, + "grad_norm": 0.11747770756483078, + "learning_rate": 4.742544376352443e-06, + "loss": 10.269, + "step": 1438 + }, + { + "epoch": 1.7228374738102366, + "grad_norm": 0.11215196549892426, + "learning_rate": 4.70240036319759e-06, + "loss": 10.2861, + "step": 1439 + }, + { + "epoch": 1.7240347201436697, + "grad_norm": 0.0815928727388382, + "learning_rate": 4.662418590300871e-06, + "loss": 10.3085, + "step": 1440 + }, + { + "epoch": 1.7252319664771028, + "grad_norm": 0.08260598033666611, + "learning_rate": 4.622599200863093e-06, + "loss": 10.3159, + "step": 1441 + }, + { + "epoch": 1.7264292128105359, + "grad_norm": 0.06445060670375824, + "learning_rate": 4.582942337503465e-06, + "loss": 10.3206, + "step": 1442 + }, + { + "epoch": 1.727626459143969, + "grad_norm": 0.07297521084547043, + "learning_rate": 4.543448142259099e-06, + "loss": 10.3108, + "step": 1443 + }, + { + "epoch": 1.728823705477402, + "grad_norm": 0.05839930474758148, + "learning_rate": 4.504116756584465e-06, + "loss": 10.2876, + "step": 1444 + }, + { + "epoch": 1.7300209518108352, + "grad_norm": 0.08180660754442215, + "learning_rate": 4.464948321350925e-06, + "loss": 10.3309, + "step": 1445 + }, + { + "epoch": 1.7312181981442682, + "grad_norm": 0.06208010017871857, + "learning_rate": 4.425942976846187e-06, + "loss": 10.302, + "step": 1446 + }, + { + "epoch": 1.7324154444777013, + "grad_norm": 0.06705661118030548, + "learning_rate": 4.387100862773869e-06, + "loss": 10.291, + "step": 1447 + }, + { + "epoch": 1.7336126908111344, + "grad_norm": 0.1199129968881607, + "learning_rate": 4.348422118252892e-06, + "loss": 10.2967, + "step": 1448 + }, + { + "epoch": 1.7348099371445675, + "grad_norm": 0.08125564455986023, + "learning_rate": 4.3099068818170775e-06, + "loss": 10.3101, + "step": 1449 + }, + { + "epoch": 1.7360071834780006, + "grad_norm": 0.08302604407072067, + "learning_rate": 4.271555291414636e-06, + "loss": 10.2918, + "step": 1450 + }, + { + "epoch": 1.7372044298114337, + "grad_norm": 0.07475101202726364, + "learning_rate": 4.23336748440763e-06, + "loss": 10.2989, + "step": 1451 + }, + { + "epoch": 1.7384016761448668, + "grad_norm": 0.06398740410804749, + "learning_rate": 4.195343597571488e-06, + "loss": 10.3063, + "step": 1452 + }, + { + "epoch": 1.7395989224783, + "grad_norm": 0.06320304423570633, + "learning_rate": 4.157483767094584e-06, + "loss": 10.3031, + "step": 1453 + }, + { + "epoch": 1.740796168811733, + "grad_norm": 0.05624968186020851, + "learning_rate": 4.119788128577667e-06, + "loss": 10.2814, + "step": 1454 + }, + { + "epoch": 1.7419934151451661, + "grad_norm": 0.06657078862190247, + "learning_rate": 4.082256817033392e-06, + "loss": 10.2833, + "step": 1455 + }, + { + "epoch": 1.7431906614785992, + "grad_norm": 0.06242256984114647, + "learning_rate": 4.044889966885895e-06, + "loss": 10.2655, + "step": 1456 + }, + { + "epoch": 1.7443879078120323, + "grad_norm": 0.05705961957573891, + "learning_rate": 4.007687711970243e-06, + "loss": 10.2938, + "step": 1457 + }, + { + "epoch": 1.7455851541454654, + "grad_norm": 0.059913270175457, + "learning_rate": 3.9706501855319765e-06, + "loss": 10.296, + "step": 1458 + }, + { + "epoch": 1.7467824004788985, + "grad_norm": 0.06840752065181732, + "learning_rate": 3.93377752022665e-06, + "loss": 10.2923, + "step": 1459 + }, + { + "epoch": 1.7479796468123316, + "grad_norm": 0.12457261234521866, + "learning_rate": 3.897069848119323e-06, + "loss": 10.2813, + "step": 1460 + }, + { + "epoch": 1.7491768931457647, + "grad_norm": 0.06760392338037491, + "learning_rate": 3.86052730068413e-06, + "loss": 10.3161, + "step": 1461 + }, + { + "epoch": 1.7503741394791978, + "grad_norm": 0.05782267823815346, + "learning_rate": 3.824150008803767e-06, + "loss": 10.275, + "step": 1462 + }, + { + "epoch": 1.751571385812631, + "grad_norm": 0.0647527202963829, + "learning_rate": 3.7879381027690565e-06, + "loss": 10.2733, + "step": 1463 + }, + { + "epoch": 1.752768632146064, + "grad_norm": 0.06330129504203796, + "learning_rate": 3.7518917122784604e-06, + "loss": 10.3069, + "step": 1464 + }, + { + "epoch": 1.7539658784794971, + "grad_norm": 0.07361055165529251, + "learning_rate": 3.716010966437611e-06, + "loss": 10.2784, + "step": 1465 + }, + { + "epoch": 1.7551631248129302, + "grad_norm": 0.0692029520869255, + "learning_rate": 3.680295993758881e-06, + "loss": 10.2828, + "step": 1466 + }, + { + "epoch": 1.7563603711463633, + "grad_norm": 0.07616943120956421, + "learning_rate": 3.6447469221608856e-06, + "loss": 10.2803, + "step": 1467 + }, + { + "epoch": 1.7575576174797964, + "grad_norm": 0.0730268806219101, + "learning_rate": 3.609363878968036e-06, + "loss": 10.2823, + "step": 1468 + }, + { + "epoch": 1.7587548638132295, + "grad_norm": 0.07882644236087799, + "learning_rate": 3.5741469909101045e-06, + "loss": 10.2953, + "step": 1469 + }, + { + "epoch": 1.7599521101466626, + "grad_norm": 0.08347548544406891, + "learning_rate": 3.539096384121743e-06, + "loss": 10.2741, + "step": 1470 + }, + { + "epoch": 1.7611493564800957, + "grad_norm": 0.08163131028413773, + "learning_rate": 3.5042121841420305e-06, + "loss": 10.2958, + "step": 1471 + }, + { + "epoch": 1.7623466028135288, + "grad_norm": 0.10762349516153336, + "learning_rate": 3.469494515914079e-06, + "loss": 10.31, + "step": 1472 + }, + { + "epoch": 1.763543849146962, + "grad_norm": 0.07833200693130493, + "learning_rate": 3.4349435037844714e-06, + "loss": 10.262, + "step": 1473 + }, + { + "epoch": 1.764741095480395, + "grad_norm": 0.10789546370506287, + "learning_rate": 3.40055927150294e-06, + "loss": 10.3063, + "step": 1474 + }, + { + "epoch": 1.7659383418138281, + "grad_norm": 0.11339820176362991, + "learning_rate": 3.3663419422218677e-06, + "loss": 10.2715, + "step": 1475 + }, + { + "epoch": 1.7671355881472612, + "grad_norm": 0.11684668064117432, + "learning_rate": 3.332291638495816e-06, + "loss": 10.2736, + "step": 1476 + }, + { + "epoch": 1.7683328344806943, + "grad_norm": 0.11093264818191528, + "learning_rate": 3.2984084822811346e-06, + "loss": 10.2988, + "step": 1477 + }, + { + "epoch": 1.7695300808141274, + "grad_norm": 0.09710708260536194, + "learning_rate": 3.2646925949355312e-06, + "loss": 10.2488, + "step": 1478 + }, + { + "epoch": 1.7707273271475605, + "grad_norm": 0.13823895156383514, + "learning_rate": 3.2311440972175977e-06, + "loss": 10.3078, + "step": 1479 + }, + { + "epoch": 1.7719245734809936, + "grad_norm": 0.13767948746681213, + "learning_rate": 3.1977631092863615e-06, + "loss": 10.3625, + "step": 1480 + }, + { + "epoch": 1.7731218198144267, + "grad_norm": 0.11143260449171066, + "learning_rate": 3.1645497507009513e-06, + "loss": 10.22, + "step": 1481 + }, + { + "epoch": 1.7743190661478598, + "grad_norm": 0.12027215957641602, + "learning_rate": 3.1315041404200663e-06, + "loss": 10.2454, + "step": 1482 + }, + { + "epoch": 1.775516312481293, + "grad_norm": 0.14723733067512512, + "learning_rate": 3.0986263968015904e-06, + "loss": 10.273, + "step": 1483 + }, + { + "epoch": 1.776713558814726, + "grad_norm": 0.2102520763874054, + "learning_rate": 3.065916637602173e-06, + "loss": 10.5949, + "step": 1484 + }, + { + "epoch": 1.7779108051481591, + "grad_norm": 0.44642749428749084, + "learning_rate": 3.0333749799768107e-06, + "loss": 9.8777, + "step": 1485 + }, + { + "epoch": 1.7791080514815922, + "grad_norm": 0.21991458535194397, + "learning_rate": 3.00100154047841e-06, + "loss": 10.2209, + "step": 1486 + }, + { + "epoch": 1.7803052978150253, + "grad_norm": 0.14081746339797974, + "learning_rate": 2.9687964350573748e-06, + "loss": 10.307, + "step": 1487 + }, + { + "epoch": 1.7815025441484584, + "grad_norm": 0.14267589151859283, + "learning_rate": 2.936759779061199e-06, + "loss": 10.2662, + "step": 1488 + }, + { + "epoch": 1.7826997904818915, + "grad_norm": 0.14273738861083984, + "learning_rate": 2.904891687234057e-06, + "loss": 10.271, + "step": 1489 + }, + { + "epoch": 1.7838970368153246, + "grad_norm": 0.10804067552089691, + "learning_rate": 2.8731922737163685e-06, + "loss": 10.3092, + "step": 1490 + }, + { + "epoch": 1.7850942831487577, + "grad_norm": 0.09078163653612137, + "learning_rate": 2.8416616520444193e-06, + "loss": 10.3066, + "step": 1491 + }, + { + "epoch": 1.7862915294821908, + "grad_norm": 0.08167129755020142, + "learning_rate": 2.810299935149935e-06, + "loss": 10.327, + "step": 1492 + }, + { + "epoch": 1.787488775815624, + "grad_norm": 0.06562914699316025, + "learning_rate": 2.779107235359696e-06, + "loss": 10.2849, + "step": 1493 + }, + { + "epoch": 1.788686022149057, + "grad_norm": 0.08646023273468018, + "learning_rate": 2.7480836643950956e-06, + "loss": 10.3012, + "step": 1494 + }, + { + "epoch": 1.7898832684824901, + "grad_norm": 0.07022864371538162, + "learning_rate": 2.7172293333717848e-06, + "loss": 10.3138, + "step": 1495 + }, + { + "epoch": 1.7910805148159232, + "grad_norm": 0.07976744323968887, + "learning_rate": 2.6865443527992696e-06, + "loss": 10.299, + "step": 1496 + }, + { + "epoch": 1.7922777611493563, + "grad_norm": 0.05912242829799652, + "learning_rate": 2.656028832580476e-06, + "loss": 10.284, + "step": 1497 + }, + { + "epoch": 1.7934750074827894, + "grad_norm": 0.08630083501338959, + "learning_rate": 2.6256828820113766e-06, + "loss": 10.2923, + "step": 1498 + }, + { + "epoch": 1.7946722538162225, + "grad_norm": 0.06238410994410515, + "learning_rate": 2.59550660978064e-06, + "loss": 10.3014, + "step": 1499 + }, + { + "epoch": 1.7958695001496556, + "grad_norm": 0.12038327008485794, + "learning_rate": 2.5655001239691835e-06, + "loss": 10.3073, + "step": 1500 + }, + { + "epoch": 1.797066746483089, + "grad_norm": 0.0836963802576065, + "learning_rate": 2.5356635320497924e-06, + "loss": 10.2995, + "step": 1501 + }, + { + "epoch": 1.798263992816522, + "grad_norm": 0.06807655096054077, + "learning_rate": 2.5059969408867843e-06, + "loss": 10.2949, + "step": 1502 + }, + { + "epoch": 1.7994612391499551, + "grad_norm": 0.05216376855969429, + "learning_rate": 2.476500456735581e-06, + "loss": 10.2983, + "step": 1503 + }, + { + "epoch": 1.8006584854833882, + "grad_norm": 0.06283175945281982, + "learning_rate": 2.4471741852423237e-06, + "loss": 10.2703, + "step": 1504 + }, + { + "epoch": 1.8018557318168213, + "grad_norm": 0.0593952052295208, + "learning_rate": 2.4180182314435307e-06, + "loss": 10.2957, + "step": 1505 + }, + { + "epoch": 1.8030529781502544, + "grad_norm": 0.07193513959646225, + "learning_rate": 2.3890326997656975e-06, + "loss": 10.2896, + "step": 1506 + }, + { + "epoch": 1.8042502244836875, + "grad_norm": 0.06681742519140244, + "learning_rate": 2.3602176940249188e-06, + "loss": 10.2959, + "step": 1507 + }, + { + "epoch": 1.8054474708171206, + "grad_norm": 0.08400710672140121, + "learning_rate": 2.331573317426533e-06, + "loss": 10.29, + "step": 1508 + }, + { + "epoch": 1.8066447171505537, + "grad_norm": 0.05899708345532417, + "learning_rate": 2.3030996725647403e-06, + "loss": 10.2973, + "step": 1509 + }, + { + "epoch": 1.8078419634839868, + "grad_norm": 0.10067924112081528, + "learning_rate": 2.274796861422246e-06, + "loss": 10.3114, + "step": 1510 + }, + { + "epoch": 1.80903920981742, + "grad_norm": 0.05429118499159813, + "learning_rate": 2.246664985369873e-06, + "loss": 10.2833, + "step": 1511 + }, + { + "epoch": 1.810236456150853, + "grad_norm": 0.0513097383081913, + "learning_rate": 2.2187041451662282e-06, + "loss": 10.2876, + "step": 1512 + }, + { + "epoch": 1.8114337024842861, + "grad_norm": 0.07512176036834717, + "learning_rate": 2.1909144409573144e-06, + "loss": 10.2769, + "step": 1513 + }, + { + "epoch": 1.8126309488177192, + "grad_norm": 0.06695140898227692, + "learning_rate": 2.163295972276219e-06, + "loss": 10.2785, + "step": 1514 + }, + { + "epoch": 1.8138281951511523, + "grad_norm": 0.07782050222158432, + "learning_rate": 2.1358488380426755e-06, + "loss": 10.299, + "step": 1515 + }, + { + "epoch": 1.8150254414845854, + "grad_norm": 0.06183487921953201, + "learning_rate": 2.1085731365627746e-06, + "loss": 10.2981, + "step": 1516 + }, + { + "epoch": 1.8162226878180185, + "grad_norm": 0.07254426181316376, + "learning_rate": 2.0814689655286158e-06, + "loss": 10.2812, + "step": 1517 + }, + { + "epoch": 1.8174199341514516, + "grad_norm": 0.08596598356962204, + "learning_rate": 2.054536422017922e-06, + "loss": 10.2987, + "step": 1518 + }, + { + "epoch": 1.8186171804848847, + "grad_norm": 0.0635576844215393, + "learning_rate": 2.027775602493681e-06, + "loss": 10.2812, + "step": 1519 + }, + { + "epoch": 1.8198144268183178, + "grad_norm": 0.07647490501403809, + "learning_rate": 2.0011866028038617e-06, + "loss": 10.2903, + "step": 1520 + }, + { + "epoch": 1.821011673151751, + "grad_norm": 0.08304933458566666, + "learning_rate": 1.9747695181810244e-06, + "loss": 10.2926, + "step": 1521 + }, + { + "epoch": 1.822208919485184, + "grad_norm": 0.10030169039964676, + "learning_rate": 1.9485244432419667e-06, + "loss": 10.2673, + "step": 1522 + }, + { + "epoch": 1.8234061658186171, + "grad_norm": 0.08289503306150436, + "learning_rate": 1.9224514719874465e-06, + "loss": 10.2828, + "step": 1523 + }, + { + "epoch": 1.8246034121520502, + "grad_norm": 0.09447790682315826, + "learning_rate": 1.896550697801769e-06, + "loss": 10.2859, + "step": 1524 + }, + { + "epoch": 1.8258006584854833, + "grad_norm": 0.10931795835494995, + "learning_rate": 1.8708222134525167e-06, + "loss": 10.2531, + "step": 1525 + }, + { + "epoch": 1.8269979048189164, + "grad_norm": 0.09017056971788406, + "learning_rate": 1.8452661110901715e-06, + "loss": 10.2792, + "step": 1526 + }, + { + "epoch": 1.8281951511523498, + "grad_norm": 0.11218827962875366, + "learning_rate": 1.819882482247809e-06, + "loss": 10.2665, + "step": 1527 + }, + { + "epoch": 1.8293923974857829, + "grad_norm": 0.1274181604385376, + "learning_rate": 1.7946714178407652e-06, + "loss": 10.3115, + "step": 1528 + }, + { + "epoch": 1.830589643819216, + "grad_norm": 0.11250434815883636, + "learning_rate": 1.769633008166316e-06, + "loss": 10.3201, + "step": 1529 + }, + { + "epoch": 1.831786890152649, + "grad_norm": 0.10763664543628693, + "learning_rate": 1.7447673429033362e-06, + "loss": 10.1831, + "step": 1530 + }, + { + "epoch": 1.8329841364860822, + "grad_norm": 0.11502611637115479, + "learning_rate": 1.720074511112002e-06, + "loss": 10.3362, + "step": 1531 + }, + { + "epoch": 1.8341813828195153, + "grad_norm": 0.12431536614894867, + "learning_rate": 1.695554601233451e-06, + "loss": 10.317, + "step": 1532 + }, + { + "epoch": 1.8353786291529484, + "grad_norm": 0.14267539978027344, + "learning_rate": 1.6712077010894777e-06, + "loss": 10.2431, + "step": 1533 + }, + { + "epoch": 1.8365758754863815, + "grad_norm": 0.2869594395160675, + "learning_rate": 1.6470338978822108e-06, + "loss": 10.5889, + "step": 1534 + }, + { + "epoch": 1.8377731218198146, + "grad_norm": 0.3596677780151367, + "learning_rate": 1.623033278193825e-06, + "loss": 9.717, + "step": 1535 + }, + { + "epoch": 1.8389703681532477, + "grad_norm": 0.21456004679203033, + "learning_rate": 1.5992059279861914e-06, + "loss": 10.3218, + "step": 1536 + }, + { + "epoch": 1.8401676144866808, + "grad_norm": 0.1550094485282898, + "learning_rate": 1.5755519326005941e-06, + "loss": 10.2274, + "step": 1537 + }, + { + "epoch": 1.8413648608201139, + "grad_norm": 0.15703994035720825, + "learning_rate": 1.5520713767574246e-06, + "loss": 10.3017, + "step": 1538 + }, + { + "epoch": 1.842562107153547, + "grad_norm": 0.13967515528202057, + "learning_rate": 1.528764344555883e-06, + "loss": 10.3057, + "step": 1539 + }, + { + "epoch": 1.84375935348698, + "grad_norm": 0.10350265353918076, + "learning_rate": 1.5056309194736384e-06, + "loss": 10.294, + "step": 1540 + }, + { + "epoch": 1.8449565998204132, + "grad_norm": 0.07884383201599121, + "learning_rate": 1.4826711843665964e-06, + "loss": 10.2913, + "step": 1541 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.06884119659662247, + "learning_rate": 1.4598852214685488e-06, + "loss": 10.3168, + "step": 1542 + }, + { + "epoch": 1.8473510924872794, + "grad_norm": 0.06975561380386353, + "learning_rate": 1.4372731123908744e-06, + "loss": 10.3113, + "step": 1543 + }, + { + "epoch": 1.8485483388207125, + "grad_norm": 0.09780983626842499, + "learning_rate": 1.414834938122306e-06, + "loss": 10.3092, + "step": 1544 + }, + { + "epoch": 1.8497455851541456, + "grad_norm": 0.06860961019992828, + "learning_rate": 1.3925707790285846e-06, + "loss": 10.3007, + "step": 1545 + }, + { + "epoch": 1.8509428314875787, + "grad_norm": 0.05669059604406357, + "learning_rate": 1.3704807148521903e-06, + "loss": 10.3003, + "step": 1546 + }, + { + "epoch": 1.8521400778210118, + "grad_norm": 0.07079499959945679, + "learning_rate": 1.3485648247120453e-06, + "loss": 10.2975, + "step": 1547 + }, + { + "epoch": 1.8533373241544449, + "grad_norm": 0.10130270570516586, + "learning_rate": 1.3268231871032655e-06, + "loss": 10.3048, + "step": 1548 + }, + { + "epoch": 1.854534570487878, + "grad_norm": 0.11469711363315582, + "learning_rate": 1.3052558798968272e-06, + "loss": 10.3028, + "step": 1549 + }, + { + "epoch": 1.855731816821311, + "grad_norm": 0.09146546572446823, + "learning_rate": 1.2838629803393342e-06, + "loss": 10.309, + "step": 1550 + }, + { + "epoch": 1.8569290631547442, + "grad_norm": 0.07775437831878662, + "learning_rate": 1.2626445650527174e-06, + "loss": 10.2781, + "step": 1551 + }, + { + "epoch": 1.8581263094881773, + "grad_norm": 0.06896555423736572, + "learning_rate": 1.2416007100339577e-06, + "loss": 10.301, + "step": 1552 + }, + { + "epoch": 1.8593235558216104, + "grad_norm": 0.04400847107172012, + "learning_rate": 1.2207314906548527e-06, + "loss": 10.2837, + "step": 1553 + }, + { + "epoch": 1.8605208021550435, + "grad_norm": 0.047213103622198105, + "learning_rate": 1.2000369816616674e-06, + "loss": 10.2687, + "step": 1554 + }, + { + "epoch": 1.8617180484884766, + "grad_norm": 0.053066615015268326, + "learning_rate": 1.1795172571749501e-06, + "loss": 10.2882, + "step": 1555 + }, + { + "epoch": 1.8629152948219097, + "grad_norm": 0.07588191330432892, + "learning_rate": 1.1591723906892337e-06, + "loss": 10.2932, + "step": 1556 + }, + { + "epoch": 1.8641125411553428, + "grad_norm": 0.05443607643246651, + "learning_rate": 1.139002455072735e-06, + "loss": 10.281, + "step": 1557 + }, + { + "epoch": 1.8653097874887759, + "grad_norm": 0.06149740144610405, + "learning_rate": 1.119007522567167e-06, + "loss": 10.2928, + "step": 1558 + }, + { + "epoch": 1.866507033822209, + "grad_norm": 0.08718440681695938, + "learning_rate": 1.0991876647874322e-06, + "loss": 10.2972, + "step": 1559 + }, + { + "epoch": 1.867704280155642, + "grad_norm": 0.0899738296866417, + "learning_rate": 1.0795429527213686e-06, + "loss": 10.2923, + "step": 1560 + }, + { + "epoch": 1.8689015264890751, + "grad_norm": 0.07872308045625687, + "learning_rate": 1.060073456729499e-06, + "loss": 10.2984, + "step": 1561 + }, + { + "epoch": 1.8700987728225082, + "grad_norm": 0.0658787339925766, + "learning_rate": 1.0407792465447986e-06, + "loss": 10.2828, + "step": 1562 + }, + { + "epoch": 1.8712960191559413, + "grad_norm": 0.06016536429524422, + "learning_rate": 1.021660391272422e-06, + "loss": 10.2791, + "step": 1563 + }, + { + "epoch": 1.8724932654893744, + "grad_norm": 0.06697628647089005, + "learning_rate": 1.002716959389466e-06, + "loss": 10.2666, + "step": 1564 + }, + { + "epoch": 1.8736905118228075, + "grad_norm": 0.08779527246952057, + "learning_rate": 9.839490187447177e-07, + "loss": 10.2853, + "step": 1565 + }, + { + "epoch": 1.8748877581562406, + "grad_norm": 0.07378949970006943, + "learning_rate": 9.653566365584176e-07, + "loss": 10.3079, + "step": 1566 + }, + { + "epoch": 1.8760850044896737, + "grad_norm": 0.0767425075173378, + "learning_rate": 9.469398794220152e-07, + "loss": 10.2932, + "step": 1567 + }, + { + "epoch": 1.8772822508231068, + "grad_norm": 0.07138129323720932, + "learning_rate": 9.28698813297929e-07, + "loss": 10.2826, + "step": 1568 + }, + { + "epoch": 1.87847949715654, + "grad_norm": 0.08919541537761688, + "learning_rate": 9.106335035193315e-07, + "loss": 10.2905, + "step": 1569 + }, + { + "epoch": 1.879676743489973, + "grad_norm": 0.08831840008497238, + "learning_rate": 8.927440147898702e-07, + "loss": 10.3138, + "step": 1570 + }, + { + "epoch": 1.8808739898234061, + "grad_norm": 0.09380318224430084, + "learning_rate": 8.750304111834807e-07, + "loss": 10.2687, + "step": 1571 + }, + { + "epoch": 1.8820712361568392, + "grad_norm": 0.10267306119203568, + "learning_rate": 8.574927561441349e-07, + "loss": 10.301, + "step": 1572 + }, + { + "epoch": 1.8832684824902723, + "grad_norm": 0.08051333576440811, + "learning_rate": 8.401311124856148e-07, + "loss": 10.2866, + "step": 1573 + }, + { + "epoch": 1.8844657288237054, + "grad_norm": 0.10789351910352707, + "learning_rate": 8.229455423913013e-07, + "loss": 10.2795, + "step": 1574 + }, + { + "epoch": 1.8856629751571385, + "grad_norm": 0.08615833520889282, + "learning_rate": 8.059361074139293e-07, + "loss": 10.2811, + "step": 1575 + }, + { + "epoch": 1.8868602214905716, + "grad_norm": 0.10325480997562408, + "learning_rate": 7.891028684753777e-07, + "loss": 10.2734, + "step": 1576 + }, + { + "epoch": 1.8880574678240047, + "grad_norm": 0.15683861076831818, + "learning_rate": 7.724458858664684e-07, + "loss": 10.2595, + "step": 1577 + }, + { + "epoch": 1.8892547141574378, + "grad_norm": 0.08866716921329498, + "learning_rate": 7.559652192467126e-07, + "loss": 10.3414, + "step": 1578 + }, + { + "epoch": 1.890451960490871, + "grad_norm": 0.11219555139541626, + "learning_rate": 7.396609276441313e-07, + "loss": 10.2498, + "step": 1579 + }, + { + "epoch": 1.891649206824304, + "grad_norm": 0.11629130691289902, + "learning_rate": 7.235330694550402e-07, + "loss": 10.2425, + "step": 1580 + }, + { + "epoch": 1.8928464531577371, + "grad_norm": 0.10970951616764069, + "learning_rate": 7.075817024438214e-07, + "loss": 10.3062, + "step": 1581 + }, + { + "epoch": 1.8940436994911702, + "grad_norm": 0.12196099013090134, + "learning_rate": 6.918068837427128e-07, + "loss": 10.2971, + "step": 1582 + }, + { + "epoch": 1.8952409458246033, + "grad_norm": 0.12181861698627472, + "learning_rate": 6.762086698516412e-07, + "loss": 10.2641, + "step": 1583 + }, + { + "epoch": 1.8964381921580364, + "grad_norm": 0.20476678013801575, + "learning_rate": 6.607871166379897e-07, + "loss": 10.2643, + "step": 1584 + }, + { + "epoch": 1.8976354384914695, + "grad_norm": 0.4002314507961273, + "learning_rate": 6.45542279336403e-07, + "loss": 10.2191, + "step": 1585 + }, + { + "epoch": 1.8988326848249026, + "grad_norm": 0.2193012237548828, + "learning_rate": 6.304742125485874e-07, + "loss": 10.2041, + "step": 1586 + }, + { + "epoch": 1.9000299311583357, + "grad_norm": 0.17310982942581177, + "learning_rate": 6.15582970243117e-07, + "loss": 10.2767, + "step": 1587 + }, + { + "epoch": 1.9012271774917688, + "grad_norm": 0.15236912667751312, + "learning_rate": 6.008686057552448e-07, + "loss": 10.2831, + "step": 1588 + }, + { + "epoch": 1.902424423825202, + "grad_norm": 0.1317615658044815, + "learning_rate": 5.863311717867192e-07, + "loss": 10.3062, + "step": 1589 + }, + { + "epoch": 1.903621670158635, + "grad_norm": 0.11767958849668503, + "learning_rate": 5.719707204055735e-07, + "loss": 10.3085, + "step": 1590 + }, + { + "epoch": 1.9048189164920681, + "grad_norm": 0.06816575676202774, + "learning_rate": 5.577873030459479e-07, + "loss": 10.2807, + "step": 1591 + }, + { + "epoch": 1.9060161628255012, + "grad_norm": 0.07067005336284637, + "learning_rate": 5.437809705079233e-07, + "loss": 10.3063, + "step": 1592 + }, + { + "epoch": 1.9072134091589343, + "grad_norm": 0.05901264399290085, + "learning_rate": 5.299517729573044e-07, + "loss": 10.3121, + "step": 1593 + }, + { + "epoch": 1.9084106554923674, + "grad_norm": 0.07548792660236359, + "learning_rate": 5.162997599254704e-07, + "loss": 10.3048, + "step": 1594 + }, + { + "epoch": 1.9096079018258005, + "grad_norm": 0.0813656821846962, + "learning_rate": 5.028249803091966e-07, + "loss": 10.2938, + "step": 1595 + }, + { + "epoch": 1.9108051481592336, + "grad_norm": 0.06446728110313416, + "learning_rate": 4.895274823704555e-07, + "loss": 10.2879, + "step": 1596 + }, + { + "epoch": 1.9120023944926667, + "grad_norm": 0.0759933814406395, + "learning_rate": 4.764073137362546e-07, + "loss": 10.3034, + "step": 1597 + }, + { + "epoch": 1.9131996408260998, + "grad_norm": 0.06734771281480789, + "learning_rate": 4.634645213984934e-07, + "loss": 10.3086, + "step": 1598 + }, + { + "epoch": 1.914396887159533, + "grad_norm": 0.07294712960720062, + "learning_rate": 4.50699151713746e-07, + "loss": 10.303, + "step": 1599 + }, + { + "epoch": 1.915594133492966, + "grad_norm": 0.07060587406158447, + "learning_rate": 4.381112504031337e-07, + "loss": 10.306, + "step": 1600 + }, + { + "epoch": 1.9167913798263991, + "grad_norm": 0.0678725317120552, + "learning_rate": 4.2570086255213637e-07, + "loss": 10.2929, + "step": 1601 + }, + { + "epoch": 1.9179886261598322, + "grad_norm": 0.06082233414053917, + "learning_rate": 4.134680326104645e-07, + "loss": 10.2938, + "step": 1602 + }, + { + "epoch": 1.9191858724932653, + "grad_norm": 0.05951394513249397, + "learning_rate": 4.0141280439184305e-07, + "loss": 10.2859, + "step": 1603 + }, + { + "epoch": 1.9203831188266984, + "grad_norm": 0.0722043514251709, + "learning_rate": 3.895352210739278e-07, + "loss": 10.3015, + "step": 1604 + }, + { + "epoch": 1.9215803651601315, + "grad_norm": 0.05289080739021301, + "learning_rate": 3.778353251980837e-07, + "loss": 10.2671, + "step": 1605 + }, + { + "epoch": 1.9227776114935649, + "grad_norm": 0.05037099868059158, + "learning_rate": 3.663131586692792e-07, + "loss": 10.2949, + "step": 1606 + }, + { + "epoch": 1.923974857826998, + "grad_norm": 0.06433635950088501, + "learning_rate": 3.5496876275590286e-07, + "loss": 10.2789, + "step": 1607 + }, + { + "epoch": 1.925172104160431, + "grad_norm": 0.05741877108812332, + "learning_rate": 3.4380217808964166e-07, + "loss": 10.2904, + "step": 1608 + }, + { + "epoch": 1.9263693504938642, + "grad_norm": 0.10033316165208817, + "learning_rate": 3.328134446653142e-07, + "loss": 10.2812, + "step": 1609 + }, + { + "epoch": 1.9275665968272973, + "grad_norm": 0.054372649639844894, + "learning_rate": 3.2200260184075406e-07, + "loss": 10.2948, + "step": 1610 + }, + { + "epoch": 1.9287638431607304, + "grad_norm": 0.09946265071630478, + "learning_rate": 3.1136968833663794e-07, + "loss": 10.3172, + "step": 1611 + }, + { + "epoch": 1.9299610894941635, + "grad_norm": 0.05201762169599533, + "learning_rate": 3.0091474223636895e-07, + "loss": 10.2796, + "step": 1612 + }, + { + "epoch": 1.9311583358275966, + "grad_norm": 0.05266715586185455, + "learning_rate": 2.9063780098592674e-07, + "loss": 10.2864, + "step": 1613 + }, + { + "epoch": 1.9323555821610297, + "grad_norm": 0.06616650521755219, + "learning_rate": 2.805389013937454e-07, + "loss": 10.2772, + "step": 1614 + }, + { + "epoch": 1.9335528284944627, + "grad_norm": 0.06895651668310165, + "learning_rate": 2.7061807963056906e-07, + "loss": 10.2941, + "step": 1615 + }, + { + "epoch": 1.9347500748278958, + "grad_norm": 0.05709347128868103, + "learning_rate": 2.6087537122934103e-07, + "loss": 10.2812, + "step": 1616 + }, + { + "epoch": 1.935947321161329, + "grad_norm": 0.06854600459337234, + "learning_rate": 2.5131081108504817e-07, + "loss": 10.2842, + "step": 1617 + }, + { + "epoch": 1.937144567494762, + "grad_norm": 0.06901566684246063, + "learning_rate": 2.419244334546267e-07, + "loss": 10.2828, + "step": 1618 + }, + { + "epoch": 1.9383418138281951, + "grad_norm": 0.06897091120481491, + "learning_rate": 2.3271627195681766e-07, + "loss": 10.3093, + "step": 1619 + }, + { + "epoch": 1.9395390601616282, + "grad_norm": 0.07506333291530609, + "learning_rate": 2.2368635957205618e-07, + "loss": 10.3011, + "step": 1620 + }, + { + "epoch": 1.9407363064950613, + "grad_norm": 0.09520450234413147, + "learning_rate": 2.1483472864234354e-07, + "loss": 10.284, + "step": 1621 + }, + { + "epoch": 1.9419335528284944, + "grad_norm": 0.13540160655975342, + "learning_rate": 2.061614108711474e-07, + "loss": 10.2993, + "step": 1622 + }, + { + "epoch": 1.9431307991619275, + "grad_norm": 0.09576369822025299, + "learning_rate": 1.9766643732328504e-07, + "loss": 10.2942, + "step": 1623 + }, + { + "epoch": 1.9443280454953606, + "grad_norm": 0.08980704843997955, + "learning_rate": 1.8934983842479047e-07, + "loss": 10.2803, + "step": 1624 + }, + { + "epoch": 1.9455252918287937, + "grad_norm": 0.13465505838394165, + "learning_rate": 1.812116439628364e-07, + "loss": 10.2874, + "step": 1625 + }, + { + "epoch": 1.9467225381622268, + "grad_norm": 0.10795759409666061, + "learning_rate": 1.732518830856067e-07, + "loss": 10.2975, + "step": 1626 + }, + { + "epoch": 1.94791978449566, + "grad_norm": 0.0945788323879242, + "learning_rate": 1.6547058430219088e-07, + "loss": 10.2483, + "step": 1627 + }, + { + "epoch": 1.949117030829093, + "grad_norm": 0.1601245403289795, + "learning_rate": 1.5786777548250641e-07, + "loss": 10.3094, + "step": 1628 + }, + { + "epoch": 1.9503142771625261, + "grad_norm": 0.10498841851949692, + "learning_rate": 1.5044348385716555e-07, + "loss": 10.3294, + "step": 1629 + }, + { + "epoch": 1.9515115234959592, + "grad_norm": 0.10477231442928314, + "learning_rate": 1.431977360173975e-07, + "loss": 10.1881, + "step": 1630 + }, + { + "epoch": 1.9527087698293923, + "grad_norm": 0.12594805657863617, + "learning_rate": 1.361305579149652e-07, + "loss": 10.314, + "step": 1631 + }, + { + "epoch": 1.9539060161628257, + "grad_norm": 0.12623995542526245, + "learning_rate": 1.2924197486203215e-07, + "loss": 10.3267, + "step": 1632 + }, + { + "epoch": 1.9551032624962588, + "grad_norm": 0.13854850828647614, + "learning_rate": 1.2253201153111238e-07, + "loss": 10.2677, + "step": 1633 + }, + { + "epoch": 1.9563005088296919, + "grad_norm": 0.24119798839092255, + "learning_rate": 1.16000691954965e-07, + "loss": 10.5225, + "step": 1634 + }, + { + "epoch": 1.957497755163125, + "grad_norm": 0.3617338538169861, + "learning_rate": 1.0964803952650537e-07, + "loss": 9.8073, + "step": 1635 + }, + { + "epoch": 1.958695001496558, + "grad_norm": 0.17286352813243866, + "learning_rate": 1.0347407699872191e-07, + "loss": 10.3332, + "step": 1636 + }, + { + "epoch": 1.9598922478299912, + "grad_norm": 0.13341809809207916, + "learning_rate": 9.747882648460937e-08, + "loss": 10.2429, + "step": 1637 + }, + { + "epoch": 1.9610894941634243, + "grad_norm": 0.09780275076627731, + "learning_rate": 9.1662309457069e-08, + "loss": 10.2747, + "step": 1638 + }, + { + "epoch": 1.9622867404968574, + "grad_norm": 0.08571167290210724, + "learning_rate": 8.602454674884186e-08, + "loss": 10.3059, + "step": 1639 + }, + { + "epoch": 1.9634839868302905, + "grad_norm": 0.07060141861438751, + "learning_rate": 8.056555855243675e-08, + "loss": 10.3138, + "step": 1640 + }, + { + "epoch": 1.9646812331637236, + "grad_norm": 0.07087056338787079, + "learning_rate": 7.528536442005241e-08, + "loss": 10.2709, + "step": 1641 + }, + { + "epoch": 1.9658784794971567, + "grad_norm": 0.06072325259447098, + "learning_rate": 7.018398326350539e-08, + "loss": 10.2961, + "step": 1642 + }, + { + "epoch": 1.9670757258305898, + "grad_norm": 0.07954999059438705, + "learning_rate": 6.526143335416901e-08, + "loss": 10.2936, + "step": 1643 + }, + { + "epoch": 1.9682729721640229, + "grad_norm": 0.09693991392850876, + "learning_rate": 6.051773232291225e-08, + "loss": 10.2887, + "step": 1644 + }, + { + "epoch": 1.969470218497456, + "grad_norm": 0.11456475406885147, + "learning_rate": 5.59528971600165e-08, + "loss": 10.3209, + "step": 1645 + }, + { + "epoch": 1.970667464830889, + "grad_norm": 0.05549129843711853, + "learning_rate": 5.15669442151423e-08, + "loss": 10.2928, + "step": 1646 + }, + { + "epoch": 1.9718647111643222, + "grad_norm": 0.06906992942094803, + "learning_rate": 4.735988919724599e-08, + "loss": 10.2904, + "step": 1647 + }, + { + "epoch": 1.9730619574977553, + "grad_norm": 0.05551117658615112, + "learning_rate": 4.333174717453536e-08, + "loss": 10.2781, + "step": 1648 + }, + { + "epoch": 1.9742592038311884, + "grad_norm": 0.046220384538173676, + "learning_rate": 3.948253257440859e-08, + "loss": 10.2909, + "step": 1649 + }, + { + "epoch": 1.9754564501646215, + "grad_norm": 0.061465710401535034, + "learning_rate": 3.581225918342646e-08, + "loss": 10.2934, + "step": 1650 + }, + { + "epoch": 1.9766536964980546, + "grad_norm": 0.08892307430505753, + "learning_rate": 3.2320940147229086e-08, + "loss": 10.2939, + "step": 1651 + }, + { + "epoch": 1.9778509428314877, + "grad_norm": 0.08193948864936829, + "learning_rate": 2.9008587970502653e-08, + "loss": 10.2927, + "step": 1652 + }, + { + "epoch": 1.9790481891649208, + "grad_norm": 0.05232901871204376, + "learning_rate": 2.5875214516946078e-08, + "loss": 10.2692, + "step": 1653 + }, + { + "epoch": 1.9802454354983539, + "grad_norm": 0.07632028311491013, + "learning_rate": 2.292083100920994e-08, + "loss": 10.3046, + "step": 1654 + }, + { + "epoch": 1.981442681831787, + "grad_norm": 0.059220828115940094, + "learning_rate": 2.0145448028874304e-08, + "loss": 10.2969, + "step": 1655 + }, + { + "epoch": 1.98263992816522, + "grad_norm": 0.06899555027484894, + "learning_rate": 1.7549075516393178e-08, + "loss": 10.2681, + "step": 1656 + }, + { + "epoch": 1.9838371744986532, + "grad_norm": 0.08329957723617554, + "learning_rate": 1.513172277106678e-08, + "loss": 10.2874, + "step": 1657 + }, + { + "epoch": 1.9850344208320863, + "grad_norm": 0.06838458776473999, + "learning_rate": 1.2893398451024886e-08, + "loss": 10.3133, + "step": 1658 + }, + { + "epoch": 1.9862316671655194, + "grad_norm": 0.07427065819501877, + "learning_rate": 1.0834110573154642e-08, + "loss": 10.2695, + "step": 1659 + }, + { + "epoch": 1.9874289134989525, + "grad_norm": 0.0783618688583374, + "learning_rate": 8.953866513111697e-09, + "loss": 10.3122, + "step": 1660 + }, + { + "epoch": 1.9886261598323856, + "grad_norm": 0.1068764179944992, + "learning_rate": 7.252673005281319e-09, + "loss": 10.2566, + "step": 1661 + }, + { + "epoch": 1.9898234061658187, + "grad_norm": 0.09878868609666824, + "learning_rate": 5.730536142745102e-09, + "loss": 10.2855, + "step": 1662 + }, + { + "epoch": 1.9910206524992518, + "grad_norm": 0.0892167016863823, + "learning_rate": 4.387461377269864e-09, + "loss": 10.2868, + "step": 1663 + }, + { + "epoch": 1.9922178988326849, + "grad_norm": 0.10703014582395554, + "learning_rate": 3.2234535192798843e-09, + "loss": 10.2911, + "step": 1664 + }, + { + "epoch": 1.993415145166118, + "grad_norm": 0.10144991427659988, + "learning_rate": 2.2385167378513593e-09, + "loss": 10.236, + "step": 1665 + }, + { + "epoch": 1.994612391499551, + "grad_norm": 0.09056767076253891, + "learning_rate": 1.432654560679092e-09, + "loss": 10.3577, + "step": 1666 + }, + { + "epoch": 1.9958096378329842, + "grad_norm": 0.10469964146614075, + "learning_rate": 8.058698740820436e-10, + "loss": 10.2485, + "step": 1667 + }, + { + "epoch": 1.9970068841664173, + "grad_norm": 0.11428900063037872, + "learning_rate": 3.5816492299223237e-10, + "loss": 10.2435, + "step": 1668 + }, + { + "epoch": 1.9982041304998504, + "grad_norm": 0.1339397430419922, + "learning_rate": 8.954131092142603e-11, + "loss": 10.418, + "step": 1669 + }, + { + "epoch": 1.9994013768332835, + "grad_norm": 0.20921064913272858, + "learning_rate": 0.0, + "loss": 10.6124, + "step": 1670 } ], "logging_steps": 1, @@ -5874,12 +11719,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 1115434597220352.0, + "total_flos": 2230243178840064.0, "train_batch_size": 4, "trial_name": null, "trial_params": null