{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.981488477521722, "eval_steps": 500, "global_step": 2640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03022289384208538, "grad_norm": 2.8042919635772705, "learning_rate": 3.7878787878787882e-06, "loss": 2.0238, "step": 10 }, { "epoch": 0.06044578768417076, "grad_norm": 1.7337448596954346, "learning_rate": 7.5757575757575764e-06, "loss": 1.7313, "step": 20 }, { "epoch": 0.09066868152625614, "grad_norm": 1.5929458141326904, "learning_rate": 1.1363636363636365e-05, "loss": 1.2745, "step": 30 }, { "epoch": 0.12089157536834153, "grad_norm": 0.9085432887077332, "learning_rate": 1.5151515151515153e-05, "loss": 0.8377, "step": 40 }, { "epoch": 0.1511144692104269, "grad_norm": 1.534300684928894, "learning_rate": 1.893939393939394e-05, "loss": 0.6819, "step": 50 }, { "epoch": 0.18133736305251227, "grad_norm": 1.3361095190048218, "learning_rate": 2.272727272727273e-05, "loss": 0.6215, "step": 60 }, { "epoch": 0.21156025689459765, "grad_norm": 1.5394957065582275, "learning_rate": 2.6515151515151516e-05, "loss": 0.583, "step": 70 }, { "epoch": 0.24178315073668305, "grad_norm": 2.153825521469116, "learning_rate": 3.0303030303030306e-05, "loss": 0.5443, "step": 80 }, { "epoch": 0.2720060445787684, "grad_norm": 1.6891471147537231, "learning_rate": 3.409090909090909e-05, "loss": 0.5418, "step": 90 }, { "epoch": 0.3022289384208538, "grad_norm": 1.127630591392517, "learning_rate": 3.787878787878788e-05, "loss": 0.527, "step": 100 }, { "epoch": 0.3324518322629392, "grad_norm": 3.1586360931396484, "learning_rate": 4.166666666666667e-05, "loss": 0.5007, "step": 110 }, { "epoch": 0.36267472610502455, "grad_norm": 1.173498272895813, "learning_rate": 4.545454545454546e-05, "loss": 0.5222, "step": 120 }, { "epoch": 0.3928976199471099, "grad_norm": 1.0684623718261719, "learning_rate": 4.9242424242424245e-05, "loss": 0.4919, "step": 130 }, { "epoch": 0.4231205137891953, "grad_norm": 0.9527800679206848, "learning_rate": 5.303030303030303e-05, "loss": 0.4892, "step": 140 }, { "epoch": 0.45334340763128067, "grad_norm": 0.7964572906494141, "learning_rate": 5.6818181818181825e-05, "loss": 0.49, "step": 150 }, { "epoch": 0.4835663014733661, "grad_norm": 0.8295568823814392, "learning_rate": 6.060606060606061e-05, "loss": 0.4796, "step": 160 }, { "epoch": 0.5137891953154514, "grad_norm": 0.9569409489631653, "learning_rate": 6.439393939393939e-05, "loss": 0.4702, "step": 170 }, { "epoch": 0.5440120891575368, "grad_norm": 0.7955500483512878, "learning_rate": 6.818181818181818e-05, "loss": 0.4818, "step": 180 }, { "epoch": 0.5742349829996222, "grad_norm": 1.0039526224136353, "learning_rate": 7.196969696969698e-05, "loss": 0.471, "step": 190 }, { "epoch": 0.6044578768417076, "grad_norm": 1.1145097017288208, "learning_rate": 7.575757575757576e-05, "loss": 0.4874, "step": 200 }, { "epoch": 0.634680770683793, "grad_norm": 0.8231551647186279, "learning_rate": 7.954545454545455e-05, "loss": 0.4641, "step": 210 }, { "epoch": 0.6649036645258783, "grad_norm": 0.7706289887428284, "learning_rate": 8.333333333333334e-05, "loss": 0.4703, "step": 220 }, { "epoch": 0.6951265583679638, "grad_norm": 0.9305810332298279, "learning_rate": 8.712121212121212e-05, "loss": 0.4589, "step": 230 }, { "epoch": 0.7253494522100491, "grad_norm": 0.895759105682373, "learning_rate": 9.090909090909092e-05, "loss": 0.465, "step": 240 }, { "epoch": 0.7555723460521345, "grad_norm": 0.8712740540504456, "learning_rate": 9.469696969696971e-05, "loss": 0.4665, "step": 250 }, { "epoch": 0.7857952398942198, "grad_norm": 0.9580489993095398, "learning_rate": 9.848484848484849e-05, "loss": 0.4478, "step": 260 }, { "epoch": 0.8160181337363053, "grad_norm": 1.2908588647842407, "learning_rate": 9.999842657116665e-05, "loss": 0.4545, "step": 270 }, { "epoch": 0.8462410275783906, "grad_norm": 0.865259051322937, "learning_rate": 9.99888115313551e-05, "loss": 0.4572, "step": 280 }, { "epoch": 0.876463921420476, "grad_norm": 0.9201974272727966, "learning_rate": 9.997045725776174e-05, "loss": 0.4283, "step": 290 }, { "epoch": 0.9066868152625613, "grad_norm": 0.7685046792030334, "learning_rate": 9.99433669591504e-05, "loss": 0.4374, "step": 300 }, { "epoch": 0.9369097091046468, "grad_norm": 3.0810554027557373, "learning_rate": 9.99075453715499e-05, "loss": 0.4333, "step": 310 }, { "epoch": 0.9671326029467322, "grad_norm": 1.222629189491272, "learning_rate": 9.986299875742613e-05, "loss": 0.4485, "step": 320 }, { "epoch": 0.9973554967888175, "grad_norm": 1.5258204936981201, "learning_rate": 9.980973490458728e-05, "loss": 0.4491, "step": 330 }, { "epoch": 1.027956176803929, "grad_norm": 3.25398325920105, "learning_rate": 9.97477631248223e-05, "loss": 0.4296, "step": 340 }, { "epoch": 1.0581790706460144, "grad_norm": 0.6229041814804077, "learning_rate": 9.967709425227294e-05, "loss": 0.4209, "step": 350 }, { "epoch": 1.0884019644880998, "grad_norm": 0.7541171312332153, "learning_rate": 9.959774064153977e-05, "loss": 0.3989, "step": 360 }, { "epoch": 1.1186248583301852, "grad_norm": 1.236759066581726, "learning_rate": 9.950971616552222e-05, "loss": 0.4226, "step": 370 }, { "epoch": 1.1488477521722704, "grad_norm": 1.4303499460220337, "learning_rate": 9.941303621299332e-05, "loss": 0.3918, "step": 380 }, { "epoch": 1.1790706460143558, "grad_norm": 1.2374080419540405, "learning_rate": 9.930771768590933e-05, "loss": 0.4012, "step": 390 }, { "epoch": 1.2092935398564413, "grad_norm": 0.7818596959114075, "learning_rate": 9.919377899645497e-05, "loss": 0.4226, "step": 400 }, { "epoch": 1.2395164336985267, "grad_norm": 0.8049737811088562, "learning_rate": 9.907124006382438e-05, "loss": 0.3948, "step": 410 }, { "epoch": 1.269739327540612, "grad_norm": 2.329979181289673, "learning_rate": 9.894012231073894e-05, "loss": 0.4052, "step": 420 }, { "epoch": 1.2999622213826973, "grad_norm": 0.8067731261253357, "learning_rate": 9.880044865970192e-05, "loss": 0.4049, "step": 430 }, { "epoch": 1.3301851152247828, "grad_norm": 0.8137128949165344, "learning_rate": 9.865224352899119e-05, "loss": 0.4083, "step": 440 }, { "epoch": 1.3604080090668682, "grad_norm": 1.8560458421707153, "learning_rate": 9.849553282839025e-05, "loss": 0.3963, "step": 450 }, { "epoch": 1.3906309029089536, "grad_norm": 0.6536078453063965, "learning_rate": 9.833034395465866e-05, "loss": 0.3951, "step": 460 }, { "epoch": 1.4208537967510388, "grad_norm": 0.57303386926651, "learning_rate": 9.815670578674232e-05, "loss": 0.3823, "step": 470 }, { "epoch": 1.4510766905931243, "grad_norm": 0.713214635848999, "learning_rate": 9.797464868072488e-05, "loss": 0.4022, "step": 480 }, { "epoch": 1.4812995844352097, "grad_norm": 0.6472973227500916, "learning_rate": 9.778420446452063e-05, "loss": 0.4026, "step": 490 }, { "epoch": 1.511522478277295, "grad_norm": 0.7143449783325195, "learning_rate": 9.75854064323104e-05, "loss": 0.4079, "step": 500 }, { "epoch": 1.5417453721193803, "grad_norm": 0.5345566272735596, "learning_rate": 9.737828933872075e-05, "loss": 0.3996, "step": 510 }, { "epoch": 1.5719682659614658, "grad_norm": 1.006264090538025, "learning_rate": 9.716288939274819e-05, "loss": 0.4077, "step": 520 }, { "epoch": 1.6021911598035512, "grad_norm": 0.7178054451942444, "learning_rate": 9.693924425142886e-05, "loss": 0.4167, "step": 530 }, { "epoch": 1.6324140536456366, "grad_norm": 0.8414012789726257, "learning_rate": 9.670739301325534e-05, "loss": 0.4104, "step": 540 }, { "epoch": 1.662636947487722, "grad_norm": 0.6398180723190308, "learning_rate": 9.646737621134112e-05, "loss": 0.3756, "step": 550 }, { "epoch": 1.6928598413298075, "grad_norm": 0.7549942135810852, "learning_rate": 9.62192358063346e-05, "loss": 0.3879, "step": 560 }, { "epoch": 1.7230827351718927, "grad_norm": 0.6766812205314636, "learning_rate": 9.596301517908328e-05, "loss": 0.3841, "step": 570 }, { "epoch": 1.7533056290139781, "grad_norm": 0.8327857255935669, "learning_rate": 9.56987591230498e-05, "loss": 0.3958, "step": 580 }, { "epoch": 1.7835285228560633, "grad_norm": 0.6985905766487122, "learning_rate": 9.542651383648091e-05, "loss": 0.4113, "step": 590 }, { "epoch": 1.8137514166981488, "grad_norm": 0.6444863677024841, "learning_rate": 9.514632691433107e-05, "loss": 0.392, "step": 600 }, { "epoch": 1.8439743105402342, "grad_norm": 0.8134333491325378, "learning_rate": 9.48582473399415e-05, "loss": 0.3996, "step": 610 }, { "epoch": 1.8741972043823196, "grad_norm": 0.6443466544151306, "learning_rate": 9.456232547647694e-05, "loss": 0.3596, "step": 620 }, { "epoch": 1.904420098224405, "grad_norm": 0.7170634269714355, "learning_rate": 9.425861305812082e-05, "loss": 0.3732, "step": 630 }, { "epoch": 1.9346429920664905, "grad_norm": 1.3517498970031738, "learning_rate": 9.394716318103098e-05, "loss": 0.3791, "step": 640 }, { "epoch": 1.9648658859085757, "grad_norm": 2.1151044368743896, "learning_rate": 9.362803029405712e-05, "loss": 0.3634, "step": 650 }, { "epoch": 1.9950887797506611, "grad_norm": 0.6365861296653748, "learning_rate": 9.330127018922194e-05, "loss": 0.393, "step": 660 }, { "epoch": 2.0256894597657724, "grad_norm": 1.6099416017532349, "learning_rate": 9.296693999196728e-05, "loss": 0.3329, "step": 670 }, { "epoch": 2.055912353607858, "grad_norm": 1.0266637802124023, "learning_rate": 9.262509815116732e-05, "loss": 0.3534, "step": 680 }, { "epoch": 2.0861352474499433, "grad_norm": 0.803153395652771, "learning_rate": 9.227580442891022e-05, "loss": 0.3328, "step": 690 }, { "epoch": 2.1163581412920287, "grad_norm": 0.6713061928749084, "learning_rate": 9.191911989005037e-05, "loss": 0.3524, "step": 700 }, { "epoch": 2.146581035134114, "grad_norm": 0.5472012162208557, "learning_rate": 9.155510689153282e-05, "loss": 0.3368, "step": 710 }, { "epoch": 2.1768039289761996, "grad_norm": 1.0651007890701294, "learning_rate": 9.118382907149165e-05, "loss": 0.2971, "step": 720 }, { "epoch": 2.207026822818285, "grad_norm": 0.7630440592765808, "learning_rate": 9.080535133812469e-05, "loss": 0.3375, "step": 730 }, { "epoch": 2.2372497166603704, "grad_norm": 0.6513319611549377, "learning_rate": 9.041973985834595e-05, "loss": 0.3234, "step": 740 }, { "epoch": 2.2674726105024554, "grad_norm": 0.5795676112174988, "learning_rate": 9.002706204621803e-05, "loss": 0.3288, "step": 750 }, { "epoch": 2.297695504344541, "grad_norm": 0.6500043272972107, "learning_rate": 8.962738655116658e-05, "loss": 0.3356, "step": 760 }, { "epoch": 2.3279183981866263, "grad_norm": 0.665492594242096, "learning_rate": 8.922078324597879e-05, "loss": 0.3359, "step": 770 }, { "epoch": 2.3581412920287117, "grad_norm": 0.7372997403144836, "learning_rate": 8.880732321458784e-05, "loss": 0.3347, "step": 780 }, { "epoch": 2.388364185870797, "grad_norm": 0.617290735244751, "learning_rate": 8.838707873964587e-05, "loss": 0.3365, "step": 790 }, { "epoch": 2.4185870797128826, "grad_norm": 0.7033390402793884, "learning_rate": 8.796012328988716e-05, "loss": 0.3386, "step": 800 }, { "epoch": 2.448809973554968, "grad_norm": 1.019696831703186, "learning_rate": 8.752653150728411e-05, "loss": 0.3244, "step": 810 }, { "epoch": 2.4790328673970534, "grad_norm": 0.8520340323448181, "learning_rate": 8.708637919399798e-05, "loss": 0.3387, "step": 820 }, { "epoch": 2.509255761239139, "grad_norm": 1.2463282346725464, "learning_rate": 8.663974329912696e-05, "loss": 0.3334, "step": 830 }, { "epoch": 2.539478655081224, "grad_norm": 0.6218689680099487, "learning_rate": 8.618670190525352e-05, "loss": 0.3412, "step": 840 }, { "epoch": 2.5697015489233093, "grad_norm": 0.635360836982727, "learning_rate": 8.572733421479382e-05, "loss": 0.3292, "step": 850 }, { "epoch": 2.5999244427653947, "grad_norm": 0.5387209057807922, "learning_rate": 8.526172053615121e-05, "loss": 0.3359, "step": 860 }, { "epoch": 2.63014733660748, "grad_norm": 0.5592532157897949, "learning_rate": 8.478994226967638e-05, "loss": 0.3466, "step": 870 }, { "epoch": 2.6603702304495656, "grad_norm": 0.7285254597663879, "learning_rate": 8.43120818934367e-05, "loss": 0.3345, "step": 880 }, { "epoch": 2.690593124291651, "grad_norm": 0.6214547753334045, "learning_rate": 8.382822294879698e-05, "loss": 0.3464, "step": 890 }, { "epoch": 2.7208160181337364, "grad_norm": 0.4706718325614929, "learning_rate": 8.333845002581458e-05, "loss": 0.3267, "step": 900 }, { "epoch": 2.751038911975822, "grad_norm": 1.0929789543151855, "learning_rate": 8.284284874845104e-05, "loss": 0.3327, "step": 910 }, { "epoch": 2.7812618058179073, "grad_norm": 0.6653083562850952, "learning_rate": 8.234150575960288e-05, "loss": 0.3146, "step": 920 }, { "epoch": 2.8114846996599923, "grad_norm": 0.4961538314819336, "learning_rate": 8.183450870595441e-05, "loss": 0.3382, "step": 930 }, { "epoch": 2.8417075935020777, "grad_norm": 1.2516436576843262, "learning_rate": 8.132194622265507e-05, "loss": 0.3293, "step": 940 }, { "epoch": 2.871930487344163, "grad_norm": 1.3802598714828491, "learning_rate": 8.080390791782375e-05, "loss": 0.3344, "step": 950 }, { "epoch": 2.9021533811862485, "grad_norm": 0.619377851486206, "learning_rate": 8.028048435688333e-05, "loss": 0.3396, "step": 960 }, { "epoch": 2.932376275028334, "grad_norm": 0.5353909134864807, "learning_rate": 7.975176704672758e-05, "loss": 0.3375, "step": 970 }, { "epoch": 2.9625991688704194, "grad_norm": 0.6385555267333984, "learning_rate": 7.921784841972355e-05, "loss": 0.3308, "step": 980 }, { "epoch": 2.992822062712505, "grad_norm": 1.1463056802749634, "learning_rate": 7.86788218175523e-05, "loss": 0.3283, "step": 990 }, { "epoch": 3.0234227427276164, "grad_norm": 0.7241145968437195, "learning_rate": 7.813478147489052e-05, "loss": 0.2587, "step": 1000 }, { "epoch": 3.0536456365697013, "grad_norm": 0.8696322441101074, "learning_rate": 7.758582250293596e-05, "loss": 0.2624, "step": 1010 }, { "epoch": 3.0838685304117868, "grad_norm": 0.7954265475273132, "learning_rate": 7.703204087277988e-05, "loss": 0.2414, "step": 1020 }, { "epoch": 3.114091424253872, "grad_norm": 0.611754298210144, "learning_rate": 7.647353339862895e-05, "loss": 0.2506, "step": 1030 }, { "epoch": 3.1443143180959576, "grad_norm": 0.7143287658691406, "learning_rate": 7.591039772087977e-05, "loss": 0.2516, "step": 1040 }, { "epoch": 3.174537211938043, "grad_norm": 0.597282350063324, "learning_rate": 7.534273228904915e-05, "loss": 0.2628, "step": 1050 }, { "epoch": 3.2047601057801285, "grad_norm": 0.7054301500320435, "learning_rate": 7.477063634456263e-05, "loss": 0.2611, "step": 1060 }, { "epoch": 3.234982999622214, "grad_norm": 0.5925849080085754, "learning_rate": 7.41942099034048e-05, "loss": 0.2782, "step": 1070 }, { "epoch": 3.2652058934642993, "grad_norm": 0.6402950286865234, "learning_rate": 7.361355373863414e-05, "loss": 0.2506, "step": 1080 }, { "epoch": 3.2954287873063848, "grad_norm": 0.6893470287322998, "learning_rate": 7.302876936276546e-05, "loss": 0.2593, "step": 1090 }, { "epoch": 3.3256516811484698, "grad_norm": 0.7923823595046997, "learning_rate": 7.243995901002312e-05, "loss": 0.2677, "step": 1100 }, { "epoch": 3.355874574990555, "grad_norm": 0.6485563516616821, "learning_rate": 7.184722561846798e-05, "loss": 0.2596, "step": 1110 }, { "epoch": 3.3860974688326406, "grad_norm": 1.6831997632980347, "learning_rate": 7.12506728120015e-05, "loss": 0.2507, "step": 1120 }, { "epoch": 3.416320362674726, "grad_norm": 0.6467958092689514, "learning_rate": 7.065040488224974e-05, "loss": 0.2626, "step": 1130 }, { "epoch": 3.4465432565168115, "grad_norm": 0.7198959589004517, "learning_rate": 7.004652677033068e-05, "loss": 0.2601, "step": 1140 }, { "epoch": 3.476766150358897, "grad_norm": 0.6917047500610352, "learning_rate": 6.94391440485081e-05, "loss": 0.2692, "step": 1150 }, { "epoch": 3.5069890442009823, "grad_norm": 0.9341140985488892, "learning_rate": 6.882836290173493e-05, "loss": 0.2609, "step": 1160 }, { "epoch": 3.5372119380430678, "grad_norm": 0.7227875590324402, "learning_rate": 6.821429010908971e-05, "loss": 0.2561, "step": 1170 }, { "epoch": 3.567434831885153, "grad_norm": 0.7613323926925659, "learning_rate": 6.759703302510898e-05, "loss": 0.2461, "step": 1180 }, { "epoch": 3.597657725727238, "grad_norm": 0.5406620502471924, "learning_rate": 6.697669956101914e-05, "loss": 0.2669, "step": 1190 }, { "epoch": 3.6278806195693236, "grad_norm": 0.6179462671279907, "learning_rate": 6.635339816587109e-05, "loss": 0.2655, "step": 1200 }, { "epoch": 3.658103513411409, "grad_norm": 0.7479238510131836, "learning_rate": 6.572723780758069e-05, "loss": 0.2567, "step": 1210 }, { "epoch": 3.6883264072534945, "grad_norm": 0.6450394988059998, "learning_rate": 6.509832795387858e-05, "loss": 0.2721, "step": 1220 }, { "epoch": 3.71854930109558, "grad_norm": 0.7902218103408813, "learning_rate": 6.446677855317264e-05, "loss": 0.2745, "step": 1230 }, { "epoch": 3.7487721949376653, "grad_norm": 0.5468536019325256, "learning_rate": 6.383270001532635e-05, "loss": 0.2662, "step": 1240 }, { "epoch": 3.7789950887797508, "grad_norm": 0.5629965662956238, "learning_rate": 6.319620319235659e-05, "loss": 0.2583, "step": 1250 }, { "epoch": 3.809217982621836, "grad_norm": 0.601555585861206, "learning_rate": 6.255739935905396e-05, "loss": 0.2681, "step": 1260 }, { "epoch": 3.8394408764639216, "grad_norm": 0.7035398483276367, "learning_rate": 6.191640019352942e-05, "loss": 0.2723, "step": 1270 }, { "epoch": 3.8696637703060066, "grad_norm": 0.6691277623176575, "learning_rate": 6.127331775769023e-05, "loss": 0.2582, "step": 1280 }, { "epoch": 3.899886664148092, "grad_norm": 0.5650741457939148, "learning_rate": 6.062826447764883e-05, "loss": 0.2667, "step": 1290 }, { "epoch": 3.9301095579901775, "grad_norm": 0.8159789443016052, "learning_rate": 5.998135312406821e-05, "loss": 0.2657, "step": 1300 }, { "epoch": 3.960332451832263, "grad_norm": 0.6696954965591431, "learning_rate": 5.9332696792446727e-05, "loss": 0.2697, "step": 1310 }, { "epoch": 3.9905553456743483, "grad_norm": 0.584435760974884, "learning_rate": 5.868240888334653e-05, "loss": 0.246, "step": 1320 }, { "epoch": 4.02115602568946, "grad_norm": 0.5735254883766174, "learning_rate": 5.803060308256824e-05, "loss": 0.1935, "step": 1330 }, { "epoch": 4.051378919531545, "grad_norm": 0.6788007616996765, "learning_rate": 5.737739334127611e-05, "loss": 0.1943, "step": 1340 }, { "epoch": 4.081601813373631, "grad_norm": 0.6538478136062622, "learning_rate": 5.6722893856076596e-05, "loss": 0.1877, "step": 1350 }, { "epoch": 4.111824707215716, "grad_norm": 0.6879804134368896, "learning_rate": 5.60672190490541e-05, "loss": 0.189, "step": 1360 }, { "epoch": 4.142047601057802, "grad_norm": 0.642549991607666, "learning_rate": 5.541048354776721e-05, "loss": 0.1848, "step": 1370 }, { "epoch": 4.1722704948998866, "grad_norm": 0.6130525469779968, "learning_rate": 5.475280216520913e-05, "loss": 0.1901, "step": 1380 }, { "epoch": 4.202493388741972, "grad_norm": 0.880362331867218, "learning_rate": 5.409428987973564e-05, "loss": 0.1804, "step": 1390 }, { "epoch": 4.232716282584057, "grad_norm": 0.6228994727134705, "learning_rate": 5.343506181496405e-05, "loss": 0.1915, "step": 1400 }, { "epoch": 4.262939176426142, "grad_norm": 0.6418446898460388, "learning_rate": 5.277523321964701e-05, "loss": 0.1877, "step": 1410 }, { "epoch": 4.293162070268228, "grad_norm": 0.7304417490959167, "learning_rate": 5.2114919447524155e-05, "loss": 0.1905, "step": 1420 }, { "epoch": 4.323384964110313, "grad_norm": 1.017180323600769, "learning_rate": 5.145423593715557e-05, "loss": 0.1736, "step": 1430 }, { "epoch": 4.353607857952399, "grad_norm": 0.7358663082122803, "learning_rate": 5.0793298191740404e-05, "loss": 0.19, "step": 1440 }, { "epoch": 4.383830751794484, "grad_norm": 0.674080491065979, "learning_rate": 5.013222175892411e-05, "loss": 0.1907, "step": 1450 }, { "epoch": 4.41405364563657, "grad_norm": 0.645470142364502, "learning_rate": 4.9471122210598034e-05, "loss": 0.196, "step": 1460 }, { "epoch": 4.444276539478655, "grad_norm": 0.5910624861717224, "learning_rate": 4.881011512269463e-05, "loss": 0.187, "step": 1470 }, { "epoch": 4.474499433320741, "grad_norm": 0.7850512862205505, "learning_rate": 4.8149316054982095e-05, "loss": 0.186, "step": 1480 }, { "epoch": 4.504722327162826, "grad_norm": 1.1707135438919067, "learning_rate": 4.748884053086175e-05, "loss": 0.1843, "step": 1490 }, { "epoch": 4.534945221004911, "grad_norm": 0.7980189323425293, "learning_rate": 4.6828804017171776e-05, "loss": 0.1985, "step": 1500 }, { "epoch": 4.565168114846997, "grad_norm": 0.5942202806472778, "learning_rate": 4.616932190400089e-05, "loss": 0.1949, "step": 1510 }, { "epoch": 4.595391008689082, "grad_norm": 0.6443480253219604, "learning_rate": 4.551050948451542e-05, "loss": 0.1847, "step": 1520 }, { "epoch": 4.6256139025311676, "grad_norm": 0.6935557126998901, "learning_rate": 4.485248193480328e-05, "loss": 0.1819, "step": 1530 }, { "epoch": 4.6558367963732525, "grad_norm": 0.8106910586357117, "learning_rate": 4.4195354293738484e-05, "loss": 0.1839, "step": 1540 }, { "epoch": 4.686059690215338, "grad_norm": 0.8624558448791504, "learning_rate": 4.353924144286963e-05, "loss": 0.1848, "step": 1550 }, { "epoch": 4.716282584057423, "grad_norm": 0.7457184791564941, "learning_rate": 4.288425808633575e-05, "loss": 0.1843, "step": 1560 }, { "epoch": 4.746505477899509, "grad_norm": 0.7222162485122681, "learning_rate": 4.223051873081349e-05, "loss": 0.1962, "step": 1570 }, { "epoch": 4.776728371741594, "grad_norm": 0.6403977274894714, "learning_rate": 4.157813766549848e-05, "loss": 0.1872, "step": 1580 }, { "epoch": 4.806951265583679, "grad_norm": 1.0512725114822388, "learning_rate": 4.092722894212487e-05, "loss": 0.186, "step": 1590 }, { "epoch": 4.837174159425765, "grad_norm": 0.7039276361465454, "learning_rate": 4.027790635502646e-05, "loss": 0.1854, "step": 1600 }, { "epoch": 4.86739705326785, "grad_norm": 1.100012183189392, "learning_rate": 3.963028342124265e-05, "loss": 0.1904, "step": 1610 }, { "epoch": 4.897619947109936, "grad_norm": 0.9361763000488281, "learning_rate": 3.898447336067297e-05, "loss": 0.1905, "step": 1620 }, { "epoch": 4.927842840952021, "grad_norm": 0.7108214497566223, "learning_rate": 3.8340589076283645e-05, "loss": 0.1807, "step": 1630 }, { "epoch": 4.958065734794107, "grad_norm": 0.5470288991928101, "learning_rate": 3.769874313436933e-05, "loss": 0.1765, "step": 1640 }, { "epoch": 4.988288628636192, "grad_norm": 1.0886564254760742, "learning_rate": 3.705904774487396e-05, "loss": 0.178, "step": 1650 }, { "epoch": 5.018889308651303, "grad_norm": 1.7127904891967773, "learning_rate": 3.64216147417737e-05, "loss": 0.1444, "step": 1660 }, { "epoch": 5.049112202493388, "grad_norm": 0.5874884128570557, "learning_rate": 3.5786555563525745e-05, "loss": 0.1326, "step": 1670 }, { "epoch": 5.079335096335474, "grad_norm": 0.6397553086280823, "learning_rate": 3.515398123358627e-05, "loss": 0.1269, "step": 1680 }, { "epoch": 5.109557990177559, "grad_norm": 0.8377091288566589, "learning_rate": 3.4524002341000715e-05, "loss": 0.1311, "step": 1690 }, { "epoch": 5.139780884019645, "grad_norm": 0.6393041014671326, "learning_rate": 3.389672902107044e-05, "loss": 0.132, "step": 1700 }, { "epoch": 5.17000377786173, "grad_norm": 0.8685671091079712, "learning_rate": 3.3272270936098246e-05, "loss": 0.1195, "step": 1710 }, { "epoch": 5.200226671703816, "grad_norm": 1.0302200317382812, "learning_rate": 3.2650737256216886e-05, "loss": 0.1309, "step": 1720 }, { "epoch": 5.230449565545901, "grad_norm": 0.6609297394752502, "learning_rate": 3.2032236640303545e-05, "loss": 0.127, "step": 1730 }, { "epoch": 5.260672459387987, "grad_norm": 0.717876136302948, "learning_rate": 3.141687721698363e-05, "loss": 0.1288, "step": 1740 }, { "epoch": 5.290895353230072, "grad_norm": 0.7257722616195679, "learning_rate": 3.0804766565727316e-05, "loss": 0.1289, "step": 1750 }, { "epoch": 5.321118247072157, "grad_norm": 0.5968477129936218, "learning_rate": 3.019601169804216e-05, "loss": 0.1246, "step": 1760 }, { "epoch": 5.351341140914243, "grad_norm": 0.5718531012535095, "learning_rate": 2.959071903876486e-05, "loss": 0.1221, "step": 1770 }, { "epoch": 5.381564034756328, "grad_norm": 0.7471643090248108, "learning_rate": 2.898899440745569e-05, "loss": 0.1247, "step": 1780 }, { "epoch": 5.4117869285984135, "grad_norm": 0.8166024684906006, "learning_rate": 2.8390942999898766e-05, "loss": 0.125, "step": 1790 }, { "epoch": 5.4420098224404985, "grad_norm": 0.6300913691520691, "learning_rate": 2.7796669369711294e-05, "loss": 0.1252, "step": 1800 }, { "epoch": 5.472232716282584, "grad_norm": 0.6817363500595093, "learning_rate": 2.720627741006505e-05, "loss": 0.1272, "step": 1810 }, { "epoch": 5.502455610124669, "grad_norm": 0.6244085431098938, "learning_rate": 2.6619870335523432e-05, "loss": 0.1218, "step": 1820 }, { "epoch": 5.532678503966755, "grad_norm": 0.7697808742523193, "learning_rate": 2.603755066399718e-05, "loss": 0.1221, "step": 1830 }, { "epoch": 5.56290139780884, "grad_norm": 0.728665292263031, "learning_rate": 2.5459420198821605e-05, "loss": 0.1267, "step": 1840 }, { "epoch": 5.593124291650925, "grad_norm": 0.8812174797058105, "learning_rate": 2.4885580010959153e-05, "loss": 0.1271, "step": 1850 }, { "epoch": 5.623347185493011, "grad_norm": 0.6068354845046997, "learning_rate": 2.4316130421329697e-05, "loss": 0.1257, "step": 1860 }, { "epoch": 5.653570079335096, "grad_norm": 0.6595832109451294, "learning_rate": 2.3751170983272e-05, "loss": 0.1233, "step": 1870 }, { "epoch": 5.683792973177182, "grad_norm": 0.5363273024559021, "learning_rate": 2.319080046513954e-05, "loss": 0.1256, "step": 1880 }, { "epoch": 5.714015867019267, "grad_norm": 0.5247385501861572, "learning_rate": 2.2635116833033393e-05, "loss": 0.1221, "step": 1890 }, { "epoch": 5.744238760861353, "grad_norm": 0.7972837686538696, "learning_rate": 2.2084217233675385e-05, "loss": 0.1191, "step": 1900 }, { "epoch": 5.774461654703438, "grad_norm": 0.766128659248352, "learning_rate": 2.1538197977424618e-05, "loss": 0.1207, "step": 1910 }, { "epoch": 5.804684548545524, "grad_norm": 0.6039882302284241, "learning_rate": 2.09971545214401e-05, "loss": 0.1187, "step": 1920 }, { "epoch": 5.834907442387609, "grad_norm": 0.6861318349838257, "learning_rate": 2.0461181452992496e-05, "loss": 0.1214, "step": 1930 }, { "epoch": 5.865130336229694, "grad_norm": 0.530323326587677, "learning_rate": 1.9930372472928095e-05, "loss": 0.1295, "step": 1940 }, { "epoch": 5.8953532300717795, "grad_norm": 0.6027620434761047, "learning_rate": 1.9404820379287675e-05, "loss": 0.1316, "step": 1950 }, { "epoch": 5.9255761239138645, "grad_norm": 0.5568642020225525, "learning_rate": 1.888461705108318e-05, "loss": 0.1185, "step": 1960 }, { "epoch": 5.95579901775595, "grad_norm": 0.677137553691864, "learning_rate": 1.8369853432235024e-05, "loss": 0.1213, "step": 1970 }, { "epoch": 5.986021911598035, "grad_norm": 0.7476499676704407, "learning_rate": 1.7860619515673033e-05, "loss": 0.1258, "step": 1980 }, { "epoch": 6.016622591613147, "grad_norm": 0.3778318464756012, "learning_rate": 1.73570043276035e-05, "loss": 0.1076, "step": 1990 }, { "epoch": 6.046845485455233, "grad_norm": 0.5993187427520752, "learning_rate": 1.68590959119452e-05, "loss": 0.0849, "step": 2000 }, { "epoch": 6.077068379297318, "grad_norm": 0.5854635834693909, "learning_rate": 1.6366981314937376e-05, "loss": 0.0911, "step": 2010 }, { "epoch": 6.107291273139403, "grad_norm": 0.48482272028923035, "learning_rate": 1.5880746569921868e-05, "loss": 0.1051, "step": 2020 }, { "epoch": 6.1375141669814886, "grad_norm": 0.42831626534461975, "learning_rate": 1.5400476682302454e-05, "loss": 0.087, "step": 2030 }, { "epoch": 6.1677370608235735, "grad_norm": 0.7245472073554993, "learning_rate": 1.4926255614683932e-05, "loss": 0.0981, "step": 2040 }, { "epoch": 6.197959954665659, "grad_norm": 0.4705452024936676, "learning_rate": 1.4458166272193425e-05, "loss": 0.0895, "step": 2050 }, { "epoch": 6.228182848507744, "grad_norm": 0.438855916261673, "learning_rate": 1.3996290487986568e-05, "loss": 0.088, "step": 2060 }, { "epoch": 6.25840574234983, "grad_norm": 0.641183614730835, "learning_rate": 1.354070900894115e-05, "loss": 0.0989, "step": 2070 }, { "epoch": 6.288628636191915, "grad_norm": 0.4393808841705322, "learning_rate": 1.3091501481540674e-05, "loss": 0.086, "step": 2080 }, { "epoch": 6.318851530034001, "grad_norm": 0.5123514533042908, "learning_rate": 1.264874643795021e-05, "loss": 0.0874, "step": 2090 }, { "epoch": 6.349074423876086, "grad_norm": 0.7096104025840759, "learning_rate": 1.2212521282287092e-05, "loss": 0.0858, "step": 2100 }, { "epoch": 6.379297317718171, "grad_norm": 0.7978008389472961, "learning_rate": 1.178290227708888e-05, "loss": 0.1001, "step": 2110 }, { "epoch": 6.409520211560257, "grad_norm": 0.8465205430984497, "learning_rate": 1.1359964529980849e-05, "loss": 0.0864, "step": 2120 }, { "epoch": 6.439743105402342, "grad_norm": 0.4713567793369293, "learning_rate": 1.0943781980545331e-05, "loss": 0.0871, "step": 2130 }, { "epoch": 6.469965999244428, "grad_norm": 0.5560480356216431, "learning_rate": 1.053442738739539e-05, "loss": 0.0801, "step": 2140 }, { "epoch": 6.500188893086513, "grad_norm": 0.4392213821411133, "learning_rate": 1.0131972315454869e-05, "loss": 0.0824, "step": 2150 }, { "epoch": 6.530411786928599, "grad_norm": 0.7270606160163879, "learning_rate": 9.73648712344707e-06, "loss": 0.0895, "step": 2160 }, { "epoch": 6.560634680770684, "grad_norm": 0.5097915530204773, "learning_rate": 9.348040951594477e-06, "loss": 0.0889, "step": 2170 }, { "epoch": 6.5908575746127696, "grad_norm": 0.7505512833595276, "learning_rate": 8.966701709531344e-06, "loss": 0.0889, "step": 2180 }, { "epoch": 6.6210804684548545, "grad_norm": 0.5032856464385986, "learning_rate": 8.592536064431467e-06, "loss": 0.0837, "step": 2190 }, { "epoch": 6.6513033622969395, "grad_norm": 0.632739245891571, "learning_rate": 8.225609429353187e-06, "loss": 0.0909, "step": 2200 }, { "epoch": 6.681526256139025, "grad_norm": 0.6564577221870422, "learning_rate": 7.865985951803645e-06, "loss": 0.0985, "step": 2210 }, { "epoch": 6.71174914998111, "grad_norm": 0.5407485961914062, "learning_rate": 7.513728502524286e-06, "loss": 0.0856, "step": 2220 }, { "epoch": 6.741972043823196, "grad_norm": 0.5400388836860657, "learning_rate": 7.168898664499485e-06, "loss": 0.0848, "step": 2230 }, { "epoch": 6.772194937665281, "grad_norm": 0.5073435306549072, "learning_rate": 6.831556722190452e-06, "loss": 0.088, "step": 2240 }, { "epoch": 6.802417831507367, "grad_norm": 0.5684376955032349, "learning_rate": 6.501761650996052e-06, "loss": 0.0937, "step": 2250 }, { "epoch": 6.832640725349452, "grad_norm": 0.6096483469009399, "learning_rate": 6.1795711069424666e-06, "loss": 0.0823, "step": 2260 }, { "epoch": 6.862863619191538, "grad_norm": 0.5201295018196106, "learning_rate": 5.865041416603601e-06, "loss": 0.092, "step": 2270 }, { "epoch": 6.893086513033623, "grad_norm": 0.6869983077049255, "learning_rate": 5.558227567253832e-06, "loss": 0.093, "step": 2280 }, { "epoch": 6.923309406875708, "grad_norm": 0.561122715473175, "learning_rate": 5.259183197254902e-06, "loss": 0.0802, "step": 2290 }, { "epoch": 6.953532300717794, "grad_norm": 0.5263656377792358, "learning_rate": 4.967960586678721e-06, "loss": 0.0839, "step": 2300 }, { "epoch": 6.983755194559879, "grad_norm": 0.575607419013977, "learning_rate": 4.684610648167503e-06, "loss": 0.0896, "step": 2310 }, { "epoch": 7.01435587457499, "grad_norm": 0.3270145654678345, "learning_rate": 4.4091829180330505e-06, "loss": 0.0778, "step": 2320 }, { "epoch": 7.044578768417076, "grad_norm": 0.5017428994178772, "learning_rate": 4.1417255475966145e-06, "loss": 0.0818, "step": 2330 }, { "epoch": 7.074801662259161, "grad_norm": 0.5966259837150574, "learning_rate": 3.8822852947709375e-06, "loss": 0.0747, "step": 2340 }, { "epoch": 7.105024556101247, "grad_norm": 0.758906900882721, "learning_rate": 3.630907515885801e-06, "loss": 0.0743, "step": 2350 }, { "epoch": 7.135247449943332, "grad_norm": 0.564600944519043, "learning_rate": 3.3876361577587113e-06, "loss": 0.0831, "step": 2360 }, { "epoch": 7.165470343785417, "grad_norm": 0.548526406288147, "learning_rate": 3.1525137500119207e-06, "loss": 0.0753, "step": 2370 }, { "epoch": 7.195693237627503, "grad_norm": 0.619285523891449, "learning_rate": 2.9255813976372226e-06, "loss": 0.0777, "step": 2380 }, { "epoch": 7.225916131469588, "grad_norm": 0.5830234289169312, "learning_rate": 2.7068787738098735e-06, "loss": 0.0803, "step": 2390 }, { "epoch": 7.256139025311674, "grad_norm": 0.40243130922317505, "learning_rate": 2.496444112952734e-06, "loss": 0.063, "step": 2400 }, { "epoch": 7.286361919153759, "grad_norm": 0.42115914821624756, "learning_rate": 2.2943142040519837e-06, "loss": 0.068, "step": 2410 }, { "epoch": 7.316584812995845, "grad_norm": 0.7173041105270386, "learning_rate": 2.100524384225555e-06, "loss": 0.0808, "step": 2420 }, { "epoch": 7.34680770683793, "grad_norm": 0.48172470927238464, "learning_rate": 1.915108532545351e-06, "loss": 0.0756, "step": 2430 }, { "epoch": 7.3770306006800155, "grad_norm": 0.541785717010498, "learning_rate": 1.738099064114368e-06, "loss": 0.0717, "step": 2440 }, { "epoch": 7.4072534945221005, "grad_norm": 0.6277313828468323, "learning_rate": 1.569526924399778e-06, "loss": 0.067, "step": 2450 }, { "epoch": 7.4374763883641855, "grad_norm": 1.088949203491211, "learning_rate": 1.4094215838229176e-06, "loss": 0.0773, "step": 2460 }, { "epoch": 7.467699282206271, "grad_norm": 0.7406247854232788, "learning_rate": 1.2578110326071702e-06, "loss": 0.0695, "step": 2470 }, { "epoch": 7.497922176048356, "grad_norm": 0.5518158078193665, "learning_rate": 1.1147217758845751e-06, "loss": 0.0711, "step": 2480 }, { "epoch": 7.528145069890442, "grad_norm": 1.5025277137756348, "learning_rate": 9.801788290621505e-07, "loss": 0.0749, "step": 2490 }, { "epoch": 7.558367963732527, "grad_norm": 0.43806299567222595, "learning_rate": 8.542057134485638e-07, "loss": 0.0679, "step": 2500 }, { "epoch": 7.588590857574613, "grad_norm": 0.604317843914032, "learning_rate": 7.368244521420486e-07, "loss": 0.0674, "step": 2510 }, { "epoch": 7.618813751416698, "grad_norm": 0.5833208560943604, "learning_rate": 6.280555661802856e-07, "loss": 0.0771, "step": 2520 }, { "epoch": 7.649036645258784, "grad_norm": 0.6899517178535461, "learning_rate": 5.279180709527765e-07, "loss": 0.072, "step": 2530 }, { "epoch": 7.679259539100869, "grad_norm": 0.5004780888557434, "learning_rate": 4.3642947287654277e-07, "loss": 0.0702, "step": 2540 }, { "epoch": 7.709482432942954, "grad_norm": 1.1061525344848633, "learning_rate": 3.536057663355852e-07, "loss": 0.0726, "step": 2550 }, { "epoch": 7.73970532678504, "grad_norm": 1.5561116933822632, "learning_rate": 2.794614308846644e-07, "loss": 0.0755, "step": 2560 }, { "epoch": 7.769928220627125, "grad_norm": 0.4418126940727234, "learning_rate": 2.1400942871794283e-07, "loss": 0.0783, "step": 2570 }, { "epoch": 7.800151114469211, "grad_norm": 0.6038776636123657, "learning_rate": 1.5726120240288634e-07, "loss": 0.0769, "step": 2580 }, { "epoch": 7.830374008311296, "grad_norm": 0.6262177228927612, "learning_rate": 1.0922667287981969e-07, "loss": 0.0748, "step": 2590 }, { "epoch": 7.8605969021533815, "grad_norm": 0.8348723649978638, "learning_rate": 6.991423772753636e-08, "loss": 0.0749, "step": 2600 }, { "epoch": 7.8908197959954665, "grad_norm": 0.4923211634159088, "learning_rate": 3.9330769695167245e-08, "loss": 0.0744, "step": 2610 }, { "epoch": 7.921042689837552, "grad_norm": 0.5318350791931152, "learning_rate": 1.748161550069183e-08, "loss": 0.0731, "step": 2620 }, { "epoch": 7.951265583679637, "grad_norm": 0.756984293460846, "learning_rate": 4.3705948961692e-09, "loss": 0.0716, "step": 2630 }, { "epoch": 7.981488477521722, "grad_norm": 0.797049880027771, "learning_rate": 0.0, "loss": 0.0746, "step": 2640 } ], "logging_steps": 10, "max_steps": 2640, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.018451794664358e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }