{ "best_metric": 0.7980687022209167, "best_model_checkpoint": "miner_id_24/checkpoint-1050", "epoch": 0.013958079235363127, "eval_steps": 150, "global_step": 1050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.329340879558393e-05, "eval_loss": 2.00691819190979, "eval_runtime": 2934.4159, "eval_samples_per_second": 43.176, "eval_steps_per_second": 10.794, "step": 1 }, { "epoch": 0.0001329340879558393, "grad_norm": 1.5657358169555664, "learning_rate": 2e-05, "loss": 1.2697, "step": 10 }, { "epoch": 0.0002658681759116786, "grad_norm": 1.3231450319290161, "learning_rate": 4e-05, "loss": 1.2549, "step": 20 }, { "epoch": 0.00039880226386751786, "grad_norm": 2.6863651275634766, "learning_rate": 6e-05, "loss": 1.043, "step": 30 }, { "epoch": 0.0005317363518233572, "grad_norm": 2.719365358352661, "learning_rate": 8e-05, "loss": 0.9829, "step": 40 }, { "epoch": 0.0006646704397791965, "grad_norm": 6.010656356811523, "learning_rate": 0.0001, "loss": 0.9896, "step": 50 }, { "epoch": 0.0007976045277350357, "grad_norm": 1.3875675201416016, "learning_rate": 9.999875400032707e-05, "loss": 0.942, "step": 60 }, { "epoch": 0.0009305386156908751, "grad_norm": 1.3019894361495972, "learning_rate": 9.999501606340891e-05, "loss": 0.9229, "step": 70 }, { "epoch": 0.0010634727036467144, "grad_norm": 1.3613632917404175, "learning_rate": 9.998878637554424e-05, "loss": 0.8867, "step": 80 }, { "epoch": 0.0011964067916025536, "grad_norm": 1.7349094152450562, "learning_rate": 9.998006524722059e-05, "loss": 0.7913, "step": 90 }, { "epoch": 0.001329340879558393, "grad_norm": 4.72564172744751, "learning_rate": 9.996885311309891e-05, "loss": 0.8381, "step": 100 }, { "epoch": 0.0014622749675142323, "grad_norm": 1.0643872022628784, "learning_rate": 9.995515053199182e-05, "loss": 0.9664, "step": 110 }, { "epoch": 0.0015952090554700715, "grad_norm": 1.1394284963607788, "learning_rate": 9.993895818683579e-05, "loss": 0.867, "step": 120 }, { "epoch": 0.0017281431434259108, "grad_norm": 1.412411093711853, "learning_rate": 9.992027688465707e-05, "loss": 0.8512, "step": 130 }, { "epoch": 0.0018610772313817502, "grad_norm": 2.1755828857421875, "learning_rate": 9.989910755653154e-05, "loss": 0.8609, "step": 140 }, { "epoch": 0.0019940113193375893, "grad_norm": 3.5878186225891113, "learning_rate": 9.987545125753819e-05, "loss": 0.8454, "step": 150 }, { "epoch": 0.0019940113193375893, "eval_loss": 0.9133399128913879, "eval_runtime": 2936.8352, "eval_samples_per_second": 43.14, "eval_steps_per_second": 10.785, "step": 150 }, { "epoch": 0.002126945407293429, "grad_norm": 0.8727161884307861, "learning_rate": 9.98493091667067e-05, "loss": 0.9236, "step": 160 }, { "epoch": 0.002259879495249268, "grad_norm": 1.0465607643127441, "learning_rate": 9.982068258695853e-05, "loss": 0.9194, "step": 170 }, { "epoch": 0.002392813583205107, "grad_norm": 1.2130919694900513, "learning_rate": 9.978957294504203e-05, "loss": 0.8099, "step": 180 }, { "epoch": 0.0025257476711609468, "grad_norm": 1.7668389081954956, "learning_rate": 9.975598179146133e-05, "loss": 0.8385, "step": 190 }, { "epoch": 0.002658681759116786, "grad_norm": 3.149250030517578, "learning_rate": 9.97199108003991e-05, "loss": 0.7771, "step": 200 }, { "epoch": 0.002791615847072625, "grad_norm": 0.8131629228591919, "learning_rate": 9.968136176963307e-05, "loss": 0.9286, "step": 210 }, { "epoch": 0.0029245499350284646, "grad_norm": 1.1880844831466675, "learning_rate": 9.964033662044643e-05, "loss": 0.9057, "step": 220 }, { "epoch": 0.0030574840229843038, "grad_norm": 1.2871088981628418, "learning_rate": 9.959683739753207e-05, "loss": 0.832, "step": 230 }, { "epoch": 0.003190418110940143, "grad_norm": 1.6375112533569336, "learning_rate": 9.955086626889068e-05, "loss": 0.8537, "step": 240 }, { "epoch": 0.0033233521988959825, "grad_norm": 2.9762630462646484, "learning_rate": 9.950242552572271e-05, "loss": 0.7039, "step": 250 }, { "epoch": 0.0034562862868518216, "grad_norm": 0.8247011303901672, "learning_rate": 9.945151758231421e-05, "loss": 0.8845, "step": 260 }, { "epoch": 0.003589220374807661, "grad_norm": 1.1122963428497314, "learning_rate": 9.939814497591636e-05, "loss": 0.8893, "step": 270 }, { "epoch": 0.0037221544627635004, "grad_norm": 1.3245172500610352, "learning_rate": 9.934231036661919e-05, "loss": 0.8365, "step": 280 }, { "epoch": 0.0038550885507193395, "grad_norm": 1.5120258331298828, "learning_rate": 9.928401653721891e-05, "loss": 0.8092, "step": 290 }, { "epoch": 0.003988022638675179, "grad_norm": 2.8242909908294678, "learning_rate": 9.922326639307917e-05, "loss": 0.7748, "step": 300 }, { "epoch": 0.003988022638675179, "eval_loss": 0.8625452518463135, "eval_runtime": 2939.9371, "eval_samples_per_second": 43.095, "eval_steps_per_second": 10.774, "step": 300 }, { "epoch": 0.004120956726631018, "grad_norm": 0.8277641534805298, "learning_rate": 9.91600629619864e-05, "loss": 0.9275, "step": 310 }, { "epoch": 0.004253890814586858, "grad_norm": 1.0220091342926025, "learning_rate": 9.909440939399876e-05, "loss": 0.9069, "step": 320 }, { "epoch": 0.004386824902542697, "grad_norm": 1.2098453044891357, "learning_rate": 9.902630896128923e-05, "loss": 0.7785, "step": 330 }, { "epoch": 0.004519758990498536, "grad_norm": 1.39773428440094, "learning_rate": 9.895576505798248e-05, "loss": 0.7963, "step": 340 }, { "epoch": 0.004652693078454375, "grad_norm": 2.3860318660736084, "learning_rate": 9.888278119998573e-05, "loss": 0.8, "step": 350 }, { "epoch": 0.004785627166410214, "grad_norm": 0.750569224357605, "learning_rate": 9.88073610248135e-05, "loss": 0.9546, "step": 360 }, { "epoch": 0.004918561254366054, "grad_norm": 1.0606836080551147, "learning_rate": 9.872950829140633e-05, "loss": 0.8189, "step": 370 }, { "epoch": 0.0050514953423218935, "grad_norm": 1.0504099130630493, "learning_rate": 9.864922687994347e-05, "loss": 0.8399, "step": 380 }, { "epoch": 0.005184429430277733, "grad_norm": 1.452014446258545, "learning_rate": 9.856652079164937e-05, "loss": 0.7719, "step": 390 }, { "epoch": 0.005317363518233572, "grad_norm": 3.2478177547454834, "learning_rate": 9.848139414859441e-05, "loss": 0.7396, "step": 400 }, { "epoch": 0.005450297606189411, "grad_norm": 0.8162371516227722, "learning_rate": 9.839385119348937e-05, "loss": 0.8558, "step": 410 }, { "epoch": 0.00558323169414525, "grad_norm": 1.0164737701416016, "learning_rate": 9.830389628947398e-05, "loss": 0.87, "step": 420 }, { "epoch": 0.00571616578210109, "grad_norm": 1.1020488739013672, "learning_rate": 9.82115339198995e-05, "loss": 0.8376, "step": 430 }, { "epoch": 0.005849099870056929, "grad_norm": 1.4051620960235596, "learning_rate": 9.811676868810517e-05, "loss": 0.7677, "step": 440 }, { "epoch": 0.005982033958012768, "grad_norm": 2.896073818206787, "learning_rate": 9.801960531718896e-05, "loss": 0.7716, "step": 450 }, { "epoch": 0.005982033958012768, "eval_loss": 0.8369041681289673, "eval_runtime": 2939.1318, "eval_samples_per_second": 43.107, "eval_steps_per_second": 10.777, "step": 450 }, { "epoch": 0.0061149680459686075, "grad_norm": 0.8108513951301575, "learning_rate": 9.792004864977198e-05, "loss": 0.9265, "step": 460 }, { "epoch": 0.006247902133924447, "grad_norm": 1.0227642059326172, "learning_rate": 9.781810364775722e-05, "loss": 0.8598, "step": 470 }, { "epoch": 0.006380836221880286, "grad_norm": 1.1522070169448853, "learning_rate": 9.771377539208228e-05, "loss": 0.7888, "step": 480 }, { "epoch": 0.006513770309836126, "grad_norm": 1.5358163118362427, "learning_rate": 9.760706908246603e-05, "loss": 0.8415, "step": 490 }, { "epoch": 0.006646704397791965, "grad_norm": 2.9296483993530273, "learning_rate": 9.749799003714954e-05, "loss": 0.7869, "step": 500 }, { "epoch": 0.006779638485747804, "grad_norm": 0.7771769165992737, "learning_rate": 9.738654369263103e-05, "loss": 0.9162, "step": 510 }, { "epoch": 0.006912572573703643, "grad_norm": 0.8883762955665588, "learning_rate": 9.727273560339483e-05, "loss": 0.8822, "step": 520 }, { "epoch": 0.007045506661659482, "grad_norm": 1.1782065629959106, "learning_rate": 9.715657144163463e-05, "loss": 0.8331, "step": 530 }, { "epoch": 0.007178440749615322, "grad_norm": 1.5381768941879272, "learning_rate": 9.703805699697072e-05, "loss": 0.7452, "step": 540 }, { "epoch": 0.0073113748375711616, "grad_norm": 3.0118932723999023, "learning_rate": 9.691719817616147e-05, "loss": 0.8766, "step": 550 }, { "epoch": 0.007444308925527001, "grad_norm": 0.883840024471283, "learning_rate": 9.679400100280896e-05, "loss": 0.9095, "step": 560 }, { "epoch": 0.00757724301348284, "grad_norm": 0.9513404369354248, "learning_rate": 9.666847161705867e-05, "loss": 0.9024, "step": 570 }, { "epoch": 0.007710177101438679, "grad_norm": 0.9779698252677917, "learning_rate": 9.654061627529354e-05, "loss": 0.8098, "step": 580 }, { "epoch": 0.007843111189394519, "grad_norm": 1.8038041591644287, "learning_rate": 9.641044134982215e-05, "loss": 0.8517, "step": 590 }, { "epoch": 0.007976045277350357, "grad_norm": 2.9668469429016113, "learning_rate": 9.627795332856107e-05, "loss": 0.7795, "step": 600 }, { "epoch": 0.007976045277350357, "eval_loss": 0.821550190448761, "eval_runtime": 2935.4759, "eval_samples_per_second": 43.16, "eval_steps_per_second": 10.79, "step": 600 }, { "epoch": 0.008108979365306197, "grad_norm": 0.8701363801956177, "learning_rate": 9.614315881471154e-05, "loss": 0.9088, "step": 610 }, { "epoch": 0.008241913453262036, "grad_norm": 1.020605206489563, "learning_rate": 9.600606452643037e-05, "loss": 0.8598, "step": 620 }, { "epoch": 0.008374847541217876, "grad_norm": 1.118003010749817, "learning_rate": 9.586667729649513e-05, "loss": 0.8552, "step": 630 }, { "epoch": 0.008507781629173716, "grad_norm": 1.3862948417663574, "learning_rate": 9.572500407196348e-05, "loss": 0.7882, "step": 640 }, { "epoch": 0.008640715717129554, "grad_norm": 2.998286485671997, "learning_rate": 9.55810519138271e-05, "loss": 0.7784, "step": 650 }, { "epoch": 0.008773649805085394, "grad_norm": 0.9497479796409607, "learning_rate": 9.543482799665969e-05, "loss": 0.9353, "step": 660 }, { "epoch": 0.008906583893041232, "grad_norm": 0.9987872242927551, "learning_rate": 9.528633960825933e-05, "loss": 0.8585, "step": 670 }, { "epoch": 0.009039517980997072, "grad_norm": 1.2126046419143677, "learning_rate": 9.513559414928538e-05, "loss": 0.828, "step": 680 }, { "epoch": 0.009172452068952912, "grad_norm": 1.1770802736282349, "learning_rate": 9.498259913288953e-05, "loss": 0.8091, "step": 690 }, { "epoch": 0.00930538615690875, "grad_norm": 2.9725301265716553, "learning_rate": 9.482736218434143e-05, "loss": 0.7926, "step": 700 }, { "epoch": 0.00943832024486459, "grad_norm": 0.9037257432937622, "learning_rate": 9.466989104064853e-05, "loss": 0.8929, "step": 710 }, { "epoch": 0.009571254332820429, "grad_norm": 0.9316276907920837, "learning_rate": 9.451019355017056e-05, "loss": 0.9077, "step": 720 }, { "epoch": 0.009704188420776269, "grad_norm": 1.080407977104187, "learning_rate": 9.43482776722284e-05, "loss": 0.8093, "step": 730 }, { "epoch": 0.009837122508732109, "grad_norm": 1.2614948749542236, "learning_rate": 9.418415147670725e-05, "loss": 0.7689, "step": 740 }, { "epoch": 0.009970056596687947, "grad_norm": 2.555124521255493, "learning_rate": 9.401782314365457e-05, "loss": 0.802, "step": 750 }, { "epoch": 0.009970056596687947, "eval_loss": 0.8119913935661316, "eval_runtime": 2937.8069, "eval_samples_per_second": 43.126, "eval_steps_per_second": 10.782, "step": 750 }, { "epoch": 0.010102990684643787, "grad_norm": 0.8965262770652771, "learning_rate": 9.38493009628723e-05, "loss": 0.8909, "step": 760 }, { "epoch": 0.010235924772599625, "grad_norm": 1.1142463684082031, "learning_rate": 9.36785933335037e-05, "loss": 0.8622, "step": 770 }, { "epoch": 0.010368858860555465, "grad_norm": 1.1537195444107056, "learning_rate": 9.350570876361482e-05, "loss": 0.7657, "step": 780 }, { "epoch": 0.010501792948511304, "grad_norm": 1.629679560661316, "learning_rate": 9.333065586977035e-05, "loss": 0.7819, "step": 790 }, { "epoch": 0.010634727036467144, "grad_norm": 2.6244359016418457, "learning_rate": 9.315344337660421e-05, "loss": 0.778, "step": 800 }, { "epoch": 0.010767661124422984, "grad_norm": 0.7998455166816711, "learning_rate": 9.297408011638477e-05, "loss": 0.8324, "step": 810 }, { "epoch": 0.010900595212378822, "grad_norm": 0.9035855531692505, "learning_rate": 9.279257502857455e-05, "loss": 0.8705, "step": 820 }, { "epoch": 0.011033529300334662, "grad_norm": 1.1769347190856934, "learning_rate": 9.260893715938477e-05, "loss": 0.7977, "step": 830 }, { "epoch": 0.0111664633882905, "grad_norm": 1.4826371669769287, "learning_rate": 9.24231756613244e-05, "loss": 0.7626, "step": 840 }, { "epoch": 0.01129939747624634, "grad_norm": 3.1065542697906494, "learning_rate": 9.22352997927441e-05, "loss": 0.7633, "step": 850 }, { "epoch": 0.01143233156420218, "grad_norm": 0.872209370136261, "learning_rate": 9.20453189173747e-05, "loss": 0.916, "step": 860 }, { "epoch": 0.011565265652158018, "grad_norm": 0.9683641791343689, "learning_rate": 9.185324250386054e-05, "loss": 0.8759, "step": 870 }, { "epoch": 0.011698199740113858, "grad_norm": 1.2745503187179565, "learning_rate": 9.165908012528755e-05, "loss": 0.8134, "step": 880 }, { "epoch": 0.011831133828069697, "grad_norm": 1.3109657764434814, "learning_rate": 9.146284145870614e-05, "loss": 0.7875, "step": 890 }, { "epoch": 0.011964067916025537, "grad_norm": 2.3323986530303955, "learning_rate": 9.126453628464888e-05, "loss": 0.7521, "step": 900 }, { "epoch": 0.011964067916025537, "eval_loss": 0.8035673499107361, "eval_runtime": 2937.1699, "eval_samples_per_second": 43.135, "eval_steps_per_second": 10.784, "step": 900 }, { "epoch": 0.012097002003981377, "grad_norm": 0.820439338684082, "learning_rate": 9.106417448664306e-05, "loss": 0.8856, "step": 910 }, { "epoch": 0.012229936091937215, "grad_norm": 1.0313884019851685, "learning_rate": 9.086176605071805e-05, "loss": 0.8814, "step": 920 }, { "epoch": 0.012362870179893055, "grad_norm": 1.2534911632537842, "learning_rate": 9.06573210649077e-05, "loss": 0.8069, "step": 930 }, { "epoch": 0.012495804267848893, "grad_norm": 1.3286960124969482, "learning_rate": 9.045084971874738e-05, "loss": 0.7651, "step": 940 }, { "epoch": 0.012628738355804733, "grad_norm": 3.241807460784912, "learning_rate": 9.024236230276629e-05, "loss": 0.8244, "step": 950 }, { "epoch": 0.012761672443760572, "grad_norm": 0.8718632459640503, "learning_rate": 9.003186920797452e-05, "loss": 0.8792, "step": 960 }, { "epoch": 0.012894606531716412, "grad_norm": 0.9083179235458374, "learning_rate": 8.981938092534517e-05, "loss": 0.8644, "step": 970 }, { "epoch": 0.013027540619672252, "grad_norm": 1.1338489055633545, "learning_rate": 8.960490804529144e-05, "loss": 0.8368, "step": 980 }, { "epoch": 0.01316047470762809, "grad_norm": 1.466839075088501, "learning_rate": 8.938846125713891e-05, "loss": 0.7183, "step": 990 }, { "epoch": 0.01329340879558393, "grad_norm": 2.3635900020599365, "learning_rate": 8.917005134859263e-05, "loss": 0.7475, "step": 1000 }, { "epoch": 0.013426342883539768, "grad_norm": 0.8753401041030884, "learning_rate": 8.894968920519959e-05, "loss": 0.9097, "step": 1010 }, { "epoch": 0.013559276971495608, "grad_norm": 1.0394352674484253, "learning_rate": 8.872738580980615e-05, "loss": 0.8533, "step": 1020 }, { "epoch": 0.013692211059451448, "grad_norm": 1.3622921705245972, "learning_rate": 8.850315224201063e-05, "loss": 0.8018, "step": 1030 }, { "epoch": 0.013825145147407287, "grad_norm": 1.4305646419525146, "learning_rate": 8.827699967761108e-05, "loss": 0.8168, "step": 1040 }, { "epoch": 0.013958079235363127, "grad_norm": 2.2436411380767822, "learning_rate": 8.80489393880484e-05, "loss": 0.7678, "step": 1050 }, { "epoch": 0.013958079235363127, "eval_loss": 0.7980687022209167, "eval_runtime": 2936.4148, "eval_samples_per_second": 43.146, "eval_steps_per_second": 10.787, "step": 1050 } ], "logging_steps": 10, "max_steps": 4500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.0206243314335744e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }