{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 29370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17024174327545114, "grad_norm": 0.6650531888008118, "learning_rate": 1.96595165134491e-05, "loss": 0.2305, "step": 500 }, { "epoch": 0.34048348655090227, "grad_norm": 2.3951523303985596, "learning_rate": 1.9319033026898198e-05, "loss": 0.1633, "step": 1000 }, { "epoch": 0.5107252298263534, "grad_norm": 0.9463224411010742, "learning_rate": 1.8978549540347296e-05, "loss": 0.1548, "step": 1500 }, { "epoch": 0.6809669731018045, "grad_norm": 1.2913334369659424, "learning_rate": 1.8638066053796395e-05, "loss": 0.1487, "step": 2000 }, { "epoch": 0.8512087163772557, "grad_norm": 1.0490585565567017, "learning_rate": 1.829758256724549e-05, "loss": 0.1435, "step": 2500 }, { "epoch": 1.0214504596527068, "grad_norm": 1.5652408599853516, "learning_rate": 1.7957099080694588e-05, "loss": 0.1397, "step": 3000 }, { "epoch": 1.191692202928158, "grad_norm": 1.1205641031265259, "learning_rate": 1.7616615594143686e-05, "loss": 0.1283, "step": 3500 }, { "epoch": 1.361933946203609, "grad_norm": 1.0744216442108154, "learning_rate": 1.727613210759278e-05, "loss": 0.1292, "step": 4000 }, { "epoch": 1.5321756894790601, "grad_norm": 1.089113712310791, "learning_rate": 1.693564862104188e-05, "loss": 0.1273, "step": 4500 }, { "epoch": 1.7024174327545114, "grad_norm": 2.334705114364624, "learning_rate": 1.6595165134490977e-05, "loss": 0.1275, "step": 5000 }, { "epoch": 1.8726591760299627, "grad_norm": 1.1323754787445068, "learning_rate": 1.6254681647940076e-05, "loss": 0.1251, "step": 5500 }, { "epoch": 2.0429009193054135, "grad_norm": 0.8757261633872986, "learning_rate": 1.5914198161389174e-05, "loss": 0.1213, "step": 6000 }, { "epoch": 2.213142662580865, "grad_norm": 1.1232839822769165, "learning_rate": 1.5573714674838272e-05, "loss": 0.1104, "step": 6500 }, { "epoch": 2.383384405856316, "grad_norm": 0.8715490698814392, "learning_rate": 1.5233231188287369e-05, "loss": 0.1099, "step": 7000 }, { "epoch": 2.553626149131767, "grad_norm": 1.2656769752502441, "learning_rate": 1.4892747701736467e-05, "loss": 0.1102, "step": 7500 }, { "epoch": 2.723867892407218, "grad_norm": 1.1669204235076904, "learning_rate": 1.4552264215185565e-05, "loss": 0.1101, "step": 8000 }, { "epoch": 2.8941096356826694, "grad_norm": 1.0073705911636353, "learning_rate": 1.4211780728634664e-05, "loss": 0.1085, "step": 8500 }, { "epoch": 3.0643513789581207, "grad_norm": 1.1393821239471436, "learning_rate": 1.3871297242083762e-05, "loss": 0.1027, "step": 9000 }, { "epoch": 3.2345931222335715, "grad_norm": 1.4679887294769287, "learning_rate": 1.3530813755532857e-05, "loss": 0.0926, "step": 9500 }, { "epoch": 3.404834865509023, "grad_norm": 0.8374710083007812, "learning_rate": 1.3190330268981955e-05, "loss": 0.0925, "step": 10000 }, { "epoch": 3.575076608784474, "grad_norm": 1.2514032125473022, "learning_rate": 1.2849846782431053e-05, "loss": 0.0927, "step": 10500 }, { "epoch": 3.7453183520599254, "grad_norm": 1.5251351594924927, "learning_rate": 1.250936329588015e-05, "loss": 0.0929, "step": 11000 }, { "epoch": 3.915560095335376, "grad_norm": 1.0668872594833374, "learning_rate": 1.2168879809329248e-05, "loss": 0.0923, "step": 11500 }, { "epoch": 4.085801838610827, "grad_norm": 1.0528796911239624, "learning_rate": 1.1828396322778346e-05, "loss": 0.0848, "step": 12000 }, { "epoch": 4.256043581886279, "grad_norm": 1.316041111946106, "learning_rate": 1.1487912836227445e-05, "loss": 0.0767, "step": 12500 }, { "epoch": 4.42628532516173, "grad_norm": 1.6180927753448486, "learning_rate": 1.1147429349676541e-05, "loss": 0.077, "step": 13000 }, { "epoch": 4.596527068437181, "grad_norm": 1.2156362533569336, "learning_rate": 1.080694586312564e-05, "loss": 0.0773, "step": 13500 }, { "epoch": 4.766768811712632, "grad_norm": 1.621887445449829, "learning_rate": 1.0466462376574738e-05, "loss": 0.0773, "step": 14000 }, { "epoch": 4.937010554988083, "grad_norm": 1.5306437015533447, "learning_rate": 1.0125978890023836e-05, "loss": 0.0774, "step": 14500 }, { "epoch": 5.107252298263534, "grad_norm": 22.37914276123047, "learning_rate": 9.785495403472932e-06, "loss": 0.0678, "step": 15000 }, { "epoch": 5.2774940415389855, "grad_norm": 1.3330860137939453, "learning_rate": 9.44501191692203e-06, "loss": 0.0634, "step": 15500 }, { "epoch": 5.447735784814436, "grad_norm": 1.9692567586898804, "learning_rate": 9.104528430371127e-06, "loss": 0.0634, "step": 16000 }, { "epoch": 5.617977528089888, "grad_norm": 1.3089221715927124, "learning_rate": 8.764044943820226e-06, "loss": 0.0635, "step": 16500 }, { "epoch": 5.788219271365339, "grad_norm": 1.5806821584701538, "learning_rate": 8.423561457269324e-06, "loss": 0.0637, "step": 17000 }, { "epoch": 5.95846101464079, "grad_norm": 1.579941987991333, "learning_rate": 8.08307797071842e-06, "loss": 0.0633, "step": 17500 }, { "epoch": 6.128702757916241, "grad_norm": 1.5726784467697144, "learning_rate": 7.742594484167519e-06, "loss": 0.054, "step": 18000 }, { "epoch": 6.298944501191692, "grad_norm": 1.140791654586792, "learning_rate": 7.402110997616616e-06, "loss": 0.052, "step": 18500 }, { "epoch": 6.469186244467143, "grad_norm": 1.6548409461975098, "learning_rate": 7.061627511065714e-06, "loss": 0.0516, "step": 19000 }, { "epoch": 6.639427987742595, "grad_norm": 1.3514069318771362, "learning_rate": 6.721144024514812e-06, "loss": 0.0522, "step": 19500 }, { "epoch": 6.809669731018046, "grad_norm": 1.5590009689331055, "learning_rate": 6.38066053796391e-06, "loss": 0.0518, "step": 20000 }, { "epoch": 6.9799114742934965, "grad_norm": 1.2986799478530884, "learning_rate": 6.0401770514130066e-06, "loss": 0.0524, "step": 20500 }, { "epoch": 7.150153217568948, "grad_norm": 1.5317639112472534, "learning_rate": 5.699693564862104e-06, "loss": 0.044, "step": 21000 }, { "epoch": 7.320394960844399, "grad_norm": 2.344708204269409, "learning_rate": 5.359210078311202e-06, "loss": 0.0415, "step": 21500 }, { "epoch": 7.49063670411985, "grad_norm": 3.3057548999786377, "learning_rate": 5.0187265917603005e-06, "loss": 0.0418, "step": 22000 }, { "epoch": 7.6608784473953015, "grad_norm": 1.3382242918014526, "learning_rate": 4.678243105209398e-06, "loss": 0.0419, "step": 22500 }, { "epoch": 7.831120190670752, "grad_norm": 1.7018738985061646, "learning_rate": 4.337759618658495e-06, "loss": 0.0421, "step": 23000 }, { "epoch": 8.001361933946203, "grad_norm": 0.9316732883453369, "learning_rate": 3.997276132107593e-06, "loss": 0.0414, "step": 23500 }, { "epoch": 8.171603677221654, "grad_norm": 1.4249956607818604, "learning_rate": 3.656792645556691e-06, "loss": 0.0346, "step": 24000 }, { "epoch": 8.341845420497107, "grad_norm": 1.263279914855957, "learning_rate": 3.3163091590057884e-06, "loss": 0.0345, "step": 24500 }, { "epoch": 8.512087163772557, "grad_norm": 2.6162939071655273, "learning_rate": 2.9758256724548862e-06, "loss": 0.0342, "step": 25000 }, { "epoch": 8.682328907048008, "grad_norm": 1.2574002742767334, "learning_rate": 2.635342185903984e-06, "loss": 0.0345, "step": 25500 }, { "epoch": 8.85257065032346, "grad_norm": 5.4230732917785645, "learning_rate": 2.2948586993530815e-06, "loss": 0.0344, "step": 26000 }, { "epoch": 9.02281239359891, "grad_norm": 0.885810136795044, "learning_rate": 1.9543752128021793e-06, "loss": 0.0333, "step": 26500 }, { "epoch": 9.19305413687436, "grad_norm": 1.7516717910766602, "learning_rate": 1.6138917262512767e-06, "loss": 0.0291, "step": 27000 }, { "epoch": 9.363295880149813, "grad_norm": 1.1372159719467163, "learning_rate": 1.2734082397003748e-06, "loss": 0.0293, "step": 27500 }, { "epoch": 9.533537623425264, "grad_norm": 0.9269993305206299, "learning_rate": 9.329247531494723e-07, "loss": 0.0294, "step": 28000 }, { "epoch": 9.703779366700715, "grad_norm": 1.229074239730835, "learning_rate": 5.9244126659857e-07, "loss": 0.0291, "step": 28500 }, { "epoch": 9.874021109976166, "grad_norm": 2.4099299907684326, "learning_rate": 2.519577800476677e-07, "loss": 0.0289, "step": 29000 }, { "epoch": 10.0, "step": 29370, "total_flos": 8.758967154215731e+17, "train_loss": 0.07915824020562384, "train_runtime": 29556.1772, "train_samples_per_second": 31.797, "train_steps_per_second": 0.994 } ], "logging_steps": 500, "max_steps": 29370, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.758967154215731e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }