{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 96980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 4.406856536865234, "learning_rate": 4.750206228088266e-05, "loss": 3.599, "step": 4849 }, { "epoch": 1.0, "eval_loss": 2.830796480178833, "eval_runtime": 63.7615, "eval_samples_per_second": 608.329, "eval_steps_per_second": 19.024, "step": 4849 }, { "epoch": 2.0, "grad_norm": 3.6979801654815674, "learning_rate": 4.5002577851103326e-05, "loss": 2.6769, "step": 9698 }, { "epoch": 2.0, "eval_loss": 2.430718183517456, "eval_runtime": 62.9007, "eval_samples_per_second": 616.654, "eval_steps_per_second": 19.284, "step": 9698 }, { "epoch": 3.0, "grad_norm": 3.8321845531463623, "learning_rate": 4.2503608991544655e-05, "loss": 2.3727, "step": 14547 }, { "epoch": 3.0, "eval_loss": 2.206211566925049, "eval_runtime": 62.7764, "eval_samples_per_second": 617.876, "eval_steps_per_second": 19.323, "step": 14547 }, { "epoch": 4.0, "grad_norm": 3.3764877319335938, "learning_rate": 4.000464013198598e-05, "loss": 2.1798, "step": 19396 }, { "epoch": 4.0, "eval_loss": 2.0566577911376953, "eval_runtime": 62.7, "eval_samples_per_second": 618.629, "eval_steps_per_second": 19.346, "step": 19396 }, { "epoch": 5.0, "grad_norm": 3.017418146133423, "learning_rate": 3.750567127242731e-05, "loss": 2.041, "step": 24245 }, { "epoch": 5.0, "eval_loss": 1.943306803703308, "eval_runtime": 62.7554, "eval_samples_per_second": 618.082, "eval_steps_per_second": 19.329, "step": 24245 }, { "epoch": 6.0, "grad_norm": 3.028724431991577, "learning_rate": 3.500567127242731e-05, "loss": 1.9417, "step": 29094 }, { "epoch": 6.0, "eval_loss": 1.8640257120132446, "eval_runtime": 63.2843, "eval_samples_per_second": 612.916, "eval_steps_per_second": 19.167, "step": 29094 }, { "epoch": 7.0, "grad_norm": 2.9556593894958496, "learning_rate": 3.2506702412868636e-05, "loss": 1.8625, "step": 33943 }, { "epoch": 7.0, "eval_loss": 1.7964398860931396, "eval_runtime": 63.0672, "eval_samples_per_second": 615.027, "eval_steps_per_second": 19.233, "step": 33943 }, { "epoch": 8.0, "grad_norm": 2.954166889190674, "learning_rate": 3.000670241286863e-05, "loss": 1.797, "step": 38792 }, { "epoch": 8.0, "eval_loss": 1.7490918636322021, "eval_runtime": 63.1115, "eval_samples_per_second": 614.595, "eval_steps_per_second": 19.22, "step": 38792 }, { "epoch": 9.0, "grad_norm": 2.954655885696411, "learning_rate": 2.7508249123530627e-05, "loss": 1.7491, "step": 43641 }, { "epoch": 9.0, "eval_loss": 1.7073553800582886, "eval_runtime": 63.2567, "eval_samples_per_second": 613.184, "eval_steps_per_second": 19.176, "step": 43641 }, { "epoch": 10.0, "grad_norm": 3.0339479446411133, "learning_rate": 2.5009280263971953e-05, "loss": 1.7049, "step": 48490 }, { "epoch": 10.0, "eval_loss": 1.6738481521606445, "eval_runtime": 63.2477, "eval_samples_per_second": 613.271, "eval_steps_per_second": 19.179, "step": 48490 }, { "epoch": 11.0, "grad_norm": 3.4147965908050537, "learning_rate": 2.2509795834192618e-05, "loss": 1.6655, "step": 53339 }, { "epoch": 11.0, "eval_loss": 1.6354576349258423, "eval_runtime": 63.6046, "eval_samples_per_second": 609.83, "eval_steps_per_second": 19.071, "step": 53339 }, { "epoch": 12.0, "grad_norm": 3.4822511672973633, "learning_rate": 2.0010826974633944e-05, "loss": 1.6295, "step": 58188 }, { "epoch": 12.0, "eval_loss": 1.6142407655715942, "eval_runtime": 63.1801, "eval_samples_per_second": 613.927, "eval_steps_per_second": 19.199, "step": 58188 }, { "epoch": 13.0, "grad_norm": 3.052323341369629, "learning_rate": 1.751134254485461e-05, "loss": 1.6022, "step": 63037 }, { "epoch": 13.0, "eval_loss": 1.595221757888794, "eval_runtime": 63.3841, "eval_samples_per_second": 611.951, "eval_steps_per_second": 19.137, "step": 63037 }, { "epoch": 14.0, "grad_norm": 3.0843849182128906, "learning_rate": 1.5012373685295938e-05, "loss": 1.5769, "step": 67886 }, { "epoch": 14.0, "eval_loss": 1.5756992101669312, "eval_runtime": 61.4761, "eval_samples_per_second": 630.944, "eval_steps_per_second": 19.731, "step": 67886 }, { "epoch": 15.0, "grad_norm": 3.4379537105560303, "learning_rate": 1.2513404825737265e-05, "loss": 1.5571, "step": 72735 }, { "epoch": 15.0, "eval_loss": 1.554135799407959, "eval_runtime": 62.2027, "eval_samples_per_second": 623.574, "eval_steps_per_second": 19.501, "step": 72735 }, { "epoch": 16.0, "grad_norm": 3.0627856254577637, "learning_rate": 1.0014435966178593e-05, "loss": 1.5343, "step": 77584 }, { "epoch": 16.0, "eval_loss": 1.541438102722168, "eval_runtime": 61.9741, "eval_samples_per_second": 625.874, "eval_steps_per_second": 19.573, "step": 77584 }, { "epoch": 17.0, "grad_norm": 3.1744439601898193, "learning_rate": 7.5154671066199224e-06, "loss": 1.5188, "step": 82433 }, { "epoch": 17.0, "eval_loss": 1.5268478393554688, "eval_runtime": 61.9172, "eval_samples_per_second": 626.449, "eval_steps_per_second": 19.591, "step": 82433 }, { "epoch": 18.0, "grad_norm": 3.5251266956329346, "learning_rate": 5.015982676840586e-06, "loss": 1.5037, "step": 87282 }, { "epoch": 18.0, "eval_loss": 1.5217667818069458, "eval_runtime": 61.3504, "eval_samples_per_second": 632.237, "eval_steps_per_second": 19.772, "step": 87282 }, { "epoch": 19.0, "grad_norm": 3.248511552810669, "learning_rate": 2.517013817281914e-06, "loss": 1.4927, "step": 92131 }, { "epoch": 19.0, "eval_loss": 1.511409044265747, "eval_runtime": 62.5116, "eval_samples_per_second": 620.493, "eval_steps_per_second": 19.404, "step": 92131 }, { "epoch": 20.0, "grad_norm": 2.856531858444214, "learning_rate": 1.7529387502577853e-08, "loss": 1.4862, "step": 96980 }, { "epoch": 20.0, "eval_loss": 1.5111480951309204, "eval_runtime": 61.8388, "eval_samples_per_second": 627.244, "eval_steps_per_second": 19.616, "step": 96980 } ], "logging_steps": 500, "max_steps": 96980, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0420834325692416e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }