{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9011406844106464, "eval_steps": 50, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 0.6337016224861145, "learning_rate": 7.5e-07, "loss": 1.326, "step": 50 }, { "epoch": 0.1, "eval_loss": 1.3350285291671753, "eval_runtime": 394.6138, "eval_samples_per_second": 5.459, "eval_steps_per_second": 0.684, "step": 50 }, { "epoch": 0.19, "grad_norm": 0.6025093197822571, "learning_rate": 1.5e-06, "loss": 1.3065, "step": 100 }, { "epoch": 0.19, "eval_loss": 1.2850024700164795, "eval_runtime": 394.5243, "eval_samples_per_second": 5.46, "eval_steps_per_second": 0.684, "step": 100 }, { "epoch": 0.29, "grad_norm": 0.5091537237167358, "learning_rate": 2.25e-06, "loss": 1.2129, "step": 150 }, { "epoch": 0.29, "eval_loss": 1.1226935386657715, "eval_runtime": 394.4733, "eval_samples_per_second": 5.46, "eval_steps_per_second": 0.684, "step": 150 }, { "epoch": 0.38, "grad_norm": 0.6157342195510864, "learning_rate": 3e-06, "loss": 0.9817, "step": 200 }, { "epoch": 0.38, "eval_loss": 0.8010650873184204, "eval_runtime": 394.6158, "eval_samples_per_second": 5.458, "eval_steps_per_second": 0.684, "step": 200 }, { "epoch": 0.48, "grad_norm": 0.22548706829547882, "learning_rate": 2.971177920604846e-06, "loss": 0.6524, "step": 250 }, { "epoch": 0.48, "eval_loss": 0.5880268812179565, "eval_runtime": 394.5715, "eval_samples_per_second": 5.459, "eval_steps_per_second": 0.684, "step": 250 }, { "epoch": 0.57, "grad_norm": 0.19589273631572723, "learning_rate": 2.88581929876693e-06, "loss": 0.5671, "step": 300 }, { "epoch": 0.57, "eval_loss": 0.563366174697876, "eval_runtime": 394.434, "eval_samples_per_second": 5.461, "eval_steps_per_second": 0.685, "step": 300 }, { "epoch": 0.67, "grad_norm": 0.20705421268939972, "learning_rate": 2.747204418453818e-06, "loss": 0.5519, "step": 350 }, { "epoch": 0.67, "eval_loss": 0.547275722026825, "eval_runtime": 394.6461, "eval_samples_per_second": 5.458, "eval_steps_per_second": 0.684, "step": 350 }, { "epoch": 0.76, "grad_norm": 0.1976100355386734, "learning_rate": 2.5606601717798212e-06, "loss": 0.5389, "step": 400 }, { "epoch": 0.76, "eval_loss": 0.5334580540657043, "eval_runtime": 394.6493, "eval_samples_per_second": 5.458, "eval_steps_per_second": 0.684, "step": 400 }, { "epoch": 0.86, "grad_norm": 0.18715547025203705, "learning_rate": 2.3333553495294033e-06, "loss": 0.5297, "step": 450 }, { "epoch": 0.86, "eval_loss": 0.5227681994438171, "eval_runtime": 394.7112, "eval_samples_per_second": 5.457, "eval_steps_per_second": 0.684, "step": 450 }, { "epoch": 0.95, "grad_norm": 0.1926855742931366, "learning_rate": 2.074025148547635e-06, "loss": 0.5114, "step": 500 }, { "epoch": 0.95, "eval_loss": 0.5151167511940002, "eval_runtime": 390.7412, "eval_samples_per_second": 5.513, "eval_steps_per_second": 0.691, "step": 500 }, { "epoch": 1.05, "grad_norm": 0.20605137944221497, "learning_rate": 1.7926354830241926e-06, "loss": 0.5109, "step": 550 }, { "epoch": 1.05, "eval_loss": 0.5088621973991394, "eval_runtime": 392.8267, "eval_samples_per_second": 5.483, "eval_steps_per_second": 0.687, "step": 550 }, { "epoch": 1.14, "grad_norm": 0.23452071845531464, "learning_rate": 1.5e-06, "loss": 0.5079, "step": 600 }, { "epoch": 1.14, "eval_loss": 0.5038764476776123, "eval_runtime": 391.3908, "eval_samples_per_second": 5.503, "eval_steps_per_second": 0.69, "step": 600 }, { "epoch": 1.24, "grad_norm": 0.22774113714694977, "learning_rate": 1.2073645169758077e-06, "loss": 0.4999, "step": 650 }, { "epoch": 1.24, "eval_loss": 0.5000301599502563, "eval_runtime": 392.1009, "eval_samples_per_second": 5.493, "eval_steps_per_second": 0.689, "step": 650 }, { "epoch": 1.33, "grad_norm": 0.21539181470870972, "learning_rate": 9.259748514523654e-07, "loss": 0.4908, "step": 700 }, { "epoch": 1.33, "eval_loss": 0.4972001016139984, "eval_runtime": 392.3879, "eval_samples_per_second": 5.489, "eval_steps_per_second": 0.688, "step": 700 }, { "epoch": 1.43, "grad_norm": 0.20129938423633575, "learning_rate": 6.666446504705971e-07, "loss": 0.4888, "step": 750 }, { "epoch": 1.43, "eval_loss": 0.4950333833694458, "eval_runtime": 392.4542, "eval_samples_per_second": 5.489, "eval_steps_per_second": 0.688, "step": 750 }, { "epoch": 1.52, "grad_norm": 0.20948025584220886, "learning_rate": 4.3933982822017883e-07, "loss": 0.4915, "step": 800 }, { "epoch": 1.52, "eval_loss": 0.49351951479911804, "eval_runtime": 392.3544, "eval_samples_per_second": 5.49, "eval_steps_per_second": 0.688, "step": 800 }, { "epoch": 1.62, "grad_norm": 0.21380692720413208, "learning_rate": 2.52795581546182e-07, "loss": 0.4935, "step": 850 }, { "epoch": 1.62, "eval_loss": 0.4924446642398834, "eval_runtime": 392.3544, "eval_samples_per_second": 5.49, "eval_steps_per_second": 0.688, "step": 850 }, { "epoch": 1.71, "grad_norm": 0.2027772217988968, "learning_rate": 1.141807012330699e-07, "loss": 0.4869, "step": 900 }, { "epoch": 1.71, "eval_loss": 0.4918237328529358, "eval_runtime": 392.4738, "eval_samples_per_second": 5.488, "eval_steps_per_second": 0.688, "step": 900 }, { "epoch": 1.81, "grad_norm": 0.22568389773368835, "learning_rate": 2.8822079395154353e-08, "loss": 0.4887, "step": 950 }, { "epoch": 1.81, "eval_loss": 0.4916115403175354, "eval_runtime": 392.4881, "eval_samples_per_second": 5.488, "eval_steps_per_second": 0.688, "step": 950 }, { "epoch": 1.9, "grad_norm": 0.2142002135515213, "learning_rate": 0.0, "loss": 0.4898, "step": 1000 }, { "epoch": 1.9, "eval_loss": 0.4915645122528076, "eval_runtime": 392.486, "eval_samples_per_second": 5.488, "eval_steps_per_second": 0.688, "step": 1000 } ], "logging_steps": 50, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "total_flos": 2.664713513598517e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }