{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 169, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029585798816568046, "grad_norm": 2.3596792221069336, "learning_rate": 4.705882352941177e-06, "loss": 1.1077, "mean_token_accuracy": 0.7105089992284774, "num_tokens": 2621440.0, "step": 5 }, { "epoch": 0.05917159763313609, "grad_norm": 1.7160693407058716, "learning_rate": 1.0588235294117648e-05, "loss": 1.0353, "mean_token_accuracy": 0.7203521847724914, "num_tokens": 5242880.0, "step": 10 }, { "epoch": 0.08875739644970414, "grad_norm": 0.8708027601242065, "learning_rate": 1.647058823529412e-05, "loss": 0.9553, "mean_token_accuracy": 0.7316273808479309, "num_tokens": 7861273.0, "step": 15 }, { "epoch": 0.11834319526627218, "grad_norm": 0.6267297267913818, "learning_rate": 1.999145758387301e-05, "loss": 0.8899, "mean_token_accuracy": 0.743945425748825, "num_tokens": 10474605.0, "step": 20 }, { "epoch": 0.14792899408284024, "grad_norm": 0.4869539439678192, "learning_rate": 1.9895522933272028e-05, "loss": 0.8455, "mean_token_accuracy": 0.7542784661054611, "num_tokens": 13096045.0, "step": 25 }, { "epoch": 0.17751479289940827, "grad_norm": 0.4492546319961548, "learning_rate": 1.9694002659393306e-05, "loss": 0.8389, "mean_token_accuracy": 0.7545322090387344, "num_tokens": 15717485.0, "step": 30 }, { "epoch": 0.20710059171597633, "grad_norm": 0.4301901161670685, "learning_rate": 1.9389046991574298e-05, "loss": 0.8268, "mean_token_accuracy": 0.7568192929029465, "num_tokens": 18338925.0, "step": 35 }, { "epoch": 0.23668639053254437, "grad_norm": 0.37857791781425476, "learning_rate": 1.898390981891979e-05, "loss": 0.7956, "mean_token_accuracy": 0.7649114817380905, "num_tokens": 20960365.0, "step": 40 }, { "epoch": 0.26627218934911245, "grad_norm": 0.3774871230125427, "learning_rate": 1.8482913971175737e-05, "loss": 0.8079, "mean_token_accuracy": 0.7604979366064072, "num_tokens": 23581805.0, "step": 45 }, { "epoch": 0.2958579881656805, "grad_norm": 0.3935665488243103, "learning_rate": 1.789140509396394e-05, "loss": 0.7961, "mean_token_accuracy": 0.7637095510959625, "num_tokens": 26203245.0, "step": 50 }, { "epoch": 0.3254437869822485, "grad_norm": 0.3560955226421356, "learning_rate": 1.7215694610530624e-05, "loss": 0.7946, "mean_token_accuracy": 0.7636954367160798, "num_tokens": 28824685.0, "step": 55 }, { "epoch": 0.35502958579881655, "grad_norm": 0.37536928057670593, "learning_rate": 1.646299237860941e-05, "loss": 0.7938, "mean_token_accuracy": 0.7634146034717559, "num_tokens": 31443682.0, "step": 60 }, { "epoch": 0.38461538461538464, "grad_norm": 0.3460127115249634, "learning_rate": 1.5641329760952514e-05, "loss": 0.7639, "mean_token_accuracy": 0.7715859562158585, "num_tokens": 34061327.0, "step": 65 }, { "epoch": 0.41420118343195267, "grad_norm": 0.37248924374580383, "learning_rate": 1.4759473930370738e-05, "loss": 0.7771, "mean_token_accuracy": 0.7680761635303497, "num_tokens": 36682767.0, "step": 70 }, { "epoch": 0.4437869822485207, "grad_norm": 0.34658947587013245, "learning_rate": 1.3826834323650899e-05, "loss": 0.8028, "mean_token_accuracy": 0.7603934347629547, "num_tokens": 39300751.0, "step": 75 }, { "epoch": 0.47337278106508873, "grad_norm": 0.3706357479095459, "learning_rate": 1.2853362242491054e-05, "loss": 0.7738, "mean_token_accuracy": 0.7687152832746506, "num_tokens": 41918913.0, "step": 80 }, { "epoch": 0.5029585798816568, "grad_norm": 0.3420180380344391, "learning_rate": 1.1849444672715587e-05, "loss": 0.779, "mean_token_accuracy": 0.7665890276432037, "num_tokens": 44540169.0, "step": 85 }, { "epoch": 0.5325443786982249, "grad_norm": 0.3875311315059662, "learning_rate": 1.0825793454723325e-05, "loss": 0.7683, "mean_token_accuracy": 0.7695774495601654, "num_tokens": 47160969.0, "step": 90 }, { "epoch": 0.5621301775147929, "grad_norm": 0.35641783475875854, "learning_rate": 9.79333098772446e-06, "loss": 0.7692, "mean_token_accuracy": 0.7692601472139359, "num_tokens": 49782409.0, "step": 95 }, { "epoch": 0.591715976331361, "grad_norm": 0.3525794446468353, "learning_rate": 8.763073687306523e-06, "loss": 0.7853, "step": 100 }, { "epoch": 0.591715976331361, "eval_loss": 0.7879331111907959, "eval_mean_token_accuracy": 0.77275630235672, "eval_num_tokens": 52403849.0, "eval_runtime": 1.3797, "eval_samples_per_second": 93.496, "eval_steps_per_second": 3.624, "step": 100 }, { "epoch": 0.621301775147929, "grad_norm": 0.31415921449661255, "learning_rate": 7.746014439841941e-06, "loss": 0.7483, "mean_token_accuracy": 0.7696478188037872, "num_tokens": 55025289.0, "step": 105 }, { "epoch": 0.650887573964497, "grad_norm": 0.35946062207221985, "learning_rate": 6.7530053079531664e-06, "loss": 0.751, "mean_token_accuracy": 0.773971700668335, "num_tokens": 57641987.0, "step": 110 }, { "epoch": 0.6804733727810651, "grad_norm": 0.3247712552547455, "learning_rate": 5.794641738572925e-06, "loss": 0.766, "mean_token_accuracy": 0.7699923694133759, "num_tokens": 60263427.0, "step": 115 }, { "epoch": 0.7100591715976331, "grad_norm": 0.32254937291145325, "learning_rate": 4.881149509103993e-06, "loss": 0.7655, "mean_token_accuracy": 0.7701400071382523, "num_tokens": 62882216.0, "step": 120 }, { "epoch": 0.7396449704142012, "grad_norm": 0.3108745515346527, "learning_rate": 4.0222756179675915e-06, "loss": 0.7772, "mean_token_accuracy": 0.7666765838861466, "num_tokens": 65503656.0, "step": 125 }, { "epoch": 0.7692307692307693, "grad_norm": 0.3257134258747101, "learning_rate": 3.2271842837425917e-06, "loss": 0.7507, "mean_token_accuracy": 0.7745073974132538, "num_tokens": 68125096.0, "step": 130 }, { "epoch": 0.7988165680473372, "grad_norm": 0.29293152689933777, "learning_rate": 2.504359162588741e-06, "loss": 0.7436, "mean_token_accuracy": 0.7756482750177384, "num_tokens": 70746536.0, "step": 135 }, { "epoch": 0.8284023668639053, "grad_norm": 0.3014591932296753, "learning_rate": 1.861512827298051e-06, "loss": 0.7438, "mean_token_accuracy": 0.7760634958744049, "num_tokens": 73365315.0, "step": 140 }, { "epoch": 0.8579881656804734, "grad_norm": 0.2900739312171936, "learning_rate": 1.305504473836331e-06, "loss": 0.7585, "mean_token_accuracy": 0.7716891765594482, "num_tokens": 75986755.0, "step": 145 }, { "epoch": 0.8875739644970414, "grad_norm": 0.2970161437988281, "learning_rate": 8.42266733449425e-07, "loss": 0.7436, "mean_token_accuracy": 0.7757604539394378, "num_tokens": 78608195.0, "step": 150 }, { "epoch": 0.9171597633136095, "grad_norm": 0.29888418316841125, "learning_rate": 4.7674237125185597e-07, "loss": 0.7513, "mean_token_accuracy": 0.7735760033130645, "num_tokens": 81229635.0, "step": 155 }, { "epoch": 0.9467455621301775, "grad_norm": 0.28909653425216675, "learning_rate": 2.1283154672645522e-07, "loss": 0.7583, "mean_token_accuracy": 0.7715224415063858, "num_tokens": 83851075.0, "step": 160 }, { "epoch": 0.9763313609467456, "grad_norm": 0.2939208447933197, "learning_rate": 5.3350198867574424e-08, "loss": 0.7567, "mean_token_accuracy": 0.7722329139709473, "num_tokens": 86472515.0, "step": 165 }, { "epoch": 1.0, "mean_token_accuracy": 0.7728699259459972, "num_tokens": 88569667.0, "step": 169, "total_flos": 6.966137809639834e+17, "train_loss": 0.80340930978222, "train_runtime": 717.7254, "train_samples_per_second": 30.133, "train_steps_per_second": 0.235 } ], "logging_steps": 5, "max_steps": 169, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.966137809639834e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }