|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 169, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.029585798816568046, |
|
"grad_norm": 2.3596792221069336, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 1.1077, |
|
"mean_token_accuracy": 0.7105089992284774, |
|
"num_tokens": 2621440.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05917159763313609, |
|
"grad_norm": 1.7160693407058716, |
|
"learning_rate": 1.0588235294117648e-05, |
|
"loss": 1.0353, |
|
"mean_token_accuracy": 0.7203521847724914, |
|
"num_tokens": 5242880.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08875739644970414, |
|
"grad_norm": 0.8708027601242065, |
|
"learning_rate": 1.647058823529412e-05, |
|
"loss": 0.9553, |
|
"mean_token_accuracy": 0.7316273808479309, |
|
"num_tokens": 7861273.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11834319526627218, |
|
"grad_norm": 0.6267297267913818, |
|
"learning_rate": 1.999145758387301e-05, |
|
"loss": 0.8899, |
|
"mean_token_accuracy": 0.743945425748825, |
|
"num_tokens": 10474605.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.14792899408284024, |
|
"grad_norm": 0.4869539439678192, |
|
"learning_rate": 1.9895522933272028e-05, |
|
"loss": 0.8455, |
|
"mean_token_accuracy": 0.7542784661054611, |
|
"num_tokens": 13096045.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.17751479289940827, |
|
"grad_norm": 0.4492546319961548, |
|
"learning_rate": 1.9694002659393306e-05, |
|
"loss": 0.8389, |
|
"mean_token_accuracy": 0.7545322090387344, |
|
"num_tokens": 15717485.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20710059171597633, |
|
"grad_norm": 0.4301901161670685, |
|
"learning_rate": 1.9389046991574298e-05, |
|
"loss": 0.8268, |
|
"mean_token_accuracy": 0.7568192929029465, |
|
"num_tokens": 18338925.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23668639053254437, |
|
"grad_norm": 0.37857791781425476, |
|
"learning_rate": 1.898390981891979e-05, |
|
"loss": 0.7956, |
|
"mean_token_accuracy": 0.7649114817380905, |
|
"num_tokens": 20960365.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.26627218934911245, |
|
"grad_norm": 0.3774871230125427, |
|
"learning_rate": 1.8482913971175737e-05, |
|
"loss": 0.8079, |
|
"mean_token_accuracy": 0.7604979366064072, |
|
"num_tokens": 23581805.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"grad_norm": 0.3935665488243103, |
|
"learning_rate": 1.789140509396394e-05, |
|
"loss": 0.7961, |
|
"mean_token_accuracy": 0.7637095510959625, |
|
"num_tokens": 26203245.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3254437869822485, |
|
"grad_norm": 0.3560955226421356, |
|
"learning_rate": 1.7215694610530624e-05, |
|
"loss": 0.7946, |
|
"mean_token_accuracy": 0.7636954367160798, |
|
"num_tokens": 28824685.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.35502958579881655, |
|
"grad_norm": 0.37536928057670593, |
|
"learning_rate": 1.646299237860941e-05, |
|
"loss": 0.7938, |
|
"mean_token_accuracy": 0.7634146034717559, |
|
"num_tokens": 31443682.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 0.3460127115249634, |
|
"learning_rate": 1.5641329760952514e-05, |
|
"loss": 0.7639, |
|
"mean_token_accuracy": 0.7715859562158585, |
|
"num_tokens": 34061327.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.41420118343195267, |
|
"grad_norm": 0.37248924374580383, |
|
"learning_rate": 1.4759473930370738e-05, |
|
"loss": 0.7771, |
|
"mean_token_accuracy": 0.7680761635303497, |
|
"num_tokens": 36682767.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4437869822485207, |
|
"grad_norm": 0.34658947587013245, |
|
"learning_rate": 1.3826834323650899e-05, |
|
"loss": 0.8028, |
|
"mean_token_accuracy": 0.7603934347629547, |
|
"num_tokens": 39300751.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.47337278106508873, |
|
"grad_norm": 0.3706357479095459, |
|
"learning_rate": 1.2853362242491054e-05, |
|
"loss": 0.7738, |
|
"mean_token_accuracy": 0.7687152832746506, |
|
"num_tokens": 41918913.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5029585798816568, |
|
"grad_norm": 0.3420180380344391, |
|
"learning_rate": 1.1849444672715587e-05, |
|
"loss": 0.779, |
|
"mean_token_accuracy": 0.7665890276432037, |
|
"num_tokens": 44540169.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5325443786982249, |
|
"grad_norm": 0.3875311315059662, |
|
"learning_rate": 1.0825793454723325e-05, |
|
"loss": 0.7683, |
|
"mean_token_accuracy": 0.7695774495601654, |
|
"num_tokens": 47160969.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5621301775147929, |
|
"grad_norm": 0.35641783475875854, |
|
"learning_rate": 9.79333098772446e-06, |
|
"loss": 0.7692, |
|
"mean_token_accuracy": 0.7692601472139359, |
|
"num_tokens": 49782409.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 0.3525794446468353, |
|
"learning_rate": 8.763073687306523e-06, |
|
"loss": 0.7853, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"eval_loss": 0.7879331111907959, |
|
"eval_mean_token_accuracy": 0.77275630235672, |
|
"eval_num_tokens": 52403849.0, |
|
"eval_runtime": 1.3797, |
|
"eval_samples_per_second": 93.496, |
|
"eval_steps_per_second": 3.624, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.621301775147929, |
|
"grad_norm": 0.31415921449661255, |
|
"learning_rate": 7.746014439841941e-06, |
|
"loss": 0.7483, |
|
"mean_token_accuracy": 0.7696478188037872, |
|
"num_tokens": 55025289.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.650887573964497, |
|
"grad_norm": 0.35946062207221985, |
|
"learning_rate": 6.7530053079531664e-06, |
|
"loss": 0.751, |
|
"mean_token_accuracy": 0.773971700668335, |
|
"num_tokens": 57641987.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6804733727810651, |
|
"grad_norm": 0.3247712552547455, |
|
"learning_rate": 5.794641738572925e-06, |
|
"loss": 0.766, |
|
"mean_token_accuracy": 0.7699923694133759, |
|
"num_tokens": 60263427.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7100591715976331, |
|
"grad_norm": 0.32254937291145325, |
|
"learning_rate": 4.881149509103993e-06, |
|
"loss": 0.7655, |
|
"mean_token_accuracy": 0.7701400071382523, |
|
"num_tokens": 62882216.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7396449704142012, |
|
"grad_norm": 0.3108745515346527, |
|
"learning_rate": 4.0222756179675915e-06, |
|
"loss": 0.7772, |
|
"mean_token_accuracy": 0.7666765838861466, |
|
"num_tokens": 65503656.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.3257134258747101, |
|
"learning_rate": 3.2271842837425917e-06, |
|
"loss": 0.7507, |
|
"mean_token_accuracy": 0.7745073974132538, |
|
"num_tokens": 68125096.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7988165680473372, |
|
"grad_norm": 0.29293152689933777, |
|
"learning_rate": 2.504359162588741e-06, |
|
"loss": 0.7436, |
|
"mean_token_accuracy": 0.7756482750177384, |
|
"num_tokens": 70746536.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8284023668639053, |
|
"grad_norm": 0.3014591932296753, |
|
"learning_rate": 1.861512827298051e-06, |
|
"loss": 0.7438, |
|
"mean_token_accuracy": 0.7760634958744049, |
|
"num_tokens": 73365315.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8579881656804734, |
|
"grad_norm": 0.2900739312171936, |
|
"learning_rate": 1.305504473836331e-06, |
|
"loss": 0.7585, |
|
"mean_token_accuracy": 0.7716891765594482, |
|
"num_tokens": 75986755.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"grad_norm": 0.2970161437988281, |
|
"learning_rate": 8.42266733449425e-07, |
|
"loss": 0.7436, |
|
"mean_token_accuracy": 0.7757604539394378, |
|
"num_tokens": 78608195.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9171597633136095, |
|
"grad_norm": 0.29888418316841125, |
|
"learning_rate": 4.7674237125185597e-07, |
|
"loss": 0.7513, |
|
"mean_token_accuracy": 0.7735760033130645, |
|
"num_tokens": 81229635.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9467455621301775, |
|
"grad_norm": 0.28909653425216675, |
|
"learning_rate": 2.1283154672645522e-07, |
|
"loss": 0.7583, |
|
"mean_token_accuracy": 0.7715224415063858, |
|
"num_tokens": 83851075.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9763313609467456, |
|
"grad_norm": 0.2939208447933197, |
|
"learning_rate": 5.3350198867574424e-08, |
|
"loss": 0.7567, |
|
"mean_token_accuracy": 0.7722329139709473, |
|
"num_tokens": 86472515.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"mean_token_accuracy": 0.7728699259459972, |
|
"num_tokens": 88569667.0, |
|
"step": 169, |
|
"total_flos": 6.966137809639834e+17, |
|
"train_loss": 0.80340930978222, |
|
"train_runtime": 717.7254, |
|
"train_samples_per_second": 30.133, |
|
"train_steps_per_second": 0.235 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 169, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.966137809639834e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|