|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 1100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 4.545454545454545e-07, |
|
"loss": 2.5611, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 2.5692, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 2.644, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.304738759994507, |
|
"eval_runtime": 1.2267, |
|
"eval_samples_per_second": 18.749, |
|
"eval_steps_per_second": 4.891, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 2.3827, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 2.0781, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 1.9548, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.9422764778137207, |
|
"eval_runtime": 1.2199, |
|
"eval_samples_per_second": 18.854, |
|
"eval_steps_per_second": 4.918, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"learning_rate": 4.9987413559579636e-05, |
|
"loss": 1.8017, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"learning_rate": 4.988679806432712e-05, |
|
"loss": 1.8295, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"learning_rate": 4.968597221690986e-05, |
|
"loss": 1.7876, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.9136310815811157, |
|
"eval_runtime": 1.2209, |
|
"eval_samples_per_second": 18.839, |
|
"eval_steps_per_second": 4.915, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"learning_rate": 4.938574467213518e-05, |
|
"loss": 1.6775, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 4.898732434036244e-05, |
|
"loss": 1.6526, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 4.849231551964771e-05, |
|
"loss": 1.634, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.9493458271026611, |
|
"eval_runtime": 1.2438, |
|
"eval_samples_per_second": 18.491, |
|
"eval_steps_per_second": 4.824, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"learning_rate": 4.790271143580174e-05, |
|
"loss": 1.3805, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 4.722088621637309e-05, |
|
"loss": 1.3783, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.1324849128723145, |
|
"eval_runtime": 1.2205, |
|
"eval_samples_per_second": 18.844, |
|
"eval_steps_per_second": 4.916, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"learning_rate": 4.644958533087443e-05, |
|
"loss": 1.2506, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"learning_rate": 4.559191453574582e-05, |
|
"loss": 1.0476, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"learning_rate": 4.465132736856969e-05, |
|
"loss": 1.0621, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.351741075515747, |
|
"eval_runtime": 1.2227, |
|
"eval_samples_per_second": 18.811, |
|
"eval_steps_per_second": 4.907, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"learning_rate": 4.3631611241893874e-05, |
|
"loss": 0.9006, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"learning_rate": 4.2536872192658036e-05, |
|
"loss": 0.7803, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"learning_rate": 4.137151834863213e-05, |
|
"loss": 0.8113, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.609915256500244, |
|
"eval_runtime": 1.2166, |
|
"eval_samples_per_second": 18.906, |
|
"eval_steps_per_second": 4.932, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"learning_rate": 4.014024217844167e-05, |
|
"loss": 0.6532, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"learning_rate": 3.884800159665276e-05, |
|
"loss": 0.5756, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.5873, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.8912346363067627, |
|
"eval_runtime": 1.258, |
|
"eval_samples_per_second": 18.284, |
|
"eval_steps_per_second": 4.77, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"learning_rate": 3.610166531514436e-05, |
|
"loss": 0.4178, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 8.73, |
|
"learning_rate": 3.465862814232822e-05, |
|
"loss": 0.4162, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 3.1381325721740723, |
|
"eval_runtime": 1.2162, |
|
"eval_samples_per_second": 18.912, |
|
"eval_steps_per_second": 4.934, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 3.3176699082935545e-05, |
|
"loss": 0.4194, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.45, |
|
"learning_rate": 3.166184534225087e-05, |
|
"loss": 0.3122, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 9.82, |
|
"learning_rate": 3.012016670162977e-05, |
|
"loss": 0.3348, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 3.32578706741333, |
|
"eval_runtime": 1.2175, |
|
"eval_samples_per_second": 18.892, |
|
"eval_steps_per_second": 4.928, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 10.18, |
|
"learning_rate": 2.8557870956832132e-05, |
|
"loss": 0.3, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 10.55, |
|
"learning_rate": 2.698124892141971e-05, |
|
"loss": 0.2524, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"learning_rate": 2.5396649095870202e-05, |
|
"loss": 0.2436, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 3.546029806137085, |
|
"eval_runtime": 1.2185, |
|
"eval_samples_per_second": 18.875, |
|
"eval_steps_per_second": 4.924, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 11.27, |
|
"learning_rate": 2.3810452104406444e-05, |
|
"loss": 0.1852, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 11.64, |
|
"learning_rate": 2.222904500247473e-05, |
|
"loss": 0.1736, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 2.0658795558326743e-05, |
|
"loss": 0.1977, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 3.683403253555298, |
|
"eval_runtime": 1.2576, |
|
"eval_samples_per_second": 18.289, |
|
"eval_steps_per_second": 4.771, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 12.36, |
|
"learning_rate": 1.9106026612264316e-05, |
|
"loss": 0.1451, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 12.73, |
|
"learning_rate": 1.7576990616793137e-05, |
|
"loss": 0.124, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 3.770543336868286, |
|
"eval_runtime": 1.2176, |
|
"eval_samples_per_second": 18.889, |
|
"eval_steps_per_second": 4.928, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 13.09, |
|
"learning_rate": 1.6077844460203206e-05, |
|
"loss": 0.1348, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 13.45, |
|
"learning_rate": 1.4614624674952842e-05, |
|
"loss": 0.0963, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"learning_rate": 1.3193223130682936e-05, |
|
"loss": 0.1207, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 3.8691725730895996, |
|
"eval_runtime": 1.2162, |
|
"eval_samples_per_second": 18.912, |
|
"eval_steps_per_second": 4.933, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 14.18, |
|
"learning_rate": 1.181936330973744e-05, |
|
"loss": 0.0855, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 14.55, |
|
"learning_rate": 1.049857726072005e-05, |
|
"loss": 0.0853, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 14.91, |
|
"learning_rate": 9.236183322886945e-06, |
|
"loss": 0.1002, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 3.9645142555236816, |
|
"eval_runtime": 1.2246, |
|
"eval_samples_per_second": 18.782, |
|
"eval_steps_per_second": 4.9, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"learning_rate": 8.0372647110717e-06, |
|
"loss": 0.0752, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 15.64, |
|
"learning_rate": 6.906649047373246e-06, |
|
"loss": 0.0925, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 5.848888922025553e-06, |
|
"loss": 0.0765, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 3.9715585708618164, |
|
"eval_runtime": 1.2422, |
|
"eval_samples_per_second": 18.515, |
|
"eval_steps_per_second": 4.83, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 16.36, |
|
"learning_rate": 4.868243561723535e-06, |
|
"loss": 0.0701, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 16.73, |
|
"learning_rate": 3.968661679220468e-06, |
|
"loss": 0.0912, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 3.9930052757263184, |
|
"eval_runtime": 1.2193, |
|
"eval_samples_per_second": 18.863, |
|
"eval_steps_per_second": 4.921, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 17.09, |
|
"learning_rate": 3.1537655732553768e-06, |
|
"loss": 0.0697, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 17.45, |
|
"learning_rate": 2.4268365428344736e-06, |
|
"loss": 0.066, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 17.82, |
|
"learning_rate": 1.790801674598186e-06, |
|
"loss": 0.0892, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 4.0004754066467285, |
|
"eval_runtime": 1.2178, |
|
"eval_samples_per_second": 18.886, |
|
"eval_steps_per_second": 4.927, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"learning_rate": 1.248222056476367e-06, |
|
"loss": 0.0694, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"learning_rate": 8.012824650910938e-07, |
|
"loss": 0.0859, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 18.91, |
|
"learning_rate": 4.517825684323324e-07, |
|
"loss": 0.067, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 3.999499797821045, |
|
"eval_runtime": 1.2172, |
|
"eval_samples_per_second": 18.896, |
|
"eval_steps_per_second": 4.929, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 19.27, |
|
"learning_rate": 2.011296792301165e-07, |
|
"loss": 0.0675, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 19.64, |
|
"learning_rate": 5.033308820289184e-08, |
|
"loss": 0.0898, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.067, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 1100, |
|
"total_flos": 3.807078373542298e+16, |
|
"train_loss": 0.0, |
|
"train_runtime": 15.1698, |
|
"train_samples_per_second": 286.094, |
|
"train_steps_per_second": 72.512 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 20, |
|
"total_flos": 3.807078373542298e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|