|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 100, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7140417098999023, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.7338, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.1201739311218262, |
|
"learning_rate": 3.1666666666666666e-05, |
|
"loss": 0.3534, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.3740745782852173, |
|
"learning_rate": 4.8333333333333334e-05, |
|
"loss": 0.1776, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7445772886276245, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.1226, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6376151442527771, |
|
"learning_rate": 8.166666666666667e-05, |
|
"loss": 0.1086, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.37051457166671753, |
|
"learning_rate": 9.833333333333333e-05, |
|
"loss": 0.0996, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5768002867698669, |
|
"learning_rate": 9.99314767377287e-05, |
|
"loss": 0.0959, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5846573114395142, |
|
"learning_rate": 9.9694847320726e-05, |
|
"loss": 0.0673, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5731233954429626, |
|
"learning_rate": 9.929006627092299e-05, |
|
"loss": 0.1235, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7275082468986511, |
|
"learning_rate": 9.871850323926177e-05, |
|
"loss": 0.0779, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.0684906542301178, |
|
"eval_runtime": 263.652, |
|
"eval_samples_per_second": 0.379, |
|
"eval_steps_per_second": 0.379, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5399947762489319, |
|
"learning_rate": 9.798209221411747e-05, |
|
"loss": 0.0591, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.39991146326065063, |
|
"learning_rate": 9.708332497729378e-05, |
|
"loss": 0.0841, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.4473568797111511, |
|
"learning_rate": 9.602524267262203e-05, |
|
"loss": 0.0823, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6712111234664917, |
|
"learning_rate": 9.481142551569318e-05, |
|
"loss": 0.0686, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3721703886985779, |
|
"learning_rate": 9.344598067954152e-05, |
|
"loss": 0.0658, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.43187215924263, |
|
"learning_rate": 9.193352839727121e-05, |
|
"loss": 0.0669, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.3821048140525818, |
|
"learning_rate": 9.027918632864997e-05, |
|
"loss": 0.0898, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7914325594902039, |
|
"learning_rate": 8.848855224356839e-05, |
|
"loss": 0.0777, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.49602773785591125, |
|
"learning_rate": 8.656768508095853e-05, |
|
"loss": 0.0602, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.36420896649360657, |
|
"learning_rate": 8.452308444726249e-05, |
|
"loss": 0.0647, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.05108840763568878, |
|
"eval_runtime": 263.56, |
|
"eval_samples_per_second": 0.379, |
|
"eval_steps_per_second": 0.379, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.29956021904945374, |
|
"learning_rate": 8.236166862382163e-05, |
|
"loss": 0.0381, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.3197619915008545, |
|
"learning_rate": 8.009075115760243e-05, |
|
"loss": 0.0478, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.43458399176597595, |
|
"learning_rate": 7.771801611446858e-05, |
|
"loss": 0.0349, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.3282864987850189, |
|
"learning_rate": 7.52514920787345e-05, |
|
"loss": 0.0309, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.24097760021686554, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 0.0209, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.31138163805007935, |
|
"learning_rate": 7.007074988802946e-05, |
|
"loss": 0.0414, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.2666899859905243, |
|
"learning_rate": 6.737406172470657e-05, |
|
"loss": 0.0592, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.513999879360199, |
|
"learning_rate": 6.461858523613684e-05, |
|
"loss": 0.0328, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.2508058547973633, |
|
"learning_rate": 6.181364408253209e-05, |
|
"loss": 0.0256, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.22155922651290894, |
|
"learning_rate": 5.8968729296872874e-05, |
|
"loss": 0.0292, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.04999532178044319, |
|
"eval_runtime": 263.0996, |
|
"eval_samples_per_second": 0.38, |
|
"eval_steps_per_second": 0.38, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.5395333170890808, |
|
"learning_rate": 5.6093467170257374e-05, |
|
"loss": 0.0468, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.28358444571495056, |
|
"learning_rate": 5.319758667957928e-05, |
|
"loss": 0.0418, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.18217401206493378, |
|
"learning_rate": 5.0290886567749696e-05, |
|
"loss": 0.0227, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.4007870852947235, |
|
"learning_rate": 4.738320218785281e-05, |
|
"loss": 0.0383, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.20039933919906616, |
|
"learning_rate": 4.4484372223424415e-05, |
|
"loss": 0.0371, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.12333797663450241, |
|
"learning_rate": 4.160420539746115e-05, |
|
"loss": 0.0331, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.3924310803413391, |
|
"learning_rate": 3.875244728280676e-05, |
|
"loss": 0.0396, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.07961348444223404, |
|
"learning_rate": 3.593874732621847e-05, |
|
"loss": 0.033, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.5689805746078491, |
|
"learning_rate": 3.317262619769368e-05, |
|
"loss": 0.0336, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.0888177827000618, |
|
"learning_rate": 3.046344357553632e-05, |
|
"loss": 0.028, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.04492698982357979, |
|
"eval_runtime": 263.4857, |
|
"eval_samples_per_second": 0.38, |
|
"eval_steps_per_second": 0.38, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.4522015452384949, |
|
"learning_rate": 2.7820366476168224e-05, |
|
"loss": 0.0177, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.037934061139822006, |
|
"learning_rate": 2.52523382358473e-05, |
|
"loss": 0.0143, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.19100028276443481, |
|
"learning_rate": 2.2768048249248648e-05, |
|
"loss": 0.013, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.2139115184545517, |
|
"learning_rate": 2.0375902567303472e-05, |
|
"loss": 0.0105, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.31156909465789795, |
|
"learning_rate": 1.80839954537836e-05, |
|
"loss": 0.0136, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.08543579280376434, |
|
"learning_rate": 1.5900081996875083e-05, |
|
"loss": 0.0196, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.10310215502977371, |
|
"learning_rate": 1.3831551868414599e-05, |
|
"loss": 0.017, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.2742729187011719, |
|
"learning_rate": 1.1885404319579108e-05, |
|
"loss": 0.0254, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.09017336368560791, |
|
"learning_rate": 1.006822449763537e-05, |
|
"loss": 0.0174, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.07442392408847809, |
|
"learning_rate": 8.38616116388612e-06, |
|
"loss": 0.013, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.048839278519153595, |
|
"eval_runtime": 284.8522, |
|
"eval_samples_per_second": 0.351, |
|
"eval_steps_per_second": 0.351, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.29669731855392456, |
|
"learning_rate": 6.844905888208181e-06, |
|
"loss": 0.015, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.3099665939807892, |
|
"learning_rate": 5.449673790581611e-06, |
|
"loss": 0.0148, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.20635627210140228, |
|
"learning_rate": 4.205185894774455e-06, |
|
"loss": 0.0156, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.2796818017959595, |
|
"learning_rate": 3.115653153892761e-06, |
|
"loss": 0.015, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.15870630741119385, |
|
"learning_rate": 2.1847622018482283e-06, |
|
"loss": 0.0144, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.23704785108566284, |
|
"learning_rate": 1.4156628789559922e-06, |
|
"loss": 0.0241, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.025902193039655685, |
|
"learning_rate": 8.10957573872062e-07, |
|
"loss": 0.0096, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.04191463440656662, |
|
"learning_rate": 3.7269241793390085e-07, |
|
"loss": 0.0086, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.04461716488003731, |
|
"learning_rate": 1.0235036169963242e-07, |
|
"loss": 0.0115, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.21583612263202667, |
|
"learning_rate": 8.461571127882373e-10, |
|
"loss": 0.0116, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.04811210185289383, |
|
"eval_runtime": 284.4632, |
|
"eval_samples_per_second": 0.352, |
|
"eval_steps_per_second": 0.352, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 600, |
|
"total_flos": 3.3505112222539776e+16, |
|
"train_loss": 0.005030520980556806, |
|
"train_runtime": 5996.0343, |
|
"train_samples_per_second": 0.4, |
|
"train_steps_per_second": 0.1 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3505112222539776e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|