|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.999787007454739, |
|
"eval_steps": 500, |
|
"global_step": 2347, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.021299254526091587, |
|
"grad_norm": 1.8116117629273891, |
|
"learning_rate": 7.042253521126761e-06, |
|
"loss": 1.1672, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.042598509052183174, |
|
"grad_norm": 1.7801325074284498, |
|
"learning_rate": 9.99599471666643e-06, |
|
"loss": 1.0639, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06389776357827476, |
|
"grad_norm": 1.7302714965761716, |
|
"learning_rate": 9.970302558343623e-06, |
|
"loss": 1.0591, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08519701810436635, |
|
"grad_norm": 1.5998384203663196, |
|
"learning_rate": 9.920945445527302e-06, |
|
"loss": 1.055, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10649627263045794, |
|
"grad_norm": 1.6500912559924017, |
|
"learning_rate": 9.848158380778183e-06, |
|
"loss": 1.0419, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12779552715654952, |
|
"grad_norm": 1.596481575421808, |
|
"learning_rate": 9.75228792299379e-06, |
|
"loss": 1.0383, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14909478168264112, |
|
"grad_norm": 1.6526899468527811, |
|
"learning_rate": 9.633790537347847e-06, |
|
"loss": 1.0334, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1703940362087327, |
|
"grad_norm": 1.6150272900402385, |
|
"learning_rate": 9.493230421936143e-06, |
|
"loss": 1.0515, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19169329073482427, |
|
"grad_norm": 1.7820720198756743, |
|
"learning_rate": 9.331276821476838e-06, |
|
"loss": 1.0268, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21299254526091588, |
|
"grad_norm": 1.6035392653474185, |
|
"learning_rate": 9.148700840855383e-06, |
|
"loss": 1.0325, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23429179978700745, |
|
"grad_norm": 1.8039846181943315, |
|
"learning_rate": 8.94637177368566e-06, |
|
"loss": 1.0338, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.25559105431309903, |
|
"grad_norm": 1.6713279817736957, |
|
"learning_rate": 8.72525296336801e-06, |
|
"loss": 1.0195, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.27689030883919064, |
|
"grad_norm": 1.683168266494625, |
|
"learning_rate": 8.486397216350815e-06, |
|
"loss": 1.0222, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.29818956336528224, |
|
"grad_norm": 1.6916656722965304, |
|
"learning_rate": 8.230941789434305e-06, |
|
"loss": 1.0155, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3194888178913738, |
|
"grad_norm": 1.7135930235778254, |
|
"learning_rate": 7.960102974983407e-06, |
|
"loss": 1.0259, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3407880724174654, |
|
"grad_norm": 1.6801315844607865, |
|
"learning_rate": 7.675170309830907e-06, |
|
"loss": 1.0244, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.362087326943557, |
|
"grad_norm": 1.6004816476301602, |
|
"learning_rate": 7.377500435443875e-06, |
|
"loss": 1.0196, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.38338658146964855, |
|
"grad_norm": 1.560394824607186, |
|
"learning_rate": 7.068510638586805e-06, |
|
"loss": 1.0213, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.40468583599574015, |
|
"grad_norm": 1.7739863675472445, |
|
"learning_rate": 6.7496721032360715e-06, |
|
"loss": 1.0093, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.42598509052183176, |
|
"grad_norm": 1.5424575836895802, |
|
"learning_rate": 6.4225029058752076e-06, |
|
"loss": 1.004, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4472843450479233, |
|
"grad_norm": 1.7341809762292035, |
|
"learning_rate": 6.088560787522257e-06, |
|
"loss": 1.0103, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.4685835995740149, |
|
"grad_norm": 1.712705790413237, |
|
"learning_rate": 5.749435736903566e-06, |
|
"loss": 1.0029, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.4898828541001065, |
|
"grad_norm": 1.634153426002941, |
|
"learning_rate": 5.406742420087504e-06, |
|
"loss": 1.0043, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5111821086261981, |
|
"grad_norm": 1.621004209807033, |
|
"learning_rate": 5.062112492622702e-06, |
|
"loss": 1.004, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5324813631522897, |
|
"grad_norm": 1.6860145774112278, |
|
"learning_rate": 4.717186830784763e-06, |
|
"loss": 0.9951, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5537806176783813, |
|
"grad_norm": 1.6211493662048333, |
|
"learning_rate": 4.37360771892061e-06, |
|
"loss": 1.0013, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5750798722044729, |
|
"grad_norm": 1.8249160060138945, |
|
"learning_rate": 4.0330110300886206e-06, |
|
"loss": 0.9986, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.5963791267305645, |
|
"grad_norm": 1.6686984774536926, |
|
"learning_rate": 3.697018437224645e-06, |
|
"loss": 0.9967, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.617678381256656, |
|
"grad_norm": 1.645263948081402, |
|
"learning_rate": 3.3672296919186444e-06, |
|
"loss": 1.0029, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6389776357827476, |
|
"grad_norm": 1.7334881644315554, |
|
"learning_rate": 3.0452150075647745e-06, |
|
"loss": 0.9794, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6602768903088392, |
|
"grad_norm": 1.6958071495211982, |
|
"learning_rate": 2.732507583150834e-06, |
|
"loss": 0.9866, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.6815761448349308, |
|
"grad_norm": 1.629069436410619, |
|
"learning_rate": 2.4305963032832697e-06, |
|
"loss": 0.9875, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7028753993610224, |
|
"grad_norm": 1.7526521548397989, |
|
"learning_rate": 2.140918649204985e-06, |
|
"loss": 0.988, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.724174653887114, |
|
"grad_norm": 1.717145624435277, |
|
"learning_rate": 1.8648538545584133e-06, |
|
"loss": 0.9803, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7454739084132055, |
|
"grad_norm": 1.6652219634436365, |
|
"learning_rate": 1.6037163384811998e-06, |
|
"loss": 0.9931, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7667731629392971, |
|
"grad_norm": 1.4903724798212907, |
|
"learning_rate": 1.3587494473012542e-06, |
|
"loss": 0.9831, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7880724174653887, |
|
"grad_norm": 1.6698441280484966, |
|
"learning_rate": 1.1311195346286702e-06, |
|
"loss": 0.9811, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.8093716719914803, |
|
"grad_norm": 1.7730255653260494, |
|
"learning_rate": 9.219104080308027e-07, |
|
"loss": 0.9726, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8306709265175719, |
|
"grad_norm": 1.471953815903398, |
|
"learning_rate": 7.321181687313578e-07, |
|
"loss": 0.9786, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8519701810436635, |
|
"grad_norm": 1.5610577043390987, |
|
"learning_rate": 5.626464689031197e-07, |
|
"loss": 0.9807, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.873269435569755, |
|
"grad_norm": 1.8173195064149255, |
|
"learning_rate": 4.1430220913559914e-07, |
|
"loss": 0.974, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.8945686900958466, |
|
"grad_norm": 1.7107057022608567, |
|
"learning_rate": 2.877916965631344e-07, |
|
"loss": 0.9874, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9158679446219382, |
|
"grad_norm": 1.7134293009529347, |
|
"learning_rate": 1.837172819456301e-07, |
|
"loss": 0.9778, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.9371671991480298, |
|
"grad_norm": 1.7486730313830365, |
|
"learning_rate": 1.025744917137117e-07, |
|
"loss": 0.9691, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9584664536741214, |
|
"grad_norm": 1.6373608571865919, |
|
"learning_rate": 4.474966863339203e-08, |
|
"loss": 0.9826, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.979765708200213, |
|
"grad_norm": 1.7726783615561228, |
|
"learning_rate": 1.0518132323694696e-08, |
|
"loss": 0.9805, |
|
"step": 2300 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 2347, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.374008149160755e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|