|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.5109034267912773, |
|
"eval_steps": 354, |
|
"global_step": 3100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05664117813650524, |
|
"grad_norm": 0.7603653073310852, |
|
"learning_rate": 0.0001978110599078341, |
|
"loss": 0.9425, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11328235627301048, |
|
"grad_norm": 0.6873273849487305, |
|
"learning_rate": 0.00019205069124423964, |
|
"loss": 0.6078, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16992353440951571, |
|
"grad_norm": 0.6323167085647583, |
|
"learning_rate": 0.00018629032258064517, |
|
"loss": 0.6748, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22656471254602095, |
|
"grad_norm": 1.0095610618591309, |
|
"learning_rate": 0.0001805299539170507, |
|
"loss": 0.6594, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2832058906825262, |
|
"grad_norm": 0.5822212100028992, |
|
"learning_rate": 0.00017476958525345623, |
|
"loss": 0.6317, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.33984706881903143, |
|
"grad_norm": 0.8490907549858093, |
|
"learning_rate": 0.00016900921658986176, |
|
"loss": 0.5742, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3964882469555367, |
|
"grad_norm": 0.6252707242965698, |
|
"learning_rate": 0.0001632488479262673, |
|
"loss": 0.5502, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4010195412064571, |
|
"eval_loss": 0.6019027233123779, |
|
"eval_runtime": 159.9351, |
|
"eval_samples_per_second": 9.81, |
|
"eval_steps_per_second": 2.457, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.4531294250920419, |
|
"grad_norm": 0.656812310218811, |
|
"learning_rate": 0.00015748847926267282, |
|
"loss": 0.5686, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5097706032285472, |
|
"grad_norm": 0.7391073703765869, |
|
"learning_rate": 0.00015172811059907835, |
|
"loss": 0.5701, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5664117813650524, |
|
"grad_norm": 0.9210707545280457, |
|
"learning_rate": 0.00014596774193548388, |
|
"loss": 0.6397, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6230529595015576, |
|
"grad_norm": 0.8228403329849243, |
|
"learning_rate": 0.00014020737327188939, |
|
"loss": 0.5822, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6796941376380629, |
|
"grad_norm": 0.716748833656311, |
|
"learning_rate": 0.00013444700460829494, |
|
"loss": 0.5881, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7363353157745681, |
|
"grad_norm": 0.7144941091537476, |
|
"learning_rate": 0.00012868663594470047, |
|
"loss": 0.6032, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7929764939110734, |
|
"grad_norm": 1.016291618347168, |
|
"learning_rate": 0.000122926267281106, |
|
"loss": 0.6377, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8020390824129142, |
|
"eval_loss": 0.5828524827957153, |
|
"eval_runtime": 150.8754, |
|
"eval_samples_per_second": 10.399, |
|
"eval_steps_per_second": 2.605, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.8496176720475785, |
|
"grad_norm": 1.0243154764175415, |
|
"learning_rate": 0.00011716589861751153, |
|
"loss": 0.6005, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9062588501840838, |
|
"grad_norm": 0.6541144251823425, |
|
"learning_rate": 0.00011140552995391706, |
|
"loss": 0.5723, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9629000283205891, |
|
"grad_norm": 1.0017038583755493, |
|
"learning_rate": 0.00010564516129032258, |
|
"loss": 0.5801, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0192580005664118, |
|
"grad_norm": 0.7527189254760742, |
|
"learning_rate": 9.988479262672812e-05, |
|
"loss": 0.5511, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.075899178702917, |
|
"grad_norm": 0.7966899871826172, |
|
"learning_rate": 9.412442396313365e-05, |
|
"loss": 0.49, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.1325403568394223, |
|
"grad_norm": 0.7110822796821594, |
|
"learning_rate": 8.836405529953917e-05, |
|
"loss": 0.4725, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.1891815349759276, |
|
"grad_norm": 0.7837777733802795, |
|
"learning_rate": 8.26036866359447e-05, |
|
"loss": 0.527, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.2027754177286887, |
|
"eval_loss": 0.5833637714385986, |
|
"eval_runtime": 150.6223, |
|
"eval_samples_per_second": 10.417, |
|
"eval_steps_per_second": 2.609, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 1.2458227131124326, |
|
"grad_norm": 0.8119267821311951, |
|
"learning_rate": 7.684331797235024e-05, |
|
"loss": 0.4892, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.302463891248938, |
|
"grad_norm": 0.8631129860877991, |
|
"learning_rate": 7.108294930875576e-05, |
|
"loss": 0.5124, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.3591050693854432, |
|
"grad_norm": 0.8685782551765442, |
|
"learning_rate": 6.532258064516129e-05, |
|
"loss": 0.4927, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4157462475219484, |
|
"grad_norm": 0.8397710919380188, |
|
"learning_rate": 5.956221198156682e-05, |
|
"loss": 0.5125, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.4723874256584537, |
|
"grad_norm": 0.7606781721115112, |
|
"learning_rate": 5.3801843317972355e-05, |
|
"loss": 0.4826, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.529028603794959, |
|
"grad_norm": 1.1354798078536987, |
|
"learning_rate": 4.8041474654377885e-05, |
|
"loss": 0.5101, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.5856697819314642, |
|
"grad_norm": 1.28499174118042, |
|
"learning_rate": 4.228110599078341e-05, |
|
"loss": 0.4687, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.603794958935146, |
|
"eval_loss": 0.5791710615158081, |
|
"eval_runtime": 150.7437, |
|
"eval_samples_per_second": 10.408, |
|
"eval_steps_per_second": 2.607, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 1.6423109600679693, |
|
"grad_norm": 0.7527874708175659, |
|
"learning_rate": 3.6520737327188945e-05, |
|
"loss": 0.4992, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.6989521382044748, |
|
"grad_norm": 0.9351261854171753, |
|
"learning_rate": 3.076036866359447e-05, |
|
"loss": 0.4873, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.7555933163409798, |
|
"grad_norm": 1.0196998119354248, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.4946, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.812234494477485, |
|
"grad_norm": 0.9896508455276489, |
|
"learning_rate": 1.923963133640553e-05, |
|
"loss": 0.4895, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.8688756726139903, |
|
"grad_norm": 1.150964617729187, |
|
"learning_rate": 1.3479262672811061e-05, |
|
"loss": 0.5164, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.9255168507504956, |
|
"grad_norm": 0.8384917378425598, |
|
"learning_rate": 7.71889400921659e-06, |
|
"loss": 0.4914, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.9821580288870009, |
|
"grad_norm": 0.9231545329093933, |
|
"learning_rate": 1.9585253456221198e-06, |
|
"loss": 0.4939, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.0045312942509206, |
|
"eval_loss": 0.5844214558601379, |
|
"eval_runtime": 166.1298, |
|
"eval_samples_per_second": 9.444, |
|
"eval_steps_per_second": 2.366, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.0385160011328236, |
|
"grad_norm": 0.9673134088516235, |
|
"learning_rate": 0.00014248089741505446, |
|
"loss": 0.4303, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.0951571792693287, |
|
"grad_norm": 1.33237624168396, |
|
"learning_rate": 0.00014085514550479596, |
|
"loss": 0.4594, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.151798357405834, |
|
"grad_norm": 1.068943738937378, |
|
"learning_rate": 0.0001392293935945375, |
|
"loss": 0.4939, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.208439535542339, |
|
"grad_norm": 1.1625093221664429, |
|
"learning_rate": 0.00013760364168427899, |
|
"loss": 0.4757, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.2650807136788447, |
|
"grad_norm": 1.080735683441162, |
|
"learning_rate": 0.00013597788977402048, |
|
"loss": 0.4724, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.3217218918153497, |
|
"grad_norm": 0.8823259472846985, |
|
"learning_rate": 0.00013435213786376198, |
|
"loss": 0.4821, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.378363069951855, |
|
"grad_norm": 1.0513312816619873, |
|
"learning_rate": 0.0001327263859535035, |
|
"loss": 0.479, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.4055508354573774, |
|
"eval_loss": 0.6049736738204956, |
|
"eval_runtime": 156.3245, |
|
"eval_samples_per_second": 10.037, |
|
"eval_steps_per_second": 2.514, |
|
"step": 2124 |
|
}, |
|
{ |
|
"epoch": 2.4350042480883602, |
|
"grad_norm": 1.0902981758117676, |
|
"learning_rate": 0.000131100634043245, |
|
"loss": 0.4749, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.4916454262248653, |
|
"grad_norm": 0.9050194025039673, |
|
"learning_rate": 0.0001294748821329865, |
|
"loss": 0.4346, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.5482866043613708, |
|
"grad_norm": 1.0356699228286743, |
|
"learning_rate": 0.00012784913022272803, |
|
"loss": 0.4685, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.604927782497876, |
|
"grad_norm": 1.0071344375610352, |
|
"learning_rate": 0.00012622337831246953, |
|
"loss": 0.5, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.6615689606343813, |
|
"grad_norm": 1.0409235954284668, |
|
"learning_rate": 0.00012459762640221103, |
|
"loss": 0.4908, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.7182101387708864, |
|
"grad_norm": 0.8756324052810669, |
|
"learning_rate": 0.00012297187449195252, |
|
"loss": 0.4765, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.774851316907392, |
|
"grad_norm": 0.6662527918815613, |
|
"learning_rate": 0.00012134612258169405, |
|
"loss": 0.4524, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.8065703766638346, |
|
"eval_loss": 0.5904644727706909, |
|
"eval_runtime": 157.1505, |
|
"eval_samples_per_second": 9.984, |
|
"eval_steps_per_second": 2.501, |
|
"step": 2478 |
|
}, |
|
{ |
|
"epoch": 2.831492495043897, |
|
"grad_norm": 1.0368993282318115, |
|
"learning_rate": 0.00011972037067143556, |
|
"loss": 0.4661, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.888133673180402, |
|
"grad_norm": 1.0678080320358276, |
|
"learning_rate": 0.00011809461876117705, |
|
"loss": 0.4812, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.9447748513169074, |
|
"grad_norm": 1.2212059497833252, |
|
"learning_rate": 0.00011646886685091856, |
|
"loss": 0.482, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.00113282356273, |
|
"grad_norm": 0.757663369178772, |
|
"learning_rate": 0.00011484311494066007, |
|
"loss": 0.5062, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.057774001699235, |
|
"grad_norm": 0.8512151837348938, |
|
"learning_rate": 0.00011321736303040155, |
|
"loss": 0.3651, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.1144151798357407, |
|
"grad_norm": 0.9610685706138611, |
|
"learning_rate": 0.00011159161112014307, |
|
"loss": 0.3626, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.1710563579722457, |
|
"grad_norm": 1.0879671573638916, |
|
"learning_rate": 0.00010996585920988458, |
|
"loss": 0.3637, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.207306711979609, |
|
"eval_loss": 0.6150493025779724, |
|
"eval_runtime": 157.6799, |
|
"eval_samples_per_second": 9.951, |
|
"eval_steps_per_second": 2.492, |
|
"step": 2832 |
|
}, |
|
{ |
|
"epoch": 3.227697536108751, |
|
"grad_norm": 1.1237863302230835, |
|
"learning_rate": 0.00010834010729962609, |
|
"loss": 0.4015, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.2843387142452563, |
|
"grad_norm": 0.7940804958343506, |
|
"learning_rate": 0.00010671435538936759, |
|
"loss": 0.3572, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.3409798923817613, |
|
"grad_norm": 1.7029999494552612, |
|
"learning_rate": 0.0001050886034791091, |
|
"loss": 0.3902, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.397621070518267, |
|
"grad_norm": 0.8232925534248352, |
|
"learning_rate": 0.00010346285156885061, |
|
"loss": 0.3814, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.454262248654772, |
|
"grad_norm": 0.9208984375, |
|
"learning_rate": 0.0001018370996585921, |
|
"loss": 0.377, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 3.5109034267912773, |
|
"grad_norm": 1.1689094305038452, |
|
"learning_rate": 0.00010021134774833361, |
|
"loss": 0.3887, |
|
"step": 3100 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 6181, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0467108033589094e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|